From 9b0f998d1625804f1b9f4a4f1e019eee48652a6d Mon Sep 17 00:00:00 2001 From: Shaoying Xu Date: Wed, 15 Sep 2021 23:41:15 +0000 Subject: [PATCH 001/175] mm, memcg: throttle the memory reclaim given dirty/writeback pages to avoid early OOMs This is an improved workaround to avoid early OOMs within cgroup v1 by throttling the memory reclaim given dirty/writeback pages under GFP_NOFS allocations. After half of the maximum number of retries have been used, and while writeback+dirty pages remain beyond a certain threshold, sleep before the next retry, increasing the sleep time exponentially up to a limit. This solution not only helps prevent early OOMs under some extreme workloads but also avoids unnecessary throttling in the general case. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=207273 Suggested-by: Michal Hocko Signed-off-by: Shaoying Xu --- mm/memcontrol.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4570d3e315cf1..80a8970202213 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2629,6 +2629,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, { unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages); int nr_retries = MAX_RECLAIM_RETRIES; + int timeout = 1; struct mem_cgroup *mem_over_limit; struct page_counter *counter; unsigned long nr_reclaimed; @@ -2710,7 +2711,25 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, */ if (mem_cgroup_wait_acct_move(mem_over_limit)) goto retry; - + /* + * Legacy memcg relies on dirty data throttling during the reclaim + * but this cannot be done for GFP_NOFS requests so we might trigger + * the oom way too early. Throttle here if we have way too many + * dirty/writeback pages. + */ + if ((nr_retries < MAX_RECLAIM_RETRIES/2) && + !cgroup_subsys_on_dfl(memory_cgrp_subsys) && + !(gfp_mask & __GFP_FS)) { + unsigned long dirty = memcg_page_state(memcg, NR_FILE_DIRTY); + unsigned long writeback = memcg_page_state(memcg, NR_WRITEBACK); + + if (4*(dirty + writeback) > + 3*page_counter_read(&memcg->memory)) { + schedule_timeout_interruptible(timeout); + if (timeout < 32) + timeout *= 2; + } + } if (nr_retries--) goto retry; @@ -2730,6 +2749,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, get_order(nr_pages * PAGE_SIZE))) { passed_oom = true; nr_retries = MAX_RECLAIM_RETRIES; + timeout = 1; goto retry; } nomem: From d620734cc562296637723f3e99dd8d4682d3b20c Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 3 Apr 2019 10:00:01 +0530 Subject: [PATCH 002/175] Sysfs memory probe interface (/sys/devices/system/memory/probe) can accept the starting physical address of an entire memory block to be hot added into the kernel. This is in addition to the existing ACPI-based interface. This just enables it with the required config CONFIG_ARCH_MEMORY_PROBE. Signed-off-by: Anshuman Khandual --- arch/arm64/Kconfig | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 044b98a62f7bb..2163fe52c8b1b 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -328,6 +328,15 @@ config GENERIC_CALIBRATE_DELAY config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE def_bool y +config ARCH_MEMORY_PROBE + bool "Enable /sys/devices/system/memory/probe interface" + depends on MEMORY_HOTPLUG + help + This option enables a sysfs /sys/devices/system/memory/probe + interface for testing. See Documentation/memory-hotplug.txt + for more information. If you are unsure how to answer this + question, answer N.
+ config SMP def_bool y From 06b0ca6335558f0b57022f95bd5a10694f5243e7 Mon Sep 17 00:00:00 2001 From: Rohit Wali Date: Wed, 14 Jul 2021 17:30:08 +0000 Subject: [PATCH 003/175] arm64/mm: Enable sysfs based memory hot remove probe Issue: Offlining non-boot memory on arm64 via /sys/devices/system/memory//state doesn't eliminate the struct page memory associated with the offlined memory. As memory is offlined, total and free memory decrease, but the memory associated with struct page isn't given back and is reported as 'used' memory instead. This is because offlining via the sysfs 'state' probe doesn't remove the memmap associated with the memory to be offlined. Fix: Expose a sysfs probe that also removes the memmap associated with the memory block after offlining it. The exposed probe accepts the physical address of the memory block to be removed. Signed-off-by: Rohit Wali --- arch/arm64/Kconfig | 9 +++++++++ drivers/base/memory.c | 31 +++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 2163fe52c8b1b..e04dfa6b2ba88 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -337,6 +337,15 @@ config ARCH_MEMORY_PROBE for more information. If you are unsure how to answer this question, answer N. +config ARCH_MEMORY_REMOVE + bool "Enable /sys/devices/system/memory/remove interface" + depends on MEMORY_HOTREMOVE + help + This option enables a sysfs /sys/devices/system/memory/remove + interface for testing. See Documentation/memory-hotplug.txt + for more information. If you are unsure how to answer this + question, answer N. + config SMP def_bool y diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 5d39f3e374dae..5b080c6ca90db 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -537,6 +537,34 @@ static ssize_t probe_store(struct device *dev, struct device_attribute *attr, static DEVICE_ATTR_WO(probe); #endif +#ifdef CONFIG_ARCH_MEMORY_REMOVE +static ssize_t remove_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + u64 phys_addr; + int nid, ret; + unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block; + + ret = kstrtoull(buf, 0, &phys_addr); + if (ret) + return ret; + + if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1)) + return -EINVAL; + + nid = memory_add_physaddr_to_nid(phys_addr); + ret = offline_and_remove_memory(nid, phys_addr, MIN_MEMORY_BLOCK_SIZE * sections_per_block); + + if (ret) + return ret; + + return count; +} + +static DEVICE_ATTR_WO(remove); +#endif + + #ifdef CONFIG_MEMORY_FAILURE /* * Support for offlining pages of memory */ @@ -885,6 +913,9 @@ static struct attribute *memory_root_attrs[] = { #ifdef CONFIG_ARCH_MEMORY_PROBE &dev_attr_probe.attr, #endif +#ifdef CONFIG_ARCH_MEMORY_REMOVE + &dev_attr_remove.attr, +#endif #ifdef CONFIG_MEMORY_FAILURE &dev_attr_soft_offline_page.attr, From 552804aa55b01be06d162c6ff62f965beb339784 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 31 Dec 2021 00:26:45 +0000 Subject: [PATCH 004/175] memory: fix offline_and_remove_memory use Since commit e1c158e495661 ("mm/memory_hotplug: remove nid parameter from remove_memory() and friends"), offline_and_remove_memory() no longer takes a node id argument. Adapt.
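To make the change concrete, a sketch of the prototype before and after that commit (illustrative only; the authoritative declarations live in include/linux/memory_hotplug.h, and both call forms appear in the diff below):

/* before commit e1c158e495661: the caller supplied the node id */
int offline_and_remove_memory(int nid, u64 start, u64 size);

/* after: the node id is derived internally, callers pass only the range */
int offline_and_remove_memory(u64 start, u64 size);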
Signed-off-by: Frank van der Linden --- drivers/base/memory.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 5b080c6ca90db..35b99f493b883 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -542,7 +542,7 @@ static ssize_t remove_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { u64 phys_addr; - int nid, ret; + int ret; unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block; ret = kstrtoull(buf, 0, &phys_addr); @@ -552,8 +552,7 @@ static ssize_t remove_store(struct device *dev, struct device_attribute *attr, if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1)) return -EINVAL; - nid = memory_add_physaddr_to_nid(phys_addr); - ret = offline_and_remove_memory(nid, phys_addr, MIN_MEMORY_BLOCK_SIZE * sections_per_block); + ret = offline_and_remove_memory(phys_addr, MIN_MEMORY_BLOCK_SIZE * sections_per_block); if (ret) return ret; From f77963b78a99fabeda67c10281732466536b9c5a Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Thu, 9 Dec 2021 20:41:11 +0000 Subject: [PATCH 005/175] drivers/base/memory: use MHP_MEMMAP_ON_MEMORY from the probe interface If it is possible to use MHP_MEMMAP_ON_MEMORY from the probe interface, which should normally be the case, do so. Signed-off-by: Frank van der Linden --- drivers/base/memory.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 35b99f493b883..fba7cafdfa3cb 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -505,9 +505,10 @@ static DEVICE_ATTR_RW(auto_online_blocks); static ssize_t probe_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - u64 phys_addr; + u64 phys_addr, size; int nid, ret; unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block; + mhp_t mhp_flags; ret = kstrtoull(buf, 0, &phys_addr); if (ret) @@ -520,10 +521,12 @@ static ssize_t probe_store(struct device *dev, struct device_attribute *attr, if (ret) return ret; + size = MIN_MEMORY_BLOCK_SIZE * sections_per_block; + mhp_flags = mhp_supports_memmap_on_memory(size) ? + MHP_MEMMAP_ON_MEMORY : MHP_NONE; + nid = memory_add_physaddr_to_nid(phys_addr); - ret = __add_memory(nid, phys_addr, - MIN_MEMORY_BLOCK_SIZE * sections_per_block, - MHP_NONE); + ret = __add_memory(nid, phys_addr, size, mhp_flags); if (ret) goto out; From 641c0c2a38b62f5cb2844a2f20e2eb193793f857 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Thu, 6 Jan 2022 19:05:17 +0000 Subject: [PATCH 006/175] mm: add offline page reporting interface Add an interface to report offlined pages as free to the hypervisor. Define a new entry point for page reporting drivers, report_offline. If a driver sets it, it will be called after a range of memory has been offlined. This is done separately, and not with a memory notifier, since with memmap_on_memory, there are pages that are only freed outside of offline_pages, where the notifiers are called. Since this will be called asynchronously (e.g. not from the page reporting work queues), protect it with the page reporting mutex so that a driver can't be unloaded while calling the entry point. 
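For illustration, a minimal driver-side sketch of how the new hook is meant to be used (hypothetical driver code, not part of this patch; a later patch in this series wires up virtio-balloon in this way):

static int my_report_offline(struct page_reporting_dev_info *prdev,
			     unsigned long start_pfn, unsigned int nr_pages)
{
	/* hand the range [start_pfn, start_pfn + nr_pages) to the hypervisor */
	return 0;
}

	/* at probe time, before calling page_reporting_register(prdev): */
	prdev->report_offline = my_report_offline;	/* optional, may stay NULL */

Because page_report_offline() only invokes the callback while holding page_reporting_mutex, the driver cannot be unregistered while the callback runs.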
Signed-off-by: Frank van der Linden --- drivers/base/memory.c | 6 ++++++ include/linux/page_reporting.h | 4 ++++ mm/page_reporting.c | 13 +++++++++++++ 3 files changed, 23 insertions(+) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index fba7cafdfa3cb..48d918d846bc3 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -261,6 +262,11 @@ static int memory_block_offline(struct memory_block *mem) mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages); mem->zone = NULL; + +#ifdef CONFIG_PAGE_REPORTING + page_report_offline(start_pfn, nr_pages); +#endif + out: mem_hotplug_done(); return ret; diff --git a/include/linux/page_reporting.h b/include/linux/page_reporting.h index fe648dfa3a7ca..8ca7e623e3146 100644 --- a/include/linux/page_reporting.h +++ b/include/linux/page_reporting.h @@ -12,6 +12,8 @@ struct page_reporting_dev_info { /* function that alters pages to make them "reported" */ int (*report)(struct page_reporting_dev_info *prdev, struct scatterlist *sg, unsigned int nents); + int (*report_offline)(struct page_reporting_dev_info *prdev, + unsigned long start_pfn, unsigned int nr_pages); /* work struct for processing reports */ struct delayed_work work; @@ -23,6 +25,8 @@ struct page_reporting_dev_info { unsigned int order; }; +void page_report_offline(unsigned long start_pfn, unsigned int nr_pages); + /* Tear-down and bring-up for page reporting devices */ void page_reporting_unregister(struct page_reporting_dev_info *prdev); int page_reporting_register(struct page_reporting_dev_info *prdev); diff --git a/mm/page_reporting.c b/mm/page_reporting.c index 382958eef8a92..2022508304400 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -317,6 +317,19 @@ static void page_reporting_process(struct work_struct *work) static DEFINE_MUTEX(page_reporting_mutex); DEFINE_STATIC_KEY_FALSE(page_reporting_enabled); +void page_report_offline(unsigned long start_pfn, unsigned int nr_pages) +{ + struct page_reporting_dev_info *prdev; + + mutex_lock(&page_reporting_mutex); + + prdev = rcu_access_pointer(pr_dev_info); + if (prdev && prdev->report_offline) + prdev->report_offline(prdev, start_pfn, nr_pages); + + mutex_unlock(&page_reporting_mutex); +} + int page_reporting_register(struct page_reporting_dev_info *prdev) { int err = 0; From 4229729e146a237774867b1f3d3beb312a9fb68b Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Thu, 6 Jan 2022 19:16:23 +0000 Subject: [PATCH 007/175] virtio: add hack to allow pre-mapped scatterlists When reporting offlined pages through free page reporting, and memmap_on_memory is active, we don't want to touch the page structures anymore, since that will lead to a reference to the range we just offlined, as the page structures themselves reside in the range. So, we can't use sg_phys to set the dma address. Instead, if sg_page is set to NULL, assume that sg_dma_address is set already, and use it. Signed-off-by: Frank van der Linden --- drivers/virtio/virtio_ring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 7d320f799ca1e..e81a2e2583cfc 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -360,7 +360,8 @@ static dma_addr_t vring_map_one_sg(const struct vring_virtqueue *vq, * depending on the direction. */ kmsan_handle_dma(sg_page(sg), sg->offset, sg->length, direction); - return (dma_addr_t)sg_phys(sg); + return sg_page(sg) == NULL ? 
sg_dma_address(sg) : + (dma_addr_t)sg_phys(sg); } /* From 80423d2b080f1286a234b46dab6e35f1033b4b6b Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 10 Dec 2021 19:07:04 +0000 Subject: [PATCH 008/175] virtio-balloon: optionally report offlined memory ranges A hack to report offlined memory ranges through virtio-balloon. Do this by registering a memory notifier callback for offlining, and then calling the normal free page reporting entry point to report the range that was just offlined. This is only active if the virtio_balloon.report_offline module parameter is set. Signed-off-by: Frank van der Linden --- drivers/virtio/virtio_balloon.c | 59 ++++++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index aa90bd0199d7e..372f8f5262fa7 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -18,6 +18,7 @@ #include #include #include +#include /* * Balloon device works in 4K page units. So each page is pointed to by @@ -39,6 +40,13 @@ (1 << (VIRTIO_BALLOON_HINT_BLOCK_ORDER + PAGE_SHIFT)) #define VIRTIO_BALLOON_HINT_BLOCK_PAGES (1 << VIRTIO_BALLOON_HINT_BLOCK_ORDER) +static bool report_offline = false; +module_param(report_offline, bool, 0444); +MODULE_PARM_DESC(report_offline, + "Report offlined pages to the hypervisor"); + +static DEFINE_MUTEX(vb_page_report_lock); + enum virtio_balloon_vq { VIRTIO_BALLOON_VQ_INFLATE, VIRTIO_BALLOON_VQ_DEFLATE, @@ -166,6 +174,15 @@ static int virtballoon_free_page_report(struct page_reporting_dev_info *pr_dev_i struct virtqueue *vq = vb->reporting_vq; unsigned int unused, err; + /* + * virtqueue callers must make sure that only one thread is + * using a queue. With offline page reporting enabled, multiple + * threads might be calling this function at the same time. + * + * So, make sure they don't get in each other's way. + */ + mutex_lock(&vb_page_report_lock); + /* We should always be able to add these buffers to an empty queue. */ err = virtqueue_add_inbuf(vq, sg, nents, vb, GFP_NOWAIT | __GFP_NOWARN); @@ -174,17 +191,55 @@ static int virtballoon_free_page_report(struct page_reporting_dev_info *pr_dev_i * are able to trigger an error we will simply display a warning * and exit without actually processing the pages. */ - if (WARN_ON_ONCE(err)) + if (WARN_ON_ONCE(err)) { + mutex_unlock(&vb_page_report_lock); return err; + } virtqueue_kick(vq); /* When host has read buffer, this completes via balloon_ack */ wait_event(vb->acked, virtqueue_get_buf(vq, &unused)); + mutex_unlock(&vb_page_report_lock); + return 0; } +/* + * Callback for memory offline. Takes the offlined range and passes it + * to the normal free page reporting entry point. + * + * Assumptions that are currently all true: + * + * 1) We're in a safe context to sleep. + * 2) The offlined range is <= a memory section (128M on x86, 1G on arm64), + * and so the length will fit in a 32bit field. + */ +static int virtioballoon_free_page_report_offline( + struct page_reporting_dev_info *pr_dev_info, + unsigned long start_pfn, unsigned int nr_pages) +{ + struct scatterlist sgl; + unsigned int len = nr_pages << PAGE_SHIFT; + int err; + + /* + * Set the page to NULL to signal a "pre-mapped" address, + * e.g. the virtio ring code will not touch the page + * structure and will just use the dma_address passed in. 
+ */ + sg_init_table(&sgl, 1); + sg_set_page(&sgl, NULL, len, 0); + sgl.dma_address = PFN_PHYS(start_pfn); + + err = virtballoon_free_page_report(pr_dev_info, &sgl, 1); + if (err) + pr_err("virtio_balloon: offline reporting failed (%d)\n", err); + + return err; +} + static void set_page_pfns(struct virtio_balloon *vb, __virtio32 pfns[], struct page *page) { @@ -957,6 +1012,8 @@ static int virtballoon_probe(struct virtio_device *vdev) } vb->pr_dev_info.report = virtballoon_free_page_report; + if (report_offline) + vb->pr_dev_info.report_offline = virtioballoon_free_page_report_offline; if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) { unsigned int capacity; From 73725e7c24536dae74875072dff157d99940fddd Mon Sep 17 00:00:00 2001 From: James Gowans Date: Fri, 17 Sep 2021 00:45:10 +0200 Subject: [PATCH 009/175] Introduce page touching DMA ops binding Allows enabling page touching via a kernel command line parameter. When enabled, devices which don't have an IOMMU assigned to them will be assigned the page touching DMA map ops which ensures that any memory mapped for DMA by that devices will be accessed by the CPU to make it resident. Signed-off-by: James Gowans Cc-Team: kaos-brimstone Cc-Team: ec2-memo --- MAINTAINERS | 7 ++ arch/arm64/mm/dma-mapping.c | 6 ++ include/linux/dma-page-touching.h | 39 +++++++++ kernel/dma/Kconfig | 10 +++ kernel/dma/Makefile | 1 + kernel/dma/page_touching.c | 134 ++++++++++++++++++++++++++++++ 6 files changed, 197 insertions(+) create mode 100644 include/linux/dma-page-touching.h create mode 100644 kernel/dma/page_touching.c diff --git a/MAINTAINERS b/MAINTAINERS index 4b19dfb5d2fd4..3ee119ef87b51 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15577,6 +15577,13 @@ F: Documentation/mm/page_table_check.rst F: include/linux/page_table_check.h F: mm/page_table_check.c +PAGE TOUCHING DMA +M: James Gowans +L: ec2-memo@amazon.com +S: Supported +F: include/linux/dma-page-touching.h +F: kernel/dma/page_touching.c + PANASONIC LAPTOP ACPI EXTRAS DRIVER M: Kenneth Chan L: platform-driver-x86@vger.kernel.org diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 5240f6acad648..6ed88059d8b4a 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -77,4 +78,9 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, iommu_setup_dma_ops(dev, dma_base, dma_base + size - 1); xen_setup_dma_ops(dev); + +#ifdef CONFIG_DMA_PAGE_TOUCHING + if (!dev->dma_ops) + setup_dma_page_touching_ops(dev); +#endif } diff --git a/include/linux/dma-page-touching.h b/include/linux/dma-page-touching.h new file mode 100644 index 0000000000000..8ff9856e994c9 --- /dev/null +++ b/include/linux/dma-page-touching.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2021 Amazon.com, Inc. or its affiliates. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * Sets the supplied device's DMA ops to the page toucing DMA ops if + * page touching is enabled and the device does not already have + * DMA ops assigned. + */ +void setup_dma_page_touching_ops(struct device *dev); diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 56866aaa2ae1a..8a6b0acb78e42 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -24,6 +24,16 @@ config DMA_OPS_BYPASS config ARCH_HAS_DMA_MAP_DIRECT bool +config DMA_PAGE_TOUCHING + bool "Support touching pages when allocated for DMA" + help + Builds in support for binding page touching DMA ops to devices which + don't have an IOMMU. Memory mapped for DMA by those devices will be + access by the CPU via the page touching dma_map_ops to ensure that + the memory is resident when running on a memory overcommit host. + The capacility must still be set up at boot time via the + page_touching.dma_page_touching_enable kernel command line param. + config NEED_SG_DMA_LENGTH bool diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile index 21926e46ef4fb..c552b9831f5b7 100644 --- a/kernel/dma/Makefile +++ b/kernel/dma/Makefile @@ -10,3 +10,4 @@ obj-$(CONFIG_SWIOTLB) += swiotlb.o obj-$(CONFIG_DMA_COHERENT_POOL) += pool.o obj-$(CONFIG_MMU) += remap.o obj-$(CONFIG_DMA_MAP_BENCHMARK) += map_benchmark.o +obj-$(CONFIG_DMA_PAGE_TOUCHING) += page_touching.o diff --git a/kernel/dma/page_touching.c b/kernel/dma/page_touching.c new file mode 100644 index 0000000000000..a9bb7901d769e --- /dev/null +++ b/kernel/dma/page_touching.c @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause +/* + * Copyright 2020 Amazon.com, Inc. or its affiliates. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include "direct.h" +#include + +/* + * A wrapper around dma_direct which does a readl on the memory being mapped + * for DMA to ensure that it becomes resident. + * Useful when running in a memory overcommit environment with lazy allocation + * and free page reporting. + */ + +/* + * Set with kernel cmd line param: + * page_touching.dma_page_touching_enable=y + */ +static bool dma_page_touching_enable __ro_after_init; +module_param_named(dma_page_touching_enable, dma_page_touching_enable, bool, 0400); +MODULE_PARM_DESC(dma_page_touching_enable, + "Touch pages allocated for DMA to ensure they are resident"); + +static void touch_each_page(void *start_addr, size_t size) +{ + int addr_offset; + + for (addr_offset = 0; addr_offset < size; addr_offset += PAGE_SIZE) + __raw_readl((char *)start_addr + addr_offset); +} + +static void *page_touching_dma_alloc(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp, + unsigned long attrs) +{ + char *kaddr = dma_direct_alloc(dev, size, dma_handle, gfp, attrs); + + if (!kaddr) + return NULL; + touch_each_page(kaddr, size); + return kaddr; + +} + +static dma_addr_t page_touching_dma_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + dma_addr_t dma_handle = dma_direct_map_page(dev, page, offset, size, dir, attrs); + + if (!(dma_mapping_error(dev, dma_handle))) + touch_each_page(page_to_virt(page) + offset, size); + return dma_handle; +} + +static int page_touching_dma_map_sg(struct device *dev, struct scatterlist *sglist, + int nents, enum dma_data_direction dir, + unsigned long attrs) +{ + struct scatterlist *sg; + int i, ret = dma_direct_map_sg(dev, sglist, nents, dir, attrs); + + if (!ret) + goto out; + + for_each_sg(sglist, sg, nents, i) + touch_each_page(page_to_virt(sg_page(sg)) + sg->offset, sg->length); + +out: + return ret; + +} + +/* + * Only a portion of the dma_map_ops interface is implemented here; enough for + * the EC2 ENA / NVMe drivers to work. + * Notibly missing is alloc_pages. 
+ */ +const static struct dma_map_ops page_touching_dma_ops = { + .alloc = page_touching_dma_alloc, + .free = dma_direct_free, + .mmap = dma_common_mmap, + .map_page = page_touching_dma_map_page, + .unmap_page = dma_direct_unmap_page, + .map_sg = page_touching_dma_map_sg, + .unmap_sg = dma_direct_unmap_sg, + .dma_supported = dma_direct_supported, + .sync_single_for_cpu = dma_direct_sync_single_for_cpu, + .sync_single_for_device = dma_direct_sync_single_for_device, + .sync_sg_for_cpu = dma_direct_sync_sg_for_cpu, + .dma_supported = dma_direct_supported, + .get_required_mask = dma_direct_get_required_mask, + .max_mapping_size = dma_direct_max_mapping_size, +}; + +void setup_dma_page_touching_ops(struct device *dev) +{ + if (!dma_page_touching_enable || dev->dma_ops) + return; + + dev_info(dev, "binding to page touching DMA ops\n"); + dev->dma_ops = &page_touching_dma_ops; +} From 23845a76b3757ee38a080b684ba659925ab7f8b9 Mon Sep 17 00:00:00 2001 From: Tighe Barris Date: Wed, 25 May 2022 16:43:55 +0000 Subject: [PATCH 010/175] Correct read overflow in page touching DMA ops binding To force a page into residence, a read operation is performed on behalf of devices without an IOMMU. This functionality is required to facilitate memory overcommitted hosts. Commit 25d4ce2 ("Introduce page touching DMA ops binding") initially introduced this logic by invoking a '__raw_readl' function. This function can however read past the bounds of memory mapped for DMA. Instead, it is replaced with '__raw_readb'. This limits the length of memory read to a byte, and prevents reading past the range of mapped memory. Fixes: 25d4ce2 ("Introduce page touching DMA ops binding") Signed-off-by: Tighe Barris Cc-Team: kaos-brimstone Cc-Team: ec2-memo --- kernel/dma/page_touching.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/dma/page_touching.c b/kernel/dma/page_touching.c index a9bb7901d769e..c5ffb90a40a51 100644 --- a/kernel/dma/page_touching.c +++ b/kernel/dma/page_touching.c @@ -36,7 +36,7 @@ #include /* - * A wrapper around dma_direct which does a readl on the memory being mapped + * A wrapper around dma_direct which does a readb on the memory being mapped * for DMA to ensure that it becomes resident. * Useful when running in a memory overcommit environment with lazy allocation * and free page reporting. @@ -56,7 +56,7 @@ static void touch_each_page(void *start_addr, size_t size) int addr_offset; for (addr_offset = 0; addr_offset < size; addr_offset += PAGE_SIZE) - __raw_readl((char *)start_addr + addr_offset); + __raw_readb((char *)start_addr + addr_offset); } static void *page_touching_dma_alloc(struct device *dev, size_t size, From 402072982f3237d7ae070e4c30b5c90d20465ab4 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 12 May 2021 13:26:18 +1000 Subject: [PATCH 011/175] x86: Disable KASLR when Xen is detected There's currently an issue with Xen and KASLR causing hibernation to break (and possibly kexec/kdump too). Until we have got to the bottom of this and fixed the root cause, let's disable KASLR at runtime when running on Xen instances so we can enable it for Nitro. This also adds a boot message to match ARM and help detect whether this test worked as expected. 
Signed-off-by: Benjamin Herrenschmidt --- arch/x86/boot/compressed/kaslr.c | 8 ++++++++ arch/x86/kernel/setup.c | 4 ++++ 2 files changed, 12 insertions(+) diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 9794d9174795d..9367905943bd3 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -1,3 +1,4 @@ + // SPDX-License-Identifier: GPL-2.0 /* * kaslr.c @@ -32,6 +33,9 @@ #include #include +/* xen_cpuid_base/hypervisor_cpuid_base inlines */ +#include + #define _SETUP #include /* For COMMAND_LINE_SIZE */ #undef _SETUP @@ -835,6 +839,10 @@ void choose_random_location(unsigned long input, warn("KASLR disabled: 'nokaslr' on cmdline."); return; } + if (xen_cpuid_base() != 0) { + warn("KASLR disabled: Xen hypervisor detected."); + return; + } boot_params_ptr->hdr.loadflags |= KASLR_FLAG; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 18a034613d94d..06352e9acf62e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -52,6 +52,7 @@ #include #include #include +#include #include /* @@ -885,6 +886,9 @@ void __init setup_arch(char **cmdline_p) printk(KERN_INFO "Command line: %s\n", boot_command_line); boot_cpu_data.x86_phys_bits = MAX_PHYSMEM_BITS; #endif +#ifdef CONFIG_RANDOMIZE_BASE + printk(KERN_INFO "KASLR %s\n", kaslr_enabled() ? "enabled" : "disabled"); +#endif /* * If we have OLPC OFW, we might end up relocating the fixmap due to From e8091e3fb7140699b5f9fcc999b971bf91aab9b5 Mon Sep 17 00:00:00 2001 From: Vladimir Aerov Date: Mon, 22 Feb 2021 15:56:51 -0800 Subject: [PATCH 012/175] arm64: Export acpi_psci_use_hvc() symbol Signed-off-by: Samuel Mendoza-Jonas --- arch/arm64/kernel/acpi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index a5a256e3f9fe4..c70010aff59ef 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -111,6 +111,7 @@ bool acpi_psci_use_hvc(void) { return acpi_gbl_FADT.arm_boot_flags & ACPI_FADT_PSCI_USE_HVC; } +EXPORT_SYMBOL_GPL(acpi_psci_use_hvc); /* * acpi_fadt_sanity_check() - Check FADT presence and carry out sanity From 4210a4170a4679451251497266d189114bb5be12 Mon Sep 17 00:00:00 2001 From: Vladimir Aerov Date: Mon, 22 Feb 2021 16:01:09 -0800 Subject: [PATCH 013/175] hwrng: Add Graviton RNG driver Signed-off-by: Samuel Mendoza-Jonas --- drivers/char/hw_random/Kconfig | 13 ++ drivers/char/hw_random/Makefile | 1 + drivers/char/hw_random/graviton-rng.c | 175 ++++++++++++++++++++++ 3 files changed, 189 insertions(+) create mode 100644 drivers/char/hw_random/graviton-rng.c diff --git a/drivers/char/hw_random/Kconfig b/drivers/char/hw_random/Kconfig index 3da8e85f8aae0..e6a50a84a8b7c 100644 --- a/drivers/char/hw_random/Kconfig +++ b/drivers/char/hw_random/Kconfig @@ -549,6 +549,19 @@ config HW_RANDOM_CN10K To compile this driver as a module, choose M here. The module will be called cn10k_rng. If unsure, say Y. +config HW_RANDOM_GRAVITON + tristate "AWS Graviton Random Number Generator support" + depends on HW_RANDOM && ACPI && (ARM64 || COMPILE_TEST) + default HW_RANDOM + help + This driver provides kernel-side support for the Random Number + Generator SMC found on AWS Graviton systems. + + To compile this driver as a module, choose M here: the + module will be called graviton-rng. + + If unsure, say Y.
+ endif # HW_RANDOM config UML_RANDOM diff --git a/drivers/char/hw_random/Makefile b/drivers/char/hw_random/Makefile index 3e948cf044762..399eea6b29a1e 100644 --- a/drivers/char/hw_random/Makefile +++ b/drivers/char/hw_random/Makefile @@ -47,3 +47,4 @@ obj-$(CONFIG_HW_RANDOM_XIPHERA) += xiphera-trng.o obj-$(CONFIG_HW_RANDOM_ARM_SMCCC_TRNG) += arm_smccc_trng.o obj-$(CONFIG_HW_RANDOM_CN10K) += cn10k-rng.o obj-$(CONFIG_HW_RANDOM_POLARFIRE_SOC) += mpfs-rng.o +obj-$(CONFIG_HW_RANDOM_GRAVITON) += graviton-rng.o diff --git a/drivers/char/hw_random/graviton-rng.c b/drivers/char/hw_random/graviton-rng.c new file mode 100644 index 0000000000000..3a8f3fe35359b --- /dev/null +++ b/drivers/char/hw_random/graviton-rng.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AWS Graviton TRNG driver + * + * Copyright (C) 2019 Amazon Corp. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ARM_SMCCC_SIP_GRAVITON_FEATURE_PROBE \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_SIP, 0x00ff) +#define AWS_GRAVITON_UUID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_SIP, 0xFF01) +#define AWS_GRAVITON_GET_VER \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_SIP, 0xFF03) + +#define AWS_GRAVITON_GET_RND \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_SIP, 0x60) +#define AWS_GRAVITON_GET_RND_LEGACY \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_SIP, 0x60) + +/** + * UID of the Graviton TRNG API: eb4af8a0-89d4-49c9-bc8c5b38dc54308e + */ +#define GRVTN_TRNG_UUID_0 0xa0f83aeb +#define GRVTN_TRNG_UUID_1 0xc949d489 +#define GRVTN_TRNG_UUID_2 0x385b8cbc +#define GRVTN_TRNG_UUID_3 0x8e3054dc + +struct grvtn_rng { + u64 call_id; + struct hwrng rng; +}; + +static void grvtn_smccc_conduit(u64 call_id, struct arm_smccc_res *res) +{ + if (acpi_psci_use_hvc()) + arm_smccc_1_1_hvc(call_id, res); + else + arm_smccc_1_1_smc(call_id, res); +} + +static int grvtn_probe_sip_feature(unsigned long feature) +{ + struct arm_smccc_res res = {}; + + if (acpi_psci_use_hvc()) + arm_smccc_1_1_hvc(ARM_SMCCC_SIP_GRAVITON_FEATURE_PROBE, + feature, 0, &res); + else + arm_smccc_1_1_smc(ARM_SMCCC_SIP_GRAVITON_FEATURE_PROBE, + feature, 0, &res); + + return res.a0; +} + +static int grvtn_trng_read(struct hwrng *rng, void *buf, size_t max, bool wait) +{ + struct grvtn_rng *priv = (struct grvtn_rng *)rng->priv; + struct arm_smccc_res res; + int err = 0; + /* timeout after one waiting period */ + int iter_remain = 2; + size_t count = max > sizeof(ulong) * 2 ? 
sizeof(ulong) * 2 : max; + size_t total = count; + + do { + if (err && wait) + /* Nominal wait is 5us */ + udelay(err); + + grvtn_smccc_conduit(priv->call_id, &res); + + /* In the unlikely event of rolling back to legacy after probe was issued */ + if (unlikely((res.a0 == SMCCC_RET_NOT_SUPPORTED) && (priv->call_id != AWS_GRAVITON_GET_RND_LEGACY))) { + grvtn_smccc_conduit(AWS_GRAVITON_GET_RND_LEGACY, &res); + priv->call_id = AWS_GRAVITON_GET_RND_LEGACY; + } + + err = (int) res.a0; + + if (err < 0) + return err; + + iter_remain--; + } while (iter_remain && err && wait); + + if (err) + return 0; + + if (count > sizeof(ulong)) { + memcpy(buf, &res.a1, sizeof(ulong)); + count -= sizeof(ulong); + buf += sizeof(ulong); + } + memcpy(buf, &res.a2, count); + return total; +} + +static int grvtn_trng_probe(struct platform_device *pdev) +{ + int version; + int err; + struct arm_smccc_res res; + struct grvtn_rng *priv; + + priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->rng.name = "graviton"; + priv->rng.read = grvtn_trng_read; + priv->rng.priv = (unsigned long)priv; + priv->rng.quality = 1024; /* all bits are sourced from a HW TRNG */ + priv->call_id = AWS_GRAVITON_GET_RND_LEGACY; /* default mode is legacy */ + + grvtn_smccc_conduit(AWS_GRAVITON_UUID, &res); + + if (res.a0 != GRVTN_TRNG_UUID_0 || res.a1 != GRVTN_TRNG_UUID_1 || + res.a2 != GRVTN_TRNG_UUID_2 || res.a3 != GRVTN_TRNG_UUID_3) { + dev_err(&pdev->dev, "failed to match UUID\n"); + return -ENXIO; + } + + grvtn_smccc_conduit(AWS_GRAVITON_GET_VER, &res); + dev_info(&pdev->dev, "Graviton TRNG, SMC version %d.%d\n", + (u32)res.a0, (u32)res.a1); + + version = grvtn_probe_sip_feature(AWS_GRAVITON_GET_RND); + if (version > 0) + priv->call_id = AWS_GRAVITON_GET_RND; + + platform_set_drvdata(pdev, priv); + err = devm_hwrng_register(&pdev->dev, &priv->rng); + if (err) + dev_err(&pdev->dev, "failed to register hwrng"); + return err; +} + +static const struct acpi_device_id grvtn_trng_acpi_match[] = { + { "AMZN0010", }, + {} +}; + +MODULE_DEVICE_TABLE(acpi, grvtn_trng_acpi_match); + +static struct platform_driver grvtn_trng_driver = { + .probe = grvtn_trng_probe, + .driver = { + .name = "graviton-rng", + .owner = THIS_MODULE, + .acpi_match_table = ACPI_PTR(grvtn_trng_acpi_match), + }, +}; + +module_platform_driver(grvtn_trng_driver); + +MODULE_AUTHOR("Amazon.com, Inc. or it's affiliates"); +MODULE_DESCRIPTION("Graviton TRNG driver"); +MODULE_LICENSE("GPL v2"); From 8bf056144ce4bd1b241361f751096300d72e30c9 Mon Sep 17 00:00:00 2001 From: Vallish Vaidyeshwara Date: Mon, 12 Feb 2018 22:29:56 +0000 Subject: [PATCH 014/175] drivers: introduce AMAZON_DRIVER_UPDATES This provides a central place to maintain out-of-tree drivers. Renamed from VENDOR_AMAZON because the name was no longer appropriate. 
Signed-off-by: Munehisa Kamata Reviewed-by: Cristian Gafton Reviewed-by: Guru Anbalagane Reviewed-by: Eduardo Valentin Reviewed-by: Anchal Agarwal Signed-off-by: Vallish Vaidyeshwara --- drivers/Kconfig | 1 + drivers/Makefile | 1 + drivers/amazon/Kconfig | 15 +++++++++++++++ drivers/amazon/Makefile | 3 +++ 4 files changed, 20 insertions(+) create mode 100644 drivers/amazon/Kconfig create mode 100644 drivers/amazon/Makefile diff --git a/drivers/Kconfig b/drivers/Kconfig index 19ee995bd0ae1..ab32f94d6304c 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -239,4 +239,5 @@ source "drivers/peci/Kconfig" source "drivers/hte/Kconfig" +source "drivers/amazon/Kconfig" endmenu diff --git a/drivers/Makefile b/drivers/Makefile index bdf1c66141c9b..b44c6599aea35 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -189,3 +189,4 @@ obj-$(CONFIG_COUNTER) += counter/ obj-$(CONFIG_MOST) += most/ obj-$(CONFIG_PECI) += peci/ obj-$(CONFIG_HTE) += hte/ +obj-$(CONFIG_AMAZON_DRIVER_UPDATES) += amazon/ diff --git a/drivers/amazon/Kconfig b/drivers/amazon/Kconfig new file mode 100644 index 0000000000000..7cc44c84699e8 --- /dev/null +++ b/drivers/amazon/Kconfig @@ -0,0 +1,15 @@ +# +# Amazon driver updates configuration +# + +config AMAZON_DRIVER_UPDATES + bool "Amazon Driver Updates" + default y + depends on PCI || EXPERIMENTAL + ---help--- + Amazon driver updates includes out-of-tree drivers and/or modifeid + versions of the drivers present in the stable kernel tree. + +if AMAZON_DRIVER_UPDATES + +endif # AMAZON_DRIVER_UPDATES diff --git a/drivers/amazon/Makefile b/drivers/amazon/Makefile new file mode 100644 index 0000000000000..6b4996dcbe52f --- /dev/null +++ b/drivers/amazon/Makefile @@ -0,0 +1,3 @@ +# +# Amazon Driver Updates +# From 58d03a1bf3e67c9c8a80021587a429601e8a32db Mon Sep 17 00:00:00 2001 From: Shaoying Xu Date: Tue, 10 Jan 2023 21:07:08 +0000 Subject: [PATCH 015/175] drivers/amazon: import 5.15 drivers EFA: driver version 2.1.0 (4929d243c60b) ENA: driver version 2.8.0 (a03137e9f3c8) igb_uio: add (adc137d42e4b) --- drivers/amazon/Kconfig | 29 +- drivers/amazon/Makefile | 1 + drivers/amazon/net/Makefile | 6 + drivers/amazon/net/efa/Makefile | 12 + drivers/amazon/net/efa/config.h | 52 + drivers/amazon/net/efa/efa-abi.h | 135 + drivers/amazon/net/efa/efa.h | 315 ++ drivers/amazon/net/efa/efa_admin_cmds_defs.h | 1013 ++++ drivers/amazon/net/efa/efa_admin_defs.h | 175 + drivers/amazon/net/efa/efa_com.c | 1251 +++++ drivers/amazon/net/efa/efa_com.h | 181 + drivers/amazon/net/efa/efa_com_cmd.c | 801 +++ drivers/amazon/net/efa/efa_com_cmd.h | 322 ++ drivers/amazon/net/efa/efa_common_defs.h | 31 + drivers/amazon/net/efa/efa_gdr.c | 251 + drivers/amazon/net/efa/efa_io_defs.h | 289 + drivers/amazon/net/efa/efa_main.c | 889 ++++ drivers/amazon/net/efa/efa_neuron.c | 176 + drivers/amazon/net/efa/efa_p2p.c | 121 + drivers/amazon/net/efa/efa_p2p.h | 57 + drivers/amazon/net/efa/efa_regs_defs.h | 101 + drivers/amazon/net/efa/efa_sysfs.c | 62 + drivers/amazon/net/efa/efa_sysfs.h | 15 + drivers/amazon/net/efa/efa_verbs.c | 3022 +++++++++++ drivers/amazon/net/efa/kcompat.h | 243 + drivers/amazon/net/efa/neuron_p2p.h | 43 + drivers/amazon/net/efa/nv-p2p.h | 439 ++ drivers/amazon/net/ena/Makefile | 20 + drivers/amazon/net/ena/dim.c | 82 + drivers/amazon/net/ena/dim.h | 338 ++ drivers/amazon/net/ena/ena_admin_defs.h | 1324 +++++ drivers/amazon/net/ena/ena_com.c | 3243 ++++++++++++ drivers/amazon/net/ena/ena_com.h | 1127 ++++ drivers/amazon/net/ena/ena_common_defs.h | 21 + drivers/amazon/net/ena/ena_devlink.c | 304 
++ drivers/amazon/net/ena/ena_devlink.h | 45 + drivers/amazon/net/ena/ena_eth_com.c | 646 +++ drivers/amazon/net/ena/ena_eth_com.h | 232 + drivers/amazon/net/ena/ena_eth_io_defs.h | 390 ++ drivers/amazon/net/ena/ena_ethtool.c | 1328 +++++ drivers/amazon/net/ena/ena_lpc.c | 299 ++ drivers/amazon/net/ena/ena_lpc.h | 38 + drivers/amazon/net/ena/ena_netdev.c | 4980 ++++++++++++++++++ drivers/amazon/net/ena/ena_netdev.h | 618 +++ drivers/amazon/net/ena/ena_pci_id_tbl.h | 45 + drivers/amazon/net/ena/ena_phc.c | 246 + drivers/amazon/net/ena/ena_phc.h | 43 + drivers/amazon/net/ena/ena_regs_defs.h | 140 + drivers/amazon/net/ena/ena_sysfs.c | 70 + drivers/amazon/net/ena/ena_sysfs.h | 28 + drivers/amazon/net/ena/ena_xdp.c | 977 ++++ drivers/amazon/net/ena/ena_xdp.h | 231 + drivers/amazon/net/ena/kcompat.h | 987 ++++ drivers/amazon/net/ena/net_dim.c | 245 + drivers/amazon/net/igb_uio/Makefile | 1 + drivers/amazon/net/igb_uio/compat.h | 154 + drivers/amazon/net/igb_uio/igb_uio.c | 674 +++ 57 files changed, 28907 insertions(+), 1 deletion(-) create mode 100644 drivers/amazon/net/Makefile create mode 100644 drivers/amazon/net/efa/Makefile create mode 100644 drivers/amazon/net/efa/config.h create mode 100644 drivers/amazon/net/efa/efa-abi.h create mode 100644 drivers/amazon/net/efa/efa.h create mode 100644 drivers/amazon/net/efa/efa_admin_cmds_defs.h create mode 100644 drivers/amazon/net/efa/efa_admin_defs.h create mode 100644 drivers/amazon/net/efa/efa_com.c create mode 100644 drivers/amazon/net/efa/efa_com.h create mode 100644 drivers/amazon/net/efa/efa_com_cmd.c create mode 100644 drivers/amazon/net/efa/efa_com_cmd.h create mode 100644 drivers/amazon/net/efa/efa_common_defs.h create mode 100644 drivers/amazon/net/efa/efa_gdr.c create mode 100644 drivers/amazon/net/efa/efa_io_defs.h create mode 100644 drivers/amazon/net/efa/efa_main.c create mode 100644 drivers/amazon/net/efa/efa_neuron.c create mode 100644 drivers/amazon/net/efa/efa_p2p.c create mode 100644 drivers/amazon/net/efa/efa_p2p.h create mode 100644 drivers/amazon/net/efa/efa_regs_defs.h create mode 100644 drivers/amazon/net/efa/efa_sysfs.c create mode 100644 drivers/amazon/net/efa/efa_sysfs.h create mode 100644 drivers/amazon/net/efa/efa_verbs.c create mode 100644 drivers/amazon/net/efa/kcompat.h create mode 100644 drivers/amazon/net/efa/neuron_p2p.h create mode 100644 drivers/amazon/net/efa/nv-p2p.h create mode 100644 drivers/amazon/net/ena/Makefile create mode 100644 drivers/amazon/net/ena/dim.c create mode 100644 drivers/amazon/net/ena/dim.h create mode 100644 drivers/amazon/net/ena/ena_admin_defs.h create mode 100644 drivers/amazon/net/ena/ena_com.c create mode 100644 drivers/amazon/net/ena/ena_com.h create mode 100755 drivers/amazon/net/ena/ena_common_defs.h create mode 100644 drivers/amazon/net/ena/ena_devlink.c create mode 100644 drivers/amazon/net/ena/ena_devlink.h create mode 100644 drivers/amazon/net/ena/ena_eth_com.c create mode 100644 drivers/amazon/net/ena/ena_eth_com.h create mode 100755 drivers/amazon/net/ena/ena_eth_io_defs.h create mode 100644 drivers/amazon/net/ena/ena_ethtool.c create mode 100644 drivers/amazon/net/ena/ena_lpc.c create mode 100644 drivers/amazon/net/ena/ena_lpc.h create mode 100644 drivers/amazon/net/ena/ena_netdev.c create mode 100644 drivers/amazon/net/ena/ena_netdev.h create mode 100755 drivers/amazon/net/ena/ena_pci_id_tbl.h create mode 100644 drivers/amazon/net/ena/ena_phc.c create mode 100644 drivers/amazon/net/ena/ena_phc.h create mode 100755 drivers/amazon/net/ena/ena_regs_defs.h create mode 100755 
drivers/amazon/net/ena/ena_sysfs.c create mode 100755 drivers/amazon/net/ena/ena_sysfs.h create mode 100644 drivers/amazon/net/ena/ena_xdp.c create mode 100644 drivers/amazon/net/ena/ena_xdp.h create mode 100644 drivers/amazon/net/ena/kcompat.h create mode 100644 drivers/amazon/net/ena/net_dim.c create mode 100644 drivers/amazon/net/igb_uio/Makefile create mode 100644 drivers/amazon/net/igb_uio/compat.h create mode 100644 drivers/amazon/net/igb_uio/igb_uio.c diff --git a/drivers/amazon/Kconfig b/drivers/amazon/Kconfig index 7cc44c84699e8..2012cb50eb2a1 100644 --- a/drivers/amazon/Kconfig +++ b/drivers/amazon/Kconfig @@ -6,10 +6,37 @@ config AMAZON_DRIVER_UPDATES bool "Amazon Driver Updates" default y depends on PCI || EXPERIMENTAL - ---help--- + help Amazon driver updates includes out-of-tree drivers and/or modifeid versions of the drivers present in the stable kernel tree. if AMAZON_DRIVER_UPDATES +config AMAZON_ENA_ETHERNET + tristate "Elastic Network Adapter (ENA) support" + depends on PCI_MSI && !ENA_ETHERNET + help + This driver supports Elastic Network Adapter (ENA) + + To compile this driver as a module, choose M here. + The module will be called ena. + +config AMAZON_EFA_INFINIBAND + tristate "Elastic Fabric Adapter (EFA) support" + depends on INFINIBAND_USER_ACCESS && AMAZON_ENA_ETHERNET + help + This driver support Elastic Fabric Adapter (EFA) + + To compile this driver as a module, choose M here. + The module will be called efa + +config AMAZON_IGB_UIO + tristate "DPDK igb_uio driver" + help + This is the direct PCI access driver for igb and + other PCI network devices, for DPDK. + + To compile this driver as a module, choose M here. + The module will be called igb_uio. + endif # AMAZON_DRIVER_UPDATES diff --git a/drivers/amazon/Makefile b/drivers/amazon/Makefile index 6b4996dcbe52f..fc5f70dd7487d 100644 --- a/drivers/amazon/Makefile +++ b/drivers/amazon/Makefile @@ -1,3 +1,4 @@ # # Amazon Driver Updates # +obj-$(CONFIG_AMAZON_DRIVER_UPDATES) += net/ diff --git a/drivers/amazon/net/Makefile b/drivers/amazon/net/Makefile new file mode 100644 index 0000000000000..7eb6f214798ee --- /dev/null +++ b/drivers/amazon/net/Makefile @@ -0,0 +1,6 @@ +# +# Amazon Driver Updates +# +obj-$(CONFIG_AMAZON_ENA_ETHERNET) += ena/ +obj-$(CONFIG_AMAZON_EFA_INFINIBAND) += efa/ +obj-$(CONFIG_AMAZON_IGB_UIO) += igb_uio/ diff --git a/drivers/amazon/net/efa/Makefile b/drivers/amazon/net/efa/Makefile new file mode 100644 index 0000000000000..4399f594a93bf --- /dev/null +++ b/drivers/amazon/net/efa/Makefile @@ -0,0 +1,12 @@ +# +# Makefile for the Elastic Fabric Adapter (EFA) device drivers. +# EFA Source is: https://github.com/amzn/amzn-drivers. 
+ +obj-$(CONFIG_AMAZON_EFA_INFINIBAND) += efa.o + +efa-y := efa_com.o efa_com_cmd.o efa_gdr.o efa_main.o efa_neuron.o efa_p2p.o +efa-y += efa_verbs.o + +efa-$(CONFIG_SYSFS) += efa_sysfs.o + +ccflags-y += -include $(srctree)/drivers/amazon/net/efa/config.h diff --git a/drivers/amazon/net/efa/config.h b/drivers/amazon/net/efa/config.h new file mode 100644 index 0000000000000..96c10dfc11d69 --- /dev/null +++ b/drivers/amazon/net/efa/config.h @@ -0,0 +1,52 @@ +#define HAVE_UMEM_SCATTERLIST_IF 1 +#define HAVE_CREATE_CQ_ATTR 1 +#define HAVE_CREATE_AH_RDMA_ATTR 1 +#define HAVE_DEV_PARENT 1 +#define HAVE_POST_CONST_WR 1 +#define HAVE_MAX_SEND_RCV_SGE 1 +#define HAVE_IB_MODIFY_QP_IS_OK_FOUR_PARAMS 1 +#define HAVE_IB_DEV_OPS 1 +#define HAVE_PD_CORE_ALLOCATION 1 +#define HAVE_UCONTEXT_CORE_ALLOCATION 1 +#define HAVE_NO_KVERBS_DRIVERS 1 +#define HAVE_UDATA_TO_DRV_CONTEXT 1 +#define HAVE_SAFE_IB_ALLOC_DEVICE 1 +#define HAVE_AH_CORE_ALLOCATION 1 +#define HAVE_ALLOC_PD_NO_UCONTEXT 1 +#define HAVE_DEREG_MR_UDATA 1 +#define HAVE_DESTROY_CQ_UDATA 1 +#define HAVE_DESTROY_QP_UDATA 1 +#define HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE 1 +#define HAVE_UPSTREAM_EFA 1 +#define HAVE_IB_DEVICE_OPS_COMMON 1 +#define HAVE_CQ_CORE_ALLOCATION 1 +#define HAVE_IB_PORT_PHYS_STATE_LINK_UP 1 +#define HAVE_KVZALLOC 1 +#define HAVE_IBDEV_PRINT_RATELIMITED 1 +#define HAVE_IBDEV_PRINT 1 +#define HAVE_IB_QPT_DRIVER 1 +#define HAVE_IB_IS_UDATA_CLEARED 1 +#define HAVE_IB_MR_LENGTH 1 +#define HAVE_PCI_VENDOR_ID_AMAZON 1 +#define HAVE_IB_UMEM_GET_NO_DMASYNC 1 +#define HAVE_CORE_MMAP_XA 1 +#define HAVE_RDMA_NODE_UNSPECIFIED 1 +#define HAVE_BITFIELD_H 1 +#define HAVE_IB_UMEM_GET_DEVICE_PARAM 1 +#define HAVE_IB_ACCESS_OPTIONAL 1 +#define HAVE_CREATE_AH_INIT_ATTR 1 +#define HAVE_ATOMIC64_FETCH_INC 1 +#define HAVE_DEALLOC_PD_UDATA_RC 1 +#define HAVE_AH_CORE_ALLOCATION_DESTROY_RC 1 +#define HAVE_IB_INT_DESTROY_CQ 1 +#define HAVE_RDMA_UMEM_FOR_EACH_DMA_BLOCK 1 +#define HAVE_IB_UMEM_NUM_DMA_BLOCKS 1 +#define HAVE_IB_REGISTER_DEVICE_DMA_DEVICE_PARAM 1 +#define HAVE_UVERBS_CMD_MASK_NOT_NEEDED 1 +#define HAVE_U32_PORT 1 +#define HAVE_SPLIT_STATS_ALLOC 1 +#define HAVE_SYSFS_EMIT 1 +#define HAVE_XARRAY 1 +#define HAVE_QP_CORE_ALLOCATION 1 +#define HAVE_MR_DMABUF 1 +#define HAVE_EFA_P2P 1 \ No newline at end of file diff --git a/drivers/amazon/net/efa/efa-abi.h b/drivers/amazon/net/efa/efa-abi.h new file mode 100644 index 0000000000000..163ac79556d68 --- /dev/null +++ b/drivers/amazon/net/efa/efa-abi.h @@ -0,0 +1,135 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */ +/* + * Copyright 2018-2022 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef EFA_ABI_USER_H +#define EFA_ABI_USER_H + +#include + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. + */ +#define EFA_UVERBS_ABI_VERSION 1 + +/* + * Keep structs aligned to 8 bytes. + * Keep reserved fields as arrays of __u8 named reserved_XXX where XXX is the + * hex bit offset of the field. 
+ */ + +enum { + EFA_ALLOC_UCONTEXT_CMD_COMP_TX_BATCH = 1 << 0, + EFA_ALLOC_UCONTEXT_CMD_COMP_MIN_SQ_WR = 1 << 1, +}; + +struct efa_ibv_alloc_ucontext_cmd { + __u32 comp_mask; + __u8 reserved_20[4]; +}; + +enum efa_ibv_user_cmds_supp_udata { + EFA_USER_CMDS_SUPP_UDATA_QUERY_DEVICE = 1 << 0, + EFA_USER_CMDS_SUPP_UDATA_CREATE_AH = 1 << 1, +}; + +struct efa_ibv_alloc_ucontext_resp { + __u32 comp_mask; + __u32 cmds_supp_udata_mask; + __u16 sub_cqs_per_cq; + __u16 inline_buf_size; + __u32 max_llq_size; /* bytes */ + __u16 max_tx_batch; /* units of 64 bytes */ + __u16 min_sq_wr; + __u8 reserved_a0[4]; +}; + +struct efa_ibv_alloc_pd_resp { + __u32 comp_mask; + __u16 pdn; + __u8 reserved_30[2]; +}; + +enum { + EFA_CREATE_CQ_WITH_COMPLETION_CHANNEL = 1 << 0, + EFA_CREATE_CQ_WITH_SGID = 1 << 1, +}; + +struct efa_ibv_create_cq { + __u32 comp_mask; + __u32 cq_entry_size; + __u16 num_sub_cqs; + __u8 flags; + __u8 reserved_58[5]; +}; + +enum { + EFA_CREATE_CQ_RESP_DB_OFF = 1 << 0, +}; + +struct efa_ibv_create_cq_resp { + __u32 comp_mask; + __u8 reserved_20[4]; + __aligned_u64 q_mmap_key; + __aligned_u64 q_mmap_size; + __u16 cq_idx; + __u8 reserved_d0[2]; + __u32 db_off; + __aligned_u64 db_mmap_key; +}; + +enum { + EFA_QP_DRIVER_TYPE_SRD = 0, +}; + +struct efa_ibv_create_qp { + __u32 comp_mask; + __u32 rq_ring_size; /* bytes */ + __u32 sq_ring_size; /* bytes */ + __u32 driver_qp_type; +}; + +struct efa_ibv_create_qp_resp { + __u32 comp_mask; + /* the offset inside the page of the rq db */ + __u32 rq_db_offset; + /* the offset inside the page of the sq db */ + __u32 sq_db_offset; + /* the offset inside the page of descriptors buffer */ + __u32 llq_desc_offset; + __aligned_u64 rq_mmap_key; + __aligned_u64 rq_mmap_size; + __aligned_u64 rq_db_mmap_key; + __aligned_u64 sq_db_mmap_key; + __aligned_u64 llq_desc_mmap_key; + __u16 send_sub_cq_idx; + __u16 recv_sub_cq_idx; + __u8 reserved_1e0[4]; +}; + +struct efa_ibv_create_ah_resp { + __u32 comp_mask; + __u16 efa_address_handle; + __u8 reserved_30[2]; +}; + +enum { + EFA_QUERY_DEVICE_CAPS_RDMA_READ = 1 << 0, + EFA_QUERY_DEVICE_CAPS_RNR_RETRY = 1 << 1, + EFA_QUERY_DEVICE_CAPS_CQ_NOTIFICATIONS = 1 << 2, + EFA_QUERY_DEVICE_CAPS_CQ_WITH_SGID = 1 << 3, +}; + +struct efa_ibv_ex_query_device_resp { + __u32 comp_mask; + __u32 max_sq_wr; + __u32 max_rq_wr; + __u16 max_sq_sge; + __u16 max_rq_sge; + __u32 max_rdma_size; + __u32 device_caps; +}; + +#endif /* EFA_ABI_USER_H */ diff --git a/drivers/amazon/net/efa/efa.h b/drivers/amazon/net/efa/efa.h new file mode 100644 index 0000000000000..34ccbac76b451 --- /dev/null +++ b/drivers/amazon/net/efa/efa.h @@ -0,0 +1,315 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#ifndef _EFA_H_ +#define _EFA_H_ + +#include "kcompat.h" +#include +#include +#include +#include + +#include + +#include "efa-abi.h" +#include "efa_com_cmd.h" + +#define DRV_MODULE_NAME "efa" +#define DEVICE_NAME "Elastic Fabric Adapter (EFA)" + +#define EFA_IRQNAME_SIZE 40 + +#define EFA_MGMNT_MSIX_VEC_IDX 0 +#define EFA_COMP_EQS_VEC_BASE 1 + +struct efa_irq { + irq_handler_t handler; + void *data; + u32 irqn; + u32 vector; + cpumask_t affinity_hint_mask; + char name[EFA_IRQNAME_SIZE]; +}; + +/* Don't use anything other than atomic64 */ +struct efa_stats { + atomic64_t alloc_pd_err; + atomic64_t create_qp_err; + atomic64_t create_cq_err; + atomic64_t reg_mr_err; + atomic64_t alloc_ucontext_err; + atomic64_t create_ah_err; + atomic64_t mmap_err; + atomic64_t keep_alive_rcvd; +}; + +struct efa_dev { + struct ib_device ibdev; + struct efa_com_dev edev; + struct pci_dev *pdev; + struct efa_com_get_device_attr_result dev_attr; + + u64 reg_bar_addr; + u64 reg_bar_len; + u64 mem_bar_addr; + u64 mem_bar_len; + u64 db_bar_addr; + u64 db_bar_len; + + int admin_msix_vector_idx; + struct efa_irq admin_irq; + + struct efa_stats stats; + + /* Array of completion EQs */ + struct efa_eq *eqs; + unsigned int neqs; + +#ifdef HAVE_XARRAY + /* Only stores CQs with interrupts enabled */ + struct xarray cqs_xa; +#else + /* If xarray isn't available keep an array of all possible CQs */ + struct efa_cq *cqs_arr[BIT(sizeof_field(struct efa_admin_create_cq_resp, + cq_idx) * 8)]; +#endif +}; + +struct efa_ucontext { + struct ib_ucontext ibucontext; + u16 uarn; +#ifndef HAVE_CORE_MMAP_XA + /* Protects ucontext state */ + struct mutex lock; + struct list_head pending_mmaps; + u32 mmap_page; +#endif /* !defined(HAVE_CORE_MMAP_XA) */ +}; + +struct efa_pd { + struct ib_pd ibpd; + u16 pdn; +}; + +struct efa_mr { + struct ib_mr ibmr; + struct ib_umem *umem; +#ifdef HAVE_EFA_P2P + struct efa_p2pmem *p2pmem; + u64 p2p_ticket; +#endif +}; + +struct efa_cq { + struct ib_cq ibcq; + struct efa_ucontext *ucontext; + dma_addr_t dma_addr; + void *cpu_addr; + struct rdma_user_mmap_entry *mmap_entry; + struct rdma_user_mmap_entry *db_mmap_entry; + size_t size; + u16 cq_idx; + /* NULL when no interrupts requested */ + struct efa_eq *eq; +}; + +struct efa_qp { + struct ib_qp ibqp; + dma_addr_t rq_dma_addr; + void *rq_cpu_addr; + size_t rq_size; + enum ib_qp_state state; + + /* Used for saving mmap_xa entries */ + struct rdma_user_mmap_entry *sq_db_mmap_entry; + struct rdma_user_mmap_entry *llq_desc_mmap_entry; + struct rdma_user_mmap_entry *rq_db_mmap_entry; + struct rdma_user_mmap_entry *rq_mmap_entry; + + u32 qp_handle; + u32 max_send_wr; + u32 max_recv_wr; + u32 max_send_sge; + u32 max_recv_sge; + u32 max_inline_data; +}; + +struct efa_ah { + struct ib_ah ibah; + u16 ah; + /* dest_addr */ + u8 id[EFA_GID_SIZE]; +}; + +struct efa_eq { + struct efa_com_eq eeq; + struct efa_irq irq; +}; + +int efa_query_device(struct ib_device *ibdev, + struct ib_device_attr *props, + struct ib_udata *udata); +int efa_query_port(struct ib_device *ibdev, port_t port, + struct ib_port_attr *props); +int efa_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); +int efa_query_gid(struct ib_device *ibdev, port_t port, int index, + union ib_gid *gid); +int efa_query_pkey(struct ib_device *ibdev, port_t port, u16 index, + u16 *pkey); +#ifdef HAVE_ALLOC_PD_NO_UCONTEXT +int efa_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata); +#else +int efa_alloc_pd(struct ib_pd *ibpd, + struct 
ib_ucontext *ibucontext, + struct ib_udata *udata); +#endif +#ifdef HAVE_DEALLOC_PD_UDATA_RC +int efa_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata); +#elif defined(HAVE_DEALLOC_PD_UDATA) +void efa_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata); +#elif defined(HAVE_PD_CORE_ALLOCATION) +void efa_dealloc_pd(struct ib_pd *ibpd); +#else +int efa_dealloc_pd(struct ib_pd *ibpd); +struct ib_pd *efa_kzalloc_pd(struct ib_device *ibdev, + struct ib_ucontext *ibucontext, + struct ib_udata *udata); +#endif +#ifdef HAVE_DESTROY_QP_UDATA +int efa_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata); +#else +int efa_destroy_qp(struct ib_qp *ibqp); +#endif +#ifdef HAVE_QP_CORE_ALLOCATION +int efa_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); +#else +struct ib_qp *efa_kzalloc_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); +#endif +#ifdef HAVE_IB_INT_DESTROY_CQ +int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); +#elif defined(HAVE_IB_VOID_DESTROY_CQ) +void efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); +#elif defined(HAVE_DESTROY_CQ_UDATA) +int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); +#else +int efa_destroy_cq(struct ib_cq *ibcq); +#endif +int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); +#ifndef HAVE_CQ_CORE_ALLOCATION +#ifdef HAVE_CREATE_CQ_NO_UCONTEXT +struct ib_cq *efa_kzalloc_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_udata *udata); +#elif defined(HAVE_CREATE_CQ_ATTR) +struct ib_cq *efa_kzalloc_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *ibucontext, + struct ib_udata *udata); +#endif +#endif +struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata); +#ifdef HAVE_MR_DMABUF +struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, + u64 length, u64 virt_addr, + int fd, int access_flags, + struct ib_udata *udata); +#endif +#ifdef HAVE_DEREG_MR_UDATA +int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata); +#else +int efa_dereg_mr(struct ib_mr *ibmr); +#endif +int efa_get_port_immutable(struct ib_device *ibdev, port_t port_num, + struct ib_port_immutable *immutable); +int efa_alloc_ucontext(struct ib_ucontext *ibucontext, struct ib_udata *udata); +#ifdef HAVE_UCONTEXT_CORE_ALLOCATION +void efa_dealloc_ucontext(struct ib_ucontext *ibucontext); +#else +int efa_dealloc_ucontext(struct ib_ucontext *ibucontext); +struct ib_ucontext *efa_kzalloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata); +#endif +int efa_mmap(struct ib_ucontext *ibucontext, + struct vm_area_struct *vma); +#ifdef HAVE_CORE_MMAP_XA +void efa_mmap_free(struct rdma_user_mmap_entry *rdma_entry); +#endif +int efa_create_ah(struct ib_ah *ibah, +#ifdef HAVE_CREATE_AH_INIT_ATTR + struct rdma_ah_init_attr *init_attr, +#else + struct rdma_ah_attr *ah_attr, + u32 flags, +#endif + struct ib_udata *udata); +#ifndef HAVE_AH_CORE_ALLOCATION +#ifdef HAVE_CREATE_DESTROY_AH_FLAGS +struct ib_ah *efa_kzalloc_ah(struct ib_pd *ibpd, + struct rdma_ah_attr *ah_attr, + u32 flags, + struct ib_udata *udata); +#elif defined(HAVE_CREATE_AH_RDMA_ATTR) +struct ib_ah *efa_kzalloc_ah(struct ib_pd *ibpd, + struct rdma_ah_attr *ah_attr, + struct ib_udata *udata); +#endif +#endif +#ifdef HAVE_AH_CORE_ALLOCATION_DESTROY_RC +int efa_destroy_ah(struct ib_ah *ibah, u32 flags); +#elif 
defined(HAVE_AH_CORE_ALLOCATION) +void efa_destroy_ah(struct ib_ah *ibah, u32 flags); +#elif defined(HAVE_CREATE_DESTROY_AH_FLAGS) +int efa_destroy_ah(struct ib_ah *ibah, u32 flags); +#else +int efa_destroy_ah(struct ib_ah *ibah); +#endif +#ifndef HAVE_NO_KVERBS_DRIVERS +#ifdef HAVE_POST_CONST_WR +int efa_post_send(struct ib_qp *ibqp, + const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr); +#else +int efa_post_send(struct ib_qp *ibqp, + struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +#endif +#ifdef HAVE_POST_CONST_WR +int efa_post_recv(struct ib_qp *ibqp, + const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); +#else +int efa_post_recv(struct ib_qp *ibqp, + struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +#endif +int efa_poll_cq(struct ib_cq *ibcq, int num_entries, + struct ib_wc *wc); +int efa_req_notify_cq(struct ib_cq *ibcq, + enum ib_cq_notify_flags flags); +struct ib_mr *efa_get_dma_mr(struct ib_pd *ibpd, int acc); +#endif +int efa_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_udata *udata); +enum rdma_link_layer efa_port_link_layer(struct ib_device *ibdev, + port_t port_num); +#ifdef HAVE_SPLIT_STATS_ALLOC +struct rdma_hw_stats *efa_alloc_hw_port_stats(struct ib_device *ibdev, port_t port_num); +struct rdma_hw_stats *efa_alloc_hw_device_stats(struct ib_device *ibdev); +#else +struct rdma_hw_stats *efa_alloc_hw_stats(struct ib_device *ibdev, port_t port_num); +#endif +int efa_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, + port_t port_num, int index); + +#endif /* _EFA_H_ */ diff --git a/drivers/amazon/net/efa/efa_admin_cmds_defs.h b/drivers/amazon/net/efa/efa_admin_cmds_defs.h new file mode 100644 index 0000000000000..d4b9226088bd0 --- /dev/null +++ b/drivers/amazon/net/efa/efa_admin_cmds_defs.h @@ -0,0 +1,1013 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#ifndef _EFA_ADMIN_CMDS_H_ +#define _EFA_ADMIN_CMDS_H_ + +#define EFA_ADMIN_API_VERSION_MAJOR 0 +#define EFA_ADMIN_API_VERSION_MINOR 1 + +/* EFA admin queue opcodes */ +enum efa_admin_aq_opcode { + EFA_ADMIN_CREATE_QP = 1, + EFA_ADMIN_MODIFY_QP = 2, + EFA_ADMIN_QUERY_QP = 3, + EFA_ADMIN_DESTROY_QP = 4, + EFA_ADMIN_CREATE_AH = 5, + EFA_ADMIN_DESTROY_AH = 6, + EFA_ADMIN_REG_MR = 7, + EFA_ADMIN_DEREG_MR = 8, + EFA_ADMIN_CREATE_CQ = 9, + EFA_ADMIN_DESTROY_CQ = 10, + EFA_ADMIN_GET_FEATURE = 11, + EFA_ADMIN_SET_FEATURE = 12, + EFA_ADMIN_GET_STATS = 13, + EFA_ADMIN_ALLOC_PD = 14, + EFA_ADMIN_DEALLOC_PD = 15, + EFA_ADMIN_ALLOC_UAR = 16, + EFA_ADMIN_DEALLOC_UAR = 17, + EFA_ADMIN_CREATE_EQ = 18, + EFA_ADMIN_DESTROY_EQ = 19, + EFA_ADMIN_MAX_OPCODE = 19, +}; + +enum efa_admin_aq_feature_id { + EFA_ADMIN_DEVICE_ATTR = 1, + EFA_ADMIN_AENQ_CONFIG = 2, + EFA_ADMIN_NETWORK_ATTR = 3, + EFA_ADMIN_QUEUE_ATTR = 4, + EFA_ADMIN_HW_HINTS = 5, + EFA_ADMIN_HOST_INFO = 6, + EFA_ADMIN_EVENT_QUEUE_ATTR = 7, +}; + +/* QP transport type */ +enum efa_admin_qp_type { + /* Unreliable Datagram */ + EFA_ADMIN_QP_TYPE_UD = 1, + /* Scalable Reliable Datagram */ + EFA_ADMIN_QP_TYPE_SRD = 2, +}; + +/* QP state */ +enum efa_admin_qp_state { + EFA_ADMIN_QP_STATE_RESET = 0, + EFA_ADMIN_QP_STATE_INIT = 1, + EFA_ADMIN_QP_STATE_RTR = 2, + EFA_ADMIN_QP_STATE_RTS = 3, + EFA_ADMIN_QP_STATE_SQD = 4, + EFA_ADMIN_QP_STATE_SQE = 5, + EFA_ADMIN_QP_STATE_ERR = 6, +}; + +enum efa_admin_get_stats_type { + EFA_ADMIN_GET_STATS_TYPE_BASIC = 0, + EFA_ADMIN_GET_STATS_TYPE_MESSAGES = 1, + EFA_ADMIN_GET_STATS_TYPE_RDMA_READ = 2, +}; + +enum efa_admin_get_stats_scope { + EFA_ADMIN_GET_STATS_SCOPE_ALL = 0, + EFA_ADMIN_GET_STATS_SCOPE_QUEUE = 1, +}; + +/* + * QP allocation sizes, converted by fabric QueuePair (QP) create command + * from QP capabilities. + */ +struct efa_admin_qp_alloc_size { + /* Send descriptor ring size in bytes */ + u32 send_queue_ring_size; + + /* Max number of WQEs that can be outstanding on send queue. */ + u32 send_queue_depth; + + /* + * Recv descriptor ring size in bytes, sufficient for user-provided + * number of WQEs + */ + u32 recv_queue_ring_size; + + /* Max number of WQEs that can be outstanding on recv queue */ + u32 recv_queue_depth; +}; + +struct efa_admin_create_qp_cmd { + /* Common Admin Queue descriptor */ + struct efa_admin_aq_common_desc aq_common_desc; + + /* Protection Domain associated with this QP */ + u16 pd; + + /* QP type */ + u8 qp_type; + + /* + * 0 : sq_virt - If set, SQ ring base address is + * virtual (IOVA returned by MR registration) + * 1 : rq_virt - If set, RQ ring base address is + * virtual (IOVA returned by MR registration) + * 7:2 : reserved - MBZ + */ + u8 flags; + + /* + * Send queue (SQ) ring base physical address. This field is not + * used if this is a Low Latency Queue(LLQ). + */ + u64 sq_base_addr; + + /* Receive queue (RQ) ring base address. 
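+ * When the rq_virt flag above is set, this is an IOVA obtained from
+ * MR registration, and rq_l_key below must carry the matching
+ * registration key.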
*/ + u64 rq_base_addr; + + /* Index of CQ to be associated with Send Queue completions */ + u32 send_cq_idx; + + /* Index of CQ to be associated with Recv Queue completions */ + u32 recv_cq_idx; + + /* + * Memory registration key for the SQ ring, used only when not in + * LLQ mode and base address is virtual + */ + u32 sq_l_key; + + /* + * Memory registration key for the RQ ring, used only when base + * address is virtual + */ + u32 rq_l_key; + + /* Requested QP allocation sizes */ + struct efa_admin_qp_alloc_size qp_alloc_size; + + /* UAR number */ + u16 uar; + + /* MBZ */ + u16 reserved; + + /* MBZ */ + u32 reserved2; +}; + +struct efa_admin_create_qp_resp { + /* Common Admin Queue completion descriptor */ + struct efa_admin_acq_common_desc acq_common_desc; + + /* + * Opaque handle to be used for consequent admin operations on the + * QP + */ + u32 qp_handle; + + /* + * QP number in the given EFA virtual device. Least-significant bits (as + * needed according to max_qp) carry unique QP ID + */ + u16 qp_num; + + /* MBZ */ + u16 reserved; + + /* Index of sub-CQ for Send Queue completions */ + u16 send_sub_cq_idx; + + /* Index of sub-CQ for Receive Queue completions */ + u16 recv_sub_cq_idx; + + /* SQ doorbell address, as offset to PCIe DB BAR */ + u32 sq_db_offset; + + /* RQ doorbell address, as offset to PCIe DB BAR */ + u32 rq_db_offset; + + /* + * low latency send queue ring base address as an offset to PCIe + * MMIO LLQ_MEM BAR + */ + u32 llq_descriptors_offset; +}; + +struct efa_admin_modify_qp_cmd { + /* Common Admin Queue descriptor */ + struct efa_admin_aq_common_desc aq_common_desc; + + /* + * Mask indicating which fields should be updated + * 0 : qp_state + * 1 : cur_qp_state + * 2 : qkey + * 3 : sq_psn + * 4 : sq_drained_async_notify + * 5 : rnr_retry + * 31:6 : reserved + */ + u32 modify_mask; + + /* QP handle returned by create_qp command */ + u32 qp_handle; + + /* QP state */ + u32 qp_state; + + /* Override current QP state (before applying the transition) */ + u32 cur_qp_state; + + /* QKey */ + u32 qkey; + + /* SQ PSN */ + u32 sq_psn; + + /* Enable async notification when SQ is drained */ + u8 sq_drained_async_notify; + + /* Number of RNR retries (valid only for SRD QPs) */ + u8 rnr_retry; + + /* MBZ */ + u16 reserved2; +}; + +struct efa_admin_modify_qp_resp { + /* Common Admin Queue completion descriptor */ + struct efa_admin_acq_common_desc acq_common_desc; +}; + +struct efa_admin_query_qp_cmd { + /* Common Admin Queue descriptor */ + struct efa_admin_aq_common_desc aq_common_desc; + + /* QP handle returned by create_qp command */ + u32 qp_handle; +}; + +struct efa_admin_query_qp_resp { + /* Common Admin Queue completion descriptor */ + struct efa_admin_acq_common_desc acq_common_desc; + + /* QP state */ + u32 qp_state; + + /* QKey */ + u32 qkey; + + /* SQ PSN */ + u32 sq_psn; + + /* Indicates that draining is in progress */ + u8 sq_draining; + + /* Number of RNR retries (valid only for SRD QPs) */ + u8 rnr_retry; + + /* MBZ */ + u16 reserved2; +}; + +struct efa_admin_destroy_qp_cmd { + /* Common Admin Queue descriptor */ + struct efa_admin_aq_common_desc aq_common_desc; + + /* QP handle returned by create_qp command */ + u32 qp_handle; +}; + +struct efa_admin_destroy_qp_resp { + /* Common Admin Queue completion descriptor */ + struct efa_admin_acq_common_desc acq_common_desc; +}; + +/* + * Create Address Handle command parameters. 
Must not be called more than + * once for the same destination + */ +struct efa_admin_create_ah_cmd { + /* Common Admin Queue descriptor */ + struct efa_admin_aq_common_desc aq_common_desc; + + /* Destination address in network byte order */ + u8 dest_addr[16]; + + /* PD number */ + u16 pd; + + /* MBZ */ + u16 reserved; +}; + +struct efa_admin_create_ah_resp { + /* Common Admin Queue completion descriptor */ + struct efa_admin_acq_common_desc acq_common_desc; + + /* Target interface address handle (opaque) */ + u16 ah; + + /* MBZ */ + u16 reserved; +}; + +struct efa_admin_destroy_ah_cmd { + /* Common Admin Queue descriptor */ + struct efa_admin_aq_common_desc aq_common_desc; + + /* Target interface address handle (opaque) */ + u16 ah; + + /* PD number */ + u16 pd; +}; + +struct efa_admin_destroy_ah_resp { + /* Common Admin Queue completion descriptor */ + struct efa_admin_acq_common_desc acq_common_desc; +}; + +/* + * Registration of MemoryRegion, required for QP working with Virtual + * Addresses. In standard verbs semantics, region length is limited to 2GB + * space, but EFA offers larger MR support for large memory space, to ease + * on users working with very large datasets (i.e. full GPU memory mapping). + */ +struct efa_admin_reg_mr_cmd { + /* Common Admin Queue descriptor */ + struct efa_admin_aq_common_desc aq_common_desc; + + /* Protection Domain */ + u16 pd; + + /* MBZ */ + u16 reserved16_w1; + + /* Physical Buffer List, each element is page-aligned. */ + union { + /* + * Inline array of guest-physical page addresses of user + * memory pages (optimization for short region + * registrations) + */ + u64 inline_pbl_array[4]; + + /* points to PBL (direct or indirect, chained if needed) */ + struct efa_admin_ctrl_buff_info pbl; + } pbl; + + /* Memory region length, in bytes. */ + u64 mr_length; + + /* + * flags and page size + * 4:0 : phys_page_size_shift - page size is (1 << + * phys_page_size_shift). Page size is used for + * building the Virtual to Physical address mapping + * 6:5 : reserved - MBZ + * 7 : mem_addr_phy_mode_en - Enable bit for physical + * memory registration (no translation), can be used + * only by privileged clients. If set, PBL must + * contain a single entry. + */ + u8 flags; + + /* + * permissions + * 0 : local_write_enable - Local write permissions: + * must be set for RQ buffers and buffers posted for + * RDMA Read requests + * 1 : reserved1 - MBZ + * 2 : remote_read_enable - Remote read permissions: + * must be set to enable RDMA read from the region + * 7:3 : reserved2 - MBZ + */ + u8 permissions; + + /* MBZ */ + u16 reserved16_w5; + + /* number of pages in PBL (redundant, could be calculated) */ + u32 page_num; + + /* + * IO Virtual Address associated with this MR. If + * mem_addr_phy_mode_en is set, contains the physical address of + * the region. 
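+ *
+ * For example (illustrative): registering a 2 MiB region backed by
+ * 4 KiB pages uses phys_page_size_shift = 12, page_num = 512 and
+ * mr_length = 2 * 1024 * 1024, with iova pointing at the start of
+ * the range.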
+ */ + u64 iova; +}; + +struct efa_admin_reg_mr_resp { + /* Common Admin Queue completion descriptor */ + struct efa_admin_acq_common_desc acq_common_desc; + + /* + * L_Key, to be used in conjunction with local buffer references in + * SQ and RQ WQE, or with virtual RQ/CQ rings + */ + u32 l_key; + + /* + * R_Key, to be used in RDMA messages to refer to remotely accessed + * memory region + */ + u32 r_key; +}; + +struct efa_admin_dereg_mr_cmd { + /* Common Admin Queue descriptor */ + struct efa_admin_aq_common_desc aq_common_desc; + + /* L_Key, memory region's l_key */ + u32 l_key; +}; + +struct efa_admin_dereg_mr_resp { + /* Common Admin Queue completion descriptor */ + struct efa_admin_acq_common_desc acq_common_desc; +}; + +struct efa_admin_create_cq_cmd { + struct efa_admin_aq_common_desc aq_common_desc; + + /* + * 4:0 : reserved5 - MBZ + * 5 : interrupt_mode_enabled - if set, cq operates + * in interrupt mode (i.e. CQ events and EQ elements + * are generated), otherwise - polling + * 6 : virt - If set, ring base address is virtual + * (IOVA returned by MR registration) + * 7 : reserved6 - MBZ + */ + u8 cq_caps_1; + + /* + * 4:0 : cq_entry_size_words - size of CQ entry in + * 32-bit words, valid values: 4, 8. + * 5 : set_src_addr - If set, source address will be + * filled on RX completions from unknown senders. + * Requires 8 words CQ entry size. + * 7:6 : reserved7 - MBZ + */ + u8 cq_caps_2; + + /* completion queue depth in # of entries. must be power of 2 */ + u16 cq_depth; + + /* EQ number assigned to this cq */ + u16 eqn; + + /* MBZ */ + u16 reserved; + + /* + * CQ ring base address, virtual or physical depending on 'virt' + * flag + */ + struct efa_common_mem_addr cq_ba; + + /* + * Memory registration key for the ring, used only when base + * address is virtual + */ + u32 l_key; + + /* + * number of sub cqs - must be equal to sub_cqs_per_cq of queue + * attributes. + */ + u16 num_sub_cqs; + + /* UAR number */ + u16 uar; +}; + +struct efa_admin_create_cq_resp { + struct efa_admin_acq_common_desc acq_common_desc; + + u16 cq_idx; + + /* actual cq depth in number of entries */ + u16 cq_actual_depth; + + /* CQ doorbell address, as offset to PCIe DB BAR */ + u32 db_offset; + + /* + * 0 : db_valid - If set, doorbell offset is valid. + * Always set when interrupts are requested. + */ + u32 flags; +}; + +struct efa_admin_destroy_cq_cmd { + struct efa_admin_aq_common_desc aq_common_desc; + + u16 cq_idx; + + /* MBZ */ + u16 reserved1; +}; + +struct efa_admin_destroy_cq_resp { + struct efa_admin_acq_common_desc acq_common_desc; +}; + +/* + * EFA AQ Get Statistics command. 
Extended statistics are placed in control + * buffer pointed by AQ entry + */ +struct efa_admin_aq_get_stats_cmd { + struct efa_admin_aq_common_desc aq_common_descriptor; + + union { + /* command specific inline data */ + u32 inline_data_w1[3]; + + struct efa_admin_ctrl_buff_info control_buffer; + } u; + + /* stats type as defined in enum efa_admin_get_stats_type */ + u8 type; + + /* stats scope defined in enum efa_admin_get_stats_scope */ + u8 scope; + + u16 scope_modifier; +}; + +struct efa_admin_basic_stats { + u64 tx_bytes; + + u64 tx_pkts; + + u64 rx_bytes; + + u64 rx_pkts; + + u64 rx_drops; +}; + +struct efa_admin_messages_stats { + u64 send_bytes; + + u64 send_wrs; + + u64 recv_bytes; + + u64 recv_wrs; +}; + +struct efa_admin_rdma_read_stats { + u64 read_wrs; + + u64 read_bytes; + + u64 read_wr_err; + + u64 read_resp_bytes; +}; + +struct efa_admin_acq_get_stats_resp { + struct efa_admin_acq_common_desc acq_common_desc; + + union { + struct efa_admin_basic_stats basic_stats; + + struct efa_admin_messages_stats messages_stats; + + struct efa_admin_rdma_read_stats rdma_read_stats; + } u; +}; + +struct efa_admin_get_set_feature_common_desc { + /* MBZ */ + u8 reserved0; + + /* as appears in efa_admin_aq_feature_id */ + u8 feature_id; + + /* MBZ */ + u16 reserved16; +}; + +struct efa_admin_feature_device_attr_desc { + /* Bitmap of efa_admin_aq_feature_id */ + u64 supported_features; + + /* Bitmap of supported page sizes in MR registrations */ + u64 page_size_cap; + + u32 fw_version; + + u32 admin_api_version; + + u32 device_version; + + /* Bar used for SQ and RQ doorbells */ + u16 db_bar; + + /* Indicates how many bits are used on physical address access */ + u8 phys_addr_width; + + /* Indicates how many bits are used on virtual address access */ + u8 virt_addr_width; + + /* + * 0 : rdma_read - If set, RDMA Read is supported on + * TX queues + * 1 : rnr_retry - If set, RNR retry is supported on + * modify QP command + * 31:2 : reserved - MBZ + */ + u32 device_caps; + + /* Max RDMA transfer size in bytes */ + u32 max_rdma_size; +}; + +struct efa_admin_feature_queue_attr_desc { + /* The maximum number of queue pairs supported */ + u32 max_qp; + + /* Maximum number of WQEs per Send Queue */ + u32 max_sq_depth; + + /* Maximum size of data that can be sent inline in a Send WQE */ + u32 inline_buf_size; + + /* Maximum number of buffer descriptors per Recv Queue */ + u32 max_rq_depth; + + /* The maximum number of completion queues supported per VF */ + u32 max_cq; + + /* Maximum number of CQEs per Completion Queue */ + u32 max_cq_depth; + + /* Number of sub-CQs to be created for each CQ */ + u16 sub_cqs_per_cq; + + /* Minimum number of WQEs per SQ */ + u16 min_sq_depth; + + /* Maximum number of SGEs (buffers) allowed for a single send WQE */ + u16 max_wr_send_sges; + + /* Maximum number of SGEs allowed for a single recv WQE */ + u16 max_wr_recv_sges; + + /* The maximum number of memory regions supported */ + u32 max_mr; + + /* The maximum number of pages can be registered */ + u32 max_mr_pages; + + /* The maximum number of protection domains supported */ + u32 max_pd; + + /* The maximum number of address handles supported */ + u32 max_ah; + + /* The maximum size of LLQ in bytes */ + u32 max_llq_size; + + /* Maximum number of SGEs for a single RDMA read WQE */ + u16 max_wr_rdma_sges; + + /* + * Maximum number of bytes that can be written to SQ between two + * consecutive doorbells (in units of 64B). Driver must ensure that only + * complete WQEs are written to queue before issuing a doorbell. 
+ * Examples: max_tx_batch=16 and WQE size = 64B, means up to 16 WQEs can + * be written to SQ between two consecutive doorbells. max_tx_batch=11 + * and WQE size = 128B, means up to 5 WQEs can be written to SQ between + * two consecutive doorbells. Zero means unlimited. + */ + u16 max_tx_batch; +}; + +struct efa_admin_event_queue_attr_desc { + /* The maximum number of event queues supported */ + u32 max_eq; + + /* Maximum number of EQEs per Event Queue */ + u32 max_eq_depth; + + /* Supported events bitmask */ + u32 event_bitmask; +}; + +struct efa_admin_feature_aenq_desc { + /* bitmask for AENQ groups the device can report */ + u32 supported_groups; + + /* bitmask for AENQ groups to report */ + u32 enabled_groups; +}; + +struct efa_admin_feature_network_attr_desc { + /* Raw address data in network byte order */ + u8 addr[16]; + + /* max packet payload size in bytes */ + u32 mtu; +}; + +/* + * When hint value is 0, hints capabilities are not supported or driver + * should use its own predefined value + */ +struct efa_admin_hw_hints { + /* value in ms */ + u16 mmio_read_timeout; + + /* value in ms */ + u16 driver_watchdog_timeout; + + /* value in ms */ + u16 admin_completion_timeout; + + /* poll interval in ms */ + u16 poll_interval; +}; + +struct efa_admin_get_feature_cmd { + struct efa_admin_aq_common_desc aq_common_descriptor; + + struct efa_admin_ctrl_buff_info control_buffer; + + struct efa_admin_get_set_feature_common_desc feature_common; + + u32 raw[11]; +}; + +struct efa_admin_get_feature_resp { + struct efa_admin_acq_common_desc acq_common_desc; + + union { + u32 raw[14]; + + struct efa_admin_feature_device_attr_desc device_attr; + + struct efa_admin_feature_aenq_desc aenq; + + struct efa_admin_feature_network_attr_desc network_attr; + + struct efa_admin_feature_queue_attr_desc queue_attr; + + struct efa_admin_event_queue_attr_desc event_queue_attr; + + struct efa_admin_hw_hints hw_hints; + } u; +}; + +struct efa_admin_set_feature_cmd { + struct efa_admin_aq_common_desc aq_common_descriptor; + + struct efa_admin_ctrl_buff_info control_buffer; + + struct efa_admin_get_set_feature_common_desc feature_common; + + union { + u32 raw[11]; + + /* AENQ configuration */ + struct efa_admin_feature_aenq_desc aenq; + } u; +}; + +struct efa_admin_set_feature_resp { + struct efa_admin_acq_common_desc acq_common_desc; + + union { + u32 raw[14]; + } u; +}; + +struct efa_admin_alloc_pd_cmd { + struct efa_admin_aq_common_desc aq_common_descriptor; +}; + +struct efa_admin_alloc_pd_resp { + struct efa_admin_acq_common_desc acq_common_desc; + + /* PD number */ + u16 pd; + + /* MBZ */ + u16 reserved; +}; + +struct efa_admin_dealloc_pd_cmd { + struct efa_admin_aq_common_desc aq_common_descriptor; + + /* PD number */ + u16 pd; + + /* MBZ */ + u16 reserved; +}; + +struct efa_admin_dealloc_pd_resp { + struct efa_admin_acq_common_desc acq_common_desc; +}; + +struct efa_admin_alloc_uar_cmd { + struct efa_admin_aq_common_desc aq_common_descriptor; +}; + +struct efa_admin_alloc_uar_resp { + struct efa_admin_acq_common_desc acq_common_desc; + + /* UAR number */ + u16 uar; + + /* MBZ */ + u16 reserved; +}; + +struct efa_admin_dealloc_uar_cmd { + struct efa_admin_aq_common_desc aq_common_descriptor; + + /* UAR number */ + u16 uar; + + /* MBZ */ + u16 reserved; +}; + +struct efa_admin_dealloc_uar_resp { + struct efa_admin_acq_common_desc acq_common_desc; +}; + +struct efa_admin_create_eq_cmd { + struct efa_admin_aq_common_desc aq_common_descriptor; + + /* Size of the EQ in entries, must be power of 2 */ + u16 
depth; + + /* MSI-X table entry index */ + u8 msix_vec; + + /* + * 4:0 : entry_size_words - size of EQ entry in + * 32-bit words + * 7:5 : reserved - MBZ + */ + u8 caps; + + /* EQ ring base address */ + struct efa_common_mem_addr ba; + + /* + * Enabled events on this EQ + * 0 : completion_events - Enable completion events + * 31:1 : reserved - MBZ + */ + u32 event_bitmask; + + /* MBZ */ + u32 reserved; +}; + +struct efa_admin_create_eq_resp { + struct efa_admin_acq_common_desc acq_common_desc; + + /* EQ number */ + u16 eqn; + + /* MBZ */ + u16 reserved; +}; + +struct efa_admin_destroy_eq_cmd { + struct efa_admin_aq_common_desc aq_common_descriptor; + + /* EQ number */ + u16 eqn; + + /* MBZ */ + u16 reserved; +}; + +struct efa_admin_destroy_eq_resp { + struct efa_admin_acq_common_desc acq_common_desc; +}; + +/* asynchronous event notification groups */ +enum efa_admin_aenq_group { + EFA_ADMIN_FATAL_ERROR = 1, + EFA_ADMIN_WARNING = 2, + EFA_ADMIN_NOTIFICATION = 3, + EFA_ADMIN_KEEP_ALIVE = 4, + EFA_ADMIN_AENQ_GROUPS_NUM = 5, +}; + +struct efa_admin_mmio_req_read_less_resp { + u16 req_id; + + u16 reg_off; + + /* value is valid when poll is cleared */ + u32 reg_val; +}; + +enum efa_admin_os_type { + EFA_ADMIN_OS_LINUX = 0, +}; + +struct efa_admin_host_info { + /* OS distribution string format */ + u8 os_dist_str[128]; + + /* Defined in enum efa_admin_os_type */ + u32 os_type; + + /* Kernel version string format */ + u8 kernel_ver_str[32]; + + /* Kernel version numeric format */ + u32 kernel_ver; + + /* + * 7:0 : driver_module_type + * 15:8 : driver_sub_minor + * 23:16 : driver_minor + * 31:24 : driver_major + */ + u32 driver_ver; + + /* + * Device's Bus, Device and Function + * 2:0 : function + * 7:3 : device + * 15:8 : bus + */ + u16 bdf; + + /* + * Spec version + * 7:0 : spec_minor + * 15:8 : spec_major + */ + u16 spec_ver; + + /* + * 0 : intree - Intree driver + * 1 : gdr - GPUDirect RDMA supported + * 31:2 : reserved2 + */ + u32 flags; +}; + +/* create_qp_cmd */ +#define EFA_ADMIN_CREATE_QP_CMD_SQ_VIRT_MASK BIT(0) +#define EFA_ADMIN_CREATE_QP_CMD_RQ_VIRT_MASK BIT(1) + +/* modify_qp_cmd */ +#define EFA_ADMIN_MODIFY_QP_CMD_QP_STATE_MASK BIT(0) +#define EFA_ADMIN_MODIFY_QP_CMD_CUR_QP_STATE_MASK BIT(1) +#define EFA_ADMIN_MODIFY_QP_CMD_QKEY_MASK BIT(2) +#define EFA_ADMIN_MODIFY_QP_CMD_SQ_PSN_MASK BIT(3) +#define EFA_ADMIN_MODIFY_QP_CMD_SQ_DRAINED_ASYNC_NOTIFY_MASK BIT(4) +#define EFA_ADMIN_MODIFY_QP_CMD_RNR_RETRY_MASK BIT(5) + +/* reg_mr_cmd */ +#define EFA_ADMIN_REG_MR_CMD_PHYS_PAGE_SIZE_SHIFT_MASK GENMASK(4, 0) +#define EFA_ADMIN_REG_MR_CMD_MEM_ADDR_PHY_MODE_EN_MASK BIT(7) +#define EFA_ADMIN_REG_MR_CMD_LOCAL_WRITE_ENABLE_MASK BIT(0) +#define EFA_ADMIN_REG_MR_CMD_REMOTE_READ_ENABLE_MASK BIT(2) + +/* create_cq_cmd */ +#define EFA_ADMIN_CREATE_CQ_CMD_INTERRUPT_MODE_ENABLED_MASK BIT(5) +#define EFA_ADMIN_CREATE_CQ_CMD_VIRT_MASK BIT(6) +#define EFA_ADMIN_CREATE_CQ_CMD_CQ_ENTRY_SIZE_WORDS_MASK GENMASK(4, 0) +#define EFA_ADMIN_CREATE_CQ_CMD_SET_SRC_ADDR_MASK BIT(5) + +/* create_cq_resp */ +#define EFA_ADMIN_CREATE_CQ_RESP_DB_VALID_MASK BIT(0) + +/* feature_device_attr_desc */ +#define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RDMA_READ_MASK BIT(0) +#define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RNR_RETRY_MASK BIT(1) + +/* create_eq_cmd */ +#define EFA_ADMIN_CREATE_EQ_CMD_ENTRY_SIZE_WORDS_MASK GENMASK(4, 0) +#define EFA_ADMIN_CREATE_EQ_CMD_VIRT_MASK BIT(6) +#define EFA_ADMIN_CREATE_EQ_CMD_COMPLETION_EVENTS_MASK BIT(0) + +/* host_info */ +#define EFA_ADMIN_HOST_INFO_DRIVER_MODULE_TYPE_MASK GENMASK(7, 0) +#define 
EFA_ADMIN_HOST_INFO_DRIVER_SUB_MINOR_MASK GENMASK(15, 8) +#define EFA_ADMIN_HOST_INFO_DRIVER_MINOR_MASK GENMASK(23, 16) +#define EFA_ADMIN_HOST_INFO_DRIVER_MAJOR_MASK GENMASK(31, 24) +#define EFA_ADMIN_HOST_INFO_FUNCTION_MASK GENMASK(2, 0) +#define EFA_ADMIN_HOST_INFO_DEVICE_MASK GENMASK(7, 3) +#define EFA_ADMIN_HOST_INFO_BUS_MASK GENMASK(15, 8) +#define EFA_ADMIN_HOST_INFO_SPEC_MINOR_MASK GENMASK(7, 0) +#define EFA_ADMIN_HOST_INFO_SPEC_MAJOR_MASK GENMASK(15, 8) +#define EFA_ADMIN_HOST_INFO_INTREE_MASK BIT(0) +#define EFA_ADMIN_HOST_INFO_GDR_MASK BIT(1) + +#endif /* _EFA_ADMIN_CMDS_H_ */ diff --git a/drivers/amazon/net/efa/efa_admin_defs.h b/drivers/amazon/net/efa/efa_admin_defs.h new file mode 100644 index 0000000000000..83f20c38a8400 --- /dev/null +++ b/drivers/amazon/net/efa/efa_admin_defs.h @@ -0,0 +1,175 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef _EFA_ADMIN_H_ +#define _EFA_ADMIN_H_ + +enum efa_admin_aq_completion_status { + EFA_ADMIN_SUCCESS = 0, + EFA_ADMIN_RESOURCE_ALLOCATION_FAILURE = 1, + EFA_ADMIN_BAD_OPCODE = 2, + EFA_ADMIN_UNSUPPORTED_OPCODE = 3, + EFA_ADMIN_MALFORMED_REQUEST = 4, + /* Additional status is provided in ACQ entry extended_status */ + EFA_ADMIN_ILLEGAL_PARAMETER = 5, + EFA_ADMIN_UNKNOWN_ERROR = 6, + EFA_ADMIN_RESOURCE_BUSY = 7, +}; + +struct efa_admin_aq_common_desc { + /* + * 11:0 : command_id + * 15:12 : reserved12 + */ + u16 command_id; + + /* as appears in efa_admin_aq_opcode */ + u8 opcode; + + /* + * 0 : phase + * 1 : ctrl_data - control buffer address valid + * 2 : ctrl_data_indirect - control buffer address + * points to list of pages with addresses of control + * buffers + * 7:3 : reserved3 + */ + u8 flags; +}; + +/* + * used in efa_admin_aq_entry. Can point directly to control data, or to a + * page list chunk. Used also at the end of indirect mode page list chunks, + * for chaining. 
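+ *
+ * A minimal sketch of filling the descriptor for a direct (non-chained)
+ * control buffer, assuming buf_dma_addr and buf_len were prepared by
+ * the caller:
+ *
+ *	struct efa_admin_ctrl_buff_info info = {};
+ *
+ *	info.length = buf_len;
+ *	efa_com_set_dma_addr(buf_dma_addr, &info.address.mem_addr_high,
+ *			     &info.address.mem_addr_low);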
+ */ +struct efa_admin_ctrl_buff_info { + u32 length; + + struct efa_common_mem_addr address; +}; + +struct efa_admin_aq_entry { + struct efa_admin_aq_common_desc aq_common_descriptor; + + union { + u32 inline_data_w1[3]; + + struct efa_admin_ctrl_buff_info control_buffer; + } u; + + u32 inline_data_w4[12]; +}; + +struct efa_admin_acq_common_desc { + /* + * command identifier to associate it with the aq descriptor + * 11:0 : command_id + * 15:12 : reserved12 + */ + u16 command; + + u8 status; + + /* + * 0 : phase + * 7:1 : reserved1 + */ + u8 flags; + + u16 extended_status; + + /* + * indicates to the driver which AQ entry has been consumed by the + * device and could be reused + */ + u16 sq_head_indx; +}; + +struct efa_admin_acq_entry { + struct efa_admin_acq_common_desc acq_common_descriptor; + + u32 response_specific_data[14]; +}; + +struct efa_admin_aenq_common_desc { + u16 group; + + u16 syndrom; + + /* + * 0 : phase + * 7:1 : reserved - MBZ + */ + u8 flags; + + u8 reserved1[3]; + + u32 timestamp_low; + + u32 timestamp_high; +}; + +struct efa_admin_aenq_entry { + struct efa_admin_aenq_common_desc aenq_common_desc; + + /* command specific inline data */ + u32 inline_data_w4[12]; +}; + +enum efa_admin_eqe_event_type { + EFA_ADMIN_EQE_EVENT_TYPE_COMPLETION = 0, +}; + +/* Completion event */ +struct efa_admin_comp_event { + /* CQ number */ + u16 cqn; + + /* MBZ */ + u16 reserved; + + /* MBZ */ + u32 reserved2; +}; + +/* Event Queue Element */ +struct efa_admin_eqe { + /* + * 0 : phase + * 8:1 : event_type - Event type + * 31:9 : reserved - MBZ + */ + u32 common; + + /* MBZ */ + u32 reserved; + + union { + /* Event data */ + u32 event_data[2]; + + /* Completion Event */ + struct efa_admin_comp_event comp_event; + } u; +}; + +/* aq_common_desc */ +#define EFA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK GENMASK(11, 0) +#define EFA_ADMIN_AQ_COMMON_DESC_PHASE_MASK BIT(0) +#define EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_MASK BIT(1) +#define EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK BIT(2) + +/* acq_common_desc */ +#define EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID_MASK GENMASK(11, 0) +#define EFA_ADMIN_ACQ_COMMON_DESC_PHASE_MASK BIT(0) + +/* aenq_common_desc */ +#define EFA_ADMIN_AENQ_COMMON_DESC_PHASE_MASK BIT(0) + +/* eqe */ +#define EFA_ADMIN_EQE_PHASE_MASK BIT(0) +#define EFA_ADMIN_EQE_EVENT_TYPE_MASK GENMASK(8, 1) + +#endif /* _EFA_ADMIN_H_ */ diff --git a/drivers/amazon/net/efa/efa_com.c b/drivers/amazon/net/efa/efa_com.c new file mode 100644 index 0000000000000..d0b13097a0967 --- /dev/null +++ b/drivers/amazon/net/efa/efa_com.c @@ -0,0 +1,1251 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause +/* + * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#include "efa_com.h" +#include "efa_regs_defs.h" + +#define ADMIN_CMD_TIMEOUT_US 30000000 /* usecs */ + +#define EFA_REG_READ_TIMEOUT_US 50000 /* usecs */ +#define EFA_MMIO_READ_INVALID 0xffffffff + +#define EFA_POLL_INTERVAL_MS 100 /* msecs */ + +#define EFA_ASYNC_QUEUE_DEPTH 16 +#define EFA_ADMIN_QUEUE_DEPTH 32 + +#define EFA_CTRL_MAJOR 0 +#define EFA_CTRL_MINOR 0 +#define EFA_CTRL_SUB_MINOR 1 + +enum efa_cmd_status { + EFA_CMD_SUBMITTED, + EFA_CMD_COMPLETED, +}; + +struct efa_comp_ctx { + struct completion wait_event; + struct efa_admin_acq_entry *user_cqe; + u32 comp_size; + enum efa_cmd_status status; + u8 cmd_opcode; + u8 occupied; +}; + +static const char *efa_com_cmd_str(u8 cmd) +{ +#define EFA_CMD_STR_CASE(_cmd) case EFA_ADMIN_##_cmd: return #_cmd + + switch (cmd) { + EFA_CMD_STR_CASE(CREATE_QP); + EFA_CMD_STR_CASE(MODIFY_QP); + EFA_CMD_STR_CASE(QUERY_QP); + EFA_CMD_STR_CASE(DESTROY_QP); + EFA_CMD_STR_CASE(CREATE_AH); + EFA_CMD_STR_CASE(DESTROY_AH); + EFA_CMD_STR_CASE(REG_MR); + EFA_CMD_STR_CASE(DEREG_MR); + EFA_CMD_STR_CASE(CREATE_CQ); + EFA_CMD_STR_CASE(DESTROY_CQ); + EFA_CMD_STR_CASE(GET_FEATURE); + EFA_CMD_STR_CASE(SET_FEATURE); + EFA_CMD_STR_CASE(GET_STATS); + EFA_CMD_STR_CASE(ALLOC_PD); + EFA_CMD_STR_CASE(DEALLOC_PD); + EFA_CMD_STR_CASE(ALLOC_UAR); + EFA_CMD_STR_CASE(DEALLOC_UAR); + EFA_CMD_STR_CASE(CREATE_EQ); + EFA_CMD_STR_CASE(DESTROY_EQ); + default: return "unknown command opcode"; + } +#undef EFA_CMD_STR_CASE +} + +void efa_com_set_dma_addr(dma_addr_t addr, u32 *addr_high, u32 *addr_low) +{ + *addr_low = lower_32_bits(addr); + *addr_high = upper_32_bits(addr); +} + +static u32 efa_com_reg_read32(struct efa_com_dev *edev, u16 offset) +{ + struct efa_com_mmio_read *mmio_read = &edev->mmio_read; + struct efa_admin_mmio_req_read_less_resp *read_resp; + unsigned long exp_time; + u32 mmio_read_reg = 0; + u32 err; + + read_resp = mmio_read->read_resp; + + spin_lock(&mmio_read->lock); + mmio_read->seq_num++; + + /* trash DMA req_id to identify when hardware is done */ + read_resp->req_id = mmio_read->seq_num + 0x9aL; + EFA_SET(&mmio_read_reg, EFA_REGS_MMIO_REG_READ_REG_OFF, offset); + EFA_SET(&mmio_read_reg, EFA_REGS_MMIO_REG_READ_REQ_ID, + mmio_read->seq_num); + + writel(mmio_read_reg, edev->reg_bar + EFA_REGS_MMIO_REG_READ_OFF); + + exp_time = jiffies + usecs_to_jiffies(mmio_read->mmio_read_timeout); + do { + if (READ_ONCE(read_resp->req_id) == mmio_read->seq_num) + break; + udelay(1); + } while (time_is_after_jiffies(exp_time)); + + if (read_resp->req_id != mmio_read->seq_num) { + ibdev_err_ratelimited( + edev->efa_dev, + "Reading register timed out. 
expected: req id[%u] offset[%#x] actual: req id[%u] offset[%#x]\n", + mmio_read->seq_num, offset, read_resp->req_id, + read_resp->reg_off); + err = EFA_MMIO_READ_INVALID; + goto out; + } + + if (read_resp->reg_off != offset) { + ibdev_err_ratelimited( + edev->efa_dev, + "Reading register failed: wrong offset provided\n"); + err = EFA_MMIO_READ_INVALID; + goto out; + } + + err = read_resp->reg_val; +out: + spin_unlock(&mmio_read->lock); + return err; +} + +static int efa_com_admin_init_sq(struct efa_com_dev *edev) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_com_admin_sq *sq = &aq->sq; + u16 size = aq->depth * sizeof(*sq->entries); + u32 aq_caps = 0; + u32 addr_high; + u32 addr_low; + + sq->entries = + dma_alloc_coherent(aq->dmadev, size, &sq->dma_addr, GFP_KERNEL); + if (!sq->entries) + return -ENOMEM; + + spin_lock_init(&sq->lock); + + sq->cc = 0; + sq->pc = 0; + sq->phase = 1; + + sq->db_addr = (u32 __iomem *)(edev->reg_bar + EFA_REGS_AQ_PROD_DB_OFF); + + addr_high = upper_32_bits(sq->dma_addr); + addr_low = lower_32_bits(sq->dma_addr); + + writel(addr_low, edev->reg_bar + EFA_REGS_AQ_BASE_LO_OFF); + writel(addr_high, edev->reg_bar + EFA_REGS_AQ_BASE_HI_OFF); + + EFA_SET(&aq_caps, EFA_REGS_AQ_CAPS_AQ_DEPTH, aq->depth); + EFA_SET(&aq_caps, EFA_REGS_AQ_CAPS_AQ_ENTRY_SIZE, + sizeof(struct efa_admin_aq_entry)); + + writel(aq_caps, edev->reg_bar + EFA_REGS_AQ_CAPS_OFF); + + return 0; +} + +static int efa_com_admin_init_cq(struct efa_com_dev *edev) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_com_admin_cq *cq = &aq->cq; + u16 size = aq->depth * sizeof(*cq->entries); + u32 acq_caps = 0; + u32 addr_high; + u32 addr_low; + + cq->entries = + dma_alloc_coherent(aq->dmadev, size, &cq->dma_addr, GFP_KERNEL); + if (!cq->entries) + return -ENOMEM; + + spin_lock_init(&cq->lock); + + cq->cc = 0; + cq->phase = 1; + + addr_high = upper_32_bits(cq->dma_addr); + addr_low = lower_32_bits(cq->dma_addr); + + writel(addr_low, edev->reg_bar + EFA_REGS_ACQ_BASE_LO_OFF); + writel(addr_high, edev->reg_bar + EFA_REGS_ACQ_BASE_HI_OFF); + + EFA_SET(&acq_caps, EFA_REGS_ACQ_CAPS_ACQ_DEPTH, aq->depth); + EFA_SET(&acq_caps, EFA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE, + sizeof(struct efa_admin_acq_entry)); + EFA_SET(&acq_caps, EFA_REGS_ACQ_CAPS_ACQ_MSIX_VECTOR, + aq->msix_vector_idx); + + writel(acq_caps, edev->reg_bar + EFA_REGS_ACQ_CAPS_OFF); + + return 0; +} + +static int efa_com_admin_init_aenq(struct efa_com_dev *edev, + struct efa_aenq_handlers *aenq_handlers) +{ + struct efa_com_aenq *aenq = &edev->aenq; + u32 addr_low, addr_high; + u32 aenq_caps = 0; + u16 size; + + if (!aenq_handlers) { + ibdev_err(edev->efa_dev, "aenq handlers pointer is NULL\n"); + return -EINVAL; + } + + size = EFA_ASYNC_QUEUE_DEPTH * sizeof(*aenq->entries); + aenq->entries = dma_alloc_coherent(edev->dmadev, size, &aenq->dma_addr, + GFP_KERNEL); + if (!aenq->entries) + return -ENOMEM; + + aenq->aenq_handlers = aenq_handlers; + aenq->depth = EFA_ASYNC_QUEUE_DEPTH; + aenq->cc = 0; + aenq->phase = 1; + + addr_low = lower_32_bits(aenq->dma_addr); + addr_high = upper_32_bits(aenq->dma_addr); + + writel(addr_low, edev->reg_bar + EFA_REGS_AENQ_BASE_LO_OFF); + writel(addr_high, edev->reg_bar + EFA_REGS_AENQ_BASE_HI_OFF); + + EFA_SET(&aenq_caps, EFA_REGS_AENQ_CAPS_AENQ_DEPTH, aenq->depth); + EFA_SET(&aenq_caps, EFA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE, + sizeof(struct efa_admin_aenq_entry)); + EFA_SET(&aenq_caps, EFA_REGS_AENQ_CAPS_AENQ_MSIX_VECTOR, + aenq->msix_vector_idx); + writel(aenq_caps, edev->reg_bar + EFA_REGS_AENQ_CAPS_OFF); + 
+ /* + * Init cons_db to mark that all entries in the queue + * are initially available + */ + writel(edev->aenq.cc, edev->reg_bar + EFA_REGS_AENQ_CONS_DB_OFF); + + return 0; +} + +/* ID to be used with efa_com_get_comp_ctx */ +static u16 efa_com_alloc_ctx_id(struct efa_com_admin_queue *aq) +{ + u16 ctx_id; + + spin_lock(&aq->comp_ctx_lock); + ctx_id = aq->comp_ctx_pool[aq->comp_ctx_pool_next]; + aq->comp_ctx_pool_next++; + spin_unlock(&aq->comp_ctx_lock); + + return ctx_id; +} + +static void efa_com_dealloc_ctx_id(struct efa_com_admin_queue *aq, + u16 ctx_id) +{ + spin_lock(&aq->comp_ctx_lock); + aq->comp_ctx_pool_next--; + aq->comp_ctx_pool[aq->comp_ctx_pool_next] = ctx_id; + spin_unlock(&aq->comp_ctx_lock); +} + +static inline void efa_com_put_comp_ctx(struct efa_com_admin_queue *aq, + struct efa_comp_ctx *comp_ctx) +{ + u16 cmd_id = EFA_GET(&comp_ctx->user_cqe->acq_common_descriptor.command, + EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID); + u16 ctx_id = cmd_id & (aq->depth - 1); + + ibdev_dbg(aq->efa_dev, "Put completion command_id %#x\n", cmd_id); + comp_ctx->occupied = 0; + efa_com_dealloc_ctx_id(aq, ctx_id); +} + +static struct efa_comp_ctx *efa_com_get_comp_ctx(struct efa_com_admin_queue *aq, + u16 cmd_id, bool capture) +{ + u16 ctx_id = cmd_id & (aq->depth - 1); + + if (aq->comp_ctx[ctx_id].occupied && capture) { + ibdev_err_ratelimited( + aq->efa_dev, + "Completion context for command_id %#x is occupied\n", + cmd_id); + return NULL; + } + + if (capture) { + aq->comp_ctx[ctx_id].occupied = 1; + ibdev_dbg(aq->efa_dev, + "Take completion ctxt for command_id %#x\n", cmd_id); + } + + return &aq->comp_ctx[ctx_id]; +} + +static struct efa_comp_ctx *__efa_com_submit_admin_cmd(struct efa_com_admin_queue *aq, + struct efa_admin_aq_entry *cmd, + size_t cmd_size_in_bytes, + struct efa_admin_acq_entry *comp, + size_t comp_size_in_bytes) +{ + struct efa_admin_aq_entry *aqe; + struct efa_comp_ctx *comp_ctx; + u16 queue_size_mask; + u16 cmd_id; + u16 ctx_id; + u16 pi; + + queue_size_mask = aq->depth - 1; + pi = aq->sq.pc & queue_size_mask; + + ctx_id = efa_com_alloc_ctx_id(aq); + + /* cmd_id LSBs are the ctx_id and MSBs are entropy bits from pc */ + cmd_id = ctx_id & queue_size_mask; + cmd_id |= aq->sq.pc & ~queue_size_mask; + cmd_id &= EFA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK; + + cmd->aq_common_descriptor.command_id = cmd_id; + EFA_SET(&cmd->aq_common_descriptor.flags, + EFA_ADMIN_AQ_COMMON_DESC_PHASE, aq->sq.phase); + + comp_ctx = efa_com_get_comp_ctx(aq, cmd_id, true); + if (!comp_ctx) { + efa_com_dealloc_ctx_id(aq, ctx_id); + return ERR_PTR(-EINVAL); + } + + comp_ctx->status = EFA_CMD_SUBMITTED; + comp_ctx->comp_size = comp_size_in_bytes; + comp_ctx->user_cqe = comp; + comp_ctx->cmd_opcode = cmd->aq_common_descriptor.opcode; + + reinit_completion(&comp_ctx->wait_event); + + aqe = &aq->sq.entries[pi]; + memset(aqe, 0, sizeof(*aqe)); + memcpy(aqe, cmd, cmd_size_in_bytes); + + aq->sq.pc++; + atomic64_inc(&aq->stats.submitted_cmd); + + if ((aq->sq.pc & queue_size_mask) == 0) + aq->sq.phase = !aq->sq.phase; + + /* barrier not needed in case of writel */ + writel(aq->sq.pc, aq->sq.db_addr); + + return comp_ctx; +} + +static inline int efa_com_init_comp_ctxt(struct efa_com_admin_queue *aq) +{ + size_t pool_size = aq->depth * sizeof(*aq->comp_ctx_pool); + size_t size = aq->depth * sizeof(struct efa_comp_ctx); + struct efa_comp_ctx *comp_ctx; + u16 i; + + aq->comp_ctx = devm_kzalloc(aq->dmadev, size, GFP_KERNEL); + aq->comp_ctx_pool = devm_kzalloc(aq->dmadev, pool_size, GFP_KERNEL); + if (!aq->comp_ctx || 
!aq->comp_ctx_pool) { + devm_kfree(aq->dmadev, aq->comp_ctx_pool); + devm_kfree(aq->dmadev, aq->comp_ctx); + return -ENOMEM; + } + + for (i = 0; i < aq->depth; i++) { + comp_ctx = efa_com_get_comp_ctx(aq, i, false); + if (comp_ctx) + init_completion(&comp_ctx->wait_event); + + aq->comp_ctx_pool[i] = i; + } + + spin_lock_init(&aq->comp_ctx_lock); + + aq->comp_ctx_pool_next = 0; + + return 0; +} + +static struct efa_comp_ctx *efa_com_submit_admin_cmd(struct efa_com_admin_queue *aq, + struct efa_admin_aq_entry *cmd, + size_t cmd_size_in_bytes, + struct efa_admin_acq_entry *comp, + size_t comp_size_in_bytes) +{ + struct efa_comp_ctx *comp_ctx; + + spin_lock(&aq->sq.lock); + if (!test_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state)) { + ibdev_err_ratelimited(aq->efa_dev, "Admin queue is closed\n"); + spin_unlock(&aq->sq.lock); + return ERR_PTR(-ENODEV); + } + + comp_ctx = __efa_com_submit_admin_cmd(aq, cmd, cmd_size_in_bytes, comp, + comp_size_in_bytes); + spin_unlock(&aq->sq.lock); + if (IS_ERR(comp_ctx)) + clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); + + return comp_ctx; +} + +static void efa_com_handle_single_admin_completion(struct efa_com_admin_queue *aq, + struct efa_admin_acq_entry *cqe) +{ + struct efa_comp_ctx *comp_ctx; + u16 cmd_id; + + cmd_id = EFA_GET(&cqe->acq_common_descriptor.command, + EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID); + + comp_ctx = efa_com_get_comp_ctx(aq, cmd_id, false); + if (!comp_ctx) { + ibdev_err( + aq->efa_dev, + "comp_ctx is NULL. Changing the admin queue running state\n"); + clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); + return; + } + + comp_ctx->status = EFA_CMD_COMPLETED; + memcpy(comp_ctx->user_cqe, cqe, comp_ctx->comp_size); + + if (!test_bit(EFA_AQ_STATE_POLLING_BIT, &aq->state)) + complete(&comp_ctx->wait_event); +} + +static void efa_com_handle_admin_completion(struct efa_com_admin_queue *aq) +{ + struct efa_admin_acq_entry *cqe; + u16 queue_size_mask; + u16 comp_num = 0; + u8 phase; + u16 ci; + + queue_size_mask = aq->depth - 1; + + ci = aq->cq.cc & queue_size_mask; + phase = aq->cq.phase; + + cqe = &aq->cq.entries[ci]; + + /* Go over all the completions */ + while ((READ_ONCE(cqe->acq_common_descriptor.flags) & + EFA_ADMIN_ACQ_COMMON_DESC_PHASE_MASK) == phase) { + /* + * Do not read the rest of the completion entry before the + * phase bit was validated + */ + dma_rmb(); + efa_com_handle_single_admin_completion(aq, cqe); + + ci++; + comp_num++; + if (ci == aq->depth) { + ci = 0; + phase = !phase; + } + + cqe = &aq->cq.entries[ci]; + } + + aq->cq.cc += comp_num; + aq->cq.phase = phase; + aq->sq.cc += comp_num; + atomic64_add(comp_num, &aq->stats.completed_cmd); +} + +static int efa_com_comp_status_to_errno(u8 comp_status) +{ + switch (comp_status) { + case EFA_ADMIN_SUCCESS: + return 0; + case EFA_ADMIN_RESOURCE_ALLOCATION_FAILURE: + return -ENOMEM; + case EFA_ADMIN_UNSUPPORTED_OPCODE: + return -EOPNOTSUPP; + case EFA_ADMIN_BAD_OPCODE: + case EFA_ADMIN_MALFORMED_REQUEST: + case EFA_ADMIN_ILLEGAL_PARAMETER: + case EFA_ADMIN_UNKNOWN_ERROR: + return -EINVAL; + default: + return -EINVAL; + } +} + +static int efa_com_wait_and_process_admin_cq_polling(struct efa_comp_ctx *comp_ctx, + struct efa_com_admin_queue *aq) +{ + unsigned long timeout; + unsigned long flags; + int err; + + timeout = jiffies + usecs_to_jiffies(aq->completion_timeout); + + while (1) { + spin_lock_irqsave(&aq->cq.lock, flags); + efa_com_handle_admin_completion(aq); + spin_unlock_irqrestore(&aq->cq.lock, flags); + + if (comp_ctx->status != EFA_CMD_SUBMITTED) + break; + + if 
(time_is_before_jiffies(timeout)) { + ibdev_err_ratelimited( + aq->efa_dev, + "Wait for completion (polling) timeout\n"); + /* EFA didn't have any completion */ + atomic64_inc(&aq->stats.no_completion); + + clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); + err = -ETIME; + goto out; + } + + msleep(aq->poll_interval); + } + + err = efa_com_comp_status_to_errno(comp_ctx->user_cqe->acq_common_descriptor.status); +out: + efa_com_put_comp_ctx(aq, comp_ctx); + return err; +} + +static int efa_com_wait_and_process_admin_cq_interrupts(struct efa_comp_ctx *comp_ctx, + struct efa_com_admin_queue *aq) +{ + unsigned long flags; + int err; + + wait_for_completion_timeout(&comp_ctx->wait_event, + usecs_to_jiffies(aq->completion_timeout)); + + /* + * In case the command wasn't completed find out the root cause. + * There might be 2 kinds of errors + * 1) No completion (timeout reached) + * 2) There is completion but the device didn't get any msi-x interrupt. + */ + if (comp_ctx->status == EFA_CMD_SUBMITTED) { + spin_lock_irqsave(&aq->cq.lock, flags); + efa_com_handle_admin_completion(aq); + spin_unlock_irqrestore(&aq->cq.lock, flags); + + atomic64_inc(&aq->stats.no_completion); + + if (comp_ctx->status == EFA_CMD_COMPLETED) + ibdev_err_ratelimited( + aq->efa_dev, + "The device sent a completion but the driver didn't receive any MSI-X interrupt for admin cmd %s(%d) status %d (ctx: 0x%p, sq producer: %d, sq consumer: %d, cq consumer: %d)\n", + efa_com_cmd_str(comp_ctx->cmd_opcode), + comp_ctx->cmd_opcode, comp_ctx->status, + comp_ctx, aq->sq.pc, aq->sq.cc, aq->cq.cc); + else + ibdev_err_ratelimited( + aq->efa_dev, + "The device didn't send any completion for admin cmd %s(%d) status %d (ctx 0x%p, sq producer: %d, sq consumer: %d, cq consumer: %d)\n", + efa_com_cmd_str(comp_ctx->cmd_opcode), + comp_ctx->cmd_opcode, comp_ctx->status, + comp_ctx, aq->sq.pc, aq->sq.cc, aq->cq.cc); + + clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); + err = -ETIME; + goto out; + } + + err = efa_com_comp_status_to_errno(comp_ctx->user_cqe->acq_common_descriptor.status); +out: + efa_com_put_comp_ctx(aq, comp_ctx); + return err; +} + +/* + * There are two types to wait for completion. + * Polling mode - wait until the completion is available. + * Async mode - wait on wait queue until the completion is ready + * (or the timeout expired). + * It is expected that the IRQ called efa_com_handle_admin_completion + * to mark the completions. + */ +static int efa_com_wait_and_process_admin_cq(struct efa_comp_ctx *comp_ctx, + struct efa_com_admin_queue *aq) +{ + if (test_bit(EFA_AQ_STATE_POLLING_BIT, &aq->state)) + return efa_com_wait_and_process_admin_cq_polling(comp_ctx, aq); + + return efa_com_wait_and_process_admin_cq_interrupts(comp_ctx, aq); +} + +/** + * efa_com_cmd_exec - Execute admin command + * @aq: admin queue. + * @cmd: the admin command to execute. + * @cmd_size: the command size. + * @comp: command completion return entry. + * @comp_size: command completion size. + * Submit an admin command and then wait until the device will return a + * completion. + * The completion will be copied into comp. + * + * @return - 0 on success, negative value on failure. 
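+ *
+ * Illustrative caller sketch (the EQ create/destroy helpers later in
+ * this file follow the same cast-and-exec pattern):
+ *
+ *	struct efa_admin_destroy_eq_resp resp = {};
+ *	struct efa_admin_destroy_eq_cmd cmd = {};
+ *	int err;
+ *
+ *	cmd.aq_common_descriptor.opcode = EFA_ADMIN_DESTROY_EQ;
+ *	cmd.eqn = eqn;
+ *	err = efa_com_cmd_exec(aq, (struct efa_admin_aq_entry *)&cmd,
+ *			       sizeof(cmd),
+ *			       (struct efa_admin_acq_entry *)&resp,
+ *			       sizeof(resp));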
+ */ +int efa_com_cmd_exec(struct efa_com_admin_queue *aq, + struct efa_admin_aq_entry *cmd, + size_t cmd_size, + struct efa_admin_acq_entry *comp, + size_t comp_size) +{ + struct efa_comp_ctx *comp_ctx; + int err; + + might_sleep(); + + /* In case of queue FULL */ + down(&aq->avail_cmds); + + ibdev_dbg(aq->efa_dev, "%s (opcode %d)\n", + efa_com_cmd_str(cmd->aq_common_descriptor.opcode), + cmd->aq_common_descriptor.opcode); + comp_ctx = efa_com_submit_admin_cmd(aq, cmd, cmd_size, comp, comp_size); + if (IS_ERR(comp_ctx)) { + ibdev_err_ratelimited( + aq->efa_dev, + "Failed to submit command %s (opcode %u) err %ld\n", + efa_com_cmd_str(cmd->aq_common_descriptor.opcode), + cmd->aq_common_descriptor.opcode, PTR_ERR(comp_ctx)); + + up(&aq->avail_cmds); + atomic64_inc(&aq->stats.cmd_err); + return PTR_ERR(comp_ctx); + } + + err = efa_com_wait_and_process_admin_cq(comp_ctx, aq); + if (err) { + ibdev_err_ratelimited( + aq->efa_dev, + "Failed to process command %s (opcode %u) comp_status %d err %d\n", + efa_com_cmd_str(cmd->aq_common_descriptor.opcode), + cmd->aq_common_descriptor.opcode, + comp_ctx->user_cqe->acq_common_descriptor.status, err); + atomic64_inc(&aq->stats.cmd_err); + } + + up(&aq->avail_cmds); + + return err; +} + +/** + * efa_com_admin_destroy - Destroy the admin and the async events queues. + * @edev: EFA communication layer struct + */ +void efa_com_admin_destroy(struct efa_com_dev *edev) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_com_aenq *aenq = &edev->aenq; + struct efa_com_admin_cq *cq = &aq->cq; + struct efa_com_admin_sq *sq = &aq->sq; + u16 size; + + clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); + + devm_kfree(edev->dmadev, aq->comp_ctx_pool); + devm_kfree(edev->dmadev, aq->comp_ctx); + + size = aq->depth * sizeof(*sq->entries); + dma_free_coherent(edev->dmadev, size, sq->entries, sq->dma_addr); + + size = aq->depth * sizeof(*cq->entries); + dma_free_coherent(edev->dmadev, size, cq->entries, cq->dma_addr); + + size = aenq->depth * sizeof(*aenq->entries); + dma_free_coherent(edev->dmadev, size, aenq->entries, aenq->dma_addr); +} + +/** + * efa_com_set_admin_polling_mode - Set the admin completion queue polling mode + * @edev: EFA communication layer struct + * @polling: Enable/Disable polling mode + * + * Set the admin completion mode. + */ +void efa_com_set_admin_polling_mode(struct efa_com_dev *edev, bool polling) +{ + u32 mask_value = 0; + + if (polling) + EFA_SET(&mask_value, EFA_REGS_INTR_MASK_EN, 1); + + writel(mask_value, edev->reg_bar + EFA_REGS_INTR_MASK_OFF); + if (polling) + set_bit(EFA_AQ_STATE_POLLING_BIT, &edev->aq.state); + else + clear_bit(EFA_AQ_STATE_POLLING_BIT, &edev->aq.state); +} + +static void efa_com_stats_init(struct efa_com_dev *edev) +{ + atomic64_t *s = (atomic64_t *)&edev->aq.stats; + int i; + + for (i = 0; i < sizeof(edev->aq.stats) / sizeof(*s); i++, s++) + atomic64_set(s, 0); +} + +/** + * efa_com_admin_init - Init the admin and the async queues + * @edev: EFA communication layer struct + * @aenq_handlers: Those handlers to be called upon event. + * + * Initialize the admin submission and completion queues. + * Initialize the asynchronous events notification queues. + * + * @return - 0 on success, negative value on failure. 
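+ *
+ * A sketch of the expected probe-time ordering, assuming the caller
+ * already mapped reg_bar and set up dmadev/efa_dev; the MMIO readless
+ * mechanism must be live before this function reads DEV_STS:
+ *
+ *	err = efa_com_mmio_reg_read_init(edev);
+ *	if (!err)
+ *		err = efa_com_admin_init(edev, &aenq_handlers);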
+ */ +int efa_com_admin_init(struct efa_com_dev *edev, + struct efa_aenq_handlers *aenq_handlers) +{ + struct efa_com_admin_queue *aq = &edev->aq; + u32 timeout; + u32 dev_sts; + u32 cap; + int err; + + dev_sts = efa_com_reg_read32(edev, EFA_REGS_DEV_STS_OFF); + if (!EFA_GET(&dev_sts, EFA_REGS_DEV_STS_READY)) { + ibdev_err(edev->efa_dev, + "Device isn't ready, abort com init %#x\n", dev_sts); + return -ENODEV; + } + + aq->depth = EFA_ADMIN_QUEUE_DEPTH; + + aq->dmadev = edev->dmadev; + aq->efa_dev = edev->efa_dev; + set_bit(EFA_AQ_STATE_POLLING_BIT, &aq->state); + + sema_init(&aq->avail_cmds, aq->depth); + + efa_com_stats_init(edev); + + err = efa_com_init_comp_ctxt(aq); + if (err) + return err; + + err = efa_com_admin_init_sq(edev); + if (err) + goto err_destroy_comp_ctxt; + + err = efa_com_admin_init_cq(edev); + if (err) + goto err_destroy_sq; + + efa_com_set_admin_polling_mode(edev, false); + + err = efa_com_admin_init_aenq(edev, aenq_handlers); + if (err) + goto err_destroy_cq; + + cap = efa_com_reg_read32(edev, EFA_REGS_CAPS_OFF); + timeout = EFA_GET(&cap, EFA_REGS_CAPS_ADMIN_CMD_TO); + if (timeout) + /* the resolution of timeout reg is 100ms */ + aq->completion_timeout = timeout * 100000; + else + aq->completion_timeout = ADMIN_CMD_TIMEOUT_US; + + aq->poll_interval = EFA_POLL_INTERVAL_MS; + + set_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); + + return 0; + +err_destroy_cq: + dma_free_coherent(edev->dmadev, aq->depth * sizeof(*aq->cq.entries), + aq->cq.entries, aq->cq.dma_addr); +err_destroy_sq: + dma_free_coherent(edev->dmadev, aq->depth * sizeof(*aq->sq.entries), + aq->sq.entries, aq->sq.dma_addr); +err_destroy_comp_ctxt: + devm_kfree(edev->dmadev, aq->comp_ctx); + + return err; +} + +/** + * efa_com_admin_q_comp_intr_handler - admin queue interrupt handler + * @edev: EFA communication layer struct + * + * This method goes over the admin completion queue and wakes up + * all the pending threads that wait on the commands wait event. + * + * Note: Should be called after MSI-X interrupt. + */ +void efa_com_admin_q_comp_intr_handler(struct efa_com_dev *edev) +{ + unsigned long flags; + + spin_lock_irqsave(&edev->aq.cq.lock, flags); + efa_com_handle_admin_completion(&edev->aq); + spin_unlock_irqrestore(&edev->aq.cq.lock, flags); +} + +/* + * efa_handle_specific_aenq_event: + * return the handler that is relevant to the specific event group + */ +static efa_aenq_handler efa_com_get_specific_aenq_cb(struct efa_com_dev *edev, + u16 group) +{ + struct efa_aenq_handlers *aenq_handlers = edev->aenq.aenq_handlers; + + if (group < EFA_MAX_HANDLERS && aenq_handlers->handlers[group]) + return aenq_handlers->handlers[group]; + + return aenq_handlers->unimplemented_handler; +} + +/** + * efa_com_aenq_intr_handler - AENQ interrupt handler + * @edev: EFA communication layer struct + * @data: Data of interrupt handler. + * + * Go over the async event notification queue and call the proper aenq handler. 
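+ *
+ * A minimal sketch of driving both admin completions and AENQ events
+ * from the management interrupt (hypothetical handler name; @data is
+ * forwarded as-is to the per-group callbacks):
+ *
+ *	static irqreturn_t efa_intr_msix_mgmnt(int irq, void *data)
+ *	{
+ *		struct efa_dev *dev = data;
+ *
+ *		efa_com_admin_q_comp_intr_handler(&dev->edev);
+ *		efa_com_aenq_intr_handler(&dev->edev, data);
+ *		return IRQ_HANDLED;
+ *	}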
+ */ +void efa_com_aenq_intr_handler(struct efa_com_dev *edev, void *data) +{ + struct efa_admin_aenq_common_desc *aenq_common; + struct efa_com_aenq *aenq = &edev->aenq; + struct efa_admin_aenq_entry *aenq_e; + efa_aenq_handler handler_cb; + u32 processed = 0; + u8 phase; + u32 ci; + + ci = aenq->cc & (aenq->depth - 1); + phase = aenq->phase; + aenq_e = &aenq->entries[ci]; /* Get first entry */ + aenq_common = &aenq_e->aenq_common_desc; + + /* Go over all the events */ + while ((READ_ONCE(aenq_common->flags) & + EFA_ADMIN_AENQ_COMMON_DESC_PHASE_MASK) == phase) { + /* + * Do not read the rest of the completion entry before the + * phase bit was validated + */ + dma_rmb(); + + /* Handle specific event*/ + handler_cb = efa_com_get_specific_aenq_cb(edev, + aenq_common->group); + handler_cb(data, aenq_e); /* call the actual event handler*/ + + /* Get next event entry */ + ci++; + processed++; + + if (ci == aenq->depth) { + ci = 0; + phase = !phase; + } + aenq_e = &aenq->entries[ci]; + aenq_common = &aenq_e->aenq_common_desc; + } + + aenq->cc += processed; + aenq->phase = phase; + + /* Don't update aenq doorbell if there weren't any processed events */ + if (!processed) + return; + + /* barrier not needed in case of writel */ + writel(aenq->cc, edev->reg_bar + EFA_REGS_AENQ_CONS_DB_OFF); +} + +static void efa_com_mmio_reg_read_resp_addr_init(struct efa_com_dev *edev) +{ + struct efa_com_mmio_read *mmio_read = &edev->mmio_read; + u32 addr_high; + u32 addr_low; + + /* dma_addr_bits is unknown at this point */ + addr_high = (mmio_read->read_resp_dma_addr >> 32) & GENMASK(31, 0); + addr_low = mmio_read->read_resp_dma_addr & GENMASK(31, 0); + + writel(addr_high, edev->reg_bar + EFA_REGS_MMIO_RESP_HI_OFF); + writel(addr_low, edev->reg_bar + EFA_REGS_MMIO_RESP_LO_OFF); +} + +int efa_com_mmio_reg_read_init(struct efa_com_dev *edev) +{ + struct efa_com_mmio_read *mmio_read = &edev->mmio_read; + + spin_lock_init(&mmio_read->lock); + mmio_read->read_resp = + dma_alloc_coherent(edev->dmadev, sizeof(*mmio_read->read_resp), + &mmio_read->read_resp_dma_addr, GFP_KERNEL); + if (!mmio_read->read_resp) + return -ENOMEM; + + efa_com_mmio_reg_read_resp_addr_init(edev); + + mmio_read->read_resp->req_id = 0; + mmio_read->seq_num = 0; + mmio_read->mmio_read_timeout = EFA_REG_READ_TIMEOUT_US; + + return 0; +} + +void efa_com_mmio_reg_read_destroy(struct efa_com_dev *edev) +{ + struct efa_com_mmio_read *mmio_read = &edev->mmio_read; + + dma_free_coherent(edev->dmadev, sizeof(*mmio_read->read_resp), + mmio_read->read_resp, mmio_read->read_resp_dma_addr); +} + +int efa_com_validate_version(struct efa_com_dev *edev) +{ + u32 min_ctrl_ver = 0; + u32 ctrl_ver_masked; + u32 min_ver = 0; + u32 ctrl_ver; + u32 ver; + + /* + * Make sure the EFA version and the controller version are at least + * as the driver expects + */ + ver = efa_com_reg_read32(edev, EFA_REGS_VERSION_OFF); + ctrl_ver = efa_com_reg_read32(edev, + EFA_REGS_CONTROLLER_VERSION_OFF); + + ibdev_dbg(edev->efa_dev, "efa device version: %d.%d\n", + EFA_GET(&ver, EFA_REGS_VERSION_MAJOR_VERSION), + EFA_GET(&ver, EFA_REGS_VERSION_MINOR_VERSION)); + + EFA_SET(&min_ver, EFA_REGS_VERSION_MAJOR_VERSION, + EFA_ADMIN_API_VERSION_MAJOR); + EFA_SET(&min_ver, EFA_REGS_VERSION_MINOR_VERSION, + EFA_ADMIN_API_VERSION_MINOR); + if (ver < min_ver) { + ibdev_err( + edev->efa_dev, + "EFA version is lower than the minimal version the driver supports\n"); + return -EOPNOTSUPP; + } + + ibdev_dbg( + edev->efa_dev, + "efa controller version: %d.%d.%d implementation version %d\n", + 
EFA_GET(&ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION), + EFA_GET(&ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION), + EFA_GET(&ctrl_ver, + EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION), + EFA_GET(&ctrl_ver, EFA_REGS_CONTROLLER_VERSION_IMPL_ID)); + + ctrl_ver_masked = + EFA_GET(&ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION) | + EFA_GET(&ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION) | + EFA_GET(&ctrl_ver, + EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION); + + EFA_SET(&min_ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION, + EFA_CTRL_MAJOR); + EFA_SET(&min_ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION, + EFA_CTRL_MINOR); + EFA_SET(&min_ctrl_ver, EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION, + EFA_CTRL_SUB_MINOR); + /* Validate the ctrl version without the implementation ID */ + if (ctrl_ver_masked < min_ctrl_ver) { + ibdev_err( + edev->efa_dev, + "EFA ctrl version is lower than the minimal ctrl version the driver supports\n"); + return -EOPNOTSUPP; + } + + return 0; +} + +/** + * efa_com_get_dma_width - Retrieve physical dma address width the device + * supports. + * @edev: EFA communication layer struct + * + * Retrieve the maximum physical address bits the device can handle. + * + * @return: > 0 on Success and negative value otherwise. + */ +int efa_com_get_dma_width(struct efa_com_dev *edev) +{ + u32 caps = efa_com_reg_read32(edev, EFA_REGS_CAPS_OFF); + int width; + + width = EFA_GET(&caps, EFA_REGS_CAPS_DMA_ADDR_WIDTH); + + ibdev_dbg(edev->efa_dev, "DMA width: %d\n", width); + + if (width < 32 || width > 64) { + ibdev_err(edev->efa_dev, "DMA width illegal value: %d\n", + width); + return -EINVAL; + } + + edev->dma_addr_bits = width; + + return width; +} + +static int wait_for_reset_state(struct efa_com_dev *edev, u32 timeout, int on) +{ + u32 val, i; + + for (i = 0; i < timeout; i++) { + val = efa_com_reg_read32(edev, EFA_REGS_DEV_STS_OFF); + + if (EFA_GET(&val, EFA_REGS_DEV_STS_RESET_IN_PROGRESS) == on) + return 0; + + ibdev_dbg(edev->efa_dev, "Reset indication val %d\n", val); + msleep(EFA_POLL_INTERVAL_MS); + } + + return -ETIME; +} + +/** + * efa_com_dev_reset - Perform device FLR to the device. + * @edev: EFA communication layer struct + * @reset_reason: Specify what is the trigger for the reset in case of an error. + * + * @return - 0 on success, negative value on failure. 
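+ *
+ * Note: a successful reset clears device state; this routine restores the
+ * MMIO readless response address itself, but the admin queue is expected
+ * to be (re)initialized by the caller before further admin commands.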
+ */ +int efa_com_dev_reset(struct efa_com_dev *edev, + enum efa_regs_reset_reason_types reset_reason) +{ + u32 stat, timeout, cap; + u32 reset_val = 0; + int err; + + stat = efa_com_reg_read32(edev, EFA_REGS_DEV_STS_OFF); + cap = efa_com_reg_read32(edev, EFA_REGS_CAPS_OFF); + + if (!EFA_GET(&stat, EFA_REGS_DEV_STS_READY)) { + ibdev_err(edev->efa_dev, + "Device isn't ready, can't reset device\n"); + return -EINVAL; + } + + timeout = EFA_GET(&cap, EFA_REGS_CAPS_RESET_TIMEOUT); + if (!timeout) { + ibdev_err(edev->efa_dev, "Invalid timeout value\n"); + return -EINVAL; + } + + /* start reset */ + EFA_SET(&reset_val, EFA_REGS_DEV_CTL_DEV_RESET, 1); + EFA_SET(&reset_val, EFA_REGS_DEV_CTL_RESET_REASON, reset_reason); + writel(reset_val, edev->reg_bar + EFA_REGS_DEV_CTL_OFF); + + /* reset clears the mmio readless address, restore it */ + efa_com_mmio_reg_read_resp_addr_init(edev); + + err = wait_for_reset_state(edev, timeout, 1); + if (err) { + ibdev_err(edev->efa_dev, "Reset indication didn't turn on\n"); + return err; + } + + /* reset done */ + writel(0, edev->reg_bar + EFA_REGS_DEV_CTL_OFF); + err = wait_for_reset_state(edev, timeout, 0); + if (err) { + ibdev_err(edev->efa_dev, "Reset indication didn't turn off\n"); + return err; + } + + timeout = EFA_GET(&cap, EFA_REGS_CAPS_ADMIN_CMD_TO); + if (timeout) + /* the resolution of timeout reg is 100ms */ + edev->aq.completion_timeout = timeout * 100000; + else + edev->aq.completion_timeout = ADMIN_CMD_TIMEOUT_US; + + return 0; +} + +static int efa_com_create_eq(struct efa_com_dev *edev, + struct efa_com_create_eq_params *params, + struct efa_com_create_eq_result *result) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_create_eq_resp resp = {}; + struct efa_admin_create_eq_cmd cmd = {}; + int err; + + cmd.aq_common_descriptor.opcode = EFA_ADMIN_CREATE_EQ; + EFA_SET(&cmd.caps, EFA_ADMIN_CREATE_EQ_CMD_ENTRY_SIZE_WORDS, + params->entry_size_in_bytes / 4); + cmd.depth = params->depth; + cmd.event_bitmask = params->event_bitmask; + cmd.msix_vec = params->msix_vec; + + efa_com_set_dma_addr(params->dma_addr, &cmd.ba.mem_addr_high, + &cmd.ba.mem_addr_low); + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to create eq[%d]\n", err); + return err; + } + + result->eqn = resp.eqn; + + return 0; +} + +static void efa_com_destroy_eq(struct efa_com_dev *edev, + struct efa_com_destroy_eq_params *params) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_destroy_eq_resp resp = {}; + struct efa_admin_destroy_eq_cmd cmd = {}; + int err; + + cmd.aq_common_descriptor.opcode = EFA_ADMIN_DESTROY_EQ; + cmd.eqn = params->eqn; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) + ibdev_err_ratelimited(edev->efa_dev, + "Failed to destroy EQ-%u [%d]\n", cmd.eqn, + err); +} + +static void efa_com_arm_eq(struct efa_com_dev *edev, struct efa_com_eq *eeq) +{ + u32 val = 0; + + EFA_SET(&val, EFA_REGS_EQ_DB_EQN, eeq->eqn); + EFA_SET(&val, EFA_REGS_EQ_DB_ARM, 1); + + writel(val, edev->reg_bar + EFA_REGS_EQ_DB_OFF); +} + +void efa_com_eq_comp_intr_handler(struct efa_com_dev *edev, + struct efa_com_eq *eeq) +{ + struct efa_admin_eqe *eqe; + u32 processed = 0; + u8 phase; + u32 ci; + + ci = eeq->cc & (eeq->depth - 1); + phase = eeq->phase; + eqe = &eeq->eqes[ci]; + + /* Go over all the events */ + while 
((READ_ONCE(eqe->common) & EFA_ADMIN_EQE_PHASE_MASK) == phase) {
+		/*
+		 * Do not read the rest of the completion entry before the
+		 * phase bit was validated
+		 */
+		dma_rmb();
+
+		eeq->cb(eeq, eqe);
+
+		/* Get next event entry */
+		ci++;
+		processed++;
+
+		if (ci == eeq->depth) {
+			ci = 0;
+			phase = !phase;
+		}
+
+		eqe = &eeq->eqes[ci];
+	}
+
+	eeq->cc += processed;
+	eeq->phase = phase;
+	efa_com_arm_eq(eeq->edev, eeq);
+}
+
+void efa_com_eq_destroy(struct efa_com_dev *edev, struct efa_com_eq *eeq)
+{
+	struct efa_com_destroy_eq_params params = {
+		.eqn = eeq->eqn,
+	};
+
+	efa_com_destroy_eq(edev, &params);
+	dma_free_coherent(edev->dmadev, eeq->depth * sizeof(*eeq->eqes),
+			  eeq->eqes, eeq->dma_addr);
+}
+
+int efa_com_eq_init(struct efa_com_dev *edev, struct efa_com_eq *eeq,
+		    efa_eqe_handler cb, u16 depth, u8 msix_vec)
+{
+	struct efa_com_create_eq_params params = {};
+	struct efa_com_create_eq_result result = {};
+	int err;
+
+	params.depth = depth;
+	params.entry_size_in_bytes = sizeof(*eeq->eqes);
+	EFA_SET(&params.event_bitmask,
+		EFA_ADMIN_CREATE_EQ_CMD_COMPLETION_EVENTS, 1);
+	params.msix_vec = msix_vec;
+
+	eeq->eqes = dma_alloc_coherent(edev->dmadev,
+				       params.depth * sizeof(*eeq->eqes),
+				       &params.dma_addr, GFP_KERNEL);
+	if (!eeq->eqes)
+		return -ENOMEM;
+
+	err = efa_com_create_eq(edev, &params, &result);
+	if (err)
+		goto err_free_coherent;
+
+	eeq->eqn = result.eqn;
+	eeq->edev = edev;
+	eeq->dma_addr = params.dma_addr;
+	eeq->phase = 1;
+	eeq->depth = params.depth;
+	eeq->cb = cb;
+	efa_com_arm_eq(edev, eeq);
+
+	return 0;
+
+err_free_coherent:
+	dma_free_coherent(edev->dmadev, params.depth * sizeof(*eeq->eqes),
+			  eeq->eqes, params.dma_addr);
+	return err;
+}
diff --git a/drivers/amazon/net/efa/efa_com.h b/drivers/amazon/net/efa/efa_com.h
new file mode 100644
index 0000000000000..bced7c3981792
--- /dev/null
+++ b/drivers/amazon/net/efa/efa_com.h
@@ -0,0 +1,181 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
+/*
+ * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#ifndef _EFA_COM_H_
+#define _EFA_COM_H_
+
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/gfp.h>
+#include <linux/sched.h>
+
+#include <linux/semaphore.h>
+#include "kcompat.h"
+
+#include "efa_common_defs.h"
+#include "efa_admin_defs.h"
+#include "efa_admin_cmds_defs.h"
+#include "efa_regs_defs.h"
+
+#define EFA_MAX_HANDLERS 256
+
+struct efa_com_admin_cq {
+	struct efa_admin_acq_entry *entries;
+	dma_addr_t dma_addr;
+	spinlock_t lock; /* Protects ACQ */
+
+	u16 cc; /* consumer counter */
+	u8 phase;
+};
+
+struct efa_com_admin_sq {
+	struct efa_admin_aq_entry *entries;
+	dma_addr_t dma_addr;
+	spinlock_t lock; /* Protects ASQ */
+
+	u32 __iomem *db_addr;
+
+	u16 cc; /* consumer counter */
+	u16 pc; /* producer counter */
+	u8 phase;
+};
+
+/* Don't use anything other than atomic64 */
+struct efa_com_stats_admin {
+	atomic64_t submitted_cmd;
+	atomic64_t completed_cmd;
+	atomic64_t cmd_err;
+	atomic64_t no_completion;
+};
+
+enum {
+	EFA_AQ_STATE_RUNNING_BIT = 0,
+	EFA_AQ_STATE_POLLING_BIT = 1,
+};
+
+struct efa_com_admin_queue {
+	void *dmadev;
+	void *efa_dev;
+	struct efa_comp_ctx *comp_ctx;
+	u32 completion_timeout; /* usecs */
+	u16 poll_interval; /* msecs */
+	u16 depth;
+	struct efa_com_admin_cq cq;
+	struct efa_com_admin_sq sq;
+	u16 msix_vector_idx;
+
+	unsigned long state;
+
+	/* Count the number of available admin commands */
+	struct semaphore avail_cmds;
+
+	struct efa_com_stats_admin stats;
+
+	spinlock_t comp_ctx_lock; /* Protects completion context pool */
+	u32 *comp_ctx_pool;
+	u16 comp_ctx_pool_next;
+};
+
+struct efa_aenq_handlers;
+struct efa_com_eq;
+typedef void (*efa_eqe_handler)(struct efa_com_eq *eeq,
+				struct efa_admin_eqe *eqe);
+
+struct efa_com_aenq {
+	struct efa_admin_aenq_entry *entries;
+	struct efa_aenq_handlers *aenq_handlers;
+	dma_addr_t dma_addr;
+	u32 cc; /* consumer counter */
+	u16 msix_vector_idx;
+	u16 depth;
+	u8 phase;
+};
+
+struct efa_com_mmio_read {
+	struct efa_admin_mmio_req_read_less_resp *read_resp;
+	dma_addr_t read_resp_dma_addr;
+	u16 seq_num;
+	u16 mmio_read_timeout; /* usecs */
+	/* serializes mmio reads */
+	spinlock_t lock;
+};
+
+struct efa_com_dev {
+	struct efa_com_admin_queue aq;
+	struct efa_com_aenq aenq;
+	u8 __iomem *reg_bar;
+	void *dmadev;
+	void *efa_dev;
+	u32 supported_features;
+	u32 dma_addr_bits;
+
+	struct efa_com_mmio_read mmio_read;
+};
+
+struct efa_com_eq {
+	struct efa_com_dev *edev;
+	struct efa_admin_eqe *eqes;
+	dma_addr_t dma_addr;
+	u32 cc; /* Consumer counter */
+	u16 eqn;
+	u16 depth;
+	u8 phase;
+	efa_eqe_handler cb;
+};
+
+struct efa_com_create_eq_params {
+	dma_addr_t dma_addr;
+	u32 event_bitmask;
+	u16 depth;
+	u8 entry_size_in_bytes;
+	u8 msix_vec;
+};
+
+struct efa_com_create_eq_result {
+	u16 eqn;
+};
+
+struct efa_com_destroy_eq_params {
+	u16 eqn;
+};
+
+typedef void (*efa_aenq_handler)(void *data,
+				 struct efa_admin_aenq_entry *aenq_e);
+
+/* Holds aenq handlers.
Indexed by AENQ event group */ +struct efa_aenq_handlers { + efa_aenq_handler handlers[EFA_MAX_HANDLERS]; + efa_aenq_handler unimplemented_handler; +}; + +void efa_com_set_dma_addr(dma_addr_t addr, u32 *addr_high, u32 *addr_low); +int efa_com_admin_init(struct efa_com_dev *edev, + struct efa_aenq_handlers *aenq_handlers); +void efa_com_admin_destroy(struct efa_com_dev *edev); +int efa_com_eq_init(struct efa_com_dev *edev, struct efa_com_eq *eeq, + efa_eqe_handler cb, u16 depth, u8 msix_vec); +void efa_com_eq_destroy(struct efa_com_dev *edev, struct efa_com_eq *eeq); +int efa_com_dev_reset(struct efa_com_dev *edev, + enum efa_regs_reset_reason_types reset_reason); +void efa_com_set_admin_polling_mode(struct efa_com_dev *edev, bool polling); +void efa_com_admin_q_comp_intr_handler(struct efa_com_dev *edev); +int efa_com_mmio_reg_read_init(struct efa_com_dev *edev); +void efa_com_mmio_reg_read_destroy(struct efa_com_dev *edev); + +int efa_com_validate_version(struct efa_com_dev *edev); +int efa_com_get_dma_width(struct efa_com_dev *edev); + +int efa_com_cmd_exec(struct efa_com_admin_queue *aq, + struct efa_admin_aq_entry *cmd, + size_t cmd_size, + struct efa_admin_acq_entry *comp, + size_t comp_size); +void efa_com_aenq_intr_handler(struct efa_com_dev *edev, void *data); +void efa_com_eq_comp_intr_handler(struct efa_com_dev *edev, + struct efa_com_eq *eeq); + +#endif /* _EFA_COM_H_ */ diff --git a/drivers/amazon/net/efa/efa_com_cmd.c b/drivers/amazon/net/efa/efa_com_cmd.c new file mode 100644 index 0000000000000..e107c354bc349 --- /dev/null +++ b/drivers/amazon/net/efa/efa_com_cmd.c @@ -0,0 +1,801 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause +/* + * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#include "efa_com.h" +#include "efa_com_cmd.h" + +int efa_com_create_qp(struct efa_com_dev *edev, + struct efa_com_create_qp_params *params, + struct efa_com_create_qp_result *res) +{ + struct efa_admin_create_qp_cmd create_qp_cmd = {}; + struct efa_admin_create_qp_resp cmd_completion; + struct efa_com_admin_queue *aq = &edev->aq; + int err; + + create_qp_cmd.aq_common_desc.opcode = EFA_ADMIN_CREATE_QP; + + create_qp_cmd.pd = params->pd; + create_qp_cmd.qp_type = params->qp_type; + create_qp_cmd.rq_base_addr = params->rq_base_addr; + create_qp_cmd.send_cq_idx = params->send_cq_idx; + create_qp_cmd.recv_cq_idx = params->recv_cq_idx; + create_qp_cmd.qp_alloc_size.send_queue_ring_size = + params->sq_ring_size_in_bytes; + create_qp_cmd.qp_alloc_size.send_queue_depth = + params->sq_depth; + create_qp_cmd.qp_alloc_size.recv_queue_ring_size = + params->rq_ring_size_in_bytes; + create_qp_cmd.qp_alloc_size.recv_queue_depth = + params->rq_depth; + create_qp_cmd.uar = params->uarn; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&create_qp_cmd, + sizeof(create_qp_cmd), + (struct efa_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to create qp [%d]\n", err); + return err; + } + + res->qp_handle = cmd_completion.qp_handle; + res->qp_num = cmd_completion.qp_num; + res->sq_db_offset = cmd_completion.sq_db_offset; + res->rq_db_offset = cmd_completion.rq_db_offset; + res->llq_descriptors_offset = cmd_completion.llq_descriptors_offset; + res->send_sub_cq_idx = cmd_completion.send_sub_cq_idx; + res->recv_sub_cq_idx = cmd_completion.recv_sub_cq_idx; + + return 0; +} + +int efa_com_modify_qp(struct efa_com_dev *edev, + struct efa_com_modify_qp_params *params) +{ + struct 
efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_modify_qp_cmd cmd = {}; + struct efa_admin_modify_qp_resp resp; + int err; + + cmd.aq_common_desc.opcode = EFA_ADMIN_MODIFY_QP; + cmd.modify_mask = params->modify_mask; + cmd.qp_handle = params->qp_handle; + cmd.qp_state = params->qp_state; + cmd.cur_qp_state = params->cur_qp_state; + cmd.qkey = params->qkey; + cmd.sq_psn = params->sq_psn; + cmd.sq_drained_async_notify = params->sq_drained_async_notify; + cmd.rnr_retry = params->rnr_retry; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) { + ibdev_err_ratelimited( + edev->efa_dev, + "Failed to modify qp-%u modify_mask[%#x] [%d]\n", + cmd.qp_handle, cmd.modify_mask, err); + return err; + } + + return 0; +} + +int efa_com_query_qp(struct efa_com_dev *edev, + struct efa_com_query_qp_params *params, + struct efa_com_query_qp_result *result) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_query_qp_cmd cmd = {}; + struct efa_admin_query_qp_resp resp; + int err; + + cmd.aq_common_desc.opcode = EFA_ADMIN_QUERY_QP; + cmd.qp_handle = params->qp_handle; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to query qp-%u [%d]\n", + cmd.qp_handle, err); + return err; + } + + result->qp_state = resp.qp_state; + result->qkey = resp.qkey; + result->sq_draining = resp.sq_draining; + result->sq_psn = resp.sq_psn; + result->rnr_retry = resp.rnr_retry; + + return 0; +} + +int efa_com_destroy_qp(struct efa_com_dev *edev, + struct efa_com_destroy_qp_params *params) +{ + struct efa_admin_destroy_qp_resp cmd_completion; + struct efa_admin_destroy_qp_cmd qp_cmd = {}; + struct efa_com_admin_queue *aq = &edev->aq; + int err; + + qp_cmd.aq_common_desc.opcode = EFA_ADMIN_DESTROY_QP; + qp_cmd.qp_handle = params->qp_handle; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&qp_cmd, + sizeof(qp_cmd), + (struct efa_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to destroy qp-%u [%d]\n", + qp_cmd.qp_handle, err); + return err; + } + + return 0; +} + +int efa_com_create_cq(struct efa_com_dev *edev, + struct efa_com_create_cq_params *params, + struct efa_com_create_cq_result *result) +{ + struct efa_admin_create_cq_resp cmd_completion = {}; + struct efa_admin_create_cq_cmd create_cmd = {}; + struct efa_com_admin_queue *aq = &edev->aq; + int err; + + create_cmd.aq_common_desc.opcode = EFA_ADMIN_CREATE_CQ; + EFA_SET(&create_cmd.cq_caps_2, + EFA_ADMIN_CREATE_CQ_CMD_CQ_ENTRY_SIZE_WORDS, + params->entry_size_in_bytes / 4); + create_cmd.cq_depth = params->cq_depth; + create_cmd.num_sub_cqs = params->num_sub_cqs; + create_cmd.uar = params->uarn; + if (params->interrupt_mode_enabled) { + EFA_SET(&create_cmd.cq_caps_1, + EFA_ADMIN_CREATE_CQ_CMD_INTERRUPT_MODE_ENABLED, 1); + create_cmd.eqn = params->eqn; + } + if (params->set_src_addr) { + EFA_SET(&create_cmd.cq_caps_2, + EFA_ADMIN_CREATE_CQ_CMD_SET_SRC_ADDR, 1); + } + efa_com_set_dma_addr(params->dma_addr, + &create_cmd.cq_ba.mem_addr_high, + &create_cmd.cq_ba.mem_addr_low); + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&create_cmd, + sizeof(create_cmd), + (struct efa_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to create cq[%d]\n", 
err); + return err; + } + + result->cq_idx = cmd_completion.cq_idx; + result->actual_depth = params->cq_depth; + result->db_off = cmd_completion.db_offset; + result->db_valid = EFA_GET(&cmd_completion.flags, + EFA_ADMIN_CREATE_CQ_RESP_DB_VALID); + + return 0; +} + +int efa_com_destroy_cq(struct efa_com_dev *edev, + struct efa_com_destroy_cq_params *params) +{ + struct efa_admin_destroy_cq_cmd destroy_cmd = {}; + struct efa_admin_destroy_cq_resp destroy_resp; + struct efa_com_admin_queue *aq = &edev->aq; + int err; + + destroy_cmd.cq_idx = params->cq_idx; + destroy_cmd.aq_common_desc.opcode = EFA_ADMIN_DESTROY_CQ; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&destroy_cmd, + sizeof(destroy_cmd), + (struct efa_admin_acq_entry *)&destroy_resp, + sizeof(destroy_resp)); + + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to destroy CQ-%u [%d]\n", + params->cq_idx, err); + return err; + } + + return 0; +} + +int efa_com_register_mr(struct efa_com_dev *edev, + struct efa_com_reg_mr_params *params, + struct efa_com_reg_mr_result *result) +{ + struct efa_admin_reg_mr_resp cmd_completion; + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_reg_mr_cmd mr_cmd = {}; + int err; + + mr_cmd.aq_common_desc.opcode = EFA_ADMIN_REG_MR; + mr_cmd.pd = params->pd; + mr_cmd.mr_length = params->mr_length_in_bytes; + EFA_SET(&mr_cmd.flags, EFA_ADMIN_REG_MR_CMD_PHYS_PAGE_SIZE_SHIFT, + params->page_shift); + mr_cmd.iova = params->iova; + mr_cmd.permissions = params->permissions; + + if (params->inline_pbl) { + memcpy(mr_cmd.pbl.inline_pbl_array, + params->pbl.inline_pbl_array, + sizeof(mr_cmd.pbl.inline_pbl_array)); + } else { + mr_cmd.pbl.pbl.length = params->pbl.pbl.length; + mr_cmd.pbl.pbl.address.mem_addr_low = + params->pbl.pbl.address.mem_addr_low; + mr_cmd.pbl.pbl.address.mem_addr_high = + params->pbl.pbl.address.mem_addr_high; + EFA_SET(&mr_cmd.aq_common_desc.flags, + EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA, 1); + if (params->indirect) + EFA_SET(&mr_cmd.aq_common_desc.flags, + EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT, 1); + } + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&mr_cmd, + sizeof(mr_cmd), + (struct efa_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to register mr [%d]\n", err); + return err; + } + + result->l_key = cmd_completion.l_key; + result->r_key = cmd_completion.r_key; + + return 0; +} + +int efa_com_dereg_mr(struct efa_com_dev *edev, + struct efa_com_dereg_mr_params *params) +{ + struct efa_admin_dereg_mr_resp cmd_completion; + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_dereg_mr_cmd mr_cmd = {}; + int err; + + mr_cmd.aq_common_desc.opcode = EFA_ADMIN_DEREG_MR; + mr_cmd.l_key = params->l_key; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&mr_cmd, + sizeof(mr_cmd), + (struct efa_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (err) { + ibdev_err_ratelimited( + edev->efa_dev, + "Failed to de-register mr(lkey-%u) [%d]\n", + mr_cmd.l_key, err); + return err; + } + + return 0; +} + +int efa_com_create_ah(struct efa_com_dev *edev, + struct efa_com_create_ah_params *params, + struct efa_com_create_ah_result *result) +{ + struct efa_admin_create_ah_resp cmd_completion; + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_create_ah_cmd ah_cmd = {}; + int err; + + ah_cmd.aq_common_desc.opcode = EFA_ADMIN_CREATE_AH; + + memcpy(ah_cmd.dest_addr, params->dest_addr, sizeof(ah_cmd.dest_addr)); + ah_cmd.pd = 
params->pdn; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&ah_cmd, + sizeof(ah_cmd), + (struct efa_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to create ah for %pI6 [%d]\n", + ah_cmd.dest_addr, err); + return err; + } + + result->ah = cmd_completion.ah; + + return 0; +} + +int efa_com_destroy_ah(struct efa_com_dev *edev, + struct efa_com_destroy_ah_params *params) +{ + struct efa_admin_destroy_ah_resp cmd_completion; + struct efa_admin_destroy_ah_cmd ah_cmd = {}; + struct efa_com_admin_queue *aq = &edev->aq; + int err; + + ah_cmd.aq_common_desc.opcode = EFA_ADMIN_DESTROY_AH; + ah_cmd.ah = params->ah; + ah_cmd.pd = params->pdn; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&ah_cmd, + sizeof(ah_cmd), + (struct efa_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to destroy ah-%d pd-%d [%d]\n", + ah_cmd.ah, ah_cmd.pd, err); + return err; + } + + return 0; +} + +bool +efa_com_check_supported_feature_id(struct efa_com_dev *edev, + enum efa_admin_aq_feature_id feature_id) +{ + u32 feature_mask = 1 << feature_id; + + /* Device attributes is always supported */ + if (feature_id != EFA_ADMIN_DEVICE_ATTR && + !(edev->supported_features & feature_mask)) + return false; + + return true; +} + +static int efa_com_get_feature_ex(struct efa_com_dev *edev, + struct efa_admin_get_feature_resp *get_resp, + enum efa_admin_aq_feature_id feature_id, + dma_addr_t control_buf_dma_addr, + u32 control_buff_size) +{ + struct efa_admin_get_feature_cmd get_cmd = {}; + struct efa_com_admin_queue *aq; + int err; + + if (!efa_com_check_supported_feature_id(edev, feature_id)) { + ibdev_err_ratelimited(edev->efa_dev, + "Feature %d isn't supported\n", + feature_id); + return -EOPNOTSUPP; + } + + aq = &edev->aq; + + get_cmd.aq_common_descriptor.opcode = EFA_ADMIN_GET_FEATURE; + + if (control_buff_size) + EFA_SET(&get_cmd.aq_common_descriptor.flags, + EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA, 1); + + efa_com_set_dma_addr(control_buf_dma_addr, + &get_cmd.control_buffer.address.mem_addr_high, + &get_cmd.control_buffer.address.mem_addr_low); + + get_cmd.control_buffer.length = control_buff_size; + get_cmd.feature_common.feature_id = feature_id; + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *) + &get_cmd, + sizeof(get_cmd), + (struct efa_admin_acq_entry *) + get_resp, + sizeof(*get_resp)); + + if (err) { + ibdev_err_ratelimited( + edev->efa_dev, + "Failed to submit get_feature command %d [%d]\n", + feature_id, err); + return err; + } + + return 0; +} + +static int efa_com_get_feature(struct efa_com_dev *edev, + struct efa_admin_get_feature_resp *get_resp, + enum efa_admin_aq_feature_id feature_id) +{ + return efa_com_get_feature_ex(edev, get_resp, feature_id, 0, 0); +} + +int efa_com_get_device_attr(struct efa_com_dev *edev, + struct efa_com_get_device_attr_result *result) +{ + struct efa_admin_get_feature_resp resp; + int err; + + err = efa_com_get_feature(edev, &resp, EFA_ADMIN_DEVICE_ATTR); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to get device attributes %d\n", + err); + return err; + } + + result->page_size_cap = resp.u.device_attr.page_size_cap; + result->fw_version = resp.u.device_attr.fw_version; + result->admin_api_version = resp.u.device_attr.admin_api_version; + result->device_version = resp.u.device_attr.device_version; + result->supported_features = resp.u.device_attr.supported_features; + 
result->phys_addr_width = resp.u.device_attr.phys_addr_width; + result->virt_addr_width = resp.u.device_attr.virt_addr_width; + result->db_bar = resp.u.device_attr.db_bar; + result->max_rdma_size = resp.u.device_attr.max_rdma_size; + result->device_caps = resp.u.device_attr.device_caps; + + if (result->admin_api_version < 1) { + ibdev_err_ratelimited( + edev->efa_dev, + "Failed to get device attr api version [%u < 1]\n", + result->admin_api_version); + return -EINVAL; + } + + edev->supported_features = resp.u.device_attr.supported_features; + err = efa_com_get_feature(edev, &resp, + EFA_ADMIN_QUEUE_ATTR); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to get queue attributes %d\n", + err); + return err; + } + + result->max_qp = resp.u.queue_attr.max_qp; + result->max_sq_depth = resp.u.queue_attr.max_sq_depth; + result->max_rq_depth = resp.u.queue_attr.max_rq_depth; + result->max_cq = resp.u.queue_attr.max_cq; + result->max_cq_depth = resp.u.queue_attr.max_cq_depth; + result->inline_buf_size = resp.u.queue_attr.inline_buf_size; + result->max_sq_sge = resp.u.queue_attr.max_wr_send_sges; + result->max_rq_sge = resp.u.queue_attr.max_wr_recv_sges; + result->max_mr = resp.u.queue_attr.max_mr; + result->max_mr_pages = resp.u.queue_attr.max_mr_pages; + result->max_pd = resp.u.queue_attr.max_pd; + result->max_ah = resp.u.queue_attr.max_ah; + result->max_llq_size = resp.u.queue_attr.max_llq_size; + result->sub_cqs_per_cq = resp.u.queue_attr.sub_cqs_per_cq; + result->max_wr_rdma_sge = resp.u.queue_attr.max_wr_rdma_sges; + result->max_tx_batch = resp.u.queue_attr.max_tx_batch; + result->min_sq_depth = resp.u.queue_attr.min_sq_depth; + + err = efa_com_get_feature(edev, &resp, EFA_ADMIN_NETWORK_ATTR); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to get network attributes %d\n", + err); + return err; + } + + memcpy(result->addr, resp.u.network_attr.addr, + sizeof(resp.u.network_attr.addr)); + result->mtu = resp.u.network_attr.mtu; + + if (efa_com_check_supported_feature_id(edev, + EFA_ADMIN_EVENT_QUEUE_ATTR)) { + err = efa_com_get_feature(edev, &resp, + EFA_ADMIN_EVENT_QUEUE_ATTR); + if (err) { + ibdev_err_ratelimited( + edev->efa_dev, + "Failed to get event queue attributes %d\n", + err); + return err; + } + + result->max_eq = resp.u.event_queue_attr.max_eq; + result->max_eq_depth = resp.u.event_queue_attr.max_eq_depth; + result->event_bitmask = resp.u.event_queue_attr.event_bitmask; + } + + return 0; +} + +int efa_com_get_hw_hints(struct efa_com_dev *edev, + struct efa_com_get_hw_hints_result *result) +{ + struct efa_admin_get_feature_resp resp; + int err; + + err = efa_com_get_feature(edev, &resp, EFA_ADMIN_HW_HINTS); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to get hw hints %d\n", err); + return err; + } + + result->admin_completion_timeout = resp.u.hw_hints.admin_completion_timeout; + result->driver_watchdog_timeout = resp.u.hw_hints.driver_watchdog_timeout; + result->mmio_read_timeout = resp.u.hw_hints.mmio_read_timeout; + result->poll_interval = resp.u.hw_hints.poll_interval; + + return 0; +} + +int efa_com_set_feature_ex(struct efa_com_dev *edev, + struct efa_admin_set_feature_resp *set_resp, + struct efa_admin_set_feature_cmd *set_cmd, + enum efa_admin_aq_feature_id feature_id, + dma_addr_t control_buf_dma_addr, + u32 control_buff_size) +{ + struct efa_com_admin_queue *aq; + int err; + + if (!efa_com_check_supported_feature_id(edev, feature_id)) { + ibdev_err_ratelimited(edev->efa_dev, + "Feature %d isn't supported\n", + feature_id); + return 
-EOPNOTSUPP; + } + + aq = &edev->aq; + + set_cmd->aq_common_descriptor.opcode = EFA_ADMIN_SET_FEATURE; + if (control_buff_size) { + set_cmd->aq_common_descriptor.flags = 0; + EFA_SET(&set_cmd->aq_common_descriptor.flags, + EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA, 1); + efa_com_set_dma_addr(control_buf_dma_addr, + &set_cmd->control_buffer.address.mem_addr_high, + &set_cmd->control_buffer.address.mem_addr_low); + } + + set_cmd->control_buffer.length = control_buff_size; + set_cmd->feature_common.feature_id = feature_id; + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)set_cmd, + sizeof(*set_cmd), + (struct efa_admin_acq_entry *)set_resp, + sizeof(*set_resp)); + + if (err) { + ibdev_err_ratelimited( + edev->efa_dev, + "Failed to submit set_feature command %d error: %d\n", + feature_id, err); + return err; + } + + return 0; +} + +static int efa_com_set_feature(struct efa_com_dev *edev, + struct efa_admin_set_feature_resp *set_resp, + struct efa_admin_set_feature_cmd *set_cmd, + enum efa_admin_aq_feature_id feature_id) +{ + return efa_com_set_feature_ex(edev, set_resp, set_cmd, feature_id, + 0, 0); +} + +int efa_com_set_aenq_config(struct efa_com_dev *edev, u32 groups) +{ + struct efa_admin_get_feature_resp get_resp; + struct efa_admin_set_feature_resp set_resp; + struct efa_admin_set_feature_cmd cmd = {}; + int err; + + ibdev_dbg(edev->efa_dev, "Configuring aenq with groups[%#x]\n", groups); + + err = efa_com_get_feature(edev, &get_resp, EFA_ADMIN_AENQ_CONFIG); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to get aenq attributes: %d\n", + err); + return err; + } + + ibdev_dbg(edev->efa_dev, + "Get aenq groups: supported[%#x] enabled[%#x]\n", + get_resp.u.aenq.supported_groups, + get_resp.u.aenq.enabled_groups); + + if ((get_resp.u.aenq.supported_groups & groups) != groups) { + ibdev_err_ratelimited( + edev->efa_dev, + "Trying to set unsupported aenq groups[%#x] supported[%#x]\n", + groups, get_resp.u.aenq.supported_groups); + return -EOPNOTSUPP; + } + + cmd.u.aenq.enabled_groups = groups; + err = efa_com_set_feature(edev, &set_resp, &cmd, + EFA_ADMIN_AENQ_CONFIG); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to set aenq attributes: %d\n", + err); + return err; + } + + return 0; +} + +int efa_com_alloc_pd(struct efa_com_dev *edev, + struct efa_com_alloc_pd_result *result) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_alloc_pd_cmd cmd = {}; + struct efa_admin_alloc_pd_resp resp; + int err; + + cmd.aq_common_descriptor.opcode = EFA_ADMIN_ALLOC_PD; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to allocate pd[%d]\n", err); + return err; + } + + result->pdn = resp.pd; + + return 0; +} + +int efa_com_dealloc_pd(struct efa_com_dev *edev, + struct efa_com_dealloc_pd_params *params) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_dealloc_pd_cmd cmd = {}; + struct efa_admin_dealloc_pd_resp resp; + int err; + + cmd.aq_common_descriptor.opcode = EFA_ADMIN_DEALLOC_PD; + cmd.pd = params->pdn; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to deallocate pd-%u [%d]\n", + cmd.pd, err); + return err; + } + + return 0; +} + +int efa_com_alloc_uar(struct efa_com_dev *edev, + struct efa_com_alloc_uar_result *result) +{ + struct 
efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_alloc_uar_cmd cmd = {}; + struct efa_admin_alloc_uar_resp resp; + int err; + + cmd.aq_common_descriptor.opcode = EFA_ADMIN_ALLOC_UAR; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to allocate uar[%d]\n", err); + return err; + } + + result->uarn = resp.uar; + + return 0; +} + +int efa_com_dealloc_uar(struct efa_com_dev *edev, + struct efa_com_dealloc_uar_params *params) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_dealloc_uar_cmd cmd = {}; + struct efa_admin_dealloc_uar_resp resp; + int err; + + cmd.aq_common_descriptor.opcode = EFA_ADMIN_DEALLOC_UAR; + cmd.uar = params->uarn; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to deallocate uar-%u [%d]\n", + cmd.uar, err); + return err; + } + + return 0; +} + +int efa_com_get_stats(struct efa_com_dev *edev, + struct efa_com_get_stats_params *params, + union efa_com_get_stats_result *result) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_aq_get_stats_cmd cmd = {}; + struct efa_admin_acq_get_stats_resp resp; + int err; + + cmd.aq_common_descriptor.opcode = EFA_ADMIN_GET_STATS; + cmd.type = params->type; + cmd.scope = params->scope; + cmd.scope_modifier = params->scope_modifier; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) { + ibdev_err_ratelimited( + edev->efa_dev, + "Failed to get stats type-%u scope-%u.%u [%d]\n", + cmd.type, cmd.scope, cmd.scope_modifier, err); + return err; + } + + switch (cmd.type) { + case EFA_ADMIN_GET_STATS_TYPE_BASIC: + result->basic_stats.tx_bytes = resp.u.basic_stats.tx_bytes; + result->basic_stats.tx_pkts = resp.u.basic_stats.tx_pkts; + result->basic_stats.rx_bytes = resp.u.basic_stats.rx_bytes; + result->basic_stats.rx_pkts = resp.u.basic_stats.rx_pkts; + result->basic_stats.rx_drops = resp.u.basic_stats.rx_drops; + break; + case EFA_ADMIN_GET_STATS_TYPE_MESSAGES: + result->messages_stats.send_bytes = resp.u.messages_stats.send_bytes; + result->messages_stats.send_wrs = resp.u.messages_stats.send_wrs; + result->messages_stats.recv_bytes = resp.u.messages_stats.recv_bytes; + result->messages_stats.recv_wrs = resp.u.messages_stats.recv_wrs; + break; + case EFA_ADMIN_GET_STATS_TYPE_RDMA_READ: + result->rdma_read_stats.read_wrs = resp.u.rdma_read_stats.read_wrs; + result->rdma_read_stats.read_bytes = resp.u.rdma_read_stats.read_bytes; + result->rdma_read_stats.read_wr_err = resp.u.rdma_read_stats.read_wr_err; + result->rdma_read_stats.read_resp_bytes = resp.u.rdma_read_stats.read_resp_bytes; + break; + } + + return 0; +} diff --git a/drivers/amazon/net/efa/efa_com_cmd.h b/drivers/amazon/net/efa/efa_com_cmd.h new file mode 100644 index 0000000000000..0898ad5bc3405 --- /dev/null +++ b/drivers/amazon/net/efa/efa_com_cmd.h @@ -0,0 +1,322 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#ifndef _EFA_COM_CMD_H_ +#define _EFA_COM_CMD_H_ + +#include "efa_com.h" + +#define EFA_GID_SIZE 16 + +struct efa_com_create_qp_params { + u64 rq_base_addr; + u32 send_cq_idx; + u32 recv_cq_idx; + /* + * Send descriptor ring size in bytes, + * sufficient for user-provided number of WQEs and SGL size + */ + u32 sq_ring_size_in_bytes; + /* Max number of WQEs that will be posted on send queue */ + u32 sq_depth; + /* Recv descriptor ring size in bytes */ + u32 rq_ring_size_in_bytes; + u32 rq_depth; + u16 pd; + u16 uarn; + u8 qp_type; +}; + +struct efa_com_create_qp_result { + u32 qp_handle; + u32 qp_num; + u32 sq_db_offset; + u32 rq_db_offset; + u32 llq_descriptors_offset; + u16 send_sub_cq_idx; + u16 recv_sub_cq_idx; +}; + +struct efa_com_modify_qp_params { + u32 modify_mask; + u32 qp_handle; + u32 qp_state; + u32 cur_qp_state; + u32 qkey; + u32 sq_psn; + u8 sq_drained_async_notify; + u8 rnr_retry; +}; + +struct efa_com_query_qp_params { + u32 qp_handle; +}; + +struct efa_com_query_qp_result { + u32 qp_state; + u32 qkey; + u32 sq_draining; + u32 sq_psn; + u8 rnr_retry; +}; + +struct efa_com_destroy_qp_params { + u32 qp_handle; +}; + +struct efa_com_create_cq_params { + /* cq physical base address in OS memory */ + dma_addr_t dma_addr; + /* completion queue depth in # of entries */ + u16 cq_depth; + u16 num_sub_cqs; + u16 uarn; + u16 eqn; + u8 entry_size_in_bytes; + u8 interrupt_mode_enabled : 1; + u8 set_src_addr : 1; +}; + +struct efa_com_create_cq_result { + /* cq identifier */ + u16 cq_idx; + /* actual cq depth in # of entries */ + u16 actual_depth; + u32 db_off; + bool db_valid; +}; + +struct efa_com_destroy_cq_params { + u16 cq_idx; +}; + +struct efa_com_create_ah_params { + u16 pdn; + /* Destination address in network byte order */ + u8 dest_addr[EFA_GID_SIZE]; +}; + +struct efa_com_create_ah_result { + u16 ah; +}; + +struct efa_com_destroy_ah_params { + u16 ah; + u16 pdn; +}; + +struct efa_com_get_device_attr_result { + u8 addr[EFA_GID_SIZE]; + u64 page_size_cap; + u64 max_mr_pages; + u32 mtu; + u32 fw_version; + u32 admin_api_version; + u32 device_version; + u32 supported_features; + u32 phys_addr_width; + u32 virt_addr_width; + u32 max_qp; + u32 max_sq_depth; /* wqes */ + u32 max_rq_depth; /* wqes */ + u32 max_cq; + u32 max_cq_depth; /* cqes */ + u32 inline_buf_size; + u32 max_mr; + u32 max_pd; + u32 max_ah; + u32 max_llq_size; + u32 max_rdma_size; + u32 device_caps; + u32 max_eq; + u32 max_eq_depth; + u32 event_bitmask; /* EQ events bitmask */ + u16 sub_cqs_per_cq; + u16 max_sq_sge; + u16 max_rq_sge; + u16 max_wr_rdma_sge; + u16 max_tx_batch; + u16 min_sq_depth; + u8 db_bar; +}; + +struct efa_com_get_hw_hints_result { + u16 mmio_read_timeout; + u16 driver_watchdog_timeout; + u16 admin_completion_timeout; + u16 poll_interval; + u32 reserved[4]; +}; + +struct efa_com_mem_addr { + u32 mem_addr_low; + u32 mem_addr_high; +}; + +/* Used at indirect mode page list chunks for chaining */ +struct efa_com_ctrl_buff_info { + /* indicates length of the buffer pointed by control_buffer_address. */ + u32 length; + /* points to control buffer (direct or indirect) */ + struct efa_com_mem_addr address; +}; + +struct efa_com_reg_mr_params { + /* Memory region length, in bytes. */ + u64 mr_length_in_bytes; + /* IO Virtual Address associated with this MR. */ + u64 iova; + /* words 8:15: Physical Buffer List, each element is page-aligned. 
*/ + union { + /* + * Inline array of physical addresses of app pages + * (optimization for short region reservations) + */ + u64 inline_pbl_array[4]; + /* + * Describes the next physically contiguous chunk of indirect + * page list. A page list contains physical addresses of command + * data pages. Data pages are 4KB; page list chunks are + * variable-sized. + */ + struct efa_com_ctrl_buff_info pbl; + } pbl; + /* number of pages in PBL (redundant, could be calculated) */ + u32 page_num; + /* Protection Domain */ + u16 pd; + /* + * phys_page_size_shift - page size is (1 << phys_page_size_shift) + * Page size is used for building the Virtual to Physical + * address mapping + */ + u8 page_shift; + /* see permissions field of struct efa_admin_reg_mr_cmd */ + u8 permissions; + u8 inline_pbl; + u8 indirect; +}; + +struct efa_com_reg_mr_result { + /* + * To be used in conjunction with local buffers references in SQ and + * RQ WQE + */ + u32 l_key; + /* + * To be used in incoming RDMA semantics messages to refer to remotely + * accessed memory region + */ + u32 r_key; +}; + +struct efa_com_dereg_mr_params { + u32 l_key; +}; + +struct efa_com_alloc_pd_result { + u16 pdn; +}; + +struct efa_com_dealloc_pd_params { + u16 pdn; +}; + +struct efa_com_alloc_uar_result { + u16 uarn; +}; + +struct efa_com_dealloc_uar_params { + u16 uarn; +}; + +struct efa_com_get_stats_params { + /* see enum efa_admin_get_stats_type */ + u8 type; + /* see enum efa_admin_get_stats_scope */ + u8 scope; + u16 scope_modifier; +}; + +struct efa_com_basic_stats { + u64 tx_bytes; + u64 tx_pkts; + u64 rx_bytes; + u64 rx_pkts; + u64 rx_drops; +}; + +struct efa_com_messages_stats { + u64 send_bytes; + u64 send_wrs; + u64 recv_bytes; + u64 recv_wrs; +}; + +struct efa_com_rdma_read_stats { + u64 read_wrs; + u64 read_bytes; + u64 read_wr_err; + u64 read_resp_bytes; +}; + +union efa_com_get_stats_result { + struct efa_com_basic_stats basic_stats; + struct efa_com_messages_stats messages_stats; + struct efa_com_rdma_read_stats rdma_read_stats; +}; + +int efa_com_create_qp(struct efa_com_dev *edev, + struct efa_com_create_qp_params *params, + struct efa_com_create_qp_result *res); +int efa_com_modify_qp(struct efa_com_dev *edev, + struct efa_com_modify_qp_params *params); +int efa_com_query_qp(struct efa_com_dev *edev, + struct efa_com_query_qp_params *params, + struct efa_com_query_qp_result *result); +int efa_com_destroy_qp(struct efa_com_dev *edev, + struct efa_com_destroy_qp_params *params); +int efa_com_create_cq(struct efa_com_dev *edev, + struct efa_com_create_cq_params *params, + struct efa_com_create_cq_result *result); +int efa_com_destroy_cq(struct efa_com_dev *edev, + struct efa_com_destroy_cq_params *params); +int efa_com_register_mr(struct efa_com_dev *edev, + struct efa_com_reg_mr_params *params, + struct efa_com_reg_mr_result *result); +int efa_com_dereg_mr(struct efa_com_dev *edev, + struct efa_com_dereg_mr_params *params); +int efa_com_create_ah(struct efa_com_dev *edev, + struct efa_com_create_ah_params *params, + struct efa_com_create_ah_result *result); +int efa_com_destroy_ah(struct efa_com_dev *edev, + struct efa_com_destroy_ah_params *params); +int efa_com_get_device_attr(struct efa_com_dev *edev, + struct efa_com_get_device_attr_result *result); +int efa_com_get_hw_hints(struct efa_com_dev *edev, + struct efa_com_get_hw_hints_result *result); +bool +efa_com_check_supported_feature_id(struct efa_com_dev *edev, + enum efa_admin_aq_feature_id feature_id); +int efa_com_set_feature_ex(struct efa_com_dev *edev, + struct 
efa_admin_set_feature_resp *set_resp,
+			   struct efa_admin_set_feature_cmd *set_cmd,
+			   enum efa_admin_aq_feature_id feature_id,
+			   dma_addr_t control_buf_dma_addr,
+			   u32 control_buff_size);
+int efa_com_set_aenq_config(struct efa_com_dev *edev, u32 groups);
+int efa_com_alloc_pd(struct efa_com_dev *edev,
+		     struct efa_com_alloc_pd_result *result);
+int efa_com_dealloc_pd(struct efa_com_dev *edev,
+		       struct efa_com_dealloc_pd_params *params);
+int efa_com_alloc_uar(struct efa_com_dev *edev,
+		      struct efa_com_alloc_uar_result *result);
+int efa_com_dealloc_uar(struct efa_com_dev *edev,
+			struct efa_com_dealloc_uar_params *params);
+int efa_com_get_stats(struct efa_com_dev *edev,
+		      struct efa_com_get_stats_params *params,
+		      union efa_com_get_stats_result *result);
+
+#endif /* _EFA_COM_CMD_H_ */
diff --git a/drivers/amazon/net/efa/efa_common_defs.h b/drivers/amazon/net/efa/efa_common_defs.h
new file mode 100644
index 0000000000000..bbcf48f0eaca4
--- /dev/null
+++ b/drivers/amazon/net/efa/efa_common_defs.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
+/*
+ * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#ifndef _EFA_COMMON_H_
+#define _EFA_COMMON_H_
+
+#ifdef HAVE_BITFIELD_H
+#include <linux/bitfield.h>
+#endif
+
+#define EFA_COMMON_SPEC_VERSION_MAJOR 2
+#define EFA_COMMON_SPEC_VERSION_MINOR 0
+
+#define EFA_GET(ptr, mask) FIELD_GET(mask##_MASK, *(ptr))
+
+#define EFA_SET(ptr, mask, value)                          \
+	({                                                 \
+		typeof(ptr) _ptr = ptr;                    \
+		*_ptr = (*_ptr & ~(mask##_MASK)) |         \
+			FIELD_PREP(mask##_MASK, value);    \
+	})
+
+struct efa_common_mem_addr {
+	u32 mem_addr_low;
+
+	u32 mem_addr_high;
+};
+
+#endif /* _EFA_COMMON_H_ */
diff --git a/drivers/amazon/net/efa/efa_gdr.c b/drivers/amazon/net/efa/efa_gdr.c
new file mode 100644
index 0000000000000..24f8a082d10d5
--- /dev/null
+++ b/drivers/amazon/net/efa/efa_gdr.c
@@ -0,0 +1,251 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright 2019-2021 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#include <linux/module.h>
+
+#include "efa_p2p.h"
+#include "nv-p2p.h"
+
+#define GPU_PAGE_SHIFT 16
+#define GPU_PAGE_SIZE BIT_ULL(GPU_PAGE_SHIFT)
+
+struct efa_nvmem_ops {
+	int (*get_pages)(u64 p2p_token, u32 va_space, u64 virtual_address,
+			 u64 length, struct nvidia_p2p_page_table **page_table,
+			 void (*free_callback)(void *data), void *data);
+	int (*dma_map_pages)(struct pci_dev *peer,
+			     struct nvidia_p2p_page_table *page_table,
+			     struct nvidia_p2p_dma_mapping **dma_mapping);
+	int (*put_pages)(u64 p2p_token, u32 va_space, u64 virtual_address,
+			 struct nvidia_p2p_page_table *page_table);
+	int (*dma_unmap_pages)(struct pci_dev *peer,
+			       struct nvidia_p2p_page_table *page_table,
+			       struct nvidia_p2p_dma_mapping *dma_mapping);
+};
+
+struct efa_nvmem {
+	struct efa_p2pmem p2pmem;
+	struct efa_nvmem_ops ops;
+	struct nvidia_p2p_page_table *pgtbl;
+	struct nvidia_p2p_dma_mapping *dma_mapping;
+	u64 virt_start;
+};
+
+static unsigned int nvmem_pgsz(struct efa_dev *dev, struct efa_p2pmem *p2pmem)
+{
+	struct efa_nvmem *nvmem;
+
+	nvmem = container_of(p2pmem, struct efa_nvmem, p2pmem);
+
+	switch (nvmem->pgtbl->page_size) {
+	case NVIDIA_P2P_PAGE_SIZE_4KB:
+		return SZ_4K;
+	case NVIDIA_P2P_PAGE_SIZE_64KB:
+		return SZ_64K;
+	case NVIDIA_P2P_PAGE_SIZE_128KB:
+		return SZ_128K;
+	default:
+		return 0;
+	}
+}
+
+static int nvmem_get_fp(struct efa_nvmem *nvmem)
+{
+	nvmem->ops.get_pages = symbol_get(nvidia_p2p_get_pages);
+	if (!nvmem->ops.get_pages)
+		goto err_out;
+
+	nvmem->ops.put_pages = symbol_get(nvidia_p2p_put_pages);
+	if (!nvmem->ops.put_pages)
+		goto err_put_get_pages;
+
+	nvmem->ops.dma_map_pages = symbol_get(nvidia_p2p_dma_map_pages);
+	if (!nvmem->ops.dma_map_pages)
+		goto err_put_put_pages;
+
+	nvmem->ops.dma_unmap_pages = symbol_get(nvidia_p2p_dma_unmap_pages);
+	if (!nvmem->ops.dma_unmap_pages)
+		goto err_put_dma_map_pages;
+
+	return 0;
+
+err_put_dma_map_pages:
+	symbol_put(nvidia_p2p_dma_map_pages);
+err_put_put_pages:
+	symbol_put(nvidia_p2p_put_pages);
+err_put_get_pages:
+	symbol_put(nvidia_p2p_get_pages);
+err_out:
+	return -EINVAL;
+}
+
+static void nvmem_put_fp(void)
+{
+	symbol_put(nvidia_p2p_dma_unmap_pages);
+	symbol_put(nvidia_p2p_dma_map_pages);
+	symbol_put(nvidia_p2p_put_pages);
+	symbol_put(nvidia_p2p_get_pages);
+}
+
+static void nvmem_free_cb(void *data)
+{
+	pr_debug("Free callback ticket %llu\n", (u64)data);
+	efa_p2p_put((u64)data, true);
+}
+
+static int nvmem_get_pages(struct efa_dev *dev, struct efa_nvmem *nvmem,
+			   u64 addr, u64 size, u64 ticket)
+{
+	int err;
+
+	err = nvmem->ops.get_pages(0, 0, addr, size, &nvmem->pgtbl,
+				   nvmem_free_cb, (void *)ticket);
+	if (err) {
+		ibdev_dbg(&dev->ibdev, "nvidia_p2p_get_pages failed %d\n", err);
+		return err;
+	}
+
+	if (!NVIDIA_P2P_PAGE_TABLE_VERSION_COMPATIBLE(nvmem->pgtbl)) {
+		ibdev_dbg(&dev->ibdev, "Incompatible page table version %#08x\n",
+			  nvmem->pgtbl->version);
+		nvmem->ops.put_pages(0, 0, addr, nvmem->pgtbl);
+		nvmem->pgtbl = NULL;
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int nvmem_dma_map(struct efa_dev *dev, struct efa_nvmem *nvmem)
+{
+	int err;
+
+	err = nvmem->ops.dma_map_pages(dev->pdev, nvmem->pgtbl,
+				       &nvmem->dma_mapping);
+	if (err) {
+		ibdev_dbg(&dev->ibdev, "nvidia_p2p_dma_map_pages failed %d\n",
+			  err);
+		return err;
+	}
+
+	if (!NVIDIA_P2P_DMA_MAPPING_VERSION_COMPATIBLE(nvmem->dma_mapping)) {
+		ibdev_dbg(&dev->ibdev, "Incompatible DMA mapping version %#08x\n",
+			  nvmem->dma_mapping->version);
+		nvmem->ops.dma_unmap_pages(dev->pdev, nvmem->pgtbl,
+					   nvmem->dma_mapping);
+		nvmem->dma_mapping = NULL;
+
return -EINVAL; + } + + return 0; +} + +static struct efa_p2pmem *nvmem_get(struct efa_dev *dev, u64 ticket, u64 start, + u64 length) +{ + struct efa_nvmem *nvmem; + u64 virt_start; + u64 virt_end; + u64 pinsz; + int err; + + nvmem = kzalloc(sizeof(*nvmem), GFP_KERNEL); + if (!nvmem) + return NULL; + + virt_start = ALIGN_DOWN(start, GPU_PAGE_SIZE); + virt_end = ALIGN(start + length, GPU_PAGE_SIZE); + pinsz = virt_end - virt_start; + nvmem->virt_start = virt_start; + + err = nvmem_get_fp(nvmem); + if (err) + /* Nvidia module is not loaded */ + goto err_free; + + err = nvmem_get_pages(dev, nvmem, virt_start, pinsz, ticket); + if (err) + /* Most likely not our pages */ + goto err_put_fp; + + err = nvmem_dma_map(dev, nvmem); + if (err) + goto err_put; + + return &nvmem->p2pmem; + +err_put: + nvmem->ops.put_pages(0, 0, virt_start, nvmem->pgtbl); +err_put_fp: + nvmem_put_fp(); +err_free: + kfree(nvmem); + return NULL; +} + +static int nvmem_to_page_list(struct efa_dev *dev, struct efa_p2pmem *p2pmem, + u64 *page_list) +{ + struct nvidia_p2p_dma_mapping *dma_mapping; + struct efa_nvmem *nvmem; + int i; + + nvmem = container_of(p2pmem, struct efa_nvmem, p2pmem); + dma_mapping = nvmem->dma_mapping; + + for (i = 0; i < dma_mapping->entries; i++) + page_list[i] = dma_mapping->dma_addresses[i]; + + return 0; +} + +static void nvmem_release(struct efa_dev *dev, struct efa_p2pmem *p2pmem, + bool in_cb) +{ + struct efa_nvmem *nvmem; + + nvmem = container_of(p2pmem, struct efa_nvmem, p2pmem); + + if (!in_cb) { + nvmem->ops.dma_unmap_pages(dev->pdev, nvmem->pgtbl, + nvmem->dma_mapping); + nvmem->ops.put_pages(0, 0, nvmem->virt_start, nvmem->pgtbl); + } + + nvmem_put_fp(); + kfree(nvmem); +} + +bool nvmem_is_supported(void) +{ + struct efa_nvmem dummynv = {}; + + if (nvmem_get_fp(&dummynv)) + return false; + nvmem_put_fp(); + + return true; +} + +struct nvmem_provider { + struct efa_p2p_provider p2p; +}; + +static const struct nvmem_provider prov = { + .p2p = { + .ops = { + .try_get = nvmem_get, + .to_page_list = nvmem_to_page_list, + .release = nvmem_release, + .get_page_size = nvmem_pgsz, + }, + .type = EFA_P2P_PROVIDER_NVMEM, + }, +}; + +const struct efa_p2p_provider *nvmem_get_provider(void) +{ + return &prov.p2p; +} diff --git a/drivers/amazon/net/efa/efa_io_defs.h b/drivers/amazon/net/efa/efa_io_defs.h new file mode 100644 index 0000000000000..17ba8984b11e9 --- /dev/null +++ b/drivers/amazon/net/efa/efa_io_defs.h @@ -0,0 +1,289 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2022 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#ifndef _EFA_IO_H_ +#define _EFA_IO_H_ + +#define EFA_IO_TX_DESC_NUM_BUFS 2 +#define EFA_IO_TX_DESC_NUM_RDMA_BUFS 1 +#define EFA_IO_TX_DESC_INLINE_MAX_SIZE 32 +#define EFA_IO_TX_DESC_IMM_DATA_SIZE 4 + +enum efa_io_queue_type { + /* send queue (of a QP) */ + EFA_IO_SEND_QUEUE = 1, + /* recv queue (of a QP) */ + EFA_IO_RECV_QUEUE = 2, +}; + +enum efa_io_send_op_type { + /* send message */ + EFA_IO_SEND = 0, + /* RDMA read */ + EFA_IO_RDMA_READ = 1, +}; + +enum efa_io_comp_status { + /* Successful completion */ + EFA_IO_COMP_STATUS_OK = 0, + /* Flushed during QP destroy */ + EFA_IO_COMP_STATUS_FLUSHED = 1, + /* Internal QP error */ + EFA_IO_COMP_STATUS_LOCAL_ERROR_QP_INTERNAL_ERROR = 2, + /* Bad operation type */ + EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_OP_TYPE = 3, + /* Bad AH */ + EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_AH = 4, + /* LKEY not registered or does not match IOVA */ + EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_LKEY = 5, + /* Message too long */ + EFA_IO_COMP_STATUS_LOCAL_ERROR_BAD_LENGTH = 6, + /* Destination ENI is down or does not run EFA */ + EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_ADDRESS = 7, + /* Connection was reset by remote side */ + EFA_IO_COMP_STATUS_REMOTE_ERROR_ABORT = 8, + /* Bad dest QP number (QP does not exist or is in error state) */ + EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_DEST_QPN = 9, + /* Destination resource not ready (no WQEs posted on RQ) */ + EFA_IO_COMP_STATUS_REMOTE_ERROR_RNR = 10, + /* Receiver SGL too short */ + EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_LENGTH = 11, + /* Unexpected status returned by responder */ + EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_STATUS = 12, + /* Unresponsive remote - detected locally */ + EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE = 13, +}; + +struct efa_io_tx_meta_desc { + /* Verbs-generated Request ID */ + u16 req_id; + + /* + * control flags + * 3:0 : op_type - operation type: send/rdma/fast mem + * ops/etc + * 4 : has_imm - immediate_data field carries valid + * data. + * 5 : inline_msg - inline mode - inline message data + * follows this descriptor (no buffer descriptors). + * Note that it is different from immediate data + * 6 : meta_extension - Extended metadata. MBZ + * 7 : meta_desc - Indicates metadata descriptor. + * Must be set. + */ + u8 ctrl1; + + /* + * control flags + * 0 : phase + * 1 : reserved25 - MBZ + * 2 : first - Indicates first descriptor in + * transaction. Must be set. + * 3 : last - Indicates last descriptor in + * transaction. Must be set. + * 4 : comp_req - Indicates whether completion should + * be posted, after packet is transmitted. Valid only + * for the first descriptor + * 7:5 : reserved29 - MBZ + */ + u8 ctrl2; + + u16 dest_qp_num; + + /* + * If inline_msg bit is set, length of inline message in bytes, + * otherwise length of SGL (number of buffers). + */ + u16 length; + + /* + * immediate data: if has_imm is set, then this field is included + * within Tx message and reported in remote Rx completion. + */ + u32 immediate_data; + + u16 ah; + + u16 reserved; + + /* Queue key */ + u32 qkey; + + u8 reserved2[12]; +}; + +/* + * Tx queue buffer descriptor, for any transport type. Preceded by metadata + * descriptor. 
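+ * Each descriptor maps one physically contiguous buffer; a gather list is
+ * expressed as up to EFA_IO_TX_DESC_NUM_BUFS consecutive buffer
+ * descriptors within a single WQE.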
+ */ +struct efa_io_tx_buf_desc { + /* length in bytes */ + u32 length; + + /* + * 23:0 : lkey - local memory translation key + * 31:24 : reserved - MBZ + */ + u32 lkey; + + /* Buffer address bits[31:0] */ + u32 buf_addr_lo; + + /* Buffer address bits[63:32] */ + u32 buf_addr_hi; +}; + +struct efa_io_remote_mem_addr { + /* length in bytes */ + u32 length; + + /* remote memory translation key */ + u32 rkey; + + /* Buffer address bits[31:0] */ + u32 buf_addr_lo; + + /* Buffer address bits[63:32] */ + u32 buf_addr_hi; +}; + +struct efa_io_rdma_req { + /* Remote memory address */ + struct efa_io_remote_mem_addr remote_mem; + + /* Local memory address */ + struct efa_io_tx_buf_desc local_mem[1]; +}; + +/* + * Tx WQE, composed of tx meta descriptors followed by either tx buffer + * descriptors or inline data + */ +struct efa_io_tx_wqe { + /* TX meta */ + struct efa_io_tx_meta_desc meta; + + union { + /* Send buffer descriptors */ + struct efa_io_tx_buf_desc sgl[2]; + + u8 inline_data[32]; + + /* RDMA local and remote memory addresses */ + struct efa_io_rdma_req rdma_req; + } data; +}; + +/* + * Rx buffer descriptor; RX WQE is composed of one or more RX buffer + * descriptors. + */ +struct efa_io_rx_desc { + /* Buffer address bits[31:0] */ + u32 buf_addr_lo; + + /* Buffer Pointer[63:32] */ + u32 buf_addr_hi; + + /* Verbs-generated request id. */ + u16 req_id; + + /* Length in bytes. */ + u16 length; + + /* + * LKey and control flags + * 23:0 : lkey + * 29:24 : reserved - MBZ + * 30 : first - Indicates first descriptor in WQE + * 31 : last - Indicates last descriptor in WQE + */ + u32 lkey_ctrl; +}; + +/* Common IO completion descriptor */ +struct efa_io_cdesc_common { + /* + * verbs-generated request ID, as provided in the completed tx or rx + * descriptor. + */ + u16 req_id; + + u8 status; + + /* + * flags + * 0 : phase - Phase bit + * 2:1 : q_type - enum efa_io_queue_type: send/recv + * 3 : has_imm - indicates that immediate data is + * present - for RX completions only + * 7:4 : reserved28 - MBZ + */ + u8 flags; + + /* local QP number */ + u16 qp_num; + + /* Transferred length */ + u16 length; +}; + +/* Tx completion descriptor */ +struct efa_io_tx_cdesc { + /* Common completion info */ + struct efa_io_cdesc_common common; +}; + +/* Rx Completion Descriptor */ +struct efa_io_rx_cdesc { + /* Common completion info */ + struct efa_io_cdesc_common common; + + /* Remote Address Handle FW index, 0xFFFF indicates invalid ah */ + u16 ah; + + u16 src_qp_num; + + /* Immediate data */ + u32 imm; +}; + +/* Extended Rx Completion Descriptor */ +struct efa_io_rx_cdesc_ex { + /* Base RX completion info */ + struct efa_io_rx_cdesc rx_cdesc_base; + + /* + * Valid only in case of unknown AH (0xFFFF) and CQ set_src_addr is + * enabled. 
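+ * The 16 bytes match EFA_GID_SIZE and carry the source address of the
+ * received packet.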
+ */
+	u8 src_addr[16];
+};
+
+/* tx_meta_desc */
+#define EFA_IO_TX_META_DESC_OP_TYPE_MASK GENMASK(3, 0)
+#define EFA_IO_TX_META_DESC_HAS_IMM_MASK BIT(4)
+#define EFA_IO_TX_META_DESC_INLINE_MSG_MASK BIT(5)
+#define EFA_IO_TX_META_DESC_META_EXTENSION_MASK BIT(6)
+#define EFA_IO_TX_META_DESC_META_DESC_MASK BIT(7)
+#define EFA_IO_TX_META_DESC_PHASE_MASK BIT(0)
+#define EFA_IO_TX_META_DESC_FIRST_MASK BIT(2)
+#define EFA_IO_TX_META_DESC_LAST_MASK BIT(3)
+#define EFA_IO_TX_META_DESC_COMP_REQ_MASK BIT(4)
+
+/* tx_buf_desc */
+#define EFA_IO_TX_BUF_DESC_LKEY_MASK GENMASK(23, 0)
+
+/* rx_desc */
+#define EFA_IO_RX_DESC_LKEY_MASK GENMASK(23, 0)
+#define EFA_IO_RX_DESC_FIRST_MASK BIT(30)
+#define EFA_IO_RX_DESC_LAST_MASK BIT(31)
+
+/* cdesc_common */
+#define EFA_IO_CDESC_COMMON_PHASE_MASK BIT(0)
+#define EFA_IO_CDESC_COMMON_Q_TYPE_MASK GENMASK(2, 1)
+#define EFA_IO_CDESC_COMMON_HAS_IMM_MASK BIT(3)
+
+#endif /* _EFA_IO_H_ */
diff --git a/drivers/amazon/net/efa/efa_main.c b/drivers/amazon/net/efa/efa_main.c
new file mode 100644
index 0000000000000..34a8e13273556
--- /dev/null
+++ b/drivers/amazon/net/efa/efa_main.c
@@ -0,0 +1,889 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
+/*
+ * Copyright 2018-2022 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/utsname.h>
+#include <linux/version.h>
+
+#include <rdma/ib_user_verbs.h>
+
+#include "efa.h"
+#include "efa_sysfs.h"
+
+#ifdef HAVE_EFA_P2P
+#include "efa_p2p.h"
+#endif
+
+#ifndef HAVE_PCI_VENDOR_ID_AMAZON
+#define PCI_VENDOR_ID_AMAZON 0x1d0f
+#endif
+#define PCI_DEV_ID_EFA0_VF 0xefa0
+#define PCI_DEV_ID_EFA1_VF 0xefa1
+#define PCI_DEV_ID_EFA2_VF 0xefa2
+
+static const struct pci_device_id efa_pci_tbl[] = {
+	{ PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA0_VF) },
+	{ PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA1_VF) },
+	{ PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA2_VF) },
+	{ }
+};
+
+#define DRV_MODULE_VER_MAJOR 2
+#define DRV_MODULE_VER_MINOR 1
+#define DRV_MODULE_VER_SUBMINOR 0
+
+#ifndef DRV_MODULE_VERSION
+#define DRV_MODULE_VERSION \
+	__stringify(DRV_MODULE_VER_MAJOR) "." \
+	__stringify(DRV_MODULE_VER_MINOR) "." \
+	__stringify(DRV_MODULE_VER_SUBMINOR) "g"
+#endif
+
+MODULE_VERSION(DRV_MODULE_VERSION);
+MODULE_SOFTDEP("pre: ib_uverbs");
+
+static char version[] = DEVICE_NAME " v" DRV_MODULE_VERSION;
+
+MODULE_AUTHOR("Amazon.com, Inc. or its affiliates");
or its affiliates"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION(DEVICE_NAME); +MODULE_DEVICE_TABLE(pci, efa_pci_tbl); + +#define EFA_REG_BAR 0 +#define EFA_MEM_BAR 2 +#define EFA_BASE_BAR_MASK (BIT(EFA_REG_BAR) | BIT(EFA_MEM_BAR)) + +#define EFA_AENQ_ENABLED_GROUPS \ + (BIT(EFA_ADMIN_FATAL_ERROR) | BIT(EFA_ADMIN_WARNING) | \ + BIT(EFA_ADMIN_NOTIFICATION) | BIT(EFA_ADMIN_KEEP_ALIVE)) + +/* This handler will called for unknown event group or unimplemented handlers */ +static void unimplemented_aenq_handler(void *data, + struct efa_admin_aenq_entry *aenq_e) +{ + struct efa_dev *dev = (struct efa_dev *)data; + + ibdev_err(&dev->ibdev, + "Unknown event was received or event with unimplemented handler\n"); +} + +static void efa_keep_alive(void *data, struct efa_admin_aenq_entry *aenq_e) +{ + struct efa_dev *dev = (struct efa_dev *)data; + + atomic64_inc(&dev->stats.keep_alive_rcvd); +} + +static struct efa_aenq_handlers aenq_handlers = { + .handlers = { + [EFA_ADMIN_KEEP_ALIVE] = efa_keep_alive, + }, + .unimplemented_handler = unimplemented_aenq_handler +}; + +static void efa_release_bars(struct efa_dev *dev, int bars_mask) +{ + struct pci_dev *pdev = dev->pdev; + int release_bars; + + release_bars = pci_select_bars(pdev, IORESOURCE_MEM) & bars_mask; + pci_release_selected_regions(pdev, release_bars); +} + +static void efa_process_comp_eqe(struct efa_dev *dev, struct efa_admin_eqe *eqe) +{ + u16 cqn = eqe->u.comp_event.cqn; + struct efa_cq *cq; + +#ifdef HAVE_XARRAY + /* Safe to load as we're in irq and removal calls synchronize_irq() */ + cq = xa_load(&dev->cqs_xa, cqn); +#else + cq = dev->cqs_arr[cqn]; +#endif + if (unlikely(!cq)) { + ibdev_err_ratelimited(&dev->ibdev, + "Completion event on non-existent CQ[%u]", + cqn); + return; + } + + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); +} + +static void efa_process_eqe(struct efa_com_eq *eeq, struct efa_admin_eqe *eqe) +{ + struct efa_dev *dev = container_of(eeq->edev, struct efa_dev, edev); + + if (likely(EFA_GET(&eqe->common, EFA_ADMIN_EQE_EVENT_TYPE) == + EFA_ADMIN_EQE_EVENT_TYPE_COMPLETION)) + efa_process_comp_eqe(dev, eqe); + else + ibdev_err_ratelimited(&dev->ibdev, + "Unknown event type received %lu", + EFA_GET(&eqe->common, + EFA_ADMIN_EQE_EVENT_TYPE)); +} + +static irqreturn_t efa_intr_msix_comp(int irq, void *data) +{ + struct efa_eq *eq = data; + struct efa_com_dev *edev = eq->eeq.edev; + + efa_com_eq_comp_intr_handler(edev, &eq->eeq); + + return IRQ_HANDLED; +} + +static irqreturn_t efa_intr_msix_mgmnt(int irq, void *data) +{ + struct efa_dev *dev = data; + + efa_com_admin_q_comp_intr_handler(&dev->edev); + efa_com_aenq_intr_handler(&dev->edev, data); + + return IRQ_HANDLED; +} + +static int efa_request_irq(struct efa_dev *dev, struct efa_irq *irq) +{ + int err; + + err = request_irq(irq->irqn, irq->handler, 0, irq->name, irq->data); + if (err) { + dev_err(&dev->pdev->dev, "Failed to request irq %s (%d)\n", + irq->name, err); + return err; + } + + irq_set_affinity_hint(irq->irqn, &irq->affinity_hint_mask); + + return 0; +} + +static void efa_setup_comp_irq(struct efa_dev *dev, struct efa_eq *eq, + int vector) +{ + u32 cpu; + + cpu = vector - EFA_COMP_EQS_VEC_BASE; + snprintf(eq->irq.name, EFA_IRQNAME_SIZE, "efa-comp%d@pci:%s", cpu, + pci_name(dev->pdev)); + eq->irq.handler = efa_intr_msix_comp; + eq->irq.data = eq; + eq->irq.vector = vector; + eq->irq.irqn = pci_irq_vector(dev->pdev, vector); + cpumask_set_cpu(cpu, &eq->irq.affinity_hint_mask); +} + +static void efa_free_irq(struct efa_dev *dev, struct efa_irq 
*irq) +{ + irq_set_affinity_hint(irq->irqn, NULL); + free_irq(irq->irqn, irq->data); +} + +static void efa_setup_mgmnt_irq(struct efa_dev *dev) +{ + u32 cpu; + + snprintf(dev->admin_irq.name, EFA_IRQNAME_SIZE, + "efa-mgmnt@pci:%s", pci_name(dev->pdev)); + dev->admin_irq.handler = efa_intr_msix_mgmnt; + dev->admin_irq.data = dev; + dev->admin_irq.vector = dev->admin_msix_vector_idx; + dev->admin_irq.irqn = pci_irq_vector(dev->pdev, + dev->admin_msix_vector_idx); + cpu = cpumask_first(cpu_online_mask); + cpumask_set_cpu(cpu, + &dev->admin_irq.affinity_hint_mask); + dev_info(&dev->pdev->dev, "Setup irq:%d name:%s\n", + dev->admin_irq.irqn, + dev->admin_irq.name); +} + +static int efa_set_mgmnt_irq(struct efa_dev *dev) +{ + efa_setup_mgmnt_irq(dev); + + return efa_request_irq(dev, &dev->admin_irq); +} + +static int efa_request_doorbell_bar(struct efa_dev *dev) +{ + u8 db_bar_idx = dev->dev_attr.db_bar; + struct pci_dev *pdev = dev->pdev; + int bars; + int err; + + if (!(BIT(db_bar_idx) & EFA_BASE_BAR_MASK)) { + bars = pci_select_bars(pdev, IORESOURCE_MEM) & BIT(db_bar_idx); + + err = pci_request_selected_regions(pdev, bars, DRV_MODULE_NAME); + if (err) { + dev_err(&dev->pdev->dev, + "pci_request_selected_regions for bar %d failed %d\n", + db_bar_idx, err); + return err; + } + } + + dev->db_bar_addr = pci_resource_start(dev->pdev, db_bar_idx); + dev->db_bar_len = pci_resource_len(dev->pdev, db_bar_idx); + + return 0; +} + +static void efa_release_doorbell_bar(struct efa_dev *dev) +{ + if (!(BIT(dev->dev_attr.db_bar) & EFA_BASE_BAR_MASK)) + efa_release_bars(dev, BIT(dev->dev_attr.db_bar)); +} + +static void efa_update_hw_hints(struct efa_dev *dev, + struct efa_com_get_hw_hints_result *hw_hints) +{ + struct efa_com_dev *edev = &dev->edev; + + if (hw_hints->mmio_read_timeout) + edev->mmio_read.mmio_read_timeout = + hw_hints->mmio_read_timeout * 1000; + + if (hw_hints->poll_interval) + edev->aq.poll_interval = hw_hints->poll_interval; + + if (hw_hints->admin_completion_timeout) + edev->aq.completion_timeout = + hw_hints->admin_completion_timeout; +} + +static void efa_stats_init(struct efa_dev *dev) +{ + atomic64_t *s = (atomic64_t *)&dev->stats; + int i; + + for (i = 0; i < sizeof(dev->stats) / sizeof(*s); i++, s++) + atomic64_set(s, 0); +} + +static void efa_set_host_info(struct efa_dev *dev) +{ + struct efa_admin_set_feature_resp resp = {}; + struct efa_admin_set_feature_cmd cmd = {}; + struct efa_admin_host_info *hinf; + u32 bufsz = sizeof(*hinf); + dma_addr_t hinf_dma; + + if (!efa_com_check_supported_feature_id(&dev->edev, + EFA_ADMIN_HOST_INFO)) + return; + + /* Failures in host info set shall not disturb probe */ + hinf = dma_alloc_coherent(&dev->pdev->dev, bufsz, &hinf_dma, + GFP_KERNEL); + if (!hinf) + return; + + strscpy(hinf->os_dist_str, utsname()->release, + sizeof(hinf->os_dist_str)); + hinf->os_type = EFA_ADMIN_OS_LINUX; + strscpy(hinf->kernel_ver_str, utsname()->version, + sizeof(hinf->kernel_ver_str)); + hinf->kernel_ver = LINUX_VERSION_CODE; + EFA_SET(&hinf->driver_ver, EFA_ADMIN_HOST_INFO_DRIVER_MAJOR, + DRV_MODULE_VER_MAJOR); + EFA_SET(&hinf->driver_ver, EFA_ADMIN_HOST_INFO_DRIVER_MINOR, + DRV_MODULE_VER_MINOR); + EFA_SET(&hinf->driver_ver, EFA_ADMIN_HOST_INFO_DRIVER_SUB_MINOR, + DRV_MODULE_VER_SUBMINOR); + EFA_SET(&hinf->driver_ver, EFA_ADMIN_HOST_INFO_DRIVER_MODULE_TYPE, + "g"[0]); + EFA_SET(&hinf->bdf, EFA_ADMIN_HOST_INFO_BUS, dev->pdev->bus->number); + EFA_SET(&hinf->bdf, EFA_ADMIN_HOST_INFO_DEVICE, + PCI_SLOT(dev->pdev->devfn)); + EFA_SET(&hinf->bdf, 
EFA_ADMIN_HOST_INFO_FUNCTION, + PCI_FUNC(dev->pdev->devfn)); + EFA_SET(&hinf->spec_ver, EFA_ADMIN_HOST_INFO_SPEC_MAJOR, + EFA_COMMON_SPEC_VERSION_MAJOR); + EFA_SET(&hinf->spec_ver, EFA_ADMIN_HOST_INFO_SPEC_MINOR, + EFA_COMMON_SPEC_VERSION_MINOR); +#ifdef HAVE_EFA_P2P + EFA_SET(&hinf->flags, EFA_ADMIN_HOST_INFO_GDR, 1); +#endif + + efa_com_set_feature_ex(&dev->edev, &resp, &cmd, EFA_ADMIN_HOST_INFO, + hinf_dma, bufsz); + + dma_free_coherent(&dev->pdev->dev, bufsz, hinf, hinf_dma); +} + +static void efa_destroy_eq(struct efa_dev *dev, struct efa_eq *eq) +{ + efa_com_eq_destroy(&dev->edev, &eq->eeq); + efa_free_irq(dev, &eq->irq); +} + +static int efa_create_eq(struct efa_dev *dev, struct efa_eq *eq, u8 msix_vec) +{ + int err; + + efa_setup_comp_irq(dev, eq, msix_vec); + err = efa_request_irq(dev, &eq->irq); + if (err) + return err; + + err = efa_com_eq_init(&dev->edev, &eq->eeq, efa_process_eqe, + dev->dev_attr.max_eq_depth, msix_vec); + if (err) + goto err_free_comp_irq; + + return 0; + +err_free_comp_irq: + efa_free_irq(dev, &eq->irq); + return err; +} + +static int efa_create_eqs(struct efa_dev *dev) +{ + unsigned int neqs = dev->dev_attr.max_eq; + int err; + int i; + + neqs = min_t(unsigned int, neqs, num_online_cpus()); + dev->neqs = neqs; + dev->eqs = kcalloc(neqs, sizeof(*dev->eqs), GFP_KERNEL); + if (!dev->eqs) + return -ENOMEM; + + for (i = 0; i < neqs; i++) { + err = efa_create_eq(dev, &dev->eqs[i], + i + EFA_COMP_EQS_VEC_BASE); + if (err) + goto err_destroy_eqs; + } + + return 0; + +err_destroy_eqs: + for (i--; i >= 0; i--) + efa_destroy_eq(dev, &dev->eqs[i]); + kfree(dev->eqs); + + return err; +} + +static void efa_destroy_eqs(struct efa_dev *dev) +{ + int i; + + for (i = 0; i < dev->neqs; i++) + efa_destroy_eq(dev, &dev->eqs[i]); + + kfree(dev->eqs); +} + +#ifdef HAVE_IB_DEV_OPS +static const struct ib_device_ops efa_dev_ops = { +#ifdef HAVE_IB_DEVICE_OPS_COMMON + .owner = THIS_MODULE, + .driver_id = RDMA_DRIVER_EFA, + .uverbs_abi_ver = EFA_UVERBS_ABI_VERSION, +#endif + +#ifdef HAVE_SPLIT_STATS_ALLOC + .alloc_hw_port_stats = efa_alloc_hw_port_stats, + .alloc_hw_device_stats = efa_alloc_hw_device_stats, +#else + .alloc_hw_stats = efa_alloc_hw_stats, +#endif +#ifdef HAVE_PD_CORE_ALLOCATION + .alloc_pd = efa_alloc_pd, +#else + .alloc_pd = efa_kzalloc_pd, +#endif +#ifdef HAVE_UCONTEXT_CORE_ALLOCATION + .alloc_ucontext = efa_alloc_ucontext, +#else + .alloc_ucontext = efa_kzalloc_ucontext, +#endif +#ifndef HAVE_UVERBS_CMD_MASK_NOT_NEEDED +#ifdef HAVE_AH_CORE_ALLOCATION + .create_ah = efa_create_ah, +#else + .create_ah = efa_kzalloc_ah, +#endif +#endif +#ifdef HAVE_CQ_CORE_ALLOCATION + .create_cq = efa_create_cq, +#else + .create_cq = efa_kzalloc_cq, +#endif +#ifdef HAVE_QP_CORE_ALLOCATION + .create_qp = efa_create_qp, +#else + .create_qp = efa_kzalloc_qp, +#endif +#ifdef HAVE_UVERBS_CMD_MASK_NOT_NEEDED + .create_user_ah = efa_create_ah, +#endif + .dealloc_pd = efa_dealloc_pd, + .dealloc_ucontext = efa_dealloc_ucontext, + .dereg_mr = efa_dereg_mr, + .destroy_ah = efa_destroy_ah, + .destroy_cq = efa_destroy_cq, + .destroy_qp = efa_destroy_qp, +#ifndef HAVE_NO_KVERBS_DRIVERS + .get_dma_mr = efa_get_dma_mr, +#endif + .get_hw_stats = efa_get_hw_stats, + .get_link_layer = efa_port_link_layer, + .get_port_immutable = efa_get_port_immutable, + .mmap = efa_mmap, +#ifdef HAVE_CORE_MMAP_XA + .mmap_free = efa_mmap_free, +#endif + .modify_qp = efa_modify_qp, +#ifndef HAVE_NO_KVERBS_DRIVERS + .poll_cq = efa_poll_cq, + .post_recv = efa_post_recv, + .post_send = efa_post_send, +#endif + .query_device 
= efa_query_device, + .query_gid = efa_query_gid, + .query_pkey = efa_query_pkey, + .query_port = efa_query_port, + .query_qp = efa_query_qp, + .reg_user_mr = efa_reg_mr, +#ifdef HAVE_MR_DMABUF + .reg_user_mr_dmabuf = efa_reg_user_mr_dmabuf, +#endif +#ifndef HAVE_NO_KVERBS_DRIVERS + .req_notify_cq = efa_req_notify_cq, +#endif + +#ifdef HAVE_AH_CORE_ALLOCATION + INIT_RDMA_OBJ_SIZE(ib_ah, efa_ah, ibah), +#endif +#ifdef HAVE_CQ_CORE_ALLOCATION + INIT_RDMA_OBJ_SIZE(ib_cq, efa_cq, ibcq), +#endif +#ifdef HAVE_PD_CORE_ALLOCATION + INIT_RDMA_OBJ_SIZE(ib_pd, efa_pd, ibpd), +#endif +#ifdef HAVE_QP_CORE_ALLOCATION + INIT_RDMA_OBJ_SIZE(ib_qp, efa_qp, ibqp), +#endif +#ifdef HAVE_UCONTEXT_CORE_ALLOCATION + INIT_RDMA_OBJ_SIZE(ib_ucontext, efa_ucontext, ibucontext), +#endif +}; +#endif + +static int efa_ib_device_add(struct efa_dev *dev) +{ + struct efa_com_get_hw_hints_result hw_hints; + struct pci_dev *pdev = dev->pdev; + int err; + + efa_stats_init(dev); + + err = efa_com_get_device_attr(&dev->edev, &dev->dev_attr); + if (err) + return err; + + dev_dbg(&dev->pdev->dev, "Doorbells bar (%d)\n", dev->dev_attr.db_bar); + err = efa_request_doorbell_bar(dev); + if (err) + return err; + + err = efa_com_get_hw_hints(&dev->edev, &hw_hints); + if (err) + goto err_release_doorbell_bar; + + efa_update_hw_hints(dev, &hw_hints); + + /* Try to enable all the available aenq groups */ + err = efa_com_set_aenq_config(&dev->edev, EFA_AENQ_ENABLED_GROUPS); + if (err) + goto err_release_doorbell_bar; + + err = efa_create_eqs(dev); + if (err) + goto err_release_doorbell_bar; + + efa_set_host_info(dev); + + dev->ibdev.node_type = RDMA_NODE_UNSPECIFIED; + dev->ibdev.phys_port_cnt = 1; + dev->ibdev.num_comp_vectors = dev->neqs ?: 1; +#ifdef HAVE_DEV_PARENT + dev->ibdev.dev.parent = &pdev->dev; +#else + dev->ibdev.dma_device = &pdev->dev; +#endif + +#ifndef HAVE_UVERBS_CMD_MASK_NOT_NEEDED + dev->ibdev.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_CREATE_AH) | + (1ull << IB_USER_VERBS_CMD_DESTROY_AH); +#endif + +#ifndef HAVE_UVERBS_CMD_MASK_NOT_NEEDED + dev->ibdev.uverbs_ex_cmd_mask = + (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE); +#endif + +#ifndef HAVE_IB_DEVICE_OPS_COMMON +#ifdef HAVE_DRIVER_ID + dev->ibdev.driver_id = RDMA_DRIVER_EFA; +#endif + dev->ibdev.uverbs_abi_ver = EFA_UVERBS_ABI_VERSION; + dev->ibdev.owner = THIS_MODULE; +#endif +#ifdef HAVE_IB_DEV_OPS + ib_set_device_ops(&dev->ibdev, &efa_dev_ops); +#else + dev->ibdev.alloc_hw_stats = efa_alloc_hw_stats; + dev->ibdev.alloc_pd = efa_kzalloc_pd; + dev->ibdev.alloc_ucontext = efa_kzalloc_ucontext; + dev->ibdev.create_ah = efa_kzalloc_ah; + dev->ibdev.create_cq = efa_kzalloc_cq; + dev->ibdev.create_qp = efa_kzalloc_qp; + dev->ibdev.dealloc_pd = efa_dealloc_pd; + dev->ibdev.dealloc_ucontext = efa_dealloc_ucontext; + dev->ibdev.dereg_mr = efa_dereg_mr; + dev->ibdev.destroy_ah = efa_destroy_ah; + dev->ibdev.destroy_cq = efa_destroy_cq; + dev->ibdev.destroy_qp = efa_destroy_qp; + 
dev->ibdev.get_dma_mr = efa_get_dma_mr; + dev->ibdev.get_hw_stats = efa_get_hw_stats; + dev->ibdev.get_link_layer = efa_port_link_layer; + dev->ibdev.get_port_immutable = efa_get_port_immutable; + dev->ibdev.mmap = efa_mmap; + dev->ibdev.modify_qp = efa_modify_qp; + dev->ibdev.poll_cq = efa_poll_cq; + dev->ibdev.post_recv = efa_post_recv; + dev->ibdev.post_send = efa_post_send; + dev->ibdev.query_device = efa_query_device; + dev->ibdev.query_gid = efa_query_gid; + dev->ibdev.query_pkey = efa_query_pkey; + dev->ibdev.query_port = efa_query_port; + dev->ibdev.query_qp = efa_query_qp; + dev->ibdev.reg_user_mr = efa_reg_mr; + dev->ibdev.req_notify_cq = efa_req_notify_cq; +#endif + +#ifdef HAVE_IB_REGISTER_DEVICE_DMA_DEVICE_PARAM + err = ib_register_device(&dev->ibdev, "efa_%d", &pdev->dev); +#elif defined(HAVE_IB_REGISTER_DEVICE_TWO_PARAMS) + err = ib_register_device(&dev->ibdev, "efa_%d"); +#elif defined(HAVE_IB_REGISTER_DEVICE_NAME_PARAM) + err = ib_register_device(&dev->ibdev, "efa_%d", NULL); +#else + strscpy(dev->ibdev.name, "efa_%d", + sizeof(dev->ibdev.name)); + + err = ib_register_device(&dev->ibdev, NULL); +#endif + if (err) + goto err_destroy_eqs; + + ibdev_info(&dev->ibdev, "IB device registered\n"); + + return 0; + +err_destroy_eqs: + efa_destroy_eqs(dev); +err_release_doorbell_bar: + efa_release_doorbell_bar(dev); + return err; +} + +static void efa_ib_device_remove(struct efa_dev *dev) +{ + ibdev_info(&dev->ibdev, "Unregister ib device\n"); + ib_unregister_device(&dev->ibdev); + efa_destroy_eqs(dev); + efa_com_dev_reset(&dev->edev, EFA_REGS_RESET_NORMAL); + efa_release_doorbell_bar(dev); +} + +static void efa_disable_msix(struct efa_dev *dev) +{ + pci_free_irq_vectors(dev->pdev); +} + +static int efa_enable_msix(struct efa_dev *dev) +{ + int msix_vecs, irq_num; + + /* + * Reserve the max msix vectors we might need, one vector is reserved + * for admin. + */ + msix_vecs = min_t(int, pci_msix_vec_count(dev->pdev), + num_online_cpus() + 1); + dev_dbg(&dev->pdev->dev, "Trying to enable MSI-X, vectors %d\n", + msix_vecs); + + dev->admin_msix_vector_idx = EFA_MGMNT_MSIX_VEC_IDX; + irq_num = pci_alloc_irq_vectors(dev->pdev, msix_vecs, + msix_vecs, PCI_IRQ_MSIX); + + if (irq_num < 0) { + dev_err(&dev->pdev->dev, "Failed to enable MSI-X. 
irq_num %d\n", + irq_num); + return -ENOSPC; + } + + if (irq_num != msix_vecs) { + efa_disable_msix(dev); + dev_err(&dev->pdev->dev, + "Allocated %d MSI-X (out of %d requested)\n", + irq_num, msix_vecs); + return -ENOSPC; + } + + return 0; +} + +static int efa_device_init(struct efa_com_dev *edev, struct pci_dev *pdev) +{ + int dma_width; + int err; + + err = efa_com_dev_reset(edev, EFA_REGS_RESET_NORMAL); + if (err) + return err; + + err = efa_com_validate_version(edev); + if (err) + return err; + + dma_width = efa_com_get_dma_width(edev); + if (dma_width < 0) { + err = dma_width; + return err; + } + + err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(dma_width)); + if (err) { + dev_err(&pdev->dev, "dma_set_mask_and_coherent failed %d\n", err); + return err; + } + + dma_set_max_seg_size(&pdev->dev, UINT_MAX); + return 0; +} + +static struct efa_dev *efa_probe_device(struct pci_dev *pdev) +{ + struct efa_com_dev *edev; + struct efa_dev *dev; + int bars; + int err; + + err = pci_enable_device_mem(pdev); + if (err) { + dev_err(&pdev->dev, "pci_enable_device_mem() failed!\n"); + return ERR_PTR(err); + } + + pci_set_master(pdev); + +#ifdef HAVE_SAFE_IB_ALLOC_DEVICE + dev = ib_alloc_device(efa_dev, ibdev); +#else + dev = (struct efa_dev *)ib_alloc_device(sizeof(*dev)); +#endif + if (!dev) { + dev_err(&pdev->dev, "Device alloc failed\n"); + err = -ENOMEM; + goto err_disable_device; + } + + pci_set_drvdata(pdev, dev); + edev = &dev->edev; + edev->efa_dev = dev; + edev->dmadev = &pdev->dev; + dev->pdev = pdev; +#ifdef HAVE_XARRAY + xa_init(&dev->cqs_xa); +#else + memset(dev->cqs_arr, 0, sizeof(dev->cqs_arr)); +#endif + + bars = pci_select_bars(pdev, IORESOURCE_MEM) & EFA_BASE_BAR_MASK; + err = pci_request_selected_regions(pdev, bars, DRV_MODULE_NAME); + if (err) { + dev_err(&pdev->dev, "pci_request_selected_regions failed %d\n", + err); + goto err_ibdev_destroy; + } + + dev->reg_bar_addr = pci_resource_start(pdev, EFA_REG_BAR); + dev->reg_bar_len = pci_resource_len(pdev, EFA_REG_BAR); + dev->mem_bar_addr = pci_resource_start(pdev, EFA_MEM_BAR); + dev->mem_bar_len = pci_resource_len(pdev, EFA_MEM_BAR); + + edev->reg_bar = devm_ioremap(&pdev->dev, + dev->reg_bar_addr, + dev->reg_bar_len); + if (!edev->reg_bar) { + dev_err(&pdev->dev, "Failed to remap register bar\n"); + err = -EFAULT; + goto err_release_bars; + } + + err = efa_com_mmio_reg_read_init(edev); + if (err) { + dev_err(&pdev->dev, "Failed to init readless MMIO\n"); + goto err_iounmap; + } + + err = efa_device_init(edev, pdev); + if (err) { + dev_err(&pdev->dev, "EFA device init failed\n"); + if (err == -ETIME) + err = -EPROBE_DEFER; + goto err_reg_read_destroy; + } + + err = efa_enable_msix(dev); + if (err) + goto err_reg_read_destroy; + + edev->aq.msix_vector_idx = dev->admin_msix_vector_idx; + edev->aenq.msix_vector_idx = dev->admin_msix_vector_idx; + + err = efa_set_mgmnt_irq(dev); + if (err) + goto err_disable_msix; + + err = efa_com_admin_init(edev, &aenq_handlers); + if (err) + goto err_free_mgmnt_irq; + + err = efa_sysfs_init(dev); + if (err) + goto err_admin_destroy; + + return dev; + +err_admin_destroy: + efa_com_admin_destroy(edev); +err_free_mgmnt_irq: + efa_free_irq(dev, &dev->admin_irq); +err_disable_msix: + efa_disable_msix(dev); +err_reg_read_destroy: + efa_com_mmio_reg_read_destroy(edev); +err_iounmap: + devm_iounmap(&pdev->dev, edev->reg_bar); +err_release_bars: + efa_release_bars(dev, EFA_BASE_BAR_MASK); +err_ibdev_destroy: + ib_dealloc_device(&dev->ibdev); +err_disable_device: + pci_disable_device(pdev); + return 
ERR_PTR(err); +} + +static void efa_remove_device(struct pci_dev *pdev) +{ + struct efa_dev *dev = pci_get_drvdata(pdev); + struct efa_com_dev *edev; + + edev = &dev->edev; + efa_sysfs_destroy(dev); + efa_com_admin_destroy(edev); + efa_free_irq(dev, &dev->admin_irq); + efa_disable_msix(dev); + efa_com_mmio_reg_read_destroy(edev); + devm_iounmap(&pdev->dev, edev->reg_bar); + efa_release_bars(dev, EFA_BASE_BAR_MASK); +#ifdef HAVE_XARRAY + xa_destroy(&dev->cqs_xa); +#endif + ib_dealloc_device(&dev->ibdev); + pci_disable_device(pdev); +} + +static int efa_probe(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + struct efa_dev *dev; + int err; + + dev = efa_probe_device(pdev); + if (IS_ERR(dev)) + return PTR_ERR(dev); + + err = efa_ib_device_add(dev); + if (err) + goto err_remove_device; + + return 0; + +err_remove_device: + efa_remove_device(pdev); + return err; +} + +static void efa_remove(struct pci_dev *pdev) +{ + struct efa_dev *dev = pci_get_drvdata(pdev); + + efa_ib_device_remove(dev); + efa_remove_device(pdev); +} + +static struct pci_driver efa_pci_driver = { + .name = DRV_MODULE_NAME, + .id_table = efa_pci_tbl, + .probe = efa_probe, + .remove = efa_remove, +}; + +static int __init efa_init(void) +{ + int err; + + pr_info("%s\n", version); + + err = pci_register_driver(&efa_pci_driver); + if (err) { + pr_err("Couldn't register efa driver\n"); + return err; + } + +#ifdef HAVE_EFA_P2P + efa_p2p_init(); +#endif + + return 0; +} + +static void __exit efa_exit(void) +{ + pci_unregister_driver(&efa_pci_driver); +} + +module_init(efa_init); +module_exit(efa_exit); diff --git a/drivers/amazon/net/efa/efa_neuron.c b/drivers/amazon/net/efa/efa_neuron.c new file mode 100644 index 0000000000000..ec2644e3079c4 --- /dev/null +++ b/drivers/amazon/net/efa/efa_neuron.c @@ -0,0 +1,176 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */
+
+#include <linux/module.h>
+
+#include "efa_p2p.h"
+#include "neuron_p2p.h"
+
+#define NEURON_PAGE_SHIFT 12
+#define NEURON_PAGE_SIZE BIT_ULL(NEURON_PAGE_SHIFT)
+
+struct efa_neuronmem_ops {
+ int (*register_va)(u64 virtual_address, u64 length,
+ struct neuron_p2p_va_info **vainfo,
+ void (*free_callback)(void *data),
+ void *data);
+ int (*unregister_va)(struct neuron_p2p_va_info *vainfo);
+};
+
+struct efa_neuronmem {
+ struct efa_p2pmem p2pmem;
+ struct efa_neuronmem_ops ops;
+ struct neuron_p2p_va_info *va_info;
+ u64 virt_start;
+};
+
+static unsigned int neuronmem_pgsz(struct efa_dev *dev,
+ struct efa_p2pmem *p2pmem)
+{
+ struct efa_neuronmem *neuronmem;
+
+ neuronmem = container_of(p2pmem, struct efa_neuronmem, p2pmem);
+ return BIT(neuronmem->va_info->shift_page_size);
+}
+
+static int neuronmem_get_fp(struct efa_neuronmem *neuronmem)
+{
+ neuronmem->ops.register_va = symbol_get(neuron_p2p_register_va);
+ if (!neuronmem->ops.register_va)
+ goto err_out;
+
+ neuronmem->ops.unregister_va = symbol_get(neuron_p2p_unregister_va);
+ if (!neuronmem->ops.unregister_va)
+ goto err_put_register_va;
+
+ return 0;
+
+err_put_register_va:
+ symbol_put(neuron_p2p_register_va);
+err_out:
+ return -EINVAL;
+}
+
+static void neuronmem_put_fp(void)
+{
+ symbol_put(neuron_p2p_unregister_va);
+ symbol_put(neuron_p2p_register_va);
+}
+
+static void neuronmem_free_cb(void *data)
+{
+ pr_debug("Free callback ticket %llu\n", (u64)data);
+ efa_p2p_put((u64)data, true);
+}
+
+static int neuronmem_register_va(struct efa_dev *dev, struct efa_neuronmem *neuronmem,
+ u64 addr, u64 size, u64 ticket)
+{
+ int err;
+
+ err = neuronmem->ops.register_va(addr, size, &neuronmem->va_info,
+ neuronmem_free_cb, (void *)ticket);
+ if (err) {
+ ibdev_dbg(&dev->ibdev, "neuron_p2p_register_va failed %d\n", err);
+ return err;
+ }
+
+ return 0;
+}
+
+static struct efa_p2pmem *neuronmem_get(struct efa_dev *dev, u64 ticket, u64 start,
+ u64 length)
+{
+ struct efa_neuronmem *neuronmem;
+ u64 virt_start;
+ u64 virt_end;
+ u64 pinsz;
+ int err;
+
+ neuronmem = kzalloc(sizeof(*neuronmem), GFP_KERNEL);
+ if (!neuronmem)
+ return NULL;
+
+ virt_start = ALIGN_DOWN(start, NEURON_PAGE_SIZE);
+ virt_end = ALIGN(start + length, NEURON_PAGE_SIZE);
+ pinsz = virt_end - virt_start;
+ neuronmem->virt_start = virt_start;
+
+ err = neuronmem_get_fp(neuronmem);
+ if (err)
+ /* Neuron module is not loaded */
+ goto err_free;
+
+ err = neuronmem_register_va(dev, neuronmem, virt_start, pinsz, ticket);
+ if (err)
+ /* Most likely not our pages */
+ goto err_put_fp;
+
+ return &neuronmem->p2pmem;
+
+err_put_fp:
+ neuronmem_put_fp();
+err_free:
+ kfree(neuronmem);
+ return NULL;
+}
+
+static int neuronmem_to_page_list(struct efa_dev *dev, struct efa_p2pmem *p2pmem,
+ u64 *page_list)
+{
+ struct neuron_p2p_page_info *pg_info;
+ struct neuron_p2p_va_info *va_info;
+ struct efa_neuronmem *neuronmem;
+ int ent_idx, pa_idx;
+ int pg_idx = 0;
+ u64 pa;
+
+ neuronmem = container_of(p2pmem, struct efa_neuronmem, p2pmem);
+ va_info = neuronmem->va_info;
+
+ for (ent_idx = 0; ent_idx < va_info->entries; ent_idx++) {
+ pg_info = va_info->page_info + ent_idx;
+ pa = pg_info->physical_address;
+ for (pa_idx = 0; pa_idx < pg_info->page_count; pa_idx++) {
+ page_list[pg_idx++] = pa;
+ pa += BIT(va_info->shift_page_size);
+ }
+ }
+
+ return 0;
+}
+
+static void neuronmem_release(struct efa_dev *dev, struct efa_p2pmem *p2pmem,
+ bool in_cb)
+{
+ struct efa_neuronmem *neuronmem;
+
+ neuronmem = container_of(p2pmem, struct efa_neuronmem, p2pmem);
+
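+ /*
+ * Unregister the VA mapping first, then drop the module symbol
+ * references taken in neuronmem_get_fp().
+ */
+ 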
neuronmem->ops.unregister_va(neuronmem->va_info); + neuronmem_put_fp(); + kfree(neuronmem); +} + +struct neuronmem_provider { + struct efa_p2p_provider p2p; +}; + +static const struct neuronmem_provider prov = { + .p2p = { + .ops = { + .try_get = neuronmem_get, + .to_page_list = neuronmem_to_page_list, + .release = neuronmem_release, + .get_page_size = neuronmem_pgsz, + }, + .type = EFA_P2P_PROVIDER_NEURON, + }, +}; + +const struct efa_p2p_provider *neuronmem_get_provider(void) +{ + return &prov.p2p; +} diff --git a/drivers/amazon/net/efa/efa_p2p.c b/drivers/amazon/net/efa/efa_p2p.c new file mode 100644 index 0000000000000..9daf101288f43 --- /dev/null +++ b/drivers/amazon/net/efa/efa_p2p.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2019-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#include "efa_p2p.h" + +static struct mutex p2p_list_lock; +static struct list_head p2p_list; +static atomic64_t next_p2p_ticket; + +static const struct efa_p2p_provider *prov_arr[EFA_P2P_PROVIDER_MAX]; + +/* Register all providers here */ +static void p2p_providers_init(void) +{ + prov_arr[EFA_P2P_PROVIDER_NVMEM] = nvmem_get_provider(); + prov_arr[EFA_P2P_PROVIDER_NEURON] = neuronmem_get_provider(); +} + +void efa_p2p_init(void) +{ + mutex_init(&p2p_list_lock); + INIT_LIST_HEAD(&p2p_list); + /* + * Ideally, first ticket would be zero, but that would make callback + * data NULL which is invalid. + */ + atomic64_set(&next_p2p_ticket, 1); + + p2p_providers_init(); +} + +static struct efa_p2pmem *ticket_to_p2p(u64 ticket) +{ + struct efa_p2pmem *p2pmem; + + lockdep_assert_held(&p2p_list_lock); + list_for_each_entry(p2pmem, &p2p_list, list) { + if (p2pmem->ticket == ticket) + return p2pmem; + } + + return NULL; +} + +int efa_p2p_put(u64 ticket, bool in_cb) +{ + struct efa_com_dereg_mr_params params = {}; + struct efa_p2pmem *p2pmem; + struct efa_dev *dev; + int err; + + mutex_lock(&p2p_list_lock); + p2pmem = ticket_to_p2p(ticket); + if (!p2pmem) { + pr_debug("Ticket %llu not found in the p2pmem list\n", ticket); + mutex_unlock(&p2p_list_lock); + return 0; + } + + dev = p2pmem->dev; + if (p2pmem->needs_dereg) { + params.l_key = p2pmem->lkey; + err = efa_com_dereg_mr(&dev->edev, ¶ms); + if (err) { + mutex_unlock(&p2p_list_lock); + return err; + } + p2pmem->needs_dereg = false; + } + + list_del(&p2pmem->list); + mutex_unlock(&p2p_list_lock); + p2pmem->prov->ops.release(dev, p2pmem, in_cb); + + return 0; +} + +struct efa_p2pmem *efa_p2p_get(struct efa_dev *dev, struct efa_mr *mr, u64 start, + u64 length) +{ + const struct efa_p2p_provider *prov; + struct efa_p2pmem *p2pmem; + u64 ticket; + int i; + + ticket = atomic64_fetch_inc(&next_p2p_ticket); + for (i = 0; i < EFA_P2P_PROVIDER_MAX; i++) { + prov = prov_arr[i]; + p2pmem = prov->ops.try_get(dev, ticket, start, length); + if (p2pmem) + break; + } + if (!p2pmem) + /* No provider was found, most likely cpu pages */ + return NULL; + + p2pmem->dev = dev; + p2pmem->ticket = ticket; + p2pmem->prov = prov; + mr->p2p_ticket = p2pmem->ticket; + + mutex_lock(&p2p_list_lock); + list_add(&p2pmem->list, &p2p_list); + mutex_unlock(&p2p_list_lock); + + return p2pmem; +} + +int efa_p2p_to_page_list(struct efa_dev *dev, struct efa_p2pmem *p2pmem, + u64 *page_list) +{ + return p2pmem->prov->ops.to_page_list(dev, p2pmem, page_list); +} + +unsigned int efa_p2p_get_page_size(struct efa_dev *dev, + struct efa_p2pmem *p2pmem) +{ + return p2pmem->prov->ops.get_page_size(dev, p2pmem); +} diff --git 
a/drivers/amazon/net/efa/efa_p2p.h b/drivers/amazon/net/efa/efa_p2p.h
new file mode 100644
index 0000000000000..89ee7a9935c11
--- /dev/null
+++ b/drivers/amazon/net/efa/efa_p2p.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
+/*
+ * Copyright 2019-2021 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#ifndef _EFA_P2P_H_
+#define _EFA_P2P_H_
+
+#include "efa.h"
+
+struct efa_p2p_ops {
+ struct efa_p2pmem *(*try_get)(struct efa_dev *dev, u64 ticket, u64 start,
+ u64 length);
+ int (*to_page_list)(struct efa_dev *dev, struct efa_p2pmem *p2pmem,
+ u64 *page_list);
+ void (*release)(struct efa_dev *dev, struct efa_p2pmem *p2pmem,
+ bool in_cb);
+ unsigned int (*get_page_size)(struct efa_dev *dev,
+ struct efa_p2pmem *p2pmem);
+};
+
+enum efa_p2p_prov {
+ EFA_P2P_PROVIDER_NVMEM,
+ EFA_P2P_PROVIDER_NEURON,
+ EFA_P2P_PROVIDER_MAX,
+};
+
+struct efa_p2p_provider {
+ const struct efa_p2p_ops ops;
+ enum efa_p2p_prov type;
+};
+
+struct efa_p2pmem {
+ struct efa_dev *dev;
+ const struct efa_p2p_provider *prov;
+ u64 ticket;
+ u32 lkey;
+ bool needs_dereg;
+ struct list_head list; /* member of efa_p2p_list */
+};
+
+void efa_p2p_init(void);
+struct efa_p2pmem *efa_p2p_get(struct efa_dev *dev, struct efa_mr *mr, u64 start,
+ u64 length);
+unsigned int efa_p2p_get_page_size(struct efa_dev *dev,
+ struct efa_p2pmem *p2pmem);
+int efa_p2p_to_page_list(struct efa_dev *dev, struct efa_p2pmem *p2pmem,
+ u64 *page_list);
+int efa_p2p_put(u64 ticket, bool in_cb);
+
+/* Provider specific declarations go here */
+const struct efa_p2p_provider *nvmem_get_provider(void);
+bool nvmem_is_supported(void);
+
+const struct efa_p2p_provider *neuronmem_get_provider(void);
+
+#endif /* _EFA_P2P_H_ */
diff --git a/drivers/amazon/net/efa/efa_regs_defs.h b/drivers/amazon/net/efa/efa_regs_defs.h
new file mode 100644
index 0000000000000..714ae62588004
--- /dev/null
+++ b/drivers/amazon/net/efa/efa_regs_defs.h
@@ -0,0 +1,101 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
+/*
+ * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#ifndef _EFA_REGS_H_ +#define _EFA_REGS_H_ + +enum efa_regs_reset_reason_types { + EFA_REGS_RESET_NORMAL = 0, + /* Keep alive timeout */ + EFA_REGS_RESET_KEEP_ALIVE_TO = 1, + EFA_REGS_RESET_ADMIN_TO = 2, + EFA_REGS_RESET_INIT_ERR = 3, + EFA_REGS_RESET_DRIVER_INVALID_STATE = 4, + EFA_REGS_RESET_OS_TRIGGER = 5, + EFA_REGS_RESET_SHUTDOWN = 6, + EFA_REGS_RESET_USER_TRIGGER = 7, + EFA_REGS_RESET_GENERIC = 8, +}; + +/* efa_registers offsets */ + +/* 0 base */ +#define EFA_REGS_VERSION_OFF 0x0 +#define EFA_REGS_CONTROLLER_VERSION_OFF 0x4 +#define EFA_REGS_CAPS_OFF 0x8 +#define EFA_REGS_AQ_BASE_LO_OFF 0x10 +#define EFA_REGS_AQ_BASE_HI_OFF 0x14 +#define EFA_REGS_AQ_CAPS_OFF 0x18 +#define EFA_REGS_ACQ_BASE_LO_OFF 0x20 +#define EFA_REGS_ACQ_BASE_HI_OFF 0x24 +#define EFA_REGS_ACQ_CAPS_OFF 0x28 +#define EFA_REGS_AQ_PROD_DB_OFF 0x2c +#define EFA_REGS_AENQ_CAPS_OFF 0x34 +#define EFA_REGS_AENQ_BASE_LO_OFF 0x38 +#define EFA_REGS_AENQ_BASE_HI_OFF 0x3c +#define EFA_REGS_AENQ_CONS_DB_OFF 0x40 +#define EFA_REGS_INTR_MASK_OFF 0x4c +#define EFA_REGS_DEV_CTL_OFF 0x54 +#define EFA_REGS_DEV_STS_OFF 0x58 +#define EFA_REGS_MMIO_REG_READ_OFF 0x5c +#define EFA_REGS_MMIO_RESP_LO_OFF 0x60 +#define EFA_REGS_MMIO_RESP_HI_OFF 0x64 +#define EFA_REGS_EQ_DB_OFF 0x68 + +/* version register */ +#define EFA_REGS_VERSION_MINOR_VERSION_MASK 0xff +#define EFA_REGS_VERSION_MAJOR_VERSION_MASK 0xff00 + +/* controller_version register */ +#define EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION_MASK 0xff +#define EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION_MASK 0xff00 +#define EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK 0xff0000 +#define EFA_REGS_CONTROLLER_VERSION_IMPL_ID_MASK 0xff000000 + +/* caps register */ +#define EFA_REGS_CAPS_CONTIGUOUS_QUEUE_REQUIRED_MASK 0x1 +#define EFA_REGS_CAPS_RESET_TIMEOUT_MASK 0x3e +#define EFA_REGS_CAPS_DMA_ADDR_WIDTH_MASK 0xff00 +#define EFA_REGS_CAPS_ADMIN_CMD_TO_MASK 0xf0000 + +/* aq_caps register */ +#define EFA_REGS_AQ_CAPS_AQ_DEPTH_MASK 0xffff +#define EFA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_MASK 0xffff0000 + +/* acq_caps register */ +#define EFA_REGS_ACQ_CAPS_ACQ_DEPTH_MASK 0xffff +#define EFA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_MASK 0xff0000 +#define EFA_REGS_ACQ_CAPS_ACQ_MSIX_VECTOR_MASK 0xff000000 + +/* aenq_caps register */ +#define EFA_REGS_AENQ_CAPS_AENQ_DEPTH_MASK 0xffff +#define EFA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_MASK 0xff0000 +#define EFA_REGS_AENQ_CAPS_AENQ_MSIX_VECTOR_MASK 0xff000000 + +/* intr_mask register */ +#define EFA_REGS_INTR_MASK_EN_MASK 0x1 + +/* dev_ctl register */ +#define EFA_REGS_DEV_CTL_DEV_RESET_MASK 0x1 +#define EFA_REGS_DEV_CTL_AQ_RESTART_MASK 0x2 +#define EFA_REGS_DEV_CTL_RESET_REASON_MASK 0xf0000000 + +/* dev_sts register */ +#define EFA_REGS_DEV_STS_READY_MASK 0x1 +#define EFA_REGS_DEV_STS_AQ_RESTART_IN_PROGRESS_MASK 0x2 +#define EFA_REGS_DEV_STS_AQ_RESTART_FINISHED_MASK 0x4 +#define EFA_REGS_DEV_STS_RESET_IN_PROGRESS_MASK 0x8 +#define EFA_REGS_DEV_STS_RESET_FINISHED_MASK 0x10 +#define EFA_REGS_DEV_STS_FATAL_ERROR_MASK 0x20 + +/* mmio_reg_read register */ +#define EFA_REGS_MMIO_REG_READ_REQ_ID_MASK 0xffff +#define EFA_REGS_MMIO_REG_READ_REG_OFF_MASK 0xffff0000 + +/* eq_db register */ +#define EFA_REGS_EQ_DB_EQN_MASK 0xffff +#define EFA_REGS_EQ_DB_ARM_MASK 0x80000000 + +#endif /* _EFA_REGS_H_ */ diff --git a/drivers/amazon/net/efa/efa_sysfs.c b/drivers/amazon/net/efa/efa_sysfs.c new file mode 100644 index 0000000000000..8e8b2bd210db1 --- /dev/null +++ b/drivers/amazon/net/efa/efa_sysfs.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause +/* + * Copyright 
2018-2022 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#include "efa_sysfs.h" +#include "kcompat.h" + +#include +#include + +#ifndef HAVE_SYSFS_EMIT +#include + +static int sysfs_emit(char *buf, const char *fmt, ...) +{ + va_list args; + int len; + + if (!buf) + return 0; + + va_start(args, fmt); + len = vscnprintf(buf, PAGE_SIZE, fmt, args); + va_end(args); + + return len; +} +#endif + +#ifdef HAVE_EFA_P2P +#include "efa_p2p.h" + +static ssize_t gdr_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + if (nvmem_is_supported()) + return sysfs_emit(buf, "1\n"); + + return sysfs_emit(buf, "0\n"); +} + +static DEVICE_ATTR_RO(gdr); +#endif + +int efa_sysfs_init(struct efa_dev *dev) +{ +#ifdef HAVE_EFA_P2P + struct device *device = &dev->pdev->dev; + + if (device_create_file(device, &dev_attr_gdr)) + dev_err(device, "Failed to create GDR sysfs file\n"); +#endif + return 0; +} + +void efa_sysfs_destroy(struct efa_dev *dev) +{ +#ifdef HAVE_EFA_P2P + device_remove_file(&dev->pdev->dev, &dev_attr_gdr); +#endif +} diff --git a/drivers/amazon/net/efa/efa_sysfs.h b/drivers/amazon/net/efa/efa_sysfs.h new file mode 100644 index 0000000000000..c390aa547e5a6 --- /dev/null +++ b/drivers/amazon/net/efa/efa_sysfs.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef _EFA_SYSFS_H_ +#define _EFA_SYSFS_H_ + +#include "efa.h" + +int efa_sysfs_init(struct efa_dev *dev); + +void efa_sysfs_destroy(struct efa_dev *dev); + +#endif /* _EFA_SYSFS_H_ */ diff --git a/drivers/amazon/net/efa/efa_verbs.c b/drivers/amazon/net/efa/efa_verbs.c new file mode 100644 index 0000000000000..c9535ee90108b --- /dev/null +++ b/drivers/amazon/net/efa/efa_verbs.c @@ -0,0 +1,3022 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2018-2022 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */
+
+#include "kcompat.h"
+#ifdef HAVE_MR_DMABUF
+#include <linux/dma-buf.h>
+#include <linux/dma-resv.h>
+#endif
+#include <linux/vmalloc.h>
+#include <linux/log2.h>
+
+#include <rdma/ib_addr.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_verbs.h>
+#ifdef HAVE_UDATA_TO_DRV_CONTEXT
+#include <rdma/uverbs_ioctl.h>
+#endif
+
+#include "efa.h"
+#include "efa_io_defs.h"
+
+#ifdef HAVE_EFA_P2P
+#include "efa_p2p.h"
+#endif
+
+enum {
+ EFA_MMAP_DMA_PAGE = 0,
+ EFA_MMAP_IO_WC,
+ EFA_MMAP_IO_NC,
+};
+
+#define EFA_AENQ_ENABLED_GROUPS \
+ (BIT(EFA_ADMIN_FATAL_ERROR) | BIT(EFA_ADMIN_WARNING) | \
+ BIT(EFA_ADMIN_NOTIFICATION) | BIT(EFA_ADMIN_KEEP_ALIVE))
+
+struct efa_user_mmap_entry {
+ struct rdma_user_mmap_entry rdma_entry;
+#ifndef HAVE_CORE_MMAP_XA
+ struct list_head list;
+#endif
+ u64 address;
+ u8 mmap_flag;
+};
+
+#define EFA_DEFINE_DEVICE_STATS(op) \
+ op(EFA_SUBMITTED_CMDS, "submitted_cmds") \
+ op(EFA_COMPLETED_CMDS, "completed_cmds") \
+ op(EFA_CMDS_ERR, "cmds_err") \
+ op(EFA_NO_COMPLETION_CMDS, "no_completion_cmds") \
+ op(EFA_KEEP_ALIVE_RCVD, "keep_alive_rcvd") \
+ op(EFA_ALLOC_PD_ERR, "alloc_pd_err") \
+ op(EFA_CREATE_QP_ERR, "create_qp_err") \
+ op(EFA_CREATE_CQ_ERR, "create_cq_err") \
+ op(EFA_REG_MR_ERR, "reg_mr_err") \
+ op(EFA_ALLOC_UCONTEXT_ERR, "alloc_ucontext_err") \
+ op(EFA_CREATE_AH_ERR, "create_ah_err") \
+ op(EFA_MMAP_ERR, "mmap_err")
+
+#define EFA_DEFINE_PORT_STATS(op) \
+ op(EFA_TX_BYTES, "tx_bytes") \
+ op(EFA_TX_PKTS, "tx_pkts") \
+ op(EFA_RX_BYTES, "rx_bytes") \
+ op(EFA_RX_PKTS, "rx_pkts") \
+ op(EFA_RX_DROPS, "rx_drops") \
+ op(EFA_SEND_BYTES, "send_bytes") \
+ op(EFA_SEND_WRS, "send_wrs") \
+ op(EFA_RECV_BYTES, "recv_bytes") \
+ op(EFA_RECV_WRS, "recv_wrs") \
+ op(EFA_RDMA_READ_WRS, "rdma_read_wrs") \
+ op(EFA_RDMA_READ_BYTES, "rdma_read_bytes") \
+ op(EFA_RDMA_READ_WR_ERR, "rdma_read_wr_err") \
+ op(EFA_RDMA_READ_RESP_BYTES, "rdma_read_resp_bytes") \
+
+#define EFA_STATS_ENUM(ename, name) ename,
+#ifdef HAVE_STAT_DESC_STRUCT
+#define EFA_STATS_STR(ename, nam) \
+ [ename].name = nam,
+#else
+#define EFA_STATS_STR(ename, nam) \
+ [ename] = nam,
+#endif
+
+enum efa_hw_device_stats {
+ EFA_DEFINE_DEVICE_STATS(EFA_STATS_ENUM)
+};
+
+#ifdef HAVE_STAT_DESC_STRUCT
+static const struct rdma_stat_desc efa_device_stats_descs[] = {
+#else
+static const char *const efa_device_stats_descs[] = {
+#endif
+ EFA_DEFINE_DEVICE_STATS(EFA_STATS_STR)
+};
+
+enum efa_hw_port_stats {
+ EFA_DEFINE_PORT_STATS(EFA_STATS_ENUM)
+};
+
+#ifdef HAVE_STAT_DESC_STRUCT
+static const struct rdma_stat_desc efa_port_stats_descs[] = {
+#else
+static const char *const efa_port_stats_descs[] = {
+#endif
+ EFA_DEFINE_PORT_STATS(EFA_STATS_STR)
+};
+
+#define EFA_CHUNK_PAYLOAD_SHIFT 12
+#define EFA_CHUNK_PAYLOAD_SIZE BIT(EFA_CHUNK_PAYLOAD_SHIFT)
+#define EFA_CHUNK_PAYLOAD_PTR_SIZE 8
+
+#define EFA_CHUNK_SHIFT 12
+#define EFA_CHUNK_SIZE BIT(EFA_CHUNK_SHIFT)
+#define EFA_CHUNK_PTR_SIZE sizeof(struct efa_com_ctrl_buff_info)
+
+#define EFA_PTRS_PER_CHUNK \
+ ((EFA_CHUNK_SIZE - EFA_CHUNK_PTR_SIZE) / EFA_CHUNK_PAYLOAD_PTR_SIZE)
+
+#define EFA_CHUNK_USED_SIZE \
+ ((EFA_PTRS_PER_CHUNK * EFA_CHUNK_PAYLOAD_PTR_SIZE) + EFA_CHUNK_PTR_SIZE)
+
+struct pbl_chunk {
+ dma_addr_t dma_addr;
+ u64 *buf;
+ u32 length;
+};
+
+struct pbl_chunk_list {
+ struct pbl_chunk *chunks;
+ unsigned int size;
+};
+
+struct pbl_context {
+ union {
+ struct {
+ dma_addr_t dma_addr;
+ } continuous;
+ struct {
+ u32 pbl_buf_size_in_pages;
+ struct scatterlist *sgl;
+ int sg_dma_cnt;
+ struct pbl_chunk_list chunk_list;
+ } indirect;
+ } phys;
+ u64 *pbl_buf;
+ u32 pbl_buf_size_in_bytes;
+ u8 physically_continuous;
+};
+
+static inline struct efa_dev *to_edev(struct 
ib_device *ibdev) +{ + return container_of(ibdev, struct efa_dev, ibdev); +} + +static inline struct efa_ucontext *to_eucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct efa_ucontext, ibucontext); +} + +static inline struct efa_pd *to_epd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct efa_pd, ibpd); +} + +static inline struct efa_mr *to_emr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct efa_mr, ibmr); +} + +static inline struct efa_qp *to_eqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct efa_qp, ibqp); +} + +static inline struct efa_cq *to_ecq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct efa_cq, ibcq); +} + +static inline struct efa_ah *to_eah(struct ib_ah *ibah) +{ + return container_of(ibah, struct efa_ah, ibah); +} + +static inline struct efa_user_mmap_entry * +to_emmap(struct rdma_user_mmap_entry *rdma_entry) +{ + return container_of(rdma_entry, struct efa_user_mmap_entry, rdma_entry); +} + +#define EFA_DEV_CAP(dev, cap) \ + ((dev)->dev_attr.device_caps & \ + EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_##cap##_MASK) + +#define is_reserved_cleared(reserved) \ + !memchr_inv(reserved, 0, sizeof(reserved)) + +static void *efa_zalloc_mapped(struct efa_dev *dev, dma_addr_t *dma_addr, + size_t size, enum dma_data_direction dir) +{ + void *addr; + + addr = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO); + if (!addr) + return NULL; + + *dma_addr = dma_map_single(&dev->pdev->dev, addr, size, dir); + if (dma_mapping_error(&dev->pdev->dev, *dma_addr)) { + ibdev_err(&dev->ibdev, "Failed to map DMA address\n"); + free_pages_exact(addr, size); + return NULL; + } + + return addr; +} + +static void efa_free_mapped(struct efa_dev *dev, void *cpu_addr, + dma_addr_t dma_addr, + size_t size, enum dma_data_direction dir) +{ + dma_unmap_single(&dev->pdev->dev, dma_addr, size, dir); + free_pages_exact(cpu_addr, size); +} + +#ifndef HAVE_CORE_MMAP_XA +/* + * This is only called when the ucontext is destroyed and there can be no + * concurrent query via mmap or allocate on the database, thus we can be sure no + * other thread is using the entry pointer. We also know that all the BAR + * pages have either been zap'd or munmaped at this point. Normal pages are + * refcounted and will be freed at the proper time. 
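+ * On kernels that provide HAVE_CORE_MMAP_XA, the rdma core tracks and
+ * frees these entries itself, so this fallback is compiled out.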
+ */ +static void mmap_entries_remove_free(struct efa_dev *dev, + struct efa_ucontext *ucontext) +{ + struct efa_user_mmap_entry *entry, *tmp; + + list_for_each_entry_safe(entry, tmp, &ucontext->pending_mmaps, list) { + list_del(&entry->list); + ibdev_dbg( + &dev->ibdev, + "mmap: key[%#llx] addr[%#llx] len[%#zx] removed\n", + rdma_user_mmap_get_offset(&entry->rdma_entry), + entry->address, entry->rdma_entry.npages * PAGE_SIZE); + kfree(entry); + } +} + +static int mmap_entry_validate(struct efa_ucontext *ucontext, + struct vm_area_struct *vma) +{ + size_t length = vma->vm_end - vma->vm_start; + + if (length % PAGE_SIZE != 0 || !(vma->vm_flags & VM_SHARED)) { + ibdev_dbg(ucontext->ibucontext.device, + "length[%#zx] is not page size aligned[%#lx] or VM_SHARED is not set [%#lx]\n", + length, PAGE_SIZE, vma->vm_flags); + return -EINVAL; + } + + return 0; +} + +struct rdma_user_mmap_entry * +rdma_user_mmap_entry_get(struct ib_ucontext *ibucontext, + struct vm_area_struct *vma) +{ + struct efa_ucontext *ucontext = to_eucontext(ibucontext); + size_t length = vma->vm_end - vma->vm_start; + struct efa_user_mmap_entry *entry, *tmp; + u64 key = vma->vm_pgoff << PAGE_SHIFT; + int err; + + err = mmap_entry_validate(ucontext, vma); + if (err) + return NULL; + + mutex_lock(&ucontext->lock); + list_for_each_entry_safe(entry, tmp, &ucontext->pending_mmaps, list) { + if (rdma_user_mmap_get_offset(&entry->rdma_entry) == key && + entry->rdma_entry.npages * PAGE_SIZE == length) { + ibdev_dbg(ibucontext->device, + "mmap: key[%#llx] addr[%#llx] len[%#zx] removed\n", + key, entry->address, + entry->rdma_entry.npages * PAGE_SIZE); + mutex_unlock(&ucontext->lock); + return &entry->rdma_entry; + } + } + mutex_unlock(&ucontext->lock); + + return NULL; +} +#endif /* !defined (HAVE_CORE_MMAP_XA) */ + +int efa_query_device(struct ib_device *ibdev, + struct ib_device_attr *props, + struct ib_udata *udata) +{ + struct efa_com_get_device_attr_result *dev_attr; + struct efa_ibv_ex_query_device_resp resp = {}; + struct efa_dev *dev = to_edev(ibdev); + int err; + + if (udata && udata->inlen && + !ib_is_udata_cleared(udata, 0, udata->inlen)) { + ibdev_dbg(ibdev, + "Incompatible ABI params, udata not cleared\n"); + return -EINVAL; + } + + dev_attr = &dev->dev_attr; + + memset(props, 0, sizeof(*props)); + props->max_mr_size = dev_attr->max_mr_pages * PAGE_SIZE; + props->page_size_cap = dev_attr->page_size_cap; + props->vendor_id = dev->pdev->vendor; + props->vendor_part_id = dev->pdev->device; + props->hw_ver = dev->pdev->subsystem_device; + props->max_qp = dev_attr->max_qp; + props->max_cq = dev_attr->max_cq; + props->max_pd = dev_attr->max_pd; + props->max_mr = dev_attr->max_mr; + props->max_ah = dev_attr->max_ah; + props->max_cqe = dev_attr->max_cq_depth; + props->max_qp_wr = min_t(u32, dev_attr->max_sq_depth, + dev_attr->max_rq_depth); +#ifdef HAVE_MAX_SEND_RCV_SGE + props->max_send_sge = dev_attr->max_sq_sge; + props->max_recv_sge = dev_attr->max_rq_sge; +#else + props->max_sge = min_t(u16, dev_attr->max_sq_sge, + dev_attr->max_rq_sge); +#endif + props->max_sge_rd = dev_attr->max_wr_rdma_sge; + props->max_pkeys = 1; + + if (udata && udata->outlen) { + resp.max_sq_sge = dev_attr->max_sq_sge; + resp.max_rq_sge = dev_attr->max_rq_sge; + resp.max_sq_wr = dev_attr->max_sq_depth; + resp.max_rq_wr = dev_attr->max_rq_depth; + resp.max_rdma_size = dev_attr->max_rdma_size; + + resp.device_caps |= EFA_QUERY_DEVICE_CAPS_CQ_WITH_SGID; + if (EFA_DEV_CAP(dev, RDMA_READ)) + resp.device_caps |= EFA_QUERY_DEVICE_CAPS_RDMA_READ; + + if 
(EFA_DEV_CAP(dev, RNR_RETRY)) + resp.device_caps |= EFA_QUERY_DEVICE_CAPS_RNR_RETRY; + + if (dev->neqs) + resp.device_caps |= EFA_QUERY_DEVICE_CAPS_CQ_NOTIFICATIONS; + + err = ib_copy_to_udata(udata, &resp, + min(sizeof(resp), udata->outlen)); + if (err) { + ibdev_dbg(ibdev, + "Failed to copy udata for query_device\n"); + return err; + } + } + + return 0; +} + +int efa_query_port(struct ib_device *ibdev, port_t port, + struct ib_port_attr *props) +{ + struct efa_dev *dev = to_edev(ibdev); + + props->lmc = 1; + + props->state = IB_PORT_ACTIVE; + props->phys_state = IB_PORT_PHYS_STATE_LINK_UP; + props->gid_tbl_len = 1; + props->pkey_tbl_len = 1; + props->active_speed = IB_SPEED_EDR; + props->active_width = IB_WIDTH_4X; + props->max_mtu = ib_mtu_int_to_enum(dev->dev_attr.mtu); + props->active_mtu = ib_mtu_int_to_enum(dev->dev_attr.mtu); + props->max_msg_sz = dev->dev_attr.mtu; + props->max_vl_num = 1; + + return 0; +} + +int efa_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct efa_dev *dev = to_edev(ibqp->device); + struct efa_com_query_qp_params params = {}; + struct efa_com_query_qp_result result; + struct efa_qp *qp = to_eqp(ibqp); + int err; + +#define EFA_QUERY_QP_SUPP_MASK \ + (IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT | \ + IB_QP_QKEY | IB_QP_SQ_PSN | IB_QP_CAP | IB_QP_RNR_RETRY) + + if (qp_attr_mask & ~EFA_QUERY_QP_SUPP_MASK) { + ibdev_dbg(&dev->ibdev, + "Unsupported qp_attr_mask[%#x] supported[%#x]\n", + qp_attr_mask, EFA_QUERY_QP_SUPP_MASK); + return -EOPNOTSUPP; + } + + memset(qp_attr, 0, sizeof(*qp_attr)); + memset(qp_init_attr, 0, sizeof(*qp_init_attr)); + + params.qp_handle = qp->qp_handle; + err = efa_com_query_qp(&dev->edev, ¶ms, &result); + if (err) + return err; + + qp_attr->qp_state = result.qp_state; + qp_attr->qkey = result.qkey; + qp_attr->sq_psn = result.sq_psn; + qp_attr->sq_draining = result.sq_draining; + qp_attr->port_num = 1; + qp_attr->rnr_retry = result.rnr_retry; + + qp_attr->cap.max_send_wr = qp->max_send_wr; + qp_attr->cap.max_recv_wr = qp->max_recv_wr; + qp_attr->cap.max_send_sge = qp->max_send_sge; + qp_attr->cap.max_recv_sge = qp->max_recv_sge; + qp_attr->cap.max_inline_data = qp->max_inline_data; + + qp_init_attr->qp_type = ibqp->qp_type; + qp_init_attr->recv_cq = ibqp->recv_cq; + qp_init_attr->send_cq = ibqp->send_cq; + qp_init_attr->qp_context = ibqp->qp_context; + qp_init_attr->cap = qp_attr->cap; + + return 0; +} + +int efa_query_gid(struct ib_device *ibdev, port_t port, int index, + union ib_gid *gid) +{ + struct efa_dev *dev = to_edev(ibdev); + + memcpy(gid->raw, dev->dev_attr.addr, sizeof(dev->dev_attr.addr)); + + return 0; +} + +int efa_query_pkey(struct ib_device *ibdev, port_t port, u16 index, + u16 *pkey) +{ + if (index > 0) + return -EINVAL; + + *pkey = 0xffff; + return 0; +} + +static int efa_pd_dealloc(struct efa_dev *dev, u16 pdn) +{ + struct efa_com_dealloc_pd_params params = { + .pdn = pdn, + }; + + return efa_com_dealloc_pd(&dev->edev, ¶ms); +} + +#ifdef HAVE_ALLOC_PD_NO_UCONTEXT +int efa_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +#else +int efa_alloc_pd(struct ib_pd *ibpd, + struct ib_ucontext *ibucontext, + struct ib_udata *udata) +#endif +{ + struct efa_dev *dev = to_edev(ibpd->device); + struct efa_ibv_alloc_pd_resp resp = {}; + struct efa_com_alloc_pd_result result; + struct efa_pd *pd = to_epd(ibpd); + int err; + +#ifndef HAVE_NO_KVERBS_DRIVERS + if (!udata) { + ibdev_dbg(&dev->ibdev, "udata is NULL\n"); + err = -EOPNOTSUPP; + goto err_out; 
+ } +#endif + + if (udata->inlen && + !ib_is_udata_cleared(udata, 0, udata->inlen)) { + ibdev_dbg(&dev->ibdev, + "Incompatible ABI params, udata not cleared\n"); + err = -EINVAL; + goto err_out; + } + + err = efa_com_alloc_pd(&dev->edev, &result); + if (err) + goto err_out; + + pd->pdn = result.pdn; + resp.pdn = result.pdn; + + if (udata->outlen) { + err = ib_copy_to_udata(udata, &resp, + min(sizeof(resp), udata->outlen)); + if (err) { + ibdev_dbg(&dev->ibdev, + "Failed to copy udata for alloc_pd\n"); + goto err_dealloc_pd; + } + } + + ibdev_dbg(&dev->ibdev, "Allocated pd[%d]\n", pd->pdn); + + return 0; + +err_dealloc_pd: + efa_pd_dealloc(dev, result.pdn); +err_out: + atomic64_inc(&dev->stats.alloc_pd_err); + return err; +} + +#ifndef HAVE_PD_CORE_ALLOCATION +struct ib_pd *efa_kzalloc_pd(struct ib_device *ibdev, + struct ib_ucontext *ibucontext, + struct ib_udata *udata) +{ + struct efa_dev *dev = to_edev(ibdev); + struct efa_pd *pd; + int err; + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) { + atomic64_inc(&dev->stats.alloc_pd_err); + return ERR_PTR(-ENOMEM); + } + + pd->ibpd.device = ibdev; + +#ifdef HAVE_ALLOC_PD_NO_UCONTEXT + err = efa_alloc_pd(&pd->ibpd, udata); +#else + err = efa_alloc_pd(&pd->ibpd, ibucontext, udata); +#endif + if (err) + goto err_free; + + return &pd->ibpd; + +err_free: + kfree(pd); + return ERR_PTR(err); +} +#endif + +#ifdef HAVE_DEALLOC_PD_UDATA_RC +int efa_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +#elif defined(HAVE_DEALLOC_PD_UDATA) +void efa_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +#elif defined(HAVE_PD_CORE_ALLOCATION) +void efa_dealloc_pd(struct ib_pd *ibpd) +#else +int efa_dealloc_pd(struct ib_pd *ibpd) +#endif +{ + struct efa_dev *dev = to_edev(ibpd->device); + struct efa_pd *pd = to_epd(ibpd); + + ibdev_dbg(&dev->ibdev, "Dealloc pd[%d]\n", pd->pdn); + efa_pd_dealloc(dev, pd->pdn); +#ifndef HAVE_PD_CORE_ALLOCATION + kfree(pd); + + return 0; +#elif defined(HAVE_DEALLOC_PD_UDATA_RC) + return 0; +#endif +} + +static int efa_destroy_qp_handle(struct efa_dev *dev, u32 qp_handle) +{ + struct efa_com_destroy_qp_params params = { .qp_handle = qp_handle }; + + return efa_com_destroy_qp(&dev->edev, ¶ms); +} + +static void efa_qp_user_mmap_entries_remove(struct efa_qp *qp) +{ + rdma_user_mmap_entry_remove(qp->rq_mmap_entry); + rdma_user_mmap_entry_remove(qp->rq_db_mmap_entry); + rdma_user_mmap_entry_remove(qp->llq_desc_mmap_entry); + rdma_user_mmap_entry_remove(qp->sq_db_mmap_entry); +} + +#ifdef HAVE_DESTROY_QP_UDATA +int efa_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) +#else +int efa_destroy_qp(struct ib_qp *ibqp) +#endif +{ + struct efa_dev *dev = to_edev(ibqp->pd->device); + struct efa_qp *qp = to_eqp(ibqp); + int err; + + ibdev_dbg(&dev->ibdev, "Destroy qp[%u]\n", ibqp->qp_num); + + efa_qp_user_mmap_entries_remove(qp); + + err = efa_destroy_qp_handle(dev, qp->qp_handle); + if (err) + return err; + + if (qp->rq_cpu_addr) { + ibdev_dbg(&dev->ibdev, + "qp->cpu_addr[0x%p] freed: size[%lu], dma[%pad]\n", + qp->rq_cpu_addr, qp->rq_size, + &qp->rq_dma_addr); + efa_free_mapped(dev, qp->rq_cpu_addr, qp->rq_dma_addr, + qp->rq_size, DMA_TO_DEVICE); + } + +#ifndef HAVE_QP_CORE_ALLOCATION + kfree(qp); +#endif + return 0; +} + +#ifdef HAVE_CORE_MMAP_XA +static struct rdma_user_mmap_entry* +efa_user_mmap_entry_insert(struct ib_ucontext *ucontext, + u64 address, size_t length, + u8 mmap_flag, u64 *offset) +{ + struct efa_user_mmap_entry *entry = kzalloc(sizeof(*entry), GFP_KERNEL); + int err; + + if (!entry) + return NULL; + + 
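+ /*
+ * The rdma core assigns the mmap offset when the entry is inserted
+ * below; only the address and mapping type are recorded here.
+ */
+ 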
entry->address = address; + entry->mmap_flag = mmap_flag; + + err = rdma_user_mmap_entry_insert(ucontext, &entry->rdma_entry, + length); + if (err) { + kfree(entry); + return NULL; + } + *offset = rdma_user_mmap_get_offset(&entry->rdma_entry); + + return &entry->rdma_entry; +} +#else +static struct rdma_user_mmap_entry * +efa_user_mmap_entry_insert(struct ib_ucontext *ibucontext, u64 address, + size_t length, u8 mmap_flag, u64 *offset) +{ + struct efa_ucontext *ucontext = to_eucontext(ibucontext); + struct efa_user_mmap_entry *entry; + u64 next_mmap_page; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return NULL; + + entry->address = address; + entry->rdma_entry.npages = (u32)DIV_ROUND_UP(length, PAGE_SIZE); + entry->mmap_flag = mmap_flag; + + mutex_lock(&ucontext->lock); + next_mmap_page = ucontext->mmap_page + (length >> PAGE_SHIFT); + if (next_mmap_page >= U32_MAX) { + ibdev_dbg(ucontext->ibucontext.device, "Too many mmap pages\n"); + mutex_unlock(&ucontext->lock); + kfree(entry); + return NULL; + } + + entry->rdma_entry.start_pgoff = ucontext->mmap_page; + ucontext->mmap_page = next_mmap_page; + list_add_tail(&entry->list, &ucontext->pending_mmaps); + mutex_unlock(&ucontext->lock); + + *offset = rdma_user_mmap_get_offset(&entry->rdma_entry); + ibdev_dbg( + ucontext->ibucontext.device, + "mmap: addr[%#llx], len[%#zx], key[%#llx] inserted\n", + entry->address, entry->rdma_entry.npages * PAGE_SIZE, + rdma_user_mmap_get_offset(&entry->rdma_entry)); + + return &entry->rdma_entry; +} +#endif + +static int qp_mmap_entries_setup(struct efa_qp *qp, + struct efa_dev *dev, + struct efa_ucontext *ucontext, + struct efa_com_create_qp_params *params, + struct efa_ibv_create_qp_resp *resp) +{ + size_t length; + u64 address; + + address = dev->db_bar_addr + resp->sq_db_offset; + qp->sq_db_mmap_entry = + efa_user_mmap_entry_insert(&ucontext->ibucontext, + address, + PAGE_SIZE, EFA_MMAP_IO_NC, + &resp->sq_db_mmap_key); + if (!qp->sq_db_mmap_entry) + return -ENOMEM; + + resp->sq_db_offset &= ~PAGE_MASK; + + address = dev->mem_bar_addr + resp->llq_desc_offset; + length = PAGE_ALIGN(params->sq_ring_size_in_bytes + + (resp->llq_desc_offset & ~PAGE_MASK)); + + qp->llq_desc_mmap_entry = + efa_user_mmap_entry_insert(&ucontext->ibucontext, + address, length, + EFA_MMAP_IO_WC, + &resp->llq_desc_mmap_key); + if (!qp->llq_desc_mmap_entry) + goto err_remove_mmap; + + resp->llq_desc_offset &= ~PAGE_MASK; + + if (qp->rq_size) { + address = dev->db_bar_addr + resp->rq_db_offset; + + qp->rq_db_mmap_entry = + efa_user_mmap_entry_insert(&ucontext->ibucontext, + address, PAGE_SIZE, + EFA_MMAP_IO_NC, + &resp->rq_db_mmap_key); + if (!qp->rq_db_mmap_entry) + goto err_remove_mmap; + + resp->rq_db_offset &= ~PAGE_MASK; + + address = virt_to_phys(qp->rq_cpu_addr); + qp->rq_mmap_entry = + efa_user_mmap_entry_insert(&ucontext->ibucontext, + address, qp->rq_size, + EFA_MMAP_DMA_PAGE, + &resp->rq_mmap_key); + if (!qp->rq_mmap_entry) + goto err_remove_mmap; + + resp->rq_mmap_size = qp->rq_size; + } + + return 0; + +err_remove_mmap: + efa_qp_user_mmap_entries_remove(qp); + + return -ENOMEM; +} + +static int efa_qp_validate_cap(struct efa_dev *dev, + struct ib_qp_init_attr *init_attr) +{ + if (init_attr->cap.max_send_wr > dev->dev_attr.max_sq_depth) { + ibdev_dbg(&dev->ibdev, + "qp: requested send wr[%u] exceeds the max[%u]\n", + init_attr->cap.max_send_wr, + dev->dev_attr.max_sq_depth); + return -EINVAL; + } + if (init_attr->cap.max_recv_wr > dev->dev_attr.max_rq_depth) { + ibdev_dbg(&dev->ibdev, + "qp: requested 
receive wr[%u] exceeds the max[%u]\n", + init_attr->cap.max_recv_wr, + dev->dev_attr.max_rq_depth); + return -EINVAL; + } + if (init_attr->cap.max_send_sge > dev->dev_attr.max_sq_sge) { + ibdev_dbg(&dev->ibdev, + "qp: requested sge send[%u] exceeds the max[%u]\n", + init_attr->cap.max_send_sge, dev->dev_attr.max_sq_sge); + return -EINVAL; + } + if (init_attr->cap.max_recv_sge > dev->dev_attr.max_rq_sge) { + ibdev_dbg(&dev->ibdev, + "qp: requested sge recv[%u] exceeds the max[%u]\n", + init_attr->cap.max_recv_sge, dev->dev_attr.max_rq_sge); + return -EINVAL; + } + if (init_attr->cap.max_inline_data > dev->dev_attr.inline_buf_size) { + ibdev_dbg(&dev->ibdev, + "qp: requested inline data[%u] exceeds the max[%u]\n", + init_attr->cap.max_inline_data, + dev->dev_attr.inline_buf_size); + return -EINVAL; + } + + return 0; +} + +static int efa_qp_validate_attr(struct efa_dev *dev, + struct ib_qp_init_attr *init_attr) +{ + if (init_attr->qp_type != IB_QPT_DRIVER && + init_attr->qp_type != IB_QPT_UD) { + ibdev_dbg(&dev->ibdev, + "Unsupported qp type %d\n", init_attr->qp_type); + return -EOPNOTSUPP; + } + + if (init_attr->srq) { + ibdev_dbg(&dev->ibdev, "SRQ is not supported\n"); + return -EOPNOTSUPP; + } + + if (init_attr->create_flags) { + ibdev_dbg(&dev->ibdev, "Unsupported create flags\n"); + return -EOPNOTSUPP; + } + + return 0; +} + +int efa_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct efa_com_create_qp_params create_qp_params = {}; + struct efa_com_create_qp_result create_qp_resp; + struct efa_dev *dev = to_edev(ibqp->device); + struct efa_ibv_create_qp_resp resp = {}; + struct efa_ibv_create_qp cmd = {}; + struct efa_qp *qp = to_eqp(ibqp); + struct efa_ucontext *ucontext; + int err; + +#ifndef HAVE_NO_KVERBS_DRIVERS + if (!udata) { + ibdev_dbg(&dev->ibdev, "udata is NULL\n"); + err = -EOPNOTSUPP; + goto err_out; + } +#endif + +#ifdef HAVE_UDATA_TO_DRV_CONTEXT + ucontext = rdma_udata_to_drv_context(udata, struct efa_ucontext, + ibucontext); +#else + ucontext = ibqp->pd->uobject ? 
to_eucontext(ibqp->pd->uobject->context) : + NULL; +#endif + + err = efa_qp_validate_cap(dev, init_attr); + if (err) + goto err_out; + + err = efa_qp_validate_attr(dev, init_attr); + if (err) + goto err_out; + + if (offsetofend(typeof(cmd), driver_qp_type) > udata->inlen) { + ibdev_dbg(&dev->ibdev, + "Incompatible ABI params, no input udata\n"); + err = -EINVAL; + goto err_out; + } + + if (udata->inlen > sizeof(cmd) && + !ib_is_udata_cleared(udata, sizeof(cmd), + udata->inlen - sizeof(cmd))) { + ibdev_dbg(&dev->ibdev, + "Incompatible ABI params, unknown fields in udata\n"); + err = -EINVAL; + goto err_out; + } + + err = ib_copy_from_udata(&cmd, udata, + min(sizeof(cmd), udata->inlen)); + if (err) { + ibdev_dbg(&dev->ibdev, + "Cannot copy udata for create_qp\n"); + goto err_out; + } + + if (cmd.comp_mask) { + ibdev_dbg(&dev->ibdev, + "Incompatible ABI params, unknown fields in udata\n"); + err = -EINVAL; + goto err_out; + } + + create_qp_params.uarn = ucontext->uarn; + create_qp_params.pd = to_epd(ibqp->pd)->pdn; + + if (init_attr->qp_type == IB_QPT_UD) { + create_qp_params.qp_type = EFA_ADMIN_QP_TYPE_UD; + } else if (cmd.driver_qp_type == EFA_QP_DRIVER_TYPE_SRD) { + create_qp_params.qp_type = EFA_ADMIN_QP_TYPE_SRD; + } else { + ibdev_dbg(&dev->ibdev, + "Unsupported qp type %d driver qp type %d\n", + init_attr->qp_type, cmd.driver_qp_type); + err = -EOPNOTSUPP; + goto err_out; + } + + ibdev_dbg(&dev->ibdev, "Create QP: qp type %d driver qp type %#x\n", + init_attr->qp_type, cmd.driver_qp_type); + create_qp_params.send_cq_idx = to_ecq(init_attr->send_cq)->cq_idx; + create_qp_params.recv_cq_idx = to_ecq(init_attr->recv_cq)->cq_idx; + create_qp_params.sq_depth = init_attr->cap.max_send_wr; + create_qp_params.sq_ring_size_in_bytes = cmd.sq_ring_size; + + create_qp_params.rq_depth = init_attr->cap.max_recv_wr; + create_qp_params.rq_ring_size_in_bytes = cmd.rq_ring_size; + qp->rq_size = PAGE_ALIGN(create_qp_params.rq_ring_size_in_bytes); + if (qp->rq_size) { + qp->rq_cpu_addr = efa_zalloc_mapped(dev, &qp->rq_dma_addr, + qp->rq_size, DMA_TO_DEVICE); + if (!qp->rq_cpu_addr) { + err = -ENOMEM; + goto err_out; + } + + ibdev_dbg(&dev->ibdev, + "qp->cpu_addr[0x%p] allocated: size[%lu], dma[%pad]\n", + qp->rq_cpu_addr, qp->rq_size, &qp->rq_dma_addr); + create_qp_params.rq_base_addr = qp->rq_dma_addr; + } + + err = efa_com_create_qp(&dev->edev, &create_qp_params, + &create_qp_resp); + if (err) + goto err_free_mapped; + + resp.sq_db_offset = create_qp_resp.sq_db_offset; + resp.rq_db_offset = create_qp_resp.rq_db_offset; + resp.llq_desc_offset = create_qp_resp.llq_descriptors_offset; + resp.send_sub_cq_idx = create_qp_resp.send_sub_cq_idx; + resp.recv_sub_cq_idx = create_qp_resp.recv_sub_cq_idx; + + err = qp_mmap_entries_setup(qp, dev, ucontext, &create_qp_params, + &resp); + if (err) + goto err_destroy_qp; + + qp->qp_handle = create_qp_resp.qp_handle; + qp->ibqp.qp_num = create_qp_resp.qp_num; + qp->max_send_wr = init_attr->cap.max_send_wr; + qp->max_recv_wr = init_attr->cap.max_recv_wr; + qp->max_send_sge = init_attr->cap.max_send_sge; + qp->max_recv_sge = init_attr->cap.max_recv_sge; + qp->max_inline_data = init_attr->cap.max_inline_data; + + if (udata->outlen) { + err = ib_copy_to_udata(udata, &resp, + min(sizeof(resp), udata->outlen)); + if (err) { + ibdev_dbg(&dev->ibdev, + "Failed to copy udata for qp[%u]\n", + create_qp_resp.qp_num); + goto err_remove_mmap_entries; + } + } + + ibdev_dbg(&dev->ibdev, "Created qp[%d]\n", qp->ibqp.qp_num); + + return 0; + +err_remove_mmap_entries: + 
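+ /*
+ * Unwind in reverse creation order: user mmap entries first, then the
+ * device QP handle, and finally the DMA-mapped RQ buffer.
+ */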
efa_qp_user_mmap_entries_remove(qp); +err_destroy_qp: + efa_destroy_qp_handle(dev, create_qp_resp.qp_handle); +err_free_mapped: + if (qp->rq_size) + efa_free_mapped(dev, qp->rq_cpu_addr, qp->rq_dma_addr, + qp->rq_size, DMA_TO_DEVICE); +err_out: + atomic64_inc(&dev->stats.create_qp_err); + return err; +} + +#ifndef HAVE_QP_CORE_ALLOCATION +struct ib_qp *efa_kzalloc_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct efa_dev *dev = to_edev(ibpd->device); + struct efa_qp *qp; + int err; + + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) { + atomic64_inc(&dev->stats.create_qp_err); + err = -ENOMEM; + goto err_out; + } + + qp->ibqp.device = ibpd->device; + qp->ibqp.pd = ibpd; + qp->ibqp.qp_type = init_attr->qp_type; + err = efa_create_qp(&qp->ibqp, init_attr, udata); + if (err) + goto err_free_qp; + + return &qp->ibqp; + +err_free_qp: + kfree(qp); +err_out: + return ERR_PTR(err); +} +#endif + +static const struct { + int valid; + enum ib_qp_attr_mask req_param; + enum ib_qp_attr_mask opt_param; +} srd_qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = { + [IB_QPS_RESET] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_INIT] = { + .valid = 1, + .req_param = IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_QKEY, + }, + }, + [IB_QPS_INIT] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_INIT] = { + .valid = 1, + .opt_param = IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_QKEY, + }, + [IB_QPS_RTR] = { + .valid = 1, + .opt_param = IB_QP_PKEY_INDEX | + IB_QP_QKEY, + }, + }, + [IB_QPS_RTR] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .req_param = IB_QP_SQ_PSN, + .opt_param = IB_QP_CUR_STATE | + IB_QP_QKEY | + IB_QP_RNR_RETRY, + + } + }, + [IB_QPS_RTS] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .opt_param = IB_QP_CUR_STATE | + IB_QP_QKEY, + }, + [IB_QPS_SQD] = { + .valid = 1, + .opt_param = IB_QP_EN_SQD_ASYNC_NOTIFY, + }, + }, + [IB_QPS_SQD] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .opt_param = IB_QP_CUR_STATE | + IB_QP_QKEY, + }, + [IB_QPS_SQD] = { + .valid = 1, + .opt_param = IB_QP_PKEY_INDEX | + IB_QP_QKEY, + } + }, + [IB_QPS_SQE] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .opt_param = IB_QP_CUR_STATE | + IB_QP_QKEY, + } + }, + [IB_QPS_ERR] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + } +}; + +static bool efa_modify_srd_qp_is_ok(enum ib_qp_state cur_state, + enum ib_qp_state next_state, + enum ib_qp_attr_mask mask) +{ + enum ib_qp_attr_mask req_param, opt_param; + + if (mask & IB_QP_CUR_STATE && + cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS && + cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE) + return false; + + if (!srd_qp_state_table[cur_state][next_state].valid) + return false; + + req_param = srd_qp_state_table[cur_state][next_state].req_param; + opt_param = srd_qp_state_table[cur_state][next_state].opt_param; + + if ((mask & req_param) != req_param) + return false; + + if (mask & ~(req_param | opt_param | IB_QP_STATE)) + return false; + + return true; +} + +static int efa_modify_qp_validate(struct efa_dev *dev, struct efa_qp *qp, + struct ib_qp_attr *qp_attr, int qp_attr_mask, + enum ib_qp_state cur_state, + enum ib_qp_state new_state) +{ + int err; + +#define EFA_MODIFY_QP_SUPP_MASK \ + (IB_QP_STATE | IB_QP_CUR_STATE | 
IB_QP_EN_SQD_ASYNC_NOTIFY | \ + IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_QKEY | IB_QP_SQ_PSN | \ + IB_QP_RNR_RETRY) + + if (qp_attr_mask & ~EFA_MODIFY_QP_SUPP_MASK) { + ibdev_dbg(&dev->ibdev, + "Unsupported qp_attr_mask[%#x] supported[%#x]\n", + qp_attr_mask, EFA_MODIFY_QP_SUPP_MASK); + return -EOPNOTSUPP; + } + + if (qp->ibqp.qp_type == IB_QPT_DRIVER) + err = !efa_modify_srd_qp_is_ok(cur_state, new_state, + qp_attr_mask); + else +#ifdef HAVE_IB_MODIFY_QP_IS_OK_FOUR_PARAMS + err = !ib_modify_qp_is_ok(cur_state, new_state, IB_QPT_UD, + qp_attr_mask); +#else + err = !ib_modify_qp_is_ok(cur_state, new_state, IB_QPT_UD, + qp_attr_mask, + IB_LINK_LAYER_UNSPECIFIED); +#endif + + if (err) { + ibdev_dbg(&dev->ibdev, "Invalid modify QP parameters\n"); + return -EINVAL; + } + + if ((qp_attr_mask & IB_QP_PORT) && qp_attr->port_num != 1) { + ibdev_dbg(&dev->ibdev, "Can't change port num\n"); + return -EOPNOTSUPP; + } + + if ((qp_attr_mask & IB_QP_PKEY_INDEX) && qp_attr->pkey_index) { + ibdev_dbg(&dev->ibdev, "Can't change pkey index\n"); + return -EOPNOTSUPP; + } + + return 0; +} + +int efa_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_udata *udata) +{ + struct efa_dev *dev = to_edev(ibqp->device); + struct efa_com_modify_qp_params params = {}; + struct efa_qp *qp = to_eqp(ibqp); + enum ib_qp_state cur_state; + enum ib_qp_state new_state; + int err; + +#ifndef HAVE_NO_KVERBS_DRIVERS + if (!udata) { + ibdev_dbg(&dev->ibdev, "udata is NULL\n"); + return -EOPNOTSUPP; + } +#endif + +#ifdef HAVE_UVERBS_CMD_MASK_NOT_NEEDED + if (qp_attr_mask & ~IB_QP_ATTR_STANDARD_BITS) + return -EOPNOTSUPP; +#endif + + if (udata->inlen && + !ib_is_udata_cleared(udata, 0, udata->inlen)) { + ibdev_dbg(&dev->ibdev, + "Incompatible ABI params, udata not cleared\n"); + return -EINVAL; + } + + cur_state = qp_attr_mask & IB_QP_CUR_STATE ? qp_attr->cur_qp_state : + qp->state; + new_state = qp_attr_mask & IB_QP_STATE ? 
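+ /* absent IB_QP_STATE, the QP remains in cur_state */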
qp_attr->qp_state : cur_state;
+
+ err = efa_modify_qp_validate(dev, qp, qp_attr, qp_attr_mask, cur_state,
+ new_state);
+ if (err)
+ return err;
+
+ params.qp_handle = qp->qp_handle;
+
+ if (qp_attr_mask & IB_QP_STATE) {
+ EFA_SET(&params.modify_mask, EFA_ADMIN_MODIFY_QP_CMD_QP_STATE,
+ 1);
+ EFA_SET(&params.modify_mask,
+ EFA_ADMIN_MODIFY_QP_CMD_CUR_QP_STATE, 1);
+ params.cur_qp_state = cur_state;
+ params.qp_state = new_state;
+ }
+
+ if (qp_attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) {
+ EFA_SET(&params.modify_mask,
+ EFA_ADMIN_MODIFY_QP_CMD_SQ_DRAINED_ASYNC_NOTIFY, 1);
+ params.sq_drained_async_notify = qp_attr->en_sqd_async_notify;
+ }
+
+ if (qp_attr_mask & IB_QP_QKEY) {
+ EFA_SET(&params.modify_mask, EFA_ADMIN_MODIFY_QP_CMD_QKEY, 1);
+ params.qkey = qp_attr->qkey;
+ }
+
+ if (qp_attr_mask & IB_QP_SQ_PSN) {
+ EFA_SET(&params.modify_mask, EFA_ADMIN_MODIFY_QP_CMD_SQ_PSN, 1);
+ params.sq_psn = qp_attr->sq_psn;
+ }
+
+ if (qp_attr_mask & IB_QP_RNR_RETRY) {
+ EFA_SET(&params.modify_mask, EFA_ADMIN_MODIFY_QP_CMD_RNR_RETRY,
+ 1);
+ params.rnr_retry = qp_attr->rnr_retry;
+ }
+
+ err = efa_com_modify_qp(&dev->edev, &params);
+ if (err)
+ return err;
+
+ qp->state = new_state;
+
+ return 0;
+}
+
+static int efa_destroy_cq_idx(struct efa_dev *dev, int cq_idx)
+{
+ struct efa_com_destroy_cq_params params = { .cq_idx = cq_idx };
+
+ return efa_com_destroy_cq(&dev->edev, &params);
+}
+
+static void efa_cq_user_mmap_entries_remove(struct efa_cq *cq)
+{
+ rdma_user_mmap_entry_remove(cq->db_mmap_entry);
+ rdma_user_mmap_entry_remove(cq->mmap_entry);
+}
+
+#if defined(HAVE_IB_VOID_DESTROY_CQ) || defined(HAVE_IB_INT_DESTROY_CQ)
+#ifdef HAVE_IB_INT_DESTROY_CQ
+int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
+#else
+void efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
+#endif
+{
+ struct efa_dev *dev = to_edev(ibcq->device);
+ struct efa_cq *cq = to_ecq(ibcq);
+
+ ibdev_dbg(&dev->ibdev,
+ "Destroy cq[%d] virt[0x%p] freed: size[%lu], dma[%pad]\n",
+ cq->cq_idx, cq->cpu_addr, cq->size, &cq->dma_addr);
+
+ efa_cq_user_mmap_entries_remove(cq);
+ efa_destroy_cq_idx(dev, cq->cq_idx);
+ if (cq->eq) {
+#ifdef HAVE_XARRAY
+ xa_erase(&dev->cqs_xa, cq->cq_idx);
+#else
+ dev->cqs_arr[cq->cq_idx] = NULL;
+#endif
+ synchronize_irq(cq->eq->irq.irqn);
+ }
+ efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size,
+ DMA_FROM_DEVICE);
+#ifndef HAVE_CQ_CORE_ALLOCATION
+ kfree(cq);
+#endif
+#ifdef HAVE_IB_INT_DESTROY_CQ
+ return 0;
+#endif
+}
+#else
+#ifdef HAVE_DESTROY_CQ_UDATA
+int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
+#else
+int efa_destroy_cq(struct ib_cq *ibcq)
+#endif
+{
+ struct efa_dev *dev = to_edev(ibcq->device);
+ struct efa_cq *cq = to_ecq(ibcq);
+ int err;
+
+ ibdev_dbg(&dev->ibdev,
+ "Destroy cq[%d] virt[0x%p] freed: size[%lu], dma[%pad]\n",
+ cq->cq_idx, cq->cpu_addr, cq->size, &cq->dma_addr);
+
+ efa_cq_user_mmap_entries_remove(cq);
+ err = efa_destroy_cq_idx(dev, cq->cq_idx);
+ if (err)
+ return err;
+
+ if (cq->eq) {
+#ifdef HAVE_XARRAY
+ xa_erase(&dev->cqs_xa, cq->cq_idx);
+#else
+ dev->cqs_arr[cq->cq_idx] = NULL;
+#endif
+ synchronize_irq(cq->eq->irq.irqn);
+ }
+ efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size,
+ DMA_FROM_DEVICE);
+
+ kfree(cq);
+ return 0;
+}
+#endif
+
+static struct efa_eq *efa_vec2eq(struct efa_dev *dev, int vec)
+{
+ return &dev->eqs[vec];
+}
+
+static int cq_mmap_entries_setup(struct efa_dev *dev, struct efa_cq *cq,
+ struct efa_ibv_create_cq_resp *resp,
+ bool db_valid)
+{
+ resp->q_mmap_size = cq->size;
+ cq->mmap_entry = 
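+ /* expose the CQ ring to user space as mmapable DMA pages */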
efa_user_mmap_entry_insert(&cq->ucontext->ibucontext, + virt_to_phys(cq->cpu_addr), + cq->size, EFA_MMAP_DMA_PAGE, + &resp->q_mmap_key); + if (!cq->mmap_entry) + return -ENOMEM; + + if (db_valid) { + cq->db_mmap_entry = + efa_user_mmap_entry_insert(&cq->ucontext->ibucontext, + dev->db_bar_addr + resp->db_off, + PAGE_SIZE, EFA_MMAP_IO_NC, + &resp->db_mmap_key); + if (!cq->db_mmap_entry) { + rdma_user_mmap_entry_remove(cq->mmap_entry); + return -ENOMEM; + } + + resp->db_off &= ~PAGE_MASK; + resp->comp_mask |= EFA_CREATE_CQ_RESP_DB_OFF; + } + + return 0; +} + +int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) +{ +#ifdef HAVE_UDATA_TO_DRV_CONTEXT + struct efa_ucontext *ucontext = rdma_udata_to_drv_context( + udata, struct efa_ucontext, ibucontext); +#else + struct efa_ucontext *ucontext = to_ecq(ibcq)->ucontext; +#endif + struct efa_com_create_cq_params params = {}; + struct efa_ibv_create_cq_resp resp = {}; + struct efa_com_create_cq_result result; + struct ib_device *ibdev = ibcq->device; + struct efa_dev *dev = to_edev(ibdev); + struct efa_ibv_create_cq cmd = {}; + struct efa_cq *cq = to_ecq(ibcq); + int entries = attr->cqe; + bool set_src_addr; + int err; + + ibdev_dbg(ibdev, "create_cq entries %d\n", entries); + + if (attr->flags) + return -EOPNOTSUPP; + + if (entries < 1 || entries > dev->dev_attr.max_cq_depth) { + ibdev_dbg(ibdev, + "cq: requested entries[%u] non-positive or greater than max[%u]\n", + entries, dev->dev_attr.max_cq_depth); + err = -EINVAL; + goto err_out; + } + +#ifndef HAVE_NO_KVERBS_DRIVERS + if (!udata) { + ibdev_dbg(ibdev, "udata is NULL\n"); + err = -EOPNOTSUPP; + goto err_out; + } +#endif + + if (offsetofend(typeof(cmd), num_sub_cqs) > udata->inlen) { + ibdev_dbg(ibdev, + "Incompatible ABI params, no input udata\n"); + err = -EINVAL; + goto err_out; + } + + if (udata->inlen > sizeof(cmd) && + !ib_is_udata_cleared(udata, sizeof(cmd), + udata->inlen - sizeof(cmd))) { + ibdev_dbg(ibdev, + "Incompatible ABI params, unknown fields in udata\n"); + err = -EINVAL; + goto err_out; + } + + err = ib_copy_from_udata(&cmd, udata, + min(sizeof(cmd), udata->inlen)); + if (err) { + ibdev_dbg(ibdev, "Cannot copy udata for create_cq\n"); + goto err_out; + } + + if (cmd.comp_mask || !is_reserved_cleared(cmd.reserved_58)) { + ibdev_dbg(ibdev, + "Incompatible ABI params, unknown fields in udata\n"); + err = -EINVAL; + goto err_out; + } + + set_src_addr = !!(cmd.flags & EFA_CREATE_CQ_WITH_SGID); + if ((cmd.cq_entry_size != sizeof(struct efa_io_rx_cdesc_ex)) && + (set_src_addr || cmd.cq_entry_size != sizeof(struct efa_io_rx_cdesc))) { + ibdev_dbg(ibdev, + "Invalid entry size [%u]\n", cmd.cq_entry_size); + err = -EINVAL; + goto err_out; + } + + if (cmd.num_sub_cqs != dev->dev_attr.sub_cqs_per_cq) { + ibdev_dbg(ibdev, + "Invalid number of sub cqs[%u] expected[%u]\n", + cmd.num_sub_cqs, dev->dev_attr.sub_cqs_per_cq); + err = -EINVAL; + goto err_out; + } + + cq->ucontext = ucontext; + cq->size = PAGE_ALIGN(cmd.cq_entry_size * entries * cmd.num_sub_cqs); + cq->cpu_addr = efa_zalloc_mapped(dev, &cq->dma_addr, cq->size, + DMA_FROM_DEVICE); + if (!cq->cpu_addr) { + err = -ENOMEM; + goto err_out; + } + + params.uarn = cq->ucontext->uarn; + params.cq_depth = entries; + params.dma_addr = cq->dma_addr; + params.entry_size_in_bytes = cmd.cq_entry_size; + params.num_sub_cqs = cmd.num_sub_cqs; + params.set_src_addr = set_src_addr; + if (cmd.flags & EFA_CREATE_CQ_WITH_COMPLETION_CHANNEL) { + cq->eq = efa_vec2eq(dev, attr->comp_vector); + params.eqn = 
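+ /* interrupt mode: deliver completion events through the selected EQ */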
cq->eq->eeq.eqn;
+ params.interrupt_mode_enabled = true;
+ }
+
+ err = efa_com_create_cq(&dev->edev, &params, &result);
+ if (err)
+ goto err_free_mapped;
+
+ resp.db_off = result.db_off;
+ resp.cq_idx = result.cq_idx;
+ cq->cq_idx = result.cq_idx;
+ cq->ibcq.cqe = result.actual_depth;
+ WARN_ON_ONCE(entries != result.actual_depth);
+
+ err = cq_mmap_entries_setup(dev, cq, &resp, result.db_valid);
+ if (err) {
+ ibdev_dbg(ibdev, "Could not setup cq[%u] mmap entries\n",
+ cq->cq_idx);
+ goto err_destroy_cq;
+ }
+
+ if (cq->eq) {
+#ifdef HAVE_XARRAY
+ err = xa_err(xa_store(&dev->cqs_xa, cq->cq_idx, cq, GFP_KERNEL));
+#else
+ dev->cqs_arr[cq->cq_idx] = cq;
+#endif
+ if (err) {
+ ibdev_dbg(ibdev, "Failed to store cq[%u] in xarray\n",
+ cq->cq_idx);
+ goto err_remove_mmap;
+ }
+ }
+
+ if (udata->outlen) {
+ err = ib_copy_to_udata(udata, &resp,
+ min(sizeof(resp), udata->outlen));
+ if (err) {
+ ibdev_dbg(ibdev,
+ "Failed to copy udata for create_cq\n");
+ goto err_xa_erase;
+ }
+ }
+
+ ibdev_dbg(ibdev, "Created cq[%d], cq depth[%u]. dma[%pad] virt[0x%p]\n",
+ cq->cq_idx, result.actual_depth, &cq->dma_addr, cq->cpu_addr);
+
+ return 0;
+
+err_xa_erase:
+ if (cq->eq)
+#ifdef HAVE_XARRAY
+ xa_erase(&dev->cqs_xa, cq->cq_idx);
+#else
+ dev->cqs_arr[cq->cq_idx] = NULL;
+#endif
+err_remove_mmap:
+ efa_cq_user_mmap_entries_remove(cq);
+err_destroy_cq:
+ efa_destroy_cq_idx(dev, cq->cq_idx);
+err_free_mapped:
+ efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size,
+ DMA_FROM_DEVICE);
+
+err_out:
+ atomic64_inc(&dev->stats.create_cq_err);
+ return err;
+}
+
+#ifndef HAVE_CQ_CORE_ALLOCATION
+#ifdef HAVE_CREATE_CQ_NO_UCONTEXT
+struct ib_cq *efa_kzalloc_cq(struct ib_device *ibdev,
+ const struct ib_cq_init_attr *attr,
+ struct ib_udata *udata)
+#elif defined(HAVE_CREATE_CQ_ATTR)
+struct ib_cq *efa_kzalloc_cq(struct ib_device *ibdev,
+ const struct ib_cq_init_attr *attr,
+ struct ib_ucontext *ibucontext,
+ struct ib_udata *udata)
+#endif
+{
+ struct efa_dev *dev = to_edev(ibdev);
+ struct efa_cq *cq;
+ int err;
+
+ cq = kzalloc(sizeof(*cq), GFP_KERNEL);
+ if (!cq) {
+ atomic64_inc(&dev->stats.create_cq_err);
+ return ERR_PTR(-ENOMEM);
+ }
+
+#ifdef HAVE_UDATA_TO_DRV_CONTEXT
+ cq->ucontext = rdma_udata_to_drv_context(udata, struct efa_ucontext,
+ ibucontext);
+#else
+ cq->ucontext = to_eucontext(ibucontext);
+#endif
+
+ cq->ibcq.device = ibdev;
+ err = efa_create_cq(&cq->ibcq, attr, udata);
+ if (err)
+ goto err_free_cq;
+
+ return &cq->ibcq;
+
+err_free_cq:
+ kfree(cq);
+ return ERR_PTR(err);
+}
+#endif
+
+#ifdef HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE
+static int umem_to_page_list(struct efa_dev *dev,
+ struct ib_umem *umem,
+ u64 *page_list,
+ u32 hp_cnt,
+ u8 hp_shift)
+{
+ u32 pages_in_hp = BIT(hp_shift - PAGE_SHIFT);
+ struct ib_block_iter biter;
+ unsigned int hp_idx = 0;
+
+ ibdev_dbg(&dev->ibdev, "hp_cnt[%u], pages_in_hp[%u]\n",
+ hp_cnt, pages_in_hp);
+
+ rdma_umem_for_each_dma_block(umem, &biter, BIT(hp_shift))
+ page_list[hp_idx++] = rdma_block_iter_dma_address(&biter);
+
+ return 0;
+}
+#elif defined(HAVE_SG_DMA_PAGE_ITER)
+static int umem_to_page_list(struct efa_dev *dev,
+ struct ib_umem *umem,
+ u64 *page_list,
+ u32 hp_cnt,
+ u8 hp_shift)
+{
+ u32 pages_in_hp = BIT(hp_shift - PAGE_SHIFT);
+ struct sg_dma_page_iter sg_iter;
+ unsigned int page_idx = 0;
+ unsigned int hp_idx = 0;
+
+ ibdev_dbg(&dev->ibdev, "hp_cnt[%u], pages_in_hp[%u]\n",
+ hp_cnt, pages_in_hp);
+
+ for_each_sg_dma_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
+ if (page_idx % pages_in_hp == 0) {
+ page_list[hp_idx] = 
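+ /* record one DMA address per huge page */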
sg_page_iter_dma_address(&sg_iter); + hp_idx++; + } + + page_idx++; + } + + return 0; +} +#elif defined(HAVE_UMEM_SCATTERLIST_IF) +static int umem_to_page_list(struct efa_dev *dev, + struct ib_umem *umem, + u64 *page_list, + u32 hp_cnt, + u8 hp_shift) +{ + u32 pages_in_hp = BIT(hp_shift - PAGE_SHIFT); + unsigned int page_idx = 0; + unsigned int pages_in_sg; + unsigned int hp_idx = 0; + struct scatterlist *sg; + unsigned int entry; + unsigned int i; + + ibdev_dbg(&dev->ibdev, "hp_cnt[%u], pages_in_hp[%u]\n", + hp_cnt, pages_in_hp); + + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + if (sg_dma_len(sg) & ~PAGE_MASK) { + ibdev_dbg(&dev->ibdev, + "sg_dma_len[%u] does not divide by PAGE_SIZE[%lu]\n", + sg_dma_len(sg), PAGE_SIZE); + return -EINVAL; + } + + pages_in_sg = sg_dma_len(sg) >> PAGE_SHIFT; + for (i = 0; i < pages_in_sg; i++) { + if (page_idx % pages_in_hp == 0) { + page_list[hp_idx] = sg_dma_address(sg) + + i * PAGE_SIZE; + hp_idx++; + } + + page_idx++; + } + } + + return 0; +} +#endif + +static struct scatterlist *efa_vmalloc_buf_to_sg(u64 *buf, int page_cnt) +{ + struct scatterlist *sglist; + struct page *pg; + int i; + + sglist = kmalloc_array(page_cnt, sizeof(*sglist), GFP_KERNEL); + if (!sglist) + return NULL; + sg_init_table(sglist, page_cnt); + for (i = 0; i < page_cnt; i++) { + pg = vmalloc_to_page(buf); + if (!pg) + goto err; + sg_set_page(&sglist[i], pg, PAGE_SIZE, 0); + buf += PAGE_SIZE / sizeof(*buf); + } + return sglist; + +err: + kfree(sglist); + return NULL; +} + +/* + * create a chunk list of physical pages dma addresses from the supplied + * scatter gather list + */ +static int pbl_chunk_list_create(struct efa_dev *dev, struct pbl_context *pbl) +{ + struct pbl_chunk_list *chunk_list = &pbl->phys.indirect.chunk_list; + int page_cnt = pbl->phys.indirect.pbl_buf_size_in_pages; + struct scatterlist *pages_sgl = pbl->phys.indirect.sgl; + unsigned int chunk_list_size, chunk_idx, payload_idx; + int sg_dma_cnt = pbl->phys.indirect.sg_dma_cnt; + struct efa_com_ctrl_buff_info *ctrl_buf; + u64 *cur_chunk_buf, *prev_chunk_buf; +#ifdef HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE + struct ib_block_iter biter; +#else + struct scatterlist *sg; + unsigned int entry, payloads_in_sg; +#endif + dma_addr_t dma_addr; + int i; + + /* allocate a chunk list that consists of 4KB chunks */ + chunk_list_size = DIV_ROUND_UP(page_cnt, EFA_PTRS_PER_CHUNK); + + chunk_list->size = chunk_list_size; + chunk_list->chunks = kcalloc(chunk_list_size, + sizeof(*chunk_list->chunks), + GFP_KERNEL); + if (!chunk_list->chunks) + return -ENOMEM; + + ibdev_dbg(&dev->ibdev, + "chunk_list_size[%u] - pages[%u]\n", chunk_list_size, + page_cnt); + + /* allocate chunk buffers: */ + for (i = 0; i < chunk_list_size; i++) { + chunk_list->chunks[i].buf = kzalloc(EFA_CHUNK_SIZE, GFP_KERNEL); + if (!chunk_list->chunks[i].buf) + goto chunk_list_dealloc; + + chunk_list->chunks[i].length = EFA_CHUNK_USED_SIZE; + } + chunk_list->chunks[chunk_list_size - 1].length = + ((page_cnt % EFA_PTRS_PER_CHUNK) * EFA_CHUNK_PAYLOAD_PTR_SIZE) + + EFA_CHUNK_PTR_SIZE; + + /* fill the dma addresses of sg list pages to chunks: */ + chunk_idx = 0; + payload_idx = 0; + cur_chunk_buf = chunk_list->chunks[0].buf; +#ifdef HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE + rdma_for_each_block(pages_sgl, &biter, sg_dma_cnt, + EFA_CHUNK_PAYLOAD_SIZE) { + cur_chunk_buf[payload_idx++] = + rdma_block_iter_dma_address(&biter); + + if (payload_idx == EFA_PTRS_PER_CHUNK) { + chunk_idx++; + cur_chunk_buf = chunk_list->chunks[chunk_idx].buf; + payload_idx = 0; + } + } +#else + 
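+ /*
+ * No rdma_for_each_block() on this kernel: walk the scatterlist by
+ * hand and emit one EFA_CHUNK_PAYLOAD_SIZE-aligned address per
+ * payload block.
+ */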
for_each_sg(pages_sgl, sg, sg_dma_cnt, entry) { + payloads_in_sg = sg_dma_len(sg) >> EFA_CHUNK_PAYLOAD_SHIFT; + for (i = 0; i < payloads_in_sg; i++) { + cur_chunk_buf[payload_idx++] = + (sg_dma_address(sg) & ~(EFA_CHUNK_PAYLOAD_SIZE - 1)) + + (EFA_CHUNK_PAYLOAD_SIZE * i); + + if (payload_idx == EFA_PTRS_PER_CHUNK) { + chunk_idx++; + cur_chunk_buf = chunk_list->chunks[chunk_idx].buf; + payload_idx = 0; + } + } + } +#endif + + /* map chunks to dma and fill chunks next ptrs */ + for (i = chunk_list_size - 1; i >= 0; i--) { + dma_addr = dma_map_single(&dev->pdev->dev, + chunk_list->chunks[i].buf, + chunk_list->chunks[i].length, + DMA_TO_DEVICE); + if (dma_mapping_error(&dev->pdev->dev, dma_addr)) { + ibdev_err(&dev->ibdev, + "chunk[%u] dma_map_failed\n", i); + goto chunk_list_unmap; + } + + chunk_list->chunks[i].dma_addr = dma_addr; + ibdev_dbg(&dev->ibdev, + "chunk[%u] mapped at [%pad]\n", i, &dma_addr); + + if (!i) + break; + + prev_chunk_buf = chunk_list->chunks[i - 1].buf; + + ctrl_buf = (struct efa_com_ctrl_buff_info *) + &prev_chunk_buf[EFA_PTRS_PER_CHUNK]; + ctrl_buf->length = chunk_list->chunks[i].length; + + efa_com_set_dma_addr(dma_addr, + &ctrl_buf->address.mem_addr_high, + &ctrl_buf->address.mem_addr_low); + } + + return 0; + +chunk_list_unmap: + for (; i < chunk_list_size; i++) { + dma_unmap_single(&dev->pdev->dev, chunk_list->chunks[i].dma_addr, + chunk_list->chunks[i].length, DMA_TO_DEVICE); + } +chunk_list_dealloc: + for (i = 0; i < chunk_list_size; i++) + kfree(chunk_list->chunks[i].buf); + + kfree(chunk_list->chunks); + return -ENOMEM; +} + +static void pbl_chunk_list_destroy(struct efa_dev *dev, struct pbl_context *pbl) +{ + struct pbl_chunk_list *chunk_list = &pbl->phys.indirect.chunk_list; + int i; + + for (i = 0; i < chunk_list->size; i++) { + dma_unmap_single(&dev->pdev->dev, chunk_list->chunks[i].dma_addr, + chunk_list->chunks[i].length, DMA_TO_DEVICE); + kfree(chunk_list->chunks[i].buf); + } + + kfree(chunk_list->chunks); +} + +/* initialize pbl continuous mode: map pbl buffer to a dma address. */ +static int pbl_continuous_initialize(struct efa_dev *dev, + struct pbl_context *pbl) +{ + dma_addr_t dma_addr; + + dma_addr = dma_map_single(&dev->pdev->dev, pbl->pbl_buf, + pbl->pbl_buf_size_in_bytes, DMA_TO_DEVICE); + if (dma_mapping_error(&dev->pdev->dev, dma_addr)) { + ibdev_err(&dev->ibdev, "Unable to map pbl to DMA address\n"); + return -ENOMEM; + } + + pbl->phys.continuous.dma_addr = dma_addr; + ibdev_dbg(&dev->ibdev, + "pbl continuous - dma_addr = %pad, size[%u]\n", + &dma_addr, pbl->pbl_buf_size_in_bytes); + + return 0; +} + +/* + * initialize pbl indirect mode: + * create a chunk list out of the dma addresses of the physical pages of + * pbl buffer. 
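+ * Each chunk is a 4KB buffer holding EFA_PTRS_PER_CHUNK page
+ * addresses plus a control entry that points at the next chunk's DMA
+ * address.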
+ */ +static int pbl_indirect_initialize(struct efa_dev *dev, struct pbl_context *pbl) +{ + u32 size_in_pages = DIV_ROUND_UP(pbl->pbl_buf_size_in_bytes, PAGE_SIZE); + struct scatterlist *sgl; + int sg_dma_cnt, err; + + BUILD_BUG_ON(EFA_CHUNK_PAYLOAD_SIZE > PAGE_SIZE); + sgl = efa_vmalloc_buf_to_sg(pbl->pbl_buf, size_in_pages); + if (!sgl) + return -ENOMEM; + + sg_dma_cnt = dma_map_sg(&dev->pdev->dev, sgl, size_in_pages, DMA_TO_DEVICE); + if (!sg_dma_cnt) { + err = -EINVAL; + goto err_map; + } + + pbl->phys.indirect.pbl_buf_size_in_pages = size_in_pages; + pbl->phys.indirect.sgl = sgl; + pbl->phys.indirect.sg_dma_cnt = sg_dma_cnt; + err = pbl_chunk_list_create(dev, pbl); + if (err) { + ibdev_dbg(&dev->ibdev, + "chunk_list creation failed[%d]\n", err); + goto err_chunk; + } + + ibdev_dbg(&dev->ibdev, + "pbl indirect - size[%u], chunks[%u]\n", + pbl->pbl_buf_size_in_bytes, + pbl->phys.indirect.chunk_list.size); + + return 0; + +err_chunk: + dma_unmap_sg(&dev->pdev->dev, sgl, size_in_pages, DMA_TO_DEVICE); +err_map: + kfree(sgl); + return err; +} + +static void pbl_indirect_terminate(struct efa_dev *dev, struct pbl_context *pbl) +{ + pbl_chunk_list_destroy(dev, pbl); + dma_unmap_sg(&dev->pdev->dev, pbl->phys.indirect.sgl, + pbl->phys.indirect.pbl_buf_size_in_pages, DMA_TO_DEVICE); + kfree(pbl->phys.indirect.sgl); +} + +/* create a page buffer list from a mapped user memory region */ +static int pbl_create(struct efa_dev *dev, + struct pbl_context *pbl, +#ifdef HAVE_EFA_P2P + struct efa_mr *mr, +#else + struct ib_umem *umem, +#endif + int hp_cnt, + u8 hp_shift) +{ + int err; + + pbl->pbl_buf_size_in_bytes = hp_cnt * EFA_CHUNK_PAYLOAD_PTR_SIZE; + pbl->pbl_buf = kvzalloc(pbl->pbl_buf_size_in_bytes, GFP_KERNEL); + if (!pbl->pbl_buf) + return -ENOMEM; + + if (is_vmalloc_addr(pbl->pbl_buf)) { + pbl->physically_continuous = 0; +#ifdef HAVE_EFA_P2P + if (mr->p2pmem) + err = efa_p2p_to_page_list(dev, mr->p2pmem, pbl->pbl_buf); + else + err = umem_to_page_list(dev, mr->umem, pbl->pbl_buf, hp_cnt, + hp_shift); +#else + err = umem_to_page_list(dev, umem, pbl->pbl_buf, hp_cnt, + hp_shift); +#endif + if (err) + goto err_free; + + err = pbl_indirect_initialize(dev, pbl); + if (err) + goto err_free; + } else { + pbl->physically_continuous = 1; +#ifdef HAVE_EFA_P2P + if (mr->p2pmem) + err = efa_p2p_to_page_list(dev, mr->p2pmem, pbl->pbl_buf); + else + err = umem_to_page_list(dev, mr->umem, pbl->pbl_buf, hp_cnt, + hp_shift); +#else + err = umem_to_page_list(dev, umem, pbl->pbl_buf, hp_cnt, + hp_shift); +#endif + if (err) + goto err_free; + + err = pbl_continuous_initialize(dev, pbl); + if (err) + goto err_free; + } + + ibdev_dbg(&dev->ibdev, + "user_pbl_created: user_pages[%u], continuous[%u]\n", + hp_cnt, pbl->physically_continuous); + + return 0; + +err_free: + kvfree(pbl->pbl_buf); + return err; +} + +static void pbl_destroy(struct efa_dev *dev, struct pbl_context *pbl) +{ + if (pbl->physically_continuous) + dma_unmap_single(&dev->pdev->dev, pbl->phys.continuous.dma_addr, + pbl->pbl_buf_size_in_bytes, DMA_TO_DEVICE); + else + pbl_indirect_terminate(dev, pbl); + + kvfree(pbl->pbl_buf); +} + +static int efa_create_inline_pbl(struct efa_dev *dev, struct efa_mr *mr, + struct efa_com_reg_mr_params *params) +{ + int err; + + params->inline_pbl = 1; +#ifdef HAVE_EFA_P2P + if (mr->p2pmem) + err = efa_p2p_to_page_list(dev, mr->p2pmem, + params->pbl.inline_pbl_array); + else + err = umem_to_page_list(dev, mr->umem, params->pbl.inline_pbl_array, + params->page_num, params->page_shift); +#else + err = 
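+ /* built without P2P support, so the pages always come from the umem */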
umem_to_page_list(dev, mr->umem, params->pbl.inline_pbl_array,
+ params->page_num, params->page_shift);
+#endif
+ if (err)
+ return err;
+
+ ibdev_dbg(&dev->ibdev,
+ "inline_pbl_array - pages[%u]\n", params->page_num);
+
+ return 0;
+}
+
+static int efa_create_pbl(struct efa_dev *dev,
+ struct pbl_context *pbl,
+ struct efa_mr *mr,
+ struct efa_com_reg_mr_params *params)
+{
+ int err;
+
+#ifdef HAVE_EFA_P2P
+ err = pbl_create(dev, pbl, mr, params->page_num,
+ params->page_shift);
+#else
+ err = pbl_create(dev, pbl, mr->umem, params->page_num,
+ params->page_shift);
+#endif
+ if (err) {
+ ibdev_dbg(&dev->ibdev, "Failed to create pbl[%d]\n", err);
+ return err;
+ }
+
+ params->inline_pbl = 0;
+ params->indirect = !pbl->physically_continuous;
+ if (pbl->physically_continuous) {
+ params->pbl.pbl.length = pbl->pbl_buf_size_in_bytes;
+
+ efa_com_set_dma_addr(pbl->phys.continuous.dma_addr,
+ &params->pbl.pbl.address.mem_addr_high,
+ &params->pbl.pbl.address.mem_addr_low);
+ } else {
+ params->pbl.pbl.length =
+ pbl->phys.indirect.chunk_list.chunks[0].length;
+
+ efa_com_set_dma_addr(pbl->phys.indirect.chunk_list.chunks[0].dma_addr,
+ &params->pbl.pbl.address.mem_addr_high,
+ &params->pbl.pbl.address.mem_addr_low);
+ }
+
+ return 0;
+}
+
+#ifndef HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE
+static unsigned long efa_cont_pages(struct ib_umem *umem,
+ unsigned long page_size_cap,
+ u64 addr)
+{
+ unsigned long max_page_shift = fls64(page_size_cap);
+ struct scatterlist *sg;
+ u64 base = ~0, p = 0;
+ unsigned long tmp;
+ unsigned long m;
+ u64 len, pfn;
+ int i = 0;
+ int entry;
+
+ addr = addr >> PAGE_SHIFT;
+ tmp = (unsigned long)addr;
+ m = find_first_bit(&tmp, BITS_PER_LONG);
+ m = min_t(unsigned long, max_page_shift - PAGE_SHIFT, m);
+
+ for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
+ len = DIV_ROUND_UP(sg_dma_len(sg), PAGE_SIZE);
+ pfn = sg_dma_address(sg) >> PAGE_SHIFT;
+ if (base + p != pfn) {
+ /*
+ * If either the offset or the new
+ * base are unaligned update m
+ */
+ tmp = (unsigned long)(pfn | p);
+ if (!IS_ALIGNED(tmp, 1 << m))
+ m = find_first_bit(&tmp, BITS_PER_LONG);
+
+ base = pfn;
+ p = 0;
+ }
+
+ p += len;
+ i += len;
+ }
+
+ if (i)
+ m = min_t(unsigned long, ilog2(roundup_pow_of_two(i)), m);
+ else
+ m = 0;
+
+ return BIT(PAGE_SHIFT + m);
+}
+#endif
+
+static struct efa_mr *efa_alloc_mr(struct ib_pd *ibpd, int access_flags,
+ struct ib_udata *udata)
+{
+ struct efa_dev *dev = to_edev(ibpd->device);
+ int supp_access_flags;
+ struct efa_mr *mr;
+
+#ifndef HAVE_NO_KVERBS_DRIVERS
+ if (!udata) {
+ ibdev_dbg(&dev->ibdev, "udata is NULL\n");
+ return ERR_PTR(-EINVAL);
+ }
+#endif
+
+ if (udata && udata->inlen &&
+ !ib_is_udata_cleared(udata, 0, sizeof(udata->inlen))) {
+ ibdev_dbg(&dev->ibdev,
+ "Incompatible ABI params, udata not cleared\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ supp_access_flags =
+ IB_ACCESS_LOCAL_WRITE |
+ (EFA_DEV_CAP(dev, RDMA_READ) ? 
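+ /* remote read is advertised only when the device reports RDMA_READ */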
IB_ACCESS_REMOTE_READ : 0);
+
+#ifdef HAVE_IB_ACCESS_OPTIONAL
+ access_flags &= ~IB_ACCESS_OPTIONAL;
+#endif
+ if (access_flags & ~supp_access_flags) {
+ ibdev_dbg(&dev->ibdev,
+ "Unsupported access flags[%#x], supported[%#x]\n",
+ access_flags, supp_access_flags);
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ return mr;
+}
+
+static int efa_register_mr(struct ib_pd *ibpd, struct efa_mr *mr, u64 start,
+ u64 length, u64 virt_addr, int access_flags)
+{
+ struct efa_dev *dev = to_edev(ibpd->device);
+ struct efa_com_reg_mr_params params = {};
+ struct efa_com_reg_mr_result result = {};
+ struct pbl_context pbl;
+ unsigned int pg_sz;
+ int inline_size;
+ int err;
+
+ params.pd = to_epd(ibpd)->pdn;
+ params.iova = virt_addr;
+ params.mr_length_in_bytes = length;
+ params.permissions = access_flags;
+
+#ifdef HAVE_EFA_P2P
+ if (mr->p2pmem) {
+ pg_sz = efa_p2p_get_page_size(dev, mr->p2pmem);
+ goto skip_umem_pg_sz;
+ }
+#endif
+
+#ifdef HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE
+ pg_sz = ib_umem_find_best_pgsz(mr->umem,
+ dev->dev_attr.page_size_cap,
+ virt_addr);
+ if (!pg_sz) {
+ ibdev_dbg(&dev->ibdev, "Failed to find a suitable page size in page_size_cap %#llx\n",
+ dev->dev_attr.page_size_cap);
+ return -EOPNOTSUPP;
+ }
+#else
+ pg_sz = efa_cont_pages(mr->umem, dev->dev_attr.page_size_cap,
+ virt_addr);
+#endif /* defined(HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE) */
+
+#ifdef HAVE_EFA_P2P
+skip_umem_pg_sz:
+#endif
+ params.page_shift = order_base_2(pg_sz);
+#ifdef HAVE_IB_UMEM_NUM_DMA_BLOCKS
+#ifdef HAVE_EFA_P2P
+ if (mr->p2pmem)
+ params.page_num = DIV_ROUND_UP(length +
+ (virt_addr & (pg_sz - 1)),
+ pg_sz);
+ else
+ params.page_num = ib_umem_num_dma_blocks(mr->umem, pg_sz);
+#else
+ params.page_num = ib_umem_num_dma_blocks(mr->umem, pg_sz);
+#endif
+#else
+ params.page_num = DIV_ROUND_UP(length + (virt_addr & (pg_sz - 1)),
+ pg_sz);
+#endif
+
+ ibdev_dbg(&dev->ibdev,
+ "start %#llx length %#llx params.page_shift %u params.page_num %u\n",
+ start, length, params.page_shift, params.page_num);
+
+ inline_size = ARRAY_SIZE(params.pbl.inline_pbl_array);
+ if (params.page_num <= inline_size) {
+ err = efa_create_inline_pbl(dev, mr, &params);
+ if (err)
+ return err;
+
+ err = efa_com_register_mr(&dev->edev, &params, &result);
+ if (err)
+ return err;
+ } else {
+ err = efa_create_pbl(dev, &pbl, mr, &params);
+ if (err)
+ return err;
+
+ err = efa_com_register_mr(&dev->edev, &params, &result);
+ pbl_destroy(dev, &pbl);
+
+ if (err)
+ return err;
+ }
+
+ mr->ibmr.lkey = result.l_key;
+ mr->ibmr.rkey = result.r_key;
+#ifdef HAVE_IB_MR_LENGTH
+ mr->ibmr.length = length;
+#endif
+#ifdef HAVE_EFA_P2P
+ if (mr->p2pmem) {
+ mr->p2pmem->lkey = result.l_key;
+ mr->p2pmem->needs_dereg = true;
+ }
+#endif
+ ibdev_dbg(&dev->ibdev, "Registered mr[%d]\n", mr->ibmr.lkey);
+
+ return 0;
+}
+
+#ifdef HAVE_MR_DMABUF
+struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start,
+ u64 length, u64 virt_addr,
+ int fd, int access_flags,
+ struct ib_udata *udata)
+{
+ struct efa_dev *dev = to_edev(ibpd->device);
+ struct ib_umem_dmabuf *umem_dmabuf;
+ struct efa_mr *mr;
+ int err;
+
+ mr = efa_alloc_mr(ibpd, access_flags, udata);
+ if (IS_ERR(mr)) {
+ err = PTR_ERR(mr);
+ goto err_out;
+ }
+
+ umem_dmabuf = ib_umem_dmabuf_get_pinned(ibpd->device, start, length, fd,
+ access_flags);
+ if (IS_ERR(umem_dmabuf)) {
+ err = PTR_ERR(umem_dmabuf);
+ ibdev_dbg(&dev->ibdev, "Failed to get dmabuf umem[%d]\n", err);
+ goto err_free;
+ }
+
+ mr->umem = &umem_dmabuf->umem;
+ err = 
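+ /* a pinned dmabuf registers through the same path as a regular umem */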
efa_register_mr(ibpd, mr, start, length, virt_addr, access_flags);
+ if (err)
+ goto err_release;
+
+ return &mr->ibmr;
+
+err_release:
+#ifndef HAVE_IB_UMEM_DMABUF_PINNED
+ dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
+ dma_buf_unpin(umem_dmabuf->attach);
+ dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
+#endif
+ ib_umem_release(mr->umem);
+err_free:
+ kfree(mr);
+err_out:
+ atomic64_inc(&dev->stats.reg_mr_err);
+ return ERR_PTR(err);
+}
+#endif
+
+struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
+ u64 virt_addr, int access_flags,
+ struct ib_udata *udata)
+{
+ struct efa_dev *dev = to_edev(ibpd->device);
+ struct efa_mr *mr;
+ int err;
+
+ mr = efa_alloc_mr(ibpd, access_flags, udata);
+ if (IS_ERR(mr)) {
+ err = PTR_ERR(mr);
+ goto err_out;
+ }
+
+#ifdef HAVE_IB_UMEM_GET_DEVICE_PARAM
+ mr->umem = ib_umem_get(ibpd->device, start, length, access_flags);
+#elif defined(HAVE_IB_UMEM_GET_NO_DMASYNC)
+ mr->umem = ib_umem_get(udata, start, length, access_flags);
+#elif defined(HAVE_IB_UMEM_GET_UDATA)
+ mr->umem = ib_umem_get(udata, start, length, access_flags, 0);
+#else
+ mr->umem = ib_umem_get(ibpd->uobject->context, start, length,
+ access_flags, 0);
+#endif
+ if (IS_ERR(mr->umem)) {
+#ifdef HAVE_EFA_P2P
+ mr->p2pmem = efa_p2p_get(dev, mr, start, length);
+ if (mr->p2pmem) {
+ /* Avoid referencing an error-pointer later on */
+ mr->umem = NULL;
+ goto reg_mr;
+ }
+#endif
+ err = PTR_ERR(mr->umem);
+ ibdev_dbg(&dev->ibdev,
+ "Failed to pin and map user space memory[%d]\n", err);
+ goto err_free;
+ }
+
+#ifdef HAVE_EFA_P2P
+reg_mr:
+#endif
+ err = efa_register_mr(ibpd, mr, start, length, virt_addr, access_flags);
+ if (err)
+ goto err_release;
+
+ return &mr->ibmr;
+
+err_release:
+#ifdef HAVE_EFA_P2P
+ if (mr->p2pmem)
+ efa_p2p_put(mr->p2pmem->ticket, false);
+ else
+ ib_umem_release(mr->umem);
+#else
+ ib_umem_release(mr->umem);
+#endif
+err_free:
+ kfree(mr);
+err_out:
+ atomic64_inc(&dev->stats.reg_mr_err);
+ return ERR_PTR(err);
+}
+
+#ifdef HAVE_DEREG_MR_UDATA
+int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
+#else
+int efa_dereg_mr(struct ib_mr *ibmr)
+#endif
+{
+ struct efa_dev *dev = to_edev(ibmr->device);
+ struct efa_com_dereg_mr_params params;
+ struct efa_mr *mr = to_emr(ibmr);
+ int err;
+
+ ibdev_dbg(&dev->ibdev, "Deregister mr[%d]\n", ibmr->lkey);
+
+#ifdef HAVE_EFA_P2P
+ if (mr->p2pmem) {
+ err = efa_p2p_put(mr->p2p_ticket, false);
+ if (err)
+ return err;
+
+ kfree(mr);
+ return 0;
+ }
+#endif
+ params.l_key = mr->ibmr.lkey;
+ err = efa_com_dereg_mr(&dev->edev, &params);
+ if (err)
+ return err;
+
+#if defined(HAVE_MR_DMABUF) && !defined(HAVE_IB_UMEM_DMABUF_PINNED)
+ if (mr->umem->is_dmabuf) {
+ struct ib_umem_dmabuf *umem_dmabuf;
+
+ umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
+ dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
+ dma_buf_unpin(umem_dmabuf->attach);
+ dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
+ }
+#endif
+
+ ib_umem_release(mr->umem);
+ kfree(mr);
+
+ return 0;
+}
+
+int efa_get_port_immutable(struct ib_device *ibdev, port_t port_num,
+ struct ib_port_immutable *immutable)
+{
+ struct ib_port_attr attr;
+ int err;
+
+ err = ib_query_port(ibdev, port_num, &attr);
+ if (err) {
+ ibdev_dbg(ibdev, "Couldn't query port err[%d]\n", err);
+ return err;
+ }
+
+ immutable->pkey_tbl_len = attr.pkey_tbl_len;
+ immutable->gid_tbl_len = attr.gid_tbl_len;
+
+ return 0;
+}
+
+static int efa_dealloc_uar(struct efa_dev *dev, u16 uarn)
+{
+ struct efa_com_dealloc_uar_params params = {
+ .uarn = uarn,
+ };
+
+ return 
efa_com_dealloc_uar(&dev->edev, &params);
+}
+
+#define EFA_CHECK_USER_COMP(_dev, _comp_mask, _attr, _mask, _attr_str) \
+ (_attr_str = (!(_dev)->dev_attr._attr || ((_comp_mask) & (_mask))) ? \
+ NULL : #_attr)
+
+static int efa_user_comp_handshake(const struct ib_ucontext *ibucontext,
+ const struct efa_ibv_alloc_ucontext_cmd *cmd)
+{
+ struct efa_dev *dev = to_edev(ibucontext->device);
+ char *attr_str;
+
+ if (EFA_CHECK_USER_COMP(dev, cmd->comp_mask, max_tx_batch,
+ EFA_ALLOC_UCONTEXT_CMD_COMP_TX_BATCH, attr_str))
+ goto err;
+
+ if (EFA_CHECK_USER_COMP(dev, cmd->comp_mask, min_sq_depth,
+ EFA_ALLOC_UCONTEXT_CMD_COMP_MIN_SQ_WR,
+ attr_str))
+ goto err;
+
+ return 0;
+
+err:
+ ibdev_dbg(&dev->ibdev, "Userspace handshake failed for %s attribute\n",
+ attr_str);
+ return -EOPNOTSUPP;
+}
+
+int efa_alloc_ucontext(struct ib_ucontext *ibucontext, struct ib_udata *udata)
+{
+ struct efa_ucontext *ucontext = to_eucontext(ibucontext);
+ struct efa_dev *dev = to_edev(ibucontext->device);
+ struct efa_ibv_alloc_ucontext_resp resp = {};
+ struct efa_ibv_alloc_ucontext_cmd cmd = {};
+ struct efa_com_alloc_uar_result result;
+ int err;
+
+ /*
+ * it's fine if the driver does not know all request fields,
+ * we will ack input fields in our response.
+ */
+
+ err = ib_copy_from_udata(&cmd, udata,
+ min(sizeof(cmd), udata->inlen));
+ if (err) {
+ ibdev_dbg(&dev->ibdev,
+ "Cannot copy udata for alloc_ucontext\n");
+ goto err_out;
+ }
+
+ err = efa_user_comp_handshake(ibucontext, &cmd);
+ if (err)
+ goto err_out;
+
+ err = efa_com_alloc_uar(&dev->edev, &result);
+ if (err)
+ goto err_out;
+
+ ucontext->uarn = result.uarn;
+#ifndef HAVE_CORE_MMAP_XA
+ mutex_init(&ucontext->lock);
+ INIT_LIST_HEAD(&ucontext->pending_mmaps);
+#endif /* !defined(HAVE_CORE_MMAP_XA) */
+
+ resp.cmds_supp_udata_mask |= EFA_USER_CMDS_SUPP_UDATA_QUERY_DEVICE;
+ resp.cmds_supp_udata_mask |= EFA_USER_CMDS_SUPP_UDATA_CREATE_AH;
+ resp.sub_cqs_per_cq = dev->dev_attr.sub_cqs_per_cq;
+ resp.inline_buf_size = dev->dev_attr.inline_buf_size;
+ resp.max_llq_size = dev->dev_attr.max_llq_size;
+ resp.max_tx_batch = dev->dev_attr.max_tx_batch;
+ resp.min_sq_wr = dev->dev_attr.min_sq_depth;
+
+ err = ib_copy_to_udata(udata, &resp,
+ min(sizeof(resp), udata->outlen));
+ if (err)
+ goto err_dealloc_uar;
+
+ return 0;
+
+err_dealloc_uar:
+ efa_dealloc_uar(dev, result.uarn);
+err_out:
+ atomic64_inc(&dev->stats.alloc_ucontext_err);
+ return err;
+}
+
+#ifndef HAVE_UCONTEXT_CORE_ALLOCATION
+struct ib_ucontext *efa_kzalloc_ucontext(struct ib_device *ibdev,
+ struct ib_udata *udata)
+{
+ struct efa_dev *dev = to_edev(ibdev);
+ struct efa_ucontext *ucontext;
+ int err;
+
+ /*
+ * it's fine if the driver does not know all request fields,
+ * we will ack input fields in our response.
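+ * The comp_mask handshake fails only if the device exposes an
+ * attribute that the caller's comp_mask does not acknowledge.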
+ */ + + ucontext = kzalloc(sizeof(*ucontext), GFP_KERNEL); + if (!ucontext) { + atomic64_inc(&dev->stats.alloc_ucontext_err); + return ERR_PTR(-ENOMEM); + } + + ucontext->ibucontext.device = ibdev; + err = efa_alloc_ucontext(&ucontext->ibucontext, udata); + if (err) + goto err_free_ucontext; + + return &ucontext->ibucontext; + +err_free_ucontext: + kfree(ucontext); + return ERR_PTR(err); +} +#endif + +#ifdef HAVE_UCONTEXT_CORE_ALLOCATION +void efa_dealloc_ucontext(struct ib_ucontext *ibucontext) +#else +int efa_dealloc_ucontext(struct ib_ucontext *ibucontext) +#endif +{ + struct efa_ucontext *ucontext = to_eucontext(ibucontext); + struct efa_dev *dev = to_edev(ibucontext->device); + +#ifndef HAVE_CORE_MMAP_XA + mmap_entries_remove_free(dev, ucontext); +#endif + efa_dealloc_uar(dev, ucontext->uarn); +#ifndef HAVE_UCONTEXT_CORE_ALLOCATION + kfree(ucontext); + + return 0; +#endif +} + +#ifdef HAVE_CORE_MMAP_XA +void efa_mmap_free(struct rdma_user_mmap_entry *rdma_entry) +{ + struct efa_user_mmap_entry *entry = to_emmap(rdma_entry); + + kfree(entry); +} +#endif + +static int __efa_mmap(struct efa_dev *dev, struct efa_ucontext *ucontext, + struct vm_area_struct *vma) +{ + struct rdma_user_mmap_entry *rdma_entry; + struct efa_user_mmap_entry *entry; + unsigned long va; + int err = 0; + u64 pfn; + + rdma_entry = rdma_user_mmap_entry_get(&ucontext->ibucontext, vma); + if (!rdma_entry) { + ibdev_dbg(&dev->ibdev, + "pgoff[%#lx] does not have valid entry\n", + vma->vm_pgoff); + atomic64_inc(&dev->stats.mmap_err); + return -EINVAL; + } + entry = to_emmap(rdma_entry); + + ibdev_dbg(&dev->ibdev, + "Mapping address[%#llx], length[%#zx], mmap_flag[%d]\n", + entry->address, rdma_entry->npages * PAGE_SIZE, + entry->mmap_flag); + + pfn = entry->address >> PAGE_SHIFT; + switch (entry->mmap_flag) { + case EFA_MMAP_IO_NC: +#ifdef HAVE_CORE_MMAP_XA + err = rdma_user_mmap_io(&ucontext->ibucontext, vma, pfn, + entry->rdma_entry.npages * PAGE_SIZE, + pgprot_noncached(vma->vm_page_prot), + rdma_entry); +#elif defined(HAVE_RDMA_USER_MMAP_IO) + err = rdma_user_mmap_io(&ucontext->ibucontext, vma, pfn, + entry->rdma_entry.npages * PAGE_SIZE, + pgprot_noncached(vma->vm_page_prot)); +#else + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + err = io_remap_pfn_range(vma, vma->vm_start, pfn, + entry->rdma_entry.npages * PAGE_SIZE, + vma->vm_page_prot); +#endif + break; + case EFA_MMAP_IO_WC: +#ifdef HAVE_CORE_MMAP_XA + err = rdma_user_mmap_io(&ucontext->ibucontext, vma, pfn, + entry->rdma_entry.npages * PAGE_SIZE, + pgprot_writecombine(vma->vm_page_prot), + rdma_entry); +#elif defined(HAVE_RDMA_USER_MMAP_IO) + err = rdma_user_mmap_io(&ucontext->ibucontext, vma, pfn, + entry->rdma_entry.npages * PAGE_SIZE, + pgprot_writecombine(vma->vm_page_prot)); +#else + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + err = io_remap_pfn_range(vma, vma->vm_start, pfn, + entry->rdma_entry.npages * PAGE_SIZE, + vma->vm_page_prot); +#endif + break; + case EFA_MMAP_DMA_PAGE: + for (va = vma->vm_start; va < vma->vm_end; + va += PAGE_SIZE, pfn++) { + err = vm_insert_page(vma, va, pfn_to_page(pfn)); + if (err) + break; + } + break; + default: + err = -EINVAL; + } + + if (err) { + ibdev_dbg( + &dev->ibdev, + "Couldn't mmap address[%#llx] length[%#zx] mmap_flag[%d] err[%d]\n", + entry->address, rdma_entry->npages * PAGE_SIZE, + entry->mmap_flag, err); + atomic64_inc(&dev->stats.mmap_err); + } + + rdma_user_mmap_entry_put(rdma_entry); + return err; +} + +int efa_mmap(struct ib_ucontext *ibucontext, + struct vm_area_struct 
*vma)
+{
+ struct efa_ucontext *ucontext = to_eucontext(ibucontext);
+ struct efa_dev *dev = to_edev(ibucontext->device);
+ size_t length = vma->vm_end - vma->vm_start;
+
+ ibdev_dbg(&dev->ibdev,
+ "start %#lx, end %#lx, length = %#zx, pgoff = %#lx\n",
+ vma->vm_start, vma->vm_end, length, vma->vm_pgoff);
+
+ return __efa_mmap(dev, ucontext, vma);
+}
+
+static int efa_ah_destroy(struct efa_dev *dev, struct efa_ah *ah)
+{
+ struct efa_com_destroy_ah_params params = {
+ .ah = ah->ah,
+ .pdn = to_epd(ah->ibah.pd)->pdn,
+ };
+
+ return efa_com_destroy_ah(&dev->edev, &params);
+}
+
+int efa_create_ah(struct ib_ah *ibah,
+#ifdef HAVE_CREATE_AH_INIT_ATTR
+ struct rdma_ah_init_attr *init_attr,
+#else
+ struct rdma_ah_attr *ah_attr,
+ u32 flags,
+#endif
+ struct ib_udata *udata)
+{
+#ifdef HAVE_CREATE_AH_INIT_ATTR
+ struct rdma_ah_attr *ah_attr = init_attr->ah_attr;
+#endif
+ struct efa_dev *dev = to_edev(ibah->device);
+ struct efa_com_create_ah_params params = {};
+ struct efa_ibv_create_ah_resp resp = {};
+ struct efa_com_create_ah_result result;
+ struct efa_ah *ah = to_eah(ibah);
+ int err;
+
+#if defined(HAVE_CREATE_DESTROY_AH_FLAGS) || defined(HAVE_CREATE_AH_INIT_ATTR)
+#ifdef HAVE_CREATE_AH_INIT_ATTR
+ if (!(init_attr->flags & RDMA_CREATE_AH_SLEEPABLE)) {
+#else
+ if (!(flags & RDMA_CREATE_AH_SLEEPABLE)) {
+#endif
+ ibdev_dbg(&dev->ibdev,
+ "Create address handle is not supported in atomic context\n");
+ err = -EOPNOTSUPP;
+ goto err_out;
+ }
+#endif
+
+#ifndef HAVE_NO_KVERBS_DRIVERS
+ if (!udata) {
+ ibdev_dbg(&dev->ibdev, "udata is NULL\n");
+ err = -EOPNOTSUPP;
+ goto err_out;
+ }
+#endif
+
+ if (udata->inlen &&
+ !ib_is_udata_cleared(udata, 0, udata->inlen)) {
+ ibdev_dbg(&dev->ibdev, "Incompatible ABI params\n");
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ memcpy(params.dest_addr, ah_attr->grh.dgid.raw,
+ sizeof(params.dest_addr));
+ params.pdn = to_epd(ibah->pd)->pdn;
+ err = efa_com_create_ah(&dev->edev, &params, &result);
+ if (err)
+ goto err_out;
+
+ memcpy(ah->id, ah_attr->grh.dgid.raw, sizeof(ah->id));
+ ah->ah = result.ah;
+
+ resp.efa_address_handle = result.ah;
+
+ if (udata->outlen) {
+ err = ib_copy_to_udata(udata, &resp,
+ min(sizeof(resp), udata->outlen));
+ if (err) {
+ ibdev_dbg(&dev->ibdev,
+ "Failed to copy udata for create_ah response\n");
+ goto err_destroy_ah;
+ }
+ }
+ ibdev_dbg(&dev->ibdev, "Created ah[%d]\n", ah->ah);
+
+ return 0;
+
+err_destroy_ah:
+ efa_ah_destroy(dev, ah);
+err_out:
+ atomic64_inc(&dev->stats.create_ah_err);
+ return err;
+}
+
+#ifndef HAVE_AH_CORE_ALLOCATION
+#ifdef HAVE_CREATE_DESTROY_AH_FLAGS
+struct ib_ah *efa_kzalloc_ah(struct ib_pd *ibpd,
+ struct rdma_ah_attr *ah_attr,
+ u32 flags,
+ struct ib_udata *udata)
+#elif defined(HAVE_CREATE_AH_RDMA_ATTR)
+struct ib_ah *efa_kzalloc_ah(struct ib_pd *ibpd,
+ struct rdma_ah_attr *ah_attr,
+ struct ib_udata *udata)
+#endif
+{
+ struct efa_ah *ah;
+ int err;
+#ifndef HAVE_CREATE_DESTROY_AH_FLAGS
+ u32 flags = 0;
+#endif
+
+ ah = kzalloc(sizeof(*ah), GFP_KERNEL);
+ if (!ah)
+ return ERR_PTR(-ENOMEM);
+
+ ah->ibah.device = ibpd->device;
+ ah->ibah.pd = ibpd;
+ err = efa_create_ah(&ah->ibah, ah_attr, flags, udata);
+ if (err)
+ goto err_free;
+
+ return &ah->ibah;
+
+err_free:
+ kfree(ah);
+ return ERR_PTR(err);
+}
+#endif
+
+#ifdef HAVE_AH_CORE_ALLOCATION_DESTROY_RC
+int efa_destroy_ah(struct ib_ah *ibah, u32 flags)
+#elif defined(HAVE_AH_CORE_ALLOCATION)
+void efa_destroy_ah(struct ib_ah *ibah, u32 flags)
+#elif defined(HAVE_CREATE_DESTROY_AH_FLAGS)
+int efa_destroy_ah(struct ib_ah *ibah, u32 flags)
+#else
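+/* oldest variant: no flags argument, the AH is freed manually */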
+int efa_destroy_ah(struct ib_ah *ibah)
+#endif
+{
+ struct efa_dev *dev = to_edev(ibah->pd->device);
+ struct efa_ah *ah = to_eah(ibah);
+#if !defined(HAVE_AH_CORE_ALLOCATION) && !defined(HAVE_AH_CORE_ALLOCATION_DESTROY_RC)
+ int err;
+#endif
+
+ ibdev_dbg(&dev->ibdev, "Destroy ah[%d]\n", ah->ah);
+
+#if defined(HAVE_CREATE_DESTROY_AH_FLAGS)
+ if (!(flags & RDMA_DESTROY_AH_SLEEPABLE)) {
+ ibdev_dbg(&dev->ibdev,
+ "Destroy address handle is not supported in atomic context\n");
+#if defined(HAVE_AH_CORE_ALLOCATION) && !defined(HAVE_AH_CORE_ALLOCATION_DESTROY_RC)
+ return;
+#else
+ return -EOPNOTSUPP;
+#endif
+ }
+#endif
+
+#if defined(HAVE_AH_CORE_ALLOCATION) || defined(HAVE_AH_CORE_ALLOCATION_DESTROY_RC)
+ efa_ah_destroy(dev, ah);
+#ifdef HAVE_AH_CORE_ALLOCATION_DESTROY_RC
+ return 0;
+#endif
+#else
+ err = efa_ah_destroy(dev, ah);
+ if (err)
+ return err;
+ kfree(ah);
+ return 0;
+#endif
+}
+
+#ifdef HAVE_SPLIT_STATS_ALLOC
+struct rdma_hw_stats *efa_alloc_hw_port_stats(struct ib_device *ibdev,
+ port_t port_num)
+{
+ return rdma_alloc_hw_stats_struct(efa_port_stats_descs,
+ ARRAY_SIZE(efa_port_stats_descs),
+ RDMA_HW_STATS_DEFAULT_LIFESPAN);
+}
+
+struct rdma_hw_stats *efa_alloc_hw_device_stats(struct ib_device *ibdev)
+{
+ return rdma_alloc_hw_stats_struct(efa_device_stats_descs,
+ ARRAY_SIZE(efa_device_stats_descs),
+ RDMA_HW_STATS_DEFAULT_LIFESPAN);
+}
+#else
+struct rdma_hw_stats *efa_alloc_hw_stats(struct ib_device *ibdev, port_t port_num)
+{
+ if (port_num)
+ return rdma_alloc_hw_stats_struct(efa_port_stats_descs,
+ ARRAY_SIZE(efa_port_stats_descs),
+ RDMA_HW_STATS_DEFAULT_LIFESPAN);
+ else
+ return rdma_alloc_hw_stats_struct(efa_device_stats_descs,
+ ARRAY_SIZE(efa_device_stats_descs),
+ RDMA_HW_STATS_DEFAULT_LIFESPAN);
+}
+#endif
+
+static int efa_fill_device_stats(struct efa_dev *dev,
+ struct rdma_hw_stats *stats)
+{
+ struct efa_com_stats_admin *as = &dev->edev.aq.stats;
+ struct efa_stats *s = &dev->stats;
+
+ stats->value[EFA_SUBMITTED_CMDS] = atomic64_read(&as->submitted_cmd);
+ stats->value[EFA_COMPLETED_CMDS] = atomic64_read(&as->completed_cmd);
+ stats->value[EFA_CMDS_ERR] = atomic64_read(&as->cmd_err);
+ stats->value[EFA_NO_COMPLETION_CMDS] = atomic64_read(&as->no_completion);
+
+ stats->value[EFA_KEEP_ALIVE_RCVD] = atomic64_read(&s->keep_alive_rcvd);
+ stats->value[EFA_ALLOC_PD_ERR] = atomic64_read(&s->alloc_pd_err);
+ stats->value[EFA_CREATE_QP_ERR] = atomic64_read(&s->create_qp_err);
+ stats->value[EFA_CREATE_CQ_ERR] = atomic64_read(&s->create_cq_err);
+ stats->value[EFA_REG_MR_ERR] = atomic64_read(&s->reg_mr_err);
+ stats->value[EFA_ALLOC_UCONTEXT_ERR] =
+ atomic64_read(&s->alloc_ucontext_err);
+ stats->value[EFA_CREATE_AH_ERR] = atomic64_read(&s->create_ah_err);
+ stats->value[EFA_MMAP_ERR] = atomic64_read(&s->mmap_err);
+
+ return ARRAY_SIZE(efa_device_stats_descs);
+}
+
+static int efa_fill_port_stats(struct efa_dev *dev, struct rdma_hw_stats *stats,
+ port_t port_num)
+{
+ struct efa_com_get_stats_params params = {};
+ union efa_com_get_stats_result result;
+ struct efa_com_rdma_read_stats *rrs;
+ struct efa_com_messages_stats *ms;
+ struct efa_com_basic_stats *bs;
+ int err;
+
+ params.scope = EFA_ADMIN_GET_STATS_SCOPE_ALL;
+ params.type = EFA_ADMIN_GET_STATS_TYPE_BASIC;
+
+ err = efa_com_get_stats(&dev->edev, &params, &result);
+ if (err)
+ return err;
+
+ bs = &result.basic_stats;
+ stats->value[EFA_TX_BYTES] = bs->tx_bytes;
+ stats->value[EFA_TX_PKTS] = bs->tx_pkts;
+ stats->value[EFA_RX_BYTES] = bs->rx_bytes;
+ stats->value[EFA_RX_PKTS] = bs->rx_pkts;
+ 
stats->value[EFA_RX_DROPS] = bs->rx_drops;
+
+ params.type = EFA_ADMIN_GET_STATS_TYPE_MESSAGES;
+ err = efa_com_get_stats(&dev->edev, &params, &result);
+ if (err)
+ return err;
+
+ ms = &result.messages_stats;
+ stats->value[EFA_SEND_BYTES] = ms->send_bytes;
+ stats->value[EFA_SEND_WRS] = ms->send_wrs;
+ stats->value[EFA_RECV_BYTES] = ms->recv_bytes;
+ stats->value[EFA_RECV_WRS] = ms->recv_wrs;
+
+ params.type = EFA_ADMIN_GET_STATS_TYPE_RDMA_READ;
+ err = efa_com_get_stats(&dev->edev, &params, &result);
+ if (err)
+ return err;
+
+ rrs = &result.rdma_read_stats;
+ stats->value[EFA_RDMA_READ_WRS] = rrs->read_wrs;
+ stats->value[EFA_RDMA_READ_BYTES] = rrs->read_bytes;
+ stats->value[EFA_RDMA_READ_WR_ERR] = rrs->read_wr_err;
+ stats->value[EFA_RDMA_READ_RESP_BYTES] = rrs->read_resp_bytes;
+
+ return ARRAY_SIZE(efa_port_stats_descs);
+}
+
+int efa_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats,
+ port_t port_num, int index)
+{
+ if (port_num)
+ return efa_fill_port_stats(to_edev(ibdev), stats, port_num);
+ else
+ return efa_fill_device_stats(to_edev(ibdev), stats);
+}
+
+#ifndef HAVE_NO_KVERBS_DRIVERS
+#ifdef HAVE_POST_CONST_WR
+int efa_post_send(struct ib_qp *ibqp,
+ const struct ib_send_wr *wr,
+ const struct ib_send_wr **bad_wr)
+#else
+int efa_post_send(struct ib_qp *ibqp,
+ struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr)
+#endif
+{
+ struct efa_dev *dev = to_edev(ibqp->device);
+
+ ibdev_warn(&dev->ibdev.dev, "Function not supported\n");
+ return -EOPNOTSUPP;
+}
+
+#ifdef HAVE_POST_CONST_WR
+int efa_post_recv(struct ib_qp *ibqp,
+ const struct ib_recv_wr *wr,
+ const struct ib_recv_wr **bad_wr)
+#else
+int efa_post_recv(struct ib_qp *ibqp,
+ struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+#endif
+{
+ struct efa_dev *dev = to_edev(ibqp->device);
+
+ ibdev_warn(&dev->ibdev.dev, "Function not supported\n");
+ return -EOPNOTSUPP;
+}
+
+int efa_poll_cq(struct ib_cq *ibcq, int num_entries,
+ struct ib_wc *wc)
+{
+ struct efa_dev *dev = to_edev(ibcq->device);
+
+ ibdev_warn(&dev->ibdev.dev, "Function not supported\n");
+ return -EOPNOTSUPP;
+}
+
+int efa_req_notify_cq(struct ib_cq *ibcq,
+ enum ib_cq_notify_flags flags)
+{
+ struct efa_dev *dev = to_edev(ibcq->device);
+
+ ibdev_warn(&dev->ibdev.dev, "Function not supported\n");
+ return -EOPNOTSUPP;
+}
+
+struct ib_mr *efa_get_dma_mr(struct ib_pd *ibpd, int acc)
+{
+ struct efa_dev *dev = to_edev(ibpd->device);
+
+ ibdev_warn(&dev->ibdev.dev, "Function not supported\n");
+ return ERR_PTR(-EOPNOTSUPP);
+}
+#endif
+
+enum rdma_link_layer efa_port_link_layer(struct ib_device *ibdev,
+ port_t port_num)
+{
+ return IB_LINK_LAYER_UNSPECIFIED;
+}
+
diff --git a/drivers/amazon/net/efa/kcompat.h b/drivers/amazon/net/efa/kcompat.h
new file mode 100644
index 0000000000000..713dcc00b394c
--- /dev/null
+++ b/drivers/amazon/net/efa/kcompat.h
@@ -0,0 +1,243 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
+/*
+ * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */ + +#ifndef _KCOMPAT_H_ +#define _KCOMPAT_H_ + +#include + +#include "config.h" + +#ifndef HAVE_IB_IS_UDATA_CLEARED +#include +#include +#include + +static inline bool ib_is_udata_cleared(struct ib_udata *udata, + size_t offset, + size_t len) +{ + const void __user *p = udata->inbuf + offset; + bool ret = false; + u8 *buf; + + if (len > USHRT_MAX) + return false; + + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return false; + + if (copy_from_user(buf, p, len)) + goto free; + + ret = !memchr_inv(buf, 0, len); + +free: + kfree(buf); + return ret; +} +#endif + +#ifndef HAVE_IB_QPT_DRIVER +#define IB_QPT_DRIVER 0xFF +#endif + +#if defined(HAVE_DRIVER_ID) && !defined(HAVE_UPSTREAM_EFA) +#define RDMA_DRIVER_EFA 17 +#endif + +#ifndef HAVE_IBDEV_PRINT +#define ibdev_err(_ibdev, format, arg...) \ + dev_err(&((struct ib_device *)(_ibdev))->dev, format, ##arg) +#define ibdev_dbg(_ibdev, format, arg...) \ + dev_dbg(&((struct ib_device *)(_ibdev))->dev, format, ##arg) +#define ibdev_warn(_ibdev, format, arg...) \ + dev_warn(&((struct ib_device *)(_ibdev))->dev, format, ##arg) +#define ibdev_info(_ibdev, format, arg...) \ + dev_info(&((struct ib_device *)(_ibdev))->dev, format, ##arg) +#endif + +#ifndef HAVE_IBDEV_PRINT_RATELIMITED +#define ibdev_err_ratelimited(_ibdev, format, arg...) \ + dev_err_ratelimited(&((struct ib_device *)(_ibdev))->dev, format, ##arg) +#define ibdev_dbg_ratelimited(_ibdev, format, arg...) \ + dev_dbg_ratelimited(&((struct ib_device *)(_ibdev))->dev, format, ##arg) +#define ibdev_warn_ratelimited(_ibdev, format, arg...) \ + dev_warn_ratelimited(&((struct ib_device *)(_ibdev))->dev, format, ##arg) +#define ibdev_info_ratelimited(_ibdev, format, arg...) \ + dev_info_ratelimited(&((struct ib_device *)(_ibdev))->dev, format, ##arg) +#endif + +#ifndef HAVE_KVZALLOC +#include +#include + +static inline void *kvzalloc(size_t size, gfp_t flags) +{ + void *addr; + + addr = kzalloc(size, flags | __GFP_NOWARN); + if (addr) + return addr; + + return vzalloc(size); +} +#endif + +#ifndef HAVE_IB_PORT_PHYS_STATE_LINK_UP +#define IB_PORT_PHYS_STATE_LINK_UP 5 +#endif + +#ifndef HAVE_CORE_MMAP_XA +#include +#include + +struct rdma_user_mmap_entry { + struct ib_ucontext *ucontext; + unsigned long start_pgoff; + size_t npages; +}; + +/* Return the offset (in bytes) the user should pass to libc's mmap() */ +static inline u64 +rdma_user_mmap_get_offset(const struct rdma_user_mmap_entry *entry) +{ + return (u64)entry->start_pgoff << PAGE_SHIFT; +} + +/* + * Backported kernels don't keep refcnt on entries, hence they should not + * be removed. 
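+ * The two helpers below are therefore intentional no-ops.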
+
+/*
+ * Backported kernels don't keep refcnt on entries, hence they should not
+ * be removed.
+ */
+static inline void
+rdma_user_mmap_entry_remove(struct rdma_user_mmap_entry *entry)
+{
+}
+
+static inline void rdma_user_mmap_entry_put(struct rdma_user_mmap_entry *entry)
+{
+}
+#endif
+
+#ifndef sizeof_field
+#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER))
+#endif
+
+#ifndef HAVE_BITFIELD_H
+#define __bf_shf(x) (__builtin_ffsll(x) - 1)
+
+#define FIELD_PREP(_mask, _val) \
+	({ \
+		((typeof(_mask))(_val) << __bf_shf(_mask)) & (_mask); \
+	})
+
+#define FIELD_GET(_mask, _reg) \
+	({ \
+		(typeof(_mask))(((_reg) & (_mask)) >> __bf_shf(_mask)); \
+	})
+#endif
+
+#ifndef HAVE_RDMA_NODE_UNSPECIFIED
+enum {
+	RDMA_NODE_UNSPECIFIED = 7,
+};
+#endif
+
+#ifndef HAVE_ATOMIC64_FETCH_INC
+static __always_inline s64
+atomic64_fetch_inc(atomic64_t *v)
+{
+	return atomic64_inc_return(v) - 1;
+}
+#endif
+
+#if !defined(HAVE_RDMA_UMEM_FOR_EACH_DMA_BLOCK) && defined(HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE)
+#include <rdma/ib_umem.h>
+
+static inline void __rdma_umem_block_iter_start(struct ib_block_iter *biter,
+						struct ib_umem *umem,
+						unsigned long pgsz)
+{
+	__rdma_block_iter_start(biter, umem->sg_head.sgl, umem->nmap, pgsz);
+}
+
+/**
+ * rdma_umem_for_each_dma_block - iterate over contiguous DMA blocks of the umem
+ * @umem: umem to iterate over
+ * @pgsz: Page size to split the list into
+ *
+ * pgsz must be <= PAGE_SIZE or computed by ib_umem_find_best_pgsz(). The
+ * returned DMA blocks will be aligned to pgsz and span the range:
+ * ALIGN_DOWN(umem->address, pgsz) to ALIGN(umem->address + umem->length, pgsz)
+ *
+ * Performs exactly ib_umem_num_dma_blocks() iterations.
+ */
+#define rdma_umem_for_each_dma_block(umem, biter, pgsz) \
+	for (__rdma_umem_block_iter_start(biter, umem, pgsz); \
+	     __rdma_block_iter_next(biter);)
+#endif
+
+#ifdef HAVE_U32_PORT
+typedef u32 port_t;
+#else
+typedef u8 port_t;
+#endif
+
+#if defined(HAVE_MR_DMABUF) && !defined(HAVE_IB_UMEM_DMABUF_PINNED)
+#include <linux/dma-buf.h>
+#include <linux/dma-resv.h>
+#include <rdma/ib_umem.h>
+
+static inline void
+ib_umem_dmabuf_unsupported_move_notify(struct dma_buf_attachment *attach)
+{
+	struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;
+
+	ibdev_warn_ratelimited(umem_dmabuf->umem.ibdev,
+			       "Invalidate callback should not be called when memory is pinned\n");
+}
+
+static struct dma_buf_attach_ops ib_umem_dmabuf_attach_pinned_ops = {
+	.allow_peer2peer = true,
+	.move_notify = ib_umem_dmabuf_unsupported_move_notify,
+};
+
+static inline
+struct ib_umem_dmabuf *ib_umem_dmabuf_get_pinned(struct ib_device *device,
+						 unsigned long offset,
+						 size_t size, int fd,
+						 int access)
+{
+	struct ib_umem_dmabuf *umem_dmabuf;
+	int err;
+
+	umem_dmabuf = ib_umem_dmabuf_get(device, offset, size, fd, access,
+					 &ib_umem_dmabuf_attach_pinned_ops);
+	if (IS_ERR(umem_dmabuf))
+		return umem_dmabuf;
+
+	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
+	err = dma_buf_pin(umem_dmabuf->attach);
+	if (err)
+		goto err_release;
+
+	err = ib_umem_dmabuf_map_pages(umem_dmabuf);
+	if (err)
+		goto err_unpin;
+	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
+
+	return umem_dmabuf;
+
+err_unpin:
+	dma_buf_unpin(umem_dmabuf->attach);
+err_release:
+	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
+	ib_umem_release(&umem_dmabuf->umem);
+	return ERR_PTR(err);
+}
+#endif /* !HAVE_IB_UMEM_DMABUF_PINNED */
+
+#endif /* _KCOMPAT_H_ */
diff --git a/drivers/amazon/net/efa/neuron_p2p.h b/drivers/amazon/net/efa/neuron_p2p.h
new file mode 100644
index 0000000000000..a1ce44003463f
--- /dev/null
+++ b/drivers/amazon/net/efa/neuron_p2p.h
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2021 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#ifndef __NEURON_P2P_H__
+#define __NEURON_P2P_H__
+
+struct neuron_p2p_page_info {
+	u64 physical_address;	// PAs that map to the VA (page aligned as defined in va_info)
+	u32 page_count;		// page count; each page is shift_page_size in size
+};
+
+struct neuron_p2p_va_info {
+	void *virtual_address;	// Virtual address for which the PAs need to be obtained
+	u64 size;		// The actual size of the memory pointed to by virtual_address
+	u32 shift_page_size;	// log2 of the page size
+	u32 device_index;	// Neuron Device index.
+	u32 entries;		// Number of page_info entries
+	struct neuron_p2p_page_info page_info[];
+};
+
+/** Given the virtual address and length, returns the physical addresses
+ *
+ * @param[in] virtual_address - Virtual address of device memory
+ * @param[in] length - Length of the memory
+ * @param[out] vainfo - Set of physical addresses
+ * @param[in] free_callback - Callback function to be called. This will be called with a lock held.
+ * @param[in] data - Data to be used for the callback
+ *
+ * @return 0 - Success.
+ */
+int neuron_p2p_register_va(u64 virtual_address, u64 length, struct neuron_p2p_va_info **vainfo, void (*free_callback) (void *data), void *data);
+
+/** Given the pa, release it from being used by the third-party device
+ *
+ * @param[in] vainfo - Set of physical addresses
+ *
+ * @return 0 - Success.
+ */
+int neuron_p2p_unregister_va(struct neuron_p2p_va_info *vainfo);
+
+#endif
diff --git a/drivers/amazon/net/efa/nv-p2p.h b/drivers/amazon/net/efa/nv-p2p.h
new file mode 100644
index 0000000000000..d74e024963d5a
--- /dev/null
+++ b/drivers/amazon/net/efa/nv-p2p.h
@@ -0,0 +1,439 @@
+/*
+ * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _NV_P2P_H_
+#define _NV_P2P_H_
+
+/*
+ * NVIDIA P2P Structure Versioning
+ *
+ * For the nvidia_p2p_*_t structures allocated by the NVIDIA driver, it will
+ * set the version field of the structure according to the definition used by
+ * the NVIDIA driver. The "major" field of the version is defined as the upper
+ * 16 bits, and the "minor" field of the version is defined as the lower 16
+ * bits. The version field will always be the first 4 bytes of the structure,
+ * and third-party drivers should check the value of this field in structures
+ * allocated by the NVIDIA driver to ensure runtime compatibility.
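+ *
+ * As a worked example (values purely illustrative): a structure whose
+ * version field reads 0x00010002 has major version 1 and minor version 2
+ * per the macros below, so a third-party driver built against version
+ * 0x00010001 of that structure still passes NVIDIA_P2P_VERSION_COMPATIBLE().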
+ *
+ * In general, version numbers will be incremented as follows:
+ * - When a backwards-compatible change is made to the structure layout, the
+ *   minor version for that structure will be incremented. Third-party drivers
+ *   built against an older minor version will continue to work with the newer
+ *   minor version used by the NVIDIA driver, without recompilation.
+ * - When a breaking change is made to the structure layout, the major version
+ *   will be incremented. Third-party drivers built against an older major
+ *   version require at least recompilation and potentially additional updates
+ *   to use the new API.
+ */
+#define NVIDIA_P2P_MAJOR_VERSION_MASK 0xffff0000
+#define NVIDIA_P2P_MINOR_VERSION_MASK 0x0000ffff
+
+#define NVIDIA_P2P_MAJOR_VERSION(v) \
+	(((v) & NVIDIA_P2P_MAJOR_VERSION_MASK) >> 16)
+
+#define NVIDIA_P2P_MINOR_VERSION(v) \
+	(((v) & NVIDIA_P2P_MINOR_VERSION_MASK))
+
+#define NVIDIA_P2P_MAJOR_VERSION_MATCHES(p, v) \
+	(NVIDIA_P2P_MAJOR_VERSION((p)->version) == NVIDIA_P2P_MAJOR_VERSION(v))
+
+#define NVIDIA_P2P_VERSION_COMPATIBLE(p, v) \
+	(NVIDIA_P2P_MAJOR_VERSION_MATCHES(p, v) && \
+	(NVIDIA_P2P_MINOR_VERSION((p)->version) >= (NVIDIA_P2P_MINOR_VERSION(v))))
+
+enum {
+	NVIDIA_P2P_ARCHITECTURE_TESLA = 0,
+	NVIDIA_P2P_ARCHITECTURE_FERMI,
+	NVIDIA_P2P_ARCHITECTURE_CURRENT = NVIDIA_P2P_ARCHITECTURE_FERMI
+};
+
+#define NVIDIA_P2P_PARAMS_VERSION 0x00010001
+
+enum {
+	NVIDIA_P2P_PARAMS_ADDRESS_INDEX_GPU = 0,
+	NVIDIA_P2P_PARAMS_ADDRESS_INDEX_THIRD_PARTY_DEVICE,
+	NVIDIA_P2P_PARAMS_ADDRESS_INDEX_MAX = \
+		NVIDIA_P2P_PARAMS_ADDRESS_INDEX_THIRD_PARTY_DEVICE
+};
+
+typedef
+struct nvidia_p2p_params {
+	u32 version;
+	u32 architecture;
+	union nvidia_p2p_mailbox_addresses {
+		struct {
+			u64 wmb_addr;
+			u64 wmb_data;
+			u64 rreq_addr;
+			u64 rcomp_addr;
+			u64 reserved[2];
+		} fermi;
+	} addresses[NVIDIA_P2P_PARAMS_ADDRESS_INDEX_MAX+1];
+} nvidia_p2p_params_t;
+
+/*
+ * @brief
+ *   Initializes a third-party P2P mapping between an NVIDIA
+ *   GPU and a third-party device.
+ *
+ * @param[in] p2p_token
+ *   A token that uniquely identifies the P2P mapping.
+ * @param[in,out] params
+ *   A pointer to a structure with P2P mapping parameters.
+ * @param[in] destroy_callback
+ *   A pointer to the function to be invoked when the P2P mapping
+ *   is destroyed implicitly.
+ * @param[in] data
+ *   An opaque pointer to private data to be passed to the
+ *   callback function.
+ *
+ * @return
+ *   0 upon successful completion.
+ *   -EINVAL if an invalid argument was supplied.
+ *   -ENOTSUPP if the requested configuration is not supported.
+ *   -ENOMEM if the driver failed to allocate memory.
+ *   -EBUSY if the mapping has already been initialized.
+ *   -EIO if an unknown error occurred.
+ */
+int nvidia_p2p_init_mapping(u64 p2p_token,
+			    struct nvidia_p2p_params *params,
+			    void (*destroy_callback)(void *data),
+			    void *data);
+
+/*
+ * @brief
+ *   Tear down a previously initialized third-party P2P mapping.
+ *
+ * @param[in] p2p_token
+ *   A token that uniquely identifies the mapping.
+ *
+ * @return
+ *   0 upon successful completion.
+ *   -EINVAL if an invalid argument was supplied.
+ *   -ENOTSUPP if the requested configuration is not supported.
+ *   -ENOMEM if the driver failed to allocate memory.
+ */ +int nvidia_p2p_destroy_mapping(u64 p2p_token); + +enum nvidia_p2p_page_size_type { + NVIDIA_P2P_PAGE_SIZE_4KB = 0, + NVIDIA_P2P_PAGE_SIZE_64KB, + NVIDIA_P2P_PAGE_SIZE_128KB, + NVIDIA_P2P_PAGE_SIZE_COUNT +}; + +typedef +struct nvidia_p2p_page { + u64 physical_address; + union nvidia_p2p_request_registers { + struct { + u32 wreqmb_h; + u32 rreqmb_h; + u32 rreqmb_0; + u32 reserved[3]; + } fermi; + } registers; +} nvidia_p2p_page_t; + +#define NVIDIA_P2P_PAGE_TABLE_VERSION 0x00010002 + +#define NVIDIA_P2P_PAGE_TABLE_VERSION_COMPATIBLE(p) \ + NVIDIA_P2P_VERSION_COMPATIBLE(p, NVIDIA_P2P_PAGE_TABLE_VERSION) + +typedef +struct nvidia_p2p_page_table { + u32 version; + u32 page_size; /* enum nvidia_p2p_page_size_type */ + struct nvidia_p2p_page **pages; + u32 entries; + u8 *gpu_uuid; +} nvidia_p2p_page_table_t; + +/* + * @brief + * Make the pages underlying a range of GPU virtual memory + * accessible to a third-party device. + * + * This API only supports pinned, GPU-resident memory, such as that provided + * by cudaMalloc(). + * + * This API may sleep. + * + * @param[in] p2p_token + * A token that uniquely identifies the P2P mapping. + * @param[in] va_space + * A GPU virtual address space qualifier. + * @param[in] virtual_address + * The start address in the specified virtual address space. + * Address must be aligned to the 64KB boundary. + * @param[in] length + * The length of the requested P2P mapping. + * Length must be a multiple of 64KB. + * @param[out] page_table + * A pointer to an array of structures with P2P PTEs. + * @param[in] free_callback + * A non-NULL pointer to the function to be invoked when the pages + * underlying the virtual address range are freed + * implicitly. Must be non NULL. + * @param[in] data + * A non-NULL opaque pointer to private data to be passed to the + * callback function. + * + * @return + * 0 upon successful completion. + * -EINVAL if an invalid argument was supplied. + * -ENOTSUPP if the requested operation is not supported. + * -ENOMEM if the driver failed to allocate memory or if + * insufficient resources were available to complete the operation. + * -EIO if an unknown error occurred. + */ +int nvidia_p2p_get_pages(u64 p2p_token, u32 va_space, + u64 virtual_address, + u64 length, + struct nvidia_p2p_page_table **page_table, + void (*free_callback)(void *data), + void *data); + +#define NVIDIA_P2P_DMA_MAPPING_VERSION 0x00020003 + +#define NVIDIA_P2P_DMA_MAPPING_VERSION_COMPATIBLE(p) \ + NVIDIA_P2P_VERSION_COMPATIBLE(p, NVIDIA_P2P_DMA_MAPPING_VERSION) + +struct pci_dev; + +typedef +struct nvidia_p2p_dma_mapping { + u32 version; + enum nvidia_p2p_page_size_type page_size_type; + u32 entries; + u64 *dma_addresses; + void *private; + struct pci_dev *pci_dev; +} nvidia_p2p_dma_mapping_t; + +/* + * @brief + * Make the physical pages retrieved using nvidia_p2p_get_pages accessible to + * a third-party device. + * + * @param[in] peer + * The struct pci_dev * of the peer device that needs to DMA to/from the + * mapping. + * @param[in] page_table + * The page table outlining the physical pages underlying the mapping, as + * retrieved with nvidia_p2p_get_pages(). + * @param[out] dma_mapping + * The DMA mapping containing the DMA addresses to use on the third-party + * device. + * + * @return + * 0 upon successful completion. + * -EINVAL if an invalid argument was supplied. + * -ENOTSUPP if the requested operation is not supported. + * -EIO if an unknown error occurred. 
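+ *
+ * A minimal usage sketch (hypothetical; "peer", "va", "len", "cb" and "priv"
+ * are placeholders, and error handling is omitted):
+ *
+ *   struct nvidia_p2p_page_table *pt;
+ *   struct nvidia_p2p_dma_mapping *map;
+ *
+ *   nvidia_p2p_get_pages(0, 0, va, len, &pt, cb, priv);
+ *   nvidia_p2p_dma_map_pages(peer, pt, &map);
+ *   map->dma_addresses[0 .. map->entries - 1] are then usable by the peer.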
+ */ +int nvidia_p2p_dma_map_pages(struct pci_dev *peer, + struct nvidia_p2p_page_table *page_table, + struct nvidia_p2p_dma_mapping **dma_mapping); + +/* + * @brief + * Unmap the physical pages previously mapped to the third-party device by + * nvidia_p2p_dma_map_pages(). + * + * @param[in] peer + * The struct pci_dev * of the peer device that the DMA mapping belongs to. + * @param[in] page_table + * The page table backing the DMA mapping to be unmapped. + * @param[in] dma_mapping + * The DMA mapping containing the DMA addresses used by the third-party + * device, as retrieved with nvidia_p2p_dma_map_pages(). After this call + * returns, neither this struct nor the addresses contained within will be + * valid for use by the third-party device. + * + * @return + * 0 upon successful completion. + * -EINVAL if an invalid argument was supplied. + * -EIO if an unknown error occurred. + */ +int nvidia_p2p_dma_unmap_pages(struct pci_dev *peer, + struct nvidia_p2p_page_table *page_table, + struct nvidia_p2p_dma_mapping *dma_mapping); + +/* + * @brief + * Release a set of pages previously made accessible to + * a third-party device. + * + * @param[in] p2p_token + * A token that uniquely identifies the P2P mapping. + * @param[in] va_space + * A GPU virtual address space qualifier. + * @param[in] virtual_address + * The start address in the specified virtual address space. + * @param[in] page_table + * A pointer to the array of structures with P2P PTEs. + * + * @return + * 0 upon successful completion. + * -EINVAL if an invalid argument was supplied. + * -EIO if an unknown error occurred. + */ +int nvidia_p2p_put_pages(u64 p2p_token, u32 va_space, + u64 virtual_address, + struct nvidia_p2p_page_table *page_table); + +/* + * @brief + * Free a third-party P2P page table. (This function is a no-op.) + * + * @param[in] page_table + * A pointer to the array of structures with P2P PTEs. + * + * @return + * 0 upon successful completion. + * -EINVAL if an invalid argument was supplied. + */ +int nvidia_p2p_free_page_table(struct nvidia_p2p_page_table *page_table); + +/* + * @brief + * Free a third-party P2P DMA mapping. (This function is a no-op.) + * + * @param[in] dma_mapping + * A pointer to the DMA mapping structure. + * + * @return + * 0 upon successful completion. + * -EINVAL if an invalid argument was supplied. + */ +int nvidia_p2p_free_dma_mapping(struct nvidia_p2p_dma_mapping *dma_mapping); + +#define NVIDIA_P2P_RSYNC_DRIVER_VERSION 0x00010001 + +#define NVIDIA_P2P_RSYNC_DRIVER_VERSION_COMPATIBLE(p) \ + NVIDIA_P2P_VERSION_COMPATIBLE(p, NVIDIA_P2P_RSYNC_DRIVER_VERSION) + +typedef +struct nvidia_p2p_rsync_driver { + u32 version; + int (*get_relaxed_ordering_mode)(int *mode, void *data); + void (*put_relaxed_ordering_mode)(int mode, void *data); + void (*wait_for_rsync)(struct pci_dev *gpu, void *data); +} nvidia_p2p_rsync_driver_t; + +/* + * @brief + * Registers the rsync driver. + * + * @param[in] driver + * A pointer to the rsync driver structure. The NVIDIA driver would use, + * + * get_relaxed_ordering_mode to obtain a reference to the current relaxed + * ordering mode (treated as a boolean) from the rsync driver. + * + * put_relaxed_ordering_mode to release a reference to the current relaxed + * ordering mode back to the rsync driver. The NVIDIA driver will call this + * function once for each successful call to get_relaxed_ordering_mode, and + * the relaxed ordering mode must not change until the last reference is + * released. 
+ * + * wait_for_rsync to call into the rsync module to issue RSYNC. This callback + * can't sleep or re-schedule as it may arrive under spinlocks. + * @param[in] data + * A pointer to the rsync driver's private data. + * + * @Returns + * 0 upon successful completion. + * -EINVAL parameters are incorrect. + * -EBUSY if a module is already registered or GPU devices are in use. + */ +int nvidia_p2p_register_rsync_driver(nvidia_p2p_rsync_driver_t *driver, + void *data); + +/* + * @brief + * Unregisters the rsync driver. + * + * @param[in] driver + * A pointer to the rsync driver structure. + * @param[in] data + * A pointer to the rsync driver's private data. + */ +void nvidia_p2p_unregister_rsync_driver(nvidia_p2p_rsync_driver_t *driver, + void *data); + +#define NVIDIA_P2P_RSYNC_REG_INFO_VERSION 0x00020001 + +#define NVIDIA_P2P_RSYNC_REG_INFO_VERSION_COMPATIBLE(p) \ + NVIDIA_P2P_VERSION_COMPATIBLE(p, NVIDIA_P2P_RSYNC_REG_INFO_VERSION) + +typedef struct nvidia_p2p_rsync_reg { + void *ptr; + size_t size; + struct pci_dev *ibmnpu; + struct pci_dev *gpu; + u32 cluster_id; + u32 socket_id; +} nvidia_p2p_rsync_reg_t; + +typedef struct nvidia_p2p_rsync_reg_info { + u32 version; + nvidia_p2p_rsync_reg_t *regs; + size_t entries; +} nvidia_p2p_rsync_reg_info_t; + +/* + * @brief + * Gets rsync (GEN-ID) register information associated with the supported + * NPUs. + * + * The caller would use the returned information {GPU device, NPU device, + * socket-id, cluster-id} to pick the optimal generation registers to issue + * RSYNC (NVLink HW flush). + * + * The interface allocates structures to return the information, hence + * nvidia_p2p_put_rsync_registers() must be called to free the structures. + * + * Note, cluster-id is hardcoded to zero as early system configurations would + * only support cluster mode i.e. all devices would share the same cluster-id + * (0). In the future, appropriate kernel support would be needed to query + * cluster-ids. + * + * @param[out] reg_info + * A pointer to the rsync reg info structure. + * + * @Returns + * 0 Upon successful completion. Otherwise, returns negative value. + */ +int nvidia_p2p_get_rsync_registers(nvidia_p2p_rsync_reg_info_t **reg_info); + +/* + * @brief + * Frees the structures allocated by nvidia_p2p_get_rsync_registers(). + * + * @param[in] reg_info + * A pointer to the rsync reg info structure. + */ +void nvidia_p2p_put_rsync_registers(nvidia_p2p_rsync_reg_info_t *reg_info); + +#endif /* _NV_P2P_H_ */ diff --git a/drivers/amazon/net/ena/Makefile b/drivers/amazon/net/ena/Makefile new file mode 100644 index 0000000000000..b61366782d8d6 --- /dev/null +++ b/drivers/amazon/net/ena/Makefile @@ -0,0 +1,20 @@ +# +# Makefile for the Elastic Network Adapter (ENA) device drivers. +# ENA Source is: https://github.com/amzn/amzn-drivers. +# Current ENA source is based on ena_linux_2.4.0 tag. +# + +obj-$(CONFIG_AMAZON_ENA_ETHERNET) += ena.o + +ena-y := ena_netdev.o ena_ethtool.o ena_lpc.o ena_phc.o ena_xdp.o dim.o \ + ena_devlink.o net_dim.o ena_com.o ena_eth_com.o + +ena-$(CONFIG_SYSFS) += ena_sysfs.o + +ifdef TEST_AF_XDP + ccflags-y += -DENA_TEST_AF_XDP +endif + +ifdef ENA_PHC_INCLUDE + ccflags-y += -DENA_PHC_INCLUDE +endif diff --git a/drivers/amazon/net/ena/dim.c b/drivers/amazon/net/ena/dim.c new file mode 100644 index 0000000000000..1b200be4b3709 --- /dev/null +++ b/drivers/amazon/net/ena/dim.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2019, Mellanox Technologies inc. All rights reserved. 
+ */
+
+#include "dim.h"
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 3, 0)
+
+bool dim_on_top(struct dim *dim)
+{
+	switch (dim->tune_state) {
+	case DIM_PARKING_ON_TOP:
+	case DIM_PARKING_TIRED:
+		return true;
+	case DIM_GOING_RIGHT:
+		return (dim->steps_left > 1) && (dim->steps_right == 1);
+	default: /* DIM_GOING_LEFT */
+		return (dim->steps_right > 1) && (dim->steps_left == 1);
+	}
+}
+
+void dim_turn(struct dim *dim)
+{
+	switch (dim->tune_state) {
+	case DIM_PARKING_ON_TOP:
+	case DIM_PARKING_TIRED:
+		break;
+	case DIM_GOING_RIGHT:
+		dim->tune_state = DIM_GOING_LEFT;
+		dim->steps_left = 0;
+		break;
+	case DIM_GOING_LEFT:
+		dim->tune_state = DIM_GOING_RIGHT;
+		dim->steps_right = 0;
+		break;
+	}
+}
+
+void dim_park_on_top(struct dim *dim)
+{
+	dim->steps_right = 0;
+	dim->steps_left = 0;
+	dim->tired = 0;
+	dim->tune_state = DIM_PARKING_ON_TOP;
+}
+
+void dim_park_tired(struct dim *dim)
+{
+	dim->steps_right = 0;
+	dim->steps_left = 0;
+	dim->tune_state = DIM_PARKING_TIRED;
+}
+
+void dim_calc_stats(struct dim_sample *start, struct dim_sample *end,
+		    struct dim_stats *curr_stats)
+{
+	/* u32 holds up to 71 minutes, should be enough */
+	u32 delta_us = ktime_us_delta(end->time, start->time);
+	u32 npkts = BIT_GAP(BITS_PER_TYPE(u32), end->pkt_ctr, start->pkt_ctr);
+	u32 nbytes = BIT_GAP(BITS_PER_TYPE(u32), end->byte_ctr,
+			     start->byte_ctr);
+	u32 ncomps = BIT_GAP(BITS_PER_TYPE(u32), end->comp_ctr,
+			     start->comp_ctr);
+
+	if (!delta_us)
+		return;
+
+	curr_stats->ppms = DIV_ROUND_UP(npkts * USEC_PER_MSEC, delta_us);
+	curr_stats->bpms = DIV_ROUND_UP(nbytes * USEC_PER_MSEC, delta_us);
+	curr_stats->epms = DIV_ROUND_UP(DIM_NEVENTS * USEC_PER_MSEC,
+					delta_us);
+	curr_stats->cpms = DIV_ROUND_UP(ncomps * USEC_PER_MSEC, delta_us);
+	if (curr_stats->epms != 0)
+		curr_stats->cpe_ratio = DIV_ROUND_DOWN_ULL(
+			curr_stats->cpms * 100, curr_stats->epms);
+	else
+		curr_stats->cpe_ratio = 0;
+}
+
+#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(5, 3, 0) */
diff --git a/drivers/amazon/net/ena/dim.h b/drivers/amazon/net/ena/dim.h
new file mode 100644
index 0000000000000..633c2473e73ad
--- /dev/null
+++ b/drivers/amazon/net/ena/dim.h
@@ -0,0 +1,338 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#ifndef DIM_H
+#define DIM_H
+
+#include <linux/module.h>
+#include "kcompat.h"
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 3, 0)
+
+/**
+ * Number of events between DIM iterations.
+ * Causes a moderation of the algorithm run.
+ */
+#define DIM_NEVENTS 64
+
+/**
+ * Checks whether a difference between two values justifies taking an action.
+ * We consider a 10% difference as significant.
+ */
+#define IS_SIGNIFICANT_DIFF(val, ref) \
+	(((100UL * abs((val) - (ref))) / (ref)) > 10)
+
+/**
+ * Calculate the gap between two values.
+ * Take wrap-around and variable size into consideration.
+ */
+#define BIT_GAP(bits, end, start) ((((end) - (start)) + BIT_ULL(bits)) \
+		& (BIT_ULL(bits) - 1))
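+
+/*
+ * Worked example (illustrative): for a 16-bit counter,
+ * BIT_GAP(16, 2, 0xfffe) == ((2 - 0xfffe) + 0x10000) & 0xffff == 4,
+ * so a counter that wrapped between two samples still yields the true delta.
+ */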
+
+/**
+ * Structure for CQ moderation values.
+ * Used for communications between DIM and its consumer.
+ *
+ * @usec: CQ timer suggestion (by DIM)
+ * @pkts: CQ packet counter suggestion (by DIM)
+ * @comps: CQ completion counter suggestion (by DIM)
+ * @cq_period_mode: CQ period count mode (from CQE/EQE)
+ */
+struct dim_cq_moder {
+	u16 usec;
+	u16 pkts;
+	u16 comps;
+	u8 cq_period_mode;
+};
+
+/**
+ * Structure for DIM sample data.
+ * Used for communications between DIM and its consumer.
+ *
+ * @time: Sample timestamp
+ * @pkt_ctr: Number of packets
+ * @byte_ctr: Number of bytes
+ * @event_ctr: Number of events
+ * @comp_ctr: Number of completions
+ */
+struct dim_sample {
+	ktime_t time;
+	u32 pkt_ctr;
+	u32 byte_ctr;
+	u16 event_ctr;
+	u32 comp_ctr;
+};
+
+/**
+ * Structure for DIM stats.
+ * Used for holding current measured rates.
+ *
+ * @ppms: Packets per msec
+ * @bpms: Bytes per msec
+ * @epms: Events per msec
+ * @cpms: Completions per msec
+ * @cpe_ratio: Ratio of completions to events
+ */
+struct dim_stats {
+	int ppms; /* packets per msec */
+	int bpms; /* bytes per msec */
+	int epms; /* events per msec */
+	int cpms; /* completions per msec */
+	int cpe_ratio; /* ratio of completions to events */
+};
+
+/**
+ * Main structure for dynamic interrupt moderation (DIM).
+ * Used for holding all information about a specific DIM instance.
+ *
+ * @state: Algorithm state (see below)
+ * @prev_stats: Measured rates from previous iteration (for comparison)
+ * @start_sample: Sampled data at start of current iteration
+ * @measuring_sample: Sampled data collected during the current iteration
+ * @work: Work to perform on action required
+ * @priv: A pointer to the struct that points to dim
+ * @profile_ix: Current moderation profile
+ * @mode: CQ period count mode
+ * @tune_state: Algorithm tuning state (see below)
+ * @steps_right: Number of steps taken towards higher moderation
+ * @steps_left: Number of steps taken towards lower moderation
+ * @tired: Parking depth counter
+ */
+struct dim {
+	u8 state;
+	struct dim_stats prev_stats;
+	struct dim_sample start_sample;
+	struct dim_sample measuring_sample;
+	struct work_struct work;
+	void *priv;
+	u8 profile_ix;
+	u8 mode;
+	u8 tune_state;
+	u8 steps_right;
+	u8 steps_left;
+	u8 tired;
+};
+
+/**
+ * enum dim_cq_period_mode
+ *
+ * These are the modes for CQ period count.
+ *
+ * @DIM_CQ_PERIOD_MODE_START_FROM_EQE: Start counting from EQE
+ * @DIM_CQ_PERIOD_MODE_START_FROM_CQE: Start counting from CQE (implies timer reset)
+ * @DIM_CQ_PERIOD_NUM_MODES: Number of modes
+ */
+enum {
+	DIM_CQ_PERIOD_MODE_START_FROM_EQE = 0x0,
+	DIM_CQ_PERIOD_MODE_START_FROM_CQE = 0x1,
+	DIM_CQ_PERIOD_NUM_MODES
+};
+
+/**
+ * enum dim_state
+ *
+ * These are the DIM algorithm states.
+ * These will determine if the algorithm is in a valid state to start an iteration.
+ *
+ * @DIM_START_MEASURE: This is the first iteration (also after applying a new profile)
+ * @DIM_MEASURE_IN_PROGRESS: Algorithm is already in progress - check if it
+ * needs to perform an action
+ * @DIM_APPLY_NEW_PROFILE: DIM consumer is currently applying a profile - no need to measure
+ */
+enum {
+	DIM_START_MEASURE,
+	DIM_MEASURE_IN_PROGRESS,
+	DIM_APPLY_NEW_PROFILE,
+};
+
+/**
+ * enum dim_tune_state
+ *
+ * These are the DIM algorithm tune states.
+ * These will determine which action the algorithm should perform.
+ *
+ * @DIM_PARKING_ON_TOP: Algorithm found a local top point - exit on significant difference
+ * @DIM_PARKING_TIRED: Algorithm found a deep top point - don't exit if tired > 0
+ * @DIM_GOING_RIGHT: Algorithm is currently trying higher moderation levels
+ * @DIM_GOING_LEFT: Algorithm is currently trying lower moderation levels
+ */
+enum {
+	DIM_PARKING_ON_TOP,
+	DIM_PARKING_TIRED,
+	DIM_GOING_RIGHT,
+	DIM_GOING_LEFT,
+};
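+
+/*
+ * For illustration: while in DIM_GOING_RIGHT, iterations that measure better
+ * performance keep increasing steps_right; a worse measurement leads to
+ * dim_turn(), which flips the direction and zeroes the opposite step counter
+ * (see dim.c above).
+ */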
+
+/**
+ * enum dim_stats_state
+ *
+ * These are the DIM algorithm statistics states.
+ * These will determine the verdict of the current iteration.
+ *
+ * @DIM_STATS_WORSE: Current iteration shows worse performance than before
+ * @DIM_STATS_SAME: Current iteration shows the same performance as before
+ * @DIM_STATS_BETTER: Current iteration shows better performance than before
+ */
+enum {
+	DIM_STATS_WORSE,
+	DIM_STATS_SAME,
+	DIM_STATS_BETTER,
+};
+
+/**
+ * enum dim_step_result
+ *
+ * These are the DIM algorithm step results.
+ * These describe the result of a step.
+ *
+ * @DIM_STEPPED: Performed a regular step
+ * @DIM_TOO_TIRED: Same kind of step was done multiple times - should go to
+ * tired parking
+ * @DIM_ON_EDGE: Stepped to the most left/right profile
+ */
+enum {
+	DIM_STEPPED,
+	DIM_TOO_TIRED,
+	DIM_ON_EDGE,
+};
+
+/**
+ * dim_on_top - check if current state is a good place to stop (top location)
+ * @dim: DIM context
+ *
+ * Check if current profile is a good place to park at.
+ * This will result in reducing the DIM checks frequency, as we assume we
+ * probably shouldn't change profiles unless the traffic pattern changes.
+ */
+bool dim_on_top(struct dim *dim);
+
+/**
+ * dim_turn - change profile altering direction
+ * @dim: DIM context
+ *
+ * Go left if we were going right and vice-versa.
+ * Do nothing if currently parking.
+ */
+void dim_turn(struct dim *dim);
+
+/**
+ * dim_park_on_top - enter a parking state on a top location
+ * @dim: DIM context
+ *
+ * Enter parking state.
+ * Clear all movement history.
+ */
+void dim_park_on_top(struct dim *dim);
+
+/**
+ * dim_park_tired - enter a tired parking state
+ * @dim: DIM context
+ *
+ * Enter parking state.
+ * Clear all movement history and cause DIM checks frequency to reduce.
+ */
+void dim_park_tired(struct dim *dim);
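+
+/*
+ * Worked example (illustrative numbers): 1000 packets observed over a
+ * 2000 us window give ppms = DIV_ROUND_UP(1000 * 1000, 2000) = 500; with
+ * epms != 0, cpe_ratio = cpms * 100 / epms, i.e. a percentage.
+ */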
+
+/**
+ * dim_calc_stats - calculate the difference between two samples
+ * @start: start sample
+ * @end: end sample
+ * @curr_stats: delta between samples
+ *
+ * Calculate the delta between two samples (in data rates).
+ * Takes into consideration counter wrap-around.
+ */
+void dim_calc_stats(struct dim_sample *start, struct dim_sample *end,
+		    struct dim_stats *curr_stats);
+
+/**
+ * dim_update_sample - set a sample's fields with given values
+ * @event_ctr: number of events to set
+ * @packets: number of packets to set
+ * @bytes: number of bytes to set
+ * @s: DIM sample
+ */
+static inline void
+dim_update_sample(u16 event_ctr, u64 packets, u64 bytes, struct dim_sample *s)
+{
+	s->time = ktime_get();
+	s->pkt_ctr = packets;
+	s->byte_ctr = bytes;
+	s->event_ctr = event_ctr;
+}
+
+/**
+ * dim_update_sample_with_comps - set a sample's fields with given
+ * values including the completion parameter
+ * @event_ctr: number of events to set
+ * @packets: number of packets to set
+ * @bytes: number of bytes to set
+ * @comps: number of completions to set
+ * @s: DIM sample
+ */
+static inline void
+dim_update_sample_with_comps(u16 event_ctr, u64 packets, u64 bytes, u64 comps,
+			     struct dim_sample *s)
+{
+	dim_update_sample(event_ctr, packets, bytes, s);
+	s->comp_ctr = comps;
+}
+
+/* Net DIM */
+
+/**
+ * net_dim_get_rx_moderation - provide a CQ moderation object for the given RX profile
+ * @cq_period_mode: CQ period mode
+ * @ix: Profile index
+ */
+struct dim_cq_moder net_dim_get_rx_moderation(u8 cq_period_mode, int ix);
+
+/**
+ * net_dim_get_def_rx_moderation - provide the default RX moderation
+ * @cq_period_mode: CQ period mode
+ */
+struct dim_cq_moder net_dim_get_def_rx_moderation(u8 cq_period_mode);
+
+/**
+ * net_dim_get_tx_moderation - provide a CQ moderation object for the given TX profile
+ * @cq_period_mode: CQ period mode
+ * @ix: Profile index
+ */
+struct dim_cq_moder net_dim_get_tx_moderation(u8 cq_period_mode, int ix);
+
+/**
+ * net_dim_get_def_tx_moderation - provide the default TX moderation
+ * @cq_period_mode: CQ period mode
+ */
+struct dim_cq_moder net_dim_get_def_tx_moderation(u8 cq_period_mode);
+
+/**
+ * net_dim - main DIM algorithm entry point
+ * @dim: DIM instance information
+ * @end_sample: Current data measurement
+ *
+ * Called by the consumer.
+ * This is the main logic of the algorithm, where data is processed in order
+ * to decide on the next required action.
+ */
+void net_dim(struct dim *dim, struct dim_sample end_sample);
+
+/* RDMA DIM */
+
+/*
+ * RDMA DIM profile:
+ * profile size must be of RDMA_DIM_PARAMS_NUM_PROFILES.
+ */
+#define RDMA_DIM_PARAMS_NUM_PROFILES 9
+#define RDMA_DIM_START_PROFILE 0
+
+/**
+ * rdma_dim - Runs the adaptive moderation.
+ * @dim: The moderation struct.
+ * @completions: The number of completions collected in this round.
+ *
+ * Each call to rdma_dim takes the latest amount of completions that
+ * have been collected and counts them as a new event.
+ * Once enough events have been collected the algorithm decides a new
+ * moderation level.
+ */
+void rdma_dim(struct dim *dim, u64 completions);
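+
+/*
+ * A typical consumer flow, sketched here for illustration only (the names
+ * "rx_dim", "events", "rx_packets" and "rx_bytes" are placeholders, not part
+ * of this API):
+ *
+ *	struct dim_sample sample;
+ *
+ *	dim_update_sample(events, rx_packets, rx_bytes, &sample);
+ *	net_dim(&rx_dim, sample);
+ *
+ * net_dim() may schedule rx_dim.work; the consumer's work handler is then
+ * expected to apply the profile indicated by dim->profile_ix to the device
+ * and set dim->state back to DIM_START_MEASURE.
+ */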
+
+#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(5, 3, 0) */
+
+#endif /* DIM_H */
diff --git a/drivers/amazon/net/ena/ena_admin_defs.h b/drivers/amazon/net/ena/ena_admin_defs.h
new file mode 100644
index 0000000000000..a52f588445039
--- /dev/null
+++ b/drivers/amazon/net/ena/ena_admin_defs.h
@@ -0,0 +1,1324 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+#ifndef _ENA_ADMIN_H_
+#define _ENA_ADMIN_H_
+
+#define ENA_ADMIN_EXTRA_PROPERTIES_STRING_LEN 32
+#define ENA_ADMIN_EXTRA_PROPERTIES_COUNT 32
+
+#define ENA_ADMIN_RSS_KEY_PARTS 10
+
+enum ena_admin_aq_opcode {
+	ENA_ADMIN_CREATE_SQ = 1,
+	ENA_ADMIN_DESTROY_SQ = 2,
+	ENA_ADMIN_CREATE_CQ = 3,
+	ENA_ADMIN_DESTROY_CQ = 4,
+	ENA_ADMIN_GET_FEATURE = 8,
+	ENA_ADMIN_SET_FEATURE = 9,
+	ENA_ADMIN_GET_STATS = 11,
+};
+
+enum ena_admin_aq_completion_status {
+	ENA_ADMIN_SUCCESS = 0,
+	ENA_ADMIN_RESOURCE_ALLOCATION_FAILURE = 1,
+	ENA_ADMIN_BAD_OPCODE = 2,
+	ENA_ADMIN_UNSUPPORTED_OPCODE = 3,
+	ENA_ADMIN_MALFORMED_REQUEST = 4,
+	/* Additional status is provided in ACQ entry extended_status */
+	ENA_ADMIN_ILLEGAL_PARAMETER = 5,
+	ENA_ADMIN_UNKNOWN_ERROR = 6,
+	ENA_ADMIN_RESOURCE_BUSY = 7,
+};
+
+/* subcommands for the set/get feature admin commands */
+enum ena_admin_aq_feature_id {
+	ENA_ADMIN_DEVICE_ATTRIBUTES = 1,
+	ENA_ADMIN_MAX_QUEUES_NUM = 2,
+	ENA_ADMIN_HW_HINTS = 3,
+	ENA_ADMIN_LLQ = 4,
+	ENA_ADMIN_EXTRA_PROPERTIES_STRINGS = 5,
+	ENA_ADMIN_EXTRA_PROPERTIES_FLAGS = 6,
+	ENA_ADMIN_MAX_QUEUES_EXT = 7,
+	ENA_ADMIN_RSS_HASH_FUNCTION = 10,
+	ENA_ADMIN_STATELESS_OFFLOAD_CONFIG = 11,
+	ENA_ADMIN_RSS_INDIRECTION_TABLE_CONFIG = 12,
+	ENA_ADMIN_MTU = 14,
+	ENA_ADMIN_RSS_HASH_INPUT = 18,
+	ENA_ADMIN_INTERRUPT_MODERATION = 20,
+	ENA_ADMIN_AENQ_CONFIG = 26,
+	ENA_ADMIN_LINK_CONFIG = 27,
+	ENA_ADMIN_HOST_ATTR_CONFIG = 28,
+	ENA_ADMIN_PHC_CONFIG = 29,
+	ENA_ADMIN_FEATURES_OPCODE_NUM = 32,
+};
+
+/* device capabilities */
+enum ena_admin_aq_caps_id {
+	ENA_ADMIN_ENI_STATS = 0,
+	/* ENA SRD customer metrics */
+	ENA_ADMIN_ENA_SRD_INFO = 1,
+};
+
+enum ena_admin_placement_policy_type {
+	/* descriptors and headers are in host memory */
+	ENA_ADMIN_PLACEMENT_POLICY_HOST = 1,
+	/* descriptors and headers are in device memory (a.k.a Low Latency
+	 * Queue)
+	 */
+	ENA_ADMIN_PLACEMENT_POLICY_DEV = 3,
+};
+
+enum ena_admin_link_types {
+	ENA_ADMIN_LINK_SPEED_1G = 0x1,
+	ENA_ADMIN_LINK_SPEED_2_HALF_G = 0x2,
+	ENA_ADMIN_LINK_SPEED_5G = 0x4,
+	ENA_ADMIN_LINK_SPEED_10G = 0x8,
+	ENA_ADMIN_LINK_SPEED_25G = 0x10,
+	ENA_ADMIN_LINK_SPEED_40G = 0x20,
+	ENA_ADMIN_LINK_SPEED_50G = 0x40,
+	ENA_ADMIN_LINK_SPEED_100G = 0x80,
+	ENA_ADMIN_LINK_SPEED_200G = 0x100,
+	ENA_ADMIN_LINK_SPEED_400G = 0x200,
+};
+
+enum ena_admin_completion_policy_type {
+	/* completion queue entry for each sq descriptor */
+	ENA_ADMIN_COMPLETION_POLICY_DESC = 0,
+	/* completion queue entry upon request in sq descriptor */
+	ENA_ADMIN_COMPLETION_POLICY_DESC_ON_DEMAND = 1,
+	/* current queue head pointer is updated in OS memory upon sq
+	 * descriptor request
+	 */
+	ENA_ADMIN_COMPLETION_POLICY_HEAD_ON_DEMAND = 2,
+	/* current queue head pointer is updated in OS memory for each sq
+	 * descriptor
+	 */
+	ENA_ADMIN_COMPLETION_POLICY_HEAD = 3,
+};
+
+/* basic stats return ena_admin_basic_stats while extended stats return a
+ * buffer (string format) with additional statistics per queue and per
+ * device id
+ */
+enum ena_admin_get_stats_type {
+	ENA_ADMIN_GET_STATS_TYPE_BASIC = 0,
+	ENA_ADMIN_GET_STATS_TYPE_EXTENDED = 1,
+	/* extra HW stats for specific network interface */
+	ENA_ADMIN_GET_STATS_TYPE_ENI = 2,
+	/* extra HW stats for ENA SRD */
+	ENA_ADMIN_GET_STATS_TYPE_ENA_SRD = 3,
+};
+
+enum ena_admin_get_stats_scope {
+	ENA_ADMIN_SPECIFIC_QUEUE = 0,
+	ENA_ADMIN_ETH_TRAFFIC = 1,
+};
+
+enum ena_admin_get_phc_type {
+	ENA_ADMIN_PHC_TYPE_READLESS = 0,
+};
+
+/* ENA SRD configuration for ENI */
+enum ena_admin_ena_srd_flags {
+	/* Feature enabled */
ENA_ADMIN_ENA_SRD_ENABLED = BIT(0), + /* UDP support enabled */ + ENA_ADMIN_ENA_SRD_UDP_ENABLED = BIT(1), + /* Bypass Rx UDP ordering */ + ENA_ADMIN_ENA_SRD_UDP_ORDERING_BYPASS_ENABLED = BIT(2), +}; + +struct ena_admin_aq_common_desc { + /* 11:0 : command_id + * 15:12 : reserved12 + */ + u16 command_id; + + /* as appears in ena_admin_aq_opcode */ + u8 opcode; + + /* 0 : phase + * 1 : ctrl_data - control buffer address valid + * 2 : ctrl_data_indirect - control buffer address + * points to list of pages with addresses of control + * buffers + * 7:3 : reserved3 + */ + u8 flags; +}; + +/* used in ena_admin_aq_entry. Can point directly to control data, or to a + * page list chunk. Used also at the end of indirect mode page list chunks, + * for chaining. + */ +struct ena_admin_ctrl_buff_info { + u32 length; + + struct ena_common_mem_addr address; +}; + +struct ena_admin_sq { + u16 sq_idx; + + /* 4:0 : reserved + * 7:5 : sq_direction - 0x1 - Tx; 0x2 - Rx + */ + u8 sq_identity; + + u8 reserved1; +}; + +struct ena_admin_aq_entry { + struct ena_admin_aq_common_desc aq_common_descriptor; + + union { + u32 inline_data_w1[3]; + + struct ena_admin_ctrl_buff_info control_buffer; + } u; + + u32 inline_data_w4[12]; +}; + +struct ena_admin_acq_common_desc { + /* command identifier to associate it with the aq descriptor + * 11:0 : command_id + * 15:12 : reserved12 + */ + u16 command; + + u8 status; + + /* 0 : phase + * 7:1 : reserved1 + */ + u8 flags; + + u16 extended_status; + + /* indicates to the driver which AQ entry has been consumed by the + * device and could be reused + */ + u16 sq_head_indx; +}; + +struct ena_admin_acq_entry { + struct ena_admin_acq_common_desc acq_common_descriptor; + + u32 response_specific_data[14]; +}; + +struct ena_admin_aq_create_sq_cmd { + struct ena_admin_aq_common_desc aq_common_descriptor; + + /* 4:0 : reserved0_w1 + * 7:5 : sq_direction - 0x1 - Tx, 0x2 - Rx + */ + u8 sq_identity; + + u8 reserved8_w1; + + /* 3:0 : placement_policy - Describing where the SQ + * descriptor ring and the SQ packet headers reside: + * 0x1 - descriptors and headers are in OS memory, + * 0x3 - descriptors and headers in device memory + * (a.k.a Low Latency Queue) + * 6:4 : completion_policy - Describing what policy + * to use for generation completion entry (cqe) in + * the CQ associated with this SQ: 0x0 - cqe for each + * sq descriptor, 0x1 - cqe upon request in sq + * descriptor, 0x2 - current queue head pointer is + * updated in OS memory upon sq descriptor request + * 0x3 - current queue head pointer is updated in OS + * memory for each sq descriptor + * 7 : reserved15_w1 + */ + u8 sq_caps_2; + + /* 0 : is_physically_contiguous - Described if the + * queue ring memory is allocated in physical + * contiguous pages or split. + * 7:1 : reserved17_w1 + */ + u8 sq_caps_3; + + /* associated completion queue id. This CQ must be created prior to SQ + * creation + */ + u16 cq_idx; + + /* submission queue depth in entries */ + u16 sq_depth; + + /* SQ physical base address in OS memory. This field should not be + * used for Low Latency queues. Has to be page aligned. + */ + struct ena_common_mem_addr sq_ba; + + /* specifies queue head writeback location in OS memory. Valid if + * completion_policy is set to completion_policy_head_on_demand or + * completion_policy_head. 
Has to be cache aligned + */ + struct ena_common_mem_addr sq_head_writeback; + + u32 reserved0_w7; + + u32 reserved0_w8; +}; + +enum ena_admin_sq_direction { + ENA_ADMIN_SQ_DIRECTION_TX = 1, + ENA_ADMIN_SQ_DIRECTION_RX = 2, +}; + +struct ena_admin_acq_create_sq_resp_desc { + struct ena_admin_acq_common_desc acq_common_desc; + + u16 sq_idx; + + u16 reserved; + + /* queue doorbell address as an offset to PCIe MMIO REG BAR */ + u32 sq_doorbell_offset; + + /* low latency queue ring base address as an offset to PCIe MMIO + * LLQ_MEM BAR + */ + u32 llq_descriptors_offset; + + /* low latency queue headers' memory as an offset to PCIe MMIO + * LLQ_MEM BAR + */ + u32 llq_headers_offset; +}; + +struct ena_admin_aq_destroy_sq_cmd { + struct ena_admin_aq_common_desc aq_common_descriptor; + + struct ena_admin_sq sq; +}; + +struct ena_admin_acq_destroy_sq_resp_desc { + struct ena_admin_acq_common_desc acq_common_desc; +}; + +struct ena_admin_aq_create_cq_cmd { + struct ena_admin_aq_common_desc aq_common_descriptor; + + /* 4:0 : reserved5 + * 5 : interrupt_mode_enabled - if set, cq operates + * in interrupt mode, otherwise - polling + * 7:6 : reserved6 + */ + u8 cq_caps_1; + + /* 4:0 : cq_entry_size_words - size of CQ entry in + * 32-bit words, valid values: 4, 8. + * 7:5 : reserved7 + */ + u8 cq_caps_2; + + /* completion queue depth in # of entries. must be power of 2 */ + u16 cq_depth; + + /* msix vector assigned to this cq */ + u32 msix_vector; + + /* cq physical base address in OS memory. CQ must be physically + * contiguous + */ + struct ena_common_mem_addr cq_ba; +}; + +struct ena_admin_acq_create_cq_resp_desc { + struct ena_admin_acq_common_desc acq_common_desc; + + u16 cq_idx; + + /* actual cq depth in number of entries */ + u16 cq_actual_depth; + + u32 numa_node_register_offset; + + u32 cq_head_db_register_offset; + + u32 cq_interrupt_unmask_register_offset; +}; + +struct ena_admin_aq_destroy_cq_cmd { + struct ena_admin_aq_common_desc aq_common_descriptor; + + u16 cq_idx; + + u16 reserved1; +}; + +struct ena_admin_acq_destroy_cq_resp_desc { + struct ena_admin_acq_common_desc acq_common_desc; +}; + +/* ENA AQ Get Statistics command. Extended statistics are placed in control + * buffer pointed by AQ entry + */ +struct ena_admin_aq_get_stats_cmd { + struct ena_admin_aq_common_desc aq_common_descriptor; + + union { + /* command specific inline data */ + u32 inline_data_w1[3]; + + struct ena_admin_ctrl_buff_info control_buffer; + } u; + + /* stats type as defined in enum ena_admin_get_stats_type */ + u8 type; + + /* stats scope defined in enum ena_admin_get_stats_scope */ + u8 scope; + + u16 reserved3; + + /* queue id. used when scope is specific_queue */ + u16 queue_idx; + + /* device id, value 0xFFFF means mine. only privileged device can get + * stats of other device + */ + u16 device_id; +}; + +/* Basic Statistics Command. */ +struct ena_admin_basic_stats { + u32 tx_bytes_low; + + u32 tx_bytes_high; + + u32 tx_pkts_low; + + u32 tx_pkts_high; + + u32 rx_bytes_low; + + u32 rx_bytes_high; + + u32 rx_pkts_low; + + u32 rx_pkts_high; + + u32 rx_drops_low; + + u32 rx_drops_high; + + u32 tx_drops_low; + + u32 tx_drops_high; +}; + +/* ENI Statistics Command. 
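+ * Returned in ena_admin_acq_get_stats_resp.u.eni_stats for GET_STATS
+ * commands issued with type ENA_ADMIN_GET_STATS_TYPE_ENI (see above).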
*/ +struct ena_admin_eni_stats { + /* The number of packets shaped due to inbound aggregate BW + * allowance being exceeded + */ + u64 bw_in_allowance_exceeded; + + /* The number of packets shaped due to outbound aggregate BW + * allowance being exceeded + */ + u64 bw_out_allowance_exceeded; + + /* The number of packets shaped due to PPS allowance being exceeded */ + u64 pps_allowance_exceeded; + + /* The number of packets shaped due to connection tracking + * allowance being exceeded and leading to failure in establishment + * of new connections + */ + u64 conntrack_allowance_exceeded; + + /* The number of packets shaped due to linklocal packet rate + * allowance being exceeded + */ + u64 linklocal_allowance_exceeded; +}; + +struct ena_admin_ena_srd_stats { + /* Number of packets transmitted over ENA SRD */ + u64 ena_srd_tx_pkts; + + /* Number of packets transmitted or could have been + * transmitted over ENA SRD + */ + u64 ena_srd_eligible_tx_pkts; + + /* Number of packets received over ENA SRD */ + u64 ena_srd_rx_pkts; + + /* Percentage of the ENA SRD resources that is in use */ + u64 ena_srd_resource_utilization; +}; + +/* ENA SRD Statistics Command */ +struct ena_admin_ena_srd_info { + /* ENA SRD configuration bitmap. See ena_admin_ena_srd_flags for + * details + */ + u64 flags; + + struct ena_admin_ena_srd_stats ena_srd_stats; +}; + +struct ena_admin_acq_get_stats_resp { + struct ena_admin_acq_common_desc acq_common_desc; + + union { + u64 raw[7]; + + struct ena_admin_basic_stats basic_stats; + + struct ena_admin_eni_stats eni_stats; + + struct ena_admin_ena_srd_info ena_srd_info; + } u; +}; + +struct ena_admin_get_set_feature_common_desc { + /* 1:0 : select - 0x1 - current value; 0x3 - default + * value + * 7:3 : reserved3 + */ + u8 flags; + + /* as appears in ena_admin_aq_feature_id */ + u8 feature_id; + + /* The driver specifies the max feature version it supports and the + * device responds with the currently supported feature version. The + * field is zero based + */ + u8 feature_version; + + u8 reserved8; +}; + +struct ena_admin_device_attr_feature_desc { + u32 impl_id; + + u32 device_version; + + /* bitmap of ena_admin_aq_feature_id, which represents supported + * subcommands for the set/get feature admin commands. + */ + u32 supported_features; + + /* bitmap of ena_admin_aq_caps_id, which represents device + * capabilities. + */ + u32 capabilities; + + /* Indicates how many bits are used physical address access. */ + u32 phys_addr_width; + + /* Indicates how many bits are used virtual address access. */ + u32 virt_addr_width; + + /* unicast MAC address (in Network byte order) */ + u8 mac_addr[6]; + + u8 reserved7[2]; + + u32 max_mtu; +}; + +enum ena_admin_llq_header_location { + /* header is in descriptor list */ + ENA_ADMIN_INLINE_HEADER = 1, + /* header in a separate ring, implies 16B descriptor list entry */ + ENA_ADMIN_HEADER_RING = 2, +}; + +enum ena_admin_llq_ring_entry_size { + ENA_ADMIN_LIST_ENTRY_SIZE_128B = 1, + ENA_ADMIN_LIST_ENTRY_SIZE_192B = 2, + ENA_ADMIN_LIST_ENTRY_SIZE_256B = 4, +}; + +enum ena_admin_llq_num_descs_before_header { + ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_0 = 0, + ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_1 = 1, + ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_2 = 2, + ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_4 = 4, + ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_8 = 8, +}; + +/* packet descriptor list entry always starts with one or more descriptors, + * followed by a header. The rest of the descriptors are located in the + * beginning of the subsequent entry. 
Stride refers to how the rest of the
+ * descriptors are placed. This field is relevant only for inline header
+ * mode.
+ */
+enum ena_admin_llq_stride_ctrl {
+	ENA_ADMIN_SINGLE_DESC_PER_ENTRY = 1,
+	ENA_ADMIN_MULTIPLE_DESCS_PER_ENTRY = 2,
+};
+
+enum ena_admin_accel_mode_feat {
+	ENA_ADMIN_DISABLE_META_CACHING = 0,
+	ENA_ADMIN_LIMIT_TX_BURST = 1,
+};
+
+struct ena_admin_accel_mode_get {
+	/* bit field of enum ena_admin_accel_mode_feat */
+	u16 supported_flags;
+
+	/* maximum burst size between two doorbells. The size is in bytes */
+	u16 max_tx_burst_size;
+};
+
+struct ena_admin_accel_mode_set {
+	/* bit field of enum ena_admin_accel_mode_feat */
+	u16 enabled_flags;
+
+	u16 reserved;
+};
+
+struct ena_admin_accel_mode_req {
+	union {
+		u32 raw[2];
+
+		struct ena_admin_accel_mode_get get;
+
+		struct ena_admin_accel_mode_set set;
+	} u;
+};
+
+struct ena_admin_feature_llq_desc {
+	u32 max_llq_num;
+
+	u32 max_llq_depth;
+
+	/* specify the header locations the device supports. bitfield of enum
+	 * ena_admin_llq_header_location.
+	 */
+	u16 header_location_ctrl_supported;
+
+	/* the header location the driver selected to use. */
+	u16 header_location_ctrl_enabled;
+
+	/* if inline header is specified - this is the size of descriptor list
+	 * entry. If header in a separate ring is specified - this is the size
+	 * of header ring entry. bitfield of enum ena_admin_llq_ring_entry_size.
+	 * specify the entry sizes the device supports
+	 */
+	u16 entry_size_ctrl_supported;
+
+	/* the entry size the driver selected to use. */
+	u16 entry_size_ctrl_enabled;
+
+	/* valid only if inline header is specified. First entry associated with
+	 * the packet includes descriptors and header. Rest of the entries
+	 * occupied by descriptors. This parameter defines the max number of
+	 * descriptors preceding the header in the first entry. The field is
+	 * bitfield of enum ena_admin_llq_num_descs_before_header and specify
+	 * the values the device supports
+	 */
+	u16 desc_num_before_header_supported;
+
+	/* the desired value the driver selected to use */
+	u16 desc_num_before_header_enabled;
+
+	/* valid only if inline was chosen. bitfield of enum
+	 * ena_admin_llq_stride_ctrl
+	 */
+	u16 descriptors_stride_ctrl_supported;
+
+	/* the stride control the driver selected to use */
+	u16 descriptors_stride_ctrl_enabled;
+
+	/* reserved */
+	u32 reserved1;
+
+	/* accelerated low latency queues requirement. 
driver needs to + * support those requirements in order to use accelerated llq + */ + struct ena_admin_accel_mode_req accel_mode; +}; + +struct ena_admin_queue_ext_feature_fields { + u32 max_tx_sq_num; + + u32 max_tx_cq_num; + + u32 max_rx_sq_num; + + u32 max_rx_cq_num; + + u32 max_tx_sq_depth; + + u32 max_tx_cq_depth; + + u32 max_rx_sq_depth; + + u32 max_rx_cq_depth; + + u32 max_tx_header_size; + + /* Maximum Descriptors number, including meta descriptor, allowed for a + * single Tx packet + */ + u16 max_per_packet_tx_descs; + + /* Maximum Descriptors number allowed for a single Rx packet */ + u16 max_per_packet_rx_descs; +}; + +struct ena_admin_queue_feature_desc { + u32 max_sq_num; + + u32 max_sq_depth; + + u32 max_cq_num; + + u32 max_cq_depth; + + u32 max_legacy_llq_num; + + u32 max_legacy_llq_depth; + + u32 max_header_size; + + /* Maximum Descriptors number, including meta descriptor, allowed for a + * single Tx packet + */ + u16 max_packet_tx_descs; + + /* Maximum Descriptors number allowed for a single Rx packet */ + u16 max_packet_rx_descs; +}; + +struct ena_admin_set_feature_mtu_desc { + /* exclude L2 */ + u32 mtu; +}; + +struct ena_admin_get_extra_properties_strings_desc { + u32 count; +}; + +struct ena_admin_get_extra_properties_flags_desc { + u32 flags; +}; + +struct ena_admin_set_feature_host_attr_desc { + /* host OS info base address in OS memory. host info is 4KB of + * physically contiguous + */ + struct ena_common_mem_addr os_info_ba; + + /* host debug area base address in OS memory. debug area must be + * physically contiguous + */ + struct ena_common_mem_addr debug_ba; + + /* debug area size */ + u32 debug_area_size; +}; + +struct ena_admin_feature_intr_moder_desc { + /* interrupt delay granularity in usec */ + u16 intr_delay_resolution; + + u16 reserved; +}; + +struct ena_admin_get_feature_link_desc { + /* Link speed in Mb */ + u32 speed; + + /* bit field of enum ena_admin_link types */ + u32 supported; + + /* 0 : autoneg + * 1 : duplex - Full Duplex + * 31:2 : reserved2 + */ + u32 flags; +}; + +struct ena_admin_feature_aenq_desc { + /* bitmask for AENQ groups the device can report */ + u32 supported_groups; + + /* bitmask for AENQ groups to report */ + u32 enabled_groups; +}; + +struct ena_admin_feature_offload_desc { + /* 0 : TX_L3_csum_ipv4 + * 1 : TX_L4_ipv4_csum_part - The checksum field + * should be initialized with pseudo header checksum + * 2 : TX_L4_ipv4_csum_full + * 3 : TX_L4_ipv6_csum_part - The checksum field + * should be initialized with pseudo header checksum + * 4 : TX_L4_ipv6_csum_full + * 5 : tso_ipv4 + * 6 : tso_ipv6 + * 7 : tso_ecn + */ + u32 tx; + + /* Receive side supported stateless offload + * 0 : RX_L3_csum_ipv4 - IPv4 checksum + * 1 : RX_L4_ipv4_csum - TCP/UDP/IPv4 checksum + * 2 : RX_L4_ipv6_csum - TCP/UDP/IPv6 checksum + * 3 : RX_hash - Hash calculation + */ + u32 rx_supported; + + u32 rx_enabled; +}; + +enum ena_admin_hash_functions { + ENA_ADMIN_TOEPLITZ = 1, + ENA_ADMIN_CRC32 = 2, +}; + +struct ena_admin_feature_rss_flow_hash_control { + u32 key_parts; + + u32 reserved; + + u32 key[ENA_ADMIN_RSS_KEY_PARTS]; +}; + +struct ena_admin_feature_rss_flow_hash_function { + /* 7:0 : funcs - bitmask of ena_admin_hash_functions */ + u32 supported_func; + + /* 7:0 : selected_func - bitmask of + * ena_admin_hash_functions + */ + u32 selected_func; + + /* initial value */ + u32 init_val; +}; + +/* RSS flow hash protocols */ +enum ena_admin_flow_hash_proto { + ENA_ADMIN_RSS_TCP4 = 0, + ENA_ADMIN_RSS_UDP4 = 1, + ENA_ADMIN_RSS_TCP6 = 2, + 
ENA_ADMIN_RSS_UDP6 = 3, + ENA_ADMIN_RSS_IP4 = 4, + ENA_ADMIN_RSS_IP6 = 5, + ENA_ADMIN_RSS_IP4_FRAG = 6, + ENA_ADMIN_RSS_NOT_IP = 7, + /* TCPv6 with extension header */ + ENA_ADMIN_RSS_TCP6_EX = 8, + /* IPv6 with extension header */ + ENA_ADMIN_RSS_IP6_EX = 9, + ENA_ADMIN_RSS_PROTO_NUM = 16, +}; + +/* RSS flow hash fields */ +enum ena_admin_flow_hash_fields { + /* Ethernet Dest Addr */ + ENA_ADMIN_RSS_L2_DA = BIT(0), + /* Ethernet Src Addr */ + ENA_ADMIN_RSS_L2_SA = BIT(1), + /* ipv4/6 Dest Addr */ + ENA_ADMIN_RSS_L3_DA = BIT(2), + /* ipv4/6 Src Addr */ + ENA_ADMIN_RSS_L3_SA = BIT(3), + /* tcp/udp Dest Port */ + ENA_ADMIN_RSS_L4_DP = BIT(4), + /* tcp/udp Src Port */ + ENA_ADMIN_RSS_L4_SP = BIT(5), +}; + +struct ena_admin_proto_input { + /* flow hash fields (bitwise according to ena_admin_flow_hash_fields) */ + u16 fields; + + u16 reserved2; +}; + +struct ena_admin_feature_rss_hash_control { + struct ena_admin_proto_input supported_fields[ENA_ADMIN_RSS_PROTO_NUM]; + + struct ena_admin_proto_input selected_fields[ENA_ADMIN_RSS_PROTO_NUM]; + + struct ena_admin_proto_input reserved2[ENA_ADMIN_RSS_PROTO_NUM]; + + struct ena_admin_proto_input reserved3[ENA_ADMIN_RSS_PROTO_NUM]; +}; + +struct ena_admin_feature_rss_flow_hash_input { + /* supported hash input sorting + * 1 : L3_sort - support swap L3 addresses if DA is + * smaller than SA + * 2 : L4_sort - support swap L4 ports if DP smaller + * SP + */ + u16 supported_input_sort; + + /* enabled hash input sorting + * 1 : enable_L3_sort - enable swap L3 addresses if + * DA smaller than SA + * 2 : enable_L4_sort - enable swap L4 ports if DP + * smaller than SP + */ + u16 enabled_input_sort; +}; + +enum ena_admin_os_type { + ENA_ADMIN_OS_LINUX = 1, + ENA_ADMIN_OS_WIN = 2, + ENA_ADMIN_OS_DPDK = 3, + ENA_ADMIN_OS_FREEBSD = 4, + ENA_ADMIN_OS_IPXE = 5, + ENA_ADMIN_OS_ESXI = 6, + ENA_ADMIN_OS_MACOS = 7, + ENA_ADMIN_OS_GROUPS_NUM = 7, +}; + +struct ena_admin_host_info { + /* defined in enum ena_admin_os_type */ + u32 os_type; + + /* os distribution string format */ + u8 os_dist_str[128]; + + /* OS distribution numeric format */ + u32 os_dist; + + /* kernel version string format */ + u8 kernel_ver_str[32]; + + /* Kernel version numeric format */ + u32 kernel_ver; + + /* 7:0 : major + * 15:8 : minor + * 23:16 : sub_minor + * 31:24 : module_type + */ + u32 driver_version; + + /* features bitmap */ + u32 supported_network_features[2]; + + /* ENA spec version of driver */ + u16 ena_spec_version; + + /* ENA device's Bus, Device and Function + * 2:0 : function + * 7:3 : device + * 15:8 : bus + */ + u16 bdf; + + /* Number of CPUs */ + u16 num_cpus; + + u16 reserved; + + /* 0 : reserved + * 1 : rx_offset + * 2 : interrupt_moderation + * 3 : rx_buf_mirroring + * 4 : rss_configurable_function_key + * 5 : reserved + * 6 : rx_page_reuse + * 31:7 : reserved + */ + u32 driver_supported_features; +}; + +struct ena_admin_rss_ind_table_entry { + u16 cq_idx; + + u16 reserved; +}; + +struct ena_admin_feature_rss_ind_table { + /* min supported table size (2^min_size) */ + u16 min_size; + + /* max supported table size (2^max_size) */ + u16 max_size; + + /* table size (2^size) */ + u16 size; + + /* 0 : one_entry_update - The ENA device supports + * setting a single RSS table entry + */ + u8 flags; + + u8 reserved; + + /* index of the inline entry. 0xFFFFFFFF means invalid */ + u32 inline_index; + + /* used for updating single entry, ignored when setting the entire + * table through the control buffer. 
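+	 * For example, a driver that only needs to retarget table slot 5 to a
+	 * new CQ could set inline_index = 5 and fill inline_entry.cq_idx
+	 * (assuming the device advertises one_entry_update above), instead of
+	 * resending the entire table.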
+	 */
+	struct ena_admin_rss_ind_table_entry inline_entry;
+};
+
+/* When hint value is 0, driver should use its own predefined value */
+struct ena_admin_ena_hw_hints {
+	/* value in ms */
+	u16 mmio_read_timeout;
+
+	/* value in ms */
+	u16 driver_watchdog_timeout;
+
+	/* Per packet tx completion timeout. value in ms */
+	u16 missing_tx_completion_timeout;
+
+	u16 missed_tx_completion_count_threshold_to_reset;
+
+	/* value in ms */
+	u16 admin_completion_tx_timeout;
+
+	u16 netdev_wd_timeout;
+
+	u16 max_tx_sgl_size;
+
+	u16 max_rx_sgl_size;
+
+	u16 reserved[8];
+};
+
+struct ena_admin_get_feat_cmd {
+	struct ena_admin_aq_common_desc aq_common_descriptor;
+
+	struct ena_admin_ctrl_buff_info control_buffer;
+
+	struct ena_admin_get_set_feature_common_desc feat_common;
+
+	u32 raw[11];
+};
+
+struct ena_admin_queue_ext_feature_desc {
+	/* version */
+	u8 version;
+
+	u8 reserved1[3];
+
+	union {
+		struct ena_admin_queue_ext_feature_fields max_queue_ext;
+
+		u32 raw[10];
+	};
+};
+
+struct ena_admin_feature_phc_desc {
+	/* PHC type as defined in enum ena_admin_get_phc_type,
+	 * used only for GET command.
+	 */
+	u8 type;
+
+	/* Reserved - MBZ */
+	u8 reserved1[3];
+
+	/* PHC doorbell address as an offset to PCIe MMIO REG BAR,
+	 * used only for GET command.
+	 */
+	u32 doorbell_offset;
+
+	/* Max time for valid PHC retrieval, passing this threshold will
+	 * fail the get-time request and block PHC requests for
+	 * block_timeout_usec, used only for GET command.
+	 */
+	u32 expire_timeout_usec;
+
+	/* PHC requests block period, blocking starts if PHC request expired
+	 * in order to prevent floods on busy device,
+	 * used only for GET command.
+	 */
+	u32 block_timeout_usec;
+
+	/* Shared PHC physical address (ena_admin_phc_resp),
+	 * used only for SET command.
+	 */
+	struct ena_common_mem_addr output_address;
+
+	/* Shared PHC Size (ena_admin_phc_resp),
+	 * used only for SET command.
+ */ + u32 output_length; +}; + +struct ena_admin_get_feat_resp { + struct ena_admin_acq_common_desc acq_common_desc; + + union { + u32 raw[14]; + + struct ena_admin_device_attr_feature_desc dev_attr; + + struct ena_admin_feature_llq_desc llq; + + struct ena_admin_queue_feature_desc max_queue; + + struct ena_admin_queue_ext_feature_desc max_queue_ext; + + struct ena_admin_feature_aenq_desc aenq; + + struct ena_admin_get_feature_link_desc link; + + struct ena_admin_feature_offload_desc offload; + + struct ena_admin_feature_rss_flow_hash_function flow_hash_func; + + struct ena_admin_feature_rss_flow_hash_input flow_hash_input; + + struct ena_admin_feature_rss_ind_table ind_table; + + struct ena_admin_feature_intr_moder_desc intr_moderation; + + struct ena_admin_ena_hw_hints hw_hints; + + struct ena_admin_feature_phc_desc phc; + + struct ena_admin_get_extra_properties_strings_desc extra_properties_strings; + + struct ena_admin_get_extra_properties_flags_desc extra_properties_flags; + } u; +}; + +struct ena_admin_set_feat_cmd { + struct ena_admin_aq_common_desc aq_common_descriptor; + + struct ena_admin_ctrl_buff_info control_buffer; + + struct ena_admin_get_set_feature_common_desc feat_common; + + union { + u32 raw[11]; + + /* mtu size */ + struct ena_admin_set_feature_mtu_desc mtu; + + /* host attributes */ + struct ena_admin_set_feature_host_attr_desc host_attr; + + /* AENQ configuration */ + struct ena_admin_feature_aenq_desc aenq; + + /* rss flow hash function */ + struct ena_admin_feature_rss_flow_hash_function flow_hash_func; + + /* rss flow hash input */ + struct ena_admin_feature_rss_flow_hash_input flow_hash_input; + + /* rss indirection table */ + struct ena_admin_feature_rss_ind_table ind_table; + + /* LLQ configuration */ + struct ena_admin_feature_llq_desc llq; + + /* PHC configuration */ + struct ena_admin_feature_phc_desc phc; + } u; +}; + +struct ena_admin_set_feat_resp { + struct ena_admin_acq_common_desc acq_common_desc; + + union { + u32 raw[14]; + } u; +}; + +struct ena_admin_aenq_common_desc { + u16 group; + + u16 syndrome; + + /* 0 : phase + * 7:1 : reserved - MBZ + */ + u8 flags; + + u8 reserved1[3]; + + u32 timestamp_low; + + u32 timestamp_high; +}; + +/* asynchronous event notification groups */ +enum ena_admin_aenq_group { + ENA_ADMIN_LINK_CHANGE = 0, + ENA_ADMIN_FATAL_ERROR = 1, + ENA_ADMIN_WARNING = 2, + ENA_ADMIN_NOTIFICATION = 3, + ENA_ADMIN_KEEP_ALIVE = 4, + ENA_ADMIN_REFRESH_CAPABILITIES = 5, + ENA_ADMIN_AENQ_GROUPS_NUM = 6, +}; + +enum ena_admin_aenq_notification_syndrome { + ENA_ADMIN_UPDATE_HINTS = 2, +}; + +struct ena_admin_aenq_entry { + struct ena_admin_aenq_common_desc aenq_common_desc; + + /* command specific inline data */ + u32 inline_data_w4[12]; +}; + +struct ena_admin_aenq_link_change_desc { + struct ena_admin_aenq_common_desc aenq_common_desc; + + /* 0 : link_status */ + u32 flags; +}; + +struct ena_admin_aenq_keep_alive_desc { + struct ena_admin_aenq_common_desc aenq_common_desc; + + u32 rx_drops_low; + + u32 rx_drops_high; + + u32 tx_drops_low; + + u32 tx_drops_high; +}; + +struct ena_admin_ena_mmio_req_read_less_resp { + u16 req_id; + + u16 reg_off; + + /* value is valid when poll is cleared */ + u32 reg_val; +}; + +struct ena_admin_phc_resp { + u16 req_id; + + u8 reserved1[6]; + + u64 timestamp; + + u8 reserved2[48]; +}; + +/* aq_common_desc */ +#define ENA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK GENMASK(11, 0) +#define ENA_ADMIN_AQ_COMMON_DESC_PHASE_MASK BIT(0) +#define ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_SHIFT 1 +#define 
ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_MASK BIT(1) +#define ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_SHIFT 2 +#define ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK BIT(2) + +/* sq */ +#define ENA_ADMIN_SQ_SQ_DIRECTION_SHIFT 5 +#define ENA_ADMIN_SQ_SQ_DIRECTION_MASK GENMASK(7, 5) + +/* acq_common_desc */ +#define ENA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID_MASK GENMASK(11, 0) +#define ENA_ADMIN_ACQ_COMMON_DESC_PHASE_MASK BIT(0) + +/* aq_create_sq_cmd */ +#define ENA_ADMIN_AQ_CREATE_SQ_CMD_SQ_DIRECTION_SHIFT 5 +#define ENA_ADMIN_AQ_CREATE_SQ_CMD_SQ_DIRECTION_MASK GENMASK(7, 5) +#define ENA_ADMIN_AQ_CREATE_SQ_CMD_PLACEMENT_POLICY_MASK GENMASK(3, 0) +#define ENA_ADMIN_AQ_CREATE_SQ_CMD_COMPLETION_POLICY_SHIFT 4 +#define ENA_ADMIN_AQ_CREATE_SQ_CMD_COMPLETION_POLICY_MASK GENMASK(6, 4) +#define ENA_ADMIN_AQ_CREATE_SQ_CMD_IS_PHYSICALLY_CONTIGUOUS_MASK BIT(0) + +/* aq_create_cq_cmd */ +#define ENA_ADMIN_AQ_CREATE_CQ_CMD_INTERRUPT_MODE_ENABLED_SHIFT 5 +#define ENA_ADMIN_AQ_CREATE_CQ_CMD_INTERRUPT_MODE_ENABLED_MASK BIT(5) +#define ENA_ADMIN_AQ_CREATE_CQ_CMD_CQ_ENTRY_SIZE_WORDS_MASK GENMASK(4, 0) + +/* get_set_feature_common_desc */ +#define ENA_ADMIN_GET_SET_FEATURE_COMMON_DESC_SELECT_MASK GENMASK(1, 0) + +/* get_feature_link_desc */ +#define ENA_ADMIN_GET_FEATURE_LINK_DESC_AUTONEG_MASK BIT(0) +#define ENA_ADMIN_GET_FEATURE_LINK_DESC_DUPLEX_SHIFT 1 +#define ENA_ADMIN_GET_FEATURE_LINK_DESC_DUPLEX_MASK BIT(1) + +/* feature_offload_desc */ +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L3_CSUM_IPV4_MASK BIT(0) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV4_CSUM_PART_SHIFT 1 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV4_CSUM_PART_MASK BIT(1) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV4_CSUM_FULL_SHIFT 2 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV4_CSUM_FULL_MASK BIT(2) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV6_CSUM_PART_SHIFT 3 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV6_CSUM_PART_MASK BIT(3) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV6_CSUM_FULL_SHIFT 4 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV6_CSUM_FULL_MASK BIT(4) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_IPV4_SHIFT 5 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_IPV4_MASK BIT(5) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_IPV6_SHIFT 6 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_IPV6_MASK BIT(6) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_ECN_SHIFT 7 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_ECN_MASK BIT(7) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L3_CSUM_IPV4_MASK BIT(0) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L4_IPV4_CSUM_SHIFT 1 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L4_IPV4_CSUM_MASK BIT(1) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L4_IPV6_CSUM_SHIFT 2 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L4_IPV6_CSUM_MASK BIT(2) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_HASH_SHIFT 3 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_HASH_MASK BIT(3) + +/* feature_rss_flow_hash_function */ +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_FUNCTION_FUNCS_MASK GENMASK(7, 0) +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_FUNCTION_SELECTED_FUNC_MASK GENMASK(7, 0) + +/* feature_rss_flow_hash_input */ +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_L3_SORT_SHIFT 1 +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_L3_SORT_MASK BIT(1) +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_L4_SORT_SHIFT 2 +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_L4_SORT_MASK BIT(2) +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_ENABLE_L3_SORT_SHIFT 1 +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_ENABLE_L3_SORT_MASK BIT(1) +#define 
ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_ENABLE_L4_SORT_SHIFT 2 +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_ENABLE_L4_SORT_MASK BIT(2) + +/* host_info */ +#define ENA_ADMIN_HOST_INFO_MAJOR_MASK GENMASK(7, 0) +#define ENA_ADMIN_HOST_INFO_MINOR_SHIFT 8 +#define ENA_ADMIN_HOST_INFO_MINOR_MASK GENMASK(15, 8) +#define ENA_ADMIN_HOST_INFO_SUB_MINOR_SHIFT 16 +#define ENA_ADMIN_HOST_INFO_SUB_MINOR_MASK GENMASK(23, 16) +#define ENA_ADMIN_HOST_INFO_MODULE_TYPE_SHIFT 24 +#define ENA_ADMIN_HOST_INFO_MODULE_TYPE_MASK GENMASK(31, 24) +#define ENA_ADMIN_HOST_INFO_FUNCTION_MASK GENMASK(2, 0) +#define ENA_ADMIN_HOST_INFO_DEVICE_SHIFT 3 +#define ENA_ADMIN_HOST_INFO_DEVICE_MASK GENMASK(7, 3) +#define ENA_ADMIN_HOST_INFO_BUS_SHIFT 8 +#define ENA_ADMIN_HOST_INFO_BUS_MASK GENMASK(15, 8) +#define ENA_ADMIN_HOST_INFO_RX_OFFSET_SHIFT 1 +#define ENA_ADMIN_HOST_INFO_RX_OFFSET_MASK BIT(1) +#define ENA_ADMIN_HOST_INFO_INTERRUPT_MODERATION_SHIFT 2 +#define ENA_ADMIN_HOST_INFO_INTERRUPT_MODERATION_MASK BIT(2) +#define ENA_ADMIN_HOST_INFO_RX_BUF_MIRRORING_SHIFT 3 +#define ENA_ADMIN_HOST_INFO_RX_BUF_MIRRORING_MASK BIT(3) +#define ENA_ADMIN_HOST_INFO_RSS_CONFIGURABLE_FUNCTION_KEY_SHIFT 4 +#define ENA_ADMIN_HOST_INFO_RSS_CONFIGURABLE_FUNCTION_KEY_MASK BIT(4) +#define ENA_ADMIN_HOST_INFO_RX_PAGE_REUSE_SHIFT 6 +#define ENA_ADMIN_HOST_INFO_RX_PAGE_REUSE_MASK BIT(6) + +/* feature_rss_ind_table */ +#define ENA_ADMIN_FEATURE_RSS_IND_TABLE_ONE_ENTRY_UPDATE_MASK BIT(0) + +/* aenq_common_desc */ +#define ENA_ADMIN_AENQ_COMMON_DESC_PHASE_MASK BIT(0) + +/* aenq_link_change_desc */ +#define ENA_ADMIN_AENQ_LINK_CHANGE_DESC_LINK_STATUS_MASK BIT(0) + +#endif /* _ENA_ADMIN_H_ */ diff --git a/drivers/amazon/net/ena/ena_com.c b/drivers/amazon/net/ena/ena_com.c new file mode 100644 index 0000000000000..520dad1e549af --- /dev/null +++ b/drivers/amazon/net/ena/ena_com.c @@ -0,0 +1,3243 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */
+
+#include "ena_com.h"
+
+/*****************************************************************************/
+/*****************************************************************************/
+
+/* Timeout in micro-sec */
+#define ADMIN_CMD_TIMEOUT_US (3000000)
+
+#define ENA_ASYNC_QUEUE_DEPTH 16
+#define ENA_ADMIN_QUEUE_DEPTH 32
+
+#define ENA_CTRL_MAJOR 0
+#define ENA_CTRL_MINOR 0
+#define ENA_CTRL_SUB_MINOR 1
+
+#define MIN_ENA_CTRL_VER \
+ (((ENA_CTRL_MAJOR) << \
+ (ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_SHIFT)) | \
+ ((ENA_CTRL_MINOR) << \
+ (ENA_REGS_CONTROLLER_VERSION_MINOR_VERSION_SHIFT)) | \
+ (ENA_CTRL_SUB_MINOR))
+
+#define ENA_DMA_ADDR_TO_UINT32_LOW(x) ((u32)((u64)(x)))
+#define ENA_DMA_ADDR_TO_UINT32_HIGH(x) ((u32)(((u64)(x)) >> 32))
+
+#define ENA_MMIO_READ_TIMEOUT 0xFFFFFFFF
+
+#define ENA_COM_BOUNCE_BUFFER_CNTRL_CNT 4
+
+#define ENA_REGS_ADMIN_INTR_MASK 1
+
+#define ENA_MIN_ADMIN_POLL_US 100
+
+#define ENA_MAX_ADMIN_POLL_US 5000
+
+/* PHC definitions */
+#define ENA_PHC_DEFAULT_EXPIRE_TIMEOUT_USEC 20
+#define ENA_PHC_DEFAULT_BLOCK_TIMEOUT_USEC 1000
+#define ENA_PHC_TIMESTAMP_ERROR 0xFFFFFFFFFFFFFFFF
+#define ENA_PHC_REQ_ID_OFFSET 0xDEAD
+
+/*****************************************************************************/
+/*****************************************************************************/
+/*****************************************************************************/
+
+enum ena_cmd_status {
+ ENA_CMD_SUBMITTED,
+ ENA_CMD_COMPLETED,
+ /* Abort - canceled by the driver */
+ ENA_CMD_ABORTED,
+};
+
+struct ena_comp_ctx {
+ struct completion wait_event;
+ struct ena_admin_acq_entry *user_cqe;
+ u32 comp_size;
+ enum ena_cmd_status status;
+ /* status from the device */
+ u8 comp_status;
+ u8 cmd_opcode;
+ bool occupied;
+};
+
+struct ena_com_stats_ctx {
+ struct ena_admin_aq_get_stats_cmd get_cmd;
+ struct ena_admin_acq_get_stats_resp get_resp;
+};
+
+static int ena_com_mem_addr_set(struct ena_com_dev *ena_dev,
+ struct ena_common_mem_addr *ena_addr,
+ dma_addr_t addr)
+{
+ if ((addr & GENMASK_ULL(ena_dev->dma_addr_bits - 1, 0)) != addr) {
+ netdev_err(ena_dev->net_device,
+ "DMA address has more bits than the device supports\n");
+ return -EINVAL;
+ }
+
+ ena_addr->mem_addr_low = lower_32_bits(addr);
+ ena_addr->mem_addr_high = (u16)upper_32_bits(addr);
+
+ return 0;
+}
+
+static int ena_com_admin_init_sq(struct ena_com_admin_queue *admin_queue)
+{
+ struct ena_com_dev *ena_dev = admin_queue->ena_dev;
+ struct ena_com_admin_sq *sq = &admin_queue->sq;
+ u16 size = ADMIN_SQ_SIZE(admin_queue->q_depth);
+
+ sq->entries = dma_zalloc_coherent(admin_queue->q_dmadev, size,
+ &sq->dma_addr, GFP_KERNEL);
+
+ if (!sq->entries) {
+ netdev_err(ena_dev->net_device, "Memory allocation failed\n");
+ return -ENOMEM;
+ }
+
+ sq->head = 0;
+ sq->tail = 0;
+ sq->phase = 1;
+
+ sq->db_addr = NULL;
+
+ return 0;
+}
+
+static int ena_com_admin_init_cq(struct ena_com_admin_queue *admin_queue)
+{
+ struct ena_com_dev *ena_dev = admin_queue->ena_dev;
+ struct ena_com_admin_cq *cq = &admin_queue->cq;
+ u16 size = ADMIN_CQ_SIZE(admin_queue->q_depth);
+
+ cq->entries = dma_zalloc_coherent(admin_queue->q_dmadev, size,
+ &cq->dma_addr, GFP_KERNEL);
+
+ if (!cq->entries) {
+ netdev_err(ena_dev->net_device, "Memory allocation failed\n");
+ return -ENOMEM;
+ }
+
+ cq->head = 0;
+ cq->phase = 1;
+
+ return 0;
+}
+
+static int ena_com_admin_init_aenq(struct ena_com_dev *ena_dev,
+ struct ena_aenq_handlers *aenq_handlers)
+{
+ struct ena_com_aenq *aenq = &ena_dev->aenq;
+ u32 addr_low, addr_high, 
aenq_caps; + u16 size; + + ena_dev->aenq.q_depth = ENA_ASYNC_QUEUE_DEPTH; + size = ADMIN_AENQ_SIZE(ENA_ASYNC_QUEUE_DEPTH); + aenq->entries = dma_zalloc_coherent(ena_dev->dmadev, size, + &aenq->dma_addr, GFP_KERNEL); + + if (!aenq->entries) { + netdev_err(ena_dev->net_device, "Memory allocation failed\n"); + return -ENOMEM; + } + + aenq->head = aenq->q_depth; + aenq->phase = 1; + + addr_low = ENA_DMA_ADDR_TO_UINT32_LOW(aenq->dma_addr); + addr_high = ENA_DMA_ADDR_TO_UINT32_HIGH(aenq->dma_addr); + + writel(addr_low, ena_dev->reg_bar + ENA_REGS_AENQ_BASE_LO_OFF); + writel(addr_high, ena_dev->reg_bar + ENA_REGS_AENQ_BASE_HI_OFF); + + aenq_caps = 0; + aenq_caps |= ena_dev->aenq.q_depth & ENA_REGS_AENQ_CAPS_AENQ_DEPTH_MASK; + aenq_caps |= (sizeof(struct ena_admin_aenq_entry) + << ENA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_SHIFT) & + ENA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_MASK; + writel(aenq_caps, ena_dev->reg_bar + ENA_REGS_AENQ_CAPS_OFF); + + if (unlikely(!aenq_handlers)) { + netdev_err(ena_dev->net_device, + "AENQ handlers pointer is NULL\n"); + return -EINVAL; + } + + aenq->aenq_handlers = aenq_handlers; + + return 0; +} + +static void comp_ctxt_release(struct ena_com_admin_queue *queue, + struct ena_comp_ctx *comp_ctx) +{ + comp_ctx->occupied = false; + atomic_dec(&queue->outstanding_cmds); +} + +static struct ena_comp_ctx *get_comp_ctxt(struct ena_com_admin_queue *admin_queue, + u16 command_id, bool capture) +{ + if (unlikely(command_id >= admin_queue->q_depth)) { + netdev_err(admin_queue->ena_dev->net_device, + "Command id is larger than the queue size. cmd_id: %u queue size %d\n", + command_id, admin_queue->q_depth); + return NULL; + } + + if (unlikely(!admin_queue->comp_ctx)) { + netdev_err(admin_queue->ena_dev->net_device, + "Completion context is NULL\n"); + return NULL; + } + + if (unlikely(admin_queue->comp_ctx[command_id].occupied && capture)) { + netdev_err(admin_queue->ena_dev->net_device, + "Completion context is occupied\n"); + return NULL; + } + + if (capture) { + atomic_inc(&admin_queue->outstanding_cmds); + admin_queue->comp_ctx[command_id].occupied = true; + } + + return &admin_queue->comp_ctx[command_id]; +} + +static struct ena_comp_ctx *__ena_com_submit_admin_cmd(struct ena_com_admin_queue *admin_queue, + struct ena_admin_aq_entry *cmd, + size_t cmd_size_in_bytes, + struct ena_admin_acq_entry *comp, + size_t comp_size_in_bytes) +{ + struct ena_comp_ctx *comp_ctx; + u16 tail_masked, cmd_id; + u16 queue_size_mask; + u16 cnt; + + queue_size_mask = admin_queue->q_depth - 1; + + tail_masked = admin_queue->sq.tail & queue_size_mask; + + /* In case of queue FULL */ + cnt = (u16)atomic_read(&admin_queue->outstanding_cmds); + if (cnt >= admin_queue->q_depth) { + netdev_dbg(admin_queue->ena_dev->net_device, + "Admin queue is full.\n"); + admin_queue->stats.out_of_space++; + return ERR_PTR(-ENOSPC); + } + + cmd_id = admin_queue->curr_cmd_id; + + cmd->aq_common_descriptor.flags |= admin_queue->sq.phase & + ENA_ADMIN_AQ_COMMON_DESC_PHASE_MASK; + + cmd->aq_common_descriptor.command_id |= cmd_id & + ENA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK; + + comp_ctx = get_comp_ctxt(admin_queue, cmd_id, true); + if (unlikely(!comp_ctx)) + return ERR_PTR(-EINVAL); + + comp_ctx->status = ENA_CMD_SUBMITTED; + comp_ctx->comp_size = (u32)comp_size_in_bytes; + comp_ctx->user_cqe = comp; + comp_ctx->cmd_opcode = cmd->aq_common_descriptor.opcode; + + reinit_completion(&comp_ctx->wait_event); + + memcpy(&admin_queue->sq.entries[tail_masked], cmd, cmd_size_in_bytes); + + admin_queue->curr_cmd_id = (admin_queue->curr_cmd_id 
+ 1) & + queue_size_mask; + + admin_queue->sq.tail++; + admin_queue->stats.submitted_cmd++; + + if (unlikely((admin_queue->sq.tail & queue_size_mask) == 0)) + admin_queue->sq.phase = !admin_queue->sq.phase; + + writel(admin_queue->sq.tail, admin_queue->sq.db_addr); + + return comp_ctx; +} + +static int ena_com_init_comp_ctxt(struct ena_com_admin_queue *admin_queue) +{ + struct ena_com_dev *ena_dev = admin_queue->ena_dev; + size_t size = admin_queue->q_depth * sizeof(struct ena_comp_ctx); + struct ena_comp_ctx *comp_ctx; + u16 i; + + admin_queue->comp_ctx = + devm_kzalloc(admin_queue->q_dmadev, size, GFP_KERNEL); + if (unlikely(!admin_queue->comp_ctx)) { + netdev_err(ena_dev->net_device, "Memory allocation failed\n"); + return -ENOMEM; + } + + for (i = 0; i < admin_queue->q_depth; i++) { + comp_ctx = get_comp_ctxt(admin_queue, i, false); + if (comp_ctx) + init_completion(&comp_ctx->wait_event); + } + + return 0; +} + +static struct ena_comp_ctx *ena_com_submit_admin_cmd(struct ena_com_admin_queue *admin_queue, + struct ena_admin_aq_entry *cmd, + size_t cmd_size_in_bytes, + struct ena_admin_acq_entry *comp, + size_t comp_size_in_bytes) +{ + unsigned long flags = 0; + struct ena_comp_ctx *comp_ctx; + + spin_lock_irqsave(&admin_queue->q_lock, flags); + if (unlikely(!admin_queue->running_state)) { + spin_unlock_irqrestore(&admin_queue->q_lock, flags); + return ERR_PTR(-ENODEV); + } + comp_ctx = __ena_com_submit_admin_cmd(admin_queue, cmd, + cmd_size_in_bytes, + comp, + comp_size_in_bytes); + if (IS_ERR(comp_ctx)) + admin_queue->running_state = false; + spin_unlock_irqrestore(&admin_queue->q_lock, flags); + + return comp_ctx; +} + +static int ena_com_init_io_sq(struct ena_com_dev *ena_dev, + struct ena_com_create_io_ctx *ctx, + struct ena_com_io_sq *io_sq) +{ + size_t size; + int dev_node = 0; + + memset(&io_sq->desc_addr, 0x0, sizeof(io_sq->desc_addr)); + + io_sq->dma_addr_bits = (u8)ena_dev->dma_addr_bits; + io_sq->desc_entry_size = + (io_sq->direction == ENA_COM_IO_QUEUE_DIRECTION_TX) ? 
+ sizeof(struct ena_eth_io_tx_desc) : + sizeof(struct ena_eth_io_rx_desc); + + size = io_sq->desc_entry_size * io_sq->q_depth; + io_sq->bus = ena_dev->bus; + + if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST) { + dev_node = dev_to_node(ena_dev->dmadev); + set_dev_node(ena_dev->dmadev, ctx->numa_node); + io_sq->desc_addr.virt_addr = + dma_zalloc_coherent(ena_dev->dmadev, size, + &io_sq->desc_addr.phys_addr, + GFP_KERNEL); + set_dev_node(ena_dev->dmadev, dev_node); + if (!io_sq->desc_addr.virt_addr) { + io_sq->desc_addr.virt_addr = + dma_zalloc_coherent(ena_dev->dmadev, size, + &io_sq->desc_addr.phys_addr, + GFP_KERNEL); + } + + if (!io_sq->desc_addr.virt_addr) { + netdev_err(ena_dev->net_device, + "Memory allocation failed\n"); + return -ENOMEM; + } + } + + if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { + /* Allocate bounce buffers */ + io_sq->bounce_buf_ctrl.buffer_size = + ena_dev->llq_info.desc_list_entry_size; + io_sq->bounce_buf_ctrl.buffers_num = + ENA_COM_BOUNCE_BUFFER_CNTRL_CNT; + io_sq->bounce_buf_ctrl.next_to_use = 0; + + size = (size_t)io_sq->bounce_buf_ctrl.buffer_size * + io_sq->bounce_buf_ctrl.buffers_num; + + dev_node = dev_to_node(ena_dev->dmadev); + set_dev_node(ena_dev->dmadev, ctx->numa_node); + io_sq->bounce_buf_ctrl.base_buffer = + devm_kzalloc(ena_dev->dmadev, size, GFP_KERNEL); + set_dev_node(ena_dev->dmadev, dev_node); + if (!io_sq->bounce_buf_ctrl.base_buffer) + io_sq->bounce_buf_ctrl.base_buffer = + devm_kzalloc(ena_dev->dmadev, size, GFP_KERNEL); + + if (!io_sq->bounce_buf_ctrl.base_buffer) { + netdev_err(ena_dev->net_device, + "Bounce buffer memory allocation failed\n"); + return -ENOMEM; + } + + memcpy(&io_sq->llq_info, &ena_dev->llq_info, + sizeof(io_sq->llq_info)); + + /* Initiate the first bounce buffer */ + io_sq->llq_buf_ctrl.curr_bounce_buf = + ena_com_get_next_bounce_buffer(&io_sq->bounce_buf_ctrl); + memset(io_sq->llq_buf_ctrl.curr_bounce_buf, + 0x0, io_sq->llq_info.desc_list_entry_size); + io_sq->llq_buf_ctrl.descs_left_in_line = + io_sq->llq_info.descs_num_before_header; + io_sq->disable_meta_caching = + io_sq->llq_info.disable_meta_caching; + + if (io_sq->llq_info.max_entries_in_tx_burst > 0) + io_sq->entries_in_tx_burst_left = + io_sq->llq_info.max_entries_in_tx_burst; + } + + io_sq->tail = 0; + io_sq->next_to_comp = 0; + io_sq->phase = 1; + + return 0; +} + +static int ena_com_init_io_cq(struct ena_com_dev *ena_dev, + struct ena_com_create_io_ctx *ctx, + struct ena_com_io_cq *io_cq) +{ + size_t size; + int prev_node = 0; + + memset(&io_cq->cdesc_addr, 0x0, sizeof(io_cq->cdesc_addr)); + + /* Use the basic completion descriptor for Rx */ + io_cq->cdesc_entry_size_in_bytes = + (io_cq->direction == ENA_COM_IO_QUEUE_DIRECTION_TX) ? 
+ sizeof(struct ena_eth_io_tx_cdesc) : + sizeof(struct ena_eth_io_rx_cdesc_base); + + size = io_cq->cdesc_entry_size_in_bytes * io_cq->q_depth; + io_cq->bus = ena_dev->bus; + + prev_node = dev_to_node(ena_dev->dmadev); + set_dev_node(ena_dev->dmadev, ctx->numa_node); + io_cq->cdesc_addr.virt_addr = + dma_zalloc_coherent(ena_dev->dmadev, size, + &io_cq->cdesc_addr.phys_addr, GFP_KERNEL); + set_dev_node(ena_dev->dmadev, prev_node); + if (!io_cq->cdesc_addr.virt_addr) { + io_cq->cdesc_addr.virt_addr = + dma_zalloc_coherent(ena_dev->dmadev, size, + &io_cq->cdesc_addr.phys_addr, + GFP_KERNEL); + } + + if (!io_cq->cdesc_addr.virt_addr) { + netdev_err(ena_dev->net_device, "Memory allocation failed\n"); + return -ENOMEM; + } + + io_cq->phase = 1; + io_cq->head = 0; + + return 0; +} + +static void ena_com_handle_single_admin_completion(struct ena_com_admin_queue *admin_queue, + struct ena_admin_acq_entry *cqe) +{ + struct ena_comp_ctx *comp_ctx; + u16 cmd_id; + + cmd_id = cqe->acq_common_descriptor.command & + ENA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID_MASK; + + comp_ctx = get_comp_ctxt(admin_queue, cmd_id, false); + if (unlikely(!comp_ctx)) { + netdev_err(admin_queue->ena_dev->net_device, + "comp_ctx is NULL. Changing the admin queue running state\n"); + admin_queue->running_state = false; + return; + } + + comp_ctx->status = ENA_CMD_COMPLETED; + comp_ctx->comp_status = cqe->acq_common_descriptor.status; + + if (comp_ctx->user_cqe) + memcpy(comp_ctx->user_cqe, (void *)cqe, comp_ctx->comp_size); + + if (!admin_queue->polling) + complete(&comp_ctx->wait_event); +} + +static void ena_com_handle_admin_completion(struct ena_com_admin_queue *admin_queue) +{ + struct ena_admin_acq_entry *cqe = NULL; + u16 comp_num = 0; + u16 head_masked; + u8 phase; + + head_masked = admin_queue->cq.head & (admin_queue->q_depth - 1); + phase = admin_queue->cq.phase; + + cqe = &admin_queue->cq.entries[head_masked]; + + /* Go over all the completions */ + while ((READ_ONCE(cqe->acq_common_descriptor.flags) & + ENA_ADMIN_ACQ_COMMON_DESC_PHASE_MASK) == phase) { + /* Do not read the rest of the completion entry before the + * phase bit was validated + */ + dma_rmb(); + ena_com_handle_single_admin_completion(admin_queue, cqe); + + head_masked++; + comp_num++; + if (unlikely(head_masked == admin_queue->q_depth)) { + head_masked = 0; + phase = !phase; + } + + cqe = &admin_queue->cq.entries[head_masked]; + } + + admin_queue->cq.head += comp_num; + admin_queue->cq.phase = phase; + admin_queue->sq.head += comp_num; + admin_queue->stats.completed_cmd += comp_num; +} + +static int ena_com_comp_status_to_errno(struct ena_com_admin_queue *admin_queue, + u8 comp_status) +{ + if (unlikely(comp_status != 0)) + netdev_err(admin_queue->ena_dev->net_device, + "Admin command failed[%u]\n", comp_status); + + switch (comp_status) { + case ENA_ADMIN_SUCCESS: + return 0; + case ENA_ADMIN_RESOURCE_ALLOCATION_FAILURE: + return -ENOMEM; + case ENA_ADMIN_UNSUPPORTED_OPCODE: + return -EOPNOTSUPP; + case ENA_ADMIN_BAD_OPCODE: + case ENA_ADMIN_MALFORMED_REQUEST: + case ENA_ADMIN_ILLEGAL_PARAMETER: + case ENA_ADMIN_UNKNOWN_ERROR: + return -EINVAL; + case ENA_ADMIN_RESOURCE_BUSY: + return -EAGAIN; + } + + return -EINVAL; +} + +static void ena_delay_exponential_backoff_us(u32 exp, u32 delay_us) +{ + delay_us = max_t(u32, ENA_MIN_ADMIN_POLL_US, delay_us); + delay_us = min_t(u32, delay_us * (1U << exp), ENA_MAX_ADMIN_POLL_US); + usleep_range(delay_us, 2 * delay_us); +} + +static int ena_com_wait_and_process_admin_cq_polling(struct ena_comp_ctx *comp_ctx, + struct 
ena_com_admin_queue *admin_queue) +{ + unsigned long flags = 0; + unsigned long timeout; + int ret; + u32 exp = 0; + + timeout = jiffies + usecs_to_jiffies(admin_queue->completion_timeout); + + while (1) { + spin_lock_irqsave(&admin_queue->q_lock, flags); + ena_com_handle_admin_completion(admin_queue); + spin_unlock_irqrestore(&admin_queue->q_lock, flags); + + if (comp_ctx->status != ENA_CMD_SUBMITTED) + break; + + if (time_is_before_jiffies(timeout)) { + netdev_err(admin_queue->ena_dev->net_device, + "Wait for completion (polling) timeout\n"); + /* ENA didn't have any completion */ + spin_lock_irqsave(&admin_queue->q_lock, flags); + admin_queue->stats.no_completion++; + admin_queue->running_state = false; + spin_unlock_irqrestore(&admin_queue->q_lock, flags); + + ret = -ETIME; + goto err; + } + + ena_delay_exponential_backoff_us(exp++, + admin_queue->ena_dev->ena_min_poll_delay_us); + } + + if (unlikely(comp_ctx->status == ENA_CMD_ABORTED)) { + netdev_err(admin_queue->ena_dev->net_device, + "Command was aborted\n"); + spin_lock_irqsave(&admin_queue->q_lock, flags); + admin_queue->stats.aborted_cmd++; + spin_unlock_irqrestore(&admin_queue->q_lock, flags); + ret = -ENODEV; + goto err; + } + + WARN(comp_ctx->status != ENA_CMD_COMPLETED, "Invalid comp status %d\n", + comp_ctx->status); + + ret = ena_com_comp_status_to_errno(admin_queue, comp_ctx->comp_status); +err: + comp_ctxt_release(admin_queue, comp_ctx); + return ret; +} + +/* + * Set the LLQ configurations of the firmware + * + * The driver provides only the enabled feature values to the device, + * which in turn, checks if they are supported. + */ +static int ena_com_set_llq(struct ena_com_dev *ena_dev) +{ + struct ena_com_admin_queue *admin_queue; + struct ena_admin_set_feat_cmd cmd; + struct ena_admin_set_feat_resp resp; + struct ena_com_llq_info *llq_info = &ena_dev->llq_info; + int ret; + + memset(&cmd, 0x0, sizeof(cmd)); + admin_queue = &ena_dev->admin_queue; + + cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE; + cmd.feat_common.feature_id = ENA_ADMIN_LLQ; + + cmd.u.llq.header_location_ctrl_enabled = llq_info->header_location_ctrl; + cmd.u.llq.entry_size_ctrl_enabled = llq_info->desc_list_entry_size_ctrl; + cmd.u.llq.desc_num_before_header_enabled = llq_info->descs_num_before_header; + cmd.u.llq.descriptors_stride_ctrl_enabled = llq_info->desc_stride_ctrl; + + cmd.u.llq.accel_mode.u.set.enabled_flags = + BIT(ENA_ADMIN_DISABLE_META_CACHING) | + BIT(ENA_ADMIN_LIMIT_TX_BURST); + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct ena_admin_acq_entry *)&resp, + sizeof(resp)); + + if (unlikely(ret)) + netdev_err(ena_dev->net_device, + "Failed to set LLQ configurations: %d\n", ret); + + return ret; +} + +static int ena_com_config_llq_info(struct ena_com_dev *ena_dev, + struct ena_admin_feature_llq_desc *llq_features, + struct ena_llq_configurations *llq_default_cfg) +{ + struct ena_com_llq_info *llq_info = &ena_dev->llq_info; + struct ena_admin_accel_mode_get llq_accel_mode_get; + u16 supported_feat; + int rc; + + memset(llq_info, 0, sizeof(*llq_info)); + + supported_feat = llq_features->header_location_ctrl_supported; + + if (likely(supported_feat & llq_default_cfg->llq_header_location)) { + llq_info->header_location_ctrl = + llq_default_cfg->llq_header_location; + } else { + netdev_err(ena_dev->net_device, + "Invalid header location control, supported: 0x%x\n", + supported_feat); + return -EINVAL; + } + + if (likely(llq_info->header_location_ctrl == 
ENA_ADMIN_INLINE_HEADER)) {
+ supported_feat = llq_features->descriptors_stride_ctrl_supported;
+ if (likely(supported_feat & llq_default_cfg->llq_stride_ctrl)) {
+ llq_info->desc_stride_ctrl = llq_default_cfg->llq_stride_ctrl;
+ } else {
+ if (supported_feat & ENA_ADMIN_MULTIPLE_DESCS_PER_ENTRY) {
+ llq_info->desc_stride_ctrl = ENA_ADMIN_MULTIPLE_DESCS_PER_ENTRY;
+ } else if (supported_feat & ENA_ADMIN_SINGLE_DESC_PER_ENTRY) {
+ llq_info->desc_stride_ctrl = ENA_ADMIN_SINGLE_DESC_PER_ENTRY;
+ } else {
+ netdev_err(ena_dev->net_device,
+ "Invalid desc_stride_ctrl, supported: 0x%x\n",
+ supported_feat);
+ return -EINVAL;
+ }
+
+ netdev_err(ena_dev->net_device,
+ "Default llq stride ctrl is not supported, performing fallback, default: 0x%x, supported: 0x%x, used: 0x%x\n",
+ llq_default_cfg->llq_stride_ctrl,
+ supported_feat, llq_info->desc_stride_ctrl);
+ }
+ } else {
+ llq_info->desc_stride_ctrl = 0;
+ }
+
+ supported_feat = llq_features->entry_size_ctrl_supported;
+ if (likely(supported_feat & llq_default_cfg->llq_ring_entry_size)) {
+ llq_info->desc_list_entry_size_ctrl = llq_default_cfg->llq_ring_entry_size;
+ llq_info->desc_list_entry_size = llq_default_cfg->llq_ring_entry_size_value;
+ } else {
+ if (supported_feat & ENA_ADMIN_LIST_ENTRY_SIZE_128B) {
+ llq_info->desc_list_entry_size_ctrl = ENA_ADMIN_LIST_ENTRY_SIZE_128B;
+ llq_info->desc_list_entry_size = 128;
+ } else if (supported_feat & ENA_ADMIN_LIST_ENTRY_SIZE_192B) {
+ llq_info->desc_list_entry_size_ctrl = ENA_ADMIN_LIST_ENTRY_SIZE_192B;
+ llq_info->desc_list_entry_size = 192;
+ } else if (supported_feat & ENA_ADMIN_LIST_ENTRY_SIZE_256B) {
+ llq_info->desc_list_entry_size_ctrl = ENA_ADMIN_LIST_ENTRY_SIZE_256B;
+ llq_info->desc_list_entry_size = 256;
+ } else {
+ netdev_err(ena_dev->net_device,
+ "Invalid entry_size_ctrl, supported: 0x%x\n",
+ supported_feat);
+ return -EINVAL;
+ }
+
+ netdev_err(ena_dev->net_device,
+ "Default llq ring entry size is not supported, performing fallback, default: 0x%x, supported: 0x%x, used: 0x%x\n",
+ llq_default_cfg->llq_ring_entry_size, supported_feat,
+ llq_info->desc_list_entry_size);
+ }
+ if (unlikely(llq_info->desc_list_entry_size & 0x7)) {
+ /* The desc list entry size should be a whole multiple of 8.
+ * This requirement comes from __iowrite64_copy()
+ */
+ netdev_err(ena_dev->net_device, "Illegal entry size %d\n",
+ llq_info->desc_list_entry_size);
+ return -EINVAL;
+ }
+
+ if (llq_info->desc_stride_ctrl == ENA_ADMIN_MULTIPLE_DESCS_PER_ENTRY)
+ llq_info->descs_per_entry = llq_info->desc_list_entry_size /
+ sizeof(struct ena_eth_io_tx_desc);
+ else
+ llq_info->descs_per_entry = 1;
+
+ supported_feat = llq_features->desc_num_before_header_supported;
+ if (likely(supported_feat & llq_default_cfg->llq_num_decs_before_header)) {
+ llq_info->descs_num_before_header = llq_default_cfg->llq_num_decs_before_header;
+ } else {
+ if (supported_feat & ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_2) {
+ llq_info->descs_num_before_header = ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_2;
+ } else if (supported_feat & ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_1) {
+ llq_info->descs_num_before_header = ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_1;
+ } else if (supported_feat & ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_4) {
+ llq_info->descs_num_before_header = ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_4;
+ } else if (supported_feat & ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_8) {
+ llq_info->descs_num_before_header = ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_8;
+ } else {
+ netdev_err(ena_dev->net_device,
+ "Invalid descs_num_before_header, 
supported: 0x%x\n", + supported_feat); + return -EINVAL; + } + + netdev_err(ena_dev->net_device, + "Default llq num descs before header is not supported, performing fallback, default: 0x%x, supported: 0x%x, used: 0x%x\n", + llq_default_cfg->llq_num_decs_before_header, + supported_feat, llq_info->descs_num_before_header); + } + /* Check for accelerated queue supported */ + llq_accel_mode_get = llq_features->accel_mode.u.get; + + llq_info->disable_meta_caching = + !!(llq_accel_mode_get.supported_flags & + BIT(ENA_ADMIN_DISABLE_META_CACHING)); + + if (llq_accel_mode_get.supported_flags & BIT(ENA_ADMIN_LIMIT_TX_BURST)) + llq_info->max_entries_in_tx_burst = + llq_accel_mode_get.max_tx_burst_size / + llq_default_cfg->llq_ring_entry_size_value; + + rc = ena_com_set_llq(ena_dev); + if (rc) + netdev_err(ena_dev->net_device, + "Cannot set LLQ configuration: %d\n", rc); + + return rc; +} + +static int ena_com_wait_and_process_admin_cq_interrupts(struct ena_comp_ctx *comp_ctx, + struct ena_com_admin_queue *admin_queue) +{ + unsigned long flags = 0; + int ret; + + wait_for_completion_timeout(&comp_ctx->wait_event, + usecs_to_jiffies( + admin_queue->completion_timeout)); + + /* In case the command wasn't completed find out the root cause. + * There might be 2 kinds of errors + * 1) No completion (timeout reached) + * 2) There is completion but the device didn't get any msi-x interrupt. + */ + if (unlikely(comp_ctx->status == ENA_CMD_SUBMITTED)) { + spin_lock_irqsave(&admin_queue->q_lock, flags); + ena_com_handle_admin_completion(admin_queue); + admin_queue->stats.no_completion++; + spin_unlock_irqrestore(&admin_queue->q_lock, flags); + + if (comp_ctx->status == ENA_CMD_COMPLETED) { + netdev_err(admin_queue->ena_dev->net_device, + "The ena device sent a completion but the driver didn't receive a MSI-X interrupt (cmd %d), autopolling mode is %s\n", + comp_ctx->cmd_opcode, + admin_queue->auto_polling ? "ON" : "OFF"); + /* Check if fallback to polling is enabled */ + if (admin_queue->auto_polling) + admin_queue->polling = true; + } else { + netdev_err(admin_queue->ena_dev->net_device, + "The ena device didn't send a completion for the admin cmd %d status %d\n", + comp_ctx->cmd_opcode, comp_ctx->status); + } + /* Check if shifted to polling mode. + * This will happen if there is a completion without an interrupt + * and autopolling mode is enabled. 
In such a case, normal execution continues.
+ */
+ if (!admin_queue->polling) {
+ admin_queue->running_state = false;
+ ret = -ETIME;
+ goto err;
+ }
+ }
+
+ ret = ena_com_comp_status_to_errno(admin_queue, comp_ctx->comp_status);
+err:
+ comp_ctxt_release(admin_queue, comp_ctx);
+ return ret;
+}
+
+/* This method reads the hardware device register through posting writes
+ * and waiting for the response.
+ * On timeout the function will return ENA_MMIO_READ_TIMEOUT
+ */
+static u32 ena_com_reg_bar_read32(struct ena_com_dev *ena_dev, u16 offset)
+{
+ struct ena_com_mmio_read *mmio_read = &ena_dev->mmio_read;
+ volatile struct ena_admin_ena_mmio_req_read_less_resp *read_resp =
+ mmio_read->read_resp;
+ u32 mmio_read_reg, ret, i;
+ unsigned long flags = 0;
+ u32 timeout = mmio_read->reg_read_to;
+
+ might_sleep();
+
+ if (timeout == 0)
+ timeout = ENA_REG_READ_TIMEOUT;
+
+ /* If readless is disabled, perform regular read */
+ if (!mmio_read->readless_supported)
+ return readl(ena_dev->reg_bar + offset);
+
+ spin_lock_irqsave(&mmio_read->lock, flags);
+ mmio_read->seq_num++;
+
+ read_resp->req_id = mmio_read->seq_num + 0xDEAD;
+ mmio_read_reg = (offset << ENA_REGS_MMIO_REG_READ_REG_OFF_SHIFT) &
+ ENA_REGS_MMIO_REG_READ_REG_OFF_MASK;
+ mmio_read_reg |= mmio_read->seq_num &
+ ENA_REGS_MMIO_REG_READ_REQ_ID_MASK;
+
+ writel(mmio_read_reg, ena_dev->reg_bar + ENA_REGS_MMIO_REG_READ_OFF);
+
+ for (i = 0; i < timeout; i++) {
+ if (READ_ONCE(read_resp->req_id) == mmio_read->seq_num)
+ break;
+
+ udelay(1);
+ }
+
+ if (unlikely(i == timeout)) {
+ netdev_err(ena_dev->net_device,
+ "Reading reg failed due to timeout. expected: req id[%u] offset[%u] actual: req id[%u] offset[%u]\n",
+ mmio_read->seq_num, offset, read_resp->req_id,
+ read_resp->reg_off);
+ ret = ENA_MMIO_READ_TIMEOUT;
+ goto err;
+ }
+
+ if (read_resp->reg_off != offset) {
+ netdev_err(ena_dev->net_device,
+ "Read failure: wrong offset provided\n");
+ ret = ENA_MMIO_READ_TIMEOUT;
+ } else {
+ ret = read_resp->reg_val;
+ }
+err:
+ spin_unlock_irqrestore(&mmio_read->lock, flags);
+
+ return ret;
+}
+
+/* There are two ways to wait for completion.
+ * Polling mode - wait until the completion is available.
+ * Async mode - wait on wait queue until the completion is ready
+ * (or the timeout expired).
+ * It is expected that the IRQ handler calls ena_com_handle_admin_completion
+ * to mark the completions.
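+ * (A caller therefore does not pick a mode per call: the
+ * admin_queue->polling flag, toggled e.g. by
+ * ena_com_set_admin_polling_mode(), selects which of the two helpers
+ * below is used.)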
+ */ +static int ena_com_wait_and_process_admin_cq(struct ena_comp_ctx *comp_ctx, + struct ena_com_admin_queue *admin_queue) +{ + if (admin_queue->polling) + return ena_com_wait_and_process_admin_cq_polling(comp_ctx, + admin_queue); + + return ena_com_wait_and_process_admin_cq_interrupts(comp_ctx, + admin_queue); +} + +static int ena_com_destroy_io_sq(struct ena_com_dev *ena_dev, + struct ena_com_io_sq *io_sq) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_admin_aq_destroy_sq_cmd destroy_cmd; + struct ena_admin_acq_destroy_sq_resp_desc destroy_resp; + u8 direction; + int ret; + + memset(&destroy_cmd, 0x0, sizeof(destroy_cmd)); + + if (io_sq->direction == ENA_COM_IO_QUEUE_DIRECTION_TX) + direction = ENA_ADMIN_SQ_DIRECTION_TX; + else + direction = ENA_ADMIN_SQ_DIRECTION_RX; + + destroy_cmd.sq.sq_identity |= (direction << + ENA_ADMIN_SQ_SQ_DIRECTION_SHIFT) & + ENA_ADMIN_SQ_SQ_DIRECTION_MASK; + + destroy_cmd.sq.sq_idx = io_sq->idx; + destroy_cmd.aq_common_descriptor.opcode = ENA_ADMIN_DESTROY_SQ; + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&destroy_cmd, + sizeof(destroy_cmd), + (struct ena_admin_acq_entry *)&destroy_resp, + sizeof(destroy_resp)); + + if (unlikely(ret && (ret != -ENODEV))) + netdev_err(ena_dev->net_device, + "Failed to destroy io sq error: %d\n", ret); + + return ret; +} + +static void ena_com_io_queue_free(struct ena_com_dev *ena_dev, + struct ena_com_io_sq *io_sq, + struct ena_com_io_cq *io_cq) +{ + size_t size; + + if (io_cq->cdesc_addr.virt_addr) { + size = io_cq->cdesc_entry_size_in_bytes * io_cq->q_depth; + + dma_free_coherent(ena_dev->dmadev, size, + io_cq->cdesc_addr.virt_addr, + io_cq->cdesc_addr.phys_addr); + + io_cq->cdesc_addr.virt_addr = NULL; + } + + if (io_sq->desc_addr.virt_addr) { + size = io_sq->desc_entry_size * io_sq->q_depth; + + dma_free_coherent(ena_dev->dmadev, size, + io_sq->desc_addr.virt_addr, + io_sq->desc_addr.phys_addr); + + io_sq->desc_addr.virt_addr = NULL; + } + + if (io_sq->bounce_buf_ctrl.base_buffer) { + devm_kfree(ena_dev->dmadev, io_sq->bounce_buf_ctrl.base_buffer); + io_sq->bounce_buf_ctrl.base_buffer = NULL; + } +} + +static int wait_for_reset_state(struct ena_com_dev *ena_dev, u32 timeout, + u16 exp_state) +{ + u32 val, exp = 0; + unsigned long timeout_stamp; + + /* Convert timeout from resolution of 100ms to us resolution. 
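+ * For example, a device-provided timeout of 5 corresponds to
+ * 5 * 100 * 1000 us, i.e. 500 ms worth of jiffies.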
*/ + timeout_stamp = jiffies + usecs_to_jiffies(100 * 1000 * timeout); + + while (1) { + val = ena_com_reg_bar_read32(ena_dev, ENA_REGS_DEV_STS_OFF); + + if (unlikely(val == ENA_MMIO_READ_TIMEOUT)) { + netdev_err(ena_dev->net_device, + "Reg read timeout occurred\n"); + return -ETIME; + } + + if ((val & ENA_REGS_DEV_STS_RESET_IN_PROGRESS_MASK) == + exp_state) + return 0; + + if (time_is_before_jiffies(timeout_stamp)) + return -ETIME; + + ena_delay_exponential_backoff_us(exp++, ena_dev->ena_min_poll_delay_us); + } +} + +static bool ena_com_check_supported_feature_id(struct ena_com_dev *ena_dev, + enum ena_admin_aq_feature_id feature_id) +{ + u32 feature_mask = 1 << feature_id; + + /* Device attributes is always supported */ + if ((feature_id != ENA_ADMIN_DEVICE_ATTRIBUTES) && + !(ena_dev->supported_features & feature_mask)) + return false; + + return true; +} + +static int ena_com_get_feature_ex(struct ena_com_dev *ena_dev, + struct ena_admin_get_feat_resp *get_resp, + enum ena_admin_aq_feature_id feature_id, + dma_addr_t control_buf_dma_addr, + u32 control_buff_size, + u8 feature_ver) +{ + struct ena_com_admin_queue *admin_queue; + struct ena_admin_get_feat_cmd get_cmd; + int ret; + + if (!ena_com_check_supported_feature_id(ena_dev, feature_id)) { + netdev_dbg(ena_dev->net_device, "Feature %d isn't supported\n", + feature_id); + return -EOPNOTSUPP; + } + + memset(&get_cmd, 0x0, sizeof(get_cmd)); + admin_queue = &ena_dev->admin_queue; + + get_cmd.aq_common_descriptor.opcode = ENA_ADMIN_GET_FEATURE; + + if (control_buff_size) + get_cmd.aq_common_descriptor.flags = + ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK; + else + get_cmd.aq_common_descriptor.flags = 0; + + ret = ena_com_mem_addr_set(ena_dev, + &get_cmd.control_buffer.address, + control_buf_dma_addr); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, "Memory address set failed\n"); + return ret; + } + + get_cmd.control_buffer.length = control_buff_size; + get_cmd.feat_common.feature_version = feature_ver; + get_cmd.feat_common.feature_id = feature_id; + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *) + &get_cmd, + sizeof(get_cmd), + (struct ena_admin_acq_entry *) + get_resp, + sizeof(*get_resp)); + + if (unlikely(ret)) + netdev_err(ena_dev->net_device, + "Failed to submit get_feature command %d error: %d\n", + feature_id, ret); + + return ret; +} + +static int ena_com_get_feature(struct ena_com_dev *ena_dev, + struct ena_admin_get_feat_resp *get_resp, + enum ena_admin_aq_feature_id feature_id, + u8 feature_ver) +{ + return ena_com_get_feature_ex(ena_dev, + get_resp, + feature_id, + 0, + 0, + feature_ver); +} + +int ena_com_get_current_hash_function(struct ena_com_dev *ena_dev) +{ + return ena_dev->rss.hash_func; +} + +static void ena_com_hash_key_fill_default_key(struct ena_com_dev *ena_dev) +{ + struct ena_admin_feature_rss_flow_hash_control *hash_key = + (ena_dev->rss).hash_key; + + netdev_rss_key_fill(&hash_key->key, sizeof(hash_key->key)); + /* The key buffer is stored in the device in an array of + * uint32 elements. 
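+ * Hence key_parts below counts 32-bit words rather than bytes
+ * (presumably sizeof(hash_key->key) / sizeof(u32)).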
+ */ + hash_key->key_parts = ENA_ADMIN_RSS_KEY_PARTS; +} + +static int ena_com_hash_key_allocate(struct ena_com_dev *ena_dev) +{ + struct ena_rss *rss = &ena_dev->rss; + + if (!ena_com_check_supported_feature_id(ena_dev, + ENA_ADMIN_RSS_HASH_FUNCTION)) + return -EOPNOTSUPP; + + rss->hash_key = + dma_zalloc_coherent(ena_dev->dmadev, sizeof(*rss->hash_key), + &rss->hash_key_dma_addr, GFP_KERNEL); + + if (unlikely(!rss->hash_key)) + return -ENOMEM; + + return 0; +} + +static void ena_com_hash_key_destroy(struct ena_com_dev *ena_dev) +{ + struct ena_rss *rss = &ena_dev->rss; + + if (rss->hash_key) + dma_free_coherent(ena_dev->dmadev, sizeof(*rss->hash_key), + rss->hash_key, rss->hash_key_dma_addr); + rss->hash_key = NULL; +} + +static int ena_com_hash_ctrl_init(struct ena_com_dev *ena_dev) +{ + struct ena_rss *rss = &ena_dev->rss; + + rss->hash_ctrl = + dma_zalloc_coherent(ena_dev->dmadev, sizeof(*rss->hash_ctrl), + &rss->hash_ctrl_dma_addr, GFP_KERNEL); + + if (unlikely(!rss->hash_ctrl)) + return -ENOMEM; + + return 0; +} + +static void ena_com_hash_ctrl_destroy(struct ena_com_dev *ena_dev) +{ + struct ena_rss *rss = &ena_dev->rss; + + if (rss->hash_ctrl) + dma_free_coherent(ena_dev->dmadev, sizeof(*rss->hash_ctrl), + rss->hash_ctrl, rss->hash_ctrl_dma_addr); + rss->hash_ctrl = NULL; +} + +static int ena_com_indirect_table_allocate(struct ena_com_dev *ena_dev, + u16 log_size) +{ + struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_get_feat_resp get_resp; + size_t tbl_size; + int ret; + + ret = ena_com_get_feature(ena_dev, &get_resp, + ENA_ADMIN_RSS_INDIRECTION_TABLE_CONFIG, 0); + if (unlikely(ret)) + return ret; + + if ((get_resp.u.ind_table.min_size > log_size) || + (get_resp.u.ind_table.max_size < log_size)) { + netdev_err(ena_dev->net_device, + "Indirect table size doesn't fit. 
requested size: %d while min is:%d and max %d\n", + 1 << log_size, 1 << get_resp.u.ind_table.min_size, + 1 << get_resp.u.ind_table.max_size); + return -EINVAL; + } + + tbl_size = (1ULL << log_size) * + sizeof(struct ena_admin_rss_ind_table_entry); + + rss->rss_ind_tbl = + dma_zalloc_coherent(ena_dev->dmadev, tbl_size, + &rss->rss_ind_tbl_dma_addr, GFP_KERNEL); + if (unlikely(!rss->rss_ind_tbl)) + goto mem_err1; + + tbl_size = (1ULL << log_size) * sizeof(u16); + rss->host_rss_ind_tbl = + devm_kzalloc(ena_dev->dmadev, tbl_size, GFP_KERNEL); + if (unlikely(!rss->host_rss_ind_tbl)) + goto mem_err2; + + rss->tbl_log_size = log_size; + + return 0; + +mem_err2: + tbl_size = (1ULL << log_size) * + sizeof(struct ena_admin_rss_ind_table_entry); + + dma_free_coherent(ena_dev->dmadev, tbl_size, rss->rss_ind_tbl, + rss->rss_ind_tbl_dma_addr); + rss->rss_ind_tbl = NULL; +mem_err1: + rss->tbl_log_size = 0; + return -ENOMEM; +} + +static void ena_com_indirect_table_destroy(struct ena_com_dev *ena_dev) +{ + struct ena_rss *rss = &ena_dev->rss; + size_t tbl_size = (1ULL << rss->tbl_log_size) * + sizeof(struct ena_admin_rss_ind_table_entry); + + if (rss->rss_ind_tbl) + dma_free_coherent(ena_dev->dmadev, tbl_size, rss->rss_ind_tbl, + rss->rss_ind_tbl_dma_addr); + rss->rss_ind_tbl = NULL; + + if (rss->host_rss_ind_tbl) + devm_kfree(ena_dev->dmadev, rss->host_rss_ind_tbl); + rss->host_rss_ind_tbl = NULL; +} + +static int ena_com_create_io_sq(struct ena_com_dev *ena_dev, + struct ena_com_io_sq *io_sq, u16 cq_idx) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_admin_aq_create_sq_cmd create_cmd; + struct ena_admin_acq_create_sq_resp_desc cmd_completion; + u8 direction; + int ret; + + memset(&create_cmd, 0x0, sizeof(create_cmd)); + + create_cmd.aq_common_descriptor.opcode = ENA_ADMIN_CREATE_SQ; + + if (io_sq->direction == ENA_COM_IO_QUEUE_DIRECTION_TX) + direction = ENA_ADMIN_SQ_DIRECTION_TX; + else + direction = ENA_ADMIN_SQ_DIRECTION_RX; + + create_cmd.sq_identity |= (direction << + ENA_ADMIN_AQ_CREATE_SQ_CMD_SQ_DIRECTION_SHIFT) & + ENA_ADMIN_AQ_CREATE_SQ_CMD_SQ_DIRECTION_MASK; + + create_cmd.sq_caps_2 |= io_sq->mem_queue_type & + ENA_ADMIN_AQ_CREATE_SQ_CMD_PLACEMENT_POLICY_MASK; + + create_cmd.sq_caps_2 |= (ENA_ADMIN_COMPLETION_POLICY_DESC << + ENA_ADMIN_AQ_CREATE_SQ_CMD_COMPLETION_POLICY_SHIFT) & + ENA_ADMIN_AQ_CREATE_SQ_CMD_COMPLETION_POLICY_MASK; + + create_cmd.sq_caps_3 |= + ENA_ADMIN_AQ_CREATE_SQ_CMD_IS_PHYSICALLY_CONTIGUOUS_MASK; + + create_cmd.cq_idx = cq_idx; + create_cmd.sq_depth = io_sq->q_depth; + + if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST) { + ret = ena_com_mem_addr_set(ena_dev, + &create_cmd.sq_ba, + io_sq->desc_addr.phys_addr); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, + "Memory address set failed\n"); + return ret; + } + } + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&create_cmd, + sizeof(create_cmd), + (struct ena_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, + "Failed to create IO SQ. 
error: %d\n", ret); + return ret; + } + + io_sq->idx = cmd_completion.sq_idx; + + io_sq->db_addr = (u32 __iomem *)((uintptr_t)ena_dev->reg_bar + + (uintptr_t)cmd_completion.sq_doorbell_offset); + + if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { + io_sq->desc_addr.pbuf_dev_addr = + (u8 __iomem *)((uintptr_t)ena_dev->mem_bar + + cmd_completion.llq_descriptors_offset); + } + + netdev_dbg(ena_dev->net_device, "Created sq[%u], depth[%u]\n", + io_sq->idx, io_sq->q_depth); + + return ret; +} + +static int ena_com_ind_tbl_convert_to_device(struct ena_com_dev *ena_dev) +{ + struct ena_rss *rss = &ena_dev->rss; + struct ena_com_io_sq *io_sq; + u16 qid; + int i; + + for (i = 0; i < 1 << rss->tbl_log_size; i++) { + qid = rss->host_rss_ind_tbl[i]; + if (qid >= ENA_TOTAL_NUM_QUEUES) + return -EINVAL; + + io_sq = &ena_dev->io_sq_queues[qid]; + + if (io_sq->direction != ENA_COM_IO_QUEUE_DIRECTION_RX) + return -EINVAL; + + rss->rss_ind_tbl[i].cq_idx = io_sq->idx; + } + + return 0; +} + +static void ena_com_update_intr_delay_resolution(struct ena_com_dev *ena_dev, + u16 intr_delay_resolution) +{ + u16 prev_intr_delay_resolution = ena_dev->intr_delay_resolution; + + if (unlikely(!intr_delay_resolution)) { + netdev_err(ena_dev->net_device, + "Illegal intr_delay_resolution provided. Going to use default 1 usec resolution\n"); + intr_delay_resolution = ENA_DEFAULT_INTR_DELAY_RESOLUTION; + } + + /* update Rx */ + ena_dev->intr_moder_rx_interval = + ena_dev->intr_moder_rx_interval * + prev_intr_delay_resolution / + intr_delay_resolution; + + /* update Tx */ + ena_dev->intr_moder_tx_interval = + ena_dev->intr_moder_tx_interval * + prev_intr_delay_resolution / + intr_delay_resolution; + + ena_dev->intr_delay_resolution = intr_delay_resolution; +} + +/*****************************************************************************/ +/******************************* API ******************************/ +/*****************************************************************************/ + +int ena_com_execute_admin_command(struct ena_com_admin_queue *admin_queue, + struct ena_admin_aq_entry *cmd, + size_t cmd_size, + struct ena_admin_acq_entry *comp, + size_t comp_size) +{ + struct ena_comp_ctx *comp_ctx; + int ret; + + comp_ctx = ena_com_submit_admin_cmd(admin_queue, cmd, cmd_size, + comp, comp_size); + if (IS_ERR(comp_ctx)) { + ret = PTR_ERR(comp_ctx); + if (ret == -ENODEV) + netdev_dbg(admin_queue->ena_dev->net_device, + "Failed to submit command [%d]\n", ret); + else + netdev_err(admin_queue->ena_dev->net_device, + "Failed to submit command [%d]\n", ret); + + return ret; + } + + ret = ena_com_wait_and_process_admin_cq(comp_ctx, admin_queue); + if (unlikely(ret)) { + if (admin_queue->running_state) + netdev_err(admin_queue->ena_dev->net_device, + "Failed to process command. ret = %d\n", ret); + else + netdev_dbg(admin_queue->ena_dev->net_device, + "Failed to process command. 
ret = %d\n", ret); + } + return ret; +} + +int ena_com_create_io_cq(struct ena_com_dev *ena_dev, + struct ena_com_io_cq *io_cq) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_admin_aq_create_cq_cmd create_cmd; + struct ena_admin_acq_create_cq_resp_desc cmd_completion; + int ret; + + memset(&create_cmd, 0x0, sizeof(create_cmd)); + + create_cmd.aq_common_descriptor.opcode = ENA_ADMIN_CREATE_CQ; + + create_cmd.cq_caps_2 |= (io_cq->cdesc_entry_size_in_bytes / 4) & + ENA_ADMIN_AQ_CREATE_CQ_CMD_CQ_ENTRY_SIZE_WORDS_MASK; + create_cmd.cq_caps_1 |= + ENA_ADMIN_AQ_CREATE_CQ_CMD_INTERRUPT_MODE_ENABLED_MASK; + + create_cmd.msix_vector = io_cq->msix_vector; + create_cmd.cq_depth = io_cq->q_depth; + + ret = ena_com_mem_addr_set(ena_dev, + &create_cmd.cq_ba, + io_cq->cdesc_addr.phys_addr); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, "Memory address set failed\n"); + return ret; + } + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&create_cmd, + sizeof(create_cmd), + (struct ena_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, + "Failed to create IO CQ. error: %d\n", ret); + return ret; + } + + io_cq->idx = cmd_completion.cq_idx; + + io_cq->unmask_reg = (u32 __iomem *)((uintptr_t)ena_dev->reg_bar + + cmd_completion.cq_interrupt_unmask_register_offset); + + if (cmd_completion.numa_node_register_offset) + io_cq->numa_node_cfg_reg = + (u32 __iomem *)((uintptr_t)ena_dev->reg_bar + + cmd_completion.numa_node_register_offset); + + netdev_dbg(ena_dev->net_device, "Created cq[%u], depth[%u]\n", + io_cq->idx, io_cq->q_depth); + + return ret; +} + +int ena_com_get_io_handlers(struct ena_com_dev *ena_dev, u16 qid, + struct ena_com_io_sq **io_sq, + struct ena_com_io_cq **io_cq) +{ + if (qid >= ENA_TOTAL_NUM_QUEUES) { + netdev_err(ena_dev->net_device, + "Invalid queue number %d but the max is %d\n", qid, + ENA_TOTAL_NUM_QUEUES); + return -EINVAL; + } + + *io_sq = &ena_dev->io_sq_queues[qid]; + *io_cq = &ena_dev->io_cq_queues[qid]; + + return 0; +} + +void ena_com_abort_admin_commands(struct ena_com_dev *ena_dev) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_comp_ctx *comp_ctx; + u16 i; + + if (!admin_queue->comp_ctx) + return; + + for (i = 0; i < admin_queue->q_depth; i++) { + comp_ctx = get_comp_ctxt(admin_queue, i, false); + if (unlikely(!comp_ctx)) + break; + + comp_ctx->status = ENA_CMD_ABORTED; + + complete(&comp_ctx->wait_event); + } +} + +void ena_com_wait_for_abort_completion(struct ena_com_dev *ena_dev) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + unsigned long flags = 0; + u32 exp = 0; + + spin_lock_irqsave(&admin_queue->q_lock, flags); + while (atomic_read(&admin_queue->outstanding_cmds) != 0) { + spin_unlock_irqrestore(&admin_queue->q_lock, flags); + ena_delay_exponential_backoff_us(exp++, + ena_dev->ena_min_poll_delay_us); + spin_lock_irqsave(&admin_queue->q_lock, flags); + } + spin_unlock_irqrestore(&admin_queue->q_lock, flags); +} + +int ena_com_destroy_io_cq(struct ena_com_dev *ena_dev, + struct ena_com_io_cq *io_cq) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_admin_aq_destroy_cq_cmd destroy_cmd; + struct ena_admin_acq_destroy_cq_resp_desc destroy_resp; + int ret; + + memset(&destroy_cmd, 0x0, sizeof(destroy_cmd)); + + destroy_cmd.cq_idx = io_cq->idx; + destroy_cmd.aq_common_descriptor.opcode = ENA_ADMIN_DESTROY_CQ; + + ret = 
ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&destroy_cmd, + sizeof(destroy_cmd), + (struct ena_admin_acq_entry *)&destroy_resp, + sizeof(destroy_resp)); + + if (unlikely(ret && (ret != -ENODEV))) + netdev_err(ena_dev->net_device, + "Failed to destroy IO CQ. error: %d\n", ret); + + return ret; +} + +bool ena_com_get_admin_running_state(struct ena_com_dev *ena_dev) +{ + return ena_dev->admin_queue.running_state; +} + +void ena_com_set_admin_running_state(struct ena_com_dev *ena_dev, bool state) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + unsigned long flags = 0; + + spin_lock_irqsave(&admin_queue->q_lock, flags); + ena_dev->admin_queue.running_state = state; + spin_unlock_irqrestore(&admin_queue->q_lock, flags); +} + +void ena_com_admin_aenq_enable(struct ena_com_dev *ena_dev) +{ + u16 depth = ena_dev->aenq.q_depth; + + WARN(ena_dev->aenq.head != depth, "Invalid AENQ state\n"); + + /* Init head_db to mark that all entries in the queue + * are initially available + */ + writel(depth, ena_dev->reg_bar + ENA_REGS_AENQ_HEAD_DB_OFF); +} + +int ena_com_set_aenq_config(struct ena_com_dev *ena_dev, u32 groups_flag) +{ + struct ena_com_admin_queue *admin_queue; + struct ena_admin_set_feat_cmd cmd; + struct ena_admin_set_feat_resp resp; + struct ena_admin_get_feat_resp get_resp; + int ret; + + ret = ena_com_get_feature(ena_dev, &get_resp, ENA_ADMIN_AENQ_CONFIG, 0); + if (ret) { + dev_info(ena_dev->dmadev, "Can't get aenq configuration\n"); + return ret; + } + + if ((get_resp.u.aenq.supported_groups & groups_flag) != groups_flag) { + netdev_warn(ena_dev->net_device, + "Trying to set unsupported aenq events. supported flag: 0x%x asked flag: 0x%x\n", + get_resp.u.aenq.supported_groups, groups_flag); + return -EOPNOTSUPP; + } + + memset(&cmd, 0x0, sizeof(cmd)); + admin_queue = &ena_dev->admin_queue; + + cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE; + cmd.aq_common_descriptor.flags = 0; + cmd.feat_common.feature_id = ENA_ADMIN_AENQ_CONFIG; + cmd.u.aenq.enabled_groups = groups_flag; + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct ena_admin_acq_entry *)&resp, + sizeof(resp)); + + if (unlikely(ret)) + netdev_err(ena_dev->net_device, + "Failed to config AENQ ret: %d\n", ret); + + return ret; +} + +int ena_com_get_dma_width(struct ena_com_dev *ena_dev) +{ + u32 caps = ena_com_reg_bar_read32(ena_dev, ENA_REGS_CAPS_OFF); + u32 width; + + if (unlikely(caps == ENA_MMIO_READ_TIMEOUT)) { + netdev_err(ena_dev->net_device, "Reg read timeout occurred\n"); + return -ETIME; + } + + width = (caps & ENA_REGS_CAPS_DMA_ADDR_WIDTH_MASK) >> + ENA_REGS_CAPS_DMA_ADDR_WIDTH_SHIFT; + + netdev_dbg(ena_dev->net_device, "ENA dma width: %d\n", width); + + if ((width < 32) || width > ENA_MAX_PHYS_ADDR_SIZE_BITS) { + netdev_err(ena_dev->net_device, "DMA width illegal value: %d\n", + width); + return -EINVAL; + } + + ena_dev->dma_addr_bits = width; + + return width; +} + +int ena_com_validate_version(struct ena_com_dev *ena_dev) +{ + u32 ver; + u32 ctrl_ver; + u32 ctrl_ver_masked; + + /* Make sure the ENA version and the controller version are at least + * as the driver expects + */ + ver = ena_com_reg_bar_read32(ena_dev, ENA_REGS_VERSION_OFF); + ctrl_ver = ena_com_reg_bar_read32(ena_dev, + ENA_REGS_CONTROLLER_VERSION_OFF); + + if (unlikely((ver == ENA_MMIO_READ_TIMEOUT) || + (ctrl_ver == ENA_MMIO_READ_TIMEOUT))) { + netdev_err(ena_dev->net_device, "Reg read timeout occurred\n"); + return -ETIME; + } + + 
dev_info(ena_dev->dmadev, "ENA device version: %d.%d\n", + (ver & ENA_REGS_VERSION_MAJOR_VERSION_MASK) >> + ENA_REGS_VERSION_MAJOR_VERSION_SHIFT, + ver & ENA_REGS_VERSION_MINOR_VERSION_MASK); + + dev_info(ena_dev->dmadev, + "ENA controller version: %d.%d.%d implementation version %d\n", + (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK) >> + ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_SHIFT, + (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_MINOR_VERSION_MASK) >> + ENA_REGS_CONTROLLER_VERSION_MINOR_VERSION_SHIFT, + (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION_MASK), + (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_IMPL_ID_MASK) >> + ENA_REGS_CONTROLLER_VERSION_IMPL_ID_SHIFT); + + ctrl_ver_masked = + (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK) | + (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_MINOR_VERSION_MASK) | + (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION_MASK); + + /* Validate the ctrl version without the implementation ID */ + if (ctrl_ver_masked < MIN_ENA_CTRL_VER) { + netdev_err(ena_dev->net_device, + "ENA ctrl version is lower than the minimal ctrl version the driver supports\n"); + return -1; + } + + return 0; +} + +static void +ena_com_free_ena_admin_queue_comp_ctx(struct ena_com_dev *ena_dev, + struct ena_com_admin_queue *admin_queue) + +{ + if (!admin_queue->comp_ctx) + return; + + devm_kfree(ena_dev->dmadev, admin_queue->comp_ctx); + + admin_queue->comp_ctx = NULL; +} + +void ena_com_admin_destroy(struct ena_com_dev *ena_dev) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_com_admin_cq *cq = &admin_queue->cq; + struct ena_com_admin_sq *sq = &admin_queue->sq; + struct ena_com_aenq *aenq = &ena_dev->aenq; + u16 size; + + ena_com_free_ena_admin_queue_comp_ctx(ena_dev, admin_queue); + + size = ADMIN_SQ_SIZE(admin_queue->q_depth); + if (sq->entries) + dma_free_coherent(ena_dev->dmadev, size, sq->entries, + sq->dma_addr); + sq->entries = NULL; + + size = ADMIN_CQ_SIZE(admin_queue->q_depth); + if (cq->entries) + dma_free_coherent(ena_dev->dmadev, size, cq->entries, + cq->dma_addr); + cq->entries = NULL; + + size = ADMIN_AENQ_SIZE(aenq->q_depth); + if (ena_dev->aenq.entries) + dma_free_coherent(ena_dev->dmadev, size, aenq->entries, + aenq->dma_addr); + aenq->entries = NULL; +} + +void ena_com_set_admin_polling_mode(struct ena_com_dev *ena_dev, bool polling) +{ + u32 mask_value = 0; + + if (polling) + mask_value = ENA_REGS_ADMIN_INTR_MASK; + + writel(mask_value, ena_dev->reg_bar + ENA_REGS_INTR_MASK_OFF); + ena_dev->admin_queue.polling = polling; +} + +bool ena_com_get_admin_polling_mode(struct ena_com_dev *ena_dev) +{ + return ena_dev->admin_queue.polling; +} + +void ena_com_set_admin_auto_polling_mode(struct ena_com_dev *ena_dev, + bool polling) +{ + ena_dev->admin_queue.auto_polling = polling; +} + +bool ena_com_phc_supported(struct ena_com_dev *ena_dev) +{ + return ena_com_check_supported_feature_id(ena_dev, ENA_ADMIN_PHC_CONFIG); +} + +int ena_com_phc_init(struct ena_com_dev *ena_dev) +{ + struct ena_com_phc_info *phc = &ena_dev->phc; + + memset(phc, 0x0, sizeof(*phc)); + + /* Allocate shared mem used PHC timestamp retrieved from device */ + phc->virt_addr = + dma_zalloc_coherent(ena_dev->dmadev, sizeof(*phc->virt_addr), + &phc->phys_addr, GFP_KERNEL); + if (unlikely(!phc->virt_addr)) + return -ENOMEM; + + spin_lock_init(&phc->lock); + + phc->virt_addr->req_id = 0; + phc->virt_addr->timestamp = 0; + + return 0; +} + +int ena_com_phc_config(struct ena_com_dev *ena_dev) +{ + struct ena_com_phc_info *phc = &ena_dev->phc; 
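+ /* Reading aid (derived from the code below, not an external spec):
+ * fetch the device PHC defaults with GET_FEATURE, keep or clamp the
+ * timeouts, then SET_FEATURE the shared ena_admin_phc_resp block so
+ * the device can publish timestamps into it.
+ */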
+	struct ena_admin_get_feat_resp get_feat_resp;
+	struct ena_admin_set_feat_resp set_feat_resp;
+	struct ena_admin_set_feat_cmd set_feat_cmd;
+	int ret = 0;
+
+	/* Get device PHC default configuration */
+	ret = ena_com_get_feature(ena_dev, &get_feat_resp, ENA_ADMIN_PHC_CONFIG, 0);
+	if (unlikely(ret)) {
+		netdev_err(ena_dev->net_device,
+			   "Failed to get PHC feature configuration, error: %d\n",
+			   ret);
+		return ret;
+	}
+
+	/* Supporting only readless PHC retrieval */
+	if (get_feat_resp.u.phc.type != ENA_ADMIN_PHC_TYPE_READLESS) {
+		netdev_err(ena_dev->net_device,
+			   "Unsupported PHC type, error: %d\n", -EOPNOTSUPP);
+		return -EOPNOTSUPP;
+	}
+
+	/* Update PHC doorbell offset according to device value, used to write req_id to PHC bar */
+	phc->doorbell_offset = get_feat_resp.u.phc.doorbell_offset;
+
+	/* Update PHC expire timeout according to device or default driver value */
+	phc->expire_timeout_usec = (get_feat_resp.u.phc.expire_timeout_usec) ?
+					   get_feat_resp.u.phc.expire_timeout_usec :
+					   ENA_PHC_DEFAULT_EXPIRE_TIMEOUT_USEC;
+
+	/* Update PHC block timeout according to device or default driver value */
+	phc->block_timeout_usec = (get_feat_resp.u.phc.block_timeout_usec) ?
+					  get_feat_resp.u.phc.block_timeout_usec :
+					  ENA_PHC_DEFAULT_BLOCK_TIMEOUT_USEC;
+
+	/* Sanity check - expire timeout must not be above block timeout */
+	if (phc->expire_timeout_usec > phc->block_timeout_usec)
+		phc->expire_timeout_usec = phc->block_timeout_usec;
+
+	/* Prepare PHC feature command with PHC output address */
+	memset(&set_feat_cmd, 0x0, sizeof(set_feat_cmd));
+	set_feat_cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE;
+	set_feat_cmd.feat_common.feature_id = ENA_ADMIN_PHC_CONFIG;
+	set_feat_cmd.u.phc.output_length = sizeof(*phc->virt_addr);
+	ret = ena_com_mem_addr_set(ena_dev, &set_feat_cmd.u.phc.output_address, phc->phys_addr);
+	if (unlikely(ret)) {
+		netdev_err(ena_dev->net_device,
+			   "Failed setting PHC output address, error: %d\n",
+			   ret);
+		return ret;
+	}
+
+	/* Send PHC feature command to the device */
+	ret = ena_com_execute_admin_command(&ena_dev->admin_queue,
+					    (struct ena_admin_aq_entry *)&set_feat_cmd,
+					    sizeof(set_feat_cmd),
+					    (struct ena_admin_acq_entry *)&set_feat_resp,
+					    sizeof(set_feat_resp));
+
+	if (unlikely(ret)) {
+		netdev_err(ena_dev->net_device,
+			   "Failed to enable PHC, error: %d\n", ret);
+		return ret;
+	}
+
+	phc->enabled = true;
+	netdev_dbg(ena_dev->net_device, "PHC is enabled\n");
+
+	return ret;
+}
+
+void ena_com_phc_destroy(struct ena_com_dev *ena_dev)
+{
+	struct ena_com_phc_info *phc = &ena_dev->phc;
+
+	phc->enabled = false;
+
+	/* In case PHC is not supported by the device, silently exiting */
+	if (!phc->virt_addr)
+		return;
+
+	dma_free_coherent(ena_dev->dmadev, sizeof(*phc->virt_addr),
+			  phc->virt_addr, phc->phys_addr);
+	phc->virt_addr = NULL;
+}
+
+int ena_com_phc_get(struct ena_com_dev *ena_dev, u64 *timestamp)
+{
+	volatile struct ena_admin_phc_resp *read_resp = ena_dev->phc.virt_addr;
+	struct ena_com_phc_info *phc = &ena_dev->phc;
+	ktime_t initial_time = ktime_set(0, 0);
+	static ktime_t start_time;
+	unsigned long flags = 0;
+	ktime_t expire_time;
+	ktime_t block_time;
+	int ret = 0;
+
+	if (!phc->enabled) {
+		netdev_err(ena_dev->net_device, "PHC feature is not enabled\n");
+		return -EOPNOTSUPP;
+	}
+
+	spin_lock_irqsave(&phc->lock, flags);
+
+	/* Check if PHC is in blocked state */
+	if (unlikely(ktime_compare(start_time, initial_time))) {
+		/* Check if blocking time expired */
+		block_time = ktime_add_us(start_time, phc->block_timeout_usec);
+		if (!ktime_after(ktime_get(), block_time)) {
+			/* PHC is still in blocked state, skip PHC request */
+			phc->stats.phc_skp++;
+			ret = -EBUSY;
+			goto skip;
+		}
+
+		/* PHC is in active state, update statistics according to req_id and timestamp */
+		if ((READ_ONCE(read_resp->req_id) != phc->req_id) ||
+		    (read_resp->timestamp == ENA_PHC_TIMESTAMP_ERROR)) {
+			/* Device didn't update req_id during blocking time or timestamp is invalid,
+			 * this indicates a device error
+			 */
+			phc->stats.phc_err++;
+		} else {
+			/* Device updated req_id during blocking time with valid timestamp */
+			phc->stats.phc_exp++;
+		}
+	}
+
+	/* Setting relative timeouts */
+	start_time = ktime_get();
+	block_time = ktime_add_us(start_time, phc->block_timeout_usec);
+	expire_time = ktime_add_us(start_time, phc->expire_timeout_usec);
+
+	/* We expect the device to return this req_id once the new PHC timestamp is updated */
+	phc->req_id++;
+
+	/* Initialize PHC shared memory with different req_id value to be able to identify once the
+	 * device changes it to req_id
+	 */
+	read_resp->req_id = phc->req_id + ENA_PHC_REQ_ID_OFFSET;
+
+	/* Writing req_id to PHC bar */
+	writel(phc->req_id, ena_dev->reg_bar + phc->doorbell_offset);
+
+	/* Stalling until the device updates req_id */
+	while (1) {
+		if (unlikely(ktime_after(ktime_get(), expire_time))) {
+			/* Gave up waiting for updated req_id, PHC enters into
+			 * blocked state until passing blocking time
+			 */
+			ret = -EBUSY;
+			break;
+		}
+
+		/* Check if req_id was updated by the device */
+		if (READ_ONCE(read_resp->req_id) != phc->req_id) {
+			/* req_id was not updated by the device, check again on next loop */
+			continue;
+		}
+
+		/* req_id was updated which indicates that PHC timestamp was updated too */
+		*timestamp = read_resp->timestamp;
+
+		/* PHC timestamp validity check */
+		if (unlikely(*timestamp == ENA_PHC_TIMESTAMP_ERROR)) {
+			/* Retrieved invalid PHC timestamp, PHC enters into
+			 * blocked state until passing blocking time
+			 */
+			ret = -EBUSY;
+			break;
+		}
+
+		/* Retrieved valid PHC timestamp */
+		phc->stats.phc_cnt++;
+
+		/* This indicates PHC state is active */
+		start_time = initial_time;
+		break;
+	}
+
+skip:
+	spin_unlock_irqrestore(&phc->lock, flags);
+
+	return ret;
+}
+
+int ena_com_mmio_reg_read_request_init(struct ena_com_dev *ena_dev)
+{
+	struct ena_com_mmio_read *mmio_read = &ena_dev->mmio_read;
+
+	spin_lock_init(&mmio_read->lock);
+	mmio_read->read_resp =
+		dma_zalloc_coherent(ena_dev->dmadev,
+				    sizeof(*mmio_read->read_resp),
+				    &mmio_read->read_resp_dma_addr, GFP_KERNEL);
+	if (unlikely(!mmio_read->read_resp))
+		goto err;
+
+	ena_com_mmio_reg_read_request_write_dev_addr(ena_dev);
+
+	mmio_read->read_resp->req_id = 0x0;
+	mmio_read->seq_num = 0x0;
+	mmio_read->readless_supported = true;
+
+	return 0;
+
+err:
+
+	return -ENOMEM;
+}
+
+void ena_com_set_mmio_read_mode(struct ena_com_dev *ena_dev, bool readless_supported)
+{
+	struct ena_com_mmio_read *mmio_read = &ena_dev->mmio_read;
+
+	mmio_read->readless_supported = readless_supported;
+}
+
+void ena_com_mmio_reg_read_request_destroy(struct ena_com_dev *ena_dev)
+{
+	struct ena_com_mmio_read *mmio_read = &ena_dev->mmio_read;
+
+	writel(0x0, ena_dev->reg_bar + ENA_REGS_MMIO_RESP_LO_OFF);
+	writel(0x0, ena_dev->reg_bar + ENA_REGS_MMIO_RESP_HI_OFF);
+
+	dma_free_coherent(ena_dev->dmadev, sizeof(*mmio_read->read_resp),
+			  mmio_read->read_resp, mmio_read->read_resp_dma_addr);
+
+	mmio_read->read_resp = NULL;
+}
+
+void ena_com_mmio_reg_read_request_write_dev_addr(struct ena_com_dev *ena_dev)
+{
+	struct
ena_com_mmio_read *mmio_read = &ena_dev->mmio_read; + u32 addr_low, addr_high; + + addr_low = ENA_DMA_ADDR_TO_UINT32_LOW(mmio_read->read_resp_dma_addr); + addr_high = ENA_DMA_ADDR_TO_UINT32_HIGH(mmio_read->read_resp_dma_addr); + + writel(addr_low, ena_dev->reg_bar + ENA_REGS_MMIO_RESP_LO_OFF); + writel(addr_high, ena_dev->reg_bar + ENA_REGS_MMIO_RESP_HI_OFF); +} + +int ena_com_admin_init(struct ena_com_dev *ena_dev, + struct ena_aenq_handlers *aenq_handlers) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + u32 aq_caps, acq_caps, dev_sts, addr_low, addr_high; + int ret; + + dev_sts = ena_com_reg_bar_read32(ena_dev, ENA_REGS_DEV_STS_OFF); + + if (unlikely(dev_sts == ENA_MMIO_READ_TIMEOUT)) { + netdev_err(ena_dev->net_device, "Reg read timeout occurred\n"); + return -ETIME; + } + + if (!(dev_sts & ENA_REGS_DEV_STS_READY_MASK)) { + netdev_err(ena_dev->net_device, + "Device isn't ready, abort com init\n"); + return -ENODEV; + } + + admin_queue->q_depth = ENA_ADMIN_QUEUE_DEPTH; + + admin_queue->bus = ena_dev->bus; + admin_queue->q_dmadev = ena_dev->dmadev; + admin_queue->polling = false; + admin_queue->curr_cmd_id = 0; + + atomic_set(&admin_queue->outstanding_cmds, 0); + + spin_lock_init(&admin_queue->q_lock); + + ret = ena_com_init_comp_ctxt(admin_queue); + if (ret) + goto error; + + ret = ena_com_admin_init_sq(admin_queue); + if (ret) + goto error; + + ret = ena_com_admin_init_cq(admin_queue); + if (ret) + goto error; + + admin_queue->sq.db_addr = (u32 __iomem *)((uintptr_t)ena_dev->reg_bar + + ENA_REGS_AQ_DB_OFF); + + addr_low = ENA_DMA_ADDR_TO_UINT32_LOW(admin_queue->sq.dma_addr); + addr_high = ENA_DMA_ADDR_TO_UINT32_HIGH(admin_queue->sq.dma_addr); + + writel(addr_low, ena_dev->reg_bar + ENA_REGS_AQ_BASE_LO_OFF); + writel(addr_high, ena_dev->reg_bar + ENA_REGS_AQ_BASE_HI_OFF); + + addr_low = ENA_DMA_ADDR_TO_UINT32_LOW(admin_queue->cq.dma_addr); + addr_high = ENA_DMA_ADDR_TO_UINT32_HIGH(admin_queue->cq.dma_addr); + + writel(addr_low, ena_dev->reg_bar + ENA_REGS_ACQ_BASE_LO_OFF); + writel(addr_high, ena_dev->reg_bar + ENA_REGS_ACQ_BASE_HI_OFF); + + aq_caps = 0; + aq_caps |= admin_queue->q_depth & ENA_REGS_AQ_CAPS_AQ_DEPTH_MASK; + aq_caps |= (sizeof(struct ena_admin_aq_entry) << + ENA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_SHIFT) & + ENA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_MASK; + + acq_caps = 0; + acq_caps |= admin_queue->q_depth & ENA_REGS_ACQ_CAPS_ACQ_DEPTH_MASK; + acq_caps |= (sizeof(struct ena_admin_acq_entry) << + ENA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_SHIFT) & + ENA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_MASK; + + writel(aq_caps, ena_dev->reg_bar + ENA_REGS_AQ_CAPS_OFF); + writel(acq_caps, ena_dev->reg_bar + ENA_REGS_ACQ_CAPS_OFF); + ret = ena_com_admin_init_aenq(ena_dev, aenq_handlers); + if (ret) + goto error; + + admin_queue->ena_dev = ena_dev; + admin_queue->running_state = true; + + return 0; +error: + ena_com_admin_destroy(ena_dev); + + return ret; +} + +int ena_com_create_io_queue(struct ena_com_dev *ena_dev, + struct ena_com_create_io_ctx *ctx) +{ + struct ena_com_io_sq *io_sq; + struct ena_com_io_cq *io_cq; + int ret; + + if (ctx->qid >= ENA_TOTAL_NUM_QUEUES) { + netdev_err(ena_dev->net_device, + "Qid (%d) is bigger than max num of queues (%d)\n", + ctx->qid, ENA_TOTAL_NUM_QUEUES); + return -EINVAL; + } + + io_sq = &ena_dev->io_sq_queues[ctx->qid]; + io_cq = &ena_dev->io_cq_queues[ctx->qid]; + + memset(io_sq, 0x0, sizeof(*io_sq)); + memset(io_cq, 0x0, sizeof(*io_cq)); + + /* Init CQ */ + io_cq->q_depth = ctx->queue_size; + io_cq->direction = ctx->direction; + io_cq->qid = ctx->qid; + + 
io_cq->msix_vector = ctx->msix_vector;
+
+	io_sq->q_depth = ctx->queue_size;
+	io_sq->direction = ctx->direction;
+	io_sq->qid = ctx->qid;
+
+	io_sq->mem_queue_type = ctx->mem_queue_type;
+
+	if (ctx->direction == ENA_COM_IO_QUEUE_DIRECTION_TX)
+		/* header length is limited to 8 bits */
+		io_sq->tx_max_header_size =
+			min_t(u32, ena_dev->tx_max_header_size, SZ_256);
+
+	ret = ena_com_init_io_sq(ena_dev, ctx, io_sq);
+	if (ret)
+		goto error;
+	ret = ena_com_init_io_cq(ena_dev, ctx, io_cq);
+	if (ret)
+		goto error;
+
+	ret = ena_com_create_io_cq(ena_dev, io_cq);
+	if (ret)
+		goto error;
+
+	ret = ena_com_create_io_sq(ena_dev, io_sq, io_cq->idx);
+	if (ret)
+		goto destroy_io_cq;
+
+	return 0;
+
+destroy_io_cq:
+	ena_com_destroy_io_cq(ena_dev, io_cq);
+error:
+	ena_com_io_queue_free(ena_dev, io_sq, io_cq);
+	return ret;
+}
+
+void ena_com_destroy_io_queue(struct ena_com_dev *ena_dev, u16 qid)
+{
+	struct ena_com_io_sq *io_sq;
+	struct ena_com_io_cq *io_cq;
+
+	if (qid >= ENA_TOTAL_NUM_QUEUES) {
+		netdev_err(ena_dev->net_device,
+			   "Qid (%d) is bigger than max num of queues (%d)\n",
+			   qid, ENA_TOTAL_NUM_QUEUES);
+		return;
+	}
+
+	io_sq = &ena_dev->io_sq_queues[qid];
+	io_cq = &ena_dev->io_cq_queues[qid];
+
+	ena_com_destroy_io_sq(ena_dev, io_sq);
+	ena_com_destroy_io_cq(ena_dev, io_cq);
+
+	ena_com_io_queue_free(ena_dev, io_sq, io_cq);
+}
+
+int ena_com_get_link_params(struct ena_com_dev *ena_dev,
+			    struct ena_admin_get_feat_resp *resp)
+{
+	return ena_com_get_feature(ena_dev, resp, ENA_ADMIN_LINK_CONFIG, 0);
+}
+
+int ena_com_get_dev_attr_feat(struct ena_com_dev *ena_dev,
+			      struct ena_com_dev_get_features_ctx *get_feat_ctx)
+{
+	struct ena_admin_get_feat_resp get_resp;
+	int rc;
+
+	rc = ena_com_get_feature(ena_dev, &get_resp,
+				 ENA_ADMIN_DEVICE_ATTRIBUTES, 0);
+	if (rc)
+		return rc;
+
+	memcpy(&get_feat_ctx->dev_attr, &get_resp.u.dev_attr,
+	       sizeof(get_resp.u.dev_attr));
+
+	ena_dev->supported_features = get_resp.u.dev_attr.supported_features;
+	ena_dev->capabilities = get_resp.u.dev_attr.capabilities;
+
+	if (ena_dev->supported_features & BIT(ENA_ADMIN_MAX_QUEUES_EXT)) {
+		rc = ena_com_get_feature(ena_dev, &get_resp,
+					 ENA_ADMIN_MAX_QUEUES_EXT,
+					 ENA_FEATURE_MAX_QUEUE_EXT_VER);
+		if (rc)
+			return rc;
+
+		if (get_resp.u.max_queue_ext.version !=
+		    ENA_FEATURE_MAX_QUEUE_EXT_VER)
+			return -EINVAL;
+
+		memcpy(&get_feat_ctx->max_queue_ext, &get_resp.u.max_queue_ext,
+		       sizeof(get_resp.u.max_queue_ext));
+		ena_dev->tx_max_header_size =
+			get_resp.u.max_queue_ext.max_queue_ext.max_tx_header_size;
+	} else {
+		rc = ena_com_get_feature(ena_dev, &get_resp,
+					 ENA_ADMIN_MAX_QUEUES_NUM, 0);
+		memcpy(&get_feat_ctx->max_queues, &get_resp.u.max_queue,
+		       sizeof(get_resp.u.max_queue));
+		ena_dev->tx_max_header_size =
+			get_resp.u.max_queue.max_header_size;
+
+		if (rc)
+			return rc;
+	}
+
+	rc = ena_com_get_feature(ena_dev, &get_resp,
+				 ENA_ADMIN_AENQ_CONFIG, 0);
+	if (rc)
+		return rc;
+
+	memcpy(&get_feat_ctx->aenq, &get_resp.u.aenq,
+	       sizeof(get_resp.u.aenq));
+
+	rc = ena_com_get_feature(ena_dev, &get_resp,
+				 ENA_ADMIN_STATELESS_OFFLOAD_CONFIG, 0);
+	if (rc)
+		return rc;
+
+	memcpy(&get_feat_ctx->offload, &get_resp.u.offload,
+	       sizeof(get_resp.u.offload));
+
+	/* Driver hints isn't a mandatory admin command. So in case the
+	 * command isn't supported set driver hints to 0
+	 */
+	rc = ena_com_get_feature(ena_dev, &get_resp, ENA_ADMIN_HW_HINTS, 0);
+
+	if (!rc)
+		memcpy(&get_feat_ctx->hw_hints, &get_resp.u.hw_hints,
+		       sizeof(get_resp.u.hw_hints));
+	else if (rc == -EOPNOTSUPP)
+		memset(&get_feat_ctx->hw_hints, 0x0,
+		       sizeof(get_feat_ctx->hw_hints));
+	else
+		return rc;
+
+	rc = ena_com_get_feature(ena_dev, &get_resp, ENA_ADMIN_LLQ, 0);
+	if (!rc)
+		memcpy(&get_feat_ctx->llq, &get_resp.u.llq,
+		       sizeof(get_resp.u.llq));
+	else if (rc == -EOPNOTSUPP)
+		memset(&get_feat_ctx->llq, 0x0, sizeof(get_feat_ctx->llq));
+	else
+		return rc;
+
+	return 0;
+}
+
+void ena_com_admin_q_comp_intr_handler(struct ena_com_dev *ena_dev)
+{
+	ena_com_handle_admin_completion(&ena_dev->admin_queue);
+}
+
+/* ena_com_get_specific_aenq_cb:
+ * return the handler that is relevant to the specific event group
+ */
+static ena_aenq_handler ena_com_get_specific_aenq_cb(struct ena_com_dev *ena_dev,
+						     u16 group)
+{
+	struct ena_aenq_handlers *aenq_handlers = ena_dev->aenq.aenq_handlers;
+
+	if ((group < ENA_MAX_HANDLERS) && aenq_handlers->handlers[group])
+		return aenq_handlers->handlers[group];
+
+	return aenq_handlers->unimplemented_handler;
+}
+
+/* ena_com_aenq_intr_handler:
+ * handles the aenq incoming events.
+ * pop events from the queue and apply the specific handler
+ */
+void ena_com_aenq_intr_handler(struct ena_com_dev *ena_dev, void *data)
+{
+	struct ena_admin_aenq_entry *aenq_e;
+	struct ena_admin_aenq_common_desc *aenq_common;
+	struct ena_com_aenq *aenq = &ena_dev->aenq;
+	u64 timestamp;
+	ena_aenq_handler handler_cb;
+	u16 masked_head, processed = 0;
+	u8 phase;
+
+	masked_head = aenq->head & (aenq->q_depth - 1);
+	phase = aenq->phase;
+	aenq_e = &aenq->entries[masked_head]; /* Get first entry */
+	aenq_common = &aenq_e->aenq_common_desc;
+
+	/* Go over all the events */
+	while ((READ_ONCE(aenq_common->flags) &
+		ENA_ADMIN_AENQ_COMMON_DESC_PHASE_MASK) == phase) {
+		/* Make sure the phase bit (ownership) is as expected before
+		 * reading the rest of the descriptor.
+		 */
+		dma_rmb();
+
+		timestamp = (u64)aenq_common->timestamp_low |
+			    ((u64)aenq_common->timestamp_high << 32);
+
+		netdev_dbg(ena_dev->net_device,
+			   "AENQ! 
Group[%x] Syndrome[%x] timestamp: [%llus]\n", + aenq_common->group, aenq_common->syndrome, timestamp); + + /* Handle specific event*/ + handler_cb = ena_com_get_specific_aenq_cb(ena_dev, + aenq_common->group); + handler_cb(data, aenq_e); /* call the actual event handler*/ + + /* Get next event entry */ + masked_head++; + processed++; + + if (unlikely(masked_head == aenq->q_depth)) { + masked_head = 0; + phase = !phase; + } + aenq_e = &aenq->entries[masked_head]; + aenq_common = &aenq_e->aenq_common_desc; + } + + aenq->head += processed; + aenq->phase = phase; + + /* Don't update aenq doorbell if there weren't any processed events */ + if (!processed) + return; + + /* write the aenq doorbell after all AENQ descriptors were read */ + mb(); + writel_relaxed((u32)aenq->head, + ena_dev->reg_bar + ENA_REGS_AENQ_HEAD_DB_OFF); +#ifndef MMIOWB_NOT_DEFINED + mmiowb(); +#endif +} + +int ena_com_dev_reset(struct ena_com_dev *ena_dev, + enum ena_regs_reset_reason_types reset_reason) +{ + u32 stat, timeout, cap, reset_val; + int rc; + + stat = ena_com_reg_bar_read32(ena_dev, ENA_REGS_DEV_STS_OFF); + cap = ena_com_reg_bar_read32(ena_dev, ENA_REGS_CAPS_OFF); + + if (unlikely((stat == ENA_MMIO_READ_TIMEOUT) || + (cap == ENA_MMIO_READ_TIMEOUT))) { + netdev_err(ena_dev->net_device, "Reg read32 timeout occurred\n"); + return -ETIME; + } + + if ((stat & ENA_REGS_DEV_STS_READY_MASK) == 0) { + netdev_err(ena_dev->net_device, + "Device isn't ready, can't reset device\n"); + return -EINVAL; + } + + timeout = (cap & ENA_REGS_CAPS_RESET_TIMEOUT_MASK) >> + ENA_REGS_CAPS_RESET_TIMEOUT_SHIFT; + if (timeout == 0) { + netdev_err(ena_dev->net_device, "Invalid timeout value\n"); + return -EINVAL; + } + + /* start reset */ + reset_val = ENA_REGS_DEV_CTL_DEV_RESET_MASK; + reset_val |= (reset_reason << ENA_REGS_DEV_CTL_RESET_REASON_SHIFT) & + ENA_REGS_DEV_CTL_RESET_REASON_MASK; + writel(reset_val, ena_dev->reg_bar + ENA_REGS_DEV_CTL_OFF); + + /* Write again the MMIO read request address */ + ena_com_mmio_reg_read_request_write_dev_addr(ena_dev); + + rc = wait_for_reset_state(ena_dev, timeout, + ENA_REGS_DEV_STS_RESET_IN_PROGRESS_MASK); + if (rc != 0) { + netdev_err(ena_dev->net_device, + "Reset indication didn't turn on\n"); + return rc; + } + + /* reset done */ + writel(0, ena_dev->reg_bar + ENA_REGS_DEV_CTL_OFF); + rc = wait_for_reset_state(ena_dev, timeout, 0); + if (rc != 0) { + netdev_err(ena_dev->net_device, + "Reset indication didn't turn off\n"); + return rc; + } + + timeout = (cap & ENA_REGS_CAPS_ADMIN_CMD_TO_MASK) >> + ENA_REGS_CAPS_ADMIN_CMD_TO_SHIFT; + if (timeout) + /* the resolution of timeout reg is 100ms */ + ena_dev->admin_queue.completion_timeout = timeout * 100000; + else + ena_dev->admin_queue.completion_timeout = ADMIN_CMD_TIMEOUT_US; + + return 0; +} + +static int ena_get_dev_stats(struct ena_com_dev *ena_dev, + struct ena_com_stats_ctx *ctx, + enum ena_admin_get_stats_type type) +{ + struct ena_admin_aq_get_stats_cmd *get_cmd = &ctx->get_cmd; + struct ena_admin_acq_get_stats_resp *get_resp = &ctx->get_resp; + struct ena_com_admin_queue *admin_queue; + int ret; + + admin_queue = &ena_dev->admin_queue; + + get_cmd->aq_common_descriptor.opcode = ENA_ADMIN_GET_STATS; + get_cmd->aq_common_descriptor.flags = 0; + get_cmd->type = type; + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)get_cmd, + sizeof(*get_cmd), + (struct ena_admin_acq_entry *)get_resp, + sizeof(*get_resp)); + + if (unlikely(ret)) + netdev_err(ena_dev->net_device, + "Failed to get stats. 
error: %d\n", ret); + + return ret; +} + +int ena_com_get_eni_stats(struct ena_com_dev *ena_dev, + struct ena_admin_eni_stats *stats) +{ + struct ena_com_stats_ctx ctx; + int ret; + + if (!ena_com_get_cap(ena_dev, ENA_ADMIN_ENI_STATS)) { + netdev_err(ena_dev->net_device, + "Capability %d isn't supported\n", + ENA_ADMIN_ENI_STATS); + return -EOPNOTSUPP; + } + + memset(&ctx, 0x0, sizeof(ctx)); + ret = ena_get_dev_stats(ena_dev, &ctx, ENA_ADMIN_GET_STATS_TYPE_ENI); + if (likely(ret == 0)) + memcpy(stats, &ctx.get_resp.u.eni_stats, + sizeof(ctx.get_resp.u.eni_stats)); + + return ret; +} + +int ena_com_get_ena_srd_info(struct ena_com_dev *ena_dev, + struct ena_admin_ena_srd_info *info) +{ + struct ena_com_stats_ctx ctx; + int ret; + + if (!ena_com_get_cap(ena_dev, ENA_ADMIN_ENA_SRD_INFO)) { + netdev_err(ena_dev->net_device, + "Capability %d isn't supported\n", + ENA_ADMIN_ENA_SRD_INFO); + return -EOPNOTSUPP; + } + + memset(&ctx, 0x0, sizeof(ctx)); + ret = ena_get_dev_stats(ena_dev, &ctx, ENA_ADMIN_GET_STATS_TYPE_ENA_SRD); + if (likely(ret == 0)) + memcpy(info, &ctx.get_resp.u.ena_srd_info, + sizeof(ctx.get_resp.u.ena_srd_info)); + + return ret; +} + +int ena_com_get_dev_basic_stats(struct ena_com_dev *ena_dev, + struct ena_admin_basic_stats *stats) +{ + struct ena_com_stats_ctx ctx; + int ret; + + memset(&ctx, 0x0, sizeof(ctx)); + ret = ena_get_dev_stats(ena_dev, &ctx, ENA_ADMIN_GET_STATS_TYPE_BASIC); + if (likely(ret == 0)) + memcpy(stats, &ctx.get_resp.u.basic_stats, + sizeof(ctx.get_resp.u.basic_stats)); + + return ret; +} + +int ena_com_set_dev_mtu(struct ena_com_dev *ena_dev, u32 mtu) +{ + struct ena_com_admin_queue *admin_queue; + struct ena_admin_set_feat_cmd cmd; + struct ena_admin_set_feat_resp resp; + int ret; + + if (!ena_com_check_supported_feature_id(ena_dev, ENA_ADMIN_MTU)) { + netdev_dbg(ena_dev->net_device, "Feature %d isn't supported\n", + ENA_ADMIN_MTU); + return -EOPNOTSUPP; + } + + memset(&cmd, 0x0, sizeof(cmd)); + admin_queue = &ena_dev->admin_queue; + + cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE; + cmd.aq_common_descriptor.flags = 0; + cmd.feat_common.feature_id = ENA_ADMIN_MTU; + cmd.u.mtu.mtu = mtu; + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct ena_admin_acq_entry *)&resp, + sizeof(resp)); + + if (unlikely(ret)) + netdev_err(ena_dev->net_device, + "Failed to set mtu %d. 
error: %d\n", mtu, ret); + + return ret; +} + +int ena_com_get_offload_settings(struct ena_com_dev *ena_dev, + struct ena_admin_feature_offload_desc *offload) +{ + int ret; + struct ena_admin_get_feat_resp resp; + + ret = ena_com_get_feature(ena_dev, &resp, + ENA_ADMIN_STATELESS_OFFLOAD_CONFIG, 0); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, + "Failed to get offload capabilities %d\n", ret); + return ret; + } + + memcpy(offload, &resp.u.offload, sizeof(resp.u.offload)); + + return 0; +} + +int ena_com_set_hash_function(struct ena_com_dev *ena_dev) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_set_feat_cmd cmd; + struct ena_admin_set_feat_resp resp; + struct ena_admin_get_feat_resp get_resp; + int ret; + + if (!ena_com_check_supported_feature_id(ena_dev, + ENA_ADMIN_RSS_HASH_FUNCTION)) { + netdev_dbg(ena_dev->net_device, "Feature %d isn't supported\n", + ENA_ADMIN_RSS_HASH_FUNCTION); + return -EOPNOTSUPP; + } + + /* Validate hash function is supported */ + ret = ena_com_get_feature(ena_dev, &get_resp, + ENA_ADMIN_RSS_HASH_FUNCTION, 0); + if (unlikely(ret)) + return ret; + + if (!(get_resp.u.flow_hash_func.supported_func & BIT(rss->hash_func))) { + netdev_err(ena_dev->net_device, + "Func hash %d isn't supported by device, abort\n", + rss->hash_func); + return -EOPNOTSUPP; + } + + memset(&cmd, 0x0, sizeof(cmd)); + + cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE; + cmd.aq_common_descriptor.flags = + ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK; + cmd.feat_common.feature_id = ENA_ADMIN_RSS_HASH_FUNCTION; + cmd.u.flow_hash_func.init_val = rss->hash_init_val; + cmd.u.flow_hash_func.selected_func = 1 << rss->hash_func; + + ret = ena_com_mem_addr_set(ena_dev, + &cmd.control_buffer.address, + rss->hash_key_dma_addr); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, "Memory address set failed\n"); + return ret; + } + + cmd.control_buffer.length = sizeof(*rss->hash_key); + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct ena_admin_acq_entry *)&resp, + sizeof(resp)); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, + "Failed to set hash function %d. 
error: %d\n", + rss->hash_func, ret); + return -EINVAL; + } + + return 0; +} + +int ena_com_fill_hash_function(struct ena_com_dev *ena_dev, + enum ena_admin_hash_functions func, + const u8 *key, u16 key_len, u32 init_val) +{ + struct ena_admin_feature_rss_flow_hash_control *hash_key; + struct ena_admin_get_feat_resp get_resp; + enum ena_admin_hash_functions old_func; + struct ena_rss *rss = &ena_dev->rss; + int rc; + + hash_key = rss->hash_key; + + /* Make sure size is a mult of DWs */ + if (unlikely(key_len & 0x3)) + return -EINVAL; + + rc = ena_com_get_feature_ex(ena_dev, &get_resp, + ENA_ADMIN_RSS_HASH_FUNCTION, + rss->hash_key_dma_addr, + sizeof(*rss->hash_key), 0); + if (unlikely(rc)) + return rc; + + if (!(BIT(func) & get_resp.u.flow_hash_func.supported_func)) { + netdev_err(ena_dev->net_device, + "Flow hash function %d isn't supported\n", func); + return -EOPNOTSUPP; + } + + if ((func == ENA_ADMIN_TOEPLITZ) && key) { + if (key_len != sizeof(hash_key->key)) { + netdev_err(ena_dev->net_device, + "key len (%u) doesn't equal the supported size (%zu)\n", + key_len, sizeof(hash_key->key)); + return -EINVAL; + } + memcpy(hash_key->key, key, key_len); + hash_key->key_parts = key_len / sizeof(hash_key->key[0]); + } + + rss->hash_init_val = init_val; + old_func = rss->hash_func; + rss->hash_func = func; + rc = ena_com_set_hash_function(ena_dev); + + /* Restore the old function */ + if (unlikely(rc)) + rss->hash_func = old_func; + + return rc; +} + +int ena_com_get_hash_function(struct ena_com_dev *ena_dev, + enum ena_admin_hash_functions *func) +{ + struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_get_feat_resp get_resp; + int rc; + + if (unlikely(!func)) + return -EINVAL; + + rc = ena_com_get_feature_ex(ena_dev, &get_resp, + ENA_ADMIN_RSS_HASH_FUNCTION, + rss->hash_key_dma_addr, + sizeof(*rss->hash_key), 0); + if (unlikely(rc)) + return rc; + + /* ffs() returns 1 in case the lsb is set */ + rss->hash_func = ffs(get_resp.u.flow_hash_func.selected_func); + if (rss->hash_func) + rss->hash_func--; + + *func = rss->hash_func; + + return 0; +} + +int ena_com_get_hash_key(struct ena_com_dev *ena_dev, u8 *key) +{ + struct ena_admin_feature_rss_flow_hash_control *hash_key = + ena_dev->rss.hash_key; + + if (key) + memcpy(key, hash_key->key, + (size_t)(hash_key->key_parts) * sizeof(hash_key->key[0])); + + return 0; +} + +int ena_com_get_hash_ctrl(struct ena_com_dev *ena_dev, + enum ena_admin_flow_hash_proto proto, + u16 *fields) +{ + struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_get_feat_resp get_resp; + int rc; + + rc = ena_com_get_feature_ex(ena_dev, &get_resp, + ENA_ADMIN_RSS_HASH_INPUT, + rss->hash_ctrl_dma_addr, + sizeof(*rss->hash_ctrl), 0); + if (unlikely(rc)) + return rc; + + if (fields) + *fields = rss->hash_ctrl->selected_fields[proto].fields; + + return 0; +} + +int ena_com_set_hash_ctrl(struct ena_com_dev *ena_dev) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_feature_rss_hash_control *hash_ctrl = rss->hash_ctrl; + struct ena_admin_set_feat_cmd cmd; + struct ena_admin_set_feat_resp resp; + int ret; + + if (!ena_com_check_supported_feature_id(ena_dev, + ENA_ADMIN_RSS_HASH_INPUT)) { + netdev_dbg(ena_dev->net_device, "Feature %d isn't supported\n", + ENA_ADMIN_RSS_HASH_INPUT); + return -EOPNOTSUPP; + } + + memset(&cmd, 0x0, sizeof(cmd)); + + cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE; + cmd.aq_common_descriptor.flags = + ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK; + 
cmd.feat_common.feature_id = ENA_ADMIN_RSS_HASH_INPUT;
+	cmd.u.flow_hash_input.enabled_input_sort =
+		ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_L3_SORT_MASK |
+		ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_L4_SORT_MASK;
+
+	ret = ena_com_mem_addr_set(ena_dev,
+				   &cmd.control_buffer.address,
+				   rss->hash_ctrl_dma_addr);
+	if (unlikely(ret)) {
+		netdev_err(ena_dev->net_device, "Memory address set failed\n");
+		return ret;
+	}
+	cmd.control_buffer.length = sizeof(*hash_ctrl);
+
+	ret = ena_com_execute_admin_command(admin_queue,
+					    (struct ena_admin_aq_entry *)&cmd,
+					    sizeof(cmd),
+					    (struct ena_admin_acq_entry *)&resp,
+					    sizeof(resp));
+	if (unlikely(ret))
+		netdev_err(ena_dev->net_device,
+			   "Failed to set hash input. error: %d\n", ret);
+
+	return ret;
+}
+
+int ena_com_set_default_hash_ctrl(struct ena_com_dev *ena_dev)
+{
+	struct ena_rss *rss = &ena_dev->rss;
+	struct ena_admin_feature_rss_hash_control *hash_ctrl =
+		rss->hash_ctrl;
+	u16 available_fields = 0;
+	int rc, i;
+
+	/* Get the supported hash input */
+	rc = ena_com_get_hash_ctrl(ena_dev, 0, NULL);
+	if (unlikely(rc))
+		return rc;
+
+	hash_ctrl->selected_fields[ENA_ADMIN_RSS_TCP4].fields =
+		ENA_ADMIN_RSS_L3_SA | ENA_ADMIN_RSS_L3_DA |
+		ENA_ADMIN_RSS_L4_DP | ENA_ADMIN_RSS_L4_SP;
+
+	hash_ctrl->selected_fields[ENA_ADMIN_RSS_UDP4].fields =
+		ENA_ADMIN_RSS_L3_SA | ENA_ADMIN_RSS_L3_DA |
+		ENA_ADMIN_RSS_L4_DP | ENA_ADMIN_RSS_L4_SP;
+
+	hash_ctrl->selected_fields[ENA_ADMIN_RSS_TCP6].fields =
+		ENA_ADMIN_RSS_L3_SA | ENA_ADMIN_RSS_L3_DA |
+		ENA_ADMIN_RSS_L4_DP | ENA_ADMIN_RSS_L4_SP;
+
+	hash_ctrl->selected_fields[ENA_ADMIN_RSS_UDP6].fields =
+		ENA_ADMIN_RSS_L3_SA | ENA_ADMIN_RSS_L3_DA |
+		ENA_ADMIN_RSS_L4_DP | ENA_ADMIN_RSS_L4_SP;
+
+	hash_ctrl->selected_fields[ENA_ADMIN_RSS_IP4].fields =
+		ENA_ADMIN_RSS_L3_SA | ENA_ADMIN_RSS_L3_DA;
+
+	hash_ctrl->selected_fields[ENA_ADMIN_RSS_IP6].fields =
+		ENA_ADMIN_RSS_L3_SA | ENA_ADMIN_RSS_L3_DA;
+
+	hash_ctrl->selected_fields[ENA_ADMIN_RSS_IP4_FRAG].fields =
+		ENA_ADMIN_RSS_L3_SA | ENA_ADMIN_RSS_L3_DA;
+
+	hash_ctrl->selected_fields[ENA_ADMIN_RSS_NOT_IP].fields =
+		ENA_ADMIN_RSS_L2_DA | ENA_ADMIN_RSS_L2_SA;
+
+	for (i = 0; i < ENA_ADMIN_RSS_PROTO_NUM; i++) {
+		available_fields = hash_ctrl->selected_fields[i].fields &
+				   hash_ctrl->supported_fields[i].fields;
+		if (available_fields != hash_ctrl->selected_fields[i].fields) {
+			netdev_err(ena_dev->net_device,
+				   "Hash control doesn't support all the desired configuration. proto %x supported %x selected %x\n",
+				   i, hash_ctrl->supported_fields[i].fields,
+				   hash_ctrl->selected_fields[i].fields);
+			return -EOPNOTSUPP;
+		}
+	}
+
+	rc = ena_com_set_hash_ctrl(ena_dev);
+
+	/* In case of failure, restore the old hash ctrl */
+	if (unlikely(rc))
+		ena_com_get_hash_ctrl(ena_dev, 0, NULL);
+
+	return rc;
+}
+
+int ena_com_fill_hash_ctrl(struct ena_com_dev *ena_dev,
+			   enum ena_admin_flow_hash_proto proto,
+			   u16 hash_fields)
+{
+	struct ena_rss *rss = &ena_dev->rss;
+	struct ena_admin_feature_rss_hash_control *hash_ctrl = rss->hash_ctrl;
+	u16 supported_fields;
+	int rc;
+
+	if (proto >= ENA_ADMIN_RSS_PROTO_NUM) {
+		netdev_err(ena_dev->net_device, "Invalid proto num (%u)\n",
+			   proto);
+		return -EINVAL;
+	}
+
+	/* Get the ctrl table */
+	rc = ena_com_get_hash_ctrl(ena_dev, proto, NULL);
+	if (unlikely(rc))
+		return rc;
+
+	/* Make sure all the fields are supported */
+	supported_fields = hash_ctrl->supported_fields[proto].fields;
+	if ((hash_fields & supported_fields) != hash_fields) {
+		netdev_err(ena_dev->net_device,
+			   "Proto %d doesn't support the required fields %x. 
supports only: %x\n", + proto, hash_fields, supported_fields); + } + + hash_ctrl->selected_fields[proto].fields = hash_fields; + + rc = ena_com_set_hash_ctrl(ena_dev); + + /* In case of failure, restore the old hash ctrl */ + if (unlikely(rc)) + ena_com_get_hash_ctrl(ena_dev, 0, NULL); + + return 0; +} + +int ena_com_indirect_table_fill_entry(struct ena_com_dev *ena_dev, + u16 entry_idx, u16 entry_value) +{ + struct ena_rss *rss = &ena_dev->rss; + + if (unlikely(entry_idx >= (1 << rss->tbl_log_size))) + return -EINVAL; + + if (unlikely((entry_value > ENA_TOTAL_NUM_QUEUES))) + return -EINVAL; + + rss->host_rss_ind_tbl[entry_idx] = entry_value; + + return 0; +} + +int ena_com_indirect_table_set(struct ena_com_dev *ena_dev) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_set_feat_cmd cmd; + struct ena_admin_set_feat_resp resp; + int ret; + + if (!ena_com_check_supported_feature_id( + ena_dev, ENA_ADMIN_RSS_INDIRECTION_TABLE_CONFIG)) { + netdev_dbg(ena_dev->net_device, "Feature %d isn't supported\n", + ENA_ADMIN_RSS_INDIRECTION_TABLE_CONFIG); + return -EOPNOTSUPP; + } + + ret = ena_com_ind_tbl_convert_to_device(ena_dev); + if (ret) { + netdev_err(ena_dev->net_device, + "Failed to convert host indirection table to device table\n"); + return ret; + } + + memset(&cmd, 0x0, sizeof(cmd)); + + cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE; + cmd.aq_common_descriptor.flags = + ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK; + cmd.feat_common.feature_id = ENA_ADMIN_RSS_INDIRECTION_TABLE_CONFIG; + cmd.u.ind_table.size = rss->tbl_log_size; + cmd.u.ind_table.inline_index = 0xFFFFFFFF; + + ret = ena_com_mem_addr_set(ena_dev, + &cmd.control_buffer.address, + rss->rss_ind_tbl_dma_addr); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, "Memory address set failed\n"); + return ret; + } + + cmd.control_buffer.length = (1ULL << rss->tbl_log_size) * + sizeof(struct ena_admin_rss_ind_table_entry); + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct ena_admin_acq_entry *)&resp, + sizeof(resp)); + + if (unlikely(ret)) + netdev_err(ena_dev->net_device, + "Failed to set indirect table. error: %d\n", ret); + + return ret; +} + +int ena_com_indirect_table_get(struct ena_com_dev *ena_dev, u32 *ind_tbl) +{ + struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_get_feat_resp get_resp; + u32 tbl_size; + int i, rc; + + tbl_size = (1ULL << rss->tbl_log_size) * + sizeof(struct ena_admin_rss_ind_table_entry); + + rc = ena_com_get_feature_ex(ena_dev, &get_resp, + ENA_ADMIN_RSS_INDIRECTION_TABLE_CONFIG, + rss->rss_ind_tbl_dma_addr, + tbl_size, 0); + if (unlikely(rc)) + return rc; + + if (!ind_tbl) + return 0; + + for (i = 0; i < (1 << rss->tbl_log_size); i++) + ind_tbl[i] = rss->host_rss_ind_tbl[i]; + + return 0; +} + +int ena_com_rss_init(struct ena_com_dev *ena_dev, u16 indr_tbl_log_size) +{ + int rc; + + memset(&ena_dev->rss, 0x0, sizeof(ena_dev->rss)); + + rc = ena_com_indirect_table_allocate(ena_dev, indr_tbl_log_size); + if (unlikely(rc)) + goto err_indr_tbl; + + /* The following function might return unsupported in case the + * device doesn't support setting the key / hash function. We can safely + * ignore this error and have indirection table support only. 
+ */ + rc = ena_com_hash_key_allocate(ena_dev); + if (likely(!rc)) + ena_com_hash_key_fill_default_key(ena_dev); + else if (rc != -EOPNOTSUPP) + goto err_hash_key; + + rc = ena_com_hash_ctrl_init(ena_dev); + if (unlikely(rc)) + goto err_hash_ctrl; + + return 0; + +err_hash_ctrl: + ena_com_hash_key_destroy(ena_dev); +err_hash_key: + ena_com_indirect_table_destroy(ena_dev); +err_indr_tbl: + + return rc; +} + +void ena_com_rss_destroy(struct ena_com_dev *ena_dev) +{ + ena_com_indirect_table_destroy(ena_dev); + ena_com_hash_key_destroy(ena_dev); + ena_com_hash_ctrl_destroy(ena_dev); + + memset(&ena_dev->rss, 0x0, sizeof(ena_dev->rss)); +} + +int ena_com_allocate_host_info(struct ena_com_dev *ena_dev) +{ + struct ena_host_attribute *host_attr = &ena_dev->host_attr; + + host_attr->host_info = + dma_zalloc_coherent(ena_dev->dmadev, SZ_4K, + &host_attr->host_info_dma_addr, GFP_KERNEL); + if (unlikely(!host_attr->host_info)) + return -ENOMEM; + + host_attr->host_info->ena_spec_version = ((ENA_COMMON_SPEC_VERSION_MAJOR << + ENA_REGS_VERSION_MAJOR_VERSION_SHIFT) | + (ENA_COMMON_SPEC_VERSION_MINOR)); + + return 0; +} + +int ena_com_allocate_debug_area(struct ena_com_dev *ena_dev, + u32 debug_area_size) +{ + struct ena_host_attribute *host_attr = &ena_dev->host_attr; + + host_attr->debug_area_virt_addr = + dma_zalloc_coherent(ena_dev->dmadev, debug_area_size, + &host_attr->debug_area_dma_addr, GFP_KERNEL); + if (unlikely(!host_attr->debug_area_virt_addr)) { + host_attr->debug_area_size = 0; + return -ENOMEM; + } + + host_attr->debug_area_size = debug_area_size; + + return 0; +} + +void ena_com_delete_host_info(struct ena_com_dev *ena_dev) +{ + struct ena_host_attribute *host_attr = &ena_dev->host_attr; + + if (host_attr->host_info) { + dma_free_coherent(ena_dev->dmadev, SZ_4K, host_attr->host_info, + host_attr->host_info_dma_addr); + host_attr->host_info = NULL; + } +} + +void ena_com_delete_debug_area(struct ena_com_dev *ena_dev) +{ + struct ena_host_attribute *host_attr = &ena_dev->host_attr; + + if (host_attr->debug_area_virt_addr) { + dma_free_coherent(ena_dev->dmadev, host_attr->debug_area_size, + host_attr->debug_area_virt_addr, + host_attr->debug_area_dma_addr); + host_attr->debug_area_virt_addr = NULL; + } +} + +int ena_com_set_host_attributes(struct ena_com_dev *ena_dev) +{ + struct ena_host_attribute *host_attr = &ena_dev->host_attr; + struct ena_com_admin_queue *admin_queue; + struct ena_admin_set_feat_cmd cmd; + struct ena_admin_set_feat_resp resp; + + int ret; + + /* Host attribute config is called before ena_com_get_dev_attr_feat + * so ena_com can't check if the feature is supported. 
+ */ + + memset(&cmd, 0x0, sizeof(cmd)); + admin_queue = &ena_dev->admin_queue; + + cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE; + cmd.feat_common.feature_id = ENA_ADMIN_HOST_ATTR_CONFIG; + + ret = ena_com_mem_addr_set(ena_dev, + &cmd.u.host_attr.debug_ba, + host_attr->debug_area_dma_addr); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, "Memory address set failed\n"); + return ret; + } + + ret = ena_com_mem_addr_set(ena_dev, + &cmd.u.host_attr.os_info_ba, + host_attr->host_info_dma_addr); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, "Memory address set failed\n"); + return ret; + } + + cmd.u.host_attr.debug_area_size = host_attr->debug_area_size; + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct ena_admin_acq_entry *)&resp, + sizeof(resp)); + + if (unlikely(ret)) + netdev_err(ena_dev->net_device, + "Failed to set host attributes: %d\n", ret); + + return ret; +} + +/* Interrupt moderation */ +bool ena_com_interrupt_moderation_supported(struct ena_com_dev *ena_dev) +{ + return ena_com_check_supported_feature_id(ena_dev, + ENA_ADMIN_INTERRUPT_MODERATION); +} + +static int ena_com_update_nonadaptive_moderation_interval(struct ena_com_dev *ena_dev, + u32 coalesce_usecs, + u32 intr_delay_resolution, + u32 *intr_moder_interval) +{ + if (!intr_delay_resolution) { + netdev_err(ena_dev->net_device, + "Illegal interrupt delay granularity value\n"); + return -EFAULT; + } + + *intr_moder_interval = coalesce_usecs / intr_delay_resolution; + + return 0; +} + +int ena_com_update_nonadaptive_moderation_interval_tx(struct ena_com_dev *ena_dev, + u32 tx_coalesce_usecs) +{ + return ena_com_update_nonadaptive_moderation_interval(ena_dev, + tx_coalesce_usecs, + ena_dev->intr_delay_resolution, + &ena_dev->intr_moder_tx_interval); +} + +int ena_com_update_nonadaptive_moderation_interval_rx(struct ena_com_dev *ena_dev, + u32 rx_coalesce_usecs) +{ + return ena_com_update_nonadaptive_moderation_interval(ena_dev, + rx_coalesce_usecs, + ena_dev->intr_delay_resolution, + &ena_dev->intr_moder_rx_interval); +} + +int ena_com_init_interrupt_moderation(struct ena_com_dev *ena_dev) +{ + struct ena_admin_get_feat_resp get_resp; + u16 delay_resolution; + int rc; + + rc = ena_com_get_feature(ena_dev, &get_resp, + ENA_ADMIN_INTERRUPT_MODERATION, 0); + + if (rc) { + if (rc == -EOPNOTSUPP) { + netdev_dbg(ena_dev->net_device, + "Feature %d isn't supported\n", + ENA_ADMIN_INTERRUPT_MODERATION); + rc = 0; + } else { + netdev_err(ena_dev->net_device, + "Failed to get interrupt moderation admin cmd. 
rc: %d\n", + rc); + } + + /* no moderation supported, disable adaptive support */ + ena_com_disable_adaptive_moderation(ena_dev); + return rc; + } + + /* if moderation is supported by device we set adaptive moderation */ + delay_resolution = get_resp.u.intr_moderation.intr_delay_resolution; + ena_com_update_intr_delay_resolution(ena_dev, delay_resolution); + + /* Disable adaptive moderation by default - can be enabled later */ + ena_com_disable_adaptive_moderation(ena_dev); + + return 0; +} + +unsigned int ena_com_get_nonadaptive_moderation_interval_tx(struct ena_com_dev *ena_dev) +{ + return ena_dev->intr_moder_tx_interval; +} + +unsigned int ena_com_get_nonadaptive_moderation_interval_rx(struct ena_com_dev *ena_dev) +{ + return ena_dev->intr_moder_rx_interval; +} + +int ena_com_config_dev_mode(struct ena_com_dev *ena_dev, + struct ena_admin_feature_llq_desc *llq_features, + struct ena_llq_configurations *llq_default_cfg) +{ + struct ena_com_llq_info *llq_info = &ena_dev->llq_info; + int rc; + + if (!llq_features->max_llq_num) { + ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; + return 0; + } + + rc = ena_com_config_llq_info(ena_dev, llq_features, llq_default_cfg); + if (rc) + return rc; + + ena_dev->tx_max_header_size = llq_info->desc_list_entry_size - + (llq_info->descs_num_before_header * sizeof(struct ena_eth_io_tx_desc)); + + if (unlikely(ena_dev->tx_max_header_size == 0)) { + netdev_err(ena_dev->net_device, + "The size of the LLQ entry is smaller than needed\n"); + return -EINVAL; + } + + ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_DEV; + + return 0; +} diff --git a/drivers/amazon/net/ena/ena_com.h b/drivers/amazon/net/ena/ena_com.h new file mode 100644 index 0000000000000..ab17ba125ca3c --- /dev/null +++ b/drivers/amazon/net/ena/ena_com.h @@ -0,0 +1,1127 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */
+
+#ifndef ENA_COM
+#define ENA_COM
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "kcompat.h"
+#include "ena_common_defs.h"
+#include "ena_admin_defs.h"
+#include "ena_eth_io_defs.h"
+#include "ena_regs_defs.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#define ENA_MAX_NUM_IO_QUEUES 128U
+/* We need two queues for each IO (one for Tx and one for Rx) */
+#define ENA_TOTAL_NUM_QUEUES (2 * (ENA_MAX_NUM_IO_QUEUES))
+
+#define ENA_MAX_HANDLERS 256
+
+#define ENA_MAX_PHYS_ADDR_SIZE_BITS 48
+
+/* Unit in usec */
+#define ENA_REG_READ_TIMEOUT 200000
+
+#define ADMIN_SQ_SIZE(depth)	((depth) * sizeof(struct ena_admin_aq_entry))
+#define ADMIN_CQ_SIZE(depth)	((depth) * sizeof(struct ena_admin_acq_entry))
+#define ADMIN_AENQ_SIZE(depth)	((depth) * sizeof(struct ena_admin_aenq_entry))
+
+/*****************************************************************************/
+/*****************************************************************************/
+/* ENA adaptive interrupt moderation settings */
+
+#define ENA_INTR_INITIAL_TX_INTERVAL_USECS 64
+#define ENA_INTR_INITIAL_RX_INTERVAL_USECS 0
+#define ENA_DEFAULT_INTR_DELAY_RESOLUTION 1
+
+#define ENA_HASH_KEY_SIZE 40
+
+#define ENA_HW_HINTS_NO_TIMEOUT 0xFFFF
+
+#define ENA_FEATURE_MAX_QUEUE_EXT_VER 1
+
+struct ena_llq_configurations {
+	enum ena_admin_llq_header_location llq_header_location;
+	enum ena_admin_llq_ring_entry_size llq_ring_entry_size;
+	enum ena_admin_llq_stride_ctrl llq_stride_ctrl;
+	enum ena_admin_llq_num_descs_before_header llq_num_decs_before_header;
+	u16 llq_ring_entry_size_value;
+};
+
+enum queue_direction {
+	ENA_COM_IO_QUEUE_DIRECTION_TX,
+	ENA_COM_IO_QUEUE_DIRECTION_RX
+};
+
+struct ena_com_buf {
+	dma_addr_t paddr; /**< Buffer physical address */
+	u16 len; /**< Buffer length in bytes */
+};
+
+struct ena_com_rx_buf_info {
+	u16 len;
+	u16 req_id;
+};
+
+struct ena_com_io_desc_addr {
+	u8 __iomem *pbuf_dev_addr; /* LLQ address */
+	u8 *virt_addr;
+	dma_addr_t phys_addr;
+};
+
+struct ena_com_tx_meta {
+	u16 mss;
+	u16 l3_hdr_len;
+	u16 l3_hdr_offset;
+	u16 l4_hdr_len; /* In words */
+};
+
+struct ena_com_llq_info {
+	u16 header_location_ctrl;
+	u16 desc_stride_ctrl;
+	u16 desc_list_entry_size_ctrl;
+	u16 desc_list_entry_size;
+	u16 descs_num_before_header;
+	u16 descs_per_entry;
+	u16 max_entries_in_tx_burst;
+	bool disable_meta_caching;
+};
+
+struct ena_com_io_cq {
+	struct ena_com_io_desc_addr cdesc_addr;
+	void *bus;
+
+	/* Interrupt unmask register */
+	u32 __iomem *unmask_reg;
+
+	/* numa configuration register (for TPH) */
+	u32 __iomem *numa_node_cfg_reg;
+
+	/* The value to write to the above register to unmask
+	 * the interrupt of this queue
+	 */
+	u32 msix_vector ____cacheline_aligned;
+
+	enum queue_direction direction;
+
+	/* holds the number of cdesc of the current packet */
+	u16 cur_rx_pkt_cdesc_count;
+	/* save the first cdesc idx of the current packet */
+	u16 cur_rx_pkt_cdesc_start_idx;
+
+	u16 q_depth;
+	/* Caller qid */
+	u16 qid;
+
+	/* Device queue index */
+	u16 idx;
+	u16 head;
+	u8 phase;
+	u8 cdesc_entry_size_in_bytes;
+
+} ____cacheline_aligned;
+
+struct ena_com_io_bounce_buffer_control {
+	u8 *base_buffer;
+	u16 next_to_use;
+	u16 buffer_size;
+	u16 buffers_num;  /* Must be a power of 2 */
+};
+
+/* This struct is to keep tracking the current location of the next llq entry */
+struct ena_com_llq_pkt_ctrl {
+	u8 *curr_bounce_buf;
+	u16 idx;
+	u16 descs_left_in_line;
+};
+
+struct ena_com_io_sq {
struct ena_com_io_desc_addr desc_addr; + void *bus; + + u32 __iomem *db_addr; + + enum queue_direction direction; + enum ena_admin_placement_policy_type mem_queue_type; + + bool disable_meta_caching; + + u32 msix_vector; + struct ena_com_tx_meta cached_tx_meta; + struct ena_com_llq_info llq_info; + struct ena_com_llq_pkt_ctrl llq_buf_ctrl; + struct ena_com_io_bounce_buffer_control bounce_buf_ctrl; + + u16 q_depth; + u16 qid; + + u16 idx; + u16 tail; + u16 next_to_comp; + u16 llq_last_copy_tail; + u32 tx_max_header_size; + u8 phase; + u8 desc_entry_size; + u8 dma_addr_bits; + u16 entries_in_tx_burst_left; +} ____cacheline_aligned; + +struct ena_com_admin_cq { + struct ena_admin_acq_entry *entries; + dma_addr_t dma_addr; + + u16 head; + u8 phase; +}; + +struct ena_com_admin_sq { + struct ena_admin_aq_entry *entries; + dma_addr_t dma_addr; + + u32 __iomem *db_addr; + + u16 head; + u16 tail; + u8 phase; + +}; + +struct ena_com_stats_admin { + u64 aborted_cmd; + u64 submitted_cmd; + u64 completed_cmd; + u64 out_of_space; + u64 no_completion; +}; + +struct ena_com_stats_phc { + u64 phc_cnt; + u64 phc_exp; + u64 phc_skp; + u64 phc_err; +}; + +struct ena_com_admin_queue { + void *q_dmadev; + void *bus; + struct ena_com_dev *ena_dev; + spinlock_t q_lock; /* spinlock for the admin queue */ + + struct ena_comp_ctx *comp_ctx; + u32 completion_timeout; + u16 q_depth; + struct ena_com_admin_cq cq; + struct ena_com_admin_sq sq; + + /* Indicate if the admin queue should poll for completion */ + bool polling; + + /* Define if fallback to polling mode should occur */ + bool auto_polling; + + u16 curr_cmd_id; + + /* Indicate that the ena was initialized and can + * process new admin commands + */ + bool running_state; + + /* Count the number of outstanding admin commands */ + atomic_t outstanding_cmds; + + struct ena_com_stats_admin stats; +}; + +struct ena_aenq_handlers; + +struct ena_com_aenq { + u16 head; + u8 phase; + struct ena_admin_aenq_entry *entries; + dma_addr_t dma_addr; + u16 q_depth; + struct ena_aenq_handlers *aenq_handlers; +}; + +struct ena_com_mmio_read { + struct ena_admin_ena_mmio_req_read_less_resp *read_resp; + dma_addr_t read_resp_dma_addr; + u32 reg_read_to; /* in us */ + u16 seq_num; + bool readless_supported; + /* spin lock to ensure a single outstanding read */ + spinlock_t lock; +}; + +/* PTP hardware clock (PHC) MMIO read data info */ +struct ena_com_phc_info { + /* Internal PHC statistics */ + struct ena_com_stats_phc stats; + + /* PHC shared memory - virtual address */ + struct ena_admin_phc_resp *virt_addr; + + /* Spin lock to ensure a single outstanding PHC read */ + spinlock_t lock; + + /* PHC doorbell address as an offset to PCIe MMIO REG BAR */ + u32 doorbell_offset; + + /* Shared memory read expire timeout (usec) + * Max time for valid PHC retrieval, passing this threshold will fail the get time request + * and block new PHC requests for block_timeout_usec in order to prevent floods on busy + * device + */ + u32 expire_timeout_usec; + + /* Shared memory read abort timeout (usec) + * PHC requests block period, blocking starts once PHC request expired in order to prevent + * floods on busy device, any PHC requests during block period will be skipped + */ + u32 block_timeout_usec; + + /* Request id sent to the device */ + u16 req_id; + + /* True if PHC is enabled */ + bool enabled; + + /* PHC shared memory - memory handle */ + + /* PHC shared memory - physical address */ + dma_addr_t phys_addr; +}; + +struct ena_rss { + /* Indirect table */ + u16 *host_rss_ind_tbl; + struct 
ena_admin_rss_ind_table_entry *rss_ind_tbl; + dma_addr_t rss_ind_tbl_dma_addr; + u16 tbl_log_size; + + /* Hash key */ + enum ena_admin_hash_functions hash_func; + struct ena_admin_feature_rss_flow_hash_control *hash_key; + dma_addr_t hash_key_dma_addr; + u32 hash_init_val; + + /* Flow Control */ + struct ena_admin_feature_rss_hash_control *hash_ctrl; + dma_addr_t hash_ctrl_dma_addr; + +}; + +struct ena_host_attribute { + /* Debug area */ + u8 *debug_area_virt_addr; + dma_addr_t debug_area_dma_addr; + u32 debug_area_size; + + /* Host information */ + struct ena_admin_host_info *host_info; + dma_addr_t host_info_dma_addr; +}; + +/* Each ena_dev is a PCI function. */ +struct ena_com_dev { + struct ena_com_admin_queue admin_queue; + struct ena_com_aenq aenq; + struct ena_com_io_cq io_cq_queues[ENA_TOTAL_NUM_QUEUES]; + struct ena_com_io_sq io_sq_queues[ENA_TOTAL_NUM_QUEUES]; + u8 __iomem *reg_bar; + void __iomem *mem_bar; + void *dmadev; + void *bus; + struct net_device *net_device; + + enum ena_admin_placement_policy_type tx_mem_queue_type; + u32 tx_max_header_size; + u16 stats_func; /* Selected function for extended statistic dump */ + u16 stats_queue; /* Selected queue for extended statistic dump */ + + u32 ena_min_poll_delay_us; + + struct ena_com_mmio_read mmio_read; + struct ena_com_phc_info phc; + + struct ena_rss rss; + u32 supported_features; + u32 capabilities; + u32 dma_addr_bits; + + struct ena_host_attribute host_attr; + bool adaptive_coalescing; + u16 intr_delay_resolution; + + /* interrupt moderation intervals are in usec divided by + * intr_delay_resolution, which is supplied by the device. + */ + u32 intr_moder_tx_interval; + u32 intr_moder_rx_interval; + + struct ena_intr_moder_entry *intr_moder_tbl; + + struct ena_com_llq_info llq_info; +}; + +struct ena_com_dev_get_features_ctx { + struct ena_admin_queue_feature_desc max_queues; + struct ena_admin_queue_ext_feature_desc max_queue_ext; + struct ena_admin_device_attr_feature_desc dev_attr; + struct ena_admin_feature_aenq_desc aenq; + struct ena_admin_feature_offload_desc offload; + struct ena_admin_ena_hw_hints hw_hints; + struct ena_admin_feature_llq_desc llq; +}; + +struct ena_com_create_io_ctx { + enum ena_admin_placement_policy_type mem_queue_type; + enum queue_direction direction; + int numa_node; + u32 msix_vector; + u16 queue_size; + u16 qid; +}; + +typedef void (*ena_aenq_handler)(void *data, + struct ena_admin_aenq_entry *aenq_e); + +/* Holds aenq handlers. Indexed by AENQ event group */ +struct ena_aenq_handlers { + ena_aenq_handler handlers[ENA_MAX_HANDLERS]; + ena_aenq_handler unimplemented_handler; +}; + +/*****************************************************************************/ +/*****************************************************************************/ + +/* ena_com_mmio_reg_read_request_init - Init the mmio reg read mechanism + * @ena_dev: ENA communication layer struct + * + * Initialize the register read mechanism. + * + * @note: This method must be the first stage in the initialization sequence. + * + * @return - 0 on success, negative value on failure. 
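+ *
+ * A minimal bring-up sketch (illustrative only; error handling is
+ * trimmed, aenq_handlers is a caller-defined table, and the ordering is
+ * inferred from the @note above):
+ *
+ *	rc = ena_com_mmio_reg_read_request_init(ena_dev);
+ *	if (rc)
+ *		return rc;
+ *
+ *	rc = ena_com_admin_init(ena_dev, &aenq_handlers);
+ *	if (rc)
+ *		ena_com_mmio_reg_read_request_destroy(ena_dev);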
+ */ +int ena_com_mmio_reg_read_request_init(struct ena_com_dev *ena_dev); + +/* ena_com_phc_init - Allocate and initialize PHC feature + * @ena_dev: ENA communication layer struct + * @note: This method assumes PHC is supported by the device + * @return - 0 on success, negative value on failure + */ +int ena_com_phc_init(struct ena_com_dev *ena_dev); + +/* ena_com_phc_supported - Return if PHC feature is supported by the device + * @ena_dev: ENA communication layer struct + * @note: This method must be called after getting supported features + * @return - supported or not + */ +bool ena_com_phc_supported(struct ena_com_dev *ena_dev); + +/* ena_com_phc_config - Configure PHC feature + * @ena_dev: ENA communication layer struct + * Configure PHC feature in driver and device + * @note: This method assumes PHC is supported by the device + * @return - 0 on success, negative value on failure + */ +int ena_com_phc_config(struct ena_com_dev *ena_dev); + +/* ena_com_phc_destroy - Destroy PHC feature + * @ena_dev: ENA communication layer struct + */ +void ena_com_phc_destroy(struct ena_com_dev *ena_dev); + +/* ena_com_phc_get - Retrieve PHC timestamp + * @ena_dev: ENA communication layer struct + * @timestamp: Retrieve PHC timestamp + * @return - 0 on success, negative value on failure + */ +int ena_com_phc_get(struct ena_com_dev *ena_dev, u64 *timestamp); + +/* ena_com_set_mmio_read_mode - Enable/disable the indirect mmio reg read mechanism + * @ena_dev: ENA communication layer struct + * @readless_supported: readless mode (enable/disable) + */ +void ena_com_set_mmio_read_mode(struct ena_com_dev *ena_dev, + bool readless_supported); + +/* ena_com_mmio_reg_read_request_write_dev_addr - Write the mmio reg read return + * value physical address. + * @ena_dev: ENA communication layer struct + */ +void ena_com_mmio_reg_read_request_write_dev_addr(struct ena_com_dev *ena_dev); + +/* ena_com_mmio_reg_read_request_destroy - Destroy the mmio reg read mechanism + * @ena_dev: ENA communication layer struct + */ +void ena_com_mmio_reg_read_request_destroy(struct ena_com_dev *ena_dev); + +/* ena_com_admin_init - Init the admin and the async queues + * @ena_dev: ENA communication layer struct + * @aenq_handlers: Those handlers to be called upon event. + * + * Initialize the admin submission and completion queues. + * Initialize the asynchronous events notification queues. + * + * @return - 0 on success, negative value on failure. + */ +int ena_com_admin_init(struct ena_com_dev *ena_dev, + struct ena_aenq_handlers *aenq_handlers); + +/* ena_com_admin_destroy - Destroy the admin and the async events queues. + * @ena_dev: ENA communication layer struct + * + * @note: Before calling this method, the caller must validate that the device + * won't send any additional admin completions/aenq. + * To achieve that, a FLR is recommended. + */ +void ena_com_admin_destroy(struct ena_com_dev *ena_dev); + +/* ena_com_dev_reset - Perform device FLR to the device. + * @ena_dev: ENA communication layer struct + * @reset_reason: Specify what is the trigger for the reset in case of an error. + * + * @return - 0 on success, negative value on failure. + */ +int ena_com_dev_reset(struct ena_com_dev *ena_dev, + enum ena_regs_reset_reason_types reset_reason); + +/* ena_com_create_io_queue - Create io queue. + * @ena_dev: ENA communication layer struct + * @ctx - create context structure + * + * Create the submission and the completion queues. + * + * @return - 0 on success, negative value on failure. 
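+ *
+ * A minimal usage sketch (illustrative only; qid, depth, vector and
+ * node stand in for caller-provided values):
+ *
+ *	struct ena_com_create_io_ctx ctx = {
+ *		.direction = ENA_COM_IO_QUEUE_DIRECTION_TX,
+ *		.mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST,
+ *		.qid = qid,
+ *		.queue_size = depth,
+ *		.msix_vector = vector,
+ *		.numa_node = node,
+ *	};
+ *
+ *	rc = ena_com_create_io_queue(ena_dev, &ctx);
+ *	if (rc)
+ *		return rc;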
+ */ +int ena_com_create_io_queue(struct ena_com_dev *ena_dev, + struct ena_com_create_io_ctx *ctx); + +/* ena_com_destroy_io_queue - Destroy IO queue with the queue id - qid. + * @ena_dev: ENA communication layer struct + * @qid - the caller virtual queue id. + */ +void ena_com_destroy_io_queue(struct ena_com_dev *ena_dev, u16 qid); + +/* ena_com_get_io_handlers - Return the io queue handlers + * @ena_dev: ENA communication layer struct + * @qid - the caller virtual queue id. + * @io_sq - IO submission queue handler + * @io_cq - IO completion queue handler. + * + * @return - 0 on success, negative value on failure. + */ +int ena_com_get_io_handlers(struct ena_com_dev *ena_dev, u16 qid, + struct ena_com_io_sq **io_sq, + struct ena_com_io_cq **io_cq); + +/* ena_com_admin_aenq_enable - ENAble asynchronous event notifications + * @ena_dev: ENA communication layer struct + * + * After this method, aenq event can be received via AENQ. + */ +void ena_com_admin_aenq_enable(struct ena_com_dev *ena_dev); + +/* ena_com_set_admin_running_state - Set the state of the admin queue + * @ena_dev: ENA communication layer struct + * + * Change the state of the admin queue (enable/disable) + */ +void ena_com_set_admin_running_state(struct ena_com_dev *ena_dev, bool state); + +/* ena_com_get_admin_running_state - Get the admin queue state + * @ena_dev: ENA communication layer struct + * + * Retrieve the state of the admin queue (enable/disable) + * + * @return - current polling mode (enable/disable) + */ +bool ena_com_get_admin_running_state(struct ena_com_dev *ena_dev); + +/* ena_com_set_admin_polling_mode - Set the admin completion queue polling mode + * @ena_dev: ENA communication layer struct + * @polling: ENAble/Disable polling mode + * + * Set the admin completion mode. + */ +void ena_com_set_admin_polling_mode(struct ena_com_dev *ena_dev, bool polling); + +/* ena_com_get_admin_polling_mode - Get the admin completion queue polling mode + * @ena_dev: ENA communication layer struct + * + * Get the admin completion mode. + * If polling mode is on, ena_com_execute_admin_command will perform a + * polling on the admin completion queue for the commands completion, + * otherwise it will wait on wait event. + * + * @return state + */ +bool ena_com_get_admin_polling_mode(struct ena_com_dev *ena_dev); + +/* ena_com_set_admin_auto_polling_mode - Enable autoswitch to polling mode + * @ena_dev: ENA communication layer struct + * @polling: Enable/Disable polling mode + * + * Set the autopolling mode. + * If autopolling is on: + * In case of missing interrupt when data is available switch to polling. + */ +void ena_com_set_admin_auto_polling_mode(struct ena_com_dev *ena_dev, + bool polling); + +/* ena_com_admin_q_comp_intr_handler - admin queue interrupt handler + * @ena_dev: ENA communication layer struct + * + * This method goes over the admin completion queue and wakes up all the pending + * threads that wait on the commands wait event. + * + * @note: Should be called after MSI-X interrupt. + */ +void ena_com_admin_q_comp_intr_handler(struct ena_com_dev *ena_dev); + +/* ena_com_aenq_intr_handler - AENQ interrupt handler + * @ena_dev: ENA communication layer struct + * + * This method goes over the async event notification queue and calls the proper + * aenq handler. + */ +void ena_com_aenq_intr_handler(struct ena_com_dev *ena_dev, void *data); + +/* ena_com_abort_admin_commands - Abort all the outstanding admin commands. 
+ * @ena_dev: ENA communication layer struct
+ *
+ * This method aborts all the outstanding admin commands.
+ * The caller should then call ena_com_wait_for_abort_completion to make sure
+ * all the commands were completed.
+ */
+void ena_com_abort_admin_commands(struct ena_com_dev *ena_dev);
+
+/* ena_com_wait_for_abort_completion - Wait for admin commands abort.
+ * @ena_dev: ENA communication layer struct
+ *
+ * This method waits until all the outstanding admin commands are completed.
+ */
+void ena_com_wait_for_abort_completion(struct ena_com_dev *ena_dev);
+
+/* ena_com_validate_version - Validate the device parameters
+ * @ena_dev: ENA communication layer struct
+ *
+ * This method verifies the device parameters are the same as the saved
+ * parameters in ena_dev.
+ * This method is useful after device reset, to validate that the device mac
+ * address and the device offloads are the same as before the reset.
+ *
+ * @return - 0 on success, negative value otherwise.
+ */
+int ena_com_validate_version(struct ena_com_dev *ena_dev);
+
+/* ena_com_get_link_params - Retrieve physical link parameters.
+ * @ena_dev: ENA communication layer struct
+ * @resp: Link parameters
+ *
+ * Retrieve the physical link parameters,
+ * like speed, auto-negotiation and full duplex support.
+ *
+ * @return - 0 on Success, negative value otherwise.
+ */
+int ena_com_get_link_params(struct ena_com_dev *ena_dev,
+			    struct ena_admin_get_feat_resp *resp);
+
+/* ena_com_get_dma_width - Retrieve physical dma address width the device
+ * supports.
+ * @ena_dev: ENA communication layer struct
+ *
+ * Retrieve the maximum physical address bits the device can handle.
+ *
+ * @return: > 0 on Success and negative value otherwise.
+ */
+int ena_com_get_dma_width(struct ena_com_dev *ena_dev);
+
+/* ena_com_set_aenq_config - Set aenq groups configurations
+ * @ena_dev: ENA communication layer struct
+ * @groups_flag: bit field flags of enum ena_admin_aenq_group.
+ *
+ * Configure which aenq event group the driver would like to receive.
+ *
+ * @return: 0 on Success and negative value otherwise.
+ */
+int ena_com_set_aenq_config(struct ena_com_dev *ena_dev, u32 groups_flag);
+
+/* ena_com_get_dev_attr_feat - Get device features
+ * @ena_dev: ENA communication layer struct
+ * @get_feat_ctx: returned context that contains the get features.
+ *
+ * @return: 0 on Success and negative value otherwise.
+ */
+int ena_com_get_dev_attr_feat(struct ena_com_dev *ena_dev,
+			      struct ena_com_dev_get_features_ctx *get_feat_ctx);
+
+/* ena_com_get_dev_basic_stats - Get device basic statistics
+ * @ena_dev: ENA communication layer struct
+ * @stats: stats return value
+ *
+ * @return: 0 on Success and negative value otherwise.
+ */
+int ena_com_get_dev_basic_stats(struct ena_com_dev *ena_dev,
+				struct ena_admin_basic_stats *stats);
+
+/* ena_com_get_eni_stats - Get extended network interface statistics
+ * @ena_dev: ENA communication layer struct
+ * @stats: stats return value
+ *
+ * @return: 0 on Success and negative value otherwise.
+ */
+int ena_com_get_eni_stats(struct ena_com_dev *ena_dev,
+			  struct ena_admin_eni_stats *stats);
+
+/* ena_com_get_ena_srd_info - Get ENA SRD network interface statistics
+ * @ena_dev: ENA communication layer struct
+ * @info: ena srd stats and flags
+ *
+ * @return: 0 on Success and negative value otherwise.
+ */
+int ena_com_get_ena_srd_info(struct ena_com_dev *ena_dev,
+			     struct ena_admin_ena_srd_info *info);
+
+/* ena_com_set_dev_mtu - Configure the device mtu.
+ * @ena_dev: ENA communication layer struct
+ * @mtu: mtu value
+ *
+ * @return: 0 on Success and negative value otherwise.
+ */
+int ena_com_set_dev_mtu(struct ena_com_dev *ena_dev, u32 mtu);
+
+/* ena_com_get_offload_settings - Retrieve the device offloads capabilities
+ * @ena_dev: ENA communication layer struct
+ * @offload: offload return value
+ *
+ * @return: 0 on Success and negative value otherwise.
+ */
+int ena_com_get_offload_settings(struct ena_com_dev *ena_dev,
+				 struct ena_admin_feature_offload_desc *offload);
+
+/* ena_com_rss_init - Init RSS
+ * @ena_dev: ENA communication layer struct
+ * @log_size: indirection log size
+ *
+ * Allocate RSS/RFS resources.
+ * The caller can then configure rss using ena_com_set_hash_function,
+ * ena_com_set_hash_ctrl and ena_com_indirect_table_set.
+ *
+ * @return: 0 on Success and negative value otherwise.
+ */
+int ena_com_rss_init(struct ena_com_dev *ena_dev, u16 log_size);
+
+/* ena_com_rss_destroy - Destroy rss
+ * @ena_dev: ENA communication layer struct
+ *
+ * Free all the RSS/RFS resources.
+ */
+void ena_com_rss_destroy(struct ena_com_dev *ena_dev);
+
+/* ena_com_get_current_hash_function - Get RSS hash function
+ * @ena_dev: ENA communication layer struct
+ *
+ * Return the current hash function.
+ * @return: 0 or one of the ena_admin_hash_functions values.
+ */
+int ena_com_get_current_hash_function(struct ena_com_dev *ena_dev);
+
+/* ena_com_fill_hash_function - Fill RSS hash function
+ * @ena_dev: ENA communication layer struct
+ * @func: The hash function (Toeplitz or crc)
+ * @key: Hash key (for toeplitz hash)
+ * @key_len: key length (max length 10 DW)
+ * @init_val: initial value for the hash function
+ *
+ * Fill the ena_dev resources with the desired hash function, hash key, key_len
+ * and key initial value (if needed by the hash function).
+ * To flush the key into the device the caller should call
+ * ena_com_set_hash_function.
+ *
+ * @return: 0 on Success and negative value otherwise.
+ */
+int ena_com_fill_hash_function(struct ena_com_dev *ena_dev,
+			       enum ena_admin_hash_functions func,
+			       const u8 *key, u16 key_len, u32 init_val);
+
+/* ena_com_set_hash_function - Flush the hash function and its dependencies to
+ * the device.
+ * @ena_dev: ENA communication layer struct
+ *
+ * Flush the hash function and its dependencies (key, key length and
+ * initial value) if needed.
+ *
+ * @note: Prior to this method the caller should call ena_com_fill_hash_function
+ *
+ * @return: 0 on Success and negative value otherwise.
+ */
+int ena_com_set_hash_function(struct ena_com_dev *ena_dev);
+
+/* ena_com_get_hash_function - Retrieve the hash function from the device.
+ * @ena_dev: ENA communication layer struct
+ * @func: hash function
+ *
+ * Retrieve the hash function from the device.
+ *
+ * @note: If the caller called ena_com_fill_hash_function but didn't flush
+ * it to the device, the new configuration will be lost.
+ *
+ * @return: 0 on Success and negative value otherwise.
+ */
+int ena_com_get_hash_function(struct ena_com_dev *ena_dev,
+			      enum ena_admin_hash_functions *func);
+
+/* ena_com_get_hash_key - Retrieve the hash key
+ * @ena_dev: ENA communication layer struct
+ * @key: hash key
+ *
+ * Retrieve the hash key.
+ *
+ * @note: If the caller called ena_com_fill_hash_function but didn't flush
+ * it to the device, the new configuration will be lost.
+ *
+ * @return: 0 on Success and negative value otherwise.
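+ *
+ * @note: the hash APIs above follow a fill-then-flush pattern; an
+ * illustrative sketch, assuming ENA_HASH_KEY_SIZE and a Toeplitz setup
+ * (error handling elided):
+ *
+ *	u8 rss_key[ENA_HASH_KEY_SIZE];
+ *
+ *	netdev_rss_key_fill(rss_key, sizeof(rss_key));
+ *	rc = ena_com_fill_hash_function(ena_dev, ENA_ADMIN_TOEPLITZ,
+ *					rss_key, sizeof(rss_key), 0xffffffff);
+ *	if (!rc)
+ *		rc = ena_com_set_hash_function(ena_dev);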
+ */
+int ena_com_get_hash_key(struct ena_com_dev *ena_dev, u8 *key);
+/* ena_com_fill_hash_ctrl - Fill RSS hash control
+ * @ena_dev: ENA communication layer struct.
+ * @proto: The protocol to configure.
+ * @hash_fields: bit mask of ena_admin_flow_hash_fields
+ *
+ * Fill the ena_dev resources with the desired hash control (the ethernet
+ * fields that take part in the hash) for a specific protocol.
+ * To flush the hash control to the device, the caller should call
+ * ena_com_set_hash_ctrl.
+ *
+ * @return: 0 on Success and negative value otherwise.
+ */
+int ena_com_fill_hash_ctrl(struct ena_com_dev *ena_dev,
+			   enum ena_admin_flow_hash_proto proto,
+			   u16 hash_fields);
+
+/* ena_com_set_hash_ctrl - Flush the hash control resources to the device.
+ * @ena_dev: ENA communication layer struct
+ *
+ * Flush the hash control (the ethernet fields that take part in the hash)
+ *
+ * @note: Prior to this method the caller should call ena_com_fill_hash_ctrl.
+ *
+ * @return: 0 on Success and negative value otherwise.
+ */
+int ena_com_set_hash_ctrl(struct ena_com_dev *ena_dev);
+
+/* ena_com_get_hash_ctrl - Retrieve the hash control from the device.
+ * @ena_dev: ENA communication layer struct
+ * @proto: The protocol to retrieve.
+ * @fields: bit mask of ena_admin_flow_hash_fields.
+ *
+ * Retrieve the hash control from the device.
+ *
+ * @note: If the caller called ena_com_fill_hash_ctrl but didn't flush
+ * it to the device, the new configuration will be lost.
+ *
+ * @return: 0 on Success and negative value otherwise.
+ */
+int ena_com_get_hash_ctrl(struct ena_com_dev *ena_dev,
+			  enum ena_admin_flow_hash_proto proto,
+			  u16 *fields);
+
+/* ena_com_set_default_hash_ctrl - Set the hash control to a default
+ * configuration.
+ * @ena_dev: ENA communication layer struct
+ *
+ * Fill the ena_dev resources with the default hash control configuration.
+ * To flush the hash control to the device, the caller should call
+ * ena_com_set_hash_ctrl.
+ *
+ * @return: 0 on Success and negative value otherwise.
+ */
+int ena_com_set_default_hash_ctrl(struct ena_com_dev *ena_dev);
+
+/* ena_com_indirect_table_fill_entry - Fill a single entry in the RSS
+ * indirection table
+ * @ena_dev: ENA communication layer struct.
+ * @entry_idx - indirection table entry.
+ * @entry_value - redirection value
+ *
+ * Fill a single entry of the RSS indirection table in the ena_dev resources.
+ * To flush the indirection table to the device, the caller should call
+ * ena_com_indirect_table_set.
+ *
+ * @return: 0 on Success and negative value otherwise.
+ */
+int ena_com_indirect_table_fill_entry(struct ena_com_dev *ena_dev,
+				      u16 entry_idx, u16 entry_value);
+
+/* ena_com_indirect_table_set - Flush the indirection table to the device.
+ * @ena_dev: ENA communication layer struct
+ *
+ * Flush the indirection hash control to the device.
+ * Prior to this method the caller should call ena_com_indirect_table_fill_entry
+ *
+ * @return: 0 on Success and negative value otherwise.
+ */
+int ena_com_indirect_table_set(struct ena_com_dev *ena_dev);
+
+/* ena_com_indirect_table_get - Retrieve the indirection table from the device.
+ * @ena_dev: ENA communication layer struct
+ * @ind_tbl: indirection table
+ *
+ * Retrieve the RSS indirection table from the device.
+ *
+ * @note: If the caller called ena_com_indirect_table_fill_entry but didn't flush
+ * it to the device, the new configuration will be lost.
+ *
+ * @return: 0 on Success and negative value otherwise.
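+ *
+ * @note: an illustrative fill-then-flush sketch for the indirection table,
+ * assuming a table of (1 << log_size) entries and a hypothetical num_queues
+ * value (error handling elided):
+ *
+ *	u16 i;
+ *
+ *	for (i = 0; i < (1 << log_size); i++) {
+ *		rc = ena_com_indirect_table_fill_entry(ena_dev, i,
+ *						       i % num_queues);
+ *		if (rc)
+ *			break;
+ *	}
+ *	if (!rc)
+ *		rc = ena_com_indirect_table_set(ena_dev);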
+ */
+int ena_com_indirect_table_get(struct ena_com_dev *ena_dev, u32 *ind_tbl);
+
+/* ena_com_allocate_host_info - Allocate host info resources.
+ * @ena_dev: ENA communication layer struct
+ *
+ * @return: 0 on Success and negative value otherwise.
+ */
+int ena_com_allocate_host_info(struct ena_com_dev *ena_dev);
+
+/* ena_com_allocate_debug_area - Allocate debug area.
+ * @ena_dev: ENA communication layer struct
+ * @debug_area_size - debug area size.
+ *
+ * @return: 0 on Success and negative value otherwise.
+ */
+int ena_com_allocate_debug_area(struct ena_com_dev *ena_dev,
+				u32 debug_area_size);
+
+/* ena_com_delete_debug_area - Free the debug area resources.
+ * @ena_dev: ENA communication layer struct
+ *
+ * Free the allocated debug area.
+ */
+void ena_com_delete_debug_area(struct ena_com_dev *ena_dev);
+
+/* ena_com_delete_host_info - Free the host info resources.
+ * @ena_dev: ENA communication layer struct
+ *
+ * Free the allocated host info.
+ */
+void ena_com_delete_host_info(struct ena_com_dev *ena_dev);
+
+/* ena_com_set_host_attributes - Update the device with the host
+ * attributes (debug area and host info) base address.
+ * @ena_dev: ENA communication layer struct
+ *
+ * @return: 0 on Success and negative value otherwise.
+ */
+int ena_com_set_host_attributes(struct ena_com_dev *ena_dev);
+
+/* ena_com_create_io_cq - Create io completion queue.
+ * @ena_dev: ENA communication layer struct
+ * @io_cq - io completion queue handler
+ *
+ * Create IO completion queue.
+ *
+ * @return - 0 on success, negative value on failure.
+ */
+int ena_com_create_io_cq(struct ena_com_dev *ena_dev,
+			 struct ena_com_io_cq *io_cq);
+
+/* ena_com_destroy_io_cq - Destroy io completion queue.
+ * @ena_dev: ENA communication layer struct
+ * @io_cq - io completion queue handler
+ *
+ * Destroy IO completion queue.
+ *
+ * @return - 0 on success, negative value on failure.
+ */
+int ena_com_destroy_io_cq(struct ena_com_dev *ena_dev,
+			  struct ena_com_io_cq *io_cq);
+
+/* ena_com_execute_admin_command - Execute admin command
+ * @admin_queue: admin queue.
+ * @cmd: the admin command to execute.
+ * @cmd_size: the command size.
+ * @cmd_comp: command completion return value.
+ * @cmd_comp_size: command completion size.
+ *
+ * Submit an admin command and then wait until the device returns a
+ * completion.
+ * The completion will be copied into cmd_comp.
+ *
+ * @return - 0 on success, negative value on failure.
+ */
+int ena_com_execute_admin_command(struct ena_com_admin_queue *admin_queue,
+				  struct ena_admin_aq_entry *cmd,
+				  size_t cmd_size,
+				  struct ena_admin_acq_entry *cmd_comp,
+				  size_t cmd_comp_size);
+
+/* ena_com_init_interrupt_moderation - Init interrupt moderation
+ * @ena_dev: ENA communication layer struct
+ *
+ * @return - 0 on success, negative value on failure.
+ */
+int ena_com_init_interrupt_moderation(struct ena_com_dev *ena_dev);
+
+/* ena_com_interrupt_moderation_supported - Return if interrupt moderation
+ * capability is supported by the device.
+ *
+ * @return - supported or not.
+ */
+bool ena_com_interrupt_moderation_supported(struct ena_com_dev *ena_dev);
+
+/* ena_com_update_nonadaptive_moderation_interval_tx - Update the
+ * non-adaptive interval in Tx direction.
+ * @ena_dev: ENA communication layer struct
+ * @tx_coalesce_usecs: Interval in usec.
+ *
+ * @return - 0 on success, negative value on failure.
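+ *
+ * @note: an illustrative sketch of an ethtool-style coalescing update,
+ * with hypothetical microsecond values (error handling elided):
+ *
+ *	rc = ena_com_update_nonadaptive_moderation_interval_tx(ena_dev, 64);
+ *	if (!rc)
+ *		rc = ena_com_update_nonadaptive_moderation_interval_rx(ena_dev, 20);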
+ */
+int ena_com_update_nonadaptive_moderation_interval_tx(struct ena_com_dev *ena_dev,
+						      u32 tx_coalesce_usecs);
+
+/* ena_com_update_nonadaptive_moderation_interval_rx - Update the
+ * non-adaptive interval in Rx direction.
+ * @ena_dev: ENA communication layer struct
+ * @rx_coalesce_usecs: Interval in usec.
+ *
+ * @return - 0 on success, negative value on failure.
+ */
+int ena_com_update_nonadaptive_moderation_interval_rx(struct ena_com_dev *ena_dev,
+						      u32 rx_coalesce_usecs);
+
+/* ena_com_get_nonadaptive_moderation_interval_tx - Retrieve the
+ * non-adaptive interval in Tx direction.
+ * @ena_dev: ENA communication layer struct
+ *
+ * @return - interval in usec
+ */
+unsigned int ena_com_get_nonadaptive_moderation_interval_tx(struct ena_com_dev *ena_dev);
+
+/* ena_com_get_nonadaptive_moderation_interval_rx - Retrieve the
+ * non-adaptive interval in Rx direction.
+ * @ena_dev: ENA communication layer struct
+ *
+ * @return - interval in usec
+ */
+unsigned int ena_com_get_nonadaptive_moderation_interval_rx(struct ena_com_dev *ena_dev);
+
+/* ena_com_config_dev_mode - Configure the placement policy of the device.
+ * @ena_dev: ENA communication layer struct
+ * @llq_features: LLQ feature descriptor, retrieve via
+ *		  ena_com_get_dev_attr_feat.
+ * @llq_default_config: The default driver LLQ parameters configurations
+ */
+int ena_com_config_dev_mode(struct ena_com_dev *ena_dev,
+			    struct ena_admin_feature_llq_desc *llq_features,
+			    struct ena_llq_configurations *llq_default_config);
+
+/* ena_com_io_sq_to_ena_dev - Extract ena_com_dev using contained field io_sq.
+ * @io_sq: IO submit queue struct
+ *
+ * @return - ena_com_dev struct extracted from io_sq
+ */
+static inline struct ena_com_dev *ena_com_io_sq_to_ena_dev(struct ena_com_io_sq *io_sq)
+{
+	return container_of(io_sq, struct ena_com_dev, io_sq_queues[io_sq->qid]);
+}
+
+/* ena_com_io_cq_to_ena_dev - Extract ena_com_dev using contained field io_cq.
+ * @io_cq: IO completion queue struct
+ *
+ * @return - ena_com_dev struct extracted from io_cq
+ */
+static inline struct ena_com_dev *ena_com_io_cq_to_ena_dev(struct ena_com_io_cq *io_cq)
+{
+	return container_of(io_cq, struct ena_com_dev, io_cq_queues[io_cq->qid]);
+}
+
+static inline bool ena_com_get_adaptive_moderation_enabled(struct ena_com_dev *ena_dev)
+{
+	return ena_dev->adaptive_coalescing;
+}
+
+static inline void ena_com_enable_adaptive_moderation(struct ena_com_dev *ena_dev)
+{
+	ena_dev->adaptive_coalescing = true;
+}
+
+static inline void ena_com_disable_adaptive_moderation(struct ena_com_dev *ena_dev)
+{
+	ena_dev->adaptive_coalescing = false;
+}
+
+/* ena_com_get_cap - query whether device supports a capability.
+ * @ena_dev: ENA communication layer struct
+ * @cap_id: enum value representing the capability
+ *
+ * @return - true if capability is supported or false otherwise
+ */
+static inline bool ena_com_get_cap(struct ena_com_dev *ena_dev,
+				   enum ena_admin_aq_caps_id cap_id)
+{
+	return !!(ena_dev->capabilities & BIT(cap_id));
+}
+
+/* ena_com_update_intr_reg - Prepare interrupt register
+ * @intr_reg: interrupt register to update.
+ * @rx_delay_interval: Rx interval in usecs
+ * @tx_delay_interval: Tx interval in usecs
+ * @unmask: unmask enable/disable
+ *
+ * Prepare interrupt update register with the supplied parameters.
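+ *
+ * @note: an illustrative per-interrupt sketch, assuming the caller holds an
+ * io_cq handle and using hypothetical delay values in usecs
+ * (ena_com_unmask_intr is declared in ena_eth_com.h):
+ *
+ *	struct ena_eth_io_intr_reg intr_reg;
+ *
+ *	ena_com_update_intr_reg(&intr_reg, 20, 64, true);
+ *	ena_com_unmask_intr(io_cq, &intr_reg);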
+ */ +static inline void ena_com_update_intr_reg(struct ena_eth_io_intr_reg *intr_reg, + u32 rx_delay_interval, + u32 tx_delay_interval, + bool unmask) +{ + intr_reg->intr_control = 0; + intr_reg->intr_control |= rx_delay_interval & + ENA_ETH_IO_INTR_REG_RX_INTR_DELAY_MASK; + + intr_reg->intr_control |= + (tx_delay_interval << ENA_ETH_IO_INTR_REG_TX_INTR_DELAY_SHIFT) + & ENA_ETH_IO_INTR_REG_TX_INTR_DELAY_MASK; + + if (unmask) + intr_reg->intr_control |= ENA_ETH_IO_INTR_REG_INTR_UNMASK_MASK; +} + +static inline u8 *ena_com_get_next_bounce_buffer(struct ena_com_io_bounce_buffer_control *bounce_buf_ctrl) +{ + u16 size, buffers_num; + u8 *buf; + + size = bounce_buf_ctrl->buffer_size; + buffers_num = bounce_buf_ctrl->buffers_num; + + buf = bounce_buf_ctrl->base_buffer + + (bounce_buf_ctrl->next_to_use++ & (buffers_num - 1)) * size; + + prefetchw(bounce_buf_ctrl->base_buffer + + (bounce_buf_ctrl->next_to_use & (buffers_num - 1)) * size); + + return buf; +} + +#endif /* !(ENA_COM) */ diff --git a/drivers/amazon/net/ena/ena_common_defs.h b/drivers/amazon/net/ena/ena_common_defs.h new file mode 100755 index 0000000000000..e210c8a81fc0e --- /dev/null +++ b/drivers/amazon/net/ena/ena_common_defs.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + */ +#ifndef _ENA_COMMON_H_ +#define _ENA_COMMON_H_ + +#define ENA_COMMON_SPEC_VERSION_MAJOR 2 +#define ENA_COMMON_SPEC_VERSION_MINOR 0 + +/* ENA operates with 48-bit memory addresses. ena_mem_addr_t */ +struct ena_common_mem_addr { + u32 mem_addr_low; + + u16 mem_addr_high; + + /* MBZ */ + u16 reserved16; +}; + +#endif /* _ENA_COMMON_H_ */ diff --git a/drivers/amazon/net/ena/ena_devlink.c b/drivers/amazon/net/ena/ena_devlink.c new file mode 100644 index 0000000000000..68b02270786c7 --- /dev/null +++ b/drivers/amazon/net/ena/ena_devlink.c @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2015-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#include "ena_devlink.h" +#ifdef ENA_DEVLINK_SUPPORT + +static int ena_devlink_llq_header_validate(struct devlink *devlink, u32 id, + union devlink_param_value val, + struct netlink_ext_ack *extack); + +enum ena_devlink_param_id { + ENA_DEVLINK_PARAM_ID_BASE = DEVLINK_PARAM_GENERIC_ID_MAX, + ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE, +}; + +static const struct devlink_param ena_devlink_params[] = { + DEVLINK_PARAM_DRIVER(ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE, + "large_llq_header", DEVLINK_PARAM_TYPE_BOOL, + BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), + NULL, NULL, ena_devlink_llq_header_validate), +}; + +static int ena_devlink_llq_header_validate(struct devlink *devlink, u32 id, + union devlink_param_value val, + struct netlink_ext_ack *extack) +{ + struct ena_adapter *adapter = ENA_DEVLINK_PRIV(devlink); + bool value = val.vbool; + + if (!value) + return 0; + + if (adapter->ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST) { + NL_SET_ERR_MSG_MOD(extack, "Instance doesn't support LLQ"); + return -EOPNOTSUPP; + } + + if (!adapter->large_llq_header_supported) { + NL_SET_ERR_MSG_MOD(extack, "Instance doesn't support large LLQ"); + return -EOPNOTSUPP; + } + + return 0; +} + +#ifdef ENA_DEVLINK_CONFIGURE_AFTER_REGISTER +/* Determines if ena_devlink_register has been called. + * Prefer to check if the driver enabled reloading capabilities, but fallback + * to check if driver configured 'dev' devlink attribute for older kernels. 
+ */
+bool ena_is_devlink_params_registered(struct devlink *devlink)
+{
+#if defined(ENA_DEVLINK_RELOAD_ENABLING_REQUIRED)
+	return devlink->reload_enabled;
+#elif !defined(ENA_DEVLINK_RECEIVES_DEVICE_ON_ALLOC)
+	return devlink->dev;
+#endif
+}
+
+#endif
+void ena_devlink_params_get(struct devlink *devlink)
+{
+	struct ena_adapter *adapter = ENA_DEVLINK_PRIV(devlink);
+	union devlink_param_value val;
+	int err;
+
+#ifdef ENA_DEVLINK_CONFIGURE_AFTER_REGISTER
+	/* If devlink params aren't registered, don't access them */
+	if (!ena_is_devlink_params_registered(devlink))
+		return;
+#endif
+	err = devlink_param_driverinit_value_get(devlink,
+						 ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE,
+						 &val);
+	if (err) {
+		netdev_err(adapter->netdev, "Failed to query LLQ header size param\n");
+		return;
+	}
+
+	adapter->large_llq_header_enabled = val.vbool;
+}
+
+void ena_devlink_disable_large_llq_header_param(struct devlink *devlink)
+{
+	union devlink_param_value value;
+
+#ifdef ENA_DEVLINK_CONFIGURE_AFTER_REGISTER
+	/* If devlink params aren't registered, don't access them */
+	if (!ena_is_devlink_params_registered(devlink))
+		return;
+
+#endif
+	value.vbool = false;
+	devlink_param_driverinit_value_set(devlink,
+					   ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE,
+					   value);
+}
+
+static int ena_devlink_reload_down(struct devlink *devlink,
+#ifdef ENA_DEVLINK_RELOAD_NS_CHANGE_SUPPORT
+				   bool netns_change,
+#endif
+#ifdef ENA_DEVLINK_RELOAD_LIMIT_AND_ACTION_SUPPORT
+				   enum devlink_reload_action action,
+				   enum devlink_reload_limit limit,
+#endif
+				   struct netlink_ext_ack *extack)
+{
+	struct ena_adapter *adapter = ENA_DEVLINK_PRIV(devlink);
+
+#ifdef ENA_DEVLINK_RELOAD_NS_CHANGE_SUPPORT
+	if (netns_change) {
+		NL_SET_ERR_MSG_MOD(extack, "Namespace change is not supported");
+		return -EOPNOTSUPP;
+	}
+
+#endif
+#ifdef ENA_DEVLINK_RELOAD_LIMIT_AND_ACTION_SUPPORT
+	if (action != DEVLINK_RELOAD_ACTION_DRIVER_REINIT) {
+		NL_SET_ERR_MSG_MOD(extack, "Action is not supported");
+		return -EOPNOTSUPP;
+	}
+
+	if (limit != DEVLINK_RELOAD_LIMIT_UNSPEC) {
+		NL_SET_ERR_MSG_MOD(extack, "Driver reload doesn't support limitations");
+		return -EOPNOTSUPP;
+	}
+
+#endif
+	rtnl_lock();
+	ena_destroy_device(adapter, false);
+	rtnl_unlock();
+
+	return 0;
+}
+
+static int ena_devlink_reload_up(struct devlink *devlink,
+#ifdef ENA_DEVLINK_RELOAD_LIMIT_AND_ACTION_SUPPORT
+				 enum devlink_reload_action action,
+				 enum devlink_reload_limit limit,
+				 u32 *actions_performed,
+#endif
+				 struct netlink_ext_ack *extack)
+{
+	struct ena_adapter *adapter = ENA_DEVLINK_PRIV(devlink);
+	int err = 0;
+
+#ifdef ENA_DEVLINK_RELOAD_LIMIT_AND_ACTION_SUPPORT
+	if (action != DEVLINK_RELOAD_ACTION_DRIVER_REINIT) {
+		NL_SET_ERR_MSG_MOD(extack, "Action is not supported");
+		return -EOPNOTSUPP;
+	}
+
+	if (limit != DEVLINK_RELOAD_LIMIT_UNSPEC) {
+		NL_SET_ERR_MSG_MOD(extack, "Driver reload doesn't support limitations");
+		return -EOPNOTSUPP;
+	}
+
+#endif
+	rtnl_lock();
+	/* Check that no other routine initialized the device (e.g.
+	 * ena_fw_reset_device()). Also we're under devlink_mutex here,
+	 * so devlink (and ena_adapter with it) isn't freed under our
+	 * feet.
+	 */
+	if (!test_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags))
+		err = ena_restore_device(adapter);
+	rtnl_unlock();
+
+#ifdef ENA_DEVLINK_RELOAD_LIMIT_AND_ACTION_SUPPORT
+	if (!err)
+		*actions_performed = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT);
+
+#endif
+	return err;
+}
+#ifndef ENA_DEVLINK_RELOAD_UP_DOWN_SUPPORTED

+static int ena_devlink_reload(struct devlink *devlink, struct netlink_ext_ack *extack)
+{
+	/* In this configuration ena_devlink_reload_down() always succeeds,
+	 * so its return value can be ignored.
+	 */
+	ena_devlink_reload_down(devlink, extack);
+
+	return ena_devlink_reload_up(devlink, extack);
+}
+
+#endif
+
+static const struct devlink_ops ena_devlink_ops = {
+#ifdef ENA_DEVLINK_RELOAD_LIMIT_AND_ACTION_SUPPORT
+	.reload_actions = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT),
+#endif
+#ifdef ENA_DEVLINK_RELOAD_UP_DOWN_SUPPORTED
+	.reload_down = ena_devlink_reload_down,
+	.reload_up = ena_devlink_reload_up,
+#else
+	.reload = ena_devlink_reload,
+#endif
+};
+
+static int ena_devlink_configure_params(struct devlink *devlink)
+{
+	struct ena_adapter *adapter = ENA_DEVLINK_PRIV(devlink);
+	union devlink_param_value value;
+	int rc;
+
+	rc = devlink_params_register(devlink, ena_devlink_params,
+				     ARRAY_SIZE(ena_devlink_params));
+	if (rc) {
+		netdev_err(adapter->netdev, "Failed to register devlink params\n");
+		return rc;
+	}
+
+	value.vbool = adapter->large_llq_header_enabled;
+	devlink_param_driverinit_value_set(devlink,
+					   ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE,
+					   value);
+
+#ifdef ENA_DEVLINK_RELOAD_SUPPORT_ADVERTISEMENT_NEEDED
+	devlink_set_features(devlink, DEVLINK_F_RELOAD);
+
+#endif
+#ifdef ENA_DEVLINK_RELOAD_ENABLING_REQUIRED
+	devlink_reload_enable(devlink);
+
+#endif
+	return 0;
+}
+
+struct devlink *ena_devlink_alloc(struct ena_adapter *adapter)
+{
+#ifdef ENA_DEVLINK_RECEIVES_DEVICE_ON_ALLOC
+	struct device *dev = &adapter->pdev->dev;
+#endif
+	struct devlink *devlink;
+
+#ifdef ENA_DEVLINK_RECEIVES_DEVICE_ON_ALLOC
+	devlink = devlink_alloc(&ena_devlink_ops, sizeof(struct ena_adapter *), dev);
+#else
+	devlink = devlink_alloc(&ena_devlink_ops, sizeof(struct ena_adapter *));
+#endif
+	if (!devlink) {
+		netdev_err(adapter->netdev, "Failed to allocate devlink struct\n");
+		return NULL;
+	}
+
+	ENA_DEVLINK_PRIV(devlink) = adapter;
+	adapter->devlink = devlink;
+
+#ifndef ENA_DEVLINK_CONFIGURE_AFTER_REGISTER
+	if (ena_devlink_configure_params(devlink))
+		goto free_devlink;
+
+	return devlink;
+free_devlink:
+	devlink_free(devlink);
+
+	return NULL;
+#else
+	return devlink;
+#endif
+}
+
+static void ena_devlink_configure_params_clean(struct devlink *devlink)
+{
+#ifdef ENA_DEVLINK_RELOAD_ENABLING_REQUIRED
+	devlink_reload_disable(devlink);
+
+#endif
+	devlink_params_unregister(devlink, ena_devlink_params,
+				  ARRAY_SIZE(ena_devlink_params));
+}
+
+void ena_devlink_free(struct devlink *devlink)
+{
+#ifndef ENA_DEVLINK_CONFIGURE_AFTER_REGISTER
+	ena_devlink_configure_params_clean(devlink);
+
+#endif
+	devlink_free(devlink);
+}
+
+void ena_devlink_register(struct devlink *devlink, struct device *dev)
+{
+#ifdef ENA_DEVLINK_RECEIVES_DEVICE_ON_ALLOC
+	devlink_register(devlink);
+#else
+	devlink_register(devlink, dev);
+#endif
+#ifdef ENA_DEVLINK_CONFIGURE_AFTER_REGISTER
+	ena_devlink_configure_params(devlink);
+#endif
+}
+
+void ena_devlink_unregister(struct devlink *devlink)
+{
+#ifdef ENA_DEVLINK_CONFIGURE_AFTER_REGISTER
+	ena_devlink_configure_params_clean(devlink);
+#endif
+	devlink_unregister(devlink);
+}
+#endif /* ENA_DEVLINK_SUPPORT */
diff --git a/drivers/amazon/net/ena/ena_devlink.h b/drivers/amazon/net/ena/ena_devlink.h
new file mode 100644
index 0000000000000..8a047654b2f52
--- /dev/null
+++ b/drivers/amazon/net/ena/ena_devlink.h
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright 2015-2021 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#ifndef DEVLINK_H
+#define DEVLINK_H
+
+#include "ena_netdev.h"
+#ifndef ENA_NO_DEVLINK_HEADERS
+#include <net/devlink.h>
+#endif
+
+#ifdef ENA_DEVLINK_SUPPORT
+
+#define ENA_DEVLINK_PRIV(devlink) \
+	(*(struct ena_adapter **) devlink_priv(devlink))
+
+struct devlink *ena_devlink_alloc(struct ena_adapter *adapter);
+void ena_devlink_free(struct devlink *devlink);
+void ena_devlink_register(struct devlink *devlink, struct device *dev);
+void ena_devlink_unregister(struct devlink *devlink);
+void ena_devlink_params_get(struct devlink *devlink);
+void ena_devlink_disable_large_llq_header_param(struct devlink *devlink);
+
+#else /* ENA_DEVLINK_SUPPORT */
+
+#ifdef ENA_NO_DEVLINK_HEADERS
+struct devlink {};
+#endif
+
+/* Return a non-NULL value (1) so the caller doesn't think the function failed */
+static inline struct devlink *ena_devlink_alloc(struct ena_adapter *adapter)
+{
+	return (struct devlink *)1;
+}
+static inline void ena_devlink_free(struct devlink *devlink) { }
+static inline void ena_devlink_register(struct devlink *devlink, struct device *dev) { }
+static inline void ena_devlink_unregister(struct devlink *devlink) { }
+static inline void ena_devlink_params_get(struct devlink *devlink) { }
+static inline void ena_devlink_disable_large_llq_header_param(struct devlink *devlink) { }
+
+#endif /* ENA_DEVLINK_SUPPORT */
+
+#endif /* DEVLINK_H */
diff --git a/drivers/amazon/net/ena/ena_eth_com.c b/drivers/amazon/net/ena/ena_eth_com.c
new file mode 100644
index 0000000000000..f9f886289b970
--- /dev/null
+++ b/drivers/amazon/net/ena/ena_eth_com.c
@@ -0,0 +1,646 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
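+ *
+ * @note: this file carries the Tx/Rx datapath helpers; an illustrative Rx
+ * polling sketch built on ena_com_rx_pkt(), assuming hypothetical ring
+ * state and eliding error handling:
+ *
+ *	struct ena_com_rx_ctx rx_ctx = {
+ *		.ena_bufs = rx_ring->ena_bufs,
+ *		.max_bufs = rx_ring->sgl_size,
+ *	};
+ *
+ *	rc = ena_com_rx_pkt(io_cq, io_sq, &rx_ctx);
+ *	if (rc || rx_ctx.descs == 0)
+ *		return;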
+ */ + +#include "ena_eth_com.h" + +static struct ena_eth_io_rx_cdesc_base *ena_com_get_next_rx_cdesc( + struct ena_com_io_cq *io_cq) +{ + struct ena_eth_io_rx_cdesc_base *cdesc; + u16 expected_phase, head_masked; + u16 desc_phase; + + head_masked = io_cq->head & (io_cq->q_depth - 1); + expected_phase = io_cq->phase; + + cdesc = (struct ena_eth_io_rx_cdesc_base *)(io_cq->cdesc_addr.virt_addr + + (head_masked * io_cq->cdesc_entry_size_in_bytes)); + + desc_phase = (READ_ONCE(cdesc->status) & + ENA_ETH_IO_RX_CDESC_BASE_PHASE_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_PHASE_SHIFT; + + if (desc_phase != expected_phase) + return NULL; + + /* Make sure we read the rest of the descriptor after the phase bit + * has been read + */ + dma_rmb(); + + return cdesc; +} + +static void *get_sq_desc_regular_queue(struct ena_com_io_sq *io_sq) +{ + u16 tail_masked; + u32 offset; + + tail_masked = io_sq->tail & (io_sq->q_depth - 1); + + offset = tail_masked * io_sq->desc_entry_size; + + return (void *)((uintptr_t)io_sq->desc_addr.virt_addr + offset); +} + +static int ena_com_write_bounce_buffer_to_dev(struct ena_com_io_sq *io_sq, + u8 *bounce_buffer) +{ + struct ena_com_llq_info *llq_info = &io_sq->llq_info; + + u16 dst_tail_mask; + u32 dst_offset; + + dst_tail_mask = io_sq->tail & (io_sq->q_depth - 1); + dst_offset = dst_tail_mask * llq_info->desc_list_entry_size; + + if (is_llq_max_tx_burst_exists(io_sq)) { + if (unlikely(!io_sq->entries_in_tx_burst_left)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Error: trying to send more packets than tx burst allows\n"); + return -ENOSPC; + } + + io_sq->entries_in_tx_burst_left--; + netdev_dbg(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Decreasing entries_in_tx_burst_left of queue %d to %d\n", + io_sq->qid, io_sq->entries_in_tx_burst_left); + } + + /* Make sure everything was written into the bounce buffer before + * writing the bounce buffer to the device + */ + wmb(); + + /* The line is completed. 
Copy it to dev */ + __iowrite64_copy(io_sq->desc_addr.pbuf_dev_addr + dst_offset, + bounce_buffer, (llq_info->desc_list_entry_size) / 8); + + io_sq->tail++; + + /* Switch phase bit in case of wrap around */ + if (unlikely((io_sq->tail & (io_sq->q_depth - 1)) == 0)) + io_sq->phase ^= 1; + + return 0; +} + +static int ena_com_write_header_to_bounce(struct ena_com_io_sq *io_sq, + u8 *header_src, + u16 header_len) +{ + struct ena_com_llq_pkt_ctrl *pkt_ctrl = &io_sq->llq_buf_ctrl; + struct ena_com_llq_info *llq_info = &io_sq->llq_info; + u8 *bounce_buffer = pkt_ctrl->curr_bounce_buf; + u16 header_offset; + + if (unlikely(io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST)) + return 0; + + header_offset = + llq_info->descs_num_before_header * io_sq->desc_entry_size; + + if (unlikely((header_offset + header_len) > + llq_info->desc_list_entry_size)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Trying to write header larger than llq entry can accommodate\n"); + return -EFAULT; + } + + if (unlikely(!bounce_buffer)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Bounce buffer is NULL\n"); + return -EFAULT; + } + + memcpy(bounce_buffer + header_offset, header_src, header_len); + + return 0; +} + +static void *get_sq_desc_llq(struct ena_com_io_sq *io_sq) +{ + struct ena_com_llq_pkt_ctrl *pkt_ctrl = &io_sq->llq_buf_ctrl; + u8 *bounce_buffer; + void *sq_desc; + + bounce_buffer = pkt_ctrl->curr_bounce_buf; + + if (unlikely(!bounce_buffer)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Bounce buffer is NULL\n"); + return NULL; + } + + sq_desc = bounce_buffer + pkt_ctrl->idx * io_sq->desc_entry_size; + pkt_ctrl->idx++; + pkt_ctrl->descs_left_in_line--; + + return sq_desc; +} + +static int ena_com_close_bounce_buffer(struct ena_com_io_sq *io_sq) +{ + struct ena_com_llq_pkt_ctrl *pkt_ctrl = &io_sq->llq_buf_ctrl; + struct ena_com_llq_info *llq_info = &io_sq->llq_info; + int rc; + + if (unlikely(io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST)) + return 0; + + /* bounce buffer was used, so write it and get a new one */ + if (likely(pkt_ctrl->idx)) { + rc = ena_com_write_bounce_buffer_to_dev(io_sq, + pkt_ctrl->curr_bounce_buf); + if (unlikely(rc)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Failed to write bounce buffer to device\n"); + return rc; + } + + pkt_ctrl->curr_bounce_buf = + ena_com_get_next_bounce_buffer(&io_sq->bounce_buf_ctrl); + memset(io_sq->llq_buf_ctrl.curr_bounce_buf, + 0x0, llq_info->desc_list_entry_size); + } + + pkt_ctrl->idx = 0; + pkt_ctrl->descs_left_in_line = llq_info->descs_num_before_header; + return 0; +} + +static void *get_sq_desc(struct ena_com_io_sq *io_sq) +{ + if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) + return get_sq_desc_llq(io_sq); + + return get_sq_desc_regular_queue(io_sq); +} + +static int ena_com_sq_update_llq_tail(struct ena_com_io_sq *io_sq) +{ + struct ena_com_llq_pkt_ctrl *pkt_ctrl = &io_sq->llq_buf_ctrl; + struct ena_com_llq_info *llq_info = &io_sq->llq_info; + int rc; + + if (!pkt_ctrl->descs_left_in_line) { + rc = ena_com_write_bounce_buffer_to_dev(io_sq, + pkt_ctrl->curr_bounce_buf); + if (unlikely(rc)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Failed to write bounce buffer to device\n"); + return rc; + } + + pkt_ctrl->curr_bounce_buf = + ena_com_get_next_bounce_buffer(&io_sq->bounce_buf_ctrl); + memset(io_sq->llq_buf_ctrl.curr_bounce_buf, + 0x0, llq_info->desc_list_entry_size); + + pkt_ctrl->idx = 0; + if (unlikely(llq_info->desc_stride_ctrl 
== ENA_ADMIN_SINGLE_DESC_PER_ENTRY)) + pkt_ctrl->descs_left_in_line = 1; + else + pkt_ctrl->descs_left_in_line = + llq_info->desc_list_entry_size / io_sq->desc_entry_size; + } + + return 0; +} + +static int ena_com_sq_update_tail(struct ena_com_io_sq *io_sq) +{ + if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) + return ena_com_sq_update_llq_tail(io_sq); + + io_sq->tail++; + + /* Switch phase bit in case of wrap around */ + if (unlikely((io_sq->tail & (io_sq->q_depth - 1)) == 0)) + io_sq->phase ^= 1; + + return 0; +} + +static struct ena_eth_io_rx_cdesc_base * + ena_com_rx_cdesc_idx_to_ptr(struct ena_com_io_cq *io_cq, u16 idx) +{ + idx &= (io_cq->q_depth - 1); + return (struct ena_eth_io_rx_cdesc_base *) + ((uintptr_t)io_cq->cdesc_addr.virt_addr + + idx * io_cq->cdesc_entry_size_in_bytes); +} + +static u16 ena_com_cdesc_rx_pkt_get(struct ena_com_io_cq *io_cq, + u16 *first_cdesc_idx) +{ + struct ena_eth_io_rx_cdesc_base *cdesc; + u16 count = 0, head_masked; + u32 last = 0; + + do { + cdesc = ena_com_get_next_rx_cdesc(io_cq); + if (!cdesc) + break; + + ena_com_cq_inc_head(io_cq); + count++; + last = (READ_ONCE(cdesc->status) & + ENA_ETH_IO_RX_CDESC_BASE_LAST_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_LAST_SHIFT; + } while (!last); + + if (last) { + *first_cdesc_idx = io_cq->cur_rx_pkt_cdesc_start_idx; + count += io_cq->cur_rx_pkt_cdesc_count; + + head_masked = io_cq->head & (io_cq->q_depth - 1); + + io_cq->cur_rx_pkt_cdesc_count = 0; + io_cq->cur_rx_pkt_cdesc_start_idx = head_masked; + + netdev_dbg(ena_com_io_cq_to_ena_dev(io_cq)->net_device, + "ENA q_id: %d packets were completed. first desc idx %u descs# %d\n", + io_cq->qid, *first_cdesc_idx, count); + } else { + io_cq->cur_rx_pkt_cdesc_count += count; + count = 0; + } + + return count; +} + +static int ena_com_create_meta(struct ena_com_io_sq *io_sq, + struct ena_com_tx_meta *ena_meta) +{ + struct ena_eth_io_tx_meta_desc *meta_desc = NULL; + + meta_desc = get_sq_desc(io_sq); + if (unlikely(!meta_desc)) + return -EFAULT; + + memset(meta_desc, 0x0, sizeof(struct ena_eth_io_tx_meta_desc)); + + meta_desc->len_ctrl |= ENA_ETH_IO_TX_META_DESC_META_DESC_MASK; + + meta_desc->len_ctrl |= ENA_ETH_IO_TX_META_DESC_EXT_VALID_MASK; + + /* bits 0-9 of the mss */ + meta_desc->word2 |= ((u32)ena_meta->mss << + ENA_ETH_IO_TX_META_DESC_MSS_LO_SHIFT) & + ENA_ETH_IO_TX_META_DESC_MSS_LO_MASK; + /* bits 10-13 of the mss */ + meta_desc->len_ctrl |= ((ena_meta->mss >> 10) << + ENA_ETH_IO_TX_META_DESC_MSS_HI_SHIFT) & + ENA_ETH_IO_TX_META_DESC_MSS_HI_MASK; + + /* Extended meta desc */ + meta_desc->len_ctrl |= ENA_ETH_IO_TX_META_DESC_ETH_META_TYPE_MASK; + meta_desc->len_ctrl |= ((u32)io_sq->phase << + ENA_ETH_IO_TX_META_DESC_PHASE_SHIFT) & + ENA_ETH_IO_TX_META_DESC_PHASE_MASK; + + meta_desc->len_ctrl |= ENA_ETH_IO_TX_META_DESC_FIRST_MASK; + meta_desc->len_ctrl |= ENA_ETH_IO_TX_META_DESC_META_STORE_MASK; + + meta_desc->word2 |= ena_meta->l3_hdr_len & + ENA_ETH_IO_TX_META_DESC_L3_HDR_LEN_MASK; + meta_desc->word2 |= (ena_meta->l3_hdr_offset << + ENA_ETH_IO_TX_META_DESC_L3_HDR_OFF_SHIFT) & + ENA_ETH_IO_TX_META_DESC_L3_HDR_OFF_MASK; + + meta_desc->word2 |= ((u32)ena_meta->l4_hdr_len << + ENA_ETH_IO_TX_META_DESC_L4_HDR_LEN_IN_WORDS_SHIFT) & + ENA_ETH_IO_TX_META_DESC_L4_HDR_LEN_IN_WORDS_MASK; + + return ena_com_sq_update_tail(io_sq); +} + +static int ena_com_create_and_store_tx_meta_desc(struct ena_com_io_sq *io_sq, + struct ena_com_tx_ctx *ena_tx_ctx, + bool *have_meta) +{ + struct ena_com_tx_meta *ena_meta = &ena_tx_ctx->ena_meta; + + /* When disable meta caching is 
set, don't bother to save the meta and + * compare it to the stored version, just create the meta + */ + if (io_sq->disable_meta_caching) { + *have_meta = true; + return ena_com_create_meta(io_sq, ena_meta); + } + + if (ena_com_meta_desc_changed(io_sq, ena_tx_ctx)) { + *have_meta = true; + /* Cache the meta desc */ + memcpy(&io_sq->cached_tx_meta, ena_meta, + sizeof(struct ena_com_tx_meta)); + return ena_com_create_meta(io_sq, ena_meta); + } + + *have_meta = false; + return 0; +} + +static void ena_com_rx_set_flags(struct ena_com_io_cq *io_cq, + struct ena_com_rx_ctx *ena_rx_ctx, + struct ena_eth_io_rx_cdesc_base *cdesc) +{ + ena_rx_ctx->l3_proto = cdesc->status & + ENA_ETH_IO_RX_CDESC_BASE_L3_PROTO_IDX_MASK; + ena_rx_ctx->l4_proto = + (cdesc->status & ENA_ETH_IO_RX_CDESC_BASE_L4_PROTO_IDX_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_L4_PROTO_IDX_SHIFT; + ena_rx_ctx->l3_csum_err = + !!((cdesc->status & ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM_ERR_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM_ERR_SHIFT); + ena_rx_ctx->l4_csum_err = + !!((cdesc->status & ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_ERR_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_ERR_SHIFT); + ena_rx_ctx->l4_csum_checked = + !!((cdesc->status & ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_CHECKED_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_CHECKED_SHIFT); + ena_rx_ctx->hash = cdesc->hash; + ena_rx_ctx->frag = + (cdesc->status & ENA_ETH_IO_RX_CDESC_BASE_IPV4_FRAG_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_IPV4_FRAG_SHIFT; + + netdev_dbg(ena_com_io_cq_to_ena_dev(io_cq)->net_device, + "l3_proto %d l4_proto %d l3_csum_err %d l4_csum_err %d hash %d frag %d cdesc_status %x\n", + ena_rx_ctx->l3_proto, ena_rx_ctx->l4_proto, + ena_rx_ctx->l3_csum_err, ena_rx_ctx->l4_csum_err, + ena_rx_ctx->hash, ena_rx_ctx->frag, cdesc->status); +} + +/*****************************************************************************/ +/***************************** API **********************************/ +/*****************************************************************************/ + +int ena_com_prepare_tx(struct ena_com_io_sq *io_sq, + struct ena_com_tx_ctx *ena_tx_ctx, + int *nb_hw_desc) +{ + struct ena_eth_io_tx_desc *desc = NULL; + struct ena_com_buf *ena_bufs = ena_tx_ctx->ena_bufs; + void *buffer_to_push = ena_tx_ctx->push_header; + u16 header_len = ena_tx_ctx->header_len; + u16 num_bufs = ena_tx_ctx->num_bufs; + u16 start_tail = io_sq->tail; + int i, rc; + bool have_meta; + u64 addr_hi; + + WARN(io_sq->direction != ENA_COM_IO_QUEUE_DIRECTION_TX, "wrong Q type"); + + /* num_bufs +1 for potential meta desc */ + if (unlikely(!ena_com_sq_have_enough_space(io_sq, num_bufs + 1))) { + netdev_dbg(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Not enough space in the tx queue\n"); + return -ENOMEM; + } + + if (unlikely(header_len > io_sq->tx_max_header_size)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Header size is too large %d max header: %d\n", + header_len, io_sq->tx_max_header_size); + return -EINVAL; + } + + if (unlikely(io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV && + !buffer_to_push)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Push header wasn't provided in LLQ mode\n"); + return -EINVAL; + } + + rc = ena_com_write_header_to_bounce(io_sq, buffer_to_push, header_len); + if (unlikely(rc)) + return rc; + + rc = ena_com_create_and_store_tx_meta_desc(io_sq, ena_tx_ctx, &have_meta); + if (unlikely(rc)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Failed to create and store tx meta desc\n"); + return rc; + } + + /* If the caller 
doesn't want to send packets */ + if (unlikely(!num_bufs && !header_len)) { + rc = ena_com_close_bounce_buffer(io_sq); + if (rc) + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Failed to write buffers to LLQ\n"); + *nb_hw_desc = io_sq->tail - start_tail; + return rc; + } + + desc = get_sq_desc(io_sq); + if (unlikely(!desc)) + return -EFAULT; + memset(desc, 0x0, sizeof(struct ena_eth_io_tx_desc)); + + /* Set first desc when we don't have meta descriptor */ + if (!have_meta) + desc->len_ctrl |= ENA_ETH_IO_TX_DESC_FIRST_MASK; + + desc->buff_addr_hi_hdr_sz |= ((u32)header_len << + ENA_ETH_IO_TX_DESC_HEADER_LENGTH_SHIFT) & + ENA_ETH_IO_TX_DESC_HEADER_LENGTH_MASK; + desc->len_ctrl |= ((u32)io_sq->phase << ENA_ETH_IO_TX_DESC_PHASE_SHIFT) & + ENA_ETH_IO_TX_DESC_PHASE_MASK; + + desc->len_ctrl |= ENA_ETH_IO_TX_DESC_COMP_REQ_MASK; + + /* Bits 0-9 */ + desc->meta_ctrl |= ((u32)ena_tx_ctx->req_id << + ENA_ETH_IO_TX_DESC_REQ_ID_LO_SHIFT) & + ENA_ETH_IO_TX_DESC_REQ_ID_LO_MASK; + + desc->meta_ctrl |= (ena_tx_ctx->df << + ENA_ETH_IO_TX_DESC_DF_SHIFT) & + ENA_ETH_IO_TX_DESC_DF_MASK; + + /* Bits 10-15 */ + desc->len_ctrl |= ((ena_tx_ctx->req_id >> 10) << + ENA_ETH_IO_TX_DESC_REQ_ID_HI_SHIFT) & + ENA_ETH_IO_TX_DESC_REQ_ID_HI_MASK; + + if (ena_tx_ctx->meta_valid) { + desc->meta_ctrl |= (ena_tx_ctx->tso_enable << + ENA_ETH_IO_TX_DESC_TSO_EN_SHIFT) & + ENA_ETH_IO_TX_DESC_TSO_EN_MASK; + desc->meta_ctrl |= ena_tx_ctx->l3_proto & + ENA_ETH_IO_TX_DESC_L3_PROTO_IDX_MASK; + desc->meta_ctrl |= (ena_tx_ctx->l4_proto << + ENA_ETH_IO_TX_DESC_L4_PROTO_IDX_SHIFT) & + ENA_ETH_IO_TX_DESC_L4_PROTO_IDX_MASK; + desc->meta_ctrl |= (ena_tx_ctx->l3_csum_enable << + ENA_ETH_IO_TX_DESC_L3_CSUM_EN_SHIFT) & + ENA_ETH_IO_TX_DESC_L3_CSUM_EN_MASK; + desc->meta_ctrl |= (ena_tx_ctx->l4_csum_enable << + ENA_ETH_IO_TX_DESC_L4_CSUM_EN_SHIFT) & + ENA_ETH_IO_TX_DESC_L4_CSUM_EN_MASK; + desc->meta_ctrl |= (ena_tx_ctx->l4_csum_partial << + ENA_ETH_IO_TX_DESC_L4_CSUM_PARTIAL_SHIFT) & + ENA_ETH_IO_TX_DESC_L4_CSUM_PARTIAL_MASK; + } + + for (i = 0; i < num_bufs; i++) { + /* The first desc share the same desc as the header */ + if (likely(i != 0)) { + rc = ena_com_sq_update_tail(io_sq); + if (unlikely(rc)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Failed to update sq tail\n"); + return rc; + } + + desc = get_sq_desc(io_sq); + if (unlikely(!desc)) + return -EFAULT; + + memset(desc, 0x0, sizeof(struct ena_eth_io_tx_desc)); + + desc->len_ctrl |= ((u32)io_sq->phase << + ENA_ETH_IO_TX_DESC_PHASE_SHIFT) & + ENA_ETH_IO_TX_DESC_PHASE_MASK; + } + + desc->len_ctrl |= ena_bufs->len & + ENA_ETH_IO_TX_DESC_LENGTH_MASK; + + addr_hi = ((ena_bufs->paddr & + GENMASK_ULL(io_sq->dma_addr_bits - 1, 32)) >> 32); + + desc->buff_addr_lo = (u32)ena_bufs->paddr; + desc->buff_addr_hi_hdr_sz |= addr_hi & + ENA_ETH_IO_TX_DESC_ADDR_HI_MASK; + ena_bufs++; + } + + /* set the last desc indicator */ + desc->len_ctrl |= ENA_ETH_IO_TX_DESC_LAST_MASK; + + rc = ena_com_sq_update_tail(io_sq); + if (unlikely(rc)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Failed to update sq tail of the last descriptor\n"); + return rc; + } + + rc = ena_com_close_bounce_buffer(io_sq); + + *nb_hw_desc = io_sq->tail - start_tail; + return rc; +} + +int ena_com_rx_pkt(struct ena_com_io_cq *io_cq, + struct ena_com_io_sq *io_sq, + struct ena_com_rx_ctx *ena_rx_ctx) +{ + struct ena_com_rx_buf_info *ena_buf = &ena_rx_ctx->ena_bufs[0]; + struct ena_eth_io_rx_cdesc_base *cdesc = NULL; + u16 q_depth = io_cq->q_depth; + u16 cdesc_idx = 0; + u16 nb_hw_desc; + u16 i = 
0; + + WARN(io_cq->direction != ENA_COM_IO_QUEUE_DIRECTION_RX, "wrong Q type"); + + nb_hw_desc = ena_com_cdesc_rx_pkt_get(io_cq, &cdesc_idx); + if (nb_hw_desc == 0) { + ena_rx_ctx->descs = nb_hw_desc; + return 0; + } + + netdev_dbg(ena_com_io_cq_to_ena_dev(io_cq)->net_device, + "Fetch rx packet: queue %d completed desc: %d\n", io_cq->qid, + nb_hw_desc); + + if (unlikely(nb_hw_desc > ena_rx_ctx->max_bufs)) { + netdev_err(ena_com_io_cq_to_ena_dev(io_cq)->net_device, + "Too many RX cdescs (%d) > MAX(%d)\n", nb_hw_desc, + ena_rx_ctx->max_bufs); + return -ENOSPC; + } + + cdesc = ena_com_rx_cdesc_idx_to_ptr(io_cq, cdesc_idx); + ena_rx_ctx->pkt_offset = cdesc->offset; + + do { + ena_buf[i].len = cdesc->length; + ena_buf[i].req_id = cdesc->req_id; + if (unlikely(ena_buf[i].req_id >= q_depth)) + return -EIO; + + if (++i >= nb_hw_desc) + break; + + cdesc = ena_com_rx_cdesc_idx_to_ptr(io_cq, cdesc_idx + i); + + } while (1); + + /* Update SQ head ptr */ + io_sq->next_to_comp += nb_hw_desc; + + netdev_dbg(ena_com_io_cq_to_ena_dev(io_cq)->net_device, + "[%s][QID#%d] Updating SQ head to: %d\n", __func__, + io_sq->qid, io_sq->next_to_comp); + + /* Get rx flags from the last pkt */ + ena_com_rx_set_flags(io_cq, ena_rx_ctx, cdesc); + + ena_rx_ctx->descs = nb_hw_desc; + + return 0; +} + +int ena_com_add_single_rx_desc(struct ena_com_io_sq *io_sq, + struct ena_com_buf *ena_buf, + u16 req_id) +{ + struct ena_eth_io_rx_desc *desc; + + WARN(io_sq->direction != ENA_COM_IO_QUEUE_DIRECTION_RX, "wrong Q type"); + + if (unlikely(!ena_com_sq_have_enough_space(io_sq, 1))) + return -ENOSPC; + + desc = get_sq_desc(io_sq); + if (unlikely(!desc)) + return -EFAULT; + + memset(desc, 0x0, sizeof(struct ena_eth_io_rx_desc)); + + desc->length = ena_buf->len; + + desc->ctrl = ENA_ETH_IO_RX_DESC_FIRST_MASK | + ENA_ETH_IO_RX_DESC_LAST_MASK | + ENA_ETH_IO_RX_DESC_COMP_REQ_MASK | + (io_sq->phase & ENA_ETH_IO_RX_DESC_PHASE_MASK); + + desc->req_id = req_id; + + netdev_dbg(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "[%s] Adding single RX desc, Queue: %u, req_id: %u\n", + __func__, io_sq->qid, req_id); + + desc->buff_addr_lo = (u32)ena_buf->paddr; + desc->buff_addr_hi = + ((ena_buf->paddr & GENMASK_ULL(io_sq->dma_addr_bits - 1, 32)) >> 32); + + return ena_com_sq_update_tail(io_sq); +} + +bool ena_com_cq_empty(struct ena_com_io_cq *io_cq) +{ + struct ena_eth_io_rx_cdesc_base *cdesc; + + cdesc = ena_com_get_next_rx_cdesc(io_cq); + if (cdesc) + return false; + else + return true; +} diff --git a/drivers/amazon/net/ena/ena_eth_com.h b/drivers/amazon/net/ena/ena_eth_com.h new file mode 100644 index 0000000000000..028270a069d86 --- /dev/null +++ b/drivers/amazon/net/ena/ena_eth_com.h @@ -0,0 +1,232 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */
+
+#ifndef ENA_ETH_COM_H_
+#define ENA_ETH_COM_H_
+
+#include "ena_com.h"
+
+struct ena_com_tx_ctx {
+	struct ena_com_tx_meta ena_meta;
+	struct ena_com_buf *ena_bufs;
+	/* For LLQ, header buffer - pushed to the device mem space */
+	void *push_header;
+
+	enum ena_eth_io_l3_proto_index l3_proto;
+	enum ena_eth_io_l4_proto_index l4_proto;
+	u16 num_bufs;
+	u16 req_id;
+	/* For regular queue, indicate the size of the header
+	 * For LLQ, indicate the size of the pushed buffer
+	 */
+	u16 header_len;
+
+	u8 meta_valid;
+	u8 tso_enable;
+	u8 l3_csum_enable;
+	u8 l4_csum_enable;
+	u8 l4_csum_partial;
+	u8 df; /* Don't fragment */
+};
+
+struct ena_com_rx_ctx {
+	struct ena_com_rx_buf_info *ena_bufs;
+	enum ena_eth_io_l3_proto_index l3_proto;
+	enum ena_eth_io_l4_proto_index l4_proto;
+	bool l3_csum_err;
+	bool l4_csum_err;
+	u8 l4_csum_checked;
+	/* fragmented packet */
+	bool frag;
+	u32 hash;
+	u16 descs;
+	u16 max_bufs;
+	u8 pkt_offset;
+};
+
+int ena_com_prepare_tx(struct ena_com_io_sq *io_sq,
+		       struct ena_com_tx_ctx *ena_tx_ctx,
+		       int *nb_hw_desc);
+
+int ena_com_rx_pkt(struct ena_com_io_cq *io_cq,
+		   struct ena_com_io_sq *io_sq,
+		   struct ena_com_rx_ctx *ena_rx_ctx);
+
+int ena_com_add_single_rx_desc(struct ena_com_io_sq *io_sq,
+			       struct ena_com_buf *ena_buf,
+			       u16 req_id);
+
+bool ena_com_cq_empty(struct ena_com_io_cq *io_cq);
+
+static inline void ena_com_unmask_intr(struct ena_com_io_cq *io_cq,
+				       struct ena_eth_io_intr_reg *intr_reg)
+{
+	writel(intr_reg->intr_control, io_cq->unmask_reg);
+}
+
+static inline int ena_com_free_q_entries(struct ena_com_io_sq *io_sq)
+{
+	u16 tail, next_to_comp, cnt;
+
+	next_to_comp = io_sq->next_to_comp;
+	tail = io_sq->tail;
+	cnt = tail - next_to_comp;
+
+	return io_sq->q_depth - 1 - cnt;
+}
+
+/* Check if the submission queue has enough space to hold required_buffers */
+static inline bool ena_com_sq_have_enough_space(struct ena_com_io_sq *io_sq,
+						u16 required_buffers)
+{
+	int temp;
+
+	if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST)
+		return ena_com_free_q_entries(io_sq) >= required_buffers;
+
+	/* This calculation doesn't need to be 100% accurate. So to reduce
+	 * the calculation overhead just subtract 2 lines from the free descs
+	 * (one for the header line and one to compensate for the division
+	 * round-down).
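+	 *
+	 * An illustrative worked example, with hypothetical numbers: for
+	 * 128-byte LLQ entries and 16-byte descriptors, descs_per_entry is
+	 * 8, so required_buffers = 17 yields temp = 17 / 8 + 2 = 4, and the
+	 * queue is reported full unless more than 4 entries are free.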
+	 */
+	temp = required_buffers / io_sq->llq_info.descs_per_entry + 2;
+
+	return ena_com_free_q_entries(io_sq) > temp;
+}
+
+static inline bool ena_com_meta_desc_changed(struct ena_com_io_sq *io_sq,
+					     struct ena_com_tx_ctx *ena_tx_ctx)
+{
+	if (!ena_tx_ctx->meta_valid)
+		return false;
+
+	return !!memcmp(&io_sq->cached_tx_meta,
+			&ena_tx_ctx->ena_meta,
+			sizeof(struct ena_com_tx_meta));
+}
+
+static inline bool is_llq_max_tx_burst_exists(struct ena_com_io_sq *io_sq)
+{
+	return (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) &&
+	       io_sq->llq_info.max_entries_in_tx_burst > 0;
+}
+
+static inline bool ena_com_is_doorbell_needed(struct ena_com_io_sq *io_sq,
+					      struct ena_com_tx_ctx *ena_tx_ctx)
+{
+	struct ena_com_llq_info *llq_info;
+	int descs_after_first_entry;
+	int num_entries_needed = 1;
+	u16 num_descs;
+
+	if (!is_llq_max_tx_burst_exists(io_sq))
+		return false;
+
+	llq_info = &io_sq->llq_info;
+	num_descs = ena_tx_ctx->num_bufs;
+
+	if (llq_info->disable_meta_caching ||
+	    unlikely(ena_com_meta_desc_changed(io_sq, ena_tx_ctx)))
+		++num_descs;
+
+	if (num_descs > llq_info->descs_num_before_header) {
+		descs_after_first_entry = num_descs - llq_info->descs_num_before_header;
+		num_entries_needed += DIV_ROUND_UP(descs_after_first_entry,
+						   llq_info->descs_per_entry);
+	}
+
+	netdev_dbg(ena_com_io_sq_to_ena_dev(io_sq)->net_device,
+		   "Queue: %d num_descs: %d num_entries_needed: %d\n",
+		   io_sq->qid, num_descs, num_entries_needed);
+
+	return num_entries_needed > io_sq->entries_in_tx_burst_left;
+}
+
+static inline int ena_com_write_sq_doorbell(struct ena_com_io_sq *io_sq)
+{
+	u16 max_entries_in_tx_burst = io_sq->llq_info.max_entries_in_tx_burst;
+	u16 tail = io_sq->tail;
+
+	netdev_dbg(ena_com_io_sq_to_ena_dev(io_sq)->net_device,
+		   "Write submission queue doorbell for queue: %d tail: %d\n",
+		   io_sq->qid, tail);
+
+	writel(tail, io_sq->db_addr);
+
+	if (is_llq_max_tx_burst_exists(io_sq)) {
+		netdev_dbg(ena_com_io_sq_to_ena_dev(io_sq)->net_device,
+			   "Reset available entries in tx burst for queue %d to %d\n",
+			   io_sq->qid, max_entries_in_tx_burst);
+		io_sq->entries_in_tx_burst_left = max_entries_in_tx_burst;
+	}
+
+	return 0;
+}
+
+static inline void ena_com_update_numa_node(struct ena_com_io_cq *io_cq,
+					    u8 numa_node)
+{
+	struct ena_eth_io_numa_node_cfg_reg numa_cfg;
+
+	if (!io_cq->numa_node_cfg_reg)
+		return;
+
+	numa_cfg.numa_cfg = (numa_node & ENA_ETH_IO_NUMA_NODE_CFG_REG_NUMA_MASK)
+		| ENA_ETH_IO_NUMA_NODE_CFG_REG_ENABLED_MASK;
+
+	writel(numa_cfg.numa_cfg, io_cq->numa_node_cfg_reg);
+}
+
+static inline void ena_com_comp_ack(struct ena_com_io_sq *io_sq, u16 elem)
+{
+	io_sq->next_to_comp += elem;
+}
+
+static inline void ena_com_cq_inc_head(struct ena_com_io_cq *io_cq)
+{
+	io_cq->head++;
+
+	/* Switch phase bit in case of wrap around */
+	if (unlikely((io_cq->head & (io_cq->q_depth - 1)) == 0))
+		io_cq->phase ^= 1;
+}
+
+static inline int ena_com_tx_comp_req_id_get(struct ena_com_io_cq *io_cq,
+					     u16 *req_id)
+{
+	u8 expected_phase, cdesc_phase;
+	struct ena_eth_io_tx_cdesc *cdesc;
+	u16 masked_head;
+
+	masked_head = io_cq->head & (io_cq->q_depth - 1);
+	expected_phase = io_cq->phase;
+
+	cdesc = (struct ena_eth_io_tx_cdesc *)
+		((uintptr_t)io_cq->cdesc_addr.virt_addr +
+		(masked_head * io_cq->cdesc_entry_size_in_bytes));
+
+	/* When the current completion descriptor phase isn't the same as the
+	 * expected, it means that the device has not updated this completion
+	 * yet.
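+	 *
+	 * The phase bit flips on every queue wrap (see ena_com_cq_inc_head()
+	 * above), so with an illustrative q_depth of 4: lap 0 produces
+	 * completions with one phase value and lap 1 with the other; a stale
+	 * descriptor left over from the previous lap still carries the old
+	 * phase and is reported as -EAGAIN below rather than consumed.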
+	 */
+	cdesc_phase = READ_ONCE(cdesc->flags) & ENA_ETH_IO_TX_CDESC_PHASE_MASK;
+	if (cdesc_phase != expected_phase)
+		return -EAGAIN;
+
+	dma_rmb();
+
+	*req_id = READ_ONCE(cdesc->req_id);
+	if (unlikely(*req_id >= io_cq->q_depth)) {
+		netdev_err(ena_com_io_cq_to_ena_dev(io_cq)->net_device,
+			   "Invalid req id %d\n", cdesc->req_id);
+		return -EINVAL;
+	}
+
+	ena_com_cq_inc_head(io_cq);
+
+	return 0;
+}
+
+#endif /* ENA_ETH_COM_H_ */
diff --git a/drivers/amazon/net/ena/ena_eth_io_defs.h b/drivers/amazon/net/ena/ena_eth_io_defs.h
new file mode 100755
index 0000000000000..332ac0d28ac7a
--- /dev/null
+++ b/drivers/amazon/net/ena/ena_eth_io_defs.h
@@ -0,0 +1,390 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+#ifndef _ENA_ETH_IO_H_
+#define _ENA_ETH_IO_H_
+
+enum ena_eth_io_l3_proto_index {
+	ENA_ETH_IO_L3_PROTO_UNKNOWN = 0,
+	ENA_ETH_IO_L3_PROTO_IPV4 = 8,
+	ENA_ETH_IO_L3_PROTO_IPV6 = 11,
+	ENA_ETH_IO_L3_PROTO_FCOE = 21,
+	ENA_ETH_IO_L3_PROTO_ROCE = 22,
+};
+
+enum ena_eth_io_l4_proto_index {
+	ENA_ETH_IO_L4_PROTO_UNKNOWN = 0,
+	ENA_ETH_IO_L4_PROTO_TCP = 12,
+	ENA_ETH_IO_L4_PROTO_UDP = 13,
+	ENA_ETH_IO_L4_PROTO_ROUTEABLE_ROCE = 23,
+};
+
+struct ena_eth_io_tx_desc {
+	/* 15:0 : length - Buffer length in bytes, must
+	 *    include any packet trailers that the ENA is supposed
+	 *    to update like End-to-End CRC, Authentication GMAC
+	 *    etc. This length must not include the
+	 *    'Push_Buffer' length. This length must not include
+	 *    the 4 bytes added at the end for the 802.3 Ethernet FCS
+	 * 21:16 : req_id_hi - Request ID[15:10]
+	 * 22 : reserved22 - MBZ
+	 * 23 : meta_desc - MBZ
+	 * 24 : phase
+	 * 25 : reserved1 - MBZ
+	 * 26 : first - Indicates first descriptor in
+	 *    transaction
+	 * 27 : last - Indicates last descriptor in
+	 *    transaction
+	 * 28 : comp_req - Indicates whether completion
+	 *    should be posted, after packet is transmitted.
+	 *    Valid only for first descriptor
+	 * 30:29 : reserved29 - MBZ
+	 * 31 : reserved31 - MBZ
+	 */
+	u32 len_ctrl;
+
+	/* 3:0 : l3_proto_idx - L3 protocol. This field is
+	 *    required when l3_csum_en, l3_csum or tso_en are set.
+	 * 4 : DF - IPv4 DF, must be 0 if packet is IPv4 and
+	 *    the DF flag of the IPv4 header is 0. Otherwise must
+	 *    be set to 1
+	 * 6:5 : reserved5
+	 * 7 : tso_en - Enable TSO, for TCP only.
+	 * 12:8 : l4_proto_idx - L4 protocol. This field needs
+	 *    to be set when l4_csum_en or tso_en are set.
+	 * 13 : l3_csum_en - enable IPv4 header checksum.
+	 * 14 : l4_csum_en - enable TCP/UDP checksum.
+	 * 15 : ethernet_fcs_dis - when set, the controller
+	 *    will not append the 802.3 Ethernet Frame Check
+	 *    Sequence to the packet
+	 * 16 : reserved16
+	 * 17 : l4_csum_partial - L4 partial checksum. When
+	 *    set to 0, the ENA calculates the L4 checksum,
+	 *    where the Destination Address required for the
+	 *    TCP/UDP pseudo-header is taken from the actual
+	 *    packet L3 header. When set to 1, the ENA doesn't
+	 *    calculate the sum of the pseudo-header; instead,
+	 *    the checksum field of the L4 is used. When
+	 *    TSO is enabled, the checksum of the pseudo-header
+	 *    must not include the tcp length field. L4 partial
+	 *    checksum should be used for IPv6 packets that
+	 *    contain Routing Headers.
+	 * 20:18 : reserved18 - MBZ
+	 * 21 : reserved21 - MBZ
+	 * 31:22 : req_id_lo - Request ID[9:0]
+	 */
+	u32 meta_ctrl;
+
+	u32 buff_addr_lo;
+
+	/* address high and header size
+	 * 15:0 : addr_hi - Buffer Pointer[47:32]
+	 * 23:16 : reserved16_w2
+	 * 31:24 : header_length - Header length. For Low
For Low
+ * Latency Queues, this field indicates the number
+ * of bytes written to the headers' memory. For
+ * normal queues, if the packet is TCP or UDP, and longer
+ * than max_header_size, then this field should be
+ * set to the sum of L4 header offset and L4 header
+ * size (without options); otherwise, this field
+ * should be set to 0. For both modes, this field
+ * must not exceed the max_header_size.
+ * max_header_size value is reported by the Max
+ * Queues Feature descriptor
+ */
+ u32 buff_addr_hi_hdr_sz;
+};
+
+struct ena_eth_io_tx_meta_desc {
+ /* 9:0 : req_id_lo - Request ID[9:0]
+ * 11:10 : reserved10 - MBZ
+ * 12 : reserved12 - MBZ
+ * 13 : reserved13 - MBZ
+ * 14 : ext_valid - if set, offset fields in Word2
+ * are valid. Also MSS High in Word 0 and bits [31:24]
+ * in Word 3
+ * 15 : reserved15
+ * 19:16 : mss_hi
+ * 20 : eth_meta_type - 0: Tx Metadata Descriptor, 1:
+ * Extended Metadata Descriptor
+ * 21 : meta_store - Store extended metadata in queue
+ * cache
+ * 22 : reserved22 - MBZ
+ * 23 : meta_desc - MBO
+ * 24 : phase
+ * 25 : reserved25 - MBZ
+ * 26 : first - Indicates first descriptor in
+ * transaction
+ * 27 : last - Indicates last descriptor in
+ * transaction
+ * 28 : comp_req - Indicates whether completion
+ * should be posted, after packet is transmitted.
+ * Valid only for first descriptor
+ * 30:29 : reserved29 - MBZ
+ * 31 : reserved31 - MBZ
+ */
+ u32 len_ctrl;
+
+ /* 5:0 : req_id_hi
+ * 31:6 : reserved6 - MBZ
+ */
+ u32 word1;
+
+ /* 7:0 : l3_hdr_len
+ * 15:8 : l3_hdr_off
+ * 21:16 : l4_hdr_len_in_words - counts the L4 header
+ * length in words. There is an explicit assumption
+ * that the L4 header appears right after the L3 header
+ * and the L4 offset is based on l3_hdr_off + l3_hdr_len
+ * 31:22 : mss_lo
+ */
+ u32 word2;
+
+ u32 reserved;
+};
+
+struct ena_eth_io_tx_cdesc {
+ /* Request ID[15:0] */
+ u16 req_id;
+
+ u8 status;
+
+ /* flags
+ * 0 : phase
+ * 7:1 : reserved1
+ */
+ u8 flags;
+
+ u16 sub_qid;
+
+ u16 sq_head_idx;
+};
+
+struct ena_eth_io_rx_desc {
+ /* In bytes. 0 means 64KB */
+ u16 length;
+
+ /* MBZ */
+ u8 reserved2;
+
+ /* 0 : phase
+ * 1 : reserved1 - MBZ
+ * 2 : first - Indicates first descriptor in
+ * transaction
+ * 3 : last - Indicates last descriptor in transaction
+ * 4 : comp_req
+ * 5 : reserved5 - MBO
+ * 7:6 : reserved6 - MBZ
+ */
+ u8 ctrl;
+
+ u16 req_id;
+
+ /* MBZ */
+ u16 reserved6;
+
+ u32 buff_addr_lo;
+
+ u16 buff_addr_hi;
+
+ /* MBZ */
+ u16 reserved16_w3;
+};
+
+/* 4-word format. Note: all ethernet parsing information is valid only when
+ * last=1
+ */
+struct ena_eth_io_rx_cdesc_base {
+ /* 4:0 : l3_proto_idx
+ * 6:5 : src_vlan_cnt
+ * 7 : reserved7 - MBZ
+ * 12:8 : l4_proto_idx
+ * 13 : l3_csum_err - when set, either an L3
+ * checksum error was detected, or the controller didn't
+ * validate the checksum. This bit is valid only when
+ * l3_proto_idx indicates an IPv4 packet
+ * 14 : l4_csum_err - when set, either an L4
+ * checksum error was detected, or the controller didn't
+ * validate the checksum. This bit is valid only when
+ * l4_proto_idx indicates a TCP/UDP packet, and
+ * ipv4_frag is not set. This bit is valid only when
+ * l4_csum_checked below is set.
+ * 15 : ipv4_frag - Indicates IPv4 fragmented packet + * 16 : l4_csum_checked - L4 checksum was verified + * (could be OK or error), when cleared the status of + * checksum is unknown + * 23:17 : reserved17 - MBZ + * 24 : phase + * 25 : l3_csum2 - second checksum engine result + * 26 : first - Indicates first descriptor in + * transaction + * 27 : last - Indicates last descriptor in + * transaction + * 29:28 : reserved28 + * 30 : buffer - 0: Metadata descriptor. 1: Buffer + * Descriptor was used + * 31 : reserved31 + */ + u32 status; + + u16 length; + + u16 req_id; + + /* 32-bit hash result */ + u32 hash; + + u16 sub_qid; + + u8 offset; + + u8 reserved; +}; + +/* 8-word format */ +struct ena_eth_io_rx_cdesc_ext { + struct ena_eth_io_rx_cdesc_base base; + + u32 buff_addr_lo; + + u16 buff_addr_hi; + + u16 reserved16; + + u32 reserved_w6; + + u32 reserved_w7; +}; + +struct ena_eth_io_intr_reg { + /* 14:0 : rx_intr_delay + * 29:15 : tx_intr_delay + * 30 : intr_unmask + * 31 : reserved + */ + u32 intr_control; +}; + +struct ena_eth_io_numa_node_cfg_reg { + /* 7:0 : numa + * 30:8 : reserved + * 31 : enabled + */ + u32 numa_cfg; +}; + +/* tx_desc */ +#define ENA_ETH_IO_TX_DESC_LENGTH_MASK GENMASK(15, 0) +#define ENA_ETH_IO_TX_DESC_REQ_ID_HI_SHIFT 16 +#define ENA_ETH_IO_TX_DESC_REQ_ID_HI_MASK GENMASK(21, 16) +#define ENA_ETH_IO_TX_DESC_META_DESC_SHIFT 23 +#define ENA_ETH_IO_TX_DESC_META_DESC_MASK BIT(23) +#define ENA_ETH_IO_TX_DESC_PHASE_SHIFT 24 +#define ENA_ETH_IO_TX_DESC_PHASE_MASK BIT(24) +#define ENA_ETH_IO_TX_DESC_FIRST_SHIFT 26 +#define ENA_ETH_IO_TX_DESC_FIRST_MASK BIT(26) +#define ENA_ETH_IO_TX_DESC_LAST_SHIFT 27 +#define ENA_ETH_IO_TX_DESC_LAST_MASK BIT(27) +#define ENA_ETH_IO_TX_DESC_COMP_REQ_SHIFT 28 +#define ENA_ETH_IO_TX_DESC_COMP_REQ_MASK BIT(28) +#define ENA_ETH_IO_TX_DESC_L3_PROTO_IDX_MASK GENMASK(3, 0) +#define ENA_ETH_IO_TX_DESC_DF_SHIFT 4 +#define ENA_ETH_IO_TX_DESC_DF_MASK BIT(4) +#define ENA_ETH_IO_TX_DESC_TSO_EN_SHIFT 7 +#define ENA_ETH_IO_TX_DESC_TSO_EN_MASK BIT(7) +#define ENA_ETH_IO_TX_DESC_L4_PROTO_IDX_SHIFT 8 +#define ENA_ETH_IO_TX_DESC_L4_PROTO_IDX_MASK GENMASK(12, 8) +#define ENA_ETH_IO_TX_DESC_L3_CSUM_EN_SHIFT 13 +#define ENA_ETH_IO_TX_DESC_L3_CSUM_EN_MASK BIT(13) +#define ENA_ETH_IO_TX_DESC_L4_CSUM_EN_SHIFT 14 +#define ENA_ETH_IO_TX_DESC_L4_CSUM_EN_MASK BIT(14) +#define ENA_ETH_IO_TX_DESC_ETHERNET_FCS_DIS_SHIFT 15 +#define ENA_ETH_IO_TX_DESC_ETHERNET_FCS_DIS_MASK BIT(15) +#define ENA_ETH_IO_TX_DESC_L4_CSUM_PARTIAL_SHIFT 17 +#define ENA_ETH_IO_TX_DESC_L4_CSUM_PARTIAL_MASK BIT(17) +#define ENA_ETH_IO_TX_DESC_REQ_ID_LO_SHIFT 22 +#define ENA_ETH_IO_TX_DESC_REQ_ID_LO_MASK GENMASK(31, 22) +#define ENA_ETH_IO_TX_DESC_ADDR_HI_MASK GENMASK(15, 0) +#define ENA_ETH_IO_TX_DESC_HEADER_LENGTH_SHIFT 24 +#define ENA_ETH_IO_TX_DESC_HEADER_LENGTH_MASK GENMASK(31, 24) + +/* tx_meta_desc */ +#define ENA_ETH_IO_TX_META_DESC_REQ_ID_LO_MASK GENMASK(9, 0) +#define ENA_ETH_IO_TX_META_DESC_EXT_VALID_SHIFT 14 +#define ENA_ETH_IO_TX_META_DESC_EXT_VALID_MASK BIT(14) +#define ENA_ETH_IO_TX_META_DESC_MSS_HI_SHIFT 16 +#define ENA_ETH_IO_TX_META_DESC_MSS_HI_MASK GENMASK(19, 16) +#define ENA_ETH_IO_TX_META_DESC_ETH_META_TYPE_SHIFT 20 +#define ENA_ETH_IO_TX_META_DESC_ETH_META_TYPE_MASK BIT(20) +#define ENA_ETH_IO_TX_META_DESC_META_STORE_SHIFT 21 +#define ENA_ETH_IO_TX_META_DESC_META_STORE_MASK BIT(21) +#define ENA_ETH_IO_TX_META_DESC_META_DESC_SHIFT 23 +#define ENA_ETH_IO_TX_META_DESC_META_DESC_MASK BIT(23) +#define ENA_ETH_IO_TX_META_DESC_PHASE_SHIFT 24 +#define ENA_ETH_IO_TX_META_DESC_PHASE_MASK 
BIT(24) +#define ENA_ETH_IO_TX_META_DESC_FIRST_SHIFT 26 +#define ENA_ETH_IO_TX_META_DESC_FIRST_MASK BIT(26) +#define ENA_ETH_IO_TX_META_DESC_LAST_SHIFT 27 +#define ENA_ETH_IO_TX_META_DESC_LAST_MASK BIT(27) +#define ENA_ETH_IO_TX_META_DESC_COMP_REQ_SHIFT 28 +#define ENA_ETH_IO_TX_META_DESC_COMP_REQ_MASK BIT(28) +#define ENA_ETH_IO_TX_META_DESC_REQ_ID_HI_MASK GENMASK(5, 0) +#define ENA_ETH_IO_TX_META_DESC_L3_HDR_LEN_MASK GENMASK(7, 0) +#define ENA_ETH_IO_TX_META_DESC_L3_HDR_OFF_SHIFT 8 +#define ENA_ETH_IO_TX_META_DESC_L3_HDR_OFF_MASK GENMASK(15, 8) +#define ENA_ETH_IO_TX_META_DESC_L4_HDR_LEN_IN_WORDS_SHIFT 16 +#define ENA_ETH_IO_TX_META_DESC_L4_HDR_LEN_IN_WORDS_MASK GENMASK(21, 16) +#define ENA_ETH_IO_TX_META_DESC_MSS_LO_SHIFT 22 +#define ENA_ETH_IO_TX_META_DESC_MSS_LO_MASK GENMASK(31, 22) + +/* tx_cdesc */ +#define ENA_ETH_IO_TX_CDESC_PHASE_MASK BIT(0) + +/* rx_desc */ +#define ENA_ETH_IO_RX_DESC_PHASE_MASK BIT(0) +#define ENA_ETH_IO_RX_DESC_FIRST_SHIFT 2 +#define ENA_ETH_IO_RX_DESC_FIRST_MASK BIT(2) +#define ENA_ETH_IO_RX_DESC_LAST_SHIFT 3 +#define ENA_ETH_IO_RX_DESC_LAST_MASK BIT(3) +#define ENA_ETH_IO_RX_DESC_COMP_REQ_SHIFT 4 +#define ENA_ETH_IO_RX_DESC_COMP_REQ_MASK BIT(4) + +/* rx_cdesc_base */ +#define ENA_ETH_IO_RX_CDESC_BASE_L3_PROTO_IDX_MASK GENMASK(4, 0) +#define ENA_ETH_IO_RX_CDESC_BASE_SRC_VLAN_CNT_SHIFT 5 +#define ENA_ETH_IO_RX_CDESC_BASE_SRC_VLAN_CNT_MASK GENMASK(6, 5) +#define ENA_ETH_IO_RX_CDESC_BASE_L4_PROTO_IDX_SHIFT 8 +#define ENA_ETH_IO_RX_CDESC_BASE_L4_PROTO_IDX_MASK GENMASK(12, 8) +#define ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM_ERR_SHIFT 13 +#define ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM_ERR_MASK BIT(13) +#define ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_ERR_SHIFT 14 +#define ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_ERR_MASK BIT(14) +#define ENA_ETH_IO_RX_CDESC_BASE_IPV4_FRAG_SHIFT 15 +#define ENA_ETH_IO_RX_CDESC_BASE_IPV4_FRAG_MASK BIT(15) +#define ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_CHECKED_SHIFT 16 +#define ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_CHECKED_MASK BIT(16) +#define ENA_ETH_IO_RX_CDESC_BASE_PHASE_SHIFT 24 +#define ENA_ETH_IO_RX_CDESC_BASE_PHASE_MASK BIT(24) +#define ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM2_SHIFT 25 +#define ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM2_MASK BIT(25) +#define ENA_ETH_IO_RX_CDESC_BASE_FIRST_SHIFT 26 +#define ENA_ETH_IO_RX_CDESC_BASE_FIRST_MASK BIT(26) +#define ENA_ETH_IO_RX_CDESC_BASE_LAST_SHIFT 27 +#define ENA_ETH_IO_RX_CDESC_BASE_LAST_MASK BIT(27) +#define ENA_ETH_IO_RX_CDESC_BASE_BUFFER_SHIFT 30 +#define ENA_ETH_IO_RX_CDESC_BASE_BUFFER_MASK BIT(30) + +/* intr_reg */ +#define ENA_ETH_IO_INTR_REG_RX_INTR_DELAY_MASK GENMASK(14, 0) +#define ENA_ETH_IO_INTR_REG_TX_INTR_DELAY_SHIFT 15 +#define ENA_ETH_IO_INTR_REG_TX_INTR_DELAY_MASK GENMASK(29, 15) +#define ENA_ETH_IO_INTR_REG_INTR_UNMASK_SHIFT 30 +#define ENA_ETH_IO_INTR_REG_INTR_UNMASK_MASK BIT(30) + +/* numa_node_cfg_reg */ +#define ENA_ETH_IO_NUMA_NODE_CFG_REG_NUMA_MASK GENMASK(7, 0) +#define ENA_ETH_IO_NUMA_NODE_CFG_REG_ENABLED_SHIFT 31 +#define ENA_ETH_IO_NUMA_NODE_CFG_REG_ENABLED_MASK BIT(31) + +#endif /* _ENA_ETH_IO_H_ */ diff --git a/drivers/amazon/net/ena/ena_ethtool.c b/drivers/amazon/net/ena/ena_ethtool.c new file mode 100644 index 0000000000000..08f7ee8fc151c --- /dev/null +++ b/drivers/amazon/net/ena/ena_ethtool.c @@ -0,0 +1,1328 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */
+
+#include <linux/ethtool.h>
+#include <linux/pci.h>
+#include <linux/version.h>
+
+#include "ena_netdev.h"
+#include "ena_xdp.h"
+#include "ena_phc.h"
+
+struct ena_stats {
+ char name[ETH_GSTRING_LEN];
+ int stat_offset;
+};
+
+#define ENA_STAT_ENA_COM_ADMIN_ENTRY(stat) { \
+ .name = #stat, \
+ .stat_offset = offsetof(struct ena_com_stats_admin, stat) / sizeof(u64) \
+}
+
+#define ENA_STAT_ENA_COM_PHC_ENTRY(stat) { \
+ .name = #stat, \
+ .stat_offset = offsetof(struct ena_com_stats_phc, stat) / sizeof(u64) \
+}
+
+#define ENA_STAT_ENTRY(stat, stat_type) { \
+ .name = #stat, \
+ .stat_offset = offsetof(struct ena_stats_##stat_type, stat) / sizeof(u64) \
+}
+
+#define ENA_STAT_HW_ENTRY(stat, stat_type) { \
+ .name = #stat, \
+ .stat_offset = offsetof(struct ena_admin_##stat_type, stat) / sizeof(u64) \
+}
+
+#define ENA_STAT_RX_ENTRY(stat) \
+ ENA_STAT_ENTRY(stat, rx)
+
+#define ENA_STAT_TX_ENTRY(stat) \
+ ENA_STAT_ENTRY(stat, tx)
+
+#define ENA_STAT_GLOBAL_ENTRY(stat) \
+ ENA_STAT_ENTRY(stat, dev)
+
+#define ENA_STAT_ENI_ENTRY(stat) \
+ ENA_STAT_HW_ENTRY(stat, eni_stats)
+
+#define ENA_STAT_ENA_SRD_ENTRY(stat) \
+ ENA_STAT_HW_ENTRY(stat, ena_srd_stats)
+
+#define ENA_STAT_ENA_SRD_MODE_ENTRY(stat) { \
+ .name = #stat, \
+ .stat_offset = offsetof(struct ena_admin_ena_srd_info, flags) / sizeof(u64) \
+}
+
+static const struct ena_stats ena_stats_global_strings[] = {
+ ENA_STAT_GLOBAL_ENTRY(tx_timeout),
+ ENA_STAT_GLOBAL_ENTRY(suspend),
+ ENA_STAT_GLOBAL_ENTRY(resume),
+ ENA_STAT_GLOBAL_ENTRY(wd_expired),
+ ENA_STAT_GLOBAL_ENTRY(interface_up),
+ ENA_STAT_GLOBAL_ENTRY(interface_down),
+ ENA_STAT_GLOBAL_ENTRY(admin_q_pause),
+ ENA_STAT_GLOBAL_ENTRY(reset_fail),
+};
+
+static const struct ena_stats ena_stats_eni_strings[] = {
+ ENA_STAT_ENI_ENTRY(bw_in_allowance_exceeded),
+ ENA_STAT_ENI_ENTRY(bw_out_allowance_exceeded),
+ ENA_STAT_ENI_ENTRY(pps_allowance_exceeded),
+ ENA_STAT_ENI_ENTRY(conntrack_allowance_exceeded),
+ ENA_STAT_ENI_ENTRY(linklocal_allowance_exceeded),
+};
+
+static const struct ena_stats ena_srd_info_strings[] = {
+ ENA_STAT_ENA_SRD_MODE_ENTRY(ena_srd_mode),
+ ENA_STAT_ENA_SRD_ENTRY(ena_srd_tx_pkts),
+ ENA_STAT_ENA_SRD_ENTRY(ena_srd_eligible_tx_pkts),
+ ENA_STAT_ENA_SRD_ENTRY(ena_srd_rx_pkts),
+ ENA_STAT_ENA_SRD_ENTRY(ena_srd_resource_utilization)
+};
+
+static const struct ena_stats ena_stats_tx_strings[] = {
+ ENA_STAT_TX_ENTRY(cnt),
+ ENA_STAT_TX_ENTRY(bytes),
+ ENA_STAT_TX_ENTRY(queue_stop),
+ ENA_STAT_TX_ENTRY(queue_wakeup),
+ ENA_STAT_TX_ENTRY(dma_mapping_err),
+ ENA_STAT_TX_ENTRY(linearize),
+ ENA_STAT_TX_ENTRY(linearize_failed),
+ ENA_STAT_TX_ENTRY(napi_comp),
+ ENA_STAT_TX_ENTRY(tx_poll),
+ ENA_STAT_TX_ENTRY(doorbells),
+ ENA_STAT_TX_ENTRY(prepare_ctx_err),
+ ENA_STAT_TX_ENTRY(bad_req_id),
+ ENA_STAT_TX_ENTRY(llq_buffer_copy),
+ ENA_STAT_TX_ENTRY(missed_tx),
+ ENA_STAT_TX_ENTRY(unmask_interrupt),
+#ifdef ENA_AF_XDP_SUPPORT
+ ENA_STAT_TX_ENTRY(xsk_need_wakeup_set),
+ ENA_STAT_TX_ENTRY(xsk_wakeup_request),
+#endif /* ENA_AF_XDP_SUPPORT */
+};
+
+static const struct ena_stats ena_stats_rx_strings[] = {
+ ENA_STAT_RX_ENTRY(cnt),
+ ENA_STAT_RX_ENTRY(bytes),
+ ENA_STAT_RX_ENTRY(rx_copybreak_pkt),
+ ENA_STAT_RX_ENTRY(csum_good),
+ ENA_STAT_RX_ENTRY(refil_partial),
+ ENA_STAT_RX_ENTRY(csum_bad),
+ ENA_STAT_RX_ENTRY(page_alloc_fail),
+ ENA_STAT_RX_ENTRY(skb_alloc_fail),
+ ENA_STAT_RX_ENTRY(dma_mapping_err),
+ ENA_STAT_RX_ENTRY(bad_desc_num),
+#ifdef ENA_BUSY_POLL_SUPPORT
+ ENA_STAT_RX_ENTRY(bp_yield),
+ ENA_STAT_RX_ENTRY(bp_missed),
+ ENA_STAT_RX_ENTRY(bp_cleaned),
+#endif
+ ENA_STAT_RX_ENTRY(bad_req_id),
+ 
ENA_STAT_RX_ENTRY(empty_rx_ring), + ENA_STAT_RX_ENTRY(csum_unchecked), +#ifdef ENA_XDP_SUPPORT + ENA_STAT_RX_ENTRY(xdp_aborted), + ENA_STAT_RX_ENTRY(xdp_drop), + ENA_STAT_RX_ENTRY(xdp_pass), + ENA_STAT_RX_ENTRY(xdp_tx), + ENA_STAT_RX_ENTRY(xdp_invalid), + ENA_STAT_RX_ENTRY(xdp_redirect), +#endif + ENA_STAT_RX_ENTRY(lpc_warm_up), + ENA_STAT_RX_ENTRY(lpc_full), + ENA_STAT_RX_ENTRY(lpc_wrong_numa), +#ifdef ENA_AF_XDP_SUPPORT + ENA_STAT_RX_ENTRY(xsk_need_wakeup_set), + ENA_STAT_RX_ENTRY(zc_queue_pkt_copy), +#endif /* ENA_AF_XDP_SUPPORT */ +}; + +static const struct ena_stats ena_stats_ena_com_admin_strings[] = { + ENA_STAT_ENA_COM_ADMIN_ENTRY(aborted_cmd), + ENA_STAT_ENA_COM_ADMIN_ENTRY(submitted_cmd), + ENA_STAT_ENA_COM_ADMIN_ENTRY(completed_cmd), + ENA_STAT_ENA_COM_ADMIN_ENTRY(out_of_space), + ENA_STAT_ENA_COM_ADMIN_ENTRY(no_completion), +}; + +static const struct ena_stats ena_stats_ena_com_phc_strings[] = { + ENA_STAT_ENA_COM_PHC_ENTRY(phc_cnt), + ENA_STAT_ENA_COM_PHC_ENTRY(phc_exp), + ENA_STAT_ENA_COM_PHC_ENTRY(phc_skp), + ENA_STAT_ENA_COM_PHC_ENTRY(phc_err), +}; + +#define ENA_STATS_ARRAY_GLOBAL ARRAY_SIZE(ena_stats_global_strings) +#define ENA_STATS_ARRAY_TX ARRAY_SIZE(ena_stats_tx_strings) +#define ENA_STATS_ARRAY_RX ARRAY_SIZE(ena_stats_rx_strings) +#define ENA_STATS_ARRAY_ENA_COM_ADMIN ARRAY_SIZE(ena_stats_ena_com_admin_strings) +#define ENA_STATS_ARRAY_ENA_COM_PHC ARRAY_SIZE(ena_stats_ena_com_phc_strings) +#define ENA_STATS_ARRAY_ENI ARRAY_SIZE(ena_stats_eni_strings) +#define ENA_STATS_ARRAY_ENA_SRD ARRAY_SIZE(ena_srd_info_strings) + +static const char ena_priv_flags_strings[][ETH_GSTRING_LEN] = { +#define ENA_PRIV_FLAGS_LPC BIT(0) + "local_page_cache", +}; + +#define ENA_PRIV_FLAGS_NR ARRAY_SIZE(ena_priv_flags_strings) + +static void ena_safe_update_stat(u64 *src, u64 *dst, + struct u64_stats_sync *syncp) +{ + unsigned int start; + + do { + start = u64_stats_fetch_begin_irq(syncp); + *(dst) = *src; + } while (u64_stats_fetch_retry_irq(syncp, start)); +} + +static void ena_queue_stats(struct ena_adapter *adapter, u64 **data) +{ + const struct ena_stats *ena_stats; + struct ena_ring *ring; + + u64 *ptr; + int i, j; + + for (i = 0; i < adapter->num_io_queues + adapter->xdp_num_queues; i++) { + /* Tx stats */ + ring = &adapter->tx_ring[i]; + + for (j = 0; j < ENA_STATS_ARRAY_TX; j++) { + ena_stats = &ena_stats_tx_strings[j]; + + ptr = (u64 *)&ring->tx_stats + ena_stats->stat_offset; + + ena_safe_update_stat(ptr, (*data)++, &ring->syncp); + } + /* XDP TX queues don't have a RX queue counterpart */ + if (!ENA_IS_XDP_INDEX(adapter, i)) { + /* Rx stats */ + ring = &adapter->rx_ring[i]; + + for (j = 0; j < ENA_STATS_ARRAY_RX; j++) { + ena_stats = &ena_stats_rx_strings[j]; + + ptr = (u64 *)&ring->rx_stats + + ena_stats->stat_offset; + + ena_safe_update_stat(ptr, (*data)++, &ring->syncp); + } + } + } +} + +static void ena_com_admin_queue_stats(struct ena_adapter *adapter, u64 **data) +{ + const struct ena_stats *ena_stats; + u64 *ptr; + int i; + + for (i = 0; i < ENA_STATS_ARRAY_ENA_COM_ADMIN; i++) { + ena_stats = &ena_stats_ena_com_admin_strings[i]; + + ptr = (u64 *)&adapter->ena_dev->admin_queue.stats + + ena_stats->stat_offset; + + *(*data)++ = *ptr; + } +} + +static void ena_com_phc_stats(struct ena_adapter *adapter, u64 **data) +{ + const struct ena_stats *ena_stats; + u64 *ptr; + int i; + + for (i = 0; i < ENA_STATS_ARRAY_ENA_COM_PHC; i++) { + ena_stats = &ena_stats_ena_com_phc_strings[i]; + ptr = (u64 *)&adapter->ena_dev->phc.stats + ena_stats->stat_offset; + *(*data)++ = *ptr; + } +} 
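+
+/* Illustrative sketch (not part of the driver): because every stat_offset
+ * above is pre-divided by sizeof(u64), it can be used directly as an index
+ * into the owning stats struct viewed as an array of u64. Assuming a ring
+ * variable as used elsewhere in this file:
+ *
+ *   const struct ena_stats *s = &ena_stats_tx_strings[0];
+ *   u64 *base = (u64 *)&ring->tx_stats;
+ *   u64 value = base[s->stat_offset];
+ *
+ * This is equivalent to the (u64 *)&... + stat_offset pointer arithmetic
+ * used by the helpers above and below.
+ */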
+
+static void ena_get_stats(struct ena_adapter *adapter,
+ u64 *data,
+ bool hw_stats_needed)
+{
+ const struct ena_stats *ena_stats;
+ u64 *ptr;
+ int i;
+
+ for (i = 0; i < ENA_STATS_ARRAY_GLOBAL; i++) {
+ ena_stats = &ena_stats_global_strings[i];
+
+ ptr = (u64 *)&adapter->dev_stats + ena_stats->stat_offset;
+
+ ena_safe_update_stat(ptr, data++, &adapter->syncp);
+ }
+
+ if (hw_stats_needed) {
+ if (ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENI_STATS)) {
+ ena_com_get_eni_stats(adapter->ena_dev, &adapter->eni_stats);
+ /* Updating regardless of rc - once we told ethtool how many stats we have
+ * it will print that many stats. We can't leave holes in the stats
+ */
+ for (i = 0; i < ENA_STATS_ARRAY_ENI; i++) {
+ ena_stats = &ena_stats_eni_strings[i];
+
+ ptr = (u64 *)&adapter->eni_stats +
+ ena_stats->stat_offset;
+
+ ena_safe_update_stat(ptr, data++, &adapter->syncp);
+ }
+ }
+
+ if (ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENA_SRD_INFO)) {
+ ena_com_get_ena_srd_info(adapter->ena_dev, &adapter->ena_srd_info);
+ /* Get ENA SRD mode */
+ ptr = (u64 *)&adapter->ena_srd_info;
+ ena_safe_update_stat(ptr, data++, &adapter->syncp);
+ for (i = 1; i < ENA_STATS_ARRAY_ENA_SRD; i++) {
+ ena_stats = &ena_srd_info_strings[i];
+ /* Wrapped within an outer struct - need to accommodate an
+ * additional offset of the ENA SRD mode that was already processed
+ */
+ ptr = (u64 *)&adapter->ena_srd_info +
+ ena_stats->stat_offset + 1;
+
+ ena_safe_update_stat(ptr, data++, &adapter->syncp);
+ }
+ }
+ }
+
+ ena_queue_stats(adapter, &data);
+ ena_com_admin_queue_stats(adapter, &data);
+
+ if (ena_phc_enabled(adapter)) {
+ ena_com_phc_stats(adapter, &data);
+ }
+}
+
+static void ena_get_ethtool_stats(struct net_device *netdev,
+ struct ethtool_stats *stats,
+ u64 *data)
+{
+ struct ena_adapter *adapter = netdev_priv(netdev);
+
+ ena_get_stats(adapter, data, true);
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0)
+static int ena_get_ts_info(struct net_device *netdev, struct ethtool_ts_info *info)
+{
+ struct ena_adapter *adapter = netdev_priv(netdev);
+
+ info->so_timestamping = SOF_TIMESTAMPING_TX_SOFTWARE |
+ SOF_TIMESTAMPING_RX_SOFTWARE |
+ SOF_TIMESTAMPING_SOFTWARE;
+
+ info->phc_index = ena_phc_get_index(adapter);
+
+ return 0;
+}
+
+#endif
+static int ena_get_sw_stats_count(struct ena_adapter *adapter)
+{
+ int count = adapter->num_io_queues * (ENA_STATS_ARRAY_TX + ENA_STATS_ARRAY_RX) +
+ adapter->xdp_num_queues * ENA_STATS_ARRAY_TX +
+ ENA_STATS_ARRAY_GLOBAL + ENA_STATS_ARRAY_ENA_COM_ADMIN;
+
+ if (ena_phc_enabled(adapter))
+ count += ENA_STATS_ARRAY_ENA_COM_PHC;
+
+ return count;
+}
+
+static int ena_get_hw_stats_count(struct ena_adapter *adapter)
+{
+ return ENA_STATS_ARRAY_ENI * ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENI_STATS) +
+ ENA_STATS_ARRAY_ENA_SRD * ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENA_SRD_INFO);
+}
+
+int ena_get_sset_count(struct net_device *netdev, int sset)
+{
+ struct ena_adapter *adapter = netdev_priv(netdev);
+
+ switch (sset) {
+ case ETH_SS_STATS:
+ return ena_get_sw_stats_count(adapter) +
+ ena_get_hw_stats_count(adapter);
+ case ETH_SS_PRIV_FLAGS:
+ return ENA_PRIV_FLAGS_NR;
+ }
+
+ return -EOPNOTSUPP;
+}
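+
+/* Note on the ethtool stats contract (informational): the count returned by
+ * ena_get_sset_count() for ETH_SS_STATS must match both the number of names
+ * emitted by the string helpers below and the number of u64 values written
+ * by ena_get_stats() above, in the same order. Userspace pairs them
+ * positionally, e.g. (interface name hypothetical):
+ *
+ *   ethtool -S eth0
+ *
+ * prints the i-th string next to the i-th value.
+ */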
+
+static void ena_queue_strings(struct ena_adapter *adapter, u8 **data)
+{
+ const struct ena_stats *ena_stats;
+ bool is_xdp;
+ int i, j;
+
+ for (i = 0; i < adapter->num_io_queues + adapter->xdp_num_queues; i++) {
+ is_xdp = ENA_IS_XDP_INDEX(adapter, i);
+ /* Tx stats */
+ for (j = 0; j < ENA_STATS_ARRAY_TX; j++) {
+ ena_stats = &ena_stats_tx_strings[j];
+
+ ethtool_sprintf(data,
+ "queue_%u_%s_%s", i,
+ is_xdp ? "xdp_tx" : "tx",
+ ena_stats->name);
+ }
+
+ /* In XDP there isn't an RX queue counterpart */
+ if (is_xdp)
+ continue;
+
+ for (j = 0; j < ENA_STATS_ARRAY_RX; j++) {
+ ena_stats = &ena_stats_rx_strings[j];
+
+ ethtool_sprintf(data,
+ "queue_%u_rx_%s", i,
+ ena_stats->name);
+ }
+ }
+}
+
+static void ena_com_admin_strings(u8 **data)
+{
+ const struct ena_stats *ena_stats;
+ int i;
+
+ for (i = 0; i < ENA_STATS_ARRAY_ENA_COM_ADMIN; i++) {
+ ena_stats = &ena_stats_ena_com_admin_strings[i];
+
+ ethtool_sprintf(data,
+ "ena_admin_q_%s", ena_stats->name);
+ }
+}
+
+static void ena_com_phc_strings(u8 **data)
+{
+ const struct ena_stats *ena_stats;
+ int i;
+
+ for (i = 0; i < ENA_STATS_ARRAY_ENA_COM_PHC; i++) {
+ ena_stats = &ena_stats_ena_com_phc_strings[i];
+ ethtool_sprintf(data, "%s", ena_stats->name);
+ }
+}
+
+static void ena_get_strings(struct ena_adapter *adapter,
+ u8 *data,
+ bool hw_stats_needed)
+{
+ const struct ena_stats *ena_stats;
+ int i;
+
+ for (i = 0; i < ENA_STATS_ARRAY_GLOBAL; i++) {
+ ena_stats = &ena_stats_global_strings[i];
+ ethtool_sprintf(&data, ena_stats->name);
+ }
+
+ if (hw_stats_needed) {
+ if (ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENI_STATS)) {
+ for (i = 0; i < ENA_STATS_ARRAY_ENI; i++) {
+ ena_stats = &ena_stats_eni_strings[i];
+ ethtool_sprintf(&data, ena_stats->name);
+ }
+ }
+ if (ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENA_SRD_INFO)) {
+ for (i = 0; i < ENA_STATS_ARRAY_ENA_SRD; i++) {
+ ena_stats = &ena_srd_info_strings[i];
+ ethtool_sprintf(&data, ena_stats->name);
+ }
+ }
+ }
+
+ ena_queue_strings(adapter, &data);
+ ena_com_admin_strings(&data);
+
+ if (ena_phc_enabled(adapter)) {
+ ena_com_phc_strings(&data);
+ }
+}
+
+static void ena_get_ethtool_strings(struct net_device *netdev,
+ u32 sset,
+ u8 *data)
+{
+ struct ena_adapter *adapter = netdev_priv(netdev);
+
+ switch (sset) {
+ case ETH_SS_STATS:
+ ena_get_strings(adapter, data, true);
+ break;
+ case ETH_SS_PRIV_FLAGS:
+ memcpy(data, ena_priv_flags_strings, sizeof(ena_priv_flags_strings));
+ break;
+ }
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0)
+static int ena_get_link_ksettings(struct net_device *netdev,
+ struct ethtool_link_ksettings *link_ksettings)
+{
+ struct ena_adapter *adapter = netdev_priv(netdev);
+ struct ena_com_dev *ena_dev = adapter->ena_dev;
+ struct ena_admin_get_feature_link_desc *link;
+ struct ena_admin_get_feat_resp feat_resp;
+ int rc;
+
+ rc = ena_com_get_link_params(ena_dev, &feat_resp);
+ if (rc)
+ return rc;
+
+ link = &feat_resp.u.link;
+ link_ksettings->base.speed = link->speed;
+
+ if (link->flags & ENA_ADMIN_GET_FEATURE_LINK_DESC_AUTONEG_MASK) {
+ ethtool_link_ksettings_add_link_mode(link_ksettings,
+ supported, Autoneg);
+ ethtool_link_ksettings_add_link_mode(link_ksettings,
+ advertising, Autoneg);
+ }
+
+ link_ksettings->base.autoneg =
+ (link->flags & ENA_ADMIN_GET_FEATURE_LINK_DESC_AUTONEG_MASK) ?
+ AUTONEG_ENABLE : AUTONEG_DISABLE; + + link_ksettings->base.duplex = DUPLEX_FULL; + + return 0; +} + +#else +static int ena_get_settings(struct net_device *netdev, + struct ethtool_cmd *ecmd) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + struct ena_com_dev *ena_dev = adapter->ena_dev; + struct ena_admin_get_feature_link_desc *link; + struct ena_admin_get_feat_resp feat_resp; + int rc; + + rc = ena_com_get_link_params(ena_dev, &feat_resp); + if (rc) + return rc; + + link = &feat_resp.u.link; + + ethtool_cmd_speed_set(ecmd, link->speed); + + if (link->flags & ENA_ADMIN_GET_FEATURE_LINK_DESC_DUPLEX_MASK) + ecmd->duplex = DUPLEX_FULL; + else + ecmd->duplex = DUPLEX_HALF; + + if (link->flags & ENA_ADMIN_GET_FEATURE_LINK_DESC_AUTONEG_MASK) + ecmd->autoneg = AUTONEG_ENABLE; + else + ecmd->autoneg = AUTONEG_DISABLE; + + return 0; +} + +#endif +static int ena_get_coalesce(struct net_device *net_dev, +#ifdef ENA_EXTENDED_COALESCE_UAPI_WITH_CQE_SUPPORTED + struct ethtool_coalesce *coalesce, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) +#else + struct ethtool_coalesce *coalesce) +#endif +{ + struct ena_adapter *adapter = netdev_priv(net_dev); + struct ena_com_dev *ena_dev = adapter->ena_dev; + + if (!ena_com_interrupt_moderation_supported(ena_dev)) + return -EOPNOTSUPP; + + coalesce->tx_coalesce_usecs = + ena_com_get_nonadaptive_moderation_interval_tx(ena_dev) * + ena_dev->intr_delay_resolution; + + coalesce->rx_coalesce_usecs = + ena_com_get_nonadaptive_moderation_interval_rx(ena_dev) + * ena_dev->intr_delay_resolution; + + coalesce->use_adaptive_rx_coalesce = + ena_com_get_adaptive_moderation_enabled(ena_dev); + + return 0; +} + +static void ena_update_tx_rings_nonadaptive_intr_moderation(struct ena_adapter *adapter) +{ + unsigned int val; + int i; + + val = ena_com_get_nonadaptive_moderation_interval_tx(adapter->ena_dev); + + for (i = 0; i < adapter->num_io_queues; i++) + adapter->tx_ring[i].smoothed_interval = val; +} + +static void ena_update_rx_rings_nonadaptive_intr_moderation(struct ena_adapter *adapter) +{ + unsigned int val; + int i; + + val = ena_com_get_nonadaptive_moderation_interval_rx(adapter->ena_dev); + + for (i = 0; i < adapter->num_io_queues; i++) + adapter->rx_ring[i].smoothed_interval = val; +} + +static int ena_set_coalesce(struct net_device *net_dev, +#ifdef ENA_EXTENDED_COALESCE_UAPI_WITH_CQE_SUPPORTED + struct ethtool_coalesce *coalesce, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) +#else + struct ethtool_coalesce *coalesce) +#endif +{ + struct ena_adapter *adapter = netdev_priv(net_dev); + struct ena_com_dev *ena_dev = adapter->ena_dev; + int rc; + + if (!ena_com_interrupt_moderation_supported(ena_dev)) + return -EOPNOTSUPP; + + rc = ena_com_update_nonadaptive_moderation_interval_tx(ena_dev, + coalesce->tx_coalesce_usecs); + if (rc) + return rc; + + ena_update_tx_rings_nonadaptive_intr_moderation(adapter); + + rc = ena_com_update_nonadaptive_moderation_interval_rx(ena_dev, + coalesce->rx_coalesce_usecs); + if (rc) + return rc; + + ena_update_rx_rings_nonadaptive_intr_moderation(adapter); + + if (coalesce->use_adaptive_rx_coalesce && + !ena_com_get_adaptive_moderation_enabled(ena_dev)) + ena_com_enable_adaptive_moderation(ena_dev); + + if (!coalesce->use_adaptive_rx_coalesce && + ena_com_get_adaptive_moderation_enabled(ena_dev)) + ena_com_disable_adaptive_moderation(ena_dev); + + return 0; +} + +static u32 ena_get_msglevel(struct net_device *netdev) +{ + struct ena_adapter *adapter = 
netdev_priv(netdev); + + return adapter->msg_enable; +} + +static void ena_set_msglevel(struct net_device *netdev, u32 value) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + adapter->msg_enable = value; +} + +static void ena_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *info) +{ + struct ena_adapter *adapter = netdev_priv(dev); + + strlcpy(info->driver, DRV_MODULE_NAME, sizeof(info->driver)); + strlcpy(info->version, DRV_MODULE_GENERATION, sizeof(info->version)); + strlcpy(info->bus_info, pci_name(adapter->pdev), + sizeof(info->bus_info)); + + info->n_priv_flags = ENA_PRIV_FLAGS_NR; +} + +static void ena_get_ringparam(struct net_device *netdev, +#ifdef ENA_ETHTOOL_RX_BUFF_SIZE_CHANGE + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) +#else + struct ethtool_ringparam *ring) +#endif +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + ring->tx_max_pending = adapter->max_tx_ring_size; + ring->rx_max_pending = adapter->max_rx_ring_size; + ring->tx_pending = adapter->tx_ring[0].ring_size; + ring->rx_pending = adapter->rx_ring[0].ring_size; +} + +static int ena_set_ringparam(struct net_device *netdev, +#ifdef ENA_ETHTOOL_RX_BUFF_SIZE_CHANGE + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) +#else + struct ethtool_ringparam *ring) +#endif +{ + struct ena_adapter *adapter = netdev_priv(netdev); + u32 new_tx_size, new_rx_size; + + if (ring->rx_mini_pending || ring->rx_jumbo_pending) + return -EINVAL; + + new_tx_size = clamp_val(ring->tx_pending, ENA_MIN_RING_SIZE, + adapter->max_tx_ring_size); + new_tx_size = rounddown_pow_of_two(new_tx_size); + + new_rx_size = clamp_val(ring->rx_pending, ENA_MIN_RING_SIZE, + adapter->max_rx_ring_size); + new_rx_size = rounddown_pow_of_two(new_rx_size); + + if (new_tx_size == adapter->requested_tx_ring_size && + new_rx_size == adapter->requested_rx_ring_size) + return 0; + + return ena_update_queue_sizes(adapter, new_tx_size, new_rx_size); +} + +#ifdef ETHTOOL_GRXRINGS +static u32 ena_flow_hash_to_flow_type(u16 hash_fields) +{ + u32 data = 0; + + if (hash_fields & ENA_ADMIN_RSS_L2_DA) + data |= RXH_L2DA; + + if (hash_fields & ENA_ADMIN_RSS_L3_DA) + data |= RXH_IP_DST; + + if (hash_fields & ENA_ADMIN_RSS_L3_SA) + data |= RXH_IP_SRC; + + if (hash_fields & ENA_ADMIN_RSS_L4_DP) + data |= RXH_L4_B_2_3; + + if (hash_fields & ENA_ADMIN_RSS_L4_SP) + data |= RXH_L4_B_0_1; + + return data; +} + +static u16 ena_flow_data_to_flow_hash(u32 hash_fields) +{ + u16 data = 0; + + if (hash_fields & RXH_L2DA) + data |= ENA_ADMIN_RSS_L2_DA; + + if (hash_fields & RXH_IP_DST) + data |= ENA_ADMIN_RSS_L3_DA; + + if (hash_fields & RXH_IP_SRC) + data |= ENA_ADMIN_RSS_L3_SA; + + if (hash_fields & RXH_L4_B_2_3) + data |= ENA_ADMIN_RSS_L4_DP; + + if (hash_fields & RXH_L4_B_0_1) + data |= ENA_ADMIN_RSS_L4_SP; + + return data; +} + +static int ena_get_rss_hash(struct ena_com_dev *ena_dev, + struct ethtool_rxnfc *cmd) +{ + enum ena_admin_flow_hash_proto proto; + u16 hash_fields; + int rc; + + cmd->data = 0; + + switch (cmd->flow_type) { + case TCP_V4_FLOW: + proto = ENA_ADMIN_RSS_TCP4; + break; + case UDP_V4_FLOW: + proto = ENA_ADMIN_RSS_UDP4; + break; + case TCP_V6_FLOW: + proto = ENA_ADMIN_RSS_TCP6; + break; + case UDP_V6_FLOW: + proto = ENA_ADMIN_RSS_UDP6; + break; + case IPV4_FLOW: + proto = ENA_ADMIN_RSS_IP4; + break; + case IPV6_FLOW: + proto = ENA_ADMIN_RSS_IP6; + break; + case ETHER_FLOW: + proto = ENA_ADMIN_RSS_NOT_IP; + 
break; + case AH_V4_FLOW: + case ESP_V4_FLOW: + case AH_V6_FLOW: + case ESP_V6_FLOW: + case SCTP_V4_FLOW: + case AH_ESP_V4_FLOW: + return -EOPNOTSUPP; + default: + return -EINVAL; + } + + rc = ena_com_get_hash_ctrl(ena_dev, proto, &hash_fields); + if (rc) + return rc; + + cmd->data = ena_flow_hash_to_flow_type(hash_fields); + + return 0; +} + +static int ena_set_rss_hash(struct ena_com_dev *ena_dev, + struct ethtool_rxnfc *cmd) +{ + enum ena_admin_flow_hash_proto proto; + u16 hash_fields; + + switch (cmd->flow_type) { + case TCP_V4_FLOW: + proto = ENA_ADMIN_RSS_TCP4; + break; + case UDP_V4_FLOW: + proto = ENA_ADMIN_RSS_UDP4; + break; + case TCP_V6_FLOW: + proto = ENA_ADMIN_RSS_TCP6; + break; + case UDP_V6_FLOW: + proto = ENA_ADMIN_RSS_UDP6; + break; + case IPV4_FLOW: + proto = ENA_ADMIN_RSS_IP4; + break; + case IPV6_FLOW: + proto = ENA_ADMIN_RSS_IP6; + break; + case ETHER_FLOW: + proto = ENA_ADMIN_RSS_NOT_IP; + break; + case AH_V4_FLOW: + case ESP_V4_FLOW: + case AH_V6_FLOW: + case ESP_V6_FLOW: + case SCTP_V4_FLOW: + case AH_ESP_V4_FLOW: + return -EOPNOTSUPP; + default: + return -EINVAL; + } + + hash_fields = ena_flow_data_to_flow_hash(cmd->data); + + return ena_com_fill_hash_ctrl(ena_dev, proto, hash_fields); +} + +static int ena_set_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *info) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + int rc = 0; + + switch (info->cmd) { + case ETHTOOL_SRXFH: + rc = ena_set_rss_hash(adapter->ena_dev, info); + break; + case ETHTOOL_SRXCLSRLDEL: + case ETHTOOL_SRXCLSRLINS: + default: + netif_err(adapter, drv, netdev, + "Command parameter %d is not supported\n", info->cmd); + rc = -EOPNOTSUPP; + } + + return rc; +} + +#if LINUX_VERSION_CODE <= KERNEL_VERSION(3, 2, 0) +static int ena_get_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *info, + void *rules) +#else +static int ena_get_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *info, + u32 *rules) +#endif +{ + struct ena_adapter *adapter = netdev_priv(netdev); + int rc = 0; + + switch (info->cmd) { + case ETHTOOL_GRXRINGS: + info->data = adapter->num_io_queues; + rc = 0; + break; + case ETHTOOL_GRXFH: + rc = ena_get_rss_hash(adapter->ena_dev, info); + break; + case ETHTOOL_GRXCLSRLCNT: + case ETHTOOL_GRXCLSRULE: + case ETHTOOL_GRXCLSRLALL: + default: + netif_err(adapter, drv, netdev, + "Command parameter %d is not supported\n", info->cmd); + rc = -EOPNOTSUPP; + } + + return rc; +} +#endif /* ETHTOOL_GRXRINGS */ + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) +static u32 ena_get_rxfh_indir_size(struct net_device *netdev) +{ + return ENA_RX_RSS_TABLE_SIZE; +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) +static u32 ena_get_rxfh_key_size(struct net_device *netdev) +{ + return ENA_HASH_KEY_SIZE; +} +#endif + +static int ena_indirection_table_set(struct ena_adapter *adapter, + const u32 *indir) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + int i, rc; + + for (i = 0; i < ENA_RX_RSS_TABLE_SIZE; i++) { + rc = ena_com_indirect_table_fill_entry(ena_dev, + i, + ENA_IO_RXQ_IDX(indir[i])); + if (unlikely(rc)) { + netif_err(adapter, drv, adapter->netdev, + "Cannot fill indirect table (index is too large)\n"); + return rc; + } + } + + rc = ena_com_indirect_table_set(ena_dev); + if (rc) { + netif_err(adapter, drv, adapter->netdev, + "Cannot set indirect table\n"); + return rc == -EPERM ? 
-EOPNOTSUPP : rc;
+ }
+ return rc;
+}
+
+static int ena_indirection_table_get(struct ena_adapter *adapter, u32 *indir)
+{
+ struct ena_com_dev *ena_dev = adapter->ena_dev;
+ int i, rc;
+
+ if (!indir)
+ return 0;
+
+ rc = ena_com_indirect_table_get(ena_dev, indir);
+ if (rc)
+ return rc;
+
+ /* Our internal representation of the indices is: even indices
+ * for Tx and odd indices for Rx. We need to convert the Rx
+ * indices to be consecutive
+ */
+ for (i = 0; i < ENA_RX_RSS_TABLE_SIZE; i++)
+ indir[i] = ENA_IO_RXQ_IDX_TO_COMBINED_IDX(indir[i]);
+
+ return rc;
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0)
+static int ena_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
+ u8 *hfunc)
+{
+ struct ena_adapter *adapter = netdev_priv(netdev);
+ enum ena_admin_hash_functions ena_func;
+ u8 func;
+ int rc;
+
+ rc = ena_indirection_table_get(adapter, indir);
+ if (rc)
+ return rc;
+
+ /* We call this function in order to check if the device
+ * supports getting/setting the hash function.
+ */
+ rc = ena_com_get_hash_function(adapter->ena_dev, &ena_func);
+ if (rc) {
+ if (rc == -EOPNOTSUPP)
+ rc = 0;
+
+ return rc;
+ }
+
+ rc = ena_com_get_hash_key(adapter->ena_dev, key);
+ if (rc)
+ return rc;
+
+ switch (ena_func) {
+ case ENA_ADMIN_TOEPLITZ:
+ func = ETH_RSS_HASH_TOP;
+ break;
+ case ENA_ADMIN_CRC32:
+ func = ETH_RSS_HASH_CRC32;
+ break;
+ default:
+ netif_err(adapter, drv, netdev,
+ "Command parameter is not supported\n");
+ return -EOPNOTSUPP;
+ }
+
+ if (hfunc)
+ *hfunc = func;
+
+ return 0;
+}
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0)
+static int ena_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key)
+{
+ struct ena_adapter *adapter = netdev_priv(netdev);
+ enum ena_admin_hash_functions ena_func;
+ int rc;
+
+ rc = ena_indirection_table_get(adapter, indir);
+ if (rc)
+ return rc;
+
+ /* We call this function in order to check if the device
+ * supports getting/setting the hash function.
+ */ + rc = ena_com_get_hash_function(adapter->ena_dev, &ena_func); + if (rc) { + if (rc == -EOPNOTSUPP) + rc = 0; + + return rc; + } + + rc = ena_com_get_hash_key(adapter->ena_dev, key); + if (rc) + return rc; + + return rc; +} +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0)/* >= 3.16.0 */ +static int ena_get_rxfh(struct net_device *netdev, u32 *indir) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + return ena_indirection_table_get(adapter, indir); +} +#endif /* >= 3.8.0 */ + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) +static int ena_set_rxfh(struct net_device *netdev, const u32 *indir, + const u8 *key, const u8 hfunc) +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) +static int ena_set_rxfh(struct net_device *netdev, const u32 *indir, + const u8 *key) +#endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + struct ena_com_dev *ena_dev = adapter->ena_dev; + enum ena_admin_hash_functions func = 0; + int rc; + + if (indir) { + rc = ena_indirection_table_set(adapter, indir); + if (rc) + return rc; + } + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) + switch (hfunc) { + case ETH_RSS_HASH_NO_CHANGE: + func = ena_com_get_current_hash_function(ena_dev); + break; + case ETH_RSS_HASH_TOP: + func = ENA_ADMIN_TOEPLITZ; + break; + case ETH_RSS_HASH_CRC32: + func = ENA_ADMIN_CRC32; + break; + default: + netif_err(adapter, drv, netdev, "Unsupported hfunc %d\n", + hfunc); + return -EOPNOTSUPP; + } +#else /* Kernel 3.19 */ + func = ENA_ADMIN_TOEPLITZ; +#endif + + if (key || func) { + rc = ena_com_fill_hash_function(ena_dev, func, key, + ENA_HASH_KEY_SIZE, + 0xFFFFFFFF); + if (unlikely(rc)) { + netif_err(adapter, drv, netdev, "Cannot fill key\n"); + return rc == -EPERM ? 
-EOPNOTSUPP : rc; + } + } + + return 0; +} +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) /* Kernel > 3.16 */ +static int ena_set_rxfh(struct net_device *netdev, const u32 *indir) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + int rc = 0; + + if (indir) + rc = ena_indirection_table_set(adapter, indir); + + return rc; +} +#endif /* Kernel >= 3.8 */ +#endif /* ETHTOOL_GRXFH */ +#ifndef HAVE_RHEL6_ETHTOOL_OPS_EXT_STRUCT + +#ifdef ETHTOOL_SCHANNELS +static void ena_get_channels(struct net_device *netdev, + struct ethtool_channels *channels) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + channels->max_combined = adapter->max_num_io_queues; + channels->combined_count = adapter->num_io_queues; +} + +static int ena_set_channels(struct net_device *netdev, + struct ethtool_channels *channels) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + u32 count = channels->combined_count; + /* The check for max value is already done in ethtool */ +#ifdef ENA_XDP_SUPPORT + if (count < ENA_MIN_NUM_IO_QUEUES || + (ena_xdp_present(adapter) && + !ena_xdp_legal_queue_count(adapter, count))) +#else + if (count < ENA_MIN_NUM_IO_QUEUES) +#endif /* ENA_XDP_SUPPORT */ + return -EINVAL; + + if (count > adapter->max_num_io_queues) + return -EINVAL; + if (count != adapter->num_io_queues && ena_is_zc_q_exist(adapter)) { + netdev_err(adapter->netdev, + "Changing channel count not supported with xsk pool loaded\n"); + return -EOPNOTSUPP; + } + + return ena_update_queue_count(adapter, count); +} +#endif /* ETHTOOL_SCHANNELS */ + +#endif /* HAVE_RHEL6_ETHTOOL_OPS_EXT_STRUCT */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) +static int ena_get_tunable(struct net_device *netdev, + const struct ethtool_tunable *tuna, void *data) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + int ret = 0; + + switch (tuna->id) { + case ETHTOOL_RX_COPYBREAK: + *(u32 *)data = adapter->rx_copybreak; + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int ena_set_tunable(struct net_device *netdev, + const struct ethtool_tunable *tuna, + const void *data) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + int ret = 0; + u32 len; + + switch (tuna->id) { + case ETHTOOL_RX_COPYBREAK: + len = *(u32 *)data; + ret = ena_set_rx_copybreak(adapter, len); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} +#endif /* 3.18.0 */ + +static u32 ena_get_priv_flags(struct net_device *netdev) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + u32 priv_flags = 0; + + if (adapter->rx_ring->page_cache) + priv_flags |= ENA_PRIV_FLAGS_LPC; + + return priv_flags; +} + +static int ena_set_priv_flags(struct net_device *netdev, u32 priv_flags) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + /* LPC is the only supported private flag for now */ + return ena_set_lpc_state(adapter, !!(priv_flags & ENA_PRIV_FLAGS_LPC)); +} + +static const struct ethtool_ops ena_ethtool_ops = { +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 7, 0) + .supported_coalesce_params = ETHTOOL_COALESCE_USECS | + ETHTOOL_COALESCE_USE_ADAPTIVE_RX, +#endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0) + .get_link_ksettings = ena_get_link_ksettings, +#else + .get_settings = ena_get_settings, +#endif + .get_drvinfo = ena_get_drvinfo, + .get_msglevel = ena_get_msglevel, + .set_msglevel = ena_set_msglevel, + .get_link = ethtool_op_get_link, + .get_coalesce = ena_get_coalesce, + .set_coalesce = ena_set_coalesce, + .get_ringparam = ena_get_ringparam, + .set_ringparam = ena_set_ringparam, + 
.get_sset_count = ena_get_sset_count, + .get_strings = ena_get_ethtool_strings, + .get_ethtool_stats = ena_get_ethtool_stats, +#ifdef ETHTOOL_GRXRINGS + .get_rxnfc = ena_get_rxnfc, + .set_rxnfc = ena_set_rxnfc, +#endif /* ETHTOOL_GRXRINGS */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) + .get_rxfh_indir_size = ena_get_rxfh_indir_size, +#endif /* >= 3.8.0 */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) + .get_rxfh_key_size = ena_get_rxfh_key_size, + .get_rxfh = ena_get_rxfh, + .set_rxfh = ena_set_rxfh, +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) + .get_rxfh_indir = ena_get_rxfh, + .set_rxfh_indir = ena_set_rxfh, +#endif /* >= 3.8.0 */ +#ifndef HAVE_RHEL6_ETHTOOL_OPS_EXT_STRUCT +#ifdef ETHTOOL_SCHANNELS + .get_channels = ena_get_channels, + .set_channels = ena_set_channels, +#endif /* ETHTOOL_SCHANNELS */ +#endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) + .get_tunable = ena_get_tunable, + .set_tunable = ena_set_tunable, +#endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0) + .get_ts_info = ena_get_ts_info, +#endif + .get_priv_flags = ena_get_priv_flags, + .set_priv_flags = ena_set_priv_flags, +}; + +void ena_set_ethtool_ops(struct net_device *netdev) +{ + netdev->ethtool_ops = &ena_ethtool_ops; +} + +static void ena_dump_stats_ex(struct ena_adapter *adapter, u8 *buf) +{ + struct net_device *netdev = adapter->netdev; + u8 *strings_buf; + u64 *data_buf; + int strings_num; + int i, rc; + + strings_num = ena_get_sw_stats_count(adapter); + if (strings_num <= 0) { + netif_err(adapter, drv, netdev, "Can't get stats num\n"); + return; + } + + strings_buf = devm_kcalloc(&adapter->pdev->dev, + ETH_GSTRING_LEN, strings_num, + GFP_ATOMIC); + if (!strings_buf) { + netif_err(adapter, drv, netdev, + "Failed to allocate strings_buf\n"); + return; + } + + data_buf = devm_kcalloc(&adapter->pdev->dev, + strings_num, sizeof(u64), + GFP_ATOMIC); + if (!data_buf) { + netif_err(adapter, drv, netdev, + "Failed to allocate data buf\n"); + devm_kfree(&adapter->pdev->dev, strings_buf); + return; + } + + ena_get_strings(adapter, strings_buf, false); + ena_get_stats(adapter, data_buf, false); + + /* If there is a buffer, dump stats, otherwise print them to dmesg */ + if (buf) + for (i = 0; i < strings_num; i++) { + rc = snprintf(buf, ETH_GSTRING_LEN + sizeof(u64), + "%s %llu\n", + strings_buf + i * ETH_GSTRING_LEN, + data_buf[i]); + buf += rc; + } + else + for (i = 0; i < strings_num; i++) + netif_err(adapter, drv, netdev, "%s: %llu\n", + strings_buf + i * ETH_GSTRING_LEN, + data_buf[i]); + + devm_kfree(&adapter->pdev->dev, strings_buf); + devm_kfree(&adapter->pdev->dev, data_buf); +} + +void ena_dump_stats_to_buf(struct ena_adapter *adapter, u8 *buf) +{ + if (!buf) + return; + + ena_dump_stats_ex(adapter, buf); +} + +void ena_dump_stats_to_dmesg(struct ena_adapter *adapter) +{ + ena_dump_stats_ex(adapter, NULL); +} diff --git a/drivers/amazon/net/ena/ena_lpc.c b/drivers/amazon/net/ena/ena_lpc.c new file mode 100644 index 0000000000000..64c3d2d24f398 --- /dev/null +++ b/drivers/amazon/net/ena/ena_lpc.c @@ -0,0 +1,299 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2015-2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */
+#include "ena_lpc.h"
+#include "ena_xdp.h"
+
+static void ena_free_ring_page_cache(struct ena_ring *rx_ring);
+
+static void ena_put_unmap_cache_page(struct ena_ring *rx_ring, struct ena_page *ena_page)
+{
+ dma_unmap_page(rx_ring->dev, ena_page->dma_addr, ENA_PAGE_SIZE,
+ DMA_BIDIRECTIONAL);
+
+ put_page(ena_page->page);
+}
+
+/* Removes a page from the page cache and allocates a new one in its place.
+ * If the allocation of a new page fails, the cache entry isn't changed
+ */
+static void ena_replace_cache_page(struct ena_ring *rx_ring,
+ struct ena_page *ena_page)
+{
+ struct page *new_page;
+ dma_addr_t dma;
+
+ new_page = ena_alloc_map_page(rx_ring, &dma);
+
+ if (unlikely(IS_ERR(new_page)))
+ return;
+
+ ena_put_unmap_cache_page(rx_ring, ena_page);
+
+ ena_page->page = new_page;
+ ena_page->dma_addr = dma;
+}
+
+/* Mark the cache page as used and return it. If the page belongs to a different
+ * NUMA node than the current one, free the cache page and allocate another one
+ * instead.
+ */
+static struct page *ena_return_cache_page(struct ena_ring *rx_ring,
+ struct ena_page *ena_page,
+ dma_addr_t *dma)
+{
+ /* Remove pages belonging to a different node than the one the CPU runs on */
+ if (unlikely(page_to_nid(ena_page->page) != numa_mem_id())) {
+ ena_increase_stat(&rx_ring->rx_stats.lpc_wrong_numa, 1, &rx_ring->syncp);
+ ena_replace_cache_page(rx_ring, ena_page);
+ }
+
+ /* Make sure no writes are pending for this page */
+ dma_sync_single_for_device(rx_ring->dev, ena_page->dma_addr,
+ ENA_PAGE_SIZE,
+ DMA_BIDIRECTIONAL);
+
+ /* Increase refcount to 2 so that the page is returned to the
+ * cache after being freed
+ */
+ page_ref_inc(ena_page->page);
+
+ *dma = ena_page->dma_addr;
+
+ return ena_page->page;
+}
+
+struct page *ena_lpc_get_page(struct ena_ring *rx_ring, dma_addr_t *dma,
+ bool *is_lpc_page)
+{
+ struct ena_page_cache *page_cache = rx_ring->page_cache;
+ u32 head, cache_current_size;
+ struct ena_page *ena_page;
+
+ /* A cache size of zero indicates a disabled cache */
+ if (!page_cache) {
+ *is_lpc_page = false;
+ return ena_alloc_map_page(rx_ring, dma);
+ }
+
+ *is_lpc_page = true;
+
+ cache_current_size = page_cache->current_size;
+ head = page_cache->head;
+
+ ena_page = &page_cache->cache[head];
+ /* Warm up phase. We fill the pages for the first time.
The
+ * phase is done in the napi context to improve the chances we
+ * allocate on the correct NUMA node
+ */
+ if (unlikely(cache_current_size < page_cache->max_size)) {
+ /* Check if the oldest allocated page is free */
+ if (ena_page->page && page_ref_count(ena_page->page) == 1) {
+ page_cache->head = (head + 1) % cache_current_size;
+ return ena_return_cache_page(rx_ring, ena_page, dma);
+ }
+
+ ena_page = &page_cache->cache[cache_current_size];
+
+ /* Add a new page to the cache */
+ ena_page->page = ena_alloc_map_page(rx_ring, dma);
+ if (unlikely(IS_ERR(ena_page->page)))
+ return ena_page->page;
+
+ ena_page->dma_addr = *dma;
+
+ /* Increase refcount to 2 so that the page is returned to the
+ * cache after being freed
+ */
+ page_ref_inc(ena_page->page);
+
+ page_cache->current_size++;
+
+ ena_increase_stat(&rx_ring->rx_stats.lpc_warm_up, 1, &rx_ring->syncp);
+
+ return ena_page->page;
+ }
+
+ /* The next page is still in use, so we allocate outside the cache */
+ if (unlikely(page_ref_count(ena_page->page) != 1)) {
+ ena_increase_stat(&rx_ring->rx_stats.lpc_full, 1, &rx_ring->syncp);
+ *is_lpc_page = false;
+ return ena_alloc_map_page(rx_ring, dma);
+ }
+
+ page_cache->head = (head + 1) & (page_cache->max_size - 1);
+
+ return ena_return_cache_page(rx_ring, ena_page, dma);
+}
+
+bool ena_is_lpc_supported(struct ena_adapter *adapter,
+ struct ena_ring *rx_ring,
+ bool error_print)
+{
+#ifdef ENA_NETDEV_LOGS_WITHOUT_RV
+ void (*print_log)(const struct net_device *dev, const char *format, ...);
+#else
+ int (*print_log)(const struct net_device *dev, const char *format, ...);
+#endif
+ int channels_nr = adapter->num_io_queues + adapter->xdp_num_queues;
+
+ print_log = (error_print) ? netdev_err : netdev_info;
+
+ /* LPC is disabled below the minimum number of channels */
+ if (channels_nr < ENA_LPC_MIN_NUM_OF_CHANNELS) {
+ print_log(adapter->netdev,
+ "Local page cache is disabled for less than %d channels\n",
+ ENA_LPC_MIN_NUM_OF_CHANNELS);
+
+ /* Disable LPC for such a case. It can be enabled again through
+ * the ethtool private flag.
+ */
+ adapter->used_lpc_size = 0;
+
+ return false;
+ }
+#ifdef ENA_XDP_SUPPORT
+
+ /* The driver doesn't support page caches under XDP */
+ if (ena_xdp_present_ring(rx_ring)) {
+ print_log(adapter->netdev,
+ "Local page cache is disabled when using XDP\n");
+ return false;
+ }
+#endif /* ENA_XDP_SUPPORT */
+
+ return true;
+}
+
+/* Calculate the size of the Local Page Cache. If LPC should be disabled, return
+ * a size of 0.
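+ * For example, with the default lpc_size module parameter of 2, the cache
+ * holds roundup_pow_of_two(2 * ENA_LPC_MULTIPLIER_UNIT) = 2048 pages per
+ * RX ring.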
+ */
+static u32 ena_calculate_cache_size(struct ena_adapter *adapter,
+ struct ena_ring *rx_ring)
+{
+ u32 page_cache_size = adapter->used_lpc_size;
+
+ /* An LPC cache size of 0 means a disabled cache */
+ if (page_cache_size == 0)
+ return 0;
+
+ if (!ena_is_lpc_supported(adapter, rx_ring, false))
+ return 0;
+
+ /* Clamp the LPC size to its maximum value */
+ if (page_cache_size > ENA_LPC_MAX_MULTIPLIER) {
+ netdev_info(adapter->netdev,
+ "Configured LPC size %d is too large, reducing to %d (max)\n",
+ adapter->configured_lpc_size, ENA_LPC_MAX_MULTIPLIER);
+
+ /* Override the LPC size to avoid printing this message
+ * on every up/down operation
+ */
+ adapter->configured_lpc_size = ENA_LPC_MAX_MULTIPLIER;
+ adapter->used_lpc_size = page_cache_size = ENA_LPC_MAX_MULTIPLIER;
+ }
+
+ page_cache_size = page_cache_size * ENA_LPC_MULTIPLIER_UNIT;
+ page_cache_size = roundup_pow_of_two(page_cache_size);
+
+ return page_cache_size;
+}
+
+int ena_create_page_caches(struct ena_adapter *adapter)
+{
+ struct ena_page_cache *cache;
+ u32 page_cache_size;
+ int i;
+
+ for (i = 0; i < adapter->num_io_queues; i++) {
+ struct ena_ring *rx_ring = &adapter->rx_ring[i];
+
+ page_cache_size = ena_calculate_cache_size(adapter, rx_ring);
+
+ if (!page_cache_size)
+ return 0;
+
+ cache = vzalloc(sizeof(struct ena_page_cache) +
+ sizeof(struct ena_page) * page_cache_size);
+ if (!cache)
+ goto err_cache_alloc;
+
+ cache->max_size = page_cache_size;
+ rx_ring->page_cache = cache;
+ }
+
+ return 0;
+err_cache_alloc:
+ netif_err(adapter, ifup, adapter->netdev,
+ "Failed to initialize local page caches (LPCs)\n");
+ while (--i >= 0) {
+ struct ena_ring *rx_ring = &adapter->rx_ring[i];
+
+ ena_free_ring_page_cache(rx_ring);
+ }
+
+ return -ENOMEM;
+}
+
+/* Release all pages from the page cache */
+static void ena_free_ring_cache_pages(struct ena_adapter *adapter, int qid)
+{
+ struct ena_ring *rx_ring = &adapter->rx_ring[qid];
+ struct ena_page_cache *page_cache;
+ int i;
+
+ /* Page cache is disabled */
+ if (!rx_ring->page_cache)
+ return;
+
+ page_cache = rx_ring->page_cache;
+
+ /* We check the size value to make sure we don't
+ * free pages that weren't allocated.
+ */
+ for (i = 0; i < page_cache->current_size; i++) {
+ struct ena_page *ena_page = &page_cache->cache[i];
+
+ WARN_ON(!ena_page->page);
+
+ dma_unmap_page(rx_ring->dev, ena_page->dma_addr,
+ ENA_PAGE_SIZE,
+ DMA_BIDIRECTIONAL);
+
+ /* If the page is also in the rx buffer, then this operation
+ * would only decrease its reference count
+ */
+ __free_page(ena_page->page);
+ }
+
+ page_cache->head = page_cache->current_size = 0;
+}
+
+void ena_free_all_cache_pages(struct ena_adapter *adapter)
+{
+ int i;
+
+ for (i = 0; i < adapter->num_io_queues; i++)
+ ena_free_ring_cache_pages(adapter, i);
+}
+
+static void ena_free_ring_page_cache(struct ena_ring *rx_ring)
+{
+ if (!rx_ring->page_cache)
+ return;
+
+ vfree(rx_ring->page_cache);
+ rx_ring->page_cache = NULL;
+}
+
+void ena_free_page_caches(struct ena_adapter *adapter)
+{
+ int i;
+
+ for (i = 0; i < adapter->num_io_queues; i++) {
+ struct ena_ring *rx_ring = &adapter->rx_ring[i];
+
+ ena_free_ring_page_cache(rx_ring);
+ }
+}
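+
+/* Illustrative consumer (sketch only, not driver code): an RX refill path
+ * would typically obtain pages through the cache like this, where
+ * rx_refill_one() is a hypothetical caller:
+ *
+ *   static int rx_refill_one(struct ena_ring *rx_ring)
+ *   {
+ *           bool is_lpc_page;
+ *           dma_addr_t dma;
+ *           struct page *page;
+ *
+ *           page = ena_lpc_get_page(rx_ring, &dma, &is_lpc_page);
+ *           if (IS_ERR(page))
+ *                   return PTR_ERR(page);
+ *           // post page/dma to the RX submission queue here;
+ *           // is_lpc_page tells the free path whether put_page()
+ *           // will hand the page back to the cache.
+ *           return 0;
+ *   }
+ */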
diff --git a/drivers/amazon/net/ena/ena_lpc.h b/drivers/amazon/net/ena/ena_lpc.h
new file mode 100644
index 0000000000000..2953eb24ac4dd
--- /dev/null
+++ b/drivers/amazon/net/ena/ena_lpc.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright 2015-2021 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#include "ena_netdev.h"
+
+/* LPC definitions */
+#define ENA_LPC_DEFAULT_MULTIPLIER 2
+#define ENA_LPC_MAX_MULTIPLIER 32
+#define ENA_LPC_MULTIPLIER_UNIT 1024
+#define ENA_LPC_MIN_NUM_OF_CHANNELS 16
+
+/* Store the DMA address along with the page */
+struct ena_page {
+ struct page *page;
+ dma_addr_t dma_addr;
+};
+
+struct ena_page_cache {
+ /* How many pages are produced */
+ u32 head;
+ /* How many of the entries were initialized */
+ u32 current_size;
+ /* Maximum number of pages the cache can hold */
+ u32 max_size;
+
+ struct ena_page cache[0];
+} ____cacheline_aligned;
+
+int ena_create_page_caches(struct ena_adapter *adapter);
+void ena_free_page_caches(struct ena_adapter *adapter);
+void ena_free_all_cache_pages(struct ena_adapter *adapter);
+struct page *ena_lpc_get_page(struct ena_ring *rx_ring, dma_addr_t *dma,
+ bool *is_lpc_page);
+bool ena_is_lpc_supported(struct ena_adapter *adapter,
+ struct ena_ring *rx_ring,
+ bool error_print);
diff --git a/drivers/amazon/net/ena/ena_netdev.c b/drivers/amazon/net/ena/ena_netdev.c
new file mode 100644
index 0000000000000..fbb96d864d8c3
--- /dev/null
+++ b/drivers/amazon/net/ena/ena_netdev.c
@@ -0,0 +1,4980 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#ifdef CONFIG_RFS_ACCEL
+#include <linux/cpu_rmap.h>
+#endif /* CONFIG_RFS_ACCEL */
+#include <linux/ethtool.h>
+#include <linux/if_vlan.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/numa.h>
+#include <linux/pci.h>
+#include <linux/utsname.h>
+#include <linux/version.h>
+#include <linux/vmalloc.h>
+#if defined(CONFIG_NET_RX_BUSY_POLL) && (LINUX_VERSION_CODE < KERNEL_VERSION(4,5,0))
+#include <net/busy_poll.h>
+#endif
+#include <net/ip.h>
+
+#include "ena_netdev.h"
+#include "ena_pci_id_tbl.h"
+#include "ena_sysfs.h"
+#include "ena_xdp.h"
+
+#include "ena_lpc.h"
+
+#include "ena_phc.h"
+#include "ena_devlink.h"
+
+static char version[] = DEVICE_NAME " v" DRV_MODULE_GENERATION "\n";
+
+MODULE_AUTHOR("Amazon.com, Inc. or its affiliates");
+MODULE_DESCRIPTION(DEVICE_NAME);
+MODULE_LICENSE("GPL");
+MODULE_VERSION(DRV_MODULE_GENERATION);
+
+/* Time in jiffies before concluding the transmitter is hung. */
+#define TX_TIMEOUT (5 * HZ)
+
+#define ENA_MAX_RINGS min_t(unsigned int, ENA_MAX_NUM_IO_QUEUES, num_possible_cpus())
+
+#define DEFAULT_MSG_ENABLE (NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_IFUP | \
+ NETIF_MSG_IFDOWN | NETIF_MSG_TX_ERR | NETIF_MSG_RX_ERR)
+#ifndef ENA_LINEAR_FRAG_SUPPORTED
+
+#define ENA_SKB_PULL_MIN_LEN 64
+#endif
+
+static int debug = -1;
+module_param(debug, int, 0444);
+MODULE_PARM_DESC(debug, "Debug level (-1=default,0=none,...,16=all)");
+
+static int rx_queue_size = ENA_DEFAULT_RING_SIZE;
+module_param(rx_queue_size, int, 0444);
+MODULE_PARM_DESC(rx_queue_size, "Rx queue size. The size should be a power of 2. Max value is 8K\n");
+
+static int force_large_llq_header = 0;
+module_param(force_large_llq_header, int, 0444);
+MODULE_PARM_DESC(force_large_llq_header, "Increases maximum supported header size in LLQ mode to 224 bytes, while reducing the maximum TX queue size by half.\n");
+
+static int num_io_queues = ENA_MAX_NUM_IO_QUEUES;
+module_param(num_io_queues, int, 0444);
+MODULE_PARM_DESC(num_io_queues, "Sets number of RX/TX queues to allocate to device.
The maximum value depends on the device and number of online CPUs.\n");
+
+static int enable_bql = 0;
+module_param(enable_bql, int, 0444);
+MODULE_PARM_DESC(enable_bql, "Enable BQL.\n");
+
+static int lpc_size = ENA_LPC_DEFAULT_MULTIPLIER;
+module_param(lpc_size, uint, 0444);
+MODULE_PARM_DESC(lpc_size, "Each local page cache (lpc) holds N * 1024 pages. This parameter sets N which is rounded up to a multiplier of 2. If zero, the page cache is disabled. Max: 32\n");
+
+static struct ena_aenq_handlers aenq_handlers;
+
+static struct workqueue_struct *ena_wq;
+
+MODULE_DEVICE_TABLE(pci, ena_pci_tbl);
+
+static int ena_rss_init_default(struct ena_adapter *adapter);
+static void check_for_admin_com_state(struct ena_adapter *adapter);
+static int ena_calc_io_queue_size(struct ena_adapter *adapter,
+ struct ena_com_dev_get_features_ctx *get_feat_ctx);
+static void ena_set_dev_offloads(struct ena_com_dev_get_features_ctx *feat,
+ struct net_device *netdev);
+
+static void ena_tx_timeout(struct net_device *dev, unsigned int txqueue)
+{
+ enum ena_regs_reset_reason_types reset_reason = ENA_REGS_RESET_OS_NETDEV_WD;
+ struct ena_adapter *adapter = netdev_priv(dev);
+ unsigned int time_since_last_napi, threshold;
+ struct ena_ring *tx_ring;
+ int napi_scheduled;
+
+ if (txqueue >= adapter->num_io_queues) {
+ netdev_err(dev, "TX timeout on invalid queue %u\n", txqueue);
+ goto schedule_reset;
+ }
+
+ threshold = jiffies_to_usecs(dev->watchdog_timeo);
+ tx_ring = &adapter->tx_ring[txqueue];
+
+ time_since_last_napi = jiffies_to_usecs(jiffies - tx_ring->tx_stats.last_napi_jiffies);
+ napi_scheduled = !!(tx_ring->napi->state & NAPIF_STATE_SCHED);
+
+ netdev_err(dev,
+ "TX q %d is paused for too long (threshold %u). Time since last napi %u usec. napi scheduled: %d\n",
+ txqueue,
+ threshold,
+ time_since_last_napi,
+ napi_scheduled);
+
+ if (threshold < time_since_last_napi && napi_scheduled) {
+ netdev_err(dev,
+ "napi handler hasn't been called for a long time but is scheduled\n");
+ reset_reason = ENA_REGS_RESET_SUSPECTED_POLL_STARVATION;
+ }
+schedule_reset:
+ /* Change the state of the device to trigger a reset.
+ * Check that we are not already in the middle of a reset and that
+ * a reset wasn't triggered before.
+ */
+ if (test_and_set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))
+ return;
+
+ ena_reset_device(adapter, reset_reason);
+ ena_increase_stat(&adapter->dev_stats.tx_timeout, 1, &adapter->syncp);
+}
+
+#ifndef HAVE_NDO_TX_TIMEOUT_STUCK_QUEUE_PARAMETER
+/* This function is called by the kernel's watchdog and indicates that the queue
+ * has been closed longer than dev->watchdog_timeo value allows.
+ * In older kernels the called function doesn't contain the id of the queue
+ * that's been closed for too long.
This helper function retrieves this + * information + */ +static void ena_find_and_timeout_queue(struct net_device *dev) +{ + struct ena_adapter *adapter = netdev_priv(dev); + unsigned long trans_start; + struct netdev_queue *txq; + unsigned int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + txq = netdev_get_tx_queue(dev, i); + trans_start = txq->trans_start; + if (netif_xmit_stopped(txq) && + time_after(jiffies, (trans_start + dev->watchdog_timeo))) { + ena_tx_timeout(dev, i); + return; + } + } + + netdev_warn(dev, "timeout was called, but no offending queue was found\n"); + + /* Change the state of the device to trigger reset + * Check that we are not in the middle or a trigger already + */ + if (test_and_set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags)) + return; + + ena_reset_device(adapter, ENA_REGS_RESET_OS_NETDEV_WD); + ena_increase_stat(&adapter->dev_stats.tx_timeout, 1, &adapter->syncp); +} + +#endif +static void update_rx_ring_mtu(struct ena_adapter *adapter, int mtu) +{ + int i; + + for (i = 0; i < adapter->num_io_queues; i++) + adapter->rx_ring[i].mtu = mtu; +} + +static int ena_change_mtu(struct net_device *dev, int new_mtu) +{ + struct ena_adapter *adapter = netdev_priv(dev); + int ret; + +#ifndef HAVE_MTU_MIN_MAX_IN_NET_DEVICE + if ((new_mtu > adapter->max_mtu) || (new_mtu < ENA_MIN_MTU)) { + netif_err(adapter, drv, dev, + "Invalid MTU setting. new_mtu: %d max mtu: %d min mtu: %d\n", + new_mtu, adapter->max_mtu, ENA_MIN_MTU); + return -EINVAL; + } +#endif + ret = ena_com_set_dev_mtu(adapter->ena_dev, new_mtu); + if (!ret) { + netif_dbg(adapter, drv, dev, "Set MTU to %d\n", new_mtu); + update_rx_ring_mtu(adapter, new_mtu); + dev->mtu = new_mtu; + } else { + netif_err(adapter, drv, dev, "Failed to set MTU to %d\n", + new_mtu); + } + + return ret; +} + +int ena_xmit_common(struct ena_adapter *adapter, + struct ena_ring *ring, + struct ena_tx_buffer *tx_info, + struct ena_com_tx_ctx *ena_tx_ctx, + u16 next_to_use, + u32 bytes) +{ + int rc, nb_hw_desc; + + if (unlikely(ena_com_is_doorbell_needed(ring->ena_com_io_sq, + ena_tx_ctx))) { + netif_dbg(adapter, tx_queued, adapter->netdev, + "llq tx max burst size of queue %d achieved, writing doorbell to send burst\n", + ring->qid); + ena_ring_tx_doorbell(ring); + } + + /* prepare the packet's descriptors to dma engine */ + rc = ena_com_prepare_tx(ring->ena_com_io_sq, ena_tx_ctx, + &nb_hw_desc); + + /* In case there isn't enough space in the queue for the packet, + * we simply drop it. All other failure reasons of + * ena_com_prepare_tx() are fatal and therefore require a device reset. 
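+	 * Only -ENOMEM (no room left in the submission queue) is treated as
+	 * recoverable below; any other error leaves the driver and the device
+	 * out of sync, which only a reset can repair.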
+ */ + if (unlikely(rc)) { + netif_err(adapter, tx_queued, adapter->netdev, + "Failed to prepare tx bufs\n"); + ena_increase_stat(&ring->tx_stats.prepare_ctx_err, 1, + &ring->syncp); + if (rc != -ENOMEM) + ena_reset_device(adapter, + ENA_REGS_RESET_DRIVER_INVALID_STATE); + return rc; + } + + u64_stats_update_begin(&ring->syncp); + ring->tx_stats.cnt++; + ring->tx_stats.bytes += bytes; + u64_stats_update_end(&ring->syncp); + + tx_info->tx_descs = nb_hw_desc; + tx_info->total_tx_size = bytes; + tx_info->last_jiffies = jiffies; + tx_info->print_once = 0; + + ring->next_to_use = ENA_TX_RING_IDX_NEXT(next_to_use, + ring->ring_size); + return 0; +} + +static int ena_init_rx_cpu_rmap(struct ena_adapter *adapter) +{ +#ifdef CONFIG_RFS_ACCEL + u32 i; + int rc; + + adapter->netdev->rx_cpu_rmap = alloc_irq_cpu_rmap(adapter->num_io_queues); + if (!adapter->netdev->rx_cpu_rmap) + return -ENOMEM; + for (i = 0; i < adapter->num_io_queues; i++) { + int irq_idx = ENA_IO_IRQ_IDX(i); + + rc = irq_cpu_rmap_add(adapter->netdev->rx_cpu_rmap, +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) + adapter->msix_entries[irq_idx].vector); +#else + pci_irq_vector(adapter->pdev, irq_idx)); +#endif + if (rc) { + free_irq_cpu_rmap(adapter->netdev->rx_cpu_rmap); + adapter->netdev->rx_cpu_rmap = NULL; + return rc; + } + } +#endif /* CONFIG_RFS_ACCEL */ + return 0; +} + +static void ena_init_io_rings_common(struct ena_adapter *adapter, + struct ena_ring *ring, u16 qid) +{ + ring->qid = qid; + ring->pdev = adapter->pdev; + ring->dev = &adapter->pdev->dev; + ring->netdev = adapter->netdev; + ring->napi = &adapter->ena_napi[qid].napi; + ring->adapter = adapter; + ring->ena_dev = adapter->ena_dev; + ring->per_napi_packets = 0; + ring->cpu = 0; + ring->numa_node = 0; + ring->no_interrupt_event_cnt = 0; + u64_stats_init(&ring->syncp); +} + +void ena_init_io_rings(struct ena_adapter *adapter, + int first_index, int count) +{ + struct ena_com_dev *ena_dev; + struct ena_ring *txr, *rxr; + int i; + + ena_dev = adapter->ena_dev; + + for (i = first_index; i < first_index + count; i++) { + txr = &adapter->tx_ring[i]; + rxr = &adapter->rx_ring[i]; + + /* TX common ring state */ + ena_init_io_rings_common(adapter, txr, i); + + /* TX specific ring state */ + txr->ring_size = adapter->requested_tx_ring_size; + txr->tx_max_header_size = ena_dev->tx_max_header_size; + txr->tx_mem_queue_type = ena_dev->tx_mem_queue_type; + txr->sgl_size = adapter->max_tx_sgl_size; + txr->enable_bql = enable_bql; + txr->smoothed_interval = + ena_com_get_nonadaptive_moderation_interval_tx(ena_dev); + txr->disable_meta_caching = adapter->disable_meta_caching; +#ifdef ENA_XDP_SUPPORT + spin_lock_init(&txr->xdp_tx_lock); +#endif + + /* Don't init RX queues for xdp queues */ + if (!ENA_IS_XDP_INDEX(adapter, i)) { + /* RX common ring state */ + ena_init_io_rings_common(adapter, rxr, i); + + /* RX specific ring state */ + rxr->ring_size = adapter->requested_rx_ring_size; + rxr->rx_copybreak = adapter->rx_copybreak; + rxr->sgl_size = adapter->max_rx_sgl_size; + rxr->smoothed_interval = + ena_com_get_nonadaptive_moderation_interval_rx(ena_dev); + rxr->empty_rx_queue = 0; + rxr->rx_headroom = NET_SKB_PAD; + adapter->ena_napi[i].dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE; +#ifdef ENA_XDP_SUPPORT + rxr->xdp_ring = &adapter->tx_ring[i + adapter->num_io_queues]; +#endif + } + } +} + +/* ena_setup_tx_resources - allocate I/O Tx resources (Descriptors) + * @adapter: network interface device structure + * @qid: queue index + * + * Return 0 on success, negative on failure + */ 
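+/* Allocation note: each per-queue array below is requested first from the
+ * NUMA node of the queue's IRQ via vzalloc_node() and, only if that fails,
+ * from any node via plain vzalloc(), trading locality for reliability.
+ */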
+static int ena_setup_tx_resources(struct ena_adapter *adapter, int qid) +{ + struct ena_ring *tx_ring = &adapter->tx_ring[qid]; + struct ena_irq *ena_irq = &adapter->irq_tbl[ENA_IO_IRQ_IDX(qid)]; + int size, i, node; + + if (tx_ring->tx_buffer_info) { + netif_err(adapter, ifup, + adapter->netdev, "tx_buffer_info info is not NULL"); + return -EEXIST; + } + + size = sizeof(struct ena_tx_buffer) * tx_ring->ring_size; + node = cpu_to_node(ena_irq->cpu); + + tx_ring->tx_buffer_info = vzalloc_node(size, node); + if (!tx_ring->tx_buffer_info) { + tx_ring->tx_buffer_info = vzalloc(size); + if (!tx_ring->tx_buffer_info) + goto err_tx_buffer_info; + } + + size = sizeof(u16) * tx_ring->ring_size; + tx_ring->free_ids = vzalloc_node(size, node); + if (!tx_ring->free_ids) { + tx_ring->free_ids = vzalloc(size); + if (!tx_ring->free_ids) + goto err_tx_free_ids; + } + + size = tx_ring->tx_max_header_size; + tx_ring->push_buf_intermediate_buf = vzalloc_node(size, node); + if (!tx_ring->push_buf_intermediate_buf) { + tx_ring->push_buf_intermediate_buf = vzalloc(size); + if (!tx_ring->push_buf_intermediate_buf) + goto err_push_buf_intermediate_buf; + } + + /* Req id ring for TX out of order completions */ + for (i = 0; i < tx_ring->ring_size; i++) + tx_ring->free_ids[i] = i; + + /* Reset tx statistics */ + memset(&tx_ring->tx_stats, 0x0, sizeof(tx_ring->tx_stats)); + + tx_ring->next_to_use = 0; + tx_ring->next_to_clean = 0; + tx_ring->cpu = ena_irq->cpu; + tx_ring->numa_node = node; + return 0; + +err_push_buf_intermediate_buf: + vfree(tx_ring->free_ids); + tx_ring->free_ids = NULL; +err_tx_free_ids: + vfree(tx_ring->tx_buffer_info); + tx_ring->tx_buffer_info = NULL; +err_tx_buffer_info: + return -ENOMEM; +} + +/* ena_free_tx_resources - Free I/O Tx Resources per Queue + * @adapter: network interface device structure + * @qid: queue index + * + * Free all transmit software resources + */ +static void ena_free_tx_resources(struct ena_adapter *adapter, int qid) +{ + struct ena_ring *tx_ring = &adapter->tx_ring[qid]; + + vfree(tx_ring->tx_buffer_info); + tx_ring->tx_buffer_info = NULL; + + vfree(tx_ring->free_ids); + tx_ring->free_ids = NULL; + + vfree(tx_ring->push_buf_intermediate_buf); + tx_ring->push_buf_intermediate_buf = NULL; +} + +int ena_setup_tx_resources_in_range(struct ena_adapter *adapter, + int first_index, int count) +{ + int i, rc = 0; + + for (i = first_index; i < first_index + count; i++) { + rc = ena_setup_tx_resources(adapter, i); + if (rc) + goto err_setup_tx; + } + + return 0; + +err_setup_tx: + + netif_err(adapter, ifup, adapter->netdev, + "Tx queue %d: allocation failed\n", i); + + /* rewind the index freeing the rings as we go */ + while (first_index < i--) + ena_free_tx_resources(adapter, i); + return rc; +} + +void ena_free_all_io_tx_resources_in_range(struct ena_adapter *adapter, + int first_index, int count) +{ + int i; + + for (i = first_index; i < first_index + count; i++) + ena_free_tx_resources(adapter, i); +} + +/* ena_free_all_io_tx_resources - Free I/O Tx Resources for All Queues + * @adapter: board private structure + * + * Free all transmit software resources + */ +void ena_free_all_io_tx_resources(struct ena_adapter *adapter) +{ + ena_free_all_io_tx_resources_in_range(adapter, + 0, + adapter->xdp_num_queues + + adapter->num_io_queues); +} + +/* ena_setup_rx_resources - allocate I/O Rx resources (Descriptors) + * @adapter: network interface device structure + * @qid: queue index + * + * Returns 0 on success, negative on failure + */ +static int ena_setup_rx_resources(struct 
ena_adapter *adapter, + u32 qid) +{ + struct ena_ring *rx_ring = &adapter->rx_ring[qid]; + struct ena_irq *ena_irq = &adapter->irq_tbl[ENA_IO_IRQ_IDX(qid)]; + int size, node, i; + + if (rx_ring->rx_buffer_info) { + netif_err(adapter, ifup, adapter->netdev, + "rx_buffer_info is not NULL"); + return -EEXIST; + } + + /* alloc extra element so in rx path + * we can always prefetch rx_info + 1 + */ + size = sizeof(struct ena_rx_buffer) * (rx_ring->ring_size + 1); + node = cpu_to_node(ena_irq->cpu); + + rx_ring->rx_buffer_info = vzalloc_node(size, node); + if (!rx_ring->rx_buffer_info) { + rx_ring->rx_buffer_info = vzalloc(size); + if (!rx_ring->rx_buffer_info) + return -ENOMEM; + } + + size = sizeof(u16) * rx_ring->ring_size; + rx_ring->free_ids = vzalloc_node(size, node); + if (!rx_ring->free_ids) { + rx_ring->free_ids = vzalloc(size); + if (!rx_ring->free_ids) { + vfree(rx_ring->rx_buffer_info); + rx_ring->rx_buffer_info = NULL; + return -ENOMEM; + } + } + + /* Req id ring for receiving RX pkts out of order */ + for (i = 0; i < rx_ring->ring_size; i++) + rx_ring->free_ids[i] = i; + + /* Reset rx statistics */ + memset(&rx_ring->rx_stats, 0x0, sizeof(rx_ring->rx_stats)); + +#ifdef ENA_BUSY_POLL_SUPPORT + ena_bp_init_lock(rx_ring); +#endif + rx_ring->next_to_clean = 0; + rx_ring->next_to_use = 0; + rx_ring->cpu = ena_irq->cpu; + rx_ring->numa_node = node; + + return 0; +} + +/* ena_free_rx_resources - Free I/O Rx Resources + * @adapter: network interface device structure + * @qid: queue index + * + * Free all receive software resources + */ +static void ena_free_rx_resources(struct ena_adapter *adapter, + u32 qid) +{ + struct ena_ring *rx_ring = &adapter->rx_ring[qid]; + + vfree(rx_ring->rx_buffer_info); + rx_ring->rx_buffer_info = NULL; + + vfree(rx_ring->free_ids); + rx_ring->free_ids = NULL; +} + +/* ena_setup_all_rx_resources - allocate I/O Rx queues resources for all queues + * @adapter: board private structure + * + * Return 0 on success, negative on failure + */ +static int ena_setup_all_rx_resources(struct ena_adapter *adapter) +{ + int i, rc = 0; + + for (i = 0; i < adapter->num_io_queues; i++) { + rc = ena_setup_rx_resources(adapter, i); + if (rc) + goto err_setup_rx; + } + + return 0; + +err_setup_rx: + + netif_err(adapter, ifup, adapter->netdev, + "Rx queue %d: allocation failed\n", i); + + /* rewind the index freeing the rings as we go */ + while (i--) + ena_free_rx_resources(adapter, i); + return rc; +} + +/* ena_free_all_io_rx_resources - Free I/O Rx Resources for All Queues + * @adapter: board private structure + * + * Free all receive software resources + */ +static void ena_free_all_io_rx_resources(struct ena_adapter *adapter) +{ + int i; + + for (i = 0; i < adapter->num_io_queues; i++) + ena_free_rx_resources(adapter, i); +} + +struct page *ena_alloc_map_page(struct ena_ring *rx_ring, dma_addr_t *dma) +{ + struct page *page; + + /* This would allocate the page on the same NUMA node the executing code + * is running on. 
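+	 * dev_alloc_page() prefers the local node but may still fall back to
+	 * a remote node under memory pressure.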
+ */ + page = dev_alloc_page(); + if (!page) { + ena_increase_stat(&rx_ring->rx_stats.page_alloc_fail, 1, + &rx_ring->syncp); + return ERR_PTR(-ENOSPC); + } + + /* To enable NIC-side port-mirroring, AKA SPAN port, + * we make the buffer readable from the nic as well + */ + *dma = dma_map_page(rx_ring->dev, page, 0, ENA_PAGE_SIZE, + DMA_BIDIRECTIONAL); + if (unlikely(dma_mapping_error(rx_ring->dev, *dma))) { + ena_increase_stat(&rx_ring->rx_stats.dma_mapping_err, 1, + &rx_ring->syncp); + __free_page(page); + return ERR_PTR(-EIO); + } + + return page; +} + +static int ena_alloc_rx_buffer(struct ena_ring *rx_ring, + struct ena_rx_buffer *rx_info) +{ + int headroom = rx_ring->rx_headroom; + struct ena_com_buf *ena_buf; + struct page *page; + dma_addr_t dma; + int tailroom; + + /* restore page offset value in case it has been changed by device */ + rx_info->buf_offset = headroom; + + /* if previous allocated page is not used */ + if (unlikely(rx_info->page)) + return 0; + + tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + ena_buf = &rx_info->ena_buf; + +#ifdef ENA_AF_XDP_SUPPORT + if (unlikely(ENA_IS_XSK_RING(rx_ring))) { + struct xdp_buff *xdp; + + xdp = xsk_buff_alloc(rx_ring->xsk_pool); + if (!xdp) + return -ENOMEM; + + ena_buf->paddr = xsk_buff_xdp_get_dma(xdp); + ena_buf->len = xsk_pool_get_rx_frame_size(rx_ring->xsk_pool); + + rx_info->xdp = xdp; + + return 0; + } +#endif /* ENA_AF_XDP_SUPPORT */ + + /* We handle DMA here */ + page = ena_lpc_get_page(rx_ring, &dma, &rx_info->is_lpc_page); + if (unlikely(IS_ERR(page))) + return PTR_ERR(page); + + netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, + "Allocate page %p, rx_info %p\n", page, rx_info); + + rx_info->page = page; + rx_info->dma_addr = dma; + rx_info->page_offset = 0; + ena_buf->paddr = dma + headroom; + ena_buf->len = ENA_PAGE_SIZE - headroom - tailroom; + + return 0; +} + +static void ena_unmap_rx_buff(struct ena_ring *rx_ring, + struct ena_rx_buffer *rx_info) +{ + /* LPC pages are unmapped at cache destruction */ + if (rx_info->is_lpc_page) + return; + + dma_unmap_page(rx_ring->dev, rx_info->dma_addr, + ENA_PAGE_SIZE, + DMA_BIDIRECTIONAL); +} + +static void ena_free_rx_page(struct ena_ring *rx_ring, + struct ena_rx_buffer *rx_info) +{ + struct page *page = rx_info->page; + + if (unlikely(!page)) { + netif_warn(rx_ring->adapter, rx_err, rx_ring->netdev, + "Trying to free unallocated buffer\n"); + return; + } + + ena_unmap_rx_buff(rx_ring, rx_info); + + __free_page(page); + rx_info->page = NULL; +} + +int ena_refill_rx_bufs(struct ena_ring *rx_ring, u32 num) +{ + u16 next_to_use, req_id; + u32 i; + int rc; + + next_to_use = rx_ring->next_to_use; + + for (i = 0; i < num; i++) { + struct ena_rx_buffer *rx_info; + + req_id = rx_ring->free_ids[next_to_use]; + + rx_info = &rx_ring->rx_buffer_info[req_id]; + + rc = ena_alloc_rx_buffer(rx_ring, rx_info); + if (unlikely(rc < 0)) { + if (!ENA_IS_XSK_RING(rx_ring)) + netif_warn(rx_ring->adapter, rx_err, rx_ring->netdev, + "Failed to allocate buffer for rx queue %d\n", + rx_ring->qid); + break; + } + rc = ena_com_add_single_rx_desc(rx_ring->ena_com_io_sq, + &rx_info->ena_buf, + req_id); + if (unlikely(rc)) { + netif_warn(rx_ring->adapter, rx_status, rx_ring->netdev, + "Failed to add buffer for rx queue %d\n", + rx_ring->qid); + break; + } + next_to_use = ENA_RX_RING_IDX_NEXT(next_to_use, + rx_ring->ring_size); + } + + if (unlikely(i < num)) { + ena_increase_stat(&rx_ring->rx_stats.refil_partial, 1, + &rx_ring->syncp); + if (!ENA_IS_XSK_RING(rx_ring)) + 
netif_warn(rx_ring->adapter, rx_err, rx_ring->netdev, + "Refilled rx qid %d with only %d buffers (from %d)\n", + rx_ring->qid, i, num); + } + + /* ena_com_write_sq_doorbell issues a wmb() */ + if (likely(i)) + ena_com_write_sq_doorbell(rx_ring->ena_com_io_sq); + + rx_ring->next_to_use = next_to_use; + + return i; +} + +static void ena_free_rx_bufs(struct ena_adapter *adapter, + u32 qid) +{ + struct ena_ring *rx_ring = &adapter->rx_ring[qid]; + u32 i; + + if (ENA_IS_XSK_RING(rx_ring)) { + ena_xdp_free_rx_bufs_zc(adapter, qid); + return; + } + + for (i = 0; i < rx_ring->ring_size; i++) { + struct ena_rx_buffer *rx_info = &rx_ring->rx_buffer_info[i]; + + if (rx_info->page) + ena_free_rx_page(rx_ring, rx_info); + } +} + +/* ena_refill_all_rx_bufs - allocate all queues Rx buffers + * @adapter: board private structure + */ +static void ena_refill_all_rx_bufs(struct ena_adapter *adapter) +{ + struct ena_ring *rx_ring; + int i, rc, bufs_num; + + for (i = 0; i < adapter->num_io_queues; i++) { + rx_ring = &adapter->rx_ring[i]; + bufs_num = rx_ring->ring_size - 1; + rc = ena_refill_rx_bufs(rx_ring, bufs_num); + + if (unlikely(rc != bufs_num)) + netif_warn(rx_ring->adapter, rx_status, rx_ring->netdev, + "Refilling Queue %d failed. allocated %d buffers from: %d\n", + i, rc, bufs_num); + } +} + +static void ena_free_all_rx_bufs(struct ena_adapter *adapter) +{ + int i; + + for (i = 0; i < adapter->num_io_queues; i++) + ena_free_rx_bufs(adapter, i); +} + +void ena_unmap_tx_buff(struct ena_ring *tx_ring, + struct ena_tx_buffer *tx_info) +{ + struct ena_com_buf *ena_buf; + u32 cnt; + int i; + + ena_buf = tx_info->bufs; + cnt = tx_info->num_of_bufs; + + if (unlikely(!cnt)) + return; + + if (tx_info->map_linear_data) { + dma_unmap_single(tx_ring->dev, + dma_unmap_addr(ena_buf, paddr), + dma_unmap_len(ena_buf, len), + DMA_TO_DEVICE); + ena_buf++; + cnt--; + } + + /* unmap remaining mapped pages */ + for (i = 0; i < cnt; i++) { + dma_unmap_page(tx_ring->dev, dma_unmap_addr(ena_buf, paddr), + dma_unmap_len(ena_buf, len), DMA_TO_DEVICE); + ena_buf++; + } +} + +/* ena_free_tx_bufs - Free Tx Buffers per Queue + * @tx_ring: TX ring for which buffers be freed + */ +static void ena_free_tx_bufs(struct ena_ring *tx_ring) +{ + bool print_once = true; + u32 i; + + for (i = 0; i < tx_ring->ring_size; i++) { + struct ena_tx_buffer *tx_info = &tx_ring->tx_buffer_info[i]; + + if (!tx_info->skb) + continue; + + if (print_once) { + netif_notice(tx_ring->adapter, ifdown, tx_ring->netdev, + "Free uncompleted tx skb qid %d idx 0x%x\n", + tx_ring->qid, i); + print_once = false; + } else { + netif_dbg(tx_ring->adapter, ifdown, tx_ring->netdev, + "Free uncompleted tx skb qid %d idx 0x%x\n", + tx_ring->qid, i); + } + + ena_unmap_tx_buff(tx_ring, tx_info); + + dev_kfree_skb_any(tx_info->skb); + } + netdev_tx_reset_queue(netdev_get_tx_queue(tx_ring->netdev, + tx_ring->qid)); +} + +static void ena_free_all_tx_bufs(struct ena_adapter *adapter) +{ + struct ena_ring *tx_ring; + int i; + + for (i = 0; i < adapter->num_io_queues + adapter->xdp_num_queues; i++) { + tx_ring = &adapter->tx_ring[i]; + if (ENA_IS_XSK_RING(tx_ring)) { + ena_xdp_free_tx_bufs_zc(tx_ring); + continue; + } + ena_free_tx_bufs(tx_ring); + } +} + +static void ena_destroy_all_tx_queues(struct ena_adapter *adapter) +{ + u16 ena_qid; + int i; + + for (i = 0; i < adapter->num_io_queues + adapter->xdp_num_queues; i++) { + ena_qid = ENA_IO_TXQ_IDX(i); + ena_com_destroy_io_queue(adapter->ena_dev, ena_qid); + } +} + +static void ena_destroy_all_rx_queues(struct ena_adapter 
*adapter) +{ + u16 ena_qid; + int i; + + for (i = 0; i < adapter->num_io_queues; i++) { + ena_qid = ENA_IO_RXQ_IDX(i); + cancel_work_sync(&adapter->ena_napi[i].dim.work); + ena_xdp_unregister_rxq_info(&adapter->rx_ring[i]); + ena_com_destroy_io_queue(adapter->ena_dev, ena_qid); + } +} + +static void ena_destroy_all_io_queues(struct ena_adapter *adapter) +{ + ena_destroy_all_tx_queues(adapter); + ena_destroy_all_rx_queues(adapter); +} + +int handle_invalid_req_id(struct ena_ring *ring, u16 req_id, + struct ena_tx_buffer *tx_info, bool is_xdp) +{ + if (tx_info) + netif_err(ring->adapter, + tx_done, + ring->netdev, + "tx_info doesn't have valid %s. qid %u req_id %u", + is_xdp ? "xdp frame" : "skb", ring->qid, req_id); + else + netif_err(ring->adapter, + tx_done, + ring->netdev, + "Invalid req_id %u in qid %u\n", + req_id, ring->qid); + + ena_increase_stat(&ring->tx_stats.bad_req_id, 1, &ring->syncp); + ena_reset_device(ring->adapter, ENA_REGS_RESET_INV_TX_REQ_ID); + + return -EFAULT; +} + +static int validate_tx_req_id(struct ena_ring *tx_ring, u16 req_id) +{ + struct ena_tx_buffer *tx_info; + + tx_info = &tx_ring->tx_buffer_info[req_id]; + if (likely(tx_info->skb)) + return 0; + + return handle_invalid_req_id(tx_ring, req_id, tx_info, false); +} + +static int ena_clean_tx_irq(struct ena_ring *tx_ring, u32 budget) +{ + struct netdev_queue *txq; + bool above_thresh; + u32 tx_bytes = 0; + u32 total_done = 0; + u16 next_to_clean; + u16 req_id; + int tx_pkts = 0; + int rc; + + next_to_clean = tx_ring->next_to_clean; + txq = netdev_get_tx_queue(tx_ring->netdev, tx_ring->qid); + + while (tx_pkts < budget) { + struct ena_tx_buffer *tx_info; + struct sk_buff *skb; + + rc = ena_com_tx_comp_req_id_get(tx_ring->ena_com_io_cq, + &req_id); + if (rc) { + if (unlikely(rc == -EINVAL)) + handle_invalid_req_id(tx_ring, req_id, NULL, + false); + break; + } + + /* validate that the request id points to a valid skb */ + rc = validate_tx_req_id(tx_ring, req_id); + if (rc) + break; + + tx_info = &tx_ring->tx_buffer_info[req_id]; + skb = tx_info->skb; + + /* prefetch skb_end_pointer() to speedup skb_shinfo(skb) */ + prefetch(&skb->end); + + tx_info->skb = NULL; + tx_info->last_jiffies = 0; + + ena_unmap_tx_buff(tx_ring, tx_info); + + netif_dbg(tx_ring->adapter, tx_done, tx_ring->netdev, + "tx_poll: q %d skb %p completed\n", tx_ring->qid, + skb); + + tx_bytes += tx_info->total_tx_size; + dev_kfree_skb(skb); + tx_pkts++; + total_done += tx_info->tx_descs; + + tx_ring->free_ids[next_to_clean] = req_id; + next_to_clean = ENA_TX_RING_IDX_NEXT(next_to_clean, + tx_ring->ring_size); + } + + tx_ring->next_to_clean = next_to_clean; + ena_com_comp_ack(tx_ring->ena_com_io_sq, total_done); + + if (tx_ring->enable_bql) + netdev_tx_completed_queue(txq, tx_pkts, tx_bytes); + + netif_dbg(tx_ring->adapter, tx_done, tx_ring->netdev, + "tx_poll: q %d done. total pkts: %d\n", + tx_ring->qid, tx_pkts); + + /* need to make the rings circular update visible to + * ena_start_xmit() before checking for netif_queue_stopped(). 
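+	 * This is the usual txq stop/wake protocol: the smp_mb() below pairs
+	 * with the barrier issued in ena_start_xmit() after stopping the
+	 * queue, so either the xmit path observes the freed descriptors or
+	 * this path observes the stopped queue and wakes it.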
+ */ + smp_mb(); + + above_thresh = ena_com_sq_have_enough_space(tx_ring->ena_com_io_sq, + ENA_TX_WAKEUP_THRESH); + if (unlikely(netif_tx_queue_stopped(txq) && above_thresh)) { + __netif_tx_lock(txq, smp_processor_id()); + above_thresh = + ena_com_sq_have_enough_space(tx_ring->ena_com_io_sq, + ENA_TX_WAKEUP_THRESH); + if (netif_tx_queue_stopped(txq) && above_thresh && + test_bit(ENA_FLAG_DEV_UP, &tx_ring->adapter->flags)) { + netif_tx_wake_queue(txq); + ena_increase_stat(&tx_ring->tx_stats.queue_wakeup, 1, + &tx_ring->syncp); + } + __netif_tx_unlock(txq); + } + + return tx_pkts; +} + +static struct sk_buff *ena_alloc_skb(struct ena_ring *rx_ring, void *first_frag, u16 len) +{ + struct sk_buff *skb; + +#ifdef ENA_LINEAR_FRAG_SUPPORTED + if (!first_frag) + skb = netdev_alloc_skb_ip_align(rx_ring->netdev, len); + else + skb = build_skb(first_frag, len); +#else + if (!first_frag) + skb = netdev_alloc_skb_ip_align(rx_ring->netdev, len); + else + skb = netdev_alloc_skb_ip_align(rx_ring->netdev, + ENA_SKB_PULL_MIN_LEN); +#endif /* ENA_LINEAR_FRAG_SUPPORTED */ + + if (unlikely(!skb)) { + ena_increase_stat(&rx_ring->rx_stats.skb_alloc_fail, 1, + &rx_ring->syncp); + + netif_dbg(rx_ring->adapter, rx_err, rx_ring->netdev, + "Failed to allocate skb. first_frag %s\n", + first_frag ? "provided" : "not provided"); + } + + return skb; +} + +static bool ena_try_rx_buf_page_reuse(struct ena_rx_buffer *rx_info, u16 buf_len, + u16 len, int pkt_offset) +{ + struct ena_com_buf *ena_buf = &rx_info->ena_buf; + + /* More than ENA_MIN_RX_BUF_SIZE left in the reused buffer + * for data + headroom + tailroom. + */ + if (SKB_DATA_ALIGN(len + pkt_offset) + ENA_MIN_RX_BUF_SIZE <= ena_buf->len) { + page_ref_inc(rx_info->page); + rx_info->page_offset += buf_len; + ena_buf->paddr += buf_len; + ena_buf->len -= buf_len; + return true; + } + + return false; +} + +static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, + struct ena_com_rx_buf_info *ena_bufs, + u32 descs, + u16 *next_to_clean) +{ + int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + bool is_xdp_loaded = ena_xdp_present_ring(rx_ring); + struct ena_rx_buffer *rx_info; + struct ena_adapter *adapter; + int page_offset, pkt_offset; + u16 len, req_id, buf = 0; + bool reuse_rx_buf_page; + struct sk_buff *skb; + void *buf_addr; + int buf_offset; + u16 buf_len; +#ifndef ENA_LINEAR_FRAG_SUPPORTED + void *data_addr; + u16 hlen; +#endif + + len = ena_bufs[buf].len; + req_id = ena_bufs[buf].req_id; + + rx_info = &rx_ring->rx_buffer_info[req_id]; + + if (unlikely(!rx_info->page)) { + adapter = rx_ring->adapter; + netif_err(adapter, rx_err, rx_ring->netdev, + "Page is NULL. 
qid %u req_id %u\n", rx_ring->qid, req_id); + ena_increase_stat(&rx_ring->rx_stats.bad_req_id, 1, &rx_ring->syncp); + ena_reset_device(adapter, ENA_REGS_RESET_INV_RX_REQ_ID); + return NULL; + } + + netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, + "rx_info %p page %p\n", + rx_info, rx_info->page); + + buf_offset = rx_info->buf_offset; + pkt_offset = buf_offset - rx_ring->rx_headroom; + page_offset = rx_info->page_offset; + buf_addr = page_address(rx_info->page) + page_offset; + + if (len <= rx_ring->rx_copybreak) { + skb = ena_alloc_skb(rx_ring, NULL, len); + if (unlikely(!skb)) + return NULL; + + /* sync this buffer for CPU use */ + dma_sync_single_for_cpu(rx_ring->dev, + dma_unmap_addr(&rx_info->ena_buf, paddr) + pkt_offset, + len, + DMA_FROM_DEVICE); + skb_copy_to_linear_data(skb, buf_addr + buf_offset, len); + dma_sync_single_for_device(rx_ring->dev, + dma_unmap_addr(&rx_info->ena_buf, paddr) + pkt_offset, + len, + DMA_FROM_DEVICE); + + skb_put(skb, len); + netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, + "RX allocated small packet. len %d.\n", skb->len); +#ifdef ENA_BUSY_POLL_SUPPORT + skb_mark_napi_id(skb, rx_ring->napi); +#endif + skb->protocol = eth_type_trans(skb, rx_ring->netdev); + rx_ring->free_ids[*next_to_clean] = req_id; + *next_to_clean = ENA_RX_RING_IDX_ADD(*next_to_clean, descs, + rx_ring->ring_size); + return skb; + } + + buf_len = SKB_DATA_ALIGN(len + buf_offset + tailroom); + + /* If XDP isn't loaded try to reuse part of the RX buffer */ + reuse_rx_buf_page = !is_xdp_loaded && + ena_try_rx_buf_page_reuse(rx_info, buf_len, len, pkt_offset); + + if (!reuse_rx_buf_page) + ena_unmap_rx_buff(rx_ring, rx_info); + + skb = ena_alloc_skb(rx_ring, buf_addr, buf_len); + if (unlikely(!skb)) + return NULL; + +#ifdef ENA_LINEAR_FRAG_SUPPORTED + /* Populate skb's linear part */ + skb_reserve(skb, buf_offset); + skb_put(skb, len); +#else + data_addr = buf_addr + buf_offset; + + /* GRO expects us to have the ethernet header in the linear part. + * Copy the first ENA_SKB_PULL_MIN_LEN bytes because it is more + * efficient. + */ + hlen = min_t(u16, len, ENA_SKB_PULL_MIN_LEN); + memcpy(__skb_put(skb, hlen), data_addr, hlen); + if (hlen < len) + skb_add_rx_frag(skb, 0, rx_info->page, + page_offset + buf_offset + hlen, + len - hlen, buf_len); +#endif + skb->protocol = eth_type_trans(skb, rx_ring->netdev); + + do { + netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, + "RX skb updated. len %d. 
data_len %d\n", + skb->len, skb->data_len); + + if (!reuse_rx_buf_page) + rx_info->page = NULL; + + rx_ring->free_ids[*next_to_clean] = req_id; + *next_to_clean = + ENA_RX_RING_IDX_NEXT(*next_to_clean, + rx_ring->ring_size); + if (likely(--descs == 0)) + break; + + buf++; + len = ena_bufs[buf].len; + req_id = ena_bufs[buf].req_id; + + rx_info = &rx_ring->rx_buffer_info[req_id]; + + /* rx_info->buf_offset includes rx_ring->rx_headroom */ + buf_offset = rx_info->buf_offset; + pkt_offset = buf_offset - rx_ring->rx_headroom; + buf_len = SKB_DATA_ALIGN(len + buf_offset + tailroom); + page_offset = rx_info->page_offset; + + reuse_rx_buf_page = !is_xdp_loaded && + ena_try_rx_buf_page_reuse(rx_info, buf_len, len, pkt_offset); + + if (!reuse_rx_buf_page) + ena_unmap_rx_buff(rx_ring, rx_info); + + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_info->page, + page_offset + buf_offset, len, buf_len); + + } while (1); + +#ifdef ENA_BUSY_POLL_SUPPORT + skb_mark_napi_id(skb, rx_ring->napi); + +#endif + return skb; +} + +/* ena_rx_checksum - indicate in skb if hw indicated a good cksum + * @adapter: structure containing adapter specific data + * @ena_rx_ctx: received packet context/metadata + * @skb: skb currently being received and modified + */ +void ena_rx_checksum(struct ena_ring *rx_ring, + struct ena_com_rx_ctx *ena_rx_ctx, + struct sk_buff *skb) +{ + /* Rx csum disabled */ + if (unlikely(!(rx_ring->netdev->features & NETIF_F_RXCSUM))) { + skb->ip_summed = CHECKSUM_NONE; + return; + } + + /* For fragmented packets the checksum isn't valid */ + if (ena_rx_ctx->frag) { + skb->ip_summed = CHECKSUM_NONE; + return; + } + + /* if IP and error */ + if (unlikely((ena_rx_ctx->l3_proto == ENA_ETH_IO_L3_PROTO_IPV4) && + (ena_rx_ctx->l3_csum_err))) { + /* ipv4 checksum error */ + skb->ip_summed = CHECKSUM_NONE; + ena_increase_stat(&rx_ring->rx_stats.csum_bad, 1, + &rx_ring->syncp); + netif_dbg(rx_ring->adapter, rx_err, rx_ring->netdev, + "RX IPv4 header checksum error\n"); + return; + } + + /* if TCP/UDP */ + if (likely((ena_rx_ctx->l4_proto == ENA_ETH_IO_L4_PROTO_TCP) || + (ena_rx_ctx->l4_proto == ENA_ETH_IO_L4_PROTO_UDP))) { + if (unlikely(ena_rx_ctx->l4_csum_err)) { + /* TCP/UDP checksum error */ + ena_increase_stat(&rx_ring->rx_stats.csum_bad, 1, + &rx_ring->syncp); + netif_dbg(rx_ring->adapter, rx_err, rx_ring->netdev, + "RX L4 checksum error\n"); + skb->ip_summed = CHECKSUM_NONE; + return; + } + + if (likely(ena_rx_ctx->l4_csum_checked)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + ena_increase_stat(&rx_ring->rx_stats.csum_good, 1, + &rx_ring->syncp); + } else { + ena_increase_stat(&rx_ring->rx_stats.csum_unchecked, 1, + &rx_ring->syncp); + skb->ip_summed = CHECKSUM_NONE; + } + } else { + skb->ip_summed = CHECKSUM_NONE; + return; + } + +} + +void ena_set_rx_hash(struct ena_ring *rx_ring, + struct ena_com_rx_ctx *ena_rx_ctx, + struct sk_buff *skb) +{ +#ifdef NETIF_F_RXHASH + enum pkt_hash_types hash_type; + + if (likely(rx_ring->netdev->features & NETIF_F_RXHASH)) { + if (likely((ena_rx_ctx->l4_proto == ENA_ETH_IO_L4_PROTO_TCP) || + (ena_rx_ctx->l4_proto == ENA_ETH_IO_L4_PROTO_UDP))) + + hash_type = PKT_HASH_TYPE_L4; + else + hash_type = PKT_HASH_TYPE_NONE; + + /* Override hash type if the packet is fragmented */ + if (ena_rx_ctx->frag) + hash_type = PKT_HASH_TYPE_NONE; + + skb_set_hash(skb, ena_rx_ctx->hash, hash_type); + } +#endif /* NETIF_F_RXHASH */ +} + +#ifdef ENA_XDP_SUPPORT +static int ena_xdp_handle_buff(struct ena_ring *rx_ring, struct xdp_buff *xdp, u16 num_descs) +{ + struct ena_rx_buffer 
*rx_info; + int ret; + + /* XDP multi-buffer packets not supported */ + if (unlikely(num_descs > 1)) { + netdev_err_once(rx_ring->adapter->netdev, + "xdp: dropped multi-buffer packets. RX packets must be < %lu\n", + ENA_XDP_MAX_MTU); + ena_increase_stat(&rx_ring->rx_stats.xdp_drop, 1, &rx_ring->syncp); + return ENA_XDP_DROP; + } + + rx_info = &rx_ring->rx_buffer_info[rx_ring->ena_bufs[0].req_id]; + xdp_prepare_buff(xdp, page_address(rx_info->page), + rx_info->buf_offset, + rx_ring->ena_bufs[0].len, false); + + ret = ena_xdp_execute(rx_ring, xdp); + + /* The xdp program might expand the headers */ + if (ret == ENA_XDP_PASS) { + rx_info->buf_offset = xdp->data - xdp->data_hard_start; + rx_ring->ena_bufs[0].len = xdp->data_end - xdp->data; + } + + return ret; +} + +#endif /* ENA_XDP_SUPPORT */ +/* ena_clean_rx_irq - Cleanup RX irq + * @rx_ring: RX ring to clean + * @napi: napi handler + * @budget: how many packets driver is allowed to clean + * + * Returns the number of cleaned buffers. + */ +static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, + u32 budget) +{ + u16 next_to_clean = rx_ring->next_to_clean; + struct ena_com_rx_ctx ena_rx_ctx; + struct ena_rx_buffer *rx_info; + struct ena_adapter *adapter; + u32 res_budget, work_done; + int rx_copybreak_pkt = 0; + int refill_threshold; + struct sk_buff *skb; + int refill_required; +#ifdef ENA_XDP_SUPPORT + struct xdp_buff xdp; + int xdp_flags = 0; +#endif /* ENA_XDP_SUPPORT */ + int total_len = 0; +#ifdef ENA_XDP_SUPPORT + int xdp_verdict; +#endif /* ENA_XDP_SUPPORT */ + int rc = 0; + int i; + + netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, + "%s qid %d\n", __func__, rx_ring->qid); + res_budget = budget; +#ifdef ENA_XDP_SUPPORT + xdp_init_buff(&xdp, ENA_PAGE_SIZE, &rx_ring->xdp_rxq); +#endif /* ENA_XDP_SUPPORT */ + + do { +#ifdef ENA_XDP_SUPPORT + xdp_verdict = ENA_XDP_PASS; + skb = NULL; +#endif /* ENA_XDP_SUPPORT */ + ena_rx_ctx.ena_bufs = rx_ring->ena_bufs; + ena_rx_ctx.max_bufs = rx_ring->sgl_size; + ena_rx_ctx.descs = 0; + ena_rx_ctx.pkt_offset = 0; + rc = ena_com_rx_pkt(rx_ring->ena_com_io_cq, + rx_ring->ena_com_io_sq, + &ena_rx_ctx); + if (unlikely(rc)) + goto error; + + if (unlikely(ena_rx_ctx.descs == 0)) + break; + + /* First descriptor might have an offset set by the device */ + rx_info = &rx_ring->rx_buffer_info[rx_ring->ena_bufs[0].req_id]; + rx_info->buf_offset += ena_rx_ctx.pkt_offset; + + netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, + "rx_poll: q %d got packet from ena. descs #: %d l3 proto %d l4 proto %d hash: %x\n", + rx_ring->qid, ena_rx_ctx.descs, ena_rx_ctx.l3_proto, + ena_rx_ctx.l4_proto, ena_rx_ctx.hash); + +#ifdef ENA_XDP_SUPPORT + if (ena_xdp_present_ring(rx_ring)) + xdp_verdict = ena_xdp_handle_buff(rx_ring, &xdp, ena_rx_ctx.descs); + + /* allocate skb and fill it */ + if (xdp_verdict == ENA_XDP_PASS) + skb = ena_rx_skb(rx_ring, + rx_ring->ena_bufs, + ena_rx_ctx.descs, + &next_to_clean); +#else + skb = ena_rx_skb(rx_ring, rx_ring->ena_bufs, ena_rx_ctx.descs, + &next_to_clean); +#endif /* ENA_XDP_SUPPORT */ + + if (unlikely(!skb)) { + for (i = 0; i < ena_rx_ctx.descs; i++) { + int req_id = rx_ring->ena_bufs[i].req_id; + + rx_ring->free_ids[next_to_clean] = req_id; + next_to_clean = + ENA_RX_RING_IDX_NEXT(next_to_clean, + rx_ring->ring_size); + +#ifdef ENA_XDP_SUPPORT + /* Packets was passed for transmission, unmap it + * from RX side. 
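+				 * The XDP program forwarded the buffer, so the
+				 * TX side now owns the page; only the RX-side
+				 * DMA mapping is released and the RX page
+				 * pointer is dropped here.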
+ */ + if (xdp_verdict & ENA_XDP_FORWARDED) { + ena_unmap_rx_buff(rx_ring, + &rx_ring->rx_buffer_info[req_id]); + rx_ring->rx_buffer_info[req_id].page = NULL; + } +#endif /* ENA_XDP_SUPPORT */ + } +#ifdef ENA_XDP_SUPPORT + if (xdp_verdict != ENA_XDP_PASS) { + xdp_flags |= xdp_verdict; + total_len += ena_rx_ctx.ena_bufs[0].len; + res_budget--; + continue; + } +#endif /* ENA_XDP_SUPPORT */ + break; + } + + ena_rx_checksum(rx_ring, &ena_rx_ctx, skb); + + ena_set_rx_hash(rx_ring, &ena_rx_ctx, skb); + + skb_record_rx_queue(skb, rx_ring->qid); + + if (rx_ring->ena_bufs[0].len <= rx_ring->rx_copybreak) + rx_copybreak_pkt++; + + total_len += skb->len; + +#ifdef ENA_BUSY_POLL_SUPPORT + if (ena_bp_busy_polling(rx_ring)) + netif_receive_skb(skb); + else + napi_gro_receive(napi, skb); +#else + napi_gro_receive(napi, skb); +#endif /* ENA_BUSY_POLL_SUPPORT */ + + res_budget--; + } while (likely(res_budget)); + + work_done = budget - res_budget; + rx_ring->per_napi_packets += work_done; + u64_stats_update_begin(&rx_ring->syncp); + rx_ring->rx_stats.bytes += total_len; + rx_ring->rx_stats.cnt += work_done; + rx_ring->rx_stats.rx_copybreak_pkt += rx_copybreak_pkt; + u64_stats_update_end(&rx_ring->syncp); + + rx_ring->next_to_clean = next_to_clean; + + refill_required = ena_com_free_q_entries(rx_ring->ena_com_io_sq); + refill_threshold = + min_t(int, rx_ring->ring_size / ENA_RX_REFILL_THRESH_DIVIDER, + ENA_RX_REFILL_THRESH_PACKET); + + /* Optimization, try to batch new rx buffers */ + if (refill_required > refill_threshold) + ena_refill_rx_bufs(rx_ring, refill_required); + +#ifdef ENA_XDP_SUPPORT + if (xdp_flags & ENA_XDP_REDIRECT) + xdp_do_flush_map(); +#endif + + return work_done; + +error: + adapter = netdev_priv(rx_ring->netdev); + + if (rc == -ENOSPC) { + ena_increase_stat(&rx_ring->rx_stats.bad_desc_num, 1, + &rx_ring->syncp); + ena_reset_device(adapter, ENA_REGS_RESET_TOO_MANY_RX_DESCS); + } else { + ena_increase_stat(&rx_ring->rx_stats.bad_req_id, 1, + &rx_ring->syncp); + ena_reset_device(adapter, ENA_REGS_RESET_INV_RX_REQ_ID); + } + return 0; +} + +static void ena_dim_work(struct work_struct *w) +{ + struct dim *dim = container_of(w, struct dim, work); + struct dim_cq_moder cur_moder = + net_dim_get_rx_moderation(dim->mode, dim->profile_ix); + struct ena_napi *ena_napi = container_of(dim, struct ena_napi, dim); + + ena_napi->rx_ring->smoothed_interval = cur_moder.usec; + dim->state = DIM_START_MEASURE; +} + +static void ena_adjust_adaptive_rx_intr_moderation(struct ena_napi *ena_napi) +{ + struct dim_sample dim_sample; + struct ena_ring *rx_ring = ena_napi->rx_ring; + + if (!rx_ring->per_napi_packets) + return; + + rx_ring->non_empty_napi_events++; + + dim_update_sample(rx_ring->non_empty_napi_events, + rx_ring->rx_stats.cnt, + rx_ring->rx_stats.bytes, + &dim_sample); + + net_dim(&ena_napi->dim, dim_sample); + + rx_ring->per_napi_packets = 0; +} + +void ena_unmask_interrupt(struct ena_ring *tx_ring, + struct ena_ring *rx_ring) +{ + struct ena_eth_io_intr_reg intr_reg; +#ifdef ENA_XDP_SUPPORT + u32 rx_interval = tx_ring->smoothed_interval; +#else + u32 rx_interval = 0; +#endif + /* Rx ring can be NULL when for XDP tx queues which don't have an + * accompanying rx_ring pair. + */ + if (rx_ring) + rx_interval = ena_com_get_adaptive_moderation_enabled(rx_ring->ena_dev) ? 
+ rx_ring->smoothed_interval : + ena_com_get_nonadaptive_moderation_interval_rx(rx_ring->ena_dev); + + /* Update intr register: rx intr delay, + * tx intr delay and interrupt unmask + */ + ena_com_update_intr_reg(&intr_reg, + rx_interval, + tx_ring->smoothed_interval, + true); + + ena_increase_stat(&tx_ring->tx_stats.unmask_interrupt, 1, + &tx_ring->syncp); + + /* It is a shared MSI-X. + * Tx and Rx CQ have pointer to it. + * So we use one of them to reach the intr reg + * The Tx ring is used because the rx_ring is NULL for XDP queues + */ + ena_com_unmask_intr(tx_ring->ena_com_io_cq, &intr_reg); +} + +void ena_update_ring_numa_node(struct ena_ring *tx_ring, + struct ena_ring *rx_ring) +{ + int cpu = get_cpu(); + int numa_node; + + /* Check only one ring since the 2 rings are running on the same cpu */ + if (likely(tx_ring->cpu == cpu)) + goto out; + + tx_ring->cpu = cpu; + if (rx_ring) + rx_ring->cpu = cpu; + + numa_node = cpu_to_node(cpu); + + if (likely(tx_ring->numa_node == numa_node)) + goto out; + + put_cpu(); + + if (numa_node != NUMA_NO_NODE) { + ena_com_update_numa_node(tx_ring->ena_com_io_cq, numa_node); + tx_ring->numa_node = numa_node; + if (rx_ring) { + rx_ring->numa_node = numa_node; + ena_com_update_numa_node(rx_ring->ena_com_io_cq, + numa_node); + } + } + + return; +out: + put_cpu(); +} + + +static int ena_io_poll(struct napi_struct *napi, int budget) +{ + struct ena_napi *ena_napi = container_of(napi, struct ena_napi, napi); + struct ena_ring *tx_ring, *rx_ring; + int tx_work_done; + int rx_work_done = 0; + int tx_budget; + int napi_comp_call = 0; + int ret; + + tx_ring = ena_napi->tx_ring; + rx_ring = ena_napi->rx_ring; + + tx_budget = tx_ring->ring_size / ENA_TX_POLL_BUDGET_DIVIDER; + + if (!test_bit(ENA_FLAG_DEV_UP, &tx_ring->adapter->flags) || + test_bit(ENA_FLAG_TRIGGER_RESET, &tx_ring->adapter->flags)) { + napi_complete_done(napi, 0); + return 0; + } +#ifdef ENA_BUSY_POLL_SUPPORT + if (!ena_bp_lock_napi(rx_ring)) + return budget; +#endif + + tx_work_done = ena_clean_tx_irq(tx_ring, tx_budget); + /* On netpoll the budget is zero and the handler should only clean the + * tx completions. + */ + if (likely(budget)) + rx_work_done = ena_clean_rx_irq(rx_ring, napi, budget); + + /* If the device is about to reset or down, avoid unmask + * the interrupt and return 0 so NAPI won't reschedule + */ + if (unlikely(!test_bit(ENA_FLAG_DEV_UP, &tx_ring->adapter->flags) || + test_bit(ENA_FLAG_TRIGGER_RESET, &tx_ring->adapter->flags))) { + napi_complete_done(napi, 0); + ret = 0; + + } else if ((budget > rx_work_done) && (tx_budget > tx_work_done)) { + napi_comp_call = 1; + + /* Update numa and unmask the interrupt only when schedule + * from the interrupt context (vs from sk_busy_loop) + */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) + if (napi_complete_done(napi, rx_work_done) && + READ_ONCE(ena_napi->interrupts_masked)) { +#else + napi_complete_done(napi, rx_work_done); + if (READ_ONCE(ena_napi->interrupts_masked)) { +#endif + smp_rmb(); /* make sure interrupts_masked is read */ + WRITE_ONCE(ena_napi->interrupts_masked, false); + /* We apply adaptive moderation on Rx path only. + * Tx uses static interrupt moderation. 
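+			 * The Rx interval itself is recomputed asynchronously
+			 * by DIM (see ena_dim_work()) from the byte and packet
+			 * counters sampled in the moderation path.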
+ */ + if (ena_com_get_adaptive_moderation_enabled(rx_ring->ena_dev)) + ena_adjust_adaptive_rx_intr_moderation(ena_napi); + + ena_update_ring_numa_node(tx_ring, rx_ring); + ena_unmask_interrupt(tx_ring, rx_ring); + } + + ret = rx_work_done; + } else { + ret = budget; + } + + u64_stats_update_begin(&tx_ring->syncp); + tx_ring->tx_stats.napi_comp += napi_comp_call; + tx_ring->tx_stats.tx_poll++; + u64_stats_update_end(&tx_ring->syncp); + +#ifdef ENA_BUSY_POLL_SUPPORT + ena_bp_unlock_napi(rx_ring); +#endif + tx_ring->tx_stats.last_napi_jiffies = jiffies; + + return ret; +} + +static irqreturn_t ena_intr_msix_mgmnt(int irq, void *data) +{ + struct ena_adapter *adapter = (struct ena_adapter *)data; + + ena_com_admin_q_comp_intr_handler(adapter->ena_dev); + + /* Don't call the aenq handler before probe is done */ + if (likely(test_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags))) + ena_com_aenq_intr_handler(adapter->ena_dev, data); + + return IRQ_HANDLED; +} + +/* ena_intr_msix_io - MSI-X Interrupt Handler for Tx/Rx + * @irq: interrupt number + * @data: pointer to a network interface private napi device structure + */ +static irqreturn_t ena_intr_msix_io(int irq, void *data) +{ + struct ena_napi *ena_napi = data; + + /* Used to check HW health */ + WRITE_ONCE(ena_napi->first_interrupt, true); + + WRITE_ONCE(ena_napi->interrupts_masked, true); + smp_wmb(); /* write interrupts_masked before calling napi */ + + napi_schedule_irqoff(&ena_napi->napi); + + return IRQ_HANDLED; +} + +/* Reserve a single MSI-X vector for management (admin + aenq). + * plus reserve one vector for each potential io queue. + * the number of potential io queues is the minimum of what the device + * supports and the number of vCPUs. + */ +static int ena_enable_msix(struct ena_adapter *adapter) +{ + int msix_vecs, irq_cnt; +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) + int i; +#endif + + if (test_bit(ENA_FLAG_MSIX_ENABLED, &adapter->flags)) { + netif_err(adapter, probe, adapter->netdev, + "Error, MSI-X is already enabled\n"); + return -EPERM; + } + + /* Reserved the max msix vectors we might need */ + msix_vecs = ENA_MAX_MSIX_VEC(adapter->max_num_io_queues); + netif_dbg(adapter, probe, adapter->netdev, + "Trying to enable MSI-X, vectors %d\n", msix_vecs); + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) + adapter->msix_entries = vzalloc(msix_vecs * sizeof(struct msix_entry)); + + if (!adapter->msix_entries) + return -ENOMEM; + + for (i = 0; i < msix_vecs; i++) + adapter->msix_entries[i].entry = i; + + irq_cnt = pci_enable_msix_range(adapter->pdev, adapter->msix_entries, + ENA_MIN_MSIX_VEC, msix_vecs); +#else + irq_cnt = pci_alloc_irq_vectors(adapter->pdev, ENA_MIN_MSIX_VEC, + msix_vecs, PCI_IRQ_MSIX); +#endif + + if (irq_cnt < 0) { + netif_err(adapter, probe, adapter->netdev, + "Failed to enable MSI-X. 
irq_cnt %d\n", irq_cnt); +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) + vfree(adapter->msix_entries); + adapter->msix_entries = NULL; +#endif + return -ENOSPC; + } + + if (irq_cnt != msix_vecs) { + netif_notice(adapter, probe, adapter->netdev, + "Enable only %d MSI-X (out of %d), reduce the number of queues\n", + irq_cnt, msix_vecs); + adapter->num_io_queues = irq_cnt - ENA_ADMIN_MSIX_VEC; + } + + if (ena_init_rx_cpu_rmap(adapter)) + netif_warn(adapter, probe, adapter->netdev, + "Failed to map IRQs to CPUs\n"); + + adapter->msix_vecs = irq_cnt; + set_bit(ENA_FLAG_MSIX_ENABLED, &adapter->flags); + + return 0; +} + +static void ena_setup_mgmnt_intr(struct ena_adapter *adapter) +{ + u32 cpu; + + snprintf(adapter->irq_tbl[ENA_MGMNT_IRQ_IDX].name, + ENA_IRQNAME_SIZE, "ena-mgmnt@pci:%s", + pci_name(adapter->pdev)); + adapter->irq_tbl[ENA_MGMNT_IRQ_IDX].handler = + ena_intr_msix_mgmnt; + adapter->irq_tbl[ENA_MGMNT_IRQ_IDX].data = adapter; + adapter->irq_tbl[ENA_MGMNT_IRQ_IDX].vector = +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) + adapter->msix_entries[ENA_MGMNT_IRQ_IDX].vector; +#else + pci_irq_vector(adapter->pdev, ENA_MGMNT_IRQ_IDX); +#endif + cpu = cpumask_first(cpu_online_mask); + adapter->irq_tbl[ENA_MGMNT_IRQ_IDX].cpu = cpu; + cpumask_set_cpu(cpu, + &adapter->irq_tbl[ENA_MGMNT_IRQ_IDX].affinity_hint_mask); +} + +static void ena_setup_io_intr(struct ena_adapter *adapter) +{ + struct net_device *netdev; + int irq_idx, i, cpu; + int io_queue_count; + + netdev = adapter->netdev; + io_queue_count = adapter->num_io_queues + adapter->xdp_num_queues; + + for (i = 0; i < io_queue_count; i++) { + irq_idx = ENA_IO_IRQ_IDX(i); + cpu = i % num_online_cpus(); + + snprintf(adapter->irq_tbl[irq_idx].name, ENA_IRQNAME_SIZE, + "%s-Tx-Rx-%d", netdev->name, i); + adapter->irq_tbl[irq_idx].handler = ena_intr_msix_io; + adapter->irq_tbl[irq_idx].data = &adapter->ena_napi[i]; + adapter->irq_tbl[irq_idx].vector = +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) + adapter->msix_entries[irq_idx].vector; +#else + pci_irq_vector(adapter->pdev, irq_idx); +#endif + adapter->irq_tbl[irq_idx].cpu = cpu; + + cpumask_set_cpu(cpu, + &adapter->irq_tbl[irq_idx].affinity_hint_mask); + } +} + +static int ena_request_mgmnt_irq(struct ena_adapter *adapter) +{ + unsigned long flags = 0; + struct ena_irq *irq; + int rc; + + irq = &adapter->irq_tbl[ENA_MGMNT_IRQ_IDX]; + rc = request_irq(irq->vector, irq->handler, flags, irq->name, + irq->data); + if (rc) { + netif_err(adapter, probe, adapter->netdev, + "Failed to request admin irq\n"); + return rc; + } + + netif_dbg(adapter, probe, adapter->netdev, + "Set affinity hint of mgmnt irq.to 0x%lx (irq vector: %d)\n", + irq->affinity_hint_mask.bits[0], irq->vector); + + return rc; +} + +static int ena_request_io_irq(struct ena_adapter *adapter) +{ + u32 io_queue_count = adapter->num_io_queues + adapter->xdp_num_queues; + unsigned long flags = 0; + struct ena_irq *irq; + int rc = 0, i, k; + + if (!test_bit(ENA_FLAG_MSIX_ENABLED, &adapter->flags)) { + netif_err(adapter, ifup, adapter->netdev, + "Failed to request I/O IRQ: MSI-X is not enabled\n"); + return -EINVAL; + } + + for (i = ENA_IO_IRQ_FIRST_IDX; i < ENA_MAX_MSIX_VEC(io_queue_count); i++) { + irq = &adapter->irq_tbl[i]; + rc = request_irq(irq->vector, irq->handler, flags, irq->name, + irq->data); + if (rc) { + netif_err(adapter, ifup, adapter->netdev, + "Failed to request I/O IRQ. index %d rc %d\n", + i, rc); + goto err; + } + + netif_dbg(adapter, ifup, adapter->netdev, + "Set affinity hint of irq. 
index %d to 0x%lx (irq vector: %d)\n", + i, irq->affinity_hint_mask.bits[0], irq->vector); + } + + return rc; + +err: + for (k = ENA_IO_IRQ_FIRST_IDX; k < i; k++) { + irq = &adapter->irq_tbl[k]; + free_irq(irq->vector, irq->data); + } + + return rc; +} + +static void ena_free_mgmnt_irq(struct ena_adapter *adapter) +{ + struct ena_irq *irq; + + irq = &adapter->irq_tbl[ENA_MGMNT_IRQ_IDX]; + synchronize_irq(irq->vector); + irq_set_affinity_hint(irq->vector, NULL); + free_irq(irq->vector, irq->data); +} + +static void ena_free_io_irq(struct ena_adapter *adapter) +{ + u32 io_queue_count = adapter->num_io_queues + adapter->xdp_num_queues; + struct ena_irq *irq; + int i; + +#ifdef CONFIG_RFS_ACCEL + if (adapter->msix_vecs >= 1) { + free_irq_cpu_rmap(adapter->netdev->rx_cpu_rmap); + adapter->netdev->rx_cpu_rmap = NULL; + } +#endif /* CONFIG_RFS_ACCEL */ + + for (i = ENA_IO_IRQ_FIRST_IDX; i < ENA_MAX_MSIX_VEC(io_queue_count); i++) { + irq = &adapter->irq_tbl[i]; + irq_set_affinity_hint(irq->vector, NULL); + free_irq(irq->vector, irq->data); + } +} + +static void ena_disable_msix(struct ena_adapter *adapter) +{ +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) + if (test_and_clear_bit(ENA_FLAG_MSIX_ENABLED, &adapter->flags)) + pci_disable_msix(adapter->pdev); + + if (adapter->msix_entries) + vfree(adapter->msix_entries); + adapter->msix_entries = NULL; +#else + if (test_and_clear_bit(ENA_FLAG_MSIX_ENABLED, &adapter->flags)) + pci_free_irq_vectors(adapter->pdev); +#endif +} + +static void ena_disable_io_intr_sync(struct ena_adapter *adapter) +{ + u32 io_queue_count = adapter->num_io_queues + adapter->xdp_num_queues; + int i; + + if (!netif_running(adapter->netdev)) + return; + + for (i = ENA_IO_IRQ_FIRST_IDX; i < ENA_MAX_MSIX_VEC(io_queue_count); i++) + synchronize_irq(adapter->irq_tbl[i].vector); +} + +static void ena_del_napi_in_range(struct ena_adapter *adapter, + int first_index, + int count) +{ + int i; + + for (i = first_index; i < first_index + count; i++) { +#ifdef ENA_BUSY_POLL_SUPPORT + napi_hash_del(&adapter->ena_napi[i].napi); +#endif /* ENA_BUSY_POLL_SUPPORT */ + netif_napi_del(&adapter->ena_napi[i].napi); + +#ifdef ENA_XDP_SUPPORT + WARN_ON(ENA_IS_XDP_INDEX(adapter, i) && + adapter->ena_napi[i].rx_ring); +#endif /* ENA_XDP_SUPPORT */ + } +#ifdef ENA_BUSY_POLL_SUPPORT + + /* Wait until all uses of napi struct complete */ + synchronize_net(); +#endif /* ENA_BUSY_POLL_SUPPORT */ +} + +static void ena_init_napi_in_range(struct ena_adapter *adapter, + int first_index, int count) +{ + int i; + int (*napi_handler)(struct napi_struct *napi, int budget); + + for (i = first_index; i < first_index + count; i++) { + struct ena_napi *napi = &adapter->ena_napi[i]; + struct ena_ring *rx_ring, *tx_ring; + + memset(napi, 0, sizeof(*napi)); + + rx_ring = &adapter->rx_ring[i]; + tx_ring = &adapter->tx_ring[i]; + + napi_handler = ena_io_poll; +#ifdef ENA_XDP_SUPPORT + if (ENA_IS_XDP_INDEX(adapter, i) || ENA_IS_XSK_RING(rx_ring)) + napi_handler = ena_xdp_io_poll; +#endif /* ENA_XDP_SUPPORT */ + + netif_napi_add(adapter->netdev, + &napi->napi, + napi_handler, + NAPI_POLL_WEIGHT); + +#ifdef ENA_BUSY_POLL_SUPPORT + napi_hash_add(&adapter->ena_napi[i].napi); + +#endif /* ENA_BUSY_POLL_SUPPORT */ + if (!ENA_IS_XDP_INDEX(adapter, i)) + napi->rx_ring = rx_ring; + + napi->tx_ring = tx_ring; + napi->qid = i; + } +} + +#ifdef ENA_BUSY_POLL_SUPPORT +static void ena_napi_disable_in_range(struct ena_adapter *adapter, + int first_index, + int count) +{ + struct ena_ring *rx_ring; + int i, timeout; + + for (i = 
first_index; i < first_index + count; i++) { + napi_disable(&adapter->ena_napi[i].napi); + + rx_ring = &adapter->rx_ring[i]; + timeout = 1000; + while (!ena_bp_disable(rx_ring)) { + netif_info(adapter, ifdown, adapter->netdev, + "Rx queue %d locked\n", i); + usleep_range(1000, 2000); + timeout--; + + if (!timeout) { + WARN(!ena_bp_disable(rx_ring), + "Unable to disable busy poll at ring %d\n", i); + break; + } + } + } +} +#else +static void ena_napi_disable_in_range(struct ena_adapter *adapter, + int first_index, + int count) +{ + int i; + + for (i = first_index; i < first_index + count; i++) + napi_disable(&adapter->ena_napi[i].napi); +} +#endif + +static void ena_napi_enable_in_range(struct ena_adapter *adapter, + int first_index, + int count) +{ + int i; + + for (i = first_index; i < first_index + count; i++) + napi_enable(&adapter->ena_napi[i].napi); +} + +/* Configure the Rx forwarding */ +static int ena_rss_configure(struct ena_adapter *adapter) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + int rc; + + /* In case the RSS table wasn't initialized by probe */ + if (!ena_dev->rss.tbl_log_size) { + rc = ena_rss_init_default(adapter); + if (rc && (rc != -EOPNOTSUPP)) { + netif_err(adapter, ifup, adapter->netdev, + "Failed to init RSS rc: %d\n", rc); + return rc; + } + } + + /* Set indirect table */ + rc = ena_com_indirect_table_set(ena_dev); + if (unlikely(rc && rc != -EOPNOTSUPP)) + return rc; + + /* Configure hash function (if supported) */ + rc = ena_com_set_hash_function(ena_dev); + if (unlikely(rc && (rc != -EOPNOTSUPP))) + return rc; + + /* Configure hash inputs (if supported) */ + rc = ena_com_set_hash_ctrl(ena_dev); + if (unlikely(rc && (rc != -EOPNOTSUPP))) + return rc; + + return 0; +} + +static int ena_up_complete(struct ena_adapter *adapter) +{ + int rc; + + rc = ena_rss_configure(adapter); + if (rc) + return rc; + + ena_change_mtu(adapter->netdev, adapter->netdev->mtu); + + ena_refill_all_rx_bufs(adapter); + + /* enable transmits */ + netif_tx_start_all_queues(adapter->netdev); + + ena_napi_enable_in_range(adapter, + 0, + adapter->xdp_num_queues + adapter->num_io_queues); + + return 0; +} + +static int ena_create_io_tx_queue(struct ena_adapter *adapter, int qid) +{ + struct ena_com_create_io_ctx ctx; + struct ena_com_dev *ena_dev; + struct ena_ring *tx_ring; + u32 msix_vector; + u16 ena_qid; + int rc; + + ena_dev = adapter->ena_dev; + + tx_ring = &adapter->tx_ring[qid]; + msix_vector = ENA_IO_IRQ_IDX(qid); + ena_qid = ENA_IO_TXQ_IDX(qid); + + memset(&ctx, 0x0, sizeof(ctx)); + + ctx.direction = ENA_COM_IO_QUEUE_DIRECTION_TX; + ctx.qid = ena_qid; + ctx.mem_queue_type = ena_dev->tx_mem_queue_type; + ctx.msix_vector = msix_vector; + ctx.queue_size = tx_ring->ring_size; + ctx.numa_node = tx_ring->numa_node; + + rc = ena_com_create_io_queue(ena_dev, &ctx); + if (rc) { + netif_err(adapter, ifup, adapter->netdev, + "Failed to create I/O TX queue num %d rc: %d\n", + qid, rc); + return rc; + } + + rc = ena_com_get_io_handlers(ena_dev, ena_qid, + &tx_ring->ena_com_io_sq, + &tx_ring->ena_com_io_cq); + if (rc) { + netif_err(adapter, ifup, adapter->netdev, + "Failed to get TX queue handlers. 
TX queue num %d rc: %d\n", + qid, rc); + ena_com_destroy_io_queue(ena_dev, ena_qid); + return rc; + } + + ena_com_update_numa_node(tx_ring->ena_com_io_cq, ctx.numa_node); + return rc; +} + +int ena_create_io_tx_queues_in_range(struct ena_adapter *adapter, + int first_index, int count) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + int rc, i; + + for (i = first_index; i < first_index + count; i++) { + rc = ena_create_io_tx_queue(adapter, i); + if (rc) + goto create_err; + } + + return 0; + +create_err: + while (i-- > first_index) + ena_com_destroy_io_queue(ena_dev, ENA_IO_TXQ_IDX(i)); + + return rc; +} + +static int ena_create_io_rx_queue(struct ena_adapter *adapter, int qid) +{ + struct ena_com_dev *ena_dev; + struct ena_com_create_io_ctx ctx; + struct ena_ring *rx_ring; + u32 msix_vector; + u16 ena_qid; + int rc; + + ena_dev = adapter->ena_dev; + + rx_ring = &adapter->rx_ring[qid]; + msix_vector = ENA_IO_IRQ_IDX(qid); + ena_qid = ENA_IO_RXQ_IDX(qid); + + memset(&ctx, 0x0, sizeof(ctx)); + + ctx.qid = ena_qid; + ctx.direction = ENA_COM_IO_QUEUE_DIRECTION_RX; + ctx.mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; + ctx.msix_vector = msix_vector; + ctx.queue_size = rx_ring->ring_size; + ctx.numa_node = rx_ring->numa_node; + + rc = ena_com_create_io_queue(ena_dev, &ctx); + if (rc) { + netif_err(adapter, ifup, adapter->netdev, + "Failed to create I/O RX queue num %d rc: %d\n", + qid, rc); + return rc; + } + + rc = ena_com_get_io_handlers(ena_dev, ena_qid, + &rx_ring->ena_com_io_sq, + &rx_ring->ena_com_io_cq); + if (rc) { + netif_err(adapter, ifup, adapter->netdev, + "Failed to get RX queue handlers. RX queue num %d rc: %d\n", + qid, rc); + goto err; + } + + ena_com_update_numa_node(rx_ring->ena_com_io_cq, ctx.numa_node); + + return rc; +err: + ena_com_destroy_io_queue(ena_dev, ena_qid); + return rc; +} + +static int ena_create_all_io_rx_queues(struct ena_adapter *adapter) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + int rc, i; + + for (i = 0; i < adapter->num_io_queues; i++) { + rc = ena_create_io_rx_queue(adapter, i); + if (rc) + goto create_err; + INIT_WORK(&adapter->ena_napi[i].dim.work, ena_dim_work); + + ena_xdp_register_rxq_info(&adapter->rx_ring[i]); + } + + return 0; + +create_err: + while (i--) { + ena_xdp_unregister_rxq_info(&adapter->rx_ring[i]); + cancel_work_sync(&adapter->ena_napi[i].dim.work); + ena_com_destroy_io_queue(ena_dev, ENA_IO_RXQ_IDX(i)); + } + + return rc; +} + +static void set_io_rings_size(struct ena_adapter *adapter, + int new_tx_size, + int new_rx_size) +{ + int i; + + for (i = 0; i < adapter->num_io_queues; i++) { + adapter->tx_ring[i].ring_size = new_tx_size; + adapter->rx_ring[i].ring_size = new_rx_size; + } +} + +/* This function allows queue allocation to backoff when the system is + * low on memory. If there is not enough memory to allocate io queues + * the driver will try to allocate smaller queues. + * + * The backoff algorithm is as follows: + * 1. Try to allocate TX and RX and if successful. + * 1.1. return success + * + * 2. Divide by 2 the size of the larger of RX and TX queues (or both if their size is the same). + * + * 3. If TX or RX is smaller than 256 + * 3.1. return failure. + * 4. else + * 4.1. go back to 1. + */ +static int create_queues_with_size_backoff(struct ena_adapter *adapter) +{ + int rc, cur_rx_ring_size, cur_tx_ring_size; + int new_rx_ring_size, new_tx_ring_size; + + /* current queue sizes might be set to smaller than the requested + * ones due to past queue allocation failures. 
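+	 * Start every attempt from the requested sizes so that an earlier
+	 * transient memory shortage does not permanently shrink the rings.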
+ */ + set_io_rings_size(adapter, adapter->requested_tx_ring_size, + adapter->requested_rx_ring_size); + + while (1) { +#ifdef ENA_XDP_SUPPORT + if (ena_xdp_present(adapter)) { + rc = ena_setup_and_create_all_xdp_queues(adapter); + + if (rc) + goto err_setup_tx; + } +#endif /* ENA_XDP_SUPPORT */ + rc = ena_setup_tx_resources_in_range(adapter, + 0, + adapter->num_io_queues); + if (rc) + goto err_setup_tx; + + rc = ena_create_io_tx_queues_in_range(adapter, + 0, + adapter->num_io_queues); + if (rc) + goto err_create_tx_queues; + + rc = ena_setup_all_rx_resources(adapter); + if (rc) + goto err_setup_rx; + + rc = ena_create_all_io_rx_queues(adapter); + if (rc) + goto err_create_rx_queues; + + rc = ena_create_page_caches(adapter); + if (rc) /* Cache memory is freed in case of failure */ + goto err_create_rx_queues; + + return 0; + +err_create_rx_queues: + ena_free_all_io_rx_resources(adapter); +err_setup_rx: + ena_destroy_all_tx_queues(adapter); +err_create_tx_queues: + ena_free_all_io_tx_resources(adapter); +err_setup_tx: + if (rc != -ENOMEM) { + netif_err(adapter, ifup, adapter->netdev, + "Queue creation failed with error code %d\n", + rc); + return rc; + } + + cur_tx_ring_size = adapter->tx_ring[0].ring_size; + cur_rx_ring_size = adapter->rx_ring[0].ring_size; + + netif_err(adapter, ifup, adapter->netdev, + "Not enough memory to create queues with sizes TX=%d, RX=%d\n", + cur_tx_ring_size, cur_rx_ring_size); + + new_tx_ring_size = cur_tx_ring_size; + new_rx_ring_size = cur_rx_ring_size; + + /* Decrease the size of the larger queue, or + * decrease both if they are the same size. + */ + if (cur_rx_ring_size <= cur_tx_ring_size) + new_tx_ring_size = cur_tx_ring_size / 2; + if (cur_rx_ring_size >= cur_tx_ring_size) + new_rx_ring_size = cur_rx_ring_size / 2; + + if (new_tx_ring_size < ENA_MIN_RING_SIZE || + new_rx_ring_size < ENA_MIN_RING_SIZE) { + netif_err(adapter, ifup, adapter->netdev, + "Queue creation failed with the smallest possible queue size of %d for both queues. 
Not retrying with smaller queues\n", + ENA_MIN_RING_SIZE); + return rc; + } + + netif_err(adapter, ifup, adapter->netdev, + "Retrying queue creation with sizes TX=%d, RX=%d\n", + new_tx_ring_size, + new_rx_ring_size); + + set_io_rings_size(adapter, new_tx_ring_size, + new_rx_ring_size); + } +} + +int ena_up(struct ena_adapter *adapter) +{ + int io_queue_count, rc, i; + + netif_dbg(adapter, ifup, adapter->netdev, "%s\n", __func__); + + io_queue_count = adapter->num_io_queues + adapter->xdp_num_queues; + ena_setup_io_intr(adapter); + + /* napi poll functions should be initialized before running + * request_irq(), to handle a rare condition where there is a pending + * interrupt, causing the ISR to fire immediately while the poll + * function wasn't set yet, causing a null dereference + */ + ena_init_napi_in_range(adapter, 0, io_queue_count); + +#ifdef CONFIG_ARM64 + /* enable DIM by default on ARM machines, also needs to happen + * before enabling IRQs since DIM is ran from napi routine + */ + if (ena_com_interrupt_moderation_supported(adapter->ena_dev)) + ena_com_enable_adaptive_moderation(adapter->ena_dev); + +#endif + rc = ena_request_io_irq(adapter); + if (rc) + goto err_req_irq; + + rc = create_queues_with_size_backoff(adapter); + if (rc) + goto err_create_queues_with_backoff; + + rc = ena_up_complete(adapter); + if (rc) + goto err_up; + + if (test_bit(ENA_FLAG_LINK_UP, &adapter->flags)) + netif_carrier_on(adapter->netdev); + + ena_increase_stat(&adapter->dev_stats.interface_up, 1, + &adapter->syncp); + + set_bit(ENA_FLAG_DEV_UP, &adapter->flags); + + /* Enable completion queues interrupt */ + for (i = 0; i < adapter->num_io_queues; i++) + ena_unmask_interrupt(&adapter->tx_ring[i], + &adapter->rx_ring[i]); + + /* schedule napi in case we had pending packets + * from the last time we disable napi + */ + for (i = 0; i < io_queue_count; i++) + napi_schedule(&adapter->ena_napi[i].napi); + + return rc; + +err_up: + ena_free_page_caches(adapter); + ena_destroy_all_tx_queues(adapter); + ena_free_all_io_tx_resources(adapter); + ena_destroy_all_rx_queues(adapter); + ena_free_all_io_rx_resources(adapter); +err_create_queues_with_backoff: + ena_free_io_irq(adapter); +err_req_irq: + ena_del_napi_in_range(adapter, 0, io_queue_count); + + return rc; +} + +void ena_down(struct ena_adapter *adapter) +{ + int io_queue_count = adapter->num_io_queues + adapter->xdp_num_queues; + + netif_dbg(adapter, ifdown, adapter->netdev, "%s\n", __func__); + + clear_bit(ENA_FLAG_DEV_UP, &adapter->flags); + + ena_increase_stat(&adapter->dev_stats.interface_down, 1, + &adapter->syncp); + + netif_carrier_off(adapter->netdev); + netif_tx_disable(adapter->netdev); + + /* After this point the napi handler won't enable the tx queue */ + ena_napi_disable_in_range(adapter, 0, io_queue_count); + + if (test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags)) { + int rc; + + rc = ena_com_dev_reset(adapter->ena_dev, adapter->reset_reason); + if (rc) + netif_err(adapter, ifdown, adapter->netdev, + "Device reset failed\n"); + /* stop submitting admin commands on a device that was reset */ + ena_com_set_admin_running_state(adapter->ena_dev, false); + } + + ena_destroy_all_io_queues(adapter); + + ena_disable_io_intr_sync(adapter); + ena_free_io_irq(adapter); + ena_del_napi_in_range(adapter, 0, io_queue_count); + + ena_free_all_tx_bufs(adapter); + ena_free_all_rx_bufs(adapter); + ena_free_all_cache_pages(adapter); + ena_free_page_caches(adapter); + ena_free_all_io_tx_resources(adapter); + ena_free_all_io_rx_resources(adapter); +} + +/* 
ena_open - Called when a network interface is made active + * @netdev: network interface device structure + * + * Returns 0 on success, negative value on failure + * + * The open entry point is called when a network interface is made + * active by the system (IFF_UP). At this point all resources needed + * for transmit and receive operations are allocated, the interrupt + * handler is registered with the OS, the watchdog timer is started, + * and the stack is notified that the interface is ready. + */ +static int ena_open(struct net_device *netdev) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + int rc; + + /* Notify the stack of the actual queue counts. */ + rc = netif_set_real_num_tx_queues(netdev, adapter->num_io_queues); + if (rc) { + netif_err(adapter, ifup, netdev, "Can't set num tx queues\n"); + return rc; + } + + rc = netif_set_real_num_rx_queues(netdev, adapter->num_io_queues); + if (rc) { + netif_err(adapter, ifup, netdev, "Can't set num rx queues\n"); + return rc; + } + + rc = ena_up(adapter); + if (rc) + return rc; + + return rc; +} + +/* ena_close - Disables a network interface + * @netdev: network interface device structure + * + * Returns 0, this is not allowed to fail + * + * The close entry point is called when an interface is de-activated + * by the OS. The hardware is still under the drivers control, but + * needs to be disabled. A global MAC reset is issued to stop the + * hardware, and all transmit and receive resources are freed. + */ +static int ena_close(struct net_device *netdev) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + netif_dbg(adapter, ifdown, netdev, "%s\n", __func__); + + if (!test_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags)) + return 0; + + if (test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) + ena_down(adapter); + + /* Check for device status and issue reset if needed*/ + check_for_admin_com_state(adapter); + if (unlikely(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))) { + netif_err(adapter, ifdown, adapter->netdev, + "Destroy failure, restarting device\n"); + ena_dump_stats_to_dmesg(adapter); + /* rtnl lock already obtained in dev_ioctl() layer */ + ena_destroy_device(adapter, false); + ena_restore_device(adapter); + } + + return 0; +} + +int ena_set_lpc_state(struct ena_adapter *adapter, bool enabled) +{ + /* In XDP, lpc_size might be positive even with LPC disabled, use cache + * pointer instead. + */ + struct ena_page_cache *page_cache = adapter->rx_ring->page_cache; + + /* Exit early if LPC state doesn't change */ + if (enabled == !!page_cache) + return 0; + + if (enabled && !ena_is_lpc_supported(adapter, adapter->rx_ring, true)) + return -EOPNOTSUPP; + + adapter->used_lpc_size = enabled ? adapter->configured_lpc_size : 0; + + /* rtnl lock is already obtained in dev_ioctl() layer, so it's safe to + * re-initialize IO resources. + */ + if (test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) { + ena_close(adapter->netdev); + ena_up(adapter); + } + + return 0; +} + +int ena_update_queue_sizes(struct ena_adapter *adapter, + u32 new_tx_size, + u32 new_rx_size) +{ + bool dev_was_up; + + dev_was_up = test_bit(ENA_FLAG_DEV_UP, &adapter->flags); + ena_close(adapter->netdev); + adapter->requested_tx_ring_size = new_tx_size; + adapter->requested_rx_ring_size = new_rx_size; + ena_init_io_rings(adapter, + 0, + adapter->xdp_num_queues + + adapter->num_io_queues); + return dev_was_up ? 
ena_up(adapter) : 0; +} + +int ena_set_rx_copybreak(struct ena_adapter *adapter, u32 rx_copybreak) +{ + struct ena_ring *rx_ring; + int i; + + if (rx_copybreak > min_t(u16, adapter->netdev->mtu, ENA_PAGE_SIZE)) + return -EINVAL; + + adapter->rx_copybreak = rx_copybreak; + + for (i = 0; i < adapter->num_io_queues; i++) { + rx_ring = &adapter->rx_ring[i]; + rx_ring->rx_copybreak = rx_copybreak; + } + + return 0; +} + +int ena_update_queue_count(struct ena_adapter *adapter, u32 new_channel_count) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; +#ifdef ENA_XDP_SUPPORT + int prev_channel_count; +#endif /* ENA_XDP_SUPPORT */ + bool dev_was_up; + + dev_was_up = test_bit(ENA_FLAG_DEV_UP, &adapter->flags); + ena_close(adapter->netdev); +#ifdef ENA_XDP_SUPPORT + prev_channel_count = adapter->num_io_queues; +#endif /* ENA_XDP_SUPPORT */ + adapter->num_io_queues = new_channel_count; +#ifdef ENA_XDP_SUPPORT + if (ena_xdp_present(adapter) && + ena_xdp_allowed(adapter) == ENA_XDP_ALLOWED) { + adapter->xdp_first_ring = new_channel_count; + adapter->xdp_num_queues = new_channel_count; + if (prev_channel_count > new_channel_count) + ena_xdp_exchange_program_rx_in_range(adapter, + NULL, + new_channel_count, + prev_channel_count); + else + ena_xdp_exchange_program_rx_in_range(adapter, + adapter->xdp_bpf_prog, + prev_channel_count, + new_channel_count); + } +#endif /* ENA_XDP_SUPPORT */ + + /* We need to destroy the rss table so that the indirection + * table will be reinitialized by ena_up() + */ + ena_com_rss_destroy(ena_dev); + ena_init_io_rings(adapter, + 0, + adapter->xdp_num_queues + + adapter->num_io_queues); + return dev_was_up ? ena_open(adapter->netdev) : 0; +} + +static void ena_tx_csum(struct ena_com_tx_ctx *ena_tx_ctx, + struct sk_buff *skb, + bool disable_meta_caching) +{ + u32 mss = skb_shinfo(skb)->gso_size; + struct ena_com_tx_meta *ena_meta = &ena_tx_ctx->ena_meta; + u8 l4_protocol = 0; + + if ((skb->ip_summed == CHECKSUM_PARTIAL) || mss) { + ena_tx_ctx->l4_csum_enable = 1; + if (mss) { + ena_tx_ctx->tso_enable = 1; + ena_meta->l4_hdr_len = tcp_hdr(skb)->doff; + ena_tx_ctx->l4_csum_partial = 0; + } else { + ena_tx_ctx->tso_enable = 0; + ena_meta->l4_hdr_len = 0; + ena_tx_ctx->l4_csum_partial = 1; + } + + switch (ip_hdr(skb)->version) { + case IPVERSION: + ena_tx_ctx->l3_proto = ENA_ETH_IO_L3_PROTO_IPV4; + if (ip_hdr(skb)->frag_off & htons(IP_DF)) + ena_tx_ctx->df = 1; + if (mss) + ena_tx_ctx->l3_csum_enable = 1; + l4_protocol = ip_hdr(skb)->protocol; + break; + case 6: + ena_tx_ctx->l3_proto = ENA_ETH_IO_L3_PROTO_IPV6; + l4_protocol = ipv6_hdr(skb)->nexthdr; + break; + default: + break; + } + + if (l4_protocol == IPPROTO_TCP) + ena_tx_ctx->l4_proto = ENA_ETH_IO_L4_PROTO_TCP; + else + ena_tx_ctx->l4_proto = ENA_ETH_IO_L4_PROTO_UDP; + + ena_meta->mss = mss; + ena_meta->l3_hdr_len = skb_network_header_len(skb); + ena_meta->l3_hdr_offset = skb_network_offset(skb); + ena_tx_ctx->meta_valid = 1; + } else if (disable_meta_caching) { + memset(ena_meta, 0, sizeof(*ena_meta)); + ena_tx_ctx->meta_valid = 1; + } else { + ena_tx_ctx->meta_valid = 0; + } +} + +static int ena_check_and_linearize_skb(struct ena_ring *tx_ring, + struct sk_buff *skb) +{ + int num_frags, header_len, rc; + + num_frags = skb_shinfo(skb)->nr_frags; + header_len = skb_headlen(skb); + + if (num_frags < tx_ring->sgl_size) + return 0; + + if ((num_frags == tx_ring->sgl_size) && + (header_len < tx_ring->tx_max_header_size)) + return 0; + + ena_increase_stat(&tx_ring->tx_stats.linearize, 1, &tx_ring->syncp); + + rc = 
skb_linearize(skb); + if (unlikely(rc)) { + ena_increase_stat(&tx_ring->tx_stats.linearize_failed, 1, + &tx_ring->syncp); + } + + return rc; +} + +static int ena_tx_map_skb(struct ena_ring *tx_ring, + struct ena_tx_buffer *tx_info, + struct sk_buff *skb, + void **push_hdr, + u16 *header_len) +{ + struct ena_adapter *adapter = tx_ring->adapter; + struct ena_com_buf *ena_buf; + dma_addr_t dma; + u32 skb_head_len, frag_len, last_frag; + u16 push_len = 0; + u16 delta = 0; + int i = 0; + + skb_head_len = skb_headlen(skb); + tx_info->skb = skb; + ena_buf = tx_info->bufs; + + if (tx_ring->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { + /* When the device is LLQ mode, the driver will copy + * the header into the device memory space. + * the ena_com layer assume the header is in a linear + * memory space. + * This assumption might be wrong since part of the header + * can be in the fragmented buffers. + * Use skb_header_pointer to make sure the header is in a + * linear memory space. + */ + + push_len = min_t(u32, skb->len, tx_ring->tx_max_header_size); + *push_hdr = skb_header_pointer(skb, 0, push_len, + tx_ring->push_buf_intermediate_buf); + *header_len = push_len; + if (unlikely(skb->data != *push_hdr)) { + ena_increase_stat(&tx_ring->tx_stats.llq_buffer_copy, 1, + &tx_ring->syncp); + + delta = push_len - skb_head_len; + } + } else { + *push_hdr = NULL; + *header_len = min_t(u32, skb_head_len, + tx_ring->tx_max_header_size); + } + + netif_dbg(adapter, tx_queued, adapter->netdev, + "skb: %p header_buf->vaddr: %p push_len: %d\n", skb, + *push_hdr, push_len); + + if (skb_head_len > push_len) { + dma = dma_map_single(tx_ring->dev, skb->data + push_len, + skb_head_len - push_len, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(tx_ring->dev, dma))) + goto error_report_dma_error; + + ena_buf->paddr = dma; + ena_buf->len = skb_head_len - push_len; + + ena_buf++; + tx_info->num_of_bufs++; + tx_info->map_linear_data = 1; + } else { + tx_info->map_linear_data = 0; + } + + last_frag = skb_shinfo(skb)->nr_frags; + + for (i = 0; i < last_frag; i++) { + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + frag_len = skb_frag_size(frag); + + if (unlikely(delta >= frag_len)) { + delta -= frag_len; + continue; + } + + dma = skb_frag_dma_map(tx_ring->dev, frag, delta, + frag_len - delta, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(tx_ring->dev, dma))) + goto error_report_dma_error; + + ena_buf->paddr = dma; + ena_buf->len = frag_len - delta; + ena_buf++; + tx_info->num_of_bufs++; + delta = 0; + } + + return 0; + +error_report_dma_error: + ena_increase_stat(&tx_ring->tx_stats.dma_mapping_err, 1, + &tx_ring->syncp); + netif_warn(adapter, tx_queued, adapter->netdev, "Failed to map skb\n"); + + tx_info->skb = NULL; + + tx_info->num_of_bufs += i; + ena_unmap_tx_buff(tx_ring, tx_info); + + return -EINVAL; +} + +/* Called with netif_tx_lock. 
*/
+static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ena_adapter *adapter = netdev_priv(dev);
+ struct ena_tx_buffer *tx_info;
+ struct ena_com_tx_ctx ena_tx_ctx;
+ struct ena_ring *tx_ring;
+ struct netdev_queue *txq;
+ void *push_hdr;
+ u16 next_to_use, req_id, header_len;
+ int qid, rc;
+
+ netif_dbg(adapter, tx_queued, dev, "%s skb %p\n", __func__, skb);
+ /* Determine which tx ring we will be placed on */
+ qid = skb_get_queue_mapping(skb);
+ tx_ring = &adapter->tx_ring[qid];
+ txq = netdev_get_tx_queue(dev, qid);
+
+ rc = ena_check_and_linearize_skb(tx_ring, skb);
+ if (unlikely(rc))
+ goto error_drop_packet;
+
+ next_to_use = tx_ring->next_to_use;
+ req_id = tx_ring->free_ids[next_to_use];
+ tx_info = &tx_ring->tx_buffer_info[req_id];
+ tx_info->num_of_bufs = 0;
+
+ WARN(tx_info->skb, "SKB isn't NULL req_id %d\n", req_id);
+
+ rc = ena_tx_map_skb(tx_ring, tx_info, skb, &push_hdr, &header_len);
+ if (unlikely(rc))
+ goto error_drop_packet;
+
+ memset(&ena_tx_ctx, 0x0, sizeof(struct ena_com_tx_ctx));
+ ena_tx_ctx.ena_bufs = tx_info->bufs;
+ ena_tx_ctx.push_header = push_hdr;
+ ena_tx_ctx.num_bufs = tx_info->num_of_bufs;
+ ena_tx_ctx.req_id = req_id;
+ ena_tx_ctx.header_len = header_len;
+
+ /* Set flags and metadata */
+ ena_tx_csum(&ena_tx_ctx, skb, tx_ring->disable_meta_caching);
+
+ rc = ena_xmit_common(adapter,
+ tx_ring,
+ tx_info,
+ &ena_tx_ctx,
+ next_to_use,
+ skb->len);
+ if (rc)
+ goto error_unmap_dma;
+
+ if (tx_ring->enable_bql)
+ netdev_tx_sent_queue(txq, skb->len);
+
+ /* Stop the queue when no more space is available. The packet can have
+ * up to sgl_size + 2 descriptors: one for the meta descriptor and one
+ * for the header (if the header is larger than tx_max_header_size).
+ */
+ if (unlikely(!ena_com_sq_have_enough_space(tx_ring->ena_com_io_sq,
+ tx_ring->sgl_size + 2))) {
+ netif_dbg(adapter, tx_queued, dev, "%s stop queue %d\n",
+ __func__, qid);
+
+ netif_tx_stop_queue(txq);
+ ena_increase_stat(&tx_ring->tx_stats.queue_stop, 1,
+ &tx_ring->syncp);
+
+ /* There is a rare condition where this function decides to
+ * stop the queue but meanwhile clean_tx_irq updates
+ * next_to_completion and terminates.
+ * The queue would then remain stopped forever.
+ * To solve this issue add a memory barrier to make sure that
+ * the netif_tx_stop_queue() write is visible before checking
+ * whether there is additional space in the queue.
+ */
+ smp_mb();
+
+ if (ena_com_sq_have_enough_space(tx_ring->ena_com_io_sq,
+ ENA_TX_WAKEUP_THRESH)) {
+ netif_tx_wake_queue(txq);
+ ena_increase_stat(&tx_ring->tx_stats.queue_wakeup, 1,
+ &tx_ring->syncp);
+ }
+ }
+
+ skb_tx_timestamp(skb);
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0)
+#ifdef HAVE_NETDEV_XMIT_MORE
+ if (netif_xmit_stopped(txq) || !netdev_xmit_more())
+#else
+ if (netif_xmit_stopped(txq) || !skb->xmit_more)
+#endif /* HAVE_NETDEV_XMIT_MORE */
+#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) */
+ /* Trigger the DMA engine. ena_ring_tx_doorbell()
+ * calls a memory barrier inside it.
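+ * When the stack signals that more packets are on the way
+ * (xmit_more), ringing the doorbell is deferred so several
+ * descriptors can be posted per MMIO write; roughly speaking,
+ * this trades a little latency for fewer doorbells.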
+ */
+ ena_ring_tx_doorbell(tx_ring);
+
+ return NETDEV_TX_OK;
+
+error_unmap_dma:
+ ena_unmap_tx_buff(tx_ring, tx_info);
+ tx_info->skb = NULL;
+
+error_drop_packet:
+ dev_kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+
+#if defined HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V3
+static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb,
+ struct net_device *sb_dev)
+#elif defined HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V2
+static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb,
+ struct net_device *sb_dev,
+ select_queue_fallback_t fallback)
+#elif defined HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V1
+static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb,
+ void *accel_priv,
+ select_queue_fallback_t fallback)
+#elif defined HAVE_NDO_SELECT_QUEUE_ACCEL
+/* Return subqueue id on this core (one per core). */
+static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb,
+ void *accel_priv)
+#else
+static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb)
+#endif
+{
+ u16 qid;
+ /* We suspect that this is good for in-kernel network services that
+ * want to loop incoming skb Rx to Tx in normal user-generated traffic;
+ * most probably we will not get here.
+ */
+ if (skb_rx_queue_recorded(skb))
+ qid = skb_get_rx_queue(skb);
+ else
+#if (defined HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V3)
+ qid = netdev_pick_tx(dev, skb, NULL);
+#elif (defined HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V2)
+ qid = fallback(dev, skb, NULL);
+#elif (defined HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V1)
+ qid = fallback(dev, skb);
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,9,0)
+ qid = __netdev_pick_tx(dev, skb);
+#else
+ qid = skb_tx_hash(dev, skb);
+#endif /* HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V3 */
+
+ return qid;
+}
+#ifdef HAVE_SET_RX_MODE
+
+/* Unicast, Multicast and Promiscuous mode set
+ * @netdev: network interface device structure
+ *
+ * The set_rx_mode entry point is called whenever the unicast or multicast
+ * address lists or the network interface flags are updated. This routine is
+ * responsible for configuring the hardware for proper unicast, multicast,
+ * promiscuous mode, and all-multi behavior.
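+ *
+ * The body below is still a stub. A full implementation would
+ * presumably translate IFF_PROMISC/IFF_ALLMULTI and the netdev
+ * multicast list into the corresponding device filter admin
+ * commands; no such ena_com API is assumed to exist here.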
+ */ +static void ena_set_rx_mode(struct net_device *netdev) +{ +/* struct ena_adapter *adapter = netdev_priv(netdev); */ + /* TODO set Rx mode */ + + if (netdev->flags & IFF_PROMISC) { + } else if (netdev->flags & IFF_ALLMULTI) { + } else if (netdev_mc_empty(netdev)) { + } else { + } +} +#endif /* HAVE_SET_RX_MODE */ + +static void ena_config_host_info(struct ena_com_dev *ena_dev, struct pci_dev *pdev) +{ + struct device *dev = &pdev->dev; + struct ena_admin_host_info *host_info; + int rc; + + /* Allocate only the host info */ + rc = ena_com_allocate_host_info(ena_dev); + if (rc) { + dev_err(dev, "Cannot allocate host info\n"); + return; + } + + host_info = ena_dev->host_attr.host_info; + + host_info->bdf = (pdev->bus->number << 8) | pdev->devfn; + host_info->os_type = ENA_ADMIN_OS_LINUX; + host_info->kernel_ver = LINUX_VERSION_CODE; + strlcpy(host_info->kernel_ver_str, utsname()->version, + sizeof(host_info->kernel_ver_str) - 1); + host_info->os_dist = 0; + strncpy(host_info->os_dist_str, utsname()->release, + sizeof(host_info->os_dist_str) - 1); + host_info->driver_version = + (DRV_MODULE_GEN_MAJOR) | + (DRV_MODULE_GEN_MINOR << ENA_ADMIN_HOST_INFO_MINOR_SHIFT) | + (DRV_MODULE_GEN_SUBMINOR << ENA_ADMIN_HOST_INFO_SUB_MINOR_SHIFT) | + ("g"[0] << ENA_ADMIN_HOST_INFO_MODULE_TYPE_SHIFT); + host_info->num_cpus = num_online_cpus(); + + host_info->driver_supported_features = + ENA_ADMIN_HOST_INFO_RX_OFFSET_MASK | + ENA_ADMIN_HOST_INFO_INTERRUPT_MODERATION_MASK | + ENA_ADMIN_HOST_INFO_RX_BUF_MIRRORING_MASK | + ENA_ADMIN_HOST_INFO_RSS_CONFIGURABLE_FUNCTION_KEY_MASK | + ENA_ADMIN_HOST_INFO_RX_PAGE_REUSE_MASK; + + rc = ena_com_set_host_attributes(ena_dev); + if (rc) { + if (rc == -EOPNOTSUPP) + dev_warn(dev, "Cannot set host attributes\n"); + else + dev_err(dev, "Cannot set host attributes\n"); + + goto err; + } + + return; + +err: + ena_com_delete_host_info(ena_dev); +} + +static void ena_config_debug_area(struct ena_adapter *adapter) +{ + u32 debug_area_size; + int rc, ss_count; + + ss_count = ena_get_sset_count(adapter->netdev, ETH_SS_STATS); + if (ss_count <= 0) { + netif_err(adapter, drv, adapter->netdev, + "SS count is negative\n"); + return; + } + + /* allocate 32 bytes for each string and 64bit for the value */ + debug_area_size = ss_count * ETH_GSTRING_LEN + sizeof(u64) * ss_count; + + rc = ena_com_allocate_debug_area(adapter->ena_dev, debug_area_size); + if (rc) { + netif_err(adapter, drv, adapter->netdev, + "Cannot allocate debug area\n"); + return; + } + + rc = ena_com_set_host_attributes(adapter->ena_dev); + if (rc) { + if (rc == -EOPNOTSUPP) + netif_warn(adapter, drv, adapter->netdev, + "Cannot set host attributes\n"); + else + netif_err(adapter, drv, adapter->netdev, + "Cannot set host attributes\n"); + goto err; + } + + return; +err: + ena_com_delete_debug_area(adapter->ena_dev); +} + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)) +#ifdef NDO_GET_STATS_64_V2 +static void ena_get_stats64(struct net_device *netdev, + struct rtnl_link_stats64 *stats) +#else +static struct rtnl_link_stats64 *ena_get_stats64(struct net_device *netdev, + struct rtnl_link_stats64 *stats) +#endif +{ + struct ena_adapter *adapter = netdev_priv(netdev); + struct ena_ring *rx_ring, *tx_ring; + u64 xdp_rx_drops = 0; + unsigned int start; + u64 rx_drops; + u64 tx_drops; + int i; + + if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) +#ifdef NDO_GET_STATS_64_V2 + return; +#else + return NULL; +#endif + + for (i = 0; i < adapter->num_io_queues + adapter->xdp_num_queues; i++) { + u64 bytes, packets; + + tx_ring = 
&adapter->tx_ring[i]; + + do { + start = u64_stats_fetch_begin_irq(&tx_ring->syncp); + packets = tx_ring->tx_stats.cnt; + bytes = tx_ring->tx_stats.bytes; + } while (u64_stats_fetch_retry_irq(&tx_ring->syncp, start)); + + stats->tx_packets += packets; + stats->tx_bytes += bytes; + + /* In XDP there isn't an RX queue counterpart */ + if (ENA_IS_XDP_INDEX(adapter, i)) + continue; + + rx_ring = &adapter->rx_ring[i]; + + do { + start = u64_stats_fetch_begin_irq(&rx_ring->syncp); + packets = rx_ring->rx_stats.cnt; + bytes = rx_ring->rx_stats.bytes; + xdp_rx_drops += ena_ring_xdp_drops_cnt(rx_ring); + } while (u64_stats_fetch_retry_irq(&rx_ring->syncp, start)); + + stats->rx_packets += packets; + stats->rx_bytes += bytes; + } + + do { + start = u64_stats_fetch_begin_irq(&adapter->syncp); + rx_drops = adapter->dev_stats.rx_drops; + tx_drops = adapter->dev_stats.tx_drops; + } while (u64_stats_fetch_retry_irq(&adapter->syncp, start)); + + stats->rx_dropped = rx_drops + xdp_rx_drops; + stats->tx_dropped = tx_drops; + + stats->multicast = 0; + stats->collisions = 0; + + stats->rx_length_errors = 0; + stats->rx_crc_errors = 0; + stats->rx_frame_errors = 0; + stats->rx_fifo_errors = 0; + stats->rx_missed_errors = 0; + stats->tx_window_errors = 0; + + stats->rx_errors = 0; + stats->tx_errors = 0; +#ifndef NDO_GET_STATS_64_V2 + return stats; +#endif +} +#else /* kernel > 2.6.36 */ +static struct net_device_stats *ena_get_stats(struct net_device *netdev) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + struct ena_ring *rx_ring, *tx_ring; + unsigned long rx_drops; + struct net_device_stats *stats = &netdev->stats; + unsigned int start; + int i; + + memset(stats, 0, sizeof(*stats)); + for (i = 0; i < adapter->num_io_queues; i++) { + unsigned long bytes, packets; + + tx_ring = &adapter->tx_ring[i]; + do { + start = u64_stats_fetch_begin_irq(&tx_ring->syncp); + packets = (unsigned long)tx_ring->tx_stats.cnt; + bytes = (unsigned long)tx_ring->tx_stats.bytes; + } while (u64_stats_fetch_retry_irq(&tx_ring->syncp, start)); + + stats->tx_packets += packets; + stats->tx_bytes += bytes; + + rx_ring = &adapter->rx_ring[i]; + + do { + start = u64_stats_fetch_begin_irq(&tx_ring->syncp); + packets = (unsigned long)rx_ring->rx_stats.cnt; + bytes = (unsigned long)rx_ring->rx_stats.bytes; + } while (u64_stats_fetch_retry_irq(&tx_ring->syncp, start)); + + stats->rx_packets += packets; + stats->rx_bytes += bytes; + } + + do { + start = u64_stats_fetch_begin_irq(&tx_ring->syncp); + rx_drops = (unsigned long)adapter->dev_stats.rx_drops; + } while (u64_stats_fetch_retry_irq(&tx_ring->syncp, start)); + + stats->rx_dropped = rx_drops; + + stats->multicast = 0; + stats->collisions = 0; + + stats->rx_length_errors = 0; + stats->rx_crc_errors = 0; + stats->rx_frame_errors = 0; + stats->rx_fifo_errors = 0; + stats->rx_missed_errors = 0; + stats->tx_window_errors = 0; + + stats->rx_errors = 0; + stats->tx_errors = 0; + + return stats; +} +#endif +#ifdef ENA_BUSY_POLL_SUPPORT + +#define ENA_BP_NAPI_BUDGET 8 +static int ena_busy_poll(struct napi_struct *napi) +{ + struct ena_napi *ena_napi = container_of(napi, struct ena_napi, napi); + struct ena_ring *rx_ring = ena_napi->rx_ring; + struct ena_adapter *adapter= rx_ring->adapter; + int done; + + if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) + return LL_FLUSH_FAILED; + + if (!ena_bp_lock_poll(rx_ring)) + return LL_FLUSH_BUSY; + + done = ena_clean_rx_irq(rx_ring, napi, ENA_BP_NAPI_BUDGET); + if (likely(done)) + rx_ring->rx_stats.bp_cleaned += done; + else + 
rx_ring->rx_stats.bp_missed++; + + ena_bp_unlock_poll(rx_ring); + + return done; +} +#endif + +static const struct net_device_ops ena_netdev_ops = { + .ndo_open = ena_open, + .ndo_stop = ena_close, + .ndo_start_xmit = ena_start_xmit, + .ndo_select_queue = ena_select_queue, +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)) + .ndo_get_stats64 = ena_get_stats64, +#else + .ndo_get_stats = ena_get_stats, +#endif +#ifdef HAVE_NDO_TX_TIMEOUT_STUCK_QUEUE_PARAMETER + .ndo_tx_timeout = ena_tx_timeout, +#else + .ndo_tx_timeout = ena_find_and_timeout_queue, +#endif + .ndo_change_mtu = ena_change_mtu, + .ndo_set_mac_address = NULL, +#ifdef HAVE_SET_RX_MODE + .ndo_set_rx_mode = ena_set_rx_mode, +#endif + .ndo_validate_addr = eth_validate_addr, +#ifdef ENA_BUSY_POLL_SUPPORT + .ndo_busy_poll = ena_busy_poll, +#endif +#ifdef ENA_XDP_SUPPORT + .ndo_bpf = ena_xdp, + .ndo_xdp_xmit = ena_xdp_xmit, +#if defined(ENA_TEST_AF_XDP) && defined(ENA_AF_XDP_SUPPORT) + .ndo_xsk_wakeup = ena_xdp_xsk_wakeup, +#endif /* defined(ENA_TEST_AF_XDP) && defined(ENA_AF_XDP_SUPPORT) */ +#endif /* ENA_XDP_SUPPORT */ +}; + +static int ena_device_validate_params(struct ena_adapter *adapter, + struct ena_com_dev_get_features_ctx *get_feat_ctx) +{ + struct net_device *netdev = adapter->netdev; + int rc; + + rc = ether_addr_equal(get_feat_ctx->dev_attr.mac_addr, + adapter->mac_addr); + if (!rc) { + netif_err(adapter, drv, netdev, + "Error, mac address are different\n"); + return -EINVAL; + } + + if (get_feat_ctx->dev_attr.max_mtu < netdev->mtu) { + netif_err(adapter, drv, netdev, + "Error, device max mtu is smaller than netdev MTU\n"); + return -EINVAL; + } + + return 0; +} + +static void set_default_llq_configurations(struct ena_adapter *adapter, + struct ena_llq_configurations *llq_config, + struct ena_admin_feature_llq_desc *llq) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + + llq_config->llq_header_location = ENA_ADMIN_INLINE_HEADER; + llq_config->llq_stride_ctrl = ENA_ADMIN_MULTIPLE_DESCS_PER_ENTRY; + llq_config->llq_num_decs_before_header = ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_2; + + adapter->large_llq_header_supported = + !!(ena_dev->supported_features & (1 << ENA_ADMIN_LLQ)); + adapter->large_llq_header_supported &= + !!(llq->entry_size_ctrl_supported & + ENA_ADMIN_LIST_ENTRY_SIZE_256B); + + if ((llq->entry_size_ctrl_supported & ENA_ADMIN_LIST_ENTRY_SIZE_256B) && + adapter->large_llq_header_enabled) { + llq_config->llq_ring_entry_size = ENA_ADMIN_LIST_ENTRY_SIZE_256B; + llq_config->llq_ring_entry_size_value = 256; + } else { + llq_config->llq_ring_entry_size = ENA_ADMIN_LIST_ENTRY_SIZE_128B; + llq_config->llq_ring_entry_size_value = 128; + } +} + +static int ena_set_queues_placement_policy(struct pci_dev *pdev, + struct ena_com_dev *ena_dev, + struct ena_admin_feature_llq_desc *llq, + struct ena_llq_configurations *llq_default_configurations) +{ + int rc; + u32 llq_feature_mask; + + llq_feature_mask = 1 << ENA_ADMIN_LLQ; + if (!(ena_dev->supported_features & llq_feature_mask)) { + dev_warn(&pdev->dev, + "LLQ is not supported Fallback to host mode policy.\n"); + ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; + return 0; + } + + if (!ena_dev->mem_bar) { + netdev_err(ena_dev->net_device, + "LLQ is advertised as supported but device doesn't expose mem bar\n"); + ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; + return 0; + } + + rc = ena_com_config_dev_mode(ena_dev, llq, llq_default_configurations); + if (unlikely(rc)) { + dev_err(&pdev->dev, + "Failed to configure the device mode. 
Fallback to host mode policy.\n"); + ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; + } + + return 0; +} + +static int ena_map_llq_mem_bar(struct pci_dev *pdev, struct ena_com_dev *ena_dev, + int bars) +{ + bool has_mem_bar = !!(bars & BIT(ENA_MEM_BAR)); + + if (!has_mem_bar) + return 0; + + ena_dev->mem_bar = devm_ioremap_wc(&pdev->dev, + pci_resource_start(pdev, ENA_MEM_BAR), + pci_resource_len(pdev, ENA_MEM_BAR)); + + if (!ena_dev->mem_bar) + return -EFAULT; + + return 0; +} + +static int ena_device_init(struct ena_adapter *adapter, struct pci_dev *pdev, + struct ena_com_dev_get_features_ctx *get_feat_ctx, + bool *wd_state) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + struct net_device *netdev = adapter->netdev; + struct ena_llq_configurations llq_config; + netdev_features_t prev_netdev_features; + struct device *dev = &pdev->dev; + bool readless_supported; + u32 aenq_groups; + int dma_width; + int rc; + + rc = ena_com_mmio_reg_read_request_init(ena_dev); + if (rc) { + dev_err(dev, "Failed to init mmio read less\n"); + return rc; + } + + /* The PCIe configuration space revision id indicate if mmio reg + * read is disabled + */ + readless_supported = !(pdev->revision & ENA_MMIO_DISABLE_REG_READ); + ena_com_set_mmio_read_mode(ena_dev, readless_supported); + + rc = ena_com_dev_reset(ena_dev, ENA_REGS_RESET_NORMAL); + if (rc) { + dev_err(dev, "Can not reset device\n"); + goto err_mmio_read_less; + } + + rc = ena_com_validate_version(ena_dev); + if (rc) { + dev_err(dev, "Device version is too low\n"); + goto err_mmio_read_less; + } + + dma_width = ena_com_get_dma_width(ena_dev); + if (dma_width < 0) { + dev_err(dev, "Invalid dma width value %d", dma_width); + rc = dma_width; + goto err_mmio_read_less; + } + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0) + rc = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(dma_width)); + if (rc) { + dev_err(dev, "dma_set_mask_and_coherent failed %d\n", rc); + goto err_mmio_read_less; + } +#else + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(dma_width)); + if (rc) { + dev_err(dev, "pci_set_dma_mask failed %d\n", rc); + goto err_mmio_read_less; + } + + rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(dma_width)); + if (rc) { + dev_err(dev, "err_pci_set_consistent_dma_mask failed %d\n", + rc); + goto err_mmio_read_less; + } +#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0) */ + + ena_devlink_params_get(adapter->devlink); + + /* ENA admin level init */ + rc = ena_com_admin_init(ena_dev, &aenq_handlers); + if (rc) { + dev_err(dev, + "Can not initialize ena admin queue with device\n"); + goto err_mmio_read_less; + } + + /* To enable the msix interrupts the driver needs to know the number + * of queues. 
So the driver uses polling mode to retrieve this + * information + */ + ena_com_set_admin_polling_mode(ena_dev, true); + + ena_config_host_info(ena_dev, pdev); + + /* Get Device Attributes*/ + rc = ena_com_get_dev_attr_feat(ena_dev, get_feat_ctx); + if (rc) { + dev_err(dev, "Cannot get attribute for ena device rc=%d\n", rc); + goto err_admin_init; + } + + /* Try to turn all the available aenq groups */ + aenq_groups = BIT(ENA_ADMIN_LINK_CHANGE) | + BIT(ENA_ADMIN_FATAL_ERROR) | + BIT(ENA_ADMIN_WARNING) | + BIT(ENA_ADMIN_NOTIFICATION) | + BIT(ENA_ADMIN_KEEP_ALIVE); + + aenq_groups &= get_feat_ctx->aenq.supported_groups; + + rc = ena_com_set_aenq_config(ena_dev, aenq_groups); + if (rc) { + dev_err(dev, "Cannot configure aenq groups rc= %d\n", rc); + goto err_admin_init; + } + + *wd_state = !!(aenq_groups & BIT(ENA_ADMIN_KEEP_ALIVE)); + + set_default_llq_configurations(adapter, &llq_config, &get_feat_ctx->llq); + + rc = ena_set_queues_placement_policy(pdev, ena_dev, &get_feat_ctx->llq, + &llq_config); + if (rc) { + netdev_err(netdev, "Cannot set queues placement policy rc= %d\n", rc); + goto err_admin_init; + } + + rc = ena_calc_io_queue_size(adapter, get_feat_ctx); + if (unlikely(rc)) + goto err_admin_init; + + /* Turned on features shouldn't change due to reset. */ + prev_netdev_features = adapter->netdev->features; + ena_set_dev_offloads(get_feat_ctx, adapter->netdev); + adapter->netdev->features = prev_netdev_features; + + rc = ena_phc_init(adapter); + if (unlikely(rc && (rc != -EOPNOTSUPP))) { + netdev_err(netdev, "Failed initiating PHC, error: %d\n", rc); + goto err_admin_init; + } + + return 0; + +err_admin_init: + ena_com_abort_admin_commands(ena_dev); + ena_com_wait_for_abort_completion(ena_dev); + ena_com_delete_host_info(ena_dev); + ena_com_admin_destroy(ena_dev); +err_mmio_read_less: + ena_com_mmio_reg_read_request_destroy(ena_dev); + + return rc; +} + +static int ena_enable_msix_and_set_admin_interrupts(struct ena_adapter *adapter) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + struct device *dev = &adapter->pdev->dev; + int rc; + + rc = ena_enable_msix(adapter); + if (rc) { + dev_err(dev, "Can not reserve msix vectors\n"); + return rc; + } + + ena_setup_mgmnt_intr(adapter); + + rc = ena_request_mgmnt_irq(adapter); + if (rc) { + dev_err(dev, "Can not setup management interrupts\n"); + goto err_disable_msix; + } + + ena_com_set_admin_polling_mode(ena_dev, false); + + ena_com_admin_aenq_enable(ena_dev); + + return 0; + +err_disable_msix: + ena_disable_msix(adapter); + + return rc; +} + +int ena_destroy_device(struct ena_adapter *adapter, bool graceful) +{ + struct net_device *netdev = adapter->netdev; + struct ena_com_dev *ena_dev = adapter->ena_dev; + bool dev_up; + int rc = 0; + + if (!test_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags)) + return 0; + + netif_carrier_off(netdev); + + del_timer_sync(&adapter->timer_service); + + dev_up = test_bit(ENA_FLAG_DEV_UP, &adapter->flags); + adapter->dev_up_before_reset = dev_up; + ena_sysfs_terminate(&adapter->pdev->dev); + if (!graceful) + ena_com_set_admin_running_state(ena_dev, false); + + if (dev_up) + ena_down(adapter); + + /* Stop the device from sending AENQ events (in case reset flag is set + * and device is up, ena_down() already reset the device. 
+ */ + if (!(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags) && dev_up)) + rc = ena_com_dev_reset(adapter->ena_dev, adapter->reset_reason); + + ena_free_mgmnt_irq(adapter); + + ena_disable_msix(adapter); + + ena_com_abort_admin_commands(ena_dev); + + ena_com_wait_for_abort_completion(ena_dev); + + ena_com_admin_destroy(ena_dev); + + ena_phc_destroy(adapter); + + ena_com_mmio_reg_read_request_destroy(ena_dev); + + /* return reset reason to default value */ + adapter->reset_reason = ENA_REGS_RESET_NORMAL; + + clear_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); + clear_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags); + + return rc; +} + +int ena_restore_device(struct ena_adapter *adapter) +{ + struct ena_com_dev_get_features_ctx get_feat_ctx; + struct ena_com_dev *ena_dev = adapter->ena_dev; + struct pci_dev *pdev = adapter->pdev; + bool wd_state; + int rc; + + set_bit(ENA_FLAG_ONGOING_RESET, &adapter->flags); + rc = ena_device_init(adapter, adapter->pdev, &get_feat_ctx, &wd_state); + if (rc) { + dev_err(&pdev->dev, "Can not initialize device\n"); + goto err; + } + adapter->wd_state = wd_state; + + rc = ena_device_validate_params(adapter, &get_feat_ctx); + if (rc) { + dev_err(&pdev->dev, "Validation of device parameters failed\n"); + goto err_device_destroy; + } + + rc = ena_enable_msix_and_set_admin_interrupts(adapter); + if (rc) { + dev_err(&pdev->dev, "Enable MSI-X failed\n"); + goto err_device_destroy; + } + rc = ena_sysfs_init(&pdev->dev); + if (rc) { + dev_err(&pdev->dev, "Cannot initialize sysfs\n"); + goto err_disable_msix; + } + /* If the interface was up before the reset bring it up */ + if (adapter->dev_up_before_reset) { + rc = ena_up(adapter); + if (rc) { + dev_err(&pdev->dev, "Failed to create I/O queues\n"); + goto err_sysfs_terminate; + } + } + + set_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags); + + clear_bit(ENA_FLAG_ONGOING_RESET, &adapter->flags); + if (test_bit(ENA_FLAG_LINK_UP, &adapter->flags)) + netif_carrier_on(adapter->netdev); + + mod_timer(&adapter->timer_service, round_jiffies(jiffies + HZ)); + adapter->last_keep_alive_jiffies = jiffies; + + return rc; +err_sysfs_terminate: + ena_sysfs_terminate(&pdev->dev); +err_disable_msix: + ena_free_mgmnt_irq(adapter); + ena_disable_msix(adapter); +err_device_destroy: + ena_com_abort_admin_commands(ena_dev); + ena_com_wait_for_abort_completion(ena_dev); + ena_com_admin_destroy(ena_dev); + ena_com_dev_reset(ena_dev, ENA_REGS_RESET_DRIVER_INVALID_STATE); + ena_phc_destroy(adapter); + ena_com_mmio_reg_read_request_destroy(ena_dev); +err: + clear_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags); + clear_bit(ENA_FLAG_ONGOING_RESET, &adapter->flags); + dev_err(&pdev->dev, + "Reset attempt failed. 
Can not reset the device\n"); + + return rc; +} + +static void ena_fw_reset_device(struct work_struct *work) +{ + int rc = 0; + + struct ena_adapter *adapter = + container_of(work, struct ena_adapter, reset_task); + + rtnl_lock(); + + if (likely(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))) { + rc |= ena_destroy_device(adapter, false); + rc |= ena_restore_device(adapter); + adapter->dev_stats.reset_fail += !!rc; + + dev_err(&adapter->pdev->dev, + "Device reset completed successfully, Driver info: %s\n", + version); + } + + rtnl_unlock(); +} + +static int check_for_rx_interrupt_queue(struct ena_adapter *adapter, + struct ena_ring *rx_ring) +{ + struct ena_napi *ena_napi = container_of(rx_ring->napi, struct ena_napi, napi); + + if (likely(READ_ONCE(ena_napi->first_interrupt))) + return 0; + + if (ena_com_cq_empty(rx_ring->ena_com_io_cq)) + return 0; + + rx_ring->no_interrupt_event_cnt++; + + if (rx_ring->no_interrupt_event_cnt == ENA_MAX_NO_INTERRUPT_ITERATIONS) { + netif_err(adapter, rx_err, adapter->netdev, + "Potential MSIX issue on Rx side Queue = %d. Reset the device\n", + rx_ring->qid); + + ena_reset_device(adapter, ENA_REGS_RESET_MISS_INTERRUPT); + return -EIO; + } + + return 0; +} + +static int check_missing_comp_in_tx_queue(struct ena_adapter *adapter, + struct ena_ring *tx_ring) +{ + struct ena_napi *ena_napi = container_of(tx_ring->napi, struct ena_napi, napi); + enum ena_regs_reset_reason_types reset_reason = ENA_REGS_RESET_MISS_TX_CMPL; + unsigned int time_since_last_napi; + unsigned int missing_tx_comp_to; + bool is_tx_comp_time_expired; + struct ena_tx_buffer *tx_buf; + unsigned long last_jiffies; + int napi_scheduled; + u32 missed_tx = 0; + int i, rc = 0; + + missing_tx_comp_to = jiffies_to_msecs(adapter->missing_tx_completion_to); + + for (i = 0; i < tx_ring->ring_size; i++) { + tx_buf = &tx_ring->tx_buffer_info[i]; + last_jiffies = tx_buf->last_jiffies; + + if (last_jiffies == 0) + /* no pending Tx at this location */ + continue; + + is_tx_comp_time_expired = time_is_before_jiffies(last_jiffies + + 2 * adapter->missing_tx_completion_to); + + if (unlikely(!READ_ONCE(ena_napi->first_interrupt) && is_tx_comp_time_expired)) { + /* If after graceful period interrupt is still not + * received, we schedule a reset + */ + netif_err(adapter, tx_err, adapter->netdev, + "Potential MSIX issue on Tx side Queue = %d. Reset the device\n", + tx_ring->qid); + ena_reset_device(adapter, ENA_REGS_RESET_MISS_INTERRUPT); + return -EIO; + } + + is_tx_comp_time_expired = time_is_before_jiffies(last_jiffies + + adapter->missing_tx_completion_to); + + if (unlikely(is_tx_comp_time_expired)) { + + time_since_last_napi = jiffies_to_usecs(jiffies - tx_ring->tx_stats.last_napi_jiffies); + napi_scheduled = !!(ena_napi->napi.state & NAPIF_STATE_SCHED); + + if (missing_tx_comp_to < time_since_last_napi && napi_scheduled) { + /* We suspect napi isn't called because the + * bottom half is not run. Require a bigger + * timeout for these cases + */ + if (!time_is_before_jiffies(last_jiffies + + 2 * adapter->missing_tx_completion_to)) + continue; + + reset_reason = ENA_REGS_RESET_SUSPECTED_POLL_STARVATION; + } + + missed_tx++; + + if (tx_buf->print_once) + continue; + + netif_notice(adapter, tx_err, adapter->netdev, + "TX hasn't completed, qid %d, index %d. 
%u usecs from last napi execution, napi scheduled: %d\n",
+ tx_ring->qid, i, time_since_last_napi, napi_scheduled);
+
+ tx_buf->print_once = 1;
+ }
+ }
+
+ if (unlikely(missed_tx > adapter->missing_tx_completion_threshold)) {
+ netif_err(adapter, tx_err, adapter->netdev,
+ "Lost TX completions are above the threshold (%d > %d). Completion transmission timeout: %u.\n",
+ missed_tx,
+ adapter->missing_tx_completion_threshold,
+ missing_tx_comp_to);
+ netif_err(adapter, tx_err, adapter->netdev,
+ "Resetting the device\n");
+
+ ena_reset_device(adapter, reset_reason);
+ rc = -EIO;
+ }
+
+ ena_increase_stat(&tx_ring->tx_stats.missed_tx, missed_tx,
+ &tx_ring->syncp);
+
+ return rc;
+}
+
+static void check_for_missing_completions(struct ena_adapter *adapter)
+{
+ struct ena_ring *tx_ring;
+ struct ena_ring *rx_ring;
+ int i, budget, rc;
+ int io_queue_count;
+
+ io_queue_count = adapter->xdp_num_queues + adapter->num_io_queues;
+ /* Make sure the driver doesn't turn the device off in another process */
+ smp_rmb();
+
+ if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags))
+ return;
+
+ if (test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))
+ return;
+
+ if (adapter->missing_tx_completion_to == ENA_HW_HINTS_NO_TIMEOUT)
+ return;
+
+ budget = ENA_MONITORED_TX_QUEUES;
+
+ for (i = adapter->last_monitored_tx_qid; i < io_queue_count; i++) {
+ tx_ring = &adapter->tx_ring[i];
+ rx_ring = &adapter->rx_ring[i];
+
+ rc = check_missing_comp_in_tx_queue(adapter, tx_ring);
+ if (unlikely(rc))
+ return;
+
+ rc = !ENA_IS_XDP_INDEX(adapter, i) ?
+ check_for_rx_interrupt_queue(adapter, rx_ring) : 0;
+ if (unlikely(rc))
+ return;
+
+ budget--;
+ if (!budget)
+ break;
+ }
+
+ adapter->last_monitored_tx_qid = i % io_queue_count;
+}
+
+/* trigger napi schedule after 2 consecutive detections */
+#define EMPTY_RX_REFILL 2
+/* For the rare case where the device runs out of Rx descriptors and the
+ * napi handler failed to refill new Rx descriptors (due to a lack of memory
+ * for example).
+ * This case will lead to a deadlock:
+ * The device won't send interrupts since all the new Rx packets will be
+ * dropped.
+ * The napi handler won't allocate new Rx descriptors, so the device won't
+ * be able to send new packets.
+ *
+ * This scenario can happen when the kernel's vm.min_free_kbytes is too small.
+ * It is recommended to have at least 512MB, with a minimum of 128MB for a
+ * constrained environment.
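+ * For example, on a memory-constrained host something like
+ * sysctl -w vm.min_free_kbytes=131072
+ * (128MB; the exact value is workload-dependent and purely
+ * illustrative here) raises that reserve.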
+ * + * When such a situation is detected - Reschedule napi + */ +static void check_for_empty_rx_ring(struct ena_adapter *adapter) +{ + struct ena_ring *rx_ring; + int i, refill_required; + + if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) + return; + + if (test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags)) + return; + + for (i = 0; i < adapter->num_io_queues; i++) { + rx_ring = &adapter->rx_ring[i]; + + /* If using UMEM, app might not provide RX buffers and the ring + * can be empty + */ + if (ENA_IS_XSK_RING(rx_ring)) + continue; + + refill_required = ena_com_free_q_entries(rx_ring->ena_com_io_sq); + if (unlikely(refill_required == (rx_ring->ring_size - 1))) { + rx_ring->empty_rx_queue++; + + if (rx_ring->empty_rx_queue >= EMPTY_RX_REFILL) { + ena_increase_stat(&rx_ring->rx_stats.empty_rx_ring, 1, + &rx_ring->syncp); + + netif_err(adapter, drv, adapter->netdev, + "Trigger refill for ring %d\n", i); + + napi_schedule(rx_ring->napi); + rx_ring->empty_rx_queue = 0; + } + } else { + rx_ring->empty_rx_queue = 0; + } + } +} + +/* Check for keep alive expiration */ +static void check_for_missing_keep_alive(struct ena_adapter *adapter) +{ + unsigned long keep_alive_expired; + + if (!adapter->wd_state) + return; + + if (adapter->keep_alive_timeout == ENA_HW_HINTS_NO_TIMEOUT) + return; + + keep_alive_expired = adapter->last_keep_alive_jiffies + + adapter->keep_alive_timeout; + if (unlikely(time_is_before_jiffies(keep_alive_expired))) { + netif_err(adapter, drv, adapter->netdev, + "Keep alive watchdog timeout.\n"); + ena_increase_stat(&adapter->dev_stats.wd_expired, 1, + &adapter->syncp); + ena_reset_device(adapter, ENA_REGS_RESET_KEEP_ALIVE_TO); + } +} + +static void check_for_admin_com_state(struct ena_adapter *adapter) +{ + if (unlikely(!ena_com_get_admin_running_state(adapter->ena_dev))) { + netif_err(adapter, drv, adapter->netdev, + "ENA admin queue is not in running state!\n"); + ena_increase_stat(&adapter->dev_stats.admin_q_pause, 1, + &adapter->syncp); + ena_reset_device(adapter, ENA_REGS_RESET_ADMIN_TO); + } +} + +static void ena_update_hints(struct ena_adapter *adapter, + struct ena_admin_ena_hw_hints *hints) +{ + struct net_device *netdev = adapter->netdev; + + if (hints->admin_completion_tx_timeout) + adapter->ena_dev->admin_queue.completion_timeout = + hints->admin_completion_tx_timeout * 1000; + + if (hints->mmio_read_timeout) + /* convert to usec */ + adapter->ena_dev->mmio_read.reg_read_to = + hints->mmio_read_timeout * 1000; + + if (hints->missed_tx_completion_count_threshold_to_reset) + adapter->missing_tx_completion_threshold = + hints->missed_tx_completion_count_threshold_to_reset; + + if (hints->missing_tx_completion_timeout) { + if (hints->missing_tx_completion_timeout == ENA_HW_HINTS_NO_TIMEOUT) + adapter->missing_tx_completion_to = ENA_HW_HINTS_NO_TIMEOUT; + else + adapter->missing_tx_completion_to = + msecs_to_jiffies(hints->missing_tx_completion_timeout); + } + + if (hints->netdev_wd_timeout) + netdev->watchdog_timeo = msecs_to_jiffies(hints->netdev_wd_timeout); + + if (hints->driver_watchdog_timeout) { + if (hints->driver_watchdog_timeout == ENA_HW_HINTS_NO_TIMEOUT) + adapter->keep_alive_timeout = ENA_HW_HINTS_NO_TIMEOUT; + else + adapter->keep_alive_timeout = + msecs_to_jiffies(hints->driver_watchdog_timeout); + } +} + +static void ena_update_host_info(struct ena_admin_host_info *host_info, + struct net_device *netdev) +{ + host_info->supported_network_features[0] = + netdev->features & GENMASK_ULL(31, 0); + host_info->supported_network_features[1] = + (netdev->features 
& GENMASK_ULL(63, 32)) >> 32; +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) +static void ena_timer_service(struct timer_list *t) +{ + struct ena_adapter *adapter = from_timer(adapter, t, timer_service); +#else +static void ena_timer_service(unsigned long data) +{ + struct ena_adapter *adapter = (struct ena_adapter *)data; +#endif + u8 *debug_area = adapter->ena_dev->host_attr.debug_area_virt_addr; + struct ena_admin_host_info *host_info = + adapter->ena_dev->host_attr.host_info; + + check_for_missing_keep_alive(adapter); + + check_for_admin_com_state(adapter); + + check_for_missing_completions(adapter); + + check_for_empty_rx_ring(adapter); + + if (debug_area) + ena_dump_stats_to_buf(adapter, debug_area); + + if (host_info) + ena_update_host_info(host_info, adapter->netdev); + + if (unlikely(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))) { + /* We don't destroy driver resources if we're not able to + * communicate with the device. Failure in validating the + * version implies unresponsive device. + */ + if (ena_com_validate_version(adapter->ena_dev) == -ETIME) { + netif_err(adapter, drv, adapter->netdev, + "FW isn't responsive, skipping reset routine\n"); + mod_timer(&adapter->timer_service, round_jiffies(jiffies + HZ)); + return; + } + + netif_err(adapter, drv, adapter->netdev, + "Trigger reset is on\n"); + + if (adapter->reset_reason != ENA_REGS_RESET_NORMAL) + ena_dump_stats_to_dmesg(adapter); + + queue_work(ena_wq, &adapter->reset_task); + return; + } + + /* Reset the timer */ + mod_timer(&adapter->timer_service, round_jiffies(jiffies + HZ)); +} + +static u32 ena_calc_max_io_queue_num(struct pci_dev *pdev, + struct ena_com_dev *ena_dev, + struct ena_com_dev_get_features_ctx *get_feat_ctx) +{ + u32 io_tx_sq_num, io_tx_cq_num, io_rx_num, max_num_io_queues; + + if (ena_dev->supported_features & BIT(ENA_ADMIN_MAX_QUEUES_EXT)) { + struct ena_admin_queue_ext_feature_fields *max_queue_ext = + &get_feat_ctx->max_queue_ext.max_queue_ext; + io_rx_num = min_t(u32, max_queue_ext->max_rx_sq_num, + max_queue_ext->max_rx_cq_num); + + io_tx_sq_num = max_queue_ext->max_tx_sq_num; + io_tx_cq_num = max_queue_ext->max_tx_cq_num; + } else { + struct ena_admin_queue_feature_desc *max_queues = + &get_feat_ctx->max_queues; + io_tx_sq_num = max_queues->max_sq_num; + io_tx_cq_num = max_queues->max_cq_num; + io_rx_num = min_t(u32, io_tx_sq_num, io_tx_cq_num); + } + + /* In case of LLQ use the llq fields for the tx SQ/CQ */ + if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) + io_tx_sq_num = get_feat_ctx->llq.max_llq_num; + + max_num_io_queues = min_t(u32, num_online_cpus(), ENA_MAX_NUM_IO_QUEUES); + max_num_io_queues = min_t(u32, max_num_io_queues, io_rx_num); + max_num_io_queues = min_t(u32, max_num_io_queues, io_tx_sq_num); + max_num_io_queues = min_t(u32, max_num_io_queues, io_tx_cq_num); + /* 1 IRQ for mgmnt and 1 IRQs for each IO direction */ + max_num_io_queues = min_t(u32, max_num_io_queues, pci_msix_vec_count(pdev) - 1); + + return max_num_io_queues; +} + +static void ena_set_dev_offloads(struct ena_com_dev_get_features_ctx *feat, + struct net_device *netdev) +{ + netdev_features_t dev_features = 0; + + /* Set offload features */ + if (feat->offload.tx & + ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV4_CSUM_PART_MASK) + dev_features |= NETIF_F_IP_CSUM; + + if (feat->offload.tx & + ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV6_CSUM_PART_MASK) + dev_features |= NETIF_F_IPV6_CSUM; + + if (feat->offload.tx & ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_IPV4_MASK) + dev_features |= NETIF_F_TSO; + 
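+ /* The remaining capability bits follow the same pattern: each
+ * feature the device advertises is translated into its NETIF_F_*
+ * counterpart, and anything not advertised simply stays cleared
+ * in dev_features.
+ */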
+ if (feat->offload.tx & ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_IPV6_MASK)
+ dev_features |= NETIF_F_TSO6;
+
+ if (feat->offload.tx & ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_ECN_MASK)
+ dev_features |= NETIF_F_TSO_ECN;
+
+ if (feat->offload.rx_supported &
+ ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L4_IPV4_CSUM_MASK)
+ dev_features |= NETIF_F_RXCSUM;
+
+ if (feat->offload.rx_supported &
+ ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L4_IPV6_CSUM_MASK)
+ dev_features |= NETIF_F_RXCSUM;
+
+ netdev->features =
+ dev_features |
+ NETIF_F_SG |
+#ifdef NETIF_F_RXHASH
+ NETIF_F_RXHASH |
+#endif /* NETIF_F_RXHASH */
+ NETIF_F_HIGHDMA;
+
+#ifdef HAVE_RHEL6_NET_DEVICE_OPS_EXT
+ do {
+ u32 hw_features = get_netdev_hw_features(netdev);
+ hw_features |= netdev->features;
+ set_netdev_hw_features(netdev, hw_features);
+ } while (0);
+#else
+ netdev->hw_features |= netdev->features;
+#endif
+ netdev->vlan_features |= netdev->features;
+}
+
+static void ena_set_conf_feat_params(struct ena_adapter *adapter,
+ struct ena_com_dev_get_features_ctx *feat)
+{
+ struct net_device *netdev = adapter->netdev;
+
+ /* Copy mac address */
+ if (!is_valid_ether_addr(feat->dev_attr.mac_addr)) {
+ eth_hw_addr_random(netdev);
+ ether_addr_copy(adapter->mac_addr, netdev->dev_addr);
+ } else {
+ ether_addr_copy(adapter->mac_addr, feat->dev_attr.mac_addr);
+ eth_hw_addr_set(netdev, adapter->mac_addr);
+ }
+
+ /* Set offload features */
+ ena_set_dev_offloads(feat, netdev);
+
+ adapter->max_mtu = feat->dev_attr.max_mtu;
+#ifdef HAVE_MTU_MIN_MAX_IN_NET_DEVICE
+ netdev->max_mtu = adapter->max_mtu;
+ netdev->min_mtu = ENA_MIN_MTU;
+#endif
+}
+
+static int ena_rss_init_default(struct ena_adapter *adapter)
+{
+ struct ena_com_dev *ena_dev = adapter->ena_dev;
+ struct device *dev = &adapter->pdev->dev;
+ int rc, i;
+ u32 val;
+
+ rc = ena_com_rss_init(ena_dev, ENA_RX_RSS_TABLE_LOG_SIZE);
+ if (unlikely(rc)) {
+ dev_err(dev, "Cannot init indirect table\n");
+ goto err_rss_init;
+ }
+
+ for (i = 0; i < ENA_RX_RSS_TABLE_SIZE; i++) {
+ val = ethtool_rxfh_indir_default(i, adapter->num_io_queues);
+ rc = ena_com_indirect_table_fill_entry(ena_dev, i,
+ ENA_IO_RXQ_IDX(val));
+ if (unlikely(rc)) {
+ dev_err(dev, "Cannot fill indirect table\n");
+ goto err_fill_indir;
+ }
+ }
+
+ rc = ena_com_fill_hash_function(ena_dev, ENA_ADMIN_TOEPLITZ, NULL,
+ ENA_HASH_KEY_SIZE, 0xFFFFFFFF);
+ if (unlikely(rc && (rc != -EOPNOTSUPP))) {
+ dev_err(dev, "Cannot fill hash function\n");
+ goto err_fill_indir;
+ }
+
+ rc = ena_com_set_default_hash_ctrl(ena_dev);
+ if (unlikely(rc && (rc != -EOPNOTSUPP))) {
+ dev_err(dev, "Cannot fill hash control\n");
+ goto err_fill_indir;
+ }
+
+ return 0;
+
+err_fill_indir:
+ ena_com_rss_destroy(ena_dev);
+err_rss_init:
+
+ return rc;
+}
+
+static void ena_release_bars(struct ena_com_dev *ena_dev, struct pci_dev *pdev)
+{
+ int release_bars = pci_select_bars(pdev, IORESOURCE_MEM) & ENA_BAR_MASK;
+
+ pci_release_selected_regions(pdev, release_bars);
+}
+
+
+static int ena_calc_io_queue_size(struct ena_adapter *adapter,
+ struct ena_com_dev_get_features_ctx *get_feat_ctx)
+{
+ struct ena_admin_feature_llq_desc *llq = &get_feat_ctx->llq;
+ struct ena_com_dev *ena_dev = adapter->ena_dev;
+ u32 tx_queue_size = ENA_DEFAULT_RING_SIZE;
+ u32 rx_queue_size = ENA_DEFAULT_RING_SIZE;
+ bool tx_configured, rx_configured;
+ u32 max_tx_queue_size;
+ u32 max_rx_queue_size;
+
+ /* If this function is called after driver load, the ring sizes have
+ * already been configured. Take that into account when recalculating
+ * ring size.
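+ * (For instance, a ring resized via ethtool -G before a device reset
+ * is expected to come back at that size rather than at the default.)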
+ */ + tx_configured = !!adapter->tx_ring[0].ring_size; + rx_configured = !!adapter->rx_ring[0].ring_size; + tx_queue_size = tx_configured ? adapter->tx_ring[0].ring_size : tx_queue_size; + rx_queue_size = rx_configured ? adapter->rx_ring[0].ring_size : rx_queue_size; + + if (ena_dev->supported_features & BIT(ENA_ADMIN_MAX_QUEUES_EXT)) { + struct ena_admin_queue_ext_feature_fields *max_queue_ext = + &get_feat_ctx->max_queue_ext.max_queue_ext; + max_rx_queue_size = min_t(u32, max_queue_ext->max_rx_cq_depth, + max_queue_ext->max_rx_sq_depth); + max_tx_queue_size = max_queue_ext->max_tx_cq_depth; + + if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) + max_tx_queue_size = min_t(u32, max_tx_queue_size, + llq->max_llq_depth); + else + max_tx_queue_size = min_t(u32, max_tx_queue_size, + max_queue_ext->max_tx_sq_depth); + + adapter->max_tx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, + max_queue_ext->max_per_packet_tx_descs); + adapter->max_rx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, + max_queue_ext->max_per_packet_rx_descs); + } else { + struct ena_admin_queue_feature_desc *max_queues = + &get_feat_ctx->max_queues; + max_rx_queue_size = min_t(u32, max_queues->max_cq_depth, + max_queues->max_sq_depth); + max_tx_queue_size = max_queues->max_cq_depth; + + if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) + max_tx_queue_size = min_t(u32, max_tx_queue_size, + llq->max_llq_depth); + else + max_tx_queue_size = min_t(u32, max_tx_queue_size, + max_queues->max_sq_depth); + + adapter->max_tx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, + max_queues->max_packet_tx_descs); + adapter->max_rx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, + max_queues->max_packet_rx_descs); + } + + max_tx_queue_size = rounddown_pow_of_two(max_tx_queue_size); + max_rx_queue_size = rounddown_pow_of_two(max_rx_queue_size); + + if (max_tx_queue_size < ENA_MIN_RING_SIZE) { + netdev_err(adapter->netdev, "Device max TX queue size: %d < minimum: %d\n", + max_tx_queue_size, ENA_MIN_RING_SIZE); + return -EFAULT; + } + + if (max_rx_queue_size < ENA_MIN_RING_SIZE) { + netdev_err(adapter->netdev, "Device max RX queue size: %d < minimum: %d\n", + max_rx_queue_size, ENA_MIN_RING_SIZE); + return -EFAULT; + } + + /* When forcing large headers, we multiply the entry size by 2, + * and therefore divide the queue size by 2, leaving the amount + * of memory used by the queues unchanged. 
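+ * As a purely illustrative example: 1024 entries * 128 B = 128 KiB,
+ * while with large headers forced, 512 entries * 256 B = 128 KiB.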
+ */ + if (adapter->large_llq_header_enabled) { + if ((llq->entry_size_ctrl_supported & ENA_ADMIN_LIST_ENTRY_SIZE_256B) && + (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV)) { + max_tx_queue_size /= 2; + dev_info(&adapter->pdev->dev, "Forcing large headers and decreasing maximum TX queue size to %d\n", + max_tx_queue_size); + } else { + dev_err(&adapter->pdev->dev, "Forcing large headers failed: LLQ is disabled or device does not support large headers\n"); + + adapter->large_llq_header_enabled = false; + ena_devlink_disable_large_llq_header_param(adapter->devlink); + } + } + + tx_queue_size = clamp_val(tx_queue_size, ENA_MIN_RING_SIZE, + max_tx_queue_size); + rx_queue_size = clamp_val(rx_queue_size, ENA_MIN_RING_SIZE, + max_rx_queue_size); + + tx_queue_size = rounddown_pow_of_two(tx_queue_size); + rx_queue_size = rounddown_pow_of_two(rx_queue_size); + + adapter->max_tx_ring_size = max_tx_queue_size; + adapter->max_rx_ring_size = max_rx_queue_size; + adapter->requested_tx_ring_size = tx_queue_size; + adapter->requested_rx_ring_size = rx_queue_size; + + return 0; +} + +/* ena_probe - Device Initialization Routine + * @pdev: PCI device information struct + * @ent: entry in ena_pci_tbl + * + * Returns 0 on success, negative on failure + * + * ena_probe initializes an adapter identified by a pci_dev structure. + * The OS initialization, configuring of the adapter private structure, + * and a hardware reset occur. + */ +static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + struct ena_com_dev_get_features_ctx get_feat_ctx; + struct ena_com_dev *ena_dev = NULL; + struct ena_adapter *adapter; + struct net_device *netdev; + static int adapters_found; + struct devlink *devlink; + u32 max_num_io_queues; + bool wd_state; + int bars, rc; + + dev_dbg(&pdev->dev, "%s\n", __func__); + + dev_info_once(&pdev->dev, "%s", version); + + rc = pci_enable_device_mem(pdev); + if (rc) { + dev_err(&pdev->dev, "pci_enable_device_mem() failed!\n"); + return rc; + } + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0) + rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(ENA_MAX_PHYS_ADDR_SIZE_BITS)); + if (rc) { + dev_err(&pdev->dev, "dma_set_mask_and_coherent failed %d\n", rc); + goto err_disable_device; + } +#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0) */ + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(ENA_MAX_PHYS_ADDR_SIZE_BITS)); + if (rc) { + dev_err(&pdev->dev, "pci_set_dma_mask failed %d\n", rc); + goto err_disable_device; + } + + rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(ENA_MAX_PHYS_ADDR_SIZE_BITS)); + if (rc) { + dev_err(&pdev->dev, "err_pci_set_consistent_dma_mask failed %d\n", + rc); + goto err_disable_device; + } +#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0) */ + + pci_set_master(pdev); + + ena_dev = vzalloc(sizeof(*ena_dev)); + if (!ena_dev) { + rc = -ENOMEM; + goto err_disable_device; + } + + bars = pci_select_bars(pdev, IORESOURCE_MEM) & ENA_BAR_MASK; + rc = pci_request_selected_regions(pdev, bars, DRV_MODULE_NAME); + if (rc) { + dev_err(&pdev->dev, "pci_request_selected_regions failed %d\n", + rc); + goto err_free_ena_dev; + } + + ena_dev->reg_bar = devm_ioremap(&pdev->dev, + pci_resource_start(pdev, ENA_REG_BAR), + pci_resource_len(pdev, ENA_REG_BAR)); + if (!ena_dev->reg_bar) { + dev_err(&pdev->dev, "Failed to remap regs bar\n"); + rc = -EFAULT; + goto err_free_region; + } + + ena_dev->ena_min_poll_delay_us = ENA_ADMIN_POLL_DELAY_US; + + ena_dev->dmadev = &pdev->dev; + + netdev = alloc_etherdev_mq(sizeof(struct ena_adapter), 
					    ENA_MAX_RINGS);
+	if (!netdev) {
+		dev_err(&pdev->dev, "alloc_etherdev_mq failed\n");
+		rc = -ENOMEM;
+		goto err_free_region;
+	}
+
+	SET_NETDEV_DEV(netdev, &pdev->dev);
+	adapter = netdev_priv(netdev);
+	adapter->ena_dev = ena_dev;
+	adapter->netdev = netdev;
+	adapter->pdev = pdev;
+	adapter->msg_enable = netif_msg_init(debug, DEFAULT_MSG_ENABLE);
+
+	ena_dev->net_device = netdev;
+
+	pci_set_drvdata(pdev, adapter);
+
+	adapter->large_llq_header_enabled = !!force_large_llq_header;
+
+	devlink = ena_devlink_alloc(adapter);
+	if (!devlink) {
+		netdev_err(netdev, "ena_devlink_alloc failed\n");
+		rc = -ENOMEM;
+		goto err_netdev_destroy;
+	}
+
+	rc = ena_map_llq_mem_bar(pdev, ena_dev, bars);
+	if (rc) {
+		dev_err(&pdev->dev, "ENA LLQ bar mapping failed\n");
+		goto err_devlink_destroy;
+	}
+
+	rc = ena_device_init(adapter, pdev, &get_feat_ctx, &wd_state);
+	if (rc) {
+		dev_err(&pdev->dev, "ENA device init failed\n");
+		if (rc == -ETIME)
+			rc = -EPROBE_DEFER;
+		goto err_devlink_destroy;
+	}
+
+	/* Initial TX and RX interrupt delay. Assumes 1 usec granularity.
+	 * Updated during device initialization with the real granularity
+	 */
+	ena_dev->intr_moder_tx_interval = ENA_INTR_INITIAL_TX_INTERVAL_USECS;
+	ena_dev->intr_moder_rx_interval = ENA_INTR_INITIAL_RX_INTERVAL_USECS;
+	ena_dev->intr_delay_resolution = ENA_DEFAULT_INTR_DELAY_RESOLUTION;
+	max_num_io_queues = ena_calc_max_io_queue_num(pdev, ena_dev, &get_feat_ctx);
+	if (unlikely(!max_num_io_queues)) {
+		rc = -EFAULT;
+		goto err_device_destroy;
+	}
+
+	ena_set_conf_feat_params(adapter, &get_feat_ctx);
+
+	adapter->reset_reason = ENA_REGS_RESET_NORMAL;
+
+	adapter->num_io_queues = clamp_val(num_io_queues, ENA_MIN_NUM_IO_QUEUES,
+					   max_num_io_queues);
+	adapter->used_lpc_size = lpc_size;
+	/* When LPC is enabled after driver load, the configured_lpc_size is
+	 * used. Leaving it as 0 wouldn't change the LPC state, so we set it
+	 * to a different value
+	 */
+	adapter->configured_lpc_size = lpc_size ?
: ENA_LPC_DEFAULT_MULTIPLIER; + adapter->max_num_io_queues = max_num_io_queues; + adapter->last_monitored_tx_qid = 0; + + adapter->xdp_first_ring = 0; + adapter->xdp_num_queues = 0; + + adapter->rx_copybreak = ENA_DEFAULT_RX_COPYBREAK; + if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) + adapter->disable_meta_caching = + !!(get_feat_ctx.llq.accel_mode.u.get.supported_flags & + BIT(ENA_ADMIN_DISABLE_META_CACHING)); + + adapter->wd_state = wd_state; + + snprintf(adapter->name, ENA_NAME_MAX_LEN, "ena_%d", adapters_found); + + rc = ena_com_init_interrupt_moderation(adapter->ena_dev); + if (rc) { + dev_err(&pdev->dev, + "Failed to query interrupt moderation feature\n"); + goto err_device_destroy; + } + + ena_init_io_rings(adapter, + 0, + adapter->xdp_num_queues + + adapter->num_io_queues); + + netdev->netdev_ops = &ena_netdev_ops; + netdev->watchdog_timeo = TX_TIMEOUT; + ena_set_ethtool_ops(netdev); + +#if defined(NETIF_F_MQ_TX_LOCK_OPT) + netdev->features &= ~NETIF_F_MQ_TX_LOCK_OPT; +#endif /* defined(NETIF_F_MQ_TX_LOCK_OPT) */ +#ifdef IFF_UNICAST_FLT + netdev->priv_flags |= IFF_UNICAST_FLT; +#endif /* IFF_UNICAST_FLT */ + + u64_stats_init(&adapter->syncp); + + rc = ena_enable_msix_and_set_admin_interrupts(adapter); + if (rc) { + dev_err(&pdev->dev, + "Failed to enable and set the admin interrupts\n"); + goto err_worker_destroy; + } + rc = ena_sysfs_init(&adapter->pdev->dev); + if (rc) { + dev_err(&pdev->dev, "Cannot init sysfs\n"); + goto err_free_msix; + } + rc = ena_rss_init_default(adapter); + if (rc && (rc != -EOPNOTSUPP)) { + dev_err(&pdev->dev, "Cannot init RSS rc: %d\n", rc); + goto err_terminate_sysfs; + } + + ena_config_debug_area(adapter); + + memcpy(adapter->netdev->perm_addr, adapter->mac_addr, netdev->addr_len); + + netif_carrier_off(netdev); + + rc = register_netdev(netdev); + if (rc) { + dev_err(&pdev->dev, "Cannot register net device\n"); + goto err_rss; + } + + INIT_WORK(&adapter->reset_task, ena_fw_reset_device); + + adapter->last_keep_alive_jiffies = jiffies; + adapter->keep_alive_timeout = ENA_DEVICE_KALIVE_TIMEOUT; + adapter->missing_tx_completion_to = TX_TIMEOUT; + adapter->missing_tx_completion_threshold = MAX_NUM_OF_TIMEOUTED_PACKETS; + + ena_update_hints(adapter, &get_feat_ctx.hw_hints); + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) + timer_setup(&adapter->timer_service, ena_timer_service, 0); +#else + setup_timer(&adapter->timer_service, ena_timer_service, + (unsigned long)adapter); +#endif + mod_timer(&adapter->timer_service, round_jiffies(jiffies + HZ)); + + dev_info(&pdev->dev, + "%s found at mem %lx, mac addr %pM\n", + DEVICE_NAME, (long)pci_resource_start(pdev, 0), + netdev->dev_addr); + + set_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags); + + adapters_found++; + + ena_devlink_register(devlink, &pdev->dev); + + return 0; + +err_rss: + ena_com_delete_debug_area(ena_dev); + ena_com_rss_destroy(ena_dev); +err_terminate_sysfs: + ena_sysfs_terminate(&pdev->dev); +err_free_msix: + ena_com_dev_reset(ena_dev, ENA_REGS_RESET_INIT_ERR); + /* stop submitting admin commands on a device that was reset */ + ena_com_set_admin_running_state(ena_dev, false); + ena_free_mgmnt_irq(adapter); + ena_disable_msix(adapter); +err_worker_destroy: + del_timer(&adapter->timer_service); +err_device_destroy: + ena_com_delete_host_info(ena_dev); + ena_com_admin_destroy(ena_dev); +err_devlink_destroy: + ena_devlink_free(devlink); +err_netdev_destroy: + free_netdev(netdev); +err_free_region: + ena_release_bars(ena_dev, pdev); +err_free_ena_dev: + vfree(ena_dev); 
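+	/* Final unwind step: undo pci_enable_device_mem() from the top of
+	 * the probe routine.
+	 */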
+err_disable_device:
+	pci_disable_device(pdev);
+	return rc;
+}
+
+/*****************************************************************************/
+
+/* __ena_shutoff - Helper used in both PCI remove/shutdown routines
+ * @pdev: PCI device information struct
+ * @shutdown: Is it a shutdown operation? If false, this is a removal
+ *
+ * __ena_shutoff is a helper routine that does the real work on shutdown and
+ * removal paths; the difference between the two paths is whether the
+ * netdevice is detached or unregistered.
+ */
+static void __ena_shutoff(struct pci_dev *pdev, bool shutdown)
+{
+	struct ena_adapter *adapter = pci_get_drvdata(pdev);
+	struct ena_com_dev *ena_dev;
+	struct net_device *netdev;
+	struct devlink *devlink;
+
+	ena_dev = adapter->ena_dev;
+	netdev = adapter->netdev;
+
+	devlink = adapter->devlink;
+	ena_devlink_unregister(devlink);
+	ena_devlink_free(devlink);
+
+#ifdef CONFIG_RFS_ACCEL
+	if ((adapter->msix_vecs >= 1) && (netdev->rx_cpu_rmap)) {
+		free_irq_cpu_rmap(netdev->rx_cpu_rmap);
+		netdev->rx_cpu_rmap = NULL;
+	}
+
+#endif /* CONFIG_RFS_ACCEL */
+	/* Make sure timer and reset routine won't be called after
+	 * freeing device resources.
+	 */
+	del_timer_sync(&adapter->timer_service);
+	cancel_work_sync(&adapter->reset_task);
+
+	rtnl_lock(); /* lock released inside the below if-else block */
+	adapter->reset_reason = ENA_REGS_RESET_SHUTDOWN;
+	ena_destroy_device(adapter, true);
+
+	if (shutdown) {
+		netif_device_detach(netdev);
+		dev_close(netdev);
+		rtnl_unlock();
+	} else {
+		rtnl_unlock();
+		unregister_netdev(netdev);
+		free_netdev(netdev);
+	}
+
+	ena_com_rss_destroy(ena_dev);
+
+	ena_com_delete_debug_area(ena_dev);
+
+	ena_com_delete_host_info(ena_dev);
+
+	ena_release_bars(ena_dev, pdev);
+
+	pci_disable_device(pdev);
+
+	vfree(ena_dev);
+}
+
+/* ena_remove - Device Removal Routine
+ * @pdev: PCI device information struct
+ *
+ * ena_remove is called by the PCI subsystem to alert the driver
+ * that it should release a PCI device.
+ */
+
+static void ena_remove(struct pci_dev *pdev)
+{
+	__ena_shutoff(pdev, false);
+}
+
+/* ena_shutdown - Device Shutdown Routine
+ * @pdev: PCI device information struct
+ *
+ * ena_shutdown is called by the PCI subsystem to alert the driver that
+ * a shutdown/reboot (or kexec) is happening and device must be disabled.
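+ * Unlike ena_remove(), the shutdown path in __ena_shutoff() only detaches
+ * and closes the netdev; it is not unregistered or freed, since the system
+ * is going down anyway.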
+ */ + +static void ena_shutdown(struct pci_dev *pdev) +{ + __ena_shutoff(pdev, true); +} + +#ifdef CONFIG_PM +#ifdef ENA_GENERIC_PM_OPS +/* ena_suspend - PM suspend callback + * @dev_d: Device information struct + */ +static int __maybe_unused ena_suspend(struct device *dev_d) +{ + struct pci_dev *pdev = to_pci_dev(dev_d); +#else /* ENA_GENERIC_PM_OPS */ +/* ena_suspend - PM suspend callback + * @pdev: PCI device information struct + * @state:power state + */ +static int ena_suspend(struct pci_dev *pdev, pm_message_t state) +{ +#endif /* ENA_GENERIC_PM_OPS */ + struct ena_adapter *adapter = pci_get_drvdata(pdev); + + ena_increase_stat(&adapter->dev_stats.suspend, 1, &adapter->syncp); + + rtnl_lock(); + if (unlikely(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))) { + dev_err(&pdev->dev, + "Ignoring device reset request as the device is being suspended\n"); + clear_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); + } + ena_destroy_device(adapter, true); + rtnl_unlock(); + return 0; +} + +#ifdef ENA_GENERIC_PM_OPS +/* ena_resume - PM resume callback + * @dev_d: Device information struct + */ +static int __maybe_unused ena_resume(struct device *dev_d) +{ + struct ena_adapter *adapter = dev_get_drvdata(dev_d); +#else /* ENA_GENERIC_PM_OPS */ +/* ena_resume - PM resume callback + * @pdev: PCI device information struct + * + */ +static int ena_resume(struct pci_dev *pdev) +{ + struct ena_adapter *adapter = pci_get_drvdata(pdev); +#endif /* ENA_GENERIC_PM_OPS */ + int rc; + + ena_increase_stat(&adapter->dev_stats.resume, 1, &adapter->syncp); + + rtnl_lock(); +#if LINUX_VERSION_CODE < KERNEL_VERSION(5,5,0) + pci_set_power_state(pdev, PCI_D0); +#endif + rc = ena_restore_device(adapter); + rtnl_unlock(); + return rc; +} +#endif /* CONFIG_PM */ +#ifdef ENA_GENERIC_PM_OPS + +static SIMPLE_DEV_PM_OPS(ena_pm_ops, ena_suspend, ena_resume); +#endif /* ENA_GENERIC_PM_OPS */ + +static struct pci_driver ena_pci_driver = { + .name = DRV_MODULE_NAME, + .id_table = ena_pci_tbl, + .probe = ena_probe, + .remove = ena_remove, + .shutdown = ena_shutdown, +#ifdef ENA_GENERIC_PM_OPS + .driver.pm = &ena_pm_ops, +#else /* ENA_GENERIC_PM_OPS */ +#ifdef CONFIG_PM + .suspend = ena_suspend, + .resume = ena_resume, +#endif /* CONFIG_PM */ +#endif /* ENA_GENERIC_PM_OPS */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,18,0) + .sriov_configure = pci_sriov_configure_simple, +#endif +}; + +static int __init ena_init(void) +{ + ena_wq = create_singlethread_workqueue(DRV_MODULE_NAME); + if (!ena_wq) { + pr_err("Failed to create workqueue\n"); + return -ENOMEM; + } + + return pci_register_driver(&ena_pci_driver); +} + +static void __exit ena_cleanup(void) +{ + pci_unregister_driver(&ena_pci_driver); + + if (ena_wq) { + destroy_workqueue(ena_wq); + ena_wq = NULL; + } +} + +/****************************************************************************** + ******************************** AENQ Handlers ******************************* + *****************************************************************************/ +/* ena_update_on_link_change: + * Notify the network interface about the change in link status + */ +static void ena_update_on_link_change(void *adapter_data, + struct ena_admin_aenq_entry *aenq_e) +{ + struct ena_adapter *adapter = (struct ena_adapter *)adapter_data; + struct ena_admin_aenq_link_change_desc *aenq_desc = + (struct ena_admin_aenq_link_change_desc *)aenq_e; + int status = aenq_desc->flags & + ENA_ADMIN_AENQ_LINK_CHANGE_DESC_LINK_STATUS_MASK; + + if (status) { + netif_dbg(adapter, ifup, adapter->netdev, "%s\n", 
			  __func__);
+		set_bit(ENA_FLAG_LINK_UP, &adapter->flags);
+		if (!test_bit(ENA_FLAG_ONGOING_RESET, &adapter->flags))
+			netif_carrier_on(adapter->netdev);
+	} else {
+		clear_bit(ENA_FLAG_LINK_UP, &adapter->flags);
+		netif_carrier_off(adapter->netdev);
+	}
+}
+
+static void ena_keep_alive_wd(void *adapter_data,
+			      struct ena_admin_aenq_entry *aenq_e)
+{
+	struct ena_adapter *adapter = (struct ena_adapter *)adapter_data;
+	struct ena_admin_aenq_keep_alive_desc *desc;
+	u64 rx_drops;
+	u64 tx_drops;
+
+	desc = (struct ena_admin_aenq_keep_alive_desc *)aenq_e;
+	adapter->last_keep_alive_jiffies = jiffies;
+
+	rx_drops = ((u64)desc->rx_drops_high << 32) | desc->rx_drops_low;
+	tx_drops = ((u64)desc->tx_drops_high << 32) | desc->tx_drops_low;
+
+	u64_stats_update_begin(&adapter->syncp);
+	/* These stats are accumulated by the device, so the counters indicate
+	 * all drops since last reset.
+	 */
+	adapter->dev_stats.rx_drops = rx_drops;
+	adapter->dev_stats.tx_drops = tx_drops;
+	u64_stats_update_end(&adapter->syncp);
+}
+
+static void ena_notification(void *adapter_data,
+			     struct ena_admin_aenq_entry *aenq_e)
+{
+	struct ena_adapter *adapter = (struct ena_adapter *)adapter_data;
+	struct ena_admin_ena_hw_hints *hints;
+
+	WARN(aenq_e->aenq_common_desc.group != ENA_ADMIN_NOTIFICATION,
+	     "Invalid group(%x) expected %x\n",
+	     aenq_e->aenq_common_desc.group,
+	     ENA_ADMIN_NOTIFICATION);
+
+	switch (aenq_e->aenq_common_desc.syndrome) {
+	case ENA_ADMIN_UPDATE_HINTS:
+		hints = (struct ena_admin_ena_hw_hints *)
+			(&aenq_e->inline_data_w4);
+		ena_update_hints(adapter, hints);
+		break;
+	default:
+		netif_err(adapter, drv, adapter->netdev,
+			  "Invalid aenq notification link state %d\n",
+			  aenq_e->aenq_common_desc.syndrome);
+	}
+}
+
+static void ena_refresh_fw_capabilities(void *adapter_data,
+					struct ena_admin_aenq_entry *aenq_e)
+{
+	struct ena_adapter *adapter = (struct ena_adapter *)adapter_data;
+
+	netdev_info(adapter->netdev, "Received request to refresh capabilities\n");
+
+	set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags);
+}
+
+/* This handler will be called for unknown event group or unimplemented handlers */
+static void unimplemented_aenq_handler(void *data,
+				       struct ena_admin_aenq_entry *aenq_e)
+{
+	struct ena_adapter *adapter = (struct ena_adapter *)data;
+
+	netif_err(adapter, drv, adapter->netdev,
+		  "Unknown event was received or event with unimplemented handler\n");
+}
+
+static struct ena_aenq_handlers aenq_handlers = {
+	.handlers = {
+		[ENA_ADMIN_LINK_CHANGE] = ena_update_on_link_change,
+		[ENA_ADMIN_NOTIFICATION] = ena_notification,
+		[ENA_ADMIN_KEEP_ALIVE] = ena_keep_alive_wd,
+		[ENA_ADMIN_REFRESH_CAPABILITIES] = ena_refresh_fw_capabilities,
+	},
+	.unimplemented_handler = unimplemented_aenq_handler
+};
+
+module_init(ena_init);
+module_exit(ena_cleanup);
diff --git a/drivers/amazon/net/ena/ena_netdev.h b/drivers/amazon/net/ena/ena_netdev.h
new file mode 100644
index 0000000000000..7b373cf6545e9
--- /dev/null
+++ b/drivers/amazon/net/ena/ena_netdev.h
@@ -0,0 +1,618 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */ + +#ifndef ENA_H +#define ENA_H + +#include "kcompat.h" +#include +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 3, 0) +#include "dim.h" +#else +#include +#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(5, 3, 0) */ +#include +#include +#include +#include +#include +#include +#ifdef HAS_BPF_HEADER +#include +#endif +#include + +#include "ena_com.h" +#include "ena_eth_com.h" + +#define DRV_MODULE_GEN_MAJOR 2 +#define DRV_MODULE_GEN_MINOR 8 +#define DRV_MODULE_GEN_SUBMINOR 0 + +#define DRV_MODULE_NAME "ena" +#ifndef DRV_MODULE_GENERATION +#define DRV_MODULE_GENERATION \ + __stringify(DRV_MODULE_GEN_MAJOR) "." \ + __stringify(DRV_MODULE_GEN_MINOR) "." \ + __stringify(DRV_MODULE_GEN_SUBMINOR) "g" +#endif + +#define DEVICE_NAME "Elastic Network Adapter (ENA)" + +/* 1 for AENQ + ADMIN */ +#define ENA_ADMIN_MSIX_VEC 1 +#define ENA_MAX_MSIX_VEC(io_queues) (ENA_ADMIN_MSIX_VEC + (io_queues)) + +/* The ENA buffer length fields is 16 bit long. So when PAGE_SIZE == 64kB the + * driver passes 0. + * Since the max packet size the ENA handles is ~9kB limit the buffer length to + * 16kB. + */ +#if PAGE_SIZE > SZ_16K +#define ENA_PAGE_SIZE (_AC(SZ_16K, UL)) +#else +#define ENA_PAGE_SIZE PAGE_SIZE +#endif + +#define ENA_MIN_MSIX_VEC 2 + +#define ENA_REG_BAR 0 +#define ENA_MEM_BAR 2 +#define ENA_BAR_MASK (BIT(ENA_REG_BAR) | BIT(ENA_MEM_BAR)) + +#define ENA_DEFAULT_RING_SIZE (1024) +#define ENA_MIN_RING_SIZE (256) + +#define ENA_MIN_RX_BUF_SIZE (2048) + +#define ENA_MIN_NUM_IO_QUEUES (1) + +#define ENA_TX_WAKEUP_THRESH (MAX_SKB_FRAGS + 2) +#define ENA_DEFAULT_RX_COPYBREAK (256 - NET_IP_ALIGN) + +#define ENA_MIN_MTU 128 + +#define ENA_NAME_MAX_LEN 20 +#define ENA_IRQNAME_SIZE 40 + +#define ENA_PKT_MAX_BUFS 19 + +#define ENA_RX_RSS_TABLE_LOG_SIZE 7 +#define ENA_RX_RSS_TABLE_SIZE (1 << ENA_RX_RSS_TABLE_LOG_SIZE) + +/* The number of tx packet completions that will be handled each NAPI poll + * cycle is ring_size / ENA_TX_POLL_BUDGET_DIVIDER. + */ +#define ENA_TX_POLL_BUDGET_DIVIDER 4 + +/* Refill Rx queue when number of required descriptors is above + * QUEUE_SIZE / ENA_RX_REFILL_THRESH_DIVIDER or ENA_RX_REFILL_THRESH_PACKET + */ +#define ENA_RX_REFILL_THRESH_DIVIDER 8 +#define ENA_RX_REFILL_THRESH_PACKET 256 + +/* Number of queues to check for missing queues per timer service */ +#define ENA_MONITORED_TX_QUEUES 4 +/* Max timeout packets before device reset */ +#define MAX_NUM_OF_TIMEOUTED_PACKETS 128 + +#define ENA_TX_RING_IDX_NEXT(idx, ring_size) (((idx) + 1) & ((ring_size) - 1)) + +#define ENA_RX_RING_IDX_NEXT(idx, ring_size) (((idx) + 1) & ((ring_size) - 1)) +#define ENA_RX_RING_IDX_ADD(idx, n, ring_size) \ + (((idx) + (n)) & ((ring_size) - 1)) + +#define ENA_IO_TXQ_IDX(q) (2 * (q)) +#define ENA_IO_RXQ_IDX(q) (2 * (q) + 1) +#define ENA_IO_TXQ_IDX_TO_COMBINED_IDX(q) ((q) / 2) +#define ENA_IO_RXQ_IDX_TO_COMBINED_IDX(q) (((q) - 1) / 2) + +#define ENA_MGMNT_IRQ_IDX 0 +#define ENA_IO_IRQ_FIRST_IDX 1 +#define ENA_IO_IRQ_IDX(q) (ENA_IO_IRQ_FIRST_IDX + (q)) + +#define ENA_ADMIN_POLL_DELAY_US 5000 + +/* ENA device should send keep alive msg every 1 sec. + * We wait for 6 sec just to be on the safe side. 
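+ * ena_keep_alive_wd() refreshes last_keep_alive_jiffies on every
+ * keep-alive AENQ event; a silence longer than this timeout is treated
+ * as a hung device and triggers the reset path.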
+ */ +#define ENA_DEVICE_KALIVE_TIMEOUT (6 * HZ) +#define ENA_MAX_NO_INTERRUPT_ITERATIONS 3 + +#define ENA_MMIO_DISABLE_REG_READ BIT(0) + +struct ena_page_cache; + +struct ena_phc_info; + +struct ena_irq { + irq_handler_t handler; + void *data; + int cpu; + u32 vector; + cpumask_t affinity_hint_mask; + char name[ENA_IRQNAME_SIZE]; +}; + +struct ena_napi { + u8 first_interrupt ____cacheline_aligned; + u8 interrupts_masked; + struct napi_struct napi; + struct ena_ring *tx_ring; + struct ena_ring *rx_ring; + u32 qid; + struct dim dim; +}; + +struct ena_tx_buffer { + union { + struct sk_buff *skb; +#ifdef ENA_XDP_SUPPORT + /* XDP buffer structure which is used for sending packets in + * the xdp queues + */ + struct xdp_frame *xdpf; +#endif /* ENA_XDP_SUPPORT */ + }; + /* num of ena desc for this specific skb + * (includes data desc and metadata desc) + */ + u32 tx_descs; + /* num of buffers used by this skb */ + u32 num_of_bufs; + + /* Total size of all buffers */ + u32 total_tx_size; + + /* Indicate if bufs[0] map the linear data of the skb. */ + u8 map_linear_data; + + /* Used for detect missing tx packets to limit the number of prints */ + u8 print_once; + /* Save the last jiffies to detect missing tx packets + * + * sets to non zero value on ena_start_xmit and set to zero on + * napi and timer_Service_routine. + * + * while this value is not protected by lock, + * a given packet is not expected to be handled by ena_start_xmit + * and by napi/timer_service at the same time. + */ + unsigned long last_jiffies; + struct ena_com_buf bufs[ENA_PKT_MAX_BUFS]; +} ____cacheline_aligned; + +struct ena_rx_buffer { + struct sk_buff *skb; + union { + struct { + struct page *page; + dma_addr_t dma_addr; + }; +#ifdef ENA_XDP_SUPPORT + /* XSK pool buffer */ + struct xdp_buff *xdp; +#endif + }; + u32 page_offset; + u32 buf_offset; + struct ena_com_buf ena_buf; + bool is_lpc_page; +} ____cacheline_aligned; + +struct ena_stats_tx { + u64 cnt; + u64 bytes; + u64 queue_stop; + u64 prepare_ctx_err; + u64 queue_wakeup; + u64 dma_mapping_err; + u64 linearize; + u64 linearize_failed; + u64 napi_comp; + u64 tx_poll; + u64 doorbells; + u64 bad_req_id; + u64 llq_buffer_copy; + u64 missed_tx; + u64 unmask_interrupt; + u64 last_napi_jiffies; +#ifdef ENA_AF_XDP_SUPPORT + u64 xsk_need_wakeup_set; + u64 xsk_wakeup_request; +#endif /* ENA_AF_XDP_SUPPORT */ +}; + +struct ena_stats_rx { + u64 cnt; + u64 bytes; + u64 rx_copybreak_pkt; + u64 csum_good; + u64 refil_partial; + u64 csum_bad; + u64 page_alloc_fail; + u64 skb_alloc_fail; + u64 dma_mapping_err; + u64 bad_desc_num; +#ifdef ENA_BUSY_POLL_SUPPORT + u64 bp_yield; + u64 bp_missed; + u64 bp_cleaned; +#endif + u64 bad_req_id; + u64 empty_rx_ring; + u64 csum_unchecked; +#ifdef ENA_XDP_SUPPORT + u64 xdp_aborted; + u64 xdp_drop; + u64 xdp_pass; + u64 xdp_tx; + u64 xdp_invalid; + u64 xdp_redirect; +#endif + u64 lpc_warm_up; + u64 lpc_full; + u64 lpc_wrong_numa; +#ifdef ENA_AF_XDP_SUPPORT + u64 xsk_need_wakeup_set; + u64 zc_queue_pkt_copy; +#endif /* ENA_AF_XDP_SUPPORT */ +}; + +struct ena_ring { + /* Holds the empty requests for TX/RX + * out of order completions + */ + u16 *free_ids; + + union { + struct ena_tx_buffer *tx_buffer_info; + struct ena_rx_buffer *rx_buffer_info; + }; + + /* cache ptr to avoid using the adapter */ + struct device *dev; + struct pci_dev *pdev; + struct napi_struct *napi; + struct net_device *netdev; + struct ena_page_cache *page_cache; + struct ena_com_dev *ena_dev; + struct ena_adapter *adapter; + struct ena_com_io_cq *ena_com_io_cq; + struct 
ena_com_io_sq *ena_com_io_sq; +#ifdef ENA_XDP_SUPPORT + struct bpf_prog *xdp_bpf_prog; + struct xdp_rxq_info xdp_rxq; + spinlock_t xdp_tx_lock; /* synchronize XDP TX/Redirect traffic */ + /* Used for rx queues only to point to the xdp tx ring, to + * which traffic should be redirected from this rx ring. + */ + struct ena_ring *xdp_ring; +#ifdef ENA_AF_XDP_SUPPORT + struct xsk_buff_pool *xsk_pool; +#endif /* ENA_AF_XDP_SUPPORT */ +#endif /* ENA_XDP_SUPPORT */ + + u16 next_to_use; + u16 next_to_clean; + u16 rx_copybreak; + u16 rx_headroom; + u16 qid; + u16 mtu; + u16 sgl_size; + u8 enable_bql; + + /* The maximum header length the device can handle */ + u8 tx_max_header_size; + + bool disable_meta_caching; + u16 no_interrupt_event_cnt; + + /* cpu and NUMA for TPH */ + int cpu; + int numa_node; + + /* number of tx/rx_buffer_info's entries */ + int ring_size; + + enum ena_admin_placement_policy_type tx_mem_queue_type; + + struct ena_com_rx_buf_info ena_bufs[ENA_PKT_MAX_BUFS]; + u32 smoothed_interval; + u32 per_napi_packets; + u16 non_empty_napi_events; + struct u64_stats_sync syncp; + union { + struct ena_stats_tx tx_stats; + struct ena_stats_rx rx_stats; + }; + + u8 *push_buf_intermediate_buf; + int empty_rx_queue; +#ifdef ENA_BUSY_POLL_SUPPORT + atomic_t bp_state; +#endif +} ____cacheline_aligned; + +#ifdef ENA_BUSY_POLL_SUPPORT +enum ena_busy_poll_state_t { + ENA_BP_STATE_IDLE = 0, + ENA_BP_STATE_NAPI, + ENA_BP_STATE_POLL, + ENA_BP_STATE_DISABLE +}; +#endif +struct ena_stats_dev { + u64 tx_timeout; + u64 suspend; + u64 resume; + u64 wd_expired; + u64 interface_up; + u64 interface_down; + u64 admin_q_pause; + u64 rx_drops; + u64 tx_drops; + u64 reset_fail; +}; + +enum ena_flags_t { + ENA_FLAG_DEVICE_RUNNING, + ENA_FLAG_DEV_UP, + ENA_FLAG_LINK_UP, + ENA_FLAG_MSIX_ENABLED, + ENA_FLAG_TRIGGER_RESET, + ENA_FLAG_ONGOING_RESET +}; + +/* adapter specific private data structure */ +struct ena_adapter { + struct ena_com_dev *ena_dev; + /* OS defined structs */ + struct net_device *netdev; + struct pci_dev *pdev; + + struct devlink *devlink; + + /* rx packets that are shorter than this len will be copied to the skb + * header + */ + u32 rx_copybreak; + u32 max_mtu; + + u32 num_io_queues; + u32 max_num_io_queues; + /* Local page cache size when it's enabled */ + u32 configured_lpc_size; + /* Current Local page cache size */ + u32 used_lpc_size; +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) + struct msix_entry *msix_entries; +#endif + int msix_vecs; + + u32 missing_tx_completion_threshold; + + u32 requested_tx_ring_size; + u32 requested_rx_ring_size; + + u32 max_tx_ring_size; + u32 max_rx_ring_size; + + u32 msg_enable; + + /* The flag is used for two purposes: + * 1. Indicates that large LLQ has been requested. + * 2. Indicates whether large LLQ is set or not after device + * initialization / configuration. 
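+	 * It is seeded from the force_large_llq_header module parameter at
+	 * probe time, and cleared again by ena_calc_io_queue_size() when the
+	 * device lacks 256B LLQ entry support.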
+ */ + bool large_llq_header_enabled; + bool large_llq_header_supported; + + u16 max_tx_sgl_size; + u16 max_rx_sgl_size; + + u8 mac_addr[ETH_ALEN]; + + unsigned long keep_alive_timeout; + unsigned long missing_tx_completion_to; + + char name[ENA_NAME_MAX_LEN]; + + unsigned long flags; + /* TX */ + struct ena_ring tx_ring[ENA_MAX_NUM_IO_QUEUES] + ____cacheline_aligned_in_smp; + + /* RX */ + struct ena_ring rx_ring[ENA_MAX_NUM_IO_QUEUES] + ____cacheline_aligned_in_smp; + + struct ena_napi ena_napi[ENA_MAX_NUM_IO_QUEUES]; + + struct ena_irq irq_tbl[ENA_MAX_MSIX_VEC(ENA_MAX_NUM_IO_QUEUES)]; + + /* timer service */ + struct work_struct reset_task; + struct timer_list timer_service; + + bool wd_state; + bool dev_up_before_reset; + bool disable_meta_caching; + unsigned long last_keep_alive_jiffies; + + struct u64_stats_sync syncp; + struct ena_stats_dev dev_stats; + struct ena_admin_eni_stats eni_stats; + struct ena_admin_ena_srd_info ena_srd_info; + + /* last queue index that was checked for uncompleted tx packets */ + u32 last_monitored_tx_qid; + + enum ena_regs_reset_reason_types reset_reason; + +#ifdef ENA_XDP_SUPPORT + struct bpf_prog *xdp_bpf_prog; +#endif + u32 xdp_first_ring; + u32 xdp_num_queues; + + struct ena_phc_info *phc_info; +}; + +void ena_set_ethtool_ops(struct net_device *netdev); + +void ena_dump_stats_to_dmesg(struct ena_adapter *adapter); + +void ena_dump_stats_to_buf(struct ena_adapter *adapter, u8 *buf); + + +int ena_set_lpc_state(struct ena_adapter *adapter, bool enabled); + +int ena_update_queue_sizes(struct ena_adapter *adapter, + u32 new_tx_size, + u32 new_rx_size); + +int ena_update_queue_count(struct ena_adapter *adapter, u32 new_channel_count); + +int ena_set_rx_copybreak(struct ena_adapter *adapter, u32 rx_copybreak); + +int ena_get_sset_count(struct net_device *netdev, int sset); +#ifdef ENA_BUSY_POLL_SUPPORT +static inline void ena_bp_init_lock(struct ena_ring *rx_ring) +{ + /* reset state to idle */ + atomic_set(&rx_ring->bp_state, ENA_BP_STATE_IDLE); +} + +/* called from the napi routine to get ownership of the ring */ +static inline bool ena_bp_lock_napi(struct ena_ring *rx_ring) +{ + int rc = atomic_cmpxchg(&rx_ring->bp_state, ENA_BP_STATE_IDLE, + ENA_BP_STATE_NAPI); + if (rc != ENA_BP_STATE_IDLE) { + u64_stats_update_begin(&rx_ring->syncp); + rx_ring->rx_stats.bp_yield++; + u64_stats_update_end(&rx_ring->syncp); + } + + return rc == ENA_BP_STATE_IDLE; +} + +static inline void ena_bp_unlock_napi(struct ena_ring *rx_ring) +{ + WARN_ON(atomic_read(&rx_ring->bp_state) != ENA_BP_STATE_NAPI); + + /* flush any outstanding Rx frames */ + if (rx_ring->napi->gro_list) + napi_gro_flush(rx_ring->napi, false); + + /* reset state to idle */ + atomic_set(&rx_ring->bp_state, ENA_BP_STATE_IDLE); +} + +/* called from ena_ll_busy_poll() */ +static inline bool ena_bp_lock_poll(struct ena_ring *rx_ring) +{ + int rc = atomic_cmpxchg(&rx_ring->bp_state, ENA_BP_STATE_IDLE, + ENA_BP_STATE_POLL); + if (rc != ENA_BP_STATE_IDLE) { + u64_stats_update_begin(&rx_ring->syncp); + rx_ring->rx_stats.bp_yield++; + u64_stats_update_end(&rx_ring->syncp); + } + + return rc == ENA_BP_STATE_IDLE; +} + +static inline void ena_bp_unlock_poll(struct ena_ring *rx_ring) +{ + WARN_ON(atomic_read(&rx_ring->bp_state) != ENA_BP_STATE_POLL); + + /* reset state to idle */ + atomic_set(&rx_ring->bp_state, ENA_BP_STATE_IDLE); +} + +/* true if a socket is polling, even if it did not get the lock */ +static inline bool ena_bp_busy_polling(struct ena_ring *rx_ring) +{ + return atomic_read(&rx_ring->bp_state) == 
	       ENA_BP_STATE_POLL;
+}
+
+static inline bool ena_bp_disable(struct ena_ring *rx_ring)
+{
+	int rc = atomic_cmpxchg(&rx_ring->bp_state, ENA_BP_STATE_IDLE,
+				ENA_BP_STATE_DISABLE);
+
+	return rc == ENA_BP_STATE_IDLE;
+}
+#endif /* ENA_BUSY_POLL_SUPPORT */
+
+static inline void ena_reset_device(struct ena_adapter *adapter,
+				    enum ena_regs_reset_reason_types reset_reason)
+{
+	adapter->reset_reason = reset_reason;
+	/* Make sure reset reason is set before triggering the reset */
+	smp_mb__before_atomic();
+	set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags);
+}
+
+/* Allocate a page and DMA map it
+ * @rx_ring: The IO queue pair which requests the allocation
+ * @dma: Output parameter that receives the DMA address of the mapped page
+ *
+ * @return: the allocated page on success (with its mapping in @dma), or NULL
+ */
+struct page *ena_alloc_map_page(struct ena_ring *rx_ring, dma_addr_t *dma);
+
+int ena_destroy_device(struct ena_adapter *adapter, bool graceful);
+int ena_restore_device(struct ena_adapter *adapter);
+int handle_invalid_req_id(struct ena_ring *ring, u16 req_id,
+			  struct ena_tx_buffer *tx_info, bool is_xdp);
+
+/* Increase a stat by cnt while holding syncp seqlock on 32bit machines */
+static inline void ena_increase_stat(u64 *statp, u64 cnt,
+				     struct u64_stats_sync *syncp)
+{
+	u64_stats_update_begin(syncp);
+	(*statp) += cnt;
+	u64_stats_update_end(syncp);
+}
+
+static inline void ena_ring_tx_doorbell(struct ena_ring *tx_ring)
+{
+	ena_com_write_sq_doorbell(tx_ring->ena_com_io_sq);
+	ena_increase_stat(&tx_ring->tx_stats.doorbells, 1, &tx_ring->syncp);
+}
+
+int ena_xmit_common(struct ena_adapter *adapter,
+		    struct ena_ring *ring,
+		    struct ena_tx_buffer *tx_info,
+		    struct ena_com_tx_ctx *ena_tx_ctx,
+		    u16 next_to_use,
+		    u32 bytes);
+void ena_unmap_tx_buff(struct ena_ring *tx_ring,
+		       struct ena_tx_buffer *tx_info);
+void ena_init_io_rings(struct ena_adapter *adapter,
+		       int first_index, int count);
+int ena_create_io_tx_queues_in_range(struct ena_adapter *adapter,
+				     int first_index, int count);
+int ena_setup_tx_resources_in_range(struct ena_adapter *adapter,
+				    int first_index, int count);
+void ena_free_all_io_tx_resources_in_range(struct ena_adapter *adapter,
+					   int first_index, int count);
+void ena_free_all_io_tx_resources(struct ena_adapter *adapter);
+void ena_down(struct ena_adapter *adapter);
+int ena_up(struct ena_adapter *adapter);
+void ena_unmask_interrupt(struct ena_ring *tx_ring, struct ena_ring *rx_ring);
+void ena_update_ring_numa_node(struct ena_ring *tx_ring,
+			       struct ena_ring *rx_ring);
+void ena_rx_checksum(struct ena_ring *rx_ring,
+		     struct ena_com_rx_ctx *ena_rx_ctx,
+		     struct sk_buff *skb);
+void ena_set_rx_hash(struct ena_ring *rx_ring,
+		     struct ena_com_rx_ctx *ena_rx_ctx,
+		     struct sk_buff *skb);
+int ena_refill_rx_bufs(struct ena_ring *rx_ring, u32 num);
+#endif /* !(ENA_H) */
diff --git a/drivers/amazon/net/ena/ena_pci_id_tbl.h b/drivers/amazon/net/ena/ena_pci_id_tbl.h
new file mode 100755
index 0000000000000..3ecdf29160ca7
--- /dev/null
+++ b/drivers/amazon/net/ena/ena_pci_id_tbl.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */ + +#ifndef ENA_PCI_ID_TBL_H_ +#define ENA_PCI_ID_TBL_H_ + +#ifndef PCI_VENDOR_ID_AMAZON +#define PCI_VENDOR_ID_AMAZON 0x1d0f +#endif + +#ifndef PCI_DEV_ID_ENA_PF +#define PCI_DEV_ID_ENA_PF 0x0ec2 +#endif + +#ifndef PCI_DEV_ID_ENA_LLQ_PF +#define PCI_DEV_ID_ENA_LLQ_PF 0x1ec2 +#endif + +#ifndef PCI_DEV_ID_ENA_VF +#define PCI_DEV_ID_ENA_VF 0xec20 +#endif + +#ifndef PCI_DEV_ID_ENA_LLQ_VF +#define PCI_DEV_ID_ENA_LLQ_VF 0xec21 +#endif + +#ifndef PCI_DEV_ID_ENA_RESRV0 +#define PCI_DEV_ID_ENA_RESRV0 0x0051 +#endif + +#define ENA_PCI_ID_TABLE_ENTRY(devid) \ + {PCI_DEVICE(PCI_VENDOR_ID_AMAZON, devid)}, + +static const struct pci_device_id ena_pci_tbl[] = { + ENA_PCI_ID_TABLE_ENTRY(PCI_DEV_ID_ENA_RESRV0) + ENA_PCI_ID_TABLE_ENTRY(PCI_DEV_ID_ENA_PF) + ENA_PCI_ID_TABLE_ENTRY(PCI_DEV_ID_ENA_LLQ_PF) + ENA_PCI_ID_TABLE_ENTRY(PCI_DEV_ID_ENA_VF) + ENA_PCI_ID_TABLE_ENTRY(PCI_DEV_ID_ENA_LLQ_VF) + { } +}; + +#endif /* ENA_PCI_ID_TBL_H_ */ diff --git a/drivers/amazon/net/ena/ena_phc.c b/drivers/amazon/net/ena/ena_phc.c new file mode 100644 index 0000000000000..46e21d3202a1b --- /dev/null +++ b/drivers/amazon/net/ena/ena_phc.c @@ -0,0 +1,246 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2015-2022 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#include "ena_phc.h" + +#ifdef ENA_PHC_SUPPORT + +static int ena_phc_adjfreq(struct ptp_clock_info *clock_info, s32 ppb) +{ + return -EOPNOTSUPP; +} + +static int ena_phc_adjtime(struct ptp_clock_info *clock_info, s64 delta) +{ + return -EOPNOTSUPP; +} + +static int ena_phc_enable(struct ptp_clock_info *clock_info, struct ptp_clock_request *rq, int on) +{ + return -EOPNOTSUPP; +} + +#ifdef ENA_PHC_SUPPORT_GETTIME64 +#ifdef ENA_PHC_SUPPORT_GETTIME64_EXTENDED +static int ena_phc_gettimex64(struct ptp_clock_info *clock_info, struct timespec64 *ts, + struct ptp_system_timestamp *sts) +{ + struct ena_phc_info *phc_info = container_of(clock_info, struct ena_phc_info, clock_info); + unsigned long flags; + u64 timestamp_nsec; + int rc; + + spin_lock_irqsave(&phc_info->lock, flags); + + ptp_read_system_prets(sts); + + rc = ena_com_phc_get(phc_info->adapter->ena_dev, ×tamp_nsec); + + ptp_read_system_postts(sts); + + spin_unlock_irqrestore(&phc_info->lock, flags); + + *ts = ns_to_timespec64(timestamp_nsec); + + return rc; +} + +#else /* ENA_PHC_SUPPORT_GETTIME64_EXTENDED */ +static int ena_phc_gettime64(struct ptp_clock_info *clock_info, struct timespec64 *ts) +{ + struct ena_phc_info *phc_info = container_of(clock_info, struct ena_phc_info, clock_info); + unsigned long flags; + u64 timestamp_nsec; + int rc; + + spin_lock_irqsave(&phc_info->lock, flags); + + rc = ena_com_phc_get(phc_info->adapter->ena_dev, ×tamp_nsec); + + spin_unlock_irqrestore(&phc_info->lock, flags); + + *ts = ns_to_timespec64(timestamp_nsec); + + return rc; +} + +#endif /* ENA_PHC_SUPPORT_GETTIME64_EXTENDED */ +static int ena_phc_settime64(struct ptp_clock_info *clock_info, + const struct timespec64 *ts) +{ + return -EOPNOTSUPP; +} + +#else /* ENA_PHC_SUPPORT_GETTIME64 */ +static int ena_phc_gettime(struct ptp_clock_info *clock_info, struct timespec *ts) +{ + struct ena_phc_info *phc_info = container_of(clock_info, struct ena_phc_info, clock_info); + unsigned long flags; + u64 timestamp_nsec; + u32 remainder; + int rc; + + spin_lock_irqsave(&phc_info->lock, flags); + + rc = ena_com_phc_get(phc_info->adapter->ena_dev, ×tamp_nsec); + + spin_unlock_irqrestore(&phc_info->lock, flags); + + ts->tv_sec = div_u64_rem(timestamp_nsec, NSEC_PER_SEC, &remainder); + ts->tv_nsec 
= remainder; + + return rc; +} + +static int ena_phc_settime(struct ptp_clock_info *clock_info, const struct timespec *ts) +{ + return -EOPNOTSUPP; +} + +#endif /* ENA_PHC_SUPPORT_GETTIME64 */ + +static struct ptp_clock_info ena_ptp_clock_info = { + .owner = THIS_MODULE, + .n_alarm = 0, + .n_ext_ts = 0, + .n_per_out = 0, + .pps = 0, + .adjfreq = ena_phc_adjfreq, + .adjtime = ena_phc_adjtime, +#ifdef ENA_PHC_SUPPORT_GETTIME64 +#ifdef ENA_PHC_SUPPORT_GETTIME64_EXTENDED + .gettimex64 = ena_phc_gettimex64, +#else /* ENA_PHC_SUPPORT_GETTIME64_EXTENDED */ + .gettime64 = ena_phc_gettime64, +#endif /* ENA_PHC_SUPPORT_GETTIME64_EXTENDED */ + .settime64 = ena_phc_settime64, +#else /* ENA_PHC_SUPPORT_GETTIME64 */ + .gettime = ena_phc_gettime, + .settime = ena_phc_settime, +#endif /* ENA_PHC_SUPPORT_GETTIME64 */ + .enable = ena_phc_enable, +}; + +static int ena_phc_register(struct ena_adapter *adapter) +{ + struct pci_dev *pdev = adapter->pdev; + struct ptp_clock_info *clock_info; + struct ena_phc_info *phc_info; + int rc = 0; + + phc_info = adapter->phc_info; + clock_info = &phc_info->clock_info; + + phc_info->adapter = adapter; + + spin_lock_init(&phc_info->lock); + + /* Fill the ptp_clock_info struct and register PTP clock */ + *clock_info = ena_ptp_clock_info; + snprintf(clock_info->name, + sizeof(clock_info->name), + "ena-ptp-%02x", + PCI_SLOT(pdev->devfn)); + + phc_info->clock = ptp_clock_register(clock_info, &pdev->dev); + if (IS_ERR(phc_info->clock)) { + rc = PTR_ERR(phc_info->clock); + netdev_err(adapter->netdev, "Failed registering ptp clock, error: %d\n", rc); + phc_info->clock = NULL; + } + + return rc; +} + +bool ena_phc_enabled(struct ena_adapter *adapter) +{ + struct ena_phc_info *phc_info = adapter->phc_info; + + return (phc_info && phc_info->clock); +} + +static void ena_phc_unregister(struct ena_adapter *adapter) +{ + struct ena_phc_info *phc_info = adapter->phc_info; + + if (ena_phc_enabled(adapter)) + ptp_clock_unregister(phc_info->clock); +} + +int ena_phc_init(struct ena_adapter *adapter) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + struct net_device *netdev = adapter->netdev; + int rc = -EOPNOTSUPP; + + /* Validate phc feature is supported in the device */ + if (!ena_com_phc_supported(ena_dev)) { + netdev_dbg(netdev, "PHC feature is not supported\n"); + goto err_ena_com_phc_init; + } + + /* Allocate and initialize device specific PHC info */ + rc = ena_com_phc_init(ena_dev); + if (unlikely(rc)) { + netdev_err(netdev, "Failed to init phc, error: %d\n", rc); + goto err_ena_com_phc_init; + } + + /* Configure PHC feature in driver and device */ + rc = ena_com_phc_config(ena_dev); + if (unlikely(rc)) { + netdev_err(netdev, "Failed to config phc, error: %d\n", rc); + goto err_ena_com_phc_config; + } + + /* Allocate and initialize driver specific PHC info */ + adapter->phc_info = vzalloc(sizeof(*adapter->phc_info)); + if (unlikely(!adapter->phc_info)) { + rc = -ENOMEM; + netdev_err(netdev, "Failed to alloc phc_info, error: %d\n", rc); + goto err_ena_com_phc_config; + } + + /* Register to PTP class driver */ + rc = ena_phc_register(adapter); + if (unlikely(rc)) { + netdev_err(netdev, "Failed to register phc, error: %d\n", rc); + goto err_ena_phc_register; + } + + return 0; + +err_ena_phc_register: + vfree(adapter->phc_info); + adapter->phc_info = NULL; +err_ena_com_phc_config: + ena_com_phc_destroy(ena_dev); +err_ena_com_phc_init: + return rc; +} + +void ena_phc_destroy(struct ena_adapter *adapter) +{ + ena_phc_unregister(adapter); + + if (likely(adapter->phc_info)) { + 
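+		/* Allocated in ena_phc_init(); present only when the device
+		 * advertised PHC support.
+		 */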
		vfree(adapter->phc_info);
+		adapter->phc_info = NULL;
+	}
+
+	ena_com_phc_destroy(adapter->ena_dev);
+}
+
+int ena_phc_get_index(struct ena_adapter *adapter)
+{
+	struct ena_phc_info *phc_info = adapter->phc_info;
+
+	if (ena_phc_enabled(adapter))
+		return ptp_clock_index(phc_info->clock);
+
+	return -1;
+}
+
+#endif /* ENA_PHC_SUPPORT */
diff --git a/drivers/amazon/net/ena/ena_phc.h b/drivers/amazon/net/ena/ena_phc.h
new file mode 100644
index 0000000000000..f08ff473bd1e4
--- /dev/null
+++ b/drivers/amazon/net/ena/ena_phc.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright 2015-2022 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#ifndef ENA_PHC_H
+#define ENA_PHC_H
+
+#include "ena_netdev.h"
+
+#ifdef ENA_PHC_SUPPORT
+
+#include <linux/ptp_clock_kernel.h>
+
+struct ena_phc_info {
+	/* PTP hardware capabilities */
+	struct ptp_clock_info clock_info;
+
+	/* Registered PTP clock device */
+	struct ptp_clock *clock;
+
+	/* Adapter specific private data structure */
+	struct ena_adapter *adapter;
+
+	/* PHC lock */
+	spinlock_t lock;
+};
+
+bool ena_phc_enabled(struct ena_adapter *adapter);
+int ena_phc_get_index(struct ena_adapter *adapter);
+int ena_phc_init(struct ena_adapter *adapter);
+void ena_phc_destroy(struct ena_adapter *adapter);
+
+#else /* ENA_PHC_SUPPORT */
+
+static inline bool ena_phc_enabled(struct ena_adapter *adapter) { return false; }
+static inline int ena_phc_get_index(struct ena_adapter *adapter) { return -1; }
+static inline int ena_phc_init(struct ena_adapter *adapter) { return 0; }
+static inline void ena_phc_destroy(struct ena_adapter *adapter) { }
+
+#endif /* ENA_PHC_SUPPORT */
+
+#endif /* ENA_PHC_H */
diff --git a/drivers/amazon/net/ena/ena_regs_defs.h b/drivers/amazon/net/ena/ena_regs_defs.h
new file mode 100755
index 0000000000000..ded18aa5162bc
--- /dev/null
+++ b/drivers/amazon/net/ena/ena_regs_defs.h
@@ -0,0 +1,140 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */ +#ifndef _ENA_REGS_H_ +#define _ENA_REGS_H_ + +enum ena_regs_reset_reason_types { + ENA_REGS_RESET_NORMAL = 0, + ENA_REGS_RESET_KEEP_ALIVE_TO = 1, + ENA_REGS_RESET_ADMIN_TO = 2, + ENA_REGS_RESET_MISS_TX_CMPL = 3, + ENA_REGS_RESET_INV_RX_REQ_ID = 4, + ENA_REGS_RESET_INV_TX_REQ_ID = 5, + ENA_REGS_RESET_TOO_MANY_RX_DESCS = 6, + ENA_REGS_RESET_INIT_ERR = 7, + ENA_REGS_RESET_DRIVER_INVALID_STATE = 8, + ENA_REGS_RESET_OS_TRIGGER = 9, + ENA_REGS_RESET_OS_NETDEV_WD = 10, + ENA_REGS_RESET_SHUTDOWN = 11, + ENA_REGS_RESET_USER_TRIGGER = 12, + ENA_REGS_RESET_GENERIC = 13, + ENA_REGS_RESET_MISS_INTERRUPT = 14, + ENA_REGS_RESET_SUSPECTED_POLL_STARVATION = 15, + ENA_REGS_RESET_LAST, +}; + +/* ena_registers offsets */ + +/* 0 base */ +#define ENA_REGS_VERSION_OFF 0x0 +#define ENA_REGS_CONTROLLER_VERSION_OFF 0x4 +#define ENA_REGS_CAPS_OFF 0x8 +#define ENA_REGS_CAPS_EXT_OFF 0xc +#define ENA_REGS_AQ_BASE_LO_OFF 0x10 +#define ENA_REGS_AQ_BASE_HI_OFF 0x14 +#define ENA_REGS_AQ_CAPS_OFF 0x18 +#define ENA_REGS_ACQ_BASE_LO_OFF 0x20 +#define ENA_REGS_ACQ_BASE_HI_OFF 0x24 +#define ENA_REGS_ACQ_CAPS_OFF 0x28 +#define ENA_REGS_AQ_DB_OFF 0x2c +#define ENA_REGS_ACQ_TAIL_OFF 0x30 +#define ENA_REGS_AENQ_CAPS_OFF 0x34 +#define ENA_REGS_AENQ_BASE_LO_OFF 0x38 +#define ENA_REGS_AENQ_BASE_HI_OFF 0x3c +#define ENA_REGS_AENQ_HEAD_DB_OFF 0x40 +#define ENA_REGS_AENQ_TAIL_OFF 0x44 +#define ENA_REGS_INTR_MASK_OFF 0x4c +#define ENA_REGS_DEV_CTL_OFF 0x54 +#define ENA_REGS_DEV_STS_OFF 0x58 +#define ENA_REGS_MMIO_REG_READ_OFF 0x5c +#define ENA_REGS_MMIO_RESP_LO_OFF 0x60 +#define ENA_REGS_MMIO_RESP_HI_OFF 0x64 +#define ENA_REGS_RSS_IND_ENTRY_UPDATE_OFF 0x68 + +/* phc_registers offsets */ + +/* 100 base */ +#define ENA_REGS_PHC_DB_OFF 0x100 + +/* version register */ +#define ENA_REGS_VERSION_MINOR_VERSION_MASK 0xff +#define ENA_REGS_VERSION_MAJOR_VERSION_SHIFT 8 +#define ENA_REGS_VERSION_MAJOR_VERSION_MASK 0xff00 + +/* controller_version register */ +#define ENA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION_MASK 0xff +#define ENA_REGS_CONTROLLER_VERSION_MINOR_VERSION_SHIFT 8 +#define ENA_REGS_CONTROLLER_VERSION_MINOR_VERSION_MASK 0xff00 +#define ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_SHIFT 16 +#define ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK 0xff0000 +#define ENA_REGS_CONTROLLER_VERSION_IMPL_ID_SHIFT 24 +#define ENA_REGS_CONTROLLER_VERSION_IMPL_ID_MASK 0xff000000 + +/* caps register */ +#define ENA_REGS_CAPS_CONTIGUOUS_QUEUE_REQUIRED_MASK 0x1 +#define ENA_REGS_CAPS_RESET_TIMEOUT_SHIFT 1 +#define ENA_REGS_CAPS_RESET_TIMEOUT_MASK 0x3e +#define ENA_REGS_CAPS_DMA_ADDR_WIDTH_SHIFT 8 +#define ENA_REGS_CAPS_DMA_ADDR_WIDTH_MASK 0xff00 +#define ENA_REGS_CAPS_ADMIN_CMD_TO_SHIFT 16 +#define ENA_REGS_CAPS_ADMIN_CMD_TO_MASK 0xf0000 + +/* aq_caps register */ +#define ENA_REGS_AQ_CAPS_AQ_DEPTH_MASK 0xffff +#define ENA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_SHIFT 16 +#define ENA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_MASK 0xffff0000 + +/* acq_caps register */ +#define ENA_REGS_ACQ_CAPS_ACQ_DEPTH_MASK 0xffff +#define ENA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_SHIFT 16 +#define ENA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_MASK 0xffff0000 + +/* aenq_caps register */ +#define ENA_REGS_AENQ_CAPS_AENQ_DEPTH_MASK 0xffff +#define ENA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_SHIFT 16 +#define ENA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_MASK 0xffff0000 + +/* dev_ctl register */ +#define ENA_REGS_DEV_CTL_DEV_RESET_MASK 0x1 +#define ENA_REGS_DEV_CTL_AQ_RESTART_SHIFT 1 +#define ENA_REGS_DEV_CTL_AQ_RESTART_MASK 0x2 +#define ENA_REGS_DEV_CTL_QUIESCENT_SHIFT 2 +#define ENA_REGS_DEV_CTL_QUIESCENT_MASK 0x4 +#define 
ENA_REGS_DEV_CTL_IO_RESUME_SHIFT 3 +#define ENA_REGS_DEV_CTL_IO_RESUME_MASK 0x8 +#define ENA_REGS_DEV_CTL_RESET_REASON_SHIFT 28 +#define ENA_REGS_DEV_CTL_RESET_REASON_MASK 0xf0000000 + +/* dev_sts register */ +#define ENA_REGS_DEV_STS_READY_MASK 0x1 +#define ENA_REGS_DEV_STS_AQ_RESTART_IN_PROGRESS_SHIFT 1 +#define ENA_REGS_DEV_STS_AQ_RESTART_IN_PROGRESS_MASK 0x2 +#define ENA_REGS_DEV_STS_AQ_RESTART_FINISHED_SHIFT 2 +#define ENA_REGS_DEV_STS_AQ_RESTART_FINISHED_MASK 0x4 +#define ENA_REGS_DEV_STS_RESET_IN_PROGRESS_SHIFT 3 +#define ENA_REGS_DEV_STS_RESET_IN_PROGRESS_MASK 0x8 +#define ENA_REGS_DEV_STS_RESET_FINISHED_SHIFT 4 +#define ENA_REGS_DEV_STS_RESET_FINISHED_MASK 0x10 +#define ENA_REGS_DEV_STS_FATAL_ERROR_SHIFT 5 +#define ENA_REGS_DEV_STS_FATAL_ERROR_MASK 0x20 +#define ENA_REGS_DEV_STS_QUIESCENT_STATE_IN_PROGRESS_SHIFT 6 +#define ENA_REGS_DEV_STS_QUIESCENT_STATE_IN_PROGRESS_MASK 0x40 +#define ENA_REGS_DEV_STS_QUIESCENT_STATE_ACHIEVED_SHIFT 7 +#define ENA_REGS_DEV_STS_QUIESCENT_STATE_ACHIEVED_MASK 0x80 + +/* mmio_reg_read register */ +#define ENA_REGS_MMIO_REG_READ_REQ_ID_MASK 0xffff +#define ENA_REGS_MMIO_REG_READ_REG_OFF_SHIFT 16 +#define ENA_REGS_MMIO_REG_READ_REG_OFF_MASK 0xffff0000 + +/* rss_ind_entry_update register */ +#define ENA_REGS_RSS_IND_ENTRY_UPDATE_INDEX_MASK 0xffff +#define ENA_REGS_RSS_IND_ENTRY_UPDATE_CQ_IDX_SHIFT 16 +#define ENA_REGS_RSS_IND_ENTRY_UPDATE_CQ_IDX_MASK 0xffff0000 + +/* phc_db_req_id register */ +#define ENA_REGS_PHC_DB_REQ_ID_MASK 0xffff + +#endif /* _ENA_REGS_H_ */ diff --git a/drivers/amazon/net/ena/ena_sysfs.c b/drivers/amazon/net/ena/ena_sysfs.c new file mode 100755 index 0000000000000..98e1f7ecd0f09 --- /dev/null +++ b/drivers/amazon/net/ena/ena_sysfs.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#include +#include +#include +#include + +#include "ena_com.h" +#include "ena_netdev.h" +#include "ena_sysfs.h" + + +static ssize_t ena_store_rx_copybreak(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct ena_adapter *adapter = dev_get_drvdata(dev); + unsigned long rx_copybreak; + int rc; + + rc = kstrtoul(buf, 10, &rx_copybreak); + if (rc < 0) + goto exit; + + rtnl_lock(); + rc = ena_set_rx_copybreak(adapter, rx_copybreak); + if (rc) + goto unlock; + rtnl_unlock(); + + return len; +unlock: + rtnl_unlock(); +exit: + return rc; +} + +#define ENA_RX_COPYBREAK_STR_MAX_LEN 7 + +static ssize_t ena_show_rx_copybreak(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct ena_adapter *adapter = dev_get_drvdata(dev); + + return snprintf(buf, ENA_RX_COPYBREAK_STR_MAX_LEN, "%d\n", + adapter->rx_copybreak); +} + +static DEVICE_ATTR(rx_copybreak, S_IRUGO | S_IWUSR, ena_show_rx_copybreak, + ena_store_rx_copybreak); + +/****************************************************************************** + *****************************************************************************/ +int ena_sysfs_init(struct device *dev) +{ + + if (device_create_file(dev, &dev_attr_rx_copybreak)) + dev_err(dev, "Failed to create rx_copybreak sysfs entry"); + return 0; +} + +/****************************************************************************** + *****************************************************************************/ +void ena_sysfs_terminate(struct device *dev) +{ + device_remove_file(dev, &dev_attr_rx_copybreak); +} diff --git a/drivers/amazon/net/ena/ena_sysfs.h b/drivers/amazon/net/ena/ena_sysfs.h new file mode 100755 index 0000000000000..8c572eee268f3 --- /dev/null +++ b/drivers/amazon/net/ena/ena_sysfs.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef __ENA_SYSFS_H__ +#define __ENA_SYSFS_H__ + +#ifdef CONFIG_SYSFS + +int ena_sysfs_init(struct device *dev); + +void ena_sysfs_terminate(struct device *dev); + +#else /* CONFIG_SYSFS */ + +static inline int ena_sysfs_init(struct device *dev) +{ + return 0; +} + +static inline void ena_sysfs_terminate(struct device *dev) +{ +} + +#endif /* CONFIG_SYSFS */ + +#endif /* __ENA_SYSFS_H__ */ diff --git a/drivers/amazon/net/ena/ena_xdp.c b/drivers/amazon/net/ena/ena_xdp.c new file mode 100644 index 0000000000000..4d8c1709598de --- /dev/null +++ b/drivers/amazon/net/ena/ena_xdp.c @@ -0,0 +1,977 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2015-2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#include "ena_xdp.h" +#ifdef ENA_XDP_SUPPORT + +static int validate_xdp_req_id(struct ena_ring *tx_ring, u16 req_id) +{ + struct ena_tx_buffer *tx_info; + + tx_info = &tx_ring->tx_buffer_info[req_id]; + if (likely(tx_info->total_tx_size)) + return 0; + + return handle_invalid_req_id(tx_ring, req_id, tx_info, true); +} + +static int ena_xdp_tx_map_frame(struct ena_ring *tx_ring, + struct ena_tx_buffer *tx_info, + struct xdp_frame *xdpf, + struct ena_com_tx_ctx *ena_tx_ctx) +{ + struct ena_adapter *adapter = tx_ring->adapter; + struct ena_com_buf *ena_buf; + int push_len = 0; + dma_addr_t dma; + void *data; + u32 size; + + tx_info->xdpf = xdpf; + data = tx_info->xdpf->data; + size = tx_info->xdpf->len; + + if (tx_ring->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { + /* Designate part of the packet for LLQ */ + push_len = min_t(u32, size, tx_ring->tx_max_header_size); + + ena_tx_ctx->push_header = data; + + size -= push_len; + data += push_len; + } + + ena_tx_ctx->header_len = push_len; + + if (size > 0) { + dma = dma_map_single(tx_ring->dev, + data, + size, + DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(tx_ring->dev, dma))) + goto error_report_dma_error; + + tx_info->map_linear_data = 0; + + ena_buf = tx_info->bufs; + ena_buf->paddr = dma; + ena_buf->len = size; + + ena_tx_ctx->ena_bufs = ena_buf; + ena_tx_ctx->num_bufs = tx_info->num_of_bufs = 1; + } + + return 0; + +error_report_dma_error: + ena_increase_stat(&tx_ring->tx_stats.dma_mapping_err, 1, + &tx_ring->syncp); + netif_warn(adapter, tx_queued, adapter->netdev, "Failed to map xdp buff\n"); + + return -EINVAL; +} + +int ena_xdp_xmit_frame(struct ena_ring *tx_ring, + struct ena_adapter *adapter, + struct xdp_frame *xdpf, + int flags) +{ + struct ena_com_tx_ctx ena_tx_ctx = {}; + struct ena_tx_buffer *tx_info; + u16 next_to_use, req_id; + int rc; + + next_to_use = tx_ring->next_to_use; + req_id = tx_ring->free_ids[next_to_use]; + tx_info = &tx_ring->tx_buffer_info[req_id]; + tx_info->num_of_bufs = 0; + + rc = ena_xdp_tx_map_frame(tx_ring, tx_info, xdpf, &ena_tx_ctx); + if (unlikely(rc)) + return rc; + + ena_tx_ctx.req_id = req_id; + + rc = ena_xmit_common(adapter, + tx_ring, + tx_info, + &ena_tx_ctx, + next_to_use, + xdpf->len); + if (rc) + goto error_unmap_dma; + + /* trigger the dma engine. ena_ring_tx_doorbell() + * calls a memory barrier inside it. 
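+	 * Callers that batch multiple frames (e.g. ena_xdp_xmit()) pass
+	 * flags without XDP_XMIT_FLUSH here and ring the doorbell once for
+	 * the whole batch.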
+	 */
+	if (flags & XDP_XMIT_FLUSH)
+		ena_ring_tx_doorbell(tx_ring);
+
+	return rc;
+
+error_unmap_dma:
+	ena_unmap_tx_buff(tx_ring, tx_info);
+	tx_info->xdpf = NULL;
+	return rc;
+}
+
+int ena_xdp_xmit(struct net_device *dev, int n,
+		 struct xdp_frame **frames, u32 flags)
+{
+	struct ena_adapter *adapter = netdev_priv(dev);
+	struct ena_ring *tx_ring;
+	int qid, i, nxmit = 0;
+
+	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
+		return -EINVAL;
+
+	if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags))
+		return -ENETDOWN;
+
+	/* We assume that all rings have the same XDP program */
+	if (!READ_ONCE(adapter->rx_ring->xdp_bpf_prog))
+		return -ENXIO;
+
+	qid = smp_processor_id() % adapter->xdp_num_queues;
+	qid += adapter->xdp_first_ring;
+	tx_ring = &adapter->tx_ring[qid];
+
+	/* Other CPU ids might try to send through this queue */
+	spin_lock(&tx_ring->xdp_tx_lock);
+
+	for (i = 0; i < n; i++) {
+		if (ena_xdp_xmit_frame(tx_ring, adapter, frames[i], 0))
+			break;
+		nxmit++;
+	}
+
+	/* Ring doorbell to make device aware of the packets */
+	if (flags & XDP_XMIT_FLUSH)
+		ena_ring_tx_doorbell(tx_ring);
+
+	spin_unlock(&tx_ring->xdp_tx_lock);
+
+#ifndef ENA_XDP_XMIT_FREES_FAILED_DESCS_INTERNALLY
+	for (i = nxmit; unlikely(i < n); i++)
+		xdp_return_frame(frames[i]);
+
+#endif
+	/* Return number of packets sent */
+	return nxmit;
+}
+
+static void ena_init_all_xdp_queues(struct ena_adapter *adapter)
+{
+	adapter->xdp_first_ring = adapter->num_io_queues;
+	adapter->xdp_num_queues = adapter->num_io_queues;
+
+	ena_init_io_rings(adapter,
+			  adapter->xdp_first_ring,
+			  adapter->xdp_num_queues);
+}
+
+int ena_setup_and_create_all_xdp_queues(struct ena_adapter *adapter)
+{
+	int rc = 0;
+
+	rc = ena_setup_tx_resources_in_range(adapter, adapter->xdp_first_ring,
+					     adapter->xdp_num_queues);
+	if (rc)
+		goto setup_err;
+
+	rc = ena_create_io_tx_queues_in_range(adapter,
+					      adapter->xdp_first_ring,
+					      adapter->xdp_num_queues);
+	if (rc)
+		goto create_err;
+
+	return 0;
+
+create_err:
+	ena_free_all_io_tx_resources_in_range(adapter, adapter->xdp_first_ring,
+					      adapter->xdp_num_queues);
+setup_err:
+	return rc;
+}
+
+/* Provides a way for both kernel and bpf-prog to know
+ * more about the RX-queue a given XDP frame arrived on.
+ */
+int ena_xdp_register_rxq_info(struct ena_ring *rx_ring)
+{
+	int rc;
+
+#ifdef AF_XDP_BUSY_POLL_SUPPORTED
+	rc = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, rx_ring->qid,
+			      rx_ring->napi->napi_id);
+#else
+	rc = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, rx_ring->qid);
+#endif
+
+	netif_dbg(rx_ring->adapter, ifup, rx_ring->netdev, "Registering RX info for queue %d",
+		  rx_ring->qid);
+	if (rc) {
+		netif_err(rx_ring->adapter, ifup, rx_ring->netdev,
+			  "Failed to register xdp rx queue info. RX queue num %d rc: %d\n",
+			  rx_ring->qid, rc);
+		goto err;
+	}
+
+	if (ENA_IS_XSK_RING(rx_ring)) {
+		rc = xdp_rxq_info_reg_mem_model(&rx_ring->xdp_rxq, MEM_TYPE_XSK_BUFF_POOL, NULL);
+		xsk_pool_set_rxq_info(rx_ring->xsk_pool, &rx_ring->xdp_rxq);
+	} else {
+		rc = xdp_rxq_info_reg_mem_model(&rx_ring->xdp_rxq, MEM_TYPE_PAGE_SHARED,
+						NULL);
+	}
+
+	if (rc) {
+		netif_err(rx_ring->adapter, ifup, rx_ring->netdev,
+			  "Failed to register xdp rx queue info memory model.
RX queue num %d rc: %d\n", + rx_ring->qid, rc); + xdp_rxq_info_unreg(&rx_ring->xdp_rxq); + } + +err: + return rc; +} + +#ifdef ENA_AF_XDP_SUPPORT +void ena_xdp_free_tx_bufs_zc(struct ena_ring *tx_ring) +{ + struct xsk_buff_pool *xsk_pool = tx_ring->xsk_pool; + int i, xsk_frames = 0; + + for (i = 0; i < tx_ring->ring_size; i++) { + struct ena_tx_buffer *tx_info = &tx_ring->tx_buffer_info[i]; + + if (tx_info->last_jiffies) + xsk_frames++; + + tx_info->last_jiffies = 0; + } + + if (xsk_frames) + xsk_tx_completed(xsk_pool, xsk_frames); +} + +void ena_xdp_free_rx_bufs_zc(struct ena_adapter *adapter, u32 qid) +{ + struct ena_ring *rx_ring = &adapter->rx_ring[qid]; + int i = 0; + + for (i = 0; i < rx_ring->ring_size; i++) { + struct ena_rx_buffer *rx_info = &rx_ring->rx_buffer_info[i]; + + if (rx_info->xdp) + xsk_buff_free(rx_info->xdp); + + rx_info->xdp = NULL; + } +} + +#endif /* ENA_AF_XDP_SUPPORT */ +void ena_xdp_unregister_rxq_info(struct ena_ring *rx_ring) +{ + netif_dbg(rx_ring->adapter, ifdown, rx_ring->netdev, + "Unregistering RX info for queue %d", + rx_ring->qid); + xdp_rxq_info_unreg_mem_model(&rx_ring->xdp_rxq); + xdp_rxq_info_unreg(&rx_ring->xdp_rxq); +} + +void ena_xdp_exchange_program_rx_in_range(struct ena_adapter *adapter, + struct bpf_prog *prog, + int first, int count) +{ + struct bpf_prog *old_bpf_prog; + struct ena_ring *rx_ring; + int i = 0; + + for (i = first; i < count; i++) { + rx_ring = &adapter->rx_ring[i]; + old_bpf_prog = xchg(&rx_ring->xdp_bpf_prog, prog); + + if (!old_bpf_prog && prog) { + rx_ring->rx_headroom = XDP_PACKET_HEADROOM; + } else if (old_bpf_prog && !prog) { + rx_ring->rx_headroom = NET_SKB_PAD; + } + } +} + +static void ena_xdp_exchange_program(struct ena_adapter *adapter, + struct bpf_prog *prog) +{ + struct bpf_prog *old_bpf_prog = xchg(&adapter->xdp_bpf_prog, prog); + + ena_xdp_exchange_program_rx_in_range(adapter, + prog, + 0, + adapter->num_io_queues); + + if (old_bpf_prog) + bpf_prog_put(old_bpf_prog); +} + +static int ena_destroy_and_free_all_xdp_queues(struct ena_adapter *adapter) +{ + bool was_up; + int rc; + + was_up = test_bit(ENA_FLAG_DEV_UP, &adapter->flags); + + if (was_up) + ena_down(adapter); + + adapter->xdp_first_ring = 0; + adapter->xdp_num_queues = 0; + ena_xdp_exchange_program(adapter, NULL); + if (was_up) { + rc = ena_up(adapter); + if (rc) + return rc; + } + return 0; +} + +static int ena_xdp_set(struct net_device *netdev, struct netdev_bpf *bpf) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + struct bpf_prog *prog = bpf->prog; + struct bpf_prog *old_bpf_prog; + int rc, prev_mtu; + bool is_up; + + is_up = test_bit(ENA_FLAG_DEV_UP, &adapter->flags); + rc = ena_xdp_allowed(adapter); + if (rc == ENA_XDP_ALLOWED) { + old_bpf_prog = adapter->xdp_bpf_prog; + if (prog) { + if (!is_up) { + ena_init_all_xdp_queues(adapter); + } else if (!old_bpf_prog) { + ena_down(adapter); + ena_init_all_xdp_queues(adapter); + } + ena_xdp_exchange_program(adapter, prog); + + netif_dbg(adapter, drv, adapter->netdev, "Set a new XDP program\n"); + + if (is_up && !old_bpf_prog) { + rc = ena_up(adapter); + if (rc) + return rc; + } + } else if (old_bpf_prog) { + netif_dbg(adapter, drv, adapter->netdev, + "Removing XDP program\n"); + + rc = ena_destroy_and_free_all_xdp_queues(adapter); + if (rc) + return rc; + } + + prev_mtu = netdev->max_mtu; + netdev->max_mtu = prog ? 
ENA_XDP_MAX_MTU : adapter->max_mtu;
+
+ if (!old_bpf_prog)
+ netif_info(adapter, drv, adapter->netdev,
+ "XDP program is set, changing the max_mtu from %d to %d",
+ prev_mtu, netdev->max_mtu);
+
+ } else if (rc == ENA_XDP_CURRENT_MTU_TOO_LARGE) {
+ netif_err(adapter, drv, adapter->netdev,
+ "Failed to set xdp program, the current MTU (%d) is larger than the maximum allowed MTU (%lu) while xdp is on",
+ netdev->mtu, ENA_XDP_MAX_MTU);
+ NL_SET_ERR_MSG_MOD(bpf->extack,
+ "Failed to set xdp program, the current MTU is larger than the maximum allowed MTU. Check the dmesg for more info");
+ return -EINVAL;
+ } else if (rc == ENA_XDP_NO_ENOUGH_QUEUES) {
+ netif_err(adapter, drv, adapter->netdev,
+ "Failed to set xdp program, the Rx/Tx channel count should be at most half of the maximum allowed channel count. The current queue count (%d), the maximal queue count (%d)\n",
+ adapter->num_io_queues, adapter->max_num_io_queues);
+ NL_SET_ERR_MSG_MOD(bpf->extack,
+ "Failed to set xdp program, there is not enough space for allocating XDP queues. Check the dmesg for more info");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+#ifdef ENA_AF_XDP_SUPPORT
+static bool ena_is_xsk_pool_params_allowed(struct xsk_buff_pool *pool)
+{
+ return xsk_pool_get_headroom(pool) == 0 &&
+ xsk_pool_get_chunk_size(pool) == ENA_PAGE_SIZE;
+}
+
+static int ena_xsk_pool_enable(struct ena_adapter *adapter,
+ struct xsk_buff_pool *pool,
+ u16 qid)
+{
+ struct ena_ring *rx_ring, *tx_ring;
+ bool dev_was_up = false;
+ int err;
+
+ if (!ena_xdp_legal_queue_count(adapter, qid)) {
+ netdev_err(adapter->netdev,
+ "Max qid for XSK pool is %d (received %d)\n",
+ adapter->max_num_io_queues >> 1, qid);
+ return -EINVAL;
+ }
+
+ if (!ena_is_xsk_pool_params_allowed(pool))
+ return -EINVAL;
+
+ rx_ring = &adapter->rx_ring[qid];
+ tx_ring = &adapter->tx_ring[qid];
+
+ err = xsk_pool_dma_map(pool, adapter->ena_dev->dmadev, 0);
+ if (err) {
+ ena_increase_stat(&rx_ring->rx_stats.dma_mapping_err, 1,
+ &rx_ring->syncp);
+ netif_err(adapter, drv, adapter->netdev,
+ "Failed to DMA map XSK pool for qid %d\n", qid);
+ return err;
+ }
+
+ if (test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) {
+ dev_was_up = true;
+ ena_down(adapter);
+ }
+
+ rx_ring->xsk_pool = tx_ring->xsk_pool = pool;
+
+ netif_dbg(adapter, drv, adapter->netdev,
+ "Setting XSK pool for queue %d\n", qid);
+
+ return dev_was_up ? ena_up(adapter) : 0;
+}
+
+static int ena_xsk_pool_disable(struct ena_adapter *adapter,
+ u16 qid)
+{
+ struct ena_ring *rx_ring, *tx_ring;
+ bool dev_was_up = false;
+
+ if (qid >= adapter->num_io_queues)
+ return -EINVAL;
+
+ rx_ring = &adapter->rx_ring[qid];
+ tx_ring = &adapter->tx_ring[qid];
+
+ /* XSK pool isn't attached to this ring */
+ if (!rx_ring->xsk_pool)
+ return 0;
+
+ if (test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) {
+ dev_was_up = true;
+ ena_down(adapter);
+ }
+
+ xsk_pool_dma_unmap(rx_ring->xsk_pool, 0);
+
+ rx_ring->xsk_pool = tx_ring->xsk_pool = NULL;
+
+ netif_dbg(adapter, drv, adapter->netdev,
+ "Removing XSK pool for queue %d\n", qid);
+
+ return dev_was_up ? ena_up(adapter) : 0;
+}
+
+static int ena_xsk_pool_setup(struct ena_adapter *adapter,
+ struct xsk_buff_pool *pool,
+ u16 qid)
+{
+ return pool ? ena_xsk_pool_enable(adapter, pool, qid) :
+ ena_xsk_pool_disable(adapter, qid);
+}
+
+#endif /* ENA_AF_XDP_SUPPORT */
+/* This is the main xdp callback, it's used by the kernel to set/unset the xdp
+ * program as well as to query the current xdp program id. 
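+ *
+ * It is reached through the driver's net_device_ops table. A minimal
+ * sketch of the wiring (the ops-table name here is an assumption for
+ * illustration; the real table lives elsewhere in the driver):
+ *
+ *     static const struct net_device_ops ena_netdev_ops = {
+ *             ...
+ *             .ndo_bpf        = ena_xdp,
+ *             .ndo_xdp_xmit   = ena_xdp_xmit,
+ *     };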
+ */ +int ena_xdp(struct net_device *netdev, struct netdev_bpf *bpf) +{ +#if !defined(ENA_XDP_QUERY_IN_KERNEL) || defined(ENA_AF_XDP_SUPPORT) + struct ena_adapter *adapter = netdev_priv(netdev); + +#endif /* ENA_XDP_QUERY_IN_KERNEL || ENA_AF_XDP_SUPPORT */ + switch (bpf->command) { + case XDP_SETUP_PROG: + return ena_xdp_set(netdev, bpf); +#ifdef ENA_AF_XDP_SUPPORT + case XDP_SETUP_XSK_POOL: + return ena_xsk_pool_setup(adapter, bpf->xsk.pool, bpf->xsk.queue_id); +#endif /* ENA_AF_XDP_SUPPORT */ +#ifndef ENA_XDP_QUERY_IN_KERNEL + case XDP_QUERY_PROG: + bpf->prog_id = adapter->xdp_bpf_prog ? + adapter->xdp_bpf_prog->aux->id : 0; + break; +#endif + default: + return -EINVAL; + } + return 0; +} + +#ifdef ENA_AF_XDP_SUPPORT +int ena_xdp_xsk_wakeup(struct net_device *netdev, u32 qid, u32 flags) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + struct ena_ring *tx_ring; + struct napi_struct *napi; + + if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) + return -ENETDOWN; + + if (qid >= adapter->num_io_queues) + return -EINVAL; + + if (!adapter->xdp_bpf_prog) + return -ENXIO; + + tx_ring = &adapter->tx_ring[qid]; + + if (!ENA_IS_XSK_RING(tx_ring)) + return -ENXIO; + + ena_increase_stat(&tx_ring->tx_stats.xsk_wakeup_request, 1, + &tx_ring->syncp); + + napi = tx_ring->napi; + + napi_schedule(napi); + + return 0; +} + +#endif /* ENA_AF_XDP_SUPPORT */ +static bool ena_clean_xdp_irq(struct ena_ring *tx_ring, u32 budget) +{ + + bool is_zc_q = ENA_IS_XSK_RING(tx_ring); + u32 total_done = 0; + u16 next_to_clean; + bool needs_wakeup; + u32 tx_bytes = 0; + int tx_pkts = 0; + u16 req_id; + int rc; + + if (unlikely(!tx_ring)) + return 0; + next_to_clean = tx_ring->next_to_clean; + + while (tx_pkts < budget) { + struct ena_tx_buffer *tx_info; + struct xdp_frame *xdpf; + + rc = ena_com_tx_comp_req_id_get(tx_ring->ena_com_io_cq, + &req_id); + if (rc) { + if (unlikely(rc == -EINVAL)) + handle_invalid_req_id(tx_ring, req_id, NULL, + true); + break; + } + + /* validate that the request id points to a valid xdp_frame */ + rc = validate_xdp_req_id(tx_ring, req_id); + if (rc) + break; + + tx_info = &tx_ring->tx_buffer_info[req_id]; + + tx_info->last_jiffies = 0; + + if (!is_zc_q) { + xdpf = tx_info->xdpf; + tx_info->xdpf = NULL; + ena_unmap_tx_buff(tx_ring, tx_info); + xdp_return_frame(xdpf); + } + + netif_dbg(tx_ring->adapter, tx_done, tx_ring->netdev, + "tx_poll: q %d pkt #%d req_id %d\n", tx_ring->qid, tx_pkts, req_id); + + tx_bytes += tx_info->total_tx_size; + tx_pkts++; + total_done += tx_info->tx_descs; + + tx_info->total_tx_size = 0; + + tx_ring->free_ids[next_to_clean] = req_id; + next_to_clean = ENA_TX_RING_IDX_NEXT(next_to_clean, + tx_ring->ring_size); + } + + tx_ring->next_to_clean = next_to_clean; + ena_com_comp_ack(tx_ring->ena_com_io_sq, total_done); + + netif_dbg(tx_ring->adapter, tx_done, tx_ring->netdev, + "tx_poll: q %d done. 
total pkts: %d\n", + tx_ring->qid, tx_pkts); + + needs_wakeup = tx_pkts < budget; +#ifdef ENA_AF_XDP_SUPPORT + if (is_zc_q) { + struct xsk_buff_pool *xsk_pool = tx_ring->xsk_pool; + + if (tx_pkts) + xsk_tx_completed(xsk_pool, tx_pkts); + + if (xsk_uses_need_wakeup(xsk_pool)) { + if (needs_wakeup) + xsk_set_tx_need_wakeup(xsk_pool); + else + xsk_clear_tx_need_wakeup(xsk_pool); + } + } +#endif /* ENA_AF_XDP_SUPPORT */ + + return needs_wakeup; +} + +#ifdef ENA_AF_XDP_SUPPORT +static bool ena_xdp_xmit_irq_zc(struct ena_ring *tx_ring, + struct napi_struct *napi, + int budget) +{ + struct xsk_buff_pool *xsk_pool = tx_ring->xsk_pool; + int size, rc, push_len = 0, work_done = 0; + struct ena_tx_buffer *tx_info; + struct ena_com_buf *ena_buf; + u16 next_to_use, req_id; + bool need_wakeup = true; + struct xdp_desc desc; + dma_addr_t dma; + + while (likely(work_done < budget)) { + struct ena_com_tx_ctx ena_tx_ctx = {}; + + /* We assume the maximum number of descriptors, which is two + * (meta data included) + */ + if (unlikely(!ena_com_sq_have_enough_space(tx_ring->ena_com_io_sq, 2))) + break; + + if (!xsk_tx_peek_desc(xsk_pool, &desc)) + break; + + next_to_use = tx_ring->next_to_use; + req_id = tx_ring->free_ids[next_to_use]; + tx_info = &tx_ring->tx_buffer_info[req_id]; + + size = desc.len; + + if (tx_ring->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { + /* Designate part of the packet for LLQ */ + push_len = min_t(u32, size, tx_ring->tx_max_header_size); + ena_tx_ctx.push_header = xsk_buff_raw_get_data(xsk_pool, desc.addr); + ena_tx_ctx.header_len = push_len; + + size -= push_len; + if (!size) + goto xmit_desc; + } + + /* Pass the rest of the descriptor as a DMA address. Assuming + * single page descriptor. + */ + dma = xsk_buff_raw_get_dma(xsk_pool, desc.addr); + ena_buf = tx_info->bufs; + ena_buf->paddr = dma + push_len; + ena_buf->len = size; + + ena_tx_ctx.ena_bufs = ena_buf; + ena_tx_ctx.num_bufs = 1; + +xmit_desc: + ena_tx_ctx.req_id = req_id; + + netif_dbg(tx_ring->adapter, tx_queued, tx_ring->netdev, + "Queueing zc packet on q %d, %s DMA part (req-id %d)\n", + tx_ring->qid, ena_tx_ctx.num_bufs ? 
"with" : "without", req_id); + + rc = ena_xmit_common(tx_ring->adapter, + tx_ring, + tx_info, + &ena_tx_ctx, + next_to_use, + desc.len); + if (rc) + break; + + work_done++; + } + + if (work_done) { + xsk_tx_release(xsk_pool); + ena_ring_tx_doorbell(tx_ring); + } + + if (work_done == budget) { + need_wakeup = false; + if (xsk_uses_need_wakeup(xsk_pool)) + xsk_clear_tx_need_wakeup(xsk_pool); + } + + return need_wakeup; +} + +static struct sk_buff *ena_xdp_rx_skb_zc(struct ena_ring *rx_ring, struct xdp_buff *xdp) +{ + u32 headroom, data_len; + struct sk_buff *skb; + void *data_addr; + + /* Assuming single-page packets for XDP */ + headroom = xdp->data - xdp->data_hard_start; + data_len = xdp->data_end - xdp->data; + data_addr = xdp->data; + + /* allocate a skb to store the frags */ + skb = __napi_alloc_skb(rx_ring->napi, + headroom + data_len, + GFP_ATOMIC | __GFP_NOWARN); + if (unlikely(!skb)) { + ena_increase_stat(&rx_ring->rx_stats.skb_alloc_fail, 1, + &rx_ring->syncp); + netif_err(rx_ring->adapter, rx_err, rx_ring->netdev, + "Failed to allocate skb in zc queue %d\n", rx_ring->qid); + return NULL; + } + + skb_reserve(skb, headroom); + memcpy(__skb_put(skb, data_len), data_addr, data_len); + + skb->protocol = eth_type_trans(skb, rx_ring->netdev); + + return skb; +} + +static int ena_xdp_clean_rx_irq_zc(struct ena_ring *rx_ring, + struct napi_struct *napi, + int budget) +{ + int i, refill_required, work_done, refill_threshold, pkt_copy; + u16 next_to_clean = rx_ring->next_to_clean; + int xdp_verdict, req_id, rc, total_len; + struct ena_com_rx_ctx ena_rx_ctx; + struct ena_rx_buffer *rx_info; + bool xdp_prog_present; + struct xdp_buff *xdp; + struct sk_buff *skb; + u32 xdp_flags = 0; + + netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, + "%s qid %d\n", __func__, rx_ring->qid); + + ena_rx_ctx.ena_bufs = rx_ring->ena_bufs; + ena_rx_ctx.max_bufs = rx_ring->sgl_size; + + xdp_prog_present = ena_xdp_present_ring(rx_ring); + + work_done = 0; + total_len = 0; + pkt_copy = 0; + + do { + xdp_verdict = ENA_XDP_PASS; + + /* Poll a packet from HW */ + rc = ena_com_rx_pkt(rx_ring->ena_com_io_cq, + rx_ring->ena_com_io_sq, + &ena_rx_ctx); + if (unlikely(rc)) + break; + + /* Polled all RX packets */ + if (unlikely(ena_rx_ctx.descs == 0)) + break; + + netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, + "rx_poll: q %d got packet from ena. descs #: %d l3 proto %d l4 proto %d hash: %x\n", + rx_ring->qid, ena_rx_ctx.descs, ena_rx_ctx.l3_proto, + ena_rx_ctx.l4_proto, ena_rx_ctx.hash); + + /* First descriptor might have an offset set by the device */ + rx_info = &rx_ring->rx_buffer_info[ena_rx_ctx.ena_bufs[0].req_id]; + xdp = rx_info->xdp; + xdp->data += ena_rx_ctx.pkt_offset; + xdp->data_end = xdp->data + ena_rx_ctx.ena_bufs[0].len; + xsk_buff_dma_sync_for_cpu(xdp, rx_ring->xsk_pool); + + /* XDP multi-buffer packets not supported */ + if (unlikely(ena_rx_ctx.descs > 1)) { + netdev_err_once(rx_ring->adapter->netdev, + "xdp: dropped multi-buffer packets. 
RX packets must be < %lu\n", + ENA_XDP_MAX_MTU); + ena_increase_stat(&rx_ring->rx_stats.xdp_drop, 1, &rx_ring->syncp); + xdp_verdict = ENA_XDP_DROP; + goto skip_xdp_prog; + } + + if (likely(xdp_prog_present)) + xdp_verdict = ena_xdp_execute(rx_ring, xdp); + +skip_xdp_prog: + /* Note that there can be several descriptors, since device + * might not honor MTU + */ + for (i = 0; i < ena_rx_ctx.descs; i++) { + req_id = rx_ring->ena_bufs[i].req_id; + rx_ring->free_ids[next_to_clean] = req_id; + next_to_clean = + ENA_RX_RING_IDX_NEXT(next_to_clean, + rx_ring->ring_size); + } + + if (likely(xdp_verdict)) { + work_done++; + total_len += ena_rx_ctx.ena_bufs[0].len; + xdp_flags |= xdp_verdict; + + /* Mark buffer as consumed when it is redirected */ + if (likely(xdp_verdict & ENA_XDP_FORWARDED)) + rx_info->xdp = NULL; + + continue; + } + + /* XDP PASS */ + skb = ena_xdp_rx_skb_zc(rx_ring, xdp); + if (unlikely(!skb)) { + rc = -ENOMEM; + break; + } + + pkt_copy++; + work_done++; + total_len += ena_rx_ctx.ena_bufs[0].len; + ena_rx_checksum(rx_ring, &ena_rx_ctx, skb); + ena_set_rx_hash(rx_ring, &ena_rx_ctx, skb); + skb_record_rx_queue(skb, rx_ring->qid); + napi_gro_receive(napi, skb); + + } while (likely(work_done <= budget)); + + rx_ring->per_napi_packets += work_done; + u64_stats_update_begin(&rx_ring->syncp); + rx_ring->rx_stats.bytes += total_len; + rx_ring->rx_stats.cnt += work_done; + rx_ring->rx_stats.zc_queue_pkt_copy += pkt_copy; + u64_stats_update_end(&rx_ring->syncp); + + rx_ring->next_to_clean = next_to_clean; + + if (xdp_flags & ENA_XDP_REDIRECT) + xdp_do_flush_map(); + + refill_required = ena_com_free_q_entries(rx_ring->ena_com_io_sq); + refill_threshold = + min_t(int, rx_ring->ring_size / ENA_RX_REFILL_THRESH_DIVIDER, + ENA_RX_REFILL_THRESH_PACKET); + /* Optimization, try to batch new rx buffers */ + if (refill_required > refill_threshold) + ena_refill_rx_bufs(rx_ring, refill_required); + + if (xsk_uses_need_wakeup(rx_ring->xsk_pool)) { + if (likely(rc || work_done < budget)) { + xsk_set_rx_need_wakeup(rx_ring->xsk_pool); + ena_increase_stat(&rx_ring->rx_stats.xsk_need_wakeup_set, 1, + &rx_ring->syncp); + } else { + xsk_clear_rx_need_wakeup(rx_ring->xsk_pool); + } + } + + if (unlikely(rc)) { + struct ena_adapter *adapter = netdev_priv(rx_ring->netdev); + + if (rc == -ENOSPC) { + ena_increase_stat(&rx_ring->rx_stats.bad_desc_num, 1, + &rx_ring->syncp); + ena_reset_device(adapter, + ENA_REGS_RESET_TOO_MANY_RX_DESCS); + } else if (rc == -EIO) { + ena_increase_stat(&rx_ring->rx_stats.bad_req_id, 1, + &rx_ring->syncp); + ena_reset_device(adapter, ENA_REGS_RESET_INV_RX_REQ_ID); + } + + return 0; + } + + return work_done; +} + +#endif /* ENA_AF_XDP_SUPPORT */ +/* This is the XDP napi callback. XDP queues use a separate napi callback + * than Rx/Tx queues. 
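+ *
+ * Return-value convention (a summary of the logic below, not new
+ * behaviour): returning a value smaller than budget completes NAPI and
+ * re-arms the interrupt, while returning exactly budget keeps the queue
+ * in polling mode:
+ *
+ *     if (needs_wakeup)
+ *             return rx_work_done;    (smaller than budget, IRQ re-armed)
+ *     return budget;                  (stay in polling mode)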
+ */
+int ena_xdp_io_poll(struct napi_struct *napi, int budget)
+{
+ struct ena_napi *ena_napi = container_of(napi, struct ena_napi, napi);
+ struct ena_ring *rx_ring, *tx_ring;
+ bool needs_wakeup = true;
+ u32 rx_work_done = 0;
+ int ret;
+
+ rx_ring = ena_napi->rx_ring;
+ tx_ring = ena_napi->tx_ring;
+
+ if (!test_bit(ENA_FLAG_DEV_UP, &tx_ring->adapter->flags) ||
+ test_bit(ENA_FLAG_TRIGGER_RESET, &tx_ring->adapter->flags)) {
+ napi_complete_done(napi, 0);
+ return 0;
+ }
+
+ needs_wakeup &= ena_clean_xdp_irq(tx_ring, budget);
+
+#ifdef ENA_AF_XDP_SUPPORT
+ if (!ENA_IS_XSK_RING(tx_ring))
+ goto polling_done;
+
+ needs_wakeup &= ena_xdp_xmit_irq_zc(tx_ring, napi, budget);
+
+ rx_work_done = ena_xdp_clean_rx_irq_zc(rx_ring, napi, budget);
+ needs_wakeup &= rx_work_done < budget;
+
+polling_done:
+#endif /* ENA_AF_XDP_SUPPORT */
+ /* If the device is about to reset or down, avoid unmasking
+ * the interrupt and return 0 so NAPI won't reschedule
+ */
+ if (unlikely(!test_bit(ENA_FLAG_DEV_UP, &tx_ring->adapter->flags))) {
+ napi_complete_done(napi, 0);
+ ret = 0;
+ } else if (needs_wakeup) {
+ ena_increase_stat(&tx_ring->tx_stats.napi_comp, 1,
+ &tx_ring->syncp);
+ if (napi_complete_done(napi, rx_work_done) &&
+ READ_ONCE(ena_napi->interrupts_masked)) {
+ smp_rmb(); /* make sure interrupts_masked is read */
+ WRITE_ONCE(ena_napi->interrupts_masked, false);
+ ena_unmask_interrupt(tx_ring, NULL);
+ }
+
+ ena_update_ring_numa_node(tx_ring, NULL);
+ ret = rx_work_done;
+ } else {
+ ret = budget;
+ }
+
+ u64_stats_update_begin(&tx_ring->syncp);
+ tx_ring->tx_stats.tx_poll++;
+ u64_stats_update_end(&tx_ring->syncp);
+ tx_ring->tx_stats.last_napi_jiffies = jiffies;
+
+ return ret;
+}
+#endif /* ENA_XDP_SUPPORT */
diff --git a/drivers/amazon/net/ena/ena_xdp.h b/drivers/amazon/net/ena/ena_xdp.h
new file mode 100644
index 0000000000000..f6b60c0e5d7c6
--- /dev/null
+++ b/drivers/amazon/net/ena/ena_xdp.h
@@ -0,0 +1,231 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright 2015-2021 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#ifndef ENA_XDP_H
+#define ENA_XDP_H
+
+#include "ena_netdev.h"
+#ifdef ENA_XDP_SUPPORT
+#include <linux/bpf_trace.h>
+#ifdef ENA_AF_XDP_SUPPORT
+#include <net/xdp_sock_drv.h>
+#endif /* ENA_AF_XDP_SUPPORT */
+
+#ifdef ENA_AF_XDP_SUPPORT
+#define ENA_IS_XSK_RING(ring) (!!(ring)->xsk_pool)
+#endif /* ENA_AF_XDP_SUPPORT */
+
+/* The max MTU size is configured to be the ethernet frame size without
+ * the overhead of the ethernet header, which can have a VLAN header, and
+ * a frame check sequence (FCS). 
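+ * As a worked example (assuming the common case of a 4 KiB
+ * ENA_PAGE_SIZE, 256 bytes of XDP_PACKET_HEADROOM and a 64-bit build):
+ * 4096 - 14 (ETH_HLEN) - 4 (ETH_FCS_LEN) - 4 (VLAN_HLEN) - 256 leaves
+ * 3818 bytes, and subtracting SKB_DATA_ALIGN(sizeof(struct
+ * skb_shared_info)), typically 320 bytes, when XDP_HAS_FRAME_SZ is
+ * defined leaves 3498 bytes of usable MTU.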
+ * The buffer size we share with the device is defined to be ENA_PAGE_SIZE + */ +#ifdef XDP_HAS_FRAME_SZ +#define ENA_XDP_MAX_MTU (ENA_PAGE_SIZE - ETH_HLEN - ETH_FCS_LEN - \ + VLAN_HLEN - XDP_PACKET_HEADROOM - \ + SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) +#else +#define ENA_XDP_MAX_MTU (ENA_PAGE_SIZE - ETH_HLEN - ETH_FCS_LEN - \ + VLAN_HLEN - XDP_PACKET_HEADROOM) +#endif + +#define ENA_IS_XDP_INDEX(adapter, index) (((index) >= (adapter)->xdp_first_ring) && \ + ((index) < (adapter)->xdp_first_ring + (adapter)->xdp_num_queues)) + +enum ENA_XDP_ACTIONS { + ENA_XDP_PASS = 0, + ENA_XDP_TX = BIT(0), + ENA_XDP_REDIRECT = BIT(1), + ENA_XDP_DROP = BIT(2) +}; +#define ENA_XDP_FORWARDED (ENA_XDP_TX | ENA_XDP_REDIRECT) + +int ena_setup_and_create_all_xdp_queues(struct ena_adapter *adapter); +void ena_xdp_exchange_program_rx_in_range(struct ena_adapter *adapter, + struct bpf_prog *prog, + int first, int count); +int ena_xdp_io_poll(struct napi_struct *napi, int budget); +int ena_xdp_xmit_frame(struct ena_ring *tx_ring, + struct ena_adapter *adapter, + struct xdp_frame *xdpf, + int flags); +int ena_xdp_xmit(struct net_device *dev, int n, + struct xdp_frame **frames, u32 flags); +int ena_xdp(struct net_device *netdev, struct netdev_bpf *bpf); +int ena_xdp_register_rxq_info(struct ena_ring *rx_ring); +void ena_xdp_unregister_rxq_info(struct ena_ring *rx_ring); +#ifdef ENA_AF_XDP_SUPPORT +void ena_xdp_free_tx_bufs_zc(struct ena_ring *tx_ring); +void ena_xdp_free_rx_bufs_zc(struct ena_adapter *adapter, u32 qid); +int ena_xdp_xsk_wakeup(struct net_device *netdev, u32 qid, u32 flags); +#endif + +enum ena_xdp_errors_t { + ENA_XDP_ALLOWED = 0, + ENA_XDP_CURRENT_MTU_TOO_LARGE, + ENA_XDP_NO_ENOUGH_QUEUES, +}; + +static inline bool ena_xdp_present(struct ena_adapter *adapter) +{ + return !!adapter->xdp_bpf_prog; +} + +static inline bool ena_xdp_present_ring(struct ena_ring *ring) +{ + return !!ring->xdp_bpf_prog; +} + +static inline bool ena_xdp_legal_queue_count(struct ena_adapter *adapter, + u32 queues) +{ + return 2 * queues <= adapter->max_num_io_queues; +} + +static inline enum ena_xdp_errors_t ena_xdp_allowed(struct ena_adapter *adapter) +{ + enum ena_xdp_errors_t rc = ENA_XDP_ALLOWED; + + if (adapter->netdev->mtu > ENA_XDP_MAX_MTU) + rc = ENA_XDP_CURRENT_MTU_TOO_LARGE; + else if (!ena_xdp_legal_queue_count(adapter, adapter->num_io_queues)) + rc = ENA_XDP_NO_ENOUGH_QUEUES; + + return rc; +} + +static inline u64 ena_ring_xdp_drops_cnt(struct ena_ring *rx_ring) +{ + return rx_ring->rx_stats.xdp_drop; +} + +#ifdef ENA_AF_XDP_SUPPORT +static inline bool ena_is_zc_q_exist(struct ena_adapter *adapter) +{ + int i; + + for (i = 0; i < adapter->num_io_queues; i++) + if (ENA_IS_XSK_RING(&adapter->rx_ring[i])) + return true; + + return false; +} + +#endif /* ENA_AF_XDP_SUPPORT */ +static inline int ena_xdp_execute(struct ena_ring *rx_ring, struct xdp_buff *xdp) +{ + u32 verdict = ENA_XDP_PASS; + struct bpf_prog *xdp_prog; + struct ena_ring *xdp_ring; + struct xdp_frame *xdpf; + u64 *xdp_stat; + + xdp_prog = READ_ONCE(rx_ring->xdp_bpf_prog); + + verdict = bpf_prog_run_xdp(xdp_prog, xdp); + + switch (verdict) { + case XDP_TX: +#ifdef XDP_CONVERT_TO_FRAME_NAME_CHANGED + xdpf = xdp_convert_buff_to_frame(xdp); +#else + xdpf = convert_to_xdp_frame(xdp); +#endif + if (unlikely(!xdpf)) { + trace_xdp_exception(rx_ring->netdev, xdp_prog, verdict); + xdp_stat = &rx_ring->rx_stats.xdp_aborted; + verdict = ENA_XDP_DROP; + break; + } + + /* Find xmit queue */ + xdp_ring = rx_ring->xdp_ring; + + /* The XDP queues are shared 
between XDP_TX and XDP_REDIRECT */
+ spin_lock(&xdp_ring->xdp_tx_lock);
+
+ if (ena_xdp_xmit_frame(xdp_ring, rx_ring->adapter, xdpf,
+ XDP_XMIT_FLUSH))
+ xdp_return_frame(xdpf);
+
+ spin_unlock(&xdp_ring->xdp_tx_lock);
+ xdp_stat = &rx_ring->rx_stats.xdp_tx;
+ verdict = ENA_XDP_TX;
+ break;
+ case XDP_REDIRECT:
+ if (likely(!xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog))) {
+ xdp_stat = &rx_ring->rx_stats.xdp_redirect;
+ verdict = ENA_XDP_REDIRECT;
+ break;
+ }
+ trace_xdp_exception(rx_ring->netdev, xdp_prog, verdict);
+ xdp_stat = &rx_ring->rx_stats.xdp_aborted;
+ verdict = ENA_XDP_DROP;
+ break;
+ case XDP_ABORTED:
+ trace_xdp_exception(rx_ring->netdev, xdp_prog, verdict);
+ xdp_stat = &rx_ring->rx_stats.xdp_aborted;
+ verdict = ENA_XDP_DROP;
+ break;
+ case XDP_DROP:
+ xdp_stat = &rx_ring->rx_stats.xdp_drop;
+ verdict = ENA_XDP_DROP;
+ break;
+ case XDP_PASS:
+ xdp_stat = &rx_ring->rx_stats.xdp_pass;
+ verdict = ENA_XDP_PASS;
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(rx_ring->netdev, xdp_prog, verdict);
+ xdp_stat = &rx_ring->rx_stats.xdp_invalid;
+ verdict = ENA_XDP_DROP;
+ }
+
+ ena_increase_stat(xdp_stat, 1, &rx_ring->syncp);
+
+ return verdict;
+}
+#else /* ENA_XDP_SUPPORT */
+
+#define ENA_IS_XDP_INDEX(adapter, index) (false)
+
+static inline bool ena_xdp_present_ring(struct ena_ring *ring)
+{
+ return false;
+}
+
+static inline u64 ena_ring_xdp_drops_cnt(struct ena_ring *rx_ring)
+{
+ return 0;
+}
+
+static inline int ena_xdp_register_rxq_info(struct ena_ring *rx_ring)
+{
+ return 0;
+}
+
+static inline void ena_xdp_unregister_rxq_info(struct ena_ring *rx_ring) {}
+
+#endif /* ENA_XDP_SUPPORT */
+#ifndef ENA_AF_XDP_SUPPORT /* stubs for AF XDP code */
+
+/* Define (or override, if already defined) these enums and functions to make
+ * sure that code using them always compiles. If AF XDP isn't supported, they
+ * won't be used anyway.
+ */
+#define MEM_TYPE_XSK_BUFF_POOL 0
+#define xsk_pool_set_rxq_info(pool, rxq)
+
+static inline void ena_xdp_free_tx_bufs_zc(struct ena_ring *tx_ring) {}
+static inline void ena_xdp_free_rx_bufs_zc(struct ena_adapter *adapter, u32 qid) {}
+
+#define ENA_IS_XSK_RING(ring) false
+
+static inline bool ena_is_zc_q_exist(struct ena_adapter *adapter)
+{
+ return false;
+}
+#endif /* ENA_AF_XDP_SUPPORT */
+#endif /* ENA_XDP_H */
diff --git a/drivers/amazon/net/ena/kcompat.h b/drivers/amazon/net/ena/kcompat.h
new file mode 100644
index 0000000000000..fd7e80d0347ba
--- /dev/null
+++ b/drivers/amazon/net/ena/kcompat.h
@@ -0,0 +1,987 @@
+/*******************************************************************************
+Modified by Amazon 2015-2016.
+Copyright 2015-2016, Amazon.com, Inc. or its affiliates. All Rights Reserved.
+
+Modifications subject to the terms and conditions of the GNU General
+Public License, version 2.
+*******************************************************************************/
+
+/*******************************************************************************
+
+Intel 10 Gigabit PCI Express Linux driver
+Copyright(c) 1999 - 2013 Intel Corporation.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms and conditions of the GNU General Public License,
+version 2, as published by the Free Software Foundation.
+
+This program is distributed in the hope it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + +The full GNU General Public License is included in this distribution in +the file called "COPYING". + +Contact Information: +e1000-devel Mailing List +Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 + +*******************************************************************************/ + +#ifndef _KCOMPAT_H_ +#define _KCOMPAT_H_ + +#ifndef LINUX_VERSION_CODE +#include +#endif + +#ifndef KERNEL_VERSION +#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c)) +#endif + +#include + +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) ) +#include +#else +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,6,0) +#include +#endif + +/* For ACCESS_ONCE, WRITE_ONCE and READ_ONCE macros */ +#include + +#ifndef SZ_256 +#define SZ_256 0x0000100 +#endif + +#ifndef SZ_4K +#define SZ_4K 0x00001000 +#endif + +#ifndef SZ_16K +#define SZ_16K 0x00004000 +#endif + +#ifdef HAVE_POLL_CONTROLLER +#define CONFIG_NET_POLL_CONTROLLER +#endif + +#ifndef __GFP_COLD +#define __GFP_COLD 0 +#endif + +#if defined(CONFIG_NET_RX_BUSY_POLL) && \ + LINUX_VERSION_CODE < KERNEL_VERSION(4,5,0) && \ + LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) +#define ENA_BUSY_POLL_SUPPORT +#endif +/******************************************************************************/ +/************************** Ubuntu macros *************************************/ +/******************************************************************************/ + +/* Ubuntu Release ABI is the 4th digit of their kernel version. You can find + * it in /usr/src/linux/$(uname -r)/include/generated/utsrelease.h for new + * enough versions of Ubuntu. Otherwise you can simply see it in the output of + * uname as the 4th digit of the kernel. The UTS_UBUNTU_RELEASE_ABI is not in + * the linux-source package, but in the linux-headers package. It begins to + * appear in later releases of 14.04 and 14.10. + * + * Ex: + * + * $uname -r + * 3.13.0-45-generic + * ABI is 45 + * + * + * $uname -r + * 3.16.0-23-generic + * ABI is 23 + */ +#ifdef UTS_UBUNTU_RELEASE_ABI + +#if UTS_UBUNTU_RELEASE_ABI > 255 +#undef UTS_UBUNTU_RELEASE_ABI +#define UTS_UBUNTU_RELEASE_ABI 0 +#endif /* UTS_UBUNTU_RELEASE_ABI > 255 */ + +/* Ubuntu does not provide actual release version macro, so we use the kernel + * version plus the ABI to generate a unique version code specific to Ubuntu. + * In addition, we mask the lower 8 bits of LINUX_VERSION_CODE in order to + * ignore differences in sublevel which are not important since we have the + * ABI value. Otherwise, it becomes impossible to correlate ABI to version for + * ordering checks. + */ +#define UBUNTU_VERSION_CODE (((LINUX_VERSION_CODE & ~0xFF) << 8) + (UTS_UBUNTU_RELEASE_ABI)) + +#endif /* UTS_UBUNTU_RELEASE_ABI */ + +/* Note that the 3rd digit is always zero, and will be ignored. This is + * because Ubuntu kernels are based on x.y.0-ABI values, and while their linux + * version codes are 3 digit, this 3rd digit is superseded by the ABI value. 
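+ *
+ * As a worked example of the arithmetic (no new macros, just the ones
+ * defined in this header): for 3.13.0-24-generic, UBUNTU_VERSION(3,13,0,24)
+ * expands to (KERNEL_VERSION(3,13,0) << 8) + 24, and UBUNTU_VERSION_CODE
+ * is built the same way from the masked LINUX_VERSION_CODE plus the ABI,
+ * so guards compare consistently:
+ *
+ *     #if defined(UBUNTU_VERSION_CODE) && \
+ *         UBUNTU_VERSION_CODE >= UBUNTU_VERSION(3,13,0,24)
+ *     ...
+ *     #endif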
+ */ +#define UBUNTU_VERSION(a,b,c,d) ((KERNEL_VERSION(a,b,0) << 8) + (d)) + +/******************************************************************************/ +/**************************** SuSE macros *************************************/ +/******************************************************************************/ + +/* SuSE version macro is the same as Linux kernel version */ +#ifndef SLE_VERSION +#define SLE_VERSION(a,b,c) KERNEL_VERSION(a,b,c) +#endif +#ifdef CONFIG_SUSE_KERNEL +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 12, 14) +#include +#endif +#if ( LINUX_VERSION_CODE >= KERNEL_VERSION(3,12,28) ) +/* SLES12 is at least 3.12.28+ based */ +#define SLE_VERSION_CODE SLE_VERSION(12,0,0) +#endif /* LINUX_VERSION_CODE == KERNEL_VERSION(x,y,z) */ +#endif /* CONFIG_SUSE_KERNEL */ +#ifndef SLE_VERSION_CODE +#define SLE_VERSION_CODE 0 +#endif /* SLE_VERSION_CODE */ +#ifndef SUSE_VERSION +#define SUSE_VERSION 0 +#endif /* SUSE_VERSION */ + + +/******************************************************************************/ +/**************************** RHEL macros *************************************/ +/******************************************************************************/ + +#ifndef RHEL_RELEASE_VERSION +#define RHEL_RELEASE_VERSION(a,b) (((a) << 8) + (b)) +#endif +#ifndef AX_RELEASE_VERSION +#define AX_RELEASE_VERSION(a,b) (((a) << 8) + (b)) +#endif + +#ifndef AX_RELEASE_CODE +#define AX_RELEASE_CODE 0 +#endif + +#ifndef RHEL_RELEASE_CODE +#define RHEL_RELEASE_CODE 0 +#endif + +#if (AX_RELEASE_CODE && AX_RELEASE_CODE == AX_RELEASE_VERSION(3,0)) +#define RHEL_RELEASE_CODE RHEL_RELEASE_VERSION(5,0) +#elif (AX_RELEASE_CODE && AX_RELEASE_CODE == AX_RELEASE_VERSION(3,1)) +#define RHEL_RELEASE_CODE RHEL_RELEASE_VERSION(5,1) +#elif (AX_RELEASE_CODE && AX_RELEASE_CODE == AX_RELEASE_VERSION(3,2)) +#define RHEL_RELEASE_CODE RHEL_RELEASE_VERSION(5,3) +#endif + +/*****************************************************************************/ +#if (RHEL_RELEASE_CODE && \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,6)) && \ + (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0))) +#define HAVE_RHEL6_NET_DEVICE_OPS_EXT +#endif + +#if (RHEL_RELEASE_CODE && \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,4)) && \ + (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0))) +#define HAVE_RHEL6_ETHTOOL_OPS_EXT_STRUCT +#endif /* RHEL >= 6.4 && RHEL < 7.0 */ + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) || \ + (SLE_VERSION_CODE && \ + LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,48))) +#define HAVE_MTU_MIN_MAX_IN_NET_DEVICE +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,11,0) || \ + (RHEL_RELEASE_CODE && \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,5)) || \ + (SLE_VERSION_CODE && \ + LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,50))) +#define NDO_GET_STATS_64_V2 +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0) || \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,5)) +#include +#endif + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37) ) +/* The function netif_set_real_num_tx_queues() doesn't return value for + * kernels < 2.6.37 + */ +static inline int _kc_netif_set_real_num_tx_queues(struct net_device *dev, + unsigned int txq) +{ + netif_set_real_num_tx_queues(dev, txq); + return 0; +} +#define netif_set_real_num_tx_queues(dev, txq) \ + _kc_netif_set_real_num_tx_queues(dev, txq) + +#endif /* < 2.6.37 */ + +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,3,0) ) +#if !(RHEL_RELEASE_CODE >= 
RHEL_RELEASE_VERSION(6,5))
+typedef u32 netdev_features_t;
+#endif
+#undef PCI_EXP_TYPE_RC_EC
+#define PCI_EXP_TYPE_RC_EC 0xa /* Root Complex Event Collector */
+#ifndef CONFIG_BQL
+#define netdev_tx_completed_queue(_q, _p, _b) do {} while (0)
+#define netdev_completed_queue(_n, _p, _b) do {} while (0)
+#define netdev_tx_sent_queue(_q, _b) do {} while (0)
+#define netdev_sent_queue(_n, _b) do {} while (0)
+#define netdev_tx_reset_queue(_q) do {} while (0)
+#define netdev_reset_queue(_n) do {} while (0)
+#endif
+
+#endif /* < 3.3.0 */
+
+/******************************************************************************/
+#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,4,0) )
+#ifdef NET_ADDR_RANDOM
+#define eth_hw_addr_random(N) do { \
+ eth_random_addr(N->dev_addr); \
+ N->addr_assign_type |= NET_ADDR_RANDOM; \
+ } while (0)
+#else /* NET_ADDR_RANDOM */
+#define eth_hw_addr_random(N) eth_random_addr(N->dev_addr)
+#endif /* NET_ADDR_RANDOM */
+#if !(RHEL_RELEASE_CODE)
+/* If EPROBE_DEFER isn't defined, return no device instead of deferring probe */
+#define EPROBE_DEFER ENODEV
+#endif
+#endif /* < 3.4.0 */
+
+/*****************************************************************************/
+#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0) )
+#if !(RHEL_RELEASE_CODE)
+static inline bool ether_addr_equal(const u8 *addr1, const u8 *addr2)
+{
+ const u16 *a = (const u16 *)addr1;
+ const u16 *b = (const u16 *)addr2;
+
+ return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2])) == 0;
+}
+#endif
+#endif /* < 3.5.0 */
+
+/******************************************************************************/
+#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,6,0) )
+#ifndef eth_random_addr
+#define eth_random_addr _kc_eth_random_addr
+static inline void _kc_eth_random_addr(u8 *addr)
+{
+ get_random_bytes(addr, ETH_ALEN);
+ addr[0] &= 0xfe; /* clear multicast */
+ addr[0] |= 0x02; /* set local assignment */
+}
+#endif
+#endif /* < 3.6.0 */
+
+/******************************************************************************/
+#ifndef CONFIG_NET_RX_BUSY_POLL
+static inline void skb_mark_napi_id(struct sk_buff *skb,
+ struct napi_struct *napi)
+{
+
+}
+
+static inline void napi_hash_del(struct napi_struct *napi)
+{
+
+}
+
+static inline void napi_hash_add(struct napi_struct *napi)
+{
+
+}
+#endif /* CONFIG_NET_RX_BUSY_POLL */
+
+/*****************************************************************************/
+#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0) )
+/* cpu_rmap is buggy on older versions and causes deadlocks */
+#ifdef CONFIG_RFS_ACCEL
+#undef CONFIG_RFS_ACCEL
+#endif
+
+#if !(RHEL_RELEASE_CODE)
+static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings)
+{
+ return index % n_rx_rings;
+}
+#endif
+#endif /* < 3.8.0 */
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5,2,0))
+#define HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V3
+#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(4,19,0)) || \
+ (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,0))) || \
+ (SUSE_VERSION && ((SUSE_VERSION == 12 && SUSE_PATCHLEVEL >= 5) || \
+ (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 1) || \
+ (SUSE_VERSION > 15)))
+#define HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V2
+#else
+
+#if ((LINUX_VERSION_CODE < KERNEL_VERSION(3,11,0) && \
+ RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,2))) || \
+ (LINUX_VERSION_CODE >= KERNEL_VERSION(3,12,0) && \
+ SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0)) || \
+ (LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0)))
+#define HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V1
+#endif
+
+#if 
LINUX_VERSION_CODE >= KERNEL_VERSION(3,13,0) +#if defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(3,13,0,24) +#define HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V1 +#else +#define HAVE_NDO_SELECT_QUEUE_ACCEL +#endif +#endif /* >= 3.13 */ +#endif /* < 4.19 */ + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,13,0) +#if BITS_PER_LONG == 32 && defined(CONFIG_SMP) +# define u64_stats_init(syncp) seqcount_init(syncp.seq) +#else +# define u64_stats_init(syncp) do { } while (0) +#endif + +#if !(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0)) && \ + !(RHEL_RELEASE_CODE && ((RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,8) && \ + (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0))) \ + || (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,1)))) && \ + !defined(UEK3_RELEASE) +static inline void reinit_completion(struct completion *x) +{ + x->done = 0; +} +#endif /* SLE 12 */ + +#endif /* < 3.13.0 */ + +#if (( LINUX_VERSION_CODE < KERNEL_VERSION(3,13,0) ) && \ + (!(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,0) && \ + RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7,0))) \ + && !(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0))&& \ + !defined(UEK3_RELEASE))) || \ + (defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE < UBUNTU_VERSION(3,13,0,30)) +static inline int pci_enable_msix_range(struct pci_dev *dev, + struct msix_entry *entries, + int minvec, + int maxvec) +{ + int nvec = maxvec; + int rc; + + if (maxvec < minvec) + return -ERANGE; + + do { + rc = pci_enable_msix(dev, entries, nvec); + if (rc < 0) { + return rc; + } else if (rc > 0) { + if (rc < minvec) + return -ENOSPC; + nvec = rc; + } + } while (rc); + + return nvec; +} +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,13,0) && \ + !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,1)) +static inline void *devm_kcalloc(struct device *dev, + size_t n, size_t size, gfp_t flags) +{ + return devm_kzalloc(dev, n * size, flags | __GFP_ZERO); +} +#endif + +/*****************************************************************************/ +#if (( LINUX_VERSION_CODE < KERNEL_VERSION(3,13,8) ) && \ + !RHEL_RELEASE_CODE && \ + !(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0))) || \ + (defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE < UBUNTU_VERSION(3,13,0,30)) +enum pkt_hash_types { + PKT_HASH_TYPE_NONE, /* Undefined type */ + PKT_HASH_TYPE_L2, /* Input: src_MAC, dest_MAC */ + PKT_HASH_TYPE_L3, /* Input: src_IP, dst_IP */ + PKT_HASH_TYPE_L4, /* Input: src_IP, dst_IP, src_port, dst_port */ +}; + +static inline void skb_set_hash(struct sk_buff *skb, __u32 hash, + enum pkt_hash_types type) +{ + skb->l4_rxhash = (type == PKT_HASH_TYPE_L4); + skb->rxhash = hash; +} +#endif + +/*****************************************************************************/ +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,14,0) +#if !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7,0) && \ + RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(6,6)) \ + && !(defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(3,13,0,105)) +static inline int pci_msix_vec_count(struct pci_dev *dev) +{ + int pos; + u16 control; + + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); + if (!pos) + return -EINVAL; + + pci_read_config_word(dev, pos + PCI_MSIX_FLAGS, &control); + return (control & 0x7FF) + 1; +} +#if !(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0)) && \ + !(RHEL_RELEASE_CODE == RHEL_RELEASE_VERSION(7,0)) +static inline void ether_addr_copy(u8 *dst, const u8 *src) +{ + memcpy(dst, 
src, 6); +} +#endif /* SLE 12 */ +#endif /* RHEL 7 */ +#endif + +#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(6,8))) +#define napi_gro_flush(napi, flush_old) napi_gro_flush(napi) +#endif + +#if ( LINUX_VERSION_CODE >= KERNEL_VERSION(3,15,0) || \ + (defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(3,13,0,30))) || \ + (SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0)) || \ + (RHEL_RELEASE_CODE && RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7,0) \ + && RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7,1)) +#else +static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp, + unsigned int start) +{ + return u64_stats_fetch_retry(syncp, start); +} + +static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp) +{ + return u64_stats_fetch_begin(syncp); +} + +#endif + +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,16,0) && \ + !(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,1)))) + +#define smp_mb__before_atomic() smp_mb() + +#endif + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0) ) +#undef GENMASK +#define GENMASK(h, l) (((U32_C(1) << ((h) - (l) + 1)) - 1) << (l)) +#undef GENMASK_ULL +#define GENMASK_ULL(h, l) (((U64_C(1) << ((h) - (l) + 1)) - 1) << (l)) +#endif +/*****************************************************************************/ + +#ifndef dma_rmb +#define dma_rmb rmb +#endif + +#ifndef writel_relaxed +#define writel_relaxed writel +#endif + +#if ( LINUX_VERSION_CODE >= KERNEL_VERSION(3,19,0) ) \ + || (SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0)) \ + || (RHEL_RELEASE_CODE && RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7,0)) +#else +static inline void netdev_rss_key_fill(void *buffer, size_t len) +{ + get_random_bytes(buffer, len); +} +#endif + +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,19,0) ) && \ + !(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0)) && \ + !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,2)) + +static inline void napi_schedule_irqoff(struct napi_struct *n) +{ + napi_schedule(n); +} + +static inline void __napi_schedule_irqoff(struct napi_struct *n) +{ + __napi_schedule(n); +} + +#ifndef READ_ONCE +#define READ_ONCE(var) (*((volatile typeof(var) *)(&(var)))) +#endif +#endif /* Kernel 3.19 */ + +/*****************************************************************************/ + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0) \ + || (RHEL_RELEASE_CODE && ((RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(6,7)) && \ + (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0))) \ + || RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,2)) \ + || (defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(3,19,0,51)) +#else +static inline void napi_complete_done(struct napi_struct *n, int work_done) +{ + napi_complete(n); +} +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,1,0) \ + || (defined(UBUNTU_VERSION_CODE) && \ + (UBUNTU_VERSION(3,13,0,126) <= UBUNTU_VERSION_CODE && UBUNTU_VERSION_CODE < UBUNTU_VERSION(3,14,0,0))) \ + || (RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,5)) + +#else + +static inline void ioremap_release(struct device *dev, void *res) +{ + iounmap(*(void __iomem **)res); +} + + +static inline void __iomem *devm_ioremap_wc(struct device *dev, + resource_size_t offset, + resource_size_t size) +{ + void __iomem **ptr, *addr; + + ptr = devres_alloc(ioremap_release, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + 
return NULL;
+
+ addr = ioremap_wc(offset, size);
+ if (addr) {
+ *ptr = addr;
+ devres_add(dev, ptr);
+ } else
+ devres_free(ptr);
+
+ return addr;
+}
+#endif
+
+#if RHEL_RELEASE_CODE && \
+ RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 5) && \
+ LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0)
+#define ndo_change_mtu ndo_change_mtu_rh74
+#endif
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0))
+#ifndef dma_zalloc_coherent
+#define dma_zalloc_coherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
+#endif
+#endif
+
+#ifndef dev_info_once
+#ifdef CONFIG_PRINTK
+#define dev_info_once(dev, fmt, ...) \
+do { \
+ static bool __print_once __read_mostly; \
+ \
+ if (!__print_once) { \
+ __print_once = true; \
+ dev_info(dev, fmt, ##__VA_ARGS__); \
+ } \
+} while (0)
+#else
+#define dev_info_once(dev, fmt, ...) \
+do { \
+ if (0) \
+ dev_info(dev, fmt, ##__VA_ARGS__); \
+} while (0)
+#endif
+#endif
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 2, 0)) || \
+ (RHEL_RELEASE_CODE && \
+ RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 2))
+#define HAVE_NETDEV_XMIT_MORE
+#endif
+
+#ifndef mmiowb
+#define MMIOWB_NOT_DEFINED
+#endif
+
+/* In the driver we currently only support CRC32 and Toeplitz.
+ * Since kernels earlier than 4.12 don't have the CRC32 define,
+ * we define it here to be XOR. Any user who wishes to select CRC32
+ * as the hash function can do so by choosing xor through ethtool.
+ */
+#ifndef ETH_RSS_HASH_CRC32
+#define ETH_RSS_HASH_CRC32 ETH_RSS_HASH_XOR
+#endif
+
+#ifndef _ULL
+#define _ULL(x) (_AC(x, ULL))
+#endif
+
+#ifndef ULL
+#define ULL(x) (_ULL(x))
+#endif
+
+#ifndef BIT_ULL
+#define BIT_ULL(nr) (ULL(1) << (nr))
+#endif
+
+#ifndef BITS_PER_TYPE
+#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE)
+#endif
+
+#ifndef DIV_ROUND_DOWN_ULL
+#define DIV_ROUND_DOWN_ULL(ll, d) \
+ ({ unsigned long long _tmp = (ll); do_div(_tmp, d); _tmp; })
+#endif
+
+/* values are taken from here: https://github.com/iovisor/bcc/blob/master/docs/kernel-versions.md */
+
+#if defined(CONFIG_BPF) && LINUX_VERSION_CODE >= KERNEL_VERSION(5,0,0)
+#define ENA_XDP_SUPPORT
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5,8,0)) || \
+ (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 3)
+#define XDP_HAS_FRAME_SZ
+#define XDP_CONVERT_TO_FRAME_NAME_CHANGED
+#endif
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0)
+#define ENA_XDP_QUERY_IN_KERNEL
+#endif
+
+#endif
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,6,0) || \
+ (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 3))) || \
+ (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 3)
+
+#define HAVE_NDO_TX_TIMEOUT_STUCK_QUEUE_PARAMETER
+#endif
+
+#if defined(CONFIG_NET_DEVLINK) && LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0)
+#define ENA_DEVLINK_SUPPORT
+#endif
+
+#if !defined(CONFIG_NET_DEVLINK) && !defined(CONFIG_NET_DEVLINK_MODULE) && !defined(CONFIG_MAY_USE_DEVLINK)
+#define ENA_NO_DEVLINK_HEADERS
+#endif
+
+#if defined(CONFIG_NET_DEVLINK) && \
+ (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 4, 0) || \
+ (SUSE_VERSION && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 3, 18)))
+#define ENA_DEVLINK_RELOAD_UP_DOWN_SUPPORTED
+#endif
+
+#if defined(CONFIG_NET_DEVLINK) && \
+ (KERNEL_VERSION(5, 4, 0) <= LINUX_VERSION_CODE && LINUX_VERSION_CODE < KERNEL_VERSION(5, 16, 0))
+#define ENA_DEVLINK_RELOAD_ENABLING_REQUIRED
+#endif
+
+#if defined(CONFIG_NET_DEVLINK) && \
+ (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 5, 0) || \
+ (SUSE_VERSION && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 3, 18)))
+#define ENA_DEVLINK_RELOAD_NS_CHANGE_SUPPORT
+#endif
+
+#if 
defined(CONFIG_NET_DEVLINK) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0) +#define ENA_DEVLINK_RELOAD_LIMIT_AND_ACTION_SUPPORT +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) +#define ENA_DEVLINK_RECEIVES_DEVICE_ON_ALLOC +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) +#define ENA_DEVLINK_RELOAD_SUPPORT_ADVERTISEMENT_NEEDED +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 16, 0) +#define ENA_DEVLINK_CONFIGURE_AFTER_REGISTER +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,19,0) && \ + !(RHEL_RELEASE_CODE && ((RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7, 1)) && \ + (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(6, 6)))) && \ + !defined(UBUNTU_VERSION_CODE) && \ + !defined(UEK3_RELEASE) && (!defined(DEBIAN_VERSION) || DEBIAN_VERSION != 8) + +#define DO_ONCE(func, ...) \ + ({ \ + static bool ___done = false; \ + if (unlikely(!___done)) { \ + func(__VA_ARGS__); \ + ___done = true; \ + } \ + }) + +#define get_random_once(buf, nbytes) \ + DO_ONCE(get_random_bytes, (buf), (nbytes)) + +#define net_get_random_once(buf, nbytes) \ + get_random_once((buf), (nbytes)) + +/* RSS keys are 40 or 52 bytes long */ +#define NETDEV_RSS_KEY_LEN 52 +static u8 netdev_rss_key[NETDEV_RSS_KEY_LEN]; + +static inline void netdev_rss_key_fill(void *buffer, size_t len) +{ + BUG_ON(len > sizeof(netdev_rss_key)); + net_get_random_once(netdev_rss_key, sizeof(netdev_rss_key)); + memcpy(buffer, netdev_rss_key, len); +} +#endif + +#ifndef WRITE_ONCE +#define WRITE_ONCE(x, val) (ACCESS_ONCE(x) = val) +#endif +#ifndef READ_ONCE +#define READ_ONCE(x) ACCESS_ONCE(x) +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9 ,0) +#define ENA_GENERIC_PM_OPS +#endif + +#if ((LINUX_VERSION_CODE < KERNEL_VERSION(4, 6 ,0)) && \ + !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,3))) +/* Linux versions 4.4.216 - 4.5 (non inclusive) back propagated page_ref_count + * function from kernel 4.6. 
To make things more difficult, Ubuntu didn't add + * these changes to its 4.4.* kernels + */ +#if !(KERNEL_VERSION(4, 4 ,216) <= LINUX_VERSION_CODE && LINUX_VERSION_CODE < KERNEL_VERSION(4, 5 ,0)) ||\ + defined(UBUNTU_VERSION_CODE) +static inline int page_ref_count(struct page *page) +{ + return atomic_read(&page->_count); +} +#endif /* !(KERNEL_VERSION(4, 4 ,216) <= LINUX_VERSION_CODE && LINUX_VERSION_CODE < KERNEL_VERSION(4, 5 ,0)) */ + +static inline void page_ref_inc(struct page *page) +{ + atomic_inc(&page->_count); +} +#endif + +#if ((LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 0)) && \ + !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,2))) +static inline struct page *dev_alloc_page(void) +{ + gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN; + + gfp_mask |= __GFP_COLD | __GFP_COMP; + + return alloc_pages_node(NUMA_NO_NODE, gfp_mask, 0); +} +#endif + +/* This entry might seem strange because of the #ifndef numa_mem_id(), + * but these defines were taken from the Linux kernel + */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 34) +#ifndef numa_mem_id +#ifdef CONFIG_HAVE_MEMORYLESS_NODES +static inline int numa_mem_id(void) +{ + return __this_cpu_read(_numa_mem_); +} +#else /* CONFIG_HAVE_MEMORYLESS_NODES */ +static inline int numa_mem_id(void) +{ + return numa_node_id(); +} +#endif /* CONFIG_HAVE_MEMORYLESS_NODES */ +#endif /* numa_mem_id */ +#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 34) */ + +#ifndef fallthrough +#define fallthrough do {} while (0) /* fallthrough */ +#endif + +#ifndef NAPI_POLL_WEIGHT +#define NAPI_POLL_WEIGHT 64 +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0) +#define AF_XDP_BUSY_POLL_SUPPORTED +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0) +#define ENA_LINEAR_FRAG_SUPPORTED +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0) +#define ENA_NETDEV_LOGS_WITHOUT_RV +#endif + +#if defined(ENA_XDP_SUPPORT) && LINUX_VERSION_CODE < KERNEL_VERSION(5, 12, 0) +static __always_inline void +xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) +{ + xdp->rxq = rxq; +#ifdef XDP_HAS_FRAME_SZ + xdp->frame_sz = frame_sz; +#endif +} + +static __always_inline void +xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start, + int headroom, int data_len, const bool meta_valid) +{ + unsigned char *data = hard_start + headroom; + + xdp->data_hard_start = hard_start; + xdp->data = data; + xdp->data_end = data + data_len; + xdp->data_meta = meta_valid ? data : data + 1; +} + +#endif /* defined(ENA_XDP_SUPPORT) && LINUX_VERSION_CODE <= KERNEL_VERSION(5, 12, 0) */ + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 13, 0) +#define ethtool_sprintf(data, fmt, args...) 
\ + do { \ + snprintf(*data, ETH_GSTRING_LEN, fmt, ##args); \ + (*data) += ETH_GSTRING_LEN; \ + } while(0) +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 13, 0) +#define ENA_XDP_XMIT_FREES_FAILED_DESCS_INTERNALLY +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0) && \ + !(defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 6)) +static inline void eth_hw_addr_set(struct net_device *dev, const u8 *addr) +{ + memcpy(dev->dev_addr, addr, ETH_ALEN); +} +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) || \ + (defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 6)) +#define ENA_EXTENDED_COALESCE_UAPI_WITH_CQE_SUPPORTED +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 17, 0) +#define ENA_ETHTOOL_RX_BUFF_SIZE_CHANGE +#endif + +#if defined(ENA_XDP_SUPPORT) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0) +#define ENA_AF_XDP_SUPPORT +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 3, 0) +/* kernels older than 3.3.0 didn't have this function and + * used netif_tx_queue_stopped() for the same purpose + */ +static inline int netif_xmit_stopped(const struct netdev_queue *dev_queue) +{ + return netif_tx_queue_stopped(dev_queue); +} +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) +#define NAPIF_STATE_SCHED BIT(NAPI_STATE_SCHED) +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 17, 0) +#define bpf_warn_invalid_xdp_action(netdev, xdp_prog, verdict) \ + bpf_warn_invalid_xdp_action(verdict) +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) +#define HAS_BPF_HEADER +#endif + +#if ((LINUX_VERSION_CODE < KERNEL_VERSION(3, 8, 0)) && \ + !(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6, 7)))) +static inline int ktime_compare(const ktime_t cmp1, const ktime_t cmp2) +{ + if (cmp1.tv64 < cmp2.tv64) + return -1; + if (cmp1.tv64 > cmp2.tv64) + return 1; + return 0; +} +#endif + +#if ((LINUX_VERSION_CODE < KERNEL_VERSION(3, 16, 0)) && \ + !(RHEL_RELEASE_CODE && \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6, 7)) && \ + (RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7, 0)) && \ + (RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7, 1)))) +static inline bool ktime_after(const ktime_t cmp1, const ktime_t cmp2) +{ + return ktime_compare(cmp1, cmp2) > 0; +} +#endif + +#if IS_ENABLED(CONFIG_PTP_1588_CLOCK) + +#if defined(ENA_PHC_INCLUDE) && ((LINUX_VERSION_CODE >= KERNEL_VERSION(3, 0, 0)) || \ + (RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6, 4))) +#define ENA_PHC_SUPPORT +#endif /* ENA_PHC_SUPPORT */ + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 1, 0)) || \ + (RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 2)) +#define ENA_PHC_SUPPORT_GETTIME64 +#endif /* ENA_PHC_SUPPORT_GETTIME64 */ + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0)) || \ + (RHEL_RELEASE_CODE && \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 7)) && \ + (RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(8, 0))) +#define ENA_PHC_SUPPORT_GETTIME64_EXTENDED +#endif /* ENA_PHC_SUPPORT_GETTIME64_EXTENDED */ + +#if ((LINUX_VERSION_CODE < KERNEL_VERSION(3, 7, 0)) && \ + !(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6, 4)))) +#define ptp_clock_register(info, parent) ptp_clock_register(info) +#endif + +#endif /* CONFIG_PTP_1588_CLOCK */ + +#endif /* _KCOMPAT_H_ */ diff --git a/drivers/amazon/net/ena/net_dim.c b/drivers/amazon/net/ena/net_dim.c new file mode 100644 index 0000000000000..af46903cd53e2 --- /dev/null +++ b/drivers/amazon/net/ena/net_dim.c @@ -0,0 +1,245 @@ +// 
SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + */ + +#include "dim.h" + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 3, 0) + +/* + * Net DIM profiles: + * There are different set of profiles for each CQ period mode. + * There are different set of profiles for RX/TX CQs. + * Each profile size must be of NET_DIM_PARAMS_NUM_PROFILES + */ +#define NET_DIM_PARAMS_NUM_PROFILES 5 +#define NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE 256 +#define NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE 128 +#define NET_DIM_DEF_PROFILE_CQE 1 +#define NET_DIM_DEF_PROFILE_EQE 1 + +#define NET_DIM_RX_EQE_PROFILES { \ + {1, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {8, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {64, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {128, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {256, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ +} + +#define NET_DIM_RX_CQE_PROFILES { \ + {2, 256}, \ + {8, 128}, \ + {16, 64}, \ + {32, 64}, \ + {64, 64} \ +} + +#define NET_DIM_TX_EQE_PROFILES { \ + {1, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {8, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {32, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {64, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {128, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE} \ +} + +#define NET_DIM_TX_CQE_PROFILES { \ + {5, 128}, \ + {8, 64}, \ + {16, 32}, \ + {32, 32}, \ + {64, 32} \ +} + +static const struct dim_cq_moder +rx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = { + NET_DIM_RX_EQE_PROFILES, + NET_DIM_RX_CQE_PROFILES, +}; + +static const struct dim_cq_moder +tx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = { + NET_DIM_TX_EQE_PROFILES, + NET_DIM_TX_CQE_PROFILES, +}; + +struct dim_cq_moder +net_dim_get_rx_moderation(u8 cq_period_mode, int ix) +{ + struct dim_cq_moder cq_moder = rx_profile[cq_period_mode][ix]; + + cq_moder.cq_period_mode = cq_period_mode; + return cq_moder; +} + +struct dim_cq_moder +net_dim_get_def_rx_moderation(u8 cq_period_mode) +{ + u8 profile_ix = cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE ? + NET_DIM_DEF_PROFILE_CQE : NET_DIM_DEF_PROFILE_EQE; + + return net_dim_get_rx_moderation(cq_period_mode, profile_ix); +} + +struct dim_cq_moder +net_dim_get_tx_moderation(u8 cq_period_mode, int ix) +{ + struct dim_cq_moder cq_moder = tx_profile[cq_period_mode][ix]; + + cq_moder.cq_period_mode = cq_period_mode; + return cq_moder; +} + +struct dim_cq_moder +net_dim_get_def_tx_moderation(u8 cq_period_mode) +{ + u8 profile_ix = cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE ? + NET_DIM_DEF_PROFILE_CQE : NET_DIM_DEF_PROFILE_EQE; + + return net_dim_get_tx_moderation(cq_period_mode, profile_ix); +} + +static int net_dim_step(struct dim *dim) +{ + if (dim->tired == (NET_DIM_PARAMS_NUM_PROFILES * 2)) + return DIM_TOO_TIRED; + + switch (dim->tune_state) { + case DIM_PARKING_ON_TOP: + case DIM_PARKING_TIRED: + break; + case DIM_GOING_RIGHT: + if (dim->profile_ix == (NET_DIM_PARAMS_NUM_PROFILES - 1)) + return DIM_ON_EDGE; + dim->profile_ix++; + dim->steps_right++; + break; + case DIM_GOING_LEFT: + if (dim->profile_ix == 0) + return DIM_ON_EDGE; + dim->profile_ix--; + dim->steps_left++; + break; + } + + dim->tired++; + return DIM_STEPPED; +} + +static void net_dim_exit_parking(struct dim *dim) +{ + dim->tune_state = dim->profile_ix ? 
DIM_GOING_LEFT : DIM_GOING_RIGHT; + net_dim_step(dim); +} + +static int net_dim_stats_compare(struct dim_stats *curr, + struct dim_stats *prev) +{ + if (!prev->bpms) + return curr->bpms ? DIM_STATS_BETTER : DIM_STATS_SAME; + + if (IS_SIGNIFICANT_DIFF(curr->bpms, prev->bpms)) + return (curr->bpms > prev->bpms) ? DIM_STATS_BETTER : + DIM_STATS_WORSE; + + if (!prev->ppms) + return curr->ppms ? DIM_STATS_BETTER : + DIM_STATS_SAME; + + if (IS_SIGNIFICANT_DIFF(curr->ppms, prev->ppms)) + return (curr->ppms > prev->ppms) ? DIM_STATS_BETTER : + DIM_STATS_WORSE; + + if (!prev->epms) + return DIM_STATS_SAME; + + if (IS_SIGNIFICANT_DIFF(curr->epms, prev->epms)) + return (curr->epms < prev->epms) ? DIM_STATS_BETTER : + DIM_STATS_WORSE; + + return DIM_STATS_SAME; +} + +static bool net_dim_decision(struct dim_stats *curr_stats, struct dim *dim) +{ + int prev_state = dim->tune_state; + int prev_ix = dim->profile_ix; + int stats_res; + int step_res; + + switch (dim->tune_state) { + case DIM_PARKING_ON_TOP: + stats_res = net_dim_stats_compare(curr_stats, + &dim->prev_stats); + if (stats_res != DIM_STATS_SAME) + net_dim_exit_parking(dim); + break; + + case DIM_PARKING_TIRED: + dim->tired--; + if (!dim->tired) + net_dim_exit_parking(dim); + break; + + case DIM_GOING_RIGHT: + case DIM_GOING_LEFT: + stats_res = net_dim_stats_compare(curr_stats, + &dim->prev_stats); + if (stats_res != DIM_STATS_BETTER) + dim_turn(dim); + + if (dim_on_top(dim)) { + dim_park_on_top(dim); + break; + } + + step_res = net_dim_step(dim); + switch (step_res) { + case DIM_ON_EDGE: + dim_park_on_top(dim); + break; + case DIM_TOO_TIRED: + dim_park_tired(dim); + break; + } + + break; + } + + if (prev_state != DIM_PARKING_ON_TOP || + dim->tune_state != DIM_PARKING_ON_TOP) + dim->prev_stats = *curr_stats; + + return dim->profile_ix != prev_ix; +} + +void net_dim(struct dim *dim, struct dim_sample end_sample) +{ + struct dim_stats curr_stats; + u16 nevents; + + switch (dim->state) { + case DIM_MEASURE_IN_PROGRESS: + nevents = BIT_GAP(BITS_PER_TYPE(u16), + end_sample.event_ctr, + dim->start_sample.event_ctr); + if (nevents < DIM_NEVENTS) + break; + dim_calc_stats(&dim->start_sample, &end_sample, &curr_stats); + if (net_dim_decision(&curr_stats, dim)) { + dim->state = DIM_APPLY_NEW_PROFILE; + schedule_work(&dim->work); + break; + } + /* fall through */ + case DIM_START_MEASURE: + dim_update_sample(end_sample.event_ctr, end_sample.pkt_ctr, + end_sample.byte_ctr, &dim->start_sample); + dim->state = DIM_MEASURE_IN_PROGRESS; + break; + case DIM_APPLY_NEW_PROFILE: + break; + } +} + +#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(5, 3, 0) */ diff --git a/drivers/amazon/net/igb_uio/Makefile b/drivers/amazon/net/igb_uio/Makefile new file mode 100644 index 0000000000000..ebced2786f7c8 --- /dev/null +++ b/drivers/amazon/net/igb_uio/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_AMAZON_IGB_UIO) += igb_uio.o diff --git a/drivers/amazon/net/igb_uio/compat.h b/drivers/amazon/net/igb_uio/compat.h new file mode 100644 index 0000000000000..8dbb896ae1185 --- /dev/null +++ b/drivers/amazon/net/igb_uio/compat.h @@ -0,0 +1,154 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Minimal wrappers to allow compiling igb_uio on older kernels. 
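+ *
+ * Note: the RHEL_RELEASE_VERSION(a, b) fallback just below packs a
+ * major/minor pair into one integer, (a << 8) + (b), so release codes
+ * compare numerically: e.g. RHEL_RELEASE_VERSION(6, 4) is 0x0604
+ * (1540) and sorts after RHEL_RELEASE_VERSION(5, 9) (0x0509).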
+ */ + +#ifndef RHEL_RELEASE_VERSION +#define RHEL_RELEASE_VERSION(a, b) (((a) << 8) + (b)) +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 3, 0) +#define pci_cfg_access_lock pci_block_user_cfg_access +#define pci_cfg_access_unlock pci_unblock_user_cfg_access +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 0) +#define HAVE_PTE_MASK_PAGE_IOMAP +#endif + +#ifndef PCI_MSIX_ENTRY_SIZE +#define PCI_MSIX_ENTRY_SIZE 16 +#define PCI_MSIX_ENTRY_VECTOR_CTRL 12 +#define PCI_MSIX_ENTRY_CTRL_MASKBIT 1 +#endif + +/* + * for kernels < 2.6.38 and backported patch that moves MSI-X entry definition + * to pci_regs.h Those kernels has PCI_MSIX_ENTRY_SIZE defined but not + * PCI_MSIX_ENTRY_CTRL_MASKBIT + */ +#ifndef PCI_MSIX_ENTRY_CTRL_MASKBIT +#define PCI_MSIX_ENTRY_CTRL_MASKBIT 1 +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 34) && \ + (!(defined(RHEL_RELEASE_CODE) && \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(5, 9))) + +static int pci_num_vf(struct pci_dev *dev) +{ + struct iov { + int pos; + int nres; + u32 cap; + u16 ctrl; + u16 total; + u16 initial; + u16 nr_virtfn; + } *iov = (struct iov *)dev->sriov; + + if (!dev->is_physfn) + return 0; + + return iov->nr_virtfn; +} + +#endif /* < 2.6.34 */ + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 39) && \ + (!(defined(RHEL_RELEASE_CODE) && \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6, 4))) + +#define kstrtoul strict_strtoul + +#endif /* < 2.6.39 */ + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 3, 0) && \ + (!(defined(RHEL_RELEASE_CODE) && \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6, 3))) + +/* Check if INTX works to control irq's. + * Set's INTX_DISABLE flag and reads it back + */ +static bool pci_intx_mask_supported(struct pci_dev *pdev) +{ + bool mask_supported = false; + uint16_t orig, new; + + pci_block_user_cfg_access(pdev); + pci_read_config_word(pdev, PCI_COMMAND, &orig); + pci_write_config_word(pdev, PCI_COMMAND, + orig ^ PCI_COMMAND_INTX_DISABLE); + pci_read_config_word(pdev, PCI_COMMAND, &new); + + if ((new ^ orig) & ~PCI_COMMAND_INTX_DISABLE) { + dev_err(&pdev->dev, "Command register changed from " + "0x%x to 0x%x: driver or hardware bug?\n", orig, new); + } else if ((new ^ orig) & PCI_COMMAND_INTX_DISABLE) { + mask_supported = true; + pci_write_config_word(pdev, PCI_COMMAND, orig); + } + pci_unblock_user_cfg_access(pdev); + + return mask_supported; +} + +static bool pci_check_and_mask_intx(struct pci_dev *pdev) +{ + bool pending; + uint32_t status; + + pci_block_user_cfg_access(pdev); + pci_read_config_dword(pdev, PCI_COMMAND, &status); + + /* interrupt is not ours, goes to out */ + pending = (((status >> 16) & PCI_STATUS_INTERRUPT) != 0); + if (pending) { + uint16_t old, new; + + old = status; + if (status != 0) + new = old & (~PCI_COMMAND_INTX_DISABLE); + else + new = old | PCI_COMMAND_INTX_DISABLE; + + if (old != new) + pci_write_config_word(pdev, PCI_COMMAND, new); + } + pci_unblock_user_cfg_access(pdev); + + return pending; +} + +#endif /* < 3.3.0 */ + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) +#define HAVE_PCI_IS_BRIDGE_API 1 +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) +#define HAVE_MSI_LIST_IN_GENERIC_DEVICE 1 +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0) +#define HAVE_PCI_MSI_MASK_IRQ 1 +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 8, 0) +#define HAVE_ALLOC_IRQ_VECTORS 1 +#endif + +static inline bool igbuio_kernel_is_locked_down(void) +{ +#ifdef CONFIG_LOCK_DOWN_KERNEL +#ifdef CONFIG_LOCK_DOWN_IN_EFI_SECURE_BOOT + return kernel_is_locked_down(NULL); 
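+	/* Lockdown probing is best-effort: some distribution kernels
+	 * ship kernel_is_locked_down() with a reason-string argument
+	 * (passed as NULL above), the EFI secure-boot variant below
+	 * takes no argument, and kernels with neither config simply
+	 * report not locked down.
+	 */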
+#elif defined(CONFIG_EFI_SECURE_BOOT_LOCK_DOWN) + return kernel_is_locked_down(); +#else + return false; +#endif +#else + return false; +#endif +} diff --git a/drivers/amazon/net/igb_uio/igb_uio.c b/drivers/amazon/net/igb_uio/igb_uio.c new file mode 100644 index 0000000000000..ea439d131de1a --- /dev/null +++ b/drivers/amazon/net/igb_uio/igb_uio.c @@ -0,0 +1,674 @@ +// SPDX-License-Identifier: GPL-2.0 +/*- + * Copyright(c) 2010-2017 Intel Corporation. All rights reserved. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * These enum and macro definitions are copied from the + * file rte_pci_dev_features.h + */ +enum rte_intr_mode { + RTE_INTR_MODE_NONE = 0, + RTE_INTR_MODE_LEGACY, + RTE_INTR_MODE_MSI, + RTE_INTR_MODE_MSIX +}; +#define RTE_INTR_MODE_NONE_NAME "none" +#define RTE_INTR_MODE_LEGACY_NAME "legacy" +#define RTE_INTR_MODE_MSI_NAME "msi" +#define RTE_INTR_MODE_MSIX_NAME "msix" + + +#include "compat.h" + +/** + * A structure describing the private information for a uio device. + */ +struct rte_uio_pci_dev { + struct uio_info info; + struct pci_dev *pdev; + enum rte_intr_mode mode; + atomic_t refcnt; +}; + +static int wc_activate; +static char *intr_mode; +static enum rte_intr_mode igbuio_intr_mode_preferred = RTE_INTR_MODE_MSIX; +/* sriov sysfs */ +static ssize_t +show_max_vfs(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, 10, "%u\n", dev_num_vf(dev)); +} + +static ssize_t +store_max_vfs(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + int err = 0; + unsigned long max_vfs; + struct pci_dev *pdev = to_pci_dev(dev); + + if (0 != kstrtoul(buf, 0, &max_vfs)) + return -EINVAL; + + if (0 == max_vfs) + pci_disable_sriov(pdev); + else if (0 == pci_num_vf(pdev)) + err = pci_enable_sriov(pdev, max_vfs); + else /* do nothing if change max_vfs number */ + err = -EINVAL; + + return err ? err : count; +} + +static DEVICE_ATTR(max_vfs, S_IRUGO | S_IWUSR, show_max_vfs, store_max_vfs); + +static struct attribute *dev_attrs[] = { + &dev_attr_max_vfs.attr, + NULL, +}; + +static const struct attribute_group dev_attr_grp = { + .attrs = dev_attrs, +}; + +#ifndef HAVE_PCI_MSI_MASK_IRQ +/* + * It masks the msix on/off of generating MSI-X messages. + */ +static void +igbuio_msix_mask_irq(struct msi_desc *desc, s32 state) +{ + u32 mask_bits = desc->masked; + unsigned int offset = desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE + + PCI_MSIX_ENTRY_VECTOR_CTRL; + + if (state != 0) + mask_bits &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT; + else + mask_bits |= PCI_MSIX_ENTRY_CTRL_MASKBIT; + + if (mask_bits != desc->masked) { + writel(mask_bits, desc->mask_base + offset); + readl(desc->mask_base); + desc->masked = mask_bits; + } +} + +/* + * It masks the msi on/off of generating MSI messages. 
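+ *
+ * With multi-vector MSI the vectors get consecutive IRQ numbers
+ * starting at pdev->irq, so (desc->irq - pdev->irq) recovers the
+ * vector index and (1 << index) selects its bit in the per-device
+ * mask register at desc->mask_pos; devices without
+ * desc->msi_attrib.maskbit cannot be masked this way and are left
+ * untouched.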
+ */ +static void +igbuio_msi_mask_irq(struct pci_dev *pdev, struct msi_desc *desc, int32_t state) +{ + u32 mask_bits = desc->masked; + u32 offset = desc->irq - pdev->irq; + u32 mask = 1 << offset; + + if (!desc->msi_attrib.maskbit) + return; + + if (state != 0) + mask_bits &= ~mask; + else + mask_bits |= mask; + + if (mask_bits != desc->masked) { + pci_write_config_dword(pdev, desc->mask_pos, mask_bits); + desc->masked = mask_bits; + } +} + +static void +igbuio_mask_irq(struct pci_dev *pdev, enum rte_intr_mode mode, s32 irq_state) +{ + struct msi_desc *desc; + struct list_head *msi_list; + +#ifdef HAVE_MSI_LIST_IN_GENERIC_DEVICE + msi_list = &pdev->dev.msi_list; +#else + msi_list = &pdev->msi_list; +#endif + + if (mode == RTE_INTR_MODE_MSIX) { + list_for_each_entry(desc, msi_list, list) + igbuio_msix_mask_irq(desc, irq_state); + } else if (mode == RTE_INTR_MODE_MSI) { + list_for_each_entry(desc, msi_list, list) + igbuio_msi_mask_irq(pdev, desc, irq_state); + } +} +#endif + +/** + * This is the irqcontrol callback to be registered to uio_info. + * It can be used to disable/enable interrupt from user space processes. + * + * @param info + * pointer to uio_info. + * @param irq_state + * state value. 1 to enable interrupt, 0 to disable interrupt. + * + * @return + * - On success, 0. + * - On failure, a negative value. + */ +static int +igbuio_pci_irqcontrol(struct uio_info *info, s32 irq_state) +{ + struct rte_uio_pci_dev *udev = info->priv; + struct pci_dev *pdev = udev->pdev; + +#ifdef HAVE_PCI_MSI_MASK_IRQ + struct irq_data *irq = irq_get_irq_data(udev->info.irq); +#endif + + pci_cfg_access_lock(pdev); + + if (udev->mode == RTE_INTR_MODE_MSIX || udev->mode == RTE_INTR_MODE_MSI) { +#ifdef HAVE_PCI_MSI_MASK_IRQ + if (irq_state == 1) + pci_msi_unmask_irq(irq); + else + pci_msi_mask_irq(irq); +#else + igbuio_mask_irq(pdev, udev->mode, irq_state); +#endif + } + + if (udev->mode == RTE_INTR_MODE_LEGACY) + pci_intx(pdev, !!irq_state); + + pci_cfg_access_unlock(pdev); + + return 0; +} + +/** + * This is interrupt handler which will check if the interrupt is for the right device. + * If yes, disable it here and will be enable later. 
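+ *
+ * For shared legacy INTX, pci_check_and_mask_intx() reads PCI_STATUS
+ * to confirm the interrupt really came from this device and masks it
+ * via PCI_COMMAND_INTX_DISABLE; MSI/MSI-X vectors are never shared,
+ * so the event is simply forwarded to user space with
+ * uio_event_notify().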
+ */ +static irqreturn_t +igbuio_pci_irqhandler(int irq, void *dev_id) +{ + struct rte_uio_pci_dev *udev = (struct rte_uio_pci_dev *)dev_id; + struct uio_info *info = &udev->info; + + /* Legacy mode need to mask in hardware */ + if (udev->mode == RTE_INTR_MODE_LEGACY && + !pci_check_and_mask_intx(udev->pdev)) + return IRQ_NONE; + + uio_event_notify(info); + + /* Message signal mode, no share IRQ and automasked */ + return IRQ_HANDLED; +} + +static int +igbuio_pci_enable_interrupts(struct rte_uio_pci_dev *udev) +{ + int err = 0; +#ifndef HAVE_ALLOC_IRQ_VECTORS + struct msix_entry msix_entry; +#endif + + switch (igbuio_intr_mode_preferred) { + case RTE_INTR_MODE_MSIX: + /* Only 1 msi-x vector needed */ +#ifndef HAVE_ALLOC_IRQ_VECTORS + msix_entry.entry = 0; + if (pci_enable_msix(udev->pdev, &msix_entry, 1) == 0) { + dev_dbg(&udev->pdev->dev, "using MSI-X"); + udev->info.irq_flags = IRQF_NO_THREAD; + udev->info.irq = msix_entry.vector; + udev->mode = RTE_INTR_MODE_MSIX; + break; + } +#else + if (pci_alloc_irq_vectors(udev->pdev, 1, 1, PCI_IRQ_MSIX) == 1) { + dev_dbg(&udev->pdev->dev, "using MSI-X"); + udev->info.irq_flags = IRQF_NO_THREAD; + udev->info.irq = pci_irq_vector(udev->pdev, 0); + udev->mode = RTE_INTR_MODE_MSIX; + break; + } +#endif + + /* falls through - to MSI */ + case RTE_INTR_MODE_MSI: +#ifndef HAVE_ALLOC_IRQ_VECTORS + if (pci_enable_msi(udev->pdev) == 0) { + dev_dbg(&udev->pdev->dev, "using MSI"); + udev->info.irq_flags = IRQF_NO_THREAD; + udev->info.irq = udev->pdev->irq; + udev->mode = RTE_INTR_MODE_MSI; + break; + } +#else + if (pci_alloc_irq_vectors(udev->pdev, 1, 1, PCI_IRQ_MSI) == 1) { + dev_dbg(&udev->pdev->dev, "using MSI"); + udev->info.irq_flags = IRQF_NO_THREAD; + udev->info.irq = pci_irq_vector(udev->pdev, 0); + udev->mode = RTE_INTR_MODE_MSI; + break; + } +#endif + /* falls through - to INTX */ + case RTE_INTR_MODE_LEGACY: + if (pci_intx_mask_supported(udev->pdev)) { + dev_dbg(&udev->pdev->dev, "using INTX"); + udev->info.irq_flags = IRQF_SHARED | IRQF_NO_THREAD; + udev->info.irq = udev->pdev->irq; + udev->mode = RTE_INTR_MODE_LEGACY; + break; + } + dev_notice(&udev->pdev->dev, "PCI INTX mask not supported\n"); + /* falls through - to no IRQ */ + case RTE_INTR_MODE_NONE: + udev->mode = RTE_INTR_MODE_NONE; + udev->info.irq = UIO_IRQ_NONE; + break; + + default: + dev_err(&udev->pdev->dev, "invalid IRQ mode %u", + igbuio_intr_mode_preferred); + udev->info.irq = UIO_IRQ_NONE; + err = -EINVAL; + } + + if (udev->info.irq != UIO_IRQ_NONE) + err = request_irq(udev->info.irq, igbuio_pci_irqhandler, + udev->info.irq_flags, udev->info.name, + udev); + dev_info(&udev->pdev->dev, "uio device registered with irq %ld\n", + udev->info.irq); + + return err; +} + +static void +igbuio_pci_disable_interrupts(struct rte_uio_pci_dev *udev) +{ + if (udev->info.irq) { + free_irq(udev->info.irq, udev); + udev->info.irq = 0; + } + +#ifndef HAVE_ALLOC_IRQ_VECTORS + if (udev->mode == RTE_INTR_MODE_MSIX) + pci_disable_msix(udev->pdev); + if (udev->mode == RTE_INTR_MODE_MSI) + pci_disable_msi(udev->pdev); +#else + if (udev->mode == RTE_INTR_MODE_MSIX || + udev->mode == RTE_INTR_MODE_MSI) + pci_free_irq_vectors(udev->pdev); +#endif +} + + +/** + * This gets called while opening uio device file. 
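+ *
+ * The refcount makes open idempotent: only the 0 -> 1 transition
+ * (atomic_inc_return() == 1) re-enables bus mastering and sets up
+ * interrupts; subsequent opens share them, and igbuio_pci_release()
+ * tears them down again on the last close.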
+ */ +static int +igbuio_pci_open(struct uio_info *info, struct inode *inode) +{ + struct rte_uio_pci_dev *udev = info->priv; + struct pci_dev *dev = udev->pdev; + int err; + + if (atomic_inc_return(&udev->refcnt) != 1) + return 0; + + /* set bus master, which was cleared by the reset function */ + pci_set_master(dev); + + /* enable interrupts */ + err = igbuio_pci_enable_interrupts(udev); + if (err) { + atomic_dec(&udev->refcnt); + dev_err(&dev->dev, "Enable interrupt fails\n"); + } + return err; +} + +static int +igbuio_pci_release(struct uio_info *info, struct inode *inode) +{ + struct rte_uio_pci_dev *udev = info->priv; + struct pci_dev *dev = udev->pdev; + + if (atomic_dec_and_test(&udev->refcnt)) { + /* disable interrupts */ + igbuio_pci_disable_interrupts(udev); + + /* stop the device from further DMA */ + pci_clear_master(dev); + } + + return 0; +} + +/* Remap pci resources described by bar #pci_bar in uio resource n. */ +static int +igbuio_pci_setup_iomem(struct pci_dev *dev, struct uio_info *info, + int n, int pci_bar, const char *name) +{ + unsigned long addr, len; + void *internal_addr; + + if (n >= ARRAY_SIZE(info->mem)) + return -EINVAL; + + addr = pci_resource_start(dev, pci_bar); + len = pci_resource_len(dev, pci_bar); + if (addr == 0 || len == 0) + return -1; + if (wc_activate == 0) { + internal_addr = ioremap(addr, len); + if (internal_addr == NULL) + return -1; + } else { + internal_addr = NULL; + } + info->mem[n].name = name; + info->mem[n].addr = addr; + info->mem[n].internal_addr = internal_addr; + info->mem[n].size = len; + info->mem[n].memtype = UIO_MEM_PHYS; + return 0; +} + +/* Get pci port io resources described by bar #pci_bar in uio resource n. */ +static int +igbuio_pci_setup_ioport(struct pci_dev *dev, struct uio_info *info, + int n, int pci_bar, const char *name) +{ + unsigned long addr, len; + + if (n >= ARRAY_SIZE(info->port)) + return -EINVAL; + + addr = pci_resource_start(dev, pci_bar); + len = pci_resource_len(dev, pci_bar); + if (addr == 0 || len == 0) + return -EINVAL; + + info->port[n].name = name; + info->port[n].start = addr; + info->port[n].size = len; + info->port[n].porttype = UIO_PORT_X86; + + return 0; +} + +/* Unmap previously ioremap'd resources */ +static void +igbuio_pci_release_iomem(struct uio_info *info) +{ + int i; + + for (i = 0; i < MAX_UIO_MAPS; i++) { + if (info->mem[i].internal_addr) + iounmap(info->mem[i].internal_addr); + } +} + +static int +igbuio_setup_bars(struct pci_dev *dev, struct uio_info *info) +{ + int i, iom, iop, ret; + unsigned long flags; + static const char *bar_names[PCI_STD_RESOURCE_END + 1] = { + "BAR0", + "BAR1", + "BAR2", + "BAR3", + "BAR4", + "BAR5", + }; + + iom = 0; + iop = 0; + + for (i = 0; i < ARRAY_SIZE(bar_names); i++) { + if (pci_resource_len(dev, i) != 0 && + pci_resource_start(dev, i) != 0) { + flags = pci_resource_flags(dev, i); + if (flags & IORESOURCE_MEM) { + ret = igbuio_pci_setup_iomem(dev, info, iom, + i, bar_names[i]); + if (ret != 0) + return ret; + iom++; + } else if (flags & IORESOURCE_IO) { + ret = igbuio_pci_setup_ioport(dev, info, iop, + i, bar_names[i]); + if (ret != 0) + return ret; + iop++; + } + } + } + + return (iom != 0 || iop != 0) ? 
ret : -ENOENT; +} + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 8, 0) +static int __devinit +#else +static int +#endif +igbuio_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) +{ + struct rte_uio_pci_dev *udev; + dma_addr_t map_dma_addr; + void *map_addr; + int err; + +#ifdef HAVE_PCI_IS_BRIDGE_API + if (pci_is_bridge(dev)) { + dev_warn(&dev->dev, "Ignoring PCI bridge device\n"); + return -ENODEV; + } +#endif + + udev = kzalloc(sizeof(struct rte_uio_pci_dev), GFP_KERNEL); + if (!udev) + return -ENOMEM; + + /* + * enable device: ask low-level code to enable I/O and + * memory + */ + err = pci_enable_device(dev); + if (err != 0) { + dev_err(&dev->dev, "Cannot enable PCI device\n"); + goto fail_free; + } + + /* enable bus mastering on the device */ + pci_set_master(dev); + + /* remap IO memory */ + err = igbuio_setup_bars(dev, &udev->info); + if (err != 0) + goto fail_release_iomem; + + /* set 64-bit DMA mask */ + err = pci_set_dma_mask(dev, DMA_BIT_MASK(64)); + if (err != 0) { + dev_err(&dev->dev, "Cannot set DMA mask\n"); + goto fail_release_iomem; + } + + err = pci_set_consistent_dma_mask(dev, DMA_BIT_MASK(64)); + if (err != 0) { + dev_err(&dev->dev, "Cannot set consistent DMA mask\n"); + goto fail_release_iomem; + } + + /* fill uio infos */ + udev->info.name = "igb_uio"; + udev->info.version = "0.1"; + udev->info.irqcontrol = igbuio_pci_irqcontrol; + udev->info.open = igbuio_pci_open; + udev->info.release = igbuio_pci_release; + udev->info.priv = udev; + udev->pdev = dev; + atomic_set(&udev->refcnt, 0); + + err = sysfs_create_group(&dev->dev.kobj, &dev_attr_grp); + if (err != 0) + goto fail_release_iomem; + + /* register uio driver */ + err = uio_register_device(&dev->dev, &udev->info); + if (err != 0) + goto fail_remove_group; + + pci_set_drvdata(dev, udev); + + /* + * Doing a harmless dma mapping for attaching the device to + * the iommu identity mapping if kernel boots with iommu=pt. + * Note this is not a problem if no IOMMU at all. 
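+ *
+ * The 1 KiB dma_alloc_coherent()/dma_free_coherent() round trip
+ * below forces the DMA layer to attach the device to an IOMMU
+ * domain (the identity map under iommu=pt) before user space starts
+ * driving DMA through UIO; the buffer itself is thrown away.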
+ */ + map_addr = dma_alloc_coherent(&dev->dev, 1024, &map_dma_addr, + GFP_KERNEL); + if (map_addr) + memset(map_addr, 0, 1024); + + if (!map_addr) + dev_info(&dev->dev, "dma mapping failed\n"); + else { + dev_info(&dev->dev, "mapping 1K dma=%#llx host=%p\n", + (unsigned long long)map_dma_addr, map_addr); + + dma_free_coherent(&dev->dev, 1024, map_addr, map_dma_addr); + dev_info(&dev->dev, "unmapping 1K dma=%#llx host=%p\n", + (unsigned long long)map_dma_addr, map_addr); + } + + return 0; + +fail_remove_group: + sysfs_remove_group(&dev->dev.kobj, &dev_attr_grp); +fail_release_iomem: + igbuio_pci_release_iomem(&udev->info); + pci_disable_device(dev); +fail_free: + kfree(udev); + + return err; +} + +static void +igbuio_pci_remove(struct pci_dev *dev) +{ + struct rte_uio_pci_dev *udev = pci_get_drvdata(dev); + + igbuio_pci_release(&udev->info, NULL); + + sysfs_remove_group(&dev->dev.kobj, &dev_attr_grp); + uio_unregister_device(&udev->info); + igbuio_pci_release_iomem(&udev->info); + pci_disable_device(dev); + pci_set_drvdata(dev, NULL); + kfree(udev); +} + +static int +igbuio_config_intr_mode(char *intr_str) +{ + if (!intr_str) { + pr_info("Use MSIX interrupt by default\n"); + return 0; + } + + if (!strcmp(intr_str, RTE_INTR_MODE_MSIX_NAME)) { + igbuio_intr_mode_preferred = RTE_INTR_MODE_MSIX; + pr_info("Use MSIX interrupt\n"); + } else if (!strcmp(intr_str, RTE_INTR_MODE_MSI_NAME)) { + igbuio_intr_mode_preferred = RTE_INTR_MODE_MSI; + pr_info("Use MSI interrupt\n"); + } else if (!strcmp(intr_str, RTE_INTR_MODE_LEGACY_NAME)) { + igbuio_intr_mode_preferred = RTE_INTR_MODE_LEGACY; + pr_info("Use legacy interrupt\n"); + } else { + pr_info("Error: bad parameter - %s\n", intr_str); + return -EINVAL; + } + + return 0; +} + +static struct pci_driver igbuio_pci_driver = { + .name = "igb_uio", + .id_table = NULL, + .probe = igbuio_pci_probe, + .remove = igbuio_pci_remove, +}; + +static int __init +igbuio_pci_init_module(void) +{ + int ret; + + if (igbuio_kernel_is_locked_down()) { + pr_err("Not able to use module, kernel lock down is enabled\n"); + return -EINVAL; + } + + if (wc_activate != 0) + pr_info("wc_activate is set\n"); + + ret = igbuio_config_intr_mode(intr_mode); + if (ret < 0) + return ret; + + return pci_register_driver(&igbuio_pci_driver); +} + +static void __exit +igbuio_pci_exit_module(void) +{ + pci_unregister_driver(&igbuio_pci_driver); +} + +module_init(igbuio_pci_init_module); +module_exit(igbuio_pci_exit_module); + +module_param(intr_mode, charp, S_IRUGO); +MODULE_PARM_DESC(intr_mode, +"igb_uio interrupt mode (default=msix):\n" +" " RTE_INTR_MODE_MSIX_NAME " Use MSIX interrupt\n" +" " RTE_INTR_MODE_MSI_NAME " Use MSI interrupt\n" +" " RTE_INTR_MODE_LEGACY_NAME " Use Legacy interrupt\n" +"\n"); + +module_param(wc_activate, int, 0); +MODULE_PARM_DESC(wc_activate, +"Activate support for write combining (WC) (default=0)\n" +" 0 - disable\n" +" other - enable\n"); + +MODULE_DESCRIPTION("UIO driver for Intel IGB PCI cards"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Intel Corporation"); From ad9c829ecd381a7c988efb075d986ce628a7f55a Mon Sep 17 00:00:00 2001 From: Shaoying Xu Date: Tue, 10 Jan 2023 21:26:58 +0000 Subject: [PATCH 016/175] ENA: Update to v2.8.1 Source: https://github.com/amzn/amzn-drivers/ Change Log: ## r2.8.1 release notes **New Features** * Add extended metrics mechanism support * Add conntrack customer metric to ethtool **Bug Fixes** * Fix compilation issues on SLES 15 SP4 * Fix compilation errors in RHEL 8.7, 9.0 * Configure TX rings mem policy in reset flow **Minor 
Changes** * Add napi_build_skb support * Add napi_consume_skb * Align ena_alloc_map_page signature * Move from strlcpy with unused retval to strscpy * Add status check for strscpy calls * Backport napi_alloc_skb usage --- drivers/amazon/net/ena/ena_admin_defs.h | 31 ++++ drivers/amazon/net/ena/ena_com.c | 157 +++++++++++++++---- drivers/amazon/net/ena/ena_com.h | 59 +++++++ drivers/amazon/net/ena/ena_common_defs.h | 0 drivers/amazon/net/ena/ena_devlink.c | 2 + drivers/amazon/net/ena/ena_eth_io_defs.h | 0 drivers/amazon/net/ena/ena_ethtool.c | 187 ++++++++++++++++------- drivers/amazon/net/ena/ena_netdev.c | 77 ++++++---- drivers/amazon/net/ena/ena_netdev.h | 4 +- drivers/amazon/net/ena/ena_pci_id_tbl.h | 0 drivers/amazon/net/ena/ena_regs_defs.h | 0 drivers/amazon/net/ena/ena_sysfs.c | 0 drivers/amazon/net/ena/ena_sysfs.h | 0 drivers/amazon/net/ena/kcompat.h | 102 +++++++++++-- 14 files changed, 496 insertions(+), 123 deletions(-) mode change 100755 => 100644 drivers/amazon/net/ena/ena_common_defs.h mode change 100755 => 100644 drivers/amazon/net/ena/ena_eth_io_defs.h mode change 100755 => 100644 drivers/amazon/net/ena/ena_pci_id_tbl.h mode change 100755 => 100644 drivers/amazon/net/ena/ena_regs_defs.h mode change 100755 => 100644 drivers/amazon/net/ena/ena_sysfs.c mode change 100755 => 100644 drivers/amazon/net/ena/ena_sysfs.h diff --git a/drivers/amazon/net/ena/ena_admin_defs.h b/drivers/amazon/net/ena/ena_admin_defs.h index a52f588445039..b3a9f1aec52b3 100644 --- a/drivers/amazon/net/ena/ena_admin_defs.h +++ b/drivers/amazon/net/ena/ena_admin_defs.h @@ -10,6 +10,21 @@ #define ENA_ADMIN_RSS_KEY_PARTS 10 +#define ENA_ADMIN_CUSTOMER_METRICS_SUPPORT_MASK 0x3F +#define ENA_ADMIN_CUSTOMER_METRICS_MIN_SUPPORT_MASK 0x1F + + /* customer metrics - in correlation with + * ENA_ADMIN_CUSTOMER_METRICS_SUPPORT_MASK + */ +enum ena_admin_customer_metrics_id { + ENA_ADMIN_BW_IN_ALLOWANCE_EXCEEDED = 0, + ENA_ADMIN_BW_OUT_ALLOWANCE_EXCEEDED = 1, + ENA_ADMIN_PPS_ALLOWANCE_EXCEEDED = 2, + ENA_ADMIN_CONNTRACK_ALLOWANCE_EXCEEDED = 3, + ENA_ADMIN_LINKLOCAL_ALLOWANCE_EXCEEDED = 4, + ENA_ADMIN_CONNTRACK_ALLOWANCE_AVAILABLE = 5, +}; + enum ena_admin_aq_opcode { ENA_ADMIN_CREATE_SQ = 1, ENA_ADMIN_DESTROY_SQ = 2, @@ -59,6 +74,7 @@ enum ena_admin_aq_caps_id { ENA_ADMIN_ENI_STATS = 0, /* ENA SRD customer metrics */ ENA_ADMIN_ENA_SRD_INFO = 1, + ENA_ADMIN_CUSTOMER_METRICS = 2, }; enum ena_admin_placement_policy_type { @@ -109,6 +125,8 @@ enum ena_admin_get_stats_type { ENA_ADMIN_GET_STATS_TYPE_ENI = 2, /* extra HW stats for ENA SRD */ ENA_ADMIN_GET_STATS_TYPE_ENA_SRD = 3, + ENA_ADMIN_GET_STATS_TYPE_CUSTOMER_METRICS = 4, + }; enum ena_admin_get_stats_scope { @@ -387,6 +405,9 @@ struct ena_admin_aq_get_stats_cmd { * stats of other device */ u16 device_id; + + /* a bitmap representing the requested metric values */ + u64 requested_metrics; }; /* Basic Statistics Command. */ @@ -469,6 +490,14 @@ struct ena_admin_ena_srd_info { struct ena_admin_ena_srd_stats ena_srd_stats; }; +/* Customer Metrics Command. 
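+ * The bitmap follows ena_admin_customer_metrics_id bit order; a
+ * device that fills all six currently defined metrics reports 0x3F.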
*/ +struct ena_admin_customer_metrics { + /* A bitmap representing the reported customer metrics according to + * the order they are reported + */ + u64 reported_metrics; +}; + struct ena_admin_acq_get_stats_resp { struct ena_admin_acq_common_desc acq_common_desc; @@ -480,6 +509,8 @@ struct ena_admin_acq_get_stats_resp { struct ena_admin_eni_stats eni_stats; struct ena_admin_ena_srd_info ena_srd_info; + + struct ena_admin_customer_metrics customer_metrics; } u; }; diff --git a/drivers/amazon/net/ena/ena_com.c b/drivers/amazon/net/ena/ena_com.c index 520dad1e549af..9bd064ff0f6c7 100644 --- a/drivers/amazon/net/ena/ena_com.c +++ b/drivers/amazon/net/ena/ena_com.c @@ -2176,6 +2176,58 @@ int ena_com_get_link_params(struct ena_com_dev *ena_dev, return ena_com_get_feature(ena_dev, resp, ENA_ADMIN_LINK_CONFIG, 0); } +static int ena_get_dev_stats(struct ena_com_dev *ena_dev, + struct ena_com_stats_ctx *ctx, + enum ena_admin_get_stats_type type) +{ + struct ena_admin_acq_get_stats_resp *get_resp = &ctx->get_resp; + struct ena_admin_aq_get_stats_cmd *get_cmd = &ctx->get_cmd; + struct ena_com_admin_queue *admin_queue; + int ret; + + admin_queue = &ena_dev->admin_queue; + + get_cmd->aq_common_descriptor.opcode = ENA_ADMIN_GET_STATS; + get_cmd->aq_common_descriptor.flags = 0; + get_cmd->type = type; + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)get_cmd, + sizeof(*get_cmd), + (struct ena_admin_acq_entry *)get_resp, + sizeof(*get_resp)); + + if (unlikely(ret)) + netdev_err(ena_dev->net_device, + "Failed to get stats. error: %d\n", ret); + + return ret; +} + +static void ena_com_set_supported_customer_metrics(struct ena_com_dev *ena_dev) +{ + struct ena_customer_metrics *customer_metrics; + struct ena_com_stats_ctx ctx; + int ret; + + customer_metrics = &ena_dev->customer_metrics; + if (!ena_com_get_cap(ena_dev, ENA_ADMIN_CUSTOMER_METRICS)) { + customer_metrics->supported_metrics = ENA_ADMIN_CUSTOMER_METRICS_MIN_SUPPORT_MASK; + return; + } + + memset(&ctx, 0x0, sizeof(ctx)); + ctx.get_cmd.requested_metrics = ENA_ADMIN_CUSTOMER_METRICS_SUPPORT_MASK; + ret = ena_get_dev_stats(ena_dev, &ctx, ENA_ADMIN_GET_STATS_TYPE_CUSTOMER_METRICS); + if (likely(ret == 0)) + customer_metrics->supported_metrics = + ctx.get_resp.u.customer_metrics.reported_metrics; + else + netdev_err(ena_dev->net_device, + "Failed to query customer metrics support. error: %d\n", + ret); +} + int ena_com_get_dev_attr_feat(struct ena_com_dev *ena_dev, struct ena_com_dev_get_features_ctx *get_feat_ctx) { @@ -2259,6 +2311,8 @@ int ena_com_get_dev_attr_feat(struct ena_com_dev *ena_dev, else return rc; + ena_com_set_supported_customer_metrics(ena_dev); + return 0; } @@ -2413,34 +2467,6 @@ int ena_com_dev_reset(struct ena_com_dev *ena_dev, return 0; } -static int ena_get_dev_stats(struct ena_com_dev *ena_dev, - struct ena_com_stats_ctx *ctx, - enum ena_admin_get_stats_type type) -{ - struct ena_admin_aq_get_stats_cmd *get_cmd = &ctx->get_cmd; - struct ena_admin_acq_get_stats_resp *get_resp = &ctx->get_resp; - struct ena_com_admin_queue *admin_queue; - int ret; - - admin_queue = &ena_dev->admin_queue; - - get_cmd->aq_common_descriptor.opcode = ENA_ADMIN_GET_STATS; - get_cmd->aq_common_descriptor.flags = 0; - get_cmd->type = type; - - ret = ena_com_execute_admin_command(admin_queue, - (struct ena_admin_aq_entry *)get_cmd, - sizeof(*get_cmd), - (struct ena_admin_acq_entry *)get_resp, - sizeof(*get_resp)); - - if (unlikely(ret)) - netdev_err(ena_dev->net_device, - "Failed to get stats. 
error: %d\n", ret); - - return ret; -} - int ena_com_get_eni_stats(struct ena_com_dev *ena_dev, struct ena_admin_eni_stats *stats) { @@ -2500,6 +2526,53 @@ int ena_com_get_dev_basic_stats(struct ena_com_dev *ena_dev, return ret; } +int ena_com_get_customer_metrics(struct ena_com_dev *ena_dev, char *buffer, u32 len) +{ + struct ena_admin_aq_get_stats_cmd *get_cmd; + struct ena_com_stats_ctx ctx; + int ret; + + if (unlikely(len > ena_dev->customer_metrics.buffer_len)) { + netdev_err(ena_dev->net_device, + "Invalid buffer size %u. The given buffer is too big.\n", + len); + return -EINVAL; + } + + if (!ena_com_get_cap(ena_dev, ENA_ADMIN_CUSTOMER_METRICS)) { + netdev_err(ena_dev->net_device, "Capability %d not supported.\n", + ENA_ADMIN_CUSTOMER_METRICS); + return -EOPNOTSUPP; + } + + if (!ena_dev->customer_metrics.supported_metrics) { + netdev_err(ena_dev->net_device, + "No supported customer metrics.\n"); + return -EOPNOTSUPP; + } + + get_cmd = &ctx.get_cmd; + memset(&ctx, 0x0, sizeof(ctx)); + ret = ena_com_mem_addr_set(ena_dev, + &get_cmd->u.control_buffer.address, + ena_dev->customer_metrics.buffer_dma_addr); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, "Memory address set failed.\n"); + return ret; + } + + get_cmd->u.control_buffer.length = ena_dev->customer_metrics.buffer_len; + get_cmd->requested_metrics = ena_dev->customer_metrics.supported_metrics; + ret = ena_get_dev_stats(ena_dev, &ctx, ENA_ADMIN_GET_STATS_TYPE_CUSTOMER_METRICS); + if (likely(ret == 0)) + memcpy(buffer, ena_dev->customer_metrics.buffer_virt_addr, len); + else + netdev_err(ena_dev->net_device, + "Failed to get customer metrics. error: %d\n", ret); + + return ret; +} + int ena_com_set_dev_mtu(struct ena_com_dev *ena_dev, u32 mtu) { struct ena_com_admin_queue *admin_queue; @@ -3052,6 +3125,22 @@ int ena_com_allocate_debug_area(struct ena_com_dev *ena_dev, return 0; } +int ena_com_allocate_customer_metrics_buffer(struct ena_com_dev *ena_dev) +{ + struct ena_customer_metrics *customer_metrics = &ena_dev->customer_metrics; + + customer_metrics->buffer_len = ENA_CUSTOMER_METRICS_BUFFER_SIZE; + customer_metrics->buffer_virt_addr = + dma_zalloc_coherent(ena_dev->dmadev, + customer_metrics->buffer_len, + &customer_metrics->buffer_dma_addr, + GFP_KERNEL); + if (!customer_metrics->buffer_virt_addr) + return -ENOMEM; + + return 0; +} + void ena_com_delete_host_info(struct ena_com_dev *ena_dev) { struct ena_host_attribute *host_attr = &ena_dev->host_attr; @@ -3075,6 +3164,18 @@ void ena_com_delete_debug_area(struct ena_com_dev *ena_dev) } } +void ena_com_delete_customer_metrics_buffer(struct ena_com_dev *ena_dev) +{ + struct ena_customer_metrics *customer_metrics = &ena_dev->customer_metrics; + + if (customer_metrics->buffer_virt_addr) { + dma_free_coherent(ena_dev->dmadev, customer_metrics->buffer_len, + customer_metrics->buffer_virt_addr, + customer_metrics->buffer_dma_addr); + customer_metrics->buffer_virt_addr = NULL; + } +} + int ena_com_set_host_attributes(struct ena_com_dev *ena_dev) { struct ena_host_attribute *host_attr = &ena_dev->host_attr; diff --git a/drivers/amazon/net/ena/ena_com.h b/drivers/amazon/net/ena/ena_com.h index ab17ba125ca3c..3fd86b6f14e6b 100644 --- a/drivers/amazon/net/ena/ena_com.h +++ b/drivers/amazon/net/ena/ena_com.h @@ -42,6 +42,8 @@ #define ADMIN_CQ_SIZE(depth) ((depth) * sizeof(struct ena_admin_acq_entry)) #define ADMIN_AENQ_SIZE(depth) ((depth) * sizeof(struct ena_admin_aenq_entry)) +#define ENA_CUSTOMER_METRICS_BUFFER_SIZE 512 + 
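+/* 512 bytes hold up to 64 u64 metric values; the six metrics in
+ * ENA_ADMIN_CUSTOMER_METRICS_SUPPORT_MASK (0x3F) need only 48 bytes,
+ * leaving headroom for future additions.
+ */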
/*****************************************************************************/ /*****************************************************************************/ /* ENA adaptive interrupt moderation settings */ @@ -328,6 +330,16 @@ struct ena_rss { }; +struct ena_customer_metrics { + /* in correlation with ENA_ADMIN_CUSTOMER_METRICS_SUPPORT_MASK + * and ena_admin_customer_metrics_id + */ + u64 supported_metrics; + dma_addr_t buffer_dma_addr; + void *buffer_virt_addr; + u32 buffer_len; +}; + struct ena_host_attribute { /* Debug area */ u8 *debug_area_virt_addr; @@ -379,6 +391,8 @@ struct ena_com_dev { struct ena_intr_moder_entry *intr_moder_tbl; struct ena_com_llq_info llq_info; + + struct ena_customer_metrics customer_metrics; }; struct ena_com_dev_get_features_ctx { @@ -702,6 +716,15 @@ int ena_com_get_eni_stats(struct ena_com_dev *ena_dev, int ena_com_get_ena_srd_info(struct ena_com_dev *ena_dev, struct ena_admin_ena_srd_info *info); +/* ena_com_get_customer_metrics - Get customer metrics for network interface + * @ena_dev: ENA communication layer struct + * @buffer: buffer for returned customer metrics + * @len: size of the buffer + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_get_customer_metrics(struct ena_com_dev *ena_dev, char *buffer, u32 len); + /* ena_com_set_dev_mtu - Configure the device mtu. * @ena_dev: ENA communication layer struct * @mtu: mtu value @@ -912,6 +935,13 @@ int ena_com_allocate_host_info(struct ena_com_dev *ena_dev); int ena_com_allocate_debug_area(struct ena_com_dev *ena_dev, u32 debug_area_size); +/* ena_com_allocate_customer_metrics_buffer - Allocate customer metrics resources. + * @ena_dev: ENA communication layer struct + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_allocate_customer_metrics_buffer(struct ena_com_dev *ena_dev); + /* ena_com_delete_debug_area - Free the debug area resources. * @ena_dev: ENA communication layer struct * @@ -926,6 +956,13 @@ void ena_com_delete_debug_area(struct ena_com_dev *ena_dev); */ void ena_com_delete_host_info(struct ena_com_dev *ena_dev); +/* ena_com_delete_customer_metrics_buffer - Free the customer metrics resources. + * @ena_dev: ENA communication layer struct + * + * Free the allocated customer metrics area. + */ +void ena_com_delete_customer_metrics_buffer(struct ena_com_dev *ena_dev); + /* ena_com_set_host_attributes - Update the device with the host * attributes (debug area and host info) base address. * @ena_dev: ENA communication layer struct @@ -1082,6 +1119,28 @@ static inline bool ena_com_get_cap(struct ena_com_dev *ena_dev, return !!(ena_dev->capabilities & BIT(cap_id)); } +/* ena_com_get_customer_metric_support - query whether device supports a given customer metric. + * @ena_dev: ENA communication layer struct + * @metric_id: enum value representing the customer metric + * + * @return - true if customer metric is supported or false otherwise + */ +static inline bool ena_com_get_customer_metric_support(struct ena_com_dev *ena_dev, + enum ena_admin_customer_metrics_id metric_id) +{ + return !!(ena_dev->customer_metrics.supported_metrics & BIT(metric_id)); +} + +/* ena_com_get_customer_metric_count - return the number of supported customer metrics. 
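+ * (a population count, hweight64(), of the supported_metrics bitmap;
+ *  e.g. hweight64(0x3F) == 6. Callers such as ena_ethtool.c size the
+ *  metrics buffer as this count times sizeof(u64) before calling
+ *  ena_com_get_customer_metrics().)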
+ * @ena_dev: ENA communication layer struct + * + * @return - the number of supported customer metrics + */ +static inline int ena_com_get_customer_metric_count(struct ena_com_dev *ena_dev) +{ + return hweight64(ena_dev->customer_metrics.supported_metrics); +} + /* ena_com_update_intr_reg - Prepare interrupt register * @intr_reg: interrupt register to update. * @rx_delay_interval: Rx interval in usecs diff --git a/drivers/amazon/net/ena/ena_common_defs.h b/drivers/amazon/net/ena/ena_common_defs.h old mode 100755 new mode 100644 diff --git a/drivers/amazon/net/ena/ena_devlink.c b/drivers/amazon/net/ena/ena_devlink.c index 68b02270786c7..fce8d6c795a8b 100644 --- a/drivers/amazon/net/ena/ena_devlink.c +++ b/drivers/amazon/net/ena/ena_devlink.c @@ -3,6 +3,8 @@ * Copyright 2015-2021 Amazon.com, Inc. or its affiliates. All rights reserved. */ +#include "linux/pci.h" + #include "ena_devlink.h" #ifdef ENA_DEVLINK_SUPPORT diff --git a/drivers/amazon/net/ena/ena_eth_io_defs.h b/drivers/amazon/net/ena/ena_eth_io_defs.h old mode 100755 new mode 100644 diff --git a/drivers/amazon/net/ena/ena_ethtool.c b/drivers/amazon/net/ena/ena_ethtool.c index 08f7ee8fc151c..797ca14a28b3a 100644 --- a/drivers/amazon/net/ena/ena_ethtool.c +++ b/drivers/amazon/net/ena/ena_ethtool.c @@ -16,6 +16,10 @@ struct ena_stats { int stat_offset; }; +struct ena_hw_metrics { + char name[ETH_GSTRING_LEN]; +}; + #define ENA_STAT_ENA_COM_ADMIN_ENTRY(stat) { \ .name = #stat, \ .stat_offset = offsetof(struct ena_com_stats_admin, stat) / sizeof(u64) \ @@ -56,6 +60,10 @@ struct ena_stats { .stat_offset = offsetof(struct ena_admin_ena_srd_info, flags) / sizeof(u64) \ } +#define ENA_METRIC_ENI_ENTRY(stat) { \ + .name = #stat \ +} + static const struct ena_stats ena_stats_global_strings[] = { ENA_STAT_GLOBAL_ENTRY(tx_timeout), ENA_STAT_GLOBAL_ENTRY(suspend), @@ -67,6 +75,9 @@ static const struct ena_stats ena_stats_global_strings[] = { ENA_STAT_GLOBAL_ENTRY(reset_fail), }; +/* A partial list of hw stats. 
Used when admin command + * with type ENA_ADMIN_GET_STATS_TYPE_CUSTOMER_METRICS is not supported + */ static const struct ena_stats ena_stats_eni_strings[] = { ENA_STAT_ENI_ENTRY(bw_in_allowance_exceeded), ENA_STAT_ENI_ENTRY(bw_out_allowance_exceeded), @@ -75,6 +86,15 @@ static const struct ena_stats ena_stats_eni_strings[] = { ENA_STAT_ENI_ENTRY(linklocal_allowance_exceeded), }; +static const struct ena_hw_metrics ena_hw_stats_strings[] = { + ENA_METRIC_ENI_ENTRY(bw_in_allowance_exceeded), + ENA_METRIC_ENI_ENTRY(bw_out_allowance_exceeded), + ENA_METRIC_ENI_ENTRY(pps_allowance_exceeded), + ENA_METRIC_ENI_ENTRY(conntrack_allowance_exceeded), + ENA_METRIC_ENI_ENTRY(linklocal_allowance_exceeded), + ENA_METRIC_ENI_ENTRY(conntrack_allowance_available), +}; + static const struct ena_stats ena_srd_info_strings[] = { ENA_STAT_ENA_SRD_MODE_ENTRY(ena_srd_mode), ENA_STAT_ENA_SRD_ENTRY(ena_srd_tx_pkts), @@ -163,6 +183,7 @@ static const struct ena_stats ena_stats_ena_com_phc_strings[] = { #define ENA_STATS_ARRAY_ENA_COM_PHC ARRAY_SIZE(ena_stats_ena_com_phc_strings) #define ENA_STATS_ARRAY_ENI ARRAY_SIZE(ena_stats_eni_strings) #define ENA_STATS_ARRAY_ENA_SRD ARRAY_SIZE(ena_srd_info_strings) +#define ENA_METRICS_ARRAY_ENI ARRAY_SIZE(ena_hw_stats_strings) static const char ena_priv_flags_strings[][ETH_GSTRING_LEN] = { #define ENA_PRIV_FLAGS_LPC BIT(0) @@ -177,9 +198,61 @@ static void ena_safe_update_stat(u64 *src, u64 *dst, unsigned int start; do { - start = u64_stats_fetch_begin_irq(syncp); + start = ena_u64_stats_fetch_begin(syncp); *(dst) = *src; - } while (u64_stats_fetch_retry_irq(syncp, start)); + } while (ena_u64_stats_fetch_retry(syncp, start)); +} + + +static void ena_metrics_stats(struct ena_adapter *adapter, u64 **data) +{ + struct ena_com_dev *dev = adapter->ena_dev; + const struct ena_stats *ena_stats; + u64 *ptr; + int i; + + if (ena_com_get_cap(dev, ENA_ADMIN_CUSTOMER_METRICS)) { + u32 supported_metrics_count; + int len; + + supported_metrics_count = ena_com_get_customer_metric_count(dev); + len = supported_metrics_count * sizeof(u64); + + /* Fill the data buffer, and advance its pointer */ + ena_com_get_customer_metrics(adapter->ena_dev, (char *)(*data), len); + (*data) += supported_metrics_count; + + } else if (ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENI_STATS)) { + ena_com_get_eni_stats(adapter->ena_dev, &adapter->eni_stats); + /* Updating regardless of rc - once we told ethtool how many stats we have + * it will print that much stats. 
We can't leave holes in the stats + */ + for (i = 0; i < ENA_STATS_ARRAY_ENI; i++) { + ena_stats = &ena_stats_eni_strings[i]; + + ptr = (u64 *)&adapter->eni_stats + + ena_stats->stat_offset; + + ena_safe_update_stat(ptr, (*data)++, &adapter->syncp); + } + } + + if (ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENA_SRD_INFO)) { + ena_com_get_ena_srd_info(adapter->ena_dev, &adapter->ena_srd_info); + /* Get ENA SRD mode */ + ptr = (u64 *)&adapter->ena_srd_info; + ena_safe_update_stat(ptr, (*data)++, &adapter->syncp); + for (i = 1; i < ENA_STATS_ARRAY_ENA_SRD; i++) { + ena_stats = &ena_srd_info_strings[i]; + /* Wrapped within an outer struct - need to accommodate an + * additional offset of the ENA SRD mode that was already processed + */ + ptr = (u64 *)&adapter->ena_srd_info + + ena_stats->stat_offset + 1; + + ena_safe_update_stat(ptr, (*data)++, &adapter->syncp); + } + } } static void ena_queue_stats(struct ena_adapter *adapter, u64 **data) @@ -263,39 +336,8 @@ static void ena_get_stats(struct ena_adapter *adapter, ena_safe_update_stat(ptr, data++, &adapter->syncp); } - if (hw_stats_needed) { - if (ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENI_STATS)) { - ena_com_get_eni_stats(adapter->ena_dev, &adapter->eni_stats); - /* Updating regardless of rc - once we told ethtool how many stats we have - * it will print that much stats. We can't leave holes in the stats - */ - for (i = 0; i < ENA_STATS_ARRAY_ENI; i++) { - ena_stats = &ena_stats_eni_strings[i]; - - ptr = (u64 *)&adapter->eni_stats + - ena_stats->stat_offset; - - ena_safe_update_stat(ptr, data++, &adapter->syncp); - } - } - - if (ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENA_SRD_INFO)) { - ena_com_get_ena_srd_info(adapter->ena_dev, &adapter->ena_srd_info); - /* Get ENA SRD mode */ - ptr = (u64 *)&adapter->ena_srd_info; - ena_safe_update_stat(ptr, data++, &adapter->syncp); - for (i = 1; i < ENA_STATS_ARRAY_ENA_SRD; i++) { - ena_stats = &ena_srd_info_strings[i]; - /* Wrapped within an outer struct - need to accommodate an - * additional offset of the ENA SRD mode that was already processed - */ - ptr = (u64 *)&adapter->ena_srd_info + - ena_stats->stat_offset + 1; - - ena_safe_update_stat(ptr, data++, &adapter->syncp); - } - } - } + if (hw_stats_needed) + ena_metrics_stats(adapter, &data); ena_queue_stats(adapter, &data); ena_com_admin_queue_stats(adapter, &data); @@ -343,8 +385,16 @@ static int ena_get_sw_stats_count(struct ena_adapter *adapter) static int ena_get_hw_stats_count(struct ena_adapter *adapter) { - return ENA_STATS_ARRAY_ENI * ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENI_STATS) + - ENA_STATS_ARRAY_ENA_SRD * ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENA_SRD_INFO); + struct ena_com_dev *dev = adapter->ena_dev; + int count = ENA_STATS_ARRAY_ENA_SRD * + ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENA_SRD_INFO); + + if (ena_com_get_cap(dev, ENA_ADMIN_CUSTOMER_METRICS)) + count += ena_com_get_customer_metric_count(dev); + else if (ena_com_get_cap(dev, ENA_ADMIN_ENI_STATS)) + count += ENA_STATS_ARRAY_ENI; + + return count; } int ena_get_sset_count(struct net_device *netdev, int sset) @@ -362,6 +412,35 @@ int ena_get_sset_count(struct net_device *netdev, int sset) return -EOPNOTSUPP; } +static void ena_metrics_stats_strings(struct ena_adapter *adapter, u8 **data) +{ + struct ena_com_dev *dev = adapter->ena_dev; + const struct ena_hw_metrics *ena_metrics; + const struct ena_stats *ena_stats; + int i; + + if (ena_com_get_cap(dev, ENA_ADMIN_CUSTOMER_METRICS)) { + for (i = 0; i < ENA_METRICS_ARRAY_ENI; i++) { + if 
(ena_com_get_customer_metric_support(dev, i)) { + ena_metrics = &ena_hw_stats_strings[i]; + ethtool_sprintf(data, ena_metrics->name); + } + } + } else if (ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENI_STATS)) { + for (i = 0; i < ENA_STATS_ARRAY_ENI; i++) { + ena_stats = &ena_stats_eni_strings[i]; + ethtool_sprintf(data, ena_stats->name); + } + } + + if (ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENA_SRD_INFO)) { + for (i = 0; i < ENA_STATS_ARRAY_ENA_SRD; i++) { + ena_stats = &ena_srd_info_strings[i]; + ethtool_sprintf(data, ena_stats->name); + } + } +} + static void ena_queue_strings(struct ena_adapter *adapter, u8 **data) { const struct ena_stats *ena_stats; @@ -430,20 +509,8 @@ static void ena_get_strings(struct ena_adapter *adapter, ethtool_sprintf(&data, ena_stats->name); } - if (hw_stats_needed) { - if (ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENI_STATS)) { - for (i = 0; i < ENA_STATS_ARRAY_ENI; i++) { - ena_stats = &ena_stats_eni_strings[i]; - ethtool_sprintf(&data, ena_stats->name); - } - } - if (ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENA_SRD_INFO)) { - for (i = 0; i < ENA_STATS_ARRAY_ENA_SRD; i++) { - ena_stats = &ena_srd_info_strings[i]; - ethtool_sprintf(&data, ena_stats->name); - } - } - } + if (hw_stats_needed) + ena_metrics_stats_strings(adapter, &data); ena_queue_strings(adapter, &data); ena_com_admin_strings(&data); @@ -644,11 +711,23 @@ static void ena_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { struct ena_adapter *adapter = netdev_priv(dev); + ssize_t ret = 0; + + ret = strscpy(info->driver, DRV_MODULE_NAME, sizeof(info->driver)); + if (ret < 0) + netif_info(adapter, drv, dev, + "module name will be truncated, status = %zd\n", ret); + + ret = strscpy(info->version, DRV_MODULE_GENERATION, sizeof(info->version)); + if (ret < 0) + netif_info(adapter, drv, dev, + "module version will be truncated, status = %zd\n", ret); - strlcpy(info->driver, DRV_MODULE_NAME, sizeof(info->driver)); - strlcpy(info->version, DRV_MODULE_GENERATION, sizeof(info->version)); - strlcpy(info->bus_info, pci_name(adapter->pdev), + ret = strscpy(info->bus_info, pci_name(adapter->pdev), sizeof(info->bus_info)); + if (ret < 0) + netif_info(adapter, drv, dev, + "bus info will be truncated, status = %zd\n", ret); info->n_priv_flags = ENA_PRIV_FLAGS_NR; } diff --git a/drivers/amazon/net/ena/ena_netdev.c b/drivers/amazon/net/ena/ena_netdev.c index fbb96d864d8c3..0595bb82a6eb6 100644 --- a/drivers/amazon/net/ena/ena_netdev.c +++ b/drivers/amazon/net/ena/ena_netdev.c @@ -57,7 +57,7 @@ MODULE_PARM_DESC(debug, "Debug level (-1=default,0=none,...,16=all)"); static int rx_queue_size = ENA_DEFAULT_RING_SIZE; module_param(rx_queue_size, int, 0444); -MODULE_PARM_DESC(rx_queue_size, "Rx queue size. The size should be a power of 2. Max value is 8K\n"); +MODULE_PARM_DESC(rx_queue_size, "Rx queue size. The size should be a power of 2. 
Depending on instance type, max value can be up to 16K\n"); static int force_large_llq_header = 0; module_param(force_large_llq_header, int, 0444); @@ -600,7 +600,8 @@ static void ena_free_all_io_rx_resources(struct ena_adapter *adapter) ena_free_rx_resources(adapter, i); } -struct page *ena_alloc_map_page(struct ena_ring *rx_ring, dma_addr_t *dma) +struct page *ena_alloc_map_page(struct ena_ring *rx_ring, + dma_addr_t *dma) { struct page *page; @@ -869,7 +870,7 @@ static void ena_free_tx_bufs(struct ena_ring *tx_ring) ena_unmap_tx_buff(tx_ring, tx_info); - dev_kfree_skb_any(tx_info->skb); + napi_consume_skb(tx_info->skb, 0); } netdev_tx_reset_queue(netdev_get_tx_queue(tx_ring->netdev, tx_ring->qid)); @@ -1001,7 +1002,7 @@ static int ena_clean_tx_irq(struct ena_ring *tx_ring, u32 budget) skb); tx_bytes += tx_info->total_tx_size; - dev_kfree_skb(skb); + napi_consume_skb(skb, budget); tx_pkts++; total_done += tx_info->tx_descs; @@ -1050,15 +1051,15 @@ static struct sk_buff *ena_alloc_skb(struct ena_ring *rx_ring, void *first_frag, #ifdef ENA_LINEAR_FRAG_SUPPORTED if (!first_frag) - skb = netdev_alloc_skb_ip_align(rx_ring->netdev, len); + skb = napi_alloc_skb(rx_ring->napi, len); else - skb = build_skb(first_frag, len); + skb = ena_build_skb(first_frag, len); #else if (!first_frag) - skb = netdev_alloc_skb_ip_align(rx_ring->netdev, len); + skb = napi_alloc_skb(rx_ring->napi, len); else - skb = netdev_alloc_skb_ip_align(rx_ring->netdev, - ENA_SKB_PULL_MIN_LEN); + skb = napi_alloc_skb(rx_ring->napi, + ENA_SKB_PULL_MIN_LEN); #endif /* ENA_LINEAR_FRAG_SUPPORTED */ if (unlikely(!skb)) { @@ -2036,10 +2037,7 @@ static void ena_init_napi_in_range(struct ena_adapter *adapter, napi_handler = ena_xdp_io_poll; #endif /* ENA_XDP_SUPPORT */ - netif_napi_add(adapter->netdev, - &napi->napi, - napi_handler, - NAPI_POLL_WEIGHT); + ena_netif_napi_add(adapter->netdev, &napi->napi, napi_handler); #ifdef ENA_BUSY_POLL_SUPPORT napi_hash_add(&adapter->ena_napi[i].napi); @@ -3006,7 +3004,7 @@ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev) tx_info->skb = NULL; error_drop_packet: - dev_kfree_skb(skb); + napi_consume_skb(skb, 0); return NETDEV_TX_OK; } @@ -3078,6 +3076,7 @@ static void ena_config_host_info(struct ena_com_dev *ena_dev, struct pci_dev *pd { struct device *dev = &pdev->dev; struct ena_admin_host_info *host_info; + ssize_t ret; int rc; /* Allocate only the host info */ @@ -3092,8 +3091,11 @@ static void ena_config_host_info(struct ena_com_dev *ena_dev, struct pci_dev *pd host_info->bdf = (pdev->bus->number << 8) | pdev->devfn; host_info->os_type = ENA_ADMIN_OS_LINUX; host_info->kernel_ver = LINUX_VERSION_CODE; - strlcpy(host_info->kernel_ver_str, utsname()->version, + ret = strscpy(host_info->kernel_ver_str, utsname()->version, sizeof(host_info->kernel_ver_str) - 1); + if (ret < 0) + dev_info(dev, + "kernel version string will be truncated, status = %zd\n", ret); host_info->os_dist = 0; strncpy(host_info->os_dist_str, utsname()->release, sizeof(host_info->os_dist_str) - 1); @@ -3195,10 +3197,10 @@ static struct rtnl_link_stats64 *ena_get_stats64(struct net_device *netdev, tx_ring = &adapter->tx_ring[i]; do { - start = u64_stats_fetch_begin_irq(&tx_ring->syncp); + start = ena_u64_stats_fetch_begin(&tx_ring->syncp); packets = tx_ring->tx_stats.cnt; bytes = tx_ring->tx_stats.bytes; - } while (u64_stats_fetch_retry_irq(&tx_ring->syncp, start)); + } while (ena_u64_stats_fetch_retry(&tx_ring->syncp, start)); stats->tx_packets += packets; stats->tx_bytes += bytes; @@ -3210,21 
+3212,21 @@ static struct rtnl_link_stats64 *ena_get_stats64(struct net_device *netdev, rx_ring = &adapter->rx_ring[i]; do { - start = u64_stats_fetch_begin_irq(&rx_ring->syncp); + start = ena_u64_stats_fetch_begin(&rx_ring->syncp); packets = rx_ring->rx_stats.cnt; bytes = rx_ring->rx_stats.bytes; xdp_rx_drops += ena_ring_xdp_drops_cnt(rx_ring); - } while (u64_stats_fetch_retry_irq(&rx_ring->syncp, start)); + } while (ena_u64_stats_fetch_retry(&rx_ring->syncp, start)); stats->rx_packets += packets; stats->rx_bytes += bytes; } do { - start = u64_stats_fetch_begin_irq(&adapter->syncp); + start = ena_u64_stats_fetch_begin(&adapter->syncp); rx_drops = adapter->dev_stats.rx_drops; tx_drops = adapter->dev_stats.tx_drops; - } while (u64_stats_fetch_retry_irq(&adapter->syncp, start)); + } while (ena_u64_stats_fetch_retry(&adapter->syncp, start)); stats->rx_dropped = rx_drops + xdp_rx_drops; stats->tx_dropped = tx_drops; @@ -3261,10 +3263,10 @@ static struct net_device_stats *ena_get_stats(struct net_device *netdev) tx_ring = &adapter->tx_ring[i]; do { - start = u64_stats_fetch_begin_irq(&tx_ring->syncp); + start = ena_u64_stats_fetch_begin(&tx_ring->syncp); packets = (unsigned long)tx_ring->tx_stats.cnt; bytes = (unsigned long)tx_ring->tx_stats.bytes; - } while (u64_stats_fetch_retry_irq(&tx_ring->syncp, start)); + } while (ena_u64_stats_fetch_retry(&tx_ring->syncp, start)); stats->tx_packets += packets; stats->tx_bytes += bytes; @@ -3272,19 +3274,19 @@ static struct net_device_stats *ena_get_stats(struct net_device *netdev) rx_ring = &adapter->rx_ring[i]; do { - start = u64_stats_fetch_begin_irq(&tx_ring->syncp); + start = ena_u64_stats_fetch_begin(&tx_ring->syncp); packets = (unsigned long)rx_ring->rx_stats.cnt; bytes = (unsigned long)rx_ring->rx_stats.bytes; - } while (u64_stats_fetch_retry_irq(&tx_ring->syncp, start)); + } while (ena_u64_stats_fetch_retry(&tx_ring->syncp, start)); stats->rx_packets += packets; stats->rx_bytes += bytes; } do { - start = u64_stats_fetch_begin_irq(&tx_ring->syncp); + start = ena_u64_stats_fetch_begin(&tx_ring->syncp); rx_drops = (unsigned long)adapter->dev_stats.rx_drops; - } while (u64_stats_fetch_retry_irq(&tx_ring->syncp, start)); + } while (ena_u64_stats_fetch_retry(&tx_ring->syncp, start)); stats->rx_dropped = rx_drops; @@ -3699,8 +3701,9 @@ int ena_restore_device(struct ena_adapter *adapter) struct ena_com_dev_get_features_ctx get_feat_ctx; struct ena_com_dev *ena_dev = adapter->ena_dev; struct pci_dev *pdev = adapter->pdev; + struct ena_ring *txr; + int rc, count, i; bool wd_state; - int rc; set_bit(ENA_FLAG_ONGOING_RESET, &adapter->flags); rc = ena_device_init(adapter, adapter->pdev, &get_feat_ctx, &wd_state); @@ -3710,6 +3713,12 @@ int ena_restore_device(struct ena_adapter *adapter) } adapter->wd_state = wd_state; + count = adapter->xdp_num_queues + adapter->num_io_queues; + for (i = 0 ; i < count; i++) { + txr = &adapter->tx_ring[i]; + txr->tx_mem_queue_type = ena_dev->tx_mem_queue_type; + } + rc = ena_device_validate_params(adapter, &get_feat_ctx); if (rc) { dev_err(&pdev->dev, "Validation of device parameters failed\n"); @@ -4507,10 +4516,16 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) adapter->large_llq_header_enabled = !!force_large_llq_header; + rc = ena_com_allocate_customer_metrics_buffer(ena_dev); + if (rc) { + netdev_err(netdev, "ena_com_allocate_customer_metrics_buffer failed\n"); + goto err_netdev_destroy; + } + devlink = ena_devlink_alloc(adapter); if (!devlink) { netdev_err(netdev, "ena_devlink_alloc 
failed\n"); - goto err_netdev_destroy; + goto err_metrics_destroy; } rc = ena_map_llq_mem_bar(pdev, ena_dev, bars); @@ -4671,6 +4686,8 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) ena_devlink_free(devlink); err_netdev_destroy: free_netdev(netdev); +err_metrics_destroy: + ena_com_delete_customer_metrics_buffer(ena_dev); err_free_region: ena_release_bars(ena_dev, pdev); err_free_ena_dev: @@ -4737,6 +4754,8 @@ static void __ena_shutoff(struct pci_dev *pdev, bool shutdown) ena_com_delete_host_info(ena_dev); + ena_com_delete_customer_metrics_buffer(ena_dev); + ena_release_bars(ena_dev, pdev); pci_disable_device(pdev); diff --git a/drivers/amazon/net/ena/ena_netdev.h b/drivers/amazon/net/ena/ena_netdev.h index 7b373cf6545e9..60409fa4a4b98 100644 --- a/drivers/amazon/net/ena/ena_netdev.h +++ b/drivers/amazon/net/ena/ena_netdev.h @@ -29,7 +29,7 @@ #define DRV_MODULE_GEN_MAJOR 2 #define DRV_MODULE_GEN_MINOR 8 -#define DRV_MODULE_GEN_SUBMINOR 0 +#define DRV_MODULE_GEN_SUBMINOR 1 #define DRV_MODULE_NAME "ena" #ifndef DRV_MODULE_GENERATION @@ -383,10 +383,12 @@ struct ena_adapter { u32 num_io_queues; u32 max_num_io_queues; + /* Local page cache size when it's enabled */ u32 configured_lpc_size; /* Current Local page cache size */ u32 used_lpc_size; + #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) struct msix_entry *msix_entries; #endif diff --git a/drivers/amazon/net/ena/ena_pci_id_tbl.h b/drivers/amazon/net/ena/ena_pci_id_tbl.h old mode 100755 new mode 100644 diff --git a/drivers/amazon/net/ena/ena_regs_defs.h b/drivers/amazon/net/ena/ena_regs_defs.h old mode 100755 new mode 100644 diff --git a/drivers/amazon/net/ena/ena_sysfs.c b/drivers/amazon/net/ena/ena_sysfs.c old mode 100755 new mode 100644 diff --git a/drivers/amazon/net/ena/ena_sysfs.h b/drivers/amazon/net/ena/ena_sysfs.h old mode 100755 new mode 100644 diff --git a/drivers/amazon/net/ena/kcompat.h b/drivers/amazon/net/ena/kcompat.h index fd7e80d0347ba..8e7aab52fb507 100644 --- a/drivers/amazon/net/ena/kcompat.h +++ b/drivers/amazon/net/ena/kcompat.h @@ -73,6 +73,7 @@ Intel Corporation, 5200 N.E. 
Elam Young Parkway, Hillsboro, OR 97124-6497 #include #include #include +#include #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,6,0) #include @@ -503,6 +504,25 @@ static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync #endif +static inline bool ena_u64_stats_fetch_retry(const struct u64_stats_sync *syncp, + unsigned int start) +{ +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0) + return u64_stats_fetch_retry_irq(syncp, start); +#else + return u64_stats_fetch_retry(syncp, start); +#endif +} + +static inline unsigned int ena_u64_stats_fetch_begin(const struct u64_stats_sync *syncp) +{ +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0) + return u64_stats_fetch_begin_irq(syncp); +#else + return u64_stats_fetch_begin(syncp); +#endif +} + #if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,16,0) && \ !(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,1)))) @@ -714,7 +734,9 @@ do { \ #endif #if defined(CONFIG_NET_DEVLINK) && \ - (KERNEL_VERSION(5, 4, 0) <= LINUX_VERSION_CODE && LINUX_VERSION_CODE < KERNEL_VERSION(5, 16, 0)) + (KERNEL_VERSION(5, 4, 0) <= LINUX_VERSION_CODE && LINUX_VERSION_CODE < KERNEL_VERSION(5, 16, 0) && \ + !(defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) && \ + !(defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0))) #define ENA_DEVLINK_RELOAD_ENABLING_REQUIRED #endif @@ -728,15 +750,20 @@ do { \ #define ENA_DEVLINK_RELOAD_LIMIT_AND_ACTION_SUPPORT #endif -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) || \ + (defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) || \ + (defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0)) #define ENA_DEVLINK_RECEIVES_DEVICE_ON_ALLOC #endif -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) || \ + (defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) #define ENA_DEVLINK_RELOAD_SUPPORT_ADVERTISEMENT_NEEDED #endif -#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 16, 0) +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 16, 0) && \ + !(defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) && \ + !(defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0)) #define ENA_DEVLINK_CONFIGURE_AFTER_REGISTER #endif @@ -839,16 +866,31 @@ static inline int numa_mem_id(void) #define fallthrough do {} while (0) /* fallthrough */ #endif -#ifndef NAPI_POLL_WEIGHT -#define NAPI_POLL_WEIGHT 64 -#endif - #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0) #define AF_XDP_BUSY_POLL_SUPPORTED #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0) #define ENA_LINEAR_FRAG_SUPPORTED +static __always_inline struct sk_buff* +ena_build_skb(void *data, unsigned int frag_size) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 12, 0) + return napi_build_skb(data, frag_size); +#else + return build_skb(data, frag_size); +#endif +} +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 6, 0) && \ + !(defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 3)) && \ + !(defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(4, 2, 0, 42)) +static __always_inline +void napi_consume_skb(struct sk_buff *skb, int budget) +{ + dev_kfree_skb_any(skb); +} #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0) @@ -892,7 +934,8 @@ xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start, #endif #if LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0) && \ - 
!(defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 6)) + !(defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 6)) && \ + !(defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) static inline void eth_hw_addr_set(struct net_device *dev, const u8 *addr) { memcpy(dev->dev_addr, addr, ETH_ALEN); @@ -900,11 +943,15 @@ static inline void eth_hw_addr_set(struct net_device *dev, const u8 *addr) #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) || \ - (defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 6)) + (defined(RHEL_RELEASE_CODE) && \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 6) && \ + RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(9, 0)) || \ + (defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) #define ENA_EXTENDED_COALESCE_UAPI_WITH_CQE_SUPPORTED #endif -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 17, 0) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 17, 0) || \ + (defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE == RHEL_RELEASE_VERSION(8, 7)) #define ENA_ETHTOOL_RX_BUFF_SIZE_CHANGE #endif @@ -984,4 +1031,37 @@ static inline bool ktime_after(const ktime_t cmp1, const ktime_t cmp2) #endif /* CONFIG_PTP_1588_CLOCK */ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 19, 0)) && \ + !(RHEL_RELEASE_CODE && \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 2))) +static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi, + unsigned int length) +{ + return netdev_alloc_skb_ip_align(napi->dev, length); +} +#endif + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0)) && \ + !(RHEL_RELEASE_CODE && \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 7))) +static inline ssize_t strscpy(char *dest, const char *src, size_t count) +{ + return (ssize_t)strlcpy(dest, src, count); +} +#endif + +static inline void ena_netif_napi_add(struct net_device *dev, + struct napi_struct *napi, + int (*poll)(struct napi_struct *, int)) +{ +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0) +#ifndef NAPI_POLL_WEIGHT +#define NAPI_POLL_WEIGHT 64 +#endif + netif_napi_add(dev, napi, poll, NAPI_POLL_WEIGHT); +#else + netif_napi_add(dev, napi, poll); +#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0) */ +} + #endif /* _KCOMPAT_H_ */ From 3eaf1dddef4ded943545cd01ef31ffbe7ac11b9c Mon Sep 17 00:00:00 2001 From: Shaoying Xu Date: Tue, 10 Jan 2023 23:55:12 +0000 Subject: [PATCH 017/175] EFA: Update to v2.1.1 Source: https://github.com/amzn/amzn-drivers/ Change Log: ## r2.1.1 release notes * Fix dmabuf backport for some kernels --- drivers/amazon/net/efa/config.h | 43 +++++++++++++++++-------------- drivers/amazon/net/efa/efa_main.c | 2 +- drivers/amazon/net/efa/kcompat.h | 6 ++++- 3 files changed, 29 insertions(+), 22 deletions(-) diff --git a/drivers/amazon/net/efa/config.h b/drivers/amazon/net/efa/config.h index 96c10dfc11d69..b4e7d9072b5e2 100644 --- a/drivers/amazon/net/efa/config.h +++ b/drivers/amazon/net/efa/config.h @@ -1,52 +1,55 @@ #define HAVE_UMEM_SCATTERLIST_IF 1 #define HAVE_CREATE_CQ_ATTR 1 -#define HAVE_CREATE_AH_RDMA_ATTR 1 +#define HAVE_IB_MODIFY_QP_IS_OK_FOUR_PARAMS 1 +#define HAVE_MAX_SEND_RCV_SGE 1 #define HAVE_DEV_PARENT 1 +#define HAVE_CREATE_AH_RDMA_ATTR 1 #define HAVE_POST_CONST_WR 1 -#define HAVE_MAX_SEND_RCV_SGE 1 -#define HAVE_IB_MODIFY_QP_IS_OK_FOUR_PARAMS 1 #define HAVE_IB_DEV_OPS 1 #define HAVE_PD_CORE_ALLOCATION 1 #define HAVE_UCONTEXT_CORE_ALLOCATION 1 #define HAVE_NO_KVERBS_DRIVERS 1 -#define HAVE_UDATA_TO_DRV_CONTEXT 1 #define HAVE_SAFE_IB_ALLOC_DEVICE 1 +#define 
HAVE_UDATA_TO_DRV_CONTEXT 1 #define HAVE_AH_CORE_ALLOCATION 1 #define HAVE_ALLOC_PD_NO_UCONTEXT 1 #define HAVE_DEREG_MR_UDATA 1 #define HAVE_DESTROY_CQ_UDATA 1 -#define HAVE_DESTROY_QP_UDATA 1 -#define HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE 1 #define HAVE_UPSTREAM_EFA 1 -#define HAVE_IB_DEVICE_OPS_COMMON 1 +#define HAVE_KVZALLOC 1 +#define HAVE_IB_IS_UDATA_CLEARED 1 #define HAVE_CQ_CORE_ALLOCATION 1 +#define HAVE_DESTROY_QP_UDATA 1 +#define HAVE_IB_QPT_DRIVER 1 #define HAVE_IB_PORT_PHYS_STATE_LINK_UP 1 -#define HAVE_KVZALLOC 1 -#define HAVE_IBDEV_PRINT_RATELIMITED 1 +#define HAVE_IB_DEVICE_OPS_COMMON 1 +#define HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE 1 #define HAVE_IBDEV_PRINT 1 -#define HAVE_IB_QPT_DRIVER 1 -#define HAVE_IB_IS_UDATA_CLEARED 1 +#define HAVE_IBDEV_PRINT_RATELIMITED 1 #define HAVE_IB_MR_LENGTH 1 +#define HAVE_BITFIELD_H 1 +#define HAVE_RDMA_NODE_UNSPECIFIED 1 #define HAVE_PCI_VENDOR_ID_AMAZON 1 #define HAVE_IB_UMEM_GET_NO_DMASYNC 1 -#define HAVE_CORE_MMAP_XA 1 -#define HAVE_RDMA_NODE_UNSPECIFIED 1 -#define HAVE_BITFIELD_H 1 -#define HAVE_IB_UMEM_GET_DEVICE_PARAM 1 #define HAVE_IB_ACCESS_OPTIONAL 1 -#define HAVE_CREATE_AH_INIT_ATTR 1 #define HAVE_ATOMIC64_FETCH_INC 1 -#define HAVE_DEALLOC_PD_UDATA_RC 1 +#define HAVE_CORE_MMAP_XA 1 +#define HAVE_CREATE_AH_INIT_ATTR 1 #define HAVE_AH_CORE_ALLOCATION_DESTROY_RC 1 +#define HAVE_IB_UMEM_GET_DEVICE_PARAM 1 #define HAVE_IB_INT_DESTROY_CQ 1 +#define HAVE_DEALLOC_PD_UDATA_RC 1 #define HAVE_RDMA_UMEM_FOR_EACH_DMA_BLOCK 1 -#define HAVE_IB_UMEM_NUM_DMA_BLOCKS 1 -#define HAVE_IB_REGISTER_DEVICE_DMA_DEVICE_PARAM 1 #define HAVE_UVERBS_CMD_MASK_NOT_NEEDED 1 +#define HAVE_SYSFS_EMIT 1 #define HAVE_U32_PORT 1 #define HAVE_SPLIT_STATS_ALLOC 1 -#define HAVE_SYSFS_EMIT 1 #define HAVE_XARRAY 1 +#define HAVE_STAT_DESC_STRUCT 1 +#define HAVE_IB_UMEM_NUM_DMA_BLOCKS 1 +#define HAVE_IB_REGISTER_DEVICE_DMA_DEVICE_PARAM 1 +#define HAVE_IB_UMEM_DMABUF_PINNED 1 #define HAVE_QP_CORE_ALLOCATION 1 #define HAVE_MR_DMABUF 1 +#define HAVE_MODULE_IMPORT_NS 1 #define HAVE_EFA_P2P 1 \ No newline at end of file diff --git a/drivers/amazon/net/efa/efa_main.c b/drivers/amazon/net/efa/efa_main.c index 34a8e13273556..cc8200ac141de 100644 --- a/drivers/amazon/net/efa/efa_main.c +++ b/drivers/amazon/net/efa/efa_main.c @@ -33,7 +33,7 @@ static const struct pci_device_id efa_pci_tbl[] = { #define DRV_MODULE_VER_MAJOR 2 #define DRV_MODULE_VER_MINOR 1 -#define DRV_MODULE_VER_SUBMINOR 0 +#define DRV_MODULE_VER_SUBMINOR 1 #ifndef DRV_MODULE_VERSION #define DRV_MODULE_VERSION \ diff --git a/drivers/amazon/net/efa/kcompat.h b/drivers/amazon/net/efa/kcompat.h index 713dcc00b394c..81e8f819e4f88 100644 --- a/drivers/amazon/net/efa/kcompat.h +++ b/drivers/amazon/net/efa/kcompat.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2022 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef _KCOMPAT_H_ @@ -191,6 +191,10 @@ typedef u8 port_t; #include #include +#ifdef HAVE_MODULE_IMPORT_NS +MODULE_IMPORT_NS(DMA_BUF); +#endif + static inline void ib_umem_dmabuf_unsupported_move_notify(struct dma_buf_attachment *attach) { From f8ba65a44d2d4d11ede6023d0b061bd86a95f994 Mon Sep 17 00:00:00 2001 From: Alakesh Haloi Date: Fri, 27 Oct 2017 00:36:15 +0000 Subject: [PATCH 018/175] Enable Algorithms for Amazon Linux 6.1.y Squash the following 2 patches into 1 as they accomplish the same goal - setting which algorithms are available for FIPS use in 6.1.
not-for-upstream: testmgr config changes to enable FIPS boot The Federal Information Processing Standard (FIPS) Publication 140-2 is a computer security standard developed by a U.S. Government and industry working group to validate the quality of cryptographic modules. Enabling FIPS mode involves the following steps: a. Disable prelinking: set PRELINKING=no in /etc/sysconfig/prelink b. Install the dracut-fips package: # yum install dracut-fips. Installing dracut-fips enables module signing by default and also enables scripts that do FIPS integrity verification, regardless of whether FIPS mode is on. If FIPS mode is on and a verification failure is detected, the system will panic. c. Recreate the initramfs: # dracut -v -f d. Modify the kernel command line to include the option fips=1. For grub2-based systems, add fips=1 to the end of the CMDLINE in /etc/default/grub and then run # grub2-mkconfig -o /boot/grub2/grub.cfg e. Reboot the system. In FIPS mode, some self tests are run by the dracut-fips package which are otherwise not run for a kernel not running in FIPS mode. The changes to the tests mentioned in this CR are only relevant for a kernel running in FIPS mode. In this changeset, we enable/disable cryptographic algorithms in FIPS mode to make sure that we enable the tests that are supported and disable the tests that are not supported in our kernel. Among the tests that are not supported are the SHA3 family of tests and their hmac versions. Also gcm(aesni) is disabled as the support is currently missing in the kernel. We should also remember that this change is not an effort to make the kernel FIPS compliant. FIPS compliance needs to be done by a certified authority. This change is about adding support for FIPS mode. Running official FIPS compliance may necessitate support for additional cryptographic algorithms, or removing the fips_allowed flag from the tests for a few algorithms as the need may arise. FIPS mode for a test is disabled by removing fips_allowed = 1 from the test description in testmgr.c. Adding support is more involved: the test needs to be implemented and pointed to in the structure used to describe the test. In FIPS mode, only the tests that are tagged with fips_allowed = 1 are run and the rest of the tests are ignored. So if you are not sure about an algorithm which needs to be enabled in FIPS mode, it needs to be disabled in testmgr.c. NU: because FIPS enablement is distro specific.
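For illustration only: in testmgr.c, whether a test runs in FIPS mode comes down to the fips_allowed tag on its alg_test_desc entry. A minimal sketch of what an enabled entry looks like (the "xts(aes)" entry and its test vectors are shown just as an example of the shape, not as part of this change):

	/* Sketch: an alg_test_desc entry tagged for FIPS mode. Only entries
	 * with .fips_allowed = 1 are exercised when the kernel boots with
	 * fips=1; all other algorithms are skipped and unusable. */
	{
		.alg = "xts(aes)",
		.test = alg_test_skcipher,
		.fips_allowed = 1,
		.suite = {
			.cipher = __VECS(aes_xts_tv_template)
		}
	},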
Signed-off-by: Alakesh Haloi Reviewed-by: Eduardo Valentin Reviewed-by: Anchal Agarwal Reviewed-by: Cristian Gafton Reviewed-by: Frederick Lefebvre Reviewed-by: Eduardo Valentin Signed-off-by: Vallish Vaidyeshwara enable rfc4106(gcm(aes)) for fips This algorithm works with no additional changes required and has been requested by a customer, so enable it. --- crypto/testmgr.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 56c39a0c94952..c74ef009fb999 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -5743,6 +5743,10 @@ static const struct alg_test_desc alg_test_descs[] = { .suite = { .hash = __VECS(xxhash64_tv_template) } + }, { + .alg = "zlib", + .test = alg_test_null, + .fips_allowed = 1, }, { .alg = "zlib-deflate", .test = alg_test_comp, From f26fb12e7a00ebc44e2fc367cff932fad05bde2e Mon Sep 17 00:00:00 2001 From: Munehisa Kamata Date: Wed, 12 Jul 2017 23:35:17 +0000 Subject: [PATCH 019/175] xen/manage: keep track of the on-going suspend mode To differentiate between Xen suspend, PM suspend and PM hibernation, keep track of the on-going suspend mode by mainly using a new PM notifier. Since Xen suspend doesn't have a corresponding PM event, its main logic is modified to acquire pm_mutex and set the current mode. Note that we may see deadlock if PM suspend/hibernation is interrupted by Xen suspend. PM suspend/hibernation depends on the xenwatch thread to process xenbus state transactions, but the thread will sleep waiting for pm_mutex, which is already held by the PM suspend/hibernation context in that scenario. Still, acquiring pm_mutex is the right thing to do, and we would need to modify the Xen shutdown code to avoid the issue. This will be fixed by a separate patch. Signed-off-by: Munehisa Kamata Signed-off-by: Anchal Agarwal Reviewed-by: Sebastian Biemueller Reviewed-by: Munehisa Kamata Reviewed-by: Eduardo Valentin [6.1: Handle sleep flags for unlock_system_sleep()] Signed-off-by: Samuel Mendoza-Jonas --- drivers/xen/manage.c | 59 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index c16df629907e1..2f265b0a5d085 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -40,6 +41,16 @@ enum shutdown_state { /* Ignore multiple shutdown requests.
*/ static enum shutdown_state shutting_down = SHUTDOWN_INVALID; +enum suspend_modes { + NO_SUSPEND = 0, + XEN_SUSPEND, + PM_SUSPEND, + PM_HIBERNATION, +}; + +/* Protected by pm_mutex */ +static enum suspend_modes suspend_mode = NO_SUSPEND; + struct suspend_info { int cancelled; }; @@ -98,6 +109,11 @@ static void do_suspend(void) { int err; struct suspend_info si; + unsigned int sleep_flags; + + sleep_flags = lock_system_sleep(); + + suspend_mode = XEN_SUSPEND; shutting_down = SHUTDOWN_SUSPEND; @@ -162,6 +178,10 @@ static void do_suspend(void) thaw_processes(); out: shutting_down = SHUTDOWN_INVALID; + + suspend_mode = NO_SUSPEND; + + unlock_system_sleep(sleep_flags); } #endif /* CONFIG_HIBERNATE_CALLBACKS */ @@ -388,3 +408,42 @@ int xen_setup_shutdown_event(void) EXPORT_SYMBOL_GPL(xen_setup_shutdown_event); subsys_initcall(xen_setup_shutdown_event); + +static int xen_pm_notifier(struct notifier_block *notifier, + unsigned long pm_event, void *unused) +{ + switch (pm_event) { + case PM_SUSPEND_PREPARE: + suspend_mode = PM_SUSPEND; + break; + case PM_HIBERNATION_PREPARE: + case PM_RESTORE_PREPARE: + suspend_mode = PM_HIBERNATION; + break; + case PM_POST_SUSPEND: + case PM_POST_RESTORE: + case PM_POST_HIBERNATION: + /* Set back to the default */ + suspend_mode = NO_SUSPEND; + break; + default: + pr_warn("Receive unknown PM event 0x%lx\n", pm_event); + return -EINVAL; + } + + return 0; +}; + +static struct notifier_block xen_pm_notifier_block = { + .notifier_call = xen_pm_notifier +}; + +static int xen_setup_pm_notifier(void) +{ + if (!xen_hvm_domain()) + return -ENODEV; + + return register_pm_notifier(&xen_pm_notifier_block); +} + +subsys_initcall(xen_setup_pm_notifier); From 6b6211ee65e058bfcfbe6ff7f1bd6bb9af8a9d8d Mon Sep 17 00:00:00 2001 From: Munehisa Kamata Date: Thu, 13 Jul 2017 00:12:32 +0000 Subject: [PATCH 020/175] xen/manage: introduce helper function to know the on-going suspend mode Introduce simple functions which help to know the on-going suspend mode so that other Xen-related code can behave differently according to the current suspend mode. 
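To make the intent concrete, here is a minimal sketch of how later patches in this series consume these helpers (the function name mirrors the syscore callback added a few patches below; this is an illustration, not part of this change):

	/* Sketch: skip work that the Xen suspend path already performs itself */
	static int xen_syscore_suspend(void)
	{
		if (xen_suspend_mode_is_xen_suspend())
			return 0;

		/* ... PM suspend / PM hibernation specific teardown ... */
		return 0;
	}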
Signed-off-by: Munehisa Kamata Signed-off-by: Anchal Agarwal Reviewed-by: Alakesh Haloi Reviewed-by: Sebastian Biemueller Reviewed-by: Munehisa Kamata Reviewed-by: Eduardo Valentin Signed-off-by: Samuel Mendoza-Jonas --- drivers/xen/manage.c | 15 +++++++++++++++ include/xen/xen-ops.h | 4 ++++ 2 files changed, 19 insertions(+) diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 2f265b0a5d085..de89fed344ca6 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -51,6 +51,21 @@ enum suspend_modes { /* Protected by pm_mutex */ static enum suspend_modes suspend_mode = NO_SUSPEND; +bool xen_suspend_mode_is_xen_suspend(void) +{ + return suspend_mode == XEN_SUSPEND; +} + +bool xen_suspend_mode_is_pm_suspend(void) +{ + return suspend_mode == PM_SUSPEND; +} + +bool xen_suspend_mode_is_pm_hibernation(void) +{ + return suspend_mode == PM_HIBERNATION; +} + struct suspend_info { int cancelled; }; diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index a34f4271a2e9f..d8f01a762f7ad 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -41,6 +41,10 @@ u64 xen_steal_clock(int cpu); int xen_setup_shutdown_event(void); +bool xen_suspend_mode_is_xen_suspend(void); +bool xen_suspend_mode_is_pm_suspend(void); +bool xen_suspend_mode_is_pm_hibernation(void); + extern unsigned long *xen_contiguous_bitmap; #if defined(CONFIG_XEN_PV) From 6e5aa466ebaa2a07c031ed94a12bc1ea6b67f9d5 Mon Sep 17 00:00:00 2001 From: Munehisa Kamata Date: Thu, 13 Jul 2017 02:00:31 +0000 Subject: [PATCH 021/175] xenbus: add freeze/thaw/restore callbacks support Since commit b3e96c0c7562 ("xen: use freeze/restore/thaw PM events for suspend/resume/chkpt"), xenbus uses PMSG_FREEZE, PMSG_THAW and PMSG_RESTORE events for Xen suspend. However, they're actually assigned to xenbus_dev_suspend(), xenbus_dev_cancel() and xenbus_dev_resume() respectively, and only suspend and resume callbacks are supported at the driver level. To support PM suspend and PM hibernation, modify the bus level PM callbacks to invoke not only the device driver's suspend/resume but also freeze/thaw/restore. Note that we'll use freeze/restore callbacks even for PM suspend, whereas suspend/resume callbacks are normally used in that case, because the existing xenbus device drivers already have suspend/resume callbacks specifically designed for Xen suspend. So we can allow the device drivers to keep the existing callbacks without modification.
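A frontend driver opts in by filling the new callbacks next to its existing ones; a minimal sketch (all example_* names are hypothetical, but the blkfront and netfront patches later in this series follow exactly this shape):

	static struct xenbus_driver example_front_driver = {
		.probe            = example_probe,
		.suspend          = example_suspend,  /* Xen suspend, as before */
		.resume           = example_resume,
		.freeze           = example_freeze,   /* PM suspend / hibernation */
		.thaw             = example_thaw,
		.restore          = example_restore,
		.otherend_changed = example_otherend_changed,
	};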
Signed-off-by: Munehisa Kamata Signed-off-by: Anchal Agarwal Reviewed-by: Munehisa Kamata Reviewed-by: Eduardo Valentin Signed-off-by: Samuel Mendoza-Jonas --- drivers/xen/xenbus/xenbus_probe.c | 99 ++++++++++++++++++++++++++----- include/xen/xenbus.h | 3 + 2 files changed, 86 insertions(+), 16 deletions(-) diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 1a9ded0cddcb0..fcf00a41422db 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include @@ -668,26 +669,47 @@ int xenbus_dev_suspend(struct device *dev) struct xenbus_driver *drv; struct xenbus_device *xdev = container_of(dev, struct xenbus_device, dev); + int (*cb)(struct xenbus_device *) = NULL; + bool xen_suspend = xen_suspend_mode_is_xen_suspend(); DPRINTK("%s", xdev->nodename); if (dev->driver == NULL) return 0; drv = to_xenbus_driver(dev->driver); - if (drv->suspend) - err = drv->suspend(xdev); - if (err) - dev_warn(dev, "suspend failed: %i\n", err); + + if (xen_suspend) + cb = drv->suspend; + else + cb = drv->freeze; + + if (cb) + err = cb(xdev); + + if (err) { + dev_warn(dev, "%s failed: %i\n", xen_suspend ? + "suspend" : "freeze", err); + return err; + } + + if (!xen_suspend) { + /* Forget otherend since this can become stale after restore */ + free_otherend_watch(xdev); + free_otherend_details(xdev); + } + return 0; } EXPORT_SYMBOL_GPL(xenbus_dev_suspend); int xenbus_dev_resume(struct device *dev) { - int err; + int err = 0; struct xenbus_driver *drv; struct xenbus_device *xdev = container_of(dev, struct xenbus_device, dev); + int (*cb)(struct xenbus_device *) = NULL; + bool xen_suspend = xen_suspend_mode_is_xen_suspend(); DPRINTK("%s", xdev->nodename); @@ -696,23 +718,32 @@ int xenbus_dev_resume(struct device *dev) drv = to_xenbus_driver(dev->driver); err = talk_to_otherend(xdev); if (err) { - dev_warn(dev, "resume (talk_to_otherend) failed: %i\n", err); + dev_warn(dev, "%s (talk_to_otherend) failed: %i\n", + xen_suspend ? "resume" : "restore", err); return err; } - xdev->state = XenbusStateInitialising; + if (xen_suspend) + xdev->state = XenbusStateInitialising; - if (drv->resume) { - err = drv->resume(xdev); - if (err) { - dev_warn(dev, "resume failed: %i\n", err); - return err; - } + if (xen_suspend) + cb = drv->resume; + else + cb = drv->restore; + + if (cb) + err = cb(xdev); + + if (err) { + dev_warn(dev, "%s failed: %i\n", + xen_suspend ? "resume" : "restore", err); + return err; } err = watch_otherend(xdev); if (err) { - dev_warn(dev, "resume (watch_otherend) failed: %d\n", err); + dev_warn(dev, "%s (watch_otherend) failed: %d.\n", + xen_suspend ? 
"resume" : "restore", err); return err; } @@ -722,8 +753,44 @@ EXPORT_SYMBOL_GPL(xenbus_dev_resume); int xenbus_dev_cancel(struct device *dev) { - /* Do nothing */ - DPRINTK("cancel"); + int err = 0; + struct xenbus_driver *drv; + struct xenbus_device *xdev + = container_of(dev, struct xenbus_device, dev); + bool xen_suspend = xen_suspend_mode_is_xen_suspend(); + + if (xen_suspend) { + /* Do nothing */ + DPRINTK("cancel"); + return 0; + } + + DPRINTK("%s", xdev->nodename); + + if (dev->driver == NULL) + return 0; + drv = to_xenbus_driver(dev->driver); + + err = talk_to_otherend(xdev); + if (err) { + dev_warn(dev, "thaw (talk_to_otherend) failed: %d.\n", err); + return err; + } + + if (drv->thaw) { + err = drv->thaw(xdev); + if (err) { + dev_warn(dev, "thaw failed: %i\n", err); + return err; + } + } + + err = watch_otherend(xdev); + if (err) { + dev_warn(dev, "thaw (watch_otherend) failed: %d.\n", err); + return err; + } + return 0; } EXPORT_SYMBOL_GPL(xenbus_dev_cancel); diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h index eaa932b99d8ac..3d8684d373d59 100644 --- a/include/xen/xenbus.h +++ b/include/xen/xenbus.h @@ -120,6 +120,9 @@ struct xenbus_driver { int (*remove)(struct xenbus_device *dev); int (*suspend)(struct xenbus_device *dev); int (*resume)(struct xenbus_device *dev); + int (*freeze)(struct xenbus_device *dev); + int (*thaw)(struct xenbus_device *dev); + int (*restore)(struct xenbus_device *dev); int (*uevent)(struct xenbus_device *, struct kobj_uevent_env *); struct device_driver driver; int (*read_otherend_details)(struct xenbus_device *dev); From 6c4cf92684794e869423fb41e0ab7a03dd94b834 Mon Sep 17 00:00:00 2001 From: Anchal Agarwal Date: Thu, 22 Feb 2018 21:52:42 +0000 Subject: [PATCH 022/175] x86/xen: Introduce new function to map HYPERVISOR_shared_info on Resume Introduce a small function which re-uses shared page's PA allocated during guest initialization time in reserve_shared_info() and not allocate new page during resume flow. It also does the mapping of shared_info_page by calling xen_hvm_init_shared_info() to use the function. Backport Notes: We don't need this commit 8d5ce0dad4ab2a4c8c8a3c36f6fb8c46b695b053 ("x86/xen: decouple shared_info mapping from xen_hvm_init_shared_info()") here since xen_hvm_init_shared_info changed in 4.14 kernel just to do the mapping and allocation of shared page is done in a separate function. 
We don't need to decouple this kernel API anymore. Signed-off-by: Anchal Agarwal Reviewed-by: Sebastian Biemueller Reviewed-by: Munehisa Kamata Reviewed-by: Eduardo Valentin Signed-off-by: Samuel Mendoza-Jonas --- arch/x86/xen/enlighten_hvm.c | 7 +++++++ arch/x86/xen/xen-ops.h | 2 ++ 2 files changed, 9 insertions(+) diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index c66807dd02703..1a00917612951 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -36,6 +36,13 @@ static unsigned long shared_info_pfn; __ro_after_init bool xen_percpu_upcall; EXPORT_SYMBOL_GPL(xen_percpu_upcall); +void xen_hvm_map_shared_info(void) +{ + xen_hvm_init_shared_info(); + if(shared_info_pfn) + HYPERVISOR_shared_info = __va(PFN_PHYS(shared_info_pfn)); +} + void xen_hvm_init_shared_info(void) { struct xen_add_to_physmap xatp; diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index b2b2f4315b78d..5c626793c11e2 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -56,6 +56,8 @@ void xen_enable_sysenter(void); void xen_enable_syscall(void); void xen_vcpu_restore(void); +void xen_callback_vector(void); +void xen_hvm_map_shared_info(void); void xen_hvm_init_shared_info(void); void xen_unplug_emulated_devices(void); From ba8519773f86af6a1fa5a2c03807856a6b5eff9f Mon Sep 17 00:00:00 2001 From: Munehisa Kamata Date: Sat, 11 Feb 2017 00:53:56 +0000 Subject: [PATCH 023/175] x86/xen: add system core suspend and resume callbacks Add Xen PVHVM specific system core callbacks for PM suspend and hibernation support. The callbacks suspend and resume Xen primitives, like shared_info, pvclock and grant table. Note that Xen suspend can handle them in a different manner, but these system core callbacks are also invoked from the Xen suspend context. So if the callbacks are called during Xen suspend, return immediately.
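For orientation, a rough sketch of where these callbacks sit in the hibernation path (standard PM core ordering, not something introduced by this patch):

	hibernate()
	  freeze_processes()
	  dpm_suspend_start()          /* xenbus freeze callbacks run here */
	  /* nonboot CPUs offlined, local IRQs disabled */
	  syscore_suspend()            /* xen_syscore_suspend() */
	  ... snapshot is created or the saved image is restored ...
	  syscore_resume()             /* xen_syscore_resume() */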
Signed-off-by: Munehisa Kamata Signed-off-by: Anchal Agarwal Reviewed-by: Munehisa Kamata Reviewed-by: Eduardo Valentin Signed-off-by: Samuel Mendoza-Jonas --- arch/x86/xen/enlighten_hvm.c | 1 + arch/x86/xen/suspend.c | 53 ++++++++++++++++++++++++++++++++++++ include/xen/xen-ops.h | 2 ++ 3 files changed, 56 insertions(+) diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 1a00917612951..ec87cd2def64c 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -235,6 +235,7 @@ static void __init xen_hvm_guest_init(void) xen_panic_handler_init(); + xen_setup_syscore_ops(); xen_hvm_smp_init(); WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_hvm, xen_cpu_dead_hvm)); xen_unplug_emulated_devices(); diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 1d83152c761bc..784c4484100bb 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -2,17 +2,22 @@ #include #include #include +#include +#include #include #include +#include #include #include +#include #include #include #include #include #include +#include #include "xen-ops.h" #include "mmu.h" @@ -82,3 +87,51 @@ void xen_arch_suspend(void) on_each_cpu(xen_vcpu_notify_suspend, NULL, 1); } + +static int xen_syscore_suspend(void) +{ + struct xen_remove_from_physmap xrfp; + int ret; + + /* Xen suspend does similar stuffs in its own logic */ + if (xen_suspend_mode_is_xen_suspend()) + return 0; + + xrfp.domid = DOMID_SELF; + xrfp.gpfn = __pa(HYPERVISOR_shared_info) >> PAGE_SHIFT; + + ret = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrfp); + if (!ret) + HYPERVISOR_shared_info = &xen_dummy_shared_info; + + return ret; +} + +static void xen_syscore_resume(void) +{ + /* Xen suspend does similar stuffs in its own logic */ + if (xen_suspend_mode_is_xen_suspend()) + return; + + /* No need to setup vcpu_info as it's already moved off */ + xen_hvm_map_shared_info(); + + pvclock_resume(); + + gnttab_resume(); +} + +/* + * These callbacks will be called with interrupts disabled and when having only + * one CPU online. + */ +static struct syscore_ops xen_hvm_syscore_ops = { + .suspend = xen_syscore_suspend, + .resume = xen_syscore_resume }; + +void __init xen_setup_syscore_ops(void) +{ + if (xen_hvm_domain()) + register_syscore_ops(&xen_hvm_syscore_ops); } diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index d8f01a762f7ad..9904dd5bf32ec 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -45,6 +45,8 @@ bool xen_suspend_mode_is_xen_suspend(void); bool xen_suspend_mode_is_pm_suspend(void); bool xen_suspend_mode_is_pm_hibernation(void); +void xen_setup_syscore_ops(void); + extern unsigned long *xen_contiguous_bitmap; #if defined(CONFIG_XEN_PV) From 2cff65f7a3818111ec79be63b388376444436d61 Mon Sep 17 00:00:00 2001 From: Munehisa Kamata Date: Thu, 8 Jun 2017 19:15:55 +0000 Subject: [PATCH 024/175] xen-blkfront: add callbacks for PM suspend and hibernation Add freeze and restore callbacks for PM suspend and hibernation support. The freeze handler stops the block-layer queue and disconnects the frontend from the backend while freeing ring_info and associated resources. The restore handler re-allocates ring_info and re-connects to the backend, so the rest of the kernel can continue to use the block device transparently. Also, the handlers are used for both PM suspend and hibernation so that we can keep the existing suspend/resume callbacks for Xen suspend without modification.
If a backend doesn't have commit 12ea729645ac ("xen/blkback: unmap all persistent grants when frontend gets disconnected"), the frontend may see a massive amount of grant table warnings when freeing resources. [ 36.852659] deferring g.e. 0xf9 (pfn 0xffffffffffffffff) [ 36.855089] xen:grant_table: WARNING: g.e. 0x112 still in use! In this case, persistent grants would need to be disabled. Ensure no reqs/rsps in rings before disconnecting. When disconnecting the frontend from the backend in blkfront_freeze(), there still may be unconsumed requests or responses in the rings, especially when the backend is backed by a network-based device. If the frontend gets disconnected with such reqs/rsps remaining there, it can cause grant warnings and/or loss of reqs/rsps by freeing pages afterward. This can lead the resumed kernel into an unrecoverable state, like unexpected freeing of a grant page and/or a hung task due to the lost reqs or rsps. Therefore we have to ensure that there are no unconsumed requests or responses before disconnecting. Actually, the frontend just needs to wait for some amount of time so that the backend can process the requests, put responses and notify the frontend back. The timeout used here is based on a heuristic. If we somehow hit the timeout, it would mean something serious has happened in the backend; the frontend will just return an error to the PM core and PM suspend/hibernation will be aborted. This may be something that should be fixed on the backend side, but a frontend side fix is probably still worth doing to work with broader backends. Backport Note: Unlike the 4.9 kernel, blk-mq is the default for the 4.14 kernel, and the request-based mode code is not included in this frontend driver. Signed-off-by: Munehisa Kamata Signed-off-by: Anchal Agarwal Reviewed-by: Munehisa Kamata Reviewed-by: Eduardo Valentin Signed-off-by: Samuel Mendoza-Jonas --- drivers/block/xen-blkfront.c | 163 +++++++++++++++++++++++++++++++++-- 1 file changed, 155 insertions(+), 8 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 5ddf393aa390f..ab2d9c23b8ffd 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -49,6 +49,8 @@ #include #include #include +#include +#include #include #include @@ -82,6 +84,8 @@ enum blkif_state { BLKIF_STATE_CONNECTED, BLKIF_STATE_SUSPENDED, BLKIF_STATE_ERROR, + BLKIF_STATE_FREEZING, + BLKIF_STATE_FROZEN }; struct grant { @@ -231,6 +235,7 @@ struct blkfront_info struct list_head requests; struct bio_list bio_list; struct list_head info_list; + struct completion wait_backend_disconnected; }; static unsigned int nr_minors; @@ -270,6 +275,16 @@ static DEFINE_SPINLOCK(minor_lock); static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo); static void blkfront_gather_backend_features(struct blkfront_info *info); static int negotiate_mq(struct blkfront_info *info); +static void __blkif_free(struct blkfront_info *info); + +static inline bool blkfront_ring_is_busy(struct blkif_front_ring *ring) +{ + if (RING_SIZE(ring) > RING_FREE_REQUESTS(ring) || + RING_HAS_UNCONSUMED_RESPONSES(ring)) + return true; + else + return false; +} #define for_each_rinfo(info, ptr, idx) \ for ((ptr) = (info)->rinfo, (idx) = 0; \ @@ -1163,6 +1178,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, info->sector_size = sector_size; info->physical_sector_size = physical_sector_size; blkif_set_queue_limits(info); + init_completion(&info->wait_backend_disconnected); xlvbd_flush(info); @@ -1183,6 +1199,8 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
/* Already hold rinfo->ring_lock. */ static inline void kick_pending_request_queues_locked(struct blkfront_ring_info *rinfo) { + if (unlikely(rinfo->dev_info->connected == BLKIF_STATE_FREEZING)) + return; if (!RING_FULL(&rinfo->ring)) blk_mq_start_stopped_hw_queues(rinfo->dev_info->rq, true); } @@ -1300,9 +1318,6 @@ static void blkif_free_ring(struct blkfront_ring_info *rinfo) static void blkif_free(struct blkfront_info *info, int suspend) { - unsigned int i; - struct blkfront_ring_info *rinfo; - /* Prevent new requests being issued until we fix things up. */ info->connected = suspend ? BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; @@ -1310,6 +1325,14 @@ static void blkif_free(struct blkfront_info *info, int suspend) if (info->rq) blk_mq_stop_hw_queues(info->rq); + __blkif_free(info); +} + +static void __blkif_free(struct blkfront_info *info) +{ + unsigned int i; + struct blkfront_ring_info *rinfo; + for_each_rinfo(info, rinfo, i) blkif_free_ring(rinfo); @@ -1521,8 +1544,10 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) unsigned int eoiflag = XEN_EOI_FLAG_SPURIOUS; if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) { - xen_irq_lateeoi(irq, XEN_EOI_FLAG_SPURIOUS); - return IRQ_HANDLED; + if (info->connected != BLKIF_STATE_FREEZING) { + xen_irq_lateeoi(irq, XEN_EOI_FLAG_SPURIOUS); + return IRQ_HANDLED; + } } spin_lock_irqsave(&rinfo->ring_lock, flags); @@ -2013,6 +2038,7 @@ static int blkif_recover(struct blkfront_info *info) unsigned int segs; struct blkfront_ring_info *rinfo; + bool frozen = info->connected == BLKIF_STATE_FROZEN; blkfront_gather_backend_features(info); /* Reset limits changed by blk_mq_update_nr_hw_queues(). */ blkif_set_queue_limits(info); @@ -2034,6 +2060,9 @@ static int blkif_recover(struct blkfront_info *info) kick_pending_request_queues(rinfo); } + if (frozen) + return 0; + list_for_each_entry_safe(req, n, &info->requests, queuelist) { /* Requeue pending requests (flush or discard) */ list_del_init(&req->queuelist); @@ -2336,6 +2365,7 @@ static void blkfront_connect(struct blkfront_info *info) return; case BLKIF_STATE_SUSPENDED: + case BLKIF_STATE_FROZEN: /* * If we are recovering from suspension, we need to wait * for the backend to announce it's features before @@ -2460,12 +2490,36 @@ static void blkback_changed(struct xenbus_device *dev, break; case XenbusStateClosed: - if (dev->state == XenbusStateClosed) + if (dev->state == XenbusStateClosed) { + if (info->connected == BLKIF_STATE_FREEZING) { + __blkif_free(info); + info->connected = BLKIF_STATE_FROZEN; + complete(&info->wait_backend_disconnected); + break; + } + + break; + } + + /* + * We may somehow receive backend's Closed again while thawing + * or restoring and it causes thawing or restoring to fail. + * Ignore such unexpected state anyway. 
+ */ + if (info->connected == BLKIF_STATE_FROZEN && + dev->state == XenbusStateInitialised) { + dev_dbg(&dev->dev, + "ignore the backend's Closed state: %s", + dev->nodename); break; + } fallthrough; case XenbusStateClosing: - blkfront_closing(info); - break; + if (info->connected == BLKIF_STATE_FREEZING) + xenbus_frontend_closed(dev); + else + blkfront_closing(info); + break; } } @@ -2500,6 +2554,96 @@ static int blkfront_is_ready(struct xenbus_device *dev) return info->is_ready && info->xbdev; } +static int blkfront_freeze(struct xenbus_device *dev) +{ + unsigned int i; + struct blkfront_info *info = dev_get_drvdata(&dev->dev); + struct blkfront_ring_info *rinfo; + struct blkif_front_ring *ring; + /* This would be reasonable timeout as used in xenbus_dev_shutdown() */ + unsigned int timeout = 5 * HZ; + int err = 0; + + info->connected = BLKIF_STATE_FREEZING; + + blk_mq_stop_hw_queues(info->rq); + + for (i = 0; i < info->nr_rings; i++) { + rinfo = &info->rinfo[i]; + + gnttab_cancel_free_callback(&rinfo->callback); + flush_work(&rinfo->work); + } + + for (i = 0; i < info->nr_rings; i++) { + spinlock_t *lock; + bool busy; + unsigned long req_timeout_ms = 25; + unsigned long ring_timeout; + + rinfo = &info->rinfo[i]; + ring = &rinfo->ring; + + lock = &rinfo->ring_lock; + + ring_timeout = jiffies + + msecs_to_jiffies(req_timeout_ms * RING_SIZE(ring)); + + do { + spin_lock_irq(lock); + busy = blkfront_ring_is_busy(ring); + spin_unlock_irq(lock); + + if (busy) + msleep(req_timeout_ms); + else + break; + } while (time_is_after_jiffies(ring_timeout)); + + /* Timed out */ + if (busy) { + xenbus_dev_error(dev, err, "the ring is still busy"); + info->connected = BLKIF_STATE_CONNECTED; + return -EBUSY; + } + } + + /* Kick the backend to disconnect */ + xenbus_switch_state(dev, XenbusStateClosing); + + /* + * We don't want to move forward before the frontend is disconnected + * from the backend cleanly. + */ + timeout = wait_for_completion_timeout(&info->wait_backend_disconnected, + timeout); + if (!timeout) { + err = -EBUSY; + xenbus_dev_error(dev, err, "Freezing timed out;" + "the device may become inconsistent state"); + } + + return err; +} + +static int blkfront_restore(struct xenbus_device *dev) +{ + struct blkfront_info *info = dev_get_drvdata(&dev->dev); + int err = 0; + + err = negotiate_mq(info); + if (err) + goto out; + + err = talk_to_blkback(dev, info); + if (err) + goto out; + blk_mq_update_nr_hw_queues(&info->tag_set, info->nr_rings); + +out: + return err; +} + static const struct block_device_operations xlvbd_block_fops = { .owner = THIS_MODULE, @@ -2521,6 +2665,9 @@ static struct xenbus_driver blkfront_driver = { .resume = blkfront_resume, .otherend_changed = blkback_changed, .is_ready = blkfront_is_ready, + .freeze = blkfront_freeze, + .thaw = blkfront_restore, + .restore = blkfront_restore }; static void purge_persistent_grants(struct blkfront_info *info) From c77408a6e968a98df3f34176e978f61f10744716 Mon Sep 17 00:00:00 2001 From: Munehisa Kamata Date: Mon, 9 Jan 2017 23:36:52 +0000 Subject: [PATCH 025/175] xen-netfront: add callbacks for PM suspend and hibernation support Add freeze and restore callbacks for PM suspend and hibernation support. The freeze handler simply disconnects the frontend from the backend and frees resources associated with queues after disabling the net_device from the system. The restore handler just changes the frontend state and lets the xenbus handler re-allocate the resources and re-connect to the backend.
This can be performed transparently to the rest of the system. The handlers are used for both PM suspend and hibernation so that we can keep the existing suspend/resume callbacks for Xen suspend without modification. Freezing netfront devices is normally expected to finish within a few hundred milliseconds, but it can rarely take more than 5 seconds and hit the hard coded timeout, it would depend on backend state which may be congested and/or have complex configuration. While it's rare case, longer default timeout seems a bit more reasonable here to avoid hitting the timeout. Also, make it configurable via module parameter so that we can cover broader setups than what we know currently. Signed-off-by: Munehisa Kamata Signed-off-by: Anchal Agarwal Reviewed-by: Eduardo Valentin Reviewed-by: Munehisa Kamata Signed-off-by: Samuel Mendoza-Jonas --- drivers/net/xen-netfront.c | 97 +++++++++++++++++++++++++++++++++++++- 1 file changed, 96 insertions(+), 1 deletion(-) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 95b5ab4b964e2..7f09ee596ca6f 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -59,6 +60,12 @@ #include #include +enum netif_freeze_state { + NETIF_FREEZE_STATE_UNFROZEN, + NETIF_FREEZE_STATE_FREEZING, + NETIF_FREEZE_STATE_FROZEN, +}; + /* Module parameters */ #define MAX_QUEUES_DEFAULT 8 static unsigned int xennet_max_queues; @@ -72,6 +79,12 @@ MODULE_PARM_DESC(trusted, "Is the backend trusted"); #define XENNET_TIMEOUT (5 * HZ) +static unsigned int netfront_freeze_timeout_secs = 10; +module_param_named(freeze_timeout_secs, + netfront_freeze_timeout_secs, uint, 0644); +MODULE_PARM_DESC(freeze_timeout_secs, + "timeout when freezing netfront device in seconds"); + static const struct ethtool_ops xennet_ethtool_ops; struct netfront_cb { @@ -181,6 +194,10 @@ struct netfront_info { bool bounce; atomic_t rx_gso_checksum_fixup; + + int freeze_state; + + struct completion wait_backend_disconnected; }; struct netfront_rx_info { @@ -910,6 +927,21 @@ static void xennet_set_rx_rsp_cons(struct netfront_queue *queue, RING_IDX val) spin_unlock_irqrestore(&queue->rx_cons_lock, flags); } +static int xennet_disable_interrupts(struct net_device *dev) +{ + struct netfront_info *np = netdev_priv(dev); + unsigned int num_queues = dev->real_num_tx_queues; + unsigned int i; + struct netfront_queue *queue; + + for (i = 0; i < num_queues; ++i) { + queue = &np->queues[i]; + disable_irq(queue->tx_irq); + disable_irq(queue->rx_irq); + } + return 0; +} + static void xennet_move_rx_slot(struct netfront_queue *queue, struct sk_buff *skb, grant_ref_t ref) { @@ -1719,6 +1751,8 @@ static struct net_device *xennet_create_dev(struct xenbus_device *dev) np->queues = NULL; + init_completion(&np->wait_backend_disconnected); + err = -ENOMEM; np->rx_stats = netdev_alloc_pcpu_stats(struct netfront_stats); if (np->rx_stats == NULL) @@ -2245,6 +2279,50 @@ static int xennet_create_queues(struct netfront_info *info, return 0; } +static int netfront_freeze(struct xenbus_device *dev) +{ + struct netfront_info *info = dev_get_drvdata(&dev->dev); + unsigned long timeout = netfront_freeze_timeout_secs * HZ; + int err = 0; + + xennet_disable_interrupts(info->netdev); + + netif_device_detach(info->netdev); + + info->freeze_state = NETIF_FREEZE_STATE_FREEZING; + + /* Kick the backend to disconnect */ + xenbus_switch_state(dev, XenbusStateClosing); + + /* We don't want to move forward before the frontend is 
disconnected * from the backend cleanly. */ + timeout = wait_for_completion_timeout(&info->wait_backend_disconnected, + timeout); + if (!timeout) { + err = -EBUSY; + xenbus_dev_error(dev, err, "Freezing timed out;" + "the device may become inconsistent state"); + return err; + } + + /* Tear down queues */ + xennet_disconnect_backend(info); + xennet_destroy_queues(info); + + info->freeze_state = NETIF_FREEZE_STATE_FROZEN; + + return err; +} + +static int netfront_restore(struct xenbus_device *dev) +{ + /* Kick the backend to re-connect */ + xenbus_switch_state(dev, XenbusStateInitialising); + + return 0; +} + /* Common code used when first setting up, and when resuming. */ static int talk_to_netback(struct xenbus_device *dev, struct netfront_info *info) @@ -2475,6 +2553,8 @@ static int xennet_connect(struct net_device *dev) spin_unlock_bh(&queue->rx_lock); } + np->freeze_state = NETIF_FREEZE_STATE_UNFROZEN; + return 0; } @@ -2512,10 +2592,22 @@ static void netback_changed(struct xenbus_device *dev, break; case XenbusStateClosed: - if (dev->state == XenbusStateClosed) + if (dev->state == XenbusStateClosed) { + /* dpm context is waiting for the backend */ + if (np->freeze_state == NETIF_FREEZE_STATE_FREEZING) + complete(&np->wait_backend_disconnected); break; + } fallthrough; /* Missed the backend's CLOSING state */ case XenbusStateClosing: + /* We may see unexpected Closed or Closing from the backend. + * Just ignore it not to prevent the frontend from being + * re-connected in the case of PM suspend or hibernation. + */ + if (np->freeze_state == NETIF_FREEZE_STATE_FROZEN && + dev->state == XenbusStateInitialising) { + break; + } xenbus_frontend_closed(dev); break; } @@ -2677,6 +2769,9 @@ static struct xenbus_driver netfront_driver = { .probe = netfront_probe, .remove = xennet_remove, .resume = netfront_resume, + .freeze = netfront_freeze, + .thaw = netfront_restore, + .restore = netfront_restore, .otherend_changed = netback_changed, }; From f39ccbb91998d0ea0d21784aae99921ac09b6b40 Mon Sep 17 00:00:00 2001 From: Munehisa Kamata Date: Thu, 13 Jul 2017 07:22:39 +0000 Subject: [PATCH 026/175] xen/time: introduce xen_{save,restore}_steal_clock Currently, the steal time accounting code in the scheduler expects the steal clock callback to provide a monotonically increasing value. If the accounting code receives a smaller value than the previous one, it uses a negative value to calculate steal time and results in incorrectly updated idle and steal time accounting. This breaks userspace tools which read /proc/stat. top - 08:05:35 up 2:12, 3 users, load average: 0.00, 0.07, 0.23 Tasks: 80 total, 1 running, 79 sleeping, 0 stopped, 0 zombie Cpu(s): 0.0%us, 0.0%sy, 0.0%ni,30100.0%id, 0.0%wa, 0.0%hi, 0.0%si,-1253874204672.0%st This can actually happen when a Xen PVHVM guest gets restored from hibernation, because such a restored guest is just a fresh domain from Xen's perspective and the time information in the runstate info starts over from scratch. This patch introduces xen_save_steal_clock() which saves the current values in the runstate info into per-cpu variables. Its counterpart, xen_restore_steal_clock(), sets an offset if it finds that the current values in the runstate info are smaller than the previous ones. xen_steal_clock() is also modified to use the offset to ensure that the scheduler only sees a monotonically increasing number.
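A worked example of the offset arithmetic, with illustrative numbers only: suppose a vCPU had accumulated 800s of steal time when the image was saved, and the fresh runstate after restore reports 0.5s. A minimal sketch of what the logic then yields (names mirror the per-cpu variables in this patch):

	u64 prev = 800ull * NSEC_PER_SEC;  /* xen_prev_steal_clock, saved at suspend */
	u64 now  = NSEC_PER_SEC / 2;       /* fresh runstate value after restore */
	u64 offset = (prev > now) ? prev - now : 0;   /* 799.5s */
	/* xen_steal_clock() now reports now + offset = 800s and keeps growing,
	 * so the scheduler never sees the steal clock jump backwards. */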
Signed-off-by: Munehisa Kamata Signed-off-by: Anchal Agarwal Reviewed-by: Munehisa Kamata Reviewed-by: Eduardo Valentin Signed-off-by: Samuel Mendoza-Jonas --- drivers/xen/time.c | 29 ++++++++++++++++++++++++++++- include/xen/xen-ops.h | 2 ++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/drivers/xen/time.c b/drivers/xen/time.c index 152dd33bb2236..bf41e5cf1332d 100644 --- a/drivers/xen/time.c +++ b/drivers/xen/time.c @@ -24,6 +24,9 @@ static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate); static DEFINE_PER_CPU(u64[4], old_runstate_time); +static DEFINE_PER_CPU(u64, xen_prev_steal_clock); +static DEFINE_PER_CPU(u64, xen_steal_clock_offset); + /* return an consistent snapshot of 64-bit time/counter value */ static u64 get64(const u64 *p) { @@ -150,7 +153,7 @@ bool xen_vcpu_stolen(int vcpu) return per_cpu(xen_runstate, vcpu).state == RUNSTATE_runnable; } -u64 xen_steal_clock(int cpu) +static u64 __xen_steal_clock(int cpu) { struct vcpu_runstate_info state; @@ -158,6 +161,30 @@ u64 xen_steal_clock(int cpu) return state.time[RUNSTATE_runnable] + state.time[RUNSTATE_offline]; } +u64 xen_steal_clock(int cpu) +{ + return __xen_steal_clock(cpu) + per_cpu(xen_steal_clock_offset, cpu); +} + +void xen_save_steal_clock(int cpu) +{ + per_cpu(xen_prev_steal_clock, cpu) = xen_steal_clock(cpu); +} + +void xen_restore_steal_clock(int cpu) +{ + u64 steal_clock = __xen_steal_clock(cpu); + + if (per_cpu(xen_prev_steal_clock, cpu) > steal_clock) { + /* Need to update the offset */ + per_cpu(xen_steal_clock_offset, cpu) = + per_cpu(xen_prev_steal_clock, cpu) - steal_clock; + } else { + /* Avoid unnecessary steal clock warp */ + per_cpu(xen_steal_clock_offset, cpu) = 0; + } +} + void xen_setup_runstate_info(int cpu) { struct vcpu_register_runstate_memory_area area; diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index 9904dd5bf32ec..58231409cdddb 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -38,6 +38,8 @@ void xen_time_setup_guest(void); void xen_manage_runstate_time(int action); void xen_get_runstate_snapshot(struct vcpu_runstate_info *res); u64 xen_steal_clock(int cpu); +void xen_save_steal_clock(int cpu); +void xen_restore_steal_clock(int cpu); int xen_setup_shutdown_event(void); From 46f424f3ea3fdf69a8fbe51e01555b39186ee8de Mon Sep 17 00:00:00 2001 From: Munehisa Kamata Date: Fri, 21 Jul 2017 06:06:12 +0000 Subject: [PATCH 027/175] x86/xen: save and restore steal clock Save steal clock values of all present CPUs in the system core ops suspend callbacks. Also, restore a boot CPU's steal clock in the system core resume callback. For non-boot CPUs, restore after they're brought up, because runstate info for non-boot CPUs are not active until then. 
Signed-off-by: Munehisa Kamata Signed-off-by: Anchal Agarwal Reviewed-by: Munehisa Kamata Reviewed-by: Eduardo Valentin Signed-off-by: Samuel Mendoza-Jonas --- arch/x86/xen/suspend.c | 13 ++++++++++++- arch/x86/xen/time.c | 3 +++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 784c4484100bb..dae0f74f5390d 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -91,12 +91,20 @@ void xen_arch_suspend(void) static int xen_syscore_suspend(void) { struct xen_remove_from_physmap xrfp; - int ret; + int cpu, ret; /* Xen suspend does similar stuffs in its own logic */ if (xen_suspend_mode_is_xen_suspend()) return 0; + for_each_present_cpu(cpu) { + /* + * Nonboot CPUs are already offline, but the last copy of + * runstate info is still accessible. + */ + xen_save_steal_clock(cpu); + } + xrfp.domid = DOMID_SELF; xrfp.gpfn = __pa(HYPERVISOR_shared_info) >> PAGE_SHIFT; @@ -118,6 +126,9 @@ static void xen_syscore_resume(void) pvclock_resume(); + /* Nonboot CPUs will be resumed when they're brought up */ + xen_restore_steal_clock(smp_processor_id()); + gnttab_resume(); } diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 9ef0a5cca96ee..cc11dd2e2f481 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -548,6 +548,9 @@ static void xen_hvm_setup_cpu_clockevents(void) { int cpu = smp_processor_id(); xen_setup_runstate_info(cpu); + if (cpu) + xen_restore_steal_clock(cpu); + /* * xen_setup_timer(cpu) - snprintf is bad in atomic context. Hence * doing it xen_hvm_cpu_notify (which gets called by smp_init during From 8376115e3005b91458984e264f582ae7c5d2e562 Mon Sep 17 00:00:00 2001 From: Munehisa Kamata Date: Thu, 24 Aug 2017 22:54:14 +0000 Subject: [PATCH 028/175] xen/events: add xen_shutdown_pirqs helper function Add a simple helper function to "shutdown" active PIRQs, which actually closes event channels but keeps related IRQ structures intact. PM suspend/hibernation code will rely on this. 
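The first consumer arrives in the next patch of this series, which wires the helper into the syscore suspend path; sketched here for context:

	static int xen_syscore_suspend(void)
	{
		/* ... save steal clocks for all present CPUs ... */
		xen_shutdown_pirqs();   /* close PIRQ event channels, keep IRQ
					 * structures so the PM core can
					 * re-enable the IRQs on resume */
		/* ... unmap shared_info ... */
		return 0;
	}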
Signed-off-by: Munehisa Kamata Signed-off-by: Anchal Agarwal Reviewed-by: Munehisa Kamata Reviewed-by: Eduardo Valentin Signed-off-by: Samuel Mendoza-Jonas --- drivers/xen/events/events_base.c | 12 ++++++++++++ include/xen/events.h | 1 + 2 files changed, 13 insertions(+) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 96b96516c9806..86e9b062d5183 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -2118,6 +2118,18 @@ void xen_irq_resume(void) restore_pirqs(); } +void xen_shutdown_pirqs(void) +{ + struct irq_info *info; + + list_for_each_entry(info, &xen_irq_list_head, list) { + if (info->type != IRQT_PIRQ || !VALID_EVTCHN(info->evtchn)) + continue; + + shutdown_pirq(irq_get_irq_data(info->irq)); + } +} + static struct irq_chip xen_dynamic_chip __read_mostly = { .name = "xen-dyn", diff --git a/include/xen/events.h b/include/xen/events.h index b303bd24e2a6c..9e5e97cd1a460 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -86,6 +86,7 @@ static inline void notify_remote_via_evtchn(evtchn_port_t port) void notify_remote_via_irq(int irq); void xen_irq_resume(void); +void xen_shutdown_pirqs(void); /* Clear an irq's pending state, in preparation for polling on it */ void xen_clear_irq_pending(int irq); From 48a0b163612c43e11c84d30c205e52919f331fec Mon Sep 17 00:00:00 2001 From: Munehisa Kamata Date: Thu, 24 Aug 2017 22:56:36 +0000 Subject: [PATCH 029/175] x86/xen: close event channels for PIRQs in system core suspend callback Close event channels allocated for devices which are backed by PIRQ and still active when suspending the system core. Normally, the devices are emulated legacy devices, e.g. PS/2 keyboard, floppy controller and etc. Without this, in PM hibernation, information about the event channel remains in hibernation image, but there is no guarantee that the same event channel numbers are assigned to the devices when restoring the system. This may cause conflict like the following and prevent some devices from being restored correctly. [ 102.330821] ------------[ cut here ]------------ [ 102.333264] WARNING: CPU: 0 PID: 2324 at drivers/xen/events/events_base.c:878 bind_evtchn_to_irq+0x88/0xf0 ... [ 102.348057] Call Trace: [ 102.348057] [] dump_stack+0x63/0x84 [ 102.348057] [] __warn+0xd1/0xf0 [ 102.348057] [] warn_slowpath_null+0x1d/0x20 [ 102.348057] [] bind_evtchn_to_irq+0x88/0xf0 [ 102.348057] [] ? blkif_copy_from_grant+0xb0/0xb0 [xen_blkfront] [ 102.348057] [] bind_evtchn_to_irqhandler+0x27/0x80 [ 102.348057] [] talk_to_blkback+0x425/0xcd0 [xen_blkfront] [ 102.348057] [] ? __kmalloc+0x1ea/0x200 [ 102.348057] [] blkfront_restore+0x2d/0x60 [xen_blkfront] [ 102.348057] [] xenbus_dev_restore+0x58/0x100 [ 102.348057] [] ? xenbus_frontend_delayed_resume+0x20/0x20 [ 102.348057] [] xenbus_dev_cond_restore+0x1e/0x30 [ 102.348057] [] dpm_run_callback+0x4e/0x130 [ 102.348057] [] device_resume+0xe7/0x210 [ 102.348057] [] ? pm_dev_dbg+0x80/0x80 [ 102.348057] [] dpm_resume+0x114/0x2f0 [ 102.348057] [] hibernation_snapshot+0x15f/0x380 [ 102.348057] [] hibernate+0x183/0x290 [ 102.348057] [] state_store+0xcf/0xe0 [ 102.348057] [] kobj_attr_store+0xf/0x20 [ 102.348057] [] sysfs_kf_write+0x3a/0x50 [ 102.348057] [] kernfs_fop_write+0x10b/0x190 [ 102.348057] [] __vfs_write+0x28/0x120 [ 102.348057] [] ? 
rw_verify_area+0x49/0xb0 [ 102.348057] [] vfs_write+0xb2/0x1b0 [ 102.348057] [] SyS_write+0x46/0xa0 [ 102.348057] [] entry_SYSCALL_64_fastpath+0x1a/0xa9 [ 102.423005] ---[ end trace b8d6718e22e2b107 ]--- [ 102.425031] genirq: Flags mismatch irq 6. 00000000 (blkif) vs. 00000000 (floppy) Note that we don't explicitly re-allocate event channels for such devices in the resume callback. Re-allocation will occur when PM core re-enable IRQs for the devices at later point. Signed-off-by: Munehisa Kamata Signed-off-by: Anchal Agarwal Reviewed-by: Munehisa Kamata Reviewed-by: Eduardo Valentin Signed-off-by: Samuel Mendoza-Jonas --- arch/x86/xen/suspend.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index dae0f74f5390d..affa63d4b6bdc 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -105,6 +105,8 @@ static int xen_syscore_suspend(void) xen_save_steal_clock(cpu); } + xen_shutdown_pirqs(); + xrfp.domid = DOMID_SELF; xrfp.gpfn = __pa(HYPERVISOR_shared_info) >> PAGE_SHIFT; From 1f9e7476c83777a64074985e2f4342b3bc35df61 Mon Sep 17 00:00:00 2001 From: Aleksei Besogonov Date: Fri, 27 Oct 2017 17:59:18 +0000 Subject: [PATCH 030/175] PM / hibernate: update the resume offset on SNAPSHOT_SET_SWAP_AREA The SNAPSHOT_SET_SWAP_AREA is supposed to be used to set the hibernation offset on a running kernel to enable hibernating to a swap file. However, it doesn't actually update the swsusp_resume_block variable. As a result, the hibernation fails at the last step (after all the data is written out) in the validation of the swap signature in mark_swapfiles(). Before this patch, the command line processing was the only place where swsusp_resume_block was set. Signed-off-by: Aleksei Besogonov Signed-off-by: Munehisa Kamata Signed-off-by: Anchal Agarwal Reviewed-by: Munehisa Kamata Reviewed-by: Eduardo Valentin Signed-off-by: Samuel Mendoza-Jonas --- kernel/power/user.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/power/user.c b/kernel/power/user.c index 3a4e70366f354..0d9a3f899c380 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -243,6 +243,10 @@ static int snapshot_set_swap_area(struct snapshot_data *data, if (data->swap < 0) return swdev ? -ENODEV : -EINVAL; data->dev = swdev; + + swsusp_resume_device = swdev; + swsusp_resume_block = offset; + return 0; } From c86378ab4cd92d20ddda9c5676fb44be02e1fb61 Mon Sep 17 00:00:00 2001 From: Anchal Agarwal Date: Tue, 27 Mar 2018 17:23:50 +0000 Subject: [PATCH 031/175] Revert "xen: dont fiddle with event channel masking in suspend/resume" This reverts commit e91b2b1194335ca83d8a40fa4e0efd480bf2babe. evtchn are supposed to be masked during resume however they are not which causes special interrupts like PV spinlock to cause kernel BUG() as its expects the IRQ to be masked. This causes instances that are live migrated successfully to crash after few minutes. 
Signed-off-by: Anchal Agarwal Signed-off-by: Eduardo Valentin Reviewed-by: Frank van der Linden Reviewed-by: Alakesh Haloi Reviewed-by: Vallish Vaidyeshwara Signed-off-by: Samuel Mendoza-Jonas --- drivers/xen/events/events_base.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 86e9b062d5183..52aa2d5199c30 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -550,6 +550,14 @@ static void bind_evtchn_to_cpu(evtchn_port_t evtchn, unsigned int cpu, channels_on_cpu_inc(info); } +static void xen_evtchn_mask_all(void) +{ + evtchn_port_t evtchn; + + for (evtchn = 0; evtchn < xen_evtchn_nr_channels(); evtchn++) + mask_evtchn(evtchn); +} + /** * notify_remote_via_irq - send event to remote end of event channel via irq * @irq: irq of event channel to send event to */ @@ -2098,6 +2106,7 @@ void xen_irq_resume(void) struct irq_info *info; /* New event-channel space is not 'live' yet. */ + xen_evtchn_mask_all(); xen_evtchn_resume(); /* No IRQ <-> event-channel mappings. */ @@ -2288,7 +2297,6 @@ static int xen_evtchn_cpu_dead(unsigned int cpu) void __init xen_init_IRQ(void) { int ret = -EINVAL; - evtchn_port_t evtchn; if (xen_fifo_events) ret = xen_evtchn_fifo_init(); @@ -2308,8 +2316,7 @@ void __init xen_init_IRQ(void) BUG_ON(!evtchn_to_irq); /* No event channels are 'live' right now. */ - for (evtchn = 0; evtchn < xen_evtchn_nr_channels(); evtchn++) - mask_evtchn(evtchn); + xen_evtchn_mask_all(); pirq_needs_eoi = pirq_needs_eoi_flag; From af27756473a38addbce6a4bbedcf7a649eb36562 Mon Sep 17 00:00:00 2001 From: Anchal Agarwal Date: Tue, 5 Jun 2018 20:51:31 +0000 Subject: [PATCH 032/175] xen-blkfront: Fixed blkfront_restore to remove a call to negotiate_mq The code for the talk_to_blkback API changed in kernel-4.14.45 to include a call to negotiate_mq. Subsequent calls cause a kernel panic: [ 84.440105] Call Trace: [ 84.443707] talk_to_blkback+0x6d/0x8b0 [xen_blkfront] [ 84.449147] blkfront_restore+0x33/0x60 [xen_blkfront] [ 84.453336] ? xenbus_read_otherend_details+0x50/0xb0 [ 84.457804] xenbus_dev_cancel+0x5f/0x160 [ 84.463286] ? xenbus_dev_resume+0x170/0x170 [ 84.466891] dpm_run_callback+0x3b/0x100 [ 84.470516] device_resume+0x10d/0x420 [ 84.473844] dpm_resume+0xfd/0x2f0 [ 84.476984] hibernation_snapshot+0x218/0x410 [ 84.480794] hibernate+0x14b/0x270 [ 84.484030] state_store+0x50/0x60 [ 84.487443] kernfs_fop_write+0x105/0x180 [ 84.492695] __vfs_write+0x36/0x160 [ 84.496672] ?
__audit_syscall_entry+0xbc/0x110 [ 84.502123] vfs_write+0xad/0x1a0 [ 84.506857] SyS_write+0x52/0xc0 [ 84.511420] do_syscall_64+0x67/0x100 [ 84.516365] entry_SYSCALL_64_after_hwframe+0x3d/0xa2 [ 84.522571] RIP: 0033:0x7f44a03407e4 [ 84.526210] RSP: 002b:00007ffd5e0ec3c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 84.534041] RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007f44a03407e4 [ 84.542571] RDX: 0000000000000004 RSI: 0000000001e94990 RDI: 0000000000000001 [ 84.549142] RBP: 0000000001e94990 R08: 00007f44a060c8c0 R09: 00007f44a0c57740 [ 84.554658] R10: 00007f44a03cd320 R11: 0000000000000246 R12: 0000000000000004 [ 84.560411] R13: 0000000000000001 R14: 00007f44a060b760 R15: 0000000000000004 [ 84.565744] Code: 39 ab e8 00 00 00 77 8a 31 c0 5b 5d c3 44 8b 05 50 57 00 00 45 85 c0 0f 84 2f ff ff ff 89 c0 48 69 f8 e0 40 01 00 e9 30 ff ff ff <0f> 0b 48 8b 7b 28 48 c7 c2 78 58 16 a0 be f4 ff ff ff e8 7e 37 [ 84.580594] RIP: negotiate_mq+0x12b/0x150 [xen_blkfront] RSP: ffffc90000ebbc70 Signed-off-by: Anchal Agarwal Reviewed-by: Frank van der Linden Reviewed-by: Vallish Vaidyeshwara Signed-off-by: Samuel Mendoza-Jonas --- drivers/block/xen-blkfront.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index ab2d9c23b8ffd..9647dc92b27c1 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -2630,11 +2630,6 @@ static int blkfront_restore(struct xenbus_device *dev) { struct blkfront_info *info = dev_get_drvdata(&dev->dev); int err = 0; - - err = negotiate_mq(info); - if (err) - goto out; - err = talk_to_blkback(dev, info); if (err) goto out; From 389b52cacf4b76841130fdb684b82b0e98148972 Mon Sep 17 00:00:00 2001 From: Eduardo Valentin Date: Mon, 9 Apr 2018 20:50:29 +0000 Subject: [PATCH 033/175] x86: tsc: avoid system instability in hibernation System instability is seen during resume from hibernation when the system is under heavy CPU load. This is due to the lack of an update of the sched clock data: the scheduler then thinks that heavy CPU hog tasks need more CPU time, causing the system to freeze during the unfreezing of tasks. For example, threaded IRQs and kernel processes servicing the network interface may be delayed for several tens of seconds, causing the system to be unreachable. Situations like this can be reported by lockup detectors such as the workqueue lockup detector: [root@ip-172-31-67-114 ec2-user]# echo disk > /sys/power/state Message from syslogd@ip-172-31-67-114 at May 7 18:23:21 ... kernel:BUG: workqueue lockup - pool cpus=0 node=0 flags=0x0 nice=0 stuck for 57s! Message from syslogd@ip-172-31-67-114 at May 7 18:23:21 ... kernel:BUG: workqueue lockup - pool cpus=1 node=0 flags=0x0 nice=0 stuck for 57s! Message from syslogd@ip-172-31-67-114 at May 7 18:23:21 ... kernel:BUG: workqueue lockup - pool cpus=3 node=0 flags=0x1 nice=0 stuck for 57s! Message from syslogd@ip-172-31-67-114 at May 7 18:29:06 ... kernel:BUG: workqueue lockup - pool cpus=3 node=0 flags=0x1 nice=0 stuck for 403s! The fix for this situation is to mark the sched clock as unstable as early as possible in the resume path, leaving it unstable for the duration of the resume process. This will force the scheduler to attempt to align the sched clock across CPUs using the delta with time of day, updating the sched clock data. In a post-hibernation event, we can then mark the sched clock as stable again, avoiding unnecessary syncs with time of day on systems in which the TSC is reliable.
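The diff below wires this up through a PM notifier. As a standalone illustration of that same API (register_pm_notifier() and the PM_HIBERNATION_PREPARE / PM_POST_HIBERNATION events from linux/suspend.h; the module and its dbg_pm_* names are hypothetical, not part of this patch), a minimal module that logs the two transitions looks like:

#include <linux/module.h>
#include <linux/suspend.h>

/* Hypothetical debug module: observe the PM events this patch hooks. */
static int dbg_pm_notifier(struct notifier_block *nb,
			   unsigned long event, void *unused)
{
	switch (event) {
	case PM_HIBERNATION_PREPARE:
		pr_info("dbg_pm: hibernation starting, sched clock goes unstable here\n");
		break;
	case PM_POST_HIBERNATION:
		pr_info("dbg_pm: hibernation finished, sched clock may be stable again\n");
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block dbg_pm_nb = {
	.notifier_call = dbg_pm_notifier,
};

static int __init dbg_pm_init(void)
{
	return register_pm_notifier(&dbg_pm_nb);
}

static void __exit dbg_pm_exit(void)
{
	unregister_pm_notifier(&dbg_pm_nb);
}

module_init(dbg_pm_init);
module_exit(dbg_pm_exit);
MODULE_LICENSE("GPL");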
Reviewed-by: Erik Quanstrom Reviewed-by: Frank van der Linden Reviewed-by: Balbir Singh Reviewed-by: Munehisa Kamata Tested-by: Anchal Agarwal Signed-off-by: Eduardo Valentin [6.1: Account for notrace] Signed-off-by: Samuel Mendoza-Jonas --- arch/x86/kernel/tsc.c | 29 +++++++++++++++++++++++++++++ include/linux/sched/clock.h | 5 +++++ kernel/sched/clock.c | 4 ++-- 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index cafacb2e58cce..cece805541284 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -15,6 +15,7 @@ #include #include #include +#include <linux/suspend.h> #include #include @@ -1575,3 +1576,31 @@ unsigned long calibrate_delay_is_known(void) return 0; } #endif + +static int tsc_pm_notifier(struct notifier_block *notifier, + unsigned long pm_event, void *unused) +{ + switch (pm_event) { + case PM_HIBERNATION_PREPARE: + clear_sched_clock_stable(); + break; + case PM_POST_HIBERNATION: + /* Set back to the default */ + if (!check_tsc_unstable()) + set_sched_clock_stable(); + break; + } + + return 0; +}; + +static struct notifier_block tsc_pm_notifier_block = { + .notifier_call = tsc_pm_notifier, +}; + +static int tsc_setup_pm_notifier(void) +{ + return register_pm_notifier(&tsc_pm_notifier_block); +} + +subsys_initcall(tsc_setup_pm_notifier); diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h index 867d588314e03..902654ac5f7e7 100644 --- a/include/linux/sched/clock.h +++ b/include/linux/sched/clock.h @@ -32,6 +32,10 @@ static inline void clear_sched_clock_stable(void) { } +static inline void set_sched_clock_stable(void) +{ +} + static inline void sched_clock_idle_sleep_event(void) { } @@ -51,6 +55,7 @@ static inline u64 local_clock(void) } #else extern int sched_clock_stable(void); +extern void set_sched_clock_stable(void); extern void clear_sched_clock_stable(void); /* diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index e374c0c923dae..1b3fb1604a536 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -114,7 +114,7 @@ notrace static void __scd_stamp(struct sched_clock_data *scd) scd->tick_raw = sched_clock(); } -notrace static void __set_sched_clock_stable(void) +notrace void set_sched_clock_stable(void) { struct sched_clock_data *scd; @@ -234,7 +234,7 @@ static int __init sched_clock_init_late(void) smp_mb(); /* matches {set,clear}_sched_clock_stable() */ if (__sched_clock_stable_early) - __set_sched_clock_stable(); + set_sched_clock_stable(); return 0; } From 66bcb92a8c23338f288c9d1b4de52d808de727e2 Mon Sep 17 00:00:00 2001 From: Eduardo Valentin Date: Thu, 18 Oct 2018 22:13:48 +0000 Subject: [PATCH 034/175] block: xen-blkfront: consider new dom0 features on restore On a regular start, the instance performs a regular boot, in which the rootfs is mounted according to the xen-blkback features (in particular feature-barrier and feature-flush-cache). That will set up the journal according to the provided features on the superblock (SB). On a start from hibernation, the instance boots, detects that a hibernation image is present, pushes the image to memory, and jumps back to where it was. There is no regular mount of the rootfs; it uses the data structures already in the previously saved memory image. Now, when the instance hibernates, it may move from its original dom0 to a new dom0 when it is restarted. So, given the above, if the xen-blkback features change, the guest can be in trouble. The original assumption, it seems, was that the dom0 environment would be preserved.
I did a couple of experiments, and I confirm that these particular features change quite a lot across hibernation attempts: [ 2343.157903] blkfront: xvda: barrier or flush: disabled; persistent grants: disabled; indirect descriptors: enabled; [ 2444.712339] blkfront: xvda: barrier or flush: disabled; persistent grants: disabled; indirect descriptors: enabled; [ 2537.105884] blkfront: xvda: flush diskcache: enabled; persistent grants: disabled; indirect descriptors: enabled; [ 2636.641298] blkfront: xvda: barrier or flush: disabled; persistent grants: disabled; indirect descriptors: enabled; [ 2729.868349] blkfront: xvda: flush diskcache: enabled; persistent grants: disabled; indirect descriptors: enabled; [ 2827.118979] blkfront: xvda: flush diskcache: enabled; persistent grants: disabled; indirect descriptors: enabled; [ 2924.812599] blkfront: xvda: flush diskcache: enabled; persistent grants: disabled; indirect descriptors: enabled; [ 3018.063399] blkfront: xvda: flush diskcache: enabled; persistent grants: disabled; indirect descriptors: enabled; [ 3116.685040] blkfront: xvda: flush diskcache: enabled; persistent grants: disabled; indirect descriptors: enabled; [ 3209.164475] blkfront: xvda: barrier or flush: disabled; persistent grants: disabled; indirect descriptors: enabled; [ 3317.981362] blkfront: xvda: barrier or flush: disabled; persistent grants: disabled; indirect descriptors: enabled; [ 3415.939725] blkfront: xvda: flush diskcache: enabled; persistent grants: disabled; indirect descriptors: enabled; [ 3514.202478] blkfront: xvda: barrier or flush: disabled; persistent grants: disabled; indirect descriptors: enabled; [ 3619.355791] blkfront: xvda: barrier or flush: disabled; persistent grants: disabled; indirect descriptors: enabled; Now, considering the above, this patch fixes the following scenario: a. The instance boots and sets up its bio queue on dom0 A, which supports softbarrier. b. It hibernates. c. When asked to restore, the instance is back on dom0 B, where softbarrier is unsupported. d. Restoration goes well until the next journal commit is issued. Remember that it is still using the previous image's rootfs data structures, and is therefore going to request a softbarrier. e. The bio will error out with an "operation not supported" message, causing the journal to fail, and the filesystem will decide to remount the rootfs as RO. [ 1138.909290] print_req_error: operation not supported error, dev xvda, sector 4470400, flags 6008 [ 1139.025685] Aborting journal on device xvda1-8. [ 1139.029758] print_req_error: operation not supported error, dev xvda, sector 4460544, flags 26008 [ 1139.326119] Buffer I/O error on dev xvda1, logical block 0, lost sync page write [ 1139.331398] EXT4-fs error (device xvda1): ext4_journal_check_start:61: Detected aborted journal [ 1139.337296] EXT4-fs (xvda1): Remounting filesystem read-only [ 1139.341006] EXT4-fs (xvda1): previous I/O error to superblock detected [ 1139.345704] print_req_error: operation not supported error, dev xvda, sector 4096, flags 26008 The fix is essentially to read xenbus to query the new xen-blkback capabilities and update the request queue with them.
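The two calls added below, blkfront_gather_backend_features() and xlvbd_flush(), do exactly that. As a hedged sketch of the underlying xenstore access (requery_backend_features() is a hypothetical helper, but xenbus_read_unsigned() and the feature node names are the ones discussed above):

#include <linux/printk.h>
#include <xen/xenbus.h>

/*
 * Hypothetical helper, not part of this patch: re-read the backend
 * feature nodes after a dom0 move. blkfront_gather_backend_features()
 * does the real equivalent, and xlvbd_flush() pushes the result into
 * the request queue's flush/FUA settings.
 */
static void requery_backend_features(struct xenbus_device *xbdev)
{
	unsigned int flush, barrier;

	/* otherend is the backend's xenstore directory for this device */
	flush = xenbus_read_unsigned(xbdev->otherend,
				     "feature-flush-cache", 0);
	barrier = xenbus_read_unsigned(xbdev->otherend,
				       "feature-barrier", 0);

	pr_info("backend features: flush=%u barrier=%u\n", flush, barrier);
}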
Reviewed-by: Balbir Singh Reviewed-by: Vallish Vaidyeshwara Signed-off-by: Eduardo Valentin Signed-off-by: Samuel Mendoza-Jonas --- drivers/block/xen-blkfront.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 9647dc92b27c1..ce01525493d85 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -2630,6 +2630,9 @@ static int blkfront_restore(struct xenbus_device *dev) { struct blkfront_info *info = dev_get_drvdata(&dev->dev); int err = 0; + + blkfront_gather_backend_features(info); + xlvbd_flush(info); err = talk_to_blkback(dev, info); if (err) goto out; From f0035c1a11282727e98ae6b99efec68ad1afccf4 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 26 Oct 2018 21:27:54 +0000 Subject: [PATCH 035/175] xen: restore pirqs on resume from hibernation. The hibernation code unlinks event channels from these (legacy) IRQs, so they must be reinitialized on wakeup, much like in the Xen suspend/resume case. Signed-off-by: Frank van der Linden Reviewed-by: Cristian Gafton Reviewed-by: Anchal Agarwal Reviewed-by: Alakesh Haloi Signed-off-by: Samuel Mendoza-Jonas --- arch/x86/xen/suspend.c | 2 ++ drivers/xen/events/events_base.c | 5 +++++ include/xen/events.h | 1 + 3 files changed, 8 insertions(+) diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index affa63d4b6bdc..39644923b623e 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -132,6 +132,8 @@ static void xen_syscore_resume(void) xen_restore_steal_clock(smp_processor_id()); gnttab_resume(); + + xen_restore_pirqs(); } /* diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 52aa2d5199c30..b8fb81ff5d0d4 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -2139,6 +2139,11 @@ void xen_shutdown_pirqs(void) } } +void xen_restore_pirqs(void) +{ + restore_pirqs(); +} + static struct irq_chip xen_dynamic_chip __read_mostly = { .name = "xen-dyn", diff --git a/include/xen/events.h b/include/xen/events.h index 9e5e97cd1a460..52f635c6ce94b 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -87,6 +87,7 @@ void notify_remote_via_irq(int irq); void xen_irq_resume(void); void xen_shutdown_pirqs(void); +void xen_restore_pirqs(void); /* Clear an irq's pending state, in preparation for polling on it */ void xen_clear_irq_pending(int irq); From 83663c67e37a8104ffd857573721e56f5f12f121 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Sat, 10 Nov 2018 00:18:32 +0000 Subject: [PATCH 036/175] xen: Only restore the ACPI SCI interrupt in xen_restore_pirqs. Restoring all PIRQs, which is the right thing to do, was causing problems on larger instances. This is a horrible workaround until this issue is fully understood. Signed-off-by: Frank van der Linden Reviewed-by: Alakesh Haloi Reviewed-by: Anchal Agarwal Reviewed-by: Qian Lu Signed-off-by: Samuel Mendoza-Jonas --- drivers/xen/events/events_base.c | 42 +++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index b8fb81ff5d0d4..7ae1f1d9fb18e 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -68,6 +68,10 @@ #include #include +#ifdef CONFIG_ACPI +#include <linux/acpi.h> +#endif + #include "events_internal.h" #undef MODULE_PARAM_PREFIX @@ -2139,9 +2143,45 @@ void xen_shutdown_pirqs(void) } } +/* + * For now, only restore the ACPI SCI pirq.
+ */ void xen_restore_pirqs(void) { -	restore_pirqs(); +#ifdef CONFIG_ACPI + int pirq, rc, irq, gsi; + struct physdev_map_pirq map_irq; + struct irq_info *info; + + list_for_each_entry(info, &xen_irq_list_head, list) { + if (info->type != IRQT_PIRQ) + continue; + + pirq = info->u.pirq.pirq; + gsi = info->u.pirq.gsi; + irq = info->irq; + + if (gsi != acpi_gbl_FADT.sci_interrupt) + continue; + + map_irq.domid = DOMID_SELF; + map_irq.type = MAP_PIRQ_TYPE_GSI; + map_irq.index = gsi; + map_irq.pirq = pirq; + + rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); + if (rc) { + pr_warn("xen: ACPI SCI interrupt map failed, rc=%d\n", + rc); + xen_free_irq(irq); + continue; + } + + printk(KERN_DEBUG "xen: restored ACPI SCI interrupt\n"); + + __startup_pirq(irq); + } +#endif } static struct irq_chip xen_dynamic_chip __read_mostly = { .name = "xen-dyn", From 7829314ecc619ddd19070224d7ff2b7c0ef94e21 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Thu, 31 Jan 2019 21:50:37 +0000 Subject: [PATCH 037/175] xen-netfront: call netif_device_attach on resume When xennet_connect is called in the resume path, it needs to re-attach the netif; otherwise it will no longer be found by various operations (such as ethtool ioctls, etc). Signed-off-by: Frank van der Linden Reviewed-by: Alakesh Haloi Reviewed-by: Vallish Vaidyeshwara Signed-off-by: Samuel Mendoza-Jonas --- drivers/net/xen-netfront.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 7f09ee596ca6f..a608901bade5d 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -2524,6 +2524,13 @@ static int xennet_connect(struct net_device *dev) device_unregister(&np->xbdev->dev); return err; } + } else { + /* + * In the resume / thaw case, the netif needs to be + * reattached, as it was detached in netfront_freeze(). + */ + if (np->freeze_state == NETIF_FREEZE_STATE_FROZEN) + netif_device_attach(dev); } rtnl_lock(); From f23f294157003bbef4c729ac14197abb0584145d Mon Sep 17 00:00:00 2001 From: Anchal Agarwal Date: Thu, 15 Aug 2019 22:26:27 +0000 Subject: [PATCH 038/175] xen: Restore xen-pirqs on resume from hibernation shutdown_pirq is invoked during the hibernation path, and hence PIRQs should be restarted during resume. [Commit: xen: Only restore the ACPI SCI interrupt in xen_restore_pirqs] restores only the ACPI SCI interrupt; however, that is not the right thing to do, as all pirqs should be enabled as part of resume_device_irqs when device interrupts are resumed. Apparently, chip->irq_startup is called only if IRQD_IRQ_STARTED is unset during irq_startup on resume. This flag gets cleared by free_irq->irq_shutdown during suspend. free_irq() never gets explicitly called for ioapic-edge and ioapic-level interrupts, as the respective drivers do nothing during suspend/resume, so we shut them down explicitly in the syscore_suspend path in the first place to clear the IRQ<->event channel mapping. shutdown_pirq being called explicitly during suspend does not clear this flag, hence .irq_enable is called in irq_startup during resume instead and pirqs never start up. This commit exports the irq_state_clr_started API so the flag can be cleared during shutdown_pirq. Also, following the order in which ipis/virqs/pirqs are restored during xen resume, the same order should be followed for the hibernation path. As per the flow of hibernation_platform_enter, we should not restore pirqs explicitly in the syscore_resume ops; it should be done in the resume-devices path.
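For reference, the started-flag check described above sits in irq_startup(); a simplified paraphrase of that branch (not the verbatim kernel/irq/chip.c code, irq_startup_sketch is our name for it, and it only compiles inside kernel/irq/ where irq_enable() and __irq_startup() are visible):

/*
 * Simplified paraphrase of the irq_startup() decision in
 * kernel/irq/chip.c: if IRQD_IRQ_STARTED is still set from before
 * suspend, only irq_enable() runs on resume and chip->irq_startup()
 * (which for a Xen PIRQ rebinds the event channel) is never called.
 * Clearing the flag in shutdown_pirq() forces the __irq_startup()
 * path instead.
 */
static int irq_startup_sketch(struct irq_desc *desc)
{
	struct irq_data *d = irq_desc_get_irq_data(desc);

	if (irqd_is_started(d)) {
		irq_enable(desc);	/* flag set: no ->irq_startup() */
		return 0;
	}

	return __irq_startup(desc);	/* invokes chip->irq_startup() */
}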
Signed-off-by: Anchal Agarwal Signed-off-by: Samuel Mendoza-Jonas --- arch/x86/xen/suspend.c | 1 - drivers/xen/events/events_base.c | 42 +------------------------------- include/linux/irq.h | 2 ++ kernel/irq/chip.c | 4 +-- 4 files changed, 5 insertions(+), 44 deletions(-) diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 39644923b623e..8be6ffa6bfbea 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -133,7 +133,6 @@ static void xen_syscore_resume(void) gnttab_resume(); - xen_restore_pirqs(); } /* diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 7ae1f1d9fb18e..048f8ea9c1b1e 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -2140,50 +2140,10 @@ void xen_shutdown_pirqs(void) continue; shutdown_pirq(irq_get_irq_data(info->irq)); + irq_state_clr_started(irq_to_desc(info->irq)); } } -/* - * For now, only restore the ACPI SCI pirq. - */ -void xen_restore_pirqs(void) -{ -#ifdef CONFIG_ACPI - int pirq, rc, irq, gsi; - struct physdev_map_pirq map_irq; - struct irq_info *info; - - list_for_each_entry(info, &xen_irq_list_head, list) { - if (info->type != IRQT_PIRQ) - continue; - - pirq = info->u.pirq.pirq; - gsi = info->u.pirq.gsi; - irq = info->irq; - - if (gsi != acpi_gbl_FADT.sci_interrupt) - continue; - - map_irq.domid = DOMID_SELF; - map_irq.type = MAP_PIRQ_TYPE_GSI; - map_irq.index = gsi; - map_irq.pirq = pirq; - - rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); - if (rc) { - pr_warn("xen: ACPI SCI interrupt map failed, rc=%d\n", - rc); - xen_free_irq(irq); - continue; - } - - printk(KERN_DEBUG "xen: restored ACPI SCI interrupt\n"); - - __startup_pirq(irq); - } -#endif -} - static struct irq_chip xen_dynamic_chip __read_mostly = { .name = "xen-dyn", diff --git a/include/linux/irq.h b/include/linux/irq.h index 06c692cc0accb..9509c21b07452 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -799,6 +799,8 @@ extern int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset, struct msi_desc *entry); extern struct irq_data *irq_get_irq_data(unsigned int irq); +extern void irq_state_clr_started(struct irq_desc *desc); + static inline struct irq_chip *irq_get_chip(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 8ac37e8e738a3..891a895ac218d 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -170,11 +170,11 @@ static void irq_state_clr_masked(struct irq_desc *desc) irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED); } -static void irq_state_clr_started(struct irq_desc *desc) +void irq_state_clr_started(struct irq_desc *desc) { irqd_clear(&desc->irq_data, IRQD_IRQ_STARTED); } - +EXPORT_SYMBOL_GPL(irq_state_clr_started); static void irq_state_set_started(struct irq_desc *desc) { irqd_set(&desc->irq_data, IRQD_IRQ_STARTED); From 1fa9103e06dff502f677ad7c7b3dc3e3bfb07844 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Wed, 27 Nov 2019 22:14:28 +0000 Subject: [PATCH 039/175] block/xen-blkfront: bump the maximum number of indirect segments up to 64 Bump the maximum number of indirect segments up to 64. For blk-mq without a scheduler, which is the default for multi-queued devices, this is needed to avoid a throughput regression for 'st1' EBS volumes. 
On a c4.8xlarge instance: sudo fio --bs=1M --name=seqread --ioengine=libaio --rw=read --direct=1 --filename=/dev/sdg --clocksource=clock_gettime --size=1G --numjobs=8 before: READ: io=8192.0MB, aggrb=44772KB/s, minb=5596KB/s, maxb=5922KB/s, mint=177050msec, maxt=187361msec after: READ: io=8192.0MB, aggrb=136059KB/s, minb=17007KB/s, maxb=17022KB/s, mint=61600msec, maxt=61654msec Signed-off-by: Frank van der Linden Signed-off-by: Samuel Mendoza-Jonas --- drivers/block/xen-blkfront.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index ce01525493d85..5e5a600a543bf 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -139,7 +139,7 @@ static LIST_HEAD(info_list); * by the backend driver. */ -static unsigned int xen_blkif_max_segments = 32; +static unsigned int xen_blkif_max_segments = 64; module_param_named(max_indirect_segments, xen_blkif_max_segments, uint, 0444); MODULE_PARM_DESC(max_indirect_segments, "Maximum amount of segments in indirect requests (default is 32)"); From 2688adaeb435bc503c88812799d8727cf791badd Mon Sep 17 00:00:00 2001 From: Samuel Mendoza-Jonas Date: Fri, 20 Jan 2023 18:09:39 +0000 Subject: [PATCH 040/175] Revert "PCI/MSI: Let core code free MSI descriptors" This reverts commit 9fb9eb4b59acc607e978288c96ac7efa917153d4. This commit causes issues with hibernation on Xen guests (null-pointer exception on freeze), most likely due to the Xen driver trying to work on interrupts after they've been freed. Until the downstream Xen patches are updated to work properly, revert this change. This is modified to still use the new accessors introduced in the same series, and just call out to msi_free_msi_descs(), which does exactly the same work. Signed-off-by: Samuel Mendoza-Jonas --- drivers/pci/msi/irqdomain.c | 1 - drivers/pci/msi/msi.c | 10 ++++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/pci/msi/irqdomain.c b/drivers/pci/msi/irqdomain.c index 34877a1f43a15..2d5083003987f 100644 --- a/drivers/pci/msi/irqdomain.c +++ b/drivers/pci/msi/irqdomain.c @@ -28,7 +28,6 @@ void pci_msi_teardown_msi_irqs(struct pci_dev *dev) msi_domain_free_irqs_descs_locked(domain, &dev->dev); else pci_msi_legacy_teardown_msi_irqs(dev); - msi_free_msi_descs(&dev->dev); } /** diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c index fdd2ec09651e9..5474dd02b3eb0 100644 --- a/drivers/pci/msi/msi.c +++ b/drivers/pci/msi/msi.c @@ -224,8 +224,18 @@ EXPORT_SYMBOL_GPL(pci_write_msi_msg); static void free_msi_irqs(struct pci_dev *dev) { + struct msi_desc *desc; + int i; + + msi_for_each_desc(desc, &dev->dev, MSI_DESC_NOTASSOCIATED) + if (desc->irq) + for (i = 0; i < desc->nvec_used; i++) + BUG_ON(irq_has_action(desc->irq + i)); + pci_msi_teardown_msi_irqs(dev); + msi_free_msi_descs(&dev->dev); + if (dev->msix_base) { iounmap(dev->msix_base); dev->msix_base = NULL; From 40d7f321873fb8105d6c0b0bc7298c49182f3250 Mon Sep 17 00:00:00 2001 From: Samuel Mendoza-Jonas Date: Tue, 24 Jan 2023 20:12:36 +0000 Subject: [PATCH 041/175] Revert "xen/x2apic: enable x2apic mode when supported for HVM" This reverts commit c8980fcb210851138cb34c9a8cb0cf0c09f07bf9. This commit causes issues with hibernation on Xen guests on thaw, resulting in all CPUs aside from 0 failing to come up. While the commit is meant to be functionally equivalent, Xen guests would previously trigger on xen_feature(XENFEAT_hvm_pirqs) && xen_have_vector_callback and not enable x2apic support.
Now that they do, there appear to be issues with x2apic support and the downstream hibernation patches. While that is investigated, revert this change. Signed-off-by: Samuel Mendoza-Jonas --- arch/x86/include/asm/xen/hypervisor.h | 14 ++++++++++++++ arch/x86/xen/enlighten_hvm.c | 12 +++++++++--- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 16f548a661cf6..1bf2ad34188ad 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -43,6 +43,20 @@ static inline uint32_t xen_cpuid_base(void) return hypervisor_cpuid_base("XenVMMXenVMM", 2); } +#ifdef CONFIG_XEN +extern bool __init xen_hvm_need_lapic(void); + +static inline bool __init xen_x2apic_para_available(void) +{ + return xen_hvm_need_lapic(); +} +#else +static inline bool __init xen_x2apic_para_available(void) +{ + return (xen_cpuid_base() != 0); +} +#endif + struct pci_dev; #ifdef CONFIG_XEN_PV_DOM0 diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index ec87cd2def64c..2d1284f8cd2ae 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -266,9 +266,15 @@ static __init int xen_parse_no_vector_callback(char *arg) } early_param("xen_no_vector_callback", xen_parse_no_vector_callback); -static __init bool xen_x2apic_available(void) +bool __init xen_hvm_need_lapic(void) { - return x2apic_supported(); + if (xen_pv_domain()) + return false; + if (!xen_hvm_domain()) + return false; + if (xen_feature(XENFEAT_hvm_pirqs) && xen_have_vector_callback) + return false; + return true; } static bool __init msi_ext_dest_id(void) @@ -335,7 +341,7 @@ struct hypervisor_x86 x86_hyper_xen_hvm __initdata = { .detect = xen_platform_hvm, .type = X86_HYPER_XEN_HVM, .init.init_platform = xen_hvm_guest_init, - .init.x2apic_available = xen_x2apic_available, + .init.x2apic_available = xen_x2apic_para_available, .init.init_mem_mapping = xen_hvm_init_mem_mapping, .init.guest_late_init = xen_hvm_guest_late_init, .init.msi_ext_dest_id = msi_ext_dest_id, From 31a1d12789ce4f4c1156b0f28faa8ddb49ea949c Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Mon, 20 Mar 2023 02:39:35 +0000 Subject: [PATCH 042/175] msr: disable MSR writes by default Suggested by Benjamin Herrenschmidt, this increases security by disallowing user space from changing CPU behavior. MSR writes can be re-enabled by doing one of the following: o Writing "on" or "default" to /sys/module/msr/parameters/allow_writes o Booting with msr.allow_writes=on Signed-off-by: Luiz Capitulino --- arch/x86/kernel/msr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index ed8ac6bcbafb2..a780572062bdc 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -48,7 +48,7 @@ enum allow_write_msrs { MSR_WRITES_DEFAULT, }; -static enum allow_write_msrs allow_writes = MSR_WRITES_DEFAULT; +static enum allow_write_msrs allow_writes = MSR_WRITES_OFF; static ssize_t msr_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) From c08c396aa00b983014138cde6a4a769e11044d97 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 3 Mar 2023 07:51:43 +0000 Subject: [PATCH 043/175] udp: Fix memleaks of sk and zerocopy skbs with TX timestamp. syzkaller reported [0] memory leaks of a UDP socket and ZEROCOPY skbs.
We can reproduce the problem with this sequence: sk = socket(AF_INET, SOCK_DGRAM, 0) sk.setsockopt(SOL_SOCKET, SO_TIMESTAMPING, SOF_TIMESTAMPING_TX_SOFTWARE) sk.setsockopt(SOL_SOCKET, SO_ZEROCOPY, 1) sk.sendto(b'', MSG_ZEROCOPY, ('127.0.0.1', 53)) sk.close() sendmsg() calls msg_zerocopy_alloc(), which allocates an skb, sets skb->cb->ubuf.refcnt to 1, and calls sock_hold(). Here, struct ubuf_info_msgzc indirectly holds a refcnt of the socket. When the skb is sent, __skb_tstamp_tx() clones it and puts the clone into the socket's error queue with the TX timestamp. When the original skb is received locally, skb_copy_ubufs() calls skb_unclone(), and pskb_expand_head() increments skb->cb->ubuf.refcnt. This additional count is decremented while freeing the skb, but struct ubuf_info_msgzc still has a refcnt, so __msg_zerocopy_callback() is not called. The last refcnt is not released unless we retrieve the TX-timestamped skb by recvmsg(). When we close() a socket holding such an skb, we never call sock_put() and leak the count. To avoid this problem, we must call skb_queue_purge() while we close() UDP sockets. Note that TCP does not have this problem because skb_queue_purge() is called by sk_stream_kill_queues() during close(). [0]: BUG: memory leak unreferenced object 0xffff88800c6d2d00 (size 1152): comm "syz-executor392", pid 264, jiffies 4294785440 (age 13.044s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 cd af e8 81 00 00 00 00 ................ 02 00 07 40 00 00 00 00 00 00 00 00 00 00 00 00 ...@............ backtrace: [<0000000055636812>] sk_prot_alloc+0x64/0x2a0 net/core/sock.c:2024 [<0000000054d77b7a>] sk_alloc+0x3b/0x800 net/core/sock.c:2083 [<0000000066f3c7e0>] inet_create net/ipv4/af_inet.c:319 [inline] [<0000000066f3c7e0>] inet_create+0x31e/0xe40 net/ipv4/af_inet.c:245 [<000000009b83af97>] __sock_create+0x2ab/0x550 net/socket.c:1515 [<00000000b9b11231>] sock_create net/socket.c:1566 [inline] [<00000000b9b11231>] __sys_socket_create net/socket.c:1603 [inline] [<00000000b9b11231>] __sys_socket_create net/socket.c:1588 [inline] [<00000000b9b11231>] __sys_socket+0x138/0x250 net/socket.c:1636 [<000000004fb45142>] __do_sys_socket net/socket.c:1649 [inline] [<000000004fb45142>] __se_sys_socket net/socket.c:1647 [inline] [<000000004fb45142>] __x64_sys_socket+0x73/0xb0 net/socket.c:1647 [<0000000066999e0e>] do_syscall_x64 arch/x86/entry/common.c:50 [inline] [<0000000066999e0e>] do_syscall_64+0x38/0x90 arch/x86/entry/common.c:80 [<0000000017f238c1>] entry_SYSCALL_64_after_hwframe+0x63/0xcd BUG: memory leak unreferenced object 0xffff888017633a00 (size 240): comm "syz-executor392", pid 264, jiffies 4294785440 (age 13.044s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 2d 6d 0c 80 88 ff ff .........-m.....
backtrace: [<000000002b1c4368>] __alloc_skb+0x229/0x320 net/core/skbuff.c:497 [<00000000143579a6>] alloc_skb include/linux/skbuff.h:1265 [inline] [<00000000143579a6>] sock_omalloc+0xaa/0x190 net/core/sock.c:2596 [<00000000be626478>] msg_zerocopy_alloc net/core/skbuff.c:1294 [inline] [<00000000be626478>] msg_zerocopy_realloc+0x1ce/0x7f0 net/core/skbuff.c:1370 [<00000000cbfc9870>] __ip_append_data+0x2adf/0x3b30 net/ipv4/ip_output.c:1037 [<0000000089869146>] ip_make_skb+0x26c/0x2e0 net/ipv4/ip_output.c:1652 [<00000000098015c2>] udp_sendmsg+0x1bac/0x2390 net/ipv4/udp.c:1253 [<0000000045e0e95e>] inet_sendmsg+0x10a/0x150 net/ipv4/af_inet.c:819 [<000000008d31bfde>] sock_sendmsg_nosec net/socket.c:714 [inline] [<000000008d31bfde>] sock_sendmsg+0x141/0x190 net/socket.c:734 [<0000000021e21aa4>] __sys_sendto+0x243/0x360 net/socket.c:2117 [<00000000ac0af00c>] __do_sys_sendto net/socket.c:2129 [inline] [<00000000ac0af00c>] __se_sys_sendto net/socket.c:2125 [inline] [<00000000ac0af00c>] __x64_sys_sendto+0xe1/0x1c0 net/socket.c:2125 [<0000000066999e0e>] do_syscall_x64 arch/x86/entry/common.c:50 [inline] [<0000000066999e0e>] do_syscall_64+0x38/0x90 arch/x86/entry/common.c:80 [<0000000017f238c1>] entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: b5947e5d1e71 ("udp: msg_zerocopy") Signed-off-by: Kuniyuki Iwashima --- include/net/udp.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/net/udp.h b/include/net/udp.h index fa4cdbe55552c..17a78e61692b7 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -186,6 +186,11 @@ void udp_lib_rehash(struct sock *sk, u16 new_hash); static inline void udp_lib_close(struct sock *sk, long timeout) { + /* A zerocopy skb has a refcnt of sk and may be + * put into sk_error_queue with TX timestamp + */ + skb_queue_purge(&sk->sk_error_queue); + sk_common_release(sk); } From 9cf5a0290356dd2dbc5cd3a0dba90074dbca2378 Mon Sep 17 00:00:00 2001 From: Samuel Mendoza-Jonas Date: Thu, 30 Mar 2023 15:05:55 +0000 Subject: [PATCH 044/175] ENA: Update to v2.8.3 Source: https://github.com/amzn/amzn-drivers/ Change Log: ## r2.8.3 release notes **New Features** * PHC module param enablement * PHC devlink param enablement * Add hint for interrupt moderation for the device * Change initial static RX interrupt moderation interval * Enable DIM by default on all CPU Architectures **Bug Fixes** * DMA sync for CPU before accessing buffer * Fix ena_probe destroy order * Validate completion descriptors consistency * Fix TX packets missing completion counter **Minor Changes** * Compilation fixes for RHEL 9.0, 9.1 and SLES 15SP4 * PHC info dynamic allocation * Publish devlink reload for RHEL 9.0 and 9.1 * Add ENA Express documentation ## r2.8.2 release notes **Bug Fixes** * Fix devlink large LLQ config not fully applied Signed-off-by: Samuel Mendoza-Jonas --- drivers/amazon/net/ena/ena_com.c | 11 +-- drivers/amazon/net/ena/ena_com.h | 15 +++- drivers/amazon/net/ena/ena_devlink.c | 80 ++++++++++++++++- drivers/amazon/net/ena/ena_devlink.h | 11 ++- drivers/amazon/net/ena/ena_eth_com.c | 38 +++++--- drivers/amazon/net/ena/ena_eth_io_defs.h | 5 +- drivers/amazon/net/ena/ena_ethtool.c | 22 +++-- drivers/amazon/net/ena/ena_netdev.c | 106 +++++++++++++++++------ drivers/amazon/net/ena/ena_netdev.h | 19 ++-- drivers/amazon/net/ena/ena_phc.c | 101 ++++++++++++++------- drivers/amazon/net/ena/ena_phc.h | 19 ++-- drivers/amazon/net/ena/ena_regs_defs.h | 1 + drivers/amazon/net/ena/ena_xdp.h | 1 + drivers/amazon/net/ena/kcompat.h | 17 +++- 14 files changed, 335 insertions(+), 111 deletions(-) diff
--git a/drivers/amazon/net/ena/ena_com.c b/drivers/amazon/net/ena/ena_com.c index 9bd064ff0f6c7..889d3412a72df 100644 --- a/drivers/amazon/net/ena/ena_com.c +++ b/drivers/amazon/net/ena/ena_com.c @@ -1823,8 +1823,8 @@ int ena_com_phc_config(struct ena_com_dev *ena_dev) return ret; } - phc->enabled = true; - netdev_dbg(ena_dev->net_device, "PHC is enabled\n"); + phc->active = true; + netdev_dbg(ena_dev->net_device, "PHC is active in the device\n"); return ret; } @@ -1833,7 +1833,7 @@ void ena_com_phc_destroy(struct ena_com_dev *ena_dev) { struct ena_com_phc_info *phc = &ena_dev->phc; - phc->enabled = false; + phc->active = false; /* In case PHC is not supported by the device, silently exiting */ if (!phc->virt_addr) @@ -1855,8 +1855,9 @@ int ena_com_phc_get(struct ena_com_dev *ena_dev, u64 *timestamp) ktime_t block_time; int ret = 0; - if (!phc->enabled) { - netdev_err(ena_dev->net_device, "PHC feature is not enabled\n"); + if (!phc->active) { + netdev_err(ena_dev->net_device, + "PHC feature is not active in the device\n"); return -EOPNOTSUPP; } diff --git a/drivers/amazon/net/ena/ena_com.h b/drivers/amazon/net/ena/ena_com.h index 3fd86b6f14e6b..f44e59176e459 100644 --- a/drivers/amazon/net/ena/ena_com.h +++ b/drivers/amazon/net/ena/ena_com.h @@ -49,7 +49,7 @@ /* ENA adaptive interrupt moderation settings */ #define ENA_INTR_INITIAL_TX_INTERVAL_USECS 64 -#define ENA_INTR_INITIAL_RX_INTERVAL_USECS 0 +#define ENA_INTR_INITIAL_RX_INTERVAL_USECS 20 #define ENA_DEFAULT_INTR_DELAY_RESOLUTION 1 #define ENA_HASH_KEY_SIZE 40 @@ -302,8 +302,8 @@ struct ena_com_phc_info { /* Request id sent to the device */ u16 req_id; - /* True if PHC is enabled */ - bool enabled; + /* True if PHC is active in the device */ + bool active; /* PHC shared memory - memory handle */ @@ -1146,13 +1146,16 @@ static inline int ena_com_get_customer_metric_count(struct ena_com_dev *ena_dev) * @rx_delay_interval: Rx interval in usecs * @tx_delay_interval: Tx interval in usecs * @unmask: unmask enable/disable + * @no_moderation_update: 0 - Indicates that any of the TX/RX intervals was + * updated, 1 - otherwise * * Prepare interrupt update register with the supplied parameters. */ static inline void ena_com_update_intr_reg(struct ena_eth_io_intr_reg *intr_reg, u32 rx_delay_interval, u32 tx_delay_interval, - bool unmask) + bool unmask, + bool no_moderation_update) { intr_reg->intr_control = 0; intr_reg->intr_control |= rx_delay_interval & @@ -1164,6 +1167,10 @@ static inline void ena_com_update_intr_reg(struct ena_eth_io_intr_reg *intr_reg, if (unmask) intr_reg->intr_control |= ENA_ETH_IO_INTR_REG_INTR_UNMASK_MASK; + + intr_reg->intr_control |= + (((u32)no_moderation_update) << ENA_ETH_IO_INTR_REG_NO_MODERATION_UPDATE_SHIFT) & + ENA_ETH_IO_INTR_REG_NO_MODERATION_UPDATE_MASK; } static inline u8 *ena_com_get_next_bounce_buffer(struct ena_com_io_bounce_buffer_control *bounce_buf_ctrl) diff --git a/drivers/amazon/net/ena/ena_devlink.c b/drivers/amazon/net/ena/ena_devlink.c index fce8d6c795a8b..f140d024ef166 100644 --- a/drivers/amazon/net/ena/ena_devlink.c +++ b/drivers/amazon/net/ena/ena_devlink.c @@ -1,12 +1,19 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* - * Copyright 2015-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2015-2023 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #include "linux/pci.h" #include "ena_devlink.h" #ifdef ENA_DEVLINK_SUPPORT +#ifdef ENA_PHC_SUPPORT +#include "ena_phc.h" + +static int ena_devlink_phc_enable_validate(struct devlink *devlink, u32 id, + union devlink_param_value val, + struct netlink_ext_ack *extack); +#endif /* ENA_PHC_SUPPORT */ static int ena_devlink_llq_header_validate(struct devlink *devlink, u32 id, union devlink_param_value val, @@ -15,6 +22,9 @@ static int ena_devlink_llq_header_validate(struct devlink *devlink, u32 id, enum ena_devlink_param_id { ENA_DEVLINK_PARAM_ID_BASE = DEVLINK_PARAM_GENERIC_ID_MAX, ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE, +#ifdef ENA_PHC_SUPPORT + ENA_DEVLINK_PARAM_ID_PHC_ENABLE, +#endif /* ENA_PHC_SUPPORT */ }; static const struct devlink_param ena_devlink_params[] = { @@ -22,6 +32,12 @@ static const struct devlink_param ena_devlink_params[] = { "large_llq_header", DEVLINK_PARAM_TYPE_BOOL, BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), NULL, NULL, ena_devlink_llq_header_validate), +#ifdef ENA_PHC_SUPPORT + DEVLINK_PARAM_DRIVER(ENA_DEVLINK_PARAM_ID_PHC_ENABLE, + "phc_enable", DEVLINK_PARAM_TYPE_BOOL, + BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), + NULL, NULL, ena_devlink_phc_enable_validate), + #endif /* ENA_PHC_SUPPORT */ }; static int ena_devlink_llq_header_validate(struct devlink *devlink, u32 id, @@ -47,6 +63,25 @@ static int ena_devlink_llq_header_validate(struct devlink *devlink, u32 id, return 0; } +#ifdef ENA_PHC_SUPPORT +static int ena_devlink_phc_enable_validate(struct devlink *devlink, u32 id, + union devlink_param_value val, + struct netlink_ext_ack *extack) +{ + struct ena_adapter *adapter = ENA_DEVLINK_PRIV(devlink); + + if (!val.vbool) + return 0; + + if (!ena_com_phc_supported(adapter->ena_dev)) { + NL_SET_ERR_MSG_MOD(extack, "Device doesn't support PHC"); + return -EOPNOTSUPP; + } + + return 0; +} + +#endif /* ENA_PHC_SUPPORT */ #ifdef ENA_DEVLINK_CONFIGURE_AFTER_REGISTER /* Determines if ena_devlink_register has been called. * Prefer to check if the driver enabled reloading capabilities, but fallback @@ -82,6 +117,16 @@ void ena_devlink_params_get(struct devlink *devlink) } adapter->large_llq_header_enabled = val.vbool; +#ifdef ENA_PHC_SUPPORT + + err = devlink_param_driverinit_value_get(devlink, ENA_DEVLINK_PARAM_ID_PHC_ENABLE, &val); + if (err) { + netdev_err(adapter->netdev, "Failed to query PHC param\n"); + return; + } + + ena_phc_enable(adapter, val.vbool); +#endif /* ENA_PHC_SUPPORT */ } void ena_devlink_disable_large_llq_header_param(struct devlink *devlink) @@ -100,6 +145,22 @@ void ena_devlink_disable_large_llq_header_param(struct devlink *devlink) value); } +#ifdef ENA_PHC_SUPPORT +void ena_devlink_disable_phc_param(struct devlink *devlink) +{ + union devlink_param_value value; + +#ifdef ENA_DEVLINK_CONFIGURE_AFTER_REGISTER + /* If devlink params aren't registered, don't access them */ + if (!ena_is_devlink_params_registered(devlink)) + return; + +#endif + value.vbool = false; + devlink_param_driverinit_value_set(devlink, ENA_DEVLINK_PARAM_ID_PHC_ENABLE, value); +} + +#endif /* ENA_PHC_SUPPORT */ static int ena_devlink_reload_down(struct devlink *devlink, #ifdef ENA_DEVLINK_RELOAD_NS_CHANGE_SUPPORT bool netns_change, @@ -164,11 +225,11 @@ static int ena_devlink_reload_up(struct devlink *devlink, rtnl_lock(); /* Check that no other routine initialized the device (e.g. * ena_fw_reset_device()). Also we're under devlink_mutex here, - * so devink (and ena_adapter with it) isn't freed under our - * feet. + * so devlink isn't freed under our feet. 
*/ if (!test_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags)) err = ena_restore_device(adapter); + rtnl_unlock(); #ifdef ENA_DEVLINK_RELOAD_LIMIT_AND_ACTION_SUPPORT @@ -220,9 +281,18 @@ static int ena_devlink_configure_params(struct devlink *devlink) ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE, value); +#ifdef ENA_PHC_SUPPORT + value.vbool = ena_phc_is_enabled(adapter); + devlink_param_driverinit_value_set(devlink, ENA_DEVLINK_PARAM_ID_PHC_ENABLE, value); + +#endif /* ENA_PHC_SUPPORT */ #ifdef ENA_DEVLINK_RELOAD_SUPPORT_ADVERTISEMENT_NEEDED devlink_set_features(devlink, DEVLINK_F_RELOAD); +#endif +#ifdef ENA_DEVLINK_PUBLISH_REQUIRED + devlink_params_publish(devlink); + #endif #ifdef ENA_DEVLINK_RELOAD_ENABLING_REQUIRED devlink_reload_enable(devlink); @@ -270,6 +340,10 @@ static void ena_devlink_configure_params_clean(struct devlink *devlink) #ifdef ENA_DEVLINK_RELOAD_ENABLING_REQUIRED devlink_reload_disable(devlink); +#endif +#ifdef ENA_DEVLINK_PUBLISH_REQUIRED + devlink_params_unpublish(devlink); + #endif devlink_params_unregister(devlink, ena_devlink_params, ARRAY_SIZE(ena_devlink_params)); diff --git a/drivers/amazon/net/ena/ena_devlink.h b/drivers/amazon/net/ena/ena_devlink.h index 8a047654b2f52..85c05cba00bd1 100644 --- a/drivers/amazon/net/ena/ena_devlink.h +++ b/drivers/amazon/net/ena/ena_devlink.h @@ -1,6 +1,6 @@ -// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ /* - * Copyright 2015-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2015-2023 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef DEVLINK_H @@ -12,9 +12,8 @@ #endif #ifdef ENA_DEVLINK_SUPPORT - #define ENA_DEVLINK_PRIV(devlink) \ - (*(struct ena_adapter **) devlink_priv(devlink)) + (*(struct ena_adapter **)devlink_priv(devlink)) struct devlink *ena_devlink_alloc(struct ena_adapter *adapter); void ena_devlink_free(struct devlink *devlink); @@ -22,9 +21,9 @@ void ena_devlink_register(struct devlink *devlink, struct device *dev); void ena_devlink_unregister(struct devlink *devlink); void ena_devlink_params_get(struct devlink *devlink); void ena_devlink_disable_large_llq_header_param(struct devlink *devlink); +void ena_devlink_disable_phc_param(struct devlink *devlink); #else /* ENA_DEVLINK_SUPPORT */ - #ifdef ENA_NO_DEVLINK_HEADERS struct devlink {}; #endif @@ -39,7 +38,7 @@ static inline void ena_devlink_register(struct devlink *devlink, struct device * static inline void ena_devlink_unregister(struct devlink *devlink) { } static inline void ena_devlink_params_get(struct devlink *devlink) { } static inline void ena_devlink_disable_large_llq_header_param(struct devlink *devlink) { } +static inline void ena_devlink_disable_phc_param(struct devlink *devlink) { } #endif /* ENA_DEVLINK_SUPPORT */ - #endif /* DEVLINK_H */ diff --git a/drivers/amazon/net/ena/ena_eth_com.c b/drivers/amazon/net/ena/ena_eth_com.c index f9f886289b970..50afe66efb57a 100644 --- a/drivers/amazon/net/ena/ena_eth_com.c +++ b/drivers/amazon/net/ena/ena_eth_com.c @@ -233,31 +233,43 @@ static struct ena_eth_io_rx_cdesc_base * idx * io_cq->cdesc_entry_size_in_bytes); } -static u16 ena_com_cdesc_rx_pkt_get(struct ena_com_io_cq *io_cq, - u16 *first_cdesc_idx) +static int ena_com_cdesc_rx_pkt_get(struct ena_com_io_cq *io_cq, + u16 *first_cdesc_idx, + u16 *num_descs) { + u16 count = io_cq->cur_rx_pkt_cdesc_count, head_masked; struct ena_eth_io_rx_cdesc_base *cdesc; - u16 count = 0, head_masked; u32 last = 0; do { + u32 status; + cdesc = ena_com_get_next_rx_cdesc(io_cq); 
if (!cdesc) break; + status = READ_ONCE(cdesc->status); ena_com_cq_inc_head(io_cq); + if (unlikely((status & ENA_ETH_IO_RX_CDESC_BASE_FIRST_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_FIRST_SHIFT && count != 0)) { + struct ena_com_dev *dev = ena_com_io_cq_to_ena_dev(io_cq); + + netdev_err(dev->net_device, + "First bit is on in descriptor #%d on q_id: %d, req_id: %u\n", + count, io_cq->qid, cdesc->req_id); + return -EFAULT; + } count++; - last = (READ_ONCE(cdesc->status) & - ENA_ETH_IO_RX_CDESC_BASE_LAST_MASK) >> - ENA_ETH_IO_RX_CDESC_BASE_LAST_SHIFT; + last = (status & ENA_ETH_IO_RX_CDESC_BASE_LAST_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_LAST_SHIFT; } while (!last); if (last) { *first_cdesc_idx = io_cq->cur_rx_pkt_cdesc_start_idx; - count += io_cq->cur_rx_pkt_cdesc_count; head_masked = io_cq->head & (io_cq->q_depth - 1); + *num_descs = count; io_cq->cur_rx_pkt_cdesc_count = 0; io_cq->cur_rx_pkt_cdesc_start_idx = head_masked; @@ -265,11 +277,11 @@ static u16 ena_com_cdesc_rx_pkt_get(struct ena_com_io_cq *io_cq, "ENA q_id: %d packets were completed. first desc idx %u descs# %d\n", io_cq->qid, *first_cdesc_idx, count); } else { - io_cq->cur_rx_pkt_cdesc_count += count; - count = 0; + io_cq->cur_rx_pkt_cdesc_count = count; + *num_descs = 0; } - return count; + return 0; } static int ena_com_create_meta(struct ena_com_io_sq *io_sq, @@ -546,10 +558,14 @@ int ena_com_rx_pkt(struct ena_com_io_cq *io_cq, u16 cdesc_idx = 0; u16 nb_hw_desc; u16 i = 0; + int rc; WARN(io_cq->direction != ENA_COM_IO_QUEUE_DIRECTION_RX, "wrong Q type"); - nb_hw_desc = ena_com_cdesc_rx_pkt_get(io_cq, &cdesc_idx); + rc = ena_com_cdesc_rx_pkt_get(io_cq, &cdesc_idx, &nb_hw_desc); + if (unlikely(rc != 0)) + return -EFAULT; + if (nb_hw_desc == 0) { ena_rx_ctx->descs = nb_hw_desc; return 0; diff --git a/drivers/amazon/net/ena/ena_eth_io_defs.h b/drivers/amazon/net/ena/ena_eth_io_defs.h index 332ac0d28ac7a..a4d6d0ee0193c 100644 --- a/drivers/amazon/net/ena/ena_eth_io_defs.h +++ b/drivers/amazon/net/ena/ena_eth_io_defs.h @@ -261,7 +261,8 @@ struct ena_eth_io_intr_reg { /* 14:0 : rx_intr_delay * 29:15 : tx_intr_delay * 30 : intr_unmask - * 31 : reserved + * 31 : no_moderation_update - 0 - moderation + * updated, 1 - moderation not updated */ u32 intr_control; }; @@ -381,6 +382,8 @@ struct ena_eth_io_numa_node_cfg_reg { #define ENA_ETH_IO_INTR_REG_TX_INTR_DELAY_MASK GENMASK(29, 15) #define ENA_ETH_IO_INTR_REG_INTR_UNMASK_SHIFT 30 #define ENA_ETH_IO_INTR_REG_INTR_UNMASK_MASK BIT(30) +#define ENA_ETH_IO_INTR_REG_NO_MODERATION_UPDATE_SHIFT 31 +#define ENA_ETH_IO_INTR_REG_NO_MODERATION_UPDATE_MASK BIT(31) /* numa_node_cfg_reg */ #define ENA_ETH_IO_NUMA_NODE_CFG_REG_NUMA_MASK GENMASK(7, 0) diff --git a/drivers/amazon/net/ena/ena_ethtool.c b/drivers/amazon/net/ena/ena_ethtool.c index 797ca14a28b3a..f09801591d840 100644 --- a/drivers/amazon/net/ena/ena_ethtool.c +++ b/drivers/amazon/net/ena/ena_ethtool.c @@ -342,9 +342,8 @@ static void ena_get_stats(struct ena_adapter *adapter, ena_queue_stats(adapter, &data); ena_com_admin_queue_stats(adapter, &data); - if (ena_phc_enabled(adapter)) { + if (ena_phc_is_active(adapter)) ena_com_phc_stats(adapter, &data); - } } static void ena_get_ethtool_stats(struct net_device *netdev, @@ -377,7 +376,7 @@ static int ena_get_sw_stats_count(struct ena_adapter *adapter) + adapter->xdp_num_queues * ENA_STATS_ARRAY_TX + ENA_STATS_ARRAY_GLOBAL + ENA_STATS_ARRAY_ENA_COM_ADMIN; - if (ena_phc_enabled(adapter)) + if (ena_phc_is_active(adapter)) count += ENA_STATS_ARRAY_ENA_COM_PHC; return count; @@ -515,9 +514,8 @@ static void 
ena_get_strings(struct ena_adapter *adapter, ena_queue_strings(adapter, &data); ena_com_admin_strings(&data); - if (ena_phc_enabled(adapter)) { + if (ena_phc_is_active(adapter)) ena_com_phc_strings(&data); - } } static void ena_get_ethtool_strings(struct net_device *netdev, @@ -637,8 +635,11 @@ static void ena_update_tx_rings_nonadaptive_intr_moderation(struct ena_adapter * val = ena_com_get_nonadaptive_moderation_interval_tx(adapter->ena_dev); - for (i = 0; i < adapter->num_io_queues; i++) - adapter->tx_ring[i].smoothed_interval = val; + for (i = 0; i < adapter->num_io_queues; i++) { + adapter->tx_ring[i].interrupt_interval_changed = + adapter->tx_ring[i].interrupt_interval != val; + adapter->tx_ring[i].interrupt_interval = val; + } } static void ena_update_rx_rings_nonadaptive_intr_moderation(struct ena_adapter *adapter) @@ -648,8 +649,11 @@ static void ena_update_rx_rings_nonadaptive_intr_moderation(struct ena_adapter * val = ena_com_get_nonadaptive_moderation_interval_rx(adapter->ena_dev); - for (i = 0; i < adapter->num_io_queues; i++) - adapter->rx_ring[i].smoothed_interval = val; + for (i = 0; i < adapter->num_io_queues; i++) { + adapter->rx_ring[i].interrupt_interval_changed = + adapter->rx_ring[i].interrupt_interval != val; + adapter->rx_ring[i].interrupt_interval = val; + } } static int ena_set_coalesce(struct net_device *net_dev, diff --git a/drivers/amazon/net/ena/ena_netdev.c b/drivers/amazon/net/ena/ena_netdev.c index 0595bb82a6eb6..5308f35e29f5a 100644 --- a/drivers/amazon/net/ena/ena_netdev.c +++ b/drivers/amazon/net/ena/ena_netdev.c @@ -75,6 +75,12 @@ static int lpc_size = ENA_LPC_DEFAULT_MULTIPLIER; module_param(lpc_size, uint, 0444); MODULE_PARM_DESC(lpc_size, "Each local page cache (lpc) holds N * 1024 pages. This parameter sets N which is rounded up to a multiplier of 2. If zero, the page cache is disabled. 
Max: 32\n"); +#ifdef ENA_PHC_SUPPORT +static int phc_enable = 0; +module_param(phc_enable, uint, 0444); +MODULE_PARM_DESC(phc_enable, "Enable PHC.\n"); + +#endif /* ENA_PHC_SUPPORT */ static struct ena_aenq_handlers aenq_handlers; static struct workqueue_struct *ena_wq; @@ -319,8 +325,10 @@ void ena_init_io_rings(struct ena_adapter *adapter, txr->tx_mem_queue_type = ena_dev->tx_mem_queue_type; txr->sgl_size = adapter->max_tx_sgl_size; txr->enable_bql = enable_bql; - txr->smoothed_interval = + txr->interrupt_interval = ena_com_get_nonadaptive_moderation_interval_tx(ena_dev); + /* Initial value, mark as true */ + txr->interrupt_interval_changed = true; txr->disable_meta_caching = adapter->disable_meta_caching; #ifdef ENA_XDP_SUPPORT spin_lock_init(&txr->xdp_tx_lock); @@ -335,8 +343,10 @@ void ena_init_io_rings(struct ena_adapter *adapter, rxr->ring_size = adapter->requested_rx_ring_size; rxr->rx_copybreak = adapter->rx_copybreak; rxr->sgl_size = adapter->max_rx_sgl_size; - rxr->smoothed_interval = + rxr->interrupt_interval = ena_com_get_nonadaptive_moderation_interval_rx(ena_dev); + /* Initial value, mark as true */ + rxr->interrupt_interval_changed = true; rxr->empty_rx_queue = 0; rxr->rx_headroom = NET_SKB_PAD; adapter->ena_napi[i].dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE; @@ -1103,6 +1113,7 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, struct ena_rx_buffer *rx_info; struct ena_adapter *adapter; int page_offset, pkt_offset; + dma_addr_t pre_reuse_paddr; u16 len, req_id, buf = 0; bool reuse_rx_buf_page; struct sk_buff *skb; @@ -1168,12 +1179,19 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, buf_len = SKB_DATA_ALIGN(len + buf_offset + tailroom); + pre_reuse_paddr = dma_unmap_addr(&rx_info->ena_buf, paddr); + /* If XDP isn't loaded try to reuse part of the RX buffer */ reuse_rx_buf_page = !is_xdp_loaded && ena_try_rx_buf_page_reuse(rx_info, buf_len, len, pkt_offset); if (!reuse_rx_buf_page) ena_unmap_rx_buff(rx_ring, rx_info); + else + dma_sync_single_for_cpu(rx_ring->dev, + pre_reuse_paddr + pkt_offset, + len, + DMA_FROM_DEVICE); skb = ena_alloc_skb(rx_ring, buf_addr, buf_len); if (unlikely(!skb)) @@ -1226,11 +1244,18 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, buf_len = SKB_DATA_ALIGN(len + buf_offset + tailroom); page_offset = rx_info->page_offset; + pre_reuse_paddr = dma_unmap_addr(&rx_info->ena_buf, paddr); + reuse_rx_buf_page = !is_xdp_loaded && ena_try_rx_buf_page_reuse(rx_info, buf_len, len, pkt_offset); if (!reuse_rx_buf_page) ena_unmap_rx_buff(rx_ring, rx_info); + else + dma_sync_single_for_cpu(rx_ring->dev, + pre_reuse_paddr + pkt_offset, + len, + DMA_FROM_DEVICE); skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_info->page, page_offset + buf_offset, len, buf_len); @@ -1528,6 +1553,8 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, ena_increase_stat(&rx_ring->rx_stats.bad_desc_num, 1, &rx_ring->syncp); ena_reset_device(adapter, ENA_REGS_RESET_TOO_MANY_RX_DESCS); + } else if (rc == -EFAULT) { + ena_reset_device(adapter, ENA_REGS_RESET_RX_DESCRIPTOR_MALFORMED); } else { ena_increase_stat(&rx_ring->rx_stats.bad_req_id, 1, &rx_ring->syncp); @@ -1543,7 +1570,10 @@ static void ena_dim_work(struct work_struct *w) net_dim_get_rx_moderation(dim->mode, dim->profile_ix); struct ena_napi *ena_napi = container_of(dim, struct ena_napi, dim); - ena_napi->rx_ring->smoothed_interval = cur_moder.usec; + ena_napi->rx_ring->interrupt_interval = cur_moder.usec; + /* DIM will schedule the work in case there was a 
change in the profile. */ + ena_napi->rx_ring->interrupt_interval_changed = true; + dim->state = DIM_START_MEASURE; } @@ -1570,27 +1600,33 @@ static void ena_adjust_adaptive_rx_intr_moderation(struct ena_napi *ena_napi) void ena_unmask_interrupt(struct ena_ring *tx_ring, struct ena_ring *rx_ring) { + u32 rx_interval = tx_ring->interrupt_interval; struct ena_eth_io_intr_reg intr_reg; -#ifdef ENA_XDP_SUPPORT - u32 rx_interval = tx_ring->smoothed_interval; -#else - u32 rx_interval = 0; -#endif + bool no_moderation_update = true; + /* Rx ring can be NULL when for XDP tx queues which don't have an * accompanying rx_ring pair. */ - if (rx_ring) + if (rx_ring) { rx_interval = ena_com_get_adaptive_moderation_enabled(rx_ring->ena_dev) ? - rx_ring->smoothed_interval : + rx_ring->interrupt_interval : ena_com_get_nonadaptive_moderation_interval_rx(rx_ring->ena_dev); + no_moderation_update &= !rx_ring->interrupt_interval_changed; + rx_ring->interrupt_interval_changed = false; + } + + no_moderation_update &= !tx_ring->interrupt_interval_changed; + tx_ring->interrupt_interval_changed = false; + /* Update intr register: rx intr delay, * tx intr delay and interrupt unmask */ ena_com_update_intr_reg(&intr_reg, rx_interval, - tx_ring->smoothed_interval, - true); + tx_ring->interrupt_interval, + true, + no_moderation_update); ena_increase_stat(&tx_ring->tx_stats.unmask_interrupt, 1, &tx_ring->syncp); @@ -2439,14 +2475,12 @@ int ena_up(struct ena_adapter *adapter) */ ena_init_napi_in_range(adapter, 0, io_queue_count); -#ifdef CONFIG_ARM64 - /* enable DIM by default on ARM machines, also needs to happen - * before enabling IRQs since DIM is ran from napi routine + /* Enabling DIM needs to happen before enabling IRQs since DIM + * is run from napi routine */ if (ena_com_interrupt_moderation_supported(adapter->ena_dev)) ena_com_enable_adaptive_moderation(adapter->ena_dev); -#endif rc = ena_request_io_irq(adapter); if (rc) goto err_req_irq; @@ -3407,7 +3441,7 @@ static void set_default_llq_configurations(struct ena_adapter *adapter, ENA_ADMIN_LIST_ENTRY_SIZE_256B); if ((llq->entry_size_ctrl_supported & ENA_ADMIN_LIST_ENTRY_SIZE_256B) && - adapter->large_llq_header_enabled) { + adapter->large_llq_header_enabled) { llq_config->llq_ring_entry_size = ENA_ADMIN_LIST_ENTRY_SIZE_256B; llq_config->llq_ring_entry_size_value = 256; } else { @@ -3717,6 +3751,7 @@ int ena_restore_device(struct ena_adapter *adapter) for (i = 0 ; i < count; i++) { txr = &adapter->tx_ring[i]; txr->tx_mem_queue_type = ena_dev->tx_mem_queue_type; + txr->tx_max_header_size = ena_dev->tx_max_header_size; } rc = ena_device_validate_params(adapter, &get_feat_ctx); @@ -3880,8 +3915,6 @@ static int check_missing_comp_in_tx_queue(struct ena_adapter *adapter, reset_reason = ENA_REGS_RESET_SUSPECTED_POLL_STARVATION; } - missed_tx++; - if (tx_buf->print_once) continue; @@ -3889,6 +3922,7 @@ static int check_missing_comp_in_tx_queue(struct ena_adapter *adapter, "TX hasn't completed, qid %d, index %d. 
%u usecs from last napi execution, napi scheduled: %d\n", tx_ring->qid, i, time_since_last_napi, napi_scheduled); + missed_tx++; tx_buf->print_once = 1; } } @@ -4388,10 +4422,12 @@ static int ena_calc_io_queue_size(struct ena_adapter *adapter, if ((llq->entry_size_ctrl_supported & ENA_ADMIN_LIST_ENTRY_SIZE_256B) && (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV)) { max_tx_queue_size /= 2; - dev_info(&adapter->pdev->dev, "Forcing large headers and decreasing maximum TX queue size to %d\n", + dev_info(&adapter->pdev->dev, + "Forcing large headers and decreasing maximum TX queue size to %d\n", max_tx_queue_size); } else { - dev_err(&adapter->pdev->dev, "Forcing large headers failed: LLQ is disabled or device does not support large headers\n"); + dev_err(&adapter->pdev->dev, + "Forcing large headers failed: LLQ is disabled or device does not support large headers\n"); adapter->large_llq_header_enabled = false; ena_devlink_disable_large_llq_header_param(adapter->devlink); @@ -4514,12 +4550,22 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) pci_set_drvdata(pdev, adapter); + rc = ena_phc_alloc(adapter); + if (rc) { + netdev_err(netdev, "ena_phc_alloc failed\n"); + goto err_netdev_destroy; + } + adapter->large_llq_header_enabled = !!force_large_llq_header; +#ifdef ENA_PHC_SUPPORT + ena_phc_enable(adapter, !!phc_enable); + +#endif /* ENA_PHC_SUPPORT */ rc = ena_com_allocate_customer_metrics_buffer(ena_dev); if (rc) { netdev_err(netdev, "ena_com_allocate_customer_metrics_buffer failed\n"); - goto err_netdev_destroy; + goto err_free_phc; } devlink = ena_devlink_alloc(adapter); @@ -4684,10 +4730,12 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) ena_com_admin_destroy(ena_dev); err_devlink_destroy: ena_devlink_free(devlink); -err_netdev_destroy: - free_netdev(netdev); err_metrics_destroy: ena_com_delete_customer_metrics_buffer(ena_dev); +err_free_phc: + ena_phc_free(adapter); +err_netdev_destroy: + free_netdev(netdev); err_free_region: ena_release_bars(ena_dev, pdev); err_free_ena_dev: @@ -4756,6 +4804,8 @@ static void __ena_shutoff(struct pci_dev *pdev, bool shutdown) ena_com_delete_customer_metrics_buffer(ena_dev); + ena_phc_free(adapter); + ena_release_bars(ena_dev, pdev); pci_disable_device(pdev); @@ -4873,13 +4923,19 @@ static struct pci_driver ena_pci_driver = { static int __init ena_init(void) { + int ret; + ena_wq = create_singlethread_workqueue(DRV_MODULE_NAME); if (!ena_wq) { pr_err("Failed to create workqueue\n"); return -ENOMEM; } - return pci_register_driver(&ena_pci_driver); + ret = pci_register_driver(&ena_pci_driver); + if (ret) + destroy_workqueue(ena_wq); + + return ret; } static void __exit ena_cleanup(void) diff --git a/drivers/amazon/net/ena/ena_netdev.h b/drivers/amazon/net/ena/ena_netdev.h index 60409fa4a4b98..5098ac28966c5 100644 --- a/drivers/amazon/net/ena/ena_netdev.h +++ b/drivers/amazon/net/ena/ena_netdev.h @@ -29,7 +29,7 @@ #define DRV_MODULE_GEN_MAJOR 2 #define DRV_MODULE_GEN_MINOR 8 -#define DRV_MODULE_GEN_SUBMINOR 1 +#define DRV_MODULE_GEN_SUBMINOR 3 #define DRV_MODULE_NAME "ena" #ifndef DRV_MODULE_GENERATION @@ -125,8 +125,10 @@ struct ena_page_cache; +#ifdef ENA_PHC_SUPPORT struct ena_phc_info; +#endif struct ena_irq { irq_handler_t handler; void *data; @@ -320,8 +322,13 @@ struct ena_ring { enum ena_admin_placement_policy_type tx_mem_queue_type; struct ena_com_rx_buf_info ena_bufs[ENA_PKT_MAX_BUFS]; - u32 smoothed_interval; - u32 per_napi_packets; + u32 interrupt_interval; + /* Indicates 
whether interrupt interval has changed since previous set. + * This flag will be kept up, until cleared by the routine which updates + * the device with the modified interrupt interval value. + */ + bool interrupt_interval_changed; + u32 per_napi_packets; u16 non_empty_napi_events; struct u64_stats_sync syncp; union { @@ -421,6 +428,10 @@ struct ena_adapter { unsigned long missing_tx_completion_to; char name[ENA_NAME_MAX_LEN]; +#ifdef ENA_PHC_SUPPORT + + struct ena_phc_info *phc_info; +#endif unsigned long flags; /* TX */ @@ -459,8 +470,6 @@ struct ena_adapter { #endif u32 xdp_first_ring; u32 xdp_num_queues; - - struct ena_phc_info *phc_info; }; void ena_set_ethtool_ops(struct net_device *netdev); diff --git a/drivers/amazon/net/ena/ena_phc.c b/drivers/amazon/net/ena/ena_phc.c index 46e21d3202a1b..8b89ae9efb4ec 100644 --- a/drivers/amazon/net/ena/ena_phc.c +++ b/drivers/amazon/net/ena/ena_phc.c @@ -3,6 +3,7 @@ * Copyright 2015-2022 Amazon.com, Inc. or its affiliates. All rights reserved. */ +#include "ena_devlink.h" #include "ena_phc.h" #ifdef ENA_PHC_SUPPORT @@ -17,7 +18,8 @@ static int ena_phc_adjtime(struct ptp_clock_info *clock_info, s64 delta) return -EOPNOTSUPP; } -static int ena_phc_enable(struct ptp_clock_info *clock_info, struct ptp_clock_request *rq, int on) +static int ena_phc_feature_enable(struct ptp_clock_info *clock_info, struct ptp_clock_request *rq, + int on) { return -EOPNOTSUPP; } @@ -120,9 +122,38 @@ static struct ptp_clock_info ena_ptp_clock_info = { .gettime = ena_phc_gettime, .settime = ena_phc_settime, #endif /* ENA_PHC_SUPPORT_GETTIME64 */ - .enable = ena_phc_enable, + .enable = ena_phc_feature_enable, }; +/* Enable/Disable PHC by the kernel, affects on the next init flow */ +void ena_phc_enable(struct ena_adapter *adapter, bool enable) +{ + struct ena_phc_info *phc_info = adapter->phc_info; + + if (!phc_info) { + netdev_err(adapter->netdev, "phc_info is not allocated\n"); + return; + } + + phc_info->enabled = enable; +} + +/* Check if PHC is enabled by the kernel */ +bool ena_phc_is_enabled(struct ena_adapter *adapter) +{ + struct ena_phc_info *phc_info = adapter->phc_info; + + return (phc_info && phc_info->enabled); +} + +/* PHC is activated if ptp clock is registered in the kernel */ +bool ena_phc_is_active(struct ena_adapter *adapter) +{ + struct ena_phc_info *phc_info = adapter->phc_info; + + return (phc_info && phc_info->clock); +} + static int ena_phc_register(struct ena_adapter *adapter) { struct pci_dev *pdev = adapter->pdev; @@ -154,19 +185,34 @@ static int ena_phc_register(struct ena_adapter *adapter) return rc; } -bool ena_phc_enabled(struct ena_adapter *adapter) +static void ena_phc_unregister(struct ena_adapter *adapter) { struct ena_phc_info *phc_info = adapter->phc_info; - return (phc_info && phc_info->clock); + if (ena_phc_is_active(adapter)) { + ptp_clock_unregister(phc_info->clock); + phc_info->clock = NULL; + } } -static void ena_phc_unregister(struct ena_adapter *adapter) +int ena_phc_alloc(struct ena_adapter *adapter) { - struct ena_phc_info *phc_info = adapter->phc_info; + /* Allocate driver specific PHC info */ + adapter->phc_info = vzalloc(sizeof(*adapter->phc_info)); + if (unlikely(!adapter->phc_info)) { + netdev_err(adapter->netdev, "Failed to alloc phc_info\n"); + return -ENOMEM; + } - if (ena_phc_enabled(adapter)) - ptp_clock_unregister(phc_info->clock); + return 0; +} + +void ena_phc_free(struct ena_adapter *adapter) +{ + if (adapter->phc_info) { + vfree(adapter->phc_info); + adapter->phc_info = NULL; + } } int ena_phc_init(struct 
ena_adapter *adapter) @@ -175,13 +221,19 @@ int ena_phc_init(struct ena_adapter *adapter) struct net_device *netdev = adapter->netdev; int rc = -EOPNOTSUPP; - /* Validate phc feature is supported in the device */ + /* Validate PHC feature is supported in the device */ if (!ena_com_phc_supported(ena_dev)) { - netdev_dbg(netdev, "PHC feature is not supported\n"); + netdev_dbg(netdev, "PHC feature is not supported by the device\n"); + goto err_ena_com_phc_init; + } + + /* Validate PHC feature is enabled by the kernel */ + if (!ena_phc_is_enabled(adapter)) { + netdev_dbg(netdev, "PHC feature is not enabled by the kernel\n"); goto err_ena_com_phc_init; } - /* Allocate and initialize device specific PHC info */ + /* Initialize device specific PHC info */ rc = ena_com_phc_init(ena_dev); if (unlikely(rc)) { netdev_err(netdev, "Failed to init phc, error: %d\n", rc); @@ -195,50 +247,33 @@ int ena_phc_init(struct ena_adapter *adapter) goto err_ena_com_phc_config; } - /* Allocate and initialize driver specific PHC info */ - adapter->phc_info = vzalloc(sizeof(*adapter->phc_info)); - if (unlikely(!adapter->phc_info)) { - rc = -ENOMEM; - netdev_err(netdev, "Failed to alloc phc_info, error: %d\n", rc); - goto err_ena_com_phc_config; - } - /* Register to PTP class driver */ rc = ena_phc_register(adapter); if (unlikely(rc)) { netdev_err(netdev, "Failed to register phc, error: %d\n", rc); - goto err_ena_phc_register; + goto err_ena_com_phc_config; } return 0; -err_ena_phc_register: - vfree(adapter->phc_info); - adapter->phc_info = NULL; err_ena_com_phc_config: ena_com_phc_destroy(ena_dev); err_ena_com_phc_init: + ena_phc_enable(adapter, false); + ena_devlink_disable_phc_param(adapter->devlink); return rc; } void ena_phc_destroy(struct ena_adapter *adapter) { ena_phc_unregister(adapter); - - if (likely(adapter->phc_info)) { - vfree(adapter->phc_info); - adapter->phc_info = NULL; - } - ena_com_phc_destroy(adapter->ena_dev); } int ena_phc_get_index(struct ena_adapter *adapter) { - struct ena_phc_info *phc_info = adapter->phc_info; - - if (ena_phc_enabled(adapter)) - return ptp_clock_index(phc_info->clock); + if (ena_phc_is_active(adapter)) + return ptp_clock_index(adapter->phc_info->clock); return -1; } diff --git a/drivers/amazon/net/ena/ena_phc.h b/drivers/amazon/net/ena/ena_phc.h index f08ff473bd1e4..bb644d5f928fa 100644 --- a/drivers/amazon/net/ena/ena_phc.h +++ b/drivers/amazon/net/ena/ena_phc.h @@ -6,8 +6,6 @@ #ifndef ENA_PHC_H #define ENA_PHC_H -#include "ena_netdev.h" - #ifdef ENA_PHC_SUPPORT #include @@ -24,20 +22,29 @@ struct ena_phc_info { /* PHC lock */ spinlock_t lock; + + /* Enabled by kernel */ + bool enabled; }; -bool ena_phc_enabled(struct ena_adapter *adapter); +void ena_phc_enable(struct ena_adapter *adapter, bool enable); +bool ena_phc_is_enabled(struct ena_adapter *adapter); +bool ena_phc_is_active(struct ena_adapter *adapter); int ena_phc_get_index(struct ena_adapter *adapter); int ena_phc_init(struct ena_adapter *adapter); void ena_phc_destroy(struct ena_adapter *adapter); - +int ena_phc_alloc(struct ena_adapter *adapter); +void ena_phc_free(struct ena_adapter *adapter); #else /* ENA_PHC_SUPPORT */ -static inline bool ena_phc_enabled(struct ena_adapter *adapter) {return false; } +static inline void ena_phc_enable(struct ena_adapter *adapter, bool enable) { } +static inline bool ena_phc_is_enabled(struct ena_adapter *adapter) { return false; } +static inline bool ena_phc_is_active(struct ena_adapter *adapter) { return false; } static inline int ena_phc_get_index(struct ena_adapter 
*adapter) { return -1; } static inline int ena_phc_init(struct ena_adapter *adapter) { return 0; } static inline void ena_phc_destroy(struct ena_adapter *adapter) { } - +static inline int ena_phc_alloc(struct ena_adapter *adapter) { return 0; } +static inline void ena_phc_free(struct ena_adapter *adapter) { } #endif /* ENA_PHC_SUPPORT */ #endif /* ENA_PHC_H */ diff --git a/drivers/amazon/net/ena/ena_regs_defs.h b/drivers/amazon/net/ena/ena_regs_defs.h index ded18aa5162bc..bdbbc8b18df63 100644 --- a/drivers/amazon/net/ena/ena_regs_defs.h +++ b/drivers/amazon/net/ena/ena_regs_defs.h @@ -22,6 +22,7 @@ enum ena_regs_reset_reason_types { ENA_REGS_RESET_GENERIC = 13, ENA_REGS_RESET_MISS_INTERRUPT = 14, ENA_REGS_RESET_SUSPECTED_POLL_STARVATION = 15, + ENA_REGS_RESET_RX_DESCRIPTOR_MALFORMED = 16, ENA_REGS_RESET_LAST, }; diff --git a/drivers/amazon/net/ena/ena_xdp.h b/drivers/amazon/net/ena/ena_xdp.h index f6b60c0e5d7c6..dde8f9053f707 100644 --- a/drivers/amazon/net/ena/ena_xdp.h +++ b/drivers/amazon/net/ena/ena_xdp.h @@ -40,6 +40,7 @@ enum ENA_XDP_ACTIONS { ENA_XDP_REDIRECT = BIT(1), ENA_XDP_DROP = BIT(2) }; + #define ENA_XDP_FORWARDED (ENA_XDP_TX | ENA_XDP_REDIRECT) int ena_setup_and_create_all_xdp_queues(struct ena_adapter *adapter); diff --git a/drivers/amazon/net/ena/kcompat.h b/drivers/amazon/net/ena/kcompat.h index 8e7aab52fb507..fd44a3ebe0414 100644 --- a/drivers/amazon/net/ena/kcompat.h +++ b/drivers/amazon/net/ena/kcompat.h @@ -727,6 +727,14 @@ do { \ #define ENA_NO_DEVLINK_HEADERS #endif +#if defined(CONFIG_NET_DEVLINK) && \ + (KERNEL_VERSION(5, 1, 0) <= LINUX_VERSION_CODE && LINUX_VERSION_CODE < KERNEL_VERSION(5, 16, 0) && \ + !((SUSE_VERSION != 0) && (SUSE_VERSION == 15 && (SUSE_PATCHLEVEL < 2 || SUSE_PATCHLEVEL >= 4))) && \ + !(defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE > UBUNTU_VERSION(5, 16, 0, 0)) && \ + !(RHEL_RELEASE_CODE)) +#define ENA_DEVLINK_PUBLISH_REQUIRED +#endif + #if defined(CONFIG_NET_DEVLINK) && \ (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 4, 0) || \ (SUSE_VERSION && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 3, 18))) @@ -757,7 +765,8 @@ do { \ #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) || \ - (defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) + (defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) || \ + (defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0)) #define ENA_DEVLINK_RELOAD_SUPPORT_ADVERTISEMENT_NEEDED #endif @@ -945,13 +954,15 @@ static inline void eth_hw_addr_set(struct net_device *dev, const u8 *addr) #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) || \ (defined(RHEL_RELEASE_CODE) && \ RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 6) && \ - RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(9, 0)) || \ + RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(9, 0)) || \ (defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) #define ENA_EXTENDED_COALESCE_UAPI_WITH_CQE_SUPPORTED #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 17, 0) || \ - (defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE == RHEL_RELEASE_VERSION(8, 7)) + (defined(RHEL_RELEASE_CODE) && \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 7) && \ + RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(9, 0)) #define ENA_ETHTOOL_RX_BUFF_SIZE_CHANGE #endif From a28c176b3d3506068e5799d63a50e3c2d2ec9e38 Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Fri, 14 Apr 2023 19:43:26 +0000 Subject: [PATCH 045/175] Revert "selinux: runtime disable is deprecated, add some ssleep() discomfort" This reverts commit 
43b666622c60bc001f2f8a19f5f97946ff53a5cc. Let's keep selinux run-time disable behavior the same as we have in 5.15, since AL2 user-space expects to be able to disable selinux at run-time. Note that the change being reverted is only in effect when CONFIG_SECURITY_SELINUX_DISABLE=y, which is not the case for AL2023 but is the case for AL2. No discomfort for AL2 users ;) Signed-off-by: Luiz Capitulino --- security/selinux/selinuxfs.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index a00d191394365..3f0adb987c1ee 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -293,8 +293,6 @@ static ssize_t sel_write_disable(struct file *file, const char __user *buf, * kernel releases until eventually it is removed */ pr_err("SELinux: Runtime disable is deprecated, use selinux=0 on the kernel cmdline.\n"); - pr_err("SELinux: https://github.com/SELinuxProject/selinux-kernel/wiki/DEPRECATE-runtime-disable\n"); - ssleep(5); if (count >= PAGE_SIZE) return -ENOMEM; From e05c4aef776303557beb4c63a02645a31828b089 Mon Sep 17 00:00:00 2001 From: Arthur Kiyanovski Date: Wed, 17 May 2023 18:19:59 +0000 Subject: [PATCH 046/175] AL2023 6.1 Update ena driver to 2.8.6g Signed-off-by: Arthur Kiyanovski --- drivers/amazon/net/ena/ena_netdev.c | 50 +++++----- drivers/amazon/net/ena/ena_netdev.h | 2 +- drivers/amazon/net/ena/kcompat.h | 136 ++++++++++++++++++++++------ 3 files changed, 136 insertions(+), 52 deletions(-) diff --git a/drivers/amazon/net/ena/ena_netdev.c b/drivers/amazon/net/ena/ena_netdev.c index 5308f35e29f5a..072be72e14d56 100644 --- a/drivers/amazon/net/ena/ena_netdev.c +++ b/drivers/amazon/net/ena/ena_netdev.c @@ -693,16 +693,22 @@ static int ena_alloc_rx_buffer(struct ena_ring *rx_ring, return 0; } -static void ena_unmap_rx_buff(struct ena_ring *rx_ring, - struct ena_rx_buffer *rx_info) +static void ena_unmap_rx_buff_attrs(struct ena_ring *rx_ring, + struct ena_rx_buffer *rx_info, + unsigned long attrs) { /* LPC pages are unmapped at cache destruction */ if (rx_info->is_lpc_page) return; - dma_unmap_page(rx_ring->dev, rx_info->dma_addr, - ENA_PAGE_SIZE, - DMA_BIDIRECTIONAL); + ena_dma_unmap_page_attrs(rx_ring->dev, rx_info->dma_addr, ENA_PAGE_SIZE, + DMA_BIDIRECTIONAL, attrs); +} + +static void ena_unmap_rx_buff(struct ena_ring *rx_ring, + struct ena_rx_buffer *rx_info) +{ + ena_unmap_rx_buff_attrs(rx_ring, rx_info, 0); } static void ena_free_rx_page(struct ena_ring *rx_ring, @@ -880,7 +886,7 @@ static void ena_free_tx_bufs(struct ena_ring *tx_ring) ena_unmap_tx_buff(tx_ring, tx_info); - napi_consume_skb(tx_info->skb, 0); + dev_kfree_skb_any(tx_info->skb); } netdev_tx_reset_queue(netdev_get_tx_queue(tx_ring->netdev, tx_ring->qid)); @@ -1012,7 +1018,7 @@ static int ena_clean_tx_irq(struct ena_ring *tx_ring, u32 budget) skb); tx_bytes += tx_info->total_tx_size; - napi_consume_skb(skb, budget); + dev_kfree_skb(skb); tx_pkts++; total_done += tx_info->tx_descs; @@ -1063,7 +1069,7 @@ static struct sk_buff *ena_alloc_skb(struct ena_ring *rx_ring, void *first_frag, if (!first_frag) skb = napi_alloc_skb(rx_ring->napi, len); else - skb = ena_build_skb(first_frag, len); + skb = build_skb(first_frag, len); #else if (!first_frag) skb = napi_alloc_skb(rx_ring->napi, len); @@ -1185,13 +1191,14 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, reuse_rx_buf_page = !is_xdp_loaded && ena_try_rx_buf_page_reuse(rx_info, buf_len, len, pkt_offset); + dma_sync_single_for_cpu(rx_ring->dev, + pre_reuse_paddr + pkt_offset, + len, 
+ DMA_FROM_DEVICE); + if (!reuse_rx_buf_page) - ena_unmap_rx_buff(rx_ring, rx_info); - else - dma_sync_single_for_cpu(rx_ring->dev, - pre_reuse_paddr + pkt_offset, - len, - DMA_FROM_DEVICE); + ena_unmap_rx_buff_attrs(rx_ring, rx_info, ENA_DMA_ATTR_SKIP_CPU_SYNC); + skb = ena_alloc_skb(rx_ring, buf_addr, buf_len); if (unlikely(!skb)) @@ -1249,13 +1256,14 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, reuse_rx_buf_page = !is_xdp_loaded && ena_try_rx_buf_page_reuse(rx_info, buf_len, len, pkt_offset); + dma_sync_single_for_cpu(rx_ring->dev, + pre_reuse_paddr + pkt_offset, + len, + DMA_FROM_DEVICE); + if (!reuse_rx_buf_page) - ena_unmap_rx_buff(rx_ring, rx_info); - else - dma_sync_single_for_cpu(rx_ring->dev, - pre_reuse_paddr + pkt_offset, - len, - DMA_FROM_DEVICE); + ena_unmap_rx_buff_attrs(rx_ring, rx_info, ENA_DMA_ATTR_SKIP_CPU_SYNC); + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_info->page, page_offset + buf_offset, len, buf_len); @@ -3038,7 +3046,7 @@ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev) tx_info->skb = NULL; error_drop_packet: - napi_consume_skb(skb, 0); + dev_kfree_skb(skb); return NETDEV_TX_OK; } diff --git a/drivers/amazon/net/ena/ena_netdev.h b/drivers/amazon/net/ena/ena_netdev.h index 5098ac28966c5..92e03d79971f7 100644 --- a/drivers/amazon/net/ena/ena_netdev.h +++ b/drivers/amazon/net/ena/ena_netdev.h @@ -29,7 +29,7 @@ #define DRV_MODULE_GEN_MAJOR 2 #define DRV_MODULE_GEN_MINOR 8 -#define DRV_MODULE_GEN_SUBMINOR 3 +#define DRV_MODULE_GEN_SUBMINOR 6 #define DRV_MODULE_NAME "ena" #ifndef DRV_MODULE_GENERATION diff --git a/drivers/amazon/net/ena/kcompat.h b/drivers/amazon/net/ena/kcompat.h index fd44a3ebe0414..1b3e7edf570b0 100644 --- a/drivers/amazon/net/ena/kcompat.h +++ b/drivers/amazon/net/ena/kcompat.h @@ -107,6 +107,33 @@ Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) #define ENA_BUSY_POLL_SUPPORT #endif + +/* Distribution kernel version comparison macros. + * Distribution kernel versioning format may be A.B.C-D.E.F and standard + * KERNEL_VERSION macro covers only the first 3 subversions. + * Using 20bit per subversion, as in some cases, subversion D may be a large + * number (6 digits). 
+ */ +#define ENA_KERNEL_VERSION_16BIT(SV1, SV2, SV3) ((SV1 << 40) | (SV2 << 20) | (SV3)) +#define ENA_KERNEL_VERSION_MAJOR(SV1, SV2, SV3) ENA_KERNEL_VERSION_16BIT(SV1, SV2, SV3) +#define ENA_KERNEL_VERSION_MINOR(SV1, SV2, SV3) ENA_KERNEL_VERSION_16BIT(SV1, SV2, SV3) + +#define ENA_KERNEL_VERSION_GTE(SV1, SV2, SV3, SV4, SV5, SV6) \ + ((ENA_KERNEL_VERSION_MAJOR(ENA_KERNEL_SUBVERSION_1, ENA_KERNEL_SUBVERSION_2, ENA_KERNEL_SUBVERSION_3) > \ + ENA_KERNEL_VERSION_MAJOR((SV1), (SV2), (SV3))) || \ + (ENA_KERNEL_VERSION_MAJOR(ENA_KERNEL_SUBVERSION_1, ENA_KERNEL_SUBVERSION_2, ENA_KERNEL_SUBVERSION_3) == \ + ENA_KERNEL_VERSION_MAJOR((SV1), (SV2), (SV3)) && \ + ENA_KERNEL_VERSION_MINOR(ENA_KERNEL_SUBVERSION_4, ENA_KERNEL_SUBVERSION_5, ENA_KERNEL_SUBVERSION_6) >= \ + ENA_KERNEL_VERSION_MINOR((SV4), (SV5), (SV6)))) + +#define ENA_KERNEL_VERSION_LTE(SV1, SV2, SV3, SV4, SV5, SV6) \ + ((ENA_KERNEL_VERSION_MAJOR(ENA_KERNEL_SUBVERSION_1, ENA_KERNEL_SUBVERSION_2, ENA_KERNEL_SUBVERSION_3) < \ + ENA_KERNEL_VERSION_MAJOR((SV1), (SV2), (SV3))) || \ + (ENA_KERNEL_VERSION_MAJOR(ENA_KERNEL_SUBVERSION_1, ENA_KERNEL_SUBVERSION_2, ENA_KERNEL_SUBVERSION_3) == \ + ENA_KERNEL_VERSION_MAJOR((SV1), (SV2), (SV3)) && \ + ENA_KERNEL_VERSION_MINOR(ENA_KERNEL_SUBVERSION_4, ENA_KERNEL_SUBVERSION_5, ENA_KERNEL_SUBVERSION_6) <= \ + ENA_KERNEL_VERSION_MINOR((SV4), (SV5), (SV6)))) + /******************************************************************************/ /************************** Ubuntu macros *************************************/ /******************************************************************************/ @@ -177,7 +204,6 @@ Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 #define SUSE_VERSION 0 #endif /* SUSE_VERSION */ - /******************************************************************************/ /**************************** RHEL macros *************************************/ /******************************************************************************/ @@ -822,17 +848,32 @@ static inline void netdev_rss_key_fill(void *buffer, size_t len) #if ((LINUX_VERSION_CODE < KERNEL_VERSION(4, 6 ,0)) && \ !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,3))) -/* Linux versions 4.4.216 - 4.5 (non inclusive) back propagated page_ref_count - * function from kernel 4.6. To make things more difficult, Ubuntu didn't add - * these changes to its 4.4.* kernels +/* Linux versions 4.4.216 - 4.5 (non inclusive) back propagated + * page_ref_count() from kernel 4.6. + * Ubuntu didn't add these changes to its 4.4.* kernels. 
+ * UEK added this function in kernel 4.1.12-124.43.1 + * Here is a figure that shows all of the cases: + * Legend: + * -------- page_ref_count() is present in the kernel + * ******** page_ref_count() is missing in the kernel + * + * Distro\Kernel 4.1.12-124.43.1 4.4.216 4.5 4.6 + * | | | | + * Upstream kernel ***********|**************|--------|******| + * | | | | + * Ubuntu ***********|**************|********|******| + * | | | | + * UEK ***********|--------------|--------|------| */ -#if !(KERNEL_VERSION(4, 4 ,216) <= LINUX_VERSION_CODE && LINUX_VERSION_CODE < KERNEL_VERSION(4, 5 ,0)) ||\ - defined(UBUNTU_VERSION_CODE) +#if (defined(IS_UEK) && !ENA_KERNEL_VERSION_GTE(4, 1, 12, 124, 43, 1)) || \ + (defined(ubuntu)) || \ + (!defined(IS_UEK) && !defined(ubuntu) && \ + !(KERNEL_VERSION(4, 4, 216) <= LINUX_VERSION_CODE && LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0))) static inline int page_ref_count(struct page *page) { return atomic_read(&page->_count); } -#endif /* !(KERNEL_VERSION(4, 4 ,216) <= LINUX_VERSION_CODE && LINUX_VERSION_CODE < KERNEL_VERSION(4, 5 ,0)) */ +#endif /* (defined(IS_UEK) && !ENA_KERNEL_VERSION_GTE(4, 1, 12, 124, 43, 1)) ... */ static inline void page_ref_inc(struct page *page) { @@ -881,32 +922,16 @@ static inline int numa_mem_id(void) #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0) #define ENA_LINEAR_FRAG_SUPPORTED -static __always_inline struct sk_buff* -ena_build_skb(void *data, unsigned int frag_size) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 12, 0) - return napi_build_skb(data, frag_size); -#else - return build_skb(data, frag_size); -#endif -} -#endif - -#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 6, 0) && \ - !(defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 3)) && \ - !(defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(4, 2, 0, 42)) -static __always_inline -void napi_consume_skb(struct sk_buff *skb, int budget) -{ - dev_kfree_skb_any(skb); -} #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0) #define ENA_NETDEV_LOGS_WITHOUT_RV #endif -#if defined(ENA_XDP_SUPPORT) && LINUX_VERSION_CODE < KERNEL_VERSION(5, 12, 0) +#if defined(ENA_XDP_SUPPORT) && \ + (LINUX_VERSION_CODE < KERNEL_VERSION(5, 12, 0) && \ + !(defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL == 3) && \ + ENA_KERNEL_VERSION_GTE(5, 3, 18, 150300, 59, 49))) static __always_inline void xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) { @@ -928,7 +953,7 @@ xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start, xdp->data_meta = meta_valid ? data : data + 1; } -#endif /* defined(ENA_XDP_SUPPORT) && LINUX_VERSION_CODE <= KERNEL_VERSION(5, 12, 0) */ +#endif /* defined(ENA_XDP_SUPPORT) && (LINUX_VERSION_CODE <= KERNEL_VERSION(5, 12, 0) && !SUSE_VERSION(...)) */ #if LINUX_VERSION_CODE < KERNEL_VERSION(5, 13, 0) #define ethtool_sprintf(data, fmt, args...) 
\ @@ -944,7 +969,9 @@ xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start, #if LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0) && \ !(defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 6)) && \ - !(defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) + !(defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) && \ + !(defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL == 3) && \ + ENA_KERNEL_VERSION_GTE(5, 3, 18, 150300, 59, 43)) static inline void eth_hw_addr_set(struct net_device *dev, const u8 *addr) { memcpy(dev->dev_addr, addr, ETH_ALEN); @@ -1065,7 +1092,8 @@ static inline void ena_netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int)) { -#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0) +#if (LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0)) && \ + !(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 2))) #ifndef NAPI_POLL_WEIGHT #define NAPI_POLL_WEIGHT 64 #endif @@ -1075,4 +1103,52 @@ static inline void ena_netif_napi_add(struct net_device *dev, #endif /* LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0) */ } +#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7, 4))) || \ + (defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE < UBUNTU_VERSION(4, 5, 0, 0)) || \ + (defined(IS_UEK) && !ENA_KERNEL_VERSION_GTE(4, 1, 12, 105, 0, 0)) +static inline void dma_unmap_page_attrs(struct device *dev, + dma_addr_t addr, size_t size, + enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->unmap_page) + ops->unmap_page(dev, addr, size, dir, attrs); + debug_dma_unmap_page(dev, addr, size, dir, false); +} +#endif /* RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7, 4)) */ + +#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7, 9)) && \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 0)) && \ + (LINUX_VERSION_CODE != KERNEL_VERSION(4, 14, 0))) || \ + (defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE < UBUNTU_VERSION(4, 5, 0, 0)) || \ + (defined(IS_UEK) && LINUX_VERSION_CODE < KERNEL_VERSION(4, 1, 13)) +#define ENA_DMA_ATTR_SKIP_CPU_SYNC (1 << DMA_ATTR_SKIP_CPU_SYNC) +#elif (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(6, 10))) +#define ENA_DMA_ATTR_SKIP_CPU_SYNC 0 +#else +#define ENA_DMA_ATTR_SKIP_CPU_SYNC DMA_ATTR_SKIP_CPU_SYNC +#endif + +static inline void ena_dma_unmap_page_attrs(struct device *dev, + dma_addr_t addr, size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ +#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7, 9)) && \ + (LINUX_VERSION_CODE != KERNEL_VERSION(4, 14, 0))) || \ + (defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE < UBUNTU_VERSION(4, 5, 0, 0)) || \ + (defined(IS_UEK) && LINUX_VERSION_CODE < KERNEL_VERSION(4, 1, 13)) + struct dma_attrs dma_attrs; + + init_dma_attrs(&dma_attrs); + dma_attrs.flags[0] = attrs; + dma_unmap_page_attrs(dev, addr, size, dir, &dma_attrs); +#else + dma_unmap_page_attrs(dev, addr, size, dir, attrs); +#endif +} + #endif /* _KCOMPAT_H_ */ From 695212bdabbd2e81a0365a4b27bce17485c66db4 Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Tue, 14 Mar 2023 13:31:27 -0700 Subject: [PATCH 047/175] objtool: Add generic symbol for relocation type objtool uses the R_X86_64_* relocation types directly. Add generic names for them so that other architectures can use their own reloc enums. 
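For illustration, this is the kind of per-architecture mapping the generic names allow. A minimal sketch of an arch elf.h (the arm64 values shown are the ones the arm64 backend added later in this series defines; they are an example, not part of this patch):

  #define R_NONE  R_AARCH64_NONE
  #define R_ABS64 R_AARCH64_ABS64
  #define R_REL32 R_AARCH64_PREL32

With this in place, common code such as create_static_call_sections() can ask for R_REL32 without knowing which ELF relocation number that resolves to on the target architecture.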
Signed-off-by: Chen Zhongjin --- tools/objtool/arch/x86/include/arch/elf.h | 5 ++++- tools/objtool/arch/x86/special.c | 5 +++-- tools/objtool/check.c | 14 +++++++------- tools/objtool/orc_gen.c | 3 ++- 4 files changed, 16 insertions(+), 11 deletions(-) diff --git a/tools/objtool/arch/x86/include/arch/elf.h b/tools/objtool/arch/x86/include/arch/elf.h index 69cc4264b28a8..7b737fcfcb9c9 100644 --- a/tools/objtool/arch/x86/include/arch/elf.h +++ b/tools/objtool/arch/x86/include/arch/elf.h @@ -1,6 +1,9 @@ #ifndef _OBJTOOL_ARCH_ELF #define _OBJTOOL_ARCH_ELF -#define R_NONE R_X86_64_NONE +#define R_NONE R_X86_64_NONE +#define R_ABS64 R_X86_64_64 +#define R_REL32 R_X86_64_PC32 +#define R_PLT32 R_X86_64_PLT32 #endif /* _OBJTOOL_ARCH_ELF */ diff --git a/tools/objtool/arch/x86/special.c b/tools/objtool/arch/x86/special.c index 7c97b73912799..fbe0745a9ed74 100644 --- a/tools/objtool/arch/x86/special.c +++ b/tools/objtool/arch/x86/special.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include +#include #include #include @@ -108,7 +109,7 @@ struct reloc *arch_find_switch_table(struct objtool_file *file, table_offset = text_reloc->addend; table_sec = text_reloc->sym->sec; - if (text_reloc->type == R_X86_64_PC32) + if (text_reloc->type == R_REL32) table_offset += 4; /* @@ -138,7 +139,7 @@ struct reloc *arch_find_switch_table(struct objtool_file *file, * indicates a rare GCC quirk/bug which can leave dead * code behind. */ - if (text_reloc->type == R_X86_64_PC32) + if (text_reloc->type == R_REL32) file->ignore_unreachables = true; return rodata_reloc; diff --git a/tools/objtool/check.c b/tools/objtool/check.c index cb363b507a329..7c0341035ed81 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -655,7 +655,7 @@ static int create_static_call_sections(struct objtool_file *file) /* populate reloc for 'addr' */ if (elf_add_reloc_to_insn(file->elf, sec, idx * sizeof(struct static_call_site), - R_X86_64_PC32, + R_REL32, insn->sec, insn->offset)) return -1; @@ -698,7 +698,7 @@ static int create_static_call_sections(struct objtool_file *file) /* populate reloc for 'key' */ if (elf_add_reloc(file->elf, sec, idx * sizeof(struct static_call_site) + 4, - R_X86_64_PC32, key_sym, + R_REL32, key_sym, is_sibling_call(insn) * STATIC_CALL_SITE_TAIL)) return -1; @@ -742,7 +742,7 @@ static int create_retpoline_sites_sections(struct objtool_file *file) if (elf_add_reloc_to_insn(file->elf, sec, idx * sizeof(int), - R_X86_64_PC32, + R_REL32, insn->sec, insn->offset)) { WARN("elf_add_reloc_to_insn: .retpoline_sites"); return -1; @@ -788,7 +788,7 @@ static int create_return_sites_sections(struct objtool_file *file) if (elf_add_reloc_to_insn(file->elf, sec, idx * sizeof(int), - R_X86_64_PC32, + R_REL32, insn->sec, insn->offset)) { WARN("elf_add_reloc_to_insn: .return_sites"); return -1; @@ -840,7 +840,7 @@ static int create_ibt_endbr_seal_sections(struct objtool_file *file) if (elf_add_reloc_to_insn(file->elf, sec, idx * sizeof(int), - R_X86_64_PC32, + R_REL32, insn->sec, insn->offset)) { WARN("elf_add_reloc_to_insn: .ibt_endbr_seal"); return -1; @@ -885,7 +885,7 @@ static int create_mcount_loc_sections(struct objtool_file *file) if (elf_add_reloc_to_insn(file->elf, sec, idx * sizeof(unsigned long), - R_X86_64_64, + R_ABS64, insn->sec, insn->offset)) return -1; @@ -4051,7 +4051,7 @@ static int validate_ibt_insn(struct objtool_file *file, struct instruction *insn continue; off = reloc->sym->offset; - if (reloc->type == R_X86_64_PC32 || reloc->type == R_X86_64_PLT32) + if (reloc->type == R_REL32 || 
reloc->type == R_PLT32) off += arch_dest_reloc_offset(reloc->addend); else off += reloc->addend; diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c index dd3c64af9db23..ca7bba45a46e5 100644 --- a/tools/objtool/orc_gen.c +++ b/tools/objtool/orc_gen.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -101,7 +102,7 @@ static int write_orc_entry(struct elf *elf, struct section *orc_sec, orc->bp_offset = bswap_if_needed(orc->bp_offset); /* populate reloc for ip */ - if (elf_add_reloc_to_insn(elf, ip_sec, idx * sizeof(int), R_X86_64_PC32, + if (elf_add_reloc_to_insn(elf, ip_sec, idx * sizeof(int), R_REL32, insn_sec, insn_off)) return -1; From 1a21b2c977fe630d45bd20e6f71acb378692aba4 Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Tue, 14 Mar 2023 13:32:34 -0700 Subject: [PATCH 048/175] objtool: Specify host-arch for making LIBSUBCMD When cross-compiling objtool, the LIBSUBCMD make invocation uses the target arch, so the resulting objects fail to link with the other object files. Explicitly specify the host arch for LIBSUBCMD so it is cross-compiled correctly. Signed-off-by: Chen Zhongjin --- tools/objtool/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile index a3a9cc24e0e37..0cfab07328e8b 100644 --- a/tools/objtool/Makefile +++ b/tools/objtool/Makefile @@ -58,7 +58,7 @@ $(OBJTOOL): $(LIBSUBCMD) $(OBJTOOL_IN) $(LIBSUBCMD): fixdep FORCE - $(Q)$(MAKE) -C $(SUBCMD_SRCDIR) OUTPUT=$(LIBSUBCMD_OUTPUT) + $(Q)$(MAKE) -C $(SUBCMD_SRCDIR) OUTPUT=$(LIBSUBCMD_OUTPUT) AR=$(AR) CC=$(CC) LD=$(LD) clean: $(call QUIET_CLEAN, objtool) $(RM) $(OBJTOOL) From c76288a374b1c8fa41078fb386ff37a0530ec0a7 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Thu, 23 Jun 2022 09:48:45 +0800 Subject: [PATCH 049/175] tools: arm64: Make aarch64 instruction decoder available to tools Add the aarch64 encoder/decoder implementation under tools/. The insn.h/.c files are versions adapted for user-space tools. Some functions (mainly instruction generator code) and macros are deleted so that they don't depend on too many header files. Signed-off-by: Julien Thierry Signed-off-by: Chen Zhongjin --- tools/arch/arm64/include/asm/insn.h | 458 ++++++++++++++++++++++++++++ tools/arch/arm64/lib/insn.c | 335 ++++++++++++++++++++ 2 files changed, 793 insertions(+) create mode 100644 tools/arch/arm64/include/asm/insn.h create mode 100644 tools/arch/arm64/lib/insn.c diff --git a/tools/arch/arm64/include/asm/insn.h b/tools/arch/arm64/include/asm/insn.h new file mode 100644 index 0000000000000..8393456922147 --- /dev/null +++ b/tools/arch/arm64/include/asm/insn.h @@ -0,0 +1,458 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2013 Huawei Ltd. + * Author: Jiang Liu + * + * Copyright (C) 2014 Zi Shen Lim + */ +#ifndef __ASM_INSN_H +#define __ASM_INSN_H +#include +#include + +/* A64 instructions are always 32 bits. 
*/ +#define AARCH64_INSN_SIZE 4 + +#ifndef __ASSEMBLY__ +/* + * ARM Architecture Reference Manual for ARMv8 Profile-A, Issue A.a + * Section C3.1 "A64 instruction index by encoding": + * AArch64 main encoding table + * Bit position + * 28 27 26 25 Encoding Group + * 0 0 - - Unallocated + * 1 0 0 - Data processing, immediate + * 1 0 1 - Branch, exception generation and system instructions + * - 1 - 0 Loads and stores + * - 1 0 1 Data processing - register + * 0 1 1 1 Data processing - SIMD and floating point + * 1 1 1 1 Data processing - SIMD and floating point + * "-" means "don't care" + */ +enum aarch64_insn_encoding_class { + AARCH64_INSN_CLS_UNKNOWN, /* UNALLOCATED */ + AARCH64_INSN_CLS_SVE, /* SVE instructions */ + AARCH64_INSN_CLS_DP_IMM, /* Data processing - immediate */ + AARCH64_INSN_CLS_DP_REG, /* Data processing - register */ + AARCH64_INSN_CLS_DP_FPSIMD, /* Data processing - SIMD and FP */ + AARCH64_INSN_CLS_LDST, /* Loads and stores */ + AARCH64_INSN_CLS_BR_SYS, /* Branch, exception generation and + * system instructions */ +}; + +enum aarch64_insn_hint_cr_op { + AARCH64_INSN_HINT_NOP = 0x0 << 5, + AARCH64_INSN_HINT_YIELD = 0x1 << 5, + AARCH64_INSN_HINT_WFE = 0x2 << 5, + AARCH64_INSN_HINT_WFI = 0x3 << 5, + AARCH64_INSN_HINT_SEV = 0x4 << 5, + AARCH64_INSN_HINT_SEVL = 0x5 << 5, + + AARCH64_INSN_HINT_XPACLRI = 0x07 << 5, + AARCH64_INSN_HINT_PACIA_1716 = 0x08 << 5, + AARCH64_INSN_HINT_PACIB_1716 = 0x0A << 5, + AARCH64_INSN_HINT_AUTIA_1716 = 0x0C << 5, + AARCH64_INSN_HINT_AUTIB_1716 = 0x0E << 5, + AARCH64_INSN_HINT_PACIAZ = 0x18 << 5, + AARCH64_INSN_HINT_PACIASP = 0x19 << 5, + AARCH64_INSN_HINT_PACIBZ = 0x1A << 5, + AARCH64_INSN_HINT_PACIBSP = 0x1B << 5, + AARCH64_INSN_HINT_AUTIAZ = 0x1C << 5, + AARCH64_INSN_HINT_AUTIASP = 0x1D << 5, + AARCH64_INSN_HINT_AUTIBZ = 0x1E << 5, + AARCH64_INSN_HINT_AUTIBSP = 0x1F << 5, + + AARCH64_INSN_HINT_ESB = 0x10 << 5, + AARCH64_INSN_HINT_PSB = 0x11 << 5, + AARCH64_INSN_HINT_TSB = 0x12 << 5, + AARCH64_INSN_HINT_CSDB = 0x14 << 5, + + AARCH64_INSN_HINT_BTI = 0x20 << 5, + AARCH64_INSN_HINT_BTIC = 0x22 << 5, + AARCH64_INSN_HINT_BTIJ = 0x24 << 5, + AARCH64_INSN_HINT_BTIJC = 0x26 << 5, +}; + +enum aarch64_insn_imm_type { + AARCH64_INSN_IMM_ADR, + AARCH64_INSN_IMM_26, + AARCH64_INSN_IMM_19, + AARCH64_INSN_IMM_16, + AARCH64_INSN_IMM_14, + AARCH64_INSN_IMM_12, + AARCH64_INSN_IMM_9, + AARCH64_INSN_IMM_7, + AARCH64_INSN_IMM_6, + AARCH64_INSN_IMM_S, + AARCH64_INSN_IMM_R, + AARCH64_INSN_IMM_N, + AARCH64_INSN_IMM_MAX +}; + +enum aarch64_insn_register_type { + AARCH64_INSN_REGTYPE_RT, + AARCH64_INSN_REGTYPE_RN, + AARCH64_INSN_REGTYPE_RT2, + AARCH64_INSN_REGTYPE_RM, + AARCH64_INSN_REGTYPE_RD, + AARCH64_INSN_REGTYPE_RA, + AARCH64_INSN_REGTYPE_RS, +}; + +enum aarch64_insn_register { + AARCH64_INSN_REG_0 = 0, + AARCH64_INSN_REG_1 = 1, + AARCH64_INSN_REG_2 = 2, + AARCH64_INSN_REG_3 = 3, + AARCH64_INSN_REG_4 = 4, + AARCH64_INSN_REG_5 = 5, + AARCH64_INSN_REG_6 = 6, + AARCH64_INSN_REG_7 = 7, + AARCH64_INSN_REG_8 = 8, + AARCH64_INSN_REG_9 = 9, + AARCH64_INSN_REG_10 = 10, + AARCH64_INSN_REG_11 = 11, + AARCH64_INSN_REG_12 = 12, + AARCH64_INSN_REG_13 = 13, + AARCH64_INSN_REG_14 = 14, + AARCH64_INSN_REG_15 = 15, + AARCH64_INSN_REG_16 = 16, + AARCH64_INSN_REG_17 = 17, + AARCH64_INSN_REG_18 = 18, + AARCH64_INSN_REG_19 = 19, + AARCH64_INSN_REG_20 = 20, + AARCH64_INSN_REG_21 = 21, + AARCH64_INSN_REG_22 = 22, + AARCH64_INSN_REG_23 = 23, + AARCH64_INSN_REG_24 = 24, + AARCH64_INSN_REG_25 = 25, + AARCH64_INSN_REG_26 = 26, + AARCH64_INSN_REG_27 = 27, + AARCH64_INSN_REG_28 = 28, + 
AARCH64_INSN_REG_29 = 29, + AARCH64_INSN_REG_FP = 29, /* Frame pointer */ + AARCH64_INSN_REG_30 = 30, + AARCH64_INSN_REG_LR = 30, /* Link register */ + AARCH64_INSN_REG_ZR = 31, /* Zero: as source register */ + AARCH64_INSN_REG_SP = 31 /* Stack pointer: as load/store base reg */ +}; + +enum aarch64_insn_special_register { + AARCH64_INSN_SPCLREG_SPSR_EL1 = 0xC200, + AARCH64_INSN_SPCLREG_ELR_EL1 = 0xC201, + AARCH64_INSN_SPCLREG_SP_EL0 = 0xC208, + AARCH64_INSN_SPCLREG_SPSEL = 0xC210, + AARCH64_INSN_SPCLREG_CURRENTEL = 0xC212, + AARCH64_INSN_SPCLREG_DAIF = 0xDA11, + AARCH64_INSN_SPCLREG_NZCV = 0xDA10, + AARCH64_INSN_SPCLREG_FPCR = 0xDA20, + AARCH64_INSN_SPCLREG_DSPSR_EL0 = 0xDA28, + AARCH64_INSN_SPCLREG_DLR_EL0 = 0xDA29, + AARCH64_INSN_SPCLREG_SPSR_EL2 = 0xE200, + AARCH64_INSN_SPCLREG_ELR_EL2 = 0xE201, + AARCH64_INSN_SPCLREG_SP_EL1 = 0xE208, + AARCH64_INSN_SPCLREG_SPSR_INQ = 0xE218, + AARCH64_INSN_SPCLREG_SPSR_ABT = 0xE219, + AARCH64_INSN_SPCLREG_SPSR_UND = 0xE21A, + AARCH64_INSN_SPCLREG_SPSR_FIQ = 0xE21B, + AARCH64_INSN_SPCLREG_SPSR_EL3 = 0xF200, + AARCH64_INSN_SPCLREG_ELR_EL3 = 0xF201, + AARCH64_INSN_SPCLREG_SP_EL2 = 0xF210 +}; + +enum aarch64_insn_variant { + AARCH64_INSN_VARIANT_32BIT, + AARCH64_INSN_VARIANT_64BIT +}; + +enum aarch64_insn_condition { + AARCH64_INSN_COND_EQ = 0x0, /* == */ + AARCH64_INSN_COND_NE = 0x1, /* != */ + AARCH64_INSN_COND_CS = 0x2, /* unsigned >= */ + AARCH64_INSN_COND_CC = 0x3, /* unsigned < */ + AARCH64_INSN_COND_MI = 0x4, /* < 0 */ + AARCH64_INSN_COND_PL = 0x5, /* >= 0 */ + AARCH64_INSN_COND_VS = 0x6, /* overflow */ + AARCH64_INSN_COND_VC = 0x7, /* no overflow */ + AARCH64_INSN_COND_HI = 0x8, /* unsigned > */ + AARCH64_INSN_COND_LS = 0x9, /* unsigned <= */ + AARCH64_INSN_COND_GE = 0xa, /* signed >= */ + AARCH64_INSN_COND_LT = 0xb, /* signed < */ + AARCH64_INSN_COND_GT = 0xc, /* signed > */ + AARCH64_INSN_COND_LE = 0xd, /* signed <= */ + AARCH64_INSN_COND_AL = 0xe, /* always */ +}; + +enum aarch64_insn_branch_type { + AARCH64_INSN_BRANCH_NOLINK, + AARCH64_INSN_BRANCH_LINK, + AARCH64_INSN_BRANCH_RETURN, + AARCH64_INSN_BRANCH_COMP_ZERO, + AARCH64_INSN_BRANCH_COMP_NONZERO, +}; + +enum aarch64_insn_size_type { + AARCH64_INSN_SIZE_8, + AARCH64_INSN_SIZE_16, + AARCH64_INSN_SIZE_32, + AARCH64_INSN_SIZE_64, +}; + +enum aarch64_insn_ldst_type { + AARCH64_INSN_LDST_LOAD_REG_OFFSET, + AARCH64_INSN_LDST_STORE_REG_OFFSET, + AARCH64_INSN_LDST_LOAD_PAIR_PRE_INDEX, + AARCH64_INSN_LDST_STORE_PAIR_PRE_INDEX, + AARCH64_INSN_LDST_LOAD_PAIR_POST_INDEX, + AARCH64_INSN_LDST_STORE_PAIR_POST_INDEX, + AARCH64_INSN_LDST_LOAD_EX, + AARCH64_INSN_LDST_STORE_EX, +}; + +enum aarch64_insn_adsb_type { + AARCH64_INSN_ADSB_ADD, + AARCH64_INSN_ADSB_SUB, + AARCH64_INSN_ADSB_ADD_SETFLAGS, + AARCH64_INSN_ADSB_SUB_SETFLAGS +}; + +enum aarch64_insn_movewide_type { + AARCH64_INSN_MOVEWIDE_ZERO, + AARCH64_INSN_MOVEWIDE_KEEP, + AARCH64_INSN_MOVEWIDE_INVERSE +}; + +enum aarch64_insn_bitfield_type { + AARCH64_INSN_BITFIELD_MOVE, + AARCH64_INSN_BITFIELD_MOVE_UNSIGNED, + AARCH64_INSN_BITFIELD_MOVE_SIGNED +}; + +enum aarch64_insn_data1_type { + AARCH64_INSN_DATA1_REVERSE_16, + AARCH64_INSN_DATA1_REVERSE_32, + AARCH64_INSN_DATA1_REVERSE_64, +}; + +enum aarch64_insn_data2_type { + AARCH64_INSN_DATA2_UDIV, + AARCH64_INSN_DATA2_SDIV, + AARCH64_INSN_DATA2_LSLV, + AARCH64_INSN_DATA2_LSRV, + AARCH64_INSN_DATA2_ASRV, + AARCH64_INSN_DATA2_RORV, +}; + +enum aarch64_insn_data3_type { + AARCH64_INSN_DATA3_MADD, + AARCH64_INSN_DATA3_MSUB, +}; + +enum aarch64_insn_logic_type { + AARCH64_INSN_LOGIC_AND, + 
AARCH64_INSN_LOGIC_BIC, + AARCH64_INSN_LOGIC_ORR, + AARCH64_INSN_LOGIC_ORN, + AARCH64_INSN_LOGIC_EOR, + AARCH64_INSN_LOGIC_EON, + AARCH64_INSN_LOGIC_AND_SETFLAGS, + AARCH64_INSN_LOGIC_BIC_SETFLAGS +}; + +enum aarch64_insn_prfm_type { + AARCH64_INSN_PRFM_TYPE_PLD, + AARCH64_INSN_PRFM_TYPE_PLI, + AARCH64_INSN_PRFM_TYPE_PST, +}; + +enum aarch64_insn_prfm_target { + AARCH64_INSN_PRFM_TARGET_L1, + AARCH64_INSN_PRFM_TARGET_L2, + AARCH64_INSN_PRFM_TARGET_L3, +}; + +enum aarch64_insn_prfm_policy { + AARCH64_INSN_PRFM_POLICY_KEEP, + AARCH64_INSN_PRFM_POLICY_STRM, +}; + +enum aarch64_insn_adr_type { + AARCH64_INSN_ADR_TYPE_ADRP, + AARCH64_INSN_ADR_TYPE_ADR, +}; + +#define __AARCH64_INSN_FUNCS(abbr, mask, val) \ +static __always_inline bool aarch64_insn_is_##abbr(u32 code) \ +{ \ + BUILD_BUG_ON(~(mask) & (val)); \ + return (code & (mask)) == (val); \ +} \ +static __always_inline u32 aarch64_insn_get_##abbr##_value(void) \ +{ \ + return (val); \ +} + +__AARCH64_INSN_FUNCS(adr, 0x9F000000, 0x10000000) +__AARCH64_INSN_FUNCS(adrp, 0x9F000000, 0x90000000) +__AARCH64_INSN_FUNCS(prfm, 0x3FC00000, 0x39800000) +__AARCH64_INSN_FUNCS(prfm_lit, 0xFF000000, 0xD8000000) +__AARCH64_INSN_FUNCS(store_imm, 0x3FC00000, 0x39000000) +__AARCH64_INSN_FUNCS(load_imm, 0x3FC00000, 0x39400000) +__AARCH64_INSN_FUNCS(store_pre, 0x3FE00C00, 0x38000C00) +__AARCH64_INSN_FUNCS(load_pre, 0x3FE00C00, 0x38400C00) +__AARCH64_INSN_FUNCS(store_post, 0x3FE00C00, 0x38000400) +__AARCH64_INSN_FUNCS(load_post, 0x3FE00C00, 0x38400400) +__AARCH64_INSN_FUNCS(str_reg, 0x3FE0EC00, 0x38206800) +__AARCH64_INSN_FUNCS(ldadd, 0x3F20FC00, 0x38200000) +__AARCH64_INSN_FUNCS(ldr_reg, 0x3FE0EC00, 0x38606800) +__AARCH64_INSN_FUNCS(ldr_lit, 0xBF000000, 0x18000000) +__AARCH64_INSN_FUNCS(ldrsw_lit, 0xFF000000, 0x98000000) +__AARCH64_INSN_FUNCS(exclusive, 0x3F800000, 0x08000000) +__AARCH64_INSN_FUNCS(load_ex, 0x3F400000, 0x08400000) +__AARCH64_INSN_FUNCS(store_ex, 0x3F400000, 0x08000000) +__AARCH64_INSN_FUNCS(stp, 0x7FC00000, 0x29000000) +__AARCH64_INSN_FUNCS(ldp, 0x7FC00000, 0x29400000) +__AARCH64_INSN_FUNCS(stp_post, 0x7FC00000, 0x28800000) +__AARCH64_INSN_FUNCS(ldp_post, 0x7FC00000, 0x28C00000) +__AARCH64_INSN_FUNCS(stp_pre, 0x7FC00000, 0x29800000) +__AARCH64_INSN_FUNCS(ldp_pre, 0x7FC00000, 0x29C00000) +__AARCH64_INSN_FUNCS(add_imm, 0x7F000000, 0x11000000) +__AARCH64_INSN_FUNCS(adds_imm, 0x7F000000, 0x31000000) +__AARCH64_INSN_FUNCS(sub_imm, 0x7F000000, 0x51000000) +__AARCH64_INSN_FUNCS(subs_imm, 0x7F000000, 0x71000000) +__AARCH64_INSN_FUNCS(movn, 0x7F800000, 0x12800000) +__AARCH64_INSN_FUNCS(sbfm, 0x7F800000, 0x13000000) +__AARCH64_INSN_FUNCS(bfm, 0x7F800000, 0x33000000) +__AARCH64_INSN_FUNCS(movz, 0x7F800000, 0x52800000) +__AARCH64_INSN_FUNCS(ubfm, 0x7F800000, 0x53000000) +__AARCH64_INSN_FUNCS(movk, 0x7F800000, 0x72800000) +__AARCH64_INSN_FUNCS(add, 0x7F200000, 0x0B000000) +__AARCH64_INSN_FUNCS(adds, 0x7F200000, 0x2B000000) +__AARCH64_INSN_FUNCS(sub, 0x7F200000, 0x4B000000) +__AARCH64_INSN_FUNCS(subs, 0x7F200000, 0x6B000000) +__AARCH64_INSN_FUNCS(madd, 0x7FE08000, 0x1B000000) +__AARCH64_INSN_FUNCS(msub, 0x7FE08000, 0x1B008000) +__AARCH64_INSN_FUNCS(udiv, 0x7FE0FC00, 0x1AC00800) +__AARCH64_INSN_FUNCS(sdiv, 0x7FE0FC00, 0x1AC00C00) +__AARCH64_INSN_FUNCS(lslv, 0x7FE0FC00, 0x1AC02000) +__AARCH64_INSN_FUNCS(lsrv, 0x7FE0FC00, 0x1AC02400) +__AARCH64_INSN_FUNCS(asrv, 0x7FE0FC00, 0x1AC02800) +__AARCH64_INSN_FUNCS(rorv, 0x7FE0FC00, 0x1AC02C00) +__AARCH64_INSN_FUNCS(rev16, 0x7FFFFC00, 0x5AC00400) +__AARCH64_INSN_FUNCS(rev32, 0x7FFFFC00, 0x5AC00800) 
+__AARCH64_INSN_FUNCS(rev64, 0x7FFFFC00, 0x5AC00C00) +__AARCH64_INSN_FUNCS(and, 0x7F200000, 0x0A000000) +__AARCH64_INSN_FUNCS(bic, 0x7F200000, 0x0A200000) +__AARCH64_INSN_FUNCS(orr, 0x7F200000, 0x2A000000) +__AARCH64_INSN_FUNCS(mov_reg, 0x7FE0FFE0, 0x2A0003E0) +__AARCH64_INSN_FUNCS(orn, 0x7F200000, 0x2A200000) +__AARCH64_INSN_FUNCS(eor, 0x7F200000, 0x4A000000) +__AARCH64_INSN_FUNCS(eon, 0x7F200000, 0x4A200000) +__AARCH64_INSN_FUNCS(ands, 0x7F200000, 0x6A000000) +__AARCH64_INSN_FUNCS(bics, 0x7F200000, 0x6A200000) +__AARCH64_INSN_FUNCS(and_imm, 0x7F800000, 0x12000000) +__AARCH64_INSN_FUNCS(orr_imm, 0x7F800000, 0x32000000) +__AARCH64_INSN_FUNCS(eor_imm, 0x7F800000, 0x52000000) +__AARCH64_INSN_FUNCS(ands_imm, 0x7F800000, 0x72000000) +__AARCH64_INSN_FUNCS(extr, 0x7FA00000, 0x13800000) +__AARCH64_INSN_FUNCS(b, 0xFC000000, 0x14000000) +__AARCH64_INSN_FUNCS(bl, 0xFC000000, 0x94000000) +__AARCH64_INSN_FUNCS(cbz, 0x7F000000, 0x34000000) +__AARCH64_INSN_FUNCS(cbnz, 0x7F000000, 0x35000000) +__AARCH64_INSN_FUNCS(tbz, 0x7F000000, 0x36000000) +__AARCH64_INSN_FUNCS(tbnz, 0x7F000000, 0x37000000) +__AARCH64_INSN_FUNCS(bcond, 0xFF000010, 0x54000000) +__AARCH64_INSN_FUNCS(svc, 0xFFE0001F, 0xD4000001) +__AARCH64_INSN_FUNCS(hvc, 0xFFE0001F, 0xD4000002) +__AARCH64_INSN_FUNCS(smc, 0xFFE0001F, 0xD4000003) +__AARCH64_INSN_FUNCS(brk, 0xFFE0001F, 0xD4200000) +__AARCH64_INSN_FUNCS(exception, 0xFF000000, 0xD4000000) +__AARCH64_INSN_FUNCS(hint, 0xFFFFF01F, 0xD503201F) +__AARCH64_INSN_FUNCS(br, 0xFFFFFC1F, 0xD61F0000) +__AARCH64_INSN_FUNCS(br_auth, 0xFEFFF800, 0xD61F0800) +__AARCH64_INSN_FUNCS(blr, 0xFFFFFC1F, 0xD63F0000) +__AARCH64_INSN_FUNCS(blr_auth, 0xFEFFF800, 0xD63F0800) +__AARCH64_INSN_FUNCS(ret, 0xFFFFFC1F, 0xD65F0000) +__AARCH64_INSN_FUNCS(ret_auth, 0xFFFFFBFF, 0xD65F0BFF) +__AARCH64_INSN_FUNCS(eret, 0xFFFFFFFF, 0xD69F03E0) +__AARCH64_INSN_FUNCS(eret_auth, 0xFFFFFBFF, 0xD69F0BFF) +__AARCH64_INSN_FUNCS(mrs, 0xFFF00000, 0xD5300000) +__AARCH64_INSN_FUNCS(msr_imm, 0xFFF8F01F, 0xD500401F) +__AARCH64_INSN_FUNCS(msr_reg, 0xFFF00000, 0xD5100000) +__AARCH64_INSN_FUNCS(dmb, 0xFFFFF0FF, 0xD50330BF) +__AARCH64_INSN_FUNCS(dsb_base, 0xFFFFF0FF, 0xD503309F) +__AARCH64_INSN_FUNCS(dsb_nxs, 0xFFFFF3FF, 0xD503323F) +__AARCH64_INSN_FUNCS(isb, 0xFFFFF0FF, 0xD50330DF) +__AARCH64_INSN_FUNCS(sb, 0xFFFFFFFF, 0xD50330FF) +__AARCH64_INSN_FUNCS(clrex, 0xFFFFF0FF, 0xD503305F) +__AARCH64_INSN_FUNCS(ssbb, 0xFFFFFFFF, 0xD503309F) +__AARCH64_INSN_FUNCS(pssbb, 0xFFFFFFFF, 0xD503349F) + +#undef __AARCH64_INSN_FUNCS + +bool aarch64_insn_is_steppable_hint(u32 insn); +bool aarch64_insn_is_branch_imm(u32 insn); + +static inline bool aarch64_insn_is_adr_adrp(u32 insn) +{ + return aarch64_insn_is_adr(insn) || aarch64_insn_is_adrp(insn); +} + +static inline bool aarch64_insn_is_dsb(u32 insn) +{ + return (aarch64_insn_is_dsb_base(insn) && (insn & 0xb00)) || + aarch64_insn_is_dsb_nxs(insn); +} + +static inline bool aarch64_insn_is_barrier(u32 insn) +{ + return aarch64_insn_is_dmb(insn) || aarch64_insn_is_dsb(insn) || + aarch64_insn_is_isb(insn) || aarch64_insn_is_sb(insn) || + aarch64_insn_is_clrex(insn) || aarch64_insn_is_ssbb(insn) || + aarch64_insn_is_pssbb(insn); +} + +static inline bool aarch64_insn_is_store_single(u32 insn) +{ + return aarch64_insn_is_store_imm(insn) || + aarch64_insn_is_store_pre(insn) || + aarch64_insn_is_store_post(insn); +} + +static inline bool aarch64_insn_is_store_pair(u32 insn) +{ + return aarch64_insn_is_stp(insn) || + aarch64_insn_is_stp_pre(insn) || + aarch64_insn_is_stp_post(insn); +} + +static inline bool 
aarch64_insn_is_load_single(u32 insn) +{ + return aarch64_insn_is_load_imm(insn) || + aarch64_insn_is_load_pre(insn) || + aarch64_insn_is_load_post(insn); +} + +static inline bool aarch64_insn_is_load_pair(u32 insn) +{ + return aarch64_insn_is_ldp(insn) || + aarch64_insn_is_ldp_pre(insn) || + aarch64_insn_is_ldp_post(insn); +} + +enum aarch64_insn_encoding_class aarch64_get_insn_class(u32 insn); +bool aarch64_insn_uses_literal(u32 insn); +bool aarch64_insn_is_branch(u32 insn); +u64 aarch64_insn_decode_immediate(enum aarch64_insn_imm_type type, u32 insn); +u32 aarch64_insn_decode_register(enum aarch64_insn_register_type type, + u32 insn); +u32 aarch64_insn_gen_hint(enum aarch64_insn_hint_cr_op op); +u32 aarch64_insn_gen_nop(void); +u32 aarch64_insn_gen_branch_reg(enum aarch64_insn_register reg, + enum aarch64_insn_branch_type type); +s32 aarch64_get_branch_offset(u32 insn); +s32 aarch64_insn_adrp_get_offset(u32 insn); + +#endif /* __ASSEMBLY__ */ + +#endif /* __ASM_INSN_H */ diff --git a/tools/arch/arm64/lib/insn.c b/tools/arch/arm64/lib/insn.c new file mode 100644 index 0000000000000..b0cc984fcf6a8 --- /dev/null +++ b/tools/arch/arm64/lib/insn.c @@ -0,0 +1,335 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2013 Huawei Ltd. + * Author: Jiang Liu + * + * Copyright (C) 2014-2016 Zi Shen Lim + */ +#include +#include +#include + +#include +#include + +#define AARCH64_DECODE_FAULT 0xFFFFFFFF + +#define AARCH64_INSN_SF_BIT BIT(31) +#define AARCH64_INSN_N_BIT BIT(22) +#define AARCH64_INSN_LSL_12 BIT(22) + +static const int aarch64_insn_encoding_class[] = { + AARCH64_INSN_CLS_UNKNOWN, + AARCH64_INSN_CLS_UNKNOWN, + AARCH64_INSN_CLS_SVE, + AARCH64_INSN_CLS_UNKNOWN, + AARCH64_INSN_CLS_LDST, + AARCH64_INSN_CLS_DP_REG, + AARCH64_INSN_CLS_LDST, + AARCH64_INSN_CLS_DP_FPSIMD, + AARCH64_INSN_CLS_DP_IMM, + AARCH64_INSN_CLS_DP_IMM, + AARCH64_INSN_CLS_BR_SYS, + AARCH64_INSN_CLS_BR_SYS, + AARCH64_INSN_CLS_LDST, + AARCH64_INSN_CLS_DP_REG, + AARCH64_INSN_CLS_LDST, + AARCH64_INSN_CLS_DP_FPSIMD, +}; + +enum aarch64_insn_encoding_class aarch64_get_insn_class(u32 insn) +{ + return aarch64_insn_encoding_class[(insn >> 25) & 0xf]; +} + +bool aarch64_insn_is_steppable_hint(u32 insn) +{ + if (!aarch64_insn_is_hint(insn)) + return false; + + switch (insn & 0xFE0) { + case AARCH64_INSN_HINT_XPACLRI: + case AARCH64_INSN_HINT_PACIA_1716: + case AARCH64_INSN_HINT_PACIB_1716: + case AARCH64_INSN_HINT_PACIAZ: + case AARCH64_INSN_HINT_PACIASP: + case AARCH64_INSN_HINT_PACIBZ: + case AARCH64_INSN_HINT_PACIBSP: + case AARCH64_INSN_HINT_BTI: + case AARCH64_INSN_HINT_BTIC: + case AARCH64_INSN_HINT_BTIJ: + case AARCH64_INSN_HINT_BTIJC: + case AARCH64_INSN_HINT_NOP: + return true; + default: + return false; + } +} + +bool aarch64_insn_is_branch_imm(u32 insn) +{ + return (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn) || + aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn) || + aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) || + aarch64_insn_is_bcond(insn)); +} + +bool aarch64_insn_uses_literal(u32 insn) +{ + /* ldr/ldrsw (literal), prfm */ + + return aarch64_insn_is_ldr_lit(insn) || + aarch64_insn_is_ldrsw_lit(insn) || + aarch64_insn_is_adr_adrp(insn) || + aarch64_insn_is_prfm_lit(insn); +} + +bool aarch64_insn_is_branch(u32 insn) +{ + /* b, bl, cb*, tb*, ret*, b.cond, br*, blr* */ + + return aarch64_insn_is_b(insn) || + aarch64_insn_is_bl(insn) || + aarch64_insn_is_cbz(insn) || + aarch64_insn_is_cbnz(insn) || + aarch64_insn_is_tbz(insn) || + aarch64_insn_is_tbnz(insn) || + 
aarch64_insn_is_ret(insn) || + aarch64_insn_is_ret_auth(insn) || + aarch64_insn_is_br(insn) || + aarch64_insn_is_br_auth(insn) || + aarch64_insn_is_blr(insn) || + aarch64_insn_is_blr_auth(insn) || + aarch64_insn_is_bcond(insn); +} + +static int aarch64_get_imm_shift_mask(enum aarch64_insn_imm_type type, + u32 *maskp, int *shiftp) +{ + u32 mask; + int shift; + + switch (type) { + case AARCH64_INSN_IMM_26: + mask = BIT(26) - 1; + shift = 0; + break; + case AARCH64_INSN_IMM_19: + mask = BIT(19) - 1; + shift = 5; + break; + case AARCH64_INSN_IMM_16: + mask = BIT(16) - 1; + shift = 5; + break; + case AARCH64_INSN_IMM_14: + mask = BIT(14) - 1; + shift = 5; + break; + case AARCH64_INSN_IMM_12: + mask = BIT(12) - 1; + shift = 10; + break; + case AARCH64_INSN_IMM_9: + mask = BIT(9) - 1; + shift = 12; + break; + case AARCH64_INSN_IMM_7: + mask = BIT(7) - 1; + shift = 15; + break; + case AARCH64_INSN_IMM_6: + case AARCH64_INSN_IMM_S: + mask = BIT(6) - 1; + shift = 10; + break; + case AARCH64_INSN_IMM_R: + mask = BIT(6) - 1; + shift = 16; + break; + case AARCH64_INSN_IMM_N: + mask = 1; + shift = 22; + break; + default: + return -EINVAL; + } + + *maskp = mask; + *shiftp = shift; + + return 0; +} + +#define ADR_IMM_HILOSPLIT 2 +#define ADR_IMM_SIZE SZ_2M +#define ADR_IMM_LOMASK ((1 << ADR_IMM_HILOSPLIT) - 1) +#define ADR_IMM_HIMASK ((ADR_IMM_SIZE >> ADR_IMM_HILOSPLIT) - 1) +#define ADR_IMM_LOSHIFT 29 +#define ADR_IMM_HISHIFT 5 + +u64 aarch64_insn_decode_immediate(enum aarch64_insn_imm_type type, u32 insn) +{ + u32 immlo, immhi, mask; + int shift; + + switch (type) { + case AARCH64_INSN_IMM_ADR: + shift = 0; + immlo = (insn >> ADR_IMM_LOSHIFT) & ADR_IMM_LOMASK; + immhi = (insn >> ADR_IMM_HISHIFT) & ADR_IMM_HIMASK; + insn = (immhi << ADR_IMM_HILOSPLIT) | immlo; + mask = ADR_IMM_SIZE - 1; + break; + default: + if (aarch64_get_imm_shift_mask(type, &mask, &shift) < 0) { + WARN("aarch64_insn_decode_immediate: unknown immediate encoding %d\n", + type); + return AARCH64_DECODE_FAULT; + } + } + + return (insn >> shift) & mask; +} + +u32 aarch64_insn_decode_register(enum aarch64_insn_register_type type, + u32 insn) +{ + int shift; + + switch (type) { + case AARCH64_INSN_REGTYPE_RT: + case AARCH64_INSN_REGTYPE_RD: + shift = 0; + break; + case AARCH64_INSN_REGTYPE_RN: + shift = 5; + break; + case AARCH64_INSN_REGTYPE_RT2: + case AARCH64_INSN_REGTYPE_RA: + shift = 10; + break; + case AARCH64_INSN_REGTYPE_RM: + shift = 16; + break; + default: + WARN("%s: unknown register type encoding %d\n", __func__, + type); + return AARCH64_DECODE_FAULT; + } + + return (insn >> shift) & GENMASK(4, 0); +} + +u32 aarch64_insn_gen_hint(enum aarch64_insn_hint_cr_op op) +{ + return aarch64_insn_get_hint_value() | op; +} + +u32 aarch64_insn_gen_nop(void) +{ + return aarch64_insn_gen_hint(AARCH64_INSN_HINT_NOP); +} + +static u32 aarch64_insn_encode_register(enum aarch64_insn_register_type type, + u32 insn, + enum aarch64_insn_register reg) +{ + int shift; + + if (insn == AARCH64_DECODE_FAULT) + return AARCH64_DECODE_FAULT; + + if (reg < AARCH64_INSN_REG_0 || reg > AARCH64_INSN_REG_SP) { + WARN("%s: unknown register encoding %d\n", __func__, reg); + return AARCH64_DECODE_FAULT; + } + + switch (type) { + case AARCH64_INSN_REGTYPE_RT: + case AARCH64_INSN_REGTYPE_RD: + shift = 0; + break; + case AARCH64_INSN_REGTYPE_RN: + shift = 5; + break; + case AARCH64_INSN_REGTYPE_RT2: + case AARCH64_INSN_REGTYPE_RA: + shift = 10; + break; + case AARCH64_INSN_REGTYPE_RM: + case AARCH64_INSN_REGTYPE_RS: + shift = 16; + break; + default: + WARN("%s: 
unknown register type encoding %d\n", __func__, + type); + return AARCH64_DECODE_FAULT; + } + + insn &= ~(GENMASK(4, 0) << shift); + insn |= reg << shift; + + return insn; +} + +u32 aarch64_insn_gen_branch_reg(enum aarch64_insn_register reg, + enum aarch64_insn_branch_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_BRANCH_NOLINK: + insn = aarch64_insn_get_br_value(); + break; + case AARCH64_INSN_BRANCH_LINK: + insn = aarch64_insn_get_blr_value(); + break; + case AARCH64_INSN_BRANCH_RETURN: + insn = aarch64_insn_get_ret_value(); + break; + default: + WARN("%s: unknown branch encoding %d\n", __func__, type); + return AARCH64_DECODE_FAULT; + } + + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, reg); +} + +/* + * Decode the imm field of a branch, and return the byte offset as a + * signed value (so it can be used when computing a new branch + * target). + */ +s32 aarch64_get_branch_offset(u32 insn) +{ + s32 imm; + + if (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn)) { + imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_26, insn); + return (imm << 6) >> 4; + } + + if (aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) || + aarch64_insn_is_bcond(insn)) { + imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_19, insn); + return (imm << 13) >> 11; + } + + if (aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn)) { + imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_14, insn); + return (imm << 18) >> 16; + } + + WARN("Unhandled instruction %x", insn); + return AARCH64_DECODE_FAULT; +} + +s32 aarch64_insn_adrp_get_offset(u32 insn) +{ + if (!aarch64_insn_is_adrp(insn)) { + WARN("Unhandled instruction %x", insn); + return AARCH64_DECODE_FAULT; + } + return aarch64_insn_decode_immediate(AARCH64_INSN_IMM_ADR, insn) << 12; +} From b58c1cb038c7b5ddf055213df2743d4c9af14a1c Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Thu, 23 Jun 2022 09:48:46 +0800 Subject: [PATCH 050/175] objtool: arm64: Add base definition for arm64 backend Provide needed definitions for a new architecture instruction decoder. No proper decoding is done yet. 
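As a rough sketch of what the stub does so far (the fragment below mirrors the decode.c added by this patch, using the insn helpers from the previous patch; it is illustrative, not additional code):

  u32 insn = *(u32 *)(sec->data->d_buf + offset);

  /* Bits [28:25] select the A64 main encoding class; e.g. NOP
   * (0xd503201f) lands in the "branch, exception generation and
   * system instructions" class.
   */
  if (aarch64_get_insn_class(insn) == AARCH64_INSN_CLS_UNKNOWN)
          return -1;

Everything that decodes cleanly is reported as INSN_OTHER for now; real per-class decoding is left for later patches.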
Signed-off-by: Julien Thierry Signed-off-by: Chen Zhongjin --- include/linux/objtool.h | 2 + tools/include/linux/objtool.h | 2 + tools/objtool/Makefile | 4 + tools/objtool/arch/arm64/Build | 8 + tools/objtool/arch/arm64/decode.c | 138 ++++++++++++++++++ .../arch/arm64/include/arch/cfi_regs.h | 14 ++ tools/objtool/arch/arm64/include/arch/elf.h | 12 ++ .../arch/arm64/include/arch/endianness.h | 9 ++ .../objtool/arch/arm64/include/arch/special.h | 22 +++ tools/objtool/arch/arm64/special.c | 21 +++ 10 files changed, 232 insertions(+) create mode 100644 tools/objtool/arch/arm64/Build create mode 100644 tools/objtool/arch/arm64/decode.c create mode 100644 tools/objtool/arch/arm64/include/arch/cfi_regs.h create mode 100644 tools/objtool/arch/arm64/include/arch/elf.h create mode 100644 tools/objtool/arch/arm64/include/arch/endianness.h create mode 100644 tools/objtool/arch/arm64/include/arch/special.h create mode 100644 tools/objtool/arch/arm64/special.c diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 62c54ffbeeaac..bf494bd6e191a 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -45,7 +45,9 @@ struct unwind_hint { #ifdef CONFIG_OBJTOOL +#ifndef CONFIG_ARM64 #include +#endif #ifndef __ASSEMBLY__ diff --git a/tools/include/linux/objtool.h b/tools/include/linux/objtool.h index 62c54ffbeeaac..bf494bd6e191a 100644 --- a/tools/include/linux/objtool.h +++ b/tools/include/linux/objtool.h @@ -45,7 +45,9 @@ struct unwind_hint { #ifdef CONFIG_OBJTOOL +#ifndef CONFIG_ARM64 #include +#endif #ifndef __ASSEMBLY__ diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile index 0cfab07328e8b..9772ac5993ed5 100644 --- a/tools/objtool/Makefile +++ b/tools/objtool/Makefile @@ -45,6 +45,10 @@ ifeq ($(SRCARCH),x86) BUILD_ORC := y endif +ifeq ($(SRCARCH),arm64) + CFLAGS += -Wno-nested-externs +endif + export BUILD_ORC export srctree OUTPUT CFLAGS SRCARCH AWK include $(srctree)/tools/build/Makefile.include diff --git a/tools/objtool/arch/arm64/Build b/tools/objtool/arch/arm64/Build new file mode 100644 index 0000000000000..f3de3a50d5411 --- /dev/null +++ b/tools/objtool/arch/arm64/Build @@ -0,0 +1,8 @@ +objtool-y += special.o +objtool-y += decode.o + +objtool-y += libhweight.o + +$(OUTPUT)arch/arm64/libhweight.o: ../lib/hweight.c FORCE + $(call rule_mkdir) + $(call if_changed_dep,cc_o_c) diff --git a/tools/objtool/arch/arm64/decode.c b/tools/objtool/arch/arm64/decode.c new file mode 100644 index 0000000000000..afe22c4593c8c --- /dev/null +++ b/tools/objtool/arch/arm64/decode.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include "../../../arch/arm64/lib/insn.c" + +bool arch_callee_saved_reg(unsigned char reg) +{ + switch (reg) { + case AARCH64_INSN_REG_19: + case AARCH64_INSN_REG_20: + case AARCH64_INSN_REG_21: + case AARCH64_INSN_REG_22: + case AARCH64_INSN_REG_23: + case AARCH64_INSN_REG_24: + case AARCH64_INSN_REG_25: + case AARCH64_INSN_REG_26: + case AARCH64_INSN_REG_27: + case AARCH64_INSN_REG_28: + case AARCH64_INSN_REG_FP: + case AARCH64_INSN_REG_LR: + return true; + default: + return false; + } +} + +void arch_initial_func_cfi_state(struct cfi_init_state *state) +{ + int i; + + for (i = 0; i < CFI_NUM_REGS; i++) { + state->regs[i].base = CFI_UNDEFINED; + state->regs[i].offset = 0; + } + + /* initial CFA (call frame address) */ + state->cfa.base = CFI_SP; + state->cfa.offset = 0; +} + +unsigned long arch_dest_reloc_offset(int addend) +{ + 
return addend; +} + +unsigned long arch_jump_destination(struct instruction *insn) +{ + return insn->offset + insn->immediate; +} + +const char *arch_nop_insn(int len) +{ + static u32 nop; + + if (len != AARCH64_INSN_SIZE) + WARN("invalid NOP size: %d\n", len); + + if (!nop) + nop = aarch64_insn_gen_nop(); + + return (const char *)&nop; +} + +const char *arch_ret_insn(int len) +{ + static u32 ret; + + if (len != AARCH64_INSN_SIZE) + WARN("invalid RET size: %d\n", len); + + if (!ret) { + ret = aarch64_insn_gen_branch_reg(AARCH64_INSN_REG_LR, + AARCH64_INSN_BRANCH_RETURN); + } + + return (const char *)&ret; +} + +static int is_arm64(const struct elf *elf) +{ + switch (elf->ehdr.e_machine) { + case EM_AARCH64: //0xB7 + return 1; + default: + WARN("unexpected ELF machine type %x", + elf->ehdr.e_machine); + return 0; + } +} + +int arch_decode_hint_reg(u8 sp_reg, int *base) +{ + return -1; +} + +int arch_decode_instruction(struct objtool_file *file, const struct section *sec, + unsigned long offset, unsigned int maxlen, + unsigned int *len, enum insn_type *type, + unsigned long *immediate, + struct list_head *ops_list) +{ + const struct elf *elf = file->elf; + u32 insn; + + if (!is_arm64(elf)) + return -1; + + if (maxlen < AARCH64_INSN_SIZE) + return 0; + + *len = AARCH64_INSN_SIZE; + *immediate = 0; + *type = INSN_OTHER; + + insn = *(u32 *)(sec->data->d_buf + offset); + + switch (aarch64_get_insn_class(insn)) { + case AARCH64_INSN_CLS_UNKNOWN: + WARN("can't decode instruction at %s:0x%lx", sec->name, offset); + return -1; + default: + break; + } + + return 0; +} diff --git a/tools/objtool/arch/arm64/include/arch/cfi_regs.h b/tools/objtool/arch/arm64/include/arch/cfi_regs.h new file mode 100644 index 0000000000000..a5185649686b7 --- /dev/null +++ b/tools/objtool/arch/arm64/include/arch/cfi_regs.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _OBJTOOL_CFI_REGS_H +#define _OBJTOOL_CFI_REGS_H + +#include + +#define CFI_BP AARCH64_INSN_REG_FP +#define CFI_RA AARCH64_INSN_REG_LR +#define CFI_SP AARCH64_INSN_REG_SP + +#define CFI_NUM_REGS 32 + +#endif /* _OBJTOOL_CFI_REGS_H */ diff --git a/tools/objtool/arch/arm64/include/arch/elf.h b/tools/objtool/arch/arm64/include/arch/elf.h new file mode 100644 index 0000000000000..a59888a906b5d --- /dev/null +++ b/tools/objtool/arch/arm64/include/arch/elf.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */ + +#ifndef _OBJTOOL_ARCH_ELF +#define _OBJTOOL_ARCH_ELF + +#define R_NTYPE -1 +#define R_NONE R_AARCH64_NONE +#define R_ABS64 R_AARCH64_ABS64 +#define R_REL32 R_AARCH64_PREL32 +#define R_PLT32 R_NTYPE + +#endif /* _OBJTOOL_ARCH_ELF */ diff --git a/tools/objtool/arch/arm64/include/arch/endianness.h b/tools/objtool/arch/arm64/include/arch/endianness.h new file mode 100644 index 0000000000000..7c362527da205 --- /dev/null +++ b/tools/objtool/arch/arm64/include/arch/endianness.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _ARCH_ENDIANNESS_H +#define _ARCH_ENDIANNESS_H + +#include + +#define __TARGET_BYTE_ORDER __LITTLE_ENDIAN + +#endif /* _ARCH_ENDIANNESS_H */ diff --git a/tools/objtool/arch/arm64/include/arch/special.h b/tools/objtool/arch/arm64/include/arch/special.h new file mode 100644 index 0000000000000..63a705c622a4b --- /dev/null +++ b/tools/objtool/arch/arm64/include/arch/special.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _ARM64_ARCH_SPECIAL_H +#define _ARM64_ARCH_SPECIAL_H + +#define EX_ENTRY_SIZE 12 +#define EX_ORIG_OFFSET 0 +#define 
EX_NEW_OFFSET 4 + +#define JUMP_ENTRY_SIZE 16 +#define JUMP_ORIG_OFFSET 0 +#define JUMP_NEW_OFFSET 4 +#define JUMP_KEY_OFFSET 8 + +#define ALT_ENTRY_SIZE 12 +#define ALT_ORIG_OFFSET 0 +#define ALT_NEW_OFFSET 4 +#define ALT_FEATURE_OFFSET 8 +#define ALT_ORIG_LEN_OFFSET 10 +#define ALT_NEW_LEN_OFFSET 11 + +#endif /* _ARM64_ARCH_SPECIAL_H */ diff --git a/tools/objtool/arch/arm64/special.c b/tools/objtool/arch/arm64/special.c new file mode 100644 index 0000000000000..45f283283091f --- /dev/null +++ b/tools/objtool/arch/arm64/special.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include + +void arch_handle_alternative(unsigned short feature, struct special_alt *alt) +{ +} + +bool arch_support_alt_relocation(struct special_alt *special_alt, + struct instruction *insn, + struct reloc *reloc) +{ + return false; +} + + +struct reloc *arch_find_switch_table(struct objtool_file *file, + struct instruction *insn) +{ + return NULL; +} From 767594e3abb46f30d9022482f5f442a793ecfca9 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Thu, 23 Jun 2022 09:48:47 +0800 Subject: [PATCH 051/175] objtool: arm64: Decode add/sub instructions Decode aarch64 additions and subtractions and create stack_ops for instructions interacting with SP or FP. Signed-off-by: Julien Thierry Signed-off-by: Chen Zhongjin --- tools/objtool/arch/arm64/decode.c | 82 +++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/tools/objtool/arch/arm64/decode.c b/tools/objtool/arch/arm64/decode.c index afe22c4593c8c..d8c32703874df 100644 --- a/tools/objtool/arch/arm64/decode.c +++ b/tools/objtool/arch/arm64/decode.c @@ -15,6 +15,22 @@ #include "../../../arch/arm64/lib/insn.c" +#define is_SP(reg) (reg == AARCH64_INSN_REG_SP) +#define is_FP(reg) (reg == AARCH64_INSN_REG_FP) +#define is_SPFP(reg) (reg == AARCH64_INSN_REG_SP || reg == AARCH64_INSN_REG_FP) + +#define ADD_OP(op) \ + if (!(op = calloc(1, sizeof(*op)))) \ + return -1; \ + else for (list_add_tail(&op->list, ops_list); op; op = NULL) + +static unsigned long sign_extend(unsigned long x, int nbits) +{ + unsigned long sign_bit = (x >> (nbits - 1)) & 1; + + return ((~0UL + (sign_bit ^ 1)) << nbits) | x; +} + bool arch_callee_saved_reg(unsigned char reg) { switch (reg) { @@ -105,6 +121,42 @@ int arch_decode_hint_reg(u8 sp_reg, int *base) return -1; } +static inline void make_add_op(enum aarch64_insn_register dest, + enum aarch64_insn_register src, + int val, struct stack_op *op) +{ + op->dest.type = OP_DEST_REG; + op->dest.reg = dest; + op->src.reg = src; + op->src.type = val != 0 ?
OP_SRC_ADD : OP_SRC_REG; + op->src.offset = val; +} + +static void decode_add_sub_imm(u32 instr, bool set_flags, + unsigned long *immediate, + struct stack_op *op) +{ + u32 rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, instr); + u32 rn = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RN, instr); + + *immediate = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_12, instr); + + if (instr & AARCH64_INSN_LSL_12) + *immediate <<= 12; + + if ((!set_flags && is_SP(rd)) || is_FP(rd) + || is_SPFP(rn)) { + int value; + + if (aarch64_insn_is_subs_imm(instr) || aarch64_insn_is_sub_imm(instr)) + value = -*immediate; + else + value = *immediate; + + make_add_op(rd, rn, value, op); + } +} + int arch_decode_instruction(struct objtool_file *file, const struct section *sec, unsigned long offset, unsigned int maxlen, unsigned int *len, enum insn_type *type, @@ -112,6 +164,7 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec struct list_head *ops_list) { const struct elf *elf = file->elf; + struct stack_op *op = NULL; u32 insn; if (!is_arm64(elf)) @@ -130,6 +183,35 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec case AARCH64_INSN_CLS_UNKNOWN: WARN("can't decode instruction at %s:0x%lx", sec->name, offset); return -1; + case AARCH64_INSN_CLS_DP_IMM: + /* Mov register to and from SP are aliases of add_imm */ + if (aarch64_insn_is_add_imm(insn) || + aarch64_insn_is_sub_imm(insn)) { + ADD_OP(op) { + decode_add_sub_imm(insn, false, immediate, op); + } + } + else if (aarch64_insn_is_adds_imm(insn) || + aarch64_insn_is_subs_imm(insn)) { + ADD_OP(op) { + decode_add_sub_imm(insn, true, immediate, op); + } + } + break; + case AARCH64_INSN_CLS_DP_REG: + if (aarch64_insn_is_mov_reg(insn)) { + enum aarch64_insn_register rd; + enum aarch64_insn_register rm; + + rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, insn); + rm = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RM, insn); + if (is_FP(rd) || is_FP(rm)) { + ADD_OP(op) { + make_add_op(rd, rm, 0, op); + } + } + } + break; default: break; } From 7af251491773d248c7a28d4bb1329a92a6af0438 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Thu, 23 Jun 2022 09:48:48 +0800 Subject: [PATCH 052/175] objtool: arm64: Decode jump and call related instructions Decode branch, branch and link (aarch64's call) and return instructions. 
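For reference, the byte offset of an immediate branch is its imm field scaled by 4 and sign-extended. A minimal standalone sketch of the B/BL case (equivalent to the AARCH64_INSN_IMM_26 path of aarch64_get_branch_offset(); the shift pair assumes a 64-bit long, as on the kernel's supported hosts):

    /* Illustrative only: signed byte offset encoded in a B/BL opcode. */
    static inline long b_bl_byte_offset(u32 insn)
    {
    	long imm26 = insn & GENMASK(25, 0);

    	/*
    	 * Move imm26 up to bit 63, then arithmetic-shift back down:
    	 * this sign-extends and multiplies by 4 in one step.
    	 */
    	return (imm26 << 38) >> 36;
    }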
Signed-off-by: Julien Thierry Signed-off-by: Chen Zhongjin --- tools/objtool/arch/arm64/decode.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tools/objtool/arch/arm64/decode.c b/tools/objtool/arch/arm64/decode.c index d8c32703874df..40ada17d0842f 100644 --- a/tools/objtool/arch/arm64/decode.c +++ b/tools/objtool/arch/arm64/decode.c @@ -212,6 +212,27 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec } } break; + case AARCH64_INSN_CLS_BR_SYS: + if (aarch64_insn_is_ret(insn) && + aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RN, insn) + == AARCH64_INSN_REG_LR) { + *type = INSN_RETURN; + } else if (aarch64_insn_is_bl(insn)) { + *type = INSN_CALL; + *immediate = aarch64_get_branch_offset(insn); + } else if (aarch64_insn_is_blr(insn)) { + *type = INSN_CALL_DYNAMIC; + } else if (aarch64_insn_is_b(insn)) { + *type = INSN_JUMP_UNCONDITIONAL; + *immediate = aarch64_get_branch_offset(insn); + } else if (aarch64_insn_is_br(insn)) { + *type = INSN_JUMP_DYNAMIC; + } else if (aarch64_insn_is_branch_imm(insn)) { + /* Remaining branch opcodes are conditional */ + *type = INSN_JUMP_CONDITIONAL; + *immediate = aarch64_get_branch_offset(insn); + } + break; default: break; } From 8dae17c8991bcda34daa930d54d303ec8ed0bed5 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Thu, 23 Jun 2022 09:48:49 +0800 Subject: [PATCH 053/175] objtool: arm64: Decode other system instructions Decode ERET, BRK and NOP instructions. Signed-off-by: Julien Thierry Signed-off-by: Chen Zhongjin --- tools/objtool/arch/arm64/decode.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/objtool/arch/arm64/decode.c b/tools/objtool/arch/arm64/decode.c index 40ada17d0842f..19840862f3aac 100644 --- a/tools/objtool/arch/arm64/decode.c +++ b/tools/objtool/arch/arm64/decode.c @@ -231,6 +231,14 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec /* Remaining branch opcodes are conditional */ *type = INSN_JUMP_CONDITIONAL; *immediate = aarch64_get_branch_offset(insn); + } else if (aarch64_insn_is_eret(insn)) { + *type = INSN_CONTEXT_SWITCH; + } else if (aarch64_insn_is_hint(insn) || + aarch64_insn_is_barrier(insn)) { + *type = INSN_NOP; + } else if (aarch64_insn_is_brk(insn)) { + *type = INSN_BUG; + *immediate = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_16, insn); } break; default: break; } From 9c50315500fe6069d58637d33d222d9f1a9a958a Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Thu, 23 Jun 2022 09:48:50 +0800 Subject: [PATCH 054/175] objtool: arm64: Decode load/store instructions Decode load/store operations and create corresponding stack_ops for operations targeting SP or FP. Operations storing/loading multiple registers are split into separate stack_ops storing single registers. Operations modifying the base register get an additional stack_op for the register update. Since an atomic load/store of one or more registers plus a base register update gets split into multiple operations this way, to make sure objtool always sees a valid stack, store instructions are considered to perform the stack allocation (i.e. the base pointer is modified before the store) and load instructions the de-allocation (i.e. the base pointer is modified after the load).
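For example, a typical prologue push

    stp x29, x30, [sp, #-16]!

is decomposed (conceptually; the exact struct contents are internal to objtool) into three stack_ops:

    (sp - 16) = x29    /* store of the first register */
    (sp - 8)  = x30    /* store of the second register */
    sp = sp - 16       /* base register update: the allocation */

while the matching epilogue 'ldp x29, x30, [sp], #16' emits the two loads first and the de-allocating base update last.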
Signed-off-by: Julien Thierry Signed-off-by: Chen Zhongjin --- tools/objtool/arch/arm64/decode.c | 112 ++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) diff --git a/tools/objtool/arch/arm64/decode.c b/tools/objtool/arch/arm64/decode.c index 19840862f3aac..8ce9d91ff0db3 100644 --- a/tools/objtool/arch/arm64/decode.c +++ b/tools/objtool/arch/arm64/decode.c @@ -132,6 +132,114 @@ static inline void make_add_op(enum aarch64_insn_register dest, op->src.offset = val; } +static inline void make_store_op(enum aarch64_insn_register base, + enum aarch64_insn_register reg, + int offset, struct stack_op *op) +{ + op->dest.type = OP_DEST_REG_INDIRECT; + op->dest.reg = base; + op->dest.offset = offset; + op->src.type = OP_SRC_REG; + op->src.reg = reg; + op->src.offset = 0; +} + +static inline void make_load_op(enum aarch64_insn_register base, + enum aarch64_insn_register reg, + int offset, struct stack_op *op) +{ + op->dest.type = OP_DEST_REG; + op->dest.reg = reg; + op->dest.offset = 0; + op->src.type = OP_SRC_REG_INDIRECT; + op->src.reg = base; + op->src.offset = offset; +} + +static inline bool aarch64_insn_is_ldst_pre(u32 insn) +{ + return aarch64_insn_is_store_pre(insn) || + aarch64_insn_is_load_pre(insn) || + aarch64_insn_is_stp_pre(insn) || + aarch64_insn_is_ldp_pre(insn); +} + +static inline bool aarch64_insn_is_ldst_post(u32 insn) +{ + return aarch64_insn_is_store_post(insn) || + aarch64_insn_is_load_post(insn) || + aarch64_insn_is_stp_post(insn) || + aarch64_insn_is_ldp_post(insn); +} + +static int decode_load_store(u32 insn, unsigned long *immediate, + struct list_head *ops_list) +{ + enum aarch64_insn_register base; + enum aarch64_insn_register rt; + struct stack_op *op; + int size; + int offset; + + if (aarch64_insn_is_store_single(insn) || + aarch64_insn_is_load_single(insn)) + size = 1 << ((insn & GENMASK(31, 30)) >> 30); + else + size = 4 << ((insn >> 31) & 1); + + if (aarch64_insn_is_store_pair(insn) || + aarch64_insn_is_load_pair(insn)) + *immediate = size * sign_extend(aarch64_insn_decode_immediate(AARCH64_INSN_IMM_7, + insn), 7); + else if (aarch64_insn_is_store_imm(insn) || + aarch64_insn_is_load_imm(insn)) + *immediate = size * aarch64_insn_decode_immediate(AARCH64_INSN_IMM_12, insn); + else /* load/store_pre/post */ + *immediate = sign_extend(aarch64_insn_decode_immediate(AARCH64_INSN_IMM_9, + insn), 9); + + base = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RN, insn); + if (!is_SPFP(base)) + return 0; + + if (aarch64_insn_is_ldst_post(insn)) + offset = 0; + else + offset = *immediate; + + /* First register */ + rt = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RT, insn); + ADD_OP(op) { + if (aarch64_insn_is_store_single(insn) || + aarch64_insn_is_store_pair(insn)) + make_store_op(base, rt, offset, op); + else + make_load_op(base, rt, offset, op); + } + + /* Second register (if present) */ + if (aarch64_insn_is_store_pair(insn) || + aarch64_insn_is_load_pair(insn)) { + rt = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RT2, + insn); + ADD_OP(op) { + if (aarch64_insn_is_store_pair(insn)) + make_store_op(base, rt, offset + size, op); + else + make_load_op(base, rt, offset + size, op); + } + } + + if (aarch64_insn_is_ldst_pre(insn) || + aarch64_insn_is_ldst_post(insn)) { + ADD_OP(op) { + make_add_op(base, base, *immediate, op); + } + } + + return 0; +} + static void decode_add_sub_imm(u32 instr, bool set_flags, unsigned long *immediate, struct stack_op *op) @@ -241,6 +349,10 @@ int arch_decode_instruction(struct objtool_file *file, const struct section 
*sec *immediate = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_16, insn); } break; + case AARCH64_INSN_CLS_LDST: + { + return decode_load_store(insn, immediate, ops_list); + } default: break; } From c5dfd1cd734292d314bbe5c535f2eb7e1be34086 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Thu, 23 Jun 2022 09:48:51 +0800 Subject: [PATCH 055/175] objtool: arm64: Decode LDR instructions Load literal instructions can generate constants inside code sections. Record the locations of the constants in order to be able to remove their corresponding "struct instruction". Signed-off-by: Julien Thierry Signed-off-by: Chen Zhongjin --- tools/objtool/arch/arm64/decode.c | 88 +++++++++++++++++++++++++++- tools/objtool/arch/x86/decode.c | 5 ++ tools/objtool/check.c | 3 + tools/objtool/include/objtool/arch.h | 2 + 4 files changed, 97 insertions(+), 1 deletion(-) diff --git a/tools/objtool/arch/arm64/decode.c b/tools/objtool/arch/arm64/decode.c index 8ce9d91ff0db3..30300d05c8f38 100644 --- a/tools/objtool/arch/arm64/decode.c +++ b/tools/objtool/arch/arm64/decode.c @@ -31,6 +31,64 @@ static unsigned long sign_extend(unsigned long x, int nbits) return ((~0UL + (sign_bit ^ 1)) << nbits) | x; } +struct insn_loc { + const struct section *sec; + unsigned long offset; + struct hlist_node hnode; +}; + +DEFINE_HASHTABLE(invalid_insns, 16); + +static int record_invalid_insn(const struct section *sec, + unsigned long offset) +{ + struct insn_loc *loc; + struct hlist_head *l; + + l = &invalid_insns[hash_min(offset, HASH_BITS(invalid_insns))]; + if (!hlist_empty(l)) { + loc = hlist_entry(l->first, struct insn_loc, hnode); + return 0; + } + + loc = malloc(sizeof(*loc)); + if (!loc) { + WARN("malloc failed"); + return -1; + } + + loc->sec = sec; + loc->offset = offset; + + hash_add(invalid_insns, &loc->hnode, loc->offset); + + return 0; +} + +int arch_post_process_instructions(struct objtool_file *file) +{ + struct hlist_node *tmp; + struct insn_loc *loc; + unsigned int bkt; + int res = 0; + + hash_for_each_safe(invalid_insns, bkt, tmp, loc, hnode) { + struct instruction *insn; + + insn = find_insn(file, (struct section *) loc->sec, loc->offset); + if (insn) { + list_del(&insn->list); + hash_del(&insn->hash); + free(insn); + } + + hash_del(&loc->hnode); + free(loc); + } + + return res; +} + bool arch_callee_saved_reg(unsigned char reg) { switch (reg) { @@ -351,7 +409,35 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec break; case AARCH64_INSN_CLS_LDST: { - return decode_load_store(insn, immediate, ops_list); + int ret; + + ret = decode_load_store(insn, immediate, ops_list); + if (ret <= 0) + return ret; + + /* + * For LDR ops, the assembler can generate the data to be + * loaded in the code section. + * Record and remove this data because it is + * never executed. + */ + if (aarch64_insn_is_ldr_lit(insn)) { + long pc_offset; + + pc_offset = insn & GENMASK(23, 5); + /* Sign extend and multiply by 4 */ + pc_offset = (pc_offset << (64 - 23)); + pc_offset = ((pc_offset >> (64 - 23)) >> 5) << 2; + + ret = record_invalid_insn(sec, offset + pc_offset); + + /* 64-bit literal */ + if (insn & BIT(30)) + ret = record_invalid_insn(sec, offset + pc_offset + 4); + + return ret; + } + break; } default: break; } diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 1ed49ab4e871f..8548ef5867e6b 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -699,6 +699,11 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec
return 0; } +int arch_post_process_instructions(struct objtool_file *file) +{ + return 0; +} + void arch_initial_func_cfi_state(struct cfi_init_state *state) { int i; diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 7c0341035ed81..821c906e81c7d 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -444,6 +444,9 @@ static int decode_instructions(struct objtool_file *file) if (opts.stats) printf("nr_insns: %lu\n", nr_insns); + if (arch_post_process_instructions(file)) + return -1; + return 0; err: diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h index 861c0c60ac81e..bc305cbe048a9 100644 --- a/tools/objtool/include/objtool/arch.h +++ b/tools/objtool/include/objtool/arch.h @@ -77,6 +77,8 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec unsigned long *immediate, struct list_head *ops_list); +int arch_post_process_instructions(struct objtool_file *file); + bool arch_callee_saved_reg(unsigned char reg); unsigned long arch_jump_destination(struct instruction *insn); From bab4f741fed44fc62220ebb4548b441a9a602431 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Thu, 23 Jun 2022 09:48:52 +0800 Subject: [PATCH 056/175] objtool: arm64: Accept non-instruction data in code sections The compiler can generate some '0x0' words in code sections to pad the end of functions. Also, some pseudo-instructions can generate data in code sections. Mark them as INSN_NOP. If there are other undecoded instructions, just record them and remove them from the validation list. This doesn't influence the check or ORC generation, because these undecoded instructions also won't be executed. Signed-off-by: Chen Zhongjin --- tools/objtool/arch/arm64/decode.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tools/objtool/arch/arm64/decode.c b/tools/objtool/arch/arm64/decode.c index 30300d05c8f38..771d37d872c8a 100644 --- a/tools/objtool/arch/arm64/decode.c +++ b/tools/objtool/arch/arm64/decode.c @@ -347,8 +347,14 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec switch (aarch64_get_insn_class(insn)) { case AARCH64_INSN_CLS_UNKNOWN: - WARN("can't decode instruction at %s:0x%lx", sec->name, offset); - return -1; + if (insn == 0x0) { + *type = INSN_NOP; + } else { + WARN("undecoded insn at %s:0x%lx", sec->name, offset); + return record_invalid_insn(sec, offset); + } + + break; case AARCH64_INSN_CLS_DP_IMM: /* Mov register to and from SP are aliases of add_imm */ if (aarch64_insn_is_add_imm(insn) || From 158954482f89a08d7f8875eb3cbb7cd5ec03c00b Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Thu, 23 Jun 2022 09:48:53 +0800 Subject: [PATCH 057/175] objtool: check: Support data in text section Arm64 assembly code can mix code and data in text sections through the use of SYM_DATA_*() macros. Skip the content of these symbols when decoding instructions of text sections.
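A hypothetical example of the pattern being handled (names illustrative):

    SYM_CODE_START_LOCAL(handler)
    	ldr	x0, =lookup_table
    	ret
    SYM_CODE_END(handler)

    SYM_DATA_START_LOCAL(lookup_table)	// STT_OBJECT symbol inside .text
    	.quad	0
    	.quad	1
    SYM_DATA_END(lookup_table)

decode_instructions() now skips the whole [offset, offset + len) range of such STT_OBJECT symbols instead of trying to decode the data words.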
Signed-off-by: Julien Thierry Signed-off-by: Chen Zhongjin --- tools/objtool/check.c | 14 ++++++++++++-- tools/objtool/elf.c | 14 ++++++++++++++ tools/objtool/include/objtool/elf.h | 1 + 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 821c906e81c7d..08d84fb19e89f 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -363,7 +363,7 @@ static int decode_instructions(struct objtool_file *file) { struct section *sec; struct symbol *func; - unsigned long offset; + unsigned long offset, next_offset; struct instruction *insn; int ret; @@ -382,7 +382,15 @@ static int decode_instructions(struct objtool_file *file) !strncmp(sec->name, ".text..__x86.", 13)) sec->noinstr = true; - for (offset = 0; offset < sec->sh.sh_size; offset += insn->len) { + for (offset = 0; offset < sec->sh.sh_size; offset = next_offset) { + struct symbol *obj_sym = find_object_containing(sec, offset); + + if (obj_sym) { + /* This is data in the middle of text section, skip it */ + next_offset = obj_sym->offset + obj_sym->len; + continue; + } + insn = malloc(sizeof(*insn)); if (!insn) { WARN("malloc failed"); @@ -415,6 +423,8 @@ static int decode_instructions(struct objtool_file *file) hash_add(file->insn_hash, &insn->hash, sec_offset_hash(sec, insn->offset)); list_add_tail(&insn->list, &file->insn_list); nr_insns++; + + next_offset = offset + insn->len; } list_for_each_entry(func, &sec->symbol_list, list) { diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 7e24b09b1163a..0f76525d6f8e6 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -236,6 +236,20 @@ struct symbol *find_func_containing(struct section *sec, unsigned long offset) return NULL; } +struct symbol *find_object_containing(const struct section *sec, unsigned long offset) +{ + struct rb_node *node; + + rb_for_each(node, &offset, &sec->symbol_tree, symbol_by_offset) { + struct symbol *s = rb_entry(node, struct symbol, node); + + if (s->type == STT_OBJECT) + return s; + } + + return NULL; +} + struct symbol *find_symbol_by_name(const struct elf *elf, const char *name) { struct symbol *sym; diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index 5d4a841fbd311..4df6f1dcbc64e 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -169,6 +169,7 @@ struct reloc *find_reloc_by_dest(const struct elf *elf, struct section *sec, uns struct reloc *find_reloc_by_dest_range(const struct elf *elf, struct section *sec, unsigned long offset, unsigned int len); struct symbol *find_func_containing(struct section *sec, unsigned long offset); +struct symbol *find_object_containing(const struct section *sec, unsigned long offset); #define for_each_sec(file, sec) \ list_for_each_entry(sec, &file->elf->sections, list) From eb36ce3bdd394249688bb90a71003fa210f7148a Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Thu, 23 Jun 2022 09:48:54 +0800 Subject: [PATCH 058/175] objtool: arm64: Handle supported relocations in alternatives Based on get_alt_insn() in arch/arm64/kernel/alternative.c, arm64 alternative code adapts offsets for static branches and adrp instructions. 
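For instance, a replacement sequence in the style of the arm64 alternatives (capability name hypothetical):

    alternative_if ARM64_HAS_SOME_FEATURE
    	b	3f		// offset is fixed up when the alternative is applied
    alternative_else_nop_endif

carries a branch immediate that is relative to the replacement's own location; objtool now accepts relocations for such immediate branches and for adrp, as well as for any replacement instruction that does not use a literal.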
Signed-off-by: Julien Thierry Signed-off-by: Chen Zhongjin --- tools/objtool/arch/arm64/special.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/objtool/arch/arm64/special.c b/tools/objtool/arch/arm64/special.c index 45f283283091f..a70b91e8bd7de 100644 --- a/tools/objtool/arch/arm64/special.c +++ b/tools/objtool/arch/arm64/special.c @@ -10,7 +10,11 @@ bool arch_support_alt_relocation(struct special_alt *special_alt, struct instruction *insn, struct reloc *reloc) { - return false; + u32 opcode = *(u32 *)(insn->sec->data->d_buf + insn->offset); + + return aarch64_insn_is_branch_imm(opcode) || + aarch64_insn_is_adrp(opcode) || + !aarch64_insn_uses_literal(opcode); } From 298efb4b10cdb9984465309cd98f7bdecdce59e7 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Thu, 23 Jun 2022 09:48:55 +0800 Subject: [PATCH 059/175] objtool: arm64: Ignore replacement section for alternative callback ARM64_CB_PATCH doesn't have static replacement instructions. Skip trying to validate the alternative section. Signed-off-by: Julien Thierry Signed-off-by: Chen Zhongjin --- tools/objtool/arch/arm64/special.c | 11 +++++++++++ tools/objtool/check.c | 3 +++ 2 files changed, 14 insertions(+) diff --git a/tools/objtool/arch/arm64/special.c b/tools/objtool/arch/arm64/special.c index a70b91e8bd7de..8bb1ebd2132a3 100644 --- a/tools/objtool/arch/arm64/special.c +++ b/tools/objtool/arch/arm64/special.c @@ -4,6 +4,17 @@ void arch_handle_alternative(unsigned short feature, struct special_alt *alt) { + /* + * ARM64_CB_PATCH has no alternative instruction. + * A callback is called at alternative replacement time + * to dynamically change the original instructions. + * + * ARM64_CB_PATCH is the last ARM64 feature; its value changes + * every time a new feature is added. So the orig/alt region + * lengths are used to detect those alternatives. + */ + if (alt->orig_len && !alt->new_len) + alt->skip_alt = true; } bool arch_support_alt_relocation(struct special_alt *special_alt, diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 08d84fb19e89f..6eb1ce881533d 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1828,6 +1828,9 @@ static int add_special_section_alts(struct objtool_file *file) continue; } + if (special_alt->skip_alt && !special_alt->new_len) + continue; + ret = handle_group_alt(file, special_alt, orig_insn, &new_insn); if (ret) From 48ef2cb69d8a02b2bb1e7966b7369a55d3f65c0a Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Thu, 23 Jun 2022 09:48:56 +0800 Subject: [PATCH 060/175] objtool: arm64: Enable stack validation for arm64 Add a build option to run stack validation at compile time. When requiring stack validation, jump tables are disabled as it simplifies objtool analysis (without having to introduce unreliable artifacts). In local testing, this does not appear to significantly affect final binary size or system performance.
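A configuration fragment exercising this (illustrative; the options are the ones added or selected by this patch together with the generic objtool ones):

    CONFIG_OBJTOOL=y
    CONFIG_HAVE_OBJTOOL=y
    CONFIG_HAVE_STACK_VALIDATION=y
    CONFIG_STACK_VALIDATION=y
    CONFIG_UNWINDER_FRAME_POINTER=y
    CONFIG_FRAME_POINTER=y

With this set, objtool runs on each object file during the build and jump tables are suppressed via -fno-jump-tables as in the Makefile hunk below.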
Signed-off-by: Raphael Gault Signed-off-by: Julien Thierry Signed-off-by: Chen Zhongjin --- arch/arm64/Kconfig | 2 ++ arch/arm64/Kconfig.debug | 21 +++++++++++++++++++++ arch/arm64/Makefile | 4 ++++ 3 files changed, 27 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e04dfa6b2ba88..979b97c328023 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -204,6 +204,8 @@ config ARM64 select MMU_GATHER_RCU_TABLE_FREE select HAVE_RSEQ select HAVE_STACKPROTECTOR + select HAVE_OBJTOOL + select HAVE_STACK_VALIDATION select HAVE_SYSCALL_TRACEPOINTS select HAVE_KPROBES select HAVE_KRETPROBES diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug index 265c4461031f4..c2c68c6f75578 100644 --- a/arch/arm64/Kconfig.debug +++ b/arch/arm64/Kconfig.debug @@ -20,4 +20,25 @@ config ARM64_RELOC_TEST depends on m tristate "Relocation testing module" +choice + prompt "Choose kernel unwinder" + default UNWINDER_FRAME_POINTER + help + This determines which method will be used for unwinding kernel stack + traces for panics, oopses, bugs, warnings, perf, /proc//stack, + livepatch, lockdep, and more. + +config UNWINDER_FRAME_POINTER + bool "Frame pointer unwinder" + select FRAME_POINTER + help + This option enables the frame pointer unwinder for unwinding kernel + stack traces. + + The unwinder itself is fast and it uses less RAM than the ORC + unwinder, but the kernel text size will grow by ~3% and the kernel's + overall performance will degrade by roughly 5-10%. + +endchoice + source "drivers/hwtracing/coresight/Kconfig" diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index c9496539c3351..265810b8b04ec 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -133,6 +133,10 @@ ifeq ($(CONFIG_DYNAMIC_FTRACE_WITH_REGS),y) CC_FLAGS_FTRACE := -fpatchable-function-entry=2 endif +ifeq ($(CONFIG_STACK_VALIDATION),y) +KBUILD_CFLAGS += -fno-jump-tables +endif + ifeq ($(CONFIG_KASAN_SW_TAGS), y) KASAN_SHADOW_SCALE_SHIFT := 4 else ifeq ($(CONFIG_KASAN_GENERIC), y) From 1519c6835dec891d9abe51f510795ed2e97975fb Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Tue, 28 Mar 2023 11:05:10 -0700 Subject: [PATCH 061/175] Revert "arm64: alternatives: add shared NOP callback" This reverts commit d926079f17bf8aa47485b6a55be1fc0175dbd1db. The above commit converts alternative_has_feature_likely() to use the alternative callback mechanism to patch the nops rather than requiring that they be stored in the alternative section. This is done to save space. On a 6.1 kernel with the currently used AL2023 defconfig this saved about 11kB in the vmlinux. However the issue with this approach is that objtool cannot determine which instructions will be patched in by the callback and so can't validate the code path. This leads to "unreachable instruction" warnings for every use of alternative_has_feature_likely(). We could add a "reachable" annotation however this would mean that the alternative code path in all these scenarios wouldn't be validated by objtool. Thus revert the patch due to the limited space savings and the improved reliability achieved by having objtool analyse these alternative code paths. 
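The warnings in question take the usual objtool form; an illustrative (not captured) instance would be:

    vmlinux.o: warning: objtool: some_function()+0x48: unreachable instruction

one per call site of alternative_has_feature_likely(), because objtool cannot see the NOPs that alt_cb_patch_nops() would write at patch time.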
Signed-off-by: Suraj Jitindar Singh --- arch/arm64/include/asm/alternative-macros.h | 2 +- arch/arm64/kernel/alternative.c | 8 -------- arch/arm64/kernel/image-vars.h | 1 - 3 files changed, 1 insertion(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/alternative-macros.h b/arch/arm64/include/asm/alternative-macros.h index 51738c56e96cd..1c4e9d0430b77 100644 --- a/arch/arm64/include/asm/alternative-macros.h +++ b/arch/arm64/include/asm/alternative-macros.h @@ -230,7 +230,7 @@ alternative_has_feature_likely(unsigned long feature) "feature must be < ARM64_NCAPS"); asm goto( - ALTERNATIVE_CB("b %l[l_no]", %[feature], alt_cb_patch_nops) + ALTERNATIVE("b %l[l_no]", "nop", %[feature]) : : [feature] "i" (feature) : diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c index 91263d09ea650..43aec57da78ac 100644 --- a/arch/arm64/kernel/alternative.c +++ b/arch/arm64/kernel/alternative.c @@ -291,11 +291,3 @@ void apply_alternatives_module(void *start, size_t length) __apply_alternatives(&region, true, &all_capabilities[0]); } #endif - -noinstr void alt_cb_patch_nops(struct alt_instr *alt, __le32 *origptr, - __le32 *updptr, int nr_inst) -{ - for (int i = 0; i < nr_inst; i++) - updptr[i] = cpu_to_le32(aarch64_insn_gen_nop()); -} -EXPORT_SYMBOL(alt_cb_patch_nops); diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 8151412653de2..a47e6185efa33 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -60,7 +60,6 @@ KVM_NVHE_ALIAS(spectre_bhb_patch_loop_iter); KVM_NVHE_ALIAS(spectre_bhb_patch_loop_mitigation_enable); KVM_NVHE_ALIAS(spectre_bhb_patch_wa3); KVM_NVHE_ALIAS(spectre_bhb_patch_clearbhb); -KVM_NVHE_ALIAS(alt_cb_patch_nops); /* Global kernel state accessed by nVHE hyp code. */ KVM_NVHE_ALIAS(kvm_vgic_global_state); From 7adc19cc3665c7f56e8d9b1c1ef7436f40cb7cb9 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Thu, 23 Jun 2022 09:48:58 +0800 Subject: [PATCH 062/175] objtool: arm64: Add annotate_reachable() for objtool x86 removed annotate_reachable() and replaced it with ASM_REACHABLE, which is not suitable for the arm64 macros because there are cases where GCC will merge duplicate inline asm. Re-add annotate_reachable() for arm64.
Signed-off-by: Chen Zhongjin --- include/linux/compiler.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 973a1bfd7ef53..92689cff87d97 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -117,6 +117,14 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, */ #define __stringify_label(n) #n +#define __annotate_reachable(c) ({ \ + asm volatile(__stringify_label(c) ":\n\t" \ + ".pushsection .discard.reachable\n\t" \ + ".long " __stringify_label(c) "b - .\n\t" \ + ".popsection\n\t"); \ +}) +#define annotate_reachable() __annotate_reachable(__COUNTER__) + #define __annotate_unreachable(c) ({ \ asm volatile(__stringify_label(c) ":\n\t" \ ".pushsection .discard.unreachable\n\t" \ @@ -129,6 +137,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #define __annotate_jump_table __section(".rodata..c_jump_table") #else /* !CONFIG_OBJTOOL */ +#define annotate_reachable() #define annotate_unreachable() #define __annotate_jump_table #endif /* CONFIG_OBJTOOL */ From bacfabdbde27c61db655ecd826fd7fbb13071c56 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Thu, 23 Jun 2022 09:48:59 +0800 Subject: [PATCH 063/175] arm64: bug: Add reachable annotation to warning macros WARN* and BUG* both use brk #0x800 opcodes and the distinction is provided by the contents of the bug table. This table is not accessible to objtool, so add an annotation to the WARN* macros to let objtool know that the brk handler will return and resume execution of the instructions following the WARN's brk. Signed-off-by: Julien Thierry Signed-off-by: Chen Zhongjin --- arch/arm64/include/asm/bug.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/bug.h b/arch/arm64/include/asm/bug.h index 28be048db3f63..9917429971d48 100644 --- a/arch/arm64/include/asm/bug.h +++ b/arch/arm64/include/asm/bug.h @@ -19,7 +19,11 @@ unreachable(); \ } while (0) -#define __WARN_FLAGS(flags) __BUG_FLAGS(BUGFLAG_WARNING|(flags)) +#define __WARN_FLAGS(flags) \ +do { \ + __BUG_FLAGS(BUGFLAG_WARNING|(flags)); \ + annotate_reachable(); \ +} while (0) #define HAVE_ARCH_BUG From 822c8631fa098ef0b87a51cbebef81f98f12aef0 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Thu, 23 Jun 2022 09:49:00 +0800 Subject: [PATCH 064/175] arm64: kgdb: Add reachable annotation after kgdb brk In the general use case, the KGDB breakpoint handler should return normally to the instruction following the brk. Signed-off-by: Julien Thierry Signed-off-by: Chen Zhongjin --- arch/arm64/include/asm/kgdb.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/include/asm/kgdb.h b/arch/arm64/include/asm/kgdb.h index 21fc85e9d2bed..a8cb91d8d59b3 100644 --- a/arch/arm64/include/asm/kgdb.h +++ b/arch/arm64/include/asm/kgdb.h @@ -19,6 +19,7 @@ static inline void arch_kgdb_breakpoint(void) { asm ("brk %0" : : "I" (KGDB_COMPILED_DBG_BRK_IMM)); + annotate_reachable(); } extern void kgdb_handle_bus_error(void); From 3e00c5f059f52cbbeaf0b00f2b16fbe842da2589 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Thu, 23 Jun 2022 09:49:01 +0800 Subject: [PATCH 065/175] objtool: arm64: Add unwind_hint support Provide unwind hint defines for arm64 and objtool hint decoding.
Signed-off-by: Julien Thierry Signed-off-by: Chen Zhongjin --- arch/arm64/include/asm/unwind_hints.h | 27 +++++++++++++++++++++ tools/arch/arm64/include/asm/unwind_hints.h | 27 +++++++++++++++++++++ tools/objtool/arch/arm64/decode.c | 14 ++++++++++- 3 files changed, 67 insertions(+), 1 deletion(-) create mode 100644 arch/arm64/include/asm/unwind_hints.h create mode 100644 tools/arch/arm64/include/asm/unwind_hints.h diff --git a/arch/arm64/include/asm/unwind_hints.h b/arch/arm64/include/asm/unwind_hints.h new file mode 100644 index 0000000000000..8655058aa63c8 --- /dev/null +++ b/arch/arm64/include/asm/unwind_hints.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __ASM_UNWIND_HINTS_H +#define __ASM_UNWIND_HINTS_H + +#include + +#define UNWIND_HINT_REG_UNDEFINED 0xff +#define UNWIND_HINT_REG_SP 31 + +#ifdef __ASSEMBLY__ + +.macro UNWIND_HINT_EMPTY + UNWIND_HINT sp_reg=UNWIND_HINT_REG_UNDEFINED type=UNWIND_HINT_TYPE_CALL end=1 +.endm + +.macro UNWIND_HINT_FUNC sp_offset=0 + UNWIND_HINT sp_reg=UNWIND_HINT_REG_SP sp_offset=\sp_offset type=UNWIND_HINT_TYPE_CALL +.endm + +.macro UNWIND_HINT_REGS base=UNWIND_HINT_REG_SP offset=0 + UNWIND_HINT sp_reg=\base sp_offset=\offset type=UNWIND_HINT_TYPE_REGS +.endm + +#endif /* __ASSEMBLY__ */ + +#endif /* __ASM_UNWIND_HINTS_H */ diff --git a/tools/arch/arm64/include/asm/unwind_hints.h b/tools/arch/arm64/include/asm/unwind_hints.h new file mode 100644 index 0000000000000..8655058aa63c8 --- /dev/null +++ b/tools/arch/arm64/include/asm/unwind_hints.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __ASM_UNWIND_HINTS_H +#define __ASM_UNWIND_HINTS_H + +#include + +#define UNWIND_HINT_REG_UNDEFINED 0xff +#define UNWIND_HINT_REG_SP 31 + +#ifdef __ASSEMBLY__ + +.macro UNWIND_HINT_EMPTY + UNWIND_HINT sp_reg=UNWIND_HINT_REG_UNDEFINED type=UNWIND_HINT_TYPE_CALL end=1 +.endm + +.macro UNWIND_HINT_FUNC sp_offset=0 + UNWIND_HINT sp_reg=UNWIND_HINT_REG_SP sp_offset=\sp_offset type=UNWIND_HINT_TYPE_CALL +.endm + +.macro UNWIND_HINT_REGS base=UNWIND_HINT_REG_SP offset=0 + UNWIND_HINT sp_reg=\base sp_offset=\offset type=UNWIND_HINT_TYPE_REGS +.endm + +#endif /* __ASSEMBLY__ */ + +#endif /* __ASM_UNWIND_HINTS_H */ diff --git a/tools/objtool/arch/arm64/decode.c b/tools/objtool/arch/arm64/decode.c index 771d37d872c8a..c585bef3b75a1 100644 --- a/tools/objtool/arch/arm64/decode.c +++ b/tools/objtool/arch/arm64/decode.c @@ -5,6 +5,7 @@ #include #include +#include #include #include @@ -176,7 +177,18 @@ static int is_arm64(const struct elf *elf) int arch_decode_hint_reg(u8 sp_reg, int *base) { - return -1; + switch (sp_reg) { + case UNWIND_HINT_REG_UNDEFINED: + *base = CFI_UNDEFINED; + break; + case UNWIND_HINT_REG_SP: + *base = CFI_SP; + break; + default: + return -1; + } + + return 0; } From a26fa032fd88f52e597ad54c466837a8f9f43b6d Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Thu, 23 Jun 2022 09:49:02 +0800 Subject: [PATCH 066/175] arm64: Change symbol type annotations Code symbols not following the aarch64 procedure call convention should be annotated with SYM_CODE_* instead of SYM_FUNC_*. Mark relevant symbols as generic code symbols. Also replace SYM_INNER_LABEL for __swpan_entry_el0 because SYM_INNER_LABEL generates a zero-size label which can't be correctly loaded by objtool.
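The practical difference, sketched with a hypothetical symbol: SYM_FUNC_* emits an STT_FUNC symbol that objtool validates against the usual frame rules, while SYM_CODE_* marks code with no calling-convention contract, e.g.:

    SYM_CODE_START_LOCAL(stuck_cpu_loop)	// never returns, no frame
    1:	wfe
    	b	1b
    SYM_CODE_END(stuck_cpu_loop)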
Signed-off-by: Julien Thierry Signed-off-by: Chen Zhongjin --- arch/arm64/kernel/entry.S | 10 ++++-- arch/arm64/kernel/head.S | 68 +++++++++++++++++++------------------- arch/arm64/kernel/sleep.S | 4 +-- arch/arm64/kvm/hyp/entry.S | 4 +-- 4 files changed, 45 insertions(+), 41 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 62146d48dba73..2ee5826427401 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -464,11 +464,15 @@ SYM_CODE_START_LOCAL(__swpan_entry_el1) orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR b.eq 1f // TTBR0 access already disabled and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR -SYM_INNER_LABEL(__swpan_entry_el0, SYM_L_LOCAL) __uaccess_ttbr0_disable x21 1: ret SYM_CODE_END(__swpan_entry_el1) +SYM_CODE_START_LOCAL(__swpan_entry_el0) + __uaccess_ttbr0_disable x21 +1: ret +SYM_CODE_END(__swpan_entry_el0) + /* * Restore access to TTBR0_EL1. If returning to EL0, no need for SPSR * PAN bit checking. @@ -771,11 +775,11 @@ SYM_CODE_START_NOALIGN(tramp_vectors) generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_NONE SYM_CODE_END(tramp_vectors) -SYM_CODE_START(tramp_exit_native) +SYM_CODE_START_LOCAL(tramp_exit_native) tramp_exit SYM_CODE_END(tramp_exit_native) -SYM_CODE_START(tramp_exit_compat) +SYM_CODE_START_LOCAL(tramp_exit_compat) tramp_exit 32 SYM_CODE_END(tramp_exit_compat) .popsection // .entry.tramp.text diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index cdbbc95eb49d0..6790fd2f7d158 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -126,7 +126,7 @@ SYM_CODE_START_LOCAL(preserve_boot_args) b dcache_inval_poc // tail call SYM_CODE_END(preserve_boot_args) -SYM_FUNC_START_LOCAL(clear_page_tables) +SYM_CODE_START_LOCAL(clear_page_tables) /* * Clear the init page tables. */ @@ -135,7 +135,7 @@ SYM_FUNC_START_LOCAL(clear_page_tables) sub x2, x1, x0 mov x1, xzr b __pi_memset // tail call -SYM_FUNC_END(clear_page_tables) +SYM_CODE_END(clear_page_tables) /* * Macro to populate page table entries, these entries can be pointers to the next level @@ -259,7 +259,7 @@ SYM_FUNC_END(clear_page_tables) * x5: attributes to set on the updated region * x6: order of the last level mappings */ -SYM_FUNC_START_LOCAL(remap_region) +SYM_CODE_START_LOCAL(remap_region) sub x3, x3, #1 // make end inclusive // Get the index offset for the start of the last level table @@ -278,9 +278,9 @@ SYM_FUNC_START_LOCAL(remap_region) populate_entries x0, x4, x2, x3, x5, x6, x7 ret -SYM_FUNC_END(remap_region) +SYM_CODE_END(remap_region) -SYM_FUNC_START_LOCAL(create_idmap) +SYM_CODE_START_LOCAL(create_idmap) mov x28, lr /* * The ID map carries a 1:1 mapping of the physical address range @@ -366,9 +366,9 @@ SYM_FUNC_START_LOCAL(create_idmap) adrp x1, init_idmap_pg_end bl dcache_inval_poc ret x28 -SYM_FUNC_END(create_idmap) +SYM_CODE_END(create_idmap) -SYM_FUNC_START_LOCAL(create_kernel_mapping) +SYM_CODE_START_LOCAL(create_kernel_mapping) adrp x0, init_pg_dir mov_q x5, KIMAGE_VADDR // compile time __va(_text) #ifdef CONFIG_RELOCATABLE @@ -384,7 +384,7 @@ SYM_FUNC_START_LOCAL(create_kernel_mapping) dsb ishst // sync with page table walker ret -SYM_FUNC_END(create_kernel_mapping) +SYM_CODE_END(create_kernel_mapping) /* * Initialize CPU registers with task-specific and cpu-specific context. 
@@ -417,7 +417,7 @@ SYM_FUNC_END(create_kernel_mapping) * * x0 = __pa(KERNEL_START) */ -SYM_FUNC_START_LOCAL(__primary_switched) +SYM_CODE_START_LOCAL(__primary_switched) adr_l x4, init_task init_cpu_task x4, x5, x6 @@ -467,7 +467,7 @@ SYM_FUNC_START_LOCAL(__primary_switched) ldp x29, x30, [sp], #16 bl start_kernel ASM_BUG() -SYM_FUNC_END(__primary_switched) +SYM_CODE_END(__primary_switched) /* * end early head section, begin head code that is also used for @@ -487,7 +487,7 @@ SYM_FUNC_END(__primary_switched) * booted in EL1 or EL2 respectively, with the top 32 bits containing * potential context flags. These flags are *not* stored in __boot_cpu_mode. */ -SYM_FUNC_START(init_kernel_el) +SYM_CODE_START(init_kernel_el) mrs x0, CurrentEL cmp x0, #CurrentEL_EL2 b.eq init_el2 @@ -538,26 +538,26 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) mov w0, #BOOT_CPU_MODE_EL2 orr x0, x0, x2 eret -SYM_FUNC_END(init_kernel_el) +SYM_CODE_END(init_kernel_el) /* * Sets the __boot_cpu_mode flag depending on the CPU boot mode passed * in w0. See arch/arm64/include/asm/virt.h for more info. */ -SYM_FUNC_START_LOCAL(set_cpu_boot_mode_flag) +SYM_CODE_START_LOCAL(set_cpu_boot_mode_flag) adr_l x1, __boot_cpu_mode cmp w0, #BOOT_CPU_MODE_EL2 b.ne 1f add x1, x1, #4 1: str w0, [x1] // Save CPU boot mode ret -SYM_FUNC_END(set_cpu_boot_mode_flag) +SYM_CODE_END(set_cpu_boot_mode_flag) /* * This provides a "holding pen" for platforms to hold all secondary * cores are held until we're ready for them to initialise. */ -SYM_FUNC_START(secondary_holding_pen) +SYM_CODE_START(secondary_holding_pen) bl init_kernel_el // w0=cpu_boot_mode mrs x2, mpidr_el1 mov_q x1, MPIDR_HWID_BITMASK @@ -568,18 +568,18 @@ pen: ldr x4, [x3] b.eq secondary_startup wfe b pen -SYM_FUNC_END(secondary_holding_pen) +SYM_CODE_END(secondary_holding_pen) /* * Secondary entry point that jumps straight into the kernel. Only to * be used where CPUs are brought online dynamically by the kernel. */ -SYM_FUNC_START(secondary_entry) +SYM_CODE_START(secondary_entry) bl init_kernel_el // w0=cpu_boot_mode b secondary_startup -SYM_FUNC_END(secondary_entry) +SYM_CODE_END(secondary_entry) -SYM_FUNC_START_LOCAL(secondary_startup) +SYM_CODE_START_LOCAL(secondary_startup) /* * Common entry point for secondary CPUs. */ @@ -595,9 +595,9 @@ SYM_FUNC_START_LOCAL(secondary_startup) bl __enable_mmu ldr x8, =__secondary_switched br x8 -SYM_FUNC_END(secondary_startup) +SYM_CODE_END(secondary_startup) -SYM_FUNC_START_LOCAL(__secondary_switched) +SYM_CODE_START_LOCAL(__secondary_switched) mov x0, x20 bl set_cpu_boot_mode_flag str_l xzr, __early_cpu_boot_status, x3 @@ -617,13 +617,13 @@ SYM_FUNC_START_LOCAL(__secondary_switched) bl secondary_start_kernel ASM_BUG() -SYM_FUNC_END(__secondary_switched) +SYM_CODE_END(__secondary_switched) -SYM_FUNC_START_LOCAL(__secondary_too_slow) +SYM_CODE_START_LOCAL(__secondary_too_slow) wfe wfi b __secondary_too_slow -SYM_FUNC_END(__secondary_too_slow) +SYM_CODE_END(__secondary_too_slow) /* * The booting CPU updates the failed status @__early_cpu_boot_status, @@ -656,7 +656,7 @@ SYM_FUNC_END(__secondary_too_slow) * Checks if the selected granule size is supported by the CPU. 
* If it isn't, park the CPU */ -SYM_FUNC_START(__enable_mmu) +SYM_CODE_START(__enable_mmu) mrs x3, ID_AA64MMFR0_EL1 ubfx x3, x3, #ID_AA64MMFR0_EL1_TGRAN_SHIFT, 4 cmp x3, #ID_AA64MMFR0_EL1_TGRAN_SUPPORTED_MIN @@ -670,9 +670,9 @@ SYM_FUNC_START(__enable_mmu) set_sctlr_el1 x0 ret -SYM_FUNC_END(__enable_mmu) +SYM_CODE_END(__enable_mmu) -SYM_FUNC_START(__cpu_secondary_check52bitva) +SYM_CODE_START_LOCAL(__cpu_secondary_check52bitva) #if VA_BITS > 48 ldr_l x0, vabits_actual cmp x0, #52 @@ -690,9 +690,9 @@ SYM_FUNC_START(__cpu_secondary_check52bitva) #endif 2: ret -SYM_FUNC_END(__cpu_secondary_check52bitva) +SYM_CODE_END(__cpu_secondary_check52bitva) -SYM_FUNC_START_LOCAL(__no_granule_support) +SYM_CODE_START_LOCAL(__no_granule_support) /* Indicate that this CPU can't boot and is stuck in the kernel */ update_early_cpu_boot_status \ CPU_STUCK_IN_KERNEL | CPU_STUCK_REASON_NO_GRAN, x1, x2 @@ -700,10 +700,10 @@ SYM_FUNC_START_LOCAL(__no_granule_support) wfe wfi b 1b -SYM_FUNC_END(__no_granule_support) +SYM_CODE_END(__no_granule_support) #ifdef CONFIG_RELOCATABLE -SYM_FUNC_START_LOCAL(__relocate_kernel) +SYM_CODE_START_LOCAL(__relocate_kernel) /* * Iterate over each entry in the relocation table, and apply the * relocations in place. @@ -790,10 +790,10 @@ SYM_FUNC_START_LOCAL(__relocate_kernel) #endif ret -SYM_FUNC_END(__relocate_kernel) +SYM_CODE_END(__relocate_kernel) #endif -SYM_FUNC_START_LOCAL(__primary_switch) +SYM_CODE_START_LOCAL(__primary_switch) adrp x1, reserved_pg_dir adrp x2, init_idmap_pg_dir bl __enable_mmu @@ -822,4 +822,4 @@ SYM_FUNC_START_LOCAL(__primary_switch) ldr x8, =__primary_switched adrp x0, KERNEL_START // __pa(KERNEL_START) br x8 -SYM_FUNC_END(__primary_switch) +SYM_CODE_END(__primary_switch) diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index 97c9de57725df..82c10817125ef 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -115,7 +115,7 @@ SYM_CODE_END(cpu_resume) .ltorg .popsection -SYM_FUNC_START(_cpu_resume) +SYM_CODE_START(_cpu_resume) mrs x1, mpidr_el1 adr_l x8, mpidr_hash // x8 = struct mpidr_hash virt address @@ -151,4 +151,4 @@ SYM_FUNC_START(_cpu_resume) ldp x29, lr, [x29] mov x0, #0 ret -SYM_FUNC_END(_cpu_resume) +SYM_CODE_END(_cpu_resume) diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index 435346ea1504e..045d4481c8209 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -21,7 +21,7 @@ /* * u64 __guest_enter(struct kvm_vcpu *vcpu); */ -SYM_FUNC_START(__guest_enter) +SYM_CODE_START(__guest_enter) // x0: vcpu // x1-x17: clobbered by macros // x29: guest context @@ -212,4 +212,4 @@ abort_guest_exit_end: msr spsr_el2, x4 orr x0, x0, x5 1: ret -SYM_FUNC_END(__guest_enter) +SYM_CODE_END(__guest_enter) From 6bcd8bf5db9546ef314f52edee9bfd92c1a25b0a Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Thu, 23 Jun 2022 09:49:03 +0800 Subject: [PATCH 067/175] arm64: Annotate unwind_hint for symbols with empty stack Some assembly symbols contain code that might be executed with an unspecified stack state (e.g. invalid stack pointer, no stack frame, code after alt_cb, ...). Annotate those symbols with UNWIND_HINT_EMPTY to let objtool be aware of them.
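The exception vector stubs are the canonical case; after this patch every ventry expansion starts with the hint (sketch mirroring the assembler.h hunk below):

    .macro ventry label
    	.align 7
    	UNWIND_HINT_EMPTY	// SP may be invalid on exception entry
    	b	\label
    .endm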
Signed-off-by: Julien Thierry Signed-off-by: Chen Zhongjin --- arch/arm64/include/asm/assembler.h | 2 ++ arch/arm64/kernel/cpu-reset.S | 4 +++- arch/arm64/kernel/efi-entry.S | 2 ++ arch/arm64/kernel/entry.S | 7 +++++++ arch/arm64/kernel/head.S | 17 ++++++++++++++++- arch/arm64/kernel/hibernate-asm.S | 2 ++ arch/arm64/kernel/relocate_kernel.S | 2 ++ arch/arm64/kernel/sleep.S | 3 +++ arch/arm64/kvm/hyp/hyp-entry.S | 1 + arch/arm64/mm/trans_pgd-asm.S | 3 +++ 10 files changed, 41 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index e5957a53be398..edfe1f0c2a0e1 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -25,6 +25,7 @@ #include #include #include +#include /* * Provide a wxN alias for each wN register so what we can paste a xN @@ -147,6 +148,7 @@ lr .req x30 // link register */ .macro ventry label .align 7 + UNWIND_HINT_EMPTY b \label .endm diff --git a/arch/arm64/kernel/cpu-reset.S b/arch/arm64/kernel/cpu-reset.S index 6b752fe897451..2ab2d4255d8bf 100644 --- a/arch/arm64/kernel/cpu-reset.S +++ b/arch/arm64/kernel/cpu-reset.S @@ -11,6 +11,7 @@ #include #include #include +#include #include .text @@ -29,7 +30,8 @@ * branch to what would be the reset vector. It must be executed with the * flat identity mapping. */ -SYM_TYPED_FUNC_START(cpu_soft_restart) +SYM_CODE_START(cpu_soft_restart) + UNWIND_HINT_EMPTY mov_q x12, INIT_SCTLR_EL1_MMU_OFF pre_disable_mmu_workaround /* diff --git a/arch/arm64/kernel/efi-entry.S b/arch/arm64/kernel/efi-entry.S index 61a87fa1c3055..9a1a94c3c4dbf 100644 --- a/arch/arm64/kernel/efi-entry.S +++ b/arch/arm64/kernel/efi-entry.S @@ -9,10 +9,12 @@ #include #include +#include __INIT SYM_CODE_START(efi_enter_kernel) + UNWIND_HINT_EMPTY /* * efi_pe_entry() will have copied the kernel image if necessary and we * end up here with device tree address in x1 and the kernel entry diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 2ee5826427401..1f075c54d28ba 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -28,6 +28,7 @@ #include #include #include +#include .macro clear_gp_regs .irp n,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29 @@ -37,6 +38,7 @@ .macro kernel_ventry, el:req, ht:req, regsize:req, label:req .align 7 + UNWIND_HINT_EMPTY .Lventry_start\@: .if \el == 0 /* @@ -44,6 +46,7 @@ * skipped by the trampoline vectors, to trigger the cleanup. */ b .Lskip_tramp_vectors_cleanup\@ + UNWIND_HINT_EMPTY .if \regsize == 64 mrs x30, tpidrro_el0 msr tpidrro_el0, xzr @@ -417,6 +420,7 @@ alternative_else_nop_endif ldp x24, x25, [sp, #16 * 12] ldp x26, x27, [sp, #16 * 13] ldp x28, x29, [sp, #16 * 14] + UNWIND_HINT_EMPTY .if \el == 0 alternative_if ARM64_WORKAROUND_SPECULATIVE_UNPRIV_LOAD @@ -676,6 +680,7 @@ alternative_else_nop_endif .macro tramp_ventry, vector_start, regsize, kpti, bhb .align 7 + UNWIND_HINT_EMPTY 1: .if \regsize == 64 msr tpidrro_el0, x30 // Restored in kernel_ventry @@ -701,6 +706,7 @@ alternative_else_nop_endif * enter the full-fat kernel vectors. */ bl 2f + UNWIND_HINT_EMPTY b . 
2: tramp_map_kernel x30 @@ -731,6 +737,7 @@ alternative_else_nop_endif .endm .macro tramp_exit, regsize = 64 + UNWIND_HINT_EMPTY tramp_data_read_var x30, this_cpu_vector get_this_cpu_offset x29 ldr x30, [x30, x29] diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 6790fd2f7d158..a3012c37b1465 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -33,6 +33,7 @@ #include #include #include +#include #include #include "efi-header.S" @@ -58,6 +59,7 @@ * DO NOT MODIFY. Image header expected by Linux boot-loaders. */ efi_signature_nop // special NOP to identity as PE/COFF executable + UNWIND_HINT_EMPTY b primary_entry // branch to kernel start, magic .quad 0 // Image load offset from start of RAM, little-endian le64sym _kernel_size_le // Effective size of kernel image, little-endian @@ -113,6 +115,7 @@ SYM_CODE_END(primary_entry) * Preserve the arguments passed by the bootloader in x0 .. x3 */ SYM_CODE_START_LOCAL(preserve_boot_args) + UNWIND_HINT_EMPTY mov x21, x0 // x21=FDT adr_l x0, boot_args // record the contents of @@ -127,6 +130,7 @@ SYM_CODE_START_LOCAL(preserve_boot_args) SYM_CODE_END(preserve_boot_args) SYM_CODE_START_LOCAL(clear_page_tables) + UNWIND_HINT_EMPTY /* * Clear the init page tables. */ @@ -260,7 +264,7 @@ SYM_CODE_END(clear_page_tables) * x6: order of the last level mappings */ SYM_CODE_START_LOCAL(remap_region) - sub x3, x3, #1 // make end inclusive + UNWIND_HINT_EMPTY // Get the index offset for the start of the last level table lsr x1, x1, x6 @@ -281,6 +285,7 @@ SYM_CODE_START_LOCAL(remap_region) SYM_CODE_END(remap_region) SYM_CODE_START_LOCAL(create_idmap) + UNWIND_HINT_EMPTY mov x28, lr /* * The ID map carries a 1:1 mapping of the physical address range @@ -369,6 +374,7 @@ SYM_CODE_START_LOCAL(create_idmap) SYM_CODE_END(create_idmap) SYM_CODE_START_LOCAL(create_kernel_mapping) + UNWIND_HINT_EMPTY adrp x0, init_pg_dir mov_q x5, KIMAGE_VADDR // compile time __va(_text) #ifdef CONFIG_RELOCATABLE @@ -488,6 +494,7 @@ SYM_CODE_END(__primary_switched) * potential context flags. These flags are *not* stored in __boot_cpu_mode. */ SYM_CODE_START(init_kernel_el) + UNWIND_HINT_EMPTY mrs x0, CurrentEL cmp x0, #CurrentEL_EL2 b.eq init_el2 @@ -545,6 +552,7 @@ SYM_CODE_END(init_kernel_el) * in w0. See arch/arm64/include/asm/virt.h for more info. */ SYM_CODE_START_LOCAL(set_cpu_boot_mode_flag) + UNWIND_HINT_EMPTY adr_l x1, __boot_cpu_mode cmp w0, #BOOT_CPU_MODE_EL2 b.ne 1f @@ -558,6 +566,7 @@ SYM_CODE_END(set_cpu_boot_mode_flag) * cores are held until we're ready for them to initialise. */ SYM_CODE_START(secondary_holding_pen) + UNWIND_HINT_EMPTY bl init_kernel_el // w0=cpu_boot_mode mrs x2, mpidr_el1 mov_q x1, MPIDR_HWID_BITMASK @@ -575,6 +584,7 @@ SYM_CODE_END(secondary_holding_pen) * be used where CPUs are brought online dynamically by the kernel. 
*/ SYM_CODE_START(secondary_entry) + UNWIND_HINT_EMPTY bl init_kernel_el // w0=cpu_boot_mode b secondary_startup SYM_CODE_END(secondary_entry) @@ -598,6 +608,7 @@ SYM_CODE_START_LOCAL(secondary_startup) SYM_CODE_END(secondary_startup) SYM_CODE_START_LOCAL(__secondary_switched) + UNWIND_HINT_EMPTY mov x0, x20 bl set_cpu_boot_mode_flag str_l xzr, __early_cpu_boot_status, x3 @@ -620,6 +631,7 @@ SYM_CODE_START_LOCAL(__secondary_switched) SYM_CODE_END(__secondary_switched) SYM_CODE_START_LOCAL(__secondary_too_slow) + UNWIND_HINT_EMPTY wfe wfi b __secondary_too_slow @@ -657,6 +669,7 @@ SYM_CODE_END(__secondary_too_slow) * If it isn't, park the CPU */ SYM_CODE_START(__enable_mmu) + UNWIND_HINT_EMPTY mrs x3, ID_AA64MMFR0_EL1 ubfx x3, x3, #ID_AA64MMFR0_EL1_TGRAN_SHIFT, 4 cmp x3, #ID_AA64MMFR0_EL1_TGRAN_SUPPORTED_MIN @@ -673,6 +686,7 @@ SYM_CODE_START(__enable_mmu) SYM_CODE_END(__enable_mmu) SYM_CODE_START_LOCAL(__cpu_secondary_check52bitva) + UNWIND_HINT_EMPTY #if VA_BITS > 48 ldr_l x0, vabits_actual cmp x0, #52 @@ -704,6 +718,7 @@ SYM_CODE_END(__no_granule_support) #ifdef CONFIG_RELOCATABLE SYM_CODE_START_LOCAL(__relocate_kernel) + UNWIND_HINT_EMPTY /* * Iterate over each entry in the relocation table, and apply the * relocations in place. diff --git a/arch/arm64/kernel/hibernate-asm.S b/arch/arm64/kernel/hibernate-asm.S index 0e1d9c3c6a933..c0bec20bf0e09 100644 --- a/arch/arm64/kernel/hibernate-asm.S +++ b/arch/arm64/kernel/hibernate-asm.S @@ -13,6 +13,7 @@ #include #include #include +#include #include /* @@ -46,6 +47,7 @@ */ .pushsection ".hibernate_exit.text", "ax" SYM_CODE_START(swsusp_arch_suspend_exit) + UNWIND_HINT_EMPTY /* * We execute from ttbr0, change ttbr1 to our copied linear map tables * with a break-before-make via the zero page diff --git a/arch/arm64/kernel/relocate_kernel.S b/arch/arm64/kernel/relocate_kernel.S index 413f899e4ac63..3ec70802a7f8e 100644 --- a/arch/arm64/kernel/relocate_kernel.S +++ b/arch/arm64/kernel/relocate_kernel.S @@ -16,6 +16,7 @@ #include #include #include +#include .macro turn_off_mmu tmp1, tmp2 mov_q \tmp1, INIT_SCTLR_EL1_MMU_OFF @@ -37,6 +38,7 @@ * safe memory that has been set up to be preserved during the copy operation. */ SYM_CODE_START(arm64_relocate_new_kernel) + UNWIND_HINT_EMPTY /* * The kimage structure isn't allocated specially and may be clobbered * during relocation. 
We must load any values we need from it prior to diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index 82c10817125ef..07b5d019a991f 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -4,6 +4,7 @@ #include #include #include +#include .text /* @@ -99,6 +100,7 @@ SYM_FUNC_END(__cpu_suspend_enter) .pushsection ".idmap.text", "awx" SYM_CODE_START(cpu_resume) + UNWIND_HINT_EMPTY bl init_kernel_el bl finalise_el2 #if VA_BITS > 48 @@ -116,6 +118,7 @@ SYM_CODE_END(cpu_resume) .popsection SYM_CODE_START(_cpu_resume) + UNWIND_HINT_EMPTY mrs x1, mpidr_el1 adr_l x8, mpidr_hash // x8 = struct mpidr_hash virt address diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 8f3f93fa119ed..535ef052aa039 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -14,6 +14,7 @@ #include #include #include +#include .macro save_caller_saved_regs_vect /* x0 and x1 were saved in the vector entry */ diff --git a/arch/arm64/mm/trans_pgd-asm.S b/arch/arm64/mm/trans_pgd-asm.S index 021c31573bcb6..148435248860d 100644 --- a/arch/arm64/mm/trans_pgd-asm.S +++ b/arch/arm64/mm/trans_pgd-asm.S @@ -8,10 +8,12 @@ #include #include #include +#include .macro invalid_vector label SYM_CODE_START_LOCAL(\label) .align 7 + UNWIND_HINT_EMPTY b \label SYM_CODE_END(\label) .endm @@ -19,6 +21,7 @@ SYM_CODE_END(\label) .macro el1_sync_vector SYM_CODE_START_LOCAL(el1_sync) .align 7 + UNWIND_HINT_EMPTY cmp x0, #HVC_SET_VECTORS /* Called from hibernate */ b.ne 1f msr vbar_el2, x1
From afe5ce2e51cc2a19cac727b83a21bd4561773ffb Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Thu, 23 Jun 2022 09:49:04 +0800 Subject: [PATCH 068/175] arm64: entry: Annotate unwind_hint for entry
When taking an exception/interrupt, add UNWIND_HINT_REGS to indicate from which point the pt_regs frame is on the stack. Both when returning to userland and when creating a new task, sp points to a pt_regs frame, so add UNWIND_HINT_REGS after those points as well.
Signed-off-by: Julien Thierry Signed-off-by: Chen Zhongjin --- arch/arm64/kernel/entry.S | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 1f075c54d28ba..ce90b4a8f064b 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -120,6 +120,7 @@ alternative_cb ARM64_ALWAYS_SYSTEM, spectre_v4_patch_fw_mitigation_enable b .L__asm_ssbd_skip\@ // Patched to NOP alternative_cb_end + UNWIND_HINT_REGS ldr_this_cpu \tmp2, arm64_ssbd_callback_required, \tmp1 cbz \tmp2, .L__asm_ssbd_skip\@ ldr \tmp2, [tsk, #TSK_TI_FLAGS] @@ -218,6 +219,7 @@ alternative_cb_end stp x24, x25, [sp, #16 * 12] stp x26, x27, [sp, #16 * 13] stp x28, x29, [sp, #16 * 14] + UNWIND_HINT_REGS .if \el == 0 clear_gp_regs @@ -605,6 +607,7 @@ SYM_CODE_START_LOCAL(ret_to_kernel) SYM_CODE_END(ret_to_kernel) SYM_CODE_START_LOCAL(ret_to_user) + UNWIND_HINT_REGS ldr x19, [tsk, #TSK_TI_FLAGS] // re-check for single-step enable_step_tsk x19, x2 #ifdef CONFIG_GCC_PLUGIN_STACKLEAK @@ -869,6 +872,7 @@ NOKPROBE(cpu_switch_to) * This is how we return from a fork. */ SYM_CODE_START(ret_from_fork) + UNWIND_HINT_REGS bl schedule_tail cbz x19, 1f // not a kernel thread mov x0, x20
From 141c857dce42572d096bcbae3b0dc71c01b77736 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Thu, 23 Jun 2022 09:49:05 +0800 Subject: [PATCH 069/175] arm64: kvm: Annotate unwind_hint for hyp entry
The symbols __guest_enter and kvm_hyp_vector save x0 and x1 on the stack.
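An aside for readers following the UNWIND_HINT_* annotations used throughout these patches: each hint compiles into a small metadata record that objtool consumes at build time. A minimal sketch of that record, modeled on the x86 definition in include/linux/objtool.h (the exact field set used by this arm64 port is an assumption):

	#include <linux/types.h>

	/*
	 * Sketch of the metadata behind an UNWIND_HINT_* annotation,
	 * modeled on the x86 struct unwind_hint; the arm64 layout in
	 * this series is assumed to be similar.
	 */
	struct unwind_hint {
		u32 ip;		/* instruction address the hint applies to */
		s16 sp_offset;	/* previous SP relative to sp_reg */
		u8  sp_reg;	/* base register holding the stack pointer */
		u8  type;	/* e.g. empty, standard frame, pt_regs on stack */
	};

Roughly, UNWIND_HINT_EMPTY marks points with no meaningful caller frame (vectors, early boot), UNWIND_HINT_FUNC marks a standard function-entry stack state (the sp_offset=16 variant below accounts for x0/x1 having been pushed), and UNWIND_HINT_REGS marks a pt_regs frame on the stack.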
The symbols __guest_exit and __guest_exit_panic are reached when resuming EL2 execution, and the previous stack pointer gets restored. Add adequate unwind hints.
Signed-off-by: Julien Thierry Signed-off-by: Chen Zhongjin --- arch/arm64/kvm/hyp/entry.S | 5 +++++ arch/arm64/kvm/hyp/hyp-entry.S | 3 +++ 2 files changed, 8 insertions(+) diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index 045d4481c8209..81e30d56bf28b 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -15,6 +15,7 @@ #include #include #include +#include .text @@ -22,6 +23,7 @@ * u64 __guest_enter(struct kvm_vcpu *vcpu); */ SYM_CODE_START(__guest_enter) + UNWIND_HINT_FUNC // x0: vcpu // x1-x17: clobbered by macros // x29: guest context @@ -88,6 +90,7 @@ SYM_INNER_LABEL(__guest_exit_panic, SYM_L_GLOBAL) // vcpu x0-x1 on the stack // If the hyp context is loaded, go straight to hyp_panic + UNWIND_HINT_FUNC get_loaded_vcpu x0, x1 cbnz x0, 1f b hyp_panic @@ -110,6 +113,7 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL) // x1: vcpu // x2-x29,lr: vcpu regs // vcpu x0-x1 on the stack + UNWIND_HINT_FUNC sp_offset=16 add x1, x1, #VCPU_CONTEXT @@ -199,6 +203,7 @@ abort_guest_exit_end: msr daifset, #4 // Mask aborts ret + UNWIND_HINT_FUNC _kvm_extable abort_guest_exit_start, 9997f _kvm_extable abort_guest_exit_end, 9997f 9997: diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 535ef052aa039..d80fe00450368 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -151,6 +151,7 @@ SYM_CODE_END(\label) .macro valid_vect target .align 7 + UNWIND_HINT_FUNC 661: esb stp x0, x1, [sp, #-16]! @@ -162,6 +163,7 @@ check_preamble_length 661b, 662b .macro invalid_vect target .align 7 + UNWIND_HINT_FUNC 661: nop stp x0, x1, [sp, #-16]! @@ -209,6 +211,7 @@ SYM_CODE_END(__kvm_hyp_vector) .macro hyp_ventry indirect, spectrev2 .align 7 1: esb + UNWIND_HINT_FUNC .if \spectrev2 != 0 spectrev2_smccc_wa1_smc .else
From e4125410c678264204e80761f4d610c098e3c9cc Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Thu, 23 Jun 2022 09:49:06 +0800 Subject: [PATCH 070/175] arm64: efi-header: Mark efi header as data
This file only contains a set of constants forming the EFI header. Make the constants part of a data symbol.
Signed-off-by: Julien Thierry Signed-off-by: Chen Zhongjin --- arch/arm64/kernel/efi-header.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kernel/efi-header.S b/arch/arm64/kernel/efi-header.S index d731b4655df8e..241b148bbf333 100644 --- a/arch/arm64/kernel/efi-header.S +++ b/arch/arm64/kernel/efi-header.S @@ -28,6 +28,7 @@ .macro __EFI_PE_HEADER #ifdef CONFIG_EFI .set .Lpe_header_offset, . - .L_head +SYM_DATA_START_LOCAL(arm64_efi_header) .long PE_MAGIC .short IMAGE_FILE_MACHINE_ARM64 // Machine .short .Lsection_count // NumberOfSections @@ -160,6 +161,7 @@ .balign SEGMENT_ALIGN .Lefi_header_end: +SYM_DATA_END_LABEL(arm64_efi_header, SYM_L_LOCAL, efi_header_end) #else .set .Lpe_header_offset, 0x0 #endif
From db665c00b3f8545e16dfbc78891b1edd634c38ff Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Thu, 23 Jun 2022 09:49:07 +0800 Subject: [PATCH 071/175] arm64: head: Mark constants as data
Add data annotations to the constants that form part of the image header.
Signed-off-by: Julien Thierry Signed-off-by: Chen Zhongjin --- arch/arm64/kernel/head.S | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index a3012c37b1465..2cffaad364fea 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -58,10 +58,11 @@ /* * DO NOT MODIFY. Image header expected by Linux boot-loaders. */ - efi_signature_nop // special NOP to identity as PE/COFF executable +SYM_DATA_LOCAL(efi_nop, efi_signature_nop) // special NOP to identity as PE/COFF executable UNWIND_HINT_EMPTY b primary_entry // branch to kernel start, magic - .quad 0 // Image load offset from start of RAM, little-endian +SYM_DATA_LOCAL(_zero_reserved, .quad 0) // Image load offset from start of RAM, little-endian +SYM_DATA_START_LOCAL(_arm64_common_header) le64sym _kernel_size_le // Effective size of kernel image, little-endian le64sym _kernel_flags_le // Informative flags, little-endian .quad 0 // reserved @@ -69,6 +70,7 @@ .quad 0 // reserved .ascii ARM64_IMAGE_MAGIC // Magic number .long .Lpe_header_offset // Offset to the PE header. +SYM_DATA_END(_arm64_common_header) __EFI_PE_HEADER
From 425083bf0d1cd455aadf55cd1b0b8fb828f95ab2 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Thu, 23 Jun 2022 09:49:09 +0800 Subject: [PATCH 072/175] arm64: crypto: Mark constants as data
Use SYM_DATA_* macros to annotate data bytes in the middle of .text sections. For local symbols, the ".L" prefix needs to be dropped, as the assembler excludes such symbols from the .o symbol table, making objtool unable to see them.
Signed-off-by: Julien Thierry Signed-off-by: Chen Zhongjin --- arch/arm64/crypto/aes-neonbs-core.S | 14 +++++++------- arch/arm64/crypto/poly1305-armv8.pl | 4 ++++ arch/arm64/crypto/sha512-armv8.pl | 24 ++++++++++++++---------- 3 files changed, 25 insertions(+), 17 deletions(-) diff --git a/arch/arm64/crypto/aes-neonbs-core.S b/arch/arm64/crypto/aes-neonbs-core.S index b2062eeee59e2..1b41899aa9f14 100644 --- a/arch/arm64/crypto/aes-neonbs-core.S +++ b/arch/arm64/crypto/aes-neonbs-core.S @@ -368,15 +368,15 @@ .align 6 -M0: .octa 0x0004080c0105090d02060a0e03070b0f +SYM_DATA_LOCAL(M0, .octa 0x0004080c0105090d02060a0e03070b0f) -M0SR: .octa 0x0004080c05090d010a0e02060f03070b -SR: .octa 0x0f0e0d0c0a09080b0504070600030201 -SRM0: .octa 0x01060b0c0207080d0304090e00050a0f +SYM_DATA_LOCAL(M0SR, .octa 0x0004080c05090d010a0e02060f03070b) +SYM_DATA_LOCAL(SR, .octa 0x0f0e0d0c0a09080b0504070600030201) +SYM_DATA_LOCAL(SRM0, .octa 0x01060b0c0207080d0304090e00050a0f) -M0ISR: .octa 0x0004080c0d0105090a0e0206070b0f03 -ISR: .octa 0x0f0e0d0c080b0a090504070602010003 -ISRM0: .octa 0x0306090c00070a0d01040b0e0205080f +SYM_DATA_LOCAL(M0ISR, .octa 0x0004080c0d0105090a0e0206070b0f03) +SYM_DATA_LOCAL(ISR, .octa 0x0f0e0d0c080b0a090504070602010003) +SYM_DATA_LOCAL(ISRM0, .octa 0x0306090c00070a0d01040b0e0205080f) /* * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds) */ diff --git a/arch/arm64/crypto/poly1305-armv8.pl b/arch/arm64/crypto/poly1305-armv8.pl index cbc980fb02e33..f460f33c127a0 100644 --- a/arch/arm64/crypto/poly1305-armv8.pl +++ b/arch/arm64/crypto/poly1305-armv8.pl @@ -47,6 +47,8 @@ my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14)); $code.=<<___; +#include + #ifndef __KERNEL__ # include "arm_arch.h" .extern OPENSSL_armcap_P @@ -888,8 +890,10 @@ .align 5 .Lzeros: .long 0,0,0,0,0,0,0,0 +SYM_DATA_START_LOCAL(POLY1305_str) .asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm" .align 2 +SYM_DATA_END(POLY1305_str) #if
!defined(__KERNEL__) && !defined(_WIN64) .comm OPENSSL_armcap_P,4,4 .hidden OPENSSL_armcap_P diff --git a/arch/arm64/crypto/sha512-armv8.pl b/arch/arm64/crypto/sha512-armv8.pl index 35ec9ae99fe16..1882c41100265 100644 --- a/arch/arm64/crypto/sha512-armv8.pl +++ b/arch/arm64/crypto/sha512-armv8.pl @@ -193,6 +193,8 @@ sub BODY_00_xx { } $code.=<<___; +#include + #ifndef __KERNEL__ # include "arm_arch.h" #endif @@ -208,11 +210,11 @@ sub BODY_00_xx { $code.=<<___ if ($SZ==4); #ifndef __KERNEL__ # ifdef __ILP32__ - ldrsw x16,.LOPENSSL_armcap_P + ldrsw x16,OPENSSL_armcap_P_rel # else - ldr x16,.LOPENSSL_armcap_P + ldr x16,OPENSSL_armcap_P_rel # endif - adr x17,.LOPENSSL_armcap_P + adr x17,OPENSSL_armcap_P_rel add x16,x16,x17 ldr w16,[x16] tst w16,#ARMV8_SHA256 @@ -237,7 +239,7 @@ sub BODY_00_xx { ldp $E,$F,[$ctx,#4*$SZ] add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input ldp $G,$H,[$ctx,#6*$SZ] - adr $Ktbl,.LK$BITS + adr $Ktbl,K$BITS stp $ctx,$num,[x29,#96] .Loop: @@ -287,8 +289,7 @@ sub BODY_00_xx { .size $func,.-$func .align 6 -.type .LK$BITS,%object -.LK$BITS: +SYM_DATA_START_LOCAL(K$BITS) ___ $code.=<<___ if ($SZ==8); .quad 0x428a2f98d728ae22,0x7137449123ef65cd @@ -353,18 +354,21 @@ sub BODY_00_xx { .long 0 //terminator ___ $code.=<<___; -.size .LK$BITS,.-.LK$BITS +SYM_DATA_END(K$BITS) #ifndef __KERNEL__ .align 3 -.LOPENSSL_armcap_P: +SYM_DATA_START_LOCAL(OPENSSL_armcap_P_rel) # ifdef __ILP32__ .long OPENSSL_armcap_P-. # else .quad OPENSSL_armcap_P-. # endif +SYM_DATA_END(OPENSSL_armcap_P_rel) #endif +SYM_DATA_START_LOCAL(OPENSSL_str) .asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by " .align 2 +SYM_DATA_END(OPENSSL_str) ___ if ($SZ==4) { @@ -385,7 +389,7 @@ sub BODY_00_xx { add x29,sp,#0 ld1.32 {$ABCD,$EFGH},[$ctx] - adr $Ktbl,.LK256 + adr $Ktbl,K256 .Loop_hw: ld1 {@MSG[0]-@MSG[3]},[$inp],#64 @@ -648,7 +652,7 @@ () mov x29, sp sub sp,sp,#16*4 - adr $Ktbl,.LK256 + adr $Ktbl,K256 add $num,$inp,$num,lsl#6 // len to point at the end of inp ld1.8 {@X[0]},[$inp], #16 From 8e67ddbdf19d48560646924f381571974027204e Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Thu, 23 Jun 2022 09:49:10 +0800 Subject: [PATCH 073/175] arm64: crypto: Remove unnecessary stackframe The way sha256_block_neon restore the stackframe confuses objtool. But it turns out this function is a leaf function and does not use FP nor LR as scratch register. Do not create a stackframe in this function as it is not necessary. Signed-off-by: Julien Thierry Signed-off-by: Chen Zhongjin --- arch/arm64/crypto/sha512-armv8.pl | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/arm64/crypto/sha512-armv8.pl b/arch/arm64/crypto/sha512-armv8.pl index 1882c41100265..6e2a96e05c5a5 100644 --- a/arch/arm64/crypto/sha512-armv8.pl +++ b/arch/arm64/crypto/sha512-armv8.pl @@ -648,8 +648,6 @@ () .align 4 sha256_block_neon: .Lneon_entry: - stp x29, x30, [sp, #-16]! - mov x29, sp sub sp,sp,#16*4 adr $Ktbl,K256 @@ -736,8 +734,7 @@ () mov $Xfer,sp b.ne .L_00_48 - ldr x29,[x29] - add sp,sp,#16*4+16 + add sp,sp,#16*4 ret .size sha256_block_neon,.-sha256_block_neon ___ From 3be9018a1cd708791323e635cfe309a68b56fdad Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Thu, 23 Jun 2022 09:49:11 +0800 Subject: [PATCH 074/175] arm64: Set intra-function call annotations Stack validation requires BL instructions to an address, within the symbol containing the BL should be annotated as intra-function calls. 
Make __pmull_p8_core set up a standard stack frame, because an intra-function call targets the middle of it while the caller has already set up a frame. When analyzing the instructions, there would otherwise be a CFI state mismatch between the normal call and the intra-function call.
Signed-off-by: Julien Thierry Signed-off-by: Chen Zhongjin --- arch/arm64/crypto/crct10dif-ce-core.S | 5 +++++ arch/arm64/kernel/entry.S | 2 ++ 2 files changed, 7 insertions(+) diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S index dce6dcebfca18..b3b8e56cb87d3 100644 --- a/arch/arm64/crypto/crct10dif-ce-core.S +++ b/arch/arm64/crypto/crct10dif-ce-core.S @@ -63,6 +63,7 @@ // #include +#include #include .text @@ -132,6 +133,8 @@ .endm SYM_FUNC_START_LOCAL(__pmull_p8_core) + stp x29, x30, [sp, #-16]! + mov x29, sp .L__pmull_p8_core: ext t4.8b, ad.8b, ad.8b, #1 // A1 ext t5.8b, ad.8b, ad.8b, #2 // A2 @@ -193,6 +196,7 @@ SYM_FUNC_START_LOCAL(__pmull_p8_core) eor t4.16b, t4.16b, t5.16b eor t6.16b, t6.16b, t3.16b + ldp x29, x30, [sp], #16 ret SYM_FUNC_END(__pmull_p8_core) @@ -207,6 +211,7 @@ SYM_FUNC_END(__pmull_p8_core) pmull2 \rq\().8h, \ad\().16b, \bd\().16b // D = A*B .endif + ANNOTATE_INTRA_FUNCTION_CALL bl .L__pmull_p8_core\i eor \rq\().16b, \rq\().16b, t4.16b diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index ce90b4a8f064b..4d2ffeffd06ce 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -708,6 +709,7 @@ alternative_else_nop_endif * entry onto the return stack and using a RET instruction to * enter the full-fat kernel vectors. */ + ANNOTATE_INTRA_FUNCTION_CALL bl 2f UNWIND_HINT_EMPTY b .
From 3a81d6abbc6e57f84c9d9cefafdd7ba1117c0c39 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Thu, 23 Jun 2022 09:49:12 +0800 Subject: [PATCH 075/175] arm64: sleep: Properly set frame pointer before call
In __cpu_suspend_enter, the FP and LR are properly saved on the stack to form a stack frame, but the frame pointer is not set afterwards. Have the frame pointer point to the new frame.
Signed-off-by: Julien Thierry Signed-off-by: Chen Zhongjin --- arch/arm64/kernel/sleep.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index 07b5d019a991f..82e38d1e7a8ba 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -92,6 +92,7 @@ SYM_FUNC_START(__cpu_suspend_enter) str x0, [x1] add x0, x0, #SLEEP_STACK_DATA_SYSTEM_REGS stp x29, lr, [sp, #-16]! + mov x29, sp bl cpu_do_suspend ldp x29, lr, [sp], #16 mov x0, #1
From 040dfcbb8b49a9dc56346eb4fdd7696217449c90 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Thu, 23 Jun 2022 09:49:14 +0800 Subject: [PATCH 076/175] arm64: entry: Align stack size for alternative
In kernel_exit there is an alternative branch for KPTI which causes a stack size conflict between the two instruction paths. To fix that, make both branches move the sp, and then revert it in the tramp_exit branch; the sketch below illustrates the invariant objtool checks.
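Objtool's complaint here can be pictured as a per-instruction stack-state comparison. A conceptual sketch of the invariant (an illustration, not objtool's actual source):

	/*
	 * Conceptual sketch: every control-flow path reaching an
	 * instruction must agree on the tracked CFI state, notably
	 * how far the stack has moved since function entry.
	 */
	struct insn_state {
		int sp_offset;	/* net stack adjustment on this path */
	};

	static int validate_join(const struct insn_state *first_seen,
				 const struct insn_state *incoming)
	{
		if (first_seen->sp_offset != incoming->sp_offset)
			return -1;	/* reported as a stack state mismatch */
		return 0;
	}

With only one leg of the alternative restoring sp by PT_REGS_SIZE, the two legs would reach the following instructions with different sp_offset values; padding the other leg with the same add, and undoing it in tramp_exit, makes the states agree.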
Signed-off-by: Chen Zhongjin --- arch/arm64/kernel/entry.S | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 4d2ffeffd06ce..43463e4e1c2d9 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -434,7 +434,11 @@ alternative_if_not ARM64_UNMAP_KERNEL_AT_EL0 ldr lr, [sp, #S_LR] add sp, sp, #PT_REGS_SIZE // restore sp eret -alternative_else_nop_endif +alternative_else + nop + add sp, sp, #PT_REGS_SIZE // restore sp + nop +alternative_endif #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 bne 4f msr far_el1, x29 @@ -743,6 +747,7 @@ alternative_else_nop_endif .macro tramp_exit, regsize = 64 UNWIND_HINT_EMPTY + sub sp, sp, #PT_REGS_SIZE // revert sp tramp_data_read_var x30, this_cpu_vector get_this_cpu_offset x29 ldr x30, [x30, x29]
From 21debd45b7740b6f8770619a4b73b5df08dafbaa Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Thu, 23 Jun 2022 09:49:15 +0800 Subject: [PATCH 077/175] arm64: kernel: Skip validation of proton-pack.c
qcom_link_stack_sanitisation() repeatedly calls itself, but we cannot annotate the asm code as an intra-function call, so mark the function as non-standard instead.
Signed-off-by: Julien Thierry Signed-off-by: Chen Zhongjin --- arch/arm64/kernel/proton-pack.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index bfce41c2a53b3..dd07406d1391a 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -257,6 +258,7 @@ static noinstr void qcom_link_stack_sanitisation(void) "mov x30, %0 \n" : "=&r" (tmp)); } +STACK_FRAME_NON_STANDARD(qcom_link_stack_sanitisation); static bp_hardening_cb_t spectre_v2_get_sw_mitigation_cb(void) {
From b0ddc68432cd8d25219374e873234eb3215782d7 Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Fri, 15 Oct 2021 13:56:40 -0700 Subject: [PATCH 078/175] arm64: kvm: vgic-v3-sr: Bug when trying to read invalid APRs
There are four Active Priorities Group 0 and Group 1 interrupt controller registers, which are read through __vgic_v3_read_ap0rn() and __vgic_v3_read_ap1rn(). When these functions are passed an argument which isn't 0-3, they fall through to whatever happens to be after them in memory. To avoid this, BUG() in the case where an invalid argument is passed.
Signed-off-by: Suraj Jitindar Singh --- arch/arm64/kvm/hyp/vgic-v3-sr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 6cb638b184b18..493b630eacf2f 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -166,7 +166,7 @@ static u32 __vgic_v3_read_ap0rn(int n) val = read_gicreg(ICH_AP0R3_EL2); break; default: - unreachable(); + BUG(); } return val; @@ -190,7 +190,7 @@ static u32 __vgic_v3_read_ap1rn(int n) val = read_gicreg(ICH_AP1R3_EL2); break; default: - unreachable(); + BUG(); } return val;
From f9df96115348f8a7e6f97384adb8e4cfecc23d18 Mon Sep 17 00:00:00 2001 From: "Madhavan T. Venkataraman" Date: Wed, 26 May 2021 16:49:16 -0500 Subject: [PATCH 079/175] arm64: Introduce stack trace reliability checks in the unwinder
The unwinder should check for the presence of various features and conditions that can render the stack trace unreliable, and mark the stack trace as unreliable for the benefit of the caller. Introduce the first reliability check: if a return PC is not a valid kernel text address, consider the stack trace unreliable. It could be some generated code. Other reliability checks will be added in the future.
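For context, the reliable flag threaded through unwind() below sits behind the kernel's existing stack-walking interface. A minimal sketch of how a consumer drives arch_stack_walk() (the printing callback is illustrative, not part of this series):

	#include <linux/stacktrace.h>
	#include <linux/printk.h>
	#include <linux/sched.h>

	/* Illustrative consumer: print each return PC; returning false stops the walk. */
	static bool print_pc(void *cookie, unsigned long pc)
	{
		pr_info("%pS\n", (void *)pc);
		return true;
	}

	static void dump_current_stack(void)
	{
		/* NULL pt_regs: walk the current task from the current frame. */
		arch_stack_walk(print_pc, NULL, current, NULL);
	}

The patch below extends this walk with an optional reliability out-parameter rather than changing the consumer contract.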
Signed-off-by: Madhavan T. Venkataraman --- arch/arm64/kernel/stacktrace.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 83154303e682c..7f0d62338d2ea 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -76,7 +76,7 @@ static __always_inline void unwind_init_from_task(struct unwind_state *state, * records (e.g. a cycle), determined based on the location and fp value of A * and the location (but not the fp value) of B. */ -static int notrace unwind_next(struct unwind_state *state) +static int notrace unwind_next(struct unwind_state *state, int *reliable) { struct task_struct *tsk = state->task; unsigned long fp = state->fp; @@ -87,8 +87,11 @@ static int notrace unwind_next(struct unwind_state *state) return -ENOENT; err = unwind_next_frame_record(state); - if (err) + if (err) { + if (reliable) + *reliable = 0; return err; + } state->pc = ptrauth_strip_insn_pac(state->pc); @@ -114,11 +117,27 @@ static int notrace unwind_next(struct unwind_state *state) state->pc = kretprobe_find_ret_addr(tsk, (void *)state->fp, &state->kr_cur); #endif + /* + * Check the return PC for conditions that make unwinding unreliable. + * In each case, mark the stack trace as such. + */ + + /* + * Make sure that the return address is a proper kernel text address. + * A NULL or invalid return address could mean: + * + * - generated code such as eBPF and optprobe trampolines + * - Foreign code (e.g. EFI runtime services) + * - Procedure Linkage Table (PLT) entries and veneer functions + */ + if (reliable && !__kernel_text_address(state->pc)) + *reliable = 0; + return 0; } NOKPROBE_SYMBOL(unwind_next); -static void notrace unwind(struct unwind_state *state, +static void notrace unwind(struct unwind_state *state, int *reliable, stack_trace_consume_fn consume_entry, void *cookie) { while (1) { @@ -126,7 +145,7 @@ static void notrace unwind(struct unwind_state *state, if (!consume_entry(cookie, state->pc)) break; - ret = unwind_next(state); + ret = unwind_next(state, reliable); if (ret < 0) break; } @@ -228,5 +247,5 @@ noinline noinstr void arch_stack_walk(stack_trace_consume_fn consume_entry, unwind_init_from_task(&state, task); } - unwind(&state, consume_entry, cookie); + unwind(&state, NULL, consume_entry, cookie); }
From 8cc58044b8343d4925203c8922a8333998c3c4ef Mon Sep 17 00:00:00 2001 From: "Madhavan T. Venkataraman" Date: Wed, 26 May 2021 16:49:17 -0500 Subject: [PATCH 080/175] arm64: Create a list of SYM_CODE functions, check return PC against list
The unwinder should check if the return PC falls in any function that is considered unreliable from an unwinding perspective. If it does, mark the stack trace unreliable.
Function types ============== The compiler generates code for C functions and assigns the type STT_FUNC to them. Assembly functions are manually assigned a type: - STT_FUNC for functions defined with SYM_FUNC*() macros - STT_NONE for functions defined with SYM_CODE*() macros In the future, STT_FUNC functions will be analyzed by objtool and "fixed" as necessary. So, they are not "interesting" to the reliable unwinder in the kernel. That leaves SYM_CODE*() functions. These contain low-level code that is difficult or impossible for objtool to analyze. So, objtool ignores them, leaving them to the reliable unwinder (see the lookup-cost sketch below).
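An aside on the cost of the resulting check: unwinder_is_unreliable(), defined further down, scans sym_code_functions[] linearly, and its comment notes that a sorted array would allow a binary search. A hedged sketch of that variant using the kernel's bsearch() helper (assuming non-overlapping ranges sorted by ->start once at init, e.g. via sort(); the *_sorted name is ours):

	#include <linux/bsearch.h>
	#include <linux/sort.h>

	static int code_range_cmp(const void *key, const void *elt)
	{
		unsigned long pc = *(const unsigned long *)key;
		const struct code_range *range = elt;

		if (pc < range->start)
			return -1;
		if (pc >= range->end)
			return 1;
		return 0;
	}

	/* Assumes sym_code_functions[] was sorted by ->start at init time. */
	static bool unwinder_is_unreliable_sorted(unsigned long pc)
	{
		return bsearch(&pc, sym_code_functions, num_sym_code_functions,
			       sizeof(struct code_range), code_range_cmp) != NULL;
	}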
These functions must be considered unreliable from an unwinding perspective. Define a special section for unreliable functions ================================================= Define a SYM_CODE_END() macro for arm64 that adds the function address range to a new section called "sym_code_functions". Linker file =========== Include the "sym_code_functions" section under initdata in vmlinux.lds.S. Initialization ============== Define an early_initcall() to copy the function address ranges from the "sym_code_functions" section to an array by the same name. Unwinder check ============== Define a function called unwinder_is_unreliable() that compares a return PC with sym_code_functions[]. If there is a match, then mark the stack trace as unreliable. Call unwinder_is_unreliable() from unwind_frame(). Signed-off-by: Madhavan T. Venkataraman [Move final frame check; if a SYM_CODE function occurs in the very last frame in the stack trace then it is not considered unreliable because there is no more unwinding to do] Signed-off-by: Suraj Jitindar Singh --- arch/arm64/include/asm/linkage.h | 12 ++++ arch/arm64/include/asm/sections.h | 1 + arch/arm64/kernel/stacktrace.c | 116 ++++++++++++++++++++++++++++-- arch/arm64/kernel/vmlinux.lds.S | 7 ++ 4 files changed, 131 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/linkage.h b/arch/arm64/include/asm/linkage.h index 1436fa1cde24d..7d8906737cce9 100644 --- a/arch/arm64/include/asm/linkage.h +++ b/arch/arm64/include/asm/linkage.h @@ -43,4 +43,16 @@ SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \ bti c ; +/* + * Record the address range of each SYM_CODE function in a struct code_range + * in a special section. + */ +#define SYM_CODE_END(name) \ + SYM_END(name, SYM_T_NONE) ;\ + 99: ;\ + .pushsection "sym_code_functions", "aw" ;\ + .quad name ;\ + .quad 99b ;\ + .popsection + #endif diff --git a/arch/arm64/include/asm/sections.h b/arch/arm64/include/asm/sections.h index 40971ac1303f9..50cfd1083563b 100644 --- a/arch/arm64/include/asm/sections.h +++ b/arch/arm64/include/asm/sections.h @@ -22,6 +22,7 @@ extern char __irqentry_text_start[], __irqentry_text_end[]; extern char __mmuoff_data_start[], __mmuoff_data_end[]; extern char __entry_tramp_text_start[], __entry_tramp_text_end[]; extern char __relocate_new_kernel_start[], __relocate_new_kernel_end[]; +extern char __sym_code_functions_start[], __sym_code_functions_end[]; static inline size_t entry_tramp_text_size(void) { diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 7f0d62338d2ea..7a1ea9cc8c96b 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -12,12 +12,116 @@ #include #include #include +#include #include #include #include #include +struct code_range { + unsigned long start; + unsigned long end; +}; + +static struct code_range *sym_code_functions; +static int num_sym_code_functions; + +int __init init_sym_code_functions(void) +{ + size_t size; + + size = (unsigned long)__sym_code_functions_end - + (unsigned long)__sym_code_functions_start; + + sym_code_functions = kmalloc(size, GFP_KERNEL); + if (!sym_code_functions) + return -ENOMEM; + + memcpy(sym_code_functions, __sym_code_functions_start, size); + /* Update num_sym_code_functions after copying sym_code_functions. */ + smp_mb(); + num_sym_code_functions = size / sizeof(struct code_range); + + return 0; +} +early_initcall(init_sym_code_functions); + +/* + * Check the return PC against sym_code_functions[]. 
If there is a match, then + * consider the stack frame unreliable. These functions contain low-level + * code where the frame pointer and/or the return address register cannot be + * relied upon. This addresses the following situations: + * + * - Exception handlers and entry assembly + * - Trampoline assembly (e.g., ftrace, kprobes) + * - Hypervisor-related assembly + * - Hibernation-related assembly + * - CPU start-stop, suspend-resume assembly + * - Kernel relocation assembly + * + * Some special cases covered by sym_code_functions[] deserve a mention here: + * + * - All EL1 interrupt and exception stack traces will be considered + * unreliable. This is the correct behavior as interrupts and exceptions + * can happen on any instruction including ones in the frame pointer + * prolog and epilog. Unless stack metadata is available so the unwinder + * can unwind through these special cases, such stack traces will be + * considered unreliable. + * + * - A task can get preempted at the end of an interrupt. Stack traces + * of preempted tasks will show the interrupt frame in the stack trace + * and will be considered unreliable. + * + * - Breakpoints are exceptions. So, all stack traces in the breakpoint + * handler (including probes) will be considered unreliable. + * + * - All of the ftrace entry trampolines are considered unreliable. So, + * all stack traces taken from tracer functions will be considered + * unreliable. + * + * - The Function Graph Tracer return trampoline (return_to_handler) + * and the Kretprobe return trampoline (kretprobe_trampoline) are + * also considered unreliable. + * + * Some of the special cases above can be unwound using special logic + * in unwind_frame(). + * + * - return_to_handler() is handled by the unwinder by attempting to + * retrieve the original return address from the per-task return + * address stack. + * + * - kretprobe_trampoline() can be handled in a similar fashion by + * attempting to retrieve the original return address from the per-task + * kretprobe instance list. + * + * - I reckon optprobes can be handled in a similar fashion in the future? + * + * - Stack traces taken from the ftrace tracer functions can be handled + * as well. ftrace_call is an inner label defined in the ftrace entry + * trampoline. This is the location where the call to a tracer function + * is patched. So, if the return PC equals ftrace_call+4, it is + * reliable. At that point, proper stack frames have already been set + * up for the traced function and its caller. + */ +static bool unwinder_is_unreliable(unsigned long pc) +{ + const struct code_range *range; + int i; + + /* + * If sym_code_functions[] were sorted, a binary search could be + * done to make this more performant. + */ + for (i = 0; i < num_sym_code_functions; i++) { + range = &sym_code_functions[i]; + if (pc >= range->start && pc < range->end) + return true; + } + + return false; +} + /* * Start an unwind from a pt_regs.
* @@ -79,13 +183,8 @@ static __always_inline void unwind_init_from_task(struct unwind_state *state, static int notrace unwind_next(struct unwind_state *state, int *reliable) { struct task_struct *tsk = state->task; - unsigned long fp = state->fp; int err; - /* Final frame; nothing to unwind */ - if (fp == (unsigned long)task_pt_regs(tsk)->stackframe) - return -ENOENT; - err = unwind_next_frame_record(state); if (err) { if (reliable) @@ -133,6 +232,13 @@ static int notrace unwind_next(struct unwind_state *state, int *reliable) if (reliable && !__kernel_text_address(state->pc)) *reliable = 0; + /* Final frame; nothing to unwind */ + if (state->fp == (unsigned long)task_pt_regs(tsk)->stackframe) + return -ENOENT; + + if (reliable && unwinder_is_unreliable(state->pc)) + *reliable = 0; + return 0; } NOKPROBE_SYMBOL(unwind_next); diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 45131e354e27f..d62f24c288b9c 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -121,6 +121,12 @@ jiffies = jiffies_64; #define TRAMP_TEXT #endif +#define SYM_CODE_FUNCTIONS \ + . = ALIGN(16); \ + __sym_code_functions_start = .; \ + KEEP(*(sym_code_functions)) \ + __sym_code_functions_end = .; + /* * The size of the PE/COFF section that covers the kernel image, which * runs from _stext to _edata, must be a round multiple of the PE/COFF @@ -246,6 +252,7 @@ SECTIONS CON_INITCALL INIT_RAM_FS *(.init.altinstructions .init.bss) /* from the EFI stub */ + SYM_CODE_FUNCTIONS } .exit.data : { EXIT_DATA From 65798ad75a5b0fe4126154751dd2ed573c1196c8 Mon Sep 17 00:00:00 2001 From: "Madhavan T. Venkataraman" Date: Mon, 15 Mar 2021 11:58:00 -0500 Subject: [PATCH 081/175] arm64: Implement arch_stack_walk_reliable() unwind_frame() already sets the reliable flag in the stack frame during a stack walk to indicate whether the stack trace is reliable or not. Implement arch_stack_walk_reliable() like arch_stack_walk() but abort the stack walk as soon as the reliable flag is set to false for any reason. Signed-off-by: Madhavan T. Venkataraman --- arch/arm64/Kconfig | 1 + arch/arm64/kernel/stacktrace.c | 41 +++++++++++++++++++++++++++++++++- 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 979b97c328023..113e54c5c14ac 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -182,6 +182,7 @@ config ARM64 select HAVE_DYNAMIC_FTRACE select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY \ if DYNAMIC_FTRACE_WITH_REGS + select HAVE_RELIABLE_STACKTRACE select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_FAST_GUP select HAVE_FTRACE_MCOUNT_RECORD diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 7a1ea9cc8c96b..6f010945c3bbb 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -252,7 +252,7 @@ static void notrace unwind(struct unwind_state *state, int *reliable, if (!consume_entry(cookie, state->pc)) break; ret = unwind_next(state, reliable); - if (ret < 0) + if ((ret < 0) || (reliable && !(*reliable))) break; } } @@ -355,3 +355,42 @@ noinline noinstr void arch_stack_walk(stack_trace_consume_fn consume_entry, unwind(&state, NULL, consume_entry, cookie); } + +/* + * Walk the stack like arch_stack_walk() but stop the walk as soon as + * some unreliability is detected in the stack. 
+ */ +noinline noinstr int arch_stack_walk_reliable( + stack_trace_consume_fn consume_entry, + void *cookie, struct task_struct *task) +{ + struct stack_info stacks[] = { + stackinfo_get_task(task), + STACKINFO_CPU(irq), +#if defined(CONFIG_VMAP_STACK) + STACKINFO_CPU(overflow), +#endif +#if defined(CONFIG_VMAP_STACK) && defined(CONFIG_ARM_SDE_INTERFACE) + STACKINFO_SDEI(normal), + STACKINFO_SDEI(critical), +#endif +#ifdef CONFIG_EFI + STACKINFO_EFI, +#endif + }; + struct unwind_state state = { + .stacks = stacks, + .nr_stacks = ARRAY_SIZE(stacks), + }; + int reliable = 1; + + if (task == current) { + unwind_init_from_caller(&state); + } else { + unwind_init_from_task(&state, task); + } + + unwind(&state, &reliable, consume_entry, cookie); + + return reliable ? 0 : -EINVAL; +} From 68e9c203a1e0e23150532bfd21b900d9ff7b7528 Mon Sep 17 00:00:00 2001 From: "Madhavan T. Venkataraman" Date: Thu, 2 Feb 2023 01:40:34 -0600 Subject: [PATCH 082/175] arm64: Define HAVE_DYNAMIC_FTRACE_WITH_ARGS - Define HAVE_DYNAMIC_FTRACE_WITH_ARGS to support livepatch. - Supply the arch code for HAVE_DYNAMIC_FTRACE_WITH_ARGS. Signed-off-by: Madhavan T. Venkataraman --- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/ftrace.h | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 113e54c5c14ac..5c10b37a769b7 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -180,6 +180,7 @@ config ARM64 select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE + select HAVE_DYNAMIC_FTRACE_WITH_ARGS select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY \ if DYNAMIC_FTRACE_WITH_REGS select HAVE_RELIABLE_STACKTRACE diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h index 329dbbd4d50b6..0bc03ecfb257c 100644 --- a/arch/arm64/include/asm/ftrace.h +++ b/arch/arm64/include/asm/ftrace.h @@ -78,6 +78,26 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) return addr; } +#ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS + +struct ftrace_regs { + struct pt_regs regs; +}; + +static __always_inline struct pt_regs * +arch_ftrace_get_regs(struct ftrace_regs *fregs) +{ + return &fregs->regs; +} + +static __always_inline void ftrace_instruction_pointer_set( + struct ftrace_regs *fregs, unsigned long pc) +{ + fregs->regs.pc = pc; +} + +#endif + #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS struct dyn_ftrace; struct ftrace_ops; From edcc6389cbeef9db950cfb424cb6cdd2e7ae23a9 Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Mon, 3 May 2021 17:43:13 -0700 Subject: [PATCH 083/175] arm64: implement live patching It's my understanding that the two pieces of work required to enable live patching on arm are in flight upstream; - Reliable stack traces as implemented by Madhavan T. Venkataraman [1] - Objtool as implemented by Julien Thierry [2] This is the remaining part required to enable live patching on arm. Based on work by Torsten Duwe [3] Allocate a task flag used to represent the patch pending state for the task. Also implement generic functions klp_arch_set_pc() & klp_get_ftrace_location(). In klp_arch_set_pc() it is sufficient to set regs->pc as in ftrace_common_return() the return address is loaded from the stack. 
ldr x9, [sp, #S_PC] ret x9 In klp_get_ftrace_location() it is necessary to advance the address by AARCH64_INSN_SIZE (4) to point to the BL in the callsite as 2 nops were placed at the start of the function, one to be patched to save the LR and another to be patched to branch to the ftrace call, and klp_get_ftrace_location() is expected to return the address of the BL. It may also be necessary to advance the address by another AARCH64_INSN_SIZE if CONFIG_ARM64_BTI_KERNEL is enabled due to the instruction placed at the branch target to satisfy BTI, Signed-off-by: Suraj Jitindar Singh [1] https://lkml.org/lkml/2021/5/26/1212 [2] https://lkml.org/lkml/2021/3/3/1135 [3] https://lkml.org/lkml/2018/10/26/536 [ 6.1 fp: Drop livepatch.h as this functionality now added by previous patch ] Signed-off-by: Suraj Jitindar Singh --- arch/arm64/Kconfig | 2 ++ arch/arm64/include/asm/thread_info.h | 4 +++- arch/arm64/kernel/signal.c | 4 ++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 5c10b37a769b7..cc39b1ca95f8e 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -195,6 +195,7 @@ config ARM64 select HAVE_IOREMAP_PROT select HAVE_IRQ_TIME_ACCOUNTING select HAVE_KVM + select HAVE_LIVEPATCH select HAVE_NMI select HAVE_PERF_EVENTS select HAVE_PERF_REGS @@ -2326,3 +2327,4 @@ source "drivers/acpi/Kconfig" source "arch/arm64/kvm/Kconfig" +source "kernel/livepatch/Kconfig" diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 848739c15de82..42ba9d37e8d83 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -68,6 +68,7 @@ int arch_dup_task_struct(struct task_struct *dst, #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ #define TIF_MTE_ASYNC_FAULT 5 /* MTE Asynchronous Tag Check Fault */ #define TIF_NOTIFY_SIGNAL 6 /* signal notifications exist */ +#define TIF_PATCH_PENDING 7 /* pending live patching update */ #define TIF_SYSCALL_TRACE 8 /* syscall trace active */ #define TIF_SYSCALL_AUDIT 9 /* syscall auditing */ #define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */ @@ -100,11 +101,12 @@ int arch_dup_task_struct(struct task_struct *dst, #define _TIF_SVE (1 << TIF_SVE) #define _TIF_MTE_ASYNC_FAULT (1 << TIF_MTE_ASYNC_FAULT) #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) +#define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING) #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ _TIF_UPROBE | _TIF_MTE_ASYNC_FAULT | \ - _TIF_NOTIFY_SIGNAL) + _TIF_NOTIFY_SIGNAL | _TIF_PATCH_PENDING) #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 82f4572c8ddfc..58b7d599ff19b 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -1125,6 +1126,9 @@ void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) (void __user *)NULL, current); } + if (thread_flags & _TIF_PATCH_PENDING) + klp_update_patch_state(current); + if (thread_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) do_signal(regs); From 283e6bf49b297a6dc5a936f141bc81cb95e70c3f Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Tue, 2 Nov 2021 18:23:07 -0700 Subject: [PATCH 084/175] arm64: module: Use aarch64_insn_write when updating relocations later on apply_relocate_add() is called in module 
init to apply the relocations which must be computed at module load time. This is normally called from apply_relocations() before the module text is mapped read-only in complete_formation(). However for live patching modules it is also called after the module text has been marked read-only causing it to fault. Avoid this fault by calling aarch64_insn_write() to update the instruction if the module text has already been marked read-only. Preserve the current behaviour if called before this has been done. Signed-off-by: Suraj Jitindar Singh --- arch/arm64/kernel/module.c | 81 ++++++++++++++++++++++---------------- 1 file changed, 47 insertions(+), 34 deletions(-) diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 76b41e4ca9fa3..a2b755718d2ed 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -19,6 +19,7 @@ #include #include #include +#include void *module_alloc(unsigned long size) { @@ -156,7 +157,8 @@ enum aarch64_insn_movw_imm_type { }; static int reloc_insn_movw(enum aarch64_reloc_op op, __le32 *place, u64 val, - int lsb, enum aarch64_insn_movw_imm_type imm_type) + int lsb, enum aarch64_insn_movw_imm_type imm_type, + bool early) { u64 imm; s64 sval; @@ -188,7 +190,10 @@ static int reloc_insn_movw(enum aarch64_reloc_op op, __le32 *place, u64 val, /* Update the instruction with the new encoding. */ insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_16, insn, imm); - *place = cpu_to_le32(insn); + if (early) + *place = cpu_to_le32(insn); + else + aarch64_insn_write(place, insn); if (imm > U16_MAX) return -ERANGE; @@ -197,7 +202,8 @@ static int reloc_insn_movw(enum aarch64_reloc_op op, __le32 *place, u64 val, } static int reloc_insn_imm(enum aarch64_reloc_op op, __le32 *place, u64 val, - int lsb, int len, enum aarch64_insn_imm_type imm_type) + int lsb, int len, enum aarch64_insn_imm_type imm_type, + bool early) { u64 imm, imm_mask; s64 sval; @@ -213,7 +219,10 @@ static int reloc_insn_imm(enum aarch64_reloc_op op, __le32 *place, u64 val, /* Update the instruction's immediate field. */ insn = aarch64_insn_encode_immediate(imm_type, insn, imm); - *place = cpu_to_le32(insn); + if (early) + *place = cpu_to_le32(insn); + else + aarch64_insn_write(place, insn); /* * Extract the upper value bits (including the sign bit) and @@ -232,17 +241,17 @@ static int reloc_insn_imm(enum aarch64_reloc_op op, __le32 *place, u64 val, } static int reloc_insn_adrp(struct module *mod, Elf64_Shdr *sechdrs, - __le32 *place, u64 val) + __le32 *place, u64 val, bool early) { u32 insn; if (!is_forbidden_offset_for_adrp(place)) return reloc_insn_imm(RELOC_OP_PAGE, place, val, 12, 21, - AARCH64_INSN_IMM_ADR); + AARCH64_INSN_IMM_ADR, early); /* patch ADRP to ADR if it is in range */ if (!reloc_insn_imm(RELOC_OP_PREL, place, val & ~0xfff, 0, 21, - AARCH64_INSN_IMM_ADR)) { + AARCH64_INSN_IMM_ADR, early)) { insn = le32_to_cpu(*place); insn &= ~BIT(31); } else { @@ -254,7 +263,10 @@ static int reloc_insn_adrp(struct module *mod, Elf64_Shdr *sechdrs, AARCH64_INSN_BRANCH_NOLINK); } - *place = cpu_to_le32(insn); + if (early) + *place = cpu_to_le32(insn); + else + aarch64_insn_write(place, insn); return 0; } @@ -271,6 +283,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, void *loc; u64 val; Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr; + bool early = me->state == MODULE_STATE_UNFORMED; for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { /* loc corresponds to P in the AArch64 ELF document. 
*/ @@ -323,88 +336,88 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, fallthrough; case R_AARCH64_MOVW_UABS_G0: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 0, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, early); break; case R_AARCH64_MOVW_UABS_G1_NC: overflow_check = false; fallthrough; case R_AARCH64_MOVW_UABS_G1: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 16, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, early); break; case R_AARCH64_MOVW_UABS_G2_NC: overflow_check = false; fallthrough; case R_AARCH64_MOVW_UABS_G2: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 32, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, early); break; case R_AARCH64_MOVW_UABS_G3: /* We're using the top bits so we can't overflow. */ overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 48, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, early); break; case R_AARCH64_MOVW_SABS_G0: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 0, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, early); break; case R_AARCH64_MOVW_SABS_G1: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 16, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, early); break; case R_AARCH64_MOVW_SABS_G2: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 32, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, early); break; case R_AARCH64_MOVW_PREL_G0_NC: overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 0, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, early); break; case R_AARCH64_MOVW_PREL_G0: ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 0, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, early); break; case R_AARCH64_MOVW_PREL_G1_NC: overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 16, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, early); break; case R_AARCH64_MOVW_PREL_G1: ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 16, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, early); break; case R_AARCH64_MOVW_PREL_G2_NC: overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 32, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, early); break; case R_AARCH64_MOVW_PREL_G2: ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 32, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, early); break; case R_AARCH64_MOVW_PREL_G3: /* We're using the top bits so we can't overflow. */ overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 48, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, early); break; /* Immediate instruction relocations. 
*/ case R_AARCH64_LD_PREL_LO19: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 19, - AARCH64_INSN_IMM_19); + AARCH64_INSN_IMM_19, early); break; case R_AARCH64_ADR_PREL_LO21: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 0, 21, - AARCH64_INSN_IMM_ADR); + AARCH64_INSN_IMM_ADR, early); break; case R_AARCH64_ADR_PREL_PG_HI21_NC: overflow_check = false; fallthrough; case R_AARCH64_ADR_PREL_PG_HI21: - ovf = reloc_insn_adrp(me, sechdrs, loc, val); + ovf = reloc_insn_adrp(me, sechdrs, loc, val, early); if (ovf && ovf != -ERANGE) return ovf; break; @@ -412,40 +425,40 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, case R_AARCH64_LDST8_ABS_LO12_NC: overflow_check = false; ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 0, 12, - AARCH64_INSN_IMM_12); + AARCH64_INSN_IMM_12, early); break; case R_AARCH64_LDST16_ABS_LO12_NC: overflow_check = false; ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 1, 11, - AARCH64_INSN_IMM_12); + AARCH64_INSN_IMM_12, early); break; case R_AARCH64_LDST32_ABS_LO12_NC: overflow_check = false; ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 2, 10, - AARCH64_INSN_IMM_12); + AARCH64_INSN_IMM_12, early); break; case R_AARCH64_LDST64_ABS_LO12_NC: overflow_check = false; ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 3, 9, - AARCH64_INSN_IMM_12); + AARCH64_INSN_IMM_12, early); break; case R_AARCH64_LDST128_ABS_LO12_NC: overflow_check = false; ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 4, 8, - AARCH64_INSN_IMM_12); + AARCH64_INSN_IMM_12, early); break; case R_AARCH64_TSTBR14: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 14, - AARCH64_INSN_IMM_14); + AARCH64_INSN_IMM_14, early); break; case R_AARCH64_CONDBR19: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 19, - AARCH64_INSN_IMM_19); + AARCH64_INSN_IMM_19, early); break; case R_AARCH64_JUMP26: case R_AARCH64_CALL26: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 26, - AARCH64_INSN_IMM_26); + AARCH64_INSN_IMM_26, early); if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && ovf == -ERANGE) { @@ -453,7 +466,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, if (!val) return -ENOEXEC; ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, - 26, AARCH64_INSN_IMM_26); + 26, AARCH64_INSN_IMM_26, early); } break; From 964654b02e283f5fd300ba52a7c7b1088e35f1b6 Mon Sep 17 00:00:00 2001 From: Vladis Dronov Date: Tue, 17 Jan 2023 18:20:06 +0100 Subject: [PATCH 085/175] crypto: testmgr - disallow certain DRBG hash functions in FIPS mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to FIPS 140-3 IG, section D.R "Hash Functions Acceptable for Use in the SP 800-90A DRBGs", modules certified after May 16th, 2023 must not support the use of: SHA-224, SHA-384, SHA512-224, SHA512-256, SHA3-224, SHA3-384. Disallow HMAC and HASH DRBGs using SHA-384 in FIPS mode. 
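For reference, the fips_allowed flag this patch clears for the SHA-384 DRBG entries feeds a simple gate in testmgr; roughly (a simplified sketch, not the verbatim alg_test() source):

	/* Simplified sketch of the testmgr gating (illustrative only). */
	static int fips_gate(const struct alg_test_desc *desc)
	{
		if (fips_enabled && !desc->fips_allowed)
			return -EINVAL;	/* refuse the algorithm in FIPS mode */
		return 0;
	}

With the flag dropped, the drbg_*_sha384 entries now fail this gate, so such DRBGs can no longer be instantiated when fips=1.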
Signed-off-by: Vladis Dronov Reviewed-by: Stephan Müller Signed-off-by: Herbert Xu --- crypto/testmgr.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index c74ef009fb999..a6c2d1a563ba8 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -4848,7 +4848,6 @@ static const struct alg_test_desc alg_test_descs[] = { }, { /* covered by drbg_nopr_hmac_sha256 test */ .alg = "drbg_nopr_hmac_sha384", - .fips_allowed = 1, .test = alg_test_null, }, { .alg = "drbg_nopr_hmac_sha512", @@ -4871,7 +4870,6 @@ static const struct alg_test_desc alg_test_descs[] = { }, { /* covered by drbg_nopr_sha256 test */ .alg = "drbg_nopr_sha384", - .fips_allowed = 1, .test = alg_test_null, }, { .alg = "drbg_nopr_sha512", @@ -4907,7 +4905,6 @@ static const struct alg_test_desc alg_test_descs[] = { }, { /* covered by drbg_pr_hmac_sha256 test */ .alg = "drbg_pr_hmac_sha384", - .fips_allowed = 1, .test = alg_test_null, }, { .alg = "drbg_pr_hmac_sha512", @@ -4927,7 +4924,6 @@ static const struct alg_test_desc alg_test_descs[] = { }, { /* covered by drbg_pr_sha256 test */ .alg = "drbg_pr_sha384", - .fips_allowed = 1, .test = alg_test_null, }, { .alg = "drbg_pr_sha512", From 4a4f4684ece15886c90d6ba63be1130af844d052 Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Thu, 29 Dec 2022 22:17:08 +0100 Subject: [PATCH 086/175] crypto: testmgr - disallow plain cbcmac(aes) in FIPS mode cbcmac(aes) may be used only as part of the ccm(aes) construction in FIPS mode. Since commit d6097b8d5d55 ("crypto: api - allow algs only in specific constructions in FIPS mode") there's support for using spawns which by itself are marked as non-approved from approved template instantiations. So simply mark plain cbcmac(aes) as non-approved in testmgr to block any attempts of direct instantiations in FIPS mode. Signed-off-by: Nicolai Stange Signed-off-by: Vladis Dronov Signed-off-by: Herbert Xu --- crypto/testmgr.c | 1 - 1 file changed, 1 deletion(-) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index a6c2d1a563ba8..4b4a4e1168243 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -4573,7 +4573,6 @@ static const struct alg_test_desc alg_test_descs[] = { }, { #endif .alg = "cbcmac(aes)", - .fips_allowed = 1, .test = alg_test_hash, .suite = { .hash = __VECS(aes_cbcmac_tv_template) From acc71d8d9c63e84c0bc11ab5367c1e451adaab2b Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Thu, 29 Dec 2022 22:17:09 +0100 Subject: [PATCH 087/175] crypto: testmgr - disallow plain ghash in FIPS mode ghash may be used only as part of the gcm(aes) construction in FIPS mode. Since commit d6097b8d5d55 ("crypto: api - allow algs only in specific constructions in FIPS mode") there's support for using spawns which by itself are marked as non-approved from approved template instantiations. So simply mark plain ghash as non-approved in testmgr to block any attempts of direct instantiations in FIPS mode. 
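The observable effect is at transform allocation time; a hedged sketch (exact error values are an assumption):

	#include <crypto/hash.h>
	#include <crypto/aead.h>
	#include <linux/err.h>
	#include <linux/printk.h>

	static void fips_ghash_demo(void)
	{
		struct crypto_shash *ghash;
		struct crypto_aead *gcm;

		/* Direct ghash: refused once fips_enabled is set. */
		ghash = crypto_alloc_shash("ghash", 0, 0);
		if (IS_ERR(ghash))
			pr_info("plain ghash blocked: %ld\n", PTR_ERR(ghash));
		else
			crypto_free_shash(ghash);

		/* ghash inside the approved gcm(aes) template still works. */
		gcm = crypto_alloc_aead("gcm(aes)", 0, 0);
		if (!IS_ERR(gcm))
			crypto_free_aead(gcm);
	}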
Signed-off-by: Nicolai Stange Signed-off-by: Vladis Dronov Signed-off-by: Herbert Xu --- crypto/testmgr.c | 1 - 1 file changed, 1 deletion(-) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 4b4a4e1168243..3bad7678c57a2 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -5187,7 +5187,6 @@ static const struct alg_test_desc alg_test_descs[] = { }, { .alg = "ghash", .test = alg_test_hash, - .fips_allowed = 1, .suite = { .hash = __VECS(ghash_tv_template) }
From 4e0b83ad54876e243373bc42e1307d575289a3f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stephan=20M=C3=BCller?= Date: Fri, 21 Apr 2023 08:08:04 +0200 Subject: [PATCH 088/175] crypto: jitter - replace LFSR with SHA3-256
Using the kernel crypto API, the SHA3-256 algorithm is used as the conditioning element to replace the LFSR in the Jitter RNG. All other parts of the Jitter RNG are unchanged.
The application and use of the SHA-3 conditioning operation is identical to that of the user space Jitter RNG 3.4.0, applying the following concept: - The Jitter RNG initializes a SHA-3 state, which acts as the "entropy pool", when the Jitter RNG is allocated. - When a new time delta is obtained, it is inserted into the "entropy pool" with a SHA-3 update operation. Note, this operation in most cases is a simple memcpy() onto the SHA-3 stack. - To cause a true SHA-3 operation for each time delta operation, a second SHA-3 operation is performed hashing Jitter RNG status information. The final message digest is also inserted into the "entropy pool" with a SHA-3 update operation. Yet, this data is not considered to provide any entropy, but it shall stir the entropy pool. - To generate a random number, a SHA-3 final operation is performed to calculate a message digest, followed by an immediate SHA-3 init to re-initialize the "entropy pool". The obtained message digest is one block of the Jitter RNG that is returned to the caller.
Mathematically speaking, the random number generated by the Jitter RNG is: aux_t = SHA-3(Jitter RNG state data) Jitter RNG block = SHA-3(time_i || aux_i || time_(i-1) || aux_(i-1) || ... || time_(i-255) || aux_(i-255)) when assuming that the OSR = 1, i.e. the default value.
This operation implies that the Jitter RNG has an output block size of 256 bits instead of the 64 bits of the LFSR-based Jitter RNG that is replaced with this patch.
The patch also replaces the varying number of invocations of the conditioning function with one fixed number of invocations. The use of the conditioning function is consistent with the user space Jitter RNG library version 3.4.0.
The code is tested with a system that exhibited the least amount of entropy generated by the Jitter RNG: the SiFive Unmatched RISC-V system. The measured entropy rate is well above the heuristically implied entropy value of 1 bit of entropy per time delta. On all other tested systems, the measured entropy rate is even higher by orders of magnitude. The measurement was performed using updated tooling provided with the user space Jitter RNG library test framework.
The performance of the Jitter RNG with this patch is about on par with the performance of the Jitter RNG without the patch.
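From the consumer's point of view nothing changes: the Jitter RNG is still driven through the kernel crypto API rng interface. A minimal usage sketch (the jent_demo wrapper is ours, for illustration):

	#include <crypto/rng.h>
	#include <linux/err.h>

	/* Illustrative consumer of the Jitter RNG via the kcapi rng interface. */
	static int jent_demo(u8 *buf, unsigned int len)
	{
		struct crypto_rng *rng = crypto_alloc_rng("jitterentropy_rng", 0, 0);
		int ret;

		if (IS_ERR(rng))
			return PTR_ERR(rng);

		/* Internally, output is now produced in SHA3-256-sized blocks. */
		ret = crypto_rng_get_bytes(rng, buf, len);
		crypto_free_rng(rng);
		return ret;
	}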
Signed-off-by: Stephan Mueller --- crypto/Kconfig | 1 + crypto/jitterentropy-kcapi.c | 183 +++++++++++++++++++++++++++++++---- crypto/jitterentropy.c | 145 +++++++++------------------ crypto/jitterentropy.h | 10 +- 4 files changed, 219 insertions(+), 120 deletions(-) diff --git a/crypto/Kconfig b/crypto/Kconfig index edf193aff23e7..69907bc4a0984 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1280,6 +1280,7 @@ endif # if CRYPTO_DRBG_MENU config CRYPTO_JITTERENTROPY tristate "CPU Jitter Non-Deterministic RNG (Random Number Generator)" select CRYPTO_RNG + select CRYPTO_SHA3 help CPU Jitter RNG (Random Number Generator) from the Jitterentropy library diff --git a/crypto/jitterentropy-kcapi.c b/crypto/jitterentropy-kcapi.c index b9edfaa51b273..4b50cbc8a2faf 100644 --- a/crypto/jitterentropy-kcapi.c +++ b/crypto/jitterentropy-kcapi.c @@ -2,7 +2,7 @@ * Non-physical true random number generator based on timing jitter -- * Linux Kernel Crypto API specific code * - * Copyright Stephan Mueller , 2015 + * Copyright Stephan Mueller , 2015 - 2023 * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -37,6 +37,8 @@ * DAMAGE. */ +#include +#include #include #include #include @@ -46,6 +48,8 @@ #include "jitterentropy.h" +#define JENT_CONDITIONING_HASH "sha3-256-generic" + /*************************************************************************** * Helper function ***************************************************************************/ @@ -60,11 +64,6 @@ void jent_zfree(void *ptr) kfree_sensitive(ptr); } -void jent_memcpy(void *dest, const void *src, unsigned int n) -{ - memcpy(dest, src, n); -} - /* * Obtain a high-resolution time stamp value. The time stamp is used to measure * the execution time of a given code path and its variations. Hence, the time @@ -91,6 +90,91 @@ void jent_get_nstime(__u64 *out) *out = tmp; } +int jent_hash_time(void *hash_state, __u64 time, u8 *addtl, + unsigned int addtl_len, __u64 hash_loop_cnt, + unsigned int stuck) +{ + struct shash_desc *hash_state_desc = (struct shash_desc *)hash_state; + SHASH_DESC_ON_STACK(desc, hash_state_desc->tfm); + u8 intermediary[SHA3_256_DIGEST_SIZE]; + __u64 j = 0; + int ret; + + desc->tfm = hash_state_desc->tfm; + + if (sizeof(intermediary) != crypto_shash_digestsize(desc->tfm)) { + pr_warn_ratelimited("Unexpected digest size\n"); + return -EINVAL; + } + + /* + * This loop fills a buffer which is injected into the entropy pool. + * The main reason for this loop is to execute something over which we + * can perform a timing measurement. The injection of the resulting + * data into the pool is performed to ensure the result is used and + * the compiler cannot optimize the loop away in case the result is not + * used at all. Yet that data is considered "additional information" + * considering the terminology from SP800-90A without any entropy. + * + * Note, it does not matter which or how much data you inject, we are + * interested in one Keccack1600 compression operation performed with + * the crypto_shash_final. + */ + for (j = 0; j < hash_loop_cnt; j++) { + ret = crypto_shash_init(desc) ?: + crypto_shash_update(desc, intermediary, + sizeof(intermediary)) ?: + crypto_shash_finup(desc, addtl, addtl_len, intermediary); + if (ret) + goto err; + } + + /* + * Inject the data from the previous loop into the pool. This data is + * not considered to contain any entropy, but it stirs the pool a bit. 
+ */ + ret = crypto_shash_update(desc, intermediary, sizeof(intermediary)); + if (ret) + goto err; + + /* + * Insert the time stamp into the hash context representing the pool. + * + * If the time stamp is stuck, do not finally insert the value into the + * entropy pool. Although this operation should not do any harm even + * when the time stamp has no entropy, SP800-90B requires that any + * conditioning operation to have an identical amount of input data + * according to section 3.1.5. + */ + if (!stuck) { + ret = crypto_shash_update(hash_state_desc, (u8 *)&time, + sizeof(__u64)); + } + +err: + shash_desc_zero(desc); + memzero_explicit(intermediary, sizeof(intermediary)); + + return ret; +} + +int jent_read_random_block(void *hash_state, char *dst, unsigned int dst_len) +{ + struct shash_desc *hash_state_desc = (struct shash_desc *)hash_state; + u8 jent_block[SHA3_256_DIGEST_SIZE]; + /* Obtain data from entropy pool and re-initialize it */ + int ret = crypto_shash_final(hash_state_desc, jent_block) ?: + crypto_shash_init(hash_state_desc) ?: + crypto_shash_update(hash_state_desc, jent_block, + sizeof(jent_block)); + + if (!ret && dst_len) + memcpy(dst, jent_block, dst_len); + + memzero_explicit(jent_block, sizeof(jent_block)); + return ret; +} + /*************************************************************************** * Kernel crypto API interface ***************************************************************************/ @@ -98,32 +182,82 @@ void jent_get_nstime(__u64 *out) struct jitterentropy { spinlock_t jent_lock; struct rand_data *entropy_collector; + struct crypto_shash *tfm; + struct shash_desc *sdesc; }; -static int jent_kcapi_init(struct crypto_tfm *tfm) +static void jent_kcapi_cleanup(struct crypto_tfm *tfm) { struct jitterentropy *rng = crypto_tfm_ctx(tfm); - int ret = 0; - rng->entropy_collector = jent_entropy_collector_alloc(1, 0); - if (!rng->entropy_collector) - ret = -ENOMEM; + spin_lock(&rng->jent_lock); - spin_lock_init(&rng->jent_lock); - return ret; -} + if (rng->sdesc) { + shash_desc_zero(rng->sdesc); + kfree(rng->sdesc); + } + rng->sdesc = NULL; -static void jent_kcapi_cleanup(struct crypto_tfm *tfm) -{ - struct jitterentropy *rng = crypto_tfm_ctx(tfm); + if (rng->tfm) + crypto_free_shash(rng->tfm); + rng->tfm = NULL; - spin_lock(&rng->jent_lock); if (rng->entropy_collector) jent_entropy_collector_free(rng->entropy_collector); rng->entropy_collector = NULL; spin_unlock(&rng->jent_lock); } +static int jent_kcapi_init(struct crypto_tfm *tfm) +{ + struct jitterentropy *rng = crypto_tfm_ctx(tfm); + struct crypto_shash *hash; + struct shash_desc *sdesc; + int size, ret = 0; + + spin_lock_init(&rng->jent_lock); + + /* + * Use SHA3-256 as conditioner. We allocate only the generic + * implementation as we are not interested in high-performance. The + * execution time of the SHA3 operation is measured and adds to the + * Jitter RNG's unpredictable behavior. If we have a slower hash + * implementation, the execution timing variations are larger. When + * using a fast implementation, we would need to call it more often + * as its variations are lower. 
+ */ + hash = crypto_alloc_shash(JENT_CONDITIONING_HASH, 0, 0); + if (IS_ERR(hash)) { + pr_err("Cannot allocate conditioning digest\n"); + return PTR_ERR(hash); + } + rng->tfm = hash; + + size = sizeof(struct shash_desc) + crypto_shash_descsize(hash); + sdesc = kmalloc(size, GFP_KERNEL); + if (!sdesc) { + ret = -ENOMEM; + goto err; + } + + sdesc->tfm = hash; + crypto_shash_init(sdesc); + rng->sdesc = sdesc; + + rng->entropy_collector = jent_entropy_collector_alloc(1, 0, sdesc); + if (!rng->entropy_collector) { + ret = -ENOMEM; + goto err; + } + + spin_lock_init(&rng->jent_lock); + return 0; + +err: + jent_kcapi_cleanup(tfm); + return ret; +} + static int jent_kcapi_random(struct crypto_rng *tfm, const u8 *src, unsigned int slen, u8 *rdata, unsigned int dlen) @@ -180,15 +314,24 @@ static struct rng_alg jent_alg = { .cra_module = THIS_MODULE, .cra_init = jent_kcapi_init, .cra_exit = jent_kcapi_cleanup, - } }; static int __init jent_mod_init(void) { + SHASH_DESC_ON_STACK(desc, tfm); + struct crypto_shash *tfm; int ret = 0; - ret = jent_entropy_init(); + tfm = crypto_alloc_shash(JENT_CONDITIONING_HASH, 0, 0); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + desc->tfm = tfm; + crypto_shash_init(desc); + ret = jent_entropy_init(desc); + shash_desc_zero(desc); + crypto_free_shash(tfm); if (ret) { /* Handle permanent health test error */ if (fips_enabled) diff --git a/crypto/jitterentropy.c b/crypto/jitterentropy.c index 227cedfa4f0ae..c7d7f2caa7793 100644 --- a/crypto/jitterentropy.c +++ b/crypto/jitterentropy.c @@ -2,7 +2,7 @@ * Non-physical true random number generator based on timing jitter -- * Jitter RNG standalone code. * - * Copyright Stephan Mueller , 2015 - 2020 + * Copyright Stephan Mueller , 2015 - 2023 * * Design * ====== @@ -47,7 +47,7 @@ /* * This Jitterentropy RNG is based on the jitterentropy library - * version 2.2.0 provided at https://www.chronox.de/jent.html + * version 3.4.0 provided at https://www.chronox.de/jent.html */ #ifdef __OPTIMIZE__ @@ -57,21 +57,22 @@ typedef unsigned long long __u64; typedef long long __s64; typedef unsigned int __u32; +typedef unsigned char u8; #define NULL ((void *) 0) /* The entropy pool */ struct rand_data { + /* SHA3-256 is used as conditioner */ +#define DATA_SIZE_BITS 256 /* all data values that are vital to maintain the security * of the RNG are marked as SENSITIVE. A user must not * access that information while the RNG executes its loops to * calculate the next random value. */ - __u64 data; /* SENSITIVE Actual random number */ - __u64 old_data; /* SENSITIVE Previous random number */ - __u64 prev_time; /* SENSITIVE Previous time stamp */ -#define DATA_SIZE_BITS ((sizeof(__u64)) * 8) - __u64 last_delta; /* SENSITIVE stuck test */ - __s64 last_delta2; /* SENSITIVE stuck test */ - unsigned int osr; /* Oversample rate */ + void *hash_state; /* SENSITIVE hash state entropy pool */ + __u64 prev_time; /* SENSITIVE Previous time stamp */ + __u64 last_delta; /* SENSITIVE stuck test */ + __s64 last_delta2; /* SENSITIVE stuck test */ + unsigned int osr; /* Oversample rate */ #define JENT_MEMORY_BLOCKS 64 #define JENT_MEMORY_BLOCKSIZE 32 #define JENT_MEMORY_ACCESSLOOPS 128 @@ -301,15 +302,13 @@ static int jent_permanent_health_failure(struct rand_data *ec) * an entropy collection. 
* * Input: - * @ec entropy collector struct -- may be NULL * @bits is the number of low bits of the timer to consider * @min is the number of bits we shift the timer value to the right at * the end to make sure we have a guaranteed minimum value * * @return Newly calculated loop counter */ -static __u64 jent_loop_shuffle(struct rand_data *ec, - unsigned int bits, unsigned int min) +static __u64 jent_loop_shuffle(unsigned int bits, unsigned int min) { __u64 time = 0; __u64 shuffle = 0; @@ -317,12 +316,7 @@ static __u64 jent_loop_shuffle(struct rand_data *ec, unsigned int mask = (1<data; + /* * We fold the time value as much as possible to ensure that as many * bits of the time stamp are included as possible. @@ -344,81 +338,32 @@ static __u64 jent_loop_shuffle(struct rand_data *ec, * execution time jitter * * This function injects the individual bits of the time value into the - * entropy pool using an LFSR. + * entropy pool using a hash. * - * The code is deliberately inefficient with respect to the bit shifting - * and shall stay that way. This function is the root cause why the code - * shall be compiled without optimization. This function not only acts as - * folding operation, but this function's execution is used to measure - * the CPU execution time jitter. Any change to the loop in this function - * implies that careful retesting must be done. - * - * @ec [in] entropy collector struct - * @time [in] time stamp to be injected - * @loop_cnt [in] if a value not equal to 0 is set, use the given value as - * number of loops to perform the folding - * @stuck [in] Is the time stamp identified as stuck? + * ec [in] entropy collector + * time [in] time stamp to be injected + * stuck [in] Is the time stamp identified as stuck? * * Output: - * updated ec->data - * - * @return Number of loops the folding operation is performed + * updated hash context in the entropy collector or error code */ -static void jent_lfsr_time(struct rand_data *ec, __u64 time, __u64 loop_cnt, - int stuck) +static int jent_condition_data(struct rand_data *ec, __u64 time, int stuck) { - unsigned int i; - __u64 j = 0; - __u64 new = 0; -#define MAX_FOLD_LOOP_BIT 4 -#define MIN_FOLD_LOOP_BIT 0 - __u64 fold_loop_cnt = - jent_loop_shuffle(ec, MAX_FOLD_LOOP_BIT, MIN_FOLD_LOOP_BIT); - - /* - * testing purposes -- allow test app to set the counter, not - * needed during runtime - */ - if (loop_cnt) - fold_loop_cnt = loop_cnt; - for (j = 0; j < fold_loop_cnt; j++) { - new = ec->data; - for (i = 1; (DATA_SIZE_BITS) >= i; i++) { - __u64 tmp = time << (DATA_SIZE_BITS - i); - - tmp = tmp >> (DATA_SIZE_BITS - 1); - - /* - * Fibonacci LSFR with polynomial of - * x^64 + x^61 + x^56 + x^31 + x^28 + x^23 + 1 which is - * primitive according to - * http://poincare.matf.bg.ac.rs/~ezivkovm/publications/primpol1.pdf - * (the shift values are the polynomial values minus one - * due to counting bits from 0 to 63). As the current - * position is always the LSB, the polynomial only needs - * to shift data in from the left without wrap. - */ - tmp ^= ((new >> 63) & 1); - tmp ^= ((new >> 60) & 1); - tmp ^= ((new >> 55) & 1); - tmp ^= ((new >> 30) & 1); - tmp ^= ((new >> 27) & 1); - tmp ^= ((new >> 22) & 1); - new <<= 1; - new ^= tmp; - } - } - - /* - * If the time stamp is stuck, do not finally insert the value into - * the entropy pool. 
Although this operation should not do any harm - * even when the time stamp has no entropy, SP800-90B requires that - * any conditioning operation (SP800-90B considers the LFSR to be a - * conditioning operation) to have an identical amount of input - * data according to section 3.1.5. - */ - if (!stuck) - ec->data = new; +#define SHA3_HASH_LOOP (1<<3) + struct { + int rct_count; + unsigned int apt_observations; + unsigned int apt_count; + unsigned int apt_base; + } addtl = { + ec->rct_count, + ec->apt_observations, + ec->apt_count, + ec->apt_base + }; + + return jent_hash_time(ec->hash_state, time, (u8 *)&addtl, sizeof(addtl), + SHA3_HASH_LOOP, stuck); } /* @@ -452,7 +397,7 @@ static void jent_memaccess(struct rand_data *ec, __u64 loop_cnt) #define MAX_ACC_LOOP_BIT 7 #define MIN_ACC_LOOP_BIT 0 __u64 acc_loop_cnt = - jent_loop_shuffle(ec, MAX_ACC_LOOP_BIT, MIN_ACC_LOOP_BIT); + jent_loop_shuffle(MAX_ACC_LOOP_BIT, MIN_ACC_LOOP_BIT); if (NULL == ec || NULL == ec->mem) return; @@ -520,14 +465,15 @@ static int jent_measure_jitter(struct rand_data *ec) stuck = jent_stuck(ec, current_delta); /* Now call the next noise sources which also injects the data */ - jent_lfsr_time(ec, current_delta, 0, stuck); + if (jent_condition_data(ec, current_delta, stuck)) + stuck = 1; return stuck; } /* * Generator of one 64 bit random number - * Function fills rand_data->data + * Function fills rand_data->hash_state * * @ec [in] Reference to entropy collector */ @@ -574,7 +520,7 @@ static void jent_gen_entropy(struct rand_data *ec) * @return 0 when request is fulfilled or an error * * The following error codes can occur: - * -1 entropy_collector is NULL + * -1 entropy_collector is NULL or the generation failed * -2 Intermittent health failure * -3 Permanent health failure */ @@ -604,7 +550,7 @@ int jent_read_entropy(struct rand_data *ec, unsigned char *data, * Perform startup health tests and return permanent * error if it fails. */ - if (jent_entropy_init()) + if (jent_entropy_init(ec->hash_state)) return -3; return -2; @@ -614,7 +560,8 @@ int jent_read_entropy(struct rand_data *ec, unsigned char *data, tocopy = (DATA_SIZE_BITS / 8); else tocopy = len; - jent_memcpy(p, &ec->data, tocopy); + if (jent_read_random_block(ec->hash_state, p, tocopy)) + return -1; len -= tocopy; p += tocopy; @@ -628,7 +575,8 @@ int jent_read_entropy(struct rand_data *ec, unsigned char *data, ***************************************************************************/ struct rand_data *jent_entropy_collector_alloc(unsigned int osr, - unsigned int flags) + unsigned int flags, + void *hash_state) { struct rand_data *entropy_collector; @@ -655,6 +603,8 @@ struct rand_data *jent_entropy_collector_alloc(unsigned int osr, osr = 1; /* minimum sampling rate is 1 */ entropy_collector->osr = osr; + entropy_collector->hash_state = hash_state; + /* fill the data pad with non-zero values */ jent_gen_entropy(entropy_collector); @@ -668,7 +618,7 @@ void jent_entropy_collector_free(struct rand_data *entropy_collector) jent_zfree(entropy_collector); } -int jent_entropy_init(void) +int jent_entropy_init(void *hash_state) { int i; __u64 delta_sum = 0; @@ -681,6 +631,7 @@ int jent_entropy_init(void) /* Required for RCT */ ec.osr = 1; + ec.hash_state = hash_state; /* We could perform statistical tests here, but the problem is * that we only have a few loop counts to do testing. 
These @@ -718,7 +669,7 @@ /* Invoke core entropy collection logic */ jent_get_nstime(&time); ec.prev_time = time; - jent_lfsr_time(&ec, time, 0, 0); + jent_condition_data(&ec, time, 0); jent_get_nstime(&time2); /* test whether timer works */ diff --git a/crypto/jitterentropy.h b/crypto/jitterentropy.h index 5cc583f6bc6b8..b3890ff26a023 100644 --- a/crypto/jitterentropy.h +++ b/crypto/jitterentropy.h @@ -2,14 +2,18 @@ extern void *jent_zalloc(unsigned int len); extern void jent_zfree(void *ptr); -extern void jent_memcpy(void *dest, const void *src, unsigned int n); extern void jent_get_nstime(__u64 *out); +extern int jent_hash_time(void *hash_state, __u64 time, u8 *addtl, + unsigned int addtl_len, __u64 hash_loop_cnt, + unsigned int stuck); +int jent_read_random_block(void *hash_state, char *dst, unsigned int dst_len); struct rand_data; -extern int jent_entropy_init(void); +extern int jent_entropy_init(void *hash_state); extern int jent_read_entropy(struct rand_data *ec, unsigned char *data, unsigned int len); extern struct rand_data *jent_entropy_collector_alloc(unsigned int osr, - unsigned int flags); + unsigned int flags, + void *hash_state); extern void jent_entropy_collector_free(struct rand_data *entropy_collector); From 8ffc63ea13d927dc14dfe6aeb15c3efee2d802f6 Mon Sep 17 00:00:00 2001 From: Hailey Mothershead Date: Tue, 6 Jun 2023 19:39:51 +0000 Subject: [PATCH 089/175] crypto: testmgr - Remove xts4096(paes) and xts512(paes) As of commit a93492c ("crypto: ccree - remove data unit size support"), these algorithms are no longer supported in the kernel. --- crypto/testmgr.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 3bad7678c57a2..f8cba28b0e230 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -5723,14 +5723,6 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { #endif - .alg = "xts4096(paes)", - .test = alg_test_null, - .fips_allowed = 1, - }, { - .alg = "xts512(paes)", - .test = alg_test_null, - .fips_allowed = 1, - }, { .alg = "xxhash64", .test = alg_test_hash, .fips_allowed = 1, From 1e007626ce7ea5b40335b5daba2da016275a26be Mon Sep 17 00:00:00 2001 From: Hailey Mothershead Date: Tue, 6 Jun 2023 19:10:02 +0000 Subject: [PATCH 090/175] crypto: ecdh - zeroize cryptographic keys after use FIPS 140-3 requires that variables used in the creation of public keys must be zeroized once they are no longer in use. Accomplish this by freeing the key with kfree_sensitive(), which zeroizes the memory via memzero_explicit() before freeing it. It also specifies that Sensitive Security Parameters (SSPs) must be zeroized after use and that overwriting these variables with a new SSP is not sufficient for zeroization. So explicitly zeroize the private key before it is overwritten. --- crypto/ecdh.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/ecdh.c b/crypto/ecdh.c index 3049f147e0117..71599cadf0bc7 100644 --- a/crypto/ecdh.c +++ b/crypto/ecdh.c @@ -113,7 +113,7 @@ static int ecdh_compute_value(struct kpp_request *req) free_all: kfree_sensitive(shared_secret); free_pubkey: - kfree(public_key); + kfree_sensitive(public_key); return ret; } From 5b914022e702fdf8753f19b4e294d17bcfbaff22 Mon Sep 17 00:00:00 2001 From: Samuel Mendoza-Jonas Date: Thu, 13 Apr 2023 18:54:27 +0000 Subject: [PATCH 091/175] Revert "drm: fb_helper: improve CONFIG_FB dependency" This reverts commit 9d6366e743f37d36ef69347924ead7bcc596076e.
--- drivers/gpu/drm/Kconfig | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index f30f99166531f..e0264211ca84b 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -124,8 +124,9 @@ config DRM_DEBUG_MODESET_LOCK config DRM_FBDEV_EMULATION bool "Enable legacy fbdev support for your modesetting driver" - depends on DRM_KMS_HELPER - depends on FB=y || FB=DRM_KMS_HELPER + depends on DRM + depends on FB=y || FB=DRM + select DRM_KMS_HELPER select FB_CFB_FILLRECT select FB_CFB_COPYAREA select FB_CFB_IMAGEBLIT From b6a11a924b40b2b7bf52e77070d0c38c2327ea61 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 10 Aug 2021 17:54:28 +0800 Subject: [PATCH 092/175] random: Add hook to override device reads and getrandom(2) Bugzilla: https://bugzilla.redhat.com/2079030 Upstream Status: RHEL only This patch introduces a hook mechanism to drivers/char/random to allow the reads on /dev/*random as well as getrandom(2) to be overridden by an external RNG. This will be used to override drivers/char/random with a FIPS RNG in a subsequent patch. Signed-off-by: Herbert Xu Signed-off-by: Prarit Bhargava (cherry picked from commit 8787488953d2a17f27bb2f93f6c6d7b720888687) [6.1: Handle some context in getrandom() and random.h, and convert the extrng_*_fops to use random_write_iter().] Signed-off-by: Samuel Mendoza-Jonas --- drivers/char/random.c | 115 +++++++++++++++++++++++++++++++++++++++++ include/linux/random.h | 7 +++ 2 files changed, 122 insertions(+) diff --git a/drivers/char/random.c b/drivers/char/random.c index fd57eb372d492..ff8fdb6b013e6 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -283,6 +284,11 @@ static unsigned int crng_reseed_interval(void) return CRNG_RESEED_INTERVAL; } +/* + * Hook for external RNG. + */ +static const struct random_extrng __rcu *extrng; + /* * This function returns a ChaCha state that you may use for generating * random data. It also returns up to 32 bytes on its own of random data @@ -570,6 +576,9 @@ int __cold random_prepare_cpu(unsigned int cpu) #endif +static const struct file_operations extrng_random_fops; +static const struct file_operations extrng_urandom_fops; + /********************************************************************** * * Entropy accumulation and extraction routines. 
@@ -936,6 +945,19 @@ void __init add_bootloader_randomness(const void *buf, size_t len) credit_init_bits(len * 8); } +void random_register_extrng(const struct random_extrng *rng) +{ + rcu_assign_pointer(extrng, rng); +} +EXPORT_SYMBOL_GPL(random_register_extrng); + +void random_unregister_extrng(void) +{ + RCU_INIT_POINTER(extrng, NULL); + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(random_unregister_extrng); + #if IS_ENABLED(CONFIG_VMGENID) static BLOCKING_NOTIFIER_HEAD(vmfork_chain); @@ -1307,6 +1329,7 @@ static void __cold try_to_generate_entropy(void) SYSCALL_DEFINE3(getrandom, char __user *, ubuf, size_t, len, unsigned int, flags) { + const struct random_extrng *rng; struct iov_iter iter; struct iovec iov; int ret; @@ -1321,6 +1344,18 @@ SYSCALL_DEFINE3(getrandom, char __user *, ubuf, size_t, len, unsigned int, flags if ((flags & (GRND_INSECURE | GRND_RANDOM)) == (GRND_INSECURE | GRND_RANDOM)) return -EINVAL; + rcu_read_lock(); + rng = rcu_dereference(extrng); + if (rng && !try_module_get(rng->owner)) + rng = NULL; + rcu_read_unlock(); + + if (rng) { + ret = rng->extrng_read(ubuf, len); + module_put(rng->owner); + return ret; + } + if (!crng_ready() && !(flags & GRND_INSECURE)) { if (flags & GRND_NONBLOCK) return -EAGAIN; @@ -1329,6 +1364,7 @@ SYSCALL_DEFINE3(getrandom, char __user *, ubuf, size_t, len, unsigned int, flags return ret; } + ret = import_single_range(ITER_DEST, ubuf, len, &iov, &iter); if (unlikely(ret)) return ret; @@ -1341,6 +1377,13 @@ static __poll_t random_poll(struct file *file, poll_table *wait) return crng_ready() ? EPOLLIN | EPOLLRDNORM : EPOLLOUT | EPOLLWRNORM; } +static __poll_t +extrng_poll(struct file *file, poll_table * wait) +{ + /* extrng pool is always full, always read, no writes */ + return EPOLLIN | EPOLLRDNORM; +} + static ssize_t write_pool_user(struct iov_iter *iter) { u8 block[BLAKE2S_BLOCK_SIZE]; @@ -1482,9 +1525,60 @@ static int random_fasync(int fd, struct file *filp, int on) return fasync_helper(fd, filp, on, &fasync); } +static int random_open(struct inode *inode, struct file *filp) +{ + const struct random_extrng *rng; + + rcu_read_lock(); + rng = rcu_dereference(extrng); + if (rng && !try_module_get(rng->owner)) + rng = NULL; + rcu_read_unlock(); + + if (!rng) + return 0; + + filp->f_op = &extrng_random_fops; + filp->private_data = rng->owner; + + return 0; +} + +static int urandom_open(struct inode *inode, struct file *filp) +{ + const struct random_extrng *rng; + + rcu_read_lock(); + rng = rcu_dereference(extrng); + if (rng && !try_module_get(rng->owner)) + rng = NULL; + rcu_read_unlock(); + + if (!rng) + return 0; + + filp->f_op = &extrng_urandom_fops; + filp->private_data = rng->owner; + + return 0; +} + +static int extrng_release(struct inode *inode, struct file *filp) +{ + module_put(filp->private_data); + return 0; +} + +static ssize_t +extrng_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) +{ + return rcu_dereference_raw(extrng)->extrng_read(buf, nbytes); +} + const struct file_operations random_fops = { .read_iter = random_read_iter, .write_iter = random_write_iter, + .open = random_open, .poll = random_poll, .unlocked_ioctl = random_ioctl, .compat_ioctl = compat_ptr_ioctl, @@ -1497,6 +1591,7 @@ const struct file_operations random_fops = { const struct file_operations urandom_fops = { .read_iter = urandom_read_iter, .write_iter = random_write_iter, + .open = urandom_open, .unlocked_ioctl = random_ioctl, .compat_ioctl = compat_ptr_ioctl, .fasync = random_fasync, @@ -1505,6 +1600,26 @@ const struct file_operations 
urandom_fops = { .splice_write = iter_file_splice_write, }; +static const struct file_operations extrng_random_fops = { + .open = random_open, + .read = extrng_read, + .write_iter = random_write_iter, + .poll = extrng_poll, + .unlocked_ioctl = random_ioctl, + .fasync = random_fasync, + .llseek = noop_llseek, + .release = extrng_release, +}; + +static const struct file_operations extrng_urandom_fops = { + .open = urandom_open, + .read = extrng_read, + .write_iter = random_write_iter, + .unlocked_ioctl = random_ioctl, + .fasync = random_fasync, + .llseek = noop_llseek, + .release = extrng_release, +}; /******************************************************************** * diff --git a/include/linux/random.h b/include/linux/random.h index 51133627ba73a..76a1b085bcf33 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -10,6 +10,11 @@ #include +struct random_extrng { + ssize_t (*extrng_read)(void __user *buf, size_t buflen); + struct module *owner; +}; + struct notifier_block; void add_device_randomness(const void *buf, size_t len); @@ -139,6 +144,8 @@ void __init random_init_early(const char *command_line); void __init random_init(void); bool rng_is_initialized(void); int wait_for_random_bytes(void); +void random_register_extrng(const struct random_extrng *rng); +void random_unregister_extrng(void); /* Calls wait_for_random_bytes() and then calls get_random_bytes(buf, nbytes). * Returns the result of the call to wait_for_random_bytes. */ From 4bc7040a889749186806a524dfe2fe96c9a2cc61 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 10 Aug 2021 17:56:42 +0800 Subject: [PATCH 093/175] crypto: rng - Override drivers/char/random in FIPS mode Upstream: RHEL only Bugzilla: 1984784 This patch overrides the drivers/char/random RNGs with the FIPS RNG from Crypto API when FIPS mode is enabled. 
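For illustration, a minimal user-space sketch (not part of the patch): on a kernel booted with fips=1, the open() below is redirected to the extrng file operations, so the read is served by the crypto API DRBG; on a non-FIPS boot the same program reads from the normal input pool.

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        unsigned char buf[32];
        int fd = open("/dev/random", O_RDONLY);

        if (fd < 0) {
            perror("open /dev/random");
            return 1;
        }
        /* With fips=1 this read is handled by crypto_devrandom_read();
         * otherwise by the stock random_read_iter() path. */
        if (read(fd, buf, sizeof(buf)) != sizeof(buf))
            perror("read");
        close(fd);
        return 0;
    }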
Signed-off-by: Herbert Xu (cherry picked from commit 37e0042aaf43d4494bcbea2113605366d0fe6187) Signed-off-by: Samuel Mendoza-Jonas --- crypto/rng.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 1 deletion(-) diff --git a/crypto/rng.c b/crypto/rng.c index fea082b25fe4b..50a9d040bed1c 100644 --- a/crypto/rng.c +++ b/crypto/rng.c @@ -11,14 +11,17 @@ #include #include #include +#include +#include #include #include #include #include +#include +#include #include #include #include -#include #include #include "internal.h" @@ -224,5 +227,73 @@ void crypto_unregister_rngs(struct rng_alg *algs, int count) } EXPORT_SYMBOL_GPL(crypto_unregister_rngs); +static ssize_t crypto_devrandom_read(void __user *buf, size_t buflen) +{ + u8 tmp[256]; + ssize_t ret; + + if (!buflen) + return 0; + + ret = crypto_get_default_rng(); + if (ret) + return ret; + + for (;;) { + int err; + int i; + + i = min_t(int, buflen, sizeof(tmp)); + err = crypto_rng_get_bytes(crypto_default_rng, tmp, i); + if (err) { + ret = err; + break; + } + + if (copy_to_user(buf, tmp, i)) { + ret = -EFAULT; + break; + } + + buflen -= i; + buf += i; + ret += i; + + if (!buflen) + break; + + if (need_resched()) { + if (signal_pending(current)) + break; + schedule(); + } + } + + crypto_put_default_rng(); + memzero_explicit(tmp, sizeof(tmp)); + + return ret; +} + +static const struct random_extrng crypto_devrandom_rng = { + .extrng_read = crypto_devrandom_read, + .owner = THIS_MODULE, +}; + +static int __init crypto_rng_init(void) +{ + if (fips_enabled) + random_register_extrng(&crypto_devrandom_rng); + return 0; +} + +static void __exit crypto_rng_exit(void) +{ + random_unregister_extrng(); +} + +late_initcall(crypto_rng_init); +module_exit(crypto_rng_exit); + MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Random Number Generator"); From 05c6dd4804c838c165ffdf83b9caf01d6a24373e Mon Sep 17 00:00:00 2001 From: Mahmoud Adam Date: Tue, 6 Jun 2023 14:06:31 +0000 Subject: [PATCH 094/175] crypto: rsa - allow only odd e and restrict value in FIPS mode check if rsa public exponent is odd and check its value is between 2^16 < e < 2^256. FIPS 186-5 DSS (page 35)[1] specify that: 1. The public exponent e shall be selected with the following constraints: (a) The public verification exponent e shall be selected prior to generating the primes, p and q, and the private signature exponent d. (b) The exponent e shall be an odd positive integer such that: 2^16 < e < 2^256. [1] https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.186-5.pdf Signed-off-by: Mahmoud Adam --- crypto/rsa.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/crypto/rsa.c b/crypto/rsa.c index c50f2d2a4d064..c79613cdce6e4 100644 --- a/crypto/rsa.c +++ b/crypto/rsa.c @@ -205,6 +205,32 @@ static int rsa_check_key_length(unsigned int len) return -EINVAL; } +static int rsa_check_exponent_fips(MPI e) +{ + MPI e_max = NULL; + + /* check if odd */ + if (!mpi_test_bit(e, 0)) { + return -EINVAL; + } + + /* check if 2^16 < e < 2^256. 
*/ + if (mpi_cmp_ui(e, 65536) <= 0) { + return -EINVAL; + } + + e_max = mpi_alloc(0); + mpi_set_bit(e_max, 256); + + if (mpi_cmp(e, e_max) >= 0) { + mpi_free(e_max); + return -EINVAL; + } + + mpi_free(e_max); + return 0; +} + static int rsa_set_pub_key(struct crypto_akcipher *tfm, const void *key, unsigned int keylen) { @@ -232,6 +258,11 @@ static int rsa_set_pub_key(struct crypto_akcipher *tfm, const void *key, return -EINVAL; } + if (fips_enabled && rsa_check_exponent_fips(mpi_key->e)) { + rsa_free_mpi_key(mpi_key); + return -EINVAL; + } + return 0; err: @@ -290,6 +321,11 @@ static int rsa_set_priv_key(struct crypto_akcipher *tfm, const void *key, return -EINVAL; } + if (fips_enabled && rsa_check_exponent_fips(mpi_key->e)) { + rsa_free_mpi_key(mpi_key); + return -EINVAL; + } + return 0; err: From c5a2d382a5659524f92fa909fc5583545b70aef1 Mon Sep 17 00:00:00 2001 From: Samuel Mendoza-Jonas Date: Fri, 9 Jun 2023 11:49:58 -0700 Subject: [PATCH 095/175] crypto: Only allow GCM in FIPS when instantiated via seqiv FIPS 140-3 allows GCM to be approved only when it is instantiated via seqiv. Set a new flag CRYPTO_ALG_FIPS140_COMPLIANT in a separate init function aead_init_seqiv() to represent that seqiv was used instead of geniv. Signed-off-by: Samuel Mendoza-Jonas --- crypto/seqiv.c | 8 +++++++- include/linux/crypto.h | 6 ++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/crypto/seqiv.c b/crypto/seqiv.c index b1bcfe537daf1..952dbeae7a08e 100644 --- a/crypto/seqiv.c +++ b/crypto/seqiv.c @@ -133,6 +133,12 @@ static int seqiv_aead_decrypt(struct aead_request *req) return crypto_aead_decrypt(subreq); } +static int aead_init_seqiv(struct crypto_aead *aead) +{ + crypto_aead_set_flags(aead, CRYPTO_ALG_FIPS140_COMPLIANT); + return aead_init_geniv(aead); +} + static int seqiv_aead_create(struct crypto_template *tmpl, struct rtattr **tb) { struct aead_instance *inst; @@ -150,7 +156,7 @@ static int seqiv_aead_create(struct crypto_template *tmpl, struct rtattr **tb) inst->alg.encrypt = seqiv_aead_encrypt; inst->alg.decrypt = seqiv_aead_decrypt; - inst->alg.init = aead_init_geniv; + inst->alg.init = aead_init_seqiv; inst->alg.exit = aead_exit_geniv; inst->alg.base.cra_ctxsize = sizeof(struct aead_geniv_ctx); diff --git a/include/linux/crypto.h b/include/linux/crypto.h index e3c4be29aaccb..8098891d38a84 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -141,6 +141,12 @@ */ #define CRYPTO_ALG_FIPS_INTERNAL 0x00020000 +/* + * Mark an algorithm as approved for FIPS 140-3. This is intended to be used + * for algorithms that are only approved under certain conditions + */ +#define CRYPTO_ALG_FIPS140_COMPLIANT 0x00040000 + /* * Transform masks and values (for crt_flags). */ From 5ef57524be6a7bf044e6d93c54031b3524c6061c Mon Sep 17 00:00:00 2001 From: Hailey Mothershead Date: Sat, 17 Jun 2023 16:12:15 +0000 Subject: [PATCH 096/175] crypto: tcrypt.c - Add selftest for ffdhe algorithms For FIPS 140-3, at least one self-test for this algorithm must be run on boot. Add an invocation here.
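tcrypt follows the usual convention that loading the module with a mode parameter runs the selected test case and then fails the module insertion on purpose so that nothing stays resident. Assuming the standard invocation, something like "modprobe tcrypt mode=59" exercises the new ffdhe2048(dh) case, with the verdict reported in the kernel log.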
--- crypto/tcrypt.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index b23235d58a122..8e0ab5c2dad04 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -1702,6 +1702,10 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) ret = min(ret, tcrypt_test("gcm(aria)")); break; + case 59: + ret = min(ret, tcrypt_test("ffdhe2048(dh)")); + break; + case 100: ret = min(ret, tcrypt_test("hmac(md5)")); break; From 3ab19909880f61af278e6ba7d44556a37c4023a1 Mon Sep 17 00:00:00 2001 From: Stewart Smith Date: Tue, 14 Feb 2023 18:46:17 -0800 Subject: [PATCH 097/175] net/ipv6: Improve performance of inet6_ehashfn() Signed-off-by: Samuel Mendoza-Jonas --- net/ipv6/inet6_hashtables.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 3616225c89ef6..fd0fc09a4e304 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -33,8 +33,14 @@ u32 inet6_ehashfn(const struct net *net, net_get_random_once(&inet6_ehash_secret, sizeof(inet6_ehash_secret)); net_get_random_once(&ipv6_hash_secret, sizeof(ipv6_hash_secret)); - lhash = (__force u32)laddr->s6_addr32[3]; - fhash = __ipv6_addr_jhash(faddr, ipv6_hash_secret); + lhash = jhash_3words((__force u32)laddr->s6_addr32[3], + (((u32)lport) << 16) | (__force u32)fport, + (__force u32)faddr->s6_addr32[0], + ipv6_hash_secret); + fhash = jhash_3words((__force u32)faddr->s6_addr32[1], + (__force u32)faddr->s6_addr32[2], + (__force u32)faddr->s6_addr32[3], + ipv6_hash_secret); return __inet6_ehashfn(lhash, lport, fhash, fport, inet6_ehash_secret + net_hash_mix(net)); From cd9da36dae2ca07d1f412571c681b2742e610027 Mon Sep 17 00:00:00 2001 From: Daiki Ueno Date: Wed, 3 Aug 2022 16:09:02 +0900 Subject: [PATCH 098/175] random: allow reseeding DRBG with getrandom Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2114854 Upstream Status: RHEL only According to SP800-90C, when multiple DRBGs are chained, meaning that a target DRBG is seeded using the output of the source DRBG, the source DRBG shall be reseeded with fresh entropy before generating the output. This patch extends the getrandom syscall to allow userspace programs to request the kernel to reseed the internal DRBG at the same time it generates new pseudorandom bytes, repurposing the currently unused GRND_RANDOM flag. Signed-off-by: Daiki Ueno (cherry picked from RHEL commit c72f5361) [6.1 fix minor conflict in context in getrandom()] --- crypto/drbg.c | 12 ++++-- crypto/rng.c | 84 +++++++++++++++++++++++++++++++++++++----- drivers/char/random.c | 4 +- include/linux/crypto.h | 1 + include/linux/random.h | 2 +- 5 files changed, 87 insertions(+), 16 deletions(-) diff --git a/crypto/drbg.c b/crypto/drbg.c index ff4ebbc68efab..c58d2f871855f 100644 --- a/crypto/drbg.c +++ b/crypto/drbg.c @@ -1510,13 +1510,14 @@ static int drbg_generate(struct drbg_state *drbg, * Wrapper around drbg_generate which can pull arbitrary long strings * from the DRBG without hitting the maximum request limitation. 
* - * Parameters: see drbg_generate + * Parameters: see drbg_generate, except @reseed, which triggers reseeding * Return codes: see drbg_generate -- if one drbg_generate request fails, * the entire drbg_generate_long request fails */ static int drbg_generate_long(struct drbg_state *drbg, unsigned char *buf, unsigned int buflen, - struct drbg_string *addtl) + struct drbg_string *addtl, + bool reseed) { unsigned int len = 0; unsigned int slice = 0; @@ -1526,6 +1527,8 @@ static int drbg_generate_long(struct drbg_state *drbg, slice = ((buflen - len) / drbg_max_request_bytes(drbg)); chunk = slice ? drbg_max_request_bytes(drbg) : (buflen - len); mutex_lock(&drbg->drbg_mutex); + if (reseed) + drbg->seeded = DRBG_SEED_STATE_UNSEEDED; err = drbg_generate(drbg, buf + len, chunk, addtl); mutex_unlock(&drbg->drbg_mutex); if (0 > err) @@ -1959,7 +1962,10 @@ static int drbg_kcapi_random(struct crypto_rng *tfm, addtl = &string; } - return drbg_generate_long(drbg, dst, dlen, addtl); + return drbg_generate_long(drbg, dst, dlen, addtl, + (crypto_tfm_get_flags(crypto_rng_tfm(tfm)) & + CRYPTO_TFM_REQ_NEED_RESEED) == + CRYPTO_TFM_REQ_NEED_RESEED); } /* diff --git a/crypto/rng.c b/crypto/rng.c index 50a9d040bed1c..5f4d0c65e557f 100644 --- a/crypto/rng.c +++ b/crypto/rng.c @@ -116,31 +116,37 @@ struct crypto_rng *crypto_alloc_rng(const char *alg_name, u32 type, u32 mask) } EXPORT_SYMBOL_GPL(crypto_alloc_rng); -int crypto_get_default_rng(void) +static int __crypto_get_default_rng(void) { struct crypto_rng *rng; int err; - mutex_lock(&crypto_default_rng_lock); if (!crypto_default_rng) { rng = crypto_alloc_rng("stdrng", 0, 0); err = PTR_ERR(rng); if (IS_ERR(rng)) - goto unlock; + return err; err = crypto_rng_reset(rng, NULL, crypto_rng_seedsize(rng)); if (err) { crypto_free_rng(rng); - goto unlock; + return err; } crypto_default_rng = rng; } crypto_default_rng_refcnt++; - err = 0; -unlock: + return 0; +} + +int crypto_get_default_rng(void) +{ + int err; + + mutex_lock(&crypto_default_rng_lock); + err = __crypto_get_default_rng(); mutex_unlock(&crypto_default_rng_lock); return err; @@ -227,7 +233,8 @@ void crypto_unregister_rngs(struct rng_alg *algs, int count) } EXPORT_SYMBOL_GPL(crypto_unregister_rngs); -static ssize_t crypto_devrandom_read(void __user *buf, size_t buflen) +static ssize_t crypto_devrandom_read(void __user *buf, size_t buflen, + bool reseed) { u8 tmp[256]; ssize_t ret; @@ -235,9 +242,65 @@ static ssize_t crypto_devrandom_read(void __user *buf, size_t buflen) if (!buflen) return 0; - ret = crypto_get_default_rng(); - if (ret) - return ret; + if (reseed) { + int err; + int i; + u32 flags = 0; + + /* If reseeding is requested, acquire a lock on + * crypto_default_rng so it is not swapped out until + * the initial random bytes are generated. + * + * The algorithm implementation is also protected with + * a separate mutex (drbg->drbg_mutex) around the + * reseed-and-generate operation. + */ + mutex_lock(&crypto_default_rng_lock); + + /* If crypto_default_rng is not set, it will be seeded + * at creation in __crypto_get_default_rng and thus no + * reseeding is needed. 
+ */ + if (crypto_default_rng) + flags |= CRYPTO_TFM_REQ_NEED_RESEED; + + ret = __crypto_get_default_rng(); + if (ret) { + mutex_unlock(&crypto_default_rng_lock); + return ret; + } + + crypto_tfm_set_flags(crypto_rng_tfm(crypto_default_rng), + flags); + + i = min_t(int, buflen, sizeof(tmp)); + err = crypto_rng_get_bytes(crypto_default_rng, tmp, i); + + crypto_tfm_clear_flags(crypto_rng_tfm(crypto_default_rng), + flags); + + mutex_unlock(&crypto_default_rng_lock); + if (err) { + ret = err; + goto out; + } + + if (copy_to_user(buf, tmp, i)) { + ret = -EFAULT; + goto out; + } + + buflen -= i; + buf += i; + ret += i; + + if (!buflen) + goto out; + } else { + ret = crypto_get_default_rng(); + if (ret) + return ret; + } for (;;) { int err; @@ -269,6 +332,7 @@ static ssize_t crypto_devrandom_read(void __user *buf, size_t buflen) } } + out: crypto_put_default_rng(); memzero_explicit(tmp, sizeof(tmp)); diff --git a/drivers/char/random.c b/drivers/char/random.c index ff8fdb6b013e6..55f26452975a5 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1351,7 +1351,7 @@ SYSCALL_DEFINE3(getrandom, char __user *, ubuf, size_t, len, unsigned int, flags rcu_read_unlock(); if (rng) { - ret = rng->extrng_read(ubuf, len); + ret = rng->extrng_read(ubuf, len, !!(flags & GRND_RANDOM)); module_put(rng->owner); return ret; } @@ -1572,7 +1572,7 @@ static int extrng_release(struct inode *inode, struct file *filp) static ssize_t extrng_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { - return rcu_dereference_raw(extrng)->extrng_read(buf, nbytes); + return rcu_dereference_raw(extrng)->extrng_read(buf, nbytes, false); } const struct file_operations random_fops = { diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 8098891d38a84..2d36bebc4799c 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -156,6 +156,7 @@ #define CRYPTO_TFM_REQ_FORBID_WEAK_KEYS 0x00000100 #define CRYPTO_TFM_REQ_MAY_SLEEP 0x00000200 #define CRYPTO_TFM_REQ_MAY_BACKLOG 0x00000400 +#define CRYPTO_TFM_REQ_NEED_RESEED 0x00000800 /* * Miscellaneous stuff. diff --git a/include/linux/random.h b/include/linux/random.h index 76a1b085bcf33..fc5f73f4f2d33 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -11,7 +11,7 @@ #include struct random_extrng { - ssize_t (*extrng_read)(void __user *buf, size_t buflen); + ssize_t (*extrng_read)(void __user *buf, size_t buflen, bool reseed); struct module *owner; }; From 9755413a444dd0b1deaa3f1043bd5ea9e51b6562 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 3 Mar 2023 16:32:36 +0800 Subject: [PATCH 099/175] crypto: rng - Use a different crypto_rng for reseeding Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2174928 Upstream Status: RHEL only Currently, passing GRND_RANDOM flag to getrandom(2) in FIPS mode will cause reseeding of the crypto_default_rng which will then slow down other callers of getrandom(2) without the GRND_RANDOM flag. To avoid that slowdown, a separate crypto_reseed_rng (DRBG) is added for reseeding. Now those callers with the GRND_RANDOM flag set will reseed the crypto_reseed_rng and retrieve random data from it. This also fixes a bug where a large-enough GRND_RANDOM read could be broken into three or more reads, in which case a separate user request without GRND_RANDOM may come through in the middle without triggering any reseeds. Move the two RNGs onto separate cachelines to avoid interference. 
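A user-space sketch of the resulting split (illustrative only; GRND_RANDOM carries this reseeding meaning on a FIPS-mode kernel with this series applied, not on a stock kernel):

    #include <stdio.h>
    #include <sys/random.h>

    int main(void)
    {
        unsigned char fast[32], fresh[32];

        /* Default path: served by crypto_default_rng and never
         * stalled behind a concurrent reseed request. */
        if (getrandom(fast, sizeof(fast), 0) < 0)
            perror("getrandom");

        /* GRND_RANDOM: served by the separate crypto_reseed_rng,
         * which is reseeded with fresh entropy before generating. */
        if (getrandom(fresh, sizeof(fresh), GRND_RANDOM) < 0)
            perror("getrandom(GRND_RANDOM)");

        return 0;
    }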
Signed-off-by: Herbert Xu (cherry picked from RHEL commit 746aec2f) [resolve minor conflict in crypto_devrandom_read - our distro did not port some changes that are ultimately removed by this patch] --- crypto/rng.c | 85 +++++++++++++++++++++++----------------------------- 1 file changed, 37 insertions(+), 48 deletions(-) diff --git a/crypto/rng.c b/crypto/rng.c index 5f4d0c65e557f..2ed6a8a0ce5c2 100644 --- a/crypto/rng.c +++ b/crypto/rng.c @@ -26,7 +26,9 @@ #include "internal.h" -static DEFINE_MUTEX(crypto_default_rng_lock); +static ____cacheline_aligned_in_smp DEFINE_MUTEX(crypto_reseed_rng_lock); +static struct crypto_rng *crypto_reseed_rng; +static ____cacheline_aligned_in_smp DEFINE_MUTEX(crypto_default_rng_lock); struct crypto_rng *crypto_default_rng; EXPORT_SYMBOL_GPL(crypto_default_rng); static int crypto_default_rng_refcnt; @@ -116,12 +118,12 @@ struct crypto_rng *crypto_alloc_rng(const char *alg_name, u32 type, u32 mask) } EXPORT_SYMBOL_GPL(crypto_alloc_rng); -static int __crypto_get_default_rng(void) +static int crypto_get_rng(struct crypto_rng **rngp) { struct crypto_rng *rng; int err; - if (!crypto_default_rng) { + if (!*rngp) { rng = crypto_alloc_rng("stdrng", 0, 0); err = PTR_ERR(rng); if (IS_ERR(rng)) @@ -133,11 +135,9 @@ static int __crypto_get_default_rng(void) return err; } - crypto_default_rng = rng; + *rngp = rng; } - crypto_default_rng_refcnt++; - return 0; } @@ -146,7 +146,9 @@ int crypto_get_default_rng(void) int err; mutex_lock(&crypto_default_rng_lock); - err = __crypto_get_default_rng(); + err = crypto_get_rng(&crypto_default_rng); + if (!err) + crypto_default_rng_refcnt++; mutex_unlock(&crypto_default_rng_lock); return err; @@ -162,16 +164,17 @@ void crypto_put_default_rng(void) EXPORT_SYMBOL_GPL(crypto_put_default_rng); #if defined(CONFIG_CRYPTO_RNG) || defined(CONFIG_CRYPTO_RNG_MODULE) -int crypto_del_default_rng(void) +static int crypto_del_rng(struct crypto_rng **rngp, int *refcntp, + struct mutex *lock) { int err = -EBUSY; - mutex_lock(&crypto_default_rng_lock); - if (crypto_default_rng_refcnt) + mutex_lock(lock); + if (refcntp && *refcntp) goto out; - crypto_free_rng(crypto_default_rng); - crypto_default_rng = NULL; + crypto_free_rng(*rngp); + *rngp = NULL; err = 0; @@ -180,6 +183,14 @@ int crypto_del_default_rng(void) return err; } + +int crypto_del_default_rng(void) +{ + return crypto_del_rng(&crypto_default_rng, &crypto_default_rng_refcnt, + &crypto_default_rng_lock) ?: + crypto_del_rng(&crypto_reseed_rng, NULL, + &crypto_reseed_rng_lock); +} EXPORT_SYMBOL_GPL(crypto_del_default_rng); #endif @@ -236,6 +247,7 @@ EXPORT_SYMBOL_GPL(crypto_unregister_rngs); static ssize_t crypto_devrandom_read(void __user *buf, size_t buflen, bool reseed) { + struct crypto_rng *rng; u8 tmp[256]; ssize_t ret; @@ -243,63 +255,38 @@ static ssize_t crypto_devrandom_read(void __user *buf, size_t buflen, return 0; if (reseed) { - int err; - int i; u32 flags = 0; /* If reseeding is requested, acquire a lock on - * crypto_default_rng so it is not swapped out until + * crypto_reseed_rng so it is not swapped out until * the initial random bytes are generated. * * The algorithm implementation is also protected with * a separate mutex (drbg->drbg_mutex) around the * reseed-and-generate operation. */ - mutex_lock(&crypto_default_rng_lock); + mutex_lock(&crypto_reseed_rng_lock); /* If crypto_default_rng is not set, it will be seeded * at creation in __crypto_get_default_rng and thus no * reseeding is needed. 
*/ - if (crypto_default_rng) + if (crypto_reseed_rng) flags |= CRYPTO_TFM_REQ_NEED_RESEED; - ret = __crypto_get_default_rng(); + ret = crypto_get_rng(&crypto_reseed_rng); if (ret) { - mutex_unlock(&crypto_default_rng_lock); + mutex_unlock(&crypto_reseed_rng_lock); return ret; } - crypto_tfm_set_flags(crypto_rng_tfm(crypto_default_rng), - flags); - - i = min_t(int, buflen, sizeof(tmp)); - err = crypto_rng_get_bytes(crypto_default_rng, tmp, i); - - crypto_tfm_clear_flags(crypto_rng_tfm(crypto_default_rng), - flags); - - mutex_unlock(&crypto_default_rng_lock); - if (err) { - ret = err; - goto out; - } - - if (copy_to_user(buf, tmp, i)) { - ret = -EFAULT; - goto out; - } - - buflen -= i; - buf += i; - ret += i; - - if (!buflen) - goto out; + rng = crypto_reseed_rng; + crypto_tfm_set_flags(crypto_rng_tfm(rng), flags); } else { ret = crypto_get_default_rng(); if (ret) return ret; + rng = crypto_default_rng; } for (;;) { @@ -307,7 +294,7 @@ static ssize_t crypto_devrandom_read(void __user *buf, size_t buflen, int i; i = min_t(int, buflen, sizeof(tmp)); - err = crypto_rng_get_bytes(crypto_default_rng, tmp, i); + err = crypto_rng_get_bytes(rng, tmp, i); if (err) { ret = err; break; @@ -332,8 +319,10 @@ static ssize_t crypto_devrandom_read(void __user *buf, size_t buflen, } } - out: - crypto_put_default_rng(); + if (reseed) + mutex_unlock(&crypto_reseed_rng_lock); + else + crypto_put_default_rng(); memzero_explicit(tmp, sizeof(tmp)); return ret; From fc001ed505f9a1926ebf35f2a1f923f21b1ab721 Mon Sep 17 00:00:00 2001 From: Mahmoud Adam Date: Fri, 23 Jun 2023 17:08:41 +0000 Subject: [PATCH 100/175] crypto: dh - Add SP800-56A rev 3 Pair-wise Consistency check add pair-wise consistency check in fips mode as per SP800-56 rev 3 section 5.6.2.1.4: For an FFC key pair (x, y): Use the private key, x, along with the generator g and prime modulus p included in the domain parameters associated with the key pair to compute g^x mod p. Compare the result to the public key, y. Signed-off-by: Mahmoud Adam [The original patch reads 'ret = _compute_val(ctx, ctx->g, y);' in dh_pairwaise_test - that will overwrite the public key with the value we intend to check it against. Instead, use the previously allocated val to hold this value. Also use mpi_cmp which expects two MPIs instead of mpi_cmp_ui, which expects an MPI and an unsigned long -hailmo ] --- crypto/dh.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/crypto/dh.c b/crypto/dh.c index 99c3b2ef7adca..5e81fdca41ff6 100644 --- a/crypto/dh.c +++ b/crypto/dh.c @@ -163,6 +163,37 @@ static int dh_is_pubkey_valid(struct dh_ctx *ctx, MPI y) return 0; } +/* + * SP800-56A pair-wise consistency check: + * section 5.6.2.1.4: + * For an FFC key pair (x, y): Use the private key, x, along with the generator g and + * prime modulus p included in the domain parameters associated with the key pair + * to compute g^x mod p. Compare the result to the public key, y. 
+ */ +static int dh_pairwise_test(struct dh_ctx *ctx, MPI y) +{ + int ret; + MPI val; + val = mpi_alloc(0); + if (!val) + return -ENOMEM; + + ret = _compute_val(ctx, ctx->g, val); /*g^x mod p*/ + if (ret) { + goto err_free_val; + } + + if (mpi_cmp(val, y)) { + ret = -EINVAL; + goto err_free_val; + } + + ret = 0; +err_free_val: + mpi_free(val); + return ret; +} + static int dh_compute_value(struct kpp_request *req) { struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); @@ -226,6 +257,11 @@ static int dh_compute_value(struct kpp_request *req) /* SP800-56A rev 3 5.6.2.1.3 key check */ } else { + if (fips_enabled && dh_pairwise_test(ctx, val)) { + fips_fail_notify(); + panic("dh_pairwise_test failed"); + } + if (dh_is_pubkey_valid(ctx, val)) { ret = -EAGAIN; goto err_free_val; From 1b7ecd5c44054e294e0386e4b0245adcb2fa7af3 Mon Sep 17 00:00:00 2001 From: Mahmoud Adam Date: Fri, 23 Jun 2023 18:01:06 +0000 Subject: [PATCH 101/175] crypto: ecc - Add SP800-56A rev 3 Pair-wise Consistency check add pair-wise consistency check in fips mode as per SP800-56 rev 3 section 5.6.2.1.4: For an ECC key pair (d, Q): Use the private key, d, along with the generator G and other domain parameters associated with the key pair, to compute dG (according to the rules of elliptic-curve arithmetic). Compare the result to the public key, Q. If dG is not equal to Q, then the pair-wise consistency test fails. Signed-off-by: Mahmoud Adam [ Since ecc_pairwise_test is only used in this function, remove it from ecc.h and mark it as static. Also move it to be declared before it is used. - hailmo ] --- crypto/ecc.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/crypto/ecc.c b/crypto/ecc.c index 7315217c8f733..7e25fa1e122f5 100644 --- a/crypto/ecc.c +++ b/crypto/ecc.c @@ -1505,6 +1505,56 @@ int ecc_gen_privkey(unsigned int curve_id, unsigned int ndigits, u64 *privkey) } EXPORT_SYMBOL(ecc_gen_privkey); +/** +* SP800-56A section 5.6.2.1.4 Pair-Wise Consistency Test +* ecc_pairwise_test() - Pair-wise Consistency test +* +* @curve: elliptic curve domain parameters +* @private_key: pregenerated private key for the given curve +* @pk: public key as a point +* @ndigits: curve's number of digits +* +* Pair-wise Consistency test according to SP800-56A section 5.6.2.1.4 +* +* Return: 0 if test is successful, -EINVAL if test is failed. 
+*/ +static int ecc_pairwise_test(const struct ecc_curve *curve, + const u64 *private_key, + struct ecc_point *pk, + unsigned int ndigits) +{ + u64 priv[ECC_MAX_DIGITS]; + struct ecc_point *epk; + int ret; + + ecc_swap_digits(private_key, priv, ndigits); + + epk = ecc_alloc_point(ndigits); + if (!epk) { + ret = -ENOMEM; + goto err; + } + + ecc_point_mult(epk, &curve->g, priv, NULL, curve, ndigits); + + /* check expected public key against the public_key */ + if (vli_cmp(epk->x, pk->x, ndigits)) { + ret = -EINVAL; + goto err_free_point; + } + + if (vli_cmp(epk->y, pk->y, ndigits)) { + ret = -EINVAL; + goto err_free_point; + } + + ret = 0; +err_free_point: + ecc_free_point(epk); +err: + return ret; +} + int ecc_make_pub_key(unsigned int curve_id, unsigned int ndigits, const u64 *private_key, u64 *public_key) { @@ -1534,6 +1584,12 @@ int ecc_make_pub_key(unsigned int curve_id, unsigned int ndigits, goto err_free_point; } + if (fips_enabled && + ecc_pairwise_test(curve, private_key, pk, ndigits)) { + fips_fail_notify(); + panic("ecc_pairwise_test failed"); + } + ecc_swap_digits(pk->x, public_key, ndigits); ecc_swap_digits(pk->y, &public_key[ndigits], ndigits); From 4262bc2eb1dc8c8d94b31bdbd4ce93f30cd045e5 Mon Sep 17 00:00:00 2001 From: Mahmoud Adam Date: Fri, 9 Jun 2023 14:45:11 +0000 Subject: [PATCH 102/175] KEYS: use kfree_sensitive with key key member might contain private part of the key, so better use kfree_sensitive to free it Signed-off-by: Mahmoud Adam [refactored for 6.1.34 -hailmo@] --- crypto/asymmetric_keys/public_key.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/crypto/asymmetric_keys/public_key.c b/crypto/asymmetric_keys/public_key.c index 50c933f86b218..170f069823815 100644 --- a/crypto/asymmetric_keys/public_key.c +++ b/crypto/asymmetric_keys/public_key.c @@ -43,7 +43,7 @@ static void public_key_describe(const struct key *asymmetric_key, void public_key_free(struct public_key *key) { if (key) { - kfree(key->key); + kfree_sensitive(key->key); kfree(key->params); kfree(key); } @@ -218,7 +218,7 @@ static int software_key_query(const struct kernel_pkey_params *params, ret = 0; error_free_key: - kfree(key); + kfree_sensitive(key); error_free_tfm: crypto_free_akcipher(tfm); pr_devel("<==%s() = %d\n", __func__, ret); @@ -303,7 +303,7 @@ static int software_key_eds_op(struct kernel_pkey_params *params, ret = req->dst_len; error_free_key: - kfree(key); + kfree_sensitive(key); error_free_req: akcipher_request_free(req); error_free_tfm: @@ -460,7 +460,7 @@ int public_key_verify_signature(const struct public_key *pkey, ret = crypto_wait_req(crypto_akcipher_verify(req), &cwait); error_free_buf: - kfree(buf); + kfree_sensitive(buf); error_free_req: akcipher_request_free(req); error_free_tfm: From 47238af324cbcd3e0071128f621165225342a03f Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:51:56 -0800 Subject: [PATCH 103/175] mm: add bdi_set_strict_limit() function commit 8e9d5ead865a1a7af74a444d2f00f1ef4539bfba upstream. Patch series "mm/block: add bdi sysfs knobs", v4. At meta network block devices (nbd) are used to implement remote block storage. In testing and during production it has been observed that these network block devices can consume a huge portion of the dirty writeback cache and writeback can take a considerable time. To be able to give stricter limits, I'm proposing the following changes: 1) introduce strictlimit knob Currently the max_ratio knob exists to limit the dirty_memory. 
However this knob only applies once (dirty_ratio + dirty_background_ratio) / 2 has been reached. With the BDI_CAP_STRICTLIMIT flag, the max_ratio can be applied without reaching that limit. This change exposes that knob. This knob can also be useful for NFS, fuse filesystems and USB devices. 2) Use part of 1000000 internal calculation The max_ratio is based on percentage. With the current machine sizes percentage values can be very high (1% of a 256GB main memory is already 2.5GB). This change uses part of 1000000 instead of percentages for the internal calculations. 3) Introduce two new sysfs knobs: min_bytes and max_bytes. Currently all calculations are based on ratio, but for a user it is often more convenient to specify a limit in bytes. The new knobs will not store bytes values, instead they will translate the byte value to a corresponding ratio. As the internal values are now part of 1000000, the ratio is closer to the specified value. However the value should be seen more as an approximation as it can fluctuate over time. 4) Introduce two new sysfs knobs: min_ratio_fine and max_ratio_fine. The granularity for the existing sysfs bdi knobs min_ratio and max_ratio is based on percentage values. The new sysfs bdi knobs min_ratio_fine and max_ratio_fine allow specifying the ratio as part of 1 million. This patch (of 20): This adds the bdi_set_strict_limit function to be able to set/unset the BDI_CAP_STRICTLIMIT flag. Link: https://lkml.kernel.org/r/20221119005215.3052436-1-shr@devkernel.io Link: https://lkml.kernel.org/r/20221119005215.3052436-2-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Jens Axboe Cc: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Kuniyuki Iwashima --- include/linux/backing-dev.h | 1 + mm/page-writeback.c | 15 +++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 439815cc1ab96..9c984ffc8a0aa 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -104,6 +104,7 @@ static inline unsigned long wb_stat_error(void) int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); +int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit); /* * Flags in backing_dev_info::capability diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7dbac6ede7242..bc34e3330f3da 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -720,6 +720,21 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) } EXPORT_SYMBOL(bdi_set_max_ratio); +int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit) +{ + if (strict_limit > 1) + return -EINVAL; + + spin_lock_bh(&bdi_lock); + if (strict_limit) + bdi->capabilities |= BDI_CAP_STRICTLIMIT; + else + bdi->capabilities &= ~BDI_CAP_STRICTLIMIT; + spin_unlock_bh(&bdi_lock); + + return 0; +} + static unsigned long dirty_freerun_ceiling(unsigned long thresh, unsigned long bg_thresh) { From 415d434363964f57e85554f56c71ecd457dff66c Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:51:57 -0800 Subject: [PATCH 104/175] mm: add knob /sys/class/bdi//strict_limit commit 27bbe9d48d4e298864e18b39f091342c68b81637 upstream. Add a new knob to /sys/class/bdi//strict_limit. This new knob allows setting/clearing the flag BDI_CAP_STRICTLIMIT in the bdi capabilities.
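A user-space sketch of driving the new knob (the bdi name 259:0 below is a placeholder; pick the target device's entry under /sys/class/bdi/):

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        /* Placeholder path; substitute the bdi of the device. */
        int fd = open("/sys/class/bdi/259:0/strict_limit", O_WRONLY);

        if (fd < 0) {
            perror("open");
            return 1;
        }
        /* "1" sets BDI_CAP_STRICTLIMIT, "0" clears it; values above
         * 1 are rejected with EINVAL by bdi_set_strict_limit(). */
        if (write(fd, "1", 1) != 1)
            perror("write");
        close(fd);
        return 0;
    }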
Link: https://lkml.kernel.org/r/20221119005215.3052436-3-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Chris Mason Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Kuniyuki Iwashima --- mm/backing-dev.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index bf5525c2e561a..e581102276f23 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -209,11 +209,40 @@ static ssize_t stable_pages_required_show(struct device *dev, } static DEVICE_ATTR_RO(stable_pages_required); +static ssize_t strict_limit_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + unsigned int strict_limit; + ssize_t ret; + + ret = kstrtouint(buf, 10, &strict_limit); + if (ret < 0) + return ret; + + ret = bdi_set_strict_limit(bdi, strict_limit); + if (!ret) + ret = count; + + return ret; +} + +static ssize_t strict_limit_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%d\n", + !!(bdi->capabilities & BDI_CAP_STRICTLIMIT)); +} +static DEVICE_ATTR_RW(strict_limit); + static struct attribute *bdi_dev_attrs[] = { &dev_attr_read_ahead_kb.attr, &dev_attr_min_ratio.attr, &dev_attr_max_ratio.attr, &dev_attr_stable_pages_required.attr, + &dev_attr_strict_limit.attr, NULL, }; ATTRIBUTE_GROUPS(bdi_dev); From b504fb9441d63de31069a5b3eba2dbdff67f773e Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:51:58 -0800 Subject: [PATCH 105/175] mm: document /sys/class/bdi/<bdi>/strict_limit knob commit 16b837eb84e6948f92411eb32e97a05f89733ddc upstream. This documents the new /sys/class/bdi/<bdi>/strict_limit knob. Link: https://lkml.kernel.org/r/20221119005215.3052436-4-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Chris Mason Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Kuniyuki Iwashima --- Documentation/ABI/testing/sysfs-class-bdi | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-class-bdi b/Documentation/ABI/testing/sysfs-class-bdi index 6d2a2fc189dd6..68b5d4018c2f7 100644 --- a/Documentation/ABI/testing/sysfs-class-bdi +++ b/Documentation/ABI/testing/sysfs-class-bdi @@ -55,6 +55,17 @@ Description: mount that is prone to get stuck, or a FUSE mount which cannot be trusted to play fair. + (read-write) +What: /sys/class/bdi/<bdi>/strict_limit +Date: October 2022 +Contact: Stefan Roesch +Description: + Forces per-BDI checks for the share of given device in the write-back + cache even before the global background dirty limit is reached. This + is useful in situations where the global limit is much higher than + affordable for given relatively slow (or untrusted) device. Turning + strictlimit on has no visible effect if max_ratio is equal to 100%. + (read-write) What: /sys/class/bdi/<bdi>/stable_pages_required Date: January 2008 From c8d557f103bd181d9fc7ab5ac19a16792e4f7b1a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 27 Jul 2023 18:49:29 +0000 Subject: [PATCH 106/175] ip: Bump default ttl to 127. In 4.14 and 5.4, the default TTL was bumped up to 255, but we moved the change to the sysctl-defaults package. However, sysctl config is not applied to netns. Let's bump it again, but to 127, as some nodes could block packets with TTL 255.
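For context, a hedged sketch of why changing the define, unlike a sysctl default shipped in user space, also covers network namespaces: each new netns initialises its TTL sysctl from this macro, roughly as in net/ipv4/af_inet.c (simplified here for illustration).

	/* simplified from inet_init_net(); illustration only */
	static __net_init int inet_init_net(struct net *net)
	{
		net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;	/* now 127 */
		return 0;
	}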
Signed-off-by: Kuniyuki Iwashima --- include/uapi/linux/ip.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h index 283dec7e36451..bf7ece5b157b3 100644 --- a/include/uapi/linux/ip.h +++ b/include/uapi/linux/ip.h @@ -67,7 +67,7 @@ #define IPVERSION 4 #define MAXTTL 255 -#define IPDEFTTL 64 +#define IPDEFTTL 127 #define IPOPT_OPTVAL 0 #define IPOPT_OLEN 1 From 2bf046f799da455522785e07d2cc079917ab0cb4 Mon Sep 17 00:00:00 2001 From: David de Bruyn Date: Thu, 9 Feb 2023 09:19:09 +0000 Subject: [PATCH 107/175] Enable ptIOMMU for all supported platforms Currently ptIOMMU functionality is only enabled for ARM64. This patch removes the architecture-specific code to enable the ptIOMMU and replaces it with an architecture-independent strategy. Signed-off-by: David de Bruyn Signed-off-by: Hazem Mohamed Abuelfotoh --- arch/arm64/mm/dma-mapping.c | 6 ------ drivers/acpi/scan.c | 6 ++++++ drivers/of/device.c | 6 ++++++ include/linux/dma-map-ops.h | 5 +++++ 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 6ed88059d8b4a..5240f6acad648 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include @@ -78,9 +77,4 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, iommu_setup_dma_ops(dev, dma_base, dma_base + size - 1); xen_setup_dma_ops(dev); - -#ifdef CONFIG_DMA_PAGE_TOUCHING - if (!dev->dma_ops) - setup_dma_page_touching_ops(dev); -#endif } diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 293cdf486fd81..f335d810fd1c1 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -1635,6 +1636,11 @@ int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr, arch_setup_dma_ops(dev, 0, U64_MAX, iommu, attr == DEV_DMA_COHERENT); +#ifdef CONFIG_DMA_PAGE_TOUCHING + if (!dev->dma_ops) + setup_dma_page_touching_ops(dev); +#endif + return 0; } EXPORT_SYMBOL_GPL(acpi_dma_configure_id); diff --git a/drivers/of/device.c b/drivers/of/device.c index ce225d2590b54..0cc46dec4107f 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -8,6 +8,7 @@ #include #include /* for bus_dma_region */ #include +#include #include #include #include @@ -216,6 +217,11 @@ int of_dma_configure_id(struct device *dev, struct device_node *np, arch_setup_dma_ops(dev, dma_start, size, iommu, coherent); +#ifdef CONFIG_DMA_PAGE_TOUCHING + if (!dev->dma_ops) + setup_dma_page_touching_ops(dev); +#endif + if (!iommu) of_dma_set_restricted_buffer(dev, np); diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index d678afeb8a13a..2a91f3ded2787 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -7,6 +7,7 @@ #define _LINUX_DMA_MAP_OPS_H #include +#include #include struct cma; @@ -368,6 +369,10 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, static inline void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, const struct iommu_ops *iommu, bool coherent) { +#ifdef CONFIG_DMA_PAGE_TOUCHING + if (!dev->dma_ops) + setup_dma_page_touching_ops(dev); +#endif } #endif /* CONFIG_ARCH_HAS_SETUP_DMA_OPS */ From bbd8c9ba77419c8e56311426d37c8bf7a1f42552 Mon Sep 17 00:00:00 2001 From: Robert Holmes Date: Tue, 23 Apr 2019 07:39:29 +0000 Subject: [PATCH 108/175] KEYS: Make use of platform keyring for module signature verify This patch
completes commit 278311e417be ("kexec, KEYS: Make use of platform keyring for signature verify") which, while adding the platform keyring for bzImage verification, neglected to also add this keyring for module verification. As such, kernel modules signed with keys from the MokList variable were not successfully verified. Signed-off-by: Robert Holmes Signed-off-by: Jeremy Cline --- kernel/module/signing.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/module/signing.c b/kernel/module/signing.c index a2ff4242e623d..f0d2be1ee4f1c 100644 --- a/kernel/module/signing.c +++ b/kernel/module/signing.c @@ -61,10 +61,17 @@ int mod_verify_sig(const void *mod, struct load_info *info) modlen -= sig_len + sizeof(ms); info->len = modlen; - return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len, + ret = verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len, VERIFY_USE_SECONDARY_KEYRING, VERIFYING_MODULE_SIGNATURE, NULL, NULL); + if (ret == -ENOKEY && IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING)) { + ret = verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len, + VERIFY_USE_PLATFORM_KEYRING, + VERIFYING_MODULE_SIGNATURE, + NULL, NULL); + } + return ret; } int module_sig_check(struct load_info *info, int flags) From b625a027ae0846cda13f4b18cf1808e14f3c1ba7 Mon Sep 17 00:00:00 2001 From: Samuel Mendoza-Jonas Date: Tue, 11 Jul 2023 18:46:54 -0700 Subject: [PATCH 109/175] scripts/sign_file: Add option to keep signing certificate Signed-off-by: Samuel Mendoza-Jonas --- scripts/sign-file.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/scripts/sign-file.c b/scripts/sign-file.c index 3edb156ae52c3..2549c880c2019 100644 --- a/scripts/sign-file.c +++ b/scripts/sign-file.c @@ -77,7 +77,7 @@ static __attribute__((noreturn)) void format(void) { fprintf(stderr, - "Usage: scripts/sign-file [-dp] <hash algo> <key> <x509> <module> [<dest>]\n"); + "Usage: scripts/sign-file [-dpc] <hash algo> <key> <x509> <module> [<dest>]\n"); fprintf(stderr, " scripts/sign-file -s <raw sig> <hash algo> <x509> <module> [<dest>]\n"); exit(2); @@ -222,6 +222,7 @@ int main(int argc, char **argv) bool save_sig = false, replace_orig; bool sign_only = false; bool raw_sig = false; + unsigned int cert_flags = CMS_NOCERTS; unsigned char buf[4096]; unsigned long module_size, sig_size; unsigned int use_signed_attrs; @@ -249,11 +250,12 @@ int main(int argc, char **argv) #endif do { - opt = getopt(argc, argv, "sdpk"); + opt = getopt(argc, argv, "sdpkc"); switch (opt) { case 's': raw_sig = true; break; case 'p': save_sig = true; break; case 'd': sign_only = true; save_sig = true; break; + case 'c': cert_flags = 0; break; #ifndef USE_PKCS7 case 'k': use_keyid = CMS_USE_KEYID; break; #endif @@ -313,16 +315,16 @@ int main(int argc, char **argv) #ifndef USE_PKCS7 /* Load the signature message from the digest buffer.
*/ cms = CMS_sign(NULL, NULL, NULL, NULL, - CMS_NOCERTS | CMS_PARTIAL | CMS_BINARY | + cert_flags | CMS_PARTIAL | CMS_BINARY | CMS_DETACHED | CMS_STREAM); ERR(!cms, "CMS_sign"); ERR(!CMS_add1_signer(cms, x509, private_key, digest_algo, - CMS_NOCERTS | CMS_BINARY | + cert_flags | CMS_BINARY | CMS_NOSMIMECAP | use_keyid | use_signed_attrs), "CMS_add1_signer"); - ERR(CMS_final(cms, bm, NULL, CMS_NOCERTS | CMS_BINARY) != 1, + ERR(CMS_final(cms, bm, NULL, cert_flags | CMS_BINARY) != 1, "CMS_final"); #else From d2cc6d5cbc48860ad273ed9a3652f605a336ec53 Mon Sep 17 00:00:00 2001 From: David Arinzon Date: Tue, 8 Aug 2023 08:06:22 +0000 Subject: [PATCH 110/175] AL2023 6.1 Update ena driver to 2.8.9g Signed-off-by: David Arinzon Signed-off-by: SeongJae Park --- drivers/amazon/net/ena/ena_admin_defs.h | 8 +++++++ drivers/amazon/net/ena/ena_devlink.c | 24 +++++++++---------- drivers/amazon/net/ena/ena_netdev.c | 14 ++++++++--- drivers/amazon/net/ena/ena_netdev.h | 3 ++- drivers/amazon/net/ena/ena_phc.c | 5 +++- drivers/amazon/net/ena/kcompat.h | 31 +++++++++++++++++++------ 6 files changed, 61 insertions(+), 24 deletions(-) diff --git a/drivers/amazon/net/ena/ena_admin_defs.h b/drivers/amazon/net/ena/ena_admin_defs.h index b3a9f1aec52b3..f34b44a6fa230 100644 --- a/drivers/amazon/net/ena/ena_admin_defs.h +++ b/drivers/amazon/net/ena/ena_admin_defs.h @@ -435,6 +435,10 @@ struct ena_admin_basic_stats { u32 tx_drops_low; u32 tx_drops_high; + + u32 rx_overruns_low; + + u32 rx_overruns_high; }; /* ENI Statistics Command. */ @@ -1223,6 +1227,10 @@ struct ena_admin_aenq_keep_alive_desc { u32 tx_drops_low; u32 tx_drops_high; + + u32 rx_overruns_low; + + u32 rx_overruns_high; }; struct ena_admin_ena_mmio_req_read_less_resp { diff --git a/drivers/amazon/net/ena/ena_devlink.c b/drivers/amazon/net/ena/ena_devlink.c index f140d024ef166..43ce1ae2cebaa 100644 --- a/drivers/amazon/net/ena/ena_devlink.c +++ b/drivers/amazon/net/ena/ena_devlink.c @@ -108,9 +108,9 @@ void ena_devlink_params_get(struct devlink *devlink) if (!ena_is_devlink_params_registered(devlink)) return; #endif - err = devlink_param_driverinit_value_get(devlink, - ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE, - &val); + err = devl_param_driverinit_value_get(devlink, + ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE, + &val); + if (err) { netdev_err(adapter->netdev, "Failed to query LLQ header size param\n"); return; @@ -119,7 +119,7 @@ void ena_devlink_params_get(struct devlink *devlink) adapter->large_llq_header_enabled = val.vbool; #ifdef ENA_PHC_SUPPORT - err = devlink_param_driverinit_value_get(devlink, ENA_DEVLINK_PARAM_ID_PHC_ENABLE, &val); + err = devl_param_driverinit_value_get(devlink, ENA_DEVLINK_PARAM_ID_PHC_ENABLE, &val); if (err) { netdev_err(adapter->netdev, "Failed to query PHC param\n"); return; @@ -140,9 +140,9 @@ void ena_devlink_disable_large_llq_header_param(struct devlink *devlink) #endif value.vbool = false; - devlink_param_driverinit_value_set(devlink, - ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE, - value); + devl_param_driverinit_value_set(devlink, + ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE, + value); } #ifdef ENA_PHC_SUPPORT @@ -157,7 +157,7 @@ void ena_devlink_disable_phc_param(struct devlink *devlink) #endif value.vbool = false; - devlink_param_driverinit_value_set(devlink, ENA_DEVLINK_PARAM_ID_PHC_ENABLE, value); + devl_param_driverinit_value_set(devlink, ENA_DEVLINK_PARAM_ID_PHC_ENABLE, value); } #endif /* ENA_PHC_SUPPORT */ @@ -277,13 +277,13 @@ static int ena_devlink_configure_params(struct devlink *devlink) } value.vbool =
adapter->large_llq_header_enabled; - devlink_param_driverinit_value_set(devlink, - ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE, - value); + devl_param_driverinit_value_set(devlink, + ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE, + value); #ifdef ENA_PHC_SUPPORT value.vbool = ena_phc_is_enabled(adapter); - devlink_param_driverinit_value_set(devlink, ENA_DEVLINK_PARAM_ID_PHC_ENABLE, value); + devl_param_driverinit_value_set(devlink, ENA_DEVLINK_PARAM_ID_PHC_ENABLE, value); #endif /* ENA_PHC_SUPPORT */ #ifdef ENA_DEVLINK_RELOAD_SUPPORT_ADVERTISEMENT_NEEDED diff --git a/drivers/amazon/net/ena/ena_netdev.c b/drivers/amazon/net/ena/ena_netdev.c index 072be72e14d56..759926e8f8716 100644 --- a/drivers/amazon/net/ena/ena_netdev.c +++ b/drivers/amazon/net/ena/ena_netdev.c @@ -46,6 +46,8 @@ MODULE_VERSION(DRV_MODULE_GENERATION); #define DEFAULT_MSG_ENABLE (NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_IFUP | \ NETIF_MSG_IFDOWN | NETIF_MSG_TX_ERR | NETIF_MSG_RX_ERR) + +#define ENA_HIGH_LOW_TO_U64(high, low) ((((u64)(high)) << 32) | (low)) #ifndef ENA_LINEAR_FRAG_SUPPORTED #define ENA_SKB_PULL_MIN_LEN 64 @@ -3222,6 +3224,7 @@ static struct rtnl_link_stats64 *ena_get_stats64(struct net_device *netdev, struct ena_ring *rx_ring, *tx_ring; u64 xdp_rx_drops = 0; unsigned int start; + u64 rx_overruns; u64 rx_drops; u64 tx_drops; int i; @@ -3268,6 +3271,7 @@ static struct rtnl_link_stats64 *ena_get_stats64(struct net_device *netdev, start = ena_u64_stats_fetch_begin(&adapter->syncp); rx_drops = adapter->dev_stats.rx_drops; tx_drops = adapter->dev_stats.tx_drops; + rx_overruns = adapter->dev_stats.rx_overruns; } while (ena_u64_stats_fetch_retry(&adapter->syncp, start)); stats->rx_dropped = rx_drops + xdp_rx_drops; @@ -3282,8 +3286,9 @@ static struct rtnl_link_stats64 *ena_get_stats64(struct net_device *netdev, stats->rx_fifo_errors = 0; stats->rx_missed_errors = 0; stats->tx_window_errors = 0; + stats->rx_over_errors = rx_overruns; - stats->rx_errors = 0; + stats->rx_errors = stats->rx_over_errors; stats->tx_errors = 0; #ifndef NDO_GET_STATS_64_V2 return stats; @@ -4987,14 +4992,16 @@ static void ena_keep_alive_wd(void *adapter_data, { struct ena_adapter *adapter = (struct ena_adapter *)adapter_data; struct ena_admin_aenq_keep_alive_desc *desc; + u64 rx_overruns; u64 rx_drops; u64 tx_drops; desc = (struct ena_admin_aenq_keep_alive_desc *)aenq_e; adapter->last_keep_alive_jiffies = jiffies; - rx_drops = ((u64)desc->rx_drops_high << 32) | desc->rx_drops_low; - tx_drops = ((u64)desc->tx_drops_high << 32) | desc->tx_drops_low; + rx_drops = ENA_HIGH_LOW_TO_U64(desc->rx_drops_high, desc->rx_drops_low); + tx_drops = ENA_HIGH_LOW_TO_U64(desc->tx_drops_high, desc->tx_drops_low); + rx_overruns = ENA_HIGH_LOW_TO_U64(desc->rx_overruns_high, desc->rx_overruns_low); u64_stats_update_begin(&adapter->syncp); /* These stats are accumulated by the device, so the counters indicate @@ -5002,6 +5009,7 @@ static void ena_keep_alive_wd(void *adapter_data, */ adapter->dev_stats.rx_drops = rx_drops; adapter->dev_stats.tx_drops = tx_drops; + adapter->dev_stats.rx_overruns = rx_overruns; u64_stats_update_end(&adapter->syncp); } diff --git a/drivers/amazon/net/ena/ena_netdev.h b/drivers/amazon/net/ena/ena_netdev.h index 92e03d79971f7..97bdd08853400 100644 --- a/drivers/amazon/net/ena/ena_netdev.h +++ b/drivers/amazon/net/ena/ena_netdev.h @@ -29,7 +29,7 @@ #define DRV_MODULE_GEN_MAJOR 2 #define DRV_MODULE_GEN_MINOR 8 -#define DRV_MODULE_GEN_SUBMINOR 6 +#define DRV_MODULE_GEN_SUBMINOR 9 #define DRV_MODULE_NAME "ena" #ifndef DRV_MODULE_GENERATION @@ -361,6 
+361,7 @@ struct ena_stats_dev { u64 admin_q_pause; u64 rx_drops; u64 tx_drops; + u64 rx_overruns; u64 reset_fail; }; diff --git a/drivers/amazon/net/ena/ena_phc.c b/drivers/amazon/net/ena/ena_phc.c index 8b89ae9efb4ec..5b637ef79bc04 100644 --- a/drivers/amazon/net/ena/ena_phc.c +++ b/drivers/amazon/net/ena/ena_phc.c @@ -7,11 +7,12 @@ #include "ena_phc.h" #ifdef ENA_PHC_SUPPORT - +#ifdef ENA_PHC_SUPPORT_ADJFREQ static int ena_phc_adjfreq(struct ptp_clock_info *clock_info, s32 ppb) { return -EOPNOTSUPP; } +#endif /* ENA_PHC_SUPPORT_ADJFREQ */ static int ena_phc_adjtime(struct ptp_clock_info *clock_info, s64 delta) { @@ -109,7 +110,9 @@ static struct ptp_clock_info ena_ptp_clock_info = { .n_ext_ts = 0, .n_per_out = 0, .pps = 0, +#ifdef ENA_PHC_SUPPORT_ADJFREQ .adjfreq = ena_phc_adjfreq, +#endif /* ENA_PHC_SUPPORT_ADJFREQ */ .adjtime = ena_phc_adjtime, #ifdef ENA_PHC_SUPPORT_GETTIME64 #ifdef ENA_PHC_SUPPORT_GETTIME64_EXTENDED diff --git a/drivers/amazon/net/ena/kcompat.h b/drivers/amazon/net/ena/kcompat.h index 1b3e7edf570b0..62ddd400e787f 100644 --- a/drivers/amazon/net/ena/kcompat.h +++ b/drivers/amazon/net/ena/kcompat.h @@ -790,9 +790,9 @@ do { \ #define ENA_DEVLINK_RECEIVES_DEVICE_ON_ALLOC #endif -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) || \ +#if (KERNEL_VERSION(5, 16, 0) <= LINUX_VERSION_CODE && LINUX_VERSION_CODE < KERNEL_VERSION(6, 3, 0)) || \ (defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) || \ - (defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0)) + (defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0) && !(defined(FEDORA_RELEASE))) #define ENA_DEVLINK_RELOAD_SUPPORT_ADVERTISEMENT_NEEDED #endif @@ -866,8 +866,8 @@ static inline void netdev_rss_key_fill(void *buffer, size_t len) * UEK ***********|--------------|--------|------| */ #if (defined(IS_UEK) && !ENA_KERNEL_VERSION_GTE(4, 1, 12, 124, 43, 1)) || \ - (defined(ubuntu)) || \ - (!defined(IS_UEK) && !defined(ubuntu) && \ + (defined(UBUNTU_VERSION_CODE)) || \ + (!defined(IS_UEK) && !defined(UBUNTU_VERSION_CODE) && \ !(KERNEL_VERSION(4, 4, 216) <= LINUX_VERSION_CODE && LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0))) static inline int page_ref_count(struct page *page) { @@ -967,7 +967,11 @@ xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start, #define ENA_XDP_XMIT_FREES_FAILED_DESCS_INTERNALLY #endif -#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0) && \ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0) && \ + !(LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 188) && \ + LINUX_VERSION_CODE < KERNEL_VERSION(5, 11, 0)) && \ + !(LINUX_VERSION_CODE >= KERNEL_VERSION(5, 4, 251) && \ + LINUX_VERSION_CODE < KERNEL_VERSION(5, 5, 0))) && \ !(defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 6)) && \ !(defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) && \ !(defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL == 3) && \ @@ -989,7 +993,8 @@ static inline void eth_hw_addr_set(struct net_device *dev, const u8 *addr) #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 17, 0) || \ (defined(RHEL_RELEASE_CODE) && \ RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 7) && \ - RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(9, 0)) + RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(9, 0)) || \ + (defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 5)) #define ENA_ETHTOOL_RX_BUFF_SIZE_CHANGE #endif @@ -1062,6 +1067,10 @@ static inline bool ktime_after(const ktime_t cmp1, const ktime_t cmp2) #define 
ENA_PHC_SUPPORT_GETTIME64_EXTENDED #endif /* ENA_PHC_SUPPORT_GETTIME64_EXTENDED */ +#if ((LINUX_VERSION_CODE >= KERNEL_VERSION(3, 0, 0)) && (LINUX_VERSION_CODE < KERNEL_VERSION(6, 2, 0))) +#define ENA_PHC_SUPPORT_ADJFREQ +#endif /* ENA_PHC_SUPPORT_ADJFREQ */ + #if ((LINUX_VERSION_CODE < KERNEL_VERSION(3, 7, 0)) && \ !(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6, 4)))) #define ptp_clock_register(info, parent) ptp_clock_register(info) @@ -1093,7 +1102,10 @@ static inline void ena_netif_napi_add(struct net_device *dev, int (*poll)(struct napi_struct *, int)) { #if (LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0)) && \ - !(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 2))) + !(RHEL_RELEASE_CODE && \ + ((RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 8)) && \ + (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(9, 0))) || \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 2))) #ifndef NAPI_POLL_WEIGHT #define NAPI_POLL_WEIGHT 64 #endif @@ -1103,6 +1115,11 @@ static inline void ena_netif_napi_add(struct net_device *dev, #endif /* LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0) */ } +#if defined(ENA_DEVLINK_SUPPORT) && LINUX_VERSION_CODE < KERNEL_VERSION(6, 3, 0) +#define devl_param_driverinit_value_get devlink_param_driverinit_value_get +#define devl_param_driverinit_value_set devlink_param_driverinit_value_set +#endif + #if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7, 4))) || \ (defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE < UBUNTU_VERSION(4, 5, 0, 0)) || \ (defined(IS_UEK) && !ENA_KERNEL_VERSION_GTE(4, 1, 12, 105, 0, 0)) From 493ef6724ec2fd2324e21ecaeb50b7e69e287255 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 15 Nov 2022 09:46:33 +0000 Subject: [PATCH 111/175] KVM: arm64: Discard any SVE state when entering KVM guests commit 93ae6b01bafee8fa385aa25ee7ebdb40057f6abe upstream. Since 8383741ab2e773a99 (KVM: arm64: Get rid of host SVE tracking/saving) KVM has not tracked the host SVE state, relying on the fact that we currently disable SVE whenever we perform a syscall. This may not be true in future since performance optimisation may result in us keeping SVE enabled in order to avoid needing to take access traps to reenable it. Handle this by clearing TIF_SVE and converting the stored task state to FPSIMD format when preparing to run the guest. This is done with a new call fpsimd_kvm_prepare() to keep the direct state manipulation functions internal to fpsimd.c. 
Signed-off-by: Mark Brown Reviewed-by: Catalin Marinas Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20221115094640.112848-2-broonie@kernel.org Signed-off-by: Will Deacon [ bp: clean ] Signed-off-by: Suraj Jitindar Singh --- arch/arm64/include/asm/fpsimd.h | 1 + arch/arm64/kernel/fpsimd.c | 23 +++++++++++++++++++++++ arch/arm64/kvm/fpsimd.c | 3 ++- 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index 930b0e6c94622..3544dfcc67a1e 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -56,6 +56,7 @@ extern void fpsimd_signal_preserve_current_state(void); extern void fpsimd_preserve_current_state(void); extern void fpsimd_restore_current_state(void); extern void fpsimd_update_current_state(struct user_fpsimd_state const *state); +extern void fpsimd_kvm_prepare(void); extern void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *state, void *sve_state, unsigned int sve_vl, diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 59b5a16bab5d6..cc368d9ba38ab 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -1642,6 +1642,29 @@ void fpsimd_signal_preserve_current_state(void) sve_to_fpsimd(current); } +/* + * Called by KVM when entering the guest. + */ +void fpsimd_kvm_prepare(void) +{ + if (!system_supports_sve()) + return; + + /* + * KVM does not save host SVE state since we can only enter + * the guest from a syscall so the ABI means that only the + * non-saved SVE state needs to be saved. If we have left + * SVE enabled for performance reasons then update the task + * state to be FPSIMD only. + */ + get_cpu_fpsimd_context(); + + if (test_and_clear_thread_flag(TIF_SVE)) + sve_to_fpsimd(current); + + put_cpu_fpsimd_context(); +} + /* * Associate current's FPSIMD context with this cpu * The caller must have ownership of the cpu FPSIMD context before calling diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index ec8e4494873d4..51ca78b31b952 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -75,11 +75,12 @@ int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) { BUG_ON(!current->mm); - BUG_ON(test_thread_flag(TIF_SVE)); if (!system_supports_fpsimd()) return; + fpsimd_kvm_prepare(); + vcpu->arch.fp_state = FP_STATE_HOST_OWNED; vcpu_clear_flag(vcpu, HOST_SVE_ENABLED); From 903c25db842fcdb827a036daa62c443beba687f3 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 15 Nov 2022 09:46:34 +0000 Subject: [PATCH 112/175] arm64/fpsimd: Track the saved FPSIMD state type separately to TIF_SVE commit baa8515281b30861cff3da7db70662d2a25c6440 upstream. When we save the state for the floating point registers this can be done in the form visible through either the FPSIMD V registers or the SVE Z and P registers. At present we track which format is currently used based on TIF_SVE and the SME streaming mode state but particularly in the SVE case this limits our options for optimising things, especially around syscalls. Introduce a new enum which we place together with saved floating point state in both thread_struct and the KVM guest state which explicitly states which format is active and keep it up to date when we change it. At present we do not use this state except to verify that it has the expected value when loading the state, future patches will introduce functional changes. 
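As a rough illustration of the intended use of the new enum (later patches in this series wire this up for real, e.g. the sve_sync_to_fpsimd() change further below):

	/* consumers can now key off the recorded format instead of TIF_SVE */
	switch (task->thread.fp_type) {
	case FP_STATE_FPSIMD:
		/* only task->thread.uw.fpsimd_state is valid */
		break;
	case FP_STATE_SVE:
		/* task->thread.sve_state holds the authoritative copy */
		break;
	}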
Signed-off-by: Mark Brown Reviewed-by: Catalin Marinas Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20221115094640.112848-3-broonie@kernel.org Signed-off-by: Will Deacon [ bp: Adjust context due to "arm64/ptrace: Don't enable SVE when setting streaming SVE" being backported to this stable tree ] Signed-off-by: Suraj Jitindar Singh --- arch/arm64/include/asm/fpsimd.h | 2 +- arch/arm64/include/asm/kvm_host.h | 12 ++++++- arch/arm64/include/asm/processor.h | 6 ++++ arch/arm64/kernel/fpsimd.c | 58 ++++++++++++++++++++++-------- arch/arm64/kernel/process.c | 2 ++ arch/arm64/kernel/ptrace.c | 3 ++ arch/arm64/kernel/signal.c | 7 +++- arch/arm64/kvm/fpsimd.c | 3 +- 8 files changed, 74 insertions(+), 19 deletions(-) diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index 3544dfcc67a1e..e10894100c739 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -61,7 +61,7 @@ extern void fpsimd_kvm_prepare(void); extern void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *state, void *sve_state, unsigned int sve_vl, void *za_state, unsigned int sme_vl, - u64 *svcr); + u64 *svcr, enum fp_type *type); extern void fpsimd_flush_task_state(struct task_struct *target); extern void fpsimd_save_and_flush_cpu_state(void); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 577cf444c1135..0e9b093adc672 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -309,8 +309,18 @@ struct vcpu_reset_state { struct kvm_vcpu_arch { struct kvm_cpu_context ctxt; - /* Guest floating point state */ + /* + * Guest floating point state + * + * The architecture has two main floating point extensions, + * the original FPSIMD and SVE. These have overlapping + * register views, with the FPSIMD V registers occupying the + * low 128 bits of the SVE Z registers. When the core + * floating point code saves the register state of a task it + * records which view it saved in fp_type. + */ void *sve_state; + enum fp_type fp_type; unsigned int sve_max_vl; u64 svcr; diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 400f8956328b9..208434a2e9247 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -122,6 +122,11 @@ enum vec_type { ARM64_VEC_MAX, }; +enum fp_type { + FP_STATE_FPSIMD, + FP_STATE_SVE, +}; + struct cpu_context { unsigned long x19; unsigned long x20; @@ -152,6 +157,7 @@ struct thread_struct { struct user_fpsimd_state fpsimd_state; } uw; + enum fp_type fp_type; /* registers FPSIMD or SVE? */ unsigned int fpsimd_cpu; void *sve_state; /* SVE registers, if any */ void *za_state; /* ZA register, if any */ diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index cc368d9ba38ab..39e25992f11c0 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -125,6 +125,7 @@ struct fpsimd_last_state_struct { u64 *svcr; unsigned int sve_vl; unsigned int sme_vl; + enum fp_type *fp_type; }; static DEFINE_PER_CPU(struct fpsimd_last_state_struct, fpsimd_last_state); @@ -330,15 +331,6 @@ void task_set_vl_onexec(struct task_struct *task, enum vec_type type, * The task can execute SVE instructions while in userspace without * trapping to the kernel. 
* - * When stored, Z0-Z31 (incorporating Vn in bits[127:0] or the - * corresponding Zn), P0-P15 and FFR are encoded in - * task->thread.sve_state, formatted appropriately for vector - * length task->thread.sve_vl or, if SVCR.SM is set, - * task->thread.sme_vl. - * - * task->thread.sve_state must point to a valid buffer at least - * sve_state_size(task) bytes in size. - * * During any syscall, the kernel may optionally clear TIF_SVE and * discard the vector state except for the FPSIMD subset. * @@ -348,7 +340,15 @@ void task_set_vl_onexec(struct task_struct *task, enum vec_type type, * do_sve_acc() to be called, which does some preparation and then * sets TIF_SVE. * - * When stored, FPSIMD registers V0-V31 are encoded in + * During any syscall, the kernel may optionally clear TIF_SVE and + * discard the vector state except for the FPSIMD subset. + * + * The data will be stored in one of two formats: + * + * * FPSIMD only - FP_STATE_FPSIMD: + * + * When only the FPSIMD state is stored, task->thread.fp_type is set to + * FP_STATE_FPSIMD and the FPSIMD registers V0-V31 are encoded in * task->thread.uw.fpsimd_state; bits [max : 128] for each of Z0-Z31 are * logically zero but not stored anywhere; P0-P15 and FFR are not * stored and have unspecified values from userspace's point of @@ -358,6 +358,19 @@ void task_set_vl_onexec(struct task_struct *task, enum vec_type type, * task->thread.sve_state does not need to be non-NULL, valid or any * particular size: it must not be dereferenced. * + * * SVE state - FP_STATE_SVE: + * + * When the full SVE state is stored, task->thread.fp_type is set to + * FP_STATE_SVE and Z0-Z31 (incorporating Vn in bits[127:0] or the + * corresponding Zn), P0-P15 and FFR are encoded in + * task->thread.sve_state, formatted appropriately for vector + * length task->thread.sve_vl or, if SVCR.SM is set, + * task->thread.sme_vl. The storage for the vector registers in + * task->thread.uw.fpsimd_state should be ignored. + * + * task->thread.sve_state must point to a valid buffer at least + * sve_state_size(task) bytes in size. + * * * FPSR and FPCR are always stored in task->thread.uw.fpsimd_state * irrespective of whether TIF_SVE is clear or set, since these are * not vector length dependent.
@@ -404,12 +417,15 @@ static void task_fpsimd_load(void) } } - if (restore_sve_regs) + if (restore_sve_regs) { + WARN_ON_ONCE(current->thread.fp_type != FP_STATE_SVE); sve_load_state(sve_pffr(&current->thread), &current->thread.uw.fpsimd_state.fpsr, restore_ffr); - else + } else { + WARN_ON_ONCE(current->thread.fp_type != FP_STATE_FPSIMD); fpsimd_load_state(&current->thread.uw.fpsimd_state); + } } /* @@ -474,8 +490,10 @@ static void fpsimd_save(void) sve_save_state((char *)last->sve_state + sve_ffr_offset(vl), &last->st->fpsr, save_ffr); + *last->fp_type = FP_STATE_SVE; } else { fpsimd_save_state(last->st); + *last->fp_type = FP_STATE_FPSIMD; } } @@ -851,8 +869,10 @@ int vec_set_vector_length(struct task_struct *task, enum vec_type type, fpsimd_flush_task_state(task); if (test_and_clear_tsk_thread_flag(task, TIF_SVE) || - thread_sm_enabled(&task->thread)) + thread_sm_enabled(&task->thread)) { sve_to_fpsimd(task); + task->thread.fp_type = FP_STATE_FPSIMD; + } if (system_supports_sme()) { if (type == ARM64_VEC_SME || @@ -1383,6 +1403,7 @@ static void sve_init_regs(void) fpsimd_bind_task_to_cpu(); } else { fpsimd_to_sve(current); + current->thread.fp_type = FP_STATE_SVE; } } @@ -1611,6 +1632,8 @@ void fpsimd_flush_thread(void) current->thread.svcr = 0; } + current->thread.fp_type = FP_STATE_FPSIMD; + put_cpu_fpsimd_context(); kfree(sve_state); kfree(za_state); @@ -1659,8 +1682,10 @@ void fpsimd_kvm_prepare(void) */ get_cpu_fpsimd_context(); - if (test_and_clear_thread_flag(TIF_SVE)) + if (test_and_clear_thread_flag(TIF_SVE)) { sve_to_fpsimd(current); + current->thread.fp_type = FP_STATE_FPSIMD; + } put_cpu_fpsimd_context(); } @@ -1682,6 +1707,7 @@ static void fpsimd_bind_task_to_cpu(void) last->sve_vl = task_get_sve_vl(current); last->sme_vl = task_get_sme_vl(current); last->svcr = &current->thread.svcr; + last->fp_type = &current->thread.fp_type; current->thread.fpsimd_cpu = smp_processor_id(); /* @@ -1705,7 +1731,8 @@ static void fpsimd_bind_task_to_cpu(void) void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *st, void *sve_state, unsigned int sve_vl, void *za_state, - unsigned int sme_vl, u64 *svcr) + unsigned int sme_vl, u64 *svcr, + enum fp_type *type) { struct fpsimd_last_state_struct *last = this_cpu_ptr(&fpsimd_last_state); @@ -1719,6 +1746,7 @@ void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *st, void *sve_state, last->za_state = za_state; last->sve_vl = sve_vl; last->sme_vl = sme_vl; + last->fp_type = type; } /* diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 044a7d7f1f6ad..19cd05eea3f0e 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -331,6 +331,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) clear_tsk_thread_flag(dst, TIF_SME); } + dst->thread.fp_type = FP_STATE_FPSIMD; + /* clear any pending asynchronous tag fault raised by the parent */ clear_tsk_thread_flag(dst, TIF_MTE_ASYNC_FAULT); diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index d02dd2be17b3b..595625cb7c624 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -917,6 +917,7 @@ static int sve_set_common(struct task_struct *target, clear_tsk_thread_flag(target, TIF_SVE); if (type == ARM64_VEC_SME) fpsimd_force_sync_to_sve(target); + target->thread.fp_type = FP_STATE_FPSIMD; goto out; } @@ -939,6 +940,7 @@ static int sve_set_common(struct task_struct *target, if (!target->thread.sve_state) { ret = -ENOMEM; clear_tsk_thread_flag(target, TIF_SVE); + target->thread.fp_type = FP_STATE_FPSIMD; goto out; } @@ -952,6
+954,7 @@ static int sve_set_common(struct task_struct *target, fpsimd_sync_to_sve(target); if (type == ARM64_VEC_SVE) set_tsk_thread_flag(target, TIF_SVE); + target->thread.fp_type = FP_STATE_SVE; BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header)); start = SVE_PT_SVE_OFFSET; diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 58b7d599ff19b..86f2c5b66bc68 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -208,6 +208,7 @@ static int restore_fpsimd_context(struct fpsimd_context __user *ctx) __get_user_error(fpsimd.fpcr, &ctx->fpcr, err); clear_thread_flag(TIF_SVE); + current->thread.fp_type = FP_STATE_FPSIMD; /* load the hardware registers from the fpsimd_state structure */ if (!err) @@ -298,6 +299,7 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) if (sve.head.size <= sizeof(*user->sve)) { clear_thread_flag(TIF_SVE); current->thread.svcr &= ~SVCR_SM_MASK; + current->thread.fp_type = FP_STATE_FPSIMD; goto fpsimd_only; } @@ -333,6 +335,7 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) current->thread.svcr |= SVCR_SM_MASK; else set_thread_flag(TIF_SVE); + current->thread.fp_type = FP_STATE_SVE; fpsimd_only: /* copy the FP and status/control registers */ @@ -938,9 +941,11 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, * FPSIMD register state - flush the saved FPSIMD * register state in case it gets loaded. */ - if (current->thread.svcr & SVCR_SM_MASK) + if (current->thread.svcr & SVCR_SM_MASK) { memset(&current->thread.uw.fpsimd_state, 0, sizeof(current->thread.uw.fpsimd_state)); + current->thread.fp_type = FP_STATE_FPSIMD; + } current->thread.svcr &= ~(SVCR_ZA_MASK | SVCR_SM_MASK); diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 51ca78b31b952..a4b4502ad850a 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -140,7 +140,8 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu) fpsimd_bind_state_to_cpu(&vcpu->arch.ctxt.fp_regs, vcpu->arch.sve_state, vcpu->arch.sve_max_vl, - NULL, 0, &vcpu->arch.svcr); + NULL, 0, &vcpu->arch.svcr, + &vcpu->arch.fp_type); clear_thread_flag(TIF_FOREIGN_FPSTATE); update_thread_flag(TIF_SVE, vcpu_has_sve(vcpu)); From 604a849f0081c8850f08b3e4cadd8c5e9c1f96ca Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 15 Nov 2022 09:46:35 +0000 Subject: [PATCH 113/175] arm64/fpsimd: Have KVM explicitly say which FP registers to save commit deeb8f9a80fdae5a62525656d65c7070c28bd3a4 upstream. In order to avoid needlessly saving and restoring the guest registers KVM relies on the host FPSIMD code to save the guest registers when we context switch away from the guest. This is done by binding the KVM guest state to the CPU on top of the task state that was originally there, then carefully managing the TIF_SVE flag for the task to cause the host to save the full SVE state when needed regardless of the needs of the host task. This works well enough but isn't terribly direct about what is going on and makes it much more complicated to try to optimise what we're doing with the SVE register state. Let's instead have KVM pass in the register state it wants saving when it binds to the CPU. We introduce a new FP_STATE_CURRENT for use during normal task binding to indicate that we should base our decisions on the current task. This should not be used when actually saving. Ideally we might want to use a separate enum for the type to save but this enum and the enum values would then need to be named which has problems with clarity and ambiguity.
In order to ease any future debugging that might be required this patch does not actually update any of the decision making about what to save, it merely starts tracking the new information and warns if the requested state is not what we would otherwise have decided to save. Signed-off-by: Mark Brown Reviewed-by: Catalin Marinas Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20221115094640.112848-4-broonie@kernel.org Signed-off-by: Will Deacon [ bp: clean ] Signed-off-by: Suraj Jitindar Singh --- arch/arm64/include/asm/fpsimd.h | 3 ++- arch/arm64/include/asm/processor.h | 1 + arch/arm64/kernel/fpsimd.c | 27 ++++++++++++++++++++++++--- arch/arm64/kvm/fpsimd.c | 9 ++++++++- 4 files changed, 35 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index e10894100c739..7622782d0bb97 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -61,7 +61,8 @@ extern void fpsimd_kvm_prepare(void); extern void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *state, void *sve_state, unsigned int sve_vl, void *za_state, unsigned int sme_vl, - u64 *svcr, enum fp_type *type); + u64 *svcr, enum fp_type *type, + enum fp_type to_save); extern void fpsimd_flush_task_state(struct task_struct *target); extern void fpsimd_save_and_flush_cpu_state(void); diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 208434a2e9247..1b822e618bb4b 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -123,6 +123,7 @@ enum vec_type { }; enum fp_type { + FP_STATE_CURRENT, /* Save based on current task state. */ FP_STATE_FPSIMD, FP_STATE_SVE, }; diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 39e25992f11c0..f10fc366da76f 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -126,6 +126,7 @@ struct fpsimd_last_state_struct { unsigned int sve_vl; unsigned int sme_vl; enum fp_type *fp_type; + enum fp_type to_save; }; static DEFINE_PER_CPU(struct fpsimd_last_state_struct, fpsimd_last_state); @@ -356,7 +357,8 @@ void task_set_vl_onexec(struct task_struct *task, enum vec_type type, * but userspace is discouraged from relying on this. * * task->thread.sve_state does not need to be non-NULL, valid or any - * particular size: it must not be dereferenced. + * particular size: it must not be dereferenced and any data stored + * there should be considered stale and not referenced. * * * SVE state - FP_STATE_SVE: * @@ -369,7 +371,9 @@ void task_set_vl_onexec(struct task_struct *task, enum vec_type type, * task->thread.uw.fpsimd_state should be ignored. * * task->thread.sve_state must point to a valid buffer at least - * sve_state_size(task) bytes in size. + * sve_state_size(task) bytes in size. The data stored in + * task->thread.uw.fpsimd_state.vregs should be considered stale + * and not referenced. * * * FPSR and FPCR are always stored in task->thread.uw.fpsimd_state * irrespective of whether TIF_SVE is clear or set, since these are @@ -459,6 +463,21 @@ static void fpsimd_save(void) vl = last->sve_vl; } + /* + * Validate that an explicitly specified state to save is + * consistent with the task state. 
+ */ + switch (last->to_save) { + case FP_STATE_CURRENT: + break; + case FP_STATE_FPSIMD: + WARN_ON_ONCE(save_sve_regs); + break; + case FP_STATE_SVE: + WARN_ON_ONCE(!save_sve_regs); + break; + } + if (system_supports_sme()) { u64 *svcr = last->svcr; @@ -1708,6 +1727,7 @@ static void fpsimd_bind_task_to_cpu(void) last->sme_vl = task_get_sme_vl(current); last->svcr = &current->thread.svcr; last->fp_type = &current->thread.fp_type; + last->to_save = FP_STATE_CURRENT; current->thread.fpsimd_cpu = smp_processor_id(); /* @@ -1732,7 +1752,7 @@ static void fpsimd_bind_task_to_cpu(void) void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *st, void *sve_state, unsigned int sve_vl, void *za_state, unsigned int sme_vl, u64 *svcr, - enum fp_type *type) + enum fp_type *type, enum fp_type to_save) { struct fpsimd_last_state_struct *last = this_cpu_ptr(&fpsimd_last_state); @@ -1747,6 +1767,7 @@ void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *st, void *sve_state, last->sve_vl = sve_vl; last->sme_vl = sme_vl; last->fp_type = type; + last->to_save = to_save; } /* diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index a4b4502ad850a..89c02ce797b87 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -130,9 +130,16 @@ void kvm_arch_vcpu_ctxflush_fp(struct kvm_vcpu *vcpu) */ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu) { + enum fp_type fp_type; + WARN_ON_ONCE(!irqs_disabled()); if (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED) { + if (vcpu_has_sve(vcpu)) + fp_type = FP_STATE_SVE; + else + fp_type = FP_STATE_FPSIMD; + /* * Currently we do not support SME guests so SVCR is * always 0 and we just need a variable to point to. @@ -141,7 +148,7 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu) vcpu->arch.sve_state, vcpu->arch.sve_max_vl, NULL, 0, &vcpu->arch.svcr, - &vcpu->arch.fp_type); + &vcpu->arch.fp_type, fp_type); clear_thread_flag(TIF_FOREIGN_FPSTATE); update_thread_flag(TIF_SVE, vcpu_has_sve(vcpu)); From 898db9e5555577ff3be62848e6744bfce516df1a Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 15 Nov 2022 09:46:36 +0000 Subject: [PATCH 114/175] arm64/fpsimd: Stop using TIF_SVE to manage register saving in KVM commit 62021cc36add7b2c015b837f7893f2fb4b8c2586 upstream. Now that we are explicitly telling the host FP code which register state it needs to save we can remove the manipulation of TIF_SVE from the KVM code, simplifying it and allowing us to optimise our handling of normal tasks. Remove the manipulation of TIF_SVE from KVM and instead rely on to_save to ensure we save the correct data for it. There should be no functional or performance impact from this change. Signed-off-by: Mark Brown Reviewed-by: Catalin Marinas Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20221115094640.112848-5-broonie@kernel.org Signed-off-by: Will Deacon [ bp: clean ] Signed-off-by: Suraj Jitindar Singh --- arch/arm64/kernel/fpsimd.c | 22 ++++------------------ arch/arm64/kvm/fpsimd.c | 3 --- 2 files changed, 4 insertions(+), 21 deletions(-) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index f10fc366da76f..a9945ca38d23b 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -439,8 +439,8 @@ static void task_fpsimd_load(void) * last, if KVM is involved this may be the guest VM context rather * than the host thread for the VM pointed to by current. This means * that we must always reference the state storage via last rather - * than via current, other than the TIF_ flags which KVM will - * carefully maintain for us.
+ * than via current, if we are saving KVM state then it will have + * ensured that the type of registers to save is set in last->to_save. */ static void fpsimd_save(void) { @@ -457,27 +457,13 @@ static void fpsimd_save(void) if (test_thread_flag(TIF_FOREIGN_FPSTATE)) return; - if (test_thread_flag(TIF_SVE)) { + if ((last->to_save == FP_STATE_CURRENT && test_thread_flag(TIF_SVE)) || + last->to_save == FP_STATE_SVE) { save_sve_regs = true; save_ffr = true; vl = last->sve_vl; } - /* - * Validate that an explicitly specified state to save is - * consistent with the task state. - */ - switch (last->to_save) { - case FP_STATE_CURRENT: - break; - case FP_STATE_FPSIMD: - WARN_ON_ONCE(save_sve_regs); - break; - case FP_STATE_SVE: - WARN_ON_ONCE(!save_sve_regs); - break; - } - if (system_supports_sme()) { u64 *svcr = last->svcr; diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 89c02ce797b87..ec82d0191f767 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -151,7 +151,6 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu) &vcpu->arch.fp_type, fp_type); clear_thread_flag(TIF_FOREIGN_FPSTATE); - update_thread_flag(TIF_SVE, vcpu_has_sve(vcpu)); } } @@ -208,7 +207,5 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) sysreg_clear_set(CPACR_EL1, CPACR_EL1_ZEN_EL0EN, 0); } - update_thread_flag(TIF_SVE, 0); - local_irq_restore(flags); } From 317b87e04ebb17ec6ef39550aaae61e021745882 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 15 Nov 2022 09:46:37 +0000 Subject: [PATCH 115/175] arm64/fpsimd: Load FP state based on recorded data type commit a0136be443d51803da4900b52223302f3913812f upstream. Now that we are recording the type of floating point register state we are saving when we write the register state out to memory we can use that information when we load from memory to decide which format to load, bringing TIF_SVE into line with what we saved rather than relying on TIF_SVE to determine what to load. The SME state details are already recorded directly in the saved SVCR and handled based on the information there. Since we are not changing any of the save paths there should be no functional change from this patch, further patches will make use of this to optimise and clarify the code. Signed-off-by: Mark Brown Reviewed-by: Catalin Marinas Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20221115094640.112848-6-broonie@kernel.org Signed-off-by: Will Deacon [ bp: clean ] Signed-off-by: Suraj Jitindar Singh --- arch/arm64/kernel/fpsimd.c | 40 ++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index a9945ca38d23b..9cbda0a3bc8b0 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -395,11 +395,37 @@ static void task_fpsimd_load(void) WARN_ON(!system_supports_fpsimd()); WARN_ON(!have_cpu_fpsimd_context()); - /* Check if we should restore SVE first */ - if (IS_ENABLED(CONFIG_ARM64_SVE) && test_thread_flag(TIF_SVE)) { - sve_set_vq(sve_vq_from_vl(task_get_sve_vl(current)) - 1); - restore_sve_regs = true; - restore_ffr = true; + if (system_supports_sve()) { + switch (current->thread.fp_type) { + case FP_STATE_FPSIMD: + /* Stop tracking SVE for this task until next use. 
*/ + if (test_and_clear_thread_flag(TIF_SVE)) + sve_user_disable(); + break; + case FP_STATE_SVE: + if (!thread_sm_enabled(&current->thread) && + !WARN_ON_ONCE(!test_and_set_thread_flag(TIF_SVE))) + sve_user_enable(); + + if (test_thread_flag(TIF_SVE)) + sve_set_vq(sve_vq_from_vl(task_get_sve_vl(current)) - 1); + + restore_sve_regs = true; + restore_ffr = true; + break; + default: + /* + * This indicates either a bug in + * fpsimd_save() or memory corruption, we + * should always record an explicit format + * when we save. We always at least have the + * memory allocated for FPSIMD registers so + * try that and hope for the best. + */ + WARN_ON_ONCE(1); + clear_thread_flag(TIF_SVE); + break; + } } /* Restore SME, override SVE register configuration if needed */ @@ -415,10 +441,8 @@ static void task_fpsimd_load(void) if (thread_za_enabled(&current->thread)) za_load_state(current->thread.za_state); - if (thread_sm_enabled(&current->thread)) { - restore_sve_regs = true; + if (thread_sm_enabled(&current->thread)) restore_ffr = system_supports_fa64(); - } } if (restore_sve_regs) { From 0cb2d6e9d59568b6a668ea1bde21ca0609154747 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 15 Nov 2022 09:46:38 +0000 Subject: [PATCH 116/175] arm64/fpsimd: SME no longer requires SVE register state commit bbc6172eefdb276be140415fd6ac6cc8a14a5263 upstream. Now that we track the type of the stored register state separately to what is active in the task, it is valid to have the FPSIMD register state stored while in streaming mode. Remove the special case handling for SME when setting FPSIMD register state. Signed-off-by: Mark Brown Reviewed-by: Catalin Marinas Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20221115094640.112848-7-broonie@kernel.org Signed-off-by: Will Deacon [ bp: clean ] Signed-off-by: Suraj Jitindar Singh --- arch/arm64/kernel/fpsimd.c | 3 +-- arch/arm64/kernel/ptrace.c | 2 -- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 9cbda0a3bc8b0..cc61c199afc42 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -815,8 +815,7 @@ void fpsimd_sync_to_sve(struct task_struct *task) */ void sve_sync_to_fpsimd(struct task_struct *task) { - if (test_tsk_thread_flag(task, TIF_SVE) || - thread_sm_enabled(&task->thread)) + if (task->thread.fp_type == FP_STATE_SVE) sve_to_fpsimd(task); } diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 595625cb7c624..18ba01eb2a0e3 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -915,8 +915,6 @@ static int sve_set_common(struct task_struct *target, ret = __fpr_set(target, regset, pos, count, kbuf, ubuf, SVE_PT_FPSIMD_OFFSET); clear_tsk_thread_flag(target, TIF_SVE); - if (type == ARM64_VEC_SME) - fpsimd_force_sync_to_sve(target); target->thread.fp_type = FP_STATE_FPSIMD; goto out; } From 241876751db9ecec99e72bc49313d3dc069f7219 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 15 Nov 2022 09:46:39 +0000 Subject: [PATCH 117/175] arm64/sve: Leave SVE enabled on syscall if we don't context switch commit 8c845e2731041f0fdf9287dea80b039b26332c9f upstream. The syscall ABI says that the SVE register state not shared with FPSIMD may not be preserved on syscall, and this is the only mechanism we have in the ABI to stop tracking the extra SVE state for a process. Currently we do this unconditionally by means of disabling SVE for the process on syscall, causing userspace to take a trap to EL1 if it uses SVE again.
These extra traps result in a noticeable overhead for using SVE instead of FPSIMD in some workloads, especially for simple syscalls where we can return directly to userspace and would not otherwise need to update the floating point registers. Tests with fp-pidbench show an approximately 70% overhead on a range of implementations when SVE is in use - while this is an extreme and entirely artificial benchmark it is clear that there is some useful room for improvement here. Now that we have the ability to track the decision about what to save separately to TIF_SVE we can improve things by leaving TIF_SVE enabled on syscall but only saving the FPSIMD registers if we are in a syscall. This means that if we need to restore the register state from memory (eg, after a context switch or kernel mode NEON) we will drop TIF_SVE and reenable traps for userspace but if we can just return to userspace then traps will remain disabled. Since our current implementation and hence ABI has the effect of zeroing all the SVE register state not shared with FPSIMD on syscall we replace the disabling of TIF_SVE with a flush of the non-shared register state, this means that there is still some overhead for syscalls when SVE is in use but it is very much reduced. Signed-off-by: Mark Brown Reviewed-by: Catalin Marinas Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20221115094640.112848-8-broonie@kernel.org Signed-off-by: Will Deacon [ bp: clean ] Signed-off-by: Suraj Jitindar Singh --- arch/arm64/kernel/fpsimd.c | 8 +++++++- arch/arm64/kernel/syscall.c | 19 +++++-------------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index cc61c199afc42..ab0ea49620c52 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -481,7 +481,13 @@ static void fpsimd_save(void) if (test_thread_flag(TIF_FOREIGN_FPSTATE)) return; - if ((last->to_save == FP_STATE_CURRENT && test_thread_flag(TIF_SVE)) || + /* + * If a task is in a syscall the ABI allows us to only + * preserve the state shared with FPSIMD so don't bother + * saving the full SVE state in that case. + */ + if ((last->to_save == FP_STATE_CURRENT && test_thread_flag(TIF_SVE) && + !in_syscall(current_pt_regs())) || last->to_save == FP_STATE_SVE) { save_sve_regs = true; save_ffr = true; diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c index c771e94568b9b..9fa3f3472acf4 100644 --- a/arch/arm64/kernel/syscall.c +++ b/arch/arm64/kernel/syscall.c @@ -181,21 +181,12 @@ static inline void fp_user_discard(void) if (!system_supports_sve()) return; - /* - * If SME is not active then disable SVE, the registers will - * be cleared when userspace next attempts to access them and - * we do not need to track the SVE register state until then. - */ - clear_thread_flag(TIF_SVE); + if (test_thread_flag(TIF_SVE)) { + unsigned int sve_vq_minus_one; - /* - * task_fpsimd_load() won't be called to update CPACR_EL1 in - * ret_to_user unless TIF_FOREIGN_FPSTATE is still set, which only - * happens if a context switch or kernel_neon_begin() or context - * modification (sigreturn, ptrace) intervenes. - * So, ensure that CPACR_EL1 is already correct for the fast-path case.
- */ - sve_user_disable(); + sve_vq_minus_one = sve_vq_from_vl(task_get_sve_vl(current)) - 1; + sve_flush_live(true, sve_vq_minus_one); + } } void do_el0_svc(struct pt_regs *regs) From ca35de20d21d480cb4832f84cb4ae76f507b142f Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 10 Nov 2022 19:02:40 +0000 Subject: [PATCH 118/175] KVM: arm64: Prevent the donation of no-map pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 43c1ff8b75011bc3e3e923adf31ba815864a2494 upstream. Memory regions marked as "no-map" in the host device-tree routinely include TrustZone carve-outs and DMA pools. Although donating such pages to the hypervisor may not breach confidentiality, it could be used to corrupt its state in uncontrollable ways. To prevent this, let's block host-initiated memory transitions targeting "no-map" pages altogether in nVHE protected mode as there should be no valid reason to do this in current operation. Thankfully, the pKVM EL2 hypervisor has a full copy of the host's list of memblock regions, so we can easily check for the presence of the MEMBLOCK_NOMAP flag on a region containing pages being donated from the host. Reviewed-by: Philippe Mathieu-Daudé Tested-by: Vincent Donnefort Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110190259.26861-8-will@kernel.org [ bp: clean ] Signed-off-by: Suraj Jitindar Singh --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 07f9dc9848ef1..0f6c053686c78 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -195,7 +195,7 @@ struct kvm_mem_range { u64 end; }; -static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range) +static struct memblock_region *find_mem_range(phys_addr_t addr, struct kvm_mem_range *range) { int cur, left = 0, right = hyp_memblock_nr; struct memblock_region *reg; @@ -218,18 +218,28 @@ static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range) } else { range->start = reg->base; range->end = end; - return true; + return reg; } } - return false; + return NULL; } bool addr_is_memory(phys_addr_t phys) { struct kvm_mem_range range; - return find_mem_range(phys, &range); + return !!find_mem_range(phys, &range); +} + +static bool addr_is_allowed_memory(phys_addr_t phys) +{ + struct memblock_region *reg; + struct kvm_mem_range range; + + reg = find_mem_range(phys, &range); + + return reg && !(reg->flags & MEMBLOCK_NOMAP); } static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range) @@ -348,7 +358,7 @@ static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot pr static int host_stage2_idmap(u64 addr) { struct kvm_mem_range range; - bool is_memory = find_mem_range(addr, &range); + bool is_memory = !!find_mem_range(addr, &range); enum kvm_pgtable_prot prot; int ret; @@ -425,7 +435,7 @@ static int __check_page_state_visitor(u64 addr, u64 end, u32 level, struct check_walk_data *d = arg; kvm_pte_t pte = *ptep; - if (kvm_pte_valid(pte) && !addr_is_memory(kvm_pte_to_phys(pte))) + if (kvm_pte_valid(pte) && !addr_is_allowed_memory(kvm_pte_to_phys(pte))) return -EINVAL; return d->get_page_state(pte) == d->desired ?
0 : -EPERM; From aa54ec9308c8c01a5c7d0e2f6a6bee4713c44c6c Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 18 May 2023 10:58:44 +0100 Subject: [PATCH 119/175] KVM: arm64: Prevent unconditional donation of unmapped regions from the host commit 09cce60bddd6461a93a5bf434265a47827d1bc6f upstream. Since host stage-2 mappings are created lazily, we cannot rely solely on the pte in order to recover the target physical address when checking a host-initiated memory transition as this permits donation of unmapped regions corresponding to MMIO or "no-map" memory. Instead of inspecting the pte, move the addr_is_allowed_memory() check into the host callback function where it is passed the physical address directly from the walker. Cc: Quentin Perret Fixes: e82edcc75c4e ("KVM: arm64: Implement do_share() helper for sharing memory") Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230518095844.1178-1-will@kernel.org [ bp: s/ctx->addr/addr in __check_page_state_visitor due to missing commit "KVM: arm64: Combine visitor arguments into a context structure" in stable. ] Signed-off-by: Suraj Jitindar Singh --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 0f6c053686c78..0faa330a41edb 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -424,7 +424,7 @@ struct pkvm_mem_share { struct check_walk_data { enum pkvm_page_state desired; - enum pkvm_page_state (*get_page_state)(kvm_pte_t pte); + enum pkvm_page_state (*get_page_state)(kvm_pte_t pte, u64 addr); }; static int __check_page_state_visitor(u64 addr, u64 end, u32 level, @@ -435,10 +435,7 @@ static int __check_page_state_visitor(u64 addr, u64 end, u32 level, struct check_walk_data *d = arg; kvm_pte_t pte = *ptep; - if (kvm_pte_valid(pte) && !addr_is_allowed_memory(kvm_pte_to_phys(pte))) - return -EINVAL; - - return d->get_page_state(pte) == d->desired ? 0 : -EPERM; + return d->get_page_state(pte, addr) == d->desired ? 0 : -EPERM; } static int check_page_state_range(struct kvm_pgtable *pgt, u64 addr, u64 size, @@ -453,8 +450,11 @@ static int check_page_state_range(struct kvm_pgtable *pgt, u64 addr, u64 size, return kvm_pgtable_walk(pgt, addr, size, &walker); } -static enum pkvm_page_state host_get_page_state(kvm_pte_t pte) +static enum pkvm_page_state host_get_page_state(kvm_pte_t pte, u64 addr) { + if (!addr_is_allowed_memory(addr)) + return PKVM_NOPAGE; + if (!kvm_pte_valid(pte) && pte) return PKVM_NOPAGE; @@ -521,7 +521,7 @@ static int host_initiate_unshare(u64 *completer_addr, return __host_set_page_state_range(addr, size, PKVM_PAGE_OWNED); } -static enum pkvm_page_state hyp_get_page_state(kvm_pte_t pte) +static enum pkvm_page_state hyp_get_page_state(kvm_pte_t pte, u64 addr) { if (!kvm_pte_valid(pte)) return PKVM_NOPAGE; From 8b8f00588996d79eb4045a1288071b111cdce19d Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Wed, 27 Sep 2023 14:25:40 +0000 Subject: [PATCH 120/175] cgroup: add cgroup_favordynmods= command-line option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We have a need of using favordynmods with cgroup v1, which doesn't support changing mount flags during remount. Enabling CONFIG_CGROUP_FAVOR_DYNMODS at build-time is not an option because we want to be able to selectively enable it for certain systems. 
This commit addresses this by introducing the cgroup_favordynmods= command-line option. This option works for both cgroup v1 and v2 and also allows for disabling favordynmods when the kernel is built with CONFIG_CGROUP_FAVOR_DYNMODS=y. Also, note that when cgroup_favordynmods=true, favordynmods is never disabled in cgroup_destroy_root(). Signed-off-by: Luiz Capitulino Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo (cherry picked from commit 9b81d3a5be05d350ac93d99762c7ee91fe29b4cb) Signed-off-by: Luiz Capitulino --- .../admin-guide/kernel-parameters.txt | 4 ++++ kernel/cgroup/cgroup.c | 18 ++++++++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index e6f0570cf4900..182362217d5fc 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -554,6 +554,10 @@ named mounts. Specifying both "all" and "named" disables all v1 hierarchies. + cgroup_favordynmods= [KNL] Enable or Disable favordynmods. + Format: { "true" | "false" } + Defaults to the value of CONFIG_CGROUP_FAVOR_DYNMODS. + cgroup.memory= [KNL] Pass options to the cgroup memory controller. Format: nosocket -- Disable socket memory accounting. diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 97ecca43386d9..2e74ef9750202 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -207,6 +207,8 @@ static u16 have_exit_callback __read_mostly; static u16 have_release_callback __read_mostly; static u16 have_canfork_callback __read_mostly; +static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS); + /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { .ns.count = REFCOUNT_INIT(2), @@ -1383,7 +1385,9 @@ static void cgroup_destroy_root(struct cgroup_root *root) cgroup_root_count--; } - cgroup_favor_dynmods(root, false); + if (!have_favordynmods) + cgroup_favor_dynmods(root, false); + cgroup_exit_root_id(root); cgroup_unlock(); @@ -2266,9 +2270,9 @@ static int cgroup_init_fs_context(struct fs_context *fc) fc->user_ns = get_user_ns(ctx->ns->user_ns); fc->global = true; -#ifdef CONFIG_CGROUP_FAVOR_DYNMODS - ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; -#endif + if (have_favordynmods) + ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; + return 0; } @@ -6762,6 +6766,12 @@ static int __init enable_cgroup_debug(char *str) } __setup("cgroup_debug", enable_cgroup_debug); +static int __init cgroup_favordynmods_setup(char *str) +{ + return (kstrtobool(str, &have_favordynmods) == 0); +} +__setup("cgroup_favordynmods=", cgroup_favordynmods_setup); + /** * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest
Furthermore, it's quite error prone as it's way too easy to miss some entries in these manually maintained test tables (as evidenced by the dynptr_fail tests, in which the ringbuf_release_uninit_dynptr subtest was accidentally missed; this is fixed in the next patch). So this patch implements a generic test_loader, which accepts a skeleton name and handles the rest of the details: it opens and loads the BPF object file, making sure each program is tested in isolation. Optionally, each test case can specify the expected BPF verifier log message. In case of failure, the tester makes sure to report the verifier log; in verbose mode it reports the verifier log unconditionally. Now, the interesting deviation from existing custom implementations is the use of the btf_decl_tag attribute to specify expected-to-fail vs expected-to-succeed markers and, optionally, the expected log message directly next to the BPF program source code, eliminating the need to manually create and update a table of tests. We define a few macros wrapping btf_decl_tag with the convention that all values of btf_decl_tag start with a "comment:" prefix, and then use a very simple "just_some_text_tag" or "some_key_name=" pattern to define things like expected success/failure, the expected verifier message, and an extra verifier log level (if necessary). This approach is demonstrated by the next patch, in which two existing sets of failure tests are converted. The tester supports both expected-to-fail and expected-to-succeed programs, though this patch set didn't convert any existing expected-to-succeed programs yet, as existing tests couple BPF program loading with their further execution through attach or test_prog_run. One way to allow testing scenarios like this would be the ability to specify a custom callback, executed for each successfully loaded BPF program. This is left for follow-up patches, after some more analysis of existing test cases. This test_loader is, hopefully, the start of a test_verifier-like runner, but integrated into the test_progs infrastructure. It will allow a much better "user experience" of defining low-level verification tests that can take advantage of all the libbpf-provided niceties on the BPF side: global variables, declarative maps, etc. All while having a choice of defining them in C or as BPF assembly (through __attribute__((naked)) functions and using embedded asm), depending on what makes most sense in each particular case. This will be explored in follow-up patches as well.
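For illustration only (this sketch is not part of the patch; the program body and the expected message are invented), a failure test case written against this convention carries its expectations directly in the BPF C source:

SEC("?raw_tp")
__failure __msg("invalid read from stack")
int some_hypothetical_subtest(void *ctx)
{
	/* Program body that the verifier is expected to reject. The
	 * program name doubles as the subtest name, and the "?" prefix
	 * in SEC() keeps the program out of autoloading so test_loader
	 * can load one program at a time.
	 */
	return 0;
}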
Acked-by: John Fastabend Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20221207201648.2990661-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 2 +- tools/testing/selftests/bpf/progs/bpf_misc.h | 5 + tools/testing/selftests/bpf/test_loader.c | 233 +++++++++++++++++++ tools/testing/selftests/bpf/test_progs.h | 33 +++ 4 files changed, 272 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/test_loader.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 3b57fbf8fff4a..31c425a4fff30 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -519,7 +519,7 @@ TRUNNER_BPF_PROGS_DIR := progs TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c \ network_helpers.c testing_helpers.c \ btf_helpers.c flow_dissector_load.h \ - cap_helpers.c + cap_helpers.c test_loader.c TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read $(OUTPUT)/bpf_testmod.ko \ $(OUTPUT)/liburandom_read.so \ $(OUTPUT)/xdp_synproxy \ diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h index 5bb11fe595a43..4a01ea9113bfd 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -2,6 +2,11 @@ #ifndef __BPF_MISC_H__ #define __BPF_MISC_H__ +#define __msg(msg) __attribute__((btf_decl_tag("comment:test_expect_msg=" msg))) +#define __failure __attribute__((btf_decl_tag("comment:test_expect_failure"))) +#define __success __attribute__((btf_decl_tag("comment:test_expect_success"))) +#define __log_level(lvl) __attribute__((btf_decl_tag("comment:test_log_level="#lvl))) + #if defined(__TARGET_ARCH_x86) #define SYSCALL_WRAPPER 1 #define SYS_PREFIX "__x64_" diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c new file mode 100644 index 0000000000000..679efb3aa785e --- /dev/null +++ b/tools/testing/selftests/bpf/test_loader.c @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include + +#define str_has_pfx(str, pfx) \ + (strncmp(str, pfx, __builtin_constant_p(pfx) ? 
sizeof(pfx) - 1 : strlen(pfx)) == 0) + +#define TEST_LOADER_LOG_BUF_SZ 1048576 + +#define TEST_TAG_EXPECT_FAILURE "comment:test_expect_failure" +#define TEST_TAG_EXPECT_SUCCESS "comment:test_expect_success" +#define TEST_TAG_EXPECT_MSG_PFX "comment:test_expect_msg=" +#define TEST_TAG_LOG_LEVEL_PFX "comment:test_log_level=" + +struct test_spec { + const char *name; + bool expect_failure; + const char *expect_msg; + int log_level; +}; + +static int tester_init(struct test_loader *tester) +{ + if (!tester->log_buf) { + tester->log_buf_sz = TEST_LOADER_LOG_BUF_SZ; + tester->log_buf = malloc(tester->log_buf_sz); + if (!ASSERT_OK_PTR(tester->log_buf, "tester_log_buf")) + return -ENOMEM; + } + + return 0; +} + +void test_loader_fini(struct test_loader *tester) +{ + if (!tester) + return; + + free(tester->log_buf); +} + +static int parse_test_spec(struct test_loader *tester, + struct bpf_object *obj, + struct bpf_program *prog, + struct test_spec *spec) +{ + struct btf *btf; + int func_id, i; + + memset(spec, 0, sizeof(*spec)); + + spec->name = bpf_program__name(prog); + + btf = bpf_object__btf(obj); + if (!btf) { + ASSERT_FAIL("BPF object has no BTF"); + return -EINVAL; + } + + func_id = btf__find_by_name_kind(btf, spec->name, BTF_KIND_FUNC); + if (func_id < 0) { + ASSERT_FAIL("failed to find FUNC BTF type for '%s'", spec->name); + return -EINVAL; + } + + for (i = 1; i < btf__type_cnt(btf); i++) { + const struct btf_type *t; + const char *s; + + t = btf__type_by_id(btf, i); + if (!btf_is_decl_tag(t)) + continue; + + if (t->type != func_id || btf_decl_tag(t)->component_idx != -1) + continue; + + s = btf__str_by_offset(btf, t->name_off); + if (strcmp(s, TEST_TAG_EXPECT_FAILURE) == 0) { + spec->expect_failure = true; + } else if (strcmp(s, TEST_TAG_EXPECT_SUCCESS) == 0) { + spec->expect_failure = false; + } else if (str_has_pfx(s, TEST_TAG_EXPECT_MSG_PFX)) { + spec->expect_msg = s + sizeof(TEST_TAG_EXPECT_MSG_PFX) - 1; + } else if (str_has_pfx(s, TEST_TAG_LOG_LEVEL_PFX)) { + errno = 0; + spec->log_level = strtol(s + sizeof(TEST_TAG_LOG_LEVEL_PFX) - 1, NULL, 0); + if (errno) { + ASSERT_FAIL("failed to parse test log level from '%s'", s); + return -EINVAL; + } + } + } + + return 0; +} + +static void prepare_case(struct test_loader *tester, + struct test_spec *spec, + struct bpf_object *obj, + struct bpf_program *prog) +{ + int min_log_level = 0; + + if (env.verbosity > VERBOSE_NONE) + min_log_level = 1; + if (env.verbosity > VERBOSE_VERY) + min_log_level = 2; + + bpf_program__set_log_buf(prog, tester->log_buf, tester->log_buf_sz); + + /* Make sure we set at least minimal log level, unless test requirest + * even higher level already. Make sure to preserve independent log + * level 4 (verifier stats), though. 
+ */ + if ((spec->log_level & 3) < min_log_level) + bpf_program__set_log_level(prog, (spec->log_level & 4) | min_log_level); + else + bpf_program__set_log_level(prog, spec->log_level); + + tester->log_buf[0] = '\0'; +} + +static void emit_verifier_log(const char *log_buf, bool force) +{ + if (!force && env.verbosity == VERBOSE_NONE) + return; + fprintf(stdout, "VERIFIER LOG:\n=============\n%s=============\n", log_buf); +} + +static void validate_case(struct test_loader *tester, + struct test_spec *spec, + struct bpf_object *obj, + struct bpf_program *prog, + int load_err) +{ + if (spec->expect_msg) { + char *match; + + match = strstr(tester->log_buf, spec->expect_msg); + if (!ASSERT_OK_PTR(match, "expect_msg")) { + /* if we are in verbose mode, we've already emitted log */ + if (env.verbosity == VERBOSE_NONE) + emit_verifier_log(tester->log_buf, true /*force*/); + fprintf(stderr, "EXPECTED MSG: '%s'\n", spec->expect_msg); + return; + } + } +} + +/* this function is forced noinline and has short generic name to look better + * in test_progs output (in case of a failure) + */ +static noinline +void run_subtest(struct test_loader *tester, + const char *skel_name, + skel_elf_bytes_fn elf_bytes_factory) +{ + LIBBPF_OPTS(bpf_object_open_opts, open_opts, .object_name = skel_name); + struct bpf_object *obj = NULL, *tobj; + struct bpf_program *prog, *tprog; + const void *obj_bytes; + size_t obj_byte_cnt; + int err; + + if (tester_init(tester) < 0) + return; /* failed to initialize tester */ + + obj_bytes = elf_bytes_factory(&obj_byte_cnt); + obj = bpf_object__open_mem(obj_bytes, obj_byte_cnt, &open_opts); + if (!ASSERT_OK_PTR(obj, "obj_open_mem")) + return; + + bpf_object__for_each_program(prog, obj) { + const char *prog_name = bpf_program__name(prog); + struct test_spec spec; + + if (!test__start_subtest(prog_name)) + continue; + + /* if we can't derive test specification, go to the next test */ + err = parse_test_spec(tester, obj, prog, &spec); + if (!ASSERT_OK(err, "parse_test_spec")) + continue; + + tobj = bpf_object__open_mem(obj_bytes, obj_byte_cnt, &open_opts); + if (!ASSERT_OK_PTR(tobj, "obj_open_mem")) /* shouldn't happen */ + continue; + + bpf_object__for_each_program(tprog, tobj) + bpf_program__set_autoload(tprog, false); + + bpf_object__for_each_program(tprog, tobj) { + /* only load specified program */ + if (strcmp(bpf_program__name(tprog), prog_name) == 0) { + bpf_program__set_autoload(tprog, true); + break; + } + } + + prepare_case(tester, &spec, tobj, tprog); + + err = bpf_object__load(tobj); + if (spec.expect_failure) { + if (!ASSERT_ERR(err, "unexpected_load_success")) { + emit_verifier_log(tester->log_buf, false /*force*/); + goto tobj_cleanup; + } + } else { + if (!ASSERT_OK(err, "unexpected_load_failure")) { + emit_verifier_log(tester->log_buf, true /*force*/); + goto tobj_cleanup; + } + } + + emit_verifier_log(tester->log_buf, false /*force*/); + validate_case(tester, &spec, tobj, tprog, err); + +tobj_cleanup: + bpf_object__close(tobj); + } + + bpf_object__close(obj); +} + +void test_loader__run_subtests(struct test_loader *tester, + const char *skel_name, + skel_elf_bytes_fn elf_bytes_factory) +{ + /* see comment in run_subtest() for why we do this function nesting */ + run_subtest(tester, skel_name, elf_bytes_factory); +} diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index b090996daee5c..3f058dfadbaf1 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -1,4 +1,7 @@ /* 
SPDX-License-Identifier: GPL-2.0 */ +#ifndef __TEST_PROGS_H +#define __TEST_PROGS_H + #include #include #include @@ -210,6 +213,12 @@ int test__join_cgroup(const char *path); #define CHECK_ATTR(condition, tag, format...) \ _CHECK(condition, tag, tattr.duration, format) +#define ASSERT_FAIL(fmt, args...) ({ \ + static int duration = 0; \ + CHECK(false, "", fmt"\n", ##args); \ + false; \ +}) + #define ASSERT_TRUE(actual, name) ({ \ static int duration = 0; \ bool ___ok = (actual); \ @@ -397,3 +406,27 @@ int write_sysctl(const char *sysctl, const char *value); #endif #define BPF_TESTMOD_TEST_FILE "/sys/kernel/bpf_testmod" + +struct test_loader { + char *log_buf; + size_t log_buf_sz; + + struct bpf_object *obj; +}; + +typedef const void *(*skel_elf_bytes_fn)(size_t *sz); + +extern void test_loader__run_subtests(struct test_loader *tester, + const char *skel_name, + skel_elf_bytes_fn elf_bytes_factory); + +extern void test_loader_fini(struct test_loader *tester); + +#define RUN_TESTS(skel) ({ \ + struct test_loader tester = {}; \ + \ + test_loader__run_subtests(&tester, #skel, skel##__elf_bytes); \ + test_loader_fini(&tester); \ +}) + +#endif /* __TEST_PROGS_H */ From a60edfd774be730ada80ecbab45d73afdd767747 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 7 Dec 2022 12:16:48 -0800 Subject: [PATCH 122/175] selftests/bpf: convert dynptr_fail and map_kptr_fail subtests to generic tester Convert big chunks of dynptr and map_kptr subtests to use generic verification_tester. They are switched from using manually maintained tables of test cases, specifying program name and expected error verifier message, to btf_decl_tag-based annotations directly on corresponding BPF programs: __failure to specify that BPF program is expected to fail verification, and __msg() to specify expected log message. 
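To make the conversion concrete, here is a hedged sketch (the suite name is invented) of what the user-space side reduces to once the expectations live in the BPF source: the hand-maintained table and its verify_fail() helper collapse into a single invocation of the RUN_TESTS() macro added by the previous patch:

void test_some_suite(void)
{
	/* Opens some_suite_fail.bpf.o, runs each program as its own
	 * subtest, and validates its __failure/__msg annotations.
	 */
	RUN_TESTS(some_suite_fail);
}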
Acked-by: John Fastabend Signed-off-by: Andrii Nakryiko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221207201648.2990661-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov [v6.1: replace ringbuf_mem with alloc_mem] Signed-off-by: Shaoying Xu --- .../testing/selftests/bpf/prog_tests/dynptr.c | 80 +------------------ .../selftests/bpf/prog_tests/map_kptr.c | 80 +------------------ .../testing/selftests/bpf/progs/dynptr_fail.c | 31 +++++++ .../selftests/bpf/progs/dynptr_success.c | 1 + .../selftests/bpf/progs/map_kptr_fail.c | 27 +++++++ 5 files changed, 64 insertions(+), 155 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/dynptr.c b/tools/testing/selftests/bpf/prog_tests/dynptr.c index 8fc4e6c02bfda..7faaf6d9e0d47 100644 --- a/tools/testing/selftests/bpf/prog_tests/dynptr.c +++ b/tools/testing/selftests/bpf/prog_tests/dynptr.c @@ -5,86 +5,16 @@ #include "dynptr_fail.skel.h" #include "dynptr_success.skel.h" -static size_t log_buf_sz = 1048576; /* 1 MB */ -static char obj_log_buf[1048576]; - static struct { const char *prog_name; const char *expected_err_msg; } dynptr_tests[] = { - /* failure cases */ - {"ringbuf_missing_release1", "Unreleased reference id=1"}, - {"ringbuf_missing_release2", "Unreleased reference id=2"}, - {"ringbuf_missing_release_callback", "Unreleased reference id"}, - {"use_after_invalid", "Expected an initialized dynptr as arg #3"}, - {"ringbuf_invalid_api", "type=mem expected=alloc_mem"}, - {"add_dynptr_to_map1", "invalid indirect read from stack"}, - {"add_dynptr_to_map2", "invalid indirect read from stack"}, - {"data_slice_out_of_bounds_ringbuf", "value is outside of the allowed memory range"}, - {"data_slice_out_of_bounds_map_value", "value is outside of the allowed memory range"}, - {"data_slice_use_after_release1", "invalid mem access 'scalar'"}, - {"data_slice_use_after_release2", "invalid mem access 'scalar'"}, - {"data_slice_missing_null_check1", "invalid mem access 'mem_or_null'"}, - {"data_slice_missing_null_check2", "invalid mem access 'mem_or_null'"}, - {"invalid_helper1", "invalid indirect read from stack"}, - {"invalid_helper2", "Expected an initialized dynptr as arg #3"}, - {"invalid_write1", "Expected an initialized dynptr as arg #1"}, - {"invalid_write2", "Expected an initialized dynptr as arg #3"}, - {"invalid_write3", "Expected an initialized dynptr as arg #1"}, - {"invalid_write4", "arg 1 is an unacquired reference"}, - {"invalid_read1", "invalid read from stack"}, - {"invalid_read2", "cannot pass in dynptr at an offset"}, - {"invalid_read3", "invalid read from stack"}, - {"invalid_read4", "invalid read from stack"}, - {"invalid_offset", "invalid write to stack"}, - {"global", "type=map_value expected=fp"}, - {"release_twice", "arg 1 is an unacquired reference"}, - {"release_twice_callback", "arg 1 is an unacquired reference"}, - {"dynptr_from_mem_invalid_api", - "Unsupported reg type fp for bpf_dynptr_from_mem data"}, - /* success cases */ {"test_read_write", NULL}, {"test_data_slice", NULL}, {"test_ringbuf", NULL}, }; -static void verify_fail(const char *prog_name, const char *expected_err_msg) -{ - LIBBPF_OPTS(bpf_object_open_opts, opts); - struct bpf_program *prog; - struct dynptr_fail *skel; - int err; - - opts.kernel_log_buf = obj_log_buf; - opts.kernel_log_size = log_buf_sz; - opts.kernel_log_level = 1; - - skel = dynptr_fail__open_opts(&opts); - if (!ASSERT_OK_PTR(skel, "dynptr_fail__open_opts")) - goto cleanup; - - prog = bpf_object__find_program_by_name(skel->obj, prog_name); - if 
(!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) - goto cleanup; - - bpf_program__set_autoload(prog, true); - - bpf_map__set_max_entries(skel->maps.ringbuf, getpagesize()); - - err = dynptr_fail__load(skel); - if (!ASSERT_ERR(err, "unexpected load success")) - goto cleanup; - - if (!ASSERT_OK_PTR(strstr(obj_log_buf, expected_err_msg), "expected_err_msg")) { - fprintf(stderr, "Expected err_msg: %s\n", expected_err_msg); - fprintf(stderr, "Verifier output: %s\n", obj_log_buf); - } - -cleanup: - dynptr_fail__destroy(skel); -} - static void verify_success(const char *prog_name) { struct dynptr_success *skel; @@ -97,8 +27,6 @@ static void verify_success(const char *prog_name) skel->bss->pid = getpid(); - bpf_map__set_max_entries(skel->maps.ringbuf, getpagesize()); - dynptr_success__load(skel); if (!ASSERT_OK_PTR(skel, "dynptr_success__load")) goto cleanup; @@ -129,10 +57,8 @@ void test_dynptr(void) if (!test__start_subtest(dynptr_tests[i].prog_name)) continue; - if (dynptr_tests[i].expected_err_msg) - verify_fail(dynptr_tests[i].prog_name, - dynptr_tests[i].expected_err_msg); - else - verify_success(dynptr_tests[i].prog_name); + verify_success(dynptr_tests[i].prog_name); } + + RUN_TESTS(dynptr_fail); } diff --git a/tools/testing/selftests/bpf/prog_tests/map_kptr.c b/tools/testing/selftests/bpf/prog_tests/map_kptr.c index 0d66b15242089..3533a4ecad018 100644 --- a/tools/testing/selftests/bpf/prog_tests/map_kptr.c +++ b/tools/testing/selftests/bpf/prog_tests/map_kptr.c @@ -5,83 +5,6 @@ #include "map_kptr.skel.h" #include "map_kptr_fail.skel.h" -static char log_buf[1024 * 1024]; - -struct { - const char *prog_name; - const char *err_msg; -} map_kptr_fail_tests[] = { - { "size_not_bpf_dw", "kptr access size must be BPF_DW" }, - { "non_const_var_off", "kptr access cannot have variable offset" }, - { "non_const_var_off_kptr_xchg", "R1 doesn't have constant offset. 
kptr has to be" }, - { "misaligned_access_write", "kptr access misaligned expected=8 off=7" }, - { "misaligned_access_read", "kptr access misaligned expected=8 off=1" }, - { "reject_var_off_store", "variable untrusted_ptr_ access var_off=(0x0; 0x1e0)" }, - { "reject_bad_type_match", "invalid kptr access, R1 type=untrusted_ptr_prog_test_ref_kfunc" }, - { "marked_as_untrusted_or_null", "R1 type=untrusted_ptr_or_null_ expected=percpu_ptr_" }, - { "correct_btf_id_check_size", "access beyond struct prog_test_ref_kfunc at off 32 size 4" }, - { "inherit_untrusted_on_walk", "R1 type=untrusted_ptr_ expected=percpu_ptr_" }, - { "reject_kptr_xchg_on_unref", "off=8 kptr isn't referenced kptr" }, - { "reject_kptr_get_no_map_val", "arg#0 expected pointer to map value" }, - { "reject_kptr_get_no_null_map_val", "arg#0 expected pointer to map value" }, - { "reject_kptr_get_no_kptr", "arg#0 no referenced kptr at map value offset=0" }, - { "reject_kptr_get_on_unref", "arg#0 no referenced kptr at map value offset=8" }, - { "reject_kptr_get_bad_type_match", "kernel function bpf_kfunc_call_test_kptr_get args#0" }, - { "mark_ref_as_untrusted_or_null", "R1 type=untrusted_ptr_or_null_ expected=percpu_ptr_" }, - { "reject_untrusted_store_to_ref", "store to referenced kptr disallowed" }, - { "reject_bad_type_xchg", "invalid kptr access, R2 type=ptr_prog_test_ref_kfunc expected=ptr_prog_test_member" }, - { "reject_untrusted_xchg", "R2 type=untrusted_ptr_ expected=ptr_" }, - { "reject_member_of_ref_xchg", "invalid kptr access, R2 type=ptr_prog_test_ref_kfunc" }, - { "reject_indirect_helper_access", "kptr cannot be accessed indirectly by helper" }, - { "reject_indirect_global_func_access", "kptr cannot be accessed indirectly by helper" }, - { "kptr_xchg_ref_state", "Unreleased reference id=5 alloc_insn=" }, - { "kptr_get_ref_state", "Unreleased reference id=3 alloc_insn=" }, -}; - -static void test_map_kptr_fail_prog(const char *prog_name, const char *err_msg) -{ - LIBBPF_OPTS(bpf_object_open_opts, opts, .kernel_log_buf = log_buf, - .kernel_log_size = sizeof(log_buf), - .kernel_log_level = 1); - struct map_kptr_fail *skel; - struct bpf_program *prog; - int ret; - - skel = map_kptr_fail__open_opts(&opts); - if (!ASSERT_OK_PTR(skel, "map_kptr_fail__open_opts")) - return; - - prog = bpf_object__find_program_by_name(skel->obj, prog_name); - if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) - goto end; - - bpf_program__set_autoload(prog, true); - - ret = map_kptr_fail__load(skel); - if (!ASSERT_ERR(ret, "map_kptr__load must fail")) - goto end; - - if (!ASSERT_OK_PTR(strstr(log_buf, err_msg), "expected error message")) { - fprintf(stderr, "Expected: %s\n", err_msg); - fprintf(stderr, "Verifier: %s\n", log_buf); - } - -end: - map_kptr_fail__destroy(skel); -} - -static void test_map_kptr_fail(void) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(map_kptr_fail_tests); i++) { - if (!test__start_subtest(map_kptr_fail_tests[i].prog_name)) - continue; - test_map_kptr_fail_prog(map_kptr_fail_tests[i].prog_name, - map_kptr_fail_tests[i].err_msg); - } -} - static void test_map_kptr_success(bool test_run) { LIBBPF_OPTS(bpf_test_run_opts, opts, @@ -145,5 +68,6 @@ void test_map_kptr(void) */ test_map_kptr_success(true); } - test_map_kptr_fail(); + + RUN_TESTS(map_kptr_fail); } diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index b0f08ff024fb8..3774cef5639b0 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ 
b/tools/testing/selftests/bpf/progs/dynptr_fail.c @@ -43,6 +43,7 @@ struct sample { struct { __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 4096); } ringbuf SEC(".maps"); int err, val; @@ -66,6 +67,7 @@ static int get_map_val_dynptr(struct bpf_dynptr *ptr) * bpf_ringbuf_submit/discard_dynptr call */ SEC("?raw_tp") +__failure __msg("Unreleased reference id=1") int ringbuf_missing_release1(void *ctx) { struct bpf_dynptr ptr; @@ -78,6 +80,7 @@ int ringbuf_missing_release1(void *ctx) } SEC("?raw_tp") +__failure __msg("Unreleased reference id=2") int ringbuf_missing_release2(void *ctx) { struct bpf_dynptr ptr1, ptr2; @@ -113,6 +116,7 @@ static int missing_release_callback_fn(__u32 index, void *data) /* Any dynptr initialized within a callback must have bpf_dynptr_put called */ SEC("?raw_tp") +__failure __msg("Unreleased reference id") int ringbuf_missing_release_callback(void *ctx) { bpf_loop(10, missing_release_callback_fn, NULL, 0); @@ -121,6 +125,7 @@ int ringbuf_missing_release_callback(void *ctx) /* Can't call bpf_ringbuf_submit/discard_dynptr on a non-initialized dynptr */ SEC("?raw_tp") +__failure __msg("arg 1 is an unacquired reference") int ringbuf_release_uninit_dynptr(void *ctx) { struct bpf_dynptr ptr; @@ -133,6 +138,7 @@ int ringbuf_release_uninit_dynptr(void *ctx) /* A dynptr can't be used after it has been invalidated */ SEC("?raw_tp") +__failure __msg("Expected an initialized dynptr as arg #3") int use_after_invalid(void *ctx) { struct bpf_dynptr ptr; @@ -152,6 +158,7 @@ int use_after_invalid(void *ctx) /* Can't call non-dynptr ringbuf APIs on a dynptr ringbuf sample */ SEC("?raw_tp") +__failure __msg("type=mem expected=alloc_mem") int ringbuf_invalid_api(void *ctx) { struct bpf_dynptr ptr; @@ -174,6 +181,7 @@ int ringbuf_invalid_api(void *ctx) /* Can't add a dynptr to a map */ SEC("?raw_tp") +__failure __msg("invalid indirect read from stack") int add_dynptr_to_map1(void *ctx) { struct bpf_dynptr ptr; @@ -191,6 +199,7 @@ int add_dynptr_to_map1(void *ctx) /* Can't add a struct with an embedded dynptr to a map */ SEC("?raw_tp") +__failure __msg("invalid indirect read from stack") int add_dynptr_to_map2(void *ctx) { struct test_info x; @@ -208,6 +217,7 @@ int add_dynptr_to_map2(void *ctx) /* A data slice can't be accessed out of bounds */ SEC("?raw_tp") +__failure __msg("value is outside of the allowed memory range") int data_slice_out_of_bounds_ringbuf(void *ctx) { struct bpf_dynptr ptr; @@ -228,6 +238,7 @@ int data_slice_out_of_bounds_ringbuf(void *ctx) } SEC("?raw_tp") +__failure __msg("value is outside of the allowed memory range") int data_slice_out_of_bounds_map_value(void *ctx) { __u32 key = 0, map_val; @@ -248,6 +259,7 @@ int data_slice_out_of_bounds_map_value(void *ctx) /* A data slice can't be used after it has been released */ SEC("?raw_tp") +__failure __msg("invalid mem access 'scalar'") int data_slice_use_after_release1(void *ctx) { struct bpf_dynptr ptr; @@ -279,6 +291,7 @@ int data_slice_use_after_release1(void *ctx) * ptr2 is at fp - 16). 
*/ SEC("?raw_tp") +__failure __msg("invalid mem access 'scalar'") int data_slice_use_after_release2(void *ctx) { struct bpf_dynptr ptr1, ptr2; @@ -310,6 +323,7 @@ int data_slice_use_after_release2(void *ctx) /* A data slice must be first checked for NULL */ SEC("?raw_tp") +__failure __msg("invalid mem access 'mem_or_null'") int data_slice_missing_null_check1(void *ctx) { struct bpf_dynptr ptr; @@ -330,6 +344,7 @@ int data_slice_missing_null_check1(void *ctx) /* A data slice can't be dereferenced if it wasn't checked for null */ SEC("?raw_tp") +__failure __msg("invalid mem access 'mem_or_null'") int data_slice_missing_null_check2(void *ctx) { struct bpf_dynptr ptr; @@ -352,6 +367,7 @@ int data_slice_missing_null_check2(void *ctx) * dynptr argument */ SEC("?raw_tp") +__failure __msg("invalid indirect read from stack") int invalid_helper1(void *ctx) { struct bpf_dynptr ptr; @@ -366,6 +382,7 @@ int invalid_helper1(void *ctx) /* A dynptr can't be passed into a helper function at a non-zero offset */ SEC("?raw_tp") +__failure __msg("Expected an initialized dynptr as arg #3") int invalid_helper2(void *ctx) { struct bpf_dynptr ptr; @@ -381,6 +398,7 @@ int invalid_helper2(void *ctx) /* A bpf_dynptr is invalidated if it's been written into */ SEC("?raw_tp") +__failure __msg("Expected an initialized dynptr as arg #1") int invalid_write1(void *ctx) { struct bpf_dynptr ptr; @@ -402,6 +420,7 @@ int invalid_write1(void *ctx) * offset */ SEC("?raw_tp") +__failure __msg("Expected an initialized dynptr as arg #3") int invalid_write2(void *ctx) { struct bpf_dynptr ptr; @@ -425,6 +444,7 @@ int invalid_write2(void *ctx) * non-const offset */ SEC("?raw_tp") +__failure __msg("Expected an initialized dynptr as arg #1") int invalid_write3(void *ctx) { struct bpf_dynptr ptr; @@ -456,6 +476,7 @@ static int invalid_write4_callback(__u32 index, void *data) * be invalidated as a dynptr */ SEC("?raw_tp") +__failure __msg("arg 1 is an unacquired reference") int invalid_write4(void *ctx) { struct bpf_dynptr ptr; @@ -472,7 +493,9 @@ int invalid_write4(void *ctx) /* A globally-defined bpf_dynptr can't be used (it must reside as a stack frame) */ struct bpf_dynptr global_dynptr; + SEC("?raw_tp") +__failure __msg("type=map_value expected=fp") int global(void *ctx) { /* this should fail */ @@ -485,6 +508,7 @@ int global(void *ctx) /* A direct read should fail */ SEC("?raw_tp") +__failure __msg("invalid read from stack") int invalid_read1(void *ctx) { struct bpf_dynptr ptr; @@ -501,6 +525,7 @@ int invalid_read1(void *ctx) /* A direct read at an offset should fail */ SEC("?raw_tp") +__failure __msg("cannot pass in dynptr at an offset") int invalid_read2(void *ctx) { struct bpf_dynptr ptr; @@ -516,6 +541,7 @@ int invalid_read2(void *ctx) /* A direct read at an offset into the lower stack slot should fail */ SEC("?raw_tp") +__failure __msg("invalid read from stack") int invalid_read3(void *ctx) { struct bpf_dynptr ptr1, ptr2; @@ -542,6 +568,7 @@ static int invalid_read4_callback(__u32 index, void *data) /* A direct read within a callback function should fail */ SEC("?raw_tp") +__failure __msg("invalid read from stack") int invalid_read4(void *ctx) { struct bpf_dynptr ptr; @@ -557,6 +584,7 @@ int invalid_read4(void *ctx) /* Initializing a dynptr on an offset should fail */ SEC("?raw_tp") +__failure __msg("invalid write to stack") int invalid_offset(void *ctx) { struct bpf_dynptr ptr; @@ -571,6 +599,7 @@ int invalid_offset(void *ctx) /* Can't release a dynptr twice */ SEC("?raw_tp") +__failure __msg("arg 1 is an unacquired 
reference") int release_twice(void *ctx) { struct bpf_dynptr ptr; @@ -597,6 +626,7 @@ static int release_twice_callback_fn(__u32 index, void *data) * within a calback function, fails */ SEC("?raw_tp") +__failure __msg("arg 1 is an unacquired reference") int release_twice_callback(void *ctx) { struct bpf_dynptr ptr; @@ -612,6 +642,7 @@ int release_twice_callback(void *ctx) /* Reject unsupported local mem types for dynptr_from_mem API */ SEC("?raw_tp") +__failure __msg("Unsupported reg type fp for bpf_dynptr_from_mem data") int dynptr_from_mem_invalid_api(void *ctx) { struct bpf_dynptr ptr; diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c index a3a6103c85694..35db7c6c1fc74 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_success.c +++ b/tools/testing/selftests/bpf/progs/dynptr_success.c @@ -20,6 +20,7 @@ struct sample { struct { __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 4096); } ringbuf SEC(".maps"); struct { diff --git a/tools/testing/selftests/bpf/progs/map_kptr_fail.c b/tools/testing/selftests/bpf/progs/map_kptr_fail.c index 05e209b1b12aa..760e41e1a6326 100644 --- a/tools/testing/selftests/bpf/progs/map_kptr_fail.c +++ b/tools/testing/selftests/bpf/progs/map_kptr_fail.c @@ -3,6 +3,7 @@ #include #include #include +#include "bpf_misc.h" struct map_value { char buf[8]; @@ -23,6 +24,7 @@ extern struct prog_test_ref_kfunc * bpf_kfunc_call_test_kptr_get(struct prog_test_ref_kfunc **p, int a, int b) __ksym; SEC("?tc") +__failure __msg("kptr access size must be BPF_DW") int size_not_bpf_dw(struct __sk_buff *ctx) { struct map_value *v; @@ -37,6 +39,7 @@ int size_not_bpf_dw(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("kptr access cannot have variable offset") int non_const_var_off(struct __sk_buff *ctx) { struct map_value *v; @@ -55,6 +58,7 @@ int non_const_var_off(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("R1 doesn't have constant offset. 
kptr has to be") int non_const_var_off_kptr_xchg(struct __sk_buff *ctx) { struct map_value *v; @@ -73,6 +77,7 @@ int non_const_var_off_kptr_xchg(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("kptr access misaligned expected=8 off=7") int misaligned_access_write(struct __sk_buff *ctx) { struct map_value *v; @@ -88,6 +93,7 @@ int misaligned_access_write(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("kptr access misaligned expected=8 off=1") int misaligned_access_read(struct __sk_buff *ctx) { struct map_value *v; @@ -101,6 +107,7 @@ int misaligned_access_read(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("variable untrusted_ptr_ access var_off=(0x0; 0x1e0)") int reject_var_off_store(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *unref_ptr; @@ -124,6 +131,7 @@ int reject_var_off_store(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("invalid kptr access, R1 type=untrusted_ptr_prog_test_ref_kfunc") int reject_bad_type_match(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *unref_ptr; @@ -144,6 +152,7 @@ int reject_bad_type_match(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("R1 type=untrusted_ptr_or_null_ expected=percpu_ptr_") int marked_as_untrusted_or_null(struct __sk_buff *ctx) { struct map_value *v; @@ -158,6 +167,7 @@ int marked_as_untrusted_or_null(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("access beyond struct prog_test_ref_kfunc at off 32 size 4") int correct_btf_id_check_size(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *p; @@ -175,6 +185,7 @@ int correct_btf_id_check_size(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("R1 type=untrusted_ptr_ expected=percpu_ptr_") int inherit_untrusted_on_walk(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *unref_ptr; @@ -194,6 +205,7 @@ int inherit_untrusted_on_walk(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("off=8 kptr isn't referenced kptr") int reject_kptr_xchg_on_unref(struct __sk_buff *ctx) { struct map_value *v; @@ -208,6 +220,7 @@ int reject_kptr_xchg_on_unref(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("arg#0 expected pointer to map value") int reject_kptr_get_no_map_val(struct __sk_buff *ctx) { bpf_kfunc_call_test_kptr_get((void *)&ctx, 0, 0); @@ -215,6 +228,7 @@ int reject_kptr_get_no_map_val(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("arg#0 expected pointer to map value") int reject_kptr_get_no_null_map_val(struct __sk_buff *ctx) { bpf_kfunc_call_test_kptr_get(bpf_map_lookup_elem(&array_map, &(int){0}), 0, 0); @@ -222,6 +236,7 @@ int reject_kptr_get_no_null_map_val(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("arg#0 no referenced kptr at map value offset=0") int reject_kptr_get_no_kptr(struct __sk_buff *ctx) { struct map_value *v; @@ -236,6 +251,7 @@ int reject_kptr_get_no_kptr(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("arg#0 no referenced kptr at map value offset=8") int reject_kptr_get_on_unref(struct __sk_buff *ctx) { struct map_value *v; @@ -250,6 +266,7 @@ int reject_kptr_get_on_unref(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("kernel function bpf_kfunc_call_test_kptr_get args#0") int reject_kptr_get_bad_type_match(struct __sk_buff *ctx) { struct map_value *v; @@ -264,6 +281,7 @@ int reject_kptr_get_bad_type_match(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("R1 type=untrusted_ptr_or_null_ expected=percpu_ptr_") int mark_ref_as_untrusted_or_null(struct __sk_buff *ctx) { struct map_value *v; @@ -278,6 +296,7 @@ int mark_ref_as_untrusted_or_null(struct __sk_buff *ctx) } SEC("?tc") +__failure 
__msg("store to referenced kptr disallowed") int reject_untrusted_store_to_ref(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *p; @@ -297,6 +316,7 @@ int reject_untrusted_store_to_ref(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("R2 type=untrusted_ptr_ expected=ptr_") int reject_untrusted_xchg(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *p; @@ -315,6 +335,8 @@ int reject_untrusted_xchg(struct __sk_buff *ctx) } SEC("?tc") +__failure +__msg("invalid kptr access, R2 type=ptr_prog_test_ref_kfunc expected=ptr_prog_test_member") int reject_bad_type_xchg(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *ref_ptr; @@ -333,6 +355,7 @@ int reject_bad_type_xchg(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("invalid kptr access, R2 type=ptr_prog_test_ref_kfunc") int reject_member_of_ref_xchg(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *ref_ptr; @@ -351,6 +374,7 @@ int reject_member_of_ref_xchg(struct __sk_buff *ctx) } SEC("?syscall") +__failure __msg("kptr cannot be accessed indirectly by helper") int reject_indirect_helper_access(struct __sk_buff *ctx) { struct map_value *v; @@ -371,6 +395,7 @@ int write_func(int *p) } SEC("?tc") +__failure __msg("kptr cannot be accessed indirectly by helper") int reject_indirect_global_func_access(struct __sk_buff *ctx) { struct map_value *v; @@ -384,6 +409,7 @@ int reject_indirect_global_func_access(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("Unreleased reference id=5 alloc_insn=") int kptr_xchg_ref_state(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *p; @@ -402,6 +428,7 @@ int kptr_xchg_ref_state(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("Unreleased reference id=3 alloc_insn=") int kptr_get_ref_state(struct __sk_buff *ctx) { struct map_value *v; From fdeee9e993ed99ee640212e9d41c2977378eadae Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Sat, 21 Jan 2023 05:52:37 +0530 Subject: [PATCH 123/175] selftests/bpf: convenience macro for use with 'asm volatile' blocks A set of macros useful for writing naked BPF functions using inline assembly. E.g. as follows: struct map_struct { ... } map SEC(".maps"); SEC(...) 
__naked int foo_test(void) { asm volatile( "r0 = 0;" "*(u64*)(r10 - 8) = r0;" "r1 = %[map] ll;" "r2 = r10;" "r2 += -8;" "call %[bpf_map_lookup_elem];" "r0 = 0;" "exit;" : : __imm(bpf_map_lookup_elem), __imm_addr(map) : __clobber_all); } Acked-by: Andrii Nakryiko Acked-by: Yonghong Song Signed-off-by: Eduard Zingerman [ Kartikeya: Add acks, include __clobber_common from Andrii ] Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230121002241.2113993-9-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/bpf_misc.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h index 4a01ea9113bfd..2d7b89b447b27 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -7,6 +7,13 @@ #define __success __attribute__((btf_decl_tag("comment:test_expect_success"))) #define __log_level(lvl) __attribute__((btf_decl_tag("comment:test_log_level="#lvl))) +/* Convenience macro for use with 'asm volatile' blocks */ +#define __naked __attribute__((naked)) +#define __clobber_all "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "memory" +#define __clobber_common "r0", "r1", "r2", "r3", "r4", "r5", "memory" +#define __imm(name) [name]"i"(name) +#define __imm_addr(name) [name]"i"(&name) + #if defined(__TARGET_ARCH_x86) #define SYSCALL_WRAPPER 1 #define SYS_PREFIX "__x64_" From 324b2f420bed0975afa0d7a8058dd796be96f175 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 21 Jan 2023 05:52:38 +0530 Subject: [PATCH 124/175] selftests/bpf: Add dynptr pruning tests Add verifier tests that verify the new pruning behavior for STACK_DYNPTR slots, and ensure that state equivalence takes into account changes to the old and current verifier state correctly. Also ensure that the stacksafe changes are actually enabling pruning in case states are equivalent from pruning PoV. 
Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230121002241.2113993-10-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/progs/dynptr_fail.c | 141 ++++++++++++++++++ 1 file changed, 141 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index 3774cef5639b0..20c7c8b5c3adf 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c @@ -35,6 +35,13 @@ struct { __type(value, __u32); } array_map3 SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u64); +} array_map4 SEC(".maps"); + struct sample { int pid; long value; @@ -653,3 +660,137 @@ int dynptr_from_mem_invalid_api(void *ctx) return 0; } + +SEC("?tc") +__failure __msg("cannot overwrite referenced dynptr") __log_level(2) +int dynptr_pruning_overwrite(struct __sk_buff *ctx) +{ + asm volatile ( + "r9 = 0xeB9F; \ + r6 = %[ringbuf] ll; \ + r1 = r6; \ + r2 = 8; \ + r3 = 0; \ + r4 = r10; \ + r4 += -16; \ + call %[bpf_ringbuf_reserve_dynptr]; \ + if r0 == 0 goto pjmp1; \ + goto pjmp2; \ + pjmp1: \ + *(u64 *)(r10 - 16) = r9; \ + pjmp2: \ + r1 = r10; \ + r1 += -16; \ + r2 = 0; \ + call %[bpf_ringbuf_discard_dynptr]; " + : + : __imm(bpf_ringbuf_reserve_dynptr), + __imm(bpf_ringbuf_discard_dynptr), + __imm_addr(ringbuf) + : __clobber_all + ); + return 0; +} + +SEC("?tc") +__success __msg("12: safe") __log_level(2) +int dynptr_pruning_stacksafe(struct __sk_buff *ctx) +{ + asm volatile ( + "r9 = 0xeB9F; \ + r6 = %[ringbuf] ll; \ + r1 = r6; \ + r2 = 8; \ + r3 = 0; \ + r4 = r10; \ + r4 += -16; \ + call %[bpf_ringbuf_reserve_dynptr]; \ + if r0 == 0 goto stjmp1; \ + goto stjmp2; \ + stjmp1: \ + r9 = r9; \ + stjmp2: \ + r1 = r10; \ + r1 += -16; \ + r2 = 0; \ + call %[bpf_ringbuf_discard_dynptr]; " + : + : __imm(bpf_ringbuf_reserve_dynptr), + __imm(bpf_ringbuf_discard_dynptr), + __imm_addr(ringbuf) + : __clobber_all + ); + return 0; +} + +SEC("?tc") +__failure __msg("cannot overwrite referenced dynptr") __log_level(2) +int dynptr_pruning_type_confusion(struct __sk_buff *ctx) +{ + asm volatile ( + "r6 = %[array_map4] ll; \ + r7 = %[ringbuf] ll; \ + r1 = r6; \ + r2 = r10; \ + r2 += -8; \ + r9 = 0; \ + *(u64 *)(r2 + 0) = r9; \ + r3 = r10; \ + r3 += -24; \ + r9 = 0xeB9FeB9F; \ + *(u64 *)(r10 - 16) = r9; \ + *(u64 *)(r10 - 24) = r9; \ + r9 = 0; \ + r4 = 0; \ + r8 = r2; \ + call %[bpf_map_update_elem]; \ + r1 = r6; \ + r2 = r8; \ + call %[bpf_map_lookup_elem]; \ + if r0 != 0 goto tjmp1; \ + exit; \ + tjmp1: \ + r8 = r0; \ + r1 = r7; \ + r2 = 8; \ + r3 = 0; \ + r4 = r10; \ + r4 += -16; \ + r0 = *(u64 *)(r0 + 0); \ + call %[bpf_ringbuf_reserve_dynptr]; \ + if r0 == 0 goto tjmp2; \ + r8 = r8; \ + r8 = r8; \ + r8 = r8; \ + r8 = r8; \ + r8 = r8; \ + r8 = r8; \ + r8 = r8; \ + goto tjmp3; \ + tjmp2: \ + *(u64 *)(r10 - 8) = r9; \ + *(u64 *)(r10 - 16) = r9; \ + r1 = r8; \ + r1 += 8; \ + r2 = 0; \ + r3 = 0; \ + r4 = r10; \ + r4 += -16; \ + call %[bpf_dynptr_from_mem]; \ + tjmp3: \ + r1 = r10; \ + r1 += -16; \ + r2 = 0; \ + call %[bpf_ringbuf_discard_dynptr]; " + : + : __imm(bpf_map_update_elem), + __imm(bpf_map_lookup_elem), + __imm(bpf_ringbuf_reserve_dynptr), + __imm(bpf_dynptr_from_mem), + __imm(bpf_ringbuf_discard_dynptr), + __imm_addr(array_map4), + __imm_addr(ringbuf) + : __clobber_all + ); + return 0; +} From bd5606f91cae3840c524f6dda9575cbda459cd35 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: 
Sat, 21 Jan 2023 05:52:39 +0530 Subject: [PATCH 125/175] selftests/bpf: Add dynptr var_off tests Ensure that variable offset is handled correctly, and verifier takes both fixed and variable part into account. Also ensures that only constant var_off is allowed. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230121002241.2113993-11-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/progs/dynptr_fail.c | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index 20c7c8b5c3adf..0b9488a72bddf 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c @@ -794,3 +794,43 @@ int dynptr_pruning_type_confusion(struct __sk_buff *ctx) ); return 0; } + +SEC("?tc") +__failure __msg("dynptr has to be at a constant offset") __log_level(2) +int dynptr_var_off_overwrite(struct __sk_buff *ctx) +{ + asm volatile ( + "r9 = 16; \ + *(u32 *)(r10 - 4) = r9; \ + r8 = *(u32 *)(r10 - 4); \ + if r8 >= 0 goto vjmp1; \ + r0 = 1; \ + exit; \ + vjmp1: \ + if r8 <= 16 goto vjmp2; \ + r0 = 1; \ + exit; \ + vjmp2: \ + r8 &= 16; \ + r1 = %[ringbuf] ll; \ + r2 = 8; \ + r3 = 0; \ + r4 = r10; \ + r4 += -32; \ + r4 += r8; \ + call %[bpf_ringbuf_reserve_dynptr]; \ + r9 = 0xeB9F; \ + *(u64 *)(r10 - 16) = r9; \ + r1 = r10; \ + r1 += -32; \ + r1 += r8; \ + r2 = 0; \ + call %[bpf_ringbuf_discard_dynptr]; " + : + : __imm(bpf_ringbuf_reserve_dynptr), + __imm(bpf_ringbuf_discard_dynptr), + __imm_addr(ringbuf) + : __clobber_all + ); + return 0; +} From 40722d8763c48713583734119536fca93aa7078a Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 21 Jan 2023 05:52:40 +0530 Subject: [PATCH 126/175] selftests/bpf: Add dynptr partial slot overwrite tests Try creating a dynptr, then overwriting second slot with first slot of another dynptr. Then, the first slot of first dynptr should also be invalidated, but without our fix that does not happen. As a consequence, the unfixed case allows passing first dynptr (as the kernel check only checks for slot_type and then first_slot == true). 
Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230121002241.2113993-12-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/progs/dynptr_fail.c | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index 0b9488a72bddf..b13aeea78d13a 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c @@ -834,3 +834,69 @@ int dynptr_var_off_overwrite(struct __sk_buff *ctx) ); return 0; } + +SEC("?tc") +__failure __msg("cannot overwrite referenced dynptr") __log_level(2) +int dynptr_partial_slot_invalidate(struct __sk_buff *ctx) +{ + asm volatile ( + "r6 = %[ringbuf] ll; \ + r7 = %[array_map4] ll; \ + r1 = r7; \ + r2 = r10; \ + r2 += -8; \ + r9 = 0; \ + *(u64 *)(r2 + 0) = r9; \ + r3 = r2; \ + r4 = 0; \ + r8 = r2; \ + call %[bpf_map_update_elem]; \ + r1 = r7; \ + r2 = r8; \ + call %[bpf_map_lookup_elem]; \ + if r0 != 0 goto sjmp1; \ + exit; \ + sjmp1: \ + r7 = r0; \ + r1 = r6; \ + r2 = 8; \ + r3 = 0; \ + r4 = r10; \ + r4 += -24; \ + call %[bpf_ringbuf_reserve_dynptr]; \ + *(u64 *)(r10 - 16) = r9; \ + r1 = r7; \ + r2 = 8; \ + r3 = 0; \ + r4 = r10; \ + r4 += -16; \ + call %[bpf_dynptr_from_mem]; \ + r1 = r10; \ + r1 += -512; \ + r2 = 488; \ + r3 = r10; \ + r3 += -24; \ + r4 = 0; \ + r5 = 0; \ + call %[bpf_dynptr_read]; \ + r8 = 1; \ + if r0 != 0 goto sjmp2; \ + r8 = 0; \ + sjmp2: \ + r1 = r10; \ + r1 += -24; \ + r2 = 0; \ + call %[bpf_ringbuf_discard_dynptr]; " + : + : __imm(bpf_map_update_elem), + __imm(bpf_map_lookup_elem), + __imm(bpf_ringbuf_reserve_dynptr), + __imm(bpf_ringbuf_discard_dynptr), + __imm(bpf_dynptr_from_mem), + __imm(bpf_dynptr_read), + __imm_addr(ringbuf), + __imm_addr(array_map4) + : __clobber_all + ); + return 0; +} From fcc162723b651c199d1ae99af7a5abc484b55280 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 8 Dec 2022 02:11:35 +0530 Subject: [PATCH 127/175] bpf: Refactor ARG_PTR_TO_DYNPTR checks into process_dynptr_func ARG_PTR_TO_DYNPTR is akin to ARG_PTR_TO_TIMER, ARG_PTR_TO_KPTR, where the underlying register type is subjected to more special checks to determine the type of object represented by the pointer and its state consistency. Move dynptr checks to their own 'process_dynptr_func' function so that is consistent and in-line with existing code. This also makes it easier to reuse this code for kfunc handling. Then, reuse this consolidated function in kfunc dynptr handling too. Note that for kfuncs, the arg_type constraint of DYNPTR_TYPE_LOCAL has been lifted. 
Acked-by: David Vernet Acked-by: Joanne Koong Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221207204141.308952-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov [v6.1: move changes in check_kfunc_args() in verifier.c to btf_check_func_arg_match() in btf.c] Signed-off-by: Shaoying Xu --- include/linux/bpf_verifier.h | 8 +- kernel/bpf/btf.c | 24 +--- kernel/bpf/verifier.c | 115 ++++++++++-------- .../bpf/prog_tests/kfunc_dynptr_param.c | 7 +- .../bpf/progs/test_kfunc_dynptr_param.c | 12 -- 5 files changed, 75 insertions(+), 91 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index f080ccf27d256..35cc34a7a625e 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -594,11 +594,9 @@ int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state u32 regno); int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size); -bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, - struct bpf_reg_state *reg); -bool is_dynptr_type_expected(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, - enum bpf_arg_type arg_type); +struct bpf_call_arg_meta; +int process_dynptr_func(struct bpf_verifier_env *env, int regno, + enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta); /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 7582ec4fd4131..2875ac2c694ee 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6491,29 +6491,17 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, } if (arg_dynptr) { - if (reg->type != PTR_TO_STACK) { - bpf_log(log, "arg#%d pointer type %s %s not to stack\n", + if (reg->type != PTR_TO_STACK && + reg->type != PTR_TO_DYNPTR) { + bpf_log(log, "arg#%d pointer type %s %s not to stack or dynptr_ptr\n", i, btf_type_str(ref_t), ref_tname); return -EINVAL; } - if (!is_dynptr_reg_valid_init(env, reg)) { - bpf_log(log, - "arg#%d pointer type %s %s must be valid and initialized\n", - i, btf_type_str(ref_t), - ref_tname); - return -EINVAL; - } - - if (!is_dynptr_type_expected(env, reg, - ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL)) { - bpf_log(log, - "arg#%d pointer type %s %s points to unsupported dynamic pointer type\n", - i, btf_type_str(ref_t), - ref_tname); - return -EINVAL; - } + ret = process_dynptr_func(env, regno, ARG_PTR_TO_DYNPTR, NULL); + if (ret < 0) + return ret; continue; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 56a5c8beb553d..aae653c04f773 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -791,8 +791,7 @@ static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_ return true; } -bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, - struct bpf_reg_state *reg) +static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); int spi = get_spi(reg->off); @@ -811,9 +810,8 @@ bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, return true; } -bool is_dynptr_type_expected(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, - enum bpf_arg_type arg_type) +static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + enum bpf_arg_type arg_type) { struct bpf_func_state *state = func(env, reg); enum bpf_dynptr_type dynptr_type; @@ -5854,6 +5852,65 @@ 
static int process_kptr_func(struct bpf_verifier_env *env, int regno, return 0; } +int process_dynptr_func(struct bpf_verifier_env *env, int regno, + enum bpf_arg_type arg_type, + struct bpf_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + + /* We only need to check for initialized / uninitialized helper + * dynptr args if the dynptr is not PTR_TO_DYNPTR, as the + * assumption is that if it is, that a helper function + * initialized the dynptr on behalf of the BPF program. + */ + if (base_type(reg->type) == PTR_TO_DYNPTR) + return 0; + if (arg_type & MEM_UNINIT) { + if (!is_dynptr_reg_valid_uninit(env, reg)) { + verbose(env, "Dynptr has to be an uninitialized dynptr\n"); + return -EINVAL; + } + + /* We only support one dynptr being uninitialized at the moment, + * which is sufficient for the helper functions we have right now. + */ + if (meta->uninit_dynptr_regno) { + verbose(env, "verifier internal error: multiple uninitialized dynptr args\n"); + return -EFAULT; + } + + meta->uninit_dynptr_regno = regno; + } else { + if (!is_dynptr_reg_valid_init(env, reg)) { + verbose(env, + "Expected an initialized dynptr as arg #%d\n", + regno); + return -EINVAL; + } + + if (!is_dynptr_type_expected(env, reg, arg_type)) { + const char *err_extra = ""; + + switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { + case DYNPTR_TYPE_LOCAL: + err_extra = "local"; + break; + case DYNPTR_TYPE_RINGBUF: + err_extra = "ringbuf"; + break; + default: + err_extra = ""; + break; + } + verbose(env, + "Expected a dynptr of type %s as arg #%d\n", + err_extra, regno); + return -EINVAL; + } + } + return 0; +} + static bool arg_type_is_mem_size(enum bpf_arg_type type) { return type == ARG_CONST_SIZE || @@ -6367,52 +6424,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, err = check_mem_size_reg(env, reg, regno, true, meta); break; case ARG_PTR_TO_DYNPTR: - /* We only need to check for initialized / uninitialized helper - * dynptr args if the dynptr is not PTR_TO_DYNPTR, as the - * assumption is that if it is, that a helper function - * initialized the dynptr on behalf of the BPF program. - */ - if (base_type(reg->type) == PTR_TO_DYNPTR) - break; - if (arg_type & MEM_UNINIT) { - if (!is_dynptr_reg_valid_uninit(env, reg)) { - verbose(env, "Dynptr has to be an uninitialized dynptr\n"); - return -EINVAL; - } - - /* We only support one dynptr being uninitialized at the moment, - * which is sufficient for the helper functions we have right now. 
-			 */
-			if (meta->uninit_dynptr_regno) {
-				verbose(env, "verifier internal error: multiple uninitialized dynptr args\n");
-				return -EFAULT;
-			}
-
-			meta->uninit_dynptr_regno = regno;
-		} else if (!is_dynptr_reg_valid_init(env, reg)) {
-			verbose(env,
-				"Expected an initialized dynptr as arg #%d\n",
-				arg + 1);
-			return -EINVAL;
-		} else if (!is_dynptr_type_expected(env, reg, arg_type)) {
-			const char *err_extra = "";
-
-			switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
-			case DYNPTR_TYPE_LOCAL:
-				err_extra = "local";
-				break;
-			case DYNPTR_TYPE_RINGBUF:
-				err_extra = "ringbuf";
-				break;
-			default:
-				err_extra = "";
-				break;
-			}
-			verbose(env,
-				"Expected a dynptr of type %s as arg #%d\n",
-				err_extra, arg + 1);
-			return -EINVAL;
-		}
+		if (process_dynptr_func(env, regno, arg_type, meta))
+			return -EACCES;
 		break;
 	case ARG_CONST_ALLOC_SIZE_OR_ZERO:
 		if (!tnum_is_const(reg->var_off)) {
diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c b/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c
index c210657d4d0aa..6c4f0b64bb4ff 100644
--- a/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c
+++ b/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c
@@ -18,11 +18,8 @@ static struct {
 	const char *expected_verifier_err_msg;
 	int expected_runtime_err;
 } kfunc_dynptr_tests[] = {
-	{"dynptr_type_not_supp",
-	 "arg#0 pointer type STRUCT bpf_dynptr_kern points to unsupported dynamic pointer type", 0},
-	{"not_valid_dynptr",
-	 "arg#0 pointer type STRUCT bpf_dynptr_kern must be valid and initialized", 0},
-	{"not_ptr_to_stack", "arg#0 pointer type STRUCT bpf_dynptr_kern not to stack", 0},
+	{"not_valid_dynptr", "Expected an initialized dynptr as arg #1", 0},
+	{"not_ptr_to_stack", "arg#0 pointer type STRUCT bpf_dynptr_kern not to stack or dynptr_ptr", 0},
 	{"dynptr_data_null", NULL, -EBADMSG},
 };

diff --git a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c
index ce39d096bba34..f4a8250329b2d 100644
--- a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c
+++ b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c
@@ -32,18 +32,6 @@ int err, pid;

 char _license[] SEC("license") = "GPL";

-SEC("?lsm.s/bpf")
-int BPF_PROG(dynptr_type_not_supp, int cmd, union bpf_attr *attr,
-	     unsigned int size)
-{
-	char write_data[64] = "hello there, world!!";
-	struct bpf_dynptr ptr;
-
-	bpf_ringbuf_reserve_dynptr(&ringbuf, sizeof(write_data), 0, &ptr);
-
-	return bpf_verify_pkcs7_signature(&ptr, &ptr, NULL);
-}
-
 SEC("?lsm.s/bpf")
 int BPF_PROG(not_valid_dynptr, int cmd, union bpf_attr *attr, unsigned int size)
 {

From 42819c781290b8989d2bcafb7f9d211bbc9fd6d2 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi
Date: Thu, 8 Dec 2022 02:11:36 +0530
Subject: [PATCH 128/175] bpf: Propagate errors from process_* checks in check_func_arg

Currently, we simply ignore the errors in process_spin_lock,
process_timer_func, process_kptr_func, and process_dynptr_func. Instead,
bubble up the error by storing and checking the err variable.
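To make the intended pattern concrete, here is a simplified sketch distilled
from the diff below, using process_timer_func as the representative callee:

	/* Before: every failure is masked as -EACCES, losing the
	 * distinction between e.g. -EINVAL and -EFAULT.
	 */
	if (process_timer_func(env, regno, meta))
		return -EACCES;

	/* After: the callee's error code is preserved and bubbled up. */
	err = process_timer_func(env, regno, meta);
	if (err)
		return err;

The same substitution is applied to process_spin_lock, process_kptr_func,
and process_dynptr_func.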
Acked-by: Joanne Koong Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221207204141.308952-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index aae653c04f773..60e4125666732 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6389,19 +6389,22 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, break; case ARG_PTR_TO_SPIN_LOCK: if (meta->func_id == BPF_FUNC_spin_lock) { - if (process_spin_lock(env, regno, true)) - return -EACCES; + err = process_spin_lock(env, regno, true); + if (err) + return err; } else if (meta->func_id == BPF_FUNC_spin_unlock) { - if (process_spin_lock(env, regno, false)) - return -EACCES; + err = process_spin_lock(env, regno, false); + if (err) + return err; } else { verbose(env, "verifier internal error\n"); return -EFAULT; } break; case ARG_PTR_TO_TIMER: - if (process_timer_func(env, regno, meta)) - return -EACCES; + err = process_timer_func(env, regno, meta); + if (err) + return err; break; case ARG_PTR_TO_FUNC: meta->subprogno = reg->subprogno; @@ -6424,8 +6427,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, err = check_mem_size_reg(env, reg, regno, true, meta); break; case ARG_PTR_TO_DYNPTR: - if (process_dynptr_func(env, regno, arg_type, meta)) - return -EACCES; + err = process_dynptr_func(env, regno, arg_type, meta); + if (err) + return err; break; case ARG_CONST_ALLOC_SIZE_OR_ZERO: if (!tnum_is_const(reg->var_off)) { @@ -6492,8 +6496,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, break; } case ARG_PTR_TO_KPTR: - if (process_kptr_func(env, regno, meta)) - return -EACCES; + err = process_kptr_func(env, regno, meta); + if (err) + return err; break; } From 5800d655bf47c653b87410978d55827105f6865e Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 8 Dec 2022 02:11:37 +0530 Subject: [PATCH 129/175] bpf: Rework process_dynptr_func Recently, user ringbuf support introduced a PTR_TO_DYNPTR register type for use in callback state, because in case of user ringbuf helpers, there is no dynptr on the stack that is passed into the callback. To reflect such a state, a special register type was created. However, some checks have been bypassed incorrectly during the addition of this feature. First, for arg_type with MEM_UNINIT flag which initialize a dynptr, they must be rejected for such register type. Secondly, in the future, there are plans to add dynptr helpers that operate on the dynptr itself and may change its offset and other properties. In all of these cases, PTR_TO_DYNPTR shouldn't be allowed to be passed to such helpers, however the current code simply returns 0. The rejection for helpers that release the dynptr is already handled. For fixing this, we take a step back and rework existing code in a way that will allow fitting in all classes of helpers and have a coherent model for dealing with the variety of use cases in which dynptr is used. First, for ARG_PTR_TO_DYNPTR, it can either be set alone or together with a DYNPTR_TYPE_* constant that denotes the only type it accepts. Next, helpers which initialize a dynptr use MEM_UNINIT to indicate this fact. To make the distinction clear, use MEM_RDONLY flag to indicate that the helper only operates on the memory pointed to by the dynptr, not the dynptr itself. 
In C parlance, it would be equivalent to taking the dynptr as a point to const argument. When either of these flags are not present, the helper is allowed to mutate both the dynptr itself and also the memory it points to. Currently, the read only status of the memory is not tracked in the dynptr, but it would be trivial to add this support inside dynptr state of the register. With these changes and renaming PTR_TO_DYNPTR to CONST_PTR_TO_DYNPTR to better reflect its usage, it can no longer be passed to helpers that initialize a dynptr, i.e. bpf_dynptr_from_mem, bpf_ringbuf_reserve_dynptr. A note to reviewers is that in code that does mark_stack_slots_dynptr, and unmark_stack_slots_dynptr, we implicitly rely on the fact that PTR_TO_STACK reg is the only case that can reach that code path, as one cannot pass CONST_PTR_TO_DYNPTR to helpers that don't set MEM_RDONLY. In both cases such helpers won't be setting that flag. The next patch will add a couple of selftest cases to make sure this doesn't break. Fixes: 205715673844 ("bpf: Add bpf_user_ringbuf_drain() helper") Acked-by: Joanne Koong Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221207204141.308952-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov [v6.1: move changes in check_kfunc_args() in verifier.c to btf_check_func_arg_match() in btf.c] Signed-off-by: Shaoying Xu --- include/linux/bpf.h | 4 +- include/uapi/linux/bpf.h | 8 +- kernel/bpf/btf.c | 4 +- kernel/bpf/helpers.c | 18 +- kernel/bpf/verifier.c | 223 +++++++++++++----- scripts/bpf_doc.py | 1 + tools/include/uapi/linux/bpf.h | 8 +- .../selftests/bpf/prog_tests/user_ringbuf.c | 4 +- 8 files changed, 191 insertions(+), 79 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1ca1902af23e9..5c10057a99d9a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -674,7 +674,7 @@ enum bpf_reg_type { PTR_TO_MEM, /* reg points to valid memory region */ PTR_TO_BUF, /* reg points to a read/write buffer */ PTR_TO_FUNC, /* reg points to a bpf program function */ - PTR_TO_DYNPTR, /* reg points to a dynptr */ + CONST_PTR_TO_DYNPTR, /* reg points to a const struct bpf_dynptr */ __BPF_REG_TYPE_MAX, /* Extended reg_types. */ @@ -2780,7 +2780,7 @@ void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size); void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); int bpf_dynptr_check_size(u32 size); -u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr); +u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr); #ifdef CONFIG_BPF_LSM void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a17688011440e..eac87609ca170 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5301,7 +5301,7 @@ union bpf_attr { * Return * Nothing. Always succeeds. * - * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset, u64 flags) + * long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags) * Description * Read *len* bytes from *src* into *dst*, starting from *offset* * into *src*. @@ -5311,7 +5311,7 @@ union bpf_attr { * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if * *flags* is not 0. 
* - * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) + * long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) * Description * Write *len* bytes from *src* into *dst*, starting from *offset* * into *dst*. @@ -5321,7 +5321,7 @@ union bpf_attr { * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* * is a read-only dynptr or if *flags* is not 0. * - * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len) + * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len) * Description * Get a pointer to the underlying dynptr data. * @@ -5422,7 +5422,7 @@ union bpf_attr { * Drain samples from the specified user ring buffer, and invoke * the provided callback for each such sample: * - * long (\*callback_fn)(struct bpf_dynptr \*dynptr, void \*ctx); + * long (\*callback_fn)(const struct bpf_dynptr \*dynptr, void \*ctx); * * If **callback_fn** returns 0, the helper will continue to try * and drain the next sample, up to a maximum of diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2875ac2c694ee..2571c37360acf 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6492,14 +6492,14 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, if (arg_dynptr) { if (reg->type != PTR_TO_STACK && - reg->type != PTR_TO_DYNPTR) { + reg->type != CONST_PTR_TO_DYNPTR) { bpf_log(log, "arg#%d pointer type %s %s not to stack or dynptr_ptr\n", i, btf_type_str(ref_t), ref_tname); return -EINVAL; } - ret = process_dynptr_func(env, regno, ARG_PTR_TO_DYNPTR, NULL); + ret = process_dynptr_func(env, regno, ARG_PTR_TO_DYNPTR | MEM_RDONLY, NULL); if (ret < 0) return ret; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 758510b46d87b..bee33e1b555a6 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1422,7 +1422,7 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = { #define DYNPTR_SIZE_MASK 0xFFFFFF #define DYNPTR_RDONLY_BIT BIT(31) -static bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr) +static bool bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_RDONLY_BIT; } @@ -1432,7 +1432,7 @@ static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_typ ptr->size |= type << DYNPTR_TYPE_SHIFT; } -u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr) +u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_SIZE_MASK; } @@ -1456,7 +1456,7 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) memset(ptr, 0, sizeof(*ptr)); } -static int bpf_dynptr_check_off_len(struct bpf_dynptr_kern *ptr, u32 offset, u32 len) +static int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len) { u32 size = bpf_dynptr_get_size(ptr); @@ -1501,7 +1501,7 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT, }; -BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src, +BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, u32, offset, u64, flags) { int err; @@ -1524,12 +1524,12 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, - .arg3_type = ARG_PTR_TO_DYNPTR, + .arg3_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; -BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, 
u32, offset, void *, src, +BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src, u32, len, u64, flags) { int err; @@ -1550,14 +1550,14 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = { .func = bpf_dynptr_write, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_DYNPTR, + .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, .arg5_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) +BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) { int err; @@ -1578,7 +1578,7 @@ static const struct bpf_func_proto bpf_dynptr_data_proto = { .func = bpf_dynptr_data, .gpl_only = false, .ret_type = RET_PTR_TO_DYNPTR_MEM_OR_NULL, - .arg1_type = ARG_PTR_TO_DYNPTR, + .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 60e4125666732..070bfa823e203 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -572,7 +572,7 @@ static const char *reg_type_str(struct bpf_verifier_env *env, [PTR_TO_BUF] = "buf", [PTR_TO_FUNC] = "func", [PTR_TO_MAP_KEY] = "map_key", - [PTR_TO_DYNPTR] = "dynptr_ptr", + [CONST_PTR_TO_DYNPTR] = "dynptr_ptr", }; if (type & PTR_MAYBE_NULL) { @@ -706,6 +706,28 @@ static bool dynptr_type_refcounted(enum bpf_dynptr_type type) return type == BPF_DYNPTR_TYPE_RINGBUF; } +static void __mark_dynptr_reg(struct bpf_reg_state *reg, + enum bpf_dynptr_type type, + bool first_slot); + +static void __mark_reg_not_init(const struct bpf_verifier_env *env, + struct bpf_reg_state *reg); + +static void mark_dynptr_stack_regs(struct bpf_reg_state *sreg1, + struct bpf_reg_state *sreg2, + enum bpf_dynptr_type type) +{ + __mark_dynptr_reg(sreg1, type, true); + __mark_dynptr_reg(sreg2, type, false); +} + +static void mark_dynptr_cb_reg(struct bpf_reg_state *reg, + enum bpf_dynptr_type type) +{ + __mark_dynptr_reg(reg, type, true); +} + + static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg, enum bpf_arg_type arg_type, int insn_idx) { @@ -727,9 +749,8 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ if (type == BPF_DYNPTR_TYPE_INVALID) return -EINVAL; - state->stack[spi].spilled_ptr.dynptr.first_slot = true; - state->stack[spi].spilled_ptr.dynptr.type = type; - state->stack[spi - 1].spilled_ptr.dynptr.type = type; + mark_dynptr_stack_regs(&state->stack[spi].spilled_ptr, + &state->stack[spi - 1].spilled_ptr, type); if (dynptr_type_refcounted(type)) { /* The id is used to track proper releasing */ @@ -737,8 +758,8 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ if (id < 0) return id; - state->stack[spi].spilled_ptr.id = id; - state->stack[spi - 1].spilled_ptr.id = id; + state->stack[spi].spilled_ptr.ref_obj_id = id; + state->stack[spi - 1].spilled_ptr.ref_obj_id = id; } return 0; @@ -760,25 +781,23 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re } /* Invalidate any slices associated with this dynptr */ - if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) { - release_reference(env, state->stack[spi].spilled_ptr.id); - state->stack[spi].spilled_ptr.id = 0; - state->stack[spi - 1].spilled_ptr.id = 0; - } - - state->stack[spi].spilled_ptr.dynptr.first_slot = false; - 
state->stack[spi].spilled_ptr.dynptr.type = 0; - state->stack[spi - 1].spilled_ptr.dynptr.type = 0; + if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) + WARN_ON_ONCE(release_reference(env, state->stack[spi].spilled_ptr.ref_obj_id)); + __mark_reg_not_init(env, &state->stack[spi].spilled_ptr); + __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); return 0; } static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); - int spi = get_spi(reg->off); - int i; + int spi, i; + + if (reg->type == CONST_PTR_TO_DYNPTR) + return false; + spi = get_spi(reg->off); if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) return true; @@ -794,9 +813,14 @@ static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_ static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); - int spi = get_spi(reg->off); + int spi; int i; + /* This already represents first slot of initialized bpf_dynptr */ + if (reg->type == CONST_PTR_TO_DYNPTR) + return true; + + spi = get_spi(reg->off); if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || !state->stack[spi].spilled_ptr.dynptr.first_slot) return false; @@ -815,15 +839,19 @@ static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg { struct bpf_func_state *state = func(env, reg); enum bpf_dynptr_type dynptr_type; - int spi = get_spi(reg->off); + int spi; /* ARG_PTR_TO_DYNPTR takes any type of dynptr */ if (arg_type == ARG_PTR_TO_DYNPTR) return true; dynptr_type = arg_to_dynptr_type(arg_type); - - return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type; + if (reg->type == CONST_PTR_TO_DYNPTR) { + return reg->dynptr.type == dynptr_type; + } else { + spi = get_spi(reg->off); + return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type; + } } /* The reg state of a pointer or a bounded scalar was saved when @@ -1336,9 +1364,6 @@ static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; -static void __mark_reg_not_init(const struct bpf_verifier_env *env, - struct bpf_reg_state *reg); - /* This helper doesn't clear reg->id */ static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm) { @@ -1401,6 +1426,19 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env, __mark_reg_known_zero(regs + regno); } +static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type, + bool first_slot) +{ + /* reg->type has no meaning for STACK_DYNPTR, but when we set reg for + * callback arguments, it does need to be CONST_PTR_TO_DYNPTR, so simply + * set it unconditionally as it is ignored for STACK_DYNPTR anyway. + */ + __mark_reg_known_zero(reg); + reg->type = CONST_PTR_TO_DYNPTR; + reg->dynptr.type = type; + reg->dynptr.first_slot = first_slot; +} + static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) { if (base_type(reg->type) == PTR_TO_MAP_VALUE) { @@ -5852,19 +5890,58 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, return 0; } +/* There are two register types representing a bpf_dynptr, one is PTR_TO_STACK + * which points to a stack slot, and the other is CONST_PTR_TO_DYNPTR. + * + * In both cases we deal with the first 8 bytes, but need to mark the next 8 + * bytes as STACK_DYNPTR in case of PTR_TO_STACK. In case of + * CONST_PTR_TO_DYNPTR, we are guaranteed to get the beginning of the object. 
+ * + * Mutability of bpf_dynptr is at two levels, one is at the level of struct + * bpf_dynptr itself, i.e. whether the helper is receiving a pointer to struct + * bpf_dynptr or pointer to const struct bpf_dynptr. In the former case, it can + * mutate the view of the dynptr and also possibly destroy it. In the latter + * case, it cannot mutate the bpf_dynptr itself but it can still mutate the + * memory that dynptr points to. + * + * The verifier will keep track both levels of mutation (bpf_dynptr's in + * reg->type and the memory's in reg->dynptr.type), but there is no support for + * readonly dynptr view yet, hence only the first case is tracked and checked. + * + * This is consistent with how C applies the const modifier to a struct object, + * where the pointer itself inside bpf_dynptr becomes const but not what it + * points to. + * + * Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument + * type, and declare it as 'const struct bpf_dynptr *' in their prototype. + */ int process_dynptr_func(struct bpf_verifier_env *env, int regno, - enum bpf_arg_type arg_type, - struct bpf_call_arg_meta *meta) + enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; - /* We only need to check for initialized / uninitialized helper - * dynptr args if the dynptr is not PTR_TO_DYNPTR, as the - * assumption is that if it is, that a helper function - * initialized the dynptr on behalf of the BPF program. + /* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an + * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*): + */ + if ((arg_type & (MEM_UNINIT | MEM_RDONLY)) == (MEM_UNINIT | MEM_RDONLY)) { + verbose(env, "verifier internal error: misconfigured dynptr helper type flags\n"); + return -EFAULT; + } + /* MEM_UNINIT - Points to memory that is an appropriate candidate for + * constructing a mutable bpf_dynptr object. + * + * Currently, this is only possible with PTR_TO_STACK + * pointing to a region of at least 16 bytes which doesn't + * contain an existing bpf_dynptr. + * + * MEM_RDONLY - Points to a initialized bpf_dynptr that will not be + * mutated or destroyed. However, the memory it points to + * may be mutated. + * + * None - Points to a initialized dynptr that can be mutated and + * destroyed, including mutation of the memory it points + * to. 
*/ - if (base_type(reg->type) == PTR_TO_DYNPTR) - return 0; if (arg_type & MEM_UNINIT) { if (!is_dynptr_reg_valid_uninit(env, reg)) { verbose(env, "Dynptr has to be an uninitialized dynptr\n"); @@ -5880,7 +5957,13 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno, } meta->uninit_dynptr_regno = regno; - } else { + } else /* MEM_RDONLY and None case from above */ { + /* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */ + if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) { + verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n"); + return -EINVAL; + } + if (!is_dynptr_reg_valid_init(env, reg)) { verbose(env, "Expected an initialized dynptr as arg #%d\n", @@ -5888,7 +5971,8 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno, return -EINVAL; } - if (!is_dynptr_type_expected(env, reg, arg_type)) { + /* Fold modifiers (in this case, MEM_RDONLY) when checking expected type */ + if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) { const char *err_extra = ""; switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { @@ -6043,7 +6127,7 @@ static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } } static const struct bpf_reg_types dynptr_types = { .types = { PTR_TO_STACK, - PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL, + CONST_PTR_TO_DYNPTR, } }; @@ -6219,12 +6303,16 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, return __check_ptr_off_reg(env, reg, regno, fixed_off_ok); } -static u32 stack_slot_get_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +static u32 dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); - int spi = get_spi(reg->off); + int spi; + + if (reg->type == CONST_PTR_TO_DYNPTR) + return reg->ref_obj_id; - return state->stack[spi].spilled_ptr.id; + spi = get_spi(reg->off); + return state->stack[spi].spilled_ptr.ref_obj_id; } static int check_func_arg(struct bpf_verifier_env *env, u32 arg, @@ -6288,11 +6376,22 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, if (arg_type_is_release(arg_type)) { if (arg_type_is_dynptr(arg_type)) { struct bpf_func_state *state = func(env, reg); - int spi = get_spi(reg->off); + int spi; - if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || - !state->stack[spi].spilled_ptr.id) { - verbose(env, "arg %d is an unacquired reference\n", regno); + /* Only dynptr created on stack can be released, thus + * the get_spi and stack state checks for spilled_ptr + * should only be done before process_dynptr_func for + * PTR_TO_STACK. 
+ */ + if (reg->type == PTR_TO_STACK) { + spi = get_spi(reg->off); + if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || + !state->stack[spi].spilled_ptr.ref_obj_id) { + verbose(env, "arg %d is an unacquired reference\n", regno); + return -EINVAL; + } + } else { + verbose(env, "cannot release unowned const bpf_dynptr\n"); return -EINVAL; } } else if (!reg->ref_obj_id && !register_is_null(reg)) { @@ -7259,11 +7358,10 @@ static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env, { /* bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void * callback_ctx, u64 flags); - * callback_fn(struct bpf_dynptr_t* dynptr, void *callback_ctx); + * callback_fn(const struct bpf_dynptr_t* dynptr, void *callback_ctx); */ __mark_reg_not_init(env, &callee->regs[BPF_REG_0]); - callee->regs[BPF_REG_1].type = PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL; - __mark_reg_known_zero(&callee->regs[BPF_REG_1]); + mark_dynptr_cb_reg(&callee->regs[BPF_REG_1], BPF_DYNPTR_TYPE_LOCAL); callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3]; /* unused */ @@ -7649,7 +7747,15 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs = cur_regs(env); + /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot + * be reinitialized by any dynptr helper. Hence, mark_stack_slots_dynptr + * is safe to do directly. + */ if (meta.uninit_dynptr_regno) { + if (regs[meta.uninit_dynptr_regno].type == CONST_PTR_TO_DYNPTR) { + verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be initialized\n"); + return -EFAULT; + } /* we write BPF_DW bits (8 bytes) at a time */ for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) { err = check_mem_access(env, insn_idx, meta.uninit_dynptr_regno, @@ -7667,15 +7773,24 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (meta.release_regno) { err = -EINVAL; - if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) + /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot + * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr + * is safe to do directly. + */ + if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) { + if (regs[meta.release_regno].type == CONST_PTR_TO_DYNPTR) { + verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be released\n"); + return -EFAULT; + } err = unmark_stack_slots_dynptr(env, ®s[meta.release_regno]); - else if (meta.ref_obj_id) + } else if (meta.ref_obj_id) { err = release_reference(env, meta.ref_obj_id); - /* meta.ref_obj_id can only be 0 if register that is meant to be - * released is NULL, which must be > R0. - */ - else if (register_is_null(®s[meta.release_regno])) + } else if (register_is_null(®s[meta.release_regno])) { + /* meta.ref_obj_id can only be 0 if register that is meant to be + * released is NULL, which must be > R0. 
+ */ err = 0; + } if (err) { verbose(env, "func %s#%d reference has not been acquired before\n", func_id_name(func_id), func_id); @@ -7749,11 +7864,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EFAULT; } - if (base_type(reg->type) != PTR_TO_DYNPTR) - /* Find the id of the dynptr we're - * tracking the reference of - */ - meta.ref_obj_id = stack_slot_get_id(env, reg); + meta.ref_obj_id = dynptr_ref_obj_id(env, reg); break; } } diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py index c2da6ed32104f..d8a7abbe787ee 100755 --- a/scripts/bpf_doc.py +++ b/scripts/bpf_doc.py @@ -732,6 +732,7 @@ def __init__(self, parser): 'struct bpf_timer', 'struct mptcp_sock', 'struct bpf_dynptr', + 'const struct bpf_dynptr', 'struct iphdr', 'struct ipv6hdr', } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a17688011440e..eac87609ca170 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5301,7 +5301,7 @@ union bpf_attr { * Return * Nothing. Always succeeds. * - * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset, u64 flags) + * long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags) * Description * Read *len* bytes from *src* into *dst*, starting from *offset* * into *src*. @@ -5311,7 +5311,7 @@ union bpf_attr { * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if * *flags* is not 0. * - * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) + * long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) * Description * Write *len* bytes from *src* into *dst*, starting from *offset* * into *dst*. @@ -5321,7 +5321,7 @@ union bpf_attr { * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* * is a read-only dynptr or if *flags* is not 0. * - * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len) + * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len) * Description * Get a pointer to the underlying dynptr data. 
* @@ -5422,7 +5422,7 @@ union bpf_attr { * Drain samples from the specified user ring buffer, and invoke * the provided callback for each such sample: * - * long (\*callback_fn)(struct bpf_dynptr \*dynptr, void \*ctx); + * long (\*callback_fn)(const struct bpf_dynptr \*dynptr, void \*ctx); * * If **callback_fn** returns 0, the helper will continue to try * and drain the next sample, up to a maximum of diff --git a/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c b/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c index 02b18d018b36a..aefa0a474e582 100644 --- a/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c @@ -673,8 +673,8 @@ static struct { {"user_ringbuf_callback_write_forbidden", "invalid mem access 'dynptr_ptr'"}, {"user_ringbuf_callback_null_context_write", "invalid mem access 'scalar'"}, {"user_ringbuf_callback_null_context_read", "invalid mem access 'scalar'"}, - {"user_ringbuf_callback_discard_dynptr", "arg 1 is an unacquired reference"}, - {"user_ringbuf_callback_submit_dynptr", "arg 1 is an unacquired reference"}, + {"user_ringbuf_callback_discard_dynptr", "cannot release unowned const bpf_dynptr"}, + {"user_ringbuf_callback_submit_dynptr", "cannot release unowned const bpf_dynptr"}, {"user_ringbuf_callback_invalid_return", "At callback return the register R0 has value"}, }; From c591e00334743f2b04386aae53ba52c8c805b88a Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 8 Dec 2022 02:11:38 +0530 Subject: [PATCH 130/175] bpf: Rework check_func_arg_reg_off While check_func_arg_reg_off is the place which performs generic checks needed by various candidates of reg->type, there is some handling for special cases, like ARG_PTR_TO_DYNPTR, OBJ_RELEASE, and ARG_PTR_TO_RINGBUF_MEM. This commit aims to streamline these special cases and instead leave other things up to argument type specific code to handle. The function will be restrictive by default, and cover all possible cases when OBJ_RELEASE is set, without having to update the function again (and missing to do that being a bug). This is done primarily for two reasons: associating back reg->type to its argument leaves room for the list getting out of sync when a new reg->type is supported by an arg_type. The other case is ARG_PTR_TO_RINGBUF_MEM. The problem there is something we already handle, whenever a release argument is expected, it should be passed as the pointer that was received from the acquire function. Hence zero fixed and variable offset. There is nothing special about ARG_PTR_TO_RINGBUF_MEM, where technically its target register type PTR_TO_MEM | MEM_RINGBUF can already be passed with non-zero offset to other helper functions, which makes sense. Hence, lift the arg_type_is_release check for reg->off and cover all possible register types, instead of duplicating the same kind of check twice for current OBJ_RELEASE arg_types (alloc_mem and ptr_to_btf_id). For the release argument, arg_type_is_dynptr is the special case, where we go to actual object being freed through the dynptr, so the offset of the pointer still needs to allow fixed and variable offset and process_dynptr_func will verify them later for the release argument case as well. This is not specific to ARG_PTR_TO_DYNPTR though, we will need to make this exception for any future object on the stack that needs to be released. 
In this sense, PTR_TO_STACK as a candidate for object on stack argument is a special case for release offset checks, and they need to be done by the helper releasing the object on stack. Since the check has been lifted above all register type checks, remove the duplicated check that is being done for PTR_TO_BTF_ID. Acked-by: Joanne Koong Acked-by: David Vernet Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221207204141.308952-5-memxor@gmail.com Signed-off-by: Alexei Starovoitov [v6.1: resolved contextual conflict between ARG_PTR_TO_ALLOC_MEM and ARG_PTR_TO_RINGBUF_MEM] Signed-off-by: Shaoying Xu --- kernel/bpf/verifier.c | 63 +++++++++++-------- .../testing/selftests/bpf/verifier/ringbuf.c | 2 +- 2 files changed, 39 insertions(+), 26 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 070bfa823e203..ea2155dd695c4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6251,11 +6251,37 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno, enum bpf_arg_type arg_type) { - enum bpf_reg_type type = reg->type; - bool fixed_off_ok = false; + u32 type = reg->type; + + /* When referenced register is passed to release function, its fixed + * offset must be 0. + * + * We will check arg_type_is_release reg has ref_obj_id when storing + * meta->release_regno. + */ + if (arg_type_is_release(arg_type)) { + /* ARG_PTR_TO_DYNPTR with OBJ_RELEASE is a bit special, as it + * may not directly point to the object being released, but to + * dynptr pointing to such object, which might be at some offset + * on the stack. In that case, we simply to fallback to the + * default handling. + */ + if (arg_type_is_dynptr(arg_type) && type == PTR_TO_STACK) + return 0; + /* Doing check_ptr_off_reg check for the offset will catch this + * because fixed_off_ok is false, but checking here allows us + * to give the user a better error message. + */ + if (reg->off) { + verbose(env, "R%d must have zero offset when passed to release func or trusted arg to kfunc\n", + regno); + return -EINVAL; + } + return __check_ptr_off_reg(env, reg, regno, false); + } - switch ((u32)type) { - /* Pointer types where reg offset is explicitly allowed: */ + switch (type) { + /* Pointer types where both fixed and variable offset is explicitly allowed: */ case PTR_TO_STACK: if (arg_type_is_dynptr(arg_type) && reg->off % BPF_REG_SIZE) { verbose(env, "cannot pass in dynptr at an offset\n"); @@ -6272,35 +6298,22 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, case PTR_TO_BUF: case PTR_TO_BUF | MEM_RDONLY: case SCALAR_VALUE: - /* Some of the argument types nevertheless require a - * zero register offset. - */ - if (base_type(arg_type) != ARG_PTR_TO_ALLOC_MEM) - return 0; - break; + return 0; /* All the rest must be rejected, except PTR_TO_BTF_ID which allows * fixed offset. */ case PTR_TO_BTF_ID: /* When referenced PTR_TO_BTF_ID is passed to release function, - * it's fixed offset must be 0. In the other cases, fixed offset - * can be non-zero. + * its fixed offset must be 0. In the other cases, fixed offset + * can be non-zero. This was already checked above. So pass + * fixed_off_ok as true to allow fixed offset for all other + * cases. var_off always must be 0 for PTR_TO_BTF_ID, hence we + * still need to do checks instead of returning. 
*/ - if (arg_type_is_release(arg_type) && reg->off) { - verbose(env, "R%d must have zero offset when passed to release func\n", - regno); - return -EINVAL; - } - /* For arg is release pointer, fixed_off_ok must be false, but - * we already checked and rejected reg->off != 0 above, so set - * to true to allow fixed offset for all other cases. - */ - fixed_off_ok = true; - break; + return __check_ptr_off_reg(env, reg, regno, true); default: - break; + return __check_ptr_off_reg(env, reg, regno, false); } - return __check_ptr_off_reg(env, reg, regno, fixed_off_ok); } static u32 dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) diff --git a/tools/testing/selftests/bpf/verifier/ringbuf.c b/tools/testing/selftests/bpf/verifier/ringbuf.c index b64d33e4833c8..92e3f6a61a798 100644 --- a/tools/testing/selftests/bpf/verifier/ringbuf.c +++ b/tools/testing/selftests/bpf/verifier/ringbuf.c @@ -28,7 +28,7 @@ }, .fixup_map_ringbuf = { 1 }, .result = REJECT, - .errstr = "dereference of modified alloc_mem ptr R1", + .errstr = "R1 must have zero offset when passed to release func", }, { "ringbuf: invalid reservation offset 2", From 5a6355ba920414f2b8f663dd9872f26ba0e5fc23 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 8 Dec 2022 02:11:39 +0530 Subject: [PATCH 131/175] bpf: Move PTR_TO_STACK alignment check to process_dynptr_func After previous commit, we are minimizing helper specific assumptions from check_func_arg_reg_off, making it generic, and offloading checks for a specific argument type to their respective functions called after check_func_arg_reg_off has been called. This allows relying on a consistent set of guarantees after that call and then relying on them in code that deals with registers for each argument type later. This is in line with how process_spin_lock, process_timer_func, process_kptr_func check reg->var_off to be constant. The same reasoning is used here to move the alignment check into process_dynptr_func. Note that it also needs to check for constant var_off, and accumulate the constant var_off when computing the spi in get_spi, but that fix will come in later changes. Acked-by: Joanne Koong Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221207204141.308952-6-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ea2155dd695c4..77f455ce7aec0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5927,6 +5927,14 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno, verbose(env, "verifier internal error: misconfigured dynptr helper type flags\n"); return -EFAULT; } + /* CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to + * check_func_arg_reg_off's logic. We only need to check offset + * alignment for PTR_TO_STACK. + */ + if (reg->type == PTR_TO_STACK && (reg->off % BPF_REG_SIZE)) { + verbose(env, "cannot pass in dynptr at an offset=%d\n", reg->off); + return -EINVAL; + } /* MEM_UNINIT - Points to memory that is an appropriate candidate for * constructing a mutable bpf_dynptr object. 
* @@ -6283,11 +6291,6 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, switch (type) { /* Pointer types where both fixed and variable offset is explicitly allowed: */ case PTR_TO_STACK: - if (arg_type_is_dynptr(arg_type) && reg->off % BPF_REG_SIZE) { - verbose(env, "cannot pass in dynptr at an offset\n"); - return -EINVAL; - } - fallthrough; case PTR_TO_PACKET: case PTR_TO_PACKET_META: case PTR_TO_MAP_KEY: From 1c3c778c7e6b62bdf47f5c242e79b4693f291be5 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 8 Dec 2022 02:11:40 +0530 Subject: [PATCH 132/175] bpf: Use memmove for bpf_dynptr_{read,write} It may happen that destination buffer memory overlaps with memory dynptr points to. Hence, we must use memmove to correctly copy from dynptr to destination buffer, or source buffer to dynptr. This actually isn't a problem right now, as memcpy implementation falls back to memmove on detecting overlap and warns about it, but we shouldn't be relying on that. Acked-by: Joanne Koong Acked-by: David Vernet Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221207204141.308952-7-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index bee33e1b555a6..47a1bdf7ac595 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1513,7 +1513,11 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern if (err) return err; - memcpy(dst, src->data + src->offset + offset, len); + /* Source and destination may possibly overlap, hence use memmove to + * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr + * pointing to overlapping PTR_TO_MAP_VALUE regions. + */ + memmove(dst, src->data + src->offset + offset, len); return 0; } @@ -1541,7 +1545,11 @@ BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, v if (err) return err; - memcpy(dst->data + dst->offset + offset, src, len); + /* Source and destination may possibly overlap, hence use memmove to + * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr + * pointing to overlapping PTR_TO_MAP_VALUE regions. + */ + memmove(dst->data + dst->offset + offset, src, len); return 0; } From 43927e052a6f39d4a0c1ed5246ebca02bfeacd0c Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 21 Jan 2023 05:52:30 +0530 Subject: [PATCH 133/175] bpf: Fix state pruning for STACK_DYNPTR stack slots The root of the problem is missing liveness marking for STACK_DYNPTR slots. This leads to all kinds of problems inside stacksafe. The verifier by default inside stacksafe ignores spilled_ptr in stack slots which do not have REG_LIVE_READ marks. Since this is being checked in the 'old' explored state, it must have already done clean_live_states for this old bpf_func_state. Hence, it won't be receiving any more liveness marks from to be explored insns (it has received REG_LIVE_DONE marking from liveness point of view). What this means is that verifier considers that it's safe to not compare the stack slot if was never read by children states. While liveness marks are usually propagated correctly following the parentage chain for spilled registers (SCALAR_VALUE and PTR_* types), the same is not the case for STACK_DYNPTR. clean_live_states hence simply rewrites these stack slots to the type STACK_INVALID since it sees no REG_LIVE_READ marks. The end result is that we will never see STACK_DYNPTR slots in explored state. 
Even if the verifier were conservatively matching !REG_LIVE_READ slots, the
very next check, which continues the stacksafe loop on seeing STACK_INVALID,
would again prevent further checks.

Now, as long as the verifier stores an explored state which we can compare
to when reaching a pruning point, we can abuse this bug to make the verifier
prune the search for obviously unsafe paths using STACK_DYNPTR slots,
thinking they are never used and hence safe.

Doing this in unprivileged mode is a bit challenging. add_new_state is only
set when seeing BPF_F_TEST_STATE_FREQ (which requires privileges) or when
the jmps_processed difference is >= 2 and the insn_processed difference is
>= 8. So coming up with the unprivileged case requires a little more work,
but it is still entirely possible. The test case discussed below triggers
the heuristic even in unprivileged mode. However, it no longer works since
commit 8addbfc7b308 ("bpf: Gate dynptr API behind CAP_BPF").

Let's study the test step by step. Consider the following program (in
C-style BPF assembly):

 0  r0 = 0;
 1  r6 = &ringbuf_map;
 3  r1 = r6;
 4  r2 = 8;
 5  r3 = 0;
 6  r4 = r10;
 7  r4 -= -16;
 8  call bpf_ringbuf_reserve_dynptr;
 9  if r0 == 0 goto pc+1;
10  goto pc+1;
11  *(r10 - 16) = 0xeB9F;
12  r1 = r10;
13  r1 -= -16;
14  r2 = 0;
15  call bpf_ringbuf_discard_dynptr;
16  r0 = 0;
17  exit;

We know that insn 12 will be a pruning point, hence if we force
add_new_state for it, it will first verify the following path as safe in
straight-line exploration:

  0 1 3 4 5 6 7 8 9 -> 10 -> (12) 13 14 15 16 17

Then, when we arrive at insn 12 from the following path:

  0 1 3 4 5 6 7 8 9 -> 11 (12)

we will find a state that has already been verified as safe at insn 12.
Since the register state is the same at this point, regsafe will pass. Next,
in stacksafe, spi = 0 and spi = 1 (the location of our dynptr) are skipped
because they carry no REG_LIVE_READ marks. The rest matches, so stacksafe
returns true. Next, refsafe is also true, as the reference state is
unchanged in both states. The states are considered equivalent and the
search is pruned.

Hence, we are able to construct a dynptr with arbitrary contents and use the
dynptr API to operate on this arbitrary pointer and arbitrary size + offset.

To fix this, first define a mark_dynptr_read function that propagates
liveness marks whenever a valid initialized dynptr is accessed by dynptr
helpers. REG_LIVE_WRITTEN is marked whenever we initialize an uninitialized
dynptr; this is done in mark_stack_slots_dynptr. It allows screening off
mark_reg_read and not propagating marks upwards from that point. This
ensures that we either set REG_LIVE_READ64 on both dynptr slots, or on
neither, so that clean_live_states either sets both slots to STACK_INVALID
or leaves both alone. This is the invariant the checks inside stacksafe rely
on.

Next, do a complete comparison of both stack slots whenever they have
STACK_DYNPTR: compare the dynptr type stored in the spilled_ptr, and also
whether both form the same first_slot. Only then is the later path safe.
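A rough C rendering of the assembly above may help readability. This is a
hedged reconstruction, not code from this patch; the raw store through &ptr
stands in for insn 11 and is exactly the kind of access the pruning bug lets
through unchecked:

	struct bpf_dynptr ptr;	/* occupies fp-16..fp-1 */

	/* insns 1-8 */
	if (bpf_ringbuf_reserve_dynptr(&ringbuf_map, 8, 0, &ptr) == 0)	/* insn 9 */
		*(unsigned long *)&ptr = 0xeB9F;	/* insn 11: scribble over a dynptr slot */
	/* insn 12 is the pruning point where both paths converge */
	bpf_ringbuf_discard_dynptr(&ptr, 0);	/* insns 12-15 */
	return 0;	/* insns 16-17 */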
Fixes: 97e03f521050 ("bpf: Add verifier support for dynptrs") Acked-by: Eduard Zingerman Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230121002241.2113993-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 88 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 84 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 77f455ce7aec0..527ecbe71531c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -762,6 +762,9 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ state->stack[spi - 1].spilled_ptr.ref_obj_id = id; } + state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; + state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; + return 0; } @@ -786,6 +789,31 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re __mark_reg_not_init(env, &state->stack[spi].spilled_ptr); __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); + + /* Why do we need to set REG_LIVE_WRITTEN for STACK_INVALID slot? + * + * While we don't allow reading STACK_INVALID, it is still possible to + * do <8 byte writes marking some but not all slots as STACK_MISC. Then, + * helpers or insns can do partial read of that part without failing, + * but check_stack_range_initialized, check_stack_read_var_off, and + * check_stack_read_fixed_off will do mark_reg_read for all 8-bytes of + * the slot conservatively. Hence we need to prevent those liveness + * marking walks. + * + * This was not a problem before because STACK_INVALID is only set by + * default (where the default reg state has its reg->parent as NULL), or + * in clean_live_states after REG_LIVE_DONE (at which point + * mark_reg_read won't walk reg->parent chain), but not randomly during + * verifier state exploration (like we did above). Hence, for our case + * parentage chain will still be live (i.e. reg->parent may be + * non-NULL), while earlier reg->parent was NULL, so we need + * REG_LIVE_WRITTEN to screen off read marker propagation when it is + * done later on reads or by mark_dynptr_read as well to unnecessary + * mark registers in verifier state. + */ + state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; + state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; + return 0; } @@ -2370,6 +2398,30 @@ static int mark_reg_read(struct bpf_verifier_env *env, return 0; } +static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + struct bpf_func_state *state = func(env, reg); + int spi, ret; + + /* For CONST_PTR_TO_DYNPTR, it must have already been done by + * check_reg_arg in check_helper_call and mark_btf_func_reg_size in + * check_kfunc_call. + */ + if (reg->type == CONST_PTR_TO_DYNPTR) + return 0; + spi = get_spi(reg->off); + /* Caller ensures dynptr is valid and initialized, which means spi is in + * bounds and spi is the first dynptr slot. Simply mark stack slot as + * read. + */ + ret = mark_reg_read(env, &state->stack[spi].spilled_ptr, + state->stack[spi].spilled_ptr.parent, REG_LIVE_READ64); + if (ret) + return ret; + return mark_reg_read(env, &state->stack[spi - 1].spilled_ptr, + state->stack[spi - 1].spilled_ptr.parent, REG_LIVE_READ64); +} + /* This function is supposed to be used by the following 32-bit optimization * code only. It returns TRUE if the source or destination register operates * on 64-bit, otherwise return FALSE. 
@@ -5966,6 +6018,8 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,
 		meta->uninit_dynptr_regno = regno;
 	} else /* MEM_RDONLY and None case from above */ {
+		int err;
+
 		/* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */
 		if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) {
 			verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n");
@@ -5999,6 +6053,10 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,
 				err_extra, regno);
 			return -EINVAL;
 		}
+
+		err = mark_dynptr_read(env, reg);
+		if (err)
+			return err;
 	}
 	return 0;
 }
@@ -12135,10 +12193,9 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
 			return false;
 		if (i % BPF_REG_SIZE != BPF_REG_SIZE - 1)
 			continue;
-		if (!is_spilled_reg(&old->stack[spi]))
-			continue;
-		if (!regsafe(env, &old->stack[spi].spilled_ptr,
-			     &cur->stack[spi].spilled_ptr, idmap))
+		/* Both old and cur are having same slot_type */
+		switch (old->stack[spi].slot_type[BPF_REG_SIZE - 1]) {
+		case STACK_SPILL:
 			/* when explored and current stack slot are both storing
 			 * spilled registers, check that stored pointers types
 			 * are the same as well.
 			 * such verifier states are not equivalent.
 			 * return false to continue verification of this path
 			 */
+			if (!regsafe(env, &old->stack[spi].spilled_ptr,
+				     &cur->stack[spi].spilled_ptr, idmap))
+				return false;
+			break;
+		case STACK_DYNPTR:
+		{
+			const struct bpf_reg_state *old_reg, *cur_reg;
+
+			old_reg = &old->stack[spi].spilled_ptr;
+			cur_reg = &cur->stack[spi].spilled_ptr;
+			if (old_reg->dynptr.type != cur_reg->dynptr.type ||
+			    old_reg->dynptr.first_slot != cur_reg->dynptr.first_slot ||
+			    !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
+				return false;
+			break;
+		}
+		case STACK_MISC:
+		case STACK_ZERO:
+		case STACK_INVALID:
+			continue;
+		/* Ensure that new unhandled slot types return false by default */
+		default:
+			return false;
+		}
 	}
 	return true;
 }

From 3b06f577145fbd7136d2901814d81963b99a1de9 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi
Date: Sat, 21 Jan 2023 05:52:31 +0530
Subject: [PATCH 134/175] bpf: Fix missing var_off check for ARG_PTR_TO_DYNPTR

Currently, the dynptr function is not checking the variable offset part of
PTR_TO_STACK that it needs to check. The fixed offset is considered when
computing the stack pointer index, but if the variable offset was not a
constant (such that it could not be accumulated in reg->off), we will end
up with a discrepancy where the runtime pointer does not point to the
actual stack slot we mark as STACK_DYNPTR.

It is impossible to precisely track dynptr state when the variable offset
is not constant, hence, just like for bpf_timer, kptr, bpf_spin_lock, etc.,
simply reject the case where reg->var_off is not constant. Then, consider
both reg->off and reg->var_off.value when computing the stack pointer
index.

A new helper dynptr_get_spi is introduced to hide these details, since the
dynptr needs to be located in multiple places outside the
process_dynptr_func checks; once we know it's a PTR_TO_STACK, we need to
enforce these checks in all places.

Note that it is disallowed for unprivileged users to have a non-constant
var_off, so this problem should only be possible to trigger from programs
having CAP_PERFMON. However, its effects can vary.
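As an invented sketch of the problematic shape (not one of this patch's
selftests; the mask and helper choice are assumptions made for
illustration):

	struct bpf_dynptr ptr;
	char buf[8], *p;

	bpf_ringbuf_reserve_dynptr(&ringbuf, 8, 0, &ptr);
	p = (char *)&ptr;
	p += bpf_get_prandom_u32() & 8;	/* var_off becomes unknown: 0 or 8 */
	/* The dynptr argument no longer sits at a constant offset; with this
	 * fix the verifier rejects it ("dynptr has to be at a constant
	 * offset") instead of computing spi from reg->off alone.
	 */
	bpf_dynptr_read(buf, sizeof(buf), (struct bpf_dynptr *)p, 0, 0);
	bpf_ringbuf_discard_dynptr(&ptr, 0);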
Without the fix, it is possible to replace the contents of the dynptr arbitrarily by making verifier mark different stack slots than actual location and then doing writes to the actual stack address of dynptr at runtime. Fixes: 97e03f521050 ("bpf: Add verifier support for dynptrs") Acked-by: Joanne Koong Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230121002241.2113993-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 84 +++++++++++++++---- .../bpf/prog_tests/kfunc_dynptr_param.c | 2 +- .../testing/selftests/bpf/progs/dynptr_fail.c | 4 +- 3 files changed, 69 insertions(+), 21 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 527ecbe71531c..86a1e109e5024 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -619,11 +619,34 @@ static void print_liveness(struct bpf_verifier_env *env, verbose(env, "D"); } -static int get_spi(s32 off) +static int __get_spi(s32 off) { return (-off - 1) / BPF_REG_SIZE; } +static int dynptr_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + int off, spi; + + if (!tnum_is_const(reg->var_off)) { + verbose(env, "dynptr has to be at a constant offset\n"); + return -EINVAL; + } + + off = reg->off + reg->var_off.value; + if (off % BPF_REG_SIZE) { + verbose(env, "cannot pass in dynptr at an offset=%d\n", off); + return -EINVAL; + } + + spi = __get_spi(off); + if (spi < 1) { + verbose(env, "cannot pass in dynptr at an offset=%d\n", off); + return -EINVAL; + } + return spi; +} + static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_slots) { int allocated_slots = state->allocated_stack / BPF_REG_SIZE; @@ -735,7 +758,9 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ enum bpf_dynptr_type type; int spi, i, id; - spi = get_spi(reg->off); + spi = dynptr_get_spi(env, reg); + if (spi < 0) + return spi; if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) return -EINVAL; @@ -773,7 +798,9 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re struct bpf_func_state *state = func(env, reg); int spi, i; - spi = get_spi(reg->off); + spi = dynptr_get_spi(env, reg); + if (spi < 0) + return spi; if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) return -EINVAL; @@ -825,7 +852,11 @@ static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_ if (reg->type == CONST_PTR_TO_DYNPTR) return false; - spi = get_spi(reg->off); + spi = dynptr_get_spi(env, reg); + if (spi < 0) + return false; + + /* We will do check_mem_access to check and update stack bounds later */ if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) return true; @@ -841,14 +872,15 @@ static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_ static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); - int spi; - int i; + int spi, i; /* This already represents first slot of initialized bpf_dynptr */ if (reg->type == CONST_PTR_TO_DYNPTR) return true; - spi = get_spi(reg->off); + spi = dynptr_get_spi(env, reg); + if (spi < 0) + return false; if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || !state->stack[spi].spilled_ptr.dynptr.first_slot) return false; @@ -877,7 +909,9 @@ static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg if (reg->type == CONST_PTR_TO_DYNPTR) { return reg->dynptr.type == dynptr_type; } else { - spi = get_spi(reg->off); + spi = 
dynptr_get_spi(env, reg); + if (spi < 0) + return false; return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type; } } @@ -2409,7 +2443,9 @@ static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state * */ if (reg->type == CONST_PTR_TO_DYNPTR) return 0; - spi = get_spi(reg->off); + spi = dynptr_get_spi(env, reg); + if (spi < 0) + return spi; /* Caller ensures dynptr is valid and initialized, which means spi is in * bounds and spi is the first dynptr slot. Simply mark stack slot as * read. @@ -5981,12 +6017,15 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno, } /* CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to * check_func_arg_reg_off's logic. We only need to check offset - * alignment for PTR_TO_STACK. + * and its alignment for PTR_TO_STACK. */ - if (reg->type == PTR_TO_STACK && (reg->off % BPF_REG_SIZE)) { - verbose(env, "cannot pass in dynptr at an offset=%d\n", reg->off); - return -EINVAL; + if (reg->type == PTR_TO_STACK) { + int err = dynptr_get_spi(env, reg); + + if (err < 0) + return err; } + /* MEM_UNINIT - Points to memory that is an appropriate candidate for * constructing a mutable bpf_dynptr object. * @@ -6377,15 +6416,16 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, } } -static u32 dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); int spi; if (reg->type == CONST_PTR_TO_DYNPTR) return reg->ref_obj_id; - - spi = get_spi(reg->off); + spi = dynptr_get_spi(env, reg); + if (spi < 0) + return spi; return state->stack[spi].spilled_ptr.ref_obj_id; } @@ -6458,7 +6498,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, * PTR_TO_STACK. 
 */
	if (reg->type == PTR_TO_STACK) {
-		spi = get_spi(reg->off);
+		spi = dynptr_get_spi(env, reg);
+		if (spi < 0)
+			return spi;
 		if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) ||
 		    !state->stack[spi].spilled_ptr.ref_obj_id) {
 			verbose(env, "arg %d is an unacquired reference\n", regno);
@@ -7932,13 +7974,19 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 	for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
 		if (arg_type_is_dynptr(fn->arg_type[i])) {
 			struct bpf_reg_state *reg = &regs[BPF_REG_1 + i];
+			int ref_obj_id;

 			if (meta.ref_obj_id) {
 				verbose(env, "verifier internal error: meta.ref_obj_id already set\n");
 				return -EFAULT;
 			}

-			meta.ref_obj_id = dynptr_ref_obj_id(env, reg);
+			ref_obj_id = dynptr_ref_obj_id(env, reg);
+			if (ref_obj_id < 0) {
+				verbose(env, "verifier internal error: failed to obtain dynptr ref_obj_id\n");
+				return ref_obj_id;
+			}
+			meta.ref_obj_id = ref_obj_id;
 			break;
 		}
 	}
diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c b/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c
index 6c4f0b64bb4ff..612907d1fbeab 100644
--- a/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c
+++ b/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c
@@ -18,7 +18,7 @@ static struct {
 	const char *expected_verifier_err_msg;
 	int expected_runtime_err;
 } kfunc_dynptr_tests[] = {
-	{"not_valid_dynptr", "Expected an initialized dynptr as arg #1", 0},
+	{"not_valid_dynptr", "cannot pass in dynptr at an offset=-8", 0},
 	{"not_ptr_to_stack", "arg#0 pointer type STRUCT bpf_dynptr_kern not to stack or dynptr_ptr", 0},
 	{"dynptr_data_null", NULL, -EBADMSG},
 };
diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c
index b13aeea78d13a..54dd8e22b5f26 100644
--- a/tools/testing/selftests/bpf/progs/dynptr_fail.c
+++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c
@@ -389,7 +389,7 @@ int invalid_helper1(void *ctx)

 /* A dynptr can't be passed into a helper function at a non-zero offset */
 SEC("?raw_tp")
-__failure __msg("Expected an initialized dynptr as arg #3")
+__failure __msg("cannot pass in dynptr at an offset=-8")
 int invalid_helper2(void *ctx)
 {
 	struct bpf_dynptr ptr;
@@ -591,7 +591,7 @@ int invalid_read4(void *ctx)

 /* Initializing a dynptr on an offset should fail */
 SEC("?raw_tp")
-__failure __msg("invalid write to stack")
+__failure __msg("cannot pass in dynptr at an offset=0")
 int invalid_offset(void *ctx)
 {
 	struct bpf_dynptr ptr;
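For intuition, the slot math used by the checks above: BPF_REG_SIZE is 8, a dynptr occupies two stack slots (spi and spi - 1), and dynptr offsets are negative, growing down from the frame pointer. A standalone sketch, illustrative only; example_spi is a local stand-in for the verifier's __get_spi:

	#include <stdio.h>

	#define BPF_REG_SIZE 8

	/* Stand-in for __get_spi(): maps a negative stack offset to a slot index. */
	static int example_spi(int off)
	{
		if (off % BPF_REG_SIZE)
			return -1;	/* unaligned: "cannot pass in dynptr at an offset=%d" */
		return (-off - 1) / BPF_REG_SIZE;
	}

	int main(void)
	{
		printf("%d\n", example_spi(-16));	/* 1: slots 1 and 0 back the 16-byte dynptr */
		printf("%d\n", example_spi(-8));	/* 0: dynptr_get_spi() rejects spi < 1, since
							 * slot spi - 1 must also exist; hence the
							 * "offset=-8" in the updated selftest messages */
		printf("%d\n", example_spi(-12));	/* -1: offset not a multiple of BPF_REG_SIZE */
		return 0;
	}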
From fa5e8de5053eb9fa948b9594328a5acaa755fd81 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi
Date: Sat, 21 Jan 2023 05:52:32 +0530
Subject: [PATCH 135/175] bpf: Fix partial dynptr stack slot reads/writes

Currently, while reads are disallowed for dynptr stack slots, writes are not. Reads don't work via either direct access or helpers, while writes do work in both cases, but have the effect of overwriting the slot_type. While this is fine, handling for a few edge cases is missing.

Firstly, a user can overwrite the stack slots of a dynptr partially. Consider the following layout:

spi: [d][d][?]
      2  1  0

The first slot is at spi 2, the second at spi 1. Now, do a write of 1 to 8 bytes for spi 1. This will essentially either write STACK_MISC for all slot_types, or STACK_MISC and STACK_ZERO (in case of a size < BPF_REG_SIZE partial write of zeroes). The end result is that the slot is scrubbed. The layout is now:

spi: [d][m][?]
      2  1  0

Suppose the user then initializes spi = 1 as a dynptr. We get:

spi: [d][d][d]
      2  1  0

But this time, both spi 2 and spi 1 have first_slot = true. Now, when spi 2 is passed to a dynptr helper, it will be considered initialized, because the check does not verify whether the second slot has first_slot == false; spi 1 continues to work as normal. This effectively replaces the size + offset of the first dynptr, allowing invalid OOB reads and writes.

Make a few changes to protect against this:

First, when writing to PTR_TO_STACK using BPF insns, if we touch an spi of a STACK_DYNPTR type, mark both the first and second slot (regardless of which slot we touch) as STACK_INVALID. Reads are already prevented.

Second, prevent writing to stack memory from helpers if the range may contain any STACK_DYNPTR slots. Reads are already prevented. We cannot allow helpers to destroy dynptrs through such writes because, depending on its arguments, a helper may take uninit_mem and a dynptr at the same time. The helper could then write to uninit_mem before it reads the dynptr, which would be bad:

PTR_TO_MEM: [?????dd]

Depending on the code inside the helper, it may end up overwriting the dynptr contents first and then read those as the dynptr argument. The verifier only simulates destruction when it does byte by byte access simulation in check_helper_call for meta.access_size, and fails to catch this case, as it happens after argument checks. The same would need to be done for any other non-trivial objects created on the stack in the future, such as bpf_list_head or bpf_rb_root on stack.

A common misunderstanding in the current code is that MEM_UNINIT means writes; but writes may also be performed even without MEM_UNINIT in case of helpers, and in that case the code after handling meta && meta->raw_mode will complain when it sees STACK_DYNPTR. So the invalid read case also covers writes to potential STACK_DYNPTR slots. The only loophole was meta->raw_mode, which simulated writes through instructions that could overwrite them.

A future series sequenced after this will focus on cleaning up helper access checks and the bugs around them.

Fixes: 97e03f521050 ("bpf: Add verifier support for dynptrs")
Signed-off-by: Kumar Kartikeya Dwivedi
Link: https://lore.kernel.org/r/20230121002241.2113993-4-memxor@gmail.com
Signed-off-by: Alexei Starovoitov
---
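A minimal BPF-side sketch of the overwrite pattern rejected after this change (illustrative only; modeled on the dynptr_fail.c tests, and assuming that file's includes and its 'ringbuf' map):

	SEC("?raw_tp")
	__failure __msg("cannot overwrite referenced dynptr")
	int overwrite_dynptr_slot(void *ctx)
	{
		struct bpf_dynptr ptr;

		bpf_ringbuf_reserve_dynptr(&ringbuf, 64, 0, &ptr);
		/* Direct store into one of the two stack slots backing 'ptr'.
		 * The verifier now refuses to let a referenced (ringbuf)
		 * dynptr be overwritten; for unreferenced dynptrs it would
		 * instead mark both backing slots STACK_INVALID.
		 */
		*((__u64 *)&ptr + 1) = 0;
		bpf_ringbuf_discard_dynptr(&ptr, 0);
		return 0;
	}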
 kernel/bpf/verifier.c                         | 88 +++++++++++++++++++
 .../testing/selftests/bpf/progs/dynptr_fail.c |  6 +-
 2 files changed, 91 insertions(+), 3 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 86a1e109e5024..20935d865c69f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -750,6 +750,8 @@ static void mark_dynptr_cb_reg(struct bpf_reg_state *reg,
 	__mark_dynptr_reg(reg, type, true);
 }

+static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
+					struct bpf_func_state *state, int spi);
 static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
 				   enum bpf_arg_type arg_type, int insn_idx)
@@ -844,6 +846,55 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re
 	return 0;
 }

+static void __mark_reg_unknown(const struct bpf_verifier_env *env,
+			       struct bpf_reg_state *reg);
+
+static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
+					struct bpf_func_state *state, int spi)
+{
+	int i;
+
+	/* We always ensure that STACK_DYNPTR is never set partially,
+	 * hence just checking for slot_type[0] is enough. This is
+	 * different for STACK_SPILL, where it may be only set for
+	 * 1 byte, so code has to use is_spilled_reg.
+	 */
+	if (state->stack[spi].slot_type[0] != STACK_DYNPTR)
+		return 0;
+
+	/* Reposition spi to first slot */
+	if (!state->stack[spi].spilled_ptr.dynptr.first_slot)
+		spi = spi + 1;
+
+	if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) {
+		verbose(env, "cannot overwrite referenced dynptr\n");
+		return -EINVAL;
+	}
+
+	mark_stack_slot_scratched(env, spi);
+	mark_stack_slot_scratched(env, spi - 1);
+
+	/* Writing partially to one dynptr stack slot destroys both. */
+	for (i = 0; i < BPF_REG_SIZE; i++) {
+		state->stack[spi].slot_type[i] = STACK_INVALID;
+		state->stack[spi - 1].slot_type[i] = STACK_INVALID;
+	}
+
+	/* TODO: Invalidate any slices associated with this dynptr */
+
+	/* Do not release reference state, we are destroying dynptr on stack,
+	 * not using some helper to release it. Just reset register.
+	 */
+	__mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
+	__mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
+
+	/* Same reason as unmark_stack_slots_dynptr above */
+	state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
+	state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN;
+
+	return 0;
+}
+
 static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
 {
 	struct bpf_func_state *state = func(env, reg);
@@ -3433,6 +3484,10 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 		env->insn_aux_data[insn_idx].sanitize_stack_spill = true;
 	}

+	err = destroy_if_dynptr_stack_slot(env, state, spi);
+	if (err)
+		return err;
+
 	mark_stack_slot_scratched(env, spi);
 	if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) &&
 	    !register_is_null(reg) && env->bpf_capable) {
@@ -3557,6 +3612,14 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
 		if (err)
 			return err;

+	for (i = min_off; i < max_off; i++) {
+		int spi;
+
+		spi = __get_spi(i);
+		err = destroy_if_dynptr_stack_slot(env, state, spi);
+		if (err)
+			return err;
+	}

 	/* Variable offset writes destroy any spilled pointers in range. */
 	for (i = min_off; i < max_off; i++) {
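To make the two write paths above concrete, a hypothetical trace (illustrative only, not part of the patch):

	/* Suppose a program has an initialized dynptr at fp-16, i.e. spi 1,
	 * backed by stack slots 1 and 0, and then executes:
	 *
	 *	r0 = 0;
	 *	*(u64 *)(r10 - 16) = r0;	// fixed-offset stack write
	 *
	 * check_stack_write_fixed_off() now routes through
	 * destroy_if_dynptr_stack_slot(env, state, 1): both backing slots
	 * become STACK_INVALID, so any later helper call on the dynptr is
	 * rejected instead of operating on a partially scrubbed object.
	 * Variable-offset writes get the same treatment: every spi derived
	 * from an offset in [min_off, max_off) is swept first.
	 */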
@@ -5535,6 +5598,31 @@ static int check_stack_range_initialized(
 	}

 	if (meta && meta->raw_mode) {
+		/* Ensure we won't be overwriting dynptrs when simulating byte
+		 * by byte access in check_helper_call using meta.access_size.
+		 * This would be a problem if we have a helper in the future
+		 * which takes:
+		 *
+		 * helper(uninit_mem, len, dynptr)
+		 *
+		 * Now, uninit_mem may overlap with dynptr pointer. Hence, it
+		 * may end up writing to dynptr itself when touching memory from
+		 * arg 1. This can be relaxed on a case by case basis for known
+		 * safe cases, but reject due to the possibility of aliasing by
+		 * default.
+		 */
+		for (i = min_off; i < max_off + access_size; i++) {
+			int stack_off = -i - 1;
+
+			spi = __get_spi(i);
+			/* raw_mode may write past allocated_stack */
+			if (state->allocated_stack <= stack_off)
+				continue;
+			if (state->stack[spi].slot_type[stack_off % BPF_REG_SIZE] == STACK_DYNPTR) {
+				verbose(env, "potential write to dynptr at off=%d disallowed\n", i);
+				return -EACCES;
+			}
+		}
 		meta->access_size = access_size;
 		meta->regno = regno;
 		return 0;
diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c
index 54dd8e22b5f26..eb359c95579fa 100644
--- a/tools/testing/selftests/bpf/progs/dynptr_fail.c
+++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c
@@ -427,7 +427,7 @@ int invalid_write1(void *ctx)
  * offset
  */
 SEC("?raw_tp")
-__failure __msg("Expected an initialized dynptr as arg #3")
+__failure __msg("cannot overwrite referenced dynptr")
 int invalid_write2(void *ctx)
 {
 	struct bpf_dynptr ptr;
@@ -451,7 +451,7 @@ int invalid_write2(void *ctx)
  * non-const offset
 */
 SEC("?raw_tp")
-__failure __msg("Expected an initialized dynptr as arg #1")
+__failure __msg("cannot overwrite referenced dynptr")
 int invalid_write3(void *ctx)
 {
 	struct bpf_dynptr ptr;
@@ -483,7 +483,7 @@ static int invalid_write4_callback(__u32 index, void *data)
 * be invalidated as a dynptr
 */
 SEC("?raw_tp")
-__failure __msg("arg 1 is an unacquired reference")
+__failure __msg("cannot overwrite referenced dynptr")
 int invalid_write4(void *ctx)
 {
 	struct bpf_dynptr ptr;

From b122496a2dbb39a454d08011ce48c566447a67ca Mon Sep 17 00:00:00 2001
From: Shaoying Xu
Date: Sun, 22 Oct 2023 06:02:03 +0000
Subject: [PATCH 136/175] Revert "perf/x86/amd/core: Fix overflow reset on hotplug"

There is an unchecked MSR access error for MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR on all zen4 virtualized instances during amd_pmu_cpu_starting. The reverted commit clears the LBR freeze bit of the PerfCntrGlobalStatus register and moves the reset sequence into amd_pmu_cpu_starting and amd_pmu_cpu_dead; reverting either change makes the MSR access error go away. It is possible that the hypervisor does not allow writes to this register, so that the reverted commit made the reset take effect. To prevent error call traces in dmesg and unblock the release, revert this commit for now.

This reverts commit e485a69d9b4433ad48673a1d31c705b09b3548c0.

Signed-off-by: Shaoying Xu
---
 arch/x86/events/amd/core.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index 3ac069a4559b0..b9a58954dc535 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -534,12 +534,8 @@ static void amd_pmu_cpu_reset(int cpu)
 	/* Clear enable bits i.e. PerfCntrGlobalCtl.PerfCntrEn */
 	wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0);

-	/*
-	 * Clear freeze and overflow bits i.e. PerfCntrGLobalStatus.LbrFreeze
-	 * and PerfCntrGLobalStatus.PerfCntrOvfl
-	 */
-	wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR,
-	       GLOBAL_STATUS_LBRS_FROZEN | amd_pmu_global_cntr_mask);
+	/* Clear overflow bits i.e.
PerfCntrGLobalStatus.PerfCntrOvfl */ + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, amd_pmu_global_cntr_mask); } static int amd_pmu_cpu_prepare(int cpu) @@ -574,7 +570,6 @@ static void amd_pmu_cpu_starting(int cpu) int i, nb_id; cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; - amd_pmu_cpu_reset(cpu); if (!x86_pmu.amd_nb_constraints) return; @@ -596,6 +591,8 @@ static void amd_pmu_cpu_starting(int cpu) cpuc->amd_nb->nb_id = nb_id; cpuc->amd_nb->refcnt++; + + amd_pmu_cpu_reset(cpu); } static void amd_pmu_cpu_dead(int cpu) @@ -616,6 +613,8 @@ static void amd_pmu_cpu_dead(int cpu) cpuhw->amd_nb = NULL; } + + amd_pmu_cpu_reset(cpu); } static inline void amd_pmu_set_global_ctl(u64 ctl) From 0c8e3a88a4321a62d9ab3e82e04385d22383c47c Mon Sep 17 00:00:00 2001 From: David Arinzon Date: Tue, 31 Oct 2023 13:49:28 +0000 Subject: [PATCH 137/175] AL2023-6.1-Update-ena-driver-to-2.10.0g Signed-off-by: David Arinzon --- drivers/amazon/net/ena/Makefile | 6 +- drivers/amazon/net/ena/config.h | 5 + drivers/amazon/net/ena/ena_admin_defs.h | 68 ++- drivers/amazon/net/ena/ena_com.c | 201 ++++--- drivers/amazon/net/ena/ena_com.h | 35 +- drivers/amazon/net/ena/ena_devlink.c | 380 ------------- drivers/amazon/net/ena/ena_devlink.h | 44 -- drivers/amazon/net/ena/ena_eth_com.c | 23 +- drivers/amazon/net/ena/ena_eth_com.h | 19 +- drivers/amazon/net/ena/ena_eth_io_defs.h | 14 +- drivers/amazon/net/ena/ena_ethtool.c | 129 ++++- drivers/amazon/net/ena/ena_netdev.c | 676 +++++++++++++---------- drivers/amazon/net/ena/ena_netdev.h | 104 +++- drivers/amazon/net/ena/ena_phc.c | 24 +- drivers/amazon/net/ena/ena_phc.h | 5 + drivers/amazon/net/ena/ena_regs_defs.h | 3 + drivers/amazon/net/ena/ena_sysfs.c | 86 +++ drivers/amazon/net/ena/ena_xdp.c | 54 +- drivers/amazon/net/ena/ena_xdp.h | 17 +- drivers/amazon/net/ena/kcompat.h | 96 +--- 20 files changed, 1038 insertions(+), 951 deletions(-) create mode 100644 drivers/amazon/net/ena/config.h delete mode 100644 drivers/amazon/net/ena/ena_devlink.c delete mode 100644 drivers/amazon/net/ena/ena_devlink.h diff --git a/drivers/amazon/net/ena/Makefile b/drivers/amazon/net/ena/Makefile index b61366782d8d6..0c40c453562f5 100644 --- a/drivers/amazon/net/ena/Makefile +++ b/drivers/amazon/net/ena/Makefile @@ -1,13 +1,13 @@ # # Makefile for the Elastic Network Adapter (ENA) device drivers. # ENA Source is: https://github.com/amzn/amzn-drivers. -# Current ENA source is based on ena_linux_2.4.0 tag. +# Current ENA source is based on ena_linux_2.10.0 tag. 
# obj-$(CONFIG_AMAZON_ENA_ETHERNET) += ena.o ena-y := ena_netdev.o ena_ethtool.o ena_lpc.o ena_phc.o ena_xdp.o dim.o \ - ena_devlink.o net_dim.o ena_com.o ena_eth_com.o + net_dim.o ena_com.o ena_eth_com.o ena-$(CONFIG_SYSFS) += ena_sysfs.o @@ -18,3 +18,5 @@ endif ifdef ENA_PHC_INCLUDE ccflags-y += -DENA_PHC_INCLUDE endif + +ccflags-y += -include $(srctree)/drivers/amazon/net/ena/config.h diff --git a/drivers/amazon/net/ena/config.h b/drivers/amazon/net/ena/config.h new file mode 100644 index 0000000000000..b4c9875108fb4 --- /dev/null +++ b/drivers/amazon/net/ena/config.h @@ -0,0 +1,5 @@ +#ifndef _ENA_CONFIG_H_ +#define _ENA_CONFIG_H_ +#define ENA_HAVE_PCI_DEV_ID 1 +#define ENA_HAVE_XDP_DO_FLUSH 1 +#endif /* _ENA_CONFIG_H_ */ diff --git a/drivers/amazon/net/ena/ena_admin_defs.h b/drivers/amazon/net/ena/ena_admin_defs.h index f34b44a6fa230..61ca71af11cf5 100644 --- a/drivers/amazon/net/ena/ena_admin_defs.h +++ b/drivers/amazon/net/ena/ena_admin_defs.h @@ -69,12 +69,22 @@ enum ena_admin_aq_feature_id { ENA_ADMIN_FEATURES_OPCODE_NUM = 32, }; +/* feature version for the set/get ENA_ADMIN_LLQ feature admin commands */ +enum ena_admin_llq_feature_version { + /* legacy base version in older drivers */ + ENA_ADMIN_LLQ_FEATURE_VERSION_0_LEGACY = 0, + /* support entry_size recommendation by device */ + ENA_ADMIN_LLQ_FEATURE_VERSION_1 = 1, +}; + /* device capabilities */ enum ena_admin_aq_caps_id { ENA_ADMIN_ENI_STATS = 0, /* ENA SRD customer metrics */ ENA_ADMIN_ENA_SRD_INFO = 1, ENA_ADMIN_CUSTOMER_METRICS = 2, + ENA_ADMIN_EXTENDED_RESET_REASONS = 3, + ENA_ADMIN_CDESC_MBZ = 4, }; enum ena_admin_placement_policy_type { @@ -134,8 +144,14 @@ enum ena_admin_get_stats_scope { ENA_ADMIN_ETH_TRAFFIC = 1, }; -enum ena_admin_get_phc_type { - ENA_ADMIN_PHC_TYPE_READLESS = 0, +enum ena_admin_phc_feature_version { + /* Readless with error_bound */ + ENA_ADMIN_PHC_FEATURE_VERSION_0 = 0, +}; + +enum ena_admin_phc_error_flags { + ENA_ADMIN_PHC_ERROR_FLAG_TIMESTAMP = BIT(0), + ENA_ADMIN_PHC_ERROR_FLAG_ERROR_BOUND = BIT(1), }; /* ENA SRD configuration for ENI */ @@ -671,8 +687,17 @@ struct ena_admin_feature_llq_desc { /* the stride control the driver selected to use */ u16 descriptors_stride_ctrl_enabled; + /* feature version of device resp to either GET/SET commands. */ + u8 feature_version; + + /* llq entry size recommended by the device, + * values correlated to enum ena_admin_llq_ring_entry_size. + * used only for GET command. + */ + u8 entry_size_recommended; + /* reserved */ - u32 reserved1; + u8 reserved1[2]; /* accelerated low latency queues requirement. driver needs to * support those requirements in order to use accelerated llq @@ -967,7 +992,9 @@ struct ena_admin_host_info { * 4 : rss_configurable_function_key * 5 : reserved * 6 : rx_page_reuse - * 31:7 : reserved + * 7 : tx_ipv6_csum_offload + * 8 : phc + * 31:9 : reserved */ u32 driver_supported_features; }; @@ -1053,10 +1080,10 @@ struct ena_admin_queue_ext_feature_desc { }; struct ena_admin_feature_phc_desc { - /* PHC type as defined in enum ena_admin_get_phc_type, - * used only for GET command. + /* PHC version as defined in enum ena_admin_phc_feature_version, + * used only for GET command as max supported PHC version by the device. 
*/ - u8 type; + u8 version; /* Reserved - MBZ */ u8 reserved1[3]; @@ -1196,7 +1223,8 @@ enum ena_admin_aenq_group { ENA_ADMIN_NOTIFICATION = 3, ENA_ADMIN_KEEP_ALIVE = 4, ENA_ADMIN_REFRESH_CAPABILITIES = 5, - ENA_ADMIN_AENQ_GROUPS_NUM = 6, + ENA_ADMIN_CONF_NOTIFICATIONS = 6, + ENA_ADMIN_AENQ_GROUPS_NUM = 7, }; enum ena_admin_aenq_notification_syndrome { @@ -1233,6 +1261,14 @@ struct ena_admin_aenq_keep_alive_desc { u32 rx_overruns_high; }; +struct ena_admin_aenq_conf_notifications_desc { + struct ena_admin_aenq_common_desc aenq_common_desc; + + u64 notifications_bitmap; + + u64 reserved; +}; + struct ena_admin_ena_mmio_req_read_less_resp { u16 req_id; @@ -1243,13 +1279,23 @@ struct ena_admin_ena_mmio_req_read_less_resp { }; struct ena_admin_phc_resp { + /* Request Id, received from DB register */ u16 req_id; u8 reserved1[6]; + /* PHC timestamp (nsec) */ u64 timestamp; - u8 reserved2[48]; + u8 reserved2[8]; + + /* Timestamp error limit (nsec) */ + u32 error_bound; + + /* Bit field of enum ena_admin_phc_error_flags */ + u32 error_flags; + + u8 reserved3[32]; }; /* aq_common_desc */ @@ -1350,6 +1396,10 @@ struct ena_admin_phc_resp { #define ENA_ADMIN_HOST_INFO_RSS_CONFIGURABLE_FUNCTION_KEY_MASK BIT(4) #define ENA_ADMIN_HOST_INFO_RX_PAGE_REUSE_SHIFT 6 #define ENA_ADMIN_HOST_INFO_RX_PAGE_REUSE_MASK BIT(6) +#define ENA_ADMIN_HOST_INFO_TX_IPV6_CSUM_OFFLOAD_SHIFT 7 +#define ENA_ADMIN_HOST_INFO_TX_IPV6_CSUM_OFFLOAD_MASK BIT(7) +#define ENA_ADMIN_HOST_INFO_PHC_SHIFT 8 +#define ENA_ADMIN_HOST_INFO_PHC_MASK BIT(8) /* feature_rss_ind_table */ #define ENA_ADMIN_FEATURE_RSS_IND_TABLE_ONE_ENTRY_UPDATE_MASK BIT(0) diff --git a/drivers/amazon/net/ena/ena_com.c b/drivers/amazon/net/ena/ena_com.c index 889d3412a72df..d4f73b8b200b3 100644 --- a/drivers/amazon/net/ena/ena_com.c +++ b/drivers/amazon/net/ena/ena_com.c @@ -35,15 +35,19 @@ #define ENA_REGS_ADMIN_INTR_MASK 1 +#define ENA_MAX_BACKOFF_DELAY_EXP 16U + #define ENA_MIN_ADMIN_POLL_US 100 #define ENA_MAX_ADMIN_POLL_US 5000 /* PHC definitions */ -#define ENA_PHC_DEFAULT_EXPIRE_TIMEOUT_USEC 20 +#define ENA_PHC_DEFAULT_EXPIRE_TIMEOUT_USEC 10 #define ENA_PHC_DEFAULT_BLOCK_TIMEOUT_USEC 1000 -#define ENA_PHC_TIMESTAMP_ERROR 0xFFFFFFFFFFFFFFFF +#define ENA_PHC_MAX_ERROR_BOUND 0xFFFFFFFF #define ENA_PHC_REQ_ID_OFFSET 0xDEAD +#define ENA_PHC_ERROR_FLAGS (ENA_ADMIN_PHC_ERROR_FLAG_TIMESTAMP | \ + ENA_ADMIN_PHC_ERROR_FLAG_ERROR_BOUND) /*****************************************************************************/ /*****************************************************************************/ @@ -76,7 +80,8 @@ static int ena_com_mem_addr_set(struct ena_com_dev *ena_dev, struct ena_common_mem_addr *ena_addr, dma_addr_t addr) { - if ((addr & GENMASK_ULL(ena_dev->dma_addr_bits - 1, 0)) != addr) { + if (unlikely((addr & GENMASK_ULL(ena_dev->dma_addr_bits - 1, 0)) != + addr)) { netdev_err(ena_dev->net_device, "DMA address has more bits that the device supports\n"); return -EINVAL; @@ -97,7 +102,7 @@ static int ena_com_admin_init_sq(struct ena_com_admin_queue *admin_queue) sq->entries = dma_zalloc_coherent(admin_queue->q_dmadev, size, &sq->dma_addr, GFP_KERNEL); - if (!sq->entries) { + if (unlikely(!sq->entries)) { netdev_err(ena_dev->net_device, "Memory allocation failed\n"); return -ENOMEM; } @@ -120,7 +125,7 @@ static int ena_com_admin_init_cq(struct ena_com_admin_queue *admin_queue) cq->entries = dma_zalloc_coherent(admin_queue->q_dmadev, size, &cq->dma_addr, GFP_KERNEL); - if (!cq->entries) { + if (unlikely(!cq->entries)) { netdev_err(ena_dev->net_device, "Memory 
allocation failed\n"); return -ENOMEM; } @@ -143,7 +148,7 @@ static int ena_com_admin_init_aenq(struct ena_com_dev *ena_dev, aenq->entries = dma_zalloc_coherent(ena_dev->dmadev, size, &aenq->dma_addr, GFP_KERNEL); - if (!aenq->entries) { + if (unlikely(!aenq->entries)) { netdev_err(ena_dev->net_device, "Memory allocation failed\n"); return -ENOMEM; } @@ -178,6 +183,7 @@ static int ena_com_admin_init_aenq(struct ena_com_dev *ena_dev, static void comp_ctxt_release(struct ena_com_admin_queue *queue, struct ena_comp_ctx *comp_ctx) { + comp_ctx->user_cqe = NULL; comp_ctx->occupied = false; atomic_dec(&queue->outstanding_cmds); } @@ -229,7 +235,7 @@ static struct ena_comp_ctx *__ena_com_submit_admin_cmd(struct ena_com_admin_queu /* In case of queue FULL */ cnt = (u16)atomic_read(&admin_queue->outstanding_cmds); - if (cnt >= admin_queue->q_depth) { + if (unlikely(cnt >= admin_queue->q_depth)) { netdev_dbg(admin_queue->ena_dev->net_device, "Admin queue is full.\n"); admin_queue->stats.out_of_space++; @@ -352,7 +358,7 @@ static int ena_com_init_io_sq(struct ena_com_dev *ena_dev, GFP_KERNEL); } - if (!io_sq->desc_addr.virt_addr) { + if (unlikely(!io_sq->desc_addr.virt_addr)) { netdev_err(ena_dev->net_device, "Memory allocation failed\n"); return -ENOMEM; @@ -379,7 +385,7 @@ static int ena_com_init_io_sq(struct ena_com_dev *ena_dev, io_sq->bounce_buf_ctrl.base_buffer = devm_kzalloc(ena_dev->dmadev, size, GFP_KERNEL); - if (!io_sq->bounce_buf_ctrl.base_buffer) { + if (unlikely(!io_sq->bounce_buf_ctrl.base_buffer)) { netdev_err(ena_dev->net_device, "Bounce buffer memory allocation failed\n"); return -ENOMEM; @@ -441,7 +447,7 @@ static int ena_com_init_io_cq(struct ena_com_dev *ena_dev, GFP_KERNEL); } - if (!io_cq->cdesc_addr.virt_addr) { + if (unlikely(!io_cq->cdesc_addr.virt_addr)) { netdev_err(ena_dev->net_device, "Memory allocation failed\n"); return -ENOMEM; } @@ -469,6 +475,9 @@ static void ena_com_handle_single_admin_completion(struct ena_com_admin_queue *a return; } + if (!comp_ctx->occupied) + return; + comp_ctx->status = ENA_CMD_COMPLETED; comp_ctx->comp_status = cqe->acq_common_descriptor.status; @@ -544,8 +553,9 @@ static int ena_com_comp_status_to_errno(struct ena_com_admin_queue *admin_queue, static void ena_delay_exponential_backoff_us(u32 exp, u32 delay_us) { + exp = min_t(u32, ENA_MAX_BACKOFF_DELAY_EXP, exp); delay_us = max_t(u32, ENA_MIN_ADMIN_POLL_US, delay_us); - delay_us = min_t(u32, delay_us * (1U << exp), ENA_MAX_ADMIN_POLL_US); + delay_us = min_t(u32, ENA_MAX_ADMIN_POLL_US, delay_us * (1U << exp)); usleep_range(delay_us, 2 * delay_us); } @@ -567,7 +577,7 @@ static int ena_com_wait_and_process_admin_cq_polling(struct ena_comp_ctx *comp_c if (comp_ctx->status != ENA_CMD_SUBMITTED) break; - if (time_is_before_jiffies(timeout)) { + if (unlikely(time_is_before_jiffies(timeout))) { netdev_err(admin_queue->ena_dev->net_device, "Wait for completion (polling) timeout\n"); /* ENA didn't have any completion */ @@ -771,7 +781,7 @@ static int ena_com_config_llq_info(struct ena_com_dev *ena_dev, llq_default_cfg->llq_ring_entry_size_value; rc = ena_com_set_llq(ena_dev); - if (rc) + if (unlikely(rc)) netdev_err(ena_dev->net_device, "Cannot set LLQ configuration: %d\n", rc); @@ -878,7 +888,7 @@ static u32 ena_com_reg_bar_read32(struct ena_com_dev *ena_dev, u16 offset) goto err; } - if (read_resp->reg_off != offset) { + if (unlikely(read_resp->reg_off != offset)) { netdev_err(ena_dev->net_device, "Read failure: wrong offset provided\n"); ret = ENA_MMIO_READ_TIMEOUT; @@ -999,7 +1009,7 @@ static int 
wait_for_reset_state(struct ena_com_dev *ena_dev, u32 timeout, exp_state) return 0; - if (time_is_before_jiffies(timeout_stamp)) + if (unlikely(time_is_before_jiffies(timeout_stamp))) return -ETIME; ena_delay_exponential_backoff_us(exp++, ena_dev->ena_min_poll_delay_us); @@ -1447,7 +1457,7 @@ int ena_com_get_io_handlers(struct ena_com_dev *ena_dev, u16 qid, struct ena_com_io_sq **io_sq, struct ena_com_io_cq **io_cq) { - if (qid >= ENA_TOTAL_NUM_QUEUES) { + if (unlikely(qid >= ENA_TOTAL_NUM_QUEUES)) { netdev_err(ena_dev->net_device, "Invalid queue number %d but the max is %d\n", qid, ENA_TOTAL_NUM_QUEUES); @@ -1558,7 +1568,7 @@ int ena_com_set_aenq_config(struct ena_com_dev *ena_dev, u32 groups_flag) int ret; ret = ena_com_get_feature(ena_dev, &get_resp, ENA_ADMIN_AENQ_CONFIG, 0); - if (ret) { + if (unlikely(ret)) { dev_info(ena_dev->dmadev, "Can't get aenq configuration\n"); return ret; } @@ -1606,7 +1616,7 @@ int ena_com_get_dma_width(struct ena_com_dev *ena_dev) netdev_dbg(ena_dev->net_device, "ENA dma width: %d\n", width); - if ((width < 32) || width > ENA_MAX_PHYS_ADDR_SIZE_BITS) { + if (unlikely((width < 32) || width > ENA_MAX_PHYS_ADDR_SIZE_BITS)) { netdev_err(ena_dev->net_device, "DMA width illegal value: %d\n", width); return -EINVAL; @@ -1764,8 +1774,11 @@ int ena_com_phc_config(struct ena_com_dev *ena_dev) struct ena_admin_set_feat_cmd set_feat_cmd; int ret = 0; - /* Get device PHC default configuration */ - ret = ena_com_get_feature(ena_dev, &get_feat_resp, ENA_ADMIN_PHC_CONFIG, 0); + /* Get default device PHC configuration */ + ret = ena_com_get_feature(ena_dev, + &get_feat_resp, + ENA_ADMIN_PHC_CONFIG, + ENA_ADMIN_PHC_FEATURE_VERSION_0); if (unlikely(ret)) { netdev_err(ena_dev->net_device, "Failed to get PHC feature configuration, error: %d\n", @@ -1773,10 +1786,11 @@ int ena_com_phc_config(struct ena_com_dev *ena_dev) return ret; } - /* Suporting only readless PHC retrieval */ - if (get_feat_resp.u.phc.type != ENA_ADMIN_PHC_TYPE_READLESS) { + /* Supporting only PHC V0 (readless mode with error bound) */ + if (get_feat_resp.u.phc.version != ENA_ADMIN_PHC_FEATURE_VERSION_0) { netdev_err(ena_dev->net_device, - "Unsupprted PHC type, error: %d\n", -EOPNOTSUPP); + "Unsupprted PHC version (0x%X), error: %d\n", + get_feat_resp.u.phc.version, -EOPNOTSUPP); return -EOPNOTSUPP; } @@ -1793,11 +1807,11 @@ int ena_com_phc_config(struct ena_com_dev *ena_dev) get_feat_resp.u.phc.block_timeout_usec : ENA_PHC_DEFAULT_BLOCK_TIMEOUT_USEC; - /* Sanity check - expire timeout must not be above skip timeout */ + /* Sanity check - expire timeout must not exceed block timeout */ if (phc->expire_timeout_usec > phc->block_timeout_usec) phc->expire_timeout_usec = phc->block_timeout_usec; - /* Prepare PHC feature command with PHC output address */ + /* Prepare PHC config feature command */ memset(&set_feat_cmd, 0x0, sizeof(set_feat_cmd)); set_feat_cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE; set_feat_cmd.feat_common.feature_id = ENA_ADMIN_PHC_CONFIG; @@ -1832,27 +1846,29 @@ int ena_com_phc_config(struct ena_com_dev *ena_dev) void ena_com_phc_destroy(struct ena_com_dev *ena_dev) { struct ena_com_phc_info *phc = &ena_dev->phc; - - phc->active = false; + unsigned long flags = 0; /* In case PHC is not supported by the device, silently exiting */ if (!phc->virt_addr) return; + spin_lock_irqsave(&phc->lock, flags); + phc->active = false; + spin_unlock_irqrestore(&phc->lock, flags); + dma_free_coherent(ena_dev->dmadev, sizeof(*phc->virt_addr), phc->virt_addr, phc->phys_addr); phc->virt_addr = NULL; } 
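The hunks below split the old ena_com_phc_get() into ena_com_phc_get_timestamp() plus an accessor for the error bound cached alongside each sample. A sketch of how a consumer might use the pair (illustrative only; the real in-tree caller is ena_phc.c, and example_phc_sample is a hypothetical helper):

	static int example_phc_sample(struct ena_com_dev *ena_dev)
	{
		u32 error_bound;
		u64 timestamp;
		int rc;

		/* Returns -EBUSY while PHC is in the blocked state */
		rc = ena_com_phc_get_timestamp(ena_dev, &timestamp);
		if (rc)
			return rc;

		/* The error bound for this sample was cached by the call above */
		rc = ena_com_phc_get_error_bound(ena_dev, &error_bound);
		if (rc)
			return rc;

		pr_info("PHC timestamp %llu ns, error bound %u ns\n", timestamp, error_bound);
		return 0;
	}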
-int ena_com_phc_get(struct ena_com_dev *ena_dev, u64 *timestamp) +int ena_com_phc_get_timestamp(struct ena_com_dev *ena_dev, u64 *timestamp) { volatile struct ena_admin_phc_resp *read_resp = ena_dev->phc.virt_addr; + const ktime_t zero_system_time = ktime_set(0, 0); struct ena_com_phc_info *phc = &ena_dev->phc; - ktime_t initial_time = ktime_set(0, 0); - static ktime_t start_time; - unsigned long flags = 0; ktime_t expire_time; ktime_t block_time; + unsigned long flags = 0; int ret = 0; if (!phc->active) { @@ -1864,9 +1880,10 @@ int ena_com_phc_get(struct ena_com_dev *ena_dev, u64 *timestamp) spin_lock_irqsave(&phc->lock, flags); /* Check if PHC is in blocked state */ - if (unlikely(ktime_compare(start_time, initial_time))) { + if (unlikely(ktime_compare(phc->system_time, zero_system_time))) { /* Check if blocking time expired */ - block_time = ktime_add_us(start_time, phc->block_timeout_usec); + block_time = + ktime_add_us(phc->system_time, phc->block_timeout_usec); if (!ktime_after(ktime_get(), block_time)) { /* PHC is still in blocked state, skip PHC request */ phc->stats.phc_skp++; @@ -1874,9 +1891,9 @@ int ena_com_phc_get(struct ena_com_dev *ena_dev, u64 *timestamp) goto skip; } - /* PHC is in active state, update statistics according to req_id and timestamp */ + /* PHC is in active state, update statistics according to req_id and error_flags */ if ((READ_ONCE(read_resp->req_id) != phc->req_id) || - (read_resp->timestamp == ENA_PHC_TIMESTAMP_ERROR)) { + (read_resp->error_flags & ENA_PHC_ERROR_FLAGS)) { /* Device didn't update req_id during blocking time or timestamp is invalid, * this indicates on a device error */ @@ -1888,9 +1905,9 @@ int ena_com_phc_get(struct ena_com_dev *ena_dev, u64 *timestamp) } /* Setting relative timeouts */ - start_time = ktime_get(); - block_time = ktime_add_us(start_time, phc->block_timeout_usec); - expire_time = ktime_add_us(start_time, phc->expire_timeout_usec); + phc->system_time = ktime_get(); + block_time = ktime_add_us(phc->system_time, phc->block_timeout_usec); + expire_time = ktime_add_us(phc->system_time, phc->expire_timeout_usec); /* We expect the device to return this req_id once the new PHC timestamp is updated */ phc->req_id++; @@ -1907,35 +1924,46 @@ int ena_com_phc_get(struct ena_com_dev *ena_dev, u64 *timestamp) while (1) { if (unlikely(ktime_after(ktime_get(), expire_time))) { /* Gave up waiting for updated req_id, PHC enters into - * blocked state until passing blocking time + * blocked state until passing blocking time, during + * this time any get PHC timestamp or error bound + * requests will fail with device busy error */ + phc->error_bound = ENA_PHC_MAX_ERROR_BOUND; ret = -EBUSY; break; } /* Check if req_id was updated by the device */ if (READ_ONCE(read_resp->req_id) != phc->req_id) { - /* req_id was not updated by the device, check again on next loop */ + /* req_id was not updated by the device yet, check again on next loop */ continue; } - /* req_id was updated which indicates that PHC timestamp was updated too */ - *timestamp = read_resp->timestamp; - - /* PHC timestamp validty check */ - if (unlikely(*timestamp == ENA_PHC_TIMESTAMP_ERROR)) { - /* Retrieved invalid PHC timestamp, PHC enters into - * blocked state until passing blocking time + /* req_id was updated by the device which indicates that PHC timestamp, error_bound + * and error_flags are updated too, checking errors before retrieving timestamp and + * error_bound values + */ + if (unlikely(read_resp->error_flags & ENA_PHC_ERROR_FLAGS)) { + /* Retrieved timestamp or 
error bound errors, PHC enters into blocked state + * until passing blocking time, during this time any get PHC timestamp or + * error bound requests will fail with device busy error */ + phc->error_bound = ENA_PHC_MAX_ERROR_BOUND; ret = -EBUSY; break; } - /* Retrieved valid PHC timestamp */ + /* PHC timestamp value is returned to the caller */ + *timestamp = read_resp->timestamp; + + /* Error bound value is cached for future retrieval by caller */ + phc->error_bound = read_resp->error_bound; + + /* Update statistic on valid PHC timestamp retrieval */ phc->stats.phc_cnt++; /* This indicates PHC state is active */ - start_time = initial_time; + phc->system_time = zero_system_time; break; } @@ -1945,6 +1973,25 @@ int ena_com_phc_get(struct ena_com_dev *ena_dev, u64 *timestamp) return ret; } +int ena_com_phc_get_error_bound(struct ena_com_dev *ena_dev, u32 *error_bound) +{ + struct ena_com_phc_info *phc = &ena_dev->phc; + u32 local_error_bound = phc->error_bound; + + if (!phc->active) { + netdev_err(ena_dev->net_device, + "PHC feature is not active in the device\n"); + return -EOPNOTSUPP; + } + + if (local_error_bound == ENA_PHC_MAX_ERROR_BOUND) + return -EBUSY; + + *error_bound = local_error_bound; + + return 0; +} + int ena_com_mmio_reg_read_request_init(struct ena_com_dev *ena_dev) { struct ena_com_mmio_read *mmio_read = &ena_dev->mmio_read; @@ -2034,15 +2081,15 @@ int ena_com_admin_init(struct ena_com_dev *ena_dev, spin_lock_init(&admin_queue->q_lock); ret = ena_com_init_comp_ctxt(admin_queue); - if (ret) + if (unlikely(ret)) goto error; ret = ena_com_admin_init_sq(admin_queue); - if (ret) + if (unlikely(ret)) goto error; ret = ena_com_admin_init_cq(admin_queue); - if (ret) + if (unlikely(ret)) goto error; admin_queue->sq.db_addr = (u32 __iomem *)((uintptr_t)ena_dev->reg_bar + @@ -2075,7 +2122,7 @@ int ena_com_admin_init(struct ena_com_dev *ena_dev, writel(aq_caps, ena_dev->reg_bar + ENA_REGS_AQ_CAPS_OFF); writel(acq_caps, ena_dev->reg_bar + ENA_REGS_ACQ_CAPS_OFF); ret = ena_com_admin_init_aenq(ena_dev, aenq_handlers); - if (ret) + if (unlikely(ret)) goto error; admin_queue->ena_dev = ena_dev; @@ -2095,7 +2142,7 @@ int ena_com_create_io_queue(struct ena_com_dev *ena_dev, struct ena_com_io_cq *io_cq; int ret; - if (ctx->qid >= ENA_TOTAL_NUM_QUEUES) { + if (unlikely(ctx->qid >= ENA_TOTAL_NUM_QUEUES)) { netdev_err(ena_dev->net_device, "Qid (%d) is bigger than max num of queues (%d)\n", ctx->qid, ENA_TOTAL_NUM_QUEUES); @@ -2127,18 +2174,18 @@ int ena_com_create_io_queue(struct ena_com_dev *ena_dev, min_t(u32, ena_dev->tx_max_header_size, SZ_256); ret = ena_com_init_io_sq(ena_dev, ctx, io_sq); - if (ret) + if (unlikely(ret)) goto error; ret = ena_com_init_io_cq(ena_dev, ctx, io_cq); - if (ret) + if (unlikely(ret)) goto error; ret = ena_com_create_io_cq(ena_dev, io_cq); - if (ret) + if (unlikely(ret)) goto error; ret = ena_com_create_io_sq(ena_dev, io_sq, io_cq->idx); - if (ret) + if (unlikely(ret)) goto destroy_io_cq; return 0; @@ -2155,7 +2202,7 @@ void ena_com_destroy_io_queue(struct ena_com_dev *ena_dev, u16 qid) struct ena_com_io_sq *io_sq; struct ena_com_io_cq *io_cq; - if (qid >= ENA_TOTAL_NUM_QUEUES) { + if (unlikely(qid >= ENA_TOTAL_NUM_QUEUES)) { netdev_err(ena_dev->net_device, "Qid (%d) is bigger than max num of queues (%d)\n", qid, ENA_TOTAL_NUM_QUEUES); @@ -2303,7 +2350,8 @@ int ena_com_get_dev_attr_feat(struct ena_com_dev *ena_dev, else return rc; - rc = ena_com_get_feature(ena_dev, &get_resp, ENA_ADMIN_LLQ, 0); + rc = ena_com_get_feature(ena_dev, &get_resp, + ENA_ADMIN_LLQ, 
ENA_ADMIN_LLQ_FEATURE_VERSION_1); if (!rc) memcpy(&get_feat_ctx->llq, &get_resp.u.llq, sizeof(get_resp.u.llq)); @@ -2406,6 +2454,7 @@ void ena_com_aenq_intr_handler(struct ena_com_dev *ena_dev, void *data) int ena_com_dev_reset(struct ena_com_dev *ena_dev, enum ena_regs_reset_reason_types reset_reason) { + u32 reset_reason_msb, reset_reason_lsb; u32 stat, timeout, cap, reset_val; int rc; @@ -2433,8 +2482,28 @@ int ena_com_dev_reset(struct ena_com_dev *ena_dev, /* start reset */ reset_val = ENA_REGS_DEV_CTL_DEV_RESET_MASK; - reset_val |= (reset_reason << ENA_REGS_DEV_CTL_RESET_REASON_SHIFT) & - ENA_REGS_DEV_CTL_RESET_REASON_MASK; + + /* For backward compatibility, device will interpret + * bits 24-27 as MSB, bits 28-31 as LSB + */ + reset_reason_lsb = ENA_FIELD_GET(reset_reason, ENA_RESET_REASON_LSB_MASK, + ENA_RESET_REASON_LSB_OFFSET); + + reset_reason_msb = ENA_FIELD_GET(reset_reason, ENA_RESET_REASON_MSB_MASK, + ENA_RESET_REASON_MSB_OFFSET); + + reset_val |= reset_reason_lsb << ENA_REGS_DEV_CTL_RESET_REASON_SHIFT; + + if (ena_com_get_cap(ena_dev, ENA_ADMIN_EXTENDED_RESET_REASONS)) + reset_val |= reset_reason_msb << ENA_REGS_DEV_CTL_RESET_REASON_EXT_SHIFT; + else if (reset_reason_msb) { + /* In case the device does not support intended + * extended reset reason fallback to generic + */ + reset_val = ENA_REGS_DEV_CTL_DEV_RESET_MASK; + reset_val |= (ENA_REGS_RESET_GENERIC << ENA_REGS_DEV_CTL_RESET_REASON_SHIFT) & + ENA_REGS_DEV_CTL_RESET_REASON_MASK; + } writel(reset_val, ena_dev->reg_bar + ENA_REGS_DEV_CTL_OFF); /* Write again the MMIO read request address */ @@ -2442,7 +2511,7 @@ int ena_com_dev_reset(struct ena_com_dev *ena_dev, rc = wait_for_reset_state(ena_dev, timeout, ENA_REGS_DEV_STS_RESET_IN_PROGRESS_MASK); - if (rc != 0) { + if (unlikely(rc)) { netdev_err(ena_dev->net_device, "Reset indication didn't turn on\n"); return rc; @@ -2451,7 +2520,7 @@ int ena_com_dev_reset(struct ena_com_dev *ena_dev, /* reset done */ writel(0, ena_dev->reg_bar + ENA_REGS_DEV_CTL_OFF); rc = wait_for_reset_state(ena_dev, timeout, 0); - if (rc != 0) { + if (unlikely(rc)) { netdev_err(ena_dev->net_device, "Reset indication didn't turn off\n"); return rc; @@ -3136,7 +3205,7 @@ int ena_com_allocate_customer_metrics_buffer(struct ena_com_dev *ena_dev) customer_metrics->buffer_len, &customer_metrics->buffer_dma_addr, GFP_KERNEL); - if (!customer_metrics->buffer_virt_addr) + if (unlikely(!customer_metrics->buffer_virt_addr)) return -ENOMEM; return 0; @@ -3327,7 +3396,7 @@ int ena_com_config_dev_mode(struct ena_com_dev *ena_dev, } rc = ena_com_config_llq_info(ena_dev, llq_features, llq_default_cfg); - if (rc) + if (unlikely(rc)) return rc; ena_dev->tx_max_header_size = llq_info->desc_list_entry_size - diff --git a/drivers/amazon/net/ena/ena_com.h b/drivers/amazon/net/ena/ena_com.h index f44e59176e459..00776c433f7cf 100644 --- a/drivers/amazon/net/ena/ena_com.h +++ b/drivers/amazon/net/ena/ena_com.h @@ -42,6 +42,14 @@ #define ADMIN_CQ_SIZE(depth) ((depth) * sizeof(struct ena_admin_acq_entry)) #define ADMIN_AENQ_SIZE(depth) ((depth) * sizeof(struct ena_admin_aenq_entry)) +/* Macros used to extract LSB/MSB from the + * enums defining the reset reasons + */ +#define ENA_RESET_REASON_LSB_OFFSET 0 +#define ENA_RESET_REASON_LSB_MASK 0xf +#define ENA_RESET_REASON_MSB_OFFSET 4 +#define ENA_RESET_REASON_MSB_MASK 0xf0 + #define ENA_CUSTOMER_METRICS_BUFFER_SIZE 512 /*****************************************************************************/ @@ -280,6 +288,9 @@ struct ena_com_phc_info { /* PHC shared memory - virtual 
address */ struct ena_admin_phc_resp *virt_addr; + /* System time of last PHC request */ + ktime_t system_time; + /* Spin lock to ensure a single outstanding PHC read */ spinlock_t lock; @@ -299,16 +310,17 @@ struct ena_com_phc_info { */ u32 block_timeout_usec; + /* PHC shared memory - physical address */ + dma_addr_t phys_addr; + + /* Cached error bound per timestamp sample */ + u32 error_bound; + /* Request id sent to the device */ u16 req_id; /* True if PHC is active in the device */ bool active; - - /* PHC shared memory - memory handle */ - - /* PHC shared memory - physical address */ - dma_addr_t phys_addr; }; struct ena_rss { @@ -464,12 +476,19 @@ int ena_com_phc_config(struct ena_com_dev *ena_dev); */ void ena_com_phc_destroy(struct ena_com_dev *ena_dev); -/* ena_com_phc_get - Retrieve PHC timestamp +/* ena_com_phc_get_timestamp - Retrieve PHC timestamp + * @ena_dev: ENA communication layer struct + * @timestamp: Retrieved PHC timestamp + * @return - 0 on success, negative value on failure + */ +int ena_com_phc_get_timestamp(struct ena_com_dev *ena_dev, u64 *timestamp); + +/* ena_com_phc_get_error_bound - Retrieve cached PHC error bound * @ena_dev: ENA communication layer struct - * @timestamp: Retrieve PHC timestamp + * @error_bound: Cached PHC error bound * @return - 0 on success, negative value on failure */ -int ena_com_phc_get(struct ena_com_dev *ena_dev, u64 *timestamp); +int ena_com_phc_get_error_bound(struct ena_com_dev *ena_dev, u32 *error_bound); /* ena_com_set_mmio_read_mode - Enable/disable the indirect mmio reg read mechanism * @ena_dev: ENA communication layer struct diff --git a/drivers/amazon/net/ena/ena_devlink.c b/drivers/amazon/net/ena/ena_devlink.c deleted file mode 100644 index 43ce1ae2cebaa..0000000000000 --- a/drivers/amazon/net/ena/ena_devlink.c +++ /dev/null @@ -1,380 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB -/* - * Copyright 2015-2023 Amazon.com, Inc. or its affiliates. All rights reserved. 
- */ - -#include "linux/pci.h" - -#include "ena_devlink.h" -#ifdef ENA_DEVLINK_SUPPORT -#ifdef ENA_PHC_SUPPORT -#include "ena_phc.h" - -static int ena_devlink_phc_enable_validate(struct devlink *devlink, u32 id, - union devlink_param_value val, - struct netlink_ext_ack *extack); -#endif /* ENA_PHC_SUPPORT */ - -static int ena_devlink_llq_header_validate(struct devlink *devlink, u32 id, - union devlink_param_value val, - struct netlink_ext_ack *extack); - -enum ena_devlink_param_id { - ENA_DEVLINK_PARAM_ID_BASE = DEVLINK_PARAM_GENERIC_ID_MAX, - ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE, -#ifdef ENA_PHC_SUPPORT - ENA_DEVLINK_PARAM_ID_PHC_ENABLE, -#endif /* ENA_PHC_SUPPORT */ -}; - -static const struct devlink_param ena_devlink_params[] = { - DEVLINK_PARAM_DRIVER(ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE, - "large_llq_header", DEVLINK_PARAM_TYPE_BOOL, - BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), - NULL, NULL, ena_devlink_llq_header_validate), -#ifdef ENA_PHC_SUPPORT - DEVLINK_PARAM_DRIVER(ENA_DEVLINK_PARAM_ID_PHC_ENABLE, - "phc_enable", DEVLINK_PARAM_TYPE_BOOL, - BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), - NULL, NULL, ena_devlink_phc_enable_validate), - #endif /* ENA_PHC_SUPPORT */ -}; - -static int ena_devlink_llq_header_validate(struct devlink *devlink, u32 id, - union devlink_param_value val, - struct netlink_ext_ack *extack) -{ - struct ena_adapter *adapter = ENA_DEVLINK_PRIV(devlink); - bool value = val.vbool; - - if (!value) - return 0; - - if (adapter->ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST) { - NL_SET_ERR_MSG_MOD(extack, "Instance doesn't support LLQ"); - return -EOPNOTSUPP; - } - - if (!adapter->large_llq_header_supported) { - NL_SET_ERR_MSG_MOD(extack, "Instance doesn't support large LLQ"); - return -EOPNOTSUPP; - } - - return 0; -} - -#ifdef ENA_PHC_SUPPORT -static int ena_devlink_phc_enable_validate(struct devlink *devlink, u32 id, - union devlink_param_value val, - struct netlink_ext_ack *extack) -{ - struct ena_adapter *adapter = ENA_DEVLINK_PRIV(devlink); - - if (!val.vbool) - return 0; - - if (!ena_com_phc_supported(adapter->ena_dev)) { - NL_SET_ERR_MSG_MOD(extack, "Device doesn't support PHC"); - return -EOPNOTSUPP; - } - - return 0; -} - -#endif /* ENA_PHC_SUPPORT */ -#ifdef ENA_DEVLINK_CONFIGURE_AFTER_REGISTER -/* Determines if ena_devlink_register has been called. - * Prefer to check if the driver enabled reloading capabilities, but fallback - * to check if driver configured 'dev' devlink attribute for older kernels. 
- */ -bool ena_is_devlink_params_registered(struct devlink *devlink) -{ -#if defined(ENA_DEVLINK_RELOAD_ENABLING_REQUIRED) - return devlink->reload_enabled; -#elif !defined(ENA_DEVLINK_RECEIVES_DEVICE_ON_ALLOC) - return devlink->dev; -#endif -} - -#endif -void ena_devlink_params_get(struct devlink *devlink) -{ - struct ena_adapter *adapter = ENA_DEVLINK_PRIV(devlink); - union devlink_param_value val; - int err; - -#ifdef ENA_DEVLINK_CONFIGURE_AFTER_REGISTER - /* If devlink params aren't registered, don't access them */ - if (!ena_is_devlink_params_registered(devlink)) - return; -#endif - err = devl_param_driverinit_value_get(devlink, - ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE, - &val); - if (err) { - netdev_err(adapter->netdev, "Failed to query LLQ header size param\n"); - return; - } - - adapter->large_llq_header_enabled = val.vbool; -#ifdef ENA_PHC_SUPPORT - - err = devl_param_driverinit_value_get(devlink, ENA_DEVLINK_PARAM_ID_PHC_ENABLE, &val); - if (err) { - netdev_err(adapter->netdev, "Failed to query PHC param\n"); - return; - } - - ena_phc_enable(adapter, val.vbool); -#endif /* ENA_PHC_SUPPORT */ -} - -void ena_devlink_disable_large_llq_header_param(struct devlink *devlink) -{ - union devlink_param_value value; - -#ifdef ENA_DEVLINK_CONFIGURE_AFTER_REGISTER - /* If devlink params aren't registered, don't access them */ - if (!ena_is_devlink_params_registered(devlink)) - return; - -#endif - value.vbool = false; - devl_param_driverinit_value_set(devlink, - ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE, - value); -} - -#ifdef ENA_PHC_SUPPORT -void ena_devlink_disable_phc_param(struct devlink *devlink) -{ - union devlink_param_value value; - -#ifdef ENA_DEVLINK_CONFIGURE_AFTER_REGISTER - /* If devlink params aren't registered, don't access them */ - if (!ena_is_devlink_params_registered(devlink)) - return; - -#endif - value.vbool = false; - devl_param_driverinit_value_set(devlink, ENA_DEVLINK_PARAM_ID_PHC_ENABLE, value); -} - -#endif /* ENA_PHC_SUPPORT */ -static int ena_devlink_reload_down(struct devlink *devlink, -#ifdef ENA_DEVLINK_RELOAD_NS_CHANGE_SUPPORT - bool netns_change, -#endif -#ifdef ENA_DEVLINK_RELOAD_LIMIT_AND_ACTION_SUPPORT - enum devlink_reload_action action, - enum devlink_reload_limit limit, -#endif - struct netlink_ext_ack *extack) -{ - struct ena_adapter *adapter = ENA_DEVLINK_PRIV(devlink); - -#ifdef ENA_DEVLINK_RELOAD_NS_CHANGE_SUPPORT - if (netns_change) { - NL_SET_ERR_MSG_MOD(extack, "Namespace change is not supported"); - return -EOPNOTSUPP; - } - -#endif -#ifdef ENA_DEVLINK_RELOAD_LIMIT_AND_ACTION_SUPPORT - if (action != DEVLINK_RELOAD_ACTION_DRIVER_REINIT) { - NL_SET_ERR_MSG_MOD(extack, "Action is not supported"); - return -EOPNOTSUPP; - } - - if (limit != DEVLINK_RELOAD_LIMIT_UNSPEC) { - NL_SET_ERR_MSG_MOD(extack, "Driver reload doesn't support limitations"); - return -EOPNOTSUPP; - } - -#endif - rtnl_lock(); - ena_destroy_device(adapter, false); - rtnl_unlock(); - - return 0; -} - -static int ena_devlink_reload_up(struct devlink *devlink, -#ifdef ENA_DEVLINK_RELOAD_LIMIT_AND_ACTION_SUPPORT - enum devlink_reload_action action, - enum devlink_reload_limit limit, - u32 *actions_performed, -#endif - struct netlink_ext_ack *extack) -{ - struct ena_adapter *adapter = ENA_DEVLINK_PRIV(devlink); - int err = 0; - -#ifdef ENA_DEVLINK_RELOAD_LIMIT_AND_ACTION_SUPPORT - if (action != DEVLINK_RELOAD_ACTION_DRIVER_REINIT) { - NL_SET_ERR_MSG_MOD(extack, "Action is not supported"); - return -EOPNOTSUPP; - } - - if (limit != DEVLINK_RELOAD_LIMIT_UNSPEC) { - NL_SET_ERR_MSG_MOD(extack, 
"Driver reload doesn't support limitations"); - return -EOPNOTSUPP; - } - -#endif - rtnl_lock(); - /* Check that no other routine initialized the device (e.g. - * ena_fw_reset_device()). Also we're under devlink_mutex here, - * so devlink isn't freed under our feet. - */ - if (!test_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags)) - err = ena_restore_device(adapter); - - rtnl_unlock(); - -#ifdef ENA_DEVLINK_RELOAD_LIMIT_AND_ACTION_SUPPORT - if (!err) - *actions_performed = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT); - -#endif - return err; -} -#ifndef ENA_DEVLINK_RELOAD_UP_DOWN_SUPPORTED - -static int ena_devlink_reload(struct devlink *devlink, struct netlink_ext_ack *extack) -{ - /* This function always succeeds when called from this function */ - ena_devlink_reload_down(devlink, extack); - - return ena_devlink_reload_up(devlink, extack); -} - -#endif - -static const struct devlink_ops ena_devlink_ops = { -#ifdef ENA_DEVLINK_RELOAD_LIMIT_AND_ACTION_SUPPORT - .reload_actions = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT), -#endif -#ifdef ENA_DEVLINK_RELOAD_UP_DOWN_SUPPORTED - .reload_down = ena_devlink_reload_down, - .reload_up = ena_devlink_reload_up, -#else - .reload = ena_devlink_reload, -#endif -}; - -static int ena_devlink_configure_params(struct devlink *devlink) -{ - struct ena_adapter *adapter = ENA_DEVLINK_PRIV(devlink); - union devlink_param_value value; - int rc; - - rc = devlink_params_register(devlink, ena_devlink_params, - ARRAY_SIZE(ena_devlink_params)); - if (rc) { - netdev_err(adapter->netdev, "Failed to register devlink params\n"); - return rc; - } - - value.vbool = adapter->large_llq_header_enabled; - devl_param_driverinit_value_set(devlink, - ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE, - value); - -#ifdef ENA_PHC_SUPPORT - value.vbool = ena_phc_is_enabled(adapter); - devl_param_driverinit_value_set(devlink, ENA_DEVLINK_PARAM_ID_PHC_ENABLE, value); - -#endif /* ENA_PHC_SUPPORT */ -#ifdef ENA_DEVLINK_RELOAD_SUPPORT_ADVERTISEMENT_NEEDED - devlink_set_features(devlink, DEVLINK_F_RELOAD); - -#endif -#ifdef ENA_DEVLINK_PUBLISH_REQUIRED - devlink_params_publish(devlink); - -#endif -#ifdef ENA_DEVLINK_RELOAD_ENABLING_REQUIRED - devlink_reload_enable(devlink); - -#endif - return 0; -} - -struct devlink *ena_devlink_alloc(struct ena_adapter *adapter) -{ -#ifdef ENA_DEVLINK_RECEIVES_DEVICE_ON_ALLOC - struct device *dev = &adapter->pdev->dev; -#endif - struct devlink *devlink; - -#ifdef ENA_DEVLINK_RECEIVES_DEVICE_ON_ALLOC - devlink = devlink_alloc(&ena_devlink_ops, sizeof(struct ena_adapter *), dev); -#else - devlink = devlink_alloc(&ena_devlink_ops, sizeof(struct ena_adapter *)); -#endif - if (!devlink) { - netdev_err(adapter->netdev, "Failed to allocate devlink struct\n"); - return NULL; - } - - ENA_DEVLINK_PRIV(devlink) = adapter; - adapter->devlink = devlink; - -#ifndef ENA_DEVLINK_CONFIGURE_AFTER_REGISTER - if (ena_devlink_configure_params(devlink)) - goto free_devlink; - - return devlink; -free_devlink: - devlink_free(devlink); - - return NULL; -#else - return devlink; -#endif -} - -static void ena_devlink_configure_params_clean(struct devlink *devlink) -{ -#ifdef ENA_DEVLINK_RELOAD_ENABLING_REQUIRED - devlink_reload_disable(devlink); - -#endif -#ifdef ENA_DEVLINK_PUBLISH_REQUIRED - devlink_params_unpublish(devlink); - -#endif - devlink_params_unregister(devlink, ena_devlink_params, - ARRAY_SIZE(ena_devlink_params)); -} - -void ena_devlink_free(struct devlink *devlink) -{ -#ifndef ENA_DEVLINK_CONFIGURE_AFTER_REGISTER - ena_devlink_configure_params_clean(devlink); - -#endif - 
devlink_free(devlink); -} - -void ena_devlink_register(struct devlink *devlink, struct device *dev) -{ -#ifdef ENA_DEVLINK_RECEIVES_DEVICE_ON_ALLOC - devlink_register(devlink); -#else - devlink_register(devlink, dev); -#endif -#ifdef ENA_DEVLINK_CONFIGURE_AFTER_REGISTER - ena_devlink_configure_params(devlink); -#endif -} - -void ena_devlink_unregister(struct devlink *devlink) -{ -#ifdef ENA_DEVLINK_CONFIGURE_AFTER_REGISTER - ena_devlink_configure_params_clean(devlink); -#endif - devlink_unregister(devlink); -} -#endif /* ENA_DEVLINK_SUPPORT */ diff --git a/drivers/amazon/net/ena/ena_devlink.h b/drivers/amazon/net/ena/ena_devlink.h deleted file mode 100644 index 85c05cba00bd1..0000000000000 --- a/drivers/amazon/net/ena/ena_devlink.h +++ /dev/null @@ -1,44 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ -/* - * Copyright 2015-2023 Amazon.com, Inc. or its affiliates. All rights reserved. - */ - -#ifndef DEVLINK_H -#define DEVLINK_H - -#include "ena_netdev.h" -#ifndef ENA_NO_DEVLINK_HEADERS -#include -#endif - -#ifdef ENA_DEVLINK_SUPPORT -#define ENA_DEVLINK_PRIV(devlink) \ - (*(struct ena_adapter **)devlink_priv(devlink)) - -struct devlink *ena_devlink_alloc(struct ena_adapter *adapter); -void ena_devlink_free(struct devlink *devlink); -void ena_devlink_register(struct devlink *devlink, struct device *dev); -void ena_devlink_unregister(struct devlink *devlink); -void ena_devlink_params_get(struct devlink *devlink); -void ena_devlink_disable_large_llq_header_param(struct devlink *devlink); -void ena_devlink_disable_phc_param(struct devlink *devlink); - -#else /* ENA_DEVLINK_SUPPORT */ -#ifdef ENA_NO_DEVLINK_HEADERS -struct devlink {}; -#endif - -/* Return a value of 1 so the caller wouldn't think the function failed (returned NULL) */ -static inline struct devlink *ena_devlink_alloc(struct ena_adapter *adapter) -{ - return (struct devlink *)1; -} -static inline void ena_devlink_free(struct devlink *devlink) { } -static inline void ena_devlink_register(struct devlink *devlink, struct device *dev) { }; -static inline void ena_devlink_unregister(struct devlink *devlink) { } -static inline void ena_devlink_params_get(struct devlink *devlink) { } -static inline void ena_devlink_disable_large_llq_header_param(struct devlink *devlink) { } -static inline void ena_devlink_disable_phc_param(struct devlink *devlink) { } - -#endif /* ENA_DEVLINK_SUPPORT */ -#endif /* DEVLINK_H */ diff --git a/drivers/amazon/net/ena/ena_eth_com.c b/drivers/amazon/net/ena/ena_eth_com.c index 50afe66efb57a..a3bbd70983476 100644 --- a/drivers/amazon/net/ena/ena_eth_com.c +++ b/drivers/amazon/net/ena/ena_eth_com.c @@ -237,6 +237,7 @@ static int ena_com_cdesc_rx_pkt_get(struct ena_com_io_cq *io_cq, u16 *first_cdesc_idx, u16 *num_descs) { + struct ena_com_dev *dev = ena_com_io_cq_to_ena_dev(io_cq); u16 count = io_cq->cur_rx_pkt_cdesc_count, head_masked; struct ena_eth_io_rx_cdesc_base *cdesc; u32 last = 0; @@ -251,14 +252,23 @@ static int ena_com_cdesc_rx_pkt_get(struct ena_com_io_cq *io_cq, ena_com_cq_inc_head(io_cq); if (unlikely((status & ENA_ETH_IO_RX_CDESC_BASE_FIRST_MASK) >> - ENA_ETH_IO_RX_CDESC_BASE_FIRST_SHIFT && count != 0)) { - struct ena_com_dev *dev = ena_com_io_cq_to_ena_dev(io_cq); - + ENA_ETH_IO_RX_CDESC_BASE_FIRST_SHIFT && + count != 0)) { netdev_err(dev->net_device, "First bit is on in descriptor #%d on q_id: %d, req_id: %u\n", count, io_cq->qid, cdesc->req_id); return -EFAULT; } + + if (unlikely((status & (ENA_ETH_IO_RX_CDESC_BASE_MBZ7_MASK | + ENA_ETH_IO_RX_CDESC_BASE_MBZ17_MASK)) && + 
ena_com_get_cap(dev, ENA_ADMIN_CDESC_MBZ))) { + netdev_err(dev->net_device, + "Corrupted RX descriptor #%d on q_id: %d, req_id: %u\n", + count, io_cq->qid, cdesc->req_id); + return -EFAULT; + } + count++; last = (status & ENA_ETH_IO_RX_CDESC_BASE_LAST_MASK) >> ENA_ETH_IO_RX_CDESC_BASE_LAST_SHIFT; @@ -441,7 +451,7 @@ int ena_com_prepare_tx(struct ena_com_io_sq *io_sq, /* If the caller doesn't want to send packets */ if (unlikely(!num_bufs && !header_len)) { rc = ena_com_close_bounce_buffer(io_sq); - if (rc) + if (unlikely(rc)) netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, "Failed to write buffers to LLQ\n"); *nb_hw_desc = io_sq->tail - start_tail; @@ -624,9 +634,8 @@ int ena_com_add_single_rx_desc(struct ena_com_io_sq *io_sq, if (unlikely(!ena_com_sq_have_enough_space(io_sq, 1))) return -ENOSPC; - desc = get_sq_desc(io_sq); - if (unlikely(!desc)) - return -EFAULT; + /* virt_addr allocation success is checked before calling this function */ + desc = get_sq_desc_regular_queue(io_sq); memset(desc, 0x0, sizeof(struct ena_eth_io_rx_desc)); diff --git a/drivers/amazon/net/ena/ena_eth_com.h b/drivers/amazon/net/ena/ena_eth_com.h index 028270a069d86..6768905d44bd2 100644 --- a/drivers/amazon/net/ena/ena_eth_com.h +++ b/drivers/amazon/net/ena/ena_eth_com.h @@ -8,6 +8,11 @@ #include "ena_com.h" +/* we allow 2 DMA descriptors per LLQ entry */ +#define ENA_LLQ_ENTRY_DESC_CHUNK_SIZE (2 * sizeof(struct ena_eth_io_tx_desc)) +#define ENA_LLQ_HEADER (128UL - ENA_LLQ_ENTRY_DESC_CHUNK_SIZE) +#define ENA_LLQ_LARGE_HEADER (256UL - ENA_LLQ_ENTRY_DESC_CHUNK_SIZE) + struct ena_com_tx_ctx { struct ena_com_tx_meta ena_meta; struct ena_com_buf *ena_bufs; @@ -196,9 +201,11 @@ static inline void ena_com_cq_inc_head(struct ena_com_io_cq *io_cq) static inline int ena_com_tx_comp_req_id_get(struct ena_com_io_cq *io_cq, u16 *req_id) { + struct ena_com_dev *dev = ena_com_io_cq_to_ena_dev(io_cq); u8 expected_phase, cdesc_phase; struct ena_eth_io_tx_cdesc *cdesc; u16 masked_head; + u8 flags; masked_head = io_cq->head & (io_cq->q_depth - 1); expected_phase = io_cq->phase; @@ -207,14 +214,24 @@ static inline int ena_com_tx_comp_req_id_get(struct ena_com_io_cq *io_cq, ((uintptr_t)io_cq->cdesc_addr.virt_addr + (masked_head * io_cq->cdesc_entry_size_in_bytes)); + flags = READ_ONCE(cdesc->flags); + /* When the current completion descriptor phase isn't the same as the * expected, it mean that the device still didn't update * this completion. 
*/ - cdesc_phase = READ_ONCE(cdesc->flags) & ENA_ETH_IO_TX_CDESC_PHASE_MASK; + cdesc_phase = flags & ENA_ETH_IO_TX_CDESC_PHASE_MASK; if (cdesc_phase != expected_phase) return -EAGAIN; + if (unlikely((flags & ENA_ETH_IO_TX_CDESC_MBZ6_MASK) && + ena_com_get_cap(dev, ENA_ADMIN_CDESC_MBZ))) { + netdev_err(dev->net_device, + "Corrupted TX descriptor on q_id: %d, req_id: %u\n", + io_cq->qid, cdesc->req_id); + return -EFAULT; + } + dma_rmb(); *req_id = READ_ONCE(cdesc->req_id); diff --git a/drivers/amazon/net/ena/ena_eth_io_defs.h b/drivers/amazon/net/ena/ena_eth_io_defs.h index a4d6d0ee0193c..35b59ee9b0134 100644 --- a/drivers/amazon/net/ena/ena_eth_io_defs.h +++ b/drivers/amazon/net/ena/ena_eth_io_defs.h @@ -152,7 +152,8 @@ struct ena_eth_io_tx_cdesc { /* flags * 0 : phase - * 7:1 : reserved1 + * 5:1 : reserved1 + * 7:6 : mbz6 - MBZ */ u8 flags; @@ -198,7 +199,7 @@ struct ena_eth_io_rx_desc { struct ena_eth_io_rx_cdesc_base { /* 4:0 : l3_proto_idx * 6:5 : src_vlan_cnt - * 7 : reserved7 - MBZ + * 7 : mbz7 - MBZ * 12:8 : l4_proto_idx * 13 : l3_csum_err - when set, either the L3 * checksum error detected, or, the controller didn't @@ -214,7 +215,8 @@ struct ena_eth_io_rx_cdesc_base { * 16 : l4_csum_checked - L4 checksum was verified * (could be OK or error), when cleared the status of * checksum is unknown - * 23:17 : reserved17 - MBZ + * 17 : mbz17 - MBZ + * 23:18 : reserved18 * 24 : phase * 25 : l3_csum2 - second checksum engine result * 26 : first - Indicates first descriptor in @@ -341,6 +343,8 @@ struct ena_eth_io_numa_node_cfg_reg { /* tx_cdesc */ #define ENA_ETH_IO_TX_CDESC_PHASE_MASK BIT(0) +#define ENA_ETH_IO_TX_CDESC_MBZ6_SHIFT 6 +#define ENA_ETH_IO_TX_CDESC_MBZ6_MASK GENMASK(7, 6) /* rx_desc */ #define ENA_ETH_IO_RX_DESC_PHASE_MASK BIT(0) @@ -355,6 +359,8 @@ struct ena_eth_io_numa_node_cfg_reg { #define ENA_ETH_IO_RX_CDESC_BASE_L3_PROTO_IDX_MASK GENMASK(4, 0) #define ENA_ETH_IO_RX_CDESC_BASE_SRC_VLAN_CNT_SHIFT 5 #define ENA_ETH_IO_RX_CDESC_BASE_SRC_VLAN_CNT_MASK GENMASK(6, 5) +#define ENA_ETH_IO_RX_CDESC_BASE_MBZ7_SHIFT 7 +#define ENA_ETH_IO_RX_CDESC_BASE_MBZ7_MASK BIT(7) #define ENA_ETH_IO_RX_CDESC_BASE_L4_PROTO_IDX_SHIFT 8 #define ENA_ETH_IO_RX_CDESC_BASE_L4_PROTO_IDX_MASK GENMASK(12, 8) #define ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM_ERR_SHIFT 13 @@ -365,6 +371,8 @@ struct ena_eth_io_numa_node_cfg_reg { #define ENA_ETH_IO_RX_CDESC_BASE_IPV4_FRAG_MASK BIT(15) #define ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_CHECKED_SHIFT 16 #define ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_CHECKED_MASK BIT(16) +#define ENA_ETH_IO_RX_CDESC_BASE_MBZ17_SHIFT 17 +#define ENA_ETH_IO_RX_CDESC_BASE_MBZ17_MASK BIT(17) #define ENA_ETH_IO_RX_CDESC_BASE_PHASE_SHIFT 24 #define ENA_ETH_IO_RX_CDESC_BASE_PHASE_MASK BIT(24) #define ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM2_SHIFT 25 diff --git a/drivers/amazon/net/ena/ena_ethtool.c b/drivers/amazon/net/ena/ena_ethtool.c index f09801591d840..ada1b9b0c4eef 100644 --- a/drivers/amazon/net/ena/ena_ethtool.c +++ b/drivers/amazon/net/ena/ena_ethtool.c @@ -65,14 +65,25 @@ struct ena_hw_metrics { } static const struct ena_stats ena_stats_global_strings[] = { + ENA_STAT_GLOBAL_ENTRY(total_resets), + ENA_STAT_GLOBAL_ENTRY(reset_fail), ENA_STAT_GLOBAL_ENTRY(tx_timeout), + ENA_STAT_GLOBAL_ENTRY(wd_expired), + ENA_STAT_GLOBAL_ENTRY(admin_q_pause), + ENA_STAT_GLOBAL_ENTRY(bad_tx_req_id), + ENA_STAT_GLOBAL_ENTRY(bad_rx_req_id), + ENA_STAT_GLOBAL_ENTRY(bad_rx_desc_num), + ENA_STAT_GLOBAL_ENTRY(missing_intr), + ENA_STAT_GLOBAL_ENTRY(suspected_poll_starvation), + ENA_STAT_GLOBAL_ENTRY(missing_tx_cmpl), + 
ENA_STAT_GLOBAL_ENTRY(rx_desc_malformed), + ENA_STAT_GLOBAL_ENTRY(tx_desc_malformed), + ENA_STAT_GLOBAL_ENTRY(invalid_state), + ENA_STAT_GLOBAL_ENTRY(os_netdev_wd), ENA_STAT_GLOBAL_ENTRY(suspend), ENA_STAT_GLOBAL_ENTRY(resume), - ENA_STAT_GLOBAL_ENTRY(wd_expired), - ENA_STAT_GLOBAL_ENTRY(interface_up), ENA_STAT_GLOBAL_ENTRY(interface_down), - ENA_STAT_GLOBAL_ENTRY(admin_q_pause), - ENA_STAT_GLOBAL_ENTRY(reset_fail), + ENA_STAT_GLOBAL_ENTRY(interface_up), }; /* A partial list of hw stats. Used when admin command @@ -719,19 +730,19 @@ static void ena_get_drvinfo(struct net_device *dev, ret = strscpy(info->driver, DRV_MODULE_NAME, sizeof(info->driver)); if (ret < 0) - netif_info(adapter, drv, dev, - "module name will be truncated, status = %zd\n", ret); + netif_dbg(adapter, drv, dev, + "module name will be truncated, status = %zd\n", ret); ret = strscpy(info->version, DRV_MODULE_GENERATION, sizeof(info->version)); if (ret < 0) - netif_info(adapter, drv, dev, - "module version will be truncated, status = %zd\n", ret); + netif_dbg(adapter, drv, dev, + "module version will be truncated, status = %zd\n", ret); ret = strscpy(info->bus_info, pci_name(adapter->pdev), sizeof(info->bus_info)); if (ret < 0) - netif_info(adapter, drv, dev, - "bus info will be truncated, status = %zd\n", ret); + netif_dbg(adapter, drv, dev, + "bus info will be truncated, status = %zd\n", ret); info->n_priv_flags = ENA_PRIV_FLAGS_NR; } @@ -749,6 +760,23 @@ static void ena_get_ringparam(struct net_device *netdev, ring->tx_max_pending = adapter->max_tx_ring_size; ring->rx_max_pending = adapter->max_rx_ring_size; +#ifdef ENA_LARGE_LLQ_ETHTOOL + if (adapter->ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { + bool large_llq_supported = adapter->large_llq_header_supported; + + kernel_ring->tx_push = true; + kernel_ring->tx_push_buf_len = adapter->ena_dev->tx_max_header_size; + if (large_llq_supported) + kernel_ring->tx_push_buf_max_len = ENA_LLQ_LARGE_HEADER; + else + kernel_ring->tx_push_buf_max_len = ENA_LLQ_HEADER; + } else { + kernel_ring->tx_push = false; + kernel_ring->tx_push_buf_max_len = 0; + kernel_ring->tx_push_buf_len = 0; + } + +#endif ring->tx_pending = adapter->tx_ring[0].ring_size; ring->rx_pending = adapter->rx_ring[0].ring_size; } @@ -763,7 +791,8 @@ static int ena_set_ringparam(struct net_device *netdev, #endif { struct ena_adapter *adapter = netdev_priv(netdev); - u32 new_tx_size, new_rx_size; + u32 new_tx_size, new_rx_size, new_tx_push_buf_len; + bool changed = false; if (ring->rx_mini_pending || ring->rx_jumbo_pending) return -EINVAL; @@ -776,11 +805,53 @@ static int ena_set_ringparam(struct net_device *netdev, adapter->max_rx_ring_size); new_rx_size = rounddown_pow_of_two(new_rx_size); - if (new_tx_size == adapter->requested_tx_ring_size && - new_rx_size == adapter->requested_rx_ring_size) + changed |= new_tx_size != adapter->requested_tx_ring_size || + new_rx_size != adapter->requested_rx_ring_size; + + /* This value is ignored if LLQ is not supported */ + new_tx_push_buf_len = adapter->ena_dev->tx_max_header_size; +#ifdef ENA_LARGE_LLQ_ETHTOOL + + if ((adapter->ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) != + kernel_ring->tx_push) { + NL_SET_ERR_MSG_MOD(extack, "Push mode state cannot be modified"); + return -EINVAL; + } + + /* Validate that the push buffer is supported on the underlying device */ + if (kernel_ring->tx_push_buf_len) { + enum ena_admin_placement_policy_type placement; + + new_tx_push_buf_len = kernel_ring->tx_push_buf_len; + + placement = 
adapter->ena_dev->tx_mem_queue_type; + if (placement == ENA_ADMIN_PLACEMENT_POLICY_HOST) + return -EOPNOTSUPP; + + if (new_tx_push_buf_len != ENA_LLQ_HEADER && + new_tx_push_buf_len != ENA_LLQ_LARGE_HEADER) { + bool large_llq_sup = adapter->large_llq_header_supported; + char large_llq_size_str[40]; + + snprintf(large_llq_size_str, 40, ", %lu", ENA_LLQ_LARGE_HEADER); + + NL_SET_ERR_MSG_FMT_MOD(extack, + "Supported tx push buff values: [%lu%s]", + ENA_LLQ_HEADER, + large_llq_sup ? large_llq_size_str : ""); + + return -EINVAL; + } + + changed |= new_tx_push_buf_len != adapter->ena_dev->tx_max_header_size; + } + +#endif + if (!changed) return 0; - return ena_update_queue_sizes(adapter, new_tx_size, new_rx_size); + return ena_update_queue_params(adapter, new_tx_size, new_rx_size, + new_tx_push_buf_len); } #ifdef ETHTOOL_GRXRINGS @@ -871,7 +942,7 @@ static int ena_get_rss_hash(struct ena_com_dev *ena_dev, } rc = ena_com_get_hash_ctrl(ena_dev, proto, &hash_fields); - if (rc) + if (unlikely(rc)) return rc; cmd->data = ena_flow_hash_to_flow_type(hash_fields); @@ -1023,7 +1094,7 @@ static int ena_indirection_table_get(struct ena_adapter *adapter, u32 *indir) return 0; rc = ena_com_indirect_table_get(ena_dev, indir); - if (rc) + if (unlikely(rc)) return rc; /* Our internal representation of the indices is: even indices @@ -1046,7 +1117,7 @@ static int ena_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, int rc; rc = ena_indirection_table_get(adapter, indir); - if (rc) + if (unlikely(rc)) return rc; /* We call this function in order to check if the device @@ -1090,7 +1161,7 @@ static int ena_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key) int rc; rc = ena_indirection_table_get(adapter, indir); - if (rc) + if (unlikely(rc)) return rc; /* We call this function in order to check if the device @@ -1202,15 +1273,21 @@ static int ena_set_channels(struct net_device *netdev, struct ena_adapter *adapter = netdev_priv(netdev); u32 count = channels->combined_count; /* The check for max value is already done in ethtool */ -#ifdef ENA_XDP_SUPPORT - if (count < ENA_MIN_NUM_IO_QUEUES || - (ena_xdp_present(adapter) && - !ena_xdp_legal_queue_count(adapter, count))) -#else if (count < ENA_MIN_NUM_IO_QUEUES) -#endif /* ENA_XDP_SUPPORT */ return -EINVAL; + if (!ena_xdp_legal_queue_count(adapter, count)) { + if (ena_xdp_present(adapter)) + return -EINVAL; + + xdp_clear_features_flag(netdev); + } else { + xdp_set_features_flag(netdev, + NETDEV_XDP_ACT_BASIC | + NETDEV_XDP_ACT_REDIRECT); + } + + if (count > adapter->max_num_io_queues) return -EINVAL; if (count != adapter->num_io_queues && ena_is_zc_q_exist(adapter)) { @@ -1289,6 +1366,10 @@ static const struct ethtool_ops ena_ethtool_ops = { .supported_coalesce_params = ETHTOOL_COALESCE_USECS | ETHTOOL_COALESCE_USE_ADAPTIVE_RX, #endif +#ifdef ENA_LARGE_LLQ_ETHTOOL + .supported_ring_params = ETHTOOL_RING_USE_TX_PUSH_BUF_LEN | + ETHTOOL_RING_USE_TX_PUSH, +#endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0) .get_link_ksettings = ena_get_link_ksettings, #else diff --git a/drivers/amazon/net/ena/ena_netdev.c b/drivers/amazon/net/ena/ena_netdev.c index 759926e8f8716..932c075f5a2ef 100644 --- a/drivers/amazon/net/ena/ena_netdev.c +++ b/drivers/amazon/net/ena/ena_netdev.c @@ -30,8 +30,6 @@ #include "ena_lpc.h" #include "ena_phc.h" -#include "ena_devlink.h" - static char version[] = DEVICE_NAME " v" DRV_MODULE_GENERATION "\n"; MODULE_AUTHOR("Amazon.com, Inc. 
or its affiliates"); @@ -61,7 +59,8 @@ static int rx_queue_size = ENA_DEFAULT_RING_SIZE; module_param(rx_queue_size, int, 0444); MODULE_PARM_DESC(rx_queue_size, "Rx queue size. The size should be a power of 2. Depending on instance type, max value can be up to 16K\n"); -static int force_large_llq_header = 0; +#define FORCE_LARGE_LLQ_HEADER_UNINIT_VALUE 0xFFFF +static int force_large_llq_header = FORCE_LARGE_LLQ_HEADER_UNINIT_VALUE; module_param(force_large_llq_header, int, 0444); MODULE_PARM_DESC(force_large_llq_header, "Increases maximum supported header size in LLQ mode to 224 bytes, while reducing the maximum TX queue size by half.\n"); @@ -91,8 +90,6 @@ MODULE_DEVICE_TABLE(pci, ena_pci_tbl); static int ena_rss_init_default(struct ena_adapter *adapter); static void check_for_admin_com_state(struct ena_adapter *adapter); -static int ena_calc_io_queue_size(struct ena_adapter *adapter, - struct ena_com_dev_get_features_ctx *get_feat_ctx); static void ena_set_dev_offloads(struct ena_com_dev_get_features_ctx *feat, struct net_device *netdev); @@ -252,7 +249,7 @@ int ena_xmit_common(struct ena_adapter *adapter, tx_info->tx_descs = nb_hw_desc; tx_info->total_tx_size = bytes; - tx_info->last_jiffies = jiffies; + tx_info->tx_sent_jiffies = jiffies; tx_info->print_once = 0; ring->next_to_use = ENA_TX_RING_IDX_NEXT(next_to_use, @@ -707,12 +704,6 @@ static void ena_unmap_rx_buff_attrs(struct ena_ring *rx_ring, DMA_BIDIRECTIONAL, attrs); } -static void ena_unmap_rx_buff(struct ena_ring *rx_ring, - struct ena_rx_buffer *rx_info) -{ - ena_unmap_rx_buff_attrs(rx_ring, rx_info, 0); -} - static void ena_free_rx_page(struct ena_ring *rx_ring, struct ena_rx_buffer *rx_info) { @@ -724,7 +715,7 @@ static void ena_free_rx_page(struct ena_ring *rx_ring, return; } - ena_unmap_rx_buff(rx_ring, rx_info); + ena_unmap_rx_buff_attrs(rx_ring, rx_info, 0); __free_page(page); rx_info->page = NULL; @@ -867,31 +858,41 @@ void ena_unmap_tx_buff(struct ena_ring *tx_ring, static void ena_free_tx_bufs(struct ena_ring *tx_ring) { bool print_once = true; + bool is_xdp_ring; u32 i; + is_xdp_ring = ENA_IS_XDP_INDEX(tx_ring->adapter, tx_ring->qid); + for (i = 0; i < tx_ring->ring_size; i++) { struct ena_tx_buffer *tx_info = &tx_ring->tx_buffer_info[i]; + unsigned long jiffies_since_submitted; if (!tx_info->skb) continue; + jiffies_since_submitted = jiffies - tx_info->tx_sent_jiffies; if (print_once) { netif_notice(tx_ring->adapter, ifdown, tx_ring->netdev, - "Free uncompleted tx skb qid %d idx 0x%x\n", - tx_ring->qid, i); + "Free uncompleted tx skb qid %d, idx 0x%x, %u msecs since submission\n", + tx_ring->qid, i, jiffies_to_msecs(jiffies_since_submitted)); print_once = false; } else { netif_dbg(tx_ring->adapter, ifdown, tx_ring->netdev, - "Free uncompleted tx skb qid %d idx 0x%x\n", - tx_ring->qid, i); + "Free uncompleted tx skb qid %d, idx 0x%x, %u msecs since submission\n", + tx_ring->qid, i, jiffies_to_msecs(jiffies_since_submitted)); } ena_unmap_tx_buff(tx_ring, tx_info); - dev_kfree_skb_any(tx_info->skb); + if (is_xdp_ring) + xdp_return_frame(tx_info->xdpf); + else + dev_kfree_skb_any(tx_info->skb); } - netdev_tx_reset_queue(netdev_get_tx_queue(tx_ring->netdev, - tx_ring->qid)); + + if (!is_xdp_ring) + netdev_tx_reset_queue(netdev_get_tx_queue(tx_ring->netdev, + tx_ring->qid)); } static void ena_free_all_tx_bufs(struct ena_adapter *adapter) @@ -996,12 +997,17 @@ static int ena_clean_tx_irq(struct ena_ring *tx_ring, u32 budget) if (unlikely(rc == -EINVAL)) handle_invalid_req_id(tx_ring, req_id, NULL, false); + else if 
(unlikely(rc == -EFAULT)) { + ena_reset_device( + tx_ring->adapter, + ENA_REGS_RESET_TX_DESCRIPTOR_MALFORMED); + } break; } /* validate that the request id points to a valid skb */ rc = validate_tx_req_id(tx_ring, req_id); - if (rc) + if (unlikely(rc)) break; tx_info = &tx_ring->tx_buffer_info[req_id]; @@ -1011,7 +1017,7 @@ static int ena_clean_tx_irq(struct ena_ring *tx_ring, u32 budget) prefetch(&skb->end); tx_info->skb = NULL; - tx_info->last_jiffies = 0; + tx_info->tx_sent_jiffies = 0; ena_unmap_tx_buff(tx_ring, tx_info); @@ -1161,11 +1167,6 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, if (unlikely(!skb)) return NULL; - /* sync this buffer for CPU use */ - dma_sync_single_for_cpu(rx_ring->dev, - dma_unmap_addr(&rx_info->ena_buf, paddr) + pkt_offset, - len, - DMA_FROM_DEVICE); skb_copy_to_linear_data(skb, buf_addr + buf_offset, len); dma_sync_single_for_device(rx_ring->dev, dma_unmap_addr(&rx_info->ena_buf, paddr) + pkt_offset, @@ -1187,19 +1188,17 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, buf_len = SKB_DATA_ALIGN(len + buf_offset + tailroom); - pre_reuse_paddr = dma_unmap_addr(&rx_info->ena_buf, paddr); - /* If XDP isn't loaded try to reuse part of the RX buffer */ reuse_rx_buf_page = !is_xdp_loaded && ena_try_rx_buf_page_reuse(rx_info, buf_len, len, pkt_offset); - dma_sync_single_for_cpu(rx_ring->dev, - pre_reuse_paddr + pkt_offset, - len, - DMA_FROM_DEVICE); - - if (!reuse_rx_buf_page) + if (!reuse_rx_buf_page) { ena_unmap_rx_buff_attrs(rx_ring, rx_info, ENA_DMA_ATTR_SKIP_CPU_SYNC); + /* Make sure buf_len represents the actual size used + * by the buffer as expected from skb->truesize + */ + buf_len = ENA_PAGE_SIZE - page_offset; + } skb = ena_alloc_skb(rx_ring, buf_addr, buf_len); @@ -1263,8 +1262,13 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, len, DMA_FROM_DEVICE); - if (!reuse_rx_buf_page) + if (!reuse_rx_buf_page) { ena_unmap_rx_buff_attrs(rx_ring, rx_info, ENA_DMA_ATTR_SKIP_CPU_SYNC); + /* Make sure buf_len represents the actual size used + * by the buffer as expected from skb->truesize + */ + buf_len = ENA_PAGE_SIZE - page_offset; + } skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_info->page, @@ -1461,6 +1465,11 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, rx_ring->qid, ena_rx_ctx.descs, ena_rx_ctx.l3_proto, ena_rx_ctx.l4_proto, ena_rx_ctx.hash); + dma_sync_single_for_cpu(rx_ring->dev, + dma_unmap_addr(&rx_info->ena_buf, paddr) + ena_rx_ctx.pkt_offset, + rx_ring->ena_bufs[0].len, + DMA_FROM_DEVICE); + #ifdef ENA_XDP_SUPPORT if (ena_xdp_present_ring(rx_ring)) xdp_verdict = ena_xdp_handle_buff(rx_ring, &xdp, ena_rx_ctx.descs); @@ -1490,8 +1499,9 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, * from RX side. 
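
The TX clean path above now distinguishes three poll outcomes: -EAGAIN means the completion queue is simply empty, -EINVAL routes to handle_invalid_req_id(), and the new -EFAULT case escalates to ena_reset_device() with ENA_REGS_RESET_TX_DESCRIPTOR_MALFORMED. A minimal sketch of that dispatch shape, with hypothetical names:

#include <errno.h>
#include <stdio.h>

/* Illustrative sketch, not driver code. */
enum action { ACT_STOP_POLLING, ACT_HANDLE_BAD_REQ_ID, ACT_RESET_DEVICE };

static enum action dispatch_tx_comp_rc(int rc)
{
	if (rc == -EINVAL)
		return ACT_HANDLE_BAD_REQ_ID; /* req_id out of range */
	if (rc == -EFAULT)
		return ACT_RESET_DEVICE;      /* malformed descriptor */
	return ACT_STOP_POLLING;              /* -EAGAIN: CQ empty for now */
}

int main(void)
{
	printf("%d %d %d\n", dispatch_tx_comp_rc(-EAGAIN),
	       dispatch_tx_comp_rc(-EINVAL), dispatch_tx_comp_rc(-EFAULT));
	return 0;
}
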
*/ if (xdp_verdict & ENA_XDP_FORWARDED) { - ena_unmap_rx_buff(rx_ring, - &rx_ring->rx_buffer_info[req_id]); + ena_unmap_rx_buff_attrs(rx_ring, + &rx_ring->rx_buffer_info[req_id], + ENA_DMA_ATTR_SKIP_CPU_SYNC); rx_ring->rx_buffer_info[req_id].page = NULL; } #endif /* ENA_XDP_SUPPORT */ @@ -1552,6 +1562,8 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, #ifdef ENA_XDP_SUPPORT if (xdp_flags & ENA_XDP_REDIRECT) xdp_do_flush_map(); + if (xdp_flags & ENA_XDP_TX) + ena_ring_tx_doorbell(rx_ring->xdp_ring); #endif return work_done; @@ -1791,7 +1803,7 @@ static irqreturn_t ena_intr_msix_io(int irq, void *data) struct ena_napi *ena_napi = data; /* Used to check HW health */ - WRITE_ONCE(ena_napi->first_interrupt, true); + WRITE_ONCE(ena_napi->last_intr_jiffies, jiffies); WRITE_ONCE(ena_napi->interrupts_masked, true); smp_wmb(); /* write interrupts_masked before calling napi */ @@ -2155,7 +2167,7 @@ static int ena_rss_configure(struct ena_adapter *adapter) /* In case the RSS table wasn't initialized by probe */ if (!ena_dev->rss.tbl_log_size) { rc = ena_rss_init_default(adapter); - if (rc && (rc != -EOPNOTSUPP)) { + if (unlikely(rc && (rc != -EOPNOTSUPP))) { netif_err(adapter, ifup, adapter->netdev, "Failed to init RSS rc: %d\n", rc); return rc; @@ -2185,7 +2197,7 @@ static int ena_up_complete(struct ena_adapter *adapter) int rc; rc = ena_rss_configure(adapter); - if (rc) + if (unlikely(rc)) return rc; ena_change_mtu(adapter->netdev, adapter->netdev->mtu); @@ -2227,7 +2239,7 @@ static int ena_create_io_tx_queue(struct ena_adapter *adapter, int qid) ctx.numa_node = tx_ring->numa_node; rc = ena_com_create_io_queue(ena_dev, &ctx); - if (rc) { + if (unlikely(rc)) { netif_err(adapter, ifup, adapter->netdev, "Failed to create I/O TX queue num %d rc: %d\n", qid, rc); @@ -2237,7 +2249,7 @@ static int ena_create_io_tx_queue(struct ena_adapter *adapter, int qid) rc = ena_com_get_io_handlers(ena_dev, ena_qid, &tx_ring->ena_com_io_sq, &tx_ring->ena_com_io_cq); - if (rc) { + if (unlikely(rc)) { netif_err(adapter, ifup, adapter->netdev, "Failed to get TX queue handlers. TX queue num %d rc: %d\n", qid, rc); @@ -2257,7 +2269,7 @@ int ena_create_io_tx_queues_in_range(struct ena_adapter *adapter, for (i = first_index; i < first_index + count; i++) { rc = ena_create_io_tx_queue(adapter, i); - if (rc) + if (unlikely(rc)) goto create_err; } @@ -2295,7 +2307,7 @@ static int ena_create_io_rx_queue(struct ena_adapter *adapter, int qid) ctx.numa_node = rx_ring->numa_node; rc = ena_com_create_io_queue(ena_dev, &ctx); - if (rc) { + if (unlikely(rc)) { netif_err(adapter, ifup, adapter->netdev, "Failed to create I/O RX queue num %d rc: %d\n", qid, rc); @@ -2305,7 +2317,7 @@ static int ena_create_io_rx_queue(struct ena_adapter *adapter, int qid) rc = ena_com_get_io_handlers(ena_dev, ena_qid, &rx_ring->ena_com_io_sq, &rx_ring->ena_com_io_cq); - if (rc) { + if (unlikely(rc)) { netif_err(adapter, ifup, adapter->netdev, "Failed to get RX queue handlers. 
RX queue num %d rc: %d\n", qid, rc); @@ -2327,7 +2339,7 @@ static int ena_create_all_io_rx_queues(struct ena_adapter *adapter) for (i = 0; i < adapter->num_io_queues; i++) { rc = ena_create_io_rx_queue(adapter, i); - if (rc) + if (unlikely(rc)) goto create_err; INIT_WORK(&adapter->ena_napi[i].dim.work, ena_dim_work); @@ -2500,7 +2512,7 @@ int ena_up(struct ena_adapter *adapter) goto err_create_queues_with_backoff; rc = ena_up_complete(adapter); - if (rc) + if (unlikely(rc)) goto err_up; if (test_bit(ENA_FLAG_LINK_UP, &adapter->flags)) @@ -2629,6 +2641,7 @@ static int ena_open(struct net_device *netdev) static int ena_close(struct net_device *netdev) { struct ena_adapter *adapter = netdev_priv(netdev); + u8 *debug_area; netif_dbg(adapter, ifdown, netdev, "%s\n", __func__); @@ -2643,6 +2656,10 @@ static int ena_close(struct net_device *netdev) if (unlikely(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))) { netif_err(adapter, ifdown, adapter->netdev, "Destroy failure, restarting device\n"); + + debug_area = adapter->ena_dev->host_attr.debug_area_virt_addr; + if (debug_area) + ena_dump_stats_to_buf(adapter, debug_area); ena_dump_stats_to_dmesg(adapter); /* rtnl lock already obtained in dev_ioctl() layer */ ena_destroy_device(adapter, false); @@ -2679,11 +2696,13 @@ int ena_set_lpc_state(struct ena_adapter *adapter, bool enabled) return 0; } -int ena_update_queue_sizes(struct ena_adapter *adapter, - u32 new_tx_size, - u32 new_rx_size) +int ena_update_queue_params(struct ena_adapter *adapter, + u32 new_tx_size, + u32 new_rx_size, + u32 new_llq_header_len) { - bool dev_was_up; + bool dev_was_up, large_llq_changed = false; + int rc = 0; dev_was_up = test_bit(ENA_FLAG_DEV_UP, &adapter->flags); ena_close(adapter->netdev); @@ -2693,7 +2712,27 @@ int ena_update_queue_sizes(struct ena_adapter *adapter, 0, adapter->xdp_num_queues + adapter->num_io_queues); - return dev_was_up ? ena_up(adapter) : 0; + +#ifdef ENA_LARGE_LLQ_ETHTOOL + large_llq_changed = adapter->ena_dev->tx_mem_queue_type == + ENA_ADMIN_PLACEMENT_POLICY_DEV; + large_llq_changed &= + new_llq_header_len != adapter->ena_dev->tx_max_header_size; + +#endif /* ENA_LARGE_LLQ_ETHTOOL */ + /* a check that the configuration is valid is done by caller */ + if (large_llq_changed) { + bool large_llq_requested = new_llq_header_len == ENA_LLQ_LARGE_HEADER; + + adapter->llq_policy = large_llq_requested ? + ENA_LLQ_HEADER_SIZE_POLICY_LARGE : + ENA_LLQ_HEADER_SIZE_POLICY_NORMAL; + + ena_destroy_device(adapter, false); + rc = ena_restore_device(adapter); + } + + return dev_was_up && !rc ? 
ena_up(adapter) : rc; } int ena_set_rx_copybreak(struct ena_adapter *adapter, u32 rx_copybreak) @@ -2935,7 +2974,6 @@ static int ena_tx_map_skb(struct ena_ring *tx_ring, tx_info->skb = NULL; - tx_info->num_of_bufs += i; ena_unmap_tx_buff(tx_ring, tx_info); return -EINVAL; @@ -2990,7 +3028,7 @@ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev) &ena_tx_ctx, next_to_use, skb->len); - if (rc) + if (unlikely(rc)) goto error_unmap_dma; if (tx_ring->enable_bql) @@ -3076,9 +3114,11 @@ static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb) * want to loop incoming skb rx to tx in normal user generated traffic, * most probably we will not get to this */ - if (skb_rx_queue_recorded(skb)) + if (skb_rx_queue_recorded(skb)) { qid = skb_get_rx_queue(skb); - else + if (qid >= dev->real_num_tx_queues) + qid %= dev->real_num_tx_queues; + } else { #if (defined HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V3) qid = netdev_pick_tx(dev, skb, NULL); #elif (defined HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V2) @@ -3090,6 +3130,7 @@ static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb) #else qid = skb_tx_hash(dev, skb); #endif /* HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V2 */ + } return qid; } @@ -3125,21 +3166,21 @@ static void ena_config_host_info(struct ena_com_dev *ena_dev, struct pci_dev *pd /* Allocate only the host info */ rc = ena_com_allocate_host_info(ena_dev); - if (rc) { + if (unlikely(rc)) { dev_err(dev, "Cannot allocate host info\n"); return; } host_info = ena_dev->host_attr.host_info; - host_info->bdf = (pdev->bus->number << 8) | pdev->devfn; + host_info->bdf = pci_dev_id(pdev); host_info->os_type = ENA_ADMIN_OS_LINUX; host_info->kernel_ver = LINUX_VERSION_CODE; ret = strscpy(host_info->kernel_ver_str, utsname()->version, sizeof(host_info->kernel_ver_str) - 1); if (ret < 0) - dev_info(dev, - "kernel version string will be truncated, status = %zd\n", ret); + dev_dbg(dev, + "kernel version string will be truncated, status = %zd\n", ret); host_info->os_dist = 0; strncpy(host_info->os_dist_str, utsname()->release, sizeof(host_info->os_dist_str) - 1); @@ -3155,10 +3196,12 @@ static void ena_config_host_info(struct ena_com_dev *ena_dev, struct pci_dev *pd ENA_ADMIN_HOST_INFO_INTERRUPT_MODERATION_MASK | ENA_ADMIN_HOST_INFO_RX_BUF_MIRRORING_MASK | ENA_ADMIN_HOST_INFO_RSS_CONFIGURABLE_FUNCTION_KEY_MASK | - ENA_ADMIN_HOST_INFO_RX_PAGE_REUSE_MASK; + ENA_ADMIN_HOST_INFO_RX_PAGE_REUSE_MASK | + ENA_ADMIN_HOST_INFO_TX_IPV6_CSUM_OFFLOAD_MASK | + ENA_ADMIN_HOST_INFO_PHC_MASK; rc = ena_com_set_host_attributes(ena_dev); - if (rc) { + if (unlikely(rc)) { if (rc == -EOPNOTSUPP) dev_warn(dev, "Cannot set host attributes\n"); else @@ -3189,14 +3232,14 @@ static void ena_config_debug_area(struct ena_adapter *adapter) debug_area_size = ss_count * ETH_GSTRING_LEN + sizeof(u64) * ss_count; rc = ena_com_allocate_debug_area(adapter->ena_dev, debug_area_size); - if (rc) { + if (unlikely(rc)) { netif_err(adapter, drv, adapter->netdev, "Cannot allocate debug area\n"); return; } rc = ena_com_set_host_attributes(adapter->ena_dev); - if (rc) { + if (unlikely(rc)) { if (rc == -EOPNOTSUPP) netif_warn(adapter, drv, adapter->netdev, "Cannot set host attributes\n"); @@ -3414,6 +3457,110 @@ static const struct net_device_ops ena_netdev_ops = { #endif /* ENA_XDP_SUPPORT */ }; +static int ena_calc_io_queue_size(struct ena_adapter *adapter, + struct ena_com_dev_get_features_ctx *get_feat_ctx) +{ + struct ena_admin_feature_llq_desc *llq = &get_feat_ctx->llq; + struct ena_com_dev *ena_dev = 
adapter->ena_dev; + u32 tx_queue_size = ENA_DEFAULT_RING_SIZE; + u32 max_tx_queue_size; + u32 max_rx_queue_size; + + /* If this function is called after driver load, the ring sizes have already + * been configured. Take it into account when recalculating ring size. + */ + if (adapter->tx_ring->ring_size) + tx_queue_size = adapter->tx_ring->ring_size; + + if (adapter->rx_ring->ring_size) + rx_queue_size = adapter->rx_ring->ring_size; + + if (ena_dev->supported_features & BIT(ENA_ADMIN_MAX_QUEUES_EXT)) { + struct ena_admin_queue_ext_feature_fields *max_queue_ext = + &get_feat_ctx->max_queue_ext.max_queue_ext; + max_rx_queue_size = min_t(u32, max_queue_ext->max_rx_cq_depth, + max_queue_ext->max_rx_sq_depth); + max_tx_queue_size = max_queue_ext->max_tx_cq_depth; + + if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) + max_tx_queue_size = min_t(u32, max_tx_queue_size, + llq->max_llq_depth); + else + max_tx_queue_size = min_t(u32, max_tx_queue_size, + max_queue_ext->max_tx_sq_depth); + + adapter->max_tx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, + max_queue_ext->max_per_packet_tx_descs); + adapter->max_rx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, + max_queue_ext->max_per_packet_rx_descs); + } else { + struct ena_admin_queue_feature_desc *max_queues = + &get_feat_ctx->max_queues; + max_rx_queue_size = min_t(u32, max_queues->max_cq_depth, + max_queues->max_sq_depth); + max_tx_queue_size = max_queues->max_cq_depth; + + if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) + max_tx_queue_size = min_t(u32, max_tx_queue_size, + llq->max_llq_depth); + else + max_tx_queue_size = min_t(u32, max_tx_queue_size, + max_queues->max_sq_depth); + + adapter->max_tx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, + max_queues->max_packet_tx_descs); + adapter->max_rx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, + max_queues->max_packet_rx_descs); + } + + max_tx_queue_size = rounddown_pow_of_two(max_tx_queue_size); + max_rx_queue_size = rounddown_pow_of_two(max_rx_queue_size); + + if (max_tx_queue_size < ENA_MIN_RING_SIZE) { + netdev_err(adapter->netdev, "Device max TX queue size: %d < minimum: %d\n", + max_tx_queue_size, ENA_MIN_RING_SIZE); + return -EFAULT; + } + + if (max_rx_queue_size < ENA_MIN_RING_SIZE) { + netdev_err(adapter->netdev, "Device max RX queue size: %d < minimum: %d\n", + max_rx_queue_size, ENA_MIN_RING_SIZE); + return -EFAULT; + } + + /* When forcing large headers, we multiply the entry size by 2, and therefore divide + * the queue size by 2, leaving the amount of memory used by the queues unchanged. 
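
The sizing logic in ena_calc_io_queue_size() above clamps each requested ring size between ENA_MIN_RING_SIZE and the device maximum, rounds down to a power of two, and halves the TX maximum when large LLQ headers are forced, since doubling the entry size halves the number of entries that fit. A standalone sketch of the arithmetic, with an illustrative minimum:

#include <stdint.h>
#include <stdio.h>

/* Illustrative sketch, not driver code. */
#define MIN_RING_SIZE 256u

static uint32_t rounddown_pow_of_two(uint32_t n)
{
	uint32_t p = 1;

	while (p * 2 <= n)
		p *= 2;
	return p;
}

static uint32_t clamp_u32(uint32_t v, uint32_t lo, uint32_t hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

static uint32_t calc_ring_size(uint32_t requested, uint32_t dev_max, int large_llq)
{
	uint32_t max_size = rounddown_pow_of_two(dev_max);

	/* Doubling the LLQ entry size halves how many entries fit. */
	if (large_llq)
		max_size /= 2;

	return rounddown_pow_of_two(clamp_u32(requested, MIN_RING_SIZE, max_size));
}

int main(void)
{
	printf("%u\n", calc_ring_size(8192, 5000, 0)); /* 4096 */
	printf("%u\n", calc_ring_size(8192, 5000, 1)); /* 2048 */
	return 0;
}
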
+ */ + if (adapter->llq_policy == ENA_LLQ_HEADER_SIZE_POLICY_LARGE) { + if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { + max_tx_queue_size /= 2; + dev_info(&adapter->pdev->dev, + "Forcing large headers and decreasing maximum TX queue size to %d\n", + max_tx_queue_size); + } else { + dev_err(&adapter->pdev->dev, + "Forcing large headers failed: LLQ is disabled or device does not support large headers\n"); + + adapter->llq_policy = ENA_LLQ_HEADER_SIZE_POLICY_NORMAL; + } + } + + tx_queue_size = clamp_val(tx_queue_size, ENA_MIN_RING_SIZE, + max_tx_queue_size); + rx_queue_size = clamp_val(rx_queue_size, ENA_MIN_RING_SIZE, + max_rx_queue_size); + + tx_queue_size = rounddown_pow_of_two(tx_queue_size); + rx_queue_size = rounddown_pow_of_two(rx_queue_size); + + adapter->max_tx_ring_size = max_tx_queue_size; + adapter->max_rx_ring_size = max_rx_queue_size; + adapter->requested_tx_ring_size = tx_queue_size; + adapter->requested_rx_ring_size = rx_queue_size; + + return 0; +} + static int ena_device_validate_params(struct ena_adapter *adapter, struct ena_com_dev_get_features_ctx *get_feat_ctx) { @@ -3437,30 +3584,52 @@ static int ena_device_validate_params(struct ena_adapter *adapter, return 0; } -static void set_default_llq_configurations(struct ena_adapter *adapter, - struct ena_llq_configurations *llq_config, - struct ena_admin_feature_llq_desc *llq) +static void ena_set_forced_llq_size_policy(struct ena_adapter *adapter) +{ + /* policy will be set according to device recommendation unless user + * forced either large/normal size + */ + if (force_large_llq_header != FORCE_LARGE_LLQ_HEADER_UNINIT_VALUE) { + /* user selection is prioritized on top of device recommendation */ + adapter->llq_policy = force_large_llq_header ? ENA_LLQ_HEADER_SIZE_POLICY_LARGE : + ENA_LLQ_HEADER_SIZE_POLICY_NORMAL; + } +} + +static int ena_set_llq_configurations(struct ena_adapter *adapter, + struct ena_llq_configurations *llq_config, + struct ena_admin_feature_llq_desc *llq) { struct ena_com_dev *ena_dev = adapter->ena_dev; + bool use_large_llq; llq_config->llq_header_location = ENA_ADMIN_INLINE_HEADER; llq_config->llq_stride_ctrl = ENA_ADMIN_MULTIPLE_DESCS_PER_ENTRY; llq_config->llq_num_decs_before_header = ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_2; adapter->large_llq_header_supported = - !!(ena_dev->supported_features & (1 << ENA_ADMIN_LLQ)); + !!(ena_dev->supported_features & BIT(ENA_ADMIN_LLQ)); adapter->large_llq_header_supported &= !!(llq->entry_size_ctrl_supported & ENA_ADMIN_LIST_ENTRY_SIZE_256B); - if ((llq->entry_size_ctrl_supported & ENA_ADMIN_LIST_ENTRY_SIZE_256B) && - adapter->large_llq_header_enabled) { - llq_config->llq_ring_entry_size = ENA_ADMIN_LIST_ENTRY_SIZE_256B; - llq_config->llq_ring_entry_size_value = 256; - } else { + use_large_llq = adapter->llq_policy != ENA_LLQ_HEADER_SIZE_POLICY_NORMAL; + use_large_llq &= adapter->large_llq_header_supported; + + if (adapter->llq_policy == ENA_LLQ_HEADER_SIZE_POLICY_UNSPECIFIED) + use_large_llq &= (llq->entry_size_recommended == ENA_ADMIN_LIST_ENTRY_SIZE_256B); + + if (!use_large_llq) { llq_config->llq_ring_entry_size = ENA_ADMIN_LIST_ENTRY_SIZE_128B; llq_config->llq_ring_entry_size_value = 128; + adapter->llq_policy = ENA_LLQ_HEADER_SIZE_POLICY_NORMAL; + } else { + llq_config->llq_ring_entry_size = ENA_ADMIN_LIST_ENTRY_SIZE_256B; + llq_config->llq_ring_entry_size_value = 256; + adapter->llq_policy = ENA_LLQ_HEADER_SIZE_POLICY_LARGE; } + + return 0; } static int ena_set_queues_placement_policy(struct pci_dev *pdev, @@ -3529,7 +3698,7 @@ 
static int ena_device_init(struct ena_adapter *adapter, struct pci_dev *pdev, int rc; rc = ena_com_mmio_reg_read_request_init(ena_dev); - if (rc) { + if (unlikely(rc)) { dev_err(dev, "Failed to init mmio read less\n"); return rc; } @@ -3553,7 +3722,7 @@ static int ena_device_init(struct ena_adapter *adapter, struct pci_dev *pdev, } dma_width = ena_com_get_dma_width(ena_dev); - if (dma_width < 0) { + if (unlikely(dma_width < 0)) { dev_err(dev, "Invalid dma width value %d", dma_width); rc = dma_width; goto err_mmio_read_less; @@ -3580,11 +3749,10 @@ static int ena_device_init(struct ena_adapter *adapter, struct pci_dev *pdev, } #endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0) */ - ena_devlink_params_get(adapter->devlink); /* ENA admin level init */ rc = ena_com_admin_init(ena_dev, &aenq_handlers); - if (rc) { + if (unlikely(rc)) { dev_err(dev, "Can not initialize ena admin queue with device\n"); goto err_mmio_read_less; @@ -3610,7 +3778,8 @@ static int ena_device_init(struct ena_adapter *adapter, struct pci_dev *pdev, BIT(ENA_ADMIN_FATAL_ERROR) | BIT(ENA_ADMIN_WARNING) | BIT(ENA_ADMIN_NOTIFICATION) | - BIT(ENA_ADMIN_KEEP_ALIVE); + BIT(ENA_ADMIN_KEEP_ALIVE) | + BIT(ENA_ADMIN_CONF_NOTIFICATIONS); aenq_groups &= get_feat_ctx->aenq.supported_groups; @@ -3622,7 +3791,11 @@ static int ena_device_init(struct ena_adapter *adapter, struct pci_dev *pdev, *wd_state = !!(aenq_groups & BIT(ENA_ADMIN_KEEP_ALIVE)); - set_default_llq_configurations(adapter, &llq_config, &get_feat_ctx->llq); + rc = ena_set_llq_configurations(adapter, &llq_config, &get_feat_ctx->llq); + if (rc) { + netdev_err(netdev, "Cannot set llq configuration rc= %d\n", rc); + goto err_admin_init; + } rc = ena_set_queues_placement_policy(pdev, ena_dev, &get_feat_ctx->llq, &llq_config); @@ -3707,7 +3880,6 @@ int ena_destroy_device(struct ena_adapter *adapter, bool graceful) dev_up = test_bit(ENA_FLAG_DEV_UP, &adapter->flags); adapter->dev_up_before_reset = dev_up; - ena_sysfs_terminate(&adapter->pdev->dev); if (!graceful) ena_com_set_admin_running_state(ena_dev, false); @@ -3778,17 +3950,13 @@ int ena_restore_device(struct ena_adapter *adapter) dev_err(&pdev->dev, "Enable MSI-X failed\n"); goto err_device_destroy; } - rc = ena_sysfs_init(&pdev->dev); - if (rc) { - dev_err(&pdev->dev, "Cannot initialize sysfs\n"); - goto err_disable_msix; - } + /* If the interface was up before the reset bring it up */ if (adapter->dev_up_before_reset) { rc = ena_up(adapter); if (rc) { dev_err(&pdev->dev, "Failed to create I/O queues\n"); - goto err_sysfs_terminate; + goto err_disable_msix; } } @@ -3802,8 +3970,6 @@ int ena_restore_device(struct ena_adapter *adapter) adapter->last_keep_alive_jiffies = jiffies; return rc; -err_sysfs_terminate: - ena_sysfs_terminate(&pdev->dev); err_disable_msix: ena_free_mgmnt_irq(adapter); ena_disable_msix(adapter); @@ -3850,7 +4016,7 @@ static int check_for_rx_interrupt_queue(struct ena_adapter *adapter, { struct ena_napi *ena_napi = container_of(rx_ring->napi, struct ena_napi, napi); - if (likely(READ_ONCE(ena_napi->first_interrupt))) + if (likely(READ_ONCE(ena_napi->last_intr_jiffies) != 0)) return 0; if (ena_com_cq_empty(rx_ring->ena_com_io_cq)) @@ -3870,59 +4036,82 @@ static int check_for_rx_interrupt_queue(struct ena_adapter *adapter, return 0; } -static int check_missing_comp_in_tx_queue(struct ena_adapter *adapter, - struct ena_ring *tx_ring) +enum ena_regs_reset_reason_types check_cdesc_in_tx_cq(struct ena_adapter *adapter, + struct ena_ring *tx_ring) +{ + struct net_device *netdev = adapter->netdev; + u16 
req_id; + int rc; + + rc = ena_com_tx_comp_req_id_get(tx_ring->ena_com_io_cq, &req_id); + + /* TX CQ is empty */ + if (rc == -EAGAIN) { + netif_err(adapter, tx_err, netdev, + "No completion descriptors found in CQ %d", + tx_ring->qid); + + return ENA_REGS_RESET_MISS_TX_CMPL; + } + + /* TX CQ has cdescs */ + netif_err(adapter, tx_err, netdev, + "Completion descriptors found in CQ %d", tx_ring->qid); + + return ENA_REGS_RESET_MISS_INTERRUPT; +} + +static int check_missing_comp_in_tx_queue(struct ena_adapter *adapter, struct ena_ring *tx_ring) { + unsigned long miss_tx_comp_to_jiffies = adapter->missing_tx_completion_to_jiffies; struct ena_napi *ena_napi = container_of(tx_ring->napi, struct ena_napi, napi); enum ena_regs_reset_reason_types reset_reason = ENA_REGS_RESET_MISS_TX_CMPL; - unsigned int time_since_last_napi; - unsigned int missing_tx_comp_to; - bool is_tx_comp_time_expired; + u32 missed_tx_thresh = adapter->missing_tx_completion_threshold; + struct net_device *netdev = adapter->netdev; + unsigned long jiffies_since_last_napi; + unsigned long jiffies_since_last_intr; + unsigned long graceful_timeout; struct ena_tx_buffer *tx_buf; - unsigned long last_jiffies; + unsigned long timeout; int napi_scheduled; u32 missed_tx = 0; + bool is_expired; int i, rc = 0; - missing_tx_comp_to = jiffies_to_msecs(adapter->missing_tx_completion_to); - for (i = 0; i < tx_ring->ring_size; i++) { tx_buf = &tx_ring->tx_buffer_info[i]; - last_jiffies = tx_buf->last_jiffies; - - if (last_jiffies == 0) - /* no pending Tx at this location */ + if (tx_buf->tx_sent_jiffies == 0) + /* No pending Tx at this location */ continue; - is_tx_comp_time_expired = time_is_before_jiffies(last_jiffies + - 2 * adapter->missing_tx_completion_to); + timeout = tx_buf->tx_sent_jiffies + miss_tx_comp_to_jiffies; + graceful_timeout = timeout + miss_tx_comp_to_jiffies; - if (unlikely(!READ_ONCE(ena_napi->first_interrupt) && is_tx_comp_time_expired)) { - /* If after graceful period interrupt is still not - * received, we schedule a reset - */ - netif_err(adapter, tx_err, adapter->netdev, + /* Checking if current TX ring didn't get first interrupt */ + is_expired = time_is_before_jiffies(graceful_timeout); + if (unlikely(READ_ONCE(ena_napi->last_intr_jiffies) == 0 && is_expired)) { + /* If first interrupt is still not received, schedule a reset */ + netif_err(adapter, tx_err, netdev, "Potential MSIX issue on Tx side Queue = %d. Reset the device\n", tx_ring->qid); ena_reset_device(adapter, ENA_REGS_RESET_MISS_INTERRUPT); return -EIO; } - is_tx_comp_time_expired = time_is_before_jiffies(last_jiffies + - adapter->missing_tx_completion_to); - - if (unlikely(is_tx_comp_time_expired)) { - - time_since_last_napi = jiffies_to_usecs(jiffies - tx_ring->tx_stats.last_napi_jiffies); - napi_scheduled = !!(ena_napi->napi.state & NAPIF_STATE_SCHED); - - if (missing_tx_comp_to < time_since_last_napi && napi_scheduled) { - /* We suspect napi isn't called because the - * bottom half is not run. 
Require a bigger - * timeout for these cases + /* Checking if current TX buffer got timeout */ + is_expired = time_is_before_jiffies(timeout); + if (unlikely(is_expired)) { + /* Checking if current TX ring got NAPI timeout */ + unsigned long last_napi = READ_ONCE(tx_ring->tx_stats.last_napi_jiffies); + + jiffies_since_last_napi = jiffies - last_napi; + jiffies_since_last_intr = jiffies - READ_ONCE(ena_napi->last_intr_jiffies); + napi_scheduled = !!(READ_ONCE(ena_napi->napi.state) & NAPIF_STATE_SCHED); + if (jiffies_since_last_napi > miss_tx_comp_to_jiffies && napi_scheduled) { + /* We suspect napi isn't called because the bottom half is not run. + * Require a bigger timeout for these cases. */ - if (!time_is_before_jiffies(last_jiffies + - 2 * adapter->missing_tx_completion_to)) + if (time_is_after_jiffies(graceful_timeout)) continue; reset_reason = ENA_REGS_RESET_SUSPECTED_POLL_STARVATION; @@ -3931,30 +4120,43 @@ static int check_missing_comp_in_tx_queue(struct ena_adapter *adapter, if (tx_buf->print_once) continue; - netif_notice(adapter, tx_err, adapter->netdev, - "TX hasn't completed, qid %d, index %d. %u usecs from last napi execution, napi scheduled: %d\n", - tx_ring->qid, i, time_since_last_napi, napi_scheduled); + netif_notice(adapter, tx_err, netdev, + "TX hasn't completed, qid %d, index %d. %u msecs since last interrupt, %u msecs since last napi execution, napi scheduled: %d\n", + tx_ring->qid, i, jiffies_to_msecs(jiffies_since_last_intr), + jiffies_to_msecs(jiffies_since_last_napi), napi_scheduled); missed_tx++; tx_buf->print_once = 1; } } - if (unlikely(missed_tx > adapter->missing_tx_completion_threshold)) { - netif_err(adapter, tx_err, adapter->netdev, - "Lost TX completions are above the threshold (%d > %d). Completion transmission timeout: %u.\n", + /* Checking if this TX ring got to max missing TX completes */ + if (unlikely(missed_tx > missed_tx_thresh)) { + jiffies_since_last_intr = jiffies - READ_ONCE(ena_napi->last_intr_jiffies); + jiffies_since_last_napi = jiffies - READ_ONCE(tx_ring->tx_stats.last_napi_jiffies); + netif_err(adapter, tx_err, netdev, + "Lost TX completions are above the threshold (%d > %d). Completion transmission timeout: %u (msec). 
%u msecs since last interrupt, %u msecs since last napi execution.\n", missed_tx, - adapter->missing_tx_completion_threshold, - missing_tx_comp_to); - netif_err(adapter, tx_err, adapter->netdev, - "Resetting the device\n"); - + missed_tx_thresh, + jiffies_to_msecs(miss_tx_comp_to_jiffies), + jiffies_to_msecs(jiffies_since_last_intr), + jiffies_to_msecs(jiffies_since_last_napi)); + netif_err(adapter, tx_err, netdev, "Resetting the device\n"); + /* Set the reset flag to prevent NAPI from running */ + set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); + /* Need to make sure that reset reason is visible to ena_io_poll to prevent it + * from accessing CQ concurrently with check_cdesc_in_tx_cq() + */ + smp_mb(); + napi_scheduled = !!(READ_ONCE(ena_napi->napi.state) & NAPIF_STATE_SCHED); + if (!napi_scheduled) + reset_reason = check_cdesc_in_tx_cq(adapter, tx_ring); + /* Update reset reason */ ena_reset_device(adapter, reset_reason); rc = -EIO; } - ena_increase_stat(&tx_ring->tx_stats.missed_tx, missed_tx, - &tx_ring->syncp); + ena_increase_stat(&tx_ring->tx_stats.missed_tx, missed_tx, &tx_ring->syncp); return rc; } @@ -3963,10 +4165,11 @@ static void check_for_missing_completions(struct ena_adapter *adapter) { struct ena_ring *tx_ring; struct ena_ring *rx_ring; - int i, budget, rc; + int qid, budget, rc; int io_queue_count; io_queue_count = adapter->xdp_num_queues + adapter->num_io_queues; + /* Make sure the driver doesn't turn the device in other process */ smp_rmb(); @@ -3976,30 +4179,32 @@ static void check_for_missing_completions(struct ena_adapter *adapter) if (test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags)) return; - if (adapter->missing_tx_completion_to == ENA_HW_HINTS_NO_TIMEOUT) + if (adapter->missing_tx_completion_to_jiffies == ENA_HW_HINTS_NO_TIMEOUT) return; - budget = ENA_MONITORED_TX_QUEUES; + budget = min_t(u32, io_queue_count, ENA_MONITORED_TX_QUEUES); - for (i = adapter->last_monitored_tx_qid; i < io_queue_count; i++) { - tx_ring = &adapter->tx_ring[i]; - rx_ring = &adapter->rx_ring[i]; + qid = adapter->last_monitored_tx_qid; + + while (budget) { + qid = (qid + 1) % io_queue_count; + + tx_ring = &adapter->tx_ring[qid]; + rx_ring = &adapter->rx_ring[qid]; rc = check_missing_comp_in_tx_queue(adapter, tx_ring); if (unlikely(rc)) return; - rc = !ENA_IS_XDP_INDEX(adapter, i) ? + rc = !ENA_IS_XDP_INDEX(adapter, qid) ? 
check_for_rx_interrupt_queue(adapter, rx_ring) : 0; if (unlikely(rc)) return; budget--; - if (!budget) - break; } - adapter->last_monitored_tx_qid = i % io_queue_count; + adapter->last_monitored_tx_qid = qid; } /* trigger napi schedule after 2 consecutive detections */ @@ -4074,8 +4279,6 @@ static void check_for_missing_keep_alive(struct ena_adapter *adapter) if (unlikely(time_is_before_jiffies(keep_alive_expired))) { netif_err(adapter, drv, adapter->netdev, "Keep alive watchdog timeout.\n"); - ena_increase_stat(&adapter->dev_stats.wd_expired, 1, - &adapter->syncp); ena_reset_device(adapter, ENA_REGS_RESET_KEEP_ALIVE_TO); } } @@ -4085,8 +4288,6 @@ static void check_for_admin_com_state(struct ena_adapter *adapter) if (unlikely(!ena_com_get_admin_running_state(adapter->ena_dev))) { netif_err(adapter, drv, adapter->netdev, "ENA admin queue is not in running state!\n"); - ena_increase_stat(&adapter->dev_stats.admin_q_pause, 1, - &adapter->syncp); ena_reset_device(adapter, ENA_REGS_RESET_ADMIN_TO); } } @@ -4111,9 +4312,9 @@ static void ena_update_hints(struct ena_adapter *adapter, if (hints->missing_tx_completion_timeout) { if (hints->missing_tx_completion_timeout == ENA_HW_HINTS_NO_TIMEOUT) - adapter->missing_tx_completion_to = ENA_HW_HINTS_NO_TIMEOUT; + adapter->missing_tx_completion_to_jiffies = ENA_HW_HINTS_NO_TIMEOUT; else - adapter->missing_tx_completion_to = + adapter->missing_tx_completion_to_jiffies = msecs_to_jiffies(hints->missing_tx_completion_timeout); } @@ -4354,115 +4555,6 @@ static void ena_release_bars(struct ena_com_dev *ena_dev, struct pci_dev *pdev) pci_release_selected_regions(pdev, release_bars); } - -static int ena_calc_io_queue_size(struct ena_adapter *adapter, - struct ena_com_dev_get_features_ctx *get_feat_ctx) -{ - struct ena_admin_feature_llq_desc *llq = &get_feat_ctx->llq; - struct ena_com_dev *ena_dev = adapter->ena_dev; - u32 tx_queue_size = ENA_DEFAULT_RING_SIZE; - bool tx_configured, rx_configured; - u32 max_tx_queue_size; - u32 max_rx_queue_size; - - /* If this function is called after driver load, the ring sizes have - * already been configured. Take it into account when recalculating ring - * size. - */ - tx_configured = !!adapter->tx_ring[0].ring_size; - rx_configured = !!adapter->rx_ring[0].ring_size; - tx_queue_size = tx_configured ? adapter->tx_ring[0].ring_size : tx_queue_size; - rx_queue_size = rx_configured ? 
adapter->rx_ring[0].ring_size : rx_queue_size; - - if (ena_dev->supported_features & BIT(ENA_ADMIN_MAX_QUEUES_EXT)) { - struct ena_admin_queue_ext_feature_fields *max_queue_ext = - &get_feat_ctx->max_queue_ext.max_queue_ext; - max_rx_queue_size = min_t(u32, max_queue_ext->max_rx_cq_depth, - max_queue_ext->max_rx_sq_depth); - max_tx_queue_size = max_queue_ext->max_tx_cq_depth; - - if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) - max_tx_queue_size = min_t(u32, max_tx_queue_size, - llq->max_llq_depth); - else - max_tx_queue_size = min_t(u32, max_tx_queue_size, - max_queue_ext->max_tx_sq_depth); - - adapter->max_tx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, - max_queue_ext->max_per_packet_tx_descs); - adapter->max_rx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, - max_queue_ext->max_per_packet_rx_descs); - } else { - struct ena_admin_queue_feature_desc *max_queues = - &get_feat_ctx->max_queues; - max_rx_queue_size = min_t(u32, max_queues->max_cq_depth, - max_queues->max_sq_depth); - max_tx_queue_size = max_queues->max_cq_depth; - - if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) - max_tx_queue_size = min_t(u32, max_tx_queue_size, - llq->max_llq_depth); - else - max_tx_queue_size = min_t(u32, max_tx_queue_size, - max_queues->max_sq_depth); - - adapter->max_tx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, - max_queues->max_packet_tx_descs); - adapter->max_rx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, - max_queues->max_packet_rx_descs); - } - - max_tx_queue_size = rounddown_pow_of_two(max_tx_queue_size); - max_rx_queue_size = rounddown_pow_of_two(max_rx_queue_size); - - if (max_tx_queue_size < ENA_MIN_RING_SIZE) { - netdev_err(adapter->netdev, "Device max TX queue size: %d < minimum: %d\n", - max_tx_queue_size, ENA_MIN_RING_SIZE); - return -EFAULT; - } - - if (max_rx_queue_size < ENA_MIN_RING_SIZE) { - netdev_err(adapter->netdev, "Device max RX queue size: %d < minimum: %d\n", - max_rx_queue_size, ENA_MIN_RING_SIZE); - return -EFAULT; - } - - /* When forcing large headers, we multiply the entry size by 2, - * and therefore divide the queue size by 2, leaving the amount - * of memory used by the queues unchanged. 
- */ - if (adapter->large_llq_header_enabled) { - if ((llq->entry_size_ctrl_supported & ENA_ADMIN_LIST_ENTRY_SIZE_256B) && - (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV)) { - max_tx_queue_size /= 2; - dev_info(&adapter->pdev->dev, - "Forcing large headers and decreasing maximum TX queue size to %d\n", - max_tx_queue_size); - } else { - dev_err(&adapter->pdev->dev, - "Forcing large headers failed: LLQ is disabled or device does not support large headers\n"); - - adapter->large_llq_header_enabled = false; - ena_devlink_disable_large_llq_header_param(adapter->devlink); - } - } - - tx_queue_size = clamp_val(tx_queue_size, ENA_MIN_RING_SIZE, - max_tx_queue_size); - rx_queue_size = clamp_val(rx_queue_size, ENA_MIN_RING_SIZE, - max_rx_queue_size); - - tx_queue_size = rounddown_pow_of_two(tx_queue_size); - rx_queue_size = rounddown_pow_of_two(rx_queue_size); - - adapter->max_tx_ring_size = max_tx_queue_size; - adapter->max_rx_ring_size = max_rx_queue_size; - adapter->requested_tx_ring_size = tx_queue_size; - adapter->requested_rx_ring_size = rx_queue_size; - - return 0; -} - /* ena_probe - Device Initialization Routine * @pdev: PCI device information struct * @ent: entry in ena_pci_tbl @@ -4480,7 +4572,6 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) struct ena_adapter *adapter; struct net_device *netdev; static int adapters_found; - struct devlink *devlink; u32 max_num_io_queues; bool wd_state; int bars, rc; @@ -4569,28 +4660,23 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto err_netdev_destroy; } - adapter->large_llq_header_enabled = !!force_large_llq_header; + adapter->llq_policy = ENA_LLQ_HEADER_SIZE_POLICY_UNSPECIFIED; + ena_set_forced_llq_size_policy(adapter); #ifdef ENA_PHC_SUPPORT ena_phc_enable(adapter, !!phc_enable); #endif /* ENA_PHC_SUPPORT */ rc = ena_com_allocate_customer_metrics_buffer(ena_dev); - if (rc) { + if (unlikely(rc)) { netdev_err(netdev, "ena_com_allocate_customer_metrics_buffer failed\n"); goto err_free_phc; } - devlink = ena_devlink_alloc(adapter); - if (!devlink) { - netdev_err(netdev, "ena_devlink_alloc failed\n"); - goto err_metrics_destroy; - } - rc = ena_map_llq_mem_bar(pdev, ena_dev, bars); if (rc) { dev_err(&pdev->dev, "ENA LLQ bar mapping failed\n"); - goto err_devlink_destroy; + goto err_metrics_destroy; } rc = ena_device_init(adapter, pdev, &get_feat_ctx, &wd_state); @@ -4598,7 +4684,7 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) dev_err(&pdev->dev, "ENA device init failed\n"); if (rc == -ETIME) rc = -EPROBE_DEFER; - goto err_devlink_destroy; + goto err_metrics_destroy; } /* Initial TX and RX interrupt delay. Assumes 1 usec granularity. 
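
In the probe changes above, llq_policy starts out as ENA_LLQ_HEADER_SIZE_POLICY_UNSPECIFIED and ena_set_forced_llq_size_policy() overrides it only when the force_large_llq_header module parameter was explicitly set, which is why the parameter defaults to the 0xFFFF sentinel rather than 0; an unspecified policy later defers to the device's recommended entry size. A small sketch of that precedence, with illustrative names:

#include <stdio.h>

/* Illustrative sketch, not driver code. */
#define PARAM_UNSET 0xFFFF

enum llq_policy { POLICY_UNSPECIFIED, POLICY_NORMAL, POLICY_LARGE };

/* User selection (module param) wins; otherwise follow the device hint. */
static enum llq_policy resolve_llq_policy(int param, int dev_recommends_large)
{
	if (param != PARAM_UNSET)
		return param ? POLICY_LARGE : POLICY_NORMAL;
	return dev_recommends_large ? POLICY_LARGE : POLICY_NORMAL;
}

int main(void)
{
	printf("%d\n", resolve_llq_policy(PARAM_UNSET, 1)); /* 2: device hint */
	printf("%d\n", resolve_llq_policy(0, 1));           /* 1: user forces normal */
	printf("%d\n", resolve_llq_policy(1, 0));           /* 2: user forces large */
	return 0;
}
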
@@ -4678,13 +4764,19 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto err_free_msix; } rc = ena_rss_init_default(adapter); - if (rc && (rc != -EOPNOTSUPP)) { + if (unlikely(rc && (rc != -EOPNOTSUPP))) { dev_err(&pdev->dev, "Cannot init RSS rc: %d\n", rc); goto err_terminate_sysfs; } ena_config_debug_area(adapter); +#ifdef ENA_XDP_NETLINK_ADVERTISEMENT + if (ena_xdp_legal_queue_count(adapter, adapter->num_io_queues)) + netdev->xdp_features = NETDEV_XDP_ACT_BASIC | + NETDEV_XDP_ACT_REDIRECT; + +#endif memcpy(adapter->netdev->perm_addr, adapter->mac_addr, netdev->addr_len); netif_carrier_off(netdev); @@ -4699,7 +4791,7 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) adapter->last_keep_alive_jiffies = jiffies; adapter->keep_alive_timeout = ENA_DEVICE_KALIVE_TIMEOUT; - adapter->missing_tx_completion_to = TX_TIMEOUT; + adapter->missing_tx_completion_to_jiffies = TX_TIMEOUT; adapter->missing_tx_completion_threshold = MAX_NUM_OF_TIMEOUTED_PACKETS; ena_update_hints(adapter, &get_feat_ctx.hw_hints); @@ -4721,8 +4813,6 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) adapters_found++; - ena_devlink_register(devlink, &pdev->dev); - return 0; err_rss: @@ -4741,8 +4831,6 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) err_device_destroy: ena_com_delete_host_info(ena_dev); ena_com_admin_destroy(ena_dev); -err_devlink_destroy: - ena_devlink_free(devlink); err_metrics_destroy: ena_com_delete_customer_metrics_buffer(ena_dev); err_free_phc: @@ -4773,15 +4861,10 @@ static void __ena_shutoff(struct pci_dev *pdev, bool shutdown) struct ena_adapter *adapter = pci_get_drvdata(pdev); struct ena_com_dev *ena_dev; struct net_device *netdev; - struct devlink *devlink; ena_dev = adapter->ena_dev; netdev = adapter->netdev; - devlink = adapter->devlink; - ena_devlink_unregister(devlink); - ena_devlink_free(devlink); - #ifdef CONFIG_RFS_ACCEL if ((adapter->msix_vecs >= 1) && (netdev->rx_cpu_rmap)) { free_irq_cpu_rmap(netdev->rx_cpu_rmap); @@ -4789,6 +4872,7 @@ static void __ena_shutoff(struct pci_dev *pdev, bool shutdown) } #endif /* CONFIG_RFS_ACCEL */ + ena_sysfs_terminate(&adapter->pdev->dev); /* Make sure timer and reset routine won't be called after * freeing device resources. */ @@ -5047,6 +5131,31 @@ static void ena_refresh_fw_capabilites(void *adapter_data, set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); } + +static void ena_conf_notification(void *adapter_data, + struct ena_admin_aenq_entry *aenq_e) +{ + struct ena_adapter *adapter = (struct ena_adapter *)adapter_data; + struct ena_admin_aenq_conf_notifications_desc *desc; + u64 bitmap; + int bit; + + desc = (struct ena_admin_aenq_conf_notifications_desc *)aenq_e; + bitmap = desc->notifications_bitmap; + + if (bitmap == 0) { + netif_dbg(adapter, drv, adapter->netdev, + "Empty configuration notification bitmap\n"); + return; + } + + for_each_set_bit(bit, (unsigned long *)&bitmap, BITS_PER_TYPE(bitmap)) { + netif_info(adapter, drv, adapter->netdev, + "Sub-optimal configuration notification code: %d. 
Refer to AWS ENA documentation for additional details and mitigation options.\n", + bit + 1); + } +} + /* This handler will called for unknown event group or unimplemented handlers*/ static void unimplemented_aenq_handler(void *data, struct ena_admin_aenq_entry *aenq_e) @@ -5062,6 +5171,7 @@ static struct ena_aenq_handlers aenq_handlers = { [ENA_ADMIN_LINK_CHANGE] = ena_update_on_link_change, [ENA_ADMIN_NOTIFICATION] = ena_notification, [ENA_ADMIN_KEEP_ALIVE] = ena_keep_alive_wd, + [ENA_ADMIN_CONF_NOTIFICATIONS] = ena_conf_notification, [ENA_ADMIN_REFRESH_CAPABILITIES] = ena_refresh_fw_capabilites, }, .unimplemented_handler = unimplemented_aenq_handler diff --git a/drivers/amazon/net/ena/ena_netdev.h b/drivers/amazon/net/ena/ena_netdev.h index 97bdd08853400..9b3b20a5b680f 100644 --- a/drivers/amazon/net/ena/ena_netdev.h +++ b/drivers/amazon/net/ena/ena_netdev.h @@ -19,6 +19,9 @@ #include #include #include +#ifdef ENA_XDP_SUPPORT +#include +#endif #ifdef HAS_BPF_HEADER #include #endif @@ -28,8 +31,8 @@ #include "ena_eth_com.h" #define DRV_MODULE_GEN_MAJOR 2 -#define DRV_MODULE_GEN_MINOR 8 -#define DRV_MODULE_GEN_SUBMINOR 9 +#define DRV_MODULE_GEN_MINOR 10 +#define DRV_MODULE_GEN_SUBMINOR 0 #define DRV_MODULE_NAME "ena" #ifndef DRV_MODULE_GENERATION @@ -139,7 +142,7 @@ struct ena_irq { }; struct ena_napi { - u8 first_interrupt ____cacheline_aligned; + unsigned long last_intr_jiffies ____cacheline_aligned; u8 interrupts_masked; struct napi_struct napi; struct ena_ring *tx_ring; @@ -182,7 +185,7 @@ struct ena_tx_buffer { * a given packet is not expected to be handled by ena_start_xmit * and by napi/timer_service at the same time. */ - unsigned long last_jiffies; + unsigned long tx_sent_jiffies; struct ena_com_buf bufs[ENA_PKT_MAX_BUFS]; } ____cacheline_aligned; @@ -363,6 +366,17 @@ struct ena_stats_dev { u64 tx_drops; u64 rx_overruns; u64 reset_fail; + u64 total_resets; + u64 bad_tx_req_id; + u64 bad_rx_req_id; + u64 bad_rx_desc_num; + u64 missing_intr; + u64 suspected_poll_starvation; + u64 missing_tx_cmpl; + u64 rx_desc_malformed; + u64 tx_desc_malformed; + u64 invalid_state; + u64 os_netdev_wd; }; enum ena_flags_t { @@ -374,6 +388,17 @@ enum ena_flags_t { ENA_FLAG_ONGOING_RESET }; +enum ena_llq_header_size_policy_t { + /* Intermediate policy until llq configuration is initialized + * to either NORMAL or LARGE + */ + ENA_LLQ_HEADER_SIZE_POLICY_UNSPECIFIED = 0, + /* Policy for Normal size LLQ entry (128B) */ + ENA_LLQ_HEADER_SIZE_POLICY_NORMAL, + /* Policy for Large size LLQ entry (256B) */ + ENA_LLQ_HEADER_SIZE_POLICY_LARGE +}; + /* adapter specific private data structure */ struct ena_adapter { struct ena_com_dev *ena_dev; @@ -381,8 +406,6 @@ struct ena_adapter { struct net_device *netdev; struct pci_dev *pdev; - struct devlink *devlink; - /* rx packets that are shorter than this len will be copied to the skb * header */ @@ -412,12 +435,12 @@ struct ena_adapter { u32 msg_enable; - /* The flag is used for two purposes: - * 1. Indicates that large LLQ has been requested. + /* The policy is used for two purposes: + * 1. Indicates who decided on LLQ entry size (user / device) * 2. Indicates whether large LLQ is set or not after device * initialization / configuration. 
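
The resets_to_stats_offset_map table added below associates each reset reason with the offset of its counter inside struct ena_stats_dev, computed with offsetof() and scaled to u64 slots, which lets ena_reset_device() bump the matching counter without a switch statement. The same pattern in standalone form, with abbreviated names:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative sketch, not driver code. */
struct dev_stats {
	uint64_t total_resets;
	uint64_t wd_expired;
	uint64_t missing_tx_cmpl;
};

enum reset_reason { RESET_KEEP_ALIVE, RESET_MISS_TX_CMPL, RESET_LAST };

struct stat_slot {
	size_t offset;   /* index of the counter, in u64 units */
	int has_counter;
};

#define STATS_ENTRY(reason, field) \
	[reason] = { offsetof(struct dev_stats, field) / sizeof(uint64_t), 1 }

static const struct stat_slot reason_map[RESET_LAST] = {
	STATS_ENTRY(RESET_KEEP_ALIVE, wd_expired),
	STATS_ENTRY(RESET_MISS_TX_CMPL, missing_tx_cmpl),
};

static void count_reset(struct dev_stats *stats, enum reset_reason reason)
{
	if (reason_map[reason].has_counter)
		((uint64_t *)stats)[reason_map[reason].offset]++;
	stats->total_resets++; /* every reset is counted here */
}

int main(void)
{
	struct dev_stats stats = { 0 };

	count_reset(&stats, RESET_MISS_TX_CMPL);
	printf("%llu %llu\n",
	       (unsigned long long)stats.total_resets,
	       (unsigned long long)stats.missing_tx_cmpl); /* 1 1 */
	return 0;
}
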
*/ - bool large_llq_header_enabled; + enum ena_llq_header_size_policy_t llq_policy; bool large_llq_header_supported; u16 max_tx_sgl_size; @@ -426,7 +449,7 @@ struct ena_adapter { u8 mac_addr[ETH_ALEN]; unsigned long keep_alive_timeout; - unsigned long missing_tx_completion_to; + unsigned long missing_tx_completion_to_jiffies; char name[ENA_NAME_MAX_LEN]; #ifdef ENA_PHC_SUPPORT @@ -473,6 +496,32 @@ struct ena_adapter { u32 xdp_num_queues; }; +#define ENA_RESET_STATS_ENTRY(reset_reason, stat) \ + [reset_reason] = { \ + .stat_offset = offsetof(struct ena_stats_dev, stat) / sizeof(u64), \ + .has_counter = true \ +} + +struct ena_reset_stats_offset { + int stat_offset; + bool has_counter; +}; + +static const struct ena_reset_stats_offset resets_to_stats_offset_map[ENA_REGS_RESET_LAST] = { + ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_KEEP_ALIVE_TO, wd_expired), + ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_ADMIN_TO, admin_q_pause), + ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_MISS_TX_CMPL, missing_tx_cmpl), + ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_INV_RX_REQ_ID, bad_rx_req_id), + ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_INV_TX_REQ_ID, bad_tx_req_id), + ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_TOO_MANY_RX_DESCS, bad_rx_desc_num), + ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_DRIVER_INVALID_STATE, invalid_state), + ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_OS_NETDEV_WD, os_netdev_wd), + ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_MISS_INTERRUPT, missing_intr), + ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_SUSPECTED_POLL_STARVATION, suspected_poll_starvation), + ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_RX_DESCRIPTOR_MALFORMED, rx_desc_malformed), + ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_TX_DESCRIPTOR_MALFORMED, tx_desc_malformed), +}; + void ena_set_ethtool_ops(struct net_device *netdev); void ena_dump_stats_to_dmesg(struct ena_adapter *adapter); @@ -482,14 +531,24 @@ void ena_dump_stats_to_buf(struct ena_adapter *adapter, u8 *buf); int ena_set_lpc_state(struct ena_adapter *adapter, bool enabled); -int ena_update_queue_sizes(struct ena_adapter *adapter, - u32 new_tx_size, - u32 new_rx_size); +int ena_update_queue_params(struct ena_adapter *adapter, + u32 new_tx_size, + u32 new_rx_size, + u32 new_llq_header_len); int ena_update_queue_count(struct ena_adapter *adapter, u32 new_channel_count); int ena_set_rx_copybreak(struct ena_adapter *adapter, u32 rx_copybreak); +/* Increase a stat by cnt while holding syncp seqlock on 32bit machines */ +static inline void ena_increase_stat(u64 *statp, u64 cnt, + struct u64_stats_sync *syncp) +{ + u64_stats_update_begin(syncp); + (*statp) += cnt; + u64_stats_update_end(syncp); +} + int ena_get_sset_count(struct net_device *netdev, int sset); #ifdef ENA_BUSY_POLL_SUPPORT static inline void ena_bp_init_lock(struct ena_ring *rx_ring) @@ -564,6 +623,16 @@ static inline bool ena_bp_disable(struct ena_ring *rx_ring) static inline void ena_reset_device(struct ena_adapter *adapter, enum ena_regs_reset_reason_types reset_reason) { + const struct ena_reset_stats_offset *ena_reset_stats_offset = + &resets_to_stats_offset_map[reset_reason]; + + if (ena_reset_stats_offset->has_counter) { + u64 *stat_ptr = (u64 *)&adapter->dev_stats + ena_reset_stats_offset->stat_offset; + + ena_increase_stat(stat_ptr, 1, &adapter->syncp); + } + + ena_increase_stat(&adapter->dev_stats.total_resets, 1, &adapter->syncp); adapter->reset_reason = reset_reason; /* Make sure reset reason is set before triggering the reset */ smp_mb__before_atomic(); @@ -583,15 +652,6 @@ int ena_restore_device(struct ena_adapter *adapter); int handle_invalid_req_id(struct 
ena_ring *ring, u16 req_id, struct ena_tx_buffer *tx_info, bool is_xdp); -/* Increase a stat by cnt while holding syncp seqlock on 32bit machines */ -static inline void ena_increase_stat(u64 *statp, u64 cnt, - struct u64_stats_sync *syncp) -{ - u64_stats_update_begin(syncp); - (*statp) += cnt; - u64_stats_update_end(syncp); -} - static inline void ena_ring_tx_doorbell(struct ena_ring *tx_ring) { ena_com_write_sq_doorbell(tx_ring->ena_com_io_sq); diff --git a/drivers/amazon/net/ena/ena_phc.c b/drivers/amazon/net/ena/ena_phc.c index 5b637ef79bc04..705824aab2ef5 100644 --- a/drivers/amazon/net/ena/ena_phc.c +++ b/drivers/amazon/net/ena/ena_phc.c @@ -3,7 +3,8 @@ * Copyright 2015-2022 Amazon.com, Inc. or its affiliates. All rights reserved. */ -#include "ena_devlink.h" +#include +#include "ena_netdev.h" #include "ena_phc.h" #ifdef ENA_PHC_SUPPORT @@ -39,7 +40,7 @@ static int ena_phc_gettimex64(struct ptp_clock_info *clock_info, struct timespec ptp_read_system_prets(sts); - rc = ena_com_phc_get(phc_info->adapter->ena_dev, ×tamp_nsec); + rc = ena_com_phc_get_timestamp(phc_info->adapter->ena_dev, ×tamp_nsec); ptp_read_system_postts(sts); @@ -60,7 +61,7 @@ static int ena_phc_gettime64(struct ptp_clock_info *clock_info, struct timespec6 spin_lock_irqsave(&phc_info->lock, flags); - rc = ena_com_phc_get(phc_info->adapter->ena_dev, ×tamp_nsec); + rc = ena_com_phc_get_timestamp(phc_info->adapter->ena_dev, ×tamp_nsec); spin_unlock_irqrestore(&phc_info->lock, flags); @@ -87,7 +88,7 @@ static int ena_phc_gettime(struct ptp_clock_info *clock_info, struct timespec *t spin_lock_irqsave(&phc_info->lock, flags); - rc = ena_com_phc_get(phc_info->adapter->ena_dev, ×tamp_nsec); + rc = ena_com_phc_get_timestamp(phc_info->adapter->ena_dev, ×tamp_nsec); spin_unlock_irqrestore(&phc_info->lock, flags); @@ -167,6 +168,10 @@ static int ena_phc_register(struct ena_adapter *adapter) phc_info = adapter->phc_info; clock_info = &phc_info->clock_info; + /* PHC may already be registered in case of a reset */ + if (ena_phc_is_active(adapter)) + return 0; + phc_info->adapter = adapter; spin_lock_init(&phc_info->lock); @@ -192,7 +197,8 @@ static void ena_phc_unregister(struct ena_adapter *adapter) { struct ena_phc_info *phc_info = adapter->phc_info; - if (ena_phc_is_active(adapter)) { + /* During reset flow, PHC must stay registered to keep kernel's PHC index */ + if (ena_phc_is_active(adapter) && !test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags)) { ptp_clock_unregister(phc_info->clock); phc_info->clock = NULL; } @@ -263,7 +269,6 @@ int ena_phc_init(struct ena_adapter *adapter) ena_com_phc_destroy(ena_dev); err_ena_com_phc_init: ena_phc_enable(adapter, false); - ena_devlink_disable_phc_param(adapter->devlink); return rc; } @@ -281,4 +286,11 @@ int ena_phc_get_index(struct ena_adapter *adapter) return -1; } +int ena_phc_get_error_bound(struct ena_adapter *adapter, u32 *error_bound_nsec) +{ + if (!ena_phc_is_active(adapter)) + return -EOPNOTSUPP; + + return ena_com_phc_get_error_bound(adapter->ena_dev, error_bound_nsec); +} #endif /* ENA_PHC_SUPPORT */ diff --git a/drivers/amazon/net/ena/ena_phc.h b/drivers/amazon/net/ena/ena_phc.h index bb644d5f928fa..5252fc7081199 100644 --- a/drivers/amazon/net/ena/ena_phc.h +++ b/drivers/amazon/net/ena/ena_phc.h @@ -35,6 +35,7 @@ int ena_phc_init(struct ena_adapter *adapter); void ena_phc_destroy(struct ena_adapter *adapter); int ena_phc_alloc(struct ena_adapter *adapter); void ena_phc_free(struct ena_adapter *adapter); +int ena_phc_get_error_bound(struct ena_adapter *adapter, u32 
*error_bound); #else /* ENA_PHC_SUPPORT */ static inline void ena_phc_enable(struct ena_adapter *adapter, bool enable) { } @@ -45,6 +46,10 @@ static inline int ena_phc_init(struct ena_adapter *adapter) { return 0; } static inline void ena_phc_destroy(struct ena_adapter *adapter) { } static inline int ena_phc_alloc(struct ena_adapter *adapter) { return 0; } static inline void ena_phc_free(struct ena_adapter *adapter) { } +static inline int ena_phc_get_error_bound(struct ena_adapter *adapter, u32 *error_bound) +{ + return 0; +} #endif /* ENA_PHC_SUPPORT */ #endif /* ENA_PHC_H */ diff --git a/drivers/amazon/net/ena/ena_regs_defs.h b/drivers/amazon/net/ena/ena_regs_defs.h index bdbbc8b18df63..9a5a22fb4114e 100644 --- a/drivers/amazon/net/ena/ena_regs_defs.h +++ b/drivers/amazon/net/ena/ena_regs_defs.h @@ -23,6 +23,7 @@ enum ena_regs_reset_reason_types { ENA_REGS_RESET_MISS_INTERRUPT = 14, ENA_REGS_RESET_SUSPECTED_POLL_STARVATION = 15, ENA_REGS_RESET_RX_DESCRIPTOR_MALFORMED = 16, + ENA_REGS_RESET_TX_DESCRIPTOR_MALFORMED = 17, ENA_REGS_RESET_LAST, }; @@ -105,6 +106,8 @@ enum ena_regs_reset_reason_types { #define ENA_REGS_DEV_CTL_QUIESCENT_MASK 0x4 #define ENA_REGS_DEV_CTL_IO_RESUME_SHIFT 3 #define ENA_REGS_DEV_CTL_IO_RESUME_MASK 0x8 +#define ENA_REGS_DEV_CTL_RESET_REASON_EXT_SHIFT 24 +#define ENA_REGS_DEV_CTL_RESET_REASON_EXT_MASK 0xf000000 #define ENA_REGS_DEV_CTL_RESET_REASON_SHIFT 28 #define ENA_REGS_DEV_CTL_RESET_REASON_MASK 0xf0000000 diff --git a/drivers/amazon/net/ena/ena_sysfs.c b/drivers/amazon/net/ena/ena_sysfs.c index 98e1f7ecd0f09..109203f5b349a 100644 --- a/drivers/amazon/net/ena/ena_sysfs.c +++ b/drivers/amazon/net/ena/ena_sysfs.c @@ -10,6 +10,9 @@ #include "ena_com.h" #include "ena_netdev.h" +#ifdef ENA_PHC_SUPPORT +#include "ena_phc.h" +#endif /* ENA_PHC_SUPPORT */ #include "ena_sysfs.h" @@ -51,6 +54,76 @@ static ssize_t ena_show_rx_copybreak(struct device *dev, static DEVICE_ATTR(rx_copybreak, S_IRUGO | S_IWUSR, ena_show_rx_copybreak, ena_store_rx_copybreak); +#ifdef ENA_PHC_SUPPORT +/* Max PHC error bound string size takes into account max u32 value, null and new line characters */ +#define ENA_PHC_ERROR_BOUND_STR_MAX_LEN 12 + +static ssize_t ena_show_phc_error_bound(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ena_adapter *adapter = dev_get_drvdata(dev); + u32 error_bound_nsec = 0; + int rc; + + rc = ena_phc_get_error_bound(adapter, &error_bound_nsec); + if (rc != 0) + return rc; + + return snprintf(buf, ENA_PHC_ERROR_BOUND_STR_MAX_LEN, "%u\n", error_bound_nsec); +} + +static DEVICE_ATTR(phc_error_bound, S_IRUGO, ena_show_phc_error_bound, NULL); +#endif /* ENA_PHC_SUPPORT */ + +static ssize_t ena_large_llq_set(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t len) +{ + struct ena_adapter *adapter = dev_get_drvdata(dev); + enum ena_llq_header_size_policy_t new_llq_policy; + unsigned long large_llq_enabled; + int rc; + + rc = kstrtoul(buf, 10, &large_llq_enabled); + if (rc < 0) + return rc; + + if (large_llq_enabled != 0 && large_llq_enabled != 1) + return -EINVAL; + + rtnl_lock(); + new_llq_policy = large_llq_enabled ? ENA_LLQ_HEADER_SIZE_POLICY_LARGE : + ENA_LLQ_HEADER_SIZE_POLICY_NORMAL; + if (adapter->llq_policy == new_llq_policy) + goto unlock; + + adapter->llq_policy = new_llq_policy; + + ena_destroy_device(adapter, false); + rc = ena_restore_device(adapter); +unlock: + rtnl_unlock(); + + return rc ? 
rc : len; +} + +#define ENA_LARGE_LLQ_STR_MAX_LEN 3 + +static ssize_t ena_large_llq_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct ena_adapter *adapter = dev_get_drvdata(dev); + bool large_llq_enabled; + + large_llq_enabled = adapter->llq_policy == ENA_LLQ_HEADER_SIZE_POLICY_LARGE; + + return snprintf(buf, ENA_LARGE_LLQ_STR_MAX_LEN, "%d\n", + large_llq_enabled); +} + +static DEVICE_ATTR(large_llq_header, S_IRUGO | S_IWUSR, ena_large_llq_show, + ena_large_llq_set); /****************************************************************************** *****************************************************************************/ @@ -59,6 +132,15 @@ int ena_sysfs_init(struct device *dev) if (device_create_file(dev, &dev_attr_rx_copybreak)) dev_err(dev, "Failed to create rx_copybreak sysfs entry"); + +#ifdef ENA_PHC_SUPPORT + if (device_create_file(dev, &dev_attr_phc_error_bound)) + dev_err(dev, "Failed to create phc_error_bound sysfs entry"); + +#endif /* ENA_PHC_SUPPORT */ + + if (device_create_file(dev, &dev_attr_large_llq_header)) + dev_err(dev, "Failed to create large_llq_header sysfs entry"); return 0; } @@ -67,4 +149,8 @@ int ena_sysfs_init(struct device *dev) void ena_sysfs_terminate(struct device *dev) { device_remove_file(dev, &dev_attr_rx_copybreak); +#ifdef ENA_PHC_SUPPORT + device_remove_file(dev, &dev_attr_phc_error_bound); +#endif /* ENA_PHC_SUPPORT */ + device_remove_file(dev, &dev_attr_large_llq_header); } diff --git a/drivers/amazon/net/ena/ena_xdp.c b/drivers/amazon/net/ena/ena_xdp.c index 4d8c1709598de..f4a443401a37c 100644 --- a/drivers/amazon/net/ena/ena_xdp.c +++ b/drivers/amazon/net/ena/ena_xdp.c @@ -75,8 +75,7 @@ static int ena_xdp_tx_map_frame(struct ena_ring *tx_ring, int ena_xdp_xmit_frame(struct ena_ring *tx_ring, struct ena_adapter *adapter, - struct xdp_frame *xdpf, - int flags) + struct xdp_frame *xdpf) { struct ena_com_tx_ctx ena_tx_ctx = {}; struct ena_tx_buffer *tx_info; @@ -90,7 +89,7 @@ int ena_xdp_xmit_frame(struct ena_ring *tx_ring, rc = ena_xdp_tx_map_frame(tx_ring, tx_info, xdpf, &ena_tx_ctx); if (unlikely(rc)) - return rc; + goto err; ena_tx_ctx.req_id = req_id; @@ -103,17 +102,13 @@ int ena_xdp_xmit_frame(struct ena_ring *tx_ring, if (rc) goto error_unmap_dma; - /* trigger the dma engine. ena_ring_tx_doorbell() - * calls a memory barrier inside it. - */ - if (flags & XDP_XMIT_FLUSH) - ena_ring_tx_doorbell(tx_ring); - return rc; error_unmap_dma: ena_unmap_tx_buff(tx_ring, tx_info); +err: tx_info->xdpf = NULL; + return rc; } @@ -142,7 +137,7 @@ int ena_xdp_xmit(struct net_device *dev, int n, spin_lock(&tx_ring->xdp_tx_lock); for (i = 0; i < n; i++) { - if (ena_xdp_xmit_frame(tx_ring, adapter, frames[i], 0)) + if (ena_xdp_xmit_frame(tx_ring, adapter, frames[i])) break; nxmit++; } @@ -205,13 +200,14 @@ int ena_xdp_register_rxq_info(struct ena_ring *rx_ring) #ifdef AF_XDP_BUSY_POLL_SUPPORTED rc = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, rx_ring->qid, - rx_ring->napi->napi_id < 0); + rx_ring->napi->napi_id); #else rc = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, rx_ring->qid); #endif - netif_dbg(rx_ring->adapter, ifup, rx_ring->netdev, "Registering RX info for queue %d", - rx_ring->qid); + netif_dbg(rx_ring->adapter, ifup, rx_ring->netdev, + "Registering RX info for queue %d with napi id %d\n", + rx_ring->qid, rx_ring->napi->napi_id); if (rc) { netif_err(rx_ring->adapter, ifup, rx_ring->netdev, "Failed to register xdp rx queue info. 
RX queue num %d rc: %d\n", @@ -247,10 +243,10 @@ void ena_xdp_free_tx_bufs_zc(struct ena_ring *tx_ring) for (i = 0; i < tx_ring->ring_size; i++) { struct ena_tx_buffer *tx_info = &tx_ring->tx_buffer_info[i]; - if (tx_info->last_jiffies) + if (tx_info->tx_sent_jiffies) xsk_frames++; - tx_info->last_jiffies = 0; + tx_info->tx_sent_jiffies = 0; } if (xsk_frames) @@ -365,7 +361,9 @@ static int ena_xdp_set(struct net_device *netdev, struct netdev_bpf *bpf) if (rc) return rc; } + xdp_features_set_redirect_target(netdev, false); } else if (old_bpf_prog) { + xdp_features_clear_redirect_target(netdev); netif_dbg(adapter, drv, adapter->netdev, "Removing XDP program\n"); @@ -416,10 +414,10 @@ static int ena_xsk_pool_enable(struct ena_adapter *adapter, bool dev_was_up = false; int err; - if (!ena_xdp_legal_queue_count(adapter, qid)) { + if (qid >= adapter->num_io_queues) { netdev_err(adapter->netdev, "Max qid for XSK pool is %d (received %d)\n", - adapter->max_num_io_queues >> 1, qid); + adapter->num_io_queues, qid); return -EINVAL; } @@ -496,10 +494,10 @@ static int ena_xsk_pool_setup(struct ena_adapter *adapter, */ int ena_xdp(struct net_device *netdev, struct netdev_bpf *bpf) { -#if !defined(ENA_XDP_QUERY_IN_KERNEL) || defined(ENA_AF_XDP_SUPPORT) +#if defined(ENA_XDP_QUERY_IN_DRIVER) || defined(ENA_AF_XDP_SUPPORT) struct ena_adapter *adapter = netdev_priv(netdev); -#endif /* ENA_XDP_QUERY_IN_KERNEL || ENA_AF_XDP_SUPPORT */ +#endif /* ENA_XDP_QUERY_IN_DRIVER || ENA_AF_XDP_SUPPORT */ switch (bpf->command) { case XDP_SETUP_PROG: return ena_xdp_set(netdev, bpf); @@ -507,7 +505,7 @@ int ena_xdp(struct net_device *netdev, struct netdev_bpf *bpf) case XDP_SETUP_XSK_POOL: return ena_xsk_pool_setup(adapter, bpf->xsk.pool, bpf->xsk.queue_id); #endif /* ENA_AF_XDP_SUPPORT */ -#ifndef ENA_XDP_QUERY_IN_KERNEL +#ifdef ENA_XDP_QUERY_IN_DRIVER case XDP_QUERY_PROG: bpf->prog_id = adapter->xdp_bpf_prog ? 
adapter->xdp_bpf_prog->aux->id : 0; @@ -558,13 +556,10 @@ static bool ena_clean_xdp_irq(struct ena_ring *tx_ring, u32 budget) u32 total_done = 0; u16 next_to_clean; bool needs_wakeup; - u32 tx_bytes = 0; int tx_pkts = 0; u16 req_id; int rc; - if (unlikely(!tx_ring)) - return 0; next_to_clean = tx_ring->next_to_clean; while (tx_pkts < budget) { @@ -577,6 +572,11 @@ static bool ena_clean_xdp_irq(struct ena_ring *tx_ring, u32 budget) if (unlikely(rc == -EINVAL)) handle_invalid_req_id(tx_ring, req_id, NULL, true); + else if (unlikely(rc == -EFAULT)) { + ena_reset_device( + tx_ring->adapter, + ENA_REGS_RESET_TX_DESCRIPTOR_MALFORMED); + } break; } @@ -587,7 +587,7 @@ static bool ena_clean_xdp_irq(struct ena_ring *tx_ring, u32 budget) tx_info = &tx_ring->tx_buffer_info[req_id]; - tx_info->last_jiffies = 0; + tx_info->tx_sent_jiffies = 0; if (!is_zc_q) { xdpf = tx_info->xdpf; @@ -599,7 +599,6 @@ static bool ena_clean_xdp_irq(struct ena_ring *tx_ring, u32 budget) netif_dbg(tx_ring->adapter, tx_done, tx_ring->netdev, "tx_poll: q %d pkt #%d req_id %d\n", tx_ring->qid, tx_pkts, req_id); - tx_bytes += tx_info->total_tx_size; tx_pkts++; total_done += tx_info->tx_descs; @@ -871,7 +870,7 @@ static int ena_xdp_clean_rx_irq_zc(struct ena_ring *rx_ring, rx_ring->next_to_clean = next_to_clean; if (xdp_flags & ENA_XDP_REDIRECT) - xdp_do_flush_map(); + xdp_do_flush(); refill_required = ena_com_free_q_entries(rx_ring->ena_com_io_sq); refill_threshold = @@ -903,6 +902,9 @@ static int ena_xdp_clean_rx_irq_zc(struct ena_ring *rx_ring, ena_increase_stat(&rx_ring->rx_stats.bad_req_id, 1, &rx_ring->syncp); ena_reset_device(adapter, ENA_REGS_RESET_INV_RX_REQ_ID); + } else if (rc == -EFAULT) { + ena_reset_device(adapter, + ENA_REGS_RESET_RX_DESCRIPTOR_MALFORMED); } return 0; diff --git a/drivers/amazon/net/ena/ena_xdp.h b/drivers/amazon/net/ena/ena_xdp.h index dde8f9053f707..5729569058a7c 100644 --- a/drivers/amazon/net/ena/ena_xdp.h +++ b/drivers/amazon/net/ena/ena_xdp.h @@ -50,8 +50,7 @@ void ena_xdp_exchange_program_rx_in_range(struct ena_adapter *adapter, int ena_xdp_io_poll(struct napi_struct *napi, int budget); int ena_xdp_xmit_frame(struct ena_ring *tx_ring, struct ena_adapter *adapter, - struct xdp_frame *xdpf, - int flags); + struct xdp_frame *xdpf); int ena_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags); int ena_xdp(struct net_device *netdev, struct netdev_bpf *bpf); @@ -147,8 +146,7 @@ static inline int ena_xdp_execute(struct ena_ring *rx_ring, struct xdp_buff *xdp /* The XDP queues are shared between XDP_TX and XDP_REDIRECT */ spin_lock(&xdp_ring->xdp_tx_lock); - if (ena_xdp_xmit_frame(xdp_ring, rx_ring->adapter, xdpf, - XDP_XMIT_FLUSH)) + if (ena_xdp_xmit_frame(xdp_ring, rx_ring->adapter, xdpf)) xdp_return_frame(xdpf); spin_unlock(&xdp_ring->xdp_tx_lock); @@ -191,6 +189,7 @@ static inline int ena_xdp_execute(struct ena_ring *rx_ring, struct xdp_buff *xdp #else /* ENA_XDP_SUPPORT */ #define ENA_IS_XDP_INDEX(adapter, index) (false) +#define xdp_return_frame(frame) do {} while (0) static inline bool ena_xdp_present_ring(struct ena_ring *ring) { @@ -209,6 +208,16 @@ static inline int ena_xdp_register_rxq_info(struct ena_ring *rx_ring) static inline void ena_xdp_unregister_rxq_info(struct ena_ring *rx_ring) {} +static inline bool ena_xdp_legal_queue_count(struct ena_adapter *adapter, + u32 queues) +{ + return false; +} + +static inline bool ena_xdp_present(struct ena_adapter *adapter) +{ + return false; +} #endif /* ENA_XDP_SUPPORT */ #ifndef ENA_AF_XDP_SUPPORT /* stabs for AF XDP 
code */ diff --git a/drivers/amazon/net/ena/kcompat.h b/drivers/amazon/net/ena/kcompat.h index 62ddd400e787f..9926a8463fc2b 100644 --- a/drivers/amazon/net/ena/kcompat.h +++ b/drivers/amazon/net/ena/kcompat.h @@ -36,6 +36,8 @@ Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 #ifndef _KCOMPAT_H_ #define _KCOMPAT_H_ +#include "config.h" + #ifndef LINUX_VERSION_CODE #include #endif @@ -94,10 +96,6 @@ Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 #define SZ_16K 0x00004000 #endif -#ifdef HAVE_POLL_CONTROLLER -#define CONFIG_NET_POLL_CONTROLLER -#endif - #ifndef __GFP_COLD #define __GFP_COLD 0 #endif @@ -732,8 +730,8 @@ do { \ #define XDP_CONVERT_TO_FRAME_NAME_CHANGED #endif -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0) -#define ENA_XDP_QUERY_IN_KERNEL +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 9, 0) +#define ENA_XDP_QUERY_IN_DRIVER #endif #endif @@ -745,63 +743,6 @@ do { \ #define HAVE_NDO_TX_TIMEOUT_STUCK_QUEUE_PARAMETER #endif -#if defined(CONFIG_NET_DEVLINK) && LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0) -#define ENA_DEVLINK_SUPPORT -#endif - -#if !defined(CONFIG_NET_DEVLINK) && !defined(CONFIG_NET_DEVLINK_MODULE) && !defined(CONFIG_MAY_USE_DEVLINK) -#define ENA_NO_DEVLINK_HEADERS -#endif - -#if defined(CONFIG_NET_DEVLINK) && \ - (KERNEL_VERSION(5, 1, 0) <= LINUX_VERSION_CODE && LINUX_VERSION_CODE < KERNEL_VERSION(5, 16, 0) && \ - !((SUSE_VERSION != 0) && (SUSE_VERSION == 15 && (SUSE_PATCHLEVEL < 2 || SUSE_PATCHLEVEL >= 4))) && \ - !(defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE > UBUNTU_VERSION(5, 16, 0, 0)) && \ - !(RHEL_RELEASE_CODE)) -#define ENA_DEVLINK_PUBLISH_REQUIRED -#endif - -#if defined(CONFIG_NET_DEVLINK) && \ - (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 4, 0) || \ - (SUSE_VERSION && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 3, 18))) -#define ENA_DEVLINK_RELOAD_UP_DOWN_SUPPORTED -#endif - -#if defined(CONFIG_NET_DEVLINK) && \ - (KERNEL_VERSION(5, 4, 0) <= LINUX_VERSION_CODE && LINUX_VERSION_CODE < KERNEL_VERSION(5, 16, 0) && \ - !(defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) && \ - !(defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0))) -#define ENA_DEVLINK_RELOAD_ENABLING_REQUIRED -#endif - -#if defined(CONFIG_NET_DEVLINK) && \ - (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 5, 0) || \ - (SUSE_VERSION && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 3, 18))) -#define ENA_DEVLINK_RELOAD_NS_CHANGE_SUPPORT -#endif - -#if defined(CONFIG_NET_DEVLINK) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0) -#define ENA_DEVLINK_RELOAD_LIMIT_AND_ACTION_SUPPORT -#endif - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) || \ - (defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) || \ - (defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0)) -#define ENA_DEVLINK_RECEIVES_DEVICE_ON_ALLOC -#endif - -#if (KERNEL_VERSION(5, 16, 0) <= LINUX_VERSION_CODE && LINUX_VERSION_CODE < KERNEL_VERSION(6, 3, 0)) || \ - (defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) || \ - (defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0) && !(defined(FEDORA_RELEASE))) -#define ENA_DEVLINK_RELOAD_SUPPORT_ADVERTISEMENT_NEEDED -#endif - -#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 16, 0) && \ - !(defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) && \ - !(defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0)) -#define ENA_DEVLINK_CONFIGURE_AFTER_REGISTER -#endif - #if LINUX_VERSION_CODE 
< KERNEL_VERSION(3,19,0) && \ !(RHEL_RELEASE_CODE && ((RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7, 1)) && \ (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(6, 6)))) && \ @@ -1115,9 +1056,24 @@ static inline void ena_netif_napi_add(struct net_device *dev, #endif /* LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0) */ } -#if defined(ENA_DEVLINK_SUPPORT) && LINUX_VERSION_CODE < KERNEL_VERSION(6, 3, 0) -#define devl_param_driverinit_value_get devlink_param_driverinit_value_get -#define devl_param_driverinit_value_set devlink_param_driverinit_value_set +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0) +#define ENA_LARGE_LLQ_ETHTOOL +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0) +#include +#define ENA_FIELD_GET(value, mask, offset) FIELD_GET(mask, value) +#else +#define ENA_FIELD_GET(value, mask, offset) ((typeof(mask))((value & mask) >> offset)) +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 3, 0) +#define xdp_features_set_redirect_target(netdev, xdp_xmit_supported) +#define xdp_features_clear_redirect_target(netdev) +#define xdp_clear_features_flag(netdev) +#define xdp_set_features_flag(netdev, features) +#else /* LINUX_VERSION_CODE < KERNEL_VERSION(6, 3, 0) */ +#define ENA_XDP_NETLINK_ADVERTISEMENT #endif #if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7, 4))) || \ @@ -1168,4 +1124,12 @@ static inline void ena_dma_unmap_page_attrs(struct device *dev, #endif } +#ifndef ENA_HAVE_PCI_DEV_ID +#define pci_dev_id(pdev) ((((u16)(pdev->bus->number)) << 8) | (pdev->devfn)) +#endif /* ENA_HAVE_PCI_DEV_ID */ + +#ifndef ENA_HAVE_XDP_DO_FLUSH +#define xdp_do_flush xdp_do_flush_map +#endif /* ENA_HAVE_XDP_DO_FLUSH */ + #endif /* _KCOMPAT_H_ */ From fb4f98aed49439ed8537f0374a23be5cf86bcd71 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 13 Nov 2022 16:38:17 +0000 Subject: [PATCH 138/175] arm64: Add ID_DFR0_EL1.PerfMon values for PMUv3p7 and IMP_DEF commit d017eeabd5092565c3dd1c8a7b00ba724c33c18f upstream. Align the ID_DFR0_EL1.PerfMon values with ID_AA64DFR0_EL1.PMUver. Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221113163832.3154370-2-maz@kernel.org [ bp: clean ] Signed-off-by: Suraj Jitindar Singh --- arch/arm64/include/asm/sysreg.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 3a448ab0924b3..c362c45c91771 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -704,6 +704,8 @@ #define ID_DFR0_PERFMON_8_1 0x4 #define ID_DFR0_PERFMON_8_4 0x5 #define ID_DFR0_PERFMON_8_5 0x6 +#define ID_DFR0_PERFMON_8_7 0x7 +#define ID_DFR0_PERFMON_IMP_DEF 0xf #define ID_ISAR4_SWP_FRAC_SHIFT 28 #define ID_ISAR4_PSR_M_SHIFT 24 From 31bd10c569f6d0cf610594a12747d48e020a3064 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 13 Nov 2022 16:38:26 +0000 Subject: [PATCH 139/175] KVM: arm64: PMU: Move the ID_AA64DFR0_EL1.PMUver limit to VM creation commit 3d0dba5764b94308b8c4257ad64e383f11ce0c92 upstream. As further patches will enable the selection of a PMU revision from userspace, sample the supported PMU revision at VM creation time, rather than building each time the ID_AA64DFR0_EL1 register is accessed. This shouldn't result in any change in behaviour. 
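To make the flow concrete, here is a sketch (not part of the diff; it only
rearranges the helper names introduced below) of where the cached value is
produced and consumed:

	/* VM creation: sample the PMU revision once and cache it per VM */
	kvm->arch.dfr0_pmuver.imp = kvm_arm_pmu_get_pmuver_limit();

	/* Register read: read_id_reg() consumes the cached value */
	val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer);
	val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer),
			  vcpu_pmuver(vcpu));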
Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221113163832.3154370-11-maz@kernel.org [ bp: clean ] Signed-off-by: Suraj Jitindar Singh --- arch/arm64/include/asm/kvm_host.h | 4 ++++ arch/arm64/kvm/arm.c | 6 ++++++ arch/arm64/kvm/pmu-emul.c | 11 ++++++++++ arch/arm64/kvm/sys_regs.c | 36 ++++++++++++++++++++++++------- include/kvm/arm_pmu.h | 6 ++++++ 5 files changed, 55 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 0e9b093adc672..ac005eb5f0219 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -166,6 +166,10 @@ struct kvm_arch { u8 pfr0_csv2; u8 pfr0_csv3; + struct { + u8 imp:4; + u8 unimp:4; + } dfr0_pmuver; /* Hypercall features firmware registers' descriptor */ struct kvm_smccc_features smccc_feat; diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index de94515fb17c6..0093ad6a59cd1 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -174,6 +174,12 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) set_default_spectre(kvm); kvm_arm_init_hypercalls(kvm); + /* + * Initialise the default PMUver before there is a chance to + * create an actual PMU. + */ + kvm->arch.dfr0_pmuver.imp = kvm_arm_pmu_get_pmuver_limit(); + return ret; out_free_stage2_pgd: kvm_free_stage2_pgd(&kvm->arch.mmu); diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index c7e5f6a28c28b..a93648bc47041 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -1014,3 +1014,14 @@ int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) return -ENXIO; } + +u8 kvm_arm_pmu_get_pmuver_limit(void) +{ + u64 tmp; + + tmp = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); + tmp = cpuid_feature_cap_perfmon_field(tmp, + ID_AA64DFR0_EL1_PMUVer_SHIFT, + ID_AA64DFR0_EL1_PMUVer_V3P4); + return FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), tmp); +} diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 457e74f1f6717..fb7f2f4451b08 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1079,6 +1079,27 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, return true; } +static u8 vcpu_pmuver(const struct kvm_vcpu *vcpu) +{ + if (kvm_vcpu_has_pmu(vcpu)) + return vcpu->kvm->arch.dfr0_pmuver.imp; + + return vcpu->kvm->arch.dfr0_pmuver.unimp; +} + +static u8 pmuver_to_perfmon(u8 pmuver) +{ + switch (pmuver) { + case ID_AA64DFR0_EL1_PMUVer_IMP: + return ID_DFR0_PERFMON_8_0; + case ID_AA64DFR0_EL1_PMUVer_IMP_DEF: + return ID_DFR0_PERFMON_IMP_DEF; + default: + /* Anything ARMv8.1+ and NI have the same value. For now. */ + return pmuver; + } +} + /* Read a sanitised cpufeature ID register by sys_reg_desc */ static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r) { @@ -1128,18 +1149,17 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r /* Limit debug to ARMv8.0 */ val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer); val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), 6); - /* Limit guests to PMUv3 for ARMv8.4 */ - val = cpuid_feature_cap_perfmon_field(val, - ID_AA64DFR0_EL1_PMUVer_SHIFT, - kvm_vcpu_has_pmu(vcpu) ? 
ID_AA64DFR0_EL1_PMUVer_V3P4 : 0); + /* Set PMUver to the required version */ + val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), + vcpu_pmuver(vcpu)); /* Hide SPE from guests */ val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMSVer); break; case SYS_ID_DFR0_EL1: - /* Limit guests to PMUv3 for ARMv8.4 */ - val = cpuid_feature_cap_perfmon_field(val, - ID_DFR0_PERFMON_SHIFT, - kvm_vcpu_has_pmu(vcpu) ? ID_DFR0_PERFMON_8_4 : 0); + val &= ~ARM64_FEATURE_MASK(ID_DFR0_PERFMON); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_DFR0_PERFMON), + pmuver_to_perfmon(vcpu_pmuver(vcpu))); break; } diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 6196b71c5eb58..06d17f8335f91 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -89,6 +89,8 @@ void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu); vcpu->arch.pmu.events = *kvm_get_pmu_events(); \ } while (0) +u8 kvm_arm_pmu_get_pmuver_limit(void); + #else struct kvm_pmu { }; @@ -154,6 +156,10 @@ static inline u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) static inline void kvm_pmu_update_vcpu_events(struct kvm_vcpu *vcpu) {} static inline void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) {} static inline void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) {} +static inline u8 kvm_arm_pmu_get_pmuver_limit(void) +{ + return 0; +} #endif From cfac5effbf5e09278aab63d8d49a446051687c8f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 13 Nov 2022 16:38:27 +0000 Subject: [PATCH 140/175] KVM: arm64: PMU: Allow ID_AA64DFR0_EL1.PMUver to be set from userspace commit 60e651ff1f48bfdf8fec80d35510bd89ecf8c766 upstream. Allow userspace to write ID_AA64DFR0_EL1, on the condition that only the PMUver field can be altered and be at most the one that was initially computed for the guest. Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221113163832.3154370-12-maz@kernel.org [ bp: clean ] Signed-off-by: Suraj Jitindar Singh --- arch/arm64/kvm/sys_regs.c | 42 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index fb7f2f4451b08..75f83be1ce1b5 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1259,6 +1259,45 @@ static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, return 0; } +static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + u64 val) +{ + u8 pmuver, host_pmuver; + bool valid_pmu; + + host_pmuver = kvm_arm_pmu_get_pmuver_limit(); + + /* + * Allow AA64DFR0_EL1.PMUver to be set from userspace as long + * as it doesn't promise more than what the HW gives us. We + * allow an IMPDEF PMU though, only if no PMU is supported + * (KVM backward compatibility handling). 
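+	 * (IMP_DEF is 0xf, numerically above every architected PMU
+	 * version, so it has to be special-cased here instead of being
+	 * caught by the "pmuver > host_pmuver" range check.)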
+ */ + pmuver = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), val); + if ((pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF && pmuver > host_pmuver)) + return -EINVAL; + + valid_pmu = (pmuver != 0 && pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF); + + /* Make sure view register and PMU support do match */ + if (kvm_vcpu_has_pmu(vcpu) != valid_pmu) + return -EINVAL; + + /* We can only differ with PMUver, and anything else is an error */ + val ^= read_id_reg(vcpu, rd); + val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer); + if (val) + return -EINVAL; + + if (valid_pmu) + vcpu->kvm->arch.dfr0_pmuver.imp = pmuver; + else + vcpu->kvm->arch.dfr0_pmuver.unimp = pmuver; + + return 0; +} + /* * cpufeature ID register user accessors * @@ -1520,7 +1559,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_UNALLOCATED(4,7), /* CRm=5 */ - ID_SANITISED(ID_AA64DFR0_EL1), + { SYS_DESC(SYS_ID_AA64DFR0_EL1), .access = access_id_reg, + .get_user = get_id_reg, .set_user = set_id_aa64dfr0_el1, }, ID_SANITISED(ID_AA64DFR1_EL1), ID_UNALLOCATED(5,2), ID_UNALLOCATED(5,3), From 40f034b63714d16fb1fc7bab872c1207acea77b4 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 13 Nov 2022 16:38:28 +0000 Subject: [PATCH 141/175] KVM: arm64: PMU: Allow ID_DFR0_EL1.PerfMon to be set from userspace commit d82e0dfdfda73f91e7282e1083a2cd7cd366ea87 upstream. Allow userspace to write ID_DFR0_EL1, on the condition that only the PerfMon field can be altered and be something that is compatible with what was computed for the AArch64 view of the guest. Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221113163832.3154370-13-maz@kernel.org [ bp: clean ] Signed-off-by: Suraj Jitindar Singh --- arch/arm64/kvm/sys_regs.c | 57 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 75f83be1ce1b5..74dbde56f7b1b 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1087,6 +1087,19 @@ static u8 vcpu_pmuver(const struct kvm_vcpu *vcpu) return vcpu->kvm->arch.dfr0_pmuver.unimp; } +static u8 perfmon_to_pmuver(u8 perfmon) +{ + switch (perfmon) { + case ID_DFR0_PERFMON_8_0: + return ID_AA64DFR0_EL1_PMUVer_IMP; + case ID_DFR0_PERFMON_IMP_DEF: + return ID_AA64DFR0_EL1_PMUVer_IMP_DEF; + default: + /* Anything ARMv8.1+ and NI have the same value. For now. */ + return perfmon; + } +} + static u8 pmuver_to_perfmon(u8 pmuver) { switch (pmuver) { @@ -1298,6 +1311,46 @@ static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, return 0; } +static int set_id_dfr0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + u64 val) +{ + u8 perfmon, host_perfmon; + bool valid_pmu; + + host_perfmon = pmuver_to_perfmon(kvm_arm_pmu_get_pmuver_limit()); + + /* + * Allow DFR0_EL1.PerfMon to be set from userspace as long as + * it doesn't promise more than what the HW gives us on the + * AArch64 side (as everything is emulated with that), and + * that this is a PMUv3. 
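+	 * (PerfMon values 0x1 and 0x2 denote ARMv7-style PMUs, hence
+	 * the extra lower bound against ID_DFR0_PERFMON_8_0 in the
+	 * check below.)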
+ */ + perfmon = FIELD_GET(ARM64_FEATURE_MASK(ID_DFR0_PERFMON), val); + if ((perfmon != ID_DFR0_PERFMON_IMP_DEF && perfmon > host_perfmon) || + (perfmon != 0 && perfmon < ID_DFR0_PERFMON_8_0)) + return -EINVAL; + + valid_pmu = (perfmon != 0 && perfmon != ID_DFR0_PERFMON_IMP_DEF); + + /* Make sure view register and PMU support do match */ + if (kvm_vcpu_has_pmu(vcpu) != valid_pmu) + return -EINVAL; + + /* We can only differ with PerfMon, and anything else is an error */ + val ^= read_id_reg(vcpu, rd); + val &= ~ARM64_FEATURE_MASK(ID_DFR0_PERFMON); + if (val) + return -EINVAL; + + if (valid_pmu) + vcpu->kvm->arch.dfr0_pmuver.imp = perfmon_to_pmuver(perfmon); + else + vcpu->kvm->arch.dfr0_pmuver.unimp = perfmon_to_pmuver(perfmon); + + return 0; +} + /* * cpufeature ID register user accessors * @@ -1519,7 +1572,9 @@ static const struct sys_reg_desc sys_reg_descs[] = { /* CRm=1 */ AA32_ID_SANITISED(ID_PFR0_EL1), AA32_ID_SANITISED(ID_PFR1_EL1), - AA32_ID_SANITISED(ID_DFR0_EL1), + { SYS_DESC(SYS_ID_DFR0_EL1), .access = access_id_reg, + .get_user = get_id_reg, .set_user = set_id_dfr0_el1, + .visibility = aa32_id_visibility, }, ID_HIDDEN(ID_AFR0_EL1), AA32_ID_SANITISED(ID_MMFR0_EL1), AA32_ID_SANITISED(ID_MMFR1_EL1), From 51633e1b1c5205734d2fce47b0971cc92500da9e Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Fri, 2 Jun 2023 00:51:13 +0000 Subject: [PATCH 142/175] KVM: arm64: Save ID registers' sanitized value per guest Introduce id_regs[] in kvm_arch as a storage of guest's ID registers, and save ID registers' sanitized value in the array at KVM_CREATE_VM. Use the saved ones when ID registers are read by the guest or userspace (via KVM_GET_ONE_REG). No functional change intended. Co-developed-by: Reiji Watanabe Signed-off-by: Reiji Watanabe Signed-off-by: Jing Zhang [ bp: Context adjustment ] Signed-off-by: Suraj Jitindar Singh --- arch/arm64/include/asm/kvm_host.h | 20 +++++++++ arch/arm64/kvm/arm.c | 1 + arch/arm64/kvm/sys_regs.c | 72 +++++++++++++++++++++++++------ arch/arm64/kvm/sys_regs.h | 7 +++ 4 files changed, 87 insertions(+), 13 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index ac005eb5f0219..472754b435836 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -115,6 +115,21 @@ struct kvm_smccc_features { unsigned long vendor_hyp_bmap; }; +/* + * Emulated CPU ID registers per VM + * (Op0, Op1, CRn, CRm, Op2) of the ID registers to be saved in it + * is (3, 0, 0, crm, op2), where 1<=crm<8, 0<=op2<8. + * + * These emulated idregs are VM-wide, but accessed from the context of a vCPU. + * Atomic access to multiple idregs are guarded by kvm_arch.config_lock. 
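+ * (With 1<=crm<8 and 0<=op2<8 the index space holds 7 * 8 = 56
+ * registers, which is what KVM_ARM_ID_REG_NUM evaluates to below.)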
+ */ +#define IDREG_IDX(id) (((sys_reg_CRm(id) - 1) << 3) | sys_reg_Op2(id)) +#define IDREG(kvm, id) ((kvm)->arch.idregs.regs[IDREG_IDX(id)]) +#define KVM_ARM_ID_REG_NUM (IDREG_IDX(sys_reg(3, 0, 0, 7, 7)) + 1) +struct kvm_idregs { + u64 regs[KVM_ARM_ID_REG_NUM]; +}; + struct kvm_arch { struct kvm_s2_mmu mmu; @@ -173,6 +188,9 @@ struct kvm_arch { /* Hypercall features firmware registers' descriptor */ struct kvm_smccc_features smccc_feat; + + /* Emulated CPU ID registers */ + struct kvm_idregs idregs; }; struct kvm_vcpu_fault_info { @@ -919,6 +937,8 @@ int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu, long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, struct kvm_arm_copy_mte_tags *copy_tags); +void kvm_arm_init_id_regs(struct kvm *kvm); + /* Guest/host FPSIMD coordination helpers */ int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 0093ad6a59cd1..27010691fca78 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -173,6 +173,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) set_default_spectre(kvm); kvm_arm_init_hypercalls(kvm); + kvm_arm_init_id_regs(kvm); /* * Initialise the default PMUver before there is a chance to diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 74dbde56f7b1b..e2e9ebd7a63fa 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -39,6 +39,7 @@ * 64bit interface. */ +static u64 kvm_arm_read_id_reg(const struct kvm_vcpu *vcpu, u32 encoding); static u64 sys_reg_to_index(const struct sys_reg_desc *reg); static bool read_from_write_only(struct kvm_vcpu *vcpu, @@ -270,7 +271,7 @@ static bool trap_loregion(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - u64 val = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); + u64 val = kvm_arm_read_id_reg(vcpu, SYS_ID_AA64MMFR1_EL1); u32 sr = reg_to_encoding(r); if (!(val & (0xfUL << ID_AA64MMFR1_EL1_LO_SHIFT))) { @@ -1113,18 +1114,11 @@ static u8 pmuver_to_perfmon(u8 pmuver) } } -/* Read a sanitised cpufeature ID register by sys_reg_desc */ -static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r) +static u64 kvm_arm_read_id_reg(const struct kvm_vcpu *vcpu, u32 encoding) { - u32 id = reg_to_encoding(r); - u64 val; + u64 val = IDREG(vcpu->kvm, encoding); - if (sysreg_visible_as_raz(vcpu, r)) - return 0; - - val = read_sanitised_ftr_reg(id); - - switch (id) { + switch (encoding) { case SYS_ID_AA64PFR0_EL1: if (!vcpu_has_sve(vcpu)) val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE); @@ -1179,6 +1173,26 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r return val; } +/* Read a sanitised cpufeature ID register by sys_reg_desc */ +static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r) +{ + if (sysreg_visible_as_raz(vcpu, r)) + return 0; + + return kvm_arm_read_id_reg(vcpu, reg_to_encoding(r)); +} + +/* + * Return true if the register's (Op0, Op1, CRn, CRm, Op2) is + * (3, 0, 0, crm, op2), where 1<=crm<8, 0<=op2<8. 
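+ * (CRm == 0 is deliberately excluded: that space carries MIDR_EL1,
+ * MPIDR_EL1, REVIDR_EL1 and friends rather than feature ID registers.)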
+ */ +static inline bool is_id_reg(u32 id) +{ + return (sys_reg_Op0(id) == 3 && sys_reg_Op1(id) == 0 && + sys_reg_CRn(id) == 0 && sys_reg_CRm(id) >= 1 && + sys_reg_CRm(id) < 8); +} + static unsigned int id_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { @@ -1938,6 +1952,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_FPEXC32_EL2), NULL, reset_val, FPEXC32_EL2, 0x700 }, }; +static const struct sys_reg_desc *first_idreg; + static bool trap_dbgdidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) @@ -1945,8 +1961,8 @@ static bool trap_dbgdidr(struct kvm_vcpu *vcpu, if (p->is_write) { return ignore_write(vcpu, p); } else { - u64 dfr = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); - u64 pfr = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + u64 dfr = kvm_arm_read_id_reg(vcpu, SYS_ID_AA64DFR0_EL1); + u64 pfr = kvm_arm_read_id_reg(vcpu, SYS_ID_AA64PFR0_EL1); u32 el3 = !!cpuid_feature_extract_unsigned_field(pfr, ID_AA64PFR0_EL1_EL3_SHIFT); p->regval = ((((dfr >> ID_AA64DFR0_EL1_WRPs_SHIFT) & 0xf) << 28) | @@ -3077,8 +3093,32 @@ int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) return write_demux_regids(uindices); } +/* + * Set the guest's ID registers with ID_SANITISED() to the host's sanitized value. + */ +void kvm_arm_init_id_regs(struct kvm *kvm) +{ + const struct sys_reg_desc *idreg = first_idreg; + u32 id = reg_to_encoding(idreg); + + /* Initialize all idregs */ + while (is_id_reg(id)) { + /* + * Some hidden ID registers which are not in arm64_ftr_regs[] + * would cause warnings from read_sanitised_ftr_reg(). + * Skip those ID registers to avoid the warnings. + */ + if (idreg->visibility != raz_visibility) + IDREG(kvm, id) = read_sanitised_ftr_reg(id); + + idreg++; + id = reg_to_encoding(idreg); + } +} + int kvm_sys_reg_table_init(void) { + struct sys_reg_params params; bool valid = true; unsigned int i; struct sys_reg_desc clidr; @@ -3115,6 +3155,12 @@ int kvm_sys_reg_table_init(void) break; /* Clear all higher bits. */ cache_levels &= (1 << (i*3))-1; + + /* Find the first idreg (SYS_ID_PFR0_EL1) in sys_reg_descs. */ + params = encoding_to_params(SYS_ID_PFR0_EL1); + first_idreg = find_reg(¶ms, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); + if (!first_idreg) + return -EINVAL; return 0; } diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index e4ebb3a379fdb..946c724661e1d 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -27,6 +27,13 @@ struct sys_reg_params { bool is_write; }; +#define encoding_to_params(reg) \ + ((struct sys_reg_params){ .Op0 = sys_reg_Op0(reg), \ + .Op1 = sys_reg_Op1(reg), \ + .CRn = sys_reg_CRn(reg), \ + .CRm = sys_reg_CRm(reg), \ + .Op2 = sys_reg_Op2(reg) }) + #define esr_sys64_to_params(esr) \ ((struct sys_reg_params){ .Op0 = ((esr) >> 20) & 3, \ .Op1 = ((esr) >> 14) & 0x7, \ From 3912c14e95409ed007c76be7d0ed7ac707948b47 Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Fri, 2 Jun 2023 00:51:14 +0000 Subject: [PATCH 143/175] KVM: arm64: Use per guest ID register for ID_AA64PFR0_EL1.[CSV2|CSV3] With per guest ID registers, ID_AA64PFR0_EL1.[CSV2|CSV3] settings from userspace can be stored in its corresponding ID register. 
The setting of CSV bits for protected VMs are removed according to the discussion from Fuad below: https://lore.kernel.org/all/CA+EHjTwXA9TprX4jeG+-D+c8v9XG+oFdU1o6TSkvVye145_OvA@mail.gmail.com Besides the removal of CSV bits setting for protected VMs and using kvm_arch.config_lock to guard VM-scope idreg accesses, no other functional change intended. Signed-off-by: Jing Zhang [ bp: Clean ] Signed-off-by: Suraj Jitindar Singh --- arch/arm64/include/asm/kvm_host.h | 2 -- arch/arm64/kvm/arm.c | 17 --------- arch/arm64/kvm/sys_regs.c | 58 +++++++++++++++++++++++++------ 3 files changed, 48 insertions(+), 29 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 472754b435836..0850b1d989744 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -179,8 +179,6 @@ struct kvm_arch { cpumask_var_t supported_cpus; - u8 pfr0_csv2; - u8 pfr0_csv3; struct { u8 imp:4; u8 unimp:4; diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 27010691fca78..f3eec707982f8 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -114,22 +114,6 @@ static int kvm_arm_default_max_vcpus(void) return vgic_present ? kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS; } -static void set_default_spectre(struct kvm *kvm) -{ - /* - * The default is to expose CSV2 == 1 if the HW isn't affected. - * Although this is a per-CPU feature, we make it global because - * asymmetric systems are just a nuisance. - * - * Userspace can override this as long as it doesn't promise - * the impossible. - */ - if (arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED) - kvm->arch.pfr0_csv2 = 1; - if (arm64_get_meltdown_state() == SPECTRE_UNAFFECTED) - kvm->arch.pfr0_csv3 = 1; -} - /** * kvm_arch_init_vm - initializes a VM data structure * @kvm: pointer to the KVM struct @@ -171,7 +155,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) /* The maximum number of VCPUs is limited by the host's GIC model */ kvm->max_vcpus = kvm_arm_default_max_vcpus(); - set_default_spectre(kvm); kvm_arm_init_hypercalls(kvm); kvm_arm_init_id_regs(kvm); diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index e2e9ebd7a63fa..927bbed5ea965 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1123,10 +1123,6 @@ static u64 kvm_arm_read_id_reg(const struct kvm_vcpu *vcpu, u32 encoding) if (!vcpu_has_sve(vcpu)) val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AMU); - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2); - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2), (u64)vcpu->kvm->arch.pfr0_csv2); - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3); - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3), (u64)vcpu->kvm->arch.pfr0_csv3); if (kvm_vgic_global_state.type == VGIC_V3) { val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC); val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC), 1); @@ -1255,6 +1251,7 @@ static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val) { + u64 new_val = val; u8 csv2, csv3; /* @@ -1280,9 +1277,7 @@ static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, if (val) return -EINVAL; - vcpu->kvm->arch.pfr0_csv2 = csv2; - vcpu->kvm->arch.pfr0_csv3 = csv3; - + IDREG(vcpu->kvm, reg_to_encoding(rd)) = new_val; return 0; } @@ -1368,9 +1363,9 @@ static int set_id_dfr0_el1(struct kvm_vcpu *vcpu, /* * cpufeature ID register user accessors * - * For now, these registers are immutable for userspace, so no values - * are stored, 
and for set_id_reg() we don't allow the effective value - * to be changed. + * For now, only some registers or some part of registers are mutable for + * userspace. For those registers immutable for userspace, in set_id_reg() + * we don't allow the effective value to be changed. */ static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 *val) @@ -2903,6 +2898,9 @@ int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, if (!r) return -ENOENT; + if (is_id_reg(reg_to_encoding(r))) + mutex_lock(&vcpu->kvm->arch.config_lock); + if (r->get_user) { ret = (r->get_user)(vcpu, r, &val); } else { @@ -2910,6 +2908,9 @@ int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, ret = 0; } + if (is_id_reg(reg_to_encoding(r))) + mutex_unlock(&vcpu->kvm->arch.config_lock); + if (!ret) ret = put_user(val, uaddr); @@ -2947,9 +2948,21 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, if (!r) return -ENOENT; + /* Only allow userspace to change the idregs before VM running */ + if (is_id_reg(reg_to_encoding(r)) && + test_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &vcpu->kvm->arch.flags) ) { + if (val == read_id_reg(vcpu, r)) + return 0; + return -EBUSY; + } + if (sysreg_user_write_ignore(vcpu, r)) return 0; + /* ID regs are global to the VM and cannot be updated concurrently */ + if (is_id_reg(reg_to_encoding(r))) + mutex_lock(&vcpu->kvm->arch.config_lock); + if (r->set_user) { ret = (r->set_user)(vcpu, r, val); } else { @@ -2957,6 +2970,9 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, ret = 0; } + if (is_id_reg(reg_to_encoding(r))) + mutex_unlock(&vcpu->kvm->arch.config_lock); + return ret; } @@ -3100,6 +3116,7 @@ void kvm_arm_init_id_regs(struct kvm *kvm) { const struct sys_reg_desc *idreg = first_idreg; u32 id = reg_to_encoding(idreg); + u64 val; /* Initialize all idregs */ while (is_id_reg(id)) { @@ -3114,6 +3131,27 @@ void kvm_arm_init_id_regs(struct kvm *kvm) idreg++; id = reg_to_encoding(idreg); } + + /* + * The default is to expose CSV2 == 1 if the HW isn't affected. + * Although this is a per-CPU feature, we make it global because + * asymmetric systems are just a nuisance. + * + * Userspace can override this as long as it doesn't promise + * the impossible. + */ + val = IDREG(kvm, SYS_ID_AA64PFR0_EL1); + + if (arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED) { + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2), 1); + } + if (arm64_get_meltdown_state() == SPECTRE_UNAFFECTED) { + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3), 1); + } + + IDREG(kvm, SYS_ID_AA64PFR0_EL1) = val; } int kvm_sys_reg_table_init(void) From c0a30ac537b93f16e153f5c065e57a5bda175d88 Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Fri, 2 Jun 2023 00:51:15 +0000 Subject: [PATCH 144/175] KVM: arm64: Use per guest ID register for ID_AA64DFR0_EL1.PMUVer With per guest ID registers, PMUver settings from userspace can be stored in its corresponding ID register. No functional change intended. 
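In outline (a sketch of the new write path, using only the helpers from the
diff below): both the AArch64 and AArch32 setters now funnel through
pmuver_update(), which either patches the per-VM ID registers or, for the
IMPDEF-without-PMU case, records the state in a VM flag:

	/* from set_id_aa64dfr0_el1() / set_id_dfr0_el1() */
	pmuver_update(vcpu, pmuver, valid_pmu);

	/* pmuver_update(): when valid_pmu, PMUVer/PerfMon are written
	 * into IDREG(kvm, SYS_ID_AA64DFR0_EL1) and
	 * IDREG(kvm, SYS_ID_DFR0_EL1); otherwise
	 * KVM_ARCH_FLAG_VCPU_HAS_IMP_DEF_PMU is assigned instead.
	 */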
Signed-off-by: Jing Zhang [ bp: - kvm_pmu_is_3p5() doesn't exist downstream - Rename ID_DFR0 macros ] Signed-off-by: Suraj Jitindar Singh --- arch/arm64/include/asm/kvm_host.h | 11 +++---- arch/arm64/kvm/arm.c | 6 ---- arch/arm64/kvm/sys_regs.c | 50 +++++++++++++++++++++++-------- 3 files changed, 44 insertions(+), 23 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 0850b1d989744..5424c7bd24531 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -167,6 +167,12 @@ struct kvm_arch { #define KVM_ARCH_FLAG_EL1_32BIT 4 /* PSCI SYSTEM_SUSPEND enabled for the guest */ #define KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED 5 + /* + * AA64DFR0_EL1.PMUver was set as ID_AA64DFR0_EL1_PMUVer_IMP_DEF + * or DFR0_EL1.PerfMon was set as ID_DFR0_EL1_PerfMon_IMPDEF from + * userspace for VCPUs without PMU. + */ +#define KVM_ARCH_FLAG_VCPU_HAS_IMP_DEF_PMU 6 unsigned long flags; @@ -179,11 +185,6 @@ struct kvm_arch { cpumask_var_t supported_cpus; - struct { - u8 imp:4; - u8 unimp:4; - } dfr0_pmuver; - /* Hypercall features firmware registers' descriptor */ struct kvm_smccc_features smccc_feat; diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index f3eec707982f8..85024cccd9d71 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -158,12 +158,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_arm_init_hypercalls(kvm); kvm_arm_init_id_regs(kvm); - /* - * Initialise the default PMUver before there is a chance to - * create an actual PMU. - */ - kvm->arch.dfr0_pmuver.imp = kvm_arm_pmu_get_pmuver_limit(); - return ret; out_free_stage2_pgd: kvm_free_stage2_pgd(&kvm->arch.mmu); diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 927bbed5ea965..c6fdfc30d8605 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1083,9 +1083,12 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, static u8 vcpu_pmuver(const struct kvm_vcpu *vcpu) { if (kvm_vcpu_has_pmu(vcpu)) - return vcpu->kvm->arch.dfr0_pmuver.imp; + return FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), + IDREG(vcpu->kvm, SYS_ID_AA64DFR0_EL1)); + else if (test_bit(KVM_ARCH_FLAG_VCPU_HAS_IMP_DEF_PMU, &vcpu->kvm->arch.flags)) + return ID_AA64DFR0_EL1_PMUVer_IMP_DEF; - return vcpu->kvm->arch.dfr0_pmuver.unimp; + return 0; } static u8 perfmon_to_pmuver(u8 perfmon) @@ -1114,6 +1117,26 @@ static u8 pmuver_to_perfmon(u8 pmuver) } } +static void pmuver_update(struct kvm_vcpu *vcpu, u8 pmuver, bool valid_pmu) +{ + u64 val; + + if (valid_pmu) { + val = IDREG(vcpu->kvm, SYS_ID_AA64DFR0_EL1); + val &= ~ID_AA64DFR0_EL1_PMUVer_MASK; + val |= FIELD_PREP(ID_AA64DFR0_EL1_PMUVer_MASK, pmuver); + IDREG(vcpu->kvm, SYS_ID_AA64DFR0_EL1) = val; + + val = IDREG(vcpu->kvm, SYS_ID_DFR0_EL1); + val &= ~ARM64_FEATURE_MASK(ID_DFR0_PERFMON); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_DFR0_PERFMON), pmuver_to_perfmon(pmuver)); + IDREG(vcpu->kvm, SYS_ID_DFR0_EL1) = val; + } else { + assign_bit(KVM_ARCH_FLAG_VCPU_HAS_IMP_DEF_PMU, &vcpu->kvm->arch.flags, + pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF); + } +} + static u64 kvm_arm_read_id_reg(const struct kvm_vcpu *vcpu, u32 encoding) { u64 val = IDREG(vcpu->kvm, encoding); @@ -1312,11 +1335,7 @@ static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, if (val) return -EINVAL; - if (valid_pmu) - vcpu->kvm->arch.dfr0_pmuver.imp = pmuver; - else - vcpu->kvm->arch.dfr0_pmuver.unimp = pmuver; - + pmuver_update(vcpu, pmuver, valid_pmu); return 0; } @@ -1352,11 +1371,7 @@ static int 
set_id_dfr0_el1(struct kvm_vcpu *vcpu, if (val) return -EINVAL; - if (valid_pmu) - vcpu->kvm->arch.dfr0_pmuver.imp = perfmon_to_pmuver(perfmon); - else - vcpu->kvm->arch.dfr0_pmuver.unimp = perfmon_to_pmuver(perfmon); - + pmuver_update(vcpu, perfmon_to_pmuver(perfmon), valid_pmu); return 0; } @@ -3152,6 +3167,17 @@ void kvm_arm_init_id_regs(struct kvm *kvm) } IDREG(kvm, SYS_ID_AA64PFR0_EL1) = val; + /* + * Initialise the default PMUver before there is a chance to + * create an actual PMU. + */ + val = IDREG(kvm, SYS_ID_AA64DFR0_EL1); + + val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), + kvm_arm_pmu_get_pmuver_limit()); + + IDREG(kvm, SYS_ID_AA64DFR0_EL1) = val; } int kvm_sys_reg_table_init(void) From cb058f1ebaa94fe630e39111834401757dc4bd68 Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Fri, 2 Jun 2023 00:51:16 +0000 Subject: [PATCH 145/175] KVM: arm64: Reuse fields of sys_reg_desc for idreg sys_reg_desc::{reset, val} are presently unused for ID register descriptors. Repurpose these fields to support user-configurable ID registers. Use the ::reset() function pointer to return the sanitised value of a given ID register, optionally with KVM-specific feature sanitisation. Additionally, keep a mask of writable register fields in ::val. Signed-off-by: Jing Zhang [ bp: - Massage reset_pmcr() as variable named `val` not `pmcr` - Massage reset_clidr() as clidr not writable from userspace i.e. missing: 7af0c2534f4c KVM: arm64: Normalize cache configuration ] Signed-off-by: Suraj Jitindar Singh --- arch/arm64/kvm/sys_regs.c | 97 ++++++++++++++++++++++++++++----------- arch/arm64/kvm/sys_regs.h | 15 ++++-- 2 files changed, 79 insertions(+), 33 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index c6fdfc30d8605..2ebb55bc5b27b 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -450,10 +450,11 @@ static int get_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, return 0; } -static void reset_bvr(struct kvm_vcpu *vcpu, +static u64 reset_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm] = rd->val; + return rd->val; } static bool trap_bcr(struct kvm_vcpu *vcpu, @@ -486,10 +487,11 @@ static int get_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, return 0; } -static void reset_bcr(struct kvm_vcpu *vcpu, +static u64 reset_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm] = rd->val; + return rd->val; } static bool trap_wvr(struct kvm_vcpu *vcpu, @@ -523,10 +525,11 @@ static int get_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, return 0; } -static void reset_wvr(struct kvm_vcpu *vcpu, +static u64 reset_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm] = rd->val; + return rd->val; } static bool trap_wcr(struct kvm_vcpu *vcpu, @@ -559,25 +562,28 @@ static int get_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, return 0; } -static void reset_wcr(struct kvm_vcpu *vcpu, +static u64 reset_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm] = rd->val; + return rd->val; } -static void reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static u64 reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 amair = read_sysreg(amair_el1); vcpu_write_sys_reg(vcpu, amair, AMAIR_EL1); + return amair; } -static void 
reset_actlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static u64 reset_actlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 actlr = read_sysreg(actlr_el1); vcpu_write_sys_reg(vcpu, actlr, ACTLR_EL1); + return actlr; } -static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static u64 reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 mpidr; @@ -591,7 +597,10 @@ static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) mpidr = (vcpu->vcpu_id & 0x0f) << MPIDR_LEVEL_SHIFT(0); mpidr |= ((vcpu->vcpu_id >> 4) & 0xff) << MPIDR_LEVEL_SHIFT(1); mpidr |= ((vcpu->vcpu_id >> 12) & 0xff) << MPIDR_LEVEL_SHIFT(2); - vcpu_write_sys_reg(vcpu, (1ULL << 31) | mpidr, MPIDR_EL1); + mpidr |= (1ULL << 31); + vcpu_write_sys_reg(vcpu, mpidr, MPIDR_EL1); + + return mpidr; } static unsigned int pmu_visibility(const struct kvm_vcpu *vcpu, @@ -603,13 +612,13 @@ static unsigned int pmu_visibility(const struct kvm_vcpu *vcpu, return REG_HIDDEN; } -static void reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static u64 reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 n, mask = BIT(ARMV8_PMU_CYCLE_IDX); /* No PMU available, any PMU reg may UNDEF... */ if (!kvm_arm_support_pmu_v3()) - return; + return 0; n = read_sysreg(pmcr_el0) >> ARMV8_PMU_PMCR_N_SHIFT; n &= ARMV8_PMU_PMCR_N_MASK; @@ -618,33 +627,41 @@ static void reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) reset_unknown(vcpu, r); __vcpu_sys_reg(vcpu, r->reg) &= mask; + + return __vcpu_sys_reg(vcpu, r->reg); } -static void reset_pmevcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static u64 reset_pmevcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { reset_unknown(vcpu, r); __vcpu_sys_reg(vcpu, r->reg) &= GENMASK(31, 0); + + return __vcpu_sys_reg(vcpu, r->reg); } -static void reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static u64 reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { reset_unknown(vcpu, r); __vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_EVTYPE_MASK; + + return __vcpu_sys_reg(vcpu, r->reg); } -static void reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static u64 reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { reset_unknown(vcpu, r); __vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_COUNTER_MASK; + + return __vcpu_sys_reg(vcpu, r->reg); } -static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static u64 reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 pmcr, val; /* No PMU available, PMCR_EL0 may UNDEF... 
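 * (the early return now yields 0 so that the repurposed u64-returning
 * reset() contract is honoured even when no PMU is present)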
*/ if (!kvm_arm_support_pmu_v3()) - return; + return 0; pmcr = read_sysreg(pmcr_el0); /* @@ -656,6 +673,8 @@ static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) if (!kvm_supports_32bit_el0()) val |= ARMV8_PMU_PMCR_LC; __vcpu_sys_reg(vcpu, r->reg) = val; + + return __vcpu_sys_reg(vcpu, r->reg); } static bool check_pmu_access_disabled(struct kvm_vcpu *vcpu, u64 flags) @@ -1137,6 +1156,11 @@ static void pmuver_update(struct kvm_vcpu *vcpu, u8 pmuver, bool valid_pmu) } } +static u64 general_read_kvm_sanitised_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) +{ + return read_sanitised_ftr_reg(reg_to_encoding(rd)); +} + static u64 kvm_arm_read_id_reg(const struct kvm_vcpu *vcpu, u32 encoding) { u64 val = IDREG(vcpu->kvm, encoding); @@ -1489,6 +1513,17 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu, .visibility = mte_visibility, \ } +/* + * Since reset() callback and field val are not used for idregs, they will be + * used for specific purposes for idregs. + * The reset() would return KVM sanitised register value. The value would be the + * same as the host kernel sanitised value if there is no KVM sanitisation. + * The val would be used as a mask indicating writable fields for the idreg. + * Only bits with 1 are writable from userspace. This mask might not be + * necessary in the future whenever all ID registers are enabled as writable + * from userspace. + */ + /* sys_reg_desc initialiser for known cpufeature ID registers */ #define ID_SANITISED(name) { \ SYS_DESC(SYS_##name), \ @@ -1496,6 +1531,8 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu, .get_user = get_id_reg, \ .set_user = set_id_reg, \ .visibility = id_visibility, \ + .reset = general_read_kvm_sanitised_reg,\ + .val = 0, \ } /* sys_reg_desc initialiser for known cpufeature ID registers */ @@ -1505,6 +1542,8 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu, .get_user = get_id_reg, \ .set_user = set_id_reg, \ .visibility = aa32_id_visibility, \ + .reset = general_read_kvm_sanitised_reg,\ + .val = 0, \ } /* @@ -1517,7 +1556,9 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu, .access = access_id_reg, \ .get_user = get_id_reg, \ .set_user = set_id_reg, \ - .visibility = raz_visibility \ + .visibility = raz_visibility, \ + .reset = NULL, \ + .val = 0, \ } /* @@ -1531,6 +1572,8 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu, .get_user = get_id_reg, \ .set_user = set_id_reg, \ .visibility = raz_visibility, \ + .reset = NULL, \ + .val = 0, \ } /* @@ -2762,10 +2805,11 @@ id_to_sys_reg_desc(struct kvm_vcpu *vcpu, u64 id, */ #define FUNCTION_INVARIANT(reg) \ - static void get_##reg(struct kvm_vcpu *v, \ + static u64 get_##reg(struct kvm_vcpu *v, \ const struct sys_reg_desc *r) \ { \ ((struct sys_reg_desc *)r)->val = read_sysreg(reg); \ + return ((struct sys_reg_desc *)r)->val; \ } FUNCTION_INVARIANT(midr_el1) @@ -2773,9 +2817,10 @@ FUNCTION_INVARIANT(revidr_el1) FUNCTION_INVARIANT(clidr_el1) FUNCTION_INVARIANT(aidr_el1) -static void get_ctr_el0(struct kvm_vcpu *v, const struct sys_reg_desc *r) +static u64 get_ctr_el0(struct kvm_vcpu *v, const struct sys_reg_desc *r) { ((struct sys_reg_desc *)r)->val = read_sanitised_ftr_reg(SYS_CTR_EL0); + return ((struct sys_reg_desc *)r)->val; } /* ->val is filled in by kvm_sys_reg_table_init() */ @@ -3124,9 +3169,7 @@ int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) return write_demux_regids(uindices); } -/* - * Set the guest's ID registers with ID_SANITISED() 
to the host's sanitized value. - */ +/* Initialize the guest's ID registers with KVM sanitised values. */ void kvm_arm_init_id_regs(struct kvm *kvm) { const struct sys_reg_desc *idreg = first_idreg; @@ -3135,13 +3178,11 @@ void kvm_arm_init_id_regs(struct kvm *kvm) /* Initialize all idregs */ while (is_id_reg(id)) { - /* - * Some hidden ID registers which are not in arm64_ftr_regs[] - * would cause warnings from read_sanitised_ftr_reg(). - * Skip those ID registers to avoid the warnings. - */ - if (idreg->visibility != raz_visibility) - IDREG(kvm, id) = read_sanitised_ftr_reg(id); + val = 0; + /* Read KVM sanitised register value if available */ + if (idreg->reset) + val = idreg->reset(NULL, idreg); + IDREG(kvm, id) = val; idreg++; id = reg_to_encoding(idreg); diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index 946c724661e1d..ec3749147cd7c 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -71,13 +71,16 @@ struct sys_reg_desc { struct sys_reg_params *, const struct sys_reg_desc *); - /* Initialization for vcpu. */ - void (*reset)(struct kvm_vcpu *, const struct sys_reg_desc *); + /* + * Initialization for vcpu. Return initialized value, or KVM + * sanitized value for ID registers. + */ + u64 (*reset)(struct kvm_vcpu *, const struct sys_reg_desc *); /* Index into sys_reg[], or 0 if we don't need to save it. */ int reg; - /* Value (usually reset value) */ + /* Value (usually reset value), or write mask for idregs */ u64 val; /* Custom get/set_user functions, fallback to generic if NULL */ @@ -129,19 +132,21 @@ static inline bool read_zero(struct kvm_vcpu *vcpu, } /* Reset functions */ -static inline void reset_unknown(struct kvm_vcpu *vcpu, +static inline u64 reset_unknown(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { BUG_ON(!r->reg); BUG_ON(r->reg >= NR_SYS_REGS); __vcpu_sys_reg(vcpu, r->reg) = 0x1de7ec7edbadc0deULL; + return __vcpu_sys_reg(vcpu, r->reg); } -static inline void reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static inline u64 reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { BUG_ON(!r->reg); BUG_ON(r->reg >= NR_SYS_REGS); __vcpu_sys_reg(vcpu, r->reg) = r->val; + return __vcpu_sys_reg(vcpu, r->reg); } static inline unsigned int sysreg_visibility(const struct kvm_vcpu *vcpu, From eef70adddc5457858c2b83b01cd017f1586cd6fc Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Fri, 2 Jun 2023 00:51:17 +0000 Subject: [PATCH 146/175] KVM: arm64: Refactor writings for PMUVer/CSV2/CSV3 Refactor writings for ID_AA64PFR0_EL1.[CSV2|CSV3], ID_AA64DFR0_EL1.PMUVer and ID_DFR0_EL1.PerfMon based on utilities specific to ID registers.
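In outline, a userspace write to an ID register is now validated against the two
repurposed descriptor fields before it is stored. A simplified sketch (not the
literal code; see arm64_check_features() below for the full per-field check):

	u64 limit = rd->reset(vcpu, rd);	/* KVM sanitised limit */
	u64 writable = rd->val;			/* mask of writable fields */

	/* Fields outside the writable mask must match the limit exactly. */
	if ((val & ~writable) != (limit & ~writable))
		return -E2BIG;

	IDREG(vcpu->kvm, reg_to_encoding(rd)) = val;

Writable fields are additionally validated field by field against the limit
using the feature "safe value" rules.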
Signed-off-by: Jing Zhang [ bp: Massage ID_DFR0 macros ] Signed-off-by: Suraj Jitindar Singh --- arch/arm64/include/asm/cpufeature.h | 1 + arch/arm64/kernel/cpufeature.c | 2 +- arch/arm64/kvm/sys_regs.c | 291 +++++++++++++++++++--------- 3 files changed, 203 insertions(+), 91 deletions(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index a0badda3a8d1c..15d043e5c3e3b 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -908,6 +908,7 @@ static inline unsigned int get_vmid_bits(u64 mmfr1) return 8; } +s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new, s64 cur); struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id); extern struct arm64_ftr_override id_aa64mmfr1_override; diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 770a31c6ed81b..292a9f94e617a 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -779,7 +779,7 @@ static u64 arm64_ftr_set_value(const struct arm64_ftr_bits *ftrp, s64 reg, return reg; } -static s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new, +s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new, s64 cur) { s64 ret = 0; diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 2ebb55bc5b27b..7d947bc14320f 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -39,6 +39,7 @@ * 64bit interface. */ +static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val); static u64 kvm_arm_read_id_reg(const struct kvm_vcpu *vcpu, u32 encoding); static u64 sys_reg_to_index(const struct sys_reg_desc *reg); @@ -1099,6 +1100,86 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, return true; } +static s64 kvm_arm64_ftr_safe_value(u32 id, const struct arm64_ftr_bits *ftrp, + s64 new, s64 cur) +{ + struct arm64_ftr_bits kvm_ftr = *ftrp; + + /* Some features have different safe value type in KVM than host features */ + switch (id) { + case SYS_ID_AA64DFR0_EL1: + if (kvm_ftr.shift == ID_AA64DFR0_EL1_PMUVer_SHIFT) + kvm_ftr.type = FTR_LOWER_SAFE; + break; + case SYS_ID_DFR0_EL1: + if (kvm_ftr.shift == ID_DFR0_PERFMON_SHIFT) + kvm_ftr.type = FTR_LOWER_SAFE; + break; + } + + return arm64_ftr_safe_value(&kvm_ftr, new, cur); } + +/** + * arm64_check_features() - Check if a feature register value constitutes + * a subset of features indicated by the idreg's KVM sanitised limit. + * + * This function will check if each feature field of @val is the "safe" value + * against the idreg's KVM sanitised limit returned from the reset() callback. + * If a field value in @val is the same as the one in limit, it is always + * considered the safe value regardless. For register fields that are not in + * the writable mask, only the value in limit is considered the safe value. + * + * Return: 0 if all the fields are safe. Otherwise, return negative errno. + */ +static int arm64_check_features(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + u64 val) +{ + const struct arm64_ftr_reg *ftr_reg; + const struct arm64_ftr_bits *ftrp = NULL; + u32 id = reg_to_encoding(rd); + u64 writable_mask = rd->val; + u64 limit = 0; + u64 mask = 0; + + /* For hidden and unallocated idregs without reset, only val = 0 is allowed.
*/ + if (rd->reset) { + limit = rd->reset(vcpu, rd); + ftr_reg = get_arm64_ftr_reg(id); + if (!ftr_reg) + return -EINVAL; + ftrp = ftr_reg->ftr_bits; + } + + for (; ftrp && ftrp->width; ftrp++) { + s64 f_val, f_lim, safe_val; + u64 ftr_mask; + + ftr_mask = arm64_ftr_mask(ftrp); + if ((ftr_mask & writable_mask) != ftr_mask) + continue; + + f_val = arm64_ftr_value(ftrp, val); + f_lim = arm64_ftr_value(ftrp, limit); + mask |= ftr_mask; + + if (f_val == f_lim) + safe_val = f_val; + else + safe_val = kvm_arm64_ftr_safe_value(id, ftrp, f_val, f_lim); + + if (safe_val != f_val) + return -E2BIG; + } + + /* For fields that are not writable, values in limit are the safe values. */ + if ((val & ~mask) != (limit & ~mask)) + return -E2BIG; + + return 0; +} + static u8 vcpu_pmuver(const struct kvm_vcpu *vcpu) { if (kvm_vcpu_has_pmu(vcpu)) @@ -1136,9 +1217,17 @@ static u8 pmuver_to_perfmon(u8 pmuver) } } -static void pmuver_update(struct kvm_vcpu *vcpu, u8 pmuver, bool valid_pmu) +static int pmuver_update(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + u64 val, + u8 pmuver, + bool valid_pmu) { - u64 val; + int ret; + + ret = set_id_reg(vcpu, rd, val); + if (ret) + return ret; if (valid_pmu) { val = IDREG(vcpu->kvm, SYS_ID_AA64DFR0_EL1); @@ -1154,6 +1243,8 @@ static void pmuver_update(struct kvm_vcpu *vcpu, u8 pmuver, bool valid_pmu) assign_bit(KVM_ARCH_FLAG_VCPU_HAS_IMP_DEF_PMU, &vcpu->kvm->arch.flags, pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF); } + + return 0; } static u64 general_read_kvm_sanitised_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) @@ -1169,7 +1260,6 @@ static u64 kvm_arm_read_id_reg(const struct kvm_vcpu *vcpu, u32 encoding) case SYS_ID_AA64PFR0_EL1: if (!vcpu_has_sve(vcpu)) val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE); - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AMU); if (kvm_vgic_global_state.type == VGIC_V3) { val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC); val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC), 1); @@ -1196,15 +1286,10 @@ static u64 kvm_arm_read_id_reg(const struct kvm_vcpu *vcpu, u32 encoding) val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_WFxT); break; case SYS_ID_AA64DFR0_EL1: - /* Limit debug to ARMv8.0 */ - val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer); - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), 6); /* Set PMUver to the required version */ val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer); val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), vcpu_pmuver(vcpu)); - /* Hide SPE from guests */ - val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMSVer); break; case SYS_ID_DFR0_EL1: val &= ~ARM64_FEATURE_MASK(ID_DFR0_PERFMON); @@ -1294,38 +1379,56 @@ static unsigned int sve_visibility(const struct kvm_vcpu *vcpu, return REG_HIDDEN; } -static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd, - u64 val) +static u64 read_sanitised_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) { - u64 new_val = val; - u8 csv2, csv3; + u64 val; + u32 id = reg_to_encoding(rd); + val = read_sanitised_ftr_reg(id); /* - * Allow AA64PFR0_EL1.CSV2 to be set from userspace as long as - * it doesn't promise more than what is actually provided (the - * guest could otherwise be covered in ectoplasmic residue). + * The default is to expose CSV2 == 1 if the HW isn't affected. + * Although this is a per-CPU feature, we make it global because + * asymmetric systems are just a nuisance. + * + * Userspace can override this as long as it doesn't promise + * the impossible. 
*/ - csv2 = cpuid_feature_extract_unsigned_field(val, ID_AA64PFR0_EL1_CSV2_SHIFT); - if (csv2 > 1 || - (csv2 && arm64_get_spectre_v2_state() != SPECTRE_UNAFFECTED)) - return -EINVAL; + if (arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED) { + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2), 1); + } + if (arm64_get_meltdown_state() == SPECTRE_UNAFFECTED) { + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3), 1); + } - /* Same thing for CSV3 */ - csv3 = cpuid_feature_extract_unsigned_field(val, ID_AA64PFR0_EL1_CSV3_SHIFT); - if (csv3 > 1 || - (csv3 && arm64_get_meltdown_state() != SPECTRE_UNAFFECTED)) - return -EINVAL; + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AMU); - /* We can only differ with CSV[23], and anything else is an error */ - val ^= read_id_reg(vcpu, rd); - val &= ~(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2) | - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3)); - if (val) - return -EINVAL; + return val; +} - IDREG(vcpu->kvm, reg_to_encoding(rd)) = new_val; - return 0; +static u64 read_sanitised_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + u64 val; + u32 id = reg_to_encoding(rd); + + val = read_sanitised_ftr_reg(id); + /* Limit debug to ARMv8.0 */ + val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), 6); + /* + * Initialise the default PMUver before there is a chance to + * create an actual PMU. + */ + val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), + kvm_arm_pmu_get_pmuver_limit()); + /* Hide SPE from guests */ + val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMSVer); + + return val; } static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, @@ -1353,14 +1456,35 @@ static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, if (kvm_vcpu_has_pmu(vcpu) != valid_pmu) return -EINVAL; - /* We can only differ with PMUver, and anything else is an error */ - val ^= read_id_reg(vcpu, rd); - val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer); - if (val) - return -EINVAL; + if (!valid_pmu) { + /* + * Ignore the PMUVer field in @val. The PMUVer would be determined + * by arch flags bit KVM_ARCH_FLAG_VCPU_HAS_IMP_DEF_PMU, + */ + pmuver = FIELD_GET(ID_AA64DFR0_EL1_PMUVer_MASK, + IDREG(vcpu->kvm, SYS_ID_AA64DFR0_EL1)); + val &= ~ID_AA64DFR0_EL1_PMUVer_MASK; + val |= FIELD_PREP(ID_AA64DFR0_EL1_PMUVer_MASK, pmuver); + } - pmuver_update(vcpu, pmuver, valid_pmu); - return 0; + return pmuver_update(vcpu, rd, val, pmuver, valid_pmu); +} + +static u64 read_sanitised_id_dfr0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + u64 val; + u32 id = reg_to_encoding(rd); + + val = read_sanitised_ftr_reg(id); + /* + * Initialise the default PMUver before there is a chance to + * create an actual PMU. + */ + val &= ~ARM64_FEATURE_MASK(ID_DFR0_PERFMON); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_DFR0_PERFMON), kvm_arm_pmu_get_pmuver_limit()); + + return val; } static int set_id_dfr0_el1(struct kvm_vcpu *vcpu, @@ -1389,14 +1513,18 @@ static int set_id_dfr0_el1(struct kvm_vcpu *vcpu, if (kvm_vcpu_has_pmu(vcpu) != valid_pmu) return -EINVAL; - /* We can only differ with PerfMon, and anything else is an error */ - val ^= read_id_reg(vcpu, rd); - val &= ~ARM64_FEATURE_MASK(ID_DFR0_PERFMON); - if (val) - return -EINVAL; + if (!valid_pmu) { + /* + * Ignore the PerfMon field in @val. 
The PerfMon would be determined + * by arch flags bit KVM_ARCH_FLAG_VCPU_HAS_IMP_DEF_PMU, + */ + perfmon = FIELD_GET(ARM64_FEATURE_MASK(ID_DFR0_PERFMON), + IDREG(vcpu->kvm, SYS_ID_DFR0_EL1)); + val &= ~ARM64_FEATURE_MASK(ID_DFR0_PERFMON); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_DFR0_PERFMON), perfmon); + } - pmuver_update(vcpu, perfmon_to_pmuver(perfmon), valid_pmu); - return 0; + return pmuver_update(vcpu, rd, val, perfmon_to_pmuver(perfmon), valid_pmu); } /* @@ -1416,11 +1544,14 @@ static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val) { - /* This is what we mean by invariant: you can't change it. */ - if (val != read_id_reg(vcpu, rd)) - return -EINVAL; + u32 id = reg_to_encoding(rd); + int ret = 0; - return 0; + ret = arm64_check_features(vcpu, rd, val); + if (!ret) + IDREG(vcpu->kvm, id) = val; + + return ret; } static int get_raz_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, @@ -1639,9 +1770,13 @@ static const struct sys_reg_desc sys_reg_descs[] = { /* CRm=1 */ AA32_ID_SANITISED(ID_PFR0_EL1), AA32_ID_SANITISED(ID_PFR1_EL1), - { SYS_DESC(SYS_ID_DFR0_EL1), .access = access_id_reg, - .get_user = get_id_reg, .set_user = set_id_dfr0_el1, - .visibility = aa32_id_visibility, }, + { SYS_DESC(SYS_ID_DFR0_EL1), + .access = access_id_reg, + .get_user = get_id_reg, + .set_user = set_id_dfr0_el1, + .visibility = aa32_id_visibility, + .reset = read_sanitised_id_dfr0_el1, + .val = ARM64_FEATURE_MASK(ID_DFR0_PERFMON), }, ID_HIDDEN(ID_AFR0_EL1), AA32_ID_SANITISED(ID_MMFR0_EL1), AA32_ID_SANITISED(ID_MMFR1_EL1), @@ -1670,8 +1805,12 @@ static const struct sys_reg_desc sys_reg_descs[] = { /* AArch64 ID registers */ /* CRm=4 */ - { SYS_DESC(SYS_ID_AA64PFR0_EL1), .access = access_id_reg, - .get_user = get_id_reg, .set_user = set_id_aa64pfr0_el1, }, + { SYS_DESC(SYS_ID_AA64PFR0_EL1), + .access = access_id_reg, + .get_user = get_id_reg, + .set_user = set_id_reg, + .reset = read_sanitised_id_aa64pfr0_el1, + .val = ID_AA64PFR0_EL1_CSV2_MASK | ID_AA64PFR0_EL1_CSV3_MASK, }, ID_SANITISED(ID_AA64PFR1_EL1), ID_UNALLOCATED(4,2), ID_UNALLOCATED(4,3), @@ -1681,8 +1820,12 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_UNALLOCATED(4,7), /* CRm=5 */ - { SYS_DESC(SYS_ID_AA64DFR0_EL1), .access = access_id_reg, - .get_user = get_id_reg, .set_user = set_id_aa64dfr0_el1, }, + { SYS_DESC(SYS_ID_AA64DFR0_EL1), + .access = access_id_reg, + .get_user = get_id_reg, + .set_user = set_id_aa64dfr0_el1, + .reset = read_sanitised_id_aa64dfr0_el1, + .val = ID_AA64DFR0_EL1_PMUVer_MASK, }, ID_SANITISED(ID_AA64DFR1_EL1), ID_UNALLOCATED(5,2), ID_UNALLOCATED(5,3), @@ -3187,38 +3330,6 @@ void kvm_arm_init_id_regs(struct kvm *kvm) idreg++; id = reg_to_encoding(idreg); } - - /* - * The default is to expose CSV2 == 1 if the HW isn't affected. - * Although this is a per-CPU feature, we make it global because - * asymmetric systems are just a nuisance. - * - * Userspace can override this as long as it doesn't promise - * the impossible. 
- */ - val = IDREG(kvm, SYS_ID_AA64PFR0_EL1); - - if (arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED) { - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2); - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2), 1); - } - if (arm64_get_meltdown_state() == SPECTRE_UNAFFECTED) { - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3); - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3), 1); - } - - IDREG(kvm, SYS_ID_AA64PFR0_EL1) = val; - /* - * Initialise the default PMUver before there is a chance to - * create an actual PMU. - */ - val = IDREG(kvm, SYS_ID_AA64DFR0_EL1); - - val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer); - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), - kvm_arm_pmu_get_pmuver_limit()); - - IDREG(kvm, SYS_ID_AA64DFR0_EL1) = val; } int kvm_sys_reg_table_init(void) From 58959177c65ebbff464f5a38ad7bb94d3a72dac4 Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Fri, 2 Jun 2023 12:29:33 -0700 Subject: [PATCH 147/175] KVM: arm64: Update id_reg limit value based on per vcpu flags There are multiple features the availability of which is enabled/disabled and tracked on a per vcpu level in vcpu->arch.flagset, e.g. sve, ptrauth, and pmu. While the vm wide value of the id regs which represent the availability of these features is stored in the id_regs kvm struct, their value needs to be manipulated on a per vcpu basis. This is done at read time in kvm_arm_read_id_reg(). The value of these per vcpu flags needs to be factored in when calculating the id_reg limit value in check_features() as otherwise we can run into the following scenario. [ running on cpu which supports sve ] 1. AA64PFR0.SVE set in id_reg by kvm_arm_init_id_regs() (cpu supports it and so is set in value returned from read_sanitised_ftr_reg()) 2. vcpus created without sve feature enabled 3. vmm reads AA64PFR0 and attempts to write the same value back (writing the same value back is allowed) 4. write fails in check_features() as the limit has AA64PFR0.SVE set; however, it is not set in the value being written, and although a lower value is allowed for this feature, it is not in the mask of bits which can be modified and so must match exactly. Thus add a step in check_features() to update the limit returned from id_reg->reset() with the per vcpu features which may have been enabled/disabled at vcpu creation time after the id_regs were initialised. Split this update into a new function named kvm_arm_update_id_reg() so it can be called from check_features() as well as kvm_arm_read_id_reg() to dedup code. Note: Processing of the DFR0 and AA64DFR0 registers remains in kvm_arm_read_id_reg() as the value of these cannot be modified based on vcpu feature flags; all changes to these registers must go through the id_regs mechanism. Also add a check for RAZ (read as zero) registers to arm64_check_features() as even though the host may support a non-zero value for these, it is only valid to set them to zero from userspace. Finally return -EINVAL from set_id_reg() rather than -E2BIG to remain consistent with the UAPI.
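The fix, in outline (a simplified sketch; the real change is in the diff below):

	/* in arm64_check_features(), before any field comparison */
	limit = rd->reset(vcpu, rd);			/* VM-wide sanitised limit */
	limit = kvm_arm_update_id_reg(vcpu, id, limit);	/* fold in per-vcpu flags,
							 * e.g. clear SVE when the
							 * vcpu lacks the feature */

With the limit adjusted this way, the write-back in the scenario above succeeds,
because the value the VMM read and rewrote now matches the per-vcpu limit.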
Signed-off-by: Suraj Jitindar Singh --- arch/arm64/kvm/sys_regs.c | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 7d947bc14320f..265741cedf819 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -40,6 +40,7 @@ */ static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val); +static u64 kvm_arm_update_id_reg(const struct kvm_vcpu *vcpu, u32 id, u64 val); static u64 kvm_arm_read_id_reg(const struct kvm_vcpu *vcpu, u32 encoding); static u64 sys_reg_to_index(const struct sys_reg_desc *reg); @@ -1143,9 +1144,14 @@ static int arm64_check_features(struct kvm_vcpu *vcpu, u64 limit = 0; u64 mask = 0; + /* If the register is RAZ we know the only safe value is 0. */ + if (sysreg_visible_as_raz(vcpu, rd)) + return val ? -E2BIG : 0; + /* For hidden and unallocated idregs without reset, only val = 0 is allowed. */ if (rd->reset) { limit = rd->reset(vcpu, rd); + limit = kvm_arm_update_id_reg(vcpu, id, limit); ftr_reg = get_arm64_ftr_reg(id); if (!ftr_reg) return -EINVAL; @@ -1252,10 +1258,8 @@ static u64 general_read_kvm_sanitised_reg(struct kvm_vcpu *vcpu, const struct sy return read_sanitised_ftr_reg(reg_to_encoding(rd)); } -static u64 kvm_arm_read_id_reg(const struct kvm_vcpu *vcpu, u32 encoding) +static u64 kvm_arm_update_id_reg(const struct kvm_vcpu *vcpu, u32 encoding, u64 val) { - u64 val = IDREG(vcpu->kvm, encoding); - switch (encoding) { case SYS_ID_AA64PFR0_EL1: if (!vcpu_has_sve(vcpu)) @@ -1285,6 +1289,16 @@ static u64 kvm_arm_read_id_reg(const struct kvm_vcpu *vcpu, u32 encoding) if (!cpus_have_final_cap(ARM64_HAS_WFXT)) val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_WFxT); break; + } + + return val; +} + +static u64 kvm_arm_read_id_reg(const struct kvm_vcpu *vcpu, u32 encoding) +{ + u64 val = IDREG(vcpu->kvm, encoding); + + switch (encoding) { case SYS_ID_AA64DFR0_EL1: /* Set PMUver to the required version */ val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer); @@ -1298,7 +1312,7 @@ static u64 kvm_arm_read_id_reg(const struct kvm_vcpu *vcpu, u32 encoding) break; } - return val; + return kvm_arm_update_id_reg(vcpu, encoding, val); } /* Read a sanitised cpufeature ID register by sys_reg_desc */ @@ -1551,6 +1565,16 @@ static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, if (!ret) IDREG(vcpu->kvm, id) = val; + /* + * arm64_check_features() returns -E2BIG to indicate the register's + * feature set is a superset of the maximally-allowed register value. + * While it would be nice to precisely describe this to userspace, the + * existing UAPI for KVM_SET_ONE_REG has it that invalid register + * writes return -EINVAL. + */ + if (ret == -E2BIG) + ret = -EINVAL; + return ret; } From 4666819b369b56bd3a79624e7aa040f879077912 Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Fri, 2 Jun 2023 12:30:17 -0700 Subject: [PATCH 148/175] KVM: arm64: Move non per vcpu flag checks out of kvm_arm_update_id_reg() There are features which are masked in kvm_arm_update_id_reg() which cannot change throughout the lifecycle of a VM. Thus rather than masking them each time the register is read, mask them at idreg init time so that the value in the kvm id_reg correctly reflects the state of support for that feature. Move masking of AA64PFR0_EL1.GIC and AA64PFR0_EL1.AMU into read_sanitised_id_aa64pfr0_el1(). Create read_sanitised_id_aa64pfr1_el1() and mask AA64PFR1_EL1.SME. Create read_sanitised_id_aa64isar2_el1() and mask AA64ISAR2_EL1.WFxT. 
Create read_sanitised_id_[mmfr4|aa64mmfr2] and mask CCIDX. Signed-off-by: Suraj Jitindar Singh --- arch/arm64/kvm/sys_regs.c | 98 ++++++++++++++++++++++++++++++++++----- 1 file changed, 86 insertions(+), 12 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 265741cedf819..6da1174e6748d 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1264,16 +1264,10 @@ static u64 kvm_arm_update_id_reg(const struct kvm_vcpu *vcpu, u32 encoding, u64 case SYS_ID_AA64PFR0_EL1: if (!vcpu_has_sve(vcpu)) val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE); - if (kvm_vgic_global_state.type == VGIC_V3) { - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC); - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC), 1); - } break; case SYS_ID_AA64PFR1_EL1: if (!kvm_has_mte(vcpu->kvm)) val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE); - - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME); break; case SYS_ID_AA64ISAR1_EL1: if (!vcpu_has_ptrauth(vcpu)) @@ -1286,8 +1280,6 @@ static u64 kvm_arm_update_id_reg(const struct kvm_vcpu *vcpu, u32 encoding, u64 if (!vcpu_has_ptrauth(vcpu)) val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) | ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3)); - if (!cpus_have_final_cap(ARM64_HAS_WFXT)) - val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_WFxT); break; } @@ -1393,6 +1385,20 @@ static unsigned int sve_visibility(const struct kvm_vcpu *vcpu, return REG_HIDDEN; } +static u64 read_sanitised_id_mmfr4_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + u64 val; + u32 id = reg_to_encoding(rd); + + val = read_sanitised_ftr_reg(id); + + /* CCIDX is not supported */ + val &= ~ARM64_FEATURE_MASK(ID_MMFR4_CCIDX); + + return val; +} + static u64 read_sanitised_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { @@ -1419,6 +1425,25 @@ static u64 read_sanitised_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AMU); + if (kvm_vgic_global_state.type == VGIC_V3) { + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC), 1); + } + + return val; +} + +static u64 read_sanitised_id_aa64pfr1_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + u64 val; + u32 id = reg_to_encoding(rd); + + val = read_sanitised_ftr_reg(id); + + /* SME is not supported */ + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME); + return val; } @@ -1541,6 +1566,34 @@ static int set_id_dfr0_el1(struct kvm_vcpu *vcpu, return pmuver_update(vcpu, rd, val, perfmon_to_pmuver(perfmon), valid_pmu); } +static u64 read_sanitised_id_aa64isar2_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + u64 val; + u32 id = reg_to_encoding(rd); + + val = read_sanitised_ftr_reg(id); + + if (!cpus_have_final_cap(ARM64_HAS_WFXT)) + val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_WFxT); + + return val; +} + +static u64 read_sanitised_id_aa64mmfr2_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + u64 val; + u32 id = reg_to_encoding(rd); + + val = read_sanitised_ftr_reg(id); + + /* CCIDX is not supported */ + val &= ~ID_AA64MMFR2_EL1_CCIDX_MASK; + + return val; +} + /* * cpufeature ID register user accessors * @@ -1814,7 +1867,13 @@ static const struct sys_reg_desc sys_reg_descs[] = { AA32_ID_SANITISED(ID_ISAR3_EL1), AA32_ID_SANITISED(ID_ISAR4_EL1), AA32_ID_SANITISED(ID_ISAR5_EL1), - AA32_ID_SANITISED(ID_MMFR4_EL1), + { SYS_DESC(SYS_ID_MMFR4_EL1), + .access = access_id_reg, + .get_user = get_id_reg, + .set_user = set_id_reg, + .visibility = aa32_id_visibility, + 
.reset = read_sanitised_id_mmfr4_el1, + .val = 0, }, AA32_ID_SANITISED(ID_ISAR6_EL1), /* CRm=3 */ @@ -1835,7 +1894,12 @@ static const struct sys_reg_desc sys_reg_descs[] = { .set_user = set_id_reg, .reset = read_sanitised_id_aa64pfr0_el1, .val = ID_AA64PFR0_EL1_CSV2_MASK | ID_AA64PFR0_EL1_CSV3_MASK, }, - ID_SANITISED(ID_AA64PFR1_EL1), + { SYS_DESC(SYS_ID_AA64PFR1_EL1), + .access = access_id_reg, + .get_user = get_id_reg, + .set_user = set_id_reg, + .reset = read_sanitised_id_aa64pfr1_el1, + .val = 0, }, ID_UNALLOCATED(4,2), ID_UNALLOCATED(4,3), ID_SANITISED(ID_AA64ZFR0_EL1), @@ -1861,7 +1925,12 @@ static const struct sys_reg_desc sys_reg_descs[] = { /* CRm=6 */ ID_SANITISED(ID_AA64ISAR0_EL1), ID_SANITISED(ID_AA64ISAR1_EL1), - ID_SANITISED(ID_AA64ISAR2_EL1), + { SYS_DESC(SYS_ID_AA64ISAR2_EL1), + .access = access_id_reg, + .get_user = get_id_reg, + .set_user = set_id_reg, + .reset = read_sanitised_id_aa64isar2_el1, + .val = 0, }, ID_UNALLOCATED(6,3), ID_UNALLOCATED(6,4), ID_UNALLOCATED(6,5), @@ -1871,7 +1940,12 @@ static const struct sys_reg_desc sys_reg_descs[] = { /* CRm=7 */ ID_SANITISED(ID_AA64MMFR0_EL1), ID_SANITISED(ID_AA64MMFR1_EL1), - ID_SANITISED(ID_AA64MMFR2_EL1), + { SYS_DESC(SYS_ID_AA64MMFR2_EL1), + .access = access_id_reg, + .get_user = get_id_reg, + .set_user = set_id_reg, + .reset = read_sanitised_id_aa64mmfr2_el1, + .val = 0, }, ID_UNALLOCATED(7,3), ID_UNALLOCATED(7,4), ID_UNALLOCATED(7,5), From 60e047bef19f21772cb8b5232755fe7c5f418818 Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Wed, 7 Jun 2023 19:45:51 +0000 Subject: [PATCH 149/175] KVM: arm64: Enable writable for ID_AA64DFR0_EL1 Since the number of context-aware breakpoints must be no more than the number of supported breakpoints according to the Arm ARM, return an error if userspace tries to set the CTX_CMPs field to a value that exceeds the number of breakpoints. Signed-off-by: Jing Zhang --- arch/arm64/kvm/sys_regs.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 6da1174e6748d..5200de5ea0365 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1474,9 +1474,14 @@ static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val) { - u8 pmuver, host_pmuver; + u8 pmuver, host_pmuver, brps, ctx_cmps; bool valid_pmu; + brps = FIELD_GET(ID_AA64DFR0_EL1_BRPs_MASK, val); + ctx_cmps = FIELD_GET(ID_AA64DFR0_EL1_CTX_CMPs_MASK, val); + if (ctx_cmps > brps) + return -EINVAL; + host_pmuver = kvm_arm_pmu_get_pmuver_limit(); /* @@ -1913,7 +1918,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { .get_user = get_id_reg, .set_user = set_id_aa64dfr0_el1, .reset = read_sanitised_id_aa64dfr0_el1, - .val = ID_AA64DFR0_EL1_PMUVer_MASK, }, + .val = GENMASK(63, 0), }, ID_SANITISED(ID_AA64DFR1_EL1), ID_UNALLOCATED(5,2), ID_UNALLOCATED(5,3), From 4811d5e68c91fff9437d579b065f6261ae26f74f Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Wed, 7 Jun 2023 19:45:52 +0000 Subject: [PATCH 150/175] KVM: arm64: Enable writable for ID_DFR0_EL1 All valid fields in ID_DFR0_EL1 are writable from userspace with this change.
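For context, a VMM exercises this through the KVM_{GET,SET}_ONE_REG ioctls. A
minimal sketch (vcpu_fd is assumed to be an open vcpu file descriptor; the
encoding op0=3, op1=0, CRn=0, CRm=1, op2=2 for ID_DFR0_EL1 follows the Arm ARM):

	#include <linux/kvm.h>
	#include <sys/ioctl.h>

	__u64 val;
	struct kvm_one_reg reg = {
		.id   = ARM64_SYS_REG(3, 0, 0, 1, 2),	/* ID_DFR0_EL1 */
		.addr = (__u64)&val,
	};

	ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);	/* read the current value */
	val &= ~(0xfULL << 24);			/* e.g. zero PerfMon, bits [27:24] */
	ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);	/* rejected with EINVAL if the
						 * result is not a safe subset
						 * of the KVM sanitised limit */

Note that zeroing PerfMon is only accepted when the vcpu was created without a
PMU; with a PMU present, set_id_dfr0_el1() insists on a valid PerfMon value.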
Signed-off-by: Jing Zhang --- arch/arm64/kvm/sys_regs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 5200de5ea0365..9b3d90788d523 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1858,7 +1858,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { .set_user = set_id_dfr0_el1, .visibility = aa32_id_visibility, .reset = read_sanitised_id_dfr0_el1, - .val = ARM64_FEATURE_MASK(ID_DFR0_PERFMON), }, + .val = GENMASK(63, 0), }, ID_HIDDEN(ID_AFR0_EL1), AA32_ID_SANITISED(ID_MMFR0_EL1), AA32_ID_SANITISED(ID_MMFR1_EL1), From 594c474b83461ae4c8eb298998869399c5516143 Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Wed, 7 Jun 2023 19:45:53 +0000 Subject: [PATCH 151/175] KVM: arm64: Enable writable for ID_AA64PFR0_EL1 Return an error if userspace tries to set SVE field of the register to a value that conflicts with SVE configuration for the guest. SIMD/FP/SVE fields of the requested value are validated according to Arm ARM. Signed-off-by: Jing Zhang --- arch/arm64/kvm/sys_regs.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 9b3d90788d523..c4c9e8ab0c99d 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1444,9 +1444,36 @@ static u64 read_sanitised_id_aa64pfr1_el1(struct kvm_vcpu *vcpu, /* SME is not supported */ val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME); + if (!system_supports_sve()) + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE); + return val; } +static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + u64 val) +{ + int fp, simd; + bool has_sve = id_aa64pfr0_sve(val); + + simd = cpuid_feature_extract_signed_field(val, ID_AA64PFR0_EL1_AdvSIMD_SHIFT); + fp = cpuid_feature_extract_signed_field(val, ID_AA64PFR0_EL1_FP_SHIFT); + /* AdvSIMD field must have the same value as FP field */ + if (simd != fp) + return -EINVAL; + + /* fp must be supported when sve is supported */ + if (has_sve && (fp < 0)) + return -EINVAL; + + /* Check if there is a conflict with a request via KVM_ARM_VCPU_INIT */ + if (vcpu_has_sve(vcpu) ^ has_sve) + return -EPERM; + + return set_id_reg(vcpu, rd, val); +} + static u64 read_sanitised_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { @@ -1896,9 +1923,9 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_ID_AA64PFR0_EL1), .access = access_id_reg, .get_user = get_id_reg, - .set_user = set_id_reg, + .set_user = set_id_aa64pfr0_el1, .reset = read_sanitised_id_aa64pfr0_el1, - .val = ID_AA64PFR0_EL1_CSV2_MASK | ID_AA64PFR0_EL1_CSV3_MASK, }, + .val = GENMASK(63, 0), }, { SYS_DESC(SYS_ID_AA64PFR1_EL1), .access = access_id_reg, .get_user = get_id_reg, From b69852859941be8d500f214673e0d3751488d818 Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Wed, 7 Jun 2023 19:45:54 +0000 Subject: [PATCH 152/175] KVM: arm64: Enable writable for ID_AA64MMFR{0, 1, 2}_EL1 Enable writable from userspace for ID_AA64MMFR{0, 1, 2}_EL1. Added a macro for defining general writable idregs. 
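The macro turns the common "sanitised reset value, every field writable" case
into a one-line table entry, e.g. in sys_reg_descs[] (taken from the diff below):

	- ID_SANITISED(ID_AA64MMFR0_EL1),
	+ ID_SANITISED_WRITABLE(ID_AA64MMFR0_EL1),

ID_AA64MMFR2_EL1 keeps an explicit descriptor because it needs a KVM-specific
reset hook (read_sanitised_id_aa64mmfr2_el1) to mask CCIDX.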
Signed-off-by: Jing Zhang --- arch/arm64/kvm/sys_regs.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index c4c9e8ab0c99d..a206a8f958f6b 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1775,6 +1775,16 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu, .val = 0, \ } +#define ID_SANITISED_WRITABLE(name) { \ + SYS_DESC(SYS_##name), \ + .access = access_id_reg, \ + .get_user = get_id_reg, \ + .set_user = set_id_reg, \ + .visibility = id_visibility, \ + .reset = general_read_kvm_sanitised_reg,\ + .val = GENMASK(63, 0), \ +} + /* sys_reg_desc initialiser for known cpufeature ID registers */ #define AA32_ID_SANITISED(name) { \ SYS_DESC(SYS_##name), \ @@ -1970,14 +1980,14 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_UNALLOCATED(6,7), /* CRm=7 */ - ID_SANITISED(ID_AA64MMFR0_EL1), - ID_SANITISED(ID_AA64MMFR1_EL1), + ID_SANITISED_WRITABLE(ID_AA64MMFR0_EL1), + ID_SANITISED_WRITABLE(ID_AA64MMFR1_EL1), { SYS_DESC(SYS_ID_AA64MMFR2_EL1), .access = access_id_reg, .get_user = get_id_reg, .set_user = set_id_reg, .reset = read_sanitised_id_aa64mmfr2_el1, - .val = 0, }, + .val = GENMASK(63, 0), }, ID_UNALLOCATED(7,3), ID_UNALLOCATED(7,4), ID_UNALLOCATED(7,5), From 28713741b11f9f98c706b893e1abb9b0650a7227 Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Thu, 22 Jun 2023 15:26:41 -0700 Subject: [PATCH 153/175] KVM: arm64: Enable writable for ID_AA64ISAR0_EL1 All valid fields in ID_AA64ISAR0_EL1 are writable from userspace with this change. Return an error if userspace tries to set sha[1|2|3] to an invalid configuration based on the Arm Architecture Reference Manual for A-profile architecture [1]. [1] https://developer.arm.com/documentation/ddi0487/latest/ Signed-off-by: Suraj Jitindar Singh --- arch/arm64/kvm/sys_regs.c | 54 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index a206a8f958f6b..18991a41aa46b 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1598,6 +1598,53 @@ static int set_id_dfr0_el1(struct kvm_vcpu *vcpu, return pmuver_update(vcpu, rd, val, perfmon_to_pmuver(perfmon), valid_pmu); } +static int set_id_aa64isar0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + u64 val) +{ + u8 sm4, sm3, sha1, sha2, sha3; + + sm4 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SM4), val); + sm3 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SM3), val); + sha1 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA1), val); + sha2 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA2), val); + sha3 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA3), val); + + /* + * From Arm Architecture Reference Manual for A-profile architecture + * (https://developer.arm.com/documentation/ddi0487/latest/) + * D19.2.61: + * SM4, bits [43:40] + * This field must have the same value as ID_AA64ISAR0_EL1.SM3. + */ + if (sm4 != sm3) + return -EINVAL; + + /* + * From Arm Architecture Reference Manual for A-profile architecture + * (https://developer.arm.com/documentation/ddi0487/latest/) + * D19.2.61: + * SHA1, bits [11:8] + * If the value of ID_AA64ISAR0_EL1.SHA2 is 0b0000, + * this field must have the value 0b0000. + * SHA2, bits [15:12] + * If the value of this field is 0b0010, + * ID_AA64ISAR0_EL1.SHA3 must have the value 0b0001.
+ * SHA3, bits [35:32] + * If the value of ID_AA64ISAR0_EL1.SHA1 is 0b0000, + * this field must have the value 0b0000. + */ + if (!sha1) { + if (sha2 || sha3) + return -EINVAL; + } else { + if (sha3 && (sha2 != 0b0010)) + return -EINVAL; + } + + return set_id_reg(vcpu, rd, val); +} + static u64 read_sanitised_id_aa64isar2_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { @@ -1965,7 +2012,12 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_UNALLOCATED(5,7), /* CRm=6 */ - ID_SANITISED(ID_AA64ISAR0_EL1), + { SYS_DESC(SYS_ID_AA64ISAR0_EL1), + .access = access_id_reg, + .get_user = get_id_reg, + .set_user = set_id_aa64isar0_el1, + .reset = general_read_kvm_sanitised_reg, + .val = GENMASK(63, 0), }, ID_SANITISED(ID_AA64ISAR1_EL1), { SYS_DESC(SYS_ID_AA64ISAR2_EL1), .access = access_id_reg, .get_user = get_id_reg, .set_user = set_id_reg, .reset = read_sanitised_id_aa64isar2_el1, .val = 0, }, ID_UNALLOCATED(6,3), ID_UNALLOCATED(6,4), ID_UNALLOCATED(6,5), From 8d9cb25ae1a6c0fc030e4f655f8a31a748324ae4 Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Fri, 23 Jun 2023 08:02:57 -0700 Subject: [PATCH 154/175] KVM: arm64: Enable writable for ID_AA64ISAR1_EL1 All valid fields in ID_AA64ISAR1_EL1 are writable from userspace with this change. Return an error if userspace tries to set i8mm, bf16, gpi or gpa to an invalid configuration based on the Arm Architecture Reference Manual for A-profile architecture [1]. [1] https://developer.arm.com/documentation/ddi0487/latest/ Signed-off-by: Suraj Jitindar Singh --- arch/arm64/kvm/sys_regs.c | 105 +++++++++++++++++++++++++++++++++++++- 1 file changed, 104 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 18991a41aa46b..4f694974ce6f3 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1645,6 +1645,104 @@ static int set_id_aa64isar0_el1(struct kvm_vcpu *vcpu, return set_id_reg(vcpu, rd, val); } +static int set_id_aa64isar1_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + u64 val) +{ + u8 zfr0_i8mm, zfr0_bf16, gpa3, sme; + u8 i8mm, bf16, gpi, gpa; + int advsimd; + + /* Fields in the register we're trying to set - ISAR1 */ + i8mm = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_I8MM), val); + bf16 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_BF16), val); + gpi = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI), val); + gpa = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA), val); + + /* Fields in ZFR0 */ + zfr0_i8mm = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ZFR0_EL1_I8MM), + IDREG(vcpu->kvm, SYS_ID_AA64ZFR0_EL1)); + zfr0_bf16 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ZFR0_EL1_BF16), + IDREG(vcpu->kvm, SYS_ID_AA64ZFR0_EL1)); + + /* Fields in ISAR2 */ + gpa3 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3), + IDREG(vcpu->kvm, SYS_ID_AA64ISAR2_EL1)); + + /* Fields in PFR1 */ + sme = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME), + IDREG(vcpu->kvm, SYS_ID_AA64PFR1_EL1)); + + /* Fields in PFR0 */ + advsimd = cpuid_feature_extract_signed_field(IDREG(vcpu->kvm, + SYS_ID_AA64PFR0_EL1), + ID_AA64PFR0_EL1_AdvSIMD_SHIFT); + + /* + * From Arm Architecture Reference Manual for A-profile architecture + * (https://developer.arm.com/documentation/ddi0487/latest/) + * D19.2.62: + * I8MM, bits [55:52] + * When Advanced SIMD and SVE are both implemented, this field must + * return the same value as ID_AA64ZFR0_EL1.I8MM.
+ */ + if (vcpu_has_sve(vcpu) && advsimd) { + if (i8mm != zfr0_i8mm) + return -EINVAL; + } + + /* + * From Arm Architecture Reference Manual for A-profile architecture + * (https://developer.arm.com/documentation/ddi0487/latest/) + * D19.2.62: + * BF16, bits [47:44] + * When FEAT_SVE or FEAT_SME is implemented, this field must return + * the same value as ID_AA64ZFR0_EL1.BF16. + */ + if (vcpu_has_sve(vcpu) || sme) { + if (bf16 != zfr0_bf16) + return -EINVAL; + } + + /* + * From Arm Architecture Reference Manual for A-profile architecture + * (https://developer.arm.com/documentation/ddi0487/latest/) + * D19.2.62: + * GPI, bits [31:28] + * If the value of ID_AA64ISAR1_EL1.GPA is nonzero, or the value of + * ID_AA64ISAR2_EL1.GPA3 is nonzero, this field must have the value + * 0b0000. + */ + if (gpi && (gpa || gpa3)) { + return -EINVAL; + } + + /* + * From Arm Architecture Reference Manual for A-profile architecture + * (https://developer.arm.com/documentation/ddi0487/latest/) + * D19.2.62: + * GPA, bits [27:24] + * If the value of ID_AA64ISAR1_EL1.GPI is nonzero, or the value of + * ID_AA64ISAR2_EL1.GPA3 is nonzero, this field must have the value + * 0b0000. + */ + if (gpa && (gpi || gpa3)) { + return -EINVAL; + } + + /* Check ptrauth state matches that requested in vcpu features */ + if ((gpi || gpa || gpa3) != vcpu_has_ptrauth(vcpu)) + return -EINVAL; + + /* + * No need to validate API or APA, since they are FTR_EXACT they must + * match the host value. And who are we to argue if the host screwed + * these up. + */ + + return set_id_reg(vcpu, rd, val); +} + static u64 read_sanitised_id_aa64isar2_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { @@ -2018,7 +2116,12 @@ static const struct sys_reg_desc sys_reg_descs[] = { .set_user = set_id_aa64isar0_el1, .reset = general_read_kvm_sanitised_reg, .val = GENMASK(63, 0), }, - ID_SANITISED(ID_AA64ISAR1_EL1), + { SYS_DESC(SYS_ID_AA64ISAR1_EL1), + .access = access_id_reg, + .get_user = get_id_reg, + .set_user = set_id_aa64isar1_el1, + .reset = general_read_kvm_sanitised_reg, + .val = GENMASK(63, 0), }, { SYS_DESC(SYS_ID_AA64ISAR2_EL1), .access = access_id_reg, .get_user = get_id_reg, .set_user = set_id_reg, .reset = read_sanitised_id_aa64isar2_el1, .val = 0, }, From 5bb6ca006c00ca3b158ea8c15f147a33c59222c5 Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Fri, 23 Jun 2023 14:09:53 -0700 Subject: [PATCH 155/175] KVM: arm64: Enable writable for ID_AA64ISAR2_EL1 All valid fields in ID_AA64ISAR2_EL1 are writable from userspace with this change. Return an error if userspace tries to set gpa3 to an invalid configuration based on the Arm Architecture Reference Manual for A-profile architecture [1].
[1] https://developer.arm.com/documentation/ddi0487/latest/ Signed-off-by: Suraj Jitindar Singh --- arch/arm64/kvm/sys_regs.c | 44 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 4f694974ce6f3..9040b20970aff 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1743,6 +1743,46 @@ static int set_id_aa64isar1_el1(struct kvm_vcpu *vcpu, return set_id_reg(vcpu, rd, val); } +static int set_id_aa64isar2_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + u64 val) +{ + u8 gpi, gpa, gpa3; + + /* Fields in the register we're trying to set - ISAR2 */ + gpa3 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3), val); + + /* Fields in ISAR1 */ + gpi = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI), + IDREG(vcpu->kvm, SYS_ID_AA64ISAR1_EL1)); + gpa = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA), + IDREG(vcpu->kvm, SYS_ID_AA64ISAR1_EL1)); + + /* + * From Arm Architecture Reference Manual for A-profile architecture + * (https://developer.arm.com/documentation/ddi0487/latest/) + * D19.2.63: + * GPA3, bits [11:8] + * If the value of ID_AA64ISAR1_EL1.GPI is nonzero, or the value of + * ID_AA64ISAR1_EL1.GPA is nonzero, this field must have the value + * 0b0000. + */ + if (gpa3 && (gpi || gpa)) { + return -EINVAL; + } + + /* Check ptrauth state matches that requested in vcpu features */ + if ((gpi || gpa || gpa3) != vcpu_has_ptrauth(vcpu)) + return -EINVAL; + + /* + * No need to validate APA3, since it is FTR_EXACT it must match the + * host value. And who are we to argue if the host screwed it up. + */ + + return set_id_reg(vcpu, rd, val); +} + static u64 read_sanitised_id_aa64isar2_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { @@ -2125,9 +2165,9 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_ID_AA64ISAR2_EL1), .access = access_id_reg, .get_user = get_id_reg, - .set_user = set_id_reg, + .set_user = set_id_aa64isar2_el1, .reset = read_sanitised_id_aa64isar2_el1, - .val = 0, }, + .val = GENMASK(63, 0), }, ID_UNALLOCATED(6,3), ID_UNALLOCATED(6,4), ID_UNALLOCATED(6,5), From 2c20ec451c21064cc190351ed5b3db9facb10752 Mon Sep 17 00:00:00 2001 From: Andrew Paniakin Date: Wed, 13 Dec 2023 02:51:38 +0000 Subject: [PATCH 156/175] Revert "objtool: Propagate early errors" This reverts commit 21f99a5adbc522c8e8126132e94d353297fd3c16. --- tools/objtool/objtool.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c index cda649644e32d..a7ecc32e35125 100644 --- a/tools/objtool/objtool.c +++ b/tools/objtool/objtool.c @@ -146,5 +146,7 @@ int main(int argc, const char **argv) exec_cmd_init("objtool", UNUSED, UNUSED, UNUSED); pager_init(UNUSED); - return objtool_run(argc, argv); + objtool_run(argc, argv); + + return 0; } From 7dec40e0c5035e631e353cc643bd37e440c4398e Mon Sep 17 00:00:00 2001 From: David Arinzon Date: Tue, 28 Nov 2023 17:48:53 +0000 Subject: [PATCH 157/175] AL2023 6.1 Compile ENA driver with PHC flag Allow PHC functionality by setting ENA_PHC_INCLUDE=1 in the compilation flags. 
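The define gates the PTP hardware clock code paths in the driver sources, along
the lines of the following sketch (a presumed pattern for illustration, not a
quote from the driver; ena_phc_init() is a hypothetical init call):

	#ifdef ENA_PHC_INCLUDE
		/* PTP hardware clock support compiled in */
		rc = ena_phc_init(adapter);
	#endif

Previously the define was only added when ENA_PHC_INCLUDE=1 was passed on the
make command line; after this change it is set unconditionally.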
Signed-off-by: David Arinzon --- drivers/amazon/net/ena/Makefile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/amazon/net/ena/Makefile b/drivers/amazon/net/ena/Makefile index 0c40c453562f5..5060120a596a7 100644 --- a/drivers/amazon/net/ena/Makefile +++ b/drivers/amazon/net/ena/Makefile @@ -15,8 +15,6 @@ ifdef TEST_AF_XDP ccflags-y += -DENA_TEST_AF_XDP endif -ifdef ENA_PHC_INCLUDE - ccflags-y += -DENA_PHC_INCLUDE -endif +ccflags-y += -DENA_PHC_INCLUDE ccflags-y += -include $(srctree)/drivers/amazon/net/ena/config.h From 77604600338fc6147d820651457c5e78d3c79d46 Mon Sep 17 00:00:00 2001 From: David Arinzon Date: Tue, 28 Nov 2023 17:49:39 +0000 Subject: [PATCH 158/175] AL2023 6.1 Update ENA driver to 2.11.0g Signed-off-by: David Arinzon --- drivers/amazon/net/ena/Makefile | 2 +- drivers/amazon/net/ena/ena_admin_defs.h | 4 +- drivers/amazon/net/ena/ena_com.c | 2 + drivers/amazon/net/ena/ena_com.h | 12 ++++ drivers/amazon/net/ena/ena_ethtool.c | 3 +- drivers/amazon/net/ena/ena_netdev.c | 81 +++++++++++++++++-------- drivers/amazon/net/ena/ena_netdev.h | 12 ++-- drivers/amazon/net/ena/ena_regs_defs.h | 1 + drivers/amazon/net/ena/kcompat.h | 9 ++- 9 files changed, 90 insertions(+), 36 deletions(-) diff --git a/drivers/amazon/net/ena/Makefile b/drivers/amazon/net/ena/Makefile index 5060120a596a7..0a84642cc49ba 100644 --- a/drivers/amazon/net/ena/Makefile +++ b/drivers/amazon/net/ena/Makefile @@ -1,7 +1,7 @@ # # Makefile for the Elastic Network Adapter (ENA) device drivers. # ENA Source is: https://github.com/amzn/amzn-drivers. -# Current ENA source is based on ena_linux_2.10.0 tag. +# Current ENA source is based on ena_linux_2.11.0 tag. # obj-$(CONFIG_AMAZON_ENA_ETHERNET) += ena.o diff --git a/drivers/amazon/net/ena/ena_admin_defs.h b/drivers/amazon/net/ena/ena_admin_defs.h index 61ca71af11cf5..daf2961af2b75 100644 --- a/drivers/amazon/net/ena/ena_admin_defs.h +++ b/drivers/amazon/net/ena/ena_admin_defs.h @@ -696,8 +696,8 @@ struct ena_admin_feature_llq_desc { */ u8 entry_size_recommended; - /* reserved */ - u8 reserved1[2]; + /* max depth of wide llq, or 0 for N/A */ + u16 max_wide_llq_depth; /* accelerated low latency queues requirement. 
driver needs to * support those requirements in order to use accelerated llq diff --git a/drivers/amazon/net/ena/ena_com.c b/drivers/amazon/net/ena/ena_com.c index d4f73b8b200b3..fdc46ff1c2400 100644 --- a/drivers/amazon/net/ena/ena_com.c +++ b/drivers/amazon/net/ena/ena_com.c @@ -810,6 +810,7 @@ static int ena_com_wait_and_process_admin_cq_interrupts(struct ena_comp_ctx *com spin_unlock_irqrestore(&admin_queue->q_lock, flags); if (comp_ctx->status == ENA_CMD_COMPLETED) { + admin_queue->is_missing_admin_interrupt = true; netdev_err(admin_queue->ena_dev->net_device, "The ena device sent a completion but the driver didn't receive a MSI-X interrupt (cmd %d), autopolling mode is %s\n", comp_ctx->cmd_opcode, @@ -2127,6 +2128,7 @@ int ena_com_admin_init(struct ena_com_dev *ena_dev, admin_queue->ena_dev = ena_dev; admin_queue->running_state = true; + admin_queue->is_missing_admin_interrupt = false; return 0; error: diff --git a/drivers/amazon/net/ena/ena_com.h b/drivers/amazon/net/ena/ena_com.h index 00776c433f7cf..efe7168fc37e0 100644 --- a/drivers/amazon/net/ena/ena_com.h +++ b/drivers/amazon/net/ena/ena_com.h @@ -253,6 +253,8 @@ struct ena_com_admin_queue { */ bool running_state; + bool is_missing_admin_interrupt; + /* Count the number of outstanding admin commands */ atomic_t outstanding_cmds; @@ -1091,6 +1093,16 @@ int ena_com_config_dev_mode(struct ena_com_dev *ena_dev, struct ena_admin_feature_llq_desc *llq_features, struct ena_llq_configurations *llq_default_config); +/* ena_com_get_missing_admin_interrupt - Return if there is a missing admin interrupt + * @ena_dev: ENA communication layer struct + * + * @return - true if there is a missing admin interrupt or false otherwise + */ +static inline bool ena_com_get_missing_admin_interrupt(struct ena_com_dev *ena_dev) +{ + return ena_dev->admin_queue.is_missing_admin_interrupt; +} + /* ena_com_io_sq_to_ena_dev - Extract ena_com_dev using contained field io_sq. 
* @io_sq: IO submit queue struct * diff --git a/drivers/amazon/net/ena/ena_ethtool.c b/drivers/amazon/net/ena/ena_ethtool.c index ada1b9b0c4eef..2a0496172ff91 100644 --- a/drivers/amazon/net/ena/ena_ethtool.c +++ b/drivers/amazon/net/ena/ena_ethtool.c @@ -80,6 +80,8 @@ static const struct ena_stats ena_stats_global_strings[] = { ENA_STAT_GLOBAL_ENTRY(tx_desc_malformed), ENA_STAT_GLOBAL_ENTRY(invalid_state), ENA_STAT_GLOBAL_ENTRY(os_netdev_wd), + ENA_STAT_GLOBAL_ENTRY(missing_admin_interrupt), + ENA_STAT_GLOBAL_ENTRY(admin_to), ENA_STAT_GLOBAL_ENTRY(suspend), ENA_STAT_GLOBAL_ENTRY(resume), ENA_STAT_GLOBAL_ENTRY(interface_down), @@ -1287,7 +1289,6 @@ static int ena_set_channels(struct net_device *netdev, NETDEV_XDP_ACT_REDIRECT); } - if (count > adapter->max_num_io_queues) return -EINVAL; if (count != adapter->num_io_queues && ena_is_zc_q_exist(adapter)) { diff --git a/drivers/amazon/net/ena/ena_netdev.c b/drivers/amazon/net/ena/ena_netdev.c index 932c075f5a2ef..089142aa07ea6 100644 --- a/drivers/amazon/net/ena/ena_netdev.c +++ b/drivers/amazon/net/ena/ena_netdev.c @@ -1561,7 +1561,7 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, #ifdef ENA_XDP_SUPPORT if (xdp_flags & ENA_XDP_REDIRECT) - xdp_do_flush_map(); + xdp_do_flush(); if (xdp_flags & ENA_XDP_TX) ena_ring_tx_doorbell(rx_ring->xdp_ring); #endif @@ -3462,15 +3462,21 @@ static int ena_calc_io_queue_size(struct ena_adapter *adapter, { struct ena_admin_feature_llq_desc *llq = &get_feat_ctx->llq; struct ena_com_dev *ena_dev = adapter->ena_dev; - u32 tx_queue_size = ENA_DEFAULT_RING_SIZE; u32 max_tx_queue_size; u32 max_rx_queue_size; + u32 tx_queue_size; /* If this function is called after driver load, the ring sizes have already * been configured. Take it into account when recalculating ring size. */ - if (adapter->tx_ring->ring_size) + if (adapter->tx_ring->ring_size) { tx_queue_size = adapter->tx_ring->ring_size; + } else if (adapter->llq_policy == ENA_LLQ_HEADER_SIZE_POLICY_LARGE && + ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { + tx_queue_size = ENA_DEFAULT_WIDE_LLQ_RING_SIZE; + } else { + tx_queue_size = ENA_DEFAULT_RING_SIZE; + } if (adapter->rx_ring->ring_size) rx_queue_size = adapter->rx_ring->ring_size; @@ -3513,6 +3519,33 @@ static int ena_calc_io_queue_size(struct ena_adapter *adapter, max_queues->max_packet_rx_descs); } + if (adapter->llq_policy == ENA_LLQ_HEADER_SIZE_POLICY_LARGE) { + if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { + u32 max_wide_llq_size = max_tx_queue_size; + + if (llq->max_wide_llq_depth == 0) { + /* if there is no large llq max depth from device, we divide + * the queue size by 2, leaving the amount of memory + * used by the queues unchanged. 
+ */ + max_wide_llq_size /= 2; + } else if (llq->max_wide_llq_depth < max_wide_llq_size) { + max_wide_llq_size = llq->max_wide_llq_depth; + } + if (max_wide_llq_size != max_tx_queue_size) { + max_tx_queue_size = max_wide_llq_size; + dev_info(&adapter->pdev->dev, + "Forcing large headers and decreasing maximum TX queue size to %d\n", + max_tx_queue_size); + } + } else { + dev_err(&adapter->pdev->dev, + "Forcing large headers failed: LLQ is disabled or device does not support large headers\n"); + + adapter->llq_policy = ENA_LLQ_HEADER_SIZE_POLICY_NORMAL; + } + } + max_tx_queue_size = rounddown_pow_of_two(max_tx_queue_size); max_rx_queue_size = rounddown_pow_of_two(max_rx_queue_size); @@ -3528,23 +3561,6 @@ static int ena_calc_io_queue_size(struct ena_adapter *adapter, return -EFAULT; } - /* When forcing large headers, we multiply the entry size by 2, and therefore divide - * the queue size by 2, leaving the amount of memory used by the queues unchanged. - */ - if (adapter->llq_policy == ENA_LLQ_HEADER_SIZE_POLICY_LARGE) { - if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { - max_tx_queue_size /= 2; - dev_info(&adapter->pdev->dev, - "Forcing large headers and decreasing maximum TX queue size to %d\n", - max_tx_queue_size); - } else { - dev_err(&adapter->pdev->dev, - "Forcing large headers failed: LLQ is disabled or device does not support large headers\n"); - - adapter->llq_policy = ENA_LLQ_HEADER_SIZE_POLICY_NORMAL; - } - } - tx_queue_size = clamp_val(tx_queue_size, ENA_MIN_RING_SIZE, max_tx_queue_size); rx_queue_size = clamp_val(rx_queue_size, ENA_MIN_RING_SIZE, @@ -3808,6 +3824,11 @@ static int ena_device_init(struct ena_adapter *adapter, struct pci_dev *pdev, if (unlikely(rc)) goto err_admin_init; + if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) + dev_info(&pdev->dev, "ENA Large LLQ is %s\n", + adapter->llq_policy == ENA_LLQ_HEADER_SIZE_POLICY_LARGE ? + "enabled" : "disabled"); + /* Turned on features shouldn't change due to reset. */ prev_netdev_features = adapter->netdev->features; ena_set_dev_offloads(get_feat_ctx, adapter->netdev); @@ -4070,11 +4091,11 @@ static int check_missing_comp_in_tx_queue(struct ena_adapter *adapter, struct en struct net_device *netdev = adapter->netdev; unsigned long jiffies_since_last_napi; unsigned long jiffies_since_last_intr; + u32 missed_tx = 0, new_missed_tx = 0; unsigned long graceful_timeout; struct ena_tx_buffer *tx_buf; unsigned long timeout; int napi_scheduled; - u32 missed_tx = 0; bool is_expired; int i, rc = 0; @@ -4117,20 +4138,24 @@ static int check_missing_comp_in_tx_queue(struct ena_adapter *adapter, struct en reset_reason = ENA_REGS_RESET_SUSPECTED_POLL_STARVATION; } + missed_tx++; + if (tx_buf->print_once) continue; + /* Add new TX completions which are missed */ + new_missed_tx++; + netif_notice(adapter, tx_err, netdev, "TX hasn't completed, qid %d, index %d. 
%u msecs since last interrupt, %u msecs since last napi execution, napi scheduled: %d\n", tx_ring->qid, i, jiffies_to_msecs(jiffies_since_last_intr), jiffies_to_msecs(jiffies_since_last_napi), napi_scheduled); - missed_tx++; tx_buf->print_once = 1; } } - /* Checking if this TX ring got to max missing TX completes */ + /* Check whether this TX ring's missing TX completions have passed the threshold */ if (unlikely(missed_tx > missed_tx_thresh)) { jiffies_since_last_intr = jiffies - READ_ONCE(ena_napi->last_intr_jiffies); jiffies_since_last_napi = jiffies - READ_ONCE(tx_ring->tx_stats.last_napi_jiffies); @@ -4156,7 +4181,8 @@ static int check_missing_comp_in_tx_queue(struct ena_adapter *adapter, struct en rc = -EIO; } - ena_increase_stat(&tx_ring->tx_stats.missed_tx, missed_tx, &tx_ring->syncp); + /* Add the newly discovered missing TX completions */ + ena_increase_stat(&tx_ring->tx_stats.missed_tx, new_missed_tx, &tx_ring->syncp); return rc; } @@ -4288,7 +4314,12 @@ static void check_for_admin_com_state(struct ena_adapter *adapter) if (unlikely(!ena_com_get_admin_running_state(adapter->ena_dev))) { netif_err(adapter, drv, adapter->netdev, "ENA admin queue is not in running state!\n"); - ena_reset_device(adapter, ENA_REGS_RESET_ADMIN_TO); + ena_increase_stat(&adapter->dev_stats.admin_q_pause, 1, + &adapter->syncp); + if (ena_com_get_missing_admin_interrupt(adapter->ena_dev)) + ena_reset_device(adapter, ENA_REGS_RESET_MISSING_ADMIN_INTERRUPT); + else + ena_reset_device(adapter, ENA_REGS_RESET_ADMIN_TO); } } diff --git a/drivers/amazon/net/ena/ena_netdev.h b/drivers/amazon/net/ena/ena_netdev.h index 9b3b20a5b680f..19d0dd50dca7b 100644 --- a/drivers/amazon/net/ena/ena_netdev.h +++ b/drivers/amazon/net/ena/ena_netdev.h @@ -31,7 +31,7 @@ #include "ena_eth_com.h" #define DRV_MODULE_GEN_MAJOR 2 -#define DRV_MODULE_GEN_MINOR 10 +#define DRV_MODULE_GEN_MINOR 11 #define DRV_MODULE_GEN_SUBMINOR 0 #define DRV_MODULE_NAME "ena" @@ -65,8 +65,9 @@ #define ENA_MEM_BAR 2 #define ENA_BAR_MASK (BIT(ENA_REG_BAR) | BIT(ENA_MEM_BAR)) -#define ENA_DEFAULT_RING_SIZE (1024) -#define ENA_MIN_RING_SIZE (256) +#define ENA_DEFAULT_RING_SIZE (1024) +#define ENA_DEFAULT_WIDE_LLQ_RING_SIZE (512) +#define ENA_MIN_RING_SIZE (256) #define ENA_MIN_RX_BUF_SIZE (2048) @@ -377,6 +378,8 @@ struct ena_stats_dev { u64 tx_desc_malformed; u64 invalid_state; u64 os_netdev_wd; + u64 missing_admin_interrupt; + u64 admin_to; }; enum ena_flags_t { @@ -509,7 +512,7 @@ struct ena_reset_stats_offset { static const struct ena_reset_stats_offset resets_to_stats_offset_map[ENA_REGS_RESET_LAST] = { ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_KEEP_ALIVE_TO, wd_expired), - ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_ADMIN_TO, admin_q_pause), + ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_ADMIN_TO, admin_to), ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_MISS_TX_CMPL, missing_tx_cmpl), ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_INV_RX_REQ_ID, bad_rx_req_id), ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_INV_TX_REQ_ID, bad_tx_req_id), @@ -520,6 +523,7 @@ static const struct ena_reset_stats_offset resets_to_stats_offset_map[ENA_REGS_R ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_SUSPECTED_POLL_STARVATION, suspected_poll_starvation), ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_RX_DESCRIPTOR_MALFORMED, rx_desc_malformed), ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_TX_DESCRIPTOR_MALFORMED, tx_desc_malformed), + ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_MISSING_ADMIN_INTERRUPT, missing_admin_interrupt), }; void ena_set_ethtool_ops(struct net_device *netdev); diff --git a/drivers/amazon/net/ena/ena_regs_defs.h 
b/drivers/amazon/net/ena/ena_regs_defs.h index 9a5a22fb4114e..af1e52cd7819c 100644 --- a/drivers/amazon/net/ena/ena_regs_defs.h +++ b/drivers/amazon/net/ena/ena_regs_defs.h @@ -24,6 +24,7 @@ enum ena_regs_reset_reason_types { ENA_REGS_RESET_SUSPECTED_POLL_STARVATION = 15, ENA_REGS_RESET_RX_DESCRIPTOR_MALFORMED = 16, ENA_REGS_RESET_TX_DESCRIPTOR_MALFORMED = 17, + ENA_REGS_RESET_MISSING_ADMIN_INTERRUPT = 18, ENA_REGS_RESET_LAST, }; diff --git a/drivers/amazon/net/ena/kcompat.h b/drivers/amazon/net/ena/kcompat.h index 9926a8463fc2b..7b4122d365f19 100644 --- a/drivers/amazon/net/ena/kcompat.h +++ b/drivers/amazon/net/ena/kcompat.h @@ -531,7 +531,8 @@ static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync static inline bool ena_u64_stats_fetch_retry(const struct u64_stats_sync *syncp, unsigned int start) { -#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0) +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0) && \ + !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 3)) return u64_stats_fetch_retry_irq(syncp, start); #else return u64_stats_fetch_retry(syncp, start); @@ -540,7 +541,8 @@ static inline bool ena_u64_stats_fetch_retry(const struct u64_stats_sync *syncp, static inline unsigned int ena_u64_stats_fetch_begin(const struct u64_stats_sync *syncp) { -#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0) +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0) && \ + !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 3)) return u64_stats_fetch_begin_irq(syncp); #else return u64_stats_fetch_begin(syncp); @@ -957,7 +959,8 @@ static inline int netif_xmit_stopped(const struct netdev_queue *dev_queue) #define NAPIF_STATE_SCHED BIT(NAPI_STATE_SCHED) #endif -#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 17, 0) +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 17, 0) && \ + !(defined(IS_UEK) && ENA_KERNEL_VERSION_GTE(5, 15, 0, 100, 96, 32)) #define bpf_warn_invalid_xdp_action(netdev, xdp_prog, verdict) \ bpf_warn_invalid_xdp_action(verdict) #endif From 16f9218e73df902305381a377ee2b33bc29bd8e0 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 31 Jan 2023 10:58:09 +0000 Subject: [PATCH 159/175] arm64: pauth: don't sign leaf functions Commit c68cf5285e1896a2b725ec01a1351f08610165b8 upstream Currently, when CONFIG_ARM64_PTR_AUTH_KERNEL=y (and CONFIG_UNWIND_PATCH_PAC_INTO_SCS=n), we enable pointer authentication for all functions, including leaf functions. This isn't necessary, and is unfortunate for a few reasons: * Any PACIASP instruction is implicitly a `BTI C` landing pad, and forcing the addition of a PACIASP in every function introduces a larger set of BTI gadgets than is necessary. * The PACIASP and AUTIASP instructions make leaf functions larger than necessary, bloating the kernel Image. For a defconfig v6.2-rc3 kernel, this appears to add ~64KiB relative to not signing leaf functions, which is unfortunate but not entirely onerous. * The PACIASP and AUTIASP instructions potentially make leaf functions more expensive in terms of performance and/or power. For many trivial leaf functions, this is clearly unnecessary, e.g. | : | d503233f paciasp | d53b4220 mrs x0, daif | d50323bf autiasp | d65f03c0 ret | : | d503233f paciasp | d50323bf autiasp | d65f03c0 ret | d503201f nop * When CONFIG_UNWIND_PATCH_PAC_INTO_SCS=y we disable pointer authentication for leaf functions, so clearly this is not functionally necessary, indicates we have an inconsistent threat model, and convolutes the Makefile logic. 
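To make the leaf/non-leaf distinction concrete, here is a minimal illustrative sketch (not part of this patch; exact code generation depends on the compiler and version in use):

| /* pauth-leaf.c: compare -mbranch-protection=pac-ret with pac-ret+leaf */
| int leaf(int x)
| {
| 	/* Spills no return address; plain pac-ret leaves this function
| 	 * unsigned, while pac-ret+leaf wraps it in PACIASP/AUTIASP anyway.
| 	 */
| 	return x + 1;
| }
|
| int non_leaf(int x)
| {
| 	/* Calls another function, so LR is saved to the stack; the
| 	 * prologue gets PACIASP and the epilogue AUTIASP under either
| 	 * option.
| 	 */
| 	return leaf(x) * 2;
| }

Under pac-ret+leaf, even leaf() pays the two-instruction overhead shown in the listings above.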
We've used pointer authentication in leaf functions since the introduction of in-kernel pointer authentication in commit: 74afda4016a7437e ("arm64: compile the kernel with ptrauth return address signing") ... but at the time we had no rationale for signing leaf functions. Subsequently, we considered avoiding signing leaf functions: https://lore.kernel.org/linux-arm-kernel/1586856741-26839-1-git-send-email-amit.kachhap@arm.com/ https://lore.kernel.org/linux-arm-kernel/1588149371-20310-1-git-send-email-amit.kachhap@arm.com/ ... however at the time we didn't have an abundance of reasons to avoid signing leaf functions as above (e.g. the BTI case), we had no hardware to make performance measurements, and it was reasoned that this gave some level of protection against a limited set of code-reuse gadgets which would fall through to a RET. We documented this in commit: 717b938e22f8dbf0 ("arm64: Document why we enable PAC support for leaf functions") Notably, this was before we supported any forward-edge CFI scheme (e.g. Arm BTI, or Clang CFI/kCFI), which would prevent jumping into the middle of a function. In addition, even with signing forced for leaf functions, AUTIASP may be placed before a number of instructions which might constitute such a gadget, e.g. | : | f9400022 ldr x2, [x1] | d503233f paciasp | d50323bf autiasp | f9408401 ldr x1, [x0, #264] | 720b005f tst w2, #0x200000 | b26b0022 orr x2, x1, #0x200000 | 926af821 and x1, x1, #0xffffffffffdfffff | 9a820021 csel x1, x1, x2, eq // eq = none | f9008401 str x1, [x0, #264] | d65f03c0 ret | : | 2a0003e3 mov w3, w0 | 9000ff42 adrp x2, ffff800009ffd000 | 9120e042 add x2, x2, #0x838 | 52800000 mov w0, #0x0 // #0 | d503233f paciasp | f000d041 adrp x1, ffff800009a20000 | d50323bf autiasp | 9102c021 add x1, x1, #0xb0 | f8635842 ldr x2, [x2, w3, uxtw #3] | f821685f str xzr, [x2, x1] | d65f03c0 ret | d503201f nop So generally, trying to use AUTIASP to detect such gadgetization is not robust, and this is dealt with far better by forward-edge CFI (which is designed to prevent such cases). We should bite the bullet and stop pretending that AUTIASP is a mitigation for such forward-edge gadgetization. For the above reasons, this patch has the kernel consistently sign non-leaf functions and avoid signing leaf functions. Considering a defconfig v6.2-rc3 kernel built with LLVM 15.0.6: * The vmlinux is ~43KiB smaller: | [mark@lakrids:~/src/linux]% ls -al vmlinux-* | -rwxr-xr-x 1 mark mark 338547808 Jan 25 17:17 vmlinux-after | -rwxr-xr-x 1 mark mark 338591472 Jan 25 17:22 vmlinux-before * The resulting Image is 64KiB smaller: | [mark@lakrids:~/src/linux]% ls -al Image-* | -rwxr-xr-x 1 mark mark 32702976 Jan 25 17:17 Image-after | -rwxr-xr-x 1 mark mark 32768512 Jan 25 17:22 Image-before * There are ~400 fewer BTI gadgets: | [mark@lakrids:~/src/linux]% usekorg 12.1.0 aarch64-linux-objdump -d vmlinux-before 2> /dev/null | grep -ow 'paciasp\|bti\sc\?' | sort | uniq -c | 1219 bti c | 61982 paciasp | [mark@lakrids:~/src/linux]% usekorg 12.1.0 aarch64-linux-objdump -d vmlinux-after 2> /dev/null | grep -ow 'paciasp\|bti\sc\?' | sort | uniq -c | 10099 bti c | 52699 paciasp Which is +8880 BTIs, and -9283 PACIASPs, for -403 unnecessary BTI gadgets. While this is small relative to the total, distinguishing the two cases will make it easier to analyse and reduce this set further in future. 
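Code that still wants return-address signing in a specific leaf function can, in principle, opt back in per function; a hypothetical sketch, assuming a toolchain that honours the AArch64 branch-protection function attribute (documented for GCC 9+ and recent Clang):

| /* hypothetical opt-in; the attribute overrides -mbranch-protection
|  * for this function only
|  */
| __attribute__((target("branch-protection=pac-ret+leaf")))
| int hardened_leaf(int x)
| {
| 	return x ^ 0x5a;	/* signed despite being a leaf function */
| }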
Signed-off-by: Mark Rutland Reviewed-by: Ard Biesheuvel Reviewed-by: Mark Brown Cc: Amit Daniel Kachhap Cc: Will Deacon Link: https://lore.kernel.org/r/20230131105809.991288-3-mark.rutland@arm.com Signed-off-by: Catalin Marinas [resolved conflicts] Signed-off-by: Mahmoud Adam --- arch/arm64/Makefile | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 265810b8b04ec..ef29e10cb1f69 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -63,20 +63,19 @@ ifeq ($(CONFIG_AS_HAS_ARMV8_2), y) asm-arch := armv8.2-a endif -# Ensure that if the compiler supports branch protection we default it -# off, this will be overridden if we are using branch protection. -branch-prot-flags-y += $(call cc-option,-mbranch-protection=none) - -ifeq ($(CONFIG_ARM64_PTR_AUTH_KERNEL),y) -branch-prot-flags-$(CONFIG_CC_HAS_SIGN_RETURN_ADDRESS) := -msign-return-address=all -# We enable additional protection for leaf functions as there is some -# narrow potential for ROP protection benefits and no substantial -# performance impact has been observed. ifeq ($(CONFIG_ARM64_BTI_KERNEL),y) -branch-prot-flags-$(CONFIG_CC_HAS_BRANCH_PROT_PAC_RET_BTI) := -mbranch-protection=pac-ret+leaf+bti + KBUILD_CFLAGS += -mbranch-protection=pac-ret+bti +else ifeq ($(CONFIG_ARM64_PTR_AUTH_KERNEL),y) + ifeq ($(CONFIG_CC_HAS_BRANCH_PROT_PAC_RET),y) + KBUILD_CFLAGS += -mbranch-protection=pac-ret + else + KBUILD_CFLAGS += -msign-return-address=non-leaf + endif else -branch-prot-flags-$(CONFIG_CC_HAS_BRANCH_PROT_PAC_RET) := -mbranch-protection=pac-ret+leaf + KBUILD_CFLAGS += $(call cc-option,-mbranch-protection=none) endif + +ifeq ($(CONFIG_ARM64_PTR_AUTH_KERNEL),y) # -march=armv8.3-a enables the non-nops instructions for PAC, to avoid the # compiler to generate them and consequently to break the single image contract # we pass it only to the assembler. This option is utilized only in case of non @@ -86,8 +85,6 @@ asm-arch := armv8.3-a endif endif -KBUILD_CFLAGS += $(branch-prot-flags-y) - ifeq ($(CONFIG_AS_HAS_ARMV8_4), y) # make sure to pass the newest target architecture to -march. asm-arch := armv8.4-a From ab50f51bb8b30e3e76cfb7ec7ae21f31601545d1 Mon Sep 17 00:00:00 2001 From: David Arinzon Date: Wed, 7 Feb 2024 19:39:05 +0000 Subject: [PATCH 160/175] AL2023-6.1-Update-ena-driver-to-2.11.1g Also fixes some minor misalignment in kcompat.h vs 2.11.0 to avoid a diff. Signed-off-by: David Arinzon --- drivers/amazon/net/ena/Makefile | 2 +- drivers/amazon/net/ena/ena_netdev.c | 4 ++-- drivers/amazon/net/ena/ena_netdev.h | 2 +- drivers/amazon/net/ena/kcompat.h | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/amazon/net/ena/Makefile b/drivers/amazon/net/ena/Makefile index 0a84642cc49ba..e1118db6dc823 100644 --- a/drivers/amazon/net/ena/Makefile +++ b/drivers/amazon/net/ena/Makefile @@ -1,7 +1,7 @@ # # Makefile for the Elastic Network Adapter (ENA) device drivers. # ENA Source is: https://github.com/amzn/amzn-drivers. -# Current ENA source is based on ena_linux_2.11.0 tag. +# Current ENA source is based on ena_linux_2.11.1 tag. 
# obj-$(CONFIG_AMAZON_ENA_ETHERNET) += ena.o diff --git a/drivers/amazon/net/ena/ena_netdev.c b/drivers/amazon/net/ena/ena_netdev.c index 089142aa07ea6..5edd2d3256b25 100644 --- a/drivers/amazon/net/ena/ena_netdev.c +++ b/drivers/amazon/net/ena/ena_netdev.c @@ -4914,6 +4914,8 @@ static void __ena_shutoff(struct pci_dev *pdev, bool shutdown) adapter->reset_reason = ENA_REGS_RESET_SHUTDOWN; ena_destroy_device(adapter, true); + ena_phc_free(adapter); + if (shutdown) { netif_device_detach(netdev); dev_close(netdev); @@ -4932,8 +4934,6 @@ static void __ena_shutoff(struct pci_dev *pdev, bool shutdown) ena_com_delete_customer_metrics_buffer(ena_dev); - ena_phc_free(adapter); - ena_release_bars(ena_dev, pdev); pci_disable_device(pdev); diff --git a/drivers/amazon/net/ena/ena_netdev.h b/drivers/amazon/net/ena/ena_netdev.h index 19d0dd50dca7b..8beb6970b20e4 100644 --- a/drivers/amazon/net/ena/ena_netdev.h +++ b/drivers/amazon/net/ena/ena_netdev.h @@ -32,7 +32,7 @@ #define DRV_MODULE_GEN_MAJOR 2 #define DRV_MODULE_GEN_MINOR 11 -#define DRV_MODULE_GEN_SUBMINOR 0 +#define DRV_MODULE_GEN_SUBMINOR 1 #define DRV_MODULE_NAME "ena" #ifndef DRV_MODULE_GENERATION diff --git a/drivers/amazon/net/ena/kcompat.h b/drivers/amazon/net/ena/kcompat.h index 7b4122d365f19..7b4c2c5041082 100644 --- a/drivers/amazon/net/ena/kcompat.h +++ b/drivers/amazon/net/ena/kcompat.h @@ -532,7 +532,7 @@ static inline bool ena_u64_stats_fetch_retry(const struct u64_stats_sync *syncp, unsigned int start) { #if LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0) && \ - !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 3)) + !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9,3)) return u64_stats_fetch_retry_irq(syncp, start); #else return u64_stats_fetch_retry(syncp, start); @@ -542,7 +542,7 @@ static inline bool ena_u64_stats_fetch_retry(const struct u64_stats_sync *syncp, static inline unsigned int ena_u64_stats_fetch_begin(const struct u64_stats_sync *syncp) { #if LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0) && \ - !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 3)) + !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9,3)) return u64_stats_fetch_begin_irq(syncp); #else return u64_stats_fetch_begin(syncp); From 35fa0bfd207dc23f292fe76972e86d7d0df30ef3 Mon Sep 17 00:00:00 2001 From: Michael Margolin Date: Tue, 20 Feb 2024 18:22:47 +0200 Subject: [PATCH 161/175] AL2023 6.1 Update EFA driver to 2.8.0 Signed-off-by: Michael Margolin --- drivers/amazon/net/efa/Makefile | 8 +- drivers/amazon/net/efa/config.h | 55 -- drivers/amazon/net/efa/efa-abi.h | 135 --- drivers/amazon/net/efa/efa.h | 143 +-- drivers/amazon/net/efa/efa_admin_cmds_defs.h | 61 +- drivers/amazon/net/efa/efa_com_cmd.c | 17 +- drivers/amazon/net/efa/efa_com_cmd.h | 20 +- drivers/amazon/net/efa/efa_common_defs.h | 2 - drivers/amazon/net/efa/efa_gdr.c | 121 ++- drivers/amazon/net/efa/efa_io_defs.h | 42 +- drivers/amazon/net/efa/efa_main.c | 176 +--- drivers/amazon/net/efa/efa_neuron.c | 14 +- drivers/amazon/net/efa/efa_p2p.c | 25 +- drivers/amazon/net/efa/efa_p2p.h | 5 +- drivers/amazon/net/efa/efa_sysfs.c | 42 +- drivers/amazon/net/efa/efa_verbs.c | 953 ++----------------- drivers/amazon/net/efa/kcompat.h | 229 ----- drivers/amazon/net/efa/nv-p2p.h | 125 ++- include/uapi/rdma/efa-abi.h | 23 +- 19 files changed, 500 insertions(+), 1696 deletions(-) delete mode 100644 drivers/amazon/net/efa/config.h delete mode 100644 drivers/amazon/net/efa/efa-abi.h diff --git a/drivers/amazon/net/efa/Makefile 
b/drivers/amazon/net/efa/Makefile index 4399f594a93bf..121bdd4bf6ec4 100644 --- a/drivers/amazon/net/efa/Makefile +++ b/drivers/amazon/net/efa/Makefile @@ -1,6 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause +# Copyright 2024 Amazon.com, Inc. or its affiliates. All rights reserved. +# +# Makefile for Amazon Elastic Fabric Adapter (EFA) device driver. # -# Makefile for the Elastic Fabric Adapter (EFA) device drivers. -# EFA Source is: https://github.com/amzn/amzn-drivers. obj-$(CONFIG_AMAZON_EFA_INFINIBAND) += efa.o @@ -8,5 +10,3 @@ efa-y := efa_com.o efa_com_cmd.o efa_gdr.o efa_main.o efa_neuron.o efa_p2p.o efa-y += efa_verbs.o efa-$(CONFIG_SYSFS) += efa_sysfs.o - -ccflags-y += -include $(srctree)/drivers/amazon/net/efa/config.h diff --git a/drivers/amazon/net/efa/config.h b/drivers/amazon/net/efa/config.h deleted file mode 100644 index b4e7d9072b5e2..0000000000000 --- a/drivers/amazon/net/efa/config.h +++ /dev/null @@ -1,55 +0,0 @@ -#define HAVE_UMEM_SCATTERLIST_IF 1 -#define HAVE_CREATE_CQ_ATTR 1 -#define HAVE_IB_MODIFY_QP_IS_OK_FOUR_PARAMS 1 -#define HAVE_MAX_SEND_RCV_SGE 1 -#define HAVE_DEV_PARENT 1 -#define HAVE_CREATE_AH_RDMA_ATTR 1 -#define HAVE_POST_CONST_WR 1 -#define HAVE_IB_DEV_OPS 1 -#define HAVE_PD_CORE_ALLOCATION 1 -#define HAVE_UCONTEXT_CORE_ALLOCATION 1 -#define HAVE_NO_KVERBS_DRIVERS 1 -#define HAVE_SAFE_IB_ALLOC_DEVICE 1 -#define HAVE_UDATA_TO_DRV_CONTEXT 1 -#define HAVE_AH_CORE_ALLOCATION 1 -#define HAVE_ALLOC_PD_NO_UCONTEXT 1 -#define HAVE_DEREG_MR_UDATA 1 -#define HAVE_DESTROY_CQ_UDATA 1 -#define HAVE_UPSTREAM_EFA 1 -#define HAVE_KVZALLOC 1 -#define HAVE_IB_IS_UDATA_CLEARED 1 -#define HAVE_CQ_CORE_ALLOCATION 1 -#define HAVE_DESTROY_QP_UDATA 1 -#define HAVE_IB_QPT_DRIVER 1 -#define HAVE_IB_PORT_PHYS_STATE_LINK_UP 1 -#define HAVE_IB_DEVICE_OPS_COMMON 1 -#define HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE 1 -#define HAVE_IBDEV_PRINT 1 -#define HAVE_IBDEV_PRINT_RATELIMITED 1 -#define HAVE_IB_MR_LENGTH 1 -#define HAVE_BITFIELD_H 1 -#define HAVE_RDMA_NODE_UNSPECIFIED 1 -#define HAVE_PCI_VENDOR_ID_AMAZON 1 -#define HAVE_IB_UMEM_GET_NO_DMASYNC 1 -#define HAVE_IB_ACCESS_OPTIONAL 1 -#define HAVE_ATOMIC64_FETCH_INC 1 -#define HAVE_CORE_MMAP_XA 1 -#define HAVE_CREATE_AH_INIT_ATTR 1 -#define HAVE_AH_CORE_ALLOCATION_DESTROY_RC 1 -#define HAVE_IB_UMEM_GET_DEVICE_PARAM 1 -#define HAVE_IB_INT_DESTROY_CQ 1 -#define HAVE_DEALLOC_PD_UDATA_RC 1 -#define HAVE_RDMA_UMEM_FOR_EACH_DMA_BLOCK 1 -#define HAVE_UVERBS_CMD_MASK_NOT_NEEDED 1 -#define HAVE_SYSFS_EMIT 1 -#define HAVE_U32_PORT 1 -#define HAVE_SPLIT_STATS_ALLOC 1 -#define HAVE_XARRAY 1 -#define HAVE_STAT_DESC_STRUCT 1 -#define HAVE_IB_UMEM_NUM_DMA_BLOCKS 1 -#define HAVE_IB_REGISTER_DEVICE_DMA_DEVICE_PARAM 1 -#define HAVE_IB_UMEM_DMABUF_PINNED 1 -#define HAVE_QP_CORE_ALLOCATION 1 -#define HAVE_MR_DMABUF 1 -#define HAVE_MODULE_IMPORT_NS 1 -#define HAVE_EFA_P2P 1 \ No newline at end of file diff --git a/drivers/amazon/net/efa/efa-abi.h b/drivers/amazon/net/efa/efa-abi.h deleted file mode 100644 index 163ac79556d68..0000000000000 --- a/drivers/amazon/net/efa/efa-abi.h +++ /dev/null @@ -1,135 +0,0 @@ -/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */ -/* - * Copyright 2018-2022 Amazon.com, Inc. or its affiliates. All rights reserved. - */ - -#ifndef EFA_ABI_USER_H -#define EFA_ABI_USER_H - -#include - -/* - * Increment this value if any changes that break userspace ABI - * compatibility are made. - */ -#define EFA_UVERBS_ABI_VERSION 1 - -/* - * Keep structs aligned to 8 bytes. 
- * Keep reserved fields as arrays of __u8 named reserved_XXX where XXX is the - * hex bit offset of the field. - */ - -enum { - EFA_ALLOC_UCONTEXT_CMD_COMP_TX_BATCH = 1 << 0, - EFA_ALLOC_UCONTEXT_CMD_COMP_MIN_SQ_WR = 1 << 1, -}; - -struct efa_ibv_alloc_ucontext_cmd { - __u32 comp_mask; - __u8 reserved_20[4]; -}; - -enum efa_ibv_user_cmds_supp_udata { - EFA_USER_CMDS_SUPP_UDATA_QUERY_DEVICE = 1 << 0, - EFA_USER_CMDS_SUPP_UDATA_CREATE_AH = 1 << 1, -}; - -struct efa_ibv_alloc_ucontext_resp { - __u32 comp_mask; - __u32 cmds_supp_udata_mask; - __u16 sub_cqs_per_cq; - __u16 inline_buf_size; - __u32 max_llq_size; /* bytes */ - __u16 max_tx_batch; /* units of 64 bytes */ - __u16 min_sq_wr; - __u8 reserved_a0[4]; -}; - -struct efa_ibv_alloc_pd_resp { - __u32 comp_mask; - __u16 pdn; - __u8 reserved_30[2]; -}; - -enum { - EFA_CREATE_CQ_WITH_COMPLETION_CHANNEL = 1 << 0, - EFA_CREATE_CQ_WITH_SGID = 1 << 1, -}; - -struct efa_ibv_create_cq { - __u32 comp_mask; - __u32 cq_entry_size; - __u16 num_sub_cqs; - __u8 flags; - __u8 reserved_58[5]; -}; - -enum { - EFA_CREATE_CQ_RESP_DB_OFF = 1 << 0, -}; - -struct efa_ibv_create_cq_resp { - __u32 comp_mask; - __u8 reserved_20[4]; - __aligned_u64 q_mmap_key; - __aligned_u64 q_mmap_size; - __u16 cq_idx; - __u8 reserved_d0[2]; - __u32 db_off; - __aligned_u64 db_mmap_key; -}; - -enum { - EFA_QP_DRIVER_TYPE_SRD = 0, -}; - -struct efa_ibv_create_qp { - __u32 comp_mask; - __u32 rq_ring_size; /* bytes */ - __u32 sq_ring_size; /* bytes */ - __u32 driver_qp_type; -}; - -struct efa_ibv_create_qp_resp { - __u32 comp_mask; - /* the offset inside the page of the rq db */ - __u32 rq_db_offset; - /* the offset inside the page of the sq db */ - __u32 sq_db_offset; - /* the offset inside the page of descriptors buffer */ - __u32 llq_desc_offset; - __aligned_u64 rq_mmap_key; - __aligned_u64 rq_mmap_size; - __aligned_u64 rq_db_mmap_key; - __aligned_u64 sq_db_mmap_key; - __aligned_u64 llq_desc_mmap_key; - __u16 send_sub_cq_idx; - __u16 recv_sub_cq_idx; - __u8 reserved_1e0[4]; -}; - -struct efa_ibv_create_ah_resp { - __u32 comp_mask; - __u16 efa_address_handle; - __u8 reserved_30[2]; -}; - -enum { - EFA_QUERY_DEVICE_CAPS_RDMA_READ = 1 << 0, - EFA_QUERY_DEVICE_CAPS_RNR_RETRY = 1 << 1, - EFA_QUERY_DEVICE_CAPS_CQ_NOTIFICATIONS = 1 << 2, - EFA_QUERY_DEVICE_CAPS_CQ_WITH_SGID = 1 << 3, -}; - -struct efa_ibv_ex_query_device_resp { - __u32 comp_mask; - __u32 max_sq_wr; - __u32 max_rq_wr; - __u16 max_sq_sge; - __u16 max_rq_sge; - __u32 max_rdma_size; - __u32 device_caps; -}; - -#endif /* EFA_ABI_USER_H */ diff --git a/drivers/amazon/net/efa/efa.h b/drivers/amazon/net/efa/efa.h index 34ccbac76b451..f1dc5739dcd70 100644 --- a/drivers/amazon/net/efa/efa.h +++ b/drivers/amazon/net/efa/efa.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #ifndef _EFA_H_ @@ -12,9 +12,9 @@ #include #include +#include #include -#include "efa-abi.h" #include "efa_com_cmd.h" #define DRV_MODULE_NAME "efa" @@ -68,25 +68,13 @@ struct efa_dev { struct efa_eq *eqs; unsigned int neqs; -#ifdef HAVE_XARRAY /* Only stores CQs with interrupts enabled */ struct xarray cqs_xa; -#else - /* If xarray isn't available keep an array of all possible CQs */ - struct efa_cq *cqs_arr[BIT(sizeof_field(struct efa_admin_create_cq_resp, - cq_idx) * 8)]; -#endif }; struct efa_ucontext { struct ib_ucontext ibucontext; u16 uarn; -#ifndef HAVE_CORE_MMAP_XA - /* Protects ucontext state */ - struct mutex lock; - struct list_head pending_mmaps; - u32 mmap_page; -#endif /* !defined(HAVE_CORE_MMAP_XA) */ }; struct efa_pd { @@ -94,13 +82,21 @@ struct efa_pd { u16 pdn; }; +struct efa_mr_interconnect_info { + u16 recv_ic_id; + u16 rdma_read_ic_id; + u16 rdma_recv_ic_id; + u8 recv_ic_id_valid : 1; + u8 rdma_read_ic_id_valid : 1; + u8 rdma_recv_ic_id_valid : 1; +}; + struct efa_mr { struct ib_mr ibmr; struct ib_umem *umem; -#ifdef HAVE_EFA_P2P + struct efa_mr_interconnect_info ic_info; struct efa_p2pmem *p2pmem; u64 p2p_ticket; -#endif }; struct efa_cq { @@ -161,154 +157,39 @@ int efa_query_gid(struct ib_device *ibdev, port_t port, int index, union ib_gid *gid); int efa_query_pkey(struct ib_device *ibdev, port_t port, u16 index, u16 *pkey); -#ifdef HAVE_ALLOC_PD_NO_UCONTEXT int efa_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata); -#else -int efa_alloc_pd(struct ib_pd *ibpd, - struct ib_ucontext *ibucontext, - struct ib_udata *udata); -#endif -#ifdef HAVE_DEALLOC_PD_UDATA_RC int efa_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata); -#elif defined(HAVE_DEALLOC_PD_UDATA) -void efa_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata); -#elif defined(HAVE_PD_CORE_ALLOCATION) -void efa_dealloc_pd(struct ib_pd *ibpd); -#else -int efa_dealloc_pd(struct ib_pd *ibpd); -struct ib_pd *efa_kzalloc_pd(struct ib_device *ibdev, - struct ib_ucontext *ibucontext, - struct ib_udata *udata); -#endif -#ifdef HAVE_DESTROY_QP_UDATA int efa_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata); -#else -int efa_destroy_qp(struct ib_qp *ibqp); -#endif -#ifdef HAVE_QP_CORE_ALLOCATION int efa_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr, struct ib_udata *udata); -#else -struct ib_qp *efa_kzalloc_qp(struct ib_pd *ibpd, - struct ib_qp_init_attr *init_attr, - struct ib_udata *udata); -#endif -#ifdef HAVE_IB_INT_DESTROY_CQ int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); -#elif defined(HAVE_IB_VOID_DESTROY_CQ) -void efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); -#elif defined(HAVE_DESTROY_CQ_UDATA) -int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); -#else -int efa_destroy_cq(struct ib_cq *ibcq); -#endif int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, struct ib_udata *udata); -#ifndef HAVE_CQ_CORE_ALLOCATION -#ifdef HAVE_CREATE_CQ_NO_UCONTEXT -struct ib_cq *efa_kzalloc_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata); -#elif defined(HAVE_CREATE_CQ_ATTR) -struct ib_cq *efa_kzalloc_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_ucontext *ibucontext, - struct ib_udata *udata); -#endif -#endif struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata); -#ifdef HAVE_MR_DMABUF struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, u64 length, u64 virt_addr, int fd, 
int access_flags, struct ib_udata *udata); -#endif -#ifdef HAVE_DEREG_MR_UDATA int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata); -#else -int efa_dereg_mr(struct ib_mr *ibmr); -#endif int efa_get_port_immutable(struct ib_device *ibdev, port_t port_num, struct ib_port_immutable *immutable); int efa_alloc_ucontext(struct ib_ucontext *ibucontext, struct ib_udata *udata); -#ifdef HAVE_UCONTEXT_CORE_ALLOCATION void efa_dealloc_ucontext(struct ib_ucontext *ibucontext); -#else -int efa_dealloc_ucontext(struct ib_ucontext *ibucontext); -struct ib_ucontext *efa_kzalloc_ucontext(struct ib_device *ibdev, - struct ib_udata *udata); -#endif int efa_mmap(struct ib_ucontext *ibucontext, struct vm_area_struct *vma); -#ifdef HAVE_CORE_MMAP_XA void efa_mmap_free(struct rdma_user_mmap_entry *rdma_entry); -#endif int efa_create_ah(struct ib_ah *ibah, -#ifdef HAVE_CREATE_AH_INIT_ATTR struct rdma_ah_init_attr *init_attr, -#else - struct rdma_ah_attr *ah_attr, - u32 flags, -#endif struct ib_udata *udata); -#ifndef HAVE_AH_CORE_ALLOCATION -#ifdef HAVE_CREATE_DESTROY_AH_FLAGS -struct ib_ah *efa_kzalloc_ah(struct ib_pd *ibpd, - struct rdma_ah_attr *ah_attr, - u32 flags, - struct ib_udata *udata); -#elif defined(HAVE_CREATE_AH_RDMA_ATTR) -struct ib_ah *efa_kzalloc_ah(struct ib_pd *ibpd, - struct rdma_ah_attr *ah_attr, - struct ib_udata *udata); -#endif -#endif -#ifdef HAVE_AH_CORE_ALLOCATION_DESTROY_RC -int efa_destroy_ah(struct ib_ah *ibah, u32 flags); -#elif defined(HAVE_AH_CORE_ALLOCATION) -void efa_destroy_ah(struct ib_ah *ibah, u32 flags); -#elif defined(HAVE_CREATE_DESTROY_AH_FLAGS) int efa_destroy_ah(struct ib_ah *ibah, u32 flags); -#else -int efa_destroy_ah(struct ib_ah *ibah); -#endif -#ifndef HAVE_NO_KVERBS_DRIVERS -#ifdef HAVE_POST_CONST_WR -int efa_post_send(struct ib_qp *ibqp, - const struct ib_send_wr *wr, - const struct ib_send_wr **bad_wr); -#else -int efa_post_send(struct ib_qp *ibqp, - struct ib_send_wr *wr, - struct ib_send_wr **bad_wr); -#endif -#ifdef HAVE_POST_CONST_WR -int efa_post_recv(struct ib_qp *ibqp, - const struct ib_recv_wr *wr, - const struct ib_recv_wr **bad_wr); -#else -int efa_post_recv(struct ib_qp *ibqp, - struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr); -#endif -int efa_poll_cq(struct ib_cq *ibcq, int num_entries, - struct ib_wc *wc); -int efa_req_notify_cq(struct ib_cq *ibcq, - enum ib_cq_notify_flags flags); -struct ib_mr *efa_get_dma_mr(struct ib_pd *ibpd, int acc); -#endif int efa_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_udata *udata); enum rdma_link_layer efa_port_link_layer(struct ib_device *ibdev, port_t port_num); -#ifdef HAVE_SPLIT_STATS_ALLOC struct rdma_hw_stats *efa_alloc_hw_port_stats(struct ib_device *ibdev, port_t port_num); struct rdma_hw_stats *efa_alloc_hw_device_stats(struct ib_device *ibdev); -#else -struct rdma_hw_stats *efa_alloc_hw_stats(struct ib_device *ibdev, port_t port_num); -#endif int efa_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, port_t port_num, int index); diff --git a/drivers/amazon/net/efa/efa_admin_cmds_defs.h b/drivers/amazon/net/efa/efa_admin_cmds_defs.h index d4b9226088bd0..7377c8a9f4d5d 100644 --- a/drivers/amazon/net/efa/efa_admin_cmds_defs.h +++ b/drivers/amazon/net/efa/efa_admin_cmds_defs.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #ifndef _EFA_ADMIN_CMDS_H_ @@ -66,6 +66,7 @@ enum efa_admin_get_stats_type { EFA_ADMIN_GET_STATS_TYPE_BASIC = 0, EFA_ADMIN_GET_STATS_TYPE_MESSAGES = 1, EFA_ADMIN_GET_STATS_TYPE_RDMA_READ = 2, + EFA_ADMIN_GET_STATS_TYPE_RDMA_WRITE = 3, }; enum efa_admin_get_stats_scope { @@ -376,7 +377,9 @@ struct efa_admin_reg_mr_cmd { * 0 : local_write_enable - Local write permissions: * must be set for RQ buffers and buffers posted for * RDMA Read requests - * 1 : reserved1 - MBZ + * 1 : remote_write_enable - Remote write + * permissions: must be set to enable RDMA write to + * the region * 2 : remote_read_enable - Remote read permissions: * must be set to enable RDMA read from the region * 7:3 : reserved2 - MBZ @@ -412,6 +415,32 @@ struct efa_admin_reg_mr_resp { * memory region */ u32 r_key; + + /* + * Mask indicating which fields have valid values + * 0 : recv_ic_id + * 1 : rdma_read_ic_id + * 2 : rdma_recv_ic_id + */ + u8 validity; + + /* + * Physical interconnect used by the device to reach the MR for receive + * operation + */ + u8 recv_ic_id; + + /* + * Physical interconnect used by the device to reach the MR for RDMA + * read operation + */ + u8 rdma_read_ic_id; + + /* + * Physical interconnect used by the device to reach the MR for RDMA + * write receive + */ + u8 rdma_recv_ic_id; }; struct efa_admin_dereg_mr_cmd { @@ -568,6 +597,16 @@ struct efa_admin_rdma_read_stats { u64 read_resp_bytes; }; +struct efa_admin_rdma_write_stats { + u64 write_wrs; + + u64 write_bytes; + + u64 write_wr_err; + + u64 write_recv_bytes; +}; + struct efa_admin_acq_get_stats_resp { struct efa_admin_acq_common_desc acq_common_desc; @@ -577,6 +616,8 @@ struct efa_admin_acq_get_stats_resp { struct efa_admin_messages_stats messages_stats; struct efa_admin_rdma_read_stats rdma_read_stats; + + struct efa_admin_rdma_write_stats rdma_write_stats; } u; }; @@ -618,7 +659,11 @@ struct efa_admin_feature_device_attr_desc { * TX queues * 1 : rnr_retry - If set, RNR retry is supported on * modify QP command - * 31:2 : reserved - MBZ + * 2 : data_polling_128 - If set, 128 bytes data + * polling is supported + * 3 : rdma_write - If set, RDMA Write is supported + * on TX queues + * 31:4 : reserved - MBZ */ u32 device_caps; @@ -672,7 +717,7 @@ struct efa_admin_feature_queue_attr_desc { /* The maximum size of LLQ in bytes */ u32 max_llq_size; - /* Maximum number of SGEs for a single RDMA read WQE */ + /* Maximum number of SGEs for a single RDMA read/write WQE */ u16 max_wr_rdma_sges; /* @@ -977,8 +1022,14 @@ struct efa_admin_host_info { #define EFA_ADMIN_REG_MR_CMD_PHYS_PAGE_SIZE_SHIFT_MASK GENMASK(4, 0) #define EFA_ADMIN_REG_MR_CMD_MEM_ADDR_PHY_MODE_EN_MASK BIT(7) #define EFA_ADMIN_REG_MR_CMD_LOCAL_WRITE_ENABLE_MASK BIT(0) +#define EFA_ADMIN_REG_MR_CMD_REMOTE_WRITE_ENABLE_MASK BIT(1) #define EFA_ADMIN_REG_MR_CMD_REMOTE_READ_ENABLE_MASK BIT(2) +/* reg_mr_resp */ +#define EFA_ADMIN_REG_MR_RESP_RECV_IC_ID_MASK BIT(0) +#define EFA_ADMIN_REG_MR_RESP_RDMA_READ_IC_ID_MASK BIT(1) +#define EFA_ADMIN_REG_MR_RESP_RDMA_RECV_IC_ID_MASK BIT(2) + /* create_cq_cmd */ #define EFA_ADMIN_CREATE_CQ_CMD_INTERRUPT_MODE_ENABLED_MASK BIT(5) #define EFA_ADMIN_CREATE_CQ_CMD_VIRT_MASK BIT(6) @@ -991,6 +1042,8 @@ struct efa_admin_host_info { /* feature_device_attr_desc */ #define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RDMA_READ_MASK BIT(0) #define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RNR_RETRY_MASK BIT(1) +#define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_DATA_POLLING_128_MASK BIT(2) +#define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RDMA_WRITE_MASK BIT(3) /* create_eq_cmd */ #define 
EFA_ADMIN_CREATE_EQ_CMD_ENTRY_SIZE_WORDS_MASK GENMASK(4, 0) diff --git a/drivers/amazon/net/efa/efa_com_cmd.c b/drivers/amazon/net/efa/efa_com_cmd.c index e107c354bc349..43f79cb197d2d 100644 --- a/drivers/amazon/net/efa/efa_com_cmd.c +++ b/drivers/amazon/net/efa/efa_com_cmd.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause /* - * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved. */ #include "efa_com.h" @@ -270,6 +270,15 @@ int efa_com_register_mr(struct efa_com_dev *edev, result->l_key = cmd_completion.l_key; result->r_key = cmd_completion.r_key; + result->ic_info.recv_ic_id = cmd_completion.recv_ic_id; + result->ic_info.rdma_read_ic_id = cmd_completion.rdma_read_ic_id; + result->ic_info.rdma_recv_ic_id = cmd_completion.rdma_recv_ic_id; + result->ic_info.recv_ic_id_valid = EFA_GET(&cmd_completion.validity, + EFA_ADMIN_REG_MR_RESP_RECV_IC_ID); + result->ic_info.rdma_read_ic_id_valid = EFA_GET(&cmd_completion.validity, + EFA_ADMIN_REG_MR_RESP_RDMA_READ_IC_ID); + result->ic_info.rdma_recv_ic_id_valid = EFA_GET(&cmd_completion.validity, + EFA_ADMIN_REG_MR_RESP_RDMA_RECV_IC_ID); return 0; } @@ -795,6 +804,12 @@ int efa_com_get_stats(struct efa_com_dev *edev, result->rdma_read_stats.read_wr_err = resp.u.rdma_read_stats.read_wr_err; result->rdma_read_stats.read_resp_bytes = resp.u.rdma_read_stats.read_resp_bytes; break; + case EFA_ADMIN_GET_STATS_TYPE_RDMA_WRITE: + result->rdma_write_stats.write_wrs = resp.u.rdma_write_stats.write_wrs; + result->rdma_write_stats.write_bytes = resp.u.rdma_write_stats.write_bytes; + result->rdma_write_stats.write_wr_err = resp.u.rdma_write_stats.write_wr_err; + result->rdma_write_stats.write_recv_bytes = resp.u.rdma_write_stats.write_recv_bytes; + break; } return 0; diff --git a/drivers/amazon/net/efa/efa_com_cmd.h b/drivers/amazon/net/efa/efa_com_cmd.h index 0898ad5bc3405..720a99ba0f7d1 100644 --- a/drivers/amazon/net/efa/efa_com_cmd.h +++ b/drivers/amazon/net/efa/efa_com_cmd.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #ifndef _EFA_COM_CMD_H_ @@ -199,6 +199,15 @@ struct efa_com_reg_mr_params { u8 indirect; }; +struct efa_com_mr_interconnect_info { + u16 recv_ic_id; + u16 rdma_read_ic_id; + u16 rdma_recv_ic_id; + u8 recv_ic_id_valid : 1; + u8 rdma_read_ic_id_valid : 1; + u8 rdma_recv_ic_id_valid : 1; +}; + struct efa_com_reg_mr_result { /* * To be used in conjunction with local buffers references in SQ and @@ -210,6 +219,7 @@ struct efa_com_reg_mr_result { * accessed memory region */ u32 r_key; + struct efa_com_mr_interconnect_info ic_info; }; struct efa_com_dereg_mr_params { @@ -262,10 +272,18 @@ struct efa_com_rdma_read_stats { u64 read_resp_bytes; }; +struct efa_com_rdma_write_stats { + u64 write_wrs; + u64 write_bytes; + u64 write_wr_err; + u64 write_recv_bytes; +}; + union efa_com_get_stats_result { struct efa_com_basic_stats basic_stats; struct efa_com_messages_stats messages_stats; struct efa_com_rdma_read_stats rdma_read_stats; + struct efa_com_rdma_write_stats rdma_write_stats; }; int efa_com_create_qp(struct efa_com_dev *edev, diff --git a/drivers/amazon/net/efa/efa_common_defs.h b/drivers/amazon/net/efa/efa_common_defs.h index bbcf48f0eaca4..90af1c82c9c62 100644 --- a/drivers/amazon/net/efa/efa_common_defs.h +++ b/drivers/amazon/net/efa/efa_common_defs.h @@ -6,9 +6,7 @@ #ifndef _EFA_COMMON_H_ #define _EFA_COMMON_H_ -#ifdef HAVE_BITFIELD_H #include -#endif #define EFA_COMMON_SPEC_VERSION_MAJOR 2 #define EFA_COMMON_SPEC_VERSION_MINOR 0 diff --git a/drivers/amazon/net/efa/efa_gdr.c b/drivers/amazon/net/efa/efa_gdr.c index 24f8a082d10d5..eb588f0369664 100644 --- a/drivers/amazon/net/efa/efa_gdr.c +++ b/drivers/amazon/net/efa/efa_gdr.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* - * Copyright 2019-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2019-2023 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #include @@ -11,6 +11,23 @@ #define GPU_PAGE_SHIFT 16 #define GPU_PAGE_SIZE BIT_ULL(GPU_PAGE_SHIFT) +int efa_nv_peermem_p2p_get_pages(u64 p2p_token, u32 va_space, + u64 virtual_address, u64 length, + struct nvidia_p2p_page_table **page_table, + void (*free_callback)(void *data), void *data); + +int efa_nv_peermem_p2p_dma_map_pages(struct pci_dev *peer, + struct nvidia_p2p_page_table *page_table, + struct nvidia_p2p_dma_mapping **dma_mapping); + +int efa_nv_peermem_p2p_dma_unmap_pages(struct pci_dev *peer, + struct nvidia_p2p_page_table *page_table, + struct nvidia_p2p_dma_mapping *dma_mapping); + +int efa_nv_peermem_p2p_put_pages(u64 p2p_token, + u32 va_space, u64 virtual_address, + struct nvidia_p2p_page_table *page_table); + struct efa_nvmem_ops { int (*get_pages)(u64 p2p_token, u32 va_space, u64 virtual_address, u64 length, struct nvidia_p2p_page_table **page_table, @@ -23,6 +40,7 @@ struct efa_nvmem_ops { int (*dma_unmap_pages)(struct pci_dev *peer, struct nvidia_p2p_page_table *page_table, struct nvidia_p2p_dma_mapping *dma_mapping); + bool using_peermem_fp; }; struct efa_nvmem { @@ -51,22 +69,53 @@ static unsigned int nvmem_pgsz(struct efa_dev *dev, struct efa_p2pmem *p2pmem) } } -static int nvmem_get_fp(struct efa_nvmem *nvmem) +static int nvmem_get_peermem_fp(struct efa_nvmem_ops *ops) +{ + ops->get_pages = symbol_get(efa_nv_peermem_p2p_get_pages); + if (!ops->get_pages) + goto err_out; + + ops->put_pages = symbol_get(efa_nv_peermem_p2p_put_pages); + if (!ops->put_pages) + goto err_put_get_pages; + + ops->dma_map_pages = symbol_get(efa_nv_peermem_p2p_dma_map_pages); + if (!ops->dma_map_pages) + goto err_put_put_pages; + + ops->dma_unmap_pages = symbol_get(efa_nv_peermem_p2p_dma_unmap_pages); + if (!ops->dma_unmap_pages) + goto err_put_dma_map_pages; + + ops->using_peermem_fp = true; + return 0; + +err_put_dma_map_pages: + symbol_put(efa_nv_peermem_p2p_dma_map_pages); +err_put_put_pages: + symbol_put(efa_nv_peermem_p2p_put_pages); +err_put_get_pages: + symbol_put(efa_nv_peermem_p2p_get_pages); +err_out: + return -EINVAL; +} + +static int nvmem_get_nvidia_fp(struct efa_nvmem_ops *ops) { - nvmem->ops.get_pages = symbol_get(nvidia_p2p_get_pages); - if (!nvmem->ops.get_pages) + ops->get_pages = symbol_get(nvidia_p2p_get_pages); + if (!ops->get_pages) goto err_out; - nvmem->ops.put_pages = symbol_get(nvidia_p2p_put_pages); - if (!nvmem->ops.put_pages) + ops->put_pages = symbol_get(nvidia_p2p_put_pages); + if (!ops->put_pages) goto err_put_get_pages; - nvmem->ops.dma_map_pages = symbol_get(nvidia_p2p_dma_map_pages); - if (!nvmem->ops.dma_map_pages) + ops->dma_map_pages = symbol_get(nvidia_p2p_dma_map_pages); + if (!ops->dma_map_pages) goto err_put_put_pages; - nvmem->ops.dma_unmap_pages = symbol_get(nvidia_p2p_dma_unmap_pages); - if (!nvmem->ops.dma_unmap_pages) + ops->dma_unmap_pages = symbol_get(nvidia_p2p_dma_unmap_pages); + if (!ops->dma_unmap_pages) goto err_put_dma_map_pages; return 0; @@ -81,8 +130,24 @@ static int nvmem_get_fp(struct efa_nvmem *nvmem) return -EINVAL; } -static void nvmem_put_fp(void) +static int nvmem_get_fp(struct efa_nvmem_ops *ops) { + if (!nvmem_get_peermem_fp(ops)) + return 0; + + return nvmem_get_nvidia_fp(ops); +} + +static void nvmem_put_fp(struct efa_nvmem_ops *ops) +{ + if (ops->using_peermem_fp) { + symbol_put(efa_nv_peermem_p2p_dma_unmap_pages); + symbol_put(efa_nv_peermem_p2p_dma_map_pages); + symbol_put(efa_nv_peermem_p2p_put_pages); + symbol_put(efa_nv_peermem_p2p_get_pages); + return; + } + symbol_put(nvidia_p2p_dma_unmap_pages); 
symbol_put(nvidia_p2p_dma_map_pages); symbol_put(nvidia_p2p_put_pages); @@ -160,7 +225,7 @@ static struct efa_p2pmem *nvmem_get(struct efa_dev *dev, u64 ticket, u64 start, pinsz = virt_end - virt_start; nvmem->virt_start = virt_start; - err = nvmem_get_fp(nvmem); + err = nvmem_get_fp(&nvmem->ops); if (err) /* Nvidia module is not loaded */ goto err_free; @@ -179,7 +244,7 @@ static struct efa_p2pmem *nvmem_get(struct efa_dev *dev, u64 ticket, u64 start, err_put: nvmem->ops.put_pages(0, 0, virt_start, nvmem->pgtbl); err_put_fp: - nvmem_put_fp(); + nvmem_put_fp(&nvmem->ops); err_free: kfree(nvmem); return NULL; @@ -214,19 +279,22 @@ static void nvmem_release(struct efa_dev *dev, struct efa_p2pmem *p2pmem, nvmem->ops.put_pages(0, 0, nvmem->virt_start, nvmem->pgtbl); } - nvmem_put_fp(); + nvmem_put_fp(&nvmem->ops); kfree(nvmem); } -bool nvmem_is_supported(void) +static char *nvmem_provider_string(void) { - struct efa_nvmem dummynv = {}; + struct efa_nvmem_ops ops = {}; + char *prov_string; + + if (nvmem_get_fp(&ops)) + return ""; - if (nvmem_get_fp(&dummynv)) - return false; - nvmem_put_fp(); + prov_string = ops.using_peermem_fp ? "NVIDIA peermem" : "NVIDIA"; + nvmem_put_fp(&ops); - return true; + return prov_string; } struct nvmem_provider { @@ -236,6 +304,7 @@ struct nvmem_provider { static const struct nvmem_provider prov = { .p2p = { .ops = { + .get_provider_string = nvmem_provider_string, .try_get = nvmem_get, .to_page_list = nvmem_to_page_list, .release = nvmem_release, @@ -247,5 +316,17 @@ static const struct nvmem_provider prov = { const struct efa_p2p_provider *nvmem_get_provider(void) { + struct efa_nvmem_ops ops = {}; + int err; + + err = request_module("nvidia"); + if (!err) { + err = nvmem_get_nvidia_fp(&ops); + if (err) + request_module("efa_nv_peermem"); + else + nvmem_put_fp(&ops); + } + return &prov.p2p; } diff --git a/drivers/amazon/net/efa/efa_io_defs.h b/drivers/amazon/net/efa/efa_io_defs.h index 17ba8984b11e9..2d8eb96eaa81b 100644 --- a/drivers/amazon/net/efa/efa_io_defs.h +++ b/drivers/amazon/net/efa/efa_io_defs.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2018-2022 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef _EFA_IO_H_ @@ -23,6 +23,8 @@ enum efa_io_send_op_type { EFA_IO_SEND = 0, /* RDMA read */ EFA_IO_RDMA_READ = 1, + /* RDMA write */ + EFA_IO_RDMA_WRITE = 2, }; enum efa_io_comp_status { @@ -62,8 +64,7 @@ struct efa_io_tx_meta_desc { /* * control flags - * 3:0 : op_type - operation type: send/rdma/fast mem - * ops/etc + * 3:0 : op_type - enum efa_io_send_op_type * 4 : has_imm - immediate_data field carries valid * data. 
* 5 : inline_msg - inline mode - inline message data @@ -219,21 +220,22 @@ struct efa_io_cdesc_common { * 2:1 : q_type - enum efa_io_queue_type: send/recv * 3 : has_imm - indicates that immediate data is * present - for RX completions only - * 7:4 : reserved28 - MBZ + * 6:4 : op_type - enum efa_io_send_op_type + * 7 : reserved31 - MBZ */ u8 flags; /* local QP number */ u16 qp_num; - - /* Transferred length */ - u16 length; }; /* Tx completion descriptor */ struct efa_io_tx_cdesc { /* Common completion info */ struct efa_io_cdesc_common common; + + /* MBZ */ + u16 reserved16; }; /* Rx Completion Descriptor */ @@ -241,6 +243,9 @@ struct efa_io_rx_cdesc { /* Common completion info */ struct efa_io_cdesc_common common; + /* Transferred length bits[15:0] */ + u16 length; + /* Remote Address Handle FW index, 0xFFFF indicates invalid ah */ u16 ah; @@ -250,16 +255,26 @@ struct efa_io_rx_cdesc { u32 imm; }; +/* Rx Completion Descriptor RDMA write info */ +struct efa_io_rx_cdesc_rdma_write { + /* Transferred length bits[31:16] */ + u16 length_hi; +}; + /* Extended Rx Completion Descriptor */ struct efa_io_rx_cdesc_ex { /* Base RX completion info */ - struct efa_io_rx_cdesc rx_cdesc_base; + struct efa_io_rx_cdesc base; - /* - * Valid only in case of unknown AH (0xFFFF) and CQ set_src_addr is - * enabled. - */ - u8 src_addr[16]; + union { + struct efa_io_rx_cdesc_rdma_write rdma_write; + + /* + * Valid only in case of unknown AH (0xFFFF) and CQ + * set_src_addr is enabled. + */ + u8 src_addr[16]; + } u; }; /* tx_meta_desc */ @@ -285,5 +300,6 @@ struct efa_io_rx_cdesc_ex { #define EFA_IO_CDESC_COMMON_PHASE_MASK BIT(0) #define EFA_IO_CDESC_COMMON_Q_TYPE_MASK GENMASK(2, 1) #define EFA_IO_CDESC_COMMON_HAS_IMM_MASK BIT(3) +#define EFA_IO_CDESC_COMMON_OP_TYPE_MASK GENMASK(6, 4) #endif /* _EFA_IO_H_ */ diff --git a/drivers/amazon/net/efa/efa_main.c b/drivers/amazon/net/efa/efa_main.c index cc8200ac141de..7332c296936e6 100644 --- a/drivers/amazon/net/efa/efa_main.c +++ b/drivers/amazon/net/efa/efa_main.c @@ -1,25 +1,22 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause /* - * Copyright 2018-2022 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved. */ +#include "kcompat.h" #include #include #include #include #include +#include #include "efa.h" #include "efa_sysfs.h" -#ifdef HAVE_EFA_P2P #include "efa_p2p.h" -#endif -#ifndef HAVE_PCI_VENDOR_ID_AMAZON -#define PCI_VENDOR_ID_AMAZON 0x1d0f -#endif #define PCI_DEV_ID_EFA0_VF 0xefa0 #define PCI_DEV_ID_EFA1_VF 0xefa1 #define PCI_DEV_ID_EFA2_VF 0xefa2 @@ -32,14 +29,14 @@ static const struct pci_device_id efa_pci_tbl[] = { }; #define DRV_MODULE_VER_MAJOR 2 -#define DRV_MODULE_VER_MINOR 1 -#define DRV_MODULE_VER_SUBMINOR 1 +#define DRV_MODULE_VER_MINOR 8 +#define DRV_MODULE_VER_SUBMINOR 0 #ifndef DRV_MODULE_VERSION #define DRV_MODULE_VERSION \ __stringify(DRV_MODULE_VER_MAJOR) "." \ __stringify(DRV_MODULE_VER_MINOR) "." 
\ - __stringify(DRV_MODULE_VER_SUBMINOR) "g" + __stringify(DRV_MODULE_VER_SUBMINOR) "a" #endif MODULE_VERSION(DRV_MODULE_VERSION); @@ -60,6 +57,8 @@ MODULE_DEVICE_TABLE(pci, efa_pci_tbl); (BIT(EFA_ADMIN_FATAL_ERROR) | BIT(EFA_ADMIN_WARNING) | \ BIT(EFA_ADMIN_NOTIFICATION) | BIT(EFA_ADMIN_KEEP_ALIVE)) +extern const struct uapi_definition efa_uapi_defs[]; + /* This handler will called for unknown event group or unimplemented handlers */ static void unimplemented_aenq_handler(void *data, struct efa_admin_aenq_entry *aenq_e) @@ -98,12 +97,8 @@ static void efa_process_comp_eqe(struct efa_dev *dev, struct efa_admin_eqe *eqe) u16 cqn = eqe->u.comp_event.cqn; struct efa_cq *cq; -#ifdef HAVE_XARRAY /* Safe to load as we're in irq and removal calls synchronize_irq() */ cq = xa_load(&dev->cqs_xa, cqn); -#else - cq = dev->cqs_arr[cqn]; -#endif if (unlikely(!cq)) { ibdev_err_ratelimited(&dev->ibdev, "Completion event on non-existent CQ[%u]", @@ -299,7 +294,7 @@ static void efa_set_host_info(struct efa_dev *dev) EFA_SET(&hinf->driver_ver, EFA_ADMIN_HOST_INFO_DRIVER_SUB_MINOR, DRV_MODULE_VER_SUBMINOR); EFA_SET(&hinf->driver_ver, EFA_ADMIN_HOST_INFO_DRIVER_MODULE_TYPE, - "g"[0]); + "a"[0]); EFA_SET(&hinf->bdf, EFA_ADMIN_HOST_INFO_BUS, dev->pdev->bus->number); EFA_SET(&hinf->bdf, EFA_ADMIN_HOST_INFO_DEVICE, PCI_SLOT(dev->pdev->devfn)); @@ -309,9 +304,8 @@ static void efa_set_host_info(struct efa_dev *dev) EFA_COMMON_SPEC_VERSION_MAJOR); EFA_SET(&hinf->spec_ver, EFA_ADMIN_HOST_INFO_SPEC_MINOR, EFA_COMMON_SPEC_VERSION_MINOR); -#ifdef HAVE_EFA_P2P + EFA_SET(&hinf->flags, EFA_ADMIN_HOST_INFO_INTREE, 1); EFA_SET(&hinf->flags, EFA_ADMIN_HOST_INFO_GDR, 1); -#endif efa_com_set_feature_ex(&dev->edev, &resp, &cmd, EFA_ADMIN_HOST_INFO, hinf_dma, bufsz); @@ -385,102 +379,44 @@ static void efa_destroy_eqs(struct efa_dev *dev) kfree(dev->eqs); } -#ifdef HAVE_IB_DEV_OPS static const struct ib_device_ops efa_dev_ops = { -#ifdef HAVE_IB_DEVICE_OPS_COMMON .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_EFA, .uverbs_abi_ver = EFA_UVERBS_ABI_VERSION, -#endif -#ifdef HAVE_SPLIT_STATS_ALLOC .alloc_hw_port_stats = efa_alloc_hw_port_stats, .alloc_hw_device_stats = efa_alloc_hw_device_stats, -#else - .alloc_hw_stats = efa_alloc_hw_stats, -#endif -#ifdef HAVE_PD_CORE_ALLOCATION .alloc_pd = efa_alloc_pd, -#else - .alloc_pd = efa_kzalloc_pd, -#endif -#ifdef HAVE_UCONTEXT_CORE_ALLOCATION .alloc_ucontext = efa_alloc_ucontext, -#else - .alloc_ucontext = efa_kzalloc_ucontext, -#endif -#ifndef HAVE_UVERBS_CMD_MASK_NOT_NEEDED -#ifdef HAVE_AH_CORE_ALLOCATION - .create_ah = efa_create_ah, -#else - .create_ah = efa_kzalloc_ah, -#endif -#endif -#ifdef HAVE_CQ_CORE_ALLOCATION .create_cq = efa_create_cq, -#else - .create_cq = efa_kzalloc_cq, -#endif -#ifdef HAVE_QP_CORE_ALLOCATION .create_qp = efa_create_qp, -#else - .create_qp = efa_kzalloc_qp, -#endif -#ifdef HAVE_UVERBS_CMD_MASK_NOT_NEEDED .create_user_ah = efa_create_ah, -#endif .dealloc_pd = efa_dealloc_pd, .dealloc_ucontext = efa_dealloc_ucontext, .dereg_mr = efa_dereg_mr, .destroy_ah = efa_destroy_ah, .destroy_cq = efa_destroy_cq, .destroy_qp = efa_destroy_qp, -#ifndef HAVE_NO_KVERBS_DRIVERS - .get_dma_mr = efa_get_dma_mr, -#endif .get_hw_stats = efa_get_hw_stats, .get_link_layer = efa_port_link_layer, .get_port_immutable = efa_get_port_immutable, .mmap = efa_mmap, -#ifdef HAVE_CORE_MMAP_XA .mmap_free = efa_mmap_free, -#endif .modify_qp = efa_modify_qp, -#ifndef HAVE_NO_KVERBS_DRIVERS - .poll_cq = efa_poll_cq, - .post_recv = efa_post_recv, - .post_send = efa_post_send, -#endif 
.query_device = efa_query_device, .query_gid = efa_query_gid, .query_pkey = efa_query_pkey, .query_port = efa_query_port, .query_qp = efa_query_qp, .reg_user_mr = efa_reg_mr, -#ifdef HAVE_MR_DMABUF .reg_user_mr_dmabuf = efa_reg_user_mr_dmabuf, -#endif -#ifndef HAVE_NO_KVERBS_DRIVERS - .req_notify_cq = efa_req_notify_cq, -#endif -#ifdef HAVE_AH_CORE_ALLOCATION INIT_RDMA_OBJ_SIZE(ib_ah, efa_ah, ibah), -#endif -#ifdef HAVE_CQ_CORE_ALLOCATION INIT_RDMA_OBJ_SIZE(ib_cq, efa_cq, ibcq), -#endif -#ifdef HAVE_PD_CORE_ALLOCATION INIT_RDMA_OBJ_SIZE(ib_pd, efa_pd, ibpd), -#endif -#ifdef HAVE_QP_CORE_ALLOCATION INIT_RDMA_OBJ_SIZE(ib_qp, efa_qp, ibqp), -#endif -#ifdef HAVE_UCONTEXT_CORE_ALLOCATION INIT_RDMA_OBJ_SIZE(ib_ucontext, efa_ucontext, ibucontext), -#endif }; -#endif static int efa_ib_device_add(struct efa_dev *dev) { @@ -519,89 +455,13 @@ static int efa_ib_device_add(struct efa_dev *dev) dev->ibdev.node_type = RDMA_NODE_UNSPECIFIED; dev->ibdev.phys_port_cnt = 1; dev->ibdev.num_comp_vectors = dev->neqs ?: 1; -#ifdef HAVE_DEV_PARENT dev->ibdev.dev.parent = &pdev->dev; -#else - dev->ibdev.dma_device = &pdev->dev; -#endif - -#ifndef HAVE_UVERBS_CMD_MASK_NOT_NEEDED - dev->ibdev.uverbs_cmd_mask |= - (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | - (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | - (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | - (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | - (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | - (1ull << IB_USER_VERBS_CMD_REG_MR) | - (1ull << IB_USER_VERBS_CMD_DEREG_MR) | - (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | - (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | - (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | - (1ull << IB_USER_VERBS_CMD_CREATE_QP) | - (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | - (1ull << IB_USER_VERBS_CMD_QUERY_QP) | - (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | - (1ull << IB_USER_VERBS_CMD_CREATE_AH) | - (1ull << IB_USER_VERBS_CMD_DESTROY_AH); -#endif - -#ifndef HAVE_UVERBS_CMD_MASK_NOT_NEEDED - dev->ibdev.uverbs_ex_cmd_mask = - (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE); -#endif -#ifndef HAVE_IB_DEVICE_OPS_COMMON -#ifdef HAVE_DRIVER_ID - dev->ibdev.driver_id = RDMA_DRIVER_EFA; -#endif - dev->ibdev.uverbs_abi_ver = EFA_UVERBS_ABI_VERSION; - dev->ibdev.owner = THIS_MODULE; -#endif -#ifdef HAVE_IB_DEV_OPS ib_set_device_ops(&dev->ibdev, &efa_dev_ops); -#else - dev->ibdev.alloc_hw_stats = efa_alloc_hw_stats; - dev->ibdev.alloc_pd = efa_kzalloc_pd; - dev->ibdev.alloc_ucontext = efa_kzalloc_ucontext; - dev->ibdev.create_ah = efa_kzalloc_ah; - dev->ibdev.create_cq = efa_kzalloc_cq; - dev->ibdev.create_qp = efa_kzalloc_qp; - dev->ibdev.dealloc_pd = efa_dealloc_pd; - dev->ibdev.dealloc_ucontext = efa_dealloc_ucontext; - dev->ibdev.dereg_mr = efa_dereg_mr; - dev->ibdev.destroy_ah = efa_destroy_ah; - dev->ibdev.destroy_cq = efa_destroy_cq; - dev->ibdev.destroy_qp = efa_destroy_qp; - dev->ibdev.get_dma_mr = efa_get_dma_mr; - dev->ibdev.get_hw_stats = efa_get_hw_stats; - dev->ibdev.get_link_layer = efa_port_link_layer; - dev->ibdev.get_port_immutable = efa_get_port_immutable; - dev->ibdev.mmap = efa_mmap; - dev->ibdev.modify_qp = efa_modify_qp; - dev->ibdev.poll_cq = efa_poll_cq; - dev->ibdev.post_recv = efa_post_recv; - dev->ibdev.post_send = efa_post_send; - dev->ibdev.query_device = efa_query_device; - dev->ibdev.query_gid = efa_query_gid; - dev->ibdev.query_pkey = efa_query_pkey; - dev->ibdev.query_port = efa_query_port; - dev->ibdev.query_qp = efa_query_qp; - dev->ibdev.reg_user_mr = efa_reg_mr; - dev->ibdev.req_notify_cq = efa_req_notify_cq; -#endif -#ifdef 
HAVE_IB_REGISTER_DEVICE_DMA_DEVICE_PARAM + dev->ibdev.driver_def = efa_uapi_defs; + err = ib_register_device(&dev->ibdev, "efa_%d", &pdev->dev); -#elif defined(HAVE_IB_REGISTER_DEVICE_TWO_PARAMS) - err = ib_register_device(&dev->ibdev, "efa_%d"); -#elif defined(HAVE_IB_REGISTER_DEVICE_NAME_PARAM) - err = ib_register_device(&dev->ibdev, "efa_%d", NULL); -#else - strscpy(dev->ibdev.name, "efa_%d", - sizeof(dev->ibdev.name)); - - err = ib_register_device(&dev->ibdev, NULL); -#endif if (err) goto err_destroy_eqs; @@ -708,11 +568,7 @@ static struct efa_dev *efa_probe_device(struct pci_dev *pdev) pci_set_master(pdev); -#ifdef HAVE_SAFE_IB_ALLOC_DEVICE dev = ib_alloc_device(efa_dev, ibdev); -#else - dev = (struct efa_dev *)ib_alloc_device(sizeof(*dev)); -#endif if (!dev) { dev_err(&pdev->dev, "Device alloc failed\n"); err = -ENOMEM; @@ -724,11 +580,7 @@ static struct efa_dev *efa_probe_device(struct pci_dev *pdev) edev->efa_dev = dev; edev->dmadev = &pdev->dev; dev->pdev = pdev; -#ifdef HAVE_XARRAY xa_init(&dev->cqs_xa); -#else - memset(dev->cqs_arr, 0, sizeof(dev->cqs_arr)); -#endif bars = pci_select_bars(pdev, IORESOURCE_MEM) & EFA_BASE_BAR_MASK; err = pci_request_selected_regions(pdev, bars, DRV_MODULE_NAME); @@ -819,9 +671,7 @@ static void efa_remove_device(struct pci_dev *pdev) efa_com_mmio_reg_read_destroy(edev); devm_iounmap(&pdev->dev, edev->reg_bar); efa_release_bars(dev, EFA_BASE_BAR_MASK); -#ifdef HAVE_XARRAY xa_destroy(&dev->cqs_xa); -#endif ib_dealloc_device(&dev->ibdev); pci_disable_device(pdev); } @@ -873,9 +723,7 @@ static int __init efa_init(void) return err; } -#ifdef HAVE_EFA_P2P efa_p2p_init(); -#endif return 0; } diff --git a/drivers/amazon/net/efa/efa_neuron.c b/drivers/amazon/net/efa/efa_neuron.c index ec2644e3079c4..15a9917ac2cc2 100644 --- a/drivers/amazon/net/efa/efa_neuron.c +++ b/drivers/amazon/net/efa/efa_neuron.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* - * Copyright 2021 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2021-2023 Amazon.com, Inc. or its affiliates. All rights reserved. */ #include @@ -154,6 +154,17 @@ static void neuronmem_release(struct efa_dev *dev, struct efa_p2pmem *p2pmem, kfree(neuronmem); } +static char *neuronmem_provider_string(void) +{ + struct efa_neuronmem dummy = {}; + + if (neuronmem_get_fp(&dummy)) + return ""; + + neuronmem_put_fp(); + return "NEURON"; +} + struct neuronmem_provider { struct efa_p2p_provider p2p; }; @@ -161,6 +172,7 @@ struct neuronmem_provider { static const struct neuronmem_provider prov = { .p2p = { .ops = { + .get_provider_string = neuronmem_provider_string, .try_get = neuronmem_get, .to_page_list = neuronmem_to_page_list, .release = neuronmem_release, diff --git a/drivers/amazon/net/efa/efa_p2p.c b/drivers/amazon/net/efa/efa_p2p.c index 9daf101288f43..ab05f3a5d170b 100644 --- a/drivers/amazon/net/efa/efa_p2p.c +++ b/drivers/amazon/net/efa/efa_p2p.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* - * Copyright 2019-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2019-2023 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #include "efa_p2p.h" @@ -44,6 +44,23 @@ static struct efa_p2pmem *ticket_to_p2p(u64 ticket) return NULL; } +char *efa_p2p_provider_string(void) +{ + const struct efa_p2p_provider *prov; + char *prov_string; + int i; + + for (i = 0; i < EFA_P2P_PROVIDER_MAX; i++) { + prov = prov_arr[i]; + prov_string = prov->ops.get_provider_string(); + if (prov_string[0] != '\0') + /* Only the first available provider is returned */ + return prov_string; + } + + return ""; +} + int efa_p2p_put(u64 ticket, bool in_cb) { struct efa_com_dereg_mr_params params = {}; @@ -81,6 +98,7 @@ struct efa_p2pmem *efa_p2p_get(struct efa_dev *dev, struct efa_mr *mr, u64 start u64 length) { const struct efa_p2p_provider *prov; + static bool message_printed; struct efa_p2pmem *p2pmem; u64 ticket; int i; @@ -101,6 +119,11 @@ struct efa_p2pmem *efa_p2p_get(struct efa_dev *dev, struct efa_mr *mr, u64 start p2pmem->prov = prov; mr->p2p_ticket = p2pmem->ticket; + if (!message_printed) { + pr_info("efa: Acquired peer memory using P2P"); + message_printed = true; + } + mutex_lock(&p2p_list_lock); list_add(&p2pmem->list, &p2p_list); mutex_unlock(&p2p_list_lock); diff --git a/drivers/amazon/net/efa/efa_p2p.h b/drivers/amazon/net/efa/efa_p2p.h index 89ee7a9935c11..5a4bf353ec633 100644 --- a/drivers/amazon/net/efa/efa_p2p.h +++ b/drivers/amazon/net/efa/efa_p2p.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2019-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2019-2023 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef _EFA_P2P_H_ @@ -9,6 +9,7 @@ #include "efa.h" struct efa_p2p_ops { + char *(*get_provider_string)(void); struct efa_p2pmem *(*try_get)(struct efa_dev *dev, u64 ticket, u64 start, u64 length); int (*to_page_list)(struct efa_dev *dev, struct efa_p2pmem *p2pmem, @@ -40,6 +41,7 @@ struct efa_p2pmem { }; void efa_p2p_init(void); +char *efa_p2p_provider_string(void); struct efa_p2pmem *efa_p2p_get(struct efa_dev *dev, struct efa_mr *mr, u64 start, u64 length); unsigned int efa_p2p_get_page_size(struct efa_dev *dev, @@ -50,7 +52,6 @@ int efa_p2p_put(u64 ticket, bool in_cb); /* Provider specific stuff go here */ const struct efa_p2p_provider *nvmem_get_provider(void); -bool nvmem_is_supported(void); const struct efa_p2p_provider *neuronmem_get_provider(void); diff --git a/drivers/amazon/net/efa/efa_sysfs.c b/drivers/amazon/net/efa/efa_sysfs.c index 8e8b2bd210db1..1cd729c758159 100644 --- a/drivers/amazon/net/efa/efa_sysfs.c +++ b/drivers/amazon/net/efa/efa_sysfs.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause /* - * Copyright 2018-2022 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved. */ #include "efa_sysfs.h" @@ -9,54 +9,26 @@ #include #include -#ifndef HAVE_SYSFS_EMIT -#include - -static int sysfs_emit(char *buf, const char *fmt, ...) 
-{ - va_list args; - int len; - - if (!buf) - return 0; - - va_start(args, fmt); - len = vscnprintf(buf, PAGE_SIZE, fmt, args); - va_end(args); - - return len; -} -#endif - -#ifdef HAVE_EFA_P2P #include "efa_p2p.h" -static ssize_t gdr_show(struct device *dev, struct device_attribute *attr, +static ssize_t p2p_show(struct device *dev, struct device_attribute *attr, char *buf) { - if (nvmem_is_supported()) - return sysfs_emit(buf, "1\n"); - - return sysfs_emit(buf, "0\n"); + return sysfs_emit(buf, "%s\n", efa_p2p_provider_string()); } -static DEVICE_ATTR_RO(gdr); -#endif +static DEVICE_ATTR_RO(p2p); int efa_sysfs_init(struct efa_dev *dev) { -#ifdef HAVE_EFA_P2P struct device *device = &dev->pdev->dev; - if (device_create_file(device, &dev_attr_gdr)) - dev_err(device, "Failed to create GDR sysfs file\n"); -#endif + if (device_create_file(device, &dev_attr_p2p)) + dev_err(device, "Failed to create P2P sysfs file\n"); return 0; } void efa_sysfs_destroy(struct efa_dev *dev) { -#ifdef HAVE_EFA_P2P - device_remove_file(&dev->pdev->dev, &dev_attr_gdr); -#endif + device_remove_file(&dev->pdev->dev, &dev_attr_p2p); } diff --git a/drivers/amazon/net/efa/efa_verbs.c b/drivers/amazon/net/efa/efa_verbs.c index c9535ee90108b..6ed927b005a5c 100644 --- a/drivers/amazon/net/efa/efa_verbs.c +++ b/drivers/amazon/net/efa/efa_verbs.c @@ -1,13 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* - * Copyright 2018-2022 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved. */ #include "kcompat.h" -#ifdef HAVE_MR_DMABUF #include #include -#endif #include #include @@ -15,16 +13,15 @@ #include #include #include -#ifdef HAVE_UDATA_TO_DRV_CONTEXT #include -#endif +#define UVERBS_MODULE_NAME efa_ib +#include +#include #include "efa.h" #include "efa_io_defs.h" -#ifdef HAVE_EFA_P2P #include "efa_p2p.h" -#endif enum { EFA_MMAP_DMA_PAGE = 0, @@ -38,9 +35,6 @@ enum { struct efa_user_mmap_entry { struct rdma_user_mmap_entry rdma_entry; -#ifndef HAVE_CORE_MMAP_XA - struct list_head list; -#endif u64 address; u8 mmap_flag; }; @@ -73,25 +67,20 @@ struct efa_user_mmap_entry { op(EFA_RDMA_READ_BYTES, "rdma_read_bytes") \ op(EFA_RDMA_READ_WR_ERR, "rdma_read_wr_err") \ op(EFA_RDMA_READ_RESP_BYTES, "rdma_read_resp_bytes") \ + op(EFA_RDMA_WRITE_WRS, "rdma_write_wrs") \ + op(EFA_RDMA_WRITE_BYTES, "rdma_write_bytes") \ + op(EFA_RDMA_WRITE_WR_ERR, "rdma_write_wr_err") \ + op(EFA_RDMA_WRITE_RECV_BYTES, "rdma_write_recv_bytes") \ #define EFA_STATS_ENUM(ename, name) ename, -#ifdef HAVE_STAT_DESC_STRUCT #define EFA_STATS_STR(ename, nam) \ [ename].name = nam, -#else -#define EFA_STATS_STR(ename, nam) \ - [ename] = nam, -#endif enum efa_hw_device_stats { EFA_DEFINE_DEVICE_STATS(EFA_STATS_ENUM) }; -#ifdef HAVE_STAT_DESC_STRUCT static const struct rdma_stat_desc efa_device_stats_descs[] = { -#else -static const char *const efa_device_stats_descs[] = { -#endif EFA_DEFINE_DEVICE_STATS(EFA_STATS_STR) }; @@ -99,11 +88,7 @@ enum efa_hw_port_stats { EFA_DEFINE_PORT_STATS(EFA_STATS_ENUM) }; -#ifdef HAVE_STAT_DESC_STRUCT static const struct rdma_stat_desc efa_port_stats_descs[] = { -#else -static const char *const efa_port_stats_descs[] = { -#endif EFA_DEFINE_PORT_STATS(EFA_STATS_STR) }; @@ -224,77 +209,6 @@ static void efa_free_mapped(struct efa_dev *dev, void *cpu_addr, free_pages_exact(cpu_addr, size); } -#ifndef HAVE_CORE_MMAP_XA -/* - * This is only called when the ucontext is destroyed and there can be no - * concurrent query via mmap or allocate on the 
database, thus we can be sure no - * other thread is using the entry pointer. We also know that all the BAR - * pages have either been zap'd or munmaped at this point. Normal pages are - * refcounted and will be freed at the proper time. - */ -static void mmap_entries_remove_free(struct efa_dev *dev, - struct efa_ucontext *ucontext) -{ - struct efa_user_mmap_entry *entry, *tmp; - - list_for_each_entry_safe(entry, tmp, &ucontext->pending_mmaps, list) { - list_del(&entry->list); - ibdev_dbg( - &dev->ibdev, - "mmap: key[%#llx] addr[%#llx] len[%#zx] removed\n", - rdma_user_mmap_get_offset(&entry->rdma_entry), - entry->address, entry->rdma_entry.npages * PAGE_SIZE); - kfree(entry); - } -} - -static int mmap_entry_validate(struct efa_ucontext *ucontext, - struct vm_area_struct *vma) -{ - size_t length = vma->vm_end - vma->vm_start; - - if (length % PAGE_SIZE != 0 || !(vma->vm_flags & VM_SHARED)) { - ibdev_dbg(ucontext->ibucontext.device, - "length[%#zx] is not page size aligned[%#lx] or VM_SHARED is not set [%#lx]\n", - length, PAGE_SIZE, vma->vm_flags); - return -EINVAL; - } - - return 0; -} - -struct rdma_user_mmap_entry * -rdma_user_mmap_entry_get(struct ib_ucontext *ibucontext, - struct vm_area_struct *vma) -{ - struct efa_ucontext *ucontext = to_eucontext(ibucontext); - size_t length = vma->vm_end - vma->vm_start; - struct efa_user_mmap_entry *entry, *tmp; - u64 key = vma->vm_pgoff << PAGE_SHIFT; - int err; - - err = mmap_entry_validate(ucontext, vma); - if (err) - return NULL; - - mutex_lock(&ucontext->lock); - list_for_each_entry_safe(entry, tmp, &ucontext->pending_mmaps, list) { - if (rdma_user_mmap_get_offset(&entry->rdma_entry) == key && - entry->rdma_entry.npages * PAGE_SIZE == length) { - ibdev_dbg(ibucontext->device, - "mmap: key[%#llx] addr[%#llx] len[%#zx] removed\n", - key, entry->address, - entry->rdma_entry.npages * PAGE_SIZE); - mutex_unlock(&ucontext->lock); - return &entry->rdma_entry; - } - } - mutex_unlock(&ucontext->lock); - - return NULL; -} -#endif /* !defined (HAVE_CORE_MMAP_XA) */ - int efa_query_device(struct ib_device *ibdev, struct ib_device_attr *props, struct ib_udata *udata) @@ -327,13 +241,8 @@ int efa_query_device(struct ib_device *ibdev, props->max_cqe = dev_attr->max_cq_depth; props->max_qp_wr = min_t(u32, dev_attr->max_sq_depth, dev_attr->max_rq_depth); -#ifdef HAVE_MAX_SEND_RCV_SGE props->max_send_sge = dev_attr->max_sq_sge; props->max_recv_sge = dev_attr->max_rq_sge; -#else - props->max_sge = min_t(u16, dev_attr->max_sq_sge, - dev_attr->max_rq_sge); -#endif props->max_sge_rd = dev_attr->max_wr_rdma_sge; props->max_pkeys = 1; @@ -351,6 +260,12 @@ int efa_query_device(struct ib_device *ibdev, if (EFA_DEV_CAP(dev, RNR_RETRY)) resp.device_caps |= EFA_QUERY_DEVICE_CAPS_RNR_RETRY; + if (EFA_DEV_CAP(dev, DATA_POLLING_128)) + resp.device_caps |= EFA_QUERY_DEVICE_CAPS_DATA_POLLING_128; + + if (EFA_DEV_CAP(dev, RDMA_WRITE)) + resp.device_caps |= EFA_QUERY_DEVICE_CAPS_RDMA_WRITE; + if (dev->neqs) resp.device_caps |= EFA_QUERY_DEVICE_CAPS_CQ_NOTIFICATIONS; @@ -467,13 +382,7 @@ static int efa_pd_dealloc(struct efa_dev *dev, u16 pdn) return efa_com_dealloc_pd(&dev->edev, ¶ms); } -#ifdef HAVE_ALLOC_PD_NO_UCONTEXT int efa_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) -#else -int efa_alloc_pd(struct ib_pd *ibpd, - struct ib_ucontext *ibucontext, - struct ib_udata *udata) -#endif { struct efa_dev *dev = to_edev(ibpd->device); struct efa_ibv_alloc_pd_resp resp = {}; @@ -481,14 +390,6 @@ int efa_alloc_pd(struct ib_pd *ibpd, struct efa_pd *pd = to_epd(ibpd); int err; 
-#ifndef HAVE_NO_KVERBS_DRIVERS - if (!udata) { - ibdev_dbg(&dev->ibdev, "udata is NULL\n"); - err = -EOPNOTSUPP; - goto err_out; - } -#endif - if (udata->inlen && !ib_is_udata_cleared(udata, 0, udata->inlen)) { ibdev_dbg(&dev->ibdev, @@ -525,61 +426,14 @@ int efa_alloc_pd(struct ib_pd *ibpd, return err; } -#ifndef HAVE_PD_CORE_ALLOCATION -struct ib_pd *efa_kzalloc_pd(struct ib_device *ibdev, - struct ib_ucontext *ibucontext, - struct ib_udata *udata) -{ - struct efa_dev *dev = to_edev(ibdev); - struct efa_pd *pd; - int err; - - pd = kzalloc(sizeof(*pd), GFP_KERNEL); - if (!pd) { - atomic64_inc(&dev->stats.alloc_pd_err); - return ERR_PTR(-ENOMEM); - } - - pd->ibpd.device = ibdev; - -#ifdef HAVE_ALLOC_PD_NO_UCONTEXT - err = efa_alloc_pd(&pd->ibpd, udata); -#else - err = efa_alloc_pd(&pd->ibpd, ibucontext, udata); -#endif - if (err) - goto err_free; - - return &pd->ibpd; - -err_free: - kfree(pd); - return ERR_PTR(err); -} -#endif - -#ifdef HAVE_DEALLOC_PD_UDATA_RC int efa_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) -#elif defined(HAVE_DEALLOC_PD_UDATA) -void efa_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) -#elif defined(HAVE_PD_CORE_ALLOCATION) -void efa_dealloc_pd(struct ib_pd *ibpd) -#else -int efa_dealloc_pd(struct ib_pd *ibpd) -#endif { struct efa_dev *dev = to_edev(ibpd->device); struct efa_pd *pd = to_epd(ibpd); ibdev_dbg(&dev->ibdev, "Dealloc pd[%d]\n", pd->pdn); efa_pd_dealloc(dev, pd->pdn); -#ifndef HAVE_PD_CORE_ALLOCATION - kfree(pd); - return 0; -#elif defined(HAVE_DEALLOC_PD_UDATA_RC) - return 0; -#endif } static int efa_destroy_qp_handle(struct efa_dev *dev, u32 qp_handle) @@ -597,11 +451,7 @@ static void efa_qp_user_mmap_entries_remove(struct efa_qp *qp) rdma_user_mmap_entry_remove(qp->sq_db_mmap_entry); } -#ifdef HAVE_DESTROY_QP_UDATA int efa_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) -#else -int efa_destroy_qp(struct ib_qp *ibqp) -#endif { struct efa_dev *dev = to_edev(ibqp->pd->device); struct efa_qp *qp = to_eqp(ibqp); @@ -609,12 +459,12 @@ int efa_destroy_qp(struct ib_qp *ibqp) ibdev_dbg(&dev->ibdev, "Destroy qp[%u]\n", ibqp->qp_num); - efa_qp_user_mmap_entries_remove(qp); - err = efa_destroy_qp_handle(dev, qp->qp_handle); if (err) return err; + efa_qp_user_mmap_entries_remove(qp); + if (qp->rq_cpu_addr) { ibdev_dbg(&dev->ibdev, "qp->cpu_addr[0x%p] freed: size[%lu], dma[%pad]\n", @@ -624,13 +474,9 @@ int efa_destroy_qp(struct ib_qp *ibqp) qp->rq_size, DMA_TO_DEVICE); } -#ifndef HAVE_QP_CORE_ALLOCATION - kfree(qp); -#endif return 0; } -#ifdef HAVE_CORE_MMAP_XA static struct rdma_user_mmap_entry* efa_user_mmap_entry_insert(struct ib_ucontext *ucontext, u64 address, size_t length, @@ -655,47 +501,6 @@ efa_user_mmap_entry_insert(struct ib_ucontext *ucontext, return &entry->rdma_entry; } -#else -static struct rdma_user_mmap_entry * -efa_user_mmap_entry_insert(struct ib_ucontext *ibucontext, u64 address, - size_t length, u8 mmap_flag, u64 *offset) -{ - struct efa_ucontext *ucontext = to_eucontext(ibucontext); - struct efa_user_mmap_entry *entry; - u64 next_mmap_page; - - entry = kzalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) - return NULL; - - entry->address = address; - entry->rdma_entry.npages = (u32)DIV_ROUND_UP(length, PAGE_SIZE); - entry->mmap_flag = mmap_flag; - - mutex_lock(&ucontext->lock); - next_mmap_page = ucontext->mmap_page + (length >> PAGE_SHIFT); - if (next_mmap_page >= U32_MAX) { - ibdev_dbg(ucontext->ibucontext.device, "Too many mmap pages\n"); - mutex_unlock(&ucontext->lock); - kfree(entry); - return NULL; - } - - 
entry->rdma_entry.start_pgoff = ucontext->mmap_page; - ucontext->mmap_page = next_mmap_page; - list_add_tail(&entry->list, &ucontext->pending_mmaps); - mutex_unlock(&ucontext->lock); - - *offset = rdma_user_mmap_get_offset(&entry->rdma_entry); - ibdev_dbg( - ucontext->ibucontext.device, - "mmap: addr[%#llx], len[%#zx], key[%#llx] inserted\n", - entry->address, entry->rdma_entry.npages * PAGE_SIZE, - rdma_user_mmap_get_offset(&entry->rdma_entry)); - - return &entry->rdma_entry; -} -#endif static int qp_mmap_entries_setup(struct efa_qp *qp, struct efa_dev *dev, @@ -839,21 +644,8 @@ int efa_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr, struct efa_ucontext *ucontext; int err; -#ifndef HAVE_NO_KVERBS_DRIVERS - if (!udata) { - ibdev_dbg(&dev->ibdev, "udata is NULL\n"); - err = -EOPNOTSUPP; - goto err_out; - } -#endif - -#ifdef HAVE_UDATA_TO_DRV_CONTEXT ucontext = rdma_udata_to_drv_context(udata, struct efa_ucontext, ibucontext); -#else - ucontext = ibqp->pd->uobject ? to_eucontext(ibqp->pd->uobject->context) : - NULL; -#endif err = efa_qp_validate_cap(dev, init_attr); if (err) @@ -985,38 +777,6 @@ int efa_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr, return err; } -#ifndef HAVE_QP_CORE_ALLOCATION -struct ib_qp *efa_kzalloc_qp(struct ib_pd *ibpd, - struct ib_qp_init_attr *init_attr, - struct ib_udata *udata) -{ - struct efa_dev *dev = to_edev(ibpd->device); - struct efa_qp *qp; - int err; - - qp = kzalloc(sizeof(*qp), GFP_KERNEL); - if (!qp) { - atomic64_inc(&dev->stats.create_qp_err); - err = -ENOMEM; - goto err_out; - } - - qp->ibqp.device = ibpd->device; - qp->ibqp.pd = ibpd; - qp->ibqp.qp_type = init_attr->qp_type; - err = efa_create_qp(&qp->ibqp, init_attr, udata); - if (err) - goto err_free_qp; - - return &qp->ibqp; - -err_free_qp: - kfree(qp); -err_out: - return ERR_PTR(err); -} -#endif - static const struct { int valid; enum ib_qp_attr_mask req_param; @@ -1149,14 +909,8 @@ static int efa_modify_qp_validate(struct efa_dev *dev, struct efa_qp *qp, err = !efa_modify_srd_qp_is_ok(cur_state, new_state, qp_attr_mask); else -#ifdef HAVE_IB_MODIFY_QP_IS_OK_FOUR_PARAMS err = !ib_modify_qp_is_ok(cur_state, new_state, IB_QPT_UD, qp_attr_mask); -#else - err = !ib_modify_qp_is_ok(cur_state, new_state, IB_QPT_UD, - qp_attr_mask, - IB_LINK_LAYER_UNSPECIFIED); -#endif if (err) { ibdev_dbg(&dev->ibdev, "Invalid modify QP parameters\n"); @@ -1186,17 +940,8 @@ int efa_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, enum ib_qp_state new_state; int err; -#ifndef HAVE_NO_KVERBS_DRIVERS - if (!udata) { - ibdev_dbg(&dev->ibdev, "udata is NULL\n"); - return -EOPNOTSUPP; - } -#endif - -#ifdef HAVE_UVERBS_CMD_MASK_NOT_NEEDED if (qp_attr_mask & ~IB_QP_ATTR_STANDARD_BITS) return -EOPNOTSUPP; -#endif if (udata->inlen && !ib_is_udata_cleared(udata, 0, udata->inlen)) { @@ -1269,12 +1014,7 @@ static void efa_cq_user_mmap_entries_remove(struct efa_cq *cq) rdma_user_mmap_entry_remove(cq->mmap_entry); } -#if defined(HAVE_IB_VOID_DESTROY_CQ) || defined(HAVE_IB_INT_DESTROY_CQ) -#ifdef HAVE_IB_INT_DESTROY_CQ int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) -#else -void efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) -#endif { struct efa_dev *dev = to_edev(ibcq->device); struct efa_cq *cq = to_ecq(ibcq); @@ -1283,60 +1023,16 @@ void efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) "Destroy cq[%d] virt[0x%p] freed: size[%lu], dma[%pad]\n", cq->cq_idx, cq->cpu_addr, cq->size, &cq->dma_addr); - efa_cq_user_mmap_entries_remove(cq); 
efa_destroy_cq_idx(dev, cq->cq_idx); - if (cq->eq) { -#ifdef HAVE_XARRAY - xa_erase(&dev->cqs_xa, cq->cq_idx); -#else - dev->cqs_arr[cq->cq_idx] = NULL; -#endif - synchronize_irq(cq->eq->irq.irqn); - } - efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size, - DMA_FROM_DEVICE); -#ifndef HAVE_CQ_CORE_ALLOCATION - kfree(cq); -#endif -#ifdef HAVE_IB_INT_DESTROY_CQ - return 0; -#endif -} -#else -#ifdef HAVE_DESTROY_CQ_UDATA -int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) -#else -int efa_destroy_cq(struct ib_cq *ibcq) -#endif -{ - struct efa_dev *dev = to_edev(ibcq->device); - struct efa_cq *cq = to_ecq(ibcq); - int err; - - ibdev_dbg(&dev->ibdev, - "Destroy cq[%d] virt[0x%p] freed: size[%lu], dma[%pad]\n", - cq->cq_idx, cq->cpu_addr, cq->size, &cq->dma_addr); - efa_cq_user_mmap_entries_remove(cq); - err = efa_destroy_cq_idx(dev, cq->cq_idx); - if (err) - return err; - if (cq->eq) { -#ifdef HAVE_XARRAY xa_erase(&dev->cqs_xa, cq->cq_idx); -#else - dev->cqs_arr[cq->cq_idx] = NULL; -#endif synchronize_irq(cq->eq->irq.irqn); } efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size, DMA_FROM_DEVICE); - - kfree(cq); return 0; } -#endif static struct efa_eq *efa_vec2eq(struct efa_dev *dev, int vec) { @@ -1376,12 +1072,8 @@ static int cq_mmap_entries_setup(struct efa_dev *dev, struct efa_cq *cq, int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, struct ib_udata *udata) { -#ifdef HAVE_UDATA_TO_DRV_CONTEXT struct efa_ucontext *ucontext = rdma_udata_to_drv_context( udata, struct efa_ucontext, ibucontext); -#else - struct efa_ucontext *ucontext = to_ecq(ibcq)->ucontext; -#endif struct efa_com_create_cq_params params = {}; struct efa_ibv_create_cq_resp resp = {}; struct efa_com_create_cq_result result; @@ -1406,14 +1098,6 @@ int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, goto err_out; } -#ifndef HAVE_NO_KVERBS_DRIVERS - if (!udata) { - ibdev_dbg(ibdev, "udata is NULL\n"); - err = -EOPNOTSUPP; - goto err_out; - } -#endif - if (offsetofend(typeof(cmd), num_sub_cqs) > udata->inlen) { ibdev_dbg(ibdev, "Incompatible ABI params, no input udata\n"); @@ -1500,11 +1184,7 @@ int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, } if (cq->eq) { -#ifdef HAVE_XARRAY err = xa_err(xa_store(&dev->cqs_xa, cq->cq_idx, cq, GFP_KERNEL)); -#else - dev->cqs_arr[cq->cq_idx] = cq; -#endif if (err) { ibdev_dbg(ibdev, "Failed to store cq[%u] in xarray\n", cq->cq_idx); @@ -1529,11 +1209,7 @@ int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, err_xa_erase: if (cq->eq) -#ifdef HAVE_XARRAY xa_erase(&dev->cqs_xa, cq->cq_idx); -#else - dev->cqs_arr[cq->cq_idx] = NULL; -#endif err_remove_mmap: efa_cq_user_mmap_entries_remove(cq); err_destroy_cq: @@ -1547,49 +1223,6 @@ int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, return err; } -#ifndef HAVE_CQ_CORE_ALLOCATION -#ifdef HAVE_CREATE_CQ_NO_UCONTEXT -struct ib_cq *efa_kzalloc_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata) -#elif defined(HAVE_CREATE_CQ_ATTR) -struct ib_cq *efa_kzalloc_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_ucontext *ibucontext, - struct ib_udata *udata) -#endif -{ - struct efa_dev *dev = to_edev(ibdev); - struct efa_cq *cq; - int err; - - cq = kzalloc(sizeof(*cq), GFP_KERNEL); - if (!cq) { - atomic64_inc(&dev->stats.create_cq_err); - return ERR_PTR(-ENOMEM); - } - -#ifdef HAVE_UDATA_TO_DRV_CONTEXT - cq->ucontext = rdma_udata_to_drv_context(udata, struct 
efa_ucontext, - ibucontext); -#else - cq->ucontext = to_eucontext(ibucontext); -#endif - - cq->ibcq.device = ibdev; - err = efa_create_cq(&cq->ibcq, attr, udata); - if (err) - goto err_free_cq; - - return &cq->ibcq; - -err_free_cq: - kfree(cq); - return ERR_PTR(err); -} -#endif - -#ifdef HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE static int umem_to_page_list(struct efa_dev *dev, struct ib_umem *umem, u64 *page_list, @@ -1608,73 +1241,6 @@ static int umem_to_page_list(struct efa_dev *dev, return 0; } -#elif defined(HAVE_SG_DMA_PAGE_ITER) -static int umem_to_page_list(struct efa_dev *dev, - struct ib_umem *umem, - u64 *page_list, - u32 hp_cnt, - u8 hp_shift) -{ - u32 pages_in_hp = BIT(hp_shift - PAGE_SHIFT); - struct sg_dma_page_iter sg_iter; - unsigned int page_idx = 0; - unsigned int hp_idx = 0; - - ibdev_dbg(&dev->ibdev, "hp_cnt[%u], pages_in_hp[%u]\n", - hp_cnt, pages_in_hp); - - for_each_sg_dma_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) { - if (page_idx % pages_in_hp == 0) { - page_list[hp_idx] = sg_page_iter_dma_address(&sg_iter); - hp_idx++; - } - - page_idx++; - } - - return 0; -} -#elif defined(HAVE_UMEM_SCATTERLIST_IF) -static int umem_to_page_list(struct efa_dev *dev, - struct ib_umem *umem, - u64 *page_list, - u32 hp_cnt, - u8 hp_shift) -{ - u32 pages_in_hp = BIT(hp_shift - PAGE_SHIFT); - unsigned int page_idx = 0; - unsigned int pages_in_sg; - unsigned int hp_idx = 0; - struct scatterlist *sg; - unsigned int entry; - unsigned int i; - - ibdev_dbg(&dev->ibdev, "hp_cnt[%u], pages_in_hp[%u]\n", - hp_cnt, pages_in_hp); - - for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { - if (sg_dma_len(sg) & ~PAGE_MASK) { - ibdev_dbg(&dev->ibdev, - "sg_dma_len[%u] does not divide by PAGE_SIZE[%lu]\n", - sg_dma_len(sg), PAGE_SIZE); - return -EINVAL; - } - - pages_in_sg = sg_dma_len(sg) >> PAGE_SHIFT; - for (i = 0; i < pages_in_sg; i++) { - if (page_idx % pages_in_hp == 0) { - page_list[hp_idx] = sg_dma_address(sg) + - i * PAGE_SIZE; - hp_idx++; - } - - page_idx++; - } - } - - return 0; -} -#endif static struct scatterlist *efa_vmalloc_buf_to_sg(u64 *buf, int page_cnt) { @@ -1713,12 +1279,7 @@ static int pbl_chunk_list_create(struct efa_dev *dev, struct pbl_context *pbl) int sg_dma_cnt = pbl->phys.indirect.sg_dma_cnt; struct efa_com_ctrl_buff_info *ctrl_buf; u64 *cur_chunk_buf, *prev_chunk_buf; -#ifdef HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE struct ib_block_iter biter; -#else - struct scatterlist *sg; - unsigned int entry, payloads_in_sg; -#endif dma_addr_t dma_addr; int i; @@ -1752,7 +1313,6 @@ static int pbl_chunk_list_create(struct efa_dev *dev, struct pbl_context *pbl) chunk_idx = 0; payload_idx = 0; cur_chunk_buf = chunk_list->chunks[0].buf; -#ifdef HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE rdma_for_each_block(pages_sgl, &biter, sg_dma_cnt, EFA_CHUNK_PAYLOAD_SIZE) { cur_chunk_buf[payload_idx++] = @@ -1764,22 +1324,6 @@ static int pbl_chunk_list_create(struct efa_dev *dev, struct pbl_context *pbl) payload_idx = 0; } } -#else - for_each_sg(pages_sgl, sg, sg_dma_cnt, entry) { - payloads_in_sg = sg_dma_len(sg) >> EFA_CHUNK_PAYLOAD_SHIFT; - for (i = 0; i < payloads_in_sg; i++) { - cur_chunk_buf[payload_idx++] = - (sg_dma_address(sg) & ~(EFA_CHUNK_PAYLOAD_SIZE - 1)) + - (EFA_CHUNK_PAYLOAD_SIZE * i); - - if (payload_idx == EFA_PTRS_PER_CHUNK) { - chunk_idx++; - cur_chunk_buf = chunk_list->chunks[chunk_idx].buf; - payload_idx = 0; - } - } - } -#endif /* map chunks to dma and fill chunks next ptrs */ for (i = chunk_list_size - 1; i >= 0; i--) { @@ -1868,7 +1412,7 @@ static int pbl_continuous_initialize(struct 
efa_dev *dev, */ static int pbl_indirect_initialize(struct efa_dev *dev, struct pbl_context *pbl) { - u32 size_in_pages = DIV_ROUND_UP(pbl->pbl_buf_size_in_bytes, PAGE_SIZE); + u32 size_in_pages = DIV_ROUND_UP(pbl->pbl_buf_size_in_bytes, EFA_CHUNK_PAYLOAD_SIZE); struct scatterlist *sgl; int sg_dma_cnt, err; @@ -1918,11 +1462,7 @@ static void pbl_indirect_terminate(struct efa_dev *dev, struct pbl_context *pbl) /* create a page buffer list from a mapped user memory region */ static int pbl_create(struct efa_dev *dev, struct pbl_context *pbl, -#ifdef HAVE_EFA_P2P struct efa_mr *mr, -#else - struct ib_umem *umem, -#endif int hp_cnt, u8 hp_shift) { @@ -1935,16 +1475,11 @@ static int pbl_create(struct efa_dev *dev, if (is_vmalloc_addr(pbl->pbl_buf)) { pbl->physically_continuous = 0; -#ifdef HAVE_EFA_P2P if (mr->p2pmem) err = efa_p2p_to_page_list(dev, mr->p2pmem, pbl->pbl_buf); else err = umem_to_page_list(dev, mr->umem, pbl->pbl_buf, hp_cnt, hp_shift); -#else - err = umem_to_page_list(dev, umem, pbl->pbl_buf, hp_cnt, - hp_shift); -#endif if (err) goto err_free; @@ -1953,16 +1488,11 @@ static int pbl_create(struct efa_dev *dev, goto err_free; } else { pbl->physically_continuous = 1; -#ifdef HAVE_EFA_P2P if (mr->p2pmem) err = efa_p2p_to_page_list(dev, mr->p2pmem, pbl->pbl_buf); else err = umem_to_page_list(dev, mr->umem, pbl->pbl_buf, hp_cnt, hp_shift); -#else - err = umem_to_page_list(dev, umem, pbl->pbl_buf, hp_cnt, - hp_shift); -#endif if (err) goto err_free; @@ -1999,17 +1529,12 @@ static int efa_create_inline_pbl(struct efa_dev *dev, struct efa_mr *mr, int err; params->inline_pbl = 1; -#ifdef HAVE_EFA_P2P if (mr->p2pmem) err = efa_p2p_to_page_list(dev, mr->p2pmem, params->pbl.inline_pbl_array); else err = umem_to_page_list(dev, mr->umem, params->pbl.inline_pbl_array, params->page_num, params->page_shift); -#else - err = umem_to_page_list(dev, mr->umem, params->pbl.inline_pbl_array, - params->page_num, params->page_shift); -#endif if (err) return err; @@ -2026,13 +1551,8 @@ static int efa_create_pbl(struct efa_dev *dev, { int err; -#ifdef HAVE_EFA_P2P err = pbl_create(dev, pbl, mr, params->page_num, params->page_shift); -#else - err = pbl_create(dev, pbl, mr->umem, params->page_num, - params->page_shift); -#endif if (err) { ibdev_dbg(&dev->ibdev, "Failed to create pbl[%d]\n", err); return err; @@ -2058,54 +1578,6 @@ static int efa_create_pbl(struct efa_dev *dev, return 0; } -#ifndef HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE -static unsigned long efa_cont_pages(struct ib_umem *umem, - unsigned long page_size_cap, - u64 addr) -{ - unsigned long max_page_shift = fls64(page_size_cap); - struct scatterlist *sg; - u64 base = ~0, p = 0; - unsigned long tmp; - unsigned long m; - u64 len, pfn; - int i = 0; - int entry; - - addr = addr >> PAGE_SHIFT; - tmp = (unsigned long)addr; - m = find_first_bit(&tmp, BITS_PER_LONG); - m = min_t(unsigned long, max_page_shift - PAGE_SHIFT, m); - - for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { - len = DIV_ROUND_UP(sg_dma_len(sg), PAGE_SIZE); - pfn = sg_dma_address(sg) >> PAGE_SHIFT; - if (base + p != pfn) { - /* - * If either the offset or the new - * base are unaligned update m - */ - tmp = (unsigned long)(pfn | p); - if (!IS_ALIGNED(tmp, 1 << m)) - m = find_first_bit(&tmp, BITS_PER_LONG); - - base = pfn; - p = 0; - } - - p += len; - i += len; - } - - if (i) - m = min_t(unsigned long, ilog2(roundup_pow_of_two(i)), m); - else - m = 0; - - return BIT(PAGE_SHIFT + m); -} -#endif - static struct efa_mr *efa_alloc_mr(struct ib_pd *ibpd, int access_flags, struct ib_udata 
*udata) { @@ -2113,13 +1585,6 @@ static struct efa_mr *efa_alloc_mr(struct ib_pd *ibpd, int access_flags, int supp_access_flags; struct efa_mr *mr; -#ifndef HAVE_NO_KVERBS_DRIVERS - if (!udata) { - ibdev_dbg(&dev->ibdev, "udata is NULL\n"); - return ERR_PTR(-EINVAL); - } -#endif - if (udata && udata->inlen && !ib_is_udata_cleared(udata, 0, sizeof(udata->inlen))) { ibdev_dbg(&dev->ibdev, @@ -2129,11 +1594,10 @@ static struct efa_mr *efa_alloc_mr(struct ib_pd *ibpd, int access_flags, supp_access_flags = IB_ACCESS_LOCAL_WRITE | - (EFA_DEV_CAP(dev, RDMA_READ) ? IB_ACCESS_REMOTE_READ : 0); + (EFA_DEV_CAP(dev, RDMA_READ) ? IB_ACCESS_REMOTE_READ : 0) | + (EFA_DEV_CAP(dev, RDMA_WRITE) ? IB_ACCESS_REMOTE_WRITE : 0); -#ifdef HAVE_IB_ACCESS_OPTIONAL access_flags &= ~IB_ACCESS_OPTIONAL; -#endif if (access_flags & ~supp_access_flags) { ibdev_dbg(&dev->ibdev, "Unsupported access flags[%#x], supported[%#x]\n", @@ -2164,14 +1628,11 @@ static int efa_register_mr(struct ib_pd *ibpd, struct efa_mr *mr, u64 start, params.mr_length_in_bytes = length; params.permissions = access_flags; -#ifdef HAVE_EFA_P2P if (mr->p2pmem) { pg_sz = efa_p2p_get_page_size(dev, mr->p2pmem); goto skip_umem_pg_sz; } -#endif -#ifdef HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE pg_sz = ib_umem_find_best_pgsz(mr->umem, dev->dev_attr.page_size_cap, virt_addr); @@ -2180,30 +1641,15 @@ static int efa_register_mr(struct ib_pd *ibpd, struct efa_mr *mr, u64 start, dev->dev_attr.page_size_cap); return -EOPNOTSUPP; } -#else - pg_sz = efa_cont_pages(mr->umem, dev->dev_attr.page_size_cap, - virt_addr); -#endif /* defined(HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE) */ -#ifdef HAVE_EFA_P2P skip_umem_pg_sz: -#endif params.page_shift = order_base_2(pg_sz); -#ifdef HAVE_IB_UMEM_NUM_DMA_BLOCKS -#ifdef HAVE_EFA_P2P if (mr->p2pmem) params.page_num = DIV_ROUND_UP(length + (virt_addr & (pg_sz - 1)), pg_sz); else params.page_num = ib_umem_num_dma_blocks(mr->umem, pg_sz); -#else - params.page_num = ib_umem_num_dma_blocks(mr->umem, pg_sz); -#endif -#else - params.page_num = DIV_ROUND_UP(length + (virt_addr & (pg_sz - 1)), - pg_sz); -#endif ibdev_dbg(&dev->ibdev, "start %#llx length %#llx params.page_shift %u params.page_num %u\n", @@ -2232,21 +1678,22 @@ static int efa_register_mr(struct ib_pd *ibpd, struct efa_mr *mr, u64 start, mr->ibmr.lkey = result.l_key; mr->ibmr.rkey = result.r_key; -#ifdef HAVE_IB_MR_LENGTH mr->ibmr.length = length; -#endif -#ifdef HAVE_EFA_P2P + mr->ic_info.recv_ic_id = result.ic_info.recv_ic_id; + mr->ic_info.rdma_read_ic_id = result.ic_info.rdma_read_ic_id; + mr->ic_info.rdma_recv_ic_id = result.ic_info.rdma_recv_ic_id; + mr->ic_info.recv_ic_id_valid = result.ic_info.recv_ic_id_valid; + mr->ic_info.rdma_read_ic_id_valid = result.ic_info.rdma_read_ic_id_valid; + mr->ic_info.rdma_recv_ic_id_valid = result.ic_info.rdma_recv_ic_id_valid; if (mr->p2pmem) { mr->p2pmem->lkey = result.l_key; mr->p2pmem->needs_dereg = true; } -#endif ibdev_dbg(&dev->ibdev, "Registered mr[%d]\n", mr->ibmr.lkey); return 0; } -#ifdef HAVE_MR_DMABUF struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, u64 length, u64 virt_addr, int fd, int access_flags, @@ -2279,11 +1726,6 @@ struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, return &mr->ibmr; err_release: -#ifndef HAVE_IB_UMEM_DMABUF_PINNED - dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL); - dma_buf_unpin(umem_dmabuf->attach); - dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv); -#endif ib_umem_release(mr->umem); err_free: kfree(mr); @@ -2291,7 +1733,6 @@ struct ib_mr 
*efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, atomic64_inc(&dev->stats.reg_mr_err); return ERR_PTR(err); } -#endif struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, u64 virt_addr, int access_flags, @@ -2307,34 +1748,21 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, goto err_out; } -#ifdef HAVE_IB_UMEM_GET_DEVICE_PARAM mr->umem = ib_umem_get(ibpd->device, start, length, access_flags); -#elif defined(HAVE_IB_UMEM_GET_NO_DMASYNC) - mr->umem = ib_umem_get(udata, start, length, access_flags); -#elif defined(HAVE_IB_UMEM_GET_UDATA) - mr->umem = ib_umem_get(udata, start, length, access_flags, 0); -#else - mr->umem = ib_umem_get(ibpd->uobject->context, start, length, - access_flags, 0); -#endif if (IS_ERR(mr->umem)) { -#ifdef HAVE_EFA_P2P mr->p2pmem = efa_p2p_get(dev, mr, start, length); if (mr->p2pmem) { /* Avoid referencing an error-pointer later on */ mr->umem = NULL; goto reg_mr; } -#endif err = PTR_ERR(mr->umem); ibdev_dbg(&dev->ibdev, "Failed to pin and map user space memory[%d]\n", err); goto err_free; } -#ifdef HAVE_EFA_P2P reg_mr: -#endif err = efa_register_mr(ibpd, mr, start, length, virt_addr, access_flags); if (err) goto err_release; @@ -2342,14 +1770,10 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, return &mr->ibmr; err_release: -#ifdef HAVE_EFA_P2P if (mr->p2pmem) efa_p2p_put(mr->p2pmem->ticket, false); else ib_umem_release(mr->umem); -#else - ib_umem_release(mr->umem); -#endif err_free: kfree(mr); err_out: @@ -2357,11 +1781,40 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, return ERR_PTR(err); } -#ifdef HAVE_DEREG_MR_UDATA +static int UVERBS_HANDLER(EFA_IB_METHOD_MR_QUERY)(struct uverbs_attr_bundle *attrs) +{ + struct ib_mr *ibmr = uverbs_attr_get_obj(attrs, EFA_IB_ATTR_QUERY_MR_HANDLE); + struct efa_mr *mr = to_emr(ibmr); + u16 ic_id_validity = 0; + int ret; + + ret = uverbs_copy_to(attrs, EFA_IB_ATTR_QUERY_MR_RESP_RECV_IC_ID, + &mr->ic_info.recv_ic_id, sizeof(mr->ic_info.recv_ic_id)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, EFA_IB_ATTR_QUERY_MR_RESP_RDMA_READ_IC_ID, + &mr->ic_info.rdma_read_ic_id, sizeof(mr->ic_info.rdma_read_ic_id)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, EFA_IB_ATTR_QUERY_MR_RESP_RDMA_RECV_IC_ID, + &mr->ic_info.rdma_recv_ic_id, sizeof(mr->ic_info.rdma_recv_ic_id)); + if (ret) + return ret; + + if (mr->ic_info.recv_ic_id_valid) + ic_id_validity |= EFA_QUERY_MR_VALIDITY_RECV_IC_ID; + if (mr->ic_info.rdma_read_ic_id_valid) + ic_id_validity |= EFA_QUERY_MR_VALIDITY_RDMA_READ_IC_ID; + if (mr->ic_info.rdma_recv_ic_id_valid) + ic_id_validity |= EFA_QUERY_MR_VALIDITY_RDMA_RECV_IC_ID; + + return uverbs_copy_to(attrs, EFA_IB_ATTR_QUERY_MR_RESP_IC_ID_VALIDITY, + &ic_id_validity, sizeof(ic_id_validity)); +} + int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) -#else -int efa_dereg_mr(struct ib_mr *ibmr) -#endif { struct efa_dev *dev = to_edev(ibmr->device); struct efa_com_dereg_mr_params params; @@ -2370,7 +1823,6 @@ int efa_dereg_mr(struct ib_mr *ibmr) ibdev_dbg(&dev->ibdev, "Deregister mr[%d]\n", ibmr->lkey); -#ifdef HAVE_EFA_P2P if (mr->p2pmem) { err = efa_p2p_put(mr->p2p_ticket, false); if (err) @@ -2379,23 +1831,11 @@ int efa_dereg_mr(struct ib_mr *ibmr) kfree(mr); return 0; } -#endif params.l_key = mr->ibmr.lkey; err = efa_com_dereg_mr(&dev->edev, ¶ms); if (err) return err; -#if defined(HAVE_MR_DMABUF) && !defined(HAVE_IB_UMEM_DMABUF_PINNED) - if (mr->umem->is_dmabuf) { - struct ib_umem_dmabuf *umem_dmabuf; - - umem_dmabuf = 
to_ib_umem_dmabuf(mr->umem); - dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL); - dma_buf_unpin(umem_dmabuf->attach); - dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv); - } -#endif - ib_umem_release(mr->umem); kfree(mr); @@ -2487,10 +1927,6 @@ int efa_alloc_ucontext(struct ib_ucontext *ibucontext, struct ib_udata *udata) goto err_out; ucontext->uarn = result.uarn; -#ifndef HAVE_CORE_MMAP_XA - mutex_init(&ucontext->lock); - INIT_LIST_HEAD(&ucontext->pending_mmaps); -#endif /* !defined(HAVE_CORE_MMAP_XA) */ resp.cmds_supp_udata_mask |= EFA_USER_CMDS_SUPP_UDATA_QUERY_DEVICE; resp.cmds_supp_udata_mask |= EFA_USER_CMDS_SUPP_UDATA_CREATE_AH; @@ -2514,66 +1950,20 @@ int efa_alloc_ucontext(struct ib_ucontext *ibucontext, struct ib_udata *udata) return err; } -#ifndef HAVE_UCONTEXT_CORE_ALLOCATION -struct ib_ucontext *efa_kzalloc_ucontext(struct ib_device *ibdev, - struct ib_udata *udata) -{ - struct efa_dev *dev = to_edev(ibdev); - struct efa_ucontext *ucontext; - int err; - - /* - * it's fine if the driver does not know all request fields, - * we will ack input fields in our response. - */ - - ucontext = kzalloc(sizeof(*ucontext), GFP_KERNEL); - if (!ucontext) { - atomic64_inc(&dev->stats.alloc_ucontext_err); - return ERR_PTR(-ENOMEM); - } - - ucontext->ibucontext.device = ibdev; - err = efa_alloc_ucontext(&ucontext->ibucontext, udata); - if (err) - goto err_free_ucontext; - - return &ucontext->ibucontext; - -err_free_ucontext: - kfree(ucontext); - return ERR_PTR(err); -} -#endif - -#ifdef HAVE_UCONTEXT_CORE_ALLOCATION void efa_dealloc_ucontext(struct ib_ucontext *ibucontext) -#else -int efa_dealloc_ucontext(struct ib_ucontext *ibucontext) -#endif { struct efa_ucontext *ucontext = to_eucontext(ibucontext); struct efa_dev *dev = to_edev(ibucontext->device); -#ifndef HAVE_CORE_MMAP_XA - mmap_entries_remove_free(dev, ucontext); -#endif efa_dealloc_uar(dev, ucontext->uarn); -#ifndef HAVE_UCONTEXT_CORE_ALLOCATION - kfree(ucontext); - - return 0; -#endif } -#ifdef HAVE_CORE_MMAP_XA void efa_mmap_free(struct rdma_user_mmap_entry *rdma_entry) { struct efa_user_mmap_entry *entry = to_emmap(rdma_entry); kfree(entry); } -#endif static int __efa_mmap(struct efa_dev *dev, struct efa_ucontext *ucontext, struct vm_area_struct *vma) @@ -2602,38 +1992,16 @@ static int __efa_mmap(struct efa_dev *dev, struct efa_ucontext *ucontext, pfn = entry->address >> PAGE_SHIFT; switch (entry->mmap_flag) { case EFA_MMAP_IO_NC: -#ifdef HAVE_CORE_MMAP_XA err = rdma_user_mmap_io(&ucontext->ibucontext, vma, pfn, entry->rdma_entry.npages * PAGE_SIZE, pgprot_noncached(vma->vm_page_prot), rdma_entry); -#elif defined(HAVE_RDMA_USER_MMAP_IO) - err = rdma_user_mmap_io(&ucontext->ibucontext, vma, pfn, - entry->rdma_entry.npages * PAGE_SIZE, - pgprot_noncached(vma->vm_page_prot)); -#else - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - err = io_remap_pfn_range(vma, vma->vm_start, pfn, - entry->rdma_entry.npages * PAGE_SIZE, - vma->vm_page_prot); -#endif break; case EFA_MMAP_IO_WC: -#ifdef HAVE_CORE_MMAP_XA err = rdma_user_mmap_io(&ucontext->ibucontext, vma, pfn, entry->rdma_entry.npages * PAGE_SIZE, pgprot_writecombine(vma->vm_page_prot), rdma_entry); -#elif defined(HAVE_RDMA_USER_MMAP_IO) - err = rdma_user_mmap_io(&ucontext->ibucontext, vma, pfn, - entry->rdma_entry.npages * PAGE_SIZE, - pgprot_writecombine(vma->vm_page_prot)); -#else - vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); - err = io_remap_pfn_range(vma, vma->vm_start, pfn, - entry->rdma_entry.npages * PAGE_SIZE, - vma->vm_page_prot); -#endif 
break; case EFA_MMAP_DMA_PAGE: for (va = vma->vm_start; va < vma->vm_end; @@ -2685,17 +2053,10 @@ static int efa_ah_destroy(struct efa_dev *dev, struct efa_ah *ah) } int efa_create_ah(struct ib_ah *ibah, -#ifdef HAVE_CREATE_AH_INIT_ATTR struct rdma_ah_init_attr *init_attr, -#else - struct rdma_ah_attr *ah_attr, - u32 flags, -#endif struct ib_udata *udata) { -#ifdef HAVE_CREATE_AH_INIT_ATTR struct rdma_ah_attr *ah_attr = init_attr->ah_attr; -#endif struct efa_dev *dev = to_edev(ibah->device); struct efa_com_create_ah_params params = {}; struct efa_ibv_create_ah_resp resp = {}; @@ -2703,26 +2064,12 @@ int efa_create_ah(struct ib_ah *ibah, struct efa_ah *ah = to_eah(ibah); int err; -#if defined(HAVE_CREATE_DESTROY_AH_FLAGS) || defined(HAVE_CREATE_AH_INIT_ATTR) -#ifdef HAVE_CREATE_AH_INIT_ATTR if (!(init_attr->flags & RDMA_CREATE_AH_SLEEPABLE)) { -#else - if (!(flags & RDMA_CREATE_AH_SLEEPABLE)) { -#endif ibdev_dbg(&dev->ibdev, "Create address handle is not supported in atomic context\n"); err = -EOPNOTSUPP; goto err_out; } -#endif - -#ifndef HAVE_NO_KVERBS_DRIVERS - if (!udata) { - ibdev_dbg(&dev->ibdev, "udata is NULL\n"); - err = -EOPNOTSUPP; - goto err_out; - } -#endif if (udata->inlen && !ib_is_udata_cleared(udata, 0, udata->inlen)) { @@ -2763,87 +2110,17 @@ int efa_create_ah(struct ib_ah *ibah, return err; } -#ifndef HAVE_AH_CORE_ALLOCATION -#ifdef HAVE_CREATE_DESTROY_AH_FLAGS -struct ib_ah *efa_kzalloc_ah(struct ib_pd *ibpd, - struct rdma_ah_attr *ah_attr, - u32 flags, - struct ib_udata *udata) -#elif defined(HAVE_CREATE_AH_RDMA_ATTR) -struct ib_ah *efa_kzalloc_ah(struct ib_pd *ibpd, - struct rdma_ah_attr *ah_attr, - struct ib_udata *udata) -#endif -{ - struct efa_ah *ah; - int err; -#ifndef HAVE_CREATE_DESTROY_AH_FLAGS - u32 flags = 0; -#endif - - ah = kzalloc(sizeof(*ah), GFP_KERNEL); - if (!ah) - return ERR_PTR(-ENOMEM); - - ah->ibah.device = ibpd->device; - ah->ibah.pd = ibpd; - err = efa_create_ah(&ah->ibah, ah_attr, flags, udata); - if (err) - goto err_free; - - return &ah->ibah; - -err_free: - kfree(ah); - return ERR_PTR(err); -} -#endif - -#ifdef HAVE_AH_CORE_ALLOCATION_DESTROY_RC int efa_destroy_ah(struct ib_ah *ibah, u32 flags) -#elif defined(HAVE_AH_CORE_ALLOCATION) -void efa_destroy_ah(struct ib_ah *ibah, u32 flags) -#elif defined(HAVE_CREATE_DESTROY_AH_FLAGS) -int efa_destroy_ah(struct ib_ah *ibah, u32 flags) -#else -int efa_destroy_ah(struct ib_ah *ibah) -#endif { struct efa_dev *dev = to_edev(ibah->pd->device); struct efa_ah *ah = to_eah(ibah); -#if !defined(HAVE_AH_CORE_ALLOCATION) && !defined(HAVE_AH_CORE_ALLOCATION_DESTROY_RC) - int err; -#endif ibdev_dbg(&dev->ibdev, "Destroy ah[%d]\n", ah->ah); -#if defined(HAVE_CREATE_DESTROY_AH_FLAGS) - if (!(flags & RDMA_DESTROY_AH_SLEEPABLE)) { - ibdev_dbg(&dev->ibdev, - "Destroy address handle is not supported in atomic context\n"); -#if defined(HAVE_AH_CORE_ALLOCATION) && !defined(HAVE_AH_CORE_ALLOCATION_DESTROY_RC) - return; -#else - return -EOPNOTSUPP; -#endif - } -#endif - -#if defined(HAVE_AH_CORE_ALLOCATION) || defined(HAVE_AH_CORE_ALLOCATION_DESTROY_RC) efa_ah_destroy(dev, ah); -#ifdef HAVE_AH_CORE_ALLOCATION_DESTROY_RC return 0; -#endif -#else - err = efa_ah_destroy(dev, ah); - if (err) - return err; - kfree(ah); - return 0; -#endif } -#ifdef HAVE_SPLIT_STATS_ALLOC struct rdma_hw_stats *efa_alloc_hw_port_stats(struct ib_device *ibdev, port_t port_num) { @@ -2858,19 +2135,6 @@ struct rdma_hw_stats *efa_alloc_hw_device_stats(struct ib_device *ibdev) ARRAY_SIZE(efa_device_stats_descs), RDMA_HW_STATS_DEFAULT_LIFESPAN); } 
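/*
 * A minimal sketch, assuming the split-stats ib_device_ops hooks this
 * series relies on: the two allocators above would be registered through
 * the driver's ops table (efa_dev_ops, set via ib_set_device_ops() earlier
 * in this patch), taking over from the combined alloc_hw_stats callback
 * removed just below. Member names follow the upstream ib_device_ops
 * layout; the exact entries kept in efa_dev_ops are not shown in this hunk.
 *
 *	static const struct ib_device_ops efa_dev_ops = {
 *		...
 *		.alloc_hw_device_stats = efa_alloc_hw_device_stats,
 *		.alloc_hw_port_stats = efa_alloc_hw_port_stats,
 *		.get_hw_stats = efa_get_hw_stats,
 *		...
 *	};
 */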
-#else -struct rdma_hw_stats *efa_alloc_hw_stats(struct ib_device *ibdev, port_t port_num) -{ - if (port_num) - return rdma_alloc_hw_stats_struct(efa_port_stats_descs, - ARRAY_SIZE(efa_port_stats_descs), - RDMA_HW_STATS_DEFAULT_LIFESPAN); - else - return rdma_alloc_hw_stats_struct(efa_device_stats_descs, - ARRAY_SIZE(efa_device_stats_descs), - RDMA_HW_STATS_DEFAULT_LIFESPAN); -} -#endif static int efa_fill_device_stats(struct efa_dev *dev, struct rdma_hw_stats *stats) @@ -2901,6 +2165,7 @@ static int efa_fill_port_stats(struct efa_dev *dev, struct rdma_hw_stats *stats, { struct efa_com_get_stats_params params = {}; union efa_com_get_stats_result result; + struct efa_com_rdma_write_stats *rws; struct efa_com_rdma_read_stats *rrs; struct efa_com_messages_stats *ms; struct efa_com_basic_stats *bs; @@ -2942,6 +2207,19 @@ static int efa_fill_port_stats(struct efa_dev *dev, struct rdma_hw_stats *stats, stats->value[EFA_RDMA_READ_WR_ERR] = rrs->read_wr_err; stats->value[EFA_RDMA_READ_RESP_BYTES] = rrs->read_resp_bytes; + if (EFA_DEV_CAP(dev, RDMA_WRITE)) { + params.type = EFA_ADMIN_GET_STATS_TYPE_RDMA_WRITE; + err = efa_com_get_stats(&dev->edev, ¶ms, &result); + if (err) + return err; + + rws = &result.rdma_write_stats; + stats->value[EFA_RDMA_WRITE_WRS] = rws->write_wrs; + stats->value[EFA_RDMA_WRITE_BYTES] = rws->write_bytes; + stats->value[EFA_RDMA_WRITE_WR_ERR] = rws->write_wr_err; + stats->value[EFA_RDMA_WRITE_RECV_BYTES] = rws->write_recv_bytes; + } + return ARRAY_SIZE(efa_port_stats_descs); } @@ -2954,69 +2232,36 @@ int efa_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, return efa_fill_device_stats(to_edev(ibdev), stats); } -#ifndef HAVE_NO_KVERBS_DRIVERS -#ifdef HAVE_POST_CONST_WR -int efa_post_send(struct ib_qp *ibqp, - const struct ib_send_wr *wr, - const struct ib_send_wr **bad_wr) -#else -int efa_post_send(struct ib_qp *ibqp, - struct ib_send_wr *wr, - struct ib_send_wr **bad_wr) -#endif -{ - struct efa_dev *dev = to_edev(ibqp->device); - - ibdev_warn(&dev->ibdev.dev, "Function not supported\n"); - return -EOPNOTSUPP; -} - -#ifdef HAVE_POST_CONST_WR -int efa_post_recv(struct ib_qp *ibqp, - const struct ib_recv_wr *wr, - const struct ib_recv_wr **bad_wr) -#else -int efa_post_recv(struct ib_qp *ibqp, - struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) -#endif -{ - struct efa_dev *dev = to_edev(ibqp->device); - - ibdev_warn(&dev->ibdev.dev, "Function not supported\n"); - return -EOPNOTSUPP; -} - -int efa_poll_cq(struct ib_cq *ibcq, int num_entries, - struct ib_wc *wc) -{ - struct efa_dev *dev = to_edev(ibcq->device); - - ibdev_warn(&dev->ibdev.dev, "Function not supported\n"); - return -EOPNOTSUPP; -} - -int efa_req_notify_cq(struct ib_cq *ibcq, - enum ib_cq_notify_flags flags) -{ - struct efa_dev *dev = to_edev(ibcq->device); - - ibdev_warn(&dev->ibdev.dev, "Function not supported\n"); - return -EOPNOTSUPP; -} - -struct ib_mr *efa_get_dma_mr(struct ib_pd *ibpd, int acc) -{ - struct efa_dev *dev = to_edev(ibpd->device); - - ibdev_warn(&dev->ibdev.dev, "Function not supported\n"); - return ERR_PTR(-EOPNOTSUPP); -} -#endif - enum rdma_link_layer efa_port_link_layer(struct ib_device *ibdev, port_t port_num) { return IB_LINK_LAYER_UNSPECIFIED; } +DECLARE_UVERBS_NAMED_METHOD(EFA_IB_METHOD_MR_QUERY, + UVERBS_ATTR_IDR(EFA_IB_ATTR_QUERY_MR_HANDLE, + UVERBS_OBJECT_MR, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(EFA_IB_ATTR_QUERY_MR_RESP_IC_ID_VALIDITY, + UVERBS_ATTR_TYPE(u16), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(EFA_IB_ATTR_QUERY_MR_RESP_RECV_IC_ID, + 
UVERBS_ATTR_TYPE(u16), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(EFA_IB_ATTR_QUERY_MR_RESP_RDMA_READ_IC_ID, + UVERBS_ATTR_TYPE(u16), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(EFA_IB_ATTR_QUERY_MR_RESP_RDMA_RECV_IC_ID, + UVERBS_ATTR_TYPE(u16), + UA_MANDATORY)); + +ADD_UVERBS_METHODS(efa_mr, + UVERBS_OBJECT_MR, + &UVERBS_METHOD(EFA_IB_METHOD_MR_QUERY)); + +const struct uapi_definition efa_uapi_defs[] = { + UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_MR, + &efa_mr), + {}, +}; diff --git a/drivers/amazon/net/efa/kcompat.h b/drivers/amazon/net/efa/kcompat.h index 81e8f819e4f88..c7b520c633fa7 100644 --- a/drivers/amazon/net/efa/kcompat.h +++ b/drivers/amazon/net/efa/kcompat.h @@ -8,240 +8,11 @@ #include -#include "config.h" - -#ifndef HAVE_IB_IS_UDATA_CLEARED -#include -#include -#include - -static inline bool ib_is_udata_cleared(struct ib_udata *udata, - size_t offset, - size_t len) -{ - const void __user *p = udata->inbuf + offset; - bool ret = false; - u8 *buf; - - if (len > USHRT_MAX) - return false; - - buf = kmalloc(len, GFP_KERNEL); - if (!buf) - return false; - - if (copy_from_user(buf, p, len)) - goto free; - - ret = !memchr_inv(buf, 0, len); - -free: - kfree(buf); - return ret; -} -#endif - -#ifndef HAVE_IB_QPT_DRIVER -#define IB_QPT_DRIVER 0xFF -#endif - -#if defined(HAVE_DRIVER_ID) && !defined(HAVE_UPSTREAM_EFA) -#define RDMA_DRIVER_EFA 17 -#endif - -#ifndef HAVE_IBDEV_PRINT -#define ibdev_err(_ibdev, format, arg...) \ - dev_err(&((struct ib_device *)(_ibdev))->dev, format, ##arg) -#define ibdev_dbg(_ibdev, format, arg...) \ - dev_dbg(&((struct ib_device *)(_ibdev))->dev, format, ##arg) -#define ibdev_warn(_ibdev, format, arg...) \ - dev_warn(&((struct ib_device *)(_ibdev))->dev, format, ##arg) -#define ibdev_info(_ibdev, format, arg...) \ - dev_info(&((struct ib_device *)(_ibdev))->dev, format, ##arg) -#endif - -#ifndef HAVE_IBDEV_PRINT_RATELIMITED -#define ibdev_err_ratelimited(_ibdev, format, arg...) \ - dev_err_ratelimited(&((struct ib_device *)(_ibdev))->dev, format, ##arg) -#define ibdev_dbg_ratelimited(_ibdev, format, arg...) \ - dev_dbg_ratelimited(&((struct ib_device *)(_ibdev))->dev, format, ##arg) -#define ibdev_warn_ratelimited(_ibdev, format, arg...) \ - dev_warn_ratelimited(&((struct ib_device *)(_ibdev))->dev, format, ##arg) -#define ibdev_info_ratelimited(_ibdev, format, arg...) \ - dev_info_ratelimited(&((struct ib_device *)(_ibdev))->dev, format, ##arg) -#endif - -#ifndef HAVE_KVZALLOC -#include -#include - -static inline void *kvzalloc(size_t size, gfp_t flags) -{ - void *addr; - - addr = kzalloc(size, flags | __GFP_NOWARN); - if (addr) - return addr; - - return vzalloc(size); -} -#endif - -#ifndef HAVE_IB_PORT_PHYS_STATE_LINK_UP -#define IB_PORT_PHYS_STATE_LINK_UP 5 -#endif - -#ifndef HAVE_CORE_MMAP_XA -#include -#include - -struct rdma_user_mmap_entry { - struct ib_ucontext *ucontext; - unsigned long start_pgoff; - size_t npages; -}; - -/* Return the offset (in bytes) the user should pass to libc's mmap() */ -static inline u64 -rdma_user_mmap_get_offset(const struct rdma_user_mmap_entry *entry) -{ - return (u64)entry->start_pgoff << PAGE_SHIFT; -} - -/* - * Backported kernels don't keep refcnt on entries, hence they should not - * be removed. 
- */ -static inline void -rdma_user_mmap_entry_remove(struct rdma_user_mmap_entry *entry) -{ -} - -static inline void rdma_user_mmap_entry_put(struct rdma_user_mmap_entry *entry) -{ -} -#endif #ifndef sizeof_field #define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER)) #endif -#ifndef HAVE_BITFIELD_H -#define __bf_shf(x) (__builtin_ffsll(x) - 1) - -#define FIELD_PREP(_mask, _val) \ - ({ \ - ((typeof(_mask))(_val) << __bf_shf(_mask)) & (_mask); \ - }) - -#define FIELD_GET(_mask, _reg) \ - ({ \ - (typeof(_mask))(((_reg) & (_mask)) >> __bf_shf(_mask)); \ - }) -#endif - -#ifndef HAVE_RDMA_NODE_UNSPECIFIED -enum { - RDMA_NODE_UNSPECIFIED = 7, -}; -#endif - -#ifndef HAVE_ATOMIC64_FETCH_INC -static __always_inline s64 -atomic64_fetch_inc(atomic64_t *v) -{ - return atomic64_inc_return(v) - 1; -} -#endif - -#if !defined(HAVE_RDMA_UMEM_FOR_EACH_DMA_BLOCK) && defined(HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE) -#include - -static inline void __rdma_umem_block_iter_start(struct ib_block_iter *biter, - struct ib_umem *umem, - unsigned long pgsz) -{ - __rdma_block_iter_start(biter, umem->sg_head.sgl, umem->nmap, pgsz); -} - -/** - * rdma_umem_for_each_dma_block - iterate over contiguous DMA blocks of the umem - * @umem: umem to iterate over - * @pgsz: Page size to split the list into - * - * pgsz must be <= PAGE_SIZE or computed by ib_umem_find_best_pgsz(). The - * returned DMA blocks will be aligned to pgsz and span the range: - * ALIGN_DOWN(umem->address, pgsz) to ALIGN(umem->address + umem->length, pgsz) - * - * Performs exactly ib_umem_num_dma_blocks() iterations. - */ -#define rdma_umem_for_each_dma_block(umem, biter, pgsz) \ - for (__rdma_umem_block_iter_start(biter, umem, pgsz); \ - __rdma_block_iter_next(biter);) -#endif - -#ifdef HAVE_U32_PORT typedef u32 port_t; -#else -typedef u8 port_t; -#endif - -#if defined(HAVE_MR_DMABUF) && !defined(HAVE_IB_UMEM_DMABUF_PINNED) -#include -#include -#include - -#ifdef HAVE_MODULE_IMPORT_NS -MODULE_IMPORT_NS(DMA_BUF); -#endif - -static inline void -ib_umem_dmabuf_unsupported_move_notify(struct dma_buf_attachment *attach) -{ - struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv; - - ibdev_warn_ratelimited(umem_dmabuf->umem.ibdev, - "Invalidate callback should not be called when memory is pinned\n"); -} - -static struct dma_buf_attach_ops ib_umem_dmabuf_attach_pinned_ops = { - .allow_peer2peer = true, - .move_notify = ib_umem_dmabuf_unsupported_move_notify, -}; - -static inline -struct ib_umem_dmabuf *ib_umem_dmabuf_get_pinned(struct ib_device *device, - unsigned long offset, - size_t size, int fd, - int access) -{ - struct ib_umem_dmabuf *umem_dmabuf; - int err; - - umem_dmabuf = ib_umem_dmabuf_get(device, offset, size, fd, access, - &ib_umem_dmabuf_attach_pinned_ops); - if (IS_ERR(umem_dmabuf)) - return umem_dmabuf; - - dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL); - err = dma_buf_pin(umem_dmabuf->attach); - if (err) - goto err_release; - - err = ib_umem_dmabuf_map_pages(umem_dmabuf); - if (err) - goto err_unpin; - dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv); - - return umem_dmabuf; - -err_unpin: - dma_buf_unpin(umem_dmabuf->attach); -err_release: - dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv); - ib_umem_release(&umem_dmabuf->umem); - return ERR_PTR(err); -} -#endif /* !HAVE_IB_UMEM_DMABUF_PINNED */ #endif /* _KCOMPAT_H_ */ diff --git a/drivers/amazon/net/efa/nv-p2p.h b/drivers/amazon/net/efa/nv-p2p.h index d74e024963d5a..93350530a3eb2 100644 --- a/drivers/amazon/net/efa/nv-p2p.h +++ b/drivers/amazon/net/efa/nv-p2p.h @@ -1,5 
+1,6 @@ /* - * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2011-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -13,7 +14,7 @@ * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER @@ -75,6 +76,8 @@ enum { NVIDIA_P2P_PARAMS_ADDRESS_INDEX_THIRD_PARTY_DEVICE }; +#define NVIDIA_P2P_GPU_UUID_LEN 16 + typedef struct nvidia_p2p_params { u32 version; @@ -91,28 +94,13 @@ struct nvidia_p2p_params { } nvidia_p2p_params_t; /* - * @brief - * Initializes a third-party P2P mapping between an NVIDIA - * GPU and a third-party device. - * - * @param[in] p2p_token - * A token that uniquely identifies the P2P mapping. - * @param[in,out] params - * A pointer to a structure with P2P mapping parameters. - * @param[in] destroy_callback - * A pointer to the function to be invoked when the P2P mapping - * is destroyed implictly. - * @param[in] data - * An opaque pointer to private data to be passed to the - * callback function. - * - * @return - * 0 upon successful completion. - * -EINVAL if an invalid argument was supplied. - * -ENOTSUPP if the requested configuration is not supported. - * -ENOMEM if the driver failed to allocate memory. - * -EBUSY if the mapping has already been initialized. - * -EIO if an unknown error occurred. + * Macro for users to detect + * driver support for persistent pages. + */ +#define NVIDIA_P2P_CAP_GET_PAGES_PERSISTENT_API + +/* + * This API is not supported. */ int nvidia_p2p_init_mapping(u64 p2p_token, struct nvidia_p2p_params *params, @@ -120,17 +108,7 @@ int nvidia_p2p_init_mapping(u64 p2p_token, void *data); /* - * @brief - * Tear down a previously initialized third-party P2P mapping. - * - * @param[in] p2p_token - * A token that uniquely identifies the mapping. - * - * @return - * 0 upon successful completion. - * -EINVAL if an invalid argument was supplied. - * -ENOTSUPP if the requested configuration is not supported. - * -ENOMEM if the driver failed to allocate memory. + * This API is not supported. */ int nvidia_p2p_destroy_mapping(u64 p2p_token); @@ -191,9 +169,9 @@ struct nvidia_p2p_page_table { * @param[out] page_table * A pointer to an array of structures with P2P PTEs. * @param[in] free_callback - * A non-NULL pointer to the function to be invoked when the pages + * A pointer to the function to be invoked when the pages * underlying the virtual address range are freed - * implicitly. Must be non NULL. + * implicitly. * @param[in] data * A non-NULL opaque pointer to private data to be passed to the * callback function. @@ -206,12 +184,48 @@ struct nvidia_p2p_page_table { * insufficient resources were available to complete the operation. * -EIO if an unknown error occurred. 
*/ -int nvidia_p2p_get_pages(u64 p2p_token, u32 va_space, - u64 virtual_address, +int nvidia_p2p_get_pages( u64 p2p_token, u32 va_space, + u64 virtual_address, u64 length, + struct nvidia_p2p_page_table **page_table, + void (*free_callback)(void *data), void *data); + +/* + * @brief + * Pin and make the pages underlying a range of GPU virtual memory + * accessible to a third-party device. The pages will persist until + * explicitly freed by nvidia_p2p_put_pages_persistent(). + * + * Persistent GPU memory mappings are not supported on PowerPC, + * MIG-enabled devices and vGPU. + * + * This API only supports pinned, GPU-resident memory, such as that provided + * by cudaMalloc(). + * + * This API may sleep. + * + * @param[in] virtual_address + * The start address in the specified virtual address space. + * Address must be aligned to the 64KB boundary. + * @param[in] length + * The length of the requested P2P mapping. + * Length must be a multiple of 64KB. + * @param[out] page_table + * A pointer to an array of structures with P2P PTEs. + * @param[in] flags + * Must be set to zero for now. + * + * @return + * 0 upon successful completion. + * -EINVAL if an invalid argument was supplied. + * -ENOTSUPP if the requested operation is not supported. + * -ENOMEM if the driver failed to allocate memory or if + * insufficient resources were available to complete the operation. + * -EIO if an unknown error occurred. + */ +int nvidia_p2p_get_pages_persistent(u64 virtual_address, u64 length, struct nvidia_p2p_page_table **page_table, - void (*free_callback)(void *data), - void *data); + u32 flags); #define NVIDIA_P2P_DMA_MAPPING_VERSION 0x00020003 @@ -284,6 +298,8 @@ int nvidia_p2p_dma_unmap_pages(struct pci_dev *peer, * Release a set of pages previously made accessible to * a third-party device. * + * This API may sleep. + * * @param[in] p2p_token * A token that uniquely identifies the P2P mapping. * @param[in] va_space @@ -298,10 +314,33 @@ int nvidia_p2p_dma_unmap_pages(struct pci_dev *peer, * -EINVAL if an invalid argument was supplied. * -EIO if an unknown error occurred. */ -int nvidia_p2p_put_pages(u64 p2p_token, u32 va_space, - u64 virtual_address, +int nvidia_p2p_put_pages(u64 p2p_token, + u32 va_space, u64 virtual_address, struct nvidia_p2p_page_table *page_table); +/* + * @brief + * Release a set of persistent pages previously made accessible to + * a third-party device. + * + * This API may sleep. + * + * @param[in] virtual_address + * The start address in the specified virtual address space. + * @param[in] page_table + * A pointer to the array of structures with P2P PTEs. + * @param[in] flags + * Must be set to zero for now. + * + * @return + * 0 upon successful completion. + * -EINVAL if an invalid argument was supplied. + * -EIO if an unknown error occurred. + */ +int nvidia_p2p_put_pages_persistent(u64 virtual_address, + struct nvidia_p2p_page_table *page_table, + u32 flags); + /* * @brief * Free a third-party P2P page table. (This function is a no-op.) diff --git a/include/uapi/rdma/efa-abi.h b/include/uapi/rdma/efa-abi.h index 163ac79556d68..701e2d567e411 100644 --- a/include/uapi/rdma/efa-abi.h +++ b/include/uapi/rdma/efa-abi.h @@ -1,12 +1,13 @@ /* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */ /* - * Copyright 2018-2022 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #ifndef EFA_ABI_USER_H #define EFA_ABI_USER_H #include +#include /* * Increment this value if any changes that break userspace ABI @@ -120,6 +121,8 @@ enum { EFA_QUERY_DEVICE_CAPS_RNR_RETRY = 1 << 1, EFA_QUERY_DEVICE_CAPS_CQ_NOTIFICATIONS = 1 << 2, EFA_QUERY_DEVICE_CAPS_CQ_WITH_SGID = 1 << 3, + EFA_QUERY_DEVICE_CAPS_DATA_POLLING_128 = 1 << 4, + EFA_QUERY_DEVICE_CAPS_RDMA_WRITE = 1 << 5, }; struct efa_ibv_ex_query_device_resp { @@ -132,4 +135,22 @@ struct efa_ibv_ex_query_device_resp { __u32 device_caps; }; +enum { + EFA_QUERY_MR_VALIDITY_RECV_IC_ID = 1 << 0, + EFA_QUERY_MR_VALIDITY_RDMA_READ_IC_ID = 1 << 1, + EFA_QUERY_MR_VALIDITY_RDMA_RECV_IC_ID = 1 << 2, +}; + +enum efa_query_mr_attrs { + EFA_IB_ATTR_QUERY_MR_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + EFA_IB_ATTR_QUERY_MR_RESP_IC_ID_VALIDITY, + EFA_IB_ATTR_QUERY_MR_RESP_RECV_IC_ID, + EFA_IB_ATTR_QUERY_MR_RESP_RDMA_READ_IC_ID, + EFA_IB_ATTR_QUERY_MR_RESP_RDMA_RECV_IC_ID, +}; + +enum efa_mr_methods { + EFA_IB_METHOD_MR_QUERY = (1U << UVERBS_ID_NS_SHIFT), +}; + #endif /* EFA_ABI_USER_H */ From a089375202cde99efae5c03f6202695f91d2dd63 Mon Sep 17 00:00:00 2001 From: Jacob Wolf Date: Sat, 9 Mar 2024 18:22:58 +0000 Subject: [PATCH 162/175] Config glue for 2.15 Lustre client --- drivers/staging/Kconfig | 2 + drivers/staging/Makefile | 1 + drivers/staging/lustrefsx/Kconfig | 3 ++ drivers/staging/lustrefsx/Makefile | 3 ++ drivers/staging/lustrefsx/Makefile.rules | 8 ++++ drivers/staging/lustrefsx/libcfs/Kconfig | 3 ++ drivers/staging/lustrefsx/libcfs/Makefile | 1 + .../staging/lustrefsx/libcfs/libcfs/Makefile | 27 ++++++++++++ drivers/staging/lustrefsx/lnet/Kconfig | 37 +++++++++++++++++ drivers/staging/lustrefsx/lnet/Makefile | 3 ++ drivers/staging/lustrefsx/lnet/klnds/Makefile | 2 + .../lustrefsx/lnet/klnds/o2iblnd/Makefile | 5 +++ .../lustrefsx/lnet/klnds/socklnd/Makefile | 6 +++ drivers/staging/lustrefsx/lnet/lnet/Makefile | 8 ++++ .../staging/lustrefsx/lnet/selftest/Makefile | 6 +++ drivers/staging/lustrefsx/lustre/Kconfig | 41 +++++++++++++++++++ drivers/staging/lustrefsx/lustre/Makefile | 8 ++++ drivers/staging/lustrefsx/lustre/fid/Makefile | 5 +++ drivers/staging/lustrefsx/lustre/fld/Makefile | 8 ++++ .../staging/lustrefsx/lustre/llite/Makefile | 15 +++++++ drivers/staging/lustrefsx/lustre/lmv/Makefile | 5 +++ drivers/staging/lustrefsx/lustre/lov/Makefile | 8 ++++ drivers/staging/lustrefsx/lustre/mdc/Makefile | 7 ++++ drivers/staging/lustrefsx/lustre/mgc/Makefile | 5 +++ .../lustrefsx/lustre/obdclass/Makefile | 15 +++++++ .../staging/lustrefsx/lustre/obdecho/Makefile | 5 +++ drivers/staging/lustrefsx/lustre/osc/Makefile | 6 +++ .../staging/lustrefsx/lustre/ptlrpc/Makefile | 25 +++++++++++ 28 files changed, 268 insertions(+) create mode 100644 drivers/staging/lustrefsx/Kconfig create mode 100644 drivers/staging/lustrefsx/Makefile create mode 100644 drivers/staging/lustrefsx/Makefile.rules create mode 100644 drivers/staging/lustrefsx/libcfs/Kconfig create mode 100644 drivers/staging/lustrefsx/libcfs/Makefile create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/Makefile create mode 100644 drivers/staging/lustrefsx/lnet/Kconfig create mode 100644 drivers/staging/lustrefsx/lnet/Makefile create mode 100644 drivers/staging/lustrefsx/lnet/klnds/Makefile create mode 100644 drivers/staging/lustrefsx/lnet/klnds/o2iblnd/Makefile create mode 100644 drivers/staging/lustrefsx/lnet/klnds/socklnd/Makefile create mode 100644 drivers/staging/lustrefsx/lnet/lnet/Makefile create mode 100644 drivers/staging/lustrefsx/lnet/selftest/Makefile create mode 
100644 drivers/staging/lustrefsx/lustre/Kconfig create mode 100644 drivers/staging/lustrefsx/lustre/Makefile create mode 100644 drivers/staging/lustrefsx/lustre/fid/Makefile create mode 100644 drivers/staging/lustrefsx/lustre/fld/Makefile create mode 100644 drivers/staging/lustrefsx/lustre/llite/Makefile create mode 100644 drivers/staging/lustrefsx/lustre/lmv/Makefile create mode 100644 drivers/staging/lustrefsx/lustre/lov/Makefile create mode 100644 drivers/staging/lustrefsx/lustre/mdc/Makefile create mode 100644 drivers/staging/lustrefsx/lustre/mgc/Makefile create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/Makefile create mode 100644 drivers/staging/lustrefsx/lustre/obdecho/Makefile create mode 100644 drivers/staging/lustrefsx/lustre/osc/Makefile create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/Makefile diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig index 5cfabd5376cc2..c1885029397c5 100644 --- a/drivers/staging/Kconfig +++ b/drivers/staging/Kconfig @@ -60,6 +60,8 @@ source "drivers/staging/board/Kconfig" source "drivers/staging/gdm724x/Kconfig" +source "drivers/staging/lustrefsx/Kconfig" + source "drivers/staging/fbtft/Kconfig" source "drivers/staging/most/Kconfig" diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile index f8c3aa9c24182..e28ee59ab6681 100644 --- a/drivers/staging/Makefile +++ b/drivers/staging/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_USB_EMXX) += emxx_udc/ obj-$(CONFIG_MFD_NVEC) += nvec/ obj-$(CONFIG_STAGING_BOARD) += board/ obj-$(CONFIG_LTE_GDM724X) += gdm724x/ +obj-$(CONFIG_LUSTREFSX_LNET) += lustrefsx/ obj-$(CONFIG_FB_TFT) += fbtft/ obj-$(CONFIG_MOST) += most/ obj-$(CONFIG_KS7010) += ks7010/ diff --git a/drivers/staging/lustrefsx/Kconfig b/drivers/staging/lustrefsx/Kconfig new file mode 100644 index 0000000000000..81e9bc1043d76 --- /dev/null +++ b/drivers/staging/lustrefsx/Kconfig @@ -0,0 +1,3 @@ +source "drivers/staging/lustrefsx/libcfs/Kconfig" +source "drivers/staging/lustrefsx/lnet/Kconfig" +source "drivers/staging/lustrefsx/lustre/Kconfig" diff --git a/drivers/staging/lustrefsx/Makefile b/drivers/staging/lustrefsx/Makefile new file mode 100644 index 0000000000000..20c7929213c3f --- /dev/null +++ b/drivers/staging/lustrefsx/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_LUSTREFSX_LNET) += lnet/ +obj-$(CONFIG_LUSTREFSX_FS) += lustre/ +obj-$(CONFIG_LUSTREFSX_LIBCFS) += libcfs/ diff --git a/drivers/staging/lustrefsx/Makefile.rules b/drivers/staging/lustrefsx/Makefile.rules new file mode 100644 index 0000000000000..2336cfb1c80ae --- /dev/null +++ b/drivers/staging/lustrefsx/Makefile.rules @@ -0,0 +1,8 @@ +ccflags-y += -include $(srctree)/drivers/staging/lustrefsx/undef.h +ccflags-y += -include $(srctree)/drivers/staging/lustrefsx/config.h +ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/libcfs/include +ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/lnet/include +ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/lnet/include/uapi +ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/lustre/include +ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/lustre/include/uapi +ccflags-y += -Wno-format-truncation -Werror diff --git a/drivers/staging/lustrefsx/libcfs/Kconfig b/drivers/staging/lustrefsx/libcfs/Kconfig new file mode 100644 index 0000000000000..3675b8381af2e --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/Kconfig @@ -0,0 +1,3 @@ +config LUSTREFSX_LIBCFS + depends on m + tristate "Lustre helper library" diff --git a/drivers/staging/lustrefsx/libcfs/Makefile b/drivers/staging/lustrefsx/libcfs/Makefile new file 
mode 100644
index 0000000000000..6c5ff83ac791a
--- /dev/null
+++ b/drivers/staging/lustrefsx/libcfs/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_LUSTREFSX_LIBCFS) += libcfs/
diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/Makefile b/drivers/staging/lustrefsx/libcfs/libcfs/Makefile
new file mode 100644
index 0000000000000..0cbfedc0f087e
--- /dev/null
+++ b/drivers/staging/lustrefsx/libcfs/libcfs/Makefile
@@ -0,0 +1,27 @@
+obj-$(CONFIG_LUSTREFSX_LIBCFS) += libcfs.o
+
+libcfs-linux-objs := linux-prim.o
+libcfs-linux-objs += linux-hash.o
+libcfs-linux-objs += linux-wait.o
+libcfs-linux-objs += glob.o
+libcfs-linux-objs += xarray.o
+
+libcfs-crypto-objs := crypto.o fname.o hkdf.o hooks.o keyring.o
+libcfs-crypto-objs += keysetup.o keysetup_v1.o policy.o
+
+libcfs-linux-objs := $(addprefix linux/,$(libcfs-linux-objs))
+libcfs-crypto-objs := $(addprefix crypto/,$(libcfs-crypto-objs))
+
+
+libcfs-all-objs := libcfs_cpu.o
+libcfs-all-objs += debug.o fail.o module.o tracefile.o
+libcfs-all-objs += libcfs_string.o hash.o workitem.o
+libcfs-all-objs += libcfs_mem.o libcfs_lock.o
+libcfs-all-objs += linux-crypto.o linux-crypto-adler.o
+
+libcfs-y += $(libcfs-linux-objs) $(libcfs-all-objs)
+libcfs-y += $(libcfs-crypto-objs)
+
+ccflags-y += -I$(src)
+
+include $(srctree)/drivers/staging/lustrefsx/Makefile.rules
diff --git a/drivers/staging/lustrefsx/lnet/Kconfig b/drivers/staging/lustrefsx/lnet/Kconfig
new file mode 100644
index 0000000000000..0d0686a25fe1e
--- /dev/null
+++ b/drivers/staging/lustrefsx/lnet/Kconfig
@@ -0,0 +1,37 @@
+config LUSTREFSX_LNET
+	tristate "Lustre networking subsystem (LNet)"
+	select LUSTREFSX_LIBCFS
+	depends on m
+	depends on INET
+	help
+	  The Lustre network layer, also known as LNet, is a networking abstraction
+	  layer API that was initially created to allow the Lustre file system to
+	  utilize very different networks, such as TCP and InfiniBand verbs, in a
+	  uniform way. In the case of Lustre routers, only the LNet layer is
+	  required. Other projects are also looking into using LNet as their
+	  networking API.
+
+config LUSTREFSX_LNET_SELFTEST
+	tristate "Lustre networking self-testing"
+	depends on m
+	depends on LUSTREFSX_LNET
+	help
+	  This option enables LNet self-testing. To compile this
+	  as a module, choose M here: the module will be called lnet_selftest.
+
+	  If unsure, say N.
+
+	  See also http://wiki.lustre.org/
+
+config LUSTREFSX_LNET_XPRT_IB
+	tristate "LNet InfiniBand support"
+	depends on m
+	depends on LUSTREFSX_LNET && INFINIBAND && INFINIBAND_ADDR_TRANS
+	default LUSTREFSX_LNET && INFINIBAND
+	help
+	  This option allows LNet users to use InfiniBand as an
+	  RDMA-enabled transport.
+
+	  To compile this as a kernel module, choose M here and it will be
+	  called ko2iblnd.
+
+	  If unsure, say N.
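For reference, the whole client stack introduced by these Kconfig entries could be enabled with a .config fragment along the following lines. This is illustrative only: every option here carries "depends on m", so the code can only ever be built as modules; LUSTREFSX_LIBCFS is normally pulled in automatically through select; LUSTREFSX_LNET_XPRT_IB further assumes INFINIBAND and INFINIBAND_ADDR_TRANS are already enabled; and CONFIG_LUSTREFSX_FS is declared in lustre/Kconfig later in this patch.

    # Illustrative fragment; assumes INET, INFINIBAND and
    # INFINIBAND_ADDR_TRANS are already set in the base config.
    CONFIG_LUSTREFSX_LIBCFS=m
    CONFIG_LUSTREFSX_LNET=m
    CONFIG_LUSTREFSX_LNET_SELFTEST=m
    CONFIG_LUSTREFSX_LNET_XPRT_IB=m
    CONFIG_LUSTREFSX_FS=m

Merged into a kernel configuration (for example via scripts/kconfig/merge_config.sh), such a fragment should yield the libcfs, lnet, lnet_selftest, ko2iblnd, ksocklnd and lustre modules named in the Makefiles that follow.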
diff --git a/drivers/staging/lustrefsx/lnet/Makefile b/drivers/staging/lustrefsx/lnet/Makefile new file mode 100644 index 0000000000000..7ee52eb559025 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_LUSTREFSX_LNET) += lnet/ +obj-$(CONFIG_LUSTREFSX_LNET) += klnds/ +obj-$(CONFIG_LUSTREFSX_LNET_SELFTEST) += selftest/ diff --git a/drivers/staging/lustrefsx/lnet/klnds/Makefile b/drivers/staging/lustrefsx/lnet/klnds/Makefile new file mode 100644 index 0000000000000..cd375ca2cc67f --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/Makefile @@ -0,0 +1,2 @@ +obj-y += o2iblnd/ +obj-y += socklnd/ diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/Makefile b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/Makefile new file mode 100644 index 0000000000000..5ce6dc99ffe1a --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_LUSTREFSX_LNET_XPRT_IB) += ko2iblnd.o + +ko2iblnd-y := o2iblnd.o o2iblnd_cb.o o2iblnd_modparams.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/Makefile b/drivers/staging/lustrefsx/lnet/klnds/socklnd/Makefile new file mode 100644 index 0000000000000..6e6ec925b891f --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/Makefile @@ -0,0 +1,6 @@ +obj-$(CONFIG_LUSTREFSX_LNET) += ksocklnd.o + +ksocklnd-y := socklnd.o socklnd_cb.o socklnd_lib.o +ksocklnd-y += socklnd_modparams.o socklnd_proto.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lnet/lnet/Makefile b/drivers/staging/lustrefsx/lnet/lnet/Makefile new file mode 100644 index 0000000000000..95813fbdafda6 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/Makefile @@ -0,0 +1,8 @@ +obj-$(CONFIG_LUSTREFSX_LNET) += lnet.o + +lnet-y := api-ni.o config.o nidstrings.o lnet_rdma.o +lnet-y += lib-me.o lib-msg.o lib-md.o lib-ptl.o +lnet-y += lib-socket.o lib-move.o module.o lo.o +lnet-y += router.o router_proc.o acceptor.o peer.o net_fault.o udsp.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lnet/selftest/Makefile b/drivers/staging/lustrefsx/lnet/selftest/Makefile new file mode 100644 index 0000000000000..5380812715f7f --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/Makefile @@ -0,0 +1,6 @@ +obj-$(CONFIG_LUSTREFSX_LNET_SELFTEST) += lnet_selftest.o + +lnet_selftest-y := console.o conrpc.o conctl.o framework.o timer.o +lnet_selftest-y += rpc.o module.o ping_test.o brw_test.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/Kconfig b/drivers/staging/lustrefsx/lustre/Kconfig new file mode 100644 index 0000000000000..c565c870d805b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/Kconfig @@ -0,0 +1,41 @@ +config LUSTREFSX_FS + tristate "Lustre file system client support" + depends on m + select LUSTREFSX_LIBCFS + depends on LUSTREFSX_LNET + select CRYPTO + select CRYPTO_CRC32 + select CRYPTO_CRC32_PCLMUL if X86 + select CRYPTO_CRC32C + select CRYPTO_MD5 + select CRYPTO_SHA1 + select CRYPTO_SHA256 + select CRYPTO_SHA512 + depends on MULTIUSER + help + This option enables Lustre file system client support. Choose Y + here if you want to access a Lustre file system cluster. To compile + this file system support as a module, choose M here: the module will + be called lustre. 
+
+	  To mount Lustre file systems, you also need to install the user-space
+	  mount.lustre utility and other user-space commands, which can be found
+	  in the lustre-client package.
+
+	  The Lustre file system is one of the most widely used cluster file
+	  systems in high-performance computing. Source code for the kernel-space
+	  and user-space Lustre components can be found at
+	  http://git.whamcloud.com/?p=fs/lustre-release.git;a=summary
+
+	  If unsure, say N.
+
+	  See also http://wiki.lustre.org/
+
+config LUSTRE_DEBUG_EXPENSIVE_CHECK
+	bool "Enable Lustre DEBUG checks"
+	depends on LUSTREFSX_FS
+	help
+	  This option is mainly for debugging purposes. It enables Lustre code
+	  to do expensive checks that may have a performance impact.
+
+	  Use with caution. If unsure, say N.
diff --git a/drivers/staging/lustrefsx/lustre/Makefile b/drivers/staging/lustrefsx/lustre/Makefile
new file mode 100644
index 0000000000000..207cab53c0633
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/Makefile
@@ -0,0 +1,8 @@
+obj-$(CONFIG_LUSTREFSX_FS) += fid/
+obj-$(CONFIG_LUSTREFSX_FS) += obdclass/
+obj-$(CONFIG_LUSTREFSX_FS) += ptlrpc/
+obj-$(CONFIG_LUSTREFSX_FS) += obdecho/
+obj-$(CONFIG_LUSTREFSX_FS) += mgc/
+obj-$(CONFIG_LUSTREFSX_FS) += lov/ osc/ mdc/ lmv/ llite/ fld/
+
+include $(srctree)/drivers/staging/lustrefsx/Makefile.rules
diff --git a/drivers/staging/lustrefsx/lustre/fid/Makefile b/drivers/staging/lustrefsx/lustre/fid/Makefile
new file mode 100644
index 0000000000000..22be6773ba08f
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/fid/Makefile
@@ -0,0 +1,5 @@
+obj-$(CONFIG_LUSTREFSX_LNET) += fid.o
+
+fid-y := fid_request.o lproc_fid.o
+
+include $(srctree)/drivers/staging/lustrefsx/Makefile.rules
diff --git a/drivers/staging/lustrefsx/lustre/fld/Makefile b/drivers/staging/lustrefsx/lustre/fld/Makefile
new file mode 100644
index 0000000000000..722c19fe30409
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/fld/Makefile
@@ -0,0 +1,8 @@
+obj-$(CONFIG_LUSTREFSX_LNET) += fld.o
+
+ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/include
+
+fld-y := fld_request.o fld_cache.o lproc_fld.o
+
+include $(srctree)/drivers/staging/lustrefsx/Makefile.rules
+
diff --git a/drivers/staging/lustrefsx/lustre/llite/Makefile b/drivers/staging/lustrefsx/lustre/llite/Makefile
new file mode 100644
index 0000000000000..4650e91efc0df
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/llite/Makefile
@@ -0,0 +1,15 @@
+obj-$(CONFIG_LUSTREFSX_FS) += lustre.o
+
+lustre-y := dcache.o dir.o file.o llite_lib.o llite_nfs.o
+lustre-y += rw.o lproc_llite.o namei.o symlink.o llite_mmap.o
+lustre-y += xattr.o xattr_cache.o
+lustre-y += rw26.o super25.o statahead.o xattr_security.o
+lustre-y += glimpse.o
+lustre-y += lcommon_cl.o
+lustre-y += lcommon_misc.o
+lustre-y += vvp_dev.o vvp_page.o vvp_io.o vvp_object.o
+lustre-y += pcc.o crypto.o
+lustre-y += llite_foreign.o llite_foreign_symlink.o
+lustre-y += acl.o
+
+include $(srctree)/drivers/staging/lustrefsx/Makefile.rules
diff --git a/drivers/staging/lustrefsx/lustre/lmv/Makefile b/drivers/staging/lustrefsx/lustre/lmv/Makefile
new file mode 100644
index 0000000000000..40626f49283fb
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/lmv/Makefile
@@ -0,0 +1,5 @@
+obj-$(CONFIG_LUSTREFSX_FS) += lmv.o
+
+lmv-y := lmv_obd.o lmv_intent.o lmv_fld.o lproc_lmv.o
+
+include $(srctree)/drivers/staging/lustrefsx/Makefile.rules
diff --git a/drivers/staging/lustrefsx/lustre/lov/Makefile b/drivers/staging/lustrefsx/lustre/lov/Makefile
new file mode 100644
index 0000000000000..dae11b1647cbe
--- /dev/null
+++
b/drivers/staging/lustrefsx/lustre/lov/Makefile @@ -0,0 +1,8 @@ +obj-$(CONFIG_LUSTREFSX_FS) += lov.o + +lov-y := lov_dev.o lov_ea.o lov_io.o lov_lock.o lov_merge.o lov_obd.o +lov-y += lov_object.o lov_offset.o lov_pack.o lov_page.o lov_pool.o +lov-y += lov_request.o lovsub_dev.o lovsub_object.o +lov-y += lproc_lov.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/mdc/Makefile b/drivers/staging/lustrefsx/lustre/mdc/Makefile new file mode 100644 index 0000000000000..5e997efd3b33a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mdc/Makefile @@ -0,0 +1,7 @@ +obj-$(CONFIG_LUSTREFSX_FS) += mdc.o + +mdc-y := mdc_request.o mdc_reint.o lproc_mdc.o mdc_lib.o mdc_locks.o +mdc-y += mdc_changelog.o mdc_dev.o +mdc-y += mdc_acl.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/mgc/Makefile b/drivers/staging/lustrefsx/lustre/mgc/Makefile new file mode 100644 index 0000000000000..7353c95e42cca --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mgc/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_LUSTREFSX_FS) += mgc.o + +mgc-y := mgc_request.o lproc_mgc.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/obdclass/Makefile b/drivers/staging/lustrefsx/lustre/obdclass/Makefile new file mode 100644 index 0000000000000..eaa614e1a33cd --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/Makefile @@ -0,0 +1,15 @@ +obj-$(CONFIG_LUSTREFSX_FS) += obdclass.o + +obdclass-y := llog.o llog_cat.o llog_obd.o llog_swab.o llog_osd.o +obdclass-y += class_obd.o genops.o llog_ioctl.o +obdclass-y += lprocfs_status.o lprocfs_counters.o +obdclass-y += lustre_handles.o lustre_peer.o local_storage.o +obdclass-y += statfs_pack.o obdo.o obd_config.o obd_mount.o obd_sysfs.o +obdclass-y += lu_object.o dt_object.o +obdclass-y += cl_object.o cl_page.o cl_lock.o cl_io.o lu_ref.o +obdclass-y += linkea.o kernelcomm.o jobid.o +obdclass-y += integrity.o obd_cksum.o +obdclass-y += lu_tgt_descs.o lu_tgt_pool.o +obdclass-y += range_lock.o interval_tree.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/obdecho/Makefile b/drivers/staging/lustrefsx/lustre/obdecho/Makefile new file mode 100644 index 0000000000000..3a2ba7082c3f4 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdecho/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_LUSTREFSX_FS) += obdecho.o + +obdecho-y := echo_client.o debug.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/osc/Makefile b/drivers/staging/lustrefsx/lustre/osc/Makefile new file mode 100644 index 0000000000000..223e42283bf92 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/Makefile @@ -0,0 +1,6 @@ +obj-$(CONFIG_LUSTREFSX_FS) += osc.o + +osc-y := osc_request.o lproc_osc.o osc_dev.o osc_object.o osc_page.o +osc-y += osc_lock.o osc_io.o osc_quota.o osc_cache.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/Makefile b/drivers/staging/lustrefsx/lustre/ptlrpc/Makefile new file mode 100644 index 0000000000000..2765abe6ee44c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/Makefile @@ -0,0 +1,25 @@ +obj-$(CONFIG_LUSTREFSX_FS) += ptlrpc.o + +LDLM := ../../lustre/ldlm/ +TARGET := ../../lustre/target/ + +ldlm_objs := $(LDLM)l_lock.o $(LDLM)ldlm_lock.o +ldlm_objs += $(LDLM)ldlm_resource.o $(LDLM)ldlm_lib.o +ldlm_objs += $(LDLM)ldlm_plain.o 
$(LDLM)ldlm_extent.o +ldlm_objs += $(LDLM)ldlm_request.o $(LDLM)ldlm_lockd.o +ldlm_objs += $(LDLM)ldlm_flock.o $(LDLM)ldlm_inodebits.o +ldlm_objs += $(LDLM)ldlm_pool.o $(LDLM)ldlm_reclaim.o + +ptlrpc_objs := client.o recover.o connection.o niobuf.o pack_generic.o +ptlrpc_objs += events.o ptlrpc_module.o service.o pinger.o +ptlrpc_objs += llog_net.o llog_client.o llog_server.o import.o ptlrpcd.o +ptlrpc_objs += pers.o lproc_ptlrpc.o wiretest.o layout.o +ptlrpc_objs += sec.o sec_ctx.o sec_bulk.o sec_gc.o sec_config.o sec_lproc.o +ptlrpc_objs += sec_null.o sec_plain.o nrs.o nrs_fifo.o nrs_delay.o heap.o +ptlrpc_objs += errno.o + +ptlrpc-y := $(ldlm_objs) $(ptlrpc_objs) $(TARGET)barrier.o + +ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/ldlm + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules From 7cb42730d7386dedc2d8b99b98e8ff4d9ef10aa3 Mon Sep 17 00:00:00 2001 From: Jacob Wolf Date: Sat, 9 Mar 2024 18:24:28 +0000 Subject: [PATCH 163/175] Initial 2.15 Lustre client commit --- drivers/staging/lustrefsx/config.h | 1258 ++++ .../lustrefsx/libcfs/include/libcfs/bitmap.h | 118 + .../libcfs/include/libcfs/crypto/llcrypt.h | 798 +++ .../lustrefsx/libcfs/include/libcfs/libcfs.h | 143 + .../libcfs/include/libcfs/libcfs_cpu.h | 462 ++ .../libcfs/include/libcfs/libcfs_crypto.h | 319 + .../libcfs/include/libcfs/libcfs_debug.h | 328 + .../libcfs/include/libcfs/libcfs_fail.h | 226 + .../libcfs/include/libcfs/libcfs_hash.h | 869 +++ .../libcfs/include/libcfs/libcfs_private.h | 341 + .../libcfs/include/libcfs/libcfs_string.h | 86 + .../libcfs/include/libcfs/libcfs_workitem.h | 103 + .../libcfs/include/libcfs/linux/glob.h | 13 + .../libcfs/include/libcfs/linux/linux-cpu.h | 52 + .../libcfs/linux/linux-fortify-string.h | 296 + .../libcfs/include/libcfs/linux/linux-fs.h | 87 + .../libcfs/include/libcfs/linux/linux-hash.h | 345 + .../libcfs/include/libcfs/linux/linux-list.h | 32 + .../libcfs/include/libcfs/linux/linux-mem.h | 143 + .../libcfs/include/libcfs/linux/linux-misc.h | 189 + .../libcfs/include/libcfs/linux/linux-net.h | 162 + .../libcfs/include/libcfs/linux/linux-time.h | 250 + .../libcfs/include/libcfs/linux/linux-uuid.h | 63 + .../libcfs/include/libcfs/linux/linux-wait.h | 593 ++ .../libcfs/include/libcfs/linux/processor.h | 79 + .../libcfs/include/libcfs/linux/refcount.h | 40 + .../libcfs/include/libcfs/linux/xarray.h | 1766 +++++ .../libcfs/include/libcfs/util/hash.h | 103 + .../libcfs/include/libcfs/util/ioctl.h | 67 + .../libcfs/include/libcfs/util/list.h | 499 ++ .../libcfs/include/libcfs/util/param.h | 40 + .../libcfs/include/libcfs/util/parser.h | 114 + .../libcfs/include/libcfs/util/string.h | 143 + .../libcfs/include/uapi/linux/llcrypt.h | 186 + .../lustrefsx/libcfs/libcfs/crypto/crypto.c | 559 ++ .../lustrefsx/libcfs/libcfs/crypto/fname.c | 567 ++ .../lustrefsx/libcfs/libcfs/crypto/hkdf.c | 188 + .../lustrefsx/libcfs/libcfs/crypto/hooks.c | 322 + .../lustrefsx/libcfs/libcfs/crypto/keyring.c | 1012 +++ .../lustrefsx/libcfs/libcfs/crypto/keysetup.c | 635 ++ .../libcfs/libcfs/crypto/keysetup_v1.c | 350 + .../libcfs/libcfs/crypto/llcrypt_private.h | 499 ++ .../lustrefsx/libcfs/libcfs/crypto/policy.c | 594 ++ .../staging/lustrefsx/libcfs/libcfs/debug.c | 736 ++ .../staging/lustrefsx/libcfs/libcfs/fail.c | 153 + .../staging/lustrefsx/libcfs/libcfs/hash.c | 2126 ++++++ .../lustrefsx/libcfs/libcfs/libcfs_cpu.c | 1270 ++++ .../lustrefsx/libcfs/libcfs/libcfs_lock.c | 156 + .../lustrefsx/libcfs/libcfs/libcfs_mem.c | 176 + .../lustrefsx/libcfs/libcfs/libcfs_string.c | 561 ++ 
.../libcfs/libcfs/linux-crypto-adler.c | 137 + .../lustrefsx/libcfs/libcfs/linux-crypto.c | 487 ++ .../lustrefsx/libcfs/libcfs/linux-crypto.h | 37 + .../lustrefsx/libcfs/libcfs/linux/glob.c | 117 + .../libcfs/libcfs/linux/linux-hash.c | 57 + .../libcfs/libcfs/linux/linux-prim.c | 275 + .../libcfs/libcfs/linux/linux-wait.c | 174 + .../lustrefsx/libcfs/libcfs/linux/xarray.c | 2101 ++++++ .../staging/lustrefsx/libcfs/libcfs/module.c | 936 +++ .../lustrefsx/libcfs/libcfs/tracefile.c | 1213 ++++ .../lustrefsx/libcfs/libcfs/tracefile.h | 190 + .../lustrefsx/libcfs/libcfs/util/l_ioctl.c | 191 + .../lustrefsx/libcfs/libcfs/util/nidstrings.c | 1647 +++++ .../lustrefsx/libcfs/libcfs/util/param.c | 155 + .../lustrefsx/libcfs/libcfs/util/parser.c | 850 +++ .../lustrefsx/libcfs/libcfs/util/string.c | 526 ++ .../lustrefsx/libcfs/libcfs/workitem.c | 462 ++ drivers/staging/lustrefsx/lnet/LICENSE | 363 + .../staging/lustrefsx/lnet/include/lnet/api.h | 171 + .../lustrefsx/lnet/include/lnet/lib-lnet.h | 1243 ++++ .../lustrefsx/lnet/include/lnet/lib-types.h | 1338 ++++ .../lustrefsx/lnet/include/lnet/lnet_rdma.h | 89 + .../lustrefsx/lnet/include/lnet/socklnd.h | 99 + .../lustrefsx/lnet/include/lnet/udsp.h | 143 + .../include/uapi/linux/lnet/libcfs_debug.h | 157 + .../include/uapi/linux/lnet/libcfs_ioctl.h | 162 + .../lnet/include/uapi/linux/lnet/lnet-dlc.h | 399 ++ .../lnet/include/uapi/linux/lnet/lnet-idl.h | 298 + .../lnet/include/uapi/linux/lnet/lnet-nl.h | 87 + .../lnet/include/uapi/linux/lnet/lnet-types.h | 635 ++ .../lnet/include/uapi/linux/lnet/lnetctl.h | 154 + .../lnet/include/uapi/linux/lnet/lnetst.h | 537 ++ .../lnet/include/uapi/linux/lnet/nidstr.h | 107 + .../lnet/include/uapi/linux/lnet/socklnd.h | 43 + .../lnet/klnds/o2iblnd/o2iblnd-idl.h | 155 + .../lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c | 3596 ++++++++++ .../lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h | 1170 +++ .../lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c | 4021 +++++++++++ .../lnet/klnds/o2iblnd/o2iblnd_modparams.c | 332 + .../lustrefsx/lnet/klnds/socklnd/socklnd.c | 2596 +++++++ .../lustrefsx/lnet/klnds/socklnd/socklnd.h | 682 ++ .../lustrefsx/lnet/klnds/socklnd/socklnd_cb.c | 2694 +++++++ .../lnet/klnds/socklnd/socklnd_lib.c | 698 ++ .../lnet/klnds/socklnd/socklnd_modparams.c | 380 + .../lnet/klnds/socklnd/socklnd_proto.c | 1001 +++ .../staging/lustrefsx/lnet/lnet/acceptor.c | 570 ++ drivers/staging/lustrefsx/lnet/lnet/api-ni.c | 4884 +++++++++++++ drivers/staging/lustrefsx/lnet/lnet/config.c | 1636 +++++ drivers/staging/lustrefsx/lnet/lnet/lib-md.c | 558 ++ drivers/staging/lustrefsx/lnet/lnet/lib-me.c | 155 + .../staging/lustrefsx/lnet/lnet/lib-move.c | 5456 ++++++++++++++ drivers/staging/lustrefsx/lnet/lnet/lib-msg.c | 1346 ++++ drivers/staging/lustrefsx/lnet/lnet/lib-ptl.c | 991 +++ .../staging/lustrefsx/lnet/lnet/lib-socket.c | 434 ++ .../staging/lustrefsx/lnet/lnet/lnet_rdma.c | 208 + drivers/staging/lustrefsx/lnet/lnet/lo.c | 92 + drivers/staging/lustrefsx/lnet/lnet/module.c | 277 + .../staging/lustrefsx/lnet/lnet/net_fault.c | 1114 +++ .../staging/lustrefsx/lnet/lnet/nidstrings.c | 1190 ++++ drivers/staging/lustrefsx/lnet/lnet/peer.c | 4314 ++++++++++++ drivers/staging/lustrefsx/lnet/lnet/router.c | 1835 +++++ .../staging/lustrefsx/lnet/lnet/router_proc.c | 902 +++ drivers/staging/lustrefsx/lnet/lnet/udsp.c | 1557 ++++ .../lustrefsx/lnet/selftest/brw_test.c | 524 ++ .../staging/lustrefsx/lnet/selftest/conctl.c | 929 +++ .../staging/lustrefsx/lnet/selftest/conrpc.c | 1398 ++++ .../staging/lustrefsx/lnet/selftest/conrpc.h | 145 + 
.../staging/lustrefsx/lnet/selftest/console.c | 2105 ++++++ .../staging/lustrefsx/lnet/selftest/console.h | 262 + .../lustrefsx/lnet/selftest/framework.c | 1766 +++++ .../staging/lustrefsx/lnet/selftest/module.c | 170 + .../lustrefsx/lnet/selftest/ping_test.c | 226 + drivers/staging/lustrefsx/lnet/selftest/rpc.c | 1685 +++++ drivers/staging/lustrefsx/lnet/selftest/rpc.h | 296 + .../lustrefsx/lnet/selftest/selftest.h | 613 ++ .../staging/lustrefsx/lnet/selftest/timer.c | 244 + .../staging/lustrefsx/lnet/selftest/timer.h | 48 + drivers/staging/lustrefsx/lustre/LICENSE | 372 + .../lustrefsx/lustre/fid/fid_handler.c | 616 ++ .../lustrefsx/lustre/fid/fid_internal.h | 95 + .../staging/lustrefsx/lustre/fid/fid_lib.c | 98 + .../lustrefsx/lustre/fid/fid_request.c | 523 ++ .../staging/lustrefsx/lustre/fid/fid_store.c | 249 + .../staging/lustrefsx/lustre/fid/lproc_fid.c | 635 ++ .../staging/lustrefsx/lustre/fld/fld_cache.c | 492 ++ .../lustrefsx/lustre/fld/fld_handler.c | 485 ++ .../staging/lustrefsx/lustre/fld/fld_index.c | 531 ++ .../lustrefsx/lustre/fld/fld_internal.h | 214 + .../lustrefsx/lustre/fld/fld_request.c | 544 ++ .../staging/lustrefsx/lustre/fld/lproc_fld.c | 357 + .../lustrefsx/lustre/include/cl_object.h | 2710 +++++++ .../lustrefsx/lustre/include/dt_object.h | 3054 ++++++++ .../lustrefsx/lustre/include/interval_tree.h | 130 + .../lustrefsx/lustre/include/llog_swab.h | 70 + .../lustrefsx/lustre/include/lprocfs_status.h | 1124 +++ .../lustrefsx/lustre/include/lu_object.h | 1760 +++++ .../staging/lustrefsx/lustre/include/lu_ref.h | 256 + .../lustrefsx/lustre/include/lu_target.h | 740 ++ .../lustrefsx/lustre/include/lustre/libiam.h | 140 + .../lustre/include/lustre/liblustreapi.h | 38 + .../lustre/include/lustre/ll_fiemap.h | 48 + .../include/lustre/lustre_barrier_user.h | 40 + .../lustre/include/lustre/lustre_lfsck_user.h | 40 + .../lustre/include/lustre/lustre_user.h | 47 + .../lustre/include/lustre/lustreapi.h | 1245 ++++ .../lustrefsx/lustre/include/lustre_acl.h | 51 + .../lustrefsx/lustre/include/lustre_barrier.h | 44 + .../lustrefsx/lustre/include/lustre_compat.h | 637 ++ .../lustrefsx/lustre/include/lustre_crypto.h | 230 + .../lustrefsx/lustre/include/lustre_disk.h | 383 + .../lustrefsx/lustre/include/lustre_dlm.h | 1865 +++++ .../lustre/include/lustre_dlm_flags.h | 444 ++ .../lustrefsx/lustre/include/lustre_errno.h | 218 + .../lustrefsx/lustre/include/lustre_export.h | 519 ++ .../lustrefsx/lustre/include/lustre_fid.h | 953 +++ .../lustrefsx/lustre/include/lustre_fld.h | 200 + .../lustrefsx/lustre/include/lustre_ha.h | 59 + .../lustrefsx/lustre/include/lustre_handles.h | 75 + .../lustrefsx/lustre/include/lustre_idmap.h | 70 + .../lustrefsx/lustre/include/lustre_import.h | 430 ++ .../lustrefsx/lustre/include/lustre_intent.h | 67 + .../lustre/include/lustre_kernelcomm.h | 57 + .../lustrefsx/lustre/include/lustre_lfsck.h | 130 + .../lustrefsx/lustre/include/lustre_lib.h | 99 + .../lustrefsx/lustre/include/lustre_linkea.h | 98 + .../lustrefsx/lustre/include/lustre_lmv.h | 539 ++ .../lustrefsx/lustre/include/lustre_log.h | 572 ++ .../lustrefsx/lustre/include/lustre_mdc.h | 126 + .../lustrefsx/lustre/include/lustre_mds.h | 84 + .../lustrefsx/lustre/include/lustre_net.h | 2673 +++++++ .../lustrefsx/lustre/include/lustre_nodemap.h | 241 + .../lustrefsx/lustre/include/lustre_nrs.h | 752 ++ .../lustrefsx/lustre/include/lustre_nrs_crr.h | 128 + .../lustre/include/lustre_nrs_delay.h | 87 + .../lustre/include/lustre_nrs_fifo.h | 70 + .../lustrefsx/lustre/include/lustre_nrs_orr.h | 225 + 
.../lustrefsx/lustre/include/lustre_nrs_tbf.h | 380 + .../lustrefsx/lustre/include/lustre_obdo.h | 53 + .../lustrefsx/lustre/include/lustre_osc.h | 983 +++ .../lustrefsx/lustre/include/lustre_quota.h | 279 + .../lustre/include/lustre_req_layout.h | 428 ++ .../lustrefsx/lustre/include/lustre_scrub.h | 392 ++ .../lustrefsx/lustre/include/lustre_sec.h | 1208 ++++ .../lustrefsx/lustre/include/lustre_swab.h | 139 + .../lustrefsx/lustre/include/lustre_update.h | 709 ++ .../staging/lustrefsx/lustre/include/lvfs.h | 72 + .../lustrefsx/lustre/include/md_object.h | 733 ++ .../staging/lustrefsx/lustre/include/obd.h | 1376 ++++ .../lustrefsx/lustre/include/obd_cache.h | 34 + .../lustrefsx/lustre/include/obd_cksum.h | 193 + .../lustrefsx/lustre/include/obd_class.h | 1954 +++++ .../lustrefsx/lustre/include/obd_support.h | 1055 +++ .../lustrefsx/lustre/include/obd_target.h | 73 + .../lustrefsx/lustre/include/obj_update.h | 115 + .../lustrefsx/lustre/include/range_lock.h | 77 + .../lustrefsx/lustre/include/seq_range.h | 192 + .../lustre/include/uapi/linux/lustre/lgss.h | 58 + .../uapi/linux/lustre/lustre_access_log.h | 85 + .../uapi/linux/lustre/lustre_barrier_user.h | 74 + .../include/uapi/linux/lustre/lustre_cfg.h | 346 + .../include/uapi/linux/lustre/lustre_disk.h | 231 + .../include/uapi/linux/lustre/lustre_fid.h | 364 + .../include/uapi/linux/lustre/lustre_fiemap.h | 99 + .../include/uapi/linux/lustre/lustre_idl.h | 3755 ++++++++++ .../include/uapi/linux/lustre/lustre_ioctl.h | 231 + .../uapi/linux/lustre/lustre_kernelcomm.h | 98 + .../uapi/linux/lustre/lustre_lfsck_user.h | 238 + .../uapi/linux/lustre/lustre_log_user.h | 80 + .../include/uapi/linux/lustre/lustre_ostid.h | 237 + .../include/uapi/linux/lustre/lustre_param.h | 95 + .../include/uapi/linux/lustre/lustre_user.h | 2795 ++++++++ .../include/uapi/linux/lustre/lustre_ver.h | 33 + .../lustrefsx/lustre/include/upcall_cache.h | 153 + .../staging/lustrefsx/lustre/ldlm/l_lock.c | 73 + .../lustrefsx/lustre/ldlm/ldlm_extent.c | 1095 +++ .../lustrefsx/lustre/ldlm/ldlm_flock.c | 958 +++ .../lustrefsx/lustre/ldlm/ldlm_inodebits.c | 667 ++ .../lustrefsx/lustre/ldlm/ldlm_internal.h | 424 ++ .../staging/lustrefsx/lustre/ldlm/ldlm_lib.c | 3569 ++++++++++ .../staging/lustrefsx/lustre/ldlm/ldlm_lock.c | 2898 ++++++++ .../lustrefsx/lustre/ldlm/ldlm_lockd.c | 3488 +++++++++ .../lustrefsx/lustre/ldlm/ldlm_plain.c | 180 + .../staging/lustrefsx/lustre/ldlm/ldlm_pool.c | 1568 +++++ .../lustrefsx/lustre/ldlm/ldlm_reclaim.c | 415 ++ .../lustrefsx/lustre/ldlm/ldlm_request.c | 2650 +++++++ .../lustrefsx/lustre/ldlm/ldlm_resource.c | 1809 +++++ drivers/staging/lustrefsx/lustre/llite/acl.c | 136 + .../staging/lustrefsx/lustre/llite/crypto.c | 562 ++ .../staging/lustrefsx/lustre/llite/dcache.c | 388 + drivers/staging/lustrefsx/lustre/llite/dir.c | 2616 +++++++ drivers/staging/lustrefsx/lustre/llite/file.c | 6260 +++++++++++++++++ .../lustrefsx/lustre/llite/foreign_symlink.h | 48 + .../staging/lustrefsx/lustre/llite/glimpse.c | 228 + .../lustrefsx/lustre/llite/lcommon_cl.c | 287 + .../lustrefsx/lustre/llite/lcommon_misc.c | 189 + .../lustrefsx/lustre/llite/llite_foreign.c | 281 + .../lustre/llite/llite_foreign_symlink.c | 865 +++ .../lustrefsx/lustre/llite/llite_internal.h | 1860 +++++ .../lustrefsx/lustre/llite/llite_lib.c | 3909 ++++++++++ .../lustrefsx/lustre/llite/llite_mmap.c | 616 ++ .../lustrefsx/lustre/llite/llite_nfs.c | 401 ++ .../lustrefsx/lustre/llite/lproc_llite.c | 2585 +++++++ .../staging/lustrefsx/lustre/llite/namei.c | 2222 ++++++ 
drivers/staging/lustrefsx/lustre/llite/pcc.c | 2748 ++++++++ drivers/staging/lustrefsx/lustre/llite/pcc.h | 268 + drivers/staging/lustrefsx/lustre/llite/rw.c | 2046 ++++++ drivers/staging/lustrefsx/lustre/llite/rw26.c | 1023 +++ .../lustrefsx/lustre/llite/statahead.c | 1790 +++++ .../staging/lustrefsx/lustre/llite/super25.c | 340 + .../staging/lustrefsx/lustre/llite/symlink.c | 338 + .../staging/lustrefsx/lustre/llite/vvp_dev.c | 623 ++ .../lustrefsx/lustre/llite/vvp_internal.h | 311 + .../staging/lustrefsx/lustre/llite/vvp_io.c | 1853 +++++ .../lustrefsx/lustre/llite/vvp_object.c | 324 + .../staging/lustrefsx/lustre/llite/vvp_page.c | 485 ++ .../staging/lustrefsx/lustre/llite/xattr.c | 934 +++ .../lustrefsx/lustre/llite/xattr_cache.c | 671 ++ .../lustrefsx/lustre/llite/xattr_security.c | 328 + .../staging/lustrefsx/lustre/lmv/lmv_fld.c | 87 + .../staging/lustrefsx/lustre/lmv/lmv_intent.c | 595 ++ .../lustrefsx/lustre/lmv/lmv_internal.h | 202 + .../staging/lustrefsx/lustre/lmv/lmv_obd.c | 3915 +++++++++++ .../staging/lustrefsx/lustre/lmv/lproc_lmv.c | 322 + .../lustrefsx/lustre/lov/lov_cl_internal.h | 815 +++ .../staging/lustrefsx/lustre/lov/lov_dev.c | 592 ++ drivers/staging/lustrefsx/lustre/lov/lov_ea.c | 716 ++ .../lustrefsx/lustre/lov/lov_internal.h | 375 + drivers/staging/lustrefsx/lustre/lov/lov_io.c | 1987 ++++++ .../staging/lustrefsx/lustre/lov/lov_lock.c | 382 + .../staging/lustrefsx/lustre/lov/lov_merge.c | 108 + .../staging/lustrefsx/lustre/lov/lov_obd.c | 1350 ++++ .../staging/lustrefsx/lustre/lov/lov_object.c | 2336 ++++++ .../staging/lustrefsx/lustre/lov/lov_offset.c | 308 + .../staging/lustrefsx/lustre/lov/lov_pack.c | 483 ++ .../staging/lustrefsx/lustre/lov/lov_page.c | 197 + .../staging/lustrefsx/lustre/lov/lov_pool.c | 484 ++ .../lustrefsx/lustre/lov/lov_request.c | 392 ++ .../staging/lustrefsx/lustre/lov/lovsub_dev.c | 145 + .../lustrefsx/lustre/lov/lovsub_object.c | 202 + .../staging/lustrefsx/lustre/lov/lproc_lov.c | 310 + .../staging/lustrefsx/lustre/mdc/lproc_mdc.c | 795 +++ .../staging/lustrefsx/lustre/mdc/mdc_acl.c | 63 + .../lustrefsx/lustre/mdc/mdc_changelog.c | 881 +++ .../staging/lustrefsx/lustre/mdc/mdc_dev.c | 1627 +++++ .../lustrefsx/lustre/mdc/mdc_internal.h | 198 + .../staging/lustrefsx/lustre/mdc/mdc_lib.c | 674 ++ .../staging/lustrefsx/lustre/mdc/mdc_locks.c | 1466 ++++ .../staging/lustrefsx/lustre/mdc/mdc_reint.c | 536 ++ .../lustrefsx/lustre/mdc/mdc_request.c | 3077 ++++++++ .../staging/lustrefsx/lustre/mgc/lproc_mgc.c | 132 + .../lustrefsx/lustre/mgc/mgc_internal.h | 74 + .../lustrefsx/lustre/mgc/mgc_request.c | 2333 ++++++ drivers/staging/lustrefsx/lustre/nodist | 9 + .../lustrefsx/lustre/obdclass/cl_internal.h | 57 + .../staging/lustrefsx/lustre/obdclass/cl_io.c | 1439 ++++ .../lustrefsx/lustre/obdclass/cl_lock.c | 289 + .../lustrefsx/lustre/obdclass/cl_object.c | 1118 +++ .../lustrefsx/lustre/obdclass/cl_page.c | 1291 ++++ .../lustrefsx/lustre/obdclass/class_obd.c | 974 +++ .../lustrefsx/lustre/obdclass/dt_object.c | 1292 ++++ .../lustrefsx/lustre/obdclass/genops.c | 2348 +++++++ .../staging/lustrefsx/lustre/obdclass/idmap.c | 161 + .../lustrefsx/lustre/obdclass/integrity.c | 277 + .../lustrefsx/lustre/obdclass/interval_tree.c | 772 ++ .../staging/lustrefsx/lustre/obdclass/jobid.c | 932 +++ .../lustrefsx/lustre/obdclass/kernelcomm.c | 262 + .../lustrefsx/lustre/obdclass/linkea.c | 330 + .../staging/lustrefsx/lustre/obdclass/llog.c | 1539 ++++ .../lustrefsx/lustre/obdclass/llog_cat.c | 1198 ++++ .../lustrefsx/lustre/obdclass/llog_internal.h | 102 + 
.../lustrefsx/lustre/obdclass/llog_ioctl.c | 551 ++ .../lustrefsx/lustre/obdclass/llog_obd.c | 248 + .../lustrefsx/lustre/obdclass/llog_osd.c | 2242 ++++++ .../lustrefsx/lustre/obdclass/llog_swab.c | 488 ++ .../lustrefsx/lustre/obdclass/llog_test.c | 2288 ++++++ .../lustrefsx/lustre/obdclass/local_storage.c | 987 +++ .../lustrefsx/lustre/obdclass/local_storage.h | 94 + .../lustre/obdclass/lprocfs_counters.c | 136 + .../lustre/obdclass/lprocfs_jobstats.c | 691 ++ .../lustre/obdclass/lprocfs_status.c | 2331 ++++++ .../lustre/obdclass/lprocfs_status_server.c | 1121 +++ .../lustrefsx/lustre/obdclass/lu_object.c | 2597 +++++++ .../lustrefsx/lustre/obdclass/lu_ref.c | 437 ++ .../lustrefsx/lustre/obdclass/lu_tgt_descs.c | 687 ++ .../lustrefsx/lustre/obdclass/lu_tgt_pool.c | 244 + .../lustrefsx/lustre/obdclass/lu_ucred.c | 102 + .../lustre/obdclass/lustre_handles.c | 219 + .../lustrefsx/lustre/obdclass/lustre_peer.c | 247 + .../lustrefsx/lustre/obdclass/md_attrs.c | 198 + .../lustrefsx/lustre/obdclass/obd_cksum.c | 149 + .../lustrefsx/lustre/obdclass/obd_config.c | 2479 +++++++ .../lustrefsx/lustre/obdclass/obd_mount.c | 1689 +++++ .../lustre/obdclass/obd_mount_server.c | 2112 ++++++ .../lustrefsx/lustre/obdclass/obd_sysfs.c | 687 ++ .../staging/lustrefsx/lustre/obdclass/obdo.c | 225 + .../lustrefsx/lustre/obdclass/obdo_server.c | 163 + .../lustrefsx/lustre/obdclass/range_lock.c | 179 + .../staging/lustrefsx/lustre/obdclass/scrub.c | 1356 ++++ .../lustrefsx/lustre/obdclass/statfs_pack.c | 72 + .../lustrefsx/lustre/obdclass/upcall_cache.c | 454 ++ .../staging/lustrefsx/lustre/obdecho/debug.c | 99 + .../staging/lustrefsx/lustre/obdecho/echo.c | 980 +++ .../lustrefsx/lustre/obdecho/echo_client.c | 3171 +++++++++ .../lustrefsx/lustre/obdecho/echo_internal.h | 59 + .../staging/lustrefsx/lustre/osc/lproc_osc.c | 916 +++ .../staging/lustrefsx/lustre/osc/osc_cache.c | 3303 +++++++++ .../staging/lustrefsx/lustre/osc/osc_dev.c | 252 + .../lustrefsx/lustre/osc/osc_internal.h | 222 + drivers/staging/lustrefsx/lustre/osc/osc_io.c | 1321 ++++ .../staging/lustrefsx/lustre/osc/osc_lock.c | 1300 ++++ .../staging/lustrefsx/lustre/osc/osc_object.c | 497 ++ .../staging/lustrefsx/lustre/osc/osc_page.c | 1158 +++ .../staging/lustrefsx/lustre/osc/osc_quota.c | 321 + .../lustrefsx/lustre/osc/osc_request.c | 3942 +++++++++++ .../staging/lustrefsx/lustre/ptlrpc/client.c | 3712 ++++++++++ .../lustrefsx/lustre/ptlrpc/connection.c | 174 + .../staging/lustrefsx/lustre/ptlrpc/errno.c | 411 ++ .../staging/lustrefsx/lustre/ptlrpc/events.c | 655 ++ .../lustrefsx/lustre/ptlrpc/gss/gss_api.h | 185 + .../lustrefsx/lustre/ptlrpc/gss/gss_asn1.h | 84 + .../lustrefsx/lustre/ptlrpc/gss/gss_bulk.c | 516 ++ .../lustre/ptlrpc/gss/gss_cli_upcall.c | 429 ++ .../lustrefsx/lustre/ptlrpc/gss/gss_crypto.c | 463 ++ .../lustrefsx/lustre/ptlrpc/gss/gss_crypto.h | 131 + .../lustrefsx/lustre/ptlrpc/gss/gss_err.h | 193 + .../lustre/ptlrpc/gss/gss_generic_token.c | 284 + .../lustre/ptlrpc/gss/gss_internal.h | 509 ++ .../lustrefsx/lustre/ptlrpc/gss/gss_keyring.c | 1652 +++++ .../lustrefsx/lustre/ptlrpc/gss/gss_krb5.h | 160 + .../lustre/ptlrpc/gss/gss_krb5_mech.c | 1604 +++++ .../lustre/ptlrpc/gss/gss_mech_switch.c | 361 + .../lustre/ptlrpc/gss/gss_null_mech.c | 220 + .../lustrefsx/lustre/ptlrpc/gss/gss_pipefs.c | 1255 ++++ .../lustrefsx/lustre/ptlrpc/gss/gss_rawobj.c | 240 + .../lustrefsx/lustre/ptlrpc/gss/gss_sk_mech.c | 960 +++ .../lustre/ptlrpc/gss/gss_svc_upcall.c | 1190 ++++ .../lustrefsx/lustre/ptlrpc/gss/lproc_gss.c | 278 + 
.../lustrefsx/lustre/ptlrpc/gss/sec_gss.c | 2929 ++++++++ .../staging/lustrefsx/lustre/ptlrpc/heap.c | 497 ++ .../staging/lustrefsx/lustre/ptlrpc/heap.h | 188 + .../staging/lustrefsx/lustre/ptlrpc/import.c | 2069 ++++++ .../staging/lustrefsx/lustre/ptlrpc/layout.c | 2719 +++++++ .../lustrefsx/lustre/ptlrpc/llog_client.c | 352 + .../lustrefsx/lustre/ptlrpc/llog_net.c | 67 + .../lustrefsx/lustre/ptlrpc/llog_server.c | 288 + .../lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c | 1480 ++++ .../staging/lustrefsx/lustre/ptlrpc/niobuf.c | 1028 +++ .../lustre/ptlrpc/nodemap_internal.h | 204 + drivers/staging/lustrefsx/lustre/ptlrpc/nrs.c | 1786 +++++ .../staging/lustrefsx/lustre/ptlrpc/nrs_crr.c | 830 +++ .../lustrefsx/lustre/ptlrpc/nrs_delay.c | 829 +++ .../lustrefsx/lustre/ptlrpc/nrs_fifo.c | 271 + .../staging/lustrefsx/lustre/ptlrpc/nrs_orr.c | 1970 ++++++ .../staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c | 3712 ++++++++++ .../lustrefsx/lustre/ptlrpc/pack_generic.c | 3001 ++++++++ .../lustrefsx/lustre/ptlrpc/pack_server.c | 137 + .../staging/lustrefsx/lustre/ptlrpc/pers.c | 73 + .../staging/lustrefsx/lustre/ptlrpc/pinger.c | 571 ++ .../lustrefsx/lustre/ptlrpc/ptlrpc_internal.h | 441 ++ .../lustrefsx/lustre/ptlrpc/ptlrpc_module.c | 147 + .../staging/lustrefsx/lustre/ptlrpc/ptlrpcd.c | 993 +++ .../staging/lustrefsx/lustre/ptlrpc/recover.c | 377 + drivers/staging/lustrefsx/lustre/ptlrpc/sec.c | 2762 ++++++++ .../lustrefsx/lustre/ptlrpc/sec_bulk.c | 1005 +++ .../lustrefsx/lustre/ptlrpc/sec_config.c | 979 +++ .../staging/lustrefsx/lustre/ptlrpc/sec_ctx.c | 110 + .../staging/lustrefsx/lustre/ptlrpc/sec_gc.c | 198 + .../lustrefsx/lustre/ptlrpc/sec_lproc.c | 381 + .../lustrefsx/lustre/ptlrpc/sec_null.c | 451 ++ .../lustrefsx/lustre/ptlrpc/sec_plain.c | 1032 +++ .../staging/lustrefsx/lustre/ptlrpc/service.c | 3672 ++++++++++ .../staging/lustrefsx/lustre/ptlrpc/wirehdr.c | 46 + .../lustrefsx/lustre/ptlrpc/wiretest.c | 6095 ++++++++++++++++ .../staging/lustrefsx/lustre/target/barrier.c | 412 ++ .../lustrefsx/lustre/target/out_handler.c | 1254 ++++ .../staging/lustrefsx/lustre/target/out_lib.c | 1276 ++++ .../staging/lustrefsx/lustre/target/tgt_fmd.c | 363 + .../lustrefsx/lustre/target/tgt_grant.c | 1704 +++++ .../lustrefsx/lustre/target/tgt_handler.c | 3028 ++++++++ .../lustrefsx/lustre/target/tgt_internal.h | 302 + .../lustrefsx/lustre/target/tgt_lastrcvd.c | 2282 ++++++ .../lustrefsx/lustre/target/tgt_main.c | 853 +++ .../lustrefsx/lustre/target/update_records.c | 1232 ++++ .../lustrefsx/lustre/target/update_recovery.c | 1451 ++++ .../lustrefsx/lustre/target/update_trans.c | 1743 +++++ drivers/staging/lustrefsx/undef.h | 1256 ++++ 436 files changed, 358652 insertions(+) create mode 100644 drivers/staging/lustrefsx/config.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/bitmap.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/crypto/llcrypt.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_cpu.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_crypto.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_debug.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_fail.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_hash.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_private.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_string.h 
create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_workitem.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/linux/glob.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-cpu.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fortify-string.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fs.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-hash.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-list.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-mem.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-misc.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-net.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-uuid.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-wait.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/linux/processor.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/linux/refcount.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/linux/xarray.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/util/hash.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/util/ioctl.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/util/list.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/util/param.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/util/parser.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/util/string.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/uapi/linux/llcrypt.h create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/crypto/crypto.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/crypto/fname.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/crypto/hkdf.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/crypto/hooks.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/crypto/keyring.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/crypto/keysetup.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/crypto/keysetup_v1.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/crypto/llcrypt_private.h create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/crypto/policy.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/debug.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/fail.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/hash.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/libcfs_cpu.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/libcfs_lock.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/libcfs_mem.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/libcfs_string.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/linux-crypto-adler.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/linux-crypto.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/linux-crypto.h create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/linux/glob.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-hash.c create mode 100644 
drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-prim.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-wait.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/linux/xarray.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/module.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/tracefile.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/tracefile.h create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/util/l_ioctl.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/util/nidstrings.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/util/param.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/util/parser.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/util/string.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/workitem.c create mode 100644 drivers/staging/lustrefsx/lnet/LICENSE create mode 100644 drivers/staging/lustrefsx/lnet/include/lnet/api.h create mode 100644 drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h create mode 100644 drivers/staging/lustrefsx/lnet/include/lnet/lib-types.h create mode 100644 drivers/staging/lustrefsx/lnet/include/lnet/lnet_rdma.h create mode 100644 drivers/staging/lustrefsx/lnet/include/lnet/socklnd.h create mode 100644 drivers/staging/lustrefsx/lnet/include/lnet/udsp.h create mode 100644 drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_debug.h create mode 100644 drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_ioctl.h create mode 100644 drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-dlc.h create mode 100644 drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-idl.h create mode 100644 drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-nl.h create mode 100644 drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-types.h create mode 100644 drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetctl.h create mode 100644 drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetst.h create mode 100644 drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/nidstr.h create mode 100644 drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/socklnd.h create mode 100644 drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd-idl.h create mode 100644 drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c create mode 100644 drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h create mode 100644 drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c create mode 100644 drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_modparams.c create mode 100644 drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c create mode 100644 drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.h create mode 100644 drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_cb.c create mode 100644 drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_lib.c create mode 100644 drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_modparams.c create mode 100644 drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_proto.c create mode 100644 drivers/staging/lustrefsx/lnet/lnet/acceptor.c create mode 100644 drivers/staging/lustrefsx/lnet/lnet/api-ni.c create mode 100644 drivers/staging/lustrefsx/lnet/lnet/config.c create mode 100644 drivers/staging/lustrefsx/lnet/lnet/lib-md.c create mode 100644 drivers/staging/lustrefsx/lnet/lnet/lib-me.c create mode 100644 drivers/staging/lustrefsx/lnet/lnet/lib-move.c create mode 100644 drivers/staging/lustrefsx/lnet/lnet/lib-msg.c create mode 100644 
drivers/staging/lustrefsx/lnet/lnet/lib-ptl.c create mode 100644 drivers/staging/lustrefsx/lnet/lnet/lib-socket.c create mode 100644 drivers/staging/lustrefsx/lnet/lnet/lnet_rdma.c create mode 100644 drivers/staging/lustrefsx/lnet/lnet/lo.c create mode 100644 drivers/staging/lustrefsx/lnet/lnet/module.c create mode 100644 drivers/staging/lustrefsx/lnet/lnet/net_fault.c create mode 100644 drivers/staging/lustrefsx/lnet/lnet/nidstrings.c create mode 100644 drivers/staging/lustrefsx/lnet/lnet/peer.c create mode 100644 drivers/staging/lustrefsx/lnet/lnet/router.c create mode 100644 drivers/staging/lustrefsx/lnet/lnet/router_proc.c create mode 100644 drivers/staging/lustrefsx/lnet/lnet/udsp.c create mode 100644 drivers/staging/lustrefsx/lnet/selftest/brw_test.c create mode 100644 drivers/staging/lustrefsx/lnet/selftest/conctl.c create mode 100644 drivers/staging/lustrefsx/lnet/selftest/conrpc.c create mode 100644 drivers/staging/lustrefsx/lnet/selftest/conrpc.h create mode 100644 drivers/staging/lustrefsx/lnet/selftest/console.c create mode 100644 drivers/staging/lustrefsx/lnet/selftest/console.h create mode 100644 drivers/staging/lustrefsx/lnet/selftest/framework.c create mode 100644 drivers/staging/lustrefsx/lnet/selftest/module.c create mode 100644 drivers/staging/lustrefsx/lnet/selftest/ping_test.c create mode 100644 drivers/staging/lustrefsx/lnet/selftest/rpc.c create mode 100644 drivers/staging/lustrefsx/lnet/selftest/rpc.h create mode 100644 drivers/staging/lustrefsx/lnet/selftest/selftest.h create mode 100644 drivers/staging/lustrefsx/lnet/selftest/timer.c create mode 100644 drivers/staging/lustrefsx/lnet/selftest/timer.h create mode 100644 drivers/staging/lustrefsx/lustre/LICENSE create mode 100644 drivers/staging/lustrefsx/lustre/fid/fid_handler.c create mode 100644 drivers/staging/lustrefsx/lustre/fid/fid_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/fid/fid_lib.c create mode 100644 drivers/staging/lustrefsx/lustre/fid/fid_request.c create mode 100644 drivers/staging/lustrefsx/lustre/fid/fid_store.c create mode 100644 drivers/staging/lustrefsx/lustre/fid/lproc_fid.c create mode 100644 drivers/staging/lustrefsx/lustre/fld/fld_cache.c create mode 100644 drivers/staging/lustrefsx/lustre/fld/fld_handler.c create mode 100644 drivers/staging/lustrefsx/lustre/fld/fld_index.c create mode 100644 drivers/staging/lustrefsx/lustre/fld/fld_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/fld/fld_request.c create mode 100644 drivers/staging/lustrefsx/lustre/fld/lproc_fld.c create mode 100644 drivers/staging/lustrefsx/lustre/include/cl_object.h create mode 100644 drivers/staging/lustrefsx/lustre/include/dt_object.h create mode 100644 drivers/staging/lustrefsx/lustre/include/interval_tree.h create mode 100644 drivers/staging/lustrefsx/lustre/include/llog_swab.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lprocfs_status.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lu_object.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lu_ref.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lu_target.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre/libiam.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre/liblustreapi.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre/ll_fiemap.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre/lustre_barrier_user.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre/lustre_lfsck_user.h create mode 100644 
drivers/staging/lustrefsx/lustre/include/lustre/lustre_user.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre/lustreapi.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_acl.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_barrier.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_compat.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_crypto.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_disk.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_dlm.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_dlm_flags.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_errno.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_export.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_fid.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_fld.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_ha.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_handles.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_idmap.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_import.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_intent.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_kernelcomm.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_lfsck.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_lib.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_linkea.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_lmv.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_log.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_mdc.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_mds.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_net.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_nodemap.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_nrs.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_nrs_crr.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_nrs_delay.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_nrs_fifo.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_nrs_orr.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_nrs_tbf.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_obdo.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_osc.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_quota.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_req_layout.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_scrub.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_sec.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_swab.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_update.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lvfs.h create mode 100644 drivers/staging/lustrefsx/lustre/include/md_object.h create mode 100644 drivers/staging/lustrefsx/lustre/include/obd.h create mode 100644 drivers/staging/lustrefsx/lustre/include/obd_cache.h create mode 100644 drivers/staging/lustrefsx/lustre/include/obd_cksum.h create mode 100644 
drivers/staging/lustrefsx/lustre/include/obd_class.h create mode 100644 drivers/staging/lustrefsx/lustre/include/obd_support.h create mode 100644 drivers/staging/lustrefsx/lustre/include/obd_target.h create mode 100644 drivers/staging/lustrefsx/lustre/include/obj_update.h create mode 100644 drivers/staging/lustrefsx/lustre/include/range_lock.h create mode 100644 drivers/staging/lustrefsx/lustre/include/seq_range.h create mode 100644 drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lgss.h create mode 100644 drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_access_log.h create mode 100644 drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_barrier_user.h create mode 100644 drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_cfg.h create mode 100644 drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_disk.h create mode 100644 drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fid.h create mode 100644 drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fiemap.h create mode 100644 drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_idl.h create mode 100644 drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ioctl.h create mode 100644 drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_kernelcomm.h create mode 100644 drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_lfsck_user.h create mode 100644 drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_log_user.h create mode 100644 drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ostid.h create mode 100644 drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_param.h create mode 100644 drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_user.h create mode 100644 drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ver.h create mode 100644 drivers/staging/lustrefsx/lustre/include/upcall_cache.h create mode 100644 drivers/staging/lustrefsx/lustre/ldlm/l_lock.c create mode 100644 drivers/staging/lustrefsx/lustre/ldlm/ldlm_extent.c create mode 100644 drivers/staging/lustrefsx/lustre/ldlm/ldlm_flock.c create mode 100644 drivers/staging/lustrefsx/lustre/ldlm/ldlm_inodebits.c create mode 100644 drivers/staging/lustrefsx/lustre/ldlm/ldlm_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c create mode 100644 drivers/staging/lustrefsx/lustre/ldlm/ldlm_lock.c create mode 100644 drivers/staging/lustrefsx/lustre/ldlm/ldlm_lockd.c create mode 100644 drivers/staging/lustrefsx/lustre/ldlm/ldlm_plain.c create mode 100644 drivers/staging/lustrefsx/lustre/ldlm/ldlm_pool.c create mode 100644 drivers/staging/lustrefsx/lustre/ldlm/ldlm_reclaim.c create mode 100644 drivers/staging/lustrefsx/lustre/ldlm/ldlm_request.c create mode 100644 drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/acl.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/crypto.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/dcache.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/dir.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/file.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/foreign_symlink.h create mode 100644 drivers/staging/lustrefsx/lustre/llite/glimpse.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/lcommon_cl.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/lcommon_misc.c create mode 100644 
drivers/staging/lustrefsx/lustre/llite/llite_foreign.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/llite_foreign_symlink.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/llite_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/llite/llite_lib.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/llite_mmap.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/llite_nfs.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/lproc_llite.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/namei.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/pcc.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/pcc.h create mode 100644 drivers/staging/lustrefsx/lustre/llite/rw.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/rw26.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/statahead.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/super25.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/symlink.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/vvp_dev.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/vvp_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/llite/vvp_io.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/vvp_object.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/vvp_page.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/xattr.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/xattr_cache.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/xattr_security.c create mode 100644 drivers/staging/lustrefsx/lustre/lmv/lmv_fld.c create mode 100644 drivers/staging/lustrefsx/lustre/lmv/lmv_intent.c create mode 100644 drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c create mode 100644 drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c create mode 100644 drivers/staging/lustrefsx/lustre/lov/lov_cl_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/lov/lov_dev.c create mode 100644 drivers/staging/lustrefsx/lustre/lov/lov_ea.c create mode 100644 drivers/staging/lustrefsx/lustre/lov/lov_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/lov/lov_io.c create mode 100644 drivers/staging/lustrefsx/lustre/lov/lov_lock.c create mode 100644 drivers/staging/lustrefsx/lustre/lov/lov_merge.c create mode 100644 drivers/staging/lustrefsx/lustre/lov/lov_obd.c create mode 100644 drivers/staging/lustrefsx/lustre/lov/lov_object.c create mode 100644 drivers/staging/lustrefsx/lustre/lov/lov_offset.c create mode 100644 drivers/staging/lustrefsx/lustre/lov/lov_pack.c create mode 100644 drivers/staging/lustrefsx/lustre/lov/lov_page.c create mode 100644 drivers/staging/lustrefsx/lustre/lov/lov_pool.c create mode 100644 drivers/staging/lustrefsx/lustre/lov/lov_request.c create mode 100644 drivers/staging/lustrefsx/lustre/lov/lovsub_dev.c create mode 100644 drivers/staging/lustrefsx/lustre/lov/lovsub_object.c create mode 100644 drivers/staging/lustrefsx/lustre/lov/lproc_lov.c create mode 100644 drivers/staging/lustrefsx/lustre/mdc/lproc_mdc.c create mode 100644 drivers/staging/lustrefsx/lustre/mdc/mdc_acl.c create mode 100644 drivers/staging/lustrefsx/lustre/mdc/mdc_changelog.c create mode 100644 drivers/staging/lustrefsx/lustre/mdc/mdc_dev.c create mode 100644 drivers/staging/lustrefsx/lustre/mdc/mdc_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/mdc/mdc_lib.c create mode 100644 
drivers/staging/lustrefsx/lustre/mdc/mdc_locks.c create mode 100644 drivers/staging/lustrefsx/lustre/mdc/mdc_reint.c create mode 100644 drivers/staging/lustrefsx/lustre/mdc/mdc_request.c create mode 100644 drivers/staging/lustrefsx/lustre/mgc/lproc_mgc.c create mode 100644 drivers/staging/lustrefsx/lustre/mgc/mgc_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/mgc/mgc_request.c create mode 100644 drivers/staging/lustrefsx/lustre/nodist create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/cl_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/cl_io.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/cl_lock.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/cl_object.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/cl_page.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/class_obd.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/dt_object.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/genops.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/idmap.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/integrity.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/interval_tree.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/jobid.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/kernelcomm.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/linkea.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/llog.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/llog_cat.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/llog_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/llog_ioctl.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/llog_obd.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/llog_osd.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/llog_swab.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/llog_test.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/local_storage.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/local_storage.h create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/lprocfs_counters.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status_server.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/lu_object.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/lu_ref.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/lu_tgt_descs.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/lu_tgt_pool.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/lu_ucred.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/lustre_handles.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/lustre_peer.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/md_attrs.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/obd_cksum.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/obd_config.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/obd_mount.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/obd_sysfs.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/obdo.c 
create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/obdo_server.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/range_lock.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/scrub.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/statfs_pack.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/upcall_cache.c create mode 100644 drivers/staging/lustrefsx/lustre/obdecho/debug.c create mode 100644 drivers/staging/lustrefsx/lustre/obdecho/echo.c create mode 100644 drivers/staging/lustrefsx/lustre/obdecho/echo_client.c create mode 100644 drivers/staging/lustrefsx/lustre/obdecho/echo_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/osc/lproc_osc.c create mode 100644 drivers/staging/lustrefsx/lustre/osc/osc_cache.c create mode 100644 drivers/staging/lustrefsx/lustre/osc/osc_dev.c create mode 100644 drivers/staging/lustrefsx/lustre/osc/osc_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/osc/osc_io.c create mode 100644 drivers/staging/lustrefsx/lustre/osc/osc_lock.c create mode 100644 drivers/staging/lustrefsx/lustre/osc/osc_object.c create mode 100644 drivers/staging/lustrefsx/lustre/osc/osc_page.c create mode 100644 drivers/staging/lustrefsx/lustre/osc/osc_quota.c create mode 100644 drivers/staging/lustrefsx/lustre/osc/osc_request.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/client.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/connection.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/errno.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/events.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_api.h create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_asn1.h create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_bulk.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_cli_upcall.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.h create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_err.h create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_generic_token.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_keyring.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5.h create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5_mech.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_mech_switch.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_null_mech.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_pipefs.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_rawobj.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_sk_mech.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_svc_upcall.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/lproc_gss.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/sec_gss.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/heap.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/heap.h create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/import.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/layout.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/llog_client.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/llog_net.c 
create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/llog_server.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/niobuf.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/nodemap_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/nrs.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/nrs_crr.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/nrs_delay.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/nrs_fifo.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/nrs_orr.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/pack_generic.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/pack_server.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/pers.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/pinger.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_module.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpcd.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/recover.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/sec.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/sec_bulk.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/sec_config.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/sec_gc.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/sec_lproc.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/sec_null.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/sec_plain.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/service.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/wirehdr.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/wiretest.c create mode 100644 drivers/staging/lustrefsx/lustre/target/barrier.c create mode 100644 drivers/staging/lustrefsx/lustre/target/out_handler.c create mode 100644 drivers/staging/lustrefsx/lustre/target/out_lib.c create mode 100644 drivers/staging/lustrefsx/lustre/target/tgt_fmd.c create mode 100644 drivers/staging/lustrefsx/lustre/target/tgt_grant.c create mode 100644 drivers/staging/lustrefsx/lustre/target/tgt_handler.c create mode 100644 drivers/staging/lustrefsx/lustre/target/tgt_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/target/tgt_lastrcvd.c create mode 100644 drivers/staging/lustrefsx/lustre/target/tgt_main.c create mode 100644 drivers/staging/lustrefsx/lustre/target/update_records.c create mode 100644 drivers/staging/lustrefsx/lustre/target/update_recovery.c create mode 100644 drivers/staging/lustrefsx/lustre/target/update_trans.c create mode 100644 drivers/staging/lustrefsx/undef.h diff --git a/drivers/staging/lustrefsx/config.h b/drivers/staging/lustrefsx/config.h new file mode 100644 index 0000000000000..7baa3cd739b71 --- /dev/null +++ b/drivers/staging/lustrefsx/config.h @@ -0,0 +1,1258 @@ +/* config.h. Generated from config.h.in by configure. */ +/* config.h.in. Generated from configure.ac by autoheader. 
+
+/* enable libcfs CDEBUG, CWARN */
+#define CDEBUG_ENABLED 1
+
+/* enable libcfs ENTRY/EXIT */
+#define CDEBUG_ENTRY_EXIT 1
+
+/* enable page state tracking code */
+/* #undef CONFIG_DEBUG_PAGESTATE_TRACKING */
+
+/* enable encryption for ldiskfs */
+/* #undef CONFIG_LDISKFS_FS_ENCRYPTION */
+
+/* posix acls for ldiskfs */
+/* #undef CONFIG_LDISKFS_FS_POSIX_ACL */
+
+/* enable rw access for ldiskfs */
+/* #undef CONFIG_LDISKFS_FS_RW */
+
+/* fs security for ldiskfs */
+/* #undef CONFIG_LDISKFS_FS_SECURITY */
+
+/* extended attributes for ldiskfs */
+/* #undef CONFIG_LDISKFS_FS_XATTR */
+
+/* embedded llcrypt */
+#define CONFIG_LL_ENCRYPTION 1
+
+/* enable invariant checking */
+/* #undef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK */
+
+/* enable lu_ref reference tracking code */
+/* #undef CONFIG_LUSTRE_DEBUG_LU_REF */
+
+/* Use the Pinger */
+#define CONFIG_LUSTRE_FS_PINGER 1
+
+/* Enable POSIX acl */
+#define CONFIG_LUSTRE_FS_POSIX_ACL 1
+
+/* name of ldiskfs debug program */
+#define DEBUGFS "debugfs"
+
+/* name of ldiskfs dump program */
+#define DUMPE2FS "dumpe2fs"
+
+/* name of ldiskfs fsck program */
+#define E2FSCK "e2fsck"
+
+/* name of ldiskfs e2fsprogs package */
+#define E2FSPROGS "e2fsprogs"
+
+/* name of ldiskfs label program */
+#define E2LABEL "e2label"
+
+/* do data checksums */
+#define ENABLE_CHECKSUM 1
+
+/* enable flock by default */
+#define ENABLE_FLOCK 1
+
+/* filldir_t return type is bool or int */
+#define FILLDIR_TYPE bool
+
+/* rhashtable_walk_init() has 3 args */
+/* #undef HAVE_3ARG_RHASHTABLE_WALK_INIT */
+
+/* account_page_dirtied takes three arguments */
+/* #undef HAVE_ACCOUNT_PAGE_DIRTIED_3ARGS */
+
+/* account_page_dirtied is exported */
+/* #undef HAVE_ACCOUNT_PAGE_DIRTIED_EXPORT */
+
+/* 'get_acl' and 'set_acl' use dentry argument */
+/* #undef HAVE_ACL_WITH_DENTRY */
+
+/* aes-sha2 is supported by krb5 */
+/* #undef HAVE_AES_SHA2_SUPPORT */
+
+/* aio_complete defined */
+/* #undef HAVE_AIO_COMPLETE */
+
+/* 'alloc_file_pseudo' exist */
+#define HAVE_ALLOC_FILE_PSEUDO 1
+
+/* alloc_inode_sb() exists */
+#define HAVE_ALLOC_INODE_SB 1
+
+/* struct address_space_operations() has migrate_folio() */
+#define HAVE_AOPS_MIGRATE_FOLIO 1
+
+/* struct address_space_operations() has read_folio() */
+#define HAVE_AOPS_READ_FOLIO 1
+
+/* struct address_space_operations() has release_folio() */
+#define HAVE_AOPS_RELEASE_FOLIO 1
+
+/* Define to 1 if you have the <asm/types.h> header file. */
+#define HAVE_ASM_TYPES_H 1
+
+/* backing_dev_info exist */
+/* #undef HAVE_BACKING_DEV_INFO */
+
+/* BDI_CAP_MAP_COPY exist */
+/* #undef HAVE_BDI_CAP_MAP_COPY */
+
+/* backing_dev_info has io_pages */
+#define HAVE_BDI_IO_PAGES 1
+
+/* struct bio has bi_phys_segments member */
+/* #undef HAVE_BIO_BI_PHYS_SEGMENTS */
+
+/* bio_endio takes only one argument */
+#define HAVE_BIO_ENDIO_USES_ONE_ARG 1
+
+/* 'bio_integrity_enabled' is available */
+/* #undef HAVE_BIO_INTEGRITY_ENABLED */
+
+/* kernel has bio_integrity_prep_fn */
+/* #undef HAVE_BIO_INTEGRITY_PREP_FN */
+
+/* bio_integrity_prep_fn returns bool */
+#define HAVE_BIO_INTEGRITY_PREP_FN_RETURNS_BOOL 1
+
+/* 'bio_set_dev' is available */
+#define HAVE_BIO_SET_DEV 1
+
+/* bio_integrity_payload.bip_iter exist */
+#define HAVE_BIP_ITER_BIO_INTEGRITY_PAYLOAD 1
+
+/* Linux bitmap can be allocated */
+#define HAVE_BITMAP_ALLOC 1
+
+/* 'bi_bdev' is available */
+#define HAVE_BI_BDEV 1
+
+/* struct bio has bi_opf */
+#define HAVE_BI_OPF 1
+
+/* 'bi_status' is available */
+#define HAVE_BI_STATUS 1
+
+/* kernel has struct blk_integrity_iter */
+#define HAVE_BLK_INTEGRITY_ITER 1
+
+/* kernel hash_64() is broken */
+/* #undef HAVE_BROKEN_HASH_64 */
+
+/* kernel has struct bvec_iter */
+#define HAVE_BVEC_ITER 1
+
+/* if bvec_iter_all exists for multi-page bvec iteration */
+#define HAVE_BVEC_ITER_ALL 1
+
+/* struct cache_detail has writers */
+#define HAVE_CACHE_DETAIL_WRITERS 1
+
+/* if cache_detail->hash_lock is a spinlock */
+#define HAVE_CACHE_HASH_SPINLOCK 1
+
+/* cache_head has hlist cache_list */
+#define HAVE_CACHE_HEAD_HLIST 1
+
+/* crypto/internal/cipher.h is present */
+#define HAVE_CIPHER_H 1
+
+/* kernel has clean_bdev_aliases */
+#define HAVE_CLEAN_BDEV_ALIASES 1
+
+/* 'clear_and_wake_up_bit' is available */
+#define HAVE_CLEAR_AND_WAKE_UP_BIT 1
+
+/* compat rdma found */
+/* #undef HAVE_COMPAT_RDMA */
+
+/* copy_file_range() is supported */
+#define HAVE_COPY_FILE_RANGE 1
+
+/* 'cpus_read_lock' exist */
+#define HAVE_CPUS_READ_LOCK 1
+
+/* crypto_alloc_skcipher is defined */
+#define HAVE_CRYPTO_ALLOC_SKCIPHER 1
+
+/* crypto hash helper functions are available */
+#define HAVE_CRYPTO_HASH_HELPERS 1
+
+/* 'CRYPTO_MAX_ALG_NAME' is 128 */
+#define HAVE_CRYPTO_MAX_ALG_NAME_128 1
+
+/* crypto/sha2.h is present */
+#define HAVE_CRYPTO_SHA2_HEADER 1
+
+/* current_time() has replaced CURRENT_TIME */
+#define HAVE_CURRENT_TIME 1
+
+/* Have db_dirty_records list_t */
+/* #undef HAVE_DB_DIRTY_RECORDS_LIST */
+
+/* default_file_splice_read is exported */
+/* #undef HAVE_DEFAULT_FILE_SPLICE_READ_EXPORT */
+
+/* delete_from_page_cache is exported */
+/* #undef HAVE_DELETE_FROM_PAGE_CACHE */
+
+/* dentry.d_child exist */
+#define HAVE_DENTRY_D_CHILD 1
+
+/* list dentry.d_u.d_alias exist */
+#define HAVE_DENTRY_D_U_D_ALIAS 1
+
+/* DES3 enctype is supported by krb5 */
+/* #undef HAVE_DES3_SUPPORT */
+
+/* direct_IO has 2 arguments */
+#define HAVE_DIRECTIO_2ARGS 1
+
+/* direct IO uses iov_iter */
+/* #undef HAVE_DIRECTIO_ITER */
+
+/* address_space_operations->dirty_folio() member exists */
+#define HAVE_DIRTY_FOLIO 1
+
+/* dir_context exist */
+#define HAVE_DIR_CONTEXT 1
+
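The crypto probes just above (HAVE_CRYPTO_ALLOC_SKCIPHER, HAVE_CRYPTO_MAX_ALG_NAME_128) exist because llcrypt allocates its cipher transforms through the kernel skcipher API. A hedged sketch of such an allocation (the helper name is illustrative, not taken from this patch):

#include <linux/err.h>
#include <crypto/skcipher.h>

/* Illustrative only: request an AES-XTS transform by algorithm name,
 * the call whose presence HAVE_CRYPTO_ALLOC_SKCIPHER records. */
static struct crypto_skcipher *alloc_contents_tfm(void)
{
	struct crypto_skcipher *tfm = crypto_alloc_skcipher("xts(aes)", 0, 0);

	if (IS_ERR(tfm))
		pr_warn("xts(aes) unavailable: %ld\n", PTR_ERR(tfm));
	return tfm;
}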
+/* Define to 1 if you have the <dlfcn.h> header file. */
+#define HAVE_DLFCN_H 1
+
+/* Have dmu_object_alloc_dnsize in ZFS */
+/* #undef HAVE_DMU_OBJECT_ALLOC_DNSIZE */
+
+/* Have dmu_objset_disown() with 3 args */
+/* #undef HAVE_DMU_OBJSET_DISOWN_3ARG */
+
+/* Have dmu_objset_own() with 6 args */
+/* #undef HAVE_DMU_OBJSET_OWN_6ARG */
+
+/* Have dmu_offset_next() exported */
+/* #undef HAVE_DMU_OFFSET_NEXT */
+
+/* Have 6 argument dmu_prefetch in ZFS */
+/* #undef HAVE_DMU_PREFETCH_6ARG */
+
+/* Have dmu_read_by_dnode() in ZFS */
+/* #undef HAVE_DMU_READ_BY_DNODE */
+
+/* Have dmu_tx_hold_write_by_dnode() in ZFS */
+/* #undef HAVE_DMU_TX_HOLD_WRITE_BY_DNODE */
+
+/* Have dmu_tx_hold_zap_by_dnode() in ZFS */
+/* #undef HAVE_DMU_TX_HOLD_ZAP_BY_DNODE */
+
+/* Have dmu_tx_mark_netfree */
+/* #undef HAVE_DMU_TX_MARK_NETFREE */
+
+/* Have native dnode accounting in ZFS */
+/* #undef HAVE_DMU_USEROBJ_ACCOUNTING */
+
+/* Have dmu_write_by_dnode() in ZFS */
+/* #undef HAVE_DMU_WRITE_BY_DNODE */
+
+/* down_write_killable function exists */
+#define HAVE_DOWN_WRITE_KILLABLE 1
+
+/* quotactl_ops.set_dqblk takes struct kqid */
+#define HAVE_DQUOT_KQID 1
+
+/* quotactl_ops.set_dqblk takes struct qc_dqblk */
+#define HAVE_DQUOT_QC_DQBLK 1
+
+/* dquot_transfer() has user_ns argument */
+#define HAVE_DQUOT_TRANSFER_WITH_USER_NS 1
+
+/* Have dsl_pool_config_enter/exit in ZFS */
+/* #undef HAVE_DSL_POOL_CONFIG */
+
+/* Have dsl_sync_task_do_nowait in ZFS */
+/* #undef HAVE_DSL_SYNC_TASK_DO_NOWAIT */
+
+/* d_compare need 4 arguments */
+#define HAVE_D_COMPARE_4ARGS 1
+
+/* d_compare need 5 arguments */
+/* #undef HAVE_D_COMPARE_5ARGS */
+
+/* d_count exist */
+#define HAVE_D_COUNT 1
+
+/* 'd_init' exists */
+#define HAVE_D_INIT 1
+
+/* d_in_lookup is defined */
+#define HAVE_D_IN_LOOKUP 1
+
+/* 'd_is_positive' is available */
+#define HAVE_D_IS_POSITIVE 1
+
+/* Define to 1 if you have the <endian.h> header file. */
+#define HAVE_ENDIAN_H 1
+
+/* ethtool_link_settings is defined */
+#define HAVE_ETHTOOL_LINK_SETTINGS 1
+
+/* Define to 1 if you have the <ext2fs/ext2fs.h> header file.
*/ +/* #undef HAVE_EXT2FS_EXT2FS_H */ + +/* ext4_bread takes 4 arguments */ +/* #undef HAVE_EXT4_BREAD_4ARGS */ + +/* ext4_(inc|dec)_count() has 2 arguments */ +/* #undef HAVE_EXT4_INC_DEC_COUNT_2ARGS */ + +/* i_dquot is in ext4_inode_info */ +/* #undef HAVE_EXT4_INFO_DQUOT */ + +/* ext4_free_blocks do not require struct buffer_head */ +/* #undef HAVE_EXT_FREE_BLOCK_WITH_BUFFER_HEAD */ + +/* file handle and related syscalls are supported */ +#define HAVE_FHANDLE_GLIBC_SUPPORT 1 + +/* union is unnamed */ +/* #undef HAVE_FID2PATH_ANON_UNIONS */ + +/* filemap_get_folios_contig() is available */ +#define HAVE_FILEMAP_GET_FOLIOS_CONTIG 1 + +/* kernel has file_dentry */ +#define HAVE_FILE_DENTRY 1 + +/* file_operations.[read|write]_iter functions exist */ +#define HAVE_FILE_OPERATIONS_READ_WRITE_ITER 1 + +/* filldir_t needs struct dir_context as argument */ +#define HAVE_FILLDIR_USE_CTX 1 + +/* filldir_t needs struct dir_context and returns bool */ +#define HAVE_FILLDIR_USE_CTX_RETURN_BOOL 1 + +/* FMR pool API is available */ +/* #undef HAVE_FMR_POOL_API */ + +/* file_operations has iterate_shared */ +#define HAVE_FOP_ITERATE_SHARED 1 + +/* force_sig() has task parameter */ +/* #undef HAVE_FORCE_SIG_WITH_TASK */ + +/* 'struct fscrypt_digested_name' exists */ +/* #undef HAVE_FSCRYPT_DIGESTED_NAME */ + +/* embedded llcrypt uses llcrypt_dummy_context_enabled() */ +#define HAVE_FSCRYPT_DUMMY_CONTEXT_ENABLED 1 + +/* fscrypt_is_nokey_name() exists */ +#define HAVE_FSCRYPT_IS_NOKEY_NAME 1 + +/* full_name_hash need 3 arguments */ +#define HAVE_FULL_NAME_HASH_3ARGS 1 + +/* generic_write_sync has 2 arguments */ +#define HAVE_GENERIC_WRITE_SYNC_2ARGS 1 + +/* struct genl_dumpit_info has family field */ +#define HAVE_GENL_DUMPIT_INFO 1 + +/* Define to 1 if you have the `gethostbyname' function. 
*/
+#define HAVE_GETHOSTBYNAME 1
+
+/* 'get_acl' has an rcu argument */
+#define HAVE_GET_ACL_RCU_ARG 1
+
+/* get_inode_usage function exists */
+#define HAVE_GET_INODE_USAGE 1
+
+/* get_random_[u32|u64] are available */
+#define HAVE_GET_RANDOM_U32_AND_U64 1
+
+/* get_random_u32_below() is available */
+#define HAVE_GET_RANDOM_U32_BELOW 1
+
+/* get_request_key_auth() is available */
+#define HAVE_GET_REQUEST_KEY_AUTH 1
+
+/* get_user_pages takes 6 arguments */
+/* #undef HAVE_GET_USER_PAGES_6ARG */
+
+/* get_user_pages takes gup_flags in arguments */
+#define HAVE_GET_USER_PAGES_GUP_FLAGS 1
+
+/* glob_match() is available */
+#define HAVE_GLOB 1
+
+/* grab_cache_page_write_begin() has flags argument */
+/* #undef HAVE_GRAB_CACHE_PAGE_WRITE_BEGIN_WITH_FLAGS */
+
+/* struct group_info has member gid */
+#define HAVE_GROUP_INFO_GID 1
+
+/* Define this if you enable gss */
+/* #undef HAVE_GSS */
+
+/* Define this if you enable gss keyring backend */
+#define HAVE_GSS_KEYRING 1
+
+/* Define this if the Kerberos GSS library supports gss_krb5_ccache_name */
+/* #undef HAVE_GSS_KRB5_CCACHE_NAME */
+
+/* '__rhashtable_insert_fast()' returns int */
+/* #undef HAVE_HASHTABLE_INSERT_FAST_RETURN_INT */
+
+/* Define this if you have Heimdal Kerberos libraries */
+/* #undef HAVE_HEIMDAL */
+
+/* hlist_add_after is available */
+/* #undef HAVE_HLIST_ADD_AFTER */
+
+/* hotplug state machine is supported */
+#define HAVE_HOTPLUG_STATE_MACHINE 1
+
+/* hypervisor_is_type function exists */
+#define HAVE_HYPERVISOR_IS_TYPE 1
+
+/* ib_alloc_fast_reg_mr is defined */
+/* #undef HAVE_IB_ALLOC_FAST_REG_MR */
+
+/* ib_alloc_pd has 2 arguments */
+#define HAVE_IB_ALLOC_PD_2ARGS 1
+
+/* struct ib_cq_init_attr is used by ib_create_cq */
+#define HAVE_IB_CQ_INIT_ATTR 1
+
+/* struct ib_device.attrs is defined */
+#define HAVE_IB_DEVICE_ATTRS 1
+
+/* if struct ib_device_ops is defined */
+/* #undef HAVE_IB_DEVICE_OPS */
+
+/* ib_get_dma_mr is defined */
+/* #undef HAVE_IB_GET_DMA_MR */
+
+/* function ib_inc_rkey exist */
+#define HAVE_IB_INC_RKEY 1
+
+/* ib_map_mr_sg exists */
+#define HAVE_IB_MAP_MR_SG 1
+
+/* ib_map_mr_sg has 5 arguments */
+#define HAVE_IB_MAP_MR_SG_5ARGS 1
+
+/* ib_post_send and ib_post_recv have const parameters */
+#define HAVE_IB_POST_SEND_RECV_CONST 1
+
+/* struct ib_rdma_wr is defined */
+#define HAVE_IB_RDMA_WR 1
+
+/* if ib_sg_dma_address wrapper exists */
+/* #undef HAVE_IB_SG_DMA_ADDRESS */
+
+/* inode_operations .getattr member function can gather advance stats */
+/* #undef HAVE_INODEOPS_ENHANCED_GETATTR */
+
+/* inode_lock is defined */
+#define HAVE_INODE_LOCK 1
+
+/* inode times are using timespec64 */
+#define HAVE_INODE_TIMESPEC64 1
+
+/* blk_integrity.interval exist */
+/* #undef HAVE_INTERVAL_BLK_INTEGRITY */
+
+/* blk_integrity.interval_exp exist */
+#define HAVE_INTERVAL_EXP_BLK_INTEGRITY 1
+
+/* interval trees use rb_tree_cached */
+#define HAVE_INTERVAL_TREE_CACHED 1
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+#define HAVE_INTTYPES_H 1
+
+/* address_space_operations->invalidate_folio() member exists */
+#define HAVE_INVALIDATE_FOLIO 1
+
+/* address_space invalidate_lock member exists */
+#define HAVE_INVALIDATE_LOCK 1
+
+/* address_space_operations.invalidatepage needs 3 arguments */
+/* #undef HAVE_INVALIDATE_RANGE */
+
+/* have in_compat_syscall */
+#define HAVE_IN_COMPAT_SYSCALL 1
+
+/* 'in_dev_for_each_ifa_rtnl' is defined */
+#define HAVE_IN_DEV_FOR_EACH_IFA_RTNL 1
+
+/* inode_operations->rename need flags as argument */
+/* #undef HAVE_IOPS_RENAME_WITH_FLAGS */
+
+/* generic_readlink has been removed */
+/* #undef HAVE_IOP_GENERIC_READLINK */
+
+/* have iop get_link */
+#define HAVE_IOP_GET_LINK 1
+
+/* inode_operations has .set_acl member function */
+#define HAVE_IOP_SET_ACL 1
+
+/* inode_operations has {get,set,remove}xattr members */
+/* #undef HAVE_IOP_XATTR */
+
+/* iov_iter_get_pages_alloc2() is available */
+#define HAVE_IOV_ITER_GET_PAGES_ALLOC2 1
+
+/* if iov_iter has member iter_type */
+#define HAVE_IOV_ITER_HAS_ITER_TYPE_MEMBER 1
+
+/* if iov_iter has member type */
+/* #undef HAVE_IOV_ITER_HAS_TYPE_MEMBER */
+
+/* iov_iter_init handles directional tag */
+#define HAVE_IOV_ITER_INIT_DIRECTION 1
+
+/* iov_iter_rw exist */
+#define HAVE_IOV_ITER_RW 1
+
+/* iov_iter_truncate exists */
+#define HAVE_IOV_ITER_TRUNCATE 1
+
+/* if iov_iter_type exists */
+#define HAVE_IOV_ITER_TYPE 1
+
+/* is_root_inode defined */
+#define HAVE_IS_ROOT_INODE 1
+
+/* 'iter_file_splice_write' exists */
+#define HAVE_ITER_FILE_SPLICE_WRITE 1
+
+/* struct address_space has i_pages */
+#define HAVE_I_PAGES 1
+
+/* if jbd2_journal_get_max_txn_bufs is available */
+/* #undef HAVE_JBD2_JOURNAL_GET_MAX_TXN_BUFS */
+
+/* struct jbd2_journal_handle has h_total_credits member */
+/* #undef HAVE_JOURNAL_TOTAL_CREDITS */
+
+/* kallsyms_lookup_name is exported by kernel */
+/* #undef HAVE_KALLSYMS_LOOKUP_NAME */
+
+/* 'kernel_param_[un]lock' is available */
+#define HAVE_KERNEL_PARAM_LOCK 1
+
+/* 'struct kernel_param_ops' is available */
+#define HAVE_KERNEL_PARAM_OPS 1
+
+/* kernel_read() signature ends with loff_t *pos */
+#define HAVE_KERNEL_READ_LAST_POSP 1
+
+/* kernel_setsockopt still in use */
+/* #undef HAVE_KERNEL_SETSOCKOPT */
+
+/* 'getname' has two args */
+#define HAVE_KERN_SOCK_GETNAME_2ARGS 1
+
+/* keyring_search has 4 args */
+#define HAVE_KEYRING_SEARCH_4ARGS 1
+
+/* struct key_match_data exist */
+#define HAVE_KEY_MATCH_DATA 1
+
+/* payload.data is an array */
+#define HAVE_KEY_PAYLOAD_DATA_ARRAY 1
+
+/* key_type->instantiate has two args */
+/* #undef HAVE_KEY_TYPE_INSTANTIATE_2ARGS */
+
+/* key.usage is of type refcount_t */
+#define HAVE_KEY_USAGE_REFCOUNT 1
+
+/* kfree_sensitive() is available.
*/ +#define HAVE_KFREE_SENSITIVE 1 + +/* kiocb->ki_complete() has 2 arguments */ +#define HAVE_KIOCB_COMPLETE_2ARGS 1 + +/* ki_left exist */ +/* #undef HAVE_KIOCB_KI_LEFT */ + +/* ki_nbytes field exist */ +/* #undef HAVE_KI_NBYTES */ + +/* kmap_to_page is exported by the kernel */ +/* #undef HAVE_KMAP_TO_PAGE */ + +/* struct kobj_type has 'default_groups' member */ +#define HAVE_KOBJ_TYPE_DEFAULT_GROUPS 1 + +/* Define this if you have MIT Kerberos libraries */ +/* #undef HAVE_KRB5 */ + +/* Define this if the function krb5int_derive_key is available */ +/* #undef HAVE_KRB5INT_DERIVE_KEY */ + +/* Define this if the function krb5_derive_key is available */ +/* #undef HAVE_KRB5_DERIVE_KEY */ + +/* Define this if the function krb5_get_error_message is available */ +/* #undef HAVE_KRB5_GET_ERROR_MESSAGE */ + +/* Define this if the function krb5_get_init_creds_opt_set_addressless is + available */ +/* #undef HAVE_KRB5_GET_INIT_CREDS_OPT_SET_ADDRESSLESS */ + +/* kref_read() is available */ +#define HAVE_KREF_READ 1 + +/* kset_find_obj is exported by the kernel */ +#define HAVE_KSET_FIND_OBJ 1 + +/* kernel has kstrtobool_from_user */ +#define HAVE_KSTRTOBOOL_FROM_USER 1 + +/* kthread_worker found */ +/* #undef HAVE_KTHREAD_WORK */ + +/* ktime_add is available */ +#define HAVE_KTIME_ADD 1 + +/* ktime_after is available */ +#define HAVE_KTIME_AFTER 1 + +/* ktime_before is available */ +#define HAVE_KTIME_BEFORE 1 + +/* ktime_compare is available */ +#define HAVE_KTIME_COMPARE 1 + +/* 'ktime_get_real_seconds' is available */ +#define HAVE_KTIME_GET_REAL_SECONDS 1 + +/* 'ktime_get_real_ts64' is available */ +#define HAVE_KTIME_GET_REAL_TS64 1 + +/* 'ktime_get_seconds' is available */ +#define HAVE_KTIME_GET_SECONDS 1 + +/* 'ktime_get_ts64' is available */ +#define HAVE_KTIME_GET_TS64 1 + +/* 'ktime_ms_delta' is available */ +#define HAVE_KTIME_MS_DELTA 1 + +/* 'ktime_to_timespec64' is available */ +#define HAVE_KTIME_TO_TIMESPEC64 1 + +/* ldiskfsfs_dirhash takes an inode argument */ +/* #undef HAVE_LDISKFSFS_GETHASH_INODE_ARG */ + +/* enable use of ldiskfsprogs package */ +/* #undef HAVE_LDISKFSPROGS */ + +/* EXT4_GET_BLOCKS_KEEP_SIZE exists */ +/* #undef HAVE_LDISKFS_GET_BLOCKS_KEEP_SIZE */ + +/* if ldiskfs_iget takes a flags argument */ +/* #undef HAVE_LDISKFS_IGET_WITH_FLAGS */ + +/* 'ext4_journal_ensure_credits' exists */ +/* #undef HAVE_LDISKFS_JOURNAL_ENSURE_CREDITS */ + +/* Enable ldiskfs osd */ +/* #undef HAVE_LDISKFS_OSD */ + +/* libefence support is requested */ +/* #undef HAVE_LIBEFENCE */ + +/* Define to 1 if you have the `keyutils' library (-lkeyutils). 
*/
+#define HAVE_LIBKEYUTILS 1
+
+/* use libpthread for libcfs library */
+#define HAVE_LIBPTHREAD 1
+
+/* readline library is available */
+/* #undef HAVE_LIBREADLINE */
+
+/* linux/blk-integrity.h is present */
+#define HAVE_LINUX_BLK_INTEGRITY_HEADER 1
+
+/* linux/fortify-string.h header available */
+#define HAVE_LINUX_FORTIFY_STRING_HEADER 1
+
+/* linux/stdarg.h is present */
+#define HAVE_LINUX_STDARG_HEADER 1
+
+/* list_cmp_func_t type is defined */
+#define HAVE_LIST_CMP_FUNC_T 1
+
+/* lock_manager_operations has lm_compare_owner */
+/* #undef HAVE_LM_COMPARE_OWNER */
+
+/* kernel has locks_lock_file_wait */
+#define HAVE_LOCKS_LOCK_FILE_WAIT 1
+
+/* lock_page_memcg is defined */
+#define HAVE_LOCK_PAGE_MEMCG 1
+
+/* lookup_user_key() is available */
+#define HAVE_LOOKUP_USER_KEY 1
+
+/* Enable lru resize support */
+#define HAVE_LRU_RESIZE_SUPPORT 1
+
+/* lsmcontext_init is available */
+/* #undef HAVE_LSMCONTEXT_INIT */
+
+/* Define this if the Kerberos GSS library supports
+   gss_krb5_export_lucid_sec_context */
+/* #undef HAVE_LUCID_CONTEXT_SUPPORT */
+
+/* Enable Lustre client crypto via embedded llcrypt */
+#define HAVE_LUSTRE_CRYPTO 1
+
+/* enum mapping_flags has AS_EXITING flag */
+#define HAVE_MAPPING_AS_EXITING_FLAG 1
+
+/* match_wildcard() is available */
+#define HAVE_MATCH_WILDCARD 1
+
+/* memalloc_noreclaim_{save,restore}() is supported */
+#define HAVE_MEMALLOC_RECLAIM 1
+
+/* Define to 1 if you have the <memory.h> header file. */
+#define HAVE_MEMORY_H 1
+
+/* mmap_lock API is available. */
+#define HAVE_MMAP_LOCK 1
+
+/* kernel module loading is possible */
+#define HAVE_MODULE_LOADING_SUPPORT 1
+
+/* Define to 1 if you have the `name_to_handle_at' function. */
+#define HAVE_NAME_TO_HANDLE_AT 1
+
+/* support native Linux client */
+/* #undef HAVE_NATIVE_LINUX_CLIENT */
+
+/* Define to 1 if you have the <netdb.h> header file. */
+#define HAVE_NETDB_H 1
+
+/* struct genl_ops has 'start' callback */
+#define HAVE_NETLINK_CALLBACK_START 1
+
+/* DEFINE_TIMER uses only 2 arguments */
+#define HAVE_NEW_DEFINE_TIMER 1
+
+/* 'kernel_write' aligns with read/write helpers */
+#define HAVE_NEW_KERNEL_WRITE 1
+
+/* libnl3 supports nla_get_s32 */
+#define HAVE_NLA_GET_S32 1
+
+/* libnl3 supports nla_get_s64 */
+#define HAVE_NLA_GET_S64 1
+
+/* 'nla_strdup' is available */
+#define HAVE_NLA_STRDUP 1
+
+/* 'nla_strlcpy' is available */
+/* #undef HAVE_NLA_STRLCPY */
+
+/* netlink_ext_ack is handled for Netlink dump handlers */
+#define HAVE_NL_DUMP_WITH_EXT_ACK 1
+
+/* netlink_ext_ack is an argument to nla_parse type function */
+#define HAVE_NL_PARSE_WITH_EXT_ACK 1
+
+/* no_llseek() is available */
+/* #undef HAVE_NO_LLSEEK */
+
+/* NR_UNSTABLE_NFS is still in use.
*/ +/* #undef HAVE_NR_UNSTABLE_NFS */ + +/* ns_to_timespec64() is available */ +#define HAVE_NS_TO_TIMESPEC64 1 + +/* with oldsize */ +/* #undef HAVE_OLDSIZE_TRUNCATE_PAGECACHE */ + +/* OpenSSL EVP_PKEY_get_params */ +/* #undef HAVE_OPENSSL_EVP_PKEY */ + +/* openssl-devel is present */ +/* #undef HAVE_OPENSSL_GETSEPOL */ + +/* OpenSSL HMAC functions needed for SSK */ +/* #undef HAVE_OPENSSL_SSK */ + +/* if Oracle OFED Extensions are enabled */ +/* #undef HAVE_ORACLE_OFED_EXTENSIONS */ + +/* 'pagevec_init' takes one parameter */ +#define HAVE_PAGEVEC_INIT_ONE_PARAM 1 + +/* linux/panic_notifier.h is present */ +#define HAVE_PANIC_NOTIFIER_H 1 + +/* 'param_set_uint_minmax' is available */ +#define HAVE_PARAM_SET_UINT_MINMAX 1 + +/* percpu_counter_init uses GFP_* flag */ +#define HAVE_PERCPU_COUNTER_INIT_GFP_FLAG 1 + +/* 'struct nsproxy' has 'pid_ns_for_children' */ +#define HAVE_PID_NS_FOR_CHILDREN 1 + +/* 'posix_acl_update_mode' is available */ +/* #undef HAVE_POSIX_ACL_UPDATE_MODE */ + +/* posix_acl_valid takes struct user_namespace */ +#define HAVE_POSIX_ACL_VALID_USER_NS 1 + +/* 'prepare_to_wait_event' is available */ +#define HAVE_PREPARE_TO_WAIT_EVENT 1 + +/* processor.h is present */ +#define HAVE_PROCESSOR_H 1 + +/* struct proc_ops exists */ +#define HAVE_PROC_OPS 1 + +/* get_projid function exists */ +#define HAVE_PROJECT_QUOTA 1 + +/* 'PTR_ERR_OR_ZERO' exist */ +#define HAVE_PTR_ERR_OR_ZERO 1 + +/* If available, contains the Python version number currently in use. */ +#define HAVE_PYTHON "3.9" + +/* radix_tree_tag_set exists */ +#define HAVE_RADIX_TREE_TAG_SET 1 + +/* rdma_connect_locked is defined */ +#define HAVE_RDMA_CONNECT_LOCKED 1 + +/* rdma_create_id wants 4 args */ +/* #undef HAVE_RDMA_CREATE_ID_4ARG */ + +/* rdma_create_id wants 5 args */ +#define HAVE_RDMA_CREATE_ID_5ARG 1 + +/* rdma_reject has 4 arguments */ +#define HAVE_RDMA_REJECT_4ARGS 1 + +/* read_cache_page() filler_t needs struct file */ +#define HAVE_READ_CACHE_PAGE_WANTS_FILE 1 + +/* refcount_t is supported */ +#define HAVE_REFCOUNT_T 1 + +/* register_shrinker() returns status */ +#define HAVE_REGISTER_SHRINKER_FORMAT_NAMED 1 + +/* register_shrinker() returns status */ +/* #undef HAVE_REGISTER_SHRINKER_RET */ + +/* rhashtable_lookup() is available */ +#define HAVE_RHASHTABLE_LOOKUP 1 + +/* rhashtable_lookup_get_insert_fast() is available */ +#define HAVE_RHASHTABLE_LOOKUP_GET_INSERT_FAST 1 + +/* rhashtable_replace_fast() is available */ +#define HAVE_RHASHTABLE_REPLACE 1 + +/* rhashtable_walk_enter() is available */ +#define HAVE_RHASHTABLE_WALK_ENTER 1 + +/* struct rhltable exist */ +#define HAVE_RHLTABLE 1 + +/* rht_bucket_var() is available */ +#define HAVE_RHT_BUCKET_VAR 1 + +/* save_stack_trace_tsk is exported */ +/* #undef HAVE_SAVE_STACK_TRACE_TSK */ + +/* Have sa_spill_alloc in ZFS */ +/* #undef HAVE_SA_SPILL_ALLOC */ + +/* linux/sched header directory exist */ +#define HAVE_SCHED_HEADERS 1 + +/* security_dentry_init_security needs lsmcontext */ +/* #undef HAVE_SECURITY_DENTRY_INIT_SECURTY_WITH_CTX */ + +/* security_dentry_init_security() returns xattr name */ +#define HAVE_SECURITY_DENTRY_INIT_WITH_XATTR_NAME_ARG 1 + +/* security_release_secctx has 1 arg. */ +/* #undef HAVE_SEC_RELEASE_SECCTX_1ARG */ + +/* support for selinux */ +#define HAVE_SELINUX 1 + +/* Define to 1 if you have the header file. 
*/
+#define HAVE_SELINUX_SELINUX_H 1
+
+/* support server */
+/* #undef HAVE_SERVER_SUPPORT */
+
+/* Define this if the Kerberos GSS library supports
+   gss_krb5_set_allowable_enctypes */
+/* #undef HAVE_SET_ALLOWABLE_ENCTYPES */
+
+/* shrinker has count_objects member */
+#define HAVE_SHRINKER_COUNT 1
+
+/* sk_data_ready uses only one argument */
+#define HAVE_SK_DATA_READY_ONE_ARG 1
+
+/* sock_create_kern use net as first parameter */
+#define HAVE_SOCK_CREATE_KERN_USE_NET 1
+
+/* Have spa_maxblocksize in ZFS */
+/* #undef HAVE_SPA_MAXBLOCKSIZE */
+
+/* struct stacktrace_ops exists */
+/* #undef HAVE_STACKTRACE_OPS */
+
+/* Define to 1 if you have the `statx' function. */
+#define HAVE_STATX 1
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#define HAVE_STDINT_H 1
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#define HAVE_STDLIB_H 1
+
+/* stringhash.h is present */
+#define HAVE_STRINGHASH 1
+
+/* Define to 1 if you have the <strings.h> header file. */
+#define HAVE_STRINGS_H 1
+
+/* Define to 1 if you have the <string.h> header file. */
+#define HAVE_STRING_H 1
+
+/* Define to 1 if you have the `strnlen' function. */
+#define HAVE_STRNLEN 1
+
+/* kernel strscpy is available */
+/* #undef HAVE_STRSCPY */
+
+/* struct posix_acl_xattr_{header,entry} defined */
+#define HAVE_STRUCT_POSIX_ACL_XATTR 1
+
+/* submit_bio takes two arguments */
+/* #undef HAVE_SUBMIT_BIO_2ARGS */
+
+/* 'super_setup_bdi_name' is available */
+#define HAVE_SUPER_SETUP_BDI_NAME 1
+
+/* symlink inode operations need struct nameidata argument */
+/* #undef HAVE_SYMLINK_OPS_USE_NAMEIDATA */
+
+/* new_sync_[read|write] is exported by the kernel */
+/* #undef HAVE_SYNC_READ_WRITE */
+
+/* Define to 1 if you have <sys/quota.h>. */
+#define HAVE_SYS_QUOTA_H 1
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#define HAVE_SYS_STAT_H 1
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#define HAVE_SYS_TYPES_H 1
+
+/* 's_uuid' is an uuid_t */
+#define HAVE_S_UUID_AS_UUID_T 1
+
+/* task_is_running() is defined */
+#define HAVE_TASK_IS_RUNNING 1
+
+/* 'tcp_sock_set_keepcnt()' exists */
+#define HAVE_TCP_SOCK_SET_KEEPCNT 1
+
+/* 'tcp_sock_set_keepidle()' exists */
+#define HAVE_TCP_SOCK_SET_KEEPIDLE 1
+
+/* 'tcp_sock_set_keepintvl()' exists */
+#define HAVE_TCP_SOCK_SET_KEEPINTVL 1
+
+/* 'tcp_sock_set_nodelay()' exists */
+#define HAVE_TCP_SOCK_SET_NODELAY 1
+
+/* 'tcp_sock_set_quickack()' exists */
+#define HAVE_TCP_SOCK_SET_QUICKACK 1
+
+/* timer_setup has replaced setup_timer */
+#define HAVE_TIMER_SETUP 1
+
+/* 'struct timespec64' is available */
+#define HAVE_TIMESPEC64 1
+
+/* 'timespec64_sub' is available */
+#define HAVE_TIMESPEC64_SUB 1
+
+/* 'timespec64_to_ktime' is available */
+#define HAVE_TIMESPEC64_TO_KTIME 1
+
+/* topology_sibling_cpumask is available */
+#define HAVE_TOPOLOGY_SIBLING_CPUMASK 1
+
+/* if totalram_pages is a function */
+#define HAVE_TOTALRAM_PAGES_AS_FUNC 1
+
+/* kernel has truncate_inode_pages_final */
+#define HAVE_TRUNCATE_INODE_PAGES_FINAL 1
+
+/* if MS_RDONLY was moved to uapi/linux/mount.h */
+#define HAVE_UAPI_LINUX_MOUNT_H 1
+
+/* Define to 1 if you have the <unistd.h> header file.
*/ +#define HAVE_UNISTD_H 1 + +/* 'inode_operations' members have user namespace argument */ +#define HAVE_USER_NAMESPACE_ARG 1 + +/* 'enum nlmsgerr_attrs' exists */ +#define HAVE_USRSPC_NLMSGERR 1 + +/* RDMA_PS_TCP exists */ +#define HAVE_USRSPC_RDMA_PS_TCP 1 + +/* 'uuid_t' exist */ +#define HAVE_UUID_T 1 + +/* kernel has vfs_rename with 5 args */ +/* #undef HAVE_VFS_RENAME_5ARGS */ + +/* kernel has vfs_rename with 6 args */ +/* #undef HAVE_VFS_RENAME_6ARGS */ + +/* '__vfs_setxattr' is available */ +/* #undef HAVE_VFS_SETXATTR */ + +/* kernel has vfs_unlink with 3 args */ +/* #undef HAVE_VFS_UNLINK_3ARGS */ + +/* __vmalloc only takes 2 args. */ +#define HAVE_VMALLOC_2ARGS 1 + +/* virtual_address has been replaced by address field */ +#define HAVE_VM_FAULT_ADDRESS 1 + +/* if VM_FAULT_RETRY is defined */ +#define HAVE_VM_FAULT_RETRY 1 + +/* if vm_fault_t type exists */ +#define HAVE_VM_FAULT_T 1 + +/* 'struct vm_operations' remove struct vm_area_struct argument */ +#define HAVE_VM_OPS_USE_VM_FAULT_ONLY 1 + +/* wait_bit.h is present */ +#define HAVE_WAIT_BIT_HEADER_H 1 + +/* if struct wait_bit_queue_entry exists */ +#define HAVE_WAIT_BIT_QUEUE_ENTRY 1 + +/* 'wait_queue_entry_t' is available */ +#define HAVE_WAIT_QUEUE_ENTRY 1 + +/* linux wait_queue_head_t list_head is name head */ +#define HAVE_WAIT_QUEUE_ENTRY_LIST 1 + +/* 'wait_var_event' is available */ +#define HAVE_WAIT_VAR_EVENT 1 + +/* 'wait_woken, is available' */ +#define HAVE_WAIT_WOKEN 1 + +/* kernel Xarray implementation lacks 'xa_is_value' */ +#define HAVE_XARRAY_SUPPORT 1 + +/* needs inode parameter */ +/* #undef HAVE_XATTR_HANDLER_INODE_PARAM */ + +/* xattr_handler has a name member */ +#define HAVE_XATTR_HANDLER_NAME 1 + +/* handler pointer is parameter */ +/* #undef HAVE_XATTR_HANDLER_SIMPLIFIED */ + +/* Have zap_add_by_dnode() in ZFS */ +/* #undef HAVE_ZAP_ADD_BY_DNODE */ + +/* Have zap_lookup_by_dnode() in ZFS */ +/* #undef HAVE_ZAP_LOOKUP_BY_DNODE */ + +/* Have zap_remove_by_dnode() in ZFS */ +/* #undef HAVE_ZAP_REMOVE_ADD_BY_DNODE */ + +/* Have inode_timespec_t */ +/* #undef HAVE_ZFS_INODE_TIMESPEC */ + +/* Have multihost protection in ZFS */ +/* #undef HAVE_ZFS_MULTIHOST */ + +/* Enable zfs osd */ +/* #undef HAVE_ZFS_OSD */ + +/* Have zfs_refcount_add */ +/* #undef HAVE_ZFS_REFCOUNT_ADD */ + +/* Have zfs_refcount.h */ +/* #undef HAVE_ZFS_REFCOUNT_HEADER */ + +/* struct bio has __bi_cnt */ +#define HAVE___BI_CNT 1 + +/* if __ldiskfs_find_entry is available */ +/* #undef HAVE___LDISKFS_FIND_ENTRY */ + +/* function pde_data() available */ +#define HAVE_pde_data 1 + +/* ext4_journal_start takes 3 arguments */ +/* #undef JOURNAL_START_HAS_3ARGS */ + +/* Define this as the Kerberos version number */ +/* #undef KRB5_VERSION */ + +/* enable libcfs LASSERT, LASSERTF */ +#define LIBCFS_DEBUG 1 + +/* use dumplog on panic */ +/* #undef LNET_DUMP_ON_PANIC */ + +/* Define to the sub-directory where libtool stores uninstalled libraries. 
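The LUSTRE_MAJOR/LUSTRE_MINOR/LUSTRE_PATCH/LUSTRE_FIX components defined just below are conventionally packed into a single comparable integer, KERNEL_VERSION-style. A sketch of that packing (the macro names follow Lustre's lustre_ver.h, which is outside this excerpt, so treat them as an assumption):

/* Pack 2.15.3.0 into 0x020f0300 so a version check is one comparison. */
#define OBD_OCD_VERSION(major, minor, patch, fix) \
	(((major) << 24) + ((minor) << 16) + ((patch) << 8) + (fix))

#define LUSTRE_VERSION_CODE \
	OBD_OCD_VERSION(LUSTRE_MAJOR, LUSTRE_MINOR, LUSTRE_PATCH, LUSTRE_FIX)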
*/ +#define LT_OBJDIR ".libs/" + +/* Fourth number in the Lustre version */ +#define LUSTRE_FIX 0 + +/* First number in the Lustre version */ +#define LUSTRE_MAJOR 2 + +/* Second number in the Lustre version */ +#define LUSTRE_MINOR 15 + +/* Third number in the Lustre version */ +#define LUSTRE_PATCH 3 + +/* A copy of PACKAGE_VERSION */ +#define LUSTRE_VERSION_STRING "2.15.3_114_gb61b66c_dirty" + +/* maximum number of MDS threads */ +/* #undef MDS_MAX_THREADS */ + +/* Report minimum OST free space */ +/* #undef MIN_DF */ + +/* name of ldiskfs mkfs program */ +#define MKE2FS "mke2fs" + +/* 'ktime_get_ns' is not available */ +/* #undef NEED_KTIME_GET_NS */ + +/* 'ktime_get_real_ns' is not available */ +/* #undef NEED_KTIME_GET_REAL_NS */ + +/* lockdep_is_held() argument is const */ +/* #undef NEED_LOCKDEP_IS_HELD_DISCARD_CONST */ + +/* Name of package */ +#define PACKAGE "lustre" + +/* Define to the address where bug reports for this package should be sent. */ +#define PACKAGE_BUGREPORT "https://jira.whamcloud.com/" + +/* Define to the full name of this package. */ +#define PACKAGE_NAME "Lustre" + +/* Define to the full name and version of this package. */ +#define PACKAGE_STRING "Lustre 2.15.3_114_gb61b66c_dirty" + +/* Define to the one symbol short name of this package. */ +#define PACKAGE_TARNAME "lustre" + +/* Define to the home page for this package. */ +#define PACKAGE_URL "" + +/* Define to the version of this package. */ +#define PACKAGE_VERSION "2.15.3_114_gb61b66c_dirty" + +/* name of parallel fsck program */ +#define PFSCK "fsck" + +/* enable randomly alloc failure */ +#define RANDOM_FAIL_ALLOC 1 + +/* The size of `unsigned long long', as computed by sizeof. */ +#define SIZEOF_UNSIGNED_LONG_LONG 8 + +/* use tunable backoff TCP */ +/* #undef SOCKNAL_BACKOFF */ + +/* tunable backoff TCP in ms */ +/* #undef SOCKNAL_BACKOFF_MS */ + +/* 'struct stacktrace_ops' address function returns an int */ +/* #undef STACKTRACE_OPS_ADDRESS_RETURN_INT */ + +/* Define to 1 if you have the ANSI C header files. */ +#define STDC_HEADERS 1 + +/* name of ldiskfs tune program */ +#define TUNE2FS "tune2fs" + +/* Define this if the private function, gss_krb5_cache_name, must be used to + tell the Kerberos library which credentials cache to use. Otherwise, this + is done by setting the KRB5CCNAME environment variable */ +/* #undef USE_GSS_KRB5_CCACHE_NAME */ + +/* Write when Checking Health */ +/* #undef USE_HEALTH_CHECK_WRITE */ + +/* Version number of package */ +#define VERSION "2.15.3_114_gb61b66c_dirty" + +/* vfs_setxattr() value argument is non-const */ +#define VFS_SETXATTR_VALUE(value) (value) + +/* zfs fix version */ +/* #undef ZFS_FIX */ + +/* zfs major version */ +/* #undef ZFS_MAJOR */ + +/* zfs minor version */ +/* #undef ZFS_MINOR */ + +/* zfs patch version */ +/* #undef ZFS_PATCH */ + +/* get_random_u32() is not available, use prandom_u32 */ +/* #undef get_random_u32 */ + +/* get_random_u32_below() is not available */ +/* #undef get_random_u32_below */ + +/* function pde_data() unavailable */ +/* #undef pde_data */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/bitmap.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/bitmap.h new file mode 100644 index 0000000000000..b2bd5991632c7 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/bitmap.h @@ -0,0 +1,118 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ +#ifndef _LIBCFS_BITMAP_H_ +#define _LIBCFS_BITMAP_H_ + +#include +#include + +struct cfs_bitmap { + unsigned int size; + unsigned long data[0]; +}; + +#define CFS_BITMAP_SIZE(nbits) \ + (BITS_TO_LONGS(nbits) * sizeof(long) + sizeof(struct cfs_bitmap)) + +static inline +struct cfs_bitmap *CFS_ALLOCATE_BITMAP(int size) +{ + struct cfs_bitmap *ptr; + + LIBCFS_ALLOC(ptr, CFS_BITMAP_SIZE(size)); + if (ptr == NULL) + RETURN(ptr); + + ptr->size = size; + + RETURN(ptr); +} + +static inline void CFS_RESET_BITMAP(struct cfs_bitmap *bitmap) +{ + if (bitmap->size > 0) { + int nbits = bitmap->size; + + memset(bitmap, 0, CFS_BITMAP_SIZE(nbits)); + bitmap->size = nbits; + } +} + +#define CFS_FREE_BITMAP(ptr) LIBCFS_FREE(ptr, CFS_BITMAP_SIZE(ptr->size)) + +static inline +void cfs_bitmap_set(struct cfs_bitmap *bitmap, int nbit) +{ + set_bit(nbit, bitmap->data); +} + +static inline +void cfs_bitmap_clear(struct cfs_bitmap *bitmap, int nbit) +{ + test_and_clear_bit(nbit, bitmap->data); +} + +static inline +int cfs_bitmap_check(struct cfs_bitmap *bitmap, int nbit) +{ + return test_bit(nbit, bitmap->data); +} + +static inline +int cfs_bitmap_test_and_clear(struct cfs_bitmap *bitmap, int nbit) +{ + return test_and_clear_bit(nbit, bitmap->data); +} + +/* return 0 is bitmap has none set bits */ +static inline +int cfs_bitmap_check_empty(struct cfs_bitmap *bitmap) +{ + return find_first_bit(bitmap->data, bitmap->size) == bitmap->size; +} + +static inline +void cfs_bitmap_copy(struct cfs_bitmap *new, struct cfs_bitmap *old) +{ + size_t newsize; + + LASSERT(new->size >= old->size); + newsize = new->size; + memcpy(new, old, CFS_BITMAP_SIZE(old->size)); + new->size = newsize; +} + +#define cfs_foreach_bit(bitmap, pos) \ + for ((pos) = find_first_bit((bitmap)->data, bitmap->size); \ + (pos) < (bitmap)->size; \ + (pos) = find_next_bit((bitmap)->data, (bitmap)->size, (pos) + 1)) + +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/crypto/llcrypt.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/crypto/llcrypt.h new file mode 100644 index 0000000000000..d05ff2af4200b --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/crypto/llcrypt.h @@ -0,0 +1,798 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * llcrypt.h: declarations for per-file encryption + * + * Filesystems that implement per-file encryption must include this header + * file. + * + * Copyright (C) 2015, Google, Inc. + * + * Written by Michael Halcrow, 2015. + * Modified by Jaegeuk Kim, 2015. 
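Looking back at the cfs_bitmap helpers in bitmap.h above: they are thin wrappers over the kernel bitops that carry the bit count alongside the storage. A hedged usage sketch (the enclosing function and the count variable are hypothetical; allocation failure shows up as a NULL return from CFS_ALLOCATE_BITMAP):

	/* Hypothetical caller: remember which of `count` targets replied. */
	struct cfs_bitmap *seen = CFS_ALLOCATE_BITMAP(count);
	int idx;

	if (seen == NULL)
		return -ENOMEM;
	cfs_bitmap_set(seen, 3);	/* mark target 3 as heard from */
	cfs_foreach_bit(seen, idx)	/* visit every set bit in order */
		pr_info("target %d replied\n", idx);
	if (!cfs_bitmap_check(seen, 0))
		pr_info("target 0 still pending\n");
	CFS_FREE_BITMAP(seen);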
+ */ +/* + * Linux commit 219d54332a09 + * tags/v5.4 + */ +#ifndef _LINUX_LLCRYPT_H +#define _LINUX_LLCRYPT_H + +#ifndef DCACHE_ENCRYPTED_NAME +#define DCACHE_ENCRYPTED_NAME 0x02000000 +#endif + +#include +#include +#include +#include +#include + +#define LL_CRYPTO_BLOCK_SIZE 16 + +struct llcrypt_ctx; +struct llcrypt_info; + +struct llcrypt_str { + unsigned char *name; + u32 len; +}; + +struct llcrypt_name { + const struct qstr *usr_fname; + struct llcrypt_str disk_name; + u32 hash; + u32 minor_hash; + struct llcrypt_str crypto_buf; + bool is_ciphertext_name; +}; + +#define LLTR_INIT(n, l) { .name = n, .len = l } +#define LLTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len) +#define lname_name(p) ((p)->disk_name.name) +#define lname_len(p) ((p)->disk_name.len) + +/* Maximum value for the third parameter of llcrypt_operations.set_context(). */ +#define LLCRYPT_SET_CONTEXT_MAX_SIZE 40 +#define LLCRYPT_DIGESTED_CHAR_OLD '_' +#define LLCRYPT_DIGESTED_CHAR '+' + +#ifdef CONFIG_LL_ENCRYPTION +/* + * llcrypt superblock flags + */ +#define LL_CFLG_OWN_PAGES (1U << 1) + +/* + * crypto operations for filesystems + */ +struct llcrypt_operations { + unsigned int flags; + const char *key_prefix; + int (*get_context)(struct inode *, void *, size_t); + int (*set_context)(struct inode *, const void *, size_t, void *); + bool (*dummy_context)(struct inode *); + bool (*empty_dir)(struct inode *); + unsigned int max_namelen; +}; + +/* Decryption work */ +struct llcrypt_ctx { + union { + struct { + struct bio *bio; + struct work_struct work; + }; + struct list_head free_list; /* Free list */ + }; + u8 flags; /* Flags */ +}; + +extern bool llcrypt_has_encryption_key(const struct inode *inode); + +static inline bool llcrypt_dummy_context_enabled(struct inode *inode) +{ + struct lustre_sb_info *lsi = s2lsi(inode->i_sb); + + if (unlikely(!lsi)) + return false; + + return lsi->lsi_cop->dummy_context && + lsi->lsi_cop->dummy_context(inode); +} + +/* + * When d_splice_alias() moves a directory's encrypted alias to its decrypted + * alias as a result of the encryption key being added, DCACHE_ENCRYPTED_NAME + * must be cleared. Note that we don't have to support arbitrary moves of this + * flag because llcrypt doesn't allow encrypted aliases to be the source or + * target of a rename(). 
+ */ +static inline void llcrypt_handle_d_move(struct dentry *dentry) +{ + dentry->d_flags &= ~DCACHE_ENCRYPTED_NAME; +} + +/* crypto.c */ +extern int __init llcrypt_init(void); +extern void __exit llcrypt_exit(void); +extern void llcrypt_enqueue_decrypt_work(struct work_struct *); +extern struct llcrypt_ctx *llcrypt_get_ctx(gfp_t); +extern void llcrypt_release_ctx(struct llcrypt_ctx *); + +extern struct page *llcrypt_encrypt_pagecache_blocks(struct page *page, + unsigned int len, + unsigned int offs, + gfp_t gfp_flags); +extern int llcrypt_encrypt_block(const struct inode *inode, struct page *src, + struct page *dst, unsigned int len, + unsigned int offs, u64 lblk_num, gfp_t gfp_flags); + +extern int llcrypt_decrypt_pagecache_blocks(struct page *page, unsigned int len, + unsigned int offs); + +extern int llcrypt_decrypt_block(const struct inode *inode, struct page *src, + struct page *dst, unsigned int len, + unsigned int offs, u64 lblk_num, gfp_t gfp_flags); + +static inline int llcrypt_decrypt_block_inplace(const struct inode *inode, + struct page *page, + unsigned int len, + unsigned int offs, + u64 lblk_num) +{ + return llcrypt_decrypt_block(inode, page, page, len, offs, lblk_num, + GFP_NOFS); +} + +static inline bool llcrypt_is_bounce_page(struct page *page) +{ + return page->mapping == NULL; +} + +static inline struct page *llcrypt_pagecache_page(struct page *bounce_page) +{ + return (struct page *)page_private(bounce_page); +} + +extern void llcrypt_free_bounce_page(struct page *bounce_page); + +/* policy.c */ +extern int llcrypt_ioctl_set_policy(struct file *, const void __user *); +extern int llcrypt_ioctl_get_policy(struct file *, void __user *); +extern int llcrypt_ioctl_get_policy_ex(struct file *, void __user *); +extern int llcrypt_has_permitted_context(struct inode *, struct inode *); +extern int llcrypt_inherit_context(struct inode *, struct inode *, + void *, bool); +extern bool llcrypt_policy_has_filename_enc(struct inode *inode); +/* keyring.c */ +extern void llcrypt_sb_free(struct super_block *sb); +extern int llcrypt_ioctl_add_key(struct file *filp, void __user *arg); +extern int llcrypt_ioctl_remove_key(struct file *filp, void __user *arg); +extern int llcrypt_ioctl_remove_key_all_users(struct file *filp, + void __user *arg); +extern int llcrypt_ioctl_get_key_status(struct file *filp, void __user *arg); + +/* keysetup.c */ +extern int llcrypt_get_encryption_info(struct inode *); +extern void llcrypt_put_encryption_info(struct inode *); +extern void llcrypt_free_inode(struct inode *); +extern int llcrypt_drop_inode(struct inode *inode); + +/* fname.c */ +extern int llcrypt_setup_filename(struct inode *, const struct qstr *, + int lookup, struct llcrypt_name *); + +static inline void llcrypt_free_filename(struct llcrypt_name *fname) +{ + kfree(fname->crypto_buf.name); +} + +extern int llcrypt_fname_alloc_buffer(const struct inode *, u32, + struct llcrypt_str *); +extern void llcrypt_fname_free_buffer(struct llcrypt_str *); +extern int llcrypt_fname_disk_to_usr(struct inode *, u32, u32, + const struct llcrypt_str *, struct llcrypt_str *); + +#define LLCRYPT_FNAME_MAX_UNDIGESTED_SIZE 32 + +/* Extracts the second-to-last ciphertext block; see explanation below */ +#define LLCRYPT_FNAME_DIGEST(name, len) \ + ((name) + round_down((len) - LL_CRYPTO_BLOCK_SIZE - 1, \ + LL_CRYPTO_BLOCK_SIZE)) + +#define LLCRYPT_FNAME_DIGEST_SIZE LL_CRYPTO_BLOCK_SIZE + +/** + * llcrypt_digested_name - alternate identifier for an on-disk filename + * + * When userspace lists an encrypted 
directory without access to the key, + * filenames whose ciphertext is longer than LLCRYPT_FNAME_MAX_UNDIGESTED_SIZE + * bytes are shown in this abbreviated form (base64-encoded) rather than as the + * full ciphertext (base64-encoded). This is necessary to allow supporting + * filenames up to NAME_MAX bytes, since base64 encoding expands the length. + * + * To make it possible for filesystems to still find the correct directory entry + * despite not knowing the full on-disk name, we encode any filesystem-specific + * 'hash' and/or 'minor_hash' which the filesystem may need for its lookups, + * followed by the second-to-last ciphertext block of the filename. Due to the + * use of the CBC-CTS encryption mode, the second-to-last ciphertext block + * depends on the full plaintext. (Note that ciphertext stealing causes the + * last two blocks to appear "flipped".) This makes accidental collisions very + * unlikely: just a 1 in 2^128 chance for two filenames to collide even if they + * share the same filesystem-specific hashes. + * + * However, this scheme isn't immune to intentional collisions, which can be + * created by anyone able to create arbitrary plaintext filenames and view them + * without the key. Making the "digest" be a real cryptographic hash like + * SHA-256 over the full ciphertext would prevent this, although it would be + * less efficient and harder to implement, especially since the filesystem would + * need to calculate it for each directory entry examined during a search. + */ +struct llcrypt_digested_name { + u32 hash; + u32 minor_hash; + u8 digest[LLCRYPT_FNAME_DIGEST_SIZE]; +}; + +/** + * llcrypt_match_name() - test whether the given name matches a directory entry + * @fname: the name being searched for + * @de_name: the name from the directory entry + * @de_name_len: the length of @de_name in bytes + * + * Normally @fname->disk_name will be set, and in that case we simply compare + * that to the name stored in the directory entry. The only exception is that + * if we don't have the key for an encrypted directory and a filename in it is + * very long, then we won't have the full disk_name and we'll instead need to + * match against the llcrypt_digested_name. + * + * Return: %true if the name matches, otherwise %false. 
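+ *
+ * A sketched use in a directory-search loop; the entry type and the
+ * for_each_dir_entry() iterator are hypothetical, only the llcrypt_*
+ * calls come from this header:
+ *
+ *	struct llcrypt_name fname;
+ *	int err = llcrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
+ *
+ *	if (err)
+ *		return err;
+ *	for_each_dir_entry(dir, de)
+ *		if (llcrypt_match_name(&fname, de->name, de->name_len))
+ *			goto found;
+ *	llcrypt_free_filename(&fname);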
+ */ +static inline bool llcrypt_match_name(const struct llcrypt_name *fname, + const u8 *de_name, u32 de_name_len) +{ + if (unlikely(!fname->disk_name.name)) { + const struct llcrypt_digested_name *n = + (const void *)fname->crypto_buf.name; + if (WARN_ON_ONCE(fname->usr_fname->name[0] != '_')) + return false; + if (de_name_len <= LLCRYPT_FNAME_MAX_UNDIGESTED_SIZE) + return false; + return !memcmp(LLCRYPT_FNAME_DIGEST(de_name, de_name_len), + n->digest, LLCRYPT_FNAME_DIGEST_SIZE); + } + + if (de_name_len != fname->disk_name.len) + return false; + return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len); +} + +/* hooks.c */ +extern int llcrypt_file_open(struct inode *inode, struct file *filp); +extern int __llcrypt_prepare_link(struct inode *inode, struct inode *dir, + struct dentry *dentry); +extern int __llcrypt_prepare_rename(struct inode *old_dir, + struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry, + unsigned int flags); +extern int __llcrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, + struct llcrypt_name *fname); +extern int __llcrypt_prepare_symlink(struct inode *dir, unsigned int len, + unsigned int max_len, + struct llcrypt_str *disk_link); +extern int __llcrypt_encrypt_symlink(struct inode *inode, const char *target, + unsigned int len, + struct llcrypt_str *disk_link); +extern const char *llcrypt_get_symlink(struct inode *inode, const void *caddr, + unsigned int max_size, + struct delayed_call *done); +static inline void llcrypt_set_ops(struct super_block *sb, + const struct llcrypt_operations *lsi_cop) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + + if (lsi) + lsi->lsi_cop = lsi_cop; +} +#else /* !CONFIG_LL_ENCRYPTION */ + +struct llcrypt_operations; +#define llcrypt_init() 0 +#define llcrypt_exit() {} + +#undef IS_ENCRYPTED +#define IS_ENCRYPTED(x) 0 + +static inline bool llcrypt_has_encryption_key(const struct inode *inode) +{ + return false; +} + +static inline bool llcrypt_dummy_context_enabled(struct inode *inode) +{ + return false; +} + +static inline void llcrypt_handle_d_move(struct dentry *dentry) +{ +} + +/* crypto.c */ +static inline void llcrypt_enqueue_decrypt_work(struct work_struct *work) +{ +} + +static inline struct llcrypt_ctx *llcrypt_get_ctx(gfp_t gfp_flags) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void llcrypt_release_ctx(struct llcrypt_ctx *ctx) +{ + return; +} + +static inline struct page *llcrypt_encrypt_pagecache_blocks(struct page *page, + unsigned int len, + unsigned int offs, + gfp_t gfp_flags) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline int llcrypt_encrypt_block(const struct inode *inode, + struct page *src, struct page *dst, + unsigned int len, unsigned int offs, + u64 lblk_num, gfp_t gfp_flags) +{ + return -EOPNOTSUPP; +} + +static inline int llcrypt_decrypt_pagecache_blocks(struct page *page, + unsigned int len, + unsigned int offs) +{ + return -EOPNOTSUPP; +} + +static inline int llcrypt_decrypt_block(const struct inode *inode, + struct page *src, struct page *dst, + unsigned int len, unsigned int offs, + u64 lblk_num, gfp_t gfp_flags) +{ + return -EOPNOTSUPP; +} + +static inline int llcrypt_decrypt_block_inplace(const struct inode *inode, + struct page *page, + unsigned int len, + unsigned int offs, u64 lblk_num) +{ + return -EOPNOTSUPP; +} + +static inline bool llcrypt_is_bounce_page(struct page *page) +{ + return false; +} + +static inline struct page *llcrypt_pagecache_page(struct page *bounce_page) +{ + WARN_ON_ONCE(1); + return ERR_PTR(-EINVAL); +} + +static 
inline void llcrypt_free_bounce_page(struct page *bounce_page) +{ +} + +/* policy.c */ +static inline int llcrypt_ioctl_set_policy(struct file *filp, + const void __user *arg) +{ + return -EOPNOTSUPP; +} + +static inline int llcrypt_ioctl_get_policy(struct file *filp, void __user *arg) +{ + return -EOPNOTSUPP; +} + +static inline int llcrypt_ioctl_get_policy_ex(struct file *filp, + void __user *arg) +{ + return -EOPNOTSUPP; +} + +static inline int llcrypt_has_permitted_context(struct inode *parent, + struct inode *child) +{ + return 0; +} + +static inline int llcrypt_inherit_context(struct inode *parent, + struct inode *child, + void *fs_data, bool preload) +{ + return -EOPNOTSUPP; +} +static inline bool llcrypt_policy_has_filename_enc(struct inode *inode) +{ + return false; +} + +/* keyring.c */ +static inline void llcrypt_sb_free(struct super_block *sb) +{ +} + +static inline int llcrypt_ioctl_add_key(struct file *filp, void __user *arg) +{ + return -EOPNOTSUPP; +} + +static inline int llcrypt_ioctl_remove_key(struct file *filp, void __user *arg) +{ + return -EOPNOTSUPP; +} + +static inline int llcrypt_ioctl_remove_key_all_users(struct file *filp, + void __user *arg) +{ + return -EOPNOTSUPP; +} + +static inline int llcrypt_ioctl_get_key_status(struct file *filp, + void __user *arg) +{ + return -EOPNOTSUPP; +} + +/* keysetup.c */ +static inline int llcrypt_get_encryption_info(struct inode *inode) +{ + return -EOPNOTSUPP; +} + +static inline void llcrypt_put_encryption_info(struct inode *inode) +{ + return; +} + +static inline void llcrypt_free_inode(struct inode *inode) +{ +} + +static inline int llcrypt_drop_inode(struct inode *inode) +{ + return 0; +} + + /* fname.c */ +static inline int llcrypt_setup_filename(struct inode *dir, + const struct qstr *iname, + int lookup, struct llcrypt_name *fname) +{ + if (IS_ENCRYPTED(dir)) + return -EOPNOTSUPP; + + memset(fname, 0, sizeof(*fname)); + fname->usr_fname = iname; + fname->disk_name.name = (unsigned char *)iname->name; + fname->disk_name.len = iname->len; + return 0; +} + +static inline void llcrypt_free_filename(struct llcrypt_name *fname) +{ + return; +} + +static inline int llcrypt_fname_alloc_buffer(const struct inode *inode, + u32 max_encrypted_len, + struct llcrypt_str *crypto_str) +{ + return -EOPNOTSUPP; +} + +static inline void llcrypt_fname_free_buffer(struct llcrypt_str *crypto_str) +{ + return; +} + +static inline int llcrypt_fname_disk_to_usr(struct inode *inode, + u32 hash, u32 minor_hash, + const struct llcrypt_str *iname, + struct llcrypt_str *oname) +{ + return -EOPNOTSUPP; +} + +static inline bool llcrypt_match_name(const struct llcrypt_name *fname, + const u8 *de_name, u32 de_name_len) +{ + /* Encryption support disabled; use standard comparison */ + if (de_name_len != fname->disk_name.len) + return false; + return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len); +} + +/* hooks.c */ + +static inline int llcrypt_file_open(struct inode *inode, struct file *filp) +{ + if (IS_ENCRYPTED(inode)) + return -EOPNOTSUPP; + return 0; +} + +static inline int __llcrypt_prepare_link(struct inode *inode, struct inode *dir, + struct dentry *dentry) +{ + return -EOPNOTSUPP; +} + +static inline int __llcrypt_prepare_rename(struct inode *old_dir, + struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry, + unsigned int flags) +{ + return -EOPNOTSUPP; +} + +static inline int __llcrypt_prepare_lookup(struct inode *dir, + struct dentry *dentry, + struct llcrypt_name *fname) +{ + return -EOPNOTSUPP; +} + 
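+/*
+ * The no-op stubs above let callers stay unconditional when
+ * CONFIG_LL_ENCRYPTION is off; with IS_ENCRYPTED() forced to 0 they
+ * compile away.  A sketch (ll_file_open_common() is a hypothetical
+ * caller, not part of this header):
+ *
+ *	static int ll_file_open_common(struct inode *inode, struct file *filp)
+ *	{
+ *		int rc = llcrypt_file_open(inode, filp);
+ *
+ *		if (rc)
+ *			return rc;
+ *		...
+ *	}
+ */
+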
+static inline int __llcrypt_prepare_symlink(struct inode *dir, + unsigned int len, + unsigned int max_len, + struct llcrypt_str *disk_link) +{ + return -EOPNOTSUPP; +} + + +static inline int __llcrypt_encrypt_symlink(struct inode *inode, + const char *target, + unsigned int len, + struct llcrypt_str *disk_link) +{ + return -EOPNOTSUPP; +} + +#define llcrypt_get_symlink(inode, caddr, max_size, done) ERR_PTR(-EOPNOTSUPP) + +static inline void llcrypt_set_ops(struct super_block *sb, + const struct llcrypt_operations *lsi_cop) +{ +} + +#endif /* !CONFIG_LL_ENCRYPTION */ + +/** + * llcrypt_require_key - require an inode's encryption key + * @inode: the inode we need the key for + * + * If the inode is encrypted, set up its encryption key if not already done. + * Then require that the key be present and return -ENOKEY otherwise. + * + * No locks are needed, and the key will live as long as the struct inode --- so + * it won't go away from under you. + * + * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code + * if a problem occurred while setting up the encryption key. + */ +static inline int llcrypt_require_key(struct inode *inode) +{ + if (IS_ENCRYPTED(inode)) { + int err = llcrypt_get_encryption_info(inode); + + if (err) + return err; + if (!llcrypt_has_encryption_key(inode)) + return -ENOKEY; + } + return 0; +} + +/** + * llcrypt_prepare_link - prepare to link an inode into a possibly-encrypted directory + * @old_dentry: an existing dentry for the inode being linked + * @dir: the target directory + * @dentry: negative dentry for the target filename + * + * A new link can only be added to an encrypted directory if the directory's + * encryption key is available --- since otherwise we'd have no way to encrypt + * the filename. Therefore, we first set up the directory's encryption key (if + * not already done) and return an error if it's unavailable. + * + * We also verify that the link will not violate the constraint that all files + * in an encrypted directory tree use the same encryption policy. + * + * Return: 0 on success, -ENOKEY if the directory's encryption key is missing, + * -EXDEV if the link would result in an inconsistent encryption policy, or + * another -errno code. + */ +static inline int llcrypt_prepare_link(struct dentry *old_dentry, + struct inode *dir, + struct dentry *dentry) +{ + if (IS_ENCRYPTED(dir)) + return __llcrypt_prepare_link(d_inode(old_dentry), dir, dentry); + return 0; +} + +/** + * llcrypt_prepare_rename - prepare for a rename between possibly-encrypted directories + * @old_dir: source directory + * @old_dentry: dentry for source file + * @new_dir: target directory + * @new_dentry: dentry for target location (may be negative unless exchanging) + * @flags: rename flags (we care at least about %RENAME_EXCHANGE) + * + * Prepare for ->rename() where the source and/or target directories may be + * encrypted. A new link can only be added to an encrypted directory if the + * directory's encryption key is available --- since otherwise we'd have no way + * to encrypt the filename. A rename to an existing name, on the other hand, + * *is* cryptographically possible without the key. However, we take the more + * conservative approach and just forbid all no-key renames. + * + * We also verify that the rename will not violate the constraint that all files + * in an encrypted directory tree use the same encryption policy. 
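+ *
+ * A ->rename() implementation would therefore start with (sketch, error
+ * handling only):
+ *
+ *	err = llcrypt_prepare_rename(old_dir, old_dentry, new_dir,
+ *				     new_dentry, flags);
+ *	if (err)
+ *		return err;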
+ * + * Return: 0 on success, -ENOKEY if an encryption key is missing, -EXDEV if the + * rename would cause inconsistent encryption policies, or another -errno code. + */ +static inline int llcrypt_prepare_rename(struct inode *old_dir, + struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry, + unsigned int flags) +{ + if (IS_ENCRYPTED(old_dir) || IS_ENCRYPTED(new_dir)) + return __llcrypt_prepare_rename(old_dir, old_dentry, + new_dir, new_dentry, flags); + return 0; +} + +/** + * llcrypt_prepare_lookup - prepare to lookup a name in a possibly-encrypted directory + * @dir: directory being searched + * @dentry: filename being looked up + * @fname: (output) the name to use to search the on-disk directory + * + * Prepare for ->lookup() in a directory which may be encrypted by determining + * the name that will actually be used to search the directory on-disk. Lookups + * can be done with or without the directory's encryption key; without the key, + * filenames are presented in encrypted form. Therefore, we'll try to set up + * the directory's encryption key, but even without it the lookup can continue. + * + * This also installs a custom ->d_revalidate() method which will invalidate the + * dentry if it was created without the key and the key is later added. + * + * Return: 0 on success; -ENOENT if key is unavailable but the filename isn't a + * correctly formed encoded ciphertext name, so a negative dentry should be + * created; or another -errno code. + */ +static inline int llcrypt_prepare_lookup(struct inode *dir, + struct dentry *dentry, + struct llcrypt_name *fname) +{ + if (IS_ENCRYPTED(dir)) + return __llcrypt_prepare_lookup(dir, dentry, fname); + + memset(fname, 0, sizeof(*fname)); + fname->usr_fname = &dentry->d_name; + fname->disk_name.name = (unsigned char *)dentry->d_name.name; + fname->disk_name.len = dentry->d_name.len; + return 0; +} + +/** + * llcrypt_prepare_setattr - prepare to change a possibly-encrypted inode's attributes + * @dentry: dentry through which the inode is being changed + * @attr: attributes to change + * + * Prepare for ->setattr() on a possibly-encrypted inode. On an encrypted file, + * most attribute changes are allowed even without the encryption key. However, + * without the encryption key we do have to forbid truncates. This is needed + * because the size being truncated to may not be a multiple of the filesystem + * block size, and in that case we'd have to decrypt the final block, zero the + * portion past i_size, and re-encrypt it. (We *could* allow truncating to a + * filesystem block boundary, but it's simpler to just forbid all truncates --- + * and we already forbid all other contents modifications without the key.) + * + * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code + * if a problem occurred while setting up the encryption key. 
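+ *
+ * e.g. a ->setattr() implementation would begin with (sketch):
+ *
+ *	err = llcrypt_prepare_setattr(dentry, attr);
+ *	if (err)
+ *		return err;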
+ */
+static inline int llcrypt_prepare_setattr(struct dentry *dentry,
+					  struct iattr *attr)
+{
+	if (attr->ia_valid & ATTR_SIZE)
+		return llcrypt_require_key(d_inode(dentry));
+	return 0;
+}
+
+/**
+ * llcrypt_prepare_symlink - prepare to create a possibly-encrypted symlink
+ * @dir: directory in which the symlink is being created
+ * @target: plaintext symlink target
+ * @len: length of @target excluding null terminator
+ * @max_len: space the filesystem has available to store the symlink target
+ * @disk_link: (out) the on-disk symlink target being prepared
+ *
+ * This function computes the size the symlink target will require on-disk,
+ * stores it in @disk_link->len, and validates it against @max_len. An
+ * encrypted symlink may be longer than the original.
+ *
+ * Additionally, @disk_link->name is set to @target if the symlink will be
+ * unencrypted, but left NULL if the symlink will be encrypted. For encrypted
+ * symlinks, the filesystem must call llcrypt_encrypt_symlink() to create the
+ * on-disk target later. (The reason for the two-step process is that some
+ * filesystems need to know the size of the symlink target before creating the
+ * inode, e.g. to determine whether it will be a "fast" or "slow" symlink.)
+ *
+ * Return: 0 on success, -ENAMETOOLONG if the symlink target is too long,
+ * -ENOKEY if the encryption key is missing, or another -errno code if a problem
+ * occurred while setting up the encryption key.
+ */
+static inline int llcrypt_prepare_symlink(struct inode *dir,
+					  const char *target,
+					  unsigned int len,
+					  unsigned int max_len,
+					  struct llcrypt_str *disk_link)
+{
+	if ((IS_ENCRYPTED(dir) || llcrypt_dummy_context_enabled(dir)) &&
+	    llcrypt_policy_has_filename_enc(dir))
+		return __llcrypt_prepare_symlink(dir, len, max_len, disk_link);
+
+	disk_link->name = (unsigned char *)target;
+	disk_link->len = len + 1;
+	if (disk_link->len > max_len)
+		return -ENAMETOOLONG;
+	return 0;
+}
+
+/**
+ * llcrypt_encrypt_symlink - encrypt the symlink target if needed
+ * @inode: symlink inode
+ * @target: plaintext symlink target
+ * @len: length of @target excluding null terminator
+ * @disk_link: (in/out) the on-disk symlink target being prepared
+ *
+ * If the symlink target needs to be encrypted, then this function encrypts it
+ * into @disk_link->name. llcrypt_prepare_symlink() must have been called
+ * previously to compute @disk_link->len. If the filesystem did not allocate a
+ * buffer for @disk_link->name after calling llcrypt_prepare_symlink(), then one
+ * will be kmalloc()'ed and the filesystem will be responsible for freeing it.
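+ *
+ * An illustrative two-step ->symlink() flow (error handling and inode
+ * creation details elided; only the llcrypt_* calls are from this header):
+ *
+ *	struct llcrypt_str disk_link;
+ *
+ *	err = llcrypt_prepare_symlink(dir, target, len, max_len, &disk_link);
+ *	if (err)
+ *		return err;
+ *	...create the symlink inode, sized using disk_link.len...
+ *	err = llcrypt_encrypt_symlink(inode, target, len, &disk_link);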
+ * + * Return: 0 on success, -errno on failure + */ +static inline int llcrypt_encrypt_symlink(struct inode *inode, + const char *target, + unsigned int len, + struct llcrypt_str *disk_link) +{ + if (IS_ENCRYPTED(inode)) + return __llcrypt_encrypt_symlink(inode, target, len, disk_link); + return 0; +} + +/* If *pagep is a bounce page, free it and set *pagep to the pagecache page */ +static inline void llcrypt_finalize_bounce_page(struct page **pagep) +{ + struct page *page = *pagep; + + if (llcrypt_is_bounce_page(page)) { + *pagep = llcrypt_pagecache_page(page); + llcrypt_free_bounce_page(page); + } +} + +#endif /* _LINUX_LLCRYPT_H */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs.h new file mode 100644 index 0000000000000..79ba6089c3664 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs.h @@ -0,0 +1,143 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ */
+
+#ifndef __LIBCFS_LIBCFS_H__
+#define __LIBCFS_LIBCFS_H__
+
+#include
+#include
+#include
+#include
+#ifdef HAVE_SCHED_HEADERS
+#include
+#endif
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define LIBCFS_VERSION "0.7.1"
+
+/* Sparse annotations */
+#if !defined(__must_hold)
+# ifdef __CHECKER__
+# define __must_hold(x) __attribute__((context(x, 1, 1)))
+# else /* __CHECKER__ */
+# define __must_hold(x)
+# endif /* !__CHECKER__ */
+#endif /* !__must_hold */
+
+typedef s32 timeout_t;
+
+/* need both kernel and user-land acceptor */
+#define LNET_ACCEPTOR_MIN_RESERVED_PORT 512
+#define LNET_ACCEPTOR_MAX_RESERVED_PORT 1023
+
+extern struct blocking_notifier_head libcfs_ioctl_list;
+static inline int notifier_from_ioctl_errno(int err)
+{
+	if (err == -EINVAL)
+		return NOTIFY_OK;
+	return notifier_from_errno(err) | NOTIFY_STOP_MASK;
+}
+
+int libcfs_ioctl_data_adjust(struct libcfs_ioctl_data *data);
+
+extern struct workqueue_struct *cfs_rehash_wq;
+
+void lnet_insert_debugfs(struct ctl_table *table);
+void lnet_remove_debugfs(struct ctl_table *table);
+
+/* helper for sysctl handlers */
+int debugfs_doint(struct ctl_table *table, int write,
+		  void __user *buffer, size_t *lenp, loff_t *ppos);
+
+/*
+ * Memory
+ */
+#if BITS_PER_LONG == 32
+/* limit to lowmem on 32-bit systems */
+#define NUM_CACHEPAGES \
+	min(cfs_totalram_pages(), 1UL << (30 - PAGE_SHIFT) * 3 / 4)
+#else
+#define NUM_CACHEPAGES cfs_totalram_pages()
+#endif
+
+#define wait_var_event_warning(var, condition, format, ...)		\
+do {									\
+	int counter = 4;						\
+	might_sleep();							\
+	if (condition)							\
+		break;							\
+	___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0,	\
+			  if (schedule_timeout(cfs_time_seconds(1)) == 0)\
+				  CDEBUG(is_power_of_2(counter++) ?	\
+					 D_WARNING : D_NET,		\
+					 format, ## __VA_ARGS__)	\
+			  );						\
+} while (0)
+
+/* atomic-context safe vfree */
+void libcfs_vfree_atomic(const void *addr);
+
+/* interval tree */
+
+#ifdef HAVE_INTERVAL_TREE_CACHED
+#define interval_tree_root	rb_root_cached
+#define interval_tree_first	rb_first_cached
+#define INTERVAL_TREE_ROOT	RB_ROOT_CACHED
+#else
+#define interval_tree_root	rb_root
+#define interval_tree_first	rb_first
+#define INTERVAL_TREE_ROOT	RB_ROOT
+#endif /* HAVE_INTERVAL_TREE_CACHED */
+
+#ifndef unsafe_memcpy
+#define unsafe_memcpy(to, from, size, reason) memcpy((to), (from), (size))
+#endif
+
+#define FLEXIBLE_OBJECT \
+	"Struct contains a flexible member, the size of object is checked " \
+	"and can be safely copied in a single memcpy()"
+
+#endif /* __LIBCFS_LIBCFS_H__ */
diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_cpu.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_cpu.h
new file mode 100644
index 0000000000000..cb2539e426255
--- /dev/null
+++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_cpu.h
@@ -0,0 +1,462 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2012, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * libcfs/include/libcfs/libcfs_cpu.h
+ *
+ * CPU partition
+ * . a CPU partition is a virtual processing unit
+ *
+ * . a CPU partition can contain 1-N cores, or 1-N NUMA nodes;
+ *   in other words, a CPU partition is a pool of processors.
+ *
+ * CPU Partition Table (CPT)
+ * . a set of CPU partitions
+ *
+ * . There are two modes for CPT: CFS_CPU_MODE_NUMA and CFS_CPU_MODE_SMP
+ *
+ * . User can specify the total number of CPU partitions while creating a
+ *   CPT; CPU partition IDs always start from 0.
+ *
+ *   Example: if there are 8 cores on the system, while creating a CPT
+ *   with cpu_npartitions=4:
+ *	core[0, 1] = partition[0], core[2, 3] = partition[1]
+ *	core[4, 5] = partition[2], core[6, 7] = partition[3]
+ *
+ *	cpu_npartitions=1:
+ *	core[0, 1, ... 7] = partition[0]
+ *
+ * . User can also specify CPU partitions by string pattern
+ *
+ *   Examples: cpu_partitions="0[0,1], 1[2,3]"
+ *	       cpu_partitions="N 0[0-3], 1[4-8]"
+ *
+ *   The first character "N" means the following numbers are NUMA IDs
+ *
+ * . NUMA allocators and CPU-affinity threads are built over CPU partitions,
+ *   instead of HW CPUs or HW nodes.
+ *
+ * . By default, Lustre modules should refer to the global cfs_cpt_tab,
+ *   instead of accessing HW CPUs directly, so the concurrency of Lustre can
+ *   be configured by cpu_npartitions of the global cfs_cpt_tab
+ *
+ * . If cpu_npartitions=1 (all CPUs in one pool), Lustre should work the
+ *   same way as 2.2 or earlier versions
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#ifndef __LIBCFS_CPU_H__
+#define __LIBCFS_CPU_H__
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+/* any CPU partition */
+#define CFS_CPT_ANY		(-1)
+
+struct cfs_cpt_table;
+
+#ifdef CONFIG_SMP
+extern struct cfs_cpt_table *cfs_cpt_tab;
+
+/**
+ * destroy a CPU partition table
+ */
+void cfs_cpt_table_free(struct cfs_cpt_table *cptab);
+/**
+ * create a cfs_cpt_table with \a ncpt number of partitions
+ */
+struct cfs_cpt_table *cfs_cpt_table_alloc(int ncpt);
+/**
+ * print string information of cpt-table
+ */
+int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len);
+/**
+ * print distance information of cpt-table
+ */
+int cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len);
+/**
+ * return total number of CPU partitions in \a cptab
+ */
+int cfs_cpt_number(struct cfs_cpt_table *cptab);
+/**
+ * return number of HW cores or hyper-threads in a CPU partition \a cpt
+ */
+int cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt);
+/**
+ * is there any online CPU in CPU partition \a cpt
+ */
+int cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt);
+/**
+ * return cpumask of CPU partition \a cpt
+ */
+cpumask_var_t *cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt);
+/**
+ * return nodemask of CPU partition \a cpt
+ */
+nodemask_t *cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt);
+/**
+ * shadow current HW processor ID to CPU-partition ID of \a cptab
+ */
+int cfs_cpt_current(struct cfs_cpt_table *cptab, int remap);
+/**
+ * shadow HW processor ID \a CPU to CPU-partition ID by \a cptab
+ */
+int cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu);
+/**
+ * shadow HW node ID \a NODE to CPU-partition ID by \a cptab
+ */
+int cfs_cpt_of_node(struct cfs_cpt_table *cptab, int node);
+/**
+ * NUMA distance between \a cpt1 and \a cpt2 in \a cptab
+ */
+unsigned int cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2);
+/**
+ * bind current thread on a CPU-partition \a cpt of \a cptab
+ */
+int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt);
+/**
+ * add \a cpu to CPU partition \a cpt of \a cptab, return 1 on success,
+ * otherwise 0
+ */
+int cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu);
+/**
+ * remove \a cpu from CPU partition \a cpt of \a cptab
+ */
+void cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu);
+/**
+ * add all cpus in \a mask to CPU partition \a cpt
+ * return 1 if successfully set all CPUs, otherwise return 0
+ */
+int cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt,
+			const cpumask_t *mask);
+/**
+ * remove all cpus in \a mask from CPU partition \a cpt
+ */
+void cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt,
+			   const cpumask_t *mask);
+/**
+ * add all cpus in NUMA node \a node to CPU partition \a cpt
+ * return 1 if successfully set all CPUs, otherwise return 0
+ */
+int cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node);
+/**
+ * remove all cpus in NUMA node \a node from CPU partition \a cpt
+ */
+void cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node);
+/**
+ * add all cpus in node mask \a mask to CPU partition \a cpt
+ * return 1 if successfully set all CPUs, otherwise return 0
+ */
+int cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt,
+			 const nodemask_t *mask);
+/**
+ * remove all cpus in node mask \a mask from CPU partition \a cpt
+ */
+void cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt,
+			    const nodemask_t *mask);
+/**
+ * convert partition id \a cpt to NUMA node id; if there is more than one
+ * node in this partition, it might return a different node id each time.
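+ * e.g. for a partition spanning two NUMA nodes, consecutive calls may
+ * return node 0 and then node 1, spreading allocations across the
+ * partition (a sketch of the intent, not a guaranteed order).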
+ */
+int cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt);
+
+int cfs_cpu_init(void);
+void cfs_cpu_fini(void);
+
+#else /* !CONFIG_SMP */
+
+#define cfs_cpt_tab ((struct cfs_cpt_table *)NULL)
+
+static inline void cfs_cpt_table_free(struct cfs_cpt_table *cptab)
+{
+}
+
+static inline struct cfs_cpt_table *cfs_cpt_table_alloc(int ncpt)
+{
+	return NULL;
+}
+
+static inline int cfs_cpt_table_print(struct cfs_cpt_table *cptab,
+				      char *buf, int len)
+{
+	int rc;
+
+	rc = snprintf(buf, len, "0\t: 0\n");
+	len -= rc;
+	if (len <= 0)
+		return -EFBIG;
+
+	return rc;
+}
+
+static inline int cfs_cpt_distance_print(struct cfs_cpt_table *cptab,
+					 char *buf, int len)
+{
+	int rc;
+
+	rc = snprintf(buf, len, "0\t: 0:1\n");
+	len -= rc;
+	if (len <= 0)
+		return -EFBIG;
+
+	return rc;
+}
+
+static inline cpumask_var_t *cfs_cpt_cpumask(struct cfs_cpt_table *cptab,
+					     int cpt)
+{
+	return (cpumask_var_t *) cpu_online_mask;
+}
+
+static inline int cfs_cpt_number(struct cfs_cpt_table *cptab)
+{
+	return 1;
+}
+
+static inline int cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt)
+{
+	return 1;
+}
+
+static inline nodemask_t *cfs_cpt_nodemask(struct cfs_cpt_table *cptab,
+					   int cpt)
+{
+	return &node_online_map;
+}
+
+static inline unsigned int cfs_cpt_distance(struct cfs_cpt_table *cptab,
+					    int cpt1, int cpt2)
+{
+	return 1;
+}
+
+static inline int cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt,
+				   int node)
+{
+	return 1;
+}
+
+static inline int cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt)
+{
+	return 0;
+}
+
+static inline int cfs_cpt_current(struct cfs_cpt_table *cptab, int remap)
+{
+	return 0;
+}
+
+static inline int cfs_cpt_of_node(struct cfs_cpt_table *cptab, int node)
+{
+	return 0;
+}
+
+static inline int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt)
+{
+	return 0;
+}
+
+static inline int cfs_cpu_init(void)
+{
+	return 0;
+}
+
+static inline void cfs_cpu_fini(void)
+{
+}
+
+#endif /* CONFIG_SMP */
+
+static inline
+struct workqueue_struct *cfs_cpt_bind_workqueue(const char *wq_name,
+						struct cfs_cpt_table *tbl,
+						int flags, int cpt, int nthrs)
+{
+	cpumask_var_t *mask = cfs_cpt_cpumask(tbl, cpt);
+	struct workqueue_attrs attrs = { };
+	struct workqueue_struct *wq;
+
+	wq = alloc_workqueue(wq_name, WQ_UNBOUND | flags, nthrs);
+	if (!wq)
+		return ERR_PTR(-ENOMEM);
+
+	if (mask && alloc_cpumask_var(&attrs.cpumask, GFP_KERNEL)) {
+		cpumask_copy(attrs.cpumask, *mask);
+		cpus_read_lock();
+		cfs_apply_workqueue_attrs(wq, &attrs);
+		cpus_read_unlock();
+		free_cpumask_var(attrs.cpumask);
+	}
+
+	return wq;
+}
+
+/*
+ * allocate per-cpu-partition data; the returned value is an array of
+ * pointers, which can be indexed by CPU ID.
+ *	cptab != NULL: size of array is number of CPU partitions
+ *	cptab == NULL: size of array is number of HW cores
+ */
+void *cfs_percpt_alloc(struct cfs_cpt_table *cptab, unsigned int size);
+/*
+ * destroy per-cpu-partition variable
+ */
+void cfs_percpt_free(void *vars);
+int cfs_percpt_number(void *vars);
+
+#define cfs_percpt_for_each(var, i, vars)		\
+	for (i = 0; i < cfs_percpt_number(vars) &&	\
+		((var) = (vars)[i]) != NULL; i++)
+
+/*
+ * percpu partition lock
+ *
+ * There are some use cases like this in Lustre:
+ * . each CPU partition has its own private data which is frequently changed,
+ *   and mostly by the local CPU partition.
+ * . all CPU partitions share some global data, these data are rarely changed.
+ *
+ * LNet is a typical example.
+ * CPU partition lock is designed for this kind of use case:
+ * . each CPU partition has its own private lock
+ * . change on private data just needs to take the private lock
+ * . read on shared data just needs to take _any_ of private locks
+ * . change on shared data needs to take _all_ private locks,
+ *   which is slow and should be really rare.
+ */
+enum {
+	CFS_PERCPT_LOCK_EX = -1,	/* negative */
+};
+
+struct cfs_percpt_lock {
+	/* cpu-partition-table for this lock */
+	struct cfs_cpt_table	 *pcl_cptab;
+	/* exclusively locked */
+	unsigned int		  pcl_locked;
+	/* private lock table */
+	spinlock_t		**pcl_locks;
+};
+
+/* return number of private locks */
+#define cfs_percpt_lock_num(pcl)	cfs_cpt_number(pcl->pcl_cptab)
+
+/*
+ * create a cpu-partition lock based on CPU partition table \a cptab,
+ * each private lock has extra \a psize bytes padding data
+ */
+struct cfs_percpt_lock *cfs_percpt_lock_create(struct cfs_cpt_table *cptab,
+					       struct lock_class_key *keys);
+/* destroy a cpu-partition lock */
+void cfs_percpt_lock_free(struct cfs_percpt_lock *pcl);
+
+/* lock private lock \a index of \a pcl */
+void cfs_percpt_lock(struct cfs_percpt_lock *pcl, int index);
+
+/* unlock private lock \a index of \a pcl */
+void cfs_percpt_unlock(struct cfs_percpt_lock *pcl, int index);
+
+#define CFS_PERCPT_LOCK_KEYS	256
+
+/* NB: don't allocate keys dynamically, lockdep needs them to be in ".data" */
+#define cfs_percpt_lock_alloc(cptab)					\
+({									\
+	static struct lock_class_key ___keys[CFS_PERCPT_LOCK_KEYS];	\
+	struct cfs_percpt_lock *___lk;					\
+									\
+	if (cfs_cpt_number(cptab) > CFS_PERCPT_LOCK_KEYS)		\
+		___lk = cfs_percpt_lock_create(cptab, NULL);		\
+	else								\
+		___lk = cfs_percpt_lock_create(cptab, ___keys);		\
+	___lk;								\
+})
+
+/**
+ * allocate \a nr_bytes of physical memory from a contiguous region with the
+ * properties of \a flags, bound to the partition id \a cpt. This function
+ * should only be used for the case when only a few pages of memory are
+ * needed.
+ */
+static inline void *
+cfs_cpt_malloc(struct cfs_cpt_table *cptab, int cpt, size_t nr_bytes,
+	       gfp_t flags)
+{
+	return kmalloc_node(nr_bytes, flags,
+			    cfs_cpt_spread_node(cptab, cpt));
+}
+
+/**
+ * allocate \a nr_bytes of virtually contiguous memory that is bound to the
+ * partition id \a cpt.
+ */
+static inline void *
+cfs_cpt_vzalloc(struct cfs_cpt_table *cptab, int cpt, size_t nr_bytes)
+{
+	/* vzalloc_node() sets __GFP_FS by default, but no current kernel
+	 * exported entry point allows for both a NUMA node specification
+	 * and a custom allocation flags mask. This may be an issue since
+	 * __GFP_FS usage can cause some deadlock situations in our code,
+	 * like when memory reclaim starts, within the same context of a
+	 * thread doing FS operations, that can also attempt conflicting FS
+	 * operations, ...
+	 */
+	return vzalloc_node(nr_bytes, cfs_cpt_spread_node(cptab, cpt));
+}
+
+/**
+ * allocate a single page of memory with the properties of \a flags, where
+ * that page is bound to the partition id \a cpt.
+ */
+static inline struct page *
+cfs_page_cpt_alloc(struct cfs_cpt_table *cptab, int cpt, gfp_t flags)
+{
+	return alloc_pages_node(cfs_cpt_spread_node(cptab, cpt), flags, 0);
+}
+
+/**
+ * allocate a chunk of memory from a memory pool that is bound to the
+ * partition id \a cpt, with the properties of \a flags.
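+ *
+ * Usage sketch (the kmem cache itself would be created elsewhere with
+ * kmem_cache_create(); the names here are illustrative):
+ *
+ *	obj = cfs_mem_cache_cpt_alloc(cachep, cptab,
+ *				      cfs_cpt_current(cptab, 1), GFP_NOFS);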
+ */
+static inline void *
+cfs_mem_cache_cpt_alloc(struct kmem_cache *cachep, struct cfs_cpt_table *cptab,
+			int cpt, gfp_t flags)
+{
+	return kmem_cache_alloc_node(cachep, flags,
+				     cfs_cpt_spread_node(cptab, cpt));
}
+
+/**
+ * iterate over all CPU partitions in \a cptab
+ */
+#define cfs_cpt_for_each(i, cptab)	\
+	for (i = 0; i < cfs_cpt_number(cptab); i++)
+
+#endif /* __LIBCFS_CPU_H__ */
diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_crypto.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_crypto.h
new file mode 100644
index 0000000000000..f271676ff4948
--- /dev/null
+++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_crypto.h
@@ -0,0 +1,319 @@
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ *
+ * Copyright (c) 2014, Intel Corporation.
+ */
+
+#ifndef _LIBCFS_CRYPTO_H
+#define _LIBCFS_CRYPTO_H
+
+struct cfs_crypto_hash_type {
+	char		*cht_name;	/**< hash algorithm name, equal to
+					 * format name for crypto api */
+	unsigned int	cht_key;	/**< init key by default (valid for
+					 * 4-byte context like crc32, adler) */
+	unsigned int	cht_size;	/**< hash digest size */
+};
+
+struct cfs_crypto_crypt_type {
+	char		*cct_name;	/**< crypto algorithm name, equal to
+					 * format name for crypto api */
+	unsigned int	cct_size;	/**< crypto key size */
+};
+
+enum cfs_crypto_hash_alg {
+	CFS_HASH_ALG_NULL	= 0,
+	CFS_HASH_ALG_ADLER32,
+	CFS_HASH_ALG_CRC32,
+	CFS_HASH_ALG_CRC32C,
+	/* hashes before here will be speed-tested at module load */
+	CFS_HASH_ALG_MD5,
+	CFS_HASH_ALG_SHA1,
+	CFS_HASH_ALG_SHA256,
+	CFS_HASH_ALG_SHA384,
+	CFS_HASH_ALG_SHA512,
+	CFS_HASH_ALG_MAX,
+	CFS_HASH_ALG_SPEED_MAX = CFS_HASH_ALG_MD5,
+	CFS_HASH_ALG_UNKNOWN	= 0xff
+};
+
+enum cfs_crypto_crypt_alg {
+	CFS_CRYPT_ALG_NULL	= 0,
+	CFS_CRYPT_ALG_AES256_CTR,
+	CFS_CRYPT_ALG_MAX,
+	CFS_CRYPT_ALG_UNKNOWN	= 0xff
+};
+
+static struct cfs_crypto_hash_type hash_types[] = {
+	[CFS_HASH_ALG_NULL] = {
+		.cht_name	= "null",
+		.cht_key	= 0,
+		.cht_size	= 0
+	},
+	[CFS_HASH_ALG_ADLER32] = {
+		.cht_name	= "adler32",
+		.cht_key	= 1,
+		.cht_size	= 4
+	},
+	[CFS_HASH_ALG_CRC32] = {
+		.cht_name	= "crc32",
+		.cht_key	= ~0,
+		.cht_size	= 4
+	},
+	[CFS_HASH_ALG_CRC32C] = {
+		.cht_name	= "crc32c",
+		.cht_key	= ~0,
+		.cht_size	= 4
+	},
+	[CFS_HASH_ALG_MD5] = {
+		.cht_name	= "md5",
+		.cht_key	= 0,
+		.cht_size	= 16
+	},
+	[CFS_HASH_ALG_SHA1] = {
+		.cht_name	= "sha1",
+		.cht_key	= 0,
+		.cht_size	= 20
+	},
+	[CFS_HASH_ALG_SHA256] = {
+		.cht_name	= "sha256",
+		.cht_key	= 0,
+		.cht_size	= 32
+	},
+	[CFS_HASH_ALG_SHA384] = {
+		.cht_name	= "sha384",
+		.cht_key	= 0,
+		.cht_size	= 48
+	},
+	[CFS_HASH_ALG_SHA512] = {
.cht_name = "sha512", + .cht_key = 0, + .cht_size = 64 + }, + [CFS_HASH_ALG_MAX] = { + .cht_name = NULL, + .cht_key = 0, + .cht_size = 64 + } +}; + +static struct cfs_crypto_crypt_type crypt_types[] = { + [CFS_CRYPT_ALG_NULL] = { + .cct_name = "null", + .cct_size = 0 + }, + [CFS_CRYPT_ALG_AES256_CTR] = { + .cct_name = "ctr(aes)", + .cct_size = 32 + } +}; + +/* Maximum size of hash_types[].cht_size */ +#define CFS_CRYPTO_HASH_DIGESTSIZE_MAX 64 + +/* Array of hash algorithm speed in MByte per second */ +extern int cfs_crypto_hash_speeds[CFS_HASH_ALG_MAX]; + +/** + * Return hash algorithm information for the specified algorithm identifier + * + * Hash information includes algorithm name, initial seed, hash size. + * + * \retval cfs_crypto_hash_type for valid ID (CFS_HASH_ALG_*) + * \retval NULL for unknown algorithm identifier + */ +static inline const struct +cfs_crypto_hash_type *cfs_crypto_hash_type(enum cfs_crypto_hash_alg hash_alg) +{ + struct cfs_crypto_hash_type *ht; + + if (hash_alg < CFS_HASH_ALG_MAX) { + ht = &hash_types[hash_alg]; + if (ht->cht_name != NULL) + return ht; + } + return NULL; +} + +/** + * Return hash name for hash algorithm identifier + * + * \param[in] hash_alg hash alrgorithm id (CFS_HASH_ALG_*) + * + * \retval string name of known hash algorithm + * \retval "unknown" if hash algorithm is unknown + */ +static inline const +char *cfs_crypto_hash_name(enum cfs_crypto_hash_alg hash_alg) +{ + const struct cfs_crypto_hash_type *ht; + + ht = cfs_crypto_hash_type(hash_alg); + if (ht) + return ht->cht_name; + + return "unknown"; +} + +/** + * Return digest size for hash algorithm type + * + * \param[in] hash_alg hash alrgorithm id (CFS_HASH_ALG_*) + * + * \retval hash algorithm digest size in bytes + * \retval 0 if hash algorithm type is unknown + */ +static inline +unsigned int cfs_crypto_hash_digestsize(enum cfs_crypto_hash_alg hash_alg) +{ + const struct cfs_crypto_hash_type *ht; + + ht = cfs_crypto_hash_type(hash_alg); + if (ht != NULL) + return ht->cht_size; + + return 0; +} + +/** + * Find hash algorithm ID for the specified algorithm name + * + * \retval hash algorithm ID for valid ID (CFS_HASH_ALG_*) + * \retval CFS_HASH_ALG_UNKNOWN for unknown algorithm name + */ +static inline unsigned char cfs_crypto_hash_alg(const char *algname) +{ + enum cfs_crypto_hash_alg hash_alg; + + for (hash_alg = 0; hash_alg < CFS_HASH_ALG_MAX; hash_alg++) + if (strcmp(hash_types[hash_alg].cht_name, algname) == 0) + return hash_alg; + + return CFS_HASH_ALG_UNKNOWN; +} + +/** + * Return crypt algorithm information for the specified algorithm identifier + * + * Crypt information includes algorithm name, key size. 
+ *
+ * \retval	cfs_crypto_crypt_type for valid ID (CFS_CRYPT_ALG_*)
+ * \retval	NULL for unknown algorithm identifier
+ */
+static inline const struct
+cfs_crypto_crypt_type *cfs_crypto_crypt_type(
+	enum cfs_crypto_crypt_alg crypt_alg)
+{
+	struct cfs_crypto_crypt_type *ct;
+
+	if (crypt_alg < CFS_CRYPT_ALG_MAX) {
+		ct = &crypt_types[crypt_alg];
+		if (ct->cct_name != NULL)
+			return ct;
+	}
+	return NULL;
+}
+
+/**
+ * Return crypt name for crypt algorithm identifier
+ *
+ * \param[in]	crypt_alg	crypt algorithm id (CFS_CRYPT_ALG_*)
+ *
+ * \retval	string name of known crypt algorithm
+ * \retval	"unknown" if crypt algorithm is unknown
+ */
+static inline const
+char *cfs_crypto_crypt_name(enum cfs_crypto_crypt_alg crypt_alg)
+{
+	const struct cfs_crypto_crypt_type *ct;
+
+	ct = cfs_crypto_crypt_type(crypt_alg);
+	if (ct)
+		return ct->cct_name;
+
+	return "unknown";
+}
+
+
+/**
+ * Return key size for crypto algorithm type
+ *
+ * \param[in]	crypt_alg	crypt algorithm id (CFS_CRYPT_ALG_*)
+ *
+ * \retval	crypt algorithm key size in bytes
+ * \retval	0 if crypt algorithm type is unknown
+ */
+static inline
+unsigned int cfs_crypto_crypt_keysize(enum cfs_crypto_crypt_alg crypt_alg)
+{
+	const struct cfs_crypto_crypt_type *ct;
+
+	ct = cfs_crypto_crypt_type(crypt_alg);
+	if (ct != NULL)
+		return ct->cct_size;
+
+	return 0;
+}
+
+/**
+ * Find crypto algorithm ID for the specified algorithm name
+ *
+ * \retval	crypto algorithm ID for valid ID (CFS_CRYPT_ALG_*)
+ * \retval	CFS_CRYPT_ALG_UNKNOWN for unknown algorithm name
+ */
+static inline unsigned char cfs_crypto_crypt_alg(const char *algname)
+{
+	enum cfs_crypto_crypt_alg crypt_alg;
+
+	for (crypt_alg = 0; crypt_alg < CFS_CRYPT_ALG_MAX; crypt_alg++)
+		if (strcmp(crypt_types[crypt_alg].cct_name, algname) == 0)
+			return crypt_alg;
+
+	return CFS_CRYPT_ALG_UNKNOWN;
+}
+
+int cfs_crypto_hash_digest(enum cfs_crypto_hash_alg hash_alg,
+			   const void *buf, unsigned int buf_len,
+			   unsigned char *key, unsigned int key_len,
+			   unsigned char *hash, unsigned int *hash_len);
+
+/* cfs crypto hash descriptor */
+struct page;
+
+struct ahash_request *
+	cfs_crypto_hash_init(enum cfs_crypto_hash_alg hash_alg,
+			     unsigned char *key, unsigned int key_len);
+int cfs_crypto_hash_update_page(struct ahash_request *req,
+				struct page *page, unsigned int offset,
+				unsigned int len);
+int cfs_crypto_hash_update(struct ahash_request *req, const void *buf,
+			   unsigned int buf_len);
+int cfs_crypto_hash_final(struct ahash_request *req,
+			  unsigned char *hash, unsigned int *hash_len);
+int cfs_crypto_register(void);
+void cfs_crypto_unregister(void);
+int cfs_crypto_hash_speed(enum cfs_crypto_hash_alg hash_alg);
+#endif
diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_debug.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_debug.h
new file mode 100644
index 0000000000000..f7d5bd9dd0126
--- /dev/null
+++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_debug.h
@@ -0,0 +1,328 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * libcfs/include/libcfs/libcfs_debug.h + * + * Debug messages and assertions + * + */ + +#ifndef __LIBCFS_DEBUG_H__ +#define __LIBCFS_DEBUG_H__ + +#include +#include +#include + +/* + * Debugging + */ +extern unsigned int libcfs_subsystem_debug; +extern unsigned int libcfs_stack; +extern unsigned int libcfs_debug; +extern unsigned int libcfs_printk; +extern unsigned int libcfs_watchdog_ratelimit; +extern unsigned int libcfs_console_ratelimit; +extern unsigned int libcfs_console_max_delay; +extern unsigned int libcfs_console_min_delay; +extern unsigned int libcfs_console_backoff; +extern unsigned int libcfs_debug_binary; +extern char *libcfs_debug_file_path; + +struct task_struct; + +int libcfs_debug_mask2str(char *str, int size, int mask, int is_subsys); +int libcfs_debug_str2mask(int *mask, const char *str, int is_subsys); +void libcfs_debug_dumpstack(struct task_struct *tsk); + +/* Has there been an LBUG? */ +extern unsigned int libcfs_catastrophe; +extern unsigned int libcfs_panic_on_lbug; + +#ifndef DEBUG_SUBSYSTEM +# define DEBUG_SUBSYSTEM S_UNDEFINED +#endif + +#define CDEBUG_DEFAULT_MAX_DELAY (cfs_time_seconds(600)) /* jiffies */ +#define CDEBUG_DEFAULT_MIN_DELAY ((cfs_time_seconds(1) + 1) / 2) /* jiffies */ +#define CDEBUG_DEFAULT_BACKOFF 2 +struct cfs_debug_limit_state { + unsigned long cdls_next; + unsigned int cdls_delay; + int cdls_count; +}; + +struct libcfs_debug_msg_data { + const char *msg_file; + const char *msg_fn; + int msg_subsys; + int msg_line; + int msg_mask; + struct cfs_debug_limit_state *msg_cdls; +}; + +#define LIBCFS_DEBUG_MSG_DATA_INIT(file, func, line, msgdata, mask, cdls)\ +do { \ + (msgdata)->msg_subsys = DEBUG_SUBSYSTEM; \ + (msgdata)->msg_file = (file); \ + (msgdata)->msg_fn = (func); \ + (msgdata)->msg_line = (line); \ + (msgdata)->msg_mask = (mask); \ + (msgdata)->msg_cdls = (cdls); \ +} while (0) + +#define LIBCFS_DEBUG_MSG_DATA_DECL_LOC(file, func, line, msgdata, mask, cdls)\ + static struct libcfs_debug_msg_data msgdata = { \ + .msg_subsys = DEBUG_SUBSYSTEM, \ + .msg_file = (file), \ + .msg_fn = (func), \ + .msg_line = (line), \ + .msg_cdls = (cdls) }; \ + msgdata.msg_mask = (mask) + +#define LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, cdls) \ + LIBCFS_DEBUG_MSG_DATA_DECL_LOC(__FILE__, __func__, __LINE__, \ + msgdata, mask, cdls) + +#ifdef CDEBUG_ENABLED + +#if !defined(__x86_64__) +# ifdef __ia64__ +# define CDEBUG_STACK() (THREAD_SIZE - \ + ((unsigned long)__builtin_dwarf_cfa() & \ + (THREAD_SIZE - 1))) +# else +# define CDEBUG_STACK() (THREAD_SIZE - \ + ((unsigned long)__builtin_frame_address(0) & \ + (THREAD_SIZE - 1))) +# endif /* __ia64__ */ + +#define __CHECK_STACK_WITH_LOC(file, func, line, msgdata, mask, cdls) \ +do { \ + if (unlikely(CDEBUG_STACK() > libcfs_stack)) { \ + LIBCFS_DEBUG_MSG_DATA_INIT(file, func, line, msgdata, \ + D_WARNING, NULL); \ + libcfs_stack = CDEBUG_STACK(); \ + libcfs_debug_msg(msgdata, "maximum lustre stack %u\n", \ + libcfs_stack); \ + 
(msgdata)->msg_mask = mask; \ + (msgdata)->msg_cdls = cdls; \ + dump_stack(); \ + /*panic("LBUG");*/ \ + } \ +} while (0) +#else /* __x86_64__ */ +#define CDEBUG_STACK() (0L) +#define __CHECK_STACK_WITH_LOC(file, func, line, msgdata, mask, cdls) \ + do {} while (0) +#endif /* __x86_64__ */ + +#define CFS_CHECK_STACK(msgdata, mask, cdls) \ + __CHECK_STACK_WITH_LOC(__FILE__, __func__, __LINE__, \ + msgdata, mask, cdls) +/** + * Filters out logging messages based on mask and subsystem. + */ +static inline int cfs_cdebug_show(unsigned int mask, unsigned int subsystem) +{ + return mask & D_CANTMASK || + ((libcfs_debug & mask) && (libcfs_subsystem_debug & subsystem)); +} + +# define __CDEBUG_WITH_LOC(file, func, line, mask, cdls, format, ...) \ +do { \ + static struct libcfs_debug_msg_data msgdata; \ + \ + __CHECK_STACK_WITH_LOC(file, func, line, &msgdata, mask, cdls); \ + \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + LIBCFS_DEBUG_MSG_DATA_INIT(file, func, line, \ + &msgdata, mask, cdls); \ + libcfs_debug_msg(&msgdata, format, ## __VA_ARGS__); \ + } \ +} while (0) + +# define CDEBUG(mask, format, ...) \ + __CDEBUG_WITH_LOC(__FILE__, __func__, __LINE__, \ + mask, NULL, format, ## __VA_ARGS__) + +# define CDEBUG_LIMIT(mask, format, ...) \ +do { \ + static struct cfs_debug_limit_state cdls; \ + \ + __CDEBUG_WITH_LOC(__FILE__, __func__, __LINE__, \ + mask, &cdls, format, ## __VA_ARGS__); \ +} while (0) + +# else /* !CDEBUG_ENABLED */ +static inline int cfs_cdebug_show(unsigned int mask, unsigned int subsystem) +{ + return 0; +} +# define CDEBUG(mask, format, ...) (void)(0) +# define CDEBUG_LIMIT(mask, format, ...) (void)(0) +# warning "CDEBUG IS DISABLED. THIS SHOULD NEVER BE DONE FOR PRODUCTION!" +# endif /* CDEBUG_ENABLED */ + +/* + * Lustre Error Checksum: calculates checksum + * of Hex number by XORing each bit. + */ +#define LERRCHKSUM(hexnum) (((hexnum) & 0xf) ^ ((hexnum) >> 4 & 0xf) ^ \ + ((hexnum) >> 8 & 0xf)) + +#define CWARN(format, ...) CDEBUG_LIMIT(D_WARNING, format, ## __VA_ARGS__) +#define CERROR(format, ...) CDEBUG_LIMIT(D_ERROR, format, ## __VA_ARGS__) +#define CNETERR(format, a...) CDEBUG_LIMIT(D_NETERROR, format, ## a) +#define CEMERG(format, ...) CDEBUG_LIMIT(D_EMERG, format, ## __VA_ARGS__) + +#define LCONSOLE(mask, format, ...) CDEBUG(D_CONSOLE | (mask), format, ## __VA_ARGS__) +#define LCONSOLE_INFO(format, ...) CDEBUG_LIMIT(D_CONSOLE, format, ## __VA_ARGS__) +#define LCONSOLE_WARN(format, ...) CDEBUG_LIMIT(D_CONSOLE | D_WARNING, format, ## __VA_ARGS__) +#define LCONSOLE_ERROR_MSG(errnum, format, ...) CDEBUG_LIMIT(D_CONSOLE | D_ERROR, \ + "%x-%x: " format, errnum, LERRCHKSUM(errnum), ## __VA_ARGS__) +#define LCONSOLE_ERROR(format, ...) LCONSOLE_ERROR_MSG(0x00, format, ## __VA_ARGS__) + +#define LCONSOLE_EMERG(format, ...) \ + CDEBUG(D_CONSOLE | D_EMERG, format, ## __VA_ARGS__) + +int libcfs_debug_msg(struct libcfs_debug_msg_data *msgdata, + const char *format1, ...) 
+	__printf(2, 3);
+
+/* other external symbols that tracefile provides: */
+int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob,
+			     const char *knl_buffer, char *append);
+
+#define LIBCFS_DEBUG_FILE_PATH_DEFAULT "/tmp/lustre-log"
+
+#if defined(CDEBUG_ENTRY_EXIT)
+
+static inline long libcfs_log_return(struct libcfs_debug_msg_data *msgdata, long rc)
+{
+	libcfs_debug_msg(msgdata, "Process leaving (rc=%lu : %ld : %lx)\n",
+			 rc, rc, rc);
+	return rc;
+}
+
+static inline void libcfs_log_goto(struct libcfs_debug_msg_data *msgdata,
+				   const char *label, long rc)
+{
+	libcfs_debug_msg(msgdata,
+			 "Process leaving via %s (rc=%lu : %ld : %#lx)\n",
+			 label, rc, rc, rc);
+}
+
+# define GOTO(label, rc)						\
+do {									\
+	if (cfs_cdebug_show(D_TRACE, DEBUG_SUBSYSTEM)) {		\
+		LIBCFS_DEBUG_MSG_DATA_DECL(_goto_data, D_TRACE, NULL);	\
+		libcfs_log_goto(&_goto_data, #label, (long)(rc));	\
+	} else {							\
+		(void)(rc);						\
+	}								\
+									\
+	goto label;							\
+} while (0)
+
+# if BITS_PER_LONG > 32
+# define RETURN(rc)							\
+do {									\
+	if (cfs_cdebug_show(D_TRACE, DEBUG_SUBSYSTEM)) {		\
+		LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_TRACE, NULL);	\
+		return (typeof(rc))libcfs_log_return(&msgdata,		\
+						     (long)(rc));	\
+	}								\
+									\
+	return rc;							\
+} while (0)
+# else /* BITS_PER_LONG == 32 */
+/* We need an on-stack variable, because we cannot cast a 32-bit pointer
+ * directly to (long long) without generating a compiler warning/error, yet
+ * casting directly to (long) will truncate 64-bit return values. The log
+ * values will print as 32-bit values, but they always have been. LU-1436
+ */
+# define RETURN(rc)							\
+do {									\
+	if (cfs_cdebug_show(D_TRACE, DEBUG_SUBSYSTEM)) {		\
+		typeof(rc) __rc = (rc);					\
+		LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_TRACE, NULL);	\
+		libcfs_log_return(&msgdata, (long)__rc);		\
+		return __rc;						\
+	}								\
+									\
+	return rc;							\
+} while (0)
+
+# endif /* BITS_PER_LONG > 32 */
+
+# define ENTRY	CDEBUG(D_TRACE, "Process entered\n")
+# define EXIT	CDEBUG(D_TRACE, "Process leaving\n")
+
+#else /* !CDEBUG_ENTRY_EXIT */
+
+# define GOTO(label, rc)						\
+	do {								\
+		((void)(rc));						\
+		goto label;						\
+	} while (0)
+
+# define RETURN(rc) return (rc)
+# define ENTRY	do { } while (0)
+# define EXIT	do { } while (0)
+
+#endif /* CDEBUG_ENTRY_EXIT */
+
+#define RETURN_EXIT							\
+do {									\
+	EXIT;								\
+	return;								\
+} while (0)
+
+void cfs_debug_init(void);
+
+static inline void cfs_tty_write_msg(const char *msg)
+{
+	struct tty_struct *tty;
+
+	tty = get_current_tty();
+	if (!tty)
+		return;
+	mutex_lock(&tty->atomic_write_lock);
+	tty_lock(tty);
+	if (tty->ops->write && tty->count > 0)
+		tty->ops->write(tty, msg, strlen(msg));
+	tty_unlock(tty);
+	mutex_unlock(&tty->atomic_write_lock);
+	wake_up_interruptible_poll(&tty->write_wait, POLL_OUT);
+	tty_kref_put(tty);
+}
+
+#endif /* __LIBCFS_DEBUG_H__ */
diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_fail.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_fail.h
new file mode 100644
index 0000000000000..9e57506974d23
--- /dev/null
+++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_fail.h
@@ -0,0 +1,226 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Oracle Corporation, Inc. + */ + +#ifndef _LIBCFS_FAIL_H +#define _LIBCFS_FAIL_H + +extern unsigned long cfs_fail_loc; +extern unsigned int cfs_fail_val; +extern int cfs_fail_err; + +extern wait_queue_head_t cfs_race_waitq; +extern int cfs_race_state; + +int __cfs_fail_check_set(__u32 id, __u32 value, int set); +int __cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set); + +enum { + CFS_FAIL_LOC_NOSET = 0, + CFS_FAIL_LOC_ORSET = 1, + CFS_FAIL_LOC_RESET = 2, + CFS_FAIL_LOC_VALUE = 3 +}; + +/* Failure ranges + "0x0100 - 0x3fff" for Lustre + "0xe000 - 0xefff" for LNet + "0xf000 - 0xffff" for LNDs */ +/* Failure injection control */ +#define CFS_FAIL_MASK_SYS 0x0000FF00 +#define CFS_FAIL_MASK_LOC (0x000000FF | CFS_FAIL_MASK_SYS) + +#define CFS_FAILED_BIT 30 +/* CFS_FAILED is 0x40000000 */ +#define CFS_FAILED BIT(CFS_FAILED_BIT) + +#define CFS_FAIL_ONCE_BIT 31 +/* CFS_FAIL_ONCE is 0x80000000 */ +#define CFS_FAIL_ONCE BIT(CFS_FAIL_ONCE_BIT) + +/* The following flags aren't made to be combined */ +#define CFS_FAIL_SKIP 0x20000000 /* skip N times then fail */ +#define CFS_FAIL_SOME 0x10000000 /* only fail N times */ +#define CFS_FAIL_RAND 0x08000000 /* fail 1/N of the times */ +#define CFS_FAIL_USR1 0x04000000 /* user flag */ + +/* CFS_FAULT may be combined with any one of the above flags. 
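+ * For example, to make an (illustrative) fail site skip its first 10 hits
+ * and then trigger, a test would set something like:
+ *	cfs_fail_loc = some_fail_id | CFS_FAIL_SKIP;
+ *	cfs_fail_val = 10;
+ * where some_fail_id stands for a site-specific id in the ranges above.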
+ */
+#define CFS_FAULT 0x02000000 /* match any CFS_FAULT_CHECK */
+
+static inline bool CFS_FAIL_PRECHECK(__u32 id)
+{
+ return cfs_fail_loc != 0 &&
+ ((cfs_fail_loc & CFS_FAIL_MASK_LOC) == (id & CFS_FAIL_MASK_LOC) ||
+ (cfs_fail_loc & id & CFS_FAULT));
+}
+
+static inline int cfs_fail_check_set(__u32 id, __u32 value, int set, int quiet)
+{
+ unsigned long failed_once = cfs_fail_loc & CFS_FAILED; /* ok if racy */
+ int ret = 0;
+
+ if (unlikely(CFS_FAIL_PRECHECK(id) &&
+ (ret = __cfs_fail_check_set(id, value, set)))) {
+ if (quiet && failed_once) {
+ CDEBUG(D_INFO, "*** cfs_fail_loc=%x, val=%u***\n",
+ id, value);
+ } else {
+ LCONSOLE_INFO("*** cfs_fail_loc=%x, val=%u***\n",
+ id, value);
+ }
+ }
+
+ return ret;
+}
+
+/* If id hits cfs_fail_loc, return 1, otherwise return 0 */
+#define CFS_FAIL_CHECK(id) \
+ cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET, 0)
+#define CFS_FAIL_CHECK_QUIET(id) \
+ cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET, 1)
+
+/* If id hits cfs_fail_loc and cfs_fail_val == (-1 or value), return 1;
+ * otherwise return 0 */
+#define CFS_FAIL_CHECK_VALUE(id, value) \
+ cfs_fail_check_set(id, value, CFS_FAIL_LOC_VALUE, 0)
+#define CFS_FAIL_CHECK_VALUE_QUIET(id, value) \
+ cfs_fail_check_set(id, value, CFS_FAIL_LOC_VALUE, 1)
+
+/* If id hits cfs_fail_loc, cfs_fail_loc |= value and return 1;
+ * otherwise return 0 */
+#define CFS_FAIL_CHECK_ORSET(id, value) \
+ cfs_fail_check_set(id, value, CFS_FAIL_LOC_ORSET, 0)
+#define CFS_FAIL_CHECK_ORSET_QUIET(id, value) \
+ cfs_fail_check_set(id, value, CFS_FAIL_LOC_ORSET, 1)
+
+/* If id hits cfs_fail_loc, cfs_fail_loc = value and return 1;
+ * otherwise return 0 */
+#define CFS_FAIL_CHECK_RESET(id, value) \
+ cfs_fail_check_set(id, value, CFS_FAIL_LOC_RESET, 0)
+#define CFS_FAIL_CHECK_RESET_QUIET(id, value) \
+ cfs_fail_check_set(id, value, CFS_FAIL_LOC_RESET, 1)
+
+static inline int cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set)
+{
+ if (unlikely(CFS_FAIL_PRECHECK(id)))
+ return __cfs_fail_timeout_set(id, value, ms, set);
+ else
+ return 0;
+}
+
+/* If id hits cfs_fail_loc, sleep for seconds or milliseconds */
+#define CFS_FAIL_TIMEOUT(id, secs) \
+ cfs_fail_timeout_set(id, 0, (secs) * 1000, CFS_FAIL_LOC_NOSET)
+
+#define CFS_FAIL_TIMEOUT_MS(id, ms) \
+ cfs_fail_timeout_set(id, 0, ms, CFS_FAIL_LOC_NOSET)
+
+/* If id hits cfs_fail_loc, cfs_fail_loc |= value and
+ * sleep seconds or milliseconds */
+#define CFS_FAIL_TIMEOUT_ORSET(id, value, secs) \
+ cfs_fail_timeout_set(id, value, (secs) * 1000, CFS_FAIL_LOC_ORSET)
+
+#define CFS_FAIL_TIMEOUT_RESET(id, value, secs) \
+ cfs_fail_timeout_set(id, value, (secs) * 1000, CFS_FAIL_LOC_RESET)
+
+#define CFS_FAIL_TIMEOUT_MS_ORSET(id, value, ms) \
+ cfs_fail_timeout_set(id, value, ms, CFS_FAIL_LOC_ORSET)
+
+#define CFS_FAULT_CHECK(id) \
+ CFS_FAIL_CHECK(CFS_FAULT | (id))
+
+/* The idea here is to synchronise two threads to force a race. The
+ * first thread that calls this with a matching fail_loc is put to
+ * sleep. The next thread that calls with the same fail_loc wakes up
+ * the first and continues. */
+static inline void cfs_race(__u32 id)
+{
+ if (CFS_FAIL_PRECHECK(id)) {
+ if (unlikely(__cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET))) {
+ int rc;
+ cfs_race_state = 0;
+ CERROR("cfs_race id %x sleeping\n", id);
+ /*
+ * XXX: don't wait forever as there is no guarantee
+ * that this branch is executed first.
+ * For testing purposes this construction works well enough.
+ */
+ rc = wait_event_interruptible_timeout(cfs_race_waitq,
+ cfs_race_state != 0,
+ cfs_time_seconds(5));
+ CERROR("cfs_fail_race id %x awake: rc=%d\n", id, rc);
+ } else {
+ CERROR("cfs_fail_race id %x waking\n", id);
+ cfs_race_state = 1;
+ wake_up(&cfs_race_waitq);
+ }
+ }
+}
+#define CFS_RACE(id) cfs_race(id)
+
+/**
+ * Wait on race.
+ *
+ * The first thread that calls this with a matching fail_loc is put to sleep,
+ * but subsequent callers won't sleep. Once another thread calls
+ * cfs_race_wakeup(), the first thread is woken up and continues.
+ */
+static inline void cfs_race_wait(__u32 id)
+{
+ if (CFS_FAIL_PRECHECK(id)) {
+ if (unlikely(__cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET))) {
+ int rc;
+
+ cfs_race_state = 0;
+ CERROR("cfs_race id %x sleeping\n", id);
+ rc = wait_event_interruptible(cfs_race_waitq,
+ cfs_race_state != 0);
+ CERROR("cfs_fail_race id %x awake: rc=%d\n", id, rc);
+ }
+ }
+}
+#define CFS_RACE_WAIT(id) cfs_race_wait(id)
+
+/**
+ * Wake up the thread that is waiting on the matching fail_loc.
+ */
+static inline void cfs_race_wakeup(__u32 id)
+{
+ if (CFS_FAIL_PRECHECK(id)) {
+ if (likely(!__cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET))) {
+ CERROR("cfs_fail_race id %x waking\n", id);
+ cfs_race_state = 1;
+ wake_up(&cfs_race_waitq);
+ }
+ }
+}
+#define CFS_RACE_WAKEUP(id) cfs_race_wakeup(id)
+
+#endif /* _LIBCFS_FAIL_H */
diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_hash.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_hash.h
new file mode 100644
index 0000000000000..bdf3cdd37754f
--- /dev/null
+++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_hash.h
@@ -0,0 +1,869 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2015, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * libcfs/include/libcfs/libcfs_hash.h
+ *
+ * Hashing routines
+ *
+ */
+
+#ifndef __LIBCFS_HASH_H__
+#define __LIBCFS_HASH_H__
+
+#include
+#include
+#include
+
+/*
+ * Knuth recommends primes in approximately golden ratio to the maximum
+ * integer representable by a machine word for multiplicative hashing.
+ * Chuck Lever verified the effectiveness of this technique:
+ * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
+ *
+ * These primes are chosen to be bit-sparse, that is operations on
+ * them can use shifts and additions instead of multiplications for
+ * machines where multiplications are slow.
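+ *
+ * For example, with 128 hash lists (mask = 127) a 32-bit key is spread
+ * over the table by a single multiply and mask, which is exactly what
+ * cfs_hash_u32_hash() below reduces to:
+ *
+ *	bucket = (key * CFS_GOLDEN_RATIO_PRIME_32) & 127;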
+ */
+/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
+#define CFS_GOLDEN_RATIO_PRIME_32 0x9e370001UL
+/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
+#define CFS_GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001ULL
+
+/** disable debug */
+#define CFS_HASH_DEBUG_NONE 0
+/** record hash depth and output to console when it's too deep,
+ * computing overhead is low but it consumes more memory */
+#define CFS_HASH_DEBUG_1 1
+/** expensive, check key validation */
+#define CFS_HASH_DEBUG_2 2
+
+#define CFS_HASH_DEBUG_LEVEL CFS_HASH_DEBUG_NONE
+
+struct cfs_hash_ops;
+struct cfs_hash_lock_ops;
+struct cfs_hash_hlist_ops;
+
+union cfs_hash_lock {
+ rwlock_t rw; /**< rwlock */
+ spinlock_t spin; /**< spinlock */
+ struct rw_semaphore rw_sem; /**< rwsem */
+};
+
+/**
+ * cfs_hash_bucket is a container of:
+ * - lock, counter ...
+ * - array of hash-head starting from hsb_head[0], hash-head can be one of
+ * . struct cfs_hash_head
+ * . struct cfs_hash_head_dep
+ * . struct cfs_hash_dhead
+ * . struct cfs_hash_dhead_dep
+ * depending on the requirement of the user
+ * - some extra bytes (caller can require it while creating hash)
+ */
+struct cfs_hash_bucket {
+ union cfs_hash_lock hsb_lock; /**< bucket lock */
+ __u32 hsb_count; /**< current entries */
+ __u32 hsb_version; /**< change version */
+ unsigned int hsb_index; /**< index of bucket */
+ int hsb_depmax; /**< max depth on bucket */
+ long hsb_head[0]; /**< hash-head array */
+};
+
+/**
+ * cfs_hash bucket descriptor, it normally lives on the stack of the caller
+ */
+struct cfs_hash_bd {
+ /**< address of bucket */
+ struct cfs_hash_bucket *bd_bucket;
+ /**< offset in bucket */
+ unsigned int bd_offset;
+};
+
+#define CFS_HASH_NAME_LEN 16 /**< default name length */
+#define CFS_HASH_BIGNAME_LEN 64 /**< bigname for param tree */
+
+#define CFS_HASH_BKT_BITS 3 /**< default bits of bucket */
+#define CFS_HASH_BITS_MAX 30 /**< max bits of bucket */
+#define CFS_HASH_BITS_MIN CFS_HASH_BKT_BITS
+
+/**
+ * common hash attributes.
+ */
+enum cfs_hash_tag {
+ /**
+ * no lock needed, the caller will protect operations with its
+ * own lock. With this flag:
+ * . CFS_HASH_NO_BKTLOCK, CFS_HASH_RW_BKTLOCK, CFS_HASH_SPIN_BKTLOCK
+ * will be ignored.
+ * . Some functions will be disabled with this flag, i.e.:
+ * cfs_hash_for_each_empty, cfs_hash_rehash
+ */
+ CFS_HASH_NO_LOCK = BIT(0),
+ /** no bucket lock, use one spinlock to protect the whole hash */
+ CFS_HASH_NO_BKTLOCK = BIT(1),
+ /** rwlock to protect bucket */
+ CFS_HASH_RW_BKTLOCK = BIT(2),
+ /** spinlock to protect bucket */
+ CFS_HASH_SPIN_BKTLOCK = BIT(3),
+ /** always add new item to tail */
+ CFS_HASH_ADD_TAIL = BIT(4),
+ /** hash-table doesn't have refcount on item */
+ CFS_HASH_NO_ITEMREF = BIT(5),
+ /** big name for param-tree */
+ CFS_HASH_BIGNAME = BIT(6),
+ /** track global count */
+ CFS_HASH_COUNTER = BIT(7),
+ /** rehash item by new key */
+ CFS_HASH_REHASH_KEY = BIT(8),
+ /** Enable dynamic hash resizing */
+ CFS_HASH_REHASH = BIT(9),
+ /** can shrink hash-size */
+ CFS_HASH_SHRINK = BIT(10),
+ /** assert hash is empty on exit */
+ CFS_HASH_ASSERT_EMPTY = BIT(11),
+ /** record hlist depth */
+ CFS_HASH_DEPTH = BIT(12),
+ /**
+ * rehash is always scheduled in a different thread, so concurrent
+ * changes on the hash table are non-blocking
+ */
+ CFS_HASH_NBLK_CHANGE = BIT(13),
+ /** rw semaphore lock to protect bucket */
+ CFS_HASH_RW_SEM_BKTLOCK = BIT(14),
+ /** NB, hs_flags is typed as __u16; please change that
+ * if you ever need more than 16 flags
+ */
+};
+
+/** most used attributes */
+#define CFS_HASH_DEFAULT (CFS_HASH_RW_BKTLOCK | \
+ CFS_HASH_COUNTER | CFS_HASH_REHASH)
+
+/**
+ * cfs_hash is a general-purpose hash-table implementation. It supports:
+ * . two refcount modes
+ * hash-table with & without refcount
+ * . four lock modes
+ * nolock, one-spinlock, rw-bucket-lock, spin-bucket-lock
+ * . general operations
+ * lookup, add (add_tail or add_head), delete
+ * . rehash
+ * grow or shrink
+ * . iteration
+ * locked iteration and unlocked iteration
+ * . bigname
+ * support long name hash
+ * . debug
+ * trace max searching depth
+ *
+ * Rehash:
+ * When the htable grows or shrinks, a separate task (cfs_hash_rehash_worker)
+ * is spawned to handle the rehash in the background. Other processes can
+ * concurrently perform additions, deletions, and lookups without being
+ * blocked on rehash completion, because rehash releases
+ * the global wrlock for each bucket.
+ *
+ * Rehash and iteration can't run at the same time because it's too tricky
+ * to keep both of them safe and correct.
+ * They are relatively rare operations, so:
+ * . if iteration is in progress while we try to launch rehash, rehash
+ * just gives up; the iterator will launch rehash at the end.
+ * . if rehash is in progress while we try to iterate the hash table,
+ * then we just wait (it shouldn't take very long); anyway, nobody
+ * should expect iteration of the whole hash-table to be non-blocking.
+ *
+ * During rehashing, a (key,object) pair may be in one of two buckets,
+ * depending on whether the worker task has yet to transfer the object
+ * to its new location in the table. Lookups and deletions need to search both
+ * locations; additions must take care to only insert into the new bucket.
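+ *
+ * A minimal sketch of what this means for a caller doing a lookup while
+ * a rehash may be in flight, using the dual bucket-descriptor helpers
+ * declared further below:
+ *
+ *	struct cfs_hash_bd bds[2];
+ *	struct hlist_node *hnode;
+ *
+ *	cfs_hash_dual_bd_get_and_lock(hs, key, bds, 0);
+ *	hnode = cfs_hash_dual_bd_lookup_locked(hs, bds, key);
+ *	cfs_hash_dual_bd_unlock(hs, bds, 0);
+ *
+ * The dual helpers search the old and (if present) the new table, so the
+ * caller never needs to know which bucket currently holds the object.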
+ */ + +struct cfs_hash { + /** serialize with rehash, or serialize all operations if + * the hash-table has CFS_HASH_NO_BKTLOCK */ + union cfs_hash_lock hs_lock; + /** hash operations */ + struct cfs_hash_ops *hs_ops; + /** hash lock operations */ + struct cfs_hash_lock_ops *hs_lops; + /** hash list operations */ + struct cfs_hash_hlist_ops *hs_hops; + /** hash buckets-table */ + struct cfs_hash_bucket **hs_buckets; + /** total number of items on this hash-table */ + atomic_t hs_count; + /** hash flags, see cfs_hash_tag for detail */ + __u16 hs_flags; + /** # of extra-bytes for bucket, for user saving extended attributes */ + __u16 hs_extra_bytes; + /** wants to iterate */ + __u8 hs_iterating; + /** hash-table is dying */ + __u8 hs_exiting; + /** current hash bits */ + __u8 hs_cur_bits; + /** min hash bits */ + __u8 hs_min_bits; + /** max hash bits */ + __u8 hs_max_bits; + /** bits for rehash */ + __u8 hs_rehash_bits; + /** bits for each bucket */ + __u8 hs_bkt_bits; + /** resize min threshold */ + __u16 hs_min_theta; + /** resize max threshold */ + __u16 hs_max_theta; + /** resize count */ + __u32 hs_rehash_count; + /** # of iterators (caller of cfs_hash_for_each_*) */ + __u32 hs_iterators; + /** rehash workitem */ + struct work_struct hs_rehash_work; + /** refcount on this hash table */ + atomic_t hs_refcount; + /** rehash buckets-table */ + struct cfs_hash_bucket **hs_rehash_buckets; +#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 + /** serialize debug members */ + spinlock_t hs_dep_lock; + /** max depth */ + unsigned int hs_dep_max; + /** id of the deepest bucket */ + unsigned int hs_dep_bkt; + /** offset in the deepest bucket */ + unsigned int hs_dep_off; + /** bits when we found the max depth */ + unsigned int hs_dep_bits; + /** workitem to output max depth */ + struct work_struct hs_dep_work; +#endif + /** name of htable */ + char hs_name[0]; +}; + +struct cfs_hash_lock_ops { + /** lock the hash table */ + void (*hs_lock)(union cfs_hash_lock *lock, int exclusive); + /** unlock the hash table */ + void (*hs_unlock)(union cfs_hash_lock *lock, int exclusive); + /** lock the hash bucket */ + void (*hs_bkt_lock)(union cfs_hash_lock *lock, int exclusive); + /** unlock the hash bucket */ + void (*hs_bkt_unlock)(union cfs_hash_lock *lock, int exclusive); +}; + +struct cfs_hash_hlist_ops { + /** return hlist_head of hash-head of @bd */ + struct hlist_head *(*hop_hhead)(struct cfs_hash *hs, struct cfs_hash_bd *bd); + /** return hash-head size */ + int (*hop_hhead_size)(struct cfs_hash *hs); + /** add @hnode to hash-head of @bd */ + int (*hop_hnode_add)(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode); + /** remove @hnode from hash-head of @bd */ + int (*hop_hnode_del)(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode); +}; + +struct cfs_hash_ops { + /** return hashed value from @key */ + unsigned (*hs_hash)(struct cfs_hash *hs, const void *key, unsigned mask); + /** return key address of @hnode */ + void * (*hs_key)(struct hlist_node *hnode); + /** copy key from @hnode to @key */ + void (*hs_keycpy)(struct hlist_node *hnode, void *key); + /** + * compare @key with key of @hnode + * returns 1 on a match + */ + int (*hs_keycmp)(const void *key, struct hlist_node *hnode); + /** return object address of @hnode, i.e: container_of(...hnode) */ + void * (*hs_object)(struct hlist_node *hnode); + /** get refcount of item, always called with holding bucket-lock */ + void (*hs_get)(struct cfs_hash *hs, struct hlist_node *hnode); + /** release refcount of item 
*/ + void (*hs_put)(struct cfs_hash *hs, struct hlist_node *hnode); + /** release refcount of item, always called with holding bucket-lock */ + void (*hs_put_locked)(struct cfs_hash *hs, struct hlist_node *hnode); + /** it's called before removing of @hnode */ + void (*hs_exit)(struct cfs_hash *hs, struct hlist_node *hnode); +}; + +/** total number of buckets in @hs */ +#define CFS_HASH_NBKT(hs) \ + (1U << ((hs)->hs_cur_bits - (hs)->hs_bkt_bits)) + +/** total number of buckets in @hs while rehashing */ +#define CFS_HASH_RH_NBKT(hs) \ + (1U << ((hs)->hs_rehash_bits - (hs)->hs_bkt_bits)) + +/** number of hlist for in bucket */ +#define CFS_HASH_BKT_NHLIST(hs) (1U << (hs)->hs_bkt_bits) + +/** total number of hlist in @hs */ +#define CFS_HASH_NHLIST(hs) (1U << (hs)->hs_cur_bits) + +/** total number of hlist in @hs while rehashing */ +#define CFS_HASH_RH_NHLIST(hs) (1U << (hs)->hs_rehash_bits) + +static inline int +cfs_hash_with_no_lock(struct cfs_hash *hs) +{ + /* caller will serialize all operations for this hash-table */ + return (hs->hs_flags & CFS_HASH_NO_LOCK) != 0; +} + +static inline int +cfs_hash_with_no_bktlock(struct cfs_hash *hs) +{ + /* no bucket lock, one single lock to protect the hash-table */ + return (hs->hs_flags & CFS_HASH_NO_BKTLOCK) != 0; +} + +static inline int +cfs_hash_with_rw_bktlock(struct cfs_hash *hs) +{ + /* rwlock to protect hash bucket */ + return (hs->hs_flags & CFS_HASH_RW_BKTLOCK) != 0; +} + +static inline int +cfs_hash_with_spin_bktlock(struct cfs_hash *hs) +{ + /* spinlock to protect hash bucket */ + return (hs->hs_flags & CFS_HASH_SPIN_BKTLOCK) != 0; +} + +static inline int +cfs_hash_with_rw_sem_bktlock(struct cfs_hash *hs) +{ + /* rw sem lock to protect hash bucket */ + return (hs->hs_flags & CFS_HASH_RW_SEM_BKTLOCK) != 0; +} + +static inline int +cfs_hash_with_add_tail(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_ADD_TAIL) != 0; +} + +static inline int +cfs_hash_with_no_itemref(struct cfs_hash *hs) +{ + /* hash-table doesn't keep refcount on item, + * item can't be removed from hash unless it's + * ZERO refcount */ + return (hs->hs_flags & CFS_HASH_NO_ITEMREF) != 0; +} + +static inline int +cfs_hash_with_bigname(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_BIGNAME) != 0; +} + +static inline int +cfs_hash_with_counter(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_COUNTER) != 0; +} + +static inline int +cfs_hash_with_rehash(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_REHASH) != 0; +} + +static inline int +cfs_hash_with_rehash_key(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_REHASH_KEY) != 0; +} + +static inline int +cfs_hash_with_shrink(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_SHRINK) != 0; +} + +static inline int +cfs_hash_with_assert_empty(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_ASSERT_EMPTY) != 0; +} + +static inline int +cfs_hash_with_depth(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_DEPTH) != 0; +} + +static inline int +cfs_hash_with_nblk_change(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_NBLK_CHANGE) != 0; +} + +static inline int +cfs_hash_is_exiting(struct cfs_hash *hs) +{ /* cfs_hash_destroy is called */ + return hs->hs_exiting; +} + +static inline int +cfs_hash_is_rehashing(struct cfs_hash *hs) +{ /* rehash is launched */ + return hs->hs_rehash_bits != 0; +} + +static inline int +cfs_hash_is_iterating(struct cfs_hash *hs) +{ /* someone is calling cfs_hash_for_each_* */ + return hs->hs_iterating || hs->hs_iterators != 0; +} 
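+
+/*
+ * Illustrative sketch only: for a hypothetical object that embeds a
+ * struct hlist_node and is keyed by a __u32, the two central methods
+ * of its cfs_hash_ops could look like:
+ *
+ *	static unsigned foo_hash(struct cfs_hash *hs, const void *key,
+ *				 unsigned mask)
+ *	{
+ *		return cfs_hash_u32_hash(*(const __u32 *)key, mask);
+ *	}
+ *
+ *	static void *foo_object(struct hlist_node *hnode)
+ *	{
+ *		return container_of(hnode, struct foo, foo_hnode);
+ *	}
+ *
+ * with hs_key/hs_keycmp returning and comparing &foo->foo_key, and
+ * hs_get/hs_put taking and dropping a reference on the object.
+ */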
+ +static inline int +cfs_hash_bkt_size(struct cfs_hash *hs) +{ + return offsetof(struct cfs_hash_bucket, hsb_head[0]) + + hs->hs_hops->hop_hhead_size(hs) * CFS_HASH_BKT_NHLIST(hs) + + hs->hs_extra_bytes; +} + +static inline unsigned +cfs_hash_id(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return hs->hs_ops->hs_hash(hs, key, mask); +} + +static inline void * +cfs_hash_key(struct cfs_hash *hs, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_key(hnode); +} + +static inline void +cfs_hash_keycpy(struct cfs_hash *hs, struct hlist_node *hnode, void *key) +{ + if (hs->hs_ops->hs_keycpy != NULL) + hs->hs_ops->hs_keycpy(hnode, key); +} + +/** + * Returns 1 on a match, + */ +static inline int +cfs_hash_keycmp(struct cfs_hash *hs, const void *key, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_keycmp(key, hnode); +} + +static inline void * +cfs_hash_object(struct cfs_hash *hs, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_object(hnode); +} + +static inline void +cfs_hash_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_get(hs, hnode); +} + +static inline void +cfs_hash_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_put_locked(hs, hnode); +} + +static inline void +cfs_hash_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_put(hs, hnode); +} + +static inline void +cfs_hash_exit(struct cfs_hash *hs, struct hlist_node *hnode) +{ + if (hs->hs_ops->hs_exit) + hs->hs_ops->hs_exit(hs, hnode); +} + +static inline void cfs_hash_lock(struct cfs_hash *hs, int excl) +{ + hs->hs_lops->hs_lock(&hs->hs_lock, excl); +} + +static inline void cfs_hash_unlock(struct cfs_hash *hs, int excl) +{ + hs->hs_lops->hs_unlock(&hs->hs_lock, excl); +} + +static inline int cfs_hash_dec_and_lock(struct cfs_hash *hs, + atomic_t *condition) +{ + LASSERT(cfs_hash_with_no_bktlock(hs)); + return atomic_dec_and_lock(condition, &hs->hs_lock.spin); +} + +static inline void cfs_hash_bd_lock(struct cfs_hash *hs, + struct cfs_hash_bd *bd, int excl) +{ + hs->hs_lops->hs_bkt_lock(&bd->bd_bucket->hsb_lock, excl); +} + +static inline void cfs_hash_bd_unlock(struct cfs_hash *hs, + struct cfs_hash_bd *bd, int excl) +{ + hs->hs_lops->hs_bkt_unlock(&bd->bd_bucket->hsb_lock, excl); +} + +/** + * operations on cfs_hash bucket (bd: bucket descriptor), + * they are normally for hash-table without rehash + */ +void cfs_hash_bd_get(struct cfs_hash *hs, const void *key, + struct cfs_hash_bd *bd); + +static inline void +cfs_hash_bd_get_and_lock(struct cfs_hash *hs, const void *key, + struct cfs_hash_bd *bd, int excl) +{ + cfs_hash_bd_get(hs, key, bd); + cfs_hash_bd_lock(hs, bd, excl); +} + +static inline unsigned +cfs_hash_bd_index_get(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + return bd->bd_offset | (bd->bd_bucket->hsb_index << hs->hs_bkt_bits); +} + +static inline void +cfs_hash_bd_index_set(struct cfs_hash *hs, unsigned index, + struct cfs_hash_bd *bd) +{ + bd->bd_bucket = hs->hs_buckets[index >> hs->hs_bkt_bits]; + bd->bd_offset = index & (CFS_HASH_BKT_NHLIST(hs) - 1U); +} + +static inline void * +cfs_hash_bd_extra_get(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + return (void *)bd->bd_bucket + + cfs_hash_bkt_size(hs) - hs->hs_extra_bytes; +} + +static inline __u32 +cfs_hash_bd_version_get(struct cfs_hash_bd *bd) +{ + /* need hold cfs_hash_bd_lock */ + return bd->bd_bucket->hsb_version; +} + +static inline __u32 +cfs_hash_bd_count_get(struct cfs_hash_bd *bd) +{ + /* need hold cfs_hash_bd_lock */ + return 
bd->bd_bucket->hsb_count; +} + +static inline int +cfs_hash_bd_depmax_get(struct cfs_hash_bd *bd) +{ + return bd->bd_bucket->hsb_depmax; +} + +static inline int +cfs_hash_bd_compare(struct cfs_hash_bd *bd1, struct cfs_hash_bd *bd2) +{ + if (bd1->bd_bucket->hsb_index != bd2->bd_bucket->hsb_index) + return bd1->bd_bucket->hsb_index - bd2->bd_bucket->hsb_index; + + if (bd1->bd_offset != bd2->bd_offset) + return bd1->bd_offset - bd2->bd_offset; + + return 0; +} + +void cfs_hash_bd_add_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode); +void cfs_hash_bd_del_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode); +void cfs_hash_bd_move_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd_old, + struct cfs_hash_bd *bd_new, + struct hlist_node *hnode); + +static inline int +cfs_hash_bd_dec_and_lock(struct cfs_hash *hs, struct cfs_hash_bd *bd, + atomic_t *condition) +{ + LASSERT(cfs_hash_with_spin_bktlock(hs)); + return atomic_dec_and_lock(condition, &bd->bd_bucket->hsb_lock.spin); +} + +static inline struct hlist_head * +cfs_hash_bd_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + return hs->hs_hops->hop_hhead(hs, bd); +} + +struct hlist_node * +cfs_hash_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + const void *key); +struct hlist_node * +cfs_hash_bd_peek_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + const void *key); +struct hlist_node * +cfs_hash_bd_findadd_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + const void *key, struct hlist_node *hnode, + int insist_add); +struct hlist_node * +cfs_hash_bd_finddel_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + const void *key, struct hlist_node *hnode); + +/** + * operations on cfs_hash bucket (bd: bucket descriptor), + * they are safe for hash-table with rehash + */ +void cfs_hash_dual_bd_get(struct cfs_hash *hs, const void *key, + struct cfs_hash_bd *bds); +void cfs_hash_dual_bd_lock(struct cfs_hash *hs, struct cfs_hash_bd *bds, + int excl); +void cfs_hash_dual_bd_unlock(struct cfs_hash *hs, struct cfs_hash_bd *bds, + int excl); + +static inline void +cfs_hash_dual_bd_get_and_lock(struct cfs_hash *hs, const void *key, + struct cfs_hash_bd *bds, int excl) +{ + cfs_hash_dual_bd_get(hs, key, bds); + cfs_hash_dual_bd_lock(hs, bds, excl); +} + +struct hlist_node * +cfs_hash_dual_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + const void *key); +struct hlist_node * +cfs_hash_dual_bd_findadd_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + const void *key, struct hlist_node *hnode, + int insist_add); +struct hlist_node * +cfs_hash_dual_bd_finddel_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + const void *key, struct hlist_node *hnode); + +/* Hash init/cleanup functions */ +struct cfs_hash * +cfs_hash_create(char *name, unsigned cur_bits, unsigned max_bits, + unsigned bkt_bits, unsigned extra_bytes, + unsigned min_theta, unsigned max_theta, + struct cfs_hash_ops *ops, unsigned flags); + +struct cfs_hash *cfs_hash_getref(struct cfs_hash *hs); +void cfs_hash_putref(struct cfs_hash *hs); + +/* Hash addition functions */ +void cfs_hash_add(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode); +int cfs_hash_add_unique(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode); +void *cfs_hash_findadd_unique(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode); + +/* Hash deletion functions */ +void *cfs_hash_del(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode); +void 
*cfs_hash_del_key(struct cfs_hash *hs, const void *key); + +/* Hash lookup/for_each functions */ +#define CFS_HASH_LOOP_HOG 1024 + +typedef int (*cfs_hash_for_each_cb_t)(struct cfs_hash *hs, + struct cfs_hash_bd *bd, + struct hlist_node *node, + void *data); +void * +cfs_hash_lookup(struct cfs_hash *hs, const void *key); +void +cfs_hash_for_each(struct cfs_hash *hs, cfs_hash_for_each_cb_t, void *data); +void +cfs_hash_for_each_safe(struct cfs_hash *hs, cfs_hash_for_each_cb_t, void *data); +int +cfs_hash_for_each_nolock(struct cfs_hash *hs, cfs_hash_for_each_cb_t, + void *data, int start); +int +cfs_hash_for_each_empty(struct cfs_hash *hs, cfs_hash_for_each_cb_t, + void *data); +void +cfs_hash_for_each_key(struct cfs_hash *hs, const void *key, + cfs_hash_for_each_cb_t, void *data); +typedef int (*cfs_hash_cond_opt_cb_t)(void *obj, void *data); +void +cfs_hash_cond_del(struct cfs_hash *hs, cfs_hash_cond_opt_cb_t, void *data); + +void +cfs_hash_hlist_for_each(struct cfs_hash *hs, unsigned hindex, + cfs_hash_for_each_cb_t, void *data); +int cfs_hash_is_empty(struct cfs_hash *hs); +__u64 cfs_hash_size_get(struct cfs_hash *hs); + +/* + * Rehash - Theta is calculated to be the average chained + * hash depth assuming a perfectly uniform hash function. + */ +void cfs_hash_rehash_cancel_locked(struct cfs_hash *hs); +void cfs_hash_rehash_cancel(struct cfs_hash *hs); +void cfs_hash_rehash(struct cfs_hash *hs, int do_rehash); +void cfs_hash_rehash_key(struct cfs_hash *hs, const void *old_key, + void *new_key, struct hlist_node *hnode); + +#if CFS_HASH_DEBUG_LEVEL > CFS_HASH_DEBUG_1 +/* Validate hnode references the correct key */ +static inline void +cfs_hash_key_validate(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode) +{ + LASSERT(cfs_hash_keycmp(hs, key, hnode)); +} + +/* Validate hnode is in the correct bucket */ +static inline void +cfs_hash_bucket_validate(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + struct cfs_hash_bd bds[2]; + + cfs_hash_dual_bd_get(hs, cfs_hash_key(hs, hnode), bds); + LASSERT(bds[0].bd_bucket == bd->bd_bucket || + bds[1].bd_bucket == bd->bd_bucket); +} + +#else /* CFS_HASH_DEBUG_LEVEL > CFS_HASH_DEBUG_1 */ + +static inline void +cfs_hash_key_validate(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode) {} + +static inline void +cfs_hash_bucket_validate(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) {} + +#endif /* CFS_HASH_DEBUG_LEVEL */ + +#define CFS_HASH_THETA_BITS 10 +#define CFS_HASH_MIN_THETA (1U << (CFS_HASH_THETA_BITS - 1)) +#define CFS_HASH_MAX_THETA (1U << (CFS_HASH_THETA_BITS + 1)) + +/* Return integer component of theta */ +static inline int __cfs_hash_theta_int(int theta) +{ + return (theta >> CFS_HASH_THETA_BITS); +} + +/* Return a fractional value between 0 and 999 */ +static inline int __cfs_hash_theta_frac(int theta) +{ + return ((theta * 1000) >> CFS_HASH_THETA_BITS) - + (__cfs_hash_theta_int(theta) * 1000); +} + +static inline int __cfs_hash_theta(struct cfs_hash *hs) +{ + return (atomic_read(&hs->hs_count) << + CFS_HASH_THETA_BITS) >> hs->hs_cur_bits; +} + +static inline void +__cfs_hash_set_theta(struct cfs_hash *hs, int min, int max) +{ + LASSERT(min < max); + hs->hs_min_theta = (__u16)min; + hs->hs_max_theta = (__u16)max; +} + +/* Generic debug formatting routines mainly for proc handler */ +struct seq_file; +void cfs_hash_debug_header(struct seq_file *m); +void cfs_hash_debug_str(struct cfs_hash *hs, struct seq_file *m); + +/* + * Generic djb2 hash algorithm for 
character arrays. + */ +static inline unsigned +cfs_hash_djb2_hash(const void *key, size_t size, unsigned mask) +{ + unsigned i, hash = 5381; + + LASSERT(key != NULL); + + for (i = 0; i < size; i++) + hash = hash * 33 + ((char *)key)[i]; + + return (hash & mask); +} + +/* + * Generic u32 hash algorithm. + */ +static inline unsigned +cfs_hash_u32_hash(const __u32 key, unsigned mask) +{ + return ((key * CFS_GOLDEN_RATIO_PRIME_32) & mask); +} + +/* + * Generic u64 hash algorithm. + */ +static inline unsigned +cfs_hash_u64_hash(const __u64 key, unsigned mask) +{ + return ((unsigned)(key * CFS_GOLDEN_RATIO_PRIME_64) & mask); +} + +/** iterate over all buckets in @bds (array of struct cfs_hash_bd) */ +#define cfs_hash_for_each_bd(bds, n, i) \ + for (i = 0; i < n && (bds)[i].bd_bucket != NULL; i++) + +/** iterate over all buckets of @hs */ +#define cfs_hash_for_each_bucket(hs, bd, pos) \ + for (pos = 0; \ + pos < CFS_HASH_NBKT(hs) && \ + ((bd)->bd_bucket = (hs)->hs_buckets[pos]) != NULL; pos++) + +/** iterate over all hlist of bucket @bd */ +#define cfs_hash_bd_for_each_hlist(hs, bd, hlist) \ + for ((bd)->bd_offset = 0; \ + (bd)->bd_offset < CFS_HASH_BKT_NHLIST(hs) && \ + (hlist = cfs_hash_bd_hhead(hs, bd)) != NULL; \ + (bd)->bd_offset++) + +/* !__LIBCFS__HASH_H__ */ +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_private.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_private.h new file mode 100644 index 0000000000000..a60f14286f511 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_private.h @@ -0,0 +1,341 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * libcfs/include/libcfs/libcfs_private.h + * + * Various defines for libcfs. + * + */ + +#ifndef __LIBCFS_PRIVATE_H__ +#define __LIBCFS_PRIVATE_H__ + +#ifndef DEBUG_SUBSYSTEM +# define DEBUG_SUBSYSTEM S_UNDEFINED +#endif + +#include +#include + +#ifdef LIBCFS_DEBUG + +/* + * When this is on, LASSERT macro includes check for assignment used instead + * of equality check, but doesn't have unlikely(). Turn this on from time to + * time to make test-builds. This shouldn't be on for production release. + */ +#define LASSERT_CHECKED (0) + +#if LASSERT_CHECKED +/* + * Assertion. + * + * Strange construction with empty "then" clause is used to trigger compiler + * warnings on the assertions of the form LASSERT(a = b); + * + * "warning: suggest parentheses around assignment used as truth value" + * + * requires -Wall. Unfortunately this rules out use of likely/unlikely. 
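+ *
+ * For example, with LASSERT_CHECKED enabled an accidental assignment
+ * such as LASSERT(rc = 0) draws exactly that warning, while the
+ * intended LASSERT(rc == 0) compiles silently.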
+ */ +#define LASSERTF(cond, fmt, ...) \ +do { \ + if (cond) \ + ; \ + else { \ + LIBCFS_DEBUG_MSG_DATA_DECL(__msg_data, D_EMERG, NULL); \ + libcfs_debug_msg(&__msg_data, \ + "ASSERTION( %s ) failed: " fmt, #cond, \ + ## __VA_ARGS__); \ + lbug_with_loc(&__msg_data); \ + } \ +} while (0) + +#define LASSERT(cond) LASSERTF(cond, "\n") + +#else /* !LASSERT_CHECKED */ + +#define LASSERTF(cond, fmt, ...) \ +do { \ + if (unlikely(!(cond))) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(__msg_data, D_EMERG, NULL); \ + libcfs_debug_msg(&__msg_data, \ + "ASSERTION( %s ) failed: " fmt, #cond, \ + ## __VA_ARGS__); \ + lbug_with_loc(&__msg_data); \ + } \ +} while (0) + +#define LASSERT(cond) LASSERTF(cond, "\n") +#endif /* !LASSERT_CHECKED */ +#else /* !LIBCFS_DEBUG */ +/* sizeof is to use expression without evaluating it. */ +# define LASSERT(e) ((void)sizeof!!(e)) +# define LASSERTF(cond, ...) ((void)sizeof!!(cond)) +#endif /* !LIBCFS_DEBUG */ + +#ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK +/** + * This is for more expensive checks that one doesn't want to be enabled all + * the time. LINVRNT() has to be explicitly enabled by --enable-invariants + * configure option. + */ +# define LINVRNT(exp) LASSERT(exp) +#else +# define LINVRNT(exp) ((void)sizeof!!(exp)) +#endif + +void __noreturn lbug_with_loc(struct libcfs_debug_msg_data *msg); + +#define LBUG() \ +do { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_EMERG, NULL); \ + lbug_with_loc(&msgdata); \ +} while(0) + +/* + * Memory + */ +#ifdef LIBCFS_DEBUG + +extern atomic64_t libcfs_kmem; + +# define libcfs_kmem_inc(ptr, size) \ +do { \ + atomic64_add(size, &libcfs_kmem); \ +} while (0) + +# define libcfs_kmem_dec(ptr, size) \ +do { \ + atomic64_sub(size, &libcfs_kmem); \ +} while (0) + +# define libcfs_kmem_read() \ + (long long)atomic64_read(&libcfs_kmem) + +#else +# define libcfs_kmem_inc(ptr, size) do {} while (0) +# define libcfs_kmem_dec(ptr, size) do {} while (0) +# define libcfs_kmem_read() (0) +#endif /* LIBCFS_DEBUG */ + +#ifndef LIBCFS_VMALLOC_SIZE +#define LIBCFS_VMALLOC_SIZE (2 << PAGE_SHIFT) /* 2 pages */ +#endif + +#define LIBCFS_ALLOC_PRE(size, mask) \ +do { \ + LASSERT(!in_interrupt() || \ + ((size) <= LIBCFS_VMALLOC_SIZE && \ + ((mask) & GFP_ATOMIC)) != 0); \ +} while (0) + +#define LIBCFS_ALLOC_POST(ptr, size) \ +do { \ + if (unlikely((ptr) == NULL)) { \ + CERROR("LNET: out of memory at %s:%d (tried to alloc '" \ + #ptr "' = %d)\n", __FILE__, __LINE__, (int)(size)); \ + CERROR("LNET: %lld total bytes allocated by lnet\n", \ + libcfs_kmem_read()); \ + } else { \ + libcfs_kmem_inc((ptr), (size)); \ + CDEBUG(D_MALLOC, "alloc '" #ptr "': %d at %p (tot %lld).\n", \ + (int)(size), (ptr), libcfs_kmem_read()); \ + } \ +} while (0) + +/** + * allocate memory with GFP flags @mask + * The allocated memory is zeroed-out. + */ +#define LIBCFS_ALLOC_GFP(ptr, size, mask) \ +do { \ + LIBCFS_ALLOC_PRE((size), (mask)); \ + (ptr) = (size) <= LIBCFS_VMALLOC_SIZE ? \ + kzalloc((size), (mask)) : vzalloc(size); \ + LIBCFS_ALLOC_POST((ptr), (size)); \ +} while (0) + +/** + * default allocator + */ +#define LIBCFS_ALLOC(ptr, size) \ + LIBCFS_ALLOC_GFP(ptr, size, GFP_NOFS) + +/** + * non-sleeping allocator + */ +#define LIBCFS_ALLOC_ATOMIC(ptr, size) \ + LIBCFS_ALLOC_GFP(ptr, size, GFP_ATOMIC) + +/** + * allocate memory for specified CPU partition + * \a cptab != NULL, \a cpt is CPU partition id of \a cptab + * \a cptab == NULL, \a cpt is HW NUMA node id + * The allocated memory is zeroed-out. 
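+ *
+ * For example, allocating a hypothetical per-partition object from the
+ * memory of CPU partition \a cpt (with the wrapper defined below):
+ *
+ *	struct foo *fp;
+ *
+ *	LIBCFS_CPT_ALLOC(fp, cptab, cpt, sizeof(*fp));
+ *	if (fp == NULL)
+ *		return -ENOMEM;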
+ */ +#define LIBCFS_CPT_ALLOC_GFP(ptr, cptab, cpt, size, mask) \ +do { \ + LIBCFS_ALLOC_PRE((size), (mask)); \ + (ptr) = (size) <= LIBCFS_VMALLOC_SIZE ? \ + cfs_cpt_malloc((cptab), (cpt), (size), (mask) | __GFP_ZERO) : \ + cfs_cpt_vzalloc((cptab), (cpt), (size)); \ + LIBCFS_ALLOC_POST((ptr), (size)); \ +} while (0) + +/** default numa allocator */ +#define LIBCFS_CPT_ALLOC(ptr, cptab, cpt, size) \ + LIBCFS_CPT_ALLOC_GFP(ptr, cptab, cpt, size, GFP_NOFS) + +void init_libcfs_vfree_atomic(void); +void exit_libcfs_vfree_atomic(void); + +#define LIBCFS_FREE(ptr, size) \ +do { \ + int s = (size); \ + if (unlikely((ptr) == NULL)) { \ + CERROR("LIBCFS: free NULL '" #ptr "' (%d bytes) at " \ + "%s:%d\n", s, __FILE__, __LINE__); \ + break; \ + } \ + libcfs_kmem_dec((ptr), s); \ + CDEBUG(D_MALLOC, "kfreed '" #ptr "': %d at %p (tot %lld).\n", \ + s, (ptr), libcfs_kmem_read()); \ + if (unlikely(s > LIBCFS_VMALLOC_SIZE)) \ + libcfs_vfree_atomic(ptr); \ + else \ + kfree(ptr); \ +} while (0) + +/******************************************************************************/ + +void libcfs_debug_dumplog(void); +int libcfs_debug_init(unsigned long bufsize); +int libcfs_debug_cleanup(void); +int libcfs_debug_clear_buffer(void); +int libcfs_debug_mark_buffer(const char *text); + +#define LASSERT_ATOMIC_ENABLED (1) + +#if LASSERT_ATOMIC_ENABLED + +/** assert value of @a is equal to @v */ +#define LASSERT_ATOMIC_EQ(a, v) \ + LASSERTF(atomic_read(a) == v, "value: %d\n", atomic_read((a))); + +/** assert value of @a is unequal to @v */ +#define LASSERT_ATOMIC_NE(a, v) \ + LASSERTF(atomic_read(a) != v, "value: %d\n", atomic_read((a))); + +/** assert value of @a is little than @v */ +#define LASSERT_ATOMIC_LT(a, v) \ + LASSERTF(atomic_read(a) < v, "value: %d\n", atomic_read((a))); + +/** assert value of @a is little/equal to @v */ +#define LASSERT_ATOMIC_LE(a, v) \ + LASSERTF(atomic_read(a) <= v, "value: %d\n", atomic_read((a))); + +/** assert value of @a is great than @v */ +#define LASSERT_ATOMIC_GT(a, v) \ + LASSERTF(atomic_read(a) > v, "value: %d\n", atomic_read((a))); + +/** assert value of @a is great/equal to @v */ +#define LASSERT_ATOMIC_GE(a, v) \ + LASSERTF(atomic_read(a) >= v, "value: %d\n", atomic_read((a))); + +/** assert value of @a is great than @v1 and little than @v2 */ +#define LASSERT_ATOMIC_GT_LT(a, v1, v2) \ +do { \ + int __v = atomic_read(a); \ + LASSERTF(__v > v1 && __v < v2, "value: %d\n", __v);\ +} while (0) + +/** assert value of @a is great than @v1 and little/equal to @v2 */ +#define LASSERT_ATOMIC_GT_LE(a, v1, v2) \ +do { \ + int __v = atomic_read(a); \ + LASSERTF(__v > v1 && __v <= v2, "value: %d\n", __v);\ +} while (0) + +/** assert value of @a is great/equal to @v1 and little than @v2 */ +#define LASSERT_ATOMIC_GE_LT(a, v1, v2) \ +do { \ + int __v = atomic_read(a); \ + LASSERTF(__v >= v1 && __v < v2, "value: %d\n", __v);\ +} while (0) + +/** assert value of @a is great/equal to @v1 and little/equal to @v2 */ +#define LASSERT_ATOMIC_GE_LE(a, v1, v2) \ +do { \ + int __v = atomic_read(a); \ + LASSERTF(__v >= v1 && __v <= v2, "value: %d\n", __v); \ +} while (0) + +#else /* !LASSERT_ATOMIC_ENABLED */ + +#define LASSERT_ATOMIC_EQ(a, v) do {} while (0) +#define LASSERT_ATOMIC_NE(a, v) do {} while (0) +#define LASSERT_ATOMIC_LT(a, v) do {} while (0) +#define LASSERT_ATOMIC_LE(a, v) do {} while (0) +#define LASSERT_ATOMIC_GT(a, v) do {} while (0) +#define LASSERT_ATOMIC_GE(a, v) do {} while (0) +#define LASSERT_ATOMIC_GT_LT(a, v1, v2) do {} while (0) +#define LASSERT_ATOMIC_GT_LE(a, v1, v2) do 
{} while (0) +#define LASSERT_ATOMIC_GE_LT(a, v1, v2) do {} while (0) +#define LASSERT_ATOMIC_GE_LE(a, v1, v2) do {} while (0) + +#endif /* LASSERT_ATOMIC_ENABLED */ + +#define LASSERT_ATOMIC_ZERO(a) LASSERT_ATOMIC_EQ(a, 0) +#define LASSERT_ATOMIC_POS(a) LASSERT_ATOMIC_GT(a, 0) + +#define CFS_ALLOC_PTR(ptr) LIBCFS_ALLOC(ptr, sizeof(*(ptr))); +#define CFS_ALLOC_PTR_ARRAY(ptr, count) \ + LIBCFS_ALLOC(ptr, (count) * sizeof(*(ptr))) + +#define CFS_FREE_PTR(ptr) LIBCFS_FREE(ptr, sizeof(*(ptr))); +#define CFS_FREE_PTR_ARRAY(ptr, count) \ + LIBCFS_FREE(ptr, (count) * sizeof(*(ptr))) + +/* implication */ +#define ergo(a, b) (!(a) || (b)) +/* logical equivalence */ +#define equi(a, b) (!!(a) == !!(b)) + +#define MKSTR(ptr) ((ptr))? (ptr) : "" + +#ifndef HAVE_CFS_SIZE_ROUND +static inline size_t cfs_size_round(size_t val) +{ + return (val + 7) & (~0x7); +} +#define HAVE_CFS_SIZE_ROUND +#endif + +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_string.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_string.h new file mode 100644 index 0000000000000..bc2e03cf3fb42 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_string.h @@ -0,0 +1,86 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * libcfs/include/libcfs/libcfs_string.h + * + * Generic string manipulation functions. + * + * Author: Nathan Rutman + */ + +#ifndef __LIBCFS_STRING_H__ +#define __LIBCFS_STRING_H__ + +/* libcfs_string.c */ +/* Convert a text string to a bitmask */ +int cfs_str2mask(const char *str, const char *(*bit2str)(int bit), + int *oldmask, int minmask, int allmask, int defmask); + +/** + * Structure to represent NULL-less strings. + */ +struct cfs_lstr { + char *ls_str; + int ls_len; +}; + +/* + * Structure to represent \ token of the syntax. + */ +struct cfs_range_expr { + /* + * Link to cfs_expr_list::el_exprs. 
+ */ + struct list_head re_link; + __u32 re_lo; + __u32 re_hi; + __u32 re_stride; +}; + +struct cfs_expr_list { + struct list_head el_link; + struct list_head el_exprs; +}; + +int cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res); +int cfs_str2num_check(char *str, int nob, unsigned *num, + unsigned min, unsigned max); +int cfs_expr_list_match(__u32 value, struct cfs_expr_list *expr_list); +int cfs_expr_list_print(char *buffer, int count, + struct cfs_expr_list *expr_list); +int cfs_expr_list_values(struct cfs_expr_list *expr_list, + int max, __u32 **values); +void cfs_expr_list_values_free(__u32 *values, int num); +void cfs_expr_list_free(struct cfs_expr_list *expr_list); +int cfs_expr_list_parse(char *str, int len, unsigned min, unsigned max, + struct cfs_expr_list **elpp); +void cfs_expr_list_free_list(struct list_head *list); + +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_workitem.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_workitem.h new file mode 100644 index 0000000000000..d10ec77ca2cd6 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_workitem.h @@ -0,0 +1,103 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * libcfs/include/libcfs/libcfs_workitem.h + * + * Author: Isaac Huang + * Liang Zhen + * + * A workitems is deferred work with these semantics: + * - a workitem always runs in thread context. + * - a workitem can be concurrent with other workitems but is strictly + * serialized with respect to itself. + * - no CPU affinity, a workitem does not necessarily run on the same CPU + * that schedules it. However, this might change in the future. + * - if a workitem is scheduled again before it has a chance to run, it + * runs only once. + * - if a workitem is scheduled while it runs, it runs again after it + * completes; this ensures that events occurring while other events are + * being processed receive due attention. This behavior also allows a + * workitem to reschedule itself. + * + * Usage notes: + * - a workitem can sleep but it should be aware of how that sleep might + * affect others. + * - a workitem runs inside a kernel thread so there's no user space to access. + * - do not use a workitem if the scheduling latency can't be tolerated. + * + * When wi_action returns non-zero, it means the workitem has either been + * freed or reused and workitem scheduler won't touch it any more. 
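+ *
+ * A minimal illustrative life cycle (assuming a scheduler created
+ * earlier with cfs_wi_sched_create()):
+ *
+ *	static int foo_action(struct cfs_workitem *wi)
+ *	{
+ *		... do the deferred work ...
+ *		return 0; (the scheduler retains ownership of wi)
+ *	}
+ *
+ *	cfs_wi_init(&foo_wi, foo_action);
+ *	cfs_wi_schedule(sched, &foo_wi);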
+ */ + +#ifndef __LIBCFS_WORKITEM_H__ +#define __LIBCFS_WORKITEM_H__ + +struct cfs_wi_sched; + +void cfs_wi_sched_destroy(struct cfs_wi_sched *); +int cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab, int cpt, + int nthrs, struct cfs_wi_sched **); + +struct cfs_workitem; + +typedef int (*cfs_wi_action_t) (struct cfs_workitem *); + +struct cfs_workitem { + /** chain on runq or rerunq */ + struct list_head wi_list; + /** working function */ + cfs_wi_action_t wi_action; + /** in running */ + unsigned short wi_running:1; + /** scheduled */ + unsigned short wi_scheduled:1; +}; + +static inline void +cfs_wi_init(struct cfs_workitem *wi, cfs_wi_action_t action) +{ + INIT_LIST_HEAD(&wi->wi_list); + + wi->wi_running = 0; + wi->wi_scheduled = 0; + wi->wi_action = action; +} + +void cfs_wi_schedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi); +int cfs_wi_deschedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi); +void cfs_wi_exit(struct cfs_wi_sched *sched, struct cfs_workitem *wi); + +int cfs_wi_startup(void); +void cfs_wi_shutdown(void); + +/** # workitem scheduler loops before reschedule */ +#define CFS_WI_RESCHED 128 + +#endif /* __LIBCFS_WORKITEM_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/glob.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/glob.h new file mode 100644 index 0000000000000..fca03b5526878 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/glob.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_GLOB_H +#define _LINUX_GLOB_H + +#ifndef HAVE_GLOB + +#include /* For bool */ +#include /* For __pure */ + +bool __pure glob_match(char const *pat, char const *str); +#endif /* !HAVE_GLOB */ + +#endif /* _LINUX_GLOB_H */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-cpu.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-cpu.h new file mode 100644 index 0000000000000..22ffe71a4c3e7 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-cpu.h @@ -0,0 +1,52 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * libcfs/include/libcfs/linux/linux-mem.h + * + * Basic library routines. 
+ * + * Author: liang@whamcloud.com + */ + +#ifndef __LIBCFS_LINUX_CPU_H__ +#define __LIBCFS_LINUX_CPU_H__ + +#include + +#ifndef HAVE_TOPOLOGY_SIBLING_CPUMASK +# define topology_sibling_cpumask(cpu) topology_thread_cpumask(cpu) +#endif /* HAVE_TOPOLOGY_SIBLING_CPUMASK */ + +#ifndef HAVE_CPUS_READ_LOCK +# define cpus_read_lock get_online_cpus +# define cpus_read_unlock put_online_cpus +#endif + +#endif /* __LIBCFS_LINUX_CPU_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fortify-string.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fortify-string.h new file mode 100644 index 0000000000000..aeed8c5a0614c --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fortify-string.h @@ -0,0 +1,296 @@ +#ifndef _LIBCFS_FORTIFY_STRING_H +#define _LIBCFS_FORTIFY_STRING_H + +#ifdef HAVE_LINUX_FORTIFY_STRING_HEADER +#include + +/* + * Linux v5.11-11104-ga28a6e860c6c introduces fortify-string.h + * where an unsafe_memcpy is provided in Linux v5.18-rc5-1405-g43213daed6d6 + * + * This following is excerpted from the Linux v6.1 fortified memcpy() + * which resolves some corner cases, one of which is triggered in lustre + */ +#ifndef unsafe_memcpy + +#include +#include +#include + +#ifndef __RENAME +#define __RENAME(x) __asm__(#x) +#endif + +void fortify_panic(const char *name) __noreturn __cold; +void __read_overflow(void) __compiletime_error("detected read beyond size of object (1st parameter)"); +void __read_overflow2(void) __compiletime_error("detected read beyond size of object (2nd parameter)"); +void __read_overflow2_field(size_t avail, size_t wanted) __compiletime_warning("detected read beyond size of field (2nd parameter); maybe use struct_group()?"); +void __write_overflow(void) __compiletime_error("detected write beyond size of object (1st parameter)"); +void __write_overflow_field(size_t avail, size_t wanted) __compiletime_warning("detected write beyond size of field (1st parameter); maybe use struct_group()?"); + +#define __compiletime_strlen(p) \ +({ \ + char *__p = (char *)(p); \ + size_t __ret = SIZE_MAX; \ + size_t __p_size = __member_size(p); \ + if (__p_size != SIZE_MAX && \ + __builtin_constant_p(*__p)) { \ + size_t __p_len = __p_size - 1; \ + if (__builtin_constant_p(__p[__p_len]) && \ + __p[__p_len] == '\0') \ + __ret = __builtin_strlen(__p); \ + } \ + __ret; \ +}) + +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) +extern void *__underlying_memchr(const void *p, int c, __kernel_size_t size) __RENAME(memchr); +extern int __underlying_memcmp(const void *p, const void *q, __kernel_size_t size) __RENAME(memcmp); +extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(memcpy); +extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(memmove); +extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(memset); +extern char *__underlying_strcat(char *p, const char *q) __RENAME(strcat); +extern char *__underlying_strcpy(char *p, const char *q) __RENAME(strcpy); +extern __kernel_size_t __underlying_strlen(const char *p) __RENAME(strlen); +extern char *__underlying_strncat(char *p, const char *q, __kernel_size_t count) __RENAME(strncat); +extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __RENAME(strncpy); +#else + +#if defined(__SANITIZE_MEMORY__) +/* + * For KMSAN builds all memcpy/memset/memmove calls should be replaced by the + * corresponding __msan_XXX functions. 
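+ * For example, a plain memcpy(dst, src, n) in KMSAN-instrumented code
+ * is emitted as __msan_memcpy(dst, src, n), so the tool can propagate
+ * its shadow (initialized-ness) state along with the copied bytes.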
+ */ +#include +#define __underlying_memcpy __msan_memcpy +#define __underlying_memmove __msan_memmove +#define __underlying_memset __msan_memset +#else +#define __underlying_memcpy __builtin_memcpy +#define __underlying_memmove __builtin_memmove +#define __underlying_memset __builtin_memset +#endif + +#define __underlying_memchr __builtin_memchr +#define __underlying_memcmp __builtin_memcmp +#define __underlying_strcat __builtin_strcat +#define __underlying_strcpy __builtin_strcpy +#define __underlying_strlen __builtin_strlen +#define __underlying_strncat __builtin_strncat +#define __underlying_strncpy __builtin_strncpy +#endif + +/** + * unsafe_memcpy - memcpy implementation with no FORTIFY bounds checking + * + * @dst: Destination memory address to write to + * @src: Source memory address to read from + * @bytes: How many bytes to write to @dst from @src + * @justification: Free-form text or comment describing why the use is needed + * + * This should be used for corner cases where the compiler cannot do the + * right thing, or during transitions between APIs, etc. It should be used + * very rarely, and includes a place for justification detailing where bounds + * checking has happened, and why existing solutions cannot be employed. + */ +#define unsafe_memcpy(dst, src, bytes, justification) \ + __underlying_memcpy(dst, src, bytes) + +/* + * Clang's use of __builtin_*object_size() within inlines needs hinting via + * __pass_*object_size(). The preference is to only ever use type 1 (member + * size, rather than struct size), but there remain some stragglers using + * type 0 that will be converted in the future. + */ +#define POS __pass_object_size(1) +#define POS0 __pass_object_size(0) +#define __struct_size(p) __builtin_object_size(p, 0) +#define __member_size(p) __builtin_object_size(p, 1) + +#define __compiletime_lessthan(bounds, length) ( \ + __builtin_constant_p((bounds) < (length)) && \ + (bounds) < (length) \ +) + + +/* + * To make sure the compiler can enforce protection against buffer overflows, + * memcpy(), memmove(), and memset() must not be used beyond individual + * struct members. If you need to copy across multiple members, please use + * struct_group() to create a named mirror of an anonymous struct union. + * (e.g. see struct sk_buff.) Read overflow checking is currently only + * done when a write overflow is also present, or when building with W=1. 
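+ *
+ * As an illustrative example, given
+ *
+ *	struct hdr { u16 a; u8 buf[16]; int b; } h;
+ *
+ * a constant-size memcpy(h.buf, src, 20) is flagged at compile time as
+ * a write beyond the "buf" member (via __member_size()), even though
+ * 20 bytes would still land inside struct hdr as a whole.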
+ * + * Mitigation coverage matrix + * Bounds checking at: + * +-------+-------+-------+-------+ + * | Compile time | Run time | + * memcpy() argument sizes: | write | read | write | read | + * dest source length +-------+-------+-------+-------+ + * memcpy(known, known, constant) | y | y | n/a | n/a | + * memcpy(known, unknown, constant) | y | n | n/a | V | + * memcpy(known, known, dynamic) | n | n | B | B | + * memcpy(known, unknown, dynamic) | n | n | B | V | + * memcpy(unknown, known, constant) | n | y | V | n/a | + * memcpy(unknown, unknown, constant) | n | n | V | V | + * memcpy(unknown, known, dynamic) | n | n | V | B | + * memcpy(unknown, unknown, dynamic) | n | n | V | V | + * +-------+-------+-------+-------+ + * + * y = perform deterministic compile-time bounds checking + * n = cannot perform deterministic compile-time bounds checking + * n/a = no run-time bounds checking needed since compile-time deterministic + * B = can perform run-time bounds checking (currently unimplemented) + * V = vulnerable to run-time overflow (will need refactoring to solve) + * + */ +extern __always_inline __gnu_inline +bool fortify_memcpy_chk(__kernel_size_t size, + const size_t p_size, + const size_t q_size, + const size_t p_size_field, + const size_t q_size_field, + const char *func) +{ + if (__builtin_constant_p(size)) { + /* + * Length argument is a constant expression, so we + * can perform compile-time bounds checking where + * buffer sizes are also known at compile time. + */ + + /* Error when size is larger than enclosing struct. */ + if (__compiletime_lessthan(p_size_field, p_size) && + __compiletime_lessthan(p_size, size)) + __write_overflow(); + if (__compiletime_lessthan(q_size_field, q_size) && + __compiletime_lessthan(q_size, size)) + __read_overflow2(); + + /* Warn when write size argument larger than dest field. */ + if (__compiletime_lessthan(p_size_field, size)) + __write_overflow_field(p_size_field, size); + /* + * Warn for source field over-read when building with W=1 + * or when an over-write happened, so both can be fixed at + * the same time. + */ + if ((IS_ENABLED(KBUILD_EXTRA_WARN1) || + __compiletime_lessthan(p_size_field, size)) && + __compiletime_lessthan(q_size_field, size)) + __read_overflow2_field(q_size_field, size); + } + /* + * At this point, length argument may not be a constant expression, + * so run-time bounds checking can be done where buffer sizes are + * known. (This is not an "else" because the above checks may only + * be compile-time warnings, and we want to still warn for run-time + * overflows.) + */ + + /* + * Always stop accesses beyond the struct that contains the + * field, when the buffer's remaining size is known. + * (The SIZE_MAX test is to optimize away checks where the buffer + * lengths are unknown.) + */ + if ((p_size != SIZE_MAX && p_size < size) || + (q_size != SIZE_MAX && q_size < size)) + fortify_panic(func); + + /* + * Warn when writing beyond destination field size. + * + * We must ignore p_size_field == 0 for existing 0-element + * fake flexible arrays, until they are all converted to + * proper flexible arrays. + * + * The implementation of __builtin_*object_size() behaves + * like sizeof() when not directly referencing a flexible + * array member, which means there will be many bounds checks + * that will appear at run-time, without a way for them to be + * detected at compile-time (as can be done when the destination + * is specifically the flexible array member). 
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101832 + */ + if (p_size_field != 0 && p_size_field != SIZE_MAX && + p_size != p_size_field && p_size_field < size) + return true; + + return false; +} + +#define __fortify_memcpy_chk(p, q, size, p_size, q_size, \ + p_size_field, q_size_field, op) ({ \ + const size_t __fortify_size = (size_t)(size); \ + const size_t __p_size = (p_size); \ + const size_t __q_size = (q_size); \ + const size_t __p_size_field = (p_size_field); \ + const size_t __q_size_field = (q_size_field); \ + WARN_ONCE(fortify_memcpy_chk(__fortify_size, __p_size, \ + __q_size, __p_size_field, \ + __q_size_field, #op), \ + #op ": detected field-spanning write (size %zu) of single %s (size %zu)\n", \ + __fortify_size, \ + "field \"" #p "\" at " __FILE__ ":" __stringify(__LINE__), \ + __p_size_field); \ + __underlying_##op(p, q, __fortify_size); \ +}) + +/* + * Notes about compile-time buffer size detection: + * + * With these types... + * + * struct middle { + * u16 a; + * u8 middle_buf[16]; + * int b; + * }; + * struct end { + * u16 a; + * u8 end_buf[16]; + * }; + * struct flex { + * int a; + * u8 flex_buf[]; + * }; + * + * void func(TYPE *ptr) { ... } + * + * Cases where destination size cannot be currently detected: + * - the size of ptr's object (seemingly by design, gcc & clang fail): + * __builtin_object_size(ptr, 1) == SIZE_MAX + * - the size of flexible arrays in ptr's obj (by design, dynamic size): + * __builtin_object_size(ptr->flex_buf, 1) == SIZE_MAX + * - the size of ANY array at the end of ptr's obj (gcc and clang bug): + * __builtin_object_size(ptr->end_buf, 1) == SIZE_MAX + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101836 + * + * Cases where destination size is currently detected: + * - the size of non-array members within ptr's object: + * __builtin_object_size(ptr->a, 1) == 2 + * - the size of non-flexible-array in the middle of ptr's obj: + * __builtin_object_size(ptr->middle_buf, 1) == 16 + * + */ + +/* + * __struct_size() vs __member_size() must be captured here to avoid + * evaluating argument side-effects further into the macro layers. + */ +#define memcpy(p, q, s) __fortify_memcpy_chk(p, q, s, \ + __struct_size(p), __struct_size(q), \ + __member_size(p), __member_size(q), \ + memcpy) + +#endif /* HAVE_LINUX_FORTIFY_STRING_HEADER */ +#endif /* unsafe_memcpy */ + +/* a catch all to ensure an unsafe_memcpy() exists */ +#ifndef unsafe_memcpy +#define unsafe_memcpy(dst, src, bytes, justification) \ + memcpy(dst, src, bytes) +#endif + +#endif /* _LIBCFS_FORTIFY_STRING_H */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fs.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fs.h new file mode 100644 index 0000000000000..6ef6b0716aa6d --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fs.h @@ -0,0 +1,87 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * libcfs/include/libcfs/linux/linux-fs.h + * + * Basic library routines. + */ + +#ifndef __LIBCFS_LINUX_CFS_FS_H__ +#define __LIBCFS_LINUX_CFS_FS_H__ + +#include +#include +#include +#include +#include + +#ifndef HAVE_FILE_DENTRY +static inline struct dentry *file_dentry(const struct file *file) +{ + return file->f_path.dentry; +} +#endif + +#ifndef S_DT_SHIFT +#define S_DT_SHIFT 12 +#endif + +#ifndef S_DT +#define S_DT(type) (((type) & S_IFMT) >> S_DT_SHIFT) +#endif +#ifndef DTTOIF +#define DTTOIF(dirtype) ((dirtype) << S_DT_SHIFT) +#endif + +#ifdef HAVE_PROC_OPS +#define PROC_OWNER(_fn) +#else +#define proc_ops file_operations +#define PROC_OWNER(_owner) .owner = (_owner), +#define proc_open open +#define proc_read read +#define proc_write write +#define proc_lseek llseek +#define proc_release release +#define proc_poll poll +#define proc_ioctl unlocked_ioctl +#define proc_compat_ioctl compat_ioctl +#define proc_mmap mmap +#define proc_get_unmapped_area get_unmapped_area +#endif + +static inline void mapping_clear_exiting(struct address_space *mapping) +{ +#ifdef HAVE_MAPPING_AS_EXITING_FLAG + clear_bit(AS_EXITING, &mapping->flags); +#endif +} + +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-hash.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-hash.h new file mode 100644 index 0000000000000..3c615bd0df703 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-hash.h @@ -0,0 +1,345 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ + +#ifndef __LIBCFS_LINUX_HASH_H__ +#define __LIBCFS_LINUX_HASH_H__ + +#include +#include + +u64 cfs_hashlen_string(const void *salt, const char *name); + +#ifndef hashlen_hash +#define hashlen_hash(hashlen) ((u32)(hashlen)) +#endif + +#ifndef HAVE_STRINGHASH +#ifndef hashlen_create +#define hashlen_create(hash, len) ((u64)(len)<<32 | (u32)(hash)) +#endif +#endif /* !HAVE_STRINGHASH */ + +#ifdef HAVE_BROKEN_HASH_64 + +#define GOLDEN_RATIO_32 0x61C88647 +#define GOLDEN_RATIO_64 0x61C8864680B583EBull + +static inline u32 cfs_hash_32(u32 val, unsigned int bits) +{ + /* High bits are more random, so use them. 
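+ * Multiplying by GOLDEN_RATIO_32 mixes entropy toward the high bits,
+ * so the top @bits of the product make a well-spread bucket index,
+ * e.g. cfs_hash_32(val, 10) indexes a 1024-bucket table.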
*/ + return (val * GOLDEN_RATIO_32) >> (32 - bits); +} + +static __always_inline u32 cfs_hash_64(u64 val, unsigned int bits) +{ +#if BITS_PER_LONG == 64 + /* 64x64-bit multiply is efficient on all 64-bit processors */ + return val * GOLDEN_RATIO_64 >> (64 - bits); +#else + /* Hash 64 bits using only 32x32-bit multiply. */ + return cfs_hash_32(((u32)val ^ ((val >> 32) * GOLDEN_RATIO_32)), bits); +#endif +} +#else + +#define cfs_hash_32 hash_32 +#define cfs_hash_64 hash_64 + +#endif /* HAVE_BROKEN_HASH_64 */ + +#ifndef HAVE_RHASHTABLE_WALK_ENTER +static int rhashtable_walk_enter(struct rhashtable *ht, + struct rhashtable_iter *iter) +{ +#ifdef HAVE_3ARG_RHASHTABLE_WALK_INIT + return rhashtable_walk_init(ht, iter, GFP_KERNEL); +#else + return rhashtable_walk_init(ht, iter); +#endif +} +#endif + +#ifndef HAVE_RHLTABLE +struct rhlist_head { + struct rhash_head rhead; + struct rhlist_head __rcu *next; +}; + +struct rhltable { + struct rhashtable ht; +}; + +#define rhl_for_each_entry_rcu(tpos, pos, list, member) \ + for (pos = list; pos && rht_entry(tpos, pos, member); \ + pos = rcu_dereference_raw(pos->next)) + +static inline int rhltable_init(struct rhltable *hlt, + const struct rhashtable_params *params) +{ + return rhashtable_init(&hlt->ht, params); +} + +static inline struct rhlist_head *rhltable_lookup( + struct rhltable *hlt, const void *key, + const struct rhashtable_params params) +{ + struct rhashtable *ht = &hlt->ht; + struct rhashtable_compare_arg arg = { + .ht = ht, + .key = key, + }; + struct bucket_table *tbl; + struct rhash_head *he; + unsigned int hash; + + tbl = rht_dereference_rcu(ht->tbl, ht); +restart: + hash = rht_key_hashfn(ht, tbl, key, params); + rht_for_each_rcu(he, tbl, hash) { + if (params.obj_cmpfn ? + params.obj_cmpfn(&arg, rht_obj(ht, he)) : + rhashtable_compare(&arg, rht_obj(ht, he))) + continue; + return he ? container_of(he, struct rhlist_head, rhead) : NULL; + } + + /* Ensure we see any new tables. 
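+ * A concurrent resize publishes tbl->future_tbl, so re-walking any
+ * newer table after the barrier catches entries that were inserted
+ * or relocated there while we scanned the old buckets.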
*/ + smp_rmb(); + + tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (unlikely(tbl)) + goto restart; + + return NULL; +} + +static inline int rhltable_insert_key( + struct rhltable *hlt, const void *key, struct rhlist_head *list, + const struct rhashtable_params params) +{ +#ifdef HAVE_HASHTABLE_INSERT_FAST_RETURN_INT + return __rhashtable_insert_fast(&hlt->ht, key, &list->rhead, + params); +#else + return PTR_ERR(__rhashtable_insert_fast(&hlt->ht, key, &list->rhead, + params)); +#endif +} + +static inline int rhltable_remove( + struct rhltable *hlt, struct rhlist_head *list, + const struct rhashtable_params params) +{ + return rhashtable_remove_fast(&hlt->ht, &list->rhead, params); +} + +static inline void rhltable_free_and_destroy(struct rhltable *hlt, + void (*free_fn)(void *ptr, + void *arg), + void *arg) +{ + rhashtable_free_and_destroy(&hlt->ht, free_fn, arg); +} + +static inline void rhltable_destroy(struct rhltable *hlt) +{ + rhltable_free_and_destroy(hlt, NULL, NULL); +} + +static inline void rhltable_walk_enter(struct rhltable *hlt, + struct rhashtable_iter *iter) +{ + rhashtable_walk_enter(&hlt->ht, iter); +} +#endif /* !HAVE_RHLTABLE */ + +#ifndef HAVE_RHASHTABLE_LOOKUP_GET_INSERT_FAST +/** + * rhashtable_lookup_get_insert_fast - lookup and insert object into hash table + * @ht: hash table + * @obj: pointer to hash head inside object + * @params: hash table parameters + * + * Just like rhashtable_lookup_insert_fast(), but this function returns the + * object if it exists, NULL if it did not and the insertion was successful, + * and an ERR_PTR otherwise. + */ +static inline void *rhashtable_lookup_get_insert_fast( + struct rhashtable *ht, struct rhash_head *obj, + const struct rhashtable_params params) +{ + const char *key; + void *ret; + int rc; + + rc = rhashtable_lookup_insert_fast(ht, obj, params); + switch (rc) { + case -EEXIST: + key = rht_obj(ht, obj); + ret = rhashtable_lookup_fast(ht, key, params); + break; + case 0: + ret = NULL; + break; + default: + ret = ERR_PTR(rc); + break; + } + return ret; +} +#endif /* !HAVE_RHASHTABLE_LOOKUP_GET_INSERT_FAST */ + +#ifndef HAVE_RHASHTABLE_LOOKUP +/* + * The function rhashtable_lookup() and rhashtable_lookup_fast() + * are almost the same except rhashtable_lookup() doesn't + * take the RCU read lock. Since this is the case and only + * SLES12 SP3 lacks rhashtable_lookup() just duplicate the + * SLES12 SP3 rhashtable_lookup_fast() minus the RCU read lock. + */ +static inline void *rhashtable_lookup( + struct rhashtable *ht, const void *key, + const struct rhashtable_params params) +{ + struct rhashtable_compare_arg arg = { + .ht = ht, + .key = key, + }; + const struct bucket_table *tbl; + struct rhash_head *he; + unsigned int hash; + + tbl = rht_dereference_rcu(ht->tbl, ht); +restart: + hash = rht_key_hashfn(ht, tbl, key, params); + rht_for_each_rcu(he, tbl, hash) { + if (params.obj_cmpfn ? + params.obj_cmpfn(&arg, rht_obj(ht, he)) : + rhashtable_compare(&arg, rht_obj(ht, he))) + continue; + return rht_obj(ht, he); + } + + /* Ensure we see any new tables. 
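+ * Same resize race as in rhltable_lookup() above: recheck future_tbl
+ * after the barrier so an entry that moved to a newer table is found.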
*/ + smp_rmb(); + + tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (unlikely(tbl)) + goto restart; + + return NULL; +} +#endif /* !HAVE_RHASHTABLE_LOOKUP */ + +#ifndef HAVE_RHT_BUCKET_VAR +static inline struct rhash_head __rcu **rht_bucket_var( + struct bucket_table *tbl, unsigned int hash) +{ + return &tbl->buckets[hash]; +} +#endif + +#ifndef HAVE_RHASHTABLE_REPLACE +/* Internal function, please use rhashtable_replace_fast() instead */ +static inline int __rhashtable_replace_fast( + struct rhashtable *ht, struct bucket_table *tbl, + struct rhash_head *obj_old, struct rhash_head *obj_new, + const struct rhashtable_params params) +{ + struct rhash_head __rcu **pprev; + struct rhash_head *he; + spinlock_t *lock; + unsigned int hash; + int err = -ENOENT; + + /* Minimally, the old and new objects must have same hash + * (which should mean identifiers are the same). + */ + hash = rht_head_hashfn(ht, tbl, obj_old, params); + if (hash != rht_head_hashfn(ht, tbl, obj_new, params)) + return -EINVAL; + + lock = rht_bucket_lock(tbl, hash); + + spin_lock_bh(lock); + + pprev = rht_bucket_var(tbl, hash); + rht_for_each_continue(he, *pprev, tbl, hash) { + if (he != obj_old) { + pprev = &he->next; + continue; + } + + rcu_assign_pointer(obj_new->next, obj_old->next); + rcu_assign_pointer(*pprev, obj_new); + err = 0; + break; + } + + spin_unlock_bh(lock); + + return err; +} + +/** + * rhashtable_replace_fast - replace an object in hash table + * @ht: hash table + * @obj_old: pointer to hash head inside object being replaced + * @obj_new: pointer to hash head inside object which is new + * @params: hash table parameters + * + * Replacing an object doesn't affect the number of elements in the hash table + * or bucket, so we don't need to worry about shrinking or expanding the + * table here. + * + * Returns zero on success, -ENOENT if the entry could not be found, + * -EINVAL if hash is not the same for the old and new objects. + */ +static inline int rhashtable_replace_fast( + struct rhashtable *ht, struct rhash_head *obj_old, + struct rhash_head *obj_new, + const struct rhashtable_params params) +{ + struct bucket_table *tbl; + int err; + + rcu_read_lock(); + + tbl = rht_dereference_rcu(ht->tbl, ht); + + /* Because we have already taken (and released) the bucket + * lock in old_tbl, if we find that future_tbl is not yet + * visible then that guarantees the entry to still be in + * the old tbl if it exists. + */ + while ((err = __rhashtable_replace_fast(ht, tbl, obj_old, + obj_new, params)) && + (tbl = rht_dereference_rcu(tbl->future_tbl, ht))) + ; + + rcu_read_unlock(); + + return err; +} +#endif /* HAVE_RHASHTABLE_REPLACE */ + +#endif /* __LIBCFS_LINUX_HASH_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-list.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-list.h new file mode 100644 index 0000000000000..c457bee35e160 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-list.h @@ -0,0 +1,32 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ + +#ifndef __LIBCFS_LINUX_LIST_H__ +#define __LIBCFS_LINUX_LIST_H__ + +#include + +#ifdef HAVE_HLIST_ADD_AFTER +#define hlist_add_behind(hnode, tail) hlist_add_after(tail, hnode) +#endif /* HAVE_HLIST_ADD_AFTER */ + +#endif /* __LIBCFS_LINUX_LIST_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-mem.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-mem.h new file mode 100644 index 0000000000000..548eb96a2db33 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-mem.h @@ -0,0 +1,143 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * libcfs/include/libcfs/linux/linux-mem.h + * + * Basic library routines. 
+ */
+
+#ifndef __LIBCFS_LINUX_CFS_MEM_H__
+#define __LIBCFS_LINUX_CFS_MEM_H__
+
+#include
+#include
+#include
+#include
+#ifdef HAVE_MM_INLINE
+# include
+#endif
+#include
+#ifdef HAVE_SCHED_HEADERS
+#include
+#endif
+
+#ifdef HAVE_TOTALRAM_PAGES_AS_FUNC
+ #ifndef cfs_totalram_pages
+ #define cfs_totalram_pages() totalram_pages()
+ #endif
+#else
+ #ifndef cfs_totalram_pages
+ #define cfs_totalram_pages() totalram_pages
+ #endif
+#endif
+
+#ifndef HAVE_MEMALLOC_RECLAIM
+static inline unsigned int memalloc_noreclaim_save(void)
+{
+ unsigned int flags = current->flags & PF_MEMALLOC;
+
+ current->flags |= PF_MEMALLOC;
+ return flags;
+}
+
+static inline void memalloc_noreclaim_restore(unsigned int flags)
+{
+ current->flags = (current->flags & ~PF_MEMALLOC) | flags;
+}
+#endif /* !HAVE_MEMALLOC_RECLAIM */
+
+#ifndef HAVE_BITMAP_ALLOC
+static inline unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags)
+{
+ return kmalloc_array(BITS_TO_LONGS(nbits), sizeof(unsigned long),
+ flags);
+}
+
+static inline unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags)
+{
+ return bitmap_alloc(nbits, flags | __GFP_ZERO);
+}
+
+static inline void bitmap_free(const unsigned long *bitmap)
+{
+ kfree(bitmap);
+}
+#endif /* !HAVE_BITMAP_ALLOC */
+
+/*
+ * Shrinker
+ */
+#ifndef SHRINK_STOP
+# define SHRINK_STOP (~0UL)
+#endif
+
+#ifndef HAVE_MMAP_LOCK
+static inline void mmap_write_lock(struct mm_struct *mm)
+{
+ down_write(&mm->mmap_sem);
+}
+
+static inline bool mmap_write_trylock(struct mm_struct *mm)
+{
+ return down_write_trylock(&mm->mmap_sem) != 0;
+}
+
+static inline void mmap_write_unlock(struct mm_struct *mm)
+{
+ up_write(&mm->mmap_sem);
+}
+
+static inline void mmap_read_lock(struct mm_struct *mm)
+{
+ down_read(&mm->mmap_sem);
+}
+
+static inline bool mmap_read_trylock(struct mm_struct *mm)
+{
+ return down_read_trylock(&mm->mmap_sem) != 0;
+}
+
+static inline void mmap_read_unlock(struct mm_struct *mm)
+{
+ up_read(&mm->mmap_sem);
+}
+#endif
+
+#ifdef HAVE_VMALLOC_2ARGS
+#define __ll_vmalloc(size, flags) __vmalloc(size, flags)
+#else
+#define __ll_vmalloc(size, flags) __vmalloc(size, flags, PAGE_KERNEL)
+#endif
+
+#ifndef HAVE_KFREE_SENSITIVE
+#define kfree_sensitive(x) kzfree(x)
+#endif
+
+#endif /* __LIBCFS_LINUX_CFS_MEM_H__ */
diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-misc.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-misc.h
new file mode 100644
index 0000000000000..841db69e28742
--- /dev/null
+++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-misc.h
@@ -0,0 +1,189 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ */
+
+#ifndef __LIBCFS_LINUX_MISC_H__
+#define __LIBCFS_LINUX_MISC_H__
+
+#include
+/* Since Commit 2f8b544477e6 ("block,fs: untangle fs.h and blk_types.h")
+ * fs.h doesn't include blk_types.h, but we need it.
+ */
+#include
+#include
+#include
+#include
+#include
+
+#ifndef HAVE_IOV_ITER_TYPE
+#ifdef HAVE_IOV_ITER_HAS_TYPE_MEMBER
+#define iter_is_iovec(iter) ((iter)->type & ITER_IOVEC)
+#define iov_iter_is_kvec(iter) ((iter)->type & ITER_KVEC)
+#define iov_iter_is_bvec(iter) ((iter)->type & ITER_BVEC)
+#define iov_iter_is_pipe(iter) ((iter)->type & ITER_PIPE)
+#define iov_iter_is_discard(iter) ((iter)->type & ITER_DISCARD)
+#else
+#define iter_is_iovec(iter) 1
+#define iov_iter_is_kvec(iter) 0
+#define iov_iter_is_bvec(iter) 0
+#define iov_iter_is_pipe(iter) 0
+#define iov_iter_is_discard(iter) 0
+#endif
+#endif /* HAVE_IOV_ITER_TYPE */
+
+int cfs_kernel_write(struct file *filp, const void *buf, size_t count,
+ loff_t *pos);
+ssize_t cfs_kernel_read(struct file *file, void *buf, size_t count,
+ loff_t *pos);
+
+/*
+ * For RHEL6, struct kernel_param_ops doesn't exist. Also,
+ * the arguments for .set and .get take different
+ * parameters there, which is handled below.
+ */
+#ifdef HAVE_KERNEL_PARAM_OPS
+#define cfs_kernel_param_arg_t const struct kernel_param
+#else
+#define cfs_kernel_param_arg_t struct kernel_param_ops
+#define kernel_param_ops kernel_param
+#endif /* ! HAVE_KERNEL_PARAM_OPS */
+
+#ifndef HAVE_KERNEL_PARAM_LOCK
+static inline void kernel_param_unlock(struct module *mod)
+{
+ __kernel_param_unlock();
+}
+
+static inline void kernel_param_lock(struct module *mod)
+{
+ __kernel_param_lock();
+}
+#endif /* ! HAVE_KERNEL_PARAM_LOCK */
+
+int cfs_apply_workqueue_attrs(struct workqueue_struct *wq,
+ const struct workqueue_attrs *attrs);
+
+#ifndef HAVE_KSTRTOBOOL_FROM_USER
+
+#define kstrtobool strtobool
+
+int kstrtobool_from_user(const char __user *s, size_t count, bool *res);
+#endif /* HAVE_KSTRTOBOOL_FROM_USER */
+
+#ifndef HAVE_MATCH_WILDCARD
+bool match_wildcard(const char *pattern, const char *str);
+#endif /* !HAVE_MATCH_WILDCARD */
+
+#ifndef HAVE_KREF_READ
+static inline int kref_read(const struct kref *kref)
+{
+ return atomic_read(&kref->refcount);
+}
+#endif /* HAVE_KREF_READ */
+
+#ifdef HAVE_FORCE_SIG_WITH_TASK
+#define cfs_force_sig(sig, task) force_sig((sig), (task))
+#else
+#define cfs_force_sig(sig, task) \
+do { \
+ unsigned long flags; \
+ \
+ spin_lock_irqsave(&task->sighand->siglock, flags); \
+ task->sighand->action[sig - 1].sa.sa_handler = SIG_DFL; \
+ send_sig(sig, task, 1); \
+ spin_unlock_irqrestore(&task->sighand->siglock, flags); \
+} while (0)
+#endif
+
+void cfs_arch_init(void);
+
+#ifndef container_of_safe
+/**
+ * container_of_safe - cast a member of a structure out to the containing structure
+ * @ptr: the pointer to the member.
+ * @type: the type of the container struct this is embedded in.
+ * @member: the name of the member within the struct.
+ *
+ * If IS_ERR_OR_NULL(ptr), ptr is returned unchanged.
+ *
+ * Note: Copied from Linux 5.6, with BUILD_BUG_ON_MSG section removed.
+ */
+#define container_of_safe(ptr, type, member) ({ \
+ void *__mptr = (void *)(ptr); \
+ IS_ERR_OR_NULL(__mptr) ?
ERR_CAST(__mptr) : \
+ ((type *)(__mptr - offsetof(type, member))); })
+#endif
+
+/*
+ * Linux v4.15-rc2-5-g4229a470175b added sizeof_field()
+ * Linux v5.5-rc4-1-g1f07dcc459d5 removed FIELD_SIZEOF()
+ * Provide sizeof_field in terms of FIELD_SIZEOF() when one is not provided
+ */
+#ifndef sizeof_field
+#define sizeof_field(type, member) FIELD_SIZEOF(type, member)
+#endif
+
+#ifndef HAVE_TASK_IS_RUNNING
+#define task_is_running(task) (task->state == TASK_RUNNING)
+#endif
+
+#ifndef memset_startat
+/** from linux 5.19 include/linux/string.h: */
+#define memset_startat(obj, v, member) \
+({ \
+ u8 *__ptr = (u8 *)(obj); \
+ typeof(v) __val = (v); \
+ memset(__ptr + offsetof(typeof(*(obj)), member), __val, \
+ sizeof(*(obj)) - offsetof(typeof(*(obj)), member)); \
+})
+#endif /* memset_startat() */
+
+#ifdef HAVE_KALLSYMS_LOOKUP_NAME
+static inline void *cfs_kallsyms_lookup_name(const char *name)
+{
+ return (void *)kallsyms_lookup_name(name);
+}
+#else
+static inline void *cfs_kallsyms_lookup_name(const char *name)
+{
+ return NULL;
+}
+#endif
+
+#ifndef HAVE_KOBJ_TYPE_DEFAULT_GROUPS
+#define default_groups default_attrs
+#define KOBJ_ATTR_GROUPS(_name) _name##_attrs
+#define KOBJ_ATTRIBUTE_GROUPS(_name)
+#else
+#define KOBJ_ATTR_GROUPS(_name) _name##_groups
+#define KOBJ_ATTRIBUTE_GROUPS(_name) ATTRIBUTE_GROUPS(_name)
+#endif
+
+#endif /* __LIBCFS_LINUX_MISC_H__ */
diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-net.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-net.h
new file mode 100644
index 0000000000000..17b1b30be45b6
--- /dev/null
+++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-net.h
@@ -0,0 +1,162 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ + +#ifndef __LIBCFS_LINUX_NET_H__ +#define __LIBCFS_LINUX_NET_H__ + +#include +#include + +#ifndef HAVE_NLA_STRDUP +char *nla_strdup(const struct nlattr *nla, gfp_t flags); +#endif /* !HAVE_NLA_STRDUP */ + +#ifdef HAVE_NLA_STRLCPY +#define nla_strscpy nla_strlcpy +#endif /* HAVE_NLA_STRLCPY */ + +#ifndef HAVE_NL_PARSE_WITH_EXT_ACK + +#define NL_SET_BAD_ATTR(extack, attr) + +/* this can be increased when necessary - don't expose to userland */ +#define NETLINK_MAX_COOKIE_LEN 20 + +/** + * struct netlink_ext_ack - netlink extended ACK report struct + * @_msg: message string to report - don't access directly, use + * %NL_SET_ERR_MSG + * @bad_attr: attribute with error + * @cookie: cookie data to return to userspace (for success) + * @cookie_len: actual cookie data length + */ +struct netlink_ext_ack { + const char *_msg; + const struct nlattr *bad_attr; + u8 cookie[NETLINK_MAX_COOKIE_LEN]; + u8 cookie_len; +}; + +#define GENL_SET_ERR_MSG(info, msg) NL_SET_ERR_MSG(NULL, msg) + +static inline int cfs_nla_parse(struct nlattr **tb, int maxtype, + const struct nlattr *head, int len, + const struct nla_policy *policy, + struct netlink_ext_ack *extack) +{ + return nla_parse(tb, maxtype, head, len, policy); +} + +static inline int cfs_nla_parse_nested(struct nlattr *tb[], int maxtype, + const struct nlattr *nla, + const struct nla_policy *policy, + struct netlink_ext_ack *extack) +{ + return nla_parse_nested(tb, maxtype, nla, policy); +} + +#else /* !HAVE_NL_PARSE_WITH_EXT_ACK */ + +#define cfs_nla_parse_nested nla_parse_nested +#define cfs_nla_parse nla_parse + +#endif + +#ifndef HAVE_GENL_DUMPIT_INFO +struct cfs_genl_dumpit_info { + const struct genl_family *family; + const struct genl_ops *ops; + struct nlattr **attrs; +}; + +static inline const struct cfs_genl_dumpit_info * +lnet_genl_dumpit_info(struct netlink_callback *cb) +{ + return (const struct cfs_genl_dumpit_info *)cb->args[1]; +} +#else +#define cfs_genl_dumpit_info genl_dumpit_info + +static inline const struct cfs_genl_dumpit_info * +lnet_genl_dumpit_info(struct netlink_callback *cb) +{ + return (const struct cfs_genl_dumpit_info *)genl_dumpit_info(cb); +} +#endif /* HAVE_GENL_DUMPIT_INFO */ + +#ifdef HAVE_KERNEL_SETSOCKOPT + +#include + +#if !defined(HAVE_TCP_SOCK_SET_QUICKACK) +static inline void tcp_sock_set_quickack(struct sock *sk, int opt) +{ + struct socket *sock = sk->sk_socket; + + kernel_setsockopt(sock, SOL_TCP, TCP_QUICKACK, + (char *)&opt, sizeof(opt)); +} +#endif /* HAVE_TCP_SOCK_SET_QUICKACK */ + +#if !defined(HAVE_TCP_SOCK_SET_NODELAY) +static inline void tcp_sock_set_nodelay(struct sock *sk) +{ + int opt = 1; + struct socket *sock = sk->sk_socket; + + kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, + (char *)&opt, sizeof(opt)); +} +#endif /* HAVE_TCP_SOCK_SET_NODELAY */ + +#if !defined(HAVE_TCP_SOCK_SET_KEEPIDLE) +static inline int tcp_sock_set_keepidle(struct sock *sk, int opt) +{ + struct socket *sock = sk->sk_socket; + + return kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE, + (char *)&opt, sizeof(opt)); +} +#endif /* HAVE_TCP_SOCK_SET_KEEPIDLE */ + +#if !defined(HAVE_TCP_SOCK_SET_KEEPINTVL) +static inline int tcp_sock_set_keepintvl(struct sock *sk, int opt) +{ + struct socket *sock = sk->sk_socket; + + return kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL, + (char *)&opt, sizeof(opt)); +} +#endif /* 
HAVE_TCP_SOCK_SET_KEEPINTVL */ + +#if !defined(HAVE_TCP_SOCK_SET_KEEPCNT) +static inline int tcp_sock_set_keepcnt(struct sock *sk, int opt) +{ + struct socket *sock = sk->sk_socket; + + return kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, + (char *)&opt, sizeof(opt)); +} +#endif /* HAVE_TCP_SOCK_SET_KEEPCNT */ +#endif /* HAVE_KERNEL_SETSOCKOPT */ + +#endif /* __LIBCFS_LINUX_NET_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h new file mode 100644 index 0000000000000..4a61fe1143858 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h @@ -0,0 +1,250 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * libcfs/include/libcfs/linux/linux-time.h + * + * Implementation of portable time API for Linux (kernel and user-level). 
+ *
+ * Author: Nikita Danilov
+ */
+
+#ifndef __LIBCFS_LINUX_LINUX_TIME_H__
+#define __LIBCFS_LINUX_LINUX_TIME_H__
+
+/* Portable time API */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * Generic kernel stuff
+ */
+#ifndef HAVE_TIMESPEC64
+
+typedef __s64 time64_t;
+
+#if __BITS_PER_LONG == 64
+
+# define timespec64 timespec
+
+static inline struct timespec64 timespec_to_timespec64(const struct timespec ts)
+{
+ return ts;
+}
+
+static inline struct timespec timespec64_to_timespec(const struct timespec64 ts)
+{
+ return ts;
+}
+
+#else
+struct timespec64 {
+ time64_t tv_sec; /* seconds */
+ long tv_nsec; /* nanoseconds */
+};
+
+static inline struct timespec64 timespec_to_timespec64(const struct timespec ts)
+{
+ struct timespec64 ret;
+
+ ret.tv_sec = ts.tv_sec;
+ ret.tv_nsec = ts.tv_nsec;
+ return ret;
+}
+
+static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64)
+{
+ struct timespec ret;
+
+ ret.tv_sec = (time_t)ts64.tv_sec;
+ ret.tv_nsec = ts64.tv_nsec;
+ return ret;
+}
+#endif /* __BITS_PER_LONG != 64 */
+
+#endif /* HAVE_TIMESPEC64 */
+
+#ifndef HAVE_NS_TO_TIMESPEC64
+static inline struct timespec64 ns_to_timespec64(const s64 nsec)
+{
+ struct timespec64 ts;
+ s32 rem;
+
+ if (!nsec)
+ return (struct timespec64) {0, 0};
+
+ ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem);
+ if (unlikely(rem < 0)) {
+ ts.tv_sec--;
+ rem += NSEC_PER_SEC;
+ }
+ ts.tv_nsec = rem;
+
+ return ts;
+}
+#endif
+
+#ifndef HAVE_KTIME_ADD
+# define ktime_add(lhs, rhs) ({ (ktime_t){ .tv64 = (lhs).tv64 + (rhs).tv64 }; })
+#endif /* !HAVE_KTIME_ADD */
+
+#ifndef HAVE_KTIME_AFTER
+static inline bool ktime_after(const ktime_t cmp1, const ktime_t cmp2)
+{
+ return cmp1.tv64 > cmp2.tv64;
+}
+#endif /* !HAVE_KTIME_AFTER */
+
+#ifndef HAVE_KTIME_BEFORE
+static inline bool ktime_before(const ktime_t cmp1, const ktime_t cmp2)
+{
+ return cmp1.tv64 < cmp2.tv64;
+}
+#endif /* !HAVE_KTIME_BEFORE */
+
+#ifndef HAVE_KTIME_COMPARE
+static inline int ktime_compare(const ktime_t cmp1, const ktime_t cmp2)
+{
+ if (cmp1.tv64 < cmp2.tv64)
+ return -1;
+ if (cmp1.tv64 > cmp2.tv64)
+ return 1;
+ return 0;
+}
+#endif /* !HAVE_KTIME_COMPARE */
+
+#ifndef HAVE_KTIME_GET_TS64
+void ktime_get_ts64(struct timespec64 *ts);
+#endif /* HAVE_KTIME_GET_TS64 */
+
+#ifndef HAVE_KTIME_GET_REAL_TS64
+void ktime_get_real_ts64(struct timespec64 *ts);
+#endif /* HAVE_KTIME_GET_REAL_TS64 */
+
+#ifndef HAVE_KTIME_GET_REAL_SECONDS
+time64_t ktime_get_real_seconds(void);
+#endif /* HAVE_KTIME_GET_REAL_SECONDS */
+
+#ifndef HAVE_KTIME_GET_SECONDS
+time64_t ktime_get_seconds(void);
+#endif /* HAVE_KTIME_GET_SECONDS */
+
+#ifdef NEED_KTIME_GET_NS
+static inline u64 ktime_get_ns(void)
+{
+ return ktime_to_ns(ktime_get());
+}
+#endif /* NEED_KTIME_GET_NS */
+
+#ifdef NEED_KTIME_GET_REAL_NS
+static inline u64 ktime_get_real_ns(void)
+{
+ return ktime_to_ns(ktime_get_real());
+}
+#endif /* NEED_KTIME_GET_REAL_NS */
+
+#ifndef HAVE_KTIME_MS_DELTA
+static inline s64 ktime_ms_delta(const ktime_t later, const ktime_t earlier)
+{
+ return ktime_to_ms(ktime_sub(later, earlier));
+}
+#endif /* HAVE_KTIME_MS_DELTA */
+
+#ifndef HAVE_KTIME_TO_TIMESPEC64
+static inline struct timespec64 ktime_to_timespec64(ktime_t kt)
+{
+ struct timespec ts = ns_to_timespec((kt).tv64);
+
+ return timespec_to_timespec64(ts);
+}
+#endif /* HAVE_KTIME_TO_TIMESPEC64 */
+
+#ifndef HAVE_TIMESPEC64_SUB
+static inline struct timespec64
+timespec64_sub(struct timespec64 later, struct timespec64 earlier)
+{
+ struct
timespec diff; + + diff = timespec_sub(timespec64_to_timespec(later), + timespec64_to_timespec(earlier)); + return timespec_to_timespec64(diff); +} +#endif + +#ifndef HAVE_TIMESPEC64_TO_KTIME +static inline ktime_t timespec64_to_ktime(struct timespec64 ts) +{ + return ktime_set(ts.tv_sec, ts.tv_nsec); +} +#endif + +static inline unsigned long cfs_time_seconds(time64_t seconds) +{ + return nsecs_to_jiffies64(seconds * NSEC_PER_SEC); +} + +#ifdef HAVE_NEW_DEFINE_TIMER +# ifndef TIMER_DATA_TYPE +# define TIMER_DATA_TYPE struct timer_list * +# endif + +#define CFS_DEFINE_TIMER(_name, _function, _expires, _data) \ + DEFINE_TIMER((_name), (_function)) +#else +# ifndef TIMER_DATA_TYPE +# define TIMER_DATA_TYPE unsigned long +# endif + +#define CFS_DEFINE_TIMER(_name, _function, _expires, _data) \ + DEFINE_TIMER((_name), (_function), (_expires), (_data)) +#endif + +#ifdef HAVE_TIMER_SETUP +#define cfs_timer_cb_arg_t struct timer_list * +#define cfs_from_timer(var, callback_timer, timer_fieldname) \ + from_timer(var, callback_timer, timer_fieldname) +#define cfs_timer_setup(timer, callback, data, flags) \ + timer_setup((timer), (callback), (flags)) +#define cfs_timer_cb_arg(var, timer_fieldname) (&(var)->timer_fieldname) +#else +#define cfs_timer_cb_arg_t unsigned long +#define cfs_from_timer(var, data, timer_fieldname) (typeof(var))(data) +#define cfs_timer_setup(timer, callback, data, flags) \ + setup_timer((timer), (callback), (data)) +#define cfs_timer_cb_arg(var, timer_fieldname) (cfs_timer_cb_arg_t)(var) +#endif + +#endif /* __LIBCFS_LINUX_LINUX_TIME_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-uuid.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-uuid.h new file mode 100644 index 0000000000000..df877c0f62813 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-uuid.h @@ -0,0 +1,63 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef __LIBCFS_LINUX_UUID_H__ +#define __LIBCFS_LINUX_UUID_H__ + +#include + +#define UUID_SIZE 16 + +/* + * The original linux UUID code had uuid_be and uuid_le. + * Later uuid_le was changed to guid_t and uuid_be + * to uuid_t. 
See for details kernel commit: + * + * f9727a17db9bab71ddae91f74f11a8a2f9a0ece6 + */ +#ifndef HAVE_UUID_T +typedef struct { + __u8 b[UUID_SIZE]; +} uuid_t; + +static inline void uuid_copy(uuid_t *dst, uuid_t *src) +{ + memcpy(dst, src, sizeof(uuid_t)); +} + +static inline bool uuid_equal(const uuid_t *u1, const uuid_t *u2) +{ + return memcmp(u1, u2, sizeof(uuid_t)) == 0; +} + +#endif + +#endif /* __LIBCFS_LINUX_UUID_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-wait.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-wait.h new file mode 100644 index 0000000000000..aa257fcdf0c8b --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-wait.h @@ -0,0 +1,593 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LIBCFS_LINUX_WAIT_BIT_H +#define __LIBCFS_LINUX_WAIT_BIT_H + +/* Make sure we can see if we have TASK_NOLOAD */ +#include +/* + * Linux wait-bit related types and methods: + */ +#ifdef HAVE_WAIT_BIT_HEADER_H +#include +#endif +#include + +#ifndef HAVE_WAIT_QUEUE_ENTRY +#define wait_queue_entry_t wait_queue_t +#endif + +#ifndef HAVE_WAIT_BIT_HEADER_H +struct wait_bit_queue_entry { + struct wait_bit_key key; + wait_queue_entry_t wq_entry; +}; + +#define ___wait_is_interruptible(state) \ + (!__builtin_constant_p(state) || \ + state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE) \ + +#endif /* ! HAVE_WAIT_BIT_HEADER_H */ + +#ifndef HAVE_PREPARE_TO_WAIT_EVENT +extern long prepare_to_wait_event(wait_queue_head_t *wq_head, + wait_queue_entry_t *wq_entry, int state); +#endif + +/* ___wait_cond_timeout changed number of args in v3.12-rc1-78-g35a2af94c7ce + * so let's define our own ___wait_cond_timeout1 + */ + +#define ___wait_cond_timeout1(condition) \ +({ \ + bool __cond = (condition); \ + if (__cond && !__ret) \ + __ret = 1; \ + __cond || !__ret; \ +}) + +#ifndef HAVE_CLEAR_AND_WAKE_UP_BIT +/** + * clear_and_wake_up_bit - clear a bit and wake up anyone waiting on that bit + * + * @bit: the bit of the word being waited on + * @word: the word being waited on, a kernel virtual address + * + * You can use this helper if bitflags are manipulated atomically rather than + * non-atomically under a lock. + */ +static inline void clear_and_wake_up_bit(int bit, void *word) +{ + clear_bit_unlock(bit, word); + /* See wake_up_bit() for which memory barrier you need to use. */ + smp_mb__after_atomic(); + wake_up_bit(word, bit); +} +#endif /* ! HAVE_CLEAR_AND_WAKE_UP_BIT */ + +#ifndef HAVE_WAIT_VAR_EVENT +extern void __init wait_bit_init(void); +extern void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, + void *var, int flags); +extern void wake_up_var(void *var); +extern wait_queue_head_t *__var_waitqueue(void *p); + +#define ___wait_var_event(var, condition, state, exclusive, ret, cmd) \ +({ \ + __label__ __out; \ + wait_queue_head_t *__wq_head = __var_waitqueue(var); \ + struct wait_bit_queue_entry __wbq_entry; \ + long __ret = ret; /* explicit shadow */ \ + \ + init_wait_var_entry(&__wbq_entry, var, \ + exclusive ? 
WQ_FLAG_EXCLUSIVE : 0); \ + for (;;) { \ + long __int = prepare_to_wait_event(__wq_head, \ + &__wbq_entry.wq_entry, \ + state); \ + if (condition) \ + break; \ + \ + if (___wait_is_interruptible(state) && __int) { \ + __ret = __int; \ + goto __out; \ + } \ + \ + cmd; \ + } \ + finish_wait(__wq_head, &__wbq_entry.wq_entry); \ +__out: __ret; \ +}) + +#define __wait_var_event(var, condition) \ + ___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ + schedule()) + +#define wait_var_event(var, condition) \ +do { \ + might_sleep(); \ + if (condition) \ + break; \ + __wait_var_event(var, condition); \ +} while (0) + +#define __wait_var_event_killable(var, condition) \ + ___wait_var_event(var, condition, TASK_KILLABLE, 0, 0, \ + schedule()) + +#define wait_var_event_killable(var, condition) \ +({ \ + int __ret = 0; \ + might_sleep(); \ + if (!(condition)) \ + __ret = __wait_var_event_killable(var, condition); \ + __ret; \ +}) + +#define __wait_var_event_timeout(var, condition, timeout) \ + ___wait_var_event(var, ___wait_cond_timeout1(condition), \ + TASK_UNINTERRUPTIBLE, 0, timeout, \ + __ret = schedule_timeout(__ret)) + +#define wait_var_event_timeout(var, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout1(condition)) \ + __ret = __wait_var_event_timeout(var, condition, timeout); \ + __ret; \ +}) +#else /* !HAVE_WAIT_VAR_EVENT */ +/* linux-3.10.0-1062.el7 defines wait_var_event_timeout() using + * __wait_cond_timeout(), but doesn't define __wait_cond_timeout !!! + */ +# ifndef __wait_cond_timeout +# define ___wait_cond_timeout(condition) \ +({ \ + bool __cond = (condition); \ + if (__cond && !__ret) \ + __ret = 1; \ + __cond || !__ret; \ +}) +# endif /* __wait_cond_timeout */ + +#endif /* ! HAVE_WAIT_VAR_EVENT */ + +/* + * prepare_to_wait_event() does not support an exclusive + * lifo wait. + * However it will not relink the wait_queue_entry if + * it is already linked. So we link to the head of the + * queue here, and it will stay there. + */ +static inline void prepare_to_wait_exclusive_head( + wait_queue_head_t *waitq, wait_queue_entry_t *link) +{ + unsigned long flags; + + spin_lock_irqsave(&(waitq->lock), flags); +#ifdef HAVE_WAIT_QUEUE_ENTRY_LIST + if (list_empty(&link->entry)) +#else + if (list_empty(&link->task_list)) +#endif + __add_wait_queue_exclusive(waitq, link); + spin_unlock_irqrestore(&((waitq)->lock), flags); +} + +#ifndef ___wait_event +/* + * The below macro ___wait_event() has an explicit shadow of the __ret + * variable when used from the wait_event_*() macros. + * + * This is so that both can use the ___wait_cond_timeout1() construct + * to wrap the condition. + * + * The type inconsistency of the wait_event_*() __ret variable is also + * on purpose; we use long where we can return timeout values and int + * otherwise. 
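+ *
+ * For example, the wait_event_idle_timeout() wrappers below seed __ret
+ * with the caller's timeout and let the cmd step rewrite it via
+ * __ret = schedule_timeout(__ret), while the non-timeout waiters pass 0.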
+ */ + +#define ___wait_event(wq_head, condition, state, exclusive, ret, cmd) \ +({ \ + __label__ __out; \ + wait_queue_entry_t __wq_entry; \ + long __ret = ret; /* explicit shadow */ \ + \ + init_wait(&__wq_entry); \ + if (exclusive) \ + __wq_entry.flags = WQ_FLAG_EXCLUSIVE; \ + for (;;) { \ + long __int = prepare_to_wait_event(&wq_head, \ + &__wq_entry, state); \ + \ + if (condition) \ + break; \ + \ + if (___wait_is_interruptible(state) && __int) { \ + __ret = __int; \ + goto __out; \ + } \ + \ + cmd; \ + } \ + finish_wait(&wq_head, &__wq_entry); \ +__out: __ret; \ +}) +#endif + +#ifndef TASK_NOLOAD + +#define TASK_IDLE TASK_INTERRUPTIBLE + +#define ___wait_event_idle(wq_head, condition, exclusive, ret, cmd) \ +({ \ + wait_queue_entry_t __wq_entry; \ + unsigned long flags; \ + long __ret = ret; /* explicit shadow */ \ + sigset_t __old_blocked, __new_blocked; \ + \ + siginitset(&__new_blocked, LUSTRE_FATAL_SIGS); \ + sigprocmask(0, &__new_blocked, &__old_blocked); \ + init_wait(&__wq_entry); \ + if (exclusive) \ + __wq_entry.flags = WQ_FLAG_EXCLUSIVE; \ + for (;;) { \ + prepare_to_wait_event(&wq_head, \ + &__wq_entry, \ + TASK_INTERRUPTIBLE); \ + \ + if (condition) \ + break; \ + /* We have to do this here because some signals */ \ + /* are not blockable - ie from strace(1). */ \ + /* In these cases we want to schedule_timeout() */ \ + /* again, because we don't want that to return */ \ + /* -EINTR when the RPC actually succeeded. */ \ + /* the recalc_sigpending() below will deliver the */ \ + /* signal properly. */ \ + if (signal_pending(current)) { \ + spin_lock_irqsave(¤t->sighand->siglock, \ + flags); \ + clear_tsk_thread_flag(current, TIF_SIGPENDING); \ + spin_unlock_irqrestore(¤t->sighand->siglock,\ + flags); \ + } \ + cmd; \ + } \ + finish_wait(&wq_head, &__wq_entry); \ + sigprocmask(SIG_SETMASK, &__old_blocked, NULL); \ + __ret; \ +}) + +#define wait_event_idle(wq_head, condition) \ +do { \ + might_sleep(); \ + if (!(condition)) \ + ___wait_event_idle(wq_head, condition, 0, 0, schedule());\ +} while (0) + +#define wait_event_idle_exclusive(wq_head, condition) \ +do { \ + might_sleep(); \ + if (!(condition)) \ + ___wait_event_idle(wq_head, condition, 1, 0, schedule());\ +} while (0) + +#define __wait_event_idle_exclusive_timeout(wq_head, condition, timeout)\ + ___wait_event_idle(wq_head, ___wait_cond_timeout1(condition), \ + 1, timeout, \ + __ret = schedule_timeout(__ret)) + +#define wait_event_idle_exclusive_timeout(wq_head, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout1(condition)) \ + __ret = __wait_event_idle_exclusive_timeout( \ + wq_head, condition, timeout); \ + __ret; \ +}) + +#define __wait_event_idle_exclusive_timeout_cmd(wq_head, condition, \ + timeout, cmd1, cmd2) \ + ___wait_event_idle(wq_head, ___wait_cond_timeout1(condition), \ + 1, timeout, \ + cmd1; __ret = schedule_timeout(__ret); cmd2) + +#define wait_event_idle_exclusive_timeout_cmd(wq_head, condition, timeout,\ + cmd1, cmd2) \ +({ \ + long __ret = timeout; \ + if (!___wait_cond_timeout1(condition)) \ + __ret = __wait_event_idle_exclusive_timeout_cmd( \ + wq_head, condition, timeout, cmd1, cmd2); \ + __ret; \ +}) + +#define __wait_event_idle_timeout(wq_head, condition, timeout) \ + ___wait_event_idle(wq_head, ___wait_cond_timeout1(condition), \ + 0, timeout, \ + __ret = schedule_timeout(__ret)) + +#define wait_event_idle_timeout(wq_head, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout1(condition)) \ + 
__ret = __wait_event_idle_timeout(wq_head, condition, \ + timeout); \ + __ret; \ +}) + +#else /* TASK_IDLE */ +#ifndef wait_event_idle +/** + * wait_event_idle - wait for a condition without contributing to system load + * @wq_head: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * + * The process is put to sleep (TASK_IDLE) until the + * @condition evaluates to true. + * The @condition is checked each time the waitqueue @wq_head is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + */ +#define wait_event_idle(wq_head, condition) \ +do { \ + might_sleep(); \ + if (!(condition)) \ + ___wait_event(wq_head, condition, TASK_IDLE, 0, 0, \ + schedule()); \ +} while (0) +#endif +#ifndef wait_event_idle_exclusive +/** + * wait_event_idle_exclusive - wait for a condition without contributing to + * system load + * @wq_head: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * + * The process is put to sleep (TASK_IDLE) until the + * @condition evaluates to true. + * The @condition is checked each time the waitqueue @wq_head is woken up. + * + * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag + * set thus if other processes wait on the same list, when this + * process is woken further processes are not considered. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + */ +#define wait_event_idle_exclusive(wq_head, condition) \ +do { \ + might_sleep(); \ + if (!(condition)) \ + ___wait_event(wq_head, condition, TASK_IDLE, 1, 0, \ + schedule()); \ +} while (0) +#endif +#ifndef wait_event_idle_exclusive_timeout +/** + * wait_event_idle_exclusive_timeout - sleep without load until a condition + * becomes true or a timeout elapses + * @wq_head: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @timeout: timeout, in jiffies + * + * The process is put to sleep (TASK_IDLE) until the + * @condition evaluates to true. The @condition is checked each time + * the waitqueue @wq_head is woken up. + * + * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag + * set thus if other processes wait on the same list, when this + * process is woken further processes are not considered. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + * Returns: + * 0 if the @condition evaluated to %false after the @timeout elapsed, + * 1 if the @condition evaluated to %true after the @timeout elapsed, + * or the remaining jiffies (at least 1) if the @condition evaluated + * to %true before the @timeout elapsed. 
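+ *
+ * Illustrative use (the waitqueue and flag names here are hypothetical):
+ *
+ *	left = wait_event_idle_exclusive_timeout(req_waitq,
+ *						 READ_ONCE(req_done), 5 * HZ);
+ *
+ * where left == 0 means @condition was still false after five seconds.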
+ */ +#define wait_event_idle_exclusive_timeout(wq_head, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout1(condition)) \ + __ret = __wait_event_idle_exclusive_timeout(wq_head, \ + condition, \ + timeout); \ + __ret; \ +}) +#endif +#ifndef wait_event_idle_exclusive_timeout_cmd +#define __wait_event_idle_exclusive_timeout_cmd(wq_head, condition, \ + timeout, cmd1, cmd2) \ + ___wait_event(wq_head, ___wait_cond_timeout1(condition), \ + TASK_IDLE, 1, timeout, \ + cmd1; __ret = schedule_timeout(__ret); cmd2) + +#define wait_event_idle_exclusive_timeout_cmd(wq_head, condition, timeout,\ + cmd1, cmd2) \ +({ \ + long __ret = timeout; \ + if (!___wait_cond_timeout1(condition)) \ + __ret = __wait_event_idle_exclusive_timeout_cmd( \ + wq_head, condition, timeout, cmd1, cmd2); \ + __ret; \ +}) +#endif + +#ifndef wait_event_idle_timeout + +#define __wait_event_idle_timeout(wq_head, condition, timeout) \ + ___wait_event(wq_head, ___wait_cond_timeout1(condition), \ + TASK_IDLE, 0, timeout, \ + __ret = schedule_timeout(__ret)) + +/** + * wait_event_idle_timeout - sleep without load until a condition becomes + * true or a timeout elapses + * @wq_head: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @timeout: timeout, in jiffies + * + * The process is put to sleep (TASK_IDLE) until the + * @condition evaluates to true. The @condition is checked each time + * the waitqueue @wq_head is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + * Returns: + * 0 if the @condition evaluated to %false after the @timeout elapsed, + * 1 if the @condition evaluated to %true after the @timeout elapsed, + * or the remaining jiffies (at least 1) if the @condition evaluated + * to %true before the @timeout elapsed. 
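+ *
+ * Illustrative use (hypothetical names): a service thread polling a
+ * work queue once a second without inflating the load average:
+ *
+ *	rc = wait_event_idle_timeout(svc_waitq,
+ *				     !list_empty(&svc->queue), HZ);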
+ */ +#define wait_event_idle_timeout(wq_head, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout1(condition)) \ + __ret = __wait_event_idle_timeout(wq_head, condition, \ + timeout); \ + __ret; \ +}) +#endif +#endif /* TASK_IDLE */ + +/* ___wait_event_lifo is used for lifo exclusive 'idle' waits */ +#ifdef TASK_NOLOAD + +#define ___wait_event_lifo(wq_head, condition, ret, cmd) \ +({ \ + wait_queue_entry_t __wq_entry; \ + long __ret = ret; /* explicit shadow */ \ + \ + init_wait(&__wq_entry); \ + __wq_entry.flags = WQ_FLAG_EXCLUSIVE; \ + for (;;) { \ + prepare_to_wait_exclusive_head(&wq_head, &__wq_entry); \ + prepare_to_wait_event(&wq_head, &__wq_entry, TASK_IDLE);\ + \ + if (condition) \ + break; \ + \ + cmd; \ + } \ + finish_wait(&wq_head, &__wq_entry); \ + __ret; \ +}) +#else +#define ___wait_event_lifo(wq_head, condition, ret, cmd) \ +({ \ + wait_queue_entry_t __wq_entry; \ + unsigned long flags; \ + long __ret = ret; /* explicit shadow */ \ + sigset_t __old_blocked, __new_blocked; \ + \ + siginitset(&__new_blocked, LUSTRE_FATAL_SIGS); \ + sigprocmask(0, &__new_blocked, &__old_blocked); \ + init_wait(&__wq_entry); \ + __wq_entry.flags = WQ_FLAG_EXCLUSIVE; \ + for (;;) { \ + prepare_to_wait_exclusive_head(&wq_head, &__wq_entry); \ + prepare_to_wait_event(&wq_head, &__wq_entry, \ + TASK_INTERRUPTIBLE); \ + \ + if (condition) \ + break; \ + /* See justification in ___wait_event_idle */ \ + if (signal_pending(current)) { \ + spin_lock_irqsave(¤t->sighand->siglock, \ + flags); \ + clear_tsk_thread_flag(current, TIF_SIGPENDING); \ + spin_unlock_irqrestore(¤t->sighand->siglock,\ + flags); \ + } \ + cmd; \ + } \ + sigprocmask(SIG_SETMASK, &__old_blocked, NULL); \ + finish_wait(&wq_head, &__wq_entry); \ + __ret; \ +}) +#endif + +#define wait_event_idle_exclusive_lifo(wq_head, condition) \ +do { \ + might_sleep(); \ + if (!(condition)) \ + ___wait_event_lifo(wq_head, condition, 0, schedule()); \ +} while (0) + +#define __wait_event_idle_lifo_timeout(wq_head, condition, timeout) \ + ___wait_event_lifo(wq_head, ___wait_cond_timeout1(condition), \ + timeout, \ + __ret = schedule_timeout(__ret)) + +#define wait_event_idle_exclusive_lifo_timeout(wq_head, condition, timeout)\ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout1(condition)) \ + __ret = __wait_event_idle_lifo_timeout(wq_head, \ + condition, \ + timeout); \ + __ret; \ +}) + +/* l_wait_event_abortable() is a bit like wait_event_killable() + * except there is a fixed set of signals which will abort: + * LUSTRE_FATAL_SIGS + */ +#define LUSTRE_FATAL_SIGS \ + (sigmask(SIGKILL) | sigmask(SIGINT) | sigmask(SIGTERM) | \ + sigmask(SIGQUIT) | sigmask(SIGALRM)) + +#define l_wait_event_abortable(wq, condition) \ +({ \ + sigset_t __new_blocked, __old_blocked; \ + int __ret = 0; \ + siginitsetinv(&__new_blocked, LUSTRE_FATAL_SIGS); \ + sigprocmask(SIG_BLOCK, &__new_blocked, &__old_blocked); \ + __ret = wait_event_interruptible(wq, condition); \ + sigprocmask(SIG_SETMASK, &__old_blocked, NULL); \ + __ret; \ +}) + +#define l_wait_event_abortable_timeout(wq, condition, timeout) \ +({ \ + sigset_t __new_blocked, __old_blocked; \ + int __ret = 0; \ + siginitsetinv(&__new_blocked, LUSTRE_FATAL_SIGS); \ + sigprocmask(SIG_BLOCK, &__new_blocked, &__old_blocked); \ + __ret = wait_event_interruptible_timeout(wq, condition, timeout);\ + sigprocmask(SIG_SETMASK, &__old_blocked, NULL); \ + __ret; \ +}) + +#define l_wait_event_abortable_exclusive(wq, condition) \ +({ \ + sigset_t 
__new_blocked, __old_blocked; \
+ int __ret = 0; \
+ siginitsetinv(&__new_blocked, LUSTRE_FATAL_SIGS); \
+ sigprocmask(SIG_BLOCK, &__new_blocked, &__old_blocked); \
+ __ret = wait_event_interruptible_exclusive(wq, condition); \
+ sigprocmask(SIG_SETMASK, &__old_blocked, NULL); \
+ __ret; \
+})
+
+#ifndef HAVE_WAIT_WOKEN
+#define WQ_FLAG_WOKEN 0x02
+long wait_woken(wait_queue_entry_t *wait, unsigned int mode, long timeout);
+int woken_wake_function(wait_queue_entry_t *wait, unsigned int mode,
+ int sync, void *key);
+#endif /* HAVE_WAIT_WOKEN */
+
+#endif /* __LIBCFS_LINUX_WAIT_BIT_H */
diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/processor.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/processor.h
new file mode 100644
index 0000000000000..700d01e53db40
--- /dev/null
+++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/processor.h
@@ -0,0 +1,79 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Misc low level processor primitives */
+#ifndef _LINUX_PROCESSOR_H
+#define _LINUX_PROCESSOR_H
+
+#include
+
+/*
+ * spin_begin is used before beginning a busy-wait loop, and must be paired
+ * with spin_end when the loop is exited. spin_cpu_relax must be called
+ * within the loop.
+ *
+ * The loop body should be as small and fast as possible, on the order of
+ * tens of instructions/cycles as a guide. It should avoid calling
+ * cpu_relax, or any "spin" or sleep type of primitive including nested uses
+ * of these primitives. It should not lock or take any other resource.
+ * Violations of these guidelines will not cause a bug, but may cause
+ * suboptimal performance.
+ *
+ * These loops are optimized to be used where wait times are expected to be
+ * less than the cost of a context switch (and associated overhead).
+ *
+ * Detection of resource owner and decision to spin or sleep or guest-yield
+ * (e.g., spin lock holder vcpu preempted, or mutex owner not on CPU) can be
+ * tested within the loop body.
+ */
+#ifndef spin_begin
+#ifdef CONFIG_PPC64
+#define spin_begin() HMT_low()
+#else
+#define spin_begin()
+#endif /* CONFIG_PPC64 */
+#endif /* spin_begin */
+
+#ifndef spin_cpu_relax
+#define spin_cpu_relax() cpu_relax()
+#endif
+
+/*
+ * spin_cpu_yield may be called to yield (undirected) to the hypervisor if
+ * necessary. This should be used if the wait is expected to take longer
+ * than context switch overhead, but we can't sleep or do a directed yield.
+ */
+#ifndef spin_cpu_yield
+#define spin_cpu_yield() cpu_relax_yield()
+#endif
+
+#ifndef spin_end
+#ifdef CONFIG_PPC64
+#define spin_end() HMT_medium()
+#else
+#define spin_end()
+#endif /* CONFIG_PPC64 */
+#endif /* spin_end */
+
+/*
+ * spin_until_cond can be used to wait for a condition to become true. It
+ * may be expected that the first iteration will be true in the common case
+ * (no spinning), so that callers should not require a first "likely" test
+ * for the uncontended case before using this primitive.
+ *
+ * Usage and implementation guidelines are the same as for the spin_begin
+ * primitives, above.
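+ *
+ * Illustrative use (hypothetical flag written by another CPU):
+ *
+ *	spin_until_cond(READ_ONCE(msg->ready));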
+ */
+#ifndef spin_until_cond
+#define spin_until_cond(cond)					\
+do {								\
+	if (unlikely(!(cond))) {				\
+		spin_begin();					\
+		do {						\
+			spin_cpu_relax();			\
+		} while (!(cond));				\
+		spin_end();					\
+	}							\
+} while (0)
+
+#endif
+
+#endif /* _LINUX_PROCESSOR_H */
diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/refcount.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/refcount.h
new file mode 100644
index 0000000000000..ecbf38561372f
--- /dev/null
+++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/refcount.h
@@ -0,0 +1,40 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+#ifndef __LIBCFS_LINUX_REFCOUNT_H__
+#define __LIBCFS_LINUX_REFCOUNT_H__
+
+#include <linux/atomic.h>
+
+#ifndef HAVE_REFCOUNT_T
+
+#define refcount_t		atomic_t
+
+#define refcount_set		atomic_set
+#define refcount_inc		atomic_inc
+#define refcount_inc_not_zero	atomic_inc_not_zero
+#define refcount_dec		atomic_dec
+#define refcount_dec_and_test	atomic_dec_and_test
+#define refcount_read		atomic_read
+
+#endif
+
+#endif /* __LIBCFS_LINUX_REFCOUNT_H__ */
diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/xarray.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/xarray.h
new file mode 100644
index 0000000000000..74397ab3a080d
--- /dev/null
+++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/xarray.h
@@ -0,0 +1,1766 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+#ifndef _LINUX_XARRAY_H
+#define _LINUX_XARRAY_H
+/*
+ * eXtensible Arrays
+ * Copyright (c) 2017 Microsoft Corporation
+ * Author: Matthew Wilcox <willy@infradead.org>
+ *
+ * This is taken from kernel commit:
+ *
+ *   7b785645e ("mm: fix page cache convergence regression")
+ *
+ * at kernel version 5.2-rc2
+ *
+ * See Documentation/core-api/xarray.rst for how to use the XArray.
+ */
+#ifndef HAVE_XARRAY_SUPPORT
+
+#if defined(NEED_LOCKDEP_IS_HELD_DISCARD_CONST) \
+ && defined(CONFIG_LOCKDEP) \
+ && defined(lockdep_is_held)
+#undef lockdep_is_held
+	#define lockdep_is_held(lock) \
+		lock_is_held((struct lockdep_map *)&(lock)->dep_map)
+#endif
+
+#include <linux/bug.h>
+#include <linux/compiler.h>
+#include <linux/gfp.h>
+#include <linux/kconfig.h>
+#include <linux/kernel.h>
+#include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+/*
+ * The bottom two bits of the entry determine how the XArray interprets
+ * the contents:
+ *
+ * 00: Pointer entry
+ * 10: Internal entry
+ * x1: Value entry or tagged pointer
+ *
+ * Attempting to store internal entries in the XArray is a bug.
+ *
+ * Most internal entries are pointers to the next node in the tree.
+ * The following internal entries have a special meaning:
+ *
+ * 0-62: Sibling entries
+ * 256: Zero entry
+ * 257: Retry entry
+ *
+ * Errors are also represented as internal entries, but use the negative
+ * space (-4094 to -2). They're never stored in the slots array; only
+ * returned by the normal API.
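+ *
+ * As a worked example of the encoding above: xa_mk_value(5) stores the
+ * word (5 << 1) | 1 == 0xb; bit 0 marks it as a value entry, and
+ * xa_to_value() recovers 5 with a single right shift.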
+ */ + +#define BITS_PER_XA_VALUE (BITS_PER_LONG - 1) + +/** + * xa_mk_value() - Create an XArray entry from an integer. + * @v: Value to store in XArray. + * + * Context: Any context. + * Return: An entry suitable for storing in the XArray. + */ +static inline void *xa_mk_value(unsigned long v) +{ + WARN_ON((long)v < 0); + return (void *)((v << 1) | 1); +} + +/** + * xa_to_value() - Get value stored in an XArray entry. + * @entry: XArray entry. + * + * Context: Any context. + * Return: The value stored in the XArray entry. + */ +static inline unsigned long xa_to_value(const void *entry) +{ + return (unsigned long)entry >> 1; +} + +/** + * xa_is_value() - Determine if an entry is a value. + * @entry: XArray entry. + * + * Context: Any context. + * Return: True if the entry is a value, false if it is a pointer. + */ +static inline bool xa_is_value(const void *entry) +{ + return (unsigned long)entry & 1; +} + +/** + * xa_tag_pointer() - Create an XArray entry for a tagged pointer. + * @p: Plain pointer. + * @tag: Tag value (0, 1 or 3). + * + * If the user of the XArray prefers, they can tag their pointers instead + * of storing value entries. Three tags are available (0, 1 and 3). + * These are distinct from the xa_mark_t as they are not replicated up + * through the array and cannot be searched for. + * + * Context: Any context. + * Return: An XArray entry. + */ +static inline void *xa_tag_pointer(void *p, unsigned long tag) +{ + return (void *)((unsigned long)p | tag); +} + +/** + * xa_untag_pointer() - Turn an XArray entry into a plain pointer. + * @entry: XArray entry. + * + * If you have stored a tagged pointer in the XArray, call this function + * to get the untagged version of the pointer. + * + * Context: Any context. + * Return: A pointer. + */ +static inline void *xa_untag_pointer(void *entry) +{ + return (void *)((unsigned long)entry & ~3UL); +} + +/** + * xa_pointer_tag() - Get the tag stored in an XArray entry. + * @entry: XArray entry. + * + * If you have stored a tagged pointer in the XArray, call this function + * to get the tag of that pointer. + * + * Context: Any context. + * Return: A tag. + */ +static inline unsigned int xa_pointer_tag(void *entry) +{ + return (unsigned long)entry & 3UL; +} + +/* + * xa_mk_internal() - Create an internal entry. + * @v: Value to turn into an internal entry. + * + * Internal entries are used for a number of purposes. Entries 0-255 are + * used for sibling entries (only 0-62 are used by the current code). 256 + * is used for the retry entry. 257 is used for the reserved / zero entry. + * Negative internal entries are used to represent errnos. Node pointers + * are also tagged as internal entries in some situations. + * + * Context: Any context. + * Return: An XArray internal entry corresponding to this value. + */ +static inline void *xa_mk_internal(unsigned long v) +{ + return (void *)((v << 2) | 2); +} + +/* + * xa_to_internal() - Extract the value from an internal entry. + * @entry: XArray entry. + * + * Context: Any context. + * Return: The value which was stored in the internal entry. + */ +static inline unsigned long xa_to_internal(const void *entry) +{ + return (unsigned long)entry >> 2; +} + +/* + * xa_is_internal() - Is the entry an internal entry? + * @entry: XArray entry. + * + * Context: Any context. + * Return: %true if the entry is an internal entry. 
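+ *
+ * For instance, xa_mk_internal(257) encodes the word (257 << 2) | 2,
+ * whose bottom two bits are 10, so xa_is_internal() returns true for
+ * the zero entry.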
+ */ +static inline bool xa_is_internal(const void *entry) +{ + return ((unsigned long)entry & 3) == 2; +} + +#define XA_ZERO_ENTRY xa_mk_internal(257) + +/** + * xa_is_zero() - Is the entry a zero entry? + * @entry: Entry retrieved from the XArray + * + * The normal API will return NULL as the contents of a slot containing + * a zero entry. You can only see zero entries by using the advanced API. + * + * Return: %true if the entry is a zero entry. + */ +static inline bool xa_is_zero(const void *entry) +{ + return unlikely(entry == XA_ZERO_ENTRY); +} + +/** + * xa_is_err() - Report whether an XArray operation returned an error + * @entry: Result from calling an XArray function + * + * If an XArray operation cannot complete an operation, it will return + * a special value indicating an error. This function tells you + * whether an error occurred; xa_err() tells you which error occurred. + * + * Context: Any context. + * Return: %true if the entry indicates an error. + */ +static inline bool xa_is_err(const void *entry) +{ + return unlikely(xa_is_internal(entry) && + entry >= xa_mk_internal(-MAX_ERRNO)); +} + +/** + * xa_err() - Turn an XArray result into an errno. + * @entry: Result from calling an XArray function. + * + * If an XArray operation cannot complete an operation, it will return + * a special pointer value which encodes an errno. This function extracts + * the errno from the pointer value, or returns 0 if the pointer does not + * represent an errno. + * + * Context: Any context. + * Return: A negative errno or 0. + */ +static inline int xa_err(void *entry) +{ + /* xa_to_internal() would not do sign extension. */ + if (xa_is_err(entry)) + return (long)entry >> 2; + return 0; +} + +/** + * struct xa_limit - Represents a range of IDs. + * @min: The lowest ID to allocate (inclusive). + * @max: The maximum ID to allocate (inclusive). + * + * This structure is used either directly or via the XA_LIMIT() macro + * to communicate the range of IDs that are valid for allocation. + * Two common ranges are predefined for you: + * * xa_limit_32b - [0 - UINT_MAX] + * * xa_limit_31b - [0 - INT_MAX] + */ +struct xa_limit { + u32 max; + u32 min; +}; + +#define XA_LIMIT(_min, _max) (struct xa_limit) { .min = _min, .max = _max } + +#define xa_limit_32b XA_LIMIT(0, UINT_MAX) +#define xa_limit_31b XA_LIMIT(0, INT_MAX) + +typedef unsigned __bitwise xa_mark_t; +#define XA_MARK_0 ((__force xa_mark_t)0U) +#define XA_MARK_1 ((__force xa_mark_t)1U) +#define XA_MARK_2 ((__force xa_mark_t)2U) +#define XA_PRESENT ((__force xa_mark_t)8U) +#define XA_MARK_MAX XA_MARK_2 +#define XA_FREE_MARK XA_MARK_0 + +enum xa_lock_type { + XA_LOCK_IRQ = 1, + XA_LOCK_BH = 2, +}; + +/* + * Values for xa_flags. The radix tree stores its GFP flags in the xa_flags, + * and we remain compatible with that. + */ +#define XA_FLAGS_LOCK_IRQ ((__force gfp_t)XA_LOCK_IRQ) +#define XA_FLAGS_LOCK_BH ((__force gfp_t)XA_LOCK_BH) +#define XA_FLAGS_TRACK_FREE ((__force gfp_t)4U) +#define XA_FLAGS_ZERO_BUSY ((__force gfp_t)8U) +#define XA_FLAGS_ALLOC_WRAPPED ((__force gfp_t)16U) +#define XA_FLAGS_ACCOUNT ((__force gfp_t)32U) +#define XA_FLAGS_MARK(mark) ((__force gfp_t)((1U << __GFP_BITS_SHIFT) << \ + (__force unsigned)(mark))) + +/* ALLOC is for a normal 0-based alloc. ALLOC1 is for an 1-based alloc */ +#define XA_FLAGS_ALLOC (XA_FLAGS_TRACK_FREE | XA_FLAGS_MARK(XA_FREE_MARK)) +#define XA_FLAGS_ALLOC1 (XA_FLAGS_TRACK_FREE | XA_FLAGS_ZERO_BUSY) + +/** + * struct xarray - The anchor of the XArray. 
+ * @xa_lock: Lock that protects the contents of the XArray. + * + * To use the xarray, define it statically or embed it in your data structure. + * It is a very small data structure, so it does not usually make sense to + * allocate it separately and keep a pointer to it in your data structure. + * + * You may use the xa_lock to protect your own data structures as well. + */ +/* + * If all of the entries in the array are NULL, @xa_head is a NULL pointer. + * If the only non-NULL entry in the array is at index 0, @xa_head is that + * entry. If any other entry in the array is non-NULL, @xa_head points + * to an @xa_node. + */ +struct xarray { + spinlock_t xa_lock; +/* private: The rest of the data structure is not to be used directly. */ + gfp_t xa_flags; + void __rcu *xa_head; +}; + +#define XARRAY_INIT(name, flags) { \ + .xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock), \ + .xa_flags = flags, \ + .xa_head = NULL, \ +} + +/** + * DEFINE_XARRAY_FLAGS() - Define an XArray with custom flags. + * @name: A string that names your XArray. + * @flags: XA_FLAG values. + * + * This is intended for file scope definitions of XArrays. It declares + * and initialises an empty XArray with the chosen name and flags. It is + * equivalent to calling xa_init_flags() on the array, but it does the + * initialisation at compiletime instead of runtime. + */ +#define DEFINE_XARRAY_FLAGS(name, flags) \ + struct xarray name = XARRAY_INIT(name, flags) + +/** + * DEFINE_XARRAY() - Define an XArray. + * @name: A string that names your XArray. + * + * This is intended for file scope definitions of XArrays. It declares + * and initialises an empty XArray with the chosen name. It is equivalent + * to calling xa_init() on the array, but it does the initialisation at + * compiletime instead of runtime. + */ +#define DEFINE_XARRAY(name) DEFINE_XARRAY_FLAGS(name, 0) + +/** + * DEFINE_XARRAY_ALLOC() - Define an XArray which allocates IDs starting at 0. + * @name: A string that names your XArray. + * + * This is intended for file scope definitions of allocating XArrays. + * See also DEFINE_XARRAY(). + */ +#define DEFINE_XARRAY_ALLOC(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC) + +/** + * DEFINE_XARRAY_ALLOC1() - Define an XArray which allocates IDs starting at 1. + * @name: A string that names your XArray. + * + * This is intended for file scope definitions of allocating XArrays. + * See also DEFINE_XARRAY(). + */ +#define DEFINE_XARRAY_ALLOC1(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC1) + +void *xa_load(struct xarray *, unsigned long index); +void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); +void *xa_erase(struct xarray *, unsigned long index); +void *xa_store_range(struct xarray *, unsigned long first, unsigned long last, + void *entry, gfp_t); +bool xa_get_mark(struct xarray *, unsigned long index, xa_mark_t); +void xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); +void xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); +void *xa_find(struct xarray *xa, unsigned long *index, + unsigned long max, xa_mark_t) __attribute__((nonnull(2))); +void *xa_find_after(struct xarray *xa, unsigned long *index, + unsigned long max, xa_mark_t) __attribute__((nonnull(2))); +unsigned int xa_extract(struct xarray *, void **dst, unsigned long start, + unsigned long max, unsigned int n, xa_mark_t); +void xa_destroy(struct xarray *); + +/** + * xa_init_flags() - Initialise an empty XArray with flags. + * @xa: XArray. + * @flags: XA_FLAG values. 
+ * + * If you need to initialise an XArray with special flags (eg you need + * to take the lock from interrupt context), use this function instead + * of xa_init(). + * + * Context: Any context. + */ +static inline void xa_init_flags(struct xarray *xa, gfp_t flags) +{ + spin_lock_init(&xa->xa_lock); + xa->xa_flags = flags; + xa->xa_head = NULL; +} + +/** + * xa_init() - Initialise an empty XArray. + * @xa: XArray. + * + * An empty XArray is full of NULL entries. + * + * Context: Any context. + */ +static inline void xa_init(struct xarray *xa) +{ + xa_init_flags(xa, 0); +} + +/** + * xa_empty() - Determine if an array has any present entries. + * @xa: XArray. + * + * Context: Any context. + * Return: %true if the array contains only NULL pointers. + */ +static inline bool xa_empty(const struct xarray *xa) +{ + return xa->xa_head == NULL; +} + +/** + * xa_marked() - Inquire whether any entry in this array has a mark set + * @xa: Array + * @mark: Mark value + * + * Context: Any context. + * Return: %true if any entry has this mark set. + */ +static inline bool xa_marked(const struct xarray *xa, xa_mark_t mark) +{ + return xa->xa_flags & XA_FLAGS_MARK(mark); +} + +/** + * xa_for_each_start() - Iterate over a portion of an XArray. + * @xa: XArray. + * @index: Index of @entry. + * @entry: Entry retrieved from array. + * @start: First index to retrieve from array. + * + * During the iteration, @entry will have the value of the entry stored + * in @xa at @index. You may modify @index during the iteration if you + * want to skip or reprocess indices. It is safe to modify the array + * during the iteration. At the end of the iteration, @entry will be set + * to NULL and @index will have a value less than or equal to max. + * + * xa_for_each_start() is O(n.log(n)) while xas_for_each() is O(n). You have + * to handle your own locking with xas_for_each(), and if you have to unlock + * after each iteration, it will also end up being O(n.log(n)). + * xa_for_each_start() will spin if it hits a retry entry; if you intend to + * see retry entries, you should use the xas_for_each() iterator instead. + * The xas_for_each() iterator will expand into more inline code than + * xa_for_each_start(). + * + * Context: Any context. Takes and releases the RCU lock. + */ +#define xa_for_each_start(xa, index, entry, start) \ + for (index = start, \ + entry = xa_find(xa, &index, ULONG_MAX, XA_PRESENT); \ + entry; \ + entry = xa_find_after(xa, &index, ULONG_MAX, XA_PRESENT)) + +/** + * xa_for_each() - Iterate over present entries in an XArray. + * @xa: XArray. + * @index: Index of @entry. + * @entry: Entry retrieved from array. + * + * During the iteration, @entry will have the value of the entry stored + * in @xa at @index. You may modify @index during the iteration if you want + * to skip or reprocess indices. It is safe to modify the array during the + * iteration. At the end of the iteration, @entry will be set to NULL and + * @index will have a value less than or equal to max. + * + * xa_for_each() is O(n.log(n)) while xas_for_each() is O(n). You have + * to handle your own locking with xas_for_each(), and if you have to unlock + * after each iteration, it will also end up being O(n.log(n)). xa_for_each() + * will spin if it hits a retry entry; if you intend to see retry entries, + * you should use the xas_for_each() iterator instead. The xas_for_each() + * iterator will expand into more inline code than xa_for_each(). + * + * Context: Any context. Takes and releases the RCU lock. 
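+ *
+ * A typical traversal looks like the sketch below ("array" is the
+ * caller's XArray and process() stands for hypothetical per-entry work):
+ *
+ *	unsigned long index;
+ *	void *entry;
+ *
+ *	xa_for_each(&array, index, entry)
+ *		process(index, entry);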
+ */ +#define xa_for_each(xa, index, entry) \ + xa_for_each_start(xa, index, entry, 0) + +/** + * xa_for_each_marked() - Iterate over marked entries in an XArray. + * @xa: XArray. + * @index: Index of @entry. + * @entry: Entry retrieved from array. + * @filter: Selection criterion. + * + * During the iteration, @entry will have the value of the entry stored + * in @xa at @index. The iteration will skip all entries in the array + * which do not match @filter. You may modify @index during the iteration + * if you want to skip or reprocess indices. It is safe to modify the array + * during the iteration. At the end of the iteration, @entry will be set to + * NULL and @index will have a value less than or equal to max. + * + * xa_for_each_marked() is O(n.log(n)) while xas_for_each_marked() is O(n). + * You have to handle your own locking with xas_for_each(), and if you have + * to unlock after each iteration, it will also end up being O(n.log(n)). + * xa_for_each_marked() will spin if it hits a retry entry; if you intend to + * see retry entries, you should use the xas_for_each_marked() iterator + * instead. The xas_for_each_marked() iterator will expand into more inline + * code than xa_for_each_marked(). + * + * Context: Any context. Takes and releases the RCU lock. + */ +#define xa_for_each_marked(xa, index, entry, filter) \ + for (index = 0, entry = xa_find(xa, &index, ULONG_MAX, filter); \ + entry; entry = xa_find_after(xa, &index, ULONG_MAX, filter)) + +#define xa_trylock(xa) spin_trylock(&(xa)->xa_lock) +#define xa_lock(xa) spin_lock(&(xa)->xa_lock) +#define xa_unlock(xa) spin_unlock(&(xa)->xa_lock) +#define xa_lock_bh(xa) spin_lock_bh(&(xa)->xa_lock) +#define xa_unlock_bh(xa) spin_unlock_bh(&(xa)->xa_lock) +#define xa_lock_irq(xa) spin_lock_irq(&(xa)->xa_lock) +#define xa_unlock_irq(xa) spin_unlock_irq(&(xa)->xa_lock) +#define xa_lock_irqsave(xa, flags) \ + spin_lock_irqsave(&(xa)->xa_lock, flags) +#define xa_unlock_irqrestore(xa, flags) \ + spin_unlock_irqrestore(&(xa)->xa_lock, flags) + +/* + * Versions of the normal API which require the caller to hold the + * xa_lock. If the GFP flags allow it, they will drop the lock to + * allocate memory, then reacquire it afterwards. These functions + * may also re-enable interrupts if the XArray flags indicate the + * locking should be interrupt safe. + */ +void *__xa_erase(struct xarray *, unsigned long index); +void *__xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); +void *__xa_cmpxchg(struct xarray *, unsigned long index, void *old, + void *entry, gfp_t); +int __must_check __xa_insert(struct xarray *, unsigned long index, + void *entry, gfp_t); +int __must_check __xa_alloc(struct xarray *, u32 *id, void *entry, + struct xa_limit, gfp_t); +int __must_check __xa_alloc_cyclic(struct xarray *, u32 *id, void *entry, + struct xa_limit, u32 *next, gfp_t); +void __xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); +void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); + +/** + * xa_store_bh() - Store this entry in the XArray. + * @xa: XArray. + * @index: Index into array. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * This function is like calling xa_store() except it disables softirqs + * while holding the array lock. + * + * Context: Any context. Takes and releases the xa_lock while + * disabling softirqs. + * Return: The entry which used to be at this index. 
+ */ +static inline void *xa_store_bh(struct xarray *xa, unsigned long index, + void *entry, gfp_t gfp) +{ + void *curr; + + xa_lock_bh(xa); + curr = __xa_store(xa, index, entry, gfp); + xa_unlock_bh(xa); + + return curr; +} + +/** + * xa_store_irq() - Store this entry in the XArray. + * @xa: XArray. + * @index: Index into array. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * This function is like calling xa_store() except it disables interrupts + * while holding the array lock. + * + * Context: Process context. Takes and releases the xa_lock while + * disabling interrupts. + * Return: The entry which used to be at this index. + */ +static inline void *xa_store_irq(struct xarray *xa, unsigned long index, + void *entry, gfp_t gfp) +{ + void *curr; + + xa_lock_irq(xa); + curr = __xa_store(xa, index, entry, gfp); + xa_unlock_irq(xa); + + return curr; +} + +/** + * xa_erase_bh() - Erase this entry from the XArray. + * @xa: XArray. + * @index: Index of entry. + * + * After this function returns, loading from @index will return %NULL. + * If the index is part of a multi-index entry, all indices will be erased + * and none of the entries will be part of a multi-index entry. + * + * Context: Any context. Takes and releases the xa_lock while + * disabling softirqs. + * Return: The entry which used to be at this index. + */ +static inline void *xa_erase_bh(struct xarray *xa, unsigned long index) +{ + void *entry; + + xa_lock_bh(xa); + entry = __xa_erase(xa, index); + xa_unlock_bh(xa); + + return entry; +} + +/** + * xa_erase_irq() - Erase this entry from the XArray. + * @xa: XArray. + * @index: Index of entry. + * + * After this function returns, loading from @index will return %NULL. + * If the index is part of a multi-index entry, all indices will be erased + * and none of the entries will be part of a multi-index entry. + * + * Context: Process context. Takes and releases the xa_lock while + * disabling interrupts. + * Return: The entry which used to be at this index. + */ +static inline void *xa_erase_irq(struct xarray *xa, unsigned long index) +{ + void *entry; + + xa_lock_irq(xa); + entry = __xa_erase(xa, index); + xa_unlock_irq(xa); + + return entry; +} + +/** + * xa_cmpxchg() - Conditionally replace an entry in the XArray. + * @xa: XArray. + * @index: Index into array. + * @old: Old value to test against. + * @entry: New value to place in array. + * @gfp: Memory allocation flags. + * + * If the entry at @index is the same as @old, replace it with @entry. + * If the return value is equal to @old, then the exchange was successful. + * + * Context: Any context. Takes and releases the xa_lock. May sleep + * if the @gfp flags permit. + * Return: The old value at this index or xa_err() if an error happened. + */ +static inline void *xa_cmpxchg(struct xarray *xa, unsigned long index, + void *old, void *entry, gfp_t gfp) +{ + void *curr; + + xa_lock(xa); + curr = __xa_cmpxchg(xa, index, old, entry, gfp); + xa_unlock(xa); + + return curr; +} + +/** + * xa_cmpxchg_bh() - Conditionally replace an entry in the XArray. + * @xa: XArray. + * @index: Index into array. + * @old: Old value to test against. + * @entry: New value to place in array. + * @gfp: Memory allocation flags. + * + * This function is like calling xa_cmpxchg() except it disables softirqs + * while holding the array lock. + * + * Context: Any context. Takes and releases the xa_lock while + * disabling softirqs. May sleep if the @gfp flags permit. + * Return: The old value at this index or xa_err() if an error happened. 
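+ *
+ * The usual calling pattern is sketched below (names are illustrative):
+ *
+ *	curr = xa_cmpxchg_bh(&array, index, old, new, GFP_KERNEL);
+ *	if (curr == old)
+ *		... the exchange took place ...
+ *	else if (xa_err(curr))
+ *		... an allocation or internal error occurred ...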
+ */ +static inline void *xa_cmpxchg_bh(struct xarray *xa, unsigned long index, + void *old, void *entry, gfp_t gfp) +{ + void *curr; + + xa_lock_bh(xa); + curr = __xa_cmpxchg(xa, index, old, entry, gfp); + xa_unlock_bh(xa); + + return curr; +} + +/** + * xa_cmpxchg_irq() - Conditionally replace an entry in the XArray. + * @xa: XArray. + * @index: Index into array. + * @old: Old value to test against. + * @entry: New value to place in array. + * @gfp: Memory allocation flags. + * + * This function is like calling xa_cmpxchg() except it disables interrupts + * while holding the array lock. + * + * Context: Process context. Takes and releases the xa_lock while + * disabling interrupts. May sleep if the @gfp flags permit. + * Return: The old value at this index or xa_err() if an error happened. + */ +static inline void *xa_cmpxchg_irq(struct xarray *xa, unsigned long index, + void *old, void *entry, gfp_t gfp) +{ + void *curr; + + xa_lock_irq(xa); + curr = __xa_cmpxchg(xa, index, old, entry, gfp); + xa_unlock_irq(xa); + + return curr; +} + +/** + * xa_insert() - Store this entry in the XArray unless another entry is + * already present. + * @xa: XArray. + * @index: Index into array. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * Inserting a NULL entry will store a reserved entry (like xa_reserve()) + * if no entry is present. Inserting will fail if a reserved entry is + * present, even though loading from this index will return NULL. + * + * Context: Any context. Takes and releases the xa_lock. May sleep if + * the @gfp flags permit. + * Return: 0 if the store succeeded. -EBUSY if another entry was present. + * -ENOMEM if memory could not be allocated. + */ +static inline int __must_check xa_insert(struct xarray *xa, + unsigned long index, void *entry, gfp_t gfp) +{ + int err; + + xa_lock(xa); + err = __xa_insert(xa, index, entry, gfp); + xa_unlock(xa); + + return err; +} + +/** + * xa_insert_bh() - Store this entry in the XArray unless another entry is + * already present. + * @xa: XArray. + * @index: Index into array. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * Inserting a NULL entry will store a reserved entry (like xa_reserve()) + * if no entry is present. Inserting will fail if a reserved entry is + * present, even though loading from this index will return NULL. + * + * Context: Any context. Takes and releases the xa_lock while + * disabling softirqs. May sleep if the @gfp flags permit. + * Return: 0 if the store succeeded. -EBUSY if another entry was present. + * -ENOMEM if memory could not be allocated. + */ +static inline int __must_check xa_insert_bh(struct xarray *xa, + unsigned long index, void *entry, gfp_t gfp) +{ + int err; + + xa_lock_bh(xa); + err = __xa_insert(xa, index, entry, gfp); + xa_unlock_bh(xa); + + return err; +} + +/** + * xa_insert_irq() - Store this entry in the XArray unless another entry is + * already present. + * @xa: XArray. + * @index: Index into array. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * Inserting a NULL entry will store a reserved entry (like xa_reserve()) + * if no entry is present. Inserting will fail if a reserved entry is + * present, even though loading from this index will return NULL. + * + * Context: Process context. Takes and releases the xa_lock while + * disabling interrupts. May sleep if the @gfp flags permit. + * Return: 0 if the store succeeded. -EBUSY if another entry was present. + * -ENOMEM if memory could not be allocated. 
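+ *
+ * Callers typically distinguish the two failure modes, for example
+ * (names are illustrative):
+ *
+ *	err = xa_insert_irq(&array, index, ptr, GFP_KERNEL);
+ *	if (err == -EBUSY)
+ *		... the index was already occupied or reserved ...
+ *	else if (err == -ENOMEM)
+ *		... memory allocation failed ...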
+ */ +static inline int __must_check xa_insert_irq(struct xarray *xa, + unsigned long index, void *entry, gfp_t gfp) +{ + int err; + + xa_lock_irq(xa); + err = __xa_insert(xa, index, entry, gfp); + xa_unlock_irq(xa); + + return err; +} + +/** + * xa_alloc() - Find somewhere to store this entry in the XArray. + * @xa: XArray. + * @id: Pointer to ID. + * @entry: New entry. + * @limit: Range of ID to allocate. + * @gfp: Memory allocation flags. + * + * Finds an empty entry in @xa between @limit.min and @limit.max, + * stores the index into the @id pointer, then stores the entry at + * that index. A concurrent lookup will not see an uninitialised @id. + * + * Context: Any context. Takes and releases the xa_lock. May sleep if + * the @gfp flags permit. + * Return: 0 on success, -ENOMEM if memory could not be allocated or + * -EBUSY if there are no free entries in @limit. + */ +static inline __must_check int xa_alloc(struct xarray *xa, u32 *id, + void *entry, struct xa_limit limit, gfp_t gfp) +{ + int err; + + xa_lock(xa); + err = __xa_alloc(xa, id, entry, limit, gfp); + xa_unlock(xa); + + return err; +} + +/** + * xa_alloc_bh() - Find somewhere to store this entry in the XArray. + * @xa: XArray. + * @id: Pointer to ID. + * @entry: New entry. + * @limit: Range of ID to allocate. + * @gfp: Memory allocation flags. + * + * Finds an empty entry in @xa between @limit.min and @limit.max, + * stores the index into the @id pointer, then stores the entry at + * that index. A concurrent lookup will not see an uninitialised @id. + * + * Context: Any context. Takes and releases the xa_lock while + * disabling softirqs. May sleep if the @gfp flags permit. + * Return: 0 on success, -ENOMEM if memory could not be allocated or + * -EBUSY if there are no free entries in @limit. + */ +static inline int __must_check xa_alloc_bh(struct xarray *xa, u32 *id, + void *entry, struct xa_limit limit, gfp_t gfp) +{ + int err; + + xa_lock_bh(xa); + err = __xa_alloc(xa, id, entry, limit, gfp); + xa_unlock_bh(xa); + + return err; +} + +/** + * xa_alloc_irq() - Find somewhere to store this entry in the XArray. + * @xa: XArray. + * @id: Pointer to ID. + * @entry: New entry. + * @limit: Range of ID to allocate. + * @gfp: Memory allocation flags. + * + * Finds an empty entry in @xa between @limit.min and @limit.max, + * stores the index into the @id pointer, then stores the entry at + * that index. A concurrent lookup will not see an uninitialised @id. + * + * Context: Process context. Takes and releases the xa_lock while + * disabling interrupts. May sleep if the @gfp flags permit. + * Return: 0 on success, -ENOMEM if memory could not be allocated or + * -EBUSY if there are no free entries in @limit. + */ +static inline int __must_check xa_alloc_irq(struct xarray *xa, u32 *id, + void *entry, struct xa_limit limit, gfp_t gfp) +{ + int err; + + xa_lock_irq(xa); + err = __xa_alloc(xa, id, entry, limit, gfp); + xa_unlock_irq(xa); + + return err; +} + +/** + * xa_alloc_cyclic() - Find somewhere to store this entry in the XArray. + * @xa: XArray. + * @id: Pointer to ID. + * @entry: New entry. + * @limit: Range of allocated ID. + * @next: Pointer to next ID to allocate. + * @gfp: Memory allocation flags. + * + * Finds an empty entry in @xa between @limit.min and @limit.max, + * stores the index into the @id pointer, then stores the entry at + * that index. A concurrent lookup will not see an uninitialised @id. + * The search for an empty entry will start at @next and will wrap + * around if necessary. + * + * Context: Any context. 
Takes and releases the xa_lock. May sleep if
+ * the @gfp flags permit.
+ * Return: 0 if the allocation succeeded without wrapping. 1 if the
+ * allocation succeeded after wrapping, -ENOMEM if memory could not be
+ * allocated or -EBUSY if there are no free entries in @limit.
+ */
+static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry,
+		struct xa_limit limit, u32 *next, gfp_t gfp)
+{
+	int err;
+
+	xa_lock(xa);
+	err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp);
+	xa_unlock(xa);
+
+	return err;
+}
+
+/**
+ * xa_alloc_cyclic_bh() - Find somewhere to store this entry in the XArray.
+ * @xa: XArray.
+ * @id: Pointer to ID.
+ * @entry: New entry.
+ * @limit: Range of allocated ID.
+ * @next: Pointer to next ID to allocate.
+ * @gfp: Memory allocation flags.
+ *
+ * Finds an empty entry in @xa between @limit.min and @limit.max,
+ * stores the index into the @id pointer, then stores the entry at
+ * that index. A concurrent lookup will not see an uninitialised @id.
+ * The search for an empty entry will start at @next and will wrap
+ * around if necessary.
+ *
+ * Context: Any context. Takes and releases the xa_lock while
+ * disabling softirqs. May sleep if the @gfp flags permit.
+ * Return: 0 if the allocation succeeded without wrapping. 1 if the
+ * allocation succeeded after wrapping, -ENOMEM if memory could not be
+ * allocated or -EBUSY if there are no free entries in @limit.
+ */
+static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry,
+		struct xa_limit limit, u32 *next, gfp_t gfp)
+{
+	int err;
+
+	xa_lock_bh(xa);
+	err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp);
+	xa_unlock_bh(xa);
+
+	return err;
+}
+
+/**
+ * xa_alloc_cyclic_irq() - Find somewhere to store this entry in the XArray.
+ * @xa: XArray.
+ * @id: Pointer to ID.
+ * @entry: New entry.
+ * @limit: Range of allocated ID.
+ * @next: Pointer to next ID to allocate.
+ * @gfp: Memory allocation flags.
+ *
+ * Finds an empty entry in @xa between @limit.min and @limit.max,
+ * stores the index into the @id pointer, then stores the entry at
+ * that index. A concurrent lookup will not see an uninitialised @id.
+ * The search for an empty entry will start at @next and will wrap
+ * around if necessary.
+ *
+ * Context: Process context. Takes and releases the xa_lock while
+ * disabling interrupts. May sleep if the @gfp flags permit.
+ * Return: 0 if the allocation succeeded without wrapping. 1 if the
+ * allocation succeeded after wrapping, -ENOMEM if memory could not be
+ * allocated or -EBUSY if there are no free entries in @limit.
+ */
+static inline int xa_alloc_cyclic_irq(struct xarray *xa, u32 *id, void *entry,
+		struct xa_limit limit, u32 *next, gfp_t gfp)
+{
+	int err;
+
+	xa_lock_irq(xa);
+	err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp);
+	xa_unlock_irq(xa);
+
+	return err;
+}
+
+/**
+ * xa_reserve() - Reserve this index in the XArray.
+ * @xa: XArray.
+ * @index: Index into array.
+ * @gfp: Memory allocation flags.
+ *
+ * Ensures there is somewhere to store an entry at @index in the array.
+ * If there is already something stored at @index, this function does
+ * nothing. If there was nothing there, the entry is marked as reserved.
+ * Loading from a reserved entry returns a %NULL pointer.
+ *
+ * If you do not use the entry that you have reserved, call xa_release()
+ * or xa_erase() to free any unnecessary memory.
+ *
+ * Context: Any context. Takes and releases the xa_lock.
+ * May sleep if the @gfp flags permit.
+ * Return: 0 if the reservation succeeded or -ENOMEM if it failed. + */ +static inline __must_check +int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp) +{ + return xa_err(xa_cmpxchg(xa, index, NULL, XA_ZERO_ENTRY, gfp)); +} + +/** + * xa_reserve_bh() - Reserve this index in the XArray. + * @xa: XArray. + * @index: Index into array. + * @gfp: Memory allocation flags. + * + * A softirq-disabling version of xa_reserve(). + * + * Context: Any context. Takes and releases the xa_lock while + * disabling softirqs. + * Return: 0 if the reservation succeeded or -ENOMEM if it failed. + */ +static inline __must_check +int xa_reserve_bh(struct xarray *xa, unsigned long index, gfp_t gfp) +{ + return xa_err(xa_cmpxchg_bh(xa, index, NULL, XA_ZERO_ENTRY, gfp)); +} + +/** + * xa_reserve_irq() - Reserve this index in the XArray. + * @xa: XArray. + * @index: Index into array. + * @gfp: Memory allocation flags. + * + * An interrupt-disabling version of xa_reserve(). + * + * Context: Process context. Takes and releases the xa_lock while + * disabling interrupts. + * Return: 0 if the reservation succeeded or -ENOMEM if it failed. + */ +static inline __must_check +int xa_reserve_irq(struct xarray *xa, unsigned long index, gfp_t gfp) +{ + return xa_err(xa_cmpxchg_irq(xa, index, NULL, XA_ZERO_ENTRY, gfp)); +} + +/** + * xa_release() - Release a reserved entry. + * @xa: XArray. + * @index: Index of entry. + * + * After calling xa_reserve(), you can call this function to release the + * reservation. If the entry at @index has been stored to, this function + * will do nothing. + */ +static inline void xa_release(struct xarray *xa, unsigned long index) +{ + xa_cmpxchg(xa, index, XA_ZERO_ENTRY, NULL, 0); +} + +/* Everything below here is the Advanced API. Proceed with caution. */ + +/* + * The xarray is constructed out of a set of 'chunks' of pointers. Choosing + * the best chunk size requires some tradeoffs. A power of two recommends + * itself so that we can walk the tree based purely on shifts and masks. + * Generally, the larger the better; as the number of slots per level of the + * tree increases, the less tall the tree needs to be. But that needs to be + * balanced against the memory consumption of each node. On a 64-bit system, + * xa_node is currently 576 bytes, and we get 7 of them per 4kB page. If we + * doubled the number of slots per node, we'd get only 3 nodes per 4kB page. + */ +#ifndef XA_CHUNK_SHIFT +#define XA_CHUNK_SHIFT (CONFIG_BASE_SMALL ? 4 : 6) +#endif +#define XA_CHUNK_SIZE (1UL << XA_CHUNK_SHIFT) +#define XA_CHUNK_MASK (XA_CHUNK_SIZE - 1) +#define XA_MAX_MARKS 3 +#define XA_MARK_LONGS DIV_ROUND_UP(XA_CHUNK_SIZE, BITS_PER_LONG) + +/* + * @count is the count of every non-NULL element in the ->slots array + * whether that is a value entry, a retry entry, a user pointer, + * a sibling entry or a pointer to the next level of the tree. + * @nr_values is the count of every element in ->slots which is + * either a value entry or a sibling of a value entry. 
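+ *
+ * For example, a node holding two user pointers and one value entry
+ * has count == 3 and nr_values == 1.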
+ */ +struct xa_node { + unsigned char shift; /* Bits remaining in each slot */ + unsigned char offset; /* Slot offset in parent */ + unsigned char count; /* Total entry count */ + unsigned char nr_values; /* Value entry count */ + struct xa_node __rcu *parent; /* NULL at top of tree */ + struct xarray *array; /* The array we belong to */ + union { + struct list_head private_list; /* For tree user */ + struct rcu_head rcu_head; /* Used when freeing node */ + }; + void __rcu *slots[XA_CHUNK_SIZE]; + union { + unsigned long tags[XA_MAX_MARKS][XA_MARK_LONGS]; + unsigned long marks[XA_MAX_MARKS][XA_MARK_LONGS]; + }; +}; + +void xa_dump(const struct xarray *); +void xa_dump_node(const struct xa_node *); + +#ifdef XA_DEBUG +#define XA_BUG_ON(xa, x) do { \ + if (x) { \ + xa_dump(xa); \ + BUG(); \ + } \ + } while (0) +#define XA_NODE_BUG_ON(node, x) do { \ + if (x) { \ + if (node) xa_dump_node(node); \ + BUG(); \ + } \ + } while (0) +#else +#define XA_BUG_ON(xa, x) do { } while (0) +#define XA_NODE_BUG_ON(node, x) do { } while (0) +#endif + +/* Private */ +static inline void *xa_head(const struct xarray *xa) +{ + return rcu_dereference_check(xa->xa_head, + lockdep_is_held(&xa->xa_lock)); +} + +/* Private */ +static inline void *xa_head_locked(const struct xarray *xa) +{ + return rcu_dereference_protected(xa->xa_head, + lockdep_is_held(&xa->xa_lock)); +} + +/* Private */ +static inline void *xa_entry(const struct xarray *xa, + const struct xa_node *node, unsigned int offset) +{ + XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE); + return rcu_dereference_check(node->slots[offset], + lockdep_is_held(&xa->xa_lock)); +} + +/* Private */ +static inline void *xa_entry_locked(const struct xarray *xa, + const struct xa_node *node, unsigned int offset) +{ + XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE); + return rcu_dereference_protected(node->slots[offset], + lockdep_is_held(&xa->xa_lock)); +} + +/* Private */ +static inline struct xa_node *xa_parent(const struct xarray *xa, + const struct xa_node *node) +{ + return rcu_dereference_check(node->parent, + lockdep_is_held(&xa->xa_lock)); +} + +/* Private */ +static inline struct xa_node *xa_parent_locked(const struct xarray *xa, + const struct xa_node *node) +{ + return rcu_dereference_protected(node->parent, + lockdep_is_held(&xa->xa_lock)); +} + +/* Private */ +static inline void *xa_mk_node(const struct xa_node *node) +{ + return (void *)((unsigned long)node | 2); +} + +/* Private */ +static inline struct xa_node *xa_to_node(const void *entry) +{ + return (struct xa_node *)((unsigned long)entry - 2); +} + +/* Private */ +static inline bool xa_is_node(const void *entry) +{ + return xa_is_internal(entry) && (unsigned long)entry > 4096; +} + +/* Private */ +static inline void *xa_mk_sibling(unsigned int offset) +{ + return xa_mk_internal(offset); +} + +/* Private */ +static inline unsigned long xa_to_sibling(const void *entry) +{ + return xa_to_internal(entry); +} + +/** + * xa_is_sibling() - Is the entry a sibling entry? + * @entry: Entry retrieved from the XArray + * + * Return: %true if the entry is a sibling entry. + */ +static inline bool xa_is_sibling(const void *entry) +{ + return IS_ENABLED(CONFIG_XARRAY_MULTI) && xa_is_internal(entry) && + (entry < xa_mk_sibling(XA_CHUNK_SIZE - 1)); +} + +#define XA_RETRY_ENTRY xa_mk_internal(256) + +/** + * xa_is_retry() - Is the entry a retry entry? + * @entry: Entry retrieved from the XArray + * + * Return: %true if the entry is a retry entry. 
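+ *
+ * Note that XA_RETRY_ENTRY encodes xa_mk_internal(256), the word
+ * (256 << 2) | 2 == 0x402. A lockless reader that observes it should
+ * restart the walk, which xas_retry() arranges by calling xas_reset().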
+ */ +static inline bool xa_is_retry(const void *entry) +{ + return unlikely(entry == XA_RETRY_ENTRY); +} + +/** + * xa_is_advanced() - Is the entry only permitted for the advanced API? + * @entry: Entry to be stored in the XArray. + * + * Return: %true if the entry cannot be stored by the normal API. + */ +static inline bool xa_is_advanced(const void *entry) +{ + return xa_is_internal(entry) && (entry <= XA_RETRY_ENTRY); +} + +/** + * typedef xa_update_node_t - A callback function from the XArray. + * @node: The node which is being processed + * + * This function is called every time the XArray updates the count of + * present and value entries in a node. It allows advanced users to + * maintain the private_list in the node. + * + * Context: The xa_lock is held and interrupts may be disabled. + * Implementations should not drop the xa_lock, nor re-enable + * interrupts. + */ +typedef void (*xa_update_node_t)(struct xa_node *node); + +/* + * The xa_state is opaque to its users. It contains various different pieces + * of state involved in the current operation on the XArray. It should be + * declared on the stack and passed between the various internal routines. + * The various elements in it should not be accessed directly, but only + * through the provided accessor functions. The below documentation is for + * the benefit of those working on the code, not for users of the XArray. + * + * @xa_node usually points to the xa_node containing the slot we're operating + * on (and @xa_offset is the offset in the slots array). If there is a + * single entry in the array at index 0, there are no allocated xa_nodes to + * point to, and so we store %NULL in @xa_node. @xa_node is set to + * the value %XAS_RESTART if the xa_state is not walked to the correct + * position in the tree of nodes for this operation. If an error occurs + * during an operation, it is set to an %XAS_ERROR value. If we run off the + * end of the allocated nodes, it is set to %XAS_BOUNDS. + */ +struct xa_state { + struct xarray *xa; + unsigned long xa_index; + unsigned char xa_shift; + unsigned char xa_sibs; + unsigned char xa_offset; + unsigned char xa_pad; /* Helps gcc generate better code */ + struct xa_node *xa_node; + struct xa_node *xa_alloc; + xa_update_node_t xa_update; +}; + +/* + * We encode errnos in the xas->xa_node. If an error has happened, we need to + * drop the lock to fix it, and once we've done so the xa_state is invalid. + */ +#define XA_ERROR(errno) ((struct xa_node *)(((unsigned long)errno << 2) | 2UL)) +#define XAS_BOUNDS ((struct xa_node *)1UL) +#define XAS_RESTART ((struct xa_node *)3UL) + +#define __XA_STATE(array, index, shift, sibs) { \ + .xa = array, \ + .xa_index = index, \ + .xa_shift = shift, \ + .xa_sibs = sibs, \ + .xa_offset = 0, \ + .xa_pad = 0, \ + .xa_node = XAS_RESTART, \ + .xa_alloc = NULL, \ + .xa_update = NULL \ +} + +/** + * XA_STATE() - Declare an XArray operation state. + * @name: Name of this operation state (usually xas). + * @array: Array to operate on. + * @index: Initial index of interest. + * + * Declare and initialise an xa_state on the stack. + */ +#define XA_STATE(name, array, index) \ + struct xa_state name = __XA_STATE(array, index, 0, 0) + +/** + * XA_STATE_ORDER() - Declare an XArray operation state. + * @name: Name of this operation state (usually xas). + * @array: Array to operate on. + * @index: Initial index of interest. + * @order: Order of entry. + * + * Declare and initialise an xa_state on the stack. 
This variant of
+ * XA_STATE() allows you to specify the 'order' of the element you
+ * want to operate on.
+ */
+#define XA_STATE_ORDER(name, array, index, order)		\
+	struct xa_state name = __XA_STATE(array,		\
+			(index >> order) << order,		\
+			order - (order % XA_CHUNK_SHIFT),	\
+			(1U << (order % XA_CHUNK_SHIFT)) - 1)
+
+#define xas_marked(xas, mark)	xa_marked((xas)->xa, (mark))
+#define xas_trylock(xas)	xa_trylock((xas)->xa)
+#define xas_lock(xas)		xa_lock((xas)->xa)
+#define xas_unlock(xas)		xa_unlock((xas)->xa)
+#define xas_lock_bh(xas)	xa_lock_bh((xas)->xa)
+#define xas_unlock_bh(xas)	xa_unlock_bh((xas)->xa)
+#define xas_lock_irq(xas)	xa_lock_irq((xas)->xa)
+#define xas_unlock_irq(xas)	xa_unlock_irq((xas)->xa)
+#define xas_lock_irqsave(xas, flags) \
+				xa_lock_irqsave((xas)->xa, flags)
+#define xas_unlock_irqrestore(xas, flags) \
+				xa_unlock_irqrestore((xas)->xa, flags)
+
+/**
+ * xas_error() - Return an errno stored in the xa_state.
+ * @xas: XArray operation state.
+ *
+ * Return: 0 if no error has been noted. A negative errno if one has.
+ */
+static inline int xas_error(const struct xa_state *xas)
+{
+	return xa_err(xas->xa_node);
+}
+
+/**
+ * xas_set_err() - Note an error in the xa_state.
+ * @xas: XArray operation state.
+ * @err: Negative error number.
+ *
+ * Only call this function with a negative @err; zero or positive errors
+ * will probably not behave the way you think they should. If you want
+ * to clear the error from an xa_state, use xas_reset().
+ */
+static inline void xas_set_err(struct xa_state *xas, long err)
+{
+	xas->xa_node = XA_ERROR(err);
+}
+
+/**
+ * xas_invalid() - Is the xas in a retry or error state?
+ * @xas: XArray operation state.
+ *
+ * Return: %true if the xas cannot be used for operations.
+ */
+static inline bool xas_invalid(const struct xa_state *xas)
+{
+	return (unsigned long)xas->xa_node & 3;
+}
+
+/**
+ * xas_valid() - Is the xas a valid cursor into the array?
+ * @xas: XArray operation state.
+ *
+ * Return: %true if the xas can be used for operations.
+ */
+static inline bool xas_valid(const struct xa_state *xas)
+{
+	return !xas_invalid(xas);
+}
+
+/**
+ * xas_is_node() - Does the xas point to a node?
+ * @xas: XArray operation state.
+ *
+ * Return: %true if the xas currently references a node.
+ */
+static inline bool xas_is_node(const struct xa_state *xas)
+{
+	return xas_valid(xas) && xas->xa_node;
+}
+
+/* True if the pointer is something other than a node */
+static inline bool xas_not_node(struct xa_node *node)
+{
+	return ((unsigned long)node & 3) || !node;
+}
+
+/* True if the node represents RESTART or an error */
+static inline bool xas_frozen(struct xa_node *node)
+{
+	return (unsigned long)node & 2;
+}
+
+/* True if the node represents head-of-tree, RESTART or BOUNDS */
+static inline bool xas_top(struct xa_node *node)
+{
+	return node <= XAS_RESTART;
+}
+
+/**
+ * xas_reset() - Reset an XArray operation state.
+ * @xas: XArray operation state.
+ *
+ * Resets the error or walk state of the @xas so future walks of the
+ * array will start from the root. Use this if you have dropped the
+ * xarray lock and want to reuse the xa_state.
+ *
+ * Context: Any context.
+ */
+static inline void xas_reset(struct xa_state *xas)
+{
+	xas->xa_node = XAS_RESTART;
+}
+
+/**
+ * xas_retry() - Retry the operation if appropriate.
+ * @xas: XArray operation state.
+ * @entry: Entry from xarray.
+ *
+ * The advanced functions may sometimes return an internal entry, such as
+ * a retry entry or a zero entry.
This function sets up the @xas to restart
+ * the walk from the head of the array if needed.
+ *
+ * Context: Any context.
+ * Return: true if the operation needs to be retried.
+ */
+static inline bool xas_retry(struct xa_state *xas, const void *entry)
+{
+	if (xa_is_zero(entry))
+		return true;
+	if (!xa_is_retry(entry))
+		return false;
+	xas_reset(xas);
+	return true;
+}
+
+void *xas_load(struct xa_state *);
+void *xas_store(struct xa_state *, void *entry);
+void *xas_find(struct xa_state *, unsigned long max);
+void *xas_find_conflict(struct xa_state *);
+
+bool xas_get_mark(const struct xa_state *, xa_mark_t);
+void xas_set_mark(const struct xa_state *, xa_mark_t);
+void xas_clear_mark(const struct xa_state *, xa_mark_t);
+void *xas_find_marked(struct xa_state *, unsigned long max, xa_mark_t);
+void xas_init_marks(const struct xa_state *);
+
+bool xas_nomem(struct xa_state *, gfp_t);
+void xas_pause(struct xa_state *);
+
+void xas_create_range(struct xa_state *);
+
+/**
+ * xas_reload() - Refetch an entry from the xarray.
+ * @xas: XArray operation state.
+ *
+ * Use this function to check that a previously loaded entry still has
+ * the same value. This is useful for the lockless pagecache lookup where
+ * we walk the array with only the RCU lock to protect us, lock the page,
+ * then check that the page hasn't moved since we looked it up.
+ *
+ * The caller guarantees that @xas is still valid. If it may be in an
+ * error or restart state, call xas_load() instead.
+ *
+ * Return: The entry at this location in the xarray.
+ */
+static inline void *xas_reload(struct xa_state *xas)
+{
+	struct xa_node *node = xas->xa_node;
+
+	if (node)
+		return xa_entry(xas->xa, node, xas->xa_offset);
+	return xa_head(xas->xa);
+}
+
+/**
+ * xas_set() - Set up XArray operation state for a different index.
+ * @xas: XArray operation state.
+ * @index: New index into the XArray.
+ *
+ * Move the operation state to refer to a different index. This will
+ * have the effect of starting a walk from the top; see xas_next()
+ * to move to an adjacent index.
+ */
+static inline void xas_set(struct xa_state *xas, unsigned long index)
+{
+	xas->xa_index = index;
+	xas->xa_node = XAS_RESTART;
+}
+
+/**
+ * xas_set_order() - Set up XArray operation state for a multislot entry.
+ * @xas: XArray operation state.
+ * @index: Target of the operation.
+ * @order: Entry occupies 2^@order indices.
+ */
+static inline void xas_set_order(struct xa_state *xas, unsigned long index,
+					unsigned int order)
+{
+#ifdef CONFIG_XARRAY_MULTI
+	xas->xa_index = order < BITS_PER_LONG ? (index >> order) << order : 0;
+	xas->xa_shift = order - (order % XA_CHUNK_SHIFT);
+	xas->xa_sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1;
+	xas->xa_node = XAS_RESTART;
+#else
+	BUG_ON(order > 0);
+	xas_set(xas, index);
+#endif
+}
+
+/**
+ * xas_set_update() - Set up XArray operation state for a callback.
+ * @xas: XArray operation state.
+ * @update: Function to call when updating a node.
+ *
+ * The XArray can notify a caller after it has updated an xa_node.
+ * This is advanced functionality and is only needed by the page cache.
+ */
+static inline void xas_set_update(struct xa_state *xas, xa_update_node_t update)
+{
+	xas->xa_update = update;
+}
+
+/**
+ * xas_next_entry() - Advance iterator to next present entry.
+ * @xas: XArray operation state.
+ * @max: Highest index to return.
+ *
+ * xas_next_entry() is an inline function to optimise xarray traversal for
+ * speed.
It is equivalent to calling xas_find(), and will call xas_find()
+ * for all the hard cases.
+ *
+ * Return: The next present entry after the one currently referred to by @xas.
+ */
+static inline void *xas_next_entry(struct xa_state *xas, unsigned long max)
+{
+	struct xa_node *node = xas->xa_node;
+	void *entry;
+
+	if (unlikely(xas_not_node(node) || node->shift ||
+			xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK)))
+		return xas_find(xas, max);
+
+	do {
+		if (unlikely(xas->xa_index >= max))
+			return xas_find(xas, max);
+		if (unlikely(xas->xa_offset == XA_CHUNK_MASK))
+			return xas_find(xas, max);
+		entry = xa_entry(xas->xa, node, xas->xa_offset + 1);
+		if (unlikely(xa_is_internal(entry)))
+			return xas_find(xas, max);
+		xas->xa_offset++;
+		xas->xa_index++;
+	} while (!entry);
+
+	return entry;
+}
+
+/* Private */
+static inline unsigned int xas_find_chunk(struct xa_state *xas, bool advance,
+		xa_mark_t mark)
+{
+	unsigned long *addr = xas->xa_node->marks[(__force unsigned)mark];
+	unsigned int offset = xas->xa_offset;
+
+	if (advance)
+		offset++;
+	if (XA_CHUNK_SIZE == BITS_PER_LONG) {
+		if (offset < XA_CHUNK_SIZE) {
+			unsigned long data = *addr & (~0UL << offset);
+			if (data)
+				return __ffs(data);
+		}
+		return XA_CHUNK_SIZE;
+	}
+
+	return find_next_bit(addr, XA_CHUNK_SIZE, offset);
+}
+
+/**
+ * xas_next_marked() - Advance iterator to next marked entry.
+ * @xas: XArray operation state.
+ * @max: Highest index to return.
+ * @mark: Mark to search for.
+ *
+ * xas_next_marked() is an inline function to optimise xarray traversal for
+ * speed. It is equivalent to calling xas_find_marked(), and will call
+ * xas_find_marked() for all the hard cases.
+ *
+ * Return: The next marked entry after the one currently referred to by @xas.
+ */
+static inline void *xas_next_marked(struct xa_state *xas, unsigned long max,
+								xa_mark_t mark)
+{
+	struct xa_node *node = xas->xa_node;
+	unsigned int offset;
+
+	if (unlikely(xas_not_node(node) || node->shift))
+		return xas_find_marked(xas, max, mark);
+	offset = xas_find_chunk(xas, true, mark);
+	xas->xa_offset = offset;
+	xas->xa_index = (xas->xa_index & ~XA_CHUNK_MASK) + offset;
+	if (xas->xa_index > max)
+		return NULL;
+	if (offset == XA_CHUNK_SIZE)
+		return xas_find_marked(xas, max, mark);
+	return xa_entry(xas->xa, node, offset);
+}
+
+/*
+ * If iterating while holding a lock, drop the lock and reschedule
+ * every %XA_CHECK_SCHED loops.
+ */
+enum {
+	XA_CHECK_SCHED = 4096,
+};
+
+/**
+ * xas_for_each() - Iterate over a range of an XArray.
+ * @xas: XArray operation state.
+ * @entry: Entry retrieved from the array.
+ * @max: Maximum index to retrieve from array.
+ *
+ * The loop body will be executed for each entry present in the xarray
+ * between the current xas position and @max. @entry will be set to
+ * the entry retrieved from the xarray. It is safe to delete entries
+ * from the array in the loop body. You should hold either the RCU lock
+ * or the xa_lock while iterating. If you need to drop the lock, call
+ * xas_pause() first.
+ */
+#define xas_for_each(xas, entry, max) \
+	for (entry = xas_find(xas, max); entry; \
+	     entry = xas_next_entry(xas, max))
+
+/**
+ * xas_for_each_marked() - Iterate over a range of an XArray.
+ * @xas: XArray operation state.
+ * @entry: Entry retrieved from the array.
+ * @max: Maximum index to retrieve from array.
+ * @mark: Mark to search for.
+ *
+ * The loop body will be executed for each marked entry in the xarray
+ * between the current xas position and @max.
@entry will be set to
+ * the entry retrieved from the xarray. It is safe to delete entries
+ * from the array in the loop body. You should hold either the RCU lock
+ * or the xa_lock while iterating. If you need to drop the lock, call
+ * xas_pause() first.
+ */
+#define xas_for_each_marked(xas, entry, max, mark) \
+	for (entry = xas_find_marked(xas, max, mark); entry; \
+	     entry = xas_next_marked(xas, max, mark))
+
+/**
+ * xas_for_each_conflict() - Iterate over a range of an XArray.
+ * @xas: XArray operation state.
+ * @entry: Entry retrieved from the array.
+ *
+ * The loop body will be executed for each entry in the XArray that lies
+ * within the range specified by @xas. If the loop completes successfully,
+ * any entries that lie in this range will be replaced by @entry. The caller
+ * may break out of the loop; if they do so, the contents of the XArray will
+ * be unchanged. The operation may fail due to an out of memory condition.
+ * The caller may also call xas_set_err() to exit the loop while setting an
+ * error to record the reason.
+ */
+#define xas_for_each_conflict(xas, entry) \
+	while ((entry = xas_find_conflict(xas)))
+
+void *__xas_next(struct xa_state *);
+void *__xas_prev(struct xa_state *);
+
+/**
+ * xas_prev() - Move iterator to previous index.
+ * @xas: XArray operation state.
+ *
+ * If the @xas was in an error state, it will remain in an error state
+ * and this function will return %NULL. If the @xas has never been walked,
+ * it will have the effect of calling xas_load(). Otherwise one will be
+ * subtracted from the index and the state will be walked to the correct
+ * location in the array for the next operation.
+ *
+ * If the iterator was referencing index 0, this function wraps
+ * around to %ULONG_MAX.
+ *
+ * Return: The entry at the new index. This may be %NULL or an internal
+ * entry.
+ */
+static inline void *xas_prev(struct xa_state *xas)
+{
+	struct xa_node *node = xas->xa_node;
+
+	if (unlikely(xas_not_node(node) || node->shift ||
+				xas->xa_offset == 0))
+		return __xas_prev(xas);
+
+	xas->xa_index--;
+	xas->xa_offset--;
+	return xa_entry(xas->xa, node, xas->xa_offset);
+}
+
+/**
+ * xas_next() - Move state to next index.
+ * @xas: XArray operation state.
+ *
+ * If the @xas was in an error state, it will remain in an error state
+ * and this function will return %NULL. If the @xas has never been walked,
+ * it will have the effect of calling xas_load(). Otherwise one will be
+ * added to the index and the state will be walked to the correct
+ * location in the array for the next operation.
+ *
+ * If the iterator was referencing index %ULONG_MAX, this function wraps
+ * around to 0.
+ *
+ * Return: The entry at the new index. This may be %NULL or an internal
+ * entry.
+ */
+static inline void *xas_next(struct xa_state *xas)
+{
+	struct xa_node *node = xas->xa_node;
+
+	if (unlikely(xas_not_node(node) || node->shift ||
+				xas->xa_offset == XA_CHUNK_MASK))
+		return __xas_next(xas);
+
+	xas->xa_index++;
+	xas->xa_offset++;
+	return xa_entry(xas->xa, node, xas->xa_offset);
+}
+#endif /* !HAVE_XARRAY_SUPPORT */
+
+#endif /* _LINUX_XARRAY_H */
diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/hash.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/hash.h
new file mode 100644
index 0000000000000..45818dddedd94
--- /dev/null
+++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/hash.h
@@ -0,0 +1,103 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ + +#ifndef _LINUX_HASH_H +#define _LINUX_HASH_H +/* Fast hashing routine for ints, longs and pointers. + (C) 2002 Nadia Yvette Chambers, IBM */ + +/* + * Knuth recommends primes in approximately golden ratio to the maximum + * integer representable by a machine word for multiplicative hashing. + * Chuck Lever verified the effectiveness of this technique: + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf + * + * These primes are chosen to be bit-sparse, that is operations on + * them can use shifts and additions instead of multiplications for + * machines where multiplications are slow. + */ + +#include + +/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ +#define GOLDEN_RATIO_PRIME_32 0x9e370001UL +/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */ +#define GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001UL + +#if __BITS_PER_LONG == 32 +#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_PRIME_32 +#define hash_long(val, bits) hash_32(val, bits) +#elif __BITS_PER_LONG == 64 +#define hash_long(val, bits) hash_64(val, bits) +#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_PRIME_64 +#else +#error Wordsize not 32 or 64 +#endif + +static __always_inline __u64 hash_64(__u64 val, unsigned int bits) +{ + __u64 hash = val; + + /* Sigh, gcc can't optimise this alone like it does for 32 bits. */ + __u64 n = hash; + n <<= 18; + hash -= n; + n <<= 33; + hash -= n; + n <<= 3; + hash += n; + n <<= 3; + hash -= n; + n <<= 4; + hash += n; + n <<= 2; + hash += n; + + /* High bits are more random, so use them. */ + return hash >> (64 - bits); +} + +static inline __u32 hash_32(__u32 val, unsigned int bits) +{ + /* On some cpus multiply is faster, on others gcc will do shifts */ + __u32 hash = val * GOLDEN_RATIO_PRIME_32; + + /* High bits are more random, so use them. */ + return hash >> (32 - bits); +} + +static inline unsigned long hash_ptr(const void *ptr, unsigned int bits) +{ + return hash_long((unsigned long)ptr, bits); +} + +static inline __u32 hash32_ptr(const void *ptr) +{ + unsigned long val = (unsigned long)ptr; + +#if __BITS_PER_LONG == 64 + val ^= (val >> 32); +#endif + return (__u32)val; +} + +#endif /* _LINUX_HASH_H */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/ioctl.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/ioctl.h new file mode 100644 index 0000000000000..a59e2c97ba2ff --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/ioctl.h @@ -0,0 +1,67 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * libcfs/include/libcfs/util/ioctl.h + * + * Utility functions for calling ioctls. + * + */ +#ifndef _LIBCFS_IOCTL_H_ +#define _LIBCFS_IOCTL_H_ + +#include +#include + +/* Sparse annotation. */ +#define __user + +#include + +#define LIBCFS_IOC_INIT(data) \ +do { \ + memset(&(data), 0, sizeof(data)); \ + (data).ioc_hdr.ioc_version = LIBCFS_IOCTL_VERSION; \ + (data).ioc_hdr.ioc_len = sizeof(data); \ +} while (0) + +#define LIBCFS_IOC_INIT_V2(data, hdr) \ +do { \ + memset(&(data), 0, sizeof(data)); \ + (data).hdr.ioc_version = LIBCFS_IOCTL_VERSION2; \ + (data).hdr.ioc_len = sizeof(data); \ +} while (0) + +/* FIXME - rename these to libcfs_ */ +int libcfs_ioctl_pack(struct libcfs_ioctl_data *data, char **pbuf, int max); +void libcfs_ioctl_unpack(struct libcfs_ioctl_data *data, char *pbuf); +int register_ioc_dev(int dev_id, const char *dev_name); +void unregister_ioc_dev(int dev_id); +int l_ioctl(int dev_id, unsigned int opc, void *buf); +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/list.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/list.h new file mode 100644 index 0000000000000..ef69efed6cf1e --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/list.h @@ -0,0 +1,499 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ + +#ifndef __LIBCFS_UTIL_LIST_H__ +#define __LIBCFS_UTIL_LIST_H__ + +/* + * Simple doubly linked list implementation. + * + * Some of the internal functions ("__xxx") are useful when + * manipulating whole lists rather than single entries, as + * sometimes we already know the next/prev entries and we can + * generate better code by using them directly rather than + * using the generic single-entry routines. 
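+ *
+ * A minimal usage sketch (struct foo, foo_head and consume() are
+ * illustrative, not part of this header):
+ *
+ *	struct foo { int fo_val; struct list_head fo_link; };
+ *	struct list_head foo_head = LIST_HEAD_INIT(foo_head);
+ *	struct foo *pos;
+ *
+ *	list_add_tail(&new_foo->fo_link, &foo_head);
+ *	list_for_each_entry(pos, &foo_head, fo_link)
+ *		consume(pos->fo_val);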
+ */ + +#define prefetch(a) ((void)a) + +struct list_head { + struct list_head *next, *prev; +}; + +#define LIST_HEAD_INIT(name) { &(name), &(name) } + +#define INIT_LIST_HEAD(ptr) do { \ + (ptr)->next = (ptr); (ptr)->prev = (ptr); \ +} while (0) + +/** + * Insert a new entry between two known consecutive entries. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_add(struct list_head * new, + struct list_head * prev, + struct list_head * next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +/** + * Insert an entry at the start of a list. + * \param new new entry to be inserted + * \param head list to add it to + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ +static inline void list_add(struct list_head *new, + struct list_head *head) +{ + __list_add(new, head, head->next); +} + +/** + * Insert an entry at the end of a list. + * \param new new entry to be inserted + * \param head list to add it to + * + * Insert a new entry before the specified head. + * This is useful for implementing queues. + */ +static inline void list_add_tail(struct list_head *new, + struct list_head *head) +{ + __list_add(new, head->prev, head); +} + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_del(struct list_head *prev, + struct list_head *next) +{ + next->prev = prev; + prev->next = next; +} + +/** + * Remove an entry from the list it is currently in. + * \param entry the entry to remove + * Note: list_empty(entry) does not return true after this, the entry is in an + * undefined state. + */ +static inline void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); +} + +/** + * Remove an entry from the list it is currently in and reinitialize it. + * \param entry the entry to remove. + */ +static inline void list_del_init(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + INIT_LIST_HEAD(entry); +} + +/** + * Remove an entry from the list it is currently in and insert it at the start + * of another list. + * \param list the entry to move + * \param head the list to move it to + */ +static inline void list_move(struct list_head *list, + struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add(list, head); +} + +/** + * Remove an entry from the list it is currently in and insert it at the end of + * another list. + * \param list the entry to move + * \param head the list to move it to + */ +static inline void list_move_tail(struct list_head *list, + struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add_tail(list, head); +} + +/** + * Test whether a list is empty + * \param head the list to test. + */ +static inline int list_empty(struct list_head *head) +{ + return head->next == head; +} + +/** + * Test whether a list is empty and not being modified + * \param head the list to test + * + * Tests whether a list is empty _and_ checks that no other CPU might be + * in the process of modifying either member (next or prev) + * + * NOTE: using list_empty_careful() without synchronization + * can only be safe if the only activity that can happen + * to the list entry is list_del_init(). Eg. it cannot be used + * if another CPU could re-list_add() it. 
+ */ +static inline int list_empty_careful(const struct list_head *head) +{ + struct list_head *next = head->next; + return (next == head) && (next == head->prev); +} + +static inline void __list_splice(struct list_head *list, + struct list_head *head) +{ + struct list_head *first = list->next; + struct list_head *last = list->prev; + struct list_head *at = head->next; + + first->prev = head; + head->next = first; + + last->next = at; + at->prev = last; +} + +/** + * Join two lists + * \param list the new list to add. + * \param head the place to add it in the first list. + * + * The contents of \a list are added at the start of \a head. \a list is in an + * undefined state on return. + */ +static inline void list_splice(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head); +} + +static inline void list_splice_tail(struct list_head *list, struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head->prev); +} + +/** + * Join two lists and reinitialise the emptied list. + * \param list the new list to add. + * \param head the place to add it in the first list. + * + * The contents of \a list are added at the start of \a head. \a list is empty + * on return. + */ +static inline void list_splice_init(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) { + __list_splice(list, head); + INIT_LIST_HEAD(list); + } +} + +/** + * Get the container of a list + * \param ptr the embedded list. + * \param type the type of the struct this is embedded in. + * \param member the member name of the list within the struct. + */ +#define list_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(char *)(&((type *)0)->member))) + +/** + * Iterate over a list + * \param pos the iterator + * \param head the list to iterate over + * + * Behaviour is undefined if \a pos is removed from the list in the body of the + * loop. + */ +#define list_for_each(pos, head) \ + for (pos = (head)->next, prefetch(pos->next); pos != (head); \ + pos = pos->next, prefetch(pos->next)) + +/** + * Iterate over a list safely + * \param pos the iterator + * \param n temporary storage + * \param head the list to iterate over + * + * This is safe to use if \a pos could be removed from the list in the body of + * the loop. + */ +#define list_for_each_safe(pos, n, head) \ + for (pos = (head)->next, n = pos->next; pos != (head); \ + pos = n, n = pos->next) + +/** + * Iterate over a list continuing after existing point + * \param pos the type * to use as a loop counter + * \param head the list head + * \param member the name of the list_struct within the struct + */ +#define list_for_each_entry_continue(pos, head, member) \ + for (pos = list_entry(pos->member.next, typeof(*pos), member); \ + prefetch(pos->member.next), &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) + +/** + * \defgroup hlist Hash List + * Double linked lists with a single pointer list head. + * Mostly useful for hash tables where the two pointer list head is too + * wasteful. You lose the ability to access the tail in O(1). 
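+ *
+ * Sketch of typical use (the bucket array, struct obj and the key match
+ * are illustrative):
+ *
+ *	struct hlist_head buckets[64];
+ *	struct obj *tpos;
+ *	struct hlist_node *pos;
+ *
+ *	hlist_add_head(&obj->o_hash, &buckets[hash]);
+ *	hlist_for_each_entry(tpos, pos, &buckets[hash], o_hash)
+ *		if (tpos->o_key == key)
+ *			break;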
+ * @{ + */ + +struct hlist_node { + struct hlist_node *next, **pprev; +}; + +struct hlist_head { + struct hlist_node *first; +}; + +/* @} */ + +/* + * "NULL" might not be defined at this point + */ +#ifdef NULL +#define NULL_P NULL +#else +#define NULL_P ((void *)0) +#endif + +/** + * \addtogroup hlist + * @{ + */ + +#define HLIST_HEAD_INIT { NULL_P } +#define HLIST_HEAD(name) struct hlist_head name = { NULL_P } +#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL_P) +#define INIT_HLIST_NODE(ptr) ((ptr)->next = NULL_P, (ptr)->pprev = NULL_P) + +static inline int hlist_unhashed(const struct hlist_node *h) +{ + return !h->pprev; +} + +static inline int hlist_empty(const struct hlist_head *h) +{ + return !h->first; +} + +static inline void __hlist_del(struct hlist_node *n) +{ + struct hlist_node *next = n->next; + struct hlist_node **pprev = n->pprev; + *pprev = next; + if (next) + next->pprev = pprev; +} + +static inline void hlist_del(struct hlist_node *n) +{ + __hlist_del(n); +} + +static inline void hlist_del_init(struct hlist_node *n) +{ + if (n->pprev) { + __hlist_del(n); + INIT_HLIST_NODE(n); + } +} + +static inline void hlist_add_head(struct hlist_node *n, + struct hlist_head *h) +{ + struct hlist_node *first = h->first; + n->next = first; + if (first) + first->pprev = &n->next; + h->first = n; + n->pprev = &h->first; +} + +/* next must be != NULL */ +static inline void hlist_add_before(struct hlist_node *n, + struct hlist_node *next) +{ + n->pprev = next->pprev; + n->next = next; + next->pprev = &n->next; + *(n->pprev) = n; +} + +static inline void hlist_add_after(struct hlist_node *n, + struct hlist_node *next) +{ + next->next = n->next; + n->next = next; + next->pprev = &n->next; + + if(next->next) + next->next->pprev = &next->next; +} + +#define hlist_entry(ptr, type, member) container_of(ptr,type,member) + +#define hlist_for_each(pos, head) \ + for (pos = (head)->first; pos && (prefetch(pos->next), 1); \ + pos = pos->next) + +#define hlist_for_each_safe(pos, n, head) \ + for (pos = (head)->first; pos && (n = pos->next, 1); \ + pos = n) + +/** + * Iterate over an hlist of given type + * \param tpos the type * to use as a loop counter. + * \param pos the &struct hlist_node to use as a loop counter. + * \param head the head for your list. + * \param member the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry(tpos, pos, head, member) \ + for (pos = (head)->first; \ + pos && ({ prefetch(pos->next); 1;}) && \ + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ + pos = pos->next) + +/** + * Iterate over an hlist continuing after existing point + * \param tpos the type * to use as a loop counter. + * \param pos the &struct hlist_node to use as a loop counter. + * \param member the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry_continue(tpos, pos, member) \ + for (pos = (pos)->next; \ + pos && ({ prefetch(pos->next); 1;}) && \ + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ + pos = pos->next) + +/** + * Iterate over an hlist continuing from an existing point + * \param tpos the type * to use as a loop counter. + * \param pos the &struct hlist_node to use as a loop counter. + * \param member the name of the hlist_node within the struct. 
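+ *
+ * For example (sketch, reusing the illustrative names above), resuming a
+ * scan from a saved @pos:
+ *	hlist_for_each_entry_from(tpos, pos, o_hash)
+ *		if (tpos->o_key == key)
+ *			break;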
+ */ +#define hlist_for_each_entry_from(tpos, pos, member) \ + for (; pos && ({ prefetch(pos->next); 1;}) && \ + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ + pos = pos->next) + +/** + * Iterate over an hlist of given type safe against removal of list entry + * \param tpos the type * to use as a loop counter. + * \param pos the &struct hlist_node to use as a loop counter. + * \param n another &struct hlist_node to use as temporary storage + * \param head the head for your list. + * \param member the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry_safe(tpos, pos, n, head, member) \ + for (pos = (head)->first; \ + pos && ({ n = pos->next; 1; }) && \ + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ + pos = n) + +/* @} */ + +/** + * Iterate over a list in reverse order + * \param pos the &struct list_head to use as a loop counter. + * \param head the head for your list. + */ +#define list_for_each_prev(pos, head) \ + for (pos = (head)->prev, prefetch(pos->prev); pos != (head); \ + pos = pos->prev, prefetch(pos->prev)) + +/** + * Iterate over a list of given type + * \param pos the type * to use as a loop counter. + * \param head the head for your list. + * \param member the name of the list_struct within the struct. + */ +#define list_for_each_entry(pos, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member), \ + prefetch(pos->member.next); \ + &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member), \ + prefetch(pos->member.next)) + +/** + * Iterate backwards over a list of given type. + * \param pos the type * to use as a loop counter. + * \param head the head for your list. + * \param member the name of the list_struct within the struct. + */ +#define list_for_each_entry_reverse(pos, head, member) \ + for (pos = list_entry((head)->prev, typeof(*pos), member); \ + prefetch(pos->member.prev), &pos->member != (head); \ + pos = list_entry(pos->member.prev, typeof(*pos), member)) + +/** + * Iterate over a list of given type safe against removal of list entry + * \param pos the type * to use as a loop counter. + * \param n another type * to use as temporary storage + * \param head the head for your list. + * \param member the name of the list_struct within the struct. + */ +#define list_for_each_entry_safe(pos, n, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member), \ + n = list_entry(pos->member.next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.next, typeof(*n), member)) + +/** + * Iterate backwards over a list of given type safely against removal of entry + * \param pos the type * to use as a loop counter. + * \param n another type * to use as temporary storage + * \param head the head for your list. + * \param member the name of the list_struct within the struct. 
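+ *
+ * For example (sketch, with illustrative names), tearing a list down back
+ * to front:
+ *	list_for_each_entry_safe_reverse(pos, n, &foo_head, fo_link) {
+ *		list_del(&pos->fo_link);
+ *		free(pos);
+ *	}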
+ */ +#define list_for_each_entry_safe_reverse(pos, n, head, member) \ + for (pos = list_entry((head)->prev, typeof(*pos), member), \ + n = list_entry(pos->member.prev, typeof(*pos), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.prev, typeof(*n), member)) + +#endif /* __LIBCFS_UTIL_LIST_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/param.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/param.h new file mode 100644 index 0000000000000..2fd1e36b07354 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/param.h @@ -0,0 +1,40 @@ +/* + * LGPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of the + * License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see . + * + * LGPL HEADER END + * + * Copyright (c) 2015, James Simmons + * + * Author: + * James Simmons + */ +#ifndef _LIBCFS_UTIL_PARAM_H_ +#define _LIBCFS_UTIL_PARAM_H_ + +#include +#include + +static inline void cfs_free_param_data(glob_t *paths) +{ + globfree(paths); +} + +int cfs_get_param_paths(glob_t *paths, const char *pattern, ...) + __attribute__((__format__(__printf__, 2, 3))); + +#endif /* _LIBCFS_UTIL_PARAM_H_ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/parser.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/parser.h new file mode 100644 index 0000000000000..7827718f55a48 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/parser.h @@ -0,0 +1,114 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * libcfs/include/libcfs/util/parser.h + * + * A command line parser. 
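+ *
+ * Typical use is a sketch like the following (the command table and its
+ * handlers are the caller's own):
+ *
+ *	command_t cmds[] = {
+ *		{ "quit", Parser_quit, NULL, "quit this shell" },
+ *		{ NULL, NULL, NULL, NULL }
+ *	};
+ *
+ *	Parser_init("prompt > ", cmds);
+ *	Parser_commands();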
+ *
+ */

+#ifndef _PARSER_H_
+#define _PARSER_H_
+
+#define HISTORY 100		/* Don't let history grow unbounded */
+#define MAXARGS 512
+
+#define CMD_COMPLETE	0
+#define CMD_INCOMPLETE	1
+#define CMD_NONE	2
+#define CMD_AMBIG	3
+#define CMD_HELP	4
+
+typedef struct parser_cmd {
+	char *pc_name;
+	int (* pc_func)(int, char **);
+	struct parser_cmd * pc_sub_cmd;
+	char *pc_help;
+} command_t;
+
+typedef struct argcmd {
+	char *ac_name;
+	int (*ac_func)(int, char **);
+	char *ac_help;
+} argcmd_t;
+
+typedef struct network {
+	char *type;
+	char *server;
+	int port;
+} network_t;
+
+int Parser_quit(int argc, char **argv);
+int Parser_version(int argc, char **argv);
+void Parser_init(char *, command_t *);	/* Set prompt and load command list */
+int Parser_commands(void);		/* Start the command parser */
+void Parser_qhelp(int, char **);	/* Quick help routine */
+int Parser_help(int, char **);		/* Detailed help routine */
+void Parser_ignore_errors(int ignore);	/* Set the ignore errors flag */
+void Parser_printhelp(char *);		/* Detailed help routine */
+void Parser_exit(int, char **);		/* Shuts down command parser */
+int Parser_execarg(int argc, char **argv, command_t cmds[]);
+int execute_line(char * line);
+int Parser_list_commands(const command_t *cmdlist, char *buffer,
+			 size_t buf_size, const char *parent_cmd,
+			 int col_start, int col_num);
+
+/* Converts a string to an integer */
+int Parser_int(char *, int *);
+
+/* Prompts for a string, with default values and a maximum length */
+char *Parser_getstr(const char *prompt, const char *deft, char *res,
+		    size_t len);
+
+/* Prompts for an integer, with minimum, maximum and default values and base */
+int Parser_getint(const char *prompt, long min, long max, long deft,
+		  int base);
+
+/* Prompts for a yes/no, with default */
+int Parser_getbool(const char *prompt, int deft);
+
+/* Extracts an integer from a string, or prompts if it cannot get one */
+long Parser_intarg(const char *inp, const char *prompt, int deft,
+		   int min, int max, int base);
+
+/* Extracts a word from the input, or prompts if it cannot get one */
+char *Parser_strarg(char *inp, const char *prompt, const char *deft,
+		    char *answer, int len);
+
+/* Extracts an integer from a string with a base */
+int Parser_arg2int(const char *inp, long *result, int base);
+
+/* Convert a human-readable size string to an int; "1k" -> 1000 */
+int Parser_size(unsigned long *sizep, char *str);
+
+/* Convert a string boolean to an int; "enable" -> 1 */
+int Parser_bool(int *b, char *str);
+
+#endif
diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/string.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/string.h
new file mode 100644
index 0000000000000..97d9adb6984d3
--- /dev/null
+++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/string.h
@@ -0,0 +1,143 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * libcfs/include/libcfs/libcfs_string.h
+ *
+ * Generic string manipulation functions.
+ *
+ * Author: Nathan Rutman
+ */
+
+#ifndef __LIBCFS_UTIL_STRING_H__
+#define __LIBCFS_UTIL_STRING_H__
+
+#include
+#include
+
+#include
+#include
+#include
+
+static inline
+int vscnprintf(char *buf, size_t bufsz, const char *format, va_list args)
+{
+	int ret;
+
+	if (!bufsz)
+		return 0;
+
+	ret = vsnprintf(buf, bufsz, format, args);
+	return (bufsz > ret) ? ret : bufsz - 1;
+}
+
+/* __printf from linux kernel */
+#ifndef __printf
+#define __printf(a, b) __attribute__((__format__(printf, a, b)))
+#endif
+
+__printf(3, 4)
+static inline int scnprintf(char *buf, size_t bufsz, const char *format, ...)
+{
+	int ret;
+	va_list args;
+
+	va_start(args, format);
+	ret = vscnprintf(buf, bufsz, format, args);
+	va_end(args);
+
+	return ret;
+}
+
+struct netstrfns {
+	__u32	nf_type;
+	char	*nf_name;
+	char	*nf_modname;
+	void	(*nf_addr2str)(__u32 addr, char *str, size_t size);
+	int	(*nf_str2addr)(const char *str, int nob, __u32 *addr);
+	int	(*nf_parse_addrlist)(char *str, int len,
+				     struct list_head *list);
+	int	(*nf_print_addrlist)(char *buffer, int count,
+				     struct list_head *list);
+	int	(*nf_match_addr)(__u32 addr, struct list_head *list);
+	int	(*nf_min_max)(struct list_head *nidlist, __u32 *min_nid,
+			      __u32 *max_nid);
+	int	(*nf_expand_addrrange)(struct list_head *addrranges,
+				       __u32 *addrs, int max_addrs);
+};
+
+/**
+ * Structure to represent NULL-less strings.
+ */
+struct cfs_lstr {
+	char	*ls_str;
+	int	ls_len;
+};
+
+/*
+ * Structure to represent \<range_expr\> token of the syntax.
+ */
+struct cfs_range_expr {
+	/*
+	 * Link to cfs_expr_list::el_exprs.
+ */ + struct list_head re_link; + __u32 re_lo; + __u32 re_hi; + __u32 re_stride; +}; + +struct cfs_expr_list { + struct list_head el_link; + struct list_head el_exprs; +}; + +int cfs_expr_list_values(struct cfs_expr_list *expr_list, int max, __u32 **valpp); +int cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res); +int cfs_str2num_check(char *str, int nob, unsigned *num, + unsigned min, unsigned max); +int cfs_expr2str(struct list_head *list, char *str, size_t size); +int cfs_expr_list_match(__u32 value, struct cfs_expr_list *expr_list); +int cfs_expr_list_print(char *buffer, int count, + struct cfs_expr_list *expr_list); +int cfs_expr_list_parse(char *str, int len, unsigned min, unsigned max, + struct cfs_expr_list **elpp); +void cfs_expr_list_free(struct cfs_expr_list *expr_list); +void cfs_expr_list_free_list(struct list_head *list); +int cfs_ip_addr_parse(char *str, int len, struct list_head *list); +int cfs_ip_addr_range_gen(__u32 *ip_list, int count, + struct list_head *ip_addr_expr); +int cfs_ip_addr_match(__u32 addr, struct list_head *list); +int cfs_expand_nidlist(struct list_head *nidlist, lnet_nid_t *lnet_nidlist, + int max_nids); +int cfs_parse_nid_parts(char *str, struct list_head *addr, + struct list_head *net_num, __u32 *net_type); +int cfs_abs_path(const char *request_path, char **resolved_path); + +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/uapi/linux/llcrypt.h b/drivers/staging/lustrefsx/libcfs/include/uapi/linux/llcrypt.h new file mode 100644 index 0000000000000..c133859bc2169 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/uapi/linux/llcrypt.h @@ -0,0 +1,186 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * llcrypt user API + * + * These ioctls can be used on filesystems that support llcrypt. See the + * "User API" section of Documentation/filesystems/llcrypt.rst. + */ +/* + * Linux commit 219d54332a09 + * tags/v5.4 + */ +#ifndef _UAPI_LINUX_LLCRYPT_H +#define _UAPI_LINUX_LLCRYPT_H + +#include + +/* Encryption policy flags */ +#define LLCRYPT_POLICY_FLAGS_PAD_4 0x00 +#define LLCRYPT_POLICY_FLAGS_PAD_8 0x01 +#define LLCRYPT_POLICY_FLAGS_PAD_16 0x02 +#define LLCRYPT_POLICY_FLAGS_PAD_32 0x03 +#define LLCRYPT_POLICY_FLAGS_PAD_MASK 0x03 +#define LLCRYPT_POLICY_FLAG_DIRECT_KEY 0x04 +#define LLCRYPT_POLICY_FLAGS_VALID 0x07 + +/* Encryption algorithms */ +#define LLCRYPT_MODE_NULL 0 +#define LLCRYPT_MODE_AES_256_XTS 1 +#define LLCRYPT_MODE_AES_256_CTS 4 +#define LLCRYPT_MODE_AES_128_CBC 5 +#define LLCRYPT_MODE_AES_128_CTS 6 +#define LLCRYPT_MODE_ADIANTUM 9 +#define __LLCRYPT_MODE_MAX 9 + +/* + * Legacy policy version; ad-hoc KDF and no key verification. + * For new encrypted directories, use llcrypt_policy_v2 instead. + * + * Careful: the .version field for this is actually 0, not 1. + */ +#define LLCRYPT_POLICY_V1 0 +#define LLCRYPT_KEY_DESCRIPTOR_SIZE 8 +struct llcrypt_policy_v1 { + __u8 version; + __u8 contents_encryption_mode; + __u8 filenames_encryption_mode; + __u8 flags; + __u8 master_key_descriptor[LLCRYPT_KEY_DESCRIPTOR_SIZE]; +}; +#define llcrypt_policy llcrypt_policy_v1 + +/* + * Process-subscribed "logon" key description prefix and payload format. + * Deprecated; prefer LL_IOC_ADD_ENCRYPTION_KEY instead. + */ +#define LLCRYPT_KEY_DESC_PREFIX "fscrypt:" +#define LLCRYPT_KEY_DESC_PREFIX_SIZE 8 +#define LLCRYPT_MAX_KEY_SIZE 64 +struct llcrypt_key { + __u32 mode; + __u8 raw[LLCRYPT_MAX_KEY_SIZE]; + __u32 size; +}; + +/* + * New policy version with HKDF and key verification (recommended). 
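+ *
+ * As a sketch (incomplete; the master key must already have been added
+ * with LL_IOC_ADD_ENCRYPTION_KEY and its identifier copied in), userspace
+ * might apply a v2 policy to a directory fd with:
+ *
+ *	struct llcrypt_policy_v2 pol = {
+ *		.version = LLCRYPT_POLICY_V2,
+ *		.contents_encryption_mode = LLCRYPT_MODE_AES_256_XTS,
+ *		.filenames_encryption_mode = LLCRYPT_MODE_AES_256_CTS,
+ *	};
+ *
+ *	ioctl(dirfd, LL_IOC_SET_ENCRYPTION_POLICY, &pol);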
+ */ +#define LLCRYPT_POLICY_V2 2 +#define LLCRYPT_KEY_IDENTIFIER_SIZE 16 +struct llcrypt_policy_v2 { + __u8 version; + __u8 contents_encryption_mode; + __u8 filenames_encryption_mode; + __u8 flags; + __u8 __reserved[4]; + __u8 master_key_identifier[LLCRYPT_KEY_IDENTIFIER_SIZE]; +}; + +/* Struct passed to LL_IOC_GET_ENCRYPTION_POLICY_EX */ +struct llcrypt_get_policy_ex_arg { + __u64 policy_size; /* input/output */ + union { + __u8 version; + struct llcrypt_policy_v1 v1; + struct llcrypt_policy_v2 v2; + } policy; /* output */ +}; + +/* + * v1 policy keys are specified by an arbitrary 8-byte key "descriptor", + * matching llcrypt_policy_v1::master_key_descriptor. + */ +#define LLCRYPT_KEY_SPEC_TYPE_DESCRIPTOR 1 + +/* + * v2 policy keys are specified by a 16-byte key "identifier" which the kernel + * calculates as a cryptographic hash of the key itself, + * matching llcrypt_policy_v2::master_key_identifier. + */ +#define LLCRYPT_KEY_SPEC_TYPE_IDENTIFIER 2 + +/* + * Specifies a key, either for v1 or v2 policies. This doesn't contain the + * actual key itself; this is just the "name" of the key. + */ +struct llcrypt_key_specifier { + __u32 type; /* one of LLCRYPT_KEY_SPEC_TYPE_* */ + __u32 __reserved; + union { + __u8 __reserved[32]; /* reserve some extra space */ + __u8 descriptor[LLCRYPT_KEY_DESCRIPTOR_SIZE]; + __u8 identifier[LLCRYPT_KEY_IDENTIFIER_SIZE]; + } u; +}; + +/* Struct passed to LL_IOC_ADD_ENCRYPTION_KEY */ +struct llcrypt_add_key_arg { + struct llcrypt_key_specifier key_spec; + __u32 raw_size; + __u32 __reserved[9]; + __u8 raw[]; +}; + +/* Struct passed to LL_IOC_REMOVE_ENCRYPTION_KEY */ +struct llcrypt_remove_key_arg { + struct llcrypt_key_specifier key_spec; +#define LLCRYPT_KEY_REMOVAL_STATUS_FLAG_FILES_BUSY 0x00000001 +#define LLCRYPT_KEY_REMOVAL_STATUS_FLAG_OTHER_USERS 0x00000002 + __u32 removal_status_flags; /* output */ + __u32 __reserved[5]; +}; + +/* Struct passed to LL_IOC_GET_ENCRYPTION_KEY_STATUS */ +struct llcrypt_get_key_status_arg { + /* input */ + struct llcrypt_key_specifier key_spec; + __u32 __reserved[6]; + + /* output */ +#define LLCRYPT_KEY_STATUS_ABSENT 1 +#define LLCRYPT_KEY_STATUS_PRESENT 2 +#define LLCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED 3 + __u32 status; +#define LLCRYPT_KEY_STATUS_FLAG_ADDED_BY_SELF 0x00000001 + __u32 status_flags; + __u32 user_count; + __u32 __out_reserved[13]; +}; + +#define LL_IOC_SET_ENCRYPTION_POLICY _IOR('f', 19, struct llcrypt_policy) +#define LL_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16]) +#define LL_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct llcrypt_policy) +#define LL_IOC_GET_ENCRYPTION_POLICY_EX _IOWR('f', 22, __u8[9]) /* size + version */ +#define LL_IOC_ADD_ENCRYPTION_KEY _IOWR('f', 23, struct llcrypt_add_key_arg) +#define LL_IOC_REMOVE_ENCRYPTION_KEY _IOWR('f', 24, struct llcrypt_remove_key_arg) +#define LL_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS _IOWR('f', 25, struct llcrypt_remove_key_arg) +#define LL_IOC_GET_ENCRYPTION_KEY_STATUS _IOWR('f', 26, struct llcrypt_get_key_status_arg) + +/**********************************************************************/ + +/* old names; don't add anything new here! 
*/ +#ifndef __KERNEL__ +#define LL_KEY_DESCRIPTOR_SIZE LLCRYPT_KEY_DESCRIPTOR_SIZE +#define LL_POLICY_FLAGS_PAD_4 LLCRYPT_POLICY_FLAGS_PAD_4 +#define LL_POLICY_FLAGS_PAD_8 LLCRYPT_POLICY_FLAGS_PAD_8 +#define LL_POLICY_FLAGS_PAD_16 LLCRYPT_POLICY_FLAGS_PAD_16 +#define LL_POLICY_FLAGS_PAD_32 LLCRYPT_POLICY_FLAGS_PAD_32 +#define LL_POLICY_FLAGS_PAD_MASK LLCRYPT_POLICY_FLAGS_PAD_MASK +#define LL_POLICY_FLAG_DIRECT_KEY LLCRYPT_POLICY_FLAG_DIRECT_KEY +#define LL_POLICY_FLAGS_VALID LLCRYPT_POLICY_FLAGS_VALID +#define LL_ENCRYPTION_MODE_INVALID 0 /* never used */ +#define LL_ENCRYPTION_MODE_AES_256_XTS LLCRYPT_MODE_AES_256_XTS +#define LL_ENCRYPTION_MODE_AES_256_GCM 2 /* never used */ +#define LL_ENCRYPTION_MODE_AES_256_CBC 3 /* never used */ +#define LL_ENCRYPTION_MODE_AES_256_CTS LLCRYPT_MODE_AES_256_CTS +#define LL_ENCRYPTION_MODE_AES_128_CBC LLCRYPT_MODE_AES_128_CBC +#define LL_ENCRYPTION_MODE_AES_128_CTS LLCRYPT_MODE_AES_128_CTS +#define LL_ENCRYPTION_MODE_SPECK128_256_XTS 7 /* removed */ +#define LL_ENCRYPTION_MODE_SPECK128_256_CTS 8 /* removed */ +#define LL_ENCRYPTION_MODE_ADIANTUM LLCRYPT_MODE_ADIANTUM +#define LL_KEY_DESC_PREFIX LLCRYPT_KEY_DESC_PREFIX +#define LL_KEY_DESC_PREFIX_SIZE LLCRYPT_KEY_DESC_PREFIX_SIZE +#define LL_MAX_KEY_SIZE LLCRYPT_MAX_KEY_SIZE +#endif /* !__KERNEL__ */ + +#endif /* _UAPI_LINUX_LLCRYPT_H */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/crypto/crypto.c b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/crypto.c new file mode 100644 index 0000000000000..3d18715a06c3d --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/crypto.c @@ -0,0 +1,559 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * This contains encryption functions for per-file encryption. + * + * Copyright (C) 2015, Google, Inc. + * Copyright (C) 2015, Motorola Mobility + * + * Written by Michael Halcrow, 2014. + * + * Filename encryption additions + * Uday Savagaonkar, 2014 + * Encryption policy handling additions + * Ildar Muslukhov, 2014 + * Add llcrypt_pullback_bio_page() + * Jaegeuk Kim, 2015. + * + * This has not yet undergone a rigorous security audit. + * + * The usage of AES-XTS should conform to recommendations in NIST + * Special Publication 800-38E and IEEE P1619/D16. + */ +/* + * Linux commit 219d54332a09 + * tags/v5.4 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "llcrypt_private.h" + +#ifdef HAVE_CIPHER_H +#include + +MODULE_IMPORT_NS(CRYPTO_INTERNAL); +#endif + +static unsigned int num_prealloc_crypto_pages = 32; +static unsigned int num_prealloc_crypto_ctxs = 128; + +module_param(num_prealloc_crypto_pages, uint, 0444); +MODULE_PARM_DESC(num_prealloc_crypto_pages, + "Number of crypto pages to preallocate"); +module_param(num_prealloc_crypto_ctxs, uint, 0444); +MODULE_PARM_DESC(num_prealloc_crypto_ctxs, + "Number of crypto contexts to preallocate"); + +static mempool_t *llcrypt_bounce_page_pool = NULL; + +static LIST_HEAD(llcrypt_free_ctxs); +static DEFINE_SPINLOCK(llcrypt_ctx_lock); + +static struct workqueue_struct *llcrypt_read_workqueue; +static DEFINE_MUTEX(llcrypt_init_mutex); + +static struct kmem_cache *llcrypt_ctx_cachep; +struct kmem_cache *llcrypt_info_cachep; + +void llcrypt_enqueue_decrypt_work(struct work_struct *work) +{ + queue_work(llcrypt_read_workqueue, work); +} +EXPORT_SYMBOL(llcrypt_enqueue_decrypt_work); + +/** + * llcrypt_release_ctx() - Release a decryption context + * @ctx: The decryption context to release. 
+ * + * If the decryption context was allocated from the pre-allocated pool, return + * it to that pool. Else, free it. + */ +void llcrypt_release_ctx(struct llcrypt_ctx *ctx) +{ + unsigned long flags; + + if (ctx->flags & FS_CTX_REQUIRES_FREE_ENCRYPT_FL) { + kmem_cache_free(llcrypt_ctx_cachep, ctx); + } else { + spin_lock_irqsave(&llcrypt_ctx_lock, flags); + list_add(&ctx->free_list, &llcrypt_free_ctxs); + spin_unlock_irqrestore(&llcrypt_ctx_lock, flags); + } +} +EXPORT_SYMBOL(llcrypt_release_ctx); + +/** + * llcrypt_get_ctx() - Get a decryption context + * @gfp_flags: The gfp flag for memory allocation + * + * Allocate and initialize a decryption context. + * + * Return: A new decryption context on success; an ERR_PTR() otherwise. + */ +struct llcrypt_ctx *llcrypt_get_ctx(gfp_t gfp_flags) +{ + struct llcrypt_ctx *ctx; + unsigned long flags; + + /* + * First try getting a ctx from the free list so that we don't have to + * call into the slab allocator. + */ + spin_lock_irqsave(&llcrypt_ctx_lock, flags); + ctx = list_first_entry_or_null(&llcrypt_free_ctxs, + struct llcrypt_ctx, free_list); + if (ctx) + list_del(&ctx->free_list); + spin_unlock_irqrestore(&llcrypt_ctx_lock, flags); + if (!ctx) { + ctx = kmem_cache_zalloc(llcrypt_ctx_cachep, gfp_flags); + if (!ctx) + return ERR_PTR(-ENOMEM); + ctx->flags |= FS_CTX_REQUIRES_FREE_ENCRYPT_FL; + } else { + ctx->flags &= ~FS_CTX_REQUIRES_FREE_ENCRYPT_FL; + } + return ctx; +} +EXPORT_SYMBOL(llcrypt_get_ctx); + +struct page *llcrypt_alloc_bounce_page(gfp_t gfp_flags) +{ + return mempool_alloc(llcrypt_bounce_page_pool, gfp_flags); +} + +/** + * llcrypt_free_bounce_page() - free a ciphertext bounce page + * + * Free a bounce page that was allocated by llcrypt_encrypt_pagecache_blocks(), + * or by llcrypt_alloc_bounce_page() directly. 
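+ *
+ * Sketch of a direct pairing (the I/O in between is elided):
+ *	bounce_page = llcrypt_alloc_bounce_page(GFP_NOFS);
+ *	...
+ *	llcrypt_free_bounce_page(bounce_page);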
+ */ +void llcrypt_free_bounce_page(struct page *bounce_page) +{ + if (!bounce_page) + return; + set_page_private(bounce_page, (unsigned long)NULL); + ClearPagePrivate(bounce_page); + mempool_free(bounce_page, llcrypt_bounce_page_pool); +} +EXPORT_SYMBOL(llcrypt_free_bounce_page); + +void llcrypt_generate_iv(union llcrypt_iv *iv, u64 lblk_num, + const struct llcrypt_info *ci) +{ + memset(iv, 0, ci->ci_mode->ivsize); + iv->lblk_num = cpu_to_le64(lblk_num); + + if (llcrypt_is_direct_key_policy(&ci->ci_policy)) + memcpy(iv->nonce, ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE); + + if (ci->ci_essiv_tfm != NULL) + crypto_cipher_encrypt_one(ci->ci_essiv_tfm, iv->raw, iv->raw); +} + +/* Encrypt or decrypt a single filesystem block of file contents */ +int llcrypt_crypt_block(const struct inode *inode, llcrypt_direction_t rw, + u64 lblk_num, struct page *src_page, + struct page *dest_page, unsigned int len, + unsigned int offs, gfp_t gfp_flags) +{ + union llcrypt_iv iv; + struct skcipher_request *req = NULL; + DECLARE_CRYPTO_WAIT(wait); + struct scatterlist dst, src; + struct llcrypt_info *ci = llcrypt_info(inode); + struct crypto_skcipher *tfm = ci->ci_ctfm; + int res = 0; + + if (tfm == NULL) { + if (dest_page != src_page) + memcpy(page_address(dest_page), page_address(src_page), + PAGE_SIZE); + return 0; + } + + if (WARN_ON_ONCE(len <= 0)) + return -EINVAL; + if (WARN_ON_ONCE(len % LL_CRYPTO_BLOCK_SIZE != 0)) + return -EINVAL; + + llcrypt_generate_iv(&iv, lblk_num, ci); + + req = skcipher_request_alloc(tfm, gfp_flags); + if (!req) + return -ENOMEM; + + skcipher_request_set_callback( + req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + crypto_req_done, &wait); + + sg_init_table(&dst, 1); + sg_set_page(&dst, dest_page, len, offs); + sg_init_table(&src, 1); + sg_set_page(&src, src_page, len, offs); + skcipher_request_set_crypt(req, &src, &dst, len, &iv); + if (rw == FS_DECRYPT) + res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait); + else + res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); + skcipher_request_free(req); + if (res) { + llcrypt_err(inode, "%scryption failed for block %llu: %d", + (rw == FS_DECRYPT ? "De" : "En"), lblk_num, res); + return res; + } + return 0; +} + +/** + * llcrypt_encrypt_pagecache_blocks() - Encrypt filesystem blocks from a pagecache page + * @page: The locked pagecache page containing the block(s) to encrypt + * @len: Total size of the block(s) to encrypt. Must be a nonzero + * multiple of the filesystem's block size. + * @offs: Byte offset within @page of the first block to encrypt. Must be + * a multiple of the filesystem's block size. + * @gfp_flags: Memory allocation flags + * + * A new bounce page is allocated, and the specified block(s) are encrypted into + * it. In the bounce page, the ciphertext block(s) will be located at the same + * offsets at which the plaintext block(s) were located in the source page; any + * other parts of the bounce page will be left uninitialized. However, normally + * blocksize == PAGE_SIZE and the whole page is encrypted at once. + * + * This is for use by the filesystem's ->writepages() method. 
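+ *
+ * A minimal caller sketch (writeback details elided):
+ *
+ *	bounce_page = llcrypt_encrypt_pagecache_blocks(page, PAGE_SIZE, 0,
+ *						       GFP_NOFS);
+ *	if (IS_ERR(bounce_page))
+ *		return PTR_ERR(bounce_page);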
+ *
+ * Return: the new encrypted bounce page on success; an ERR_PTR() on failure
+ */
+struct page *llcrypt_encrypt_pagecache_blocks(struct page *page,
+					      unsigned int len,
+					      unsigned int offs,
+					      gfp_t gfp_flags)
+
+{
+	const struct inode *inode = page->mapping->host;
+	const unsigned int blockbits = inode->i_blkbits;
+	const unsigned int blocksize = 1 << blockbits;
+	struct page *ciphertext_page;
+	u64 lblk_num = ((u64)page->index << (PAGE_SHIFT - blockbits)) +
+		       (offs >> blockbits);
+	unsigned int i;
+	int err;
+
+	if (WARN_ON_ONCE(!PageLocked(page)))
+		return ERR_PTR(-EINVAL);
+
+	if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, blocksize)))
+		return ERR_PTR(-EINVAL);
+
+	ciphertext_page = llcrypt_alloc_bounce_page(gfp_flags);
+	if (!ciphertext_page)
+		return ERR_PTR(-ENOMEM);
+
+	for (i = offs; i < offs + len; i += blocksize, lblk_num++) {
+		err = llcrypt_crypt_block(inode, FS_ENCRYPT, lblk_num,
+					  page, ciphertext_page,
+					  blocksize, i, gfp_flags);
+		if (err) {
+			llcrypt_free_bounce_page(ciphertext_page);
+			return ERR_PTR(err);
+		}
+	}
+	SetPagePrivate(ciphertext_page);
+	set_page_private(ciphertext_page, (unsigned long)page);
+	return ciphertext_page;
+}
+EXPORT_SYMBOL(llcrypt_encrypt_pagecache_blocks);
+
+/**
+ * llcrypt_encrypt_block() - Encrypt a filesystem block in a page
+ * @inode:     The inode to which this block belongs
+ * @src:       The page containing the block to encrypt
+ * @dst:       The page which will contain the encrypted data
+ * @len:       Size of block to encrypt. Doesn't need to be a multiple of the
+ *		fs block size, but must be a multiple of LL_CRYPTO_BLOCK_SIZE.
+ * @offs:      Byte offset within @page at which the block to encrypt begins
+ * @lblk_num:  Filesystem logical block number of the block, i.e. the 0-based
+ *		number of the block within the file
+ * @gfp_flags: Memory allocation flags
+ *
+ * Encrypt a possibly-compressed filesystem block that is located in an
+ * arbitrary page, not necessarily in the original pagecache page. The @inode
+ * and @lblk_num must be specified, as they can't be determined from @page.
+ * The encrypted data will be stored in @dst.
+ *
+ * Return: 0 on success; -errno on failure
+ */
+int llcrypt_encrypt_block(const struct inode *inode, struct page *src,
+			  struct page *dst, unsigned int len, unsigned int offs,
+			  u64 lblk_num, gfp_t gfp_flags)
+{
+	return llcrypt_crypt_block(inode, FS_ENCRYPT, lblk_num, src, dst,
+				   len, offs, gfp_flags);
+}
+EXPORT_SYMBOL(llcrypt_encrypt_block);
+
+/**
+ * llcrypt_decrypt_pagecache_blocks() - Decrypt filesystem blocks in a pagecache page
+ * @page:      The locked pagecache page containing the block(s) to decrypt
+ * @len:       Total size of the block(s) to decrypt. Must be a nonzero
+ *		multiple of the filesystem's block size.
+ * @offs:      Byte offset within @page of the first block to decrypt. Must be
+ *		a multiple of the filesystem's block size.
+ *
+ * The specified block(s) are decrypted in-place within the pagecache page,
+ * which must still be locked and not uptodate. Normally, blocksize ==
+ * PAGE_SIZE and the whole page is decrypted at once.
+ *
+ * This is for use by the filesystem's ->readpages() method.
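+ *
+ * A read-completion sketch (page state handling remains the caller's):
+ *
+ *	err = llcrypt_decrypt_pagecache_blocks(page, PAGE_SIZE, 0);
+ *	if (!err)
+ *		SetPageUptodate(page);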
+ *
+ * Return: 0 on success; -errno on failure
+ */
+int llcrypt_decrypt_pagecache_blocks(struct page *page, unsigned int len,
+				     unsigned int offs)
+{
+	const struct inode *inode = page->mapping->host;
+	const unsigned int blockbits = inode->i_blkbits;
+	const unsigned int blocksize = 1 << blockbits;
+	u64 lblk_num = ((u64)page->index << (PAGE_SHIFT - blockbits)) +
+		       (offs >> blockbits);
+	unsigned int i;
+	int err;
+
+	if (WARN_ON_ONCE(!PageLocked(page)))
+		return -EINVAL;
+
+	if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, blocksize)))
+		return -EINVAL;
+
+	for (i = offs; i < offs + len; i += blocksize, lblk_num++) {
+		err = llcrypt_crypt_block(inode, FS_DECRYPT, lblk_num, page,
+					  page, blocksize, i, GFP_NOFS);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(llcrypt_decrypt_pagecache_blocks);
+
+/**
+ * llcrypt_decrypt_block() - Cache a decrypted filesystem block in a page
+ * @inode:     The inode to which this block belongs
+ * @src:       The page containing the block to decrypt
+ * @dst:       The page which will contain the plain data
+ * @len:       Size of block to decrypt. Doesn't need to be a multiple of the
+ *		fs block size, but must be a multiple of LL_CRYPTO_BLOCK_SIZE.
+ * @offs:      Byte offset within @page at which the block to decrypt begins
+ * @lblk_num:  Filesystem logical block number of the block, i.e. the 0-based
+ *		number of the block within the file
+ *
+ * Decrypt a possibly-compressed filesystem block that is located in an
+ * arbitrary page, not necessarily in the original pagecache page. The @inode
+ * and @lblk_num must be specified, as they can't be determined from @page.
+ * The decrypted data will be stored in @dst.
+ *
+ * Return: 0 on success; -errno on failure
+ */
+int llcrypt_decrypt_block(const struct inode *inode, struct page *src,
+			  struct page *dst, unsigned int len, unsigned int offs,
+			  u64 lblk_num, gfp_t gfp_flags)
+{
+	return llcrypt_crypt_block(inode, FS_DECRYPT, lblk_num, src, dst,
+				   len, offs, gfp_flags);
+}
+EXPORT_SYMBOL(llcrypt_decrypt_block);
+
+/*
+ * Validate dentries in encrypted directories to make sure we aren't potentially
+ * caching stale dentries after a key has been added.
+ */
+static int llcrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	struct dentry *dir;
+	int err;
+	int valid;
+
+	/*
+	 * Plaintext names are always valid, since llcrypt doesn't support
+	 * reverting to ciphertext names without evicting the directory's inode
+	 * -- which implies eviction of the dentries in the directory.
+	 */
+	if (!(dentry->d_flags & DCACHE_ENCRYPTED_NAME))
+		return 1;
+
+	/*
+	 * Ciphertext name; valid if the directory's key is still unavailable.
+	 *
+	 * Although llcrypt forbids rename() on ciphertext names, we still must
+	 * use dget_parent() here rather than use ->d_parent directly. That's
+	 * because a corrupted fs image may contain directory hard links, which
+	 * the VFS handles by moving the directory's dentry tree in the dcache
+	 * each time ->lookup() finds the directory and it already has a dentry
+	 * elsewhere. Thus ->d_parent can be changing, and we must safely grab
+	 * a reference to some ->d_parent to prevent it from being freed.
+ */ + + if (flags & LOOKUP_RCU) + return -ECHILD; + + dir = dget_parent(dentry); + err = llcrypt_get_encryption_info(d_inode(dir)); + valid = !llcrypt_has_encryption_key(d_inode(dir)); + dput(dir); + + if (err < 0) + return err; + + return valid; +} + +const struct dentry_operations llcrypt_d_ops = { + .d_revalidate = llcrypt_d_revalidate, +}; + +static void llcrypt_destroy(void) +{ + struct llcrypt_ctx *pos, *n; + + list_for_each_entry_safe(pos, n, &llcrypt_free_ctxs, free_list) + kmem_cache_free(llcrypt_ctx_cachep, pos); + INIT_LIST_HEAD(&llcrypt_free_ctxs); + mempool_destroy(llcrypt_bounce_page_pool); + llcrypt_bounce_page_pool = NULL; +} + +/** + * llcrypt_initialize() - allocate major buffers for fs encryption. + * @cop_flags: llcrypt operations flags + * + * We only call this when we start accessing encrypted files, since it + * results in memory getting allocated that wouldn't otherwise be used. + * + * Return: Zero on success, non-zero otherwise. + */ +int llcrypt_initialize(unsigned int cop_flags) +{ + int i, res = -ENOMEM; + + /* No need to allocate a bounce page pool if this FS won't use it. */ + if (cop_flags & LL_CFLG_OWN_PAGES) + return 0; + + mutex_lock(&llcrypt_init_mutex); + if (llcrypt_bounce_page_pool) + goto already_initialized; + + for (i = 0; i < num_prealloc_crypto_ctxs; i++) { + struct llcrypt_ctx *ctx; + + ctx = kmem_cache_zalloc(llcrypt_ctx_cachep, GFP_NOFS); + if (!ctx) + goto fail; + list_add(&ctx->free_list, &llcrypt_free_ctxs); + } + + llcrypt_bounce_page_pool = + mempool_create_page_pool(num_prealloc_crypto_pages, 0); + if (!llcrypt_bounce_page_pool) + goto fail; + +already_initialized: + mutex_unlock(&llcrypt_init_mutex); + return 0; +fail: + llcrypt_destroy(); + mutex_unlock(&llcrypt_init_mutex); + return res; +} + +void llcrypt_msg(const struct inode *inode, int mask, + const char *fmt, ...) +{ + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + struct va_format vaf; + va_list args; + + if (!__ratelimit(&rs)) + return; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + if (inode) + CDEBUG(mask, "llcrypt (%s, inode %lu): %pV\n", + inode->i_sb->s_id, inode->i_ino, &vaf); + else + CDEBUG(mask, "llcrypt: %pV\n", &vaf); + va_end(args); +} + +/** + * llcrypt_init() - Set up for fs encryption. + */ +int __init llcrypt_init(void) +{ + int err = -ENOMEM; + + /* + * Use an unbound workqueue to allow bios to be decrypted in parallel + * even when they happen to complete on the same CPU. This sacrifices + * locality, but it's worthwhile since decryption is CPU-intensive. + * + * Also use a high-priority workqueue to prioritize decryption work, + * which blocks reads from completing, over regular application tasks. + */ + llcrypt_read_workqueue = alloc_workqueue("llcrypt_read_queue", + WQ_UNBOUND | WQ_HIGHPRI, + num_online_cpus()); + if (!llcrypt_read_workqueue) + goto fail; + + llcrypt_ctx_cachep = KMEM_CACHE(llcrypt_ctx, SLAB_RECLAIM_ACCOUNT); + if (!llcrypt_ctx_cachep) + goto fail_free_queue; + + llcrypt_info_cachep = KMEM_CACHE(llcrypt_info, SLAB_RECLAIM_ACCOUNT); + if (!llcrypt_info_cachep) + goto fail_free_ctx; + + err = llcrypt_init_keyring(); + if (err) + goto fail_free_info; + + return 0; + +fail_free_info: + kmem_cache_destroy(llcrypt_info_cachep); +fail_free_ctx: + kmem_cache_destroy(llcrypt_ctx_cachep); +fail_free_queue: + destroy_workqueue(llcrypt_read_workqueue); +fail: + return err; +} + +/** + * llcrypt_exit() - Clean up for fs encryption. 
+ */ +void __exit llcrypt_exit(void) +{ + llcrypt_exit_keyring(); + + llcrypt_destroy(); + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + + kmem_cache_destroy(llcrypt_info_cachep); + kmem_cache_destroy(llcrypt_ctx_cachep); + destroy_workqueue(llcrypt_read_workqueue); +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/crypto/fname.c b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/fname.c new file mode 100644 index 0000000000000..65b6b422cb343 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/fname.c @@ -0,0 +1,567 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This contains functions for filename crypto management + * + * Copyright (C) 2015, Google, Inc. + * Copyright (C) 2015, Motorola Mobility + * + * Written by Uday Savagaonkar, 2014. + * Modified by Jaegeuk Kim, 2015. + * + * This has not yet undergone a rigorous security audit. + */ +/* + * Linux commit 219d54332a09 + * tags/v5.4 + */ + +#include +#include +#include "llcrypt_private.h" + +static inline bool llcrypt_is_dot_dotdot(const struct qstr *str) +{ + if (str->len == 1 && str->name[0] == '.') + return true; + + if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.') + return true; + + return false; +} + +/** + * fname_encrypt() - encrypt a filename + * + * The output buffer must be at least as large as the input buffer. + * Any extra space is filled with NUL padding before encryption. + * + * Return: 0 on success, -errno on failure + */ +int fname_encrypt(struct inode *inode, const struct qstr *iname, + u8 *out, unsigned int olen) +{ + struct skcipher_request *req = NULL; + DECLARE_CRYPTO_WAIT(wait); + struct llcrypt_info *ci = llcrypt_info(inode); + struct crypto_skcipher *tfm = ci->ci_ctfm; + union llcrypt_iv iv; + struct scatterlist sg; + int res; + + /* + * Copy the filename to the output buffer for encrypting in-place and + * pad it with the needed number of NUL bytes. + */ + if (WARN_ON(olen < iname->len)) + return -ENOBUFS; + memcpy(out, iname->name, iname->len); + memset(out + iname->len, 0, olen - iname->len); + + if (tfm == NULL) + return 0; + + /* Initialize the IV */ + llcrypt_generate_iv(&iv, 0, ci); + + /* Set up the encryption request */ + req = skcipher_request_alloc(tfm, GFP_NOFS); + if (!req) + return -ENOMEM; + skcipher_request_set_callback(req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + crypto_req_done, &wait); + sg_init_one(&sg, out, olen); + skcipher_request_set_crypt(req, &sg, &sg, olen, &iv); + + /* Do the encryption */ + res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); + skcipher_request_free(req); + if (res < 0) { + llcrypt_err(inode, "Filename encryption failed: %d", res); + return res; + } + + return 0; +} + +/** + * fname_decrypt() - decrypt a filename + * + * The caller must have allocated sufficient memory for the @oname string. 
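+ *
+ * Typically the buffer is sized beforehand with
+ * llcrypt_fname_alloc_buffer(), e.g. (sketch):
+ *
+ *	if (!llcrypt_fname_alloc_buffer(inode, iname->len, &oname))
+ *		res = fname_decrypt(inode, iname, &oname);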
+ *
+ * Return: 0 on success, -errno on failure
+ */
+static int fname_decrypt(struct inode *inode,
+			 const struct llcrypt_str *iname,
+			 struct llcrypt_str *oname)
+{
+	struct skcipher_request *req = NULL;
+	DECLARE_CRYPTO_WAIT(wait);
+	struct scatterlist src_sg, dst_sg;
+	struct llcrypt_info *ci = llcrypt_info(inode);
+	struct crypto_skcipher *tfm = ci->ci_ctfm;
+	union llcrypt_iv iv;
+	int res;
+
+	if (tfm == NULL) {
+		memcpy(oname->name, iname->name, iname->len);
+		oname->name[iname->len] = '\0';
+		oname->len = iname->len;
+		return 0;
+	}
+
+	/* Allocate request */
+	req = skcipher_request_alloc(tfm, GFP_NOFS);
+	if (!req)
+		return -ENOMEM;
+	skcipher_request_set_callback(req,
+		CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+		crypto_req_done, &wait);
+
+	/* Initialize IV */
+	llcrypt_generate_iv(&iv, 0, ci);
+
+	/* Create decryption request */
+	sg_init_one(&src_sg, iname->name, iname->len);
+	sg_init_one(&dst_sg, oname->name, oname->len);
+	skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, &iv);
+	res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait);
+	skcipher_request_free(req);
+	if (res < 0) {
+		llcrypt_err(inode, "Filename decryption failed: %d", res);
+		return res;
+	}
+
+	oname->len = strnlen(oname->name, iname->len);
+	return 0;
+}
+
+/*
+ * Old-style base64 encoding, taken from Linux 5.4.
+ *
+ * This base64 encoding is specific to fscrypt and has since been replaced
+ * with an RFC 4648 compliant base64-url encoding, see llcrypt_base64url_*
+ * below.
+ * The old-style base64 encoding is kept for compatibility with older clients.
+ */
+
+static const char lookup_table[65] =
+	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
+
+#define LLCRYPT_BASE64_CHARS(nbytes)	DIV_ROUND_UP((nbytes) * 4, 3)
+
+/**
+ * llcrypt_base64_encode() -
+ *
+ * Encodes the input string using characters from the set [A-Za-z0-9+,].
+ * The encoded string is roughly 4/3 times the size of the input string.
+ *
+ * Return: length of the encoded string
+ */
+static inline int llcrypt_base64_encode(const u8 *src, int len, char *dst)
+{
+	int i, bits = 0, ac = 0;
+	char *cp = dst;
+
+	for (i = 0; i < len; i++) {
+		ac += src[i] << bits;
+		bits += 8;
+		do {
+			*cp++ = lookup_table[ac & 0x3f];
+			ac >>= 6;
+			bits -= 6;
+		} while (bits >= 6);
+	}
+	if (bits)
+		*cp++ = lookup_table[ac & 0x3f];
+	return cp - dst;
+}
+
+static inline int llcrypt_base64_decode(const char *src, int len, u8 *dst)
+{
+	int i, bits = 0, ac = 0;
+	const char *p;
+	u8 *cp = dst;
+
+	for (i = 0; i < len; i++) {
+		p = strchr(lookup_table, src[i]);
+		if (p == NULL || src[i] == 0)
+			return -2;
+		ac += (p - lookup_table) << bits;
+		bits += 6;
+		if (bits >= 8) {
+			*cp++ = ac & 0xff;
+			ac >>= 8;
+			bits -= 8;
+		}
+	}
+	if (ac)
+		return -1;
+	return cp - dst;
+}
+
+/*
+ * New-style base64 encoding, taken from Linux 5.14.
+ *
+ * This base64 encoding is RFC 4648 compliant base64-url encoding.
+ */
+
+static const char base64url_table[] =
+	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
+
+#define LLCRYPT_BASE64URL_CHARS(nbytes)	DIV_ROUND_UP((nbytes) * 4, 3)
+
+/**
+ * llcrypt_base64url_encode() - base64url-encode some binary data
+ * @src: the binary data to encode
+ * @srclen: the length of @src in bytes
+ * @dst: (output) the base64url-encoded string. Not NUL-terminated.
+ *
+ * Encodes data using base64url encoding, i.e. the "Base 64 Encoding with URL
+ * and Filename Safe Alphabet" specified by RFC 4648. '='-padding isn't used,
+ * as it's unneeded and not required by the RFC.
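+ * For instance, a 16-byte key identifier encodes to
+ * DIV_ROUND_UP(16 * 4, 3) = 22 characters, with no trailing padding.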
base64url is used instead of + * base64 to avoid the '/' character, which isn't allowed in filenames. + * + * Return: the length of the resulting base64url-encoded string in bytes. + * This will be equal to LLCRYPT_BASE64URL_CHARS(srclen). + */ +static inline int llcrypt_base64url_encode(const u8 *src, int srclen, char *dst) +{ + u32 ac = 0; + int bits = 0; + int i; + char *cp = dst; + + for (i = 0; i < srclen; i++) { + ac = (ac << 8) | src[i]; + bits += 8; + do { + bits -= 6; + *cp++ = base64url_table[(ac >> bits) & 0x3f]; + } while (bits >= 6); + } + if (bits) + *cp++ = base64url_table[(ac << (6 - bits)) & 0x3f]; + return cp - dst; +} + +/** + * llcrypt_base64url_decode() - base64url-decode a string + * @src: the string to decode. Doesn't need to be NUL-terminated. + * @srclen: the length of @src in bytes + * @dst: (output) the decoded binary data + * + * Decodes a string using base64url encoding, i.e. the "Base 64 Encoding with + * URL and Filename Safe Alphabet" specified by RFC 4648. '='-padding isn't + * accepted, nor are non-encoding characters such as whitespace. + * + * This implementation hasn't been optimized for performance. + * + * Return: the length of the resulting decoded binary data in bytes, + * or -1 if the string isn't a valid base64url string. + */ +static inline int llcrypt_base64url_decode(const char *src, int srclen, u8 *dst) +{ + u32 ac = 0; + int bits = 0; + int i; + u8 *bp = dst; + + for (i = 0; i < srclen; i++) { + const char *p = strchr(base64url_table, src[i]); + + if (p == NULL || src[i] == 0) + return -1; + ac = (ac << 6) | (p - base64url_table); + bits += 6; + if (bits >= 8) { + bits -= 8; + *bp++ = (u8)(ac >> bits); + } + } + if (ac & ((1 << bits) - 1)) + return -1; + return bp - dst; +} + +static inline int base64_chars(struct lustre_sb_info *lsi, int nbytes) +{ + if (!(lsi->lsi_flags & LSI_FILENAME_ENC_B64_OLD_CLI)) + return LLCRYPT_BASE64URL_CHARS(nbytes); + else + return LLCRYPT_BASE64_CHARS(nbytes); +} + +bool llcrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, + u32 max_len, u32 *encrypted_len_ret) +{ + const struct llcrypt_info *ci = llcrypt_info(inode); + struct crypto_skcipher *tfm = ci->ci_ctfm; + int padding = 4 << (llcrypt_policy_flags(&ci->ci_policy) & + LLCRYPT_POLICY_FLAGS_PAD_MASK); + u32 encrypted_len; + + if (orig_len > max_len) + return false; + if (tfm == NULL) { + *encrypted_len_ret = orig_len; + } else { + encrypted_len = max(orig_len, (u32)LL_CRYPTO_BLOCK_SIZE); + encrypted_len = round_up(encrypted_len, padding); + *encrypted_len_ret = min(encrypted_len, max_len); + } + return true; +} + +/** + * llcrypt_fname_alloc_buffer - allocate a buffer for presented filenames + * + * Allocate a buffer that is large enough to hold any decrypted or encoded + * filename (null-terminated), for the given maximum encrypted filename length. 
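+ *
+ * A minimal usage sketch (@inode and @iname are assumed to come from the
+ * caller; error handling elided):
+ *
+ *	struct llcrypt_str oname;
+ *
+ *	err = llcrypt_fname_alloc_buffer(inode, iname->len, &oname);
+ *	err = llcrypt_fname_disk_to_usr(inode, 0, 0, iname, &oname);
+ *	...
+ *	llcrypt_fname_free_buffer(&oname);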
+ * + * Return: 0 on success, -errno on failure + */ +int llcrypt_fname_alloc_buffer(const struct inode *inode, + u32 max_encrypted_len, + struct llcrypt_str *crypto_str) +{ + struct lustre_sb_info *lsi = s2lsi(inode->i_sb); + const u32 max_encoded_len = + max_t(u32, + base64_chars(lsi, LLCRYPT_FNAME_MAX_UNDIGESTED_SIZE), + 1 + base64_chars(lsi, sizeof(struct llcrypt_digested_name))); + u32 max_presented_len; + + max_presented_len = max(max_encoded_len, max_encrypted_len); + + crypto_str->name = kmalloc(max_presented_len + 1, GFP_NOFS); + if (!crypto_str->name) + return -ENOMEM; + crypto_str->len = max_presented_len; + return 0; +} +EXPORT_SYMBOL(llcrypt_fname_alloc_buffer); + +/** + * llcrypt_fname_free_buffer - free the buffer for presented filenames + * + * Free the buffer allocated by llcrypt_fname_alloc_buffer(). + */ +void llcrypt_fname_free_buffer(struct llcrypt_str *crypto_str) +{ + if (!crypto_str) + return; + kfree(crypto_str->name); + crypto_str->name = NULL; +} +EXPORT_SYMBOL(llcrypt_fname_free_buffer); + +/** + * llcrypt_fname_disk_to_usr() - converts a filename from disk space to user + * space + * + * The caller must have allocated sufficient memory for the @oname string. + * + * If the key is available, we'll decrypt the disk name; otherwise, we'll encode + * it for presentation. Short names are directly base64-encoded, while long + * names are encoded in llcrypt_digested_name format. + * + * Return: 0 on success, -errno on failure + */ +int llcrypt_fname_disk_to_usr(struct inode *inode, + u32 hash, u32 minor_hash, + const struct llcrypt_str *iname, + struct llcrypt_str *oname) +{ + int (*b64_encode)(const u8 *src, int srclen, char *dst); + struct lustre_sb_info *lsi = s2lsi(inode->i_sb); + const struct qstr qname = LLTR_TO_QSTR(iname); + struct llcrypt_digested_name digested_name; + + if (llcrypt_is_dot_dotdot(&qname)) { + oname->name[0] = '.'; + oname->name[iname->len - 1] = '.'; + oname->len = iname->len; + return 0; + } + + if (llcrypt_has_encryption_key(inode)) { + struct llcrypt_info *ci = llcrypt_info(inode); + struct crypto_skcipher *tfm = ci->ci_ctfm; + + if (tfm && iname->len < LL_CRYPTO_BLOCK_SIZE) + return -EUCLEAN; + + return fname_decrypt(inode, iname, oname); + } + + if (!llcrypt_policy_has_filename_enc(inode)) { + memcpy(oname->name, iname->name, iname->len); + oname->name[iname->len] = '\0'; + oname->len = iname->len; + return 0; + } + + if (!(lsi->lsi_flags & LSI_FILENAME_ENC_B64_OLD_CLI)) + b64_encode = llcrypt_base64url_encode; + else + b64_encode = llcrypt_base64_encode; + + if (iname->len <= LLCRYPT_FNAME_MAX_UNDIGESTED_SIZE) { + oname->len = b64_encode(iname->name, iname->len, oname->name); + return 0; + } + if (hash) { + digested_name.hash = hash; + digested_name.minor_hash = minor_hash; + } else { + digested_name.hash = 0; + digested_name.minor_hash = 0; + } + memcpy(digested_name.digest, + LLCRYPT_FNAME_DIGEST(iname->name, iname->len), + LLCRYPT_FNAME_DIGEST_SIZE); + if (!(lsi->lsi_flags & LSI_FILENAME_ENC_B64_OLD_CLI)) + oname->name[0] = LLCRYPT_DIGESTED_CHAR; + else + oname->name[0] = LLCRYPT_DIGESTED_CHAR_OLD; + oname->len = 1 + b64_encode((const u8 *)&digested_name, + sizeof(digested_name), oname->name + 1); + return 0; +} +EXPORT_SYMBOL(llcrypt_fname_disk_to_usr); + +/** + * llcrypt_setup_filename() - prepare to search a possibly encrypted directory + * @dir: the directory that will be searched + * @iname: the user-provided filename being searched for + * @lookup: 1 if we're allowed to proceed without the key because it's + * ->lookup() or 
we're finding the dir_entry for deletion; 0 if we cannot + * proceed without the key because we're going to create the dir_entry. + * @fname: the filename information to be filled in + * + * Given a user-provided filename @iname, this function sets @fname->disk_name + * to the name that would be stored in the on-disk directory entry, if possible. + * If the directory is unencrypted this is simply @iname. Else, if we have the + * directory's encryption key, then @iname is the plaintext, so we encrypt it to + * get the disk_name. + * + * Else, for keyless @lookup operations, @iname is the presented ciphertext, so + * we decode it to get either the ciphertext disk_name (for short names) or the + * llcrypt_digested_name (for long names). Non-@lookup operations will be + * impossible in this case, so we fail them with ENOKEY. + * + * If successful, llcrypt_free_filename() must be called later to clean up. + * + * Return: 0 on success, -errno on failure + */ +int llcrypt_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct llcrypt_name *fname) +{ + struct lustre_sb_info *lsi = s2lsi(dir->i_sb); + int ret; + int digested; + + memset(fname, 0, sizeof(struct llcrypt_name)); + fname->usr_fname = iname; + + if (!IS_ENCRYPTED(dir) || llcrypt_is_dot_dotdot(iname)) { + fname->disk_name.name = (unsigned char *)iname->name; + fname->disk_name.len = iname->len; + return 0; + } + ret = llcrypt_get_encryption_info(dir); + if (ret) + return ret; + + if (llcrypt_has_encryption_key(dir)) { + struct lustre_sb_info *lsi = s2lsi(dir->i_sb); + + if (!llcrypt_fname_encrypted_size(dir, iname->len, + lsi ? + lsi->lsi_cop->max_namelen : + NAME_MAX, + &fname->crypto_buf.len)) + return -ENAMETOOLONG; + fname->crypto_buf.name = kmalloc(fname->crypto_buf.len, + GFP_NOFS); + if (!fname->crypto_buf.name) + return -ENOMEM; + + ret = fname_encrypt(dir, iname, fname->crypto_buf.name, + fname->crypto_buf.len); + if (ret) + goto errout; + fname->disk_name.name = fname->crypto_buf.name; + fname->disk_name.len = fname->crypto_buf.len; + return 0; + } + if (!lookup) + return -ENOKEY; + + if (!llcrypt_policy_has_filename_enc(dir)) { + fname->disk_name.name = (unsigned char *)iname->name; + fname->disk_name.len = iname->len; + return 0; + } + + fname->is_ciphertext_name = true; + + /* + * We don't have the key and we are doing a lookup; decode the + * user-supplied name + */ + if ((!(lsi->lsi_flags & LSI_FILENAME_ENC_B64_OLD_CLI) && + iname->name[0] == LLCRYPT_DIGESTED_CHAR) || + ((lsi->lsi_flags & LSI_FILENAME_ENC_B64_OLD_CLI) && + iname->name[0] == LLCRYPT_DIGESTED_CHAR_OLD)) { + if (iname->len != 1 + base64_chars(lsi, + sizeof(struct llcrypt_digested_name))) { + return -ENOENT; + } + digested = 1; + } else { + if (iname->len > + base64_chars(lsi, LLCRYPT_FNAME_MAX_UNDIGESTED_SIZE)) + return -ENOENT; + digested = 0; + } + + fname->crypto_buf.name = + kmalloc(max_t(size_t, LLCRYPT_FNAME_MAX_UNDIGESTED_SIZE, + sizeof(struct llcrypt_digested_name)), + GFP_KERNEL); + if (fname->crypto_buf.name == NULL) + return -ENOMEM; + + if (!(lsi->lsi_flags & LSI_FILENAME_ENC_B64_OLD_CLI)) + ret = llcrypt_base64url_decode(iname->name + digested, + iname->len - digested, + fname->crypto_buf.name); + else + ret = llcrypt_base64_decode(iname->name + digested, + iname->len - digested, + fname->crypto_buf.name); + + if (ret < 0) { + ret = -ENOENT; + goto errout; + } + fname->crypto_buf.len = ret; + if (digested) { + const struct llcrypt_digested_name *n = + (const void *)fname->crypto_buf.name; + fname->hash = n->hash; + 
fname->minor_hash = n->minor_hash; + } else { + fname->disk_name.name = fname->crypto_buf.name; + fname->disk_name.len = fname->crypto_buf.len; + } + return 0; + +errout: + kfree(fname->crypto_buf.name); + return ret; +} +EXPORT_SYMBOL(llcrypt_setup_filename); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/crypto/hkdf.c b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/hkdf.c new file mode 100644 index 0000000000000..8874bcb0a527b --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/hkdf.c @@ -0,0 +1,188 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Implementation of HKDF ("HMAC-based Extract-and-Expand Key Derivation + * Function"), aka RFC 5869. See also the original paper (Krawczyk 2010): + * "Cryptographic Extraction and Key Derivation: The HKDF Scheme". + * + * This is used to derive keys from the llcrypt master keys. + * + * Copyright 2019 Google LLC + */ +/* + * Linux commit 219d54332a09 + * tags/v5.4 + */ + +#include +#ifdef HAVE_CRYPTO_SHA2_HEADER +#include +#else +#include +#endif +#include "llcrypt_private.h" + +/* + * HKDF supports any unkeyed cryptographic hash algorithm, but llcrypt uses + * SHA-512 because it is reasonably secure and efficient; and since it produces + * a 64-byte digest, deriving an AES-256-XTS key preserves all 64 bytes of + * entropy from the master key and requires only one iteration of HKDF-Expand. + */ +#define HKDF_HMAC_ALG "hmac(sha512)" +#define HKDF_HASHLEN SHA512_DIGEST_SIZE + +/* + * HKDF consists of two steps: + * + * 1. HKDF-Extract: extract a pseudorandom key of length HKDF_HASHLEN bytes from + * the input keying material and optional salt. + * 2. HKDF-Expand: expand the pseudorandom key into output keying material of + * any length, parameterized by an application-specific info string. + * + * HKDF-Extract can be skipped if the input is already a pseudorandom key of + * length HKDF_HASHLEN bytes. However, cipher modes other than AES-256-XTS take + * shorter keys, and we don't want to force users of those modes to provide + * unnecessarily long master keys. Thus llcrypt still does HKDF-Extract. No + * salt is used, since llcrypt master keys should already be pseudorandom and + * there's no way to persist a random salt per master key from kernel mode. + */ + +/* HKDF-Extract (RFC 5869 section 2.2), unsalted */ +static int hkdf_extract(struct crypto_shash *hmac_tfm, const u8 *ikm, + unsigned int ikmlen, u8 prk[HKDF_HASHLEN]) +{ + static const u8 default_salt[HKDF_HASHLEN]; + SHASH_DESC_ON_STACK(desc, hmac_tfm); + int err; + + err = crypto_shash_setkey(hmac_tfm, default_salt, HKDF_HASHLEN); + if (err) + return err; + + desc->tfm = hmac_tfm; + err = crypto_shash_digest(desc, ikm, ikmlen, prk); + shash_desc_zero(desc); + return err; +} + +/* + * Compute HKDF-Extract using the given master key as the input keying material, + * and prepare an HMAC transform object keyed by the resulting pseudorandom key. + * + * Afterwards, the keyed HMAC transform object can be used for HKDF-Expand many + * times without having to recompute HKDF-Extract each time. 
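+ *
+ * A lifecycle sketch (master_key, nonce and the sizes are placeholders;
+ * error handling elided):
+ *
+ *	struct llcrypt_hkdf hkdf;
+ *	u8 okm[32];
+ *
+ *	err = llcrypt_init_hkdf(&hkdf, master_key, master_key_size);
+ *	err = llcrypt_hkdf_expand(&hkdf, HKDF_CONTEXT_PER_FILE_KEY,
+ *				  nonce, nonce_size, okm, sizeof(okm));
+ *	llcrypt_destroy_hkdf(&hkdf);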
+ */ +int llcrypt_init_hkdf(struct llcrypt_hkdf *hkdf, const u8 *master_key, + unsigned int master_key_size) +{ + struct crypto_shash *hmac_tfm; + u8 prk[HKDF_HASHLEN]; + int err; + + hmac_tfm = crypto_alloc_shash(HKDF_HMAC_ALG, 0, 0); + if (IS_ERR(hmac_tfm)) { + llcrypt_err(NULL, "Error allocating " HKDF_HMAC_ALG ": %ld", + PTR_ERR(hmac_tfm)); + return PTR_ERR(hmac_tfm); + } + + if (WARN_ON(crypto_shash_digestsize(hmac_tfm) != sizeof(prk))) { + err = -EINVAL; + goto err_free_tfm; + } + + err = hkdf_extract(hmac_tfm, master_key, master_key_size, prk); + if (err) + goto err_free_tfm; + + err = crypto_shash_setkey(hmac_tfm, prk, sizeof(prk)); + if (err) + goto err_free_tfm; + + hkdf->hmac_tfm = hmac_tfm; + goto out; + +err_free_tfm: + crypto_free_shash(hmac_tfm); +out: + memzero_explicit(prk, sizeof(prk)); + return err; +} + +/* + * HKDF-Expand (RFC 5869 section 2.3). This expands the pseudorandom key, which + * was already keyed into 'hkdf->hmac_tfm' by llcrypt_init_hkdf(), into 'okmlen' + * bytes of output keying material parameterized by the application-specific + * 'info' of length 'infolen' bytes, prefixed by "llcrypt\0" and the 'context' + * byte. This is thread-safe and may be called by multiple threads in parallel. + * + * ('context' isn't part of the HKDF specification; it's just a prefix llcrypt + * adds to its application-specific info strings to guarantee that it doesn't + * accidentally repeat an info string when using HKDF for different purposes.) + */ +int llcrypt_hkdf_expand(struct llcrypt_hkdf *hkdf, u8 context, + const u8 *info, unsigned int infolen, + u8 *okm, unsigned int okmlen) +{ + SHASH_DESC_ON_STACK(desc, hkdf->hmac_tfm); + u8 prefix[9]; + unsigned int i; + int err; + const u8 *prev = NULL; + u8 counter = 1; + u8 tmp[HKDF_HASHLEN]; + + if (WARN_ON(okmlen > 255 * HKDF_HASHLEN)) + return -EINVAL; + + desc->tfm = hkdf->hmac_tfm; + + memcpy(prefix, "fscrypt\0", 8); + prefix[8] = context; + + for (i = 0; i < okmlen; i += HKDF_HASHLEN) { + + err = crypto_shash_init(desc); + if (err) + goto out; + + if (prev) { + err = crypto_shash_update(desc, prev, HKDF_HASHLEN); + if (err) + goto out; + } + + err = crypto_shash_update(desc, prefix, sizeof(prefix)); + if (err) + goto out; + + err = crypto_shash_update(desc, info, infolen); + if (err) + goto out; + + BUILD_BUG_ON(sizeof(counter) != 1); + if (okmlen - i < HKDF_HASHLEN) { + err = crypto_shash_finup(desc, &counter, 1, tmp); + if (err) + goto out; + memcpy(&okm[i], tmp, okmlen - i); + memzero_explicit(tmp, sizeof(tmp)); + } else { + err = crypto_shash_finup(desc, &counter, 1, &okm[i]); + if (err) + goto out; + } + counter++; + prev = &okm[i]; + } + err = 0; +out: + if (unlikely(err)) + memzero_explicit(okm, okmlen); /* so caller doesn't need to */ + shash_desc_zero(desc); + return err; +} + +void llcrypt_destroy_hkdf(struct llcrypt_hkdf *hkdf) +{ + crypto_free_shash(hkdf->hmac_tfm); +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/crypto/hooks.c b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/hooks.c new file mode 100644 index 0000000000000..36399511b7fb0 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/hooks.c @@ -0,0 +1,322 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * fs/crypto/hooks.c + * + * Encryption hooks for higher-level filesystem operations. 
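+ *
+ * These are meant to be called by the filesystem from the corresponding
+ * VFS operations, e.g. llcrypt_file_open() from ->open(),
+ * __llcrypt_prepare_link() from ->link(), __llcrypt_prepare_rename()
+ * from ->rename(), and __llcrypt_prepare_lookup() from ->lookup().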
+ */ +/* + * Linux commit 219d54332a09 + * tags/v5.4 + */ + +#include "llcrypt_private.h" + +/** + * llcrypt_file_open - prepare to open a possibly-encrypted regular file + * @inode: the inode being opened + * @filp: the struct file being set up + * + * Currently, an encrypted regular file can only be opened if its encryption key + * is available; access to the raw encrypted contents is not supported. + * Therefore, we first set up the inode's encryption key (if not already done) + * and return an error if it's unavailable. + * + * We also verify that if the parent directory (from the path via which the file + * is being opened) is encrypted, then the inode being opened uses the same + * encryption policy. This is needed as part of the enforcement that all files + * in an encrypted directory tree use the same encryption policy, as a + * protection against certain types of offline attacks. Note that this check is + * needed even when opening an *unencrypted* file, since it's forbidden to have + * an unencrypted file in an encrypted directory. + * + * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code + */ +int llcrypt_file_open(struct inode *inode, struct file *filp) +{ + int err; + struct dentry *dir; + + err = llcrypt_require_key(inode); + if (err) + return err; + + dir = dget_parent(file_dentry(filp)); + if (IS_ENCRYPTED(d_inode(dir)) && + !llcrypt_has_permitted_context(d_inode(dir), inode)) { + llcrypt_warn(inode, + "Inconsistent encryption context (parent directory: %lu)", + d_inode(dir)->i_ino); + err = -EPERM; + } + dput(dir); + return err; +} +EXPORT_SYMBOL_GPL(llcrypt_file_open); + +int __llcrypt_prepare_link(struct inode *inode, struct inode *dir, + struct dentry *dentry) +{ + int err; + + err = llcrypt_require_key(dir); + if (err) + return err; + + /* ... in case we looked up ciphertext name before key was added */ + if (dentry->d_flags & DCACHE_ENCRYPTED_NAME) + return -ENOKEY; + + if (!llcrypt_has_permitted_context(dir, inode)) + return -EXDEV; + + return 0; +} +EXPORT_SYMBOL_GPL(__llcrypt_prepare_link); + +int __llcrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + int err; + + err = llcrypt_require_key(old_dir); + if (err) + return err; + + err = llcrypt_require_key(new_dir); + if (err) + return err; + + /* ... 
in case we looked up ciphertext name(s) before key was added */ + if ((old_dentry->d_flags | new_dentry->d_flags) & + DCACHE_ENCRYPTED_NAME) + return -ENOKEY; + + if (old_dir != new_dir) { + if (IS_ENCRYPTED(new_dir) && + !llcrypt_has_permitted_context(new_dir, + d_inode(old_dentry))) + return -EXDEV; + + if ((flags & RENAME_EXCHANGE) && + IS_ENCRYPTED(old_dir) && + !llcrypt_has_permitted_context(old_dir, + d_inode(new_dentry))) + return -EXDEV; + } + return 0; +} +EXPORT_SYMBOL_GPL(__llcrypt_prepare_rename); + +int __llcrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, + struct llcrypt_name *fname) +{ + int err = llcrypt_setup_filename(dir, &dentry->d_name, 1, fname); + + if (err && err != -ENOENT) + return err; + + if (fname->is_ciphertext_name) { + spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_ENCRYPTED_NAME; + spin_unlock(&dentry->d_lock); + d_set_d_op(dentry, &llcrypt_d_ops); + } + return err; +} +EXPORT_SYMBOL_GPL(__llcrypt_prepare_lookup); + +int __llcrypt_prepare_symlink(struct inode *dir, unsigned int len, + unsigned int max_len, + struct llcrypt_str *disk_link) +{ + int err; + + /* + * To calculate the size of the encrypted symlink target we need to know + * the amount of NUL padding, which is determined by the flags set in + * the encryption policy which will be inherited from the directory. + * The easiest way to get access to this is to just load the directory's + * llcrypt_info, since we'll need it to create the dir_entry anyway. + * + * Note: in test_dummy_encryption mode, @dir may be unencrypted. + */ + err = llcrypt_get_encryption_info(dir); + if (err) + return err; + if (!llcrypt_has_encryption_key(dir)) + return -ENOKEY; + + /* + * Calculate the size of the encrypted symlink and verify it won't + * exceed max_len. Note that for historical reasons, encrypted symlink + * targets are prefixed with the ciphertext length, despite this + * actually being redundant with i_size. This decreases by 2 bytes the + * longest symlink target we can accept. + * + * We could recover 1 byte by not counting a null terminator, but + * counting it (even though it is meaningless for ciphertext) is simpler + * for now since filesystems will assume it is there and subtract it. + */ + if (!llcrypt_fname_encrypted_size(dir, len, + max_len - sizeof(struct llcrypt_symlink_data), + &disk_link->len)) + return -ENAMETOOLONG; + disk_link->len += sizeof(struct llcrypt_symlink_data); + + disk_link->name = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(__llcrypt_prepare_symlink); + +int __llcrypt_encrypt_symlink(struct inode *inode, const char *target, + unsigned int len, struct llcrypt_str *disk_link) +{ + int err; + struct qstr iname = QSTR_INIT(target, len); + struct llcrypt_symlink_data *sd; + unsigned int ciphertext_len; + + if (!llcrypt_policy_has_filename_enc(inode)) + return 0; + + err = llcrypt_require_key(inode); + if (err) + return err; + + if (disk_link->name) { + /* filesystem-provided buffer */ + sd = (struct llcrypt_symlink_data *)disk_link->name; + } else { + sd = kmalloc(disk_link->len, GFP_NOFS); + if (!sd) + return -ENOMEM; + } + ciphertext_len = disk_link->len - sizeof(*sd); + sd->len = cpu_to_le16(ciphertext_len); + + err = fname_encrypt(inode, &iname, sd->encrypted_path, ciphertext_len); + if (err) + goto err_free_sd; + + /* + * Null-terminating the ciphertext doesn't make sense, but we still + * count the null terminator in the length, so we might as well + * initialize it just in case the filesystem writes it out. 
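+ *
+ * For reference, the resulting on-disk layout of the symlink target
+ * (struct llcrypt_symlink_data) is:
+ *
+ *	[ le16 len ][ ciphertext, len bytes ][ NUL ]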
+ */ + sd->encrypted_path[ciphertext_len] = '\0'; + + /* Cache the plaintext symlink target for later use by get_link() */ + err = -ENOMEM; + inode->i_link = kmemdup(target, len + 1, GFP_NOFS); + if (!inode->i_link) + goto err_free_sd; + + if (!disk_link->name) + disk_link->name = (unsigned char *)sd; + return 0; + +err_free_sd: + if (!disk_link->name) + kfree(sd); + return err; +} +EXPORT_SYMBOL_GPL(__llcrypt_encrypt_symlink); + +/** + * llcrypt_get_symlink - get the target of an encrypted symlink + * @inode: the symlink inode + * @caddr: the on-disk contents of the symlink + * @max_size: size of @caddr buffer + * @done: if successful, will be set up to free the returned target if needed + * + * If the symlink's encryption key is available, we decrypt its target. + * Otherwise, we encode its target for presentation. + * + * This may sleep, so the filesystem must have dropped out of RCU mode already. + * + * Return: the presentable symlink target or an ERR_PTR() + */ +const char *llcrypt_get_symlink(struct inode *inode, const void *caddr, + unsigned int max_size, + struct delayed_call *done) +{ + const struct llcrypt_symlink_data *sd; + struct llcrypt_str cstr, pstr; + bool has_key; + int err; + + /* This is for encrypted symlinks only */ + if (WARN_ON(!IS_ENCRYPTED(inode))) + return ERR_PTR(-EINVAL); + + /* If the decrypted target is already cached, just return it. */ + pstr.name = READ_ONCE(inode->i_link); + if (pstr.name) + return pstr.name; + + /* + * Try to set up the symlink's encryption key, but we can continue + * regardless of whether the key is available or not. + */ + err = llcrypt_get_encryption_info(inode); + if (err) + return ERR_PTR(err); + has_key = llcrypt_has_encryption_key(inode); + + /* + * For historical reasons, encrypted symlink targets are prefixed with + * the ciphertext length, even though this is redundant with i_size. + */ + + if (!llcrypt_policy_has_filename_enc(inode)) { + cstr.name = (unsigned char *)caddr; + cstr.len = strlen(cstr.name); + + if (cstr.len == 0) + return ERR_PTR(-EUCLEAN); + } else { + if (max_size < sizeof(*sd)) + return ERR_PTR(-EUCLEAN); + sd = caddr; + cstr.name = (unsigned char *)sd->encrypted_path; + cstr.len = le16_to_cpu(sd->len); + + if (cstr.len == 0) + return ERR_PTR(-EUCLEAN); + + if (cstr.len + sizeof(*sd) - 1 > max_size) + return ERR_PTR(-EUCLEAN); + } + + err = llcrypt_fname_alloc_buffer(inode, cstr.len, &pstr); + if (err) + return ERR_PTR(err); + + err = llcrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); + if (err) + goto err_kfree; + + err = -EUCLEAN; + if (pstr.name[0] == '\0') + goto err_kfree; + + pstr.name[pstr.len] = '\0'; + + /* + * Cache decrypted symlink targets in i_link for later use. Don't cache + * symlink targets encoded without the key, since those become outdated + * once the key is added. This pairs with the READ_ONCE() above and in + * the VFS path lookup code. 
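+ *
+ * Only one task can win the cmpxchg() and publish its buffer in
+ * ->i_link; every other task, and the keyless case, hands its copy to
+ * the delayed call so it is freed once the path walk is done with it.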
+ */ + if (!has_key || + cmpxchg_release(&inode->i_link, NULL, pstr.name) != NULL) + set_delayed_call(done, kfree_link, pstr.name); + + return pstr.name; + +err_kfree: + kfree(pstr.name); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(llcrypt_get_symlink); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/crypto/keyring.c b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/keyring.c new file mode 100644 index 0000000000000..358dda2591245 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/keyring.c @@ -0,0 +1,1012 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Filesystem-level keyring for llcrypt + * + * Copyright 2019 Google LLC + */ + +/* + * This file implements management of llcrypt master keys in the + * filesystem-level keyring, including the ioctls: + * + * - LL_IOC_ADD_ENCRYPTION_KEY + * - LL_IOC_REMOVE_ENCRYPTION_KEY + * - LL_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS + * - LL_IOC_GET_ENCRYPTION_KEY_STATUS + * + * See the "User API" section of Documentation/filesystems/llcrypt.rst for more + * information about these ioctls. + */ +/* + * Linux commit 219d54332a09 + * tags/v5.4 + */ + +#include +#include +#include +#include + +#include "llcrypt_private.h" + +static void wipe_master_key_secret(struct llcrypt_master_key_secret *secret) +{ + llcrypt_destroy_hkdf(&secret->hkdf); + memzero_explicit(secret, sizeof(*secret)); +} + +static void move_master_key_secret(struct llcrypt_master_key_secret *dst, + struct llcrypt_master_key_secret *src) +{ + memcpy(dst, src, sizeof(*dst)); + memzero_explicit(src, sizeof(*src)); +} + +static void free_master_key(struct llcrypt_master_key *mk) +{ + size_t i; + + wipe_master_key_secret(&mk->mk_secret); + + for (i = 0; i < ARRAY_SIZE(mk->mk_mode_keys); i++) + crypto_free_skcipher(mk->mk_mode_keys[i]); + + key_put(mk->mk_users); + kfree_sensitive(mk); +} + +static inline bool valid_key_spec(const struct llcrypt_key_specifier *spec) +{ + if (spec->__reserved) + return false; + return master_key_spec_len(spec) != 0; +} + +static int llcrypt_key_instantiate(struct key *key, + struct key_preparsed_payload *prep) +{ + key->payload.data[0] = (struct llcrypt_master_key *)prep->data; + return 0; +} + +static void llcrypt_key_destroy(struct key *key) +{ + free_master_key(key->payload.data[0]); +} + +static void llcrypt_key_describe(const struct key *key, struct seq_file *m) +{ + seq_puts(m, key->description); + + if (key_is_positive(key)) { + const struct llcrypt_master_key *mk = key->payload.data[0]; + + if (!is_master_key_secret_present(&mk->mk_secret)) + seq_puts(m, ": secret removed"); + } +} + +/* + * Type of key in ->lsi_master_keys. Each key of this type represents a master + * key which has been added to the filesystem. Its payload is a + * 'struct llcrypt_master_key'. The "." prefix in the key type name prevents + * users from adding keys of this type via the keyrings syscalls rather than via + * the intended method of LL_IOC_ADD_ENCRYPTION_KEY. + */ +static struct key_type key_type_llcrypt = { + .name = "._llcrypt", + .instantiate = llcrypt_key_instantiate, + .destroy = llcrypt_key_destroy, + .describe = llcrypt_key_describe, +}; + +static int llcrypt_user_key_instantiate(struct key *key, + struct key_preparsed_payload *prep) +{ + /* + * We just charge LLCRYPT_MAX_KEY_SIZE bytes to the user's key quota for + * each key, regardless of the exact key size. The amount of memory + * actually used is greater than the size of the raw key anyway. 
+ */ + return key_payload_reserve(key, LLCRYPT_MAX_KEY_SIZE); +} + +static void llcrypt_user_key_describe(const struct key *key, struct seq_file *m) +{ + seq_puts(m, key->description); +} + +/* + * Type of key in ->mk_users. Each key of this type represents a particular + * user who has added a particular master key. + * + * Note that the name of this key type really should be something like + * ".llcrypt-user" instead of simply ".llcrypt". But the shorter name is chosen + * mainly for simplicity of presentation in /proc/keys when read by a non-root + * user. And it is expected to be rare that a key is actually added by multiple + * users, since users should keep their encryption keys confidential. + */ +static struct key_type key_type_llcrypt_user = { + .name = ".llcrypt", + .instantiate = llcrypt_user_key_instantiate, + .describe = llcrypt_user_key_describe, +}; + +/* Search ->lsi_master_keys or ->mk_users */ +static struct key *search_llcrypt_keyring(struct key *keyring, + struct key_type *type, + const char *description) +{ + /* + * We need to mark the keyring reference as "possessed" so that we + * acquire permission to search it, via the KEY_POS_SEARCH permission. + */ + key_ref_t keyref = make_key_ref(keyring, true /* possessed */); + +#ifdef HAVE_KEYRING_SEARCH_4ARGS + keyref = keyring_search(keyref, type, description, false); +#else + keyref = keyring_search(keyref, type, description); +#endif + if (IS_ERR(keyref)) { + if (PTR_ERR(keyref) == -EAGAIN || /* not found */ + PTR_ERR(keyref) == -EKEYREVOKED) /* recently invalidated */ + keyref = ERR_PTR(-ENOKEY); + return ERR_CAST(keyref); + } + return key_ref_to_ptr(keyref); +} + +#define LLCRYPT_FS_KEYRING_DESCRIPTION_SIZE \ + (CONST_STRLEN("llcrypt-") + sizeof_field(struct super_block, s_id)) + +#define LLCRYPT_MK_DESCRIPTION_SIZE (2 * LLCRYPT_KEY_IDENTIFIER_SIZE + 1) + +#define LLCRYPT_MK_USERS_DESCRIPTION_SIZE \ + (CONST_STRLEN("llcrypt-") + 2 * LLCRYPT_KEY_IDENTIFIER_SIZE + \ + CONST_STRLEN("-users") + 1) + +#define LLCRYPT_MK_USER_DESCRIPTION_SIZE \ + (2 * LLCRYPT_KEY_IDENTIFIER_SIZE + CONST_STRLEN(".uid.") + 10 + 1) + +static void format_fs_keyring_description( + char description[LLCRYPT_FS_KEYRING_DESCRIPTION_SIZE], + const struct super_block *sb) +{ + sprintf(description, "llcrypt-%s", sb->s_id); +} + +static void format_mk_description( + char description[LLCRYPT_MK_DESCRIPTION_SIZE], + const struct llcrypt_key_specifier *mk_spec) +{ + sprintf(description, "%*phN", + master_key_spec_len(mk_spec), (u8 *)&mk_spec->u); +} + +static void format_mk_users_keyring_description( + char description[LLCRYPT_MK_USERS_DESCRIPTION_SIZE], + const u8 mk_identifier[LLCRYPT_KEY_IDENTIFIER_SIZE]) +{ + sprintf(description, "llcrypt-%*phN-users", + LLCRYPT_KEY_IDENTIFIER_SIZE, mk_identifier); +} + +static void format_mk_user_description( + char description[LLCRYPT_MK_USER_DESCRIPTION_SIZE], + const u8 mk_identifier[LLCRYPT_KEY_IDENTIFIER_SIZE]) +{ + + sprintf(description, "%*phN.uid.%u", LLCRYPT_KEY_IDENTIFIER_SIZE, + mk_identifier, __kuid_val(current_fsuid())); +} + +/* Create ->lsi_master_keys if needed. Synchronized by llcrypt_add_key_mutex. 
*/ +static int allocate_filesystem_keyring(struct super_block *sb) +{ + char description[LLCRYPT_FS_KEYRING_DESCRIPTION_SIZE]; + struct key *keyring; + struct lustre_sb_info *lsi = s2lsi(sb); + + if (!lsi) + return -EINVAL; + + if (lsi->lsi_master_keys) + return 0; + + format_fs_keyring_description(description, sb); + keyring = keyring_alloc(description, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + current_cred(), KEY_POS_SEARCH | + KEY_USR_SEARCH | KEY_USR_READ | KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); + if (IS_ERR(keyring)) + return PTR_ERR(keyring); + + /* Pairs with READ_ONCE() in llcrypt_find_master_key() */ + smp_store_release(&lsi->lsi_master_keys, keyring); + return 0; +} + +void llcrypt_sb_free(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + + if (lsi != NULL) { + key_put(lsi->lsi_master_keys); + lsi->lsi_master_keys = NULL; + } +} +EXPORT_SYMBOL(llcrypt_sb_free); + +/* + * Find the specified master key in ->lsi_master_keys. + * Returns ERR_PTR(-ENOKEY) if not found. + */ +struct key *llcrypt_find_master_key(struct super_block *sb, + const struct llcrypt_key_specifier *mk_spec) +{ + struct key *keyring; + char description[LLCRYPT_MK_DESCRIPTION_SIZE]; + struct lustre_sb_info *lsi = s2lsi(sb); + + if (!lsi) + return ERR_PTR(-EINVAL); + + /* pairs with smp_store_release() in allocate_filesystem_keyring() */ + keyring = READ_ONCE(lsi->lsi_master_keys); + if (keyring == NULL) + return ERR_PTR(-ENOKEY); /* No keyring yet, so no keys yet. */ + + format_mk_description(description, mk_spec); + return search_llcrypt_keyring(keyring, &key_type_llcrypt, description); +} + +static int allocate_master_key_users_keyring(struct llcrypt_master_key *mk) +{ + char description[LLCRYPT_MK_USERS_DESCRIPTION_SIZE]; + struct key *keyring; + + format_mk_users_keyring_description(description, + mk->mk_spec.u.identifier); + keyring = keyring_alloc(description, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + current_cred(), KEY_POS_SEARCH | + KEY_USR_SEARCH | KEY_USR_READ | KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); + if (IS_ERR(keyring)) + return PTR_ERR(keyring); + + mk->mk_users = keyring; + return 0; +} + +/* + * Find the current user's "key" in the master key's ->mk_users. + * Returns ERR_PTR(-ENOKEY) if not found. + */ +static struct key *find_master_key_user(struct llcrypt_master_key *mk) +{ + char description[LLCRYPT_MK_USER_DESCRIPTION_SIZE]; + + format_mk_user_description(description, mk->mk_spec.u.identifier); + return search_llcrypt_keyring(mk->mk_users, &key_type_llcrypt_user, + description); +} + +/* + * Give the current user a "key" in ->mk_users. This charges the user's quota + * and marks the master key as added by the current user, so that it cannot be + * removed by another user with the key. Either the master key's key->sem must + * be held for write, or the master key must be still undergoing initialization. + */ +static int add_master_key_user(struct llcrypt_master_key *mk) +{ + char description[LLCRYPT_MK_USER_DESCRIPTION_SIZE]; + struct key *mk_user; + int err; + + format_mk_user_description(description, mk->mk_spec.u.identifier); + mk_user = key_alloc(&key_type_llcrypt_user, description, + current_fsuid(), current_gid(), current_cred(), + KEY_POS_SEARCH | KEY_USR_VIEW, 0, NULL); + if (IS_ERR(mk_user)) + return PTR_ERR(mk_user); + + err = key_instantiate_and_link(mk_user, NULL, 0, mk->mk_users, NULL); + key_put(mk_user); + return err; +} + +/* + * Remove the current user's "key" from ->mk_users. + * The master key's key->sem must be held for write. 
+ * + * Returns 0 if removed, -ENOKEY if not found, or another -errno code. + */ +static int remove_master_key_user(struct llcrypt_master_key *mk) +{ + struct key *mk_user; + int err; + + mk_user = find_master_key_user(mk); + if (IS_ERR(mk_user)) + return PTR_ERR(mk_user); + err = key_unlink(mk->mk_users, mk_user); + key_put(mk_user); + return err; +} + +/* + * Allocate a new llcrypt_master_key which contains the given secret, set it as + * the payload of a new 'struct key' of type llcrypt, and link the 'struct key' + * into the given keyring. Synchronized by llcrypt_add_key_mutex. + */ +static int add_new_master_key(struct llcrypt_master_key_secret *secret, + const struct llcrypt_key_specifier *mk_spec, + struct key *keyring) +{ + struct llcrypt_master_key *mk; + char description[LLCRYPT_MK_DESCRIPTION_SIZE]; + struct key *key; + int err; + + mk = kzalloc(sizeof(*mk), GFP_KERNEL); + if (!mk) + return -ENOMEM; + + mk->mk_spec = *mk_spec; + + move_master_key_secret(&mk->mk_secret, secret); + init_rwsem(&mk->mk_secret_sem); + + refcount_set(&mk->mk_refcount, 1); /* secret is present */ + INIT_LIST_HEAD(&mk->mk_decrypted_inodes); + spin_lock_init(&mk->mk_decrypted_inodes_lock); + + if (mk_spec->type == LLCRYPT_KEY_SPEC_TYPE_IDENTIFIER) { + err = allocate_master_key_users_keyring(mk); + if (err) + goto out_free_mk; + err = add_master_key_user(mk); + if (err) + goto out_free_mk; + } + + /* + * Note that we don't charge this key to anyone's quota, since when + * ->mk_users is in use those keys are charged instead, and otherwise + * (when ->mk_users isn't in use) only root can add these keys. + */ + format_mk_description(description, mk_spec); + key = key_alloc(&key_type_llcrypt, description, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), + KEY_POS_SEARCH | KEY_USR_SEARCH | KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA, NULL); + if (IS_ERR(key)) { + err = PTR_ERR(key); + goto out_free_mk; + } + err = key_instantiate_and_link(key, mk, sizeof(*mk), keyring, NULL); + key_put(key); + if (err) + goto out_free_mk; + + return 0; + +out_free_mk: + free_master_key(mk); + return err; +} + +#define KEY_DEAD 1 + +static int add_existing_master_key(struct llcrypt_master_key *mk, + struct llcrypt_master_key_secret *secret) +{ + struct key *mk_user; + bool rekey; + int err; + + /* + * If the current user is already in ->mk_users, then there's nothing to + * do. (Not applicable for v1 policy keys, which have NULL ->mk_users.) + */ + if (mk->mk_users) { + mk_user = find_master_key_user(mk); + if (mk_user != ERR_PTR(-ENOKEY)) { + if (IS_ERR(mk_user)) + return PTR_ERR(mk_user); + key_put(mk_user); + return 0; + } + } + + /* If we'll be re-adding ->mk_secret, try to take the reference. */ + rekey = !is_master_key_secret_present(&mk->mk_secret); + if (rekey && !refcount_inc_not_zero(&mk->mk_refcount)) + return KEY_DEAD; + + /* Add the current user to ->mk_users, if applicable. */ + if (mk->mk_users) { + err = add_master_key_user(mk); + if (err) { + if (rekey && refcount_dec_and_test(&mk->mk_refcount)) + return KEY_DEAD; + return err; + } + } + + /* Re-add the secret if needed. 
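+ * ->mk_secret_sem is taken for write here to synchronize with readers
+ * that hold it for read while deriving per-file keys from ->mk_secret.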
*/ + if (rekey) { + down_write(&mk->mk_secret_sem); + move_master_key_secret(&mk->mk_secret, secret); + up_write(&mk->mk_secret_sem); + } + return 0; +} + +static int add_master_key(struct super_block *sb, + struct llcrypt_master_key_secret *secret, + const struct llcrypt_key_specifier *mk_spec) +{ + static DEFINE_MUTEX(llcrypt_add_key_mutex); + struct key *key; + struct lustre_sb_info *lsi = s2lsi(sb); + int err; + + if (!lsi) + return -EINVAL; + + mutex_lock(&llcrypt_add_key_mutex); /* serialize find + link */ +retry: + key = llcrypt_find_master_key(sb, mk_spec); + if (IS_ERR(key)) { + err = PTR_ERR(key); + if (err != -ENOKEY) + goto out_unlock; + /* Didn't find the key in ->lsi_master_keys. Add it. */ + err = allocate_filesystem_keyring(sb); + if (err) + goto out_unlock; + err = add_new_master_key(secret, mk_spec, + lsi->lsi_master_keys); + } else { + /* + * Found the key in ->lsi_master_keys. Re-add the secret if + * needed, and add the user to ->mk_users if needed. + */ + down_write(&key->sem); + err = add_existing_master_key(key->payload.data[0], secret); + up_write(&key->sem); + if (err == KEY_DEAD) { + /* Key being removed or needs to be removed */ + key_invalidate(key); + key_put(key); + goto retry; + } + key_put(key); + } +out_unlock: + mutex_unlock(&llcrypt_add_key_mutex); + return err; +} + +/* + * Add a master encryption key to the filesystem, causing all files which were + * encrypted with it to appear "unlocked" (decrypted) when accessed. + * + * When adding a key for use by v1 encryption policies, this ioctl is + * privileged, and userspace must provide the 'key_descriptor'. + * + * When adding a key for use by v2+ encryption policies, this ioctl is + * unprivileged. This is needed, in general, to allow non-root users to use + * encryption without encountering the visibility problems of process-subscribed + * keyrings and the inability to properly remove keys. This works by having + * each key identified by its cryptographically secure hash --- the + * 'key_identifier'. The cryptographic hash ensures that a malicious user + * cannot add the wrong key for a given identifier. Furthermore, each added key + * is charged to the appropriate user's quota for the keyrings service, which + * prevents a malicious user from adding too many keys. Finally, we forbid a + * user from removing a key while other users have added it too, which prevents + * a user who knows another user's key from causing a denial-of-service by + * removing it at an inopportune time. (We tolerate that a user who knows a key + * can prevent other users from removing it.) + * + * For more details, see the "LL_IOC_ADD_ENCRYPTION_KEY" section of + * Documentation/filesystems/llcrypt.rst. 
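+ *
+ * A minimal userspace sketch (fd, key_bytes and key_size are
+ * placeholders, and the fscrypt-style UAPI layout with the raw key
+ * trailing the fixed-size header is assumed):
+ *
+ *	struct llcrypt_add_key_arg *arg;
+ *
+ *	arg = calloc(1, sizeof(*arg) + key_size);
+ *	arg->key_spec.type = LLCRYPT_KEY_SPEC_TYPE_IDENTIFIER;
+ *	arg->raw_size = key_size;
+ *	memcpy(arg->raw, key_bytes, key_size);
+ *	ioctl(fd, LL_IOC_ADD_ENCRYPTION_KEY, arg);
+ *
+ * On success, arg->key_spec.u.identifier then holds the key identifier
+ * computed by the kernel.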
+ */ +int llcrypt_ioctl_add_key(struct file *filp, void __user *_uarg) +{ + struct super_block *sb = file_inode(filp)->i_sb; + struct llcrypt_add_key_arg __user *uarg = _uarg; + struct llcrypt_add_key_arg arg; + struct llcrypt_master_key_secret secret; + int err; + + if (copy_from_user(&arg, uarg, sizeof(arg))) + return -EFAULT; + + if (!valid_key_spec(&arg.key_spec)) + return -EINVAL; + + if (arg.raw_size < LLCRYPT_MIN_KEY_SIZE || + arg.raw_size > LLCRYPT_MAX_KEY_SIZE) + return -EINVAL; + + if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved))) + return -EINVAL; + + memset(&secret, 0, sizeof(secret)); + secret.size = arg.raw_size; + err = -EFAULT; + if (copy_from_user(secret.raw, uarg->raw, secret.size)) + goto out_wipe_secret; + + switch (arg.key_spec.type) { + case LLCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: + /* + * Only root can add keys that are identified by an arbitrary + * descriptor rather than by a cryptographic hash --- since + * otherwise a malicious user could add the wrong key. + */ + err = -EACCES; + if (!capable(CAP_SYS_ADMIN)) + goto out_wipe_secret; + break; + case LLCRYPT_KEY_SPEC_TYPE_IDENTIFIER: + err = llcrypt_init_hkdf(&secret.hkdf, secret.raw, secret.size); + if (err) + goto out_wipe_secret; + + /* + * Now that the HKDF context is initialized, the raw key is no + * longer needed. + */ + memzero_explicit(secret.raw, secret.size); + + /* Calculate the key identifier and return it to userspace. */ + err = llcrypt_hkdf_expand(&secret.hkdf, + HKDF_CONTEXT_KEY_IDENTIFIER, + NULL, 0, arg.key_spec.u.identifier, + LLCRYPT_KEY_IDENTIFIER_SIZE); + if (err) + goto out_wipe_secret; + err = -EFAULT; + if (copy_to_user(uarg->key_spec.u.identifier, + arg.key_spec.u.identifier, + LLCRYPT_KEY_IDENTIFIER_SIZE)) + goto out_wipe_secret; + break; + default: + WARN_ON(1); + err = -EINVAL; + goto out_wipe_secret; + } + + err = add_master_key(sb, &secret, &arg.key_spec); +out_wipe_secret: + wipe_master_key_secret(&secret); + return err; +} +EXPORT_SYMBOL_GPL(llcrypt_ioctl_add_key); + +/* + * Verify that the current user has added a master key with the given identifier + * (returns -ENOKEY if not). This is needed to prevent a user from encrypting + * their files using some other user's key which they don't actually know. + * Cryptographically this isn't much of a problem, but the semantics of this + * would be a bit weird, so it's best to just forbid it. + * + * The system administrator (CAP_FOWNER) can override this, which should be + * enough for any use cases where encryption policies are being set using keys + * that were chosen ahead of time but aren't available at the moment. + * + * Note that the key may have already removed by the time this returns, but + * that's okay; we just care whether the key was there at some point. 
+ * + * Return: 0 if the key is added, -ENOKEY if it isn't, or another -errno code + */ +int llcrypt_verify_key_added(struct super_block *sb, + const u8 identifier[LLCRYPT_KEY_IDENTIFIER_SIZE]) +{ + struct llcrypt_key_specifier mk_spec; + struct key *key, *mk_user; + struct llcrypt_master_key *mk; + int err; + + mk_spec.type = LLCRYPT_KEY_SPEC_TYPE_IDENTIFIER; + memcpy(mk_spec.u.identifier, identifier, LLCRYPT_KEY_IDENTIFIER_SIZE); + + key = llcrypt_find_master_key(sb, &mk_spec); + if (IS_ERR(key)) { + err = PTR_ERR(key); + goto out; + } + mk = key->payload.data[0]; + mk_user = find_master_key_user(mk); + if (IS_ERR(mk_user)) { + err = PTR_ERR(mk_user); + } else { + key_put(mk_user); + err = 0; + } + key_put(key); +out: + if (err == -ENOKEY && capable(CAP_FOWNER)) + err = 0; + return err; +} + +/* + * Try to evict the inode's dentries from the dentry cache. If the inode is a + * directory, then it can have at most one dentry; however, that dentry may be + * pinned by child dentries, so first try to evict the children too. + */ +static void shrink_dcache_inode(struct inode *inode) +{ + struct dentry *dentry; + + if (S_ISDIR(inode->i_mode)) { + dentry = d_find_any_alias(inode); + if (dentry) { + shrink_dcache_parent(dentry); + dput(dentry); + } + } + d_prune_aliases(inode); +} + +static void evict_dentries_for_decrypted_inodes(struct llcrypt_master_key *mk) +{ + struct llcrypt_info *ci; + struct inode *inode; + struct inode *toput_inode = NULL; + + spin_lock(&mk->mk_decrypted_inodes_lock); + + list_for_each_entry(ci, &mk->mk_decrypted_inodes, ci_master_key_link) { + inode = ci->ci_inode; + if (igrab(inode) == NULL) + continue; + spin_unlock(&mk->mk_decrypted_inodes_lock); + + shrink_dcache_inode(inode); + iput(toput_inode); + toput_inode = inode; + + spin_lock(&mk->mk_decrypted_inodes_lock); + } + + spin_unlock(&mk->mk_decrypted_inodes_lock); + iput(toput_inode); +} + +static int check_for_busy_inodes(struct super_block *sb, + struct llcrypt_master_key *mk) +{ + struct list_head *pos; + size_t busy_count = 0; + unsigned long ino; + struct dentry *dentry; + char _path[256]; + char *path = NULL; + + spin_lock(&mk->mk_decrypted_inodes_lock); + + list_for_each(pos, &mk->mk_decrypted_inodes) + busy_count++; + + if (busy_count == 0) { + spin_unlock(&mk->mk_decrypted_inodes_lock); + return 0; + } + + { + /* select an example file to show for debugging purposes */ + struct inode *inode = + list_first_entry(&mk->mk_decrypted_inodes, + struct llcrypt_info, + ci_master_key_link)->ci_inode; + ino = inode->i_ino; + dentry = d_find_alias(inode); + } + spin_unlock(&mk->mk_decrypted_inodes_lock); + + if (dentry) { + path = dentry_path_raw(dentry, _path, sizeof(_path)); + dput(dentry); + } + if (IS_ERR_OR_NULL(path)) + path = "(unknown)"; + + llcrypt_warn(NULL, + "%s: %zu inode(s) still busy after removing key with %s %*phN, including ino %lu (%s)", + sb->s_id, busy_count, master_key_spec_type(&mk->mk_spec), + master_key_spec_len(&mk->mk_spec), (u8 *)&mk->mk_spec.u, + ino, path); + return -EBUSY; +} + +static int try_to_lock_encrypted_files(struct super_block *sb, + struct llcrypt_master_key *mk) +{ + int err1; + int err2; + + /* + * An inode can't be evicted while it is dirty or has dirty pages. + * Thus, we first have to clean the inodes in ->mk_decrypted_inodes. + * + * Just do it the easy way: call sync_filesystem(). It's overkill, but + * it works, and it's more important to minimize the amount of caches we + * drop than the amount of data we sync. 
Also, unprivileged users can + * already call sync_filesystem() via sys_syncfs() or sys_sync(). + */ + down_read(&sb->s_umount); + err1 = sync_filesystem(sb); + up_read(&sb->s_umount); + /* If a sync error occurs, still try to evict as much as possible. */ + + /* + * Inodes are pinned by their dentries, so we have to evict their + * dentries. shrink_dcache_sb() would suffice, but would be overkill + * and inappropriate for use by unprivileged users. So instead go + * through the inodes' alias lists and try to evict each dentry. + */ + evict_dentries_for_decrypted_inodes(mk); + + /* + * evict_dentries_for_decrypted_inodes() already iput() each inode in + * the list; any inodes for which that dropped the last reference will + * have been evicted due to llcrypt_drop_inode() detecting the key + * removal and telling the VFS to evict the inode. So to finish, we + * just need to check whether any inodes couldn't be evicted. + */ + err2 = check_for_busy_inodes(sb, mk); + + return err1 ?: err2; +} + +/* + * Try to remove an llcrypt master encryption key. + * + * LL_IOC_REMOVE_ENCRYPTION_KEY (all_users=false) removes the current user's + * claim to the key, then removes the key itself if no other users have claims. + * LL_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS (all_users=true) always removes the + * key itself. + * + * To "remove the key itself", first we wipe the actual master key secret, so + * that no more inodes can be unlocked with it. Then we try to evict all cached + * inodes that had been unlocked with the key. + * + * If all inodes were evicted, then we unlink the llcrypt_master_key from the + * keyring. Otherwise it remains in the keyring in the "incompletely removed" + * state (without the actual secret key) where it tracks the list of remaining + * inodes. Userspace can execute the ioctl again later to retry eviction, or + * alternatively can re-add the secret key again. + * + * For more details, see the "Removing keys" section of + * Documentation/filesystems/llcrypt.rst. + */ +static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users) +{ + struct super_block *sb = file_inode(filp)->i_sb; + struct llcrypt_remove_key_arg __user *uarg = _uarg; + struct llcrypt_remove_key_arg arg; + struct key *key; + struct llcrypt_master_key *mk; + u32 status_flags = 0; + int err; + bool dead; + + if (copy_from_user(&arg, uarg, sizeof(arg))) + return -EFAULT; + + if (!valid_key_spec(&arg.key_spec)) + return -EINVAL; + + if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved))) + return -EINVAL; + + /* + * Only root can add and remove keys that are identified by an arbitrary + * descriptor rather than by a cryptographic hash. + */ + if (arg.key_spec.type == LLCRYPT_KEY_SPEC_TYPE_DESCRIPTOR && + !capable(CAP_SYS_ADMIN)) + return -EACCES; + + /* Find the key being removed. */ + key = llcrypt_find_master_key(sb, &arg.key_spec); + if (IS_ERR(key)) + return PTR_ERR(key); + mk = key->payload.data[0]; + + down_write(&key->sem); + + /* If relevant, remove current user's (or all users) claim to the key */ + if (mk->mk_users && mk->mk_users->keys.nr_leaves_on_tree != 0) { + if (all_users) + err = keyring_clear(mk->mk_users); + else + err = remove_master_key_user(mk); + if (err) { + up_write(&key->sem); + goto out_put_key; + } + if (mk->mk_users->keys.nr_leaves_on_tree != 0) { + /* + * Other users have still added the key too. We removed + * the current user's claim to the key, but we still + * can't remove the key itself. 
+ */ + status_flags |= + LLCRYPT_KEY_REMOVAL_STATUS_FLAG_OTHER_USERS; + err = 0; + up_write(&key->sem); + goto out_put_key; + } + } + + /* No user claims remaining. Go ahead and wipe the secret. */ + dead = false; + if (is_master_key_secret_present(&mk->mk_secret)) { + down_write(&mk->mk_secret_sem); + wipe_master_key_secret(&mk->mk_secret); + dead = refcount_dec_and_test(&mk->mk_refcount); + up_write(&mk->mk_secret_sem); + } + up_write(&key->sem); + if (dead) { + /* + * No inodes reference the key, and we wiped the secret, so the + * key object is free to be removed from the keyring. + */ + key_invalidate(key); + err = 0; + } else { + /* Some inodes still reference this key; try to evict them. */ + err = try_to_lock_encrypted_files(sb, mk); + if (err == -EBUSY) { + status_flags |= + LLCRYPT_KEY_REMOVAL_STATUS_FLAG_FILES_BUSY; + err = 0; + } + } + /* + * We return 0 if we successfully did something: removed a claim to the + * key, wiped the secret, or tried locking the files again. Users need + * to check the informational status flags if they care whether the key + * has been fully removed including all files locked. + */ +out_put_key: + key_put(key); + if (err == 0) + err = put_user(status_flags, &uarg->removal_status_flags); + return err; +} + +int llcrypt_ioctl_remove_key(struct file *filp, void __user *uarg) +{ + return do_remove_key(filp, uarg, false); +} +EXPORT_SYMBOL_GPL(llcrypt_ioctl_remove_key); + +int llcrypt_ioctl_remove_key_all_users(struct file *filp, void __user *uarg) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + return do_remove_key(filp, uarg, true); +} +EXPORT_SYMBOL_GPL(llcrypt_ioctl_remove_key_all_users); + +/* + * Retrieve the status of an llcrypt master encryption key. + * + * We set ->status to indicate whether the key is absent, present, or + * incompletely removed. "Incompletely removed" means that the master key + * secret has been removed, but some files which had been unlocked with it are + * still in use. This field allows applications to easily determine the state + * of an encrypted directory without using a hack such as trying to open a + * regular file in it (which can confuse the "incompletely removed" state with + * absent or present). + * + * In addition, for v2 policy keys we allow applications to determine, via + * ->status_flags and ->user_count, whether the key has been added by the + * current user, by other users, or by both. Most applications should not need + * this, since ordinarily only one user should know a given key. However, if a + * secret key is shared by multiple users, applications may wish to add an + * already-present key to prevent other users from removing it. This ioctl can + * be used to check whether that really is the case before the work is done to + * add the key --- which might e.g. require prompting the user for a passphrase. + * + * For more details, see the "LL_IOC_GET_ENCRYPTION_KEY_STATUS" section of + * Documentation/filesystems/llcrypt.rst. 
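+ *
+ * A minimal userspace sketch (fd and id are placeholders for a file
+ * descriptor on the filesystem and a previously returned key
+ * identifier):
+ *
+ *	struct llcrypt_get_key_status_arg arg = { 0 };
+ *
+ *	arg.key_spec.type = LLCRYPT_KEY_SPEC_TYPE_IDENTIFIER;
+ *	memcpy(arg.key_spec.u.identifier, id,
+ *	       LLCRYPT_KEY_IDENTIFIER_SIZE);
+ *	ioctl(fd, LL_IOC_GET_ENCRYPTION_KEY_STATUS, &arg);
+ *
+ * arg.status then reads LLCRYPT_KEY_STATUS_ABSENT, _PRESENT, or
+ * _INCOMPLETELY_REMOVED.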
+ */ +int llcrypt_ioctl_get_key_status(struct file *filp, void __user *uarg) +{ + struct super_block *sb = file_inode(filp)->i_sb; + struct llcrypt_get_key_status_arg arg; + struct key *key; + struct llcrypt_master_key *mk; + int err; + + if (copy_from_user(&arg, uarg, sizeof(arg))) + return -EFAULT; + + if (!valid_key_spec(&arg.key_spec)) + return -EINVAL; + + if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved))) + return -EINVAL; + + arg.status_flags = 0; + arg.user_count = 0; + memset(arg.__out_reserved, 0, sizeof(arg.__out_reserved)); + + key = llcrypt_find_master_key(sb, &arg.key_spec); + if (IS_ERR(key)) { + if (key != ERR_PTR(-ENOKEY)) + return PTR_ERR(key); + arg.status = LLCRYPT_KEY_STATUS_ABSENT; + err = 0; + goto out; + } + mk = key->payload.data[0]; + down_read(&key->sem); + + if (!is_master_key_secret_present(&mk->mk_secret)) { + arg.status = LLCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED; + err = 0; + goto out_release_key; + } + + arg.status = LLCRYPT_KEY_STATUS_PRESENT; + if (mk->mk_users) { + struct key *mk_user; + + arg.user_count = mk->mk_users->keys.nr_leaves_on_tree; + mk_user = find_master_key_user(mk); + if (!IS_ERR(mk_user)) { + arg.status_flags |= + LLCRYPT_KEY_STATUS_FLAG_ADDED_BY_SELF; + key_put(mk_user); + } else if (mk_user != ERR_PTR(-ENOKEY)) { + err = PTR_ERR(mk_user); + goto out_release_key; + } + } + err = 0; +out_release_key: + up_read(&key->sem); + key_put(key); +out: + if (!err && copy_to_user(uarg, &arg, sizeof(arg))) + err = -EFAULT; + return err; +} +EXPORT_SYMBOL_GPL(llcrypt_ioctl_get_key_status); + +int __init llcrypt_init_keyring(void) +{ + int err; + + err = register_key_type(&key_type_llcrypt); + if (err) + return err; + + err = register_key_type(&key_type_llcrypt_user); + if (err) + goto err_unregister_llcrypt; + + return 0; + +err_unregister_llcrypt: + unregister_key_type(&key_type_llcrypt); + return err; +} + +void __exit llcrypt_exit_keyring(void) +{ + unregister_key_type(&key_type_llcrypt_user); + unregister_key_type(&key_type_llcrypt); +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/crypto/keysetup.c b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/keysetup.c new file mode 100644 index 0000000000000..67fe888f895db --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/keysetup.c @@ -0,0 +1,635 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Key setup facility for FS encryption support. + * + * Copyright (C) 2015, Google, Inc. + * + * Originally written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar. + * Heavily modified since then. 
+ */ +/* + * Linux commit 219d54332a09 + * tags/v5.4 + */ + +#include +#ifdef HAVE_CRYPTO_SHA2_HEADER +#include +#else +#include +#endif +#include +#include + +#include "llcrypt_private.h" + +#ifdef HAVE_CIPHER_H +#include + +MODULE_IMPORT_NS(CRYPTO_INTERNAL); +#endif + +static struct crypto_shash *essiv_hash_tfm; + +static struct llcrypt_mode available_modes[] = { + [LLCRYPT_MODE_NULL] = { + .friendly_name = "NULL", + .cipher_str = "null", + .keysize = 0, + .ivsize = 0, + }, + [LLCRYPT_MODE_AES_256_XTS] = { + .friendly_name = "AES-256-XTS", + .cipher_str = "xts(aes)", + .keysize = 64, + .ivsize = 16, + }, + [LLCRYPT_MODE_AES_256_CTS] = { + .friendly_name = "AES-256-CTS-CBC", + .cipher_str = "cts(cbc(aes))", + .keysize = 32, + .ivsize = 16, + }, + [LLCRYPT_MODE_AES_128_CBC] = { + .friendly_name = "AES-128-CBC", + .cipher_str = "cbc(aes)", + .keysize = 16, + .ivsize = 16, + .needs_essiv = true, + }, + [LLCRYPT_MODE_AES_128_CTS] = { + .friendly_name = "AES-128-CTS-CBC", + .cipher_str = "cts(cbc(aes))", + .keysize = 16, + .ivsize = 16, + }, + [LLCRYPT_MODE_ADIANTUM] = { + .friendly_name = "Adiantum", + .cipher_str = "adiantum(xchacha12,aes)", + .keysize = 32, + .ivsize = 32, + }, +}; + +static struct llcrypt_mode * +select_encryption_mode(const union llcrypt_policy *policy, + const struct inode *inode) +{ + if (S_ISREG(inode->i_mode)) + return &available_modes[llcrypt_policy_contents_mode(policy)]; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + return &available_modes[llcrypt_policy_fnames_mode(policy)]; + + WARN_ONCE(1, "llcrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n", + inode->i_ino, (inode->i_mode & S_IFMT)); + return ERR_PTR(-EINVAL); +} + +/* Create a symmetric cipher object for the given encryption mode and key */ +struct crypto_skcipher *llcrypt_allocate_skcipher(struct llcrypt_mode *mode, + const u8 *raw_key, + const struct inode *inode) +{ + struct crypto_skcipher *tfm; + int err; + + if (!strcmp(mode->cipher_str, "null")) + return NULL; + + tfm = crypto_alloc_skcipher(mode->cipher_str, 0, 0); + if (IS_ERR(tfm)) { + if (PTR_ERR(tfm) == -ENOENT) { + llcrypt_warn(inode, + "Missing crypto API support for %s (API name: \"%s\")", + mode->friendly_name, mode->cipher_str); + return ERR_PTR(-ENOPKG); + } + llcrypt_err(inode, "Error allocating '%s' transform: %ld", + mode->cipher_str, PTR_ERR(tfm)); + return tfm; + } + if (unlikely(!mode->logged_impl_name)) { + /* + * llcrypt performance can vary greatly depending on which + * crypto algorithm implementation is used. Help people debug + * performance problems by logging the ->cra_driver_name the + * first time a mode is used. Note that multiple threads can + * race here, but it doesn't really matter. 
+ */ + mode->logged_impl_name = true; + pr_info("llcrypt: %s using implementation \"%s\"\n", + mode->friendly_name, + crypto_skcipher_alg(tfm)->base.cra_driver_name); + } + crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); + err = crypto_skcipher_setkey(tfm, raw_key, mode->keysize); + if (err) + goto err_free_tfm; + + return tfm; + +err_free_tfm: + crypto_free_skcipher(tfm); + return ERR_PTR(err); +} + +static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt) +{ + struct crypto_shash *tfm = READ_ONCE(essiv_hash_tfm); + + /* init hash transform on demand */ + if (unlikely(!tfm)) { + struct crypto_shash *prev_tfm; + + tfm = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(tfm)) { + if (PTR_ERR(tfm) == -ENOENT) { + llcrypt_warn(NULL, + "Missing crypto API support for SHA-256"); + return -ENOPKG; + } + llcrypt_err(NULL, + "Error allocating SHA-256 transform: %ld", + PTR_ERR(tfm)); + return PTR_ERR(tfm); + } + prev_tfm = cmpxchg(&essiv_hash_tfm, NULL, tfm); + if (prev_tfm) { + crypto_free_shash(tfm); + tfm = prev_tfm; + } + } + + { + SHASH_DESC_ON_STACK(desc, tfm); + desc->tfm = tfm; + + return crypto_shash_digest(desc, key, keysize, salt); + } +} + +static int init_essiv_generator(struct llcrypt_info *ci, const u8 *raw_key, + int keysize) +{ + int err; + struct crypto_cipher *essiv_tfm; + u8 salt[SHA256_DIGEST_SIZE]; + + if (WARN_ON(ci->ci_mode->ivsize != AES_BLOCK_SIZE)) + return -EINVAL; + + essiv_tfm = crypto_alloc_cipher("aes", 0, 0); + if (IS_ERR(essiv_tfm)) + return PTR_ERR(essiv_tfm); + + ci->ci_essiv_tfm = essiv_tfm; + + err = derive_essiv_salt(raw_key, keysize, salt); + if (err) + goto out; + + /* + * Using SHA256 to derive the salt/key will result in AES-256 being + * used for IV generation. File contents encryption will still use the + * configured keysize (AES-128) nevertheless. 
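To make the crypto API sequence above concrete, here is a minimal kernel-side sketch of what llcrypt_allocate_skcipher() does for the "xts(aes)" entry in available_modes (a 64-byte key, i.e. two AES-256 keys), with error handling trimmed to the essentials:

#include <crypto/skcipher.h>

static struct crypto_skcipher *xts_tfm_example(const u8 raw_key[64])
{
	struct crypto_skcipher *tfm = crypto_alloc_skcipher("xts(aes)", 0, 0);
	int err;

	if (IS_ERR(tfm))
		return tfm;	/* -ENOENT here becomes -ENOPKG above */

	crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
	err = crypto_skcipher_setkey(tfm, raw_key, 64);	/* mode->keysize */
	if (err) {
		crypto_free_skcipher(tfm);
		return ERR_PTR(err);
	}
	return tfm;
}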
+ */ + err = crypto_cipher_setkey(essiv_tfm, salt, sizeof(salt)); + if (err) + goto out; + +out: + memzero_explicit(salt, sizeof(salt)); + return err; +} + +/* Given the per-file key, set up the file's crypto transform object(s) */ +int llcrypt_set_derived_key(struct llcrypt_info *ci, const u8 *derived_key) +{ + struct llcrypt_mode *mode = ci->ci_mode; + struct crypto_skcipher *ctfm; + int err; + + ctfm = llcrypt_allocate_skcipher(mode, derived_key, ci->ci_inode); + if (IS_ERR(ctfm)) + return PTR_ERR(ctfm); + + ci->ci_ctfm = ctfm; + + if (mode->needs_essiv) { + err = init_essiv_generator(ci, derived_key, mode->keysize); + if (err) { + llcrypt_warn(ci->ci_inode, + "Error initializing ESSIV generator: %d", + err); + return err; + } + } + return 0; +} + +static int setup_per_mode_key(struct llcrypt_info *ci, + struct llcrypt_master_key *mk) +{ + struct llcrypt_mode *mode = ci->ci_mode; + u8 mode_num = mode - available_modes; + struct crypto_skcipher *tfm, *prev_tfm; + u8 mode_key[LLCRYPT_MAX_KEY_SIZE]; + int err; + + if (WARN_ON(mode_num >= ARRAY_SIZE(mk->mk_mode_keys))) + return -EINVAL; + + /* pairs with cmpxchg() below */ + tfm = READ_ONCE(mk->mk_mode_keys[mode_num]); + if (likely(tfm != NULL)) + goto done; + + BUILD_BUG_ON(sizeof(mode_num) != 1); + err = llcrypt_hkdf_expand(&mk->mk_secret.hkdf, + HKDF_CONTEXT_PER_MODE_KEY, + &mode_num, sizeof(mode_num), + mode_key, mode->keysize); + if (err) + return err; + tfm = llcrypt_allocate_skcipher(mode, mode_key, ci->ci_inode); + memzero_explicit(mode_key, mode->keysize); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + /* pairs with READ_ONCE() above */ + prev_tfm = cmpxchg(&mk->mk_mode_keys[mode_num], NULL, tfm); + if (prev_tfm != NULL) { + crypto_free_skcipher(tfm); + tfm = prev_tfm; + } +done: + ci->ci_ctfm = tfm; + return 0; +} + +static int llcrypt_setup_v2_file_key(struct llcrypt_info *ci, + struct llcrypt_master_key *mk) +{ + u8 derived_key[LLCRYPT_MAX_KEY_SIZE]; + int err; + + if (ci->ci_policy.v2.flags & LLCRYPT_POLICY_FLAG_DIRECT_KEY) { + /* + * DIRECT_KEY: instead of deriving per-file keys, the per-file + * nonce will be included in all the IVs. But unlike v1 + * policies, for v2 policies in this case we don't encrypt with + * the master key directly but rather derive a per-mode key. + * This ensures that the master key is consistently used only + * for HKDF, avoiding key reuse issues. + */ + if (!llcrypt_mode_supports_direct_key(ci->ci_mode)) { + llcrypt_warn(ci->ci_inode, + "Direct key flag not allowed with %s", + ci->ci_mode->friendly_name); + return -EINVAL; + } + return setup_per_mode_key(ci, mk); + } + + err = llcrypt_hkdf_expand(&mk->mk_secret.hkdf, + HKDF_CONTEXT_PER_FILE_KEY, + ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE, + derived_key, ci->ci_mode->keysize); + if (err) + return err; + + err = llcrypt_set_derived_key(ci, derived_key); + memzero_explicit(derived_key, ci->ci_mode->keysize); + return err; +} + +/* + * Find the master key, then set up the inode's actual encryption key. + * + * If the master key is found in the filesystem-level keyring, then the + * corresponding 'struct key' is returned in *master_key_ret with + * ->mk_secret_sem read-locked. This is needed to ensure that only one task + * links the llcrypt_info into ->mk_decrypted_inodes (as multiple tasks may race + * to create an llcrypt_info for the same inode), and to synchronize the master + * key being removed with a new inode starting to use it. 
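The two v2 derivations above differ only in the HKDF application-specific info they mix in: a context byte plus either the mode number (DIRECT_KEY, one key shared per master key and mode) or the per-file nonce (default, one independent key per inode). A side-by-side sketch using only helpers defined in this patch (llcrypt_hkdf_expand() and the HKDF_CONTEXT_* bytes are declared in llcrypt_private.h below):

#include <linux/types.h>

static int derive_v2_key_sketch(struct llcrypt_master_key *mk,
				struct llcrypt_info *ci, bool direct_key,
				u8 *out)
{
	struct llcrypt_mode *mode = ci->ci_mode;
	u8 mode_num = mode - available_modes;	/* index in the mode table */

	if (direct_key)		/* one key per (master key, mode) pair */
		return llcrypt_hkdf_expand(&mk->mk_secret.hkdf,
					   HKDF_CONTEXT_PER_MODE_KEY,
					   &mode_num, sizeof(mode_num),
					   out, mode->keysize);

	/* default: an independent key per inode, bound to its nonce */
	return llcrypt_hkdf_expand(&mk->mk_secret.hkdf,
				   HKDF_CONTEXT_PER_FILE_KEY,
				   ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE,
				   out, mode->keysize);
}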
+ */ +static int setup_file_encryption_key(struct llcrypt_info *ci, + struct key **master_key_ret) +{ + struct key *key; + struct llcrypt_master_key *mk = NULL; + struct llcrypt_key_specifier mk_spec; + int err; + + switch (ci->ci_policy.version) { + case LLCRYPT_POLICY_V1: + mk_spec.type = LLCRYPT_KEY_SPEC_TYPE_DESCRIPTOR; + memcpy(mk_spec.u.descriptor, + ci->ci_policy.v1.master_key_descriptor, + LLCRYPT_KEY_DESCRIPTOR_SIZE); + break; + case LLCRYPT_POLICY_V2: + mk_spec.type = LLCRYPT_KEY_SPEC_TYPE_IDENTIFIER; + memcpy(mk_spec.u.identifier, + ci->ci_policy.v2.master_key_identifier, + LLCRYPT_KEY_IDENTIFIER_SIZE); + break; + default: + WARN_ON(1); + return -EINVAL; + } + + key = llcrypt_find_master_key(ci->ci_inode->i_sb, &mk_spec); + if (IS_ERR(key)) { + if (key != ERR_PTR(-ENOKEY) || + ci->ci_policy.version != LLCRYPT_POLICY_V1) + return PTR_ERR(key); + + /* + * As a legacy fallback for v1 policies, search for the key in + * the current task's subscribed keyrings too. Don't move this + * to before the search of ->lsi_master_keys, since users + * shouldn't be able to override filesystem-level keys. + */ + return llcrypt_setup_v1_file_key_via_subscribed_keyrings(ci); + } + + mk = key->payload.data[0]; + down_read(&mk->mk_secret_sem); + + /* Has the secret been removed (via LL_IOC_REMOVE_ENCRYPTION_KEY)? */ + if (!is_master_key_secret_present(&mk->mk_secret)) { + err = -ENOKEY; + goto out_release_key; + } + + /* + * Require that the master key be at least as long as the derived key. + * Otherwise, the derived key cannot possibly contain as much entropy as + * that required by the encryption mode it will be used for. For v1 + * policies it's also required for the KDF to work at all. + */ + if (mk->mk_secret.size < ci->ci_mode->keysize) { + llcrypt_warn(NULL, + "key with %s %*phN is too short (got %u bytes, need %u+ bytes)", + master_key_spec_type(&mk_spec), + master_key_spec_len(&mk_spec), (u8 *)&mk_spec.u, + mk->mk_secret.size, ci->ci_mode->keysize); + err = -ENOKEY; + goto out_release_key; + } + + switch (ci->ci_policy.version) { + case LLCRYPT_POLICY_V1: + err = llcrypt_setup_v1_file_key(ci, mk->mk_secret.raw); + break; + case LLCRYPT_POLICY_V2: + err = llcrypt_setup_v2_file_key(ci, mk); + break; + default: + WARN_ON(1); + err = -EINVAL; + break; + } + if (err) + goto out_release_key; + + *master_key_ret = key; + return 0; + +out_release_key: + up_read(&mk->mk_secret_sem); + key_put(key); + return err; +} + +static void put_crypt_info(struct llcrypt_info *ci) +{ + struct key *key; + + if (!ci) + return; + + if (ci->ci_direct_key) { + llcrypt_put_direct_key(ci->ci_direct_key); + } else if ((ci->ci_ctfm != NULL || ci->ci_essiv_tfm != NULL) && + !llcrypt_is_direct_key_policy(&ci->ci_policy)) { + if (ci->ci_ctfm) + crypto_free_skcipher(ci->ci_ctfm); + crypto_free_cipher(ci->ci_essiv_tfm); + } + + key = ci->ci_master_key; + if (key) { + struct llcrypt_master_key *mk = key->payload.data[0]; + + /* + * Remove this inode from the list of inodes that were unlocked + * with the master key. + * + * In addition, if we're removing the last inode from a key that + * already had its secret removed, invalidate the key so that it + * gets removed from ->lsi_master_keys. 
+ */ + spin_lock(&mk->mk_decrypted_inodes_lock); + list_del(&ci->ci_master_key_link); + spin_unlock(&mk->mk_decrypted_inodes_lock); + if (refcount_dec_and_test(&mk->mk_refcount)) + key_invalidate(key); + key_put(key); + } + kmem_cache_free(llcrypt_info_cachep, ci); +} + +int llcrypt_get_encryption_info(struct inode *inode) +{ + struct llcrypt_info *crypt_info; + union llcrypt_context ctx; + struct llcrypt_mode *mode; + struct key *master_key = NULL; + struct lustre_sb_info *lsi = s2lsi(inode->i_sb); + int res; + + if (llcrypt_has_encryption_key(inode)) + return 0; + + if (!lsi) + return -ENOKEY; + + res = llcrypt_initialize(lsi->lsi_cop->flags); + if (res) + return res; + + res = lsi->lsi_cop->get_context(inode, &ctx, sizeof(ctx)); + if (res < 0) { + if (!llcrypt_dummy_context_enabled(inode) || + IS_ENCRYPTED(inode)) { + llcrypt_warn(inode, + "Error %d getting encryption context", + res); + return res; + } + /* Fake up a context for an unencrypted directory */ + memset(&ctx, 0, sizeof(ctx)); + ctx.version = LLCRYPT_CONTEXT_V1; + ctx.v1.contents_encryption_mode = LLCRYPT_MODE_AES_256_XTS; + if (lsi->lsi_flags & LSI_FILENAME_ENC) { + ctx.v1.filenames_encryption_mode = + LLCRYPT_MODE_AES_256_CTS; + } else { + llcrypt_warn(inode, + "dummy enc: forcing filenames_encryption_mode to null"); + ctx.v1.filenames_encryption_mode = LLCRYPT_MODE_NULL; + } + memset(ctx.v1.master_key_descriptor, 0x42, + LLCRYPT_KEY_DESCRIPTOR_SIZE); + res = sizeof(ctx.v1); + } + + crypt_info = kmem_cache_zalloc(llcrypt_info_cachep, GFP_NOFS); + if (!crypt_info) + return -ENOMEM; + + crypt_info->ci_inode = inode; + + res = llcrypt_policy_from_context(&crypt_info->ci_policy, &ctx, res); + if (res) { + llcrypt_warn(inode, + "Unrecognized or corrupt encryption context"); + goto out; + } + + switch (ctx.version) { + case LLCRYPT_CONTEXT_V1: + memcpy(crypt_info->ci_nonce, ctx.v1.nonce, + FS_KEY_DERIVATION_NONCE_SIZE); + break; + case LLCRYPT_CONTEXT_V2: + memcpy(crypt_info->ci_nonce, ctx.v2.nonce, + FS_KEY_DERIVATION_NONCE_SIZE); + break; + default: + WARN_ON(1); + res = -EINVAL; + goto out; + } + + if (!llcrypt_supported_policy(&crypt_info->ci_policy, inode)) { + res = -EINVAL; + goto out; + } + + mode = select_encryption_mode(&crypt_info->ci_policy, inode); + if (IS_ERR(mode)) { + res = PTR_ERR(mode); + goto out; + } + WARN_ON(mode->ivsize > LLCRYPT_MAX_IV_SIZE); + crypt_info->ci_mode = mode; + + res = setup_file_encryption_key(crypt_info, &master_key); + if (res) + goto out; + + if (cmpxchg_release(&(llcrypt_info_nocast(inode)), NULL, + crypt_info) == NULL) { + if (master_key) { + struct llcrypt_master_key *mk = + master_key->payload.data[0]; + + refcount_inc(&mk->mk_refcount); + crypt_info->ci_master_key = key_get(master_key); + spin_lock(&mk->mk_decrypted_inodes_lock); + list_add(&crypt_info->ci_master_key_link, + &mk->mk_decrypted_inodes); + spin_unlock(&mk->mk_decrypted_inodes_lock); + } + crypt_info = NULL; + } + res = 0; +out: + if (master_key) { + struct llcrypt_master_key *mk = master_key->payload.data[0]; + + up_read(&mk->mk_secret_sem); + key_put(master_key); + } + if (res == -ENOKEY) + res = 0; + put_crypt_info(crypt_info); + return res; +} +EXPORT_SYMBOL(llcrypt_get_encryption_info); + +/** + * llcrypt_put_encryption_info - free most of an inode's llcrypt data + * + * Free the inode's llcrypt_info. Filesystems must call this when the inode is + * being evicted. An RCU grace period need not have elapsed yet. 
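The i_private hand-off above is a lock-free publish idiom: the llcrypt_info is fully initialized first, then made visible with cmpxchg_release() so the initializing stores cannot be reordered past publication; readers pair with READ_ONCE() (see llcrypt_has_encryption_key() further down). Reduced to its essentials, with a generic payload type standing in for llcrypt_info:

#include <linux/slab.h>

struct payload { int ready; };	/* stand-in for struct llcrypt_info */

static struct payload *publish_once(struct payload **slot,
				    struct payload *candidate)
{
	/* all stores initializing *candidate happen-before this release */
	if (cmpxchg_release(slot, NULL, candidate) == NULL)
		return candidate;	/* we won: our object is published */

	kfree(candidate);		/* lost the race: discard our copy */
	return READ_ONCE(*slot);	/* use the winner's object */
}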
+ */ +void llcrypt_put_encryption_info(struct inode *inode) +{ + put_crypt_info(llcrypt_info(inode)); + llcrypt_info_nocast(inode) = NULL; +} +EXPORT_SYMBOL(llcrypt_put_encryption_info); + +/** + * llcrypt_free_inode - free an inode's llcrypt data requiring RCU delay + * + * Free the inode's cached decrypted symlink target, if any. Filesystems must + * call this after an RCU grace period, just before they free the inode. + */ +void llcrypt_free_inode(struct inode *inode) +{ + if (IS_ENCRYPTED(inode) && S_ISLNK(inode->i_mode)) { + kfree(inode->i_link); + inode->i_link = NULL; + } +} +EXPORT_SYMBOL(llcrypt_free_inode); + +/** + * llcrypt_drop_inode - check whether the inode's master key has been removed + * + * Filesystems supporting llcrypt must call this from their ->drop_inode() + * method so that encrypted inodes are evicted as soon as they're no longer in + * use and their master key has been removed. + * + * Return: 1 if llcrypt wants the inode to be evicted now, otherwise 0 + */ +int llcrypt_drop_inode(struct inode *inode) +{ + const struct llcrypt_info *ci; + const struct llcrypt_master_key *mk; + + ci = (struct llcrypt_info *)READ_ONCE(llcrypt_info_nocast(inode)); + /* + * If ci is NULL, then the inode doesn't have an encryption key set up + * so it's irrelevant. If ci_master_key is NULL, then the master key + * was provided via the legacy mechanism of the process-subscribed + * keyrings, so we don't know whether it's been removed or not. + */ + if (!ci || !ci->ci_master_key) + return 0; + mk = ci->ci_master_key->payload.data[0]; + + /* + * Note: since we aren't holding ->mk_secret_sem, the result here can + * immediately become outdated. But there's no correctness problem with + * unnecessarily evicting. Nor is there a correctness problem with not + * evicting while iput() is racing with the key being removed, since + * then the thread removing the key will either evict the inode itself + * or will correctly detect that it wasn't evicted due to the race. 
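A hedged sketch of the ->drop_inode() wiring the comment above requires of filesystems, modeled on how fscrypt-aware filesystems (e.g. ext4) hook the equivalent helper; the myfs_* names are placeholders, not Lustre client code:

#include <linux/fs.h>

static int myfs_drop_inode(struct inode *inode)
{
	/* evict now if unreferenced/unlinked, or if the key is gone */
	return generic_drop_inode(inode) || llcrypt_drop_inode(inode);
}

static const struct super_operations myfs_sops = {
	.drop_inode	= myfs_drop_inode,
	/* ... other methods ... */
};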
+ */
+	return !is_master_key_secret_present(&mk->mk_secret);
+}
+EXPORT_SYMBOL_GPL(llcrypt_drop_inode);
+
+bool llcrypt_has_encryption_key(const struct inode *inode)
+{
+	/* pairs with cmpxchg_release() in llcrypt_get_encryption_info() */
+	return READ_ONCE(llcrypt_info_nocast(inode)) != NULL;
+}
+EXPORT_SYMBOL_GPL(llcrypt_has_encryption_key);
diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/crypto/keysetup_v1.c b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/keysetup_v1.c
new file mode 100644
index 0000000000000..e56bce3717d9a
--- /dev/null
+++ b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/keysetup_v1.c
@@ -0,0 +1,350 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Key setup for v1 encryption policies
+ *
+ * Copyright 2015, 2019 Google LLC
+ */
+/*
+ * Linux commit 219d54332a09
+ * tags/v5.4
+ */
+
+/*
+ * This file implements compatibility functions for the original encryption
+ * policy version ("v1"), including:
+ *
+ * - Deriving per-file keys using the AES-128-ECB based KDF
+ *   (rather than the new method of using HKDF-SHA512)
+ *
+ * - Retrieving llcrypt master keys from process-subscribed keyrings
+ *   (rather than the new method of using a filesystem-level keyring)
+ *
+ * - Handling policies with the DIRECT_KEY flag set using a master key table
+ *   (rather than the new method of implementing DIRECT_KEY with per-mode keys
+ *   managed alongside the master keys in the filesystem-level keyring)
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/skcipher.h>
+#include <keys/user-type.h>
+#include <linux/hashtable.h>
+#include <linux/scatterlist.h>
+
+#include "llcrypt_private.h"
+
+/* Table of keys referenced by DIRECT_KEY policies */
+static DEFINE_HASHTABLE(llcrypt_direct_keys, 6); /* 6 bits = 64 buckets */
+static DEFINE_SPINLOCK(llcrypt_direct_keys_lock);
+
+/*
+ * v1 key derivation function. This generates the derived key by encrypting the
+ * master key with AES-128-ECB using the nonce as the AES key. This provides a
+ * unique derived key with sufficient entropy for each inode. However, it's
+ * nonstandard, non-extensible, doesn't evenly distribute the entropy from the
+ * master key, and is trivially reversible: an attacker who compromises a
+ * derived key can "decrypt" it to get back to the master key, then derive any
+ * other key. For all new code, use HKDF instead.
+ *
+ * The master key must be at least as long as the derived key. If the master
+ * key is longer, then only the first 'derived_keysize' bytes are used.
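For the process-subscribed keyring fallback described above, userspace hands the master key to the kernel as a "logon" key before opening any encrypted file. A hypothetical sketch using libkeyutils: the payload layout is assumed to mirror struct llcrypt_key (mode/raw/size, as in the fscrypt UAPI), and the literal "llcrypt:" prefix stands in for whatever LLCRYPT_KEY_DESC_PREFIX actually expands to:

#include <keyutils.h>
#include <stdio.h>
#include <string.h>

struct llcrypt_key_compat {		/* assumed UAPI layout */
	unsigned int mode;
	unsigned char raw[64];		/* LLCRYPT_MAX_KEY_SIZE */
	unsigned int size;
};

static key_serial_t add_v1_master_key(const char *hex_descriptor,
				      const unsigned char *key,
				      unsigned int size)
{
	struct llcrypt_key_compat payload;
	char desc[64];

	memset(&payload, 0, sizeof(payload));
	memcpy(payload.raw, key, size);
	payload.size = size;

	/* description = prefix + 16 hex chars of the 8-byte descriptor */
	snprintf(desc, sizeof(desc), "llcrypt:%s", hex_descriptor);
	return add_key("logon", desc, &payload, sizeof(payload),
		       KEY_SPEC_SESSION_KEYRING);
}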
+ */ +static int derive_key_aes(const u8 *master_key, + const u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE], + u8 *derived_key, unsigned int derived_keysize) +{ + int res = 0; + struct skcipher_request *req = NULL; + DECLARE_CRYPTO_WAIT(wait); + struct scatterlist src_sg, dst_sg; + struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0); + + if (IS_ERR(tfm)) { + res = PTR_ERR(tfm); + tfm = NULL; + goto out; + } + crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); + req = skcipher_request_alloc(tfm, GFP_NOFS); + if (!req) { + res = -ENOMEM; + goto out; + } + skcipher_request_set_callback(req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + crypto_req_done, &wait); + res = crypto_skcipher_setkey(tfm, nonce, FS_KEY_DERIVATION_NONCE_SIZE); + if (res < 0) + goto out; + + sg_init_one(&src_sg, master_key, derived_keysize); + sg_init_one(&dst_sg, derived_key, derived_keysize); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, derived_keysize, + NULL); + res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); +out: + skcipher_request_free(req); + crypto_free_skcipher(tfm); + return res; +} + +/* + * Search the current task's subscribed keyrings for a "logon" key with + * description prefix:descriptor, and if found acquire a read lock on it and + * return a pointer to its validated payload in *payload_ret. + */ +static struct key * +find_and_lock_process_key(const char *prefix, + const u8 descriptor[LLCRYPT_KEY_DESCRIPTOR_SIZE], + unsigned int min_keysize, + const struct llcrypt_key **payload_ret) +{ + char *description; + struct key *key; + const struct user_key_payload *ukp; + const struct llcrypt_key *payload; + + description = kasprintf(GFP_NOFS, "%s%*phN", prefix, + LLCRYPT_KEY_DESCRIPTOR_SIZE, descriptor); + if (!description) + return ERR_PTR(-ENOMEM); + + key = request_key(&key_type_logon, description, NULL); + kfree(description); + if (IS_ERR(key)) + return key; + + down_read(&key->sem); + ukp = user_key_payload_locked(key); + + if (!ukp) /* was the key revoked before we acquired its semaphore? */ + goto invalid; + + payload = (const struct llcrypt_key *)ukp->data; + + if (ukp->datalen != sizeof(struct llcrypt_key) || + payload->size < 1 || payload->size > LLCRYPT_MAX_KEY_SIZE) { + llcrypt_warn(NULL, + "key with description '%s' has invalid payload", + key->description); + goto invalid; + } + + if (payload->size < min_keysize) { + llcrypt_warn(NULL, + "key with description '%s' is too short (got %u bytes, need %u+ bytes)", + key->description, payload->size, min_keysize); + goto invalid; + } + + *payload_ret = payload; + return key; + +invalid: + up_read(&key->sem); + key_put(key); + return ERR_PTR(-ENOKEY); +} + +/* Master key referenced by DIRECT_KEY policy */ +struct llcrypt_direct_key { + struct hlist_node dk_node; + refcount_t dk_refcount; + const struct llcrypt_mode *dk_mode; + struct crypto_skcipher *dk_ctfm; + u8 dk_descriptor[LLCRYPT_KEY_DESCRIPTOR_SIZE]; + u8 dk_raw[LLCRYPT_MAX_KEY_SIZE]; +}; + +static void free_direct_key(struct llcrypt_direct_key *dk) +{ + if (dk) { + crypto_free_skcipher(dk->dk_ctfm); + kfree_sensitive(dk); + } +} + +void llcrypt_put_direct_key(struct llcrypt_direct_key *dk) +{ + if (!refcount_dec_and_lock(&dk->dk_refcount, &llcrypt_direct_keys_lock)) + return; + hash_del(&dk->dk_node); + spin_unlock(&llcrypt_direct_keys_lock); + + free_direct_key(dk); +} + +/* + * Find/insert the given key into the llcrypt_direct_keys table. If found, it + * is returned with elevated refcount, and 'to_insert' is freed if non-NULL. 
If + * not found, 'to_insert' is inserted and returned if it's non-NULL; otherwise + * NULL is returned. + */ +static struct llcrypt_direct_key * +find_or_insert_direct_key(struct llcrypt_direct_key *to_insert, + const u8 *raw_key, const struct llcrypt_info *ci) +{ + unsigned long hash_key; + struct llcrypt_direct_key *dk; + + /* + * Careful: to avoid potentially leaking secret key bytes via timing + * information, we must key the hash table by descriptor rather than by + * raw key, and use crypto_memneq() when comparing raw keys. + */ + + BUILD_BUG_ON(sizeof(hash_key) > LLCRYPT_KEY_DESCRIPTOR_SIZE); + memcpy(&hash_key, ci->ci_policy.v1.master_key_descriptor, + sizeof(hash_key)); + + spin_lock(&llcrypt_direct_keys_lock); + hash_for_each_possible(llcrypt_direct_keys, dk, dk_node, hash_key) { + if (memcmp(ci->ci_policy.v1.master_key_descriptor, + dk->dk_descriptor, LLCRYPT_KEY_DESCRIPTOR_SIZE) != 0) + continue; + if (ci->ci_mode != dk->dk_mode) + continue; + if (crypto_memneq(raw_key, dk->dk_raw, ci->ci_mode->keysize)) + continue; + /* using existing tfm with same (descriptor, mode, raw_key) */ + refcount_inc(&dk->dk_refcount); + spin_unlock(&llcrypt_direct_keys_lock); + free_direct_key(to_insert); + return dk; + } + if (to_insert) + hash_add(llcrypt_direct_keys, &to_insert->dk_node, hash_key); + spin_unlock(&llcrypt_direct_keys_lock); + return to_insert; +} + +/* Prepare to encrypt directly using the master key in the given mode */ +static struct llcrypt_direct_key * +llcrypt_get_direct_key(const struct llcrypt_info *ci, const u8 *raw_key) +{ + struct llcrypt_direct_key *dk; + int err; + + /* Is there already a tfm for this key? */ + dk = find_or_insert_direct_key(NULL, raw_key, ci); + if (dk) + return dk; + + /* Nope, allocate one. */ + dk = kzalloc(sizeof(*dk), GFP_NOFS); + if (!dk) + return ERR_PTR(-ENOMEM); + refcount_set(&dk->dk_refcount, 1); + dk->dk_mode = ci->ci_mode; + dk->dk_ctfm = llcrypt_allocate_skcipher(ci->ci_mode, raw_key, + ci->ci_inode); + if (IS_ERR(dk->dk_ctfm)) { + err = PTR_ERR(dk->dk_ctfm); + dk->dk_ctfm = NULL; + goto err_free_dk; + } + memcpy(dk->dk_descriptor, ci->ci_policy.v1.master_key_descriptor, + LLCRYPT_KEY_DESCRIPTOR_SIZE); + memcpy(dk->dk_raw, raw_key, ci->ci_mode->keysize); + + return find_or_insert_direct_key(dk, raw_key, ci); + +err_free_dk: + free_direct_key(dk); + return ERR_PTR(err); +} + +/* v1 policy, DIRECT_KEY: use the master key directly */ +static int setup_v1_file_key_direct(struct llcrypt_info *ci, + const u8 *raw_master_key) +{ + const struct llcrypt_mode *mode = ci->ci_mode; + struct llcrypt_direct_key *dk; + + if (!llcrypt_mode_supports_direct_key(mode)) { + llcrypt_warn(ci->ci_inode, + "Direct key mode not allowed with %s", + mode->friendly_name); + return -EINVAL; + } + + if (ci->ci_policy.v1.contents_encryption_mode != + ci->ci_policy.v1.filenames_encryption_mode) { + llcrypt_warn(ci->ci_inode, + "Direct key mode not allowed with different contents and filenames modes"); + return -EINVAL; + } + + /* ESSIV implies 16-byte IVs which implies !DIRECT_KEY */ + if (WARN_ON(mode->needs_essiv)) + return -EINVAL; + + dk = llcrypt_get_direct_key(ci, raw_master_key); + if (IS_ERR(dk)) + return PTR_ERR(dk); + ci->ci_direct_key = dk; + ci->ci_ctfm = dk->dk_ctfm; + return 0; +} + +/* v1 policy, !DIRECT_KEY: derive the file's encryption key */ +static int setup_v1_file_key_derived(struct llcrypt_info *ci, + const u8 *raw_master_key) +{ + u8 *derived_key; + int err; + + /* + * This cannot be a stack buffer because it will be passed to the + * 
scatterlist crypto API during derive_key_aes(). + */ + derived_key = kmalloc(ci->ci_mode->keysize, GFP_NOFS); + if (!derived_key) + return -ENOMEM; + + err = derive_key_aes(raw_master_key, ci->ci_nonce, + derived_key, ci->ci_mode->keysize); + if (err) + goto out; + + err = llcrypt_set_derived_key(ci, derived_key); +out: + kfree_sensitive(derived_key); + return err; +} + +int llcrypt_setup_v1_file_key(struct llcrypt_info *ci, const u8 *raw_master_key) +{ + if (ci->ci_policy.v1.flags & LLCRYPT_POLICY_FLAG_DIRECT_KEY) + return setup_v1_file_key_direct(ci, raw_master_key); + else + return setup_v1_file_key_derived(ci, raw_master_key); +} + +int llcrypt_setup_v1_file_key_via_subscribed_keyrings(struct llcrypt_info *ci) +{ + struct key *key; + const struct llcrypt_key *payload; + int err; + + key = find_and_lock_process_key(LLCRYPT_KEY_DESC_PREFIX, + ci->ci_policy.v1.master_key_descriptor, + ci->ci_mode->keysize, &payload); + if (key == ERR_PTR(-ENOKEY)) { + struct lustre_sb_info *lsi = s2lsi(ci->ci_inode->i_sb); + + if (lsi && lsi->lsi_cop->key_prefix) { + key = + find_and_lock_process_key(lsi->lsi_cop->key_prefix, + ci->ci_policy.v1.master_key_descriptor, + ci->ci_mode->keysize, + &payload); + } + } + if (IS_ERR(key)) + return PTR_ERR(key); + + err = llcrypt_setup_v1_file_key(ci, payload->raw); + up_read(&key->sem); + key_put(key); + return err; +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/crypto/llcrypt_private.h b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/llcrypt_private.h new file mode 100644 index 0000000000000..06eafaf2b80a9 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/llcrypt_private.h @@ -0,0 +1,499 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * llcrypt_private.h + * + * Copyright (C) 2015, Google, Inc. + * + * Originally written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar. + * Heavily modified since then. + */ +/* + * Linux commit 219d54332a09 + * tags/v5.4 + */ + +#ifndef _LLCRYPT_PRIVATE_H +#define _LLCRYPT_PRIVATE_H + +#include +#include +#include + +#ifndef CRYPTO_TFM_REQ_FORBID_WEAK_KEYS +#define CRYPTO_TFM_REQ_FORBID_WEAK_KEYS CRYPTO_TFM_REQ_WEAK_KEY +#endif + +#define llcrypt_info(inode) ((struct llcrypt_info *)(inode)->i_private) +#define llcrypt_info_nocast(inode) ((inode)->i_private) + +#define CONST_STRLEN(str) (sizeof(str) - 1) + +#define FS_KEY_DERIVATION_NONCE_SIZE 16 + +#define LLCRYPT_MIN_KEY_SIZE 16 + +#define LLCRYPT_CONTEXT_V1 1 +#define LLCRYPT_CONTEXT_V2 2 + +struct llcrypt_context_v1 { + u8 version; /* LLCRYPT_CONTEXT_V1 */ + u8 contents_encryption_mode; + u8 filenames_encryption_mode; + u8 flags; + u8 master_key_descriptor[LLCRYPT_KEY_DESCRIPTOR_SIZE]; + u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; +}; + +struct llcrypt_context_v2 { + u8 version; /* LLCRYPT_CONTEXT_V2 */ + u8 contents_encryption_mode; + u8 filenames_encryption_mode; + u8 flags; + u8 __reserved[4]; + u8 master_key_identifier[LLCRYPT_KEY_IDENTIFIER_SIZE]; + u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; +}; + +/** + * llcrypt_context - the encryption context of an inode + * + * This is the on-disk equivalent of an llcrypt_policy, stored alongside each + * encrypted file usually in a hidden extended attribute. It contains the + * fields from the llcrypt_policy, in order to identify the encryption algorithm + * and key with which the file is encrypted. It also contains a nonce that was + * randomly generated by llcrypt itself; this is used as KDF input or as a tweak + * to cause different files to be encrypted differently. 
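The BUILD_BUG_ON() figures checked by llcrypt_context_size() just below follow directly from the two context layouts above; every field is a byte array, so there is no padding:

	v1: 1 (version) + 1 (contents mode) + 1 (filenames mode) + 1 (flags)
	    + 8 (descriptor) + 16 (nonce)                            = 28 bytes
	v2: 1 + 1 + 1 + 1 + 4 (__reserved) + 16 (identifier) + 16 (nonce)
	                                                             = 40 bytes

which pins LLCRYPT_KEY_DESCRIPTOR_SIZE at 8 bytes and LLCRYPT_KEY_IDENTIFIER_SIZE at 16.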
+ */ +union llcrypt_context { + u8 version; + struct llcrypt_context_v1 v1; + struct llcrypt_context_v2 v2; +}; + +/* + * Return the size expected for the given llcrypt_context based on its version + * number, or 0 if the context version is unrecognized. + */ +static inline int llcrypt_context_size(const union llcrypt_context *ctx) +{ + switch (ctx->version) { + case LLCRYPT_CONTEXT_V1: + BUILD_BUG_ON(sizeof(ctx->v1) != 28); + return sizeof(ctx->v1); + case LLCRYPT_CONTEXT_V2: + BUILD_BUG_ON(sizeof(ctx->v2) != 40); + return sizeof(ctx->v2); + } + return 0; +} + +#undef llcrypt_policy +union llcrypt_policy { + u8 version; + struct llcrypt_policy_v1 v1; + struct llcrypt_policy_v2 v2; +}; + +/* + * Return the size expected for the given llcrypt_policy based on its version + * number, or 0 if the policy version is unrecognized. + */ +static inline int llcrypt_policy_size(const union llcrypt_policy *policy) +{ + switch (policy->version) { + case LLCRYPT_POLICY_V1: + return sizeof(policy->v1); + case LLCRYPT_POLICY_V2: + return sizeof(policy->v2); + } + return 0; +} + +/* Return the contents encryption mode of a valid encryption policy */ +static inline u8 +llcrypt_policy_contents_mode(const union llcrypt_policy *policy) +{ + switch (policy->version) { + case LLCRYPT_POLICY_V1: + return policy->v1.contents_encryption_mode; + case LLCRYPT_POLICY_V2: + return policy->v2.contents_encryption_mode; + } + BUG(); +} + +/* Return the filenames encryption mode of a valid encryption policy */ +static inline u8 +llcrypt_policy_fnames_mode(const union llcrypt_policy *policy) +{ + switch (policy->version) { + case LLCRYPT_POLICY_V1: + return policy->v1.filenames_encryption_mode; + case LLCRYPT_POLICY_V2: + return policy->v2.filenames_encryption_mode; + } + BUG(); +} + +/* Return the flags (LLCRYPT_POLICY_FLAG*) of a valid encryption policy */ +static inline u8 +llcrypt_policy_flags(const union llcrypt_policy *policy) +{ + switch (policy->version) { + case LLCRYPT_POLICY_V1: + return policy->v1.flags; + case LLCRYPT_POLICY_V2: + return policy->v2.flags; + } + BUG(); +} + +static inline bool +llcrypt_is_direct_key_policy(const union llcrypt_policy *policy) +{ + return llcrypt_policy_flags(policy) & LLCRYPT_POLICY_FLAG_DIRECT_KEY; +} + +/** + * For encrypted symlinks, the ciphertext length is stored at the beginning + * of the string in little-endian format. + */ +struct llcrypt_symlink_data { + __le16 len; + char encrypted_path[1]; +} __packed; + +/* + * llcrypt_info - the "encryption key" for an inode + * + * When an encrypted file's key is made available, an instance of this struct is + * allocated and stored in '(struct llcrypt_info *)inode->i_private'. + * Once created, it remains until the inode is evicted. + */ +struct llcrypt_info { + + /* The actual crypto transform used for encryption and decryption */ + struct crypto_skcipher *ci_ctfm; + + /* + * Cipher for ESSIV IV generation. Only set for CBC contents + * encryption, otherwise is NULL. + */ + struct crypto_cipher *ci_essiv_tfm; + + /* + * Encryption mode used for this inode. It corresponds to either the + * contents or filenames encryption mode, depending on the inode type. + */ + struct llcrypt_mode *ci_mode; + + /* Back-pointer to the inode */ + struct inode *ci_inode; + + /* + * The master key with which this inode was unlocked (decrypted). This + * will be NULL if the master key was found in a process-subscribed + * keyring rather than in the filesystem-level keyring. 
+ */ + struct key *ci_master_key; + + /* + * Link in list of inodes that were unlocked with the master key. + * Only used when ->ci_master_key is set. + */ + struct list_head ci_master_key_link; + + /* + * If non-NULL, then encryption is done using the master key directly + * and ci_ctfm will equal ci_direct_key->dk_ctfm. + */ + struct llcrypt_direct_key *ci_direct_key; + + /* The encryption policy used by this inode */ + union llcrypt_policy ci_policy; + + /* This inode's nonce, copied from the llcrypt_context */ + u8 ci_nonce[FS_KEY_DERIVATION_NONCE_SIZE]; +}; + +typedef enum { + FS_DECRYPT = 0, + FS_ENCRYPT, +} llcrypt_direction_t; + +#define FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 + +static inline bool llcrypt_valid_enc_modes(u32 contents_mode, + u32 filenames_mode) +{ + if (contents_mode == LLCRYPT_MODE_AES_128_CBC && + (filenames_mode == LLCRYPT_MODE_AES_128_CTS || + filenames_mode == LLCRYPT_MODE_NULL)) + return true; + + if (contents_mode == LLCRYPT_MODE_AES_256_XTS && + (filenames_mode == LLCRYPT_MODE_AES_256_CTS || + filenames_mode == LLCRYPT_MODE_NULL)) + return true; + + if (contents_mode == LLCRYPT_MODE_ADIANTUM && + (filenames_mode == LLCRYPT_MODE_ADIANTUM || + filenames_mode == LLCRYPT_MODE_NULL)) + return true; + + return false; +} + +/* crypto.c */ +extern struct kmem_cache *llcrypt_info_cachep; +extern int llcrypt_initialize(unsigned int cop_flags); +extern int llcrypt_crypt_block(const struct inode *inode, + llcrypt_direction_t rw, u64 lblk_num, + struct page *src_page, struct page *dest_page, + unsigned int len, unsigned int offs, + gfp_t gfp_flags); +extern struct page *llcrypt_alloc_bounce_page(gfp_t gfp_flags); +extern const struct dentry_operations llcrypt_d_ops; + +extern void __printf(3, 4) __cold +llcrypt_msg(const struct inode *inode, int mask, const char *fmt, ...); + +#define llcrypt_warn(inode, fmt, ...) \ + llcrypt_msg((inode), D_SEC, fmt, ##__VA_ARGS__) +#define llcrypt_err(inode, fmt, ...) \ + llcrypt_msg((inode), D_ERROR, fmt, ##__VA_ARGS__) + +#define LLCRYPT_MAX_IV_SIZE 32 + +union llcrypt_iv { + struct { + /* logical block number within the file */ + __le64 lblk_num; + + /* per-file nonce; only set in DIRECT_KEY mode */ + u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; + }; + u8 raw[LLCRYPT_MAX_IV_SIZE]; +}; + +void llcrypt_generate_iv(union llcrypt_iv *iv, u64 lblk_num, + const struct llcrypt_info *ci); + +/* fname.c */ +extern int fname_encrypt(struct inode *inode, const struct qstr *iname, + u8 *out, unsigned int olen); +extern bool llcrypt_fname_encrypted_size(const struct inode *inode, + u32 orig_len, u32 max_len, + u32 *encrypted_len_ret); + +/* hkdf.c */ + +struct llcrypt_hkdf { + struct crypto_shash *hmac_tfm; +}; + +extern int llcrypt_init_hkdf(struct llcrypt_hkdf *hkdf, const u8 *master_key, + unsigned int master_key_size); + +/* + * The list of contexts in which llcrypt uses HKDF. These values are used as + * the first byte of the HKDF application-specific info string to guarantee that + * info strings are never repeated between contexts. This ensures that all HKDF + * outputs are unique and cryptographically isolated, i.e. knowledge of one + * output doesn't reveal another. 
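A sketch of how union llcrypt_iv above gets filled, modeled on the fscrypt original this code is ported from; the real llcrypt_generate_iv() lives in crypto.c and, for ESSIV modes, additionally encrypts the result with ci_essiv_tfm:

static void generate_iv_sketch(union llcrypt_iv *iv, u64 lblk_num,
			       const struct llcrypt_info *ci)
{
	memset(iv, 0, ci->ci_mode->ivsize);
	iv->lblk_num = cpu_to_le64(lblk_num);

	/* DIRECT_KEY: many files share one per-mode key, so the per-file
	 * nonce must go into the IV to keep the keystreams distinct */
	if (llcrypt_is_direct_key_policy(&ci->ci_policy))
		memcpy(iv->nonce, ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE);
}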
+ */ +#define HKDF_CONTEXT_KEY_IDENTIFIER 1 +#define HKDF_CONTEXT_PER_FILE_KEY 2 +#define HKDF_CONTEXT_PER_MODE_KEY 3 + +extern int llcrypt_hkdf_expand(struct llcrypt_hkdf *hkdf, u8 context, + const u8 *info, unsigned int infolen, + u8 *okm, unsigned int okmlen); + +extern void llcrypt_destroy_hkdf(struct llcrypt_hkdf *hkdf); + +/* keyring.c */ + +/* + * llcrypt_master_key_secret - secret key material of an in-use master key + */ +struct llcrypt_master_key_secret { + + /* + * For v2 policy keys: HKDF context keyed by this master key. + * For v1 policy keys: not set (hkdf.hmac_tfm == NULL). + */ + struct llcrypt_hkdf hkdf; + + /* Size of the raw key in bytes. Set even if ->raw isn't set. */ + u32 size; + + /* For v1 policy keys: the raw key. Wiped for v2 policy keys. */ + u8 raw[LLCRYPT_MAX_KEY_SIZE]; + +} __randomize_layout; + +/* + * llcrypt_master_key - an in-use master key + * + * This represents a master encryption key which has been added to the + * filesystem and can be used to "unlock" the encrypted files which were + * encrypted with it. + */ +struct llcrypt_master_key { + + /* + * The secret key material. After LL_IOC_REMOVE_ENCRYPTION_KEY is + * executed, this is wiped and no new inodes can be unlocked with this + * key; however, there may still be inodes in ->mk_decrypted_inodes + * which could not be evicted. As long as some inodes still remain, + * LL_IOC_REMOVE_ENCRYPTION_KEY can be retried, or + * LL_IOC_ADD_ENCRYPTION_KEY can add the secret again. + * + * Locking: protected by key->sem (outer) and mk_secret_sem (inner). + * The reason for two locks is that key->sem also protects modifying + * mk_users, which ranks it above the semaphore for the keyring key + * type, which is in turn above page faults (via keyring_read). But + * sometimes filesystems call llcrypt_get_encryption_info() from within + * a transaction, which ranks it below page faults. So we need a + * separate lock which protects mk_secret but not also mk_users. + */ + struct llcrypt_master_key_secret mk_secret; + struct rw_semaphore mk_secret_sem; + + /* + * For v1 policy keys: an arbitrary key descriptor which was assigned by + * userspace (->descriptor). + * + * For v2 policy keys: a cryptographic hash of this key (->identifier). + */ + struct llcrypt_key_specifier mk_spec; + + /* + * Keyring which contains a key of type 'key_type_llcrypt_user' for each + * user who has added this key. Normally each key will be added by just + * one user, but it's possible that multiple users share a key, and in + * that case we need to keep track of those users so that one user can't + * remove the key before the others want it removed too. + * + * This is NULL for v1 policy keys; those can only be added by root. + * + * Locking: in addition to this keyrings own semaphore, this is + * protected by the master key's key->sem, so we can do atomic + * search+insert. It can also be searched without taking any locks, but + * in that case the returned key may have already been removed. + */ + struct key *mk_users; + + /* + * Length of ->mk_decrypted_inodes, plus one if mk_secret is present. + * Once this goes to 0, the master key is removed from ->lsi_master_keys. + * The 'struct llcrypt_master_key' will continue to live as long as the + * 'struct key' whose payload it is, but we won't let this reference + * count rise again. + */ + refcount_t mk_refcount; + + /* + * List of inodes that were unlocked using this key. This allows the + * inodes to be evicted efficiently if the key is removed. 
+ */ + struct list_head mk_decrypted_inodes; + spinlock_t mk_decrypted_inodes_lock; + + /* Per-mode tfms for DIRECT_KEY policies, allocated on-demand */ + struct crypto_skcipher *mk_mode_keys[__LLCRYPT_MODE_MAX + 1]; + +} __randomize_layout; + +static inline bool +is_master_key_secret_present(const struct llcrypt_master_key_secret *secret) +{ + /* + * The READ_ONCE() is only necessary for llcrypt_drop_inode() and + * llcrypt_key_describe(). These run in atomic context, so they can't + * take ->mk_secret_sem and thus 'secret' can change concurrently which + * would be a data race. But they only need to know whether the secret + * *was* present at the time of check, so READ_ONCE() suffices. + */ + return READ_ONCE(secret->size) != 0; +} + +static inline const char *master_key_spec_type( + const struct llcrypt_key_specifier *spec) +{ + switch (spec->type) { + case LLCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: + return "descriptor"; + case LLCRYPT_KEY_SPEC_TYPE_IDENTIFIER: + return "identifier"; + } + return "[unknown]"; +} + +static inline int master_key_spec_len(const struct llcrypt_key_specifier *spec) +{ + switch (spec->type) { + case LLCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: + return LLCRYPT_KEY_DESCRIPTOR_SIZE; + case LLCRYPT_KEY_SPEC_TYPE_IDENTIFIER: + return LLCRYPT_KEY_IDENTIFIER_SIZE; + } + return 0; +} + +extern struct key * +llcrypt_find_master_key(struct super_block *sb, + const struct llcrypt_key_specifier *mk_spec); + +extern int llcrypt_verify_key_added(struct super_block *sb, + const u8 identifier[LLCRYPT_KEY_IDENTIFIER_SIZE]); + +extern int __init llcrypt_init_keyring(void); + +extern void __exit llcrypt_exit_keyring(void); + +/* keysetup.c */ + +struct llcrypt_mode { + const char *friendly_name; + const char *cipher_str; + int keysize; + int ivsize; + bool logged_impl_name; + bool needs_essiv; +}; + +static inline bool +llcrypt_mode_supports_direct_key(const struct llcrypt_mode *mode) +{ + return mode->ivsize >= offsetofend(union llcrypt_iv, nonce); +} + +extern struct crypto_skcipher * +llcrypt_allocate_skcipher(struct llcrypt_mode *mode, const u8 *raw_key, + const struct inode *inode); + +extern int llcrypt_set_derived_key(struct llcrypt_info *ci, + const u8 *derived_key); + +/* keysetup_v1.c */ + +extern void llcrypt_put_direct_key(struct llcrypt_direct_key *dk); + +extern int llcrypt_setup_v1_file_key(struct llcrypt_info *ci, + const u8 *raw_master_key); + +extern int llcrypt_setup_v1_file_key_via_subscribed_keyrings( + struct llcrypt_info *ci); +/* policy.c */ + +extern bool llcrypt_policies_equal(const union llcrypt_policy *policy1, + const union llcrypt_policy *policy2); +extern bool llcrypt_supported_policy(const union llcrypt_policy *policy_u, + const struct inode *inode); +extern int llcrypt_policy_from_context(union llcrypt_policy *policy_u, + const union llcrypt_context *ctx_u, + int ctx_size); + +#endif /* _LLCRYPT_PRIVATE_H */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/crypto/policy.c b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/policy.c new file mode 100644 index 0000000000000..5d094d53b01a4 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/crypto/policy.c @@ -0,0 +1,594 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Encryption policy functions for per-file encryption support. + * + * Copyright (C) 2015, Google, Inc. + * Copyright (C) 2015, Motorola Mobility. + * + * Originally written by Michael Halcrow, 2015. + * Modified by Jaegeuk Kim, 2015. + * Modified by Eric Biggers, 2019 for v2 policy support. 
+ */
+/*
+ * Linux commit 219d54332a09
+ * tags/v5.4
+ */
+
+#include
+#include
+#include
+#include
+#include "llcrypt_private.h"
+
+/**
+ * llcrypt_policies_equal - check whether two encryption policies are the same
+ *
+ * Return: %true if equal, else %false
+ */
+bool llcrypt_policies_equal(const union llcrypt_policy *policy1,
+			    const union llcrypt_policy *policy2)
+{
+	if (policy1->version != policy2->version)
+		return false;
+
+	return !memcmp(policy1, policy2, llcrypt_policy_size(policy1));
+}
+
+/**
+ * llcrypt_supported_policy - check whether an encryption policy is supported
+ *
+ * Given an encryption policy, check whether all its encryption modes and other
+ * settings are supported by this kernel. (But we don't currently check for
+ * crypto API support here, so attempting to use an algorithm not configured
+ * into the crypto API will still fail later.)
+ *
+ * Return: %true if supported, else %false
+ */
+bool llcrypt_supported_policy(const union llcrypt_policy *policy_u,
+			      const struct inode *inode)
+{
+	switch (policy_u->version) {
+	case LLCRYPT_POLICY_V1: {
+		const struct llcrypt_policy_v1 *policy = &policy_u->v1;
+
+		if (!llcrypt_valid_enc_modes(policy->contents_encryption_mode,
+					     policy->filenames_encryption_mode)) {
+			llcrypt_warn(inode,
+				     "Unsupported encryption modes (contents %d, filenames %d)",
+				     policy->contents_encryption_mode,
+				     policy->filenames_encryption_mode);
+			return false;
+		}
+
+		if (policy->flags & ~LLCRYPT_POLICY_FLAGS_VALID) {
+			llcrypt_warn(inode,
+				     "Unsupported encryption flags (0x%02x)",
+				     policy->flags);
+			return false;
+		}
+
+		return true;
+	}
+	case LLCRYPT_POLICY_V2: {
+		const struct llcrypt_policy_v2 *policy = &policy_u->v2;
+
+		if (!llcrypt_valid_enc_modes(policy->contents_encryption_mode,
+					     policy->filenames_encryption_mode)) {
+			llcrypt_warn(inode,
+				     "Unsupported encryption modes (contents %d, filenames %d)",
+				     policy->contents_encryption_mode,
+				     policy->filenames_encryption_mode);
+			return false;
+		}
+
+		if (policy->flags & ~LLCRYPT_POLICY_FLAGS_VALID) {
+			llcrypt_warn(inode,
+				     "Unsupported encryption flags (0x%02x)",
+				     policy->flags);
+			return false;
+		}
+
+		if (memchr_inv(policy->__reserved, 0,
+			       sizeof(policy->__reserved))) {
+			llcrypt_warn(inode,
+				     "Reserved bits set in encryption policy");
+			return false;
+		}
+
+		return true;
+	}
+	}
+	return false;
+}
+
+/**
+ * llcrypt_new_context_from_policy - create a new llcrypt_context from a policy
+ *
+ * Create an llcrypt_context for an inode that is being assigned the given
+ * encryption policy. A new nonce is randomly generated.
+ *
+ * Return: the size of the new context in bytes.
+ */ +static int llcrypt_new_context_from_policy(union llcrypt_context *ctx_u, + const union llcrypt_policy *policy_u) +{ + memset(ctx_u, 0, sizeof(*ctx_u)); + + switch (policy_u->version) { + case LLCRYPT_POLICY_V1: { + const struct llcrypt_policy_v1 *policy = &policy_u->v1; + struct llcrypt_context_v1 *ctx = &ctx_u->v1; + + ctx->version = LLCRYPT_CONTEXT_V1; + ctx->contents_encryption_mode = + policy->contents_encryption_mode; + ctx->filenames_encryption_mode = + policy->filenames_encryption_mode; + ctx->flags = policy->flags; + memcpy(ctx->master_key_descriptor, + policy->master_key_descriptor, + sizeof(ctx->master_key_descriptor)); + get_random_bytes(ctx->nonce, sizeof(ctx->nonce)); + return sizeof(*ctx); + } + case LLCRYPT_POLICY_V2: { + const struct llcrypt_policy_v2 *policy = &policy_u->v2; + struct llcrypt_context_v2 *ctx = &ctx_u->v2; + + ctx->version = LLCRYPT_CONTEXT_V2; + ctx->contents_encryption_mode = + policy->contents_encryption_mode; + ctx->filenames_encryption_mode = + policy->filenames_encryption_mode; + ctx->flags = policy->flags; + memcpy(ctx->master_key_identifier, + policy->master_key_identifier, + sizeof(ctx->master_key_identifier)); + get_random_bytes(ctx->nonce, sizeof(ctx->nonce)); + return sizeof(*ctx); + } + } + BUG(); +} + +/** + * llcrypt_policy_from_context - convert an llcrypt_context to an llcrypt_policy + * + * Given an llcrypt_context, build the corresponding llcrypt_policy. + * + * Return: 0 on success, or -EINVAL if the llcrypt_context has an unrecognized + * version number or size. + * + * This does *not* validate the settings within the policy itself, e.g. the + * modes, flags, and reserved bits. Use llcrypt_supported_policy() for that. + */ +int llcrypt_policy_from_context(union llcrypt_policy *policy_u, + const union llcrypt_context *ctx_u, + int ctx_size) +{ + memset(policy_u, 0, sizeof(*policy_u)); + + if (ctx_size <= 0 || ctx_size != llcrypt_context_size(ctx_u)) + return -EINVAL; + + switch (ctx_u->version) { + case LLCRYPT_CONTEXT_V1: { + const struct llcrypt_context_v1 *ctx = &ctx_u->v1; + struct llcrypt_policy_v1 *policy = &policy_u->v1; + + policy->version = LLCRYPT_POLICY_V1; + policy->contents_encryption_mode = + ctx->contents_encryption_mode; + policy->filenames_encryption_mode = + ctx->filenames_encryption_mode; + policy->flags = ctx->flags; + memcpy(policy->master_key_descriptor, + ctx->master_key_descriptor, + sizeof(policy->master_key_descriptor)); + return 0; + } + case LLCRYPT_CONTEXT_V2: { + const struct llcrypt_context_v2 *ctx = &ctx_u->v2; + struct llcrypt_policy_v2 *policy = &policy_u->v2; + + policy->version = LLCRYPT_POLICY_V2; + policy->contents_encryption_mode = + ctx->contents_encryption_mode; + policy->filenames_encryption_mode = + ctx->filenames_encryption_mode; + policy->flags = ctx->flags; + memcpy(policy->__reserved, ctx->__reserved, + sizeof(policy->__reserved)); + memcpy(policy->master_key_identifier, + ctx->master_key_identifier, + sizeof(policy->master_key_identifier)); + return 0; + } + } + /* unreachable */ + return -EINVAL; +} + +/* Retrieve an inode's encryption policy */ +static int llcrypt_get_policy(struct inode *inode, union llcrypt_policy *policy) +{ + const struct llcrypt_info *ci; + union llcrypt_context ctx; + struct lustre_sb_info *lsi = s2lsi(inode->i_sb); + int ret; + + ci = (struct llcrypt_info *)READ_ONCE(llcrypt_info_nocast(inode)); + if (ci) { + /* key available, use the cached policy */ + *policy = ci->ci_policy; + return 0; + } + + if (!IS_ENCRYPTED(inode)) + return -ENODATA; + + if (!lsi) + 
return -ENODATA; + + ret = lsi->lsi_cop->get_context(inode, &ctx, sizeof(ctx)); + if (ret < 0) + return (ret == -ERANGE) ? -EINVAL : ret; + + return llcrypt_policy_from_context(policy, &ctx, ret); +} + +static int set_encryption_policy(struct inode *inode, + const union llcrypt_policy *policy) +{ + union llcrypt_context ctx; + int ctxsize; + struct lustre_sb_info *lsi = s2lsi(inode->i_sb); + int err; + + if (!llcrypt_supported_policy(policy, inode)) + return -EINVAL; + + switch (policy->version) { + case LLCRYPT_POLICY_V1: + /* + * The original encryption policy version provided no way of + * verifying that the correct master key was supplied, which was + * insecure in scenarios where multiple users have access to the + * same encrypted files (even just read-only access). The new + * encryption policy version fixes this and also implies use of + * an improved key derivation function and allows non-root users + * to securely remove keys. So as long as compatibility with + * old kernels isn't required, it is recommended to use the new + * policy version for all new encrypted directories. + */ + pr_warn_once("%s (pid %d) is setting deprecated v1 encryption policy; recommend upgrading to v2.\n", + current->comm, current->pid); + break; + case LLCRYPT_POLICY_V2: + err = llcrypt_verify_key_added(inode->i_sb, + policy->v2.master_key_identifier); + if (err) + return err; + break; + default: + WARN_ON(1); + return -EINVAL; + } + + ctxsize = llcrypt_new_context_from_policy(&ctx, policy); + + if (!lsi) + return -EINVAL; + + return lsi->lsi_cop->set_context(inode, &ctx, ctxsize, NULL); +} + +/* Tell if an inode's encryption policy has filename encryption */ +bool llcrypt_policy_has_filename_enc(struct inode *inode) +{ + union llcrypt_policy policy; + int err; + + err = llcrypt_get_policy(inode, &policy); + if (err) + return true; + + if ((policy.version == LLCRYPT_POLICY_V1 && + policy.v1.filenames_encryption_mode == LLCRYPT_MODE_NULL) || + (policy.version == LLCRYPT_POLICY_V2 && + policy.v2.filenames_encryption_mode == LLCRYPT_MODE_NULL)) + return false; + return true; +} +EXPORT_SYMBOL(llcrypt_policy_has_filename_enc); + +int llcrypt_ioctl_set_policy(struct file *filp, const void __user *arg) +{ + union llcrypt_policy policy; + union llcrypt_policy existing_policy; + struct inode *inode = file_inode(filp); + struct lustre_sb_info *lsi = s2lsi(inode->i_sb); + u8 version; + int size; + int ret; + + if (get_user(policy.version, (const u8 __user *)arg)) + return -EFAULT; + + size = llcrypt_policy_size(&policy); + if (size <= 0) + return -EINVAL; + + /* + * We should just copy the remaining 'size - 1' bytes here, but a + * bizarre bug in gcc 7 and earlier (fixed by gcc r255731) causes gcc to + * think that size can be 0 here (despite the check above!) *and* that + * it's a compile-time constant. Thus it would think copy_from_user() + * is passed compile-time constant ULONG_MAX, causing the compile-time + * buffer overflow check to fail, breaking the build. This only occurred + * when building an i386 kernel with -Os and branch profiling enabled. + * + * Work around it by just copying the first byte again... + */ + version = policy.version; + if (copy_from_user(&policy, arg, size)) + return -EFAULT; + policy.version = version; + + /* Force file/directory name encryption policy to null if + * LSI_FILENAME_ENC flag is not set on sb. + * This allows enabling filename encryption separately from data + * encryption, and can be useful for interoperability with + * encryption-unaware clients. 
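A hypothetical userspace counterpart to llcrypt_ioctl_set_policy() above, setting a v2 policy on an empty directory. The llcrypt_policy_v2 field names and mode constants come from this patch; the request code LL_IOC_SET_ENCRYPTION_POLICY is an assumption modeled on the fscrypt UAPI:

#include <string.h>
#include <sys/ioctl.h>

static int set_v2_policy(int dir_fd, const unsigned char identifier[16])
{
	struct llcrypt_policy_v2 policy;

	memset(&policy, 0, sizeof(policy));	/* __reserved must be zero */
	policy.version = LLCRYPT_POLICY_V2;
	policy.contents_encryption_mode = LLCRYPT_MODE_AES_256_XTS;
	policy.filenames_encryption_mode = LLCRYPT_MODE_AES_256_CTS;
	memcpy(policy.master_key_identifier, identifier, 16);

	/* EEXIST: different policy already set; ENOTEMPTY: dir not empty */
	return ioctl(dir_fd, LL_IOC_SET_ENCRYPTION_POLICY, &policy);
}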
+ */ + if (!(lsi->lsi_flags & LSI_FILENAME_ENC)) { + CWARN("inode %lu: forcing policy filenames_encryption_mode to null\n", + inode->i_ino); + cfs_tty_write_msg("\n\nForcing policy filenames_encryption_mode to null.\n\n"); + switch (policy.version) { + case LLCRYPT_POLICY_V1: + policy.v1.filenames_encryption_mode = LLCRYPT_MODE_NULL; + break; + case LLCRYPT_POLICY_V2: + policy.v2.filenames_encryption_mode = LLCRYPT_MODE_NULL; + break; + } + } + + if (!inode_owner_or_capable(&init_user_ns, inode)) + return -EACCES; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + inode_lock(inode); + + ret = llcrypt_get_policy(inode, &existing_policy); + if (ret == -ENODATA) { + struct lustre_sb_info *lsi = s2lsi(inode->i_sb); + + if (!S_ISDIR(inode->i_mode)) + ret = -ENOTDIR; + else if (IS_DEADDIR(inode)) + ret = -ENOENT; + else if (lsi && !lsi->lsi_cop->empty_dir(inode)) + ret = -ENOTEMPTY; + else + ret = set_encryption_policy(inode, &policy); + } else if (ret == -EINVAL || + (ret == 0 && !llcrypt_policies_equal(&policy, + &existing_policy))) { + /* The file already uses a different encryption policy. */ + ret = -EEXIST; + } + + inode_unlock(inode); + + mnt_drop_write_file(filp); + return ret; +} +EXPORT_SYMBOL(llcrypt_ioctl_set_policy); + +/* Original ioctl version; can only get the original policy version */ +int llcrypt_ioctl_get_policy(struct file *filp, void __user *arg) +{ + union llcrypt_policy policy; + int err; + + err = llcrypt_get_policy(file_inode(filp), &policy); + if (err) + return err; + + if (policy.version != LLCRYPT_POLICY_V1) + return -EINVAL; + + if (copy_to_user(arg, &policy, sizeof(policy.v1))) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL(llcrypt_ioctl_get_policy); + +/* Valid filenames_encryption_mode associated with contents_encryption_mode, + * as imposed by llcrypt_valid_enc_modes() + */ +static inline u8 contents2filenames_encmode(u8 contents_encryption_mode) +{ + if (contents_encryption_mode == LLCRYPT_MODE_AES_128_CBC) + return LLCRYPT_MODE_AES_128_CTS; + if (contents_encryption_mode == LLCRYPT_MODE_AES_256_XTS) + return LLCRYPT_MODE_AES_256_CTS; + if (contents_encryption_mode == LLCRYPT_MODE_ADIANTUM) + return LLCRYPT_MODE_ADIANTUM; + return LLCRYPT_MODE_NULL; +} + +/* Extended ioctl version; can get policies of any version */ +int llcrypt_ioctl_get_policy_ex(struct file *filp, void __user *uarg) +{ + struct llcrypt_get_policy_ex_arg arg; + union llcrypt_policy *policy = (union llcrypt_policy *)&arg.policy; + size_t policy_size; + struct inode *inode = file_inode(filp); + int err; + + /* arg is policy_size, then policy */ + BUILD_BUG_ON(offsetof(typeof(arg), policy_size) != 0); + BUILD_BUG_ON(offsetofend(typeof(arg), policy_size) != + offsetof(typeof(arg), policy)); + BUILD_BUG_ON(sizeof(arg.policy) != sizeof(*policy)); + + err = llcrypt_get_policy(file_inode(filp), policy); + if (err) + return err; + policy_size = llcrypt_policy_size(policy); + + if (copy_from_user(&arg, uarg, sizeof(arg.policy_size))) + return -EFAULT; + + if (policy_size > arg.policy_size) + return -EOVERFLOW; + arg.policy_size = policy_size; + + /* Do not return null filenames_encryption_mode to userspace, as it is + * unknown. Instead, return valid mode associated with + * contents_encryption_mode, as imposed by llcrypt_valid_enc_modes(). 
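The extended get is a two-step size negotiation: the caller advertises its buffer capacity in policy_size, and the kernel either fails with EOVERFLOW or writes back the actual size plus the policy. A hypothetical userspace sketch; the struct is the one used above, while the ioctl name and the union member access are assumptions modeled on the fscrypt UAPI:

#include <sys/ioctl.h>

static int get_policy_version(int fd)
{
	struct llcrypt_get_policy_ex_arg arg;

	arg.policy_size = sizeof(arg.policy);	/* in: buffer capacity */
	if (ioctl(fd, LL_IOC_GET_ENCRYPTION_POLICY_EX, &arg) != 0)
		return -1;

	/* out: arg.policy_size holds the actual size; the leading version
	 * byte selects the v1/v2 member of the returned policy union */
	return arg.policy.version;
}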
+ */ + switch (policy->version) { + case LLCRYPT_POLICY_V1: + if (policy->v1.filenames_encryption_mode == LLCRYPT_MODE_NULL) { + policy->v1.filenames_encryption_mode = + contents2filenames_encmode( + policy->v1.contents_encryption_mode); + CWARN("inode %lu: returning policy filenames_encryption_mode as %d, but is in fact null\n", + inode->i_ino, + policy->v1.filenames_encryption_mode); + } + break; + case LLCRYPT_POLICY_V2: + if (policy->v2.filenames_encryption_mode == LLCRYPT_MODE_NULL) { + policy->v2.filenames_encryption_mode = + contents2filenames_encmode( + policy->v2.contents_encryption_mode); + CWARN("inode %lu: returning policy filenames_encryption_mode as %d, but is in fact null\n", + inode->i_ino, + policy->v2.filenames_encryption_mode); + } + break; + } + + if (copy_to_user(uarg, &arg, sizeof(arg.policy_size) + policy_size)) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL_GPL(llcrypt_ioctl_get_policy_ex); + +/** + * llcrypt_has_permitted_context() - is a file's encryption policy permitted + * within its directory? + * + * @parent: inode for parent directory + * @child: inode for file being looked up, opened, or linked into @parent + * + * Filesystems must call this before permitting access to an inode in a + * situation where the parent directory is encrypted (either before allowing + * ->lookup() to succeed, or for a regular file before allowing it to be opened) + * and before any operation that involves linking an inode into an encrypted + * directory, including link, rename, and cross rename. It enforces the + * constraint that within a given encrypted directory tree, all files use the + * same encryption policy. The pre-access check is needed to detect potentially + * malicious offline violations of this constraint, while the link and rename + * checks are needed to prevent online violations of this constraint. + * + * Return: 1 if permitted, 0 if forbidden. + */ +int llcrypt_has_permitted_context(struct inode *parent, struct inode *child) +{ + union llcrypt_policy parent_policy, child_policy; + int err; + + /* No restrictions on file types which are never encrypted */ + if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) && + !S_ISLNK(child->i_mode)) + return 1; + + /* No restrictions if the parent directory is unencrypted */ + if (!IS_ENCRYPTED(parent)) + return 1; + + /* Encrypted directories must not contain unencrypted files */ + if (!IS_ENCRYPTED(child)) + return 0; + + /* + * Both parent and child are encrypted, so verify they use the same + * encryption policy. Compare the llcrypt_info structs if the keys are + * available, otherwise retrieve and compare the llcrypt_contexts. + * + * Note that the llcrypt_context retrieval will be required frequently + * when accessing an encrypted directory tree without the key. + * Performance-wise this is not a big deal because we already don't + * really optimize for file access without the key (to the extent that + * such access is even possible), given that any attempted access + * already causes a llcrypt_context retrieval and keyring search. + * + * In any case, if an unexpected error occurs, fall back to "forbidden". 
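A sketch of the pre-access check the comment above prescribes, placed in a filesystem's ->lookup(); this is modeled on how fscrypt-aware filesystems use the equivalent helper, with myfs_resolve() as a placeholder for the filesystem's real name-to-inode resolution:

#include <linux/err.h>
#include <linux/fs.h>

struct inode *myfs_resolve(struct inode *dir, const struct qstr *name); /* placeholder */

static struct dentry *myfs_lookup(struct inode *dir, struct dentry *dentry,
				  unsigned int flags)
{
	struct inode *inode = myfs_resolve(dir, &dentry->d_name);

	if (IS_ERR(inode))
		return ERR_CAST(inode);
	if (inode && IS_ENCRYPTED(dir) &&
	    !llcrypt_has_permitted_context(dir, inode)) {
		iput(inode);
		return ERR_PTR(-EPERM);		/* policy mismatch: deny */
	}
	return d_splice_alias(inode, dentry);
}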
+ */ + + err = llcrypt_get_encryption_info(parent); + if (err) + return 0; + err = llcrypt_get_encryption_info(child); + if (err) + return 0; + + err = llcrypt_get_policy(parent, &parent_policy); + if (err) + return 0; + + err = llcrypt_get_policy(child, &child_policy); + if (err) + return 0; + + return llcrypt_policies_equal(&parent_policy, &child_policy); +} +EXPORT_SYMBOL(llcrypt_has_permitted_context); + +/** + * llcrypt_inherit_context() - Sets a child context from its parent + * @parent: Parent inode from which the context is inherited. + * @child: Child inode that inherits the context from @parent. + * @fs_data: private data given by FS. + * @preload: preload child crypt info if true + * + * Return: 0 on success, -errno on failure + */ +int llcrypt_inherit_context(struct inode *parent, struct inode *child, + void *fs_data, bool preload) +{ + union llcrypt_context ctx; + int ctxsize; + struct llcrypt_info *ci; + struct lustre_sb_info *lsi = s2lsi(parent->i_sb); + int res; + + res = llcrypt_get_encryption_info(parent); + if (res < 0) + return res; + + ci = (struct llcrypt_info *)READ_ONCE(llcrypt_info_nocast(parent)); + if (ci == NULL) + return -ENOKEY; + + if (!lsi) + return -ENOKEY; + + ctxsize = llcrypt_new_context_from_policy(&ctx, &ci->ci_policy); + + BUILD_BUG_ON(sizeof(ctx) != LLCRYPT_SET_CONTEXT_MAX_SIZE); + res = lsi->lsi_cop->set_context(child, &ctx, ctxsize, fs_data); + if (res) + return res; + return preload ? llcrypt_get_encryption_info(child): 0; +} +EXPORT_SYMBOL(llcrypt_inherit_context); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/debug.c b/drivers/staging/lustrefsx/libcfs/libcfs/debug.c new file mode 100644 index 0000000000000..f8ad1461cb6d9 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/debug.c @@ -0,0 +1,736 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * libcfs/libcfs/debug.c
+ *
+ * Author: Phil Schwan
+ *
+ */
+
+# define DEBUG_SUBSYSTEM S_LNET
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#ifdef HAVE_PANIC_NOTIFIER_H
+#include
+#endif
+#include "tracefile.h"
+
+static char debug_file_name[1024];
+
+unsigned int libcfs_subsystem_debug = LIBCFS_S_DEFAULT;
+EXPORT_SYMBOL(libcfs_subsystem_debug);
+module_param(libcfs_subsystem_debug, int, 0644);
+MODULE_PARM_DESC(libcfs_subsystem_debug, "Lustre kernel debug subsystem mask");
+
+unsigned int libcfs_debug = LIBCFS_D_DEFAULT;
+EXPORT_SYMBOL(libcfs_debug);
+module_param(libcfs_debug, int, 0644);
+MODULE_PARM_DESC(libcfs_debug, "Lustre kernel debug mask");
+
+static int libcfs_param_debug_mb_set(const char *val,
+				     cfs_kernel_param_arg_t *kp)
+{
+	int rc;
+	unsigned int num;
+
+	rc = kstrtouint(val, 0, &num);
+	if (rc < 0)
+		return rc;
+
+	num = cfs_trace_set_debug_mb(num);
+
+	*((unsigned int *)kp->arg) = num;
+	num = cfs_trace_get_debug_mb();
+	if (num)
+		/* This value is more precise */
+		*((unsigned int *)kp->arg) = num;
+
+	return 0;
+}
+
+/* While the debug_mb setting looks like an unsigned int, it actually needs
+ * quite a bit of extra processing, so we define a special debug_mb parameter
+ * type with corresponding methods to handle this case
+ */
+static const struct kernel_param_ops param_ops_debug_mb = {
+	.set = libcfs_param_debug_mb_set,
+	.get = param_get_uint,
+};
+
+#define param_check_debug_mb(name, p) \
+		__param_check(name, p, unsigned int)
+
+static unsigned int libcfs_debug_mb;
+#ifdef HAVE_KERNEL_PARAM_OPS
+module_param(libcfs_debug_mb, debug_mb, 0644);
+#else
+module_param_call(libcfs_debug_mb, libcfs_param_debug_mb_set, param_get_uint,
+		  &param_ops_debug_mb, 0644);
+#endif
+MODULE_PARM_DESC(libcfs_debug_mb, "Total debug buffer size.");
+
+unsigned int libcfs_printk = D_CANTMASK;
+module_param(libcfs_printk, uint, 0644);
+MODULE_PARM_DESC(libcfs_printk, "Lustre kernel debug console mask");
+
+unsigned int libcfs_console_ratelimit = 1;
+module_param(libcfs_console_ratelimit, uint, 0644);
+MODULE_PARM_DESC(libcfs_console_ratelimit, "Lustre kernel debug console ratelimit (0 to disable)");
+
+static int param_set_delay_minmax(const char *val,
+				  cfs_kernel_param_arg_t *kp,
+				  long min, long max)
+{
+	long d;
+	int sec;
+	int rc;
+
+	rc = kstrtoint(val, 0, &sec);
+	if (rc)
+		return -EINVAL;
+
+	/* The sysfs setting is in centiseconds */
+	d = cfs_time_seconds(sec) / 100;
+	if (d < min || d > max)
+		return -EINVAL;
+
+	*((unsigned int *)kp->arg) = d;
+
+	return 0;
+}
+
+static int param_get_delay(char *buffer, cfs_kernel_param_arg_t *kp)
+{
+	unsigned int d = *(unsigned int *)kp->arg;
+
+	param_get_byte(buffer, kp);
+	return sprintf(buffer, "%lu%c", jiffies_to_msecs(d * 10) / MSEC_PER_SEC,
+		       strnchr(buffer, PAGE_SIZE, '\n') ?
'\n' : '\0');
+}
+
+unsigned int libcfs_console_max_delay;
+unsigned int libcfs_console_min_delay;
+
+static int param_set_console_max_delay(const char *val,
+				       cfs_kernel_param_arg_t *kp)
+{
+	return param_set_delay_minmax(val, kp,
+				      libcfs_console_min_delay, INT_MAX);
+}
+
+static const struct kernel_param_ops param_ops_console_max_delay = {
+	.set = param_set_console_max_delay,
+	.get = param_get_delay,
+};
+
+#define param_check_console_max_delay(name, p) \
+		__param_check(name, p, unsigned int)
+
+#ifdef HAVE_KERNEL_PARAM_OPS
+module_param(libcfs_console_max_delay, console_max_delay, 0644);
+#else
+module_param_call(libcfs_console_max_delay, param_set_console_max_delay,
+		  param_get_delay, &param_ops_console_max_delay, 0644);
+#endif
+MODULE_PARM_DESC(libcfs_console_max_delay, "Lustre kernel debug console max delay (jiffies)");
+
+static int param_set_console_min_delay(const char *val,
+				       cfs_kernel_param_arg_t *kp)
+{
+	return param_set_delay_minmax(val, kp,
+				      1, libcfs_console_max_delay);
+}
+
+static const struct kernel_param_ops param_ops_console_min_delay = {
+	.set = param_set_console_min_delay,
+	.get = param_get_delay,
+};
+
+#define param_check_console_min_delay(name, p) \
+		__param_check(name, p, unsigned int)
+
+#ifdef HAVE_KERNEL_PARAM_OPS
+module_param(libcfs_console_min_delay, console_min_delay, 0644);
+#else
+module_param_call(libcfs_console_min_delay, param_set_console_min_delay,
+		  param_get_delay, &param_ops_console_min_delay, 0644);
+#endif
+MODULE_PARM_DESC(libcfs_console_min_delay, "Lustre kernel debug console min delay (jiffies)");
+
+#ifndef HAVE_PARAM_SET_UINT_MINMAX
+static int param_set_uint_minmax(const char *val,
+				 cfs_kernel_param_arg_t *kp,
+				 unsigned int min, unsigned int max)
+{
+	unsigned int num;
+	int ret;
+
+	if (!val)
+		return -EINVAL;
+
+	ret = kstrtouint(val, 0, &num);
+	if (ret < 0 || num < min || num > max)
+		return -EINVAL;
+
+	*((unsigned int *)kp->arg) = num;
+	return 0;
+}
+#endif
+
+static int param_set_uintpos(const char *val,
+			     cfs_kernel_param_arg_t *kp)
+{
+	return param_set_uint_minmax(val, kp, 1, -1);
+}
+
+static const struct kernel_param_ops param_ops_uintpos = {
+	.set = param_set_uintpos,
+	.get = param_get_uint,
+};
+
+#define param_check_uintpos(name, p) \
+		__param_check(name, p, unsigned int)
+
+unsigned int libcfs_console_backoff = CDEBUG_DEFAULT_BACKOFF;
+#ifdef HAVE_KERNEL_PARAM_OPS
+module_param(libcfs_console_backoff, uintpos, 0644);
+#else
+module_param_call(libcfs_console_backoff, param_set_uintpos, param_get_uint,
+		  &param_ops_uintpos, 0644);
+#endif
+MODULE_PARM_DESC(libcfs_console_backoff, "Lustre kernel debug console backoff factor");
+
+unsigned int libcfs_debug_binary = 1;
+
+unsigned int libcfs_stack = 3 * THREAD_SIZE / 4;
+EXPORT_SYMBOL(libcfs_stack);
+
+unsigned int libcfs_catastrophe;
+EXPORT_SYMBOL(libcfs_catastrophe);
+
+unsigned int libcfs_watchdog_ratelimit = 300;
+EXPORT_SYMBOL(libcfs_watchdog_ratelimit);
+
+unsigned int libcfs_panic_on_lbug = 1;
+module_param(libcfs_panic_on_lbug, uint, 0644);
+MODULE_PARM_DESC(libcfs_panic_on_lbug, "Lustre kernel panic on LBUG");
+
+atomic64_t libcfs_kmem = ATOMIC64_INIT(0);
+EXPORT_SYMBOL(libcfs_kmem);
+
+static DECLARE_COMPLETION(debug_complete);
+
+/* We need to pass a pointer here, but elsewhere this must be a const */
+char *libcfs_debug_file_path = LIBCFS_DEBUG_FILE_PATH_DEFAULT;
+EXPORT_SYMBOL(libcfs_debug_file_path);
+module_param(libcfs_debug_file_path, charp, 0644);
+MODULE_PARM_DESC(libcfs_debug_file_path,
+		 "Path for dumping debug logs, set 'NONE' to prevent log dumping");
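+
+/*
+ * The custom-ops pattern used above generalizes to any module parameter
+ * that needs validation.  A minimal sketch (hypothetical "example_limit"
+ * parameter, not part of this file):
+ *
+ *	static unsigned int example_limit;
+ *
+ *	static int example_limit_set(const char *val,
+ *				     const struct kernel_param *kp)
+ *	{
+ *		unsigned int num;
+ *		int rc = kstrtouint(val, 0, &num);
+ *
+ *		if (rc)
+ *			return rc;
+ *		if (num > 100)
+ *			return -EINVAL;
+ *		*(unsigned int *)kp->arg = num;
+ *		return 0;
+ *	}
+ *
+ *	static const struct kernel_param_ops example_limit_ops = {
+ *		.set = example_limit_set,
+ *		.get = param_get_uint,
+ *	};
+ *	module_param_cb(example_limit, &example_limit_ops, &example_limit, 0644);
+ */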
+ +int libcfs_panic_in_progress; + +/* libcfs_debug_token2mask() expects the returned string in lower-case */ +static const char *libcfs_debug_subsys2str(int subsys) +{ + static const char *const libcfs_debug_subsystems[] = + LIBCFS_DEBUG_SUBSYS_NAMES; + + if (subsys >= ARRAY_SIZE(libcfs_debug_subsystems)) + return NULL; + + return libcfs_debug_subsystems[subsys]; +} + +/* libcfs_debug_token2mask() expects the returned string in lower-case */ +static const char *libcfs_debug_dbg2str(int debug) +{ + static const char * const libcfs_debug_masks[] = + LIBCFS_DEBUG_MASKS_NAMES; + + if (debug >= ARRAY_SIZE(libcfs_debug_masks)) + return NULL; + + return libcfs_debug_masks[debug]; +} + +int +libcfs_debug_mask2str(char *str, int size, int mask, int is_subsys) +{ + const char *(*fn)(int bit) = is_subsys ? libcfs_debug_subsys2str : + libcfs_debug_dbg2str; + int len = 0; + const char *token; + int i; + + if (mask == 0) { /* "0" */ + if (size > 0) + str[0] = '0'; + len = 1; + } else { /* space-separated tokens */ + for (i = 0; i < 32; i++) { + if ((mask & BIT(i)) == 0) + continue; + + token = fn(i); + if (!token) /* unused bit */ + continue; + + if (len > 0) { /* separator? */ + if (len < size) + str[len] = ' '; + len++; + } + + while (*token != 0) { + if (len < size) + str[len] = *token; + token++; + len++; + } + } + } + + /* terminate 'str' */ + if (len < size) + str[len] = 0; + else + str[size - 1] = 0; + + return len; +} + +int +libcfs_debug_str2mask(int *mask, const char *str, int is_subsys) +{ + const char *(*fn)(int bit) = is_subsys ? libcfs_debug_subsys2str : + libcfs_debug_dbg2str; + int m = 0; + int matched; + int n; + int t; + + /* Allow a number for backwards compatibility */ + for (n = strlen(str); n > 0; n--) + if (!isspace(str[n-1])) + break; + matched = n; + t = sscanf(str, "%i%n", &m, &matched); + if (t >= 1 && matched == n) { + /* don't print warning for lctl set_param debug=0 or -1 */ + if (m != 0 && m != -1) + CWARN("You are trying to use a numerical value for the mask - this will be deprecated in a future release.\n"); + *mask = m; + return 0; + } + + return cfs_str2mask(str, fn, mask, is_subsys ? 0 : D_CANTMASK, ~0, + is_subsys ? LIBCFS_S_DEFAULT : LIBCFS_D_DEFAULT); +} + +char lnet_debug_log_upcall[1024] = "/usr/lib/lustre/lnet_debug_log_upcall"; + +/* Upcall function once a Lustre log has been dumped. 
+ *
+ * @file path of the dumped log
+ */
+static void libcfs_run_debug_log_upcall(char *file)
+{
+	char *argv[3];
+	int rc;
+	static const char * const envp[] = {
+		"HOME=/",
+		"PATH=/sbin:/bin:/usr/sbin:/usr/bin",
+		NULL
+	};
+
+	ENTRY;
+	argv[0] = lnet_debug_log_upcall;
+
+	LASSERTF(file, "called on a null filename\n");
+	argv[1] = file;	/* only need to pass the path of the file */
+
+	argv[2] = NULL;
+
+	rc = call_usermodehelper(argv[0], argv, (char **)envp, 1);
+	if (rc < 0 && rc != -ENOENT) {
+		CERROR("Error %d invoking LNET debug log upcall %s %s; check /sys/kernel/debug/lnet/debug_log_upcall\n",
+		       rc, argv[0], argv[1]);
+	} else {
+		CDEBUG(D_HA, "Invoked LNET debug log upcall %s %s\n",
+		       argv[0], argv[1]);
+	}
+}
+
+/**
+ * Dump the Lustre log to ::debug_file_path by calling
+ * cfs_tracefile_dump_all_pages()
+ */
+static void libcfs_debug_dumplog_internal(void *arg)
+{
+	static time64_t last_dump_time;
+	time64_t current_time;
+
+	current_time = ktime_get_real_seconds();
+
+	if (strncmp(libcfs_debug_file_path, "NONE", 4) != 0 &&
+	    current_time > last_dump_time) {
+		last_dump_time = current_time;
+		snprintf(debug_file_name, sizeof(debug_file_name) - 1,
+			 "%s.%lld.%ld", libcfs_debug_file_path,
+			 (s64)current_time, (uintptr_t)arg);
+		pr_alert("LustreError: dumping log to %s\n", debug_file_name);
+		cfs_tracefile_dump_all_pages(debug_file_name);
+		libcfs_run_debug_log_upcall(debug_file_name);
+	}
+}
+
+static int libcfs_debug_dumplog_thread(void *arg)
+{
+	libcfs_debug_dumplog_internal(arg);
+	complete(&debug_complete);
+	return 0;
+}
+
+static DEFINE_MUTEX(libcfs_debug_dumplog_lock);
+
+void libcfs_debug_dumplog(void)
+{
+	struct task_struct *dumper;
+
+	ENTRY;
+
+	if (mutex_trylock(&libcfs_debug_dumplog_lock) == 0)
+		return;
+
+	/* If a previous call was interrupted, debug_complete->done
+	 * might be elevated, and so we won't actually wait here.
+	 * So we reinit the completion to ensure we wait for
+	 * one thread to complete, though it might not be the one
+	 * we start if there are overlapping threads.
+ */ + reinit_completion(&debug_complete); + dumper = kthread_run(libcfs_debug_dumplog_thread, + (void *)(long)current->pid, + "libcfs_debug_dumper"); + if (IS_ERR(dumper)) + pr_err("LustreError: cannot start log dump thread: rc = %ld\n", + PTR_ERR(dumper)); + else + wait_for_completion_interruptible(&debug_complete); + + mutex_unlock(&libcfs_debug_dumplog_lock); +} +EXPORT_SYMBOL(libcfs_debug_dumplog); + +/* coverity[+kill] */ +void __noreturn lbug_with_loc(struct libcfs_debug_msg_data *msgdata) +{ + libcfs_catastrophe = 1; + libcfs_debug_msg(msgdata, "LBUG\n"); + + if (in_interrupt()) { + panic("LBUG in interrupt.\n"); + /* not reached */ + } + + libcfs_debug_dumpstack(NULL); + if (libcfs_panic_on_lbug) + panic("LBUG"); + else + libcfs_debug_dumplog(); + set_current_state(TASK_UNINTERRUPTIBLE); + while (1) + schedule(); +} +EXPORT_SYMBOL(lbug_with_loc); + +#ifdef CONFIG_STACKTRACE + +#ifndef HAVE_SAVE_STACK_TRACE_TSK +#define save_stack_trace_tsk(tsk, trace) \ +do { \ + if (tsk == current) \ + save_stack_trace(trace); \ + else \ + pr_info("No stack, save_stack_trace_tsk() not exported\n"); \ +} while (0) +#endif + +static void cfs_print_stack_trace(unsigned long *entries, unsigned int nr) +{ + unsigned int i; + + /* Prefer %pB for backtraced symbolic names since it was added in: + * Linux v2.6.38-6557-g0f77a8d37825 + * vsprintf: Introduce %pB format specifier + */ + for (i = 0; i < nr; i++) + pr_info("[<0>] %pB\n", (void *)entries[i]); +} + +#define MAX_ST_ENTRIES 100 +static DEFINE_SPINLOCK(st_lock); + +/* Linux v5.1-rc5 214d8ca6ee ("stacktrace: Provide common infrastructure") + * CONFIG_ARCH_STACKWALK indicates that save_stack_trace_tsk symbol is not + * exported. Use symbol_get() to find if save_stack_trace_tsk is available. + */ +#ifdef CONFIG_ARCH_STACKWALK +typedef unsigned int (stack_trace_save_tsk_t)(struct task_struct *task, + unsigned long *store, + unsigned int size, + unsigned int skipnr); +static stack_trace_save_tsk_t *task_dump_stack; +#endif + +void __init cfs_debug_init(void) +{ +#ifdef CONFIG_ARCH_STACKWALK + task_dump_stack = (void *) + cfs_kallsyms_lookup_name("stack_trace_save_tsk"); + +#endif +} + +static void libcfs_call_trace(struct task_struct *tsk) +{ + static unsigned long entries[MAX_ST_ENTRIES]; +#ifdef CONFIG_ARCH_STACKWALK + unsigned int nr_entries; + + spin_lock(&st_lock); + pr_info("Pid: %d, comm: %.20s %s %s\n", tsk->pid, tsk->comm, + init_utsname()->release, init_utsname()->version); + pr_info("Call Trace TBD:\n"); + if (task_dump_stack) { + nr_entries = task_dump_stack(tsk, entries, MAX_ST_ENTRIES, 0); + cfs_print_stack_trace(entries, nr_entries); + } + spin_unlock(&st_lock); +#else + struct stack_trace trace; + + trace.nr_entries = 0; + trace.max_entries = MAX_ST_ENTRIES; + trace.entries = entries; + trace.skip = 0; + + spin_lock(&st_lock); + pr_info("Pid: %d, comm: %.20s %s %s\n", tsk->pid, tsk->comm, + init_utsname()->release, init_utsname()->version); + pr_info("Call Trace:\n"); + save_stack_trace_tsk(tsk, &trace); + cfs_print_stack_trace(trace.entries, trace.nr_entries); + spin_unlock(&st_lock); +#endif +} + +#else /* !CONFIG_STACKTRACE */ + +#ifdef CONFIG_X86 +#include +#include + +#ifdef HAVE_STACKTRACE_OPS +static int print_trace_stack(void *data, char *name) +{ + printk(" <%s> ", name); + return 0; +} + +#ifdef STACKTRACE_OPS_ADDRESS_RETURN_INT +static int +#else +static void +#endif +print_trace_address(void *data, unsigned long addr, int reliable) +{ + char fmt[32]; + + touch_nmi_watchdog(); + sprintf(fmt, " [<%016lx>] %s%%s\n", addr, 
reliable ? "" : "? "); + __print_symbol(fmt, addr); +#ifdef STACKTRACE_OPS_ADDRESS_RETURN_INT + return 0; +#endif +} + +static const struct stacktrace_ops print_trace_ops = { + .stack = print_trace_stack, + .address = print_trace_address, + .walk_stack = print_context_stack, +}; +#endif /* HAVE_STACKTRACE_OPS */ + +static void libcfs_call_trace(struct task_struct *tsk) +{ +#ifdef HAVE_STACKTRACE_OPS + printk("Pid: %d, comm: %.20s\n", tsk->pid, tsk->comm); + printk("\nCall Trace:\n"); + dump_trace(tsk, NULL, NULL, 0, &print_trace_ops, NULL); + printk("\n"); +#else /* !HAVE_STACKTRACE_OPS */ + if (tsk == current) + dump_stack(); + else + CWARN("can't show stack: kernel doesn't export show_task\n"); +#endif /* HAVE_STACKTRACE_OPS */ +} + +#else /* !CONFIG_X86 */ + +static void libcfs_call_trace(struct task_struct *tsk) +{ + if (tsk == current) + dump_stack(); + else + CWARN("can't show stack: kernel doesn't export show_task\n"); +} + +#endif /* CONFIG_X86 */ + +#endif /* CONFIG_STACKTRACE */ + +void libcfs_debug_dumpstack(struct task_struct *tsk) +{ + libcfs_call_trace(tsk ?: current); +} +EXPORT_SYMBOL(libcfs_debug_dumpstack); + +static int panic_notifier(struct notifier_block *self, unsigned long unused1, + void *unused2) +{ + if (libcfs_panic_in_progress) + return 0; + + libcfs_panic_in_progress = 1; + mb(); + +#ifdef LNET_DUMP_ON_PANIC + /* This is currently disabled because it spews far too much to the + * console on the rare cases it is ever triggered. */ + + if (in_interrupt()) { + cfs_trace_debug_print(); + } else { + libcfs_debug_dumplog_internal((void *)(long)current->pid); + } +#endif + return 0; +} + +static struct notifier_block libcfs_panic_notifier = { + .notifier_call = panic_notifier, + .next = NULL, + .priority = 10000, +}; + +static void libcfs_register_panic_notifier(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, + &libcfs_panic_notifier); +} + +static void libcfs_unregister_panic_notifier(void) +{ + atomic_notifier_chain_unregister(&panic_notifier_list, + &libcfs_panic_notifier); +} + +int libcfs_debug_init(unsigned long bufsize) +{ + unsigned int max = libcfs_debug_mb; + int rc = 0; + + if (libcfs_console_max_delay <= 0 || /* not set by user or */ + libcfs_console_min_delay <= 0 || /* set to invalid values */ + libcfs_console_min_delay >= libcfs_console_max_delay) { + libcfs_console_max_delay = CDEBUG_DEFAULT_MAX_DELAY; + libcfs_console_min_delay = CDEBUG_DEFAULT_MIN_DELAY; + } + + /* If libcfs_debug_mb is uninitialized then just make the + * total buffers smp_num_cpus * TCD_MAX_PAGES + */ + if (max < num_possible_cpus()) { + max = TCD_MAX_PAGES; + } else { + max = (max / num_possible_cpus()); + max <<= (20 - PAGE_SHIFT); + } + + rc = cfs_tracefile_init(max); + if (rc) + return rc; + + libcfs_register_panic_notifier(); + kernel_param_lock(THIS_MODULE); + if (libcfs_debug_mb == 0) + libcfs_debug_mb = cfs_trace_get_debug_mb(); + kernel_param_unlock(THIS_MODULE); + return rc; +} + +int libcfs_debug_cleanup(void) +{ + libcfs_unregister_panic_notifier(); + kernel_param_lock(THIS_MODULE); + cfs_tracefile_exit(); + kernel_param_unlock(THIS_MODULE); + return 0; +} + +int libcfs_debug_clear_buffer(void) +{ + cfs_trace_flush_pages(); + return 0; +} + +/* Debug markers, although printed by S_LNET should not be be marked as such. 
*/ +#undef DEBUG_SUBSYSTEM +#define DEBUG_SUBSYSTEM S_UNDEFINED +int libcfs_debug_mark_buffer(const char *text) +{ + CDEBUG(D_TRACE, + "**************************************************\n"); + LCONSOLE(D_WARNING, "DEBUG MARKER: %s\n", text); + CDEBUG(D_TRACE, + "**************************************************\n"); + + return 0; +} + +#undef DEBUG_SUBSYSTEM +#define DEBUG_SUBSYSTEM S_LNET diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/fail.c b/drivers/staging/lustrefsx/libcfs/libcfs/fail.c new file mode 100644 index 0000000000000..5623e3f226fa6 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/fail.c @@ -0,0 +1,153 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Oracle Corporation, Inc. + */ + +#include +#include +#include +#include +#include + +unsigned long cfs_fail_loc; +EXPORT_SYMBOL(cfs_fail_loc); + +unsigned int cfs_fail_val; +EXPORT_SYMBOL(cfs_fail_val); + +int cfs_fail_err; +EXPORT_SYMBOL(cfs_fail_err); + +DECLARE_WAIT_QUEUE_HEAD(cfs_race_waitq); +EXPORT_SYMBOL(cfs_race_waitq); + +int cfs_race_state; +EXPORT_SYMBOL(cfs_race_state); + +int __cfs_fail_check_set(u32 id, u32 value, int set) +{ + static atomic_t cfs_fail_count = ATOMIC_INIT(0); + + LASSERT(!(id & CFS_FAIL_ONCE)); + + if ((cfs_fail_loc & (CFS_FAILED | CFS_FAIL_ONCE)) == + (CFS_FAILED | CFS_FAIL_ONCE)) { + atomic_set(&cfs_fail_count, 0); /* paranoia */ + return 0; + } + + /* Fail 1/cfs_fail_val times */ + if (cfs_fail_loc & CFS_FAIL_RAND) { + if (cfs_fail_val < 2 || get_random_u32_below(cfs_fail_val) > 0) + return 0; + } + + /* Skip the first cfs_fail_val, then fail */ + if (cfs_fail_loc & CFS_FAIL_SKIP) { + if (atomic_inc_return(&cfs_fail_count) <= cfs_fail_val) + return 0; + } + + /* check cfs_fail_val... 
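+	 * against the value passed in by the fail site: with
+	 * CFS_FAIL_LOC_VALUE the point only fires on a match (or when
+	 * cfs_fail_val is -1, meaning "any value").
+	 *
+	 * As an illustration (an editorial sketch; CFS_FAIL_CHECK_VALUE is
+	 * the wrapper macro and OBD_FAIL_EXAMPLE a hypothetical id), a
+	 * test site such as
+	 *
+	 *	if (CFS_FAIL_CHECK_VALUE(OBD_FAIL_EXAMPLE, 3))
+	 *		return -EIO;
+	 *
+	 * triggers only while fail_loc == OBD_FAIL_EXAMPLE and fail_val == 3.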
*/ + if (set == CFS_FAIL_LOC_VALUE) { + if (cfs_fail_val != -1 && cfs_fail_val != value) + return 0; + } + + /* Fail cfs_fail_val times, overridden by FAIL_ONCE */ + if (cfs_fail_loc & CFS_FAIL_SOME && + (!(cfs_fail_loc & CFS_FAIL_ONCE) || cfs_fail_val <= 1)) { + int count = atomic_inc_return(&cfs_fail_count); + + if (count >= cfs_fail_val) { + set_bit(CFS_FAIL_ONCE_BIT, &cfs_fail_loc); + atomic_set(&cfs_fail_count, 0); + /* we are lost race to increase */ + if (count > cfs_fail_val) + return 0; + } + } + + /* Take into account the current call for FAIL_ONCE for ORSET only, + * as RESET is a new fail_loc, it does not change the current call + */ + if ((set == CFS_FAIL_LOC_ORSET) && (value & CFS_FAIL_ONCE)) + set_bit(CFS_FAIL_ONCE_BIT, &cfs_fail_loc); + /* Lost race to set CFS_FAILED_BIT. */ + if (test_and_set_bit(CFS_FAILED_BIT, &cfs_fail_loc)) { + /* If CFS_FAIL_ONCE is valid, only one process can fail, + * otherwise multi-process can fail at the same time. + */ + if (cfs_fail_loc & CFS_FAIL_ONCE) + return 0; + } + + switch (set) { + case CFS_FAIL_LOC_NOSET: + case CFS_FAIL_LOC_VALUE: + break; + case CFS_FAIL_LOC_ORSET: + cfs_fail_loc |= value & ~(CFS_FAILED | CFS_FAIL_ONCE); + break; + case CFS_FAIL_LOC_RESET: + cfs_fail_loc = value; + atomic_set(&cfs_fail_count, 0); + break; + default: + LASSERTF(0, "called with bad set %u\n", set); + break; + } + + return 1; +} +EXPORT_SYMBOL(__cfs_fail_check_set); + +int __cfs_fail_timeout_set(u32 id, u32 value, int ms, int set) +{ + ktime_t till = ktime_add_ms(ktime_get(), ms); + int ret; + + ret = __cfs_fail_check_set(id, value, set); + if (ret && likely(ms > 0)) { + CERROR("cfs_fail_timeout id %x sleeping for %dms\n", id, ms); + while (ktime_before(ktime_get(), till)) { + schedule_timeout_uninterruptible(cfs_time_seconds(1) + / 10); + set_current_state(TASK_RUNNING); + if (!cfs_fail_loc) { + CERROR("cfs_fail_timeout interrupted\n"); + break; + } + } + if (cfs_fail_loc) + CERROR("cfs_fail_timeout id %x awake\n", id); + } + return ret; +} +EXPORT_SYMBOL(__cfs_fail_timeout_set); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/hash.c b/drivers/staging/lustrefsx/libcfs/libcfs/hash.c new file mode 100644 index 0000000000000..ca234ef096229 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/hash.c @@ -0,0 +1,2126 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * libcfs/libcfs/hash.c + * + * Implement a hash class for hash process in lustre system. 
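+ *
+ * In outline (an editorial sketch of typical use; the ops wiring and
+ * error handling are omitted), a user creates a table and then operates
+ * on items by key:
+ *
+ *	hs = cfs_hash_create("example", 10, 14, 6, 0,
+ *			     CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA,
+ *			     &example_hash_ops, CFS_HASH_DEFAULT);
+ *	cfs_hash_add(hs, &obj->key, &obj->hnode);
+ *	obj = cfs_hash_lookup(hs, &key);	// takes a ref via hs_get
+ *	cfs_hash_putref(hs);			// drop the table reference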
+ *
+ * Author: YuZhangyong
+ *
+ * 2008-08-15: Brian Behlendorf
+ *   - Simplified API and improved documentation
+ *   - Added per-hash feature flags:
+ *     * CFS_HASH_DEBUG additional validation
+ *     * CFS_HASH_REHASH dynamic rehashing
+ *   - Added per-hash statistics
+ *   - General performance enhancements
+ *
+ * 2009-07-31: Liang Zhen
+ *   - moved all of this to libcfs
+ *   - don't allow cur_bits != max_bits without CFS_HASH_REHASH set
+ *   - ignore hs_rwlock if CFS_HASH_REHASH is not set
+ *   - buckets are allocated one by one (instead of as contiguous memory),
+ *     to avoid unnecessary cacheline conflicts
+ *
+ * 2010-03-01: Liang Zhen
+ *   - "bucket" is a group of hlist_head now, user can specify bucket size
+ *     by bkt_bits of cfs_hash_create(), all hlist_heads in a bucket share
+ *     one lock for reducing memory overhead.
+ *
+ *   - support lockless hash, caller will take care of locks:
+ *     avoid lock overhead for hash tables that are already protected
+ *     by locking in the caller for another reason
+ *
+ *   - support both spin_lock/rwlock for bucket:
+ *     overhead of spinlock contention is lower than read/write
+ *     contention of rwlock, so using spinlock to serialize operations on
+ *     a bucket is more reasonable for frequently changed hash tables
+ *
+ *   - support single-lock mode:
+ *     one lock to protect all hash operations to avoid the overhead of
+ *     multiple locks if the hash table is always small
+ *
+ *   - removed a lot of unnecessary addref & decref on hash elements:
+ *     addref & decref are atomic operations, which are expensive in many
+ *     use-cases.
+ *
+ *   - support non-blocking cfs_hash_add() and cfs_hash_findadd():
+ *     some lustre use-cases require these functions to be strictly
+ *     non-blocking, so we need to schedule any required rehash on a
+ *     different thread in those cases.
+ *
+ *   - safer rehash on large hash tables
+ *     In the old implementation, the rehash function would exclusively
+ *     lock the hash table and finish the rehash in one batch, which is
+ *     dangerous on SMP systems because rehashing millions of elements
+ *     can take a long time.  The new rehash implementation can release
+ *     the lock and relax the CPU in the middle of a rehash, so it is
+ *     safe for another thread to search/change the hash table even
+ *     while it is rehashing.
+ *
+ *   - support two different refcount modes
+ *     . hash table has refcount on element
+ *     . hash table doesn't change refcount on adding/removing element
+ *
+ *   - support long name hash table (for param-tree)
+ *
+ *   - fix a bug for cfs_hash_rehash_key:
+ *     in the old implementation, cfs_hash_rehash_key could screw up the
+ *     hash-table because @key was overwritten without any protection.
+ *     Now we require the user to define hs_keycpy for rehash-enabled
+ *     hash tables; cfs_hash_rehash_key will overwrite the hash-key
+ *     inside the lock by calling hs_keycpy.
+ *
+ *   - better hash iteration:
+ *     Now we support both locked iteration & lockless iteration of hash
+ *     tables.  Also, the user can break the iteration by returning 1
+ *     from the callback.
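+ *
+ *     For example (a sketch only), a callback that captures the first
+ *     item and stops the walk:
+ *
+ *	static int find_first_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ *				 struct hlist_node *hnode, void *data)
+ *	{
+ *		*(void **)data = cfs_hash_object(hs, hnode);
+ *		return 1;	// non-zero breaks the iteration
+ *	}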
+ */ +#include +#include + +#include +#include + +#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 +static unsigned int warn_on_depth = 8; +module_param(warn_on_depth, uint, 0644); +MODULE_PARM_DESC(warn_on_depth, "warning when hash depth is high."); +#endif + +struct workqueue_struct *cfs_rehash_wq; + +static inline void +cfs_hash_nl_lock(union cfs_hash_lock *lock, int exclusive) {} + +static inline void +cfs_hash_nl_unlock(union cfs_hash_lock *lock, int exclusive) {} + +static inline void +cfs_hash_spin_lock(union cfs_hash_lock *lock, int exclusive) + __acquires(&lock->spin) +{ + spin_lock(&lock->spin); +} + +static inline void +cfs_hash_spin_unlock(union cfs_hash_lock *lock, int exclusive) + __releases(&lock->spin) +{ + spin_unlock(&lock->spin); +} + +static inline void +cfs_hash_rw_lock(union cfs_hash_lock *lock, int exclusive) + __acquires(&lock->rw) +{ + if (!exclusive) + read_lock(&lock->rw); + else + write_lock(&lock->rw); +} + +static inline void +cfs_hash_rw_unlock(union cfs_hash_lock *lock, int exclusive) + __releases(&lock->rw) +{ + if (!exclusive) + read_unlock(&lock->rw); + else + write_unlock(&lock->rw); +} + +static inline void +cfs_hash_rw_sem_lock(union cfs_hash_lock *lock, int exclusive) + __acquires(&lock->rw_sem) +{ + if (!exclusive) + down_read(&lock->rw_sem); + else + down_write(&lock->rw_sem); +} + +static inline void +cfs_hash_rw_sem_unlock(union cfs_hash_lock *lock, int exclusive) + __releases(&lock->rw_sem) +{ + if (!exclusive) + up_read(&lock->rw_sem); + else + up_write(&lock->rw_sem); +} + +/** No lock hash */ +static struct cfs_hash_lock_ops cfs_hash_nl_lops = { + .hs_lock = cfs_hash_nl_lock, + .hs_unlock = cfs_hash_nl_unlock, + .hs_bkt_lock = cfs_hash_nl_lock, + .hs_bkt_unlock = cfs_hash_nl_unlock, +}; + +/** no bucket lock, one spinlock to protect everything */ +static struct cfs_hash_lock_ops cfs_hash_nbl_lops = { + .hs_lock = cfs_hash_spin_lock, + .hs_unlock = cfs_hash_spin_unlock, + .hs_bkt_lock = cfs_hash_nl_lock, + .hs_bkt_unlock = cfs_hash_nl_unlock, +}; + +/** spin bucket lock, rehash is enabled */ +static struct cfs_hash_lock_ops cfs_hash_bkt_spin_lops = { + .hs_lock = cfs_hash_rw_lock, + .hs_unlock = cfs_hash_rw_unlock, + .hs_bkt_lock = cfs_hash_spin_lock, + .hs_bkt_unlock = cfs_hash_spin_unlock, +}; + +/** rw bucket lock, rehash is enabled */ +static struct cfs_hash_lock_ops cfs_hash_bkt_rw_lops = { + .hs_lock = cfs_hash_rw_lock, + .hs_unlock = cfs_hash_rw_unlock, + .hs_bkt_lock = cfs_hash_rw_lock, + .hs_bkt_unlock = cfs_hash_rw_unlock, +}; + +/** spin bucket lock, rehash is disabled */ +static struct cfs_hash_lock_ops cfs_hash_nr_bkt_spin_lops = { + .hs_lock = cfs_hash_nl_lock, + .hs_unlock = cfs_hash_nl_unlock, + .hs_bkt_lock = cfs_hash_spin_lock, + .hs_bkt_unlock = cfs_hash_spin_unlock, +}; + +/** rw bucket lock, rehash is disabled */ +static struct cfs_hash_lock_ops cfs_hash_nr_bkt_rw_lops = { + .hs_lock = cfs_hash_nl_lock, + .hs_unlock = cfs_hash_nl_unlock, + .hs_bkt_lock = cfs_hash_rw_lock, + .hs_bkt_unlock = cfs_hash_rw_unlock, +}; + +/** rw_sem bucket lock, rehash is disabled */ +static struct cfs_hash_lock_ops cfs_hash_nr_bkt_rw_sem_lops = { + .hs_lock = cfs_hash_nl_lock, + .hs_unlock = cfs_hash_nl_unlock, + .hs_bkt_lock = cfs_hash_rw_sem_lock, + .hs_bkt_unlock = cfs_hash_rw_sem_unlock, +}; + +/** rw_sem bucket lock, rehash is enabled */ +static struct cfs_hash_lock_ops cfs_hash_bkt_rw_sem_lops = { + .hs_lock = cfs_hash_rw_sem_lock, + .hs_unlock = cfs_hash_rw_sem_unlock, + .hs_bkt_lock = cfs_hash_rw_sem_lock, + .hs_bkt_unlock = 
cfs_hash_rw_sem_unlock, +}; + +static void +cfs_hash_lock_setup(struct cfs_hash *hs) +{ + if (cfs_hash_with_no_lock(hs)) { + hs->hs_lops = &cfs_hash_nl_lops; + + } else if (cfs_hash_with_no_bktlock(hs)) { + hs->hs_lops = &cfs_hash_nbl_lops; + spin_lock_init(&hs->hs_lock.spin); + + } else if (cfs_hash_with_rehash(hs)) { + if (cfs_hash_with_rw_sem_bktlock(hs)) { + init_rwsem(&hs->hs_lock.rw_sem); + hs->hs_lops = &cfs_hash_bkt_rw_sem_lops; + } else { + rwlock_init(&hs->hs_lock.rw); + + if (cfs_hash_with_rw_bktlock(hs)) + hs->hs_lops = &cfs_hash_bkt_rw_lops; + else if (cfs_hash_with_spin_bktlock(hs)) + hs->hs_lops = &cfs_hash_bkt_spin_lops; + else + LBUG(); + } + } else { + if (cfs_hash_with_rw_bktlock(hs)) + hs->hs_lops = &cfs_hash_nr_bkt_rw_lops; + else if (cfs_hash_with_spin_bktlock(hs)) + hs->hs_lops = &cfs_hash_nr_bkt_spin_lops; + else if (cfs_hash_with_rw_sem_bktlock(hs)) + hs->hs_lops = &cfs_hash_nr_bkt_rw_sem_lops; + else + LBUG(); + } +} + +/** + * Simple hash head without depth tracking + * new element is always added to head of hlist + */ +struct cfs_hash_head { + struct hlist_head hh_head; /**< entries list */ +}; + +static int +cfs_hash_hh_hhead_size(struct cfs_hash *hs) +{ + return sizeof(struct cfs_hash_head); +} + +static struct hlist_head * +cfs_hash_hh_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + struct cfs_hash_head *head; + + head = (struct cfs_hash_head *)&bd->bd_bucket->hsb_head[0]; + return &head[bd->bd_offset].hh_head; +} + +static int +cfs_hash_hh_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + hlist_add_head(hnode, cfs_hash_hh_hhead(hs, bd)); + return -1; /* unknown depth */ +} + +static int +cfs_hash_hh_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + hlist_del_init(hnode); + return -1; /* unknown depth */ +} + +/** + * Simple hash head with depth tracking + * new element is always added to head of hlist + */ +struct cfs_hash_head_dep { + struct hlist_head hd_head; /**< entries list */ + unsigned int hd_depth; /**< list length */ +}; + +static int +cfs_hash_hd_hhead_size(struct cfs_hash *hs) +{ + return sizeof(struct cfs_hash_head_dep); +} + +static struct hlist_head * +cfs_hash_hd_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + struct cfs_hash_head_dep *head; + + head = (struct cfs_hash_head_dep *)&bd->bd_bucket->hsb_head[0]; + return &head[bd->bd_offset].hd_head; +} + +static int +cfs_hash_hd_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + struct cfs_hash_head_dep *hh; + + hh = container_of(cfs_hash_hd_hhead(hs, bd), + struct cfs_hash_head_dep, hd_head); + hlist_add_head(hnode, &hh->hd_head); + return ++hh->hd_depth; +} + +static int +cfs_hash_hd_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + struct cfs_hash_head_dep *hh; + + hh = container_of(cfs_hash_hd_hhead(hs, bd), + struct cfs_hash_head_dep, hd_head); + hlist_del_init(hnode); + return --hh->hd_depth; +} + +/** + * double links hash head without depth tracking + * new element is always added to tail of hlist + */ +struct cfs_hash_dhead { + struct hlist_head dh_head; /**< entries list */ + struct hlist_node *dh_tail; /**< the last entry */ +}; + +static int +cfs_hash_dh_hhead_size(struct cfs_hash *hs) +{ + return sizeof(struct cfs_hash_dhead); +} + +static struct hlist_head * +cfs_hash_dh_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + struct cfs_hash_dhead *head; + + head = (struct cfs_hash_dhead *)&bd->bd_bucket->hsb_head[0]; 
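+	/*
+	 * hsb_head[] is a flexible array at the tail of each bucket: one
+	 * hlist_head per chain in the bucket (1 << hs_bkt_bits of them),
+	 * with bd_offset selecting the chain this descriptor refers to.
+	 */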
+ return &head[bd->bd_offset].dh_head; +} + +static int +cfs_hash_dh_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + struct cfs_hash_dhead *dh; + + dh = container_of(cfs_hash_dh_hhead(hs, bd), + struct cfs_hash_dhead, dh_head); + if (dh->dh_tail != NULL) /* not empty */ + hlist_add_behind(hnode, dh->dh_tail); + else /* empty list */ + hlist_add_head(hnode, &dh->dh_head); + dh->dh_tail = hnode; + return -1; /* unknown depth */ +} + +static int +cfs_hash_dh_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnd) +{ + struct cfs_hash_dhead *dh; + + dh = container_of(cfs_hash_dh_hhead(hs, bd), + struct cfs_hash_dhead, dh_head); + if (hnd->next == NULL) { /* it's the tail */ + dh->dh_tail = (hnd->pprev == &dh->dh_head.first) ? NULL : + container_of(hnd->pprev, struct hlist_node, next); + } + hlist_del_init(hnd); + return -1; /* unknown depth */ +} + +/** + * double links hash head with depth tracking + * new element is always added to tail of hlist + */ +struct cfs_hash_dhead_dep { + struct hlist_head dd_head; /**< entries list */ + struct hlist_node *dd_tail; /**< the last entry */ + unsigned int dd_depth; /**< list length */ +}; + +static int +cfs_hash_dd_hhead_size(struct cfs_hash *hs) +{ + return sizeof(struct cfs_hash_dhead_dep); +} + +static struct hlist_head * +cfs_hash_dd_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + struct cfs_hash_dhead_dep *head; + + head = (struct cfs_hash_dhead_dep *)&bd->bd_bucket->hsb_head[0]; + return &head[bd->bd_offset].dd_head; +} + +static int +cfs_hash_dd_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + struct cfs_hash_dhead_dep *dh; + + dh = container_of(cfs_hash_dd_hhead(hs, bd), + struct cfs_hash_dhead_dep, dd_head); + if (dh->dd_tail != NULL) /* not empty */ + hlist_add_behind(hnode, dh->dd_tail); + else /* empty list */ + hlist_add_head(hnode, &dh->dd_head); + dh->dd_tail = hnode; + return ++dh->dd_depth; +} + +static int +cfs_hash_dd_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnd) +{ + struct cfs_hash_dhead_dep *dh; + + dh = container_of(cfs_hash_dd_hhead(hs, bd), + struct cfs_hash_dhead_dep, dd_head); + if (hnd->next == NULL) { /* it's the tail */ + dh->dd_tail = (hnd->pprev == &dh->dd_head.first) ? NULL : + container_of(hnd->pprev, struct hlist_node, next); + } + hlist_del_init(hnd); + return --dh->dd_depth; +} + +static struct cfs_hash_hlist_ops cfs_hash_hh_hops = { + .hop_hhead = cfs_hash_hh_hhead, + .hop_hhead_size = cfs_hash_hh_hhead_size, + .hop_hnode_add = cfs_hash_hh_hnode_add, + .hop_hnode_del = cfs_hash_hh_hnode_del, +}; + +static struct cfs_hash_hlist_ops cfs_hash_hd_hops = { + .hop_hhead = cfs_hash_hd_hhead, + .hop_hhead_size = cfs_hash_hd_hhead_size, + .hop_hnode_add = cfs_hash_hd_hnode_add, + .hop_hnode_del = cfs_hash_hd_hnode_del, +}; + +static struct cfs_hash_hlist_ops cfs_hash_dh_hops = { + .hop_hhead = cfs_hash_dh_hhead, + .hop_hhead_size = cfs_hash_dh_hhead_size, + .hop_hnode_add = cfs_hash_dh_hnode_add, + .hop_hnode_del = cfs_hash_dh_hnode_del, +}; + +static struct cfs_hash_hlist_ops cfs_hash_dd_hops = { + .hop_hhead = cfs_hash_dd_hhead, + .hop_hhead_size = cfs_hash_dd_hhead_size, + .hop_hnode_add = cfs_hash_dd_hnode_add, + .hop_hnode_del = cfs_hash_dd_hnode_del, +}; + +static void +cfs_hash_hlist_setup(struct cfs_hash *hs) +{ + if (cfs_hash_with_add_tail(hs)) { + hs->hs_hops = cfs_hash_with_depth(hs) ? 
+ &cfs_hash_dd_hops : &cfs_hash_dh_hops; + } else { + hs->hs_hops = cfs_hash_with_depth(hs) ? + &cfs_hash_hd_hops : &cfs_hash_hh_hops; + } +} + +static void +cfs_hash_bd_from_key(struct cfs_hash *hs, struct cfs_hash_bucket **bkts, + unsigned int bits, const void *key, struct cfs_hash_bd *bd) +{ + unsigned int index = cfs_hash_id(hs, key, (1U << bits) - 1); + + LASSERT(bits == hs->hs_cur_bits || bits == hs->hs_rehash_bits); + + bd->bd_bucket = bkts[index & ((1U << (bits - hs->hs_bkt_bits)) - 1)]; + bd->bd_offset = index >> (bits - hs->hs_bkt_bits); +} + +void +cfs_hash_bd_get(struct cfs_hash *hs, const void *key, struct cfs_hash_bd *bd) +{ + /* NB: caller should hold hs->hs_rwlock if REHASH is set */ + if (likely(hs->hs_rehash_buckets == NULL)) { + cfs_hash_bd_from_key(hs, hs->hs_buckets, + hs->hs_cur_bits, key, bd); + } else { + LASSERT(hs->hs_rehash_bits != 0); + cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets, + hs->hs_rehash_bits, key, bd); + } +} +EXPORT_SYMBOL(cfs_hash_bd_get); + +static inline void +cfs_hash_bd_dep_record(struct cfs_hash *hs, struct cfs_hash_bd *bd, int dep_cur) +{ + if (likely(dep_cur <= bd->bd_bucket->hsb_depmax)) + return; + + bd->bd_bucket->hsb_depmax = dep_cur; +# if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 + if (likely(warn_on_depth == 0 || + max(warn_on_depth, hs->hs_dep_max) >= dep_cur)) + return; + + spin_lock(&hs->hs_dep_lock); + hs->hs_dep_max = dep_cur; + hs->hs_dep_bkt = bd->bd_bucket->hsb_index; + hs->hs_dep_off = bd->bd_offset; + hs->hs_dep_bits = hs->hs_cur_bits; + spin_unlock(&hs->hs_dep_lock); + + queue_work(cfs_rehash_wq, &hs->hs_dep_work); +# endif +} + +void +cfs_hash_bd_add_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + int rc; + + rc = hs->hs_hops->hop_hnode_add(hs, bd, hnode); + cfs_hash_bd_dep_record(hs, bd, rc); + bd->bd_bucket->hsb_version++; + if (unlikely(bd->bd_bucket->hsb_version == 0)) + bd->bd_bucket->hsb_version++; + bd->bd_bucket->hsb_count++; + + if (cfs_hash_with_counter(hs)) + atomic_inc(&hs->hs_count); + if (!cfs_hash_with_no_itemref(hs)) + cfs_hash_get(hs, hnode); +} +EXPORT_SYMBOL(cfs_hash_bd_add_locked); + +void +cfs_hash_bd_del_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + hs->hs_hops->hop_hnode_del(hs, bd, hnode); + + LASSERT(bd->bd_bucket->hsb_count > 0); + bd->bd_bucket->hsb_count--; + bd->bd_bucket->hsb_version++; + if (unlikely(bd->bd_bucket->hsb_version == 0)) + bd->bd_bucket->hsb_version++; + + if (cfs_hash_with_counter(hs)) { + LASSERT(atomic_read(&hs->hs_count) > 0); + atomic_dec(&hs->hs_count); + } + if (!cfs_hash_with_no_itemref(hs)) + cfs_hash_put_locked(hs, hnode); +} +EXPORT_SYMBOL(cfs_hash_bd_del_locked); + +void +cfs_hash_bd_move_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd_old, + struct cfs_hash_bd *bd_new, struct hlist_node *hnode) +{ + struct cfs_hash_bucket *obkt = bd_old->bd_bucket; + struct cfs_hash_bucket *nbkt = bd_new->bd_bucket; + int rc; + + if (cfs_hash_bd_compare(bd_old, bd_new) == 0) + return; + + /* use cfs_hash_bd_hnode_add/del, to avoid atomic & refcount ops + * in cfs_hash_bd_del/add_locked */ + hs->hs_hops->hop_hnode_del(hs, bd_old, hnode); + rc = hs->hs_hops->hop_hnode_add(hs, bd_new, hnode); + cfs_hash_bd_dep_record(hs, bd_new, rc); + + LASSERT(obkt->hsb_count > 0); + obkt->hsb_count--; + obkt->hsb_version++; + if (unlikely(obkt->hsb_version == 0)) + obkt->hsb_version++; + nbkt->hsb_count++; + nbkt->hsb_version++; + if (unlikely(nbkt->hsb_version == 0)) + nbkt->hsb_version++; +} + +enum { + /** always set, 
for sanity (avoid ZERO intent) */
+	CFS_HS_LOOKUP_MASK_FIND	= BIT(0),
+	/** return entry with a ref */
+	CFS_HS_LOOKUP_MASK_REF	= BIT(1),
+	/** add entry if not existing */
+	CFS_HS_LOOKUP_MASK_ADD	= BIT(2),
+	/** delete entry, ignore other masks */
+	CFS_HS_LOOKUP_MASK_DEL	= BIT(3),
+};
+
+enum cfs_hash_lookup_intent {
+	/** return item w/o refcount */
+	CFS_HS_LOOKUP_IT_PEEK	= CFS_HS_LOOKUP_MASK_FIND,
+	/** return item with refcount */
+	CFS_HS_LOOKUP_IT_FIND	= (CFS_HS_LOOKUP_MASK_FIND |
+				   CFS_HS_LOOKUP_MASK_REF),
+	/** return item w/o refcount if existed, otherwise add */
+	CFS_HS_LOOKUP_IT_ADD	= (CFS_HS_LOOKUP_MASK_FIND |
+				   CFS_HS_LOOKUP_MASK_ADD),
+	/** return item with refcount if existed, otherwise add */
+	CFS_HS_LOOKUP_IT_FINDADD = (CFS_HS_LOOKUP_IT_FIND |
+				    CFS_HS_LOOKUP_MASK_ADD),
+	/** delete if existed */
+	CFS_HS_LOOKUP_IT_FINDDEL = (CFS_HS_LOOKUP_MASK_FIND |
+				    CFS_HS_LOOKUP_MASK_DEL)
+};
+
+static struct hlist_node *
+cfs_hash_bd_lookup_intent(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+			  const void *key, struct hlist_node *hnode,
+			  enum cfs_hash_lookup_intent intent)
+
+{
+	struct hlist_head *hhead = cfs_hash_bd_hhead(hs, bd);
+	struct hlist_node *ehnode;
+	struct hlist_node *match;
+	int intent_add = (intent & CFS_HS_LOOKUP_MASK_ADD) != 0;
+
+	/* with this function, we can avoid a lot of useless refcount ops,
+	 * which are expensive atomic operations most of the time. */
+	match = intent_add ? NULL : hnode;
+	hlist_for_each(ehnode, hhead) {
+		if (!cfs_hash_keycmp(hs, key, ehnode))
+			continue;
+
+		if (match != NULL && match != ehnode) /* can't match */
+			continue;
+
+		/* match and ... */
+		if ((intent & CFS_HS_LOOKUP_MASK_DEL) != 0) {
+			cfs_hash_bd_del_locked(hs, bd, ehnode);
+			return ehnode;
+		}
+
+		/* caller wants refcount? */
+		if ((intent & CFS_HS_LOOKUP_MASK_REF) != 0)
+			cfs_hash_get(hs, ehnode);
+		return ehnode;
+	}
+	/* no matching item */
+	if (!intent_add)
+		return NULL;
+
+	LASSERT(hnode != NULL);
+	cfs_hash_bd_add_locked(hs, bd, hnode);
+	return hnode;
+}
+
+struct hlist_node *
+cfs_hash_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+			  const void *key)
+{
+	return cfs_hash_bd_lookup_intent(hs, bd, key, NULL,
+					 CFS_HS_LOOKUP_IT_FIND);
+}
+EXPORT_SYMBOL(cfs_hash_bd_lookup_locked);
+
+struct hlist_node *
+cfs_hash_bd_peek_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+			const void *key)
+{
+	return cfs_hash_bd_lookup_intent(hs, bd, key, NULL,
+					 CFS_HS_LOOKUP_IT_PEEK);
+}
+EXPORT_SYMBOL(cfs_hash_bd_peek_locked);
+
+static void
+cfs_hash_multi_bd_lock(struct cfs_hash *hs, struct cfs_hash_bd *bds,
+		       unsigned n, int excl)
+{
+	struct cfs_hash_bucket *prev = NULL;
+	int i;
+
+	/**
+	 * bds must be sorted in ascending order of bd->bd_bucket->hsb_index.
+	 * NB: it's possible that several bds point to the same bucket but
+	 * have different bd::bd_offset, so we must take care to avoid deadlock.
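+	 *
+	 * The ordering matters: if thread A locked bucket 0 then bucket 1
+	 * while thread B locked bucket 1 then bucket 0, each would wait on
+	 * the other forever.  Taking the locks in ascending hsb_index order
+	 * (and skipping duplicate buckets) makes such a cycle impossible.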
+ */ + cfs_hash_for_each_bd(bds, n, i) { + if (prev == bds[i].bd_bucket) + continue; + + LASSERT(prev == NULL || + prev->hsb_index < bds[i].bd_bucket->hsb_index); + cfs_hash_bd_lock(hs, &bds[i], excl); + prev = bds[i].bd_bucket; + } +} + +static void +cfs_hash_multi_bd_unlock(struct cfs_hash *hs, struct cfs_hash_bd *bds, + unsigned n, int excl) +{ + struct cfs_hash_bucket *prev = NULL; + int i; + + cfs_hash_for_each_bd(bds, n, i) { + if (prev != bds[i].bd_bucket) { + cfs_hash_bd_unlock(hs, &bds[i], excl); + prev = bds[i].bd_bucket; + } + } +} + +static struct hlist_node * +cfs_hash_multi_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + unsigned n, const void *key) +{ + struct hlist_node *ehnode; + unsigned i; + + cfs_hash_for_each_bd(bds, n, i) { + ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key, NULL, + CFS_HS_LOOKUP_IT_FIND); + if (ehnode != NULL) + return ehnode; + } + return NULL; +} + +static struct hlist_node * +cfs_hash_multi_bd_findadd_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + unsigned n, const void *key, + struct hlist_node *hnode, int noref) +{ + struct hlist_node *ehnode; + int intent; + unsigned i; + + LASSERT(hnode != NULL); + intent = CFS_HS_LOOKUP_IT_PEEK | (!noref * CFS_HS_LOOKUP_MASK_REF); + + cfs_hash_for_each_bd(bds, n, i) { + ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key, + NULL, intent); + if (ehnode != NULL) + return ehnode; + } + + if (i == 1) { /* only one bucket */ + cfs_hash_bd_add_locked(hs, &bds[0], hnode); + } else { + struct cfs_hash_bd mybd; + + cfs_hash_bd_get(hs, key, &mybd); + cfs_hash_bd_add_locked(hs, &mybd, hnode); + } + + return hnode; +} + +static struct hlist_node * +cfs_hash_multi_bd_finddel_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + unsigned n, const void *key, + struct hlist_node *hnode) +{ + struct hlist_node *ehnode; + unsigned i; + + cfs_hash_for_each_bd(bds, n, i) { + ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key, hnode, + CFS_HS_LOOKUP_IT_FINDDEL); + if (ehnode != NULL) + return ehnode; + } + return NULL; +} + +static void +cfs_hash_bd_order(struct cfs_hash_bd *bd1, struct cfs_hash_bd *bd2) +{ + int rc; + + if (bd2->bd_bucket == NULL) + return; + + if (bd1->bd_bucket == NULL) { + *bd1 = *bd2; + bd2->bd_bucket = NULL; + return; + } + + rc = cfs_hash_bd_compare(bd1, bd2); + if (rc == 0) { + bd2->bd_bucket = NULL; + + } else if (rc > 0) { + swap(*bd1, *bd2); /* swab bd1 and bd2 */ + } +} + +void +cfs_hash_dual_bd_get(struct cfs_hash *hs, const void *key, + struct cfs_hash_bd *bds) +{ + /* NB: caller should hold hs_lock.rw if REHASH is set */ + cfs_hash_bd_from_key(hs, hs->hs_buckets, + hs->hs_cur_bits, key, &bds[0]); + if (likely(hs->hs_rehash_buckets == NULL)) { + /* no rehash or not rehashing */ + bds[1].bd_bucket = NULL; + return; + } + + LASSERT(hs->hs_rehash_bits != 0); + cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets, + hs->hs_rehash_bits, key, &bds[1]); + + cfs_hash_bd_order(&bds[0], &bds[1]); +} + +void +cfs_hash_dual_bd_lock(struct cfs_hash *hs, struct cfs_hash_bd *bds, int excl) +{ + cfs_hash_multi_bd_lock(hs, bds, 2, excl); +} + +void +cfs_hash_dual_bd_unlock(struct cfs_hash *hs, struct cfs_hash_bd *bds, int excl) +{ + cfs_hash_multi_bd_unlock(hs, bds, 2, excl); +} + +struct hlist_node * +cfs_hash_dual_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + const void *key) +{ + return cfs_hash_multi_bd_lookup_locked(hs, bds, 2, key); +} + +struct hlist_node * +cfs_hash_dual_bd_findadd_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + const void *key, struct 
hlist_node *hnode,
+				int noref)
+{
+	return cfs_hash_multi_bd_findadd_locked(hs, bds, 2, key,
+						hnode, noref);
+}
+
+struct hlist_node *
+cfs_hash_dual_bd_finddel_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds,
+				const void *key, struct hlist_node *hnode)
+{
+	return cfs_hash_multi_bd_finddel_locked(hs, bds, 2, key, hnode);
+}
+
+static void
+cfs_hash_buckets_free(struct cfs_hash_bucket **buckets,
+		      int bkt_size, int prev_size, int size)
+{
+	int i;
+
+	for (i = prev_size; i < size; i++) {
+		if (buckets[i] != NULL)
+			LIBCFS_FREE(buckets[i], bkt_size);
+	}
+
+	LIBCFS_FREE(buckets, sizeof(buckets[0]) * size);
+}
+
+/*
+ * Create or grow bucket memory. Return old_buckets if no allocation was
+ * needed, the newly allocated buckets if allocation was needed and
+ * successful, and NULL on error.
+ */
+static struct cfs_hash_bucket **
+cfs_hash_buckets_realloc(struct cfs_hash *hs, struct cfs_hash_bucket **old_bkts,
+			 unsigned int old_size, unsigned int new_size)
+{
+	struct cfs_hash_bucket **new_bkts;
+	int i;
+
+	LASSERT(old_size == 0 || old_bkts != NULL);
+
+	if (old_bkts != NULL && old_size == new_size)
+		return old_bkts;
+
+	LIBCFS_ALLOC(new_bkts, sizeof(new_bkts[0]) * new_size);
+	if (new_bkts == NULL)
+		return NULL;
+
+	if (old_bkts != NULL) {
+		memcpy(new_bkts, old_bkts,
+		       min(old_size, new_size) * sizeof(*old_bkts));
+	}
+
+	for (i = old_size; i < new_size; i++) {
+		struct hlist_head *hhead;
+		struct cfs_hash_bd bd;
+
+		LIBCFS_ALLOC(new_bkts[i], cfs_hash_bkt_size(hs));
+		if (new_bkts[i] == NULL) {
+			cfs_hash_buckets_free(new_bkts, cfs_hash_bkt_size(hs),
+					      old_size, new_size);
+			return NULL;
+		}
+
+		new_bkts[i]->hsb_index = i;
+		new_bkts[i]->hsb_version = 1;	/* shouldn't be zero */
+		new_bkts[i]->hsb_depmax = -1;	/* unknown */
+		bd.bd_bucket = new_bkts[i];
+		cfs_hash_bd_for_each_hlist(hs, &bd, hhead)
+			INIT_HLIST_HEAD(hhead);
+
+		if (cfs_hash_with_no_lock(hs) ||
+		    cfs_hash_with_no_bktlock(hs))
+			continue;
+
+		if (cfs_hash_with_rw_bktlock(hs))
+			rwlock_init(&new_bkts[i]->hsb_lock.rw);
+		else if (cfs_hash_with_spin_bktlock(hs))
+			spin_lock_init(&new_bkts[i]->hsb_lock.spin);
+		else if (cfs_hash_with_rw_sem_bktlock(hs))
+			init_rwsem(&new_bkts[i]->hsb_lock.rw_sem);
+		else
+			LBUG(); /* invalid use-case */
+	}
+	return new_bkts;
+}
+
+/**
+ * Initialize new libcfs hash, where:
+ * @name     - Descriptive hash name
+ * @cur_bits - Initial hash table size, in bits
+ * @max_bits - Maximum allowed hash table resize, in bits
+ * @ops      - Registered hash table operations
+ * @flags    - CFS_HASH_REHASH enable dynamic hash resizing
+ *           - CFS_HASH_SORT enable chained hash sort
+ */
+static void cfs_hash_rehash_worker(struct work_struct *work);
+
+#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1
+static void cfs_hash_dep_print(struct work_struct *work)
+{
+	struct cfs_hash *hs = container_of(work, struct cfs_hash, hs_dep_work);
+	int dep;
+	int bkt;
+	int off;
+	int bits;
+
+	spin_lock(&hs->hs_dep_lock);
+	dep = hs->hs_dep_max;
+	bkt = hs->hs_dep_bkt;
+	off = hs->hs_dep_off;
+	bits = hs->hs_dep_bits;
+	spin_unlock(&hs->hs_dep_lock);
+
+	LCONSOLE_WARN("#### HASH %s (bits: %d): max depth %d at bucket %d/%d\n",
+		      hs->hs_name, bits, dep, bkt, off);
+	spin_lock(&hs->hs_dep_lock);
+	hs->hs_dep_bits = 0; /* mark as workitem done */
+	spin_unlock(&hs->hs_dep_lock);
+}
+
+static void cfs_hash_depth_wi_init(struct cfs_hash *hs)
+{
+	spin_lock_init(&hs->hs_dep_lock);
+	INIT_WORK(&hs->hs_dep_work, cfs_hash_dep_print);
+}
+
+static void cfs_hash_depth_wi_cancel(struct cfs_hash *hs)
+{
+
cancel_work_sync(&hs->hs_dep_work); +} + +#else /* CFS_HASH_DEBUG_LEVEL < CFS_HASH_DEBUG_1 */ + +static inline void cfs_hash_depth_wi_init(struct cfs_hash *hs) {} +static inline void cfs_hash_depth_wi_cancel(struct cfs_hash *hs) {} + +#endif /* CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 */ + +struct cfs_hash * +cfs_hash_create(char *name, unsigned cur_bits, unsigned max_bits, + unsigned bkt_bits, unsigned extra_bytes, + unsigned min_theta, unsigned max_theta, + struct cfs_hash_ops *ops, unsigned flags) +{ + struct cfs_hash *hs; + int len; + + ENTRY; + + BUILD_BUG_ON(CFS_HASH_THETA_BITS >= 15); + + LASSERT(name != NULL); + LASSERT(ops != NULL); + LASSERT(ops->hs_key); + LASSERT(ops->hs_hash); + LASSERT(ops->hs_object); + LASSERT(ops->hs_keycmp); + LASSERT(ops->hs_get != NULL); + LASSERT(ops->hs_put != NULL || ops->hs_put_locked != NULL); + + if ((flags & CFS_HASH_REHASH) != 0) + flags |= CFS_HASH_COUNTER; /* must have counter */ + + LASSERT(cur_bits > 0); + LASSERT(cur_bits >= bkt_bits); + LASSERT(max_bits >= cur_bits && max_bits < 31); + LASSERT(ergo((flags & CFS_HASH_REHASH) == 0, cur_bits == max_bits)); + LASSERT(ergo((flags & CFS_HASH_REHASH) != 0, + (flags & CFS_HASH_NO_LOCK) == 0)); + LASSERT(ergo((flags & CFS_HASH_REHASH_KEY) != 0, + ops->hs_keycpy != NULL)); + + len = (flags & CFS_HASH_BIGNAME) == 0 ? + CFS_HASH_NAME_LEN : CFS_HASH_BIGNAME_LEN; + LIBCFS_ALLOC(hs, offsetof(struct cfs_hash, hs_name[len])); + if (hs == NULL) + RETURN(NULL); + + strlcpy(hs->hs_name, name, len); + hs->hs_flags = flags; + + atomic_set(&hs->hs_refcount, 1); + atomic_set(&hs->hs_count, 0); + + cfs_hash_lock_setup(hs); + cfs_hash_hlist_setup(hs); + + hs->hs_cur_bits = (__u8)cur_bits; + hs->hs_min_bits = (__u8)cur_bits; + hs->hs_max_bits = (__u8)max_bits; + hs->hs_bkt_bits = (__u8)bkt_bits; + + hs->hs_ops = ops; + hs->hs_extra_bytes = extra_bytes; + hs->hs_rehash_bits = 0; + INIT_WORK(&hs->hs_rehash_work, cfs_hash_rehash_worker); + cfs_hash_depth_wi_init(hs); + + if (cfs_hash_with_rehash(hs)) + __cfs_hash_set_theta(hs, min_theta, max_theta); + + hs->hs_buckets = cfs_hash_buckets_realloc(hs, NULL, 0, + CFS_HASH_NBKT(hs)); + if (hs->hs_buckets != NULL) + return hs; + + LIBCFS_FREE(hs, offsetof(struct cfs_hash, hs_name[len])); + RETURN(NULL); +} +EXPORT_SYMBOL(cfs_hash_create); + +/** + * Cleanup libcfs hash @hs. + */ +static void +cfs_hash_destroy(struct cfs_hash *hs) +{ + struct hlist_node *hnode; + struct hlist_node *pos; + struct cfs_hash_bd bd; + int i; + ENTRY; + + LASSERT(hs != NULL); + LASSERT(!cfs_hash_is_exiting(hs) && + !cfs_hash_is_iterating(hs)); + + /** + * prohibit further rehashes, don't need any lock because + * I'm the only (last) one can change it. 
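+	 *
+	 * cfs_hash_destroy() is only reached from cfs_hash_putref() once
+	 * hs_refcount drops to zero; sketch of the lifecycle:
+	 *
+	 *	hs = cfs_hash_create(...);	// refcount = 1
+	 *	cfs_hash_getref(hs);		// extra user: refcount = 2
+	 *	cfs_hash_putref(hs);		// extra user done
+	 *	cfs_hash_putref(hs);		// last ref, destroy runs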
+ */ + hs->hs_exiting = 1; + if (cfs_hash_with_rehash(hs)) + cfs_hash_rehash_cancel(hs); + + cfs_hash_depth_wi_cancel(hs); + /* rehash should be done/canceled */ + LASSERT(hs->hs_buckets != NULL && + hs->hs_rehash_buckets == NULL); + + cfs_hash_for_each_bucket(hs, &bd, i) { + struct hlist_head *hhead; + + LASSERT(bd.bd_bucket != NULL); + /* no need to take this lock, just for consistent code */ + cfs_hash_bd_lock(hs, &bd, 1); + + cfs_hash_bd_for_each_hlist(hs, &bd, hhead) { + hlist_for_each_safe(hnode, pos, hhead) { + LASSERTF(!cfs_hash_with_assert_empty(hs), + "hash %s bucket %u(%u) is not " + " empty: %u items left\n", + hs->hs_name, bd.bd_bucket->hsb_index, + bd.bd_offset, bd.bd_bucket->hsb_count); + /* can't assert key valicate, because we + * can interrupt rehash */ + cfs_hash_bd_del_locked(hs, &bd, hnode); + cfs_hash_exit(hs, hnode); + } + } + LASSERT(bd.bd_bucket->hsb_count == 0); + cfs_hash_bd_unlock(hs, &bd, 1); + cond_resched(); + } + + LASSERT(atomic_read(&hs->hs_count) == 0); + + cfs_hash_buckets_free(hs->hs_buckets, cfs_hash_bkt_size(hs), + 0, CFS_HASH_NBKT(hs)); + i = cfs_hash_with_bigname(hs) ? + CFS_HASH_BIGNAME_LEN : CFS_HASH_NAME_LEN; + LIBCFS_FREE(hs, offsetof(struct cfs_hash, hs_name[i])); + + EXIT; +} + +struct cfs_hash *cfs_hash_getref(struct cfs_hash *hs) +{ + if (atomic_inc_not_zero(&hs->hs_refcount)) + return hs; + return NULL; +} +EXPORT_SYMBOL(cfs_hash_getref); + +void cfs_hash_putref(struct cfs_hash *hs) +{ + if (atomic_dec_and_test(&hs->hs_refcount)) + cfs_hash_destroy(hs); +} +EXPORT_SYMBOL(cfs_hash_putref); + +static inline int +cfs_hash_rehash_bits(struct cfs_hash *hs) +{ + if (cfs_hash_with_no_lock(hs) || + !cfs_hash_with_rehash(hs)) + return -EOPNOTSUPP; + + if (unlikely(cfs_hash_is_exiting(hs))) + return -ESRCH; + + if (unlikely(cfs_hash_is_rehashing(hs))) + return -EALREADY; + + if (unlikely(cfs_hash_is_iterating(hs))) + return -EAGAIN; + + /* XXX: need to handle case with max_theta != 2.0 + * and the case with min_theta != 0.5 */ + if ((hs->hs_cur_bits < hs->hs_max_bits) && + (__cfs_hash_theta(hs) > hs->hs_max_theta)) + return hs->hs_cur_bits + 1; + + if (!cfs_hash_with_shrink(hs)) + return 0; + + if ((hs->hs_cur_bits > hs->hs_min_bits) && + (__cfs_hash_theta(hs) < hs->hs_min_theta)) + return hs->hs_cur_bits - 1; + + return 0; +} + +/** + * don't allow inline rehash if: + * - user wants non-blocking change (add/del) on hash table + * - too many elements + */ +static inline int +cfs_hash_rehash_inline(struct cfs_hash *hs) +{ + return !cfs_hash_with_nblk_change(hs) && + atomic_read(&hs->hs_count) < CFS_HASH_LOOP_HOG; +} + +/** + * Add item @hnode to libcfs hash @hs using @key. The registered + * ops->hs_get function will be called when the item is added. 
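+ *
+ * A minimal ops table for illustration (an editorial sketch; the
+ * example_* callbacks are hypothetical and assume the hlist_node is
+ * embedded in the user's object):
+ *
+ *	static struct cfs_hash_ops example_hash_ops = {
+ *		.hs_hash	= example_hash,		// key -> bucket index
+ *		.hs_key		= example_key,		// hnode -> key
+ *		.hs_keycmp	= example_keycmp,	// compare key with hnode
+ *		.hs_object	= example_object,	// hnode -> object
+ *		.hs_get		= example_get,		// take a reference
+ *		.hs_put_locked	= example_put_locked,	// drop ref, bkt locked
+ *	};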
+ */ +void +cfs_hash_add(struct cfs_hash *hs, const void *key, struct hlist_node *hnode) +{ + struct cfs_hash_bd bd; + int bits; + + LASSERT(hlist_unhashed(hnode)); + + cfs_hash_lock(hs, 0); + cfs_hash_bd_get_and_lock(hs, key, &bd, 1); + + cfs_hash_key_validate(hs, key, hnode); + cfs_hash_bd_add_locked(hs, &bd, hnode); + + cfs_hash_bd_unlock(hs, &bd, 1); + + bits = cfs_hash_rehash_bits(hs); + cfs_hash_unlock(hs, 0); + if (bits > 0) + cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs)); +} +EXPORT_SYMBOL(cfs_hash_add); + +static struct hlist_node * +cfs_hash_find_or_add(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode, int noref) +{ + struct hlist_node *ehnode; + struct cfs_hash_bd bds[2]; + int bits = 0; + + LASSERTF(hlist_unhashed(hnode), "hnode = %p\n", hnode); + + cfs_hash_lock(hs, 0); + cfs_hash_dual_bd_get_and_lock(hs, key, bds, 1); + + cfs_hash_key_validate(hs, key, hnode); + ehnode = cfs_hash_dual_bd_findadd_locked(hs, bds, key, + hnode, noref); + cfs_hash_dual_bd_unlock(hs, bds, 1); + + if (ehnode == hnode) /* new item added */ + bits = cfs_hash_rehash_bits(hs); + cfs_hash_unlock(hs, 0); + if (bits > 0) + cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs)); + + return ehnode; +} + +/** + * Add item @hnode to libcfs hash @hs using @key. The registered + * ops->hs_get function will be called if the item was added. + * Returns 0 on success or -EALREADY on key collisions. + */ +int +cfs_hash_add_unique(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode) +{ + return cfs_hash_find_or_add(hs, key, hnode, 1) != hnode ? + -EALREADY : 0; +} +EXPORT_SYMBOL(cfs_hash_add_unique); + +/** + * Add item @hnode to libcfs hash @hs using @key. If this @key + * already exists in the hash then ops->hs_get will be called on the + * conflicting entry and that entry will be returned to the caller. + * Otherwise ops->hs_get is called on the item which was added. + */ +void * +cfs_hash_findadd_unique(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode) +{ + hnode = cfs_hash_find_or_add(hs, key, hnode, 0); + + return cfs_hash_object(hs, hnode); +} +EXPORT_SYMBOL(cfs_hash_findadd_unique); + +/** + * Delete item @hnode from the libcfs hash @hs using @key. The @key + * is required to ensure the correct hash bucket is locked since there + * is no direct linkage from the item to the bucket. The object + * removed from the hash will be returned and obs->hs_put is called + * on the removed object. + */ +void * +cfs_hash_del(struct cfs_hash *hs, const void *key, struct hlist_node *hnode) +{ + void *obj = NULL; + int bits = 0; + struct cfs_hash_bd bds[2]; + + cfs_hash_lock(hs, 0); + cfs_hash_dual_bd_get_and_lock(hs, key, bds, 1); + + /* NB: do nothing if @hnode is not in hash table */ + if (hnode == NULL || !hlist_unhashed(hnode)) { + if (bds[1].bd_bucket == NULL && hnode != NULL) { + cfs_hash_bd_del_locked(hs, &bds[0], hnode); + } else { + hnode = cfs_hash_dual_bd_finddel_locked(hs, bds, + key, hnode); + } + } + + if (hnode != NULL) { + obj = cfs_hash_object(hs, hnode); + bits = cfs_hash_rehash_bits(hs); + } + + cfs_hash_dual_bd_unlock(hs, bds, 1); + cfs_hash_unlock(hs, 0); + if (bits > 0) + cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs)); + + return obj; +} +EXPORT_SYMBOL(cfs_hash_del); + +/** + * Delete item given @key in libcfs hash @hs. The first @key found in + * the hash will be removed, if the key exists multiple times in the hash + * @hs this function must be called once per key. The removed object + * will be returned and ops->hs_put is called on the removed object. 
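+ *
+ * A sketch of draining every entry stored under one key
+ * (example_obj_release() is a hypothetical cleanup helper):
+ *
+ *	while ((obj = cfs_hash_del_key(hs, key)) != NULL)
+ *		example_obj_release(obj);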
+ */
+void *
+cfs_hash_del_key(struct cfs_hash *hs, const void *key)
+{
+	return cfs_hash_del(hs, key, NULL);
+}
+EXPORT_SYMBOL(cfs_hash_del_key);
+
+/**
+ * Lookup an item using @key in the libcfs hash @hs and return it.
+ * If the @key is found in the hash, ops->hs_get() is called and the
+ * matching object is returned. It is the caller's responsibility
+ * to call the counterpart ops->hs_put using the cfs_hash_put() macro
+ * when finished with the object. If the @key was not found
+ * in the hash @hs, NULL is returned.
+ */
+void *
+cfs_hash_lookup(struct cfs_hash *hs, const void *key)
+{
+	void *obj = NULL;
+	struct hlist_node *hnode;
+	struct cfs_hash_bd bds[2];
+
+	cfs_hash_lock(hs, 0);
+	cfs_hash_dual_bd_get_and_lock(hs, key, bds, 0);
+
+	hnode = cfs_hash_dual_bd_lookup_locked(hs, bds, key);
+	if (hnode != NULL)
+		obj = cfs_hash_object(hs, hnode);
+
+	cfs_hash_dual_bd_unlock(hs, bds, 0);
+	cfs_hash_unlock(hs, 0);
+
+	return obj;
+}
+EXPORT_SYMBOL(cfs_hash_lookup);
+
+static void
+cfs_hash_for_each_enter(struct cfs_hash *hs)
+{
+	LASSERT(!cfs_hash_is_exiting(hs));
+
+	if (!cfs_hash_with_rehash(hs))
+		return;
+	/*
+	 * NB: there is a race on cfs_hash::hs_iterating, but it doesn't
+	 * matter because it's just an unreliable signal to the
+	 * rehash-thread; the rehash-thread will try to finish the rehash
+	 * ASAP when it sees this.
+	 */
+	hs->hs_iterating = 1;
+
+	cfs_hash_lock(hs, 1);
+	hs->hs_iterators++;
+	cfs_hash_unlock(hs, 1);
+
+	/* NB: iteration is mostly called by service threads; rather than
+	 * blocking a service thread we prefer to cancel a pending
+	 * rehash request and relaunch the rehash after the iteration
+	 */
+	if (cfs_hash_is_rehashing(hs))
+		cfs_hash_rehash_cancel(hs);
+}
+
+static void
+cfs_hash_for_each_exit(struct cfs_hash *hs)
+{
+	int remained;
+	int bits;
+
+	if (!cfs_hash_with_rehash(hs))
+		return;
+	cfs_hash_lock(hs, 1);
+	remained = --hs->hs_iterators;
+	bits = cfs_hash_rehash_bits(hs);
+	cfs_hash_unlock(hs, 1);
+	/* NB: racy on cfs_hash::hs_iterating, see above */
+	if (remained == 0)
+		hs->hs_iterating = 0;
+	if (bits > 0) {
+		cfs_hash_rehash(hs, atomic_read(&hs->hs_count) <
+				    CFS_HASH_LOOP_HOG);
+	}
+}
+
+/**
+ * For each item in the libcfs hash @hs call the passed callback @func
+ * and pass to it as an argument each hash item and the private @data.
+ *
+ * a) the iteration itself may sleep (it yields between buckets)!
+ * b) during the callback:
+ *    . the bucket lock is held so the callback must never sleep.
+ *    . if @remove_safe is true, the user can remove the current item
+ *      with cfs_hash_bd_del_locked()
+ */
+static __u64
+cfs_hash_for_each_tight(struct cfs_hash *hs, cfs_hash_for_each_cb_t func,
+			void *data, int remove_safe)
+{
+	struct hlist_node *hnode;
+	struct hlist_node *pos;
+	struct cfs_hash_bd bd;
+	__u64 count = 0;
+	int excl = !!remove_safe;
+	int loop = 0;
+	int i;
+	ENTRY;
+
+	cfs_hash_for_each_enter(hs);
+
+	cfs_hash_lock(hs, 0);
+	LASSERT(!cfs_hash_is_rehashing(hs));
+
+	cfs_hash_for_each_bucket(hs, &bd, i) {
+		struct hlist_head *hhead;
+
+		cfs_hash_bd_lock(hs, &bd, excl);
+		if (func == NULL) { /* only glimpse size */
+			count += bd.bd_bucket->hsb_count;
+			cfs_hash_bd_unlock(hs, &bd, excl);
+			continue;
+		}
+
+		cfs_hash_bd_for_each_hlist(hs, &bd, hhead) {
+			hlist_for_each_safe(hnode, pos, hhead) {
+				cfs_hash_bucket_validate(hs, &bd, hnode);
+				count++;
+				loop++;
+				if (func(hs, &bd, hnode, data)) {
+					cfs_hash_bd_unlock(hs, &bd, excl);
+					goto out;
+				}
+			}
+		}
+		cfs_hash_bd_unlock(hs, &bd, excl);
+		if (loop < CFS_HASH_LOOP_HOG)
+			continue;
+		loop = 0;
+		cfs_hash_unlock(hs, 0);
+		cond_resched();
+		cfs_hash_lock(hs, 0);
+	}
+ out:
+	cfs_hash_unlock(hs, 0);
+
+	cfs_hash_for_each_exit(hs);
+	RETURN(count);
+}
+
+struct cfs_hash_cond_arg {
+	cfs_hash_cond_opt_cb_t	func;
+	void			*arg;
+};
+
+static int
+cfs_hash_cond_del_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+			 struct hlist_node *hnode, void *data)
+{
+	struct cfs_hash_cond_arg *cond = data;
+
+	if (cond->func(cfs_hash_object(hs, hnode), cond->arg))
+		cfs_hash_bd_del_locked(hs, bd, hnode);
+	return 0;
+}
+
+/**
+ * Delete items from the libcfs hash @hs for which @func returns true.
+ * The write lock is held while walking each bucket, so no object can
+ * gain a new reference while it is being removed.
+ */
+void
+cfs_hash_cond_del(struct cfs_hash *hs, cfs_hash_cond_opt_cb_t func, void *data)
+{
+	struct cfs_hash_cond_arg arg = {
+		.func	= func,
+		.arg	= data,
+	};
+
+	cfs_hash_for_each_tight(hs, cfs_hash_cond_del_locked, &arg, 1);
+}
+EXPORT_SYMBOL(cfs_hash_cond_del);
+
+void
+cfs_hash_for_each(struct cfs_hash *hs,
+		  cfs_hash_for_each_cb_t func, void *data)
+{
+	cfs_hash_for_each_tight(hs, func, data, 0);
+}
+EXPORT_SYMBOL(cfs_hash_for_each);
+
+void
+cfs_hash_for_each_safe(struct cfs_hash *hs,
+		       cfs_hash_for_each_cb_t func, void *data)
+{
+	cfs_hash_for_each_tight(hs, func, data, 1);
+}
+EXPORT_SYMBOL(cfs_hash_for_each_safe);
+
+static int
+cfs_hash_peek(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+	      struct hlist_node *hnode, void *data)
+{
+	*(int *)data = 0;
+	return 1; /* return 1 to break the loop */
+}
+
+int
+cfs_hash_is_empty(struct cfs_hash *hs)
+{
+	int empty = 1;
+
+	cfs_hash_for_each_tight(hs, cfs_hash_peek, &empty, 0);
+	return empty;
+}
+EXPORT_SYMBOL(cfs_hash_is_empty);
+
+__u64
+cfs_hash_size_get(struct cfs_hash *hs)
+{
+	return cfs_hash_with_counter(hs) ?
+	       atomic_read(&hs->hs_count) :
+	       cfs_hash_for_each_tight(hs, NULL, NULL, 0);
+}
+EXPORT_SYMBOL(cfs_hash_size_get);
+
+/*
+ * cfs_hash_for_each_relax:
+ * Iterate the hash table and call @func on each item without
+ * holding any lock. This function cannot guarantee to finish the
+ * iteration if these features are enabled:
+ *
+ * a. if rehash_key is enabled, an item can be moved from
+ *    one bucket to another bucket
+ * b. the user can remove a non-zero-ref item from the hash-table,
+ *    so the item can disappear from the hash-table; even worse,
+ *    it's possible that the user changed the key and inserted it
+ *    into another hash bucket.
+ * there's no way for us to finish iteration correctly on previous + * two cases, so iteration has to be stopped on change. + */ +static int +cfs_hash_for_each_relax(struct cfs_hash *hs, cfs_hash_for_each_cb_t func, + void *data, int start) +{ + struct hlist_node *hnode; + struct hlist_node *next = NULL; + struct cfs_hash_bd bd; + __u32 version; + int count = 0; + int stop_on_change; + int has_put_locked; + int rc = 0; + int i, end = -1; + ENTRY; + + stop_on_change = cfs_hash_with_rehash_key(hs) || + !cfs_hash_with_no_itemref(hs); + has_put_locked = hs->hs_ops->hs_put_locked != NULL; + cfs_hash_lock(hs, 0); +again: + LASSERT(!cfs_hash_is_rehashing(hs)); + + cfs_hash_for_each_bucket(hs, &bd, i) { + struct hlist_head *hhead; + + if (i < start) + continue; + else if (end > 0 && i >= end) + break; + + cfs_hash_bd_lock(hs, &bd, 0); + version = cfs_hash_bd_version_get(&bd); + + cfs_hash_bd_for_each_hlist(hs, &bd, hhead) { + hnode = hhead->first; + if (hnode == NULL) + continue; + cfs_hash_get(hs, hnode); + for (; hnode != NULL; hnode = next) { + cfs_hash_bucket_validate(hs, &bd, hnode); + next = hnode->next; + if (next != NULL) + cfs_hash_get(hs, next); + cfs_hash_bd_unlock(hs, &bd, 0); + cfs_hash_unlock(hs, 0); + + rc = func(hs, &bd, hnode, data); + if (stop_on_change || !has_put_locked) + cfs_hash_put(hs, hnode); + + cond_resched(); + count++; + + cfs_hash_lock(hs, 0); + cfs_hash_bd_lock(hs, &bd, 0); + if (stop_on_change) { + if (version != + cfs_hash_bd_version_get(&bd)) + rc = -EINTR; + } else if (has_put_locked) { + cfs_hash_put_locked(hs, hnode); + } + if (rc) /* callback wants to break iteration */ + break; + } + if (next != NULL) { + if (has_put_locked) { + cfs_hash_put_locked(hs, next); + next = NULL; + } + break; + } else if (rc != 0) { + break; + } + } + cfs_hash_bd_unlock(hs, &bd, 0); + if (next != NULL && !has_put_locked) { + cfs_hash_put(hs, next); + next = NULL; + } + if (rc) /* callback wants to break iteration */ + break; + } + + if (start > 0 && rc == 0) { + end = start; + start = 0; + goto again; + } + + cfs_hash_unlock(hs, 0); + return count; +} + +int +cfs_hash_for_each_nolock(struct cfs_hash *hs, + cfs_hash_for_each_cb_t func, void *data, int start) +{ + ENTRY; + + if (cfs_hash_with_no_lock(hs) || + cfs_hash_with_rehash_key(hs) || + !cfs_hash_with_no_itemref(hs)) + RETURN(-EOPNOTSUPP); + + if (hs->hs_ops->hs_get == NULL || + (hs->hs_ops->hs_put == NULL && + hs->hs_ops->hs_put_locked == NULL)) + RETURN(-EOPNOTSUPP); + + cfs_hash_for_each_enter(hs); + cfs_hash_for_each_relax(hs, func, data, start); + cfs_hash_for_each_exit(hs); + + RETURN(0); +} +EXPORT_SYMBOL(cfs_hash_for_each_nolock); + +/** + * For each hash bucket in the libcfs hash @hs call the passed callback + * @func until all the hash buckets are empty. The passed callback @func + * or the previously registered callback hs->hs_put must remove the item + * from the hash. You may either use the cfs_hash_del() or hlist_del() + * functions. No rwlocks will be held during the callback @func it is + * safe to sleep if needed. This function will not terminate until the + * hash is empty. Note it is still possible to concurrently add new + * items in to the hash. It is the callers responsibility to ensure + * the required locking is in place to prevent concurrent insertions. 
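+ *
+ * A teardown sketch (hypothetical callback and object type; the
+ * callback must remove the item it is handed, as required above):
+ *
+ *	static int drop_one(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ *			    struct hlist_node *hnode, void *data)
+ *	{
+ *		struct obj *o = cfs_hash_object(hs, hnode);
+ *
+ *		cfs_hash_del(hs, &o->o_id, hnode);
+ *		return 0;
+ *	}
+ *
+ *	cfs_hash_for_each_empty(hs, drop_one, NULL);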
+ */ +int +cfs_hash_for_each_empty(struct cfs_hash *hs, + cfs_hash_for_each_cb_t func, void *data) +{ + unsigned i = 0; + ENTRY; + + if (cfs_hash_with_no_lock(hs)) + return -EOPNOTSUPP; + + if (hs->hs_ops->hs_get == NULL || + (hs->hs_ops->hs_put == NULL && + hs->hs_ops->hs_put_locked == NULL)) + return -EOPNOTSUPP; + + cfs_hash_for_each_enter(hs); + while (cfs_hash_for_each_relax(hs, func, data, 0)) { + CDEBUG(D_INFO, "Try to empty hash: %s, loop: %u\n", + hs->hs_name, i++); + } + cfs_hash_for_each_exit(hs); + RETURN(0); +} +EXPORT_SYMBOL(cfs_hash_for_each_empty); + +void +cfs_hash_hlist_for_each(struct cfs_hash *hs, unsigned hindex, + cfs_hash_for_each_cb_t func, void *data) +{ + struct hlist_head *hhead; + struct hlist_node *hnode; + struct cfs_hash_bd bd; + + cfs_hash_for_each_enter(hs); + cfs_hash_lock(hs, 0); + if (hindex >= CFS_HASH_NHLIST(hs)) + goto out; + + cfs_hash_bd_index_set(hs, hindex, &bd); + + cfs_hash_bd_lock(hs, &bd, 0); + hhead = cfs_hash_bd_hhead(hs, &bd); + hlist_for_each(hnode, hhead) { + if (func(hs, &bd, hnode, data)) + break; + } + cfs_hash_bd_unlock(hs, &bd, 0); +out: + cfs_hash_unlock(hs, 0); + cfs_hash_for_each_exit(hs); +} + +EXPORT_SYMBOL(cfs_hash_hlist_for_each); + +/* + * For each item in the libcfs hash @hs which matches the @key call + * the passed callback @func and pass to it as an argument each hash + * item and the private @data. During the callback the bucket lock + * is held so the callback must never sleep. + */ +void +cfs_hash_for_each_key(struct cfs_hash *hs, const void *key, + cfs_hash_for_each_cb_t func, void *data) +{ + struct hlist_node *hnode; + struct cfs_hash_bd bds[2]; + unsigned i; + + cfs_hash_lock(hs, 0); + + cfs_hash_dual_bd_get_and_lock(hs, key, bds, 0); + + cfs_hash_for_each_bd(bds, 2, i) { + struct hlist_head *hlist = cfs_hash_bd_hhead(hs, &bds[i]); + + hlist_for_each(hnode, hlist) { + cfs_hash_bucket_validate(hs, &bds[i], hnode); + + if (cfs_hash_keycmp(hs, key, hnode)) { + if (func(hs, &bds[i], hnode, data)) + break; + } + } + } + + cfs_hash_dual_bd_unlock(hs, bds, 0); + cfs_hash_unlock(hs, 0); +} +EXPORT_SYMBOL(cfs_hash_for_each_key); + +/** + * Rehash the libcfs hash @hs to the given @bits. This can be used + * to grow the hash size when excessive chaining is detected, or to + * shrink the hash when it is larger than needed. When the CFS_HASH_REHASH + * flag is set in @hs the libcfs hash may be dynamically rehashed + * during addition or removal if the hash's theta value exceeds + * either the hs->hs_min_theta or hs->max_theta values. By default + * these values are tuned to keep the chained hash depth small, and + * this approach assumes a reasonably uniform hashing function. The + * theta thresholds for @hs are tunable via cfs_hash_set_theta(). 
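+ *
+ * A worked example, assuming the 0.5/2.0 defaults implied by the XXX
+ * note in cfs_hash_rehash_bits() above: with hs_cur_bits = 10 there
+ * are 1024 hash heads, so once more than 2048 items are hashed theta
+ * exceeds 2.0 and the next add requests a grow to 11 bits.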
+ */ +void +cfs_hash_rehash_cancel(struct cfs_hash *hs) +{ + LASSERT(cfs_hash_with_rehash(hs)); + cancel_work_sync(&hs->hs_rehash_work); +} + +void +cfs_hash_rehash(struct cfs_hash *hs, int do_rehash) +{ + int rc; + + LASSERT(cfs_hash_with_rehash(hs) && !cfs_hash_with_no_lock(hs)); + + cfs_hash_lock(hs, 1); + + rc = cfs_hash_rehash_bits(hs); + if (rc <= 0) { + cfs_hash_unlock(hs, 1); + return; + } + + hs->hs_rehash_bits = rc; + if (!do_rehash) { + /* launch and return */ + queue_work(cfs_rehash_wq, &hs->hs_rehash_work); + cfs_hash_unlock(hs, 1); + return; + } + + /* rehash right now */ + cfs_hash_unlock(hs, 1); + + cfs_hash_rehash_worker(&hs->hs_rehash_work); +} + +static int +cfs_hash_rehash_bd(struct cfs_hash *hs, struct cfs_hash_bd *old) +{ + struct cfs_hash_bd new; + struct hlist_head *hhead; + struct hlist_node *hnode; + struct hlist_node *pos; + void *key; + int c = 0; + + /* hold cfs_hash_lock(hs, 1), so don't need any bucket lock */ + cfs_hash_bd_for_each_hlist(hs, old, hhead) { + hlist_for_each_safe(hnode, pos, hhead) { + key = cfs_hash_key(hs, hnode); + LASSERT(key != NULL); + /* Validate hnode is in the correct bucket. */ + cfs_hash_bucket_validate(hs, old, hnode); + /* + * Delete from old hash bucket; move to new bucket. + * ops->hs_key must be defined. + */ + cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets, + hs->hs_rehash_bits, key, &new); + cfs_hash_bd_move_locked(hs, old, &new, hnode); + c++; + } + } + return c; +} + +static void +cfs_hash_rehash_worker(struct work_struct *work) +{ + struct cfs_hash *hs = container_of(work, struct cfs_hash, + hs_rehash_work); + struct cfs_hash_bucket **bkts; + struct cfs_hash_bd bd; + unsigned int old_size; + unsigned int new_size; + int bsize; + int count = 0; + int rc = 0; + int i; + + LASSERT(hs != NULL && cfs_hash_with_rehash(hs)); + + cfs_hash_lock(hs, 0); + LASSERT(cfs_hash_is_rehashing(hs)); + + old_size = CFS_HASH_NBKT(hs); + new_size = CFS_HASH_RH_NBKT(hs); + + cfs_hash_unlock(hs, 0); + + /* + * don't need hs::hs_rwlock for hs::hs_buckets, + * because nobody can change bkt-table except me. 
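+	 * (the bucket-table pointer itself is only swapped later, under
+	 * the exclusive hash lock, and a given work item never runs
+	 * concurrently with itself)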
+ */ + bkts = cfs_hash_buckets_realloc(hs, hs->hs_buckets, + old_size, new_size); + cfs_hash_lock(hs, 1); + if (bkts == NULL) { + rc = -ENOMEM; + goto out; + } + + if (bkts == hs->hs_buckets) { + bkts = NULL; /* do nothing */ + goto out; + } + + rc = __cfs_hash_theta(hs); + if ((rc >= hs->hs_min_theta) && (rc <= hs->hs_max_theta)) { + /* free the new allocated bkt-table */ + old_size = new_size; + new_size = CFS_HASH_NBKT(hs); + rc = -EALREADY; + goto out; + } + + LASSERT(hs->hs_rehash_buckets == NULL); + hs->hs_rehash_buckets = bkts; + + rc = 0; + cfs_hash_for_each_bucket(hs, &bd, i) { + if (cfs_hash_is_exiting(hs)) { + rc = -ESRCH; + /* someone wants to destroy the hash, abort now */ + if (old_size < new_size) /* OK to free old bkt-table */ + break; + /* it's shrinking, need free new bkt-table */ + hs->hs_rehash_buckets = NULL; + old_size = new_size; + new_size = CFS_HASH_NBKT(hs); + goto out; + } + + count += cfs_hash_rehash_bd(hs, &bd); + if (count < CFS_HASH_LOOP_HOG || + cfs_hash_is_iterating(hs)) { /* need to finish ASAP */ + continue; + } + + count = 0; + cfs_hash_unlock(hs, 1); + cond_resched(); + cfs_hash_lock(hs, 1); + } + + hs->hs_rehash_count++; + + bkts = hs->hs_buckets; + hs->hs_buckets = hs->hs_rehash_buckets; + hs->hs_rehash_buckets = NULL; + + hs->hs_cur_bits = hs->hs_rehash_bits; +out: + hs->hs_rehash_bits = 0; + bsize = cfs_hash_bkt_size(hs); + cfs_hash_unlock(hs, 1); + /* can't refer to @hs anymore because it could be destroyed */ + if (bkts != NULL) + cfs_hash_buckets_free(bkts, bsize, new_size, old_size); + if (rc != 0) + CDEBUG(D_INFO, "early quit of rehashing: %d\n", rc); +} + +/** + * Rehash the object referenced by @hnode in the libcfs hash @hs. The + * @old_key must be provided to locate the objects previous location + * in the hash, and the @new_key will be used to reinsert the object. + * Use this function instead of a cfs_hash_add() + cfs_hash_del() + * combo when it is critical that there is no window in time where the + * object is missing from the hash. When an object is being rehashed + * the registered cfs_hash_get() and cfs_hash_put() functions will + * not be called. 
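+ *
+ * Sketch (illustrative; "o_hnode" is an assumed embedded hlist_node
+ * and the key fields are caller-defined):
+ *
+ *	cfs_hash_rehash_key(hs, &obj->o_old_id, &obj->o_new_id,
+ *			    &obj->o_hnode);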
+ */
+void cfs_hash_rehash_key(struct cfs_hash *hs, const void *old_key,
+			 void *new_key, struct hlist_node *hnode)
+{
+	struct cfs_hash_bd bds[3];
+	struct cfs_hash_bd old_bds[2];
+	struct cfs_hash_bd new_bd;
+
+	LASSERT(!hlist_unhashed(hnode));
+
+	cfs_hash_lock(hs, 0);
+
+	cfs_hash_dual_bd_get(hs, old_key, old_bds);
+	cfs_hash_bd_get(hs, new_key, &new_bd);
+
+	bds[0] = old_bds[0];
+	bds[1] = old_bds[1];
+	bds[2] = new_bd;
+
+	/* NB: bds[0] and bds[1] are ordered already */
+	cfs_hash_bd_order(&bds[1], &bds[2]);
+	cfs_hash_bd_order(&bds[0], &bds[1]);
+
+	cfs_hash_multi_bd_lock(hs, bds, 3, 1);
+	if (likely(old_bds[1].bd_bucket == NULL)) {
+		cfs_hash_bd_move_locked(hs, &old_bds[0], &new_bd, hnode);
+	} else {
+		cfs_hash_dual_bd_finddel_locked(hs, old_bds, old_key, hnode);
+		cfs_hash_bd_add_locked(hs, &new_bd, hnode);
+	}
+	/* overwrite the key inside the locks, otherwise this may race
+	 * with other operations, e.g. rehash */
+	cfs_hash_keycpy(hs, hnode, new_key);
+
+	cfs_hash_multi_bd_unlock(hs, bds, 3, 1);
+	cfs_hash_unlock(hs, 0);
+}
+EXPORT_SYMBOL(cfs_hash_rehash_key);
+
+void cfs_hash_debug_header(struct seq_file *m)
+{
+	seq_printf(m, "%-*s cur min max theta t-min t-max flags rehash count maxdep maxdepb distribution\n",
+		   CFS_HASH_BIGNAME_LEN, "name");
+}
+EXPORT_SYMBOL(cfs_hash_debug_header);
+
+static struct cfs_hash_bucket **
+cfs_hash_full_bkts(struct cfs_hash *hs)
+{
+	/* NB: caller should hold hs->hs_rwlock if REHASH is set */
+	if (hs->hs_rehash_buckets == NULL)
+		return hs->hs_buckets;
+
+	LASSERT(hs->hs_rehash_bits != 0);
+	return hs->hs_rehash_bits > hs->hs_cur_bits ?
+	       hs->hs_rehash_buckets : hs->hs_buckets;
+}
+
+static unsigned int
+cfs_hash_full_nbkt(struct cfs_hash *hs)
+{
+	/* NB: caller should hold hs->hs_rwlock if REHASH is set */
+	if (hs->hs_rehash_buckets == NULL)
+		return CFS_HASH_NBKT(hs);
+
+	LASSERT(hs->hs_rehash_bits != 0);
+	return hs->hs_rehash_bits > hs->hs_cur_bits ?
+	       CFS_HASH_RH_NBKT(hs) : CFS_HASH_NBKT(hs);
+}
+
+void cfs_hash_debug_str(struct cfs_hash *hs, struct seq_file *m)
+{
+	int dist[8] = { 0, };
+	int maxdep = -1;
+	int maxdepb = -1;
+	int total = 0;
+	int theta;
+	int i;
+
+	cfs_hash_lock(hs, 0);
+	theta = __cfs_hash_theta(hs);
+
+	seq_printf(m, "%-*s %5d %5d %5d %d.%03d %d.%03d %d.%03d 0x%02x %6d ",
+		   CFS_HASH_BIGNAME_LEN, hs->hs_name,
+		   1 << hs->hs_cur_bits, 1 << hs->hs_min_bits,
+		   1 << hs->hs_max_bits,
+		   __cfs_hash_theta_int(theta), __cfs_hash_theta_frac(theta),
+		   __cfs_hash_theta_int(hs->hs_min_theta),
+		   __cfs_hash_theta_frac(hs->hs_min_theta),
+		   __cfs_hash_theta_int(hs->hs_max_theta),
+		   __cfs_hash_theta_frac(hs->hs_max_theta),
+		   hs->hs_flags, hs->hs_rehash_count);
+
+	/*
+	 * The distribution is a summary of the chained hash depth in
+	 * each of the libcfs hash buckets. Each bucket's hsb_count is
+	 * divided by the hash theta value and used to generate a
+	 * histogram of the hash distribution. A uniform hash will
+	 * result in all hash buckets being close to the average, thus
+	 * only the first few entries in the histogram will be non-zero.
+	 * If your hash function produces a non-uniform distribution,
+	 * this will be observable as outlier buckets in the histogram.
+ * + * Uniform hash distribution: 128/128/0/0/0/0/0/0 + * Non-Uniform hash distribution: 128/125/0/0/0/0/2/1 + */ + for (i = 0; i < cfs_hash_full_nbkt(hs); i++) { + struct cfs_hash_bd bd; + + bd.bd_bucket = cfs_hash_full_bkts(hs)[i]; + cfs_hash_bd_lock(hs, &bd, 0); + if (maxdep < bd.bd_bucket->hsb_depmax) { + maxdep = bd.bd_bucket->hsb_depmax; + maxdepb = ffz(~maxdep); + } + total += bd.bd_bucket->hsb_count; + dist[min(fls(bd.bd_bucket->hsb_count / max(theta, 1)), 7)]++; + cfs_hash_bd_unlock(hs, &bd, 0); + } + + seq_printf(m, "%7d %7d %7d ", total, maxdep, maxdepb); + for (i = 0; i < 8; i++) + seq_printf(m, "%d%c", dist[i], (i == 7) ? '\n' : '/'); + + cfs_hash_unlock(hs, 0); +} +EXPORT_SYMBOL(cfs_hash_debug_str); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_cpu.c b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_cpu.c new file mode 100644 index 0000000000000..2616fc9fe9386 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_cpu.c @@ -0,0 +1,1270 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Please see comments in libcfs/include/libcfs/libcfs_cpu.h for introduction + * + * Author: liang@whamcloud.com + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include +#include + +/** virtual processing unit */ +struct cfs_cpu_partition { + /* CPUs mask for this partition */ + cpumask_var_t cpt_cpumask; + /* nodes mask for this partition */ + nodemask_t *cpt_nodemask; + /* NUMA distance between CPTs */ + unsigned int *cpt_distance; + /* spread rotor for NUMA allocator */ + unsigned int cpt_spread_rotor; + /* NUMA node if cpt_nodemask is empty */ + int cpt_node; +}; + +/** descriptor for CPU partitions */ +struct cfs_cpt_table { + /* spread rotor for NUMA allocator */ + unsigned int ctb_spread_rotor; + /* maximum NUMA distance between all nodes in table */ + unsigned int ctb_distance; + /* # of CPU partitions */ + int ctb_nparts; + /* partitions tables */ + struct cfs_cpu_partition *ctb_parts; + /* shadow HW CPU to CPU partition ID */ + int *ctb_cpu2cpt; + /* all cpus in this partition table */ + cpumask_var_t ctb_cpumask; + /* shadow HW node to CPU partition ID */ + int *ctb_node2cpt; + /* all nodes in this partition table */ + nodemask_t *ctb_nodemask; +}; + +/** Global CPU partition table */ +struct cfs_cpt_table *cfs_cpt_tab __read_mostly; +EXPORT_SYMBOL(cfs_cpt_tab); + +/** + * modparam for setting number of partitions + * + * 0 : estimate best value based on cores or NUMA nodes + * 1 : disable multiple partitions + * >1 : specify number of partitions + */ +static int cpu_npartitions; +module_param(cpu_npartitions, int, 0444); +MODULE_PARM_DESC(cpu_npartitions, "# of CPU partitions"); + +/** + * modparam for setting CPU partitions patterns: + * + * i.e: "0[0,1,2,3] 1[4,5,6,7]", number before bracket is CPU partition ID, + * number in bracket is processor ID (core or HT) + * + * i.e: "N 0[0,1] 1[2,3]" the first character 'N' means numbers in bracket + * are NUMA node ID, number before bracket is CPU partition ID. 
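+ *
+ * i.e: "0[0-3] 1[4-7]", bracketed values are parsed with
+ * cfs_expr_list_parse() (see the pattern parser below), so ranges
+ * with an optional stride such as "[0-6/2]" are also accepted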
+ * + * i.e: "N", shortcut expression to create CPT from NUMA & CPU topology + * + * NB: If user specified cpu_pattern, cpu_npartitions will be ignored + */ +static char *cpu_pattern = "N"; +module_param(cpu_pattern, charp, 0444); +MODULE_PARM_DESC(cpu_pattern, "CPU partitions pattern"); + +struct cfs_cpt_table *cfs_cpt_table_alloc(int ncpt) +{ + struct cfs_cpt_table *cptab; + int i; + + LIBCFS_ALLOC(cptab, sizeof(*cptab)); + if (!cptab) + return NULL; + + cptab->ctb_nparts = ncpt; + + if (!zalloc_cpumask_var(&cptab->ctb_cpumask, GFP_NOFS)) + goto failed_alloc_cpumask; + + LIBCFS_ALLOC(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask)); + if (!cptab->ctb_nodemask) + goto failed_alloc_nodemask; + + CFS_ALLOC_PTR_ARRAY(cptab->ctb_cpu2cpt, nr_cpu_ids); + if (!cptab->ctb_cpu2cpt) + goto failed_alloc_cpu2cpt; + + memset(cptab->ctb_cpu2cpt, -1, + nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0])); + + CFS_ALLOC_PTR_ARRAY(cptab->ctb_node2cpt, nr_node_ids); + if (!cptab->ctb_node2cpt) + goto failed_alloc_node2cpt; + + memset(cptab->ctb_node2cpt, -1, + nr_node_ids * sizeof(cptab->ctb_node2cpt[0])); + + CFS_ALLOC_PTR_ARRAY(cptab->ctb_parts, ncpt); + if (!cptab->ctb_parts) + goto failed_alloc_ctb_parts; + + memset(cptab->ctb_parts, -1, ncpt * sizeof(cptab->ctb_parts[0])); + + for (i = 0; i < ncpt; i++) { + struct cfs_cpu_partition *part = &cptab->ctb_parts[i]; + + if (!zalloc_cpumask_var(&part->cpt_cpumask, GFP_NOFS)) + goto failed_setting_ctb_parts; + + LIBCFS_ALLOC(part->cpt_nodemask, sizeof(*part->cpt_nodemask)); + if (!part->cpt_nodemask) + goto failed_setting_ctb_parts; + + CFS_ALLOC_PTR_ARRAY(part->cpt_distance, cptab->ctb_nparts); + if (!part->cpt_distance) + goto failed_setting_ctb_parts; + + memset(part->cpt_distance, -1, + cptab->ctb_nparts * sizeof(part->cpt_distance[0])); + } + + return cptab; + +failed_setting_ctb_parts: + while (i-- >= 0) { + struct cfs_cpu_partition *part = &cptab->ctb_parts[i]; + + if (part->cpt_nodemask) { + LIBCFS_FREE(part->cpt_nodemask, + sizeof(*part->cpt_nodemask)); + } + + free_cpumask_var(part->cpt_cpumask); + + if (part->cpt_distance) { + CFS_FREE_PTR_ARRAY(part->cpt_distance, + cptab->ctb_nparts); + } + } + + if (cptab->ctb_parts) + CFS_FREE_PTR_ARRAY(cptab->ctb_parts, cptab->ctb_nparts); + +failed_alloc_ctb_parts: + if (cptab->ctb_node2cpt) + CFS_FREE_PTR_ARRAY(cptab->ctb_node2cpt, nr_node_ids); + +failed_alloc_node2cpt: + if (cptab->ctb_cpu2cpt) + CFS_FREE_PTR_ARRAY(cptab->ctb_cpu2cpt, nr_cpu_ids); + +failed_alloc_cpu2cpt: + if (cptab->ctb_nodemask) + LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask)); +failed_alloc_nodemask: + free_cpumask_var(cptab->ctb_cpumask); +failed_alloc_cpumask: + LIBCFS_FREE(cptab, sizeof(*cptab)); + return NULL; +} +EXPORT_SYMBOL(cfs_cpt_table_alloc); + +void cfs_cpt_table_free(struct cfs_cpt_table *cptab) +{ + int i; + + if (cptab->ctb_cpu2cpt) + CFS_FREE_PTR_ARRAY(cptab->ctb_cpu2cpt, nr_cpu_ids); + + if (cptab->ctb_node2cpt) + CFS_FREE_PTR_ARRAY(cptab->ctb_node2cpt, nr_node_ids); + + for (i = 0; cptab->ctb_parts && i < cptab->ctb_nparts; i++) { + struct cfs_cpu_partition *part = &cptab->ctb_parts[i]; + + if (part->cpt_nodemask) { + LIBCFS_FREE(part->cpt_nodemask, + sizeof(*part->cpt_nodemask)); + } + + free_cpumask_var(part->cpt_cpumask); + + if (part->cpt_distance) + CFS_FREE_PTR_ARRAY(part->cpt_distance, + cptab->ctb_nparts); + } + + if (cptab->ctb_parts) + CFS_FREE_PTR_ARRAY(cptab->ctb_parts, cptab->ctb_nparts); + + if (cptab->ctb_nodemask) + LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask)); + 
free_cpumask_var(cptab->ctb_cpumask); + + LIBCFS_FREE(cptab, sizeof(*cptab)); +} +EXPORT_SYMBOL(cfs_cpt_table_free); + +int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len) +{ + char *tmp = buf; + int rc; + int i; + int j; + + for (i = 0; i < cptab->ctb_nparts; i++) { + if (len <= 0) + goto err; + + rc = snprintf(tmp, len, "%d\t:", i); + len -= rc; + + if (len <= 0) + goto err; + + tmp += rc; + for_each_cpu(j, cptab->ctb_parts[i].cpt_cpumask) { + rc = snprintf(tmp, len, " %d", j); + len -= rc; + if (len <= 0) + goto err; + tmp += rc; + } + + *tmp = '\n'; + tmp++; + len--; + } + + return tmp - buf; +err: + return -E2BIG; +} +EXPORT_SYMBOL(cfs_cpt_table_print); + +int cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len) +{ + char *tmp = buf; + int rc; + int i; + int j; + + for (i = 0; i < cptab->ctb_nparts; i++) { + if (len <= 0) + goto err; + + rc = snprintf(tmp, len, "%d\t:", i); + len -= rc; + + if (len <= 0) + goto err; + + tmp += rc; + for (j = 0; j < cptab->ctb_nparts; j++) { + rc = snprintf(tmp, len, " %d:%d", j, + cptab->ctb_parts[i].cpt_distance[j]); + len -= rc; + if (len <= 0) + goto err; + tmp += rc; + } + + *tmp = '\n'; + tmp++; + len--; + } + + return tmp - buf; +err: + return -E2BIG; +} +EXPORT_SYMBOL(cfs_cpt_distance_print); + +int cfs_cpt_number(struct cfs_cpt_table *cptab) +{ + return cptab->ctb_nparts; +} +EXPORT_SYMBOL(cfs_cpt_number); + +int cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + cpumask_weight(cptab->ctb_cpumask) : + cpumask_weight(cptab->ctb_parts[cpt].cpt_cpumask); +} +EXPORT_SYMBOL(cfs_cpt_weight); + +int cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + cpumask_any_and(cptab->ctb_cpumask, + cpu_online_mask) < nr_cpu_ids : + cpumask_any_and(cptab->ctb_parts[cpt].cpt_cpumask, + cpu_online_mask) < nr_cpu_ids; +} +EXPORT_SYMBOL(cfs_cpt_online); + +cpumask_var_t *cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + &cptab->ctb_cpumask : &cptab->ctb_parts[cpt].cpt_cpumask; +} +EXPORT_SYMBOL(cfs_cpt_cpumask); + +nodemask_t *cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + cptab->ctb_nodemask : cptab->ctb_parts[cpt].cpt_nodemask; +} +EXPORT_SYMBOL(cfs_cpt_nodemask); + +unsigned int cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2) +{ + LASSERT(cpt1 == CFS_CPT_ANY || (cpt1 >= 0 && cpt1 < cptab->ctb_nparts)); + LASSERT(cpt2 == CFS_CPT_ANY || (cpt2 >= 0 && cpt2 < cptab->ctb_nparts)); + + if (cpt1 == CFS_CPT_ANY || cpt2 == CFS_CPT_ANY) + return cptab->ctb_distance; + + return cptab->ctb_parts[cpt1].cpt_distance[cpt2]; +} +EXPORT_SYMBOL(cfs_cpt_distance); + +/* + * Calculate the maximum NUMA distance between all nodes in the + * from_mask and all nodes in the to_mask. 
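+ *
+ * e.g. with typical ACPI SLIT values (10 for local, 21 for remote),
+ * any pair of masks spanning two nodes yields 21.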
+ */ +static unsigned int cfs_cpt_distance_calculate(nodemask_t *from_mask, + nodemask_t *to_mask) +{ + unsigned int maximum; + unsigned int distance; + int from; + int to; + + maximum = 0; + for_each_node_mask(from, *from_mask) { + for_each_node_mask(to, *to_mask) { + distance = node_distance(from, to); + if (maximum < distance) + maximum = distance; + } + } + return maximum; +} + +static void cfs_cpt_add_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ + cptab->ctb_cpu2cpt[cpu] = cpt; + + cpumask_set_cpu(cpu, cptab->ctb_cpumask); + cpumask_set_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask); +} + +static void cfs_cpt_del_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ + cpumask_clear_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask); + cpumask_clear_cpu(cpu, cptab->ctb_cpumask); + + cptab->ctb_cpu2cpt[cpu] = -1; +} + +static void cfs_cpt_add_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ + struct cfs_cpu_partition *part; + + if (!node_isset(node, *cptab->ctb_nodemask)) { + unsigned int dist; + + /* first time node is added to the CPT table */ + node_set(node, *cptab->ctb_nodemask); + cptab->ctb_node2cpt[node] = cpt; + + dist = cfs_cpt_distance_calculate(cptab->ctb_nodemask, + cptab->ctb_nodemask); + cptab->ctb_distance = dist; + } + + part = &cptab->ctb_parts[cpt]; + if (!node_isset(node, *part->cpt_nodemask)) { + int cpt2; + + /* first time node is added to this CPT */ + node_set(node, *part->cpt_nodemask); + for (cpt2 = 0; cpt2 < cptab->ctb_nparts; cpt2++) { + struct cfs_cpu_partition *part2; + unsigned int dist; + + part2 = &cptab->ctb_parts[cpt2]; + dist = cfs_cpt_distance_calculate(part->cpt_nodemask, + part2->cpt_nodemask); + part->cpt_distance[cpt2] = dist; + dist = cfs_cpt_distance_calculate(part2->cpt_nodemask, + part->cpt_nodemask); + part2->cpt_distance[cpt] = dist; + } + } +} + +static void cfs_cpt_del_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ + struct cfs_cpu_partition *part = &cptab->ctb_parts[cpt]; + int cpu; + + for_each_cpu(cpu, part->cpt_cpumask) { + /* this CPT has other CPU belonging to this node? */ + if (cpu_to_node(cpu) == node) + break; + } + + if (cpu >= nr_cpu_ids && node_isset(node, *part->cpt_nodemask)) { + int cpt2; + + /* No more CPUs in the node for this CPT. */ + node_clear(node, *part->cpt_nodemask); + for (cpt2 = 0; cpt2 < cptab->ctb_nparts; cpt2++) { + struct cfs_cpu_partition *part2; + unsigned int dist; + + part2 = &cptab->ctb_parts[cpt2]; + if (node_isset(node, *part2->cpt_nodemask)) + cptab->ctb_node2cpt[node] = cpt2; + + dist = cfs_cpt_distance_calculate(part->cpt_nodemask, + part2->cpt_nodemask); + part->cpt_distance[cpt2] = dist; + dist = cfs_cpt_distance_calculate(part2->cpt_nodemask, + part->cpt_nodemask); + part2->cpt_distance[cpt] = dist; + } + } + + for_each_cpu(cpu, cptab->ctb_cpumask) { + /* this CPT-table has other CPUs belonging to this node? */ + if (cpu_to_node(cpu) == node) + break; + } + + if (cpu >= nr_cpu_ids && node_isset(node, *cptab->ctb_nodemask)) { + /* No more CPUs in the table for this node. 
*/ + node_clear(node, *cptab->ctb_nodemask); + cptab->ctb_node2cpt[node] = -1; + cptab->ctb_distance = + cfs_cpt_distance_calculate(cptab->ctb_nodemask, + cptab->ctb_nodemask); + } +} + +int cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ + LASSERT(cpt >= 0 && cpt < cptab->ctb_nparts); + + if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_online(cpu)) { + CDEBUG(D_INFO, "CPU %d is invalid or it's offline\n", cpu); + return 0; + } + + if (cptab->ctb_cpu2cpt[cpu] != -1) { + CDEBUG(D_INFO, "CPU %d is already in partition %d\n", + cpu, cptab->ctb_cpu2cpt[cpu]); + return 0; + } + + if (cpumask_test_cpu(cpu, cptab->ctb_cpumask)) { + CDEBUG(D_INFO, "CPU %d is already in cpumask\n", cpu); + return 0; + } + + if (cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask)) { + CDEBUG(D_INFO, "CPU %d is already in partition %d cpumask\n", + cpu, cptab->ctb_cpu2cpt[cpu]); + return 0; + } + + cfs_cpt_add_cpu(cptab, cpt, cpu); + cfs_cpt_add_node(cptab, cpt, cpu_to_node(cpu)); + + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_cpu); + +void cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + if (cpu < 0 || cpu >= nr_cpu_ids) { + CDEBUG(D_INFO, "Invalid CPU id %d\n", cpu); + return; + } + + if (cpt == CFS_CPT_ANY) { + /* caller doesn't know the partition ID */ + cpt = cptab->ctb_cpu2cpt[cpu]; + if (cpt < 0) { /* not set in this CPT-table */ + CDEBUG(D_INFO, + "Try to unset cpu %d which is not in CPT-table %p\n", + cpt, cptab); + return; + } + + } else if (cpt != cptab->ctb_cpu2cpt[cpu]) { + CDEBUG(D_INFO, + "CPU %d is not in CPU partition %d\n", cpu, cpt); + return; + } + + LASSERT(cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask)); + LASSERT(cpumask_test_cpu(cpu, cptab->ctb_cpumask)); + + cfs_cpt_del_cpu(cptab, cpt, cpu); + cfs_cpt_del_node(cptab, cpt, cpu_to_node(cpu)); +} +EXPORT_SYMBOL(cfs_cpt_unset_cpu); + +int cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, + const cpumask_t *mask) +{ + int cpu; + + if (!cpumask_weight(mask) || + cpumask_any_and(mask, cpu_online_mask) >= nr_cpu_ids) { + CDEBUG(D_INFO, + "No online CPU is found in the CPU mask for CPU partition %d\n", + cpt); + return 0; + } + + for_each_cpu(cpu, mask) { + cfs_cpt_add_cpu(cptab, cpt, cpu); + cfs_cpt_add_node(cptab, cpt, cpu_to_node(cpu)); + } + + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_cpumask); + +void cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, + const cpumask_t *mask) +{ + int cpu; + + for_each_cpu(cpu, mask) { + cfs_cpt_del_cpu(cptab, cpt, cpu); + cfs_cpt_del_node(cptab, cpt, cpu_to_node(cpu)); + } +} +EXPORT_SYMBOL(cfs_cpt_unset_cpumask); + +int cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ + const cpumask_t *mask; + int cpu; + + if (node < 0 || node >= nr_node_ids) { + CDEBUG(D_INFO, + "Invalid NUMA id %d for CPU partition %d\n", node, cpt); + return 0; + } + + mask = cpumask_of_node(node); + + for_each_cpu(cpu, mask) + cfs_cpt_add_cpu(cptab, cpt, cpu); + + cfs_cpt_add_node(cptab, cpt, node); + + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_node); + +void cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ + const cpumask_t *mask; + int cpu; + + if (node < 0 || node >= nr_node_ids) { + CDEBUG(D_INFO, + "Invalid NUMA id %d for CPU partition %d\n", node, cpt); + return; + } + + mask = cpumask_of_node(node); + + for_each_cpu(cpu, mask) + cfs_cpt_del_cpu(cptab, cpt, cpu); + + cfs_cpt_del_node(cptab, cpt, node); +} +EXPORT_SYMBOL(cfs_cpt_unset_node); + +int 
cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, + const nodemask_t *mask) +{ + int node; + + for_each_node_mask(node, *mask) + cfs_cpt_set_node(cptab, cpt, node); + + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_nodemask); + +void cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, + const nodemask_t *mask) +{ + int node; + + for_each_node_mask(node, *mask) + cfs_cpt_unset_node(cptab, cpt, node); +} +EXPORT_SYMBOL(cfs_cpt_unset_nodemask); + +int cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt) +{ + nodemask_t *mask; + int weight; + unsigned int rotor; + int node = 0; + + /* convert CPU partition ID to HW node id */ + + if (cpt < 0 || cpt >= cptab->ctb_nparts) { + mask = cptab->ctb_nodemask; + rotor = cptab->ctb_spread_rotor++; + } else { + mask = cptab->ctb_parts[cpt].cpt_nodemask; + rotor = cptab->ctb_parts[cpt].cpt_spread_rotor++; + node = cptab->ctb_parts[cpt].cpt_node; + } + + weight = nodes_weight(*mask); + if (weight > 0) { + rotor %= weight; + + for_each_node_mask(node, *mask) { + if (!rotor--) + return node; + } + } + + return node; +} +EXPORT_SYMBOL(cfs_cpt_spread_node); + +int cfs_cpt_current(struct cfs_cpt_table *cptab, int remap) +{ + int cpu; + int cpt; + + preempt_disable(); + cpu = smp_processor_id(); + cpt = cptab->ctb_cpu2cpt[cpu]; + + if (cpt < 0 && remap) { + /* don't return negative value for safety of upper layer, + * instead we shadow the unknown cpu to a valid partition ID + */ + cpt = cpu % cptab->ctb_nparts; + } + preempt_enable(); + return cpt; +} +EXPORT_SYMBOL(cfs_cpt_current); + +int cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu) +{ + LASSERT(cpu >= 0 && cpu < nr_cpu_ids); + + return cptab->ctb_cpu2cpt[cpu]; +} +EXPORT_SYMBOL(cfs_cpt_of_cpu); + +int cfs_cpt_of_node(struct cfs_cpt_table *cptab, int node) +{ + if (node < 0 || node > nr_node_ids) + return CFS_CPT_ANY; + + return cptab->ctb_node2cpt[node]; +} +EXPORT_SYMBOL(cfs_cpt_of_node); + +int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt) +{ + nodemask_t *nodemask; + cpumask_t *cpumask; + int cpu; + int rc; + + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + if (cpt == CFS_CPT_ANY) { + cpumask = cptab->ctb_cpumask; + nodemask = cptab->ctb_nodemask; + } else { + cpumask = cptab->ctb_parts[cpt].cpt_cpumask; + nodemask = cptab->ctb_parts[cpt].cpt_nodemask; + } + + if (!cpumask_intersects(cpumask, cpu_online_mask)) { + CDEBUG(D_INFO, + "No online CPU found in CPU partition %d, did someone do CPU hotplug on system? You might need to reload Lustre modules to keep system working well.\n", + cpt); + return -ENODEV; + } + + for_each_online_cpu(cpu) { + if (cpumask_test_cpu(cpu, cpumask)) + continue; + + rc = set_cpus_allowed_ptr(current, cpumask); + set_mems_allowed(*nodemask); + if (!rc) + schedule(); /* switch to allowed CPU */ + + return rc; + } + + /* don't need to set affinity because all online CPUs are covered */ + return 0; +} +EXPORT_SYMBOL(cfs_cpt_bind); + +/** + * Choose max to \a number CPUs from \a node and set them in \a cpt. + * We always prefer to choose CPU in the same core/socket. 
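+ *
+ * e.g. on a node whose cores have two hyperthreads each, a request
+ * for 4 CPUs takes both siblings of two cores on one socket rather
+ * than one thread from four scattered cores.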
+ */ +static int cfs_cpt_choose_ncpus(struct cfs_cpt_table *cptab, int cpt, + cpumask_t *node_mask, int number) +{ + cpumask_var_t socket_mask; + cpumask_var_t core_mask; + int rc = 0; + int cpu; + int i; + + LASSERT(number > 0); + + if (number >= cpumask_weight(node_mask)) { + while (!cpumask_empty(node_mask)) { + cpu = cpumask_first(node_mask); + cpumask_clear_cpu(cpu, node_mask); + + if (!cpu_online(cpu)) + continue; + + rc = cfs_cpt_set_cpu(cptab, cpt, cpu); + if (!rc) + return -EINVAL; + } + return 0; + } + + /* + * Allocate scratch buffers + * As we cannot initialize a cpumask_var_t, we need + * to alloc both before we can risk trying to free either + */ + if (!zalloc_cpumask_var(&socket_mask, GFP_NOFS)) + rc = -ENOMEM; + if (!zalloc_cpumask_var(&core_mask, GFP_NOFS)) + rc = -ENOMEM; + if (rc) + goto out; + + while (!cpumask_empty(node_mask)) { + cpu = cpumask_first(node_mask); + + /* get cpumask for cores in the same socket */ + cpumask_and(socket_mask, topology_core_cpumask(cpu), node_mask); + while (!cpumask_empty(socket_mask)) { + /* get cpumask for hts in the same core */ + cpumask_and(core_mask, topology_sibling_cpumask(cpu), + node_mask); + + for_each_cpu(i, core_mask) { + cpumask_clear_cpu(i, socket_mask); + cpumask_clear_cpu(i, node_mask); + + if (!cpu_online(i)) + continue; + + rc = cfs_cpt_set_cpu(cptab, cpt, i); + if (!rc) { + rc = -EINVAL; + goto out; + } + + if (!--number) + goto out; + } + cpu = cpumask_first(socket_mask); + } + } + +out: + free_cpumask_var(socket_mask); + free_cpumask_var(core_mask); + return rc; +} + +#define CPT_WEIGHT_MIN 4u + +static unsigned int cfs_cpt_num_estimate(void) +{ + unsigned int nthr; + unsigned int ncpu = num_online_cpus(); + unsigned int ncpt = 1; + + preempt_disable(); + nthr = cpumask_weight(topology_sibling_cpumask(smp_processor_id())); + preempt_enable(); + + if (ncpu > CPT_WEIGHT_MIN) + for (ncpt = 2; ncpu > 2 * nthr * ncpt; ncpt++) + ; /* nothing */ + +#if (BITS_PER_LONG == 32) + /* config many CPU partitions on 32-bit system could consume + * too much memory + */ + ncpt = min(2U, ncpt); +#endif + while (ncpu % ncpt) + ncpt--; /* worst case is 1 */ + + return ncpt; +} + +static struct cfs_cpt_table *cfs_cpt_table_create(int ncpt) +{ + struct cfs_cpt_table *cptab = NULL; + cpumask_var_t node_mask; + int cpt = 0; + int node; + int num; + int rem; + int rc = 0; + + num = cfs_cpt_num_estimate(); + if (ncpt <= 0) + ncpt = num; + + if (ncpt > num_online_cpus()) { + rc = -EINVAL; + CERROR("libcfs: CPU partition count %d > cores %d: rc = %d\n", + ncpt, num_online_cpus(), rc); + goto failed; + } + + if (ncpt > 4 * num) { + CWARN("CPU partition number %d is larger than suggested value (%d), your system may have performance issue or run out of memory while under pressure\n", + ncpt, num); + } + + cptab = cfs_cpt_table_alloc(ncpt); + if (!cptab) { + CERROR("Failed to allocate CPU map(%d)\n", ncpt); + rc = -ENOMEM; + goto failed; + } + + if (!zalloc_cpumask_var(&node_mask, GFP_NOFS)) { + CERROR("Failed to allocate scratch cpumask\n"); + rc = -ENOMEM; + goto failed; + } + + num = num_online_cpus() / ncpt; + rem = num_online_cpus() % ncpt; + for_each_online_node(node) { + cpumask_copy(node_mask, cpumask_of_node(node)); + + while (cpt < ncpt && !cpumask_empty(node_mask)) { + struct cfs_cpu_partition *part = &cptab->ctb_parts[cpt]; + int ncpu = cpumask_weight(part->cpt_cpumask); + + rc = cfs_cpt_choose_ncpus(cptab, cpt, node_mask, + (rem > 0) + num - ncpu); + if (rc < 0) { + rc = -EINVAL; + goto failed_mask; + } + + ncpu = 
cpumask_weight(part->cpt_cpumask); + if (ncpu == num + !!(rem > 0)) { + cpt++; + rem--; + } + } + } + + free_cpumask_var(node_mask); + + return cptab; + +failed_mask: + free_cpumask_var(node_mask); +failed: + CERROR("Failed (rc = %d) to setup CPU partition table with %d partitions, online HW NUMA nodes: %d, HW CPU cores: %d.\n", + rc, ncpt, num_online_nodes(), num_online_cpus()); + + if (cptab) + cfs_cpt_table_free(cptab); + + return ERR_PTR(rc); +} + +static struct cfs_cpt_table *cfs_cpt_table_create_pattern(const char *pattern) +{ + struct cfs_cpt_table *cptab; + char *pattern_dup; + char *bracket; + char *str; + int node = 0; + int ncpt = 0; + int cpt = 0; + int high; + int rc; + int c; + int i; + + pattern_dup = kstrdup(pattern, GFP_KERNEL); + if (!pattern_dup) { + CERROR("Failed to duplicate pattern '%s'\n", pattern); + return ERR_PTR(-ENOMEM); + } + + str = strim(pattern_dup); + if (*str == 'n' || *str == 'N') { + str++; /* skip 'N' char */ + node = 1; /* NUMA pattern */ + if (*str == '\0') { + node = -1; + for_each_online_node(i) { + if (!cpumask_empty(cpumask_of_node(i))) + ncpt++; + } + if (ncpt == 1) { /* single NUMA node */ + kfree(pattern_dup); + return cfs_cpt_table_create(cpu_npartitions); + } + } + } + + if (!ncpt) { /* scanning bracket which is mark of partition */ + bracket = str; + while ((bracket = strchr(bracket, '['))) { + bracket++; + ncpt++; + } + } + + if (!ncpt || + (node && ncpt > num_online_nodes()) || + (!node && ncpt > num_online_cpus())) { + CERROR("Invalid pattern '%s', or too many partitions %d\n", + pattern_dup, ncpt); + rc = -EINVAL; + goto err_free_str; + } + + cptab = cfs_cpt_table_alloc(ncpt); + if (!cptab) { + CERROR("Failed to allocate CPU partition table\n"); + rc = -ENOMEM; + goto err_free_str; + } + + if (node < 0) { /* shortcut to create CPT from NUMA & CPU topology */ + for_each_online_node(i) { + if (cpumask_empty(cpumask_of_node(i))) + continue; + + rc = cfs_cpt_set_node(cptab, cpt++, i); + if (!rc) { + rc = -EINVAL; + goto err_free_table; + } + } + kfree(pattern_dup); + return cptab; + } + + high = node ? 
nr_node_ids - 1 : nr_cpu_ids - 1; + + for (str = strim(str), c = 0; /* until break */; c++) { + struct cfs_range_expr *range; + struct cfs_expr_list *el; + int n; + + bracket = strchr(str, '['); + if (!bracket) { + if (*str) { + CERROR("Invalid pattern '%s'\n", str); + rc = -EINVAL; + goto err_free_table; + } else if (c != ncpt) { + CERROR("Expect %d partitions but found %d\n", + ncpt, c); + rc = -EINVAL; + goto err_free_table; + } + break; + } + + if (sscanf(str, "%d%n", &cpt, &n) < 1) { + CERROR("Invalid CPU pattern '%s'\n", str); + rc = -EINVAL; + goto err_free_table; + } + + if (cpt < 0 || cpt >= ncpt) { + CERROR("Invalid partition id %d, total partitions %d\n", + cpt, ncpt); + rc = -EINVAL; + goto err_free_table; + } + + if (cfs_cpt_weight(cptab, cpt)) { + CERROR("Partition %d has already been set.\n", cpt); + rc = -EPERM; + goto err_free_table; + } + + str = strim(str + n); + if (str != bracket) { + CERROR("Invalid pattern '%s'\n", str); + rc = -EINVAL; + goto err_free_table; + } + + bracket = strchr(str, ']'); + if (!bracket) { + CERROR("Missing right bracket for partition %d in '%s'\n", + cpt, str); + rc = -EINVAL; + goto err_free_table; + } + + rc = cfs_expr_list_parse(str, (bracket - str) + 1, 0, high, + &el); + if (rc) { + CERROR("Can't parse number range in '%s'\n", str); + rc = -ERANGE; + goto err_free_table; + } + + list_for_each_entry(range, &el->el_exprs, re_link) { + for (i = range->re_lo; i <= range->re_hi; i++) { + if ((i - range->re_lo) % range->re_stride) + continue; + + rc = node ? cfs_cpt_set_node(cptab, cpt, i) + : cfs_cpt_set_cpu(cptab, cpt, i); + if (!rc) { + cfs_expr_list_free(el); + rc = -EINVAL; + goto err_free_table; + } + } + } + + cfs_expr_list_free(el); + + if (!cfs_cpt_online(cptab, cpt)) { + CERROR("No online CPU is found on partition %d\n", cpt); + rc = -ENODEV; + goto err_free_table; + } + + str = strim(bracket + 1); + } + + kfree(pattern_dup); + return cptab; + +err_free_table: + cfs_cpt_table_free(cptab); +err_free_str: + kfree(pattern_dup); + return ERR_PTR(rc); +} + +#ifdef CONFIG_HOTPLUG_CPU +#ifdef HAVE_HOTPLUG_STATE_MACHINE +static enum cpuhp_state lustre_cpu_online; + +static int cfs_cpu_online(unsigned int cpu) +{ + return 0; +} +#endif + +static int cfs_cpu_dead(unsigned int cpu) +{ + bool warn; + + /* if all HTs in a core are offline, it may break affinity */ + warn = cpumask_any_and(topology_sibling_cpumask(cpu), + cpu_online_mask) >= nr_cpu_ids; + CDEBUG(warn ? 
D_WARNING : D_INFO, + "Lustre: can't support CPU plug-out well now, performance and stability could be impacted [CPU %u]\n", + cpu); + return 0; +} + +#ifndef HAVE_HOTPLUG_STATE_MACHINE +static int cfs_cpu_notify(struct notifier_block *self, unsigned long action, + void *hcpu) +{ + int cpu = (unsigned long)hcpu; + + switch (action) { + case CPU_DEAD: + case CPU_DEAD_FROZEN: + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + default: + if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) { + CDEBUG(D_INFO, "CPU changed [cpu %u action %lx]\n", + cpu, action); + break; + } + + cfs_cpu_dead(cpu); + } + + return NOTIFY_OK; +} + +static struct notifier_block cfs_cpu_notifier = { + .notifier_call = cfs_cpu_notify, + .priority = 0 +}; +#endif /* !HAVE_HOTPLUG_STATE_MACHINE */ +#endif /* CONFIG_HOTPLUG_CPU */ + +void cfs_cpu_fini(void) +{ + if (!IS_ERR_OR_NULL(cfs_cpt_tab)) + cfs_cpt_table_free(cfs_cpt_tab); + +#ifdef CONFIG_HOTPLUG_CPU +#ifdef HAVE_HOTPLUG_STATE_MACHINE + if (lustre_cpu_online > 0) + cpuhp_remove_state_nocalls(lustre_cpu_online); + cpuhp_remove_state_nocalls(CPUHP_LUSTRE_CFS_DEAD); +#else + unregister_hotcpu_notifier(&cfs_cpu_notifier); +#endif /* !HAVE_HOTPLUG_STATE_MACHINE */ +#endif /* CONFIG_HOTPLUG_CPU */ +} + +int cfs_cpu_init(void) +{ + int ret; + + LASSERT(!cfs_cpt_tab); + +#ifdef CONFIG_HOTPLUG_CPU +#ifdef HAVE_HOTPLUG_STATE_MACHINE + ret = cpuhp_setup_state_nocalls(CPUHP_LUSTRE_CFS_DEAD, + "fs/lustre/cfe:dead", NULL, + cfs_cpu_dead); + if (ret < 0) + goto failed_cpu_dead; + + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "fs/lustre/cfe:online", + cfs_cpu_online, NULL); + if (ret < 0) + goto failed_cpu_online; + + lustre_cpu_online = ret; +#else + register_hotcpu_notifier(&cfs_cpu_notifier); +#endif /* !HAVE_HOTPLUG_STATE_MACHINE */ +#endif /* CONFIG_HOTPLUG_CPU */ + + cpus_read_lock(); + if (*cpu_pattern) { + cfs_cpt_tab = cfs_cpt_table_create_pattern(cpu_pattern); + if (IS_ERR(cfs_cpt_tab)) { + CERROR("Failed to create cptab from pattern '%s'\n", + cpu_pattern); + ret = PTR_ERR(cfs_cpt_tab); + goto failed_alloc_table; + } + + } else { + cfs_cpt_tab = cfs_cpt_table_create(cpu_npartitions); + if (IS_ERR(cfs_cpt_tab)) { + CERROR("Failed to create cptab with npartitions %d\n", + cpu_npartitions); + ret = PTR_ERR(cfs_cpt_tab); + goto failed_alloc_table; + } + } + + cpus_read_unlock(); + + LCONSOLE(0, "HW NUMA nodes: %d, HW CPU cores: %d, npartitions: %d\n", + num_online_nodes(), num_online_cpus(), + cfs_cpt_number(cfs_cpt_tab)); + return 0; + +failed_alloc_table: + cpus_read_unlock(); + + if (!IS_ERR_OR_NULL(cfs_cpt_tab)) + cfs_cpt_table_free(cfs_cpt_tab); + +#ifdef CONFIG_HOTPLUG_CPU +#ifdef HAVE_HOTPLUG_STATE_MACHINE + if (lustre_cpu_online > 0) + cpuhp_remove_state_nocalls(lustre_cpu_online); +failed_cpu_online: + cpuhp_remove_state_nocalls(CPUHP_LUSTRE_CFS_DEAD); +failed_cpu_dead: +#else + unregister_hotcpu_notifier(&cfs_cpu_notifier); +#endif /* !HAVE_HOTPLUG_STATE_MACHINE */ +#endif /* CONFIG_HOTPLUG_CPU */ + return ret; +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_lock.c b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_lock.c new file mode 100644 index 0000000000000..c4ad568654a13 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_lock.c @@ -0,0 +1,156 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Author: liang@whamcloud.com + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +/** destroy cpu-partition lock, see libcfs_private.h for more detail */ +void +cfs_percpt_lock_free(struct cfs_percpt_lock *pcl) +{ + LASSERT(pcl->pcl_locks != NULL); + LASSERT(!pcl->pcl_locked); + + cfs_percpt_free(pcl->pcl_locks); + LIBCFS_FREE(pcl, sizeof(*pcl)); +} +EXPORT_SYMBOL(cfs_percpt_lock_free); + +/** + * create cpu-partition lock, see libcfs_private.h for more detail. + * + * cpu-partition lock is designed for large-scale SMP system, so we need to + * reduce cacheline conflict as possible as we can, that's the + * reason we always allocate cacheline-aligned memory block. + */ +struct cfs_percpt_lock * +cfs_percpt_lock_create(struct cfs_cpt_table *cptab, + struct lock_class_key *keys) +{ + struct cfs_percpt_lock *pcl; + spinlock_t *lock; + int i; + + /* NB: cptab can be NULL, pcl will be for HW CPUs on that case */ + LIBCFS_ALLOC(pcl, sizeof(*pcl)); + if (pcl == NULL) + return NULL; + + pcl->pcl_cptab = cptab; + pcl->pcl_locks = cfs_percpt_alloc(cptab, sizeof(*lock)); + if (pcl->pcl_locks == NULL) { + LIBCFS_FREE(pcl, sizeof(*pcl)); + return NULL; + } + + if (keys == NULL) { + CWARN("Cannot setup class key for percpt lock, you may see " + "recursive locking warnings which are actually fake.\n"); + } + + cfs_percpt_for_each(lock, i, pcl->pcl_locks) { + spin_lock_init(lock); + if (keys != NULL) + lockdep_set_class(lock, &keys[i]); + } + + return pcl; +} +EXPORT_SYMBOL(cfs_percpt_lock_create); + +/** + * lock a CPU partition + * + * \a index != CFS_PERCPT_LOCK_EX + * hold private lock indexed by \a index + * + * \a index == CFS_PERCPT_LOCK_EX + * exclusively lock @pcl and nobody can take private lock + */ +void +cfs_percpt_lock(struct cfs_percpt_lock *pcl, int index) +__acquires(pcl->pcl_locks) +{ + int ncpt = cfs_cpt_number(pcl->pcl_cptab); + int i; + + LASSERT(index >= CFS_PERCPT_LOCK_EX && index < ncpt); + + if (ncpt == 1) { + index = 0; + } else { /* serialize with exclusive lock */ + while (pcl->pcl_locked) + cpu_relax(); + } + + if (likely(index != CFS_PERCPT_LOCK_EX)) { + spin_lock(pcl->pcl_locks[index]); + return; + } + + /* exclusive lock request */ + for (i = 0; i < ncpt; i++) { + spin_lock(pcl->pcl_locks[i]); + if (i == 0) { + LASSERT(!pcl->pcl_locked); + /* nobody should take private lock after this + * so I wouldn't starve for too long time */ + pcl->pcl_locked = 1; + } + } +} +EXPORT_SYMBOL(cfs_percpt_lock); + +/** unlock a CPU partition */ +void +cfs_percpt_unlock(struct cfs_percpt_lock *pcl, int index) 
+__releases(pcl->pcl_locks) +{ + int ncpt = cfs_cpt_number(pcl->pcl_cptab); + int i; + + index = ncpt == 1 ? 0 : index; + + if (likely(index != CFS_PERCPT_LOCK_EX)) { + spin_unlock(pcl->pcl_locks[index]); + return; + } + + for (i = ncpt - 1; i >= 0; i--) { + if (i == 0) { + LASSERT(pcl->pcl_locked); + pcl->pcl_locked = 0; + } + spin_unlock(pcl->pcl_locks[i]); + } +} +EXPORT_SYMBOL(cfs_percpt_unlock); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_mem.c b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_mem.c new file mode 100644 index 0000000000000..d514b017b2eaa --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_mem.c @@ -0,0 +1,176 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Author: liang@whamcloud.com + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include + +struct cfs_var_array { + unsigned int va_count; /* # of buffers */ + unsigned int va_size; /* size of each var */ + struct cfs_cpt_table *va_cptab; /* cpu partition table */ + void *va_ptrs[0]; /* buffer addresses */ +}; + +/* + * free per-cpu data, see more detail in cfs_percpt_free + */ +void +cfs_percpt_free(void *vars) +{ + struct cfs_var_array *arr; + int i; + + arr = container_of(vars, struct cfs_var_array, va_ptrs[0]); + + for (i = 0; i < arr->va_count; i++) { + if (arr->va_ptrs[i] != NULL) + LIBCFS_FREE(arr->va_ptrs[i], arr->va_size); + } + + LIBCFS_FREE(arr, offsetof(struct cfs_var_array, + va_ptrs[arr->va_count])); +} +EXPORT_SYMBOL(cfs_percpt_free); + +/* + * allocate per cpu-partition variables, returned value is an array of pointers, + * variable can be indexed by CPU partition ID, i.e: + * + * arr = cfs_percpt_alloc(cfs_cpu_pt, size); + * then caller can access memory block for CPU 0 by arr[0], + * memory block for CPU 1 by arr[1]... + * memory block for CPU N by arr[N]... + * + * cacheline aligned. 
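+ *
+ * e.g. a per-partition counter (illustrative sketch):
+ *
+ *	atomic_t **cnts = cfs_percpt_alloc(cfs_cpt_tab, sizeof(atomic_t));
+ *
+ *	if (cnts != NULL)
+ *		atomic_inc(cnts[cfs_cpt_current(cfs_cpt_tab, 1)]);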
+ */ +void * +cfs_percpt_alloc(struct cfs_cpt_table *cptab, unsigned int size) +{ + struct cfs_var_array *arr; + int count; + int i; + + count = cfs_cpt_number(cptab); + + LIBCFS_ALLOC(arr, offsetof(struct cfs_var_array, va_ptrs[count])); + if (arr == NULL) + return NULL; + + arr->va_size = size = L1_CACHE_ALIGN(size); + arr->va_count = count; + arr->va_cptab = cptab; + + for (i = 0; i < count; i++) { + LIBCFS_CPT_ALLOC(arr->va_ptrs[i], cptab, i, size); + if (arr->va_ptrs[i] == NULL) { + cfs_percpt_free((void *)&arr->va_ptrs[0]); + return NULL; + } + } + + return (void *)&arr->va_ptrs[0]; +} +EXPORT_SYMBOL(cfs_percpt_alloc); + +/* + * return number of CPUs (or number of elements in per-cpu data) + * according to cptab of @vars + */ +int +cfs_percpt_number(void *vars) +{ + struct cfs_var_array *arr; + + arr = container_of(vars, struct cfs_var_array, va_ptrs[0]); + + return arr->va_count; +} +EXPORT_SYMBOL(cfs_percpt_number); + + +/* + * This is opencoding of vfree_atomic from Linux kernel added in 4.10 with + * minimum changes needed to work on older kernels too. + */ + +#ifndef llist_for_each_safe +#define llist_for_each_safe(pos, n, node) \ + for ((pos) = (node); (pos) && ((n) = (pos)->next, true); (pos) = (n)) +#endif + +struct vfree_deferred { + struct llist_head list; + struct work_struct wq; +}; +static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred); + +static void free_work(struct work_struct *w) +{ + struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); + struct llist_node *t, *llnode; + + llist_for_each_safe(llnode, t, llist_del_all(&p->list)) + vfree((void *)llnode); +} + +void libcfs_vfree_atomic(const void *addr) +{ + struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); + + if (!addr) + return; + + if (llist_add((struct llist_node *)addr, &p->list)) + schedule_work(&p->wq); +} +EXPORT_SYMBOL(libcfs_vfree_atomic); + +void __init init_libcfs_vfree_atomic(void) +{ + int i; + + for_each_possible_cpu(i) { + struct vfree_deferred *p; + + p = &per_cpu(vfree_deferred, i); + init_llist_head(&p->list); + INIT_WORK(&p->wq, free_work); + } +} + +void __exit exit_libcfs_vfree_atomic(void) +{ + flush_scheduled_work(); +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_string.c b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_string.c new file mode 100644 index 0000000000000..a3ff59c5970e6 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_string.c @@ -0,0 +1,561 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. 
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * String manipulation functions.
+ *
+ * libcfs/libcfs/libcfs_string.c
+ *
+ * Author: Nathan Rutman
+ */
+
+#include
+#include
+#include
+
+/* Convert a text string to a bitmask */
+int cfs_str2mask(const char *str, const char *(*bit2str)(int bit),
+		 int *oldmask, int minmask, int allmask, int defmask)
+{
+	const char *debugstr;
+	char op = 0;
+	int newmask = minmask, i, len, found = 0;
+
+	ENTRY;
+	/* <str> must be a list of tokens separated by whitespace or comma,
+	 * and optionally an operator ('+' or '-'). If an operator
+	 * appears first in <str>, '*oldmask' is used as the starting point
+	 * (relative), otherwise minmask is used (absolute). An operator
+	 * applies to all following tokens up to the next operator.
+	 */
+	while (*str != 0) {
+		while (isspace(*str) || *str == ',')
+			str++;
+		if (*str == 0)
+			break;
+		if (*str == '+' || *str == '-') {
+			op = *str++;
+			if (!found)
+				/* only if first token is relative */
+				newmask = *oldmask;
+			while (isspace(*str))
+				str++;
+			if (*str == 0)	/* trailing op */
+				return -EINVAL;
+		}
+
+		/* find token length */
+		for (len = 0; str[len] != 0 && !isspace(str[len]) &&
+		     str[len] != '+' && str[len] != '-' && str[len] != ',';
+		     len++);
+
+		/* match token */
+		found = 0;
+		for (i = 0; i < 32; i++) {
+			debugstr = bit2str(i);
+			if (debugstr != NULL &&
+			    strlen(debugstr) == len &&
+			    strncasecmp(str, debugstr, len) == 0) {
+				if (op == '-')
+					newmask &= ~BIT(i);
+				else
+					newmask |= BIT(i);
+				found = 1;
+				break;
+			}
+		}
+		if (!found && len == 3 &&
+		    (strncasecmp(str, "ALL", len) == 0)) {
+			if (op == '-')
+				newmask = minmask;
+			else
+				newmask = allmask;
+			found = 1;
+		}
+		if (!found && strcasecmp(str, "DEFAULT") == 0) {
+			if (op == '-')
+				newmask = (newmask & ~defmask) | minmask;
+			else if (op == '+')
+				newmask |= defmask;
+			else
+				newmask = defmask;
+			found = 1;
+		}
+		if (!found) {
+			CWARN("unknown mask '%.*s'.\n"
+			      "mask usage: [+|-]<mask> ...\n", len, str);
+			return -EINVAL;
+		}
+		str += len;
+	}
+
+	*oldmask = newmask;
+	return 0;
+}
+EXPORT_SYMBOL(cfs_str2mask);
+
+/**
+ * Extracts tokens from strings.
+ *
+ * Looks for \a delim in string \a next, sets \a res to point to
+ * substring before the delimiter, sets \a next right after the found
+ * delimiter.
+ *
+ * \retval 1 if \a res points to a string of non-whitespace characters
+ * \retval 0 otherwise
+ */
+int
+cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res)
+{
+	char *end;
+
+	if (next->ls_str == NULL)
+		return 0;
+
+	/* skip leading white spaces */
+	while (next->ls_len) {
+		if (!isspace(*next->ls_str))
+			break;
+		next->ls_str++;
+		next->ls_len--;
+	}
+
+	if (next->ls_len == 0) /* whitespaces only */
+		return 0;
+
+	if (*next->ls_str == delim) {
+		/* first non-whitespace character is the delimiter */
+		return 0;
+	}
+
+	res->ls_str = next->ls_str;
+	end = memchr(next->ls_str, delim, next->ls_len);
+	if (end == NULL) {
+		/* there is no delimiter in the string */
+		end = next->ls_str + next->ls_len;
+		next->ls_str = NULL;
+		next->ls_len = 0;
+	} else {
+		next->ls_str = end + 1;
+		next->ls_len -= (end - res->ls_str + 1);
+	}
+
+	/* skip ending whitespaces */
+	while (--end != res->ls_str) {
+		if (!isspace(*end))
+			break;
+	}
+
+	res->ls_len = end - res->ls_str + 1;
+	return 1;
+}
+EXPORT_SYMBOL(cfs_gettok);
+
+/**
+ * Converts string to integer.
+ *
+ * Accepts decimal and hexadecimal number notations.
+ *
+ * \retval 1 if first \a nob chars of \a str convert to decimal or
+ * hexadecimal integer in the range [\a min, \a max]
+ * \retval 0 otherwise
+ */
+int
+cfs_str2num_check(char *str, int nob, unsigned *num,
+		  unsigned min, unsigned max)
+{
+	bool all_numbers = true;
+	char *endp, cache;
+	int len;
+	int rc;
+
+	endp = strim(str);
+	/*
+	 * kstrtouint can only handle strings composed
+	 * of only numbers. We need to scan the string
+	 * passed in for the first non-digit character
+	 * and end the string at that location. If we
+	 * don't find any non-digit character we still
+	 * need to place a '\0' at position len since
+	 * we are not interested in the rest of the
+	 * string which is longer than len in size.
+	 * After we are done the character at the
+	 * position we placed '\0' must be restored.
+	 */
+	len = min((int)strlen(endp), nob);
+	for (; endp < str + len; endp++) {
+		if (!isxdigit(*endp) && *endp != '-' &&
+		    *endp != '+') {
+			all_numbers = false;
+			break;
+		}
+	}
+
+	/* Eat trailing space */
+	if (!all_numbers && isspace(*endp)) {
+		all_numbers = true;
+		endp--;
+	}
+
+	cache = *endp;
+	*endp = '\0';
+
+	rc = kstrtouint(str, 0, num);
+	*endp = cache;
+	if (rc || !all_numbers)
+		return 0;
+
+	return (*num >= min && *num <= max);
+}
+EXPORT_SYMBOL(cfs_str2num_check);
+
+/**
+ * Parses <expr> token of the syntax. If \a bracketed is false,
+ * \a src should only have a single token which can be <number> or '*'
+ *
+ * \retval 0 if \a src parses to
+ *	<number> |
+ *	<number> '-' <number> |
+ *	<number> '-' <number> '/' <number>
+ * in which case \a expr is set to an allocated range_expr with
+ * range_expr::re_lo, range_expr::re_hi and range_expr::re_stride initialized
+ * \retval -EINVAL or -ENOMEM otherwise
+ */
+static int
+cfs_range_expr_parse(struct cfs_lstr *src, unsigned min, unsigned max,
+		     int bracketed, struct cfs_range_expr **expr)
+{
+	struct cfs_range_expr *re;
+	struct cfs_lstr tok;
+
+	LIBCFS_ALLOC(re, sizeof(*re));
+	if (re == NULL)
+		return -ENOMEM;
+
+	if (src->ls_len == 1 && src->ls_str[0] == '*') {
+		re->re_lo = min;
+		re->re_hi = max;
+		re->re_stride = 1;
+		goto out;
+	}
+
+	if (cfs_str2num_check(src->ls_str, src->ls_len,
+			      &re->re_lo, min, max)) {
+		/* <number> is parsed */
+		re->re_hi = re->re_lo;
+		re->re_stride = 1;
+		goto out;
+	}
+
+	if (!bracketed || !cfs_gettok(src, '-', &tok))
+		goto failed;
+
+	if (!cfs_str2num_check(tok.ls_str, tok.ls_len,
+			       &re->re_lo, min, max))
+		goto failed;
+
+	/* <number> - <number> */
+	if (cfs_str2num_check(src->ls_str, src->ls_len,
+			      &re->re_hi, min, max)) {
+		/* <number> - <number> is parsed */
+		re->re_stride = 1;
+		goto out;
+	}
+
+	/* go to check <number> '-' <number> '/' <number> */
+	if (cfs_gettok(src, '/', &tok)) {
+		if (!cfs_str2num_check(tok.ls_str, tok.ls_len,
+				       &re->re_hi, min, max))
+			goto failed;
+
+		/* <number> - <number> / ... */
+		if (cfs_str2num_check(src->ls_str, src->ls_len,
+				      &re->re_stride, min, max)) {
+			/* <number> - <number> / <number> is parsed */
+			goto out;
+		}
+	}
+
+out:
+	*expr = re;
+	return 0;
+
+failed:
+	LIBCFS_FREE(re, sizeof(*re));
+	return -EINVAL;
+}
+
+/**
+ * Print the range expression \a re into specified \a buffer.
+ * If \a bracketed is true, the expression does not need additional
+ * brackets.
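+ *
+ * For example, an expression with re_lo = 0, re_hi = 7 and re_stride = 2
+ * prints as "[0-7/2]" (or "0-7/2" when \a bracketed is true), while a
+ * single value with re_lo == re_hi prints as a plain decimal number.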
+ *
+ * \retval number of characters written
+ */
+static int
+cfs_range_expr_print(char *buffer, int count, struct cfs_range_expr *expr,
+		     bool bracketed)
+{
+	int i;
+	char s[] = "[";
+	char e[] = "]";
+
+	if (bracketed)
+		s[0] = e[0] = '\0';
+
+	if (expr->re_lo == expr->re_hi)
+		i = scnprintf(buffer, count, "%u", expr->re_lo);
+	else if (expr->re_stride == 1)
+		i = scnprintf(buffer, count, "%s%u-%u%s",
+			      s, expr->re_lo, expr->re_hi, e);
+	else
+		i = scnprintf(buffer, count, "%s%u-%u/%u%s",
+			      s, expr->re_lo, expr->re_hi,
+			      expr->re_stride, e);
+	return i;
+}
+
+/**
+ * Print a list of range expressions (\a expr_list) into specified \a buffer.
+ * If the list contains several expressions, separate them with comma
+ * and surround the list with brackets.
+ *
+ * \retval number of characters written
+ */
+int
+cfs_expr_list_print(char *buffer, int count, struct cfs_expr_list *expr_list)
+{
+	struct cfs_range_expr *expr;
+	int i = 0, j = 0;
+	int numexprs = 0;
+
+	if (count <= 0)
+		return 0;
+
+	list_for_each_entry(expr, &expr_list->el_exprs, re_link)
+		numexprs++;
+
+	if (numexprs > 1)
+		i += scnprintf(buffer + i, count - i, "[");
+
+	list_for_each_entry(expr, &expr_list->el_exprs, re_link) {
+		if (j++ != 0)
+			i += scnprintf(buffer + i, count - i, ",");
+		i += cfs_range_expr_print(buffer + i, count - i, expr,
+					  numexprs > 1);
+	}
+
+	if (numexprs > 1)
+		i += scnprintf(buffer + i, count - i, "]");
+
+	return i;
+}
+EXPORT_SYMBOL(cfs_expr_list_print);
+
+/**
+ * Matches value (\a value) against range expression list \a expr_list.
+ *
+ * \retval 1 if \a value matches
+ * \retval 0 otherwise
+ */
+int
+cfs_expr_list_match(__u32 value, struct cfs_expr_list *expr_list)
+{
+	struct cfs_range_expr *expr;
+
+	list_for_each_entry(expr, &expr_list->el_exprs, re_link) {
+		if (value >= expr->re_lo && value <= expr->re_hi &&
+		    ((value - expr->re_lo) % expr->re_stride) == 0)
+			return 1;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(cfs_expr_list_match);
+
+/**
+ * Convert expression list (\a expr_list) to an array of all matched values
+ *
+ * \retval N N is total number of all matched values
+ * \retval 0 if expression list is empty
+ * \retval < 0 for failure
+ */
+int
+cfs_expr_list_values(struct cfs_expr_list *expr_list, int max, __u32 **valpp)
+{
+	struct cfs_range_expr *expr;
+	__u32 *val;
+	int count = 0;
+	int i;
+
+	list_for_each_entry(expr, &expr_list->el_exprs, re_link) {
+		for (i = expr->re_lo; i <= expr->re_hi; i++) {
+			if (((i - expr->re_lo) % expr->re_stride) == 0)
+				count++;
+		}
+	}
+
+	if (count == 0) /* empty expression list */
+		return 0;
+
+	if (count > max) {
+		CERROR("Number of values %d exceeds max allowed %d\n",
+		       count, max);
+		return -EINVAL;
+	}
+
+	CFS_ALLOC_PTR_ARRAY(val, count);
+	if (val == NULL)
+		return -ENOMEM;
+
+	count = 0;
+	list_for_each_entry(expr, &expr_list->el_exprs, re_link) {
+		for (i = expr->re_lo; i <= expr->re_hi; i++) {
+			if (((i - expr->re_lo) % expr->re_stride) == 0)
+				val[count++] = i;
+		}
+	}
+
+	*valpp = val;
+	return count;
+}
+EXPORT_SYMBOL(cfs_expr_list_values);
+
+void
+cfs_expr_list_values_free(__u32 *values, int num)
+{
+	/* This array is allocated by LIBCFS_ALLOC(), so it shouldn't be freed
+	 * by OBD_FREE() if it's called by a module other than libcfs & LNet;
+	 * otherwise we will see a fake memory leak */
+	CFS_FREE_PTR_ARRAY(values, num);
+}
+EXPORT_SYMBOL(cfs_expr_list_values_free);
+
+/**
+ * Frees cfs_range_expr structures of \a expr_list.
+ *
+ * \retval none
+ */
+void
+cfs_expr_list_free(struct cfs_expr_list *expr_list)
+{
+	while (!list_empty(&expr_list->el_exprs)) {
+		struct cfs_range_expr *expr;
+
+		expr = list_entry(expr_list->el_exprs.next,
+				  struct cfs_range_expr, re_link);
+		list_del(&expr->re_link);
+		LIBCFS_FREE(expr, sizeof(*expr));
+	}
+
+	LIBCFS_FREE(expr_list, sizeof(*expr_list));
+}
+EXPORT_SYMBOL(cfs_expr_list_free);
+
+/**
+ * Parses <expr_list> token of the syntax.
+ *
+ * \retval 0 if \a str parses to <range_expr> | '[' <range_expr_list> ']'
+ * \retval -errno otherwise
+ */
+int
+cfs_expr_list_parse(char *str, int len, unsigned min, unsigned max,
+		    struct cfs_expr_list **elpp)
+{
+	struct cfs_expr_list *expr_list;
+	struct cfs_range_expr *expr;
+	struct cfs_lstr src;
+	int rc;
+
+	LIBCFS_ALLOC(expr_list, sizeof(*expr_list));
+	if (expr_list == NULL)
+		return -ENOMEM;
+
+	src.ls_str = str;
+	src.ls_len = len;
+
+	INIT_LIST_HEAD(&expr_list->el_exprs);
+
+	if (src.ls_str[0] == '[' &&
+	    src.ls_str[src.ls_len - 1] == ']') {
+		src.ls_str++;
+		src.ls_len -= 2;
+
+		rc = -EINVAL;
+		while (src.ls_str != NULL) {
+			struct cfs_lstr tok;
+
+			if (!cfs_gettok(&src, ',', &tok)) {
+				rc = -EINVAL;
+				break;
+			}
+
+			rc = cfs_range_expr_parse(&tok, min, max, 1, &expr);
+			if (rc != 0)
+				break;
+
+			list_add_tail(&expr->re_link, &expr_list->el_exprs);
+		}
+	} else {
+		rc = cfs_range_expr_parse(&src, min, max, 0, &expr);
+		if (rc == 0)
+			list_add_tail(&expr->re_link, &expr_list->el_exprs);
+	}
+
+	if (rc != 0)
+		cfs_expr_list_free(expr_list);
+	else
+		*elpp = expr_list;
+
+	return rc;
+}
+EXPORT_SYMBOL(cfs_expr_list_parse);
+
+/**
+ * Frees cfs_expr_list structures of \a list.
+ *
+ * For each struct cfs_expr_list structure found on \a list it frees
+ * range_expr list attached to it and frees the cfs_expr_list itself.
+ *
+ * \retval none
+ */
+void
+cfs_expr_list_free_list(struct list_head *list)
+{
+	struct cfs_expr_list *el;
+
+	while (!list_empty(list)) {
+		el = list_entry(list->next,
+				struct cfs_expr_list, el_link);
+		list_del(&el->el_link);
+		cfs_expr_list_free(el);
+	}
+}
+EXPORT_SYMBOL(cfs_expr_list_free_list);
diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux-crypto-adler.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux-crypto-adler.c
new file mode 100644
index 0000000000000..6f19bcad2dc33
--- /dev/null
+++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux-crypto-adler.c
@@ -0,0 +1,137 @@
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ */
+
+/*
+ * This is a crypto API shash wrapper around zlib_adler32.
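+ *
+ * Adler-32 (RFC 1950) maintains two 16-bit sums modulo 65521 and the
+ * digest is (s2 << 16) | s1, which is why both the key context and the
+ * digest below fit in a plain u32.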
+ */ + +#include +#include +#include +#include "linux-crypto.h" + +#define CHKSUM_BLOCK_SIZE 1 +#define CHKSUM_DIGEST_SIZE 4 + +static int adler32_cra_init(struct crypto_tfm *tfm) +{ + u32 *key = crypto_tfm_ctx(tfm); + + *key = 1; + + return 0; +} + +static int adler32_setkey(struct crypto_shash *hash, const u8 *key, + unsigned int keylen) +{ + u32 *mctx = crypto_shash_ctx(hash); + + if (keylen != sizeof(u32)) + return -EINVAL; + + *mctx = *(u32 *)key; + return 0; +} + +static int adler32_init(struct shash_desc *desc) +{ + u32 *mctx = crypto_shash_ctx(desc->tfm); + u32 *cksump = shash_desc_ctx(desc); + + *cksump = *mctx; + + return 0; +} + +static int adler32_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + u32 *cksump = shash_desc_ctx(desc); + + *cksump = zlib_adler32(*cksump, data, len); + return 0; +} +static int __adler32_finup(u32 *cksump, const u8 *data, unsigned int len, + u8 *out) +{ + *(u32 *)out = zlib_adler32(*cksump, data, len); + return 0; +} + +static int adler32_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __adler32_finup(shash_desc_ctx(desc), data, len, out); +} + +static int adler32_final(struct shash_desc *desc, u8 *out) +{ + u32 *cksump = shash_desc_ctx(desc); + + *(u32 *)out = *cksump; + return 0; +} + +static int adler32_digest(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __adler32_finup(crypto_shash_ctx(desc->tfm), data, len, + out); +} +static struct shash_alg alg = { + .setkey = adler32_setkey, + .init = adler32_init, + .update = adler32_update, + .final = adler32_final, + .finup = adler32_finup, + .digest = adler32_digest, + .descsize = sizeof(u32), + .digestsize = CHKSUM_DIGEST_SIZE, + .base = { + .cra_name = "adler32", + .cra_driver_name = "adler32-zlib", + .cra_priority = 100, +#ifdef CRYPTO_ALG_OPTIONAL_KEY + .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, +#endif + .cra_blocksize = CHKSUM_BLOCK_SIZE, + .cra_ctxsize = sizeof(u32), + .cra_module = NULL, + .cra_init = adler32_cra_init, + } +}; + +int cfs_crypto_adler32_register(void) +{ + return crypto_register_shash(&alg); +} + +void cfs_crypto_adler32_unregister(void) +{ + crypto_unregister_shash(&alg); +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux-crypto.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux-crypto.c new file mode 100644 index 0000000000000..e210b8076445e --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux-crypto.c @@ -0,0 +1,487 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + * + * Copyright (c) 2012, 2014, Intel Corporation. 
+ */ + +#include +#include +#include +#include +#include +#include "linux-crypto.h" + +#ifndef HAVE_CRYPTO_HASH_HELPERS +static inline const char *crypto_ahash_alg_name(struct crypto_ahash *tfm) +{ + return crypto_tfm_alg_name(crypto_ahash_tfm(tfm)); +} + +static inline const char *crypto_ahash_driver_name(struct crypto_ahash *tfm) +{ + return crypto_tfm_alg_driver_name(crypto_ahash_tfm(tfm)); +} +#endif + +/** + * Array of hash algorithm speed in MByte per second + */ +int cfs_crypto_hash_speeds[CFS_HASH_ALG_MAX]; +EXPORT_SYMBOL(cfs_crypto_hash_speeds); + +/** + * Initialize the state descriptor for the specified hash algorithm. + * + * An internal routine to allocate the hash-specific state in \a hdesc for + * use with cfs_crypto_hash_digest() to compute the hash of a single message, + * though possibly in multiple chunks. The descriptor internal state should + * be freed with cfs_crypto_hash_final(). + * + * \param[in] hash_alg hash algorithm id (CFS_HASH_ALG_*) + * \param[out] type pointer to the hash description in hash_types[] array + * \param[in,out] req ahash request to be initialized + * \param[in] key initial hash value/state, NULL to use default value + * \param[in] key_len length of \a key + * + * \retval 0 on success + * \retval negative errno on failure + */ +static int cfs_crypto_hash_alloc(enum cfs_crypto_hash_alg hash_alg, + const struct cfs_crypto_hash_type **type, + struct ahash_request **req, + unsigned char *key, + unsigned int key_len) +{ + struct crypto_ahash *tfm; + int err = 0; + + *type = cfs_crypto_hash_type(hash_alg); + if (!*type) { + CWARN("Unsupported hash algorithm id = %d, max id is %d\n", + hash_alg, CFS_HASH_ALG_MAX); + return -EINVAL; + } + + /* Keys are only supported for the hmac version */ + if (key && key_len > 0) { + char *algo_name; + + algo_name = kasprintf(GFP_KERNEL, "hmac(%s)", + (*type)->cht_name); + if (!algo_name) + return -ENOMEM; + + tfm = crypto_alloc_ahash(algo_name, 0, CRYPTO_ALG_ASYNC); + kfree(algo_name); + } else { + tfm = crypto_alloc_ahash((*type)->cht_name, 0, + CRYPTO_ALG_ASYNC); + } + if (IS_ERR(tfm)) { + CDEBUG_LIMIT(PTR_ERR(tfm) == -ENOMEM ? D_ERROR : D_INFO, + "Failed to alloc crypto hash %s: rc = %d\n", + (*type)->cht_name, (int)PTR_ERR(tfm)); + return PTR_ERR(tfm); + } + + *req = ahash_request_alloc(tfm, GFP_KERNEL); + if (!*req) { + CDEBUG(D_INFO, "Failed to alloc ahash_request for %s\n", + (*type)->cht_name); + GOTO(out_free_tfm, err = -ENOMEM); + } + + ahash_request_set_callback(*req, 0, NULL, NULL); + + if (key) + err = crypto_ahash_setkey(tfm, key, key_len); + else if ((*type)->cht_key != 0) + err = crypto_ahash_setkey(tfm, + (unsigned char *)&((*type)->cht_key), + (*type)->cht_size); + if (err) + GOTO(out_free_req, err); + + CDEBUG(D_INFO, "Using crypto hash: %s (%s) speed %d MB/s\n", + crypto_ahash_alg_name(tfm), crypto_ahash_driver_name(tfm), + cfs_crypto_hash_speeds[hash_alg]); + + err = crypto_ahash_init(*req); + if (err) { +out_free_req: + ahash_request_free(*req); +out_free_tfm: + crypto_free_ahash(tfm); + } + return err; +} + +/** + * Calculate hash digest for the passed buffer. + * + * This should be used when computing the hash on a single contiguous buffer. + * It combines the hash initialization, computation, and cleanup. 
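+ *
+ * A minimal one-shot call might look like the sketch below, where "data"
+ * and "size" stand in for the caller's buffer (any CFS_HASH_ALG_* id can
+ * be passed as the algorithm):
+ *
+ *	unsigned char digest[CFS_CRYPTO_HASH_DIGESTSIZE_MAX];
+ *	unsigned int dlen = sizeof(digest);
+ *	int rc = cfs_crypto_hash_digest(hash_alg, data, size,
+ *					NULL, 0, digest, &dlen);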
+ *
+ * \param[in] hash_alg id of hash algorithm (CFS_HASH_ALG_*)
+ * \param[in] buf data buffer on which to compute hash
+ * \param[in] buf_len length of \a buf in bytes
+ * \param[in] key initial value/state for algorithm, if \a key = NULL
+ * use default initial value
+ * \param[in] key_len length of \a key in bytes
+ * \param[out] hash pointer to computed hash value, if \a hash = NULL then
+ * \a hash_len is set to the digest size in bytes and -ENOSPC is
+ * returned
+ * \param[in,out] hash_len size of \a hash buffer
+ *
+ * \retval -EINVAL \a buf, \a buf_len, \a hash_len, \a hash_alg invalid
+ * \retval -ENOENT \a hash_alg is unsupported
+ * \retval -ENOSPC \a hash is NULL, or \a hash_len less than digest size
+ * \retval 0 for success
+ * \retval negative errno for other errors from lower layers.
+ */
+int cfs_crypto_hash_digest(enum cfs_crypto_hash_alg hash_alg,
+			   const void *buf, unsigned int buf_len,
+			   unsigned char *key, unsigned int key_len,
+			   unsigned char *hash, unsigned int *hash_len)
+{
+	struct scatterlist sl;
+	struct ahash_request *req;
+	int err;
+	const struct cfs_crypto_hash_type *type;
+
+	if (!buf || buf_len == 0 || !hash_len)
+		return -EINVAL;
+
+	err = cfs_crypto_hash_alloc(hash_alg, &type, &req, key, key_len);
+	if (err != 0)
+		return err;
+
+	if (!hash || *hash_len < type->cht_size) {
+		*hash_len = type->cht_size;
+		crypto_free_ahash(crypto_ahash_reqtfm(req));
+		ahash_request_free(req);
+		return -ENOSPC;
+	}
+	sg_init_one(&sl, (void *)buf, buf_len);
+
+	ahash_request_set_crypt(req, &sl, hash, sl.length);
+	err = crypto_ahash_digest(req);
+	crypto_free_ahash(crypto_ahash_reqtfm(req));
+	ahash_request_free(req);
+
+	return err;
+}
+EXPORT_SYMBOL(cfs_crypto_hash_digest);
+
+/**
+ * Allocate and initialize a descriptor for the hash algorithm.
+ *
+ * This should be used to initialize a hash descriptor for multiple calls
+ * to a single hash function when computing the hash across multiple
+ * separate buffers or pages using cfs_crypto_hash_update{,_page}().
+ *
+ * The hash descriptor should be freed with cfs_crypto_hash_final().
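+ *
+ * A typical multi-buffer sequence is sketched below (error handling
+ * omitted):
+ *
+ *	req = cfs_crypto_hash_init(hash_alg, NULL, 0);
+ *	cfs_crypto_hash_update(req, buf1, len1);
+ *	cfs_crypto_hash_update_page(req, page, 0, PAGE_SIZE);
+ *	rc = cfs_crypto_hash_final(req, hash, &hash_len);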
+ * + * \param[in] hash_alg algorithm id (CFS_HASH_ALG_*) + * \param[in] key initial value/state for algorithm, if \a key = NULL + * use default initial value + * \param[in] key_len length of \a key in bytes + * + * \retval pointer to ahash request + * \retval ERR_PTR(errno) in case of error + */ +struct ahash_request * + cfs_crypto_hash_init(enum cfs_crypto_hash_alg hash_alg, + unsigned char *key, unsigned int key_len) +{ + struct ahash_request *req; + int err; + const struct cfs_crypto_hash_type *type; + + err = cfs_crypto_hash_alloc(hash_alg, &type, &req, key, key_len); + if (err) + return ERR_PTR(err); + return req; +} +EXPORT_SYMBOL(cfs_crypto_hash_init); + +/** + * Update hash digest computed on data within the given \a page + * + * \param[in] req ahash request + * \param[in] page data page on which to compute the hash + * \param[in] offset offset within \a page at which to start hash + * \param[in] len length of data on which to compute hash + * + * \retval 0 for success + * \retval negative errno on failure + */ +int cfs_crypto_hash_update_page(struct ahash_request *req, + struct page *page, unsigned int offset, + unsigned int len) +{ + struct scatterlist sl; + + sg_init_table(&sl, 1); + sg_set_page(&sl, page, len, offset & ~PAGE_MASK); + + ahash_request_set_crypt(req, &sl, NULL, sl.length); + return crypto_ahash_update(req); +} +EXPORT_SYMBOL(cfs_crypto_hash_update_page); + +/** + * Update hash digest computed on the specified data + * + * \param[in] req ahash request + * \param[in] buf data buffer on which to compute the hash + * \param[in] buf_len length of \buf on which to compute hash + * + * \retval 0 for success + * \retval negative errno on failure + */ +int cfs_crypto_hash_update(struct ahash_request *req, + const void *buf, unsigned int buf_len) +{ + struct scatterlist sl; + + sg_init_one(&sl, (void *)buf, buf_len); + + ahash_request_set_crypt(req, &sl, NULL, sl.length); + return crypto_ahash_update(req); +} +EXPORT_SYMBOL(cfs_crypto_hash_update); + +/** + * Finish hash calculation, copy hash digest to buffer, clean up hash descriptor + * + * \param[in] req ahash request + * \param[out] hash pointer to hash buffer to store hash digest + * \param[in,out] hash_len pointer to hash buffer size, if \a hash == NULL + * or hash_len == NULL only free \a hdesc instead + * of computing the hash + * + * \retval 0 for success + * \retval -EOVERFLOW if hash_len is too small for the hash digest + * \retval negative errno for other errors from lower layers + */ +int cfs_crypto_hash_final(struct ahash_request *req, + unsigned char *hash, unsigned int *hash_len) +{ + int size = crypto_ahash_digestsize(crypto_ahash_reqtfm(req)); + int err; + + if (!hash || !hash_len) { + err = 0; + goto free; + } + if (*hash_len < size) { + err = -EOVERFLOW; + goto free; + } + + ahash_request_set_crypt(req, NULL, hash, 0); + err = crypto_ahash_final(req); + if (err == 0) + *hash_len = size; +free: + crypto_free_ahash(crypto_ahash_reqtfm(req)); + ahash_request_free(req); + + return err; +} +EXPORT_SYMBOL(cfs_crypto_hash_final); + +/** + * Compute the speed of specified hash function + * + * Run a speed test on the given hash algorithm on buffer using a 1MB buffer + * size. This is a reasonable buffer size for Lustre RPCs, even if the actual + * RPC size is larger or smaller. + * + * The speed is stored internally in the cfs_crypto_hash_speeds[] array, and + * is available through the cfs_crypto_hash_speed() function. 
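+ *
+ * The stored figure is simply the number of bytes hashed (bcount *
+ * buf_len) divided by the elapsed wall-clock time, scaled to MB/s.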
+ * + * This function needs to stay the same as obd_t10_performance_test() so that + * the speeds are comparable. + * + * \param[in] hash_alg hash algorithm id (CFS_HASH_ALG_*) + * \param[in] buf data buffer on which to compute the hash + * \param[in] buf_len length of \buf on which to compute hash + */ +static void cfs_crypto_performance_test(enum cfs_crypto_hash_alg hash_alg) +{ + int buf_len = max(PAGE_SIZE, 1048576UL); + void *buf; + unsigned long start, end; + int err = 0; + unsigned long bcount; + struct page *page; + unsigned char hash[CFS_CRYPTO_HASH_DIGESTSIZE_MAX]; + unsigned int hash_len = sizeof(hash); + + page = alloc_page(GFP_KERNEL); + if (page == NULL) { + err = -ENOMEM; + goto out_err; + } + + buf = kmap(page); + memset(buf, 0xAD, PAGE_SIZE); + kunmap(page); + + for (start = jiffies, end = start + cfs_time_seconds(1) / 4, + bcount = 0; time_before(jiffies, end) && err == 0; bcount++) { + struct ahash_request *req; + int i; + + req = cfs_crypto_hash_init(hash_alg, NULL, 0); + if (IS_ERR(req)) { + err = PTR_ERR(req); + break; + } + + for (i = 0; i < buf_len / PAGE_SIZE; i++) { + err = cfs_crypto_hash_update_page(req, page, 0, + PAGE_SIZE); + if (err != 0) + break; + } + + err = cfs_crypto_hash_final(req, hash, &hash_len); + if (err != 0) + break; + } + end = jiffies; + __free_page(page); +out_err: + if (err != 0) { + cfs_crypto_hash_speeds[hash_alg] = err; + CDEBUG(D_INFO, "Crypto hash algorithm %s test error: rc = %d\n", + cfs_crypto_hash_name(hash_alg), err); + } else { + unsigned long tmp; + + tmp = ((bcount * buf_len / jiffies_to_msecs(end - start)) * + 1000) / (1024 * 1024); + cfs_crypto_hash_speeds[hash_alg] = (int)tmp; + CDEBUG(D_CONFIG, "Crypto hash algorithm %s speed = %d MB/s\n", + cfs_crypto_hash_name(hash_alg), + cfs_crypto_hash_speeds[hash_alg]); + } +} + +/** + * hash speed in Mbytes per second for valid hash algorithm + * + * Return the performance of the specified \a hash_alg that was + * computed using cfs_crypto_performance_test(). If the performance + * has not yet been computed, do that when it is first requested. + * That avoids computing the speed when it is not actually needed. + * To avoid competing threads computing the checksum speed at the + * same time, only compute a single checksum speed at one time. + * + * \param[in] hash_alg hash algorithm id (CFS_HASH_ALG_*) + * + * \retval positive speed of the hash function in MB/s + * \retval -ENOENT if \a hash_alg is unsupported + * \retval negative errno if \a hash_alg speed is unavailable + */ +int cfs_crypto_hash_speed(enum cfs_crypto_hash_alg hash_alg) +{ + if (hash_alg < CFS_HASH_ALG_MAX) { + if (unlikely(cfs_crypto_hash_speeds[hash_alg] == 0)) { + static DEFINE_MUTEX(crypto_hash_speed_mutex); + + mutex_lock(&crypto_hash_speed_mutex); + if (cfs_crypto_hash_speeds[hash_alg] == 0) + cfs_crypto_performance_test(hash_alg); + mutex_unlock(&crypto_hash_speed_mutex); + } + return cfs_crypto_hash_speeds[hash_alg]; + } + + return -ENOENT; +} +EXPORT_SYMBOL(cfs_crypto_hash_speed); + +/** + * Run the performance test for all hash algorithms. + * + * Run the cfs_crypto_performance_test() benchmark for some of the available + * hash functions at module load time. This can't be reliably done at runtime + * since the CPUs may be under load from thousands of connecting clients when + * the first client connects and the checksum speeds are needed. 
+ * + * Since the setup cost and computation speed of various hash algorithms is + * a function of the buffer size (and possibly internal contention of offload + * engines), this speed only represents an estimate of the actual speed under + * actual usage, but is reasonable for comparing available algorithms. + * + * The actual speeds are available via cfs_crypto_hash_speed() for later + * comparison. + * + * \retval 0 on success + * \retval -ENOMEM if no memory is available for test buffer + */ +static int cfs_crypto_test_hashes(void) +{ + enum cfs_crypto_hash_alg hash_alg; + + for (hash_alg = 1; hash_alg < CFS_HASH_ALG_SPEED_MAX; hash_alg++) + cfs_crypto_performance_test(hash_alg); + + return 0; +} + +static int adler32; + +/** + * Register available hash functions + * + * \retval 0 + */ +int cfs_crypto_register(void) +{ + request_module("crc32c"); + + if (cfs_crypto_adler32_register() == 0) + adler32 = 1; + + /* check all algorithms and do performance test */ + cfs_crypto_test_hashes(); + + return 0; +} + +/** + * Unregister previously registered hash functions + */ +void cfs_crypto_unregister(void) +{ + if (adler32) + cfs_crypto_adler32_unregister(); + adler32 = 0; +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux-crypto.h b/drivers/staging/lustrefsx/libcfs/libcfs/linux-crypto.h new file mode 100644 index 0000000000000..05610dbf3362e --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux-crypto.h @@ -0,0 +1,37 @@ + /* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + */ + +/** + * Linux crypto hash specific functions. + */ + +/** + * Functions for start/stop shash adler32 algorithm. + */ +int cfs_crypto_adler32_register(void); +void cfs_crypto_adler32_unregister(void); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/glob.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/glob.c new file mode 100644 index 0000000000000..90192466e6614 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/glob.c @@ -0,0 +1,117 @@ +#ifndef HAVE_GLOB +#include +#include "libcfs/linux/glob.h" + +/** + * glob_match - Shell-style pattern matching, like !fnmatch(pat, str, 0) + * @pat: Shell-style pattern to match, e.g. "*.[ch]". + * @str: String to match. The pattern must match the entire string. + * + * Perform shell-style glob matching, returning true (1) if the match + * succeeds, or false (0) if it fails. Equivalent to !fnmatch(@pat, @str, 0). + * + * Pattern metacharacters are ?, *, [ and \. + * (And, inside character classes, !, - and ].) 
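+ *
+ * For example, glob_match("*.[ch]", "glob.c") returns true, while
+ * glob_match("*.[ch]", "glob.o") returns false.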
+ * + * This is small and simple implementation intended for device blacklists + * where a string is matched against a number of patterns. Thus, it + * does not preprocess the patterns. It is non-recursive, and run-time + * is at most quadratic: strlen(@str)*strlen(@pat). + * + * An example of the worst case is glob_match("*aaaaa", "aaaaaaaaaa"); + * it takes 6 passes over the pattern before matching the string. + * + * Like !fnmatch(@pat, @str, 0) and unlike the shell, this does NOT + * treat / or leading . specially; it isn't actually used for pathnames. + * + * Note that according to glob(7) (and unlike bash), character classes + * are complemented by a leading !; this does not support the regex-style + * [^a-z] syntax. + * + * An opening bracket without a matching close is matched literally. + */ +bool __pure glob_match(char const *pat, char const *str) +{ + /* + * Backtrack to previous * on mismatch and retry starting one + * character later in the string. Because * matches all characters + * (no exception for /), it can be easily proved that there's + * never a need to backtrack multiple levels. + */ + char const *back_pat = NULL, *back_str = back_str; + + /* + * Loop over each token (character or class) in pat, matching + * it against the remaining unmatched tail of str. Return false + * on mismatch, or true after matching the trailing nul bytes. + */ + for (;;) { + unsigned char c = *str++; + unsigned char d = *pat++; + + switch (d) { + case '?': /* Wildcard: anything but nul */ + if (c == '\0') + return false; + break; + case '*': /* Any-length wildcard */ + if (*pat == '\0') /* Optimize trailing * case */ + return true; + back_pat = pat; + back_str = --str; /* Allow zero-length match */ + break; + case '[': { /* Character class */ + bool match = false, inverted = (*pat == '!'); + char const *class = pat + inverted; + unsigned char a = *class++; + + /* + * Iterate over each span in the character class. + * A span is either a single character a, or a + * range a-b. The first span may begin with ']'. + */ + do { + unsigned char b = a; + + if (a == '\0') /* Malformed */ + goto literal; + + if (class[0] == '-' && class[1] != ']') { + b = class[1]; + + if (b == '\0') + goto literal; + + class += 2; + /* Any special action if a > b? */ + } + match |= (a <= c && c <= b); + } while ((a = *class++) != ']'); + + if (match == inverted) + goto backtrack; + pat = class; + } + break; + case '\\': + d = *pat++; + /*FALLTHROUGH*/ + default: /* Literal character */ +literal: + if (c == d) { + if (d == '\0') + return true; + break; + } +backtrack: + if (c == '\0' || !back_pat) + return false; /* No point continuing */ + /* Try again from last *, one character later in str. */ + pat = back_pat; + str = ++back_str; + break; + } + } +} +EXPORT_SYMBOL(glob_match); +#endif /* ! HAVE_GLOB */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-hash.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-hash.c new file mode 100644 index 0000000000000..e4e67c20cee5d --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-hash.c @@ -0,0 +1,57 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include
+#ifdef HAVE_STRINGHASH
+#include
+#else
+#include
+#endif
+#include
+
+#include
+
+/* Return the "hash_len" (hash and length) of a null-terminated string */
+/* The kernel equivalent is in fs/namei.c but for some strange reason
+ * RHEL7.5 stuck it in dax/super.c instead. This placement never existed
+ * upstream, so to make life easier we just carry the equivalent here.
+ */
+u64 cfs_hashlen_string(const void *salt, const char *name)
+{
+#ifdef HAVE_FULL_NAME_HASH_3ARGS
+	unsigned long hash = init_name_hash(salt);
+#else
+	unsigned long hash = init_name_hash();
+#endif
+	unsigned long len = 0, c;
+
+	c = (unsigned char)*name;
+	while (c) {
+		len++;
+		hash = partial_name_hash(c, hash);
+		c = (unsigned char)name[len];
+	}
+	return hashlen_create(end_name_hash(hash), len);
+}
+EXPORT_SYMBOL(cfs_hashlen_string);
diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-prim.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-prim.c
new file mode 100644
index 0000000000000..5f2f6aefb77bb
--- /dev/null
+++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-prim.c
@@ -0,0 +1,275 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2013, Intel Corporation.
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include +#include +#ifdef HAVE_SCHED_HEADERS +#include +#endif +#include +#include +#include + +#if defined(CONFIG_KGDB) +#include +#endif + +#include +#include +#include + +#ifndef HAVE_KTIME_GET_TS64 +void ktime_get_ts64(struct timespec64 *ts) +{ + struct timespec now; + + ktime_get_ts(&now); + *ts = timespec_to_timespec64(now); +} +EXPORT_SYMBOL(ktime_get_ts64); +#endif /* HAVE_KTIME_GET_TS64 */ + +#ifndef HAVE_KTIME_GET_REAL_TS64 +void ktime_get_real_ts64(struct timespec64 *ts) +{ + struct timespec now; + + getnstimeofday(&now); + *ts = timespec_to_timespec64(now); +} +EXPORT_SYMBOL(ktime_get_real_ts64); +#endif /* HAVE_KTIME_GET_REAL_TS64 */ + +#ifndef HAVE_KTIME_GET_REAL_SECONDS +/* + * Get the seconds portion of CLOCK_REALTIME (wall clock). + * This is the clock that can be altered by NTP and is + * independent of a reboot. + */ +time64_t ktime_get_real_seconds(void) +{ + return (time64_t)get_seconds(); +} +EXPORT_SYMBOL(ktime_get_real_seconds); +#endif /* HAVE_KTIME_GET_REAL_SECONDS */ + +#ifndef HAVE_KTIME_GET_SECONDS +/* + * Get the seconds portion of CLOCK_MONOTONIC + * This clock is immutable and is reset across + * reboots. For older platforms this is a + * wrapper around get_seconds which is valid + * until 2038. By that time this will be gone + * one would hope. + */ +time64_t ktime_get_seconds(void) +{ + struct timespec64 now; + + ktime_get_ts64(&now); + return now.tv_sec; +} +EXPORT_SYMBOL(ktime_get_seconds); +#endif /* HAVE_KTIME_GET_SECONDS */ + +static int (*cfs_apply_workqueue_attrs_t)(struct workqueue_struct *wq, + const struct workqueue_attrs *attrs); + +int cfs_apply_workqueue_attrs(struct workqueue_struct *wq, + const struct workqueue_attrs *attrs) +{ + if (cfs_apply_workqueue_attrs_t) + return cfs_apply_workqueue_attrs_t(wq, attrs); + return 0; +} +EXPORT_SYMBOL_GPL(cfs_apply_workqueue_attrs); + +#ifndef HAVE_XARRAY_SUPPORT +struct kmem_cache (*radix_tree_node_cachep); +#endif + +void __init cfs_arch_init(void) +{ +#ifndef HAVE_WAIT_VAR_EVENT + wait_bit_init(); +#endif + cfs_apply_workqueue_attrs_t = + (void *)cfs_kallsyms_lookup_name("apply_workqueue_attrs"); +#ifndef HAVE_XARRAY_SUPPORT + radix_tree_node_cachep = + (void *)cfs_kallsyms_lookup_name("radix_tree_node_cachep"); +#endif +} + +int cfs_kernel_write(struct file *filp, const void *buf, size_t count, + loff_t *pos) +{ +#ifdef HAVE_NEW_KERNEL_WRITE + return kernel_write(filp, buf, count, pos); +#else + mm_segment_t __old_fs = get_fs(); + int rc; + + set_fs(KERNEL_DS); + rc = vfs_write(filp, (__force const char __user *)buf, count, pos); + set_fs(__old_fs); + + return rc; +#endif +} +EXPORT_SYMBOL(cfs_kernel_write); + +ssize_t cfs_kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) +{ +#ifdef HAVE_KERNEL_READ_LAST_POSP + return kernel_read(file, buf, count, pos); +#else + ssize_t size = kernel_read(file, *pos, buf, count); + + if (size > 0) + *pos += size; + return size; +#endif +} +EXPORT_SYMBOL(cfs_kernel_read); + +#ifndef HAVE_KSET_FIND_OBJ +struct kobject *kset_find_obj(struct kset *kset, const char *name) +{ + struct kobject *ret = NULL; + struct kobject *k; + + spin_lock(&kset->list_lock); + + list_for_each_entry(k, &kset->list, entry) { + if (kobject_name(k) && !strcmp(kobject_name(k), name)) { + if (kref_get_unless_zero(&k->kref)) + ret = k; + break; + } + } + + spin_unlock(&kset->list_lock); + return ret; +} +EXPORT_SYMBOL_GPL(kset_find_obj); +#endif + 
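+/*
+ * A usage sketch for the kset_find_obj() fallback above ("example_kset"
+ * and "example-name" are placeholders): the kobject is returned with an
+ * extra reference taken via kref_get_unless_zero(), so the caller must
+ * drop it with kobject_put() when done:
+ *
+ *	struct kobject *kobj = kset_find_obj(example_kset, "example-name");
+ *	if (kobj) {
+ *		... use kobj ...
+ *		kobject_put(kobj);
+ *	}
+ */
+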
+#ifndef HAVE_MATCH_WILDCARD
+/**
+ * match_wildcard: - parse if a string matches given wildcard pattern
+ * @pattern: wildcard pattern
+ * @str: the string to be parsed
+ *
+ * Description: Parse the string @str to check if it matches the wildcard
+ * pattern @pattern. The pattern may contain two types of wildcards:
+ *   '*' - matches zero or more characters
+ *   '?' - matches one character
+ * If it's matched, return true, else return false.
+ */
+bool match_wildcard(const char *pattern, const char *str)
+{
+	const char *s = str;
+	const char *p = pattern;
+	bool star = false;
+
+	while (*s) {
+		switch (*p) {
+		case '?':
+			s++;
+			p++;
+			break;
+		case '*':
+			star = true;
+			str = s;
+			if (!*++p)
+				return true;
+			pattern = p;
+			break;
+		default:
+			if (*s == *p) {
+				s++;
+				p++;
+			} else {
+				if (!star)
+					return false;
+				str++;
+				s = str;
+				p = pattern;
+			}
+			break;
+		}
+	}
+
+	if (*p == '*')
+		++p;
+	return !*p;
+}
+EXPORT_SYMBOL(match_wildcard);
+#endif /* !HAVE_MATCH_WILDCARD */
+
+#ifndef HAVE_KSTRTOBOOL_FROM_USER
+int kstrtobool_from_user(const char __user *s, size_t count, bool *res)
+{
+	/* Longest string needed to differentiate, newline, terminator */
+	char buf[4];
+
+	count = min(count, sizeof(buf) - 1);
+	if (copy_from_user(buf, s, count))
+		return -EFAULT;
+	buf[count] = '\0';
+	return strtobool(buf, res);
+}
+EXPORT_SYMBOL(kstrtobool_from_user);
+#endif /* !HAVE_KSTRTOBOOL_FROM_USER */
+
+#ifndef HAVE_NLA_STRDUP
+char *nla_strdup(const struct nlattr *nla, gfp_t flags)
+{
+	size_t srclen = nla_len(nla);
+	char *src = nla_data(nla), *dst;
+
+	if (srclen > 0 && src[srclen - 1] == '\0')
+		srclen--;
+
+	dst = kmalloc(srclen + 1, flags);
+	if (dst != NULL) {
+		memcpy(dst, src, srclen);
+		dst[srclen] = '\0';
+	}
+	return dst;
+}
+EXPORT_SYMBOL(nla_strdup);
+#endif /* !HAVE_NLA_STRDUP */
diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-wait.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-wait.c
new file mode 100644
index 0000000000000..33117c25a1302
--- /dev/null
+++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-wait.c
@@ -0,0 +1,174 @@
+/*
+ * The implementation of the wait_bit*() and related waiting APIs:
+ */
+#include
+#include
+#ifdef HAVE_SCHED_HEADERS
+#include
+#endif
+#include
+
+#ifndef HAVE_PREPARE_TO_WAIT_EVENT
+
+#define __add_wait_queue_entry_tail __add_wait_queue_tail
+
+long prepare_to_wait_event(wait_queue_head_t *wq_head,
+			   wait_queue_entry_t *wq_entry, int state)
+{
+	unsigned long flags;
+	long ret = 0;
+
+	spin_lock_irqsave(&wq_head->lock, flags);
+	if (unlikely(signal_pending_state(state, current))) {
+		/*
+		 * Exclusive waiter must not fail if it was selected by wakeup,
+		 * it should "consume" the condition we were waiting for.
+		 *
+		 * The caller will recheck the condition and return success if
+		 * we were already woken up, we can not miss the event because
+		 * wakeup locks/unlocks the same wq_head->lock.
+		 *
+		 * But we need to ensure that set-condition + wakeup after that
+		 * can't see us, it should wake up another exclusive waiter if
+		 * we fail.
+ */ + list_del_init(&wq_entry->task_list); + ret = -ERESTARTSYS; + } else { + if (list_empty(&wq_entry->task_list)) { + if (wq_entry->flags & WQ_FLAG_EXCLUSIVE) + __add_wait_queue_entry_tail(wq_head, wq_entry); + else + __add_wait_queue(wq_head, wq_entry); + } + set_current_state(state); + } + spin_unlock_irqrestore(&wq_head->lock, flags); + + return ret; +} +EXPORT_SYMBOL(prepare_to_wait_event); +#endif /* !HAVE_PREPARE_TO_WAIT_EVENT */ + +#ifndef HAVE_WAIT_VAR_EVENT + +#define WAIT_TABLE_BITS 8 +#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) + +static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; + +wait_queue_head_t *__var_waitqueue(void *p) +{ + return bit_wait_table + hash_ptr(p, WAIT_TABLE_BITS); +} +EXPORT_SYMBOL(__var_waitqueue); + +static int +var_wake_function(wait_queue_entry_t *wq_entry, unsigned int mode, + int sync, void *arg) +{ + struct wait_bit_key *key = arg; + struct wait_bit_queue_entry *wbq_entry = + container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); + + if (wbq_entry->key.flags != key->flags || + wbq_entry->key.bit_nr != key->bit_nr) + return 0; + + return autoremove_wake_function(wq_entry, mode, sync, key); +} + +void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, + int flags) +{ + *wbq_entry = (struct wait_bit_queue_entry){ + .key = { + .flags = (var), + .bit_nr = -1, + }, + .wq_entry = { + .private = current, + .func = var_wake_function, +#ifdef HAVE_WAIT_QUEUE_ENTRY_LIST + .entry = LIST_HEAD_INIT(wbq_entry->wq_entry.entry), +#else + .task_list = LIST_HEAD_INIT(wbq_entry->wq_entry.task_list), +#endif + }, + }; +} +EXPORT_SYMBOL(init_wait_var_entry); + +void wake_up_var(void *var) +{ + __wake_up_bit(__var_waitqueue(var), var, -1); +} +EXPORT_SYMBOL(wake_up_var); + +void __init wait_bit_init(void) +{ + int i; + + for (i = 0; i < WAIT_TABLE_SIZE; i++) + init_waitqueue_head(bit_wait_table + i); +} +#endif /* ! HAVE_WAIT_VAR_EVENT */ + +#ifndef HAVE_WAIT_WOKEN +/* + * DEFINE_WAIT_FUNC(wait, woken_wake_func); + * + * add_wait_queue(&wq_head, &wait); + * for (;;) { + * if (condition) + * break; + * + * // in wait_woken() // in woken_wake_function() + * + * p->state = mode; wq_entry->flags |= WQ_FLAG_WOKEN; + * smp_mb(); // A try_to_wake_up(): + * if (!(wq_entry->flags & WQ_FLAG_WOKEN)) + * schedule() if (p->state & mode) + * p->state = TASK_RUNNING; p->state = TASK_RUNNING; + * wq_entry->flags &= ~WQ_FLAG_WOKEN; ~~~~~~~~~~~~~~~~~~ + * smp_mb(); // B condition = true; + * } smp_mb(); // C + * remove_wait_queue(&wq_head, &wait); wq_entry->flags |= WQ_FLAG_WOKEN; + */ +long wait_woken(struct wait_queue_entry *wq_entry, unsigned int mode, + long timeout) +{ + /* + * The below executes an smp_mb(), which matches with the full barrier + * executed by the try_to_wake_up() in woken_wake_function() such that + * either we see the store to wq_entry->flags in woken_wake_function() + * or woken_wake_function() sees our store to current->state. + */ + set_current_state(mode); /* A */ + if (!(wq_entry->flags & WQ_FLAG_WOKEN)) + timeout = schedule_timeout(timeout); + __set_current_state(TASK_RUNNING); + + /* + * The below executes an smp_mb(), which matches with the smp_mb() (C) + * in woken_wake_function() such that either we see the wait condition + * being true or the store to wq_entry->flags in woken_wake_function() + * follows ours in the coherence order. 
+	 */
+	smp_store_mb(wq_entry->flags, wq_entry->flags & ~WQ_FLAG_WOKEN); /* B */
+
+	return timeout;
+}
+EXPORT_SYMBOL(wait_woken);
+
+int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode,
+			int sync, void *key)
+{
+	/* Pairs with the smp_store_mb() in wait_woken(). */
+	smp_mb(); /* C */
+	wq_entry->flags |= WQ_FLAG_WOKEN;
+
+	return default_wake_function(wq_entry, mode, sync, key);
+}
+EXPORT_SYMBOL(woken_wake_function);
+#endif /* HAVE_WAIT_WOKEN */
diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/xarray.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/xarray.c
new file mode 100644
index 0000000000000..fea97febdf2ce
--- /dev/null
+++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/xarray.c
@@ -0,0 +1,2101 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * XArray implementation
+ * Copyright (c) 2017 Microsoft Corporation
+ * Author: Matthew Wilcox
+ *
+ * This is taken from kernel commit:
+ *
+ * 7b785645e ("mm: fix page cache convergence regression")
+ *
+ * at kernel version 5.2-rc2
+ */
+#ifndef HAVE_XARRAY_SUPPORT
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * Coding conventions in this file:
+ *
+ * @xa is used to refer to the entire xarray.
+ * @xas is the 'xarray operation state'. It may be either a pointer to
+ * an xa_state, or an xa_state stored on the stack. This is an unfortunate
+ * ambiguity.
+ * @index is the index of the entry being operated on
+ * @mark is an xa_mark_t; a small number indicating one of the mark bits.
+ * @node refers to an xa_node; usually the primary one being operated on by
+ * this function.
+ * @offset is the index into the slots array inside an xa_node.
+ * @parent refers to the @xa_node closer to the head than @node.
+ * @entry refers to something stored in a slot in the xarray
+ */
+
+static inline unsigned int xa_lock_type(const struct xarray *xa)
+{
+	return (__force unsigned int)xa->xa_flags & 3;
+}
+
+static inline void xas_lock_type(struct xa_state *xas, unsigned int lock_type)
+{
+	if (lock_type == XA_LOCK_IRQ)
+		xas_lock_irq(xas);
+	else if (lock_type == XA_LOCK_BH)
+		xas_lock_bh(xas);
+	else
+		xas_lock(xas);
+}
+
+static inline void xas_unlock_type(struct xa_state *xas, unsigned int lock_type)
+{
+	if (lock_type == XA_LOCK_IRQ)
+		xas_unlock_irq(xas);
+	else if (lock_type == XA_LOCK_BH)
+		xas_unlock_bh(xas);
+	else
+		xas_unlock(xas);
+}
+
+static inline bool xa_track_free(const struct xarray *xa)
+{
+	return xa->xa_flags & XA_FLAGS_TRACK_FREE;
+}
+
+static inline bool xa_zero_busy(const struct xarray *xa)
+{
+	return xa->xa_flags & XA_FLAGS_ZERO_BUSY;
+}
+
+static inline void xa_mark_set(struct xarray *xa, xa_mark_t mark)
+{
+	if (!(xa->xa_flags & XA_FLAGS_MARK(mark)))
+		xa->xa_flags |= XA_FLAGS_MARK(mark);
+}
+
+static inline void xa_mark_clear(struct xarray *xa, xa_mark_t mark)
+{
+	if (xa->xa_flags & XA_FLAGS_MARK(mark))
+		xa->xa_flags &= ~(XA_FLAGS_MARK(mark));
+}
+
+static inline unsigned long *node_marks(struct xa_node *node, xa_mark_t mark)
+{
+	return node->marks[(__force unsigned)mark];
+}
+
+static inline bool node_get_mark(struct xa_node *node,
+		unsigned int offset, xa_mark_t mark)
+{
+	return test_bit(offset, node_marks(node, mark));
+}
+
+/* returns true if the bit was set */
+static inline bool node_set_mark(struct xa_node *node, unsigned int offset,
+				xa_mark_t mark)
+{
+	return __test_and_set_bit(offset, node_marks(node, mark));
+}
+
+/* returns true if the bit was set */
+static inline bool node_clear_mark(struct xa_node *node, unsigned int offset,
+				xa_mark_t mark)
+{ + return __test_and_clear_bit(offset, node_marks(node, mark)); +} + +static inline bool node_any_mark(struct xa_node *node, xa_mark_t mark) +{ + return !bitmap_empty(node_marks(node, mark), XA_CHUNK_SIZE); +} + +static inline void node_mark_all(struct xa_node *node, xa_mark_t mark) +{ + bitmap_fill(node_marks(node, mark), XA_CHUNK_SIZE); +} + +#define mark_inc(mark) do { \ + mark = (__force xa_mark_t)((__force unsigned)(mark) + 1); \ +} while (0) + +/* + * xas_squash_marks() - Merge all marks to the first entry + * @xas: Array operation state. + * + * Set a mark on the first entry if any entry has it set. Clear marks on + * all sibling entries. + */ +static void xas_squash_marks(const struct xa_state *xas) +{ + unsigned int mark = 0; + unsigned int limit = xas->xa_offset + xas->xa_sibs + 1; + + if (!xas->xa_sibs) + return; + + do { + unsigned long *marks = xas->xa_node->marks[mark]; + if (find_next_bit(marks, limit, xas->xa_offset + 1) == limit) + continue; + __set_bit(xas->xa_offset, marks); + bitmap_clear(marks, xas->xa_offset + 1, xas->xa_sibs); + } while (mark++ != (__force unsigned)XA_MARK_MAX); +} + +/* extracts the offset within this node from the index */ +static unsigned int get_offset(unsigned long index, struct xa_node *node) +{ + return (index >> node->shift) & XA_CHUNK_MASK; +} + +static void xas_set_offset(struct xa_state *xas) +{ + xas->xa_offset = get_offset(xas->xa_index, xas->xa_node); +} + +/* move the index either forwards (find) or backwards (sibling slot) */ +static void xas_move_index(struct xa_state *xas, unsigned long offset) +{ + unsigned int shift = xas->xa_node->shift; + xas->xa_index &= ~XA_CHUNK_MASK << shift; + xas->xa_index += offset << shift; +} + +static void xas_advance(struct xa_state *xas) +{ + xas->xa_offset++; + xas_move_index(xas, xas->xa_offset); +} + +static void *set_bounds(struct xa_state *xas) +{ + xas->xa_node = XAS_BOUNDS; + return NULL; +} + +/* + * Starts a walk. If the @xas is already valid, we assume that it's on + * the right path and just return where we've got to. If we're in an + * error state, return NULL. If the index is outside the current scope + * of the xarray, return NULL without changing @xas->xa_node. Otherwise + * set @xas->xa_node to NULL and return the current head of the array. + */ +static void *xas_start(struct xa_state *xas) +{ + void *entry; + + if (xas_valid(xas)) + return xas_reload(xas); + if (xas_error(xas)) + return NULL; + + entry = xa_head(xas->xa); + if (!xa_is_node(entry)) { + if (xas->xa_index) + return set_bounds(xas); + } else { + if ((xas->xa_index >> xa_to_node(entry)->shift) > XA_CHUNK_MASK) + return set_bounds(xas); + } + + xas->xa_node = NULL; + return entry; +} + +static void *xas_descend(struct xa_state *xas, struct xa_node *node) +{ + unsigned int offset = get_offset(xas->xa_index, node); + void *entry = xa_entry(xas->xa, node, offset); + + xas->xa_node = node; + if (xa_is_sibling(entry)) { + offset = xa_to_sibling(entry); + entry = xa_entry(xas->xa, node, offset); + } + + xas->xa_offset = offset; + return entry; +} + +/** + * xas_load() - Load an entry from the XArray (advanced). + * @xas: XArray operation state. + * + * Usually walks the @xas to the appropriate state to load the entry + * stored at xa_index. However, it will do nothing and return %NULL if + * @xas is in an error state. xas_load() will never expand the tree. 
+ * + * If the xa_state is set up to operate on a multi-index entry, xas_load() + * may return %NULL or an internal entry, even if there are entries + * present within the range specified by @xas. + * + * Context: Any context. The caller should hold the xa_lock or the RCU lock. + * Return: Usually an entry in the XArray, but see description for exceptions. + */ +void *xas_load(struct xa_state *xas) +{ + void *entry = xas_start(xas); + + while (xa_is_node(entry)) { + struct xa_node *node = xa_to_node(entry); + + if (xas->xa_shift > node->shift) + break; + entry = xas_descend(xas, node); + if (node->shift == 0) + break; + } + return entry; +} +EXPORT_SYMBOL_GPL(xas_load); + +/* Move the radix tree node cache here */ +extern struct kmem_cache *radix_tree_node_cachep; + +static inline void tag_clear(struct radix_tree_node *node, unsigned int tag, + int offset) +{ + __clear_bit(offset, node->tags[tag]); +} + +static void radix_tree_node_rcu_free(struct rcu_head *head) +{ + struct radix_tree_node *node = + container_of(head, struct radix_tree_node, rcu_head); + int i; + + /* + * must only free zeroed nodes into the slab. radix_tree_shrink + * can leave us with a non-NULL entry in the first slot, so clear + * that here to make sure. + */ + for (i = 0; i < RADIX_TREE_MAX_TAGS; i++) + tag_clear(node, i, 0); + + node->slots[0] = NULL; + node->count = 0; + + kmem_cache_free(radix_tree_node_cachep, node); +} + +#define XA_RCU_FREE ((struct xarray *)1) + +static void xa_node_free(struct xa_node *node) +{ + XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); + node->array = XA_RCU_FREE; + call_rcu(&node->rcu_head, radix_tree_node_rcu_free); +} + +/* + * xas_destroy() - Free any resources allocated during the XArray operation. + * @xas: XArray operation state. + * + * This function is now internal-only. + */ +static void xas_destroy(struct xa_state *xas) +{ + struct xa_node *node = xas->xa_alloc; + + if (!node) + return; + XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); + kmem_cache_free(radix_tree_node_cachep, node); + xas->xa_alloc = NULL; +} + +/** + * xas_nomem() - Allocate memory if needed. + * @xas: XArray operation state. + * @gfp: Memory allocation flags. + * + * If we need to add new nodes to the XArray, we try to allocate memory + * with GFP_NOWAIT while holding the lock, which will usually succeed. + * If it fails, @xas is flagged as needing memory to continue. The caller + * should drop the lock and call xas_nomem(). If xas_nomem() succeeds, + * the caller should retry the operation. + * + * Forward progress is guaranteed as one node is allocated here and + * stored in the xa_state where it will be found by xas_alloc(). More + * nodes will likely be found in the slab allocator, but we do not tie + * them up here. + * + * Return: true if memory was needed, and was successfully allocated. + */ +bool xas_nomem(struct xa_state *xas, gfp_t gfp) +{ + if (xas->xa_node != XA_ERROR(-ENOMEM)) { + xas_destroy(xas); + return false; + } +#ifdef __GFP_ACCOUNT + if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT) + gfp |= __GFP_ACCOUNT; +#endif + xas->xa_alloc = kmem_cache_alloc(radix_tree_node_cachep, gfp); + if (!xas->xa_alloc) + return false; + XA_NODE_BUG_ON(xas->xa_alloc, !list_empty(&xas->xa_alloc->private_list)); + xas->xa_node = XAS_RESTART; + return true; +} +EXPORT_SYMBOL_GPL(xas_nomem); + +/* + * __xas_nomem() - Drop locks and allocate memory if needed. + * @xas: XArray operation state. + * @gfp: Memory allocation flags. + * + * Internal variant of xas_nomem(). 
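+ * Unlike xas_nomem(), this is called with the xa_lock held; when @gfp
+ * allows blocking, it drops and retakes that lock around the allocation.
+ * Callers in this file therefore use the same retry shape as the public
+ * API, e.g. this sketch of the loop in __xa_store() below:
+ *
+ *	do {
+ *		curr = xas_store(&xas, entry);
+ *	} while (__xas_nomem(&xas, gfp));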
+ * + * Return: true if memory was needed, and was successfully allocated. + */ +static bool __xas_nomem(struct xa_state *xas, gfp_t gfp) + __must_hold(xas->xa->xa_lock) +{ + unsigned int lock_type = xa_lock_type(xas->xa); + + if (xas->xa_node != XA_ERROR(-ENOMEM)) { + xas_destroy(xas); + return false; + } +#ifdef __GFP_ACCOUNT + if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT) + gfp |= __GFP_ACCOUNT; +#endif + if (gfpflags_allow_blocking(gfp)) { + xas_unlock_type(xas, lock_type); + xas->xa_alloc = kmem_cache_alloc(radix_tree_node_cachep, gfp); + xas_lock_type(xas, lock_type); + } else { + xas->xa_alloc = kmem_cache_alloc(radix_tree_node_cachep, gfp); + } + if (!xas->xa_alloc) + return false; + XA_NODE_BUG_ON(xas->xa_alloc, !list_empty(&xas->xa_alloc->private_list)); + xas->xa_node = XAS_RESTART; + return true; +} + +static void xas_update(struct xa_state *xas, struct xa_node *node) +{ + if (xas->xa_update) + xas->xa_update(node); + else + XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); +} + +static void *xas_alloc(struct xa_state *xas, unsigned int shift) +{ + struct xa_node *parent = xas->xa_node; + struct xa_node *node = xas->xa_alloc; + + if (xas_invalid(xas)) + return NULL; + + if (node) { + xas->xa_alloc = NULL; + } else { + gfp_t gfp = GFP_NOWAIT | __GFP_NOWARN; +#ifdef __GFP_ACCOUNT + if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT) + gfp |= __GFP_ACCOUNT; +#endif + node = kmem_cache_alloc(radix_tree_node_cachep, gfp); + if (!node) { + xas_set_err(xas, -ENOMEM); + return NULL; + } + } + + if (parent) { + node->offset = xas->xa_offset; + parent->count++; + XA_NODE_BUG_ON(node, parent->count > XA_CHUNK_SIZE); + xas_update(xas, parent); + } + XA_NODE_BUG_ON(node, shift > BITS_PER_LONG); + XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); + node->shift = shift; + node->count = 0; + node->nr_values = 0; + RCU_INIT_POINTER(node->parent, xas->xa_node); + node->array = xas->xa; + + return node; +} + +#ifdef CONFIG_XARRAY_MULTI +/* Returns the number of indices covered by a given xa_state */ +static unsigned long xas_size(const struct xa_state *xas) +{ + return (xas->xa_sibs + 1UL) << xas->xa_shift; +} +#endif + +/* + * Use this to calculate the maximum index that will need to be created + * in order to add the entry described by @xas. Because we cannot store a + * multiple-index entry at index 0, the calculation is a little more complex + * than you might expect. 
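+ *
+ * For example, with XA_CHUNK_SHIFT == 6, storing an order-6 entry at
+ * index 0 gives mask == 63, so max |= mask yields 63; since mask == max,
+ * max is bumped to 64, and xas_expand() grows the tree one level taller
+ * than the entry alone would need.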
+ */ +static unsigned long xas_max(struct xa_state *xas) +{ + unsigned long max = xas->xa_index; + +#ifdef CONFIG_XARRAY_MULTI + if (xas->xa_shift || xas->xa_sibs) { + unsigned long mask = xas_size(xas) - 1; + max |= mask; + if (mask == max) + max++; + } +#endif + + return max; +} + +/* The maximum index that can be contained in the array without expanding it */ +static unsigned long max_index(void *entry) +{ + if (!xa_is_node(entry)) + return 0; + return (XA_CHUNK_SIZE << xa_to_node(entry)->shift) - 1; +} + +static void xas_shrink(struct xa_state *xas) +{ + struct xarray *xa = xas->xa; + struct xa_node *node = xas->xa_node; + + for (;;) { + void *entry; + + XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE); + if (node->count != 1) + break; + entry = xa_entry_locked(xa, node, 0); + if (!entry) + break; + if (!xa_is_node(entry) && node->shift) + break; + if (xa_is_zero(entry) && xa_zero_busy(xa)) + entry = NULL; + xas->xa_node = XAS_BOUNDS; + + RCU_INIT_POINTER(xa->xa_head, entry); + if (xa_track_free(xa) && !node_get_mark(node, 0, XA_FREE_MARK)) + xa_mark_clear(xa, XA_FREE_MARK); + + node->count = 0; + node->nr_values = 0; + if (!xa_is_node(entry)) + RCU_INIT_POINTER(node->slots[0], XA_RETRY_ENTRY); + xas_update(xas, node); + xa_node_free(node); + if (!xa_is_node(entry)) + break; + node = xa_to_node(entry); + node->parent = NULL; + } +} + +/* + * xas_delete_node() - Attempt to delete an xa_node + * @xas: Array operation state. + * + * Attempts to delete the @xas->xa_node. This will fail if xa->node has + * a non-zero reference count. + */ +static void xas_delete_node(struct xa_state *xas) +{ + struct xa_node *node = xas->xa_node; + + for (;;) { + struct xa_node *parent; + + XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE); + if (node->count) + break; + + parent = xa_parent_locked(xas->xa, node); + xas->xa_node = parent; + xas->xa_offset = node->offset; + xa_node_free(node); + + if (!parent) { + xas->xa->xa_head = NULL; + xas->xa_node = XAS_BOUNDS; + return; + } + + parent->slots[xas->xa_offset] = NULL; + parent->count--; + XA_NODE_BUG_ON(parent, parent->count > XA_CHUNK_SIZE); + node = parent; + xas_update(xas, node); + } + + if (!node->parent) + xas_shrink(xas); +} + +/** + * xas_free_nodes() - Free this node and all nodes that it references + * @xas: Array operation state. + * @top: Node to free + * + * This node has been removed from the tree. We must now free it and all + * of its subnodes. There may be RCU walkers with references into the tree, + * so we must replace all entries with retry markers. 
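+ *
+ * A concurrent RCU reader that encounters such a marker simply restarts
+ * its walk, as in the load loop of xa_load() later in this file:
+ *
+ *	rcu_read_lock();
+ *	do {
+ *		entry = xas_load(&xas);
+ *	} while (xas_retry(&xas, entry));
+ *	rcu_read_unlock();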
+ */ +static void xas_free_nodes(struct xa_state *xas, struct xa_node *top) +{ + unsigned int offset = 0; + struct xa_node *node = top; + + for (;;) { + void *entry = xa_entry_locked(xas->xa, node, offset); + + if (node->shift && xa_is_node(entry)) { + node = xa_to_node(entry); + offset = 0; + continue; + } + if (entry) + RCU_INIT_POINTER(node->slots[offset], XA_RETRY_ENTRY); + offset++; + while (offset == XA_CHUNK_SIZE) { + struct xa_node *parent; + + parent = xa_parent_locked(xas->xa, node); + offset = node->offset + 1; + node->count = 0; + node->nr_values = 0; + xas_update(xas, node); + xa_node_free(node); + if (node == top) + return; + node = parent; + } + } +} + +/* + * xas_expand adds nodes to the head of the tree until it has reached + * sufficient height to be able to contain @xas->xa_index + */ +static int xas_expand(struct xa_state *xas, void *head) +{ + struct xarray *xa = xas->xa; + struct xa_node *node = NULL; + unsigned int shift = 0; + unsigned long max = xas_max(xas); + + if (!head) { + if (max == 0) + return 0; + while ((max >> shift) >= XA_CHUNK_SIZE) + shift += XA_CHUNK_SHIFT; + return shift + XA_CHUNK_SHIFT; + } else if (xa_is_node(head)) { + node = xa_to_node(head); + shift = node->shift + XA_CHUNK_SHIFT; + } + xas->xa_node = NULL; + + while (max > max_index(head)) { + xa_mark_t mark = 0; + + XA_NODE_BUG_ON(node, shift > BITS_PER_LONG); + node = xas_alloc(xas, shift); + if (!node) + return -ENOMEM; + + node->count = 1; + if (xa_is_value(head)) + node->nr_values = 1; + RCU_INIT_POINTER(node->slots[0], head); + + /* Propagate the aggregated mark info to the new child */ + for (;;) { + if (xa_track_free(xa) && mark == XA_FREE_MARK) { + node_mark_all(node, XA_FREE_MARK); + if (!xa_marked(xa, XA_FREE_MARK)) { + node_clear_mark(node, 0, XA_FREE_MARK); + xa_mark_set(xa, XA_FREE_MARK); + } + } else if (xa_marked(xa, mark)) { + node_set_mark(node, 0, mark); + } + if (mark == XA_MARK_MAX) + break; + mark_inc(mark); + } + + /* + * Now that the new node is fully initialised, we can add + * it to the tree + */ + if (xa_is_node(head)) { + xa_to_node(head)->offset = 0; + rcu_assign_pointer(xa_to_node(head)->parent, node); + } + head = xa_mk_node(node); + rcu_assign_pointer(xa->xa_head, head); + xas_update(xas, node); + + shift += XA_CHUNK_SHIFT; + } + + xas->xa_node = node; + return shift; +} + +/* + * xas_create() - Create a slot to store an entry in. + * @xas: XArray operation state. + * @allow_root: %true if we can store the entry in the root directly + * + * Most users will not need to call this function directly, as it is called + * by xas_store(). It is useful for doing conditional store operations + * (see the xa_cmpxchg() implementation for an example). + * + * Return: If the slot already existed, returns the contents of this slot. + * If the slot was newly created, returns %NULL. If it failed to create the + * slot, returns %NULL and indicates the error in @xas. 
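+ *
+ * For reference, xas_store() below invokes it roughly as follows
+ * (simplified):
+ *
+ *	if (entry)
+ *		first = xas_create(xas, !xa_is_node(entry) && !xa_is_zero(entry));
+ *	else
+ *		first = xas_load(xas);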
+ */ +static void *xas_create(struct xa_state *xas, bool allow_root) +{ + struct xarray *xa = xas->xa; + void *entry; + void __rcu **slot; + struct xa_node *node = xas->xa_node; + int shift; + unsigned int order = xas->xa_shift; + + if (xas_top(node)) { + entry = xa_head_locked(xa); + xas->xa_node = NULL; + if (!entry && xa_zero_busy(xa)) + entry = XA_ZERO_ENTRY; + shift = xas_expand(xas, entry); + if (shift < 0) + return NULL; + if (!shift && !allow_root) + shift = XA_CHUNK_SHIFT; + entry = xa_head_locked(xa); + slot = &xa->xa_head; + } else if (xas_error(xas)) { + return NULL; + } else if (node) { + unsigned int offset = xas->xa_offset; + + shift = node->shift; + entry = xa_entry_locked(xa, node, offset); + slot = &node->slots[offset]; + } else { + shift = 0; + entry = xa_head_locked(xa); + slot = &xa->xa_head; + } + + while (shift > order) { + shift -= XA_CHUNK_SHIFT; + if (!entry) { + node = xas_alloc(xas, shift); + if (!node) + break; + if (xa_track_free(xa)) + node_mark_all(node, XA_FREE_MARK); + rcu_assign_pointer(*slot, xa_mk_node(node)); + } else if (xa_is_node(entry)) { + node = xa_to_node(entry); + } else { + break; + } + entry = xas_descend(xas, node); + slot = &node->slots[xas->xa_offset]; + } + + return entry; +} + +/** + * xas_create_range() - Ensure that stores to this range will succeed + * @xas: XArray operation state. + * + * Creates all of the slots in the range covered by @xas. Sets @xas to + * create single-index entries and positions it at the beginning of the + * range. This is for the benefit of users which have not yet been + * converted to use multi-index entries. + */ +void xas_create_range(struct xa_state *xas) +{ + unsigned long index = xas->xa_index; + unsigned char shift = xas->xa_shift; + unsigned char sibs = xas->xa_sibs; + + xas->xa_index |= ((sibs + 1) << shift) - 1; + if (xas_is_node(xas) && xas->xa_node->shift == xas->xa_shift) + xas->xa_offset |= sibs; + xas->xa_shift = 0; + xas->xa_sibs = 0; + + for (;;) { + xas_create(xas, true); + if (xas_error(xas)) + goto restore; + if (xas->xa_index <= (index | XA_CHUNK_MASK)) + goto success; + xas->xa_index -= XA_CHUNK_SIZE; + + for (;;) { + struct xa_node *node = xas->xa_node; + xas->xa_node = xa_parent_locked(xas->xa, node); + xas->xa_offset = node->offset - 1; + if (node->offset != 0) + break; + } + } + +restore: + xas->xa_shift = shift; + xas->xa_sibs = sibs; + xas->xa_index = index; + return; +success: + xas->xa_index = index; + if (xas->xa_node) + xas_set_offset(xas); +} +EXPORT_SYMBOL_GPL(xas_create_range); + +static void update_node(struct xa_state *xas, struct xa_node *node, + int count, int values) +{ + if (!node || (!count && !values)) + return; + + node->count += count; + node->nr_values += values; + XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE); + XA_NODE_BUG_ON(node, node->nr_values > XA_CHUNK_SIZE); + xas_update(xas, node); + if (count < 0) + xas_delete_node(xas); +} + +/** + * xas_store() - Store this entry in the XArray. + * @xas: XArray operation state. + * @entry: New entry. + * + * If @xas is operating on a multi-index entry, the entry returned by this + * function is essentially meaningless (it may be an internal entry or it + * may be %NULL, even if there are non-NULL entries at some of the indices + * covered by the range). This is not a problem for any current users, + * and can be changed if needed. + * + * Return: The old entry at this index. 
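+ *
+ * A typical caller combines xas_store() with the xas_nomem() retry
+ * pattern described earlier (a sketch; "item" is whatever is being
+ * stored, and the plain xas_lock() variant is assumed):
+ *
+ *	XA_STATE(xas, &xa, index);
+ *
+ *	do {
+ *		xas_lock(&xas);
+ *		xas_store(&xas, item);
+ *		xas_unlock(&xas);
+ *	} while (xas_nomem(&xas, GFP_KERNEL));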
+ */ +void *xas_store(struct xa_state *xas, void *entry) +{ + struct xa_node *node; + void __rcu **slot = &xas->xa->xa_head; + unsigned int offset, max; + int count = 0; + int values = 0; + void *first, *next; + bool value = xa_is_value(entry); + + if (entry) { + bool allow_root = !xa_is_node(entry) && !xa_is_zero(entry); + first = xas_create(xas, allow_root); + } else { + first = xas_load(xas); + } + + if (xas_invalid(xas)) + return first; + node = xas->xa_node; + if (node && (xas->xa_shift < node->shift)) + xas->xa_sibs = 0; + if ((first == entry) && !xas->xa_sibs) + return first; + + next = first; + offset = xas->xa_offset; + max = xas->xa_offset + xas->xa_sibs; + if (node) { + slot = &node->slots[offset]; + if (xas->xa_sibs) + xas_squash_marks(xas); + } + if (!entry) + xas_init_marks(xas); + + for (;;) { + /* + * Must clear the marks before setting the entry to NULL, + * otherwise xas_for_each_marked may find a NULL entry and + * stop early. rcu_assign_pointer contains a release barrier + * so the mark clearing will appear to happen before the + * entry is set to NULL. + */ + rcu_assign_pointer(*slot, entry); + if (xa_is_node(next) && (!node || node->shift)) + xas_free_nodes(xas, xa_to_node(next)); + if (!node) + break; + count += !next - !entry; + values += !xa_is_value(first) - !value; + if (entry) { + if (offset == max) + break; + if (!xa_is_sibling(entry)) + entry = xa_mk_sibling(xas->xa_offset); + } else { + if (offset == XA_CHUNK_MASK) + break; + } + next = xa_entry_locked(xas->xa, node, ++offset); + if (!xa_is_sibling(next)) { + if (!entry && (offset > max)) + break; + first = next; + } + slot++; + } + + update_node(xas, node, count, values); + return first; +} +EXPORT_SYMBOL_GPL(xas_store); + +/** + * xas_get_mark() - Returns the state of this mark. + * @xas: XArray operation state. + * @mark: Mark number. + * + * Return: true if the mark is set, false if the mark is clear or @xas + * is in an error state. + */ +bool xas_get_mark(const struct xa_state *xas, xa_mark_t mark) +{ + if (xas_invalid(xas)) + return false; + if (!xas->xa_node) + return xa_marked(xas->xa, mark); + return node_get_mark(xas->xa_node, xas->xa_offset, mark); +} +EXPORT_SYMBOL_GPL(xas_get_mark); + +/** + * xas_set_mark() - Sets the mark on this entry and its parents. + * @xas: XArray operation state. + * @mark: Mark number. + * + * Sets the specified mark on this entry, and walks up the tree setting it + * on all the ancestor entries. Does nothing if @xas has not been walked to + * an entry, or is in an error state. + */ +void xas_set_mark(const struct xa_state *xas, xa_mark_t mark) +{ + struct xa_node *node = xas->xa_node; + unsigned int offset = xas->xa_offset; + + if (xas_invalid(xas)) + return; + + while (node) { + if (node_set_mark(node, offset, mark)) + return; + offset = node->offset; + node = xa_parent_locked(xas->xa, node); + } + + if (!xa_marked(xas->xa, mark)) + xa_mark_set(xas->xa, mark); +} +EXPORT_SYMBOL_GPL(xas_set_mark); + +/** + * xas_clear_mark() - Clears the mark on this entry and its parents. + * @xas: XArray operation state. + * @mark: Mark number. + * + * Clears the specified mark on this entry, and walks back to the head + * attempting to clear it on all the ancestor entries. Does nothing if + * @xas has not been walked to an entry, or is in an error state. 
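+ *
+ * Together with xas_set_mark(), this maintains the invariant that an
+ * ancestor node carries a mark iff at least one slot beneath it does,
+ * which is what lets xas_find_marked() skip unmarked subtrees. A locked
+ * caller might use it as __xa_clear_mark() does below:
+ *
+ *	entry = xas_load(&xas);
+ *	if (entry)
+ *		xas_clear_mark(&xas, mark);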
+ */ +void xas_clear_mark(const struct xa_state *xas, xa_mark_t mark) +{ + struct xa_node *node = xas->xa_node; + unsigned int offset = xas->xa_offset; + + if (xas_invalid(xas)) + return; + + while (node) { + if (!node_clear_mark(node, offset, mark)) + return; + if (node_any_mark(node, mark)) + return; + + offset = node->offset; + node = xa_parent_locked(xas->xa, node); + } + + if (xa_marked(xas->xa, mark)) + xa_mark_clear(xas->xa, mark); +} +EXPORT_SYMBOL_GPL(xas_clear_mark); + +/** + * xas_init_marks() - Initialise all marks for the entry + * @xas: Array operations state. + * + * Initialise all marks for the entry specified by @xas. If we're tracking + * free entries with a mark, we need to set it on all entries. All other + * marks are cleared. + * + * This implementation is not as efficient as it could be; we may walk + * up the tree multiple times. + */ +void xas_init_marks(const struct xa_state *xas) +{ + xa_mark_t mark = 0; + + for (;;) { + if (xa_track_free(xas->xa) && mark == XA_FREE_MARK) + xas_set_mark(xas, mark); + else + xas_clear_mark(xas, mark); + if (mark == XA_MARK_MAX) + break; + mark_inc(mark); + } +} +EXPORT_SYMBOL_GPL(xas_init_marks); + +/** + * xas_pause() - Pause a walk to drop a lock. + * @xas: XArray operation state. + * + * Some users need to pause a walk and drop the lock they're holding in + * order to yield to a higher priority thread or carry out an operation + * on an entry. Those users should call this function before they drop + * the lock. It resets the @xas to be suitable for the next iteration + * of the loop after the user has reacquired the lock. If most entries + * found during a walk require you to call xas_pause(), the xa_for_each() + * iterator may be more appropriate. + * + * Note that xas_pause() only works for forward iteration. If a user needs + * to pause a reverse iteration, we will need a xas_pause_rev(). + */ +void xas_pause(struct xa_state *xas) +{ + struct xa_node *node = xas->xa_node; + + if (xas_invalid(xas)) + return; + + if (node) { + unsigned int offset = xas->xa_offset; + while (++offset < XA_CHUNK_SIZE) { + if (!xa_is_sibling(xa_entry(xas->xa, node, offset))) + break; + } + xas->xa_index += (offset - xas->xa_offset) << node->shift; + } else { + xas->xa_index++; + } + xas->xa_node = XAS_RESTART; +} +EXPORT_SYMBOL_GPL(xas_pause); + +/* + * __xas_prev() - Find the previous entry in the XArray. + * @xas: XArray operation state. + * + * Helper function for xas_prev() which handles all the complex cases + * out of line. + */ +void *__xas_prev(struct xa_state *xas) +{ + void *entry; + + if (!xas_frozen(xas->xa_node)) + xas->xa_index--; + if (!xas->xa_node) + return set_bounds(xas); + if (xas_not_node(xas->xa_node)) + return xas_load(xas); + + if (xas->xa_offset != get_offset(xas->xa_index, xas->xa_node)) + xas->xa_offset--; + + while (xas->xa_offset == 255) { + xas->xa_offset = xas->xa_node->offset - 1; + xas->xa_node = xa_parent(xas->xa, xas->xa_node); + if (!xas->xa_node) + return set_bounds(xas); + } + + for (;;) { + entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); + if (!xa_is_node(entry)) + return entry; + + xas->xa_node = xa_to_node(entry); + xas_set_offset(xas); + } +} +EXPORT_SYMBOL_GPL(__xas_prev); + +/* + * __xas_next() - Find the next entry in the XArray. + * @xas: XArray operation state. + * + * Helper function for xas_next() which handles all the complex cases + * out of line. 
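+ *
+ * The common case is handled inline by xas_next(); a dense forward scan
+ * built on it might look like this sketch (lock or RCU held, and
+ * process() standing in for the caller's work):
+ *
+ *	entry = xas_load(&xas);
+ *	while (xas.xa_index < max) {
+ *		process(xas.xa_index, entry);
+ *		entry = xas_next(&xas);
+ *	}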
+ */ +void *__xas_next(struct xa_state *xas) +{ + void *entry; + + if (!xas_frozen(xas->xa_node)) + xas->xa_index++; + if (!xas->xa_node) + return set_bounds(xas); + if (xas_not_node(xas->xa_node)) + return xas_load(xas); + + if (xas->xa_offset != get_offset(xas->xa_index, xas->xa_node)) + xas->xa_offset++; + + while (xas->xa_offset == XA_CHUNK_SIZE) { + xas->xa_offset = xas->xa_node->offset + 1; + xas->xa_node = xa_parent(xas->xa, xas->xa_node); + if (!xas->xa_node) + return set_bounds(xas); + } + + for (;;) { + entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); + if (!xa_is_node(entry)) + return entry; + + xas->xa_node = xa_to_node(entry); + xas_set_offset(xas); + } +} +EXPORT_SYMBOL_GPL(__xas_next); + +/** + * xas_find() - Find the next present entry in the XArray. + * @xas: XArray operation state. + * @max: Highest index to return. + * + * If the @xas has not yet been walked to an entry, return the entry + * which has an index >= xas.xa_index. If it has been walked, the entry + * currently being pointed at has been processed, and so we move to the + * next entry. + * + * If no entry is found and the array is smaller than @max, the iterator + * is set to the smallest index not yet in the array. This allows @xas + * to be immediately passed to xas_store(). + * + * Return: The entry, if found, otherwise %NULL. + */ +void *xas_find(struct xa_state *xas, unsigned long max) +{ + void *entry; + + if (xas_error(xas)) + return NULL; + + if (!xas->xa_node) { + xas->xa_index = 1; + return set_bounds(xas); + } else if (xas_top(xas->xa_node)) { + entry = xas_load(xas); + if (entry || xas_not_node(xas->xa_node)) + return entry; + } else if (!xas->xa_node->shift && + xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK)) { + xas->xa_offset = ((xas->xa_index - 1) & XA_CHUNK_MASK) + 1; + } + + xas_advance(xas); + + while (xas->xa_node && (xas->xa_index <= max)) { + if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) { + xas->xa_offset = xas->xa_node->offset + 1; + xas->xa_node = xa_parent(xas->xa, xas->xa_node); + continue; + } + + entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); + if (xa_is_node(entry)) { + xas->xa_node = xa_to_node(entry); + xas->xa_offset = 0; + continue; + } + if (entry && !xa_is_sibling(entry)) + return entry; + + xas_advance(xas); + } + + if (!xas->xa_node) + xas->xa_node = XAS_BOUNDS; + return NULL; +} +EXPORT_SYMBOL_GPL(xas_find); + +/** + * xas_find_marked() - Find the next marked entry in the XArray. + * @xas: XArray operation state. + * @max: Highest index to return. + * @mark: Mark number to search for. + * + * If the @xas has not yet been walked to an entry, return the marked entry + * which has an index >= xas.xa_index. If it has been walked, the entry + * currently being pointed at has been processed, and so we return the + * first marked entry with an index > xas.xa_index. + * + * If no marked entry is found and the array is smaller than @max, @xas is + * set to the bounds state and xas->xa_index is set to the smallest index + * not yet in the array. This allows @xas to be immediately passed to + * xas_store(). + * + * If no entry is found before @max is reached, @xas is set to the restart + * state. + * + * Return: The entry, if found, otherwise %NULL. 
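+ *
+ * Most users reach this through the xas_for_each_marked() iterator
+ * rather than calling it directly, as xas_extract_marked() does later
+ * in this file:
+ *
+ *	rcu_read_lock();
+ *	xas_for_each_marked(&xas, entry, max, mark)
+ *		dst[i++] = entry;
+ *	rcu_read_unlock();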
+ */ +void *xas_find_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark) +{ + bool advance = true; + unsigned int offset; + void *entry; + + if (xas_error(xas)) + return NULL; + + if (!xas->xa_node) { + xas->xa_index = 1; + goto out; + } else if (xas_top(xas->xa_node)) { + advance = false; + entry = xa_head(xas->xa); + xas->xa_node = NULL; + if (xas->xa_index > max_index(entry)) + goto out; + if (!xa_is_node(entry)) { + if (xa_marked(xas->xa, mark)) + return entry; + xas->xa_index = 1; + goto out; + } + xas->xa_node = xa_to_node(entry); + xas->xa_offset = xas->xa_index >> xas->xa_node->shift; + } + + while (xas->xa_index <= max) { + if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) { + xas->xa_offset = xas->xa_node->offset + 1; + xas->xa_node = xa_parent(xas->xa, xas->xa_node); + if (!xas->xa_node) + break; + advance = false; + continue; + } + + if (!advance) { + entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); + if (xa_is_sibling(entry)) { + xas->xa_offset = xa_to_sibling(entry); + xas_move_index(xas, xas->xa_offset); + } + } + + offset = xas_find_chunk(xas, advance, mark); + if (offset > xas->xa_offset) { + advance = false; + xas_move_index(xas, offset); + /* Mind the wrap */ + if ((xas->xa_index - 1) >= max) + goto max; + xas->xa_offset = offset; + if (offset == XA_CHUNK_SIZE) + continue; + } + + entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); + if (!xa_is_node(entry)) + return entry; + xas->xa_node = xa_to_node(entry); + xas_set_offset(xas); + } + +out: + if (xas->xa_index > max) + goto max; + return set_bounds(xas); +max: + xas->xa_node = XAS_RESTART; + return NULL; +} +EXPORT_SYMBOL_GPL(xas_find_marked); + +/** + * xas_find_conflict() - Find the next present entry in a range. + * @xas: XArray operation state. + * + * The @xas describes both a range and a position within that range. + * + * Context: Any context. Expects xa_lock to be held. + * Return: The next entry in the range covered by @xas or %NULL. + */ +void *xas_find_conflict(struct xa_state *xas) +{ + void *curr; + + if (xas_error(xas)) + return NULL; + + if (!xas->xa_node) + return NULL; + + if (xas_top(xas->xa_node)) { + curr = xas_start(xas); + if (!curr) + return NULL; + while (xa_is_node(curr)) { + struct xa_node *node = xa_to_node(curr); + curr = xas_descend(xas, node); + } + if (curr) + return curr; + } + + if (xas->xa_node->shift > xas->xa_shift) + return NULL; + + for (;;) { + if (xas->xa_node->shift == xas->xa_shift) { + if ((xas->xa_offset & xas->xa_sibs) == xas->xa_sibs) + break; + } else if (xas->xa_offset == XA_CHUNK_MASK) { + xas->xa_offset = xas->xa_node->offset; + xas->xa_node = xa_parent_locked(xas->xa, xas->xa_node); + if (!xas->xa_node) + break; + continue; + } + curr = xa_entry_locked(xas->xa, xas->xa_node, ++xas->xa_offset); + if (xa_is_sibling(curr)) + continue; + while (xa_is_node(curr)) { + xas->xa_node = xa_to_node(curr); + xas->xa_offset = 0; + curr = xa_entry_locked(xas->xa, xas->xa_node, 0); + } + if (curr) + return curr; + } + xas->xa_offset -= xas->xa_sibs; + return NULL; +} +EXPORT_SYMBOL_GPL(xas_find_conflict); + +/** + * xa_load() - Load an entry from an XArray. + * @xa: XArray. + * @index: index into array. + * + * Context: Any context. Takes and releases the RCU lock. + * Return: The entry at @index in @xa. 
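+ *
+ * This is the simplest lookup primitive; e.g. (a sketch, where struct
+ * foo stands in for whatever the caller stores):
+ *
+ *	struct foo *p = xa_load(&xa, index);
+ *
+ *	if (!p)
+ *		return -ENOENT;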
+ */ +void *xa_load(struct xarray *xa, unsigned long index) +{ + XA_STATE(xas, xa, index); + void *entry; + + rcu_read_lock(); + do { + entry = xas_load(&xas); + if (xa_is_zero(entry)) + entry = NULL; + } while (xas_retry(&xas, entry)); + rcu_read_unlock(); + + return entry; +} +EXPORT_SYMBOL(xa_load); + +static void *xas_result(struct xa_state *xas, void *curr) +{ + if (xa_is_zero(curr)) + return NULL; + if (xas_error(xas)) + curr = xas->xa_node; + return curr; +} + +/** + * __xa_erase() - Erase this entry from the XArray while locked. + * @xa: XArray. + * @index: Index into array. + * + * After this function returns, loading from @index will return %NULL. + * If the index is part of a multi-index entry, all indices will be erased + * and none of the entries will be part of a multi-index entry. + * + * Context: Any context. Expects xa_lock to be held on entry. + * Return: The entry which used to be at this index. + */ +void *__xa_erase(struct xarray *xa, unsigned long index) +{ + XA_STATE(xas, xa, index); + return xas_result(&xas, xas_store(&xas, NULL)); +} +EXPORT_SYMBOL(__xa_erase); + +/** + * xa_erase() - Erase this entry from the XArray. + * @xa: XArray. + * @index: Index of entry. + * + * After this function returns, loading from @index will return %NULL. + * If the index is part of a multi-index entry, all indices will be erased + * and none of the entries will be part of a multi-index entry. + * + * Context: Any context. Takes and releases the xa_lock. + * Return: The entry which used to be at this index. + */ +void *xa_erase(struct xarray *xa, unsigned long index) +{ + void *entry; + + xa_lock(xa); + entry = __xa_erase(xa, index); + xa_unlock(xa); + + return entry; +} +EXPORT_SYMBOL(xa_erase); + +/** + * __xa_store() - Store this entry in the XArray. + * @xa: XArray. + * @index: Index into array. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * You must already be holding the xa_lock when calling this function. + * It will drop the lock if needed to allocate memory, and then reacquire + * it afterwards. + * + * Context: Any context. Expects xa_lock to be held on entry. May + * release and reacquire xa_lock if @gfp flags permit. + * Return: The old entry at this index or xa_err() if an error happened. + */ +void *__xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) +{ + XA_STATE(xas, xa, index); + void *curr; + + if (WARN_ON_ONCE(xa_is_advanced(entry))) + return XA_ERROR(-EINVAL); + if (xa_track_free(xa) && !entry) + entry = XA_ZERO_ENTRY; + + do { + curr = xas_store(&xas, entry); + if (xa_track_free(xa)) + xas_clear_mark(&xas, XA_FREE_MARK); + } while (__xas_nomem(&xas, gfp)); + + return xas_result(&xas, curr); +} +EXPORT_SYMBOL(__xa_store); + +/** + * xa_store() - Store this entry in the XArray. + * @xa: XArray. + * @index: Index into array. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * After this function returns, loads from this index will return @entry. + * Storing into an existing multislot entry updates the entry of every index. + * The marks associated with @index are unaffected unless @entry is %NULL. + * + * Context: Any context. Takes and releases the xa_lock. + * May sleep if the @gfp flags permit. + * Return: The old entry at this index on success, xa_err(-EINVAL) if @entry + * cannot be stored in an XArray, or xa_err(-ENOMEM) if memory allocation + * failed. 
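+ *
+ * Since failures are encoded in the returned pointer, callers typically
+ * check them with xa_err(), e.g.:
+ *
+ *	old = xa_store(&xa, index, item, GFP_KERNEL);
+ *	if (xa_err(old))
+ *		return xa_err(old);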
+ */ +void *xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) +{ + void *curr; + + xa_lock(xa); + curr = __xa_store(xa, index, entry, gfp); + xa_unlock(xa); + + return curr; +} +EXPORT_SYMBOL(xa_store); + +/** + * __xa_cmpxchg() - Store this entry in the XArray. + * @xa: XArray. + * @index: Index into array. + * @old: Old value to test against. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * You must already be holding the xa_lock when calling this function. + * It will drop the lock if needed to allocate memory, and then reacquire + * it afterwards. + * + * Context: Any context. Expects xa_lock to be held on entry. May + * release and reacquire xa_lock if @gfp flags permit. + * Return: The old entry at this index or xa_err() if an error happened. + */ +void *__xa_cmpxchg(struct xarray *xa, unsigned long index, + void *old, void *entry, gfp_t gfp) +{ + XA_STATE(xas, xa, index); + void *curr; + + if (WARN_ON_ONCE(xa_is_advanced(entry))) + return XA_ERROR(-EINVAL); + + do { + curr = xas_load(&xas); + if (curr == old) { + xas_store(&xas, entry); + if (xa_track_free(xa) && entry && !curr) + xas_clear_mark(&xas, XA_FREE_MARK); + } + } while (__xas_nomem(&xas, gfp)); + + return xas_result(&xas, curr); +} +EXPORT_SYMBOL(__xa_cmpxchg); + +/** + * __xa_insert() - Store this entry in the XArray if no entry is present. + * @xa: XArray. + * @index: Index into array. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * Inserting a NULL entry will store a reserved entry (like xa_reserve()) + * if no entry is present. Inserting will fail if a reserved entry is + * present, even though loading from this index will return NULL. + * + * Context: Any context. Expects xa_lock to be held on entry. May + * release and reacquire xa_lock if @gfp flags permit. + * Return: 0 if the store succeeded. -EBUSY if another entry was present. + * -ENOMEM if memory could not be allocated. + */ +int __xa_insert(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) +{ + XA_STATE(xas, xa, index); + void *curr; + + if (WARN_ON_ONCE(xa_is_advanced(entry))) + return -EINVAL; + if (!entry) + entry = XA_ZERO_ENTRY; + + do { + curr = xas_load(&xas); + if (!curr) { + xas_store(&xas, entry); + if (xa_track_free(xa)) + xas_clear_mark(&xas, XA_FREE_MARK); + } else { + xas_set_err(&xas, -EBUSY); + } + } while (__xas_nomem(&xas, gfp)); + + return xas_error(&xas); +} +EXPORT_SYMBOL(__xa_insert); + +#ifdef CONFIG_XARRAY_MULTI +static void xas_set_range(struct xa_state *xas, unsigned long first, + unsigned long last) +{ + unsigned int shift = 0; + unsigned long sibs = last - first; + unsigned int offset = XA_CHUNK_MASK; + + xas_set(xas, first); + + while ((first & XA_CHUNK_MASK) == 0) { + if (sibs < XA_CHUNK_MASK) + break; + if ((sibs == XA_CHUNK_MASK) && (offset < XA_CHUNK_MASK)) + break; + shift += XA_CHUNK_SHIFT; + if (offset == XA_CHUNK_MASK) + offset = sibs & XA_CHUNK_MASK; + sibs >>= XA_CHUNK_SHIFT; + first >>= XA_CHUNK_SHIFT; + } + + offset = first & XA_CHUNK_MASK; + if (offset + sibs > XA_CHUNK_MASK) + sibs = XA_CHUNK_MASK - offset; + if ((((first + sibs + 1) << shift) - 1) > last) + sibs -= 1; + + xas->xa_shift = shift; + xas->xa_sibs = sibs; +} + +/** + * xa_store_range() - Store this entry at a range of indices in the XArray. + * @xa: XArray. + * @first: First index to affect. + * @last: Last index to affect. + * @entry: New entry. + * @gfp: Memory allocation flags. 
+ * + * After this function returns, loads from any index between @first and @last, + * inclusive will return @entry. + * Storing into an existing multislot entry updates the entry of every index. + * The marks associated with @index are unaffected unless @entry is %NULL. + * + * Context: Process context. Takes and releases the xa_lock. May sleep + * if the @gfp flags permit. + * Return: %NULL on success, xa_err(-EINVAL) if @entry cannot be stored in + * an XArray, or xa_err(-ENOMEM) if memory allocation failed. + */ +void *xa_store_range(struct xarray *xa, unsigned long first, + unsigned long last, void *entry, gfp_t gfp) +{ + XA_STATE(xas, xa, 0); + + if (WARN_ON_ONCE(xa_is_internal(entry))) + return XA_ERROR(-EINVAL); + if (last < first) + return XA_ERROR(-EINVAL); + + do { + xas_lock(&xas); + if (entry) { + unsigned int order = BITS_PER_LONG; + if (last + 1) + order = __ffs(last + 1); + xas_set_order(&xas, last, order); + xas_create(&xas, true); + if (xas_error(&xas)) + goto unlock; + } + do { + xas_set_range(&xas, first, last); + xas_store(&xas, entry); + if (xas_error(&xas)) + goto unlock; + first += xas_size(&xas); + } while (first <= last); +unlock: + xas_unlock(&xas); + } while (xas_nomem(&xas, gfp)); + + return xas_result(&xas, NULL); +} +EXPORT_SYMBOL(xa_store_range); +#endif /* CONFIG_XARRAY_MULTI */ + +/** + * __xa_alloc() - Find somewhere to store this entry in the XArray. + * @xa: XArray. + * @id: Pointer to ID. + * @limit: Range for allocated ID. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * Finds an empty entry in @xa between @limit.min and @limit.max, + * stores the index into the @id pointer, then stores the entry at + * that index. A concurrent lookup will not see an uninitialised @id. + * + * Context: Any context. Expects xa_lock to be held on entry. May + * release and reacquire xa_lock if @gfp flags permit. + * Return: 0 on success, -ENOMEM if memory could not be allocated or + * -EBUSY if there are no free entries in @limit. + */ +int __xa_alloc(struct xarray *xa, u32 *id, void *entry, + struct xa_limit limit, gfp_t gfp) +{ + XA_STATE(xas, xa, 0); + + if (WARN_ON_ONCE(xa_is_advanced(entry))) + return -EINVAL; + if (WARN_ON_ONCE(!xa_track_free(xa))) + return -EINVAL; + + if (!entry) + entry = XA_ZERO_ENTRY; + + do { + xas.xa_index = limit.min; + xas_find_marked(&xas, limit.max, XA_FREE_MARK); + if (xas.xa_node == XAS_RESTART) + xas_set_err(&xas, -EBUSY); + else + *id = xas.xa_index; + xas_store(&xas, entry); + xas_clear_mark(&xas, XA_FREE_MARK); + } while (__xas_nomem(&xas, gfp)); + + return xas_error(&xas); +} +EXPORT_SYMBOL(__xa_alloc); + +/** + * __xa_alloc_cyclic() - Find somewhere to store this entry in the XArray. + * @xa: XArray. + * @id: Pointer to ID. + * @entry: New entry. + * @limit: Range of allocated ID. + * @next: Pointer to next ID to allocate. + * @gfp: Memory allocation flags. + * + * Finds an empty entry in @xa between @limit.min and @limit.max, + * stores the index into the @id pointer, then stores the entry at + * that index. A concurrent lookup will not see an uninitialised @id. + * The search for an empty entry will start at @next and will wrap + * around if necessary. + * + * Context: Any context. Expects xa_lock to be held on entry. May + * release and reacquire xa_lock if @gfp flags permit. + * Return: 0 if the allocation succeeded without wrapping. 1 if the + * allocation succeeded after wrapping, -ENOMEM if memory could not be + * allocated or -EBUSY if there are no free entries in @limit. 
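+ *
+ * A sketch of a locked caller handing out IDs in [1, 255] with
+ * wrap-around (the array must track free entries, e.g. declared with
+ * DEFINE_XARRAY_ALLOC(), and "next" must persist between calls):
+ *
+ *	xa_lock(&xa);
+ *	err = __xa_alloc_cyclic(&xa, &id, item, XA_LIMIT(1, 255),
+ *				&next, GFP_ATOMIC);
+ *	xa_unlock(&xa);
+ *	if (err < 0)
+ *		return err;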
+ */ +int __xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry, + struct xa_limit limit, u32 *next, gfp_t gfp) +{ + u32 min = limit.min; + int ret; + + limit.min = max(min, *next); + ret = __xa_alloc(xa, id, entry, limit, gfp); + if ((xa->xa_flags & XA_FLAGS_ALLOC_WRAPPED) && ret == 0) { + xa->xa_flags &= ~XA_FLAGS_ALLOC_WRAPPED; + ret = 1; + } + + if (ret < 0 && limit.min > min) { + limit.min = min; + ret = __xa_alloc(xa, id, entry, limit, gfp); + if (ret == 0) + ret = 1; + } + + if (ret >= 0) { + *next = *id + 1; + if (*next == 0) + xa->xa_flags |= XA_FLAGS_ALLOC_WRAPPED; + } + return ret; +} +EXPORT_SYMBOL(__xa_alloc_cyclic); + +/** + * __xa_set_mark() - Set this mark on this entry while locked. + * @xa: XArray. + * @index: Index of entry. + * @mark: Mark number. + * + * Attempting to set a mark on a %NULL entry does not succeed. + * + * Context: Any context. Expects xa_lock to be held on entry. + */ +void __xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) +{ + XA_STATE(xas, xa, index); + void *entry = xas_load(&xas); + + if (entry) + xas_set_mark(&xas, mark); +} +EXPORT_SYMBOL(__xa_set_mark); + +/** + * __xa_clear_mark() - Clear this mark on this entry while locked. + * @xa: XArray. + * @index: Index of entry. + * @mark: Mark number. + * + * Context: Any context. Expects xa_lock to be held on entry. + */ +void __xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) +{ + XA_STATE(xas, xa, index); + void *entry = xas_load(&xas); + + if (entry) + xas_clear_mark(&xas, mark); +} +EXPORT_SYMBOL(__xa_clear_mark); + +/** + * xa_get_mark() - Inquire whether this mark is set on this entry. + * @xa: XArray. + * @index: Index of entry. + * @mark: Mark number. + * + * This function uses the RCU read lock, so the result may be out of date + * by the time it returns. If you need the result to be stable, use a lock. + * + * Context: Any context. Takes and releases the RCU lock. + * Return: True if the entry at @index has this mark set, false if it doesn't. + */ +bool xa_get_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) +{ + XA_STATE(xas, xa, index); + void *entry; + + rcu_read_lock(); + entry = xas_start(&xas); + while (xas_get_mark(&xas, mark)) { + if (!xa_is_node(entry)) + goto found; + entry = xas_descend(&xas, xa_to_node(entry)); + } + rcu_read_unlock(); + return false; + found: + rcu_read_unlock(); + return true; +} +EXPORT_SYMBOL(xa_get_mark); + +/** + * xa_set_mark() - Set this mark on this entry. + * @xa: XArray. + * @index: Index of entry. + * @mark: Mark number. + * + * Attempting to set a mark on a %NULL entry does not succeed. + * + * Context: Process context. Takes and releases the xa_lock. + */ +void xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) +{ + xa_lock(xa); + __xa_set_mark(xa, index, mark); + xa_unlock(xa); +} +EXPORT_SYMBOL(xa_set_mark); + +/** + * xa_clear_mark() - Clear this mark on this entry. + * @xa: XArray. + * @index: Index of entry. + * @mark: Mark number. + * + * Clearing a mark always succeeds. + * + * Context: Process context. Takes and releases the xa_lock. + */ +void xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) +{ + xa_lock(xa); + __xa_clear_mark(xa, index, mark); + xa_unlock(xa); +} +EXPORT_SYMBOL(xa_clear_mark); + +/** + * xa_find() - Search the XArray for an entry. + * @xa: XArray. + * @indexp: Pointer to an index. + * @max: Maximum index to search to. + * @filter: Selection criterion. 
+ * + * Finds the entry in @xa which matches the @filter, and has the lowest + * index that is at least @indexp and no more than @max. + * If an entry is found, @indexp is updated to be the index of the entry. + * This function is protected by the RCU read lock, so it may not find + * entries which are being simultaneously added. It will not return an + * %XA_RETRY_ENTRY; if you need to see retry entries, use xas_find(). + * + * Context: Any context. Takes and releases the RCU lock. + * Return: The entry, if found, otherwise %NULL. + */ +void *xa_find(struct xarray *xa, unsigned long *indexp, + unsigned long max, xa_mark_t filter) +{ + XA_STATE(xas, xa, *indexp); + void *entry; + + rcu_read_lock(); + do { + if ((__force unsigned int)filter < XA_MAX_MARKS) + entry = xas_find_marked(&xas, max, filter); + else + entry = xas_find(&xas, max); + } while (xas_retry(&xas, entry)); + rcu_read_unlock(); + + if (entry) + *indexp = xas.xa_index; + return entry; +} +EXPORT_SYMBOL(xa_find); + +/** + * xa_find_after() - Search the XArray for a present entry. + * @xa: XArray. + * @indexp: Pointer to an index. + * @max: Maximum index to search to. + * @filter: Selection criterion. + * + * Finds the entry in @xa which matches the @filter and has the lowest + * index that is above @indexp and no more than @max. + * If an entry is found, @indexp is updated to be the index of the entry. + * This function is protected by the RCU read lock, so it may miss entries + * which are being simultaneously added. It will not return an + * %XA_RETRY_ENTRY; if you need to see retry entries, use xas_find(). + * + * Context: Any context. Takes and releases the RCU lock. + * Return: The pointer, if found, otherwise %NULL. + */ +void *xa_find_after(struct xarray *xa, unsigned long *indexp, + unsigned long max, xa_mark_t filter) +{ + XA_STATE(xas, xa, *indexp + 1); + void *entry; + + rcu_read_lock(); + for (;;) { + if ((__force unsigned int)filter < XA_MAX_MARKS) + entry = xas_find_marked(&xas, max, filter); + else + entry = xas_find(&xas, max); + if (xas.xa_node == XAS_BOUNDS) + break; + if (xas.xa_shift) { + if (xas.xa_index & ((1UL << xas.xa_shift) - 1)) + continue; + } else { + if (xas.xa_offset < (xas.xa_index & XA_CHUNK_MASK)) + continue; + } + if (!xas_retry(&xas, entry)) + break; + } + rcu_read_unlock(); + + if (entry) + *indexp = xas.xa_index; + return entry; +} +EXPORT_SYMBOL(xa_find_after); + +static unsigned int xas_extract_present(struct xa_state *xas, void **dst, + unsigned long max, unsigned int n) +{ + void *entry; + unsigned int i = 0; + + rcu_read_lock(); + xas_for_each(xas, entry, max) { + if (xas_retry(xas, entry)) + continue; + dst[i++] = entry; + if (i == n) + break; + } + rcu_read_unlock(); + + return i; +} + +static unsigned int xas_extract_marked(struct xa_state *xas, void **dst, + unsigned long max, unsigned int n, xa_mark_t mark) +{ + void *entry; + unsigned int i = 0; + + rcu_read_lock(); + xas_for_each_marked(xas, entry, max, mark) { + if (xas_retry(xas, entry)) + continue; + dst[i++] = entry; + if (i == n) + break; + } + rcu_read_unlock(); + + return i; +} + +/** + * xa_extract() - Copy selected entries from the XArray into a normal array. + * @xa: The source XArray to copy from. + * @dst: The buffer to copy entries into. + * @start: The first index in the XArray eligible to be selected. + * @max: The last index in the XArray eligible to be selected. + * @n: The maximum number of entries to copy. + * @filter: Selection criterion. + * + * Copies up to @n entries that match @filter from the XArray. 
The + * copied entries will have indices between @start and @max, inclusive. + * + * The @filter may be an XArray mark value, in which case entries which are + * marked with that mark will be copied. It may also be %XA_PRESENT, in + * which case all entries which are not %NULL will be copied. + * + * The entries returned may not represent a snapshot of the XArray at a + * moment in time. For example, if another thread stores to index 5, then + * index 10, calling xa_extract() may return the old contents of index 5 + * and the new contents of index 10. Indices not modified while this + * function is running will not be skipped. + * + * If you need stronger guarantees, holding the xa_lock across calls to this + * function will prevent concurrent modification. + * + * Context: Any context. Takes and releases the RCU lock. + * Return: The number of entries copied. + */ +unsigned int xa_extract(struct xarray *xa, void **dst, unsigned long start, + unsigned long max, unsigned int n, xa_mark_t filter) +{ + XA_STATE(xas, xa, start); + + if (!n) + return 0; + + if ((__force unsigned int)filter < XA_MAX_MARKS) + return xas_extract_marked(&xas, dst, max, n, filter); + return xas_extract_present(&xas, dst, max, n); +} +EXPORT_SYMBOL(xa_extract); + +/** + * xa_destroy() - Free all internal data structures. + * @xa: XArray. + * + * After calling this function, the XArray is empty and has freed all memory + * allocated for its internal data structures. You are responsible for + * freeing the objects referenced by the XArray. + * + * Context: Any context. Takes and releases the xa_lock, interrupt-safe. + */ +void xa_destroy(struct xarray *xa) +{ + XA_STATE(xas, xa, 0); + unsigned long flags; + void *entry; + + xas.xa_node = NULL; + xas_lock_irqsave(&xas, flags); + entry = xa_head_locked(xa); + RCU_INIT_POINTER(xa->xa_head, NULL); + xas_init_marks(&xas); + if (xa_zero_busy(xa)) + xa_mark_clear(xa, XA_FREE_MARK); + /* lockdep checks we're still holding the lock in xas_free_nodes() */ + if (xa_is_node(entry)) + xas_free_nodes(&xas, xa_to_node(entry)); + xas_unlock_irqrestore(&xas, flags); +} +EXPORT_SYMBOL(xa_destroy); + +#ifdef XA_DEBUG +void xa_dump_node(const struct xa_node *node) +{ + unsigned i, j; + + if (!node) + return; + if ((unsigned long)node & 3) { + pr_cont("node %px\n", node); + return; + } + + pr_cont("node %px %s %d parent %px shift %d count %d values %d " + "array %px list %px %px marks", + node, node->parent ? 
"offset" : "max", node->offset, + node->parent, node->shift, node->count, node->nr_values, + node->array, node->private_list.prev, node->private_list.next); + for (i = 0; i < XA_MAX_MARKS; i++) + for (j = 0; j < XA_MARK_LONGS; j++) + pr_cont(" %lx", node->marks[i][j]); + pr_cont("\n"); +} + +void xa_dump_index(unsigned long index, unsigned int shift) +{ + if (!shift) + pr_info("%lu: ", index); + else if (shift >= BITS_PER_LONG) + pr_info("0-%lu: ", ~0UL); + else + pr_info("%lu-%lu: ", index, index | ((1UL << shift) - 1)); +} + +void xa_dump_entry(const void *entry, unsigned long index, unsigned long shift) +{ + if (!entry) + return; + + xa_dump_index(index, shift); + + if (xa_is_node(entry)) { + if (shift == 0) { + pr_cont("%px\n", entry); + } else { + unsigned long i; + struct xa_node *node = xa_to_node(entry); + xa_dump_node(node); + for (i = 0; i < XA_CHUNK_SIZE; i++) + xa_dump_entry(node->slots[i], + index + (i << node->shift), node->shift); + } + } else if (xa_is_value(entry)) + pr_cont("value %ld (0x%lx) [%px]\n", xa_to_value(entry), + xa_to_value(entry), entry); + else if (!xa_is_internal(entry)) + pr_cont("%px\n", entry); + else if (xa_is_retry(entry)) + pr_cont("retry (%ld)\n", xa_to_internal(entry)); + else if (xa_is_sibling(entry)) + pr_cont("sibling (slot %ld)\n", xa_to_sibling(entry)); + else if (xa_is_zero(entry)) + pr_cont("zero (%ld)\n", xa_to_internal(entry)); + else + pr_cont("UNKNOWN ENTRY (%px)\n", entry); +} + +void xa_dump(const struct xarray *xa) +{ + void *entry = xa->xa_head; + unsigned int shift = 0; + + pr_info("xarray: %px head %px flags %x marks %d %d %d\n", xa, entry, + xa->xa_flags, xa_marked(xa, XA_MARK_0), + xa_marked(xa, XA_MARK_1), xa_marked(xa, XA_MARK_2)); + if (xa_is_node(entry)) + shift = xa_to_node(entry)->shift + XA_CHUNK_SHIFT; + xa_dump_entry(entry, 0, shift); +} +#endif +#endif /* !HAVE_XARRAY_SUPPORT */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/module.c b/drivers/staging/lustrefsx/libcfs/libcfs/module.c new file mode 100644 index 0000000000000..82a75a7fd3e43 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/module.c @@ -0,0 +1,936 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include +#include +#include "tracefile.h" + +struct lnet_debugfs_symlink_def { + const char *name; + const char *target; +}; + +static struct dentry *lnet_debugfs_root; + +BLOCKING_NOTIFIER_HEAD(libcfs_ioctl_list); +EXPORT_SYMBOL(libcfs_ioctl_list); + +static inline size_t libcfs_ioctl_packlen(struct libcfs_ioctl_data *data) +{ + size_t len = sizeof(*data); + + len += (data->ioc_inllen1 + 7) & ~7; + len += (data->ioc_inllen2 + 7) & ~7; + return len; +} + +static bool libcfs_ioctl_is_invalid(struct libcfs_ioctl_data *data) +{ + const int maxlen = 1 << 30; + if (data->ioc_hdr.ioc_len > maxlen) + return true; + + if (data->ioc_inllen1 > maxlen) + return true; + + if (data->ioc_inllen2 > maxlen) + return true; + + if (data->ioc_inlbuf1 && !data->ioc_inllen1) + return true; + + if (data->ioc_inlbuf2 && !data->ioc_inllen2) + return true; + + if (data->ioc_pbuf1 && !data->ioc_plen1) + return true; + + if (data->ioc_pbuf2 && !data->ioc_plen2) + return true; + + if (data->ioc_plen1 && !data->ioc_pbuf1) + return true; + + if (data->ioc_plen2 && !data->ioc_pbuf2) + return true; + + if (libcfs_ioctl_packlen(data) != data->ioc_hdr.ioc_len) + return true; + + if (data->ioc_inllen1 && + data->ioc_bulk[((data->ioc_inllen1 + 7) & ~7) + + data->ioc_inllen2 - 1] != '\0') + return true; + + return false; +} + +int libcfs_ioctl_data_adjust(struct libcfs_ioctl_data *data) +{ + ENTRY; + + if (libcfs_ioctl_is_invalid(data)) { + CERROR("libcfs ioctl: parameter not correctly formatted\n"); + RETURN(-EINVAL); + } + + if (data->ioc_inllen1 != 0) + data->ioc_inlbuf1 = &data->ioc_bulk[0]; + + if (data->ioc_inllen2 != 0) + data->ioc_inlbuf2 = &data->ioc_bulk[0] + + cfs_size_round(data->ioc_inllen1); + + RETURN(0); +} + +int libcfs_ioctl_getdata(struct libcfs_ioctl_hdr **hdr_pp, + struct libcfs_ioctl_hdr __user *uhdr) +{ + struct libcfs_ioctl_hdr hdr; + int err; + + ENTRY; + if (copy_from_user(&hdr, uhdr, sizeof(hdr))) + RETURN(-EFAULT); + + if (hdr.ioc_version != LIBCFS_IOCTL_VERSION && + hdr.ioc_version != LIBCFS_IOCTL_VERSION2) { + CERROR("libcfs ioctl: version mismatch expected %#x, got %#x\n", + LIBCFS_IOCTL_VERSION, hdr.ioc_version); + RETURN(-EINVAL); + } + + if (hdr.ioc_len < sizeof(struct libcfs_ioctl_hdr)) { + CERROR("libcfs ioctl: user buffer too small for ioctl\n"); + RETURN(-EINVAL); + } + + if (hdr.ioc_len > LIBCFS_IOC_DATA_MAX) { + CERROR("libcfs ioctl: user buffer is too large %d/%d\n", + hdr.ioc_len, LIBCFS_IOC_DATA_MAX); + RETURN(-EINVAL); + } + + LIBCFS_ALLOC(*hdr_pp, hdr.ioc_len); + if (*hdr_pp == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(*hdr_pp, uhdr, hdr.ioc_len)) + GOTO(free, err = -EFAULT); + + if ((*hdr_pp)->ioc_version != hdr.ioc_version || + (*hdr_pp)->ioc_len != hdr.ioc_len) { + GOTO(free, err = -EINVAL); + } + + RETURN(0); + +free: + LIBCFS_FREE(*hdr_pp, hdr.ioc_len); + RETURN(err); +} + +static int libcfs_ioctl(unsigned long cmd, void __user *uparam) +{ + struct libcfs_ioctl_data *data = NULL; + struct libcfs_ioctl_hdr *hdr; + int err; + ENTRY; + + /* 'cmd' and permissions get checked in our arch-specific caller */ + err = libcfs_ioctl_getdata(&hdr, uparam); + if (err != 0) { + CDEBUG_LIMIT(D_ERROR, + "libcfs ioctl: data header error %d\n", err); + RETURN(err); + } + + if 
(hdr->ioc_version == LIBCFS_IOCTL_VERSION) { + /* The libcfs_ioctl_data_adjust() function performs adjustment + * operations on the libcfs_ioctl_data structure to make + * it usable by the code. This doesn't need to be called + * for new data structures added. */ + data = container_of(hdr, struct libcfs_ioctl_data, ioc_hdr); + err = libcfs_ioctl_data_adjust(data); + if (err != 0) + GOTO(out, err); + } + + CDEBUG(D_IOCTL, "libcfs ioctl cmd %lu\n", cmd); + switch (cmd) { + case IOC_LIBCFS_CLEAR_DEBUG: + libcfs_debug_clear_buffer(); + break; + case IOC_LIBCFS_MARK_DEBUG: + if (data == NULL || + data->ioc_inlbuf1 == NULL || + data->ioc_inlbuf1[data->ioc_inllen1 - 1] != '\0') + GOTO(out, err = -EINVAL); + + libcfs_debug_mark_buffer(data->ioc_inlbuf1); + break; + + default: + err = blocking_notifier_call_chain(&libcfs_ioctl_list, + cmd, hdr); + if (!(err & NOTIFY_STOP_MASK)) + /* No-one claimed the ioctl */ + err = -EINVAL; + else + err = notifier_to_errno(err); + if (copy_to_user(uparam, hdr, hdr->ioc_len) && !err) + err = -EFAULT; + break; + } +out: + LIBCFS_FREE(hdr, hdr->ioc_len); + RETURN(err); +} + +static long +libcfs_psdev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (_IOC_TYPE(cmd) != IOC_LIBCFS_TYPE || + _IOC_NR(cmd) < IOC_LIBCFS_MIN_NR || + _IOC_NR(cmd) > IOC_LIBCFS_MAX_NR) { + CDEBUG(D_IOCTL, "invalid ioctl ( type %d, nr %d, size %d )\n", + _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd)); + return -EINVAL; + } + + return libcfs_ioctl(cmd, (void __user *)arg); +} + +static const struct file_operations libcfs_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = libcfs_psdev_ioctl, +}; + +static struct miscdevice libcfs_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "lnet", + .fops = &libcfs_fops, +}; + +static int proc_dobitmasks(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + const int tmpstrlen = 512; + char *tmpstr = NULL; + int rc; + size_t nob = *lenp; + loff_t pos = *ppos; + unsigned int *mask = table->data; + int is_subsys = (mask == &libcfs_subsystem_debug) ? 1 : 0; + int is_printk = (mask == &libcfs_printk) ? 
1 : 0; + + if (!write) { + tmpstr = kmalloc(tmpstrlen, GFP_KERNEL | __GFP_ZERO); + if (!tmpstr) + return -ENOMEM; + libcfs_debug_mask2str(tmpstr, tmpstrlen, *mask, is_subsys); + rc = strlen(tmpstr); + + if (pos >= rc) { + rc = 0; + } else { + rc = cfs_trace_copyout_string(buffer, nob, + tmpstr + pos, "\n"); + } + } else { + tmpstr = memdup_user_nul(buffer, nob); + if (IS_ERR(tmpstr)) + return PTR_ERR(tmpstr); + + rc = libcfs_debug_str2mask(mask, strim(tmpstr), is_subsys); + /* Always print LBUG/LASSERT to console, so keep this mask */ + if (is_printk) + *mask |= D_EMERG; + } + + kfree(tmpstr); + return rc; +} + +static int min_watchdog_ratelimit; /* disable ratelimiting */ +static int max_watchdog_ratelimit = (24*60*60); /* limit to once per day */ + +static int proc_dump_kernel(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + size_t nob = *lenp; + + if (!write) + return 0; + + return cfs_trace_dump_debug_buffer_usrstr(buffer, nob); +} + +static int proc_daemon_file(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + size_t nob = *lenp; + loff_t pos = *ppos; + + if (!write) { + int len = strlen(cfs_tracefile); + + if (pos >= len) + return 0; + + return cfs_trace_copyout_string(buffer, nob, + cfs_tracefile + pos, "\n"); + } + + return cfs_trace_daemon_command_usrstr(buffer, nob); +} + +static int libcfs_force_lbug(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + if (write) + LBUG(); + return 0; +} + +static int proc_fail_loc(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int rc; + long old_fail_loc = cfs_fail_loc; + + if (!*lenp || *ppos) { + *lenp = 0; + return 0; + } + + if (write) { + char *kbuf = memdup_user_nul(buffer, *lenp); + + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + rc = kstrtoul(kbuf, 0, &cfs_fail_loc); + kfree(kbuf); + *ppos += *lenp; + } else { + char kbuf[64/3+3]; + + rc = scnprintf(kbuf, sizeof(kbuf), "%lu\n", cfs_fail_loc); + if (copy_to_user(buffer, kbuf, rc)) + rc = -EFAULT; + else { + *lenp = rc; + *ppos += rc; + } + } + + if (old_fail_loc != cfs_fail_loc) { + cfs_race_state = 1; + wake_up(&cfs_race_waitq); + } + return rc; +} + +int debugfs_doint(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int rc; + + if (!*lenp || *ppos) { + *lenp = 0; + return 0; + } + + if (write) { + char *kbuf = memdup_user_nul(buffer, *lenp); + int val; + + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + rc = kstrtoint(kbuf, 0, &val); + kfree(kbuf); + if (!rc) { + if (table->extra1 && val < *(int *)table->extra1) + val = *(int *)table->extra1; + if (table->extra2 && val > *(int *)table->extra2) + val = *(int *)table->extra2; + *(int *)table->data = val; + } + *ppos += *lenp; + } else { + char kbuf[64/3+3]; + + rc = scnprintf(kbuf, sizeof(kbuf), "%u\n", *(int *)table->data); + if (copy_to_user(buffer, kbuf, rc)) + rc = -EFAULT; + else { + *lenp = rc; + *ppos += rc; + } + } + + return rc; +} +EXPORT_SYMBOL(debugfs_doint); + +static int debugfs_dou64(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int rc; + + if (!*lenp || *ppos) { + *lenp = 0; + return 0; + } + + if (write) { + char *kbuf = memdup_user_nul(buffer, *lenp); + unsigned long long val; + + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + rc = kstrtoull(kbuf, 0, &val); + kfree(kbuf); + if (!rc) + *(u64 *)table->data = val; + *ppos += *lenp; + } else { + char kbuf[64/3+3]; + + rc = 
scnprintf(kbuf, sizeof(kbuf), "%llu\n", + (unsigned long long)*(u64 *)table->data); + if (copy_to_user(buffer, kbuf, rc)) + rc = -EFAULT; + else { + *lenp = rc; + *ppos += rc; + } + } + + return rc; +} + +static int debugfs_dostring(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int len = *lenp; + char *kbuf = table->data; + + if (!len || *ppos) { + *lenp = 0; + return 0; + } + if (len > table->maxlen) + len = table->maxlen; + if (write) { + if (copy_from_user(kbuf, buffer, len)) + return -EFAULT; + memset(kbuf+len, 0, table->maxlen - len); + *ppos = *lenp; + } else { + len = strnlen(kbuf, len); + if (copy_to_user(buffer, kbuf, len)) + return -EFAULT; + if (len < *lenp) { + if (copy_to_user(buffer+len, "\n", 1)) + return -EFAULT; + len += 1; + } + *ppos += len; + *lenp -= len; + } + return len; +} + +static int proc_cpt_table(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + size_t nob = *lenp; + loff_t pos = *ppos; + char *buf = NULL; + int len = 4096; + int rc = 0; + + if (write) + return -EPERM; + + while (1) { + LIBCFS_ALLOC(buf, len); + if (buf == NULL) + return -ENOMEM; + + rc = cfs_cpt_table_print(cfs_cpt_tab, buf, len); + if (rc >= 0) + break; + + if (rc == -EFBIG) { + LIBCFS_FREE(buf, len); + len <<= 1; + continue; + } + goto out; + } + + if (pos >= rc) { + rc = 0; + goto out; + } + + rc = cfs_trace_copyout_string(buffer, nob, buf + pos, NULL); +out: + if (buf != NULL) + LIBCFS_FREE(buf, len); + return rc; +} + +static int proc_cpt_distance(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + size_t nob = *lenp; + loff_t pos = *ppos; + char *buf = NULL; + int len = 4096; + int rc = 0; + + if (write) + return -EPERM; + + while (1) { + LIBCFS_ALLOC(buf, len); + if (buf == NULL) + return -ENOMEM; + + rc = cfs_cpt_distance_print(cfs_cpt_tab, buf, len); + if (rc >= 0) + break; + + if (rc == -EFBIG) { + LIBCFS_FREE(buf, len); + len <<= 1; + continue; + } + goto out; + } + + if (pos >= rc) { + rc = 0; + goto out; + } + + rc = cfs_trace_copyout_string(buffer, nob, buf + pos, NULL); + out: + if (buf != NULL) + LIBCFS_FREE(buf, len); + return rc; +} + +static struct ctl_table lnet_table[] = { + { + .procname = "debug", + .data = &libcfs_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dobitmasks, + }, + { + .procname = "subsystem_debug", + .data = &libcfs_subsystem_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dobitmasks, + }, + { + .procname = "printk", + .data = &libcfs_printk, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dobitmasks, + }, + { + .procname = "cpu_partition_table", + .maxlen = 128, + .mode = 0444, + .proc_handler = &proc_cpt_table, + }, + { + .procname = "cpu_partition_distance", + .maxlen = 128, + .mode = 0444, + .proc_handler = &proc_cpt_distance, + }, + { + .procname = "debug_log_upcall", + .data = lnet_debug_log_upcall, + .maxlen = sizeof(lnet_debug_log_upcall), + .mode = 0644, + .proc_handler = &debugfs_dostring, + }, + { + .procname = "lnet_memused", + .data = (u64 *)&libcfs_kmem.counter, + .maxlen = sizeof(u64), + .mode = 0444, + .proc_handler = &debugfs_dou64, + }, + { + .procname = "catastrophe", + .data = &libcfs_catastrophe, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &debugfs_doint, + }, + { + .procname = "dump_kernel", + .maxlen = 256, + .mode = 0200, + .proc_handler = &proc_dump_kernel, + }, + { + .procname = "daemon_file", + .mode = 0644, + .maxlen = 
256, + .proc_handler = &proc_daemon_file, + }, + { + .procname = "watchdog_ratelimit", + .data = &libcfs_watchdog_ratelimit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &debugfs_doint, + .extra1 = &min_watchdog_ratelimit, + .extra2 = &max_watchdog_ratelimit, + }, + { + .procname = "force_lbug", + .data = NULL, + .maxlen = 0, + .mode = 0200, + .proc_handler = &libcfs_force_lbug + }, + { + .procname = "fail_loc", + .data = &cfs_fail_loc, + .maxlen = sizeof(cfs_fail_loc), + .mode = 0644, + .proc_handler = &proc_fail_loc + }, + { + .procname = "fail_val", + .data = &cfs_fail_val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &debugfs_doint + }, + { + .procname = "fail_err", + .data = &cfs_fail_err, + .maxlen = sizeof(cfs_fail_err), + .mode = 0644, + .proc_handler = &debugfs_doint, + }, + { + } +}; + +static const struct lnet_debugfs_symlink_def lnet_debugfs_symlinks[] = { + { .name = "console_ratelimit", + .target = "../../../module/libcfs/parameters/libcfs_console_ratelimit" }, + { .name = "debug_path", + .target = "../../../module/libcfs/parameters/libcfs_debug_file_path" }, + { .name = "panic_on_lbug", + .target = "../../../module/libcfs/parameters/libcfs_panic_on_lbug" }, + { .name = "console_backoff", + .target = "../../../module/libcfs/parameters/libcfs_console_backoff" }, + { .name = "debug_mb", + .target = "../../../module/libcfs/parameters/libcfs_debug_mb" }, + { .name = "console_min_delay_centisecs", + .target = "../../../module/libcfs/parameters/libcfs_console_min_delay" }, + { .name = "console_max_delay_centisecs", + .target = "../../../module/libcfs/parameters/libcfs_console_max_delay" }, + { .name = NULL }, +}; + +static ssize_t lnet_debugfs_read(struct file *filp, char __user *buf, + size_t count, loff_t *ppos) +{ + struct ctl_table *table = filp->private_data; + loff_t old_pos = *ppos; + ssize_t rc = -EINVAL; + + if (table) + rc = table->proc_handler(table, 0, (void __user *)buf, + &count, ppos); + /* + * On success, the length read is either in error or in count. 
+ * If ppos changed, then use count, else use the return value
+ */
+	if (!rc && *ppos != old_pos)
+		rc = count;
+	else if (rc > 0)
+		*ppos += rc;
+
+	return rc;
+}
+
+static ssize_t lnet_debugfs_write(struct file *filp, const char __user *buf,
+				  size_t count, loff_t *ppos)
+{
+	struct ctl_table *table = filp->private_data;
+	loff_t old_pos = *ppos;
+	ssize_t rc = -EINVAL;
+
+	if (table)
+		rc = table->proc_handler(table, 1, (void __user *)buf, &count,
+					 ppos);
+	if (rc)
+		return rc;
+
+	if (*ppos == old_pos)
+		*ppos += count;
+
+	return count;
+}
+
+static const struct file_operations lnet_debugfs_file_operations_rw = {
+	.open		= simple_open,
+	.read		= lnet_debugfs_read,
+	.write		= lnet_debugfs_write,
+	.llseek		= default_llseek,
+};
+
+static const struct file_operations lnet_debugfs_file_operations_ro = {
+	.open		= simple_open,
+	.read		= lnet_debugfs_read,
+	.llseek		= default_llseek,
+};
+
+static const struct file_operations lnet_debugfs_file_operations_wo = {
+	.open		= simple_open,
+	.write		= lnet_debugfs_write,
+	.llseek		= default_llseek,
+};
+
+static const struct file_operations *lnet_debugfs_fops_select(umode_t mode)
+{
+	if (!(mode & S_IWUGO))
+		return &lnet_debugfs_file_operations_ro;
+
+	if (!(mode & S_IRUGO))
+		return &lnet_debugfs_file_operations_wo;
+
+	return &lnet_debugfs_file_operations_rw;
+}
+
+void lnet_insert_debugfs(struct ctl_table *table)
+{
+	if (!lnet_debugfs_root)
+		lnet_debugfs_root = debugfs_create_dir("lnet", NULL);
+
+	/* Even if we cannot create, just ignore it altogether. */
+	if (IS_ERR_OR_NULL(lnet_debugfs_root))
+		return;
+
+	/* We don't save the dentry returned in next two calls, because
+	 * we don't call debugfs_remove() but rather remove_recursive()
+	 */
+	for (; table && table->procname; table++)
+		debugfs_create_file(table->procname, table->mode,
+				    lnet_debugfs_root, table,
+				    lnet_debugfs_fops_select(table->mode));
+}
+EXPORT_SYMBOL_GPL(lnet_insert_debugfs);
+
+static void lnet_insert_debugfs_links(
+		const struct lnet_debugfs_symlink_def *symlinks)
+{
+	for (; symlinks && symlinks->name; symlinks++)
+		debugfs_create_symlink(symlinks->name, lnet_debugfs_root,
+				       symlinks->target);
+}
+
+void lnet_remove_debugfs(struct ctl_table *table)
+{
+	for (; table && table->procname; table++) {
+		struct qstr dname = QSTR_INIT(table->procname,
+					      strlen(table->procname));
+		struct dentry *dentry;
+
+		dentry = d_hash_and_lookup(lnet_debugfs_root, &dname);
+		debugfs_remove(dentry);
+	}
+}
+EXPORT_SYMBOL_GPL(lnet_remove_debugfs);
+
+static int __init libcfs_init(void)
+{
+	int rc;
+
+	cfs_arch_init();
+
+	init_libcfs_vfree_atomic();
+
+	rc = libcfs_debug_init(5 * 1024 * 1024);
+	if (rc < 0) {
+		pr_err("LustreError: libcfs_debug_init: rc = %d\n", rc);
+		return rc;
+	}
+
+	cfs_debug_init();
+
+	rc = cfs_cpu_init();
+	if (rc != 0)
+		goto cleanup_debug;
+
+	rc = misc_register(&libcfs_dev);
+	if (rc) {
+		CERROR("misc_register: error %d\n", rc);
+		goto cleanup_cpu;
+	}
+
+	rc = cfs_wi_startup();
+	if (rc) {
+		CERROR("initialize workitem: error %d\n", rc);
+		goto cleanup_deregister;
+	}
+
+	cfs_rehash_wq = alloc_workqueue("cfs_rh", WQ_SYSFS, 4);
+	if (!cfs_rehash_wq) {
+		rc = -ENOMEM;
+		CERROR("libcfs: failed to start rehash workqueue: rc = %d\n",
+		       rc);
+		goto cleanup_deregister;
+	}
+
+	rc = cfs_crypto_register();
+	if (rc) {
+		CERROR("cfs_crypto_register: error %d\n", rc);
+		goto cleanup_wi;
+	}
+
+	lnet_insert_debugfs(lnet_table);
+	if (!IS_ERR_OR_NULL(lnet_debugfs_root))
+		lnet_insert_debugfs_links(lnet_debugfs_symlinks);
+
+	rc = llcrypt_init();
+	if (rc) {
+		CERROR("llcrypt_init: error %d\n", rc);
+		goto cleanup_crypto;
+	}
+
+	CDEBUG(D_OTHER, "portals setup OK\n");
+	return 0;
+cleanup_crypto:
+	cfs_crypto_unregister();
+cleanup_wi:
+	cfs_wi_shutdown();
+cleanup_deregister:
+	misc_deregister(&libcfs_dev);
+cleanup_cpu:
+	cfs_cpu_fini();
+cleanup_debug:
+	libcfs_debug_cleanup();
+	return rc;
+}
+
+static void __exit libcfs_exit(void)
+{
+	int rc;
+
+	/* Remove everything */
+	debugfs_remove_recursive(lnet_debugfs_root);
+	lnet_debugfs_root = NULL;
+
+	CDEBUG(D_MALLOC, "before Portals cleanup: kmem %lld\n",
+	       libcfs_kmem_read());
+
+	llcrypt_exit();
+
+	if (cfs_rehash_wq) {
+		destroy_workqueue(cfs_rehash_wq);
+		cfs_rehash_wq = NULL;
+	}
+
+	cfs_crypto_unregister();
+	cfs_wi_shutdown();
+
+	misc_deregister(&libcfs_dev);
+
+	cfs_cpu_fini();
+
+	/* the below message is checked in test-framework.sh check_mem_leak() */
+	if (libcfs_kmem_read() != 0)
+		CERROR("Portals memory leaked: %lld bytes\n",
+		       libcfs_kmem_read());
+
+	rc = libcfs_debug_cleanup();
+	if (rc)
+		pr_err("LustreError: libcfs_debug_cleanup: rc = %d\n", rc);
+
+	exit_libcfs_vfree_atomic();
+}
+
+MODULE_AUTHOR("OpenSFS, Inc. ");
+MODULE_DESCRIPTION("Lustre helper library");
+MODULE_VERSION(LIBCFS_VERSION);
+MODULE_LICENSE("GPL");
+
+module_init(libcfs_init);
+module_exit(libcfs_exit);
diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.c b/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.c
new file mode 100644
index 0000000000000..ac473c5eae651
--- /dev/null
+++ b/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.c
@@ -0,0 +1,1213 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * libcfs/libcfs/tracefile.c
+ *
+ * Author: Zach Brown
+ * Author: Phil Schwan
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include "tracefile.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+enum cfs_trace_buf_type {
+	CFS_TCD_TYPE_PROC = 0,
+	CFS_TCD_TYPE_SOFTIRQ,
+	CFS_TCD_TYPE_IRQ,
+	CFS_TCD_TYPE_CNT
+};
+
+union cfs_trace_data_union (*cfs_trace_data[CFS_TCD_TYPE_CNT])[NR_CPUS] __cacheline_aligned;
+
+/* Pages containing records already processed by the daemon.
+ * Link via ->lru, use size in ->private
+ */
+static LIST_HEAD(daemon_pages);
+static long daemon_pages_count;
+static long daemon_pages_max;
+
+char cfs_tracefile[TRACEFILE_NAME_SIZE];
+long long cfs_tracefile_size = CFS_TRACEFILE_SIZE;
+
+struct task_struct *tctl_task;
+
+static atomic_t cfs_tage_allocated = ATOMIC_INIT(0);
+static DECLARE_RWSEM(cfs_tracefile_sem);
+
+/* trace file lock routines */
+/* The walking argument indicates that the caller is iterating over all
+ * tcd types, so we must take the lock and disable local irqs to avoid
+ * deadlocks with other interrupt locks that might be happening. See
+ * LU-1311 for details.
+ */
+int cfs_trace_lock_tcd(struct cfs_trace_cpu_data *tcd, int walking)
+	__acquires(&tcd->tcd_lock)
+{
+	__LASSERT(tcd->tcd_type < CFS_TCD_TYPE_CNT);
+	if (tcd->tcd_type == CFS_TCD_TYPE_IRQ)
+		spin_lock_irqsave(&tcd->tcd_lock, tcd->tcd_lock_flags);
+	else if (tcd->tcd_type == CFS_TCD_TYPE_SOFTIRQ)
+		spin_lock_bh(&tcd->tcd_lock);
+	else if (unlikely(walking))
+		spin_lock_irq(&tcd->tcd_lock);
+	else
+		spin_lock(&tcd->tcd_lock);
+	return 1;
+}
+
+void cfs_trace_unlock_tcd(struct cfs_trace_cpu_data *tcd, int walking)
+	__releases(&tcd->tcd_lock)
+{
+	__LASSERT(tcd->tcd_type < CFS_TCD_TYPE_CNT);
+	if (tcd->tcd_type == CFS_TCD_TYPE_IRQ)
+		spin_unlock_irqrestore(&tcd->tcd_lock, tcd->tcd_lock_flags);
+	else if (tcd->tcd_type == CFS_TCD_TYPE_SOFTIRQ)
+		spin_unlock_bh(&tcd->tcd_lock);
+	else if (unlikely(walking))
+		spin_unlock_irq(&tcd->tcd_lock);
+	else
+		spin_unlock(&tcd->tcd_lock);
+}
+
+#define cfs_tcd_for_each(tcd, i, j)					\
+	for (i = 0; i < CFS_TCD_TYPE_CNT && cfs_trace_data[i]; i++)	\
+		for (j = 0, ((tcd) = &(*cfs_trace_data[i])[j].tcd);	\
+		     j < num_possible_cpus();				\
+		     j++, (tcd) = &(*cfs_trace_data[i])[j].tcd)
+
+#define cfs_tcd_for_each_type_lock(tcd, i, cpu)				\
+	for (i = 0; i < CFS_TCD_TYPE_CNT && cfs_trace_data[i] &&	\
+	     (tcd = &(*cfs_trace_data[i])[cpu].tcd) &&			\
+	     cfs_trace_lock_tcd(tcd, 1); cfs_trace_unlock_tcd(tcd, 1), i++)
+
+enum cfs_trace_buf_type cfs_trace_buf_idx_get(void)
+{
+	if (in_irq())
+		return CFS_TCD_TYPE_IRQ;
+	if (in_softirq())
+		return CFS_TCD_TYPE_SOFTIRQ;
+	return CFS_TCD_TYPE_PROC;
+}
+
+static inline struct cfs_trace_cpu_data *
+cfs_trace_get_tcd(void)
+{
+	struct cfs_trace_cpu_data *tcd =
+		&(*cfs_trace_data[cfs_trace_buf_idx_get()])[get_cpu()].tcd;
+
+	cfs_trace_lock_tcd(tcd, 0);
+
+	return tcd;
+}
+
+static inline void cfs_trace_put_tcd(struct cfs_trace_cpu_data *tcd)
+{
+	cfs_trace_unlock_tcd(tcd, 0);
+
+	put_cpu();
+}
+
+static inline struct cfs_trace_page *
+cfs_tage_from_list(struct list_head *list)
+{
+	return list_entry(list, struct cfs_trace_page, linkage);
+}
+
+static struct cfs_trace_page *cfs_tage_alloc(gfp_t gfp)
+{
+	struct page *page;
+	struct cfs_trace_page *tage;
+
+	/* My caller is trying to free memory */
+	if (!in_interrupt() && (current->flags & PF_MEMALLOC))
+		return NULL;
+
+	/*
+	 * Don't spam the console with allocation failures: they will be
+	 * reported by the upper layer anyway.
+ */ + gfp |= __GFP_NOWARN; + page = alloc_page(gfp); + if (page == NULL) + return NULL; + + tage = kmalloc(sizeof(*tage), gfp); + if (tage == NULL) { + __free_page(page); + return NULL; + } + + tage->page = page; + atomic_inc(&cfs_tage_allocated); + return tage; +} + +static void cfs_tage_free(struct cfs_trace_page *tage) +{ + __LASSERT(tage != NULL); + __LASSERT(tage->page != NULL); + + __free_page(tage->page); + kfree(tage); + atomic_dec(&cfs_tage_allocated); +} + +static void cfs_tage_to_tail(struct cfs_trace_page *tage, + struct list_head *queue) +{ + __LASSERT(tage != NULL); + __LASSERT(queue != NULL); + + list_move_tail(&tage->linkage, queue); +} + +/* return a page that has 'len' bytes left at the end */ +static struct cfs_trace_page * +cfs_trace_get_tage_try(struct cfs_trace_cpu_data *tcd, unsigned long len) +{ + struct cfs_trace_page *tage; + struct task_struct *tsk; + + if (tcd->tcd_cur_pages > 0) { + __LASSERT(!list_empty(&tcd->tcd_pages)); + tage = cfs_tage_from_list(tcd->tcd_pages.prev); + if (tage->used + len <= PAGE_SIZE) + return tage; + } + + if (tcd->tcd_cur_pages < tcd->tcd_max_pages) { + if (tcd->tcd_cur_stock_pages > 0) { + tage = cfs_tage_from_list(tcd->tcd_stock_pages.prev); + --tcd->tcd_cur_stock_pages; + list_del_init(&tage->linkage); + } else { + tage = cfs_tage_alloc(GFP_ATOMIC); + if (unlikely(tage == NULL)) { + if ((!(current->flags & PF_MEMALLOC) || + in_interrupt()) && printk_ratelimit()) + pr_warn("Lustre: cannot allocate a tage (%ld)\n", + tcd->tcd_cur_pages); + return NULL; + } + } + + tage->used = 0; + tage->cpu = smp_processor_id(); + tage->type = tcd->tcd_type; + list_add_tail(&tage->linkage, &tcd->tcd_pages); + tcd->tcd_cur_pages++; + + tsk = tctl_task; + if (tcd->tcd_cur_pages > 8 && tsk) + /* + * wake up tracefiled to process some pages. + */ + wake_up_process(tsk); + + return tage; + } + return NULL; +} + +static void cfs_tcd_shrink(struct cfs_trace_cpu_data *tcd) +{ + int pgcount = tcd->tcd_cur_pages / 10; + struct page_collection pc; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + + /* + * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. + */ + + if (printk_ratelimit()) + pr_warn("Lustre: debug daemon buffer overflowed; discarding 10%% of pages (%d of %ld)\n", + pgcount + 1, tcd->tcd_cur_pages); + + INIT_LIST_HEAD(&pc.pc_pages); + + list_for_each_entry_safe(tage, tmp, &tcd->tcd_pages, linkage) { + if (pgcount-- == 0) + break; + + list_del(&tage->linkage); + cfs_tage_free(tage); + tcd->tcd_cur_pages--; + } +} + +/* return a page that has 'len' bytes left at the end */ +static struct cfs_trace_page *cfs_trace_get_tage(struct cfs_trace_cpu_data *tcd, + unsigned long len) +{ + struct cfs_trace_page *tage; + + /* + * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. 
+ */ + + if (len > PAGE_SIZE) { + pr_err("LustreError: cowardly refusing to write %lu bytes in a page\n", + len); + return NULL; + } + + tage = cfs_trace_get_tage_try(tcd, len); + if (tage != NULL) + return tage; + if (tctl_task) + cfs_tcd_shrink(tcd); + if (tcd->tcd_cur_pages > 0) { + tage = cfs_tage_from_list(tcd->tcd_pages.next); + tage->used = 0; + cfs_tage_to_tail(tage, &tcd->tcd_pages); + } + return tage; +} + +static void cfs_set_ptldebug_header(struct ptldebug_header *header, + struct libcfs_debug_msg_data *msgdata, + unsigned long stack) +{ + struct timespec64 ts; + + ktime_get_real_ts64(&ts); + + header->ph_subsys = msgdata->msg_subsys; + header->ph_mask = msgdata->msg_mask; + header->ph_cpu_id = smp_processor_id(); + header->ph_type = cfs_trace_buf_idx_get(); + /* y2038 safe since all user space treats this as unsigned, but + * will overflow in 2106 + */ + header->ph_sec = (u32)ts.tv_sec; + header->ph_usec = ts.tv_nsec / NSEC_PER_USEC; + header->ph_stack = stack; + header->ph_pid = current->pid; + header->ph_line_num = msgdata->msg_line; + header->ph_extern_pid = 0; +} + +static void cfs_vprint_to_console(struct ptldebug_header *hdr, int mask, + struct va_format *vaf, const char *file, + const char *fn) +{ + char *prefix = "Lustre"; + + if (hdr->ph_subsys == S_LND || hdr->ph_subsys == S_LNET) + prefix = "LNet"; + + if (mask & D_CONSOLE) { + if (mask & D_EMERG) + pr_emerg("%sError: %pV", prefix, vaf); + else if (mask & D_ERROR) + pr_err("%sError: %pV", prefix, vaf); + else if (mask & D_WARNING) + pr_warn("%s: %pV", prefix, vaf); + else if (mask & libcfs_printk) + pr_info("%s: %pV", prefix, vaf); + } else { + if (mask & D_EMERG) + pr_emerg("%sError: %d:%d:(%s:%d:%s()) %pV", prefix, + hdr->ph_pid, hdr->ph_extern_pid, file, + hdr->ph_line_num, fn, vaf); + else if (mask & D_ERROR) + pr_err("%sError: %d:%d:(%s:%d:%s()) %pV", prefix, + hdr->ph_pid, hdr->ph_extern_pid, file, + hdr->ph_line_num, fn, vaf); + else if (mask & D_WARNING) + pr_warn("%s: %d:%d:(%s:%d:%s()) %pV", prefix, + hdr->ph_pid, hdr->ph_extern_pid, file, + hdr->ph_line_num, fn, vaf); + else if (mask & (D_CONSOLE | libcfs_printk)) + pr_info("%s: %pV", prefix, vaf); + } +} + +static void cfs_print_to_console(struct ptldebug_header *hdr, int mask, + const char *file, const char *fn, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + cfs_vprint_to_console(hdr, mask, &vaf, file, fn); +} + +int libcfs_debug_msg(struct libcfs_debug_msg_data *msgdata, + const char *format, ...) +{ + struct cfs_trace_cpu_data *tcd = NULL; + struct ptldebug_header header = {0}; + struct cfs_trace_page *tage; + /* string_buf is used only if tcd != NULL, and is always set then */ + char *string_buf = NULL; + char *debug_buf; + int known_size; + int needed = 85; /* seeded with average message length */ + int max_nob; + va_list ap; + int retry; + int mask = msgdata->msg_mask; + char *file = (char *)msgdata->msg_file; + struct cfs_debug_limit_state *cdls = msgdata->msg_cdls; + + if (strchr(file, '/')) + file = strrchr(file, '/') + 1; + + tcd = cfs_trace_get_tcd(); + + /* cfs_trace_get_tcd() grabs a lock, which disables preemption and + * pins us to a particular CPU. This avoids an smp_processor_id() + * warning on Linux when debugging is enabled. 
+ */ + cfs_set_ptldebug_header(&header, msgdata, CDEBUG_STACK()); + + if (!tcd) /* arch may not log in IRQ context */ + goto console; + + if (tcd->tcd_cur_pages == 0) + header.ph_flags |= PH_FLAG_FIRST_RECORD; + + if (tcd->tcd_shutting_down) { + cfs_trace_put_tcd(tcd); + tcd = NULL; + goto console; + } + + known_size = strlen(file) + 1; + if (msgdata->msg_fn) + known_size += strlen(msgdata->msg_fn) + 1; + + if (libcfs_debug_binary) + known_size += sizeof(header); + + /* + * May perform an additional pass to update 'needed' and increase + * tage buffer size to match vsnprintf reported size required + * On the second pass (retry=1) use vscnprintf [which returns + * number of bytes written not including the terminating nul] + * to clarify `needed` is used as number of bytes written + * for the remainder of this function + */ + for (retry = 0; retry < 2; retry++) { + tage = cfs_trace_get_tage(tcd, needed + known_size + 1); + if (!tage) { + if (needed + known_size > PAGE_SIZE) + mask |= D_ERROR; + + cfs_trace_put_tcd(tcd); + tcd = NULL; + goto console; + } + + string_buf = (char *)page_address(tage->page) + + tage->used + known_size; + + max_nob = PAGE_SIZE - tage->used - known_size; + if (max_nob <= 0) { + pr_emerg("LustreError: negative max_nob: %d\n", + max_nob); + mask |= D_ERROR; + cfs_trace_put_tcd(tcd); + tcd = NULL; + goto console; + } + + va_start(ap, format); + if (retry) + needed = vscnprintf(string_buf, max_nob, format, ap); + else + needed = vsnprintf(string_buf, max_nob, format, ap); + va_end(ap); + + if (needed < max_nob) /* well. printing ok.. */ + break; + } + + /* `needed` is actual bytes written to string_buf */ + if (*(string_buf + needed - 1) != '\n') { + pr_info("Lustre: format at %s:%d:%s doesn't end in newline\n", + file, msgdata->msg_line, msgdata->msg_fn); + } + + header.ph_len = known_size + needed; + debug_buf = (char *)page_address(tage->page) + tage->used; + + if (libcfs_debug_binary) { + memcpy(debug_buf, &header, sizeof(header)); + tage->used += sizeof(header); + debug_buf += sizeof(header); + } + + strlcpy(debug_buf, file, PAGE_SIZE - tage->used); + tage->used += strlen(file) + 1; + debug_buf += strlen(file) + 1; + + if (msgdata->msg_fn) { + strlcpy(debug_buf, msgdata->msg_fn, PAGE_SIZE - tage->used); + tage->used += strlen(msgdata->msg_fn) + 1; + debug_buf += strlen(msgdata->msg_fn) + 1; + } + + __LASSERT(debug_buf == string_buf); + + tage->used += needed; + __LASSERT(tage->used <= PAGE_SIZE); + +console: + if ((mask & libcfs_printk) == 0) { + /* no console output requested */ + if (tcd != NULL) + cfs_trace_put_tcd(tcd); + return 1; + } + + if (cdls != NULL) { + if (libcfs_console_ratelimit && + cdls->cdls_next != 0 && /* not first time ever */ + time_before(jiffies, cdls->cdls_next)) { + /* skipping a console message */ + cdls->cdls_count++; + if (tcd != NULL) + cfs_trace_put_tcd(tcd); + return 1; + } + + if (time_after(jiffies, cdls->cdls_next + + libcfs_console_max_delay + + cfs_time_seconds(10))) { + /* last timeout was a long time ago */ + cdls->cdls_delay /= libcfs_console_backoff * 4; + } else { + cdls->cdls_delay *= libcfs_console_backoff; + } + + if (cdls->cdls_delay < libcfs_console_min_delay) + cdls->cdls_delay = libcfs_console_min_delay; + else if (cdls->cdls_delay > libcfs_console_max_delay) + cdls->cdls_delay = libcfs_console_max_delay; + + /* ensure cdls_next is never zero after it's been seen */ + cdls->cdls_next = (jiffies + cdls->cdls_delay) | 1; + } + + if (tcd) { + cfs_print_to_console(&header, mask, file, msgdata->msg_fn, + "%s", 
string_buf);
+		cfs_trace_put_tcd(tcd);
+	} else {
+		struct va_format vaf;
+
+		va_start(ap, format);
+		vaf.fmt = format;
+		vaf.va = &ap;
+		cfs_vprint_to_console(&header, mask,
+				      &vaf, file, msgdata->msg_fn);
+		va_end(ap);
+	}
+
+	if (cdls != NULL && cdls->cdls_count != 0) {
+		cfs_print_to_console(&header, mask, file,
+				     msgdata->msg_fn,
+				     "Skipped %d previous similar message%s\n",
+				     cdls->cdls_count,
+				     (cdls->cdls_count > 1) ? "s" : "");
+
+		cdls->cdls_count = 0;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(libcfs_debug_msg);
+
+void
+cfs_trace_assertion_failed(const char *str,
+			   struct libcfs_debug_msg_data *msgdata)
+{
+	struct ptldebug_header hdr;
+
+	libcfs_panic_in_progress = 1;
+	libcfs_catastrophe = 1;
+	smp_mb();
+
+	cfs_set_ptldebug_header(&hdr, msgdata, CDEBUG_STACK());
+
+	cfs_print_to_console(&hdr, D_EMERG, msgdata->msg_file, msgdata->msg_fn,
+			     "%s", str);
+
+	panic("Lustre debug assertion failure\n");
+
+	/* not reached */
+}
+
+static void
+panic_collect_pages(struct page_collection *pc)
+{
+	/* Do the collect_pages job on a single CPU: assumes that all other
+	 * CPUs have been stopped during a panic. If this isn't true for some
+	 * arch, this will have to be implemented separately in each arch. */
+	int i;
+	int j;
+	struct cfs_trace_cpu_data *tcd;
+
+	INIT_LIST_HEAD(&pc->pc_pages);
+
+	cfs_tcd_for_each(tcd, i, j) {
+		list_splice_init(&tcd->tcd_pages, &pc->pc_pages);
+		tcd->tcd_cur_pages = 0;
+	}
+}
+
+static void collect_pages_on_all_cpus(struct page_collection *pc)
+{
+	struct cfs_trace_cpu_data *tcd;
+	int i, cpu;
+
+	for_each_possible_cpu(cpu) {
+		cfs_tcd_for_each_type_lock(tcd, i, cpu) {
+			list_splice_init(&tcd->tcd_pages, &pc->pc_pages);
+			tcd->tcd_cur_pages = 0;
+		}
+	}
+}
+
+static void collect_pages(struct page_collection *pc)
+{
+	INIT_LIST_HEAD(&pc->pc_pages);
+
+	if (libcfs_panic_in_progress)
+		panic_collect_pages(pc);
+	else
+		collect_pages_on_all_cpus(pc);
+}
+
+static void put_pages_back_on_all_cpus(struct page_collection *pc)
+{
+	struct cfs_trace_cpu_data *tcd;
+	struct list_head *cur_head;
+	struct cfs_trace_page *tage;
+	struct cfs_trace_page *tmp;
+	int i, cpu;
+
+	for_each_possible_cpu(cpu) {
+		cfs_tcd_for_each_type_lock(tcd, i, cpu) {
+			cur_head = tcd->tcd_pages.next;
+
+			list_for_each_entry_safe(tage, tmp, &pc->pc_pages,
+						 linkage) {
+
+				__LASSERT_TAGE_INVARIANT(tage);
+
+				if (tage->cpu != cpu || tage->type != i)
+					continue;
+
+				cfs_tage_to_tail(tage, cur_head);
+				tcd->tcd_cur_pages++;
+			}
+		}
+	}
+}
+
+static void put_pages_back(struct page_collection *pc)
+{
+	if (!libcfs_panic_in_progress)
+		put_pages_back_on_all_cpus(pc);
+}
+
+#ifdef LNET_DUMP_ON_PANIC
+void cfs_trace_debug_print(void)
+{
+	struct page_collection pc;
+	struct cfs_trace_page *tage;
+	struct cfs_trace_page *tmp;
+	struct page *page;
+
+	collect_pages(&pc);
+	list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) {
+		char *p, *file, *fn;
+
+		__LASSERT_TAGE_INVARIANT(tage);
+
+		page = tage->page;
+		p = page_address(page);
+		while (p < ((char *)page_address(page) + tage->used)) {
+			struct ptldebug_header *hdr;
+			int len;
+			hdr = (void *)p;
+			p += sizeof(*hdr);
+			file = p;
+			p += strlen(file) + 1;
+			fn = p;
+			p += strlen(fn) + 1;
+			len = hdr->ph_len - (int)(p - (char *)hdr);
+
+			cfs_print_to_console(hdr, D_EMERG, file, fn,
+					     "%.*s", len, p);
+
+			p += len;
+		}
+
+		list_del(&tage->linkage);
+		cfs_tage_free(tage);
+	}
+	down_write(&cfs_tracefile_sem);
+	while ((page = list_first_entry_or_null(&daemon_pages,
+						struct page, lru)) != NULL) {
+		char *p, *file, *fn;
+
+		p = page_address(page);
+		while (p
< ((char *)page_address(page) + page->private)) { + struct ptldebug_header *hdr; + int len; + + hdr = (void *)p; + p += sizeof(*hdr); + file = p; + p += strlen(file) + 1; + fn = p; + p += strlen(fn) + 1; + len = hdr->ph_len - (int)(p - (char *)hdr); + + cfs_print_to_console(hdr, D_EMERG, file, fn, + "%.*s", len, p); + + p += len; + } + list_del_init(&page->lru); + daemon_pages_count -= 1; + put_page(page); + } + up_write(&cfs_tracefile_sem); +} +#endif /* LNET_DUMP_ON_PANIC */ + +int cfs_tracefile_dump_all_pages(char *filename) +{ + struct page_collection pc; + struct file *filp; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + char *buf; + struct page *page; + int rc; + + down_write(&cfs_tracefile_sem); + + filp = filp_open(filename, O_CREAT|O_EXCL|O_WRONLY|O_LARGEFILE, 0600); + if (IS_ERR(filp)) { + rc = PTR_ERR(filp); + filp = NULL; + pr_err("LustreError: can't open %s for dump: rc = %d\n", + filename, rc); + goto out; + } + + collect_pages(&pc); + if (list_empty(&pc.pc_pages)) { + rc = 0; + goto close; + } + + /* ok, for now, just write the pages. in the future we'll be building + * iobufs with the pages and calling generic_direct_IO */ + list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) { + + __LASSERT_TAGE_INVARIANT(tage); + + buf = kmap(tage->page); + rc = cfs_kernel_write(filp, buf, tage->used, &filp->f_pos); + kunmap(tage->page); + if (rc != (int)tage->used) { + pr_warn("Lustre: wanted to write %u but wrote %d\n", + tage->used, rc); + put_pages_back(&pc); + __LASSERT(list_empty(&pc.pc_pages)); + break; + } + list_del(&tage->linkage); + cfs_tage_free(tage); + } + while ((page = list_first_entry_or_null(&daemon_pages, + struct page, lru)) != NULL) { + buf = page_address(page); + rc = cfs_kernel_write(filp, buf, page->private, &filp->f_pos); + if (rc != (int)page->private) { + pr_warn("Lustre: wanted to write %u but wrote %d\n", + (int)page->private, rc); + break; + } + list_del(&page->lru); + daemon_pages_count -= 1; + put_page(page); + } + rc = vfs_fsync_range(filp, 0, LLONG_MAX, 1); + if (rc) + pr_err("LustreError: sync returns: rc = %d\n", rc); +close: + filp_close(filp, NULL); +out: + up_write(&cfs_tracefile_sem); + return rc; +} + +void cfs_trace_flush_pages(void) +{ + struct page_collection pc; + struct cfs_trace_page *tage; + struct page *page; + + collect_pages(&pc); + while (!list_empty(&pc.pc_pages)) { + tage = list_first_entry(&pc.pc_pages, + struct cfs_trace_page, linkage); + __LASSERT_TAGE_INVARIANT(tage); + + list_del(&tage->linkage); + cfs_tage_free(tage); + } + + down_write(&cfs_tracefile_sem); + while ((page = list_first_entry_or_null(&daemon_pages, + struct page, lru)) != NULL) { + list_del(&page->lru); + daemon_pages_count -= 1; + put_page(page); + } + up_write(&cfs_tracefile_sem); +} + +int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob, + const char *knl_buffer, char *append) +{ + /* NB if 'append' != NULL, it's a single character to append to the + * copied out string - usually "\n", for /proc entries and "" (i.e. 
a + * terminating zero byte) for sysctl entries */ + int nob = strlen(knl_buffer); + + if (nob > usr_buffer_nob) + nob = usr_buffer_nob; + + if (copy_to_user(usr_buffer, knl_buffer, nob)) + return -EFAULT; + + if (append != NULL && nob < usr_buffer_nob) { + if (copy_to_user(usr_buffer + nob, append, 1)) + return -EFAULT; + + nob++; + } + + return nob; +} +EXPORT_SYMBOL(cfs_trace_copyout_string); + +int cfs_trace_dump_debug_buffer_usrstr(void __user *usr_str, int usr_str_nob) +{ + char *str; + char *path; + int rc; + + str = memdup_user_nul(usr_str, usr_str_nob); + if (IS_ERR(str)) + return PTR_ERR(str); + + path = strim(str); + if (path[0] != '/') + rc = -EINVAL; + else + rc = cfs_tracefile_dump_all_pages(path); + kfree(str); + + return rc; +} + +int cfs_trace_daemon_command(char *str) +{ + int rc = 0; + + down_write(&cfs_tracefile_sem); + + if (strcmp(str, "stop") == 0) { + up_write(&cfs_tracefile_sem); + cfs_trace_stop_thread(); + down_write(&cfs_tracefile_sem); + memset(cfs_tracefile, 0, sizeof(cfs_tracefile)); + + } else if (strncmp(str, "size=", 5) == 0) { + unsigned long tmp; + + rc = kstrtoul(str + 5, 10, &tmp); + if (!rc) { + if (tmp < 10 || tmp > 20480) + cfs_tracefile_size = CFS_TRACEFILE_SIZE; + else + cfs_tracefile_size = tmp << 20; + } + } else if (strlen(str) >= sizeof(cfs_tracefile)) { + rc = -ENAMETOOLONG; + } else if (str[0] != '/') { + rc = -EINVAL; + } else { + strcpy(cfs_tracefile, str); + + pr_info("Lustre: debug daemon will attempt to start writing to %s (%lukB max)\n", + cfs_tracefile, (long)(cfs_tracefile_size >> 10)); + + cfs_trace_start_thread(); + } + + up_write(&cfs_tracefile_sem); + return rc; +} + +int cfs_trace_daemon_command_usrstr(void __user *usr_str, int usr_str_nob) +{ + char *str; + int rc; + + str = memdup_user_nul(usr_str, usr_str_nob); + if (IS_ERR(str)) + return PTR_ERR(str); + + rc = cfs_trace_daemon_command(strim(str)); + kfree(str); + + return rc; +} + +int cfs_trace_set_debug_mb(int mb) +{ + int i; + int j; + unsigned long pages; + unsigned long total_mb = (cfs_totalram_pages() >> (20 - PAGE_SHIFT)); + unsigned long limit = max_t(unsigned long, 512, (total_mb * 4) / 5); + struct cfs_trace_cpu_data *tcd; + + if (mb < num_possible_cpus()) { + pr_warn("Lustre: %d MB is too small for debug buffer size, setting it to %d MB.\n", + mb, num_possible_cpus()); + mb = num_possible_cpus(); + } + + if (mb > limit) { + pr_warn("Lustre: %d MB is too large for debug buffer size, setting it to %lu MB.\n", + mb, limit); + mb = limit; + } + + mb /= num_possible_cpus(); + pages = mb << (20 - PAGE_SHIFT); + + down_write(&cfs_tracefile_sem); + + cfs_tcd_for_each(tcd, i, j) + tcd->tcd_max_pages = (pages * tcd->tcd_pages_factor) / 100; + + daemon_pages_max = pages; + up_write(&cfs_tracefile_sem); + + return mb; +} + +int cfs_trace_get_debug_mb(void) +{ + int i; + int j; + struct cfs_trace_cpu_data *tcd; + int total_pages = 0; + + down_read(&cfs_tracefile_sem); + + cfs_tcd_for_each(tcd, i, j) + total_pages += tcd->tcd_max_pages; + + up_read(&cfs_tracefile_sem); + + return (total_pages >> (20 - PAGE_SHIFT)) + 1; +} + +static int tracefiled(void *arg) +{ + struct page_collection pc; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + struct file *filp; + char *buf; + int last_loop = 0; + int rc; + + while (!last_loop) { + LIST_HEAD(for_daemon_pages); + int for_daemon_pages_count = 0; + schedule_timeout_interruptible(cfs_time_seconds(1)); + if (kthread_should_stop()) + last_loop = 1; + collect_pages(&pc); + if (list_empty(&pc.pc_pages)) + continue; + + filp = 
NULL; + down_read(&cfs_tracefile_sem); + if (cfs_tracefile[0] != 0) { + filp = filp_open(cfs_tracefile, + O_CREAT | O_RDWR | O_LARGEFILE, + 0600); + if (IS_ERR(filp)) { + rc = PTR_ERR(filp); + filp = NULL; + pr_warn("Lustre: couldn't open %s: rc = %d\n", + cfs_tracefile, rc); + } + } + up_read(&cfs_tracefile_sem); + + list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) { + __LASSERT_TAGE_INVARIANT(tage); + + if (filp) { + struct dentry *de = file_dentry(filp); + static loff_t f_pos; + + if (f_pos >= (off_t)cfs_tracefile_size) + f_pos = 0; + else if (f_pos > i_size_read(de->d_inode)) + f_pos = i_size_read(de->d_inode); + + buf = kmap(tage->page); + rc = cfs_kernel_write(filp, buf, tage->used, + &f_pos); + kunmap(tage->page); + if (rc != (int)tage->used) { + pr_warn("Lustre: wanted to write %u but wrote %d\n", + tage->used, rc); + put_pages_back(&pc); + __LASSERT(list_empty(&pc.pc_pages)); + break; + } + } + list_del_init(&tage->linkage); + list_add_tail(&tage->page->lru, &for_daemon_pages); + for_daemon_pages_count += 1; + + tage->page->private = (int)tage->used; + kfree(tage); + atomic_dec(&cfs_tage_allocated); + } + + if (filp) + filp_close(filp, NULL); + + down_write(&cfs_tracefile_sem); + list_splice_tail(&for_daemon_pages, &daemon_pages); + daemon_pages_count += for_daemon_pages_count; + while (daemon_pages_count > daemon_pages_max) { + struct page *p = list_first_entry(&daemon_pages, + struct page, lru); + list_del(&p->lru); + put_page(p); + daemon_pages_count -= 1; + } + up_write(&cfs_tracefile_sem); + + if (!list_empty(&pc.pc_pages)) { + int i; + + pr_alert("Lustre: trace pages aren't empty\n"); + pr_err("Lustre: total cpus(%d): ", num_possible_cpus()); + for (i = 0; i < num_possible_cpus(); i++) + if (cpu_online(i)) + pr_cont("%d(on) ", i); + else + pr_cont("%d(off) ", i); + pr_cont("\n"); + + i = 0; + list_for_each_entry_safe(tage, tmp, &pc.pc_pages, + linkage) + pr_err("Lustre: page %d belongs to cpu %d\n", + ++i, tage->cpu); + pr_err("Lustre: There are %d pages unwritten\n", i); + } + __LASSERT(list_empty(&pc.pc_pages)); + } + + return 0; +} + +int cfs_trace_start_thread(void) +{ + struct task_struct *tsk; + int rc = 0; + + if (tctl_task) + return 0; + + tsk = kthread_create(tracefiled, NULL, "ktracefiled"); + if (IS_ERR(tsk)) + rc = -ECHILD; + else if (cmpxchg(&tctl_task, NULL, tsk) != NULL) + /* already running */ + kthread_stop(tsk); + else + wake_up_process(tsk); + + return rc; +} + +void cfs_trace_stop_thread(void) +{ + struct task_struct *tsk; + + tsk = xchg(&tctl_task, NULL); + if (tsk) { + pr_info("Lustre: shutting down debug daemon thread...\n"); + kthread_stop(tsk); + } +} + +/* percents to share the total debug memory for each type */ +static unsigned int pages_factor[CFS_TCD_TYPE_CNT] = { + 80, /* 80% pages for CFS_TCD_TYPE_PROC */ + 10, /* 10% pages for CFS_TCD_TYPE_SOFTIRQ */ + 10 /* 10% pages for CFS_TCD_TYPE_IRQ */ +}; + +int cfs_tracefile_init(int max_pages) +{ + struct cfs_trace_cpu_data *tcd; + int i; + int j; + + /* initialize trace_data */ + memset(cfs_trace_data, 0, sizeof(cfs_trace_data)); + for (i = 0; i < CFS_TCD_TYPE_CNT; i++) { + cfs_trace_data[i] = + kmalloc_array(num_possible_cpus(), + sizeof(union cfs_trace_data_union), + GFP_KERNEL); + if (!cfs_trace_data[i]) + goto out_trace_data; + } + + /* arch related info initialized */ + cfs_tcd_for_each(tcd, i, j) { + int factor = pages_factor[i]; + + spin_lock_init(&tcd->tcd_lock); + tcd->tcd_pages_factor = factor; + tcd->tcd_type = i; + tcd->tcd_cpu = j; + + INIT_LIST_HEAD(&tcd->tcd_pages); + 
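/* tcd_stock_pages is a reserve of preallocated pages: it is refilled
+		 * in blockable context and drained by cfs_trace_get_tage_try()
+		 * before falling back to atomic allocation (see the
+		 * tcd_stock_pages comment in tracefile.h).
+		 */
+		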
INIT_LIST_HEAD(&tcd->tcd_stock_pages); + tcd->tcd_cur_pages = 0; + tcd->tcd_cur_stock_pages = 0; + tcd->tcd_max_pages = (max_pages * factor) / 100; + LASSERT(tcd->tcd_max_pages > 0); + tcd->tcd_shutting_down = 0; + } + daemon_pages_max = max_pages; + + return 0; + +out_trace_data: + for (i = 0; cfs_trace_data[i]; i++) { + kfree(cfs_trace_data[i]); + cfs_trace_data[i] = NULL; + } + pr_err("lnet: Not enough memory\n"); + return -ENOMEM; +} + +static void trace_cleanup_on_all_cpus(void) +{ + struct cfs_trace_cpu_data *tcd; + struct cfs_trace_page *tage; + int i, cpu; + + for_each_possible_cpu(cpu) { + cfs_tcd_for_each_type_lock(tcd, i, cpu) { + if (!tcd->tcd_pages_factor) + /* Not initialised */ + continue; + tcd->tcd_shutting_down = 1; + + while (!list_empty(&tcd->tcd_pages)) { + tage = list_first_entry(&tcd->tcd_pages, + struct cfs_trace_page, + linkage); + __LASSERT_TAGE_INVARIANT(tage); + + list_del(&tage->linkage); + cfs_tage_free(tage); + } + tcd->tcd_cur_pages = 0; + } + } +} + +static void cfs_trace_cleanup(void) +{ + struct page_collection pc; + int i; + + INIT_LIST_HEAD(&pc.pc_pages); + + trace_cleanup_on_all_cpus(); + + for (i = 0; i < CFS_TCD_TYPE_CNT && cfs_trace_data[i]; i++) { + kfree(cfs_trace_data[i]); + cfs_trace_data[i] = NULL; + } +} + +void cfs_tracefile_exit(void) +{ + cfs_trace_stop_thread(); + cfs_trace_flush_pages(); + cfs_trace_cleanup(); +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.h b/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.h new file mode 100644 index 0000000000000..406a8d5a1fc5c --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.h @@ -0,0 +1,190 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef __LIBCFS_TRACEFILE_H__ +#define __LIBCFS_TRACEFILE_H__ + +#include + +#define TRACEFILE_NAME_SIZE 1024 +extern char cfs_tracefile[TRACEFILE_NAME_SIZE]; +extern long long cfs_tracefile_size; + +/** + * The path of debug log dump upcall script. 
+ */ +extern char lnet_debug_log_upcall[1024]; + +int cfs_tracefile_dump_all_pages(char *filename); +void cfs_trace_debug_print(void); +void cfs_trace_flush_pages(void); +int cfs_trace_start_thread(void); +void cfs_trace_stop_thread(void); +int cfs_tracefile_init(int max_pages); +void cfs_tracefile_exit(void); + + + +int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob, + const char *knl_str, char *append); +int cfs_trace_dump_debug_buffer_usrstr(void __user *usr_str, int usr_str_nob); +int cfs_trace_daemon_command(char *str); +int cfs_trace_daemon_command_usrstr(void __user *usr_str, int usr_str_nob); +int cfs_trace_set_debug_mb(int mb); +int cfs_trace_get_debug_mb(void); + +extern int libcfs_panic_in_progress; + +#define TCD_MAX_PAGES (5 << (20 - PAGE_SHIFT)) +#define TCD_STOCK_PAGES (TCD_MAX_PAGES) +#define CFS_TRACEFILE_SIZE (500 << 20) + +union cfs_trace_data_union { + struct cfs_trace_cpu_data { + /* + * Even though this structure is meant to be per-CPU, locking + * is needed because in some places the data may be accessed + * from other CPUs. This lock is directly used in trace_get_tcd + * and trace_put_tcd, which are called in libcfs_debug_msg and + * tcd_for_each_type_lock + */ + spinlock_t tcd_lock; + unsigned long tcd_lock_flags; + + /* + * pages with trace records not yet processed by tracefiled. + */ + struct list_head tcd_pages; + /* number of pages on ->tcd_pages */ + unsigned long tcd_cur_pages; + + /* + * Maximal number of pages allowed on ->tcd_pages + * Always TCD_MAX_PAGES * tcd_pages_factor / 100 in current + * implementation. + */ + unsigned long tcd_max_pages; + + /* + * preallocated pages to write trace records into. Pages from + * ->tcd_stock_pages are moved to ->tcd_pages by + * portals_debug_msg(). + * + * This list is necessary, because on some platforms it's + * impossible to perform efficient atomic page allocation in a + * non-blockable context. + * + * Such platforms fill ->tcd_stock_pages "on occasion", when + * tracing code is entered in blockable context. + * + * trace_get_tage_try() tries to get a page from + * ->tcd_stock_pages first and resorts to atomic page + * allocation only if this queue is empty. ->tcd_stock_pages + * is replenished when tracing code is entered in blocking + * context (darwin-tracefile.c:trace_get_tcd()). We try to + * maintain TCD_STOCK_PAGES (40 by default) pages in this + * queue. Atomic allocation is only required if more than + * TCD_STOCK_PAGES pagesful are consumed by trace records all + * emitted in non-blocking contexts. Which is quite unlikely. + */ + struct list_head tcd_stock_pages; + /* number of pages on ->tcd_stock_pages */ + unsigned long tcd_cur_stock_pages; + + unsigned short tcd_shutting_down; + unsigned short tcd_cpu; + unsigned short tcd_type; + /* The factors to share debug memory. */ + unsigned short tcd_pages_factor; + } tcd; + char __pad[L1_CACHE_ALIGN(sizeof(struct cfs_trace_cpu_data))]; +}; + +/* XXX nikita: this declaration is internal to tracefile.c and should probably + * be moved there */ +struct page_collection { + struct list_head pc_pages; +}; + +/* + * small data-structure for each page owned by tracefiled. 
+ */ +/* XXX nikita: this declaration is internal to tracefile.c and should probably + * be moved there */ +struct cfs_trace_page { + /* + * page itself + */ + struct page *page; + /* + * linkage into one of the lists in trace_data_union or + * page_collection + */ + struct list_head linkage; + /* + * number of bytes used within this page + */ + unsigned int used; + /* + * cpu that owns this page + */ + unsigned short cpu; + /* + * type(context) of this page + */ + unsigned short type; +}; + +int cfs_tcd_owns_tage(struct cfs_trace_cpu_data *tcd, + struct cfs_trace_page *tage); + +extern void cfs_trace_assertion_failed(const char *str, + struct libcfs_debug_msg_data *m); + +/* ASSERTION that is safe to use within the debug system */ +#define __LASSERT(cond) \ +do { \ + if (unlikely(!(cond))) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_EMERG, NULL); \ + cfs_trace_assertion_failed("ASSERTION("#cond") failed", \ + &msgdata); \ + } \ +} while (0) + +#define __LASSERT_TAGE_INVARIANT(tage) \ +do { \ + __LASSERT(tage != NULL); \ + __LASSERT(tage->page != NULL); \ + __LASSERT(tage->used <= PAGE_SIZE); \ + __LASSERT(page_count(tage->page) > 0); \ +} while (0) + +#endif /* __LIBCFS_TRACEFILE_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/util/l_ioctl.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/l_ioctl.c new file mode 100644 index 0000000000000..15e3d330b3bea --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/l_ioctl.c @@ -0,0 +1,191 @@ +/* + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * + * Copyright (c) 2014, 2017, Intel Corporation. + * + * This file is part of Lustre, https://wiki.whamcloud.com/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#define __USE_FILE_OFFSET64 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +struct ioc_dev { + const char *dev_name; + int dev_fd; +}; + +static struct ioc_dev ioc_dev_list[10]; + +#ifndef ARRAY_SIZE +# define ARRAY_SIZE(a) ((sizeof(a)) / (sizeof((a)[0]))) +#endif /* !ARRAY_SIZE */ + +static int +open_ioc_dev(int dev_id) +{ + const char *dev_name; + + if (dev_id < 0 || dev_id >= ARRAY_SIZE(ioc_dev_list)) { + errno = EINVAL; + return -errno; + } + + dev_name = ioc_dev_list[dev_id].dev_name; + if (!dev_name) { + fprintf(stderr, "unknown device id: %d\n", dev_id); + errno = EINVAL; + return -errno; + } + + if (ioc_dev_list[dev_id].dev_fd < 0) { + int fd = open(dev_name, O_RDWR); + + if (fd < 0) { + fprintf(stderr, "opening %s failed: %s\n" + "hint: the kernel modules may not be loaded\n", + dev_name, strerror(errno)); + return -errno; + } + ioc_dev_list[dev_id].dev_fd = fd; + } + + return ioc_dev_list[dev_id].dev_fd; +} + +int l_ioctl(int dev_id, unsigned int opc, void *buf) +{ + int fd, rc; + + fd = open_ioc_dev(dev_id); + if (fd < 0) + return fd; + + rc = ioctl(fd, opc, buf); + + return rc < 0 ? 
-errno : rc; +} + +/* register a device to send ioctls to. */ +int +register_ioc_dev(int dev_id, const char *dev_name) +{ + if (dev_id < 0 || + dev_id >= sizeof(ioc_dev_list) / sizeof(ioc_dev_list[0])) + return -EINVAL; + + unregister_ioc_dev(dev_id); + + ioc_dev_list[dev_id].dev_name = dev_name; + ioc_dev_list[dev_id].dev_fd = -1; + + return dev_id; +} + +void +unregister_ioc_dev(int dev_id) +{ + if (dev_id < 0 || + dev_id >= sizeof(ioc_dev_list) / sizeof(ioc_dev_list[0])) + return; + + if (ioc_dev_list[dev_id].dev_name && + ioc_dev_list[dev_id].dev_fd >= 0) + close(ioc_dev_list[dev_id].dev_fd); + + ioc_dev_list[dev_id].dev_name = NULL; + ioc_dev_list[dev_id].dev_fd = -1; +} + +static inline size_t libcfs_ioctl_packlen(struct libcfs_ioctl_data *data) +{ + size_t len = sizeof(*data); + + len += (data->ioc_inllen1 + 7) & ~7; + len += (data->ioc_inllen2 + 7) & ~7; + return len; +} + +int libcfs_ioctl_pack(struct libcfs_ioctl_data *data, char **pbuf, int max) +{ + char *ptr; + struct libcfs_ioctl_data *overlay; + + data->ioc_hdr.ioc_len = libcfs_ioctl_packlen(data); + data->ioc_hdr.ioc_version = LIBCFS_IOCTL_VERSION; + + if (*pbuf && libcfs_ioctl_packlen(data) > max) + return 1; + if (!*pbuf) + *pbuf = malloc(data->ioc_hdr.ioc_len); + if (!*pbuf) + return 1; + overlay = (struct libcfs_ioctl_data *)*pbuf; + memcpy(*pbuf, data, sizeof(*data)); + + ptr = overlay->ioc_bulk; + if (data->ioc_inlbuf1) { + memcpy((char *)ptr, (const char *)data->ioc_inlbuf1, + data->ioc_inllen1); + ptr += ((data->ioc_inllen1 + 7) & ~7); + } + if (data->ioc_inlbuf2) { + memcpy((char *)ptr, (const char *)data->ioc_inlbuf2, + data->ioc_inllen2); + ptr += ((data->ioc_inllen2 + 7) & ~7); + } + + return 0; +} + +void +libcfs_ioctl_unpack(struct libcfs_ioctl_data *data, char *pbuf) +{ + struct libcfs_ioctl_data *overlay = (struct libcfs_ioctl_data *)pbuf; + char *ptr; + + /* Preserve the caller's buffer pointers */ + overlay->ioc_inlbuf1 = data->ioc_inlbuf1; + overlay->ioc_inlbuf2 = data->ioc_inlbuf2; + + memcpy(data, pbuf, sizeof(*data)); + ptr = &overlay->ioc_bulk[0]; + + if (data->ioc_inlbuf1) { + memcpy((char *)data->ioc_inlbuf1, (const char *)ptr, + data->ioc_inllen1); + ptr += ((data->ioc_inllen1 + 7) & ~7); + } + if (data->ioc_inlbuf2) { + memcpy((char *)data->ioc_inlbuf2, (const char *)ptr, + data->ioc_inllen2); + ptr += ((data->ioc_inllen2 + 7) & ~7); + } +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/util/nidstrings.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/nidstrings.c new file mode 100644 index 0000000000000..780a8ab1ac21f --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/nidstrings.c @@ -0,0 +1,1647 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. 
All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * libcfs/libcfs/util/nidstrings.c + * + * Author: Phil Schwan + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#ifdef HAVE_NETDB_H +# include +#endif + +/* max value for numeric network address */ +#define MAX_NUMERIC_VALUE 0xffffffff + +#define IPSTRING_LENGTH 16 + +/* CAVEAT VENDITOR! Keep the canonical string representation of nets/nids + * consistent in all conversion functions. Some code fragments are copied + * around for the sake of clarity... + */ + +/* CAVEAT EMPTOR! Racey temporary buffer allocation! + * Choose the number of nidstrings to support the MAXIMUM expected number of + * concurrent users. If there are more, the returned string will be volatile. + * NB this number must allow for a process to be descheduled for a timeslice + * between getting its string and using it. + */ + +static char libcfs_nidstrings[LNET_NIDSTR_COUNT][LNET_NIDSTR_SIZE]; +static int libcfs_nidstring_idx; + +char * +libcfs_next_nidstring(void) +{ + char *str; + + str = libcfs_nidstrings[libcfs_nidstring_idx++]; + if (libcfs_nidstring_idx == + sizeof(libcfs_nidstrings)/sizeof(libcfs_nidstrings[0])) + libcfs_nidstring_idx = 0; + + return str; +} + +static int +libcfs_lo_str2addr(const char *str, int nob, __u32 *addr) +{ + *addr = 0; + return 1; +} + +static void +libcfs_ip_addr2str(__u32 addr, char *str, size_t size) +{ + snprintf(str, size, "%u.%u.%u.%u", + (addr >> 24) & 0xff, (addr >> 16) & 0xff, + (addr >> 8) & 0xff, addr & 0xff); +} + +/* CAVEAT EMPTOR XscanfX + * I use "%n" at the end of a sscanf format to detect trailing junk. However + * sscanf may return immediately if it sees the terminating '0' in a string, so + * I initialise the %n variable to the expected length. If sscanf sets it; + * fine, if it doesn't, then the scan ended at the end of the string, which is + * fine too :) */ +static int +libcfs_ip_str2addr(const char *str, int nob, __u32 *addr) +{ + unsigned int a; + unsigned int b; + unsigned int c; + unsigned int d; + int n = nob; /* XscanfX */ + + /* numeric IP? */ + if (sscanf(str, "%u.%u.%u.%u%n", &a, &b, &c, &d, &n) >= 4 && + n == nob && + (a & ~0xff) == 0 && (b & ~0xff) == 0 && + (c & ~0xff) == 0 && (d & ~0xff) == 0) { + *addr = ((a<<24)|(b<<16)|(c<<8)|d); + return 1; + } + +#ifdef HAVE_GETHOSTBYNAME + /* known hostname? 
*/
+	if (('a' <= str[0] && str[0] <= 'z') ||
+	    ('A' <= str[0] && str[0] <= 'Z')) {
+		char *tmp;
+
+		tmp = calloc(1, nob + 1);
+		if (tmp != NULL) {
+			struct hostent *he;
+
+			memcpy(tmp, str, nob);
+			tmp[nob] = 0;
+
+			he = gethostbyname(tmp);
+
+			free(tmp);
+
+			if (he != NULL) {
+				__u32 ip = *(__u32 *)he->h_addr;
+
+				*addr = ntohl(ip);
+				return 1;
+			}
+		}
+	}
+#endif
+	return 0;
+}
+
+int
+cfs_ip_addr_parse(char *str, int len, struct list_head *list)
+{
+	struct cfs_expr_list *el;
+	struct cfs_lstr src;
+	int rc;
+	int i;
+
+	src.ls_str = str;
+	src.ls_len = len;
+	i = 0;
+
+	while (src.ls_str != NULL) {
+		struct cfs_lstr res;
+
+		if (!cfs_gettok(&src, '.', &res)) {
+			rc = -EINVAL;
+			goto out;
+		}
+
+		rc = cfs_expr_list_parse(res.ls_str, res.ls_len, 0, 255, &el);
+		if (rc != 0)
+			goto out;
+
+		list_add_tail(&el->el_link, list);
+		i++;
+	}
+
+	if (i == 4)
+		return 0;
+
+	rc = -EINVAL;
+out:
+	cfs_expr_list_free_list(list);
+
+	return rc;
+}
+
+int
+cfs_expr2str(struct list_head *list, char *str, ssize_t size)
+{
+	struct cfs_expr_list *expr;
+	struct cfs_range_expr *range;
+	char tmp[LNET_NIDSTR_SIZE];
+	ssize_t len;
+	bool first;
+	bool bracket = false;
+	char *format;
+	char *tmpc;
+
+	list_for_each_entry(expr, list, el_link) {
+		first = true;
+		list_for_each_entry(range, &expr->el_exprs, re_link) {
+			if (range->re_lo == range->re_hi) {
+				snprintf(tmp,
+					 LNET_NIDSTR_SIZE,
+					 "%u.", range->re_lo);
+			} else if (range->re_lo < range->re_hi) {
+				if (range->re_stride > 1) {
+					if (first)
+						format = "[%u-%u/%u,";
+					else
+						format = "%u-%u/%u,";
+					snprintf(tmp, LNET_NIDSTR_SIZE,
+						 format, range->re_lo,
+						 range->re_hi, range->re_stride);
+					bracket = true;
+				} else {
+					if (first)
+						format = "[%u-%u,";
+					else
+						format = "%u-%u,";
+					snprintf(tmp, LNET_NIDSTR_SIZE,
+						 format, range->re_lo,
+						 range->re_hi);
+					bracket = true;
+				}
+			} else {
+				return -EINVAL;
+			}
+			len = strlen(tmp);
+			/* 'size' and 'len' are signed so the underflow
+			 * checks below actually work */
+			size -= (len + 1);
+			if (size < 0)
+				return -ENOBUFS;
+
+			strncat(str, tmp, size + len);
+			first = false;
+		}
+		if (bracket) {
+			tmpc = str + (strlen(str) - 1);
+			size -= 1;
+			if (size < 0)
+				return -ENOBUFS;
+			*tmpc = ']';
+			*(tmpc+1) = '.';
+			bracket = false;
+		}
+	}
+
+	/*
+	 * get rid of the trailing '.' at the end of the string
+	 * only if we actually had something on the list passed in.
+ * otherwise we could write outside the array + */ + if (!list_empty(list)) + str[strlen(str)-1] = '\0'; + return size; +} + +static int +libcfs_num_addr_range_expand(struct list_head *addrranges, __u32 *addrs, + int max_addrs) +{ + struct cfs_expr_list *expr_list; + struct cfs_range_expr *range; + int i; + int max_idx = max_addrs - 1; + int addrs_idx = max_idx; + + list_for_each_entry(expr_list, addrranges, el_link) { + list_for_each_entry(range, &expr_list->el_exprs, re_link) { + for (i = range->re_lo; i <= range->re_hi; + i += range->re_stride) { + if (addrs_idx < 0) + return -1; + + addrs[addrs_idx] = i; + addrs_idx--; + } + } + } + + return max_idx - addrs_idx; +} + +static int +libcfs_ip_addr_range_expand(struct list_head *addrranges, __u32 *addrs, + int max_addrs) +{ + int rc = 0; + + rc = cfs_ip_addr_range_gen(addrs, max_addrs, addrranges); + + if (rc == -1) + return rc; + else + return max_addrs - rc - 1; +} + +static int +libcfs_ip_addr_range_print(char *buffer, int count, struct list_head *list) +{ + int i = 0, j = 0; + struct cfs_expr_list *el; + + list_for_each_entry(el, list, el_link) { + assert(j++ < 4); + if (i != 0) + i += scnprintf(buffer + i, count - i, "."); + i += cfs_expr_list_print(buffer + i, count - i, el); + } + return i; +} + +static int +cfs_ip_addr_range_gen_recurse(__u32 *ip_list, int *count, int shift, + __u32 result, struct list_head *head_el, + struct cfs_expr_list *octet_el) +{ + __u32 value = 0; + int i; + struct cfs_expr_list *next_octet_el; + struct cfs_range_expr *octet_expr; + + /* + * each octet can have multiple expressions so we need to traverse + * all of the expressions + */ + list_for_each_entry(octet_expr, &octet_el->el_exprs, re_link) { + for (i = octet_expr->re_lo; i <= octet_expr->re_hi; i++) { + if (((i - octet_expr->re_lo) % octet_expr->re_stride) == 0) { + /* + * we have a hit calculate the result and + * pass it forward to the next iteration + * of the recursion. + */ + next_octet_el = + list_entry(octet_el->el_link.next, + typeof(*next_octet_el), + el_link); + value = result | (i << (shift * 8)); + if (next_octet_el->el_link.next != head_el) { + /* + * We still have more octets in + * the IP address so traverse + * that. We're doing a depth first + * recursion here. + */ + if (cfs_ip_addr_range_gen_recurse(ip_list, count, + shift - 1, value, + head_el, + next_octet_el) == -1) + return -1; + } else { + /* + * We have hit a leaf so store the + * calculated IP address in the + * list. If we have run out of + * space stop the recursion. + */ + if (*count == -1) + return -1; + /* add ip to the list */ + ip_list[*count] = value; + (*count)--; + } + } + } + } + return 0; +} + +/* + * only generate maximum of count ip addresses from the given expression + */ +int +cfs_ip_addr_range_gen(__u32 *ip_list, int count, struct list_head *ip_addr_expr) +{ + struct cfs_expr_list *octet_el; + int idx = count - 1; + + octet_el = list_entry(ip_addr_expr->next, typeof(*octet_el), el_link); + + (void) cfs_ip_addr_range_gen_recurse(ip_list, &idx, 3, 0, &octet_el->el_link, octet_el); + + return idx; +} + +/** + * Matches address (\a addr) against address set encoded in \a list. 
+ * + * \retval 1 if \a addr matches + * \retval 0 otherwise + */ +int +cfs_ip_addr_match(__u32 addr, struct list_head *list) +{ + struct cfs_expr_list *el; + int i = 0; + + list_for_each_entry_reverse(el, list, el_link) { + if (!cfs_expr_list_match(addr & 0xff, el)) + return 0; + addr >>= 8; + i++; + } + + return i == 4; +} + +static void +libcfs_decnum_addr2str(__u32 addr, char *str, size_t size) +{ + snprintf(str, size, "%u", addr); +} + +static int +libcfs_num_str2addr(const char *str, int nob, __u32 *addr) +{ + int n; + + n = nob; + if (sscanf(str, "0x%x%n", addr, &n) >= 1 && n == nob) + return 1; + + n = nob; + if (sscanf(str, "0X%x%n", addr, &n) >= 1 && n == nob) + return 1; + + n = nob; + if (sscanf(str, "%u%n", addr, &n) >= 1 && n == nob) + return 1; + + return 0; +} + +/** + * Nf_parse_addrlist method for networks using numeric addresses. + * + * Examples of such networks are gm and elan. + * + * \retval 0 if \a str parsed to numeric address + * \retval errno otherwise + */ +int +libcfs_num_parse(char *str, int len, struct list_head *list) +{ + struct cfs_expr_list *el; + int rc; + + rc = cfs_expr_list_parse(str, len, 0, MAX_NUMERIC_VALUE, &el); + if (rc == 0) + list_add_tail(&el->el_link, list); + + return rc; +} + +static int +libcfs_num_addr_range_print(char *buffer, int count, struct list_head *list) +{ + struct cfs_expr_list *el; + int i = 0, j = 0; + + list_for_each_entry(el, list, el_link) { + assert(j++ < 1); + i += cfs_expr_list_print(buffer + i, count - i, el); + } + return i; +} + +/* + * Nf_match_addr method for networks using numeric addresses + * + * \retval 1 on match + * \retval 0 otherwise + */ +static int +libcfs_num_match(__u32 addr, struct list_head *numaddr) +{ + struct cfs_expr_list *el; + + assert(!list_empty(numaddr)); + el = list_entry(numaddr->next, struct cfs_expr_list, el_link); + + return cfs_expr_list_match(addr, el); +} + +static int cfs_ip_min_max(struct list_head *nidlist, __u32 *min, __u32 *max); +static int cfs_num_min_max(struct list_head *nidlist, __u32 *min, __u32 *max); + +static struct netstrfns libcfs_netstrfns[] = { + { + .nf_type = LOLND, + .nf_name = "lo", + .nf_modname = "klolnd", + .nf_addr2str = libcfs_decnum_addr2str, + .nf_str2addr = libcfs_lo_str2addr, + .nf_parse_addrlist = libcfs_num_parse, + .nf_print_addrlist = libcfs_num_addr_range_print, + .nf_match_addr = libcfs_num_match, + .nf_min_max = cfs_num_min_max, + .nf_expand_addrrange = libcfs_num_addr_range_expand + }, + { + .nf_type = SOCKLND, + .nf_name = "tcp", + .nf_modname = "ksocklnd", + .nf_addr2str = libcfs_ip_addr2str, + .nf_str2addr = libcfs_ip_str2addr, + .nf_parse_addrlist = cfs_ip_addr_parse, + .nf_print_addrlist = libcfs_ip_addr_range_print, + .nf_match_addr = cfs_ip_addr_match, + .nf_min_max = cfs_ip_min_max, + .nf_expand_addrrange = libcfs_ip_addr_range_expand + }, + { + .nf_type = O2IBLND, + .nf_name = "o2ib", + .nf_modname = "ko2iblnd", + .nf_addr2str = libcfs_ip_addr2str, + .nf_str2addr = libcfs_ip_str2addr, + .nf_parse_addrlist = cfs_ip_addr_parse, + .nf_print_addrlist = libcfs_ip_addr_range_print, + .nf_match_addr = cfs_ip_addr_match, + .nf_min_max = cfs_ip_min_max, + .nf_expand_addrrange = libcfs_ip_addr_range_expand + }, + { + .nf_type = GNILND, + .nf_name = "gni", + .nf_modname = "kgnilnd", + .nf_addr2str = libcfs_decnum_addr2str, + .nf_str2addr = libcfs_num_str2addr, + .nf_parse_addrlist = libcfs_num_parse, + .nf_print_addrlist = libcfs_num_addr_range_print, + .nf_match_addr = libcfs_num_match, + .nf_min_max = cfs_num_min_max, + .nf_expand_addrrange = 
libcfs_num_addr_range_expand + }, + { + .nf_type = GNIIPLND, + .nf_name = "gip", + .nf_modname = "kgnilnd", + .nf_addr2str = libcfs_ip_addr2str, + .nf_str2addr = libcfs_ip_str2addr, + .nf_parse_addrlist = cfs_ip_addr_parse, + .nf_print_addrlist = libcfs_ip_addr_range_print, + .nf_match_addr = cfs_ip_addr_match, + .nf_min_max = cfs_ip_min_max, + .nf_expand_addrrange = libcfs_ip_addr_range_expand + }, + { + .nf_type = PTL4LND, + .nf_name = "ptlf", + .nf_modname = "kptl4lnd", + .nf_addr2str = libcfs_decnum_addr2str, + .nf_str2addr = libcfs_num_str2addr, + .nf_parse_addrlist = libcfs_num_parse, + .nf_print_addrlist = libcfs_num_addr_range_print, + .nf_match_addr = libcfs_num_match, + .nf_min_max = cfs_num_min_max, + .nf_expand_addrrange = libcfs_num_addr_range_expand + }, + { + .nf_type = KFILND, + .nf_name = "kfi", + .nf_modname = "kkfilnd", + .nf_addr2str = libcfs_decnum_addr2str, + .nf_str2addr = libcfs_num_str2addr, + .nf_parse_addrlist = libcfs_num_parse, + .nf_print_addrlist = libcfs_num_addr_range_print, + .nf_match_addr = libcfs_num_match, + .nf_min_max = cfs_num_min_max, + .nf_expand_addrrange = libcfs_num_addr_range_expand + } +}; + +static const size_t libcfs_nnetstrfns = + sizeof(libcfs_netstrfns)/sizeof(libcfs_netstrfns[0]); + +static struct netstrfns * +libcfs_lnd2netstrfns(__u32 lnd) +{ + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) + if (lnd == libcfs_netstrfns[i].nf_type) + return &libcfs_netstrfns[i]; + + return NULL; +} + +static struct netstrfns * +libcfs_namenum2netstrfns(const char *name) +{ + struct netstrfns *nf; + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) { + nf = &libcfs_netstrfns[i]; + if (!strncmp(name, nf->nf_name, strlen(nf->nf_name))) + return nf; + } + return NULL; +} + +static struct netstrfns * +libcfs_name2netstrfns(const char *name) +{ + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) + if (!strcmp(libcfs_netstrfns[i].nf_name, name)) + return &libcfs_netstrfns[i]; + + return NULL; +} + +int +libcfs_isknown_lnd(__u32 lnd) +{ + return libcfs_lnd2netstrfns(lnd) != NULL; +} + +char * +libcfs_lnd2modname(__u32 lnd) +{ + struct netstrfns *nf = libcfs_lnd2netstrfns(lnd); + + return (nf == NULL) ? 
NULL : nf->nf_modname; +} + +int +libcfs_str2lnd(const char *str) +{ + struct netstrfns *nf = libcfs_name2netstrfns(str); + + if (nf != NULL) + return nf->nf_type; + + return -1; +} + +char * +libcfs_lnd2str_r(__u32 lnd, char *buf, size_t buf_size) +{ + struct netstrfns *nf; + + nf = libcfs_lnd2netstrfns(lnd); + if (nf == NULL) + snprintf(buf, buf_size, "?%u?", lnd); + else + snprintf(buf, buf_size, "%s", nf->nf_name); + + return buf; +} + +char * +libcfs_net2str_r(__u32 net, char *buf, size_t buf_size) +{ + __u32 nnum = LNET_NETNUM(net); + __u32 lnd = LNET_NETTYP(net); + struct netstrfns *nf; + + nf = libcfs_lnd2netstrfns(lnd); + if (nf == NULL) + snprintf(buf, buf_size, "<%u:%u>", lnd, nnum); + else if (nnum == 0) + snprintf(buf, buf_size, "%s", nf->nf_name); + else + snprintf(buf, buf_size, "%s%u", nf->nf_name, nnum); + + return buf; +} + +char * +libcfs_nid2str_r(lnet_nid_t nid, char *buf, size_t buf_size) +{ + __u32 addr = LNET_NIDADDR(nid); + __u32 net = LNET_NIDNET(nid); + __u32 nnum = LNET_NETNUM(net); + __u32 lnd = LNET_NETTYP(net); + struct netstrfns *nf; + + if (nid == LNET_NID_ANY) { + strncpy(buf, "", buf_size); + buf[buf_size - 1] = '\0'; + return buf; + } + + nf = libcfs_lnd2netstrfns(lnd); + if (nf == NULL) { + snprintf(buf, buf_size, "%x@<%u:%u>", addr, lnd, nnum); + } else { + size_t addr_len; + + nf->nf_addr2str(addr, buf, buf_size); + addr_len = strlen(buf); + if (nnum == 0) + snprintf(buf + addr_len, buf_size - addr_len, "@%s", + nf->nf_name); + else + snprintf(buf + addr_len, buf_size - addr_len, "@%s%u", + nf->nf_name, nnum); + } + + return buf; +} + +static struct netstrfns * +libcfs_str2net_internal(const char *str, __u32 *net) +{ + struct netstrfns *nf = NULL; + int nob; + unsigned int netnum; + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) { + nf = &libcfs_netstrfns[i]; + if (!strncmp(str, nf->nf_name, strlen(nf->nf_name))) + break; + } + + if (i == libcfs_nnetstrfns) + return NULL; + + nob = strlen(nf->nf_name); + + if (strlen(str) == (unsigned int)nob) { + netnum = 0; + } else { + if (nf->nf_type == LOLND) /* net number not allowed */ + return NULL; + + str += nob; + i = strlen(str); + if (sscanf(str, "%u%n", &netnum, &i) < 1 || + i != (int)strlen(str)) + return NULL; + } + + *net = LNET_MKNET(nf->nf_type, netnum); + return nf; +} + +__u32 +libcfs_str2net(const char *str) +{ + __u32 net; + + if (libcfs_str2net_internal(str, &net) != NULL) + return net; + + return LNET_NET_ANY; +} + +lnet_nid_t +libcfs_str2nid(const char *str) +{ + const char *sep = strchr(str, '@'); + struct netstrfns *nf; + __u32 net; + __u32 addr; + + if (sep != NULL) { + nf = libcfs_str2net_internal(sep + 1, &net); + if (nf == NULL) + return LNET_NID_ANY; + } else { + sep = str + strlen(str); + net = LNET_MKNET(SOCKLND, 0); + nf = libcfs_lnd2netstrfns(SOCKLND); + assert(nf != NULL); + } + + if (!nf->nf_str2addr(str, (int)(sep - str), &addr)) + return LNET_NID_ANY; + + return LNET_MKNID(net, addr); +} + +char * +libcfs_id2str(struct lnet_process_id id) +{ + char *str = libcfs_next_nidstring(); + + if (id.pid == LNET_PID_ANY) { + snprintf(str, LNET_NIDSTR_SIZE, + "LNET_PID_ANY-%s", libcfs_nid2str(id.nid)); + return str; + } + + snprintf(str, LNET_NIDSTR_SIZE, "%s%u-%s", + ((id.pid & LNET_PID_USERFLAG) != 0) ? 
"U" : "", + (id.pid & ~LNET_PID_USERFLAG), libcfs_nid2str(id.nid)); + return str; +} + +int +libcfs_str2anynid(lnet_nid_t *nidp, const char *str) +{ + if (!strcmp(str, "*")) { + *nidp = LNET_NID_ANY; + return 1; + } + + *nidp = libcfs_str2nid(str); + return *nidp != LNET_NID_ANY; +} + +/** + * Nid range list syntax. + * \verbatim + * + * :== [ ' ' ] + * :== '@' + * :== '*' | + * | + * + * :== ... + * + * :== | + * + * :== '[' [ ',' ] ']' + * :== | + * '-' | + * '-' '/' + * :== | + * :== "lo" | "tcp" | "o2ib" | "cib" | "openib" | "iib" | + * "vib" | "ra" | "elan" | "mx" | "ptl" + * \endverbatim + */ + +/** + * Structure to represent \ token of the syntax. + * + * One of this is created for each \ parsed. + */ +struct nidrange { + /** + * Link to list of this structures which is built on nid range + * list parsing. + */ + struct list_head nr_link; + /** + * List head for addrrange::ar_link. + */ + struct list_head nr_addrranges; + /** + * Flag indicating that *@ is found. + */ + int nr_all; + /** + * Pointer to corresponding element of libcfs_netstrfns. + */ + struct netstrfns *nr_netstrfns; + /** + * Number of network. E.g. 5 if \ is "elan5". + */ + int nr_netnum; +}; + +/** + * Structure to represent \ token of the syntax. + */ +struct addrrange { + /** + * Link to nidrange::nr_addrranges. + */ + struct list_head ar_link; + /** + * List head for cfs_expr_list::el_list. + */ + struct list_head ar_numaddr_ranges; +}; + +/** + * Parses \ token on the syntax. + * + * Allocates struct addrrange and links to \a nidrange via + * (nidrange::nr_addrranges) + * + * \retval 0 if \a src parses to '*' | \ | \ + * \retval -errno otherwise + */ +static int +parse_addrange(const struct cfs_lstr *src, struct nidrange *nidrange) +{ + struct addrrange *addrrange; + + if (src->ls_len == 1 && src->ls_str[0] == '*') { + nidrange->nr_all = 1; + return 0; + } + + addrrange = calloc(1, sizeof(struct addrrange)); + if (addrrange == NULL) + return -ENOMEM; + list_add_tail(&addrrange->ar_link, &nidrange->nr_addrranges); + INIT_LIST_HEAD(&addrrange->ar_numaddr_ranges); + + return nidrange->nr_netstrfns->nf_parse_addrlist(src->ls_str, + src->ls_len, + &addrrange->ar_numaddr_ranges); +} + +/** + * Finds or creates struct nidrange. + * + * Checks if \a src is a valid network name, looks for corresponding + * nidrange on the ist of nidranges (\a nidlist), creates new struct + * nidrange if it is not found. + * + * \retval pointer to struct nidrange matching network specified via \a src + * \retval NULL if \a src does not match any network + */ +static struct nidrange * +add_nidrange(const struct cfs_lstr *src, + struct list_head *nidlist) +{ + struct netstrfns *nf; + struct nidrange *nr; + int endlen; + unsigned netnum; + + if (src->ls_len >= LNET_NIDSTR_SIZE) + return NULL; + + nf = libcfs_namenum2netstrfns(src->ls_str); + if (nf == NULL) + return NULL; + endlen = src->ls_len - strlen(nf->nf_name); + if (endlen == 0) + /* network name only, e.g. "elan" or "tcp" */ + netnum = 0; + else { + /* e.g. 
"elan25" or "tcp23", refuse to parse if + * network name is not appended with decimal or + * hexadecimal number */ + if (!cfs_str2num_check(src->ls_str + strlen(nf->nf_name), + endlen, &netnum, 0, MAX_NUMERIC_VALUE)) + return NULL; + } + + list_for_each_entry(nr, nidlist, nr_link) { + if (nr->nr_netstrfns != nf) + continue; + if (nr->nr_netnum != netnum) + continue; + return nr; + } + + nr = calloc(1, sizeof(struct nidrange)); + if (nr == NULL) + return NULL; + list_add_tail(&nr->nr_link, nidlist); + INIT_LIST_HEAD(&nr->nr_addrranges); + nr->nr_netstrfns = nf; + nr->nr_all = 0; + nr->nr_netnum = netnum; + + return nr; +} + +/** + * Parses \ token of the syntax. + * + * \retval 1 if \a src parses to \ '@' \ + * \retval 0 otherwise + */ +static int +parse_nidrange(struct cfs_lstr *src, struct list_head *nidlist) +{ + struct cfs_lstr addrrange; + struct cfs_lstr net; + struct cfs_lstr tmp; + struct nidrange *nr; + + tmp = *src; + if (cfs_gettok(src, '@', &addrrange) == 0) + goto failed; + + if (cfs_gettok(src, '@', &net) == 0 || src->ls_str != NULL) + goto failed; + + nr = add_nidrange(&net, nidlist); + if (nr == NULL) + goto failed; + + if (parse_addrange(&addrrange, nr) != 0) + goto failed; + + return 1; + failed: + fprintf(stderr, "can't parse nidrange: \"%.*s\"\n", + tmp.ls_len, tmp.ls_str); + return 0; +} + +static __u32 +libcfs_net_str_len(const char *str) +{ + int i; + struct netstrfns *nf = NULL; + + for (i = 0; i < libcfs_nnetstrfns; i++) { + nf = &libcfs_netstrfns[i]; + if (!strncmp(str, nf->nf_name, strlen(nf->nf_name))) + return strlen(nf->nf_name); + } + + return 0; +} + +int +parse_net_range(char *str, __u32 len, struct list_head *net_num, + __u32 *net_type) +{ + struct cfs_lstr next; + __u32 net_type_len; + __u32 net; + char *bracket; + char *star; + + if (!str) + return -EINVAL; + + next.ls_str = str; + next.ls_len = len; + + net_type_len = libcfs_net_str_len(str); + + if (net_type_len < len) { + char c = str[net_type_len]; + + str[net_type_len] = '\0'; + net = libcfs_str2net(str); + str[net_type_len] = c; + } else { + net = libcfs_str2net(str); + } + + if (net == LNET_NIDNET(LNET_NID_ANY)) + return -EINVAL; + + *net_type = LNET_NETTYP(net); + + /* + * the net is either followed with an absolute number, *, or an + * expression enclosed in [] + */ + bracket = strchr(next.ls_str, '['); + star = strchr(next.ls_str, '*'); + + /* "*[" pattern not allowed */ + if (bracket && star && star < bracket) + return -EINVAL; + + if (!bracket) { + next.ls_str = str + net_type_len; + next.ls_len = strlen(next.ls_str); + } else { + next.ls_str = bracket; + next.ls_len = strlen(bracket); + } + + /* if there is no net number just return */ + if (next.ls_len == 0) + return 0; + + return libcfs_num_parse(next.ls_str, next.ls_len, + net_num); +} + +int +parse_address(struct cfs_lstr *src, const __u32 net_type, + struct list_head *addr) +{ + int i; + struct netstrfns *nf = NULL; + + for (i = 0; i < libcfs_nnetstrfns; i++) { + nf = &libcfs_netstrfns[i]; + if (net_type == nf->nf_type) + return nf->nf_parse_addrlist(src->ls_str, src->ls_len, + addr); + } + + return -EINVAL; +} + +int +cfs_parse_nid_parts(char *str, struct list_head *addr, + struct list_head *net_num, __u32 *net_type) +{ + struct cfs_lstr next; + struct cfs_lstr addrrange; + bool found = false; + int rc; + + if (!str) + return -EINVAL; + + next.ls_str = str; + next.ls_len = strlen(str); + + rc = cfs_gettok(&next, '@', &addrrange); + if (!rc) + return -EINVAL; + + if (!next.ls_str) { + /* only net is present */ + next.ls_str = str; + 
next.ls_len = strlen(str); + } else { + found = true; + } + + /* assume only net is present */ + rc = parse_net_range(next.ls_str, next.ls_len, net_num, net_type); + + /* + * if we successfully parsed the net range and there is no + * address, or if we fail to parse the net range then return + */ + if ((!rc && !found) || rc) + return rc; + + return parse_address(&addrrange, *net_type, addr); +} + +/** + * Frees addrrange structures of \a list. + * + * For each struct addrrange structure found on \a list it frees + * cfs_expr_list list attached to it and frees the addrrange itself. + * + * \retval none + */ +static void +free_addrranges(struct list_head *list) +{ + while (!list_empty(list)) { + struct addrrange *ar; + + ar = list_entry(list->next, struct addrrange, ar_link); + + cfs_expr_list_free_list(&ar->ar_numaddr_ranges); + list_del(&ar->ar_link); + free(ar); + } +} + +/** + * Frees nidrange strutures of \a list. + * + * For each struct nidrange structure found on \a list it frees + * addrrange list attached to it and frees the nidrange itself. + * + * \retval none + */ +void +cfs_free_nidlist(struct list_head *list) +{ + struct list_head *pos, *next; + struct nidrange *nr; + + list_for_each_safe(pos, next, list) { + nr = list_entry(pos, struct nidrange, nr_link); + free_addrranges(&nr->nr_addrranges); + list_del(pos); + free(nr); + } +} + +/** + * Parses nid range list. + * + * Parses with rigorous syntax and overflow checking \a str into + * \ [ ' ' \ ], compiles \a str into set of + * structures and links that structure to \a nidlist. The resulting + * list can be used to match a NID againts set of NIDS defined by \a + * str. + * \see cfs_match_nid + * + * \retval 1 on success + * \retval 0 otherwise + */ +int +cfs_parse_nidlist(char *str, int len, struct list_head *nidlist) +{ + struct cfs_lstr src; + struct cfs_lstr res; + int rc; + + src.ls_str = str; + src.ls_len = len; + INIT_LIST_HEAD(nidlist); + while (src.ls_str) { + rc = cfs_gettok(&src, ' ', &res); + if (rc == 0) { + cfs_free_nidlist(nidlist); + return 0; + } + rc = parse_nidrange(&res, nidlist); + if (rc == 0) { + cfs_free_nidlist(nidlist); + return 0; + } + } + return 1; +} + +/** + * Matches a nid (\a nid) against the compiled list of nidranges (\a nidlist). + * + * \see cfs_parse_nidlist() + * + * \retval 1 on match + * \retval 0 otherwises + */ +int cfs_match_nid(lnet_nid_t nid, struct list_head *nidlist) +{ + struct nidrange *nr; + struct addrrange *ar; + + list_for_each_entry(nr, nidlist, nr_link) { + if (nr->nr_netstrfns->nf_type != LNET_NETTYP(LNET_NIDNET(nid))) + continue; + if (nr->nr_netnum != LNET_NETNUM(LNET_NIDNET(nid))) + continue; + if (nr->nr_all) + return 1; + list_for_each_entry(ar, &nr->nr_addrranges, ar_link) + if (nr->nr_netstrfns->nf_match_addr(LNET_NIDADDR(nid), + &ar->ar_numaddr_ranges)) + return 1; + } + return 0; +} + +int +cfs_match_net(__u32 net_id, __u32 net_type, struct list_head *net_num_list) +{ + __u32 net_num; + + if (!net_num_list) + return 0; + + if (net_type != LNET_NETTYP(net_id)) + return 0; + + net_num = LNET_NETNUM(net_id); + + /* + * if there is a net number but the list passed in is empty, then + * there is no match. + */ + if (!net_num && list_empty(net_num_list)) + return 1; + else if (list_empty(net_num_list)) + return 0; + + if (!libcfs_num_match(net_num, net_num_list)) + return 0; + + return 1; +} + +/** + * Print the network part of the nidrange \a nr into the specified \a buffer. 
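+ *
+ * For example (illustrative), this prints "@tcp" when nr_netnum is 0
+ * and "@tcp1" when nr_netnum is 1.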
+ * + * \retval number of characters written + */ +static int +cfs_print_network(char *buffer, int count, struct nidrange *nr) +{ + struct netstrfns *nf = nr->nr_netstrfns; + + if (nr->nr_netnum == 0) + return scnprintf(buffer, count, "@%s", nf->nf_name); + else + return scnprintf(buffer, count, "@%s%u", + nf->nf_name, nr->nr_netnum); +} + + +/** + * Print a list of addrrange (\a addrranges) into the specified \a buffer. + * At max \a count characters can be printed into \a buffer. + * + * \retval number of characters written + */ +static int +cfs_print_addrranges(char *buffer, int count, struct list_head *addrranges, + struct nidrange *nr) +{ + int i = 0; + struct addrrange *ar; + struct netstrfns *nf = nr->nr_netstrfns; + + list_for_each_entry(ar, addrranges, ar_link) { + if (i != 0) + i += scnprintf(buffer + i, count - i, " "); + i += nf->nf_print_addrlist(buffer + i, count - i, + &ar->ar_numaddr_ranges); + i += cfs_print_network(buffer + i, count - i, nr); + } + return i; +} + +/** + * Print a list of nidranges (\a nidlist) into the specified \a buffer. + * At max \a count characters can be printed into \a buffer. + * Nidranges are separated by a space character. + * + * \retval number of characters written + */ +int cfs_print_nidlist(char *buffer, int count, struct list_head *nidlist) +{ + int i = 0; + struct nidrange *nr; + + if (count <= 0) + return 0; + + list_for_each_entry(nr, nidlist, nr_link) { + if (i != 0) + i += scnprintf(buffer + i, count - i, " "); + + if (nr->nr_all != 0) { + assert(list_empty(&nr->nr_addrranges)); + i += scnprintf(buffer + i, count - i, "*"); + i += cfs_print_network(buffer + i, count - i, nr); + } else { + i += cfs_print_addrranges(buffer + i, count - i, + &nr->nr_addrranges, nr); + } + } + return i; +} + +/** + * Determines minimum and maximum addresses for a single + * numeric address range + * + * \param ar + * \param[out] *min_nid __u32 representation of min NID + * \param[out] *max_nid __u32 representation of max NID + * \retval -EINVAL unsupported LNET range + * \retval -ERANGE non-contiguous LNET range + */ +static int cfs_ip_ar_min_max(struct addrrange *ar, __u32 *min_nid, + __u32 *max_nid) +{ + struct cfs_expr_list *expr_list; + struct cfs_range_expr *range; + unsigned int min_ip[4] = {0}; + unsigned int max_ip[4] = {0}; + int cur_octet = 0; + bool expect_full_octet = false; + + list_for_each_entry(expr_list, &ar->ar_numaddr_ranges, el_link) { + int re_count = 0; + + list_for_each_entry(range, &expr_list->el_exprs, re_link) { + /* XXX: add support for multiple & non-contig. 
re's */ + if (re_count > 0) + return -EINVAL; + + /* if a previous octet was ranged, then all remaining + * octets must be full for contiguous range */ + if (expect_full_octet && (range->re_lo != 0 || + range->re_hi != 255)) + return -ERANGE; + + if (range->re_stride != 1) + return -ERANGE; + + if (range->re_lo > range->re_hi) + return -EINVAL; + + if (range->re_lo != range->re_hi) + expect_full_octet = true; + + min_ip[cur_octet] = range->re_lo; + max_ip[cur_octet] = range->re_hi; + + re_count++; + } + + cur_octet++; + } + + if (min_nid != NULL) + *min_nid = ((min_ip[0] << 24) | (min_ip[1] << 16) | + (min_ip[2] << 8) | min_ip[3]); + + if (max_nid != NULL) + *max_nid = ((max_ip[0] << 24) | (max_ip[1] << 16) | + (max_ip[2] << 8) | max_ip[3]); + + return 0; +} + +/** + * Determines minimum and maximum addresses for a single + * numeric address range + * + * \param ar + * \param[out] *min_nid __u32 representation of min NID + * \param[out] *max_nid __u32 representation of max NID + * \retval -EINVAL unsupported LNET range + */ +static int cfs_num_ar_min_max(struct addrrange *ar, __u32 *min_nid, + __u32 *max_nid) +{ + struct cfs_expr_list *el; + struct cfs_range_expr *re; + unsigned int min_addr = 0; + unsigned int max_addr = 0; + + list_for_each_entry(el, &ar->ar_numaddr_ranges, el_link) { + int re_count = 0; + + list_for_each_entry(re, &el->el_exprs, re_link) { + if (re_count > 0) + return -EINVAL; + if (re->re_lo > re->re_hi) + return -EINVAL; + + if (re->re_lo < min_addr || min_addr == 0) + min_addr = re->re_lo; + if (re->re_hi > max_addr) + max_addr = re->re_hi; + + re_count++; + } + } + + if (min_nid != NULL) + *min_nid = min_addr; + if (max_nid != NULL) + *max_nid = max_addr; + + return 0; +} + +/** + * Takes a linked list of nidrange expressions, determines the minimum + * and maximum nid and creates appropriate nid structures + * + * \param *nidlist + * \param[out] *min_nid string representation of min NID + * \param[out] *max_nid string representation of max NID + * \retval -EINVAL unsupported LNET range + * \retval -ERANGE non-contiguous LNET range + */ +int cfs_nidrange_find_min_max(struct list_head *nidlist, char *min_nid, + char *max_nid, size_t nidstr_length) +{ + struct nidrange *first_nidrange; + int netnum; + struct netstrfns *nf; + char *lndname; + __u32 min_addr; + __u32 max_addr; + char min_addr_str[IPSTRING_LENGTH]; + char max_addr_str[IPSTRING_LENGTH]; + int rc; + + first_nidrange = list_entry(nidlist->next, struct nidrange, nr_link); + + netnum = first_nidrange->nr_netnum; + nf = first_nidrange->nr_netstrfns; + lndname = nf->nf_name; + + rc = nf->nf_min_max(nidlist, &min_addr, &max_addr); + if (rc < 0) + return rc; + + nf->nf_addr2str(min_addr, min_addr_str, sizeof(min_addr_str)); + nf->nf_addr2str(max_addr, max_addr_str, sizeof(max_addr_str)); + + snprintf(min_nid, nidstr_length, "%s@%s%d", min_addr_str, lndname, + netnum); + snprintf(max_nid, nidstr_length, "%s@%s%d", max_addr_str, lndname, + netnum); + + return 0; +} + +/** + * Determines the min and max NID values for num LNDs + * + * \param *nidlist + * \param[out] *min_nid if provided, returns string representation of min NID + * \param[out] *max_nid if provided, returns string representation of max NID + * \retval -EINVAL unsupported LNET range + * \retval -ERANGE non-contiguous LNET range + */ +static int cfs_num_min_max(struct list_head *nidlist, __u32 *min_nid, + __u32 *max_nid) +{ + struct nidrange *nr; + struct addrrange *ar; + unsigned int tmp_min_addr = 0; + unsigned int tmp_max_addr = 0; + unsigned int 
min_addr = 0;
+	unsigned int max_addr = 0;
+	int nidlist_count = 0;
+	int rc;
+
+	list_for_each_entry(nr, nidlist, nr_link) {
+		if (nidlist_count > 0)
+			return -EINVAL;
+
+		list_for_each_entry(ar, &nr->nr_addrranges, ar_link) {
+			rc = cfs_num_ar_min_max(ar, &tmp_min_addr,
+						&tmp_max_addr);
+			if (rc < 0)
+				return rc;
+
+			if (tmp_min_addr < min_addr || min_addr == 0)
+				min_addr = tmp_min_addr;
+			if (tmp_max_addr > max_addr)
+				max_addr = tmp_max_addr;
+		}
+
+		nidlist_count++;
+	}
+	if (max_nid != NULL)
+		*max_nid = max_addr;
+	if (min_nid != NULL)
+		*min_nid = min_addr;
+
+	return 0;
+}
+
+/**
+ * Takes an nidlist and determines the minimum and maximum
+ * ip addresses.
+ *
+ * \param *nidlist
+ * \param[out] *min_nid if provided, returns __u32 representation of min NID
+ * \param[out] *max_nid if provided, returns __u32 representation of max NID
+ * \retval -EINVAL unsupported LNET range
+ * \retval -ERANGE non-contiguous LNET range
+ */
+static int cfs_ip_min_max(struct list_head *nidlist, __u32 *min_nid,
+			  __u32 *max_nid)
+{
+	struct nidrange *nr;
+	struct addrrange *ar;
+	__u32 tmp_min_ip_addr = 0;
+	__u32 tmp_max_ip_addr = 0;
+	__u32 min_ip_addr = 0;
+	__u32 max_ip_addr = 0;
+	int nidlist_count = 0;
+	int rc;
+
+	list_for_each_entry(nr, nidlist, nr_link) {
+		if (nidlist_count > 0)
+			return -EINVAL;
+
+		if (nr->nr_all) {
+			min_ip_addr = 0;
+			max_ip_addr = 0xffffffff;
+			break;
+		}
+
+		list_for_each_entry(ar, &nr->nr_addrranges, ar_link) {
+			rc = cfs_ip_ar_min_max(ar, &tmp_min_ip_addr,
+					       &tmp_max_ip_addr);
+			if (rc < 0)
+				return rc;
+
+			if (tmp_min_ip_addr < min_ip_addr || min_ip_addr == 0)
+				min_ip_addr = tmp_min_ip_addr;
+			if (tmp_max_ip_addr > max_ip_addr)
+				max_ip_addr = tmp_max_ip_addr;
+		}
+
+		nidlist_count++;
+	}
+
+	if (max_nid != NULL)
+		*max_nid = max_ip_addr;
+	if (min_nid != NULL)
+		*min_nid = min_ip_addr;
+
+	return 0;
+}
+
+static int
+libcfs_expand_nidrange(struct nidrange *nr, __u32 *addrs, int max_nids)
+{
+	struct addrrange *ar;
+	int rc = 0, count = max_nids;
+	struct netstrfns *nf = nr->nr_netstrfns;
+
+	list_for_each_entry(ar, &nr->nr_addrranges, ar_link) {
+		rc = nf->nf_expand_addrrange(&ar->ar_numaddr_ranges, addrs,
+					     count);
+		if (rc < 0)
+			return rc;
+
+		count -= rc;
+	}
+
+	return max_nids - count;
+}
+
+int cfs_expand_nidlist(struct list_head *nidlist, lnet_nid_t *lnet_nidlist,
+		       int max_nids)
+{
+	struct nidrange *nr;
+	int rc = 0, count = max_nids;
+	int i, j = 0;
+	__u32 *addrs;
+	struct netstrfns *nf;
+	__u32 net;
+
+	addrs = calloc(max_nids, sizeof(__u32));
+	if (!addrs)
+		return -ENOMEM;
+
+	list_for_each_entry(nr, nidlist, nr_link) {
+		rc = libcfs_expand_nidrange(nr, addrs, count);
+
+		if (rc < 0) {
+			free(addrs);
+			return rc;
+		}
+
+		nf = nr->nr_netstrfns;
+		net = LNET_MKNET(nf->nf_type, nr->nr_netnum);
+
+		for (i = count - 1; i >= count - rc; i--)
+			lnet_nidlist[j++] = LNET_MKNID(net, addrs[i]);
+
+		count -= rc;
+	}
+
+	free(addrs);
+	return max_nids - count;
+}
diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/util/param.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/param.c
new file mode 100644
index 0000000000000..18fe84dc53f6a
--- /dev/null
+++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/param.c
@@ -0,0 +1,155 @@
+/*
+ * LGPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * All rights reserved. This program and the accompanying materials
+ * are made available under the terms of the GNU Lesser General Public License
+ * (LGPL) version 2.1 or (at your discretion) any later version.
+ * (LGPL) version 2.1 accompanies this distribution, and is available at + * http://www.gnu.org/licenses/lgpl-2.1.html + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * LGPL HEADER END + */ +/* + * libcfs/libcfs/utils/param.c + * + * This code handles user interaction with the configuration interface + * to the Lustre file system to fine tune it. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * Get parameter path matching the pattern + * + * \param[out] paths glob_t structure used to hold the final result + * \param[in] pattern the pattern containing sprintf format specifiers + * which will be used to create the path to match + * + * The \param pattern is appended to the default path glob to complete the + * absolute path to the file the caller is requesting. If the results point + * to one or more files that exist those results are stored in the \param + * paths glob_t structure that is passed by the caller. + * + * Lustre tunables traditionally were in /proc/{sys,fs}/{lnet,lustre} + * but in upstream kernels starting with Linux 4.2 these parameters + * have been moved to /sys/fs/lustre and /sys/kernel/debug/{lnet,lustre} + * so the user tools need to check both locations. + * + * \retval 0 for success, with results stored in \param paths. + * \retval -1 for failure with errno set to report the reason. + */ +int +cfs_get_param_paths(glob_t *paths, const char *pattern, ...) +{ + char topdir[PATH_MAX] = "{/sys/{fs,kernel/debug}/{lnet,lustre}," + "/proc/{fs,sys}/{lnet,lustre}}"; + static bool test_mounted = false; + char path[PATH_MAX]; + char buf[PATH_MAX]; + struct statfs statfsbuf; + va_list args; + int rc; + + + if (test_mounted) + goto skip_mounting; + test_mounted = true; + + rc = statfs("/sys/kernel/debug/", &statfsbuf); + if (rc == 0 && statfsbuf.f_type == DEBUGFS_MAGIC) + goto skip_mounting; + + if (mount("none", "/sys/kernel/debug", "debugfs", 0, "") == -1) { + /* Already mounted or don't have permission to mount is okay */ + if (errno != EPERM && errno != EBUSY) + fprintf(stderr, "Warning: failed to mount debug: %s\n", + strerror(errno)); + } else { + struct stat mtab; + + /* This is all for RHEL6 which is old school. Can be removed + * later when RHEL6 client support is dropped. 
*/ + rc = lstat(_PATH_MOUNTED, &mtab); + if (!rc && !S_ISLNK(mtab.st_mode)) { + FILE *fp = setmntent(_PATH_MOUNTED, "r+"); + + if (fp != NULL) { + const struct mntent fs = { + .mnt_fsname = "debugfs", + .mnt_dir = "/sys/kernel/debug", + .mnt_type = "debugfs", + .mnt_opts = "rw,relatime", + }; + + rc = addmntent(fp, &fs); + if (rc) { + fprintf(stderr, + "failed to add debugfs to %s: %s\n", + _PATH_MOUNTED, strerror(errno)); + } + endmntent(fp); + } else { + fprintf(stderr, "could not open %s: %s\n", + _PATH_MOUNTED, strerror(errno)); + } + } + } +skip_mounting: + va_start(args, pattern); + rc = vsnprintf(buf, sizeof(buf), pattern, args); + va_end(args); + if (rc < 0) { + return rc; + } else if (rc >= sizeof(buf)) { + errno = EINVAL; + return -1; + } + + if (snprintf(path, sizeof(path), "%s/%s", topdir, buf) >= + sizeof(path)) { + errno = E2BIG; + return -1; + } + + rc = glob(path, GLOB_BRACE, NULL, paths); + if (rc != 0) { + switch (rc) { + case GLOB_NOSPACE: + errno = ENOMEM; + break; + case GLOB_ABORTED: + errno = ENODEV; + break; + case GLOB_NOMATCH: + default: + errno = ENOENT; + break; + } + rc = -1; + } + + return rc; +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/util/parser.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/parser.c new file mode 100644 index 0000000000000..c5c8947ef3d7d --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/parser.c @@ -0,0 +1,850 @@ +/* + * Copyright (C) 2001 Cluster File Systems, Inc. + * + * Copyright (c) 2014, 2017, Intel Corporation. + * + * This file is part of Lustre, http://www.sf.net/projects/lustre/ + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_LIBREADLINE +# include +# include +#endif /* HAVE_LIBREADLINE */ +#include +#include + +#include +#include + +/* Top level of commands, initialized by InitParser */ +static command_t *top_level; +/* Parser prompt, set by InitParser */ +static char *parser_prompt; +/* Set to 1 if user types exit or quit */ +static int done; +/* + * Normally, the parser will quit when an error occurs in non-interacive + * mode. Setting this to non-zero will force it to keep buggering on. 
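+ * (For example, a failing command in a piped, non-interactive script
+ * will then no longer abort the remaining commands.)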
+ */ +static int ignore_errors; + +/* static functions */ +static char *skipwhitespace(char *s); +static char *skiptowhitespace(char *s); +static command_t *find_cmd(char *name, command_t cmds[], char **next); +static int process(char *s, char **next, command_t *lookup, command_t **result, + char **prev); + +static char *skipwhitespace(char *s) +{ + char *t; + int len; + + len = (int)strlen(s); + for (t = s; t <= s + len && isspace(*t); t++) + ; + return t; +} + +static char *skiptowhitespace(char *s) +{ + char *t; + + for (t = s; *t && !isspace(*t); t++) + ; + return t; +} + +static int line2args(char *line, char **argv, int maxargs) +{ + char *arg; + int i = 0; + + arg = strtok(line, " \t"); + if (!arg || maxargs < 1) + return 0; + + argv[i++] = arg; + while ((arg = strtok(NULL, " \t")) != NULL && i < maxargs) + argv[i++] = arg; + return i; +} + +/* find a command -- return it if unique otherwise print alternatives */ +static command_t *Parser_findargcmd(char *name, command_t cmds[]) +{ + command_t *cmd; + + for (cmd = cmds; cmd->pc_name; cmd++) { + if (strcmp(name, cmd->pc_name) == 0) + return cmd; + } + return NULL; +} + +void Parser_ignore_errors(int ignore) +{ + ignore_errors = ignore; +} + +int Parser_execarg(int argc, char **argv, command_t cmds[]) +{ + command_t *cmd; + + cmd = Parser_findargcmd(argv[0], cmds); + if (cmd && cmd->pc_func) { + int rc = cmd->pc_func(argc, argv); + + if (rc == CMD_HELP) + fprintf(stderr, "%s\n", cmd->pc_help); + return rc; + } + printf("Try interactive use without arguments or use one of:\n"); + for (cmd = cmds; cmd->pc_name; cmd++) + printf("\"%s\"\n", cmd->pc_name); + printf("as argument.\n"); + + return -1; +} + +/* + * Returns the command_t * (NULL if not found) corresponding to a + * _partial_ match with the first token in name. It sets *next to + * point to the following token. Does not modify *name. + */ +static command_t *find_cmd(char *name, command_t cmds[], char **next) +{ + int i, len; + + if (!cmds || !name) + return NULL; + + /* + * This sets name to point to the first non-white space character, + * and next to the first whitespace after name, len to the length: do + * this with strtok + */ + name = skipwhitespace(name); + *next = skiptowhitespace(name); + len = (int)(*next - name); + if (len == 0) + return NULL; + + for (i = 0; cmds[i].pc_name; i++) { + if (strncasecmp(name, cmds[i].pc_name, len) == 0) { + *next = skipwhitespace(*next); + return &cmds[i]; + } + } + return NULL; +} + +/* + * Recursively process a command line string s and find the command + * corresponding to it. This can be ambiguous, full, incomplete, + * non-existent. + */ +static int process(char *s, char **next, command_t *lookup, + command_t **result, char **prev) +{ + *result = find_cmd(s, lookup, next); + *prev = s; + + /* non existent */ + if (!*result) + return CMD_NONE; + + /* + * found entry: is it ambigous, i.e. not exact command name and + * more than one command in the list matches. 
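+ * For example (a hypothetical command table), with commands "list" and
+ * "load" the input "l" is ambiguous, while "li" uniquely selects "list".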
Note that find_cmd + * points to the first ambiguous entry + */ + if (strncasecmp(s, (*result)->pc_name, strlen((*result)->pc_name))) { + char *another_next; + int found_another = 0; + + command_t *another_result = find_cmd(s, (*result) + 1, + &another_next); + while (another_result) { + if (strncasecmp(s, another_result->pc_name, + strlen(another_result->pc_name)) == 0) { + *result = another_result; + *next = another_next; + goto got_it; + } + another_result = find_cmd(s, another_result + 1, + &another_next); + found_another = 1; + } + if (found_another) + return CMD_AMBIG; + } + +got_it: + /* found a unique command: component or full? */ + if ((*result)->pc_func) + return CMD_COMPLETE; + + if (**next == '\0') + return CMD_INCOMPLETE; + return process(*next, next, (*result)->pc_sub_cmd, + result, prev); +} + +#ifdef HAVE_LIBREADLINE +static command_t *match_tbl; /* Command completion against this table */ +static char *command_generator(const char *text, int state) +{ + static int index, len; + char *name; + + /* Do we have a match table? */ + if (!match_tbl) + return NULL; + + /* If this is the first time called on this word, state is 0 */ + if (!state) { + index = 0; + len = (int)strlen(text); + } + + /* Return next name in the command list that paritally matches test */ + while ((name = (match_tbl + index)->pc_name)) { + index++; + + if (strncasecmp(name, text, len) == 0) + return strdup(name); + } + + /* No more matches */ + return NULL; +} + +/* probably called by readline */ +static char **command_completion(const char *text, int start, int end) +{ + command_t *table; + char *pos; + + match_tbl = top_level; + + for (table = find_cmd(rl_line_buffer, match_tbl, &pos); + table; table = find_cmd(pos, match_tbl, &pos)) { + if (*(pos - 1) == ' ') + match_tbl = table->pc_sub_cmd; + } + + return rl_completion_matches(text, command_generator); +} +#endif + +/* take a string and execute the function or print help */ +int execute_line(char *line) +{ + command_t *cmd, *ambig; + char *prev; + char *next, *tmp; + char *argv[MAXARGS]; + int i; + int rc = 0; + + switch (process(line, &next, top_level, &cmd, &prev)) { + case CMD_AMBIG: + fprintf(stderr, "Ambiguous command \'%s\'\nOptions: ", line); + while ((ambig = find_cmd(prev, cmd, &tmp))) { + fprintf(stderr, "%s ", ambig->pc_name); + cmd = ambig + 1; + } + fprintf(stderr, "\n"); + break; + case CMD_NONE: + fprintf(stderr, "No such command, type help\n"); + break; + case CMD_INCOMPLETE: + fprintf(stderr, + "'%s' incomplete command. Use '%s x' where x is one of:\n", + line, line); + fprintf(stderr, "\t"); + for (i = 0; cmd->pc_sub_cmd[i].pc_name; i++) + fprintf(stderr, "%s ", cmd->pc_sub_cmd[i].pc_name); + fprintf(stderr, "\n"); + break; + case CMD_COMPLETE: + optind = 0; + i = line2args(line, argv, MAXARGS); + rc = cmd->pc_func(i, argv); + + if (rc == CMD_HELP) + fprintf(stderr, "%s\n", cmd->pc_help); + + break; + } + + return rc; +} + +#ifdef HAVE_LIBREADLINE +static void noop_int_fn(int unused) { } +static void noop_void_fn(void) { } +#endif + +/* + * just in case you're ever in an airplane and discover you + * forgot to install readline-dev. 
:) + */ +static int init_input(void) +{ + int interactive = isatty(fileno(stdin)); + +#ifdef HAVE_LIBREADLINE + using_history(); + stifle_history(HISTORY); + + if (!interactive) { + rl_prep_term_function = noop_int_fn; + rl_deprep_term_function = noop_void_fn; + } + + rl_attempted_completion_function = command_completion; + rl_completion_entry_function = command_generator; +#endif + return interactive; +} + +#ifndef HAVE_LIBREADLINE +#define add_history(s) +char *readline(char *prompt) +{ + int size = 2048; + char *line = malloc(size); + char *ptr = line; + int c; + int eof = 0; + + if (!line) + return NULL; + if (prompt) + printf("%s", prompt); + + while (1) { + if ((c = fgetc(stdin)) != EOF) { + if (c == '\n') + goto out; + *ptr++ = (char)c; + + if (ptr - line >= size - 1) { + char *tmp; + + size *= 2; + tmp = malloc(size); + if (!tmp) + goto outfree; + memcpy(tmp, line, ptr - line); + ptr = tmp + (ptr - line); + free(line); + line = tmp; + } + } else { + eof = 1; + if (ferror(stdin) || feof(stdin)) + goto outfree; + goto out; + } + } +out: + *ptr = 0; + if (eof && (strlen(line) == 0)) { + free(line); + line = NULL; + } + return line; +outfree: + free(line); + return NULL; +} +#endif + +/* this is the command execution machine */ +int Parser_commands(void) +{ + char *line, *s; + int rc = 0, save_error = 0; + int interactive; + + interactive = init_input(); + + while (!done) { + line = readline(interactive ? parser_prompt : NULL); + + if (!line) + break; + + s = skipwhitespace(line); + + if (*s) { + add_history(s); + rc = execute_line(s); + } + /* stop on error if not-interactive */ + if (rc != 0 && !interactive) { + if (save_error == 0) + save_error = rc; + if (!ignore_errors) + done = 1; + } + free(line); + } + + if (save_error) + rc = save_error; + + return rc; +} + +/* sets the parser prompt */ +void Parser_init(char *prompt, command_t *cmds) +{ + done = 0; + top_level = cmds; + if (parser_prompt) + free(parser_prompt); + parser_prompt = strdup(prompt); +} + +/* frees the parser prompt */ +void Parser_exit(int argc, char *argv[]) +{ + done = 1; + free(parser_prompt); + parser_prompt = NULL; +} + +/* convert a string to an integer */ +int Parser_int(char *s, int *val) +{ + int ret; + + if (*s != '0') { + ret = sscanf(s, "%d", val); + } else if (*(s + 1) != 'x') { + ret = sscanf(s, "%o", val); + } else { + s++; + ret = sscanf(++s, "%x", val); + } + + return ret; +} + +void Parser_qhelp(int argc, char *argv[]) +{ + printf("usage: %s [COMMAND] [OPTIONS]... [ARGS]\n", + program_invocation_short_name); + printf("Without any parameters, interactive mode is invoked\n"); + + printf("Try '%s help ' or '%s --list-commands' for more information\n", + program_invocation_short_name, program_invocation_short_name); +} + +int Parser_help(int argc, char **argv) +{ + char line[1024]; + char *next, *prev, *tmp; + command_t *result, *ambig; + int i; + + if (argc == 1) { + Parser_qhelp(argc, argv); + return 0; + } + + /* + * Joining command line arguments without space is not critical here + * because of this string is used for search a help topic and assume + * that only one argument will be (the name of topic). For example: + * lst > help ping run + * pingrun: Unknown command. + */ + line[0] = '\0'; + for (i = 1; i < argc; i++) { + if (strlen(argv[i]) >= sizeof(line) - strlen(line)) + return -E2BIG; + /* + * The function strlcat() cannot be used here because of + * this function is used in LNet utils that is not linked + * with libcfs.a. 
+ */ + strncat(line, argv[i], sizeof(line) - strlen(line)); + } + + switch (process(line, &next, top_level, &result, &prev)) { + case CMD_COMPLETE: + fprintf(stderr, "%s: %s\n", line, result->pc_help); + break; + case CMD_NONE: + fprintf(stderr, "%s: Unknown command.\n", line); + break; + case CMD_INCOMPLETE: + fprintf(stderr, + "'%s' incomplete command. Use '%s x' where x is one of:\n", + line, line); + fprintf(stderr, "\t"); + for (i = 0; result->pc_sub_cmd[i].pc_name; i++) + fprintf(stderr, "%s ", result->pc_sub_cmd[i].pc_name); + fprintf(stderr, "\n"); + break; + case CMD_AMBIG: + fprintf(stderr, "Ambiguous command \'%s\'\nOptions: ", line); + while ((ambig = find_cmd(prev, result, &tmp))) { + fprintf(stderr, "%s ", ambig->pc_name); + result = ambig + 1; + } + fprintf(stderr, "\n"); + break; + } + return 0; +} + +void Parser_printhelp(char *cmd) +{ + char *argv[] = { "help", cmd }; + + Parser_help(2, argv); +} + +/* COMMANDS */ + +/** + * Parser_list_commands() - Output a list of the supported commands. + * @cmdlist: Array of structures describing the commands. + * @buffer: String buffer used to temporarily store the output text. + * @buf_size: Length of the string buffer. + * @parent_cmd: When called recursively, contains the name of the parent cmd. + * @col_start: Column where printing should begin. + * @col_num: The number of commands printed in a single row. + * + * The commands and subcommands supported by the utility are printed, arranged + * into several columns for readability. If a command supports subcommands, the + * function is called recursively, and the name of the parent command is + * supplied so that it can be prepended to the names of the subcommands. + * + * Return: The number of items that were printed. + */ +int Parser_list_commands(const command_t *cmdlist, char *buffer, + size_t buf_size, const char *parent_cmd, + int col_start, int col_num) +{ + int col = col_start; + int char_max; + int len; + int count = 0; + int rc; + + if (col_start >= col_num) + return 0; + + char_max = (buf_size - 1) / col_num; /* Reserve 1 char for NUL */ + + for (; cmdlist->pc_name; cmdlist++) { + if (!cmdlist->pc_func && !cmdlist->pc_sub_cmd) + break; + count++; + if (parent_cmd) + len = snprintf(&buffer[col * char_max], + char_max + 1, "%s %s", parent_cmd, + cmdlist->pc_name); + else + len = snprintf(&buffer[col * char_max], + char_max + 1, "%s", cmdlist->pc_name); + + /* Add trailing spaces to pad the entry to the column size */ + if (len < char_max) { + snprintf(&buffer[col * char_max] + len, + char_max - len + 1, "%*s", char_max - len, + " "); + } else { + buffer[(col + 1) * char_max - 1] = ' '; + } + + col++; + if (col >= col_num) { + fprintf(stdout, "%s\n", buffer); + col = 0; + buffer[0] = '\0'; + } + + if (cmdlist->pc_sub_cmd) { + rc = Parser_list_commands(cmdlist->pc_sub_cmd, buffer, + buf_size, cmdlist->pc_name, + col, col_num); + col = (col + rc) % col_num; + count += rc; + } + } + if (!parent_cmd && col != 0) + fprintf(stdout, "%s\n", buffer); + return count; +} + +char *Parser_getstr(const char *prompt, const char *deft, char *res, + size_t len) +{ + char *line = NULL; + int size = strlen(prompt) + strlen(deft) + 8; + char *theprompt; + + theprompt = malloc(size); + assert(theprompt); + + snprintf(theprompt, size, "%s [%s]: ", prompt, deft); + + line = readline(theprompt); + free(theprompt); + + /* + * The function strlcpy() cannot be used here because of + * this function is used in LNet utils that is not linked + * with libcfs.a. 
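+ *
+ * Hypothetical usage: Parser_getstr("device", "/dev/sda", buf,
+ * sizeof(buf)) prompts with "device [/dev/sda]: " and copies the
+ * default into \a res when the user just presses enter.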
+ */ + if (!line || *line == '\0') + strncpy(res, deft, len); + else + strncpy(res, line, len); + res[len - 1] = '\0'; + + if (line) { + free(line); + return res; + } + return NULL; +} + +/* get integer from prompt, loop forever to get it */ +int Parser_getint(const char *prompt, long min, long max, long deft, int base) +{ + int rc; + long result; + char *line; + int size = strlen(prompt) + 40; + char *theprompt = malloc(size); + + assert(theprompt); + snprintf(theprompt, size, "%s [%ld, (0x%lx)]: ", prompt, deft, deft); + fflush(stdout); + + do { + line = NULL; + line = readline(theprompt); + if (!line) { + fprintf(stdout, "Please enter an integer.\n"); + fflush(stdout); + continue; + } + if (*line == '\0') { + free(line); + result = deft; + break; + } + rc = Parser_arg2int(line, &result, base); + free(line); + if (rc != 0) { + fprintf(stdout, "Invalid string.\n"); + fflush(stdout); + } else if (result > max || result < min) { + fprintf(stdout, + "Error: response must lie between %ld and %ld.\n", + min, max); + fflush(stdout); + } else { + break; + } + } while (1); + + if (theprompt) + free(theprompt); + return result; +} + +/* get boolean (starting with YyNn; loop forever */ +int Parser_getbool(const char *prompt, int deft) +{ + int result = 0; + char *line; + int size = strlen(prompt) + 8; + char *theprompt = malloc(size); + + assert(theprompt); + fflush(stdout); + + if (deft != 0 && deft != 1) { + fprintf(stderr, "Error: Parser_getbool given bad default %d\n", + deft); + assert(0); + } + snprintf(theprompt, size, "%s [%s]: ", prompt, (deft == 0) ? "N" : "Y"); + + do { + line = NULL; + line = readline(theprompt); + if (!line) { + result = deft; + break; + } + if (*line == '\0') { + result = deft; + break; + } + if (*line == 'y' || *line == 'Y') { + result = 1; + break; + } + if (*line == 'n' || *line == 'N') { + result = 0; + break; + } + if (line) + free(line); + fprintf(stdout, "Invalid string. Must start with yY or nN\n"); + fflush(stdout); + } while (1); + + if (line) + free(line); + if (theprompt) + free(theprompt); + + return result; +} + +/* parse int out of a string or prompt for it */ +long Parser_intarg(const char *inp, const char *prompt, int deft, + int min, int max, int base) +{ + long result; + int rc; + + rc = Parser_arg2int(inp, &result, base); + if (rc == 0) + return result; + else + return Parser_getint(prompt, deft, min, max, base); +} + +/* parse int out of a string or prompt for it */ +char *Parser_strarg(char *inp, const char *prompt, const char *deft, + char *answer, int len) +{ + if (!inp || *inp == '\0') + return Parser_getstr(prompt, deft, answer, len); + else + return inp; +} + +/* + * change a string into a number: return 0 on success. No invalid characters + * allowed. 
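+ * For example, "0x1f" with base 0 yields 31, while "31abc" is
+ * rejected because of the trailing characters.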
The processing of base and validity follows strtol(3) + */ +int Parser_arg2int(const char *inp, long *result, int base) +{ + char *endptr; + + if ((base != 0) && (base < 2 || base > 36)) + return 1; + + *result = strtol(inp, &endptr, base); + + if (*inp != '\0' && *endptr == '\0') + return 0; + else + return 1; +} + +/* Convert human readable size string to and int; "1k" -> 1000 */ +int Parser_size(unsigned long *sizep, char *str) +{ + unsigned long size; + char mod[32]; + + switch (sscanf(str, "%lu%1[gGmMkK]", &size, mod)) { + default: + return -1; + case 1: + *sizep = size; + return 0; + case 2: + switch (*mod) { + case 'g': + case 'G': + *sizep = size << 30; + return 0; + case 'm': + case 'M': + *sizep = size << 20; + return 0; + case 'k': + case 'K': + *sizep = size << 10; + return 0; + default: + *sizep = size; + return 0; + } + } +} + +/* Convert a string boolean to an int; "enable" -> 1 */ +int Parser_bool(int *b, char *str) +{ + if (!strcasecmp(str, "no") || !strcasecmp(str, "n") || + !strcasecmp(str, "off") || !strcasecmp(str, "down") || + !strcasecmp(str, "disable")) { + *b = 0; + return 0; + } + + if (!strcasecmp(str, "yes") || !strcasecmp(str, "y") || + !strcasecmp(str, "on") || !strcasecmp(str, "up") || + !strcasecmp(str, "enable")) { + *b = 1; + return 0; + } + + return -1; +} + +int Parser_quit(int argc, char **argv) +{ + argc = argc; + argv = argv; + done = 1; + return 0; +} + +int Parser_version(int argc, char **argv) +{ + fprintf(stdout, "%s %s\n", program_invocation_short_name, + LUSTRE_VERSION_STRING); + return 0; +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/util/string.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/string.c new file mode 100644 index 0000000000000..700f002d721df --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/string.c @@ -0,0 +1,526 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * String manipulation functions. + * + * libcfs/libcfs/util/string.c + * + * Author: Nathan Rutman + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * Extracts tokens from strings. + * + * Looks for \a delim in string \a next, sets \a res to point to + * substring before the delimiter, sets \a next right after the found + * delimiter. 
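+ *
+ * A minimal usage sketch (hypothetical values, not part of the
+ * original documentation):
+ *
+ *	struct cfs_lstr next = { .ls_str = "tcp0 tcp1", .ls_len = 9 };
+ *	struct cfs_lstr res;
+ *
+ *	while (cfs_gettok(&next, ' ', &res))
+ *		printf("%.*s\n", res.ls_len, res.ls_str);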
+ * + * \retval 1 if \a res points to a string of non-whitespace characters + * \retval 0 otherwise + */ +int +cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res) +{ + char *end; + + if (next->ls_str == NULL) + return 0; + + /* skip leading white spaces */ + while (next->ls_len) { + if (!isspace(*next->ls_str)) + break; + next->ls_str++; + next->ls_len--; + } + + if (next->ls_len == 0) /* whitespaces only */ + return 0; + + if (*next->ls_str == delim) { + /* first non-writespace is the delimiter */ + return 0; + } + + res->ls_str = next->ls_str; + end = memchr(next->ls_str, delim, next->ls_len); + if (end == NULL) { + /* there is no the delimeter in the string */ + end = next->ls_str + next->ls_len; + next->ls_str = NULL; + next->ls_len = 0; + } else { + next->ls_str = end + 1; + next->ls_len -= (end - res->ls_str + 1); + } + + /* skip ending whitespaces */ + while (--end != res->ls_str) { + if (!isspace(*end)) + break; + } + + res->ls_len = end - res->ls_str + 1; + return 1; +} + +/** + * Converts string to integer. + * + * Accepts decimal and hexadecimal number recordings. + * + * \retval 1 if first \a nob chars of \a str convert to decimal or + * hexadecimal integer in the range [\a min, \a max] + * \retval 0 otherwise + */ +int +cfs_str2num_check(char *str, int nob, unsigned *num, + unsigned min, unsigned max) +{ + char *endp; + + *num = strtoul(str, &endp, 0); + if (endp == str) + return 0; + + for (; endp < str + nob; endp++) { + if (!isspace(*endp)) + return 0; + } + + return (*num >= min && *num <= max); +} + +/** + * Parses \ token of the syntax. If \a bracketed is false, + * \a src should only have a single token which can be \ or \* + * + * \retval pointer to allocated range_expr and initialized + * range_expr::re_lo, range_expr::re_hi and range_expr:re_stride if \a + * src parses to + * \ | + * \ '-' \ | + * \ '-' \ '/' \ + * \retval 0 will be returned if it can be parsed, otherwise -EINVAL or + * -ENOMEM will be returned. + */ +static int +cfs_range_expr_parse(struct cfs_lstr *src, unsigned min, unsigned max, + int bracketed, struct cfs_range_expr **expr) +{ + struct cfs_range_expr *re; + struct cfs_lstr tok; + + re = calloc(1, sizeof(*re)); + if (re == NULL) + return -ENOMEM; + + if (src->ls_len == 1 && src->ls_str[0] == '*') { + re->re_lo = min; + re->re_hi = max; + re->re_stride = 1; + goto out; + } + + if (cfs_str2num_check(src->ls_str, src->ls_len, + &re->re_lo, min, max)) { + /* is parsed */ + re->re_hi = re->re_lo; + re->re_stride = 1; + goto out; + } + + if (!bracketed || !cfs_gettok(src, '-', &tok)) + goto failed; + + if (!cfs_str2num_check(tok.ls_str, tok.ls_len, + &re->re_lo, min, max)) + goto failed; + + /* - */ + if (cfs_str2num_check(src->ls_str, src->ls_len, + &re->re_hi, min, max)) { + /* - is parsed */ + re->re_stride = 1; + goto out; + } + + /* go to check '-' '/' */ + if (cfs_gettok(src, '/', &tok)) { + if (!cfs_str2num_check(tok.ls_str, tok.ls_len, + &re->re_hi, min, max)) + goto failed; + + /* - / ... */ + if (cfs_str2num_check(src->ls_str, src->ls_len, + &re->re_stride, min, max)) { + /* - / is parsed */ + goto out; + } + } + + out: + *expr = re; + return 0; + + failed: + free(re); + return -EINVAL; +} + +/** + * Print the range expression \a re into specified \a buffer. + * If \a bracketed is true, expression does not need additional + * brackets. 
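+ *
+ * For example, an expression with re_lo 0, re_hi 255 and re_stride 2
+ * prints as "0-255/2" when \a bracketed is true and as "[0-255/2]"
+ * otherwise.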
+ * + * \retval number of characters written + */ +static int +cfs_range_expr_print(char *buffer, int count, struct cfs_range_expr *expr, + bool bracketed) +{ + int i; + char s[] = "["; + char e[] = "]"; + + if (bracketed) + s[0] = e[0] = '\0'; + + if (expr->re_lo == expr->re_hi) + i = snprintf(buffer, count, "%u", expr->re_lo); + else if (expr->re_stride == 1) + i = snprintf(buffer, count, "%s%u-%u%s", + s, expr->re_lo, expr->re_hi, e); + else + i = snprintf(buffer, count, "%s%u-%u/%u%s", + s, expr->re_lo, expr->re_hi, + expr->re_stride, e); + return i; +} + +/** + * Print a list of range expressions (\a expr_list) into specified \a buffer. + * If the list contains several expressions, separate them with comma + * and surround the list with brackets. + * + * \retval number of characters written + */ +int +cfs_expr_list_print(char *buffer, int count, struct cfs_expr_list *expr_list) +{ + struct cfs_range_expr *expr; + int i = 0, j = 0; + int numexprs = 0; + + if (count <= 0) + return 0; + + list_for_each_entry(expr, &expr_list->el_exprs, re_link) + numexprs++; + + if (numexprs > 1) + i += scnprintf(buffer + i, count - i, "["); + + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + if (j++ != 0) + i += scnprintf(buffer + i, count - i, ","); + i += cfs_range_expr_print(buffer + i, count - i, expr, + numexprs > 1); + } + + if (numexprs > 1) + i += scnprintf(buffer + i, count - i, "]"); + + return i; +} + +/** + * Matches value (\a value) against ranges expression list \a expr_list. + * + * \retval 1 if \a value matches + * \retval 0 otherwise + */ +int +cfs_expr_list_match(__u32 value, struct cfs_expr_list *expr_list) +{ + struct cfs_range_expr *expr; + + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + if (value >= expr->re_lo && value <= expr->re_hi && + ((value - expr->re_lo) % expr->re_stride) == 0) + return 1; + } + + return 0; +} + +/** + * Convert express list (\a expr_list) to an array of all matched values + * + * \retval N N is total number of all matched values + * \retval 0 if expression list is empty + * \retval < 0 for failure + */ +int +cfs_expr_list_values(struct cfs_expr_list *expr_list, int max, __u32 **valpp) +{ + struct cfs_range_expr *expr; + __u32 *val; + int count = 0; + int i; + + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + for (i = expr->re_lo; i <= expr->re_hi; i++) { + if (((i - expr->re_lo) % expr->re_stride) == 0) + count++; + } + } + + if (count == 0) /* empty expression list */ + return 0; + + if (count > max) + return -EINVAL; + + val = calloc(sizeof(val[0]), count); + if (val == NULL) + return -ENOMEM; + + count = 0; + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + for (i = expr->re_lo; i <= expr->re_hi; i++) { + if (((i - expr->re_lo) % expr->re_stride) == 0) + val[count++] = i; + } + } + + *valpp = val; + return count; +} + +void +cfs_expr_list_values_free(__u32 *values, int num) +{ + /* This array is allocated by LIBCFS_ALLOC(), so it shouldn't be freed + * by OBD_FREE() if it's called by module other than libcfs & LNet, + * otherwise we will see fake memory leak */ + free(values); +} + +/** + * Frees cfs_range_expr structures of \a expr_list. + * + * \retval none + */ +void +cfs_expr_list_free(struct cfs_expr_list *expr_list) +{ + while (!list_empty(&expr_list->el_exprs)) { + struct cfs_range_expr *expr; + + expr = list_entry(expr_list->el_exprs.next, + struct cfs_range_expr, re_link); + list_del(&expr->re_link); + free(expr); + } + + free(expr_list); +} + +/** + * Parses \ token of the syntax. 
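+ *
+ * For example (illustrative), "[1-32/2,128]" parses into two range
+ * expressions (1-32 with stride 2, and the single value 128), while a
+ * bare "7" or "*" yields a list with a single expression.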
+ *
+ * \retval 0 if \a str parses to \<number\> | \<expr_list\>
+ * \retval -errno otherwise
+ */
+int
+cfs_expr_list_parse(char *str, int len, unsigned min, unsigned max,
+		    struct cfs_expr_list **elpp)
+{
+	struct cfs_expr_list *expr_list;
+	struct cfs_range_expr *expr;
+	struct cfs_lstr src;
+	int rc;
+
+	expr_list = calloc(1, sizeof(*expr_list));
+	if (expr_list == NULL)
+		return -ENOMEM;
+
+	src.ls_str = str;
+	src.ls_len = len;
+
+	INIT_LIST_HEAD(&expr_list->el_exprs);
+
+	if (src.ls_str[0] == '[' &&
+	    src.ls_str[src.ls_len - 1] == ']') {
+		src.ls_str++;
+		src.ls_len -= 2;
+
+		rc = -EINVAL;
+		while (src.ls_str != NULL) {
+			struct cfs_lstr tok;
+
+			if (!cfs_gettok(&src, ',', &tok)) {
+				rc = -EINVAL;
+				break;
+			}
+
+			rc = cfs_range_expr_parse(&tok, min, max, 1, &expr);
+			if (rc != 0)
+				break;
+
+			list_add_tail(&expr->re_link,
+				      &expr_list->el_exprs);
+		}
+	} else {
+		rc = cfs_range_expr_parse(&src, min, max, 0, &expr);
+		if (rc == 0) {
+			list_add_tail(&expr->re_link,
+				      &expr_list->el_exprs);
+		}
+	}
+
+	if (rc != 0)
+		cfs_expr_list_free(expr_list);
+	else
+		*elpp = expr_list;
+
+	return rc;
+}
+
+/**
+ * Frees cfs_expr_list structures of \a list.
+ *
+ * For each struct cfs_expr_list structure found on \a list it frees
+ * range_expr list attached to it and frees the cfs_expr_list itself.
+ *
+ * \retval none
+ */
+void
+cfs_expr_list_free_list(struct list_head *list)
+{
+	struct cfs_expr_list *el;
+
+	while (!list_empty(list)) {
+		el = list_entry(list->next,
+				struct cfs_expr_list, el_link);
+		list_del(&el->el_link);
+		cfs_expr_list_free(el);
+	}
+}
+
+/**
+ * cfs_abs_path() - Get the absolute path of a relative path
+ * @request_path: The relative path to be resolved
+ * @resolved_path: Set to the resolved absolute path
+ *
+ * Returns the canonicalized absolute pathname. This function is a wrapper
+ * around realpath, but will work even if the target file does not exist.
+ * All directories in the path must exist.
+ *
+ * Return: On success, 0 is returned and resolved_path points to an allocated
+ * string containing the absolute pathname. On error, errno is set
+ * appropriately, -errno is returned, and resolved_path points to NULL.
+ */
+int cfs_abs_path(const char *request_path, char **resolved_path)
+{
+	char buf[PATH_MAX + 1] = "";
+	char *path;
+	char *ptr;
+	int len;
+	int rc = 0;
+	const char *fmt;
+
+	path = malloc(sizeof(buf));
+	if (path == NULL)
+		return -ENOMEM;
+
+	if (request_path[0] != '/') {
+		if (getcwd(path, sizeof(buf) - 1) == NULL) {
+			rc = -errno;
+			goto out;
+		}
+		len = snprintf(buf, sizeof(buf), "%s/%s", path, request_path);
+		if (len >= sizeof(buf)) {
+			rc = -ENAMETOOLONG;
+			goto out;
+		}
+	} else {
+		/* skip duplicate leading '/' */
+		len = snprintf(buf, sizeof(buf), "%s",
+			       request_path + strspn(request_path, "/") - 1);
+		if (len >= sizeof(buf)) {
+			rc = -ENAMETOOLONG;
+			goto out;
+		}
+	}
+
+	/* if filename not in root directory, call realpath for parent path */
+	ptr = strrchr(buf, '/');
+	if (ptr != buf) {
+		*ptr = '\0';
+		if (path != realpath(buf, path)) {
+			rc = -errno;
+			goto out;
+		}
+		/* add the filename back */
+		len = strlen(path);
+		fmt = (path[len - 1] == '/') ?
"%s" : "/%s"; + len = snprintf(path + len, sizeof(buf) - len, fmt, ptr + 1); + if (len >= sizeof(buf) - len) { + rc = -ENAMETOOLONG; + goto out; + } + } else { + len = snprintf(path, sizeof(buf), "%s", buf); + if (len >= sizeof(buf)) { + rc = -ENAMETOOLONG; + goto out; + } + } + +out: + if (rc == 0) { + *resolved_path = path; + } else { + *resolved_path = NULL; + free(path); + } + return rc; +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/workitem.c b/drivers/staging/lustrefsx/libcfs/libcfs/workitem.c new file mode 100644 index 0000000000000..d2b9eb4f871ea --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/workitem.c @@ -0,0 +1,462 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * libcfs/libcfs/workitem.c + * + * Author: Isaac Huang + * Liang Zhen + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include + +#define CFS_WS_NAME_LEN 16 + +struct cfs_wi_sched { + struct list_head ws_list; /* chain on global list */ + /** serialised workitems */ + spinlock_t ws_lock; + /** where schedulers sleep */ + wait_queue_head_t ws_waitq; + /** concurrent workitems */ + struct list_head ws_runq; + /** rescheduled running-workitems, a workitem can be rescheduled + * while running in wi_action(), but we don't to execute it again + * unless it returns from wi_action(), so we put it on ws_rerunq + * while rescheduling, and move it to runq after it returns + * from wi_action() */ + struct list_head ws_rerunq; + /** CPT-table for this scheduler */ + struct cfs_cpt_table *ws_cptab; + /** CPT id for affinity */ + int ws_cpt; + /** number of scheduled workitems */ + int ws_nscheduled; + /** started scheduler thread, protected by cfs_wi_data::wi_glock */ + unsigned int ws_nthreads:30; + /** shutting down, protected by cfs_wi_data::wi_glock */ + unsigned int ws_stopping:1; + /** serialize starting thread, protected by cfs_wi_data::wi_glock */ + unsigned int ws_starting:1; + /** scheduler name */ + char ws_name[CFS_WS_NAME_LEN]; +}; + +static struct cfs_workitem_data { + /** serialize */ + spinlock_t wi_glock; + /** list of all schedulers */ + struct list_head wi_scheds; + /** WI module is initialized */ + int wi_init; + /** shutting down the whole WI module */ + int wi_stopping; +} cfs_wi_data; + +static inline int +cfs_wi_sched_cansleep(struct cfs_wi_sched *sched) +{ + spin_lock(&sched->ws_lock); + if (sched->ws_stopping) { + spin_unlock(&sched->ws_lock); + return 0; + } + + if (!list_empty(&sched->ws_runq)) { + spin_unlock(&sched->ws_lock); + return 0; + } + 
spin_unlock(&sched->ws_lock); + return 1; +} + +/* XXX: + * 0. it only works when called from wi->wi_action. + * 1. when it returns no one shall try to schedule the workitem. + */ +void +cfs_wi_exit(struct cfs_wi_sched *sched, struct cfs_workitem *wi) +{ + LASSERT(!in_interrupt()); /* because we use plain spinlock */ + LASSERT(!sched->ws_stopping); + + spin_lock(&sched->ws_lock); + + LASSERT(wi->wi_running); + + if (wi->wi_scheduled) { /* cancel pending schedules */ + LASSERT(!list_empty(&wi->wi_list)); + list_del_init(&wi->wi_list); + + LASSERT(sched->ws_nscheduled > 0); + sched->ws_nscheduled--; + } + + LASSERT(list_empty(&wi->wi_list)); + + wi->wi_scheduled = 1; /* LBUG future schedule attempts */ + spin_unlock(&sched->ws_lock); +} +EXPORT_SYMBOL(cfs_wi_exit); + +/** + * cancel schedule request of workitem \a wi + */ +int +cfs_wi_deschedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi) +{ + int rc; + + LASSERT(!in_interrupt()); /* because we use plain spinlock */ + LASSERT(!sched->ws_stopping); + + /* + * return 0 if it's running already, otherwise return 1, which + * means the workitem will not be scheduled and will not have + * any race with wi_action. + */ + spin_lock(&sched->ws_lock); + + rc = !(wi->wi_running); + + if (wi->wi_scheduled) { /* cancel pending schedules */ + LASSERT(!list_empty(&wi->wi_list)); + list_del_init(&wi->wi_list); + + LASSERT(sched->ws_nscheduled > 0); + sched->ws_nscheduled--; + + wi->wi_scheduled = 0; + } + + LASSERT (list_empty(&wi->wi_list)); + + spin_unlock(&sched->ws_lock); + return rc; +} +EXPORT_SYMBOL(cfs_wi_deschedule); + +/* + * Workitem scheduled with (serial == 1) is strictly serialised not only with + * itself, but also with others scheduled this way. + * + * Now there's only one static serialised queue, but in the future more might + * be added, and even dynamic creation of serialised queues might be supported. + */ +void +cfs_wi_schedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi) +{ + LASSERT(!in_interrupt()); /* because we use plain spinlock */ + LASSERT(!sched->ws_stopping); + + spin_lock(&sched->ws_lock); + + if (!wi->wi_scheduled) { + LASSERT (list_empty(&wi->wi_list)); + + wi->wi_scheduled = 1; + sched->ws_nscheduled++; + if (!wi->wi_running) { + list_add_tail(&wi->wi_list, &sched->ws_runq); + wake_up(&sched->ws_waitq); + } else { + list_add(&wi->wi_list, &sched->ws_rerunq); + } + } + + LASSERT (!list_empty(&wi->wi_list)); + spin_unlock(&sched->ws_lock); +} +EXPORT_SYMBOL(cfs_wi_schedule); + +static int +cfs_wi_scheduler(void *arg) +{ + struct cfs_wi_sched *sched = (struct cfs_wi_sched *)arg; + + /* CPT affinity scheduler? 
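+	 * If so, bind this worker thread to its CPU partition before it
+	 * starts pulling workitems off the run queue.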
*/ + if (sched->ws_cptab != NULL) + if (cfs_cpt_bind(sched->ws_cptab, sched->ws_cpt) != 0) + CWARN("Unable to bind %s on CPU partition %d\n", + sched->ws_name, sched->ws_cpt); + + spin_lock(&cfs_wi_data.wi_glock); + + LASSERT(sched->ws_starting == 1); + sched->ws_starting--; + sched->ws_nthreads++; + + spin_unlock(&cfs_wi_data.wi_glock); + + spin_lock(&sched->ws_lock); + + while (!sched->ws_stopping) { + int nloops = 0; + int rc; + struct cfs_workitem *wi; + + while (!list_empty(&sched->ws_runq) && + nloops < CFS_WI_RESCHED) { + wi = list_entry(sched->ws_runq.next, + struct cfs_workitem, wi_list); + LASSERT(wi->wi_scheduled && !wi->wi_running); + + list_del_init(&wi->wi_list); + + LASSERT(sched->ws_nscheduled > 0); + sched->ws_nscheduled--; + + wi->wi_running = 1; + wi->wi_scheduled = 0; + + spin_unlock(&sched->ws_lock); + nloops++; + + rc = (*wi->wi_action) (wi); + + spin_lock(&sched->ws_lock); + if (rc != 0) /* WI should be dead, even be freed! */ + continue; + + wi->wi_running = 0; + if (list_empty(&wi->wi_list)) + continue; + + LASSERT(wi->wi_scheduled); + /* wi is rescheduled, should be on rerunq now, we + * move it to runq so it can run action now */ + list_move_tail(&wi->wi_list, &sched->ws_runq); + } + + if (!list_empty(&sched->ws_runq)) { + spin_unlock(&sched->ws_lock); + /* don't sleep because some workitems still + * expect me to come back soon */ + cond_resched(); + spin_lock(&sched->ws_lock); + continue; + } + + spin_unlock(&sched->ws_lock); + rc = wait_event_interruptible_exclusive(sched->ws_waitq, + !cfs_wi_sched_cansleep(sched)); + spin_lock(&sched->ws_lock); + } + + spin_unlock(&sched->ws_lock); + + spin_lock(&cfs_wi_data.wi_glock); + sched->ws_nthreads--; + spin_unlock(&cfs_wi_data.wi_glock); + + return 0; +} + +void +cfs_wi_sched_destroy(struct cfs_wi_sched *sched) +{ + LASSERT(cfs_wi_data.wi_init); + LASSERT(!cfs_wi_data.wi_stopping); + + spin_lock(&cfs_wi_data.wi_glock); + if (sched->ws_stopping) { + CDEBUG(D_INFO, "%s is in progress of stopping\n", + sched->ws_name); + spin_unlock(&cfs_wi_data.wi_glock); + return; + } + + LASSERT(!list_empty(&sched->ws_list)); + sched->ws_stopping = 1; + + spin_unlock(&cfs_wi_data.wi_glock); + + wake_up_all(&sched->ws_waitq); + + spin_lock(&cfs_wi_data.wi_glock); + { + int i = 2; + + while (sched->ws_nthreads > 0) { + CDEBUG(is_power_of_2(++i / 20) ? 
D_WARNING : D_NET, + "waiting %us for %d %s worker threads to exit\n", + i / 20, sched->ws_nthreads, sched->ws_name); + + spin_unlock(&cfs_wi_data.wi_glock); + schedule_timeout_uninterruptible(cfs_time_seconds(1) + / 20); + spin_lock(&cfs_wi_data.wi_glock); + } + } + + list_del(&sched->ws_list); + + spin_unlock(&cfs_wi_data.wi_glock); + + LASSERT(sched->ws_nscheduled == 0); + + LIBCFS_FREE(sched, sizeof(*sched)); +} +EXPORT_SYMBOL(cfs_wi_sched_destroy); + +int +cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab, + int cpt, int nthrs, struct cfs_wi_sched **sched_pp) +{ + struct cfs_wi_sched *sched; + + LASSERT(cfs_wi_data.wi_init); + LASSERT(!cfs_wi_data.wi_stopping); + LASSERT(cptab == NULL || cpt == CFS_CPT_ANY || + (cpt >= 0 && cpt < cfs_cpt_number(cptab))); + + LIBCFS_ALLOC(sched, sizeof(*sched)); + if (sched == NULL) + return -ENOMEM; + + if (strlen(name) > sizeof(sched->ws_name)-1) { + LIBCFS_FREE(sched, sizeof(*sched)); + return -E2BIG; + } + strlcpy(sched->ws_name, name, sizeof(sched->ws_name)); + + sched->ws_cptab = cptab; + sched->ws_cpt = cpt; + + spin_lock_init(&sched->ws_lock); + init_waitqueue_head(&sched->ws_waitq); + + INIT_LIST_HEAD(&sched->ws_runq); + INIT_LIST_HEAD(&sched->ws_rerunq); + INIT_LIST_HEAD(&sched->ws_list); + + for (; nthrs > 0; nthrs--) { + char name[16]; + struct task_struct *task; + + spin_lock(&cfs_wi_data.wi_glock); + while (sched->ws_starting > 0) { + spin_unlock(&cfs_wi_data.wi_glock); + schedule(); + spin_lock(&cfs_wi_data.wi_glock); + } + + sched->ws_starting++; + spin_unlock(&cfs_wi_data.wi_glock); + + if (sched->ws_cptab != NULL && sched->ws_cpt >= 0) { + snprintf(name, sizeof(name), "%s_%02d_%02d", + sched->ws_name, sched->ws_cpt, + sched->ws_nthreads); + } else { + snprintf(name, sizeof(name), "%s_%02d", + sched->ws_name, sched->ws_nthreads); + } + + task = kthread_run(cfs_wi_scheduler, sched, "%s", name); + if (IS_ERR(task)) { + int rc = PTR_ERR(task); + + CERROR("Failed to create thread for " + "WI scheduler %s: %d\n", name, rc); + + spin_lock(&cfs_wi_data.wi_glock); + + /* make up for cfs_wi_sched_destroy */ + list_add(&sched->ws_list, &cfs_wi_data.wi_scheds); + sched->ws_starting--; + + spin_unlock(&cfs_wi_data.wi_glock); + + cfs_wi_sched_destroy(sched); + return rc; + } + } + + spin_lock(&cfs_wi_data.wi_glock); + list_add(&sched->ws_list, &cfs_wi_data.wi_scheds); + spin_unlock(&cfs_wi_data.wi_glock); + + *sched_pp = sched; + return 0; +} +EXPORT_SYMBOL(cfs_wi_sched_create); + +int +cfs_wi_startup(void) +{ + memset(&cfs_wi_data, 0, sizeof(struct cfs_workitem_data)); + + spin_lock_init(&cfs_wi_data.wi_glock); + INIT_LIST_HEAD(&cfs_wi_data.wi_scheds); + cfs_wi_data.wi_init = 1; + + return 0; +} + +void +cfs_wi_shutdown (void) +{ + struct cfs_wi_sched *sched; + + spin_lock(&cfs_wi_data.wi_glock); + cfs_wi_data.wi_stopping = 1; + spin_unlock(&cfs_wi_data.wi_glock); + + /* nobody should contend on this list */ + list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) { + sched->ws_stopping = 1; + wake_up_all(&sched->ws_waitq); + } + + list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) { + spin_lock(&cfs_wi_data.wi_glock); + + while (sched->ws_nthreads != 0) { + spin_unlock(&cfs_wi_data.wi_glock); + schedule_timeout_uninterruptible(cfs_time_seconds(1) + / 20); + spin_lock(&cfs_wi_data.wi_glock); + } + spin_unlock(&cfs_wi_data.wi_glock); + } + + while (!list_empty(&cfs_wi_data.wi_scheds)) { + sched = list_entry(cfs_wi_data.wi_scheds.next, + struct cfs_wi_sched, ws_list); + list_del(&sched->ws_list); + LIBCFS_FREE(sched, 
sizeof(*sched)); + } + + cfs_wi_data.wi_stopping = 0; + cfs_wi_data.wi_init = 0; +} diff --git a/drivers/staging/lustrefsx/lnet/LICENSE b/drivers/staging/lustrefsx/lnet/LICENSE new file mode 100644 index 0000000000000..92728f4d300d2 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/LICENSE @@ -0,0 +1,363 @@ +Each file in this distribution should contain a header stating the +copyright owner(s), and the licensing terms for that module. Some +files are not eligible for copyright protection, and contain neither. + +All files in this subtree are licensed under the terms and conditions +of the GNU General Public License version 2. + +Reproduced below is the GPL v2, and Linus's clarifying statement from +the Linux kernel source code: + +---------------------------------------- + + NOTE! This copyright does *not* cover user programs that use kernel + services by normal system calls - this is merely considered normal use + of the kernel, and does *not* fall under the heading of "derived work". + Also note that the GPL below is copyrighted by the Free Software + Foundation, but the instance of code that it refers to (the Linux + kernel) is copyrighted by me and others who actually wrote it. + + Linus Torvalds + +---------------------------------------- + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. 
If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. 
(Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. 
Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. 
The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) 19yy + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) 19yy name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/api.h b/drivers/staging/lustrefsx/lnet/include/lnet/api.h new file mode 100644 index 0000000000000..89be9c68e003e --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/lnet/api.h @@ -0,0 +1,171 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2016, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef __LNET_API_H__ +#define __LNET_API_H__ + +/** \defgroup lnet LNet + * + * The Lustre Networking subsystem. + * + * LNet is an asynchronous message-passing API, which provides an unreliable + * connectionless service that can't guarantee any order. It supports OFA IB, + * TCP/IP, and Cray Portals, and routes between heterogeneous networks. + * @{ + */ + +#ifndef __KERNEL__ +# error This include is only for kernel use. 
+#endif + +#include + +/** \defgroup lnet_init_fini Initialization and cleanup + * The LNet must be properly initialized before any LNet calls can be made. + * @{ */ +int LNetNIInit(lnet_pid_t requested_pid); +int LNetNIFini(void); +/** @} lnet_init_fini */ + +/** \defgroup lnet_addr LNet addressing and basic types + * + * Addressing scheme and basic data types of LNet. + * + * The LNet API is memory-oriented, so LNet must be able to address not only + * end-points but also memory region within a process address space. + * An ::lnet_nid_t addresses an end-point. An ::lnet_pid_t identifies a process + * in a node. A portal represents an opening in the address space of a + * process. Match bits is criteria to identify a region of memory inside a + * portal, and offset specifies an offset within the memory region. + * + * LNet creates a table of portals for each process during initialization. + * This table has MAX_PORTALS entries and its size can't be dynamically + * changed. A portal stays empty until the owning process starts to add + * memory regions to it. A portal is sometimes called an index because + * it's an entry in the portals table of a process. + * + * \see LNetMEAttach + * @{ */ +int LNetGetId(unsigned int index, struct lnet_processid *id); +int LNetDist(lnet_nid_t nid, lnet_nid_t *srcnid, __u32 *order); +lnet_nid_t LNetPrimaryNID(lnet_nid_t nid); +bool LNetIsPeerLocal(lnet_nid_t nid); + +/** @} lnet_addr */ + + +/** \defgroup lnet_me Match entries + * + * A match entry (abbreviated as ME) describes a set of criteria to accept + * incoming requests. + * + * A portal is essentially a match list plus a set of attributes. A match + * list is a chain of MEs. Each ME includes a pointer to a memory descriptor + * and a set of match criteria. The match criteria can be used to reject + * incoming requests based on process ID or the match bits provided in the + * request. MEs can be dynamically inserted into a match list by LNetMEAttach(), + * and must then be attached to an MD with LNetMDAttach(). + * @{ */ +struct lnet_me * +LNetMEAttach(unsigned int portal, + struct lnet_processid *match_id_in, + __u64 match_bits_in, + __u64 ignore_bits_in, + enum lnet_unlink unlink_in, + enum lnet_ins_pos pos_in); +/** @} lnet_me */ + +/** \defgroup lnet_md Memory descriptors + * + * A memory descriptor contains information about a region of a user's + * memory (either in kernel or user space) and optionally points to an + * event queue where information about the operations performed on the + * memory descriptor are recorded. Memory descriptor is abbreviated as + * MD and can be used interchangeably with the memory region it describes. + * + * The LNet API provides two operations to create MDs: LNetMDAttach() + * and LNetMDBind(); one operation to unlink and release the resources + * associated with a MD: LNetMDUnlink(). + * @{ */ +int LNetMDAttach(struct lnet_me *current_in, + const struct lnet_md *md_in, + enum lnet_unlink unlink_in, + struct lnet_handle_md *md_handle_out); + +int LNetMDBind(const struct lnet_md *md_in, + enum lnet_unlink unlink_in, + struct lnet_handle_md *md_handle_out); + +int __LNetMDUnlink(struct lnet_handle_md md_in, bool discard); +#define LNetMDUnlink(handle) __LNetMDUnlink(handle, false) + +void lnet_assert_handler_unused(lnet_handler_t handler); +/** @} lnet_md */ + +/** \defgroup lnet_data Data movement operations + * + * The LNet API provides two data movement operations: LNetPut() + * and LNetGet(). 
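+ * Both are asynchronous: completion is reported by events delivered
+ * to the handler attached to the memory descriptor passed as \a md_in.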
+ * @{ */ +int LNetPut(lnet_nid_t self, + struct lnet_handle_md md_in, + enum lnet_ack_req ack_req_in, + struct lnet_process_id target_in, + unsigned int portal_in, + __u64 match_bits_in, + unsigned int offset_in, + __u64 hdr_data_in); + +int LNetGet(lnet_nid_t self, + struct lnet_handle_md md_in, + struct lnet_process_id target_in, + unsigned int portal_in, + __u64 match_bits_in, + unsigned int offset_in, + bool recovery); +/** @} lnet_data */ + + +/** \defgroup lnet_misc Miscellaneous operations. + * Miscellaneous operations. + * @{ */ + +int LNetSetLazyPortal(int portal); +int LNetClearLazyPortal(int portal); +int LNetCtl(unsigned int cmd, void *arg); +void LNetDebugPeer(struct lnet_processid *id); +int LNetGetPeerDiscoveryStatus(void); +int LNetAddPeer(lnet_nid_t *nids, __u32 num_nids); + +/** @} lnet_misc */ + +/** @} lnet */ +#endif diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h b/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h new file mode 100644 index 0000000000000..223c6d328bf26 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h @@ -0,0 +1,1243 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/include/lnet/lib-lnet.h + * + * Top level include for library side routines + */ + +#ifndef __LNET_LIB_LNET_H__ +#define __LNET_LIB_LNET_H__ + +/* LNET has 0xeXXX */ +#define CFS_FAIL_PTLRPC_OST_BULK_CB2 0xe000 + +#include + +#include +#include +#include +#include +#include +#include +#include + +extern struct lnet the_lnet; /* THE network */ + +#if (BITS_PER_LONG == 32) +/* 2 CPTs, allowing more CPTs might make us under memory pressure */ +# define LNET_CPT_MAX_BITS 1 + +#else /* 64-bit system */ +/* + * 256 CPTs for thousands of CPUs, allowing more CPTs might make us + * under risk of consuming all lh_cookie. 
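+ * (a cookie embeds the CPT number, so each additional CPT bit halves
+ * the cookie space available to every CPT).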
+ */ +# define LNET_CPT_MAX_BITS 8 +#endif /* BITS_PER_LONG == 32 */ + +/* max allowed CPT number */ +#define LNET_CPT_MAX (1 << LNET_CPT_MAX_BITS) + +#define LNET_CPT_NUMBER (the_lnet.ln_cpt_number) +#define LNET_CPT_BITS (the_lnet.ln_cpt_bits) +#define LNET_CPT_MASK ((1ULL << LNET_CPT_BITS) - 1) + +/** exclusive lock */ +#define LNET_LOCK_EX CFS_PERCPT_LOCK_EX + +/* default timeout and credits */ +#define DEFAULT_PEER_TIMEOUT 180 +#define DEFAULT_PEER_CREDITS 8 +#define DEFAULT_CREDITS 256 + +/* default number of connections per peer */ +#define DEFAULT_CONNS_PER_PEER 0 + +#ifdef HAVE_KERN_SOCK_GETNAME_2ARGS +#define lnet_kernel_getpeername(sock, addr, addrlen) \ + kernel_getpeername(sock, addr) +#define lnet_kernel_getsockname(sock, addr, addrlen) \ + kernel_getsockname(sock, addr) +#else +#define lnet_kernel_getpeername(sock, addr, addrlen) \ + kernel_getpeername(sock, addr, addrlen) +#define lnet_kernel_getsockname(sock, addr, addrlen) \ + kernel_getsockname(sock, addr, addrlen) +#endif + +/* + * kernel 5.3: commit ef11db3310e272d3d8dbe8739e0770820dd20e52 + * kernel 4.18.0-193.el8: + * added in_dev_for_each_ifa_rtnl and in_dev_for_each_ifa_rcu + * and removed for_ifa and endfor_ifa. + * Use the _rntl variant as the current locking is rtnl. + */ +#ifdef HAVE_IN_DEV_FOR_EACH_IFA_RTNL +#define DECLARE_CONST_IN_IFADDR(ifa) const struct in_ifaddr *ifa +#define endfor_ifa(in_dev) +#else +#define DECLARE_CONST_IN_IFADDR(ifa) +#define in_dev_for_each_ifa_rtnl(ifa, in_dev) for_ifa((in_dev)) +#define in_dev_for_each_ifa_rcu(ifa, in_dev) for_ifa((in_dev)) +#endif + +#ifndef fallthrough +# if defined(__GNUC__) && __GNUC__ >= 7 +# define fallthrough __attribute__((fallthrough)) /* fallthrough */ +# else +# define fallthrough do {} while (0) /* fallthrough */ +# endif +#endif + +int choose_ipv4_src(__u32 *ret, + int interface, __u32 dst_ipaddr, struct net *ns); + +bool lnet_is_route_alive(struct lnet_route *route); +bool lnet_is_gateway_alive(struct lnet_peer *gw); + +static inline int lnet_is_wire_handle_none(struct lnet_handle_wire *wh) +{ + return (wh->wh_interface_cookie == LNET_WIRE_HANDLE_COOKIE_NONE && + wh->wh_object_cookie == LNET_WIRE_HANDLE_COOKIE_NONE); +} + +static inline int lnet_md_exhausted(struct lnet_libmd *md) +{ + return (md->md_threshold == 0 || + ((md->md_options & LNET_MD_MAX_SIZE) != 0 && + md->md_offset + md->md_max_size > md->md_length)); +} + +static inline int lnet_md_unlinkable(struct lnet_libmd *md) +{ + /* Should unlink md when its refcount is 0 and either: + * - md has been flagged for deletion (by auto unlink or LNetM[DE]Unlink, + * in the latter case md may not be exhausted). + * - auto unlink is on and md is exhausted. + */ + if (md->md_refcount != 0) + return 0; + + if ((md->md_flags & LNET_MD_FLAG_ZOMBIE) != 0) + return 1; + + return ((md->md_flags & LNET_MD_FLAG_AUTO_UNLINK) != 0 && + lnet_md_exhausted(md)); +} + +#define lnet_cpt_table() (the_lnet.ln_cpt_table) +#define lnet_cpt_current() cfs_cpt_current(the_lnet.ln_cpt_table, 1) + +static inline int +lnet_cpt_of_cookie(__u64 cookie) +{ + unsigned int cpt = (cookie >> LNET_COOKIE_TYPE_BITS) & LNET_CPT_MASK; + + /* LNET_CPT_NUMBER doesn't have to be power2, which means we can + * get illegal cpt from it's invalid cookie */ + return cpt < LNET_CPT_NUMBER ? 
cpt : cpt % LNET_CPT_NUMBER; +} + +static inline void +lnet_res_lock(int cpt) +{ + cfs_percpt_lock(the_lnet.ln_res_lock, cpt); +} + +static inline void +lnet_res_unlock(int cpt) +{ + cfs_percpt_unlock(the_lnet.ln_res_lock, cpt); +} + +static inline int +lnet_res_lock_current(void) +{ + int cpt = lnet_cpt_current(); + + lnet_res_lock(cpt); + return cpt; +} + +static inline void +lnet_net_lock(int cpt) +{ + cfs_percpt_lock(the_lnet.ln_net_lock, cpt); +} + +static inline void +lnet_net_unlock(int cpt) +{ + cfs_percpt_unlock(the_lnet.ln_net_lock, cpt); +} + +static inline int +lnet_net_lock_current(void) +{ + int cpt = lnet_cpt_current(); + + lnet_net_lock(cpt); + return cpt; +} + +#define LNET_LOCK() lnet_net_lock(LNET_LOCK_EX) +#define LNET_UNLOCK() lnet_net_unlock(LNET_LOCK_EX) + +#define lnet_ptl_lock(ptl) spin_lock(&(ptl)->ptl_lock) +#define lnet_ptl_unlock(ptl) spin_unlock(&(ptl)->ptl_lock) +#define lnet_ni_lock(ni) spin_lock(&(ni)->ni_lock) +#define lnet_ni_unlock(ni) spin_unlock(&(ni)->ni_lock) + +#define MAX_PORTALS 64 + +#define LNET_SMALL_MD_SIZE offsetof(struct lnet_libmd, md_kiov[1]) +extern struct kmem_cache *lnet_mes_cachep; /* MEs kmem_cache */ +extern struct kmem_cache *lnet_small_mds_cachep; /* <= LNET_SMALL_MD_SIZE bytes + * MDs kmem_cache */ +extern struct kmem_cache *lnet_udsp_cachep; +extern struct kmem_cache *lnet_rspt_cachep; +extern struct kmem_cache *lnet_msg_cachep; + +static inline bool +lnet_ni_set_status_locked(struct lnet_ni *ni, __u32 status) +__must_hold(&ni->ni_lock) +{ + bool update = false; + + if (ni->ni_status && ni->ni_status->ns_status != status) { + CDEBUG(D_NET, "ni %s status changed from %#x to %#x\n", + libcfs_nidstr(&ni->ni_nid), + ni->ni_status->ns_status, status); + ni->ni_status->ns_status = status; + update = true; + } + + return update; +} + +static inline unsigned int +lnet_ni_get_status_locked(struct lnet_ni *ni) +__must_hold(&ni->ni_lock) +{ + if (nid_is_lo0(&ni->ni_nid)) + return LNET_NI_STATUS_UP; + else if (atomic_read(&ni->ni_fatal_error_on)) + return LNET_NI_STATUS_DOWN; + else if (ni->ni_status) + return ni->ni_status->ns_status; + else + return LNET_NI_STATUS_UP; +} + +static inline bool +lnet_ni_set_status(struct lnet_ni *ni, __u32 status) +{ + bool update; + + lnet_ni_lock(ni); + update = lnet_ni_set_status_locked(ni, status); + lnet_ni_unlock(ni); + + return update; +} + +static inline void lnet_md_wait_handling(struct lnet_libmd *md, int cpt) +{ + wait_queue_head_t *wq = __var_waitqueue(md); +#if defined(HAVE_WAIT_BIT_QUEUE_ENTRY) || !defined(HAVE_WAIT_VAR_EVENT) + struct wait_bit_queue_entry entry; + wait_queue_entry_t *wqe = &entry.wq_entry; +#else + struct wait_bit_queue entry; + wait_queue_entry_t *wqe = &entry.wait; +#endif + init_wait_var_entry(&entry, md, 0); + prepare_to_wait_event(wq, wqe, TASK_IDLE); + if (md->md_flags & LNET_MD_FLAG_HANDLING) { + /* Race with unlocked call to ->md_handler. + * It is safe to drop the res_lock here as the + * caller has only just claimed it. + */ + lnet_res_unlock(cpt); + schedule(); + /* Cannot check md now, it might be freed. Caller + * must reclaim reference and check. 
+ */ + lnet_res_lock(cpt); + } + finish_wait(wq, wqe); +} + +static inline void +lnet_md_free(struct lnet_libmd *md) +{ + unsigned int size; + + LASSERTF(md->md_rspt_ptr == NULL, "md %p rsp %p\n", md, md->md_rspt_ptr); + + size = offsetof(struct lnet_libmd, md_kiov[md->md_niov]); + + if (size <= LNET_SMALL_MD_SIZE) { + CDEBUG(D_MALLOC, "slab-freed 'md' at %p.\n", md); + kmem_cache_free(lnet_small_mds_cachep, md); + } else { + LIBCFS_FREE(md, size); + } +} + +struct lnet_libhandle *lnet_res_lh_lookup(struct lnet_res_container *rec, + __u64 cookie); +void lnet_res_lh_initialize(struct lnet_res_container *rec, + struct lnet_libhandle *lh); +static inline void +lnet_res_lh_invalidate(struct lnet_libhandle *lh) +{ + /* ALWAYS called with resource lock held */ + /* NB: cookie is still useful, don't reset it */ + list_del(&lh->lh_hash_chain); +} + +static inline void +lnet_md2handle(struct lnet_handle_md *handle, struct lnet_libmd *md) +{ + handle->cookie = md->md_lh.lh_cookie; +} + +static inline struct lnet_libmd * +lnet_handle2md(struct lnet_handle_md *handle) +{ + /* ALWAYS called with resource lock held */ + struct lnet_libhandle *lh; + int cpt; + + cpt = lnet_cpt_of_cookie(handle->cookie); + lh = lnet_res_lh_lookup(the_lnet.ln_md_containers[cpt], + handle->cookie); + if (lh == NULL) + return NULL; + + return lh_entry(lh, struct lnet_libmd, md_lh); +} + +static inline struct lnet_libmd * +lnet_wire_handle2md(struct lnet_handle_wire *wh) +{ + /* ALWAYS called with resource lock held */ + struct lnet_libhandle *lh; + int cpt; + + if (wh->wh_interface_cookie != the_lnet.ln_interface_cookie) + return NULL; + + cpt = lnet_cpt_of_cookie(wh->wh_object_cookie); + lh = lnet_res_lh_lookup(the_lnet.ln_md_containers[cpt], + wh->wh_object_cookie); + if (lh == NULL) + return NULL; + + return lh_entry(lh, struct lnet_libmd, md_lh); +} + +static inline void +lnet_peer_net_addref_locked(struct lnet_peer_net *lpn) +{ + atomic_inc(&lpn->lpn_refcount); +} + +extern void lnet_destroy_peer_net_locked(struct lnet_peer_net *lpn); + +static inline void +lnet_peer_net_decref_locked(struct lnet_peer_net *lpn) +{ + if (atomic_dec_and_test(&lpn->lpn_refcount)) + lnet_destroy_peer_net_locked(lpn); +} + +static inline void +lnet_peer_addref_locked(struct lnet_peer *lp) +{ + atomic_inc(&lp->lp_refcount); +} + +extern void lnet_destroy_peer_locked(struct lnet_peer *lp); + +static inline void +lnet_peer_decref_locked(struct lnet_peer *lp) +{ + if (atomic_dec_and_test(&lp->lp_refcount)) + lnet_destroy_peer_locked(lp); +} + +static inline void +lnet_peer_ni_addref_locked(struct lnet_peer_ni *lp) +{ + kref_get(&lp->lpni_kref); +} + +extern void lnet_destroy_peer_ni_locked(struct kref *ref); + +static inline void +lnet_peer_ni_decref_locked(struct lnet_peer_ni *lp) +{ + kref_put(&lp->lpni_kref, lnet_destroy_peer_ni_locked); +} + +static inline int +lnet_isrouter(struct lnet_peer_ni *lpni) +{ + return lpni->lpni_peer_net->lpn_peer->lp_rtr_refcount != 0; +} + +static inline void +lnet_ni_addref_locked(struct lnet_ni *ni, int cpt) +{ + LASSERT(cpt >= 0 && cpt < LNET_CPT_NUMBER); + LASSERT(*ni->ni_refs[cpt] >= 0); + + (*ni->ni_refs[cpt])++; +} + +static inline void +lnet_ni_addref(struct lnet_ni *ni) +{ + lnet_net_lock(0); + lnet_ni_addref_locked(ni, 0); + lnet_net_unlock(0); +} + +static inline void +lnet_ni_decref_locked(struct lnet_ni *ni, int cpt) +{ + LASSERT(cpt >= 0 && cpt < LNET_CPT_NUMBER); + LASSERT(*ni->ni_refs[cpt] > 0); + + (*ni->ni_refs[cpt])--; +} + +static inline void +lnet_ni_decref(struct lnet_ni *ni) +{ + 
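+	/* drop the CPT 0 reference taken by lnet_ni_addref() */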
lnet_net_lock(0); + lnet_ni_decref_locked(ni, 0); + lnet_net_unlock(0); +} + +static inline struct lnet_msg * +lnet_msg_alloc(void) +{ + struct lnet_msg *msg; + + msg = kmem_cache_zalloc(lnet_msg_cachep, GFP_NOFS); + + return (msg); +} + +static inline void +lnet_msg_free(struct lnet_msg *msg) +{ + LASSERT(!msg->msg_onactivelist); + kmem_cache_free(lnet_msg_cachep, msg); +} + +static inline struct lnet_rsp_tracker * +lnet_rspt_alloc(int cpt) +{ + struct lnet_rsp_tracker *rspt; + + rspt = kmem_cache_zalloc(lnet_rspt_cachep, GFP_NOFS); + if (rspt) { + lnet_net_lock(cpt); + the_lnet.ln_counters[cpt]->lct_health.lch_rst_alloc++; + lnet_net_unlock(cpt); + } + CDEBUG(D_MALLOC, "rspt alloc %p\n", rspt); + return rspt; +} + +static inline void +lnet_rspt_free(struct lnet_rsp_tracker *rspt, int cpt) +{ + CDEBUG(D_MALLOC, "rspt free %p\n", rspt); + + kmem_cache_free(lnet_rspt_cachep, rspt); + lnet_net_lock(cpt); + the_lnet.ln_counters[cpt]->lct_health.lch_rst_alloc--; + lnet_net_unlock(cpt); +} + +void lnet_ni_free(struct lnet_ni *ni); +void lnet_net_free(struct lnet_net *net); + +struct lnet_net * +lnet_net_alloc(__u32 net_type, struct list_head *netlist); + +struct lnet_ni * +lnet_ni_alloc(struct lnet_net *net, struct cfs_expr_list *el, + char *iface); +struct lnet_ni * +lnet_ni_alloc_w_cpt_array(struct lnet_net *net, __u32 *cpts, __u32 ncpts, + char *iface); + +static inline int +lnet_nid2peerhash(struct lnet_nid *nid) +{ + u32 h = 0; + int i; + + for (i = 0; i < 4; i++) + h = hash_32(nid->nid_addr[i]^h, 32); + return hash_32(LNET_NID_NET(nid) ^ h, LNET_PEER_HASH_BITS); +} + +static inline struct list_head * +lnet_net2rnethash(__u32 net) +{ + return &the_lnet.ln_remote_nets_hash[(LNET_NETNUM(net) + + LNET_NETTYP(net)) & + ((1U << the_lnet.ln_remote_nets_hbits) - 1)]; +} + +static inline void lnet_hdr_from_nid4(struct lnet_hdr *hdr, + const struct lnet_hdr_nid4 *vhdr) +{ + const struct _lnet_hdr_nid4 *hdr_nid4 = (void *)vhdr; + + lnet_nid4_to_nid(le64_to_cpu(hdr_nid4->dest_nid), &hdr->dest_nid); + lnet_nid4_to_nid(le64_to_cpu(hdr_nid4->src_nid), &hdr->src_nid); + hdr->dest_pid = le32_to_cpu(hdr_nid4->dest_pid); + hdr->src_pid = le32_to_cpu(hdr_nid4->src_pid); + hdr->type = le32_to_cpu(hdr_nid4->type); + hdr->payload_length = le32_to_cpu(hdr_nid4->payload_length); + + hdr->msg = hdr_nid4->msg; +} + +static inline void lnet_hdr_to_nid4(const struct lnet_hdr *hdr, + struct lnet_hdr_nid4 *vhdr) +{ + struct _lnet_hdr_nid4 *hdr_nid4 = (void *)vhdr; + + hdr_nid4->dest_nid = cpu_to_le64(lnet_nid_to_nid4(&hdr->dest_nid)); + hdr_nid4->src_nid = cpu_to_le64(lnet_nid_to_nid4(&hdr->src_nid)); + hdr_nid4->dest_pid = cpu_to_le32(hdr->dest_pid); + hdr_nid4->src_pid = cpu_to_le32(hdr->src_pid); + hdr_nid4->type = cpu_to_le32(hdr->type); + hdr_nid4->payload_length = cpu_to_le32(hdr->payload_length); + + hdr_nid4->msg = hdr->msg; +} + +static inline void lnet_hdr_from_nid16(struct lnet_hdr *hdr, + const struct lnet_hdr_nid16 *vhdr) +{ + const struct lnet_hdr *hdr16 = (void *)vhdr; + + hdr->dest_nid = hdr16->dest_nid; + hdr->src_nid = hdr16->src_nid; + hdr->dest_pid = le32_to_cpu(hdr16->dest_pid); + hdr->src_pid = le32_to_cpu(hdr16->src_pid); + hdr->type = le32_to_cpu(hdr16->type); + hdr->payload_length = le32_to_cpu(hdr16->payload_length); +} + +static inline void lnet_hdr_to_nid16(const struct lnet_hdr *hdr, + struct lnet_hdr_nid16 *vhdr) +{ + struct lnet_hdr *hdr16 = (void *)vhdr; + + hdr16->dest_nid = hdr->dest_nid; + hdr16->src_nid = hdr->src_nid; + hdr16->dest_pid = cpu_to_le32(hdr->dest_pid); + hdr16->src_pid 
= cpu_to_le32(hdr->src_pid); + hdr16->type = cpu_to_le32(hdr->type); + hdr16->payload_length = cpu_to_le32(hdr->payload_length); +} + +extern const struct lnet_lnd the_lolnd; +extern int avoid_asym_router_failure; + +extern unsigned int lnet_nid_cpt_hash(struct lnet_nid *nid, + unsigned int number); +extern int lnet_cpt_of_nid_locked(struct lnet_nid *nid, struct lnet_ni *ni); +extern int lnet_cpt_of_nid(lnet_nid_t nid, struct lnet_ni *ni); +extern int lnet_nid2cpt(struct lnet_nid *nid, struct lnet_ni *ni); +extern struct lnet_ni *lnet_nid2ni_locked(lnet_nid_t nid, int cpt); +extern struct lnet_ni *lnet_nid_to_ni_locked(struct lnet_nid *nid, int cpt); +extern struct lnet_ni *lnet_nid2ni_addref(lnet_nid_t nid); +extern struct lnet_ni *lnet_net2ni_locked(__u32 net, int cpt); +extern struct lnet_ni *lnet_net2ni_addref(__u32 net); +extern struct lnet_ni *lnet_nid_to_ni_addref(struct lnet_nid *nid); +struct lnet_net *lnet_get_net_locked(__u32 net_id); + +int lnet_lib_init(void); +void lnet_lib_exit(void); + +extern unsigned int lnet_response_tracking; +extern unsigned lnet_transaction_timeout; +extern unsigned lnet_retry_count; +extern unsigned int lnet_lnd_timeout; +extern unsigned int lnet_numa_range; +extern unsigned int lnet_health_sensitivity; +extern unsigned int lnet_recovery_interval; +extern unsigned int lnet_recovery_limit; +extern unsigned int lnet_peer_discovery_disabled; +extern unsigned int lnet_drop_asym_route; +extern unsigned int router_sensitivity_percentage; +extern int alive_router_check_interval; +extern int live_router_check_interval; +extern int dead_router_check_interval; +extern int portal_rotor; + +void lnet_mt_event_handler(struct lnet_event *event); + +int lnet_notify(struct lnet_ni *ni, lnet_nid_t peer, bool alive, bool reset, + time64_t when); +void lnet_notify_locked(struct lnet_peer_ni *lp, int notifylnd, int alive, + time64_t when); +int lnet_add_route(__u32 net, __u32 hops, struct lnet_nid *gateway, + __u32 priority, __u32 sensitivity); +int lnet_del_route(__u32 net, struct lnet_nid *gw_nid); +void lnet_move_route(struct lnet_route *route, struct lnet_peer *lp, + struct list_head *rt_list); +void lnet_destroy_routes(void); +int lnet_get_route(int idx, __u32 *net, __u32 *hops, + lnet_nid_t *gateway, __u32 *alive, __u32 *priority, + __u32 *sensitivity); +int lnet_get_rtr_pool_cfg(int idx, struct lnet_ioctl_pool_cfg *pool_cfg); +struct lnet_ni *lnet_get_next_ni_locked(struct lnet_net *mynet, + struct lnet_ni *prev); +struct lnet_ni *lnet_get_ni_idx_locked(int idx); +int lnet_get_net_healthv_locked(struct lnet_net *net); + +extern int libcfs_ioctl_getdata(struct libcfs_ioctl_hdr **hdr_pp, + struct libcfs_ioctl_hdr __user *uparam); +extern int lnet_get_peer_list(__u32 *countp, __u32 *sizep, + struct lnet_process_id __user *ids); +extern void lnet_peer_ni_set_healthv(lnet_nid_t nid, int value, bool all); +extern void lnet_peer_ni_add_to_recoveryq_locked(struct lnet_peer_ni *lpni, + struct list_head *queue, + time64_t now); +extern int lnet_peer_add_pref_nid(struct lnet_peer_ni *lpni, + struct lnet_nid *nid); +extern void lnet_peer_clr_pref_nids(struct lnet_peer_ni *lpni); +extern int lnet_peer_del_pref_nid(struct lnet_peer_ni *lpni, + struct lnet_nid *nid); +void lnet_peer_ni_set_selection_priority(struct lnet_peer_ni *lpni, + __u32 priority); +extern void lnet_ni_add_to_recoveryq_locked(struct lnet_ni *ni, + struct list_head *queue, + time64_t now); + +void lnet_router_debugfs_init(void); +void lnet_router_debugfs_fini(void); +int lnet_rtrpools_alloc(int 
im_a_router);
+void lnet_destroy_rtrbuf(struct lnet_rtrbuf *rb, int npages);
+int lnet_rtrpools_adjust(int tiny, int small, int large);
+int lnet_rtrpools_enable(void);
+void lnet_rtrpools_disable(void);
+void lnet_rtrpools_free(int keep_pools);
+void lnet_rtr_transfer_to_peer(struct lnet_peer *src,
+			       struct lnet_peer *target);
+struct lnet_remotenet *lnet_find_rnet_locked(__u32 net);
+int lnet_dyn_add_net(struct lnet_ioctl_config_data *conf);
+int lnet_dyn_del_net(__u32 net);
+int lnet_dyn_add_ni(struct lnet_ioctl_config_ni *conf);
+int lnet_dyn_del_ni(struct lnet_ioctl_config_ni *conf);
+int lnet_clear_lazy_portal(struct lnet_ni *ni, int portal, char *reason);
+struct lnet_net *lnet_get_net_locked(__u32 net_id);
+void lnet_net_clr_pref_rtrs(struct lnet_net *net);
+int lnet_net_add_pref_rtr(struct lnet_net *net, struct lnet_nid *gw_nid);
+
+int lnet_islocalnid(struct lnet_nid *nid);
+int lnet_islocalnet(__u32 net);
+int lnet_islocalnet_locked(__u32 net);
+
+void lnet_msg_attach_md(struct lnet_msg *msg, struct lnet_libmd *md,
+			unsigned int offset, unsigned int mlen);
+void lnet_build_unlink_event(struct lnet_libmd *md, struct lnet_event *ev);
+void lnet_build_msg_event(struct lnet_msg *msg, enum lnet_event_kind ev_type);
+void lnet_msg_commit(struct lnet_msg *msg, int cpt);
+void lnet_msg_decommit(struct lnet_msg *msg, int cpt, int status);
+
+void lnet_prep_send(struct lnet_msg *msg, int type,
+		    struct lnet_processid *target, unsigned int offset,
+		    unsigned int len);
+int lnet_send(struct lnet_nid *nid, struct lnet_msg *msg,
+	      struct lnet_nid *rtr_nid);
+int lnet_send_ping(struct lnet_nid *dest_nid, struct lnet_handle_md *mdh,
+		   int nnis, void *user_ptr, lnet_handler_t handler,
+		   bool recovery);
+void lnet_return_tx_credits_locked(struct lnet_msg *msg);
+void lnet_return_rx_credits_locked(struct lnet_msg *msg);
+void lnet_schedule_blocked_locked(struct lnet_rtrbufpool *rbp);
+void lnet_drop_routed_msgs_locked(struct list_head *list, int cpt);
+
+struct list_head **lnet_create_array_of_queues(void);
+
+/* portals functions */
+/* portals attributes */
+static inline int
+lnet_ptl_is_lazy(struct lnet_portal *ptl)
+{
+	return !!(ptl->ptl_options & LNET_PTL_LAZY);
+}
+
+static inline int
+lnet_ptl_is_unique(struct lnet_portal *ptl)
+{
+	return !!(ptl->ptl_options & LNET_PTL_MATCH_UNIQUE);
+}
+
+static inline int
+lnet_ptl_is_wildcard(struct lnet_portal *ptl)
+{
+	return !!(ptl->ptl_options & LNET_PTL_MATCH_WILDCARD);
+}
+
+static inline void
+lnet_ptl_setopt(struct lnet_portal *ptl, int opt)
+{
+	ptl->ptl_options |= opt;
+}
+
+static inline void
+lnet_ptl_unsetopt(struct lnet_portal *ptl, int opt)
+{
+	ptl->ptl_options &= ~opt;
+}
+
+/* match-table functions */
+struct list_head *lnet_mt_match_head(struct lnet_match_table *mtable,
+				     struct lnet_processid *id, __u64 mbits);
+struct lnet_match_table *lnet_mt_of_attach(unsigned int index,
+					   struct lnet_processid *id,
+					   __u64 mbits, __u64 ignore_bits,
+					   enum lnet_ins_pos pos);
+int lnet_mt_match_md(struct lnet_match_table *mtable,
+		     struct lnet_match_info *info, struct lnet_msg *msg);
+
+/* portals match/attach functions */
+void lnet_ptl_attach_md(struct lnet_me *me, struct lnet_libmd *md,
+			struct list_head *matches, struct list_head *drops);
+void lnet_ptl_detach_md(struct lnet_me *me, struct lnet_libmd *md);
+int lnet_ptl_match_md(struct lnet_match_info *info, struct lnet_msg *msg);
+
+/* initialize and finalize portals */
+int lnet_portals_create(void);
+void lnet_portals_destroy(void);
+
+/* message functions */
+int lnet_parse(struct
lnet_ni *ni, struct lnet_hdr *hdr, + struct lnet_nid *fromnid, void *private, int rdma_req); +int lnet_parse_local(struct lnet_ni *ni, struct lnet_msg *msg); +int lnet_parse_forward_locked(struct lnet_ni *ni, struct lnet_msg *msg); + +void lnet_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, + int delayed, unsigned int offset, unsigned int mlen, + unsigned int rlen); +void lnet_ni_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, + int delayed, unsigned int offset, + unsigned int mlen, unsigned int rlen); +void lnet_ni_send(struct lnet_ni *ni, struct lnet_msg *msg); + +struct lnet_msg *lnet_create_reply_msg(struct lnet_ni *ni, + struct lnet_msg *get_msg); +void lnet_set_reply_msg_len(struct lnet_ni *ni, struct lnet_msg *msg, + unsigned int len); +void lnet_detach_rsp_tracker(struct lnet_libmd *md, int cpt); +void lnet_clean_zombie_rstqs(void); + +bool lnet_md_discarded(struct lnet_libmd *md); +void lnet_finalize(struct lnet_msg *msg, int rc); +bool lnet_send_error_simulation(struct lnet_msg *msg, + enum lnet_msg_hstatus *hstatus); +void lnet_handle_remote_failure_locked(struct lnet_peer_ni *lpni); + +void lnet_drop_message(struct lnet_ni *ni, int cpt, void *private, + unsigned int nob, __u32 msg_type); +void lnet_drop_delayed_msg_list(struct list_head *head, char *reason); +void lnet_recv_delayed_msg_list(struct list_head *head); + +int lnet_msg_container_setup(struct lnet_msg_container *container, int cpt); +void lnet_msg_container_cleanup(struct lnet_msg_container *container); +void lnet_msg_containers_destroy(void); +int lnet_msg_containers_create(void); + +char *lnet_health_error2str(enum lnet_msg_hstatus hstatus); +char *lnet_msgtyp2str(int type); +int lnet_fail_nid(lnet_nid_t nid, unsigned int threshold); + +/** \addtogroup lnet_fault_simulation @{ */ + +int lnet_fault_ctl(int cmd, struct libcfs_ioctl_data *data); +int lnet_fault_init(void); +void lnet_fault_fini(void); + +bool lnet_drop_rule_match(struct lnet_hdr *hdr, lnet_nid_t local_nid, + enum lnet_msg_hstatus *hstatus); + +int lnet_delay_rule_add(struct lnet_fault_attr *attr); +int lnet_delay_rule_del(lnet_nid_t src, lnet_nid_t dst, bool shutdown); +int lnet_delay_rule_list(int pos, struct lnet_fault_attr *attr, + struct lnet_fault_stat *stat); +void lnet_delay_rule_reset(void); +void lnet_delay_rule_check(void); +bool lnet_delay_rule_match_locked(struct lnet_hdr *hdr, struct lnet_msg *msg); + +/** @} lnet_fault_simulation */ + +void lnet_counters_get_common(struct lnet_counters_common *common); +int lnet_counters_get(struct lnet_counters *counters); +void lnet_counters_reset(void); +static inline void +lnet_ni_set_sel_priority_locked(struct lnet_ni *ni, __u32 priority) +{ + ni->ni_sel_priority = priority; +} + +static inline void +lnet_net_set_sel_priority_locked(struct lnet_net *net, __u32 priority) +{ + net->net_sel_priority = priority; +} + +unsigned int lnet_iov_nob(unsigned int niov, struct kvec *iov); +unsigned int lnet_kiov_nob(unsigned int niov, struct bio_vec *iov); +int lnet_extract_kiov(int dst_niov, struct bio_vec *dst, + int src_niov, struct bio_vec *src, + unsigned int offset, unsigned int len); + +void lnet_copy_iov2iov(unsigned int ndiov, struct kvec *diov, + unsigned int doffset, + unsigned int nsiov, struct kvec *siov, + unsigned int soffset, unsigned int nob); +void lnet_copy_kiov2iov(unsigned int niov, struct kvec *iov, + unsigned int iovoffset, + unsigned int nkiov, struct bio_vec *kiov, + unsigned int kiovoffset, unsigned int nob); +void lnet_copy_iov2kiov(unsigned int nkiov, 
struct bio_vec *kiov, + unsigned int kiovoffset, + unsigned int niov, struct kvec *iov, + unsigned int iovoffset, unsigned int nob); +void lnet_copy_kiov2kiov(unsigned int ndkiov, struct bio_vec *dkiov, + unsigned int doffset, + unsigned int nskiov, struct bio_vec *skiov, + unsigned int soffset, unsigned int nob); + +static inline void +lnet_copy_kiov2flat(int dlen, void *dest, unsigned int doffset, + unsigned int nsiov, struct bio_vec *skiov, + unsigned int soffset, unsigned int nob) +{ + struct kvec diov = { .iov_base = dest, .iov_len = dlen }; + + lnet_copy_kiov2iov(1, &diov, doffset, + nsiov, skiov, soffset, nob); +} + +static inline void +lnet_copy_flat2kiov(unsigned int ndiov, struct bio_vec *dkiov, + unsigned int doffset, int slen, void *src, + unsigned int soffset, unsigned int nob) +{ + struct kvec siov = { .iov_base = src, .iov_len = slen }; + lnet_copy_iov2kiov(ndiov, dkiov, doffset, + 1, &siov, soffset, nob); +} + +void lnet_me_unlink(struct lnet_me *me); + +void lnet_md_unlink(struct lnet_libmd *md); +void lnet_md_deconstruct(struct lnet_libmd *lmd, struct lnet_event *ev); +struct page *lnet_kvaddr_to_page(unsigned long vaddr); +struct page *lnet_get_first_page(struct lnet_libmd *md, unsigned int offset); +int lnet_cpt_of_md(struct lnet_libmd *md, unsigned int offset); + +unsigned int lnet_get_lnd_timeout(void); +void lnet_register_lnd(const struct lnet_lnd *lnd); +void lnet_unregister_lnd(const struct lnet_lnd *lnd); + +struct socket *lnet_connect(struct lnet_nid *peer_nid, int interface, + struct sockaddr *peeraddr, struct net *ns); +void lnet_connect_console_error(int rc, struct lnet_nid *peer_nid, + struct sockaddr *sa); +int lnet_count_acceptor_nets(void); +int lnet_acceptor_timeout(void); +int lnet_acceptor_port(void); +int lnet_acceptor_start(void); +void lnet_acceptor_stop(void); + +struct lnet_inetdev { + u32 li_cpt; + u32 li_flags; + u32 li_ipaddr; + u32 li_netmask; + char li_name[IFNAMSIZ]; +}; + +int lnet_inet_enumerate(struct lnet_inetdev **dev_list, struct net *ns); +void lnet_sock_setbuf(struct socket *socket, int txbufsize, int rxbufsize); +void lnet_sock_getbuf(struct socket *socket, int *txbufsize, int *rxbufsize); +int lnet_sock_getaddr(struct socket *socket, bool remote, + struct sockaddr_storage *peer); +int lnet_sock_write(struct socket *sock, void *buffer, int nob, int timeout); +int lnet_sock_read(struct socket *sock, void *buffer, int nob, int timeout); + +struct socket *lnet_sock_listen(int port, int backlog, + struct net *ns); +struct socket *lnet_sock_connect(int interface, int local_port, + struct sockaddr *peeraddr, + struct net *ns); + +int lnet_peers_start_down(void); +int lnet_peer_buffer_credits(struct lnet_net *net); +void lnet_consolidate_routes_locked(struct lnet_peer *orig_lp, + struct lnet_peer *new_lp); +void lnet_router_discovery_complete(struct lnet_peer *lp); +void lnet_router_discovery_ping_reply(struct lnet_peer *lp); + +int lnet_monitor_thr_start(void); +void lnet_monitor_thr_stop(void); + +bool lnet_router_checker_active(void); +void lnet_check_routers(void); +void lnet_wait_router_start(void); +void lnet_swap_pinginfo(struct lnet_ping_buffer *pbuf); + +int lnet_ping_info_validate(struct lnet_ping_info *pinfo); +struct lnet_ping_buffer *lnet_ping_buffer_alloc(int nnis, gfp_t gfp); +void lnet_ping_buffer_free(struct lnet_ping_buffer *pbuf); + +static inline void lnet_ping_buffer_addref(struct lnet_ping_buffer *pbuf) +{ + atomic_inc(&pbuf->pb_refcnt); +} + +static inline void lnet_ping_buffer_decref(struct lnet_ping_buffer *pbuf) +{ 
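+	/* Drop one reference. On the final put, wake any thread that is
+	 * waiting for pb_refcnt to drop (the wake_up_var() below pairs
+	 * with a wait_var_event() on pb_refcnt) and free the buffer.
+	 */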
+ if (atomic_dec_and_test(&pbuf->pb_refcnt)) { + wake_up_var(&pbuf->pb_refcnt); + lnet_ping_buffer_free(pbuf); + } +} + +static inline int lnet_push_target_resize_needed(void) +{ + return the_lnet.ln_push_target->pb_nnis < the_lnet.ln_push_target_nnis; +} + +int lnet_push_target_resize(void); +int lnet_push_target_post(struct lnet_ping_buffer *pbuf, + struct lnet_handle_md *mdh); +void lnet_peer_push_event(struct lnet_event *ev); + +int lnet_parse_ip2nets(const char **networksp, const char *ip2nets); +int lnet_parse_routes(const char *route_str, int *im_a_router); +int lnet_parse_networks(struct list_head *nilist, const char *networks); +bool lnet_net_unique(__u32 net_id, struct list_head *nilist, + struct lnet_net **net); +bool lnet_ni_unique_net(struct list_head *nilist, char *iface); +void lnet_incr_dlc_seq(void); +__u32 lnet_get_dlc_seq_locked(void); + +struct lnet_peer_net *lnet_get_next_peer_net_locked(struct lnet_peer *lp, + __u32 prev_lpn_id); +struct lnet_peer_ni *lnet_get_next_peer_ni_locked(struct lnet_peer *peer, + struct lnet_peer_net *peer_net, + struct lnet_peer_ni *prev); +struct lnet_peer_ni *lnet_nid2peerni_locked(lnet_nid_t nid, lnet_nid_t pref, + int cpt); +struct lnet_peer_ni *lnet_peerni_by_nid_locked(struct lnet_nid *nid, + struct lnet_nid *pref, + int cpt); +struct lnet_peer_ni *lnet_nid2peerni_ex(struct lnet_nid *nid, int cpt); +struct lnet_peer_ni *lnet_peer_get_ni_locked(struct lnet_peer *lp, + lnet_nid_t nid); +struct lnet_peer_ni *lnet_peer_ni_get_locked(struct lnet_peer *lp, + struct lnet_nid *nid); +struct lnet_peer_ni *lnet_find_peer_ni_locked(lnet_nid_t nid); +struct lnet_peer_ni *lnet_peer_ni_find_locked(struct lnet_nid *nid); +struct lnet_peer *lnet_find_peer4(lnet_nid_t nid); +struct lnet_peer *lnet_find_peer(struct lnet_nid *nid); +void lnet_peer_net_added(struct lnet_net *net); +void lnet_peer_primary_nid_locked(struct lnet_nid *nid, + struct lnet_nid *result); +int lnet_discover_peer_locked(struct lnet_peer_ni *lpni, int cpt, bool block); +void lnet_peer_queue_message(struct lnet_peer *lp, struct lnet_msg *msg); +int lnet_peer_discovery_start(void); +void lnet_peer_discovery_stop(void); +void lnet_push_update_to_peers(int force); +void lnet_peer_tables_cleanup(struct lnet_net *net); +void lnet_peer_uninit(void); +int lnet_peer_tables_create(void); +void lnet_debug_peer(lnet_nid_t nid); +struct lnet_peer_net *lnet_peer_get_net_locked(struct lnet_peer *peer, + __u32 net_id); +bool lnet_peer_is_pref_nid_locked(struct lnet_peer_ni *lpni, + struct lnet_nid *nid); +int lnet_peer_add_pref_nid(struct lnet_peer_ni *lpni, struct lnet_nid *nid); +void lnet_peer_clr_pref_nids(struct lnet_peer_ni *lpni); +bool lnet_peer_is_pref_rtr_locked(struct lnet_peer_ni *lpni, + struct lnet_nid *gw_nid); +void lnet_peer_clr_pref_rtrs(struct lnet_peer_ni *lpni); +int lnet_peer_add_pref_rtr(struct lnet_peer_ni *lpni, struct lnet_nid *nid); +int lnet_peer_ni_set_non_mr_pref_nid(struct lnet_peer_ni *lpni, + struct lnet_nid *nid); +int lnet_add_peer_ni(lnet_nid_t key_nid, lnet_nid_t nid, bool mr, bool temp); +int lnet_del_peer_ni(lnet_nid_t key_nid, lnet_nid_t nid); +int lnet_get_peer_info(struct lnet_ioctl_peer_cfg *cfg, void __user *bulk); +int lnet_get_peer_ni_info(__u32 peer_index, __u64 *nid, + char alivness[LNET_MAX_STR_LEN], + __u32 *cpt_iter, __u32 *refcount, + __u32 *ni_peer_tx_credits, __u32 *peer_tx_credits, + __u32 *peer_rtr_credits, __u32 *peer_min_rtr_credtis, + __u32 *peer_tx_qnob); +int lnet_get_peer_ni_hstats(struct lnet_ioctl_peer_ni_hstats *stats); + +static 
inline void +lnet_peer_net_set_sel_priority_locked(struct lnet_peer_net *lpn, __u32 priority) +{ + lpn->lpn_sel_priority = priority; +} + + +static inline struct lnet_peer_net * +lnet_find_peer_net_locked(struct lnet_peer *peer, __u32 net_id) +{ + struct lnet_peer_net *peer_net; + + list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_peer_nets) { + if (peer_net->lpn_net_id == net_id) + return peer_net; + } + + return NULL; +} + +static inline bool +lnet_peer_is_multi_rail(struct lnet_peer *lp) +{ + if (lp->lp_state & LNET_PEER_MULTI_RAIL) + return true; + return false; +} + +static inline bool +lnet_peer_ni_is_configured(struct lnet_peer_ni *lpni) +{ + if (lpni->lpni_peer_net->lpn_peer->lp_state & LNET_PEER_CONFIGURED) + return true; + return false; +} + +static inline bool +lnet_peer_ni_is_primary(struct lnet_peer_ni *lpni) +{ + return nid_same(&lpni->lpni_nid, + &lpni->lpni_peer_net->lpn_peer->lp_primary_nid); +} + +bool lnet_peer_is_uptodate(struct lnet_peer *lp); +bool lnet_peer_is_uptodate_locked(struct lnet_peer *lp); +bool lnet_is_discovery_disabled(struct lnet_peer *lp); +bool lnet_is_discovery_disabled_locked(struct lnet_peer *lp); +bool lnet_peer_gw_discovery(struct lnet_peer *lp); + +static inline bool +lnet_peer_needs_push(struct lnet_peer *lp) +{ + if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) + return false; + if (lp->lp_state & LNET_PEER_MARK_DELETED) + return false; + if (lp->lp_state & LNET_PEER_FORCE_PUSH) + return true; + if (lp->lp_state & LNET_PEER_NO_DISCOVERY) + return false; + /* if discovery is not enabled then no need to push */ + if (lnet_peer_discovery_disabled) + return false; + if (lp->lp_node_seqno < atomic_read(&the_lnet.ln_ping_target_seqno)) + return true; + return false; +} + +#define LNET_RECOVERY_INTERVAL_MAX 900 +static inline unsigned int +lnet_get_next_recovery_ping(unsigned int ping_count, time64_t now) +{ + unsigned int interval; + + /* 2^9 = 512, 2^10 = 1024 */ + if (ping_count > 9) + interval = LNET_RECOVERY_INTERVAL_MAX; + else + interval = 1 << ping_count; + + return now + interval; +} + +static inline void +lnet_peer_ni_set_next_ping(struct lnet_peer_ni *lpni, time64_t now) +{ + lpni->lpni_next_ping = + lnet_get_next_recovery_ping(lpni->lpni_ping_count, now); +} + +static inline void +lnet_ni_set_next_ping(struct lnet_ni *ni, time64_t now) +{ + ni->ni_next_ping = lnet_get_next_recovery_ping(ni->ni_ping_count, now); +} + +/* + * A peer NI is alive if it satisfies the following two conditions: + * 1. peer NI health >= LNET_MAX_HEALTH_VALUE * router_sensitivity_percentage + * 2. 
the cached NI status received when we discover the peer is UP + */ +static inline bool +lnet_is_peer_ni_alive(struct lnet_peer_ni *lpni) +{ + bool halive = false; + + halive = (atomic_read(&lpni->lpni_healthv) >= + (LNET_MAX_HEALTH_VALUE * router_sensitivity_percentage / 100)); + + return halive && lpni->lpni_ns_status == LNET_NI_STATUS_UP; +} + +static inline void +lnet_update_peer_net_healthv(struct lnet_peer_ni *lpni) +{ + struct lnet_peer_net *lpn; + int best_healthv = 0; + + lpn = lpni->lpni_peer_net; + + list_for_each_entry(lpni, &lpn->lpn_peer_nis, lpni_peer_nis) { + int lpni_healthv = atomic_read(&lpni->lpni_healthv); + if (best_healthv < lpni_healthv) + best_healthv = lpni_healthv; + } + + lpn->lpn_healthv = best_healthv; +} + +static inline void +lnet_set_lpni_healthv_locked(struct lnet_peer_ni *lpni, int value) +{ + if (atomic_read(&lpni->lpni_healthv) == value) + return; + atomic_set(&lpni->lpni_healthv, value); + lnet_update_peer_net_healthv(lpni); +} + +static inline bool +lnet_atomic_add_unless_max(atomic_t *v, int a, int u) +{ + int c = atomic_read(v); + bool mod = false; + int old; + int m; + + if (c == u) + return mod; + + for (;;) { + if (c + a >= u) + m = u; + else + m = c + a; + old = atomic_cmpxchg(v, c, m); + + if (old == u) + break; + + if (old == c) { + mod = true; + break; + } + c = old; + } + + return mod; +} + +static inline void +lnet_inc_lpni_healthv_locked(struct lnet_peer_ni *lpni, int value) +{ + /* only adjust the net health if the lpni health value changed */ + if (lnet_atomic_add_unless_max(&lpni->lpni_healthv, value, + LNET_MAX_HEALTH_VALUE)) + lnet_update_peer_net_healthv(lpni); +} + +static inline void +lnet_inc_healthv(atomic_t *healthv, int value) +{ + lnet_atomic_add_unless_max(healthv, value, LNET_MAX_HEALTH_VALUE); +} + +static inline int +lnet_get_list_len(struct list_head *list) +{ + struct list_head *l; + int count = 0; + + list_for_each(l, list) + count++; + + return count; +} + +void lnet_incr_stats(struct lnet_element_stats *stats, + enum lnet_msg_type msg_type, + enum lnet_stats_type stats_type); + +__u32 lnet_sum_stats(struct lnet_element_stats *stats, + enum lnet_stats_type stats_type); + +void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats, + struct lnet_element_stats *stats); + +static inline void +lnet_set_route_aliveness(struct lnet_route *route, bool alive) +{ + bool old = atomic_xchg(&route->lr_alive, alive); + + if (old != alive) + CERROR("route to %s through %s has gone from %s to %s\n", + libcfs_net2str(route->lr_net), + libcfs_nidstr(&route->lr_gateway->lp_primary_nid), + old ? "up" : "down", + alive ? "up" : "down"); +} +#endif diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/lib-types.h b/drivers/staging/lustrefsx/lnet/include/lnet/lib-types.h new file mode 100644 index 0000000000000..0df6857d89573 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/lnet/lib-types.h @@ -0,0 +1,1338 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/include/lnet/lib-types.h + * + * Types used by the library side routines that do not need to be + * exposed to the user application + */ + +#ifndef __LNET_LIB_TYPES_H__ +#define __LNET_LIB_TYPES_H__ + +#ifndef __KERNEL__ +# error This include is only for kernel use. +#endif + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +char *libcfs_nidstr_r(const struct lnet_nid *nid, + char *buf, size_t buf_size); + +static inline char *libcfs_nidstr(const struct lnet_nid *nid) +{ + return libcfs_nidstr_r(nid, libcfs_next_nidstring(), + LNET_NIDSTR_SIZE); +} + +int libcfs_strnid(struct lnet_nid *nid, const char *str); +char *libcfs_idstr(struct lnet_processid *id); + +int cfs_match_nid_net(struct lnet_nid *nid, u32 net, + struct list_head *net_num_list, + struct list_head *addr); + +/* Max payload size */ +#define LNET_MAX_PAYLOAD LNET_MTU + +/** limit on the number of fragments in discontiguous MDs */ +#define LNET_MAX_IOV 256 + +/* + * This is the maximum health value. + * All local and peer NIs created have their health default to this value. + */ +#define LNET_MAX_HEALTH_VALUE 1000 +#define LNET_MAX_SELECTION_PRIORITY UINT_MAX + +/* forward refs */ +struct lnet_libmd; + +enum lnet_msg_hstatus { + LNET_MSG_STATUS_OK = 0, + LNET_MSG_STATUS_LOCAL_INTERRUPT, + LNET_MSG_STATUS_LOCAL_DROPPED, + LNET_MSG_STATUS_LOCAL_ABORTED, + LNET_MSG_STATUS_LOCAL_NO_ROUTE, + LNET_MSG_STATUS_LOCAL_ERROR, + LNET_MSG_STATUS_LOCAL_TIMEOUT, + LNET_MSG_STATUS_REMOTE_ERROR, + LNET_MSG_STATUS_REMOTE_DROPPED, + LNET_MSG_STATUS_REMOTE_TIMEOUT, + LNET_MSG_STATUS_NETWORK_TIMEOUT, + LNET_MSG_STATUS_END, +}; + +struct lnet_rsp_tracker { + /* chain on the waiting list */ + struct list_head rspt_on_list; + /* cpt to lock */ + int rspt_cpt; + /* nid of next hop */ + struct lnet_nid rspt_next_hop_nid; + /* deadline of the REPLY/ACK */ + ktime_t rspt_deadline; + /* parent MD */ + struct lnet_handle_md rspt_mdh; +}; + +struct lnet_msg { + struct list_head msg_activelist; + struct list_head msg_list; /* Q for credits/MD */ + + struct lnet_processid msg_target; + /* Primary NID of the source. */ + struct lnet_nid msg_initiator; + /* where is it from, it's only for building event */ + struct lnet_nid msg_from; + __u32 msg_type; + + /* + * hold parameters in case message is with held due + * to discovery + */ + struct lnet_nid msg_src_nid_param; + struct lnet_nid msg_rtr_nid_param; + + /* + * Deadline for the message after which it will be finalized if it + * has not completed. + */ + ktime_t msg_deadline; + + /* The message health status. 
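+	 * Set when a send or receive attempt completes (by the LND or by
+	 * LNet itself) and examined by the health/resend logic when the
+	 * message is finalized.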
*/
+	enum lnet_msg_hstatus msg_health_status;
+	/* This is a recovery message */
+	bool msg_recovery;
+	/* force an RDMA even if the message size is < 4K */
+	bool msg_rdma_force;
+	/* the number of times a transmission has been retried */
+	int msg_retry_count;
+	/* flag to indicate that we do not want to resend this message */
+	bool msg_no_resend;
+
+	/* committed for sending */
+	unsigned int msg_tx_committed:1;
+	/* CPT # this message committed for sending */
+	unsigned int msg_tx_cpt:15;
+	/* committed for receiving */
+	unsigned int msg_rx_committed:1;
+	/* CPT # this message committed for receiving */
+	unsigned int msg_rx_cpt:15;
+	/* queued for tx credit */
+	unsigned int msg_tx_delayed:1;
+	/* queued for RX buffer */
+	unsigned int msg_rx_delayed:1;
+	/* ready for pending on RX delay list */
+	unsigned int msg_rx_ready_delay:1;
+
+	unsigned int msg_vmflush:1;		/* VM trying to free memory */
+	unsigned int msg_target_is_router:1;	/* sending to a router */
+	unsigned int msg_routing:1;		/* being forwarded */
+	unsigned int msg_ack:1;			/* ack on finalize (PUT) */
+	unsigned int msg_sending:1;		/* outgoing message */
+	unsigned int msg_receiving:1;		/* being received */
+	unsigned int msg_txcredit:1;		/* taken an NI send credit */
+	unsigned int msg_peertxcredit:1;	/* taken a peer send credit */
+	unsigned int msg_rtrcredit:1;		/* taken a global router credit */
+	unsigned int msg_peerrtrcredit:1;	/* taken a peer router credit */
+	unsigned int msg_onactivelist:1;	/* on the activelist */
+	unsigned int msg_rdma_get:1;
+
+	struct lnet_peer_ni *msg_txpeer;	/* peer I'm sending to */
+	struct lnet_peer_ni *msg_rxpeer;	/* peer I received from */
+
+	void *msg_private;
+	struct lnet_libmd *msg_md;
+	/* the NI the message was sent or received over */
+	struct lnet_ni *msg_txni;
+	struct lnet_ni *msg_rxni;
+
+	unsigned int msg_len;
+	unsigned int msg_wanted;
+	unsigned int msg_offset;
+	unsigned int msg_niov;
+	struct bio_vec *msg_kiov;
+
+	struct lnet_event msg_ev;
+	struct lnet_hdr msg_hdr;
+};
+
+struct lnet_libhandle {
+	struct list_head lh_hash_chain;
+	__u64 lh_cookie;
+};
+
+#define lh_entry(ptr, type, member) \
+	((type *)((char *)(ptr)-(char *)(&((type *)0)->member)))
+
+struct lnet_me {
+	struct list_head me_list;
+	int me_cpt;
+	struct lnet_processid me_match_id;
+	unsigned int me_portal;
+	unsigned int me_pos;		/* hash offset in mt_hash */
+	__u64 me_match_bits;
+	__u64 me_ignore_bits;
+	enum lnet_unlink me_unlink;
+	struct lnet_libmd *me_md;
+};
+
+struct lnet_libmd {
+	struct list_head md_list;
+	struct lnet_libhandle md_lh;
+	struct lnet_me *md_me;
+	char *md_start;
+	unsigned int md_offset;
+	unsigned int md_length;
+	unsigned int md_max_size;
+	int md_threshold;
+	int md_refcount;
+	unsigned int md_options;
+	unsigned int md_flags;
+	unsigned int md_niov;		/* # frags at end of struct */
+	void *md_user_ptr;
+	struct lnet_rsp_tracker *md_rspt_ptr;
+	lnet_handler_t md_handler;
+	struct lnet_handle_md md_bulk_handle;
+	struct bio_vec md_kiov[LNET_MAX_IOV];
+};
+
+#define LNET_MD_FLAG_ZOMBIE		BIT(0)
+#define LNET_MD_FLAG_AUTO_UNLINK	BIT(1)
+#define LNET_MD_FLAG_ABORTED		BIT(2)
+/* LNET_MD_FLAG_HANDLING is set when a non-unlink event handler
+ * is being called for an event relating to the md.
+ * It ensures only one such handler runs at a time.
+ * The final "unlink" event is only called once the
+ * md_refcount has reached zero, and this flag has been cleared,
+ * ensuring that it doesn't race with any other event handler
+ * call.
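+ *
+ * Illustrative sketch only (not the actual dispatch code): a caller
+ * delivering a non-unlink event 'ev' is expected to behave like
+ *
+ *	if (!(md->md_flags & LNET_MD_FLAG_HANDLING)) {
+ *		md->md_flags |= LNET_MD_FLAG_HANDLING;
+ *		md->md_handler(&ev);	-- run the event callback
+ *		md->md_flags &= ~LNET_MD_FLAG_HANDLING;
+ *	}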
+ */
+#define LNET_MD_FLAG_HANDLING		BIT(3)
+#define LNET_MD_FLAG_DISCARD		BIT(4)
+
+struct lnet_test_peer {
+	/* info about peers we are trying to fail */
+	struct list_head tp_list;	/* ln_test_peers */
+	struct lnet_nid tp_nid;		/* matching nid */
+	unsigned int tp_threshold;	/* # failures to simulate */
+};
+
+#define LNET_COOKIE_TYPE_MD	1
+#define LNET_COOKIE_TYPE_ME	2
+#define LNET_COOKIE_TYPE_EQ	3
+#define LNET_COOKIE_TYPE_BITS	2
+#define LNET_COOKIE_MASK	((1ULL << LNET_COOKIE_TYPE_BITS) - 1ULL)
+
+struct netstrfns {
+	u32 nf_type;
+	char *nf_name;
+	char *nf_modname;
+	void (*nf_addr2str)(u32 addr, char *str, size_t size);
+	void (*nf_addr2str_size)(const __be32 *addr, size_t asize,
+				 char *str, size_t size);
+	int (*nf_str2addr)(const char *str, int nob, u32 *addr);
+	int (*nf_str2addr_size)(const char *str, int nob,
+				__be32 *addr, size_t *asize);
+	int (*nf_parse_addrlist)(char *str, int len,
+				 struct list_head *list);
+	int (*nf_print_addrlist)(char *buffer, int count,
+				 struct list_head *list);
+	int (*nf_match_addr)(u32 addr, struct list_head *list);
+	int (*nf_min_max)(struct list_head *nidlist, u32 *min_nid,
+			  u32 *max_nid);
+};
+
+struct lnet_ni;			/* forward ref */
+struct socket;
+
+struct lnet_lnd {
+	/* fields initialized by the LND */
+	__u32 lnd_type;
+
+	int (*lnd_startup)(struct lnet_ni *ni);
+	void (*lnd_shutdown)(struct lnet_ni *ni);
+	int (*lnd_ctl)(struct lnet_ni *ni, unsigned int cmd, void *arg);
+
+	/* In data movement APIs below, payload buffers are described as a set
+	 * of 'niov' fragments which are in pages.
+	 * The LND may NOT overwrite these fragment descriptors.
+	 * An 'offset' may specify a byte offset within the set of
+	 * fragments to start from.
+	 */
+
+	/* Start sending a preformatted message. 'private' is NULL for PUT and
+	 * GET messages; otherwise this is a response to an incoming message
+	 * and 'private' is the 'private' passed to lnet_parse(). Return
+	 * non-zero for immediate failure, otherwise complete later with
+	 * lnet_finalize() */
+	int (*lnd_send)(struct lnet_ni *ni, void *private,
+			struct lnet_msg *msg);
+
+	/* Start receiving 'mlen' bytes of payload data, skipping the following
+	 * 'rlen' - 'mlen' bytes. 'private' is the 'private' passed to
+	 * lnet_parse(). Return non-zero for immediate failure, otherwise
+	 * complete later with lnet_finalize(). This also gives back a receive
+	 * credit if the LND does flow control. */
+	int (*lnd_recv)(struct lnet_ni *ni, void *private, struct lnet_msg *msg,
+			int delayed, unsigned int niov,
+			struct bio_vec *kiov,
+			unsigned int offset, unsigned int mlen, unsigned int rlen);
+
+	/* lnet_parse() has had to delay processing of this message
+	 * (e.g. waiting for a forwarding buffer or send credits). Give the
+	 * LND a chance to free urgently needed resources. If called, return 0
+	 * for success and do NOT give back a receive credit; that has to wait
+	 * until lnd_recv() gets called. On failure return < 0 and
+	 * release resources; lnd_recv() will not be called.
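+ *
+ * Illustrative call pattern only (assuming a delayed message 'msg'
+ * received on NI 'ni'):
+ *
+ *	rc = ni->ni_net->net_lnd->lnd_eager_recv(ni, private, msg, &private);
+ *	-- rc == 0: resources were freed, lnd_recv() follows later
+ *	-- rc < 0:  the message is dropped and lnd_recv() is never called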
*/ + int (*lnd_eager_recv)(struct lnet_ni *ni, void *private, + struct lnet_msg *msg, void **new_privatep); + + /* notification of peer down */ + void (*lnd_notify_peer_down)(struct lnet_nid *peer); + + /* accept a new connection */ + int (*lnd_accept)(struct lnet_ni *ni, struct socket *sock); + + /* get dma_dev priority */ + unsigned int (*lnd_get_dev_prio)(struct lnet_ni *ni, + unsigned int dev_idx); +}; + +struct lnet_tx_queue { + int tq_credits; /* # tx credits free */ + int tq_credits_min; /* lowest it's been */ + int tq_credits_max; /* total # tx credits */ + struct list_head tq_delayed; /* delayed TXs */ +}; + +enum lnet_net_state { + /* set when net block is allocated */ + LNET_NET_STATE_INIT = 0, + /* set when NIs in net are started successfully */ + LNET_NET_STATE_ACTIVE, + /* set if all NIs in net are in FAILED state */ + LNET_NET_STATE_INACTIVE, + /* set when shutting down a NET */ + LNET_NET_STATE_DELETING +}; + +enum lnet_ni_state { + /* initial state when NI is created */ + LNET_NI_STATE_INIT = 0, + /* set when NI is brought up */ + LNET_NI_STATE_ACTIVE, + /* set when NI is being shutdown */ + LNET_NI_STATE_DELETING, +}; + +#define LNET_NI_RECOVERY_PENDING BIT(0) +#define LNET_NI_RECOVERY_FAILED BIT(1) + +enum lnet_stats_type { + LNET_STATS_TYPE_SEND = 0, + LNET_STATS_TYPE_RECV, + LNET_STATS_TYPE_DROP +}; + +struct lnet_comm_count { + atomic_t co_get_count; + atomic_t co_put_count; + atomic_t co_reply_count; + atomic_t co_ack_count; + atomic_t co_hello_count; +}; + +struct lnet_element_stats { + struct lnet_comm_count el_send_stats; + struct lnet_comm_count el_recv_stats; + struct lnet_comm_count el_drop_stats; +}; + +struct lnet_health_local_stats { + atomic_t hlt_local_interrupt; + atomic_t hlt_local_dropped; + atomic_t hlt_local_aborted; + atomic_t hlt_local_no_route; + atomic_t hlt_local_timeout; + atomic_t hlt_local_error; +}; + +struct lnet_health_remote_stats { + atomic_t hlt_remote_dropped; + atomic_t hlt_remote_timeout; + atomic_t hlt_remote_error; + atomic_t hlt_network_timeout; +}; + +struct lnet_net { + /* chain on the ln_nets */ + struct list_head net_list; + + /* net ID, which is composed of + * (net_type << 16) | net_num. 
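+	 * For example, assuming SOCKLND's LND type value of 2, the "tcp1"
+	 * network would have net ID (2 << 16) | 1 == 0x20001.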
+	 * net_type can be one of the enumerated types defined in
+	 * lnet/include/lnet/nidstr.h */
+	__u32 net_id;
+
+	/* round robin selection */
+	__u32 net_seq;
+
+	/* total number of CPTs in the array */
+	__u32 net_ncpts;
+
+	/* cumulative CPTs of all NIs in this net */
+	__u32 *net_cpts;
+
+	/* relative net selection priority */
+	__u32 net_sel_priority;
+
+	/* network tunables */
+	struct lnet_ioctl_config_lnd_cmn_tunables net_tunables;
+
+	/*
+	 * boolean to indicate that the tunables have been set and
+	 * shouldn't be reset
+	 */
+	bool net_tunables_set;
+
+	/* procedural interface */
+	const struct lnet_lnd *net_lnd;
+
+	/* list of NIs on this net */
+	struct list_head net_ni_list;
+
+	/* list of NIs being added, but not started yet */
+	struct list_head net_ni_added;
+
+	/* dying LND instances */
+	struct list_head net_ni_zombie;
+
+	/* when I was last alive */
+	time64_t net_last_alive;
+
+	/* protects access to net_last_alive */
+	spinlock_t net_lock;
+
+	/* list of router nids preferred for this network */
+	struct list_head net_rtr_pref_nids;
+};
+
+struct lnet_ni {
+	/* chain on the lnet_net structure */
+	struct list_head ni_netlist;
+
+	/* chain on the recovery queue */
+	struct list_head ni_recovery;
+
+	/* MD handle for recovery ping */
+	struct lnet_handle_md ni_ping_mdh;
+
+	spinlock_t ni_lock;
+
+	/* number of CPTs */
+	int ni_ncpts;
+
+	/* bond NI on some CPTs */
+	__u32 *ni_cpts;
+
+	/* interface's NID */
+	struct lnet_nid ni_nid;
+
+	/* instance-specific data */
+	void *ni_data;
+
+	/* per ni credits */
+	atomic_t ni_tx_credits;
+
+	/* percpt TX queues */
+	struct lnet_tx_queue **ni_tx_queues;
+
+	/* percpt reference count */
+	int **ni_refs;
+
+	/* pointer to parent network */
+	struct lnet_net *ni_net;
+
+	/* my health status */
+	struct lnet_ni_status *ni_status;
+
+	/* NI FSM. Protected by lnet_ni_lock() */
+	enum lnet_ni_state ni_state;
+
+	/* Recovery state. Protected by lnet_ni_lock() */
+	__u32 ni_recovery_state;
+
+	/* When to send the next recovery ping */
+	time64_t ni_next_ping;
+	/* How many pings sent during current recovery period did not receive
+	 * a reply. NB: reset whenever _any_ message arrives on this NI
+	 */
+	unsigned int ni_ping_count;
+
+	/* per NI LND tunables */
+	struct lnet_lnd_tunables ni_lnd_tunables;
+
+	/* lnd tunables set explicitly */
+	bool ni_lnd_tunables_set;
+
+	/* NI statistics */
+	struct lnet_element_stats ni_stats;
+	struct lnet_health_local_stats ni_hstats;
+
+	/* physical device CPT */
+	int ni_dev_cpt;
+
+	/* sequence number used to round robin over nis within a net */
+	__u32 ni_seq;
+
+	/*
+	 * health value
+	 * initialized to LNET_MAX_HEALTH_VALUE
+	 * Value is decremented every time we fail to send a message over
+	 * this NI because of a NI specific failure.
+	 * Value is incremented if we successfully send a message.
+	 */
+	atomic_t ni_healthv;
+
+	/*
+	 * Set to 1 by the LND when it receives an event telling it the device
+	 * has gone into a fatal state. Set to 0 when the LND receives an
+	 * event telling it the device is back online.
+	 */
+	atomic_t ni_fatal_error_on;
+
+	/* the relative selection priority of this NI */
+	__u32 ni_sel_priority;
+
+	/*
+	 * equivalent interface to use
+	 */
+	char *ni_interface;
+	struct net *ni_net_ns;		/* original net namespace */
+};
+
+#define LNET_PROTO_PING_MATCHBITS	0x8000000000000000LL
+
+/*
+ * Descriptor of a ping info buffer: keep a separate indicator of the
+ * size and a reference count.
The type is used both as a source and
+ * sink of data, so we need to keep some information outside of the
+ * area that may be overwritten by network data.
+ */
+struct lnet_ping_buffer {
+	int pb_nnis;
+	atomic_t pb_refcnt;
+	bool pb_needs_post;
+	struct lnet_ping_info pb_info;
+};
+
+#define LNET_PING_BUFFER_SIZE(NNIDS) \
+	offsetof(struct lnet_ping_buffer, pb_info.pi_ni[NNIDS])
+#define LNET_PING_BUFFER_LONI(PBUF)	((PBUF)->pb_info.pi_ni[0].ns_nid)
+#define LNET_PING_BUFFER_SEQNO(PBUF)	((PBUF)->pb_info.pi_ni[0].ns_status)
+
+#define LNET_PING_INFO_TO_BUFFER(PINFO) \
+	container_of((PINFO), struct lnet_ping_buffer, pb_info)
+
+struct lnet_nid_list {
+	struct list_head nl_list;
+	struct lnet_nid nl_nid;
+};
+
+struct lnet_peer_ni {
+	/* chain on lpn_peer_nis */
+	struct list_head lpni_peer_nis;
+	/* chain on remote peer list */
+	struct list_head lpni_on_remote_peer_ni_list;
+	/* chain on recovery queue */
+	struct list_head lpni_recovery;
+	/* chain on peer hash */
+	struct list_head lpni_hashlist;
+	/* messages blocking for tx credits */
+	struct list_head lpni_txq;
+	/* pointer to peer net I'm part of */
+	struct lnet_peer_net *lpni_peer_net;
+	/* statistics kept on each peer NI */
+	struct lnet_element_stats lpni_stats;
+	struct lnet_health_remote_stats lpni_hstats;
+	/* spin lock protecting credits and lpni_txq */
+	spinlock_t lpni_lock;
+	/* # tx credits available */
+	int lpni_txcredits;
+	/* low water mark */
+	int lpni_mintxcredits;
+	/*
+	 * Each peer_ni in a gateway maintains its own credits. This
+	 * allows more traffic to gateways that have multiple interfaces.
+	 */
+	/* # router credits */
+	int lpni_rtrcredits;
+	/* low water mark */
+	int lpni_minrtrcredits;
+	/* bytes queued for sending */
+	long lpni_txqnob;
+	/* network peer is on */
+	struct lnet_net *lpni_net;
+	/* peer's NID */
+	struct lnet_nid lpni_nid;
+	/* # refs */
+	struct kref lpni_kref;
+	/* health value for the peer */
+	atomic_t lpni_healthv;
+	/* recovery ping mdh */
+	struct lnet_handle_md lpni_recovery_ping_mdh;
+	/* When to send the next recovery ping */
+	time64_t lpni_next_ping;
+	/* How many pings sent during current recovery period did not receive
+	 * a reply. NB: reset whenever _any_ message arrives from this peer NI
+	 */
+	unsigned int lpni_ping_count;
+	/* CPT this peer attached on */
+	int lpni_cpt;
+	/* state flags -- protected by lpni_lock */
+	unsigned lpni_state;
+	/* status of the peer NI as reported by the peer */
+	__u32 lpni_ns_status;
+	/* sequence number used to round robin over peer nis within a net */
+	__u32 lpni_seq;
+	/* sequence number used to round robin over gateways */
+	__u32 lpni_gw_seq;
+	/* returned RC ping features. Protected with lpni_lock */
+	unsigned int lpni_ping_feats;
+	/* time last message was received from the peer */
+	time64_t lpni_last_alive;
+	/* preferred local nids: if only one, use lpni_pref.nid */
+	union lpni_pref {
+		struct lnet_nid nid;
+		struct list_head nids;
+	} lpni_pref;
+	/* list of router nids preferred for this peer NI */
+	struct list_head lpni_rtr_pref_nids;
+	/* The relative selection priority of this peer NI */
+	__u32 lpni_sel_priority;
+	/* number of preferred NIDs in lpni_pref_nids */
+	__u32 lpni_pref_nnids;
+};
+
+/* Preferred path added due to traffic on non-MR peer_ni */
+#define LNET_PEER_NI_NON_MR_PREF	BIT(0)
+/* peer is being recovered.
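+ * (it sits on the recovery queue via lpni_recovery above while
+ * recovery pings are outstanding)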
*/
+#define LNET_PEER_NI_RECOVERY_PENDING	BIT(1)
+/* recovery ping failed */
+#define LNET_PEER_NI_RECOVERY_FAILED	BIT(2)
+/* peer is being deleted */
+#define LNET_PEER_NI_DELETING		BIT(3)
+
+struct lnet_peer {
+	/* chain on pt_peer_list */
+	struct list_head lp_peer_list;
+
+	/* list of peer nets */
+	struct list_head lp_peer_nets;
+
+	/* list of messages pending discovery */
+	struct list_head lp_dc_pendq;
+
+	/* chain on router list */
+	struct list_head lp_rtr_list;
+
+	/* primary NID of the peer */
+	struct lnet_nid lp_primary_nid;
+
+	/* source NID to use during discovery */
+	struct lnet_nid lp_disc_src_nid;
+	/* destination NID to use during discovery */
+	struct lnet_nid lp_disc_dst_nid;
+
+	/* net to perform discovery on */
+	__u32 lp_disc_net_id;
+
+	/* CPT of peer_table */
+	int lp_cpt;
+
+	/* number of NIDs on this peer */
+	int lp_nnis;
+
+	/* # refs from lnet_route::lr_gateway */
+	int lp_rtr_refcount;
+
+	/*
+	 * peer specific health sensitivity value to decrement peer nis in
+	 * this peer with if set to something other than 0
+	 */
+	__u32 lp_health_sensitivity;
+
+	/* messages blocking for router credits */
+	struct list_head lp_rtrq;
+
+	/* routes on this peer */
+	struct list_head lp_routes;
+
+	/* reference count */
+	atomic_t lp_refcount;
+
+	/* lock protecting peer state flags and lpni_rtrq */
+	spinlock_t lp_lock;
+
+	/* peer state flags */
+	unsigned lp_state;
+
+	/* buffer for data pushed by peer */
+	struct lnet_ping_buffer *lp_data;
+
+	/* MD handle for ping in progress */
+	struct lnet_handle_md lp_ping_mdh;
+
+	/* MD handle for push in progress */
+	struct lnet_handle_md lp_push_mdh;
+
+	/* number of NIDs for sizing push data */
+	int lp_data_nnis;
+
+	/* NI config sequence number of peer */
+	__u32 lp_peer_seqno;
+
+	/* Local NI config sequence number acked by peer */
+	__u32 lp_node_seqno;
+
+	/* Local NI config sequence number sent to peer */
+	__u32 lp_node_seqno_sent;
+
+	/* Ping error encountered during discovery. */
+	int lp_ping_error;
+
+	/* Push error encountered during discovery. */
+	int lp_push_error;
+
+	/* Error encountered during discovery. */
+	int lp_dc_error;
+
+	/* time it was put on the ln_dc_working queue */
+	time64_t lp_last_queued;
+
+	/* link on discovery-related lists */
+	struct list_head lp_dc_list;
+
+	/* tasks waiting on discovery of this peer */
+	wait_queue_head_t lp_dc_waitq;
+
+	/* cached peer aliveness */
+	bool lp_alive;
+};
+
+/*
+ * The status flags in lp_state. Their semantics have been chosen so
+ * that lp_state can be zero-initialized.
+ *
+ * A peer is marked MULTI_RAIL in two cases: it was configured using DLC
+ * as multi-rail aware, or the LNET_PING_FEAT_MULTI_RAIL bit was set.
+ *
+ * A peer is marked NO_DISCOVERY if the LNET_PING_FEAT_DISCOVERY bit was
+ * NOT set when the peer was pinged by discovery.
+ *
+ * A peer is marked ROUTER if it indicates so in the feature bit.
+ */
+#define LNET_PEER_MULTI_RAIL		BIT(0)	/* Multi-rail aware */
+#define LNET_PEER_NO_DISCOVERY		BIT(1)	/* Peer disabled discovery */
+#define LNET_PEER_ROUTER_ENABLED	BIT(2)	/* router feature enabled */
+
+/*
+ * A peer is marked CONFIGURED if it was configured by DLC.
+ *
+ * In addition, a peer is marked DISCOVERED if it has fully passed
+ * through Peer Discovery.
+ *
+ * When Peer Discovery is disabled, the discovery thread will mark
+ * peers REDISCOVER to indicate that they should be re-examined if
+ * discovery is (re)enabled on the node.
+ *
+ * A peer that was created as the result of inbound traffic will not
+ * be marked at all.
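+ *
+ * As an illustrative (not exhaustive) example, a successful discovery
+ * pass might step through
+ *
+ *	DISCOVERING -> PING_SENT -> DATA_PRESENT -> NIDS_UPTODATE
+ *		    -> DISCOVERED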
+ */ +#define LNET_PEER_CONFIGURED BIT(3) /* Configured via DLC */ +#define LNET_PEER_DISCOVERED BIT(4) /* Peer was discovered */ +#define LNET_PEER_REDISCOVER BIT(5) /* Discovery was disabled */ +/* + * A peer is marked DISCOVERING when discovery is in progress. + * The other flags below correspond to stages of discovery. + */ +#define LNET_PEER_DISCOVERING BIT(6) /* Discovering */ +#define LNET_PEER_DATA_PRESENT BIT(7) /* Remote peer data present */ +#define LNET_PEER_NIDS_UPTODATE BIT(8) /* Remote peer info uptodate */ +#define LNET_PEER_PING_SENT BIT(9) /* Waiting for REPLY to Ping */ +#define LNET_PEER_PUSH_SENT BIT(10) /* Waiting for ACK of Push */ +#define LNET_PEER_PING_FAILED BIT(11) /* Ping send failure */ +#define LNET_PEER_PUSH_FAILED BIT(12) /* Push send failure */ +/* + * A ping can be forced as a way to fix up state, or as a manual + * intervention by an admin. + * A push can be forced in circumstances that would normally not + * allow for one to happen. + */ +#define LNET_PEER_FORCE_PING BIT(13) /* Forced Ping */ +#define LNET_PEER_FORCE_PUSH BIT(14) /* Forced Push */ + +/* force delete even if router */ +#define LNET_PEER_RTR_NI_FORCE_DEL BIT(15) + +/* gw undergoing alive discovery */ +#define LNET_PEER_RTR_DISCOVERY BIT(16) +/* gw has undergone discovery (does not indicate success or failure) */ +#define LNET_PEER_RTR_DISCOVERED BIT(17) + +/* peer is marked for deletion */ +#define LNET_PEER_MARK_DELETION BIT(18) +/* lnet_peer_del()/lnet_peer_del_locked() has been called on the peer */ +#define LNET_PEER_MARK_DELETED BIT(19) +/* lock primary NID to what's requested by ULP */ +#define LNET_PEER_LOCK_PRIMARY BIT(20) +/* this is for informational purposes only. It is set if a peer gets + * configured from Lustre with a primary NID which belongs to another peer + * which is also configured by Lustre as the primary NID. + */ +#define LNET_PEER_BAD_CONFIG BIT(21) + +struct lnet_peer_net { + /* chain on lp_peer_nets */ + struct list_head lpn_peer_nets; + + /* list of peer_nis on this network */ + struct list_head lpn_peer_nis; + + /* pointer to the peer I'm part of */ + struct lnet_peer *lpn_peer; + + /* Net ID */ + __u32 lpn_net_id; + + /* peer net health */ + int lpn_healthv; + + /* time of next router ping on this net */ + time64_t lpn_next_ping; + + /* selection sequence number */ + __u32 lpn_seq; + + /* relative peer net selection priority */ + __u32 lpn_sel_priority; + + /* reference count */ + atomic_t lpn_refcount; +}; + +/* peer hash size */ +#define LNET_PEER_HASH_BITS 9 +#define LNET_PEER_HASH_SIZE (1 << LNET_PEER_HASH_BITS) + +/* + * peer hash table - one per CPT + * + * protected by lnet_net_lock/EX for update + * pt_version + * pt_hash[...] 
+ * pt_peer_list + * pt_peers + * protected by pt_zombie_lock: + * pt_zombie_list + * pt_zombies + * + * pt_zombie lock nests inside lnet_net_lock + */ +struct lnet_peer_table { + int pt_version; /* /proc validity stamp */ + struct list_head *pt_hash; /* NID->peer hash */ + struct list_head pt_peer_list; /* peers */ + int pt_peers; /* # peers */ + struct list_head pt_zombie_list; /* zombie peer_ni */ + int pt_zombies; /* # zombie peers_ni */ + spinlock_t pt_zombie_lock; /* protect list and count */ +}; + +/* peer aliveness is enabled only on routers for peers in a network where the + * struct lnet_ni::ni_peertimeout has been set to a positive value + */ +#define lnet_peer_aliveness_enabled(lp) (the_lnet.ln_routing != 0 && \ + ((lp)->lpni_net) && \ + (lp)->lpni_net->net_tunables.lct_peer_timeout > 0) + +struct lnet_route { + struct list_head lr_list; /* chain on net */ + struct list_head lr_gwlist; /* chain on gateway */ + struct lnet_peer *lr_gateway; /* router node */ + struct lnet_nid lr_nid; /* NID used to add route */ + __u32 lr_net; /* remote network number */ + __u32 lr_lnet; /* local network number */ + int lr_seq; /* sequence for round-robin */ + __u32 lr_hops; /* how far I am */ + unsigned int lr_priority; /* route priority */ + atomic_t lr_alive; /* cached route aliveness */ + bool lr_single_hop; /* this route is single-hop */ +}; + +#define LNET_REMOTE_NETS_HASH_DEFAULT (1U << 7) +#define LNET_REMOTE_NETS_HASH_MAX (1U << 16) +#define LNET_REMOTE_NETS_HASH_SIZE (1 << the_lnet.ln_remote_nets_hbits) + +struct lnet_remotenet { + /* chain on ln_remote_nets_hash */ + struct list_head lrn_list; + /* routes to me */ + struct list_head lrn_routes; + /* my net number */ + __u32 lrn_net; +}; + +/** lnet message has credit and can be submitted to lnd for send/receive */ +#define LNET_CREDIT_OK 0 +/** lnet message is waiting for credit */ +#define LNET_CREDIT_WAIT 1 +/** lnet message is waiting for discovery */ +#define LNET_DC_WAIT 2 + +struct lnet_rtrbufpool { + /* my free buffer pool */ + struct list_head rbp_bufs; + /* messages blocking for a buffer */ + struct list_head rbp_msgs; + /* # pages in each buffer */ + int rbp_npages; + /* requested number of buffers */ + int rbp_req_nbuffers; + /* # buffers actually allocated */ + int rbp_nbuffers; + /* # free buffers / blocked messages */ + int rbp_credits; + /* low water mark */ + int rbp_mincredits; +}; + +struct lnet_rtrbuf { + struct list_head rb_list; /* chain on rbp_bufs */ + struct lnet_rtrbufpool *rb_pool; /* owning pool */ + struct bio_vec rb_kiov[0]; /* the buffer space */ +}; + +#define LNET_PEER_HASHSIZE 503 /* prime! 
*/ + +enum lnet_match_flags { + /* Didn't match anything */ + LNET_MATCHMD_NONE = BIT(0), + /* Matched OK */ + LNET_MATCHMD_OK = BIT(1), + /* Must be discarded */ + LNET_MATCHMD_DROP = BIT(2), + /* match and buffer is exhausted */ + LNET_MATCHMD_EXHAUSTED = BIT(3), + /* match or drop */ + LNET_MATCHMD_FINISH = (LNET_MATCHMD_OK | LNET_MATCHMD_DROP), +}; + +/* Options for struct lnet_portal::ptl_options */ +#define LNET_PTL_LAZY BIT(0) +#define LNET_PTL_MATCH_UNIQUE BIT(1) /* unique match, for RDMA */ +#define LNET_PTL_MATCH_WILDCARD BIT(2) /* wildcard match, request portal */ + +/* parameter for matching operations (GET, PUT) */ +struct lnet_match_info { + __u64 mi_mbits; + struct lnet_processid mi_id; + unsigned int mi_cpt; + unsigned int mi_opc; + unsigned int mi_portal; + unsigned int mi_rlength; + unsigned int mi_roffset; +}; + +/* ME hash of RDMA portal */ +#define LNET_MT_HASH_BITS 8 +#define LNET_MT_HASH_SIZE (1 << LNET_MT_HASH_BITS) +#define LNET_MT_HASH_MASK (LNET_MT_HASH_SIZE - 1) +/* we allocate (LNET_MT_HASH_SIZE + 1) entries for lnet_match_table::mt_hash, + * the last entry is reserved for MEs with ignore-bits */ +#define LNET_MT_HASH_IGNORE LNET_MT_HASH_SIZE +/* __u64 has 2^6 bits, so need 2^(LNET_MT_HASH_BITS - LNET_MT_BITS_U64) which + * is 4 __u64s as bit-map, and add an extra __u64 (only use one bit) for the + * ME-list with ignore-bits, which is mtable::mt_hash[LNET_MT_HASH_IGNORE] */ +#define LNET_MT_BITS_U64 6 /* 2^6 bits */ +#define LNET_MT_EXHAUSTED_BITS (LNET_MT_HASH_BITS - LNET_MT_BITS_U64) +#define LNET_MT_EXHAUSTED_BMAP ((1 << LNET_MT_EXHAUSTED_BITS) + 1) + +/* portal match table */ +struct lnet_match_table { + /* reserved for upcoming patches, CPU partition ID */ + unsigned int mt_cpt; + unsigned int mt_portal; /* portal index */ + /* match table is set as "enabled" if there's non-exhausted MD + * attached on mt_mhash, it's only valid for wildcard portal */ + unsigned int mt_enabled; + /* bitmap to flag whether MEs on mt_hash are exhausted or not */ + __u64 mt_exhausted[LNET_MT_EXHAUSTED_BMAP]; + struct list_head *mt_mhash; /* matching hash */ +}; + +/* these are only useful for wildcard portal */ +/* Turn off message rotor for wildcard portals */ +#define LNET_PTL_ROTOR_OFF 0 +/* round-robin dispatch all PUT messages for wildcard portals */ +#define LNET_PTL_ROTOR_ON 1 +/* round-robin dispatch routed PUT message for wildcard portals */ +#define LNET_PTL_ROTOR_RR_RT 2 +/* dispatch routed PUT message by hashing source NID for wildcard portals */ +#define LNET_PTL_ROTOR_HASH_RT 3 + +struct lnet_portal { + spinlock_t ptl_lock; + unsigned int ptl_index; /* portal ID, reserved */ + /* flags on this portal: lazy, unique... 
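+	 * wildcard; a mask of the LNET_PTL_* option bits defined above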
*/
+	unsigned int ptl_options;
+	/* list of messages which are stealing buffer */
+	struct list_head ptl_msg_stealing;
+	/* messages blocking for MD */
+	struct list_head ptl_msg_delayed;
+	/* Match table for each CPT */
+	struct lnet_match_table **ptl_mtables;
+	/* spread rotor of incoming "PUT" */
+	unsigned int ptl_rotor;
+	/* # active entries for this portal */
+	int ptl_mt_nmaps;
+	/* array of active entries' cpu-partition-id */
+	int ptl_mt_maps[0];
+};
+
+#define LNET_LH_HASH_BITS	12
+#define LNET_LH_HASH_SIZE	(1ULL << LNET_LH_HASH_BITS)
+#define LNET_LH_HASH_MASK	(LNET_LH_HASH_SIZE - 1)
+
+/* resource container (ME, MD, EQ) */
+struct lnet_res_container {
+	unsigned int rec_type;		/* container type */
+	__u64 rec_lh_cookie;		/* cookie generator */
+	struct list_head rec_active;	/* active resource list */
+	struct list_head *rec_lh_hash;	/* handle hash */
+};
+
+/* message container */
+struct lnet_msg_container {
+	int msc_init;			/* initialized or not */
+	/* max # threads finalizing */
+	int msc_nfinalizers;
+	/* msgs waiting to complete finalizing */
+	struct list_head msc_finalizing;
+	/* msgs waiting to be resent */
+	struct list_head msc_resending;
+	struct list_head msc_active;	/* active message list */
+	/* threads doing finalization */
+	void **msc_finalizers;
+	/* threads doing resends */
+	void **msc_resenders;
+};
+
+/* These UDSP structures need to match the user space liblnetconfig
+ * structures in order for the marshall and unmarshall functions to be
+ * common.
+ */
+
+/* Net is described as a
+ * 1. net type
+ * 2. num range
+ */
+struct lnet_ud_net_descr {
+	__u32 udn_net_type;
+	struct list_head udn_net_num_range;
+};
+
+/* each NID range is defined as
+ * 1. net descriptor
+ * 2. address range descriptor
+ */
+struct lnet_ud_nid_descr {
+	struct lnet_ud_net_descr ud_net_id;
+	struct list_head ud_addr_range;
+	__u32 ud_mem_size;
+};
+
+/* a UDSP rule can have up to three user defined NID descriptors
+ * - src: defines the local NID range for the rule
+ * - dst: defines the peer NID range for the rule
+ * - rte: defines the router NID range for the rule
+ *
+ * An action union defines the action to take when the rule
+ * is matched
+ */
+struct lnet_udsp {
+	struct list_head udsp_on_list;
+	__u32 udsp_idx;
+	struct lnet_ud_nid_descr udsp_src;
+	struct lnet_ud_nid_descr udsp_dst;
+	struct lnet_ud_nid_descr udsp_rte;
+	enum lnet_udsp_action_type udsp_action_type;
+	union {
+		__u32 udsp_priority;
+	} udsp_action;
+};
+
+/* Peer Discovery states */
+#define LNET_DC_STATE_SHUTDOWN	0	/* not started */
+#define LNET_DC_STATE_RUNNING	1	/* started up OK */
+#define LNET_DC_STATE_STOPPING	2	/* telling thread to stop */
+
+/* Router Checker states */
+#define LNET_MT_STATE_SHUTDOWN	0	/* not started */
+#define LNET_MT_STATE_RUNNING	1	/* started up OK */
+#define LNET_MT_STATE_STOPPING	2	/* telling thread to stop */
+
+/* LNet states */
+#define LNET_STATE_SHUTDOWN	0	/* not started */
+#define LNET_STATE_RUNNING	1	/* started up OK */
+#define LNET_STATE_STOPPING	2	/* telling thread to stop */
+
+struct lnet {
+	/* CPU partition table of LNet */
+	struct cfs_cpt_table *ln_cpt_table;
+	/* number of CPTs in ln_cpt_table */
+	unsigned int ln_cpt_number;
+	unsigned int ln_cpt_bits;
+
+	/* protect LNet resources (ME/MD/EQ) */
+	struct cfs_percpt_lock *ln_res_lock;
+	/* # portals */
+	int ln_nportals;
+	/* the vector of portals */
+	struct lnet_portal **ln_portals;
+	/* percpt MD container */
+	struct lnet_res_container **ln_md_containers;
+
+	/* Event Queue container */
+	struct
lnet_res_container ln_eq_container;
+	spinlock_t ln_eq_wait_lock;
+
+	unsigned int ln_remote_nets_hbits;
+
+	/* protect NI, peer table, credits, routers, rtrbuf... */
+	struct cfs_percpt_lock *ln_net_lock;
+	/* percpt message containers for active/finalizing/freed message */
+	struct lnet_msg_container **ln_msg_containers;
+	struct lnet_counters **ln_counters;
+	struct lnet_peer_table **ln_peer_tables;
+	/* list of peer nis not on a local network */
+	struct list_head ln_remote_peer_ni_list;
+	/* failure simulation */
+	struct list_head ln_test_peers;
+	struct list_head ln_drop_rules;
+	struct list_head ln_delay_rules;
+	/* LND instances */
+	struct list_head ln_nets;
+	/* the loopback NI */
+	struct lnet_ni *ln_loni;
+	/* network zombie list */
+	struct list_head ln_net_zombie;
+	/* resend messages list */
+	struct list_head ln_msg_resend;
+	/* spin lock to protect the msg resend list */
+	spinlock_t ln_msg_resend_lock;
+
+	/* remote networks with routes to them */
+	struct list_head *ln_remote_nets_hash;
+	/* validity stamp */
+	__u64 ln_remote_nets_version;
+	/* list of all known routers */
+	struct list_head ln_routers;
+	/* validity stamp */
+	__u64 ln_routers_version;
+	/* percpt router buffer pools */
+	struct lnet_rtrbufpool **ln_rtrpools;
+
+	/*
+	 * Ping target / Push source
+	 *
+	 * The ping target and push source share a single buffer. The
+	 * ln_ping_target is protected against concurrent updates by
+	 * ln_api_mutex.
+	 */
+	struct lnet_handle_md ln_ping_target_md;
+	lnet_handler_t ln_ping_target_handler;
+	struct lnet_ping_buffer *ln_ping_target;
+	atomic_t ln_ping_target_seqno;
+
+	/*
+	 * Push Target
+	 *
+	 * ln_push_target_nnis contains the desired size of the push target.
+	 * The lnet_net_lock is used to handle update races. The old
+	 * buffer may linger a while after it has been unlinked, in
+	 * which case the event handler cleans up.
+	 */
+	lnet_handler_t ln_push_target_handler;
+	struct lnet_handle_md ln_push_target_md;
+	struct lnet_ping_buffer *ln_push_target;
+	int ln_push_target_nnis;
+
+	/* discovery event queue handle */
+	lnet_handler_t ln_dc_handler;
+	/* discovery requests */
+	struct list_head ln_dc_request;
+	/* discovery working list */
+	struct list_head ln_dc_working;
+	/* discovery expired list */
+	struct list_head ln_dc_expired;
+	/* discovery thread wait queue */
+	wait_queue_head_t ln_dc_waitq;
+	/* discovery startup/shutdown state */
+	int ln_dc_state;
+
+	/* monitor thread startup/shutdown state */
+	int ln_mt_state;
+	/* serialise startup/shutdown */
+	struct semaphore ln_mt_signal;
+
+	struct mutex ln_api_mutex;
+	struct mutex ln_lnd_mutex;
+	/* Have I called LNetNIInit myself? */
+	int ln_niinit_self;
+	/* LNetNIInit/LNetNIFini counter */
+	int ln_refcount;
+	/* SHUTDOWN/RUNNING/STOPPING */
+	int ln_state;
+
+	int ln_routing;			/* am I a router? */
+	lnet_pid_t ln_pid;		/* requested pid */
+	/* uniquely identifies this ni in this epoch */
+	__u64 ln_interface_cookie;
+	/* registered LNDs */
+	const struct lnet_lnd *ln_lnds[NUM_LNDS];
+
+	/* test protocol compatibility flags */
+	unsigned long ln_testprotocompat;
+
+	/* 0 - load the NIs from the mod params
+	 * 1 - do not load the NIs from the mod params
+	 * Reverse logic to ensure that other calls to LNetNIInit
+	 * need no change
+	 */
+	bool ln_nis_from_mod_params;
+
+	/*
+	 * completion for the monitor thread. The monitor thread takes care of
+	 * checking routes, timed-out messages and resending messages.
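+	 *
+	 * The thread can use it as a wakeable sleep between passes,
+	 * e.g. (illustrative only):
+	 *
+	 *	wait_for_completion_timeout(&the_lnet.ln_mt_wait_complete,
+	 *				    cfs_time_seconds(interval));
+	 *
+	 * so that shutdown can complete() it to force an early wakeup.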
+ */
+ struct completion ln_mt_wait_complete;
+
+ /* per-cpt resend queues */
+ struct list_head **ln_mt_resendqs;
+ /* local NIs to recover */
+ struct list_head ln_mt_localNIRecovq;
+ /* peer NIs to recover */
+ struct list_head ln_mt_peerNIRecovq;
+ /*
+ * An array of queues for GET/PUT waiting for REPLY/ACK respectively.
+ * There is one queue per CPT. Since response trackers will be
+ * added on the fast path, we can't afford to grab the exclusive
+ * net lock to protect these queues. The CPT will be calculated
+ * based on the mdh cookie.
+ */
+ struct list_head **ln_mt_rstq;
+ /*
+ * A response tracker becomes a zombie when the associated MD is queued
+ * for unlink before the response tracker is detached from the MD. An
+ * entry on a zombie list can be freed when either the remaining
+ * operations on the MD complete or when LNet has shut down.
+ */
+ struct list_head **ln_mt_zombie_rstqs;
+ /* recovery handler */
+ lnet_handler_t ln_mt_handler;
+
+ /*
+ * Completed when the discovery and monitor threads can enter their
+ * work loops
+ */
+ struct completion ln_started;
+ /* UDSP list */
+ struct list_head ln_udsp_list;
+};
+
+struct genl_filter_list {
+ struct list_head lp_list;
+ void *lp_cursor;
+ bool lp_first;
+};
+
+static const struct nla_policy scalar_attr_policy[LN_SCALAR_MAX + 1] = {
+ [LN_SCALAR_ATTR_LIST] = { .type = NLA_NESTED },
+ [LN_SCALAR_ATTR_LIST_SIZE] = { .type = NLA_U16 },
+ [LN_SCALAR_ATTR_INDEX] = { .type = NLA_U16 },
+ [LN_SCALAR_ATTR_NLA_TYPE] = { .type = NLA_U16 },
+ [LN_SCALAR_ATTR_VALUE] = { .type = NLA_STRING },
+ [LN_SCALAR_ATTR_KEY_FORMAT] = { .type = NLA_U16 },
+};
+
+int lnet_genl_send_scalar_list(struct sk_buff *msg, u32 portid, u32 seq,
+ const struct genl_family *family, int flags,
+ u8 cmd, const struct ln_key_list *data[]);
+
+/* Special workaround for pre-4.19 kernels to send error messages
+ * from dumpit routines. Newer kernels will send messages with
+ * NL_SET_ERR_MSG information by default if NETLINK_EXT_ACK is set.
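+ *
+ * As an illustration only (the handler and fill-function names here
+ * are hypothetical, not part of this patch), a dumpit routine would
+ * typically end with:
+ *
+ *	static int xx_show_dump(struct sk_buff *msg,
+ *				struct netlink_callback *cb)
+ *	{
+ *		int rc = xx_fill_info(msg, cb);
+ *
+ *		if (rc < 0)
+ *			return lnet_nl_send_error(msg,
+ *					NETLINK_CB(cb->skb).portid,
+ *					cb->nlh->nlmsg_seq, rc);
+ *		return msg->len;
+ *	}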
+ */ +static inline int lnet_nl_send_error(struct sk_buff *msg, int portid, int seq, + int error) +{ +#ifndef HAVE_NL_DUMP_WITH_EXT_ACK + struct nlmsghdr *nlh; + + if (!error) + return 0; + + nlh = nlmsg_put(msg, portid, seq, NLMSG_ERROR, sizeof(error), 0); + if (!nlh) + return -ENOMEM; +#ifdef HAVE_NL_PARSE_WITH_EXT_ACK + netlink_ack(msg, nlh, error, NULL); +#else + netlink_ack(msg, nlh, error); +#endif + return nlmsg_len(nlh); +#else + return error; +#endif +} + +#endif diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/lnet_rdma.h b/drivers/staging/lustrefsx/lnet/include/lnet/lnet_rdma.h new file mode 100644 index 0000000000000..6aa5367af007c --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/lnet/lnet_rdma.h @@ -0,0 +1,89 @@ +#ifndef LUSTRE_NVFS_H +#define LUSTRE_NVFS_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#define REGSTR2(x) x##_register_nvfs_dma_ops +#define REGSTR(x) REGSTR2(x) + +#define UNREGSTR2(x) x##_unregister_nvfs_dma_ops +#define UNREGSTR(x) UNREGSTR2(x) + +#define MODULE_PREFIX lustre_v1 + +#define REGISTER_FUNC REGSTR(MODULE_PREFIX) +#define UNREGISTER_FUNC UNREGSTR(MODULE_PREFIX) + +#define NVFS_IO_ERR -1 +#define NVFS_CPU_REQ -2 + +#define NVFS_HOLD_TIME_MS 1000 + +struct nvfs_dma_rw_ops { + unsigned long long ft_bmap; /* feature bitmap */ + + int (*nvfs_blk_rq_map_sg) (struct request_queue *q, + struct request *req, + struct scatterlist *sglist); + + int (*nvfs_dma_map_sg_attrs) (struct device *device, + struct scatterlist *sglist, + int nents, + enum dma_data_direction dma_dir, + unsigned long attrs); + + int (*nvfs_dma_unmap_sg) (struct device *device, + struct scatterlist *sglist, + int nents, + enum dma_data_direction dma_dir); + bool (*nvfs_is_gpu_page) (struct page *); + unsigned int (*nvfs_gpu_index) (struct page *page); + unsigned int (*nvfs_device_priority) (struct device *dev, unsigned int dev_index); +}; + +/* feature list for dma_ops, values indicate bit pos */ +enum ft_bits { + nvfs_ft_prep_sglist = 1ULL << 0, + nvfs_ft_map_sglist = 1ULL << 1, + nvfs_ft_is_gpu_page = 1ULL << 2, + nvfs_ft_device_priority = 1ULL << 3, +}; + +/* check features for use in registration with vendor drivers */ +#define NVIDIA_FS_CHECK_FT_SGLIST_PREP(ops) \ + ((ops)->ft_bmap & nvfs_ft_prep_sglist) +#define NVIDIA_FS_CHECK_FT_SGLIST_DMA(ops) \ + ((ops)->ft_bmap & nvfs_ft_map_sglist) +#define NVIDIA_FS_CHECK_FT_GPU_PAGE(ops) \ + ((ops)->ft_bmap & nvfs_ft_is_gpu_page) +#define NVIDIA_FS_CHECK_FT_DEVICE_PRIORITY(ops) \ + ((ops)->ft_bmap & nvfs_ft_device_priority) + +int REGISTER_FUNC (struct nvfs_dma_rw_ops *ops); + +void UNREGISTER_FUNC (void); + +unsigned int lnet_get_dev_prio(struct device *dev, + unsigned int dev_idx); +int lnet_rdma_map_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction direction); +int lnet_rdma_unmap_sg(struct device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction); +bool lnet_is_rdma_only_page(struct page *page); +unsigned int lnet_get_dev_idx(struct page *page); + +/* DMA_ATTR_NO_WARN was added to kernel v4.8-11962-ga9a62c9 */ +#ifndef DMA_ATTR_NO_WARN +#define DMA_ATTR_NO_WARN 0 +#endif + +#endif /* LUSTRE_NVFS_H */ + diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/socklnd.h b/drivers/staging/lustrefsx/lnet/include/lnet/socklnd.h new file mode 100644 index 0000000000000..ff1fe2381768d --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/lnet/socklnd.h @@ -0,0 +1,99 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR 
REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/include/lnet/socklnd.h + */ +#ifndef __LNET_LNET_SOCKLND_H__ +#define __LNET_LNET_SOCKLND_H__ + +#include +#include + +struct ksock_hello_msg { + __u32 kshm_magic; /* LNET_PROTO_MAGIC */ + __u32 kshm_version; /* KSOCK_PROTO_V* */ + struct lnet_nid kshm_src_nid; /* sender's nid */ + struct lnet_nid kshm_dst_nid; /* destination nid */ + lnet_pid_t kshm_src_pid; /* sender's pid */ + lnet_pid_t kshm_dst_pid; /* destination pid */ + __u64 kshm_src_incarnation; /* sender's incarnation */ + __u64 kshm_dst_incarnation; /* destination's incarnation */ + __u32 kshm_ctype; /* SOCKLND_CONN_* */ + __u32 kshm_nips; /* always sent as zero */ + __u32 kshm_ips[0]; /* deprecated */ +} __packed; + +struct ksock_hello_msg_nid4 { + __u32 kshm_magic; /* LNET_PROTO_MAGIC */ + __u32 kshm_version; /* KSOCK_PROTO_V* */ + lnet_nid_t kshm_src_nid; /* sender's nid */ + lnet_nid_t kshm_dst_nid; /* destination nid */ + lnet_pid_t kshm_src_pid; /* sender's pid */ + lnet_pid_t kshm_dst_pid; /* destination pid */ + __u64 kshm_src_incarnation; /* sender's incarnation */ + __u64 kshm_dst_incarnation; /* destination's incarnation */ + __u32 kshm_ctype; /* SOCKLND_CONN_* */ + __u32 kshm_nips; /* sent as zero */ + __u32 kshm_ips[0]; /* deprecated */ +} __packed; + +struct ksock_msg_hdr { + __u32 ksh_type; /* type of socklnd message */ + __u32 ksh_csum; /* checksum if != 0 */ + __u64 ksh_zc_cookies[2]; /* Zero-Copy request/ACK + * cookie + */ +} __packed; + +#define KSOCK_MSG_NOOP 0xc0 /* empty */ +#define KSOCK_MSG_LNET 0xc1 /* lnet msg */ + +struct ksock_msg { + struct ksock_msg_hdr ksm_kh; + union { + /* case ksm_kh.ksh_type == KSOCK_MSG_NOOP */ + /* - nothing */ + /* case ksm_kh.ksh_type == KSOCK_MSG_LNET */ + struct lnet_hdr_nid4 lnetmsg_nid4; + /* case ksm_kh.ksh_type == KSOCK_MSG_LNET && + * kshm_version >= KSOCK_PROTO_V4 + */ + struct lnet_hdr_nid16 lnetmsg_nid16; + } __packed ksm_u; +} __packed; +#define ksm_type ksm_kh.ksh_type +#define ksm_csum ksm_kh.ksh_csum +#define ksm_zc_cookies ksm_kh.ksh_zc_cookies + +/* We need to know this number to parse hello msg from ksocklnd in + * other LND (usocklnd, for example) */ +#define KSOCK_PROTO_V2 2 +#define KSOCK_PROTO_V3 3 +#define KSOCK_PROTO_V4 4 + +#endif diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/udsp.h b/drivers/staging/lustrefsx/lnet/include/lnet/udsp.h new file mode 100644 index 0000000000000..3ba5a30f6a374 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/lnet/udsp.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. 
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ *
+ * Copyright (c) 2018-2020 Data Direct Networks.
+ *
+ * This file is part of Lustre, https://wiki.whamcloud.com/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * Author: Amir Shehata
+ */
+
+#ifndef UDSP_H
+#define UDSP_H
+
+#include
+
+/**
+ * lnet_udsp_add_policy
+ * Add a policy \new in position \idx
+ * Must be called with api_mutex held
+ */
+int lnet_udsp_add_policy(struct lnet_udsp *new, int idx);
+
+/**
+ * lnet_udsp_get_policy
+ * get a policy in position \idx
+ * Must be called with api_mutex held
+ */
+struct lnet_udsp *lnet_udsp_get_policy(int idx);
+
+/**
+ * lnet_udsp_del_policy
+ * Delete a policy from position \idx
+ * Must be called with api_mutex held
+ */
+int lnet_udsp_del_policy(int idx);
+
+/**
+ * lnet_udsp_apply_policies
+ * apply all stored policies across the system
+ * Must be called with api_mutex held
+ * Must NOT be called with lnet_net_lock held
+ * udsp: NULL to apply on all existing udsps
+ * non-NULL to apply to specified udsp
+ * revert: true to revert policy application
+ */
+int lnet_udsp_apply_policies(struct lnet_udsp *udsp, bool revert);
+
+/**
+ * lnet_udsp_apply_policies_on_lpni
+ * apply all stored policies on specified \lpni
+ * Must be called with api_mutex held
+ * Must be called with LNET_LOCK_EX
+ */
+int lnet_udsp_apply_policies_on_lpni(struct lnet_peer_ni *lpni);
+
+/**
+ * lnet_udsp_apply_policies_on_lpn
+ * apply all stored policies on specified \lpn
+ * Must be called with api_mutex held
+ * Must be called with LNET_LOCK_EX
+ */
+int lnet_udsp_apply_policies_on_lpn(struct lnet_peer_net *lpn);
+
+/**
+ * lnet_udsp_apply_policies_on_ni
+ * apply all stored policies on specified \ni
+ * Must be called with api_mutex held
+ * Must be called with LNET_LOCK_EX
+ */
+int lnet_udsp_apply_policies_on_ni(struct lnet_ni *ni);
+
+/**
+ * lnet_udsp_apply_policies_on_net
+ * apply all stored policies on specified \net
+ * Must be called with api_mutex held
+ * Must be called with LNET_LOCK_EX
+ */
+int lnet_udsp_apply_policies_on_net(struct lnet_net *net);
+
+/**
+ * lnet_udsp_alloc
+ * Allocates a UDSP block and initializes it.
+ * Return NULL if allocation fails,
+ * pointer to UDSP otherwise.
+ */
+struct lnet_udsp *lnet_udsp_alloc(void);
+
+/**
+ * lnet_udsp_free
+ * Free a UDSP and all its descriptors
+ */
+void lnet_udsp_free(struct lnet_udsp *udsp);
+
+/**
+ * lnet_udsp_destroy
+ * Free all the UDSPs
+ * shutdown: true to indicate shutdown in progress
+ */
+void lnet_udsp_destroy(bool shutdown);
+
+/**
+ * lnet_get_udsp_size
+ * Return the size needed to store the marshalled UDSP
+ */
+size_t lnet_get_udsp_size(struct lnet_udsp *udsp);
+
+/**
+ * lnet_udsp_marshal
+ * Marshal the udsp into the bulk memory provided.
+ * Return success/failure.
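+ *
+ * A sketch of the expected use (error handling elided; the ioc_udsp
+ * shown is assumed to be a caller-allocated struct lnet_ioctl_udsp):
+ *
+ *	size_t len = lnet_get_udsp_size(udsp);
+ *
+ *	// iou_bulk must point at a buffer of at least len bytes,
+ *	// with iou_bulk_size set to match
+ *	ioc_udsp->iou_bulk_size = len;
+ *	rc = lnet_udsp_marshal(udsp, ioc_udsp);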
+ */ +int lnet_udsp_marshal(struct lnet_udsp *udsp, + struct lnet_ioctl_udsp *ioc_udsp); +/** + * lnet_udsp_demarshal_add + * Given a bulk containing a single UDSP, + * demarshal and populate a udsp structure then add policy + */ +int lnet_udsp_demarshal_add(void *bulk, __u32 bulk_size); + +/** + * lnet_udsp_get_construct_info + * get information of how the UDSP policies impacted the given + * construct. + */ +void lnet_udsp_get_construct_info(struct lnet_ioctl_construct_udsp_info *info); + +#endif /* UDSP_H */ diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_debug.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_debug.h new file mode 100644 index 0000000000000..86e46606b0e37 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_debug.h @@ -0,0 +1,157 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * libcfs/include/libcfs/libcfs_debug.h + * + * Debug messages and assertions + * + */ + +#ifndef __UAPI_LIBCFS_DEBUG_H__ +#define __UAPI_LIBCFS_DEBUG_H__ + +#include + +/** + * Format for debug message headers + */ +struct ptldebug_header { + __u32 ph_len; + __u32 ph_flags; + __u32 ph_subsys; + __u32 ph_mask; + __u16 ph_cpu_id; + __u16 ph_type; + /* time_t overflow in 2106 */ + __u32 ph_sec; + __u64 ph_usec; + __u32 ph_stack; + __u32 ph_pid; + __u32 ph_extern_pid; + __u32 ph_line_num; +} __attribute__((packed)); + +#define PH_FLAG_FIRST_RECORD 1 + +/* Debugging subsystems (32 bits, non-overlapping) */ +enum libcfs_debug_subsys { + S_UNDEFINED = 0x00000001, + S_MDC = 0x00000002, + S_MDS = 0x00000004, + S_OSC = 0x00000008, + S_OST = 0x00000010, + S_CLASS = 0x00000020, + S_LOG = 0x00000040, + S_LLITE = 0x00000080, + S_RPC = 0x00000100, + S_MGMT = 0x00000200, + S_LNET = 0x00000400, + S_LND = 0x00000800, /* ALL LNDs */ + S_PINGER = 0x00001000, + S_FILTER = 0x00002000, + S_LIBCFS = 0x00004000, + S_ECHO = 0x00008000, + S_LDLM = 0x00010000, + S_LOV = 0x00020000, + S_LQUOTA = 0x00040000, + S_OSD = 0x00080000, + S_LFSCK = 0x00100000, + S_SNAPSHOT = 0x00200000, +/* unused */ + S_LMV = 0x00800000, +/* unused */ + S_SEC = 0x02000000, /* upcall cache */ + S_GSS = 0x04000000, +/* unused */ + S_MGC = 0x10000000, + S_MGS = 0x20000000, + S_FID = 0x40000000, + S_FLD = 0x80000000, +}; +#define LIBCFS_S_DEFAULT (~0) + +#define LIBCFS_DEBUG_SUBSYS_NAMES { \ + "undefined", "mdc", "mds", "osc", "ost", "class", "log", \ + "llite", "rpc", "mgmt", "lnet", "lnd", "pinger", "filter", \ + "libcfs", "echo", "ldlm", "lov", "lquota", "osd", "lfsck", \ + "snapshot", "", "lmv", "", "sec", "gss", "", "mgc", "mgs", \ + "fid", "fld", NULL } + +/* Debugging masks (32 bits, non-overlapping) */ +enum libcfs_debug_masks { + D_TRACE = 0x00000001, /* ENTRY/EXIT markers */ + D_INODE = 0x00000002, + D_SUPER = 0x00000004, + D_IOTRACE = 0x00000008, /* simple, low overhead io tracing */ + D_MALLOC = 0x00000010, /* print malloc, free information */ + D_CACHE = 0x00000020, /* cache-related items */ + D_INFO = 0x00000040, /* general information */ + D_IOCTL = 0x00000080, /* ioctl related information */ + D_NETERROR = 0x00000100, /* network errors */ + D_NET = 0x00000200, /* network communications */ + D_WARNING = 0x00000400, /* CWARN(...) == CDEBUG(D_WARNING, ...) */ + D_BUFFS = 0x00000800, + D_OTHER = 0x00001000, + D_DENTRY = 0x00002000, + D_NETTRACE = 0x00004000, + D_PAGE = 0x00008000, /* bulk page handling */ + D_DLMTRACE = 0x00010000, + D_ERROR = 0x00020000, /* CERROR(...) == CDEBUG(D_ERROR, ...) */ + D_EMERG = 0x00040000, /* CEMERG(...) == CDEBUG(D_EMERG, ...) 
*/ + D_HA = 0x00080000, /* recovery and failover */ + D_RPCTRACE = 0x00100000, /* for distributed debugging */ + D_VFSTRACE = 0x00200000, + D_READA = 0x00400000, /* read-ahead */ + D_MMAP = 0x00800000, + D_CONFIG = 0x01000000, + D_CONSOLE = 0x02000000, + D_QUOTA = 0x04000000, + D_SEC = 0x08000000, + D_LFSCK = 0x10000000, /* For both OI scrub and LFSCK */ + D_HSM = 0x20000000, + D_SNAPSHOT = 0x40000000, + D_LAYOUT = 0x80000000, +}; +#define LIBCFS_D_DEFAULT (D_CANTMASK | D_NETERROR | D_HA | D_CONFIG | D_IOCTL |\ + D_LFSCK) + +#define LIBCFS_DEBUG_MASKS_NAMES { \ + "trace", "inode", "super", "iotrace", "malloc", "cache", "info",\ + "ioctl", "neterror", "net", "warning", "buffs", "other", \ + "dentry", "nettrace", "page", "dlmtrace", "error", "emerg", \ + "ha", "rpctrace", "vfstrace", "reada", "mmap", "config", \ + "console", "quota", "sec", "lfsck", "hsm", "snapshot", "layout",\ + NULL } + +#define D_CANTMASK (D_ERROR | D_EMERG | D_WARNING | D_CONSOLE) + +#define LIBCFS_DEBUG_FILE_PATH_DEFAULT "/tmp/lustre-log" + +#endif /* __UAPI_LIBCFS_DEBUG_H__ */ diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_ioctl.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_ioctl.h new file mode 100644 index 0000000000000..1bcf47b29e0c9 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_ioctl.h @@ -0,0 +1,162 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2013, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Low-level ioctl data structures. Kernel ioctl functions declared here, + * and user space functions are in libcfs/util/ioctl.h. 
+ *
+ */
+
+#ifndef __UAPI_LIBCFS_IOCTL_H__
+#define __UAPI_LIBCFS_IOCTL_H__
+
+#include
+#include
+
+/*
+ * sparse kernel source annotations
+ */
+#ifndef __user
+#define __user
+#endif
+
+#define LIBCFS_IOCTL_VERSION 0x0001000a
+#define LIBCFS_IOCTL_VERSION2 0x0001000b
+
+struct libcfs_ioctl_hdr {
+ __u32 ioc_len;
+ __u32 ioc_version;
+};
+
+/** max size to copy from userspace */
+#define LIBCFS_IOC_DATA_MAX (128 * 1024)
+
+struct libcfs_ioctl_data {
+ struct libcfs_ioctl_hdr ioc_hdr;
+
+ __u64 ioc_nid;
+ __u64 ioc_u64[1];
+
+ __u32 ioc_flags;
+ __u32 ioc_count;
+ __u32 ioc_net;
+ __u32 ioc_u32[7];
+
+ __u32 ioc_inllen1;
+ char *ioc_inlbuf1;
+ __u32 ioc_inllen2;
+ char *ioc_inlbuf2;
+
+ __u32 ioc_plen1; /* buffers in userspace */
+ void __user *ioc_pbuf1;
+ __u32 ioc_plen2; /* buffers in userspace */
+ void __user *ioc_pbuf2;
+
+ char ioc_bulk[0];
+};
+
+#define IOCTL_LIBCFS_TYPE long
+
+#define IOC_LIBCFS_TYPE ('e')
+#define IOC_LIBCFS_MIN_NR 30
+/* libcfs ioctls */
+/* IOC_LIBCFS_PANIC obsolete in 2.8.0, was _IOWR('e', 30, IOCTL_LIBCFS_TYPE) */
+#define IOC_LIBCFS_CLEAR_DEBUG _IOWR('e', 31, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_MARK_DEBUG _IOWR('e', 32, IOCTL_LIBCFS_TYPE)
+/* IOC_LIBCFS_MEMHOG obsolete in 2.8.0, was _IOWR('e', 36, IOCTL_LIBCFS_TYPE) */
+/* lnet ioctls */
+#define IOC_LIBCFS_GET_NI _IOWR('e', 50, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_FAIL_NID _IOWR('e', 51, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_NOTIFY_ROUTER _IOWR('e', 55, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_UNCONFIGURE _IOWR('e', 56, IOCTL_LIBCFS_TYPE)
+/* IOC_LIBCFS_PORTALS_COMPATIBILITY _IOWR('e', 57, IOCTL_LIBCFS_TYPE) */
+#define IOC_LIBCFS_LNET_DIST _IOWR('e', 58, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_CONFIGURE _IOWR('e', 59, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_TESTPROTOCOMPAT _IOWR('e', 60, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_PING _IOWR('e', 61, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_PING_PEER _IOWR('e', 62, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_LNETST _IOWR('e', 63, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_LNET_FAULT _IOWR('e', 64, IOCTL_LIBCFS_TYPE)
+/* lnd ioctls */
+#define IOC_LIBCFS_REGISTER_MYNID _IOWR('e', 70, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_CLOSE_CONNECTION _IOWR('e', 71, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_PUSH_CONNECTION _IOWR('e', 72, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_GET_CONN _IOWR('e', 73, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_DEL_PEER _IOWR('e', 74, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_ADD_PEER _IOWR('e', 75, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_GET_PEER _IOWR('e', 76, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_DISCOVER _IOWR('e', 77, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_ADD_INTERFACE _IOWR('e', 78, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_DEL_INTERFACE _IOWR('e', 79, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_GET_INTERFACE _IOWR('e', 80, IOCTL_LIBCFS_TYPE)
+
+
+/*
+ * DLC Specific IOCTL numbers.
+ * In order to maintain backward compatibility with any possible external
+ * tools which might be accessing the IOCTL numbers, a new group of IOCTL
+ * numbers has been allocated.
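+ *
+ * From user space, each of these requests follows the same pattern
+ * (a sketch; fd setup and error handling are elided and not part of
+ * this patch):
+ *
+ *	struct lnet_ioctl_config_data data;
+ *
+ *	memset(&data, 0, sizeof(data));
+ *	data.cfg_hdr.ioc_len = sizeof(data);
+ *	data.cfg_hdr.ioc_version = LIBCFS_IOCTL_VERSION2;
+ *	rc = ioctl(fd, IOC_LIBCFS_GET_NET, &data);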
+ */ +#define IOCTL_CONFIG_SIZE struct lnet_ioctl_config_data +#define IOC_LIBCFS_ADD_ROUTE _IOWR(IOC_LIBCFS_TYPE, 81, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_DEL_ROUTE _IOWR(IOC_LIBCFS_TYPE, 82, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_ROUTE _IOWR(IOC_LIBCFS_TYPE, 83, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_ADD_NET _IOWR(IOC_LIBCFS_TYPE, 84, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_DEL_NET _IOWR(IOC_LIBCFS_TYPE, 85, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_NET _IOWR(IOC_LIBCFS_TYPE, 86, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_CONFIG_RTR _IOWR(IOC_LIBCFS_TYPE, 87, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_ADD_BUF _IOWR(IOC_LIBCFS_TYPE, 88, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_BUF _IOWR(IOC_LIBCFS_TYPE, 89, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_PEER_INFO _IOWR(IOC_LIBCFS_TYPE, 90, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_LNET_STATS _IOWR(IOC_LIBCFS_TYPE, 91, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_ADD_PEER_NI _IOWR(IOC_LIBCFS_TYPE, 92, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_DEL_PEER_NI _IOWR(IOC_LIBCFS_TYPE, 93, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_PEER_NI _IOWR(IOC_LIBCFS_TYPE, 94, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_ADD_LOCAL_NI _IOWR(IOC_LIBCFS_TYPE, 95, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_DEL_LOCAL_NI _IOWR(IOC_LIBCFS_TYPE, 96, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_LOCAL_NI _IOWR(IOC_LIBCFS_TYPE, 97, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_SET_NUMA_RANGE _IOWR(IOC_LIBCFS_TYPE, 98, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_NUMA_RANGE _IOWR(IOC_LIBCFS_TYPE, 99, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_PEER_LIST _IOWR(IOC_LIBCFS_TYPE, 100, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_LOCAL_NI_MSG_STATS _IOWR(IOC_LIBCFS_TYPE, 101, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_SET_HEALHV _IOWR(IOC_LIBCFS_TYPE, 102, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_LOCAL_HSTATS _IOWR(IOC_LIBCFS_TYPE, 103, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_RECOVERY_QUEUE _IOWR(IOC_LIBCFS_TYPE, 104, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_ADD_UDSP _IOWR(IOC_LIBCFS_TYPE, 105, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_DEL_UDSP _IOWR(IOC_LIBCFS_TYPE, 106, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_UDSP_SIZE _IOWR(IOC_LIBCFS_TYPE, 107, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_UDSP _IOWR(IOC_LIBCFS_TYPE, 108, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_CONST_UDSP_INFO _IOWR(IOC_LIBCFS_TYPE, 109, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_RESET_LNET_STATS _IOWR(IOC_LIBCFS_TYPE, 110, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_SET_CONNS_PER_PEER _IOWR(IOC_LIBCFS_TYPE, 111, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_MAX_NR 111 + +extern int libcfs_ioctl_data_adjust(struct libcfs_ioctl_data *data); + +#endif /* __UAPI_LIBCFS_IOCTL_H__ */ diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-dlc.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-dlc.h new file mode 100644 index 0000000000000..2b2c05fa3b0b2 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-dlc.h @@ -0,0 +1,399 @@ +/* + * LGPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. + * + * LGPL HEADER END + * + */ +/* + * Copyright (c) 2014, 2017, Intel Corporation. + */ +/* + * Author: Amir Shehata + */ + +#ifndef __UAPI_LNET_DLC_H_ +#define __UAPI_LNET_DLC_H_ + +#include +#include +#include + +#define MAX_NUM_SHOW_ENTRIES 32 +#define LNET_MAX_STR_LEN 128 +#define LNET_MAX_SHOW_NUM_CPT 128 +#define LNET_MAX_SHOW_NUM_NID 128 +#define LNET_UNDEFINED_HOPS ((__u32) -1) + +#define LNET_RT_ALIVE (1 << 0) +#define LNET_RT_MULTI_HOP (1 << 1) + +/* + * sparse kernel source annotations + */ +#ifndef __user +#define __user +#endif + +/* + * To allow for future enhancements to extend the tunables + * add a hdr to this structure, so that the version can be set + * and checked for backwards compatibility. Newer versions of LNet + * can still work with older versions of lnetctl. The restriction is + * that the structure can be added to and not removed from in order + * to not invalidate older lnetctl utilities. Moreover, the order of + * fields must remain the same, and new fields appended to the structure + * + * That said all existing LND tunables will be added in this structure + * to avoid future changes. + */ +struct lnet_ioctl_config_lnd_cmn_tunables { + __u32 lct_version; + __s32 lct_peer_timeout; + __s32 lct_peer_tx_credits; + __s32 lct_peer_rtr_credits; + __s32 lct_max_tx_credits; +}; + +struct lnet_ioctl_config_o2iblnd_tunables { + __u32 lnd_version; + __u32 lnd_peercredits_hiw; + __u32 lnd_map_on_demand; + __u32 lnd_concurrent_sends; + __u32 lnd_fmr_pool_size; + __u32 lnd_fmr_flush_trigger; + __u32 lnd_fmr_cache; + __u16 lnd_conns_per_peer; + __u16 lnd_ntx; +}; + +struct lnet_ioctl_config_socklnd_tunables { + __u32 lnd_version; + __u16 lnd_conns_per_peer; + __u16 lnd_pad; +}; + +struct lnet_lnd_tunables { + union { + struct lnet_ioctl_config_o2iblnd_tunables lnd_o2ib; + struct lnet_ioctl_config_socklnd_tunables lnd_sock; + } lnd_tun_u; +}; + +struct lnet_ioctl_config_lnd_tunables { + struct lnet_ioctl_config_lnd_cmn_tunables lt_cmn; + struct lnet_lnd_tunables lt_tun; +}; + +struct lnet_ioctl_net_config { + char ni_interface[LNET_MAX_STR_LEN]; + __u32 ni_status; + __u32 ni_cpts[LNET_MAX_SHOW_NUM_CPT]; + char cfg_bulk[0]; +}; + +#define LNET_TINY_BUF_IDX 0 +#define LNET_SMALL_BUF_IDX 1 +#define LNET_LARGE_BUF_IDX 2 + +/* # different router buffer pools */ +#define LNET_NRBPOOLS (LNET_LARGE_BUF_IDX + 1) + +struct lnet_ioctl_pool_cfg { + struct { + __u32 pl_npages; + __u32 pl_nbuffers; + __u32 pl_credits; + __u32 pl_mincredits; + } pl_pools[LNET_NRBPOOLS]; + __u32 pl_routing; +}; + +struct lnet_ioctl_ping_data { + struct libcfs_ioctl_hdr ping_hdr; + + __u32 op_param; + __u32 ping_count; + __u32 ping_flags; + __u32 mr_info; + struct lnet_process_id ping_id; + struct lnet_process_id __user *ping_buf; + lnet_nid_t ping_src; +}; + +struct lnet_ioctl_config_data { + struct libcfs_ioctl_hdr cfg_hdr; + + __u32 cfg_net; + __u32 cfg_count; + __u64 cfg_nid; + __u32 cfg_ncpts; + + union { + struct { + __u32 rtr_hop; + __u32 rtr_priority; + __u32 rtr_flags; + __u32 rtr_sensitivity; + } cfg_route; + struct { + char net_intf[LNET_MAX_STR_LEN]; + __s32 net_peer_timeout; + __s32 net_peer_tx_credits; + __s32 net_peer_rtr_credits; + __s32 net_max_tx_credits; + __u32 net_cksum_algo; + __u32 net_interface_count; + } cfg_net; + struct { + __u32 buf_enable; + __s32 buf_tiny; + __s32 buf_small; + __s32 buf_large; + } 
cfg_buffers; + } cfg_config_u; + + char cfg_bulk[0]; +}; + +struct lnet_ioctl_comm_count { + __u32 ico_get_count; + __u32 ico_put_count; + __u32 ico_reply_count; + __u32 ico_ack_count; + __u32 ico_hello_count; +}; + +struct lnet_ioctl_element_stats { + __u32 iel_send_count; + __u32 iel_recv_count; + __u32 iel_drop_count; +}; + +enum lnet_health_type { + LNET_HEALTH_TYPE_LOCAL_NI = 0, + LNET_HEALTH_TYPE_PEER_NI, +}; + +struct lnet_ioctl_local_ni_hstats { + struct libcfs_ioctl_hdr hlni_hdr; + lnet_nid_t hlni_nid; + __u32 hlni_local_interrupt; + __u32 hlni_local_dropped; + __u32 hlni_local_aborted; + __u32 hlni_local_no_route; + __u32 hlni_local_timeout; + __u32 hlni_local_error; + __s32 hlni_fatal_error; + __s32 hlni_health_value; + __u32 hlni_ping_count; + __u64 hlni_next_ping; +}; + +struct lnet_ioctl_peer_ni_hstats { + __u32 hlpni_remote_dropped; + __u32 hlpni_remote_timeout; + __u32 hlpni_remote_error; + __u32 hlpni_network_timeout; + __s32 hlpni_health_value; + __u32 hlpni_ping_count; + __u64 hlpni_next_ping; +}; + +struct lnet_ioctl_element_msg_stats { + struct libcfs_ioctl_hdr im_hdr; + __u32 im_idx; + struct lnet_ioctl_comm_count im_send_stats; + struct lnet_ioctl_comm_count im_recv_stats; + struct lnet_ioctl_comm_count im_drop_stats; +}; + +/* + * lnet_ioctl_config_ni + * This structure describes an NI configuration. There are multiple components + * when configuring an NI: Net, Interfaces, CPT list and LND tunables + * A network is passed as a string to the DLC and translated using + * libcfs_str2net() + * An interface is the name of the system configured interface + * (ex eth0, ib1) + * CPT is the list of CPTS LND tunables are passed in the lic_bulk area + */ +struct lnet_ioctl_config_ni { + struct libcfs_ioctl_hdr lic_cfg_hdr; + lnet_nid_t lic_nid; + char lic_ni_intf[LNET_MAX_STR_LEN]; + char lic_legacy_ip2nets[LNET_MAX_STR_LEN]; + __u32 lic_cpts[LNET_MAX_SHOW_NUM_CPT]; + __u32 lic_ncpts; + __u32 lic_status; + __u32 lic_idx; + __s32 lic_dev_cpt; + char pad[4]; + char lic_bulk[0]; +}; + +struct lnet_peer_ni_credit_info { + char cr_aliveness[LNET_MAX_STR_LEN]; + __u32 cr_refcount; + __s32 cr_ni_peer_tx_credits; + __s32 cr_peer_tx_credits; + __s32 cr_peer_min_tx_credits; + __u32 cr_peer_tx_qnob; + __s32 cr_peer_rtr_credits; + __s32 cr_peer_min_rtr_credits; + __u32 cr_ncpt; +}; + +struct lnet_ioctl_peer { + struct libcfs_ioctl_hdr pr_hdr; + __u32 pr_count; + __u32 pr_pad; + lnet_nid_t pr_nid; + + union { + struct lnet_peer_ni_credit_info pr_peer_credits; + } pr_lnd_u; +}; + +struct lnet_ioctl_peer_cfg { + struct libcfs_ioctl_hdr prcfg_hdr; + lnet_nid_t prcfg_prim_nid; + lnet_nid_t prcfg_cfg_nid; + __u32 prcfg_count; + __u32 prcfg_mr; + __u32 prcfg_state; + __u32 prcfg_size; + void __user *prcfg_bulk; +}; + +struct lnet_ioctl_reset_health_cfg { + struct libcfs_ioctl_hdr rh_hdr; + enum lnet_health_type rh_type:32; + __u16 rh_all:1; + __s16 rh_value; + lnet_nid_t rh_nid; +}; + +struct lnet_ioctl_reset_conns_per_peer_cfg { + struct libcfs_ioctl_hdr rcpp_hdr; + __u16 rcpp_all:1; + __s16 rcpp_value; + lnet_nid_t rcpp_nid; +}; + +struct lnet_ioctl_recovery_list { + struct libcfs_ioctl_hdr rlst_hdr; + enum lnet_health_type rlst_type:32; + __u32 rlst_num_nids; + lnet_nid_t rlst_nid_array[LNET_MAX_SHOW_NUM_NID]; +}; + +struct lnet_ioctl_set_value { + struct libcfs_ioctl_hdr sv_hdr; + __u32 sv_value; +}; + +struct lnet_ioctl_lnet_stats { + struct libcfs_ioctl_hdr st_hdr; + struct lnet_counters st_cntrs; +}; + +/* An IP, numeric NID or a Net number is composed of 1 or more of these + * 
descriptor structures. + */ +struct lnet_range_expr { + __u32 re_lo; + __u32 re_hi; + __u32 re_stride; +}; + +/* le_count identifies the number of lnet_range_expr in the bulk + * which follows + */ +struct lnet_expressions { + __u32 le_count; +}; + +/* A net descriptor has the net type, IE: O2IBLND, SOCKLND, etc and an + * expression describing a net number range. + */ +struct lnet_ioctl_udsp_net_descr { + __u32 ud_net_type; + struct lnet_expressions ud_net_num_expr; +}; + +/* The UDSP descriptor header contains the type of matching criteria, SRC, + * DST, RTE, etc and how many lnet_expressions compose the LNet portion of + * the LNet NID. For example an IP can be + * composed of 4 lnet_expressions , a gni can be composed of 1 + */ +struct lnet_ioctl_udsp_descr_hdr { + /* The literals SRC, DST and RTE are encoded + * here. + */ + __u32 ud_descr_type; + __u32 ud_descr_count; +}; + +/* each matching expression in the UDSP is described with this. + * The bulk format is as follows: + * 1. 1x struct lnet_ioctl_udsp_net_descr + * -> the net part of the NID + * 2. >=0 struct lnet_expressions + * -> the address part of the NID + */ +struct lnet_ioctl_udsp_descr { + struct lnet_ioctl_udsp_descr_hdr iud_src_hdr; + struct lnet_ioctl_udsp_net_descr iud_net; +}; + +/* The cumulative UDSP descriptor + * The bulk format is as follows: + * 1. >=1 struct lnet_ioctl_udsp_descr + * + * The size indicated in iou_hdr is the total size of the UDSP. + * + */ +struct lnet_ioctl_udsp { + struct libcfs_ioctl_hdr iou_hdr; + __s32 iou_idx; + __u32 iou_action_type; + __u32 iou_bulk_size; + union { + __u32 priority; + } iou_action; + void __user *iou_bulk; +}; + +/* structure used to request udsp instantiation information on the + * specified construct. + * cud_nid: the NID of the local or remote NI to pull info on. + * cud_nid_priority: NID prio of the requested NID. + * cud_net_priority: net prio of network of the requested NID. + * cud_pref_nid: array of preferred NIDs if it exists. + */ +struct lnet_ioctl_construct_udsp_info { + struct libcfs_ioctl_hdr cud_hdr; + __u32 cud_peer:1; + lnet_nid_t cud_nid; + __u32 cud_nid_priority; + __u32 cud_net_priority; + lnet_nid_t cud_pref_nid[LNET_MAX_SHOW_NUM_NID]; + lnet_nid_t cud_pref_rtr_nid[LNET_MAX_SHOW_NUM_NID]; +}; + +#endif /* _LNET_DLC_H_ */ diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-idl.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-idl.h new file mode 100644 index 0000000000000..bdff24e8839da --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-idl.h @@ -0,0 +1,298 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. 
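+ *
+ * As an illustration of the descriptor encoding defined above (a
+ * sketch only, not part of this patch): an address expression such as
+ * the range 1-3 in the third octet of an IPv4 NID would be carried as
+ *
+ *	struct lnet_expressions le = { .le_count = 1 };
+ *	struct lnet_range_expr re = {
+ *		.re_lo = 1,
+ *		.re_hi = 3,
+ *		.re_stride = 1,
+ *	};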
+ * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef __UAPI_LNET_IDL_H__ +#define __UAPI_LNET_IDL_H__ + +#include + +/************************************************************************ + * Core LNet wire message format. + * These are sent in sender's byte order (i.e. receiver flips). + */ + +/** Address of an end-point in an LNet network. + * + * A node can have multiple end-points and hence multiple addresses. + * An LNet network can be a simple network (e.g. tcp0) or a network of + * LNet networks connected by LNet routers. Therefore an end-point address + * has two parts: network ID, and address within a network. + * The most-significant-byte in this format is always 0. A larger value + * would imply a larger nid with a larger address. + * + * \see LNET_NIDNET, LNET_NIDADDR, and LNET_MKNID. + */ +typedef __u64 lnet_nid_t; + +/* + * Address of LNet end-point in extended form + * + * To support addresses larger than 32bits we have + * an extended nid which supports up to 128 bits + * of address and is extensible. + * If nid_size is 0, then the nid can be stored in an lnet_nid_t, + * and the first 8 bytes of the 'struct lnet_nid' are identical to + * the lnet_nid_t in big-endian format. + * If nid_type == 0xff, then all other fields should be ignored + * and this is an ANY wildcard address. In particular, the nid_size + * can be 0xff without making the address too big to fit. + */ +struct lnet_nid { + __u8 nid_size; /* total bytes - 8 */ + __u8 nid_type; + __be16 nid_num; + __be32 nid_addr[4]; +} __attribute__((packed)); + +#define NID_BYTES(nid) ((nid)->nid_size + 8) +#define NID_ADDR_BYTES(nid) ((nid)->nid_size + 4) + +/** + * ID of a process in a node. Shortened as PID to distinguish from + * lnet_process_id, the global process ID. + */ +typedef __u32 lnet_pid_t; + +/* Packed version of struct lnet_process_id to transfer via network */ +struct lnet_process_id_packed { + lnet_nid_t nid; + lnet_pid_t pid; /* node id / process id */ +} __attribute__((packed)); + +/* The wire handle's interface cookie only matches one network interface in + * one epoch (i.e. new cookie when the interface restarts or the node + * reboots). The object cookie only matches one object on that interface + * during that object's lifetime (i.e. no cookie re-use). + */ +struct lnet_handle_wire { + __u64 wh_interface_cookie; + __u64 wh_object_cookie; +} __attribute__((packed)); + +enum lnet_msg_type { + LNET_MSG_ACK = 0, + LNET_MSG_PUT, + LNET_MSG_GET, + LNET_MSG_REPLY, + LNET_MSG_HELLO, +}; + +/* The variant fields of the portals message header are aligned on an 8 + * byte boundary in the message header. Note that all types used in these + * wire structs MUST be fixed size and the smaller types are placed at the + * end. 
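+ *
+ * For example, struct lnet_put below places its __u64 fields
+ * (match_bits, hdr_data) ahead of its __u32 fields, so its wire size
+ * is a fixed 16 + 8 + 8 + 4 + 4 = 40 bytes with no implicit padding.
+ * A build-time check of that invariant could look like (a sketch,
+ * not part of this patch):
+ *
+ *	BUILD_BUG_ON(sizeof(struct lnet_put) != 40);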
+ */ +struct lnet_ack { + struct lnet_handle_wire dst_wmd; + __u64 match_bits; + __u32 mlength; +} __attribute__((packed)); + +struct lnet_put { + struct lnet_handle_wire ack_wmd; + __u64 match_bits; + __u64 hdr_data; + __u32 ptl_index; + __u32 offset; +} __attribute__((packed)); + +struct lnet_get { + struct lnet_handle_wire return_wmd; + __u64 match_bits; + __u32 ptl_index; + __u32 src_offset; + __u32 sink_length; +} __attribute__((packed)); + +struct lnet_reply { + struct lnet_handle_wire dst_wmd; +} __attribute__((packed)); + +struct lnet_hello { + __u64 incarnation; + __u32 type; +} __attribute__((packed)); + +union lnet_cmd_hdr { + struct lnet_ack ack; + struct lnet_put put; + struct lnet_get get; + struct lnet_reply reply; + struct lnet_hello hello; +} __attribute__((packed)); + +/* This is used for message headers that lnet code is manipulating. + * All fields before the union are in host-byte-order. + */ +struct lnet_hdr { + struct lnet_nid dest_nid; + struct lnet_nid src_nid; + lnet_pid_t dest_pid; + lnet_pid_t src_pid; + __u32 type; /* enum lnet_msg_type */ + __u32 payload_length; /* payload data to follow */ + /*<------__u64 aligned------->*/ + union lnet_cmd_hdr msg; +} __attribute__((packed)); + +/* This is used to support conversion between an lnet_hdr and + * the content of a network message. + */ +struct _lnet_hdr_nid4 { + lnet_nid_t dest_nid; + lnet_nid_t src_nid; + lnet_pid_t dest_pid; + lnet_pid_t src_pid; + __u32 type; /* enum lnet_msg_type */ + __u32 payload_length; /* payload data to follow */ + /*<------__u64 aligned------->*/ + union lnet_cmd_hdr msg; +} __attribute__((packed)); + +/* This is stored in a network message buffer. Content cannot be accessed + * without converting to an lnet_hdr. + */ +struct lnet_hdr_nid4 { + char _bytes[sizeof(struct _lnet_hdr_nid4)]; +} __attribute__((packed)); + +/* A HELLO message contains a magic number and protocol version + * code in the header's dest_nid, the peer's NID in the src_nid, and + * LNET_MSG_HELLO in the type field. All other common fields are zero + * (including payload_size; i.e. no payload). + * This is for use by byte-stream LNDs (e.g. TCP/IP) to check the peer is + * running the same protocol and to find out its NID. These LNDs should + * exchange HELLO messages when a connection is first established. Individual + * LNDs can put whatever else they fancy in lnet_hdr::msg. + */ +struct lnet_magicversion { + __u32 magic; /* LNET_PROTO_TCP_MAGIC */ + __u16 version_major; /* increment on incompatible change */ + __u16 version_minor; /* increment on compatible change */ +} __attribute__((packed)); + +/* PROTO MAGIC for LNDs */ +#define LNET_PROTO_IB_MAGIC 0x0be91b91 +#define LNET_PROTO_GNI_MAGIC 0xb00fbabe /* ask Kim */ +#define LNET_PROTO_TCP_MAGIC 0xeebc0ded +#define LNET_PROTO_ACCEPTOR_MAGIC 0xacce7100 +#define LNET_PROTO_PING_MAGIC 0x70696E67 /* 'ping' */ + +/* Placeholder for a future "unified" protocol across all LNDs */ +/* Current LNDs that receive a request with this magic will respond + * with a "stub" reply using their current protocol */ +#define LNET_PROTO_MAGIC 0x45726963 /* ! 
*/ + +#define LNET_PROTO_TCP_VERSION_MAJOR 1 +#define LNET_PROTO_TCP_VERSION_MINOR 0 + +/* Acceptor connection request */ +struct lnet_acceptor_connreq { + __u32 acr_magic; /* LNET_PROTO_ACCEPTOR_MAGIC */ + __u32 acr_version; /* protocol version */ + __u64 acr_nid; /* target NID */ +} __attribute__((packed)); + +#define LNET_PROTO_ACCEPTOR_VERSION 1 + +struct lnet_acceptor_connreq_v2 { + __u32 acr_magic; /* LNET_PROTO_ACCEPTOR_MAGIC */ + __u32 acr_version; /* protocol version - 2 */ + struct lnet_nid acr_nid; /* target NID */ +} __attribute__((packed)); + +/* For use with 16-byte addresses */ +#define LNET_PROTO_ACCEPTOR_VERSION_16 2 + +struct lnet_counters_common { + __u32 lcc_msgs_alloc; + __u32 lcc_msgs_max; + __u32 lcc_errors; + __u32 lcc_send_count; + __u32 lcc_recv_count; + __u32 lcc_route_count; + __u32 lcc_drop_count; + __u64 lcc_send_length; + __u64 lcc_recv_length; + __u64 lcc_route_length; + __u64 lcc_drop_length; +} __attribute__((packed)); + + +#define LNET_NI_STATUS_UP 0x15aac0de +#define LNET_NI_STATUS_DOWN 0xdeadface +#define LNET_NI_STATUS_INVALID 0x00000000 + +struct lnet_ni_status { + lnet_nid_t ns_nid; + __u32 ns_status; + __u32 ns_unused; +} __attribute__((packed)); + +/* + * NB: value of these features equal to LNET_PROTO_PING_VERSION_x + * of old LNet, so there shouldn't be any compatibility issue + */ +#define LNET_PING_FEAT_INVAL (0) /* no feature */ +#define LNET_PING_FEAT_BASE (1 << 0) /* just a ping */ +#define LNET_PING_FEAT_NI_STATUS (1 << 1) /* return NI status */ +#define LNET_PING_FEAT_RTE_DISABLED (1 << 2) /* Routing enabled */ +#define LNET_PING_FEAT_MULTI_RAIL (1 << 3) /* Multi-Rail aware */ +#define LNET_PING_FEAT_DISCOVERY (1 << 4) /* Supports Discovery */ + +/* + * All ping feature bits fit to hit the wire. + * In lnet_assert_wire_constants() this is compared against its open-coded + * value, and in lnet_ping_target_update() it is used to verify that no + * unknown bits have been set. + * New feature bits can be added, just be aware that this does change the + * over-the-wire protocol. + */ +#define LNET_PING_FEAT_BITS (LNET_PING_FEAT_BASE | \ + LNET_PING_FEAT_NI_STATUS | \ + LNET_PING_FEAT_RTE_DISABLED | \ + LNET_PING_FEAT_MULTI_RAIL | \ + LNET_PING_FEAT_DISCOVERY) + +struct lnet_ping_info { + __u32 pi_magic; + __u32 pi_features; + lnet_pid_t pi_pid; + __u32 pi_nnis; + struct lnet_ni_status pi_ni[]; +} __attribute__((packed)); + +#define LNET_PING_INFO_SIZE(NNIDS) \ + offsetof(struct lnet_ping_info, pi_ni[NNIDS]) +#define LNET_PING_INFO_LONI(PINFO) ((PINFO)->pi_ni[0].ns_nid) +#define LNET_PING_INFO_SEQNO(PINFO) ((PINFO)->pi_ni[0].ns_status) + +#endif diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-nl.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-nl.h new file mode 100644 index 0000000000000..8bc0317c73e1c --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-nl.h @@ -0,0 +1,87 @@ +/* + * LGPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
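+ *
+ * For the ping structures defined above, a receiver validating a
+ * reply buffer of len bytes might check (a sketch, not part of this
+ * patch):
+ *
+ *	struct lnet_ping_info *pi = buf;
+ *
+ *	if (pi->pi_magic != LNET_PROTO_PING_MAGIC ||
+ *	    (pi->pi_features & ~LNET_PING_FEAT_BITS) ||
+ *	    len < LNET_PING_INFO_SIZE(pi->pi_nnis))
+ *		return -EPROTO;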
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. + * + * LGPL HEADER END + * + */ +/* Copyright (c) 2021, UT-Battelle, LLC + * + * Author: James Simmons + */ + +#ifndef __UAPI_LNET_NL_H__ +#define __UAPI_LNET_NL_H__ + +#include + +enum lnet_nl_key_format { + /* Is it FLOW or BLOCK */ + LNKF_FLOW = 1, + /* Is it SEQUENCE or MAPPING */ + LNKF_MAPPING = 2, + LNKF_SEQUENCE = 4, +}; + +/** + * enum lnet_nl_scalar_attrs - scalar LNet netlink attributes used + * to compose messages for sending or + * receiving. + * + * @LN_SCALAR_ATTR_UNSPEC: unspecified attribute to catch errors + * @LN_SCALAR_ATTR_PAD: padding for 64-bit attributes, ignore + * + * @LN_SCALAR_ATTR_LIST: List of scalar attributes (NLA_NESTED) + * @LN_SCALAR_ATTR_LIST_SIZE: Number of items in scalar list (NLA_U16) + * @LN_SCALAR_ATTR_INDEX: True Netlink attr value (NLA_U16) + * @LN_SCALAR_ATTR_NLA_TYPE: Data format for value part of the pair + * (NLA_U16) + * @LN_SCALAR_ATTR_VALUE: String value of key part of the pair. + * (NLA_NUL_STRING) + * @LN_SCALAR_ATTR_INT_VALUE: Numeric value of key part of the pair. + * (NLA_S64) + * @LN_SCALAR_ATTR_KEY_FORMAT: LNKF_* format of the key value pair. + */ +enum lnet_nl_scalar_attrs { + LN_SCALAR_ATTR_UNSPEC = 0, + LN_SCALAR_ATTR_PAD = LN_SCALAR_ATTR_UNSPEC, + + LN_SCALAR_ATTR_LIST, + LN_SCALAR_ATTR_LIST_SIZE, + LN_SCALAR_ATTR_INDEX, + LN_SCALAR_ATTR_NLA_TYPE, + LN_SCALAR_ATTR_VALUE, + LN_SCALAR_ATTR_INT_VALUE, + LN_SCALAR_ATTR_KEY_FORMAT, + + __LN_SCALAR_ATTR_MAX_PLUS_ONE, +}; + +#define LN_SCALAR_MAX (__LN_SCALAR_ATTR_MAX_PLUS_ONE - 1) + +struct ln_key_props { + char *lkp_value; + __u16 lkp_key_format; + __u16 lkp_data_type; +}; + +struct ln_key_list { + __u16 lkl_maxattr; + struct ln_key_props lkl_list[]; +}; + +#endif /* __UAPI_LNET_NL_H__ */ diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-types.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-types.h new file mode 100644 index 0000000000000..d32ec52263f57 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-types.h @@ -0,0 +1,635 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef __UAPI_LNET_TYPES_H__ +#define __UAPI_LNET_TYPES_H__ + +#include +#include + +/** \addtogroup lnet + * @{ */ + +#include + +/** \addtogroup lnet_addr + * @{ */ + +#define LNET_VERSION "0.7.0" + +/** Portal reserved for LNet's own use. 
+ * \see lustre/include/lustre/lustre_idl.h for Lustre portal assignments. + */ +#define LNET_RESERVED_PORTAL 0 + +/** wildcard NID that matches any end-point address */ +#define LNET_NID_ANY ((lnet_nid_t) -1) +/** wildcard PID that matches any lnet_pid_t */ +#define LNET_PID_ANY ((lnet_pid_t) -1) + +static inline int LNET_NID_IS_ANY(const struct lnet_nid *nid) +{ + /* A NULL pointer can be used to mean "ANY" */ + return !nid || nid->nid_type == 0xFF; +} + +#define LNET_ANY_NID ((struct lnet_nid) \ + {0xFF, 0xFF, ~0, {~0, ~0, ~0, ~0} }) + +#define LNET_PID_RESERVED 0xf0000000 /* reserved bits in PID */ +#define LNET_PID_USERFLAG 0x80000000 /* set in userspace peers */ +#define LNET_PID_LUSTRE 12345 + +/* how an LNET NID encodes net:address */ +/** extract the address part of an lnet_nid_t */ + +static inline __u32 LNET_NIDADDR(lnet_nid_t nid) +{ + return nid & 0xffffffff; +} + +static inline __u32 LNET_NIDNET(lnet_nid_t nid) +{ + return (nid >> 32) & 0xffffffff; +} + +static inline lnet_nid_t LNET_MKNID(__u32 net, __u32 addr) +{ + return (((__u64)net) << 32) | addr; +} + +static inline __u32 LNET_NETNUM(__u32 net) +{ + return net & 0xffff; +} + +static inline __u32 LNET_NETTYP(__u32 net) +{ + return (net >> 16) & 0xff; +} + +static inline __u32 LNET_MKNET(__u32 type, __u32 num) +{ + return (type << 16) | num; +} + +/** The lolnd NID (i.e. myself) */ +#define LNET_NID_LO_0 LNET_MKNID(LNET_MKNET(LOLND, 0), 0) + +#define LNET_NET_ANY LNET_NIDNET(LNET_NID_ANY) + +static inline int nid_is_nid4(const struct lnet_nid *nid) +{ + return NID_ADDR_BYTES(nid) == 4; +} + +/* LOLND may not be defined yet, so we cannot use an inline */ +#define nid_is_lo0(__nid) \ + ((__nid)->nid_type == LOLND && \ + nid_is_nid4(__nid) && \ + (__nid)->nid_num == 0 && \ + (__nid)->nid_addr[0] == 0) + +static inline __u32 LNET_NID_NET(const struct lnet_nid *nid) +{ + if (LNET_NID_IS_ANY(nid)) + return LNET_NET_ANY; + else + return LNET_MKNET(nid->nid_type, __be16_to_cpu(nid->nid_num)); +} + +static inline void lnet_nid4_to_nid(lnet_nid_t nid4, struct lnet_nid *nid) +{ + if (nid4 == LNET_NID_ANY) { + *nid = LNET_ANY_NID; + return; + } + + nid->nid_size = 0; + nid->nid_type = LNET_NETTYP(LNET_NIDNET(nid4)); + nid->nid_num = __cpu_to_be16(LNET_NETNUM(LNET_NIDNET(nid4))); + nid->nid_addr[0] = __cpu_to_be32(LNET_NIDADDR(nid4)); + nid->nid_addr[1] = nid->nid_addr[2] = nid->nid_addr[3] = 0; +} + +static inline lnet_nid_t lnet_nid_to_nid4(const struct lnet_nid *nid) +{ + if (LNET_NID_IS_ANY(nid)) + return LNET_NID_ANY; + + return LNET_MKNID(LNET_NID_NET(nid), __be32_to_cpu(nid->nid_addr[0])); +} + +static inline int nid_same(const struct lnet_nid *n1, + const struct lnet_nid *n2) +{ + return n1->nid_size == n2->nid_size && + n1->nid_type == n2->nid_type && + n1->nid_num == n2->nid_num && + n1->nid_addr[0] == n2->nid_addr[0] && + n1->nid_addr[1] == n2->nid_addr[1] && + n1->nid_addr[2] == n2->nid_addr[2] && + n1->nid_addr[3] == n2->nid_addr[3]; +} + +/* This can be used when we need to hash a nid */ +static inline unsigned long nidhash(const struct lnet_nid *nid) +{ + int i; + unsigned long hash = 0; + + hash ^= LNET_NID_NET(nid); + for (i = 0; i < 4; i++) + hash ^= nid->nid_addr[i]; + return hash; +} + +struct lnet_counters_health { + __u32 lch_rst_alloc; + __u32 lch_resend_count; + __u32 lch_response_timeout_count; + __u32 lch_local_interrupt_count; + __u32 lch_local_dropped_count; + __u32 lch_local_aborted_count; + __u32 lch_local_no_route_count; + __u32 lch_local_timeout_count; + __u32 lch_local_error_count; + __u32 
lch_remote_dropped_count; + __u32 lch_remote_error_count; + __u32 lch_remote_timeout_count; + __u32 lch_network_timeout_count; +}; + +struct lnet_counters { + struct lnet_counters_common lct_common; + struct lnet_counters_health lct_health; +}; + +/* + * This is a hard-coded limit on the number of interfaces supported by + * the interface bonding implemented by the ksocknal LND. It must be + * defined here because it is used in LNet data structures that are + * common to all LNDs. + */ +#define LNET_INTERFACES_NUM 16 + +/* The minimum number of interfaces per node supported by LNet. */ +#define LNET_INTERFACES_MIN 16 +/* The default - arbitrary - value of the lnet_max_interfaces tunable. */ +#define LNET_INTERFACES_MAX_DEFAULT 200 + +/** + * Objects maintained by the LNet are accessed through handles. Handle types + * have names of the form lnet_handle_xx, where xx is one of the two letter + * object type codes ('md' for memory descriptor, and + * 'me' for match entry). Each type of object is given a unique handle type + * to enhance type checking. + */ +#define LNET_WIRE_HANDLE_COOKIE_NONE (~0ULL) + +struct lnet_handle_md { + __u64 cookie; +}; + +/** + * Invalidate md handle \a h. + */ +static inline void LNetInvalidateMDHandle(struct lnet_handle_md *h) +{ + h->cookie = LNET_WIRE_HANDLE_COOKIE_NONE; +} + +/** + * Check whether handler \a h is invalid. + * + * \return 1 if handle is invalid, 0 if valid. + */ +static inline int LNetMDHandleIsInvalid(struct lnet_handle_md h) +{ + return (LNET_WIRE_HANDLE_COOKIE_NONE == h.cookie); +} + +/** + * Global process ID. + */ +struct lnet_process_id { + /** node id */ + lnet_nid_t nid; + /** process id */ + lnet_pid_t pid; +}; + +/** + * Global process ID - with large addresses + */ +struct lnet_processid { + /** node id */ + struct lnet_nid nid; + /** process id */ + lnet_pid_t pid; +}; + +static inline void +lnet_pid4_to_pid(struct lnet_process_id pid4, struct lnet_processid *pid) +{ + pid->pid = pid4.pid; + lnet_nid4_to_nid(pid4.nid, &pid->nid); +} + +static inline struct lnet_process_id +lnet_pid_to_pid4(struct lnet_processid *pid) +{ + struct lnet_process_id ret; + + ret.pid = pid->pid; + ret.nid = lnet_nid_to_nid4(&pid->nid); + return ret; +} + +/** @} lnet_addr */ + +/** \addtogroup lnet_me + * @{ */ + +/** + * Specifies whether the match entry or memory descriptor should be unlinked + * automatically (LNET_UNLINK) or not (LNET_RETAIN). + */ +enum lnet_unlink { + LNET_RETAIN = 0, + LNET_UNLINK +}; + +/** + * Values of the type enum lnet_ins_pos are used to control where a new match + * entry is inserted. The value LNET_INS_BEFORE is used to insert the new + * entry before the current entry or before the head of the list. The value + * LNET_INS_AFTER is used to insert the new entry after the current entry + * or after the last item in the list. + */ +enum lnet_ins_pos { + /** insert ME before current position or head of the list */ + LNET_INS_BEFORE, + /** insert ME after current position or tail of the list */ + LNET_INS_AFTER, + /** attach ME at tail of local CPU partition ME list */ + LNET_INS_LOCAL +}; + +/** @} lnet_me */ + +/** \addtogroup lnet_md + * @{ */ + +struct lnet_hdr_nid16 { + char _bytes[sizeof(struct lnet_hdr)]; +} __attribute__((packed)); + +/** + * Event queue handler function type. + * + * The EQ handler runs for each event that is deposited into the EQ. The + * handler is supplied with a pointer to the event that triggered the + * handler invocation. 
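+ *
+ * A minimal handler might simply flag completion (a sketch, assuming
+ * the caller stored a struct completion pointer in lnet_md::user_ptr;
+ * none of this is part of the patch itself):
+ *
+ *	static void ping_done(struct lnet_event *ev)
+ *	{
+ *		// wake the waiter on failure or on the final REPLY
+ *		if (ev->status || ev->type == LNET_EVENT_REPLY)
+ *			complete(ev->md_user_ptr);
+ *	}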
+ *
+ * The handler must not block, must be reentrant, and must not call any LNet
+ * API functions. It should return as quickly as possible.
+ */
+struct lnet_event;
+typedef void (*lnet_handler_t)(struct lnet_event *event);
+
+/**
+ * Defines the visible parts of a memory descriptor. Values of this type
+ * are used to initialize memory descriptors.
+ */
+struct lnet_md {
+ /**
+ * Specify the memory region associated with the memory descriptor.
+ * If the options field has:
+ * - LNET_MD_KIOV bit set: The start field points to the starting
+ * address of an array of struct bio_vec and the length field specifies
+ * the number of entries in the array. The length can't be bigger
+ * than LNET_MAX_IOV. The struct bio_vec is used to describe page-based
+ * fragments that are not necessarily mapped in virtual memory.
+ * - Otherwise: The memory region is contiguous. The start field
+ * specifies the starting address for the memory region and the
+ * length field specifies its length.
+ *
+ * When the memory region is fragmented, all fragments but the first
+ * one must start on a page boundary, and all but the last must end on
+ * a page boundary.
+ */
+ void *start;
+ unsigned int length;
+ /**
+ * Specifies the maximum number of operations that can be performed
+ * on the memory descriptor. An operation is any action that could
+ * possibly generate an event. In the usual case, the threshold value
+ * is decremented for each operation on the MD. When the threshold
+ * drops to zero, the MD becomes inactive and does not respond to
+ * operations. A threshold value of LNET_MD_THRESH_INF indicates that
+ * there is no bound on the number of operations that may be applied
+ * to an MD.
+ */
+ int threshold;
+ /**
+ * Specifies the largest incoming request that the memory descriptor
+ * should respond to. When the unused portion of an MD (length -
+ * local offset) falls below this value, the MD becomes inactive and
+ * does not respond to further operations. This value is only used
+ * if the LNET_MD_MAX_SIZE option is set.
+ */
+ int max_size;
+ /**
+ * Specifies the behavior of the memory descriptor. A bitwise OR
+ * of the following values can be used:
+ * - LNET_MD_OP_PUT: The LNet PUT operation is allowed on this MD.
+ * - LNET_MD_OP_GET: The LNet GET operation is allowed on this MD.
+ * - LNET_MD_MANAGE_REMOTE: The offset used in accessing the memory
+ * region is provided by the incoming request. By default, the
+ * offset is maintained locally. When maintained locally, the
+ * offset is incremented by the length of the request so that
+ * the next operation (PUT or GET) will access the next part of
+ * the memory region. Note that only one offset variable exists
+ * per memory descriptor. If both PUT and GET operations are
+ * performed on a memory descriptor, the offset is updated each time.
+ * - LNET_MD_TRUNCATE: The length provided in the incoming request can
+ * be reduced to match the memory available in the region (determined
+ * by subtracting the offset from the length of the memory region).
+ * By default, if the length in the incoming operation is greater
+ * than the amount of memory available, the operation is rejected.
+ * - LNET_MD_ACK_DISABLE: An acknowledgment should not be sent for
+ * incoming PUT operations, even if requested. By default,
+ * acknowledgments are sent for PUT operations that request an
+ * acknowledgment. Acknowledgments are never sent for GET operations.
+ * The data sent in the REPLY serves as an implicit acknowledgment.
+ * - LNET_MD_KIOV: The start and length fields specify an array of + * struct bio_vec. + * - LNET_MD_MAX_SIZE: The max_size field is valid. + * - LNET_MD_BULK_HANDLE: The bulk_handle field is valid. + * - LNET_MD_TRACK_RESPONSE: Enable response tracking on this MD + * regardless of the value of the lnet_response_tracking param. + * - LNET_MD_NO_TRACK_RESPONSE: Disable response tracking on this MD + * regardless of the value of the lnet_response_tracking param. + * - LNET_MD_GNILND: Disable warning about exceeding LNET_MAX_IOV. + * + * Note: + * - LNET_MD_KIOV allows for a scatter/gather capability for memory + * descriptors. + * - When LNET_MD_MAX_SIZE is set, the total length of the memory + * region (i.e. sum of all fragment lengths) must not be less than + * \a max_size. + */ + unsigned int options; + /** + * A user-specified value that is associated with the memory + * descriptor. The value does not need to be a pointer, but must fit + * in the space used by a pointer. This value is recorded in events + * associated with operations on this MD. + */ + void *user_ptr; + /** + * The event handler used to log the operations performed on + * the memory region. If this argument is NULL operations + * performed on this memory descriptor are not logged. + */ + lnet_handler_t handler; + /** + * The bulk MD handle which was registered to describe the buffers + * either to be used to transfer data to the peer or receive data + * from the peer. This allows LNet to properly determine the NUMA + * node on which the memory was allocated and use that to select the + * nearest local network interface. This value is only used + * if the LNET_MD_BULK_HANDLE option is set. + */ + struct lnet_handle_md bulk_handle; +}; + +/* Max Transfer Unit (minimum supported everywhere). + * CAVEAT EMPTOR, with multinet (i.e. routers forwarding between networks) + * these limits are system wide and not interface-local. */ +#define LNET_MTU_BITS 20 +#define LNET_MTU (1 << LNET_MTU_BITS) + +/** + * Options for the MD structure. See struct lnet_md::options. + */ +#define LNET_MD_OP_PUT (1 << 0) +/** See struct lnet_md::options. */ +#define LNET_MD_OP_GET (1 << 1) +/** See struct lnet_md::options. */ +#define LNET_MD_MANAGE_REMOTE (1 << 2) +/* unused (1 << 3) */ +/** See struct lnet_md::options. */ +#define LNET_MD_TRUNCATE (1 << 4) +/** See struct lnet_md::options. */ +#define LNET_MD_ACK_DISABLE (1 << 5) +/** See struct lnet_md::options. */ +/* deprecated #define LNET_MD_IOVEC (1 << 6) */ +/** See struct lnet_md::options. */ +#define LNET_MD_MAX_SIZE (1 << 7) +/** See struct lnet_md::options. */ +#define LNET_MD_KIOV (1 << 8) +/** See struct lnet_md::options. */ +#define LNET_MD_BULK_HANDLE (1 << 9) +/** See struct lnet_md::options. */ +#define LNET_MD_TRACK_RESPONSE (1 << 10) +/** See struct lnet_md::options. */ +#define LNET_MD_NO_TRACK_RESPONSE (1 << 11) +/** See struct lnet_md::options. */ +#define LNET_MD_GNILND (1 << 12) + +/** Infinite threshold on MD operations. See struct lnet_md::threshold */ +#define LNET_MD_THRESH_INF (-1) + +/** @} lnet_md */ + +/** \addtogroup lnet_eq + * @{ */ + +/** + * Six types of events can be logged in an event queue. + */ +enum lnet_event_kind { + /** An incoming GET operation has completed on the MD. */ + LNET_EVENT_GET = 1, + /** + * An incoming PUT operation has completed on the MD. The + * underlying layers will not alter the memory (on behalf of this + * operation) once this event has been logged. + */ + LNET_EVENT_PUT, + /** + * A REPLY operation has completed. 
This event is logged after the + * data (if any) from the REPLY has been written into the MD. + */ + LNET_EVENT_REPLY, + /** An acknowledgment has been received. */ + LNET_EVENT_ACK, + /** + * An outgoing send (PUT or GET) operation has completed. This event + * is logged after the entire buffer has been sent and it is safe for + * the caller to reuse the buffer. + * + * Note: + * - The LNET_EVENT_SEND doesn't guarantee message delivery. It can + * happen even when the message has not yet been put out on wire. + * - It's unsafe to assume that in an outgoing GET operation + * the LNET_EVENT_SEND event would happen before the + * LNET_EVENT_REPLY event. The same holds for LNET_EVENT_SEND and + * LNET_EVENT_ACK events in an outgoing PUT operation. + */ + LNET_EVENT_SEND, + /** + * A MD has been unlinked. Note that LNetMDUnlink() does not + * necessarily trigger an LNET_EVENT_UNLINK event. + * \see LNetMDUnlink + */ + LNET_EVENT_UNLINK, +}; + +#define LNET_SEQ_GT(a, b) (((signed long)((a) - (b))) > 0) + +/** + * Information about an event on a MD. + */ +struct lnet_event { + /** The identifier (nid, pid) of the target. */ + struct lnet_processid target; + /** The identifier (nid, pid) of the initiator. */ + struct lnet_processid initiator; + /** The source NID on the initiator. */ + struct lnet_processid source; + /** + * The NID of the immediate sender. If the request has been forwarded + * by routers, this is the NID of the last hop; otherwise it's the + * same as the source. + */ + struct lnet_nid sender; + /** Indicates the type of the event. */ + enum lnet_event_kind type; + /** The portal table index specified in the request */ + unsigned int pt_index; + /** A copy of the match bits specified in the request. */ + __u64 match_bits; + /** The length (in bytes) specified in the request. */ + unsigned int rlength; + /** + * The length (in bytes) of the data that was manipulated by the + * operation. For truncated operations, the manipulated length will be + * the number of bytes specified by the MD (possibly with an offset, + * see struct lnet_md). For all other operations, the manipulated length + * will be the length of the requested operation, i.e. rlength. + */ + unsigned int mlength; + /** + * The handle to the MD associated with the event. The handle may be + * invalid if the MD has been unlinked. + */ + struct lnet_handle_md md_handle; + /** + * A snapshot of relevant state of the MD immediately after the event + * has been processed. + */ + void *md_start; + void *md_user_ptr; + unsigned int md_options; + /** + * 64 bits of out-of-band user data. Only valid for LNET_EVENT_PUT. + * \see LNetPut + */ + __u64 hdr_data; + /** + * The message type, to ensure a handler for LNET_EVENT_SEND can + * distinguish between LNET_MSG_GET and LNET_MSG_PUT. + */ + __u32 msg_type; + /** + * Indicates the completion status of the operation. It's 0 for + * successful operations, otherwise it's an error code. + */ + int status; + /** + * Indicates whether the MD has been unlinked. Note that: + * - An event with unlinked set is the last event on the MD. + * - This field is also set for an explicit LNET_EVENT_UNLINK event. + * \see LNetMDUnlink + */ + int unlinked; + /** + * The displacement (in bytes) into the memory region that the + * operation used. The offset can be determined by the operation for + * a remote managed MD or by the local MD. + * \see struct lnet_md::options + */ + unsigned int offset; + /** + * The sequence number for this event. Sequence numbers are unique + * to each event. 
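+ * Handlers can order two events with the wrap-safe helper defined
+ * above, e.g. LNET_SEQ_GT(e1->sequence, e2->sequence) is true when
+ * e1 was logged after e2.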
+ */
+ volatile unsigned long sequence;
+};
+
+/** \addtogroup lnet_data
+ * @{ */
+
+/**
+ * Specify whether an acknowledgment should be sent by target when the PUT
+ * operation completes (i.e., when the data has been written to a MD of the
+ * target process).
+ *
+ * \see struct lnet_md::options for the discussion on LNET_MD_ACK_DISABLE
+ * by which acknowledgments can be disabled for a MD.
+ */
+enum lnet_ack_req {
+ /** Request an acknowledgment */
+ LNET_ACK_REQ,
+ /** Request that no acknowledgment should be generated. */
+ LNET_NOACK_REQ
+};
+
+/**
+ * UDSP action types. There are two available actions:
+ * 1. PRIORITY - set priority of matching LNet constructs
+ * 2. PREFERRED LIST - set preferred list of matching LNet constructs
+ */
+enum lnet_udsp_action_type {
+ EN_LNET_UDSP_ACTION_NONE = 0,
+ /** assign a priority to matching constructs */
+ EN_LNET_UDSP_ACTION_PRIORITY = 1,
+ /** assign a preferred list of NIDs to matching constructs */
+ EN_LNET_UDSP_ACTION_PREFERRED_LIST = 2,
+};
+
+/** @} lnet_data */
+
+/** @} lnet */
+#endif
diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetctl.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetctl.h
new file mode 100644
index 0000000000000..bbbed82d82874
--- /dev/null
+++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetctl.h
@@ -0,0 +1,154 @@
+/*
+ * This file is part of Lustre, https://wiki.whamcloud.com/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * header for lnet ioctl
+ */
+/*
+ * Copyright (c) 2014, 2017, Intel Corporation.
+ */
+#ifndef __UAPI_LNETCTL_H_
+#define __UAPI_LNETCTL_H_
+
+#include
+#include
+
+/** \addtogroup lnet_fault_simulation
+ * @{ */
+
+enum {
+ LNET_CTL_DROP_ADD,
+ LNET_CTL_DROP_DEL,
+ LNET_CTL_DROP_RESET,
+ LNET_CTL_DROP_LIST,
+ LNET_CTL_DELAY_ADD,
+ LNET_CTL_DELAY_DEL,
+ LNET_CTL_DELAY_RESET,
+ LNET_CTL_DELAY_LIST,
+};
+
+#define LNET_ACK_BIT (1 << 0)
+#define LNET_PUT_BIT (1 << 1)
+#define LNET_GET_BIT (1 << 2)
+#define LNET_REPLY_BIT (1 << 3)
+
+#define HSTATUS_END 11
+#define HSTATUS_LOCAL_INTERRUPT_BIT (1 << 1)
+#define HSTATUS_LOCAL_DROPPED_BIT (1 << 2)
+#define HSTATUS_LOCAL_ABORTED_BIT (1 << 3)
+#define HSTATUS_LOCAL_NO_ROUTE_BIT (1 << 4)
+#define HSTATUS_LOCAL_ERROR_BIT (1 << 5)
+#define HSTATUS_LOCAL_TIMEOUT_BIT (1 << 6)
+#define HSTATUS_REMOTE_ERROR_BIT (1 << 7)
+#define HSTATUS_REMOTE_DROPPED_BIT (1 << 8)
+#define HSTATUS_REMOTE_TIMEOUT_BIT (1 << 9)
+#define HSTATUS_NETWORK_TIMEOUT_BIT (1 << 10)
+#define HSTATUS_RANDOM 0xffffffff
+
+/** ioctl parameter for LNet fault simulation */
+struct lnet_fault_attr {
+ /**
+ * source NID of drop rule
+ * LNET_NID_ANY is wildcard for all sources
+ * 255.255.255.255@net is wildcard for all addresses from @net
+ */
+ lnet_nid_t fa_src;
+ /** destination NID of drop rule, see \a fa_src for details */
+ lnet_nid_t fa_dst;
+ /** local NID. 
In case of router this is the NID we're receiving
+ * messages on
+ */
+ lnet_nid_t fa_local_nid;
+ /**
+ * Portal mask to drop, -1 means all portals, for example:
+ * fa_ptl_mask = (1 << _LDLM_CB_REQUEST_PORTAL ) |
+ * (1 << LDLM_CANCEL_REQUEST_PORTAL)
+ *
+ * If it is non-zero then only PUT and GET will be filtered, otherwise
+ * there is no portal filter, all matched messages will be checked.
+ */
+ __u64 fa_ptl_mask;
+ /**
+ * message types to drop, for example:
+ * dra_type = LNET_DROP_ACK_BIT | LNET_DROP_PUT_BIT
+ *
+ * If it is non-zero then only specified message types are filtered,
+ * otherwise all message types will be checked.
+ */
+ __u32 fa_msg_mask;
+ union {
+ /** message drop simulation */
+ struct {
+ /** drop rate of this rule */
+ __u32 da_rate;
+ /**
+ * time interval of message drop, it is exclusive
+ * with da_rate
+ */
+ __u32 da_interval;
+ /** error type mask */
+ __u32 da_health_error_mask;
+ /** randomize error generation */
+ __u32 da_random:1,
+ /** drop all messages if flag is set */
+ da_drop_all:1;
+ } drop;
+ /** message latency simulation */
+ struct {
+ __u32 la_rate;
+ /**
+ * time interval of message delay, it is exclusive
+ * with la_rate
+ */
+ __u32 la_interval;
+ /** latency to delay */
+ __u32 la_latency;
+ } delay;
+ __u64 space[8];
+ } u;
+
+};
+
+/** fault simulation stats */
+struct lnet_fault_stat {
+ /** total # matched messages */
+ __u64 fs_count;
+ /** # dropped LNET_MSG_PUT by this rule */
+ __u64 fs_put;
+ /** # dropped LNET_MSG_ACK by this rule */
+ __u64 fs_ack;
+ /** # dropped LNET_MSG_GET by this rule */
+ __u64 fs_get;
+ /** # dropped LNET_MSG_REPLY by this rule */
+ __u64 fs_reply;
+ union {
+ struct {
+ /** total # dropped messages */
+ __u64 ds_dropped;
+ } drop;
+ struct {
+ /** total # delayed messages */
+ __u64 ls_delayed;
+ } delay;
+ __u64 space[8];
+ } u;
+};
+
+/** @} lnet_fault_simulation */
+
+#define LNET_DEV_ID 0
+#define LNET_DEV_PATH "/dev/lnet"
+
+#endif
diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetst.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetst.h
new file mode 100644
index 0000000000000..8749f8a5b0646
--- /dev/null
+++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetst.h
@@ -0,0 +1,537 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2016, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * Author: Liang Zhen
+ */
+
+#ifndef __UAPI_LNET_ST_H__
+#define __UAPI_LNET_ST_H__
+
+#include
+#include
+#include
+
+#define LST_FEAT_NONE (0)
+#define LST_FEAT_BULK_LEN (1 << 0) /* enable variable page size */
+
+#define LST_FEATS_EMPTY (LST_FEAT_NONE)
+#define LST_FEATS_MASK (LST_FEAT_NONE | LST_FEAT_BULK_LEN)
+
+#define LST_NAME_SIZE 32 /* max name buffer length */
+
+#define LSTIO_DEBUG 0xC00 /* debug */
+#define LSTIO_SESSION_NEW 0xC01 /* create session */
+#define LSTIO_SESSION_END 0xC02 /* end session */
+#define LSTIO_SESSION_INFO 0xC03 /* query session */
+#define LSTIO_GROUP_ADD 0xC10 /* add group */
+#define LSTIO_GROUP_LIST 0xC11 /* list all groups in session */
+#define LSTIO_GROUP_INFO 0xC12 /* query default information of specified group */
+#define LSTIO_GROUP_DEL 0xC13 /* delete group */
+#define LSTIO_NODES_ADD 0xC14 /* add nodes to specified group */
+#define LSTIO_GROUP_UPDATE 0xC15 /* update group */
+#define LSTIO_BATCH_ADD 0xC20 /* add batch */
+#define LSTIO_BATCH_START 0xC21 /* start batch */
+#define LSTIO_BATCH_STOP 0xC22 /* stop batch */
+#define LSTIO_BATCH_DEL 0xC23 /* delete batch */
+#define LSTIO_BATCH_LIST 0xC24 /* show all batches in the session */
+#define LSTIO_BATCH_INFO 0xC25 /* show detail of specified batch */
+#define LSTIO_TEST_ADD 0xC26 /* add test (to batch) */
+#define LSTIO_BATCH_QUERY 0xC27 /* query batch status */
+#define LSTIO_STAT_QUERY 0xC30 /* get stats */
+
+/*
+ * sparse kernel source annotations
+ */
+#ifndef __user
+#define __user
+#endif
+
+struct lst_sid {
+ lnet_nid_t ses_nid; /* nid of console node */
+ __s64 ses_stamp; /* time stamp in milliseconds */
+}; /*** session id */
+
+extern struct lst_sid LST_INVALID_SID;
+
+struct lst_bid {
+ __u64 bat_id; /* unique id in session */
+};
+
+/* Status of test node */
+#define LST_NODE_ACTIVE 0x1 /* node in this session */
+#define LST_NODE_BUSY 0x2 /* node is taken by other session */
+#define LST_NODE_DOWN 0x4 /* node is down */
+#define LST_NODE_UNKNOWN 0x8 /* node not in session */
+
+struct lstcon_node_ent {
+ struct lnet_process_id nde_id; /* id of node */
+ int nde_state; /* state of node */
+}; /*** node entry, for list_group command */
+
+struct lstcon_ndlist_ent {
+ int nle_nnode; /* # of nodes */
+ int nle_nactive; /* # of active nodes */
+ int nle_nbusy; /* # of busy nodes */
+ int nle_ndown; /* # of down nodes */
+ int nle_nunknown; /* # of unknown nodes */
+}; /*** node_list entry, for list_batch command */
+
+struct lstcon_test_ent {
+ int tse_type; /* test type */
+ int tse_loop; /* loop count */
+ int tse_concur; /* concurrency of test */
+}; /*** test summary entry, for list_batch command */
+
+struct lstcon_batch_ent {
+ int bae_state; /* batch status */
+ int bae_timeout; /* batch timeout */
+ int bae_ntest; /* # of tests in the batch */
+}; /*** batch summary entry, for list_batch command */
+
+struct lstcon_test_batch_ent {
+ struct lstcon_ndlist_ent tbe_cli_nle; /* client (group) node_list entry */
+ struct lstcon_ndlist_ent tbe_srv_nle; /* server (group) node_list entry */
+ union {
+ struct lstcon_test_ent tbe_test; /* test entry */
+ struct lstcon_batch_ent tbe_batch; /* batch entry */
+ } u;
+}; /*** test/batch verbose information entry,
+ *** for list_batch command */
+
+/* This will go away once we move to netlink */
+#if !defined(__KERNEL__) && !defined(__LIBCFS_UTIL_LIST_H__)
+struct list_head {
+ struct list_head *next, *prev;
+};
+#endif
+
+struct lstcon_rpc_ent {
+ struct list_head 
rpe_link; /* link chain */
+ struct lnet_process_id rpe_peer; /* peer's id */
+ /* This has not been used since Lustre 2.2 so it's safe to use.
+ * Update to allow future use of timespec64
+ */
+ struct {
+ __s64 tv_sec;
+ __s64 tv_nsec;
+ } rpe_stamp; /* time stamp of RPC */
+ int rpe_state; /* peer's state */
+ int rpe_rpc_errno; /* RPC errno */
+
+ struct lst_sid rpe_sid; /* peer's session id */
+ int rpe_fwk_errno; /* framework errno */
+ int rpe_priv[4]; /* private data */
+ char rpe_payload[0]; /* private reply payload */
+};
+
+struct lstcon_trans_stat {
+ int trs_rpc_stat[4]; /* RPCs stat (0: total, 1: failed, 2: finished, 3: reserved) */
+ int trs_rpc_errno; /* RPC errno */
+ int trs_fwk_stat[8]; /* framework stat */
+ int trs_fwk_errno; /* errno of the first remote error */
+ void *trs_fwk_private; /* private framework stat */
+};
+
+static inline int
+lstcon_rpc_stat_total(struct lstcon_trans_stat *stat, int inc)
+{
+ return inc ? ++stat->trs_rpc_stat[0] : stat->trs_rpc_stat[0];
+}
+
+static inline int
+lstcon_rpc_stat_success(struct lstcon_trans_stat *stat, int inc)
+{
+ return inc ? ++stat->trs_rpc_stat[1] : stat->trs_rpc_stat[1];
+}
+
+static inline int
+lstcon_rpc_stat_failure(struct lstcon_trans_stat *stat, int inc)
+{
+ return inc ? ++stat->trs_rpc_stat[2] : stat->trs_rpc_stat[2];
+}
+
+static inline int
+lstcon_sesop_stat_success(struct lstcon_trans_stat *stat, int inc)
+{
+ return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0];
+}
+
+static inline int
+lstcon_sesop_stat_failure(struct lstcon_trans_stat *stat, int inc)
+{
+ return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1];
+}
+
+static inline int
+lstcon_sesqry_stat_active(struct lstcon_trans_stat *stat, int inc)
+{
+ return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0];
+}
+
+static inline int
+lstcon_sesqry_stat_busy(struct lstcon_trans_stat *stat, int inc)
+{
+ return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1];
+}
+
+static inline int
+lstcon_sesqry_stat_unknown(struct lstcon_trans_stat *stat, int inc)
+{
+ return inc ? ++stat->trs_fwk_stat[2] : stat->trs_fwk_stat[2];
+}
+
+static inline int
+lstcon_tsbop_stat_success(struct lstcon_trans_stat *stat, int inc)
+{
+ return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0];
+}
+
+static inline int
+lstcon_tsbop_stat_failure(struct lstcon_trans_stat *stat, int inc)
+{
+ return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1];
+}
+
+static inline int
+lstcon_tsbqry_stat_idle(struct lstcon_trans_stat *stat, int inc)
+{
+ return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0];
+}
+
+static inline int
+lstcon_tsbqry_stat_run(struct lstcon_trans_stat *stat, int inc)
+{
+ return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1];
+}
+
+static inline int
+lstcon_tsbqry_stat_failure(struct lstcon_trans_stat *stat, int inc)
+{
+ return inc ? ++stat->trs_fwk_stat[2] : stat->trs_fwk_stat[2];
+}
+
+static inline int
+lstcon_statqry_stat_success(struct lstcon_trans_stat *stat, int inc)
+{
+ return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0];
+}
+
+static inline int
+lstcon_statqry_stat_failure(struct lstcon_trans_stat *stat, int inc)
+{
+ return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1];
+}
+
+/* create a session */
+struct lstio_session_new_args {
+ int lstio_ses_key; /* IN: local key */
+ int lstio_ses_timeout; /* IN: session timeout */
+ int lstio_ses_force; /* IN: force create ? 
*/
+ /** IN: session features */
+ unsigned lstio_ses_feats;
+ struct lst_sid __user *lstio_ses_idp; /* OUT: session id */
+ int lstio_ses_nmlen; /* IN: name length */
+ char __user *lstio_ses_namep; /* IN: session name */
+};
+
+/* query current session */
+struct lstio_session_info_args {
+ struct lst_sid __user *lstio_ses_idp; /* OUT: session id */
+ int __user *lstio_ses_keyp; /* OUT: local key */
+ /** OUT: session features */
+ unsigned __user *lstio_ses_featp;
+ struct lstcon_ndlist_ent __user *lstio_ses_ndinfo; /* OUT: */
+ int lstio_ses_nmlen; /* IN: name length */
+ char __user *lstio_ses_namep; /* OUT: session name */
+};
+
+/* delete a session */
+struct lstio_session_end_args {
+ int lstio_ses_key; /* IN: session key */
+};
+
+#define LST_OPC_SESSION 1
+#define LST_OPC_GROUP 2
+#define LST_OPC_NODES 3
+#define LST_OPC_BATCHCLI 4
+#define LST_OPC_BATCHSRV 5
+
+struct lstio_debug_args {
+ int lstio_dbg_key; /* IN: session key */
+ int lstio_dbg_type; /* IN: debug session|batch|group|nodes list */
+ int lstio_dbg_flags; /* IN: reserved debug flags */
+ int lstio_dbg_timeout; /* IN: timeout of debug */
+
+ int lstio_dbg_nmlen; /* IN: len of name */
+ char __user *lstio_dbg_namep; /* IN: name of group|batch */
+ int lstio_dbg_count; /* IN: # of test nodes to debug */
+ struct lnet_process_id __user *lstio_dbg_idsp; /* IN: id of test nodes */
+ /* OUT: list head of result buffer */
+ struct list_head __user *lstio_dbg_resultp;
+};
+
+struct lstio_group_add_args {
+ int lstio_grp_key; /* IN: session key */
+ int lstio_grp_nmlen; /* IN: name length */
+ char __user *lstio_grp_namep; /* IN: group name */
+};
+
+struct lstio_group_del_args {
+ int lstio_grp_key; /* IN: session key */
+ int lstio_grp_nmlen; /* IN: name length */
+ char __user *lstio_grp_namep; /* IN: group name */
+};
+
+#define LST_GROUP_CLEAN 1 /* remove inactive nodes in the group */
+#define LST_GROUP_REFRESH 2 /* refresh inactive nodes in the group */
+#define LST_GROUP_RMND 3 /* delete nodes from the group */
+
+struct lstio_group_update_args {
+ int lstio_grp_key; /* IN: session key */
+ int lstio_grp_opc; /* IN: OPC */
+ int lstio_grp_args; /* IN: arguments */
+ int lstio_grp_nmlen; /* IN: name length */
+ char __user *lstio_grp_namep; /* IN: group name */
+ int lstio_grp_count; /* IN: # of nodes id */
+ struct lnet_process_id __user *lstio_grp_idsp; /* IN: array of nodes */
+ /* OUT: list head of result buffer */
+ struct list_head __user *lstio_grp_resultp;
+};
+
+struct lstio_group_nodes_args {
+ int lstio_grp_key; /* IN: session key */
+ int lstio_grp_nmlen; /* IN: name length */
+ char __user *lstio_grp_namep; /* IN: group name */
+ int lstio_grp_count; /* IN: # of nodes */
+ /** OUT: session features */
+ unsigned __user *lstio_grp_featp;
+ struct lnet_process_id __user *lstio_grp_idsp; /* IN: nodes */
+ /* OUT: list head of result buffer */
+ struct list_head __user *lstio_grp_resultp;
+};
+
+struct lstio_group_list_args {
+ int lstio_grp_key; /* IN: session key */
+ int lstio_grp_idx; /* IN: group idx */
+ int lstio_grp_nmlen; /* IN: name len */
+ char __user *lstio_grp_namep; /* OUT: name */
+};
+
+struct lstio_group_info_args {
+ int lstio_grp_key; /* IN: session key */
+ int lstio_grp_nmlen; /* IN: name len */
+ char __user *lstio_grp_namep; /* IN: name */
+ struct lstcon_ndlist_ent __user *lstio_grp_entp;/* OUT: description of group */
+
+ int __user *lstio_grp_idxp; /* IN/OUT: node index */
+ int __user *lstio_grp_ndentp; /* IN/OUT: # of nodent */
+ struct lstcon_node_ent __user *lstio_grp_dentsp;/* 
OUT: nodent array */
+};
+
+#define LST_DEFAULT_BATCH "batch" /* default batch name */
+
+struct lstio_batch_add_args {
+ int lstio_bat_key; /* IN: session key */
+ int lstio_bat_nmlen; /* IN: name length */
+ char __user *lstio_bat_namep; /* IN: batch name */
+};
+
+struct lstio_batch_del_args {
+ int lstio_bat_key; /* IN: session key */
+ int lstio_bat_nmlen; /* IN: name length */
+ char __user *lstio_bat_namep; /* IN: batch name */
+};
+
+struct lstio_batch_run_args {
+ /* IN: session key */
+ int lstio_bat_key;
+ /* IN: timeout for the batch */
+ int lstio_bat_timeout;
+ /* IN: name length */
+ int lstio_bat_nmlen;
+ /* IN: batch name */
+ char __user *lstio_bat_namep;
+ /* OUT: list head of result buffer */
+ struct list_head __user *lstio_bat_resultp;
+};
+
+struct lstio_batch_stop_args {
+ /* IN: session key */
+ int lstio_bat_key;
+ /* IN: abort unfinished test RPC */
+ int lstio_bat_force;
+ /* IN: name length */
+ int lstio_bat_nmlen;
+ /* IN: batch name */
+ char __user *lstio_bat_namep;
+ /* OUT: list head of result buffer */
+ struct list_head __user *lstio_bat_resultp;
+};
+
+struct lstio_batch_query_args {
+ /* IN: session key */
+ int lstio_bat_key;
+ /* IN: test index */
+ int lstio_bat_testidx;
+ /* IN: is test client? */
+ int lstio_bat_client;
+ /* IN: timeout for waiting */
+ int lstio_bat_timeout;
+ /* IN: name length */
+ int lstio_bat_nmlen;
+ /* IN: batch name */
+ char __user *lstio_bat_namep;
+ /* OUT: list head of result buffer */
+ struct list_head __user *lstio_bat_resultp;
+};
+
+struct lstio_batch_list_args {
+ int lstio_bat_key; /* IN: session key */
+ int lstio_bat_idx; /* IN: index */
+ int lstio_bat_nmlen; /* IN: name length */
+ char __user *lstio_bat_namep; /* IN: batch name */
+};
+
+struct lstio_batch_info_args {
+ int lstio_bat_key; /* IN: session key */
+ int lstio_bat_nmlen; /* IN: name length */
+ char __user *lstio_bat_namep; /* IN: name */
+ int lstio_bat_server; /* IN: query server or not */
+ int lstio_bat_testidx; /* IN: test index */
+ struct lstcon_test_batch_ent __user *lstio_bat_entp;/* OUT: batch ent */
+
+ int __user *lstio_bat_idxp; /* IN/OUT: index of node */
+ int __user *lstio_bat_ndentp; /* IN/OUT: # of nodent */
+ struct lstcon_node_ent __user *lstio_bat_dentsp;/* array of nodent */
+};
+
+/* add stat in session */
+struct lstio_stat_args {
+ /* IN: session key */
+ int lstio_sta_key;
+ /* IN: timeout for stat request */
+ int lstio_sta_timeout;
+ /* IN: group name length */
+ int lstio_sta_nmlen;
+ /* IN: group name */
+ char __user *lstio_sta_namep;
+ /* IN: # of pid */
+ int lstio_sta_count;
+ /* IN: pid */
+ struct lnet_process_id __user *lstio_sta_idsp;
+ /* OUT: list head of result buffer */
+ struct list_head __user *lstio_sta_resultp;
+};
+
+enum lst_test_type {
+ LST_TEST_BULK = 1,
+ LST_TEST_PING = 2
+};
+
+/* create a test in a batch */
+#define LST_MAX_CONCUR 1024 /* Max concurrency of test */
+
+struct lstio_test_args {
+ int lstio_tes_key; /* IN: session key */
+ int lstio_tes_bat_nmlen; /* IN: batch name len */
+ char __user *lstio_tes_bat_name; /* IN: batch name */
+ int lstio_tes_type; /* IN: test type */
+ int lstio_tes_oneside; /* IN: one sided test */
+ int lstio_tes_loop; /* IN: loop count */
+ int lstio_tes_concur; /* IN: concurrency */
+
+ int lstio_tes_dist; /* IN: node distribution in destination groups */
+ int lstio_tes_span; /* IN: node span in destination groups */
+ int lstio_tes_sgrp_nmlen; /* IN: source group name length */
+ char __user *lstio_tes_sgrp_name; /* IN: group name */
+ int 
lstio_tes_dgrp_nmlen; /* IN: destination group name length */
+ char __user *lstio_tes_dgrp_name; /* IN: group name */
+
+ /* IN: param buffer len */
+ int lstio_tes_param_len;
+ /* IN: parameter for specified test:
+ struct lst_test_bulk_param,
+ struct lst_test_ping_param,
+ ... more */
+ void __user *lstio_tes_param;
+ /* OUT: private returned value */
+ int __user *lstio_tes_retp;
+ /* OUT: list head of result buffer */
+ struct list_head __user *lstio_tes_resultp;
+};
+
+enum lst_brw_type {
+ LST_BRW_READ = 1,
+ LST_BRW_WRITE = 2
+};
+
+enum lst_brw_flags {
+ LST_BRW_CHECK_NONE = 1,
+ LST_BRW_CHECK_SIMPLE = 2,
+ LST_BRW_CHECK_FULL = 3
+};
+
+struct lst_test_bulk_param {
+ int blk_opc; /* bulk operation code */
+ int blk_size; /* size (bytes) */
+ int blk_time; /* time of running the test */
+ int blk_flags; /* reserved flags */
+ int blk_cli_off; /* bulk offset on client */
+ int blk_srv_off; /* reserved: bulk offset on server */
+};
+
+struct lst_test_ping_param {
+ int png_size; /* size of ping message */
+ int png_time; /* time */
+ int png_loop; /* loop */
+ int png_flags; /* reserved flags */
+};
+
+/* Both struct srpc_counters and struct sfw_counters are sent over the wire */
+struct srpc_counters {
+ __u32 errors;
+ __u32 rpcs_sent;
+ __u32 rpcs_rcvd;
+ __u32 rpcs_dropped;
+ __u32 rpcs_expired;
+ __u64 bulk_get;
+ __u64 bulk_put;
+} __attribute__((packed));
+
+struct sfw_counters {
+ /** milliseconds since current session started */
+ __u32 running_ms;
+ __u32 active_batches;
+ __u32 zombie_sessions;
+ __u32 brw_errors;
+ __u32 ping_errors;
+} __attribute__((packed));
+
+#endif
diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/nidstr.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/nidstr.h
new file mode 100644
index 0000000000000..9e4b156450e0b
--- /dev/null
+++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/nidstr.h
@@ -0,0 +1,107 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2015, 2017, Intel Corporation.
+ */
+#ifndef _LNET_NIDSTRINGS_H
+#define _LNET_NIDSTRINGS_H
+
+#include
+#include
+
+/**
+ * Lustre Network Driver types.
+ */
+enum {
+ /* Only add to these values (i.e. don't ever change or redefine them):
+ * network addresses depend on them... 
*/ + /*QSWLND = 1, removed v2_7_50 */ + SOCKLND = 2, + /*GMLND = 3, removed v2_0_0-rc1a-16-gc660aac */ + /*PTLLND = 4, removed v2_7_50 */ + O2IBLND = 5, + /*CIBLND = 6, removed v2_0_0-rc1a-175-gd2b8a0e */ + /*OPENIBLND = 7, removed v2_0_0-rc1a-175-gd2b8a0e */ + /*IIBLND = 8, removed v2_0_0-rc1a-175-gd2b8a0e */ + LOLND = 9, + /*RALND = 10, removed v2_7_50_0-34-g8be9e41 */ + /*VIBLND = 11, removed v2_0_0-rc1a-175-gd2b8a0e */ + /*MXLND = 12, removed v2_7_50_0-34-g8be9e41 */ + GNILND = 13, + GNIIPLND = 14, + PTL4LND = 15, + KFILND = 16, + + NUM_LNDS +}; + +struct list_head; + +#define LNET_NIDSTR_COUNT 1024 /* # of nidstrings */ +#define LNET_NIDSTR_SIZE 64 /* size of each one (see below for usage) */ + +/* support decl needed by both kernel and user space */ +char *libcfs_next_nidstring(void); +int libcfs_isknown_lnd(__u32 lnd); +char *libcfs_lnd2modname(__u32 lnd); +char *libcfs_lnd2str_r(__u32 lnd, char *buf, __kernel_size_t buf_size); +static inline char *libcfs_lnd2str(__u32 lnd) +{ + return libcfs_lnd2str_r(lnd, libcfs_next_nidstring(), + LNET_NIDSTR_SIZE); +} +int libcfs_str2lnd(const char *str); +char *libcfs_net2str_r(__u32 net, char *buf, __kernel_size_t buf_size); +static inline char *libcfs_net2str(__u32 net) +{ + return libcfs_net2str_r(net, libcfs_next_nidstring(), + LNET_NIDSTR_SIZE); +} +char *libcfs_nid2str_r(lnet_nid_t nid, char *buf, __kernel_size_t buf_size); +static inline char *libcfs_nid2str(lnet_nid_t nid) +{ + return libcfs_nid2str_r(nid, libcfs_next_nidstring(), + LNET_NIDSTR_SIZE); +} + +__u32 libcfs_str2net(const char *str); +lnet_nid_t libcfs_str2nid(const char *str); +int libcfs_str2anynid(lnet_nid_t *nid, const char *str); +int libcfs_num_parse(char *str, int len, struct list_head *list); +char *libcfs_id2str(struct lnet_process_id id); +void cfs_free_nidlist(struct list_head *list); +int cfs_parse_nidlist(char *str, int len, struct list_head *list); +int cfs_print_nidlist(char *buffer, int count, struct list_head *list); +int cfs_match_nid(lnet_nid_t nid, struct list_head *list); +int cfs_match_net(__u32 net_id, __u32 net_type, + struct list_head *net_num_list); + +int cfs_ip_addr_parse(char *str, int len, struct list_head *list); +int cfs_ip_addr_match(__u32 addr, struct list_head *list); +int cfs_nidrange_find_min_max(struct list_head *nidlist, char *min_nid, + char *max_nid, __kernel_size_t nidstr_length); +void cfs_expr_list_free_list(struct list_head *list); + +#endif /* _LNET_NIDSTRINGS_H */ diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/socklnd.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/socklnd.h new file mode 100644 index 0000000000000..2df2cf731db15 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/socklnd.h @@ -0,0 +1,43 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * #defines shared between socknal implementation and utilities + */ +#ifndef __UAPI_LNET_SOCKLND_H__ +#define __UAPI_LNET_SOCKLND_H__ + +#define SOCKLND_CONN_NONE (-1) +#define SOCKLND_CONN_ANY 0 +#define SOCKLND_CONN_CONTROL 1 +#define SOCKLND_CONN_BULK_IN 2 +#define SOCKLND_CONN_BULK_OUT 3 +#define SOCKLND_CONN_NTYPES 4 + +#define SOCKLND_CONN_ACK SOCKLND_CONN_BULK_IN + +#endif diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd-idl.h b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd-idl.h new file mode 100644 index 0000000000000..35df50b99bbb6 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd-idl.h @@ -0,0 +1,155 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/klnds/o2iblnd/o2iblnd-idl.h + * + * Author: Eric Barton + */ +#ifndef __LNET_O2IBLND_IDL_H__ +#define __LNET_O2IBLND_IDL_H__ + +#include + +/************************************************************************ + * IB Wire message format. + * These are sent in sender's byte order (i.e. receiver flips). + */ + +struct kib_connparams { + u16 ibcp_queue_depth; + u16 ibcp_max_frags; + u32 ibcp_max_msg_size; +} __packed; + +struct kib_immediate_msg { + struct lnet_hdr_nid4 ibim_hdr; /* portals header */ + char ibim_payload[0];/* piggy-backed payload */ +} __packed; + +struct kib_rdma_frag { + u32 rf_nob; /* # bytes this frag */ + u64 rf_addr; /* CAVEAT EMPTOR: misaligned!! 
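+ * (the 32-bit rf_nob above leaves this
+ * 64-bit field at offset 4 of the packed
+ * struct, so strict-alignment CPUs need
+ * unaligned-safe accesses)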
*/ +} __packed; + +struct kib_rdma_desc { + u32 rd_key; /* local/remote key */ + u32 rd_nfrags; /* # fragments */ + struct kib_rdma_frag rd_frags[0]; /* buffer frags */ +} __packed; + +struct kib_putreq_msg { + struct lnet_hdr_nid4 ibprm_hdr; /* portals header */ + u64 ibprm_cookie; /* opaque completion cookie */ +} __packed; + +struct kib_putack_msg { + u64 ibpam_src_cookie;/* reflected completion cookie */ + u64 ibpam_dst_cookie;/* opaque completion cookie */ + struct kib_rdma_desc ibpam_rd; /* sender's sink buffer */ +} __packed; + +struct kib_get_msg { + struct lnet_hdr_nid4 ibgm_hdr; /* portals header */ + u64 ibgm_cookie; /* opaque completion cookie */ + struct kib_rdma_desc ibgm_rd; /* rdma descriptor */ +} __packed; + +struct kib_completion_msg { + u64 ibcm_cookie; /* opaque completion cookie */ + s32 ibcm_status; /* < 0 failure: >= 0 length */ +} __packed; + +struct kib_msg { + /* First 2 fields fixed FOR ALL TIME */ + u32 ibm_magic; /* I'm an ibnal message */ + u16 ibm_version; /* this is my version number */ + + u8 ibm_type; /* msg type */ + u8 ibm_credits; /* returned credits */ + u32 ibm_nob; /* # bytes in whole message */ + u32 ibm_cksum; /* checksum (0 == no checksum) */ + u64 ibm_srcnid; /* sender's NID */ + u64 ibm_srcstamp; /* sender's incarnation */ + u64 ibm_dstnid; /* destination's NID */ + u64 ibm_dststamp; /* destination's incarnation */ + + union { + struct kib_connparams connparams; + struct kib_immediate_msg immediate; + struct kib_putreq_msg putreq; + struct kib_putack_msg putack; + struct kib_get_msg get; + struct kib_completion_msg completion; + } __packed ibm_u; +} __packed; + +#define IBLND_MSG_MAGIC LNET_PROTO_IB_MAGIC /* unique magic */ + +#define IBLND_MSG_VERSION_1 0x11 +#define IBLND_MSG_VERSION_2 0x12 +#define IBLND_MSG_VERSION IBLND_MSG_VERSION_2 + +#define IBLND_MSG_CONNREQ 0xc0 /* connection request */ +#define IBLND_MSG_CONNACK 0xc1 /* connection acknowledge */ +#define IBLND_MSG_NOOP 0xd0 /* nothing (just credits) */ +#define IBLND_MSG_IMMEDIATE 0xd1 /* immediate */ +#define IBLND_MSG_PUT_REQ 0xd2 /* putreq (src->sink) */ +#define IBLND_MSG_PUT_NAK 0xd3 /* completion (sink->src) */ +#define IBLND_MSG_PUT_ACK 0xd4 /* putack (sink->src) */ +#define IBLND_MSG_PUT_DONE 0xd5 /* completion (src->sink) */ +#define IBLND_MSG_GET_REQ 0xd6 /* getreq (sink->src) */ +#define IBLND_MSG_GET_DONE 0xd7 /* completion (src->sink: all OK) */ + +struct kib_rej { + u32 ibr_magic; /* sender's magic */ + u16 ibr_version; /* sender's version */ + u8 ibr_why; /* reject reason */ + u8 ibr_padding; /* padding */ + u64 ibr_incarnation;/* incarnation of peer_ni */ + struct kib_connparams ibr_cp; /* connection parameters */ +} __packed; + +/* connection rejection reasons */ +#define IBLND_REJECT_CONN_RACE 1 /* You lost connection race */ +#define IBLND_REJECT_NO_RESOURCES 2 /* Out of memory/conns etc */ +#define IBLND_REJECT_FATAL 3 /* Anything else */ + +#define IBLND_REJECT_CONN_UNCOMPAT 4 /* incompatible version peer_ni */ +#define IBLND_REJECT_CONN_STALE 5 /* stale peer_ni */ + +/* peer_ni's rdma frags doesn't match mine */ +#define IBLND_REJECT_RDMA_FRAGS 6 +/* peer_ni's msg queue size doesn't match mine */ +#define IBLND_REJECT_MSG_QUEUE_SIZE 7 +#define IBLND_REJECT_INVALID_SRV_ID 8 + +/***********************************************************************/ + +#endif /* __LNET_O2IBLND_IDL_H__ */ diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c new file mode 100644 index 
0000000000000..e9c23326b5c19 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c @@ -0,0 +1,3596 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/klnds/o2iblnd/o2iblnd.c + * + * Author: Eric Barton + */ + +#include +#include +#include + +#include "o2iblnd.h" + +static const struct lnet_lnd the_o2iblnd; + +struct kib_data kiblnd_data; + +static __u32 +kiblnd_cksum (void *ptr, int nob) +{ + char *c = ptr; + __u32 sum = 0; + + while (nob-- > 0) + sum = ((sum << 1) | (sum >> 31)) + *c++; + + /* ensure I don't return 0 (== no checksum) */ + return (sum == 0) ? 1 : sum; +} + +static char * +kiblnd_msgtype2str(int type) +{ + switch (type) { + case IBLND_MSG_CONNREQ: + return "CONNREQ"; + + case IBLND_MSG_CONNACK: + return "CONNACK"; + + case IBLND_MSG_NOOP: + return "NOOP"; + + case IBLND_MSG_IMMEDIATE: + return "IMMEDIATE"; + + case IBLND_MSG_PUT_REQ: + return "PUT_REQ"; + + case IBLND_MSG_PUT_NAK: + return "PUT_NAK"; + + case IBLND_MSG_PUT_ACK: + return "PUT_ACK"; + + case IBLND_MSG_PUT_DONE: + return "PUT_DONE"; + + case IBLND_MSG_GET_REQ: + return "GET_REQ"; + + case IBLND_MSG_GET_DONE: + return "GET_DONE"; + + default: + return "???"; + } +} + +static int +kiblnd_msgtype2size(int type) +{ + const int hdr_size = offsetof(struct kib_msg, ibm_u); + + switch (type) { + case IBLND_MSG_CONNREQ: + case IBLND_MSG_CONNACK: + return hdr_size + sizeof(struct kib_connparams); + + case IBLND_MSG_NOOP: + return hdr_size; + + case IBLND_MSG_IMMEDIATE: + return offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[0]); + + case IBLND_MSG_PUT_REQ: + return hdr_size + sizeof(struct kib_putreq_msg); + + case IBLND_MSG_PUT_ACK: + return hdr_size + sizeof(struct kib_putack_msg); + + case IBLND_MSG_GET_REQ: + return hdr_size + sizeof(struct kib_get_msg); + + case IBLND_MSG_PUT_NAK: + case IBLND_MSG_PUT_DONE: + case IBLND_MSG_GET_DONE: + return hdr_size + sizeof(struct kib_completion_msg); + default: + return -1; + } +} + +static int kiblnd_unpack_rd(struct kib_msg *msg, bool flip) +{ + struct kib_rdma_desc *rd; + int nob; + int n; + int i; + + LASSERT(msg->ibm_type == IBLND_MSG_GET_REQ || + msg->ibm_type == IBLND_MSG_PUT_ACK); + + rd = msg->ibm_type == IBLND_MSG_GET_REQ ? 
+ &msg->ibm_u.get.ibgm_rd : + &msg->ibm_u.putack.ibpam_rd; + + if (flip) { + __swab32s(&rd->rd_key); + __swab32s(&rd->rd_nfrags); + } + + n = rd->rd_nfrags; + + if (n <= 0 || n > IBLND_MAX_RDMA_FRAGS) { + CERROR("Bad nfrags: %d, should be 0 < n <= %d\n", + n, IBLND_MAX_RDMA_FRAGS); + return 1; + } + + nob = offsetof(struct kib_msg, ibm_u) + + kiblnd_rd_msg_size(rd, msg->ibm_type, n); + + if (msg->ibm_nob < nob) { + CERROR("Short %s: %d(%d)\n", + kiblnd_msgtype2str(msg->ibm_type), msg->ibm_nob, nob); + return 1; + } + + if (!flip) + return 0; + + for (i = 0; i < n; i++) { + __swab32s(&rd->rd_frags[i].rf_nob); + __swab64s(&rd->rd_frags[i].rf_addr); + } + + return 0; +} + +void kiblnd_pack_msg(struct lnet_ni *ni, struct kib_msg *msg, int version, + int credits, lnet_nid_t dstnid, __u64 dststamp) +{ + struct kib_net *net = ni->ni_data; + + /* CAVEAT EMPTOR! all message fields not set here should have been + * initialised previously. + */ + msg->ibm_magic = IBLND_MSG_MAGIC; + msg->ibm_version = version; + /* ibm_type */ + msg->ibm_credits = credits; + /* ibm_nob */ + msg->ibm_cksum = 0; + msg->ibm_srcnid = lnet_nid_to_nid4(&ni->ni_nid); + msg->ibm_srcstamp = net->ibn_incarnation; + msg->ibm_dstnid = dstnid; + msg->ibm_dststamp = dststamp; + + if (*kiblnd_tunables.kib_cksum) { + /* NB ibm_cksum zero while computing cksum */ + msg->ibm_cksum = kiblnd_cksum(msg, msg->ibm_nob); + } +} + +int kiblnd_unpack_msg(struct kib_msg *msg, int nob) +{ + const int hdr_size = offsetof(struct kib_msg, ibm_u); + __u32 msg_cksum; + __u16 version; + int msg_nob; + bool flip; + + /* 6 bytes are enough to have received magic + version */ + if (nob < 6) { + CERROR("Short message: %d\n", nob); + return -EPROTO; + } + + if (msg->ibm_magic == IBLND_MSG_MAGIC) { + flip = false; + } else if (msg->ibm_magic == __swab32(IBLND_MSG_MAGIC)) { + flip = true; + } else { + CERROR("Bad magic: %08x\n", msg->ibm_magic); + return -EPROTO; + } + + version = flip ? __swab16(msg->ibm_version) : msg->ibm_version; + if (version != IBLND_MSG_VERSION && + version != IBLND_MSG_VERSION_1) { + CERROR("Bad version: %x\n", version); + return -EPROTO; + } + + if (nob < hdr_size) { + CERROR("Short message: %d\n", nob); + return -EPROTO; + } + + msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob; + if (msg_nob > nob) { + CERROR("Short message: got %d, wanted %d\n", nob, msg_nob); + return -EPROTO; + } + + /* checksum must be computed with ibm_cksum zero and BEFORE anything + * gets flipped + */ + msg_cksum = flip ? 
__swab32(msg->ibm_cksum) : msg->ibm_cksum; + msg->ibm_cksum = 0; + if (msg_cksum != 0 && + msg_cksum != kiblnd_cksum(msg, msg_nob)) { + CERROR("Bad checksum\n"); + return -EPROTO; + } + + msg->ibm_cksum = msg_cksum; + + if (flip) { + /* leave magic unflipped as a clue to peer_ni endianness */ + msg->ibm_version = version; + BUILD_BUG_ON(sizeof(msg->ibm_type) != 1); + BUILD_BUG_ON(sizeof(msg->ibm_credits) != 1); + msg->ibm_nob = msg_nob; + __swab64s(&msg->ibm_srcnid); + __swab64s(&msg->ibm_srcstamp); + __swab64s(&msg->ibm_dstnid); + __swab64s(&msg->ibm_dststamp); + } + + if (msg->ibm_srcnid == LNET_NID_ANY) { + CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid)); + return -EPROTO; + } + + if (msg_nob < kiblnd_msgtype2size(msg->ibm_type)) { + CERROR("Short %s: %d(%d)\n", kiblnd_msgtype2str(msg->ibm_type), + msg_nob, kiblnd_msgtype2size(msg->ibm_type)); + return -EPROTO; + } + + switch (msg->ibm_type) { + default: + CERROR("Unknown message type %x\n", msg->ibm_type); + return -EPROTO; + + case IBLND_MSG_NOOP: + case IBLND_MSG_IMMEDIATE: + case IBLND_MSG_PUT_REQ: + break; + + case IBLND_MSG_PUT_ACK: + case IBLND_MSG_GET_REQ: + if (kiblnd_unpack_rd(msg, flip)) + return -EPROTO; + break; + + case IBLND_MSG_PUT_NAK: + case IBLND_MSG_PUT_DONE: + case IBLND_MSG_GET_DONE: + if (flip) + __swab32s(&msg->ibm_u.completion.ibcm_status); + break; + + case IBLND_MSG_CONNREQ: + case IBLND_MSG_CONNACK: + if (flip) { + __swab16s(&msg->ibm_u.connparams.ibcp_queue_depth); + __swab16s(&msg->ibm_u.connparams.ibcp_max_frags); + __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size); + } + break; + } + return 0; +} + +int +kiblnd_create_peer(struct lnet_ni *ni, struct kib_peer_ni **peerp, + lnet_nid_t nid) +{ + struct kib_peer_ni *peer_ni; + struct kib_net *net = ni->ni_data; + int cpt = lnet_cpt_of_nid(nid, ni); + unsigned long flags; + + LASSERT(net != NULL); + LASSERT(nid != LNET_NID_ANY); + + LIBCFS_CPT_ALLOC(peer_ni, lnet_cpt_table(), cpt, sizeof(*peer_ni)); + if (!peer_ni) { + CERROR("Cannot allocate peer_ni\n"); + return -ENOMEM; + } + + peer_ni->ibp_ni = ni; + peer_ni->ibp_nid = nid; + peer_ni->ibp_error = 0; + peer_ni->ibp_last_alive = 0; + peer_ni->ibp_max_frags = IBLND_MAX_RDMA_FRAGS; + peer_ni->ibp_queue_depth = ni->ni_net->net_tunables.lct_peer_tx_credits; + peer_ni->ibp_queue_depth_mod = 0; /* try to use the default */ + kref_init(&peer_ni->ibp_kref); + + INIT_HLIST_NODE(&peer_ni->ibp_list); + INIT_LIST_HEAD(&peer_ni->ibp_conns); + INIT_LIST_HEAD(&peer_ni->ibp_tx_queue); + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + /* always called with a ref on ni, which prevents ni being shutdown */ + LASSERT(net->ibn_shutdown == 0); + + /* npeers only grows with the global lock held */ + atomic_inc(&net->ibn_npeers); + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + *peerp = peer_ni; + return 0; +} + +void +kiblnd_destroy_peer(struct kref *kref) +{ + struct kib_peer_ni *peer_ni = container_of(kref, struct kib_peer_ni, + ibp_kref); + struct kib_net *net = peer_ni->ibp_ni->ni_data; + + LASSERT(net != NULL); + LASSERT(!kiblnd_peer_active(peer_ni)); + LASSERT(kiblnd_peer_idle(peer_ni)); + LASSERT(list_empty(&peer_ni->ibp_tx_queue)); + + LIBCFS_FREE(peer_ni, sizeof(*peer_ni)); + + /* NB a peer_ni's connections keep a reference on their peer_ni until + * they are destroyed, so we can be assured that _all_ state to do + * with this peer_ni has been cleaned up when its refcount drops to + * zero. 
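+ *
+ * The wake_up_var() below pairs with a wait on ibn_npeers so that
+ * network shutdown can block until the last peer_ni is gone.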
+ */ + if (atomic_dec_and_test(&net->ibn_npeers)) + wake_up_var(&net->ibn_npeers); +} + +struct kib_peer_ni * +kiblnd_find_peer_locked(struct lnet_ni *ni, lnet_nid_t nid) +{ + /* the caller is responsible for accounting the additional reference + * that this creates + */ + struct kib_peer_ni *peer_ni; + + hash_for_each_possible(kiblnd_data.kib_peers, peer_ni, + ibp_list, nid) { + LASSERT(!kiblnd_peer_idle(peer_ni)); + + /* + * Match a peer if its NID and the NID of the local NI it + * communicates over are the same. Otherwise don't match + * the peer, which will result in a new lnd peer being + * created. + */ + if (peer_ni->ibp_nid != nid || + !nid_same(&peer_ni->ibp_ni->ni_nid, &ni->ni_nid)) + continue; + + CDEBUG(D_NET, "got peer_ni [%p] -> %s (%d) version: %x\n", + peer_ni, libcfs_nid2str(nid), + kref_read(&peer_ni->ibp_kref), + peer_ni->ibp_version); + return peer_ni; + } + return NULL; +} + +void +kiblnd_unlink_peer_locked(struct kib_peer_ni *peer_ni) +{ + LASSERT(list_empty(&peer_ni->ibp_conns)); + + LASSERT(kiblnd_peer_active(peer_ni)); + hlist_del_init(&peer_ni->ibp_list); + /* lose peerlist's ref */ + kiblnd_peer_decref(peer_ni); +} + +static int +kiblnd_get_peer_info(struct lnet_ni *ni, int index, + lnet_nid_t *nidp, int *count) +{ + struct kib_peer_ni *peer_ni; + int i; + unsigned long flags; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + hash_for_each(kiblnd_data.kib_peers, i, peer_ni, ibp_list) { + LASSERT(!kiblnd_peer_idle(peer_ni)); + + if (peer_ni->ibp_ni != ni) + continue; + + if (index-- > 0) + continue; + + *nidp = peer_ni->ibp_nid; + *count = kref_read(&peer_ni->ibp_kref); + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + return 0; + } + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + return -ENOENT; +} + +static void +kiblnd_del_peer_locked(struct kib_peer_ni *peer_ni) +{ + struct kib_conn *cnxt; + struct kib_conn *conn; + + if (list_empty(&peer_ni->ibp_conns)) { + kiblnd_unlink_peer_locked(peer_ni); + } else { + list_for_each_entry_safe(conn, cnxt, &peer_ni->ibp_conns, + ibc_list) + kiblnd_close_conn_locked(conn, 0); + /* NB closing peer_ni's last conn unlinked it. */ + } + /* NB peer_ni now unlinked; might even be freed if the peer_ni table had the + * last ref on it. 
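+ * Callers must therefore not touch peer_ni once this returns.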
*/ +} + +static int +kiblnd_del_peer(struct lnet_ni *ni, lnet_nid_t nid) +{ + LIST_HEAD(zombies); + struct hlist_node *pnxt; + struct kib_peer_ni *peer_ni; + int lo; + int hi; + int i; + unsigned long flags; + int rc = -ENOENT; + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + if (nid != LNET_NID_ANY) { + lo = hash_min(nid, HASH_BITS(kiblnd_data.kib_peers)); + hi = lo; + } else { + lo = 0; + hi = HASH_SIZE(kiblnd_data.kib_peers) - 1; + } + + for (i = lo; i <= hi; i++) { + hlist_for_each_entry_safe(peer_ni, pnxt, + &kiblnd_data.kib_peers[i], ibp_list) { + LASSERT(!kiblnd_peer_idle(peer_ni)); + + if (peer_ni->ibp_ni != ni) + continue; + + if (!(nid == LNET_NID_ANY || peer_ni->ibp_nid == nid)) + continue; + + if (!list_empty(&peer_ni->ibp_tx_queue)) { + LASSERT(list_empty(&peer_ni->ibp_conns)); + + list_splice_init(&peer_ni->ibp_tx_queue, + &zombies); + } + + kiblnd_del_peer_locked(peer_ni); + rc = 0; /* matched something */ + } + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + kiblnd_txlist_done(&zombies, -EIO, LNET_MSG_STATUS_LOCAL_ERROR); + + return rc; +} + +static struct kib_conn * +kiblnd_get_conn_by_idx(struct lnet_ni *ni, int index) +{ + struct kib_peer_ni *peer_ni; + struct kib_conn *conn; + int i; + unsigned long flags; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + hash_for_each(kiblnd_data.kib_peers, i, peer_ni, ibp_list) { + LASSERT(!kiblnd_peer_idle(peer_ni)); + + if (peer_ni->ibp_ni != ni) + continue; + + list_for_each_entry(conn, &peer_ni->ibp_conns, + ibc_list) { + if (index-- > 0) + continue; + + kiblnd_conn_addref(conn); + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, + flags); + return conn; + } + } + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + return NULL; +} + +static void +kiblnd_debug_rx(struct kib_rx *rx) +{ + CDEBUG(D_CONSOLE, " %p msg_type %x cred %d\n", + rx, rx->rx_msg->ibm_type, + rx->rx_msg->ibm_credits); +} + +static void +kiblnd_debug_tx(struct kib_tx *tx) +{ + CDEBUG(D_CONSOLE, " %p snd %d q %d w %d rc %d dl %lld " + "cookie %#llx msg %s%s type %x cred %d\n", + tx, tx->tx_sending, tx->tx_queued, tx->tx_waiting, + tx->tx_status, ktime_to_ns(tx->tx_deadline), tx->tx_cookie, + tx->tx_lntmsg[0] == NULL ? "-" : "!", + tx->tx_lntmsg[1] == NULL ? 
"-" : "!", + tx->tx_msg->ibm_type, tx->tx_msg->ibm_credits); +} + +void +kiblnd_debug_conn(struct kib_conn *conn) +{ + struct list_head *tmp; + int i; + + spin_lock(&conn->ibc_lock); + + CDEBUG(D_CONSOLE, "conn[%d] %p [version %x] -> %s:\n", + atomic_read(&conn->ibc_refcount), conn, + conn->ibc_version, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + CDEBUG(D_CONSOLE, " state %d nposted %d/%d cred %d o_cred %d " + " r_cred %d\n", conn->ibc_state, conn->ibc_noops_posted, + conn->ibc_nsends_posted, conn->ibc_credits, + conn->ibc_outstanding_credits, conn->ibc_reserved_credits); + CDEBUG(D_CONSOLE, " comms_err %d\n", conn->ibc_comms_error); + + CDEBUG(D_CONSOLE, " early_rxs:\n"); + list_for_each(tmp, &conn->ibc_early_rxs) + kiblnd_debug_rx(list_entry(tmp, struct kib_rx, rx_list)); + + CDEBUG(D_CONSOLE, " tx_noops:\n"); + list_for_each(tmp, &conn->ibc_tx_noops) + kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list)); + + CDEBUG(D_CONSOLE, " tx_queue_nocred:\n"); + list_for_each(tmp, &conn->ibc_tx_queue_nocred) + kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list)); + + CDEBUG(D_CONSOLE, " tx_queue_rsrvd:\n"); + list_for_each(tmp, &conn->ibc_tx_queue_rsrvd) + kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list)); + + CDEBUG(D_CONSOLE, " tx_queue:\n"); + list_for_each(tmp, &conn->ibc_tx_queue) + kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list)); + + CDEBUG(D_CONSOLE, " active_txs:\n"); + list_for_each(tmp, &conn->ibc_active_txs) + kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list)); + + CDEBUG(D_CONSOLE, " rxs:\n"); + for (i = 0; i < IBLND_RX_MSGS(conn); i++) + kiblnd_debug_rx(&conn->ibc_rxs[i]); + + spin_unlock(&conn->ibc_lock); +} + +static void +kiblnd_setup_mtu_locked(struct rdma_cm_id *cmid) +{ + /* XXX There is no path record for iWARP, set by netdev->change_mtu? */ + if (cmid->route.path_rec == NULL) + return; + + if (*kiblnd_tunables.kib_ib_mtu) + cmid->route.path_rec->mtu = + ib_mtu_int_to_enum(*kiblnd_tunables.kib_ib_mtu); +} + +static int +kiblnd_get_completion_vector(struct kib_conn *conn, int cpt) +{ + cpumask_var_t *mask; + int vectors; + int off; + int i; + lnet_nid_t ibp_nid; + + vectors = conn->ibc_cmid->device->num_comp_vectors; + if (vectors <= 1) + return 0; + + mask = cfs_cpt_cpumask(lnet_cpt_table(), cpt); + + /* hash NID to CPU id in this partition... */ + ibp_nid = conn->ibc_peer->ibp_nid; + off = do_div(ibp_nid, cpumask_weight(*mask)); + for_each_cpu(i, *mask) { + if (off-- == 0) + return i % vectors; + } + + LBUG(); + return 1; +} + +/* + * Get the scheduler bound to this CPT. If the scheduler has no + * threads, which means that the CPT has no CPUs, then grab the + * next scheduler that we can use. + * + * This case would be triggered if a NUMA node is configured with + * no associated CPUs. + */ +static struct kib_sched_info * +kiblnd_get_scheduler(int cpt) +{ + struct kib_sched_info *sched; + int i; + + sched = kiblnd_data.kib_scheds[cpt]; + + if (sched->ibs_nthreads > 0) + return sched; + + cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) { + if (sched->ibs_nthreads > 0) { + CDEBUG(D_NET, "scheduler[%d] has no threads. 
selected scheduler[%d]\n", + cpt, sched->ibs_cpt); + return sched; + } + } + + return NULL; +} + +static unsigned int kiblnd_send_wrs(struct kib_conn *conn) +{ + /* + * One WR for the LNet message + * And ibc_max_frags for the transfer WRs + */ + int ret; + int multiplier = 1 + conn->ibc_max_frags; + + /* FastReg needs two extra WRs for map and invalidate */ + if (IS_FAST_REG_DEV(conn->ibc_hdev->ibh_dev)) + multiplier += 2; + + /* account for a maximum of ibc_queue_depth in-flight transfers */ + ret = multiplier * conn->ibc_queue_depth; + + if (ret > conn->ibc_hdev->ibh_max_qp_wr) { + CDEBUG(D_NET, "peer_credits %u will result in send work " + "request size %d larger than maximum %d device " + "can handle\n", conn->ibc_queue_depth, ret, + conn->ibc_hdev->ibh_max_qp_wr); + conn->ibc_queue_depth = + conn->ibc_hdev->ibh_max_qp_wr / multiplier; + } + + /* don't go beyond the maximum the device can handle */ + return min(ret, conn->ibc_hdev->ibh_max_qp_wr); +} + +struct kib_conn * +kiblnd_create_conn(struct kib_peer_ni *peer_ni, struct rdma_cm_id *cmid, + int state, int version) +{ + /* CAVEAT EMPTOR: + * If the new conn is created successfully it takes over the caller's + * ref on 'peer_ni'. It also "owns" 'cmid' and destroys it when it itself + * is destroyed. On failure, the caller's ref on 'peer_ni' remains and + * she must dispose of 'cmid'. (Actually I'd block forever if I tried + * to destroy 'cmid' here since I'm called from the CM which still has + * its ref on 'cmid'). */ + rwlock_t *glock = &kiblnd_data.kib_global_lock; + struct kib_net *net = peer_ni->ibp_ni->ni_data; + struct kib_dev *dev; + struct ib_qp_init_attr init_qp_attr = {}; + struct kib_sched_info *sched; +#ifdef HAVE_IB_CQ_INIT_ATTR + struct ib_cq_init_attr cq_attr = {}; +#endif + struct kib_conn *conn; + struct ib_cq *cq; + unsigned long flags; + int cpt; + int rc; + int i; + + LASSERT(net != NULL); + LASSERT(!in_interrupt()); + + dev = net->ibn_dev; + + cpt = lnet_cpt_of_nid(peer_ni->ibp_nid, peer_ni->ibp_ni); + sched = kiblnd_get_scheduler(cpt); + + if (sched == NULL) { + CERROR("no schedulers available. node is unhealthy\n"); + goto failed_0; + } + + /* + * The cpt might have changed if we ended up selecting a non cpt + * native scheduler. So use the scheduler's cpt instead. 
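+ * (A CPT is an LNet CPU partition; using the scheduler's own CPT
+ * keeps this connection's allocations and its completion handling
+ * on the same partition, preserving NUMA locality.)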
+ */ + cpt = sched->ibs_cpt; + + LIBCFS_CPT_ALLOC(conn, lnet_cpt_table(), cpt, sizeof(*conn)); + if (conn == NULL) { + CERROR("Can't allocate connection for %s\n", + libcfs_nid2str(peer_ni->ibp_nid)); + goto failed_0; + } + + conn->ibc_state = IBLND_CONN_INIT; + conn->ibc_version = version; + conn->ibc_peer = peer_ni; /* I take the caller's ref */ + cmid->context = conn; /* for future CM callbacks */ + conn->ibc_cmid = cmid; + conn->ibc_max_frags = peer_ni->ibp_max_frags; + conn->ibc_queue_depth = peer_ni->ibp_queue_depth; + conn->ibc_rxs = NULL; + conn->ibc_rx_pages = NULL; + + INIT_LIST_HEAD(&conn->ibc_early_rxs); + INIT_LIST_HEAD(&conn->ibc_tx_noops); + INIT_LIST_HEAD(&conn->ibc_tx_queue); + INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd); + INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred); + INIT_LIST_HEAD(&conn->ibc_active_txs); + INIT_LIST_HEAD(&conn->ibc_zombie_txs); + spin_lock_init(&conn->ibc_lock); + + LIBCFS_CPT_ALLOC(conn->ibc_connvars, lnet_cpt_table(), cpt, + sizeof(*conn->ibc_connvars)); + if (conn->ibc_connvars == NULL) { + CERROR("Can't allocate in-progress connection state\n"); + goto failed_2; + } + + write_lock_irqsave(glock, flags); + if (dev->ibd_failover) { + write_unlock_irqrestore(glock, flags); + CERROR("%s: failover in progress\n", dev->ibd_ifname); + goto failed_2; + } + + if (dev->ibd_hdev->ibh_ibdev != cmid->device) { + /* wakeup failover thread and teardown connection */ + if (kiblnd_dev_can_failover(dev)) { + list_add_tail(&dev->ibd_fail_list, + &kiblnd_data.kib_failed_devs); + wake_up(&kiblnd_data.kib_failover_waitq); + } + + write_unlock_irqrestore(glock, flags); + CERROR("cmid HCA(%s), kib_dev(%s) need failover\n", + cmid->device->name, dev->ibd_ifname); + goto failed_2; + } + + kiblnd_hdev_addref_locked(dev->ibd_hdev); + conn->ibc_hdev = dev->ibd_hdev; + + kiblnd_setup_mtu_locked(cmid); + + write_unlock_irqrestore(glock, flags); + +#ifdef HAVE_IB_CQ_INIT_ATTR + cq_attr.cqe = IBLND_CQ_ENTRIES(conn); + cq_attr.comp_vector = kiblnd_get_completion_vector(conn, cpt); + cq = ib_create_cq(cmid->device, + kiblnd_cq_completion, kiblnd_cq_event, conn, + &cq_attr); +#else + cq = ib_create_cq(cmid->device, + kiblnd_cq_completion, kiblnd_cq_event, conn, + IBLND_CQ_ENTRIES(conn), + kiblnd_get_completion_vector(conn, cpt)); +#endif + if (IS_ERR(cq)) { + /* + * on MLX-5 (possibly MLX-4 as well) this error could be + * hit if the concurrent_sends and/or peer_tx_credits is set + * too high. 
Or due to an MLX-5 bug which tries to + * allocate 256kb via kmalloc for WR cookie array + */ + CERROR("Failed to create CQ with %d CQEs: %ld\n", + IBLND_CQ_ENTRIES(conn), PTR_ERR(cq)); + goto failed_2; + } + + conn->ibc_cq = cq; + + rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + if (rc != 0) { + CERROR("Can't request completion notification: %d\n", rc); + goto failed_2; + } + + init_qp_attr.event_handler = kiblnd_qp_event; + init_qp_attr.qp_context = conn; + init_qp_attr.cap.max_send_sge = *kiblnd_tunables.kib_wrq_sge; + init_qp_attr.cap.max_recv_sge = 1; + init_qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + init_qp_attr.qp_type = IB_QPT_RC; + init_qp_attr.send_cq = cq; + init_qp_attr.recv_cq = cq; + + if (peer_ni->ibp_queue_depth_mod && + peer_ni->ibp_queue_depth_mod < peer_ni->ibp_queue_depth) { + conn->ibc_queue_depth = peer_ni->ibp_queue_depth_mod; + CDEBUG(D_NET, "Use reduced queue depth %u (from %u)\n", + peer_ni->ibp_queue_depth_mod, + peer_ni->ibp_queue_depth); + } + + do { + /* kiblnd_send_wrs() can change the connection's queue depth if + * the maximum work requests for the device is maxed out + */ + init_qp_attr.cap.max_send_wr = kiblnd_send_wrs(conn); + init_qp_attr.cap.max_recv_wr = IBLND_RECV_WRS(conn); + rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, + &init_qp_attr); + if (rc != -ENOMEM || conn->ibc_queue_depth < 2) + break; + conn->ibc_queue_depth--; + } while (rc); + + if (rc) { + CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d, " + "send_sge: %d, recv_sge: %d\n", + rc, init_qp_attr.cap.max_send_wr, + init_qp_attr.cap.max_recv_wr, + init_qp_attr.cap.max_send_sge, + init_qp_attr.cap.max_recv_sge); + goto failed_2; + } + + conn->ibc_sched = sched; + + if (!peer_ni->ibp_queue_depth_mod && + conn->ibc_queue_depth != peer_ni->ibp_queue_depth) { + CWARN("peer %s - queue depth reduced from %u to %u" + " to allow for qp creation\n", + libcfs_nid2str(peer_ni->ibp_nid), + peer_ni->ibp_queue_depth, + conn->ibc_queue_depth); + peer_ni->ibp_queue_depth_mod = conn->ibc_queue_depth; + } + + LIBCFS_CPT_ALLOC(conn->ibc_rxs, lnet_cpt_table(), cpt, + IBLND_RX_MSGS(conn) * sizeof(struct kib_rx)); + if (conn->ibc_rxs == NULL) { + CERROR("Cannot allocate RX buffers\n"); + goto failed_2; + } + + rc = kiblnd_alloc_pages(&conn->ibc_rx_pages, cpt, + IBLND_RX_MSG_PAGES(conn)); + if (rc != 0) + goto failed_2; + + kiblnd_map_rx_descs(conn); + + /* 1 ref for caller and each rxmsg */ + atomic_set(&conn->ibc_refcount, 1 + IBLND_RX_MSGS(conn)); + conn->ibc_nrx = IBLND_RX_MSGS(conn); + + /* post receives */ + for (i = 0; i < IBLND_RX_MSGS(conn); i++) { + rc = kiblnd_post_rx(&conn->ibc_rxs[i], IBLND_POSTRX_NO_CREDIT); + if (rc != 0) { + CERROR("Can't post rxmsg: %d\n", rc); + + /* Make posted receives complete */ + kiblnd_abort_receives(conn); + + /* correct # of posted buffers + * NB locking needed now I'm racing with completion */ + spin_lock_irqsave(&sched->ibs_lock, flags); + conn->ibc_nrx -= IBLND_RX_MSGS(conn) - i; + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + /* cmid will be destroyed by CM(ofed) after cm_callback + * returned, so we can't refer it anymore + * (by kiblnd_connd()->kiblnd_destroy_conn) */ + rdma_destroy_qp(conn->ibc_cmid); + conn->ibc_cmid = NULL; + + /* Drop my own and unused rxbuffer refcounts */ + while (i++ <= IBLND_RX_MSGS(conn)) + kiblnd_conn_decref(conn); + + return NULL; + } + } + + /* Init successful! 
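+ * The connection now holds one reference for the caller plus one
+ * per posted receive (ibc_refcount was initialised above); the
+ * per-rx references are dropped as those receives are completed
+ * or flushed.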
*/ + LASSERT (state == IBLND_CONN_ACTIVE_CONNECT || + state == IBLND_CONN_PASSIVE_WAIT); + conn->ibc_state = state; + + /* 1 more conn */ + atomic_inc(&net->ibn_nconns); + return conn; + + failed_2: + kiblnd_destroy_conn(conn); + LIBCFS_FREE(conn, sizeof(*conn)); + failed_0: + return NULL; +} + +void +kiblnd_destroy_conn(struct kib_conn *conn) +{ + struct rdma_cm_id *cmid = conn->ibc_cmid; + struct kib_peer_ni *peer_ni = conn->ibc_peer; + + LASSERT (!in_interrupt()); + LASSERT (atomic_read(&conn->ibc_refcount) == 0); + LASSERT(list_empty(&conn->ibc_early_rxs)); + LASSERT(list_empty(&conn->ibc_tx_noops)); + LASSERT(list_empty(&conn->ibc_tx_queue)); + LASSERT(list_empty(&conn->ibc_tx_queue_rsrvd)); + LASSERT(list_empty(&conn->ibc_tx_queue_nocred)); + LASSERT(list_empty(&conn->ibc_active_txs)); + LASSERT (conn->ibc_noops_posted == 0); + LASSERT (conn->ibc_nsends_posted == 0); + + switch (conn->ibc_state) { + default: + /* conn must be completely disengaged from the network */ + LBUG(); + + case IBLND_CONN_DISCONNECTED: + /* connvars should have been freed already */ + LASSERT (conn->ibc_connvars == NULL); + break; + + case IBLND_CONN_INIT: + break; + } + + /* conn->ibc_cmid might be destroyed by CM already */ + if (cmid != NULL && cmid->qp != NULL) + rdma_destroy_qp(cmid); + + if (conn->ibc_cq) + ib_destroy_cq(conn->ibc_cq); + + kiblnd_txlist_done(&conn->ibc_zombie_txs, -ECONNABORTED, + LNET_MSG_STATUS_OK); + + if (conn->ibc_rx_pages != NULL) + kiblnd_unmap_rx_descs(conn); + + if (conn->ibc_rxs != NULL) + CFS_FREE_PTR_ARRAY(conn->ibc_rxs, IBLND_RX_MSGS(conn)); + + if (conn->ibc_connvars != NULL) + LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); + + if (conn->ibc_hdev != NULL) + kiblnd_hdev_decref(conn->ibc_hdev); + + /* See CAVEAT EMPTOR above in kiblnd_create_conn */ + if (conn->ibc_state != IBLND_CONN_INIT) { + struct kib_net *net = peer_ni->ibp_ni->ni_data; + + kiblnd_peer_decref(peer_ni); + rdma_destroy_id(cmid); + atomic_dec(&net->ibn_nconns); + } +} + +int +kiblnd_close_peer_conns_locked(struct kib_peer_ni *peer_ni, int why) +{ + struct kib_conn *conn; + struct kib_conn *cnxt; + int count = 0; + + list_for_each_entry_safe(conn, cnxt, &peer_ni->ibp_conns, + ibc_list) { + CDEBUG(D_NET, "Closing conn -> %s, " + "version: %x, reason: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), + conn->ibc_version, why); + + kiblnd_close_conn_locked(conn, why); + count++; + } + + return count; +} + +int +kiblnd_close_stale_conns_locked(struct kib_peer_ni *peer_ni, + int version, __u64 incarnation) +{ + struct kib_conn *conn; + struct kib_conn *cnxt; + int count = 0; + + list_for_each_entry_safe(conn, cnxt, &peer_ni->ibp_conns, + ibc_list) { + if (conn->ibc_version == version && + conn->ibc_incarnation == incarnation) + continue; + + CDEBUG(D_NET, "Closing stale conn -> %s version: %x, " + "incarnation:%#llx(%x, %#llx)\n", + libcfs_nid2str(peer_ni->ibp_nid), + conn->ibc_version, conn->ibc_incarnation, + version, incarnation); + + kiblnd_close_conn_locked(conn, -ESTALE); + count++; + } + + return count; +} + +static int +kiblnd_close_matching_conns(struct lnet_ni *ni, lnet_nid_t nid) +{ + struct kib_peer_ni *peer_ni; + struct hlist_node *pnxt; + int lo; + int hi; + int i; + unsigned long flags; + int count = 0; + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + if (nid != LNET_NID_ANY) { + lo = hash_min(nid, HASH_BITS(kiblnd_data.kib_peers)); + hi = lo; + } else { + lo = 0; + hi = HASH_SIZE(kiblnd_data.kib_peers) - 1; + } + + for (i = lo; i <= hi; i++) { + 
hlist_for_each_entry_safe(peer_ni, pnxt, + &kiblnd_data.kib_peers[i], ibp_list) { + LASSERT(!kiblnd_peer_idle(peer_ni)); + + if (peer_ni->ibp_ni != ni) + continue; + + if (!(nid == LNET_NID_ANY || nid == peer_ni->ibp_nid)) + continue; + + count += kiblnd_close_peer_conns_locked(peer_ni, 0); + } + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + /* wildcards always succeed */ + if (nid == LNET_NID_ANY) + return 0; + + return (count == 0) ? -ENOENT : 0; +} + +static int +kiblnd_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg) +{ + struct libcfs_ioctl_data *data = arg; + int rc = -EINVAL; + + switch(cmd) { + case IOC_LIBCFS_GET_PEER: { + lnet_nid_t nid = 0; + int count = 0; + + rc = kiblnd_get_peer_info(ni, data->ioc_count, + &nid, &count); + data->ioc_nid = nid; + data->ioc_count = count; + break; + } + + case IOC_LIBCFS_DEL_PEER: { + rc = kiblnd_del_peer(ni, data->ioc_nid); + break; + } + case IOC_LIBCFS_GET_CONN: { + struct kib_conn *conn; + + rc = 0; + conn = kiblnd_get_conn_by_idx(ni, data->ioc_count); + if (conn == NULL) { + rc = -ENOENT; + break; + } + + LASSERT(conn->ibc_cmid != NULL); + data->ioc_nid = conn->ibc_peer->ibp_nid; + if (conn->ibc_cmid->route.path_rec == NULL) + data->ioc_u32[0] = 0; /* iWarp has no path MTU */ + else + data->ioc_u32[0] = + ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu); + kiblnd_conn_decref(conn); + break; + } + case IOC_LIBCFS_CLOSE_CONNECTION: { + rc = kiblnd_close_matching_conns(ni, data->ioc_nid); + break; + } + + default: + break; + } + + return rc; +} + +static void +kiblnd_free_pages(struct kib_pages *p) +{ + int npages = p->ibp_npages; + int i; + + for (i = 0; i < npages; i++) { + if (p->ibp_pages[i] != NULL) + __free_page(p->ibp_pages[i]); + } + + LIBCFS_FREE(p, offsetof(struct kib_pages, ibp_pages[npages])); +} + +int +kiblnd_alloc_pages(struct kib_pages **pp, int cpt, int npages) +{ + struct kib_pages *p; + int i; + + LIBCFS_CPT_ALLOC(p, lnet_cpt_table(), cpt, + offsetof(struct kib_pages, ibp_pages[npages])); + if (p == NULL) { + CERROR("Can't allocate descriptor for %d pages\n", npages); + return -ENOMEM; + } + + memset(p, 0, offsetof(struct kib_pages, ibp_pages[npages])); + p->ibp_npages = npages; + + for (i = 0; i < npages; i++) { + p->ibp_pages[i] = cfs_page_cpt_alloc(lnet_cpt_table(), cpt, + GFP_NOFS); + if (p->ibp_pages[i] == NULL) { + CERROR("Can't allocate page %d of %d\n", i, npages); + kiblnd_free_pages(p); + return -ENOMEM; + } + } + + *pp = p; + return 0; +} + +void +kiblnd_unmap_rx_descs(struct kib_conn *conn) +{ + struct kib_rx *rx; + int i; + + LASSERT (conn->ibc_rxs != NULL); + LASSERT (conn->ibc_hdev != NULL); + + for (i = 0; i < IBLND_RX_MSGS(conn); i++) { + rx = &conn->ibc_rxs[i]; + + LASSERT(rx->rx_nob >= 0); /* not posted */ + + kiblnd_dma_unmap_single(conn->ibc_hdev->ibh_ibdev, + KIBLND_UNMAP_ADDR(rx, rx_msgunmap, + rx->rx_msgaddr), + IBLND_MSG_SIZE, DMA_FROM_DEVICE); + } + + kiblnd_free_pages(conn->ibc_rx_pages); + + conn->ibc_rx_pages = NULL; +} + +void +kiblnd_map_rx_descs(struct kib_conn *conn) +{ + struct kib_rx *rx; + struct page *pg; + int pg_off; + int ipg; + int i; + + for (pg_off = ipg = i = 0; i < IBLND_RX_MSGS(conn); i++) { + pg = conn->ibc_rx_pages->ibp_pages[ipg]; + rx = &conn->ibc_rxs[i]; + + rx->rx_conn = conn; + rx->rx_msg = (struct kib_msg *)(((char *)page_address(pg)) + pg_off); + + rx->rx_msgaddr = + kiblnd_dma_map_single(conn->ibc_hdev->ibh_ibdev, + rx->rx_msg, IBLND_MSG_SIZE, + DMA_FROM_DEVICE); + LASSERT(!kiblnd_dma_mapping_error(conn->ibc_hdev->ibh_ibdev, + 
rx->rx_msgaddr)); + KIBLND_UNMAP_ADDR_SET(rx, rx_msgunmap, rx->rx_msgaddr); + + CDEBUG(D_NET, "rx %d: %p %#llx(%#llx)\n", + i, rx->rx_msg, rx->rx_msgaddr, + (__u64)(page_to_phys(pg) + pg_off)); + + pg_off += IBLND_MSG_SIZE; + LASSERT(pg_off <= PAGE_SIZE); + + if (pg_off == PAGE_SIZE) { + pg_off = 0; + ipg++; + LASSERT(ipg <= IBLND_RX_MSG_PAGES(conn)); + } + } +} + +static void +kiblnd_unmap_tx_pool(struct kib_tx_pool *tpo) +{ + struct kib_hca_dev *hdev = tpo->tpo_hdev; + struct kib_tx *tx; + int i; + + LASSERT (tpo->tpo_pool.po_allocated == 0); + + if (hdev == NULL) + return; + + for (i = 0; i < tpo->tpo_pool.po_size; i++) { + tx = &tpo->tpo_tx_descs[i]; + kiblnd_dma_unmap_single(hdev->ibh_ibdev, + KIBLND_UNMAP_ADDR(tx, tx_msgunmap, + tx->tx_msgaddr), + IBLND_MSG_SIZE, DMA_TO_DEVICE); + } + + kiblnd_hdev_decref(hdev); + tpo->tpo_hdev = NULL; +} + +static struct kib_hca_dev * +kiblnd_current_hdev(struct kib_dev *dev) +{ + struct kib_hca_dev *hdev; + unsigned long flags; + int i = 0; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + while (dev->ibd_failover) { + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + if (i++ % 50 == 0) + CDEBUG(D_NET, "%s: Wait for failover\n", + dev->ibd_ifname); + schedule_timeout_interruptible(cfs_time_seconds(1) / 100); + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + } + + kiblnd_hdev_addref_locked(dev->ibd_hdev); + hdev = dev->ibd_hdev; + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + return hdev; +} + +static void +kiblnd_map_tx_pool(struct kib_tx_pool *tpo) +{ + struct kib_pages *txpgs = tpo->tpo_tx_pages; + struct kib_pool *pool = &tpo->tpo_pool; + struct kib_net *net = pool->po_owner->ps_net; + struct kib_dev *dev; + struct page *page; + struct kib_tx *tx; + int page_offset; + int ipage; + int i; + + LASSERT (net != NULL); + + dev = net->ibn_dev; + + /* pre-mapped messages are not bigger than 1 page */ + BUILD_BUG_ON(IBLND_MSG_SIZE > PAGE_SIZE); + + /* No fancy arithmetic when we do the buffer calculations */ + BUILD_BUG_ON(PAGE_SIZE % IBLND_MSG_SIZE != 0); + + tpo->tpo_hdev = kiblnd_current_hdev(dev); + + for (ipage = page_offset = i = 0; i < pool->po_size; i++) { + page = txpgs->ibp_pages[ipage]; + tx = &tpo->tpo_tx_descs[i]; + + tx->tx_msg = (struct kib_msg *)(((char *)page_address(page)) + + page_offset); + + tx->tx_msgaddr = kiblnd_dma_map_single(tpo->tpo_hdev->ibh_ibdev, + tx->tx_msg, + IBLND_MSG_SIZE, + DMA_TO_DEVICE); + LASSERT(!kiblnd_dma_mapping_error(tpo->tpo_hdev->ibh_ibdev, + tx->tx_msgaddr)); + KIBLND_UNMAP_ADDR_SET(tx, tx_msgunmap, tx->tx_msgaddr); + + list_add(&tx->tx_list, &pool->po_free_list); + + page_offset += IBLND_MSG_SIZE; + LASSERT(page_offset <= PAGE_SIZE); + + if (page_offset == PAGE_SIZE) { + page_offset = 0; + ipage++; + LASSERT(ipage <= txpgs->ibp_npages); + } + } +} + +static void +kiblnd_destroy_fmr_pool(struct kib_fmr_pool *fpo) +{ + LASSERT(fpo->fpo_map_count == 0); + +#ifdef HAVE_FMR_POOL_API + if (fpo->fpo_is_fmr && fpo->fmr.fpo_fmr_pool) { + ib_destroy_fmr_pool(fpo->fmr.fpo_fmr_pool); + } else +#endif /* HAVE_FMR_POOL_API */ + { + struct kib_fast_reg_descriptor *frd, *tmp; + int i = 0; + + list_for_each_entry_safe(frd, tmp, &fpo->fast_reg.fpo_pool_list, + frd_list) { + list_del(&frd->frd_list); +#ifndef HAVE_IB_MAP_MR_SG + ib_free_fast_reg_page_list(frd->frd_frpl); +#endif + ib_dereg_mr(frd->frd_mr); + LIBCFS_FREE(frd, sizeof(*frd)); + i++; + } + if (i < fpo->fast_reg.fpo_pool_size) + CERROR("FastReg pool still has %d regions registered\n", + 
fpo->fast_reg.fpo_pool_size - i); + } + + if (fpo->fpo_hdev) + kiblnd_hdev_decref(fpo->fpo_hdev); + + LIBCFS_FREE(fpo, sizeof(*fpo)); +} + +static void +kiblnd_destroy_fmr_pool_list(struct list_head *head) +{ + struct kib_fmr_pool *fpo, *tmp; + + list_for_each_entry_safe(fpo, tmp, head, fpo_list) { + list_del(&fpo->fpo_list); + kiblnd_destroy_fmr_pool(fpo); + } +} + +static int +kiblnd_fmr_pool_size(struct lnet_ioctl_config_o2iblnd_tunables *tunables, + int ncpts) +{ + int size = tunables->lnd_fmr_pool_size / ncpts; + + return max(IBLND_FMR_POOL, size); +} + +static int +kiblnd_fmr_flush_trigger(struct lnet_ioctl_config_o2iblnd_tunables *tunables, + int ncpts) +{ + int size = tunables->lnd_fmr_flush_trigger / ncpts; + + return max(IBLND_FMR_POOL_FLUSH, size); +} + +#ifdef HAVE_FMR_POOL_API +static int kiblnd_alloc_fmr_pool(struct kib_fmr_poolset *fps, + struct kib_fmr_pool *fpo) +{ + struct ib_fmr_pool_param param = { + .max_pages_per_fmr = IBLND_MAX_RDMA_FRAGS, + .page_shift = PAGE_SHIFT, + .access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE), + .pool_size = fps->fps_pool_size, + .dirty_watermark = fps->fps_flush_trigger, + .flush_function = NULL, + .flush_arg = NULL, + .cache = !!fps->fps_cache }; + int rc = 0; + + fpo->fmr.fpo_fmr_pool = ib_create_fmr_pool(fpo->fpo_hdev->ibh_pd, + ¶m); + if (IS_ERR(fpo->fmr.fpo_fmr_pool)) { + rc = PTR_ERR(fpo->fmr.fpo_fmr_pool); + if (rc != -ENOSYS) + CERROR("Failed to create FMR pool: %d\n", rc); + else + CERROR("FMRs are not supported\n"); + } + fpo->fpo_is_fmr = true; + + return rc; +} +#endif /* HAVE_FMR_POOL_API */ + +static int kiblnd_alloc_freg_pool(struct kib_fmr_poolset *fps, + struct kib_fmr_pool *fpo, + enum kib_dev_caps dev_caps) +{ + struct kib_fast_reg_descriptor *frd, *tmp; + int i, rc; + +#ifdef HAVE_FMR_POOL_API + fpo->fpo_is_fmr = false; +#endif + + INIT_LIST_HEAD(&fpo->fast_reg.fpo_pool_list); + fpo->fast_reg.fpo_pool_size = 0; + for (i = 0; i < fps->fps_pool_size; i++) { + LIBCFS_CPT_ALLOC(frd, lnet_cpt_table(), fps->fps_cpt, + sizeof(*frd)); + if (!frd) { + CERROR("Failed to allocate a new fast_reg descriptor\n"); + rc = -ENOMEM; + goto out; + } + frd->frd_mr = NULL; + +#ifndef HAVE_IB_MAP_MR_SG + frd->frd_frpl = ib_alloc_fast_reg_page_list(fpo->fpo_hdev->ibh_ibdev, + IBLND_MAX_RDMA_FRAGS); + if (IS_ERR(frd->frd_frpl)) { + rc = PTR_ERR(frd->frd_frpl); + CERROR("Failed to allocate ib_fast_reg_page_list: %d\n", + rc); + frd->frd_frpl = NULL; + goto out_middle; + } +#endif + +#ifdef HAVE_IB_ALLOC_FAST_REG_MR + frd->frd_mr = ib_alloc_fast_reg_mr(fpo->fpo_hdev->ibh_pd, + IBLND_MAX_RDMA_FRAGS); +#else + /* + * it is expected to get here if this is an MLX-5 card. + * MLX-4 cards will always use FMR and MLX-5 cards will + * always use fast_reg. It turns out that some MLX-5 cards + * (possibly due to older FW versions) do not natively support + * gaps. So we will need to track them here. + */ + frd->frd_mr = ib_alloc_mr(fpo->fpo_hdev->ibh_pd, +#ifdef IB_MR_TYPE_SG_GAPS + ((*kiblnd_tunables.kib_use_fastreg_gaps == 1) && + (dev_caps & IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT)) ? 
+ IB_MR_TYPE_SG_GAPS :
+ IB_MR_TYPE_MEM_REG,
+#else
+ IB_MR_TYPE_MEM_REG,
+#endif
+ IBLND_MAX_RDMA_FRAGS);
+ if ((*kiblnd_tunables.kib_use_fastreg_gaps == 1) &&
+ (dev_caps & IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT))
+ CWARN("using IB_MR_TYPE_SG_GAPS, expect a performance drop\n");
+#endif
+ if (IS_ERR(frd->frd_mr)) {
+ rc = PTR_ERR(frd->frd_mr);
+ CERROR("Failed to allocate ib_fast_reg_mr: %d\n", rc);
+ frd->frd_mr = NULL;
+ goto out_middle;
+ }
+
+ /* indicate that the local invalidate needs to be generated */
+ frd->frd_valid = false;
+
+ list_add_tail(&frd->frd_list, &fpo->fast_reg.fpo_pool_list);
+ fpo->fast_reg.fpo_pool_size++;
+ }
+
+ return 0;
+
+out_middle:
+ if (frd->frd_mr)
+ ib_dereg_mr(frd->frd_mr);
+#ifndef HAVE_IB_MAP_MR_SG
+ if (frd->frd_frpl)
+ ib_free_fast_reg_page_list(frd->frd_frpl);
+#endif
+ LIBCFS_FREE(frd, sizeof(*frd));
+
+out:
+ list_for_each_entry_safe(frd, tmp, &fpo->fast_reg.fpo_pool_list,
+ frd_list) {
+ list_del(&frd->frd_list);
+#ifndef HAVE_IB_MAP_MR_SG
+ ib_free_fast_reg_page_list(frd->frd_frpl);
+#endif
+ ib_dereg_mr(frd->frd_mr);
+ LIBCFS_FREE(frd, sizeof(*frd));
+ }
+
+ return rc;
+}
+
+static int kiblnd_create_fmr_pool(struct kib_fmr_poolset *fps,
+ struct kib_fmr_pool **pp_fpo)
+{
+ struct kib_dev *dev = fps->fps_net->ibn_dev;
+ struct kib_fmr_pool *fpo;
+ int rc;
+
+ LIBCFS_CPT_ALLOC(fpo, lnet_cpt_table(), fps->fps_cpt, sizeof(*fpo));
+ if (!fpo) {
+ return -ENOMEM;
+ }
+ memset(fpo, 0, sizeof(*fpo));
+
+ fpo->fpo_hdev = kiblnd_current_hdev(dev);
+
+#ifdef HAVE_FMR_POOL_API
+ if (dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED)
+ rc = kiblnd_alloc_fmr_pool(fps, fpo);
+ else
+#endif /* HAVE_FMR_POOL_API */
+ rc = kiblnd_alloc_freg_pool(fps, fpo, dev->ibd_dev_caps);
+ if (rc)
+ goto out_fpo;
+
+ fpo->fpo_deadline = ktime_get_seconds() + IBLND_POOL_DEADLINE;
+ fpo->fpo_owner = fps;
+ *pp_fpo = fpo;
+
+ return 0;
+
+out_fpo:
+ kiblnd_hdev_decref(fpo->fpo_hdev);
+ LIBCFS_FREE(fpo, sizeof(*fpo));
+ return rc;
+}
+
+static void
+kiblnd_fail_fmr_poolset(struct kib_fmr_poolset *fps, struct list_head *zombies)
+{
+ struct kib_fmr_pool *fpo;
+
+ if (fps->fps_net == NULL) /* initialized? */
+ return;
+
+ spin_lock(&fps->fps_lock);
+
+ while ((fpo = list_first_entry_or_null(&fps->fps_pool_list,
+ struct kib_fmr_pool,
+ fpo_list)) != NULL) {
+ fpo->fpo_failed = 1;
+ if (fpo->fpo_map_count == 0)
+ list_move(&fpo->fpo_list, zombies);
+ else
+ list_move(&fpo->fpo_list, &fps->fps_failed_pool_list);
+ }
+
+ spin_unlock(&fps->fps_lock);
+}
+
+static void
+kiblnd_fini_fmr_poolset(struct kib_fmr_poolset *fps)
+{
+ if (fps->fps_net != NULL) { /* initialized? 
*/ + kiblnd_destroy_fmr_pool_list(&fps->fps_failed_pool_list); + kiblnd_destroy_fmr_pool_list(&fps->fps_pool_list); + } +} + +static int +kiblnd_init_fmr_poolset(struct kib_fmr_poolset *fps, int cpt, int ncpts, + struct kib_net *net, + struct lnet_ioctl_config_o2iblnd_tunables *tunables) +{ + struct kib_fmr_pool *fpo; + int rc; + + memset(fps, 0, sizeof(struct kib_fmr_poolset)); + + fps->fps_net = net; + fps->fps_cpt = cpt; + + fps->fps_pool_size = kiblnd_fmr_pool_size(tunables, ncpts); + fps->fps_flush_trigger = kiblnd_fmr_flush_trigger(tunables, ncpts); + fps->fps_cache = tunables->lnd_fmr_cache; + + spin_lock_init(&fps->fps_lock); + INIT_LIST_HEAD(&fps->fps_pool_list); + INIT_LIST_HEAD(&fps->fps_failed_pool_list); + + rc = kiblnd_create_fmr_pool(fps, &fpo); + if (rc == 0) + list_add_tail(&fpo->fpo_list, &fps->fps_pool_list); + + return rc; +} + +static int +kiblnd_fmr_pool_is_idle(struct kib_fmr_pool *fpo, time64_t now) +{ + if (fpo->fpo_map_count != 0) /* still in use */ + return 0; + if (fpo->fpo_failed) + return 1; + return now >= fpo->fpo_deadline; +} + +#if defined(HAVE_FMR_POOL_API) || !defined(HAVE_IB_MAP_MR_SG) +static int +kiblnd_map_tx_pages(struct kib_tx *tx, struct kib_rdma_desc *rd) +{ + struct kib_hca_dev *hdev; + __u64 *pages = tx->tx_pages; + int npages; + int size; + int i; + + hdev = tx->tx_pool->tpo_hdev; + + for (i = 0, npages = 0; i < rd->rd_nfrags; i++) { + for (size = 0; size < rd->rd_frags[i].rf_nob; + size += hdev->ibh_page_size) { + pages[npages++] = (rd->rd_frags[i].rf_addr & + hdev->ibh_page_mask) + size; + } + } + + return npages; +} +#endif + +void +kiblnd_fmr_pool_unmap(struct kib_fmr *fmr, int status) +{ + LIST_HEAD(zombies); + struct kib_fmr_pool *fpo = fmr->fmr_pool; + struct kib_fmr_poolset *fps; + time64_t now = ktime_get_seconds(); + struct kib_fmr_pool *tmp; + + if (!fpo) + return; + + fps = fpo->fpo_owner; + +#ifdef HAVE_FMR_POOL_API + if (fpo->fpo_is_fmr) { + if (fmr->fmr_pfmr) { + ib_fmr_pool_unmap(fmr->fmr_pfmr); + fmr->fmr_pfmr = NULL; + } + + if (status) { + int rc = ib_flush_fmr_pool(fpo->fmr.fpo_fmr_pool); + LASSERT(!rc); + } + } else +#endif /* HAVE_FMR_POOL_API */ + { + struct kib_fast_reg_descriptor *frd = fmr->fmr_frd; + if (frd) { + frd->frd_posted = false; + fmr->fmr_frd = NULL; + spin_lock(&fps->fps_lock); + list_add_tail(&frd->frd_list, &fpo->fast_reg.fpo_pool_list); + spin_unlock(&fps->fps_lock); + } + } + fmr->fmr_pool = NULL; + + spin_lock(&fps->fps_lock); + fpo->fpo_map_count--; /* decref the pool */ + + list_for_each_entry_safe(fpo, tmp, &fps->fps_pool_list, fpo_list) { + /* the first pool is persistent */ + if (fps->fps_pool_list.next == &fpo->fpo_list) + continue; + + if (kiblnd_fmr_pool_is_idle(fpo, now)) { + list_move(&fpo->fpo_list, &zombies); + fps->fps_version++; + } + } + spin_unlock(&fps->fps_lock); + + if (!list_empty(&zombies)) + kiblnd_destroy_fmr_pool_list(&zombies); +} + +int kiblnd_fmr_pool_map(struct kib_fmr_poolset *fps, struct kib_tx *tx, + struct kib_rdma_desc *rd, u32 nob, u64 iov, + struct kib_fmr *fmr) +{ + struct kib_fmr_pool *fpo; + __u64 version; + bool is_rx = (rd != tx->tx_rd); +#ifdef HAVE_FMR_POOL_API + __u64 *pages = tx->tx_pages; + bool tx_pages_mapped = false; + int npages = 0; +#endif + int rc; + +again: + spin_lock(&fps->fps_lock); + version = fps->fps_version; + list_for_each_entry(fpo, &fps->fps_pool_list, fpo_list) { + fpo->fpo_deadline = ktime_get_seconds() + IBLND_POOL_DEADLINE; + fpo->fpo_map_count++; + +#ifdef HAVE_FMR_POOL_API + fmr->fmr_pfmr = NULL; + if (fpo->fpo_is_fmr) { + struct 
ib_pool_fmr *pfmr; + + spin_unlock(&fps->fps_lock); + + if (!tx_pages_mapped) { + npages = kiblnd_map_tx_pages(tx, rd); + tx_pages_mapped = true; + } + + pfmr = kib_fmr_pool_map(fpo->fmr.fpo_fmr_pool, + pages, npages, iov); + if (likely(!IS_ERR(pfmr))) { + fmr->fmr_key = is_rx ? pfmr->fmr->rkey + : pfmr->fmr->lkey; + fmr->fmr_frd = NULL; + fmr->fmr_pfmr = pfmr; + fmr->fmr_pool = fpo; + return 0; + } + rc = PTR_ERR(pfmr); + } else +#endif /* HAVE_FMR_POOL_API */ + { + if (!list_empty(&fpo->fast_reg.fpo_pool_list)) { + struct kib_fast_reg_descriptor *frd; +#ifdef HAVE_IB_MAP_MR_SG + struct ib_reg_wr *wr; + int n; +#else + struct ib_rdma_wr *wr; + struct ib_fast_reg_page_list *frpl; +#endif + struct ib_mr *mr; + + frd = list_first_entry( + &fpo->fast_reg.fpo_pool_list, + struct kib_fast_reg_descriptor, + frd_list); + list_del(&frd->frd_list); + spin_unlock(&fps->fps_lock); + +#ifndef HAVE_IB_MAP_MR_SG + frpl = frd->frd_frpl; +#endif + mr = frd->frd_mr; + + if (!frd->frd_valid) { + struct ib_rdma_wr *inv_wr; + __u32 key = is_rx ? mr->rkey : mr->lkey; + + frd->frd_valid = true; + inv_wr = &frd->frd_inv_wr; + memset(inv_wr, 0, sizeof(*inv_wr)); + + inv_wr->wr.opcode = IB_WR_LOCAL_INV; + inv_wr->wr.wr_id = IBLND_WID_MR; + inv_wr->wr.ex.invalidate_rkey = key; + + /* Bump the key */ + key = ib_inc_rkey(key); + ib_update_fast_reg_key(mr, key); + } + +#ifdef HAVE_IB_MAP_MR_SG +#ifdef HAVE_IB_MAP_MR_SG_5ARGS + n = ib_map_mr_sg(mr, tx->tx_frags, + rd->rd_nfrags, NULL, PAGE_SIZE); +#else + n = ib_map_mr_sg(mr, tx->tx_frags, + rd->rd_nfrags, PAGE_SIZE); +#endif /* HAVE_IB_MAP_MR_SG_5ARGS */ + if (unlikely(n != rd->rd_nfrags)) { + CERROR("Failed to map mr %d/%d elements\n", + n, rd->rd_nfrags); + return n < 0 ? n : -EINVAL; + } + + wr = &frd->frd_fastreg_wr; + memset(wr, 0, sizeof(*wr)); + + wr->wr.opcode = IB_WR_REG_MR; + wr->wr.wr_id = IBLND_WID_MR; + wr->wr.num_sge = 0; + wr->wr.send_flags = 0; + wr->mr = mr; + wr->key = is_rx ? mr->rkey : mr->lkey; + wr->access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE); +#else /* HAVE_IB_MAP_MR_SG */ + if (!tx_pages_mapped) { + npages = kiblnd_map_tx_pages(tx, rd); + tx_pages_mapped = true; + } + + LASSERT(npages <= frpl->max_page_list_len); + memcpy(frpl->page_list, pages, + sizeof(*pages) * npages); + + /* Prepare FastReg WR */ + wr = &frd->frd_fastreg_wr; + memset(wr, 0, sizeof(*wr)); + + wr->wr.opcode = IB_WR_FAST_REG_MR; + wr->wr.wr_id = IBLND_WID_MR; + + wr->wr.wr.fast_reg.iova_start = iov; + wr->wr.wr.fast_reg.page_list = frpl; + wr->wr.wr.fast_reg.page_list_len = npages; + wr->wr.wr.fast_reg.page_shift = PAGE_SHIFT; + wr->wr.wr.fast_reg.length = nob; + wr->wr.wr.fast_reg.rkey = + is_rx ? mr->rkey : mr->lkey; + wr->wr.wr.fast_reg.access_flags = + (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE); +#endif /* HAVE_IB_MAP_MR_SG */ + + fmr->fmr_key = is_rx ? mr->rkey : mr->lkey; + fmr->fmr_frd = frd; + fmr->fmr_pool = fpo; + frd->frd_posted = false; + return 0; + } + spin_unlock(&fps->fps_lock); + rc = -EAGAIN; + } + + spin_lock(&fps->fps_lock); + fpo->fpo_map_count--; + if (rc != -EAGAIN) { + spin_unlock(&fps->fps_lock); + return rc; + } + + /* EAGAIN and ... 
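+ * the fps_version check below tells us whether another thread
+ * changed the pool list while fps_lock was dropped: if it did,
+ * restart the scan from the top; otherwise keep walking the
+ * remaining pools and eventually try to grow the poolset.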
*/
+ if (version != fps->fps_version) {
+ spin_unlock(&fps->fps_lock);
+ goto again;
+ }
+ }
+
+ if (fps->fps_increasing) {
+ spin_unlock(&fps->fps_lock);
+ CDEBUG(D_NET, "Another thread is allocating new " "FMR pool, waiting for her to complete\n");
+ wait_var_event(fps, !fps->fps_increasing);
+ goto again;
+
+ }
+
+ if (ktime_get_seconds() < fps->fps_next_retry) {
+ /* someone failed recently */
+ spin_unlock(&fps->fps_lock);
+ return -EAGAIN;
+ }
+
+ fps->fps_increasing = 1;
+ spin_unlock(&fps->fps_lock);
+
+ CDEBUG(D_NET, "Allocate new FMR pool\n");
+ rc = kiblnd_create_fmr_pool(fps, &fpo);
+ spin_lock(&fps->fps_lock);
+ fps->fps_increasing = 0;
+ wake_up_var(fps);
+ if (rc == 0) {
+ fps->fps_version++;
+ list_add_tail(&fpo->fpo_list, &fps->fps_pool_list);
+ } else {
+ fps->fps_next_retry = ktime_get_seconds() + IBLND_POOL_RETRY;
+ }
+ spin_unlock(&fps->fps_lock);
+
+ goto again;
+}
+
+static void
+kiblnd_fini_pool(struct kib_pool *pool)
+{
+ LASSERT(list_empty(&pool->po_free_list));
+ LASSERT(pool->po_allocated == 0);
+
+ CDEBUG(D_NET, "Finalize %s pool\n", pool->po_owner->ps_name);
+}
+
+static void
+kiblnd_init_pool(struct kib_poolset *ps, struct kib_pool *pool, int size)
+{
+ CDEBUG(D_NET, "Initialize %s pool\n", ps->ps_name);
+
+ memset(pool, 0, sizeof(struct kib_pool));
+ INIT_LIST_HEAD(&pool->po_free_list);
+ pool->po_deadline = ktime_get_seconds() + IBLND_POOL_DEADLINE;
+ pool->po_owner = ps;
+ pool->po_size = size;
+}
+
+static void
+kiblnd_destroy_pool_list(struct list_head *head)
+{
+ struct kib_pool *pool;
+
+ while ((pool = list_first_entry_or_null(head,
+ struct kib_pool,
+ po_list)) != NULL) {
+ list_del(&pool->po_list);
+
+ LASSERT(pool->po_owner != NULL);
+ pool->po_owner->ps_pool_destroy(pool);
+ }
+}
+
+static void
+kiblnd_fail_poolset(struct kib_poolset *ps, struct list_head *zombies)
+{
+ struct kib_pool *po;
+
+ if (ps->ps_net == NULL) /* initialized? */
+ return;
+
+ spin_lock(&ps->ps_lock);
+ while ((po = list_first_entry_or_null(&ps->ps_pool_list,
+ struct kib_pool,
+ po_list)) != NULL) {
+ po->po_failed = 1;
+ if (po->po_allocated == 0)
+ list_move(&po->po_list, zombies);
+ else
+ list_move(&po->po_list, &ps->ps_failed_pool_list);
+ }
+ spin_unlock(&ps->ps_lock);
+}
+
+static void
+kiblnd_fini_poolset(struct kib_poolset *ps)
+{
+ if (ps->ps_net != NULL) { /* initialized? 
*/ + kiblnd_destroy_pool_list(&ps->ps_failed_pool_list); + kiblnd_destroy_pool_list(&ps->ps_pool_list); + } +} + +static int +kiblnd_init_poolset(struct kib_poolset *ps, int cpt, + struct kib_net *net, char *name, int size, + kib_ps_pool_create_t po_create, + kib_ps_pool_destroy_t po_destroy, + kib_ps_node_init_t nd_init, + kib_ps_node_fini_t nd_fini) +{ + struct kib_pool *pool; + int rc; + + memset(ps, 0, sizeof(struct kib_poolset)); + + ps->ps_cpt = cpt; + ps->ps_net = net; + ps->ps_pool_create = po_create; + ps->ps_pool_destroy = po_destroy; + ps->ps_node_init = nd_init; + ps->ps_node_fini = nd_fini; + ps->ps_pool_size = size; + if (strlcpy(ps->ps_name, name, sizeof(ps->ps_name)) + >= sizeof(ps->ps_name)) + return -E2BIG; + spin_lock_init(&ps->ps_lock); + INIT_LIST_HEAD(&ps->ps_pool_list); + INIT_LIST_HEAD(&ps->ps_failed_pool_list); + + rc = ps->ps_pool_create(ps, size, &pool); + if (rc == 0) + list_add(&pool->po_list, &ps->ps_pool_list); + else + CERROR("Failed to create the first pool for %s\n", ps->ps_name); + + return rc; +} + +static int +kiblnd_pool_is_idle(struct kib_pool *pool, time64_t now) +{ + if (pool->po_allocated != 0) /* still in use */ + return 0; + if (pool->po_failed) + return 1; + return now >= pool->po_deadline; +} + +void +kiblnd_pool_free_node(struct kib_pool *pool, struct list_head *node) +{ + LIST_HEAD(zombies); + struct kib_poolset *ps = pool->po_owner; + struct kib_pool *tmp; + time64_t now = ktime_get_seconds(); + + spin_lock(&ps->ps_lock); + + if (ps->ps_node_fini != NULL) + ps->ps_node_fini(pool, node); + + LASSERT(pool->po_allocated > 0); + list_add(node, &pool->po_free_list); + pool->po_allocated--; + + list_for_each_entry_safe(pool, tmp, &ps->ps_pool_list, po_list) { + /* the first pool is persistent */ + if (ps->ps_pool_list.next == &pool->po_list) + continue; + + if (kiblnd_pool_is_idle(pool, now)) + list_move(&pool->po_list, &zombies); + } + spin_unlock(&ps->ps_lock); + + if (!list_empty(&zombies)) + kiblnd_destroy_pool_list(&zombies); +} + +struct list_head * +kiblnd_pool_alloc_node(struct kib_poolset *ps) +{ + struct list_head *node; + struct kib_pool *pool; + int rc; + unsigned int interval = 1; + ktime_t time_before; + unsigned int trips = 0; + +again: + spin_lock(&ps->ps_lock); + list_for_each_entry(pool, &ps->ps_pool_list, po_list) { + if (list_empty(&pool->po_free_list)) + continue; + + pool->po_allocated++; + pool->po_deadline = ktime_get_seconds() + + IBLND_POOL_DEADLINE; + node = pool->po_free_list.next; + list_del(node); + + if (ps->ps_node_init != NULL) { + /* still hold the lock */ + ps->ps_node_init(pool, node); + } + spin_unlock(&ps->ps_lock); + return node; + } + + /* no available tx pool and ... */ + if (ps->ps_increasing) { + /* another thread is allocating a new pool */ + spin_unlock(&ps->ps_lock); + trips++; + CDEBUG(D_NET, + "Another thread is allocating new %s pool, waiting %d jiffies for her to complete. 
trips = %d\n", + ps->ps_name, interval, trips); + + schedule_timeout_interruptible(interval); + if (interval < cfs_time_seconds(1)) + interval *= 2; + + goto again; + } + + if (ktime_get_seconds() < ps->ps_next_retry) { + /* someone failed recently */ + spin_unlock(&ps->ps_lock); + return NULL; + } + + ps->ps_increasing = 1; + spin_unlock(&ps->ps_lock); + + CDEBUG(D_NET, "%s pool exhausted, allocate new pool\n", ps->ps_name); + time_before = ktime_get(); + rc = ps->ps_pool_create(ps, ps->ps_pool_size, &pool); + CDEBUG(D_NET, "ps_pool_create took %lld ms to complete", + ktime_ms_delta(ktime_get(), time_before)); + + spin_lock(&ps->ps_lock); + ps->ps_increasing = 0; + if (rc == 0) { + list_add_tail(&pool->po_list, &ps->ps_pool_list); + } else { + ps->ps_next_retry = ktime_get_seconds() + IBLND_POOL_RETRY; + CERROR("Can't allocate new %s pool because out of memory\n", + ps->ps_name); + } + spin_unlock(&ps->ps_lock); + + goto again; +} + +static void +kiblnd_destroy_tx_pool(struct kib_pool *pool) +{ + struct kib_tx_pool *tpo = container_of(pool, struct kib_tx_pool, + tpo_pool); + int i; + + LASSERT (pool->po_allocated == 0); + + if (tpo->tpo_tx_pages != NULL) { + kiblnd_unmap_tx_pool(tpo); + kiblnd_free_pages(tpo->tpo_tx_pages); + } + + if (tpo->tpo_tx_descs == NULL) + goto out; + + for (i = 0; i < pool->po_size; i++) { + struct kib_tx *tx = &tpo->tpo_tx_descs[i]; + int wrq_sge = *kiblnd_tunables.kib_wrq_sge; + + list_del(&tx->tx_list); + if (tx->tx_pages != NULL) + CFS_FREE_PTR_ARRAY(tx->tx_pages, LNET_MAX_IOV); + if (tx->tx_frags != NULL) + CFS_FREE_PTR_ARRAY(tx->tx_frags, + IBLND_MAX_RDMA_FRAGS); + if (tx->tx_wrq != NULL) + CFS_FREE_PTR_ARRAY(tx->tx_wrq, + IBLND_MAX_RDMA_FRAGS); + if (tx->tx_sge != NULL) { + /* +1 is for the lnet header/message itself */ + CFS_FREE_PTR_ARRAY(tx->tx_sge, + (IBLND_MAX_RDMA_FRAGS * + wrq_sge + 1)); + } + if (tx->tx_rd != NULL) + LIBCFS_FREE(tx->tx_rd, + offsetof(struct kib_rdma_desc, + rd_frags[IBLND_MAX_RDMA_FRAGS])); + } + + CFS_FREE_PTR_ARRAY(tpo->tpo_tx_descs, pool->po_size); +out: + kiblnd_fini_pool(pool); + CFS_FREE_PTR(tpo); +} + +static int kiblnd_tx_pool_size(struct lnet_ni *ni, int ncpts) +{ + struct lnet_ioctl_config_o2iblnd_tunables *tunables; + int ntx; + + tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; + ntx = tunables->lnd_ntx / ncpts; + + return max(IBLND_TX_POOL, ntx); +} + +static int +kiblnd_create_tx_pool(struct kib_poolset *ps, int size, struct kib_pool **pp_po) +{ + int i; + int npg; + struct kib_pool *pool; + struct kib_tx_pool *tpo; + + LIBCFS_CPT_ALLOC(tpo, lnet_cpt_table(), ps->ps_cpt, sizeof(*tpo)); + if (tpo == NULL) { + CERROR("Failed to allocate TX pool\n"); + return -ENOMEM; + } + + pool = &tpo->tpo_pool; + kiblnd_init_pool(ps, pool, size); + tpo->tpo_tx_descs = NULL; + tpo->tpo_tx_pages = NULL; + + npg = (size * IBLND_MSG_SIZE + PAGE_SIZE - 1) / PAGE_SIZE; + if (kiblnd_alloc_pages(&tpo->tpo_tx_pages, ps->ps_cpt, npg) != 0) { + CERROR("Can't allocate tx pages: %d\n", npg); + CFS_FREE_PTR(tpo); + return -ENOMEM; + } + + LIBCFS_CPT_ALLOC(tpo->tpo_tx_descs, lnet_cpt_table(), ps->ps_cpt, + size * sizeof(struct kib_tx)); + if (tpo->tpo_tx_descs == NULL) { + CERROR("Can't allocate %d tx descriptors\n", size); + ps->ps_pool_destroy(pool); + return -ENOMEM; + } + + memset(tpo->tpo_tx_descs, 0, size * sizeof(struct kib_tx)); + + for (i = 0; i < size; i++) { + struct kib_tx *tx = &tpo->tpo_tx_descs[i]; + int wrq_sge = *kiblnd_tunables.kib_wrq_sge; + + tx->tx_pool = tpo; + if (ps->ps_net->ibn_fmr_ps != NULL) { + 
LIBCFS_CPT_ALLOC(tx->tx_pages, + lnet_cpt_table(), ps->ps_cpt, + LNET_MAX_IOV * sizeof(*tx->tx_pages)); + if (tx->tx_pages == NULL) + break; + } + + LIBCFS_CPT_ALLOC(tx->tx_frags, lnet_cpt_table(), ps->ps_cpt, + IBLND_MAX_RDMA_FRAGS * + sizeof(*tx->tx_frags)); + if (tx->tx_frags == NULL) + break; + + sg_init_table(tx->tx_frags, IBLND_MAX_RDMA_FRAGS); + + LIBCFS_CPT_ALLOC(tx->tx_wrq, lnet_cpt_table(), ps->ps_cpt, + IBLND_MAX_RDMA_FRAGS * + sizeof(*tx->tx_wrq)); + if (tx->tx_wrq == NULL) + break; + + /* +1 is for the lnet header/message itself */ + LIBCFS_CPT_ALLOC(tx->tx_sge, lnet_cpt_table(), ps->ps_cpt, + (IBLND_MAX_RDMA_FRAGS * wrq_sge + 1) * + sizeof(*tx->tx_sge)); + if (tx->tx_sge == NULL) + break; + + LIBCFS_CPT_ALLOC(tx->tx_rd, lnet_cpt_table(), ps->ps_cpt, + offsetof(struct kib_rdma_desc, + rd_frags[IBLND_MAX_RDMA_FRAGS])); + if (tx->tx_rd == NULL) + break; + } + + if (i == size) { + kiblnd_map_tx_pool(tpo); + *pp_po = pool; + return 0; + } + + ps->ps_pool_destroy(pool); + return -ENOMEM; +} + +static void +kiblnd_tx_init(struct kib_pool *pool, struct list_head *node) +{ + struct kib_tx_poolset *tps = container_of(pool->po_owner, + struct kib_tx_poolset, + tps_poolset); + struct kib_tx *tx = list_entry(node, struct kib_tx, tx_list); + + tx->tx_cookie = tps->tps_next_tx_cookie++; +} + +static void +kiblnd_net_fini_pools(struct kib_net *net) +{ + int i; + + cfs_cpt_for_each(i, lnet_cpt_table()) { + struct kib_tx_poolset *tps; + struct kib_fmr_poolset *fps; + + if (net->ibn_tx_ps != NULL) { + tps = net->ibn_tx_ps[i]; + kiblnd_fini_poolset(&tps->tps_poolset); + } + + if (net->ibn_fmr_ps != NULL) { + fps = net->ibn_fmr_ps[i]; + kiblnd_fini_fmr_poolset(fps); + } + } + + if (net->ibn_tx_ps != NULL) { + cfs_percpt_free(net->ibn_tx_ps); + net->ibn_tx_ps = NULL; + } + + if (net->ibn_fmr_ps != NULL) { + cfs_percpt_free(net->ibn_fmr_ps); + net->ibn_fmr_ps = NULL; + } +} + +static int +kiblnd_net_init_pools(struct kib_net *net, struct lnet_ni *ni, __u32 *cpts, + int ncpts) +{ + struct lnet_ioctl_config_o2iblnd_tunables *tunables; +#ifdef HAVE_IB_GET_DMA_MR + unsigned long flags; +#endif + int cpt; + int rc; + int i; + + tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; + +#ifdef HAVE_IB_GET_DMA_MR + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + /* + * if lnd_map_on_demand is zero then we have effectively disabled + * FMR or FastReg and we're using global memory regions + * exclusively. + */ + if (!tunables->lnd_map_on_demand) { + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, + flags); + goto create_tx_pool; + } + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); +#endif + + if (tunables->lnd_fmr_pool_size < tunables->lnd_ntx / 4) { + CERROR("Can't set fmr pool size (%d) < ntx / 4(%d)\n", + tunables->lnd_fmr_pool_size, + tunables->lnd_ntx / 4); + rc = -EINVAL; + goto failed; + } + + /* TX pool must be created later than FMR, see LU-2268 + * for details */ + LASSERT(net->ibn_tx_ps == NULL); + + /* premapping can fail if ibd_nmr > 1, so we always create + * FMR pool and map-on-demand if premapping failed */ + + net->ibn_fmr_ps = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(struct kib_fmr_poolset)); + if (net->ibn_fmr_ps == NULL) { + CERROR("Failed to allocate FMR pool array\n"); + rc = -ENOMEM; + goto failed; + } + + for (i = 0; i < ncpts; i++) { + cpt = (cpts == NULL) ? 
i : cpts[i]; + rc = kiblnd_init_fmr_poolset(net->ibn_fmr_ps[cpt], cpt, ncpts, + net, tunables); + if (rc != 0) { + CERROR("Can't initialize FMR pool for CPT %d: %d\n", + cpt, rc); + goto failed; + } + } + + if (i > 0) + LASSERT(i == ncpts); + +#ifdef HAVE_IB_GET_DMA_MR + create_tx_pool: +#endif + net->ibn_tx_ps = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(struct kib_tx_poolset)); + if (net->ibn_tx_ps == NULL) { + CERROR("Failed to allocate tx pool array\n"); + rc = -ENOMEM; + goto failed; + } + + for (i = 0; i < ncpts; i++) { + cpt = (cpts == NULL) ? i : cpts[i]; + rc = kiblnd_init_poolset(&net->ibn_tx_ps[cpt]->tps_poolset, + cpt, net, "TX", + kiblnd_tx_pool_size(ni, ncpts), + kiblnd_create_tx_pool, + kiblnd_destroy_tx_pool, + kiblnd_tx_init, NULL); + if (rc != 0) { + CERROR("Can't initialize TX pool for CPT %d: %d\n", + cpt, rc); + goto failed; + } + } + + return 0; + failed: + kiblnd_net_fini_pools(net); + LASSERT(rc != 0); + return rc; +} + +static int +kiblnd_port_get_attr(struct kib_hca_dev *hdev) +{ + struct ib_port_attr *port_attr; + int rc; + unsigned long flags; + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + + LIBCFS_ALLOC(port_attr, sizeof(*port_attr)); + if (port_attr == NULL) { + CDEBUG(D_NETERROR, "Out of memory\n"); + return -ENOMEM; + } + + rc = ib_query_port(hdev->ibh_ibdev, hdev->ibh_port, port_attr); + + write_lock_irqsave(g_lock, flags); + + if (rc == 0) + hdev->ibh_state = port_attr->state == IB_PORT_ACTIVE + ? IBLND_DEV_PORT_ACTIVE + : IBLND_DEV_PORT_DOWN; + + write_unlock_irqrestore(g_lock, flags); + LIBCFS_FREE(port_attr, sizeof(*port_attr)); + + if (rc != 0) { + CDEBUG(D_NETERROR, "Failed to query IB port: %d\n", rc); + return rc; + } + return 0; +} + +static inline void +kiblnd_set_ni_fatal_on(struct kib_hca_dev *hdev, int val) +{ + struct kib_net *net; + + /* for health check */ + list_for_each_entry(net, &hdev->ibh_dev->ibd_nets, ibn_list) { + if (val) + CDEBUG(D_NETERROR, "Fatal device error for NI %s\n", + libcfs_nidstr(&net->ibn_ni->ni_nid)); + atomic_set(&net->ibn_ni->ni_fatal_error_on, val); + } +} + +void +kiblnd_event_handler(struct ib_event_handler *handler, struct ib_event *event) +{ + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + struct kib_hca_dev *hdev; + unsigned long flags; + + hdev = container_of(handler, struct kib_hca_dev, ibh_event_handler); + + write_lock_irqsave(g_lock, flags); + + switch (event->event) { + case IB_EVENT_DEVICE_FATAL: + CDEBUG(D_NET, "IB device fatal\n"); + hdev->ibh_state = IBLND_DEV_FATAL; + kiblnd_set_ni_fatal_on(hdev, 1); + break; + case IB_EVENT_PORT_ACTIVE: + CDEBUG(D_NET, "IB port active\n"); + if (event->element.port_num == hdev->ibh_port) { + hdev->ibh_state = IBLND_DEV_PORT_ACTIVE; + kiblnd_set_ni_fatal_on(hdev, 0); + } + break; + case IB_EVENT_PORT_ERR: + CDEBUG(D_NET, "IB port err\n"); + if (event->element.port_num == hdev->ibh_port) { + hdev->ibh_state = IBLND_DEV_PORT_DOWN; + kiblnd_set_ni_fatal_on(hdev, 1); + } + break; + default: + break; + } + write_unlock_irqrestore(g_lock, flags); +} + +static int +kiblnd_hdev_get_attr(struct kib_hca_dev *hdev) +{ + struct ib_device_attr *dev_attr; + int rc = 0; + int rc2 = 0; + + /* It's safe to assume a HCA can handle a page size + * matching that of the native system */ + hdev->ibh_page_shift = PAGE_SHIFT; + hdev->ibh_page_size = 1 << PAGE_SHIFT; + hdev->ibh_page_mask = ~((__u64)hdev->ibh_page_size - 1); + +#ifndef HAVE_IB_DEVICE_ATTRS + LIBCFS_ALLOC(dev_attr, sizeof(*dev_attr)); + if (dev_attr == NULL) { + CERROR("Out of memory\n"); + return -ENOMEM; + } + + 
rc = ib_query_device(hdev->ibh_ibdev, dev_attr); + if (rc != 0) { + CERROR("Failed to query IB device: %d\n", rc); + goto out_clean_attr; + } +#else + dev_attr = &hdev->ibh_ibdev->attrs; +#endif + + hdev->ibh_mr_size = dev_attr->max_mr_size; + hdev->ibh_max_qp_wr = dev_attr->max_qp_wr; + + /* Setup device Memory Registration capabilities */ +#ifdef HAVE_FMR_POOL_API +#ifdef HAVE_IB_DEVICE_OPS + if (hdev->ibh_ibdev->ops.alloc_fmr && + hdev->ibh_ibdev->ops.dealloc_fmr && + hdev->ibh_ibdev->ops.map_phys_fmr && + hdev->ibh_ibdev->ops.unmap_fmr) { +#else + if (hdev->ibh_ibdev->alloc_fmr && + hdev->ibh_ibdev->dealloc_fmr && + hdev->ibh_ibdev->map_phys_fmr && + hdev->ibh_ibdev->unmap_fmr) { +#endif + LCONSOLE_INFO("Using FMR for registration\n"); + hdev->ibh_dev->ibd_dev_caps |= IBLND_DEV_CAPS_FMR_ENABLED; + } else +#endif /* HAVE_FMR_POOL_API */ + if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { + LCONSOLE_INFO("Using FastReg for registration\n"); + hdev->ibh_dev->ibd_dev_caps |= IBLND_DEV_CAPS_FASTREG_ENABLED; +#ifndef HAVE_IB_ALLOC_FAST_REG_MR +#ifdef IB_DEVICE_SG_GAPS_REG + if (dev_attr->device_cap_flags & IB_DEVICE_SG_GAPS_REG) + hdev->ibh_dev->ibd_dev_caps |= IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT; +#endif +#endif + } else { + rc = -ENOSYS; + } + + rc2 = kiblnd_port_get_attr(hdev); + if (rc2 != 0) + return rc2; + + if (rc != 0) + rc = -EINVAL; + +#ifndef HAVE_IB_DEVICE_ATTRS +out_clean_attr: + LIBCFS_FREE(dev_attr, sizeof(*dev_attr)); +#endif + + if (rc == -ENOSYS) + CERROR("IB device does not support FMRs nor FastRegs, can't " + "register memory: %d\n", rc); + else if (rc == -EINVAL) + CERROR("Invalid mr size: %#llx\n", hdev->ibh_mr_size); + return rc; +} + +#ifdef HAVE_IB_GET_DMA_MR +static void +kiblnd_hdev_cleanup_mrs(struct kib_hca_dev *hdev) +{ + if (hdev->ibh_mrs == NULL) + return; + + ib_dereg_mr(hdev->ibh_mrs); + + hdev->ibh_mrs = NULL; +} +#endif + +void +kiblnd_hdev_destroy(struct kib_hca_dev *hdev) +{ + if (hdev->ibh_event_handler.device != NULL) + ib_unregister_event_handler(&hdev->ibh_event_handler); + +#ifdef HAVE_IB_GET_DMA_MR + kiblnd_hdev_cleanup_mrs(hdev); +#endif + + if (hdev->ibh_pd != NULL) + ib_dealloc_pd(hdev->ibh_pd); + + if (hdev->ibh_cmid != NULL) + rdma_destroy_id(hdev->ibh_cmid); + + LIBCFS_FREE(hdev, sizeof(*hdev)); +} + +#ifdef HAVE_IB_GET_DMA_MR +static int +kiblnd_hdev_setup_mrs(struct kib_hca_dev *hdev) +{ + struct ib_mr *mr; + int acflags = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE; + + mr = ib_get_dma_mr(hdev->ibh_pd, acflags); + if (IS_ERR(mr)) { + CERROR("Failed ib_get_dma_mr: %ld\n", PTR_ERR(mr)); + kiblnd_hdev_cleanup_mrs(hdev); + return PTR_ERR(mr); + } + + hdev->ibh_mrs = mr; + + return 0; +} +#endif + +static int +kiblnd_dummy_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) +{ /* DUMMY */ + return 0; +} + +static int kiblnd_get_link_status(struct net_device *dev) +{ + int ret = -1; + + LASSERT(dev); + + if (!netif_running(dev)) + ret = 0; + /* Some devices may not be providing link settings */ + else if (dev->ethtool_ops->get_link) + ret = dev->ethtool_ops->get_link(dev); + + return ret; +} + +static int +kiblnd_dev_need_failover(struct kib_dev *dev, struct net *ns) +{ + struct rdma_cm_id *cmid; + struct sockaddr_in srcaddr; + struct sockaddr_in dstaddr; + int rc; + + if (dev->ibd_hdev == NULL || /* initializing */ + dev->ibd_hdev->ibh_cmid == NULL || /* listener is dead */ + *kiblnd_tunables.kib_dev_failover > 1) /* debugging */ + return 1; + + /* XXX: it's UGLY, but I don't have better way to find + * ib-bonding 
HCA failover because:
+ *
+ * a. no reliable CM event for HCA failover...
+ * b. no OFED API to get ib_device for current net_device...
+ *
+ * We have only two choices at this point:
+ *
+ * a. rdma_bind_addr(), it will conflict with listener cmid
+ * b. rdma_resolve_addr() to zero addr */
+ cmid = kiblnd_rdma_create_id(ns, kiblnd_dummy_callback, dev,
+ RDMA_PS_TCP, IB_QPT_RC);
+ if (IS_ERR(cmid)) {
+ rc = PTR_ERR(cmid);
+ CERROR("Failed to create cmid for failover: %d\n", rc);
+ return rc;
+ }
+
+ memset(&srcaddr, 0, sizeof(srcaddr));
+ srcaddr.sin_family = AF_INET;
+ srcaddr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip);
+
+ memset(&dstaddr, 0, sizeof(dstaddr));
+ dstaddr.sin_family = AF_INET;
+ rc = rdma_resolve_addr(cmid, (struct sockaddr *)&srcaddr,
+ (struct sockaddr *)&dstaddr, 1);
+ if (rc != 0 || cmid->device == NULL) {
+ CERROR("Failed to bind %s:%pI4h to device(%p): %d\n",
+ dev->ibd_ifname, &dev->ibd_ifip,
+ cmid->device, rc);
+ rdma_destroy_id(cmid);
+ return rc;
+ }
+
+ rc = dev->ibd_hdev->ibh_ibdev != cmid->device; /* true for failover */
+ rdma_destroy_id(cmid);
+ return rc;
+}
+
+int
+kiblnd_dev_failover(struct kib_dev *dev, struct net *ns)
+{
+ LIST_HEAD(zombie_tpo);
+ LIST_HEAD(zombie_ppo);
+ LIST_HEAD(zombie_fpo);
+ struct rdma_cm_id *cmid = NULL;
+ struct kib_hca_dev *hdev = NULL;
+ struct kib_hca_dev *old;
+ struct ib_pd *pd;
+ struct kib_net *net;
+ struct sockaddr_in addr;
+ struct net_device *netdev;
+ unsigned long flags;
+ int rc = 0;
+ int i;
+ bool set_fatal = true;
+
+ LASSERT(*kiblnd_tunables.kib_dev_failover > 1 ||
+ dev->ibd_can_failover ||
+ dev->ibd_hdev == NULL);
+
+ rc = kiblnd_dev_need_failover(dev, ns);
+ if (rc <= 0)
+ goto out;
+
+ if (dev->ibd_hdev != NULL &&
+ dev->ibd_hdev->ibh_cmid != NULL) {
+ /* XXX it's not good to close the old listener here, because
+ * creating the new listener can still fail. But we have to
+ * close it now, otherwise rdma_bind_addr will return
+ * EADDRINUSE. 
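+ * A new listener cmid is created, bound and started again further
+ * down in this function, so the window without a listener is brief.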
*/ + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + cmid = dev->ibd_hdev->ibh_cmid; + /* make next schedule of kiblnd_dev_need_failover() + * return 1 for me */ + dev->ibd_hdev->ibh_cmid = NULL; + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + rdma_destroy_id(cmid); + } + + cmid = kiblnd_rdma_create_id(ns, kiblnd_cm_callback, dev, RDMA_PS_TCP, + IB_QPT_RC); + if (IS_ERR(cmid)) { + rc = PTR_ERR(cmid); + CERROR("Failed to create cmid for failover: %d\n", rc); + goto out; + } + + memset(&addr, 0, sizeof(addr)); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip); + addr.sin_port = htons(*kiblnd_tunables.kib_service); + + /* Bind to failover device or port */ + rc = rdma_bind_addr(cmid, (struct sockaddr *)&addr); + if (rc != 0 || cmid->device == NULL) { + CERROR("Failed to bind %s:%pI4h to device(%p): %d\n", + dev->ibd_ifname, &dev->ibd_ifip, + cmid->device, rc); + if (!rc && !cmid->device) + set_fatal = false; + rdma_destroy_id(cmid); + goto out; + } + + LIBCFS_ALLOC(hdev, sizeof(*hdev)); + if (hdev == NULL) { + CERROR("Failed to allocate kib_hca_dev\n"); + rdma_destroy_id(cmid); + rc = -ENOMEM; + goto out; + } + + atomic_set(&hdev->ibh_ref, 1); + hdev->ibh_dev = dev; + hdev->ibh_cmid = cmid; + hdev->ibh_ibdev = cmid->device; + hdev->ibh_port = cmid->port_num; + +#ifdef HAVE_IB_ALLOC_PD_2ARGS + pd = ib_alloc_pd(cmid->device, 0); +#else + pd = ib_alloc_pd(cmid->device); +#endif + if (IS_ERR(pd)) { + rc = PTR_ERR(pd); + CERROR("Can't allocate PD: %d\n", rc); + goto out; + } + + hdev->ibh_pd = pd; + + rc = rdma_listen(cmid, 0); + if (rc != 0) { + CERROR("Can't start new listener: %d\n", rc); + goto out; + } + + rc = kiblnd_hdev_get_attr(hdev); + if (rc != 0) { + CERROR("Can't get device attributes: %d\n", rc); + goto out; + } + +#ifdef HAVE_IB_GET_DMA_MR + rc = kiblnd_hdev_setup_mrs(hdev); + if (rc != 0) { + CERROR("Can't setup device: %d\n", rc); + goto out; + } +#endif + + INIT_IB_EVENT_HANDLER(&hdev->ibh_event_handler, + hdev->ibh_ibdev, kiblnd_event_handler); + ib_register_event_handler(&hdev->ibh_event_handler); + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + old = dev->ibd_hdev; + dev->ibd_hdev = hdev; /* take over the refcount */ + hdev = old; + + list_for_each_entry(net, &dev->ibd_nets, ibn_list) { + cfs_cpt_for_each(i, lnet_cpt_table()) { + kiblnd_fail_poolset(&net->ibn_tx_ps[i]->tps_poolset, + &zombie_tpo); + + if (net->ibn_fmr_ps != NULL) + kiblnd_fail_fmr_poolset(net->ibn_fmr_ps[i], + &zombie_fpo); + } + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + out: + if (!list_empty(&zombie_tpo)) + kiblnd_destroy_pool_list(&zombie_tpo); + if (!list_empty(&zombie_ppo)) + kiblnd_destroy_pool_list(&zombie_ppo); + if (!list_empty(&zombie_fpo)) + kiblnd_destroy_fmr_pool_list(&zombie_fpo); + if (hdev != NULL) + kiblnd_hdev_decref(hdev); + + if (rc != 0) { + dev->ibd_failed_failover++; + } else { + dev->ibd_failed_failover = 0; + + if (set_fatal) { + rcu_read_lock(); + netdev = dev_get_by_name_rcu(ns, dev->ibd_ifname); + if (netdev && (kiblnd_get_link_status(netdev) == 1)) + kiblnd_set_ni_fatal_on(dev->ibd_hdev, 0); + rcu_read_unlock(); + } + } + + return rc; +} + +void +kiblnd_destroy_dev(struct kib_dev *dev) +{ + LASSERT(dev->ibd_nnets == 0); + LASSERT(list_empty(&dev->ibd_nets)); + + list_del(&dev->ibd_fail_list); + list_del(&dev->ibd_list); + + if (dev->ibd_hdev != NULL) + kiblnd_hdev_decref(dev->ibd_hdev); + + LIBCFS_FREE(dev, sizeof(*dev)); +} + +static void +kiblnd_base_shutdown(void) 
+{ + struct kib_sched_info *sched; + struct kib_peer_ni *peer_ni; + int i; + + LASSERT(list_empty(&kiblnd_data.kib_devs)); + + CDEBUG(D_MALLOC, "before LND base cleanup: kmem %lld\n", + libcfs_kmem_read()); + + switch (kiblnd_data.kib_init) { + default: + LBUG(); + + case IBLND_INIT_ALL: + case IBLND_INIT_DATA: + hash_for_each(kiblnd_data.kib_peers, i, peer_ni, ibp_list) + LASSERT(0); + LASSERT(list_empty(&kiblnd_data.kib_connd_zombies)); + LASSERT(list_empty(&kiblnd_data.kib_connd_conns)); + LASSERT(list_empty(&kiblnd_data.kib_reconn_list)); + LASSERT(list_empty(&kiblnd_data.kib_reconn_wait)); + + /* flag threads to terminate; wake and wait for them to die */ + kiblnd_data.kib_shutdown = 1; + + /* NB: we really want to stop scheduler threads net by net + * instead of the whole module, this should be improved + * with dynamic configuration LNet. + */ + cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) + wake_up_all(&sched->ibs_waitq); + + wake_up(&kiblnd_data.kib_connd_waitq); + wake_up(&kiblnd_data.kib_failover_waitq); + + wait_var_event_warning(&kiblnd_data.kib_nthreads, + !atomic_read(&kiblnd_data.kib_nthreads), + "Waiting for %d threads to terminate\n", + atomic_read(&kiblnd_data.kib_nthreads)); + fallthrough; + + case IBLND_INIT_NOTHING: + break; + } + + if (kiblnd_data.kib_scheds != NULL) + cfs_percpt_free(kiblnd_data.kib_scheds); + + CDEBUG(D_MALLOC, "after LND base cleanup: kmem %lld\n", + libcfs_kmem_read()); + + kiblnd_data.kib_init = IBLND_INIT_NOTHING; + module_put(THIS_MODULE); +} + +static void +kiblnd_shutdown(struct lnet_ni *ni) +{ + struct kib_net *net = ni->ni_data; + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + unsigned long flags; + + LASSERT(kiblnd_data.kib_init == IBLND_INIT_ALL); + + if (net == NULL) + goto out; + + CDEBUG(D_MALLOC, "before LND net cleanup: kmem %lld\n", + libcfs_kmem_read()); + + write_lock_irqsave(g_lock, flags); + net->ibn_shutdown = 1; + write_unlock_irqrestore(g_lock, flags); + + switch (net->ibn_init) { + default: + LBUG(); + + case IBLND_INIT_ALL: + /* nuke all existing peers within this net */ + kiblnd_del_peer(ni, LNET_NID_ANY); + + /* Wait for all peer_ni state to clean up */ + wait_var_event_warning(&net->ibn_npeers, + atomic_read(&net->ibn_npeers) == 0, + "%s: waiting for %d peers to disconnect\n", + libcfs_nidstr(&ni->ni_nid), + atomic_read(&net->ibn_npeers)); + + kiblnd_net_fini_pools(net); + + write_lock_irqsave(g_lock, flags); + LASSERT(net->ibn_dev->ibd_nnets > 0); + net->ibn_dev->ibd_nnets--; + list_del(&net->ibn_list); + write_unlock_irqrestore(g_lock, flags); + + fallthrough; + + case IBLND_INIT_NOTHING: + LASSERT (atomic_read(&net->ibn_nconns) == 0); + + if (net->ibn_dev != NULL && + net->ibn_dev->ibd_nnets == 0) + kiblnd_destroy_dev(net->ibn_dev); + + break; + } + + CDEBUG(D_MALLOC, "after LND net cleanup: kmem %lld\n", + libcfs_kmem_read()); + + net->ibn_init = IBLND_INIT_NOTHING; + ni->ni_data = NULL; + + LIBCFS_FREE(net, sizeof(*net)); + +out: + if (list_empty(&kiblnd_data.kib_devs)) + kiblnd_base_shutdown(); +} + +static int +kiblnd_base_startup(struct net *ns) +{ + struct kib_sched_info *sched; + int rc; + int i; + + LASSERT(kiblnd_data.kib_init == IBLND_INIT_NOTHING); + + if (!try_module_get(THIS_MODULE)) + goto failed; + + memset(&kiblnd_data, 0, sizeof(kiblnd_data)); /* zero pointers, flags etc */ + + rwlock_init(&kiblnd_data.kib_global_lock); + + INIT_LIST_HEAD(&kiblnd_data.kib_devs); + INIT_LIST_HEAD(&kiblnd_data.kib_failed_devs); + + hash_init(kiblnd_data.kib_peers); + + 
spin_lock_init(&kiblnd_data.kib_connd_lock); + INIT_LIST_HEAD(&kiblnd_data.kib_connd_conns); + INIT_LIST_HEAD(&kiblnd_data.kib_connd_waits); + INIT_LIST_HEAD(&kiblnd_data.kib_connd_zombies); + INIT_LIST_HEAD(&kiblnd_data.kib_reconn_list); + INIT_LIST_HEAD(&kiblnd_data.kib_reconn_wait); + + init_waitqueue_head(&kiblnd_data.kib_connd_waitq); + init_waitqueue_head(&kiblnd_data.kib_failover_waitq); + + kiblnd_data.kib_scheds = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*sched)); + if (kiblnd_data.kib_scheds == NULL) + goto failed; + + cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) { + int nthrs; + + spin_lock_init(&sched->ibs_lock); + INIT_LIST_HEAD(&sched->ibs_conns); + init_waitqueue_head(&sched->ibs_waitq); + + nthrs = cfs_cpt_weight(lnet_cpt_table(), i); + if (*kiblnd_tunables.kib_nscheds > 0) { + nthrs = min(nthrs, *kiblnd_tunables.kib_nscheds); + } else { + /* max to half of CPUs, another half is reserved for + * upper layer modules */ + nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs); + } + + sched->ibs_nthreads_max = nthrs; + sched->ibs_cpt = i; + } + + kiblnd_data.kib_error_qpa.qp_state = IB_QPS_ERR; + + /* lists/ptrs/locks initialised */ + kiblnd_data.kib_init = IBLND_INIT_DATA; + /*****************************************************/ + + rc = kiblnd_thread_start(kiblnd_connd, NULL, "kiblnd_connd"); + if (rc != 0) { + CERROR("Can't spawn o2iblnd connd: %d\n", rc); + goto failed; + } + + if (*kiblnd_tunables.kib_dev_failover != 0) + rc = kiblnd_thread_start(kiblnd_failover_thread, ns, + "kiblnd_failover"); + + if (rc != 0) { + CERROR("Can't spawn o2iblnd failover thread: %d\n", rc); + goto failed; + } + + /* flag everything initialised */ + kiblnd_data.kib_init = IBLND_INIT_ALL; + /*****************************************************/ + + return 0; + + failed: + kiblnd_base_shutdown(); + return -ENETDOWN; +} + +static int +kiblnd_start_schedulers(struct kib_sched_info *sched) +{ + int rc = 0; + int nthrs; + int i; + + if (sched->ibs_nthreads == 0) { + if (*kiblnd_tunables.kib_nscheds > 0) { + nthrs = sched->ibs_nthreads_max; + } else { + nthrs = cfs_cpt_weight(lnet_cpt_table(), + sched->ibs_cpt); + nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs); + nthrs = min(IBLND_N_SCHED_HIGH, nthrs); + } + } else { + LASSERT(sched->ibs_nthreads <= sched->ibs_nthreads_max); + /* increase one thread if there is new interface */ + nthrs = (sched->ibs_nthreads < sched->ibs_nthreads_max); + } + + for (i = 0; i < nthrs; i++) { + long id = KIB_THREAD_ID(sched->ibs_cpt, sched->ibs_nthreads + i); + + rc = kiblnd_thread_start(kiblnd_scheduler, (void *)id, + "kiblnd_sd_%02ld_%02ld", + KIB_THREAD_CPT(id), KIB_THREAD_TID(id)); + if (rc == 0) + continue; + + CERROR("Can't spawn thread %d for scheduler[%d]: %d\n", + sched->ibs_cpt, sched->ibs_nthreads + i, rc); + break; + } + + sched->ibs_nthreads += i; + return rc; +} + +static int kiblnd_dev_start_threads(struct kib_dev *dev, bool newdev, u32 *cpts, + int ncpts) +{ + int cpt; + int rc; + int i; + + for (i = 0; i < ncpts; i++) { + struct kib_sched_info *sched; + + cpt = (cpts == NULL) ? 
i : cpts[i]; + sched = kiblnd_data.kib_scheds[cpt]; + + if (!newdev && sched->ibs_nthreads > 0) + continue; + + rc = kiblnd_start_schedulers(kiblnd_data.kib_scheds[cpt]); + if (rc != 0) { + CERROR("Failed to start scheduler threads for %s\n", + dev->ibd_ifname); + return rc; + } + } + return 0; +} + +static struct kib_dev * +kiblnd_dev_search(char *ifname) +{ + struct kib_dev *alias = NULL; + struct kib_dev *dev; + char *colon; + char *colon2; + + colon = strchr(ifname, ':'); + list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) { + if (strcmp(&dev->ibd_ifname[0], ifname) == 0) + return dev; + + if (alias != NULL) + continue; + + colon2 = strchr(dev->ibd_ifname, ':'); + if (colon != NULL) + *colon = 0; + if (colon2 != NULL) + *colon2 = 0; + + if (strcmp(&dev->ibd_ifname[0], ifname) == 0) + alias = dev; + + if (colon != NULL) + *colon = ':'; + if (colon2 != NULL) + *colon2 = ':'; + } + return alias; +} + +static int +kiblnd_startup(struct lnet_ni *ni) +{ + char *ifname = NULL; + struct lnet_inetdev *ifaces = NULL; + struct kib_dev *ibdev = NULL; + struct kib_net *net = NULL; + unsigned long flags; + int rc; + int i; + bool newdev; + + LASSERT(ni->ni_net->net_lnd == &the_o2iblnd); + + if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) { + rc = kiblnd_base_startup(ni->ni_net_ns); + if (rc != 0) + return rc; + } + + LIBCFS_ALLOC(net, sizeof(*net)); + ni->ni_data = net; + if (net == NULL) { + rc = -ENOMEM; + goto failed; + } + + net->ibn_ni = ni; + net->ibn_incarnation = ktime_get_real_ns() / NSEC_PER_USEC; + + kiblnd_tunables_setup(ni); + + /* + * Multi-Rail wants each secondary + * IP to be treated as an unique 'struct ni' interface. + */ + if (ni->ni_interface != NULL) { + /* Use the IPoIB interface specified in 'networks=' */ + ifname = ni->ni_interface; + } else { + ifname = *kiblnd_tunables.kib_default_ipif; + } + + if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) { + CERROR("IPoIB interface name too long: %s\n", ifname); + rc = -E2BIG; + goto failed; + } + + rc = lnet_inet_enumerate(&ifaces, ni->ni_net_ns); + if (rc < 0) + goto failed; + + for (i = 0; i < rc; i++) { + if (strcmp(ifname, ifaces[i].li_name) == 0) + break; + } + + if (i == rc) { + CERROR("ko2iblnd: No matching interfaces\n"); + rc = -ENOENT; + goto failed; + } + + ibdev = kiblnd_dev_search(ifname); + newdev = ibdev == NULL; + /* hmm...create kib_dev even for alias */ + if (ibdev == NULL || strcmp(&ibdev->ibd_ifname[0], ifname) != 0) { + LIBCFS_ALLOC(ibdev, sizeof(*ibdev)); + if (!ibdev) { + rc = -ENOMEM; + goto failed; + } + + ibdev->ibd_ifip = ifaces[i].li_ipaddr; + strlcpy(ibdev->ibd_ifname, ifaces[i].li_name, + sizeof(ibdev->ibd_ifname)); + ibdev->ibd_can_failover = !!(ifaces[i].li_flags & IFF_MASTER); + + INIT_LIST_HEAD(&ibdev->ibd_nets); + INIT_LIST_HEAD(&ibdev->ibd_list); /* not yet in kib_devs */ + INIT_LIST_HEAD(&ibdev->ibd_fail_list); + + /* initialize the device */ + rc = kiblnd_dev_failover(ibdev, ni->ni_net_ns); + if (rc) { + CERROR("ko2iblnd: Can't initialize device: rc = %d\n", + rc); + goto failed; + } + + list_add_tail(&ibdev->ibd_list, &kiblnd_data.kib_devs); + } + + net->ibn_dev = ibdev; + ni->ni_nid.nid_addr[0] = cpu_to_be32(ibdev->ibd_ifip); + + ni->ni_dev_cpt = ifaces[i].li_cpt; + + rc = kiblnd_dev_start_threads(ibdev, newdev, ni->ni_cpts, ni->ni_ncpts); + if (rc != 0) + goto failed; + + rc = kiblnd_net_init_pools(net, ni, ni->ni_cpts, ni->ni_ncpts); + if (rc != 0) { + CERROR("Failed to initialize NI pools: %d\n", rc); + goto failed; + } + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + 
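+	/* publish the new net on its device; it becomes visible to
+	 * failover and health checks once the global lock is dropped */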
ibdev->ibd_nnets++; + list_add_tail(&net->ibn_list, &ibdev->ibd_nets); + /* for health check */ + if (ibdev->ibd_hdev->ibh_state == IBLND_DEV_PORT_DOWN) + kiblnd_set_ni_fatal_on(ibdev->ibd_hdev, 1); + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + net->ibn_init = IBLND_INIT_ALL; + + return 0; + +failed: + if (net != NULL && net->ibn_dev == NULL && ibdev != NULL) + kiblnd_destroy_dev(ibdev); + + kfree(ifaces); + kiblnd_shutdown(ni); + + CDEBUG(D_NET, "Configuration of device %s failed: rc = %d\n", + ifname ? ifname : "", rc); + + return -ENETDOWN; +} + +static const struct lnet_lnd the_o2iblnd = { + .lnd_type = O2IBLND, + .lnd_startup = kiblnd_startup, + .lnd_shutdown = kiblnd_shutdown, + .lnd_ctl = kiblnd_ctl, + .lnd_send = kiblnd_send, + .lnd_recv = kiblnd_recv, + .lnd_get_dev_prio = kiblnd_get_dev_prio, +}; + +static void ko2inlnd_assert_wire_constants(void) +{ + BUILD_BUG_ON(IBLND_MSG_MAGIC != 0x0be91b91); + BUILD_BUG_ON(IBLND_MSG_VERSION_1 != 0x11); + BUILD_BUG_ON(IBLND_MSG_VERSION_2 != 0x12); + BUILD_BUG_ON(IBLND_MSG_VERSION != IBLND_MSG_VERSION_2); + + BUILD_BUG_ON(IBLND_MSG_CONNREQ != 0xc0); + BUILD_BUG_ON(IBLND_MSG_CONNACK != 0xc1); + BUILD_BUG_ON(IBLND_MSG_NOOP != 0xd0); + BUILD_BUG_ON(IBLND_MSG_IMMEDIATE != 0xd1); + BUILD_BUG_ON(IBLND_MSG_PUT_REQ != 0xd2); + BUILD_BUG_ON(IBLND_MSG_PUT_NAK != 0xd3); + BUILD_BUG_ON(IBLND_MSG_PUT_ACK != 0xd4); + BUILD_BUG_ON(IBLND_MSG_PUT_DONE != 0xd5); + BUILD_BUG_ON(IBLND_MSG_GET_REQ != 0xd6); + BUILD_BUG_ON(IBLND_MSG_GET_DONE != 0xd7); + + BUILD_BUG_ON(IBLND_REJECT_CONN_RACE != 1); + BUILD_BUG_ON(IBLND_REJECT_NO_RESOURCES != 2); + BUILD_BUG_ON(IBLND_REJECT_FATAL != 3); + BUILD_BUG_ON(IBLND_REJECT_CONN_UNCOMPAT != 4); + BUILD_BUG_ON(IBLND_REJECT_CONN_STALE != 5); + BUILD_BUG_ON(IBLND_REJECT_RDMA_FRAGS != 6); + BUILD_BUG_ON(IBLND_REJECT_MSG_QUEUE_SIZE != 7); + BUILD_BUG_ON(IBLND_REJECT_INVALID_SRV_ID != 8); + + BUILD_BUG_ON((int)sizeof(struct kib_connparams) != 8); + BUILD_BUG_ON((int)offsetof(struct kib_connparams, ibcp_queue_depth) != 0); + BUILD_BUG_ON((int)sizeof(((struct kib_connparams *)0)->ibcp_queue_depth) != 2); + BUILD_BUG_ON((int)offsetof(struct kib_connparams, ibcp_max_frags) != 2); + BUILD_BUG_ON((int)sizeof(((struct kib_connparams *)0)->ibcp_max_frags) != 2); + BUILD_BUG_ON((int)offsetof(struct kib_connparams, ibcp_max_msg_size) != 4); + BUILD_BUG_ON((int)sizeof(((struct kib_connparams *)0)->ibcp_max_msg_size) != 4); + + BUILD_BUG_ON((int)sizeof(struct kib_immediate_msg) != 72); + BUILD_BUG_ON((int)offsetof(struct kib_immediate_msg, ibim_hdr) != 0); + BUILD_BUG_ON((int)sizeof(((struct kib_immediate_msg *)0)->ibim_hdr) != 72); + BUILD_BUG_ON((int)offsetof(struct kib_immediate_msg, ibim_payload) != 72); + BUILD_BUG_ON((int)sizeof(((struct kib_immediate_msg *)0)->ibim_payload) != 0); + + BUILD_BUG_ON((int)sizeof(struct kib_rdma_frag) != 12); + BUILD_BUG_ON((int)offsetof(struct kib_rdma_frag, rf_nob) != 0); + BUILD_BUG_ON((int)sizeof(((struct kib_rdma_frag *)0)->rf_nob) != 4); + BUILD_BUG_ON((int)offsetof(struct kib_rdma_frag, rf_addr) != 4); + BUILD_BUG_ON((int)sizeof(((struct kib_rdma_frag *)0)->rf_addr) != 8); + + BUILD_BUG_ON((int)sizeof(struct kib_rdma_desc) != 8); + BUILD_BUG_ON((int)offsetof(struct kib_rdma_desc, rd_key) != 0); + BUILD_BUG_ON((int)sizeof(((struct kib_rdma_desc *)0)->rd_key) != 4); + BUILD_BUG_ON((int)offsetof(struct kib_rdma_desc, rd_nfrags) != 4); + BUILD_BUG_ON((int)sizeof(((struct kib_rdma_desc *)0)->rd_nfrags) != 4); + BUILD_BUG_ON((int)offsetof(struct kib_rdma_desc, rd_frags) != 8); + 
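+	/* rd_frags[] is a variable-length tail, so its declared size is 0 */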
BUILD_BUG_ON((int)sizeof(((struct kib_rdma_desc *)0)->rd_frags) != 0); + + BUILD_BUG_ON((int)sizeof(struct kib_putreq_msg) != 80); + BUILD_BUG_ON((int)offsetof(struct kib_putreq_msg, ibprm_hdr) != 0); + BUILD_BUG_ON((int)sizeof(((struct kib_putreq_msg *)0)->ibprm_hdr) != 72); + BUILD_BUG_ON((int)offsetof(struct kib_putreq_msg, ibprm_cookie) != 72); + BUILD_BUG_ON((int)sizeof(((struct kib_putreq_msg *)0)->ibprm_cookie) != 8); + + BUILD_BUG_ON((int)sizeof(struct kib_putack_msg) != 24); + BUILD_BUG_ON((int)offsetof(struct kib_putack_msg, ibpam_src_cookie) != 0); + BUILD_BUG_ON((int)sizeof(((struct kib_putack_msg *)0)->ibpam_src_cookie) != 8); + BUILD_BUG_ON((int)offsetof(struct kib_putack_msg, ibpam_dst_cookie) != 8); + BUILD_BUG_ON((int)sizeof(((struct kib_putack_msg *)0)->ibpam_dst_cookie) != 8); + BUILD_BUG_ON((int)offsetof(struct kib_putack_msg, ibpam_rd) != 16); + BUILD_BUG_ON((int)sizeof(((struct kib_putack_msg *)0)->ibpam_rd) != 8); + + BUILD_BUG_ON((int)sizeof(struct kib_get_msg) != 88); + BUILD_BUG_ON((int)offsetof(struct kib_get_msg, ibgm_hdr) != 0); + BUILD_BUG_ON((int)sizeof(((struct kib_get_msg *)0)->ibgm_hdr) != 72); + BUILD_BUG_ON((int)offsetof(struct kib_get_msg, ibgm_cookie) != 72); + BUILD_BUG_ON((int)sizeof(((struct kib_get_msg *)0)->ibgm_cookie) != 8); + BUILD_BUG_ON((int)offsetof(struct kib_get_msg, ibgm_rd) != 80); + BUILD_BUG_ON((int)sizeof(((struct kib_get_msg *)0)->ibgm_rd) != 8); + + BUILD_BUG_ON((int)sizeof(struct kib_completion_msg) != 12); + BUILD_BUG_ON((int)offsetof(struct kib_completion_msg, ibcm_cookie) != 0); + BUILD_BUG_ON((int)sizeof(((struct kib_completion_msg *)0)->ibcm_cookie) != 8); + BUILD_BUG_ON((int)offsetof(struct kib_completion_msg, ibcm_status) != 8); + BUILD_BUG_ON((int)sizeof(((struct kib_completion_msg *)0)->ibcm_status) != 4); + + /* Checks for struct kib_msg */ + //BUILD_BUG_ON((int)sizeof(struct kib_msg) != 12); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_magic) != 0); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_magic) != 4); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_version) != 4); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_version) != 2); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_type) != 6); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_type) != 1); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_credits) != 7); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_credits) != 1); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_nob) != 8); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_nob) != 4); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_cksum) != 12); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_cksum) != 4); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_srcnid) != 16); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_srcnid) != 8); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_srcstamp) != 24); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_srcstamp) != 8); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_dstnid) != 32); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_dstnid) != 8); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_dststamp) != 40); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_dststamp) != 8); + + /* Connparams */ + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_u.connparams.ibcp_queue_depth) != 48); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_u.connparams.ibcp_queue_depth) != 2); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_u.connparams.ibcp_max_frags) != 50); + BUILD_BUG_ON((int)sizeof(((struct 
kib_msg *)0)->ibm_u.connparams.ibcp_max_frags) != 2); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_u.connparams.ibcp_max_msg_size) != 52); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_u.connparams.ibcp_max_msg_size) != 4); + + /* Immediate message */ + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_u.immediate.ibim_hdr) != 48); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_u.immediate.ibim_hdr) != 72); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_u.immediate.ibim_payload) != 120); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_u.immediate.ibim_payload) != 0); + + /* PUT req message */ + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_u.putreq.ibprm_hdr) != 48); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_u.putreq.ibprm_hdr) != 72); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_u.putreq.ibprm_cookie) != 120); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_u.putreq.ibprm_cookie) != 8); + + /* Put ACK */ + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_u.putack.ibpam_src_cookie) != 48); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_u.putack.ibpam_src_cookie) != 8); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_u.putack.ibpam_dst_cookie) != 56); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_u.putack.ibpam_dst_cookie) != 8); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_u.putack.ibpam_rd) != 64); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_u.putack.ibpam_rd) != 8); + + /* GET message */ + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_u.get.ibgm_hdr) != 48); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_u.get.ibgm_hdr) != 72); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_u.get.ibgm_cookie) != 120); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_u.get.ibgm_cookie) != 8); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_u.get.ibgm_rd) != 128); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_u.get.ibgm_rd) != 8); + + /* Completion message */ + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_u.completion.ibcm_cookie) != 48); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_u.completion.ibcm_cookie) != 8); + BUILD_BUG_ON((int)offsetof(struct kib_msg, ibm_u.completion.ibcm_status) != 56); + BUILD_BUG_ON((int)sizeof(((struct kib_msg *)0)->ibm_u.completion.ibcm_status) != 4); + + /* Sanity checks */ + BUILD_BUG_ON(sizeof(struct kib_msg) > IBLND_MSG_SIZE); + BUILD_BUG_ON(offsetof(struct kib_msg, + ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) > + IBLND_MSG_SIZE); + BUILD_BUG_ON(offsetof(struct kib_msg, + ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) > + IBLND_MSG_SIZE); +} + +static void __exit ko2iblnd_exit(void) +{ + lnet_unregister_lnd(&the_o2iblnd); +} + +static int __init ko2iblnd_init(void) +{ + int rc; + + ko2inlnd_assert_wire_constants(); + + rc = kiblnd_tunables_init(); + if (rc != 0) + return rc; + + lnet_register_lnd(&the_o2iblnd); + + return 0; +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("OpenIB gen2 LNet Network Driver"); +MODULE_VERSION("2.8.0"); +MODULE_LICENSE("GPL"); + +module_init(ko2iblnd_init); +module_exit(ko2iblnd_exit); diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h new file mode 100644 index 0000000000000..d3f651224ee47 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h @@ -0,0 +1,1170 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lnet/klnds/o2iblnd/o2iblnd.h
+ *
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include <linux/if.h>
+#include <linux/version.h>
+
+#if defined(NEED_LOCKDEP_IS_HELD_DISCARD_CONST) \
+ && defined(CONFIG_LOCKDEP) \
+ && defined(lockdep_is_held)
+#undef lockdep_is_held
+	#define lockdep_is_held(lock) \
+		lock_is_held((struct lockdep_map *)&(lock)->dep_map)
+#endif
+
+#ifdef HAVE_COMPAT_RDMA
+#include <linux/compat-2.6.h>
+
+#ifdef LINUX_3_17_COMPAT_H
+#undef NEED_KTIME_GET_REAL_NS
+#endif
+
+#define HAVE_NLA_PUT_U64_64BIT 1
+#define HAVE_NLA_PARSE_6_PARAMS 1
+#define HAVE_NETLINK_EXTACK 1
+
+
+/* MOFED has its own bitmap_alloc backport */
+#define HAVE_BITMAP_ALLOC 1
+
+#endif
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/kthread.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+
+#include <linux/unistd.h>
+#include <linux/uio.h>
+
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/list.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+#include <linux/pci.h>
+
+#include <net/sock.h>
+#include <linux/in.h>
+
+#include <rdma/rdma_cm.h>
+#include <rdma/ib_cm.h>
+#include <rdma/ib_verbs.h>
+#ifdef HAVE_FMR_POOL_API
+#include <rdma/ib_fmr_pool.h>
+#endif
+
+#define DEBUG_SUBSYSTEM S_LND
+
+#include <libcfs/libcfs.h>
+#include <lnet/lib-lnet.h>
+#include "o2iblnd-idl.h"
+
+#define IBLND_PEER_HASH_BITS		7	/* log2 of # peer_ni lists */
+#define IBLND_N_SCHED			2
+#define IBLND_N_SCHED_HIGH		4
+
+struct kib_tunables {
+	int		*kib_dev_failover;	/* HCA failover */
+	unsigned int	*kib_service;		/* IB service number */
+	int		*kib_cksum;		/* checksum struct kib_msg? */
+	int		*kib_timeout;		/* comms timeout (seconds) */
+	int		*kib_keepalive;		/* keepalive timeout (seconds) */
+	char		**kib_default_ipif;	/* default IPoIB interface */
+	int		*kib_retry_count;
+	int		*kib_rnr_retry_count;
+	int		*kib_ib_mtu;		/* IB MTU */
+	int		*kib_require_priv_port;	/* accept only privileged ports */
+	int		*kib_use_priv_port;	/* use privileged port for active connect */
+	/* # threads on each CPT */
+	int		*kib_nscheds;
+	int		*kib_wrq_sge;		/* # sg elements per wrq */
+	int		*kib_use_fastreg_gaps;	/* enable discontiguous fastreg fragment support */
+};
+
+extern struct kib_tunables kiblnd_tunables;
+
+#define IBLND_MSG_QUEUE_SIZE_V1	8	/* V1 only : # messages/RDMAs in-flight */
+#define IBLND_CREDIT_HIGHWATER_V1	7	/* V1 only : when to eagerly return credits */
+
+#define IBLND_CREDITS_DEFAULT	8	/* default # of peer_ni credits */
+#define IBLND_CREDITS_MAX	((typeof(((struct kib_msg *) 0)->ibm_credits)) - 1)	/* Max # of peer_ni credits */
+
+/* when to eagerly return credits */
+#define IBLND_CREDITS_HIGHWATER(t, conn) ((conn->ibc_version) == IBLND_MSG_VERSION_1 ? \
+					IBLND_CREDIT_HIGHWATER_V1 : \
+					min(t->lnd_peercredits_hiw, (__u32)conn->ibc_queue_depth - 1))
+
+#ifdef HAVE_RDMA_CREATE_ID_5ARG
+# define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) \
+	 rdma_create_id((ns) ? 
(ns) : &init_net, cb, dev, ps, qpt) +#else +# ifdef HAVE_RDMA_CREATE_ID_4ARG +# define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) \ + rdma_create_id(cb, dev, ps, qpt) +# else +# define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) \ + rdma_create_id(cb, dev, ps) +# endif +#endif + +/* 2 OOB shall suffice for 1 keepalive and 1 returning credits */ +#define IBLND_OOB_CAPABLE(v) ((v) != IBLND_MSG_VERSION_1) +#define IBLND_OOB_MSGS(v) (IBLND_OOB_CAPABLE(v) ? 2 : 0) + +/* max size of queued messages (inc hdr) */ +#define IBLND_MSG_SIZE (4<<10) +/* max # of fragments supported. + 1 for unaligned case */ +#define IBLND_MAX_RDMA_FRAGS (LNET_MAX_IOV + 1) + +/************************/ +/* derived constants... */ +/* Pools (shared by connections on each CPT) */ +/* These pools can grow at runtime, so don't need give a very large value */ +#define IBLND_TX_POOL 256 +#define IBLND_FMR_POOL 256 +#define IBLND_FMR_POOL_FLUSH 192 + +/* RX messages (per connection) */ +#define IBLND_RX_MSGS(c) \ + ((c->ibc_queue_depth) * 2 + IBLND_OOB_MSGS(c->ibc_version)) +#define IBLND_RX_MSG_BYTES(c) (IBLND_RX_MSGS(c) * IBLND_MSG_SIZE) +#define IBLND_RX_MSG_PAGES(c) \ + ((IBLND_RX_MSG_BYTES(c) + PAGE_SIZE - 1) / PAGE_SIZE) + +/* WRs and CQEs (per connection) */ +#define IBLND_RECV_WRS(c) IBLND_RX_MSGS(c) + +/* 2 = LNet msg + Transfer chain */ +#define IBLND_CQ_ENTRIES(c) (IBLND_RECV_WRS(c) + kiblnd_send_wrs(c)) + +struct kib_hca_dev; + +/* o2iblnd can run over aliased interface */ +#ifdef IFALIASZ +#define KIB_IFNAME_SIZE IFALIASZ +#else +#define KIB_IFNAME_SIZE 256 +#endif + +enum kib_dev_caps { + IBLND_DEV_CAPS_FASTREG_ENABLED = BIT(0), + IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT = BIT(1), +#ifdef HAVE_FMR_POOL_API + IBLND_DEV_CAPS_FMR_ENABLED = BIT(2), +#endif +}; + +#define IS_FAST_REG_DEV(dev) \ + ((dev)->ibd_dev_caps & IBLND_DEV_CAPS_FASTREG_ENABLED) + + +struct kib_dev { + struct list_head ibd_list; /* chain on kib_devs */ + struct list_head ibd_fail_list; /* chain on kib_failed_devs */ + __u32 ibd_ifip; /* IPoIB interface IP */ + /** IPoIB interface name */ + char ibd_ifname[KIB_IFNAME_SIZE]; + int ibd_nnets; /* # nets extant */ + + time64_t ibd_next_failover; + /* # failover failures */ + int ibd_failed_failover; + /* failover in progress */ + unsigned int ibd_failover; + /* IPoIB interface is a bonding master */ + unsigned int ibd_can_failover; + struct list_head ibd_nets; + struct kib_hca_dev *ibd_hdev; + enum kib_dev_caps ibd_dev_caps; +}; + +struct kib_hca_dev { + struct rdma_cm_id *ibh_cmid; /* listener cmid */ + struct ib_device *ibh_ibdev; /* IB device */ + int ibh_page_shift; /* page shift of current HCA */ + int ibh_page_size; /* page size of current HCA */ + __u64 ibh_page_mask; /* page mask of current HCA */ + __u64 ibh_mr_size; /* size of MR */ + int ibh_max_qp_wr; /* maximum work requests size */ +#ifdef HAVE_IB_GET_DMA_MR + struct ib_mr *ibh_mrs; /* global MR */ +#endif + struct ib_pd *ibh_pd; /* PD */ + u8 ibh_port; /* port number */ + struct ib_event_handler + ibh_event_handler; /* IB event handler */ + int ibh_state; /* device status */ +#define IBLND_DEV_PORT_DOWN 0 +#define IBLND_DEV_PORT_ACTIVE 1 +#define IBLND_DEV_FATAL 2 + struct kib_dev *ibh_dev; /* owner */ + atomic_t ibh_ref; /* refcount */ +}; + +/** # of seconds to keep pool alive */ +#define IBLND_POOL_DEADLINE 300 +/** # of seconds to retry if allocation failed */ +#define IBLND_POOL_RETRY 1 + +struct kib_pages { + int ibp_npages; /* # pages */ + struct page *ibp_pages[0]; /* page array */ +}; + +struct kib_pool; +struct kib_poolset; + 
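+/* A pool-set manages a list of pools of pre-allocated descriptors and grows
+ * or shrinks it through the callbacks below: pool_create/pool_destroy act on
+ * whole pools, while node_init/node_fini prepare and tear down individual
+ * descriptors as kiblnd_pool_alloc_node()/kiblnd_pool_free_node() hand them
+ * out and take them back. */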
+typedef int (*kib_ps_pool_create_t)(struct kib_poolset *ps, + int inc, struct kib_pool **pp_po); +typedef void (*kib_ps_pool_destroy_t)(struct kib_pool *po); +typedef void (*kib_ps_node_init_t)(struct kib_pool *po, struct list_head *node); +typedef void (*kib_ps_node_fini_t)(struct kib_pool *po, struct list_head *node); + +struct kib_net; + +#define IBLND_POOL_NAME_LEN 32 + +struct kib_poolset { + /* serialize */ + spinlock_t ps_lock; + /* network it belongs to */ + struct kib_net *ps_net; + /* pool set name */ + char ps_name[IBLND_POOL_NAME_LEN]; + /* list of pools */ + struct list_head ps_pool_list; + /* failed pool list */ + struct list_head ps_failed_pool_list; + /* time stamp for retry if failed to allocate */ + time64_t ps_next_retry; + /* is allocating new pool */ + int ps_increasing; + /* new pool size */ + int ps_pool_size; + /* CPT id */ + int ps_cpt; + + /* create a new pool */ + kib_ps_pool_create_t ps_pool_create; + /* destroy a pool */ + kib_ps_pool_destroy_t ps_pool_destroy; + /* initialize new allocated node */ + kib_ps_node_init_t ps_node_init; + /* finalize node */ + kib_ps_node_fini_t ps_node_fini; +}; + +struct kib_pool { + /* chain on pool list */ + struct list_head po_list; + /* pre-allocated node */ + struct list_head po_free_list; + /* pool_set of this pool */ + struct kib_poolset *po_owner; + /* deadline of this pool */ + time64_t po_deadline; + /* # of elements in use */ + int po_allocated; + /* pool is created on failed HCA */ + int po_failed; + /* # of pre-allocated elements */ + int po_size; +}; + +struct kib_tx_poolset { + struct kib_poolset tps_poolset; /* pool-set */ + __u64 tps_next_tx_cookie; /* cookie of TX */ +}; + +struct kib_tx_pool { + struct kib_pool tpo_pool; /* pool */ + struct kib_hca_dev *tpo_hdev; /* device for this pool */ + struct kib_tx *tpo_tx_descs; /* all the tx descriptors */ + struct kib_pages *tpo_tx_pages; /* premapped tx msg pages */ +}; + +struct kib_fmr_poolset { + spinlock_t fps_lock; /* serialize */ + struct kib_net *fps_net; /* IB network */ + struct list_head fps_pool_list; /* FMR pool list */ + struct list_head fps_failed_pool_list; /* FMR pool list */ + __u64 fps_version; /* validity stamp */ + int fps_cpt; /* CPT id */ + int fps_pool_size; + int fps_flush_trigger; + int fps_cache; + /* is allocating new pool */ + int fps_increasing; + /* time stamp for retry if failed to allocate */ + time64_t fps_next_retry; +}; + +#ifndef HAVE_IB_RDMA_WR +struct ib_rdma_wr { + struct ib_send_wr wr; +}; +#endif + +struct kib_fast_reg_descriptor { /* For fast registration */ + struct list_head frd_list; + struct ib_rdma_wr frd_inv_wr; +#ifdef HAVE_IB_MAP_MR_SG + struct ib_reg_wr frd_fastreg_wr; +#else + struct ib_rdma_wr frd_fastreg_wr; + struct ib_fast_reg_page_list *frd_frpl; +#endif + struct ib_mr *frd_mr; + bool frd_valid; + bool frd_posted; +}; + +struct kib_fmr_pool { + struct list_head fpo_list; /* chain on pool list */ + struct kib_hca_dev *fpo_hdev; /* device for this pool */ + struct kib_fmr_poolset *fpo_owner; /* owner of this pool */ +#ifdef HAVE_FMR_POOL_API + union { + struct { + struct ib_fmr_pool *fpo_fmr_pool; /* IB FMR pool */ + } fmr; +#endif + struct { /* For fast registration */ + struct list_head fpo_pool_list; + int fpo_pool_size; + } fast_reg; +#ifdef HAVE_FMR_POOL_API + }; + bool fpo_is_fmr; /* True if FMR pools allocated */ +#endif + time64_t fpo_deadline; /* deadline of this pool */ + int fpo_failed; /* fmr pool is failed */ + int fpo_map_count; /* # of mapped FMR */ +}; + +struct kib_fmr { + struct kib_fmr_pool 
*fmr_pool; /* pool of FMR */ +#ifdef HAVE_FMR_POOL_API + struct ib_pool_fmr *fmr_pfmr; /* IB pool fmr */ +#endif /* HAVE_FMR_POOL_API */ + struct kib_fast_reg_descriptor *fmr_frd; + u32 fmr_key; +}; + +#ifdef HAVE_FMR_POOL_API + +#ifdef HAVE_ORACLE_OFED_EXTENSIONS +#define kib_fmr_pool_map(pool, pgs, n, iov) \ + ib_fmr_pool_map_phys((pool), (pgs), (n), (iov), NULL) +#else +#define kib_fmr_pool_map(pool, pgs, n, iov) \ + ib_fmr_pool_map_phys((pool), (pgs), (n), (iov)) +#endif + +#endif /* HAVE_FMR_POOL_API */ + +struct kib_net { + /* chain on struct kib_dev::ibd_nets */ + struct list_head ibn_list; + __u64 ibn_incarnation;/* my epoch */ + int ibn_init; /* initialisation state */ + int ibn_shutdown; /* shutting down? */ + + atomic_t ibn_npeers; /* # peers extant */ + atomic_t ibn_nconns; /* # connections extant */ + + struct kib_tx_poolset **ibn_tx_ps; /* tx pool-set */ + struct kib_fmr_poolset **ibn_fmr_ps; /* fmr pool-set */ + + struct kib_dev *ibn_dev; /* underlying IB device */ + struct lnet_ni *ibn_ni; /* LNet interface */ +}; + +#define KIB_THREAD_SHIFT 16 +#define KIB_THREAD_ID(cpt, tid) ((cpt) << KIB_THREAD_SHIFT | (tid)) +#define KIB_THREAD_CPT(id) ((id) >> KIB_THREAD_SHIFT) +#define KIB_THREAD_TID(id) ((id) & ((1UL << KIB_THREAD_SHIFT) - 1)) + +struct kib_sched_info { + /* serialise */ + spinlock_t ibs_lock; + /* schedulers sleep here */ + wait_queue_head_t ibs_waitq; + /* conns to check for rx completions */ + struct list_head ibs_conns; + /* number of scheduler threads */ + int ibs_nthreads; + /* max allowed scheduler threads */ + int ibs_nthreads_max; + int ibs_cpt; /* CPT id */ +}; + +struct kib_data { + int kib_init; /* initialisation state */ + int kib_shutdown; /* shut down? */ + struct list_head kib_devs; /* IB devices extant */ + /* list head of failed devices */ + struct list_head kib_failed_devs; + /* schedulers sleep here */ + wait_queue_head_t kib_failover_waitq; + atomic_t kib_nthreads; /* # live threads */ + /* stabilize net/dev/peer_ni/conn ops */ + rwlock_t kib_global_lock; + /* hash table of all my known peers */ + DECLARE_HASHTABLE(kib_peers, IBLND_PEER_HASH_BITS); + /* the connd task (serialisation assertions) */ + void *kib_connd; + /* connections to setup/teardown */ + struct list_head kib_connd_conns; + /* connections with zero refcount */ + struct list_head kib_connd_zombies; + /* connections to reconnect */ + struct list_head kib_reconn_list; + /* peers wait for reconnection */ + struct list_head kib_reconn_wait; + /* connections wait for completion */ + struct list_head kib_connd_waits; + /* + * The second that peers are pulled out from \a kib_reconn_wait + * for reconnection. + */ + time64_t kib_reconn_sec; + /* connection daemon sleeps here */ + wait_queue_head_t kib_connd_waitq; + spinlock_t kib_connd_lock; /* serialise */ + struct ib_qp_attr kib_error_qpa; /* QP->ERROR */ + /* percpt data for schedulers */ + struct kib_sched_info **kib_scheds; +}; + +#define IBLND_INIT_NOTHING 0 +#define IBLND_INIT_DATA 1 +#define IBLND_INIT_ALL 2 + +struct kib_rx { /* receive message */ + /* queue for attention */ + struct list_head rx_list; + /* owning conn */ + struct kib_conn *rx_conn; + /* # bytes received (-1 while posted) */ + int rx_nob; + /* message buffer (host vaddr) */ + struct kib_msg *rx_msg; + /* message buffer (I/O addr) */ + __u64 rx_msgaddr; + /* for dma_unmap_single() */ + DEFINE_DMA_UNMAP_ADDR(rx_msgunmap); + /* receive work item... 
*/ + struct ib_recv_wr rx_wrq; + /* ...and its memory */ + struct ib_sge rx_sge; +}; + +#define IBLND_POSTRX_DONT_POST 0 /* don't post */ +#define IBLND_POSTRX_NO_CREDIT 1 /* post: no credits */ +#define IBLND_POSTRX_PEER_CREDIT 2 /* post: give peer_ni back 1 credit */ +#define IBLND_POSTRX_RSRVD_CREDIT 3 /* post: give myself back 1 reserved credit */ + +struct kib_tx { /* transmit message */ + /* queue on idle_txs ibc_tx_queue etc. */ + struct list_head tx_list; + /* pool I'm from */ + struct kib_tx_pool *tx_pool; + /* owning conn */ + struct kib_conn *tx_conn; + /* # tx callbacks outstanding */ + short tx_sending; + /* queued for sending */ + short tx_queued; + /* waiting for peer_ni */ + short tx_waiting; + /* LNET completion status */ + int tx_status; + /* health status of the transmit */ + enum lnet_msg_hstatus tx_hstatus; + /* completion deadline */ + ktime_t tx_deadline; + /* completion cookie */ + __u64 tx_cookie; + /* lnet msgs to finalize on completion */ + struct lnet_msg *tx_lntmsg[2]; + /* message buffer (host vaddr) */ + struct kib_msg *tx_msg; + /* message buffer (I/O addr) */ + __u64 tx_msgaddr; + /* for dma_unmap_single() */ + DEFINE_DMA_UNMAP_ADDR(tx_msgunmap); + /* # send work items */ + int tx_nwrq; + /* # used scatter/gather elements */ + int tx_nsge; + /* send work items... */ + struct ib_rdma_wr *tx_wrq; + /* ...and their memory */ + struct ib_sge *tx_sge; + /* rdma descriptor */ + struct kib_rdma_desc *tx_rd; + /* # entries in... */ + int tx_nfrags; + /* dma_map_sg descriptor */ + struct scatterlist *tx_frags; + /* rdma phys page addrs */ + __u64 *tx_pages; + /* gaps in fragments */ + bool tx_gaps; + /* FMR */ + struct kib_fmr tx_fmr; + /* dma direction */ + int tx_dmadir; +}; + +struct kib_connvars { + /* connection-in-progress variables */ + struct kib_msg cv_msg; +}; + +struct kib_conn { + /* scheduler information */ + struct kib_sched_info *ibc_sched; + /* owning peer_ni */ + struct kib_peer_ni *ibc_peer; + /* HCA bound on */ + struct kib_hca_dev *ibc_hdev; + /* stash on peer_ni's conn list */ + struct list_head ibc_list; + /* schedule for attention */ + struct list_head ibc_sched_list; + /* version of connection */ + __u16 ibc_version; + /* reconnect later */ + __u16 ibc_reconnect:1; + /* which instance of the peer */ + __u64 ibc_incarnation; + /* # users */ + atomic_t ibc_refcount; + /* what's happening */ + int ibc_state; + /* # uncompleted sends */ + int ibc_nsends_posted; + /* # uncompleted NOOPs */ + int ibc_noops_posted; + /* # credits I have */ + int ibc_credits; + /* # credits to return */ + int ibc_outstanding_credits; + /* # ACK/DONE msg credits */ + int ibc_reserved_credits; + /* set on comms error */ + int ibc_comms_error; + /* connections queue depth */ + __u16 ibc_queue_depth; + /* connections max frags */ + __u16 ibc_max_frags; + /* count of timeout txs waiting on cq */ + __u16 ibc_waits; + /* receive buffers owned */ + unsigned int ibc_nrx:16; + /* scheduled for attention */ + unsigned int ibc_scheduled:1; + /* CQ callback fired */ + unsigned int ibc_ready:1; + /* time of last send */ + ktime_t ibc_last_send; + /** link chain for kiblnd_check_conns only */ + struct list_head ibc_connd_list; + /** rxs completed before ESTABLISHED */ + struct list_head ibc_early_rxs; + /** IBLND_MSG_NOOPs for IBLND_MSG_VERSION_1 */ + struct list_head ibc_tx_noops; + /* sends that need a credit */ + struct list_head ibc_tx_queue; + /* sends that don't need a credit */ + struct list_head ibc_tx_queue_nocred; + /* sends that need to reserve an ACK/DONE msg */ + 
struct list_head ibc_tx_queue_rsrvd; + /* active tx awaiting completion */ + struct list_head ibc_active_txs; + /* zombie tx awaiting done */ + struct list_head ibc_zombie_txs; + /* serialise */ + spinlock_t ibc_lock; + /* the rx descs */ + struct kib_rx *ibc_rxs; + /* premapped rx msg pages */ + struct kib_pages *ibc_rx_pages; + + /* CM id */ + struct rdma_cm_id *ibc_cmid; + /* completion queue */ + struct ib_cq *ibc_cq; + + /* in-progress connection state */ + struct kib_connvars *ibc_connvars; +}; + +#define IBLND_CONN_INIT 0 /* being initialised */ +#define IBLND_CONN_ACTIVE_CONNECT 1 /* active sending req */ +#define IBLND_CONN_PASSIVE_WAIT 2 /* passive waiting for rtu */ +#define IBLND_CONN_ESTABLISHED 3 /* connection established */ +#define IBLND_CONN_CLOSING 4 /* being closed */ +#define IBLND_CONN_DISCONNECTED 5 /* disconnected */ + +struct kib_peer_ni { + /* on peer_ni hash chain */ + struct hlist_node ibp_list; + /* who's on the other end(s) */ + lnet_nid_t ibp_nid; + /* LNet interface */ + struct lnet_ni *ibp_ni; + /* all active connections */ + struct list_head ibp_conns; + /* next connection to send on for round robin */ + struct kib_conn *ibp_next_conn; + /* msgs waiting for a conn */ + struct list_head ibp_tx_queue; + /* incarnation of peer_ni */ + __u64 ibp_incarnation; + /* when (in seconds) I was last alive */ + time64_t ibp_last_alive; + /* # users */ + struct kref ibp_kref; + /* version of peer_ni */ + __u16 ibp_version; + /* current passive connection attempts */ + unsigned short ibp_accepting; + /* current active connection attempts */ + unsigned short ibp_connecting; + /* reconnect this peer_ni later */ + unsigned char ibp_reconnecting; + /* counter of how many times we triggered a conn race */ + unsigned char ibp_races; + /* # consecutive reconnection attempts to this peer */ + unsigned int ibp_reconnected; + /* errno on closing this peer_ni */ + int ibp_error; + /* max map_on_demand */ + __u16 ibp_max_frags; + /* max_peer_credits */ + __u16 ibp_queue_depth; + /* reduced value which allows conn to be created if max fails */ + __u16 ibp_queue_depth_mod; +}; + +#ifndef HAVE_IB_INC_RKEY +/** + * ib_inc_rkey - increments the key portion of the given rkey. Can be used + * for calculating a new rkey for type 2 memory windows. + * @rkey - the rkey to increment. + */ +static inline u32 ib_inc_rkey(u32 rkey) +{ + const u32 mask = 0x000000ff; + return ((rkey + 1) & mask) | (rkey & ~mask); +} +#endif + +extern struct kib_data kiblnd_data; + +extern void kiblnd_hdev_destroy(struct kib_hca_dev *hdev); + +int kiblnd_msg_queue_size(int version, struct lnet_ni *ni); + +static inline int kiblnd_timeout(void) +{ + return *kiblnd_tunables.kib_timeout ? 
*kiblnd_tunables.kib_timeout : + lnet_get_lnd_timeout(); +} + +static inline int +kiblnd_concurrent_sends(int version, struct lnet_ni *ni) +{ + struct lnet_ioctl_config_o2iblnd_tunables *tunables; + int concurrent_sends; + + tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; + concurrent_sends = tunables->lnd_concurrent_sends; + + if (version == IBLND_MSG_VERSION_1) { + if (concurrent_sends > IBLND_MSG_QUEUE_SIZE_V1 * 2) + return IBLND_MSG_QUEUE_SIZE_V1 * 2; + + if (concurrent_sends < IBLND_MSG_QUEUE_SIZE_V1 / 2) + return IBLND_MSG_QUEUE_SIZE_V1 / 2; + } + + return concurrent_sends; +} + +static inline void +kiblnd_hdev_addref_locked(struct kib_hca_dev *hdev) +{ + LASSERT(atomic_read(&hdev->ibh_ref) > 0); + atomic_inc(&hdev->ibh_ref); +} + +static inline void +kiblnd_hdev_decref(struct kib_hca_dev *hdev) +{ + LASSERT(atomic_read(&hdev->ibh_ref) > 0); + if (atomic_dec_and_test(&hdev->ibh_ref)) + kiblnd_hdev_destroy(hdev); +} + +static inline int +kiblnd_dev_can_failover(struct kib_dev *dev) +{ + if (!list_empty(&dev->ibd_fail_list)) /* already scheduled */ + return 0; + + if (*kiblnd_tunables.kib_dev_failover == 0) /* disabled */ + return 0; + + if (*kiblnd_tunables.kib_dev_failover > 1) /* force failover */ + return 1; + + return dev->ibd_can_failover; +} + +#define kiblnd_conn_addref(conn) \ +do { \ + CDEBUG(D_NET, "conn[%p] (%d)++\n", \ + (conn), atomic_read(&(conn)->ibc_refcount)); \ + atomic_inc(&(conn)->ibc_refcount); \ +} while (0) + +#define kiblnd_conn_decref(conn) \ +do { \ + unsigned long flags; \ + \ + CDEBUG(D_NET, "conn[%p] (%d)--\n", \ + (conn), atomic_read(&(conn)->ibc_refcount)); \ + LASSERT_ATOMIC_POS(&(conn)->ibc_refcount); \ + if (atomic_dec_and_test(&(conn)->ibc_refcount)) { \ + spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); \ + list_add_tail(&(conn)->ibc_list, \ + &kiblnd_data.kib_connd_zombies); \ + wake_up(&kiblnd_data.kib_connd_waitq); \ + spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);\ + } \ +} while (0) + +void kiblnd_destroy_peer(struct kref *kref); + +static inline void kiblnd_peer_addref(struct kib_peer_ni *peer_ni) +{ + CDEBUG(D_NET, "peer_ni[%p] -> %s (%d)++\n", + peer_ni, libcfs_nid2str(peer_ni->ibp_nid), + kref_read(&peer_ni->ibp_kref)); + kref_get(&(peer_ni)->ibp_kref); +} + +static inline void kiblnd_peer_decref(struct kib_peer_ni *peer_ni) +{ + CDEBUG(D_NET, "peer_ni[%p] -> %s (%d)--\n", + peer_ni, libcfs_nid2str(peer_ni->ibp_nid), + kref_read(&peer_ni->ibp_kref)); + kref_put(&peer_ni->ibp_kref, kiblnd_destroy_peer); +} + +static inline bool +kiblnd_peer_connecting(struct kib_peer_ni *peer_ni) +{ + return peer_ni->ibp_connecting != 0 || + peer_ni->ibp_reconnecting != 0 || + peer_ni->ibp_accepting != 0; +} + +static inline bool +kiblnd_peer_idle(struct kib_peer_ni *peer_ni) +{ + return !kiblnd_peer_connecting(peer_ni) && list_empty(&peer_ni->ibp_conns); +} + +static inline int +kiblnd_peer_active(struct kib_peer_ni *peer_ni) +{ + /* Am I in the peer_ni hash table? 
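(it is unhashed by kiblnd_unlink_peer_locked() when it goes idle) 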
*/ + return !hlist_unhashed(&peer_ni->ibp_list); +} + +static inline struct kib_conn * +kiblnd_get_conn_locked(struct kib_peer_ni *peer_ni) +{ + struct list_head *next; + + LASSERT(!list_empty(&peer_ni->ibp_conns)); + + /* Advance to next connection, be sure to skip the head node */ + if (!peer_ni->ibp_next_conn || + peer_ni->ibp_next_conn->ibc_list.next == &peer_ni->ibp_conns) + next = peer_ni->ibp_conns.next; + else + next = peer_ni->ibp_next_conn->ibc_list.next; + peer_ni->ibp_next_conn = list_entry(next, struct kib_conn, ibc_list); + + return peer_ni->ibp_next_conn; +} + +static inline int +kiblnd_send_keepalive(struct kib_conn *conn) +{ + s64 keepalive_ns = *kiblnd_tunables.kib_keepalive * NSEC_PER_SEC; + + return (*kiblnd_tunables.kib_keepalive > 0) && + ktime_after(ktime_get(), + ktime_add_ns(conn->ibc_last_send, keepalive_ns)); +} + +static inline int +kiblnd_need_noop(struct kib_conn *conn) +{ + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + struct lnet_ioctl_config_o2iblnd_tunables *tunables; + + LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); + tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; + + if (conn->ibc_outstanding_credits < + IBLND_CREDITS_HIGHWATER(tunables, conn) && + !kiblnd_send_keepalive(conn)) + return 0; /* No need to send NOOP */ + + if (IBLND_OOB_CAPABLE(conn->ibc_version)) { + if (!list_empty(&conn->ibc_tx_queue_nocred)) + return 0; /* NOOP can be piggybacked */ + + /* No tx to piggyback NOOP onto or no credit to send a tx */ + return (list_empty(&conn->ibc_tx_queue) || + conn->ibc_credits == 0); + } + + if (!list_empty(&conn->ibc_tx_noops) || /* NOOP already queued */ + !list_empty(&conn->ibc_tx_queue_nocred) || /* piggyback NOOP */ + conn->ibc_credits == 0) /* no credit */ + return 0; + + if (conn->ibc_credits == 1 && /* last credit reserved for */ + conn->ibc_outstanding_credits == 0) /* giving back credits */ + return 0; + + /* No tx to piggyback NOOP onto or no credit to send a tx */ + return (list_empty(&conn->ibc_tx_queue) || conn->ibc_credits == 1); +} + +static inline void +kiblnd_abort_receives(struct kib_conn *conn) +{ + ib_modify_qp(conn->ibc_cmid->qp, + &kiblnd_data.kib_error_qpa, IB_QP_STATE); +} + +static inline const char * +kiblnd_queue2str(struct kib_conn *conn, struct list_head *q) +{ + if (q == &conn->ibc_tx_queue) + return "tx_queue"; + + if (q == &conn->ibc_tx_queue_rsrvd) + return "tx_queue_rsrvd"; + + if (q == &conn->ibc_tx_queue_nocred) + return "tx_queue_nocred"; + + if (q == &conn->ibc_active_txs) + return "active_txs"; + + LBUG(); + return NULL; +} + +/* CAVEAT EMPTOR: We rely on descriptor alignment to allow us to use the + * lowest bits of the work request id to stash the work item type. 
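+ * kiblnd_ptr2wreqid() packs the type into the low bits and
+ * kiblnd_wreqid2ptr() masks it back out on completion.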
*/ + +#define IBLND_WID_INVAL 0 +#define IBLND_WID_TX 1 +#define IBLND_WID_RX 2 +#define IBLND_WID_RDMA 3 +#define IBLND_WID_MR 4 +#define IBLND_WID_MASK 7UL + +static inline __u64 +kiblnd_ptr2wreqid (void *ptr, int type) +{ + unsigned long lptr = (unsigned long)ptr; + + LASSERT ((lptr & IBLND_WID_MASK) == 0); + LASSERT ((type & ~IBLND_WID_MASK) == 0); + return (__u64)(lptr | type); +} + +static inline void * +kiblnd_wreqid2ptr (__u64 wreqid) +{ + return (void *)(((unsigned long)wreqid) & ~IBLND_WID_MASK); +} + +static inline int +kiblnd_wreqid2type (__u64 wreqid) +{ + return (wreqid & IBLND_WID_MASK); +} + +static inline void +kiblnd_set_conn_state(struct kib_conn *conn, int state) +{ + conn->ibc_state = state; + smp_mb(); +} + +static inline void +kiblnd_init_msg(struct kib_msg *msg, int type, int body_nob) +{ + msg->ibm_type = type; + msg->ibm_nob = offsetof(struct kib_msg, ibm_u) + body_nob; +} + +static inline int +kiblnd_rd_size(struct kib_rdma_desc *rd) +{ + int i; + int size; + + for (i = size = 0; i < rd->rd_nfrags; i++) + size += rd->rd_frags[i].rf_nob; + + return size; +} + +static inline __u64 +kiblnd_rd_frag_addr(struct kib_rdma_desc *rd, int index) +{ + return rd->rd_frags[index].rf_addr; +} + +static inline int +kiblnd_rd_frag_size(struct kib_rdma_desc *rd, int index) +{ + return rd->rd_frags[index].rf_nob; +} + +static inline __u32 +kiblnd_rd_frag_key(struct kib_rdma_desc *rd, int index) +{ + return rd->rd_key; +} + +static inline int +kiblnd_rd_consume_frag(struct kib_rdma_desc *rd, int index, __u32 nob) +{ + if (nob < rd->rd_frags[index].rf_nob) { + rd->rd_frags[index].rf_addr += nob; + rd->rd_frags[index].rf_nob -= nob; + } else { + index ++; + } + + return index; +} + +static inline int +kiblnd_rd_msg_size(struct kib_rdma_desc *rd, int msgtype, int n) +{ + LASSERT (msgtype == IBLND_MSG_GET_REQ || + msgtype == IBLND_MSG_PUT_ACK); + + return msgtype == IBLND_MSG_GET_REQ ? 
+		offsetof(struct kib_get_msg, ibgm_rd.rd_frags[n]) :
+		offsetof(struct kib_putack_msg, ibpam_rd.rd_frags[n]);
+}
+
+static inline __u64
+kiblnd_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+	return ib_dma_mapping_error(dev, dma_addr);
+}
+
+static inline __u64 kiblnd_dma_map_single(struct ib_device *dev,
+					  void *msg, size_t size,
+					  enum dma_data_direction direction)
+{
+	return ib_dma_map_single(dev, msg, size, direction);
+}
+
+static inline void kiblnd_dma_unmap_single(struct ib_device *dev,
+					   __u64 addr, size_t size,
+					   enum dma_data_direction direction)
+{
+	ib_dma_unmap_single(dev, addr, size, direction);
+}
+
+#define KIBLND_UNMAP_ADDR_SET(p, m, a)	do {} while (0)
+#define KIBLND_UNMAP_ADDR(p, m, a)	(a)
+
+static inline int kiblnd_dma_map_sg(struct kib_hca_dev *hdev,
+				    struct scatterlist *sg, int nents,
+				    enum dma_data_direction direction)
+{
+	int count;
+
+	count = lnet_rdma_map_sg_attrs(hdev->ibh_ibdev->dma_device,
+				       sg, nents, direction);
+
+	if (count != 0)
+		return count;
+
+	return ib_dma_map_sg(hdev->ibh_ibdev, sg, nents, direction);
+}
+
+static inline void kiblnd_dma_unmap_sg(struct kib_hca_dev *hdev,
+				       struct scatterlist *sg, int nents,
+				       enum dma_data_direction direction)
+{
+	int count;
+
+	count = lnet_rdma_unmap_sg(hdev->ibh_ibdev->dma_device,
+				   sg, nents, direction);
+	if (count != 0)
+		return;
+
+	ib_dma_unmap_sg(hdev->ibh_ibdev, sg, nents, direction);
+}
+
+#ifndef HAVE_IB_SG_DMA_ADDRESS
+#include <linux/scatterlist.h>
+#define ib_sg_dma_address(dev, sg)	sg_dma_address(sg)
+#define ib_sg_dma_len(dev, sg)		sg_dma_len(sg)
+#endif
+
+static inline __u64 kiblnd_sg_dma_address(struct ib_device *dev,
+					  struct scatterlist *sg)
+{
+	return ib_sg_dma_address(dev, sg);
+}
+
+static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev,
+					     struct scatterlist *sg)
+{
+	return ib_sg_dma_len(dev, sg);
+}
+
+#ifndef HAVE_RDMA_CONNECT_LOCKED
+#define rdma_connect_locked(cmid, cpp)	rdma_connect(cmid, cpp)
+#endif
+
+/* XXX We use KIBLND_CONN_PARAM(e) as a writable buffer; this is not strictly
+ * right because OFED 1.2 defines it as const, so we have to add a
+ * (void *) cast to overcome "const" */
+
+#define KIBLND_CONN_PARAM(e)		((e)->param.conn.private_data)
+#define KIBLND_CONN_PARAM_LEN(e)	((e)->param.conn.private_data_len)
+
+void kiblnd_abort_txs(struct kib_conn *conn, struct list_head *txs);
+void kiblnd_map_rx_descs(struct kib_conn *conn);
+void kiblnd_unmap_rx_descs(struct kib_conn *conn);
+void kiblnd_pool_free_node(struct kib_pool *pool, struct list_head *node);
+struct list_head *kiblnd_pool_alloc_node(struct kib_poolset *ps);
+
+int kiblnd_fmr_pool_map(struct kib_fmr_poolset *fps, struct kib_tx *tx,
+			struct kib_rdma_desc *rd, u32 nob, u64 iov,
+			struct kib_fmr *fmr);
+void kiblnd_fmr_pool_unmap(struct kib_fmr *fmr, int status);
+
+int kiblnd_tunables_setup(struct lnet_ni *ni);
+int kiblnd_tunables_init(void);
+
+int kiblnd_connd(void *arg);
+int kiblnd_scheduler(void *arg);
+#define kiblnd_thread_start(fn, data, namefmt, arg...) 
\ + ({ \ + struct task_struct *__task = kthread_run(fn, data, \ + namefmt, ##arg); \ + if (!IS_ERR(__task)) \ + atomic_inc(&kiblnd_data.kib_nthreads); \ + PTR_ERR_OR_ZERO(__task); \ + }) + +int kiblnd_failover_thread (void *arg); + +int kiblnd_alloc_pages(struct kib_pages **pp, int cpt, int npages); + +int kiblnd_cm_callback(struct rdma_cm_id *cmid, + struct rdma_cm_event *event); +int kiblnd_translate_mtu(int value); + +int kiblnd_dev_failover(struct kib_dev *dev, struct net *ns); +int kiblnd_create_peer(struct lnet_ni *ni, struct kib_peer_ni **peerp, + lnet_nid_t nid); +bool kiblnd_reconnect_peer(struct kib_peer_ni *peer); +void kiblnd_destroy_dev(struct kib_dev *dev); +void kiblnd_unlink_peer_locked(struct kib_peer_ni *peer_ni); +struct kib_peer_ni *kiblnd_find_peer_locked(struct lnet_ni *ni, lnet_nid_t nid); +int kiblnd_close_stale_conns_locked(struct kib_peer_ni *peer_ni, + int version, u64 incarnation); +int kiblnd_close_peer_conns_locked(struct kib_peer_ni *peer_ni, int why); + +struct kib_conn *kiblnd_create_conn(struct kib_peer_ni *peer_ni, + struct rdma_cm_id *cmid, + int state, int version); +void kiblnd_destroy_conn(struct kib_conn *conn); +void kiblnd_close_conn(struct kib_conn *conn, int error); +void kiblnd_close_conn_locked(struct kib_conn *conn, int error); + +void kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid); +void kiblnd_txlist_done(struct list_head *txlist, int status, + enum lnet_msg_hstatus hstatus); + +void kiblnd_qp_event(struct ib_event *event, void *arg); +void kiblnd_cq_event(struct ib_event *event, void *arg); +void kiblnd_cq_completion(struct ib_cq *cq, void *arg); + +void kiblnd_pack_msg(struct lnet_ni *ni, struct kib_msg *msg, int version, + int credits, lnet_nid_t dstnid, __u64 dststamp); +int kiblnd_unpack_msg(struct kib_msg *msg, int nob); +int kiblnd_post_rx(struct kib_rx *rx, int credit); + +int kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg); +int kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, + int delayed, unsigned int niov, + struct bio_vec *kiov, unsigned int offset, unsigned int mlen, + unsigned int rlen); +unsigned int kiblnd_get_dev_prio(struct lnet_ni *ni, unsigned int dev_idx); + + diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c new file mode 100644 index 0000000000000..75895d69b080a --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -0,0 +1,4021 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/klnds/o2iblnd/o2iblnd_cb.c + * + * Author: Eric Barton + */ + +#include "o2iblnd.h" + +#define MAX_CONN_RACES_BEFORE_ABORT 20 + +static void kiblnd_peer_alive(struct kib_peer_ni *peer_ni); +static void kiblnd_peer_connect_failed(struct kib_peer_ni *peer_ni, int active, + int error); +static struct ib_rdma_wr * +kiblnd_init_tx_msg_payload(struct lnet_ni *ni, struct kib_tx *tx, + int type, int body_nob, int payload_nob); +#define kiblnd_init_tx_msg(ni, tx, type, body) \ + kiblnd_init_tx_msg_payload(ni, tx, type, body, 0) +static int kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type, + int resid, struct kib_rdma_desc *dstrd, u64 dstcookie); +static void kiblnd_queue_tx_locked(struct kib_tx *tx, struct kib_conn *conn); +static void kiblnd_queue_tx(struct kib_tx *tx, struct kib_conn *conn); + +static void kiblnd_unmap_tx(struct kib_tx *tx); +static void kiblnd_check_sends_locked(struct kib_conn *conn); + +void +kiblnd_tx_done(struct kib_tx *tx) +{ + struct lnet_msg *lntmsg[2]; + int rc; + int i; + + LASSERT (!in_interrupt()); + LASSERT (!tx->tx_queued); /* mustn't be queued for sending */ + LASSERT (tx->tx_sending == 0); /* mustn't be awaiting sent callback */ + LASSERT (!tx->tx_waiting); /* mustn't be awaiting peer_ni response */ + LASSERT (tx->tx_pool != NULL); + + kiblnd_unmap_tx(tx); + + /* tx may have up to 2 lnet msgs to finalise */ + lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL; + lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL; + rc = tx->tx_status; + + if (tx->tx_conn != NULL) { + kiblnd_conn_decref(tx->tx_conn); + tx->tx_conn = NULL; + } + + tx->tx_nwrq = tx->tx_nsge = 0; + tx->tx_status = 0; + + kiblnd_pool_free_node(&tx->tx_pool->tpo_pool, &tx->tx_list); + + /* delay finalize until my descs have been freed */ + for (i = 0; i < 2; i++) { + if (lntmsg[i] == NULL) + continue; + + /* propagate health status to LNet for requests */ + if (i == 0 && lntmsg[i]) + lntmsg[i]->msg_health_status = tx->tx_hstatus; + + lnet_finalize(lntmsg[i], rc); + } +} + +void +kiblnd_txlist_done(struct list_head *txlist, int status, + enum lnet_msg_hstatus hstatus) +{ + struct kib_tx *tx; + + while ((tx = list_first_entry_or_null(txlist, + struct kib_tx, + tx_list)) != NULL) { + list_del(&tx->tx_list); + /* complete now */ + tx->tx_waiting = 0; + tx->tx_status = status; + if (hstatus != LNET_MSG_STATUS_OK) + tx->tx_hstatus = hstatus; + kiblnd_tx_done(tx); + } +} + +static struct kib_tx * +kiblnd_get_idle_tx(struct lnet_ni *ni, lnet_nid_t target) +{ + struct kib_net *net = ni->ni_data; + struct list_head *node; + struct kib_tx *tx; + struct kib_tx_poolset *tps; + + tps = net->ibn_tx_ps[lnet_cpt_of_nid(target, ni)]; + node = kiblnd_pool_alloc_node(&tps->tps_poolset); + if (node == NULL) + return NULL; + tx = container_of(node, struct kib_tx, tx_list); + + LASSERT (tx->tx_nwrq == 0); + LASSERT (!tx->tx_queued); + LASSERT (tx->tx_sending == 0); + LASSERT (!tx->tx_waiting); + LASSERT (tx->tx_status == 0); + LASSERT (tx->tx_conn == NULL); + LASSERT (tx->tx_lntmsg[0] == NULL); + LASSERT (tx->tx_lntmsg[1] == NULL); + LASSERT (tx->tx_nfrags == 0); + + tx->tx_gaps = false; + tx->tx_hstatus = LNET_MSG_STATUS_OK; + + return tx; +} + +static void +kiblnd_drop_rx(struct kib_rx *rx) +{ + struct kib_conn *conn = rx->rx_conn; + struct kib_sched_info *sched = conn->ibc_sched; + unsigned long flags; + + spin_lock_irqsave(&sched->ibs_lock, flags); + LASSERT(conn->ibc_nrx > 0); + conn->ibc_nrx--; + 
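+	/* this rx's receive buffer is no longer posted on the conn */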
spin_unlock_irqrestore(&sched->ibs_lock, flags); + + kiblnd_conn_decref(conn); +} + +int +kiblnd_post_rx(struct kib_rx *rx, int credit) +{ + struct kib_conn *conn = rx->rx_conn; + struct kib_net *net = conn->ibc_peer->ibp_ni->ni_data; + struct ib_recv_wr *bad_wrq = NULL; +#ifdef HAVE_IB_GET_DMA_MR + struct ib_mr *mr = conn->ibc_hdev->ibh_mrs; +#endif + int rc; + + LASSERT (net != NULL); + LASSERT (!in_interrupt()); + LASSERT (credit == IBLND_POSTRX_NO_CREDIT || + credit == IBLND_POSTRX_PEER_CREDIT || + credit == IBLND_POSTRX_RSRVD_CREDIT); +#ifdef HAVE_IB_GET_DMA_MR + LASSERT(mr != NULL); + + rx->rx_sge.lkey = mr->lkey; +#else + rx->rx_sge.lkey = conn->ibc_hdev->ibh_pd->local_dma_lkey; +#endif + rx->rx_sge.addr = rx->rx_msgaddr; + rx->rx_sge.length = IBLND_MSG_SIZE; + + rx->rx_wrq.next = NULL; + rx->rx_wrq.sg_list = &rx->rx_sge; + rx->rx_wrq.num_sge = 1; + rx->rx_wrq.wr_id = kiblnd_ptr2wreqid(rx, IBLND_WID_RX); + + LASSERT (conn->ibc_state >= IBLND_CONN_INIT); + LASSERT (rx->rx_nob >= 0); /* not posted */ + + if (conn->ibc_state > IBLND_CONN_ESTABLISHED) { + kiblnd_drop_rx(rx); /* No more posts for this rx */ + return 0; + } + + rx->rx_nob = -1; /* flag posted */ + + /* NB: need an extra reference after ib_post_recv because we don't + * own this rx (and rx::rx_conn) anymore, LU-5678. + */ + kiblnd_conn_addref(conn); +#ifdef HAVE_IB_POST_SEND_RECV_CONST + rc = ib_post_recv(conn->ibc_cmid->qp, &rx->rx_wrq, + (const struct ib_recv_wr **)&bad_wrq); +#else + rc = ib_post_recv(conn->ibc_cmid->qp, &rx->rx_wrq, &bad_wrq); +#endif + if (unlikely(rc != 0)) { + CERROR("Can't post rx for %s: %d, bad_wrq: %p\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc, bad_wrq); + rx->rx_nob = 0; + } + + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) /* Initial post */ + goto out; + + if (unlikely(rc != 0)) { + kiblnd_close_conn(conn, rc); + kiblnd_drop_rx(rx); /* No more posts for this rx */ + goto out; + } + + if (credit == IBLND_POSTRX_NO_CREDIT) + goto out; + + spin_lock(&conn->ibc_lock); + if (credit == IBLND_POSTRX_PEER_CREDIT) + conn->ibc_outstanding_credits++; + else + conn->ibc_reserved_credits++; + kiblnd_check_sends_locked(conn); + spin_unlock(&conn->ibc_lock); + +out: + kiblnd_conn_decref(conn); + return rc; +} + +static struct kib_tx * +kiblnd_find_waiting_tx_locked(struct kib_conn *conn, int txtype, u64 cookie) +{ + struct kib_tx *tx; + + list_for_each_entry(tx, &conn->ibc_active_txs, tx_list) { + LASSERT(!tx->tx_queued); + LASSERT(tx->tx_sending != 0 || tx->tx_waiting); + + if (tx->tx_cookie != cookie) + continue; + + if (tx->tx_waiting && + tx->tx_msg->ibm_type == txtype) + return tx; + + CWARN("Bad completion: %swaiting, type %x (wanted %x)\n", + tx->tx_waiting ? "" : "NOT ", + tx->tx_msg->ibm_type, txtype); + } + return NULL; +} + +static void +kiblnd_handle_completion(struct kib_conn *conn, int txtype, int status, u64 cookie) +{ + struct kib_tx *tx; + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + int idle; + + spin_lock(&conn->ibc_lock); + + tx = kiblnd_find_waiting_tx_locked(conn, txtype, cookie); + if (tx == NULL) { + spin_unlock(&conn->ibc_lock); + + CWARN("Unmatched completion type %x cookie %#llx from %s\n", + txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kiblnd_close_conn(conn, -EPROTO); + return; + } + + if (tx->tx_status == 0) { /* success so far */ + if (status < 0) { /* failed? 
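(the completion carried an error status) 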
*/ + if (status == -ECONNABORTED) { + CDEBUG(D_NET, "bad status for connection to %s " + "with completion type %x\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + txtype); + } + + tx->tx_status = status; + tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_ERROR; + } else if (txtype == IBLND_MSG_GET_REQ) { + lnet_set_reply_msg_len(ni, tx->tx_lntmsg[1], status); + } + } + + tx->tx_waiting = 0; + + idle = !tx->tx_queued && (tx->tx_sending == 0); + if (idle) + list_del(&tx->tx_list); + + spin_unlock(&conn->ibc_lock); + + if (idle) + kiblnd_tx_done(tx); +} + +static void +kiblnd_send_completion(struct kib_conn *conn, int type, int status, u64 cookie) +{ + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + struct kib_tx *tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); + + if (tx == NULL) { + CERROR("Can't get tx for completion %x for %s\n", + type, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + return; + } + + tx->tx_msg->ibm_u.completion.ibcm_status = status; + tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie; + kiblnd_init_tx_msg(ni, tx, type, sizeof(struct kib_completion_msg)); + + kiblnd_queue_tx(tx, conn); +} + +static void +kiblnd_handle_rx(struct kib_rx *rx) +{ + struct kib_msg *msg = rx->rx_msg; + struct kib_conn *conn = rx->rx_conn; + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + int credits = msg->ibm_credits; + struct kib_tx *tx; + int rc = 0; + int rc2; + int post_credit; + struct lnet_hdr hdr; + struct lnet_nid srcnid; + + LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED); + + CDEBUG (D_NET, "Received %x[%d] from %s\n", + msg->ibm_type, credits, + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + + if (credits != 0) { + /* Have I received credits that will let me send? */ + spin_lock(&conn->ibc_lock); + + if (conn->ibc_credits + credits > + conn->ibc_queue_depth) { + rc2 = conn->ibc_credits; + spin_unlock(&conn->ibc_lock); + + CERROR("Bad credits from %s: %d + %d > %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + rc2, credits, + conn->ibc_queue_depth); + + kiblnd_close_conn(conn, -EPROTO); + kiblnd_post_rx(rx, IBLND_POSTRX_NO_CREDIT); + return; + } + + conn->ibc_credits += credits; + + /* This ensures the credit taken by NOOP can be returned */ + if (msg->ibm_type == IBLND_MSG_NOOP && + !IBLND_OOB_CAPABLE(conn->ibc_version)) /* v1 only */ + conn->ibc_outstanding_credits++; + + kiblnd_check_sends_locked(conn); + spin_unlock(&conn->ibc_lock); + } + + switch (msg->ibm_type) { + default: + CERROR("Bad IBLND message type %x from %s\n", + msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + post_credit = IBLND_POSTRX_NO_CREDIT; + rc = -EPROTO; + break; + + case IBLND_MSG_NOOP: + if (IBLND_OOB_CAPABLE(conn->ibc_version)) { + post_credit = IBLND_POSTRX_NO_CREDIT; + break; + } + + if (credits != 0) /* credit already posted */ + post_credit = IBLND_POSTRX_NO_CREDIT; + else /* a keepalive NOOP */ + post_credit = IBLND_POSTRX_PEER_CREDIT; + break; + + case IBLND_MSG_IMMEDIATE: + post_credit = IBLND_POSTRX_DONT_POST; + lnet_hdr_from_nid4(&hdr, &msg->ibm_u.immediate.ibim_hdr); + lnet_nid4_to_nid(msg->ibm_srcnid, &srcnid); + rc = lnet_parse(ni, &hdr, &srcnid, rx, 0); + if (rc < 0) /* repost on error */ + post_credit = IBLND_POSTRX_PEER_CREDIT; + break; + + case IBLND_MSG_PUT_REQ: + post_credit = IBLND_POSTRX_DONT_POST; + lnet_hdr_from_nid4(&hdr, &msg->ibm_u.putreq.ibprm_hdr); + lnet_nid4_to_nid(msg->ibm_srcnid, &srcnid); + rc = lnet_parse(ni, &hdr, &srcnid, rx, 1); + if (rc < 0) /* repost on error */ + post_credit = IBLND_POSTRX_PEER_CREDIT; + break; + + case IBLND_MSG_PUT_NAK: + CWARN ("PUT_NACK from 
%s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + post_credit = IBLND_POSTRX_RSRVD_CREDIT; + kiblnd_handle_completion(conn, IBLND_MSG_PUT_REQ, + msg->ibm_u.completion.ibcm_status, + msg->ibm_u.completion.ibcm_cookie); + break; + + case IBLND_MSG_PUT_ACK: + post_credit = IBLND_POSTRX_RSRVD_CREDIT; + + spin_lock(&conn->ibc_lock); + tx = kiblnd_find_waiting_tx_locked(conn, IBLND_MSG_PUT_REQ, + msg->ibm_u.putack.ibpam_src_cookie); + if (tx != NULL) + list_del(&tx->tx_list); + spin_unlock(&conn->ibc_lock); + + if (tx == NULL) { + CERROR("Unmatched PUT_ACK from %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + rc = -EPROTO; + break; + } + + LASSERT (tx->tx_waiting); + /* CAVEAT EMPTOR: I could be racing with tx_complete, but... + * (a) I can overwrite tx_msg since my peer_ni has received it! + * (b) tx_waiting set tells tx_complete() it's not done. */ + + tx->tx_nwrq = tx->tx_nsge = 0; /* overwrite PUT_REQ */ + + rc2 = kiblnd_init_rdma(conn, tx, IBLND_MSG_PUT_DONE, + kiblnd_rd_size(&msg->ibm_u.putack.ibpam_rd), + &msg->ibm_u.putack.ibpam_rd, + msg->ibm_u.putack.ibpam_dst_cookie); + if (rc2 < 0) + CERROR("Can't setup rdma for PUT to %s: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2); + + spin_lock(&conn->ibc_lock); + tx->tx_waiting = 0; /* clear waiting and queue atomically */ + kiblnd_queue_tx_locked(tx, conn); + spin_unlock(&conn->ibc_lock); + break; + + case IBLND_MSG_PUT_DONE: + post_credit = IBLND_POSTRX_PEER_CREDIT; + kiblnd_handle_completion(conn, IBLND_MSG_PUT_ACK, + msg->ibm_u.completion.ibcm_status, + msg->ibm_u.completion.ibcm_cookie); + break; + + case IBLND_MSG_GET_REQ: + post_credit = IBLND_POSTRX_DONT_POST; + lnet_hdr_from_nid4(&hdr, &msg->ibm_u.get.ibgm_hdr); + lnet_nid4_to_nid(msg->ibm_srcnid, &srcnid); + rc = lnet_parse(ni, &hdr, &srcnid, rx, 1); + if (rc < 0) /* repost on error */ + post_credit = IBLND_POSTRX_PEER_CREDIT; + break; + + case IBLND_MSG_GET_DONE: + post_credit = IBLND_POSTRX_RSRVD_CREDIT; + kiblnd_handle_completion(conn, IBLND_MSG_GET_REQ, + msg->ibm_u.completion.ibcm_status, + msg->ibm_u.completion.ibcm_cookie); + break; + } + + if (rc < 0) /* protocol error */ + kiblnd_close_conn(conn, rc); + + if (post_credit != IBLND_POSTRX_DONT_POST) + kiblnd_post_rx(rx, post_credit); +} + +static void +kiblnd_rx_complete(struct kib_rx *rx, int status, int nob) +{ + struct kib_msg *msg = rx->rx_msg; + struct kib_conn *conn = rx->rx_conn; + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + struct kib_net *net = ni->ni_data; + int rc; + int err = -EIO; + + LASSERT(net); + LASSERT(rx->rx_nob < 0); /* was posted */ + rx->rx_nob = 0; /* isn't now */ + + if (conn->ibc_state > IBLND_CONN_ESTABLISHED) + goto ignore; + + if (status != IB_WC_SUCCESS) { + CNETERR("Rx from %s failed: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), status); + goto failed; + } + + LASSERT(nob >= 0); + rx->rx_nob = nob; + + rc = kiblnd_unpack_msg(msg, rx->rx_nob); + if (rc != 0) { + CERROR("Error %d unpacking rx from %s\n", + rc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + goto failed; + } + + if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid || + msg->ibm_dstnid != lnet_nid_to_nid4(&ni->ni_nid) || + msg->ibm_srcstamp != conn->ibc_incarnation || + msg->ibm_dststamp != net->ibn_incarnation) { + CERROR("Stale rx from %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + err = -ESTALE; + goto failed; + } + + /* set time last known alive */ + kiblnd_peer_alive(conn->ibc_peer); + + /* racing with connection establishment/teardown! 
*/
+
+ if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
+ rwlock_t *g_lock = &kiblnd_data.kib_global_lock;
+ unsigned long flags;
+
+ write_lock_irqsave(g_lock, flags);
+ /* must check holding global lock to eliminate race */
+ if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
+ list_add_tail(&rx->rx_list, &conn->ibc_early_rxs);
+ write_unlock_irqrestore(g_lock, flags);
+ return;
+ }
+ write_unlock_irqrestore(g_lock, flags);
+ }
+ kiblnd_handle_rx(rx);
+ return;
+
+failed:
+ CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
+ kiblnd_close_conn(conn, err);
+ignore:
+ kiblnd_drop_rx(rx); /* Don't re-post rx. */
+}
+
+static int
+kiblnd_fmr_map_tx(struct kib_net *net, struct kib_tx *tx,
+ struct kib_rdma_desc *rd, u32 nob)
+{
+ struct kib_hca_dev *hdev;
+ struct kib_dev *dev;
+ struct kib_fmr_poolset *fps;
+ int cpt;
+ int rc;
+ int i;
+
+ LASSERT(tx->tx_pool != NULL);
+ LASSERT(tx->tx_pool->tpo_pool.po_owner != NULL);
+
+ dev = net->ibn_dev;
+ hdev = tx->tx_pool->tpo_hdev;
+ cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt;
+
+ /*
+ * If we're dealing with FastReg, but the device doesn't
+ * support GAPS and the tx has GAPS, then there is no real point
+ * in trying to map the memory, because it'll just fail. So
+ * preemptively fail with an appropriate message.
+ */
+ if (IS_FAST_REG_DEV(dev) &&
+ !(dev->ibd_dev_caps & IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT) &&
+ tx->tx_gaps) {
+ CERROR("Using FastReg with no GAPS support, but tx has gaps. "
+ "Try setting use_fastreg_gaps to 1\n");
+ return -EPROTONOSUPPORT;
+ }
+
+#ifdef HAVE_FMR_POOL_API
+ /*
+ * If FMR does not support gaps but the tx has gaps, then
+ * we should make sure that the number of fragments we'll be sending
+ * over fits within the number of fragments negotiated on the
+ * connection, otherwise, we won't be able to RDMA the data.
+ * We need to maintain the negotiated number of fragments on the
+ * connection for backwards compatibility.
+ */
+ if (tx->tx_gaps && (dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED)) {
+ if (tx->tx_conn &&
+ tx->tx_conn->ibc_max_frags <= rd->rd_nfrags) {
+ CERROR("TX number of frags (%d) is >= the connection"
+ " max number of frags (%d). Consider setting peer's"
+ " map_on_demand to 256\n", tx->tx_nfrags,
+ tx->tx_conn->ibc_max_frags);
+ return -EFBIG;
+ }
+ }
+#endif
+
+ fps = net->ibn_fmr_ps[cpt];
+ rc = kiblnd_fmr_pool_map(fps, tx, rd, nob, 0, &tx->tx_fmr);
+ if (rc != 0) {
+ CERROR("Can't map %u bytes (%u/%u frags): %d\n", nob,
+ tx->tx_nfrags, rd->rd_nfrags, rc);
+ return rc;
+ }
+
+ /*
+ * If rd is not tx_rd, it's going to get sent to a peer_ni, who will
+ * need the rkey
+ */
+ rd->rd_key = tx->tx_fmr.fmr_key;
+ /*
+ * for FastReg or FMR with no gaps we can accumulate all
+ * the fragments in one FastReg or FMR fragment.
+ */
+ if (
+#ifdef HAVE_FMR_POOL_API
+ ((dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED)
+ && !tx->tx_gaps) ||
+#endif
+ IS_FAST_REG_DEV(dev)) {
+ /* FMR requires zero based address */
+#ifdef HAVE_FMR_POOL_API
+ if (dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED)
+ rd->rd_frags[0].rf_addr &= ~hdev->ibh_page_mask;
+#endif
+ rd->rd_frags[0].rf_nob = nob;
+ rd->rd_nfrags = 1;
+ } else {
+ /*
+ * We're transmitting with gaps using FMR.
+ * We'll need to use multiple fragments and identify the
+ * zero based address of each fragment.
+ */ + for (i = 0; i < rd->rd_nfrags; i++) { + rd->rd_frags[i].rf_addr &= ~hdev->ibh_page_mask; + rd->rd_frags[i].rf_addr += i << hdev->ibh_page_shift; + } + } + + return 0; +} + +static void +kiblnd_unmap_tx(struct kib_tx *tx) +{ + if ( +#ifdef HAVE_FMR_POOL_API + tx->tx_fmr.fmr_pfmr || +#endif + tx->tx_fmr.fmr_frd) + kiblnd_fmr_pool_unmap(&tx->tx_fmr, tx->tx_status); + + if (tx->tx_nfrags != 0) { + kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev, + tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir); + tx->tx_nfrags = 0; + } +} + +#ifdef HAVE_IB_GET_DMA_MR +static struct ib_mr * +kiblnd_find_rd_dma_mr(struct lnet_ni *ni, struct kib_rdma_desc *rd) +{ + struct kib_net *net = ni->ni_data; + struct kib_hca_dev *hdev = net->ibn_dev->ibd_hdev; + struct lnet_ioctl_config_o2iblnd_tunables *tunables; + + tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; + + /* + * if map-on-demand is turned on and the device supports + * either FMR or FastReg then use that. Otherwise use global + * memory regions. If that's not available either, then you're + * dead in the water and fail the operation. + */ + if (tunables->lnd_map_on_demand && (IS_FAST_REG_DEV(net->ibn_dev) +#ifdef HAVE_FMR_POOL_API + || net->ibn_dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED +#endif + )) + return NULL; + + /* + * hdev->ibh_mrs can be NULL. This case is dealt with gracefully + * in the call chain. The mapping will fail with appropriate error + * message. + */ + return hdev->ibh_mrs; +} +#endif + +static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx, + struct kib_rdma_desc *rd, int nfrags) +{ + struct kib_net *net = ni->ni_data; + struct kib_hca_dev *hdev = net->ibn_dev->ibd_hdev; +#ifdef HAVE_IB_GET_DMA_MR + struct ib_mr *mr = NULL; +#endif + __u32 nob; + int i; + + /* If rd is not tx_rd, it's going to get sent to a peer_ni and I'm the + * RDMA sink */ + tx->tx_dmadir = (rd != tx->tx_rd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE; + tx->tx_nfrags = nfrags; + + rd->rd_nfrags = kiblnd_dma_map_sg(hdev, tx->tx_frags, + tx->tx_nfrags, tx->tx_dmadir); + + for (i = 0, nob = 0; i < rd->rd_nfrags; i++) { + rd->rd_frags[i].rf_nob = kiblnd_sg_dma_len( + hdev->ibh_ibdev, &tx->tx_frags[i]); + rd->rd_frags[i].rf_addr = kiblnd_sg_dma_address( + hdev->ibh_ibdev, &tx->tx_frags[i]); + nob += rd->rd_frags[i].rf_nob; + } + +#ifdef HAVE_IB_GET_DMA_MR + mr = kiblnd_find_rd_dma_mr(ni, rd); + if (mr != NULL) { + /* found pre-mapping MR */ + rd->rd_key = (rd != tx->tx_rd) ? mr->rkey : mr->lkey; + return 0; + } +#endif + + if (net->ibn_fmr_ps != NULL) + return kiblnd_fmr_map_tx(net, tx, rd, nob); + + return -EINVAL; +} + +static int kiblnd_setup_rd_kiov(struct lnet_ni *ni, struct kib_tx *tx, + struct kib_rdma_desc *rd, int nkiov, + struct bio_vec *kiov, int offset, int nob) +{ + struct kib_net *net = ni->ni_data; + struct scatterlist *sg; + int fragnob; + int max_nkiov; + int sg_count = 0; + + CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); + + LASSERT(nob > 0); + LASSERT(nkiov > 0); + LASSERT(net != NULL); + + while (offset >= kiov->bv_len) { + offset -= kiov->bv_len; + nkiov--; + kiov++; + LASSERT(nkiov > 0); + } + + max_nkiov = nkiov; + + sg = tx->tx_frags; + do { + LASSERT(nkiov > 0); + + if (!sg) { + CERROR("lacking enough sg entries to map tx\n"); + return -EFAULT; + } + sg_count++; + + fragnob = min((int)(kiov->bv_len - offset), nob); + + /* + * We're allowed to start at a non-aligned page offset in + * the first fragment and end at a non-aligned page offset + * in the last fragment. 
+ */ + if ((fragnob < (int)(kiov->bv_len - offset)) && + nkiov < max_nkiov && nob > fragnob) { + CDEBUG(D_NET, "fragnob %d < available page %d: with" + " remaining %d kiovs with %d nob left\n", + fragnob, (int)(kiov->bv_len - offset), + nkiov, nob); + tx->tx_gaps = true; + } + + sg_set_page(sg, kiov->bv_page, fragnob, + kiov->bv_offset + offset); + sg = sg_next(sg); + + offset = 0; + kiov++; + nkiov--; + nob -= fragnob; + } while (nob > 0); + + return kiblnd_map_tx(ni, tx, rd, sg_count); +} + +static int +kiblnd_post_tx_locked(struct kib_conn *conn, struct kib_tx *tx, int credit) +__must_hold(&conn->ibc_lock) +{ + struct kib_msg *msg = tx->tx_msg; + struct kib_peer_ni *peer_ni = conn->ibc_peer; + struct lnet_ni *ni = peer_ni->ibp_ni; + struct kib_fast_reg_descriptor *frd = tx->tx_fmr.fmr_frd; + int ver = conn->ibc_version; + int rc; + int done; + + LASSERT(tx->tx_queued); + /* We rely on this for QP sizing */ + LASSERT(tx->tx_nwrq > 0 && tx->tx_nsge >= 0); + LASSERT(tx->tx_nwrq <= 1 + conn->ibc_max_frags); + + LASSERT(credit == 0 || credit == 1); + LASSERT(conn->ibc_outstanding_credits >= 0); + LASSERT(conn->ibc_outstanding_credits <= conn->ibc_queue_depth); + LASSERT(conn->ibc_credits >= 0); + LASSERT(conn->ibc_credits <= conn->ibc_queue_depth); + + if (conn->ibc_nsends_posted == + kiblnd_concurrent_sends(ver, ni)) { + /* tx completions outstanding... */ + CDEBUG(D_NET, "%s: posted enough\n", + libcfs_nid2str(peer_ni->ibp_nid)); + return -EAGAIN; + } + + if (credit != 0 && conn->ibc_credits == 0) { /* no credits */ + CDEBUG(D_NET, "%s: no credits\n", + libcfs_nid2str(peer_ni->ibp_nid)); + return -EAGAIN; + } + + if (credit != 0 && !IBLND_OOB_CAPABLE(ver) && + conn->ibc_credits == 1 && /* last credit reserved */ + msg->ibm_type != IBLND_MSG_NOOP) { /* for NOOP */ + CDEBUG(D_NET, "%s: not using last credit\n", + libcfs_nid2str(peer_ni->ibp_nid)); + return -EAGAIN; + } + + /* NB don't drop ibc_lock before bumping tx_sending */ + list_del(&tx->tx_list); + tx->tx_queued = 0; + + if (msg->ibm_type == IBLND_MSG_NOOP && + (!kiblnd_need_noop(conn) || /* redundant NOOP */ + (IBLND_OOB_CAPABLE(ver) && /* posted enough NOOP */ + conn->ibc_noops_posted == IBLND_OOB_MSGS(ver)))) { + /* OK to drop when posted enough NOOPs, since + * kiblnd_check_sends_locked will queue NOOP again when + * posted NOOPs complete */ + spin_unlock(&conn->ibc_lock); + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + kiblnd_tx_done(tx); + spin_lock(&conn->ibc_lock); + CDEBUG(D_NET, "%s(%d): redundant or enough NOOP\n", + libcfs_nid2str(peer_ni->ibp_nid), + conn->ibc_noops_posted); + return 0; + } + + kiblnd_pack_msg(peer_ni->ibp_ni, msg, ver, conn->ibc_outstanding_credits, + peer_ni->ibp_nid, conn->ibc_incarnation); + + conn->ibc_credits -= credit; + conn->ibc_outstanding_credits = 0; + conn->ibc_nsends_posted++; + if (msg->ibm_type == IBLND_MSG_NOOP) + conn->ibc_noops_posted++; + + /* CAVEAT EMPTOR! This tx could be the PUT_DONE of an RDMA + * PUT. If so, it was first queued here as a PUT_REQ, sent and + * stashed on ibc_active_txs, matched by an incoming PUT_ACK, + * and then re-queued here. It's (just) possible that + * tx_sending is non-zero if we've not done the tx_complete() + * from the first send; hence the ++ rather than = below. */ + tx->tx_sending++; + list_add(&tx->tx_list, &conn->ibc_active_txs); + + /* I'm still holding ibc_lock! */ + if (conn->ibc_state != IBLND_CONN_ESTABLISHED) { + CDEBUG(D_NET, "connection to %s is not established\n", + conn->ibc_peer? 
libcfs_nid2str(conn->ibc_peer->ibp_nid): "NULL"); + rc = -ECONNABORTED; + } else if (tx->tx_pool->tpo_pool.po_failed || + conn->ibc_hdev != tx->tx_pool->tpo_hdev) { + /* close_conn will launch failover */ + rc = -ENETDOWN; + } else { + struct ib_send_wr *bad = &tx->tx_wrq[tx->tx_nwrq - 1].wr; + struct ib_send_wr *wr = &tx->tx_wrq[0].wr; + + if (frd != NULL && !frd->frd_posted) { + wr = &frd->frd_inv_wr.wr; + wr->next = &frd->frd_fastreg_wr.wr; + frd->frd_fastreg_wr.wr.next = &tx->tx_wrq[0].wr; + } + + LASSERTF(bad->wr_id == kiblnd_ptr2wreqid(tx, IBLND_WID_TX), + "bad wr_id %#llx, opc %d, flags %d, peer_ni: %s\n", + bad->wr_id, bad->opcode, bad->send_flags, + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + + bad = NULL; + if (lnet_send_error_simulation(tx->tx_lntmsg[0], &tx->tx_hstatus)) + rc = -EINVAL; + else +#ifdef HAVE_IB_POST_SEND_RECV_CONST + rc = ib_post_send(conn->ibc_cmid->qp, wr, + (const struct ib_send_wr **)&bad); +#else + rc = ib_post_send(conn->ibc_cmid->qp, wr, &bad); +#endif + if (frd && !frd->frd_posted) { + /* The local invalidate becomes invalid (has been + * successfully used) if the post succeeds or the + * failing wr was not the invalidate. */ + frd->frd_valid = + !(rc == 0 || (bad != &frd->frd_inv_wr.wr)); + } + } + + conn->ibc_last_send = ktime_get(); + + if (rc == 0) { + if (frd != NULL) + frd->frd_posted = true; + return 0; + } + + /* NB credits are transferred in the actual + * message, which can only be the last work item */ + conn->ibc_credits += credit; + conn->ibc_outstanding_credits += msg->ibm_credits; + conn->ibc_nsends_posted--; + if (msg->ibm_type == IBLND_MSG_NOOP) + conn->ibc_noops_posted--; + + tx->tx_status = rc; + tx->tx_waiting = 0; + tx->tx_sending--; + + done = (tx->tx_sending == 0); + if (done) + list_del(&tx->tx_list); + + spin_unlock(&conn->ibc_lock); + + if (conn->ibc_state == IBLND_CONN_ESTABLISHED) + CERROR("Error %d posting transmit to %s\n", + rc, libcfs_nid2str(peer_ni->ibp_nid)); + else + CDEBUG(D_NET, "Error %d posting transmit to %s\n", + rc, libcfs_nid2str(peer_ni->ibp_nid)); + + kiblnd_close_conn(conn, rc); + + if (done) + kiblnd_tx_done(tx); + + spin_lock(&conn->ibc_lock); + + return -EIO; +} + +static void +kiblnd_check_sends_locked(struct kib_conn *conn) +{ + int ver = conn->ibc_version; + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + struct kib_tx *tx; + + /* Don't send anything until after the connection is established */ + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { + CDEBUG(D_NET, "%s too soon\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + return; + } + + LASSERT(conn->ibc_nsends_posted <= + kiblnd_concurrent_sends(ver, ni)); + LASSERT (!IBLND_OOB_CAPABLE(ver) || + conn->ibc_noops_posted <= IBLND_OOB_MSGS(ver)); + LASSERT (conn->ibc_reserved_credits >= 0); + + while (conn->ibc_reserved_credits > 0 && + (tx = list_first_entry_or_null(&conn->ibc_tx_queue_rsrvd, + struct kib_tx, tx_list)) != NULL) { + list_move_tail(&tx->tx_list, &conn->ibc_tx_queue); + conn->ibc_reserved_credits--; + } + + if (kiblnd_need_noop(conn)) { + spin_unlock(&conn->ibc_lock); + + tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); + if (tx != NULL) + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_NOOP, 0); + + spin_lock(&conn->ibc_lock); + if (tx != NULL) + kiblnd_queue_tx_locked(tx, conn); + } + + for (;;) { + int credit; + + if (!list_empty(&conn->ibc_tx_queue_nocred)) { + credit = 0; + tx = list_first_entry(&conn->ibc_tx_queue_nocred, + struct kib_tx, tx_list); + } else if (!list_empty(&conn->ibc_tx_noops)) { + LASSERT (!IBLND_OOB_CAPABLE(ver)); + credit 
= 1; + tx = list_first_entry(&conn->ibc_tx_noops, + struct kib_tx, tx_list); + } else if (!list_empty(&conn->ibc_tx_queue)) { + credit = 1; + tx = list_first_entry(&conn->ibc_tx_queue, + struct kib_tx, tx_list); + } else + break; + + if (kiblnd_post_tx_locked(conn, tx, credit) != 0) + break; + } +} + +static void +kiblnd_tx_complete(struct kib_tx *tx, int status) +{ + int failed = (status != IB_WC_SUCCESS); + struct kib_conn *conn = tx->tx_conn; + int idle; + + if (tx->tx_sending <= 0) { + CERROR("Received an event on a freed tx: %p status %d\n", + tx, tx->tx_status); + return; + } + + if (failed) { + if (conn->ibc_state == IBLND_CONN_ESTABLISHED) + CNETERR("Tx -> %s cookie %#llx" + " sending %d waiting %d: failed %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + tx->tx_cookie, tx->tx_sending, tx->tx_waiting, + status); + + kiblnd_close_conn(conn, -EIO); + } else { + kiblnd_peer_alive(conn->ibc_peer); + } + + spin_lock(&conn->ibc_lock); + + /* I could be racing with rdma completion. Whoever makes 'tx' idle + * gets to free it, which also drops its ref on 'conn'. */ + + tx->tx_sending--; + conn->ibc_nsends_posted--; + if (tx->tx_msg->ibm_type == IBLND_MSG_NOOP) + conn->ibc_noops_posted--; + + if (failed) { + tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_DROPPED; + tx->tx_waiting = 0; /* don't wait for peer_ni */ + tx->tx_status = -EIO; + } + + idle = (tx->tx_sending == 0) && /* This is the final callback */ + !tx->tx_waiting && /* Not waiting for peer_ni */ + !tx->tx_queued; /* Not re-queued (PUT_DONE) */ + if (idle) + list_del(&tx->tx_list); + + kiblnd_check_sends_locked(conn); + spin_unlock(&conn->ibc_lock); + + if (idle) + kiblnd_tx_done(tx); +} + + +static void +kiblnd_init_tx_sge(struct kib_tx *tx, u64 addr, unsigned int len) +{ + struct ib_sge *sge = &tx->tx_sge[tx->tx_nsge]; + struct kib_hca_dev *hdev = tx->tx_pool->tpo_hdev; +#ifdef HAVE_IB_GET_DMA_MR + struct ib_mr *mr = hdev->ibh_mrs; +#endif + + *sge = (struct ib_sge) { +#ifdef HAVE_IB_GET_DMA_MR + .lkey = mr->lkey, +#else + .lkey = hdev->ibh_pd->local_dma_lkey, +#endif + .addr = addr, + .length = len, + }; + + tx->tx_nsge++; +} + +static struct ib_rdma_wr * +kiblnd_init_tx_msg_payload(struct lnet_ni *ni, struct kib_tx *tx, int type, + int body_nob, int payload) +{ + struct ib_rdma_wr *wrq; + int nob = offsetof(struct kib_msg, ibm_u) + body_nob; + + LASSERT(tx->tx_nwrq >= 0); + LASSERT(tx->tx_nwrq < IBLND_MAX_RDMA_FRAGS + 1); + LASSERT(nob <= IBLND_MSG_SIZE); + + kiblnd_init_msg(tx->tx_msg, type, body_nob + payload); + + wrq = &tx->tx_wrq[tx->tx_nwrq]; + + *wrq = (struct ib_rdma_wr) { + .wr = { + .wr_id = kiblnd_ptr2wreqid(tx, IBLND_WID_TX), + .num_sge = 1, + .sg_list = &tx->tx_sge[tx->tx_nsge], + .opcode = IB_WR_SEND, + .send_flags = IB_SEND_SIGNALED, + }, + }; + + kiblnd_init_tx_sge(tx, tx->tx_msgaddr, nob); + + tx->tx_nwrq++; + return wrq; +} + +static int +kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type, + int resid, struct kib_rdma_desc *dstrd, u64 dstcookie) +{ + struct kib_msg *ibmsg = tx->tx_msg; + struct kib_rdma_desc *srcrd = tx->tx_rd; + struct ib_rdma_wr *wrq = NULL; + struct ib_sge *sge; + int rc = resid; + int srcidx; + int dstidx; + int sge_nob; + int wrq_sge; + + LASSERT(!in_interrupt()); + LASSERT(tx->tx_nwrq == 0 && tx->tx_nsge == 0); + LASSERT(type == IBLND_MSG_GET_DONE || type == IBLND_MSG_PUT_DONE); + + for (srcidx = dstidx = wrq_sge = sge_nob = 0; + resid > 0; resid -= sge_nob) { + int prev = dstidx; + + if (srcidx >= srcrd->rd_nfrags) { + CERROR("Src buffer exhausted: %d frags\n", srcidx); + 
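/* The source descriptor ran out of fragments before 'resid' bytes
+ * were consumed: this RDMA cannot be fully described, so fail it
+ * as a protocol error. */
+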
rc = -EPROTO; + break; + } + + if (dstidx >= dstrd->rd_nfrags) { + CERROR("Dst buffer exhausted: %d frags\n", dstidx); + rc = -EPROTO; + break; + } + + if (tx->tx_nwrq >= conn->ibc_max_frags) { + CERROR("RDMA has too many fragments for peer_ni %s (%d), " + "src idx/frags: %d/%d dst idx/frags: %d/%d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + conn->ibc_max_frags, + srcidx, srcrd->rd_nfrags, + dstidx, dstrd->rd_nfrags); + rc = -EMSGSIZE; + break; + } + + sge_nob = min3(kiblnd_rd_frag_size(srcrd, srcidx), + kiblnd_rd_frag_size(dstrd, dstidx), + resid); + + sge = &tx->tx_sge[tx->tx_nsge]; + sge->addr = kiblnd_rd_frag_addr(srcrd, srcidx); + sge->lkey = kiblnd_rd_frag_key(srcrd, srcidx); + sge->length = sge_nob; + + if (wrq_sge == 0) { + wrq = &tx->tx_wrq[tx->tx_nwrq]; + + wrq->wr.next = &(wrq + 1)->wr; + wrq->wr.wr_id = kiblnd_ptr2wreqid(tx, IBLND_WID_RDMA); + wrq->wr.sg_list = sge; + wrq->wr.opcode = IB_WR_RDMA_WRITE; + wrq->wr.send_flags = 0; + +#ifdef HAVE_IB_RDMA_WR + wrq->remote_addr = kiblnd_rd_frag_addr(dstrd, + dstidx); + wrq->rkey = kiblnd_rd_frag_key(dstrd, + dstidx); +#else + wrq->wr.wr.rdma.remote_addr = kiblnd_rd_frag_addr(dstrd, + dstidx); + wrq->wr.wr.rdma.rkey = kiblnd_rd_frag_key(dstrd, + dstidx); +#endif + } + + srcidx = kiblnd_rd_consume_frag(srcrd, srcidx, sge_nob); + dstidx = kiblnd_rd_consume_frag(dstrd, dstidx, sge_nob); + + wrq_sge++; + if (wrq_sge == *kiblnd_tunables.kib_wrq_sge || dstidx != prev) { + tx->tx_nwrq++; + wrq->wr.num_sge = wrq_sge; + wrq_sge = 0; + } + tx->tx_nsge++; + } + + if (rc < 0) /* no RDMA if completing with failure */ + tx->tx_nwrq = tx->tx_nsge = 0; + + ibmsg->ibm_u.completion.ibcm_status = rc; + ibmsg->ibm_u.completion.ibcm_cookie = dstcookie; + kiblnd_init_tx_msg(conn->ibc_peer->ibp_ni, tx, + type, sizeof(struct kib_completion_msg)); + + return rc; +} + +static void +kiblnd_queue_tx_locked(struct kib_tx *tx, struct kib_conn *conn) +{ + struct list_head *q; + s64 timeout_ns; + + LASSERT(tx->tx_nwrq > 0); /* work items set up */ + LASSERT(!tx->tx_queued); /* not queued for sending already */ + LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); + + if (conn->ibc_state >= IBLND_CONN_DISCONNECTED) { + CDEBUG(D_NET, "connection with %s is disconnected\n", + conn->ibc_peer? 
libcfs_nid2str(conn->ibc_peer->ibp_nid): "NULL"); + + tx->tx_status = -ECONNABORTED; + tx->tx_waiting = 0; + if (tx->tx_conn != NULL) { + /* PUT_DONE first attached to conn as a PUT_REQ */ + LASSERT(tx->tx_conn == conn); + LASSERT(tx->tx_msg->ibm_type == IBLND_MSG_PUT_DONE); + tx->tx_conn = NULL; + kiblnd_conn_decref(conn); + } + list_add(&tx->tx_list, &conn->ibc_zombie_txs); + + return; + } + + timeout_ns = kiblnd_timeout() * NSEC_PER_SEC; + tx->tx_queued = 1; + tx->tx_deadline = ktime_add_ns(ktime_get(), timeout_ns); + + if (tx->tx_conn == NULL) { + kiblnd_conn_addref(conn); + tx->tx_conn = conn; + LASSERT (tx->tx_msg->ibm_type != IBLND_MSG_PUT_DONE); + } else { + /* PUT_DONE first attached to conn as a PUT_REQ */ + LASSERT (tx->tx_conn == conn); + LASSERT (tx->tx_msg->ibm_type == IBLND_MSG_PUT_DONE); + } + + switch (tx->tx_msg->ibm_type) { + default: + LBUG(); + + case IBLND_MSG_PUT_REQ: + case IBLND_MSG_GET_REQ: + q = &conn->ibc_tx_queue_rsrvd; + break; + + case IBLND_MSG_PUT_NAK: + case IBLND_MSG_PUT_ACK: + case IBLND_MSG_PUT_DONE: + case IBLND_MSG_GET_DONE: + q = &conn->ibc_tx_queue_nocred; + break; + + case IBLND_MSG_NOOP: + if (IBLND_OOB_CAPABLE(conn->ibc_version)) + q = &conn->ibc_tx_queue_nocred; + else + q = &conn->ibc_tx_noops; + break; + + case IBLND_MSG_IMMEDIATE: + q = &conn->ibc_tx_queue; + break; + } + + list_add_tail(&tx->tx_list, q); +} + +static void +kiblnd_queue_tx(struct kib_tx *tx, struct kib_conn *conn) +{ + spin_lock(&conn->ibc_lock); + kiblnd_queue_tx_locked(tx, conn); + kiblnd_check_sends_locked(conn); + spin_unlock(&conn->ibc_lock); +} + +static int +kiblnd_resolve_addr_cap(struct rdma_cm_id *cmid, + struct sockaddr_in *srcaddr, + struct sockaddr_in *dstaddr, + int timeout_ms) +{ + unsigned short port; + int rc; + + /* allow the port to be reused */ + rc = rdma_set_reuseaddr(cmid, 1); + if (rc != 0) { + CERROR("Unable to set reuse on cmid: %d\n", rc); + return rc; + } + + /* look for a free privileged port */ + for (port = PROT_SOCK-1; port > 0; port--) { + srcaddr->sin_port = htons(port); + rc = rdma_resolve_addr(cmid, + (struct sockaddr *)srcaddr, + (struct sockaddr *)dstaddr, + timeout_ms); + if (rc == 0) { + CDEBUG(D_NET, "bound to port %hu\n", port); + return 0; + } else if (rc == -EADDRINUSE || rc == -EADDRNOTAVAIL) { + CDEBUG(D_NET, "bind to port %hu failed: %d\n", + port, rc); + } else { + return rc; + } + } + + CERROR("cannot bind to a free privileged port: rc = %d\n", rc); + + return rc; +} + +static int +kiblnd_resolve_addr(struct rdma_cm_id *cmid, + struct sockaddr_in *srcaddr, + struct sockaddr_in *dstaddr, + int timeout_ms) +{ + const struct cred *old_creds = NULL; + struct cred *new_creds; + int rc; + + if (!capable(CAP_NET_BIND_SERVICE)) { + new_creds = prepare_kernel_cred(NULL); + if (!new_creds) + return -ENOMEM; + + cap_raise(new_creds->cap_effective, CAP_NET_BIND_SERVICE); + old_creds = override_creds(new_creds); + } + + rc = kiblnd_resolve_addr_cap(cmid, srcaddr, dstaddr, timeout_ms); + + if (old_creds) + revert_creds(old_creds); + + return rc; +} + +static void +kiblnd_connect_peer(struct kib_peer_ni *peer_ni) +{ + struct rdma_cm_id *cmid; + struct kib_dev *dev; + struct kib_net *net = peer_ni->ibp_ni->ni_data; + struct sockaddr_in srcaddr; + struct sockaddr_in dstaddr; + int rc; + + LASSERT (net != NULL); + LASSERT (peer_ni->ibp_connecting > 0); + + cmid = kiblnd_rdma_create_id(peer_ni->ibp_ni->ni_net_ns, + kiblnd_cm_callback, peer_ni, + RDMA_PS_TCP, IB_QPT_RC); + + if (IS_ERR(cmid)) { + CERROR("Can't create CMID for %s: %ld\n", + 
libcfs_nid2str(peer_ni->ibp_nid), PTR_ERR(cmid)); + rc = PTR_ERR(cmid); + goto failed; + } + + dev = net->ibn_dev; + memset(&srcaddr, 0, sizeof(srcaddr)); + srcaddr.sin_family = AF_INET; + srcaddr.sin_addr.s_addr = htonl(dev->ibd_ifip); + + memset(&dstaddr, 0, sizeof(dstaddr)); + dstaddr.sin_family = AF_INET; + dstaddr.sin_port = htons(*kiblnd_tunables.kib_service); + dstaddr.sin_addr.s_addr = htonl(LNET_NIDADDR(peer_ni->ibp_nid)); + + kiblnd_peer_addref(peer_ni); /* cmid's ref */ + + if (*kiblnd_tunables.kib_use_priv_port) { + rc = kiblnd_resolve_addr(cmid, &srcaddr, &dstaddr, + kiblnd_timeout() * 1000); + } else { + rc = rdma_resolve_addr(cmid, + (struct sockaddr *)&srcaddr, + (struct sockaddr *)&dstaddr, + kiblnd_timeout() * 1000); + } + if (rc != 0) { + /* Can't initiate address resolution: */ + CERROR("Can't resolve addr for %s: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), rc); + goto failed2; + } + + return; + + failed2: + kiblnd_peer_connect_failed(peer_ni, 1, rc); + kiblnd_peer_decref(peer_ni); /* cmid's ref */ + rdma_destroy_id(cmid); + return; + failed: + kiblnd_peer_connect_failed(peer_ni, 1, rc); +} + +bool +kiblnd_reconnect_peer(struct kib_peer_ni *peer_ni) +{ + rwlock_t *glock = &kiblnd_data.kib_global_lock; + char *reason = NULL; + LIST_HEAD(txs); + unsigned long flags; + + write_lock_irqsave(glock, flags); + if (peer_ni->ibp_reconnecting == 0) { + if (peer_ni->ibp_accepting) + reason = "accepting"; + else if (peer_ni->ibp_connecting) + reason = "connecting"; + else if (!list_empty(&peer_ni->ibp_conns)) + reason = "connected"; + else /* connected then closed */ + reason = "closed"; + + goto no_reconnect; + } + + if (peer_ni->ibp_accepting) + CNETERR("Detecting race between accepting and reconnecting\n"); + peer_ni->ibp_reconnecting--; + + if (!kiblnd_peer_active(peer_ni)) { + list_splice_init(&peer_ni->ibp_tx_queue, &txs); + reason = "unlinked"; + goto no_reconnect; + } + + peer_ni->ibp_connecting++; + peer_ni->ibp_reconnected++; + + write_unlock_irqrestore(glock, flags); + + kiblnd_connect_peer(peer_ni); + return true; + + no_reconnect: + write_unlock_irqrestore(glock, flags); + + CWARN("Abort reconnection of %s: %s\n", + libcfs_nid2str(peer_ni->ibp_nid), reason); + kiblnd_txlist_done(&txs, -ECONNABORTED, + LNET_MSG_STATUS_LOCAL_ABORTED); + return false; +} + +void +kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid) +{ + struct kib_peer_ni *peer_ni; + struct kib_peer_ni *peer2; + struct kib_conn *conn; + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + unsigned long flags; + int rc; + int i; + struct lnet_ioctl_config_o2iblnd_tunables *tunables; + + /* If I get here, I've committed to send, so I complete the tx with + * failure on any problems + */ + + LASSERT(!tx || !tx->tx_conn); /* only set when assigned a conn */ + LASSERT(!tx || tx->tx_nwrq > 0); /* work items have been set up */ + + /* First time, just use a read lock since I expect to find my peer_ni + * connected + */ + read_lock_irqsave(g_lock, flags); + + peer_ni = kiblnd_find_peer_locked(ni, nid); + if (peer_ni != NULL && !list_empty(&peer_ni->ibp_conns)) { + /* Found a peer_ni with an established connection */ + conn = kiblnd_get_conn_locked(peer_ni); + kiblnd_conn_addref(conn); /* 1 ref for me... 
*/ + + read_unlock_irqrestore(g_lock, flags); + + if (tx != NULL) + kiblnd_queue_tx(tx, conn); + kiblnd_conn_decref(conn); /* ...to here */ + return; + } + + read_unlock(g_lock); + /* Re-try with a write lock */ + write_lock(g_lock); + + peer_ni = kiblnd_find_peer_locked(ni, nid); + if (peer_ni != NULL) { + if (list_empty(&peer_ni->ibp_conns)) { + /* found a peer_ni, but it's still connecting... */ + LASSERT(kiblnd_peer_connecting(peer_ni)); + if (tx != NULL) + list_add_tail(&tx->tx_list, + &peer_ni->ibp_tx_queue); + write_unlock_irqrestore(g_lock, flags); + } else { + conn = kiblnd_get_conn_locked(peer_ni); + kiblnd_conn_addref(conn); /* 1 ref for me... */ + + write_unlock_irqrestore(g_lock, flags); + + if (tx != NULL) + kiblnd_queue_tx(tx, conn); + kiblnd_conn_decref(conn); /* ...to here */ + } + return; + } + + write_unlock_irqrestore(g_lock, flags); + + /* Allocate a peer_ni ready to add to the peer_ni table and retry */ + rc = kiblnd_create_peer(ni, &peer_ni, nid); + if (rc != 0) { + CERROR("Can't create peer_ni %s\n", libcfs_nid2str(nid)); + if (tx != NULL) { + tx->tx_status = -EHOSTUNREACH; + tx->tx_waiting = 0; + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + kiblnd_tx_done(tx); + } + return; + } + + write_lock_irqsave(g_lock, flags); + + peer2 = kiblnd_find_peer_locked(ni, nid); + if (peer2 != NULL) { + if (list_empty(&peer2->ibp_conns)) { + /* found a peer_ni, but it's still connecting... */ + LASSERT(kiblnd_peer_connecting(peer2)); + if (tx != NULL) + list_add_tail(&tx->tx_list, + &peer2->ibp_tx_queue); + write_unlock_irqrestore(g_lock, flags); + } else { + conn = kiblnd_get_conn_locked(peer2); + kiblnd_conn_addref(conn); /* 1 ref for me... */ + + write_unlock_irqrestore(g_lock, flags); + + if (tx != NULL) + kiblnd_queue_tx(tx, conn); + kiblnd_conn_decref(conn); /* ...to here */ + } + + kiblnd_peer_decref(peer_ni); + return; + } + + /* Brand new peer_ni */ + LASSERT(peer_ni->ibp_connecting == 0); + tunables = &peer_ni->ibp_ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; + peer_ni->ibp_connecting = tunables->lnd_conns_per_peer; + + /* always called with a ref on ni, which prevents ni being shutdown */ + LASSERT(((struct kib_net *)ni->ni_data)->ibn_shutdown == 0); + + if (tx != NULL) + list_add_tail(&tx->tx_list, &peer_ni->ibp_tx_queue); + + kiblnd_peer_addref(peer_ni); + hash_add(kiblnd_data.kib_peers, &peer_ni->ibp_list, nid); + + write_unlock_irqrestore(g_lock, flags); + + for (i = 0; i < tunables->lnd_conns_per_peer; i++) + kiblnd_connect_peer(peer_ni); + kiblnd_peer_decref(peer_ni); +} + +int +kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) +{ + struct kib_dev *dev = ((struct kib_net *)ni->ni_data)->ibn_dev; + struct lnet_hdr *hdr = &lntmsg->msg_hdr; + int type = lntmsg->msg_type; + struct lnet_processid *target = &lntmsg->msg_target; + int target_is_router = lntmsg->msg_target_is_router; + int routing = lntmsg->msg_routing; + unsigned int payload_niov = lntmsg->msg_niov; + struct bio_vec *payload_kiov = lntmsg->msg_kiov; + unsigned int payload_offset = lntmsg->msg_offset; + unsigned int payload_nob = lntmsg->msg_len; + struct kib_msg *ibmsg; + struct kib_rdma_desc *rd; + struct kib_tx *tx; + int nob; + int rc; + + /* NB 'private' is different depending on what we're sending.... 
*/ + + CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n", + payload_nob, payload_niov, libcfs_idstr(target)); + + LASSERT (payload_nob == 0 || payload_niov > 0); + LASSERT (payload_niov <= LNET_MAX_IOV); + + /* Thread context */ + LASSERT (!in_interrupt()); + + tx = kiblnd_get_idle_tx(ni, lnet_nid_to_nid4(&target->nid)); + if (tx == NULL) { + CERROR("Can't allocate %s txd for %s\n", + lnet_msgtyp2str(type), + libcfs_nidstr(&target->nid)); + return -ENOMEM; + } + ibmsg = tx->tx_msg; + + switch (type) { + default: + LBUG(); + return (-EIO); + + case LNET_MSG_ACK: + LASSERT (payload_nob == 0); + break; + + case LNET_MSG_GET: + if (routing || target_is_router) + break; /* send IMMEDIATE */ + + /* is the REPLY message too small for RDMA? */ + nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]); + if (nob <= IBLND_MSG_SIZE && !lntmsg->msg_rdma_force) + break; /* send IMMEDIATE */ + + rd = &ibmsg->ibm_u.get.ibgm_rd; + rc = kiblnd_setup_rd_kiov(ni, tx, rd, + lntmsg->msg_md->md_niov, + lntmsg->msg_md->md_kiov, + 0, lntmsg->msg_md->md_length); + if (rc != 0) { + CERROR("Can't setup GET sink for %s: %d\n", + libcfs_nidstr(&target->nid), rc); + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + kiblnd_tx_done(tx); + return -EIO; + } + + nob = offsetof(struct kib_get_msg, ibgm_rd.rd_frags[rd->rd_nfrags]); + ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie; + lnet_hdr_to_nid4(hdr, &ibmsg->ibm_u.get.ibgm_hdr); + + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_GET_REQ, nob); + + tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg); + if (tx->tx_lntmsg[1] == NULL) { + CERROR("Can't create reply for GET -> %s\n", + libcfs_nidstr(&target->nid)); + kiblnd_tx_done(tx); + return -EIO; + } + + /* finalise lntmsg[0,1] on completion */ + tx->tx_lntmsg[0] = lntmsg; + tx->tx_waiting = 1; /* waiting for GET_DONE */ + kiblnd_launch_tx(ni, tx, lnet_nid_to_nid4(&target->nid)); + return 0; + + case LNET_MSG_REPLY: + case LNET_MSG_PUT: + /* Is the payload small enough not to need RDMA? 
*/
+ nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[payload_nob]);
+ if (nob <= IBLND_MSG_SIZE && !lntmsg->msg_rdma_force)
+ break; /* send IMMEDIATE */
+
+ rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd,
+ payload_niov, payload_kiov,
+ payload_offset, payload_nob);
+ if (rc != 0) {
+ CERROR("Can't setup PUT src for %s: %d\n",
+ libcfs_nidstr(&target->nid), rc);
+ kiblnd_tx_done(tx);
+ return -EIO;
+ }
+
+ lnet_hdr_to_nid4(hdr, &ibmsg->ibm_u.putreq.ibprm_hdr);
+ ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie;
+ kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_REQ,
+ sizeof(struct kib_putreq_msg));
+
+ /* finalise lntmsg[0,1] on completion */
+ tx->tx_lntmsg[0] = lntmsg;
+ tx->tx_waiting = 1; /* waiting for PUT_{ACK,NAK} */
+ kiblnd_launch_tx(ni, tx, lnet_nid_to_nid4(&target->nid));
+ return 0;
+ }
+
+ /* send IMMEDIATE */
+ LASSERT(offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[payload_nob])
+ <= IBLND_MSG_SIZE);
+
+ ibmsg = tx->tx_msg;
+ lnet_hdr_to_nid4(hdr, &ibmsg->ibm_u.immediate.ibim_hdr);
+
+ if (IS_FAST_REG_DEV(dev) && payload_nob) {
+ struct ib_rdma_wr *wrq;
+ int i;
+
+ nob = offsetof(struct kib_immediate_msg, ibim_payload[0]);
+ wrq = kiblnd_init_tx_msg_payload(ni, tx, IBLND_MSG_IMMEDIATE,
+ nob, payload_nob);
+
+ rd = tx->tx_rd;
+ rc = kiblnd_setup_rd_kiov(ni, tx, rd,
+ payload_niov, payload_kiov,
+ payload_offset, payload_nob);
+ if (rc != 0) {
+ CERROR("Can't setup IMMEDIATE src for %s: %d\n",
+ libcfs_nidstr(&target->nid), rc);
+ kiblnd_tx_done(tx);
+ return -EIO;
+ }
+
+ /* let's generate an SGE chain */
+ for (i = 0; i < rd->rd_nfrags; i++) {
+ kiblnd_init_tx_sge(tx, rd->rd_frags[i].rf_addr,
+ rd->rd_frags[i].rf_nob);
+ wrq->wr.num_sge++;
+ }
+ } else {
+ lnet_copy_kiov2flat(IBLND_MSG_SIZE, ibmsg,
+ offsetof(struct kib_msg,
+ ibm_u.immediate.ibim_payload),
+ payload_niov, payload_kiov,
+ payload_offset, payload_nob);
+
+ nob = offsetof(struct kib_immediate_msg,
+ ibim_payload[payload_nob]);
+
+ kiblnd_init_tx_msg(ni, tx, IBLND_MSG_IMMEDIATE, nob);
+ }
+
+ /* finalise lntmsg on completion */
+ tx->tx_lntmsg[0] = lntmsg;
+
+ kiblnd_launch_tx(ni, tx, lnet_nid_to_nid4(&target->nid));
+ return 0;
+}
+
+static void
+kiblnd_reply(struct lnet_ni *ni, struct kib_rx *rx, struct lnet_msg *lntmsg)
+{
+ struct lnet_processid *target = &lntmsg->msg_target;
+ unsigned int niov = lntmsg->msg_niov;
+ struct bio_vec *kiov = lntmsg->msg_kiov;
+ unsigned int offset = lntmsg->msg_offset;
+ unsigned int nob = lntmsg->msg_len;
+ struct kib_tx *tx;
+ int rc;
+
+ tx = kiblnd_get_idle_tx(ni, rx->rx_conn->ibc_peer->ibp_nid);
+ if (tx == NULL) {
+ CERROR("Can't get tx for REPLY to %s\n",
+ libcfs_nidstr(&target->nid));
+ goto failed_0;
+ }
+
+ if (nob == 0)
+ rc = 0;
+ else
+ rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd,
+ niov, kiov, offset, nob);
+
+ if (rc != 0) {
+ CERROR("Can't setup GET src for %s: %d\n",
+ libcfs_nidstr(&target->nid), rc);
+ goto failed_1;
+ }
+
+ rc = kiblnd_init_rdma(rx->rx_conn, tx,
+ IBLND_MSG_GET_DONE, nob,
+ &rx->rx_msg->ibm_u.get.ibgm_rd,
+ rx->rx_msg->ibm_u.get.ibgm_cookie);
+ if (rc < 0) {
+ CERROR("Can't setup rdma for GET from %s: %d\n",
+ libcfs_nidstr(&target->nid), rc);
+ goto failed_1;
+ }
+
+ if (nob == 0) {
+ /* No RDMA: local completion may happen now!
*/ + lnet_finalize(lntmsg, 0); + } else { + /* RDMA: lnet_finalize(lntmsg) when it + * completes */ + tx->tx_lntmsg[0] = lntmsg; + } + + kiblnd_queue_tx(tx, rx->rx_conn); + return; + + +failed_1: + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + kiblnd_tx_done(tx); +failed_0: + lnet_finalize(lntmsg, -EIO); +} + +unsigned int +kiblnd_get_dev_prio(struct lnet_ni *ni, unsigned int dev_idx) +{ + struct kib_net *net = ni->ni_data; + struct device *dev = NULL; + + if (net) + dev = net->ibn_dev->ibd_hdev->ibh_ibdev->dma_device; + + return lnet_get_dev_prio(dev, dev_idx); + +} + +int +kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, + int delayed, unsigned int niov, struct bio_vec *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen) +{ + struct kib_rx *rx = private; + struct kib_msg *rxmsg = rx->rx_msg; + struct kib_conn *conn = rx->rx_conn; + struct kib_tx *tx; + __u64 ibprm_cookie; + int nob; + int post_credit = IBLND_POSTRX_PEER_CREDIT; + int rc = 0; + + LASSERT (mlen <= rlen); + LASSERT (!in_interrupt()); + + switch (rxmsg->ibm_type) { + default: + LBUG(); + /* fallthrough */ + case IBLND_MSG_IMMEDIATE: + nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[rlen]); + if (nob > rx->rx_nob) { + CERROR("Immediate message from %s too big: %d(%d)\n", + libcfs_nidstr(&lntmsg->msg_hdr.src_nid), + nob, rx->rx_nob); + rc = -EPROTO; + break; + } + + lnet_copy_flat2kiov(niov, kiov, offset, + IBLND_MSG_SIZE, rxmsg, + offsetof(struct kib_msg, + ibm_u.immediate.ibim_payload), + mlen); + lnet_finalize(lntmsg, 0); + break; + + case IBLND_MSG_PUT_REQ: { + struct kib_msg *txmsg; + struct kib_rdma_desc *rd; + ibprm_cookie = rxmsg->ibm_u.putreq.ibprm_cookie; + + if (mlen == 0) { + lnet_finalize(lntmsg, 0); + kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, + 0, ibprm_cookie); + break; + } + + tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); + if (tx == NULL) { + CERROR("Can't allocate tx for %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + /* Not replying will break the connection */ + rc = -ENOMEM; + break; + } + + txmsg = tx->tx_msg; + rd = &txmsg->ibm_u.putack.ibpam_rd; + rc = kiblnd_setup_rd_kiov(ni, tx, rd, + niov, kiov, offset, mlen); + if (rc != 0) { + CERROR("Can't setup PUT sink for %s: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + kiblnd_tx_done(tx); + /* tell peer_ni it's over */ + kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, + rc, ibprm_cookie); + break; + } + + nob = offsetof(struct kib_putack_msg, ibpam_rd.rd_frags[rd->rd_nfrags]); + txmsg->ibm_u.putack.ibpam_src_cookie = ibprm_cookie; + txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie; + + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_ACK, nob); + + tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ + tx->tx_waiting = 1; /* waiting for PUT_DONE */ + kiblnd_queue_tx(tx, conn); + + /* reposted buffer reserved for PUT_DONE */ + post_credit = IBLND_POSTRX_NO_CREDIT; + break; + } + + case IBLND_MSG_GET_REQ: + if (lntmsg != NULL) { + /* Optimized GET; RDMA lntmsg's payload */ + kiblnd_reply(ni, rx, lntmsg); + } else { + /* GET didn't match anything */ + kiblnd_send_completion(rx->rx_conn, IBLND_MSG_GET_DONE, + -ENODATA, + rxmsg->ibm_u.get.ibgm_cookie); + } + break; + } + + kiblnd_post_rx(rx, post_credit); + return rc; +} + +static void +kiblnd_thread_fini (void) +{ + atomic_dec (&kiblnd_data.kib_nthreads); +} + +static void +kiblnd_peer_alive(struct kib_peer_ni *peer_ni) +{ + /* This is racy, but everyone's only 
writing ktime_get_seconds() */ + peer_ni->ibp_last_alive = ktime_get_seconds(); + smp_mb(); +} + +static void +kiblnd_peer_notify(struct kib_peer_ni *peer_ni) +{ + int error = 0; + time64_t last_alive = 0; + unsigned long flags; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + if (kiblnd_peer_idle(peer_ni) && peer_ni->ibp_error != 0) { + error = peer_ni->ibp_error; + peer_ni->ibp_error = 0; + + last_alive = peer_ni->ibp_last_alive; + } + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + if (error != 0) + lnet_notify(peer_ni->ibp_ni, + peer_ni->ibp_nid, false, false, last_alive); +} + +void +kiblnd_close_conn_locked(struct kib_conn *conn, int error) +{ + /* This just does the immediate housekeeping. 'error' is zero for a + * normal shutdown which can happen only after the connection has been + * established. If the connection is established, schedule the + * connection to be finished off by the connd. Otherwise the connd is + * already dealing with it (either to set it up or tear it down). + * Caller holds kib_global_lock exclusively in irq context */ + struct kib_peer_ni *peer_ni = conn->ibc_peer; + struct kib_dev *dev; + unsigned long flags; + + LASSERT (error != 0 || conn->ibc_state >= IBLND_CONN_ESTABLISHED); + + if (error != 0 && conn->ibc_comms_error == 0) + conn->ibc_comms_error = error; + + if (conn->ibc_state != IBLND_CONN_ESTABLISHED) + return; /* already being handled */ + + if (error == 0 && + list_empty(&conn->ibc_tx_noops) && + list_empty(&conn->ibc_tx_queue) && + list_empty(&conn->ibc_tx_queue_rsrvd) && + list_empty(&conn->ibc_tx_queue_nocred) && + list_empty(&conn->ibc_active_txs)) { + CDEBUG(D_NET, "closing conn to %s\n", + libcfs_nid2str(peer_ni->ibp_nid)); + } else { + CNETERR("Closing conn to %s: error %d%s%s%s%s%s\n", + libcfs_nid2str(peer_ni->ibp_nid), error, + list_empty(&conn->ibc_tx_queue) ? "" : "(sending)", + list_empty(&conn->ibc_tx_noops) ? "" : "(sending_noops)", + list_empty(&conn->ibc_tx_queue_rsrvd) ? + "" : "(sending_rsrvd)", + list_empty(&conn->ibc_tx_queue_nocred) ? + "" : "(sending_nocred)", + list_empty(&conn->ibc_active_txs) ? 
"" : "(waiting)"); + } + + dev = ((struct kib_net *)peer_ni->ibp_ni->ni_data)->ibn_dev; + if (peer_ni->ibp_next_conn == conn) + /* clear next_conn so it won't be used */ + peer_ni->ibp_next_conn = NULL; + list_del(&conn->ibc_list); + /* connd (see below) takes over ibc_list's ref */ + + if (list_empty(&peer_ni->ibp_conns) && /* no more conns */ + kiblnd_peer_active(peer_ni)) { /* still in peer_ni table */ + kiblnd_unlink_peer_locked(peer_ni); + + /* set/clear error on last conn */ + peer_ni->ibp_error = conn->ibc_comms_error; + } + + kiblnd_set_conn_state(conn, IBLND_CONN_CLOSING); + + if (error != 0 && + kiblnd_dev_can_failover(dev)) { + list_add_tail(&dev->ibd_fail_list, + &kiblnd_data.kib_failed_devs); + wake_up(&kiblnd_data.kib_failover_waitq); + } + + spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); + + list_add_tail(&conn->ibc_list, &kiblnd_data.kib_connd_conns); + wake_up(&kiblnd_data.kib_connd_waitq); + + spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags); +} + +void +kiblnd_close_conn(struct kib_conn *conn, int error) +{ + unsigned long flags; + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + kiblnd_close_conn_locked(conn, error); + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); +} + +static void +kiblnd_handle_early_rxs(struct kib_conn *conn) +{ + unsigned long flags; + struct kib_rx *rx; + + LASSERT(!in_interrupt()); + LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + while ((rx = list_first_entry_or_null(&conn->ibc_early_rxs, + struct kib_rx, + rx_list)) != NULL) { + list_del(&rx->rx_list); + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + kiblnd_handle_rx(rx); + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + } + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); +} + +void +kiblnd_abort_txs(struct kib_conn *conn, struct list_head *txs) +{ + LIST_HEAD(zombies); + struct kib_tx *nxt; + struct kib_tx *tx; + + spin_lock(&conn->ibc_lock); + + list_for_each_entry_safe(tx, nxt, txs, tx_list) { + if (txs == &conn->ibc_active_txs) { + LASSERT(!tx->tx_queued); + LASSERT(tx->tx_waiting || + tx->tx_sending != 0); + if (conn->ibc_comms_error == -ETIMEDOUT) { + if (tx->tx_waiting && !tx->tx_sending) + tx->tx_hstatus = + LNET_MSG_STATUS_REMOTE_TIMEOUT; + else if (tx->tx_sending) + tx->tx_hstatus = + LNET_MSG_STATUS_NETWORK_TIMEOUT; + } + } else { + LASSERT(tx->tx_queued); + if (conn->ibc_comms_error == -ETIMEDOUT) + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_TIMEOUT; + else + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + } + + tx->tx_status = -ECONNABORTED; + tx->tx_waiting = 0; + + /* + * TODO: This makes an assumption that + * kiblnd_tx_complete() will be called for each tx. If + * that event is dropped we could end up with stale + * connections floating around. We'd like to deal with + * that in a better way. + * + * Also that means we can exceed the timeout by many + * seconds. + */ + if (tx->tx_sending == 0) { + tx->tx_queued = 0; + list_move(&tx->tx_list, &zombies); + } else { + /* keep tx until cq destroy */ + list_move(&tx->tx_list, &conn->ibc_zombie_txs); + conn->ibc_waits ++; + } + } + + spin_unlock(&conn->ibc_lock); + + /* + * aborting transmits occurs when finalizing the connection. + * The connection is finalized on error. + * Passing LNET_MSG_STATUS_OK to txlist_done() will not + * override the value already set in tx->tx_hstatus above. 
+ */ + kiblnd_txlist_done(&zombies, -ECONNABORTED, LNET_MSG_STATUS_OK); +} + +static bool +kiblnd_tx_may_discard(struct kib_conn *conn) +{ + bool rc = false; + struct kib_tx *nxt; + struct kib_tx *tx; + + spin_lock(&conn->ibc_lock); + + list_for_each_entry_safe(tx, nxt, &conn->ibc_zombie_txs, tx_list) { + if (tx->tx_sending > 0 && tx->tx_lntmsg[0] && + lnet_md_discarded(tx->tx_lntmsg[0]->msg_md)) { + tx->tx_sending --; + if (tx->tx_sending == 0) { + kiblnd_conn_decref(tx->tx_conn); + tx->tx_conn = NULL; + rc = true; + } + } + } + + spin_unlock(&conn->ibc_lock); + return rc; +} + +static void +kiblnd_finalise_conn(struct kib_conn *conn) +{ + LASSERT (!in_interrupt()); + LASSERT (conn->ibc_state > IBLND_CONN_INIT); + + /* abort_receives moves QP state to IB_QPS_ERR. This is only required + * for connections that didn't get as far as being connected, because + * rdma_disconnect() does this for free. */ + kiblnd_abort_receives(conn); + + kiblnd_set_conn_state(conn, IBLND_CONN_DISCONNECTED); + + /* Complete all tx descs not waiting for sends to complete. + * NB we should be safe from RDMA now that the QP has changed state */ + + CDEBUG(D_NET, "abort connection with %s\n", libcfs_nid2str(conn->ibc_peer->ibp_nid)); + + kiblnd_abort_txs(conn, &conn->ibc_tx_noops); + kiblnd_abort_txs(conn, &conn->ibc_tx_queue); + kiblnd_abort_txs(conn, &conn->ibc_tx_queue_rsrvd); + kiblnd_abort_txs(conn, &conn->ibc_tx_queue_nocred); + kiblnd_abort_txs(conn, &conn->ibc_active_txs); + + kiblnd_handle_early_rxs(conn); +} + +static void +kiblnd_peer_connect_failed(struct kib_peer_ni *peer_ni, int active, + int error) +{ + LIST_HEAD(zombies); + unsigned long flags; + enum lnet_msg_hstatus hstatus; + + LASSERT(error != 0); + LASSERT(!in_interrupt()); + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + if (active) { + LASSERT(peer_ni->ibp_connecting > 0); + peer_ni->ibp_connecting--; + } else { + LASSERT (peer_ni->ibp_accepting > 0); + peer_ni->ibp_accepting--; + } + + if (kiblnd_peer_connecting(peer_ni)) { + /* another connection attempt under way... 
*/ + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, + flags); + return; + } + + peer_ni->ibp_reconnected = 0; + if (list_empty(&peer_ni->ibp_conns)) { + /* Take peer_ni's blocked transmits to complete with error */ + list_splice_init(&peer_ni->ibp_tx_queue, &zombies); + + if (kiblnd_peer_active(peer_ni)) + kiblnd_unlink_peer_locked(peer_ni); + + peer_ni->ibp_error = error; + } else { + /* Can't have blocked transmits if there are connections */ + LASSERT(list_empty(&peer_ni->ibp_tx_queue)); + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + kiblnd_peer_notify(peer_ni); + + if (list_empty(&zombies)) + return; + + CNETERR("Deleting messages for %s: connection failed\n", + libcfs_nid2str(peer_ni->ibp_nid)); + + switch (error) { + case -EHOSTUNREACH: + case -ETIMEDOUT: + hstatus = LNET_MSG_STATUS_NETWORK_TIMEOUT; + break; + case -ECONNREFUSED: + hstatus = LNET_MSG_STATUS_REMOTE_DROPPED; + break; + default: + hstatus = LNET_MSG_STATUS_LOCAL_DROPPED; + break; + } + + kiblnd_txlist_done(&zombies, error, hstatus); +} + +static void +kiblnd_connreq_done(struct kib_conn *conn, int status) +{ + struct kib_peer_ni *peer_ni = conn->ibc_peer; + struct kib_tx *tx; + LIST_HEAD(txs); + unsigned long flags; + int active; + + active = (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT); + + CDEBUG(D_NET,"%s: active(%d), version(%x), status(%d)\n", + libcfs_nid2str(peer_ni->ibp_nid), active, + conn->ibc_version, status); + + LASSERT (!in_interrupt()); + LASSERT ((conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT && + peer_ni->ibp_connecting > 0) || + (conn->ibc_state == IBLND_CONN_PASSIVE_WAIT && + peer_ni->ibp_accepting > 0)); + + LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); + conn->ibc_connvars = NULL; + + if (status != 0) { + /* failed to establish connection */ + kiblnd_peer_connect_failed(peer_ni, active, status); + kiblnd_finalise_conn(conn); + return; + } + + /* connection established */ + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + conn->ibc_last_send = ktime_get(); + kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED); + kiblnd_peer_alive(peer_ni); + + /* Add conn to peer_ni's list and nuke any dangling conns from a different + * peer_ni instance... 
*/ + kiblnd_conn_addref(conn); /* +1 ref for ibc_list */ + list_add(&conn->ibc_list, &peer_ni->ibp_conns); + peer_ni->ibp_reconnected = 0; + if (active) + peer_ni->ibp_connecting--; + else + peer_ni->ibp_accepting--; + + if (peer_ni->ibp_version == 0) { + peer_ni->ibp_version = conn->ibc_version; + peer_ni->ibp_incarnation = conn->ibc_incarnation; + } + + if (peer_ni->ibp_version != conn->ibc_version || + peer_ni->ibp_incarnation != conn->ibc_incarnation) { + kiblnd_close_stale_conns_locked(peer_ni, conn->ibc_version, + conn->ibc_incarnation); + peer_ni->ibp_version = conn->ibc_version; + peer_ni->ibp_incarnation = conn->ibc_incarnation; + } + + /* grab pending txs while I have the lock */ + list_splice_init(&peer_ni->ibp_tx_queue, &txs); + + if (!kiblnd_peer_active(peer_ni) || /* peer_ni has been deleted */ + conn->ibc_comms_error != 0) { /* error has happened already */ + + /* start to shut down connection */ + kiblnd_close_conn_locked(conn, -ECONNABORTED); + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + kiblnd_txlist_done(&txs, -ECONNABORTED, + LNET_MSG_STATUS_LOCAL_ERROR); + + return; + } + + /* +1 ref for myself, this connection is visible to other threads + * now, refcount of peer:ibp_conns can be released by connection + * close from either a different thread, or the calling of + * kiblnd_check_sends_locked() below. See bz21911 for details. + */ + kiblnd_conn_addref(conn); + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + /* Schedule blocked txs + * Note: if we are running with conns_per_peer > 1, these blocked + * txs will all get scheduled to the first connection which gets + * scheduled. We won't be using round robin on this first batch. + */ + spin_lock(&conn->ibc_lock); + while ((tx = list_first_entry_or_null(&txs, struct kib_tx, + tx_list)) != NULL) { + list_del(&tx->tx_list); + + kiblnd_queue_tx_locked(tx, conn); + } + kiblnd_check_sends_locked(conn); + spin_unlock(&conn->ibc_lock); + + /* schedule blocked rxs */ + kiblnd_handle_early_rxs(conn); + kiblnd_conn_decref(conn); +} + +static void +kiblnd_reject(struct rdma_cm_id *cmid, struct kib_rej *rej) +{ + int rc; + +#ifdef HAVE_RDMA_REJECT_4ARGS + rc = rdma_reject(cmid, rej, sizeof(*rej), IB_CM_REJ_CONSUMER_DEFINED); +#else + rc = rdma_reject(cmid, rej, sizeof(*rej)); +#endif + + if (rc != 0) + CWARN("Error %d sending reject\n", rc); +} + +static int +kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) +{ + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + struct kib_msg *reqmsg = priv; + struct kib_msg *ackmsg; + struct kib_dev *ibdev; + struct kib_peer_ni *peer_ni; + struct kib_peer_ni *peer2; + struct kib_conn *conn; + struct lnet_ni *ni = NULL; + struct kib_net *net = NULL; + lnet_nid_t nid; + struct rdma_conn_param cp; + struct kib_rej rej; + int version = IBLND_MSG_VERSION; + unsigned long flags; + int rc; + struct sockaddr_in *peer_addr; + + LASSERT(!in_interrupt()); + /* cmid inherits 'context' from the corresponding listener id */ + ibdev = cmid->context; + LASSERT(ibdev); + + memset(&rej, 0, sizeof(rej)); + rej.ibr_magic = IBLND_MSG_MAGIC; + rej.ibr_why = IBLND_REJECT_FATAL; + rej.ibr_cp.ibcp_max_msg_size = IBLND_MSG_SIZE; + + peer_addr = (struct sockaddr_in *)&(cmid->route.addr.dst_addr); + if (*kiblnd_tunables.kib_require_priv_port && + ntohs(peer_addr->sin_port) >= PROT_SOCK) { + __u32 ip = ntohl(peer_addr->sin_addr.s_addr); + CERROR("peer_ni's port (%pI4h:%hu) is not privileged\n", + &ip, ntohs(peer_addr->sin_port)); + goto failed; + } + + if (priv_nob 
< offsetof(struct kib_msg, ibm_type)) { + CERROR("Short connection request\n"); + goto failed; + } + + /* Future protocol version compatibility support! If the + * o2iblnd-specific protocol changes, or when LNET unifies + * protocols over all LNDs, the initial connection will + * negotiate a protocol version. I trap this here to avoid + * console errors; the reject tells the peer_ni which protocol I + * speak. */ + if (reqmsg->ibm_magic == LNET_PROTO_MAGIC || + reqmsg->ibm_magic == __swab32(LNET_PROTO_MAGIC)) + goto failed; + if (reqmsg->ibm_magic == IBLND_MSG_MAGIC && + reqmsg->ibm_version != IBLND_MSG_VERSION && + reqmsg->ibm_version != IBLND_MSG_VERSION_1) + goto failed; + if (reqmsg->ibm_magic == __swab32(IBLND_MSG_MAGIC) && + reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION) && + reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION_1)) + goto failed; + + rc = kiblnd_unpack_msg(reqmsg, priv_nob); + if (rc != 0) { + CERROR("Can't parse connection request: %d\n", rc); + goto failed; + } + + nid = reqmsg->ibm_srcnid; + ni = lnet_nid2ni_addref(reqmsg->ibm_dstnid); + + if (ni != NULL) { + net = (struct kib_net *)ni->ni_data; + rej.ibr_incarnation = net->ibn_incarnation; + } + + if (ni == NULL || /* no matching net */ + lnet_nid_to_nid4(&ni->ni_nid) != + reqmsg->ibm_dstnid || /* right NET, wrong NID! */ + net->ibn_dev != ibdev) { /* wrong device */ + CERROR("Can't accept conn from %s on %s (%s:%d:%pI4h): bad dst nid %s\n", libcfs_nid2str(nid), + ni ? libcfs_nidstr(&ni->ni_nid) : "NA", + ibdev->ibd_ifname, ibdev->ibd_nnets, + &ibdev->ibd_ifip, + libcfs_nid2str(reqmsg->ibm_dstnid)); + + goto failed; + } + + /* check time stamp as soon as possible */ + if (reqmsg->ibm_dststamp != 0 && + reqmsg->ibm_dststamp != net->ibn_incarnation) { + CWARN("Stale connection request\n"); + rej.ibr_why = IBLND_REJECT_CONN_STALE; + goto failed; + } + + /* I can accept peer_ni's version */ + version = reqmsg->ibm_version; + + if (reqmsg->ibm_type != IBLND_MSG_CONNREQ) { + CERROR("Unexpected connreq msg type: %x from %s\n", + reqmsg->ibm_type, libcfs_nid2str(nid)); + goto failed; + } + + if (reqmsg->ibm_u.connparams.ibcp_queue_depth > + kiblnd_msg_queue_size(version, ni)) { + CERROR("Can't accept conn from %s, queue depth too large: %d (<=%d wanted)\n", + libcfs_nid2str(nid), + reqmsg->ibm_u.connparams.ibcp_queue_depth, + kiblnd_msg_queue_size(version, ni)); + + if (version == IBLND_MSG_VERSION) + rej.ibr_why = IBLND_REJECT_MSG_QUEUE_SIZE; + + goto failed; + } + + if (reqmsg->ibm_u.connparams.ibcp_max_frags > + IBLND_MAX_RDMA_FRAGS) { + CWARN("Can't accept conn from %s (version %x): max_frags %d too large (%d wanted)\n", + libcfs_nid2str(nid), version, + reqmsg->ibm_u.connparams.ibcp_max_frags, + IBLND_MAX_RDMA_FRAGS); + + if (version >= IBLND_MSG_VERSION) + rej.ibr_why = IBLND_REJECT_RDMA_FRAGS; + + goto failed; + } else if (reqmsg->ibm_u.connparams.ibcp_max_frags < + IBLND_MAX_RDMA_FRAGS && + net->ibn_fmr_ps == NULL) { + CWARN("Can't accept conn from %s (version %x): max_frags %d incompatible without FMR pool (%d wanted)\n", + libcfs_nid2str(nid), version, + reqmsg->ibm_u.connparams.ibcp_max_frags, + IBLND_MAX_RDMA_FRAGS); + + if (version == IBLND_MSG_VERSION) + rej.ibr_why = IBLND_REJECT_RDMA_FRAGS; + + goto failed; + } + + if (reqmsg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) { + CERROR("Can't accept %s: message size %d too big (%d max)\n", + libcfs_nid2str(nid), + reqmsg->ibm_u.connparams.ibcp_max_msg_size, + IBLND_MSG_SIZE); + goto failed; + } + + /* assume 'nid' is a new peer_ni; create */ + rc = 
kiblnd_create_peer(ni, &peer_ni, nid); + if (rc != 0) { + CERROR("Can't create peer_ni for %s\n", libcfs_nid2str(nid)); + rej.ibr_why = IBLND_REJECT_NO_RESOURCES; + goto failed; + } + + /* We have validated the peer's parameters so use those */ + peer_ni->ibp_max_frags = reqmsg->ibm_u.connparams.ibcp_max_frags; + peer_ni->ibp_queue_depth = reqmsg->ibm_u.connparams.ibcp_queue_depth; + + write_lock_irqsave(g_lock, flags); + + peer2 = kiblnd_find_peer_locked(ni, nid); + if (peer2 != NULL) { + if (peer2->ibp_version == 0) { + peer2->ibp_version = version; + peer2->ibp_incarnation = reqmsg->ibm_srcstamp; + } + + /* not the guy I've talked with */ + if (peer2->ibp_incarnation != reqmsg->ibm_srcstamp || + peer2->ibp_version != version) { + kiblnd_close_peer_conns_locked(peer2, -ESTALE); + + if (kiblnd_peer_active(peer2)) { + peer2->ibp_incarnation = reqmsg->ibm_srcstamp; + peer2->ibp_version = version; + } + write_unlock_irqrestore(g_lock, flags); + + CWARN("Conn stale %s version %x/%x incarnation %llu/%llu\n", + libcfs_nid2str(nid), peer2->ibp_version, version, + peer2->ibp_incarnation, reqmsg->ibm_srcstamp); + + kiblnd_peer_decref(peer_ni); + rej.ibr_why = IBLND_REJECT_CONN_STALE; + goto failed; + } + + /* Tie-break connection race in favour of the higher NID. + * If we keep running into a race condition multiple times, + * we have to assume that the connection attempt with the + * higher NID is stuck in a connecting state and will never + * recover. As such, we pass through this if-block and let + * the lower NID connection win so we can move forward. + */ + if (peer2->ibp_connecting != 0 && + nid < lnet_nid_to_nid4(&ni->ni_nid) && + peer2->ibp_races < MAX_CONN_RACES_BEFORE_ABORT) { + peer2->ibp_races++; + write_unlock_irqrestore(g_lock, flags); + + CDEBUG(D_NET, "Conn race %s\n", + libcfs_nid2str(peer2->ibp_nid)); + + kiblnd_peer_decref(peer_ni); + rej.ibr_why = IBLND_REJECT_CONN_RACE; + goto failed; + } + if (peer2->ibp_races >= MAX_CONN_RACES_BEFORE_ABORT) + CNETERR("Conn race %s: unresolved after %d attempts, letting lower NID win\n", + libcfs_nid2str(peer2->ibp_nid), + MAX_CONN_RACES_BEFORE_ABORT); + /* + * passive connection is allowed even this peer_ni is waiting for + * reconnection. + */ + peer2->ibp_reconnecting = 0; + peer2->ibp_races = 0; + peer2->ibp_accepting++; + kiblnd_peer_addref(peer2); + + /* Race with kiblnd_launch_tx (active connect) to create peer_ni + * so copy validated parameters since we now know what the + * peer_ni's limits are */ + peer2->ibp_max_frags = peer_ni->ibp_max_frags; + peer2->ibp_queue_depth = peer_ni->ibp_queue_depth; + + write_unlock_irqrestore(g_lock, flags); + kiblnd_peer_decref(peer_ni); + peer_ni = peer2; + } else { + /* Brand new peer_ni */ + LASSERT(peer_ni->ibp_accepting == 0); + LASSERT(peer_ni->ibp_version == 0 && + peer_ni->ibp_incarnation == 0); + + peer_ni->ibp_accepting = 1; + peer_ni->ibp_version = version; + peer_ni->ibp_incarnation = reqmsg->ibm_srcstamp; + + /* I have a ref on ni that prevents it being shutdown */ + LASSERT(net->ibn_shutdown == 0); + + kiblnd_peer_addref(peer_ni); + hash_add(kiblnd_data.kib_peers, &peer_ni->ibp_list, nid); + + write_unlock_irqrestore(g_lock, flags); + } + + conn = kiblnd_create_conn(peer_ni, cmid, IBLND_CONN_PASSIVE_WAIT, + version); + if (!conn) { + kiblnd_peer_connect_failed(peer_ni, 0, -ENOMEM); + kiblnd_peer_decref(peer_ni); + rej.ibr_why = IBLND_REJECT_NO_RESOURCES; + goto failed; + } + + /* conn now "owns" cmid, so I return success from here on to ensure the + * CM callback doesn't destroy cmid. 
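+ *
+ * A sketch of the receive-buffer accounting the assertion below
+ * relies on, assuming the usual header definitions where
+ * IBLND_RX_MSGS(conn) is 2 * ibc_queue_depth plus the OOB slots:
+ *
+ *   credits + reserved_credits = 2 * ibc_queue_depth
+ *   IBLND_RX_MSGS(conn)        = 2 * ibc_queue_depth + IBLND_OOB_MSGS(v)
+ *
+ * so every message the peer_ni may legitimately send already has an
+ * RX buffer posted for it.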
+ */ + conn->ibc_incarnation = reqmsg->ibm_srcstamp; + conn->ibc_credits = conn->ibc_queue_depth; + conn->ibc_reserved_credits = conn->ibc_queue_depth; + LASSERT(conn->ibc_credits + conn->ibc_reserved_credits + + IBLND_OOB_MSGS(version) <= IBLND_RX_MSGS(conn)); + + ackmsg = &conn->ibc_connvars->cv_msg; + memset(ackmsg, 0, sizeof(*ackmsg)); + + kiblnd_init_msg(ackmsg, IBLND_MSG_CONNACK, + sizeof(ackmsg->ibm_u.connparams)); + ackmsg->ibm_u.connparams.ibcp_queue_depth = conn->ibc_queue_depth; + ackmsg->ibm_u.connparams.ibcp_max_frags = conn->ibc_max_frags; + ackmsg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE; + + kiblnd_pack_msg(ni, ackmsg, version, 0, nid, reqmsg->ibm_srcstamp); + + memset(&cp, 0, sizeof(cp)); + cp.private_data = ackmsg; + cp.private_data_len = ackmsg->ibm_nob; + cp.responder_resources = 0; /* No atomic ops or RDMA reads */ + cp.initiator_depth = 0; + cp.flow_control = 1; + cp.retry_count = *kiblnd_tunables.kib_retry_count; + cp.rnr_retry_count = *kiblnd_tunables.kib_rnr_retry_count; + + CDEBUG(D_NET, "Accept %s\n", libcfs_nid2str(nid)); + + rc = rdma_accept(cmid, &cp); + if (rc != 0) { + CERROR("Can't accept %s: %d\n", libcfs_nid2str(nid), rc); + rej.ibr_version = version; + rej.ibr_why = IBLND_REJECT_FATAL; + + kiblnd_reject(cmid, &rej); + kiblnd_connreq_done(conn, rc); + kiblnd_conn_decref(conn); + } + + lnet_ni_decref(ni); + return 0; + + failed: + if (ni != NULL) { + rej.ibr_cp.ibcp_queue_depth = + kiblnd_msg_queue_size(version, ni); + rej.ibr_cp.ibcp_max_frags = IBLND_MAX_RDMA_FRAGS; + lnet_ni_decref(ni); + } + + rej.ibr_version = version; + kiblnd_reject(cmid, &rej); + + return -ECONNREFUSED; +} + +static void +kiblnd_check_reconnect(struct kib_conn *conn, int version, + u64 incarnation, int why, struct kib_connparams *cp) +{ + rwlock_t *glock = &kiblnd_data.kib_global_lock; + struct kib_peer_ni *peer_ni = conn->ibc_peer; + char *reason; + int msg_size = IBLND_MSG_SIZE; + int frag_num = -1; + int queue_dep = -1; + bool reconnect; + unsigned long flags; + + LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT); + LASSERT(peer_ni->ibp_connecting > 0); /* 'conn' at least */ + + if (cp) { + msg_size = cp->ibcp_max_msg_size; + frag_num = cp->ibcp_max_frags; + queue_dep = cp->ibcp_queue_depth; + } + + write_lock_irqsave(glock, flags); + /* retry connection if it's still needed and no other connection + * attempts (active or passive) are in progress + * NB: reconnect is still needed even when ibp_tx_queue is + * empty if ibp_version != version because reconnect may be + * initiated. 
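+ *
+ * Restated, the test below reduces to:
+ *
+ *   reconnect = (queued txs || version changed)
+ *               && an active connect is still outstanding
+ *               && no passive connect is racing with us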
+ */
+ reconnect = (!list_empty(&peer_ni->ibp_tx_queue) ||
+ peer_ni->ibp_version != version) &&
+ peer_ni->ibp_connecting &&
+ peer_ni->ibp_accepting == 0;
+ if (!reconnect) {
+ reason = "no need";
+ goto out;
+ }
+
+ switch (why) {
+ default:
+ reason = "Unknown";
+ break;
+
+ case IBLND_REJECT_RDMA_FRAGS: {
+ if (!cp) {
+ reason = "can't negotiate max frags";
+ goto out;
+ }
+
+ if (conn->ibc_max_frags <= frag_num) {
+ reason = "unsupported max frags";
+ goto out;
+ }
+
+ peer_ni->ibp_max_frags = frag_num;
+ reason = "rdma fragments";
+ break;
+ }
+ case IBLND_REJECT_MSG_QUEUE_SIZE:
+ if (!cp) {
+ reason = "can't negotiate queue depth";
+ goto out;
+ }
+ if (conn->ibc_queue_depth <= queue_dep) {
+ reason = "unsupported queue depth";
+ goto out;
+ }
+
+ peer_ni->ibp_queue_depth = queue_dep;
+ reason = "queue depth";
+ break;
+
+ case IBLND_REJECT_CONN_STALE:
+ reason = "stale";
+ break;
+
+ case IBLND_REJECT_CONN_RACE:
+ reason = "conn race";
+ break;
+
+ case IBLND_REJECT_CONN_UNCOMPAT:
+ reason = "version negotiation";
+ break;
+ }
+
+ conn->ibc_reconnect = 1;
+ peer_ni->ibp_reconnecting++;
+ peer_ni->ibp_version = version;
+ if (incarnation != 0)
+ peer_ni->ibp_incarnation = incarnation;
+ out:
+ write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+ CNETERR("%s: %s (%s), %x, %x, msg_size: %d, queue_depth: %d/%d, max_frags: %d/%d\n",
+ libcfs_nid2str(peer_ni->ibp_nid),
+ reconnect ? "reconnect" : "don't reconnect",
+ reason, IBLND_MSG_VERSION, version, msg_size,
+ conn->ibc_queue_depth, queue_dep,
+ conn->ibc_max_frags, frag_num);
+ /*
+ * if conn::ibc_reconnect is TRUE, connd will reconnect to the peer_ni
+ * while destroying the zombie
+ */
+}
+
+static void
+kiblnd_rejected(struct kib_conn *conn, int reason, void *priv, int priv_nob)
+{
+ struct kib_peer_ni *peer_ni = conn->ibc_peer;
+ int status = -ECONNREFUSED;
+
+ LASSERT (!in_interrupt());
+ LASSERT (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
+
+ switch (reason) {
+ case IB_CM_REJ_STALE_CONN:
+ kiblnd_check_reconnect(conn, IBLND_MSG_VERSION, 0,
+ IBLND_REJECT_CONN_STALE, NULL);
+ break;
+
+ case IB_CM_REJ_INVALID_SERVICE_ID:
+ status = -EHOSTUNREACH;
+ CNETERR("%s rejected: no listener at %d\n",
+ libcfs_nid2str(peer_ni->ibp_nid),
+ *kiblnd_tunables.kib_service);
+ break;
+
+ case IB_CM_REJ_CONSUMER_DEFINED:
+ if (priv_nob >= offsetof(struct kib_rej, ibr_padding)) {
+ struct kib_rej *rej = priv;
+ struct kib_connparams *cp = NULL;
+ bool flip = false;
+ __u64 incarnation = -1;
+
+ /* NB. default incarnation is -1 because:
+ * a) V1 will ignore dst incarnation in connreq.
+ * b) V2 will provide incarnation while rejecting me,
+ * so the -1 will be overwritten.
+ *
+ * If I try to connect to a V1 peer_ni with the V2
+ * protocol, it rejects me and then upgrades to V2.
+ * Knowing nothing about the upgrade, I retry with V1;
+ * the now-V2 peer_ni can tell from the -1 incarnation
+ * that I am talking to its old self and rejects me
+ * again.
+ */
+
+ if (rej->ibr_magic == __swab32(IBLND_MSG_MAGIC) ||
+ rej->ibr_magic == __swab32(LNET_PROTO_MAGIC)) {
+ __swab32s(&rej->ibr_magic);
+ __swab16s(&rej->ibr_version);
+ flip = true;
+ }
+
+ if (priv_nob >= sizeof(struct kib_rej) &&
+ rej->ibr_version > IBLND_MSG_VERSION_1) {
+ /* priv_nob is always 148 in current version
+ * of OFED, so we still need to check version.
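+ * (i.e. the size check alone cannot distinguish V1 from V2
+ * rejects, so ibr_incarnation and the connparams are only
+ * trusted when the advertised version is newer than V1)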
+ * (define of IB_CM_REJ_PRIVATE_DATA_SIZE) + */ + cp = &rej->ibr_cp; + + if (flip) { + __swab64s(&rej->ibr_incarnation); + __swab16s(&cp->ibcp_queue_depth); + __swab16s(&cp->ibcp_max_frags); + __swab32s(&cp->ibcp_max_msg_size); + } + + incarnation = rej->ibr_incarnation; + } + + if (rej->ibr_magic != IBLND_MSG_MAGIC && + rej->ibr_magic != LNET_PROTO_MAGIC) { + CERROR("%s rejected: consumer defined fatal error\n", + libcfs_nid2str(peer_ni->ibp_nid)); + break; + } + + if (rej->ibr_version != IBLND_MSG_VERSION && + rej->ibr_version != IBLND_MSG_VERSION_1) { + CERROR("%s rejected: o2iblnd version %x error\n", + libcfs_nid2str(peer_ni->ibp_nid), + rej->ibr_version); + break; + } + + if (rej->ibr_why == IBLND_REJECT_FATAL && + rej->ibr_version == IBLND_MSG_VERSION_1) { + CDEBUG(D_NET, "rejected by old version peer_ni %s: %x\n", + libcfs_nid2str(peer_ni->ibp_nid), + rej->ibr_version); + + if (conn->ibc_version != IBLND_MSG_VERSION_1) + rej->ibr_why = IBLND_REJECT_CONN_UNCOMPAT; + } + + switch (rej->ibr_why) { + case IBLND_REJECT_CONN_RACE: + case IBLND_REJECT_CONN_STALE: + case IBLND_REJECT_CONN_UNCOMPAT: + case IBLND_REJECT_MSG_QUEUE_SIZE: + case IBLND_REJECT_RDMA_FRAGS: + kiblnd_check_reconnect(conn, rej->ibr_version, + incarnation, + rej->ibr_why, cp); + break; + + case IBLND_REJECT_NO_RESOURCES: + CERROR("%s rejected: o2iblnd no resources\n", + libcfs_nid2str(peer_ni->ibp_nid)); + break; + + case IBLND_REJECT_FATAL: + CERROR("%s rejected: o2iblnd fatal error\n", + libcfs_nid2str(peer_ni->ibp_nid)); + break; + + default: + CERROR("%s rejected: o2iblnd reason %d\n", + libcfs_nid2str(peer_ni->ibp_nid), + rej->ibr_why); + break; + } + break; + } + fallthrough; + default: + CNETERR("%s rejected: reason %d, size %d\n", + libcfs_nid2str(peer_ni->ibp_nid), reason, priv_nob); + break; + } + + kiblnd_connreq_done(conn, status); +} + +static void +kiblnd_check_connreply(struct kib_conn *conn, void *priv, int priv_nob) +{ + struct kib_peer_ni *peer_ni = conn->ibc_peer; + struct lnet_ni *ni = peer_ni->ibp_ni; + struct kib_net *net = ni->ni_data; + struct kib_msg *msg = priv; + int ver = conn->ibc_version; + int rc = kiblnd_unpack_msg(msg, priv_nob); + unsigned long flags; + + LASSERT (net != NULL); + + if (rc != 0) { + CERROR("Can't unpack connack from %s: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), rc); + goto failed; + } + + if (msg->ibm_type != IBLND_MSG_CONNACK) { + CERROR("Unexpected message %d from %s\n", + msg->ibm_type, libcfs_nid2str(peer_ni->ibp_nid)); + rc = -EPROTO; + goto failed; + } + + if (ver != msg->ibm_version) { + CERROR("%s replied version %x is different with " + "requested version %x\n", + libcfs_nid2str(peer_ni->ibp_nid), msg->ibm_version, ver); + rc = -EPROTO; + goto failed; + } + + if (msg->ibm_u.connparams.ibcp_queue_depth > + conn->ibc_queue_depth) { + CERROR("%s has incompatible queue depth %d (<=%d wanted)\n", + libcfs_nid2str(peer_ni->ibp_nid), + msg->ibm_u.connparams.ibcp_queue_depth, + conn->ibc_queue_depth); + rc = -EPROTO; + goto failed; + } + + if (msg->ibm_u.connparams.ibcp_max_frags > + conn->ibc_max_frags) { + CERROR("%s has incompatible max_frags %d (<=%d wanted)\n", + libcfs_nid2str(peer_ni->ibp_nid), + msg->ibm_u.connparams.ibcp_max_frags, + conn->ibc_max_frags); + rc = -EPROTO; + goto failed; + } + + if (msg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) { + CERROR("%s max message size %d too big (%d max)\n", + libcfs_nid2str(peer_ni->ibp_nid), + msg->ibm_u.connparams.ibcp_max_msg_size, + IBLND_MSG_SIZE); + rc = -EPROTO; + goto failed; + } + + 
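+ /* All negotiated parameters check out; finally verify that the
+ * connack was addressed to this NI in its current incarnation.
+ * A mismatch means the reply raced with a local restart and the
+ * connection is stale.
+ */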
read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + if (msg->ibm_dstnid == lnet_nid_to_nid4(&ni->ni_nid) && + msg->ibm_dststamp == net->ibn_incarnation) + rc = 0; + else + rc = -ESTALE; + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + if (rc != 0) { + CERROR("Bad connection reply from %s, rc = %d, " + "version: %x max_frags: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), rc, + msg->ibm_version, msg->ibm_u.connparams.ibcp_max_frags); + goto failed; + } + + conn->ibc_incarnation = msg->ibm_srcstamp; + conn->ibc_credits = msg->ibm_u.connparams.ibcp_queue_depth; + conn->ibc_reserved_credits = msg->ibm_u.connparams.ibcp_queue_depth; + conn->ibc_queue_depth = msg->ibm_u.connparams.ibcp_queue_depth; + conn->ibc_max_frags = msg->ibm_u.connparams.ibcp_max_frags; + LASSERT(conn->ibc_credits + conn->ibc_reserved_credits + + IBLND_OOB_MSGS(ver) <= IBLND_RX_MSGS(conn)); + + kiblnd_connreq_done(conn, 0); + return; + + failed: + /* NB My QP has already established itself, so I handle anything going + * wrong here by setting ibc_comms_error. + * kiblnd_connreq_done(0) moves the conn state to ESTABLISHED, but then + * immediately tears it down. */ + + LASSERT (rc != 0); + conn->ibc_comms_error = rc; + kiblnd_connreq_done(conn, 0); +} + +static int +kiblnd_active_connect(struct rdma_cm_id *cmid) +{ + struct kib_peer_ni *peer_ni = cmid->context; + struct kib_conn *conn; + struct kib_msg *msg; + struct rdma_conn_param cp; + int version; + __u64 incarnation; + unsigned long flags; + int rc; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + incarnation = peer_ni->ibp_incarnation; + version = (peer_ni->ibp_version == 0) ? IBLND_MSG_VERSION : + peer_ni->ibp_version; + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + conn = kiblnd_create_conn(peer_ni, cmid, IBLND_CONN_ACTIVE_CONNECT, + version); + if (conn == NULL) { + kiblnd_peer_connect_failed(peer_ni, 1, -ENOMEM); + kiblnd_peer_decref(peer_ni); /* lose cmid's ref */ + return -ENOMEM; + } + + /* conn "owns" cmid now, so I return success from here on to ensure the + * CM callback doesn't destroy cmid. 
conn also takes over cmid's ref + * on peer_ni */ + + msg = &conn->ibc_connvars->cv_msg; + + memset(msg, 0, sizeof(*msg)); + kiblnd_init_msg(msg, IBLND_MSG_CONNREQ, sizeof(msg->ibm_u.connparams)); + msg->ibm_u.connparams.ibcp_queue_depth = conn->ibc_queue_depth; + msg->ibm_u.connparams.ibcp_max_frags = conn->ibc_max_frags; + msg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE; + + kiblnd_pack_msg(peer_ni->ibp_ni, msg, version, + 0, peer_ni->ibp_nid, incarnation); + + memset(&cp, 0, sizeof(cp)); + cp.private_data = msg; + cp.private_data_len = msg->ibm_nob; + cp.responder_resources = 0; /* No atomic ops or RDMA reads */ + cp.initiator_depth = 0; + cp.flow_control = 1; + cp.retry_count = *kiblnd_tunables.kib_retry_count; + cp.rnr_retry_count = *kiblnd_tunables.kib_rnr_retry_count; + + LASSERT(cmid->context == (void *)conn); + LASSERT(conn->ibc_cmid == cmid); + rc = rdma_connect_locked(cmid, &cp); + if (rc != 0) { + CERROR("Can't connect to %s: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), rc); + kiblnd_connreq_done(conn, rc); + kiblnd_conn_decref(conn); + } + + return 0; +} + +int +kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) +{ + struct kib_peer_ni *peer_ni; + struct kib_conn *conn; + int rc; + + switch (event->event) { + default: + CERROR("Unexpected event: %d, status: %d\n", + event->event, event->status); + LBUG(); + + case RDMA_CM_EVENT_CONNECT_REQUEST: + /* destroy cmid on failure */ + rc = kiblnd_passive_connect(cmid, + (void *)KIBLND_CONN_PARAM(event), + KIBLND_CONN_PARAM_LEN(event)); + CDEBUG(D_NET, "connreq: %d\n", rc); + return rc; + + case RDMA_CM_EVENT_ADDR_ERROR: + peer_ni = cmid->context; + CNETERR("%s: ADDR ERROR %d\n", + libcfs_nid2str(peer_ni->ibp_nid), event->status); + kiblnd_peer_connect_failed(peer_ni, 1, -EHOSTUNREACH); + kiblnd_peer_decref(peer_ni); + return -EHOSTUNREACH; /* rc != 0 destroys cmid */ + + case RDMA_CM_EVENT_ADDR_RESOLVED: + peer_ni = cmid->context; + + CDEBUG(D_NET,"%s Addr resolved: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), event->status); + + if (event->status != 0) { + CNETERR("Can't resolve address for %s: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), event->status); + rc = event->status; + } else { + rc = rdma_resolve_route( + cmid, kiblnd_timeout() * 1000); + if (rc == 0) { + struct kib_net *net = peer_ni->ibp_ni->ni_data; + struct kib_dev *dev = net->ibn_dev; + + CDEBUG(D_NET, "%s: connection bound to "\ + "%s:%pI4h:%s\n", + libcfs_nid2str(peer_ni->ibp_nid), + dev->ibd_ifname, + &dev->ibd_ifip, cmid->device->name); + + return 0; + } + + /* Can't initiate route resolution */ + CERROR("Can't resolve route for %s: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), rc); + } + kiblnd_peer_connect_failed(peer_ni, 1, rc); + kiblnd_peer_decref(peer_ni); + return rc; /* rc != 0 destroys cmid */ + + case RDMA_CM_EVENT_ROUTE_ERROR: + peer_ni = cmid->context; + CNETERR("%s: ROUTE ERROR %d\n", + libcfs_nid2str(peer_ni->ibp_nid), event->status); + kiblnd_peer_connect_failed(peer_ni, 1, -EHOSTUNREACH); + kiblnd_peer_decref(peer_ni); + return -EHOSTUNREACH; /* rc != 0 destroys cmid */ + + case RDMA_CM_EVENT_ROUTE_RESOLVED: + peer_ni = cmid->context; + CDEBUG(D_NET,"%s Route resolved: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), event->status); + + if (event->status == 0) + return kiblnd_active_connect(cmid); + + CNETERR("Can't resolve route for %s: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), event->status); + kiblnd_peer_connect_failed(peer_ni, 1, event->status); + kiblnd_peer_decref(peer_ni); + return event->status; /* rc != 0 destroys cmid */ 
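+
+ /* For the conn-based events below, returning 0 keeps the cmid
+ * alive; failures are reported through kiblnd_connreq_done()
+ * rather than a non-zero return.
+ */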
+ + case RDMA_CM_EVENT_UNREACHABLE: + conn = cmid->context; + LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT || + conn->ibc_state == IBLND_CONN_PASSIVE_WAIT); + CNETERR("%s: UNREACHABLE %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status); + kiblnd_connreq_done(conn, -ENETDOWN); + kiblnd_conn_decref(conn); + return 0; + + case RDMA_CM_EVENT_CONNECT_ERROR: + conn = cmid->context; + LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT || + conn->ibc_state == IBLND_CONN_PASSIVE_WAIT); + CNETERR("%s: CONNECT ERROR %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status); + kiblnd_connreq_done(conn, -ENOTCONN); + kiblnd_conn_decref(conn); + return 0; + + case RDMA_CM_EVENT_REJECTED: + conn = cmid->context; + switch (conn->ibc_state) { + default: + LBUG(); + + case IBLND_CONN_PASSIVE_WAIT: + CERROR ("%s: REJECTED %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + event->status); + kiblnd_connreq_done(conn, -ECONNRESET); + break; + + case IBLND_CONN_ACTIVE_CONNECT: + kiblnd_rejected(conn, event->status, + (void *)KIBLND_CONN_PARAM(event), + KIBLND_CONN_PARAM_LEN(event)); + break; + } + kiblnd_conn_decref(conn); + return 0; + + case RDMA_CM_EVENT_ESTABLISHED: + conn = cmid->context; + switch (conn->ibc_state) { + default: + LBUG(); + + case IBLND_CONN_PASSIVE_WAIT: + CDEBUG(D_NET, "ESTABLISHED (passive): %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kiblnd_connreq_done(conn, 0); + break; + + case IBLND_CONN_ACTIVE_CONNECT: + CDEBUG(D_NET, "ESTABLISHED(active): %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kiblnd_check_connreply(conn, + (void *)KIBLND_CONN_PARAM(event), + KIBLND_CONN_PARAM_LEN(event)); + break; + } + /* net keeps its ref on conn! */ + return 0; + + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + CDEBUG(D_NET, "Ignore TIMEWAIT_EXIT event\n"); + return 0; + + case RDMA_CM_EVENT_DISCONNECTED: + conn = cmid->context; + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { + CERROR("%s DISCONNECTED\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kiblnd_connreq_done(conn, -ECONNRESET); + } else { + kiblnd_close_conn(conn, 0); + } + kiblnd_conn_decref(conn); + cmid->context = NULL; + return 0; + + case RDMA_CM_EVENT_DEVICE_REMOVAL: + LCONSOLE_ERROR_MSG(0x131, + "Received notification of device removal\n" + "Please shutdown LNET to allow this to proceed\n"); + /* Can't remove network from underneath LNET for now, so I have + * to ignore this */ + return 0; + + case RDMA_CM_EVENT_ADDR_CHANGE: + LCONSOLE_INFO("Physical link changed (eg hca/port)\n"); + return 0; + } +} + +static int +kiblnd_check_txs_locked(struct kib_conn *conn, struct list_head *txs) +{ + struct kib_tx *tx; + + list_for_each_entry(tx, txs, tx_list) { + if (txs != &conn->ibc_active_txs) { + LASSERT(tx->tx_queued); + } else { + LASSERT(!tx->tx_queued); + LASSERT(tx->tx_waiting || tx->tx_sending != 0); + } + + if (ktime_compare(ktime_get(), tx->tx_deadline) >= 0) { + CERROR("Timed out tx: %s(WSQ:%d%d%d), %lld seconds\n", + kiblnd_queue2str(conn, txs), + tx->tx_waiting, tx->tx_sending, tx->tx_queued, + kiblnd_timeout() + + ktime_ms_delta(ktime_get(), + tx->tx_deadline) / MSEC_PER_SEC); + return 1; + } + } + + return 0; +} + +static int +kiblnd_conn_timed_out_locked(struct kib_conn *conn) +{ + return kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue) || + kiblnd_check_txs_locked(conn, &conn->ibc_tx_noops) || + kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_rsrvd) || + kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_nocred) || + kiblnd_check_txs_locked(conn, &conn->ibc_active_txs); +} + +static void 
+kiblnd_check_conns (int idx)
+{
+ LIST_HEAD(closes);
+ LIST_HEAD(checksends);
+ LIST_HEAD(timedout_txs);
+ struct hlist_head *peers = &kiblnd_data.kib_peers[idx];
+ struct kib_peer_ni *peer_ni;
+ struct kib_conn *conn;
+ struct kib_tx *tx, *tx_tmp;
+ unsigned long flags;
+
+ /* NB. We expect to have a look at all the peers and not find any
+ * RDMAs to time out, so we just hold the global lock while we
+ * take a look...
+ */
+ write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+ hlist_for_each_entry(peer_ni, peers, ibp_list) {
+ /* Check tx_deadline */
+ list_for_each_entry_safe(tx, tx_tmp, &peer_ni->ibp_tx_queue, tx_list) {
+ if (ktime_compare(ktime_get(), tx->tx_deadline) >= 0) {
+ CWARN("Timed out tx for %s: %lld seconds\n",
+ libcfs_nid2str(peer_ni->ibp_nid),
+ ktime_ms_delta(ktime_get(),
+ tx->tx_deadline) / MSEC_PER_SEC);
+ list_move(&tx->tx_list, &timedout_txs);
+ }
+ }
+
+ list_for_each_entry(conn, &peer_ni->ibp_conns, ibc_list) {
+ int timedout;
+ int sendnoop;
+
+ LASSERT(conn->ibc_state == IBLND_CONN_ESTABLISHED);
+
+ spin_lock(&conn->ibc_lock);
+
+ sendnoop = kiblnd_need_noop(conn);
+ timedout = kiblnd_conn_timed_out_locked(conn);
+ if (!sendnoop && !timedout) {
+ spin_unlock(&conn->ibc_lock);
+ continue;
+ }
+
+ if (timedout) {
+ CERROR("Timed out RDMA with %s (%lld): c: %u, oc: %u, rc: %u\n",
+ libcfs_nid2str(peer_ni->ibp_nid),
+ ktime_get_seconds()
+ - peer_ni->ibp_last_alive,
+ conn->ibc_credits,
+ conn->ibc_outstanding_credits,
+ conn->ibc_reserved_credits);
+ list_add(&conn->ibc_connd_list, &closes);
+ } else {
+ list_add(&conn->ibc_connd_list, &checksends);
+ }
+ /* +ref for 'closes' or 'checksends' */
+ kiblnd_conn_addref(conn);
+
+ spin_unlock(&conn->ibc_lock);
+ }
+ }
+
+ write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+ if (!list_empty(&timedout_txs))
+ kiblnd_txlist_done(&timedout_txs, -ETIMEDOUT,
+ LNET_MSG_STATUS_NETWORK_TIMEOUT);
+
+ /* Handle timeout by closing the whole
+ * connection. We can only be sure RDMA activity
+ * has ceased once the QP has been modified.
+ */
+ while ((conn = list_first_entry_or_null(&closes,
+ struct kib_conn,
+ ibc_connd_list)) != NULL) {
+ list_del(&conn->ibc_connd_list);
+ kiblnd_close_conn(conn, -ETIMEDOUT);
+ kiblnd_conn_decref(conn);
+ }
+
+ /* In case we have enough credits to return via a
+ * NOOP, but there were no non-blocking tx descs
+ * free to do it last time...
+ */
+ while ((conn = list_first_entry_or_null(&checksends,
+ struct kib_conn,
+ ibc_connd_list)) != NULL) {
+ list_del(&conn->ibc_connd_list);
+
+ spin_lock(&conn->ibc_lock);
+ kiblnd_check_sends_locked(conn);
+ spin_unlock(&conn->ibc_lock);
+
+ kiblnd_conn_decref(conn);
+ }
+}
+
+static void
+kiblnd_disconnect_conn(struct kib_conn *conn)
+{
+ LASSERT (!in_interrupt());
+ LASSERT (current == kiblnd_data.kib_connd);
+ LASSERT (conn->ibc_state == IBLND_CONN_CLOSING);
+
+ rdma_disconnect(conn->ibc_cmid);
+ kiblnd_finalise_conn(conn);
+
+ kiblnd_peer_notify(conn->ibc_peer);
+}
+
+/*
+ * High-water for reconnection to the same peer_ni; reconnection attempts
+ * should be delayed after trying more than KIB_RECONN_HIGH_RACE times.
+ */
+#define KIB_RECONN_HIGH_RACE 10
+/*
+ * Allow connd to take a break and handle other things after consecutive
+ * reconnection attempts.
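+ *
+ * e.g. with the value below, after 100 back-to-back reconnect attempts
+ * the connd main loop drops out to service zombies, disconnects and
+ * timeouts before resuming reconnection work.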
+ */ +#define KIB_RECONN_BREAK 100 + +int +kiblnd_connd (void *arg) +{ + spinlock_t *lock = &kiblnd_data.kib_connd_lock; + wait_queue_entry_t wait; + unsigned long flags; + struct kib_conn *conn; + int timeout; + int i; + bool dropped_lock; + int peer_index = 0; + unsigned long deadline = jiffies; + + init_wait(&wait); + kiblnd_data.kib_connd = current; + + spin_lock_irqsave(lock, flags); + + while (!kiblnd_data.kib_shutdown) { + int reconn = 0; + + dropped_lock = false; + + conn = list_first_entry_or_null(&kiblnd_data.kib_connd_zombies, + struct kib_conn, ibc_list); + if (conn) { + struct kib_peer_ni *peer_ni = NULL; + + list_del(&conn->ibc_list); + if (conn->ibc_reconnect) { + peer_ni = conn->ibc_peer; + kiblnd_peer_addref(peer_ni); + } + + spin_unlock_irqrestore(lock, flags); + dropped_lock = true; + + kiblnd_destroy_conn(conn); + + spin_lock_irqsave(lock, flags); + if (!peer_ni) { + LIBCFS_FREE(conn, sizeof(*conn)); + continue; + } + + conn->ibc_peer = peer_ni; + if (peer_ni->ibp_reconnected < KIB_RECONN_HIGH_RACE) + list_add_tail(&conn->ibc_list, + &kiblnd_data.kib_reconn_list); + else + list_add_tail(&conn->ibc_list, + &kiblnd_data.kib_reconn_wait); + } + + conn = list_first_entry_or_null(&kiblnd_data.kib_connd_conns, + struct kib_conn, ibc_list); + if (conn) { + int wait; + + list_del(&conn->ibc_list); + + spin_unlock_irqrestore(lock, flags); + dropped_lock = true; + + kiblnd_disconnect_conn(conn); + wait = conn->ibc_waits; + if (wait == 0) /* keep ref for connd_wait, see below */ + kiblnd_conn_decref(conn); + + spin_lock_irqsave(lock, flags); + + if (wait) + list_add_tail(&conn->ibc_list, + &kiblnd_data.kib_connd_waits); + } + + while (reconn < KIB_RECONN_BREAK) { + if (kiblnd_data.kib_reconn_sec != + ktime_get_real_seconds()) { + kiblnd_data.kib_reconn_sec = ktime_get_real_seconds(); + list_splice_init(&kiblnd_data.kib_reconn_wait, + &kiblnd_data.kib_reconn_list); + } + + conn = list_first_entry_or_null(&kiblnd_data.kib_reconn_list, + struct kib_conn, ibc_list); + if (!conn) + break; + + list_del(&conn->ibc_list); + + spin_unlock_irqrestore(lock, flags); + dropped_lock = true; + + reconn += kiblnd_reconnect_peer(conn->ibc_peer); + kiblnd_peer_decref(conn->ibc_peer); + LIBCFS_FREE(conn, sizeof(*conn)); + + spin_lock_irqsave(lock, flags); + } + + conn = list_first_entry_or_null(&kiblnd_data.kib_connd_waits, + struct kib_conn, ibc_list); + if (conn) { + list_del(&conn->ibc_list); + spin_unlock_irqrestore(lock, flags); + + dropped_lock = kiblnd_tx_may_discard(conn); + if (dropped_lock) + kiblnd_conn_decref(conn); + + spin_lock_irqsave(lock, flags); + if (!dropped_lock) + list_add_tail(&conn->ibc_list, + &kiblnd_data.kib_connd_waits); + } + + /* careful with the jiffy wrap... */ + timeout = (int)(deadline - jiffies); + if (timeout <= 0) { + const int n = 4; + const int p = 1; + int chunk = HASH_SIZE(kiblnd_data.kib_peers); + unsigned int lnd_timeout; + + spin_unlock_irqrestore(lock, flags); + dropped_lock = true; + + /* Time to check for RDMA timeouts on a few more + * peers: I do checks every 'p' seconds on a + * proportion of the peer_ni table and I need to check + * every connection 'n' times within a timeout + * interval, to ensure I detect a timeout on any + * connection within (n+1)/n times the timeout + * interval. 
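+ *
+ * A worked example with the constants below (n = 4, p = 1), assuming
+ * kiblnd_timeout() returns 50 seconds and the peer_ni table has 512
+ * hash heads:
+ *
+ *   chunk = (512 * 4 * 1) / 50 = 40 heads per wakeup
+ *
+ * so the whole table is swept in ~13 one-second wakeups, i.e. roughly
+ * four sweeps per 50-second timeout interval, as intended.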
+ */ + + lnd_timeout = kiblnd_timeout(); + if (lnd_timeout > n * p) + chunk = (chunk * n * p) / lnd_timeout; + if (chunk == 0) + chunk = 1; + + for (i = 0; i < chunk; i++) { + kiblnd_check_conns(peer_index); + peer_index = (peer_index + 1) % + HASH_SIZE(kiblnd_data.kib_peers); + } + + deadline += cfs_time_seconds(p); + spin_lock_irqsave(lock, flags); + } + + if (dropped_lock) + continue; + + /* Nothing to do for 'timeout' */ + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&kiblnd_data.kib_connd_waitq, &wait); + spin_unlock_irqrestore(lock, flags); + + schedule_timeout(timeout); + + remove_wait_queue(&kiblnd_data.kib_connd_waitq, &wait); + spin_lock_irqsave(lock, flags); + } + + spin_unlock_irqrestore(lock, flags); + + kiblnd_thread_fini(); + return 0; +} + +void +kiblnd_qp_event(struct ib_event *event, void *arg) +{ + struct kib_conn *conn = arg; + + switch (event->event) { + case IB_EVENT_COMM_EST: + CDEBUG(D_NET, "%s established\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + /* We received a packet but connection isn't established + * probably handshake packet was lost, so free to + * force make connection established */ + rdma_notify(conn->ibc_cmid, IB_EVENT_COMM_EST); + return; + + case IB_EVENT_PORT_ERR: + case IB_EVENT_DEVICE_FATAL: + CERROR("Fatal device error for NI %s\n", + libcfs_nidstr(&conn->ibc_peer->ibp_ni->ni_nid)); + atomic_set(&conn->ibc_peer->ibp_ni->ni_fatal_error_on, 1); + return; + + case IB_EVENT_PORT_ACTIVE: + CERROR("Port reactivated for NI %s\n", + libcfs_nidstr(&conn->ibc_peer->ibp_ni->ni_nid)); + atomic_set(&conn->ibc_peer->ibp_ni->ni_fatal_error_on, 0); + return; + + default: + CERROR("%s: Async QP event type %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event); + return; + } +} + +static void +kiblnd_complete (struct ib_wc *wc) +{ + switch (kiblnd_wreqid2type(wc->wr_id)) { + default: + LBUG(); + + case IBLND_WID_MR: + if (wc->status != IB_WC_SUCCESS && + wc->status != IB_WC_WR_FLUSH_ERR) + CNETERR("FastReg failed: %d\n", wc->status); + return; + + case IBLND_WID_RDMA: + /* We only get RDMA completion notification if it fails. All + * subsequent work items, including the final SEND will fail + * too. However we can't print out any more info about the + * failing RDMA because 'tx' might be back on the idle list or + * even reused already if we didn't manage to post all our work + * items */ + CNETERR("RDMA (tx: %p) failed: %d\n", + kiblnd_wreqid2ptr(wc->wr_id), wc->status); + return; + + case IBLND_WID_TX: + kiblnd_tx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status); + return; + + case IBLND_WID_RX: + kiblnd_rx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status, + wc->byte_len); + return; + } +} + +void +kiblnd_cq_completion(struct ib_cq *cq, void *arg) +{ + /* NB I'm not allowed to schedule this conn once its refcount has + * reached 0. Since fundamentally I'm racing with scheduler threads + * consuming my CQ I could be called after all completions have + * occurred. But in this case, ibc_nrx == 0 && ibc_nsends_posted == 0 + * and this CQ is about to be destroyed so I NOOP. 
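+ * (The ibc_nrx/ibc_nsends_posted test below is what makes this safe:
+ * a conn with no outstanding work is never rescheduled.)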
*/ + struct kib_conn *conn = arg; + struct kib_sched_info *sched = conn->ibc_sched; + unsigned long flags; + + LASSERT(cq == conn->ibc_cq); + + spin_lock_irqsave(&sched->ibs_lock, flags); + + conn->ibc_ready = 1; + + if (!conn->ibc_scheduled && + (conn->ibc_nrx > 0 || + conn->ibc_nsends_posted > 0)) { + kiblnd_conn_addref(conn); /* +1 ref for sched_conns */ + conn->ibc_scheduled = 1; + list_add_tail(&conn->ibc_sched_list, &sched->ibs_conns); + + if (waitqueue_active(&sched->ibs_waitq)) + wake_up(&sched->ibs_waitq); + } + + spin_unlock_irqrestore(&sched->ibs_lock, flags); +} + +void +kiblnd_cq_event(struct ib_event *event, void *arg) +{ + struct kib_conn *conn = arg; + + CERROR("%s: async CQ event type %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event); +} + +int +kiblnd_scheduler(void *arg) +{ + long id = (long)arg; + struct kib_sched_info *sched; + struct kib_conn *conn; + wait_queue_entry_t wait; + unsigned long flags; + struct ib_wc wc; + bool did_something; + int rc; + + init_wait(&wait); + + sched = kiblnd_data.kib_scheds[KIB_THREAD_CPT(id)]; + + rc = cfs_cpt_bind(lnet_cpt_table(), sched->ibs_cpt); + if (rc != 0) { + CWARN("Unable to bind on CPU partition %d, please verify whether all CPUs are healthy and reload modules if necessary, otherwise your system might under risk of low performance\n", sched->ibs_cpt); + } + + spin_lock_irqsave(&sched->ibs_lock, flags); + + while (!kiblnd_data.kib_shutdown) { + if (need_resched()) { + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + cond_resched(); + + spin_lock_irqsave(&sched->ibs_lock, flags); + } + + did_something = false; + + conn = list_first_entry_or_null(&sched->ibs_conns, + struct kib_conn, + ibc_sched_list); + if (conn) { + /* take over kib_sched_conns' ref on conn... */ + LASSERT(conn->ibc_scheduled); + list_del(&conn->ibc_sched_list); + conn->ibc_ready = 0; + + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + wc.wr_id = IBLND_WID_INVAL; + + rc = ib_poll_cq(conn->ibc_cq, 1, &wc); + if (rc == 0) { + rc = ib_req_notify_cq(conn->ibc_cq, + IB_CQ_NEXT_COMP); + if (rc < 0) { + CWARN("%s: ib_req_notify_cq failed: %d, closing connection\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); + kiblnd_close_conn(conn, -EIO); + kiblnd_conn_decref(conn); + spin_lock_irqsave(&sched->ibs_lock, + flags); + continue; + } + + rc = ib_poll_cq(conn->ibc_cq, 1, &wc); + } + + if (unlikely(rc > 0 && wc.wr_id == IBLND_WID_INVAL)) { + LCONSOLE_ERROR( + "ib_poll_cq (rc: %d) returned invalid " + "wr_id, opcode %d, status: %d, " + "vendor_err: %d, conn: %s status: %d\n" + "please upgrade firmware and OFED or " + "contact vendor.\n", rc, + wc.opcode, wc.status, wc.vendor_err, + libcfs_nid2str(conn->ibc_peer->ibp_nid), + conn->ibc_state); + rc = -EINVAL; + } + + if (rc < 0) { + CWARN("%s: ib_poll_cq failed: %d, closing connection\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + rc); + kiblnd_close_conn(conn, -EIO); + kiblnd_conn_decref(conn); + spin_lock_irqsave(&sched->ibs_lock, flags); + continue; + } + + spin_lock_irqsave(&sched->ibs_lock, flags); + + if (rc != 0 || conn->ibc_ready) { + /* There may be another completion waiting; get + * another scheduler to check while I handle + * this one... 
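+ * (requeueing under ibs_lock with ibc_scheduled still set keeps the
+ * conn on at most one scheduler list at a time)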
*/ + /* +1 ref for sched_conns */ + kiblnd_conn_addref(conn); + list_add_tail(&conn->ibc_sched_list, + &sched->ibs_conns); + if (waitqueue_active(&sched->ibs_waitq)) + wake_up(&sched->ibs_waitq); + } else { + conn->ibc_scheduled = 0; + } + + if (rc != 0) { + spin_unlock_irqrestore(&sched->ibs_lock, flags); + kiblnd_complete(&wc); + + spin_lock_irqsave(&sched->ibs_lock, flags); + } + + kiblnd_conn_decref(conn); /* ..drop my ref from above */ + did_something = true; + } + + if (did_something) + continue; + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue_exclusive(&sched->ibs_waitq, &wait); + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + schedule(); + + remove_wait_queue(&sched->ibs_waitq, &wait); + set_current_state(TASK_RUNNING); + spin_lock_irqsave(&sched->ibs_lock, flags); + } + + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + kiblnd_thread_fini(); + return 0; +} + +int +kiblnd_failover_thread(void *arg) +{ + rwlock_t *glock = &kiblnd_data.kib_global_lock; + struct kib_dev *dev; + struct net *ns = arg; + wait_queue_entry_t wait; + unsigned long flags; + int rc; + + LASSERT(*kiblnd_tunables.kib_dev_failover != 0); + + init_wait(&wait); + write_lock_irqsave(glock, flags); + + while (!kiblnd_data.kib_shutdown) { + bool do_failover = false; + int long_sleep; + + list_for_each_entry(dev, &kiblnd_data.kib_failed_devs, + ibd_fail_list) { + if (ktime_get_seconds() < dev->ibd_next_failover) + continue; + do_failover = true; + break; + } + + if (do_failover) { + list_del_init(&dev->ibd_fail_list); + dev->ibd_failover = 1; + write_unlock_irqrestore(glock, flags); + + rc = kiblnd_dev_failover(dev, ns); + + write_lock_irqsave(glock, flags); + + LASSERT(dev->ibd_failover); + dev->ibd_failover = 0; + if (rc >= 0) { /* Device is OK or failover succeed */ + dev->ibd_next_failover = ktime_get_seconds() + 3; + continue; + } + + /* failed to failover, retry later */ + dev->ibd_next_failover = ktime_get_seconds() + + min(dev->ibd_failed_failover, 10); + if (kiblnd_dev_can_failover(dev)) { + list_add_tail(&dev->ibd_fail_list, + &kiblnd_data.kib_failed_devs); + } + + continue; + } + + /* long sleep if no more pending failover */ + long_sleep = list_empty(&kiblnd_data.kib_failed_devs); + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&kiblnd_data.kib_failover_waitq, &wait); + write_unlock_irqrestore(glock, flags); + + rc = schedule_timeout(long_sleep ? cfs_time_seconds(10) : + cfs_time_seconds(1)); + set_current_state(TASK_RUNNING); + remove_wait_queue(&kiblnd_data.kib_failover_waitq, &wait); + write_lock_irqsave(glock, flags); + + if (!long_sleep || rc != 0) + continue; + + /* have a long sleep, routine check all active devices, + * we need checking like this because if there is not active + * connection on the dev and no SEND from local, we may listen + * on wrong HCA for ever while there is a bonding failover + */ + list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) { + if (kiblnd_dev_can_failover(dev)) { + list_add_tail(&dev->ibd_fail_list, + &kiblnd_data.kib_failed_devs); + } + } + } + + write_unlock_irqrestore(glock, flags); + + kiblnd_thread_fini(); + return 0; +} diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_modparams.c new file mode 100644 index 0000000000000..95e72002c1c74 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_modparams.c @@ -0,0 +1,332 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/klnds/o2iblnd/o2iblnd_modparams.c + * + * Author: Eric Barton + */ + +#include "o2iblnd.h" + +#define CURRENT_LND_VERSION 1 + +static int service = 987; +module_param(service, int, 0444); +MODULE_PARM_DESC(service, "service number (within RDMA_PS_TCP)"); + +static int cksum = 0; +module_param(cksum, int, 0644); +MODULE_PARM_DESC(cksum, "set non-zero to enable message (not RDMA) checksums"); + +static int timeout; +module_param(timeout, int, 0644); +MODULE_PARM_DESC(timeout, "timeout (seconds)"); + +/* Number of threads in each scheduler pool which is percpt, + * we will estimate reasonable value based on CPUs if it's set to zero. */ +static int nscheds; +module_param(nscheds, int, 0444); +MODULE_PARM_DESC(nscheds, "number of threads in each scheduler pool"); + +static unsigned int conns_per_peer = 1; +module_param(conns_per_peer, uint, 0444); +MODULE_PARM_DESC(conns_per_peer, "number of connections per peer"); + +/* NB: this value is shared by all CPTs, it can grow at runtime */ +static int ntx = 512; +module_param(ntx, int, 0444); +MODULE_PARM_DESC(ntx, "# of message descriptors allocated for each pool"); + +/* NB: this value is shared by all CPTs */ +static int credits = DEFAULT_CREDITS; +module_param(credits, int, 0444); +MODULE_PARM_DESC(credits, "# concurrent sends"); + +static int peer_credits = DEFAULT_PEER_CREDITS; +module_param(peer_credits, int, 0444); +MODULE_PARM_DESC(peer_credits, "# concurrent sends to 1 peer"); + +static int peer_credits_hiw = 0; +module_param(peer_credits_hiw, int, 0444); +MODULE_PARM_DESC(peer_credits_hiw, "when eagerly to return credits"); + +static int peer_buffer_credits = 0; +module_param(peer_buffer_credits, int, 0444); +MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits"); + +static int peer_timeout = DEFAULT_PEER_TIMEOUT; +module_param(peer_timeout, int, 0444); +MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)"); + +static char *ipif_name = "ib0"; +module_param(ipif_name, charp, 0444); +MODULE_PARM_DESC(ipif_name, "IPoIB interface name"); + +static int retry_count = 5; +module_param(retry_count, int, 0644); +MODULE_PARM_DESC(retry_count, "Number of times to retry connection operations"); + +static int rnr_retry_count = 6; +module_param(rnr_retry_count, int, 0644); +MODULE_PARM_DESC(rnr_retry_count, "RNR retransmissions"); + +static int keepalive = 100; +module_param(keepalive, int, 0644); +MODULE_PARM_DESC(keepalive, "Idle time in seconds before sending a keepalive"); + +static int ib_mtu; +module_param(ib_mtu, 
int, 0444);
+MODULE_PARM_DESC(ib_mtu, "IB MTU 256/512/1024/2048/4096");
+
+static int concurrent_sends;
+module_param(concurrent_sends, int, 0444);
+MODULE_PARM_DESC(concurrent_sends, "send work-queue sizing");
+
+static int use_fastreg_gaps;
+module_param(use_fastreg_gaps, int, 0444);
+MODULE_PARM_DESC(use_fastreg_gaps, "Enable discontiguous fastreg fragment support. Expect performance drop");
+
+/*
+ * map_on_demand is a flag used to determine if we can use FMR or FastReg.
+ * This is applicable for kernels which support global memory regions. For
+ * later kernels this flag is always enabled, since we will always either
+ * use FMR or FastReg.
+ * For kernels which support global memory regions map_on_demand defaults
+ * to 0, which means we will be using global memory regions exclusively.
+ * If it is set to a value other than 0, then we will behave as follows:
+ * 1. Always default the number of fragments to IBLND_MAX_RDMA_FRAGS
+ * 2. Create FMR/FastReg pools
+ * 3. Negotiate the supported number of fragments per connection
+ * 4. Attempt to transmit using global memory regions only if
+ * map-on-demand is not turned on, otherwise use FMR or FastReg
+ * 5. In case of transmitting tx with GAPS over FMR we will need to
+ * transmit it with multiple fragments. Look at the comments in
+ * kiblnd_fmr_map_tx() for an explanation of the behavior.
+ *
+ * For later kernels we default map_on_demand to 1 and do not allow
+ * it to be set to 0, since there is no longer support for global memory
+ * regions. Behavior:
+ * 1. Default the number of fragments to IBLND_MAX_RDMA_FRAGS
+ * 2. Create FMR/FastReg pools
+ * 3. Negotiate the supported number of fragments per connection
+ * 4. Look at the comments in kiblnd_fmr_map_tx() for an explanation of
+ * the behavior when transmitting with GAPS versus contiguous.
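+ *
+ * e.g. loading this module with map_on_demand=1 (the default here)
+ * selects the FMR/FastReg path described above; on kernels without
+ * ib_get_dma_mr() a value of 0 is silently promoted back to 1 by
+ * kiblnd_tunables_setup().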
+ */ +#ifdef HAVE_IB_GET_DMA_MR +#define MOD_STR "map on demand" +#else +#define MOD_STR "map on demand (obsolete)" +#endif +static int map_on_demand = 1; +module_param(map_on_demand, int, 0444); +MODULE_PARM_DESC(map_on_demand, MOD_STR); + +/* NB: this value is shared by all CPTs, it can grow at runtime */ +static int fmr_pool_size = 512; +module_param(fmr_pool_size, int, 0444); +MODULE_PARM_DESC(fmr_pool_size, "size of fmr pool on each CPT (>= ntx / 4)"); + +/* NB: this value is shared by all CPTs, it can grow at runtime */ +static int fmr_flush_trigger = 384; +module_param(fmr_flush_trigger, int, 0444); +MODULE_PARM_DESC(fmr_flush_trigger, "# dirty FMRs that triggers pool flush"); + +static int fmr_cache = 1; +module_param(fmr_cache, int, 0444); +MODULE_PARM_DESC(fmr_cache, "non-zero to enable FMR caching"); + +/* + * 0: disable failover + * 1: enable failover if necessary + * 2: force to failover (for debug) + */ +static int dev_failover = 0; +module_param(dev_failover, int, 0444); +MODULE_PARM_DESC(dev_failover, "HCA failover for bonding (0 off, 1 on, other values reserved)"); + +static int require_privileged_port; +module_param(require_privileged_port, int, 0644); +MODULE_PARM_DESC(require_privileged_port, "require privileged port when accepting connection"); + +static int use_privileged_port = 1; +module_param(use_privileged_port, int, 0644); +MODULE_PARM_DESC(use_privileged_port, "use privileged port when initiating connection"); + +static unsigned int wrq_sge = 2; +module_param(wrq_sge, uint, 0444); +MODULE_PARM_DESC(wrq_sge, "# scatter/gather element per work request"); + +struct kib_tunables kiblnd_tunables = { + .kib_dev_failover = &dev_failover, + .kib_service = &service, + .kib_cksum = &cksum, + .kib_timeout = &timeout, + .kib_keepalive = &keepalive, + .kib_default_ipif = &ipif_name, + .kib_retry_count = &retry_count, + .kib_rnr_retry_count = &rnr_retry_count, + .kib_ib_mtu = &ib_mtu, + .kib_require_priv_port = &require_privileged_port, + .kib_use_priv_port = &use_privileged_port, + .kib_nscheds = &nscheds, + .kib_wrq_sge = &wrq_sge, + .kib_use_fastreg_gaps = &use_fastreg_gaps, +}; + +static struct lnet_ioctl_config_o2iblnd_tunables default_tunables; + +/* # messages/RDMAs in-flight */ +int +kiblnd_msg_queue_size(int version, struct lnet_ni *ni) +{ + if (version == IBLND_MSG_VERSION_1) + return IBLND_MSG_QUEUE_SIZE_V1; + else if (ni) + return ni->ni_net->net_tunables.lct_peer_tx_credits; + else + return peer_credits; +} + +int +kiblnd_tunables_setup(struct lnet_ni *ni) +{ + struct lnet_ioctl_config_o2iblnd_tunables *tunables; + struct lnet_ioctl_config_lnd_cmn_tunables *net_tunables; + + /* + * if there was no tunables specified, setup the tunables to be + * defaulted + */ + if (!ni->ni_lnd_tunables_set) + memcpy(&ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib, + &default_tunables, sizeof(*tunables)); + + tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; + + /* Current API version */ + tunables->lnd_version = CURRENT_LND_VERSION; + + if (*kiblnd_tunables.kib_ib_mtu && + ib_mtu_enum_to_int(ib_mtu_int_to_enum(*kiblnd_tunables.kib_ib_mtu)) != + *kiblnd_tunables.kib_ib_mtu) { + CERROR("Invalid ib_mtu %d, expected 256/512/1024/2048/4096\n", + *kiblnd_tunables.kib_ib_mtu); + return -EINVAL; + } + + net_tunables = &ni->ni_net->net_tunables; + + if (net_tunables->lct_peer_timeout == -1) + net_tunables->lct_peer_timeout = peer_timeout; + + if (net_tunables->lct_max_tx_credits == -1) + net_tunables->lct_max_tx_credits = credits; + + if (net_tunables->lct_peer_tx_credits == -1) + 
net_tunables->lct_peer_tx_credits = peer_credits; + + if (net_tunables->lct_peer_rtr_credits == -1) + net_tunables->lct_peer_rtr_credits = peer_buffer_credits; + + if (net_tunables->lct_peer_tx_credits < IBLND_CREDITS_DEFAULT) + net_tunables->lct_peer_tx_credits = IBLND_CREDITS_DEFAULT; + + if (net_tunables->lct_peer_tx_credits > IBLND_CREDITS_MAX) + net_tunables->lct_peer_tx_credits = IBLND_CREDITS_MAX; + + if (net_tunables->lct_peer_tx_credits > + net_tunables->lct_max_tx_credits) + net_tunables->lct_peer_tx_credits = + net_tunables->lct_max_tx_credits; + +#ifndef HAVE_IB_GET_DMA_MR + /* + * For kernels which do not support global memory regions, always + * enable map_on_demand + */ + if (tunables->lnd_map_on_demand == 0) + tunables->lnd_map_on_demand = 1; +#endif + + if (!tunables->lnd_peercredits_hiw) + tunables->lnd_peercredits_hiw = peer_credits_hiw; + + if (tunables->lnd_peercredits_hiw < net_tunables->lct_peer_tx_credits / 2) + tunables->lnd_peercredits_hiw = net_tunables->lct_peer_tx_credits / 2; + + if (tunables->lnd_peercredits_hiw >= net_tunables->lct_peer_tx_credits) + tunables->lnd_peercredits_hiw = net_tunables->lct_peer_tx_credits - 1; + + if (tunables->lnd_concurrent_sends == 0) + tunables->lnd_concurrent_sends = net_tunables->lct_peer_tx_credits; + + if (tunables->lnd_concurrent_sends > net_tunables->lct_peer_tx_credits * 2) + tunables->lnd_concurrent_sends = net_tunables->lct_peer_tx_credits * 2; + + if (tunables->lnd_concurrent_sends < net_tunables->lct_peer_tx_credits / 2) + tunables->lnd_concurrent_sends = net_tunables->lct_peer_tx_credits / 2; + + if (tunables->lnd_concurrent_sends < net_tunables->lct_peer_tx_credits) { + CWARN("Concurrent sends %d is lower than message " + "queue size: %d, performance may drop slightly.\n", + tunables->lnd_concurrent_sends, + net_tunables->lct_peer_tx_credits); + } + + if (!tunables->lnd_fmr_pool_size) + tunables->lnd_fmr_pool_size = fmr_pool_size; + if (!tunables->lnd_fmr_flush_trigger) + tunables->lnd_fmr_flush_trigger = fmr_flush_trigger; + if (!tunables->lnd_fmr_cache) + tunables->lnd_fmr_cache = fmr_cache; + if (!tunables->lnd_ntx) + tunables->lnd_ntx = ntx; + if (!tunables->lnd_conns_per_peer) { + tunables->lnd_conns_per_peer = (conns_per_peer) ? + conns_per_peer : 1; + } + + return 0; +} + +int +kiblnd_tunables_init(void) +{ + default_tunables.lnd_version = CURRENT_LND_VERSION; + default_tunables.lnd_peercredits_hiw = peer_credits_hiw; + default_tunables.lnd_map_on_demand = map_on_demand; + default_tunables.lnd_concurrent_sends = concurrent_sends; + default_tunables.lnd_fmr_pool_size = fmr_pool_size; + default_tunables.lnd_fmr_flush_trigger = fmr_flush_trigger; + default_tunables.lnd_fmr_cache = fmr_cache; + default_tunables.lnd_ntx = ntx; + default_tunables.lnd_conns_per_peer = conns_per_peer; + return 0; +} diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c new file mode 100644 index 0000000000000..9ea8c318c8190 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c @@ -0,0 +1,2596 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lnet/klnds/socklnd/socklnd.c
+ *
+ * Author: Zach Brown
+ * Author: Peter J. Braam
+ * Author: Phil Schwan
+ * Author: Eric Barton
+ */
+
+#include <linux/ethtool.h>
+#include <linux/inetdevice.h>
+#include "socklnd.h"
+#include <linux/sunrpc/addr.h>
+
+static const struct lnet_lnd the_ksocklnd;
+struct ksock_nal_data ksocknal_data;
+
+static struct ksock_interface *
+ksocknal_ip2iface(struct lnet_ni *ni, struct sockaddr *addr)
+{
+ struct ksock_net *net = ni->ni_data;
+ struct ksock_interface *iface;
+
+ iface = &net->ksnn_interface;
+
+ if (rpc_cmp_addr((struct sockaddr *)&iface->ksni_addr, addr))
+ return iface;
+
+ return NULL;
+}
+
+static struct ksock_interface *
+ksocknal_index2iface(struct lnet_ni *ni, int index)
+{
+ struct ksock_net *net = ni->ni_data;
+ struct ksock_interface *iface;
+
+ iface = &net->ksnn_interface;
+
+ if (iface->ksni_index == index)
+ return iface;
+
+ return NULL;
+}
+
+static int ksocknal_ip2index(struct sockaddr *addr, struct lnet_ni *ni)
+{
+ struct net_device *dev;
+ int ret = -1;
+ DECLARE_CONST_IN_IFADDR(ifa);
+
+ if (addr->sa_family != AF_INET)
+ /* No IPv6 support yet */
+ return ret;
+
+ rcu_read_lock();
+ for_each_netdev(ni->ni_net_ns, dev) {
+ int flags = dev_get_flags(dev);
+ struct in_device *in_dev;
+
+ if (flags & IFF_LOOPBACK) /* skip the loopback IF */
+ continue;
+
+ if (!(flags & IFF_UP))
+ continue;
+
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ continue;
+
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
+ if (ifa->ifa_local ==
+ ((struct sockaddr_in *)addr)->sin_addr.s_addr)
+ ret = dev->ifindex;
+ }
+ endfor_ifa(in_dev);
+ if (ret >= 0)
+ break;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static struct ksock_conn_cb *
+ksocknal_create_conn_cb(struct sockaddr *addr)
+{
+ struct ksock_conn_cb *conn_cb;
+
+ LIBCFS_ALLOC(conn_cb, sizeof(*conn_cb));
+ if (!conn_cb)
+ return NULL;
+
+ refcount_set(&conn_cb->ksnr_refcount, 1);
+ conn_cb->ksnr_peer = NULL;
+ conn_cb->ksnr_retry_interval = 0; /* OK to connect at any time */
+ rpc_copy_addr((struct sockaddr *)&conn_cb->ksnr_addr, addr);
+ rpc_set_port((struct sockaddr *)&conn_cb->ksnr_addr,
+ rpc_get_port(addr));
+ conn_cb->ksnr_myiface = -1;
+ conn_cb->ksnr_scheduled = 0;
+ conn_cb->ksnr_connecting = 0;
+ conn_cb->ksnr_connected = 0;
+ conn_cb->ksnr_deleted = 0;
+ conn_cb->ksnr_conn_count = 0;
+ conn_cb->ksnr_ctrl_conn_count = 0;
+ conn_cb->ksnr_blki_conn_count = 0;
+ conn_cb->ksnr_blko_conn_count = 0;
+ conn_cb->ksnr_max_conns = 0;
+
+ return conn_cb;
+}
+
+void
+ksocknal_destroy_conn_cb(struct ksock_conn_cb *conn_cb)
+{
+ LASSERT(refcount_read(&conn_cb->ksnr_refcount) == 0);
+
+ if (conn_cb->ksnr_peer)
+ ksocknal_peer_decref(conn_cb->ksnr_peer);
+
+ LIBCFS_FREE(conn_cb, sizeof(*conn_cb));
+}
+
+static struct ksock_peer_ni *
+ksocknal_create_peer(struct lnet_ni *ni, struct
lnet_processid *id) +{ + int cpt = lnet_nid2cpt(&id->nid, ni); + struct ksock_net *net = ni->ni_data; + struct ksock_peer_ni *peer_ni; + + LASSERT(!LNET_NID_IS_ANY(&id->nid)); + LASSERT(id->pid != LNET_PID_ANY); + LASSERT(!in_interrupt()); + + if (!atomic_inc_unless_negative(&net->ksnn_npeers)) { + CERROR("Can't create peer_ni: network shutdown\n"); + return ERR_PTR(-ESHUTDOWN); + } + + LIBCFS_CPT_ALLOC(peer_ni, lnet_cpt_table(), cpt, sizeof(*peer_ni)); + if (!peer_ni) { + atomic_dec(&net->ksnn_npeers); + return ERR_PTR(-ENOMEM); + } + + peer_ni->ksnp_ni = ni; + peer_ni->ksnp_id = *id; + refcount_set(&peer_ni->ksnp_refcount, 1); /* 1 ref for caller */ + peer_ni->ksnp_closing = 0; + peer_ni->ksnp_accepting = 0; + peer_ni->ksnp_proto = NULL; + peer_ni->ksnp_last_alive = 0; + peer_ni->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1; + peer_ni->ksnp_conn_cb = NULL; + + INIT_LIST_HEAD(&peer_ni->ksnp_conns); + INIT_LIST_HEAD(&peer_ni->ksnp_tx_queue); + INIT_LIST_HEAD(&peer_ni->ksnp_zc_req_list); + spin_lock_init(&peer_ni->ksnp_lock); + + return peer_ni; +} + +void +ksocknal_destroy_peer(struct ksock_peer_ni *peer_ni) +{ + struct ksock_net *net = peer_ni->ksnp_ni->ni_data; + + CDEBUG (D_NET, "peer_ni %s %p deleted\n", + libcfs_idstr(&peer_ni->ksnp_id), peer_ni); + + LASSERT(refcount_read(&peer_ni->ksnp_refcount) == 0); + LASSERT(peer_ni->ksnp_accepting == 0); + LASSERT(list_empty(&peer_ni->ksnp_conns)); + LASSERT(peer_ni->ksnp_conn_cb == NULL); + LASSERT(list_empty(&peer_ni->ksnp_tx_queue)); + LASSERT(list_empty(&peer_ni->ksnp_zc_req_list)); + + LIBCFS_FREE(peer_ni, sizeof(*peer_ni)); + + /* NB a peer_ni's connections and conn_cb keep a reference on their + * peer_ni until they are destroyed, so we can be assured that _all_ + * state to do with this peer_ni has been cleaned up when its refcount + * drops to zero. + */ + if (atomic_dec_and_test(&net->ksnn_npeers)) + wake_up_var(&net->ksnn_npeers); +} + +struct ksock_peer_ni * +ksocknal_find_peer_locked(struct lnet_ni *ni, struct lnet_processid *id) +{ + struct ksock_peer_ni *peer_ni; + unsigned long hash = nidhash(&id->nid); + + hash_for_each_possible(ksocknal_data.ksnd_peers, peer_ni, + ksnp_list, hash) { + LASSERT(!peer_ni->ksnp_closing); + + if (peer_ni->ksnp_ni != ni) + continue; + + if (!nid_same(&peer_ni->ksnp_id.nid, &id->nid) || + peer_ni->ksnp_id.pid != id->pid) + continue; + + CDEBUG(D_NET, "got peer_ni [%p] -> %s (%d)\n", + peer_ni, libcfs_idstr(id), + refcount_read(&peer_ni->ksnp_refcount)); + return peer_ni; + } + return NULL; +} + +struct ksock_peer_ni * +ksocknal_find_peer(struct lnet_ni *ni, struct lnet_processid *id) +{ + struct ksock_peer_ni *peer_ni; + + read_lock(&ksocknal_data.ksnd_global_lock); + peer_ni = ksocknal_find_peer_locked(ni, id); + if (peer_ni != NULL) /* +1 ref for caller? */ + ksocknal_peer_addref(peer_ni); + read_unlock(&ksocknal_data.ksnd_global_lock); + + return peer_ni; +} + +static void +ksocknal_unlink_peer_locked(struct ksock_peer_ni *peer_ni) +{ + int i; + struct ksock_interface *iface; + + for (i = 0; i < peer_ni->ksnp_n_passive_ips; i++) { + struct sockaddr_in sa = { .sin_family = AF_INET }; + LASSERT(i < LNET_INTERFACES_NUM); + sa.sin_addr.s_addr = htonl(peer_ni->ksnp_passive_ips[i]); + + iface = ksocknal_ip2iface(peer_ni->ksnp_ni, + (struct sockaddr *)&sa); + /* + * All IPs in peer_ni->ksnp_passive_ips[] come from the + * interface list, therefore the call must succeed. 
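+		 * A NULL return would mean the passive IP list and the
+		 * interface list had got out of step; the LASSERT below
+		 * catches that.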
+ */ + LASSERT(iface != NULL); + + CDEBUG(D_NET, "peer_ni=%p iface=%p ksni_nroutes=%d\n", + peer_ni, iface, iface->ksni_nroutes); + iface->ksni_npeers--; + } + + LASSERT(list_empty(&peer_ni->ksnp_conns)); + LASSERT(peer_ni->ksnp_conn_cb == NULL); + LASSERT(!peer_ni->ksnp_closing); + peer_ni->ksnp_closing = 1; + hlist_del(&peer_ni->ksnp_list); + /* lose peerlist's ref */ + ksocknal_peer_decref(peer_ni); +} + +static int +ksocknal_get_peer_info(struct lnet_ni *ni, int index, + struct lnet_process_id *id, __u32 *myip, __u32 *peer_ip, + int *port, int *conn_count, int *share_count) +{ + struct ksock_peer_ni *peer_ni; + struct ksock_conn_cb *conn_cb; + int i; + int j; + int rc = -ENOENT; + + read_lock(&ksocknal_data.ksnd_global_lock); + + hash_for_each(ksocknal_data.ksnd_peers, i, peer_ni, ksnp_list) { + + if (peer_ni->ksnp_ni != ni) + continue; + + if (peer_ni->ksnp_n_passive_ips == 0 && + peer_ni->ksnp_conn_cb == NULL) { + if (index-- > 0) + continue; + + id->pid = peer_ni->ksnp_id.pid; + id->nid = lnet_nid_to_nid4(&peer_ni->ksnp_id.nid); + *myip = 0; + *peer_ip = 0; + *port = 0; + *conn_count = 0; + *share_count = 0; + rc = 0; + goto out; + } + + for (j = 0; j < peer_ni->ksnp_n_passive_ips; j++) { + if (index-- > 0) + continue; + + id->pid = peer_ni->ksnp_id.pid; + id->nid = lnet_nid_to_nid4(&peer_ni->ksnp_id.nid); + *myip = peer_ni->ksnp_passive_ips[j]; + *peer_ip = 0; + *port = 0; + *conn_count = 0; + *share_count = 0; + rc = 0; + goto out; + } + + if (peer_ni->ksnp_conn_cb) { + if (index-- > 0) + continue; + + conn_cb = peer_ni->ksnp_conn_cb; + + id->pid = peer_ni->ksnp_id.pid; + id->nid = lnet_nid_to_nid4(&peer_ni->ksnp_id.nid); + if (conn_cb->ksnr_addr.ss_family == AF_INET) { + struct sockaddr_in *sa = + (void *)&conn_cb->ksnr_addr; + + rc = choose_ipv4_src(myip, + conn_cb->ksnr_myiface, + ntohl(sa->sin_addr.s_addr), + ni->ni_net_ns); + *peer_ip = ntohl(sa->sin_addr.s_addr); + *port = ntohs(sa->sin_port); + } else { + *myip = 0xFFFFFFFF; + *peer_ip = 0xFFFFFFFF; + *port = 0; + rc = -ENOTSUPP; + } + *conn_count = conn_cb->ksnr_conn_count; + *share_count = 1; + goto out; + } + } +out: + read_unlock(&ksocknal_data.ksnd_global_lock); + return rc; +} + +static unsigned int +ksocknal_get_conn_count_by_type(struct ksock_conn_cb *conn_cb, + int type) +{ + unsigned int count = 0; + + switch (type) { + case SOCKLND_CONN_CONTROL: + count = conn_cb->ksnr_ctrl_conn_count; + break; + case SOCKLND_CONN_BULK_IN: + count = conn_cb->ksnr_blki_conn_count; + break; + case SOCKLND_CONN_BULK_OUT: + count = conn_cb->ksnr_blko_conn_count; + break; + case SOCKLND_CONN_ANY: + count = conn_cb->ksnr_conn_count; + break; + default: + LBUG(); + break; + } + + return count; +} + +static unsigned int +ksocknal_get_conns_per_peer(struct ksock_peer_ni *peer_ni) +{ + struct lnet_ni *ni = peer_ni->ksnp_ni; + struct lnet_ioctl_config_socklnd_tunables *tunables; + + LASSERT(ni); + + tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_sock; + + return tunables->lnd_conns_per_peer; +} + +static void +ksocknal_incr_conn_count(struct ksock_conn_cb *conn_cb, + int type) +{ + conn_cb->ksnr_conn_count++; + + /* check if all connections of the given type got created */ + switch (type) { + case SOCKLND_CONN_CONTROL: + conn_cb->ksnr_ctrl_conn_count++; + /* there's a single control connection per peer, + * two in case of loopback + */ + conn_cb->ksnr_connected |= BIT(type); + break; + case SOCKLND_CONN_BULK_IN: + conn_cb->ksnr_blki_conn_count++; + if (conn_cb->ksnr_blki_conn_count >= conn_cb->ksnr_max_conns) + conn_cb->ksnr_connected |= 
BIT(type); + break; + case SOCKLND_CONN_BULK_OUT: + conn_cb->ksnr_blko_conn_count++; + if (conn_cb->ksnr_blko_conn_count >= conn_cb->ksnr_max_conns) + conn_cb->ksnr_connected |= BIT(type); + break; + case SOCKLND_CONN_ANY: + if (conn_cb->ksnr_conn_count >= conn_cb->ksnr_max_conns) + conn_cb->ksnr_connected |= BIT(type); + break; + default: + LBUG(); + break; + } + + CDEBUG(D_NET, "Add conn type %d, ksnr_connected %x ksnr_max_conns %d\n", + type, conn_cb->ksnr_connected, conn_cb->ksnr_max_conns); +} + + +static void +ksocknal_decr_conn_count(struct ksock_conn_cb *conn_cb, + int type) +{ + conn_cb->ksnr_conn_count--; + + /* check if all connections of the given type got created */ + switch (type) { + case SOCKLND_CONN_CONTROL: + conn_cb->ksnr_ctrl_conn_count--; + /* there's a single control connection per peer, + * two in case of loopback + */ + if (conn_cb->ksnr_ctrl_conn_count == 0) + conn_cb->ksnr_connected &= ~BIT(type); + break; + case SOCKLND_CONN_BULK_IN: + conn_cb->ksnr_blki_conn_count--; + if (conn_cb->ksnr_blki_conn_count < conn_cb->ksnr_max_conns) + conn_cb->ksnr_connected &= ~BIT(type); + break; + case SOCKLND_CONN_BULK_OUT: + conn_cb->ksnr_blko_conn_count--; + if (conn_cb->ksnr_blko_conn_count < conn_cb->ksnr_max_conns) + conn_cb->ksnr_connected &= ~BIT(type); + break; + case SOCKLND_CONN_ANY: + if (conn_cb->ksnr_conn_count < conn_cb->ksnr_max_conns) + conn_cb->ksnr_connected &= ~BIT(type); + break; + default: + LBUG(); + break; + } + + CDEBUG(D_NET, "Del conn type %d, ksnr_connected %x ksnr_max_conns %d\n", + type, conn_cb->ksnr_connected, conn_cb->ksnr_max_conns); +} + +static void +ksocknal_associate_cb_conn_locked(struct ksock_conn_cb *conn_cb, + struct ksock_conn *conn) +{ + struct ksock_peer_ni *peer_ni = conn_cb->ksnr_peer; + int type = conn->ksnc_type; + struct ksock_interface *iface; + int conn_iface; + + conn_iface = ksocknal_ip2index((struct sockaddr *)&conn->ksnc_myaddr, + peer_ni->ksnp_ni); + conn->ksnc_conn_cb = conn_cb; + ksocknal_conn_cb_addref(conn_cb); + + if (conn_cb->ksnr_myiface != conn_iface) { + if (conn_cb->ksnr_myiface < 0) { + /* route wasn't bound locally yet (the initial route) */ + CDEBUG(D_NET, "Binding %s %pIS to interface %d\n", + libcfs_idstr(&peer_ni->ksnp_id), + &conn_cb->ksnr_addr, + conn_iface); + } else { + CDEBUG(D_NET, + "Rebinding %s %pIS from interface %d to %d\n", + libcfs_idstr(&peer_ni->ksnp_id), + &conn_cb->ksnr_addr, + conn_cb->ksnr_myiface, + conn_iface); + + iface = ksocknal_index2iface(peer_ni->ksnp_ni, + conn_cb->ksnr_myiface); + if (iface) + iface->ksni_nroutes--; + } + conn_cb->ksnr_myiface = conn_iface; + iface = ksocknal_index2iface(peer_ni->ksnp_ni, + conn_cb->ksnr_myiface); + if (iface) + iface->ksni_nroutes++; + } + + ksocknal_incr_conn_count(conn_cb, type); + + /* Successful connection => further attempts can + * proceed immediately + */ + conn_cb->ksnr_retry_interval = 0; +} + +static void +ksocknal_add_conn_cb_locked(struct ksock_peer_ni *peer_ni, + struct ksock_conn_cb *conn_cb) +{ + struct ksock_conn *conn; + struct ksock_net *net = peer_ni->ksnp_ni->ni_data; + + LASSERT(!peer_ni->ksnp_closing); + LASSERT(!conn_cb->ksnr_peer); + LASSERT(!conn_cb->ksnr_scheduled); + LASSERT(!conn_cb->ksnr_connecting); + LASSERT(conn_cb->ksnr_connected == 0); + + conn_cb->ksnr_peer = peer_ni; + ksocknal_peer_addref(peer_ni); + + /* set the conn_cb's interface to the current net's interface */ + conn_cb->ksnr_myiface = net->ksnn_interface.ksni_index; + net->ksnn_interface.ksni_nroutes++; + + /* peer_ni's route list takes over my ref on 
'route' */ + peer_ni->ksnp_conn_cb = conn_cb; + + list_for_each_entry(conn, &peer_ni->ksnp_conns, ksnc_list) { + if (!rpc_cmp_addr((struct sockaddr *)&conn->ksnc_peeraddr, + (struct sockaddr *)&conn_cb->ksnr_addr)) + continue; + + ksocknal_associate_cb_conn_locked(conn_cb, conn); + /* keep going (typed conns) */ + } +} + +static void +ksocknal_del_conn_cb_locked(struct ksock_conn_cb *conn_cb) +{ + struct ksock_peer_ni *peer_ni = conn_cb->ksnr_peer; + struct ksock_interface *iface; + struct ksock_conn *conn; + struct ksock_conn *cnxt; + + LASSERT(!conn_cb->ksnr_deleted); + + /* Close associated conns */ + list_for_each_entry_safe(conn, cnxt, &peer_ni->ksnp_conns, ksnc_list) { + if (conn->ksnc_conn_cb != conn_cb) + continue; + + ksocknal_close_conn_locked(conn, 0); + } + + if (conn_cb->ksnr_myiface >= 0) { + iface = ksocknal_index2iface(peer_ni->ksnp_ni, + conn_cb->ksnr_myiface); + if (iface) + iface->ksni_nroutes--; + } + + conn_cb->ksnr_deleted = 1; + ksocknal_conn_cb_decref(conn_cb); /* drop peer_ni's ref */ + peer_ni->ksnp_conn_cb = NULL; + + if (list_empty(&peer_ni->ksnp_conns)) { + /* I've just removed the last route to a peer_ni with no active + * connections + */ + ksocknal_unlink_peer_locked(peer_ni); + } +} + +int +ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id4, + struct sockaddr *addr) +{ + struct ksock_peer_ni *peer_ni; + struct ksock_peer_ni *peer2; + struct ksock_conn_cb *conn_cb; + struct lnet_processid id; + + if (id4.nid == LNET_NID_ANY || + id4.pid == LNET_PID_ANY) + return (-EINVAL); + + id.pid = id4.pid; + lnet_nid4_to_nid(id4.nid, &id.nid); + + /* Have a brand new peer_ni ready... */ + peer_ni = ksocknal_create_peer(ni, &id); + if (IS_ERR(peer_ni)) + return PTR_ERR(peer_ni); + + conn_cb = ksocknal_create_conn_cb(addr); + if (!conn_cb) { + ksocknal_peer_decref(peer_ni); + return -ENOMEM; + } + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + /* always called with a ref on ni, so shutdown can't have started */ + LASSERT(atomic_read(&((struct ksock_net *)ni->ni_data)->ksnn_npeers) + >= 0); + + peer2 = ksocknal_find_peer_locked(ni, &id); + if (peer2 != NULL) { + ksocknal_peer_decref(peer_ni); + peer_ni = peer2; + } else { + /* peer_ni table takes my ref on peer_ni */ + hash_add(ksocknal_data.ksnd_peers, &peer_ni->ksnp_list, + nidhash(&id.nid)); + } + + if (peer_ni->ksnp_conn_cb) { + ksocknal_conn_cb_decref(conn_cb); + } else { + ksocknal_add_conn_cb_locked(peer_ni, conn_cb); + /* Remember conns_per_peer setting at the time + * of connection initiation. It will define the + * max number of conns per type for this conn_cb + * while it's in use. 
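+		 * A later change to the conns_per_peer tunable therefore
+		 * only affects conn_cbs created after the change.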
+ */ + conn_cb->ksnr_max_conns = ksocknal_get_conns_per_peer(peer_ni); + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + return 0; +} + +static void +ksocknal_del_peer_locked(struct ksock_peer_ni *peer_ni, __u32 ip) +{ + struct ksock_conn *conn; + struct ksock_conn *cnxt; + struct ksock_conn_cb *conn_cb; + + LASSERT(!peer_ni->ksnp_closing); + + /* Extra ref prevents peer_ni disappearing until I'm done with it */ + ksocknal_peer_addref(peer_ni); + conn_cb = peer_ni->ksnp_conn_cb; + if (conn_cb) + ksocknal_del_conn_cb_locked(conn_cb); + + list_for_each_entry_safe(conn, cnxt, &peer_ni->ksnp_conns, + ksnc_list) + ksocknal_close_conn_locked(conn, 0); + + ksocknal_peer_decref(peer_ni); + /* NB peer_ni unlinks itself when last conn/conn_cb is removed */ +} + +static int +ksocknal_del_peer(struct lnet_ni *ni, struct lnet_process_id id4, __u32 ip) +{ + LIST_HEAD(zombies); + struct hlist_node *pnxt; + struct ksock_peer_ni *peer_ni; + int lo; + int hi; + int i; + int rc = -ENOENT; + struct lnet_processid id; + + id.pid = id4.pid; + lnet_nid4_to_nid(id4.nid, &id.nid); + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + if (!LNET_NID_IS_ANY(&id.nid)) { + lo = hash_min(nidhash(&id.nid), + HASH_BITS(ksocknal_data.ksnd_peers)); + hi = lo; + } else { + lo = 0; + hi = HASH_SIZE(ksocknal_data.ksnd_peers) - 1; + } + + for (i = lo; i <= hi; i++) { + hlist_for_each_entry_safe(peer_ni, pnxt, + &ksocknal_data.ksnd_peers[i], + ksnp_list) { + if (peer_ni->ksnp_ni != ni) + continue; + + if (!((LNET_NID_IS_ANY(&id.nid) || + nid_same(&peer_ni->ksnp_id.nid, &id.nid)) && + (id.pid == LNET_PID_ANY || + peer_ni->ksnp_id.pid == id.pid))) + continue; + + ksocknal_peer_addref(peer_ni); /* a ref for me... */ + + ksocknal_del_peer_locked(peer_ni, ip); + + if (peer_ni->ksnp_closing && + !list_empty(&peer_ni->ksnp_tx_queue)) { + LASSERT(list_empty(&peer_ni->ksnp_conns)); + LASSERT(peer_ni->ksnp_conn_cb == NULL); + + list_splice_init(&peer_ni->ksnp_tx_queue, + &zombies); + } + + ksocknal_peer_decref(peer_ni); /* ...till here */ + + rc = 0; /* matched! */ + } + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + ksocknal_txlist_done(ni, &zombies, -ENETDOWN); + + return rc; +} + +static struct ksock_conn * +ksocknal_get_conn_by_idx(struct lnet_ni *ni, int index) +{ + struct ksock_peer_ni *peer_ni; + struct ksock_conn *conn; + int i; + + read_lock(&ksocknal_data.ksnd_global_lock); + + hash_for_each(ksocknal_data.ksnd_peers, i, peer_ni, ksnp_list) { + LASSERT(!peer_ni->ksnp_closing); + + if (peer_ni->ksnp_ni != ni) + continue; + + list_for_each_entry(conn, &peer_ni->ksnp_conns, + ksnc_list) { + if (index-- > 0) + continue; + + ksocknal_conn_addref(conn); + read_unlock(&ksocknal_data.ksnd_global_lock); + return conn; + } + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + return NULL; +} + +static struct ksock_sched * +ksocknal_choose_scheduler_locked(unsigned int cpt) +{ + struct ksock_sched *sched = ksocknal_data.ksnd_schedulers[cpt]; + int i; + + if (sched->kss_nthreads == 0) { + cfs_percpt_for_each(sched, i, ksocknal_data.ksnd_schedulers) { + if (sched->kss_nthreads > 0) { + CDEBUG(D_NET, "scheduler[%d] has no threads. 
selected scheduler[%d]\n", + cpt, sched->kss_cpt); + return sched; + } + } + return NULL; + } + + return sched; +} + +int +ksocknal_accept(struct lnet_ni *ni, struct socket *sock) +{ + struct ksock_connreq *cr; + int rc; + struct sockaddr_storage peer; + + rc = lnet_sock_getaddr(sock, true, &peer); + if (rc != 0) { + CERROR("Can't determine new connection's address\n"); + return rc; + } + + LIBCFS_ALLOC(cr, sizeof(*cr)); + if (cr == NULL) { + LCONSOLE_ERROR_MSG(0x12f, + "Dropping connection request from %pIS: memory exhausted\n", + &peer); + return -ENOMEM; + } + + lnet_ni_addref(ni); + cr->ksncr_ni = ni; + cr->ksncr_sock = sock; + + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + + list_add_tail(&cr->ksncr_list, &ksocknal_data.ksnd_connd_connreqs); + wake_up(&ksocknal_data.ksnd_connd_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); + return 0; +} + +static int +ksocknal_connecting(struct ksock_conn_cb *conn_cb, struct sockaddr *sa) +{ + if (conn_cb && + rpc_cmp_addr((struct sockaddr *)&conn_cb->ksnr_addr, sa)) + return conn_cb->ksnr_connecting; + return 0; +} + +int +ksocknal_create_conn(struct lnet_ni *ni, struct ksock_conn_cb *conn_cb, + struct socket *sock, int type) +{ + rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; + LIST_HEAD(zombies); + struct lnet_processid peerid; + u64 incarnation; + struct ksock_conn *conn; + struct ksock_conn *conn2; + struct ksock_peer_ni *peer_ni = NULL; + struct ksock_peer_ni *peer2; + struct ksock_sched *sched; + struct ksock_hello_msg *hello; + int cpt; + struct ksock_tx *tx; + struct ksock_tx *txtmp; + int rc; + int rc2; + int active; + int num_dup = 0; + char *warn = NULL; + + active = (conn_cb != NULL); + + LASSERT(active == (type != SOCKLND_CONN_NONE)); + + LIBCFS_ALLOC(conn, sizeof(*conn)); + if (conn == NULL) { + rc = -ENOMEM; + goto failed_0; + } + + conn->ksnc_peer = NULL; + conn->ksnc_conn_cb = NULL; + conn->ksnc_sock = sock; + /* 2 ref, 1 for conn, another extra ref prevents socket + * being closed before establishment of connection */ + refcount_set(&conn->ksnc_sock_refcount, 2); + conn->ksnc_type = type; + ksocknal_lib_save_callback(sock, conn); + refcount_set(&conn->ksnc_conn_refcount, 1); /* 1 ref for me */ + + conn->ksnc_rx_ready = 0; + conn->ksnc_rx_scheduled = 0; + + INIT_LIST_HEAD(&conn->ksnc_tx_queue); + conn->ksnc_tx_ready = 0; + conn->ksnc_tx_scheduled = 0; + conn->ksnc_tx_carrier = NULL; + atomic_set (&conn->ksnc_tx_nob, 0); + + LIBCFS_ALLOC(hello, offsetof(struct ksock_hello_msg, + kshm_ips[LNET_INTERFACES_NUM])); + if (hello == NULL) { + rc = -ENOMEM; + goto failed_1; + } + + /* stash conn's local and remote addrs */ + rc = ksocknal_lib_get_conn_addrs(conn); + if (rc != 0) + goto failed_1; + + /* Find out/confirm peer_ni's NID and connection type and get the + * vector of interfaces she's willing to let me connect to. 
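+	 * All of this is carried by the HELLO exchange below.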
+ * Passive connections use the listener timeout since the peer_ni sends + * eagerly + */ + + if (active) { + peer_ni = conn_cb->ksnr_peer; + LASSERT(ni == peer_ni->ksnp_ni); + + /* Active connection sends HELLO eagerly */ + hello->kshm_nips = 0; + peerid = peer_ni->ksnp_id; + + write_lock_bh(global_lock); + conn->ksnc_proto = peer_ni->ksnp_proto; + write_unlock_bh(global_lock); + + if (conn->ksnc_proto == NULL) { + conn->ksnc_proto = &ksocknal_protocol_v3x; +#if SOCKNAL_VERSION_DEBUG + if (*ksocknal_tunables.ksnd_protocol == 2) + conn->ksnc_proto = &ksocknal_protocol_v2x; + else if (*ksocknal_tunables.ksnd_protocol == 1) + conn->ksnc_proto = &ksocknal_protocol_v1x; +#endif + } + + rc = ksocknal_send_hello(ni, conn, &peerid.nid, hello); + if (rc != 0) + goto failed_1; + } else { + peerid.nid = LNET_ANY_NID; + peerid.pid = LNET_PID_ANY; + + /* Passive, get protocol from peer_ni */ + conn->ksnc_proto = NULL; + } + + rc = ksocknal_recv_hello(ni, conn, hello, &peerid, &incarnation); + if (rc < 0) + goto failed_1; + + LASSERT(rc == 0 || active); + LASSERT(conn->ksnc_proto != NULL); + LASSERT(!LNET_NID_IS_ANY(&peerid.nid)); + + cpt = lnet_nid2cpt(&peerid.nid, ni); + + if (active) { + ksocknal_peer_addref(peer_ni); + write_lock_bh(global_lock); + } else { + peer_ni = ksocknal_create_peer(ni, &peerid); + if (IS_ERR(peer_ni)) { + rc = PTR_ERR(peer_ni); + goto failed_1; + } + + write_lock_bh(global_lock); + + /* called with a ref on ni, so shutdown can't have started */ + LASSERT(atomic_read(&((struct ksock_net *)ni->ni_data)->ksnn_npeers) >= 0); + + peer2 = ksocknal_find_peer_locked(ni, &peerid); + if (peer2 == NULL) { + /* NB this puts an "empty" peer_ni in the peer_ni + * table (which takes my ref) */ + hash_add(ksocknal_data.ksnd_peers, + &peer_ni->ksnp_list, nidhash(&peerid.nid)); + } else { + ksocknal_peer_decref(peer_ni); + peer_ni = peer2; + } + + /* +1 ref for me */ + ksocknal_peer_addref(peer_ni); + peer_ni->ksnp_accepting++; + + /* Am I already connecting to this guy? Resolve in + * favour of higher NID... + */ + if (memcmp(&peerid.nid, &ni->ni_nid, sizeof(peerid.nid)) < 0 && + ksocknal_connecting(peer_ni->ksnp_conn_cb, + ((struct sockaddr *) &conn->ksnc_peeraddr))) { + rc = EALREADY; + warn = "connection race resolution"; + goto failed_2; + } + } + + if (peer_ni->ksnp_closing || + (active && conn_cb->ksnr_deleted)) { + /* peer_ni/conn_cb got closed under me */ + rc = -ESTALE; + warn = "peer_ni/conn_cb removed"; + goto failed_2; + } + + if (peer_ni->ksnp_proto == NULL) { + /* Never connected before. + * NB recv_hello may have returned EPROTO to signal my peer_ni + * wants a different protocol than the one I asked for. + */ + LASSERT(list_empty(&peer_ni->ksnp_conns)); + + peer_ni->ksnp_proto = conn->ksnc_proto; + peer_ni->ksnp_incarnation = incarnation; + } + + if (peer_ni->ksnp_proto != conn->ksnc_proto || + peer_ni->ksnp_incarnation != incarnation) { + /* peer_ni rebooted or I've got the wrong protocol version */ + ksocknal_close_peer_conns_locked(peer_ni, NULL, 0); + + peer_ni->ksnp_proto = NULL; + rc = ESTALE; + warn = peer_ni->ksnp_incarnation != incarnation ? 
+ "peer_ni rebooted" : + "wrong proto version"; + goto failed_2; + } + + switch (rc) { + default: + LBUG(); + case 0: + break; + case EALREADY: + warn = "lost conn race"; + goto failed_2; + case EPROTO: + warn = "retry with different protocol version"; + goto failed_2; + } + + /* Refuse to duplicate an existing connection, unless this is a + * loopback connection */ + if (!rpc_cmp_addr((struct sockaddr *)&conn->ksnc_peeraddr, + (struct sockaddr *)&conn->ksnc_myaddr)) { + list_for_each_entry(conn2, &peer_ni->ksnp_conns, ksnc_list) { + if (!rpc_cmp_addr( + (struct sockaddr *)&conn2->ksnc_peeraddr, + (struct sockaddr *)&conn->ksnc_peeraddr) || + !rpc_cmp_addr( + (struct sockaddr *)&conn2->ksnc_myaddr, + (struct sockaddr *)&conn->ksnc_myaddr) || + conn2->ksnc_type != conn->ksnc_type) + continue; + + num_dup++; + /* If max conns per type is not registered in conn_cb + * as ksnr_max_conns, use ni's conns_per_peer + */ + if ((peer_ni->ksnp_conn_cb && + num_dup < peer_ni->ksnp_conn_cb->ksnr_max_conns) || + (!peer_ni->ksnp_conn_cb && + num_dup < ksocknal_get_conns_per_peer(peer_ni))) + continue; + + /* Reply on a passive connection attempt so the peer_ni + * realises we're connected. + */ + LASSERT(rc == 0); + if (!active) + rc = EALREADY; + + warn = "duplicate"; + goto failed_2; + } + } + /* If the connection created by this route didn't bind to the IP + * address the route connected to, the connection/route matching + * code below probably isn't going to work. + */ + if (active && + !rpc_cmp_addr((struct sockaddr *)&conn_cb->ksnr_addr, + (struct sockaddr *)&conn->ksnc_peeraddr)) { + CERROR("Route %s %pIS connected to %pIS\n", + libcfs_idstr(&peer_ni->ksnp_id), + &conn_cb->ksnr_addr, + &conn->ksnc_peeraddr); + } + + /* Search for a conn_cb corresponding to the new connection and + * create an association. This allows incoming connections created + * by conn_cbs in my peer_ni to match my own conn_cb entries so I don't + * continually create duplicate conn_cbs. + */ + conn_cb = peer_ni->ksnp_conn_cb; + + if (conn_cb && rpc_cmp_addr((struct sockaddr *)&conn->ksnc_peeraddr, + (struct sockaddr *)&conn_cb->ksnr_addr)) + ksocknal_associate_cb_conn_locked(conn_cb, conn); + + conn->ksnc_peer = peer_ni; /* conn takes my ref on peer_ni */ + peer_ni->ksnp_last_alive = ktime_get_seconds(); + peer_ni->ksnp_send_keepalive = 0; + peer_ni->ksnp_error = 0; + + sched = ksocknal_choose_scheduler_locked(cpt); + if (!sched) { + CERROR("no schedulers available. node is unhealthy\n"); + goto failed_2; + } + /* + * The cpt might have changed if we ended up selecting a non cpt + * native scheduler. So use the scheduler's cpt instead. + */ + cpt = sched->kss_cpt; + sched->kss_nconns++; + conn->ksnc_scheduler = sched; + + conn->ksnc_tx_last_post = ktime_get_seconds(); + /* Set the deadline for the outgoing HELLO to drain */ + conn->ksnc_tx_bufnob = sock->sk->sk_wmem_queued; + conn->ksnc_tx_deadline = ktime_get_seconds() + + ksocknal_timeout(); + smp_mb(); /* order with adding to peer_ni's conn list */ + + list_add(&conn->ksnc_list, &peer_ni->ksnp_conns); + ksocknal_conn_addref(conn); + + ksocknal_new_packet(conn, 0); + + conn->ksnc_zc_capable = ksocknal_lib_zc_capable(conn); + + /* Take packets blocking for this connection. 
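+	 * Only txs that this connection's protocol agrees to carry
+	 * are moved off the peer_ni's queue.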
*/ + list_for_each_entry_safe(tx, txtmp, &peer_ni->ksnp_tx_queue, tx_list) { + if (conn->ksnc_proto->pro_match_tx(conn, tx, tx->tx_nonblk) == + SOCKNAL_MATCH_NO) + continue; + + list_del(&tx->tx_list); + ksocknal_queue_tx_locked(tx, conn); + } + + write_unlock_bh(global_lock); + /* We've now got a new connection. Any errors from here on are just + * like "normal" comms errors and we close the connection normally. + * NB (a) we still have to send the reply HELLO for passive + * connections, + * (b) normal I/O on the conn is blocked until I setup and call the + * socket callbacks. + */ + + CDEBUG(D_NET, "New conn %s p %d.x %pIS -> %pISp" + " incarnation:%lld sched[%d]\n", + libcfs_idstr(&peerid), conn->ksnc_proto->pro_version, + &conn->ksnc_myaddr, &conn->ksnc_peeraddr, + incarnation, cpt); + + if (!active) { + hello->kshm_nips = 0; + rc = ksocknal_send_hello(ni, conn, &peerid.nid, hello); + } + + LIBCFS_FREE(hello, offsetof(struct ksock_hello_msg, + kshm_ips[LNET_INTERFACES_NUM])); + + /* setup the socket AFTER I've received hello (it disables + * SO_LINGER). I might call back to the acceptor who may want + * to send a protocol version response and then close the + * socket; this ensures the socket only tears down after the + * response has been sent. + */ + if (rc == 0) + rc = ksocknal_lib_setup_sock(sock); + + write_lock_bh(global_lock); + + /* NB my callbacks block while I hold ksnd_global_lock */ + ksocknal_lib_set_callback(sock, conn); + + if (!active) + peer_ni->ksnp_accepting--; + + write_unlock_bh(global_lock); + + if (rc != 0) { + write_lock_bh(global_lock); + if (!conn->ksnc_closing) { + /* could be closed by another thread */ + ksocknal_close_conn_locked(conn, rc); + } + write_unlock_bh(global_lock); + } else if (ksocknal_connsock_addref(conn) == 0) { + /* Allow I/O to proceed. */ + ksocknal_read_callback(conn); + ksocknal_write_callback(conn); + ksocknal_connsock_decref(conn); + } + + ksocknal_connsock_decref(conn); + ksocknal_conn_decref(conn); + return rc; + +failed_2: + + if (!peer_ni->ksnp_closing && + list_empty(&peer_ni->ksnp_conns) && + peer_ni->ksnp_conn_cb == NULL) { + list_splice_init(&peer_ni->ksnp_tx_queue, &zombies); + ksocknal_unlink_peer_locked(peer_ni); + } + + write_unlock_bh(global_lock); + + if (warn != NULL) { + if (rc < 0) + CERROR("Not creating conn %s type %d: %s\n", + libcfs_idstr(&peerid), conn->ksnc_type, warn); + else + CDEBUG(D_NET, "Not creating conn %s type %d: %s\n", + libcfs_idstr(&peerid), conn->ksnc_type, warn); + } + + if (!active) { + if (rc > 0) { + /* Request retry by replying with CONN_NONE + * ksnc_proto has been set already + */ + conn->ksnc_type = SOCKLND_CONN_NONE; + hello->kshm_nips = 0; + ksocknal_send_hello(ni, conn, &peerid.nid, hello); + } + + write_lock_bh(global_lock); + peer_ni->ksnp_accepting--; + write_unlock_bh(global_lock); + } + + /* + * If we get here without an error code, just use -EALREADY. + * Depending on how we got here, the error may be positive + * or negative. Normalize the value for ksocknal_txlist_done(). + */ + rc2 = (rc == 0 ? -EALREADY : (rc > 0 ? 
-rc : rc));
+	ksocknal_txlist_done(ni, &zombies, rc2);
+	ksocknal_peer_decref(peer_ni);
+
+failed_1:
+	if (hello != NULL)
+		LIBCFS_FREE(hello, offsetof(struct ksock_hello_msg,
+					    kshm_ips[LNET_INTERFACES_NUM]));
+
+	LIBCFS_FREE(conn, sizeof(*conn));
+
+failed_0:
+	sock_release(sock);
+
+	return rc;
+}
+
+void
+ksocknal_close_conn_locked(struct ksock_conn *conn, int error)
+{
+	/* This just does the immediate housekeeping, and queues the
+	 * connection for the reaper to terminate.
+	 * Caller holds ksnd_global_lock exclusively in irq context */
+	struct ksock_peer_ni *peer_ni = conn->ksnc_peer;
+	struct ksock_conn_cb *conn_cb;
+	struct ksock_conn *conn2;
+	int conn_count;
+	int duplicate_count = 0;
+
+	LASSERT(peer_ni->ksnp_error == 0);
+	LASSERT(!conn->ksnc_closing);
+	conn->ksnc_closing = 1;
+
+	/* ksnd_deathrow_conns takes over peer_ni's ref */
+	list_del(&conn->ksnc_list);
+
+	conn_cb = conn->ksnc_conn_cb;
+	if (conn_cb != NULL) {
+		/* dissociate conn from cb... */
+		LASSERT(!conn_cb->ksnr_deleted);
+
+		conn_count = ksocknal_get_conn_count_by_type(conn_cb,
+							     conn->ksnc_type);
+		/* connected bit is set only if all connections
+		 * of the given type got created
+		 */
+		if (conn_count == conn_cb->ksnr_max_conns)
+			LASSERT((conn_cb->ksnr_connected &
+				BIT(conn->ksnc_type)) != 0);
+
+		if (conn_count == 1) {
+			list_for_each_entry(conn2, &peer_ni->ksnp_conns,
+					    ksnc_list) {
+				if (conn2->ksnc_conn_cb == conn_cb &&
+				    conn2->ksnc_type == conn->ksnc_type)
+					duplicate_count += 1;
+			}
+			if (duplicate_count > 0)
+				CERROR("Found %d duplicate conns type %d\n",
+				       duplicate_count,
+				       conn->ksnc_type);
+		}
+		ksocknal_decr_conn_count(conn_cb, conn->ksnc_type);
+
+		conn->ksnc_conn_cb = NULL;
+
+		/* drop conn's ref on conn_cb */
+		ksocknal_conn_cb_decref(conn_cb);
+	}
+
+	if (list_empty(&peer_ni->ksnp_conns)) {
+		/* No more connections to this peer_ni */
+
+		if (!list_empty(&peer_ni->ksnp_tx_queue)) {
+			struct ksock_tx *tx;
+
+			LASSERT(conn->ksnc_proto == &ksocknal_protocol_v3x);
+
+			/* throw them to the last connection...,
+			 * these TXs will be sent to /dev/null by scheduler */
+			list_for_each_entry(tx, &peer_ni->ksnp_tx_queue,
+					    tx_list)
+				ksocknal_tx_prep(conn, tx);
+
+			spin_lock_bh(&conn->ksnc_scheduler->kss_lock);
+			list_splice_init(&peer_ni->ksnp_tx_queue,
+					 &conn->ksnc_tx_queue);
+			spin_unlock_bh(&conn->ksnc_scheduler->kss_lock);
+		}
+
+		/* renegotiate protocol version */
+		peer_ni->ksnp_proto = NULL;
+		/* stash last conn close reason */
+		peer_ni->ksnp_error = error;
+
+		if (peer_ni->ksnp_conn_cb == NULL) {
+			/* I've just closed last conn belonging to a
+			 * peer_ni with no connections to it
+			 */
+			ksocknal_unlink_peer_locked(peer_ni);
+		}
+	}
+
+	spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+	list_add_tail(&conn->ksnc_list, &ksocknal_data.ksnd_deathrow_conns);
+	wake_up(&ksocknal_data.ksnd_reaper_waitq);
+
+	spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+}
+
+void
+ksocknal_peer_failed(struct ksock_peer_ni *peer_ni)
+{
+	bool notify = false;
+	time64_t last_alive = 0;
+
+	/* There has been a connection failure or comms error; but I'll only
+	 * tell LNET I think the peer_ni is dead if it's to another kernel and
+	 * there are no connections or connection attempts in existence.
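+	 * Userspace peers (LNET_PID_USERFLAG set) are never reported.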
*/ + + read_lock(&ksocknal_data.ksnd_global_lock); + + if ((peer_ni->ksnp_id.pid & LNET_PID_USERFLAG) == 0 && + list_empty(&peer_ni->ksnp_conns) && + peer_ni->ksnp_accepting == 0 && + !ksocknal_find_connecting_conn_cb_locked(peer_ni)) { + notify = true; + last_alive = peer_ni->ksnp_last_alive; + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + + if (notify) + lnet_notify(peer_ni->ksnp_ni, + lnet_nid_to_nid4(&peer_ni->ksnp_id.nid), + false, false, last_alive); +} + +void +ksocknal_finalize_zcreq(struct ksock_conn *conn) +{ + struct ksock_peer_ni *peer_ni = conn->ksnc_peer; + struct ksock_tx *tx; + struct ksock_tx *tmp; + LIST_HEAD(zlist); + + /* NB safe to finalize TXs because closing of socket will + * abort all buffered data */ + LASSERT(conn->ksnc_sock == NULL); + + spin_lock(&peer_ni->ksnp_lock); + + list_for_each_entry_safe(tx, tmp, &peer_ni->ksnp_zc_req_list, + tx_zc_list) { + if (tx->tx_conn != conn) + continue; + + LASSERT(tx->tx_msg.ksm_zc_cookies[0] != 0); + + tx->tx_msg.ksm_zc_cookies[0] = 0; + tx->tx_zc_aborted = 1; /* mark it as not-acked */ + list_move(&tx->tx_zc_list, &zlist); + } + + spin_unlock(&peer_ni->ksnp_lock); + + while ((tx = list_first_entry_or_null(&zlist, struct ksock_tx, + tx_zc_list)) != NULL) { + list_del(&tx->tx_zc_list); + ksocknal_tx_decref(tx); + } +} + +void +ksocknal_terminate_conn(struct ksock_conn *conn) +{ + /* This gets called by the reaper (guaranteed thread context) to + * disengage the socket from its callbacks and close it. + * ksnc_refcount will eventually hit zero, and then the reaper will + * destroy it. + */ + struct ksock_peer_ni *peer_ni = conn->ksnc_peer; + struct ksock_sched *sched = conn->ksnc_scheduler; + bool failed = false; + + LASSERT(conn->ksnc_closing); + + /* wake up the scheduler to "send" all remaining packets to /dev/null */ + spin_lock_bh(&sched->kss_lock); + + /* a closing conn is always ready to tx */ + conn->ksnc_tx_ready = 1; + + if (!conn->ksnc_tx_scheduled && + !list_empty(&conn->ksnc_tx_queue)) { + list_add_tail(&conn->ksnc_tx_list, + &sched->kss_tx_conns); + conn->ksnc_tx_scheduled = 1; + /* extra ref for scheduler */ + ksocknal_conn_addref(conn); + + wake_up(&sched->kss_waitq); + } + + spin_unlock_bh(&sched->kss_lock); + + /* serialise with callbacks */ + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + ksocknal_lib_reset_callback(conn->ksnc_sock, conn); + + /* OK, so this conn may not be completely disengaged from its + * scheduler yet, but it _has_ committed to terminate... + */ + conn->ksnc_scheduler->kss_nconns--; + + if (peer_ni->ksnp_error != 0) { + /* peer_ni's last conn closed in error */ + LASSERT(list_empty(&peer_ni->ksnp_conns)); + failed = true; + peer_ni->ksnp_error = 0; /* avoid multiple notifications */ + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + if (failed) + ksocknal_peer_failed(peer_ni); + + /* The socket is closed on the final put; either here, or in + * ksocknal_{send,recv}msg(). Since we set up the linger2 option + * when the connection was established, this will close the socket + * immediately, aborting anything buffered in it. Any hung + * zero-copy transmits will therefore complete in finite time. 
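+	 * (The linger2 option is set up in ksocknal_lib_setup_sock().)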
+ */ + ksocknal_connsock_decref(conn); +} + +void +ksocknal_queue_zombie_conn(struct ksock_conn *conn) +{ + /* Queue the conn for the reaper to destroy */ + LASSERT(refcount_read(&conn->ksnc_conn_refcount) == 0); + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + + list_add_tail(&conn->ksnc_list, &ksocknal_data.ksnd_zombie_conns); + wake_up(&ksocknal_data.ksnd_reaper_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); +} + +void +ksocknal_destroy_conn(struct ksock_conn *conn) +{ + time64_t last_rcv; + + /* Final coup-de-grace of the reaper */ + CDEBUG(D_NET, "connection %p\n", conn); + + LASSERT(refcount_read(&conn->ksnc_conn_refcount) == 0); + LASSERT(refcount_read(&conn->ksnc_sock_refcount) == 0); + LASSERT(conn->ksnc_sock == NULL); + LASSERT(conn->ksnc_conn_cb == NULL); + LASSERT(!conn->ksnc_tx_scheduled); + LASSERT(!conn->ksnc_rx_scheduled); + LASSERT(list_empty(&conn->ksnc_tx_queue)); + + /* complete current receive if any */ + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_LNET_PAYLOAD: + last_rcv = conn->ksnc_rx_deadline - + ksocknal_timeout(); + CERROR("Completing partial receive from %s[%d], ip %pISp, with error, wanted: %d, left: %d, last alive is %lld secs ago\n", + libcfs_idstr(&conn->ksnc_peer->ksnp_id), + conn->ksnc_type, + &conn->ksnc_peeraddr, + conn->ksnc_rx_nob_wanted, conn->ksnc_rx_nob_left, + ktime_get_seconds() - last_rcv); + if (conn->ksnc_lnet_msg) + conn->ksnc_lnet_msg->msg_health_status = + LNET_MSG_STATUS_REMOTE_ERROR; + lnet_finalize(conn->ksnc_lnet_msg, -EIO); + break; + case SOCKNAL_RX_LNET_HEADER: + if (conn->ksnc_rx_started) + CERROR("Incomplete receive of lnet header from %s, ip %pISp, with error, protocol: %d.x.\n", + libcfs_idstr(&conn->ksnc_peer->ksnp_id), + &conn->ksnc_peeraddr, + conn->ksnc_proto->pro_version); + break; + case SOCKNAL_RX_KSM_HEADER: + if (conn->ksnc_rx_started) + CERROR("Incomplete receive of ksock message from %s, ip %pISp, with error, protocol: %d.x.\n", + libcfs_idstr(&conn->ksnc_peer->ksnp_id), + &conn->ksnc_peeraddr, + conn->ksnc_proto->pro_version); + break; + case SOCKNAL_RX_SLOP: + if (conn->ksnc_rx_started) + CERROR("Incomplete receive of slops from %s, ip %pISp, with error\n", + libcfs_idstr(&conn->ksnc_peer->ksnp_id), + &conn->ksnc_peeraddr); + break; + default: + LBUG(); + break; + } + + ksocknal_peer_decref(conn->ksnc_peer); + + LIBCFS_FREE(conn, sizeof(*conn)); +} + +int +ksocknal_close_peer_conns_locked(struct ksock_peer_ni *peer_ni, + struct sockaddr *addr, int why) +{ + struct ksock_conn *conn; + struct ksock_conn *cnxt; + int count = 0; + + list_for_each_entry_safe(conn, cnxt, &peer_ni->ksnp_conns, ksnc_list) { + if (!addr || + rpc_cmp_addr(addr, + (struct sockaddr *)&conn->ksnc_peeraddr)) { + count++; + ksocknal_close_conn_locked(conn, why); + } + } + + return count; +} + +int +ksocknal_close_conn_and_siblings(struct ksock_conn *conn, int why) +{ + struct ksock_peer_ni *peer_ni = conn->ksnc_peer; + int count; + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + count = ksocknal_close_peer_conns_locked( + peer_ni, (struct sockaddr *)&conn->ksnc_peeraddr, why); + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + return count; +} + +int +ksocknal_close_matching_conns(struct lnet_processid *id, __u32 ipaddr) +{ + struct ksock_peer_ni *peer_ni; + struct hlist_node *pnxt; + int lo; + int hi; + int i; + int count = 0; + struct sockaddr_in sa = {.sin_family = AF_INET}; + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + if (!LNET_NID_IS_ANY(&id->nid)) { + lo = hash_min(nidhash(&id->nid), + 
HASH_BITS(ksocknal_data.ksnd_peers));
+		hi = lo;
+	} else {
+		lo = 0;
+		hi = HASH_SIZE(ksocknal_data.ksnd_peers) - 1;
+	}
+
+	sa.sin_addr.s_addr = htonl(ipaddr);
+	for (i = lo; i <= hi; i++) {
+		hlist_for_each_entry_safe(peer_ni, pnxt,
+					  &ksocknal_data.ksnd_peers[i],
+					  ksnp_list) {
+
+			if (!((LNET_NID_IS_ANY(&id->nid) ||
+			       nid_same(&id->nid, &peer_ni->ksnp_id.nid)) &&
+			      (id->pid == LNET_PID_ANY ||
+			       id->pid == peer_ni->ksnp_id.pid)))
+				continue;
+
+			count += ksocknal_close_peer_conns_locked(
+				peer_ni,
+				ipaddr ? (struct sockaddr *)&sa : NULL, 0);
+		}
+	}
+
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+	/* wildcards always succeed */
+	if (LNET_NID_IS_ANY(&id->nid) || id->pid == LNET_PID_ANY ||
+	    ipaddr == 0)
+		return 0;
+
+	return (count == 0 ? -ENOENT : 0);
+}
+
+void
+ksocknal_notify_gw_down(struct lnet_nid *gw_nid)
+{
+	/* The router is telling me she's been notified of a change in
+	 * gateway state....
+	 */
+	struct lnet_processid id = {
+		.pid = LNET_PID_ANY,
+		.nid = *gw_nid,
+	};
+
+	CDEBUG(D_NET, "gw %s down\n", libcfs_nidstr(gw_nid));
+
+	/* If the gateway crashed, close all open connections... */
+	ksocknal_close_matching_conns(&id, 0);
+	return;
+
+	/* We can only establish new connections
+	 * if we have autoroutes, and these connect on demand.
+	 */
+}
+
+static void
+ksocknal_push_peer(struct ksock_peer_ni *peer_ni)
+{
+	int index;
+	int i;
+	struct ksock_conn *conn;
+
+	for (index = 0; ; index++) {
+		read_lock(&ksocknal_data.ksnd_global_lock);
+
+		i = 0;
+		conn = NULL;
+
+		list_for_each_entry(conn, &peer_ni->ksnp_conns, ksnc_list) {
+			if (i++ == index) {
+				ksocknal_conn_addref(conn);
+				break;
+			}
+		}
+
+		read_unlock(&ksocknal_data.ksnd_global_lock);
+
+		if (i <= index)
+			break;
+
+		ksocknal_lib_push_conn(conn);
+		ksocknal_conn_decref(conn);
+	}
+}
+
+static int
+ksocknal_push(struct lnet_ni *ni, struct lnet_processid *id)
+{
+	int lo;
+	int hi;
+	int bkt;
+	int rc = -ENOENT;
+
+	if (!LNET_NID_IS_ANY(&id->nid)) {
+		lo = hash_min(nidhash(&id->nid),
+			      HASH_BITS(ksocknal_data.ksnd_peers));
+		hi = lo;
+	} else {
+		lo = 0;
+		hi = HASH_SIZE(ksocknal_data.ksnd_peers) - 1;
+	}
+
+	for (bkt = lo; bkt <= hi; bkt++) {
+		int peer_off; /* searching offset in peer_ni hash table */
+
+		for (peer_off = 0; ; peer_off++) {
+			struct ksock_peer_ni *peer_ni;
+			int i = 0;
+
+			read_lock(&ksocknal_data.ksnd_global_lock);
+			hlist_for_each_entry(peer_ni,
+					     &ksocknal_data.ksnd_peers[bkt],
+					     ksnp_list) {
+				if (!((LNET_NID_IS_ANY(&id->nid) ||
+				       nid_same(&id->nid,
+						&peer_ni->ksnp_id.nid)) &&
+				      (id->pid == LNET_PID_ANY ||
+				       id->pid == peer_ni->ksnp_id.pid)))
+					continue;
+
+				if (i++ == peer_off) {
+					ksocknal_peer_addref(peer_ni);
+					break;
+				}
+			}
+			read_unlock(&ksocknal_data.ksnd_global_lock);
+
+			if (i <= peer_off) /* no match */
+				break;
+
+			rc = 0;
+			ksocknal_push_peer(peer_ni);
+			ksocknal_peer_decref(peer_ni);
+		}
+	}
+	return rc;
+}
+
+int
+ksocknal_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg)
+{
+	struct lnet_process_id id4 = {};
+	struct lnet_processid id = {};
+	struct libcfs_ioctl_data *data = arg;
+	int rc;
+
+	switch (cmd) {
+	case IOC_LIBCFS_GET_INTERFACE: {
+		struct ksock_net *net = ni->ni_data;
+		struct ksock_interface *iface;
+		struct sockaddr_in *sa;
+
+		read_lock(&ksocknal_data.ksnd_global_lock);
+
+		if (data->ioc_count >= 1) {
+			rc = -ENOENT;
+		} else {
+			rc = 0;
+			iface = &net->ksnn_interface;
+
+			sa = (void *)&iface->ksni_addr;
+			if (sa->sin_family == AF_INET)
+				data->ioc_u32[0] = ntohl(sa->sin_addr.s_addr);
+			else
+				data->ioc_u32[0] = 0xFFFFFFFF;
+			data->ioc_u32[1] =
iface->ksni_netmask;
+			data->ioc_u32[2] = iface->ksni_npeers;
+			data->ioc_u32[3] = iface->ksni_nroutes;
+		}
+
+		read_unlock(&ksocknal_data.ksnd_global_lock);
+		return rc;
+	}
+
+	case IOC_LIBCFS_GET_PEER: {
+		__u32 myip = 0;
+		__u32 ip = 0;
+		int port = 0;
+		int conn_count = 0;
+		int share_count = 0;
+
+		rc = ksocknal_get_peer_info(ni, data->ioc_count,
+					    &id4, &myip, &ip, &port,
+					    &conn_count, &share_count);
+		if (rc != 0)
+			return rc;
+
+		data->ioc_nid = id4.nid;
+		data->ioc_count = share_count;
+		data->ioc_u32[0] = ip;
+		data->ioc_u32[1] = port;
+		data->ioc_u32[2] = myip;
+		data->ioc_u32[3] = conn_count;
+		data->ioc_u32[4] = id4.pid;
+		return 0;
+	}
+
+	case IOC_LIBCFS_ADD_PEER: {
+		struct sockaddr_in sa = {.sin_family = AF_INET};
+
+		id4.nid = data->ioc_nid;
+		id4.pid = LNET_PID_LUSTRE;
+		sa.sin_addr.s_addr = htonl(data->ioc_u32[0]);
+		sa.sin_port = htons(data->ioc_u32[1]);
+		return ksocknal_add_peer(ni, id4, (struct sockaddr *)&sa);
+	}
+	case IOC_LIBCFS_DEL_PEER:
+		id4.nid = data->ioc_nid;
+		id4.pid = LNET_PID_ANY;
+		return ksocknal_del_peer(ni, id4,
+					 data->ioc_u32[0]); /* IP */
+
+	case IOC_LIBCFS_GET_CONN: {
+		int txmem;
+		int rxmem;
+		int nagle;
+		struct ksock_conn *conn = ksocknal_get_conn_by_idx(ni, data->ioc_count);
+		struct sockaddr_in *psa;
+		struct sockaddr_in *mysa;
+
+		if (conn == NULL)
+			return -ENOENT;
+
+		/* only take conn's addresses once it is known to be valid */
+		psa = (void *)&conn->ksnc_peeraddr;
+		mysa = (void *)&conn->ksnc_myaddr;
+
+		ksocknal_lib_get_conn_tunables(conn, &txmem, &rxmem, &nagle);
+
+		data->ioc_count = txmem;
+		data->ioc_nid = lnet_nid_to_nid4(&conn->ksnc_peer->ksnp_id.nid);
+		data->ioc_flags = nagle;
+		if (psa->sin_family == AF_INET)
+			data->ioc_u32[0] = ntohl(psa->sin_addr.s_addr);
+		else
+			data->ioc_u32[0] = 0xFFFFFFFF;
+		data->ioc_u32[1] = rpc_get_port((struct sockaddr *)
+						&conn->ksnc_peeraddr);
+		if (mysa->sin_family == AF_INET)
+			data->ioc_u32[2] = ntohl(mysa->sin_addr.s_addr);
+		else
+			data->ioc_u32[2] = 0xFFFFFFFF;
+		data->ioc_u32[3] = conn->ksnc_type;
+		data->ioc_u32[4] = conn->ksnc_scheduler->kss_cpt;
+		data->ioc_u32[5] = rxmem;
+		data->ioc_u32[6] = conn->ksnc_peer->ksnp_id.pid;
+		ksocknal_conn_decref(conn);
+		return 0;
+	}
+
+	case IOC_LIBCFS_CLOSE_CONNECTION:
+		lnet_nid4_to_nid(data->ioc_nid, &id.nid);
+		id.pid = LNET_PID_ANY;
+		return ksocknal_close_matching_conns(&id,
+						     data->ioc_u32[0]);
+
+	case IOC_LIBCFS_REGISTER_MYNID:
+		/* Ignore if this is a noop */
+		if (nid_is_nid4(&ni->ni_nid) &&
+		    data->ioc_nid == lnet_nid_to_nid4(&ni->ni_nid))
+			return 0;
+
+		CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n",
+		       libcfs_nid2str(data->ioc_nid),
+		       libcfs_nidstr(&ni->ni_nid));
+		return -EINVAL;
+
+	case IOC_LIBCFS_PUSH_CONNECTION:
+		lnet_nid4_to_nid(data->ioc_nid, &id.nid);
+		id.pid = LNET_PID_ANY;
+		return ksocknal_push(ni, &id);
+
+	default:
+		return -EINVAL;
+	}
+	/* not reached */
+}
+
+static void
+ksocknal_free_buffers(void)
+{
+	LASSERT(atomic_read(&ksocknal_data.ksnd_nactive_txs) == 0);
+
+	if (ksocknal_data.ksnd_schedulers != NULL)
+		cfs_percpt_free(ksocknal_data.ksnd_schedulers);
+
+	spin_lock(&ksocknal_data.ksnd_tx_lock);
+
+	if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) {
+		LIST_HEAD(zlist);
+		struct ksock_tx *tx;
+
+		list_splice_init(&ksocknal_data.ksnd_idle_noop_txs, &zlist);
+		spin_unlock(&ksocknal_data.ksnd_tx_lock);
+
+		while ((tx = list_first_entry_or_null(&zlist, struct ksock_tx,
+						      tx_list)) != NULL) {
+			list_del(&tx->tx_list);
+			LIBCFS_FREE(tx, tx->tx_desc_size);
+		}
+	} else {
+		spin_unlock(&ksocknal_data.ksnd_tx_lock);
+	}
+}
+
+static int ksocknal_get_link_status(struct net_device *dev)
+{ + int ret = -1; + + LASSERT(dev); + + if (!netif_running(dev)) { + ret = 0; + CDEBUG(D_NET, "device not running\n"); + } + /* Some devices may not be providing link settings */ + else if (dev->ethtool_ops->get_link) { + ret = dev->ethtool_ops->get_link(dev); + CDEBUG(D_NET, "get_link returns %u\n", ret); + } + + return ret; +} + +static int +ksocknal_handle_link_state_change(struct net_device *dev, + unsigned char operstate) +{ + struct lnet_ni *ni = NULL; + struct ksock_net *net; + struct ksock_net *cnxt; + int ifindex; + unsigned char link_down = !(operstate == IF_OPER_UP); + struct in_device *in_dev; + bool found_ip = false; + struct ksock_interface *ksi = NULL; + struct sockaddr_in *sa; + DECLARE_CONST_IN_IFADDR(ifa); + + ifindex = dev->ifindex; + + if (!ksocknal_data.ksnd_nnets) + goto out; + + list_for_each_entry_safe(net, cnxt, &ksocknal_data.ksnd_nets, + ksnn_list) { + + ksi = &net->ksnn_interface; + sa = (void *)&ksi->ksni_addr; + found_ip = false; + + if (ksi->ksni_index != ifindex || + strcmp(ksi->ksni_name, dev->name)) + continue; + + ni = net->ksnn_ni; + + in_dev = __in_dev_get_rtnl(dev); + if (!in_dev) { + CDEBUG(D_NET, "Interface %s has no IPv4 status.\n", + dev->name); + CDEBUG(D_NET, "set link fatal state to 1\n"); + atomic_set(&ni->ni_fatal_error_on, 1); + continue; + } + in_dev_for_each_ifa_rtnl(ifa, in_dev) { + if (sa->sin_addr.s_addr == ifa->ifa_local) + found_ip = true; + } + endfor_ifa(in_dev); + + if (!found_ip) { + CDEBUG(D_NET, "Interface %s has no matching ip\n", + dev->name); + CDEBUG(D_NET, "set link fatal state to 1\n"); + atomic_set(&ni->ni_fatal_error_on, 1); + continue; + } + + if (link_down) { + CDEBUG(D_NET, "set link fatal state to 1\n"); + atomic_set(&ni->ni_fatal_error_on, link_down); + } else { + CDEBUG(D_NET, "set link fatal state to %u\n", + (ksocknal_get_link_status(dev) == 0)); + atomic_set(&ni->ni_fatal_error_on, + (ksocknal_get_link_status(dev) == 0)); + } + } +out: + return 0; +} + + +static int +ksocknal_handle_inetaddr_change(struct in_ifaddr *ifa, unsigned long event) +{ + struct lnet_ni *ni; + struct ksock_net *net; + struct ksock_net *cnxt; + struct net_device *event_netdev = ifa->ifa_dev->dev; + int ifindex; + struct ksock_interface *ksi = NULL; + struct sockaddr_in *sa; + + if (!ksocknal_data.ksnd_nnets) + goto out; + + ifindex = event_netdev->ifindex; + + list_for_each_entry_safe(net, cnxt, &ksocknal_data.ksnd_nets, + ksnn_list) { + + ksi = &net->ksnn_interface; + sa = (void *)&ksi->ksni_addr; + + if (ksi->ksni_index != ifindex || + strcmp(ksi->ksni_name, event_netdev->name)) + continue; + + if (sa->sin_addr.s_addr == ifa->ifa_local) { + CDEBUG(D_NET, "set link fatal state to %u\n", + (event == NETDEV_DOWN)); + ni = net->ksnn_ni; + atomic_set(&ni->ni_fatal_error_on, + (event == NETDEV_DOWN)); + } + } +out: + return 0; +} + +/************************************ + * Net device notifier event handler + ************************************/ +static int ksocknal_device_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + unsigned char operstate; + + operstate = dev->operstate; + + CDEBUG(D_NET, "devevent: status=%ld, iface=%s ifindex %d state %u\n", + event, dev->name, dev->ifindex, operstate); + + switch (event) { + case NETDEV_UP: + case NETDEV_DOWN: + case NETDEV_CHANGE: + ksocknal_handle_link_state_change(dev, operstate); + break; + } + + return NOTIFY_OK; +} + +/************************************ + * Inetaddr notifier event handler + 
************************************/ +static int ksocknal_inetaddr_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct in_ifaddr *ifa = ptr; + + CDEBUG(D_NET, "addrevent: status %ld ip addr %pI4, netmask %pI4.\n", + event, &ifa->ifa_address, &ifa->ifa_mask); + + switch (event) { + case NETDEV_UP: + case NETDEV_DOWN: + case NETDEV_CHANGE: + ksocknal_handle_inetaddr_change(ifa, event); + break; + + } + return NOTIFY_OK; +} + +static struct notifier_block ksocknal_dev_notifier_block = { + .notifier_call = ksocknal_device_event, +}; + +static struct notifier_block ksocknal_inetaddr_notifier_block = { + .notifier_call = ksocknal_inetaddr_event, +}; + +static void +ksocknal_base_shutdown(void) +{ + struct ksock_sched *sched; + struct ksock_peer_ni *peer_ni; + int i; + + CDEBUG(D_MALLOC, "before NAL cleanup: kmem %lld\n", + libcfs_kmem_read()); + LASSERT (ksocknal_data.ksnd_nnets == 0); + + if (ksocknal_data.ksnd_init == SOCKNAL_INIT_ALL) { + unregister_netdevice_notifier(&ksocknal_dev_notifier_block); + unregister_inetaddr_notifier(&ksocknal_inetaddr_notifier_block); + } + + switch (ksocknal_data.ksnd_init) { + default: + LASSERT(0); + fallthrough; + + case SOCKNAL_INIT_ALL: + case SOCKNAL_INIT_DATA: + hash_for_each(ksocknal_data.ksnd_peers, i, peer_ni, ksnp_list) + LASSERT(0); + + LASSERT(list_empty(&ksocknal_data.ksnd_nets)); + LASSERT(list_empty(&ksocknal_data.ksnd_enomem_conns)); + LASSERT(list_empty(&ksocknal_data.ksnd_zombie_conns)); + LASSERT(list_empty(&ksocknal_data.ksnd_connd_connreqs)); + LASSERT(list_empty(&ksocknal_data.ksnd_connd_routes)); + + if (ksocknal_data.ksnd_schedulers != NULL) { + cfs_percpt_for_each(sched, i, + ksocknal_data.ksnd_schedulers) { + + LASSERT(list_empty(&sched->kss_tx_conns)); + LASSERT(list_empty(&sched->kss_rx_conns)); + LASSERT(list_empty(&sched->kss_zombie_noop_txs)); + LASSERT(sched->kss_nconns == 0); + } + } + + /* flag threads to terminate; wake and wait for them to die */ + ksocknal_data.ksnd_shuttingdown = 1; + wake_up_all(&ksocknal_data.ksnd_connd_waitq); + wake_up(&ksocknal_data.ksnd_reaper_waitq); + + if (ksocknal_data.ksnd_schedulers != NULL) { + cfs_percpt_for_each(sched, i, + ksocknal_data.ksnd_schedulers) + wake_up_all(&sched->kss_waitq); + } + + wait_var_event_warning(&ksocknal_data.ksnd_nthreads, + atomic_read(&ksocknal_data.ksnd_nthreads) == 0, + "waiting for %d threads to terminate\n", + atomic_read(&ksocknal_data.ksnd_nthreads)); + + ksocknal_free_buffers(); + + ksocknal_data.ksnd_init = SOCKNAL_INIT_NOTHING; + break; + } + + CDEBUG(D_MALLOC, "after NAL cleanup: kmem %lld\n", + libcfs_kmem_read()); + + module_put(THIS_MODULE); +} + +static int +ksocknal_base_startup(void) +{ + struct ksock_sched *sched; + int rc; + int i; + + LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING); + LASSERT(ksocknal_data.ksnd_nnets == 0); + + memset(&ksocknal_data, 0, sizeof(ksocknal_data)); /* zero pointers */ + + hash_init(ksocknal_data.ksnd_peers); + + rwlock_init(&ksocknal_data.ksnd_global_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_nets); + + spin_lock_init(&ksocknal_data.ksnd_reaper_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_enomem_conns); + INIT_LIST_HEAD(&ksocknal_data.ksnd_zombie_conns); + INIT_LIST_HEAD(&ksocknal_data.ksnd_deathrow_conns); + init_waitqueue_head(&ksocknal_data.ksnd_reaper_waitq); + + spin_lock_init(&ksocknal_data.ksnd_connd_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_connd_connreqs); + INIT_LIST_HEAD(&ksocknal_data.ksnd_connd_routes); + init_waitqueue_head(&ksocknal_data.ksnd_connd_waitq); + + 
spin_lock_init(&ksocknal_data.ksnd_tx_lock);
+	INIT_LIST_HEAD(&ksocknal_data.ksnd_idle_noop_txs);
+
+	/* NB memset above zeros whole of ksocknal_data */
+
+	/* flag lists/ptrs/locks initialised */
+	ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA;
+	if (!try_module_get(THIS_MODULE))
+		goto failed;
+
+	/* Create a scheduler block per available CPT */
+	ksocknal_data.ksnd_schedulers = cfs_percpt_alloc(lnet_cpt_table(),
+							 sizeof(*sched));
+	if (ksocknal_data.ksnd_schedulers == NULL)
+		goto failed;
+
+	cfs_percpt_for_each(sched, i, ksocknal_data.ksnd_schedulers) {
+		int nthrs;
+
+		/*
+		 * make sure not to allocate more threads than there are
+		 * cores/CPUs in the CPT
+		 */
+		nthrs = cfs_cpt_weight(lnet_cpt_table(), i);
+		if (*ksocknal_tunables.ksnd_nscheds > 0) {
+			nthrs = min(nthrs, *ksocknal_tunables.ksnd_nscheds);
+		} else {
+			/*
+			 * max to half of CPUs, assume another half should be
+			 * reserved for upper layer modules
+			 */
+			nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs);
+		}
+
+		sched->kss_nthreads_max = nthrs;
+		sched->kss_cpt = i;
+
+		spin_lock_init(&sched->kss_lock);
+		INIT_LIST_HEAD(&sched->kss_rx_conns);
+		INIT_LIST_HEAD(&sched->kss_tx_conns);
+		INIT_LIST_HEAD(&sched->kss_zombie_noop_txs);
+		init_waitqueue_head(&sched->kss_waitq);
+	}
+
+	ksocknal_data.ksnd_connd_starting = 0;
+	ksocknal_data.ksnd_connd_failed_stamp = 0;
+	ksocknal_data.ksnd_connd_starting_stamp = ktime_get_real_seconds();
+	/* must have at least 2 connds to remain responsive to accepts while
+	 * connecting */
+	if (*ksocknal_tunables.ksnd_nconnds < SOCKNAL_CONND_RESV + 1)
+		*ksocknal_tunables.ksnd_nconnds = SOCKNAL_CONND_RESV + 1;
+
+	if (*ksocknal_tunables.ksnd_nconnds_max <
+	    *ksocknal_tunables.ksnd_nconnds) {
+		ksocknal_tunables.ksnd_nconnds_max =
+			ksocknal_tunables.ksnd_nconnds;
+	}
+
+	for (i = 0; i < *ksocknal_tunables.ksnd_nconnds; i++) {
+		spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+		ksocknal_data.ksnd_connd_starting++;
+		spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+
+		rc = ksocknal_thread_start(ksocknal_connd,
+					   (void *)((uintptr_t)i),
+					   "socknal_cd%02d", i);
+		if (rc != 0) {
+			spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+			ksocknal_data.ksnd_connd_starting--;
+			spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+			CERROR("Can't spawn socknal connd: %d\n", rc);
+			goto failed;
+		}
+	}
+
+	rc = ksocknal_thread_start(ksocknal_reaper, NULL, "socknal_reaper");
+	if (rc != 0) {
+		CERROR("Can't spawn socknal reaper: %d\n", rc);
+		goto failed;
+	}
+
+	register_netdevice_notifier(&ksocknal_dev_notifier_block);
+	register_inetaddr_notifier(&ksocknal_inetaddr_notifier_block);
+
+	/* flag everything initialised */
+	ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL;
+
+	return 0;
+
+ failed:
+	ksocknal_base_shutdown();
+	return -ENETDOWN;
+}
+
+static int
+ksocknal_debug_peerhash(struct lnet_ni *ni)
+{
+	struct ksock_peer_ni *peer_ni;
+	int i;
+
+	read_lock(&ksocknal_data.ksnd_global_lock);
+
+	hash_for_each(ksocknal_data.ksnd_peers, i, peer_ni, ksnp_list) {
+		struct ksock_conn_cb *conn_cb;
+		struct ksock_conn *conn;
+
+		if (peer_ni->ksnp_ni != ni)
+			continue;
+
+		CWARN("Active peer_ni on shutdown: %s, ref %d, closing %d, accepting %d, err %d, zcookie %llu, txq %d, zc_req %d\n",
+		      libcfs_idstr(&peer_ni->ksnp_id),
+		      refcount_read(&peer_ni->ksnp_refcount),
+		      peer_ni->ksnp_closing,
+		      peer_ni->ksnp_accepting, peer_ni->ksnp_error,
+		      peer_ni->ksnp_zc_next_cookie,
+		      !list_empty(&peer_ni->ksnp_tx_queue),
+		      !list_empty(&peer_ni->ksnp_zc_req_list));
+
+		conn_cb = peer_ni->ksnp_conn_cb;
+		if (conn_cb) {
+			CWARN("ConnCB: ref %d,
schd %d, conn %d, cnted %d, del %d\n", + refcount_read(&conn_cb->ksnr_refcount), + conn_cb->ksnr_scheduled, conn_cb->ksnr_connecting, + conn_cb->ksnr_connected, conn_cb->ksnr_deleted); + } + + list_for_each_entry(conn, &peer_ni->ksnp_conns, ksnc_list) { + CWARN("Conn: ref %d, sref %d, t %d, c %d\n", + refcount_read(&conn->ksnc_conn_refcount), + refcount_read(&conn->ksnc_sock_refcount), + conn->ksnc_type, conn->ksnc_closing); + } + break; + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + return 0; +} + +void +ksocknal_shutdown(struct lnet_ni *ni) +{ + struct ksock_net *net = ni->ni_data; + struct lnet_process_id anyid = { + .nid = LNET_NID_ANY, + .pid = LNET_PID_ANY, + }; + + LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_ALL); + LASSERT(ksocknal_data.ksnd_nnets > 0); + + /* prevent new peers */ + atomic_add(SOCKNAL_SHUTDOWN_BIAS, &net->ksnn_npeers); + + /* Delete all peers */ + ksocknal_del_peer(ni, anyid, 0); + + /* Wait for all peer_ni state to clean up */ + wait_var_event_warning(&net->ksnn_npeers, + atomic_read(&net->ksnn_npeers) == + SOCKNAL_SHUTDOWN_BIAS, + "waiting for %d peers to disconnect\n", + ksocknal_debug_peerhash(ni) + + atomic_read(&net->ksnn_npeers) - + SOCKNAL_SHUTDOWN_BIAS); + + LASSERT(net->ksnn_interface.ksni_npeers == 0); + LASSERT(net->ksnn_interface.ksni_nroutes == 0); + + list_del(&net->ksnn_list); + LIBCFS_FREE(net, sizeof(*net)); + + ksocknal_data.ksnd_nnets--; + if (ksocknal_data.ksnd_nnets == 0) + ksocknal_base_shutdown(); +} + +static int +ksocknal_search_new_ipif(struct ksock_net *net) +{ + int new_ipif = 0; + char *ifnam = &net->ksnn_interface.ksni_name[0]; + char *colon = strchr(ifnam, ':'); + bool found = false; + struct ksock_net *tmp; + + if (colon != NULL) + *colon = 0; + + list_for_each_entry(tmp, &ksocknal_data.ksnd_nets, ksnn_list) { + char *ifnam2 = &tmp->ksnn_interface.ksni_name[0]; + char *colon2 = strchr(ifnam2, ':'); + + if (colon2 != NULL) + *colon2 = 0; + + found = strcmp(ifnam, ifnam2) == 0; + if (colon2 != NULL) + *colon2 = ':'; + } + + new_ipif += !found; + if (colon != NULL) + *colon = ':'; + + return new_ipif; +} + +static int +ksocknal_start_schedulers(struct ksock_sched *sched) +{ + int nthrs; + int rc = 0; + int i; + + if (sched->kss_nthreads == 0) { + if (*ksocknal_tunables.ksnd_nscheds > 0) { + nthrs = sched->kss_nthreads_max; + } else { + nthrs = cfs_cpt_weight(lnet_cpt_table(), + sched->kss_cpt); + nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs); + nthrs = min(SOCKNAL_NSCHEDS_HIGH, nthrs); + } + nthrs = min(nthrs, sched->kss_nthreads_max); + } else { + LASSERT(sched->kss_nthreads <= sched->kss_nthreads_max); + /* increase two threads if there is new interface */ + nthrs = min(2, sched->kss_nthreads_max - sched->kss_nthreads); + } + + for (i = 0; i < nthrs; i++) { + long id; + + id = KSOCK_THREAD_ID(sched->kss_cpt, sched->kss_nthreads + i); + rc = ksocknal_thread_start(ksocknal_scheduler, (void *)id, + "socknal_sd%02d_%02d", + sched->kss_cpt, + (int)KSOCK_THREAD_SID(id)); + if (rc == 0) + continue; + + CERROR("Can't spawn thread %d for scheduler[%d]: %d\n", + sched->kss_cpt, (int) KSOCK_THREAD_SID(id), rc); + break; + } + + sched->kss_nthreads += i; + return rc; +} + +static int +ksocknal_net_start_threads(struct ksock_net *net, __u32 *cpts, int ncpts) +{ + int newif = ksocknal_search_new_ipif(net); + int rc; + int i; + + if (ncpts > 0 && ncpts > cfs_cpt_number(lnet_cpt_table())) + return -EINVAL; + + for (i = 0; i < ncpts; i++) { + struct ksock_sched *sched; + int cpt = (cpts == NULL) ? 
i : cpts[i];
+
+ LASSERT(cpt < cfs_cpt_number(lnet_cpt_table()));
+ sched = ksocknal_data.ksnd_schedulers[cpt];
+
+ if (!newif && sched->kss_nthreads > 0)
+ continue;
+
+ rc = ksocknal_start_schedulers(sched);
+ if (rc != 0)
+ return rc;
+ }
+ return 0;
+}
+
+int
+ksocknal_startup(struct lnet_ni *ni)
+{
+ struct ksock_net *net;
+ struct ksock_interface *ksi = NULL;
+ struct lnet_inetdev *ifaces = NULL;
+ struct sockaddr_in *sa;
+ int i = 0;
+ int rc;
+
+ LASSERT (ni->ni_net->net_lnd == &the_ksocklnd);
+ if (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING) {
+ rc = ksocknal_base_startup();
+ if (rc != 0)
+ return rc;
+ }
+ LIBCFS_ALLOC(net, sizeof(*net));
+ if (net == NULL)
+ goto fail_0;
+ net->ksnn_incarnation = ktime_get_real_ns();
+ ni->ni_data = net;
+
+ ksocknal_tunables_setup(ni);
+
+ rc = lnet_inet_enumerate(&ifaces, ni->ni_net_ns);
+ if (rc < 0)
+ goto fail_1;
+
+ ksi = &net->ksnn_interface;
+
+ /* Use the first discovered interface or look in the list */
+ if (ni->ni_interface) {
+ for (i = 0; i < rc; i++)
+ if (strcmp(ifaces[i].li_name, ni->ni_interface) == 0)
+ break;
+
+ /* ni_interface doesn't match any discovered interface */
+ if (i == rc) {
+ CERROR("ksocklnd: failed to find interface %s\n",
+ ni->ni_interface);
+ goto fail_1;
+ }
+ }
+
+ ni->ni_dev_cpt = ifaces[i].li_cpt;
+ sa = (void *)&ksi->ksni_addr;
+ memset(sa, 0, sizeof(*sa));
+ sa->sin_family = AF_INET;
+ sa->sin_addr.s_addr = htonl(ifaces[i].li_ipaddr);
+ ksi->ksni_index = ksocknal_ip2index((struct sockaddr *)sa, ni);
+ ksi->ksni_netmask = ifaces[i].li_netmask;
+ strlcpy(ksi->ksni_name, ifaces[i].li_name, sizeof(ksi->ksni_name));
+
+ /* call it before adding it to ksocknal_data.ksnd_nets */
+ rc = ksocknal_net_start_threads(net, ni->ni_cpts, ni->ni_ncpts);
+ if (rc != 0)
+ goto fail_1;
+
+ LASSERT(ksi);
+ LASSERT(ksi->ksni_addr.ss_family == AF_INET);
+ ni->ni_nid.nid_addr[0] =
+ ((struct sockaddr_in *)&ksi->ksni_addr)->sin_addr.s_addr;
+ list_add(&net->ksnn_list, &ksocknal_data.ksnd_nets);
+ net->ksnn_ni = ni;
+ ksocknal_data.ksnd_nnets++;
+
+ return 0;
+
+fail_1:
+ LIBCFS_FREE(net, sizeof(*net));
+fail_0:
+ if (ksocknal_data.ksnd_nnets == 0)
+ ksocknal_base_shutdown();
+
+ return -ENETDOWN;
+}
+
+static void __exit ksocklnd_exit(void)
+{
+ lnet_unregister_lnd(&the_ksocklnd);
+}
+
+static const struct lnet_lnd the_ksocklnd = {
+ .lnd_type = SOCKLND,
+ .lnd_startup = ksocknal_startup,
+ .lnd_shutdown = ksocknal_shutdown,
+ .lnd_ctl = ksocknal_ctl,
+ .lnd_send = ksocknal_send,
+ .lnd_recv = ksocknal_recv,
+ .lnd_notify_peer_down = ksocknal_notify_gw_down,
+ .lnd_accept = ksocknal_accept,
+};
+
+static int __init ksocklnd_init(void)
+{
+ int rc;
+
+ /* check ksnr_connected/connecting field large enough */
+ BUILD_BUG_ON(SOCKLND_CONN_NTYPES > 4);
+ BUILD_BUG_ON(SOCKLND_CONN_ACK != SOCKLND_CONN_BULK_IN);
+
+ rc = ksocknal_tunables_init();
+ if (rc != 0)
+ return rc;
+
+ lnet_register_lnd(&the_ksocklnd);
+
+ return 0;
+}
+
+MODULE_AUTHOR("OpenSFS, Inc. ");
+MODULE_DESCRIPTION("TCP Socket LNet Network Driver");
+MODULE_VERSION("2.8.0");
+MODULE_LICENSE("GPL");
+
+module_init(ksocklnd_init);
+module_exit(ksocklnd_exit);
diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.h b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.h
new file mode 100644
index 0000000000000..81db1c3a3e2b2
--- /dev/null
+++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.h
@@ -0,0 +1,682 @@
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation. 
+ * + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * + * This file is part of Lustre, http://www.lustre.org + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#ifndef _SOCKLND_SOCKLND_H_ +#define _SOCKLND_SOCKLND_H_ + +#define DEBUG_PORTAL_ALLOC +#define DEBUG_SUBSYSTEM S_LND + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#ifndef NETIF_F_CSUM_MASK +# define NETIF_F_CSUM_MASK NETIF_F_ALL_CSUM +#endif + +/* assume one thread for each connection type */ +#define SOCKNAL_NSCHEDS 3 +#define SOCKNAL_NSCHEDS_HIGH (SOCKNAL_NSCHEDS << 1) + +#define SOCKNAL_PEER_HASH_BITS 7 /* log2 of # peer_ni lists */ +#define SOCKNAL_INSANITY_RECONN 5000 /* connd is trying on reconn infinitely */ +#define SOCKNAL_ENOMEM_RETRY 1 /* seconds between retries */ + +#define SOCKNAL_SINGLE_FRAG_TX 0 /* disable multi-fragment sends */ +#define SOCKNAL_SINGLE_FRAG_RX 0 /* disable multi-fragment receives */ + +#define SOCKNAL_VERSION_DEBUG 0 /* enable protocol version debugging */ + +/* risk kmap deadlock on multi-frag I/O (backs off to single-frag if disabled). + * no risk if we're not running on a CONFIG_HIGHMEM platform. */ +#ifdef CONFIG_HIGHMEM +# define SOCKNAL_RISK_KMAP_DEADLOCK 0 +#else +# define SOCKNAL_RISK_KMAP_DEADLOCK 1 +#endif + +/* per scheduler state */ +struct ksock_sched { + /* serialise */ + spinlock_t kss_lock; + /* conn waiting to be written */ + struct list_head kss_rx_conns; + struct list_head kss_tx_conns; + /* zombie noop tx list */ + struct list_head kss_zombie_noop_txs; + /* where scheduler sleeps */ + wait_queue_head_t kss_waitq; + /* # connections assigned to this scheduler */ + int kss_nconns; + /* max allowed threads */ + int kss_nthreads_max; + /* number of threads */ + int kss_nthreads; + /* CPT id */ + int kss_cpt; +}; + +#define KSOCK_CPT_SHIFT 16 +#define KSOCK_THREAD_ID(cpt, sid) (((cpt) << KSOCK_CPT_SHIFT) | (sid)) +#define KSOCK_THREAD_CPT(id) ((id) >> KSOCK_CPT_SHIFT) +#define KSOCK_THREAD_SID(id) ((id) & ((1UL << KSOCK_CPT_SHIFT) - 1)) + +struct ksock_interface { /* in-use interface */ + int ksni_index; /* Linux interface index */ + struct sockaddr_storage ksni_addr; /* interface's address */ + __u32 ksni_netmask; /* interface's network mask */ + int ksni_nroutes; /* # routes using (active) */ + int ksni_npeers; /* # peers using (passive) */ + char ksni_name[IFNAMSIZ]; /* interface name */ +}; + +struct ksock_tunables { + /* "stuck" socket timeout (seconds) */ + int *ksnd_timeout; + /* # scheduler threads in each pool while starting */ + int *ksnd_nscheds; + int *ksnd_nconnds; /* # connection daemons */ + int *ksnd_nconnds_max; /* max # connection daemons */ + int *ksnd_min_reconnectms; /* first connection retry after (ms)... 
*/ + int *ksnd_max_reconnectms; /* ...exponentially increasing to this */ + int *ksnd_eager_ack; /* make TCP ack eagerly? */ + int *ksnd_typed_conns; /* drive sockets by type? */ + int *ksnd_min_bulk; /* smallest "large" message */ + int *ksnd_tx_buffer_size; /* socket tx buffer size */ + int *ksnd_rx_buffer_size; /* socket rx buffer size */ + int *ksnd_nagle; /* enable NAGLE? */ + int *ksnd_round_robin; /* round robin for multiple interfaces */ + int *ksnd_keepalive; /* # secs for sending keepalive NOOP */ + int *ksnd_keepalive_idle; /* # idle secs before 1st probe */ + int *ksnd_keepalive_count; /* # probes */ + int *ksnd_keepalive_intvl; /* time between probes */ + int *ksnd_credits; /* # concurrent sends */ + int *ksnd_peertxcredits; /* # concurrent sends to 1 peer_ni */ + int *ksnd_peerrtrcredits; /* # per-peer_ni router buffer credits */ + int *ksnd_peertimeout; /* seconds to consider peer_ni dead */ + int *ksnd_enable_csum; /* enable check sum */ + int *ksnd_inject_csum_error; /* set non-zero to inject checksum error */ + int *ksnd_nonblk_zcack; /* always send zc-ack on non-blocking connection */ + unsigned int *ksnd_zc_min_payload; /* minimum zero copy payload size */ + int *ksnd_zc_recv; /* enable ZC receive (for Chelsio TOE) */ + int *ksnd_zc_recv_min_nfrags; /* minimum # of fragments to enable ZC receive */ + int *ksnd_irq_affinity; /* enable IRQ affinity? */ +#ifdef SOCKNAL_BACKOFF + int *ksnd_backoff_init; /* initial TCP backoff */ + int *ksnd_backoff_max; /* maximum TCP backoff */ +#endif +#if SOCKNAL_VERSION_DEBUG + int *ksnd_protocol; /* protocol version */ +#endif + int *ksnd_conns_per_peer; /* for typed mode, yields: + * 1 + 2*conns_per_peer total + * for untyped: + * conns_per_peer total + */ +}; + +struct ksock_net { + __u64 ksnn_incarnation; /* my epoch */ + struct list_head ksnn_list; /* chain on global list */ + atomic_t ksnn_npeers; /* # peers */ + struct ksock_interface ksnn_interface; /* IP interface */ + struct lnet_ni *ksnn_ni; +}; +/* When the ksock_net is shut down, this (negative) bias is added to + * ksnn_npeers, which prevents new peers from being added. 
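+ * ksocknal_shutdown() adds the bias and then waits for ksnn_npeers to
+ * drop back to exactly SOCKNAL_SHUTDOWN_BIAS, i.e. for the live peer
+ * count to reach zero.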
+ */ +#define SOCKNAL_SHUTDOWN_BIAS (INT_MIN+1) + +/** connd timeout */ +#define SOCKNAL_CONND_TIMEOUT 120 +/** reserved thread for accepting & creating new connd */ +#define SOCKNAL_CONND_RESV 1 + +struct ksock_nal_data { + int ksnd_init; /* initialisation state */ + int ksnd_nnets; /* # networks set up */ + struct list_head ksnd_nets; /* list of nets */ + /* stabilize peer_ni/conn ops */ + rwlock_t ksnd_global_lock; + /* hash table of all my known peers */ + DECLARE_HASHTABLE(ksnd_peers, SOCKNAL_PEER_HASH_BITS); + + atomic_t ksnd_nthreads; /* # live threads */ + int ksnd_shuttingdown; /* tell threads to exit */ + /* schedulers information */ + struct ksock_sched **ksnd_schedulers; + + atomic_t ksnd_nactive_txs; /* #active txs */ + + /* conns to close: reaper_lock*/ + struct list_head ksnd_deathrow_conns; + /* conns to free: reaper_lock */ + struct list_head ksnd_zombie_conns; + /* conns to retry: reaper_lock*/ + struct list_head ksnd_enomem_conns; + /* reaper sleeps here */ + wait_queue_head_t ksnd_reaper_waitq; + /* when reaper will wake */ + time64_t ksnd_reaper_waketime; + /* serialise */ + spinlock_t ksnd_reaper_lock; + + int ksnd_enomem_tx; /* test ENOMEM sender */ + int ksnd_stall_tx; /* test sluggish sender */ + int ksnd_stall_rx; /* test sluggish receiver */ + + /* incoming connection requests */ + struct list_head ksnd_connd_connreqs; + /* routes waiting to be connected */ + struct list_head ksnd_connd_routes; + /* connds sleep here */ + wait_queue_head_t ksnd_connd_waitq; + /* # connds connecting */ + int ksnd_connd_connecting; + /** time stamp of the last failed connecting attempt */ + time64_t ksnd_connd_failed_stamp; + /** # starting connd */ + unsigned ksnd_connd_starting; + /** time stamp of the last starting connd */ + time64_t ksnd_connd_starting_stamp; + /** # running connd */ + unsigned ksnd_connd_running; + /* serialise */ + spinlock_t ksnd_connd_lock; + + /* list head for freed noop tx */ + struct list_head ksnd_idle_noop_txs; + /* serialise, g_lock unsafe */ + spinlock_t ksnd_tx_lock; +}; + +#define SOCKNAL_INIT_NOTHING 0 +#define SOCKNAL_INIT_DATA 1 +#define SOCKNAL_INIT_ALL 2 + +/* A packet just assembled for transmission is represented by 1 + * struct iovec fragment - the portals header - followed by 0 + * or more struct bio_vec fragments. + * + * On the receive side, initially 1 struct kvec fragment is posted for + * receive (the header). Once the header has been received, the payload is + * received into struct bio_vec fragments. + */ +struct ksock_conn; /* forward ref */ +struct ksock_conn_cb; /* forward ref */ +struct ksock_proto; /* forward ref */ + +struct ksock_tx { /* transmit packet */ + struct list_head tx_list; /* queue on conn for transmission etc */ + struct list_head tx_zc_list; /* queue on peer_ni for ZC request */ + refcount_t tx_refcount; /* tx reference count */ + int tx_nob; /* # packet bytes */ + int tx_resid; /* residual bytes */ + int tx_niov; /* # packet kvec frags */ + int tx_nkiov; /* # packet page frags */ + unsigned short tx_zc_aborted; /* aborted ZC request */ + unsigned short tx_zc_capable:1; /* payload is large enough for ZC */ + unsigned short tx_zc_checked:1; /* Have I checked if I should ZC? 
*/ + unsigned short tx_nonblk:1; /* it's a non-blocking ACK */ + struct bio_vec *tx_kiov; /* packet page frags */ + struct ksock_conn *tx_conn; /* owning conn */ + struct lnet_msg *tx_lnetmsg; /* lnet message for lnet_finalize() */ + time64_t tx_deadline; /* when (in secs) tx times out */ + struct ksock_msg tx_msg; /* socklnd message buffer */ + int tx_desc_size; /* size of this descriptor */ + enum lnet_msg_hstatus tx_hstatus; /* health status of tx */ + struct kvec tx_hdr; /* virt hdr */ + struct bio_vec tx_payload[0]; /* paged payload */ +}; + +#define KSOCK_NOOP_TX_SIZE ((int)offsetof(struct ksock_tx, tx_payload[0])) + +/* space for the rx frag descriptors; we either read a single contiguous + * header, or up to LNET_MAX_IOV frags of payload of either type. */ +union ksock_rxiovspace { + struct kvec iov[LNET_MAX_IOV]; + struct bio_vec kiov[LNET_MAX_IOV]; +}; + +#define SOCKNAL_RX_KSM_HEADER 1 /* reading ksock message header */ +#define SOCKNAL_RX_LNET_HEADER 2 /* reading lnet message header */ +#define SOCKNAL_RX_PARSE 3 /* Calling lnet_parse() */ +#define SOCKNAL_RX_PARSE_WAIT 4 /* waiting to be told to read the body */ +#define SOCKNAL_RX_LNET_PAYLOAD 5 /* reading lnet payload (to deliver here) */ +#define SOCKNAL_RX_SLOP 6 /* skipping body */ + +struct ksock_conn { + struct ksock_peer_ni *ksnc_peer; /* owning peer_ni */ + struct ksock_conn_cb *ksnc_conn_cb; /* owning conn control block */ + struct list_head ksnc_list; /* on peer_ni's conn list */ + struct socket *ksnc_sock; /* actual socket */ + void *ksnc_saved_data_ready; /* socket's original + * data_ready() cb */ + void *ksnc_saved_write_space; /* socket's original + * write_space() cb */ + refcount_t ksnc_conn_refcount; /* conn refcount */ + refcount_t ksnc_sock_refcount; /* sock refcount */ + struct ksock_sched *ksnc_scheduler; /* who schedules this + * connection */ + struct sockaddr_storage ksnc_myaddr; /* my address */ + struct sockaddr_storage ksnc_peeraddr; /* peer_ni's address */ + signed int ksnc_type:3; /* type of connection, + * should be signed + * value */ + unsigned int ksnc_closing:1; /* being shut down */ + unsigned int ksnc_flip:1; /* flip or not, only for V2.x */ + unsigned int ksnc_zc_capable:1; /* enable to ZC */ + const struct ksock_proto *ksnc_proto; /* protocol for the connection */ + + /* READER */ + + /* where I enq waiting input or a forwarding descriptor */ + struct list_head ksnc_rx_list; + time64_t ksnc_rx_deadline; /* when (in seconds) receive times out */ + __u8 ksnc_rx_started; /* started receiving a message */ + __u8 ksnc_rx_ready; /* data ready to read */ + __u8 ksnc_rx_scheduled;/* being progressed */ + __u8 ksnc_rx_state; /* what is being read */ + int ksnc_rx_nob_left; /* # bytes to next hdr/body */ + int ksnc_rx_nob_wanted; /* bytes actually wanted */ + int ksnc_rx_niov; /* # kvec frags */ + struct kvec *ksnc_rx_iov; /* the kvec frags */ + int ksnc_rx_nkiov; /* # page frags */ + struct bio_vec *ksnc_rx_kiov; /* the page frags */ + union ksock_rxiovspace ksnc_rx_iov_space;/* space for frag descriptors */ + __u32 ksnc_rx_csum; /* partial checksum for incoming + * data */ + struct lnet_msg *ksnc_lnet_msg; /* rx lnet_finalize arg*/ + struct ksock_msg ksnc_msg; /* incoming message buffer: + * V2.x message takes the + * whole struct + * V1.x message is a bare + * struct lnet_hdr_nid4, it's + * stored in + * ksnc_msg.ksm_u.lnetmsg + */ + /* -- WRITER -- */ + /* where I enq waiting for output space */ + struct list_head ksnc_tx_list; + /* packets waiting to be sent */ + struct list_head 
ksnc_tx_queue; + /* next TX that can carry a LNet message or ZC-ACK */ + struct ksock_tx *ksnc_tx_carrier; + /* when (in seconds) tx times out */ + time64_t ksnc_tx_deadline; + /* send buffer marker */ + int ksnc_tx_bufnob; + /* # bytes queued */ + atomic_t ksnc_tx_nob; + /* write space */ + int ksnc_tx_ready; + /* being progressed */ + int ksnc_tx_scheduled; + /* time stamp of the last posted TX */ + time64_t ksnc_tx_last_post; +}; + +#define SOCKNAL_CONN_COUNT_MAX_BITS 8 /* max conn count bits */ + +struct ksock_conn_cb { + struct list_head ksnr_connd_list;/* chain on ksnr_connd_routes */ + struct ksock_peer_ni *ksnr_peer; /* owning peer_ni */ + refcount_t ksnr_refcount; /* # users */ + time64_t ksnr_timeout; /* when (in secs) reconnection + * can happen next + */ + time64_t ksnr_retry_interval;/* secs between retries */ + int ksnr_myiface; /* interface index */ + struct sockaddr_storage ksnr_addr; /* IP address to connect to */ + unsigned int ksnr_scheduled:1;/* scheduled for attention */ + unsigned int ksnr_connecting:1;/* connection in progress */ + unsigned int ksnr_connected:4;/* connections by type */ + unsigned int ksnr_deleted:1; /* been removed from peer_ni? */ + unsigned int ksnr_ctrl_conn_count:2; /* # conns by type */ + unsigned int ksnr_blki_conn_count:8; + unsigned int ksnr_blko_conn_count:8; + int ksnr_conn_count;/* total # conns for this cb */ + unsigned int ksnr_max_conns; /* conns_per_peer at peer + * creation + */ +}; + +#define SOCKNAL_KEEPALIVE_PING 1 /* cookie for keepalive ping */ + +struct ksock_peer_ni { + struct hlist_node ksnp_list; /* stash on global peer_ni list */ + time64_t ksnp_last_alive;/* when (in seconds) I was last alive */ + struct lnet_processid ksnp_id; /* who's on the other end(s) */ + refcount_t ksnp_refcount; /* # users */ + int ksnp_closing; /* being closed */ + int ksnp_accepting; /* # passive connections pending */ + int ksnp_error; /* errno on closing last conn */ + __u64 ksnp_zc_next_cookie;/* ZC completion cookie */ + __u64 ksnp_incarnation; /* latest known peer_ni incarnation */ + const struct ksock_proto *ksnp_proto; /* latest known protocol */ + struct list_head ksnp_conns; /* all active connections */ + struct ksock_conn_cb *ksnp_conn_cb; /* conn control block */ + struct list_head ksnp_tx_queue; /* waiting packets */ + spinlock_t ksnp_lock; /* serialize, g_lock unsafe */ + /* zero copy requests wait for ACK */ + struct list_head ksnp_zc_req_list; + time64_t ksnp_send_keepalive; /* time to send keepalive */ + struct lnet_ni *ksnp_ni; /* which network */ + int ksnp_n_passive_ips; /* # of... 
*/ + __u32 ksnp_passive_ips[LNET_INTERFACES_NUM]; /* preferred local interfaces */ +}; + +struct ksock_connreq { + /* stash on ksnd_connd_connreqs */ + struct list_head ksncr_list; + /* chosen NI */ + struct lnet_ni *ksncr_ni; + /* accepted socket */ + struct socket *ksncr_sock; +}; + +extern struct ksock_nal_data ksocknal_data; +extern struct ksock_tunables ksocknal_tunables; + +#define SOCKNAL_MATCH_NO 0 /* TX can't match type of connection */ +#define SOCKNAL_MATCH_YES 1 /* TX matches type of connection */ +#define SOCKNAL_MATCH_MAY 2 /* TX can be sent on the connection, but not preferred */ + +struct ksock_proto { + int pro_version; /* version number of protocol */ + int (*pro_send_hello)(struct ksock_conn *, struct ksock_hello_msg *); /* handshake function */ + int (*pro_recv_hello)(struct ksock_conn *, struct ksock_hello_msg *, int);/* handshake function */ + void (*pro_pack)(struct ksock_tx *); /* message pack */ + void (*pro_unpack)(struct ksock_msg *, struct lnet_hdr *); /* message unpack */ + struct ksock_tx *(*pro_queue_tx_msg)(struct ksock_conn *, struct ksock_tx *); /* queue tx on the connection */ + int (*pro_queue_tx_zcack)(struct ksock_conn *, struct ksock_tx *, __u64); /* queue ZC ack on the connection */ + int (*pro_handle_zcreq)(struct ksock_conn *, __u64, int); /* handle ZC request */ + int (*pro_handle_zcack)(struct ksock_conn *, __u64, __u64); /* handle ZC ACK */ + int (*pro_match_tx)(struct ksock_conn *, struct ksock_tx *, int); /* msg type matches the connection type: + * return value: + * return MATCH_NO : no + * return MATCH_YES : matching type + * return MATCH_MAY : can be backup */ +}; + +extern const struct ksock_proto ksocknal_protocol_v1x; +extern const struct ksock_proto ksocknal_protocol_v2x; +extern const struct ksock_proto ksocknal_protocol_v3x; +extern const struct ksock_proto ksocknal_protocol_v4x; + +#define KSOCK_PROTO_V1_MAJOR LNET_PROTO_TCP_VERSION_MAJOR +#define KSOCK_PROTO_V1_MINOR LNET_PROTO_TCP_VERSION_MINOR +#define KSOCK_PROTO_V1 KSOCK_PROTO_V1_MAJOR + +#ifndef CPU_MASK_NONE +#define CPU_MASK_NONE 0UL +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 11, 0) +#undef netdev_notifier_info_to_dev +#define netdev_notifier_info_to_dev(ndev) ndev +#endif + +static inline __u32 ksocknal_csum(__u32 crc, unsigned char const *p, size_t len) +{ +#if 1 + return crc32_le(crc, p, len); +#else + while (len-- > 0) + crc = ((crc + 0x100) & ~0xff) | ((crc + *p++) & 0xff) ; + + return crc; +#endif +} + +static inline int +ksocknal_conn_cb_mask(void) +{ + if (!*ksocknal_tunables.ksnd_typed_conns) + return BIT(SOCKLND_CONN_ANY); + + return (BIT(SOCKLND_CONN_CONTROL) | + BIT(SOCKLND_CONN_BULK_IN) | + BIT(SOCKLND_CONN_BULK_OUT)); +} + +static inline void +ksocknal_conn_addref(struct ksock_conn *conn) +{ + refcount_inc(&conn->ksnc_conn_refcount); +} + +extern void ksocknal_queue_zombie_conn(struct ksock_conn *conn); +extern void ksocknal_finalize_zcreq(struct ksock_conn *conn); + +static inline void +ksocknal_conn_decref(struct ksock_conn *conn) +{ + if (refcount_dec_and_test(&conn->ksnc_conn_refcount)) + ksocknal_queue_zombie_conn(conn); +} + +static inline int +ksocknal_connsock_addref(struct ksock_conn *conn) +{ + int rc = -ESHUTDOWN; + + read_lock(&ksocknal_data.ksnd_global_lock); + if (!conn->ksnc_closing) { + refcount_inc(&conn->ksnc_sock_refcount); + rc = 0; + } + read_unlock(&ksocknal_data.ksnd_global_lock); + + return (rc); +} + +static inline void +ksocknal_connsock_decref(struct ksock_conn *conn) +{ + if 
(refcount_dec_and_test(&conn->ksnc_sock_refcount)) { + LASSERT (conn->ksnc_closing); + sock_release(conn->ksnc_sock); + conn->ksnc_sock = NULL; + ksocknal_finalize_zcreq(conn); + } +} + +static inline void +ksocknal_tx_addref(struct ksock_tx *tx) +{ + refcount_inc(&tx->tx_refcount); +} + +extern void ksocknal_tx_prep(struct ksock_conn *, struct ksock_tx *tx); +extern void ksocknal_tx_done(struct lnet_ni *ni, struct ksock_tx *tx, int error); + +static inline void +ksocknal_tx_decref(struct ksock_tx *tx) +{ + if (refcount_dec_and_test(&tx->tx_refcount)) + ksocknal_tx_done(NULL, tx, 0); +} + +static inline void +ksocknal_conn_cb_addref(struct ksock_conn_cb *conn_cb) +{ + refcount_inc(&conn_cb->ksnr_refcount); +} + +extern void ksocknal_destroy_conn_cb(struct ksock_conn_cb *conn_cb); + +static inline void +ksocknal_conn_cb_decref(struct ksock_conn_cb *conn_cb) +{ + if (refcount_dec_and_test(&conn_cb->ksnr_refcount)) + ksocknal_destroy_conn_cb(conn_cb); +} + +static inline void +ksocknal_peer_addref(struct ksock_peer_ni *peer_ni) +{ + refcount_inc(&peer_ni->ksnp_refcount); +} + +extern void ksocknal_destroy_peer(struct ksock_peer_ni *peer_ni); + +static inline void +ksocknal_peer_decref(struct ksock_peer_ni *peer_ni) +{ + if (refcount_dec_and_test(&peer_ni->ksnp_refcount)) + ksocknal_destroy_peer(peer_ni); +} + +static inline int ksocknal_timeout(void) +{ + return *ksocknal_tunables.ksnd_timeout ?: lnet_get_lnd_timeout(); +} + +static inline int ksocknal_conns_per_peer(void) +{ + return *ksocknal_tunables.ksnd_conns_per_peer ?: 1; +} + +int ksocknal_startup(struct lnet_ni *ni); +void ksocknal_shutdown(struct lnet_ni *ni); +int ksocknal_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg); +int ksocknal_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg); +int ksocknal_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, + int delayed, unsigned int niov, + struct bio_vec *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); +int ksocknal_accept(struct lnet_ni *ni, struct socket *sock); + +int ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id, + struct sockaddr *addr); +struct ksock_peer_ni *ksocknal_find_peer_locked(struct lnet_ni *ni, + struct lnet_processid *id); +struct ksock_peer_ni *ksocknal_find_peer(struct lnet_ni *ni, + struct lnet_processid *id); +extern void ksocknal_peer_failed(struct ksock_peer_ni *peer_ni); +extern int ksocknal_create_conn(struct lnet_ni *ni, + struct ksock_conn_cb *conn_cb, + struct socket *sock, int type); +extern void ksocknal_close_conn_locked(struct ksock_conn *conn, int why); +extern void ksocknal_terminate_conn(struct ksock_conn *conn); +extern void ksocknal_destroy_conn(struct ksock_conn *conn); +extern int ksocknal_close_peer_conns_locked(struct ksock_peer_ni *peer_ni, + struct sockaddr *peer, int why); +extern int ksocknal_close_conn_and_siblings(struct ksock_conn *conn, int why); +int ksocknal_close_matching_conns(struct lnet_processid *id, __u32 ipaddr); +extern struct ksock_conn *ksocknal_find_conn_locked(struct ksock_peer_ni *peer_ni, + struct ksock_tx *tx, int nonblk); + +extern int ksocknal_launch_packet(struct lnet_ni *ni, struct ksock_tx *tx, + struct lnet_processid *id); +extern struct ksock_tx *ksocknal_alloc_tx(int type, int size); +extern void ksocknal_free_tx(struct ksock_tx *tx); +extern struct ksock_tx *ksocknal_alloc_tx_noop(__u64 cookie, int nonblk); +extern void ksocknal_next_tx_carrier(struct ksock_conn *conn); +extern void ksocknal_queue_tx_locked(struct ksock_tx *tx, struct 
ksock_conn *conn); +extern void ksocknal_txlist_done(struct lnet_ni *ni, struct list_head *txlist, + int error); +#define ksocknal_thread_start(fn, data, namefmt, arg...) \ + ({ \ + struct task_struct *__task = kthread_run(fn, data, \ + namefmt, ##arg); \ + if (!IS_ERR(__task)) \ + atomic_inc(&ksocknal_data.ksnd_nthreads); \ + PTR_ERR_OR_ZERO(__task); \ + }) + +extern void ksocknal_thread_fini(void); +extern void ksocknal_launch_all_connections_locked(struct ksock_peer_ni *peer_ni); +extern struct ksock_conn_cb *ksocknal_find_connectable_conn_cb_locked(struct ksock_peer_ni *peer_ni); +extern struct ksock_conn_cb *ksocknal_find_connecting_conn_cb_locked(struct ksock_peer_ni *peer_ni); +extern int ksocknal_new_packet(struct ksock_conn *conn, int skip); +extern int ksocknal_scheduler(void *arg); +extern int ksocknal_connd(void *arg); +extern int ksocknal_reaper(void *arg); +int ksocknal_send_hello(struct lnet_ni *ni, struct ksock_conn *conn, + struct lnet_nid *peer_nid, + struct ksock_hello_msg *hello); +int ksocknal_recv_hello(struct lnet_ni *ni, struct ksock_conn *conn, + struct ksock_hello_msg *hello, + struct lnet_processid *id, + __u64 *incarnation); +extern void ksocknal_read_callback(struct ksock_conn *conn); +extern void ksocknal_write_callback(struct ksock_conn *conn); + +extern int ksocknal_lib_zc_capable(struct ksock_conn *conn); +extern void ksocknal_lib_save_callback(struct socket *sock, struct ksock_conn *conn); +extern void ksocknal_lib_set_callback(struct socket *sock, struct ksock_conn *conn); +extern void ksocknal_lib_reset_callback(struct socket *sock, + struct ksock_conn *conn); +extern void ksocknal_lib_push_conn(struct ksock_conn *conn); +extern int ksocknal_lib_get_conn_addrs(struct ksock_conn *conn); +extern int ksocknal_lib_setup_sock(struct socket *so); +extern int ksocknal_lib_send_hdr(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratch_iov); +extern int ksocknal_lib_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratch_iov); +extern void ksocknal_lib_eager_ack(struct ksock_conn *conn); +extern int ksocknal_lib_recv_iov(struct ksock_conn *conn, + struct kvec *scratchiov); +extern int ksocknal_lib_recv_kiov(struct ksock_conn *conn, struct page **pages, + struct kvec *scratchiov); +extern int ksocknal_lib_get_conn_tunables(struct ksock_conn *conn, int *txmem, + int *rxmem, int *nagle); + +extern int ksocknal_tunables_init(void); +extern void ksocknal_tunables_setup(struct lnet_ni *ni); + +extern void ksocknal_lib_csum_tx(struct ksock_tx *tx); + +extern int ksocknal_lib_memory_pressure(struct ksock_conn *conn); + +#endif /* _SOCKLND_SOCKLND_H_ */ diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_cb.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_cb.c new file mode 100644 index 0000000000000..4e7a61b3f5751 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_cb.c @@ -0,0 +1,2694 @@ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2017, Intel Corporation. + * + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * + * This file is part of Lustre, https://wiki.whamcloud.com/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include "socklnd.h" +#include + +struct ksock_tx * +ksocknal_alloc_tx(int type, int size) +{ + struct ksock_tx *tx = NULL; + + if (type == KSOCK_MSG_NOOP) { + LASSERT(size == KSOCK_NOOP_TX_SIZE); + + /* searching for a noop tx in free list */ + spin_lock(&ksocknal_data.ksnd_tx_lock); + + tx = list_first_entry_or_null(&ksocknal_data.ksnd_idle_noop_txs, + struct ksock_tx, tx_list); + if (tx) { + LASSERT(tx->tx_desc_size == size); + list_del(&tx->tx_list); + } + + spin_unlock(&ksocknal_data.ksnd_tx_lock); + } + + if (tx == NULL) + LIBCFS_ALLOC(tx, size); + + if (tx == NULL) + return NULL; + + refcount_set(&tx->tx_refcount, 1); + tx->tx_zc_aborted = 0; + tx->tx_zc_capable = 0; + tx->tx_zc_checked = 0; + tx->tx_hstatus = LNET_MSG_STATUS_OK; + tx->tx_desc_size = size; + + atomic_inc(&ksocknal_data.ksnd_nactive_txs); + + return tx; +} + +struct ksock_tx * +ksocknal_alloc_tx_noop(__u64 cookie, int nonblk) +{ + struct ksock_tx *tx; + + tx = ksocknal_alloc_tx(KSOCK_MSG_NOOP, KSOCK_NOOP_TX_SIZE); + if (tx == NULL) { + CERROR("Can't allocate noop tx desc\n"); + return NULL; + } + + tx->tx_conn = NULL; + tx->tx_lnetmsg = NULL; + tx->tx_kiov = NULL; + tx->tx_nkiov = 0; + tx->tx_niov = 1; + tx->tx_nonblk = nonblk; + + tx->tx_msg.ksm_csum = 0; + tx->tx_msg.ksm_type = KSOCK_MSG_NOOP; + tx->tx_msg.ksm_zc_cookies[0] = 0; + tx->tx_msg.ksm_zc_cookies[1] = cookie; + + return tx; +} + + +void +ksocknal_free_tx(struct ksock_tx *tx) +{ + atomic_dec(&ksocknal_data.ksnd_nactive_txs); + + if (tx->tx_lnetmsg == NULL && tx->tx_desc_size == KSOCK_NOOP_TX_SIZE) { + /* it's a noop tx */ + spin_lock(&ksocknal_data.ksnd_tx_lock); + + list_add(&tx->tx_list, &ksocknal_data.ksnd_idle_noop_txs); + + spin_unlock(&ksocknal_data.ksnd_tx_lock); + } else { + LIBCFS_FREE(tx, tx->tx_desc_size); + } +} + +static int +ksocknal_send_hdr(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratch_iov) +{ + struct kvec *iov = &tx->tx_hdr; + int nob; + int rc; + + LASSERT(tx->tx_niov > 0); + + /* Never touch tx->tx_hdr inside ksocknal_lib_send_hdr() */ + rc = ksocknal_lib_send_hdr(conn, tx, scratch_iov); + + if (rc <= 0) /* sent nothing? */ + return rc; + + nob = rc; + LASSERT(nob <= tx->tx_resid); + tx->tx_resid -= nob; + + /* "consume" iov */ + LASSERT(tx->tx_niov == 1); + + if (nob < (int) iov->iov_len) { + iov->iov_base += nob; + iov->iov_len -= nob; + return rc; + } + + LASSERT(nob == iov->iov_len); + tx->tx_niov--; + + return rc; +} + +static int +ksocknal_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratch_iov) +{ + struct bio_vec *kiov = tx->tx_kiov; + int nob; + int rc; + + LASSERT(tx->tx_niov == 0); + LASSERT(tx->tx_nkiov > 0); + + /* Never touch tx->tx_kiov inside ksocknal_lib_send_kiov() */ + rc = ksocknal_lib_send_kiov(conn, tx, scratch_iov); + + if (rc <= 0) /* sent nothing? 
*/ + return rc; + + nob = rc; + LASSERT(nob <= tx->tx_resid); + tx->tx_resid -= nob; + + /* "consume" kiov */ + do { + LASSERT(tx->tx_nkiov > 0); + + if (nob < (int)kiov->bv_len) { + kiov->bv_offset += nob; + kiov->bv_len -= nob; + return rc; + } + + nob -= (int)kiov->bv_len; + tx->tx_kiov = ++kiov; + tx->tx_nkiov--; + } while (nob != 0); + + return rc; +} + +static int +ksocknal_transmit(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratch_iov) +{ + int rc; + int bufnob; + + if (ksocknal_data.ksnd_stall_tx != 0) + schedule_timeout_uninterruptible( + cfs_time_seconds(ksocknal_data.ksnd_stall_tx)); + + LASSERT(tx->tx_resid != 0); + + rc = ksocknal_connsock_addref(conn); + if (rc != 0) { + LASSERT(conn->ksnc_closing); + return -ESHUTDOWN; + } + + do { + if (ksocknal_data.ksnd_enomem_tx > 0) { + /* testing... */ + ksocknal_data.ksnd_enomem_tx--; + rc = -EAGAIN; + } else if (tx->tx_niov != 0) { + rc = ksocknal_send_hdr(conn, tx, scratch_iov); + } else { + rc = ksocknal_send_kiov(conn, tx, scratch_iov); + } + + bufnob = conn->ksnc_sock->sk->sk_wmem_queued; + if (rc > 0) /* sent something? */ + conn->ksnc_tx_bufnob += rc; /* account it */ + + if (bufnob < conn->ksnc_tx_bufnob) { + /* allocated send buffer bytes < computed; infer + * something got ACKed */ + conn->ksnc_tx_deadline = ktime_get_seconds() + + ksocknal_timeout(); + conn->ksnc_peer->ksnp_last_alive = ktime_get_seconds(); + conn->ksnc_tx_bufnob = bufnob; + smp_mb(); + } + + if (rc <= 0) { /* Didn't write anything? */ + /* some stacks return 0 instead of -EAGAIN */ + if (rc == 0) + rc = -EAGAIN; + + /* Check if EAGAIN is due to memory pressure */ + if (rc == -EAGAIN && ksocknal_lib_memory_pressure(conn)) + rc = -ENOMEM; + + break; + } + + /* socket's wmem_queued now includes 'rc' bytes */ + atomic_sub (rc, &conn->ksnc_tx_nob); + rc = 0; + + } while (tx->tx_resid != 0); + + ksocknal_connsock_decref(conn); + return rc; +} + +static int +ksocknal_recv_iov(struct ksock_conn *conn, struct kvec *scratchiov) +{ + struct kvec *iov = conn->ksnc_rx_iov; + int nob; + int rc; + + LASSERT(conn->ksnc_rx_niov > 0); + + /* Never touch conn->ksnc_rx_iov or change connection + * status inside ksocknal_lib_recv_iov */ + rc = ksocknal_lib_recv_iov(conn, scratchiov); + + if (rc <= 0) + return rc; + + /* received something... */ + nob = rc; + + conn->ksnc_peer->ksnp_last_alive = ktime_get_seconds(); + conn->ksnc_rx_deadline = ktime_get_seconds() + + ksocknal_timeout(); + smp_mb(); /* order with setting rx_started */ + conn->ksnc_rx_started = 1; + + conn->ksnc_rx_nob_wanted -= nob; + conn->ksnc_rx_nob_left -= nob; + + do { + LASSERT(conn->ksnc_rx_niov > 0); + + if (nob < (int)iov->iov_len) { + iov->iov_len -= nob; + iov->iov_base += nob; + return -EAGAIN; + } + + nob -= iov->iov_len; + conn->ksnc_rx_iov = ++iov; + conn->ksnc_rx_niov--; + } while (nob != 0); + + return rc; +} + +static int +ksocknal_recv_kiov(struct ksock_conn *conn, struct page **rx_scratch_pgs, + struct kvec *scratch_iov) +{ + struct bio_vec *kiov = conn->ksnc_rx_kiov; + int nob; + int rc; + + LASSERT(conn->ksnc_rx_nkiov > 0); + /* Never touch conn->ksnc_rx_kiov or change connection + * status inside ksocknal_lib_recv_iov */ + rc = ksocknal_lib_recv_kiov(conn, rx_scratch_pgs, scratch_iov); + + if (rc <= 0) + return rc; + + /* received something... 
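+ * (nob > 0): refresh the peer's last-alive stamp and push out the rx
+ * deadline before consuming the page frags below.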
*/ + nob = rc; + + conn->ksnc_peer->ksnp_last_alive = ktime_get_seconds(); + conn->ksnc_rx_deadline = ktime_get_seconds() + + ksocknal_timeout(); + smp_mb(); /* order with setting rx_started */ + conn->ksnc_rx_started = 1; + + conn->ksnc_rx_nob_wanted -= nob; + conn->ksnc_rx_nob_left -= nob; + + do { + LASSERT(conn->ksnc_rx_nkiov > 0); + + if (nob < (int) kiov->bv_len) { + kiov->bv_offset += nob; + kiov->bv_len -= nob; + return -EAGAIN; + } + + nob -= kiov->bv_len; + conn->ksnc_rx_kiov = ++kiov; + conn->ksnc_rx_nkiov--; + } while (nob != 0); + + return 1; +} + +static int +ksocknal_receive(struct ksock_conn *conn, struct page **rx_scratch_pgs, + struct kvec *scratch_iov) +{ + /* Return 1 on success, 0 on EOF, < 0 on error. + * Caller checks ksnc_rx_nob_wanted to determine + * progress/completion. */ + int rc; + ENTRY; + + if (ksocknal_data.ksnd_stall_rx != 0) + schedule_timeout_uninterruptible( + cfs_time_seconds(ksocknal_data.ksnd_stall_rx)); + + rc = ksocknal_connsock_addref(conn); + if (rc != 0) { + LASSERT(conn->ksnc_closing); + return -ESHUTDOWN; + } + + for (;;) { + if (conn->ksnc_rx_niov != 0) + rc = ksocknal_recv_iov(conn, scratch_iov); + else + rc = ksocknal_recv_kiov(conn, rx_scratch_pgs, + scratch_iov); + + if (rc <= 0) { + /* error/EOF or partial receive */ + if (rc == -EAGAIN) { + rc = 1; + } else if (rc == 0 && conn->ksnc_rx_started) { + /* EOF in the middle of a message */ + rc = -EPROTO; + } + break; + } + + /* Completed a fragment */ + + if (conn->ksnc_rx_nob_wanted == 0) { + rc = 1; + break; + } + } + + ksocknal_connsock_decref(conn); + RETURN(rc); +} + +void +ksocknal_tx_done(struct lnet_ni *ni, struct ksock_tx *tx, int rc) +{ + struct lnet_msg *lnetmsg = tx->tx_lnetmsg; + enum lnet_msg_hstatus hstatus = tx->tx_hstatus; + + LASSERT(ni != NULL || tx->tx_conn != NULL); + + if (!rc && (tx->tx_resid != 0 || tx->tx_zc_aborted)) { + rc = -EIO; + if (hstatus == LNET_MSG_STATUS_OK) + hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + } + + if (tx->tx_conn != NULL) + ksocknal_conn_decref(tx->tx_conn); + + ksocknal_free_tx(tx); + if (lnetmsg != NULL) { /* KSOCK_MSG_NOOP go without lnetmsg */ + lnetmsg->msg_health_status = hstatus; + lnet_finalize(lnetmsg, rc); + } +} + +void +ksocknal_txlist_done(struct lnet_ni *ni, struct list_head *txlist, int error) +{ + struct ksock_tx *tx; + + while ((tx = list_first_entry_or_null(txlist, struct ksock_tx, + tx_list)) != NULL) { + if (error && tx->tx_lnetmsg) { + CNETERR("Deleting packet type %d len %d %s->%s\n", + tx->tx_lnetmsg->msg_type, + tx->tx_lnetmsg->msg_len, + libcfs_nidstr(&tx->tx_lnetmsg->msg_initiator), + libcfs_nidstr(&tx->tx_lnetmsg->msg_target.nid)); + } else if (error) { + CNETERR("Deleting noop packet\n"); + } + + list_del(&tx->tx_list); + + if (tx->tx_hstatus == LNET_MSG_STATUS_OK) { + if (error == -ETIMEDOUT) + tx->tx_hstatus = + LNET_MSG_STATUS_LOCAL_TIMEOUT; + else if (error == -ENETDOWN || + error == -EHOSTUNREACH || + error == -ENETUNREACH || + error == -ECONNREFUSED || + error == -ECONNRESET) + tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_DROPPED; + /* + * for all other errors we don't want to + * retransmit + */ + else if (error) + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + } + + LASSERT(refcount_read(&tx->tx_refcount) == 1); + ksocknal_tx_done(ni, tx, error); + } +} + +static void +ksocknal_check_zc_req(struct ksock_tx *tx) +{ + struct ksock_conn *conn = tx->tx_conn; + struct ksock_peer_ni *peer_ni = conn->ksnc_peer; + + /* Set tx_msg.ksm_zc_cookies[0] to a unique non-zero cookie and add tx + * to ksnp_zc_req_list if some 
fragment of this message should be sent + * zero-copy. Our peer_ni will send an ACK containing this cookie when + * she has received this message to tell us we can signal completion. + * tx_msg.ksm_zc_cookies[0] remains non-zero while tx is on + * ksnp_zc_req_list. */ + LASSERT (tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); + LASSERT (tx->tx_zc_capable); + + tx->tx_zc_checked = 1; + + if (conn->ksnc_proto == &ksocknal_protocol_v1x || + !conn->ksnc_zc_capable) + return; + + /* assign cookie and queue tx to pending list, it will be released when + * a matching ack is received. See ksocknal_handle_zcack() */ + + ksocknal_tx_addref(tx); + + spin_lock(&peer_ni->ksnp_lock); + + /* ZC_REQ is going to be pinned to the peer_ni */ + tx->tx_deadline = ktime_get_seconds() + + ksocknal_timeout(); + + LASSERT (tx->tx_msg.ksm_zc_cookies[0] == 0); + + tx->tx_msg.ksm_zc_cookies[0] = peer_ni->ksnp_zc_next_cookie++; + + if (peer_ni->ksnp_zc_next_cookie == 0) + peer_ni->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1; + + list_add_tail(&tx->tx_zc_list, &peer_ni->ksnp_zc_req_list); + + spin_unlock(&peer_ni->ksnp_lock); +} + +static void +ksocknal_uncheck_zc_req(struct ksock_tx *tx) +{ + struct ksock_peer_ni *peer_ni = tx->tx_conn->ksnc_peer; + + LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); + LASSERT(tx->tx_zc_capable); + + tx->tx_zc_checked = 0; + + spin_lock(&peer_ni->ksnp_lock); + + if (tx->tx_msg.ksm_zc_cookies[0] == 0) { + /* Not waiting for an ACK */ + spin_unlock(&peer_ni->ksnp_lock); + return; + } + + tx->tx_msg.ksm_zc_cookies[0] = 0; + list_del(&tx->tx_zc_list); + + spin_unlock(&peer_ni->ksnp_lock); + + ksocknal_tx_decref(tx); +} + +static int +ksocknal_process_transmit(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratch_iov) +{ + int rc; + bool error_sim = false; + + if (lnet_send_error_simulation(tx->tx_lnetmsg, &tx->tx_hstatus)) { + error_sim = true; + rc = -EINVAL; + goto simulate_error; + } + + if (tx->tx_zc_capable && !tx->tx_zc_checked) + ksocknal_check_zc_req(tx); + + rc = ksocknal_transmit(conn, tx, scratch_iov); + + CDEBUG(D_NET, "send(%d) %d\n", tx->tx_resid, rc); + + if (tx->tx_resid == 0) { + /* Sent everything OK */ + LASSERT(rc == 0); + + return 0; + } + + if (rc == -EAGAIN) + return rc; + + if (rc == -ENOMEM) { + static int counter; + + counter++; /* exponential backoff warnings */ + if ((counter & (-counter)) == counter) + CWARN("%u ENOMEM tx %p (%lld allocated)\n", + counter, conn, libcfs_kmem_read()); + + /* Queue on ksnd_enomem_conns for retry after a timeout */ + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + + /* enomem list takes over scheduler's ref... 
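+ * the conn stays queued on ksnd_enomem_conns and the reaper retries
+ * the send after SOCKNAL_ENOMEM_RETRY seconds.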
*/ + LASSERT(conn->ksnc_tx_scheduled); + list_add_tail(&conn->ksnc_tx_list, + &ksocknal_data.ksnd_enomem_conns); + if (ktime_get_seconds() + SOCKNAL_ENOMEM_RETRY < + ksocknal_data.ksnd_reaper_waketime) + wake_up(&ksocknal_data.ksnd_reaper_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + /* + * set the health status of the message which determines + * whether we should retry the transmit + */ + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + return (rc); + } + +simulate_error: + + /* Actual error */ + LASSERT(rc < 0); + + if (!error_sim) { + /* + * set the health status of the message which determines + * whether we should retry the transmit + */ + if (rc == -ETIMEDOUT) + tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_TIMEOUT; + else + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + } + + if (!conn->ksnc_closing) { + switch (rc) { + case -ECONNRESET: + LCONSOLE_WARN("Host %pIS reset our connection while we were sending data; it may have rebooted.\n", + &conn->ksnc_peeraddr); + break; + default: + LCONSOLE_WARN("There was an unexpected network error while writing to %pIS: %d.\n", + &conn->ksnc_peeraddr, rc); + break; + } + CDEBUG(D_NET, "[%p] Error %d on write to %s ip %pISp\n", + conn, rc, libcfs_idstr(&conn->ksnc_peer->ksnp_id), + &conn->ksnc_peeraddr); + } + + if (tx->tx_zc_checked) + ksocknal_uncheck_zc_req(tx); + + /* it's not an error if conn is being closed */ + ksocknal_close_conn_and_siblings(conn, + (conn->ksnc_closing) ? 0 : rc); + + return rc; +} + +static void +ksocknal_launch_connection_locked(struct ksock_conn_cb *conn_cb) +{ + /* called holding write lock on ksnd_global_lock */ + + LASSERT(!conn_cb->ksnr_scheduled); + LASSERT(!conn_cb->ksnr_connecting); + LASSERT((ksocknal_conn_cb_mask() & ~conn_cb->ksnr_connected) != 0); + + /* scheduling conn for connd */ + conn_cb->ksnr_scheduled = 1; + + /* extra ref for connd */ + ksocknal_conn_cb_addref(conn_cb); + + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + + list_add_tail(&conn_cb->ksnr_connd_list, + &ksocknal_data.ksnd_connd_routes); + wake_up(&ksocknal_data.ksnd_connd_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); +} + +void +ksocknal_launch_all_connections_locked(struct ksock_peer_ni *peer_ni) +{ + struct ksock_conn_cb *conn_cb; + + /* called holding write lock on ksnd_global_lock */ + for (;;) { + /* launch any/all connections that need it */ + conn_cb = ksocknal_find_connectable_conn_cb_locked(peer_ni); + if (conn_cb == NULL) + return; + + ksocknal_launch_connection_locked(conn_cb); + } +} + +struct ksock_conn * +ksocknal_find_conn_locked(struct ksock_peer_ni *peer_ni, struct ksock_tx *tx, int nonblk) +{ + struct ksock_conn *c; + struct ksock_conn *conn; + struct ksock_conn *typed = NULL; + struct ksock_conn *fallback = NULL; + int tnob = 0; + int fnob = 0; + + list_for_each_entry(c, &peer_ni->ksnp_conns, ksnc_list) { + int nob = atomic_read(&c->ksnc_tx_nob) + + c->ksnc_sock->sk->sk_wmem_queued; + int rc; + + LASSERT (!c->ksnc_closing); + LASSERT (c->ksnc_proto != NULL && + c->ksnc_proto->pro_match_tx != NULL); + + rc = c->ksnc_proto->pro_match_tx(c, tx, nonblk); + + switch (rc) { + default: + LBUG(); + case SOCKNAL_MATCH_NO: /* protocol rejected the tx */ + continue; + + case SOCKNAL_MATCH_YES: /* typed connection */ + if (typed == NULL || tnob > nob || + (tnob == nob && *ksocknal_tunables.ksnd_round_robin && + typed->ksnc_tx_last_post > c->ksnc_tx_last_post)) { + typed = c; + tnob = nob; + } + break; + + case SOCKNAL_MATCH_MAY: /* fallback connection */ + if (fallback == NULL || fnob > nob || + (fnob 
== nob && *ksocknal_tunables.ksnd_round_robin &&
+ fallback->ksnc_tx_last_post > c->ksnc_tx_last_post)) {
+ fallback = c;
+ fnob = nob;
+ }
+ break;
+ }
+ }
+
+ /* prefer the typed selection */
+ conn = (typed != NULL) ? typed : fallback;
+
+ if (conn != NULL)
+ conn->ksnc_tx_last_post = ktime_get_seconds();
+
+ return conn;
+}
+
+void
+ksocknal_tx_prep(struct ksock_conn *conn, struct ksock_tx *tx)
+{
+ conn->ksnc_proto->pro_pack(tx);
+
+ atomic_add (tx->tx_nob, &conn->ksnc_tx_nob);
+ ksocknal_conn_addref(conn); /* +1 ref for tx */
+ tx->tx_conn = conn;
+}
+
+void
+ksocknal_queue_tx_locked(struct ksock_tx *tx, struct ksock_conn *conn)
+{
+ struct ksock_sched *sched = conn->ksnc_scheduler;
+ struct ksock_msg *msg = &tx->tx_msg;
+ struct ksock_tx *ztx = NULL;
+ int bufnob = 0;
+
+ /* called holding global lock (read or irq-write) and caller may
+ * not have dropped this lock between finding conn and calling me,
+ * so we don't need the {get,put}connsock dance to deref
+ * ksnc_sock... */
+ LASSERT(!conn->ksnc_closing);
+
+ CDEBUG(D_NET, "Sending to %s ip %pISp\n",
+ libcfs_idstr(&conn->ksnc_peer->ksnp_id),
+ &conn->ksnc_peeraddr);
+
+ ksocknal_tx_prep(conn, tx);
+
+ /* Ensure the frags we've been given EXACTLY match the number of
+ * bytes we want to send. Many TCP/IP stacks disregard any total
+ * size parameters passed to them and just look at the frags.
+ *
+ * We always expect at least 1 mapped fragment containing the
+ * complete ksocknal message header.
+ */
+ LASSERT(lnet_iov_nob(tx->tx_niov, &tx->tx_hdr) +
+ lnet_kiov_nob(tx->tx_nkiov, tx->tx_kiov) ==
+ (unsigned int)tx->tx_nob);
+ LASSERT(tx->tx_niov >= 1);
+ LASSERT(tx->tx_resid == tx->tx_nob);
+
+ CDEBUG(D_NET, "Packet %p type %d, nob %d niov %d nkiov %d\n",
+ tx, tx->tx_lnetmsg ? tx->tx_lnetmsg->msg_type : KSOCK_MSG_NOOP,
+ tx->tx_nob, tx->tx_niov, tx->tx_nkiov);
+
+ bufnob = conn->ksnc_sock->sk->sk_wmem_queued;
+ spin_lock_bh(&sched->kss_lock);
+
+ if (list_empty(&conn->ksnc_tx_queue) && bufnob == 0) {
+ /* First packet starts the timeout */
+ conn->ksnc_tx_deadline = ktime_get_seconds() +
+ ksocknal_timeout();
+ if (conn->ksnc_tx_bufnob > 0) /* something got ACKed */
+ conn->ksnc_peer->ksnp_last_alive = ktime_get_seconds();
+ conn->ksnc_tx_bufnob = 0;
+ smp_mb(); /* order with adding to tx_queue */
+ }
+
+ if (msg->ksm_type == KSOCK_MSG_NOOP) {
+ /* The packet is a noop ZC ACK; try to piggyback the ack_cookie
+ * on a normal packet so I don't need to send it */
+ LASSERT (msg->ksm_zc_cookies[1] != 0);
+ LASSERT (conn->ksnc_proto->pro_queue_tx_zcack != NULL);
+
+ if (conn->ksnc_proto->pro_queue_tx_zcack(conn, tx, 0))
+ ztx = tx; /* ZC ACK piggybacked on ztx; release tx later */
+
+ } else {
+ /* It's a normal packet - can it piggyback a noop zc-ack that
+ * has been queued already? 
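+ * pro_queue_tx_msg() returns any noop tx it displaces, which is
+ * then retired via the zombie list below.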
*/
+ LASSERT (msg->ksm_zc_cookies[1] == 0);
+ LASSERT (conn->ksnc_proto->pro_queue_tx_msg != NULL);
+
+ ztx = conn->ksnc_proto->pro_queue_tx_msg(conn, tx);
+ /* ztx will be released later */
+ }
+
+ if (ztx != NULL) {
+ atomic_sub (ztx->tx_nob, &conn->ksnc_tx_nob);
+ list_add_tail(&ztx->tx_list, &sched->kss_zombie_noop_txs);
+ }
+
+ if (conn->ksnc_tx_ready && /* able to send */
+ !conn->ksnc_tx_scheduled) { /* not scheduled to send */
+ /* +1 ref for scheduler */
+ ksocknal_conn_addref(conn);
+ list_add_tail(&conn->ksnc_tx_list,
+ &sched->kss_tx_conns);
+ conn->ksnc_tx_scheduled = 1;
+ wake_up(&sched->kss_waitq);
+ }
+
+ spin_unlock_bh(&sched->kss_lock);
+}
+
+
+struct ksock_conn_cb *
+ksocknal_find_connectable_conn_cb_locked(struct ksock_peer_ni *peer_ni)
+{
+ time64_t now = ktime_get_seconds();
+ struct ksock_conn_cb *conn_cb;
+
+ conn_cb = peer_ni->ksnp_conn_cb;
+ if (!conn_cb)
+ return NULL;
+
+ LASSERT(!conn_cb->ksnr_connecting || conn_cb->ksnr_scheduled);
+
+ if (conn_cb->ksnr_scheduled) /* connections being established */
+ return NULL;
+
+ /* all conn types connected ? */
+ if ((ksocknal_conn_cb_mask() & ~conn_cb->ksnr_connected) == 0)
+ return NULL;
+
+ if (!(conn_cb->ksnr_retry_interval == 0 || /* first attempt */
+ now >= conn_cb->ksnr_timeout)) {
+ CDEBUG(D_NET,
+ "Too soon to retry route %pIS (cnted %d, interval %lld, %lld secs later)\n",
+ &conn_cb->ksnr_addr,
+ conn_cb->ksnr_connected,
+ conn_cb->ksnr_retry_interval,
+ conn_cb->ksnr_timeout - now);
+ return NULL;
+ }
+
+ return conn_cb;
+}
+
+struct ksock_conn_cb *
+ksocknal_find_connecting_conn_cb_locked(struct ksock_peer_ni *peer_ni)
+{
+ struct ksock_conn_cb *conn_cb;
+
+ conn_cb = peer_ni->ksnp_conn_cb;
+ if (!conn_cb)
+ return NULL;
+
+ LASSERT(!conn_cb->ksnr_connecting || conn_cb->ksnr_scheduled);
+
+ return conn_cb->ksnr_scheduled ? conn_cb : NULL;
+}
+
+int
+ksocknal_launch_packet(struct lnet_ni *ni, struct ksock_tx *tx,
+ struct lnet_processid *id)
+{
+ struct ksock_peer_ni *peer_ni;
+ struct ksock_conn *conn;
+ struct sockaddr_in sa;
+ rwlock_t *g_lock;
+ int retry;
+ int rc;
+
+ LASSERT(tx->tx_conn == NULL);
+
+ g_lock = &ksocknal_data.ksnd_global_lock;
+
+ for (retry = 0;; retry = 1) {
+ read_lock(g_lock);
+ peer_ni = ksocknal_find_peer_locked(ni, id);
+ if (peer_ni != NULL) {
+ if (ksocknal_find_connectable_conn_cb_locked(peer_ni) == NULL) {
+ conn = ksocknal_find_conn_locked(peer_ni, tx, tx->tx_nonblk);
+ if (conn != NULL) {
+ /* I've got nothing that needs to be
+ * connecting and I do have an actual
+ * connection...
+ */
+ ksocknal_queue_tx_locked(tx, conn);
+ read_unlock(g_lock);
+ return 0;
+ }
+ }
+ }
+
+ /* I'll need a write lock... 
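+ * to add the peer_ni and/or launch its connections.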
*/ + read_unlock(g_lock); + + write_lock_bh(g_lock); + + peer_ni = ksocknal_find_peer_locked(ni, id); + if (peer_ni != NULL) + break; + + write_unlock_bh(g_lock); + + if ((id->pid & LNET_PID_USERFLAG) != 0) { + CERROR("Refusing to create a connection to userspace process %s\n", + libcfs_idstr(id)); + return -EHOSTUNREACH; + } + + if (retry) { + CERROR("Can't find peer_ni %s\n", libcfs_idstr(id)); + return -EHOSTUNREACH; + } + + memset(&sa, 0, sizeof(sa)); + sa.sin_family = AF_INET; + sa.sin_addr.s_addr = id->nid.nid_addr[0]; + sa.sin_port = htons(lnet_acceptor_port()); + { + struct lnet_process_id id4 = { + .pid = id->pid, + .nid = lnet_nid_to_nid4(&id->nid), + }; + rc = ksocknal_add_peer(ni, id4, (struct sockaddr *)&sa); + } + if (rc != 0) { + CERROR("Can't add peer_ni %s: %d\n", + libcfs_idstr(id), rc); + return rc; + } + } + + ksocknal_launch_all_connections_locked(peer_ni); + + conn = ksocknal_find_conn_locked(peer_ni, tx, tx->tx_nonblk); + if (conn != NULL) { + /* Connection exists; queue message on it */ + ksocknal_queue_tx_locked (tx, conn); + write_unlock_bh(g_lock); + return (0); + } + + if (peer_ni->ksnp_accepting > 0 || + ksocknal_find_connecting_conn_cb_locked(peer_ni) != NULL) { + /* the message is going to be pinned to the peer_ni */ + tx->tx_deadline = ktime_get_seconds() + + ksocknal_timeout(); + + /* Queue the message until a connection is established */ + list_add_tail(&tx->tx_list, &peer_ni->ksnp_tx_queue); + write_unlock_bh(g_lock); + return 0; + } + + write_unlock_bh(g_lock); + + /* NB Routes may be ignored if connections to them failed recently */ + CNETERR("No usable routes to %s\n", libcfs_idstr(id)); + tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_ERROR; + return (-EHOSTUNREACH); +} + +int +ksocknal_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) +{ + /* '1' for consistency with code that checks !mpflag to restore */ + unsigned int mpflag = 1; + int type = lntmsg->msg_type; + struct lnet_processid *target = &lntmsg->msg_target; + unsigned int payload_niov = lntmsg->msg_niov; + struct bio_vec *payload_kiov = lntmsg->msg_kiov; + unsigned int payload_offset = lntmsg->msg_offset; + unsigned int payload_nob = lntmsg->msg_len; + struct ksock_tx *tx; + int desc_size; + int rc; + + /* NB 'private' is different depending on what we're sending. + * Just ignore it... 
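+ * (on the receive path, by contrast, private is the struct ksock_conn;
+ * see ksocknal_recv()).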
+ */ + + CDEBUG(D_NET, "sending %u bytes in %d frags to %s\n", + payload_nob, payload_niov, libcfs_idstr(target)); + + LASSERT (payload_nob == 0 || payload_niov > 0); + LASSERT (payload_niov <= LNET_MAX_IOV); + LASSERT (!in_interrupt ()); + + desc_size = offsetof(struct ksock_tx, + tx_payload[payload_niov]); + + if (lntmsg->msg_vmflush) + mpflag = memalloc_noreclaim_save(); + + tx = ksocknal_alloc_tx(KSOCK_MSG_LNET, desc_size); + if (tx == NULL) { + CERROR("Can't allocate tx desc type %d size %d\n", + type, desc_size); + if (lntmsg->msg_vmflush) + memalloc_noreclaim_restore(mpflag); + return -ENOMEM; + } + + tx->tx_conn = NULL; /* set when assigned a conn */ + tx->tx_lnetmsg = lntmsg; + + tx->tx_niov = 1; + tx->tx_kiov = tx->tx_payload; + tx->tx_nkiov = lnet_extract_kiov(payload_niov, tx->tx_kiov, + payload_niov, payload_kiov, + payload_offset, payload_nob); + + if (payload_nob >= *ksocknal_tunables.ksnd_zc_min_payload) + tx->tx_zc_capable = 1; + + tx->tx_msg.ksm_csum = 0; + tx->tx_msg.ksm_type = KSOCK_MSG_LNET; + tx->tx_msg.ksm_zc_cookies[0] = 0; + tx->tx_msg.ksm_zc_cookies[1] = 0; + + /* The first fragment will be set later in pro_pack */ + rc = ksocknal_launch_packet(ni, tx, target); + /* + * We can't test lntsmg->msg_vmflush again as lntmsg may + * have been freed. + */ + if (!mpflag) + memalloc_noreclaim_restore(mpflag); + + if (rc == 0) + return (0); + + lntmsg->msg_health_status = tx->tx_hstatus; + ksocknal_free_tx(tx); + return -EIO; +} + +void +ksocknal_thread_fini (void) +{ + if (atomic_dec_and_test(&ksocknal_data.ksnd_nthreads)) + wake_up_var(&ksocknal_data.ksnd_nthreads); +} + +int +ksocknal_new_packet(struct ksock_conn *conn, int nob_to_skip) +{ + static char ksocknal_slop_buffer[4096]; + int nob; + unsigned int niov; + int skipped; + + LASSERT(conn->ksnc_proto != NULL); + + if ((*ksocknal_tunables.ksnd_eager_ack & conn->ksnc_type) != 0) { + /* Remind the socket to ack eagerly... */ + ksocknal_lib_eager_ack(conn); + } + + if (nob_to_skip == 0) { /* right at next packet boundary now */ + conn->ksnc_rx_started = 0; + smp_mb(); /* racing with timeout thread */ + + switch (conn->ksnc_proto->pro_version) { + case KSOCK_PROTO_V2: + case KSOCK_PROTO_V3: + conn->ksnc_rx_state = SOCKNAL_RX_KSM_HEADER; + conn->ksnc_rx_iov = (struct kvec *)&conn->ksnc_rx_iov_space; + conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_msg; + + conn->ksnc_rx_nob_wanted = sizeof(struct ksock_msg_hdr); + conn->ksnc_rx_nob_left = sizeof(struct ksock_msg_hdr); + conn->ksnc_rx_iov[0].iov_len = + sizeof(struct ksock_msg_hdr); + break; + + case KSOCK_PROTO_V1: + /* Receiving bare struct lnet_hdr_nid4 */ + conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER; + conn->ksnc_rx_nob_wanted = sizeof(struct lnet_hdr_nid4); + conn->ksnc_rx_nob_left = sizeof(struct lnet_hdr_nid4); + + conn->ksnc_rx_iov = (struct kvec *)&conn->ksnc_rx_iov_space; + conn->ksnc_rx_iov[0].iov_base = + (void *)&conn->ksnc_msg.ksm_u.lnetmsg_nid4; + conn->ksnc_rx_iov[0].iov_len = + sizeof(struct lnet_hdr_nid4); + break; + + default: + LBUG(); + } + conn->ksnc_rx_niov = 1; + + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_nkiov = 0; + conn->ksnc_rx_csum = ~0; + return (1); + } + + /* Set up to skip as much as possible now. 
If there's more left + * (ran out of iov entries) we'll get called again */ + + conn->ksnc_rx_state = SOCKNAL_RX_SLOP; + conn->ksnc_rx_nob_left = nob_to_skip; + conn->ksnc_rx_iov = (struct kvec *)&conn->ksnc_rx_iov_space; + skipped = 0; + niov = 0; + + do { + nob = min_t(int, nob_to_skip, sizeof(ksocknal_slop_buffer)); + + conn->ksnc_rx_iov[niov].iov_base = ksocknal_slop_buffer; + conn->ksnc_rx_iov[niov].iov_len = nob; + niov++; + skipped += nob; + nob_to_skip -= nob; + + } while (nob_to_skip != 0 && /* mustn't overflow conn's rx iov */ + niov < sizeof(conn->ksnc_rx_iov_space) / sizeof(struct kvec)); + + conn->ksnc_rx_niov = niov; + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_nkiov = 0; + conn->ksnc_rx_nob_wanted = skipped; + return (0); +} + +static int +ksocknal_process_receive(struct ksock_conn *conn, + struct page **rx_scratch_pgs, + struct kvec *scratch_iov) +{ + struct _lnet_hdr_nid4 *lhdr; + struct lnet_processid *id; + struct lnet_hdr hdr; + int rc; + + LASSERT(refcount_read(&conn->ksnc_conn_refcount) > 0); + + /* NB: sched lock NOT held */ + /* SOCKNAL_RX_LNET_HEADER is here for backward compatibility */ + LASSERT(conn->ksnc_rx_state == SOCKNAL_RX_KSM_HEADER || + conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD || + conn->ksnc_rx_state == SOCKNAL_RX_LNET_HEADER || + conn->ksnc_rx_state == SOCKNAL_RX_SLOP); + again: + if (conn->ksnc_rx_nob_wanted != 0) { + rc = ksocknal_receive(conn, rx_scratch_pgs, + scratch_iov); + + if (rc <= 0) { + struct lnet_processid *ksnp_id; + + ksnp_id = &conn->ksnc_peer->ksnp_id; + + LASSERT(rc != -EAGAIN); + if (rc == 0) + CDEBUG(D_NET, "[%p] EOF from %s ip %pISp\n", + conn, libcfs_idstr(ksnp_id), + &conn->ksnc_peeraddr); + else if (!conn->ksnc_closing) + CERROR("[%p] Error %d on read from %s ip %pISp\n", + conn, rc, libcfs_idstr(ksnp_id), + &conn->ksnc_peeraddr); + + /* it's not an error if conn is being closed */ + ksocknal_close_conn_and_siblings (conn, + (conn->ksnc_closing) ? 0 : rc); + return (rc == 0 ? 
-ESHUTDOWN : rc); + } + + if (conn->ksnc_rx_nob_wanted != 0) { + /* short read */ + return (-EAGAIN); + } + } + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_KSM_HEADER: + if (conn->ksnc_flip) { + __swab32s(&conn->ksnc_msg.ksm_type); + __swab32s(&conn->ksnc_msg.ksm_csum); + __swab64s(&conn->ksnc_msg.ksm_zc_cookies[0]); + __swab64s(&conn->ksnc_msg.ksm_zc_cookies[1]); + } + + if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP && + conn->ksnc_msg.ksm_csum != 0 && /* has checksum */ + conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) { + /* NOOP Checksum error */ + CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n", + libcfs_idstr(&conn->ksnc_peer->ksnp_id), + conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum); + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings(conn, -EPROTO); + return (-EIO); + } + + if (conn->ksnc_msg.ksm_zc_cookies[1] != 0) { + __u64 cookie = 0; + + LASSERT(conn->ksnc_proto != &ksocknal_protocol_v1x); + + if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP) + cookie = conn->ksnc_msg.ksm_zc_cookies[0]; + + rc = conn->ksnc_proto->pro_handle_zcack( + conn, cookie, conn->ksnc_msg.ksm_zc_cookies[1]); + + if (rc != 0) { + CERROR("%s: Unknown ZC-ACK cookie: %llu, %llu\n", + libcfs_idstr(&conn->ksnc_peer->ksnp_id), + cookie, + conn->ksnc_msg.ksm_zc_cookies[1]); + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings(conn, -EPROTO); + return rc; + } + } + + switch (conn->ksnc_msg.ksm_type) { + case KSOCK_MSG_NOOP: + ksocknal_new_packet(conn, 0); + return 0; /* NOOP is done and just return */ + + case KSOCK_MSG_LNET: + + conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER; + conn->ksnc_rx_nob_wanted = sizeof(struct lnet_hdr_nid4); + conn->ksnc_rx_nob_left = sizeof(struct lnet_hdr_nid4); + + conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov; + conn->ksnc_rx_iov[0].iov_base = + (void *)&conn->ksnc_msg.ksm_u.lnetmsg_nid4; + conn->ksnc_rx_iov[0].iov_len = + sizeof(struct lnet_hdr_nid4); + + conn->ksnc_rx_niov = 1; + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_nkiov = 0; + + goto again; /* read lnet header now */ + + default: + CERROR("%s: Unknown message type: %x\n", + libcfs_idstr(&conn->ksnc_peer->ksnp_id), + conn->ksnc_msg.ksm_type); + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings(conn, -EPROTO); + return -EPROTO; + } + + case SOCKNAL_RX_LNET_HEADER: + /* unpack message header */ + conn->ksnc_proto->pro_unpack(&conn->ksnc_msg, &hdr); + + if ((conn->ksnc_peer->ksnp_id.pid & LNET_PID_USERFLAG) != 0) { + /* Userspace peer_ni */ + id = &conn->ksnc_peer->ksnp_id; + + /* Substitute process ID assigned at connection time */ + hdr.src_pid = id->pid; + hdr.src_nid = id->nid; + } + + conn->ksnc_rx_state = SOCKNAL_RX_PARSE; + ksocknal_conn_addref(conn); /* ++ref while parsing */ + + + rc = lnet_parse(conn->ksnc_peer->ksnp_ni, + &hdr, + &conn->ksnc_peer->ksnp_id.nid, + conn, 0); + if (rc < 0) { + /* I just received garbage: give up on this conn */ + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings(conn, rc); + ksocknal_conn_decref(conn); + return (-EPROTO); + } + + /* I'm racing with ksocknal_recv() */ + LASSERT(conn->ksnc_rx_state == SOCKNAL_RX_PARSE || + conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD); + + if (conn->ksnc_rx_state != SOCKNAL_RX_LNET_PAYLOAD) + return 0; + + /* ksocknal_recv() got called */ + goto again; + + case SOCKNAL_RX_LNET_PAYLOAD: + /* payload all received */ + rc = 0; + + if (conn->ksnc_rx_nob_left == 0 && /* not truncating */ + conn->ksnc_msg.ksm_csum != 0 && /* has checksum */ + conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) { + 
CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n", + libcfs_idstr(&conn->ksnc_peer->ksnp_id), + conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum); + rc = -EIO; + } + + if (rc == 0 && conn->ksnc_msg.ksm_zc_cookies[0] != 0) { + LASSERT(conn->ksnc_proto != &ksocknal_protocol_v1x); + + lhdr = (void *)&conn->ksnc_msg.ksm_u.lnetmsg_nid4; + id = &conn->ksnc_peer->ksnp_id; + + rc = conn->ksnc_proto->pro_handle_zcreq( + conn, + conn->ksnc_msg.ksm_zc_cookies[0], + *ksocknal_tunables.ksnd_nonblk_zcack || + le64_to_cpu(lhdr->src_nid) != + lnet_nid_to_nid4(&id->nid)); + } + + if (rc && conn->ksnc_lnet_msg) + conn->ksnc_lnet_msg->msg_health_status = + LNET_MSG_STATUS_REMOTE_ERROR; + lnet_finalize(conn->ksnc_lnet_msg, rc); + + if (rc != 0) { + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings(conn, rc); + return (-EPROTO); + } + fallthrough; + + case SOCKNAL_RX_SLOP: + /* starting new packet? */ + if (ksocknal_new_packet(conn, conn->ksnc_rx_nob_left)) + return 0; /* come back later */ + goto again; /* try to finish reading slop now */ + + default: + break; + } + + /* Not Reached */ + LBUG (); + return (-EINVAL); /* keep gcc happy */ +} + +int +ksocknal_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, + int delayed, unsigned int niov, + struct bio_vec *kiov, unsigned int offset, unsigned int mlen, + unsigned int rlen) +{ + struct ksock_conn *conn = private; + struct ksock_sched *sched = conn->ksnc_scheduler; + + LASSERT (mlen <= rlen); + LASSERT (niov <= LNET_MAX_IOV); + + conn->ksnc_lnet_msg = msg; + conn->ksnc_rx_nob_wanted = mlen; + conn->ksnc_rx_nob_left = rlen; + + if (mlen == 0) { + conn->ksnc_rx_nkiov = 0; + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov; + conn->ksnc_rx_niov = 0; + } else { + conn->ksnc_rx_niov = 0; + conn->ksnc_rx_iov = NULL; + conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov; + conn->ksnc_rx_nkiov = + lnet_extract_kiov(LNET_MAX_IOV, conn->ksnc_rx_kiov, + niov, kiov, offset, mlen); + } + + LASSERT (mlen == + lnet_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) + + lnet_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov)); + + LASSERT (conn->ksnc_rx_scheduled); + + spin_lock_bh(&sched->kss_lock); + + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_PARSE_WAIT: + list_add_tail(&conn->ksnc_rx_list, &sched->kss_rx_conns); + wake_up(&sched->kss_waitq); + LASSERT(conn->ksnc_rx_ready); + break; + + case SOCKNAL_RX_PARSE: + /* scheduler hasn't noticed I'm parsing yet */ + break; + } + + conn->ksnc_rx_state = SOCKNAL_RX_LNET_PAYLOAD; + + spin_unlock_bh(&sched->kss_lock); + ksocknal_conn_decref(conn); + return 0; +} + +static inline int +ksocknal_sched_cansleep(struct ksock_sched *sched) +{ + int rc; + + spin_lock_bh(&sched->kss_lock); + + rc = (!ksocknal_data.ksnd_shuttingdown && + list_empty(&sched->kss_rx_conns) && + list_empty(&sched->kss_tx_conns)); + + spin_unlock_bh(&sched->kss_lock); + return rc; +} + +int ksocknal_scheduler(void *arg) +{ + struct ksock_sched *sched; + struct ksock_conn *conn; + struct ksock_tx *tx; + int rc; + long id = (long)arg; + struct page **rx_scratch_pgs; + struct kvec *scratch_iov; + + sched = ksocknal_data.ksnd_schedulers[KSOCK_THREAD_CPT(id)]; + + LIBCFS_CPT_ALLOC(rx_scratch_pgs, lnet_cpt_table(), sched->kss_cpt, + sizeof(*rx_scratch_pgs) * LNET_MAX_IOV); + if (!rx_scratch_pgs) { + CERROR("Unable to allocate scratch pages\n"); + return -ENOMEM; + } + + LIBCFS_CPT_ALLOC(scratch_iov, lnet_cpt_table(), sched->kss_cpt, + sizeof(*scratch_iov) * LNET_MAX_IOV); + if (!scratch_iov) { + CERROR("Unable to 
allocate scratch iov\n");
+ CFS_FREE_PTR_ARRAY(rx_scratch_pgs, LNET_MAX_IOV);
+ return -ENOMEM;
+ }
+
+ rc = cfs_cpt_bind(lnet_cpt_table(), sched->kss_cpt);
+ if (rc != 0) {
+ CWARN("Can't set CPU partition affinity to %d: %d\n",
+ sched->kss_cpt, rc);
+ }
+
+ spin_lock_bh(&sched->kss_lock);
+
+ while (!ksocknal_data.ksnd_shuttingdown) {
+ bool did_something = false;
+
+ /* Ensure I progress everything semi-fairly */
+ conn = list_first_entry_or_null(&sched->kss_rx_conns,
+ struct ksock_conn,
+ ksnc_rx_list);
+ if (conn) {
+ list_del(&conn->ksnc_rx_list);
+
+ LASSERT(conn->ksnc_rx_scheduled);
+ LASSERT(conn->ksnc_rx_ready);
+
+ /* clear rx_ready in case receive isn't complete.
+ * Do it BEFORE we call process_recv, since
+ * data_ready can set it any time after we release
+ * kss_lock. */
+ conn->ksnc_rx_ready = 0;
+ spin_unlock_bh(&sched->kss_lock);
+
+ rc = ksocknal_process_receive(conn, rx_scratch_pgs,
+ scratch_iov);
+
+ spin_lock_bh(&sched->kss_lock);
+
+ /* I'm the only one that can clear this flag */
+ LASSERT(conn->ksnc_rx_scheduled);
+
+ /* Did process_receive get everything it wanted? */
+ if (rc == 0)
+ conn->ksnc_rx_ready = 1;
+
+ if (conn->ksnc_rx_state == SOCKNAL_RX_PARSE) {
+ /* Conn blocked waiting for ksocknal_recv()
+ * I change its state (under lock) to signal
+ * it can be rescheduled */
+ conn->ksnc_rx_state = SOCKNAL_RX_PARSE_WAIT;
+ } else if (conn->ksnc_rx_ready) {
+ /* reschedule for rx */
+ list_add_tail(&conn->ksnc_rx_list,
+ &sched->kss_rx_conns);
+ } else {
+ conn->ksnc_rx_scheduled = 0;
+ /* drop my ref */
+ ksocknal_conn_decref(conn);
+ }
+
+ did_something = true;
+ }
+
+ if (!list_empty(&sched->kss_tx_conns)) {
+ LIST_HEAD(zlist);
+
+ list_splice_init(&sched->kss_zombie_noop_txs, &zlist);
+
+ conn = list_first_entry(&sched->kss_tx_conns,
+ struct ksock_conn,
+ ksnc_tx_list);
+ list_del(&conn->ksnc_tx_list);
+
+ LASSERT(conn->ksnc_tx_scheduled);
+ LASSERT(conn->ksnc_tx_ready);
+ LASSERT(!list_empty(&conn->ksnc_tx_queue));
+
+ tx = list_first_entry(&conn->ksnc_tx_queue,
+ struct ksock_tx, tx_list);
+
+ if (conn->ksnc_tx_carrier == tx)
+ ksocknal_next_tx_carrier(conn);
+
+ /* dequeue now so empty list => more to send */
+ list_del(&tx->tx_list);
+
+ /* Clear tx_ready in case send isn't complete. Do
+ * it BEFORE we call process_transmit, since
+ * write_space can set it any time after we release
+ * kss_lock. */
+ conn->ksnc_tx_ready = 0;
+ spin_unlock_bh(&sched->kss_lock);
+
+ if (!list_empty(&zlist)) {
+ /* free zombie noop txs, it's fast because
+ * noop txs are just put in freelist */
+ ksocknal_txlist_done(NULL, &zlist, 0);
+ }
+
+ rc = ksocknal_process_transmit(conn, tx, scratch_iov);
+
+ if (rc == -ENOMEM || rc == -EAGAIN) {
+ /* Incomplete send: replace tx on HEAD of tx_queue */
+ spin_lock_bh(&sched->kss_lock);
+ list_add(&tx->tx_list,
+ &conn->ksnc_tx_queue);
+ } else {
+ /* Complete send; tx -ref */
+ ksocknal_tx_decref(tx);
+
+ spin_lock_bh(&sched->kss_lock);
+ /* assume space for more */
+ conn->ksnc_tx_ready = 1;
+ }
+
+ if (rc == -ENOMEM) {
+ /* Do nothing; after a short timeout, this
+ * conn will be reposted on kss_tx_conns. */
+ } else if (conn->ksnc_tx_ready &&
+ !list_empty(&conn->ksnc_tx_queue)) {
+ /* reschedule for tx */
+ list_add_tail(&conn->ksnc_tx_list,
+ &sched->kss_tx_conns);
+ } else {
+ conn->ksnc_tx_scheduled = 0;
+ /* drop my ref */
+ ksocknal_conn_decref(conn);
+ }
+
+ did_something = true;
+ }
+ if (!did_something || /* nothing to do */
+ need_resched()) { /* hogging CPU? 
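Either way, drop kss_lock: sleep on kss_waitq if there was nothing to do, or cond_resched() if this thread has been monopolizing the CPU.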
*/ + spin_unlock_bh(&sched->kss_lock); + + if (!did_something) { /* wait for something to do */ + rc = wait_event_interruptible_exclusive( + sched->kss_waitq, + !ksocknal_sched_cansleep(sched)); + LASSERT (rc == 0); + } else { + cond_resched(); + } + + spin_lock_bh(&sched->kss_lock); + } + } + + spin_unlock_bh(&sched->kss_lock); + CFS_FREE_PTR_ARRAY(rx_scratch_pgs, LNET_MAX_IOV); + CFS_FREE_PTR_ARRAY(scratch_iov, LNET_MAX_IOV); + ksocknal_thread_fini(); + return 0; +} + +/* + * Add connection to kss_rx_conns of scheduler + * and wakeup the scheduler. + */ +void ksocknal_read_callback(struct ksock_conn *conn) +{ + struct ksock_sched *sched; + + sched = conn->ksnc_scheduler; + + spin_lock_bh(&sched->kss_lock); + + conn->ksnc_rx_ready = 1; + + if (!conn->ksnc_rx_scheduled) { /* not being progressed */ + list_add_tail(&conn->ksnc_rx_list, + &sched->kss_rx_conns); + conn->ksnc_rx_scheduled = 1; + /* extra ref for scheduler */ + ksocknal_conn_addref(conn); + + wake_up (&sched->kss_waitq); + } + spin_unlock_bh(&sched->kss_lock); +} + +/* + * Add connection to kss_tx_conns of scheduler + * and wakeup the scheduler. + */ +void ksocknal_write_callback(struct ksock_conn *conn) +{ + struct ksock_sched *sched; + + sched = conn->ksnc_scheduler; + + spin_lock_bh(&sched->kss_lock); + + conn->ksnc_tx_ready = 1; + + if (!conn->ksnc_tx_scheduled && /* not being progressed */ + !list_empty(&conn->ksnc_tx_queue)) { /* packets to send */ + list_add_tail(&conn->ksnc_tx_list, &sched->kss_tx_conns); + conn->ksnc_tx_scheduled = 1; + /* extra ref for scheduler */ + ksocknal_conn_addref(conn); + + wake_up(&sched->kss_waitq); + } + + spin_unlock_bh(&sched->kss_lock); +} + +static const struct ksock_proto * +ksocknal_parse_proto_version(struct ksock_hello_msg *hello) +{ + __u32 version = 0; + + if (hello->kshm_magic == LNET_PROTO_MAGIC) + version = hello->kshm_version; + else if (hello->kshm_magic == __swab32(LNET_PROTO_MAGIC)) + version = __swab32(hello->kshm_version); + + if (version) { +#if SOCKNAL_VERSION_DEBUG + if (*ksocknal_tunables.ksnd_protocol == 1) + return NULL; + + if (*ksocknal_tunables.ksnd_protocol == 2 && + version == KSOCK_PROTO_V3) + return NULL; +#endif + if (version == KSOCK_PROTO_V2) + return &ksocknal_protocol_v2x; + + if (version == KSOCK_PROTO_V3) + return &ksocknal_protocol_v3x; + + return NULL; + } + + if (hello->kshm_magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC)) { + struct lnet_magicversion *hmv; + + BUILD_BUG_ON(sizeof(struct lnet_magicversion) != + offsetof(struct ksock_hello_msg, kshm_src_nid)); + + hmv = (struct lnet_magicversion *)hello; + + if (hmv->version_major == cpu_to_le16 (KSOCK_PROTO_V1_MAJOR) && + hmv->version_minor == cpu_to_le16 (KSOCK_PROTO_V1_MINOR)) + return &ksocknal_protocol_v1x; + } + + return NULL; +} + +int +ksocknal_send_hello(struct lnet_ni *ni, struct ksock_conn *conn, + struct lnet_nid *peer_nid, struct ksock_hello_msg *hello) +{ + /* CAVEAT EMPTOR: this byte flips 'ipaddrs' */ + struct ksock_net *net = (struct ksock_net *)ni->ni_data; + + LASSERT(hello->kshm_nips <= LNET_INTERFACES_NUM); + + /* rely on caller to hold a ref on socket so it wouldn't disappear */ + LASSERT(conn->ksnc_proto != NULL); + + hello->kshm_src_nid = ni->ni_nid; + hello->kshm_dst_nid = *peer_nid; + hello->kshm_src_pid = the_lnet.ln_pid; + + hello->kshm_src_incarnation = net->ksnn_incarnation; + hello->kshm_ctype = conn->ksnc_type; + + return conn->ksnc_proto->pro_send_hello(conn, hello); +} + +static int +ksocknal_invert_type(int type) +{ + switch (type) { + case SOCKLND_CONN_ANY: + case 
SOCKLND_CONN_CONTROL: + return (type); + case SOCKLND_CONN_BULK_IN: + return SOCKLND_CONN_BULK_OUT; + case SOCKLND_CONN_BULK_OUT: + return SOCKLND_CONN_BULK_IN; + default: + return (SOCKLND_CONN_NONE); + } +} + +int +ksocknal_recv_hello(struct lnet_ni *ni, struct ksock_conn *conn, + struct ksock_hello_msg *hello, + struct lnet_processid *peerid, + __u64 *incarnation) +{ + /* Return < 0 fatal error + * 0 success + * EALREADY lost connection race + * EPROTO protocol version mismatch + */ + struct socket *sock = conn->ksnc_sock; + int active = (conn->ksnc_proto != NULL); + int timeout; + int proto_match; + int rc; + const struct ksock_proto *proto; + struct lnet_processid recv_id; + + /* socket type set on active connections - not set on passive */ + LASSERT(!active == !(conn->ksnc_type != SOCKLND_CONN_NONE)); + + timeout = active ? ksocknal_timeout() : + lnet_acceptor_timeout(); + + rc = lnet_sock_read(sock, &hello->kshm_magic, + sizeof(hello->kshm_magic), timeout); + if (rc != 0) { + CERROR("Error %d reading HELLO from %pIS\n", + rc, &conn->ksnc_peeraddr); + LASSERT(rc < 0); + return rc; + } + + if (hello->kshm_magic != LNET_PROTO_MAGIC && + hello->kshm_magic != __swab32(LNET_PROTO_MAGIC) && + hello->kshm_magic != le32_to_cpu(LNET_PROTO_TCP_MAGIC)) { + /* Unexpected magic! */ + CERROR("Bad magic(1) %#08x (%#08x expected) from %pIS\n", + __cpu_to_le32 (hello->kshm_magic), + LNET_PROTO_TCP_MAGIC, &conn->ksnc_peeraddr); + return -EPROTO; + } + + rc = lnet_sock_read(sock, &hello->kshm_version, + sizeof(hello->kshm_version), timeout); + if (rc != 0) { + CERROR("Error %d reading HELLO from %pIS\n", + rc, &conn->ksnc_peeraddr); + LASSERT(rc < 0); + return rc; + } + + proto = ksocknal_parse_proto_version(hello); + if (proto == NULL) { + if (!active) { + /* unknown protocol from peer_ni, + * tell peer_ni my protocol. 
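Replying with the newest version we support lets the peer_ni downgrade and retry the connection with a protocol both sides speak.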
+ */
+ conn->ksnc_proto = &ksocknal_protocol_v3x;
+#if SOCKNAL_VERSION_DEBUG
+ if (*ksocknal_tunables.ksnd_protocol == 2)
+ conn->ksnc_proto = &ksocknal_protocol_v2x;
+ else if (*ksocknal_tunables.ksnd_protocol == 1)
+ conn->ksnc_proto = &ksocknal_protocol_v1x;
+#endif
+ hello->kshm_nips = 0;
+ ksocknal_send_hello(ni, conn, &ni->ni_nid,
+ hello);
+ }
+
+ CERROR("Unknown protocol version (%d.x expected) from %pIS\n",
+ conn->ksnc_proto->pro_version, &conn->ksnc_peeraddr);
+
+ return -EPROTO;
+ }
+
+ proto_match = (conn->ksnc_proto == proto);
+ conn->ksnc_proto = proto;
+
+ /* receive the rest of hello message anyway */
+ rc = conn->ksnc_proto->pro_recv_hello(conn, hello, timeout);
+ if (rc != 0) {
+ CERROR("Error %d reading or checking hello from %pIS\n",
+ rc, &conn->ksnc_peeraddr);
+ LASSERT(rc < 0);
+ return rc;
+ }
+
+ *incarnation = hello->kshm_src_incarnation;
+
+ if (LNET_NID_IS_ANY(&hello->kshm_src_nid)) {
+ CERROR("Expecting a HELLO hdr with a NID, but got LNET_NID_ANY from %pIS\n",
+ &conn->ksnc_peeraddr);
+ return -EPROTO;
+ }
+
+ if (!active &&
+ rpc_get_port((struct sockaddr *)&conn->ksnc_peeraddr) >
+ LNET_ACCEPTOR_MAX_RESERVED_PORT) {
+ /* Userspace NAL assigns peer_ni process ID from socket */
+ recv_id.pid = rpc_get_port((struct sockaddr *)
+ &conn->ksnc_peeraddr) |
+ LNET_PID_USERFLAG;
+ LASSERT(conn->ksnc_peeraddr.ss_family == AF_INET);
+ memset(&recv_id.nid, 0, sizeof(recv_id.nid));
+ recv_id.nid.nid_type = ni->ni_nid.nid_type;
+ recv_id.nid.nid_num = ni->ni_nid.nid_num;
+ recv_id.nid.nid_addr[0] =
+ ((struct sockaddr_in *)
+ &conn->ksnc_peeraddr)->sin_addr.s_addr;
+ } else {
+ recv_id.nid = hello->kshm_src_nid;
+ recv_id.pid = hello->kshm_src_pid;
+ }
+
+ if (!active) {
+ *peerid = recv_id;
+
+ /* peer_ni determines type */
+ conn->ksnc_type = ksocknal_invert_type(hello->kshm_ctype);
+ if (conn->ksnc_type == SOCKLND_CONN_NONE) {
+ CERROR("Unexpected type %d from %s ip %pIS\n",
+ hello->kshm_ctype, libcfs_idstr(peerid),
+ &conn->ksnc_peeraddr);
+ return -EPROTO;
+ }
+ return 0;
+ }
+
+ if (peerid->pid != recv_id.pid ||
+ !nid_same(&peerid->nid, &recv_id.nid)) {
+ LCONSOLE_ERROR_MSG(0x130,
+ "Connected successfully to %s on host %pIS, but they claimed they were %s; please check your Lustre configuration.\n",
+ libcfs_idstr(peerid),
+ &conn->ksnc_peeraddr,
+ libcfs_idstr(&recv_id));
+ return -EPROTO;
+ }
+
+ if (hello->kshm_ctype == SOCKLND_CONN_NONE) {
+ /* Possible protocol mismatch or I lost the connection race */
+ return proto_match ? 
EALREADY : EPROTO; + } + + if (ksocknal_invert_type(hello->kshm_ctype) != conn->ksnc_type) { + CERROR("Mismatched types: me %d, %s ip %pIS %d\n", + conn->ksnc_type, libcfs_idstr(peerid), + &conn->ksnc_peeraddr, + hello->kshm_ctype); + return -EPROTO; + } + return 0; +} + +static bool +ksocknal_connect(struct ksock_conn_cb *conn_cb) +{ + LIST_HEAD(zombies); + struct ksock_peer_ni *peer_ni = conn_cb->ksnr_peer; + int type; + int wanted; + struct socket *sock; + time64_t deadline; + bool retry_later = false; + int rc = 0; + + deadline = ktime_get_seconds() + ksocknal_timeout(); + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + LASSERT(conn_cb->ksnr_scheduled); + LASSERT(!conn_cb->ksnr_connecting); + + conn_cb->ksnr_connecting = 1; + + for (;;) { + wanted = ksocknal_conn_cb_mask() & ~conn_cb->ksnr_connected; + + /* stop connecting if peer_ni/cb got closed under me, or + * conn cb got connected while queued + */ + if (peer_ni->ksnp_closing || conn_cb->ksnr_deleted || + wanted == 0) { + retry_later = false; + break; + } + + /* reschedule if peer_ni is connecting to me */ + if (peer_ni->ksnp_accepting > 0) { + CDEBUG(D_NET, + "peer_ni %s(%d) already connecting to me, retry later.\n", + libcfs_nidstr(&peer_ni->ksnp_id.nid), + peer_ni->ksnp_accepting); + retry_later = true; + } + + if (retry_later) /* needs reschedule */ + break; + + if ((wanted & BIT(SOCKLND_CONN_ANY)) != 0) { + type = SOCKLND_CONN_ANY; + } else if ((wanted & BIT(SOCKLND_CONN_CONTROL)) != 0) { + type = SOCKLND_CONN_CONTROL; + } else if ((wanted & BIT(SOCKLND_CONN_BULK_IN)) != 0 && + conn_cb->ksnr_blki_conn_count <= conn_cb->ksnr_blko_conn_count) { + type = SOCKLND_CONN_BULK_IN; + } else { + LASSERT ((wanted & BIT(SOCKLND_CONN_BULK_OUT)) != 0); + type = SOCKLND_CONN_BULK_OUT; + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + if (ktime_get_seconds() >= deadline) { + rc = -ETIMEDOUT; + lnet_connect_console_error( + rc, &peer_ni->ksnp_id.nid, + (struct sockaddr *)&conn_cb->ksnr_addr); + goto failed; + } + + sock = lnet_connect(&peer_ni->ksnp_id.nid, + conn_cb->ksnr_myiface, + (struct sockaddr *)&conn_cb->ksnr_addr, + peer_ni->ksnp_ni->ni_net_ns); + if (IS_ERR(sock)) { + rc = PTR_ERR(sock); + goto failed; + } + + rc = ksocknal_create_conn(peer_ni->ksnp_ni, conn_cb, sock, + type); + if (rc < 0) { + lnet_connect_console_error( + rc, &peer_ni->ksnp_id.nid, + (struct sockaddr *)&conn_cb->ksnr_addr); + goto failed; + } + + /* A +ve RC means I have to retry because I lost the connection + * race or I have to renegotiate protocol version + */ + retry_later = (rc != 0); + if (retry_later) + CDEBUG(D_NET, "peer_ni %s: conn race, retry later.\n", + libcfs_nidstr(&peer_ni->ksnp_id.nid)); + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + } + + conn_cb->ksnr_scheduled = 0; + conn_cb->ksnr_connecting = 0; + + if (retry_later) { + /* re-queue for attention; this frees me up to handle + * the peer_ni's incoming connection request + */ + + if (rc == EALREADY || + (rc == 0 && peer_ni->ksnp_accepting > 0)) { + /* We want to introduce a delay before next + * attempt to connect if we lost conn race, but + * the race is resolved quickly usually, so + * min_reconnectms should be good heuristic + */ + conn_cb->ksnr_retry_interval = + *ksocknal_tunables.ksnd_min_reconnectms / 1000; + conn_cb->ksnr_timeout = ktime_get_seconds() + + conn_cb->ksnr_retry_interval; + } + + ksocknal_launch_connection_locked(conn_cb); + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + return retry_later; + + failed: + 
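/* connection attempt failed: back the retry interval off exponentially, clamped between min_reconnectms and max_reconnectms */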
write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+ conn_cb->ksnr_scheduled = 0;
+ conn_cb->ksnr_connecting = 0;
+
+ /* This is a retry rather than a new connection */
+ conn_cb->ksnr_retry_interval *= 2;
+ conn_cb->ksnr_retry_interval =
+ max_t(time64_t, conn_cb->ksnr_retry_interval,
+ *ksocknal_tunables.ksnd_min_reconnectms / 1000);
+ conn_cb->ksnr_retry_interval =
+ min_t(time64_t, conn_cb->ksnr_retry_interval,
+ *ksocknal_tunables.ksnd_max_reconnectms / 1000);
+
+ LASSERT(conn_cb->ksnr_retry_interval);
+ conn_cb->ksnr_timeout = ktime_get_seconds() +
+ conn_cb->ksnr_retry_interval;
+
+ if (!list_empty(&peer_ni->ksnp_tx_queue) &&
+ peer_ni->ksnp_accepting == 0 &&
+ !ksocknal_find_connecting_conn_cb_locked(peer_ni)) {
+ struct ksock_conn *conn;
+
+ /* ksnp_tx_queue is queued on a conn on successful
+ * connection for V1.x and V2.x
+ */
+ conn = list_first_entry_or_null(&peer_ni->ksnp_conns,
+ struct ksock_conn, ksnc_list);
+ if (conn)
+ LASSERT(conn->ksnc_proto == &ksocknal_protocol_v3x ||
+ conn->ksnc_proto == &ksocknal_protocol_v4x);
+
+ /* take all the blocked packets while I've got the lock and
+ * complete below...
+ */
+ list_splice_init(&peer_ni->ksnp_tx_queue, &zombies);
+ }
+
+ write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+ ksocknal_peer_failed(peer_ni);
+ ksocknal_txlist_done(peer_ni->ksnp_ni, &zombies, rc);
+ return 0;
+}
+
+/*
+ * check whether we need to create more connds.
+ * It will try to create a new thread if necessary; @timeout can
+ * be updated if thread creation failed, so the caller won't keep
+ * retrying while we are short of resources.
+ */
+static int
+ksocknal_connd_check_start(time64_t sec, long *timeout)
+{
+ int rc;
+ int total = ksocknal_data.ksnd_connd_starting +
+ ksocknal_data.ksnd_connd_running;
+
+ if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) {
+ /* still in initializing */
+ return 0;
+ }
+
+ if (total >= *ksocknal_tunables.ksnd_nconnds_max ||
+ total > ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV) {
+ /* can't create more connd, or still have enough
+ * threads to handle more connecting */
+ return 0;
+ }
+
+ if (list_empty(&ksocknal_data.ksnd_connd_routes)) {
+ /* no pending connecting request */
+ return 0;
+ }
+
+ if (sec - ksocknal_data.ksnd_connd_failed_stamp <= 1) {
+ /* may run out of resource, retry later */
+ *timeout = cfs_time_seconds(1);
+ return 0;
+ }
+
+ if (ksocknal_data.ksnd_connd_starting > 0) {
+ /* serialize starting to avoid flood */
+ return 0;
+ }
+
+ ksocknal_data.ksnd_connd_starting_stamp = sec;
+ ksocknal_data.ksnd_connd_starting++;
+ spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+
+ /* NB: total is the next id */
+ rc = ksocknal_thread_start(ksocknal_connd, NULL,
+ "socknal_cd%02d", total);
+
+ spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+ if (rc == 0)
+ return 1;
+
+ /* we tried ... */
+ LASSERT(ksocknal_data.ksnd_connd_starting > 0);
+ ksocknal_data.ksnd_connd_starting--;
+ ksocknal_data.ksnd_connd_failed_stamp = ktime_get_real_seconds();
+
+ return 1;
+}
+
+/*
+ * check whether the current thread can exit: it will return 1 if there are
+ * too many threads and none has been created in the past 120 seconds.
+ * Also, this function may update @timeout to make the caller come back
+ * again to recheck these conditions. 
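A connd only exits when the pool exceeds ksnd_nconnds and no thread has been started within the last SOCKNAL_CONND_TIMEOUT (120) seconds.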
+ */
+static int
+ksocknal_connd_check_stop(time64_t sec, long *timeout)
+{
+ int val;
+
+ if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) {
+ /* still in initializing */
+ return 0;
+ }
+
+ if (ksocknal_data.ksnd_connd_starting > 0) {
+ /* in progress of starting new thread */
+ return 0;
+ }
+
+ if (ksocknal_data.ksnd_connd_running <=
+ *ksocknal_tunables.ksnd_nconnds) { /* can't shrink */
+ return 0;
+ }
+
+ /* created thread in past 120 seconds? */
+ val = (int)(ksocknal_data.ksnd_connd_starting_stamp +
+ SOCKNAL_CONND_TIMEOUT - sec);
+
+ *timeout = (val > 0) ? cfs_time_seconds(val) :
+ cfs_time_seconds(SOCKNAL_CONND_TIMEOUT);
+ if (val > 0)
+ return 0;
+
+ /* no creating in past 120 seconds */
+
+ return ksocknal_data.ksnd_connd_running >
+ ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV;
+}
+
+/* Go through connd_cbs queue looking for a conn_cb that we can process
+ * right now; @timeout_p can be updated if we need to come back later */
+static struct ksock_conn_cb *
+ksocknal_connd_get_conn_cb_locked(signed long *timeout_p)
+{
+ time64_t now = ktime_get_seconds();
+ time64_t conn_timeout;
+ struct ksock_conn_cb *conn_cb;
+
+ /* connd_routes can contain both pending and ordinary routes */
+ list_for_each_entry(conn_cb, &ksocknal_data.ksnd_connd_routes,
+ ksnr_connd_list) {
+
+ conn_timeout = conn_cb->ksnr_timeout;
+
+ if (conn_cb->ksnr_retry_interval == 0 ||
+ now >= conn_timeout)
+ return conn_cb;
+
+ if (*timeout_p == MAX_SCHEDULE_TIMEOUT ||
+ *timeout_p > cfs_time_seconds(conn_timeout - now))
+ *timeout_p = cfs_time_seconds(conn_timeout - now);
+ }
+
+ return NULL;
+}
+
+int
+ksocknal_connd(void *arg)
+{
+ spinlock_t *connd_lock = &ksocknal_data.ksnd_connd_lock;
+ struct ksock_connreq *cr;
+ wait_queue_entry_t wait;
+ int cons_retry = 0;
+
+ init_wait(&wait);
+
+ spin_lock_bh(connd_lock);
+
+ LASSERT(ksocknal_data.ksnd_connd_starting > 0);
+ ksocknal_data.ksnd_connd_starting--;
+ ksocknal_data.ksnd_connd_running++;
+
+ while (!ksocknal_data.ksnd_shuttingdown) {
+ struct ksock_conn_cb *conn_cb = NULL;
+ time64_t sec = ktime_get_real_seconds();
+ long timeout = MAX_SCHEDULE_TIMEOUT;
+ bool dropped_lock = false;
+
+ if (ksocknal_connd_check_stop(sec, &timeout)) {
+ /* wakeup another one to check stop */
+ wake_up(&ksocknal_data.ksnd_connd_waitq);
+ break;
+ }
+
+ if (ksocknal_connd_check_start(sec, &timeout)) {
+ /* created new thread */
+ dropped_lock = true;
+ }
+
+ cr = list_first_entry_or_null(&ksocknal_data.ksnd_connd_connreqs,
+ struct ksock_connreq, ksncr_list);
+ if (cr) {
+ /* Connection accepted by the listener */
+ list_del(&cr->ksncr_list);
+ spin_unlock_bh(connd_lock);
+ dropped_lock = true;
+
+ ksocknal_create_conn(cr->ksncr_ni, NULL,
+ cr->ksncr_sock, SOCKLND_CONN_NONE);
+ lnet_ni_decref(cr->ksncr_ni);
+ LIBCFS_FREE(cr, sizeof(*cr));
+
+ spin_lock_bh(connd_lock);
+ }
+
+ /* Only handle an outgoing connection request if there
+ * is a thread left to handle incoming connections and
+ * create new connd
+ */
+ if (ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV <
+ ksocknal_data.ksnd_connd_running)
+ conn_cb = ksocknal_connd_get_conn_cb_locked(&timeout);
+
+ if (conn_cb) {
+ list_del(&conn_cb->ksnr_connd_list);
+ ksocknal_data.ksnd_connd_connecting++;
+ spin_unlock_bh(connd_lock);
+ dropped_lock = true;
+
+ if (ksocknal_connect(conn_cb)) {
+ /* consecutive retry */
+ if (cons_retry++ > SOCKNAL_INSANITY_RECONN) {
+ CWARN("massive consecutive re-connecting to %pIS\n",
+ &conn_cb->ksnr_addr);
+ cons_retry = 0;
+ }
+ } else {
+ cons_retry = 0;
+ } 
+
+ ksocknal_conn_cb_decref(conn_cb);
+
+ spin_lock_bh(connd_lock);
+ ksocknal_data.ksnd_connd_connecting--;
+ }
+
+ if (dropped_lock) {
+ if (!need_resched())
+ continue;
+ spin_unlock_bh(connd_lock);
+ cond_resched();
+ spin_lock_bh(connd_lock);
+ continue;
+ }
+
+ /* Nothing to do for 'timeout' */
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue_exclusive(&ksocknal_data.ksnd_connd_waitq,
+ &wait);
+ spin_unlock_bh(connd_lock);
+
+ schedule_timeout(timeout);
+
+ remove_wait_queue(&ksocknal_data.ksnd_connd_waitq, &wait);
+ spin_lock_bh(connd_lock);
+ }
+ ksocknal_data.ksnd_connd_running--;
+ spin_unlock_bh(connd_lock);
+
+ ksocknal_thread_fini();
+ return 0;
+}
+
+static struct ksock_conn *
+ksocknal_find_timed_out_conn(struct ksock_peer_ni *peer_ni)
+{
+ /* We're called with a shared lock on ksnd_global_lock */
+ struct ksock_conn *conn;
+ struct ksock_tx *tx;
+ struct ksock_sched *sched;
+
+ list_for_each_entry(conn, &peer_ni->ksnp_conns, ksnc_list) {
+ int error;
+
+ /* Don't need the {get,put}connsock dance to deref ksnc_sock */
+ LASSERT (!conn->ksnc_closing);
+ sched = conn->ksnc_scheduler;
+
+ error = conn->ksnc_sock->sk->sk_err;
+ if (error != 0) {
+ ksocknal_conn_addref(conn);
+
+ switch (error) {
+ case ECONNRESET:
+ CNETERR("A connection with %s (%pISp) was reset; it may have rebooted.\n",
+ libcfs_idstr(&peer_ni->ksnp_id),
+ &conn->ksnc_peeraddr);
+ break;
+ case ETIMEDOUT:
+ CNETERR("A connection with %s (%pISp) timed out; the network or node may be down.\n",
+ libcfs_idstr(&peer_ni->ksnp_id),
+ &conn->ksnc_peeraddr);
+ break;
+ default:
+ CNETERR("An unexpected network error %d occurred with %s (%pISp)\n",
+ error,
+ libcfs_idstr(&peer_ni->ksnp_id),
+ &conn->ksnc_peeraddr);
+ break;
+ }
+
+ return conn;
+ }
+
+ if (conn->ksnc_rx_started &&
+ ktime_get_seconds() >= conn->ksnc_rx_deadline) {
+ /* Timed out incomplete incoming message */
+ ksocknal_conn_addref(conn);
+ CNETERR("Timeout receiving from %s (%pISp), state %d wanted %d left %d\n",
+ libcfs_idstr(&peer_ni->ksnp_id),
+ &conn->ksnc_peeraddr,
+ conn->ksnc_rx_state,
+ conn->ksnc_rx_nob_wanted,
+ conn->ksnc_rx_nob_left);
+ return conn;
+ }
+
+ spin_lock_bh(&sched->kss_lock);
+ if ((!list_empty(&conn->ksnc_tx_queue) ||
+ conn->ksnc_sock->sk->sk_wmem_queued != 0) &&
+ ktime_get_seconds() >= conn->ksnc_tx_deadline) {
+ /* Timed out messages queued for sending or
+ * buffered in the socket's send buffer
+ */
+ ksocknal_conn_addref(conn);
+ list_for_each_entry(tx, &conn->ksnc_tx_queue,
+ tx_list)
+ tx->tx_hstatus =
+ LNET_MSG_STATUS_LOCAL_TIMEOUT;
+ CNETERR("Timeout sending data to %s (%pISp); the network or that node may be down.\n",
+ libcfs_idstr(&peer_ni->ksnp_id),
+ &conn->ksnc_peeraddr);
+ spin_unlock_bh(&sched->kss_lock);
+ return conn;
+ }
+ spin_unlock_bh(&sched->kss_lock);
+ }
+
+ return (NULL);
+}
+
+static inline void
+ksocknal_flush_stale_txs(struct ksock_peer_ni *peer_ni)
+{
+ struct ksock_tx *tx;
+ LIST_HEAD(stale_txs);
+
+ write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+ while ((tx = list_first_entry_or_null(&peer_ni->ksnp_tx_queue,
+ struct ksock_tx,
+ tx_list)) != NULL) {
+ if (ktime_get_seconds() < tx->tx_deadline)
+ break;
+
+ tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_TIMEOUT;
+
+ list_move_tail(&tx->tx_list, &stale_txs);
+ }
+
+ write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+ ksocknal_txlist_done(peer_ni->ksnp_ni, &stale_txs, -ETIMEDOUT);
+}
+
+static int
+ksocknal_send_keepalive_locked(struct ksock_peer_ni *peer_ni)
+__must_hold(&ksocknal_data.ksnd_global_lock)
+{
+ struct ksock_sched *sched; 
+ struct ksock_conn *conn;
+ struct ksock_tx *tx;
+
+ /* last_alive will be updated by create_conn */
+ if (list_empty(&peer_ni->ksnp_conns))
+ return 0;
+
+ if (peer_ni->ksnp_proto != &ksocknal_protocol_v3x &&
+ peer_ni->ksnp_proto != &ksocknal_protocol_v4x)
+ return 0;
+
+ if (*ksocknal_tunables.ksnd_keepalive <= 0 ||
+ ktime_get_seconds() < peer_ni->ksnp_last_alive +
+ *ksocknal_tunables.ksnd_keepalive)
+ return 0;
+
+ if (ktime_get_seconds() < peer_ni->ksnp_send_keepalive)
+ return 0;
+
+ /* retry 10 secs later, so we wouldn't put pressure
+ * on this peer_ni if we failed to send keepalive this time */
+ peer_ni->ksnp_send_keepalive = ktime_get_seconds() + 10;
+
+ conn = ksocknal_find_conn_locked(peer_ni, NULL, 1);
+ if (conn != NULL) {
+ sched = conn->ksnc_scheduler;
+
+ spin_lock_bh(&sched->kss_lock);
+ if (!list_empty(&conn->ksnc_tx_queue)) {
+ spin_unlock_bh(&sched->kss_lock);
+ /* there is a queued ACK, don't need keepalive */
+ return 0;
+ }
+
+ spin_unlock_bh(&sched->kss_lock);
+ }
+
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+
+ /* cookie = 1 is reserved for keepalive PING */
+ tx = ksocknal_alloc_tx_noop(1, 1);
+ if (tx == NULL) {
+ read_lock(&ksocknal_data.ksnd_global_lock);
+ return -ENOMEM;
+ }
+
+ if (ksocknal_launch_packet(peer_ni->ksnp_ni, tx, &peer_ni->ksnp_id)
+ == 0) {
+ read_lock(&ksocknal_data.ksnd_global_lock);
+ return 1;
+ }
+
+ ksocknal_free_tx(tx);
+ read_lock(&ksocknal_data.ksnd_global_lock);
+
+ return -EIO;
+}
+
+
+static void
+ksocknal_check_peer_timeouts(int idx)
+{
+ struct hlist_head *peers = &ksocknal_data.ksnd_peers[idx];
+ struct ksock_peer_ni *peer_ni;
+ struct ksock_conn *conn;
+ struct ksock_tx *tx;
+
+ again:
+ /* NB. We expect to have a look at all the peers and not find any
+ * connections to time out, so we just use a shared lock while we
+ * take a look...
+ */
+ read_lock(&ksocknal_data.ksnd_global_lock);
+
+ hlist_for_each_entry(peer_ni, peers, ksnp_list) {
+ struct ksock_tx *tx_stale;
+ time64_t deadline = 0;
+ int resid = 0;
+ int n = 0;
+
+ if (ksocknal_send_keepalive_locked(peer_ni) != 0) {
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+ goto again;
+ }
+
+ conn = ksocknal_find_timed_out_conn(peer_ni);
+
+ if (conn != NULL) {
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+
+ ksocknal_close_conn_and_siblings(conn, -ETIMEDOUT);
+
+ /* NB we won't find this one again, but we can't
+ * just proceed with the next peer_ni, since we dropped
+ * ksnd_global_lock and it might be dead already! 
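Instead, restart the scan from the top; this is cheap because timed-out conns are expected to be rare.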
+ */ + ksocknal_conn_decref(conn); + goto again; + } + + /* we can't process stale txs right here because we're + * holding only shared lock + */ + tx = list_first_entry_or_null(&peer_ni->ksnp_tx_queue, + struct ksock_tx, tx_list); + if (tx && ktime_get_seconds() >= tx->tx_deadline) { + ksocknal_peer_addref(peer_ni); + read_unlock(&ksocknal_data.ksnd_global_lock); + + ksocknal_flush_stale_txs(peer_ni); + + ksocknal_peer_decref(peer_ni); + goto again; + } + + if (list_empty(&peer_ni->ksnp_zc_req_list)) + continue; + + tx_stale = NULL; + spin_lock(&peer_ni->ksnp_lock); + list_for_each_entry(tx, &peer_ni->ksnp_zc_req_list, tx_zc_list) { + if (ktime_get_seconds() < tx->tx_deadline) + break; + /* ignore the TX if connection is being closed */ + if (tx->tx_conn->ksnc_closing) + continue; + n++; + if (tx_stale == NULL) + tx_stale = tx; + } + + if (tx_stale == NULL) { + spin_unlock(&peer_ni->ksnp_lock); + continue; + } + + deadline = tx_stale->tx_deadline; + resid = tx_stale->tx_resid; + conn = tx_stale->tx_conn; + ksocknal_conn_addref(conn); + + spin_unlock(&peer_ni->ksnp_lock); + read_unlock(&ksocknal_data.ksnd_global_lock); + + CERROR("Total %d stale ZC_REQs for peer_ni %s detected; the " + "oldest(%p) timed out %lld secs ago, " + "resid: %d, wmem: %d\n", + n, libcfs_nidstr(&peer_ni->ksnp_id.nid), tx_stale, + ktime_get_seconds() - deadline, + resid, conn->ksnc_sock->sk->sk_wmem_queued); + + ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT); + ksocknal_conn_decref(conn); + goto again; + } + + read_unlock(&ksocknal_data.ksnd_global_lock); +} + +int ksocknal_reaper(void *arg) +{ + wait_queue_entry_t wait; + struct ksock_conn *conn; + struct ksock_sched *sched; + LIST_HEAD(enomem_conns); + int nenomem_conns; + time64_t timeout; + int i; + int peer_index = 0; + time64_t deadline = ktime_get_seconds(); + + init_wait(&wait); + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + + while (!ksocknal_data.ksnd_shuttingdown) { + conn = list_first_entry_or_null(&ksocknal_data.ksnd_deathrow_conns, + struct ksock_conn, ksnc_list); + if (conn) { + list_del(&conn->ksnc_list); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + ksocknal_terminate_conn(conn); + ksocknal_conn_decref(conn); + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + continue; + } + + conn = list_first_entry_or_null(&ksocknal_data.ksnd_zombie_conns, + struct ksock_conn, ksnc_list); + if (conn) { + list_del(&conn->ksnc_list); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + ksocknal_destroy_conn(conn); + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + continue; + } + + list_splice_init(&ksocknal_data.ksnd_enomem_conns, + &enomem_conns); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + /* reschedule all the connections that stalled with ENOMEM... */ + nenomem_conns = 0; + while ((conn = list_first_entry_or_null(&enomem_conns, + struct ksock_conn, + ksnc_tx_list)) != NULL) { + list_del(&conn->ksnc_tx_list); + + sched = conn->ksnc_scheduler; + + spin_lock_bh(&sched->kss_lock); + + LASSERT(conn->ksnc_tx_scheduled); + conn->ksnc_tx_ready = 1; + list_add_tail(&conn->ksnc_tx_list, + &sched->kss_tx_conns); + wake_up(&sched->kss_waitq); + + spin_unlock_bh(&sched->kss_lock); + nenomem_conns++; + } + + /* careful with the jiffy wrap... 
*/ + while ((timeout = deadline - ktime_get_seconds()) <= 0) { + const int n = 4; + const int p = 1; + int chunk = HASH_SIZE(ksocknal_data.ksnd_peers); + unsigned int lnd_timeout; + + /* Time to check for timeouts on a few more peers: I + * do checks every 'p' seconds on a proportion of the + * peer_ni table and I need to check every connection + * 'n' times within a timeout interval, to ensure I + * detect a timeout on any connection within (n+1)/n + * times the timeout interval. + */ + + lnd_timeout = ksocknal_timeout(); + if (lnd_timeout > n * p) + chunk = (chunk * n * p) / lnd_timeout; + if (chunk == 0) + chunk = 1; + + for (i = 0; i < chunk; i++) { + ksocknal_check_peer_timeouts(peer_index); + peer_index = (peer_index + 1) % + HASH_SIZE(ksocknal_data.ksnd_peers); + } + + deadline += p; + } + + if (nenomem_conns != 0) { + /* Reduce my timeout if I rescheduled ENOMEM conns. + * This also prevents me getting woken immediately + * if any go back on my enomem list. */ + timeout = SOCKNAL_ENOMEM_RETRY; + } + ksocknal_data.ksnd_reaper_waketime = ktime_get_seconds() + + timeout; + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&ksocknal_data.ksnd_reaper_waitq, &wait); + + if (!ksocknal_data.ksnd_shuttingdown && + list_empty(&ksocknal_data.ksnd_deathrow_conns) && + list_empty(&ksocknal_data.ksnd_zombie_conns)) + schedule_timeout(cfs_time_seconds(timeout)); + + set_current_state(TASK_RUNNING); + remove_wait_queue(&ksocknal_data.ksnd_reaper_waitq, &wait); + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + } + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + ksocknal_thread_fini(); + return 0; +} diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_lib.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_lib.c new file mode 100644 index 0000000000000..46cb3c68e26ed --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_lib.c @@ -0,0 +1,698 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#include "socklnd.h" + +int +ksocknal_lib_get_conn_addrs(struct ksock_conn *conn) +{ + int rc = lnet_sock_getaddr(conn->ksnc_sock, true, + &conn->ksnc_peeraddr); + + /* Didn't need the {get,put}connsock dance to deref ksnc_sock... 
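the conn is not closing yet (asserted below), so ksnc_sock cannot vanish while we query it.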
*/
+ LASSERT(!conn->ksnc_closing);
+
+ if (rc != 0) {
+ CERROR("Error %d getting sock peer_ni IP\n", rc);
+ return rc;
+ }
+
+ rc = lnet_sock_getaddr(conn->ksnc_sock, false,
+ &conn->ksnc_myaddr);
+ if (rc != 0) {
+ CERROR("Error %d getting sock local IP\n", rc);
+ return rc;
+ }
+
+ return 0;
+}
+
+int
+ksocknal_lib_zc_capable(struct ksock_conn *conn)
+{
+ int caps = conn->ksnc_sock->sk->sk_route_caps;
+
+ if (conn->ksnc_proto == &ksocknal_protocol_v1x)
+ return 0;
+
+ /* ZC if the socket supports scatter/gather and doesn't need software
+ * checksums */
+ return ((caps & NETIF_F_SG) != 0 && (caps & NETIF_F_CSUM_MASK) != 0);
+}
+
+int
+ksocknal_lib_send_hdr(struct ksock_conn *conn, struct ksock_tx *tx,
+ struct kvec *scratchiov)
+{
+ struct socket *sock = conn->ksnc_sock;
+ int nob = 0;
+ int rc;
+
+ if (*ksocknal_tunables.ksnd_enable_csum && /* checksum enabled */
+ conn->ksnc_proto == &ksocknal_protocol_v2x && /* V2.x connection */
+ tx->tx_nob == tx->tx_resid && /* first sending */
+ tx->tx_msg.ksm_csum == 0) /* not checksummed */
+ ksocknal_lib_csum_tx(tx);
+
+ /* NB we can't trust socket ops to either consume our iovs
+ * or leave them alone. */
+
+ {
+#if SOCKNAL_SINGLE_FRAG_TX
+ struct kvec scratch;
+ struct kvec *scratchiov = &scratch;
+ unsigned int niov = 1;
+#else
+ unsigned int niov = tx->tx_niov;
+#endif
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
+
+ if (tx->tx_niov) {
+ scratchiov[0] = tx->tx_hdr;
+ nob += scratchiov[0].iov_len;
+ }
+
+ if (!list_empty(&conn->ksnc_tx_queue) ||
+ nob < tx->tx_resid)
+ msg.msg_flags |= MSG_MORE;
+
+ rc = kernel_sendmsg(sock, &msg, scratchiov, niov, nob);
+ }
+ return rc;
+}
+
+int
+ksocknal_lib_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx,
+ struct kvec *scratchiov)
+{
+ struct socket *sock = conn->ksnc_sock;
+ struct bio_vec *kiov = tx->tx_kiov;
+ int rc;
+ int nob;
+
+ /* Not NOOP message */
+ LASSERT(tx->tx_lnetmsg != NULL);
+
+ /* NB we can't trust socket ops to either consume our iovs
+ * or leave them alone. */
+ if (tx->tx_msg.ksm_zc_cookies[0] != 0) {
+ /* Zero copy is enabled */
+ struct sock *sk = sock->sk;
+ struct page *page = kiov->bv_page;
+ int offset = kiov->bv_offset;
+ int fragsize = kiov->bv_len;
+ int msgflg = MSG_DONTWAIT;
+
+ CDEBUG(D_NET, "page %p + offset %x for %d\n",
+ page, offset, kiov->bv_len);
+
+ if (!list_empty(&conn->ksnc_tx_queue) ||
+ fragsize < tx->tx_resid)
+ msgflg |= MSG_MORE;
+
+ rc = sk->sk_prot->sendpage(sk, page,
+ offset, fragsize, msgflg);
+ } else {
+#if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK
+ struct kvec scratch;
+ struct kvec *scratchiov = &scratch;
+ unsigned int niov = 1;
+#else
+#ifdef CONFIG_HIGHMEM
+#warning "XXX risk of kmap deadlock on multiple frags..."
+#endif
+ unsigned int niov = tx->tx_nkiov;
+#endif
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
+ int i;
+
+ for (nob = i = 0; i < niov; i++) {
+ scratchiov[i].iov_base = kmap(kiov[i].bv_page) +
+ kiov[i].bv_offset;
+ nob += scratchiov[i].iov_len = kiov[i].bv_len;
+ }
+
+ if (!list_empty(&conn->ksnc_tx_queue) ||
+ nob < tx->tx_resid)
+ msg.msg_flags |= MSG_MORE;
+
+ rc = kernel_sendmsg(sock, &msg, scratchiov, niov, nob);
+
+ for (i = 0; i < niov; i++)
+ kunmap(kiov[i].bv_page);
+ }
+ return rc;
+}
+
+void
+ksocknal_lib_eager_ack(struct ksock_conn *conn)
+{
+ struct socket *sock = conn->ksnc_sock;
+
+ /* Remind the socket to ACK eagerly. 
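(TCP delays ACKs by default, hoping to piggy-back them on outgoing data.)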
If I don't, the socket might + * think I'm about to send something it could piggy-back the ACK on, + * introducing delay in completing zero-copy sends in my peer_ni. + */ + + tcp_sock_set_quickack(sock->sk, 1); +} + +int +ksocknal_lib_recv_iov(struct ksock_conn *conn, struct kvec *scratchiov) +{ +#if SOCKNAL_SINGLE_FRAG_RX + struct kvec scratch; + struct kvec *scratchiov = &scratch; + unsigned int niov = 1; +#else + unsigned int niov = conn->ksnc_rx_niov; +#endif + struct kvec *iov = conn->ksnc_rx_iov; + struct msghdr msg = { + .msg_flags = 0 + }; + int nob; + int i; + int rc; + int fragnob; + int sum; + __u32 saved_csum; + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone. */ + LASSERT (niov > 0); + + for (nob = i = 0; i < niov; i++) { + scratchiov[i] = iov[i]; + nob += scratchiov[i].iov_len; + } + LASSERT (nob <= conn->ksnc_rx_nob_wanted); + + rc = kernel_recvmsg(conn->ksnc_sock, &msg, scratchiov, niov, nob, + MSG_DONTWAIT); + + saved_csum = 0; + if (conn->ksnc_proto == &ksocknal_protocol_v2x) { + saved_csum = conn->ksnc_msg.ksm_csum; + conn->ksnc_msg.ksm_csum = 0; + } + + if (saved_csum != 0) { + /* accumulate checksum */ + for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) { + LASSERT (i < niov); + + fragnob = iov[i].iov_len; + if (fragnob > sum) + fragnob = sum; + + conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum, + iov[i].iov_base, fragnob); + } + conn->ksnc_msg.ksm_csum = saved_csum; + } + + return rc; +} + +static void +ksocknal_lib_kiov_vunmap(void *addr) +{ + if (addr == NULL) + return; + + vunmap(addr); +} + +static void * +ksocknal_lib_kiov_vmap(struct bio_vec *kiov, int niov, + struct kvec *iov, struct page **pages) +{ + void *addr; + int nob; + int i; + + if (!*ksocknal_tunables.ksnd_zc_recv || pages == NULL) + return NULL; + + LASSERT (niov <= LNET_MAX_IOV); + + if (niov < 2 || + niov < *ksocknal_tunables.ksnd_zc_recv_min_nfrags) + return NULL; + + for (nob = i = 0; i < niov; i++) { + if ((kiov[i].bv_offset != 0 && i > 0) || + (kiov[i].bv_offset + kiov[i].bv_len != + PAGE_SIZE && i < niov - 1)) + return NULL; + + pages[i] = kiov[i].bv_page; + nob += kiov[i].bv_len; + } + + addr = vmap(pages, niov, VM_MAP, PAGE_KERNEL); + if (addr == NULL) + return NULL; + + iov->iov_base = addr + kiov[0].bv_offset; + iov->iov_len = nob; + + return addr; +} + +int +ksocknal_lib_recv_kiov(struct ksock_conn *conn, struct page **pages, + struct kvec *scratchiov) +{ +#if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK + struct kvec scratch; + struct kvec *scratchiov = &scratch; + struct page **pages = NULL; + unsigned int niov = 1; +#else +#ifdef CONFIG_HIGHMEM +#warning "XXX risk of kmap deadlock on multiple frags..." +#endif + unsigned int niov = conn->ksnc_rx_nkiov; +#endif + struct bio_vec *kiov = conn->ksnc_rx_kiov; + struct msghdr msg = { + .msg_flags = 0 + }; + int nob; + int i; + int rc; + void *base; + void *addr; + int sum; + int fragnob; + int n; + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone. 
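So work on a scratch copy: a single vmap()ed iov covering every fragment when zero-copy receive allows it, otherwise one kmap()ed entry per page.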
*/ + if ((addr = ksocknal_lib_kiov_vmap(kiov, niov, scratchiov, pages)) != NULL) { + nob = scratchiov[0].iov_len; + n = 1; + + } else { + for (nob = i = 0; i < niov; i++) { + nob += scratchiov[i].iov_len = kiov[i].bv_len; + scratchiov[i].iov_base = kmap(kiov[i].bv_page) + + kiov[i].bv_offset; + } + n = niov; + } + + LASSERT (nob <= conn->ksnc_rx_nob_wanted); + + rc = kernel_recvmsg(conn->ksnc_sock, &msg, scratchiov, n, nob, + MSG_DONTWAIT); + + if (conn->ksnc_msg.ksm_csum != 0) { + for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) { + LASSERT(i < niov); + + /* Dang! have to kmap again because I have nowhere to + * stash the mapped address. But by doing it while the + * page is still mapped, the kernel just bumps the map + * count and returns me the address it stashed. + */ + base = kmap(kiov[i].bv_page) + kiov[i].bv_offset; + fragnob = kiov[i].bv_len; + if (fragnob > sum) + fragnob = sum; + + conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum, + base, fragnob); + + kunmap(kiov[i].bv_page); + } + } + + if (addr != NULL) { + ksocknal_lib_kiov_vunmap(addr); + } else { + for (i = 0; i < niov; i++) + kunmap(kiov[i].bv_page); + } + + return rc; +} + +void +ksocknal_lib_csum_tx(struct ksock_tx *tx) +{ + int i; + __u32 csum; + void *base; + + LASSERT(tx->tx_hdr.iov_base == (void *)&tx->tx_msg); + LASSERT(tx->tx_conn != NULL); + LASSERT(tx->tx_conn->ksnc_proto == &ksocknal_protocol_v2x); + + tx->tx_msg.ksm_csum = 0; + + csum = ksocknal_csum(~0, (void *)tx->tx_hdr.iov_base, + tx->tx_hdr.iov_len); + + for (i = 0; i < tx->tx_nkiov; i++) { + base = kmap(tx->tx_kiov[i].bv_page) + + tx->tx_kiov[i].bv_offset; + + csum = ksocknal_csum(csum, base, tx->tx_kiov[i].bv_len); + + kunmap(tx->tx_kiov[i].bv_page); + } + + if (*ksocknal_tunables.ksnd_inject_csum_error) { + csum++; + *ksocknal_tunables.ksnd_inject_csum_error = 0; + } + + tx->tx_msg.ksm_csum = csum; +} + +int +ksocknal_lib_get_conn_tunables(struct ksock_conn *conn, int *txmem, int *rxmem, int *nagle) +{ + struct socket *sock = conn->ksnc_sock; + struct tcp_sock *tp = tcp_sk(sock->sk); + + if (ksocknal_connsock_addref(conn) < 0) { + LASSERT(conn->ksnc_closing); + *txmem = 0; + *rxmem = 0; + *nagle = 0; + return -ESHUTDOWN; + } + + lnet_sock_getbuf(sock, txmem, rxmem); + + *nagle = !(tp->nonagle & TCP_NAGLE_OFF); + + ksocknal_connsock_decref(conn); + + + return 0; +} + +int +ksocknal_lib_setup_sock (struct socket *sock) +{ + int rc; + int keep_idle; + int keep_intvl; + int keep_count; + int do_keepalive; + struct tcp_sock *tp = tcp_sk(sock->sk); + + sock->sk->sk_allocation = GFP_NOFS; + + /* Ensure this socket aborts active sends immediately when closed. 
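Clearing SOCK_LINGER keeps close() from blocking, and a negative linger2 makes the kernel reset orphaned connections instead of parking them in FIN_WAIT2.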
*/ + sock_reset_flag(sock->sk, SOCK_LINGER); + + tp->linger2 = -1; + + if (!*ksocknal_tunables.ksnd_nagle) + tcp_sock_set_nodelay(sock->sk); + + lnet_sock_setbuf(sock, + *ksocknal_tunables.ksnd_tx_buffer_size, + *ksocknal_tunables.ksnd_rx_buffer_size); + +/* TCP_BACKOFF_* sockopt tunables unsupported in stock kernels */ +#ifdef SOCKNAL_BACKOFF + if (*ksocknal_tunables.ksnd_backoff_init > 0) { + int option = *ksocknal_tunables.ksnd_backoff_init; +#ifdef SOCKNAL_BACKOFF_MS + option *= 1000; +#endif + + rc = kernel_setsockopt(sock, SOL_TCP, TCP_BACKOFF_INIT, + (char *)&option, sizeof(option)); + if (rc != 0) { + CERROR("Can't set initial tcp backoff %d: %d\n", + option, rc); + return rc; + } + } + + if (*ksocknal_tunables.ksnd_backoff_max > 0) { + int option = *ksocknal_tunables.ksnd_backoff_max; +#ifdef SOCKNAL_BACKOFF_MS + option *= 1000; +#endif + + rc = kernel_setsockopt(sock, SOL_TCP, TCP_BACKOFF_MAX, + (char *)&option, sizeof(option)); + if (rc != 0) { + CERROR("Can't set maximum tcp backoff %d: %d\n", + option, rc); + return rc; + } + } +#endif + + /* snapshot tunables */ + keep_idle = *ksocknal_tunables.ksnd_keepalive_idle; + keep_count = *ksocknal_tunables.ksnd_keepalive_count; + keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl; + + do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0); + +#ifdef HAVE_KERNEL_SETSOCKOPT + /* open-coded version doesn't work in all kernels, and + * there is no helper function, so call kernel_setsockopt() + * directly. + */ + { + int option = (do_keepalive ? 1 : 0); + kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, + (char *)&option, sizeof(option)); + } +#else + if (sock->sk->sk_prot->keepalive) + sock->sk->sk_prot->keepalive(sock->sk, do_keepalive); + if (do_keepalive) + sock_set_flag(sock->sk, SOCK_KEEPOPEN); + else + sock_reset_flag(sock->sk, SOCK_KEEPOPEN); +#endif /* HAVE_KERNEL_SETSOCKOPT */ + + if (!do_keepalive) + return (0); + + rc = tcp_sock_set_keepidle(sock->sk, keep_idle); + if (rc != 0) { + CERROR("Can't set TCP_KEEPIDLE: %d\n", rc); + return rc; + } + + rc = tcp_sock_set_keepintvl(sock->sk, keep_intvl); + if (rc != 0) { + CERROR("Can't set TCP_KEEPINTVL: %d\n", rc); + return rc; + } + + rc = tcp_sock_set_keepcnt(sock->sk, keep_count); + if (rc != 0) { + CERROR("Can't set TCP_KEEPCNT: %d\n", rc); + return rc; + } + + return (0); +} + +void +ksocknal_lib_push_conn(struct ksock_conn *conn) +{ + struct sock *sk; + struct tcp_sock *tp; + int nonagle; + int rc; + + rc = ksocknal_connsock_addref(conn); + if (rc != 0) /* being shut down */ + return; + + sk = conn->ksnc_sock->sk; + tp = tcp_sk(sk); + + lock_sock(sk); + nonagle = tp->nonagle; + tp->nonagle = TCP_NAGLE_OFF; + release_sock(sk); + + tcp_sock_set_nodelay(conn->ksnc_sock->sk); + + lock_sock(sk); + tp->nonagle = nonagle; + release_sock(sk); + + ksocknal_connsock_decref(conn); +} + +void ksocknal_read_callback(struct ksock_conn *conn); +void ksocknal_write_callback(struct ksock_conn *conn); +/* + * socket call back in Linux + */ +static void +#ifdef HAVE_SK_DATA_READY_ONE_ARG +ksocknal_data_ready(struct sock *sk) +#else +ksocknal_data_ready(struct sock *sk, int n) +#endif +{ + struct ksock_conn *conn; + + /* interleave correctly with closing sockets... 
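Holding ksnd_global_lock for read serialises this callback against ksocknal_terminate_conn(), which restores the saved callbacks and NULLs sk_user_data under the write lock.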
*/ + LASSERT(!in_irq()); + read_lock(&ksocknal_data.ksnd_global_lock); + + conn = sk->sk_user_data; + if (conn == NULL) { /* raced with ksocknal_terminate_conn */ + LASSERT(sk->sk_data_ready != &ksocknal_data_ready); +#ifdef HAVE_SK_DATA_READY_ONE_ARG + sk->sk_data_ready(sk); +#else + sk->sk_data_ready(sk, n); +#endif + } else + ksocknal_read_callback(conn); + + read_unlock(&ksocknal_data.ksnd_global_lock); +} + +static void +ksocknal_write_space (struct sock *sk) +{ + struct ksock_conn *conn; + int wspace; + int min_wpace; + + /* interleave correctly with closing sockets... */ + LASSERT(!in_irq()); + read_lock(&ksocknal_data.ksnd_global_lock); + + conn = sk->sk_user_data; + wspace = sk_stream_wspace(sk); + min_wpace = sk_stream_min_wspace(sk); + + CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n", + sk, wspace, min_wpace, conn, + (conn == NULL) ? "" : (conn->ksnc_tx_ready ? + " ready" : " blocked"), + (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ? + " scheduled" : " idle"), + (conn == NULL) ? "" : (list_empty(&conn->ksnc_tx_queue) ? + " empty" : " queued")); + + if (conn == NULL) { /* raced with ksocknal_terminate_conn */ + LASSERT (sk->sk_write_space != &ksocknal_write_space); + sk->sk_write_space (sk); + + read_unlock(&ksocknal_data.ksnd_global_lock); + return; + } + + if (wspace >= min_wpace) { /* got enough space */ + ksocknal_write_callback(conn); + + /* Clear SOCK_NOSPACE _after_ ksocknal_write_callback so the + * ENOMEM check in ksocknal_transmit is race-free (think about + * it). */ + + clear_bit (SOCK_NOSPACE, &sk->sk_socket->flags); + } + + read_unlock(&ksocknal_data.ksnd_global_lock); +} + +void +ksocknal_lib_save_callback(struct socket *sock, struct ksock_conn *conn) +{ + conn->ksnc_saved_data_ready = sock->sk->sk_data_ready; + conn->ksnc_saved_write_space = sock->sk->sk_write_space; +} + +void +ksocknal_lib_set_callback(struct socket *sock, struct ksock_conn *conn) +{ + sock->sk->sk_user_data = conn; + sock->sk->sk_data_ready = ksocknal_data_ready; + sock->sk->sk_write_space = ksocknal_write_space; +} + +void +ksocknal_lib_reset_callback(struct socket *sock, struct ksock_conn *conn) +{ + /* Remove conn's network callbacks. + * NB I _have_ to restore the callback, rather than storing a noop, + * since the socket could survive past this module being unloaded!! */ + sock->sk->sk_data_ready = conn->ksnc_saved_data_ready; + sock->sk->sk_write_space = conn->ksnc_saved_write_space; + + /* A callback could be in progress already; they hold a read lock + * on ksnd_global_lock (to serialise with me) and NOOP if + * sk_user_data is NULL. */ + sock->sk->sk_user_data = NULL; + + return ; +} + +int +ksocknal_lib_memory_pressure(struct ksock_conn *conn) +{ + int rc = 0; + struct ksock_sched *sched; + + sched = conn->ksnc_scheduler; + spin_lock_bh(&sched->kss_lock); + + if (!test_bit(SOCK_NOSPACE, &conn->ksnc_sock->flags) && + !conn->ksnc_tx_ready) { + /* SOCK_NOSPACE is set when the socket fills + * and cleared in the write_space callback + * (which also sets ksnc_tx_ready). 
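(The socket layer sets SOCK_NOSPACE when a send runs out of socket buffer space.)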
If + * SOCK_NOSPACE and ksnc_tx_ready are BOTH + * zero, I didn't fill the socket and + * write_space won't reschedule me, so I + * return -ENOMEM to get my caller to retry + * after a timeout */ + rc = -ENOMEM; + } + + spin_unlock_bh(&sched->kss_lock); + + return rc; +} diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_modparams.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_modparams.c new file mode 100644 index 0000000000000..10aee437590bb --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_modparams.c @@ -0,0 +1,380 @@ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2012, Intel Corporation. + * + * Author: Eric Barton + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include "socklnd.h" + +#include +#if defined(__x86_64__) || defined(__i386__) +#include +#endif +#ifdef HAVE_ETHTOOL_LINK_SETTINGS +#include +#include +#endif + +#define CURRENT_LND_VERSION 1 + +static int sock_timeout; +module_param(sock_timeout, int, 0644); +MODULE_PARM_DESC(sock_timeout, "dead socket timeout (seconds)"); + +static int credits = DEFAULT_CREDITS; +module_param(credits, int, 0444); +MODULE_PARM_DESC(credits, "# concurrent sends"); + +static int peer_credits = DEFAULT_PEER_CREDITS; +module_param(peer_credits, int, 0444); +MODULE_PARM_DESC(peer_credits, "# concurrent sends to 1 peer"); + +static int peer_buffer_credits; +module_param(peer_buffer_credits, int, 0444); +MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits"); + +static int peer_timeout = DEFAULT_PEER_TIMEOUT; +module_param(peer_timeout, int, 0444); +MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)"); + +/* Number of daemons in each thread pool which is percpt, + * we will estimate reasonable value based on CPUs if it's not set. 
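Leaving nscheds at 0 therefore sizes each per-CPT pool automatically from that partition's CPU count.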
+ */
+static unsigned int nscheds;
+module_param(nscheds, int, 0444);
+MODULE_PARM_DESC(nscheds, "# scheduler daemons in each pool while starting");
+
+static int nconnds = 4;
+module_param(nconnds, int, 0444);
+MODULE_PARM_DESC(nconnds, "# connection daemons while starting");
+
+static int nconnds_max = 64;
+module_param(nconnds_max, int, 0444);
+MODULE_PARM_DESC(nconnds_max, "max # connection daemons");
+
+static int min_reconnectms = 1000;
+module_param(min_reconnectms, int, 0644);
+MODULE_PARM_DESC(min_reconnectms, "min connection retry interval (ms)");
+
+static int max_reconnectms = 60000;
+module_param(max_reconnectms, int, 0644);
+MODULE_PARM_DESC(max_reconnectms, "max connection retry interval (ms)");
+
+static int eager_ack;
+module_param(eager_ack, int, 0644);
+MODULE_PARM_DESC(eager_ack, "send tcp ack packets eagerly");
+
+static int typed_conns = 1;
+module_param(typed_conns, int, 0444);
+MODULE_PARM_DESC(typed_conns, "use different sockets for bulk");
+
+static int min_bulk = (1<<10);
+module_param(min_bulk, int, 0644);
+MODULE_PARM_DESC(min_bulk, "smallest 'large' message");
+
+#define DEFAULT_BUFFER_SIZE 0
+static int tx_buffer_size = DEFAULT_BUFFER_SIZE;
+module_param(tx_buffer_size, int, 0644);
+MODULE_PARM_DESC(tx_buffer_size, "socket tx buffer size (0 for system default)");
+
+static int rx_buffer_size = DEFAULT_BUFFER_SIZE;
+module_param(rx_buffer_size, int, 0644);
+MODULE_PARM_DESC(rx_buffer_size, "socket rx buffer size (0 for system default)");
+
+static int nagle = 0;
+module_param(nagle, int, 0644);
+MODULE_PARM_DESC(nagle, "enable NAGLE?");
+
+static int round_robin = 1;
+module_param(round_robin, int, 0644);
+MODULE_PARM_DESC(round_robin, "Round robin for multiple interfaces");
+
+static int keepalive = 30;
+module_param(keepalive, int, 0644);
+MODULE_PARM_DESC(keepalive, "# seconds before send keepalive");
+
+static int keepalive_idle = 30;
+module_param(keepalive_idle, int, 0644);
+MODULE_PARM_DESC(keepalive_idle, "# idle seconds before probe");
+
+#define DEFAULT_KEEPALIVE_COUNT 5
+static int keepalive_count = DEFAULT_KEEPALIVE_COUNT;
+module_param(keepalive_count, int, 0644);
+MODULE_PARM_DESC(keepalive_count, "# missed probes == dead");
+
+static int keepalive_intvl = 5;
+module_param(keepalive_intvl, int, 0644);
+MODULE_PARM_DESC(keepalive_intvl, "seconds between probes");
+
+static int enable_csum = 0;
+module_param(enable_csum, int, 0644);
+MODULE_PARM_DESC(enable_csum, "enable checksum");
+
+static int inject_csum_error = 0;
+module_param(inject_csum_error, int, 0644);
+MODULE_PARM_DESC(inject_csum_error, "set non-zero to inject a checksum error");
+
+static int enable_irq_affinity = 0;
+module_param(enable_irq_affinity, int, 0644);
+MODULE_PARM_DESC(enable_irq_affinity, "enable IRQ affinity");
+
+static int nonblk_zcack = 1;
+module_param(nonblk_zcack, int, 0644);
+MODULE_PARM_DESC(nonblk_zcack, "always send ZC-ACK on non-blocking connection");
+
+static unsigned int zc_min_payload = (16 << 10);
+module_param(zc_min_payload, int, 0644);
+MODULE_PARM_DESC(zc_min_payload, "minimum payload size to zero copy");
+
+static unsigned int zc_recv = 0;
+module_param(zc_recv, int, 0644);
+MODULE_PARM_DESC(zc_recv, "enable ZC recv for Chelsio driver");
+
+static unsigned int zc_recv_min_nfrags = 16;
+module_param(zc_recv_min_nfrags, int, 0644);
+MODULE_PARM_DESC(zc_recv_min_nfrags, "minimum # of fragments to enable ZC recv");
+
+static unsigned int conns_per_peer = DEFAULT_CONNS_PER_PEER;
+module_param(conns_per_peer, uint, 0644);
+MODULE_PARM_DESC(conns_per_peer,
"number of connections per peer"); + +/* By default skip_mr_route_setup is 0 (do not skip) */ +static unsigned int skip_mr_route_setup; +module_param(skip_mr_route_setup, uint, 0444); +MODULE_PARM_DESC(skip_mr_route_setup, "skip automatic setup of linux routes for MR"); + +#ifdef SOCKNAL_BACKOFF +static int backoff_init = 3; +module_param(backoff_init, int, 0644); +MODULE_PARM_DESC(backoff_init, "seconds for initial tcp backoff"); + +static int backoff_max = 3; +module_param(backoff_max, int, 0644); +MODULE_PARM_DESC(backoff_max, "seconds for maximum tcp backoff"); +#endif + +#if SOCKNAL_VERSION_DEBUG +static int protocol = 3; +module_param(protocol, int, 0644); +MODULE_PARM_DESC(protocol, "protocol version"); +#endif + +static inline bool is_native_host(void) +{ +#if (!(defined(__x86_64__) || defined(__i386))) + return true; +#elif defined(HAVE_HYPERVISOR_IS_TYPE) + return hypervisor_is_type(X86_HYPER_NATIVE); +#else + return x86_hyper == NULL; +#endif +} + +struct ksock_tunables ksocknal_tunables; +static struct lnet_ioctl_config_socklnd_tunables default_tunables; + +#ifdef HAVE_ETHTOOL_LINK_SETTINGS +static int ksocklnd_ni_get_eth_intf_speed(struct lnet_ni *ni) +{ + struct net_device *dev; + int intf_idx = -1; + int ret = -1; + + DECLARE_CONST_IN_IFADDR(ifa); + + /* check if ni has interface assigned */ + if (!ni->ni_net_ns || !ni->ni_interface) + return 0; + + rtnl_lock(); + for_each_netdev(ni->ni_net_ns, dev) { + int flags = dev_get_flags(dev); + struct in_device *in_dev; + + if (flags & IFF_LOOPBACK) /* skip the loopback IF */ + continue; + + if (!(flags & IFF_UP)) + continue; + + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) + continue; + + in_dev_for_each_ifa_rcu(ifa, in_dev) { + if (strcmp(ifa->ifa_label, ni->ni_interface) == 0) + intf_idx = dev->ifindex; + } + endfor_ifa(in_dev); + + if (intf_idx >= 0) + break; + } + if (intf_idx >= 0) { + struct ethtool_link_ksettings cmd; + int ethtool_ret; + + /* Some devices may not be providing link settings */ + ethtool_ret = __ethtool_get_link_ksettings(dev, &cmd); + if (!ethtool_ret) + ret = cmd.base.speed; + else + ret = ethtool_ret; + } + rtnl_unlock(); + + return ret; +} + +static int ksocklnd_speed2cpp(int speed) +{ + /* Use the minimum of 1Gbps to avoid calling ilog2 with 0 */ + if (speed < 1000) + speed = 1000; + + /* Pick heuristically optimal conns_per_peer value + * for the specified ethernet interface speed (Mbps) + */ + return ilog2(speed/1000) / 2 + 1; +} +#endif + +static int ksocklnd_lookup_conns_per_peer(struct lnet_ni *ni) +{ + int cpp = 1; +#ifdef HAVE_ETHTOOL_LINK_SETTINGS + int speed = ksocklnd_ni_get_eth_intf_speed(ni); + + if (ni->ni_interface) + CDEBUG(D_NET, "intf %s speed %d\n", ni->ni_interface, speed); + + if (speed > 0) + cpp = ksocklnd_speed2cpp(speed); +#endif + return cpp; +} + +int ksocknal_tunables_init(void) +{ + default_tunables.lnd_version = CURRENT_LND_VERSION; + default_tunables.lnd_conns_per_peer = conns_per_peer; + + /* initialize ksocknal_tunables structure */ + ksocknal_tunables.ksnd_timeout = &sock_timeout; + ksocknal_tunables.ksnd_nscheds = &nscheds; + ksocknal_tunables.ksnd_nconnds = &nconnds; + ksocknal_tunables.ksnd_nconnds_max = &nconnds_max; + ksocknal_tunables.ksnd_min_reconnectms = &min_reconnectms; + ksocknal_tunables.ksnd_max_reconnectms = &max_reconnectms; + ksocknal_tunables.ksnd_eager_ack = &eager_ack; + ksocknal_tunables.ksnd_typed_conns = &typed_conns; + ksocknal_tunables.ksnd_min_bulk = &min_bulk; + ksocknal_tunables.ksnd_tx_buffer_size = &tx_buffer_size; + 
ksocknal_tunables.ksnd_rx_buffer_size = &rx_buffer_size; + ksocknal_tunables.ksnd_nagle = &nagle; + ksocknal_tunables.ksnd_round_robin = &round_robin; + ksocknal_tunables.ksnd_keepalive = &keepalive; + ksocknal_tunables.ksnd_keepalive_idle = &keepalive_idle; + ksocknal_tunables.ksnd_keepalive_count = &keepalive_count; + ksocknal_tunables.ksnd_keepalive_intvl = &keepalive_intvl; + ksocknal_tunables.ksnd_credits = &credits; + ksocknal_tunables.ksnd_peertxcredits = &peer_credits; + ksocknal_tunables.ksnd_peerrtrcredits = &peer_buffer_credits; + ksocknal_tunables.ksnd_peertimeout = &peer_timeout; + ksocknal_tunables.ksnd_enable_csum = &enable_csum; + ksocknal_tunables.ksnd_inject_csum_error = &inject_csum_error; + ksocknal_tunables.ksnd_nonblk_zcack = &nonblk_zcack; + ksocknal_tunables.ksnd_zc_min_payload = &zc_min_payload; + ksocknal_tunables.ksnd_zc_recv = &zc_recv; + ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags; + if (conns_per_peer > ((1 << SOCKNAL_CONN_COUNT_MAX_BITS)-1)) { + CWARN("socklnd conns_per_peer is capped at %u.\n", + (1 << SOCKNAL_CONN_COUNT_MAX_BITS)-1); + } + ksocknal_tunables.ksnd_conns_per_peer = &conns_per_peer; + + if (enable_irq_affinity) { + CWARN("irq_affinity is removed from socklnd because modern " + "computer always has fast CPUs and more cores than " + "# NICs, although you still can set irq_affinity by " + "another way, please check manual for details.\n"); + } + ksocknal_tunables.ksnd_irq_affinity = &enable_irq_affinity; + +#ifdef SOCKNAL_BACKOFF + ksocknal_tunables.ksnd_backoff_init = &backoff_init; + ksocknal_tunables.ksnd_backoff_max = &backoff_max; +#endif + +#if SOCKNAL_VERSION_DEBUG + ksocknal_tunables.ksnd_protocol = &protocol; +#endif + + if (*ksocknal_tunables.ksnd_zc_min_payload < (2 << 10)) + *ksocknal_tunables.ksnd_zc_min_payload = (2 << 10); + + /* When on a hypervisor set the minimum zero copy size + * above the maximum payload size + */ + if (!is_native_host()) + *ksocknal_tunables.ksnd_zc_min_payload = (16 << 20) + 1; + + return 0; +} + +void ksocknal_tunables_setup(struct lnet_ni *ni) +{ + struct lnet_ioctl_config_socklnd_tunables *tunables; + struct lnet_ioctl_config_lnd_cmn_tunables *net_tunables; + + /* If no tunables specified, setup default tunables */ + if (!ni->ni_lnd_tunables_set) + memcpy(&ni->ni_lnd_tunables.lnd_tun_u.lnd_sock, + &default_tunables, sizeof(*tunables)); + + tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_sock; + + /* Current API version */ + tunables->lnd_version = CURRENT_LND_VERSION; + + net_tunables = &ni->ni_net->net_tunables; + + if (net_tunables->lct_peer_timeout == -1) + net_tunables->lct_peer_timeout = + *ksocknal_tunables.ksnd_peertimeout; + + if (net_tunables->lct_max_tx_credits == -1) + net_tunables->lct_max_tx_credits = + *ksocknal_tunables.ksnd_credits; + + if (net_tunables->lct_peer_tx_credits == -1) + net_tunables->lct_peer_tx_credits = + *ksocknal_tunables.ksnd_peertxcredits; + + if (net_tunables->lct_peer_tx_credits > + net_tunables->lct_max_tx_credits) + net_tunables->lct_peer_tx_credits = + net_tunables->lct_max_tx_credits; + + if (net_tunables->lct_peer_rtr_credits == -1) + net_tunables->lct_peer_rtr_credits = + *ksocknal_tunables.ksnd_peerrtrcredits; + + if (!tunables->lnd_conns_per_peer) + tunables->lnd_conns_per_peer = + ksocklnd_lookup_conns_per_peer(ni); +} diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_proto.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_proto.c new file mode 100644 index 0000000000000..40a1ffbea1405 --- /dev/null +++ 
b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_proto.c
@@ -0,0 +1,1001 @@
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2012, 2017, Intel Corporation.
+ *
+ * Author: Zach Brown
+ * Author: Peter J. Braam
+ * Author: Phil Schwan
+ * Author: Eric Barton
+ *
+ * This file is part of Lustre, https://wiki.whamcloud.com/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "socklnd.h"
+
+/*
+ * Protocol entries:
+ *   pro_send_hello       : send hello message
+ *   pro_recv_hello       : receive hello message
+ *   pro_pack             : pack message header
+ *   pro_unpack           : unpack message header
+ *   pro_queue_tx_zcack() : Called holding BH lock: kss_lock
+ *                          return 1 if ACK is piggybacked, otherwise return 0
+ *   pro_queue_tx_msg()   : Called holding BH lock: kss_lock
+ *                          return the ACK that is piggybacked on my message, or NULL
+ *   pro_handle_zcreq()   : handler of incoming ZC-REQ
+ *   pro_handle_zcack()   : handler of incoming ZC-ACK
+ *   pro_match_tx()       : Called holding glock
+ */
+
+static struct ksock_tx *
+ksocknal_queue_tx_msg_v1(struct ksock_conn *conn, struct ksock_tx *tx_msg)
+{
+	/* V1.x, just enqueue it */
+	list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue);
+	return NULL;
+}
+
+void
+ksocknal_next_tx_carrier(struct ksock_conn *conn)
+{
+	struct ksock_tx *tx = conn->ksnc_tx_carrier;
+
+	/* Called holding BH lock: conn->ksnc_scheduler->kss_lock */
+	LASSERT(!list_empty(&conn->ksnc_tx_queue));
+	LASSERT(tx != NULL);
+
+	/* Next TX that can carry ZC-ACK or LNet message */
+	if (tx->tx_list.next == &conn->ksnc_tx_queue) {
+		/* no more packets queued */
+		conn->ksnc_tx_carrier = NULL;
+	} else {
+		conn->ksnc_tx_carrier = list_next_entry(tx, tx_list);
+		LASSERT(conn->ksnc_tx_carrier->tx_msg.ksm_type ==
+			tx->tx_msg.ksm_type);
+	}
+}
+
+static int
+ksocknal_queue_tx_zcack_v2(struct ksock_conn *conn,
+			   struct ksock_tx *tx_ack, __u64 cookie)
+{
+	struct ksock_tx *tx = conn->ksnc_tx_carrier;
+
+	LASSERT (tx_ack == NULL ||
+		 tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP);
+
+	/*
+	 * Enqueue or piggyback tx_ack / cookie:
+	 * . if no tx can piggyback the cookie of tx_ack (or cookie), just
+	 *   enqueue tx_ack (if tx_ack != NULL) and return 0.
+	 * . if a tx can piggyback the cookie of tx_ack (or cookie),
+	 *   piggyback the cookie and return 1.
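+	 * For example, when the current carrier is a KSOCK_MSG_LNET tx
+	 * whose second cookie slot (ksm_zc_cookies[1]) is still zero, the
+	 * ACK cookie is stored there and rides along with that message
+	 * for free.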
+ */ + if (tx == NULL) { + if (tx_ack != NULL) { + list_add_tail(&tx_ack->tx_list, + &conn->ksnc_tx_queue); + conn->ksnc_tx_carrier = tx_ack; + } + return 0; + } + + if (tx->tx_msg.ksm_type == KSOCK_MSG_NOOP) { + /* tx is noop zc-ack, can't piggyback zc-ack cookie */ + if (tx_ack != NULL) + list_add_tail(&tx_ack->tx_list, + &conn->ksnc_tx_queue); + return 0; + } + + LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_LNET); + LASSERT(tx->tx_msg.ksm_zc_cookies[1] == 0); + + if (tx_ack != NULL) + cookie = tx_ack->tx_msg.ksm_zc_cookies[1]; + + /* piggyback the zc-ack cookie */ + tx->tx_msg.ksm_zc_cookies[1] = cookie; + /* move on to the next TX which can carry cookie */ + ksocknal_next_tx_carrier(conn); + + return 1; +} + +static struct ksock_tx * +ksocknal_queue_tx_msg_v2(struct ksock_conn *conn, struct ksock_tx *tx_msg) +{ + struct ksock_tx *tx = conn->ksnc_tx_carrier; + + /* + * Enqueue tx_msg: + * . If there is no NOOP on the connection, just enqueue + * tx_msg and return NULL + * . If there is NOOP on the connection, piggyback the cookie + * and replace the NOOP tx, and return the NOOP tx. + */ + if (tx == NULL) { /* nothing on queue */ + list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue); + conn->ksnc_tx_carrier = tx_msg; + return NULL; + } + + if (tx->tx_msg.ksm_type == KSOCK_MSG_LNET) { /* nothing to carry */ + list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue); + return NULL; + } + + LASSERT (tx->tx_msg.ksm_type == KSOCK_MSG_NOOP); + + /* There is a noop zc-ack can be piggybacked */ + tx_msg->tx_msg.ksm_zc_cookies[1] = tx->tx_msg.ksm_zc_cookies[1]; + ksocknal_next_tx_carrier(conn); + + /* use new_tx to replace the noop zc-ack packet */ + list_splice(&tx->tx_list, &tx_msg->tx_list); + + return tx; +} + +static int +ksocknal_queue_tx_zcack_v3(struct ksock_conn *conn, + struct ksock_tx *tx_ack, __u64 cookie) +{ + struct ksock_tx *tx; + + if (conn->ksnc_type != SOCKLND_CONN_ACK) + return ksocknal_queue_tx_zcack_v2(conn, tx_ack, cookie); + + /* non-blocking ZC-ACK (to router) */ + LASSERT (tx_ack == NULL || + tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP); + + if ((tx = conn->ksnc_tx_carrier) == NULL) { + if (tx_ack != NULL) { + list_add_tail(&tx_ack->tx_list, + &conn->ksnc_tx_queue); + conn->ksnc_tx_carrier = tx_ack; + } + return 0; + } + + /* conn->ksnc_tx_carrier != NULL */ + + if (tx_ack != NULL) + cookie = tx_ack->tx_msg.ksm_zc_cookies[1]; + + if (cookie == SOCKNAL_KEEPALIVE_PING) /* ignore keepalive PING */ + return 1; + + if (tx->tx_msg.ksm_zc_cookies[1] == SOCKNAL_KEEPALIVE_PING) { + /* replace the keepalive PING with a real ACK */ + LASSERT (tx->tx_msg.ksm_zc_cookies[0] == 0); + tx->tx_msg.ksm_zc_cookies[1] = cookie; + return 1; + } + + if (cookie == tx->tx_msg.ksm_zc_cookies[0] || + cookie == tx->tx_msg.ksm_zc_cookies[1]) { + CWARN("%s: duplicated ZC cookie: %llu\n", + libcfs_idstr(&conn->ksnc_peer->ksnp_id), cookie); + return 1; /* XXX return error in the future */ + } + + if (tx->tx_msg.ksm_zc_cookies[0] == 0) { + /* NOOP tx has only one ZC-ACK cookie, can carry at least one more */ + if (tx->tx_msg.ksm_zc_cookies[1] > cookie) { + tx->tx_msg.ksm_zc_cookies[0] = tx->tx_msg.ksm_zc_cookies[1]; + tx->tx_msg.ksm_zc_cookies[1] = cookie; + } else { + tx->tx_msg.ksm_zc_cookies[0] = cookie; + } + + if (tx->tx_msg.ksm_zc_cookies[0] - tx->tx_msg.ksm_zc_cookies[1] > 2) { + /* not likely to carry more ACKs, skip it to simplify logic */ + ksocknal_next_tx_carrier(conn); + } + + return 1; + } + + /* takes two or more cookies already */ + + if (tx->tx_msg.ksm_zc_cookies[0] > 
tx->tx_msg.ksm_zc_cookies[1]) { + __u64 tmp = 0; + + /* two separated cookies: (a+2, a) or (a+1, a) */ + LASSERT (tx->tx_msg.ksm_zc_cookies[0] - + tx->tx_msg.ksm_zc_cookies[1] <= 2); + + if (tx->tx_msg.ksm_zc_cookies[0] - + tx->tx_msg.ksm_zc_cookies[1] == 2) { + if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1) + tmp = cookie; + } else if (cookie == tx->tx_msg.ksm_zc_cookies[1] - 1) { + tmp = tx->tx_msg.ksm_zc_cookies[1]; + } else if (cookie == tx->tx_msg.ksm_zc_cookies[0] + 1) { + tmp = tx->tx_msg.ksm_zc_cookies[0]; + } + + if (tmp != 0) { + /* range of cookies */ + tx->tx_msg.ksm_zc_cookies[0] = tmp - 1; + tx->tx_msg.ksm_zc_cookies[1] = tmp + 1; + return 1; + } + + } else { + /* ksm_zc_cookies[0] < ksm_zc_cookies[1], it is a range + * of cookies + */ + if (cookie >= tx->tx_msg.ksm_zc_cookies[0] && + cookie <= tx->tx_msg.ksm_zc_cookies[1]) { + CWARN("%s: duplicated ZC cookie: %llu\n", + libcfs_idstr(&conn->ksnc_peer->ksnp_id), + cookie); + return 1; /* XXX: return error in the future */ + } + + if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1) { + tx->tx_msg.ksm_zc_cookies[1] = cookie; + return 1; + } + + if (cookie == tx->tx_msg.ksm_zc_cookies[0] - 1) { + tx->tx_msg.ksm_zc_cookies[0] = cookie; + return 1; + } + } + + /* failed to piggyback ZC-ACK */ + if (tx_ack != NULL) { + list_add_tail(&tx_ack->tx_list, &conn->ksnc_tx_queue); + /* the next tx can piggyback at least 1 ACK */ + ksocknal_next_tx_carrier(conn); + } + + return 0; +} + +static int +ksocknal_match_tx(struct ksock_conn *conn, struct ksock_tx *tx, int nonblk) +{ + int nob; + +#if SOCKNAL_VERSION_DEBUG + if (!*ksocknal_tunables.ksnd_typed_conns) + return SOCKNAL_MATCH_YES; +#endif + + if (tx == NULL || tx->tx_lnetmsg == NULL) { + /* noop packet */ + nob = sizeof(struct ksock_msg_hdr); + } else { + nob = tx->tx_lnetmsg->msg_len + + ((conn->ksnc_proto == &ksocknal_protocol_v1x) ? 
+ 0 : sizeof(struct ksock_msg_hdr)) + + sizeof(struct lnet_hdr_nid4); + } + + /* default checking for typed connection */ + switch (conn->ksnc_type) { + default: + CERROR("ksnc_type bad: %u\n", conn->ksnc_type); + LBUG(); + case SOCKLND_CONN_ANY: + return SOCKNAL_MATCH_YES; + + case SOCKLND_CONN_BULK_IN: + return SOCKNAL_MATCH_MAY; + + case SOCKLND_CONN_BULK_OUT: + if (nob < *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + + case SOCKLND_CONN_CONTROL: + if (nob >= *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + } +} + +static int +ksocknal_match_tx_v3(struct ksock_conn *conn, struct ksock_tx *tx, int nonblk) +{ + int nob; + + if (tx == NULL || tx->tx_lnetmsg == NULL) + nob = sizeof(struct ksock_msg_hdr); + else + nob = sizeof(struct ksock_msg_hdr) + + sizeof(struct lnet_hdr_nid4) + + tx->tx_lnetmsg->msg_len; + + switch (conn->ksnc_type) { + default: + CERROR("ksnc_type bad: %u\n", conn->ksnc_type); + LBUG(); + case SOCKLND_CONN_ANY: + return SOCKNAL_MATCH_NO; + + case SOCKLND_CONN_ACK: + if (nonblk) + return SOCKNAL_MATCH_YES; + else if (tx == NULL || tx->tx_lnetmsg == NULL) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_NO; + + case SOCKLND_CONN_BULK_OUT: + if (nonblk) + return SOCKNAL_MATCH_NO; + else if (nob < *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + + case SOCKLND_CONN_CONTROL: + if (nonblk) + return SOCKNAL_MATCH_NO; + else if (nob >= *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + } +} + +static int +ksocknal_match_tx_v4(struct ksock_conn *conn, struct ksock_tx *tx, int nonblk) +{ + int nob; + + if (!tx || !tx->tx_lnetmsg) + nob = sizeof(struct ksock_msg_hdr); + else + nob = sizeof(struct ksock_msg_hdr) + + sizeof(struct lnet_hdr_nid16) + + tx->tx_lnetmsg->msg_len; + + switch (conn->ksnc_type) { + default: + CERROR("ksnc_type bad: %u\n", conn->ksnc_type); + LBUG(); + case SOCKLND_CONN_ANY: + return SOCKNAL_MATCH_NO; + + case SOCKLND_CONN_ACK: + if (nonblk) + return SOCKNAL_MATCH_YES; + else if (tx == NULL || tx->tx_lnetmsg == NULL) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_NO; + + case SOCKLND_CONN_BULK_OUT: + if (nonblk) + return SOCKNAL_MATCH_NO; + else if (nob < *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + + case SOCKLND_CONN_CONTROL: + if (nonblk) + return SOCKNAL_MATCH_NO; + else if (nob >= *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + } +} + +/* (Sink) handle incoming ZC request from sender */ +static int +ksocknal_handle_zcreq(struct ksock_conn *c, __u64 cookie, int remote) +{ + struct ksock_peer_ni *peer_ni = c->ksnc_peer; + struct ksock_conn *conn; + struct ksock_tx *tx; + int rc; + + read_lock(&ksocknal_data.ksnd_global_lock); + + conn = ksocknal_find_conn_locked(peer_ni, NULL, !!remote); + if (conn != NULL) { + struct ksock_sched *sched = conn->ksnc_scheduler; + + LASSERT(conn->ksnc_proto->pro_queue_tx_zcack != NULL); + + spin_lock_bh(&sched->kss_lock); + + rc = conn->ksnc_proto->pro_queue_tx_zcack(conn, NULL, cookie); + + spin_unlock_bh(&sched->kss_lock); + + if (rc) { /* piggybacked */ + read_unlock(&ksocknal_data.ksnd_global_lock); + return 0; + } + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + + /* ACK connection is not ready, or can't piggyback the ACK */ + tx = ksocknal_alloc_tx_noop(cookie, !!remote); + if (tx == NULL) + 
return -ENOMEM; + + rc = ksocknal_launch_packet(peer_ni->ksnp_ni, tx, &peer_ni->ksnp_id); + if (rc == 0) + return 0; + + ksocknal_free_tx(tx); + return rc; +} + +/* (Sender) handle ZC_ACK from sink */ +static int +ksocknal_handle_zcack(struct ksock_conn *conn, __u64 cookie1, __u64 cookie2) +{ + struct ksock_peer_ni *peer_ni = conn->ksnc_peer; + struct ksock_tx *tx; + struct ksock_tx *tmp; + LIST_HEAD(zlist); + int count; + + if (cookie1 == 0) + cookie1 = cookie2; + + count = (cookie1 > cookie2) ? 2 : (cookie2 - cookie1 + 1); + + if (cookie2 == SOCKNAL_KEEPALIVE_PING && + (conn->ksnc_proto == &ksocknal_protocol_v3x || + conn->ksnc_proto == &ksocknal_protocol_v4x)) { + /* keepalive PING for V3.x, just ignore it */ + return count == 1 ? 0 : -EPROTO; + } + + spin_lock(&peer_ni->ksnp_lock); + + list_for_each_entry_safe(tx, tmp, &peer_ni->ksnp_zc_req_list, + tx_zc_list) { + __u64 c = tx->tx_msg.ksm_zc_cookies[0]; + + if (c == cookie1 || c == cookie2 || (cookie1 < c && c < cookie2)) { + tx->tx_msg.ksm_zc_cookies[0] = 0; + list_move(&tx->tx_zc_list, &zlist); + + if (--count == 0) + break; + } + } + + spin_unlock(&peer_ni->ksnp_lock); + + while ((tx = list_first_entry_or_null(&zlist, struct ksock_tx, + tx_zc_list)) != NULL) { + list_del(&tx->tx_zc_list); + ksocknal_tx_decref(tx); + } + + return count == 0 ? 0 : -EPROTO; +} + +static int +ksocknal_send_hello_v1(struct ksock_conn *conn, struct ksock_hello_msg *hello) +{ + struct socket *sock = conn->ksnc_sock; + struct _lnet_hdr_nid4 *hdr; + struct lnet_magicversion *hmv; + int rc; + int i; + + BUILD_BUG_ON(sizeof(struct lnet_magicversion) != + offsetof(struct _lnet_hdr_nid4, src_nid)); + + LIBCFS_ALLOC(hdr, sizeof(*hdr)); + if (hdr == NULL) { + CERROR("Can't allocate struct lnet_hdr_nid4\n"); + return -ENOMEM; + } + + hmv = (struct lnet_magicversion *)&hdr->dest_nid; + + /* Re-organize V2.x message header to V1.x (struct lnet_hdr_nid4) + * header and send out + */ + hmv->magic = cpu_to_le32 (LNET_PROTO_TCP_MAGIC); + hmv->version_major = cpu_to_le16 (KSOCK_PROTO_V1_MAJOR); + hmv->version_minor = cpu_to_le16 (KSOCK_PROTO_V1_MINOR); + + if (the_lnet.ln_testprotocompat) { + /* single-shot proto check */ + if (test_and_clear_bit(0, &the_lnet.ln_testprotocompat)) + hmv->version_major++; /* just different! 
*/
+
+		if (test_and_clear_bit(1, &the_lnet.ln_testprotocompat))
+			hmv->magic = LNET_PROTO_MAGIC;
+	}
+
+	hdr->src_nid = cpu_to_le64(lnet_nid_to_nid4(&hello->kshm_src_nid));
+	hdr->src_pid = cpu_to_le32 (hello->kshm_src_pid);
+	hdr->type = cpu_to_le32 (LNET_MSG_HELLO);
+	hdr->payload_length = cpu_to_le32 (hello->kshm_nips * sizeof(__u32));
+	hdr->msg.hello.type = cpu_to_le32 (hello->kshm_ctype);
+	hdr->msg.hello.incarnation = cpu_to_le64 (hello->kshm_src_incarnation);
+
+	rc = lnet_sock_write(sock, hdr, sizeof(*hdr), lnet_acceptor_timeout());
+	if (rc != 0) {
+		CNETERR("Error %d sending HELLO hdr to %pISp\n",
+			rc, &conn->ksnc_peeraddr);
+		goto out;
+	}
+
+	if (hello->kshm_nips == 0)
+		goto out;
+
+	for (i = 0; i < (int) hello->kshm_nips; i++)
+		hello->kshm_ips[i] = __cpu_to_le32 (hello->kshm_ips[i]);
+
+	rc = lnet_sock_write(sock, hello->kshm_ips,
+			     hello->kshm_nips * sizeof(__u32),
+			     lnet_acceptor_timeout());
+	if (rc != 0) {
+		CNETERR("Error %d sending HELLO payload (%d) to %pISp\n",
+			rc, hello->kshm_nips,
+			&conn->ksnc_peeraddr);
+	}
+out:
+	LIBCFS_FREE(hdr, sizeof(*hdr));
+
+	return rc;
+}
+
+static int
+ksocknal_send_hello_v2(struct ksock_conn *conn, struct ksock_hello_msg *hello)
+{
+	struct socket *sock = conn->ksnc_sock;
+	int rc;
+	struct ksock_hello_msg_nid4 *hello4;
+
+	CFS_ALLOC_PTR(hello4);
+	if (!hello4) {
+		CERROR("Can't allocate struct ksock_hello_msg_nid4\n");
+		return -ENOMEM;
+	}
+
+	hello->kshm_magic = LNET_PROTO_MAGIC;
+	hello->kshm_version = conn->ksnc_proto->pro_version;
+
+	if (the_lnet.ln_testprotocompat) {
+		/* single-shot proto check */
+		if (test_and_clear_bit(0, &the_lnet.ln_testprotocompat))
+			hello->kshm_version++;   /* just different!
*/ + } + hello4->kshm_magic = LNET_PROTO_MAGIC; + hello4->kshm_version = hello->kshm_version; + hello4->kshm_src_nid = lnet_nid_to_nid4(&hello->kshm_src_nid); + hello4->kshm_dst_nid = lnet_nid_to_nid4(&hello->kshm_dst_nid); + hello4->kshm_src_pid = hello->kshm_src_pid; + hello4->kshm_dst_pid = hello->kshm_dst_pid; + hello4->kshm_src_incarnation = hello->kshm_src_incarnation; + hello4->kshm_dst_incarnation = hello->kshm_dst_incarnation; + hello4->kshm_ctype = hello->kshm_ctype; + hello4->kshm_nips = hello->kshm_nips; + + rc = lnet_sock_write(sock, hello4, sizeof(*hello4), + lnet_acceptor_timeout()); + CFS_FREE_PTR(hello4); + if (rc) { + CNETERR("Error %d sending HELLO hdr to %pISp\n", + rc, &conn->ksnc_peeraddr); + return rc; + } + + if (hello->kshm_nips == 0) + return 0; + + rc = lnet_sock_write(sock, hello->kshm_ips, + hello->kshm_nips * sizeof(__u32), + lnet_acceptor_timeout()); + if (rc != 0) { + CNETERR("Error %d sending HELLO payload (%d) to %pISp\n", rc, + hello->kshm_nips, + &conn->ksnc_peeraddr); + } + + return rc; +} + +static int +ksocknal_send_hello_v4(struct ksock_conn *conn, struct ksock_hello_msg *hello) +{ + struct socket *sock = conn->ksnc_sock; + int rc; + + hello->kshm_magic = LNET_PROTO_MAGIC; + hello->kshm_version = conn->ksnc_proto->pro_version; + + rc = lnet_sock_write(sock, hello, sizeof(*hello), + lnet_acceptor_timeout()); + + if (rc != 0) + CNETERR("Error %d sending HELLO hdr to %pISp\n", + rc, &conn->ksnc_peeraddr); + return rc; +} + +static int +ksocknal_recv_hello_v1(struct ksock_conn *conn, struct ksock_hello_msg *hello, + int timeout) +{ + struct socket *sock = conn->ksnc_sock; + struct _lnet_hdr_nid4 *hdr; + int rc; + int i; + + CFS_ALLOC_PTR(hdr); + if (!hdr) { + CERROR("Can't allocate struct lnet_hdr_nid4\n"); + return -ENOMEM; + } + + rc = lnet_sock_read(sock, &hdr->src_nid, + sizeof(*hdr) - offsetof(struct _lnet_hdr_nid4, + src_nid), + timeout); + if (rc != 0) { + CERROR("Error %d reading rest of HELLO hdr from %pIS\n", + rc, &conn->ksnc_peeraddr); + LASSERT(rc < 0 && rc != -EALREADY); + goto out; + } + + /* ...and check we got what we expected */ + if (hdr->type != cpu_to_le32 (LNET_MSG_HELLO)) { + CERROR("Expecting a HELLO hdr, but got type %d from %pIS\n", + le32_to_cpu(hdr->type), + &conn->ksnc_peeraddr); + rc = -EPROTO; + goto out; + } + + lnet_nid4_to_nid(le64_to_cpu(hdr->src_nid), &hello->kshm_src_nid); + hello->kshm_src_pid = le32_to_cpu(hdr->src_pid); + hello->kshm_src_incarnation = le64_to_cpu(hdr->msg.hello.incarnation); + hello->kshm_ctype = le32_to_cpu(hdr->msg.hello.type); + hello->kshm_nips = le32_to_cpu(hdr->payload_length) / sizeof(__u32); + + if (hello->kshm_nips > LNET_INTERFACES_NUM) { + CERROR("Bad nips %d from ip %pIS\n", + hello->kshm_nips, &conn->ksnc_peeraddr); + rc = -EPROTO; + goto out; + } + + if (hello->kshm_nips == 0) + goto out; + + rc = lnet_sock_read(sock, hello->kshm_ips, + hello->kshm_nips * sizeof(__u32), timeout); + if (rc != 0) { + CERROR("Error %d reading IPs from ip %pIS\n", + rc, &conn->ksnc_peeraddr); + LASSERT(rc < 0 && rc != -EALREADY); + goto out; + } + + for (i = 0; i < (int) hello->kshm_nips; i++) { + hello->kshm_ips[i] = __le32_to_cpu(hello->kshm_ips[i]); + + if (hello->kshm_ips[i] == 0) { + CERROR("Zero IP[%d] from ip %pIS\n", + i, &conn->ksnc_peeraddr); + rc = -EPROTO; + break; + } + } +out: + CFS_FREE_PTR(hdr); + + return rc; +} + +static int +ksocknal_recv_hello_v2(struct ksock_conn *conn, struct ksock_hello_msg *hello, + int timeout) +{ + struct socket *sock = conn->ksnc_sock; + struct 
ksock_hello_msg_nid4 *hello4 = (void *)hello; + int rc; + int i; + + if (hello->kshm_magic == LNET_PROTO_MAGIC) + conn->ksnc_flip = 0; + else + conn->ksnc_flip = 1; + + rc = lnet_sock_read(sock, &hello4->kshm_src_nid, + offsetof(struct ksock_hello_msg_nid4, kshm_ips) - + offsetof(struct ksock_hello_msg_nid4, kshm_src_nid), + timeout); + if (rc != 0) { + CERROR("Error %d reading HELLO from %pIS\n", + rc, &conn->ksnc_peeraddr); + LASSERT(rc < 0 && rc != -EALREADY); + return rc; + } + + if (conn->ksnc_flip) { + /* These must be copied in reverse order to avoid corruption. */ + hello->kshm_nips = __swab32(hello4->kshm_nips); + hello->kshm_ctype = __swab32(hello4->kshm_ctype); + hello->kshm_dst_incarnation = __swab64(hello4->kshm_dst_incarnation); + hello->kshm_src_incarnation = __swab64(hello4->kshm_src_incarnation); + hello->kshm_dst_pid = __swab32(hello4->kshm_dst_pid); + hello->kshm_src_pid = __swab32(hello4->kshm_src_pid); + lnet_nid4_to_nid(hello4->kshm_dst_nid, &hello->kshm_dst_nid); + lnet_nid4_to_nid(hello4->kshm_src_nid, &hello->kshm_src_nid); + } else { + /* These must be copied in reverse order to avoid corruption. */ + hello->kshm_nips = hello4->kshm_nips; + hello->kshm_ctype = hello4->kshm_ctype; + hello->kshm_dst_incarnation = hello4->kshm_dst_incarnation; + hello->kshm_src_incarnation = hello4->kshm_src_incarnation; + hello->kshm_dst_pid = hello4->kshm_dst_pid; + hello->kshm_src_pid = hello4->kshm_src_pid; + lnet_nid4_to_nid(hello4->kshm_dst_nid, &hello->kshm_dst_nid); + lnet_nid4_to_nid(hello4->kshm_src_nid, &hello->kshm_src_nid); + } + + if (hello->kshm_nips > LNET_INTERFACES_NUM) { + CERROR("Bad nips %d from ip %pIS\n", + hello->kshm_nips, &conn->ksnc_peeraddr); + return -EPROTO; + } + + if (hello->kshm_nips == 0) + return 0; + + rc = lnet_sock_read(sock, hello->kshm_ips, + hello->kshm_nips * sizeof(__u32), timeout); + if (rc != 0) { + CERROR("Error %d reading IPs from ip %pIS\n", + rc, &conn->ksnc_peeraddr); + LASSERT(rc < 0 && rc != -EALREADY); + return rc; + } + + for (i = 0; i < (int) hello->kshm_nips; i++) { + if (conn->ksnc_flip) + __swab32s(&hello->kshm_ips[i]); + + if (hello->kshm_ips[i] == 0) { + CERROR("Zero IP[%d] from ip %pIS\n", + i, &conn->ksnc_peeraddr); + return -EPROTO; + } + } + + return 0; +} + +static int +ksocknal_recv_hello_v4(struct ksock_conn *conn, struct ksock_hello_msg *hello, + int timeout) +{ + struct socket *sock = conn->ksnc_sock; + int rc; + + if (hello->kshm_magic == LNET_PROTO_MAGIC) + conn->ksnc_flip = 0; + else + conn->ksnc_flip = 1; + + rc = lnet_sock_read(sock, &hello->kshm_src_nid, + sizeof(*hello) - + offsetof(struct ksock_hello_msg, kshm_src_nid), + timeout); + if (rc) { + CERROR("Error %d reading HELLO from %pIS\n", + rc, &conn->ksnc_peeraddr); + LASSERT(rc < 0 && rc != -EALREADY); + return rc; + } + + if (conn->ksnc_flip) { + __swab32s(&hello->kshm_src_pid); + __swab32s(&hello->kshm_dst_pid); + __swab64s(&hello->kshm_src_incarnation); + __swab64s(&hello->kshm_dst_incarnation); + __swab32s(&hello->kshm_ctype); + } + + return 0; +} + +static void +ksocknal_pack_msg_v1(struct ksock_tx *tx) +{ + /* V1.x has no KSOCK_MSG_NOOP */ + LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); + LASSERT(tx->tx_lnetmsg != NULL); + + lnet_hdr_to_nid4(&tx->tx_lnetmsg->msg_hdr, + &tx->tx_msg.ksm_u.lnetmsg_nid4); + tx->tx_hdr.iov_base = (void *)&tx->tx_msg.ksm_u.lnetmsg_nid4; + tx->tx_hdr.iov_len = sizeof(struct lnet_hdr_nid4); + + tx->tx_nob = tx->tx_lnetmsg->msg_len + sizeof(struct lnet_hdr_nid4); + tx->tx_resid = tx->tx_nob; +} + +static void 
+ksocknal_pack_msg_v2(struct ksock_tx *tx) +{ + int hdr_size; + + tx->tx_hdr.iov_base = (void *)&tx->tx_msg; + + switch (tx->tx_msg.ksm_type) { + case KSOCK_MSG_LNET: + LASSERT(tx->tx_lnetmsg != NULL); + hdr_size = (sizeof(struct ksock_msg_hdr) + + sizeof(struct lnet_hdr_nid4)); + + lnet_hdr_to_nid4(&tx->tx_lnetmsg->msg_hdr, + &tx->tx_msg.ksm_u.lnetmsg_nid4); + tx->tx_hdr.iov_len = hdr_size; + tx->tx_resid = tx->tx_nob = hdr_size + tx->tx_lnetmsg->msg_len; + break; + case KSOCK_MSG_NOOP: + LASSERT(tx->tx_lnetmsg == NULL); + hdr_size = sizeof(struct ksock_msg_hdr); + + tx->tx_hdr.iov_len = hdr_size; + tx->tx_resid = tx->tx_nob = hdr_size; + break; + default: + LASSERT(0); + } + /* Don't checksum before start sending, because packet can be + * piggybacked with ACK + */ +} + +static void +ksocknal_pack_msg_v4(struct ksock_tx *tx) +{ + int hdr_size; + + tx->tx_hdr.iov_base = (void *)&tx->tx_msg; + + switch (tx->tx_msg.ksm_type) { + case KSOCK_MSG_LNET: + LASSERT(tx->tx_lnetmsg != NULL); + hdr_size = (sizeof(struct ksock_msg_hdr) + + sizeof(struct lnet_hdr_nid16)); + + lnet_hdr_to_nid16(&tx->tx_lnetmsg->msg_hdr, + &tx->tx_msg.ksm_u.lnetmsg_nid16); + tx->tx_hdr.iov_len = hdr_size; + tx->tx_resid = tx->tx_nob = hdr_size + tx->tx_lnetmsg->msg_len; + break; + case KSOCK_MSG_NOOP: + LASSERT(tx->tx_lnetmsg == NULL); + hdr_size = sizeof(struct ksock_msg_hdr); + + tx->tx_hdr.iov_len = hdr_size; + tx->tx_resid = tx->tx_nob = hdr_size; + break; + default: + LASSERT(0); + } + /* Don't checksum before start sending, because packet can be + * piggybacked with ACK + */ +} + +static void +ksocknal_unpack_msg_v1(struct ksock_msg *msg, struct lnet_hdr *hdr) +{ + msg->ksm_csum = 0; + msg->ksm_type = KSOCK_MSG_LNET; + msg->ksm_zc_cookies[0] = msg->ksm_zc_cookies[1] = 0; + lnet_hdr_from_nid4(hdr, &msg->ksm_u.lnetmsg_nid4); +} + +static void +ksocknal_unpack_msg_v2(struct ksock_msg *msg, struct lnet_hdr *hdr) +{ + lnet_hdr_from_nid4(hdr, &msg->ksm_u.lnetmsg_nid4); +} + +static void +ksocknal_unpack_msg_v4(struct ksock_msg *msg, struct lnet_hdr *hdr) +{ + lnet_hdr_from_nid16(hdr, &msg->ksm_u.lnetmsg_nid16); +} + +const struct ksock_proto ksocknal_protocol_v1x = +{ + .pro_version = KSOCK_PROTO_V1, + .pro_send_hello = ksocknal_send_hello_v1, + .pro_recv_hello = ksocknal_recv_hello_v1, + .pro_pack = ksocknal_pack_msg_v1, + .pro_unpack = ksocknal_unpack_msg_v1, + .pro_queue_tx_msg = ksocknal_queue_tx_msg_v1, + .pro_handle_zcreq = NULL, + .pro_handle_zcack = NULL, + .pro_queue_tx_zcack = NULL, + .pro_match_tx = ksocknal_match_tx +}; + +const struct ksock_proto ksocknal_protocol_v2x = +{ + .pro_version = KSOCK_PROTO_V2, + .pro_send_hello = ksocknal_send_hello_v2, + .pro_recv_hello = ksocknal_recv_hello_v2, + .pro_pack = ksocknal_pack_msg_v2, + .pro_unpack = ksocknal_unpack_msg_v2, + .pro_queue_tx_msg = ksocknal_queue_tx_msg_v2, + .pro_queue_tx_zcack = ksocknal_queue_tx_zcack_v2, + .pro_handle_zcreq = ksocknal_handle_zcreq, + .pro_handle_zcack = ksocknal_handle_zcack, + .pro_match_tx = ksocknal_match_tx +}; + +const struct ksock_proto ksocknal_protocol_v3x = +{ + .pro_version = KSOCK_PROTO_V3, + .pro_send_hello = ksocknal_send_hello_v2, + .pro_recv_hello = ksocknal_recv_hello_v2, + .pro_pack = ksocknal_pack_msg_v2, + .pro_unpack = ksocknal_unpack_msg_v2, + .pro_queue_tx_msg = ksocknal_queue_tx_msg_v2, + .pro_queue_tx_zcack = ksocknal_queue_tx_zcack_v3, + .pro_handle_zcreq = ksocknal_handle_zcreq, + .pro_handle_zcack = ksocknal_handle_zcack, + .pro_match_tx = ksocknal_match_tx_v3 +}; + +const struct ksock_proto 
ksocknal_protocol_v4x = {
+	.pro_version = KSOCK_PROTO_V4,
+	.pro_send_hello = ksocknal_send_hello_v4,
+	.pro_recv_hello = ksocknal_recv_hello_v4,
+	.pro_pack = ksocknal_pack_msg_v4,
+	.pro_unpack = ksocknal_unpack_msg_v4,
+	.pro_queue_tx_msg = ksocknal_queue_tx_msg_v2,
+	.pro_queue_tx_zcack = ksocknal_queue_tx_zcack_v3,
+	.pro_handle_zcreq = ksocknal_handle_zcreq,
+	.pro_handle_zcack = ksocknal_handle_zcack,
+	.pro_match_tx = ksocknal_match_tx_v4,
+};
diff --git a/drivers/staging/lustrefsx/lnet/lnet/acceptor.c b/drivers/staging/lustrefsx/lnet/lnet/acceptor.c
new file mode 100644
index 0000000000000..51ffd29da7c1b
--- /dev/null
+++ b/drivers/staging/lustrefsx/lnet/lnet/acceptor.c
@@ -0,0 +1,570 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/completion.h>
+#include <net/sock.h>
+#include <lnet/lib-lnet.h>
+#include <linux/sunrpc/addr.h>
+
+static int accept_port = 988;
+static int accept_backlog = 127;
+static int accept_timeout = 5;
+
+static struct {
+	int			pta_shutdown;
+	struct socket		*pta_sock;
+	struct completion	pta_signal;
+	struct net		*pta_ns;
+	wait_queue_head_t	pta_waitq;
+	atomic_t		pta_ready;
+#ifdef HAVE_SK_DATA_READY_ONE_ARG
+	void			(*pta_odata)(struct sock *);
+#else
+	void			(*pta_odata)(struct sock *, int);
+#endif
+} lnet_acceptor_state = {
+	.pta_shutdown = 1
+};
+
+int
+lnet_acceptor_port(void)
+{
+	return accept_port;
+}
+
+static inline int
+lnet_accept_magic(__u32 magic, __u32 constant)
+{
+	return (magic == constant ||
+		magic == __swab32(constant));
+}
+
+EXPORT_SYMBOL(lnet_acceptor_port);
+
+static char *accept_type = "secure";
+
+module_param_named(accept, accept_type, charp, 0444);
+MODULE_PARM_DESC(accept, "Accept connections (secure|all|none)");
+module_param(accept_port, int, 0444);
+MODULE_PARM_DESC(accept_port, "Acceptor's port (same on all nodes)");
+module_param(accept_backlog, int, 0444);
+MODULE_PARM_DESC(accept_backlog, "Acceptor's listen backlog");
+module_param(accept_timeout, int, 0644);
+MODULE_PARM_DESC(accept_timeout, "Acceptor's timeout (seconds)");
+
+int
+lnet_acceptor_timeout(void)
+{
+	return accept_timeout;
+}
+EXPORT_SYMBOL(lnet_acceptor_timeout);
+
+void
+lnet_connect_console_error(int rc, struct lnet_nid *peer_nid,
+			   struct sockaddr *sa)
+{
+	switch (rc) {
+	/* "normal" errors */
+	case -ECONNREFUSED:
+		CNETERR("Connection to %s at host %pISp was refused: check that Lustre is running on that node.\n",
+			libcfs_nidstr(peer_nid), sa);
+		break;
+	case -EHOSTUNREACH:
+	case -ENETUNREACH:
+		CNETERR("Connection to %s at host %pIS was unreachable: the network
or that node may be down, or Lustre may be misconfigured.\n", + libcfs_nidstr(peer_nid), sa); + break; + case -ETIMEDOUT: + CNETERR("Connection to %s at host %pISp took too long: that node may be hung or experiencing high load.\n", + libcfs_nidstr(peer_nid), sa); + break; + case -ECONNRESET: + LCONSOLE_ERROR_MSG(0x11b, + "Connection to %s at host %pISp was reset: is it running a compatible version of Lustre and is %s one of its NIDs?\n", + libcfs_nidstr(peer_nid), sa, + libcfs_nidstr(peer_nid)); + break; + case -EPROTO: + LCONSOLE_ERROR_MSG(0x11c, + "Protocol error connecting to %s at host %pISp: is it running a compatible version of Lustre?\n", + libcfs_nidstr(peer_nid), sa); + break; + case -EADDRINUSE: + LCONSOLE_ERROR_MSG(0x11d, + "No privileged ports available to connect to %s at host %pISp\n", + libcfs_nidstr(peer_nid), sa); + break; + default: + LCONSOLE_ERROR_MSG(0x11e, + "Unexpected error %d connecting to %s at host %pISp\n", + rc, libcfs_nidstr(peer_nid), sa); + break; + } +} +EXPORT_SYMBOL(lnet_connect_console_error); + +struct socket * +lnet_connect(struct lnet_nid *peer_nid, int interface, + struct sockaddr *peeraddr, + struct net *ns) +{ + struct lnet_acceptor_connreq cr1; + struct lnet_acceptor_connreq_v2 cr2; + void *cr; + int crsize; + struct socket *sock; + int rc; + int port; + + BUILD_BUG_ON(sizeof(cr) > 16); /* not too big to be on the stack */ + + LASSERT(peeraddr->sa_family == AF_INET || + peeraddr->sa_family == AF_INET6); + + for (port = LNET_ACCEPTOR_MAX_RESERVED_PORT; + port >= LNET_ACCEPTOR_MIN_RESERVED_PORT; + --port) { + /* Iterate through reserved ports. */ + sock = lnet_sock_connect(interface, port, peeraddr, ns); + if (IS_ERR(sock)) { + rc = PTR_ERR(sock); + if (rc == -EADDRINUSE || rc == -EADDRNOTAVAIL) + continue; + goto failed; + } + + BUILD_BUG_ON(LNET_PROTO_ACCEPTOR_VERSION != 1); + + if (nid_is_nid4(peer_nid)) { + cr1.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC; + cr1.acr_version = LNET_PROTO_ACCEPTOR_VERSION; + cr1.acr_nid = lnet_nid_to_nid4(peer_nid); + cr = &cr1; + crsize = sizeof(cr1); + + if (the_lnet.ln_testprotocompat) { + /* single-shot proto check */ + if (test_and_clear_bit( + 2, &the_lnet.ln_testprotocompat)) + cr1.acr_version++; + if (test_and_clear_bit( + 3, &the_lnet.ln_testprotocompat)) + cr1.acr_magic = LNET_PROTO_MAGIC; + } + + } else { + cr2.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC; + cr2.acr_version = LNET_PROTO_ACCEPTOR_VERSION_16; + cr2.acr_nid = *peer_nid; + cr = &cr2; + crsize = sizeof(cr2); + } + + rc = lnet_sock_write(sock, cr, crsize, accept_timeout); + if (rc != 0) + goto failed_sock; + + return sock; + } + + rc = -EADDRINUSE; + goto failed; + +failed_sock: + sock_release(sock); +failed: + lnet_connect_console_error(rc, peer_nid, peeraddr); + return ERR_PTR(rc); +} +EXPORT_SYMBOL(lnet_connect); + +static int +lnet_accept(struct socket *sock, __u32 magic) +{ + struct lnet_acceptor_connreq cr; + struct lnet_acceptor_connreq_v2 cr2; + struct lnet_nid nid; + struct sockaddr_storage peer; + int peer_version; + int rc; + int flip; + struct lnet_ni *ni; + char *str; + + LASSERT(sizeof(cr) <= 16); /* not too big for the stack */ + + rc = lnet_sock_getaddr(sock, true, &peer); + if (rc != 0) { + CERROR("Can't determine new connection's address\n"); + return rc; + } + + if (!lnet_accept_magic(magic, LNET_PROTO_ACCEPTOR_MAGIC)) { + + if (lnet_accept_magic(magic, LNET_PROTO_MAGIC)) { + /* future version compatibility! + * When LNET unifies protocols over all LNDs, the first + * thing sent will be a version query. 
I send back + * LNET_PROTO_ACCEPTOR_MAGIC to tell her I'm "old" */ + + memset(&cr, 0, sizeof(cr)); + cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC; + cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION; + rc = lnet_sock_write(sock, &cr, sizeof(cr), + accept_timeout); + + if (rc != 0) + CERROR("Error sending magic+version in response to LNET magic from %pIS: %d\n", + &peer, rc); + return -EPROTO; + } + + if (lnet_accept_magic(magic, LNET_PROTO_TCP_MAGIC)) + str = "'old' socknal/tcpnal"; + else + str = "unrecognised"; + + LCONSOLE_ERROR_MSG(0x11f, "Refusing connection from %pIS" + " magic %08x: %s acceptor protocol\n", + &peer, magic, str); + return -EPROTO; + } + + flip = (magic != LNET_PROTO_ACCEPTOR_MAGIC); + + rc = lnet_sock_read(sock, &cr.acr_version, + sizeof(cr.acr_version), + accept_timeout); + if (rc != 0) { + CERROR("Error %d reading connection request version from %pIS\n", + rc, &peer); + return -EIO; + } + + if (flip) + __swab32s(&cr.acr_version); + + switch (cr.acr_version) { + default: + /* future version compatibility! + * An acceptor-specific protocol rev will first send a version + * query. I send back my current version to tell her I'm + * "old". */ + peer_version = cr.acr_version; + + memset(&cr, 0, sizeof(cr)); + cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC; + cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION; + + rc = lnet_sock_write(sock, &cr, sizeof(cr), + accept_timeout); + + if (rc != 0) + CERROR("Error sending magic+version in response to version %d from %pIS: %d\n", + peer_version, &peer, rc); + return -EPROTO; + + case LNET_PROTO_ACCEPTOR_VERSION: + + rc = lnet_sock_read(sock, &cr.acr_nid, + sizeof(cr) - + offsetof(struct lnet_acceptor_connreq, + acr_nid), + accept_timeout); + if (rc) + break; + if (flip) + __swab64s(&cr.acr_nid); + + lnet_nid4_to_nid(cr.acr_nid, &nid); + break; + + case LNET_PROTO_ACCEPTOR_VERSION_16: + rc = lnet_sock_read(sock, &cr2.acr_nid, + sizeof(cr2) - + offsetof(struct lnet_acceptor_connreq_v2, + acr_nid), + accept_timeout); + if (rc) + break; + nid = cr2.acr_nid; + break; + } + if (rc != 0) { + CERROR("Error %d reading connection request from %pIS\n", + rc, &peer); + return -EIO; + } + + ni = lnet_nid_to_ni_addref(&nid); + if (ni == NULL || /* no matching net */ + !nid_same(&ni->ni_nid, &nid)) { + /* right NET, wrong NID! 
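+		 * Either no local net matches the requested NID at all,
+		 * or an NI exists on that net but under a different NID,
+		 * so refuse the connection.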
		 */
+		if (ni != NULL)
+			lnet_ni_decref(ni);
+		LCONSOLE_ERROR_MSG(0x120,
+				   "Refusing connection from %pIS for %s: No matching NI\n",
+				   &peer, libcfs_nidstr(&nid));
+		return -EPERM;
+	}
+
+	if (ni->ni_net->net_lnd->lnd_accept == NULL) {
+		/* This catches a request for the loopback LND */
+		lnet_ni_decref(ni);
+		LCONSOLE_ERROR_MSG(0x121,
+				   "Refusing connection from %pIS for %s: NI does not accept IP connections\n",
+				   &peer, libcfs_nidstr(&nid));
+		return -EPERM;
+	}
+
+	CDEBUG(D_NET, "Accept %s from %pIS\n", libcfs_nidstr(&nid), &peer);
+
+	rc = ni->ni_net->net_lnd->lnd_accept(ni, sock);
+
+	lnet_ni_decref(ni);
+	return rc;
+}
+
+#ifdef HAVE_SK_DATA_READY_ONE_ARG
+static void lnet_acceptor_ready(struct sock *sk)
+#else
+static void lnet_acceptor_ready(struct sock *sk, int len)
+#endif
+{
+	/* Ensure pta_odata has actually been set before calling it */
+	rmb();
+#ifdef HAVE_SK_DATA_READY_ONE_ARG
+	lnet_acceptor_state.pta_odata(sk);
+#else
+	lnet_acceptor_state.pta_odata(sk, 0);
+#endif
+
+	atomic_set(&lnet_acceptor_state.pta_ready, 1);
+	wake_up(&lnet_acceptor_state.pta_waitq);
+}
+
+static int
+lnet_acceptor(void *arg)
+{
+	struct socket *newsock;
+	int rc;
+	__u32 magic;
+	struct sockaddr_storage peer;
+	int secure = (int)((uintptr_t)arg);
+
+	LASSERT(lnet_acceptor_state.pta_sock == NULL);
+
+	lnet_acceptor_state.pta_sock =
+		lnet_sock_listen(accept_port, accept_backlog,
+				 lnet_acceptor_state.pta_ns);
+	if (IS_ERR(lnet_acceptor_state.pta_sock)) {
+		rc = PTR_ERR(lnet_acceptor_state.pta_sock);
+		if (rc == -EADDRINUSE)
+			LCONSOLE_ERROR_MSG(0x122, "Can't start acceptor on port"
+					   " %d: port already in use\n",
+					   accept_port);
+		else
+			LCONSOLE_ERROR_MSG(0x123, "Can't start acceptor on port "
+					   "%d: unexpected error %d\n",
+					   accept_port, rc);
+
+		lnet_acceptor_state.pta_sock = NULL;
+	} else {
+		rc = 0;
+		LCONSOLE(0, "Accept %s, port %d\n", accept_type, accept_port);
+		init_waitqueue_head(&lnet_acceptor_state.pta_waitq);
+		lnet_acceptor_state.pta_odata =
+			lnet_acceptor_state.pta_sock->sk->sk_data_ready;
+		/* ensure pta_odata gets set before there is any chance of
+		 * lnet_acceptor_ready() trying to read it.
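+		 * This wmb() pairs with the rmb() at the top of
+		 * lnet_acceptor_ready().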
+ */ + wmb(); + lnet_acceptor_state.pta_sock->sk->sk_data_ready = + lnet_acceptor_ready; + atomic_set(&lnet_acceptor_state.pta_ready, 1); + } + + /* set init status and unblock parent */ + lnet_acceptor_state.pta_shutdown = rc; + complete(&lnet_acceptor_state.pta_signal); + + if (rc != 0) + return rc; + + while (!lnet_acceptor_state.pta_shutdown) { + + wait_event_idle(lnet_acceptor_state.pta_waitq, + lnet_acceptor_state.pta_shutdown || + atomic_read(&lnet_acceptor_state.pta_ready)); + if (!atomic_read(&lnet_acceptor_state.pta_ready)) + continue; + atomic_set(&lnet_acceptor_state.pta_ready, 0); + rc = kernel_accept(lnet_acceptor_state.pta_sock, &newsock, + SOCK_NONBLOCK); + if (rc != 0) { + if (rc != -EAGAIN) { + CWARN("Accept error %d: pausing...\n", rc); + schedule_timeout_uninterruptible( + cfs_time_seconds(1)); + } + continue; + } + + /* make sure we call lnet_sock_accept() again, until it fails */ + atomic_set(&lnet_acceptor_state.pta_ready, 1); + + rc = lnet_sock_getaddr(newsock, true, &peer); + if (rc != 0) { + CERROR("Can't determine new connection's address\n"); + goto failed; + } + + if (secure && + rpc_get_port((struct sockaddr *)&peer) > + LNET_ACCEPTOR_MAX_RESERVED_PORT) { + CERROR("Refusing connection from %pISp: insecure port.\n", + &peer); + goto failed; + } + + rc = lnet_sock_read(newsock, &magic, sizeof(magic), + accept_timeout); + if (rc != 0) { + CERROR("Error %d reading connection request from %pIS\n", + rc, &peer); + goto failed; + } + + rc = lnet_accept(newsock, magic); + if (rc != 0) + goto failed; + + continue; + +failed: + sock_release(newsock); + } + + lnet_acceptor_state.pta_sock->sk->sk_data_ready = + lnet_acceptor_state.pta_odata; + sock_release(lnet_acceptor_state.pta_sock); + lnet_acceptor_state.pta_sock = NULL; + + CDEBUG(D_NET, "Acceptor stopping\n"); + + /* unblock lnet_acceptor_stop() */ + complete(&lnet_acceptor_state.pta_signal); + return 0; +} + +static inline int +accept2secure(const char *acc, long *sec) +{ + if (!strcmp(acc, "secure")) { + *sec = 1; + return 1; + } else if (!strcmp(acc, "all")) { + *sec = 0; + return 1; + } else if (!strcmp(acc, "none")) { + return 0; + } else { + LCONSOLE_ERROR_MSG(0x124, "Can't parse 'accept=\"%s\"'\n", + acc); + return -EINVAL; + } +} + +int +lnet_acceptor_start(void) +{ + struct task_struct *task; + int rc; + long rc2; + long secure; + + /* if acceptor is already running return immediately */ + if (!lnet_acceptor_state.pta_shutdown) + return 0; + + LASSERT(lnet_acceptor_state.pta_sock == NULL); + + init_completion(&lnet_acceptor_state.pta_signal); + rc = accept2secure(accept_type, &secure); + if (rc <= 0) + return rc; + + if (lnet_count_acceptor_nets() == 0) /* not required */ + return 0; + if (current->nsproxy && current->nsproxy->net_ns) + lnet_acceptor_state.pta_ns = current->nsproxy->net_ns; + else + lnet_acceptor_state.pta_ns = &init_net; + task = kthread_run(lnet_acceptor, (void *)(uintptr_t)secure, + "acceptor_%03ld", secure); + if (IS_ERR(task)) { + rc2 = PTR_ERR(task); + CERROR("Can't start acceptor thread: %ld\n", rc2); + return -ESRCH; + } + + /* wait for acceptor to startup */ + wait_for_completion(&lnet_acceptor_state.pta_signal); + + if (!lnet_acceptor_state.pta_shutdown) { + /* started OK */ + LASSERT(lnet_acceptor_state.pta_sock != NULL); + return 0; + } + + LASSERT(lnet_acceptor_state.pta_sock == NULL); + + return -ENETDOWN; +} + +void +lnet_acceptor_stop(void) +{ + if (lnet_acceptor_state.pta_shutdown) /* not running */ + return; + + /* If still required, return immediately */ + if 
(the_lnet.ln_refcount && lnet_count_acceptor_nets() > 0)
+		return;
+
+	lnet_acceptor_state.pta_shutdown = 1;
+	wake_up(&lnet_acceptor_state.pta_waitq);
+
+	/* block until acceptor signals exit */
+	wait_for_completion(&lnet_acceptor_state.pta_signal);
+}
diff --git a/drivers/staging/lustrefsx/lnet/lnet/api-ni.c b/drivers/staging/lustrefsx/lnet/lnet/api-ni.c
new file mode 100644
index 0000000000000..b99c85b73d0e0
--- /dev/null
+++ b/drivers/staging/lustrefsx/lnet/lnet/api-ni.c
@@ -0,0 +1,4884 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/ctype.h>
+#include <linux/log2.h>
+#include <linux/ktime.h>
+#include <linux/moduleparam.h>
+#include <linux/uaccess.h>
+#ifdef HAVE_SCHED_HEADERS
+#include <linux/sched/signal.h>
+#endif
+#include <lnet/udsp.h>
+#include <lnet/lib-lnet.h>
+
+#define D_LNI D_CONSOLE
+
+/*
+ * initialize ln_api_mutex statically, since it needs to be used in
+ * discovery_set callback. That module parameter callback can be called
+ * before module init completes. The mutex needs to be ready for use then.
+ */
+struct lnet the_lnet = {
+	.ln_api_mutex = __MUTEX_INITIALIZER(the_lnet.ln_api_mutex),
+};	/* THE state of the network */
+EXPORT_SYMBOL(the_lnet);
+
+static char *ip2nets = "";
+module_param(ip2nets, charp, 0444);
+MODULE_PARM_DESC(ip2nets, "LNET network <- IP table");
+
+static char *networks = "";
+module_param(networks, charp, 0444);
+MODULE_PARM_DESC(networks, "local networks");
+
+static char *routes = "";
+module_param(routes, charp, 0444);
+MODULE_PARM_DESC(routes, "routes to non-local networks");
+
+static int rnet_htable_size = LNET_REMOTE_NETS_HASH_DEFAULT;
+module_param(rnet_htable_size, int, 0444);
+MODULE_PARM_DESC(rnet_htable_size, "size of remote network hash table");
+
+static int use_tcp_bonding;
+module_param(use_tcp_bonding, int, 0444);
+MODULE_PARM_DESC(use_tcp_bonding,
+		 "use_tcp_bonding parameter has been removed");
+
+unsigned int lnet_numa_range = 0;
+module_param(lnet_numa_range, uint, 0444);
+MODULE_PARM_DESC(lnet_numa_range,
+		 "NUMA range to consider during Multi-Rail selection");
+
+/*
+ * lnet_health_sensitivity determines by how much we decrement the health
+ * value on sending error. The value defaults to 100, which means an
+ * interface's health is decremented by 100 points on every send failure.
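+ * An interface starts at LNET_MAX_HEALTH_VALUE (1000), so the default
+ * tolerates roughly ten consecutive failures before the interface is
+ * considered fully unhealthy; setting the sensitivity to 0 disables
+ * health accounting (and, as sensitivity_set() below enforces, also
+ * clears lnet_retry_count).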
+ */
+unsigned int lnet_health_sensitivity = 100;
+static int sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp);
+#ifdef HAVE_KERNEL_PARAM_OPS
+static struct kernel_param_ops param_ops_health_sensitivity = {
+	.set = sensitivity_set,
+	.get = param_get_int,
+};
+#define param_check_health_sensitivity(name, p) \
+		__param_check(name, p, int)
+module_param(lnet_health_sensitivity, health_sensitivity, S_IRUGO|S_IWUSR);
+#else
+module_param_call(lnet_health_sensitivity, sensitivity_set, param_get_int,
+		  &lnet_health_sensitivity, S_IRUGO|S_IWUSR);
+#endif
+MODULE_PARM_DESC(lnet_health_sensitivity,
+		"Value to decrement the health value by on error");
+
+/*
+ * lnet_recovery_interval determines how often we should perform recovery
+ * on unhealthy interfaces.
+ */
+unsigned int lnet_recovery_interval = 1;
+static int recovery_interval_set(const char *val, cfs_kernel_param_arg_t *kp);
+#ifdef HAVE_KERNEL_PARAM_OPS
+static struct kernel_param_ops param_ops_recovery_interval = {
+	.set = recovery_interval_set,
+	.get = param_get_int,
+};
+#define param_check_recovery_interval(name, p) \
+		__param_check(name, p, int)
+module_param(lnet_recovery_interval, recovery_interval, S_IRUGO|S_IWUSR);
+#else
+module_param_call(lnet_recovery_interval, recovery_interval_set, param_get_int,
+		  &lnet_recovery_interval, S_IRUGO|S_IWUSR);
+#endif
+MODULE_PARM_DESC(lnet_recovery_interval,
+		"DEPRECATED - Interval to recover unhealthy interfaces in seconds");
+
+unsigned int lnet_recovery_limit;
+module_param(lnet_recovery_limit, uint, 0644);
+MODULE_PARM_DESC(lnet_recovery_limit,
+		 "How long to attempt recovery of unhealthy peer interfaces in seconds. Set to 0 to allow indefinite recovery");
+
+static int lnet_interfaces_max = LNET_INTERFACES_MAX_DEFAULT;
+static int intf_max_set(const char *val, cfs_kernel_param_arg_t *kp);
+
+static struct kernel_param_ops param_ops_interfaces_max = {
+	.set = intf_max_set,
+	.get = param_get_int,
+};
+
+#define param_check_interfaces_max(name, p) \
+		__param_check(name, p, int)
+
+#ifdef HAVE_KERNEL_PARAM_OPS
+module_param(lnet_interfaces_max, interfaces_max, 0644);
+#else
+module_param_call(lnet_interfaces_max, intf_max_set, param_get_int,
+		  &param_ops_interfaces_max, 0644);
+#endif
+MODULE_PARM_DESC(lnet_interfaces_max,
+		 "Maximum number of interfaces in a node.");
+
+unsigned lnet_peer_discovery_disabled = 0;
+static int discovery_set(const char *val, cfs_kernel_param_arg_t *kp);
+
+static struct kernel_param_ops param_ops_discovery_disabled = {
+	.set = discovery_set,
+	.get = param_get_int,
+};
+
+#define param_check_discovery_disabled(name, p) \
+		__param_check(name, p, int)
+#ifdef HAVE_KERNEL_PARAM_OPS
+module_param(lnet_peer_discovery_disabled, discovery_disabled, 0644);
+#else
+module_param_call(lnet_peer_discovery_disabled, discovery_set, param_get_int,
+		  &param_ops_discovery_disabled, 0644);
+#endif
+MODULE_PARM_DESC(lnet_peer_discovery_disabled,
+		 "Set to 1 to disable peer discovery on this node.");
+
+unsigned int lnet_drop_asym_route;
+static int drop_asym_route_set(const char *val, cfs_kernel_param_arg_t *kp);
+
+static struct kernel_param_ops param_ops_drop_asym_route = {
+	.set = drop_asym_route_set,
+	.get = param_get_int,
+};
+
+#define param_check_drop_asym_route(name, p) \
+	__param_check(name, p, int)
+#ifdef HAVE_KERNEL_PARAM_OPS
+module_param(lnet_drop_asym_route, drop_asym_route, 0644);
+#else
+module_param_call(lnet_drop_asym_route, drop_asym_route_set, param_get_int,
+		  &param_ops_drop_asym_route, 0644);
+#endif
+MODULE_PARM_DESC(lnet_drop_asym_route,
"Set to 1 to drop asymmetrical route messages."); + +#define LNET_TRANSACTION_TIMEOUT_DEFAULT 50 +unsigned int lnet_transaction_timeout = LNET_TRANSACTION_TIMEOUT_DEFAULT; +static int transaction_to_set(const char *val, cfs_kernel_param_arg_t *kp); +#ifdef HAVE_KERNEL_PARAM_OPS +static struct kernel_param_ops param_ops_transaction_timeout = { + .set = transaction_to_set, + .get = param_get_int, +}; + +#define param_check_transaction_timeout(name, p) \ + __param_check(name, p, int) +module_param(lnet_transaction_timeout, transaction_timeout, S_IRUGO|S_IWUSR); +#else +module_param_call(lnet_transaction_timeout, transaction_to_set, param_get_int, + &lnet_transaction_timeout, S_IRUGO|S_IWUSR); +#endif +MODULE_PARM_DESC(lnet_transaction_timeout, + "Maximum number of seconds to wait for a peer response."); + +#define LNET_RETRY_COUNT_DEFAULT 2 +unsigned int lnet_retry_count = LNET_RETRY_COUNT_DEFAULT; +static int retry_count_set(const char *val, cfs_kernel_param_arg_t *kp); +#ifdef HAVE_KERNEL_PARAM_OPS +static struct kernel_param_ops param_ops_retry_count = { + .set = retry_count_set, + .get = param_get_int, +}; + +#define param_check_retry_count(name, p) \ + __param_check(name, p, int) +module_param(lnet_retry_count, retry_count, S_IRUGO|S_IWUSR); +#else +module_param_call(lnet_retry_count, retry_count_set, param_get_int, + &lnet_retry_count, S_IRUGO|S_IWUSR); +#endif +MODULE_PARM_DESC(lnet_retry_count, + "Maximum number of times to retry transmitting a message"); + +unsigned int lnet_response_tracking = 3; +static int response_tracking_set(const char *val, cfs_kernel_param_arg_t *kp); + +#ifdef HAVE_KERNEL_PARAM_OPS +static struct kernel_param_ops param_ops_response_tracking = { + .set = response_tracking_set, + .get = param_get_int, +}; + +#define param_check_response_tracking(name, p) \ + __param_check(name, p, int) +module_param(lnet_response_tracking, response_tracking, 0644); +#else +module_param_call(lnet_response_tracking, response_tracking_set, param_get_int, + &lnet_response_tracking, 0644); +#endif +MODULE_PARM_DESC(lnet_response_tracking, + "(0|1|2|3) LNet Internal Only|GET Reply only|PUT ACK only|Full Tracking (default)"); + +#define LNET_LND_TIMEOUT_DEFAULT ((LNET_TRANSACTION_TIMEOUT_DEFAULT - 1) / \ + (LNET_RETRY_COUNT_DEFAULT + 1)) +unsigned int lnet_lnd_timeout = LNET_LND_TIMEOUT_DEFAULT; +static void lnet_set_lnd_timeout(void) +{ + lnet_lnd_timeout = (lnet_transaction_timeout - 1) / + (lnet_retry_count + 1); +} + +/* + * This sequence number keeps track of how many times DLC was used to + * update the local NIs. It is incremented when a NI is added or + * removed and checked when sending a message to determine if there is + * a need to re-run the selection algorithm. See lnet_select_pathway() + * for more details on its usage. + */ +static atomic_t lnet_dlc_seq_no = ATOMIC_INIT(0); + +static int lnet_ping(struct lnet_process_id id, struct lnet_nid *src_nid, + signed long timeout, struct lnet_process_id __user *ids, + int n_ids); + +static int lnet_discover(struct lnet_process_id id, __u32 force, + struct lnet_process_id __user *ids, int n_ids); + +static int +sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned *sensitivity = (unsigned *)kp->arg; + unsigned long value; + + rc = kstrtoul(val, 0, &value); + if (rc) { + CERROR("Invalid module parameter value for 'lnet_health_sensitivity'\n"); + return rc; + } + + /* + * The purpose of locking the api_mutex here is to ensure that + * the correct value ends up stored properly. 
+ */ + mutex_lock(&the_lnet.ln_api_mutex); + + if (value > LNET_MAX_HEALTH_VALUE) { + mutex_unlock(&the_lnet.ln_api_mutex); + CERROR("Invalid health value. Maximum: %d value = %lu\n", + LNET_MAX_HEALTH_VALUE, value); + return -EINVAL; + } + + if (*sensitivity != 0 && value == 0 && lnet_retry_count != 0) { + lnet_retry_count = 0; + lnet_set_lnd_timeout(); + } + + *sensitivity = value; + + mutex_unlock(&the_lnet.ln_api_mutex); + + return 0; +} + +static int +recovery_interval_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + CWARN("'lnet_recovery_interval' has been deprecated\n"); + + return 0; +} + +static int +discovery_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned *discovery_off = (unsigned *)kp->arg; + unsigned long value; + struct lnet_ping_buffer *pbuf; + + rc = kstrtoul(val, 0, &value); + if (rc) { + CERROR("Invalid module parameter value for 'lnet_peer_discovery_disabled'\n"); + return rc; + } + + value = (value) ? 1 : 0; + + /* + * The purpose of locking the api_mutex here is to ensure that + * the correct value ends up stored properly. + */ + mutex_lock(&the_lnet.ln_api_mutex); + + if (value == *discovery_off) { + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + + /* + * We still want to set the discovery value even when LNet is not + * running. This is the case when LNet is being loaded and we want + * the module parameters to take effect. Otherwise if we're + * changing the value dynamically, we want to set it after + * updating the peers + */ + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + *discovery_off = value; + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + + /* tell peers that discovery setting has changed */ + lnet_net_lock(LNET_LOCK_EX); + pbuf = the_lnet.ln_ping_target; + if (value) + pbuf->pb_info.pi_features &= ~LNET_PING_FEAT_DISCOVERY; + else + pbuf->pb_info.pi_features |= LNET_PING_FEAT_DISCOVERY; + lnet_net_unlock(LNET_LOCK_EX); + + /* only send a push when we're turning off discovery */ + if (*discovery_off <= 0 && value > 0) + lnet_push_update_to_peers(1); + *discovery_off = value; + + mutex_unlock(&the_lnet.ln_api_mutex); + + return 0; +} + +static int +drop_asym_route_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned int *drop_asym_route = (unsigned int *)kp->arg; + unsigned long value; + + rc = kstrtoul(val, 0, &value); + if (rc) { + CERROR("Invalid module parameter value for " + "'lnet_drop_asym_route'\n"); + return rc; + } + + /* + * The purpose of locking the api_mutex here is to ensure that + * the correct value ends up stored properly. + */ + mutex_lock(&the_lnet.ln_api_mutex); + + if (value == *drop_asym_route) { + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + + *drop_asym_route = value; + + mutex_unlock(&the_lnet.ln_api_mutex); + + return 0; +} + +static int +transaction_to_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned *transaction_to = (unsigned *)kp->arg; + unsigned long value; + + rc = kstrtoul(val, 0, &value); + if (rc) { + CERROR("Invalid module parameter value for 'lnet_transaction_timeout'\n"); + return rc; + } + + /* + * The purpose of locking the api_mutex here is to ensure that + * the correct value ends up stored properly. + */ + mutex_lock(&the_lnet.ln_api_mutex); + + if (value <= lnet_retry_count || value == 0) { + mutex_unlock(&the_lnet.ln_api_mutex); + CERROR("Invalid value for lnet_transaction_timeout (%lu). 
" + "Has to be greater than lnet_retry_count (%u)\n", + value, lnet_retry_count); + return -EINVAL; + } + + if (value == *transaction_to) { + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + + *transaction_to = value; + /* Update the lnet_lnd_timeout now that we've modified the + * transaction timeout + */ + lnet_set_lnd_timeout(); + + mutex_unlock(&the_lnet.ln_api_mutex); + + return 0; +} + +static int +retry_count_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned *retry_count = (unsigned *)kp->arg; + unsigned long value; + + rc = kstrtoul(val, 0, &value); + if (rc) { + CERROR("Invalid module parameter value for 'lnet_retry_count'\n"); + return rc; + } + + /* + * The purpose of locking the api_mutex here is to ensure that + * the correct value ends up stored properly. + */ + mutex_lock(&the_lnet.ln_api_mutex); + + if (lnet_health_sensitivity == 0 && value > 0) { + mutex_unlock(&the_lnet.ln_api_mutex); + CERROR("Can not set lnet_retry_count when health feature is turned off\n"); + return -EINVAL; + } + + if (value > lnet_transaction_timeout) { + mutex_unlock(&the_lnet.ln_api_mutex); + CERROR("Invalid value for lnet_retry_count (%lu). " + "Has to be smaller than lnet_transaction_timeout (%u)\n", + value, lnet_transaction_timeout); + return -EINVAL; + } + + *retry_count = value; + + /* Update the lnet_lnd_timeout now that we've modified the + * retry count + */ + lnet_set_lnd_timeout(); + + mutex_unlock(&the_lnet.ln_api_mutex); + + return 0; +} + +static int +intf_max_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int value, rc; + + rc = kstrtoint(val, 0, &value); + if (rc) { + CERROR("Invalid module parameter value for 'lnet_interfaces_max'\n"); + return rc; + } + + if (value < LNET_INTERFACES_MIN) { + CWARN("max interfaces provided are too small, setting to %d\n", + LNET_INTERFACES_MAX_DEFAULT); + value = LNET_INTERFACES_MAX_DEFAULT; + } + + *(int *)kp->arg = value; + + return 0; +} + +static int +response_tracking_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned long new_value; + + rc = kstrtoul(val, 0, &new_value); + if (rc) { + CERROR("Invalid value for 'lnet_response_tracking'\n"); + return -EINVAL; + } + + if (new_value < 0 || new_value > 3) { + CWARN("Invalid value (%lu) for 'lnet_response_tracking'\n", + new_value); + return -EINVAL; + } + + lnet_response_tracking = new_value; + + return 0; +} + +static const char * +lnet_get_routes(void) +{ + return routes; +} + +static const char * +lnet_get_networks(void) +{ + const char *nets; + int rc; + + if (*networks != 0 && *ip2nets != 0) { + LCONSOLE_ERROR_MSG(0x101, "Please specify EITHER 'networks' or " + "'ip2nets' but not both at once\n"); + return NULL; + } + + if (*ip2nets != 0) { + rc = lnet_parse_ip2nets(&nets, ip2nets); + return (rc == 0) ? 
nets : NULL; + } + + if (*networks != 0) + return networks; + + return "tcp"; +} + +static void +lnet_init_locks(void) +{ + spin_lock_init(&the_lnet.ln_eq_wait_lock); + spin_lock_init(&the_lnet.ln_msg_resend_lock); + init_completion(&the_lnet.ln_mt_wait_complete); + mutex_init(&the_lnet.ln_lnd_mutex); +} + +struct kmem_cache *lnet_mes_cachep; /* MEs kmem_cache */ +struct kmem_cache *lnet_small_mds_cachep; /* <= LNET_SMALL_MD_SIZE bytes + * MDs kmem_cache */ +struct kmem_cache *lnet_udsp_cachep; /* udsp cache */ +struct kmem_cache *lnet_rspt_cachep; /* response tracker cache */ +struct kmem_cache *lnet_msg_cachep; + +static int +lnet_slab_setup(void) +{ + /* create specific kmem_cache for MEs and small MDs (i.e., originally + * allocated in kmem_cache). + */ + lnet_mes_cachep = kmem_cache_create("lnet_MEs", sizeof(struct lnet_me), + 0, 0, NULL); + if (!lnet_mes_cachep) + return -ENOMEM; + + lnet_small_mds_cachep = kmem_cache_create("lnet_small_MDs", + LNET_SMALL_MD_SIZE, 0, 0, + NULL); + if (!lnet_small_mds_cachep) + return -ENOMEM; + + lnet_udsp_cachep = kmem_cache_create("lnet_udsp", + sizeof(struct lnet_udsp), + 0, 0, NULL); + if (!lnet_udsp_cachep) + return -ENOMEM; + + lnet_rspt_cachep = kmem_cache_create("lnet_rspt", sizeof(struct lnet_rsp_tracker), + 0, 0, NULL); + if (!lnet_rspt_cachep) + return -ENOMEM; + + lnet_msg_cachep = kmem_cache_create("lnet_msg", sizeof(struct lnet_msg), + 0, 0, NULL); + if (!lnet_msg_cachep) + return -ENOMEM; + + return 0; +} + +static void +lnet_slab_cleanup(void) +{ + if (lnet_msg_cachep) { + kmem_cache_destroy(lnet_msg_cachep); + lnet_msg_cachep = NULL; + } + + if (lnet_rspt_cachep) { + kmem_cache_destroy(lnet_rspt_cachep); + lnet_rspt_cachep = NULL; + } + + if (lnet_udsp_cachep) { + kmem_cache_destroy(lnet_udsp_cachep); + lnet_udsp_cachep = NULL; + } + + if (lnet_small_mds_cachep) { + kmem_cache_destroy(lnet_small_mds_cachep); + lnet_small_mds_cachep = NULL; + } + + if (lnet_mes_cachep) { + kmem_cache_destroy(lnet_mes_cachep); + lnet_mes_cachep = NULL; + } +} + +static int +lnet_create_remote_nets_table(void) +{ + int i; + struct list_head *hash; + + LASSERT(the_lnet.ln_remote_nets_hash == NULL); + LASSERT(the_lnet.ln_remote_nets_hbits > 0); + CFS_ALLOC_PTR_ARRAY(hash, LNET_REMOTE_NETS_HASH_SIZE); + if (hash == NULL) { + CERROR("Failed to create remote nets hash table\n"); + return -ENOMEM; + } + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) + INIT_LIST_HEAD(&hash[i]); + the_lnet.ln_remote_nets_hash = hash; + return 0; +} + +static void +lnet_destroy_remote_nets_table(void) +{ + int i; + + if (the_lnet.ln_remote_nets_hash == NULL) + return; + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) + LASSERT(list_empty(&the_lnet.ln_remote_nets_hash[i])); + + CFS_FREE_PTR_ARRAY(the_lnet.ln_remote_nets_hash, + LNET_REMOTE_NETS_HASH_SIZE); + the_lnet.ln_remote_nets_hash = NULL; +} + +static void +lnet_destroy_locks(void) +{ + if (the_lnet.ln_res_lock != NULL) { + cfs_percpt_lock_free(the_lnet.ln_res_lock); + the_lnet.ln_res_lock = NULL; + } + + if (the_lnet.ln_net_lock != NULL) { + cfs_percpt_lock_free(the_lnet.ln_net_lock); + the_lnet.ln_net_lock = NULL; + } +} + +static int +lnet_create_locks(void) +{ + lnet_init_locks(); + + the_lnet.ln_res_lock = cfs_percpt_lock_alloc(lnet_cpt_table()); + if (the_lnet.ln_res_lock == NULL) + goto failed; + + the_lnet.ln_net_lock = cfs_percpt_lock_alloc(lnet_cpt_table()); + if (the_lnet.ln_net_lock == NULL) + goto failed; + + return 0; + + failed: + lnet_destroy_locks(); + return -ENOMEM; +} + +static void 
lnet_assert_wire_constants(void) +{ + /* Wire protocol assertions generated by 'wirecheck' + * running on Linux robert.bartonsoftware.com 2.6.8-1.521 + * #1 Mon Aug 16 09:01:18 EDT 2004 i686 athlon i386 GNU/Linux + * with gcc version 3.3.3 20040412 (Red Hat Linux 3.3.3-7) + */ + + /* Constants... */ + BUILD_BUG_ON(LNET_PROTO_TCP_MAGIC != 0xeebc0ded); + BUILD_BUG_ON(LNET_PROTO_TCP_VERSION_MAJOR != 1); + BUILD_BUG_ON(LNET_PROTO_TCP_VERSION_MINOR != 0); + BUILD_BUG_ON(LNET_MSG_ACK != 0); + BUILD_BUG_ON(LNET_MSG_PUT != 1); + BUILD_BUG_ON(LNET_MSG_GET != 2); + BUILD_BUG_ON(LNET_MSG_REPLY != 3); + BUILD_BUG_ON(LNET_MSG_HELLO != 4); + + BUILD_BUG_ON((int)sizeof(lnet_nid_t) != 8); + BUILD_BUG_ON((int)sizeof(lnet_pid_t) != 4); + + /* Checks for struct lnet_nid */ + BUILD_BUG_ON((int)sizeof(struct lnet_nid) != 20); + BUILD_BUG_ON((int)offsetof(struct lnet_nid, nid_size) != 0); + BUILD_BUG_ON((int)sizeof(((struct lnet_nid *)0)->nid_size) != 1); + BUILD_BUG_ON((int)offsetof(struct lnet_nid, nid_type) != 1); + BUILD_BUG_ON((int)sizeof(((struct lnet_nid *)0)->nid_type) != 1); + BUILD_BUG_ON((int)offsetof(struct lnet_nid, nid_num) != 2); + BUILD_BUG_ON((int)sizeof(((struct lnet_nid *)0)->nid_num) != 2); + BUILD_BUG_ON((int)offsetof(struct lnet_nid, nid_addr) != 4); + BUILD_BUG_ON((int)sizeof(((struct lnet_nid *)0)->nid_addr) != 16); + + /* Checks for struct lnet_process_id_packed */ + BUILD_BUG_ON((int)sizeof(struct lnet_process_id_packed) != 12); + BUILD_BUG_ON((int)offsetof(struct lnet_process_id_packed, nid) != 0); + BUILD_BUG_ON((int)sizeof(((struct lnet_process_id_packed *)0)->nid) != 8); + BUILD_BUG_ON((int)offsetof(struct lnet_process_id_packed, pid) != 8); + BUILD_BUG_ON((int)sizeof(((struct lnet_process_id_packed *)0)->pid) != 4); + + /* Checks for struct lnet_handle_wire */ + BUILD_BUG_ON((int)sizeof(struct lnet_handle_wire) != 16); + BUILD_BUG_ON((int)offsetof(struct lnet_handle_wire, + wh_interface_cookie) != 0); + BUILD_BUG_ON((int)sizeof(((struct lnet_handle_wire *)0)->wh_interface_cookie) != 8); + BUILD_BUG_ON((int)offsetof(struct lnet_handle_wire, + wh_object_cookie) != 8); + BUILD_BUG_ON((int)sizeof(((struct lnet_handle_wire *)0)->wh_object_cookie) != 8); + + /* Checks for struct struct lnet_magicversion */ + BUILD_BUG_ON((int)sizeof(struct lnet_magicversion) != 8); + BUILD_BUG_ON((int)offsetof(struct lnet_magicversion, magic) != 0); + BUILD_BUG_ON((int)sizeof(((struct lnet_magicversion *)0)->magic) != 4); + BUILD_BUG_ON((int)offsetof(struct lnet_magicversion, version_major) != 4); + BUILD_BUG_ON((int)sizeof(((struct lnet_magicversion *)0)->version_major) != 2); + BUILD_BUG_ON((int)offsetof(struct lnet_magicversion, + version_minor) != 6); + BUILD_BUG_ON((int)sizeof(((struct lnet_magicversion *)0)->version_minor) != 2); + + /* Checks for struct _lnet_hdr_nid4 */ + BUILD_BUG_ON((int)sizeof(struct _lnet_hdr_nid4) != 72); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, dest_nid) != 0); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->dest_nid) != 8); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, src_nid) != 8); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->src_nid) != 8); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, dest_pid) != 16); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->dest_pid) != 4); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, src_pid) != 20); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->src_pid) != 4); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, type) != 24); + BUILD_BUG_ON((int)sizeof(((struct 
_lnet_hdr_nid4 *)0)->type) != 4); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, payload_length) != 28); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->payload_length) != 4); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, msg) != 32); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->msg) != 40); + + /* Ack */ + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, msg.ack.dst_wmd) != 32); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->msg.ack.dst_wmd) != 16); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, msg.ack.match_bits) != 48); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->msg.ack.match_bits) != 8); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, msg.ack.mlength) != 56); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->msg.ack.mlength) != 4); + + /* Put */ + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, msg.put.ack_wmd) != 32); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->msg.put.ack_wmd) != 16); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, msg.put.match_bits) != 48); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->msg.put.match_bits) != 8); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, msg.put.hdr_data) != 56); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->msg.put.hdr_data) != 8); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, msg.put.ptl_index) != 64); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->msg.put.ptl_index) != 4); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, msg.put.offset) != 68); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->msg.put.offset) != 4); + + /* Get */ + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, msg.get.return_wmd) != 32); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->msg.get.return_wmd) != 16); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, msg.get.match_bits) != 48); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->msg.get.match_bits) != 8); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, msg.get.ptl_index) != 56); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->msg.get.ptl_index) != 4); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, msg.get.src_offset) != 60); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->msg.get.src_offset) != 4); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, msg.get.sink_length) != 64); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->msg.get.sink_length) != 4); + + /* Reply */ + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, msg.reply.dst_wmd) != 32); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->msg.reply.dst_wmd) != 16); + + /* Hello */ + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, msg.hello.incarnation) != 32); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->msg.hello.incarnation) != 8); + BUILD_BUG_ON((int)offsetof(struct _lnet_hdr_nid4, msg.hello.type) != 40); + BUILD_BUG_ON((int)sizeof(((struct _lnet_hdr_nid4 *)0)->msg.hello.type) != 4); + + /* Checks for struct lnet_ni_status and related constants */ + BUILD_BUG_ON(LNET_NI_STATUS_INVALID != 0x00000000); + BUILD_BUG_ON(LNET_NI_STATUS_UP != 0x15aac0de); + BUILD_BUG_ON(LNET_NI_STATUS_DOWN != 0xdeadface); + + /* Checks for struct lnet_ni_status */ + BUILD_BUG_ON((int)sizeof(struct lnet_ni_status) != 16); + BUILD_BUG_ON((int)offsetof(struct lnet_ni_status, ns_nid) != 0); + BUILD_BUG_ON((int)sizeof(((struct lnet_ni_status *)0)->ns_nid) != 8); + BUILD_BUG_ON((int)offsetof(struct lnet_ni_status, ns_status) != 8); + 
BUILD_BUG_ON((int)sizeof(((struct lnet_ni_status *)0)->ns_status) != 4); + BUILD_BUG_ON((int)offsetof(struct lnet_ni_status, ns_unused) != 12); + BUILD_BUG_ON((int)sizeof(((struct lnet_ni_status *)0)->ns_unused) != 4); + + /* Checks for struct lnet_ping_info and related constants */ + BUILD_BUG_ON(LNET_PROTO_PING_MAGIC != 0x70696E67); + BUILD_BUG_ON(LNET_PING_FEAT_INVAL != 0); + BUILD_BUG_ON(LNET_PING_FEAT_BASE != 1); + BUILD_BUG_ON(LNET_PING_FEAT_NI_STATUS != 2); + BUILD_BUG_ON(LNET_PING_FEAT_RTE_DISABLED != 4); + BUILD_BUG_ON(LNET_PING_FEAT_MULTI_RAIL != 8); + BUILD_BUG_ON(LNET_PING_FEAT_DISCOVERY != 16); + BUILD_BUG_ON(LNET_PING_FEAT_BITS != 31); + + /* Checks for struct lnet_ping_info */ + BUILD_BUG_ON((int)sizeof(struct lnet_ping_info) != 16); + BUILD_BUG_ON((int)offsetof(struct lnet_ping_info, pi_magic) != 0); + BUILD_BUG_ON((int)sizeof(((struct lnet_ping_info *)0)->pi_magic) != 4); + BUILD_BUG_ON((int)offsetof(struct lnet_ping_info, pi_features) != 4); + BUILD_BUG_ON((int)sizeof(((struct lnet_ping_info *)0)->pi_features) != 4); + BUILD_BUG_ON((int)offsetof(struct lnet_ping_info, pi_pid) != 8); + BUILD_BUG_ON((int)sizeof(((struct lnet_ping_info *)0)->pi_pid) != 4); + BUILD_BUG_ON((int)offsetof(struct lnet_ping_info, pi_nnis) != 12); + BUILD_BUG_ON((int)sizeof(((struct lnet_ping_info *)0)->pi_nnis) != 4); + BUILD_BUG_ON((int)offsetof(struct lnet_ping_info, pi_ni) != 16); + BUILD_BUG_ON(offsetof(struct lnet_ping_info, pi_ni) != sizeof(struct lnet_ping_info)); + + /* Acceptor connection request */ + BUILD_BUG_ON(LNET_PROTO_ACCEPTOR_VERSION != 1); + + /* Checks for struct lnet_acceptor_connreq */ + BUILD_BUG_ON((int)sizeof(struct lnet_acceptor_connreq) != 16); + BUILD_BUG_ON((int)offsetof(struct lnet_acceptor_connreq, acr_magic) != 0); + BUILD_BUG_ON((int)sizeof(((struct lnet_acceptor_connreq *)0)->acr_magic) != 4); + BUILD_BUG_ON((int)offsetof(struct lnet_acceptor_connreq, acr_version) != 4); + BUILD_BUG_ON((int)sizeof(((struct lnet_acceptor_connreq *)0)->acr_version) != 4); + BUILD_BUG_ON((int)offsetof(struct lnet_acceptor_connreq, acr_nid) != 8); + BUILD_BUG_ON((int)sizeof(((struct lnet_acceptor_connreq *)0)->acr_nid) != 8); + + /* Checks for struct lnet_acceptor_connreq_v2 */ + BUILD_BUG_ON((int)sizeof(struct lnet_acceptor_connreq_v2) != 28); + BUILD_BUG_ON((int)offsetof(struct lnet_acceptor_connreq_v2, acr_magic) != 0); + BUILD_BUG_ON((int)sizeof(((struct lnet_acceptor_connreq_v2 *)0)->acr_magic) != 4); + BUILD_BUG_ON((int)offsetof(struct lnet_acceptor_connreq_v2, acr_version) != 4); + BUILD_BUG_ON((int)sizeof(((struct lnet_acceptor_connreq_v2 *)0)->acr_version) != 4); + BUILD_BUG_ON((int)offsetof(struct lnet_acceptor_connreq_v2, acr_nid) != 8); + BUILD_BUG_ON((int)sizeof(((struct lnet_acceptor_connreq_v2 *)0)->acr_nid) != 20); + + /* Checks for struct lnet_counters_common */ + BUILD_BUG_ON((int)sizeof(struct lnet_counters_common) != 60); + BUILD_BUG_ON((int)offsetof(struct lnet_counters_common, lcc_msgs_alloc) != 0); + BUILD_BUG_ON((int)sizeof(((struct lnet_counters_common *)0)->lcc_msgs_alloc) != 4); + BUILD_BUG_ON((int)offsetof(struct lnet_counters_common, lcc_msgs_max) != 4); + BUILD_BUG_ON((int)sizeof(((struct lnet_counters_common *)0)->lcc_msgs_max) != 4); + BUILD_BUG_ON((int)offsetof(struct lnet_counters_common, lcc_errors) != 8); + BUILD_BUG_ON((int)sizeof(((struct lnet_counters_common *)0)->lcc_errors) != 4); + BUILD_BUG_ON((int)offsetof(struct lnet_counters_common, lcc_send_count) != 12); + BUILD_BUG_ON((int)sizeof(((struct lnet_counters_common *)0)->lcc_send_count) != 4); 
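+	/* The remaining three 4-byte counts and four 8-byte length
+	 * counters fill the packed layout out to the 60-byte struct
+	 * size checked above. */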
+	BUILD_BUG_ON((int)offsetof(struct lnet_counters_common, lcc_recv_count) != 16);
+	BUILD_BUG_ON((int)sizeof(((struct lnet_counters_common *)0)->lcc_recv_count) != 4);
+	BUILD_BUG_ON((int)offsetof(struct lnet_counters_common, lcc_route_count) != 20);
+	BUILD_BUG_ON((int)sizeof(((struct lnet_counters_common *)0)->lcc_route_count) != 4);
+	BUILD_BUG_ON((int)offsetof(struct lnet_counters_common, lcc_drop_count) != 24);
+	BUILD_BUG_ON((int)sizeof(((struct lnet_counters_common *)0)->lcc_drop_count) != 4);
+	BUILD_BUG_ON((int)offsetof(struct lnet_counters_common, lcc_send_length) != 28);
+	BUILD_BUG_ON((int)sizeof(((struct lnet_counters_common *)0)->lcc_send_length) != 8);
+	BUILD_BUG_ON((int)offsetof(struct lnet_counters_common, lcc_recv_length) != 36);
+	BUILD_BUG_ON((int)sizeof(((struct lnet_counters_common *)0)->lcc_recv_length) != 8);
+	BUILD_BUG_ON((int)offsetof(struct lnet_counters_common, lcc_route_length) != 44);
+	BUILD_BUG_ON((int)sizeof(((struct lnet_counters_common *)0)->lcc_route_length) != 8);
+	BUILD_BUG_ON((int)offsetof(struct lnet_counters_common, lcc_drop_length) != 52);
+	BUILD_BUG_ON((int)sizeof(((struct lnet_counters_common *)0)->lcc_drop_length) != 8);
+}
+
+static const struct lnet_lnd *lnet_find_lnd_by_type(__u32 type)
+{
+	const struct lnet_lnd *lnd;
+
+	/* holding lnd mutex */
+	if (type >= NUM_LNDS)
+		return NULL;
+	lnd = the_lnet.ln_lnds[type];
+	LASSERT(!lnd || lnd->lnd_type == type);
+
+	return lnd;
+}
+
+unsigned int
+lnet_get_lnd_timeout(void)
+{
+	return lnet_lnd_timeout;
+}
+EXPORT_SYMBOL(lnet_get_lnd_timeout);
+
+void
+lnet_register_lnd(const struct lnet_lnd *lnd)
+{
+	mutex_lock(&the_lnet.ln_lnd_mutex);
+
+	LASSERT(libcfs_isknown_lnd(lnd->lnd_type));
+	LASSERT(lnet_find_lnd_by_type(lnd->lnd_type) == NULL);
+
+	the_lnet.ln_lnds[lnd->lnd_type] = lnd;
+
+	CDEBUG(D_NET, "%s LND registered\n", libcfs_lnd2str(lnd->lnd_type));
+
+	mutex_unlock(&the_lnet.ln_lnd_mutex);
+}
+EXPORT_SYMBOL(lnet_register_lnd);
+
+void
+lnet_unregister_lnd(const struct lnet_lnd *lnd)
+{
+	mutex_lock(&the_lnet.ln_lnd_mutex);
+
+	LASSERT(lnet_find_lnd_by_type(lnd->lnd_type) == lnd);
+
+	the_lnet.ln_lnds[lnd->lnd_type] = NULL;
+	CDEBUG(D_NET, "%s LND unregistered\n", libcfs_lnd2str(lnd->lnd_type));
+
+	mutex_unlock(&the_lnet.ln_lnd_mutex);
+}
+EXPORT_SYMBOL(lnet_unregister_lnd);
+
+static void
+lnet_counters_get_common_locked(struct lnet_counters_common *common)
+{
+	struct lnet_counters *ctr;
+	int i;
+
+	/* FIXME !!! There is no assert_lnet_net_locked() to ensure this is
+	 * actually called under the protection of the lnet_net_lock.
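+	 * (The callers below, lnet_counters_get_common() and
+	 * lnet_counters_get(), do take lnet_net_lock(LNET_LOCK_EX) around
+	 * this helper.)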
+ */ + memset(common, 0, sizeof(*common)); + + cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) { + common->lcc_msgs_max += ctr->lct_common.lcc_msgs_max; + common->lcc_msgs_alloc += ctr->lct_common.lcc_msgs_alloc; + common->lcc_errors += ctr->lct_common.lcc_errors; + common->lcc_send_count += ctr->lct_common.lcc_send_count; + common->lcc_recv_count += ctr->lct_common.lcc_recv_count; + common->lcc_route_count += ctr->lct_common.lcc_route_count; + common->lcc_drop_count += ctr->lct_common.lcc_drop_count; + common->lcc_send_length += ctr->lct_common.lcc_send_length; + common->lcc_recv_length += ctr->lct_common.lcc_recv_length; + common->lcc_route_length += ctr->lct_common.lcc_route_length; + common->lcc_drop_length += ctr->lct_common.lcc_drop_length; + } +} + +void +lnet_counters_get_common(struct lnet_counters_common *common) +{ + lnet_net_lock(LNET_LOCK_EX); + lnet_counters_get_common_locked(common); + lnet_net_unlock(LNET_LOCK_EX); +} +EXPORT_SYMBOL(lnet_counters_get_common); + +int +lnet_counters_get(struct lnet_counters *counters) +{ + struct lnet_counters *ctr; + struct lnet_counters_health *health = &counters->lct_health; + int i, rc = 0; + + memset(counters, 0, sizeof(*counters)); + + lnet_net_lock(LNET_LOCK_EX); + + if (the_lnet.ln_state != LNET_STATE_RUNNING) + GOTO(out_unlock, rc = -ENODEV); + + lnet_counters_get_common_locked(&counters->lct_common); + + cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) { + health->lch_rst_alloc += ctr->lct_health.lch_rst_alloc; + health->lch_resend_count += ctr->lct_health.lch_resend_count; + health->lch_response_timeout_count += + ctr->lct_health.lch_response_timeout_count; + health->lch_local_interrupt_count += + ctr->lct_health.lch_local_interrupt_count; + health->lch_local_dropped_count += + ctr->lct_health.lch_local_dropped_count; + health->lch_local_aborted_count += + ctr->lct_health.lch_local_aborted_count; + health->lch_local_no_route_count += + ctr->lct_health.lch_local_no_route_count; + health->lch_local_timeout_count += + ctr->lct_health.lch_local_timeout_count; + health->lch_local_error_count += + ctr->lct_health.lch_local_error_count; + health->lch_remote_dropped_count += + ctr->lct_health.lch_remote_dropped_count; + health->lch_remote_error_count += + ctr->lct_health.lch_remote_error_count; + health->lch_remote_timeout_count += + ctr->lct_health.lch_remote_timeout_count; + health->lch_network_timeout_count += + ctr->lct_health.lch_network_timeout_count; + } +out_unlock: + lnet_net_unlock(LNET_LOCK_EX); + return rc; +} +EXPORT_SYMBOL(lnet_counters_get); + +void +lnet_counters_reset(void) +{ + struct lnet_counters *counters; + int i; + + lnet_net_lock(LNET_LOCK_EX); + + if (the_lnet.ln_state != LNET_STATE_RUNNING) + goto avoid_reset; + + cfs_percpt_for_each(counters, i, the_lnet.ln_counters) + memset(counters, 0, sizeof(struct lnet_counters)); +avoid_reset: + lnet_net_unlock(LNET_LOCK_EX); +} + +static char * +lnet_res_type2str(int type) +{ + switch (type) { + default: + LBUG(); + case LNET_COOKIE_TYPE_MD: + return "MD"; + case LNET_COOKIE_TYPE_ME: + return "ME"; + case LNET_COOKIE_TYPE_EQ: + return "EQ"; + } +} + +static void +lnet_res_container_cleanup(struct lnet_res_container *rec) +{ + int count = 0; + + if (rec->rec_type == 0) /* not set yet, it's uninitialized */ + return; + + while (!list_empty(&rec->rec_active)) { + struct list_head *e = rec->rec_active.next; + + list_del_init(e); + if (rec->rec_type == LNET_COOKIE_TYPE_MD) { + lnet_md_free(list_entry(e, struct lnet_libmd, md_list)); + + } else { /* NB: Active MEs should be 
attached on portals */ + LBUG(); + } + count++; + } + + if (count > 0) { + /* Found alive MD/ME/EQ, user really should unlink/free + * all of them before finalize LNet, but if someone didn't, + * we have to recycle garbage for him */ + CERROR("%d active elements on exit of %s container\n", + count, lnet_res_type2str(rec->rec_type)); + } + + if (rec->rec_lh_hash != NULL) { + CFS_FREE_PTR_ARRAY(rec->rec_lh_hash, LNET_LH_HASH_SIZE); + rec->rec_lh_hash = NULL; + } + + rec->rec_type = 0; /* mark it as finalized */ +} + +static int +lnet_res_container_setup(struct lnet_res_container *rec, int cpt, int type) +{ + int rc = 0; + int i; + + LASSERT(rec->rec_type == 0); + + rec->rec_type = type; + INIT_LIST_HEAD(&rec->rec_active); + + rec->rec_lh_cookie = (cpt << LNET_COOKIE_TYPE_BITS) | type; + + /* Arbitrary choice of hash table size */ + LIBCFS_CPT_ALLOC(rec->rec_lh_hash, lnet_cpt_table(), cpt, + LNET_LH_HASH_SIZE * sizeof(rec->rec_lh_hash[0])); + if (rec->rec_lh_hash == NULL) { + rc = -ENOMEM; + goto out; + } + + for (i = 0; i < LNET_LH_HASH_SIZE; i++) + INIT_LIST_HEAD(&rec->rec_lh_hash[i]); + + return 0; + +out: + CERROR("Failed to setup %s resource container\n", + lnet_res_type2str(type)); + lnet_res_container_cleanup(rec); + return rc; +} + +static void +lnet_res_containers_destroy(struct lnet_res_container **recs) +{ + struct lnet_res_container *rec; + int i; + + cfs_percpt_for_each(rec, i, recs) + lnet_res_container_cleanup(rec); + + cfs_percpt_free(recs); +} + +static struct lnet_res_container ** +lnet_res_containers_create(int type) +{ + struct lnet_res_container **recs; + struct lnet_res_container *rec; + int rc; + int i; + + recs = cfs_percpt_alloc(lnet_cpt_table(), sizeof(*rec)); + if (recs == NULL) { + CERROR("Failed to allocate %s resource containers\n", + lnet_res_type2str(type)); + return NULL; + } + + cfs_percpt_for_each(rec, i, recs) { + rc = lnet_res_container_setup(rec, i, type); + if (rc != 0) { + lnet_res_containers_destroy(recs); + return NULL; + } + } + + return recs; +} + +struct lnet_libhandle * +lnet_res_lh_lookup(struct lnet_res_container *rec, __u64 cookie) +{ + /* ALWAYS called with lnet_res_lock held */ + struct list_head *head; + struct lnet_libhandle *lh; + unsigned int hash; + + if ((cookie & LNET_COOKIE_MASK) != rec->rec_type) + return NULL; + + hash = cookie >> (LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS); + head = &rec->rec_lh_hash[hash & LNET_LH_HASH_MASK]; + + list_for_each_entry(lh, head, lh_hash_chain) { + if (lh->lh_cookie == cookie) + return lh; + } + + return NULL; +} + +void +lnet_res_lh_initialize(struct lnet_res_container *rec, + struct lnet_libhandle *lh) +{ + /* ALWAYS called with lnet_res_lock held */ + unsigned int ibits = LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS; + unsigned int hash; + + lh->lh_cookie = rec->rec_lh_cookie; + rec->rec_lh_cookie += 1 << ibits; + + hash = (lh->lh_cookie >> ibits) & LNET_LH_HASH_MASK; + + list_add(&lh->lh_hash_chain, &rec->rec_lh_hash[hash]); +} + +struct list_head ** +lnet_create_array_of_queues(void) +{ + struct list_head **qs; + struct list_head *q; + int i; + + qs = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(struct list_head)); + if (!qs) { + CERROR("Failed to allocate queues\n"); + return NULL; + } + + cfs_percpt_for_each(q, i, qs) + INIT_LIST_HEAD(q); + + return qs; +} + +static int lnet_unprepare(void); + +static int +lnet_prepare(lnet_pid_t requested_pid) +{ + /* Prepare to bring up the network */ + struct lnet_res_container **recs; + int rc = 0; + + if (requested_pid == LNET_PID_ANY) { + /* Don't instantiate LNET 
just for me */ + return -ENETDOWN; + } + + LASSERT(the_lnet.ln_refcount == 0); + + the_lnet.ln_routing = 0; + + LASSERT((requested_pid & LNET_PID_USERFLAG) == 0); + the_lnet.ln_pid = requested_pid; + + INIT_LIST_HEAD(&the_lnet.ln_test_peers); + INIT_LIST_HEAD(&the_lnet.ln_remote_peer_ni_list); + INIT_LIST_HEAD(&the_lnet.ln_nets); + INIT_LIST_HEAD(&the_lnet.ln_routers); + INIT_LIST_HEAD(&the_lnet.ln_drop_rules); + INIT_LIST_HEAD(&the_lnet.ln_delay_rules); + INIT_LIST_HEAD(&the_lnet.ln_dc_request); + INIT_LIST_HEAD(&the_lnet.ln_dc_working); + INIT_LIST_HEAD(&the_lnet.ln_dc_expired); + INIT_LIST_HEAD(&the_lnet.ln_mt_localNIRecovq); + INIT_LIST_HEAD(&the_lnet.ln_mt_peerNIRecovq); + INIT_LIST_HEAD(&the_lnet.ln_udsp_list); + init_waitqueue_head(&the_lnet.ln_dc_waitq); + the_lnet.ln_mt_handler = NULL; + init_completion(&the_lnet.ln_started); + + rc = lnet_slab_setup(); + if (rc != 0) + goto failed; + + rc = lnet_create_remote_nets_table(); + if (rc != 0) + goto failed; + + /* + * NB the interface cookie in wire handles guards against delayed + * replies and ACKs appearing valid after reboot. + */ + the_lnet.ln_interface_cookie = ktime_get_real_ns(); + + the_lnet.ln_counters = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(struct lnet_counters)); + if (the_lnet.ln_counters == NULL) { + CERROR("Failed to allocate counters for LNet\n"); + rc = -ENOMEM; + goto failed; + } + + rc = lnet_peer_tables_create(); + if (rc != 0) + goto failed; + + rc = lnet_msg_containers_create(); + if (rc != 0) + goto failed; + + rc = lnet_res_container_setup(&the_lnet.ln_eq_container, 0, + LNET_COOKIE_TYPE_EQ); + if (rc != 0) + goto failed; + + recs = lnet_res_containers_create(LNET_COOKIE_TYPE_MD); + if (recs == NULL) { + rc = -ENOMEM; + goto failed; + } + + the_lnet.ln_md_containers = recs; + + rc = lnet_portals_create(); + if (rc != 0) { + CERROR("Failed to create portals for LNet: %d\n", rc); + goto failed; + } + + the_lnet.ln_mt_zombie_rstqs = lnet_create_array_of_queues(); + if (!the_lnet.ln_mt_zombie_rstqs) { + rc = -ENOMEM; + goto failed; + } + + return 0; + + failed: + lnet_unprepare(); + return rc; +} + +static int +lnet_unprepare(void) +{ + /* NB no LNET_LOCK since this is the last reference. 
All LND instances + * have shut down already, so it is safe to unlink and free all + * descriptors, even those that appear committed to a network op (eg MD + * with non-zero pending count) */ + + lnet_fail_nid(LNET_NID_ANY, 0); + + LASSERT(the_lnet.ln_refcount == 0); + LASSERT(list_empty(&the_lnet.ln_test_peers)); + LASSERT(list_empty(&the_lnet.ln_nets)); + + if (the_lnet.ln_mt_zombie_rstqs) { + lnet_clean_zombie_rstqs(); + the_lnet.ln_mt_zombie_rstqs = NULL; + } + + lnet_assert_handler_unused(the_lnet.ln_mt_handler); + the_lnet.ln_mt_handler = NULL; + + lnet_portals_destroy(); + + if (the_lnet.ln_md_containers != NULL) { + lnet_res_containers_destroy(the_lnet.ln_md_containers); + the_lnet.ln_md_containers = NULL; + } + + lnet_res_container_cleanup(&the_lnet.ln_eq_container); + + lnet_msg_containers_destroy(); + lnet_peer_uninit(); + lnet_rtrpools_free(0); + + if (the_lnet.ln_counters != NULL) { + cfs_percpt_free(the_lnet.ln_counters); + the_lnet.ln_counters = NULL; + } + lnet_destroy_remote_nets_table(); + lnet_udsp_destroy(true); + lnet_slab_cleanup(); + + return 0; +} + +struct lnet_ni * +lnet_net2ni_locked(__u32 net_id, int cpt) +{ + struct lnet_ni *ni; + struct lnet_net *net; + + LASSERT(cpt != LNET_LOCK_EX); + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + if (net->net_id == net_id) { + ni = list_entry(net->net_ni_list.next, struct lnet_ni, + ni_netlist); + return ni; + } + } + + return NULL; +} + +struct lnet_ni * +lnet_net2ni_addref(__u32 net) +{ + struct lnet_ni *ni; + + lnet_net_lock(0); + ni = lnet_net2ni_locked(net, 0); + if (ni) + lnet_ni_addref_locked(ni, 0); + lnet_net_unlock(0); + + return ni; +} +EXPORT_SYMBOL(lnet_net2ni_addref); + +struct lnet_net * +lnet_get_net_locked(__u32 net_id) +{ + struct lnet_net *net; + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + if (net->net_id == net_id) + return net; + } + + return NULL; +} + +void +lnet_net_clr_pref_rtrs(struct lnet_net *net) +{ + struct list_head zombies; + struct lnet_nid_list *ne; + struct lnet_nid_list *tmp; + + INIT_LIST_HEAD(&zombies); + + lnet_net_lock(LNET_LOCK_EX); + list_splice_init(&net->net_rtr_pref_nids, &zombies); + lnet_net_unlock(LNET_LOCK_EX); + + list_for_each_entry_safe(ne, tmp, &zombies, nl_list) { + list_del_init(&ne->nl_list); + LIBCFS_FREE(ne, sizeof(*ne)); + } +} + +int +lnet_net_add_pref_rtr(struct lnet_net *net, + struct lnet_nid *gw_nid) +__must_hold(&the_lnet.ln_api_mutex) +{ + struct lnet_nid_list *ne; + + /* This function is called with api_mutex held. When the api_mutex + * is held the list can not be modified, as it is only modified as + * a result of applying a UDSP and that happens under api_mutex + * lock. 
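+	 * Readers that cannot take the api_mutex, such as the selection
+	 * code using lnet_net_is_pref_rtr_locked() below, walk the list
+	 * under lnet_net_lock instead, which is why the addition further
+	 * down is still done under LNET_LOCK_EX.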
+ */ + list_for_each_entry(ne, &net->net_rtr_pref_nids, nl_list) { + if (nid_same(&ne->nl_nid, gw_nid)) + return -EEXIST; + } + + LIBCFS_ALLOC(ne, sizeof(*ne)); + if (!ne) + return -ENOMEM; + + ne->nl_nid = *gw_nid; + + /* Lock the cpt to protect against addition and checks in the + * selection algorithm + */ + lnet_net_lock(LNET_LOCK_EX); + list_add(&ne->nl_list, &net->net_rtr_pref_nids); + lnet_net_unlock(LNET_LOCK_EX); + + return 0; +} + +bool +lnet_net_is_pref_rtr_locked(struct lnet_net *net, struct lnet_nid *rtr_nid) +{ + struct lnet_nid_list *ne; + + CDEBUG(D_NET, "%s: rtr pref empty: %d\n", + libcfs_net2str(net->net_id), + list_empty(&net->net_rtr_pref_nids)); + + if (list_empty(&net->net_rtr_pref_nids)) + return false; + + list_for_each_entry(ne, &net->net_rtr_pref_nids, nl_list) { + CDEBUG(D_NET, "Comparing pref %s with gw %s\n", + libcfs_nidstr(&ne->nl_nid), + libcfs_nidstr(rtr_nid)); + if (nid_same(rtr_nid, &ne->nl_nid)) + return true; + } + + return false; +} + +static unsigned int +lnet_nid4_cpt_hash(lnet_nid_t nid, unsigned int number) +{ + __u64 key = nid; + __u64 pair_bits = 0x0001000100010001LLU; + __u64 mask = pair_bits * 0xFF; + __u64 pair_sum; + + /* Use (sum-by-multiplication of nid bytes) mod (number of CPTs) + * to match nid to a CPT. + */ + pair_sum = (key & mask) + ((key >> 8) & mask); + pair_sum = (pair_sum * pair_bits) >> 48; + + CDEBUG(D_NET, "Match nid %s to cpt %u\n", + libcfs_nid2str(nid), (unsigned int)(pair_sum) % number); + + return (unsigned int)(pair_sum) % number; +} + +unsigned int +lnet_nid_cpt_hash(struct lnet_nid *nid, unsigned int number) +{ + unsigned int val; + u32 h = 0; + int i; + + LASSERT(number >= 1 && number <= LNET_CPT_NUMBER); + + if (number == 1) + return 0; + + if (nid_is_nid4(nid)) + return lnet_nid4_cpt_hash(lnet_nid_to_nid4(nid), number); + + for (i = 0; i < 4; i++) + h = hash_32(nid->nid_addr[i]^h, 32); + val = hash_32(LNET_NID_NET(nid) ^ h, LNET_CPT_BITS); + if (val < number) + return val; + return (unsigned int)(h + val + (val >> 1)) % number; +} + +int +lnet_cpt_of_nid_locked(struct lnet_nid *nid, struct lnet_ni *ni) +{ + struct lnet_net *net; + + /* must called with hold of lnet_net_lock */ + if (LNET_CPT_NUMBER == 1) + return 0; /* the only one */ + + /* + * If NI is provided then use the CPT identified in the NI cpt + * list if one exists. If one doesn't exist, then that NI is + * associated with all CPTs and it follows that the net it belongs + * to is implicitly associated with all CPTs, so just hash the nid + * and return that. 
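+	 *
+	 * For nid4 addresses the hash above reduces to the sum of the
+	 * eight nid bytes modulo the CPT count, so e.g. a nid whose bytes
+	 * sum to 22 always maps to CPT 2 on a node with 4 CPTs.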
+ */ + if (ni != NULL) { + if (ni->ni_cpts != NULL) + return ni->ni_cpts[lnet_nid_cpt_hash(nid, + ni->ni_ncpts)]; + else + return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER); + } + + /* no NI provided so look at the net */ + net = lnet_get_net_locked(LNET_NID_NET(nid)); + + if (net != NULL && net->net_cpts != NULL) { + return net->net_cpts[lnet_nid_cpt_hash(nid, net->net_ncpts)]; + } + + return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER); +} + +int +lnet_nid2cpt(struct lnet_nid *nid, struct lnet_ni *ni) +{ + int cpt; + int cpt2; + + if (LNET_CPT_NUMBER == 1) + return 0; /* the only one */ + + cpt = lnet_net_lock_current(); + + cpt2 = lnet_cpt_of_nid_locked(nid, ni); + + lnet_net_unlock(cpt); + + return cpt2; +} +EXPORT_SYMBOL(lnet_nid2cpt); + +int +lnet_cpt_of_nid(lnet_nid_t nid4, struct lnet_ni *ni) +{ + struct lnet_nid nid; + + if (LNET_CPT_NUMBER == 1) + return 0; /* the only one */ + + lnet_nid4_to_nid(nid4, &nid); + return lnet_nid2cpt(&nid, ni); +} +EXPORT_SYMBOL(lnet_cpt_of_nid); + +int +lnet_islocalnet_locked(__u32 net_id) +{ + struct lnet_net *net; + bool local; + + net = lnet_get_net_locked(net_id); + + local = net != NULL; + + return local; +} + +int +lnet_islocalnet(__u32 net_id) +{ + int cpt; + bool local; + + cpt = lnet_net_lock_current(); + + local = lnet_islocalnet_locked(net_id); + + lnet_net_unlock(cpt); + + return local; +} + +struct lnet_ni * +lnet_nid_to_ni_locked(struct lnet_nid *nid, int cpt) +{ + struct lnet_net *net; + struct lnet_ni *ni; + + LASSERT(cpt != LNET_LOCK_EX); + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + if (nid_same(&ni->ni_nid, nid)) + return ni; + } + } + + return NULL; +} + +struct lnet_ni * +lnet_nid2ni_locked(lnet_nid_t nid4, int cpt) +{ + struct lnet_nid nid; + + lnet_nid4_to_nid(nid4, &nid); + return lnet_nid_to_ni_locked(&nid, cpt); +} + +struct lnet_ni * +lnet_nid2ni_addref(lnet_nid_t nid4) +{ + struct lnet_ni *ni; + struct lnet_nid nid; + + lnet_nid4_to_nid(nid4, &nid); + + lnet_net_lock(0); + ni = lnet_nid_to_ni_locked(&nid, 0); + if (ni) + lnet_ni_addref_locked(ni, 0); + lnet_net_unlock(0); + + return ni; +} +EXPORT_SYMBOL(lnet_nid2ni_addref); + +struct lnet_ni * +lnet_nid_to_ni_addref(struct lnet_nid *nid) +{ + struct lnet_ni *ni; + + lnet_net_lock(0); + ni = lnet_nid_to_ni_locked(nid, 0); + if (ni) + lnet_ni_addref_locked(ni, 0); + lnet_net_unlock(0); + + return ni; +} +EXPORT_SYMBOL(lnet_nid_to_ni_addref); + +int +lnet_islocalnid(struct lnet_nid *nid) +{ + struct lnet_ni *ni; + int cpt; + + cpt = lnet_net_lock_current(); + ni = lnet_nid_to_ni_locked(nid, cpt); + lnet_net_unlock(cpt); + + return ni != NULL; +} + +int +lnet_count_acceptor_nets(void) +{ + /* Return the # of NIs that need the acceptor. 
*/ + int count = 0; + struct lnet_net *net; + int cpt; + + cpt = lnet_net_lock_current(); + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + /* all socklnd type networks should have the acceptor + * thread started */ + if (net->net_lnd->lnd_accept != NULL) + count++; + } + + lnet_net_unlock(cpt); + + return count; +} + +struct lnet_ping_buffer * +lnet_ping_buffer_alloc(int nnis, gfp_t gfp) +{ + struct lnet_ping_buffer *pbuf; + + LIBCFS_ALLOC_GFP(pbuf, LNET_PING_BUFFER_SIZE(nnis), gfp); + if (pbuf) { + pbuf->pb_nnis = nnis; + pbuf->pb_needs_post = false; + atomic_set(&pbuf->pb_refcnt, 1); + } + + return pbuf; +} + +void +lnet_ping_buffer_free(struct lnet_ping_buffer *pbuf) +{ + LASSERT(atomic_read(&pbuf->pb_refcnt) == 0); + LIBCFS_FREE(pbuf, LNET_PING_BUFFER_SIZE(pbuf->pb_nnis)); +} + +static struct lnet_ping_buffer * +lnet_ping_target_create(int nnis) +{ + struct lnet_ping_buffer *pbuf; + + pbuf = lnet_ping_buffer_alloc(nnis, GFP_NOFS); + if (pbuf == NULL) { + CERROR("Can't allocate ping source [%d]\n", nnis); + return NULL; + } + + pbuf->pb_info.pi_nnis = nnis; + pbuf->pb_info.pi_pid = the_lnet.ln_pid; + pbuf->pb_info.pi_magic = LNET_PROTO_PING_MAGIC; + pbuf->pb_info.pi_features = + LNET_PING_FEAT_NI_STATUS | LNET_PING_FEAT_MULTI_RAIL; + + return pbuf; +} + +static inline int +lnet_get_net_ni_count_locked(struct lnet_net *net) +{ + struct lnet_ni *ni; + int count = 0; + + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) + count++; + + return count; +} + +static inline int +lnet_get_net_ni_count_pre(struct lnet_net *net) +{ + struct lnet_ni *ni; + int count = 0; + + list_for_each_entry(ni, &net->net_ni_added, ni_netlist) + count++; + + return count; +} + +static inline int +lnet_get_ni_count(void) +{ + struct lnet_ni *ni; + struct lnet_net *net; + int count = 0; + + lnet_net_lock(0); + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) + count++; + } + + lnet_net_unlock(0); + + return count; +} + +void +lnet_swap_pinginfo(struct lnet_ping_buffer *pbuf) +{ + struct lnet_ni_status *stat; + int nnis; + int i; + + __swab32s(&pbuf->pb_info.pi_magic); + __swab32s(&pbuf->pb_info.pi_features); + __swab32s(&pbuf->pb_info.pi_pid); + __swab32s(&pbuf->pb_info.pi_nnis); + nnis = pbuf->pb_info.pi_nnis; + if (nnis > pbuf->pb_nnis) + nnis = pbuf->pb_nnis; + for (i = 0; i < nnis; i++) { + stat = &pbuf->pb_info.pi_ni[i]; + __swab64s(&stat->ns_nid); + __swab32s(&stat->ns_status); + } +} + +int +lnet_ping_info_validate(struct lnet_ping_info *pinfo) +{ + if (!pinfo) + return -EINVAL; + if (pinfo->pi_magic != LNET_PROTO_PING_MAGIC) + return -EPROTO; + if (!(pinfo->pi_features & LNET_PING_FEAT_NI_STATUS)) + return -EPROTO; + /* Loopback is guaranteed to be present */ + if (pinfo->pi_nnis < 1 || pinfo->pi_nnis > lnet_interfaces_max) + return -ERANGE; + if (LNET_PING_INFO_LONI(pinfo) != LNET_NID_LO_0) + return -EPROTO; + return 0; +} + +static void +lnet_ping_target_destroy(void) +{ + struct lnet_net *net; + struct lnet_ni *ni; + + lnet_net_lock(LNET_LOCK_EX); + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + lnet_ni_lock(ni); + ni->ni_status = NULL; + lnet_ni_unlock(ni); + } + } + + lnet_ping_buffer_decref(the_lnet.ln_ping_target); + the_lnet.ln_ping_target = NULL; + + lnet_net_unlock(LNET_LOCK_EX); +} + +static void +lnet_ping_target_event_handler(struct lnet_event *event) +{ + struct lnet_ping_buffer *pbuf = event->md_user_ptr; + + if (event->unlinked) + 
lnet_ping_buffer_decref(pbuf); +} + +static int +lnet_ping_target_setup(struct lnet_ping_buffer **ppbuf, + struct lnet_handle_md *ping_mdh, + int ni_count, bool set_eq) +{ + struct lnet_processid id = { + .nid = LNET_ANY_NID, + .pid = LNET_PID_ANY + }; + struct lnet_me *me; + struct lnet_md md = { NULL }; + int rc; + + if (set_eq) + the_lnet.ln_ping_target_handler = + lnet_ping_target_event_handler; + + *ppbuf = lnet_ping_target_create(ni_count); + if (*ppbuf == NULL) { + rc = -ENOMEM; + goto fail_free_eq; + } + + /* Ping target ME/MD */ + me = LNetMEAttach(LNET_RESERVED_PORTAL, &id, + LNET_PROTO_PING_MATCHBITS, 0, + LNET_UNLINK, LNET_INS_AFTER); + if (IS_ERR(me)) { + rc = PTR_ERR(me); + CERROR("Can't create ping target ME: %d\n", rc); + goto fail_decref_ping_buffer; + } + + /* initialize md content */ + md.start = &(*ppbuf)->pb_info; + md.length = LNET_PING_INFO_SIZE((*ppbuf)->pb_nnis); + md.threshold = LNET_MD_THRESH_INF; + md.max_size = 0; + md.options = LNET_MD_OP_GET | LNET_MD_TRUNCATE | + LNET_MD_MANAGE_REMOTE; + md.handler = the_lnet.ln_ping_target_handler; + md.user_ptr = *ppbuf; + + rc = LNetMDAttach(me, &md, LNET_RETAIN, ping_mdh); + if (rc != 0) { + CERROR("Can't attach ping target MD: %d\n", rc); + goto fail_decref_ping_buffer; + } + lnet_ping_buffer_addref(*ppbuf); + + return 0; + +fail_decref_ping_buffer: + LASSERT(atomic_read(&(*ppbuf)->pb_refcnt) == 1); + lnet_ping_buffer_decref(*ppbuf); + *ppbuf = NULL; +fail_free_eq: + return rc; +} + +static void +lnet_ping_md_unlink(struct lnet_ping_buffer *pbuf, + struct lnet_handle_md *ping_mdh) +{ + LNetMDUnlink(*ping_mdh); + LNetInvalidateMDHandle(ping_mdh); + + /* NB the MD could be busy; this just starts the unlink */ + wait_var_event_warning(&pbuf->pb_refcnt, + atomic_read(&pbuf->pb_refcnt) <= 1, + "Still waiting for ping data MD to unlink\n"); +} + +static void +lnet_ping_target_install_locked(struct lnet_ping_buffer *pbuf) +{ + struct lnet_ni *ni; + struct lnet_net *net; + struct lnet_ni_status *ns; + int i; + int rc; + + i = 0; + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + LASSERT(i < pbuf->pb_nnis); + + ns = &pbuf->pb_info.pi_ni[i]; + + if (!nid_is_nid4(&ni->ni_nid)) + continue; + ns->ns_nid = lnet_nid_to_nid4(&ni->ni_nid); + + lnet_ni_lock(ni); + ns->ns_status = lnet_ni_get_status_locked(ni); + ni->ni_status = ns; + lnet_ni_unlock(ni); + + i++; + } + } + /* + * We (ab)use the ns_status of the loopback interface to + * transmit the sequence number. The first interface listed + * must be the loopback interface. + */ + rc = lnet_ping_info_validate(&pbuf->pb_info); + if (rc) { + LCONSOLE_EMERG("Invalid ping target: %d\n", rc); + LBUG(); + } + LNET_PING_BUFFER_SEQNO(pbuf) = + atomic_inc_return(&the_lnet.ln_ping_target_seqno); +} + +static void +lnet_ping_target_update(struct lnet_ping_buffer *pbuf, + struct lnet_handle_md ping_mdh) +{ + struct lnet_ping_buffer *old_pbuf = NULL; + struct lnet_handle_md old_ping_md; + + /* switch the NIs to point to the new ping info created */ + lnet_net_lock(LNET_LOCK_EX); + + if (!the_lnet.ln_routing) + pbuf->pb_info.pi_features |= LNET_PING_FEAT_RTE_DISABLED; + if (!lnet_peer_discovery_disabled) + pbuf->pb_info.pi_features |= LNET_PING_FEAT_DISCOVERY; + + /* Ensure only known feature bits have been set. 
*/ + LASSERT(pbuf->pb_info.pi_features & LNET_PING_FEAT_BITS); + LASSERT(!(pbuf->pb_info.pi_features & ~LNET_PING_FEAT_BITS)); + + lnet_ping_target_install_locked(pbuf); + + if (the_lnet.ln_ping_target) { + old_pbuf = the_lnet.ln_ping_target; + old_ping_md = the_lnet.ln_ping_target_md; + } + the_lnet.ln_ping_target_md = ping_mdh; + the_lnet.ln_ping_target = pbuf; + + lnet_net_unlock(LNET_LOCK_EX); + + if (old_pbuf) { + /* unlink and free the old ping info */ + lnet_ping_md_unlink(old_pbuf, &old_ping_md); + lnet_ping_buffer_decref(old_pbuf); + } + + lnet_push_update_to_peers(0); +} + +static void +lnet_ping_target_fini(void) +{ + lnet_ping_md_unlink(the_lnet.ln_ping_target, + &the_lnet.ln_ping_target_md); + + lnet_assert_handler_unused(the_lnet.ln_ping_target_handler); + lnet_ping_target_destroy(); +} + +/* Resize the push target. */ +int lnet_push_target_resize(void) +{ + struct lnet_handle_md mdh; + struct lnet_handle_md old_mdh; + struct lnet_ping_buffer *pbuf; + struct lnet_ping_buffer *old_pbuf; + int nnis; + int rc; + +again: + nnis = the_lnet.ln_push_target_nnis; + if (nnis <= 0) { + CDEBUG(D_NET, "Invalid nnis %d\n", nnis); + return -EINVAL; + } + + /* NB: lnet_ping_buffer_alloc() sets pbuf refcount to 1. That ref is + * dropped when we need to resize again (see "old_pbuf" below) or when + * LNet is shutdown (see lnet_push_target_fini()) + */ + pbuf = lnet_ping_buffer_alloc(nnis, GFP_NOFS); + if (!pbuf) { + CDEBUG(D_NET, "Can't allocate pbuf for nnis %d\n", nnis); + return -ENOMEM; + } + + rc = lnet_push_target_post(pbuf, &mdh); + if (rc) { + CDEBUG(D_NET, "Failed to post push target: %d\n", rc); + lnet_ping_buffer_decref(pbuf); + return rc; + } + + lnet_net_lock(LNET_LOCK_EX); + old_pbuf = the_lnet.ln_push_target; + old_mdh = the_lnet.ln_push_target_md; + the_lnet.ln_push_target = pbuf; + the_lnet.ln_push_target_md = mdh; + lnet_net_unlock(LNET_LOCK_EX); + + if (old_pbuf) { + LNetMDUnlink(old_mdh); + /* Drop ref set by lnet_ping_buffer_alloc() */ + lnet_ping_buffer_decref(old_pbuf); + } + + /* Received another push or reply that requires a larger buffer */ + if (nnis < the_lnet.ln_push_target_nnis) + goto again; + + CDEBUG(D_NET, "nnis %d success\n", nnis); + return 0; +} + +int lnet_push_target_post(struct lnet_ping_buffer *pbuf, + struct lnet_handle_md *mdhp) +{ + struct lnet_processid id = { LNET_ANY_NID, LNET_PID_ANY }; + struct lnet_md md = { NULL }; + struct lnet_me *me; + int rc; + + me = LNetMEAttach(LNET_RESERVED_PORTAL, &id, + LNET_PROTO_PING_MATCHBITS, 0, + LNET_UNLINK, LNET_INS_AFTER); + if (IS_ERR(me)) { + rc = PTR_ERR(me); + CERROR("Can't create push target ME: %d\n", rc); + return rc; + } + + pbuf->pb_needs_post = false; + + /* This reference is dropped by lnet_push_target_event_handler() */ + lnet_ping_buffer_addref(pbuf); + + /* initialize md content */ + md.start = &pbuf->pb_info; + md.length = LNET_PING_INFO_SIZE(pbuf->pb_nnis); + md.threshold = 1; + md.max_size = 0; + md.options = LNET_MD_OP_PUT | LNET_MD_TRUNCATE; + md.user_ptr = pbuf; + md.handler = the_lnet.ln_push_target_handler; + + rc = LNetMDAttach(me, &md, LNET_UNLINK, mdhp); + if (rc) { + CERROR("Can't attach push MD: %d\n", rc); + lnet_ping_buffer_decref(pbuf); + pbuf->pb_needs_post = true; + return rc; + } + + CDEBUG(D_NET, "posted push target %p\n", pbuf); + + return 0; +} + +static void lnet_push_target_event_handler(struct lnet_event *ev) +{ + struct lnet_ping_buffer *pbuf = ev->md_user_ptr; + + CDEBUG(D_NET, "type %d status %d unlinked %d\n", ev->type, ev->status, + ev->unlinked); + + if 
(pbuf->pb_info.pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) + lnet_swap_pinginfo(pbuf); + + if (ev->type == LNET_EVENT_UNLINK) { + /* Drop ref added by lnet_push_target_post() */ + lnet_ping_buffer_decref(pbuf); + return; + } + + lnet_peer_push_event(ev); + if (ev->unlinked) + /* Drop ref added by lnet_push_target_post */ + lnet_ping_buffer_decref(pbuf); +} + +/* Initialize the push target. */ +static int lnet_push_target_init(void) +{ + int rc; + + if (the_lnet.ln_push_target) + return -EALREADY; + + the_lnet.ln_push_target_handler = + lnet_push_target_event_handler; + + rc = LNetSetLazyPortal(LNET_RESERVED_PORTAL); + LASSERT(rc == 0); + + /* Start at the required minimum, we'll enlarge if required. */ + the_lnet.ln_push_target_nnis = LNET_INTERFACES_MIN; + + rc = lnet_push_target_resize(); + + if (rc) { + LNetClearLazyPortal(LNET_RESERVED_PORTAL); + the_lnet.ln_push_target_handler = NULL; + } + + return rc; +} + +/* Clean up the push target. */ +static void lnet_push_target_fini(void) +{ + if (!the_lnet.ln_push_target) + return; + + /* Unlink and invalidate to prevent new references. */ + LNetMDUnlink(the_lnet.ln_push_target_md); + LNetInvalidateMDHandle(&the_lnet.ln_push_target_md); + + /* Wait for the unlink to complete. */ + wait_var_event_warning(&the_lnet.ln_push_target->pb_refcnt, + atomic_read(&the_lnet.ln_push_target->pb_refcnt) <= 1, + "Still waiting for ping data MD to unlink\n"); + + /* Drop ref set by lnet_ping_buffer_alloc() */ + lnet_ping_buffer_decref(the_lnet.ln_push_target); + the_lnet.ln_push_target = NULL; + the_lnet.ln_push_target_nnis = 0; + + LNetClearLazyPortal(LNET_RESERVED_PORTAL); + lnet_assert_handler_unused(the_lnet.ln_push_target_handler); + the_lnet.ln_push_target_handler = NULL; +} + +static int +lnet_ni_tq_credits(struct lnet_ni *ni) +{ + int credits; + + LASSERT(ni->ni_ncpts >= 1); + + if (ni->ni_ncpts == 1) + return ni->ni_net->net_tunables.lct_max_tx_credits; + + credits = ni->ni_net->net_tunables.lct_max_tx_credits / ni->ni_ncpts; + credits = max(credits, 8 * ni->ni_net->net_tunables.lct_peer_tx_credits); + credits = min(credits, ni->ni_net->net_tunables.lct_max_tx_credits); + + return credits; +} + +static void +lnet_ni_unlink_locked(struct lnet_ni *ni) +{ + /* move it to zombie list and nobody can find it anymore */ + LASSERT(!list_empty(&ni->ni_netlist)); + list_move(&ni->ni_netlist, &ni->ni_net->net_ni_zombie); + lnet_ni_decref_locked(ni, 0); +} + +static void +lnet_clear_zombies_nis_locked(struct lnet_net *net) +{ + int i; + int islo; + struct lnet_ni *ni; + struct list_head *zombie_list = &net->net_ni_zombie; + + /* + * Now wait for the NIs I just nuked to show up on the zombie + * list and shut them down in guaranteed thread context + */ + i = 2; + while (!list_empty(zombie_list)) { + int *ref; + int j; + + ni = list_entry(zombie_list->next, + struct lnet_ni, ni_netlist); + list_del_init(&ni->ni_netlist); + /* the ni should be in deleting state. 
+static void
+lnet_ni_unlink_locked(struct lnet_ni *ni)
+{
+	/* move it to zombie list and nobody can find it anymore */
+	LASSERT(!list_empty(&ni->ni_netlist));
+	list_move(&ni->ni_netlist, &ni->ni_net->net_ni_zombie);
+	lnet_ni_decref_locked(ni, 0);
+}
+
+static void
+lnet_clear_zombies_nis_locked(struct lnet_net *net)
+{
+	int i;
+	int islo;
+	struct lnet_ni *ni;
+	struct list_head *zombie_list = &net->net_ni_zombie;
+
+	/*
+	 * Now wait for the NIs I just nuked to show up on the zombie
+	 * list and shut them down in guaranteed thread context
+	 */
+	i = 2;
+	while (!list_empty(zombie_list)) {
+		int *ref;
+		int j;
+
+		ni = list_entry(zombie_list->next,
+				struct lnet_ni, ni_netlist);
+		list_del_init(&ni->ni_netlist);
+		/* the ni should be in deleting state. If it's not, it's
+		 * a bug */
+		LASSERT(ni->ni_state == LNET_NI_STATE_DELETING);
+		cfs_percpt_for_each(ref, j, ni->ni_refs) {
+			if (*ref == 0)
+				continue;
+			/* still busy, add it back to zombie list */
+			list_add(&ni->ni_netlist, zombie_list);
+			break;
+		}
+
+		if (!list_empty(&ni->ni_netlist)) {
+			/* Unlock mutex while waiting to allow other
+			 * threads to read the LNet state and fall through
+			 * to avoid deadlock
+			 */
+			lnet_net_unlock(LNET_LOCK_EX);
+			mutex_unlock(&the_lnet.ln_api_mutex);
+
+			++i;
+			if ((i & (-i)) == i) {
+				CDEBUG(D_WARNING,
+				       "Waiting for zombie LNI %s\n",
+				       libcfs_nidstr(&ni->ni_nid));
+			}
+			schedule_timeout_uninterruptible(cfs_time_seconds(1));
+
+			mutex_lock(&the_lnet.ln_api_mutex);
+			lnet_net_lock(LNET_LOCK_EX);
+			continue;
+		}
+
+		lnet_net_unlock(LNET_LOCK_EX);
+
+		islo = ni->ni_net->net_lnd->lnd_type == LOLND;
+
+		LASSERT(!in_interrupt());
+		/* Holding the LND mutex makes it safe for lnd_shutdown
+		 * to call module_put(). Module unload cannot finish
+		 * until lnet_unregister_lnd() completes, and that
+		 * requires the LND mutex.
+		 */
+		mutex_unlock(&the_lnet.ln_api_mutex);
+		mutex_lock(&the_lnet.ln_lnd_mutex);
+		(net->net_lnd->lnd_shutdown)(ni);
+		mutex_unlock(&the_lnet.ln_lnd_mutex);
+		mutex_lock(&the_lnet.ln_api_mutex);
+
+		if (!islo)
+			CDEBUG(D_LNI, "Removed LNI %s\n",
+			       libcfs_nidstr(&ni->ni_nid));
+
+		lnet_ni_free(ni);
+		i = 2;
+		lnet_net_lock(LNET_LOCK_EX);
+	}
+}
+
+/* shut down the NI and release its refcount */
+static void
+lnet_shutdown_lndni(struct lnet_ni *ni)
+{
+	int i;
+	struct lnet_net *net = ni->ni_net;
+
+	lnet_net_lock(LNET_LOCK_EX);
+	lnet_ni_lock(ni);
+	ni->ni_state = LNET_NI_STATE_DELETING;
+	lnet_ni_unlock(ni);
+	lnet_ni_unlink_locked(ni);
+	lnet_incr_dlc_seq();
+	lnet_net_unlock(LNET_LOCK_EX);
+
+	/* clear messages for this NI on the lazy portal */
+	for (i = 0; i < the_lnet.ln_nportals; i++)
+		lnet_clear_lazy_portal(ni, i, "Shutting down NI");
+
+	lnet_net_lock(LNET_LOCK_EX);
+	lnet_clear_zombies_nis_locked(net);
+	lnet_net_unlock(LNET_LOCK_EX);
+}
+
+static void
+lnet_shutdown_lndnet(struct lnet_net *net)
+{
+	struct lnet_ni *ni;
+
+	lnet_net_lock(LNET_LOCK_EX);
+
+	list_del_init(&net->net_list);
+
+	while (!list_empty(&net->net_ni_list)) {
+		ni = list_entry(net->net_ni_list.next,
+				struct lnet_ni, ni_netlist);
+		lnet_net_unlock(LNET_LOCK_EX);
+		lnet_shutdown_lndni(ni);
+		lnet_net_lock(LNET_LOCK_EX);
+	}
+
+	lnet_net_unlock(LNET_LOCK_EX);
+
+	/* Do peer table cleanup for this net */
+	lnet_peer_tables_cleanup(net);
+
+	lnet_net_free(net);
+}
+
+static void
+lnet_shutdown_lndnets(void)
+{
+	struct lnet_net *net;
+	LIST_HEAD(resend);
+	struct lnet_msg *msg, *tmp;
+
+	/* NB called holding the global mutex */
+
+	/* All quiet on the API front */
+	LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING ||
+		the_lnet.ln_state == LNET_STATE_STOPPING);
+	LASSERT(the_lnet.ln_refcount == 0);
+
+	lnet_net_lock(LNET_LOCK_EX);
+	the_lnet.ln_state = LNET_STATE_STOPPING;
+
+	/*
+	 * move the nets to the zombie list to avoid them being
+	 * picked up for new work. LONET is also included in the
+	 * Nets that will be moved to the zombie list
+	 */
+	list_splice_init(&the_lnet.ln_nets, &the_lnet.ln_net_zombie);
+
+	/* Drop the cached loopback Net.
*/ + if (the_lnet.ln_loni != NULL) { + lnet_ni_decref_locked(the_lnet.ln_loni, 0); + the_lnet.ln_loni = NULL; + } + lnet_net_unlock(LNET_LOCK_EX); + + /* iterate through the net zombie list and delete each net */ + while (!list_empty(&the_lnet.ln_net_zombie)) { + net = list_entry(the_lnet.ln_net_zombie.next, + struct lnet_net, net_list); + lnet_shutdown_lndnet(net); + } + + spin_lock(&the_lnet.ln_msg_resend_lock); + list_splice(&the_lnet.ln_msg_resend, &resend); + spin_unlock(&the_lnet.ln_msg_resend_lock); + + list_for_each_entry_safe(msg, tmp, &resend, msg_list) { + list_del_init(&msg->msg_list); + msg->msg_no_resend = true; + lnet_finalize(msg, -ECANCELED); + } + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_state = LNET_STATE_SHUTDOWN; + lnet_net_unlock(LNET_LOCK_EX); +} + +static int +lnet_startup_lndni(struct lnet_ni *ni, struct lnet_lnd_tunables *tun) +{ + int rc = -EINVAL; + struct lnet_tx_queue *tq; + int i; + struct lnet_net *net = ni->ni_net; + + mutex_lock(&the_lnet.ln_lnd_mutex); + + if (tun) { + memcpy(&ni->ni_lnd_tunables, tun, sizeof(*tun)); + ni->ni_lnd_tunables_set = true; + } + + rc = (net->net_lnd->lnd_startup)(ni); + + mutex_unlock(&the_lnet.ln_lnd_mutex); + + if (rc != 0) { + LCONSOLE_ERROR_MSG(0x105, "Error %d starting up LNI %s\n", + rc, libcfs_lnd2str(net->net_lnd->lnd_type)); + goto failed0; + } + + lnet_ni_lock(ni); + ni->ni_state = LNET_NI_STATE_ACTIVE; + lnet_ni_unlock(ni); + + /* We keep a reference on the loopback net through the loopback NI */ + if (net->net_lnd->lnd_type == LOLND) { + lnet_ni_addref(ni); + LASSERT(the_lnet.ln_loni == NULL); + the_lnet.ln_loni = ni; + ni->ni_net->net_tunables.lct_peer_tx_credits = 0; + ni->ni_net->net_tunables.lct_peer_rtr_credits = 0; + ni->ni_net->net_tunables.lct_max_tx_credits = 0; + ni->ni_net->net_tunables.lct_peer_timeout = 0; + return 0; + } + + if (ni->ni_net->net_tunables.lct_peer_tx_credits == 0 || + ni->ni_net->net_tunables.lct_max_tx_credits == 0) { + LCONSOLE_ERROR_MSG(0x107, "LNI %s has no %scredits\n", + libcfs_lnd2str(net->net_lnd->lnd_type), + ni->ni_net->net_tunables.lct_peer_tx_credits == 0 ? + "" : "per-peer "); + /* shutdown the NI since if we get here then it must've already + * been started + */ + lnet_shutdown_lndni(ni); + return -EINVAL; + } + + cfs_percpt_for_each(tq, i, ni->ni_tx_queues) { + tq->tq_credits_min = + tq->tq_credits_max = + tq->tq_credits = lnet_ni_tq_credits(ni); + } + + atomic_set(&ni->ni_tx_credits, + lnet_ni_tq_credits(ni) * ni->ni_ncpts); + atomic_set(&ni->ni_healthv, LNET_MAX_HEALTH_VALUE); + + CDEBUG(D_LNI, "Added LNI %s [%d/%d/%d/%d]\n", + libcfs_nidstr(&ni->ni_nid), + ni->ni_net->net_tunables.lct_peer_tx_credits, + lnet_ni_tq_credits(ni) * LNET_CPT_NUMBER, + ni->ni_net->net_tunables.lct_peer_rtr_credits, + ni->ni_net->net_tunables.lct_peer_timeout); + + return 0; +failed0: + lnet_ni_free(ni); + return rc; +} + +static int +lnet_startup_lndnet(struct lnet_net *net, struct lnet_lnd_tunables *tun) +{ + struct lnet_ni *ni; + struct lnet_net *net_l = NULL; + LIST_HEAD(local_ni_list); + int rc; + int ni_count = 0; + __u32 lnd_type; + const struct lnet_lnd *lnd; + int peer_timeout = + net->net_tunables.lct_peer_timeout; + int maxtxcredits = + net->net_tunables.lct_max_tx_credits; + int peerrtrcredits = + net->net_tunables.lct_peer_rtr_credits; + + /* + * make sure that this net is unique. If it isn't then + * we are adding interfaces to an already existing network, and + * 'net' is just a convenient way to pass in the list. 
+	 * if it is unique we need to find the LND and load it if
+	 * necessary.
+	 */
+	if (lnet_net_unique(net->net_id, &the_lnet.ln_nets, &net_l)) {
+		lnd_type = LNET_NETTYP(net->net_id);
+
+		mutex_lock(&the_lnet.ln_lnd_mutex);
+		lnd = lnet_find_lnd_by_type(lnd_type);
+
+		if (lnd == NULL) {
+			mutex_unlock(&the_lnet.ln_lnd_mutex);
+			rc = request_module("%s", libcfs_lnd2modname(lnd_type));
+			mutex_lock(&the_lnet.ln_lnd_mutex);
+
+			lnd = lnet_find_lnd_by_type(lnd_type);
+			if (lnd == NULL) {
+				mutex_unlock(&the_lnet.ln_lnd_mutex);
+				CERROR("Can't load LND %s, module %s, rc=%d\n",
+				       libcfs_lnd2str(lnd_type),
+				       libcfs_lnd2modname(lnd_type), rc);
+#ifndef HAVE_MODULE_LOADING_SUPPORT
+				LCONSOLE_ERROR_MSG(0x104, "Your kernel must be "
+						   "compiled with kernel module "
+						   "loading support.");
+#endif
+				rc = -EINVAL;
+				goto failed0;
+			}
+		}
+
+		net->net_lnd = lnd;
+
+		mutex_unlock(&the_lnet.ln_lnd_mutex);
+
+		net_l = net;
+	}
+
+	/*
+	 * net_l: if the network being added is unique then net_l
+	 * will point to that network.
+	 * if the network being added is not unique then
+	 * net_l points to the existing network.
+	 *
+	 * When we enter the loop below, we'll pick NIs off the
+	 * network being added and start them up, then add them to
+	 * a local ni list. Once we've successfully started all
+	 * the NIs then we join the local NI list (of started up
+	 * networks) with the net_l->net_ni_list, which should
+	 * point to the correct network to add the new ni list to.
+	 *
+	 * If any of the new NIs fail to start up, then we want to
+	 * iterate through the local ni list, which should include
+	 * any NIs which were successfully started up, and shut
+	 * them down.
+	 *
+	 * After that we want to delete the network being added,
+	 * to avoid a memory leak.
+	 */
+	while (!list_empty(&net->net_ni_added)) {
+		ni = list_entry(net->net_ni_added.next, struct lnet_ni,
+				ni_netlist);
+		list_del_init(&ni->ni_netlist);
+
+		/* make sure that the NI we're about to start
+		 * up is actually unique. if it's not, fail. */
+		if (!lnet_ni_unique_net(&net_l->net_ni_list,
+					ni->ni_interface)) {
+			rc = -EEXIST;
+			goto failed1;
+		}
+
+		/* adjust the pointer to the parent network, just in case
+		 * the net is a duplicate */
+		ni->ni_net = net_l;
+
+		rc = lnet_startup_lndni(ni, tun);
+		if (rc < 0)
+			goto failed1;
+
+		lnet_ni_addref(ni);
+		list_add_tail(&ni->ni_netlist, &local_ni_list);
+
+		ni_count++;
+	}
+
+	lnet_net_lock(LNET_LOCK_EX);
+	list_splice_tail(&local_ni_list, &net_l->net_ni_list);
+	lnet_incr_dlc_seq();
+	lnet_net_unlock(LNET_LOCK_EX);
+
+	/* if the network is not unique then we don't want to keep
+	 * it around after we're done. Free it. Otherwise add that
+	 * net to the global the_lnet.ln_nets */
+	if (net_l != net && net_l != NULL) {
+		/*
+		 * TODO - note: currently the tunables cannot be updated
+		 * once added
+		 */
+		lnet_net_free(net);
+	} else {
+		/*
+		 * restore tunables after they have been overwritten by the
+		 * lnd
+		 */
+		if (peer_timeout != -1)
+			net->net_tunables.lct_peer_timeout = peer_timeout;
+		if (maxtxcredits != -1)
+			net->net_tunables.lct_max_tx_credits = maxtxcredits;
+		if (peerrtrcredits != -1)
+			net->net_tunables.lct_peer_rtr_credits = peerrtrcredits;
+
+		lnet_net_lock(LNET_LOCK_EX);
+		list_add_tail(&net->net_list, &the_lnet.ln_nets);
+		lnet_net_unlock(LNET_LOCK_EX);
+	}
+
+	return ni_count;
+
+failed1:
+	/*
+	 * shutdown the new NIs that are being started up
+	 * free the NET being started
+	 */
+	while (!list_empty(&local_ni_list)) {
+		ni = list_entry(local_ni_list.next, struct lnet_ni,
+				ni_netlist);
+
+		lnet_shutdown_lndni(ni);
+	}
+
+failed0:
+	lnet_net_free(net);
+
+	return rc;
+}
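lnet_startup_lndnet() above follows a start-all-or-roll-back discipline: if any NI fails to start, every NI started earlier in the loop is shut down again before the error is returned, and the half-built net is freed. A distilled sketch of that pattern, with hypothetical start_one()/stop_one() helpers standing in for lnet_startup_lndni()/lnet_shutdown_lndni():

#include <stdio.h>

/* Illustrative stand-ins; the failure at index 2 is fabricated so the
 * rollback path is exercised. */
static int start_one(int idx) { return idx == 2 ? -1 : 0; }
static void stop_one(int idx) { printf("rolled back NI %d\n", idx); }

/* Start n items; on any failure, stop the ones already started and
 * return the error, otherwise return the count (like ni_count). */
static int start_all(int n)
{
	int i, rc;

	for (i = 0; i < n; i++) {
		rc = start_one(i);
		if (rc < 0) {
			while (i-- > 0)	/* undo the ones already up */
				stop_one(i);
			return rc;
		}
	}
	return n;
}

int main(void)
{
	printf("started: %d\n", start_all(4));	/* fails at 2, rolls back 1, 0 */
	return 0;
}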
+
+static int
+lnet_startup_lndnets(struct list_head *netlist)
+{
+	struct lnet_net *net;
+	int rc;
+	int ni_count = 0;
+
+	/*
+	 * Change to running state before bringing up the LNDs. This
+	 * allows lnet_shutdown_lndnets() to assert that we've passed
+	 * through here.
+	 */
+	lnet_net_lock(LNET_LOCK_EX);
+	the_lnet.ln_state = LNET_STATE_RUNNING;
+	lnet_net_unlock(LNET_LOCK_EX);
+
+	while (!list_empty(netlist)) {
+		net = list_entry(netlist->next, struct lnet_net, net_list);
+		list_del_init(&net->net_list);
+
+		rc = lnet_startup_lndnet(net, NULL);
+		if (rc < 0)
+			goto failed;
+
+		ni_count += rc;
+	}
+
+	return ni_count;
+failed:
+	lnet_shutdown_lndnets();
+
+	return rc;
+}
+
+static int lnet_genl_parse_list(struct sk_buff *msg,
+				const struct ln_key_list *data[], u16 idx)
+{
+	const struct ln_key_list *list = data[idx];
+	const struct ln_key_props *props;
+	struct nlattr *node;
+	u16 count;
+
+	if (!list)
+		return 0;
+
+	if (!list->lkl_maxattr)
+		return -ERANGE;
+
+	props = list->lkl_list;
+	if (!props)
+		return -EINVAL;
+
+	node = nla_nest_start(msg, LN_SCALAR_ATTR_LIST);
+	if (!node)
+		return -ENOBUFS;
+
+	for (count = 1; count <= list->lkl_maxattr; count++) {
+		struct nlattr *key = nla_nest_start(msg, count);
+
+		if (count == 1)
+			nla_put_u16(msg, LN_SCALAR_ATTR_LIST_SIZE,
+				    list->lkl_maxattr);
+
+		nla_put_u16(msg, LN_SCALAR_ATTR_INDEX, count);
+		if (props[count].lkp_value)
+			nla_put_string(msg, LN_SCALAR_ATTR_VALUE,
+				       props[count].lkp_value);
+		if (props[count].lkp_key_format)
+			nla_put_u16(msg, LN_SCALAR_ATTR_KEY_FORMAT,
+				    props[count].lkp_key_format);
+		nla_put_u16(msg, LN_SCALAR_ATTR_NLA_TYPE,
+			    props[count].lkp_data_type);
+		if (props[count].lkp_data_type == NLA_NESTED) {
+			int rc;
+
+			rc = lnet_genl_parse_list(msg, data, ++idx);
+			if (rc < 0)
+				return rc;
+			idx = rc;
+		}
+
+		nla_nest_end(msg, key);
+	}
+
+	nla_nest_end(msg, node);
+	return idx;
+}
+
+int lnet_genl_send_scalar_list(struct sk_buff *msg, u32 portid, u32 seq,
+			       const struct genl_family *family, int flags,
+			       u8 cmd, const struct ln_key_list *data[])
+{
+	int rc = 0;
+	void *hdr;
+
+	if (!data[0])
+		return -EINVAL;
+
+	hdr = genlmsg_put(msg, portid, seq, family, flags, cmd);
+	if (!hdr)
+		GOTO(canceled, rc = -EMSGSIZE);
+
+	rc = lnet_genl_parse_list(msg, data, 0);
+	if (rc < 0)
+		GOTO(canceled, rc);
+
+	genlmsg_end(msg, hdr);
+canceled:
+	if (rc < 0)
+		genlmsg_cancel(msg, hdr);
+	return rc > 0 ? 0 : rc;
+}
+EXPORT_SYMBOL(lnet_genl_send_scalar_list);
+
+/**
+ * Initialize LNet library.
+ *
+ * Automatically called at module loading time. Caller has to call
+ * lnet_lib_exit() after a call to lnet_lib_init(), if and only if the
+ * latter returned 0.
It must be called exactly once. + * + * \retval 0 on success + * \retval -ve on failures. + */ +int lnet_lib_init(void) +{ + int rc; + + lnet_assert_wire_constants(); + + /* refer to global cfs_cpt_table for now */ + the_lnet.ln_cpt_table = cfs_cpt_tab; + the_lnet.ln_cpt_number = cfs_cpt_number(cfs_cpt_tab); + + LASSERT(the_lnet.ln_cpt_number > 0); + if (the_lnet.ln_cpt_number > LNET_CPT_MAX) { + /* we are under risk of consuming all lh_cookie */ + CERROR("Can't have %d CPTs for LNet (max allowed is %d), " + "please change setting of CPT-table and retry\n", + the_lnet.ln_cpt_number, LNET_CPT_MAX); + return -E2BIG; + } + + while ((1 << the_lnet.ln_cpt_bits) < the_lnet.ln_cpt_number) + the_lnet.ln_cpt_bits++; + + rc = lnet_create_locks(); + if (rc != 0) { + CERROR("Can't create LNet global locks: %d\n", rc); + return rc; + } + + the_lnet.ln_refcount = 0; + INIT_LIST_HEAD(&the_lnet.ln_net_zombie); + INIT_LIST_HEAD(&the_lnet.ln_msg_resend); + + /* The hash table size is the number of bits it takes to express the set + * ln_num_routes, minus 1 (better to under estimate than over so we + * don't waste memory). */ + if (rnet_htable_size <= 0) + rnet_htable_size = LNET_REMOTE_NETS_HASH_DEFAULT; + else if (rnet_htable_size > LNET_REMOTE_NETS_HASH_MAX) + rnet_htable_size = LNET_REMOTE_NETS_HASH_MAX; + the_lnet.ln_remote_nets_hbits = max_t(int, 1, + order_base_2(rnet_htable_size) - 1); + + /* All LNDs apart from the LOLND are in separate modules. They + * register themselves when their module loads, and unregister + * themselves when their module is unloaded. */ + lnet_register_lnd(&the_lolnd); + return 0; +} + +/** + * Finalize LNet library. + * + * \pre lnet_lib_init() called with success. + * \pre All LNet users called LNetNIFini() for matching LNetNIInit() calls. + * + * As this happens at module-unload, all lnds must already be unloaded, + * so they must already be unregistered. + */ +void lnet_lib_exit(void) +{ + int i; + + LASSERT(the_lnet.ln_refcount == 0); + lnet_unregister_lnd(&the_lolnd); + for (i = 0; i < NUM_LNDS; i++) + LASSERT(!the_lnet.ln_lnds[i]); + lnet_destroy_locks(); +} + +/** + * Set LNet PID and start LNet interfaces, routing, and forwarding. + * + * Users must call this function at least once before any other functions. + * For each successful call there must be a corresponding call to + * LNetNIFini(). For subsequent calls to LNetNIInit(), \a requested_pid is + * ignored. + * + * The PID used by LNet may be different from the one requested. + * See LNetGetId(). + * + * \param requested_pid PID requested by the caller. + * + * \return >= 0 on success, and < 0 error code on failures. 
+ */ +int +LNetNIInit(lnet_pid_t requested_pid) +{ + int im_a_router = 0; + int rc; + int ni_count; + struct lnet_ping_buffer *pbuf; + struct lnet_handle_md ping_mdh; + LIST_HEAD(net_head); + struct lnet_net *net; + + mutex_lock(&the_lnet.ln_api_mutex); + + CDEBUG(D_OTHER, "refs %d\n", the_lnet.ln_refcount); + + if (the_lnet.ln_state == LNET_STATE_STOPPING) { + mutex_unlock(&the_lnet.ln_api_mutex); + return -ESHUTDOWN; + } + + if (the_lnet.ln_refcount > 0) { + rc = the_lnet.ln_refcount++; + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + rc = lnet_prepare(requested_pid); + if (rc != 0) { + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + /* create a network for Loopback network */ + net = lnet_net_alloc(LNET_MKNET(LOLND, 0), &net_head); + if (net == NULL) { + rc = -ENOMEM; + goto err_empty_list; + } + + /* Add in the loopback NI */ + if (lnet_ni_alloc(net, NULL, NULL) == NULL) { + rc = -ENOMEM; + goto err_empty_list; + } + + if (use_tcp_bonding) + CWARN("use_tcp_bonding has been removed. Use Multi-Rail and Dynamic Discovery instead, see LU-13641\n"); + + /* If LNet is being initialized via DLC it is possible + * that the user requests not to load module parameters (ones which + * are supported by DLC) on initialization. Therefore, make sure not + * to load networks, routes and forwarding from module parameters + * in this case. On cleanup in case of failure only clean up + * routes if it has been loaded */ + if (!the_lnet.ln_nis_from_mod_params) { + rc = lnet_parse_networks(&net_head, lnet_get_networks()); + if (rc < 0) + goto err_empty_list; + } + + ni_count = lnet_startup_lndnets(&net_head); + if (ni_count < 0) { + rc = ni_count; + goto err_empty_list; + } + + if (!the_lnet.ln_nis_from_mod_params) { + rc = lnet_parse_routes(lnet_get_routes(), &im_a_router); + if (rc != 0) + goto err_shutdown_lndnis; + + rc = lnet_rtrpools_alloc(im_a_router); + if (rc != 0) + goto err_destroy_routes; + } + + rc = lnet_acceptor_start(); + if (rc != 0) + goto err_destroy_routes; + + the_lnet.ln_refcount = 1; + /* Now I may use my own API functions... */ + + rc = lnet_ping_target_setup(&pbuf, &ping_mdh, ni_count, true); + if (rc != 0) + goto err_acceptor_stop; + + lnet_ping_target_update(pbuf, ping_mdh); + + the_lnet.ln_mt_handler = lnet_mt_event_handler; + + rc = lnet_push_target_init(); + if (rc != 0) + goto err_stop_ping; + + rc = lnet_peer_discovery_start(); + if (rc != 0) + goto err_destroy_push_target; + + rc = lnet_monitor_thr_start(); + if (rc != 0) + goto err_stop_discovery_thr; + + lnet_fault_init(); + lnet_router_debugfs_init(); + + mutex_unlock(&the_lnet.ln_api_mutex); + + complete_all(&the_lnet.ln_started); + + /* wait for all routers to start */ + lnet_wait_router_start(); + + return 0; + +err_stop_discovery_thr: + lnet_peer_discovery_stop(); +err_destroy_push_target: + lnet_push_target_fini(); +err_stop_ping: + lnet_ping_target_fini(); +err_acceptor_stop: + the_lnet.ln_refcount = 0; + lnet_acceptor_stop(); +err_destroy_routes: + if (!the_lnet.ln_nis_from_mod_params) + lnet_destroy_routes(); +err_shutdown_lndnis: + lnet_shutdown_lndnets(); +err_empty_list: + lnet_unprepare(); + LASSERT(rc < 0); + mutex_unlock(&the_lnet.ln_api_mutex); + while (!list_empty(&net_head)) { + struct lnet_net *net; + + net = list_entry(net_head.next, struct lnet_net, net_list); + list_del_init(&net->net_list); + lnet_net_free(net); + } + return rc; +} +EXPORT_SYMBOL(LNetNIInit); + +/** + * Stop LNet interfaces, routing, and forwarding. 
+ * + * Users must call this function once for each successful call to LNetNIInit(). + * Once the LNetNIFini() operation has been started, the results of pending + * API operations are undefined. + * + * \return always 0 for current implementation. + */ +int +LNetNIFini(void) +{ + mutex_lock(&the_lnet.ln_api_mutex); + + LASSERT(the_lnet.ln_refcount > 0); + + if (the_lnet.ln_refcount != 1) { + the_lnet.ln_refcount--; + } else { + LASSERT(!the_lnet.ln_niinit_self); + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_state = LNET_STATE_STOPPING; + lnet_net_unlock(LNET_LOCK_EX); + + lnet_fault_fini(); + + lnet_router_debugfs_fini(); + lnet_monitor_thr_stop(); + lnet_peer_discovery_stop(); + lnet_push_target_fini(); + lnet_ping_target_fini(); + + /* Teardown fns that use my own API functions BEFORE here */ + the_lnet.ln_refcount = 0; + + lnet_acceptor_stop(); + lnet_destroy_routes(); + lnet_shutdown_lndnets(); + lnet_unprepare(); + } + + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; +} +EXPORT_SYMBOL(LNetNIFini); + +/** + * Grabs the ni data from the ni structure and fills the out + * parameters + * + * \param[in] ni network interface structure + * \param[out] cfg_ni NI config information + * \param[out] tun network and LND tunables + */ +static void +lnet_fill_ni_info(struct lnet_ni *ni, struct lnet_ioctl_config_ni *cfg_ni, + struct lnet_ioctl_config_lnd_tunables *tun, + struct lnet_ioctl_element_stats *stats, + __u32 tun_size) +{ + size_t min_size = 0; + int i; + + if (!ni || !cfg_ni || !tun || !nid_is_nid4(&ni->ni_nid)) + return; + + if (ni->ni_interface != NULL) { + strncpy(cfg_ni->lic_ni_intf, + ni->ni_interface, + sizeof(cfg_ni->lic_ni_intf)); + } + + cfg_ni->lic_nid = lnet_nid_to_nid4(&ni->ni_nid); + cfg_ni->lic_status = lnet_ni_get_status_locked(ni); + cfg_ni->lic_dev_cpt = ni->ni_dev_cpt; + + memcpy(&tun->lt_cmn, &ni->ni_net->net_tunables, sizeof(tun->lt_cmn)); + + if (stats) { + stats->iel_send_count = lnet_sum_stats(&ni->ni_stats, + LNET_STATS_TYPE_SEND); + stats->iel_recv_count = lnet_sum_stats(&ni->ni_stats, + LNET_STATS_TYPE_RECV); + stats->iel_drop_count = lnet_sum_stats(&ni->ni_stats, + LNET_STATS_TYPE_DROP); + } + + /* + * tun->lt_tun will always be present, but in order to be + * backwards compatible, we need to deal with the cases when + * tun->lt_tun is smaller than what the kernel has, because it + * comes from an older version of a userspace program, then we'll + * need to copy as much information as we have available space. + */ + min_size = tun_size - sizeof(tun->lt_cmn); + memcpy(&tun->lt_tun, &ni->ni_lnd_tunables, min_size); + + /* copy over the cpts */ + if (ni->ni_ncpts == LNET_CPT_NUMBER && + ni->ni_cpts == NULL) { + for (i = 0; i < ni->ni_ncpts; i++) + cfg_ni->lic_cpts[i] = i; + } else { + for (i = 0; + ni->ni_cpts != NULL && i < ni->ni_ncpts && + i < LNET_MAX_SHOW_NUM_CPT; + i++) + cfg_ni->lic_cpts[i] = ni->ni_cpts[i]; + } + cfg_ni->lic_ncpts = ni->ni_ncpts; +} + +/** + * NOTE: This is a legacy function left in the code to be backwards + * compatible with older userspace programs. It should eventually be + * removed. 
+ *
+ * Grabs the ni data from the ni structure and fills the out
+ * parameters
+ *
+ * \param[in] ni network interface structure
+ * \param[out] config config information
+ */
+static void
+lnet_fill_ni_info_legacy(struct lnet_ni *ni,
+			 struct lnet_ioctl_config_data *config)
+{
+	struct lnet_ioctl_net_config *net_config;
+	struct lnet_ioctl_config_lnd_tunables *lnd_cfg = NULL;
+	size_t min_size, tunable_size = 0;
+	int i;
+
+	if (!ni || !config || !nid_is_nid4(&ni->ni_nid))
+		return;
+
+	net_config = (struct lnet_ioctl_net_config *) config->cfg_bulk;
+	if (!net_config)
+		return;
+
+	if (!ni->ni_interface)
+		return;
+
+	strncpy(net_config->ni_interface,
+		ni->ni_interface,
+		sizeof(net_config->ni_interface));
+
+	config->cfg_nid = lnet_nid_to_nid4(&ni->ni_nid);
+	config->cfg_config_u.cfg_net.net_peer_timeout =
+		ni->ni_net->net_tunables.lct_peer_timeout;
+	config->cfg_config_u.cfg_net.net_max_tx_credits =
+		ni->ni_net->net_tunables.lct_max_tx_credits;
+	config->cfg_config_u.cfg_net.net_peer_tx_credits =
+		ni->ni_net->net_tunables.lct_peer_tx_credits;
+	config->cfg_config_u.cfg_net.net_peer_rtr_credits =
+		ni->ni_net->net_tunables.lct_peer_rtr_credits;
+
+	net_config->ni_status = lnet_ni_get_status_locked(ni);
+
+	if (ni->ni_cpts) {
+		int num_cpts = min(ni->ni_ncpts, LNET_MAX_SHOW_NUM_CPT);
+
+		for (i = 0; i < num_cpts; i++)
+			net_config->ni_cpts[i] = ni->ni_cpts[i];
+
+		config->cfg_ncpts = num_cpts;
+	}
+
+	/*
+	 * See if user land tools sent in a newer and larger version
+	 * of struct lnet_tunables than what the kernel uses.
+	 */
+	min_size = sizeof(*config) + sizeof(*net_config);
+
+	if (config->cfg_hdr.ioc_len > min_size)
+		tunable_size = config->cfg_hdr.ioc_len - min_size;
+
+	/* Don't copy too much data to user space */
+	min_size = min(tunable_size, sizeof(ni->ni_lnd_tunables));
+	lnd_cfg = (struct lnet_ioctl_config_lnd_tunables *)net_config->cfg_bulk;
+
+	if (lnd_cfg && min_size) {
+		memcpy(&lnd_cfg->lt_tun, &ni->ni_lnd_tunables, min_size);
+		config->cfg_config_u.cfg_net.net_interface_count = 1;
+
+		/* Tell user land that kernel side has less data */
+		if (tunable_size > sizeof(ni->ni_lnd_tunables)) {
+			min_size = tunable_size - sizeof(ni->ni_lnd_tunables);
+			config->cfg_hdr.ioc_len -= min_size;
+		}
+	}
+}
+
+struct lnet_ni *
+lnet_get_ni_idx_locked(int idx)
+{
+	struct lnet_ni *ni;
+	struct lnet_net *net;
+
+	list_for_each_entry(net, &the_lnet.ln_nets, net_list) {
+		list_for_each_entry(ni, &net->net_ni_list, ni_netlist) {
+			if (idx-- == 0)
+				return ni;
+		}
+	}
+
+	return NULL;
+}
+
+int lnet_get_net_healthv_locked(struct lnet_net *net)
+{
+	struct lnet_ni *ni;
+	int best_healthv = 0;
+	int healthv, ni_fatal;
+
+	list_for_each_entry(ni, &net->net_ni_list, ni_netlist) {
+		healthv = atomic_read(&ni->ni_healthv);
+		ni_fatal = atomic_read(&ni->ni_fatal_error_on);
+		if (!ni_fatal && healthv > best_healthv)
+			best_healthv = healthv;
+	}
+
+	return best_healthv;
+}
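lnet_get_ni_idx_locked() flattens the two-level net/NI lists behind a single 0-based index, which is how the ioctl handlers below page through interfaces one call at a time. A small sketch of the idx-- == 0 cursor trick on illustrative data (the interface names are invented, not LNet identifiers):

#include <stdio.h>

/* Walk a nested table as if it were flat: every visited element
 * decrements idx, and the element that sees idx at 0 is the answer. */
static const char *nth(const char *const nets[][3], int rows, int idx)
{
	for (int i = 0; i < rows; i++)
		for (int j = 0; j < 3 && nets[i][j]; j++)
			if (idx-- == 0)
				return nets[i][j];
	return NULL;	/* index past the last NI: caller reports -ENOENT */
}

int main(void)
{
	const char *const nets[2][3] = {
		{ "tcp0-ni0", "tcp0-ni1", NULL },
		{ "o2ib0-ni0", NULL, NULL },
	};

	printf("%s\n", nth(nets, 2, 2));	/* prints: o2ib0-ni0 */
	return 0;
}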
+
+struct lnet_ni *
+lnet_get_next_ni_locked(struct lnet_net *mynet, struct lnet_ni *prev)
+{
+	struct lnet_ni *ni;
+	struct lnet_net *net = mynet;
+
+	/*
+	 * It is possible that the net has been cleaned out while there is
+	 * a message being sent. This function accesses the net without
+	 * checking if the list is empty
+	 */
+	if (prev == NULL) {
+		if (net == NULL)
+			net = list_entry(the_lnet.ln_nets.next, struct lnet_net,
+					 net_list);
+		if (list_empty(&net->net_ni_list))
+			return NULL;
+		ni = list_entry(net->net_ni_list.next, struct lnet_ni,
+				ni_netlist);
+
+		return ni;
+	}
+
+	if (prev->ni_netlist.next == &prev->ni_net->net_ni_list) {
+		/* if you reached the end of the ni list and the net is
+		 * specified, then there are no more nis in that net */
+		if (net != NULL)
+			return NULL;
+
+		/* we reached the end of this net ni list. move to the
+		 * next net */
+		if (prev->ni_net->net_list.next == &the_lnet.ln_nets)
+			/* no more nets and no more NIs. */
+			return NULL;
+
+		/* get the next net */
+		net = list_entry(prev->ni_net->net_list.next, struct lnet_net,
+				 net_list);
+		if (list_empty(&net->net_ni_list))
+			return NULL;
+		/* get the ni on it */
+		ni = list_entry(net->net_ni_list.next, struct lnet_ni,
+				ni_netlist);
+
+		return ni;
+	}
+
+	if (list_empty(&prev->ni_netlist))
+		return NULL;
+
+	/* there are more nis left */
+	ni = list_entry(prev->ni_netlist.next, struct lnet_ni, ni_netlist);
+
+	return ni;
+}
+
+int
+lnet_get_net_config(struct lnet_ioctl_config_data *config)
+{
+	struct lnet_ni *ni;
+	int cpt;
+	int rc = -ENOENT;
+	int idx = config->cfg_count;
+
+	cpt = lnet_net_lock_current();
+
+	ni = lnet_get_ni_idx_locked(idx);
+	if (ni != NULL) {
+		rc = 0;
+		lnet_ni_lock(ni);
+		lnet_fill_ni_info_legacy(ni, config);
+		lnet_ni_unlock(ni);
+	}
+
+	lnet_net_unlock(cpt);
+	return rc;
+}
+
+int
+lnet_get_ni_config(struct lnet_ioctl_config_ni *cfg_ni,
+		   struct lnet_ioctl_config_lnd_tunables *tun,
+		   struct lnet_ioctl_element_stats *stats,
+		   __u32 tun_size)
+{
+	struct lnet_ni *ni;
+	int cpt;
+	int rc = -ENOENT;
+
+	if (!cfg_ni || !tun || !stats)
+		return -EINVAL;
+
+	cpt = lnet_net_lock_current();
+
+	ni = lnet_get_ni_idx_locked(cfg_ni->lic_idx);
+	if (ni) {
+		rc = 0;
+		lnet_ni_lock(ni);
+		lnet_fill_ni_info(ni, cfg_ni, tun, stats, tun_size);
+		lnet_ni_unlock(ni);
+	}
+
+	lnet_net_unlock(cpt);
+	return rc;
+}
+
+int lnet_get_ni_stats(struct lnet_ioctl_element_msg_stats *msg_stats)
+{
+	struct lnet_ni *ni;
+	int cpt;
+	int rc = -ENOENT;
+
+	if (!msg_stats)
+		return -EINVAL;
+
+	cpt = lnet_net_lock_current();
+
+	ni = lnet_get_ni_idx_locked(msg_stats->im_idx);
+	if (ni) {
+		lnet_usr_translate_stats(msg_stats, &ni->ni_stats);
+		rc = 0;
+	}
+
+	lnet_net_unlock(cpt);
+
+	return rc;
+}
+
+static int lnet_add_net_common(struct lnet_net *net,
+			       struct lnet_ioctl_config_lnd_tunables *tun)
+{
+	struct lnet_handle_md ping_mdh;
+	struct lnet_ping_buffer *pbuf;
+	struct lnet_remotenet *rnet;
+	struct lnet_ni *ni;
+	int net_ni_count;
+	__u32 net_id;
+	int rc;
+
+	lnet_net_lock(LNET_LOCK_EX);
+	rnet = lnet_find_rnet_locked(net->net_id);
+	lnet_net_unlock(LNET_LOCK_EX);
+	/*
+	 * make sure that the net added doesn't invalidate the current
+	 * configuration LNet is keeping
+	 */
+	if (rnet) {
+		CERROR("Adding net %s will invalidate routing configuration\n",
+		       libcfs_net2str(net->net_id));
+		lnet_net_free(net);
+		return -EUSERS;
+	}
+
+	/*
+	 * make sure you calculate the correct number of slots in the ping
+	 * buffer. Since the ping info is a flattened list of all the NIs,
+	 * we should allocate enough slots to accommodate the number of NIs
+	 * which will be added.
+ * + * since ni hasn't been configured yet, use + * lnet_get_net_ni_count_pre() which checks the net_ni_added list + */ + net_ni_count = lnet_get_net_ni_count_pre(net); + + rc = lnet_ping_target_setup(&pbuf, &ping_mdh, + net_ni_count + lnet_get_ni_count(), + false); + if (rc < 0) { + lnet_net_free(net); + return rc; + } + + if (tun) + memcpy(&net->net_tunables, + &tun->lt_cmn, sizeof(net->net_tunables)); + else + memset(&net->net_tunables, -1, sizeof(net->net_tunables)); + + net_id = net->net_id; + + rc = lnet_startup_lndnet(net, + (tun) ? &tun->lt_tun : NULL); + if (rc < 0) + goto failed; + + lnet_net_lock(LNET_LOCK_EX); + net = lnet_get_net_locked(net_id); + LASSERT(net); + + /* apply the UDSPs */ + rc = lnet_udsp_apply_policies_on_net(net); + if (rc) + CERROR("Failed to apply UDSPs on local net %s\n", + libcfs_net2str(net->net_id)); + + /* At this point we lost track of which NI was just added, so we + * just re-apply the policies on all of the NIs on this net + */ + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + rc = lnet_udsp_apply_policies_on_ni(ni); + if (rc) + CERROR("Failed to apply UDSPs on ni %s\n", + libcfs_nidstr(&ni->ni_nid)); + } + lnet_net_unlock(LNET_LOCK_EX); + + /* + * Start the acceptor thread if this is the first network + * being added that requires the thread. + */ + if (net->net_lnd->lnd_accept) { + rc = lnet_acceptor_start(); + if (rc < 0) { + /* shutdown the net that we just started */ + CERROR("Failed to start up acceptor thread\n"); + lnet_shutdown_lndnet(net); + goto failed; + } + } + + lnet_net_lock(LNET_LOCK_EX); + lnet_peer_net_added(net); + lnet_net_unlock(LNET_LOCK_EX); + + lnet_ping_target_update(pbuf, ping_mdh); + + return 0; + +failed: + lnet_ping_md_unlink(pbuf, &ping_mdh); + lnet_ping_buffer_decref(pbuf); + return rc; +} + +static void +lnet_set_tune_defaults(struct lnet_ioctl_config_lnd_tunables *tun) +{ + if (tun) { + if (!tun->lt_cmn.lct_peer_timeout) + tun->lt_cmn.lct_peer_timeout = DEFAULT_PEER_TIMEOUT; + if (!tun->lt_cmn.lct_peer_tx_credits) + tun->lt_cmn.lct_peer_tx_credits = DEFAULT_PEER_CREDITS; + if (!tun->lt_cmn.lct_max_tx_credits) + tun->lt_cmn.lct_max_tx_credits = DEFAULT_CREDITS; + } +} + +static int lnet_handle_legacy_ip2nets(char *ip2nets, + struct lnet_ioctl_config_lnd_tunables *tun) +{ + struct lnet_net *net; + const char *nets; + int rc; + LIST_HEAD(net_head); + + rc = lnet_parse_ip2nets(&nets, ip2nets); + if (rc < 0) + return rc; + + rc = lnet_parse_networks(&net_head, nets); + if (rc < 0) + return rc; + + lnet_set_tune_defaults(tun); + + mutex_lock(&the_lnet.ln_api_mutex); + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + rc = -ESHUTDOWN; + goto out; + } + while (!list_empty(&net_head)) { + net = list_entry(net_head.next, struct lnet_net, net_list); + list_del_init(&net->net_list); + rc = lnet_add_net_common(net, tun); + if (rc < 0) + goto out; + } + +out: + mutex_unlock(&the_lnet.ln_api_mutex); + + while (!list_empty(&net_head)) { + net = list_entry(net_head.next, struct lnet_net, net_list); + list_del_init(&net->net_list); + lnet_net_free(net); + } + return rc; +} + +int lnet_dyn_add_ni(struct lnet_ioctl_config_ni *conf) +{ + struct lnet_net *net; + struct lnet_ni *ni; + struct lnet_ioctl_config_lnd_tunables *tun = NULL; + int rc, i; + __u32 net_id, lnd_type; + + /* get the tunables if they are available */ + if (conf->lic_cfg_hdr.ioc_len >= + sizeof(*conf) + sizeof(*tun)) + tun = (struct lnet_ioctl_config_lnd_tunables *) + conf->lic_bulk; + + /* handle legacy ip2nets from DLC */ + if (conf->lic_legacy_ip2nets[0] 
!= '\0') + return lnet_handle_legacy_ip2nets(conf->lic_legacy_ip2nets, + tun); + + net_id = LNET_NIDNET(conf->lic_nid); + lnd_type = LNET_NETTYP(net_id); + + if (!libcfs_isknown_lnd(lnd_type)) { + CERROR("No valid net and lnd information provided\n"); + return -EINVAL; + } + + net = lnet_net_alloc(net_id, NULL); + if (!net) + return -ENOMEM; + + for (i = 0; i < conf->lic_ncpts; i++) { + if (conf->lic_cpts[i] >= LNET_CPT_NUMBER) + return -EINVAL; + } + + ni = lnet_ni_alloc_w_cpt_array(net, conf->lic_cpts, conf->lic_ncpts, + conf->lic_ni_intf); + if (!ni) + return -ENOMEM; + + lnet_set_tune_defaults(tun); + + mutex_lock(&the_lnet.ln_api_mutex); + if (the_lnet.ln_state != LNET_STATE_RUNNING) + rc = -ESHUTDOWN; + else + rc = lnet_add_net_common(net, tun); + + mutex_unlock(&the_lnet.ln_api_mutex); + + return rc; +} + +int lnet_dyn_del_ni(struct lnet_ioctl_config_ni *conf) +{ + struct lnet_net *net; + struct lnet_ni *ni; + __u32 net_id = LNET_NIDNET(conf->lic_nid); + struct lnet_ping_buffer *pbuf; + struct lnet_handle_md ping_mdh; + int rc; + int net_count; + __u32 addr; + + /* don't allow userspace to shutdown the LOLND */ + if (LNET_NETTYP(net_id) == LOLND) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + rc = -ESHUTDOWN; + goto unlock_api_mutex; + } + + lnet_net_lock(0); + + net = lnet_get_net_locked(net_id); + if (!net) { + CERROR("net %s not found\n", + libcfs_net2str(net_id)); + rc = -ENOENT; + goto unlock_net; + } + + addr = LNET_NIDADDR(conf->lic_nid); + if (addr == 0) { + /* remove the entire net */ + net_count = lnet_get_net_ni_count_locked(net); + + lnet_net_unlock(0); + + /* create and link a new ping info, before removing the old one */ + rc = lnet_ping_target_setup(&pbuf, &ping_mdh, + lnet_get_ni_count() - net_count, + false); + if (rc != 0) + goto unlock_api_mutex; + + lnet_shutdown_lndnet(net); + + lnet_acceptor_stop(); + + lnet_ping_target_update(pbuf, ping_mdh); + + goto unlock_api_mutex; + } + + ni = lnet_nid2ni_locked(conf->lic_nid, 0); + if (!ni) { + CERROR("nid %s not found\n", + libcfs_nid2str(conf->lic_nid)); + rc = -ENOENT; + goto unlock_net; + } + + net_count = lnet_get_net_ni_count_locked(net); + + lnet_net_unlock(0); + + /* create and link a new ping info, before removing the old one */ + rc = lnet_ping_target_setup(&pbuf, &ping_mdh, + lnet_get_ni_count() - 1, false); + if (rc != 0) + goto unlock_api_mutex; + + lnet_shutdown_lndni(ni); + + lnet_acceptor_stop(); + + lnet_ping_target_update(pbuf, ping_mdh); + + /* check if the net is empty and remove it if it is */ + if (net_count == 1) + lnet_shutdown_lndnet(net); + + goto unlock_api_mutex; + +unlock_net: + lnet_net_unlock(0); +unlock_api_mutex: + mutex_unlock(&the_lnet.ln_api_mutex); + + return rc; +} + +/* + * lnet_dyn_add_net and lnet_dyn_del_net are now deprecated. + * They are only expected to be called for unique networks. + * That can be as a result of older DLC library + * calls. Multi-Rail DLC and beyond no longer uses these APIs. + */ +int +lnet_dyn_add_net(struct lnet_ioctl_config_data *conf) +{ + struct lnet_net *net; + LIST_HEAD(net_head); + int rc; + struct lnet_ioctl_config_lnd_tunables tun; + const char *nets = conf->cfg_config_u.cfg_net.net_intf; + + /* Create a net/ni structures for the network string */ + rc = lnet_parse_networks(&net_head, nets); + if (rc <= 0) + return rc == 0 ? 
-EINVAL : rc; + + mutex_lock(&the_lnet.ln_api_mutex); + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + rc = -ESHUTDOWN; + goto out_unlock_clean; + } + + if (rc > 1) { + rc = -EINVAL; /* only add one network per call */ + goto out_unlock_clean; + } + + net = list_entry(net_head.next, struct lnet_net, net_list); + list_del_init(&net->net_list); + + LASSERT(lnet_net_unique(net->net_id, &the_lnet.ln_nets, NULL)); + + memset(&tun, 0, sizeof(tun)); + + tun.lt_cmn.lct_peer_timeout = + (!conf->cfg_config_u.cfg_net.net_peer_timeout) ? DEFAULT_PEER_TIMEOUT : + conf->cfg_config_u.cfg_net.net_peer_timeout; + tun.lt_cmn.lct_peer_tx_credits = + (!conf->cfg_config_u.cfg_net.net_peer_tx_credits) ? DEFAULT_PEER_CREDITS : + conf->cfg_config_u.cfg_net.net_peer_tx_credits; + tun.lt_cmn.lct_peer_rtr_credits = + conf->cfg_config_u.cfg_net.net_peer_rtr_credits; + tun.lt_cmn.lct_max_tx_credits = + (!conf->cfg_config_u.cfg_net.net_max_tx_credits) ? DEFAULT_CREDITS : + conf->cfg_config_u.cfg_net.net_max_tx_credits; + + rc = lnet_add_net_common(net, &tun); + +out_unlock_clean: + mutex_unlock(&the_lnet.ln_api_mutex); + while (!list_empty(&net_head)) { + /* net_head list is empty in success case */ + net = list_entry(net_head.next, struct lnet_net, net_list); + list_del_init(&net->net_list); + lnet_net_free(net); + } + return rc; +} + +int +lnet_dyn_del_net(__u32 net_id) +{ + struct lnet_net *net; + struct lnet_ping_buffer *pbuf; + struct lnet_handle_md ping_mdh; + int rc; + int net_ni_count; + + /* don't allow userspace to shutdown the LOLND */ + if (LNET_NETTYP(net_id) == LOLND) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + rc = -ESHUTDOWN; + goto out; + } + + lnet_net_lock(0); + + net = lnet_get_net_locked(net_id); + if (net == NULL) { + lnet_net_unlock(0); + rc = -EINVAL; + goto out; + } + + net_ni_count = lnet_get_net_ni_count_locked(net); + + lnet_net_unlock(0); + + /* create and link a new ping info, before removing the old one */ + rc = lnet_ping_target_setup(&pbuf, &ping_mdh, + lnet_get_ni_count() - net_ni_count, false); + if (rc != 0) + goto out; + + lnet_shutdown_lndnet(net); + + lnet_acceptor_stop(); + + lnet_ping_target_update(pbuf, ping_mdh); + +out: + mutex_unlock(&the_lnet.ln_api_mutex); + + return rc; +} + +void lnet_incr_dlc_seq(void) +{ + atomic_inc(&lnet_dlc_seq_no); +} + +__u32 lnet_get_dlc_seq_locked(void) +{ + return atomic_read(&lnet_dlc_seq_no); +} + +static void +lnet_ni_set_healthv(lnet_nid_t nid, int value, bool all) +{ + struct lnet_net *net; + struct lnet_ni *ni; + + lnet_net_lock(LNET_LOCK_EX); + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + if (all || (nid_is_nid4(&ni->ni_nid) && + lnet_nid_to_nid4(&ni->ni_nid) == nid)) { + atomic_set(&ni->ni_healthv, value); + if (list_empty(&ni->ni_recovery) && + value < LNET_MAX_HEALTH_VALUE) { + CERROR("manually adding local NI %s to recovery\n", + libcfs_nidstr(&ni->ni_nid)); + list_add_tail(&ni->ni_recovery, + &the_lnet.ln_mt_localNIRecovq); + lnet_ni_addref_locked(ni, 0); + } + if (!all) { + lnet_net_unlock(LNET_LOCK_EX); + return; + } + } + } + } + lnet_net_unlock(LNET_LOCK_EX); +} + +static void +lnet_ni_set_conns_per_peer(lnet_nid_t nid, int value, bool all) +{ + struct lnet_net *net; + struct lnet_ni *ni; + + lnet_net_lock(LNET_LOCK_EX); + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + if (lnet_nid_to_nid4(&ni->ni_nid) != nid && !all) + 
continue; + if (LNET_NETTYP(net->net_id) == SOCKLND) + ni->ni_lnd_tunables.lnd_tun_u.lnd_sock.lnd_conns_per_peer = value; + else if (LNET_NETTYP(net->net_id) == O2IBLND) + ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib.lnd_conns_per_peer = value; + if (!all) { + lnet_net_unlock(LNET_LOCK_EX); + return; + } + } + } + lnet_net_unlock(LNET_LOCK_EX); +} + +static int +lnet_get_local_ni_hstats(struct lnet_ioctl_local_ni_hstats *stats) +{ + int cpt, rc = 0; + struct lnet_ni *ni; + lnet_nid_t nid = stats->hlni_nid; + + cpt = lnet_net_lock_current(); + ni = lnet_nid2ni_locked(nid, cpt); + + if (!ni) { + rc = -ENOENT; + goto unlock; + } + + stats->hlni_local_interrupt = atomic_read(&ni->ni_hstats.hlt_local_interrupt); + stats->hlni_local_dropped = atomic_read(&ni->ni_hstats.hlt_local_dropped); + stats->hlni_local_aborted = atomic_read(&ni->ni_hstats.hlt_local_aborted); + stats->hlni_local_no_route = atomic_read(&ni->ni_hstats.hlt_local_no_route); + stats->hlni_local_timeout = atomic_read(&ni->ni_hstats.hlt_local_timeout); + stats->hlni_local_error = atomic_read(&ni->ni_hstats.hlt_local_error); + stats->hlni_fatal_error = atomic_read(&ni->ni_fatal_error_on); + stats->hlni_health_value = atomic_read(&ni->ni_healthv); + stats->hlni_ping_count = ni->ni_ping_count; + stats->hlni_next_ping = ni->ni_next_ping; + +unlock: + lnet_net_unlock(cpt); + + return rc; +} + +static int +lnet_get_local_ni_recovery_list(struct lnet_ioctl_recovery_list *list) +{ + struct lnet_ni *ni; + int i = 0; + + lnet_net_lock(LNET_LOCK_EX); + list_for_each_entry(ni, &the_lnet.ln_mt_localNIRecovq, ni_recovery) { + if (!nid_is_nid4(&ni->ni_nid)) + continue; + list->rlst_nid_array[i] = lnet_nid_to_nid4(&ni->ni_nid); + i++; + if (i >= LNET_MAX_SHOW_NUM_NID) + break; + } + lnet_net_unlock(LNET_LOCK_EX); + list->rlst_num_nids = i; + + return 0; +} + +static int +lnet_get_peer_ni_recovery_list(struct lnet_ioctl_recovery_list *list) +{ + struct lnet_peer_ni *lpni; + int i = 0; + + lnet_net_lock(LNET_LOCK_EX); + list_for_each_entry(lpni, &the_lnet.ln_mt_peerNIRecovq, lpni_recovery) { + list->rlst_nid_array[i] = lnet_nid_to_nid4(&lpni->lpni_nid); + i++; + if (i >= LNET_MAX_SHOW_NUM_NID) + break; + } + lnet_net_unlock(LNET_LOCK_EX); + list->rlst_num_nids = i; + + return 0; +} + +/** + * LNet ioctl handler. + * + */ +int +LNetCtl(unsigned int cmd, void *arg) +{ + struct libcfs_ioctl_data *data = arg; + struct lnet_ioctl_config_data *config; + struct lnet_process_id id4 = {}; + struct lnet_processid id = {}; + struct lnet_ni *ni; + struct lnet_nid nid; + int rc; + + BUILD_BUG_ON(sizeof(struct lnet_ioctl_net_config) + + sizeof(struct lnet_ioctl_config_data) > LIBCFS_IOC_DATA_MAX); + + switch (cmd) { + case IOC_LIBCFS_GET_NI: + rc = LNetGetId(data->ioc_count, &id); + data->ioc_nid = lnet_nid_to_nid4(&id.nid); + return rc; + + case IOC_LIBCFS_FAIL_NID: + return lnet_fail_nid(data->ioc_nid, data->ioc_count); + + case IOC_LIBCFS_ADD_ROUTE: { + /* default router sensitivity to 1 */ + unsigned int sensitivity = 1; + config = arg; + + if (config->cfg_hdr.ioc_len < sizeof(*config)) + return -EINVAL; + + if (config->cfg_config_u.cfg_route.rtr_sensitivity) { + sensitivity = + config->cfg_config_u.cfg_route.rtr_sensitivity; + } + + lnet_nid4_to_nid(config->cfg_nid, &nid); + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_add_route(config->cfg_net, + config->cfg_config_u.cfg_route.rtr_hop, + &nid, + config->cfg_config_u.cfg_route. 
+ rtr_priority, sensitivity); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_DEL_ROUTE: + config = arg; + + if (config->cfg_hdr.ioc_len < sizeof(*config)) + return -EINVAL; + + lnet_nid4_to_nid(config->cfg_nid, &nid); + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_del_route(config->cfg_net, &nid); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + + case IOC_LIBCFS_GET_ROUTE: + config = arg; + + if (config->cfg_hdr.ioc_len < sizeof(*config)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_route(config->cfg_count, + &config->cfg_net, + &config->cfg_config_u.cfg_route.rtr_hop, + &config->cfg_nid, + &config->cfg_config_u.cfg_route.rtr_flags, + &config->cfg_config_u.cfg_route. + rtr_priority, + &config->cfg_config_u.cfg_route. + rtr_sensitivity); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + + case IOC_LIBCFS_GET_LOCAL_NI: { + struct lnet_ioctl_config_ni *cfg_ni; + struct lnet_ioctl_config_lnd_tunables *tun = NULL; + struct lnet_ioctl_element_stats *stats; + __u32 tun_size; + + cfg_ni = arg; + + /* get the tunables if they are available */ + if (cfg_ni->lic_cfg_hdr.ioc_len < + sizeof(*cfg_ni) + sizeof(*stats) + sizeof(*tun)) + return -EINVAL; + + stats = (struct lnet_ioctl_element_stats *) + cfg_ni->lic_bulk; + tun = (struct lnet_ioctl_config_lnd_tunables *) + (cfg_ni->lic_bulk + sizeof(*stats)); + + tun_size = cfg_ni->lic_cfg_hdr.ioc_len - sizeof(*cfg_ni) - + sizeof(*stats); + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_ni_config(cfg_ni, tun, stats, tun_size); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_GET_LOCAL_NI_MSG_STATS: { + struct lnet_ioctl_element_msg_stats *msg_stats = arg; + + if (msg_stats->im_hdr.ioc_len != sizeof(*msg_stats)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_ni_stats(msg_stats); + mutex_unlock(&the_lnet.ln_api_mutex); + + return rc; + } + + case IOC_LIBCFS_GET_NET: { + size_t total = sizeof(*config) + + sizeof(struct lnet_ioctl_net_config); + config = arg; + + if (config->cfg_hdr.ioc_len < total) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_net_config(config); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_GET_LNET_STATS: + { + struct lnet_ioctl_lnet_stats *lnet_stats = arg; + + if (lnet_stats->st_hdr.ioc_len < sizeof(*lnet_stats)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_counters_get(&lnet_stats->st_cntrs); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_RESET_LNET_STATS: + { + mutex_lock(&the_lnet.ln_api_mutex); + lnet_counters_reset(); + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + + case IOC_LIBCFS_CONFIG_RTR: + config = arg; + + if (config->cfg_hdr.ioc_len < sizeof(*config)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + if (config->cfg_config_u.cfg_buffers.buf_enable) { + rc = lnet_rtrpools_enable(); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + lnet_rtrpools_disable(); + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + + case IOC_LIBCFS_ADD_BUF: + config = arg; + + if (config->cfg_hdr.ioc_len < sizeof(*config)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_rtrpools_adjust(config->cfg_config_u.cfg_buffers. + buf_tiny, + config->cfg_config_u.cfg_buffers. + buf_small, + config->cfg_config_u.cfg_buffers. 
+ buf_large); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + + case IOC_LIBCFS_SET_NUMA_RANGE: { + struct lnet_ioctl_set_value *numa; + numa = arg; + if (numa->sv_hdr.ioc_len != sizeof(*numa)) + return -EINVAL; + lnet_net_lock(LNET_LOCK_EX); + lnet_numa_range = numa->sv_value; + lnet_net_unlock(LNET_LOCK_EX); + return 0; + } + + case IOC_LIBCFS_GET_NUMA_RANGE: { + struct lnet_ioctl_set_value *numa; + numa = arg; + if (numa->sv_hdr.ioc_len != sizeof(*numa)) + return -EINVAL; + numa->sv_value = lnet_numa_range; + return 0; + } + + case IOC_LIBCFS_GET_BUF: { + struct lnet_ioctl_pool_cfg *pool_cfg; + size_t total = sizeof(*config) + sizeof(*pool_cfg); + + config = arg; + + if (config->cfg_hdr.ioc_len < total) + return -EINVAL; + + pool_cfg = (struct lnet_ioctl_pool_cfg *)config->cfg_bulk; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_rtr_pool_cfg(config->cfg_count, pool_cfg); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_GET_LOCAL_HSTATS: { + struct lnet_ioctl_local_ni_hstats *stats = arg; + + if (stats->hlni_hdr.ioc_len < sizeof(*stats)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_local_ni_hstats(stats); + mutex_unlock(&the_lnet.ln_api_mutex); + + return rc; + } + + case IOC_LIBCFS_GET_RECOVERY_QUEUE: { + struct lnet_ioctl_recovery_list *list = arg; + if (list->rlst_hdr.ioc_len < sizeof(*list)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + if (list->rlst_type == LNET_HEALTH_TYPE_LOCAL_NI) + rc = lnet_get_local_ni_recovery_list(list); + else + rc = lnet_get_peer_ni_recovery_list(list); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_ADD_PEER_NI: { + struct lnet_ioctl_peer_cfg *cfg = arg; + + if (cfg->prcfg_hdr.ioc_len < sizeof(*cfg)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_add_peer_ni(cfg->prcfg_prim_nid, + cfg->prcfg_cfg_nid, + cfg->prcfg_mr, false); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_DEL_PEER_NI: { + struct lnet_ioctl_peer_cfg *cfg = arg; + + if (cfg->prcfg_hdr.ioc_len < sizeof(*cfg)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_del_peer_ni(cfg->prcfg_prim_nid, + cfg->prcfg_cfg_nid); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_GET_PEER_INFO: { + struct lnet_ioctl_peer *peer_info = arg; + + if (peer_info->pr_hdr.ioc_len < sizeof(*peer_info)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_peer_ni_info( + peer_info->pr_count, + &peer_info->pr_nid, + peer_info->pr_lnd_u.pr_peer_credits.cr_aliveness, + &peer_info->pr_lnd_u.pr_peer_credits.cr_ncpt, + &peer_info->pr_lnd_u.pr_peer_credits.cr_refcount, + &peer_info->pr_lnd_u.pr_peer_credits.cr_ni_peer_tx_credits, + &peer_info->pr_lnd_u.pr_peer_credits.cr_peer_tx_credits, + &peer_info->pr_lnd_u.pr_peer_credits.cr_peer_rtr_credits, + &peer_info->pr_lnd_u.pr_peer_credits.cr_peer_min_tx_credits, + &peer_info->pr_lnd_u.pr_peer_credits.cr_peer_tx_qnob); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_GET_PEER_NI: { + struct lnet_ioctl_peer_cfg *cfg = arg; + + if (cfg->prcfg_hdr.ioc_len < sizeof(*cfg)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_peer_info(cfg, + (void __user *)cfg->prcfg_bulk); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_GET_PEER_LIST: { + struct lnet_ioctl_peer_cfg *cfg = arg; + + if (cfg->prcfg_hdr.ioc_len < sizeof(*cfg)) + return -EINVAL; + + 
mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_peer_list(&cfg->prcfg_count, &cfg->prcfg_size, + (struct lnet_process_id __user *)cfg->prcfg_bulk); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_SET_HEALHV: { + struct lnet_ioctl_reset_health_cfg *cfg = arg; + int value; + if (cfg->rh_hdr.ioc_len < sizeof(*cfg)) + return -EINVAL; + if (cfg->rh_value < 0 || + cfg->rh_value > LNET_MAX_HEALTH_VALUE) + value = LNET_MAX_HEALTH_VALUE; + else + value = cfg->rh_value; + CDEBUG(D_NET, "Manually setting healthv to %d for %s:%s. all = %d\n", + value, (cfg->rh_type == LNET_HEALTH_TYPE_LOCAL_NI) ? + "local" : "peer", libcfs_nid2str(cfg->rh_nid), cfg->rh_all); + mutex_lock(&the_lnet.ln_api_mutex); + if (cfg->rh_type == LNET_HEALTH_TYPE_LOCAL_NI) + lnet_ni_set_healthv(cfg->rh_nid, value, + cfg->rh_all); + else + lnet_peer_ni_set_healthv(cfg->rh_nid, value, + cfg->rh_all); + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + + case IOC_LIBCFS_SET_CONNS_PER_PEER: { + struct lnet_ioctl_reset_conns_per_peer_cfg *cfg = arg; + int value; + + if (cfg->rcpp_hdr.ioc_len < sizeof(*cfg)) + return -EINVAL; + if (cfg->rcpp_value < 0) + value = 1; + else + value = cfg->rcpp_value; + CDEBUG(D_NET, + "Setting conns_per_peer to %d for %s. all = %d\n", + value, libcfs_nid2str(cfg->rcpp_nid), cfg->rcpp_all); + mutex_lock(&the_lnet.ln_api_mutex); + lnet_ni_set_conns_per_peer(cfg->rcpp_nid, value, cfg->rcpp_all); + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + + case IOC_LIBCFS_NOTIFY_ROUTER: { + time64_t deadline = ktime_get_real_seconds() - data->ioc_u64[0]; + + /* The deadline passed in by the user should be some time in + * seconds in the future since the UNIX epoch. We have to map + * that deadline to the wall clock. + */ + deadline += ktime_get_seconds(); + return lnet_notify(NULL, data->ioc_nid, data->ioc_flags, false, + deadline); + } + + case IOC_LIBCFS_LNET_DIST: + rc = LNetDist(data->ioc_nid, &data->ioc_nid, &data->ioc_u32[1]); + if (rc < 0 && rc != -EHOSTUNREACH) + return rc; + + data->ioc_u32[0] = rc; + return 0; + + case IOC_LIBCFS_TESTPROTOCOMPAT: + the_lnet.ln_testprotocompat = data->ioc_flags; + return 0; + + case IOC_LIBCFS_LNET_FAULT: + return lnet_fault_ctl(data->ioc_flags, data); + + case IOC_LIBCFS_PING: { + signed long timeout; + + id4.nid = data->ioc_nid; + id4.pid = data->ioc_u32[0]; + + /* If timeout is negative then set default of 3 minutes */ + if (((s32)data->ioc_u32[1] <= 0) || + data->ioc_u32[1] > (DEFAULT_PEER_TIMEOUT * MSEC_PER_SEC)) + timeout = cfs_time_seconds(DEFAULT_PEER_TIMEOUT); + else + timeout = nsecs_to_jiffies(data->ioc_u32[1] * NSEC_PER_MSEC); + + rc = lnet_ping(id4, &LNET_ANY_NID, timeout, data->ioc_pbuf1, + data->ioc_plen1 / sizeof(struct lnet_process_id)); + + if (rc < 0) + return rc; + + data->ioc_count = rc; + return 0; + } + + case IOC_LIBCFS_PING_PEER: { + struct lnet_ioctl_ping_data *ping = arg; + struct lnet_nid src_nid = LNET_ANY_NID; + struct lnet_peer *lp; + signed long timeout; + + /* Check if the supplied ping data supports source nid + * NB: This check is sufficient if lnet_ioctl_ping_data has + * additional fields added, but if they are re-ordered or + * fields removed then this will break. It is expected that + * these ioctls will be replaced with netlink implementation, so + * it is probably not worth coming up with a more robust version + * compatibility scheme. 
+ */ + if (ping->ping_hdr.ioc_len >= sizeof(struct lnet_ioctl_ping_data)) + lnet_nid4_to_nid(ping->ping_src, &src_nid); + + /* If timeout is negative then set default of 3 minutes */ + if (((s32)ping->op_param) <= 0 || + ping->op_param > (DEFAULT_PEER_TIMEOUT * MSEC_PER_SEC)) + timeout = cfs_time_seconds(DEFAULT_PEER_TIMEOUT); + else + timeout = nsecs_to_jiffies(ping->op_param * NSEC_PER_MSEC); + + rc = lnet_ping(ping->ping_id, &src_nid, timeout, + ping->ping_buf, + ping->ping_count); + if (rc < 0) + return rc; + + mutex_lock(&the_lnet.ln_api_mutex); + lp = lnet_find_peer4(ping->ping_id.nid); + if (lp) { + ping->ping_id.nid = + lnet_nid_to_nid4(&lp->lp_primary_nid); + ping->mr_info = lnet_peer_is_multi_rail(lp); + lnet_peer_decref_locked(lp); + } + mutex_unlock(&the_lnet.ln_api_mutex); + + ping->ping_count = rc; + return 0; + } + + case IOC_LIBCFS_DISCOVER: { + struct lnet_ioctl_ping_data *discover = arg; + struct lnet_peer *lp; + + rc = lnet_discover(discover->ping_id, discover->op_param, + discover->ping_buf, + discover->ping_count); + if (rc < 0) + return rc; + + mutex_lock(&the_lnet.ln_api_mutex); + lp = lnet_find_peer4(discover->ping_id.nid); + if (lp) { + discover->ping_id.nid = + lnet_nid_to_nid4(&lp->lp_primary_nid); + discover->mr_info = lnet_peer_is_multi_rail(lp); + lnet_peer_decref_locked(lp); + } + mutex_unlock(&the_lnet.ln_api_mutex); + + discover->ping_count = rc; + return 0; + } + + case IOC_LIBCFS_ADD_UDSP: { + struct lnet_ioctl_udsp *ioc_udsp = arg; + __u32 bulk_size = ioc_udsp->iou_hdr.ioc_len; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_udsp_demarshal_add(arg, bulk_size); + if (!rc) { + rc = lnet_udsp_apply_policies(NULL, false); + CDEBUG(D_NET, "policy application returned %d\n", rc); + rc = 0; + } + mutex_unlock(&the_lnet.ln_api_mutex); + + return rc; + } + + case IOC_LIBCFS_DEL_UDSP: { + struct lnet_ioctl_udsp *ioc_udsp = arg; + int idx = ioc_udsp->iou_idx; + + if (ioc_udsp->iou_hdr.ioc_len < sizeof(*ioc_udsp)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_udsp_del_policy(idx); + if (!rc) { + rc = lnet_udsp_apply_policies(NULL, false); + CDEBUG(D_NET, "policy re-application returned %d\n", + rc); + rc = 0; + } + mutex_unlock(&the_lnet.ln_api_mutex); + + return rc; + } + + case IOC_LIBCFS_GET_UDSP_SIZE: { + struct lnet_ioctl_udsp *ioc_udsp = arg; + struct lnet_udsp *udsp; + + if (ioc_udsp->iou_hdr.ioc_len < sizeof(*ioc_udsp)) + return -EINVAL; + + rc = 0; + + mutex_lock(&the_lnet.ln_api_mutex); + udsp = lnet_udsp_get_policy(ioc_udsp->iou_idx); + if (!udsp) { + rc = -ENOENT; + } else { + /* coming in iou_idx will hold the idx of the udsp + * to get the size of. going out the iou_idx will + * hold the size of the UDSP found at the passed + * in index. 
+ */ + ioc_udsp->iou_idx = lnet_get_udsp_size(udsp); + if (ioc_udsp->iou_idx < 0) + rc = -EINVAL; + } + mutex_unlock(&the_lnet.ln_api_mutex); + + return rc; + } + + case IOC_LIBCFS_GET_UDSP: { + struct lnet_ioctl_udsp *ioc_udsp = arg; + struct lnet_udsp *udsp; + + if (ioc_udsp->iou_hdr.ioc_len < sizeof(*ioc_udsp)) + return -EINVAL; + + rc = 0; + + mutex_lock(&the_lnet.ln_api_mutex); + udsp = lnet_udsp_get_policy(ioc_udsp->iou_idx); + if (!udsp) + rc = -ENOENT; + else + rc = lnet_udsp_marshal(udsp, ioc_udsp); + mutex_unlock(&the_lnet.ln_api_mutex); + + return rc; + } + + case IOC_LIBCFS_GET_CONST_UDSP_INFO: { + struct lnet_ioctl_construct_udsp_info *info = arg; + + if (info->cud_hdr.ioc_len < sizeof(*info)) + return -EINVAL; + + CDEBUG(D_NET, "GET_UDSP_INFO for %s\n", + libcfs_nid2str(info->cud_nid)); + + mutex_lock(&the_lnet.ln_api_mutex); + lnet_udsp_get_construct_info(info); + mutex_unlock(&the_lnet.ln_api_mutex); + + return 0; + } + + default: + ni = lnet_net2ni_addref(data->ioc_net); + if (ni == NULL) + return -EINVAL; + + if (ni->ni_net->net_lnd->lnd_ctl == NULL) + rc = -EINVAL; + else + rc = ni->ni_net->net_lnd->lnd_ctl(ni, cmd, arg); + + lnet_ni_decref(ni); + return rc; + } + /* not reached */ +} +EXPORT_SYMBOL(LNetCtl); + +void LNetDebugPeer(struct lnet_processid *id) +{ + lnet_debug_peer(lnet_nid_to_nid4(&id->nid)); +} +EXPORT_SYMBOL(LNetDebugPeer); + +/** + * Determine if the specified peer \a nid is on the local node. + * + * \param nid peer nid to check + * + * \retval true If peer NID is on the local node. + * \retval false If peer NID is not on the local node. + */ +bool LNetIsPeerLocal(lnet_nid_t nid) +{ + struct lnet_net *net; + struct lnet_ni *ni; + int cpt; + + cpt = lnet_net_lock_current(); + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + if (lnet_nid_to_nid4(&ni->ni_nid) == nid) { + lnet_net_unlock(cpt); + return true; + } + } + } + lnet_net_unlock(cpt); + + return false; +} +EXPORT_SYMBOL(LNetIsPeerLocal); + +/** + * Retrieve the struct lnet_process_id ID of LNet interface at \a index. + * Note that all interfaces share a same PID, as requested by LNetNIInit(). + * + * \param index Index of the interface to look up. + * \param id On successful return, this location will hold the + * struct lnet_process_id ID of the interface. + * + * \retval 0 If an interface exists at \a index. + * \retval -ENOENT If no interface has been found. + */ +int +LNetGetId(unsigned int index, struct lnet_processid *id) +{ + struct lnet_ni *ni; + struct lnet_net *net; + int cpt; + int rc = -ENOENT; + + LASSERT(the_lnet.ln_refcount > 0); + + cpt = lnet_net_lock_current(); + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + if (!nid_is_nid4(&ni->ni_nid)) + /* FIXME this needs to be handled */ + continue; + if (index-- != 0) + continue; + + id->nid = ni->ni_nid; + id->pid = the_lnet.ln_pid; + rc = 0; + break; + } + } + + lnet_net_unlock(cpt); + return rc; +} +EXPORT_SYMBOL(LNetGetId); + +struct ping_data { + int rc; + int replied; + struct lnet_handle_md mdh; + struct completion completion; +}; + +static void +lnet_ping_event_handler(struct lnet_event *event) +{ + struct ping_data *pd = event->md_user_ptr; + + CDEBUG(D_NET, "ping event (%d %d)%s\n", + event->type, event->status, + event->unlinked ? 
" unlinked" : ""); + + if (event->status) { + if (!pd->rc) + pd->rc = event->status; + } else if (event->type == LNET_EVENT_REPLY) { + pd->replied = 1; + pd->rc = event->mlength; + } + if (event->unlinked) + complete(&pd->completion); +} + +static int lnet_ping(struct lnet_process_id id, struct lnet_nid *src_nid, + signed long timeout, struct lnet_process_id __user *ids, + int n_ids) +{ + struct lnet_md md = { NULL }; + struct ping_data pd = { 0 }; + struct lnet_ping_buffer *pbuf; + struct lnet_process_id tmpid; + int i; + int nob; + int rc; + int rc2; + + /* n_ids limit is arbitrary */ + if (n_ids <= 0 || id.nid == LNET_NID_ANY) + return -EINVAL; + + /* + * if the user buffer has more space than the lnet_interfaces_max + * then only fill it up to lnet_interfaces_max + */ + if (n_ids > lnet_interfaces_max) + n_ids = lnet_interfaces_max; + + if (id.pid == LNET_PID_ANY) + id.pid = LNET_PID_LUSTRE; + + pbuf = lnet_ping_buffer_alloc(n_ids, GFP_NOFS); + if (!pbuf) + return -ENOMEM; + + /* initialize md content */ + md.start = &pbuf->pb_info; + md.length = LNET_PING_INFO_SIZE(n_ids); + md.threshold = 2; /* GET/REPLY */ + md.max_size = 0; + md.options = LNET_MD_TRUNCATE; + md.user_ptr = &pd; + md.handler = lnet_ping_event_handler; + + init_completion(&pd.completion); + + rc = LNetMDBind(&md, LNET_UNLINK, &pd.mdh); + if (rc != 0) { + CERROR("Can't bind MD: %d\n", rc); + goto fail_ping_buffer_decref; + } + + rc = LNetGet(lnet_nid_to_nid4(src_nid), pd.mdh, id, + LNET_RESERVED_PORTAL, + LNET_PROTO_PING_MATCHBITS, 0, false); + + if (rc != 0) { + /* Don't CERROR; this could be deliberate! */ + rc2 = LNetMDUnlink(pd.mdh); + LASSERT(rc2 == 0); + + /* NB must wait for the UNLINK event below... */ + } + + if (wait_for_completion_timeout(&pd.completion, timeout) == 0) { + /* Ensure completion in finite time... */ + LNetMDUnlink(pd.mdh); + wait_for_completion(&pd.completion); + } + if (!pd.replied) { + rc = -EIO; + goto fail_ping_buffer_decref; + } + + nob = pd.rc; + LASSERT(nob >= 0 && nob <= LNET_PING_INFO_SIZE(n_ids)); + + rc = -EPROTO; /* if I can't parse... */ + + if (nob < 8) { + CERROR("%s: ping info too short %d\n", + libcfs_id2str(id), nob); + goto fail_ping_buffer_decref; + } + + if (pbuf->pb_info.pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) { + lnet_swap_pinginfo(pbuf); + } else if (pbuf->pb_info.pi_magic != LNET_PROTO_PING_MAGIC) { + CERROR("%s: Unexpected magic %08x\n", + libcfs_id2str(id), pbuf->pb_info.pi_magic); + goto fail_ping_buffer_decref; + } + + if ((pbuf->pb_info.pi_features & LNET_PING_FEAT_NI_STATUS) == 0) { + CERROR("%s: ping w/o NI status: 0x%x\n", + libcfs_id2str(id), pbuf->pb_info.pi_features); + goto fail_ping_buffer_decref; + } + + if (nob < LNET_PING_INFO_SIZE(0)) { + CERROR("%s: Short reply %d(%d min)\n", + libcfs_id2str(id), + nob, (int)LNET_PING_INFO_SIZE(0)); + goto fail_ping_buffer_decref; + } + + if (pbuf->pb_info.pi_nnis < n_ids) + n_ids = pbuf->pb_info.pi_nnis; + + if (nob < LNET_PING_INFO_SIZE(n_ids)) { + CERROR("%s: Short reply %d(%d expected)\n", + libcfs_id2str(id), + nob, (int)LNET_PING_INFO_SIZE(n_ids)); + goto fail_ping_buffer_decref; + } + + rc = -EFAULT; /* if I segv in copy_to_user()... 
*/ + + memset(&tmpid, 0, sizeof(tmpid)); + for (i = 0; i < n_ids; i++) { + tmpid.pid = pbuf->pb_info.pi_pid; + tmpid.nid = pbuf->pb_info.pi_ni[i].ns_nid; + if (copy_to_user(&ids[i], &tmpid, sizeof(tmpid))) + goto fail_ping_buffer_decref; + } + rc = pbuf->pb_info.pi_nnis; + + fail_ping_buffer_decref: + lnet_ping_buffer_decref(pbuf); + return rc; +} + +static int +lnet_discover(struct lnet_process_id id, __u32 force, + struct lnet_process_id __user *ids, int n_ids) +{ + struct lnet_peer_ni *lpni; + struct lnet_peer_ni *p; + struct lnet_peer *lp; + struct lnet_process_id *buf; + int cpt; + int i; + int rc; + + if (n_ids <= 0 || + id.nid == LNET_NID_ANY) + return -EINVAL; + + if (id.pid == LNET_PID_ANY) + id.pid = LNET_PID_LUSTRE; + + /* + * If the user buffer has more space than the lnet_interfaces_max, + * then only fill it up to lnet_interfaces_max. + */ + if (n_ids > lnet_interfaces_max) + n_ids = lnet_interfaces_max; + + CFS_ALLOC_PTR_ARRAY(buf, n_ids); + if (!buf) + return -ENOMEM; + + cpt = lnet_net_lock_current(); + lpni = lnet_nid2peerni_locked(id.nid, LNET_NID_ANY, cpt); + if (IS_ERR(lpni)) { + rc = PTR_ERR(lpni); + goto out; + } + + /* + * Clearing the NIDS_UPTODATE flag ensures the peer will + * be discovered, provided discovery has not been disabled. + */ + lp = lpni->lpni_peer_net->lpn_peer; + spin_lock(&lp->lp_lock); + lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; + /* If the force flag is set, force a PING and PUSH as well. */ + if (force) + lp->lp_state |= LNET_PEER_FORCE_PING | LNET_PEER_FORCE_PUSH; + spin_unlock(&lp->lp_lock); + rc = lnet_discover_peer_locked(lpni, cpt, true); + if (rc) + goto out_decref; + + /* The lpni (or lp) for this NID may have changed and our ref is + * the only thing keeping the old one around. Release the ref + * and lookup the lpni again + */ + lnet_peer_ni_decref_locked(lpni); + lpni = lnet_find_peer_ni_locked(id.nid); + if (!lpni) { + rc = -ENOENT; + goto out; + } + lp = lpni->lpni_peer_net->lpn_peer; + + i = 0; + p = NULL; + while ((p = lnet_get_next_peer_ni_locked(lp, NULL, p)) != NULL) { + buf[i].pid = id.pid; + buf[i].nid = lnet_nid_to_nid4(&p->lpni_nid); + if (++i >= n_ids) + break; + } + rc = i; + +out_decref: + lnet_peer_ni_decref_locked(lpni); +out: + lnet_net_unlock(cpt); + + if (rc >= 0) + if (copy_to_user(ids, buf, rc * sizeof(*buf))) + rc = -EFAULT; + CFS_FREE_PTR_ARRAY(buf, n_ids); + + return rc; +} + +/** + * Retrieve peer discovery status. + * + * \retval 1 if lnet_peer_discovery_disabled is 0 + * \retval 0 if lnet_peer_discovery_disabled is 1 + */ +int +LNetGetPeerDiscoveryStatus(void) +{ + return !lnet_peer_discovery_disabled; +} +EXPORT_SYMBOL(LNetGetPeerDiscoveryStatus); diff --git a/drivers/staging/lustrefsx/lnet/lnet/config.c b/drivers/staging/lustrefsx/lnet/lnet/config.c new file mode 100644 index 0000000000000..09fe96d6c1011 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/config.c @@ -0,0 +1,1636 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include +#include +#include + +/* tmp struct for parsing routes */ +struct lnet_text_buf { + struct list_head ltb_list; /* stash on lists */ + int ltb_size; /* allocated size */ + char ltb_text[0]; /* text buffer */ +}; + +static int lnet_tbnob = 0; /* track text buf allocation */ +#define LNET_MAX_TEXTBUF_NOB (64<<10) /* bound allocation */ +#define LNET_SINGLE_TEXTBUF_NOB (4<<10) + +#define SPACESTR " \t\v\r\n" +#define DELIMITERS ":()[]" + +#ifndef HAVE_STRSCPY +#define strscpy(s1, s2, sz) strlcpy((s1), (s2), (sz)) +#endif + +static void +lnet_syntax(const char *name, const char *str, int offset, int width) +{ + static char dots[LNET_SINGLE_TEXTBUF_NOB]; + static char dashes[LNET_SINGLE_TEXTBUF_NOB]; + + memset(dots, '.', sizeof(dots)); + dots[sizeof(dots)-1] = 0; + memset(dashes, '-', sizeof(dashes)); + dashes[sizeof(dashes)-1] = 0; + + LCONSOLE_ERROR_MSG(0x10f, "Error parsing '%s=\"%s\"'\n", name, str); + LCONSOLE_ERROR_MSG(0x110, "here...........%.*s..%.*s|%.*s|\n", + (int)strlen(name), dots, offset, dots, + (width < 1) ? 0 : width - 1, dashes); +} + +static int +lnet_issep (char c) +{ + switch (c) { + case '\n': + case '\r': + case ';': + return 1; + default: + return 0; + } +} + +bool +lnet_net_unique(__u32 net_id, struct list_head *netlist, + struct lnet_net **net) +{ + struct lnet_net *net_l; + + if (!netlist) + return true; + + list_for_each_entry(net_l, netlist, net_list) { + if (net_l->net_id == net_id) { + if (net != NULL) + *net = net_l; + return false; + } + } + + return true; +} + +/* check that the NI is unique within the list of NIs already added to + * a network */ +bool +lnet_ni_unique_net(struct list_head *nilist, char *iface) +{ + struct list_head *tmp; + struct lnet_ni *ni; + + list_for_each(tmp, nilist) { + ni = list_entry(tmp, struct lnet_ni, ni_netlist); + + if (ni->ni_interface != NULL && + strncmp(ni->ni_interface, iface, strlen(iface)) == 0) + return false; + } + + return true; +} +static bool +in_array(__u32 *array, __u32 size, __u32 value) +{ + int i; + + for (i = 0; i < size; i++) { + if (array[i] == value) + return false; + } + + return true; +} + +static int +lnet_net_append_cpts(__u32 *cpts, __u32 ncpts, struct lnet_net *net) +{ + __u32 *added_cpts = NULL; + int i, j = 0, rc = 0; + + /* + * no need to go futher since a subset of the NIs already exist on + * all CPTs + */ + if (net->net_ncpts == LNET_CPT_NUMBER) + return 0; + + if (cpts == NULL) { + /* there is an NI which will exist on all CPTs */ + if (net->net_cpts != NULL) + CFS_FREE_PTR_ARRAY(net->net_cpts, net->net_ncpts); + net->net_cpts = NULL; + net->net_ncpts = LNET_CPT_NUMBER; + return 0; + } + + if (net->net_cpts == NULL) { + CFS_ALLOC_PTR_ARRAY(net->net_cpts, ncpts); + if (net->net_cpts == NULL) + return -ENOMEM; + memcpy(net->net_cpts, cpts, ncpts * sizeof(*net->net_cpts)); + net->net_ncpts = ncpts; + return 0; + } + + CFS_ALLOC_PTR_ARRAY(added_cpts, LNET_CPT_NUMBER); + if (added_cpts == NULL) + return -ENOMEM; + + for (i = 0; i < ncpts; i++) { + if (!in_array(net->net_cpts, net->net_ncpts, cpts[i])) 
{
+			added_cpts[j] = cpts[i];
+			j++;
+		}
+	}
+
+	/* append the new cpts if any to the list of cpts in the net */
+	if (j > 0) {
+		__u32 *array = NULL, *loc;
+		__u32 total_entries = j + net->net_ncpts;
+
+		CFS_ALLOC_PTR_ARRAY(array, total_entries);
+		if (array == NULL) {
+			rc = -ENOMEM;
+			goto failed;
+		}
+
+		memcpy(array, net->net_cpts,
+		       net->net_ncpts * sizeof(*net->net_cpts));
+		loc = array + net->net_ncpts;
+		memcpy(loc, added_cpts, j * sizeof(*net->net_cpts));
+
+		CFS_FREE_PTR_ARRAY(net->net_cpts, net->net_ncpts);
+		net->net_ncpts = total_entries;
+		net->net_cpts = array;
+	}
+
+failed:
+	CFS_FREE_PTR_ARRAY(added_cpts, LNET_CPT_NUMBER);
+
+	return rc;
+}
+
+static void
+lnet_net_remove_cpts(__u32 *cpts, __u32 ncpts, struct lnet_net *net)
+{
+	struct lnet_ni *ni;
+	int rc;
+
+	/*
+	 * Operation Assumption:
+	 * This function is called after an NI has been removed from
+	 * its parent net.
+	 *
+	 * if we're removing an NI which exists on all CPTs then
+	 * we have to check if any of the other NIs on this net also
+	 * exist on all CPTs. If none, then we need to build our Net CPT
+	 * list based on the remaining NIs.
+	 *
+	 * If the NI being removed exists on a subset of the CPTs then we
+	 * also rebuild the Net CPT list based on the remaining NIs, which
+	 * should result in the expected Net CPT list.
+	 */
+
+	/*
+	 * sometimes this function can be called due to some failure
+	 * creating an NI, before any of the cpts are allocated, so check
+	 * for that case and don't do anything
+	 */
+	if (ncpts == 0)
+		return;
+
+	if (ncpts == LNET_CPT_NUMBER) {
+		/*
+		 * first iteration through the NI list in the net to see
+		 * if any of the NIs exist on all the CPTs. If one is
+		 * found then our job is done.
+		 */
+		list_for_each_entry(ni, &net->net_ni_list, ni_netlist) {
+			if (ni->ni_ncpts == LNET_CPT_NUMBER)
+				return;
+		}
+	}
+
+	/*
+	 * Rebuild the Net CPT list, thereby including only the
+	 * CPTs which the remaining NIs are associated with.
+	 */
+	if (net->net_cpts != NULL) {
+		CFS_FREE_PTR_ARRAY(net->net_cpts, net->net_ncpts);
+		net->net_cpts = NULL;
+	}
+
+	list_for_each_entry(ni, &net->net_ni_list, ni_netlist) {
+		rc = lnet_net_append_cpts(ni->ni_cpts, ni->ni_ncpts,
+					  net);
+		if (rc != 0) {
+			CERROR("Out of Memory\n");
+			/*
+			 * do our best to keep on going. Delete
+			 * the net cpts and set it to NULL. This
+			 * way we can keep on going but less
+			 * efficiently, since memory accesses might be
+			 * across CPT lines.
+			 */
+			if (net->net_cpts != NULL) {
+				CFS_FREE_PTR_ARRAY(net->net_cpts,
+						   net->net_ncpts);
+				net->net_cpts = NULL;
+				net->net_ncpts = LNET_CPT_NUMBER;
+			}
+			return;
+		}
+	}
+}
+
+void
+lnet_ni_free(struct lnet_ni *ni)
+{
+	lnet_net_remove_cpts(ni->ni_cpts, ni->ni_ncpts, ni->ni_net);
+
+	if (ni->ni_refs != NULL)
+		cfs_percpt_free(ni->ni_refs);
+
+	if (ni->ni_tx_queues != NULL)
+		cfs_percpt_free(ni->ni_tx_queues);
+
+	if (ni->ni_cpts != NULL)
+		cfs_expr_list_values_free(ni->ni_cpts, ni->ni_ncpts);
+
+	if (ni->ni_interface != NULL) {
+		LIBCFS_FREE(ni->ni_interface,
+			    strlen(ni->ni_interface) + 1);
+	}
+
+	/* release reference to net namespace */
+	if (ni->ni_net_ns != NULL)
+		put_net(ni->ni_net_ns);
+
+	LIBCFS_FREE(ni, sizeof(*ni));
+}
+
+void
+lnet_net_free(struct lnet_net *net)
+{
+	struct list_head *tmp, *tmp2;
+	struct lnet_ni *ni;
+
+	LASSERT(list_empty(&net->net_ni_zombie));
+
+	/*
+	 * delete any nis that haven't been added yet. This could happen
+	 * if there is a failure on net startup
+	 */
+	list_for_each_safe(tmp, tmp2, &net->net_ni_added) {
+		ni = list_entry(tmp, struct lnet_ni, ni_netlist);
+		list_del_init(&ni->ni_netlist);
+		lnet_ni_free(ni);
+	}
+
+	/* delete any nis which have been started. */
+	list_for_each_safe(tmp, tmp2, &net->net_ni_list) {
+		ni = list_entry(tmp, struct lnet_ni, ni_netlist);
+		list_del_init(&ni->ni_netlist);
+		lnet_ni_free(ni);
+	}
+
+	if (net->net_cpts != NULL)
+		CFS_FREE_PTR_ARRAY(net->net_cpts, net->net_ncpts);
+
+	LIBCFS_FREE(net, sizeof(*net));
+}
+
+struct lnet_net *
+lnet_net_alloc(__u32 net_id, struct list_head *net_list)
+{
+	struct lnet_net *net;
+
+	if (!lnet_net_unique(net_id, net_list, &net)) {
+		CDEBUG(D_NET, "Returning duplicate net %p %s\n", net,
+		       libcfs_net2str(net->net_id));
+		return net;
+	}
+
+	LIBCFS_ALLOC(net, sizeof(*net));
+	if (net == NULL) {
+		CERROR("Out of memory creating network %s\n",
+		       libcfs_net2str(net_id));
+		return NULL;
+	}
+
+	INIT_LIST_HEAD(&net->net_list);
+	INIT_LIST_HEAD(&net->net_ni_list);
+	INIT_LIST_HEAD(&net->net_ni_added);
+	INIT_LIST_HEAD(&net->net_ni_zombie);
+	INIT_LIST_HEAD(&net->net_rtr_pref_nids);
+	spin_lock_init(&net->net_lock);
+
+	net->net_id = net_id;
+	net->net_last_alive = ktime_get_seconds();
+
+	net->net_sel_priority = LNET_MAX_SELECTION_PRIORITY;
+
+	/* initialize global parameters to undefined */
+	net->net_tunables.lct_peer_timeout = -1;
+	net->net_tunables.lct_max_tx_credits = -1;
+	net->net_tunables.lct_peer_tx_credits = -1;
+	net->net_tunables.lct_peer_rtr_credits = -1;
+
+	if (net_list)
+		list_add_tail(&net->net_list, net_list);
+
+	return net;
+}
+
+static int
+lnet_ni_add_interface(struct lnet_ni *ni, char *iface)
+{
+	size_t iface_len = strlen(iface) + 1;
+
+	if (ni == NULL)
+		return -ENOMEM;
+
+	if (ni->ni_interface != NULL) {
+		LCONSOLE_ERROR_MSG(0x115, "%s: interface %s already set for net %s: rc = %d\n",
+				   iface, ni->ni_interface,
+				   libcfs_net2str(LNET_NID_NET(&ni->ni_nid)),
+				   -EINVAL);
+		return -EINVAL;
+	}
+
+	/* Allocate memory for the interface, so the code parsing input into
+	 * tokens and adding interfaces can free the input safely.
+	 * ni->ni_interface is freed in lnet_ni_free().
+	 */
+	LIBCFS_ALLOC(ni->ni_interface, iface_len);
+
+	if (ni->ni_interface == NULL) {
+		CERROR("%s: cannot allocate net interface name: rc = %d\n",
+		       iface, -ENOMEM);
+		return -ENOMEM;
+	}
+
+	strscpy(ni->ni_interface, iface, iface_len);
+
+	return 0;
+}
+
+static struct lnet_ni *
+lnet_ni_alloc_common(struct lnet_net *net, char *iface)
+{
+	struct lnet_tx_queue *tq;
+	struct lnet_ni *ni;
+	int i;
+
+	if (iface != NULL)
+		/* make sure that this NI is unique in the net it's
+		 * being added to */
+		if (!lnet_ni_unique_net(&net->net_ni_added, iface))
+			return NULL;
+
+	LIBCFS_ALLOC(ni, sizeof(*ni));
+	if (ni == NULL) {
+		CERROR("Out of memory creating network interface %s%s\n",
+		       libcfs_net2str(net->net_id),
+		       (iface != NULL) ?
iface : ""); + return NULL; + } + + spin_lock_init(&ni->ni_lock); + INIT_LIST_HEAD(&ni->ni_netlist); + INIT_LIST_HEAD(&ni->ni_recovery); + LNetInvalidateMDHandle(&ni->ni_ping_mdh); + ni->ni_refs = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*ni->ni_refs[0])); + if (ni->ni_refs == NULL) + goto failed; + + ni->ni_tx_queues = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*ni->ni_tx_queues[0])); + if (ni->ni_tx_queues == NULL) + goto failed; + + cfs_percpt_for_each(tq, i, ni->ni_tx_queues) + INIT_LIST_HEAD(&tq->tq_delayed); + + ni->ni_net = net; + /* LND will fill in the address part of the NID */ + ni->ni_nid.nid_type = LNET_NETTYP(net->net_id); + ni->ni_nid.nid_num = cpu_to_be16(LNET_NETNUM(net->net_id)); + + /* Store net namespace in which current ni is being created */ + if (current->nsproxy && current->nsproxy->net_ns) + ni->ni_net_ns = get_net(current->nsproxy->net_ns); + else + ni->ni_net_ns = get_net(&init_net); + + ni->ni_state = LNET_NI_STATE_INIT; + ni->ni_sel_priority = LNET_MAX_SELECTION_PRIORITY; + list_add_tail(&ni->ni_netlist, &net->net_ni_added); + + /* + * if an interface name is provided then make sure to add in that + * interface name in NI + */ + if (iface) + if (lnet_ni_add_interface(ni, iface) != 0) + goto failed; + + return ni; +failed: + lnet_ni_free(ni); + return NULL; +} + +/* allocate and add to the provided network */ +struct lnet_ni * +lnet_ni_alloc(struct lnet_net *net, struct cfs_expr_list *el, char *iface) +{ + struct lnet_ni *ni; + int rc; + + ni = lnet_ni_alloc_common(net, iface); + if (!ni) + return NULL; + + if (!el) { + ni->ni_cpts = NULL; + ni->ni_ncpts = LNET_CPT_NUMBER; + } else { + rc = cfs_expr_list_values(el, LNET_CPT_NUMBER, &ni->ni_cpts); + if (rc <= 0) { + CERROR("Failed to set CPTs for NI %s(%s): %d\n", + libcfs_net2str(net->net_id), + (iface != NULL) ? iface : "", rc); + goto failed; + } + + LASSERT(rc <= LNET_CPT_NUMBER); + if (rc == LNET_CPT_NUMBER) { + CFS_FREE_PTR_ARRAY(ni->ni_cpts, rc); + ni->ni_cpts = NULL; + } + + ni->ni_ncpts = rc; + } + + rc = lnet_net_append_cpts(ni->ni_cpts, ni->ni_ncpts, net); + if (rc != 0) + goto failed; + + return ni; +failed: + lnet_ni_free(ni); + return NULL; +} + +struct lnet_ni * +lnet_ni_alloc_w_cpt_array(struct lnet_net *net, __u32 *cpts, __u32 ncpts, + char *iface) +{ + struct lnet_ni *ni; + int rc; + + ni = lnet_ni_alloc_common(net, iface); + if (!ni) + return NULL; + + if (ncpts == 0) { + ni->ni_cpts = NULL; + ni->ni_ncpts = LNET_CPT_NUMBER; + } else { + size_t array_size = ncpts * sizeof(ni->ni_cpts[0]); + + CFS_ALLOC_PTR_ARRAY(ni->ni_cpts, ncpts); + if (ni->ni_cpts == NULL) + goto failed; + memcpy(ni->ni_cpts, cpts, array_size); + ni->ni_ncpts = ncpts; + } + + rc = lnet_net_append_cpts(ni->ni_cpts, ni->ni_ncpts, net); + if (rc != 0) + goto failed; + + return ni; +failed: + lnet_ni_free(ni); + return NULL; +} + +/* + * Parse the networks string and create the matching set of NIs on the + * nilist. 
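+ *
+ * Illustrative form of a "networks" string (a sketch of the grammar
+ * accepted below, not a formal specification):
+ *
+ *	networks="tcp0(eth0)[0-1],o2ib1(ib0)"
+ *
+ * i.e. a comma-separated list of networks, each with an optional
+ * parenthesised interface list and an optional bracketed CPT list.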
+ */
+int
+lnet_parse_networks(struct list_head *netlist, const char *networks)
+{
+	struct cfs_expr_list *net_el = NULL;
+	struct cfs_expr_list *ni_el = NULL;
+	int tokensize;
+	char *tokens;
+	char *str;
+	struct lnet_net *net;
+	struct lnet_ni *ni = NULL;
+	__u32 net_id;
+	int nnets = 0;
+
+	if (networks == NULL) {
+		CERROR("networks string is undefined\n");
+		return -EINVAL;
+	}
+
+	if (strlen(networks) > LNET_SINGLE_TEXTBUF_NOB) {
+		/* _WAY_ conservative */
+		LCONSOLE_ERROR_MSG(0x112,
+				   "Can't parse networks: string too long\n");
+		return -EINVAL;
+	}
+
+	tokensize = strlen(networks) + 1;
+
+	LIBCFS_ALLOC(tokens, tokensize);
+	if (tokens == NULL) {
+		CERROR("Can't allocate net tokens\n");
+		return -ENOMEM;
+	}
+
+	memcpy(tokens, networks, tokensize);
+	str = tokens;
+
+	/*
+	 * Main parser loop.
+	 *
+	 * NB we don't check interface conflicts here; it's the LND's
+	 * responsibility (if it cares at all)
+	 */
+	do {
+		char *nistr;
+		char *elstr;
+		char *name;
+		int rc;
+
+		/*
+		 * Parse a network string into its components:
+		 *
+		 *	name{"("iface-list")"}{"["cpt-list"]"}
+		 */
+
+		/* Network name (mandatory) */
+		while (isspace(*str))
+			*str++ = '\0';
+		if (!*str)
+			break;
+		name = str;
+		str += strcspn(str, SPACESTR ":()[],");
+		while (isspace(*str))
+			*str++ = '\0';
+
+		/* Interface list (optional) */
+		if (*str == '(') {
+			*str++ = '\0';
+			nistr = str;
+			str += strcspn(str, ")");
+			if (*str != ')') {
+				str = nistr;
+				goto failed_syntax;
+			}
+			do {
+				*str++ = '\0';
+			} while (isspace(*str));
+		} else {
+			nistr = NULL;
+		}
+
+		/* CPT expression (optional) */
+		if (*str == '[') {
+			elstr = str;
+			str += strcspn(str, "]");
+			if (*str != ']') {
+				str = elstr;
+				goto failed_syntax;
+			}
+			rc = cfs_expr_list_parse(elstr, str - elstr + 1,
+						 0, LNET_CPT_NUMBER - 1,
+						 &net_el);
+			if (rc != 0) {
+				str = elstr;
+				goto failed_syntax;
+			}
+			*elstr = '\0';
+			do {
+				*str++ = '\0';
+			} while (isspace(*str));
+		}
+
+		/* Bad delimiters */
+		if (*str && (strchr(DELIMITERS, *str) != NULL))
+			goto failed_syntax;
+
+		/* go to the next net if it exists */
+		str += strcspn(str, ",");
+		if (*str == ',')
+			*str++ = '\0';
+
+		/*
+		 * At this point the name is properly terminated.
+		 */
+		net_id = libcfs_str2net(name);
+		if (net_id == LNET_NET_ANY) {
+			LCONSOLE_ERROR_MSG(0x113,
+					   "Unrecognised network type\n");
+			str = name;
+			goto failed_syntax;
+		}
+
+		if (LNET_NETTYP(net_id) == LOLND) {
+			/* Loopback is implicit, and there can be only one. */
+			if (net_el) {
+				cfs_expr_list_free(net_el);
+				net_el = NULL;
+			}
+			/* Should we error out instead? */
+			continue;
+		}
+
+		/*
+		 * All network parameters are now known.
+		 */
+		nnets++;
+
+		/* always allocate a net, since we will eventually add an
+		 * interface to it, or we will fail, in which case we'll
+		 * just delete it */
+		net = lnet_net_alloc(net_id, netlist);
+		if (IS_ERR_OR_NULL(net))
+			goto failed;
+
+		if (!nistr) {
+			/*
+			 * No interface list was specified, allocate a
+			 * ni using the defaults.
+			 */
+			ni = lnet_ni_alloc(net, net_el, NULL);
+			if (IS_ERR_OR_NULL(ni))
+				goto failed;
+
+			if (!nistr) {
+				if (net_el) {
+					cfs_expr_list_free(net_el);
+					net_el = NULL;
+				}
+				continue;
+			}
+		}
+
+		do {
+			elstr = NULL;
+
+			/* Interface name (mandatory) */
+			while (isspace(*nistr))
+				*nistr++ = '\0';
+			name = nistr;
+			nistr += strcspn(nistr, SPACESTR "[],");
+			while (isspace(*nistr))
+				*nistr++ = '\0';
+
+			/* CPT expression (optional) */
+			if (*nistr == '[') {
+				elstr = nistr;
+				nistr += strcspn(nistr, "]");
+				if (*nistr != ']') {
+					str = elstr;
+					goto failed_syntax;
+				}
+				rc = cfs_expr_list_parse(elstr,
+							 nistr - elstr + 1,
+							 0, LNET_CPT_NUMBER - 1,
+							 &ni_el);
+				if (rc != 0) {
+					str = elstr;
+					goto failed_syntax;
+				}
+				*elstr = '\0';
+				do {
+					*nistr++ = '\0';
+				} while (isspace(*nistr));
+			} else {
+				ni_el = net_el;
+			}
+
+			/*
+			 * End of single interface specification,
+			 * advance to the start of the next one, if
+			 * any.
+			 */
+			if (*nistr == ',') {
+				do {
+					*nistr++ = '\0';
+				} while (isspace(*nistr));
+				if (!*nistr) {
+					str = nistr;
+					goto failed_syntax;
+				}
+			} else if (*nistr) {
+				str = nistr;
+				goto failed_syntax;
+			}
+
+			/*
+			 * At this point the name is properly terminated.
+			 */
+			if (!*name) {
+				str = name;
+				goto failed_syntax;
+			}
+
+			ni = lnet_ni_alloc(net, ni_el, name);
+			if (IS_ERR_OR_NULL(ni))
+				goto failed;
+
+			if (ni_el) {
+				if (ni_el != net_el) {
+					cfs_expr_list_free(ni_el);
+					ni_el = NULL;
+				}
+			}
+		} while (*nistr);
+
+		if (net_el) {
+			cfs_expr_list_free(net_el);
+			net_el = NULL;
+		}
+	} while (*str);
+
+	LIBCFS_FREE(tokens, tokensize);
+	return nnets;
+
+failed_syntax:
+	lnet_syntax("networks", networks, (int)(str - tokens), strlen(str));
+failed:
+	/* free the net list and all the nis on each net */
+	while (!list_empty(netlist)) {
+		net = list_entry(netlist->next, struct lnet_net, net_list);
+
+		list_del_init(&net->net_list);
+		lnet_net_free(net);
+	}
+
+	if (ni_el && ni_el != net_el)
+		cfs_expr_list_free(ni_el);
+	if (net_el)
+		cfs_expr_list_free(net_el);
+
+	LIBCFS_FREE(tokens, tokensize);
+
+	return -EINVAL;
+}
+
+static struct lnet_text_buf *lnet_new_text_buf(int str_len)
+{
+	struct lnet_text_buf *ltb;
+	int nob;
+
+	/* NB allocate space for the terminating 0 */
+	nob = offsetof(struct lnet_text_buf, ltb_text[str_len + 1]);
+	if (nob > LNET_SINGLE_TEXTBUF_NOB) {
+		/* _way_ conservative for "route net gateway..." */
+		CERROR("text buffer too big\n");
+		return NULL;
+	}
+
+	if (lnet_tbnob + nob > LNET_MAX_TEXTBUF_NOB) {
+		CERROR("Too many text buffers\n");
+		return NULL;
+	}
+
+	LIBCFS_ALLOC(ltb, nob);
+	if (ltb == NULL)
+		return NULL;
+
+	ltb->ltb_size = nob;
+	ltb->ltb_text[0] = 0;
+	lnet_tbnob += nob;
+	return ltb;
+}
+
+static void
+lnet_free_text_buf(struct lnet_text_buf *ltb)
+{
+	lnet_tbnob -= ltb->ltb_size;
+	LIBCFS_FREE(ltb, ltb->ltb_size);
+}
+
+static void
+lnet_free_text_bufs(struct list_head *tbs)
+{
+	struct lnet_text_buf *ltb;
+
+	while (!list_empty(tbs)) {
+		ltb = list_entry(tbs->next, struct lnet_text_buf, ltb_list);
+
+		list_del(&ltb->ltb_list);
+		lnet_free_text_buf(ltb);
+	}
+}
+
+static int
+lnet_str2tbs_sep(struct list_head *tbs, const char *str)
+{
+	LIST_HEAD(pending);
+	const char *sep;
+	int nob;
+	int i;
+	struct lnet_text_buf *ltb;
+
+	/* Split 'str' into separate commands */
+	for (;;) {
+		/* skip leading whitespace */
+		while (isspace(*str))
+			str++;
+
+		/* scan for separator or comment */
+		for (sep = str; *sep != 0; sep++)
+			if (lnet_issep(*sep) || *sep == '#')
+				break;
+
+		nob = (int)(sep - str);
+		if (nob > 0) {
+			ltb = lnet_new_text_buf(nob);
+			if (ltb == NULL) {
+				lnet_free_text_bufs(&pending);
+				return -ENOMEM;
+			}
+
+			for (i = 0; i < nob; i++)
+				if (isspace(str[i]))
+					ltb->ltb_text[i] = ' ';
+				else
+					ltb->ltb_text[i] = str[i];
+
+			ltb->ltb_text[nob] = 0;
+
+			list_add_tail(&ltb->ltb_list, &pending);
+		}
+
+		if (*sep == '#') {
+			/* scan for separator */
+			do {
+				sep++;
+			} while (*sep != 0 && !lnet_issep(*sep));
+		}
+
+		if (*sep == 0)
+			break;
+
+		str = sep + 1;
+	}
+
+	list_splice(&pending, tbs->prev);
+	return 0;
+}
+
+static int
+lnet_expand1tb(struct list_head *list,
+	       char *str, char *sep1, char *sep2,
+	       char *item, int itemlen)
+{
+	int len1 = (int)(sep1 - str);
+	int len2 = strlen(sep2 + 1);
+	struct lnet_text_buf *ltb;
+
+	LASSERT(*sep1 == '[');
+	LASSERT(*sep2 == ']');
+
+	ltb = lnet_new_text_buf(len1 + itemlen + len2);
+	if (ltb == NULL)
+		return -ENOMEM;
+
+	memcpy(ltb->ltb_text, str, len1);
+	memcpy(&ltb->ltb_text[len1], item, itemlen);
+	memcpy(&ltb->ltb_text[len1+itemlen], sep2 + 1, len2);
+	ltb->ltb_text[len1 + itemlen + len2] = 0;
+
+	list_add_tail(&ltb->ltb_list, list);
+	return 0;
+}
+
+static int
+lnet_str2tbs_expand(struct list_head *tbs, char *str)
+{
+	char num[16];
+	LIST_HEAD(pending);
+	char *sep;
+	char *sep2;
+	char *parsed;
+	char *enditem;
+	int lo;
+	int hi;
+	int stride;
+	int i;
+	int nob;
+	int scanned;
+
+	sep = strchr(str, '[');
+	if (sep == NULL)		/* nothing to expand */
+		return 0;
+
+	sep2 = strchr(sep, ']');
+	if (sep2 == NULL)
+		goto failed;
+
+	for (parsed = sep; parsed < sep2; parsed = enditem) {
+
+		enditem = ++parsed;
+		while (enditem < sep2 && *enditem != ',')
+			enditem++;
+
+		if (enditem == parsed)	/* no empty items */
+			goto failed;
+
+		if (sscanf(parsed, "%d-%d/%d%n", &lo, &hi, &stride, &scanned) < 3) {
+
+			if (sscanf(parsed, "%d-%d%n", &lo, &hi, &scanned) < 2) {
+
+				/* simple string enumeration */
+				if (lnet_expand1tb(&pending, str, sep, sep2,
+						   parsed, (int)(enditem - parsed)) != 0)
+					goto failed;
+
+				continue;
+			}
+
+			stride = 1;
+		}
+
+		/* range expansion */
+
+		if (enditem != parsed + scanned) /* no trailing junk */
+			goto failed;
+
+		if (hi < 0 || lo < 0 || stride < 0 || hi < lo ||
+		    (hi - lo) % stride != 0)
+			goto failed;
+
+		for (i = lo; i <= hi; i += stride) {
+
+			snprintf(num, sizeof(num), "%d", i);
+			nob = strlen(num);
+			if (nob + 1 == sizeof(num))
+				goto failed;
+
+			if (lnet_expand1tb(&pending, str, sep, sep2,
+					   num, nob) != 0)
+				goto failed;
+		}
+	}
+
+	list_splice(&pending, tbs->prev);
+	return 1;
+
+failed:
+	lnet_free_text_bufs(&pending);
+	return -EINVAL;
+}
+
+static int
+lnet_parse_hops(char *str, unsigned int *hops)
+{
+	int len = strlen(str);
+	int nob = len;
+
+	return (sscanf(str, "%u%n", hops, &nob) >= 1 &&
+		nob == len &&
+		*hops > 0 && *hops < 256);
+}
+
+#define LNET_PRIORITY_SEPARATOR (':')
+
+static int
+lnet_parse_priority(char *str, unsigned int *priority, char **token)
+{
+	int nob;
+	char *sep;
+	int len;
+
+	sep = strchr(str, LNET_PRIORITY_SEPARATOR);
+	if (sep == NULL) {
+		*priority = 0;
+		return 0;
+	}
+	len = strlen(sep + 1);
+
+	if ((sscanf((sep+1), "%u%n", priority, &nob) < 1) || (len != nob)) {
+		/* Update the caller's token pointer so it treats the found
+		 * priority as the token to report in the error message. */
+		*token += sep - str + 1;
+		return -EINVAL;
+	}
+
+	CDEBUG(D_NET, "gateway %s, priority %d, nob %d\n", str, *priority, nob);
+
+	/*
+	 * Change priority separator to \0 to be able to parse NID
+	 */
+	*sep = '\0';
+	return 0;
+}
+
+static int
+lnet_parse_route(char *str, int *im_a_router)
+{
+	/* static scratch buffer OK (single threaded) */
+	static char cmd[LNET_SINGLE_TEXTBUF_NOB];
+
+	LIST_HEAD(nets);
+	LIST_HEAD(gateways);
+	struct list_head *tmp1;
+	struct list_head *tmp2;
+	__u32 net;
+	struct lnet_nid nid;
+	struct lnet_text_buf *ltb;
+	int rc;
+	char *sep;
+	char *token = str;
+	int ntokens = 0;
+	int myrc = -1;
+	__u32 hops;
+	int got_hops = 0;
+	unsigned int priority = 0;
+
+	/* save a copy of the string for error messages */
+	strncpy(cmd, str, sizeof(cmd));
+	cmd[sizeof(cmd) - 1] = '\0';
+
+	sep = str;
+	for (;;) {
+		/* scan for token start */
+		while (isspace(*sep))
+			sep++;
+		if (*sep == 0) {
+			if (ntokens < (got_hops ? 3 : 2))
+				goto token_error;
+			break;
+		}
+
+		ntokens++;
+		token = sep++;
+
+		/* scan for token end */
+		while (*sep != 0 && !isspace(*sep))
+			sep++;
+		if (*sep != 0)
+			*sep++ = 0;
+
+		if (ntokens == 1) {
+			tmp2 = &nets;		/* expanding nets */
+		} else if (ntokens == 2 &&
+			   lnet_parse_hops(token, &hops)) {
+			got_hops = 1;		/* got a hop count */
+			continue;
+		} else {
+			tmp2 = &gateways;	/* expanding gateways */
+		}
+
+		ltb = lnet_new_text_buf(strlen(token));
+		if (ltb == NULL)
+			goto out;
+
+		strcpy(ltb->ltb_text, token);
+		tmp1 = &ltb->ltb_list;
+		list_add_tail(tmp1, tmp2);
+
+		while (tmp1 != tmp2) {
+			ltb = list_entry(tmp1, struct lnet_text_buf, ltb_list);
+
+			rc = lnet_str2tbs_expand(tmp1->next, ltb->ltb_text);
+			if (rc < 0)
+				goto token_error;
+
+			tmp1 = tmp1->next;
+
+			if (rc > 0) {		/* expanded! */
+				list_del(&ltb->ltb_list);
+				lnet_free_text_buf(ltb);
+				continue;
+			}
+
+			if (ntokens == 1) {
+				net = libcfs_str2net(ltb->ltb_text);
+				if (net == LNET_NET_ANY ||
+				    LNET_NETTYP(net) == LOLND)
+					goto token_error;
+			} else {
+				rc = lnet_parse_priority(ltb->ltb_text,
+							 &priority, &token);
+				if (rc < 0)
+					goto token_error;
+
+				if (libcfs_strnid(&nid, ltb->ltb_text) != 0 ||
+				    nid_is_lo0(&nid))
+					goto token_error;
+			}
+		}
+	}
+
+	/* if there are no hops set then we want to flag this value as
+	 * unset since hops is an optional parameter */
+	if (!got_hops)
+		hops = LNET_UNDEFINED_HOPS;
+
+	LASSERT(!list_empty(&nets));
+	LASSERT(!list_empty(&gateways));
+
+	list_for_each(tmp1, &nets) {
+		ltb = list_entry(tmp1, struct lnet_text_buf, ltb_list);
+		net = libcfs_str2net(ltb->ltb_text);
+		LASSERT(net != LNET_NET_ANY);
+
+		list_for_each(tmp2, &gateways) {
+			ltb = list_entry(tmp2, struct lnet_text_buf, ltb_list);
+			LASSERT(libcfs_strnid(&nid, ltb->ltb_text) == 0);
+
+			if (lnet_islocalnid(&nid)) {
+				*im_a_router = 1;
+				continue;
+			}
+
+			rc = lnet_add_route(net, hops, &nid, priority, 1);
+			if (rc != 0 && rc != -EEXIST && rc != -EHOSTUNREACH) {
+				CERROR("Can't create route to %s via %s\n",
+				       libcfs_net2str(net),
+				       libcfs_nidstr(&nid));
+				goto out;
+			}
+		}
+	}
+
+	myrc = 0;
+	goto out;
+
+token_error:
+	lnet_syntax("routes", cmd, (int)(token - str), strlen(token));
+out:
+	lnet_free_text_bufs(&nets);
+	lnet_free_text_bufs(&gateways);
+	return myrc;
+}
+
+static int
+lnet_parse_route_tbs(struct list_head *tbs, int *im_a_router)
+{
+	struct lnet_text_buf *ltb;
+
+	while (!list_empty(tbs)) {
+		ltb = list_entry(tbs->next, struct lnet_text_buf, ltb_list);
+
+		if (lnet_parse_route(ltb->ltb_text, im_a_router) < 0) {
+			lnet_free_text_bufs(tbs);
+			return -EINVAL;
+		}
+
+		list_del(&ltb->ltb_list);
+		lnet_free_text_buf(ltb);
+	}
+
+	return 0;
+}
+
+int
+lnet_parse_routes(const char *routes, int *im_a_router)
+{
+	LIST_HEAD(tbs);
+	int rc = 0;
+
+	*im_a_router = 0;
+
+	if (lnet_str2tbs_sep(&tbs, routes) < 0) {
+		CERROR("Error parsing routes\n");
+		rc = -EINVAL;
+	} else {
+		rc = lnet_parse_route_tbs(&tbs, im_a_router);
+	}
+
+	LASSERT(lnet_tbnob == 0);
+	return rc;
+}
+
+static int
+lnet_match_network_token(char *token, int len, __u32 *ipaddrs, int nip)
+{
+	LIST_HEAD(list);
+	int rc;
+	int i;
+
+	rc = cfs_ip_addr_parse(token, len, &list);
+	if (rc != 0)
+		return rc;
+
+	for (rc = i = 0; !rc && i < nip; i++)
+		rc = cfs_ip_addr_match(ipaddrs[i], &list);
+
+	cfs_expr_list_free_list(&list);
+
+	return rc;
+}
+
+static int
+lnet_match_network_tokens(char *net_entry, __u32 *ipaddrs, int nip)
+{
+	static char tokens[LNET_SINGLE_TEXTBUF_NOB];
+
+	int matched = 0;
+	int ntokens = 0;
+	int len;
+	char *net = NULL;
+	char *sep;
+	char *token;
+	int rc;
+
+	LASSERT(strlen(net_entry) < sizeof(tokens));
+
+	/* work on a copy of the string */
+	strcpy(tokens, net_entry);
+	sep = tokens;
+	for (;;) {
+		/* scan for token start */
+		while (isspace(*sep))
+			sep++;
+		if (*sep == 0)
+			break;
+
+		token = sep++;
+
+		/* scan for token end */
+		while (*sep != 0 && !isspace(*sep))
+			sep++;
+		if (*sep != 0)
+			*sep++ = 0;
+
+		if (ntokens++ == 0) {
+			net = token;
+			continue;
+		}
+
+		len = strlen(token);
+
+		rc = lnet_match_network_token(token, len, ipaddrs, nip);
+		if (rc < 0) {
+			lnet_syntax("ip2nets", net_entry,
+				    (int)(token - tokens), len);
+			return rc;
+		}
+
+		matched |= (rc != 0);
+	}
+
+	if (!matched)
+		return 0;
+
+	strcpy(net_entry, net);		/* replace with matched net */
+	return 1;
+}
+
+static __u32
+lnet_netspec2net(char *netspec)
+{ + char *bracket = strchr(netspec, '('); + __u32 net; + + if (bracket != NULL) + *bracket = 0; + + net = libcfs_str2net(netspec); + + if (bracket != NULL) + *bracket = '('; + + return net; +} + +static int +lnet_splitnets(char *source, struct list_head *nets) +{ + int offset = 0; + int offset2; + int len; + struct lnet_text_buf *tb; + struct lnet_text_buf *tb2; + struct list_head *t; + char *sep; + char *bracket; + __u32 net; + + LASSERT(!list_empty(nets)); + LASSERT(nets->next == nets->prev); /* single entry */ + + tb = list_entry(nets->next, struct lnet_text_buf, ltb_list); + + for (;;) { + sep = strchr(tb->ltb_text, ','); + bracket = strchr(tb->ltb_text, '('); + + if (sep != NULL && + bracket != NULL && + bracket < sep) { + /* netspec lists interfaces... */ + + offset2 = offset + (int)(bracket - tb->ltb_text); + len = strlen(bracket); + + bracket = strchr(bracket + 1, ')'); + + if (bracket == NULL || + !(bracket[1] == ',' || bracket[1] == 0)) { + lnet_syntax("ip2nets", source, offset2, len); + return -EINVAL; + } + + sep = (bracket[1] == 0) ? NULL : bracket + 1; + } + + if (sep != NULL) + *sep++ = 0; + + net = lnet_netspec2net(tb->ltb_text); + if (net == LNET_NET_ANY) { + lnet_syntax("ip2nets", source, offset, + strlen(tb->ltb_text)); + return -EINVAL; + } + + list_for_each(t, nets) { + tb2 = list_entry(t, struct lnet_text_buf, ltb_list); + + if (tb2 == tb) + continue; + + if (net == lnet_netspec2net(tb2->ltb_text)) { + /* duplicate network */ + lnet_syntax("ip2nets", source, offset, + strlen(tb->ltb_text)); + return -EINVAL; + } + } + + if (sep == NULL) + return 0; + + offset += (int)(sep - tb->ltb_text); + len = strlen(sep); + tb2 = lnet_new_text_buf(len); + if (tb2 == NULL) + return -ENOMEM; + + strncpy(tb2->ltb_text, sep, len); + tb2->ltb_text[len] = '\0'; + list_add_tail(&tb2->ltb_list, nets); + + tb = tb2; + } +} + +static int +lnet_match_networks(const char **networksp, const char *ip2nets, + __u32 *ipaddrs, int nip) +{ + static char networks[LNET_SINGLE_TEXTBUF_NOB]; + static char source[LNET_SINGLE_TEXTBUF_NOB]; + + LIST_HEAD(raw_entries); + LIST_HEAD(matched_nets); + LIST_HEAD(current_nets); + struct list_head *t; + struct list_head *t2; + struct lnet_text_buf *tb; + int len; + int count; + int rc; + + if (lnet_str2tbs_sep(&raw_entries, ip2nets) < 0) { + CERROR("Error parsing ip2nets\n"); + LASSERT(lnet_tbnob == 0); + return -EINVAL; + } + + networks[0] = 0; + count = 0; + len = 0; + rc = 0; + + while (!list_empty(&raw_entries)) { + tb = list_entry(raw_entries.next, struct lnet_text_buf, + ltb_list); + + strncpy(source, tb->ltb_text, sizeof(source)); + source[sizeof(source) - 1] = '\0'; + + /* replace ltb_text with the network(s) add on match */ + rc = lnet_match_network_tokens(tb->ltb_text, ipaddrs, nip); + if (rc < 0) + break; + + list_del(&tb->ltb_list); + + if (rc == 0) { /* no match */ + lnet_free_text_buf(tb); + continue; + } + + /* split into separate networks */ + INIT_LIST_HEAD(¤t_nets); + list_add(&tb->ltb_list, ¤t_nets); + rc = lnet_splitnets(source, ¤t_nets); + if (rc < 0) + break; + + list_for_each_safe(t, t2, ¤t_nets) { + tb = list_entry(t, struct lnet_text_buf, ltb_list); + + list_move_tail(&tb->ltb_list, &matched_nets); + + len += scnprintf(networks + len, sizeof(networks) - len, + "%s%s", (len == 0) ? 
"" : ",", + tb->ltb_text); + + if (len >= sizeof(networks)) { + CERROR("Too many matched networks\n"); + rc = -E2BIG; + goto out; + } + } + + count++; + } + + out: + lnet_free_text_bufs(&raw_entries); + lnet_free_text_bufs(&matched_nets); + lnet_free_text_bufs(¤t_nets); + LASSERT(lnet_tbnob == 0); + + if (rc < 0) + return rc; + + *networksp = networks; + return count; +} + +int lnet_inet_enumerate(struct lnet_inetdev **dev_list, struct net *ns) +{ + struct lnet_inetdev *ifaces = NULL; + struct net_device *dev; + int nalloc = 0; + int nip = 0; + DECLARE_CONST_IN_IFADDR(ifa); + + rtnl_lock(); + for_each_netdev(ns, dev) { + int flags = dev_get_flags(dev); + struct in_device *in_dev; + int node_id; + int cpt; + + if (flags & IFF_LOOPBACK) /* skip the loopback IF */ + continue; + + if (!(flags & IFF_UP)) { + CWARN("lnet: Ignoring interface %s: it's down\n", + dev->name); + continue; + } + + in_dev = __in_dev_get_rtnl(dev); + if (!in_dev) { + CWARN("lnet: Interface %s has no IPv4 status.\n", + dev->name); + continue; + } + + node_id = dev_to_node(&dev->dev); + cpt = cfs_cpt_of_node(lnet_cpt_table(), node_id); + + in_dev_for_each_ifa_rtnl(ifa, in_dev) { + if (nip >= nalloc) { + struct lnet_inetdev *tmp; + + nalloc += LNET_INTERFACES_NUM; + tmp = krealloc(ifaces, nalloc * sizeof(*tmp), + GFP_KERNEL); + if (!tmp) { + kfree(ifaces); + ifaces = NULL; + nip = -ENOMEM; + goto unlock_rtnl; + } + ifaces = tmp; + } + + ifaces[nip].li_cpt = cpt; + ifaces[nip].li_flags = flags; + ifaces[nip].li_ipaddr = ntohl(ifa->ifa_local); + ifaces[nip].li_netmask = ntohl(ifa->ifa_mask); + strlcpy(ifaces[nip].li_name, ifa->ifa_label, + sizeof(ifaces[nip].li_name)); + nip++; + } + endfor_ifa(in_dev); + } +unlock_rtnl: + rtnl_unlock(); + + if (nip == 0) { + CERROR("lnet: Can't find any usable interfaces, rc = -ENOENT\n"); + nip = -ENOENT; + } + + *dev_list = ifaces; + return nip; +} +EXPORT_SYMBOL(lnet_inet_enumerate); + +int +lnet_parse_ip2nets(const char **networksp, const char *ip2nets) +{ + struct lnet_inetdev *ifaces = NULL; + __u32 *ipaddrs = NULL; + int nip; + int rc; + int i; + + if (current->nsproxy && current->nsproxy->net_ns) + nip = lnet_inet_enumerate(&ifaces, current->nsproxy->net_ns); + else + nip = lnet_inet_enumerate(&ifaces, &init_net); + if (nip < 0) { + if (nip != -ENOENT) { + LCONSOLE_ERROR_MSG(0x117, + "Error %d enumerating local IP interfaces for ip2nets to match\n", + nip); + } else { + LCONSOLE_ERROR_MSG(0x118, + "No local IP interfaces for ip2nets to match\n"); + } + return nip; + } + + CFS_ALLOC_PTR_ARRAY(ipaddrs, nip); + if (!ipaddrs) { + rc = -ENOMEM; + CERROR("lnet: Can't allocate ipaddrs[%d], rc = %d\n", + nip, rc); + goto out_free_addrs; + } + + for (i = 0; i < nip; i++) + ipaddrs[i] = ifaces[i].li_ipaddr; + + rc = lnet_match_networks(networksp, ip2nets, ipaddrs, nip); + if (rc < 0) { + LCONSOLE_ERROR_MSG(0x119, "Error %d parsing ip2nets\n", rc); + } else if (rc == 0) { + LCONSOLE_ERROR_MSG(0x11a, "ip2nets does not match " + "any local IP interfaces\n"); + rc = -ENOENT; + } + CFS_FREE_PTR_ARRAY(ipaddrs, nip); +out_free_addrs: + kfree(ifaces); + return rc > 0 ? 0 : rc; +} diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-md.c b/drivers/staging/lustrefsx/lnet/lnet/lib-md.c new file mode 100644 index 0000000000000..ba318a2929632 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-md.c @@ -0,0 +1,558 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/lnet/lib-md.c + * + * Memory Descriptor management routines + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +/* must be called with lnet_res_lock held */ +void +lnet_md_unlink(struct lnet_libmd *md) +{ + if ((md->md_flags & LNET_MD_FLAG_ZOMBIE) == 0) { + /* first unlink attempt... */ + struct lnet_me *me = md->md_me; + + md->md_flags |= LNET_MD_FLAG_ZOMBIE; + + /* Disassociate from ME (if any), and unlink it if it was created + * with LNET_UNLINK */ + if (me != NULL) { + /* detach MD from portal */ + lnet_ptl_detach_md(me, md); + if (me->me_unlink == LNET_UNLINK) + lnet_me_unlink(me); + } + + /* ensure all future handle lookups fail */ + lnet_res_lh_invalidate(&md->md_lh); + } + + if (md->md_refcount != 0) { + CDEBUG(D_NET, "Queueing unlink of md %p\n", md); + return; + } + + CDEBUG(D_NET, "Unlinking md %p\n", md); + + LASSERT(!list_empty(&md->md_list)); + list_del_init(&md->md_list); + LASSERT(!(md->md_flags & LNET_MD_FLAG_HANDLING)); + lnet_md_free(md); +} + +struct page * +lnet_kvaddr_to_page(unsigned long vaddr) +{ + if (is_vmalloc_addr((void *)vaddr)) + return vmalloc_to_page((void *)vaddr); + +#ifdef CONFIG_HIGHMEM + +#ifdef HAVE_KMAP_TO_PAGE + /* + * This ifdef is added to handle the kernel versions + * which have kmap_to_page() function exported. If so, + * we should use it. Otherwise, remain with the legacy check. 
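+	 * (kmap_to_page() resolves kmap'd highmem addresses and falls
+	 * back to virt_to_page() for lowmem, so the explicit PKMAP
+	 * range check below is unnecessary in that case.)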
+ */ + return kmap_to_page((void *)vaddr); +#else + + if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) { + /* No highmem pages only used for bulk (kiov) I/O */ + CERROR("find page for address in highmem\n"); + LBUG(); + } + return virt_to_page(vaddr); +#endif /* HAVE_KMAP_TO_PAGE */ +#else + + return virt_to_page(vaddr); +#endif /* CONFIG_HIGHMEM */ +} +EXPORT_SYMBOL(lnet_kvaddr_to_page); + +struct page * +lnet_get_first_page(struct lnet_libmd *md, unsigned int offset) +{ + unsigned int niov; + struct bio_vec *kiov; + + /* + * if the md_options has a bulk handle then we want to look at the + * bulk md because that's the data which we will be DMAing + */ + if (md && (md->md_options & LNET_MD_BULK_HANDLE) != 0 && + !LNetMDHandleIsInvalid(md->md_bulk_handle)) + md = lnet_handle2md(&md->md_bulk_handle); + + if (!md || md->md_niov == 0) + return NULL; + + kiov = md->md_kiov; + niov = md->md_niov; + + while (offset >= kiov->bv_len) { + offset -= kiov->bv_len; + niov--; + kiov++; + if (niov == 0) { + CERROR("offset %d goes beyond kiov\n", offset); + return NULL; + } + } + + return kiov->bv_page; +} + +int +lnet_cpt_of_md(struct lnet_libmd *md, unsigned int offset) +{ + struct page *page; + int cpt = CFS_CPT_ANY; + + page = lnet_get_first_page(md, offset); + if (!page) { + CDEBUG(D_NET, "Couldn't resolve first page of md %p with offset %u\n", + md, offset); + goto out; + } + + cpt = cfs_cpt_of_node(lnet_cpt_table(), page_to_nid(page)); + +out: + return cpt; +} + +static int lnet_md_validate(const struct lnet_md *umd); + +static struct lnet_libmd * +lnet_md_build(const struct lnet_md *umd, int unlink) +{ + int i; + unsigned int niov; + int total_length = 0; + struct lnet_libmd *lmd; + unsigned int size; + + if (lnet_md_validate(umd) != 0) + return ERR_PTR(-EINVAL); + + if (umd->options & LNET_MD_KIOV) + niov = umd->length; + else + niov = DIV_ROUND_UP(offset_in_page(umd->start) + umd->length, + PAGE_SIZE); + size = offsetof(struct lnet_libmd, md_kiov[niov]); + + if (size <= LNET_SMALL_MD_SIZE) { + lmd = kmem_cache_zalloc(lnet_small_mds_cachep, GFP_NOFS); + if (lmd) { + CDEBUG(D_MALLOC, + "slab-alloced 'md' of size %u at %p.\n", + size, lmd); + } else { + CDEBUG(D_MALLOC, "failed to allocate 'md' of size %u\n", + size); + } + } else { + LIBCFS_ALLOC(lmd, size); + } + + if (!lmd) + return ERR_PTR(-ENOMEM); + + lmd->md_niov = niov; + INIT_LIST_HEAD(&lmd->md_list); + lmd->md_me = NULL; + lmd->md_start = umd->start; + lmd->md_offset = 0; + lmd->md_max_size = umd->max_size; + lmd->md_options = umd->options; + lmd->md_user_ptr = umd->user_ptr; + lmd->md_handler = NULL; + lmd->md_threshold = umd->threshold; + lmd->md_refcount = 0; + lmd->md_flags = (unlink == LNET_UNLINK) ? 
LNET_MD_FLAG_AUTO_UNLINK : 0; + lmd->md_bulk_handle = umd->bulk_handle; + + if (umd->options & LNET_MD_KIOV) { + memcpy(lmd->md_kiov, umd->start, + niov * sizeof(lmd->md_kiov[0])); + + for (i = 0; i < (int)niov; i++) { + /* We take the page pointer on trust */ + if (lmd->md_kiov[i].bv_offset + + lmd->md_kiov[i].bv_len > PAGE_SIZE) { + lnet_md_free(lmd); + return ERR_PTR(-EINVAL); /* invalid length */ + } + + total_length += lmd->md_kiov[i].bv_len; + } + + lmd->md_length = total_length; + + if ((umd->options & LNET_MD_MAX_SIZE) && /* max size used */ + (umd->max_size < 0 || + umd->max_size > total_length)) { /* illegal max_size */ + lnet_md_free(lmd); + return ERR_PTR(-EINVAL); + } + } else { /* contiguous - split into pages */ + void *pa = umd->start; + int len = umd->length; + + lmd->md_length = len; + i = 0; + while (len) { + int plen; + + plen = min_t(int, len, PAGE_SIZE - offset_in_page(pa)); + + lmd->md_kiov[i].bv_page = + lnet_kvaddr_to_page((unsigned long) pa); + lmd->md_kiov[i].bv_offset = offset_in_page(pa); + lmd->md_kiov[i].bv_len = plen; + + len -= plen; + pa += plen; + i += 1; + } + WARN(!(lmd->md_options & LNET_MD_GNILND) && i > LNET_MAX_IOV, + "Max IOV exceeded: %d should be < %d\n", + i, LNET_MAX_IOV); + if ((umd->options & LNET_MD_MAX_SIZE) && /* max size used */ + (umd->max_size < 0 || + umd->max_size > (int)umd->length)) { /* illegal max_size */ + lnet_md_free(lmd); + return ERR_PTR(-EINVAL); + } + lmd->md_options |= LNET_MD_KIOV; + } + + return lmd; +} + +/* must be called with resource lock held */ +static void +lnet_md_link(struct lnet_libmd *md, lnet_handler_t handler, int cpt) +{ + struct lnet_res_container *container = the_lnet.ln_md_containers[cpt]; + + /* NB we are passed an allocated, but inactive md. + * Caller may lnet_md_unlink() it, or may lnet_md_free() it. + */ + /* This implementation doesn't know how to create START events or + * disable END events. Best to LASSERT our caller is compliant so + * we find out quickly... */ + /* TODO - reevaluate what should be here in light of + * the removal of the start and end events + * maybe there we shouldn't even allow LNET_EQ_NONE!) + * LASSERT (handler != NULL); + */ + md->md_handler = handler; + + lnet_res_lh_initialize(container, &md->md_lh); + + LASSERT(list_empty(&md->md_list)); + list_add(&md->md_list, &container->rec_active); +} + +void lnet_assert_handler_unused(lnet_handler_t handler) +{ + struct lnet_res_container *container; + int cpt; + + if (!handler) + return; + cfs_percpt_for_each(container, cpt, the_lnet.ln_md_containers) { + struct lnet_libmd *md; + + lnet_res_lock(cpt); + list_for_each_entry(md, &container->rec_active, md_list) + LASSERT(md->md_handler != handler); + lnet_res_unlock(cpt); + } +} +EXPORT_SYMBOL(lnet_assert_handler_unused); + +/* must be called with lnet_res_lock held */ +void +lnet_md_deconstruct(struct lnet_libmd *lmd, struct lnet_event *ev) +{ + ev->md_start = lmd->md_start; + ev->md_options = lmd->md_options; + ev->md_user_ptr = lmd->md_user_ptr; +} + +static int +lnet_md_validate(const struct lnet_md *umd) +{ + if (umd->start == NULL && umd->length != 0) { + CERROR("MD start pointer can not be NULL with length %u\n", + umd->length); + return -EINVAL; + } + + if ((umd->options & LNET_MD_KIOV) && + umd->length > LNET_MAX_IOV) { + CERROR("Invalid option: too many fragments %u, %d max\n", + umd->length, LNET_MAX_IOV); + return -EINVAL; + } + + return 0; +} + +/** + * Create a memory descriptor and attach it to a ME + * + * \param me An ME to associate the new MD with. 
+ * \param umd Provides initial values for the user-visible parts of a MD. + * Other than its use for initialization, there is no linkage between this + * structure and the MD maintained by the LNet. + * \param unlink A flag to indicate whether the MD is automatically unlinked + * when it becomes inactive, either because the operation threshold drops to + * zero or because the available memory becomes less than \a umd.max_size. + * (Note that the check for unlinking a MD only occurs after the completion + * of a successful operation on the MD.) The value LNET_UNLINK enables auto + * unlinking; the value LNET_RETAIN disables it. + * \param handle On successful returns, a handle to the newly created MD is + * saved here. This handle can be used later in LNetMDUnlink(). + * + * The ME will either be linked to the new MD, or it will be freed. + * + * \retval 0 On success. + * \retval -EINVAL If \a umd is not valid. + * \retval -ENOMEM If new MD cannot be allocated. + */ +int +LNetMDAttach(struct lnet_me *me, const struct lnet_md *umd, + enum lnet_unlink unlink, struct lnet_handle_md *handle) +{ + LIST_HEAD(matches); + LIST_HEAD(drops); + struct lnet_libmd *md; + int cpt; + + LASSERT(the_lnet.ln_refcount > 0); + LASSERT(!me->me_md); + + if ((umd->options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) == 0) { + CERROR("Invalid option: no MD_OP set\n"); + md = ERR_PTR(-EINVAL); + } else + md = lnet_md_build(umd, unlink); + + cpt = me->me_cpt; + lnet_res_lock(cpt); + + if (IS_ERR(md)) { + lnet_me_unlink(me); + lnet_res_unlock(cpt); + return PTR_ERR(md); + } + + lnet_md_link(md, umd->handler, cpt); + + /* attach this MD to portal of ME and check if it matches any + * blocked msgs on this portal */ + lnet_ptl_attach_md(me, md, &matches, &drops); + + lnet_md2handle(handle, md); + + lnet_res_unlock(cpt); + + lnet_drop_delayed_msg_list(&drops, "Bad match"); + lnet_recv_delayed_msg_list(&matches); + + return 0; +} +EXPORT_SYMBOL(LNetMDAttach); + +/** + * Create a "free floating" memory descriptor - a MD that is not associated + * with a ME. Such MDs are usually used in LNetPut() and LNetGet() operations. + * + * \param umd,unlink See the discussion for LNetMDAttach(). + * \param handle On successful returns, a handle to the newly created MD is + * saved here. This handle can be used later in LNetMDUnlink(), LNetPut(), + * and LNetGet() operations. + * + * \retval 0 On success. + * \retval -EINVAL If \a umd is not valid. + * \retval -ENOMEM If new MD cannot be allocated. + */ +int +LNetMDBind(const struct lnet_md *umd, enum lnet_unlink unlink, + struct lnet_handle_md *handle) +{ + struct lnet_libmd *md; + int cpt; + int rc; + + LASSERT(the_lnet.ln_refcount > 0); + + if ((umd->options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) != 0) { + CERROR("Invalid option: GET|PUT illegal on active MDs\n"); + return -EINVAL; + } + + md = lnet_md_build(umd, unlink); + if (IS_ERR(md)) + return PTR_ERR(md); + + if (md->md_length > LNET_MTU) { + CERROR("Invalid length: too big transfer size %u, %d max\n", + md->md_length, LNET_MTU); + rc = -EINVAL; + goto out_free; + } + + cpt = lnet_res_lock_current(); + + lnet_md_link(md, umd->handler, cpt); + + lnet_md2handle(handle, md); + + lnet_res_unlock(cpt); + return 0; + + out_free: + lnet_md_free(md); + return rc; +} +EXPORT_SYMBOL(LNetMDBind); + +/** + * Unlink the memory descriptor from any ME it may be linked to and release + * the internal resources associated with it. As a result, active messages + * associated with the MD may get aborted. 
+ * + * This function does not free the memory region associated with the MD; + * i.e., the memory the user allocated for this MD. If the ME associated with + * this MD is not NULL and was created with auto unlink enabled, the ME is + * unlinked as well (see LNetMEAttach()). + * + * Explicitly unlinking a MD via this function call has the same behavior as + * a MD that has been automatically unlinked, except that no LNET_EVENT_UNLINK + * is generated in the latter case. + * + * An unlinked event can be reported in two ways: + * - If there's no pending operations on the MD, it's unlinked immediately + * and an LNET_EVENT_UNLINK event is logged before this function returns. + * - Otherwise, the MD is only marked for deletion when this function + * returns, and the unlinked event will be piggybacked on the event of + * the completion of the last operation by setting the unlinked field of + * the event. No dedicated LNET_EVENT_UNLINK event is generated. + * + * Note that in both cases the unlinked field of the event is always set; no + * more event will happen on the MD after such an event is logged. + * + * \param mdh A handle for the MD to be unlinked. + * + * \retval 0 On success. + * \retval -ENOENT If \a mdh does not point to a valid MD object. + */ +int +__LNetMDUnlink(struct lnet_handle_md mdh, bool discard) +{ + struct lnet_event ev; + struct lnet_libmd *md = NULL; + lnet_handler_t handler = NULL; + int cpt; + + LASSERT(the_lnet.ln_refcount > 0); + + cpt = lnet_cpt_of_cookie(mdh.cookie); + lnet_res_lock(cpt); + while (!md) { + md = lnet_handle2md(&mdh); + if (!md) { + lnet_res_unlock(cpt); + return -ENOENT; + } + if (md->md_refcount == 0 && + md->md_flags & LNET_MD_FLAG_HANDLING) { + /* Race with unlocked call to ->md_handler. */ + lnet_md_wait_handling(md, cpt); + md = NULL; + } + } + + md->md_flags |= LNET_MD_FLAG_ABORTED; + /* If the MD is busy, lnet_md_unlink just marks it for deletion, and + * when the LND is done, the completion event flags that the MD was + * unlinked. Otherwise, we enqueue an event now... */ + if (md->md_handler && md->md_refcount == 0) { + lnet_build_unlink_event(md, &ev); + handler = md->md_handler; + } + + if (discard) + md->md_flags |= LNET_MD_FLAG_DISCARD; + + if (md->md_rspt_ptr != NULL) + lnet_detach_rsp_tracker(md, cpt); + + lnet_md_unlink(md); + + lnet_res_unlock(cpt); + + if (handler) + handler(&ev); + + return 0; +} +EXPORT_SYMBOL(__LNetMDUnlink); + +bool +lnet_md_discarded(struct lnet_libmd *md) +{ + bool rc; + int cpt; + + if (md == NULL) + return false; + + cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie); + lnet_res_lock(cpt); + rc = md->md_flags & LNET_MD_FLAG_DISCARD; + lnet_res_unlock(cpt); + + return rc; +} +EXPORT_SYMBOL(lnet_md_discarded); diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-me.c b/drivers/staging/lustrefsx/lnet/lnet/lib-me.c new file mode 100644 index 0000000000000..8d7c9ee97f94b --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-me.c @@ -0,0 +1,155 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/lnet/lib-me.c + * + * Match Entry management routines + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +/** + * Create and attach a match entry to the match list of \a portal. The new + * ME is empty, i.e. not associated with a memory descriptor. LNetMDAttach() + * can be used to attach a MD to an empty ME. + * + * \param portal The portal table index where the ME should be attached. + * \param match_id Specifies the match criteria for the process ID of + * the requester. The constants LNET_PID_ANY and LNET_NID_ANY can be + * used to wildcard either of the identifiers in the struct lnet_process_id + * structure. + * \param match_bits,ignore_bits Specify the match criteria to apply + * to the match bits in the incoming request. The ignore bits are used + * to mask out insignificant bits in the incoming match bits. The resulting + * bits are then compared to the ME's match bits to determine if the + * incoming request meets the match criteria. + * \param unlink Indicates whether the ME should be unlinked when the memory + * descriptor associated with it is unlinked (Note that the check for + * unlinking a ME only occurs when the memory descriptor is unlinked.). + * Valid values are LNET_RETAIN and LNET_UNLINK. + * \param pos Indicates whether the new ME should be prepended or + * appended to the match list. Allowed constants: LNET_INS_BEFORE, + * LNET_INS_AFTER. + * + * \retval A handle to the newly created ME is returned on success + * \retval ERR_PTR(-EINVAL) If \a portal is invalid. + * \retval ERR_PTR(-ENOMEM) If new ME object cannot be allocated. 
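+ *
+ * A hedged usage sketch (the portal index and match bits are
+ * illustrative, and LNET_ANY_NID is assumed as the struct lnet_nid
+ * wildcard):
+ *
+ *	struct lnet_processid match_id = {
+ *		.nid = LNET_ANY_NID,
+ *		.pid = LNET_PID_ANY,
+ *	};
+ *	struct lnet_me *me;
+ *
+ *	me = LNetMEAttach(portal, &match_id, match_bits, 0,
+ *			  LNET_UNLINK, LNET_INS_AFTER);
+ *	if (IS_ERR(me))
+ *		return PTR_ERR(me);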
+ */ +struct lnet_me * +LNetMEAttach(unsigned int portal, + struct lnet_processid *match_id, + __u64 match_bits, __u64 ignore_bits, + enum lnet_unlink unlink, enum lnet_ins_pos pos) +{ + struct lnet_match_table *mtable; + struct lnet_me *me; + struct list_head *head; + + LASSERT(the_lnet.ln_refcount > 0); + + if ((int)portal >= the_lnet.ln_nportals) + return ERR_PTR(-EINVAL); + + mtable = lnet_mt_of_attach(portal, match_id, + match_bits, ignore_bits, pos); + if (mtable == NULL) /* can't match portal type */ + return ERR_PTR(-EPERM); + + me = kmem_cache_zalloc(lnet_mes_cachep, GFP_NOFS); + if (me == NULL) { + CDEBUG(D_MALLOC, "failed to allocate 'me'\n"); + return ERR_PTR(-ENOMEM); + } + CDEBUG(D_MALLOC, "slab-alloced 'me' at %p.\n", me); + + lnet_res_lock(mtable->mt_cpt); + + me->me_portal = portal; + me->me_match_id = *match_id; + me->me_match_bits = match_bits; + me->me_ignore_bits = ignore_bits; + me->me_unlink = unlink; + me->me_md = NULL; + + me->me_cpt = mtable->mt_cpt; + + if (ignore_bits != 0) + head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE]; + else + head = lnet_mt_match_head(mtable, match_id, match_bits); + + me->me_pos = head - &mtable->mt_mhash[0]; + if (pos == LNET_INS_AFTER || pos == LNET_INS_LOCAL) + list_add_tail(&me->me_list, head); + else + list_add(&me->me_list, head); + + lnet_res_unlock(mtable->mt_cpt); + return me; +} +EXPORT_SYMBOL(LNetMEAttach); + +/* call with lnet_res_lock please */ +void +lnet_me_unlink(struct lnet_me *me) +{ + list_del(&me->me_list); + + if (me->me_md != NULL) { + struct lnet_libmd *md = me->me_md; + + /* detach MD from portal of this ME */ + lnet_ptl_detach_md(me, md); + lnet_md_unlink(md); + } + + CDEBUG(D_MALLOC, "slab-freed 'me' at %p.\n", me); + kmem_cache_free(lnet_mes_cachep, me); +} + +#if 0 +static void +lib_me_dump(struct lnet_me *me) +{ + CWARN("Match Entry %p (%#llx)\n", me, + me->me_lh.lh_cookie); + + CWARN("\tMatch/Ignore\t= %016lx / %016lx\n", + me->me_match_bits, me->me_ignore_bits); + + CWARN("\tMD\t= %p\n", me->md); + CWARN("\tprev\t= %p\n", + list_entry(me->me_list.prev, struct lnet_me, me_list)); + CWARN("\tnext\t= %p\n", + list_entry(me->me_list.next, struct lnet_me, me_list)); +} +#endif diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-move.c b/drivers/staging/lustrefsx/lnet/lnet/lib-move.c new file mode 100644 index 0000000000000..5c50c9e179ac2 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-move.c @@ -0,0 +1,5456 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/lnet/lib-move.c + * + * Data movement routines + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +#include +#include +#include +#include + +static int local_nid_dist_zero = 1; +module_param(local_nid_dist_zero, int, 0444); +MODULE_PARM_DESC(local_nid_dist_zero, "Reserved"); + +struct lnet_send_data { + struct lnet_ni *sd_best_ni; + struct lnet_peer_ni *sd_best_lpni; + struct lnet_peer_ni *sd_final_dst_lpni; + struct lnet_peer *sd_peer; + struct lnet_peer *sd_gw_peer; + struct lnet_peer_ni *sd_gw_lpni; + struct lnet_peer_net *sd_peer_net; + struct lnet_msg *sd_msg; + struct lnet_nid sd_dst_nid; + struct lnet_nid sd_src_nid; + struct lnet_nid sd_rtr_nid; + int sd_cpt; + int sd_md_cpt; + __u32 sd_send_case; +}; + +static inline bool +lnet_msg_is_response(struct lnet_msg *msg) +{ + return msg->msg_type == LNET_MSG_ACK || msg->msg_type == LNET_MSG_REPLY; +} + +static inline bool +lnet_response_tracking_enabled(__u32 msg_type, unsigned int md_options) +{ + if (md_options & LNET_MD_NO_TRACK_RESPONSE) + /* Explicitly disabled in MD options */ + return false; + + if (md_options & LNET_MD_TRACK_RESPONSE) + /* Explicity enabled in MD options */ + return true; + + if (lnet_response_tracking == 3) + /* Enabled for all message types */ + return true; + + if (msg_type == LNET_MSG_PUT) + return lnet_response_tracking == 2; + + if (msg_type == LNET_MSG_GET) + return lnet_response_tracking == 1; + + return false; +} + +static inline struct lnet_comm_count * +get_stats_counts(struct lnet_element_stats *stats, + enum lnet_stats_type stats_type) +{ + switch (stats_type) { + case LNET_STATS_TYPE_SEND: + return &stats->el_send_stats; + case LNET_STATS_TYPE_RECV: + return &stats->el_recv_stats; + case LNET_STATS_TYPE_DROP: + return &stats->el_drop_stats; + default: + CERROR("Unknown stats type\n"); + } + + return NULL; +} + +void lnet_incr_stats(struct lnet_element_stats *stats, + enum lnet_msg_type msg_type, + enum lnet_stats_type stats_type) +{ + struct lnet_comm_count *counts = get_stats_counts(stats, stats_type); + if (!counts) + return; + + switch (msg_type) { + case LNET_MSG_ACK: + atomic_inc(&counts->co_ack_count); + break; + case LNET_MSG_PUT: + atomic_inc(&counts->co_put_count); + break; + case LNET_MSG_GET: + atomic_inc(&counts->co_get_count); + break; + case LNET_MSG_REPLY: + atomic_inc(&counts->co_reply_count); + break; + case LNET_MSG_HELLO: + atomic_inc(&counts->co_hello_count); + break; + default: + CERROR("There is a BUG in the code. 
Unknown message type\n"); + break; + } +} + +__u32 lnet_sum_stats(struct lnet_element_stats *stats, + enum lnet_stats_type stats_type) +{ + struct lnet_comm_count *counts = get_stats_counts(stats, stats_type); + if (!counts) + return 0; + + return (atomic_read(&counts->co_ack_count) + + atomic_read(&counts->co_put_count) + + atomic_read(&counts->co_get_count) + + atomic_read(&counts->co_reply_count) + + atomic_read(&counts->co_hello_count)); +} + +static inline void assign_stats(struct lnet_ioctl_comm_count *msg_stats, + struct lnet_comm_count *counts) +{ + msg_stats->ico_get_count = atomic_read(&counts->co_get_count); + msg_stats->ico_put_count = atomic_read(&counts->co_put_count); + msg_stats->ico_reply_count = atomic_read(&counts->co_reply_count); + msg_stats->ico_ack_count = atomic_read(&counts->co_ack_count); + msg_stats->ico_hello_count = atomic_read(&counts->co_hello_count); +} + +void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats, + struct lnet_element_stats *stats) +{ + struct lnet_comm_count *counts; + + LASSERT(msg_stats); + LASSERT(stats); + + counts = get_stats_counts(stats, LNET_STATS_TYPE_SEND); + if (!counts) + return; + assign_stats(&msg_stats->im_send_stats, counts); + + counts = get_stats_counts(stats, LNET_STATS_TYPE_RECV); + if (!counts) + return; + assign_stats(&msg_stats->im_recv_stats, counts); + + counts = get_stats_counts(stats, LNET_STATS_TYPE_DROP); + if (!counts) + return; + assign_stats(&msg_stats->im_drop_stats, counts); +} + +int +lnet_fail_nid(lnet_nid_t nid4, unsigned int threshold) +{ + struct lnet_test_peer *tp; + struct list_head *el; + struct list_head *next; + struct lnet_nid nid; + LIST_HEAD(cull); + + lnet_nid4_to_nid(nid4, &nid); + /* NB: use lnet_net_lock(0) to serialize operations on test peers */ + if (threshold != 0) { + /* Adding a new entry */ + LIBCFS_ALLOC(tp, sizeof(*tp)); + if (tp == NULL) + return -ENOMEM; + + tp->tp_nid = nid; + tp->tp_threshold = threshold; + + lnet_net_lock(0); + list_add_tail(&tp->tp_list, &the_lnet.ln_test_peers); + lnet_net_unlock(0); + return 0; + } + + lnet_net_lock(0); + + list_for_each_safe(el, next, &the_lnet.ln_test_peers) { + tp = list_entry(el, struct lnet_test_peer, tp_list); + + if (tp->tp_threshold == 0 || /* needs culling anyway */ + LNET_NID_IS_ANY(&nid) || /* removing all entries */ + nid_same(&tp->tp_nid, &nid)) { /* matched this one */ + list_move(&tp->tp_list, &cull); + } + } + + lnet_net_unlock(0); + + while (!list_empty(&cull)) { + tp = list_entry(cull.next, struct lnet_test_peer, tp_list); + + list_del(&tp->tp_list); + LIBCFS_FREE(tp, sizeof(*tp)); + } + return 0; +} + +static int +fail_peer(struct lnet_nid *nid, int outgoing) +{ + struct lnet_test_peer *tp; + struct list_head *el; + struct list_head *next; + LIST_HEAD(cull); + int fail = 0; + + /* NB: use lnet_net_lock(0) to serialize operations on test peers */ + lnet_net_lock(0); + + list_for_each_safe(el, next, &the_lnet.ln_test_peers) { + tp = list_entry(el, struct lnet_test_peer, tp_list); + + if (tp->tp_threshold == 0) { + /* zombie entry */ + if (outgoing) { + /* only cull zombies on outgoing tests, + * since we may be at interrupt priority on + * incoming messages. 
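+				 * A zombie seen on the receive path is
+				 * therefore left in place and reaped by
+				 * a later outgoing call instead.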
*/ + list_move(&tp->tp_list, &cull); + } + continue; + } + + if (LNET_NID_IS_ANY(&tp->tp_nid) || /* fail every peer */ + nid_same(nid, &tp->tp_nid)) { /* fail this peer */ + fail = 1; + + if (tp->tp_threshold != LNET_MD_THRESH_INF) { + tp->tp_threshold--; + if (outgoing && + tp->tp_threshold == 0) { + /* see above */ + list_move(&tp->tp_list, &cull); + } + } + break; + } + } + + lnet_net_unlock(0); + + while (!list_empty(&cull)) { + tp = list_entry(cull.next, struct lnet_test_peer, tp_list); + list_del(&tp->tp_list); + + LIBCFS_FREE(tp, sizeof(*tp)); + } + + return fail; +} + +unsigned int +lnet_iov_nob(unsigned int niov, struct kvec *iov) +{ + unsigned int nob = 0; + + LASSERT(niov == 0 || iov != NULL); + while (niov-- > 0) + nob += (iov++)->iov_len; + + return (nob); +} +EXPORT_SYMBOL(lnet_iov_nob); + +void +lnet_copy_iov2iov(unsigned int ndiov, struct kvec *diov, unsigned int doffset, + unsigned int nsiov, struct kvec *siov, unsigned int soffset, + unsigned int nob) +{ + /* NB diov, siov are READ-ONLY */ + unsigned int this_nob; + + if (nob == 0) + return; + + /* skip complete frags before 'doffset' */ + LASSERT(ndiov > 0); + while (doffset >= diov->iov_len) { + doffset -= diov->iov_len; + diov++; + ndiov--; + LASSERT(ndiov > 0); + } + + /* skip complete frags before 'soffset' */ + LASSERT(nsiov > 0); + while (soffset >= siov->iov_len) { + soffset -= siov->iov_len; + siov++; + nsiov--; + LASSERT(nsiov > 0); + } + + do { + LASSERT(ndiov > 0); + LASSERT(nsiov > 0); + this_nob = min3((unsigned int)diov->iov_len - doffset, + (unsigned int)siov->iov_len - soffset, + nob); + + memcpy((char *)diov->iov_base + doffset, + (char *)siov->iov_base + soffset, this_nob); + nob -= this_nob; + + if (diov->iov_len > doffset + this_nob) { + doffset += this_nob; + } else { + diov++; + ndiov--; + doffset = 0; + } + + if (siov->iov_len > soffset + this_nob) { + soffset += this_nob; + } else { + siov++; + nsiov--; + soffset = 0; + } + } while (nob > 0); +} +EXPORT_SYMBOL(lnet_copy_iov2iov); + +unsigned int +lnet_kiov_nob(unsigned int niov, struct bio_vec *kiov) +{ + unsigned int nob = 0; + + LASSERT(niov == 0 || kiov != NULL); + while (niov-- > 0) + nob += (kiov++)->bv_len; + + return (nob); +} +EXPORT_SYMBOL(lnet_kiov_nob); + +void +lnet_copy_kiov2kiov(unsigned int ndiov, struct bio_vec *diov, + unsigned int doffset, + unsigned int nsiov, struct bio_vec *siov, + unsigned int soffset, + unsigned int nob) +{ + /* NB diov, siov are READ-ONLY */ + unsigned int this_nob; + char *daddr = NULL; + char *saddr = NULL; + + if (nob == 0) + return; + + LASSERT (!in_interrupt ()); + + LASSERT (ndiov > 0); + while (doffset >= diov->bv_len) { + doffset -= diov->bv_len; + diov++; + ndiov--; + LASSERT(ndiov > 0); + } + + LASSERT(nsiov > 0); + while (soffset >= siov->bv_len) { + soffset -= siov->bv_len; + siov++; + nsiov--; + LASSERT(nsiov > 0); + } + + do { + LASSERT(ndiov > 0); + LASSERT(nsiov > 0); + this_nob = min3(diov->bv_len - doffset, + siov->bv_len - soffset, + nob); + + if (daddr == NULL) + daddr = ((char *)kmap(diov->bv_page)) + + diov->bv_offset + doffset; + if (saddr == NULL) + saddr = ((char *)kmap(siov->bv_page)) + + siov->bv_offset + soffset; + + /* Vanishing risk of kmap deadlock when mapping 2 pages. 
+ * However in practice at least one of the kiovs will be mapped + * kernel pages and the map/unmap will be NOOPs */ + + memcpy (daddr, saddr, this_nob); + nob -= this_nob; + + if (diov->bv_len > doffset + this_nob) { + daddr += this_nob; + doffset += this_nob; + } else { + kunmap(diov->bv_page); + daddr = NULL; + diov++; + ndiov--; + doffset = 0; + } + + if (siov->bv_len > soffset + this_nob) { + saddr += this_nob; + soffset += this_nob; + } else { + kunmap(siov->bv_page); + saddr = NULL; + siov++; + nsiov--; + soffset = 0; + } + } while (nob > 0); + + if (daddr != NULL) + kunmap(diov->bv_page); + if (saddr != NULL) + kunmap(siov->bv_page); +} +EXPORT_SYMBOL(lnet_copy_kiov2kiov); + +void +lnet_copy_kiov2iov (unsigned int niov, struct kvec *iov, unsigned int iovoffset, + unsigned int nkiov, struct bio_vec *kiov, + unsigned int kiovoffset, + unsigned int nob) +{ + /* NB iov, kiov are READ-ONLY */ + unsigned int this_nob; + char *addr = NULL; + + if (nob == 0) + return; + + LASSERT (!in_interrupt ()); + + LASSERT (niov > 0); + while (iovoffset >= iov->iov_len) { + iovoffset -= iov->iov_len; + iov++; + niov--; + LASSERT(niov > 0); + } + + LASSERT(nkiov > 0); + while (kiovoffset >= kiov->bv_len) { + kiovoffset -= kiov->bv_len; + kiov++; + nkiov--; + LASSERT(nkiov > 0); + } + + do { + LASSERT(niov > 0); + LASSERT(nkiov > 0); + this_nob = min3((unsigned int)iov->iov_len - iovoffset, + (unsigned int)kiov->bv_len - kiovoffset, + nob); + + if (addr == NULL) + addr = ((char *)kmap(kiov->bv_page)) + + kiov->bv_offset + kiovoffset; + + memcpy((char *)iov->iov_base + iovoffset, addr, this_nob); + nob -= this_nob; + + if (iov->iov_len > iovoffset + this_nob) { + iovoffset += this_nob; + } else { + iov++; + niov--; + iovoffset = 0; + } + + if (kiov->bv_len > kiovoffset + this_nob) { + addr += this_nob; + kiovoffset += this_nob; + } else { + kunmap(kiov->bv_page); + addr = NULL; + kiov++; + nkiov--; + kiovoffset = 0; + } + + } while (nob > 0); + + if (addr != NULL) + kunmap(kiov->bv_page); +} +EXPORT_SYMBOL(lnet_copy_kiov2iov); + +void +lnet_copy_iov2kiov(unsigned int nkiov, struct bio_vec *kiov, + unsigned int kiovoffset, + unsigned int niov, struct kvec *iov, unsigned int iovoffset, + unsigned int nob) +{ + /* NB kiov, iov are READ-ONLY */ + unsigned int this_nob; + char *addr = NULL; + + if (nob == 0) + return; + + LASSERT (!in_interrupt ()); + + LASSERT (nkiov > 0); + while (kiovoffset >= kiov->bv_len) { + kiovoffset -= kiov->bv_len; + kiov++; + nkiov--; + LASSERT(nkiov > 0); + } + + LASSERT(niov > 0); + while (iovoffset >= iov->iov_len) { + iovoffset -= iov->iov_len; + iov++; + niov--; + LASSERT(niov > 0); + } + + do { + LASSERT(nkiov > 0); + LASSERT(niov > 0); + this_nob = min3((unsigned int)kiov->bv_len - kiovoffset, + (unsigned int)iov->iov_len - iovoffset, + nob); + + if (addr == NULL) + addr = ((char *)kmap(kiov->bv_page)) + + kiov->bv_offset + kiovoffset; + + memcpy (addr, (char *)iov->iov_base + iovoffset, this_nob); + nob -= this_nob; + + if (kiov->bv_len > kiovoffset + this_nob) { + addr += this_nob; + kiovoffset += this_nob; + } else { + kunmap(kiov->bv_page); + addr = NULL; + kiov++; + nkiov--; + kiovoffset = 0; + } + + if (iov->iov_len > iovoffset + this_nob) { + iovoffset += this_nob; + } else { + iov++; + niov--; + iovoffset = 0; + } + } while (nob > 0); + + if (addr != NULL) + kunmap(kiov->bv_page); +} +EXPORT_SYMBOL(lnet_copy_iov2kiov); + +int +lnet_extract_kiov(int dst_niov, struct bio_vec *dst, + int src_niov, struct bio_vec *src, + unsigned int offset, unsigned int len) +{ + /* 
Initialise 'dst' to the subset of 'src' starting at 'offset', + * for exactly 'len' bytes, and return the number of entries. + * NB not destructive to 'src' */ + unsigned int frag_len; + unsigned int niov; + + if (len == 0) /* no data => */ + return (0); /* no frags */ + + LASSERT(src_niov > 0); + while (offset >= src->bv_len) { /* skip initial frags */ + offset -= src->bv_len; + src_niov--; + src++; + LASSERT(src_niov > 0); + } + + niov = 1; + for (;;) { + LASSERT(src_niov > 0); + LASSERT((int)niov <= dst_niov); + + frag_len = src->bv_len - offset; + dst->bv_page = src->bv_page; + dst->bv_offset = src->bv_offset + offset; + + if (len <= frag_len) { + dst->bv_len = len; + LASSERT(dst->bv_offset + dst->bv_len <= PAGE_SIZE); + return niov; + } + + dst->bv_len = frag_len; + LASSERT(dst->bv_offset + dst->bv_len <= PAGE_SIZE); + + len -= frag_len; + dst++; + src++; + niov++; + src_niov--; + offset = 0; + } +} +EXPORT_SYMBOL(lnet_extract_kiov); + +void +lnet_ni_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, + int delayed, unsigned int offset, unsigned int mlen, + unsigned int rlen) +{ + unsigned int niov = 0; + struct kvec *iov = NULL; + struct bio_vec *kiov = NULL; + int rc; + + LASSERT (!in_interrupt ()); + LASSERT (mlen == 0 || msg != NULL); + + if (msg != NULL) { + LASSERT(msg->msg_receiving); + LASSERT(!msg->msg_sending); + LASSERT(rlen == msg->msg_len); + LASSERT(mlen <= msg->msg_len); + LASSERT(msg->msg_offset == offset); + LASSERT(msg->msg_wanted == mlen); + + msg->msg_receiving = 0; + + if (mlen != 0) { + niov = msg->msg_niov; + kiov = msg->msg_kiov; + + LASSERT (niov > 0); + LASSERT ((iov == NULL) != (kiov == NULL)); + } + } + + rc = (ni->ni_net->net_lnd->lnd_recv)(ni, private, msg, delayed, + niov, kiov, offset, mlen, + rlen); + if (rc < 0) + lnet_finalize(msg, rc); +} + +static void +lnet_setpayloadbuffer(struct lnet_msg *msg) +{ + struct lnet_libmd *md = msg->msg_md; + + LASSERT(msg->msg_len > 0); + LASSERT(!msg->msg_routing); + LASSERT(md != NULL); + LASSERT(msg->msg_niov == 0); + LASSERT(msg->msg_kiov == NULL); + + msg->msg_niov = md->md_niov; + msg->msg_kiov = md->md_kiov; +} + +void +lnet_prep_send(struct lnet_msg *msg, int type, struct lnet_processid *target, + unsigned int offset, unsigned int len) +{ + msg->msg_type = type; + msg->msg_target = *target; + msg->msg_len = len; + msg->msg_offset = offset; + + if (len != 0) + lnet_setpayloadbuffer(msg); + + memset (&msg->msg_hdr, 0, sizeof (msg->msg_hdr)); + msg->msg_hdr.type = type; + /* dest_nid will be overwritten by lnet_select_pathway() */ + msg->msg_hdr.dest_nid = target->nid; + msg->msg_hdr.dest_pid = target->pid; + /* src_nid will be set later */ + msg->msg_hdr.src_pid = the_lnet.ln_pid; + msg->msg_hdr.payload_length = len; +} + +void +lnet_ni_send(struct lnet_ni *ni, struct lnet_msg *msg) +{ + void *priv = msg->msg_private; + int rc; + + LASSERT(!in_interrupt()); + LASSERT(nid_is_lo0(&ni->ni_nid) || + (msg->msg_txcredit && msg->msg_peertxcredit)); + + rc = (ni->ni_net->net_lnd->lnd_send)(ni, priv, msg); + if (rc < 0) { + msg->msg_no_resend = true; + lnet_finalize(msg, rc); + } +} + +static int +lnet_ni_eager_recv(struct lnet_ni *ni, struct lnet_msg *msg) +{ + int rc; + + LASSERT(!msg->msg_sending); + LASSERT(msg->msg_receiving); + LASSERT(!msg->msg_rx_ready_delay); + LASSERT(ni->ni_net->net_lnd->lnd_eager_recv != NULL); + + msg->msg_rx_ready_delay = 1; + rc = (ni->ni_net->net_lnd->lnd_eager_recv)(ni, msg->msg_private, msg, + &msg->msg_private); + if (rc != 0) { + CERROR("recv from %s / send to %s aborted: " 
+ "eager_recv failed %d\n", + libcfs_nidstr(&msg->msg_rxpeer->lpni_nid), + libcfs_idstr(&msg->msg_target), rc); + LASSERT(rc < 0); /* required by my callers */ + } + + return rc; +} + +static bool +lnet_is_peer_deadline_passed(struct lnet_peer_ni *lpni, time64_t now) +{ + time64_t deadline; + + deadline = lpni->lpni_last_alive + + lpni->lpni_net->net_tunables.lct_peer_timeout; + + /* + * assume peer_ni is alive as long as we're within the configured + * peer timeout + */ + if (deadline > now) + return false; + + return true; +} + +/* NB: returns 1 when alive, 0 when dead, negative when error; + * may drop the lnet_net_lock */ +static int +lnet_peer_alive_locked(struct lnet_ni *ni, struct lnet_peer_ni *lpni, + struct lnet_msg *msg) +{ + time64_t now = ktime_get_seconds(); + + if (!lnet_peer_aliveness_enabled(lpni)) + return -ENODEV; + + /* + * If we're resending a message, let's attempt to send it even if + * the peer is down to fulfill our resend quota on the message + */ + if (msg->msg_retry_count > 0) + return 1; + + /* try and send recovery messages irregardless */ + if (msg->msg_recovery) + return 1; + + /* always send any responses */ + if (lnet_msg_is_response(msg)) + return 1; + + /* always send non-routed messages */ + if (!msg->msg_routing) + return 1; + + if (!lnet_is_peer_deadline_passed(lpni, now)) + return true; + + return lnet_is_peer_ni_alive(lpni); +} + +/** + * \param msg The message to be sent. + * \param do_send True if lnet_ni_send() should be called in this function. + * lnet_send() is going to lnet_net_unlock immediately after this, so + * it sets do_send FALSE and I don't do the unlock/send/lock bit. + * + * \retval LNET_CREDIT_OK If \a msg sent or OK to send. + * \retval LNET_CREDIT_WAIT If \a msg blocked for credit. + * \retval -EHOSTUNREACH If the next hop of the message appears dead. + * \retval -ECANCELED If the MD of the message has been unlinked. 
+ */ +static int +lnet_post_send_locked(struct lnet_msg *msg, int do_send) +{ + struct lnet_peer_ni *lp = msg->msg_txpeer; + struct lnet_ni *ni = msg->msg_txni; + int cpt = msg->msg_tx_cpt; + struct lnet_tx_queue *tq = ni->ni_tx_queues[cpt]; + + /* non-lnet_send() callers have checked before */ + LASSERT(!do_send || msg->msg_tx_delayed); + LASSERT(!msg->msg_receiving); + LASSERT(msg->msg_tx_committed); + + /* can't get here if we're sending to the loopback interface */ + if (the_lnet.ln_loni) + LASSERT(!nid_same(&lp->lpni_nid, &the_lnet.ln_loni->ni_nid)); + + /* NB 'lp' is always the next hop */ + if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 && + lnet_peer_alive_locked(ni, lp, msg) == 0) { + the_lnet.ln_counters[cpt]->lct_common.lcc_drop_count++; + the_lnet.ln_counters[cpt]->lct_common.lcc_drop_length += + msg->msg_len; + lnet_net_unlock(cpt); + if (msg->msg_txpeer) + lnet_incr_stats(&msg->msg_txpeer->lpni_stats, + msg->msg_type, + LNET_STATS_TYPE_DROP); + if (msg->msg_txni) + lnet_incr_stats(&msg->msg_txni->ni_stats, + msg->msg_type, + LNET_STATS_TYPE_DROP); + + CNETERR("Dropping message for %s: peer not alive\n", + libcfs_idstr(&msg->msg_target)); + msg->msg_health_status = LNET_MSG_STATUS_REMOTE_DROPPED; + if (do_send) + lnet_finalize(msg, -EHOSTUNREACH); + + lnet_net_lock(cpt); + return -EHOSTUNREACH; + } + + if (msg->msg_md != NULL && + (msg->msg_md->md_flags & LNET_MD_FLAG_ABORTED) != 0) { + lnet_net_unlock(cpt); + + CNETERR("Aborting message for %s: LNetM[DE]Unlink() already " + "called on the MD/ME.\n", + libcfs_idstr(&msg->msg_target)); + if (do_send) { + msg->msg_no_resend = true; + CDEBUG(D_NET, "msg %p to %s canceled and will not be resent\n", + msg, libcfs_idstr(&msg->msg_target)); + lnet_finalize(msg, -ECANCELED); + } + + lnet_net_lock(cpt); + return -ECANCELED; + } + + if (!msg->msg_peertxcredit) { + spin_lock(&lp->lpni_lock); + LASSERT((lp->lpni_txcredits < 0) == + !list_empty(&lp->lpni_txq)); + + msg->msg_peertxcredit = 1; + lp->lpni_txqnob += msg->msg_len + sizeof(struct lnet_hdr_nid4); + lp->lpni_txcredits--; + + if (lp->lpni_txcredits < lp->lpni_mintxcredits) + lp->lpni_mintxcredits = lp->lpni_txcredits; + + if (lp->lpni_txcredits < 0) { + msg->msg_tx_delayed = 1; + list_add_tail(&msg->msg_list, &lp->lpni_txq); + spin_unlock(&lp->lpni_lock); + return LNET_CREDIT_WAIT; + } + spin_unlock(&lp->lpni_lock); + } + + if (!msg->msg_txcredit) { + LASSERT((tq->tq_credits < 0) == + !list_empty(&tq->tq_delayed)); + + msg->msg_txcredit = 1; + tq->tq_credits--; + atomic_dec(&ni->ni_tx_credits); + + if (tq->tq_credits < tq->tq_credits_min) + tq->tq_credits_min = tq->tq_credits; + + if (tq->tq_credits < 0) { + msg->msg_tx_delayed = 1; + list_add_tail(&msg->msg_list, &tq->tq_delayed); + return LNET_CREDIT_WAIT; + } + } + + if (unlikely(!list_empty(&the_lnet.ln_delay_rules)) && + lnet_delay_rule_match_locked(&msg->msg_hdr, msg)) { + msg->msg_tx_delayed = 1; + return LNET_CREDIT_WAIT; + } + + /* unset the tx_delay flag as we're going to send it now */ + msg->msg_tx_delayed = 0; + + if (do_send) { + lnet_net_unlock(cpt); + lnet_ni_send(ni, msg); + lnet_net_lock(cpt); + } + return LNET_CREDIT_OK; +} + + +static struct lnet_rtrbufpool * +lnet_msg2bufpool(struct lnet_msg *msg) +{ + struct lnet_rtrbufpool *rbp; + int cpt; + + LASSERT(msg->msg_rx_committed); + + cpt = msg->msg_rx_cpt; + rbp = &the_lnet.ln_rtrpools[cpt][0]; + + LASSERT(msg->msg_len <= LNET_MTU); + while (msg->msg_len > (unsigned int)rbp->rbp_npages * PAGE_SIZE) { + rbp++; + LASSERT(rbp < 
&the_lnet.ln_rtrpools[cpt][LNET_NRBPOOLS]); + } + + return rbp; +} + +static int +lnet_post_routed_recv_locked(struct lnet_msg *msg, int do_recv) +{ + /* lnet_parse is going to lnet_net_unlock immediately after this, so it + * sets do_recv FALSE and I don't do the unlock/send/lock bit. + * I return LNET_CREDIT_WAIT if msg blocked and LNET_CREDIT_OK if + * received or OK to receive */ + struct lnet_peer_ni *lpni = msg->msg_rxpeer; + struct lnet_peer *lp; + struct lnet_rtrbufpool *rbp; + struct lnet_rtrbuf *rb; + + LASSERT(msg->msg_kiov == NULL); + LASSERT(msg->msg_niov == 0); + LASSERT(msg->msg_routing); + LASSERT(msg->msg_receiving); + LASSERT(!msg->msg_sending); + LASSERT(lpni->lpni_peer_net); + LASSERT(lpni->lpni_peer_net->lpn_peer); + + lp = lpni->lpni_peer_net->lpn_peer; + + /* non-lnet_parse callers only receive delayed messages */ + LASSERT(!do_recv || msg->msg_rx_delayed); + + if (!msg->msg_peerrtrcredit) { + /* lpni_lock protects the credit manipulation */ + spin_lock(&lpni->lpni_lock); + + msg->msg_peerrtrcredit = 1; + lpni->lpni_rtrcredits--; + if (lpni->lpni_rtrcredits < lpni->lpni_minrtrcredits) + lpni->lpni_minrtrcredits = lpni->lpni_rtrcredits; + + if (lpni->lpni_rtrcredits < 0) { + spin_unlock(&lpni->lpni_lock); + /* must have checked eager_recv before here */ + LASSERT(msg->msg_rx_ready_delay); + msg->msg_rx_delayed = 1; + /* lp_lock protects the lp_rtrq */ + spin_lock(&lp->lp_lock); + list_add_tail(&msg->msg_list, &lp->lp_rtrq); + spin_unlock(&lp->lp_lock); + return LNET_CREDIT_WAIT; + } + spin_unlock(&lpni->lpni_lock); + } + + rbp = lnet_msg2bufpool(msg); + + if (!msg->msg_rtrcredit) { + msg->msg_rtrcredit = 1; + rbp->rbp_credits--; + if (rbp->rbp_credits < rbp->rbp_mincredits) + rbp->rbp_mincredits = rbp->rbp_credits; + + if (rbp->rbp_credits < 0) { + /* must have checked eager_recv before here */ + LASSERT(msg->msg_rx_ready_delay); + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, &rbp->rbp_msgs); + return LNET_CREDIT_WAIT; + } + } + + LASSERT(!list_empty(&rbp->rbp_bufs)); + rb = list_entry(rbp->rbp_bufs.next, struct lnet_rtrbuf, rb_list); + list_del(&rb->rb_list); + + msg->msg_niov = rbp->rbp_npages; + msg->msg_kiov = &rb->rb_kiov[0]; + + /* unset the msg-rx_delayed flag since we're receiving the message */ + msg->msg_rx_delayed = 0; + + if (do_recv) { + int cpt = msg->msg_rx_cpt; + + lnet_net_unlock(cpt); + lnet_ni_recv(msg->msg_rxni, msg->msg_private, msg, 1, + 0, msg->msg_len, msg->msg_len); + lnet_net_lock(cpt); + } + return LNET_CREDIT_OK; +} + +void +lnet_return_tx_credits_locked(struct lnet_msg *msg) +{ + struct lnet_peer_ni *txpeer = msg->msg_txpeer; + struct lnet_ni *txni = msg->msg_txni; + struct lnet_msg *msg2; + + if (msg->msg_txcredit) { + struct lnet_ni *ni = msg->msg_txni; + struct lnet_tx_queue *tq = ni->ni_tx_queues[msg->msg_tx_cpt]; + + /* give back NI txcredits */ + msg->msg_txcredit = 0; + + LASSERT((tq->tq_credits < 0) == + !list_empty(&tq->tq_delayed)); + + tq->tq_credits++; + atomic_inc(&ni->ni_tx_credits); + if (tq->tq_credits <= 0) { + msg2 = list_entry(tq->tq_delayed.next, + struct lnet_msg, msg_list); + list_del(&msg2->msg_list); + + LASSERT(msg2->msg_txni == ni); + LASSERT(msg2->msg_tx_delayed); + LASSERT(msg2->msg_tx_cpt == msg->msg_tx_cpt); + + (void) lnet_post_send_locked(msg2, 1); + } + } + + if (msg->msg_peertxcredit) { + /* give back peer txcredits */ + msg->msg_peertxcredit = 0; + + spin_lock(&txpeer->lpni_lock); + LASSERT((txpeer->lpni_txcredits < 0) == + !list_empty(&txpeer->lpni_txq)); + + txpeer->lpni_txqnob -= 
msg->msg_len + + sizeof(struct lnet_hdr_nid4); + LASSERT(txpeer->lpni_txqnob >= 0); + + txpeer->lpni_txcredits++; + if (txpeer->lpni_txcredits <= 0) { + int msg2_cpt; + + msg2 = list_entry(txpeer->lpni_txq.next, + struct lnet_msg, msg_list); + list_del(&msg2->msg_list); + spin_unlock(&txpeer->lpni_lock); + + LASSERT(msg2->msg_txpeer == txpeer); + LASSERT(msg2->msg_tx_delayed); + + msg2_cpt = msg2->msg_tx_cpt; + + /* + * The msg_cpt can be different from the msg2_cpt + * so we need to make sure we lock the correct cpt + * for msg2. + * Once we call lnet_post_send_locked() it is no + * longer safe to access msg2, since it could've + * been freed by lnet_finalize(), but we still + * need to relock the correct cpt, so we cache the + * msg2_cpt for the purpose of the check that + * follows the call to lnet_pose_send_locked(). + */ + if (msg2_cpt != msg->msg_tx_cpt) { + lnet_net_unlock(msg->msg_tx_cpt); + lnet_net_lock(msg2_cpt); + } + (void) lnet_post_send_locked(msg2, 1); + if (msg2_cpt != msg->msg_tx_cpt) { + lnet_net_unlock(msg2_cpt); + lnet_net_lock(msg->msg_tx_cpt); + } + } else { + spin_unlock(&txpeer->lpni_lock); + } + } + + if (txni != NULL) { + msg->msg_txni = NULL; + lnet_ni_decref_locked(txni, msg->msg_tx_cpt); + } + + if (txpeer != NULL) { + msg->msg_txpeer = NULL; + lnet_peer_ni_decref_locked(txpeer); + } +} + +void +lnet_schedule_blocked_locked(struct lnet_rtrbufpool *rbp) +{ + struct lnet_msg *msg; + + if (list_empty(&rbp->rbp_msgs)) + return; + msg = list_entry(rbp->rbp_msgs.next, + struct lnet_msg, msg_list); + list_del(&msg->msg_list); + + (void)lnet_post_routed_recv_locked(msg, 1); +} + +void +lnet_drop_routed_msgs_locked(struct list_head *list, int cpt) +{ + struct lnet_msg *msg; + struct lnet_msg *tmp; + + lnet_net_unlock(cpt); + + list_for_each_entry_safe(msg, tmp, list, msg_list) { + lnet_ni_recv(msg->msg_rxni, msg->msg_private, NULL, + 0, 0, 0, msg->msg_hdr.payload_length); + list_del_init(&msg->msg_list); + msg->msg_no_resend = true; + msg->msg_health_status = LNET_MSG_STATUS_REMOTE_ERROR; + lnet_finalize(msg, -ECANCELED); + } + + lnet_net_lock(cpt); +} + +void +lnet_return_rx_credits_locked(struct lnet_msg *msg) +{ + struct lnet_peer_ni *rxpeerni = msg->msg_rxpeer; + struct lnet_peer *lp; + struct lnet_ni *rxni = msg->msg_rxni; + struct lnet_msg *msg2; + + if (msg->msg_rtrcredit) { + /* give back global router credits */ + struct lnet_rtrbuf *rb; + struct lnet_rtrbufpool *rbp; + + /* NB If a msg ever blocks for a buffer in rbp_msgs, it stays + * there until it gets one allocated, or aborts the wait + * itself */ + LASSERT(msg->msg_kiov != NULL); + + rb = list_entry(msg->msg_kiov, struct lnet_rtrbuf, rb_kiov[0]); + rbp = rb->rb_pool; + + msg->msg_kiov = NULL; + msg->msg_rtrcredit = 0; + + LASSERT(rbp == lnet_msg2bufpool(msg)); + + LASSERT((rbp->rbp_credits > 0) == + !list_empty(&rbp->rbp_bufs)); + + /* If routing is now turned off, we just drop this buffer and + * don't bother trying to return credits. */ + if (!the_lnet.ln_routing) { + lnet_destroy_rtrbuf(rb, rbp->rbp_npages); + goto routing_off; + } + + /* It is possible that a user has lowered the desired number of + * buffers in this pool. Make sure we never put back + * more buffers than the stated number. */ + if (unlikely(rbp->rbp_credits >= rbp->rbp_req_nbuffers)) { + /* Discard this buffer so we don't have too + * many. 
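+			 * Destroying it here, instead of re-listing
+			 * it, lets rbp_nbuffers drift down towards
+			 * the lowered rbp_req_nbuffers target.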
*/ + lnet_destroy_rtrbuf(rb, rbp->rbp_npages); + rbp->rbp_nbuffers--; + } else { + list_add(&rb->rb_list, &rbp->rbp_bufs); + rbp->rbp_credits++; + if (rbp->rbp_credits <= 0) + lnet_schedule_blocked_locked(rbp); + } + } + +routing_off: + if (msg->msg_peerrtrcredit) { + LASSERT(rxpeerni); + LASSERT(rxpeerni->lpni_peer_net); + LASSERT(rxpeerni->lpni_peer_net->lpn_peer); + + /* give back peer router credits */ + msg->msg_peerrtrcredit = 0; + + spin_lock(&rxpeerni->lpni_lock); + rxpeerni->lpni_rtrcredits++; + spin_unlock(&rxpeerni->lpni_lock); + + lp = rxpeerni->lpni_peer_net->lpn_peer; + spin_lock(&lp->lp_lock); + + /* drop all messages which are queued to be routed on that + * peer. */ + if (!the_lnet.ln_routing) { + LIST_HEAD(drop); + list_splice_init(&lp->lp_rtrq, &drop); + spin_unlock(&lp->lp_lock); + lnet_drop_routed_msgs_locked(&drop, msg->msg_rx_cpt); + } else if (!list_empty(&lp->lp_rtrq)) { + int msg2_cpt; + + msg2 = list_entry(lp->lp_rtrq.next, + struct lnet_msg, msg_list); + list_del(&msg2->msg_list); + msg2_cpt = msg2->msg_rx_cpt; + spin_unlock(&lp->lp_lock); + /* + * messages on the lp_rtrq can be from any NID in + * the peer, which means they might have different + * cpts. We need to make sure we lock the right + * one. + */ + if (msg2_cpt != msg->msg_rx_cpt) { + lnet_net_unlock(msg->msg_rx_cpt); + lnet_net_lock(msg2_cpt); + } + (void) lnet_post_routed_recv_locked(msg2, 1); + if (msg2_cpt != msg->msg_rx_cpt) { + lnet_net_unlock(msg2_cpt); + lnet_net_lock(msg->msg_rx_cpt); + } + } else { + spin_unlock(&lp->lp_lock); + } + } + if (rxni != NULL) { + msg->msg_rxni = NULL; + lnet_ni_decref_locked(rxni, msg->msg_rx_cpt); + } + if (rxpeerni != NULL) { + msg->msg_rxpeer = NULL; + lnet_peer_ni_decref_locked(rxpeerni); + } +} + +static struct lnet_peer_ni * +lnet_select_peer_ni(struct lnet_ni *best_ni, lnet_nid_t dst_nid, + struct lnet_peer *peer, + struct lnet_peer_ni *best_lpni, + struct lnet_peer_net *peer_net) +{ + /* + * Look at the peer NIs for the destination peer that connect + * to the chosen net. If a peer_ni is preferred when using the + * best_ni to communicate, we use that one. If there is no + * preferred peer_ni, or there are multiple preferred peer_ni, + * the available transmit credits are used. If the transmit + * credits are equal, we round-robin over the peer_ni. + */ + struct lnet_peer_ni *lpni = NULL; + int best_lpni_credits = (best_lpni) ? best_lpni->lpni_txcredits : + INT_MIN; + int best_lpni_healthv = (best_lpni) ? 
+ atomic_read(&best_lpni->lpni_healthv) : 0; + bool best_lpni_is_preferred = false; + bool lpni_is_preferred; + int lpni_healthv; + __u32 lpni_sel_prio; + __u32 best_sel_prio = LNET_MAX_SELECTION_PRIORITY; + + while ((lpni = lnet_get_next_peer_ni_locked(peer, peer_net, lpni))) { + /* + * if the best_ni we've chosen aleady has this lpni + * preferred, then let's use it + */ + if (best_ni) { + lpni_is_preferred = lnet_peer_is_pref_nid_locked( + lpni, &best_ni->ni_nid); + CDEBUG(D_NET, "%s lpni_is_preferred = %d\n", + libcfs_nidstr(&best_ni->ni_nid), + lpni_is_preferred); + } else { + lpni_is_preferred = false; + } + + lpni_healthv = atomic_read(&lpni->lpni_healthv); + lpni_sel_prio = lpni->lpni_sel_priority; + + if (best_lpni) + CDEBUG(D_NET, "n:[%s, %s] h:[%d, %d] p:[%d, %d] c:[%d, %d] s:[%d, %d]\n", + libcfs_nidstr(&lpni->lpni_nid), + libcfs_nidstr(&best_lpni->lpni_nid), + lpni_healthv, best_lpni_healthv, + lpni_sel_prio, best_sel_prio, + lpni->lpni_txcredits, best_lpni_credits, + lpni->lpni_seq, best_lpni->lpni_seq); + else + goto select_lpni; + + /* pick the healthiest peer ni */ + if (lpni_healthv < best_lpni_healthv) + continue; + else if (lpni_healthv > best_lpni_healthv) { + if (best_lpni_is_preferred) + best_lpni_is_preferred = false; + goto select_lpni; + } + + if (lpni_sel_prio > best_sel_prio) + continue; + else if (lpni_sel_prio < best_sel_prio) { + if (best_lpni_is_preferred) + best_lpni_is_preferred = false; + goto select_lpni; + } + + /* if this is a preferred peer use it */ + if (!best_lpni_is_preferred && lpni_is_preferred) { + best_lpni_is_preferred = true; + goto select_lpni; + } else if (best_lpni_is_preferred && !lpni_is_preferred) { + /* this is not the preferred peer so let's ignore + * it. + */ + continue; + } + + if (lpni->lpni_txcredits < best_lpni_credits) + /* We already have a peer that has more credits + * available than this one. No need to consider + * this peer further. + */ + continue; + else if (lpni->lpni_txcredits > best_lpni_credits) + goto select_lpni; + + /* The best peer found so far and the current peer + * have the same number of available credits let's + * make sure to select between them using Round Robin + */ + if (best_lpni && (best_lpni->lpni_seq <= lpni->lpni_seq)) + continue; +select_lpni: + best_lpni_is_preferred = lpni_is_preferred; + best_lpni_healthv = lpni_healthv; + best_sel_prio = lpni_sel_prio; + best_lpni = lpni; + best_lpni_credits = lpni->lpni_txcredits; + } + + /* if we still can't find a peer ni then we can't reach it */ + if (!best_lpni) { + __u32 net_id = (peer_net) ? peer_net->lpn_net_id : + LNET_NIDNET(dst_nid); + CDEBUG(D_NET, "no peer_ni found on peer net %s\n", + libcfs_net2str(net_id)); + return NULL; + } + + CDEBUG(D_NET, "sd_best_lpni = %s\n", + libcfs_nidstr(&best_lpni->lpni_nid)); + + return best_lpni; +} + +/* + * Prerequisite: the best_ni should already be set in the sd + * Find the best lpni. + * If the net id is provided then restrict lpni selection on + * that particular net. + * Otherwise find any reachable lpni. When dealing with an MR + * gateway and it has multiple lpnis which we can use + * we want to select the best one from the list of reachable + * ones. 
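+ *
+ * Both call patterns appear in this file; in sketch form (argument
+ * names are illustrative):
+ *
+ *	lpni = lnet_find_best_lpni(best_ni, dst_nid, peer, net_id);
+ *	lpni = lnet_find_best_lpni(NULL, LNET_NID_ANY, gw, LNET_NET_ANY);
+ *
+ * where a concrete net_id pins the search to one peer net and
+ * LNET_NET_ANY widens it to every reachable one.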
+ */ +static inline struct lnet_peer_ni * +lnet_find_best_lpni(struct lnet_ni *lni, lnet_nid_t dst_nid, + struct lnet_peer *peer, __u32 net_id) +{ + struct lnet_peer_net *peer_net; + + /* find the best_lpni on any local network */ + if (net_id == LNET_NET_ANY) { + struct lnet_peer_ni *best_lpni = NULL; + struct lnet_peer_net *lpn; + list_for_each_entry(lpn, &peer->lp_peer_nets, lpn_peer_nets) { + /* no net specified find any reachable peer ni */ + if (!lnet_islocalnet_locked(lpn->lpn_net_id)) + continue; + best_lpni = lnet_select_peer_ni(lni, dst_nid, peer, + best_lpni, lpn); + } + + return best_lpni; + } + /* restrict on the specified net */ + peer_net = lnet_peer_get_net_locked(peer, net_id); + if (peer_net) + return lnet_select_peer_ni(lni, dst_nid, peer, NULL, peer_net); + + return NULL; +} + +static int +lnet_compare_gw_lpnis(struct lnet_peer_ni *lpni1, struct lnet_peer_ni *lpni2) +{ + if (lpni1->lpni_txqnob < lpni2->lpni_txqnob) + return 1; + + if (lpni1->lpni_txqnob > lpni2->lpni_txqnob) + return -1; + + if (lpni1->lpni_txcredits > lpni2->lpni_txcredits) + return 1; + + if (lpni1->lpni_txcredits < lpni2->lpni_txcredits) + return -1; + + return 0; +} + +/* Compare route priorities and hop counts */ +static int +lnet_compare_routes(struct lnet_route *r1, struct lnet_route *r2) +{ + int r1_hops = (r1->lr_hops == LNET_UNDEFINED_HOPS) ? 1 : r1->lr_hops; + int r2_hops = (r2->lr_hops == LNET_UNDEFINED_HOPS) ? 1 : r2->lr_hops; + + if (r1->lr_priority < r2->lr_priority) + return 1; + + if (r1->lr_priority > r2->lr_priority) + return -1; + + if (r1_hops < r2_hops) + return 1; + + if (r1_hops > r2_hops) + return -1; + + return 0; +} + +static struct lnet_route * +lnet_find_route_locked(struct lnet_remotenet *rnet, __u32 src_net, + struct lnet_peer_ni *remote_lpni, + struct lnet_route **prev_route, + struct lnet_peer_ni **gwni) +{ + struct lnet_peer_ni *lpni, *best_gw_ni = NULL; + struct lnet_route *best_route; + struct lnet_route *last_route; + struct lnet_route *route; + int rc; + bool best_rte_is_preferred = false; + struct lnet_nid *gw_pnid; + + CDEBUG(D_NET, "Looking up a route to %s, from %s\n", + libcfs_net2str(rnet->lrn_net), libcfs_net2str(src_net)); + + best_route = last_route = NULL; + list_for_each_entry(route, &rnet->lrn_routes, lr_list) { + if (!lnet_is_route_alive(route)) + continue; + gw_pnid = &route->lr_gateway->lp_primary_nid; + + /* no protection on below fields, but it's harmless */ + if (last_route && (last_route->lr_seq - route->lr_seq < 0)) + last_route = route; + + /* if the best route found is in the preferred list then + * tag it as preferred and use it later on. But if we + * didn't find any routes which are on the preferred list + * then just use the best route possible. + */ + rc = lnet_peer_is_pref_rtr_locked(remote_lpni, gw_pnid); + + if (!best_route || (rc && !best_rte_is_preferred)) { + /* Restrict the selection of the router NI on the + * src_net provided. If the src_net is LNET_NID_ANY, + * then select the best interface available. 
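+			 * (src_net is a net number, so "any" here
+			 * corresponds to LNET_NET_ANY, which is what
+			 * lnet_find_best_lpni() tests for.)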
+ */ + lpni = lnet_find_best_lpni(NULL, LNET_NID_ANY, + route->lr_gateway, + src_net); + if (!lpni) { + CDEBUG(D_NET, + "Gateway %s does not have a peer NI on net %s\n", + libcfs_nidstr(gw_pnid), + libcfs_net2str(src_net)); + continue; + } + } + + if (rc && !best_rte_is_preferred) { + /* This is the first preferred route we found, + * so it beats any route found previously + */ + best_route = route; + if (!last_route) + last_route = route; + best_gw_ni = lpni; + best_rte_is_preferred = true; + CDEBUG(D_NET, "preferred gw = %s\n", + libcfs_nidstr(gw_pnid)); + continue; + } else if ((!rc) && best_rte_is_preferred) + /* The best route we found so far is in the preferred + * list, so it beats any non-preferred route + */ + continue; + + if (!best_route) { + best_route = last_route = route; + best_gw_ni = lpni; + continue; + } + + rc = lnet_compare_routes(route, best_route); + if (rc == -1) + continue; + + /* Restrict the selection of the router NI on the + * src_net provided. If the src_net is LNET_NID_ANY, + * then select the best interface available. + */ + lpni = lnet_find_best_lpni(NULL, LNET_NID_ANY, + route->lr_gateway, + src_net); + if (!lpni) { + CDEBUG(D_NET, + "Gateway %s does not have a peer NI on net %s\n", + libcfs_nidstr(gw_pnid), + libcfs_net2str(src_net)); + continue; + } + + if (rc == 1) { + best_route = route; + best_gw_ni = lpni; + continue; + } + + rc = lnet_compare_gw_lpnis(lpni, best_gw_ni); + if (rc == -1) + continue; + + if (rc == 1 || route->lr_seq <= best_route->lr_seq) { + best_route = route; + best_gw_ni = lpni; + continue; + } + } + + *prev_route = last_route; + *gwni = best_gw_ni; + + return best_route; +} + +static inline unsigned int +lnet_dev_prio_of_md(struct lnet_ni *ni, unsigned int dev_idx) +{ + if (dev_idx == UINT_MAX) + return UINT_MAX; + + if (!ni || !ni->ni_net || !ni->ni_net->net_lnd || + !ni->ni_net->net_lnd->lnd_get_dev_prio) + return UINT_MAX; + + return ni->ni_net->net_lnd->lnd_get_dev_prio(ni, dev_idx); +} + +static struct lnet_ni * +lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni, + struct lnet_peer *peer, struct lnet_peer_net *peer_net, + struct lnet_msg *msg, int md_cpt) +{ + struct lnet_libmd *md = msg->msg_md; + unsigned int offset = msg->msg_offset; + unsigned int shortest_distance; + struct lnet_ni *ni = NULL; + int best_credits; + int best_healthv; + __u32 best_sel_prio; + unsigned int best_dev_prio; + unsigned int dev_idx = UINT_MAX; + struct page *page = lnet_get_first_page(md, offset); + msg->msg_rdma_force = lnet_is_rdma_only_page(page); + + if (msg->msg_rdma_force) + dev_idx = lnet_get_dev_idx(page); + + /* + * If there is no peer_ni that we can send to on this network, + * then there is no point in looking for a new best_ni here. 
+ */ + if (!lnet_get_next_peer_ni_locked(peer, peer_net, NULL)) + return best_ni; + + if (best_ni == NULL) { + best_sel_prio = LNET_MAX_SELECTION_PRIORITY; + shortest_distance = UINT_MAX; + best_dev_prio = UINT_MAX; + best_credits = INT_MIN; + best_healthv = 0; + } else { + best_dev_prio = lnet_dev_prio_of_md(best_ni, dev_idx); + shortest_distance = cfs_cpt_distance(lnet_cpt_table(), md_cpt, + best_ni->ni_dev_cpt); + best_credits = atomic_read(&best_ni->ni_tx_credits); + best_healthv = atomic_read(&best_ni->ni_healthv); + best_sel_prio = best_ni->ni_sel_priority; + } + + while ((ni = lnet_get_next_ni_locked(local_net, ni))) { + unsigned int distance; + int ni_credits; + int ni_healthv; + int ni_fatal; + __u32 ni_sel_prio; + unsigned int ni_dev_prio; + + ni_credits = atomic_read(&ni->ni_tx_credits); + ni_healthv = atomic_read(&ni->ni_healthv); + ni_fatal = atomic_read(&ni->ni_fatal_error_on); + ni_sel_prio = ni->ni_sel_priority; + + /* + * calculate the distance from the CPT on which + * the message memory is allocated to the CPT of + * the NI's physical device + */ + distance = cfs_cpt_distance(lnet_cpt_table(), + md_cpt, + ni->ni_dev_cpt); + + ni_dev_prio = lnet_dev_prio_of_md(ni, dev_idx); + + /* + * All distances smaller than the NUMA range + * are treated equally. + */ + if (distance < lnet_numa_range) + distance = lnet_numa_range; + + /* + * Select on health, selection policy, direct dma prio, + * shorter distance, available credits, then round-robin. + */ + if (ni_fatal) + continue; + + if (best_ni) + CDEBUG(D_NET, "compare ni %s [c:%d, d:%d, s:%d, p:%u, g:%u, h:%d] with best_ni %s [c:%d, d:%d, s:%d, p:%u, g:%u, h:%d]\n", + libcfs_nidstr(&ni->ni_nid), ni_credits, distance, + ni->ni_seq, ni_sel_prio, ni_dev_prio, ni_healthv, + (best_ni) ? libcfs_nidstr(&best_ni->ni_nid) + : "not selected", best_credits, shortest_distance, + (best_ni) ? best_ni->ni_seq : 0, + best_sel_prio, best_dev_prio, best_healthv); + else + goto select_ni; + + if (ni_healthv < best_healthv) + continue; + else if (ni_healthv > best_healthv) + goto select_ni; + + if (ni_sel_prio > best_sel_prio) + continue; + else if (ni_sel_prio < best_sel_prio) + goto select_ni; + + if (ni_dev_prio > best_dev_prio) + continue; + else if (ni_dev_prio < best_dev_prio) + goto select_ni; + + if (distance > shortest_distance) + continue; + else if (distance < shortest_distance) + goto select_ni; + + if (ni_credits < best_credits) + continue; + else if (ni_credits > best_credits) + goto select_ni; + + if (best_ni && best_ni->ni_seq <= ni->ni_seq) + continue; + +select_ni: + best_sel_prio = ni_sel_prio; + best_dev_prio = ni_dev_prio; + shortest_distance = distance; + best_healthv = ni_healthv; + best_ni = ni; + best_credits = ni_credits; + } + + CDEBUG(D_NET, "selected best_ni %s\n", + (best_ni) ? libcfs_nidstr(&best_ni->ni_nid) : "no selection"); + + return best_ni; +} + +static bool +lnet_reserved_msg(struct lnet_msg *msg) +{ + if (msg->msg_type == LNET_MSG_PUT) { + if (msg->msg_hdr.msg.put.ptl_index == LNET_RESERVED_PORTAL) + return true; + } else if (msg->msg_type == LNET_MSG_GET) { + if (msg->msg_hdr.msg.get.ptl_index == LNET_RESERVED_PORTAL) + return true; + } + return false; +} + +/* + * Traffic to the LNET_RESERVED_PORTAL may not trigger peer discovery, + * because such traffic is required to perform discovery. We therefore + * exclude all GET and PUT on that portal. We also exclude all ACK and + * REPLY traffic, but that is because the portal is not tracked in the + * message structure for these message types. 
We could restrict this + * further by also checking for LNET_PROTO_PING_MATCHBITS. + */ +static bool +lnet_msg_discovery(struct lnet_msg *msg) +{ + return !(lnet_reserved_msg(msg) || lnet_msg_is_response(msg)); +} + +#define SRC_SPEC 0x0001 +#define SRC_ANY 0x0002 +#define LOCAL_DST 0x0004 +#define REMOTE_DST 0x0008 +#define MR_DST 0x0010 +#define NMR_DST 0x0020 +#define SND_RESP 0x0040 + +/* The following to defines are used for return codes */ +#define REPEAT_SEND 0x1000 +#define PASS_THROUGH 0x2000 + +/* The different cases lnet_select pathway needs to handle */ +#define SRC_SPEC_LOCAL_MR_DST (SRC_SPEC | LOCAL_DST | MR_DST) +#define SRC_SPEC_ROUTER_MR_DST (SRC_SPEC | REMOTE_DST | MR_DST) +#define SRC_SPEC_LOCAL_NMR_DST (SRC_SPEC | LOCAL_DST | NMR_DST) +#define SRC_SPEC_ROUTER_NMR_DST (SRC_SPEC | REMOTE_DST | NMR_DST) +#define SRC_ANY_LOCAL_MR_DST (SRC_ANY | LOCAL_DST | MR_DST) +#define SRC_ANY_ROUTER_MR_DST (SRC_ANY | REMOTE_DST | MR_DST) +#define SRC_ANY_LOCAL_NMR_DST (SRC_ANY | LOCAL_DST | NMR_DST) +#define SRC_ANY_ROUTER_NMR_DST (SRC_ANY | REMOTE_DST | NMR_DST) + +static int +lnet_handle_lo_send(struct lnet_send_data *sd) +{ + struct lnet_msg *msg = sd->sd_msg; + int cpt = sd->sd_cpt; + + if (the_lnet.ln_state != LNET_STATE_RUNNING) + return -ESHUTDOWN; + + /* No send credit hassles with LOLND */ + lnet_ni_addref_locked(the_lnet.ln_loni, cpt); + msg->msg_hdr.dest_nid = the_lnet.ln_loni->ni_nid; + if (!msg->msg_routing) + msg->msg_hdr.src_nid = the_lnet.ln_loni->ni_nid; + msg->msg_target.nid = the_lnet.ln_loni->ni_nid; + lnet_msg_commit(msg, cpt); + msg->msg_txni = the_lnet.ln_loni; + + return LNET_CREDIT_OK; +} + +static int +lnet_handle_send(struct lnet_send_data *sd) +{ + struct lnet_ni *best_ni = sd->sd_best_ni; + struct lnet_peer_ni *best_lpni = sd->sd_best_lpni; + struct lnet_peer_ni *final_dst_lpni = sd->sd_final_dst_lpni; + struct lnet_msg *msg = sd->sd_msg; + int cpt2; + __u32 send_case = sd->sd_send_case; + int rc; + __u32 routing = send_case & REMOTE_DST; + struct lnet_rsp_tracker *rspt; + + /* Increment sequence number of the selected peer, peer net, + * local ni and local net so that we pick the next ones + * in Round Robin. + */ + best_lpni->lpni_peer_net->lpn_seq++; + best_lpni->lpni_seq = best_lpni->lpni_peer_net->lpn_seq; + best_ni->ni_net->net_seq++; + best_ni->ni_seq = best_ni->ni_net->net_seq; + + CDEBUG(D_NET, "%s NI seq info: [%d:%d:%d:%u] %s LPNI seq info [%d:%d:%d:%u]\n", + libcfs_nidstr(&best_ni->ni_nid), + best_ni->ni_seq, best_ni->ni_net->net_seq, + atomic_read(&best_ni->ni_tx_credits), + best_ni->ni_sel_priority, + libcfs_nidstr(&best_lpni->lpni_nid), + best_lpni->lpni_seq, best_lpni->lpni_peer_net->lpn_seq, + best_lpni->lpni_txcredits, + best_lpni->lpni_sel_priority); + + /* + * grab a reference on the peer_ni so it sticks around even if + * we need to drop and relock the lnet_net_lock below. + */ + lnet_peer_ni_addref_locked(best_lpni); + + /* + * Use lnet_cpt_of_nid() to determine the CPT used to commit the + * message. This ensures that we get a CPT that is correct for + * the NI when the NI has been restricted to a subset of all CPTs. + * If the selected CPT differs from the one currently locked, we + * must unlock and relock the lnet_net_lock(), and then check whether + * the configuration has changed. We don't have a hold on the best_ni + * yet, and it may have vanished. 
+ */ + cpt2 = lnet_cpt_of_nid_locked(&best_lpni->lpni_nid, best_ni); + if (sd->sd_cpt != cpt2) { + __u32 seq = lnet_get_dlc_seq_locked(); + lnet_net_unlock(sd->sd_cpt); + sd->sd_cpt = cpt2; + lnet_net_lock(sd->sd_cpt); + if (seq != lnet_get_dlc_seq_locked()) { + lnet_peer_ni_decref_locked(best_lpni); + return REPEAT_SEND; + } + } + + /* + * store the best_lpni in the message right away to avoid having + * to do the same operation under different conditions + */ + msg->msg_txpeer = best_lpni; + msg->msg_txni = best_ni; + + /* + * grab a reference for the best_ni since now it's in use in this + * send. The reference will be dropped in lnet_finalize() + */ + lnet_ni_addref_locked(msg->msg_txni, sd->sd_cpt); + + /* + * Always set the target.nid to the best peer picked. Either the + * NID will be one of the peer NIDs selected, or the same NID as + * what was originally set in the target or it will be the NID of + * a router if this message should be routed + */ + msg->msg_target.nid = msg->msg_txpeer->lpni_nid; + + /* + * lnet_msg_commit assigns the correct cpt to the message, which + * is used to decrement the correct refcount on the ni when it's + * time to return the credits + */ + lnet_msg_commit(msg, sd->sd_cpt); + + /* + * If we are routing the message then we keep the src_nid that was + * set by the originator. If we are not routing then we are the + * originator and set it here. + */ + if (!msg->msg_routing) + msg->msg_hdr.src_nid = msg->msg_txni->ni_nid; + + if (routing) { + msg->msg_target_is_router = 1; + msg->msg_target.pid = LNET_PID_LUSTRE; + /* + * since we're routing we want to ensure that the + * msg_hdr.dest_nid is set to the final destination. When + * the router receives this message it knows how to route + * it. + * + * final_dst_lpni is set at the beginning of the + * lnet_select_pathway() function and is never changed. + * It's safe to use it here. + */ + msg->msg_hdr.dest_nid = final_dst_lpni->lpni_nid; + } else { + /* + * if we're not routing set the dest_nid to the best peer + * ni NID that we picked earlier in the algorithm. 
+ */ + msg->msg_hdr.dest_nid = msg->msg_txpeer->lpni_nid; + } + + /* + * if we have response tracker block update it with the next hop + * nid + */ + if (msg->msg_md) { + rspt = msg->msg_md->md_rspt_ptr; + if (rspt) { + rspt->rspt_next_hop_nid = + msg->msg_txpeer->lpni_nid; + CDEBUG(D_NET, "rspt_next_hop_nid = %s\n", + libcfs_nidstr(&rspt->rspt_next_hop_nid)); + } + } + + rc = lnet_post_send_locked(msg, 0); + + if (!rc) + CDEBUG(D_NET, "TRACE: %s(%s:%s) -> %s(%s:%s) %s : %s try# %d\n", + libcfs_nidstr(&msg->msg_hdr.src_nid), + libcfs_nidstr(&msg->msg_txni->ni_nid), + libcfs_nidstr(&sd->sd_src_nid), + libcfs_nidstr(&msg->msg_hdr.dest_nid), + libcfs_nidstr(&sd->sd_dst_nid), + libcfs_nidstr(&msg->msg_txpeer->lpni_nid), + libcfs_nidstr(&sd->sd_rtr_nid), + lnet_msgtyp2str(msg->msg_type), msg->msg_retry_count); + + return rc; +} + +static inline void +lnet_set_non_mr_pref_nid(struct lnet_peer_ni *lpni, struct lnet_ni *lni, + struct lnet_msg *msg) +{ + if (!lnet_peer_is_multi_rail(lpni->lpni_peer_net->lpn_peer) && + !lnet_msg_is_response(msg) && lpni->lpni_pref_nnids == 0) { + CDEBUG(D_NET, "Setting preferred local NID %s on NMR peer %s\n", + libcfs_nidstr(&lni->ni_nid), + libcfs_nidstr(&lpni->lpni_nid)); + lnet_peer_ni_set_non_mr_pref_nid(lpni, &lni->ni_nid); + } +} + +/* + * Source Specified + * Local Destination + * non-mr peer + * + * use the source and destination NIDs as the pathway + */ +static int +lnet_handle_spec_local_nmr_dst(struct lnet_send_data *sd) +{ + /* the destination lpni is set before we get here. */ + + /* find local NI */ + sd->sd_best_ni = lnet_nid_to_ni_locked(&sd->sd_src_nid, sd->sd_cpt); + if (!sd->sd_best_ni) { + CERROR("Can't send to %s: src %s is not a local nid\n", + libcfs_nidstr(&sd->sd_dst_nid), + libcfs_nidstr(&sd->sd_src_nid)); + return -EINVAL; + } + + lnet_set_non_mr_pref_nid(sd->sd_best_lpni, sd->sd_best_ni, sd->sd_msg); + + return lnet_handle_send(sd); +} + +/* + * Source Specified + * Local Destination + * MR Peer + * + * Don't run the selection algorithm on the peer NIs. By specifying the + * local NID, we're also saying that we should always use the destination NID + * provided. This handles the case where we should be using the same + * destination NID for the all the messages which belong to the same RPC + * request. + */ +static int +lnet_handle_spec_local_mr_dst(struct lnet_send_data *sd) +{ + sd->sd_best_ni = lnet_nid_to_ni_locked(&sd->sd_src_nid, sd->sd_cpt); + if (!sd->sd_best_ni) { + CERROR("Can't send to %s: src %s is not a local nid\n", + libcfs_nidstr(&sd->sd_dst_nid), + libcfs_nidstr(&sd->sd_src_nid)); + return -EINVAL; + } + + if (sd->sd_best_lpni && + nid_same(&sd->sd_best_lpni->lpni_nid, + &the_lnet.ln_loni->ni_nid)) + return lnet_handle_lo_send(sd); + else if (sd->sd_best_lpni) + return lnet_handle_send(sd); + + CERROR("can't send to %s. no NI on %s\n", + libcfs_nidstr(&sd->sd_dst_nid), + libcfs_net2str(sd->sd_best_ni->ni_net->net_id)); + + return -EHOSTUNREACH; +} + +struct lnet_ni * +lnet_find_best_ni_on_spec_net(struct lnet_ni *cur_best_ni, + struct lnet_peer *peer, + struct lnet_peer_net *peer_net, + struct lnet_msg *msg, + int cpt) +{ + struct lnet_net *local_net; + struct lnet_ni *best_ni; + + local_net = lnet_get_net_locked(peer_net->lpn_net_id); + if (!local_net) + return NULL; + + /* + * Iterate through the NIs in this local Net and select + * the NI to send from. The selection is determined by + * these 3 criterion in the following priority: + * 1. NUMA + * 2. NI available credits + * 3. 
Round Robin + */ + best_ni = lnet_get_best_ni(local_net, cur_best_ni, + peer, peer_net, msg, cpt); + + return best_ni; +} + +static int +lnet_initiate_peer_discovery(struct lnet_peer_ni *lpni, struct lnet_msg *msg, + int cpt) +{ + struct lnet_peer *peer; + struct lnet_peer_ni *new_lpni; + int rc; + + lnet_peer_ni_addref_locked(lpni); + + peer = lpni->lpni_peer_net->lpn_peer; + + if (lnet_peer_gw_discovery(peer)) { + lnet_peer_ni_decref_locked(lpni); + return 0; + } + + if (!lnet_msg_discovery(msg) || lnet_peer_is_uptodate(peer)) { + lnet_peer_ni_decref_locked(lpni); + return 0; + } + + rc = lnet_discover_peer_locked(lpni, cpt, false); + if (rc) { + lnet_peer_ni_decref_locked(lpni); + return rc; + } + + new_lpni = lnet_find_peer_ni_locked(lnet_nid_to_nid4(&lpni->lpni_nid)); + if (!new_lpni) { + lnet_peer_ni_decref_locked(lpni); + return -ENOENT; + } + + peer = new_lpni->lpni_peer_net->lpn_peer; + spin_lock(&peer->lp_lock); + if (lpni == new_lpni && lnet_peer_is_uptodate_locked(peer)) { + /* The peer NI did not change and the peer is up to date. + * Nothing more to do. + */ + spin_unlock(&peer->lp_lock); + lnet_peer_ni_decref_locked(lpni); + lnet_peer_ni_decref_locked(new_lpni); + return 0; + } + spin_unlock(&peer->lp_lock); + + /* Either the peer NI changed during discovery, or the peer isn't up + * to date. In both cases we want to queue the message on the + * (possibly new) peer's pending queue and queue the peer for discovery + */ + msg->msg_sending = 0; + msg->msg_txpeer = NULL; + lnet_net_unlock(cpt); + lnet_peer_queue_message(peer, msg); + lnet_net_lock(cpt); + + lnet_peer_ni_decref_locked(lpni); + lnet_peer_ni_decref_locked(new_lpni); + + CDEBUG(D_NET, "msg %p delayed. %s pending discovery\n", + msg, libcfs_nidstr(&peer->lp_primary_nid)); + + return LNET_DC_WAIT; +} + +static int +lnet_handle_find_routed_path(struct lnet_send_data *sd, + struct lnet_nid *dst_nid, + struct lnet_peer_ni **gw_lpni, + struct lnet_peer **gw_peer) +{ + int rc; + struct lnet_peer *gw; + struct lnet_peer *lp; + struct lnet_peer_net *lpn; + struct lnet_peer_net *best_lpn = NULL; + struct lnet_remotenet *rnet, *best_rnet = NULL; + struct lnet_route *best_route = NULL; + struct lnet_route *last_route = NULL; + struct lnet_peer_ni *lpni = NULL; + struct lnet_peer_ni *gwni = NULL; + bool route_found = false; + struct lnet_nid *src_nid = + !LNET_NID_IS_ANY(&sd->sd_src_nid) || !sd->sd_best_ni + ? &sd->sd_src_nid + : &sd->sd_best_ni->ni_nid; + int best_lpn_healthv = 0; + __u32 best_lpn_sel_prio = LNET_MAX_SELECTION_PRIORITY; + + CDEBUG(D_NET, "using src nid %s for route restriction\n", + src_nid ? libcfs_nidstr(src_nid) : "ANY"); + + /* If a router nid was specified then we are replying to a GET or + * sending an ACK. In this case we use the gateway associated with the + * specified router nid. + */ + if (!LNET_NID_IS_ANY(&sd->sd_rtr_nid)) { + gwni = lnet_peer_ni_find_locked(&sd->sd_rtr_nid); + if (gwni) { + gw = gwni->lpni_peer_net->lpn_peer; + lnet_peer_ni_decref_locked(gwni); + if (gw->lp_rtr_refcount) + route_found = true; + } else { + CWARN("No peer NI for gateway %s. 
Attempting to find an alternative route.\n", + libcfs_nidstr(&sd->sd_rtr_nid)); + } + } + + if (!route_found) { + if (sd->sd_msg->msg_routing || (src_nid && !LNET_NID_IS_ANY(src_nid))) { + /* If I'm routing this message then I need to find the + * next hop based on the destination NID + * + * We also find next hop based on the destination NID + * if the source NI was specified + */ + best_rnet = lnet_find_rnet_locked(LNET_NID_NET(&sd->sd_dst_nid)); + if (!best_rnet) { + CERROR("Unable to send message from %s to %s - Route table may be misconfigured\n", + (src_nid && LNET_NID_IS_ANY(src_nid)) ? + "any local NI" : + libcfs_nidstr(src_nid), + libcfs_nidstr(&sd->sd_dst_nid)); + return -EHOSTUNREACH; + } + } else { + /* we've already looked up the initial lpni using + * dst_nid + */ + lpni = sd->sd_best_lpni; + /* the peer tree must be in existence */ + LASSERT(lpni && lpni->lpni_peer_net && + lpni->lpni_peer_net->lpn_peer); + lp = lpni->lpni_peer_net->lpn_peer; + + list_for_each_entry(lpn, &lp->lp_peer_nets, lpn_peer_nets) { + /* is this remote network reachable? */ + rnet = lnet_find_rnet_locked(lpn->lpn_net_id); + if (!rnet) + continue; + + if (!best_lpn) { + best_lpn = lpn; + best_rnet = rnet; + } + + /* select the preferred peer net */ + if (best_lpn_healthv > lpn->lpn_healthv) + continue; + else if (best_lpn_healthv < lpn->lpn_healthv) + goto use_lpn; + + if (best_lpn_sel_prio < lpn->lpn_sel_priority) + continue; + else if (best_lpn_sel_prio > lpn->lpn_sel_priority) + goto use_lpn; + + if (best_lpn->lpn_seq <= lpn->lpn_seq) + continue; +use_lpn: + best_lpn_healthv = lpn->lpn_healthv; + best_lpn_sel_prio = lpn->lpn_sel_priority; + best_lpn = lpn; + best_rnet = rnet; + } + + if (!best_lpn) { + CERROR("peer %s has no available nets\n", + libcfs_nidstr(&sd->sd_dst_nid)); + return -EHOSTUNREACH; + } + + sd->sd_best_lpni = lnet_find_best_lpni(sd->sd_best_ni, + lnet_nid_to_nid4(&sd->sd_dst_nid), + lp, + best_lpn->lpn_net_id); + if (!sd->sd_best_lpni) { + CERROR("peer %s is unreachable\n", + libcfs_nidstr(&sd->sd_dst_nid)); + return -EHOSTUNREACH; + } + + /* We're attempting to round robin over the remote peer + * NI's so update the final destination we selected + */ + sd->sd_final_dst_lpni = sd->sd_best_lpni; + + /* Increment the sequence number of the remote lpni so + * we can round robin over the different interfaces of + * the remote lpni + */ + sd->sd_best_lpni->lpni_seq++; + } + + /* + * find the best route. Restrict the selection on the net of the + * local NI if we've already picked the local NI to send from. + * Otherwise, let's pick any route we can find and then find + * a local NI we can reach the route's gateway on. Any route we + * select will be reachable by virtue of the restriction we have + * when adding a route. + */ + best_route = lnet_find_route_locked(best_rnet, + LNET_NID_NET(src_nid), + sd->sd_best_lpni, + &last_route, &gwni); + + if (!best_route) { + CERROR("no route to %s from %s\n", + libcfs_nidstr(dst_nid), + libcfs_nidstr(src_nid)); + return -EHOSTUNREACH; + } + + if (!gwni) { + CERROR("Internal Error. Route expected to %s from %s\n", + libcfs_nidstr(dst_nid), + libcfs_nidstr(src_nid)); + return -EFAULT; + } + + gw = best_route->lr_gateway; + LASSERT(gw == gwni->lpni_peer_net->lpn_peer); + } + + /* + * If the router checker is not active then discover the gateway here. 
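The best_lpn loop above encodes a strict three-key preference: peer-net health first, then the administratively configured selection priority (lower value wins), then the sequence number, which implements round robin by preferring the net used least recently. A hypothetical standalone comparator restating that ordering, with invented type names:

    struct lpn_key {
            int           healthv;   /* higher = healthier */
            unsigned int  sel_prio;  /* lower value = more preferred */
            unsigned long seq;       /* bumped each time the net is used */
    };

    /* Returns nonzero if cand should replace best, mirroring the
     * continue / goto use_lpn cascade in the loop above. */
    static int lpn_prefer(const struct lpn_key *best,
                          const struct lpn_key *cand)
    {
            if (cand->healthv != best->healthv)
                    return cand->healthv > best->healthv;
            if (cand->sel_prio != best->sel_prio)
                    return cand->sel_prio < best->sel_prio;
            return cand->seq < best->seq;   /* strict: a tie keeps best */
    }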
+ * This ensures we are able to take advantage of multi-rail routing, but
+ * if the router checker is active then we do not unnecessarily delay
+ * messages while the gateway is being checked by the dedicated monitor
+ * thread.
+ *
+ * NB: We're only checking the alive_router_check_interval here, rather
+ * than calling lnet_router_checker_active(), because the other
+ * conditions that are checked by that function are either
+ * irrelevant (the_lnet.ln_routing) or must be true (list of routers
+ * is not empty)
+ */
+ if (alive_router_check_interval <= 0) {
+ rc = lnet_initiate_peer_discovery(gwni, sd->sd_msg, sd->sd_cpt);
+ if (rc)
+ return rc;
+ }
+
+ if (!sd->sd_best_ni) {
+ lpn = gwni->lpni_peer_net;
+ sd->sd_best_ni = lnet_find_best_ni_on_spec_net(NULL, gw, lpn,
+ sd->sd_msg,
+ sd->sd_md_cpt);
+ if (!sd->sd_best_ni) {
+ CERROR("Internal Error. Expected local ni on %s but none found: %s\n",
+ libcfs_net2str(lpn->lpn_net_id),
+ libcfs_nidstr(&sd->sd_src_nid));
+ return -EFAULT;
+ }
+ }
+
+ *gw_lpni = gwni;
+ *gw_peer = gw;
+
+ /*
+ * increment the sequence numbers since now we're sure we're
+ * going to use this path
+ */
+ if (LNET_NID_IS_ANY(&sd->sd_rtr_nid)) {
+ LASSERT(best_route && last_route);
+ best_route->lr_seq = last_route->lr_seq + 1;
+ if (best_lpn)
+ best_lpn->lpn_seq++;
+ }
+
+ return 0;
+}
+
+/*
+ * Handle two cases:
+ *
+ * Case 1:
+ * Source specified
+ * Remote destination
+ * Non-MR destination
+ *
+ * Case 2:
+ * Source specified
+ * Remote destination
+ * MR destination
+ *
+ * The handling of these two cases is similar. Even though the destination
+ * can be MR or non-MR, we'll deal directly with the router.
+ */
+static int
+lnet_handle_spec_router_dst(struct lnet_send_data *sd)
+{
+ int rc;
+ struct lnet_peer_ni *gw_lpni = NULL;
+ struct lnet_peer *gw_peer = NULL;
+
+ /* find local NI */
+ sd->sd_best_ni = lnet_nid_to_ni_locked(&sd->sd_src_nid, sd->sd_cpt);
+ if (!sd->sd_best_ni) {
+ CERROR("Can't send to %s: src %s is not a local nid\n",
+ libcfs_nidstr(&sd->sd_dst_nid),
+ libcfs_nidstr(&sd->sd_src_nid));
+ return -EINVAL;
+ }
+
+ rc = lnet_handle_find_routed_path(sd, &sd->sd_dst_nid,
+ &gw_lpni, &gw_peer);
+ if (rc)
+ return rc;
+
+ if (sd->sd_send_case & NMR_DST)
+ /*
+ * since the final destination is non-MR let's set its preferred
+ * NID before we send
+ */
+ lnet_set_non_mr_pref_nid(sd->sd_best_lpni, sd->sd_best_ni,
+ sd->sd_msg);
+
+ /*
+ * We're going to send to the gw found so let's set its
+ * info
+ */
+ sd->sd_peer = gw_peer;
+ sd->sd_best_lpni = gw_lpni;
+
+ return lnet_handle_send(sd);
+}
+
+struct lnet_ni *
+lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt,
+ struct lnet_msg *msg, bool discovery)
+{
+ struct lnet_peer_net *lpn = NULL;
+ struct lnet_peer_net *best_lpn = NULL;
+ struct lnet_net *net = NULL;
+ struct lnet_net *best_net = NULL;
+ struct lnet_ni *best_ni = NULL;
+ int best_lpn_healthv = 0;
+ int best_net_healthv = 0;
+ int net_healthv;
+ __u32 best_lpn_sel_prio = LNET_MAX_SELECTION_PRIORITY;
+ __u32 lpn_sel_prio;
+ __u32 best_net_sel_prio = LNET_MAX_SELECTION_PRIORITY;
+ __u32 net_sel_prio;
+ bool exit = false;
+
+ /*
+ * The peer can have multiple interfaces, some of them can be on
+ * the local network and others on a routed network. We should
+ * prefer the local network.
However if the local network is not + * available then we need to try the routed network + */ + + /* go through all the peer nets and find the best_ni */ + list_for_each_entry(lpn, &peer->lp_peer_nets, lpn_peer_nets) { + /* + * The peer's list of nets can contain non-local nets. We + * want to only examine the local ones. + */ + net = lnet_get_net_locked(lpn->lpn_net_id); + if (!net) + continue; + + lpn_sel_prio = lpn->lpn_sel_priority; + net_healthv = lnet_get_net_healthv_locked(net); + net_sel_prio = net->net_sel_priority; + + /* + * if this is a discovery message and lp_disc_net_id is + * specified then use that net to send the discovery on. + */ + if (peer->lp_disc_net_id == lpn->lpn_net_id && + discovery) { + exit = true; + goto select_lpn; + } + + if (!best_lpn) + goto select_lpn; + + /* always select the lpn with the best health */ + if (best_lpn_healthv > lpn->lpn_healthv) + continue; + else if (best_lpn_healthv < lpn->lpn_healthv) + goto select_lpn; + + /* select the preferred peer and local nets */ + if (best_lpn_sel_prio < lpn_sel_prio) + continue; + else if (best_lpn_sel_prio > lpn_sel_prio) + goto select_lpn; + + if (best_net_healthv > net_healthv) + continue; + else if (best_net_healthv < net_healthv) + goto select_lpn; + + if (best_net_sel_prio < net_sel_prio) + continue; + else if (best_net_sel_prio > net_sel_prio) + goto select_lpn; + + if (best_lpn->lpn_seq < lpn->lpn_seq) + continue; + else if (best_lpn->lpn_seq > lpn->lpn_seq) + goto select_lpn; + + /* round robin over the local networks */ + if (best_net->net_seq <= net->net_seq) + continue; + +select_lpn: + best_net_healthv = net_healthv; + best_net_sel_prio = net_sel_prio; + best_lpn_healthv = lpn->lpn_healthv; + best_lpn_sel_prio = lpn_sel_prio; + best_lpn = lpn; + best_net = net; + + if (exit) + break; + } + + if (best_lpn) { + /* Select the best NI on the same net as best_lpn chosen + * above + */ + best_ni = lnet_find_best_ni_on_spec_net(NULL, peer, best_lpn, + msg, md_cpt); + } + + return best_ni; +} + +static struct lnet_ni * +lnet_find_existing_preferred_best_ni(struct lnet_peer_ni *lpni, int cpt) +{ + struct lnet_ni *best_ni = NULL; + struct lnet_peer_net *peer_net = lpni->lpni_peer_net; + struct lnet_peer_ni *lpni_entry; + + /* + * We must use a consistent source address when sending to a + * non-MR peer. However, a non-MR peer can have multiple NIDs + * on multiple networks, and we may even need to talk to this + * peer on multiple networks -- certain types of + * load-balancing configuration do this. + * + * So we need to pick the NI the peer prefers for this + * particular network. + */ + LASSERT(peer_net); + list_for_each_entry(lpni_entry, &peer_net->lpn_peer_nis, + lpni_peer_nis) { + if (lpni_entry->lpni_pref_nnids == 0) + continue; + LASSERT(lpni_entry->lpni_pref_nnids == 1); + best_ni = lnet_nid_to_ni_locked(&lpni_entry->lpni_pref.nid, + cpt); + break; + } + + return best_ni; +} + +/* Prerequisite: sd->sd_peer and sd->sd_best_lpni should be set */ +static int +lnet_select_preferred_best_ni(struct lnet_send_data *sd) +{ + struct lnet_ni *best_ni = NULL; + + /* + * We must use a consistent source address when sending to a + * non-MR peer. However, a non-MR peer can have multiple NIDs + * on multiple networks, and we may even need to talk to this + * peer on multiple networks -- certain types of + * load-balancing configuration do this. + * + * So we need to pick the NI the peer prefers for this + * particular network. + * + * An exception is traffic on LNET_RESERVED_PORTAL. 
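Structurally, the lnet_find_best_ni_on_local_net() scan above is a filter-then-rank loop with one short-circuit: peer nets with no matching local net are skipped outright, a discovery message pinned to lp_disc_net_id takes that net immediately, and everything else goes through the health/priority/sequence ranking. A simplified sketch of that shape, with invented types:

    #include <stddef.h>

    struct pnet {
            unsigned int  net_id;
            int           is_local;  /* we have an NI on this net */
            struct pnet  *next;
    };

    static struct pnet *pick_peer_net(struct pnet *list,
                                      unsigned int pinned_net,
                                      int is_discovery,
                                      int (*better)(const struct pnet *,
                                                    const struct pnet *))
    {
            struct pnet *best = NULL, *p;

            for (p = list; p; p = p->next) {
                    if (!p->is_local)
                            continue;          /* routed net: not eligible here */
                    if (is_discovery && p->net_id == pinned_net)
                            return p;          /* forced choice: stop scanning */
                    if (!best || better(best, p))
                            best = p;          /* normal ranking */
            }
            return best;
    }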
Internal LNet
+ * traffic doesn't care which source NI is used, and we don't actually
+ * want to restrict local recovery pings to a single source NI.
+ */
+ if (!lnet_reserved_msg(sd->sd_msg))
+ best_ni = lnet_find_existing_preferred_best_ni(sd->sd_best_lpni,
+ sd->sd_cpt);
+
+ if (!best_ni)
+ best_ni = lnet_find_best_ni_on_spec_net(NULL, sd->sd_peer,
+ sd->sd_best_lpni->lpni_peer_net,
+ sd->sd_msg,
+ sd->sd_md_cpt);
+
+ /* If there is no best_ni we don't have a route */
+ if (!best_ni) {
+ CERROR("no path to %s from net %s\n",
+ libcfs_nidstr(&sd->sd_best_lpni->lpni_nid),
+ libcfs_net2str(sd->sd_best_lpni->lpni_net->net_id));
+ return -EHOSTUNREACH;
+ }
+
+ sd->sd_best_ni = best_ni;
+
+ /* Set preferred NI if necessary. */
+ lnet_set_non_mr_pref_nid(sd->sd_best_lpni, sd->sd_best_ni, sd->sd_msg);
+
+ return 0;
+}
+
+
+/*
+ * Source not specified
+ * Local destination
+ * Non-MR Peer
+ *
+ * always use the same source NID for NMR peers
+ * If we've talked to that peer before then we already have a preferred
+ * source NI associated with it. Otherwise, we select a preferred local NI
+ * and store it in the peer
+ */
+static int
+lnet_handle_any_local_nmr_dst(struct lnet_send_data *sd)
+{
+ int rc = 0;
+
+ /* sd->sd_best_lpni is already set to the final destination */
+
+ /*
+ * At this point we should've created the peer ni and peer. If we
+ * can't find it, then something went wrong. Instead of asserting,
+ * output a relevant message and fail the send
+ */
+ if (!sd->sd_best_lpni) {
+ CERROR("Internal fault. Unable to send msg %s to %s. NID not known\n",
+ lnet_msgtyp2str(sd->sd_msg->msg_type),
+ libcfs_nidstr(&sd->sd_dst_nid));
+ return -EFAULT;
+ }
+
+ if (sd->sd_msg->msg_routing) {
+ /* If I'm forwarding this message then I can choose any NI
+ * on the destination peer net
+ */
+ sd->sd_best_ni = lnet_find_best_ni_on_spec_net(NULL,
+ sd->sd_peer,
+ sd->sd_best_lpni->lpni_peer_net,
+ sd->sd_msg,
+ sd->sd_md_cpt);
+ if (!sd->sd_best_ni) {
+ CERROR("Unable to forward message to %s. No local NI available\n",
+ libcfs_nidstr(&sd->sd_dst_nid));
+ rc = -EHOSTUNREACH;
+ }
+ } else
+ rc = lnet_select_preferred_best_ni(sd);
+
+ if (!rc)
+ rc = lnet_handle_send(sd);
+
+ return rc;
+}
+
+static int
+lnet_handle_any_mr_dsta(struct lnet_send_data *sd)
+{
+ /*
+ * NOTE we've already handled the remote peer case. So we only
+ * need to worry about the local case here.
+ *
+ * if we're sending a response, ACK or reply, we need to send it
+ * to the destination NID given to us. At this point we already
+ * have the peer_ni we're supposed to send to, so just find the
+ * best_ni on the peer net and use that. Since we're sending to an
+ * MR peer then we can just run the selection algorithm on our
+ * local NIs and pick the best one.
+ */
+ if (sd->sd_send_case & SND_RESP) {
+ sd->sd_best_ni =
+ lnet_find_best_ni_on_spec_net(NULL, sd->sd_peer,
+ sd->sd_best_lpni->lpni_peer_net,
+ sd->sd_msg,
+ sd->sd_md_cpt);
+
+ if (!sd->sd_best_ni) {
+ /*
+ * We're not going to deal with being unable to send
+ * a response to the provided final destination
+ */
+ CERROR("Can't send response to %s. No local NI available\n",
+ libcfs_nidstr(&sd->sd_dst_nid));
+ return -EHOSTUNREACH;
+ }
+
+ return lnet_handle_send(sd);
+ }
+
+ /*
+ * If we get here that means we're sending a fresh request, PUT or
+ * GET, so we need to run our standard selection algorithm.
+ * First find the best local interface that's on any of the peer's
+ * networks.
+ */
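The pairing of lnet_find_existing_preferred_best_ni() with lnet_set_non_mr_pref_nid() above amounts to a first-use-pins-the-source rule: a non-MR peer must always see the same source NID, so the first successful selection is recorded on the peer and reused afterwards. In miniature, with invented names:

    struct nmr_peer {
            unsigned long pref_src_nid;   /* 0 = nothing recorded yet */
    };

    static unsigned long choose_src(struct nmr_peer *peer,
                                    unsigned long (*select_best)(void))
    {
            if (peer->pref_src_nid == 0)
                    peer->pref_src_nid = select_best(); /* pin on first use */
            return peer->pref_src_nid;                  /* reuse afterwards */
    }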
+ sd->sd_best_ni = lnet_find_best_ni_on_local_net(sd->sd_peer,
+ sd->sd_md_cpt,
+ sd->sd_msg,
+ lnet_msg_discovery(sd->sd_msg));
+ if (sd->sd_best_ni) {
+ sd->sd_best_lpni =
+ lnet_find_best_lpni(sd->sd_best_ni,
+ lnet_nid_to_nid4(&sd->sd_dst_nid),
+ sd->sd_peer,
+ sd->sd_best_ni->ni_net->net_id);
+
+ /*
+ * if we're successful in selecting a peer_ni on the local
+ * network, then send to it. Otherwise fall through and
+ * see whether we can reach it over another routed
+ * network
+ */
+ if (sd->sd_best_lpni &&
+ nid_same(&sd->sd_best_lpni->lpni_nid,
+ &the_lnet.ln_loni->ni_nid)) {
+ /*
+ * in case we initially started with a routed
+ * destination, let's reset to local
+ */
+ sd->sd_send_case &= ~REMOTE_DST;
+ sd->sd_send_case |= LOCAL_DST;
+ return lnet_handle_lo_send(sd);
+ } else if (sd->sd_best_lpni) {
+ /*
+ * in case we initially started with a routed
+ * destination, let's reset to local
+ */
+ sd->sd_send_case &= ~REMOTE_DST;
+ sd->sd_send_case |= LOCAL_DST;
+ return lnet_handle_send(sd);
+ }
+
+ CERROR("Internal Error. Expected to have a best_lpni: "
+ "%s -> %s\n",
+ libcfs_nidstr(&sd->sd_src_nid),
+ libcfs_nidstr(&sd->sd_dst_nid));
+
+ return -EFAULT;
+ }
+
+ /*
+ * Peer doesn't have a local network. Let's see if there is
+ * a remote network we can reach it on.
+ */
+ return PASS_THROUGH;
+}
+
+/*
+ * Case 1:
+ * Source NID not specified
+ * Local destination
+ * MR peer
+ *
+ * Case 2:
+ * Source NID not specified
+ * Remote destination
+ * MR peer
+ *
+ * In both of these cases if we're sending a response, ACK or REPLY, then
+ * we need to send to the destination NID provided.
+ *
+ * In the remote case let's deal with MR routers.
+ *
+ */
+
+static int
+lnet_handle_any_mr_dst(struct lnet_send_data *sd)
+{
+ int rc = 0;
+ struct lnet_peer *gw_peer = NULL;
+ struct lnet_peer_ni *gw_lpni = NULL;
+
+ /*
+ * handle sending a response to a remote peer here so we don't
+ * have to worry about it if we hit lnet_handle_any_mr_dsta()
+ */
+ if (sd->sd_send_case & REMOTE_DST &&
+ sd->sd_send_case & SND_RESP) {
+ struct lnet_peer_ni *gw;
+ struct lnet_peer *gw_peer;
+
+ rc = lnet_handle_find_routed_path(
+ sd, &sd->sd_dst_nid, &gw, &gw_peer);
+ if (rc < 0) {
+ CERROR("Can't send response to %s. No route available\n",
+ libcfs_nidstr(&sd->sd_dst_nid));
+ return -EHOSTUNREACH;
+ } else if (rc > 0) {
+ return rc;
+ }
+
+ sd->sd_best_lpni = gw;
+ sd->sd_peer = gw_peer;
+
+ return lnet_handle_send(sd);
+ }
+
+ /*
+ * Even though the NID for the peer might not be on a local network,
+ * since the peer is MR there could be other interfaces on the
+ * local network. In that case we'd still like to prefer the local
+ * network over the routed network. If we're unable to do that
+ * then we select the best router among the different routed networks,
+ * and if the router is MR then we can deal with it as such.
+ */
+ rc = lnet_handle_any_mr_dsta(sd);
+ if (rc != PASS_THROUGH)
+ return rc;
+
+ /*
+ * Now that we must route to the destination, we must consider the
+ * MR case, where the destination has multiple interfaces, some of
+ * which we can route to and others we cannot. For this reason we
+ * need to select the destination which we can route to and if
+ * there are multiple, we need to round robin.
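PASS_THROUGH here is a control-flow convention rather than an error: a handler either settles the send (success or hard failure) or hands the message to the next strategy. The sketch below shows the shape of that chaining; the sentinel value and the stub handlers are invented for illustration.

    #define PASS_THROUGH  181   /* any value outside the normal rc space */

    static int try_local_path(void)  { return PASS_THROUGH; } /* stub: no local path */
    static int try_routed_path(void) { return 0; }            /* stub: success */

    static int handle_send(void)
    {
            int rc = try_local_path();

            if (rc != PASS_THROUGH)
                    return rc;              /* sent, queued or failed */
            return try_routed_path();       /* fall back to a routed path */
    }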
+ */ + rc = lnet_handle_find_routed_path(sd, &sd->sd_dst_nid, + &gw_lpni, &gw_peer); + if (rc) + return rc; + + sd->sd_send_case &= ~LOCAL_DST; + sd->sd_send_case |= REMOTE_DST; + + sd->sd_peer = gw_peer; + sd->sd_best_lpni = gw_lpni; + + return lnet_handle_send(sd); +} + +/* + * Source not specified + * Remote destination + * Non-MR peer + * + * Must send to the specified peer NID using the same source NID that + * we've used before. If it's the first time to talk to that peer then + * find the source NI and assign it as preferred to that peer + */ +static int +lnet_handle_any_router_nmr_dst(struct lnet_send_data *sd) +{ + int rc; + struct lnet_peer_ni *gw_lpni = NULL; + struct lnet_peer *gw_peer = NULL; + + /* + * Let's see if we have a preferred NI to talk to this NMR peer + */ + sd->sd_best_ni = lnet_find_existing_preferred_best_ni(sd->sd_best_lpni, + sd->sd_cpt); + + /* + * find the router and that'll find the best NI if we didn't find + * it already. + */ + rc = lnet_handle_find_routed_path(sd, &sd->sd_dst_nid, &gw_lpni, + &gw_peer); + if (rc) + return rc; + + /* + * set the best_ni we've chosen as the preferred one for + * this peer + */ + lnet_set_non_mr_pref_nid(sd->sd_best_lpni, sd->sd_best_ni, sd->sd_msg); + + /* we'll be sending to the gw */ + sd->sd_best_lpni = gw_lpni; + sd->sd_peer = gw_peer; + + return lnet_handle_send(sd); +} + +static int +lnet_handle_send_case_locked(struct lnet_send_data *sd) +{ + /* + * turn off the SND_RESP bit. + * It will be checked in the case handling + */ + __u32 send_case = sd->sd_send_case &= ~SND_RESP ; + + CDEBUG(D_NET, "Source %s%s to %s %s %s destination\n", + (send_case & SRC_SPEC) ? "Specified: " : "ANY", + (send_case & SRC_SPEC) ? libcfs_nidstr(&sd->sd_src_nid) : "", + (send_case & MR_DST) ? "MR: " : "NMR: ", + libcfs_nidstr(&sd->sd_dst_nid), + (send_case & LOCAL_DST) ? "local" : "routed"); + + switch (send_case) { + /* + * For all cases where the source is specified, we should always + * use the destination NID, whether it's an MR destination or not, + * since we're continuing a series of related messages for the + * same RPC + */ + case SRC_SPEC_LOCAL_NMR_DST: + return lnet_handle_spec_local_nmr_dst(sd); + case SRC_SPEC_LOCAL_MR_DST: + return lnet_handle_spec_local_mr_dst(sd); + case SRC_SPEC_ROUTER_NMR_DST: + case SRC_SPEC_ROUTER_MR_DST: + return lnet_handle_spec_router_dst(sd); + case SRC_ANY_LOCAL_NMR_DST: + return lnet_handle_any_local_nmr_dst(sd); + case SRC_ANY_LOCAL_MR_DST: + case SRC_ANY_ROUTER_MR_DST: + return lnet_handle_any_mr_dst(sd); + case SRC_ANY_ROUTER_NMR_DST: + return lnet_handle_any_router_nmr_dst(sd); + default: + CERROR("Unknown send case\n"); + return -1; + } +} + +static int +lnet_select_pathway(struct lnet_nid *src_nid, + struct lnet_nid *dst_nid, + struct lnet_msg *msg, + struct lnet_nid *rtr_nid) +{ + struct lnet_peer_ni *lpni; + struct lnet_peer *peer; + struct lnet_send_data send_data; + int cpt, rc; + int md_cpt; + __u32 send_case = 0; + bool final_hop; + bool mr_forwarding_allowed; + + memset(&send_data, 0, sizeof(send_data)); + + /* + * get an initial CPT to use for locking. The idea here is not to + * serialize the calls to select_pathway, so that as many + * operations can run concurrently as possible. To do that we use + * the CPT where this call is being executed. Later on when we + * determine the CPT to use in lnet_message_commit, we switch the + * lock and check if there was any configuration change. If none, + * then we proceed, if there is, then we restart the operation. 
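lnet_handle_send_case_locked() above shows the payoff of encoding the three independent properties of a send (source specified or not, destination local or routed, peer MR or not) as bits: every combination collapses into a single switch. A toy version of that dispatch pattern, with made-up flag values:

    #define SRC_SPEC    (1 << 0)   /* source NID was specified */
    #define REMOTE_DST  (1 << 1)   /* destination is behind a router */
    #define MR_DST      (1 << 2)   /* destination peer is multi-rail */

    static const char *classify(unsigned int send_case)
    {
            switch (send_case) {
            case SRC_SPEC | MR_DST:
                    return "specified source, local MR destination";
            case REMOTE_DST:
                    return "any source, routed non-MR destination";
            /* ... one case per meaningful combination ... */
            default:
                    return "unknown send case";
            }
    }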
+ */ + cpt = lnet_net_lock_current(); + + md_cpt = lnet_cpt_of_md(msg->msg_md, msg->msg_offset); + if (md_cpt == CFS_CPT_ANY) + md_cpt = cpt; + +again: + + /* + * If we're being asked to send to the loopback interface, there + * is no need to go through any selection. We can just shortcut + * the entire process and send over lolnd + */ + send_data.sd_msg = msg; + send_data.sd_cpt = cpt; + if (nid_is_lo0(dst_nid)) { + rc = lnet_handle_lo_send(&send_data); + lnet_net_unlock(cpt); + return rc; + } + + /* + * find an existing peer_ni, or create one and mark it as having been + * created due to network traffic. This call will create the + * peer->peer_net->peer_ni tree. + */ + lpni = lnet_peerni_by_nid_locked(dst_nid, NULL, cpt); + if (IS_ERR(lpni)) { + lnet_net_unlock(cpt); + return PTR_ERR(lpni); + } + + /* + * Cache the original src_nid and rtr_nid. If we need to resend the + * message then we'll need to know whether the src_nid was originally + * specified for this message. If it was originally specified, + * then we need to keep using the same src_nid since it's + * continuing the same sequence of messages. Similarly, rtr_nid will + * affect our choice of next hop. + */ + if (src_nid) + msg->msg_src_nid_param = *src_nid; + else + msg->msg_src_nid_param = LNET_ANY_NID; + if (rtr_nid) + msg->msg_rtr_nid_param = *rtr_nid; + else + msg->msg_rtr_nid_param = LNET_ANY_NID; + + /* + * If necessary, perform discovery on the peer that owns this peer_ni. + * Note, this can result in the ownership of this peer_ni changing + * to another peer object. + */ + rc = lnet_initiate_peer_discovery(lpni, msg, cpt); + if (rc) { + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(cpt); + return rc; + } + lnet_peer_ni_decref_locked(lpni); + + peer = lpni->lpni_peer_net->lpn_peer; + + /* + * Identify the different send cases + */ + if (!src_nid || LNET_NID_IS_ANY(src_nid)) { + send_case |= SRC_ANY; + if (lnet_get_net_locked(LNET_NID_NET(dst_nid))) + send_case |= LOCAL_DST; + else + send_case |= REMOTE_DST; + } else { + send_case |= SRC_SPEC; + if (LNET_NID_NET(src_nid) == LNET_NID_NET(dst_nid)) + send_case |= LOCAL_DST; + else + send_case |= REMOTE_DST; + } + + final_hop = false; + if (msg->msg_routing && (send_case & LOCAL_DST)) + final_hop = true; + + /* Determine whether to allow MR forwarding for this message. + * NB: MR forwarding is allowed if the message originator and the + * destination are both MR capable, and the destination lpni that was + * originally chosen by the originator is unhealthy or down. + * We check the MR capability of the destination further below + */ + mr_forwarding_allowed = false; + if (final_hop) { + struct lnet_peer *src_lp; + struct lnet_peer_ni *src_lpni; + + src_lpni = lnet_peerni_by_nid_locked(&msg->msg_hdr.src_nid, + NULL, cpt); + /* We don't fail the send if we hit any errors here. We'll just + * try to send it via non-multi-rail criteria + */ + if (!IS_ERR(src_lpni)) { + /* Drop ref taken by lnet_nid2peerni_locked() */ + lnet_peer_ni_decref_locked(src_lpni); + src_lp = lpni->lpni_peer_net->lpn_peer; + if (lnet_peer_is_multi_rail(src_lp) && + !lnet_is_peer_ni_alive(lpni)) + mr_forwarding_allowed = true; + + } + CDEBUG(D_NET, "msg %p MR forwarding %s\n", msg, + mr_forwarding_allowed ? "allowed" : "not allowed"); + } + + /* + * Deal with the peer as NMR in the following cases: + * 1. the peer is NMR + * 2. We're trying to recover a specific peer NI + * 3. I'm a router sending to the final destination and MR forwarding is + * not allowed for this message (as determined above). 
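The mr_forwarding_allowed computation above is a three-way conjunction: this node is the final hop, the originator is multi-rail capable, and the interface the originator chose looks dead. A hypothetical restatement as a predicate:

    struct hop_info {
            int final_hop;        /* I am the last router before the dest */
            int src_is_mr;        /* originator is multi-rail capable */
            int dst_lpni_alive;   /* originally chosen interface is up */
    };

    /* A router may re-route to a different interface of the final
     * destination only when the originator understands multi-rail
     * and its originally chosen interface appears to be down. */
    static int mr_forwarding_allowed(const struct hop_info *h)
    {
            return h->final_hop && h->src_is_mr && !h->dst_lpni_alive;
    }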
+ * In this case the source of the message would've + * already selected the final destination so my job + * is to honor the selection. + */ + if (!lnet_peer_is_multi_rail(peer) || msg->msg_recovery || + (final_hop && !mr_forwarding_allowed)) + send_case |= NMR_DST; + else + send_case |= MR_DST; + + if (lnet_msg_is_response(msg)) + send_case |= SND_RESP; + + /* assign parameters to the send_data */ + if (rtr_nid) + send_data.sd_rtr_nid = *rtr_nid; + else + send_data.sd_rtr_nid = LNET_ANY_NID; + if (src_nid) + send_data.sd_src_nid = *src_nid; + else + send_data.sd_src_nid = LNET_ANY_NID; + send_data.sd_dst_nid = *dst_nid; + send_data.sd_best_lpni = lpni; + /* + * keep a pointer to the final destination in case we're going to + * route, so we'll need to access it later + */ + send_data.sd_final_dst_lpni = lpni; + send_data.sd_peer = peer; + send_data.sd_md_cpt = md_cpt; + send_data.sd_send_case = send_case; + + rc = lnet_handle_send_case_locked(&send_data); + + /* + * Update the local cpt since send_data.sd_cpt might've been + * updated as a result of calling lnet_handle_send_case_locked(). + */ + cpt = send_data.sd_cpt; + + if (rc == REPEAT_SEND) + goto again; + + lnet_net_unlock(cpt); + + return rc; +} + +int +lnet_send(struct lnet_nid *src_nid, struct lnet_msg *msg, + struct lnet_nid *rtr_nid) +{ + struct lnet_nid *dst_nid = &msg->msg_target.nid; + int rc; + + /* NB: ni != NULL == interface pre-determined (ACK/REPLY) */ + LASSERT(msg->msg_txpeer == NULL); + LASSERT(msg->msg_txni == NULL); + LASSERT(!msg->msg_sending); + LASSERT(!msg->msg_target_is_router); + LASSERT(!msg->msg_receiving); + + msg->msg_sending = 1; + + LASSERT(!msg->msg_tx_committed); + + rc = lnet_select_pathway(src_nid, dst_nid, msg, rtr_nid); + if (rc < 0) { + if (rc == -EHOSTUNREACH) + msg->msg_health_status = LNET_MSG_STATUS_REMOTE_ERROR; + else + msg->msg_health_status = LNET_MSG_STATUS_LOCAL_ERROR; + return rc; + } + + if (rc == LNET_CREDIT_OK) + lnet_ni_send(msg->msg_txni, msg); + + /* rc == LNET_CREDIT_OK or LNET_CREDIT_WAIT or LNET_DC_WAIT */ + return 0; +} + +enum lnet_mt_event_type { + MT_TYPE_LOCAL_NI = 0, + MT_TYPE_PEER_NI +}; + +struct lnet_mt_event_info { + enum lnet_mt_event_type mt_type; + struct lnet_nid mt_nid; +}; + +/* called with res_lock held */ +void +lnet_detach_rsp_tracker(struct lnet_libmd *md, int cpt) +{ + struct lnet_rsp_tracker *rspt; + + /* + * msg has a refcount on the MD so the MD is not going away. + * The rspt queue for the cpt is protected by + * the lnet_net_lock(cpt). cpt is the cpt of the MD cookie. + */ + if (!md->md_rspt_ptr) + return; + + rspt = md->md_rspt_ptr; + + /* debug code */ + LASSERT(rspt->rspt_cpt == cpt); + + md->md_rspt_ptr = NULL; + + if (LNetMDHandleIsInvalid(rspt->rspt_mdh)) { + /* + * The monitor thread has invalidated this handle because the + * response timed out, but it failed to lookup the MD. That + * means this response tracker is on the zombie list. We can + * safely remove it under the resource lock (held by caller) and + * free the response tracker block. + */ + list_del(&rspt->rspt_on_list); + lnet_rspt_free(rspt, cpt); + } else { + /* + * invalidate the handle to indicate that a response has been + * received, which will then lead the monitor thread to clean up + * the rspt block. 
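Both lnet_detach_rsp_tracker() and the monitor thread may race to retire the same tracker, and the invalidated MD handle is the token that decides who frees it: whichever side finds the handle already invalid knows the other side has disowned the block. A minimal model of that handshake, using free() in place of lnet_rspt_free():

    #include <stdlib.h>

    struct tracker {
            int mdh_valid;   /* stands in for the MD handle */
    };

    /* Called by either side, under the lock that protects mdh_valid. */
    static void retire(struct tracker *t)
    {
            if (!t->mdh_valid)
                    free(t);           /* the other side already let go */
            else
                    t->mdh_valid = 0;  /* signal the other side; it frees */
    }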
+ */ + LNetInvalidateMDHandle(&rspt->rspt_mdh); + } +} + +void +lnet_clean_zombie_rstqs(void) +{ + struct lnet_rsp_tracker *rspt, *tmp; + int i; + + cfs_cpt_for_each(i, lnet_cpt_table()) { + list_for_each_entry_safe(rspt, tmp, + the_lnet.ln_mt_zombie_rstqs[i], + rspt_on_list) { + list_del(&rspt->rspt_on_list); + lnet_rspt_free(rspt, i); + } + } + + cfs_percpt_free(the_lnet.ln_mt_zombie_rstqs); +} + +static void +lnet_finalize_expired_responses(void) +{ + struct lnet_libmd *md; + struct lnet_rsp_tracker *rspt, *tmp; + ktime_t now; + int i; + + if (the_lnet.ln_mt_rstq == NULL) + return; + + cfs_cpt_for_each(i, lnet_cpt_table()) { + LIST_HEAD(local_queue); + + lnet_net_lock(i); + if (!the_lnet.ln_mt_rstq[i]) { + lnet_net_unlock(i); + continue; + } + list_splice_init(the_lnet.ln_mt_rstq[i], &local_queue); + lnet_net_unlock(i); + + now = ktime_get(); + + list_for_each_entry_safe(rspt, tmp, &local_queue, rspt_on_list) { + /* + * The rspt mdh will be invalidated when a response + * is received or whenever we want to discard the + * block the monitor thread will walk the queue + * and clean up any rsts with an invalid mdh. + * The monitor thread will walk the queue until + * the first unexpired rspt block. This means that + * some rspt blocks which received their + * corresponding responses will linger in the + * queue until they are cleaned up eventually. + */ + lnet_res_lock(i); + if (LNetMDHandleIsInvalid(rspt->rspt_mdh)) { + lnet_res_unlock(i); + list_del(&rspt->rspt_on_list); + lnet_rspt_free(rspt, i); + continue; + } + + if (ktime_compare(now, rspt->rspt_deadline) >= 0 || + the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN) { + struct lnet_peer_ni *lpni; + struct lnet_nid nid; + + md = lnet_handle2md(&rspt->rspt_mdh); + if (!md) { + /* MD has been queued for unlink, but + * rspt hasn't been detached (Note we've + * checked above that the rspt_mdh is + * valid). Since we cannot lookup the MD + * we're unable to detach the rspt + * ourselves. Thus, move the rspt to the + * zombie list where we'll wait for + * either: + * 1. The remaining operations on the + * MD to complete. In this case the + * final operation will result in + * lnet_msg_detach_md()-> + * lnet_detach_rsp_tracker() where + * we will clean up this response + * tracker. + * 2. LNet to shutdown. In this case + * we'll wait until after all LND Nets + * have shutdown and then we can + * safely free any remaining response + * tracker blocks on the zombie list. + * Note: We need to hold the resource + * lock when adding to the zombie list + * because we may have concurrent access + * with lnet_detach_rsp_tracker(). 
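The top of lnet_finalize_expired_responses() uses the classic splice-out idiom: detach the whole per-CPT list under its lock, walk the private copy with the lock dropped (taking finer-grained locks as needed), then splice survivors back. Reduced to a skeleton, with pthreads standing in for lnet_net_lock():

    #include <pthread.h>
    #include <stddef.h>

    struct node { struct node *next; };

    static pthread_mutex_t q_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct node *shared_q;

    static void drain_and_restore(void (*visit)(struct node *))
    {
            struct node *local, *n;

            pthread_mutex_lock(&q_lock);
            local = shared_q;               /* splice the list out */
            shared_q = NULL;
            pthread_mutex_unlock(&q_lock);

            for (n = local; n; n = n->next) /* long walk, lock dropped */
                    visit(n);

            pthread_mutex_lock(&q_lock);    /* put entries back */
            shared_q = local;               /* (the real code re-links
                                             * only the survivors) */
            pthread_mutex_unlock(&q_lock);
    }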
+ */ + LNetInvalidateMDHandle(&rspt->rspt_mdh); + list_move(&rspt->rspt_on_list, + the_lnet.ln_mt_zombie_rstqs[i]); + lnet_res_unlock(i); + continue; + } + LASSERT(md->md_rspt_ptr == rspt); + md->md_rspt_ptr = NULL; + lnet_res_unlock(i); + + LNetMDUnlink(rspt->rspt_mdh); + + nid = rspt->rspt_next_hop_nid; + + list_del(&rspt->rspt_on_list); + lnet_rspt_free(rspt, i); + + /* If we're shutting down we just want to clean + * up the rspt blocks + */ + if (the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN) + continue; + + lnet_net_lock(i); + the_lnet.ln_counters[i]->lct_health.lch_response_timeout_count++; + lnet_net_unlock(i); + + CDEBUG(D_NET, + "Response timeout: md = %p: nid = %s\n", + md, libcfs_nidstr(&nid)); + + /* + * If there is a timeout on the response + * from the next hop decrement its health + * value so that we don't use it + */ + lnet_net_lock(0); + lpni = lnet_peer_ni_find_locked(&nid); + if (lpni) { + lnet_handle_remote_failure_locked(lpni); + lnet_peer_ni_decref_locked(lpni); + } + lnet_net_unlock(0); + } else { + lnet_res_unlock(i); + break; + } + } + + if (!list_empty(&local_queue)) { + lnet_net_lock(i); + list_splice(&local_queue, the_lnet.ln_mt_rstq[i]); + lnet_net_unlock(i); + } + } +} + +static void +lnet_resend_pending_msgs_locked(struct list_head *resendq, int cpt) +{ + struct lnet_msg *msg; + + while (!list_empty(resendq)) { + struct lnet_peer_ni *lpni; + + msg = list_entry(resendq->next, struct lnet_msg, + msg_list); + + list_del_init(&msg->msg_list); + + lpni = lnet_peer_ni_find_locked(&msg->msg_hdr.dest_nid); + if (!lpni) { + lnet_net_unlock(cpt); + CERROR("Expected that a peer is already created for %s\n", + libcfs_nidstr(&msg->msg_hdr.dest_nid)); + msg->msg_no_resend = true; + lnet_finalize(msg, -EFAULT); + lnet_net_lock(cpt); + } else { + int rc; + + lnet_peer_ni_decref_locked(lpni); + + lnet_net_unlock(cpt); + CDEBUG(D_NET, "resending %s->%s: %s recovery %d try# %d\n", + libcfs_nidstr(&msg->msg_src_nid_param), + libcfs_idstr(&msg->msg_target), + lnet_msgtyp2str(msg->msg_type), + msg->msg_recovery, + msg->msg_retry_count); + rc = lnet_send(&msg->msg_src_nid_param, msg, + &msg->msg_rtr_nid_param); + if (rc) { + CERROR("Error sending %s to %s: %d\n", + lnet_msgtyp2str(msg->msg_type), + libcfs_idstr(&msg->msg_target), rc); + msg->msg_no_resend = true; + lnet_finalize(msg, rc); + } + lnet_net_lock(cpt); + if (!rc) + the_lnet.ln_counters[cpt]->lct_health.lch_resend_count++; + } + } +} + +static void +lnet_resend_pending_msgs(void) +{ + int i; + + cfs_cpt_for_each(i, lnet_cpt_table()) { + lnet_net_lock(i); + lnet_resend_pending_msgs_locked(the_lnet.ln_mt_resendqs[i], i); + lnet_net_unlock(i); + } +} + +/* called with cpt and ni_lock held */ +static void +lnet_unlink_ni_recovery_mdh_locked(struct lnet_ni *ni, int cpt, bool force) +{ + struct lnet_handle_md recovery_mdh; + + LNetInvalidateMDHandle(&recovery_mdh); + + if (ni->ni_recovery_state & LNET_NI_RECOVERY_PENDING || + force) { + recovery_mdh = ni->ni_ping_mdh; + LNetInvalidateMDHandle(&ni->ni_ping_mdh); + } + lnet_ni_unlock(ni); + lnet_net_unlock(cpt); + if (!LNetMDHandleIsInvalid(recovery_mdh)) + LNetMDUnlink(recovery_mdh); + lnet_net_lock(cpt); + lnet_ni_lock(ni); +} + +static void +lnet_recover_local_nis(void) +{ + struct lnet_mt_event_info *ev_info; + LIST_HEAD(processed_list); + LIST_HEAD(local_queue); + struct lnet_handle_md mdh; + struct lnet_ni *tmp; + struct lnet_ni *ni; + struct lnet_nid nid; + int healthv; + int rc; + time64_t now; + + /* + * splice the recovery queue on a local queue. 
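Because trackers are queued in arrival order, their deadlines are non-decreasing along the list, which is what lets the else branch above break out at the first unexpired entry instead of scanning the whole queue. The same idea in isolation, with hypothetical types:

    #include <time.h>

    struct rspt { time_t deadline; struct rspt *next; };

    /* Queue is ordered by deadline; stop at the first live entry. */
    static struct rspt *expire_head(struct rspt *head, time_t now,
                                    void (*timed_out)(struct rspt *))
    {
            while (head && head->deadline <= now) {
                    struct rspt *done = head;

                    head = head->next;
                    timed_out(done);   /* unlink the MD, free the block */
            }
            return head;               /* first unexpired entry, new head */
    }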
We will iterate + * through the local queue and update it as needed. Once we're + * done with the traversal, we'll splice the local queue back on + * the head of the ln_mt_localNIRecovq. Any newly added local NIs + * will be traversed in the next iteration. + */ + lnet_net_lock(0); + list_splice_init(&the_lnet.ln_mt_localNIRecovq, + &local_queue); + lnet_net_unlock(0); + + now = ktime_get_seconds(); + + list_for_each_entry_safe(ni, tmp, &local_queue, ni_recovery) { + /* + * if an NI is being deleted or it is now healthy, there + * is no need to keep it around in the recovery queue. + * The monitor thread is the only thread responsible for + * removing the NI from the recovery queue. + * Multiple threads can be adding NIs to the recovery + * queue. + */ + healthv = atomic_read(&ni->ni_healthv); + + lnet_net_lock(0); + lnet_ni_lock(ni); + if (ni->ni_state != LNET_NI_STATE_ACTIVE || + healthv == LNET_MAX_HEALTH_VALUE) { + list_del_init(&ni->ni_recovery); + lnet_unlink_ni_recovery_mdh_locked(ni, 0, false); + lnet_ni_unlock(ni); + lnet_ni_decref_locked(ni, 0); + lnet_net_unlock(0); + continue; + } + + /* + * if the local NI failed recovery we must unlink the md. + * But we want to keep the local_ni on the recovery queue + * so we can continue the attempts to recover it. + */ + if (ni->ni_recovery_state & LNET_NI_RECOVERY_FAILED) { + lnet_unlink_ni_recovery_mdh_locked(ni, 0, true); + ni->ni_recovery_state &= ~LNET_NI_RECOVERY_FAILED; + } + + + lnet_ni_unlock(ni); + + if (now < ni->ni_next_ping) { + lnet_net_unlock(0); + continue; + } + + lnet_net_unlock(0); + + CDEBUG(D_NET, "attempting to recover local ni: %s\n", + libcfs_nidstr(&ni->ni_nid)); + + lnet_ni_lock(ni); + if (!(ni->ni_recovery_state & LNET_NI_RECOVERY_PENDING)) { + ni->ni_recovery_state |= LNET_NI_RECOVERY_PENDING; + lnet_ni_unlock(ni); + + LIBCFS_ALLOC(ev_info, sizeof(*ev_info)); + if (!ev_info) { + CERROR("out of memory. Can't recover %s\n", + libcfs_nidstr(&ni->ni_nid)); + lnet_ni_lock(ni); + ni->ni_recovery_state &= + ~LNET_NI_RECOVERY_PENDING; + lnet_ni_unlock(ni); + continue; + } + + mdh = ni->ni_ping_mdh; + /* + * Invalidate the ni mdh in case it's deleted. + * We'll unlink the mdh in this case below. + */ + LNetInvalidateMDHandle(&ni->ni_ping_mdh); + nid = ni->ni_nid; + + /* + * remove the NI from the local queue and drop the + * reference count to it while we're recovering + * it. The reason for that, is that the NI could + * be deleted, and the way the code is structured + * is if we don't drop the NI, then the deletion + * code will enter a loop waiting for the + * reference count to be removed while holding the + * ln_mutex_lock(). When we look up the peer to + * send to in lnet_select_pathway() we will try to + * lock the ln_mutex_lock() as well, leading to + * a deadlock. By dropping the refcount and + * removing it from the list, we allow for the NI + * to be removed, then we use the cached NID to + * look it up again. If it's gone, then we just + * continue examining the rest of the queue. 
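The comment above describes a deadlock-avoidance shape that recurs in both recovery loops: cache the key (the NID), drop the reference so a concurrent delete can finish, do the blocking work, then look the object up again and tolerate its disappearance. Schematically, with stub helpers in place of the real table lookup and refcounting:

    #include <stddef.h>

    struct obj { unsigned long key; };

    /* Stubs standing in for the real lookup and refcount helpers. */
    static struct obj *lookup(unsigned long key) { (void)key; return NULL; }
    static void put_ref(struct obj *o) { (void)o; }
    static void blocking_ping(unsigned long key) { (void)key; }

    static void recover_one(struct obj *o)
    {
            unsigned long key = o->key;   /* cache the key first */

            put_ref(o);                   /* deletion may now proceed */
            blocking_ping(key);           /* work done without the ref */

            o = lookup(key);              /* re-acquire by key */
            if (!o)
                    return;               /* deleted meanwhile: move on */
            /* update the still-live object, then drop the new ref */
            put_ref(o);
    }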
+ */ + lnet_net_lock(0); + list_del_init(&ni->ni_recovery); + lnet_ni_decref_locked(ni, 0); + lnet_net_unlock(0); + + ev_info->mt_type = MT_TYPE_LOCAL_NI; + ev_info->mt_nid = nid; + rc = lnet_send_ping(&nid, &mdh, LNET_INTERFACES_MIN, + ev_info, the_lnet.ln_mt_handler, + true); + /* lookup the nid again */ + lnet_net_lock(0); + ni = lnet_nid_to_ni_locked(&nid, 0); + if (!ni) { + /* + * the NI has been deleted when we dropped + * the ref count + */ + lnet_net_unlock(0); + LNetMDUnlink(mdh); + continue; + } + ni->ni_ping_count++; + + ni->ni_ping_mdh = mdh; + lnet_ni_add_to_recoveryq_locked(ni, &processed_list, + now); + + if (rc) { + lnet_ni_lock(ni); + ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING; + lnet_ni_unlock(ni); + } + lnet_net_unlock(0); + } else + lnet_ni_unlock(ni); + } + + /* + * put back the remaining NIs on the ln_mt_localNIRecovq to be + * reexamined in the next iteration. + */ + list_splice_init(&processed_list, &local_queue); + lnet_net_lock(0); + list_splice(&local_queue, &the_lnet.ln_mt_localNIRecovq); + lnet_net_unlock(0); +} + +static int +lnet_resendqs_create(void) +{ + struct list_head **resendqs; + resendqs = lnet_create_array_of_queues(); + + if (!resendqs) + return -ENOMEM; + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_mt_resendqs = resendqs; + lnet_net_unlock(LNET_LOCK_EX); + + return 0; +} + +static void +lnet_clean_local_ni_recoveryq(void) +{ + struct lnet_ni *ni; + + /* This is only called when the monitor thread has stopped */ + lnet_net_lock(0); + + while (!list_empty(&the_lnet.ln_mt_localNIRecovq)) { + ni = list_entry(the_lnet.ln_mt_localNIRecovq.next, + struct lnet_ni, ni_recovery); + list_del_init(&ni->ni_recovery); + lnet_ni_lock(ni); + lnet_unlink_ni_recovery_mdh_locked(ni, 0, true); + lnet_ni_unlock(ni); + lnet_ni_decref_locked(ni, 0); + } + + lnet_net_unlock(0); +} + +static void +lnet_unlink_lpni_recovery_mdh_locked(struct lnet_peer_ni *lpni, int cpt, + bool force) +{ + struct lnet_handle_md recovery_mdh; + + LNetInvalidateMDHandle(&recovery_mdh); + + if (lpni->lpni_state & LNET_PEER_NI_RECOVERY_PENDING || force) { + recovery_mdh = lpni->lpni_recovery_ping_mdh; + LNetInvalidateMDHandle(&lpni->lpni_recovery_ping_mdh); + } + spin_unlock(&lpni->lpni_lock); + lnet_net_unlock(cpt); + if (!LNetMDHandleIsInvalid(recovery_mdh)) + LNetMDUnlink(recovery_mdh); + lnet_net_lock(cpt); + spin_lock(&lpni->lpni_lock); +} + +static void +lnet_clean_peer_ni_recoveryq(void) +{ + struct lnet_peer_ni *lpni, *tmp; + + lnet_net_lock(LNET_LOCK_EX); + + list_for_each_entry_safe(lpni, tmp, &the_lnet.ln_mt_peerNIRecovq, + lpni_recovery) { + list_del_init(&lpni->lpni_recovery); + spin_lock(&lpni->lpni_lock); + lnet_unlink_lpni_recovery_mdh_locked(lpni, LNET_LOCK_EX, true); + spin_unlock(&lpni->lpni_lock); + lnet_peer_ni_decref_locked(lpni); + } + + lnet_net_unlock(LNET_LOCK_EX); +} + +static void +lnet_clean_resendqs(void) +{ + struct lnet_msg *msg, *tmp; + LIST_HEAD(msgs); + int i; + + cfs_cpt_for_each(i, lnet_cpt_table()) { + lnet_net_lock(i); + list_splice_init(the_lnet.ln_mt_resendqs[i], &msgs); + lnet_net_unlock(i); + list_for_each_entry_safe(msg, tmp, &msgs, msg_list) { + list_del_init(&msg->msg_list); + msg->msg_no_resend = true; + lnet_finalize(msg, -ESHUTDOWN); + } + } + + cfs_percpt_free(the_lnet.ln_mt_resendqs); +} + +static void +lnet_recover_peer_nis(void) +{ + struct lnet_mt_event_info *ev_info; + LIST_HEAD(processed_list); + LIST_HEAD(local_queue); + struct lnet_handle_md mdh; + struct lnet_peer_ni *lpni; + struct lnet_peer_ni *tmp; + struct lnet_nid nid; + 
int healthv; + int rc; + time64_t now; + + /* + * Always use cpt 0 for locking across all interactions with + * ln_mt_peerNIRecovq + */ + lnet_net_lock(0); + list_splice_init(&the_lnet.ln_mt_peerNIRecovq, + &local_queue); + lnet_net_unlock(0); + + now = ktime_get_seconds(); + + list_for_each_entry_safe(lpni, tmp, &local_queue, + lpni_recovery) { + /* + * The same protection strategy is used here as is in the + * local recovery case. + */ + lnet_net_lock(0); + healthv = atomic_read(&lpni->lpni_healthv); + spin_lock(&lpni->lpni_lock); + if (lpni->lpni_state & LNET_PEER_NI_DELETING || + healthv == LNET_MAX_HEALTH_VALUE) { + list_del_init(&lpni->lpni_recovery); + lnet_unlink_lpni_recovery_mdh_locked(lpni, 0, false); + spin_unlock(&lpni->lpni_lock); + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(0); + continue; + } + + /* + * If the peer NI has failed recovery we must unlink the + * md. But we want to keep the peer ni on the recovery + * queue so we can try to continue recovering it + */ + if (lpni->lpni_state & LNET_PEER_NI_RECOVERY_FAILED) { + lnet_unlink_lpni_recovery_mdh_locked(lpni, 0, true); + lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_FAILED; + } + + spin_unlock(&lpni->lpni_lock); + + if (now < lpni->lpni_next_ping) { + lnet_net_unlock(0); + continue; + } + + lnet_net_unlock(0); + + /* + * NOTE: we're racing with peer deletion from user space. + * It's possible that a peer is deleted after we check its + * state. In this case the recovery can create a new peer + */ + spin_lock(&lpni->lpni_lock); + if (!(lpni->lpni_state & LNET_PEER_NI_RECOVERY_PENDING) && + !(lpni->lpni_state & LNET_PEER_NI_DELETING)) { + lpni->lpni_state |= LNET_PEER_NI_RECOVERY_PENDING; + spin_unlock(&lpni->lpni_lock); + + LIBCFS_ALLOC(ev_info, sizeof(*ev_info)); + if (!ev_info) { + CERROR("out of memory. Can't recover %s\n", + libcfs_nidstr(&lpni->lpni_nid)); + spin_lock(&lpni->lpni_lock); + lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING; + spin_unlock(&lpni->lpni_lock); + continue; + } + + /* look at the comments in lnet_recover_local_nis() */ + mdh = lpni->lpni_recovery_ping_mdh; + nid = lpni->lpni_nid; + LNetInvalidateMDHandle(&lpni->lpni_recovery_ping_mdh); + lnet_net_lock(0); + list_del_init(&lpni->lpni_recovery); + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(0); + + ev_info->mt_type = MT_TYPE_PEER_NI; + ev_info->mt_nid = nid; + rc = lnet_send_ping(&nid, &mdh, LNET_INTERFACES_MIN, + ev_info, the_lnet.ln_mt_handler, + true); + lnet_net_lock(0); + /* + * lnet_find_peer_ni_locked() grabs a refcount for + * us. No need to take it explicitly. + */ + lpni = lnet_peer_ni_find_locked(&nid); + if (!lpni) { + lnet_net_unlock(0); + LNetMDUnlink(mdh); + continue; + } + + lpni->lpni_ping_count++; + + lpni->lpni_recovery_ping_mdh = mdh; + + lnet_peer_ni_add_to_recoveryq_locked(lpni, + &processed_list, + now); + if (rc) { + spin_lock(&lpni->lpni_lock); + lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING; + spin_unlock(&lpni->lpni_lock); + } + + /* Drop the ref taken by lnet_find_peer_ni_locked() */ + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(0); + } else + spin_unlock(&lpni->lpni_lock); + } + + list_splice_init(&processed_list, &local_queue); + lnet_net_lock(0); + list_splice(&local_queue, &the_lnet.ln_mt_peerNIRecovq); + lnet_net_unlock(0); +} + +static int +lnet_monitor_thread(void *arg) +{ + time64_t rsp_timeout = 0; + time64_t now; + + wait_for_completion(&the_lnet.ln_started); + /* + * The monitor thread takes care of the following: + * 1. Checks the aliveness of routers + * 2. 
Checks if there are messages on the resend queue to resend + * them. + * 3. Check if there are any NIs on the local recovery queue and + * pings them + * 4. Checks if there are any NIs on the remote recovery queue + * and pings them. + */ + while (the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING) { + now = ktime_get_real_seconds(); + + if (lnet_router_checker_active()) + lnet_check_routers(); + + lnet_resend_pending_msgs(); + + if (now >= rsp_timeout) { + lnet_finalize_expired_responses(); + rsp_timeout = now + (lnet_transaction_timeout / 2); + } + + lnet_recover_local_nis(); + lnet_recover_peer_nis(); + + /* + * TODO do we need to check if we should sleep without + * timeout? Technically, an active system will always + * have messages in flight so this check will always + * evaluate to false. And on an idle system do we care + * if we wake up every 1 second? Although, we've seen + * cases where we get a complaint that an idle thread + * is waking up unnecessarily. + */ + wait_for_completion_interruptible_timeout( + &the_lnet.ln_mt_wait_complete, + cfs_time_seconds(1)); + /* Must re-init the completion before testing anything, + * including ln_mt_state. + */ + reinit_completion(&the_lnet.ln_mt_wait_complete); + } + + /* Shutting down */ + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_mt_state = LNET_MT_STATE_SHUTDOWN; + lnet_net_unlock(LNET_LOCK_EX); + + /* signal that the monitor thread is exiting */ + up(&the_lnet.ln_mt_signal); + + return 0; +} + +/* + * lnet_send_ping + * Sends a ping. + * Returns == 0 if success + * Returns > 0 if LNetMDBind or prior fails + * Returns < 0 if LNetGet fails + */ +int +lnet_send_ping(struct lnet_nid *dest_nid, + struct lnet_handle_md *mdh, int nnis, + void *user_data, lnet_handler_t handler, bool recovery) +{ + struct lnet_md md = { NULL }; + struct lnet_process_id id; + struct lnet_ping_buffer *pbuf; + int rc; + + if (LNET_NID_IS_ANY(dest_nid)) { + rc = -EHOSTUNREACH; + goto fail_error; + } + + pbuf = lnet_ping_buffer_alloc(nnis, GFP_NOFS); + if (!pbuf) { + rc = ENOMEM; + goto fail_error; + } + + /* initialize md content */ + md.start = &pbuf->pb_info; + md.length = LNET_PING_INFO_SIZE(nnis); + md.threshold = 2; /* GET/REPLY */ + md.max_size = 0; + md.options = LNET_MD_TRUNCATE | LNET_MD_TRACK_RESPONSE; + md.user_ptr = user_data; + md.handler = handler; + + rc = LNetMDBind(&md, LNET_UNLINK, mdh); + if (rc) { + lnet_ping_buffer_decref(pbuf); + CERROR("Can't bind MD: %d\n", rc); + rc = -rc; /* change the rc to positive */ + goto fail_error; + } + id.pid = LNET_PID_LUSTRE; + id.nid = lnet_nid_to_nid4(dest_nid); + + rc = LNetGet(LNET_NID_ANY, *mdh, id, + LNET_RESERVED_PORTAL, + LNET_PROTO_PING_MATCHBITS, 0, recovery); + + if (rc) + goto fail_unlink_md; + + return 0; + +fail_unlink_md: + LNetMDUnlink(*mdh); + LNetInvalidateMDHandle(mdh); +fail_error: + return rc; +} + +static void +lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info, + int status, bool send, bool unlink_event) +{ + struct lnet_nid *nid = &ev_info->mt_nid; + + if (ev_info->mt_type == MT_TYPE_LOCAL_NI) { + struct lnet_ni *ni; + + lnet_net_lock(0); + ni = lnet_nid_to_ni_locked(nid, 0); + if (!ni) { + lnet_net_unlock(0); + return; + } + lnet_ni_lock(ni); + if (!send || (send && status != 0)) + ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING; + if (status) + ni->ni_recovery_state |= LNET_NI_RECOVERY_FAILED; + lnet_ni_unlock(ni); + lnet_net_unlock(0); + + if (status != 0) { + CERROR("local NI (%s) recovery failed with %d\n", + libcfs_nidstr(nid), status); + return; + } + /* + * need to 
increment healthv for the ni here, because in + * the lnet_finalize() path we don't have access to this + * NI. And in order to get access to it, we'll need to + * carry forward too much information. + * In the peer case, it'll naturally be incremented + */ + if (!unlink_event) + lnet_inc_healthv(&ni->ni_healthv, + lnet_health_sensitivity); + } else { + struct lnet_peer_ni *lpni; + int cpt; + + cpt = lnet_net_lock_current(); + lpni = lnet_peer_ni_find_locked(nid); + if (!lpni) { + lnet_net_unlock(cpt); + return; + } + spin_lock(&lpni->lpni_lock); + if (!send || (send && status != 0)) + lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING; + if (status) + lpni->lpni_state |= LNET_PEER_NI_RECOVERY_FAILED; + spin_unlock(&lpni->lpni_lock); + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(cpt); + + if (status != 0) + CERROR("peer NI (%s) recovery failed with %d\n", + libcfs_nidstr(nid), status); + } +} + +void +lnet_mt_event_handler(struct lnet_event *event) +{ + struct lnet_mt_event_info *ev_info = event->md_user_ptr; + struct lnet_ping_buffer *pbuf; + + /* TODO: remove assert */ + LASSERT(event->type == LNET_EVENT_REPLY || + event->type == LNET_EVENT_SEND || + event->type == LNET_EVENT_UNLINK); + + CDEBUG(D_NET, "Received event: %d status: %d\n", event->type, + event->status); + + switch (event->type) { + case LNET_EVENT_UNLINK: + CDEBUG(D_NET, "%s recovery ping unlinked\n", + libcfs_nidstr(&ev_info->mt_nid)); + fallthrough; + case LNET_EVENT_REPLY: + lnet_handle_recovery_reply(ev_info, event->status, false, + event->type == LNET_EVENT_UNLINK); + break; + case LNET_EVENT_SEND: + CDEBUG(D_NET, "%s recovery message sent %s:%d\n", + libcfs_nidstr(&ev_info->mt_nid), + (event->status) ? "unsuccessfully" : + "successfully", event->status); + lnet_handle_recovery_reply(ev_info, event->status, true, false); + break; + default: + CERROR("Unexpected event: %d\n", event->type); + break; + } + if (event->unlinked) { + LIBCFS_FREE(ev_info, sizeof(*ev_info)); + pbuf = LNET_PING_INFO_TO_BUFFER(event->md_start); + lnet_ping_buffer_decref(pbuf); + } +} + +static int +lnet_rsp_tracker_create(void) +{ + struct list_head **rstqs; + rstqs = lnet_create_array_of_queues(); + + if (!rstqs) + return -ENOMEM; + + the_lnet.ln_mt_rstq = rstqs; + + return 0; +} + +static void +lnet_rsp_tracker_clean(void) +{ + lnet_finalize_expired_responses(); + + cfs_percpt_free(the_lnet.ln_mt_rstq); + the_lnet.ln_mt_rstq = NULL; +} + +int lnet_monitor_thr_start(void) +{ + int rc = 0; + struct task_struct *task; + + if (the_lnet.ln_mt_state != LNET_MT_STATE_SHUTDOWN) + return -EALREADY; + + rc = lnet_resendqs_create(); + if (rc) + return rc; + + rc = lnet_rsp_tracker_create(); + if (rc) + goto clean_queues; + + sema_init(&the_lnet.ln_mt_signal, 0); + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_mt_state = LNET_MT_STATE_RUNNING; + lnet_net_unlock(LNET_LOCK_EX); + task = kthread_run(lnet_monitor_thread, NULL, "monitor_thread"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("Can't start monitor thread: %d\n", rc); + goto clean_thread; + } + + return 0; + +clean_thread: + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_mt_state = LNET_MT_STATE_STOPPING; + lnet_net_unlock(LNET_LOCK_EX); + /* block until event callback signals exit */ + down(&the_lnet.ln_mt_signal); + /* clean up */ + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_mt_state = LNET_MT_STATE_SHUTDOWN; + lnet_net_unlock(LNET_LOCK_EX); + lnet_rsp_tracker_clean(); + lnet_clean_local_ni_recoveryq(); + lnet_clean_peer_ni_recoveryq(); + lnet_clean_resendqs(); + 
the_lnet.ln_mt_handler = NULL; + return rc; +clean_queues: + lnet_rsp_tracker_clean(); + lnet_clean_local_ni_recoveryq(); + lnet_clean_peer_ni_recoveryq(); + lnet_clean_resendqs(); + return rc; +} + +void lnet_monitor_thr_stop(void) +{ + if (the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN) + return; + + LASSERT(the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING); + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_mt_state = LNET_MT_STATE_STOPPING; + lnet_net_unlock(LNET_LOCK_EX); + + /* tell the monitor thread that we're shutting down */ + complete(&the_lnet.ln_mt_wait_complete); + + /* block until monitor thread signals that it's done */ + mutex_unlock(&the_lnet.ln_api_mutex); + down(&the_lnet.ln_mt_signal); + mutex_lock(&the_lnet.ln_api_mutex); + LASSERT(the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN); + + /* perform cleanup tasks */ + lnet_rsp_tracker_clean(); + lnet_clean_local_ni_recoveryq(); + lnet_clean_peer_ni_recoveryq(); + lnet_clean_resendqs(); +} + +void +lnet_drop_message(struct lnet_ni *ni, int cpt, void *private, unsigned int nob, + __u32 msg_type) +{ + lnet_net_lock(cpt); + lnet_incr_stats(&ni->ni_stats, msg_type, LNET_STATS_TYPE_DROP); + the_lnet.ln_counters[cpt]->lct_common.lcc_drop_count++; + the_lnet.ln_counters[cpt]->lct_common.lcc_drop_length += nob; + lnet_net_unlock(cpt); + + lnet_ni_recv(ni, private, NULL, 0, 0, 0, nob); +} + +static void +lnet_recv_put(struct lnet_ni *ni, struct lnet_msg *msg) +{ + struct lnet_hdr *hdr = &msg->msg_hdr; + + if (msg->msg_wanted != 0) + lnet_setpayloadbuffer(msg); + + lnet_build_msg_event(msg, LNET_EVENT_PUT); + + /* Must I ACK? If so I'll grab the ack_wmd out of the header and put + * it back into the ACK during lnet_finalize() */ + msg->msg_ack = (!lnet_is_wire_handle_none(&hdr->msg.put.ack_wmd) && + (msg->msg_md->md_options & LNET_MD_ACK_DISABLE) == 0); + + lnet_ni_recv(ni, msg->msg_private, msg, msg->msg_rx_delayed, + msg->msg_offset, msg->msg_wanted, hdr->payload_length); +} + +static int +lnet_parse_put(struct lnet_ni *ni, struct lnet_msg *msg) +{ + struct lnet_hdr *hdr = &msg->msg_hdr; + struct lnet_match_info info; + int rc; + bool ready_delay; + + /* Convert put fields to host byte order */ + hdr->msg.put.match_bits = le64_to_cpu(hdr->msg.put.match_bits); + hdr->msg.put.ptl_index = le32_to_cpu(hdr->msg.put.ptl_index); + hdr->msg.put.offset = le32_to_cpu(hdr->msg.put.offset); + + /* Primary peer NID. 
*/ + info.mi_id.nid = msg->msg_initiator; + info.mi_id.pid = hdr->src_pid; + info.mi_opc = LNET_MD_OP_PUT; + info.mi_portal = hdr->msg.put.ptl_index; + info.mi_rlength = hdr->payload_length; + info.mi_roffset = hdr->msg.put.offset; + info.mi_mbits = hdr->msg.put.match_bits; + info.mi_cpt = lnet_nid2cpt(&msg->msg_initiator, ni); + + msg->msg_rx_ready_delay = ni->ni_net->net_lnd->lnd_eager_recv == NULL; + ready_delay = msg->msg_rx_ready_delay; + + again: + rc = lnet_ptl_match_md(&info, msg); + switch (rc) { + default: + LBUG(); + + case LNET_MATCHMD_OK: + lnet_recv_put(ni, msg); + return 0; + + case LNET_MATCHMD_NONE: + if (ready_delay) + /* no eager_recv or has already called it, should + * have been attached on delayed list */ + return 0; + + rc = lnet_ni_eager_recv(ni, msg); + if (rc == 0) { + ready_delay = true; + goto again; + } + fallthrough; + + case LNET_MATCHMD_DROP: + CNETERR("Dropping PUT from %s portal %d match %llu" + " offset %d length %d: %d\n", + libcfs_idstr(&info.mi_id), info.mi_portal, + info.mi_mbits, info.mi_roffset, info.mi_rlength, rc); + + return -ENOENT; /* -ve: OK but no match */ + } +} + +static int +lnet_parse_get(struct lnet_ni *ni, struct lnet_msg *msg, int rdma_get) +{ + struct lnet_match_info info; + struct lnet_hdr *hdr = &msg->msg_hdr; + struct lnet_processid source_id; + struct lnet_handle_wire reply_wmd; + int rc; + + /* Convert get fields to host byte order */ + hdr->msg.get.match_bits = le64_to_cpu(hdr->msg.get.match_bits); + hdr->msg.get.ptl_index = le32_to_cpu(hdr->msg.get.ptl_index); + hdr->msg.get.sink_length = le32_to_cpu(hdr->msg.get.sink_length); + hdr->msg.get.src_offset = le32_to_cpu(hdr->msg.get.src_offset); + + source_id.nid = hdr->src_nid; + source_id.pid = hdr->src_pid; + /* Primary peer NID */ + info.mi_id.nid = msg->msg_initiator; + info.mi_id.pid = hdr->src_pid; + info.mi_opc = LNET_MD_OP_GET; + info.mi_portal = hdr->msg.get.ptl_index; + info.mi_rlength = hdr->msg.get.sink_length; + info.mi_roffset = hdr->msg.get.src_offset; + info.mi_mbits = hdr->msg.get.match_bits; + info.mi_cpt = lnet_nid2cpt(&msg->msg_initiator, ni); + + rc = lnet_ptl_match_md(&info, msg); + if (rc == LNET_MATCHMD_DROP) { + CNETERR("Dropping GET from %s portal %d match %llu" + " offset %d length %d\n", + libcfs_idstr(&info.mi_id), info.mi_portal, + info.mi_mbits, info.mi_roffset, info.mi_rlength); + return -ENOENT; /* -ve: OK but no match */ + } + + LASSERT(rc == LNET_MATCHMD_OK); + + lnet_build_msg_event(msg, LNET_EVENT_GET); + + reply_wmd = hdr->msg.get.return_wmd; + + lnet_prep_send(msg, LNET_MSG_REPLY, &source_id, + msg->msg_offset, msg->msg_wanted); + + msg->msg_hdr.msg.reply.dst_wmd = reply_wmd; + + if (rdma_get) { + /* The LND completes the REPLY from her recv procedure */ + lnet_ni_recv(ni, msg->msg_private, msg, 0, + msg->msg_offset, msg->msg_len, msg->msg_len); + return 0; + } + + lnet_ni_recv(ni, msg->msg_private, NULL, 0, 0, 0, 0); + msg->msg_receiving = 0; + + rc = lnet_send(&ni->ni_nid, msg, &msg->msg_from); + if (rc < 0) { + /* didn't get as far as lnet_ni_send() */ + CERROR("%s: Unable to send REPLY for GET from %s: %d\n", + libcfs_nidstr(&ni->ni_nid), + libcfs_idstr(&info.mi_id), rc); + + lnet_finalize(msg, rc); + } + + return 0; +} + +static int +lnet_parse_reply(struct lnet_ni *ni, struct lnet_msg *msg) +{ + void *private = msg->msg_private; + struct lnet_hdr *hdr = &msg->msg_hdr; + struct lnet_processid src = {}; + struct lnet_libmd *md; + unsigned int rlength; + unsigned int mlength; + int cpt; + + cpt = 
lnet_cpt_of_cookie(hdr->msg.reply.dst_wmd.wh_object_cookie); + lnet_res_lock(cpt); + + src.nid = hdr->src_nid; + src.pid = hdr->src_pid; + + /* NB handles only looked up by creator (no flips) */ + md = lnet_wire_handle2md(&hdr->msg.reply.dst_wmd); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + CNETERR("%s: Dropping REPLY from %s for %s " + "MD %#llx.%#llx\n", + libcfs_nidstr(&ni->ni_nid), libcfs_idstr(&src), + (md == NULL) ? "invalid" : "inactive", + hdr->msg.reply.dst_wmd.wh_interface_cookie, + hdr->msg.reply.dst_wmd.wh_object_cookie); + if (md != NULL && md->md_me != NULL) + CERROR("REPLY MD also attached to portal %d\n", + md->md_me->me_portal); + + lnet_res_unlock(cpt); + return -ENOENT; /* -ve: OK but no match */ + } + + LASSERT(md->md_offset == 0); + + rlength = hdr->payload_length; + mlength = min(rlength, md->md_length); + + if (mlength < rlength && + (md->md_options & LNET_MD_TRUNCATE) == 0) { + CNETERR("%s: Dropping REPLY from %s length %d " + "for MD %#llx would overflow (%d)\n", + libcfs_nidstr(&ni->ni_nid), libcfs_idstr(&src), + rlength, hdr->msg.reply.dst_wmd.wh_object_cookie, + mlength); + lnet_res_unlock(cpt); + return -ENOENT; /* -ve: OK but no match */ + } + + CDEBUG(D_NET, "%s: Reply from %s of length %d/%d into md %#llx\n", + libcfs_nidstr(&ni->ni_nid), libcfs_idstr(&src), + mlength, rlength, hdr->msg.reply.dst_wmd.wh_object_cookie); + + lnet_msg_attach_md(msg, md, 0, mlength); + + if (mlength != 0) + lnet_setpayloadbuffer(msg); + + lnet_res_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_REPLY); + + lnet_ni_recv(ni, private, msg, 0, 0, mlength, rlength); + return 0; +} + +static int +lnet_parse_ack(struct lnet_ni *ni, struct lnet_msg *msg) +{ + struct lnet_hdr *hdr = &msg->msg_hdr; + struct lnet_processid src = {}; + struct lnet_libmd *md; + int cpt; + + src.nid = hdr->src_nid; + src.pid = hdr->src_pid; + + /* Convert ack fields to host byte order */ + hdr->msg.ack.match_bits = le64_to_cpu(hdr->msg.ack.match_bits); + hdr->msg.ack.mlength = le32_to_cpu(hdr->msg.ack.mlength); + + cpt = lnet_cpt_of_cookie(hdr->msg.ack.dst_wmd.wh_object_cookie); + lnet_res_lock(cpt); + + /* NB handles only looked up by creator (no flips) */ + md = lnet_wire_handle2md(&hdr->msg.ack.dst_wmd); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + /* Don't moan; this is expected */ + CDEBUG(D_NET, + "%s: Dropping ACK from %s to %s MD %#llx.%#llx\n", + libcfs_nidstr(&ni->ni_nid), libcfs_idstr(&src), + (md == NULL) ? "invalid" : "inactive", + hdr->msg.ack.dst_wmd.wh_interface_cookie, + hdr->msg.ack.dst_wmd.wh_object_cookie); + if (md != NULL && md->md_me != NULL) + CERROR("Source MD also attached to portal %d\n", + md->md_me->me_portal); + + lnet_res_unlock(cpt); + return -ENOENT; /* -ve! 
*/ + } + + CDEBUG(D_NET, "%s: ACK from %s into md %#llx\n", + libcfs_nidstr(&ni->ni_nid), libcfs_idstr(&src), + hdr->msg.ack.dst_wmd.wh_object_cookie); + + lnet_msg_attach_md(msg, md, 0, 0); + + lnet_res_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_ACK); + + lnet_ni_recv(ni, msg->msg_private, msg, 0, 0, 0, msg->msg_len); + return 0; +} + +/** + * \retval LNET_CREDIT_OK If \a msg is forwarded + * \retval LNET_CREDIT_WAIT If \a msg is blocked because w/o buffer + * \retval -ve error code + */ +int +lnet_parse_forward_locked(struct lnet_ni *ni, struct lnet_msg *msg) +{ + int rc = 0; + + if (!the_lnet.ln_routing) + return -ECANCELED; + + if (msg->msg_rxpeer->lpni_rtrcredits <= 0 || + lnet_msg2bufpool(msg)->rbp_credits <= 0) { + if (ni->ni_net->net_lnd->lnd_eager_recv == NULL) { + msg->msg_rx_ready_delay = 1; + } else { + lnet_net_unlock(msg->msg_rx_cpt); + rc = lnet_ni_eager_recv(ni, msg); + lnet_net_lock(msg->msg_rx_cpt); + } + } + + if (rc == 0) + rc = lnet_post_routed_recv_locked(msg, 0); + return rc; +} + +int +lnet_parse_local(struct lnet_ni *ni, struct lnet_msg *msg) +{ + int rc; + + switch (msg->msg_type) { + case LNET_MSG_ACK: + rc = lnet_parse_ack(ni, msg); + break; + case LNET_MSG_PUT: + rc = lnet_parse_put(ni, msg); + break; + case LNET_MSG_GET: + rc = lnet_parse_get(ni, msg, msg->msg_rdma_get); + break; + case LNET_MSG_REPLY: + rc = lnet_parse_reply(ni, msg); + break; + default: /* prevent an unused label if !kernel */ + LASSERT(0); + return -EPROTO; + } + + LASSERT(rc == 0 || rc == -ENOENT); + return rc; +} + +char * +lnet_msgtyp2str (int type) +{ + switch (type) { + case LNET_MSG_ACK: + return ("ACK"); + case LNET_MSG_PUT: + return ("PUT"); + case LNET_MSG_GET: + return ("GET"); + case LNET_MSG_REPLY: + return ("REPLY"); + case LNET_MSG_HELLO: + return ("HELLO"); + default: + return (""); + } +} +EXPORT_SYMBOL(lnet_msgtyp2str); + +int +lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, + struct lnet_nid *from_nid, void *private, int rdma_req) +{ + struct lnet_peer_ni *lpni; + struct lnet_msg *msg; + __u32 payload_length; + lnet_pid_t dest_pid; + struct lnet_nid dest_nid; + struct lnet_nid src_nid; + bool push = false; + int for_me; + __u32 type; + int rc = 0; + int cpt; + time64_t now = ktime_get_seconds(); + + LASSERT (!in_interrupt ()); + + type = hdr->type; + src_nid = hdr->src_nid; + dest_nid = hdr->dest_nid; + dest_pid = hdr->dest_pid; + payload_length = hdr->payload_length; + + for_me = nid_same(&ni->ni_nid, &dest_nid); + cpt = lnet_nid2cpt(from_nid, ni); + + CDEBUG(D_NET, "TRACE: %s(%s) <- %s : %s - %s\n", + libcfs_nidstr(&dest_nid), + libcfs_nidstr(&ni->ni_nid), + libcfs_nidstr(&src_nid), + lnet_msgtyp2str(type), + (for_me) ? "for me" : "routed"); + + switch (type) { + case LNET_MSG_ACK: + case LNET_MSG_GET: + if (payload_length > 0) { + CERROR("%s, src %s: bad %s payload %d (0 expected)\n", + libcfs_nidstr(from_nid), + libcfs_nidstr(&src_nid), + lnet_msgtyp2str(type), payload_length); + return -EPROTO; + } + break; + + case LNET_MSG_PUT: + case LNET_MSG_REPLY: + if (payload_length > + (__u32)(for_me ? LNET_MAX_PAYLOAD : LNET_MTU)) { + CERROR("%s, src %s: bad %s payload %d " + "(%d max expected)\n", + libcfs_nidstr(from_nid), + libcfs_nidstr(&src_nid), + lnet_msgtyp2str(type), + payload_length, + for_me ? 
LNET_MAX_PAYLOAD : LNET_MTU); + return -EPROTO; + } + break; + + default: + CERROR("%s, src %s: Bad message type 0x%x\n", + libcfs_nidstr(from_nid), + libcfs_nidstr(&src_nid), type); + return -EPROTO; + } + + /* Only update net_last_alive for incoming GETs on the reserved portal + * (i.e. incoming lnet/discovery pings). + * This avoids situations where the router's own traffic results in NI + * status changes + */ + if (the_lnet.ln_routing && type == LNET_MSG_GET && + hdr->msg.get.ptl_index == LNET_RESERVED_PORTAL && + !lnet_islocalnid(&src_nid) && + ni->ni_net->net_last_alive != now) { + lnet_ni_lock(ni); + spin_lock(&ni->ni_net->net_lock); + ni->ni_net->net_last_alive = now; + spin_unlock(&ni->ni_net->net_lock); + push = lnet_ni_set_status_locked(ni, LNET_NI_STATUS_UP); + lnet_ni_unlock(ni); + } + + if (push) + lnet_push_update_to_peers(1); + + /* Regard a bad destination NID as a protocol error. Senders should + * know what they're doing; if they don't they're misconfigured, buggy + * or malicious so we chop them off at the knees :) */ + + if (!for_me) { + if (LNET_NID_NET(&dest_nid) == LNET_NID_NET(&ni->ni_nid)) { + /* should have gone direct */ + CERROR("%s, src %s: Bad dest nid %s " + "(should have been sent direct)\n", + libcfs_nidstr(from_nid), + libcfs_nidstr(&src_nid), + libcfs_nidstr(&dest_nid)); + return -EPROTO; + } + + if (lnet_islocalnid(&dest_nid)) { + /* dest is another local NI; sender should have used + * this node's NID on its own network */ + CERROR("%s, src %s: Bad dest nid %s " + "(it's my nid but on a different network)\n", + libcfs_nidstr(from_nid), + libcfs_nidstr(&src_nid), + libcfs_nidstr(&dest_nid)); + return -EPROTO; + } + + if (rdma_req && type == LNET_MSG_GET) { + CERROR("%s, src %s: Bad optimized GET for %s " + "(final destination must be me)\n", + libcfs_nidstr(from_nid), + libcfs_nidstr(&src_nid), + libcfs_nidstr(&dest_nid)); + return -EPROTO; + } + + if (!the_lnet.ln_routing) { + CERROR("%s, src %s: Dropping message for %s " + "(routing not enabled)\n", + libcfs_nidstr(from_nid), + libcfs_nidstr(&src_nid), + libcfs_nidstr(&dest_nid)); + goto drop; + } + } + + /* Message looks OK; we're not going to return an error, so we MUST + * call back lnd_recv() come what may... */ + + if (!list_empty(&the_lnet.ln_test_peers) && /* normally we don't */ + fail_peer(&src_nid, 0)) { /* shall we now? */ + CERROR("%s, src %s: Dropping %s to simulate failure\n", + libcfs_nidstr(from_nid), libcfs_nidstr(&src_nid), + lnet_msgtyp2str(type)); + goto drop; + } + + /* FIXME need to support large-addr nid */ + if (!list_empty(&the_lnet.ln_drop_rules) && + lnet_drop_rule_match(hdr, lnet_nid_to_nid4(&ni->ni_nid), NULL)) { + CDEBUG(D_NET, + "%s, src %s, dst %s: Dropping %s to simulate silent message loss\n", + libcfs_nidstr(from_nid), libcfs_nidstr(&src_nid), + libcfs_nidstr(&dest_nid), lnet_msgtyp2str(type)); + goto drop; + } + + msg = lnet_msg_alloc(); + if (msg == NULL) { + CERROR("%s, src %s: Dropping %s (out of memory)\n", + libcfs_nidstr(from_nid), libcfs_nidstr(&src_nid), + lnet_msgtyp2str(type)); + goto drop; + } + + /* msg zeroed in lnet_msg_alloc; i.e. 
flags all clear, + * pointers NULL etc */ + + msg->msg_type = type; + msg->msg_private = private; + msg->msg_receiving = 1; + msg->msg_rdma_get = rdma_req; + msg->msg_len = msg->msg_wanted = payload_length; + msg->msg_offset = 0; + msg->msg_hdr = *hdr; + /* for building message event */ + msg->msg_from = *from_nid; + if (!for_me) { + msg->msg_target.pid = dest_pid; + msg->msg_target.nid = dest_nid; + msg->msg_routing = 1; + } + + lnet_net_lock(cpt); + lpni = lnet_peerni_by_nid_locked(from_nid, &ni->ni_nid, cpt); + if (IS_ERR(lpni)) { + lnet_net_unlock(cpt); + rc = PTR_ERR(lpni); + CERROR("%s, src %s: Dropping %s (error %d looking up sender)\n", + libcfs_nidstr(from_nid), libcfs_nidstr(&src_nid), + lnet_msgtyp2str(type), rc); + lnet_msg_free(msg); + if (rc == -ESHUTDOWN) + /* We are shutting down. Don't do anything more */ + return 0; + goto drop; + } + + /* If this message was forwarded to us from a router then we may need + * to update router aliveness or check for an asymmetrical route + * (or both) + */ + if (((lnet_drop_asym_route && for_me) || + !lpni->lpni_peer_net->lpn_peer->lp_alive) && + LNET_NID_NET(&src_nid) != LNET_NID_NET(from_nid)) { + __u32 src_net_id = LNET_NID_NET(&src_nid); + struct lnet_peer *gw = lpni->lpni_peer_net->lpn_peer; + struct lnet_route *route; + bool found = false; + + list_for_each_entry(route, &gw->lp_routes, lr_gwlist) { + if (route->lr_net == src_net_id) { + found = true; + /* If we're transitioning the gateway from + * dead -> alive, and discovery is disabled + * locally or on the gateway, then we need to + * update the cached route aliveness for each + * route to the src_nid's net. + * + * Otherwise, we're only checking for + * symmetrical route, and we can break the + * loop + */ + if (!gw->lp_alive && + lnet_is_discovery_disabled(gw)) + lnet_set_route_aliveness(route, true); + else + break; + } + } + if (lnet_drop_asym_route && for_me && !found) { + /* Drop ref taken by lnet_nid2peerni_locked() */ + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(cpt); + /* we would not use from_nid to route a message to + * src_nid + * => asymmetric routing detected but forbidden + */ + CERROR("%s, src %s: Dropping asymmetrical route %s\n", + libcfs_nidstr(from_nid), + libcfs_nidstr(&src_nid), lnet_msgtyp2str(type)); + lnet_msg_free(msg); + goto drop; + } + if (!gw->lp_alive) { + struct lnet_peer_net *lpn; + struct lnet_peer_ni *lpni2; + + gw->lp_alive = true; + /* Mark all remote NIs on src_nid's net UP */ + lpn = lnet_peer_get_net_locked(gw, src_net_id); + if (lpn) + list_for_each_entry(lpni2, &lpn->lpn_peer_nis, + lpni_peer_nis) + lpni2->lpni_ns_status = LNET_NI_STATUS_UP; + } + } + + lpni->lpni_last_alive = now; + + msg->msg_rxpeer = lpni; + msg->msg_rxni = ni; + lnet_ni_addref_locked(ni, cpt); + /* Multi-Rail: Primary NID of source. */ + lnet_peer_primary_nid_locked(&src_nid, &msg->msg_initiator); + + /* + * mark the status of this lpni as UP since we received a message + * from it. The ping response reports back the ns_status which is + * marked on the remote as up or down and we cache it here. 
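+	 * (The asymmetric-route handling above does the same for all
+	 * remote NIs on the gateway's net when a dead gateway comes
+	 * back to life.)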
+ */ + msg->msg_rxpeer->lpni_ns_status = LNET_NI_STATUS_UP; + + lnet_msg_commit(msg, cpt); + + /* message delay simulation */ + if (unlikely(!list_empty(&the_lnet.ln_delay_rules) && + lnet_delay_rule_match_locked(hdr, msg))) { + lnet_net_unlock(cpt); + return 0; + } + + if (!for_me) { + rc = lnet_parse_forward_locked(ni, msg); + lnet_net_unlock(cpt); + + if (rc < 0) + goto free_drop; + + if (rc == LNET_CREDIT_OK) { + lnet_ni_recv(ni, msg->msg_private, msg, 0, + 0, payload_length, payload_length); + } + return 0; + } + + lnet_net_unlock(cpt); + + rc = lnet_parse_local(ni, msg); + if (rc != 0) + goto free_drop; + return 0; + + free_drop: + LASSERT(msg->msg_md == NULL); + lnet_finalize(msg, rc); + + drop: + lnet_drop_message(ni, cpt, private, payload_length, type); + return 0; +} +EXPORT_SYMBOL(lnet_parse); + +void +lnet_drop_delayed_msg_list(struct list_head *head, char *reason) +{ + while (!list_empty(head)) { + struct lnet_processid id = {}; + struct lnet_msg *msg; + + msg = list_entry(head->next, struct lnet_msg, msg_list); + list_del(&msg->msg_list); + + id.nid = msg->msg_hdr.src_nid; + id.pid = msg->msg_hdr.src_pid; + + LASSERT(msg->msg_md == NULL); + LASSERT(msg->msg_rx_delayed); + LASSERT(msg->msg_rxpeer != NULL); + LASSERT(msg->msg_hdr.type == LNET_MSG_PUT); + + CWARN("Dropping delayed PUT from %s portal %d match %llu" + " offset %d length %d: %s\n", + libcfs_idstr(&id), + msg->msg_hdr.msg.put.ptl_index, + msg->msg_hdr.msg.put.match_bits, + msg->msg_hdr.msg.put.offset, + msg->msg_hdr.payload_length, reason); + + /* NB I can't drop msg's ref on msg_rxpeer until after I've + * called lnet_drop_message(), so I just hang onto msg as well + * until that's done */ + + lnet_drop_message(msg->msg_rxni, msg->msg_rx_cpt, + msg->msg_private, msg->msg_len, + msg->msg_type); + + msg->msg_no_resend = true; + /* + * NB: message will not generate event because w/o attached MD, + * but we still should give error code so lnet_msg_decommit() + * can skip counters operations and other checks. + */ + lnet_finalize(msg, -ENOENT); + } +} + +void +lnet_recv_delayed_msg_list(struct list_head *head) +{ + while (!list_empty(head)) { + struct lnet_msg *msg; + struct lnet_processid id; + + msg = list_entry(head->next, struct lnet_msg, msg_list); + list_del(&msg->msg_list); + + /* md won't disappear under me, since each msg + * holds a ref on it */ + + id.nid = msg->msg_hdr.src_nid; + id.pid = msg->msg_hdr.src_pid; + + LASSERT(msg->msg_rx_delayed); + LASSERT(msg->msg_md != NULL); + LASSERT(msg->msg_rxpeer != NULL); + LASSERT(msg->msg_rxni != NULL); + LASSERT(msg->msg_hdr.type == LNET_MSG_PUT); + + CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d " + "match %llu offset %d length %d.\n", + libcfs_idstr(&id), msg->msg_hdr.msg.put.ptl_index, + msg->msg_hdr.msg.put.match_bits, + msg->msg_hdr.msg.put.offset, + msg->msg_hdr.payload_length); + + lnet_recv_put(msg->msg_rxni, msg); + } +} + +static void +lnet_attach_rsp_tracker(struct lnet_rsp_tracker *rspt, int cpt, + struct lnet_libmd *md, struct lnet_handle_md mdh) +{ + s64 timeout_ns; + struct lnet_rsp_tracker *local_rspt; + + /* + * MD has a refcount taken by message so it's not going away. + * The MD however can be looked up. We need to secure the access + * to the md_rspt_ptr by taking the res_lock. + * The rspt can be accessed without protection up to when it gets + * added to the list. 
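+	 * Lock ordering below is lnet_res_lock(cpt) -> lnet_net_lock(cpt):
+	 * md_rspt_ptr is protected by the resource lock, while the
+	 * ln_mt_rstq[cpt] list itself is protected by the net lock.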
+ */ + + lnet_res_lock(cpt); + local_rspt = md->md_rspt_ptr; + timeout_ns = lnet_transaction_timeout * NSEC_PER_SEC; + if (local_rspt != NULL) { + /* + * we already have an rspt attached to the md, so we'll + * update the deadline on that one. + */ + lnet_rspt_free(rspt, cpt); + } else { + /* new md */ + rspt->rspt_mdh = mdh; + rspt->rspt_cpt = cpt; + /* store the rspt so we can access it when we get the REPLY */ + md->md_rspt_ptr = rspt; + local_rspt = rspt; + } + local_rspt->rspt_deadline = ktime_add_ns(ktime_get(), timeout_ns); + + /* + * add to the list of tracked responses. It's added to tail of the + * list in order to expire all the older entries first. + */ + lnet_net_lock(cpt); + list_move_tail(&local_rspt->rspt_on_list, the_lnet.ln_mt_rstq[cpt]); + lnet_net_unlock(cpt); + lnet_res_unlock(cpt); +} + +/** + * Initiate an asynchronous PUT operation. + * + * There are several events associated with a PUT: completion of the send on + * the initiator node (LNET_EVENT_SEND), and when the send completes + * successfully, the receipt of an acknowledgment (LNET_EVENT_ACK) indicating + * that the operation was accepted by the target. The event LNET_EVENT_PUT is + * used at the target node to indicate the completion of incoming data + * delivery. + * + * The local events will be logged in the EQ associated with the MD pointed to + * by \a mdh handle. Using a MD without an associated EQ results in these + * events being discarded. In this case, the caller must have another + * mechanism (e.g., a higher level protocol) for determining when it is safe + * to modify the memory region associated with the MD. + * + * Note that LNet does not guarantee the order of LNET_EVENT_SEND and + * LNET_EVENT_ACK, though intuitively ACK should happen after SEND. + * + * \param self Indicates the NID of a local interface through which to send + * the PUT request. Use LNET_NID_ANY to let LNet choose one by itself. + * \param mdh A handle for the MD that describes the memory to be sent. The MD + * must be "free floating" (See LNetMDBind()). + * \param ack Controls whether an acknowledgment is requested. + * Acknowledgments are only sent when they are requested by the initiating + * process and the target MD enables them. + * \param target A process identifier for the target process. + * \param portal The index in the \a target's portal table. + * \param match_bits The match bits to use for MD selection at the target + * process. + * \param offset The offset into the target MD (only used when the target + * MD has the LNET_MD_MANAGE_REMOTE option set). + * \param hdr_data 64 bits of user data that can be included in the message + * header. This data is written to an event queue entry at the target if an + * EQ is present on the matching MD. + * + * \retval 0 Success, and only in this case events will be generated + * and logged to EQ (if it exists). + * \retval -EIO Simulated failure. + * \retval -ENOMEM Memory allocation failure. + * \retval -ENOENT Invalid MD object. + * + * \see struct lnet_event::hdr_data and lnet_event_kind_t. 
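+ *
+ * A minimal usage sketch (illustrative only; @mdh obtained from
+ * LNetMDBind(), @id, @portal and @match_bits agreed with the peer):
+ *
+ *	rc = LNetPut(LNET_NID_ANY, mdh, LNET_ACK_REQ, id,
+ *		     portal, match_bits, 0, 0);
+ *	if (rc != 0)
+ *		CERROR("PUT to %s failed: %d\n", libcfs_id2str(id), rc);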
+ */ +int +LNetPut(lnet_nid_t self4, struct lnet_handle_md mdh, enum lnet_ack_req ack, + struct lnet_process_id target4, unsigned int portal, + __u64 match_bits, unsigned int offset, + __u64 hdr_data) +{ + struct lnet_msg *msg; + struct lnet_libmd *md; + int cpt; + int rc; + struct lnet_processid target; + struct lnet_rsp_tracker *rspt = NULL; + struct lnet_nid self; + + LASSERT(the_lnet.ln_refcount > 0); + + lnet_nid4_to_nid(self4, &self); + lnet_nid4_to_nid(target4.nid, &target.nid); + target.pid = target4.pid; + + if (!list_empty(&the_lnet.ln_test_peers) && /* normally we don't */ + fail_peer(&target.nid, 1)) { /* shall we now? */ + CERROR("Dropping PUT to %s: simulated failure\n", + libcfs_id2str(target4)); + return -EIO; + } + + msg = lnet_msg_alloc(); + if (msg == NULL) { + CERROR("Dropping PUT to %s: ENOMEM on struct lnet_msg\n", + libcfs_id2str(target4)); + return -ENOMEM; + } + msg->msg_vmflush = !!(current->flags & PF_MEMALLOC); + + cpt = lnet_cpt_of_cookie(mdh.cookie); + + if (ack == LNET_ACK_REQ) { + rspt = lnet_rspt_alloc(cpt); + if (!rspt) { + CERROR("Dropping PUT to %s: ENOMEM on response tracker\n", + libcfs_id2str(target4)); + return -ENOMEM; + } + INIT_LIST_HEAD(&rspt->rspt_on_list); + } + + lnet_res_lock(cpt); + + md = lnet_handle2md(&mdh); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + CERROR("Dropping PUT (%llu:%d:%s): MD (%d) invalid\n", + match_bits, portal, libcfs_id2str(target4), + md == NULL ? -1 : md->md_threshold); + if (md != NULL && md->md_me != NULL) + CERROR("Source MD also attached to portal %d\n", + md->md_me->me_portal); + lnet_res_unlock(cpt); + + if (rspt) + lnet_rspt_free(rspt, cpt); + + lnet_msg_free(msg); + return -ENOENT; + } + + CDEBUG(D_NET, "%s -> %s\n", __func__, libcfs_id2str(target4)); + + lnet_msg_attach_md(msg, md, 0, 0); + + lnet_prep_send(msg, LNET_MSG_PUT, &target, 0, md->md_length); + + msg->msg_hdr.msg.put.match_bits = cpu_to_le64(match_bits); + msg->msg_hdr.msg.put.ptl_index = cpu_to_le32(portal); + msg->msg_hdr.msg.put.offset = cpu_to_le32(offset); + msg->msg_hdr.msg.put.hdr_data = hdr_data; + + /* NB handles only looked up by creator (no flips) */ + if (ack == LNET_ACK_REQ) { + msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie = + the_lnet.ln_interface_cookie; + msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie = + md->md_lh.lh_cookie; + } else { + msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie = + LNET_WIRE_HANDLE_COOKIE_NONE; + msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie = + LNET_WIRE_HANDLE_COOKIE_NONE; + } + + lnet_res_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_SEND); + + if (rspt && lnet_response_tracking_enabled(LNET_MSG_PUT, + md->md_options)) + lnet_attach_rsp_tracker(rspt, cpt, md, mdh); + else if (rspt) + lnet_rspt_free(rspt, cpt); + + if (CFS_FAIL_CHECK_ORSET(CFS_FAIL_PTLRPC_OST_BULK_CB2, + CFS_FAIL_ONCE)) + rc = -EIO; + else + rc = lnet_send(&self, msg, NULL); + + if (rc != 0) { + CNETERR("Error sending PUT to %s: %d\n", + libcfs_id2str(target4), rc); + msg->msg_no_resend = true; + lnet_finalize(msg, rc); + } + + /* completion will be signalled by an event */ + return 0; +} +EXPORT_SYMBOL(LNetPut); + +/* + * The LND can DMA direct to the GET md (i.e. no REPLY msg). This + * returns a msg for the LND to pass to lnet_finalize() when the sink + * data has been received. 
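+ *
+ * Sketch of the expected LND-side flow (illustrative only):
+ *
+ *	msg = lnet_create_reply_msg(ni, getmsg);
+ *	if (msg != NULL) {
+ *		... RDMA the sink data straight into the GET MD ...
+ *		lnet_finalize(msg, status);
+ *	}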
+ * + * CAVEAT EMPTOR: 'getmsg' is the original GET, which is freed when + * lnet_finalize() is called on it, so the LND must call this first + */ +struct lnet_msg * +lnet_create_reply_msg(struct lnet_ni *ni, struct lnet_msg *getmsg) +{ + struct lnet_msg *msg = lnet_msg_alloc(); + struct lnet_libmd *getmd = getmsg->msg_md; + struct lnet_processid *peer_id = &getmsg->msg_target; + int cpt; + + LASSERT(!getmsg->msg_target_is_router); + LASSERT(!getmsg->msg_routing); + + if (msg == NULL) { + CERROR("%s: Dropping REPLY from %s: can't allocate msg\n", + libcfs_nidstr(&ni->ni_nid), libcfs_idstr(peer_id)); + goto drop; + } + + cpt = lnet_cpt_of_cookie(getmd->md_lh.lh_cookie); + lnet_res_lock(cpt); + + LASSERT(getmd->md_refcount > 0); + + if (getmd->md_threshold == 0) { + CERROR("%s: Dropping REPLY from %s for inactive MD %p\n", + libcfs_nidstr(&ni->ni_nid), libcfs_idstr(peer_id), + getmd); + lnet_res_unlock(cpt); + goto drop; + } + + LASSERT(getmd->md_offset == 0); + + CDEBUG(D_NET, "%s: Reply from %s md %p\n", + libcfs_nidstr(&ni->ni_nid), libcfs_idstr(peer_id), getmd); + + /* setup information for lnet_build_msg_event */ + msg->msg_initiator = + getmsg->msg_txpeer->lpni_peer_net->lpn_peer->lp_primary_nid; + msg->msg_from = peer_id->nid; + msg->msg_type = LNET_MSG_GET; /* flag this msg as an "optimized" GET */ + msg->msg_hdr.src_nid = peer_id->nid; + msg->msg_hdr.payload_length = getmd->md_length; + msg->msg_receiving = 1; /* required by lnet_msg_attach_md */ + + lnet_msg_attach_md(msg, getmd, getmd->md_offset, getmd->md_length); + lnet_res_unlock(cpt); + + cpt = lnet_nid2cpt(&peer_id->nid, ni); + + lnet_net_lock(cpt); + lnet_msg_commit(msg, cpt); + lnet_net_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_REPLY); + + return msg; + + drop: + cpt = lnet_nid2cpt(&peer_id->nid, ni); + + lnet_net_lock(cpt); + lnet_incr_stats(&ni->ni_stats, LNET_MSG_GET, LNET_STATS_TYPE_DROP); + the_lnet.ln_counters[cpt]->lct_common.lcc_drop_count++; + the_lnet.ln_counters[cpt]->lct_common.lcc_drop_length += + getmd->md_length; + lnet_net_unlock(cpt); + + if (msg != NULL) + lnet_msg_free(msg); + + return NULL; +} +EXPORT_SYMBOL(lnet_create_reply_msg); + +void +lnet_set_reply_msg_len(struct lnet_ni *ni, struct lnet_msg *reply, + unsigned int len) +{ + /* Set the REPLY length, now the RDMA that elides the REPLY message has + * completed and I know it. */ + LASSERT(reply != NULL); + LASSERT(reply->msg_type == LNET_MSG_GET); + LASSERT(reply->msg_ev.type == LNET_EVENT_REPLY); + + /* NB I trusted my peer to RDMA. If she tells me she's written beyond + * the end of my buffer, I might as well be dead. */ + LASSERT(len <= reply->msg_ev.mlength); + + reply->msg_ev.mlength = len; +} +EXPORT_SYMBOL(lnet_set_reply_msg_len); + +/** + * Initiate an asynchronous GET operation. + * + * On the initiator node, an LNET_EVENT_SEND is logged when the GET request + * is sent, and an LNET_EVENT_REPLY is logged when the data returned from + * the target node in the REPLY has been written to local MD. + * + * On the target node, an LNET_EVENT_GET is logged when the GET request + * arrives and is accepted into a MD. + * + * \param self,target,portal,match_bits,offset See the discussion in LNetPut(). + * \param mdh A handle for the MD that describes the memory into which the + * requested data will be received. The MD must be "free floating" (See LNetMDBind()). + * + * \retval 0 Success, and only in this case events will be generated + * and logged to EQ (if it exists) of the MD. + * \retval -EIO Simulated failure. 
+ * \retval -ENOMEM Memory allocation failure. + * \retval -ENOENT Invalid MD object. + */ +int +LNetGet(lnet_nid_t self4, struct lnet_handle_md mdh, + struct lnet_process_id target4, unsigned int portal, + __u64 match_bits, unsigned int offset, bool recovery) +{ + struct lnet_msg *msg; + struct lnet_libmd *md; + struct lnet_rsp_tracker *rspt; + int cpt; + int rc; + struct lnet_nid self; + struct lnet_processid target; + + LASSERT(the_lnet.ln_refcount > 0); + + lnet_nid4_to_nid(self4, &self); + lnet_nid4_to_nid(target4.nid, &target.nid); + target.pid = target4.pid; + + if (!list_empty(&the_lnet.ln_test_peers) && /* normally we don't */ + fail_peer(&target.nid, 1)) /* shall we now? */ + { + CERROR("Dropping GET to %s: simulated failure\n", + libcfs_id2str(target4)); + return -EIO; + } + + msg = lnet_msg_alloc(); + if (!msg) { + CERROR("Dropping GET to %s: ENOMEM on struct lnet_msg\n", + libcfs_id2str(target4)); + return -ENOMEM; + } + + cpt = lnet_cpt_of_cookie(mdh.cookie); + + rspt = lnet_rspt_alloc(cpt); + if (!rspt) { + CERROR("Dropping GET to %s: ENOMEM on response tracker\n", + libcfs_id2str(target4)); + return -ENOMEM; + } + INIT_LIST_HEAD(&rspt->rspt_on_list); + + msg->msg_recovery = recovery; + + lnet_res_lock(cpt); + + md = lnet_handle2md(&mdh); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + CERROR("Dropping GET (%llu:%d:%s): MD (%d) invalid\n", + match_bits, portal, libcfs_id2str(target4), + md == NULL ? -1 : md->md_threshold); + if (md != NULL && md->md_me != NULL) + CERROR("REPLY MD also attached to portal %d\n", + md->md_me->me_portal); + + lnet_res_unlock(cpt); + + lnet_msg_free(msg); + lnet_rspt_free(rspt, cpt); + return -ENOENT; + } + + CDEBUG(D_NET, "%s -> %s\n", __func__, libcfs_id2str(target4)); + + lnet_msg_attach_md(msg, md, 0, 0); + + lnet_prep_send(msg, LNET_MSG_GET, &target, 0, 0); + + msg->msg_hdr.msg.get.match_bits = cpu_to_le64(match_bits); + msg->msg_hdr.msg.get.ptl_index = cpu_to_le32(portal); + msg->msg_hdr.msg.get.src_offset = cpu_to_le32(offset); + msg->msg_hdr.msg.get.sink_length = cpu_to_le32(md->md_length); + + /* NB handles only looked up by creator (no flips) */ + msg->msg_hdr.msg.get.return_wmd.wh_interface_cookie = + the_lnet.ln_interface_cookie; + msg->msg_hdr.msg.get.return_wmd.wh_object_cookie = + md->md_lh.lh_cookie; + + lnet_res_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_SEND); + + if (lnet_response_tracking_enabled(LNET_MSG_GET, md->md_options)) + lnet_attach_rsp_tracker(rspt, cpt, md, mdh); + else + lnet_rspt_free(rspt, cpt); + + rc = lnet_send(&self, msg, NULL); + if (rc < 0) { + CNETERR("Error sending GET to %s: %d\n", + libcfs_id2str(target4), rc); + msg->msg_no_resend = true; + lnet_finalize(msg, rc); + } + + /* completion will be signalled by an event */ + return 0; +} +EXPORT_SYMBOL(LNetGet); + +/** + * Calculate distance to node at \a dstnid. + * + * \param dstnid Target NID. + * \param srcnidp If not NULL, NID of the local interface to reach \a dstnid + * is saved here. + * \param orderp If not NULL, order of the route to reach \a dstnid is saved + * here. + * + * \retval 0 If \a dstnid belongs to a local interface, and reserved option + * local_nid_dist_zero is set, which is the default. + * \retval positives Distance to target NID, i.e. number of hops plus one. + * \retval -EHOSTUNREACH If \a dstnid is not reachable. 
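+ *
+ * For example (sketch), a caller can distinguish the cases like so:
+ *
+ *	lnet_nid_t src;
+ *	__u32 order;
+ *	int dist = LNetDist(dstnid, &src, &order);
+ *
+ *	dist == 0 : dstnid is a local interface (local_nid_dist_zero set)
+ *	dist > 0  : dstnid is dist - 1 hops away, reachable through src
+ *	dist < 0  : dstnid is unreachable (-EHOSTUNREACH)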
+ */ +int +LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) +{ + struct list_head *e; + struct lnet_ni *ni = NULL; + struct lnet_remotenet *rnet; + __u32 dstnet = LNET_NIDNET(dstnid); + int hops; + int cpt; + __u32 order = 2; + struct list_head *rn_list; + bool matched_dstnet = false; + + /* if !local_nid_dist_zero, I don't return a distance of 0 ever + * (when lustre sees a distance of 0, it substitutes 0@lo), so I + * keep order 0 free for 0@lo and order 1 free for a local NID + * match */ + + LASSERT(the_lnet.ln_refcount > 0); + + cpt = lnet_net_lock_current(); + + while ((ni = lnet_get_next_ni_locked(NULL, ni))) { + /* FIXME support large-addr nid */ + if (lnet_nid_to_nid4(&ni->ni_nid) == dstnid) { + if (srcnidp != NULL) + *srcnidp = dstnid; + if (orderp != NULL) { + if (dstnid == LNET_NID_LO_0) + *orderp = 0; + else + *orderp = 1; + } + lnet_net_unlock(cpt); + + return local_nid_dist_zero ? 0 : 1; + } + + if (!matched_dstnet && LNET_NID_NET(&ni->ni_nid) == dstnet) { + matched_dstnet = true; + /* We matched the destination net, but we may have + * additional local NIs to inspect. + * + * We record the nid and order as appropriate, but + * they may be overwritten if we match local NI above. + */ + if (srcnidp) + /* FIXME support large-addr nids */ + *srcnidp = lnet_nid_to_nid4(&ni->ni_nid); + + if (orderp) { + /* Check if ni was originally created in + * current net namespace. + * If not, assign order above 0xffff0000, + * to make this ni not a priority. + */ + if (current->nsproxy && + !net_eq(ni->ni_net_ns, + current->nsproxy->net_ns)) + *orderp = order + 0xffff0000; + else + *orderp = order; + } + } + + order++; + } + + if (matched_dstnet) { + lnet_net_unlock(cpt); + return 1; + } + + rn_list = lnet_net2rnethash(dstnet); + list_for_each(e, rn_list) { + rnet = list_entry(e, struct lnet_remotenet, lrn_list); + + if (rnet->lrn_net == dstnet) { + struct lnet_route *route; + struct lnet_route *shortest = NULL; + __u32 shortest_hops = LNET_UNDEFINED_HOPS; + __u32 route_hops; + + LASSERT(!list_empty(&rnet->lrn_routes)); + + list_for_each_entry(route, &rnet->lrn_routes, + lr_list) { + route_hops = route->lr_hops; + if (route_hops == LNET_UNDEFINED_HOPS) + route_hops = 1; + if (shortest == NULL || + route_hops < shortest_hops) { + shortest = route; + shortest_hops = route_hops; + } + } + + LASSERT(shortest != NULL); + hops = shortest_hops; + if (srcnidp != NULL) { + struct lnet_net *net; + net = lnet_get_net_locked(shortest->lr_lnet); + LASSERT(net); + ni = lnet_get_next_ni_locked(net, NULL); + /* FIXME support large-addr nids */ + *srcnidp = lnet_nid_to_nid4(&ni->ni_nid); + } + if (orderp != NULL) + *orderp = order; + lnet_net_unlock(cpt); + return hops + 1; + } + order++; + } + + lnet_net_unlock(cpt); + return -EHOSTUNREACH; +} +EXPORT_SYMBOL(LNetDist); diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-msg.c b/drivers/staging/lustrefsx/lnet/lnet/lib-msg.c new file mode 100644 index 0000000000000..ed979bcbd9d08 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-msg.c @@ -0,0 +1,1346 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/lnet/lib-msg.c + * + * Message decoding, parsing and finalizing routines + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +void +lnet_build_unlink_event(struct lnet_libmd *md, struct lnet_event *ev) +{ + ENTRY; + + memset(ev, 0, sizeof(*ev)); + + ev->status = 0; + ev->unlinked = 1; + ev->type = LNET_EVENT_UNLINK; + lnet_md_deconstruct(md, ev); + lnet_md2handle(&ev->md_handle, md); + EXIT; +} + +/* + * Don't need any lock, must be called after lnet_commit_md + */ +void +lnet_build_msg_event(struct lnet_msg *msg, enum lnet_event_kind ev_type) +{ + struct lnet_hdr *hdr = &msg->msg_hdr; + struct lnet_event *ev = &msg->msg_ev; + + LASSERT(!msg->msg_routing); + + ev->type = ev_type; + ev->msg_type = msg->msg_type; + + if (ev_type == LNET_EVENT_SEND) { + /* event for active message */ + ev->target.nid = hdr->dest_nid; + ev->target.pid = hdr->dest_pid; + ev->initiator.nid = LNET_ANY_NID; + ev->initiator.pid = the_lnet.ln_pid; + ev->source.nid = LNET_ANY_NID; + ev->source.pid = the_lnet.ln_pid; + ev->sender = LNET_ANY_NID; + } else { + /* event for passive message */ + ev->target.pid = hdr->dest_pid; + ev->target.nid = hdr->dest_nid; + ev->initiator.pid = hdr->src_pid; + /* Multi-Rail: resolve src_nid to "primary" peer NID */ + ev->initiator.nid = msg->msg_initiator; + /* Multi-Rail: track source NID. 
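+		 * Unlike initiator.nid above, which is resolved to the
+		 * peer's primary NID, source.nid is the exact NID this
+		 * message was sent from.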
*/ + ev->source.pid = hdr->src_pid; + ev->source.nid = hdr->src_nid; + ev->rlength = hdr->payload_length; + ev->sender = msg->msg_from; + ev->mlength = msg->msg_wanted; + ev->offset = msg->msg_offset; + } + + switch (ev_type) { + default: + LBUG(); + + case LNET_EVENT_PUT: /* passive PUT */ + ev->pt_index = hdr->msg.put.ptl_index; + ev->match_bits = hdr->msg.put.match_bits; + ev->hdr_data = hdr->msg.put.hdr_data; + return; + + case LNET_EVENT_GET: /* passive GET */ + ev->pt_index = hdr->msg.get.ptl_index; + ev->match_bits = hdr->msg.get.match_bits; + ev->hdr_data = 0; + return; + + case LNET_EVENT_ACK: /* ACK */ + ev->match_bits = hdr->msg.ack.match_bits; + ev->mlength = hdr->msg.ack.mlength; + return; + + case LNET_EVENT_REPLY: /* REPLY */ + return; + + case LNET_EVENT_SEND: /* active message */ + if (msg->msg_type == LNET_MSG_PUT) { + ev->pt_index = le32_to_cpu(hdr->msg.put.ptl_index); + ev->match_bits = le64_to_cpu(hdr->msg.put.match_bits); + ev->offset = le32_to_cpu(hdr->msg.put.offset); + ev->mlength = + ev->rlength = le32_to_cpu(hdr->payload_length); + ev->hdr_data = le64_to_cpu(hdr->msg.put.hdr_data); + + } else { + LASSERT(msg->msg_type == LNET_MSG_GET); + ev->pt_index = le32_to_cpu(hdr->msg.get.ptl_index); + ev->match_bits = le64_to_cpu(hdr->msg.get.match_bits); + ev->mlength = + ev->rlength = le32_to_cpu(hdr->msg.get.sink_length); + ev->offset = le32_to_cpu(hdr->msg.get.src_offset); + ev->hdr_data = 0; + } + return; + } +} + +void +lnet_msg_commit(struct lnet_msg *msg, int cpt) +{ + struct lnet_msg_container *container = the_lnet.ln_msg_containers[cpt]; + struct lnet_counters_common *common; + s64 timeout_ns; + + /* set the message deadline */ + timeout_ns = lnet_transaction_timeout * NSEC_PER_SEC; + msg->msg_deadline = ktime_add_ns(ktime_get(), timeout_ns); + + /* routed message can be committed for both receiving and sending */ + LASSERT(!msg->msg_tx_committed); + + if (msg->msg_sending) { + LASSERT(!msg->msg_receiving); + msg->msg_tx_cpt = cpt; + msg->msg_tx_committed = 1; + if (msg->msg_rx_committed) { /* routed message REPLY */ + LASSERT(msg->msg_onactivelist); + return; + } + } else { + LASSERT(!msg->msg_sending); + msg->msg_rx_cpt = cpt; + msg->msg_rx_committed = 1; + } + + LASSERT(!msg->msg_onactivelist); + + msg->msg_onactivelist = 1; + list_add_tail(&msg->msg_activelist, &container->msc_active); + + common = &the_lnet.ln_counters[cpt]->lct_common; + common->lcc_msgs_alloc++; + if (common->lcc_msgs_alloc > common->lcc_msgs_max) + common->lcc_msgs_max = common->lcc_msgs_alloc; +} + +static void +lnet_msg_decommit_tx(struct lnet_msg *msg, int status) +{ + struct lnet_counters_common *common; + struct lnet_event *ev = &msg->msg_ev; + + LASSERT(msg->msg_tx_committed); + if (status != 0) + goto out; + + common = &(the_lnet.ln_counters[msg->msg_tx_cpt]->lct_common); + switch (ev->type) { + default: /* routed message */ + LASSERT(msg->msg_routing); + LASSERT(msg->msg_rx_committed); + LASSERT(ev->type == 0); + + common->lcc_route_length += msg->msg_len; + common->lcc_route_count++; + goto incr_stats; + + case LNET_EVENT_PUT: + /* should have been decommitted */ + LASSERT(!msg->msg_rx_committed); + /* overwritten while sending ACK */ + LASSERT(msg->msg_type == LNET_MSG_ACK); + msg->msg_type = LNET_MSG_PUT; /* fix type */ + break; + + case LNET_EVENT_SEND: + LASSERT(!msg->msg_rx_committed); + if (msg->msg_type == LNET_MSG_PUT) + common->lcc_send_length += msg->msg_len; + break; + + case LNET_EVENT_GET: + LASSERT(msg->msg_rx_committed); + /* overwritten while sending reply, we 
should never be + * here for optimized GET */ + LASSERT(msg->msg_type == LNET_MSG_REPLY); + msg->msg_type = LNET_MSG_GET; /* fix type */ + break; + } + + common->lcc_send_count++; + +incr_stats: + if (msg->msg_txpeer) + lnet_incr_stats(&msg->msg_txpeer->lpni_stats, + msg->msg_type, + LNET_STATS_TYPE_SEND); + if (msg->msg_txni) + lnet_incr_stats(&msg->msg_txni->ni_stats, + msg->msg_type, + LNET_STATS_TYPE_SEND); + out: + lnet_return_tx_credits_locked(msg); + msg->msg_tx_committed = 0; +} + +static void +lnet_msg_decommit_rx(struct lnet_msg *msg, int status) +{ + struct lnet_counters_common *common; + struct lnet_event *ev = &msg->msg_ev; + + LASSERT(!msg->msg_tx_committed); /* decommitted or never committed */ + LASSERT(msg->msg_rx_committed); + + if (status != 0) + goto out; + + common = &(the_lnet.ln_counters[msg->msg_rx_cpt]->lct_common); + switch (ev->type) { + default: + LASSERT(ev->type == 0); + LASSERT(msg->msg_routing); + goto incr_stats; + + case LNET_EVENT_ACK: + LASSERT(msg->msg_type == LNET_MSG_ACK); + break; + + case LNET_EVENT_GET: + /* type is "REPLY" if it's an optimized GET on passive side, + * because optimized GET will never be committed for sending, + * so message type wouldn't be changed back to "GET" by + * lnet_msg_decommit_tx(), see details in lnet_parse_get() */ + LASSERT(msg->msg_type == LNET_MSG_REPLY || + msg->msg_type == LNET_MSG_GET); + common->lcc_send_length += msg->msg_wanted; + break; + + case LNET_EVENT_PUT: + LASSERT(msg->msg_type == LNET_MSG_PUT); + break; + + case LNET_EVENT_REPLY: + /* type is "GET" if it's an optimized GET on active side, + * see details in lnet_create_reply_msg() */ + LASSERT(msg->msg_type == LNET_MSG_GET || + msg->msg_type == LNET_MSG_REPLY); + break; + } + + common->lcc_recv_count++; + +incr_stats: + if (msg->msg_rxpeer) + lnet_incr_stats(&msg->msg_rxpeer->lpni_stats, + msg->msg_type, + LNET_STATS_TYPE_RECV); + if (msg->msg_rxni) + lnet_incr_stats(&msg->msg_rxni->ni_stats, + msg->msg_type, + LNET_STATS_TYPE_RECV); + if (ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_REPLY) + common->lcc_recv_length += msg->msg_wanted; + + out: + lnet_return_rx_credits_locked(msg); + msg->msg_rx_committed = 0; +} + +void +lnet_msg_decommit(struct lnet_msg *msg, int cpt, int status) +{ + int cpt2 = cpt; + + LASSERT(msg->msg_tx_committed || msg->msg_rx_committed); + LASSERT(msg->msg_onactivelist); + + if (msg->msg_tx_committed) { /* always decommit for sending first */ + LASSERT(cpt == msg->msg_tx_cpt); + lnet_msg_decommit_tx(msg, status); + } + + if (msg->msg_rx_committed) { + /* forwarding msg committed for both receiving and sending */ + if (cpt != msg->msg_rx_cpt) { + lnet_net_unlock(cpt); + cpt2 = msg->msg_rx_cpt; + lnet_net_lock(cpt2); + } + lnet_msg_decommit_rx(msg, status); + } + + list_del(&msg->msg_activelist); + msg->msg_onactivelist = 0; + + the_lnet.ln_counters[cpt2]->lct_common.lcc_msgs_alloc--; + + if (cpt2 != cpt) { + lnet_net_unlock(cpt2); + lnet_net_lock(cpt); + } +} + +void +lnet_msg_attach_md(struct lnet_msg *msg, struct lnet_libmd *md, + unsigned int offset, unsigned int mlen) +{ + /* NB: @offset and @len are only useful for receiving */ + /* Here, we attach the MD on lnet_msg and mark it busy and + * decrementing its threshold. Come what may, the lnet_msg "owns" + * the MD until a call to lnet_msg_detach_md or lnet_finalize() + * signals completion. 
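+	 * A threshold of LNET_MD_THRESH_INF means the MD never expires;
+	 * otherwise each attach consumes one of the remaining operations
+	 * allowed on the MD.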
*/
+	LASSERT(!msg->msg_routing);
+
+	msg->msg_md = md;
+	if (msg->msg_receiving) { /* committed for receiving */
+		msg->msg_offset = offset;
+		msg->msg_wanted = mlen;
+	}
+
+	md->md_refcount++;
+	if (md->md_threshold != LNET_MD_THRESH_INF) {
+		LASSERT(md->md_threshold > 0);
+		md->md_threshold--;
+	}
+
+	/* build umd in event */
+	lnet_md2handle(&msg->msg_ev.md_handle, md);
+	lnet_md_deconstruct(md, &msg->msg_ev);
+}
+
+static int
+lnet_complete_msg_locked(struct lnet_msg *msg, int cpt)
+{
+	struct lnet_handle_wire ack_wmd;
+	int rc;
+	int status = msg->msg_ev.status;
+
+	LASSERT(msg->msg_onactivelist);
+
+	if (status == 0 && msg->msg_ack) {
+		/* Only send an ACK if the PUT completed successfully */
+
+		lnet_msg_decommit(msg, cpt, 0);
+
+		msg->msg_ack = 0;
+		lnet_net_unlock(cpt);
+
+		LASSERT(msg->msg_ev.type == LNET_EVENT_PUT);
+		LASSERT(!msg->msg_routing);
+
+		ack_wmd = msg->msg_hdr.msg.put.ack_wmd;
+
+		lnet_prep_send(msg, LNET_MSG_ACK, &msg->msg_ev.source, 0, 0);
+
+		msg->msg_hdr.msg.ack.dst_wmd = ack_wmd;
+		msg->msg_hdr.msg.ack.match_bits = msg->msg_ev.match_bits;
+		msg->msg_hdr.msg.ack.mlength = cpu_to_le32(msg->msg_ev.mlength);
+
+		rc = lnet_send(&msg->msg_ev.target.nid, msg,
+			       &msg->msg_from);
+
+		lnet_net_lock(cpt);
+		/*
+		 * NB: the message is committed for sending; we should return
+		 * on success because the LND will finalize it later.
+		 *
+		 * Also, there is a possibility that the message is committed
+		 * for sending but fails before being delivered to the LND,
+		 * i.e. ENOMEM. In that case we can't fall through either,
+		 * because the CPT for sending can be different from the CPT
+		 * for receiving, so we should return back to lnet_finalize()
+		 * to make sure we are locking the correct partition.
+		 */
+		return rc;
+
+	} else if (status == 0 &&	/* OK so far */
+		   (msg->msg_routing && !msg->msg_sending)) {
+		/* not forwarded */
+		LASSERT(!msg->msg_receiving);	/* called back recv already */
+		lnet_net_unlock(cpt);
+
+		rc = lnet_send(NULL, msg, NULL);
+
+		lnet_net_lock(cpt);
+		/*
+		 * NB: the message is committed for sending; we should return
+		 * on success because the LND will finalize it later.
+		 *
+		 * Also, there is a possibility that the message is committed
+		 * for sending but fails before being delivered to the LND,
+		 * i.e. ENOMEM. In that case we can't fall through either:
+		 * - The rule is that a message must decommit for sending
+		 *   first if it's committed for both sending and receiving
+		 * - The CPT for sending can be different from the CPT for
+		 *   receiving, so we should return back to lnet_finalize()
+		 *   to make sure we are locking the correct partition.
+		 */
+		return rc;
+	}
+
+	lnet_msg_decommit(msg, cpt, status);
+	lnet_msg_free(msg);
+	return 0;
+}
+
+static void
+lnet_dec_healthv_locked(atomic_t *healthv, int sensitivity)
+{
+	int h = atomic_read(healthv);
+
+	if (h < sensitivity) {
+		atomic_set(healthv, 0);
+	} else {
+		h -= sensitivity;
+		atomic_set(healthv, h);
+	}
+}
+
+/* must hold net_lock/0 */
+void
+lnet_ni_add_to_recoveryq_locked(struct lnet_ni *ni,
+				struct list_head *recovery_queue, time64_t now)
+{
+	if (!list_empty(&ni->ni_recovery))
+		return;
+
+	if (atomic_read(&ni->ni_healthv) == LNET_MAX_HEALTH_VALUE)
+		return;
+
+	/* This NI is going on the recovery queue, so take a ref on it */
+	lnet_ni_addref_locked(ni, 0);
+
+	lnet_ni_set_next_ping(ni, now);
+
+	CDEBUG(D_NET, "%s added to recovery queue. ping count: %u next ping: %lld health: %d\n",
+	       libcfs_nidstr(&ni->ni_nid),
+	       ni->ni_ping_count,
+	       ni->ni_next_ping,
+	       atomic_read(&ni->ni_healthv));
+
+	list_add_tail(&ni->ni_recovery, recovery_queue);
+}
+
+static void
+lnet_handle_local_failure(struct lnet_ni *local_ni)
+{
+	/*
+	 * the lnet_net_lock(0) is used to protect the addref on the ni
+	 * and the recovery queue.
+	 */
+	lnet_net_lock(0);
+	/* the mt could've shutdown and cleaned up the queues */
+	if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) {
+		lnet_net_unlock(0);
+		return;
+	}
+
+	lnet_dec_healthv_locked(&local_ni->ni_healthv, lnet_health_sensitivity);
+	lnet_ni_add_to_recoveryq_locked(local_ni, &the_lnet.ln_mt_localNIRecovq,
+					ktime_get_seconds());
+	lnet_net_unlock(0);
+}
+
+/* must hold net_lock/0 */
+void
+lnet_handle_remote_failure_locked(struct lnet_peer_ni *lpni)
+{
+	__u32 sensitivity = lnet_health_sensitivity;
+	__u32 lp_sensitivity;
+
+	/*
+	 * If there is a health sensitivity in the peer then use that
+	 * instead of the globally set one.
+	 */
+	lp_sensitivity = lpni->lpni_peer_net->lpn_peer->lp_health_sensitivity;
+	if (lp_sensitivity)
+		sensitivity = lp_sensitivity;
+
+	lnet_dec_healthv_locked(&lpni->lpni_healthv, sensitivity);
+
+	/* update the peer_net's health value */
+	lnet_update_peer_net_healthv(lpni);
+
+	/*
+	 * add the peer NI to the recovery queue if it's not already there
+	 * and its health value is actually below the maximum. It's
+	 * possible that the sensitivity might be set to 0, and the health
+	 * value will not be reduced. In this case, there is no reason to
+	 * invoke recovery.
+	 */
+	lnet_peer_ni_add_to_recoveryq_locked(lpni,
+					     &the_lnet.ln_mt_peerNIRecovq,
+					     ktime_get_seconds());
+}
+
+static void
+lnet_handle_remote_failure(struct lnet_peer_ni *lpni)
+{
+	/* lpni could be NULL if we're in the LOLND case */
+	if (!lpni)
+		return;
+
+	lnet_net_lock(0);
+	/* the mt could've shutdown and cleaned up the queues */
+	if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) {
+		lnet_net_unlock(0);
+		return;
+	}
+	lnet_handle_remote_failure_locked(lpni);
+	lnet_net_unlock(0);
+}
+
+static void
+lnet_incr_hstats(struct lnet_ni *ni, struct lnet_peer_ni *lpni,
+		 enum lnet_msg_hstatus hstatus)
+{
+	struct lnet_counters_health *health;
+
+	health = &the_lnet.ln_counters[0]->lct_health;
+
+	switch (hstatus) {
+	case LNET_MSG_STATUS_LOCAL_INTERRUPT:
+		atomic_inc(&ni->ni_hstats.hlt_local_interrupt);
+		health->lch_local_interrupt_count++;
+		break;
+	case LNET_MSG_STATUS_LOCAL_DROPPED:
+		atomic_inc(&ni->ni_hstats.hlt_local_dropped);
+		health->lch_local_dropped_count++;
+		break;
+	case LNET_MSG_STATUS_LOCAL_ABORTED:
+		atomic_inc(&ni->ni_hstats.hlt_local_aborted);
+		health->lch_local_aborted_count++;
+		break;
+	case LNET_MSG_STATUS_LOCAL_NO_ROUTE:
+		atomic_inc(&ni->ni_hstats.hlt_local_no_route);
+		health->lch_local_no_route_count++;
+		break;
+	case LNET_MSG_STATUS_LOCAL_TIMEOUT:
+		atomic_inc(&ni->ni_hstats.hlt_local_timeout);
+		health->lch_local_timeout_count++;
+		break;
+	case LNET_MSG_STATUS_LOCAL_ERROR:
+		atomic_inc(&ni->ni_hstats.hlt_local_error);
+		health->lch_local_error_count++;
+		break;
+	case LNET_MSG_STATUS_REMOTE_DROPPED:
+		if (lpni)
+			atomic_inc(&lpni->lpni_hstats.hlt_remote_dropped);
+		health->lch_remote_dropped_count++;
+		break;
+	case LNET_MSG_STATUS_REMOTE_ERROR:
+		if (lpni)
+			atomic_inc(&lpni->lpni_hstats.hlt_remote_error);
+		health->lch_remote_error_count++;
+		break;
+	case LNET_MSG_STATUS_REMOTE_TIMEOUT:
+		if (lpni)
+			atomic_inc(&lpni->lpni_hstats.hlt_remote_timeout);
+
health->lch_remote_timeout_count++; + break; + case LNET_MSG_STATUS_NETWORK_TIMEOUT: + if (lpni) + atomic_inc(&lpni->lpni_hstats.hlt_network_timeout); + health->lch_network_timeout_count++; + break; + case LNET_MSG_STATUS_OK: + break; + default: + LBUG(); + } +} + +static void +lnet_resend_msg_locked(struct lnet_msg *msg) +{ + msg->msg_retry_count++; + + /* + * remove message from the active list and reset it to prepare + * for a resend. Two exceptions to this + * + * 1. the router case. When a message is being routed it is + * committed for rx when received and committed for tx when + * forwarded. We don't want to remove it from the active list, since + * code which handles receiving expects it to remain on the active + * list. + * + * 2. The REPLY case. Reply messages use the same message + * structure for the GET that was received. + */ + if (!msg->msg_routing && msg->msg_type != LNET_MSG_REPLY) { + list_del_init(&msg->msg_activelist); + msg->msg_onactivelist = 0; + } + /* + * The msg_target.nid which was originally set + * when calling LNetGet() or LNetPut() might've + * been overwritten if we're routing this message. + * Call lnet_msg_decommit_tx() to return the credit + * this message consumed. The message will + * consume another credit when it gets resent. + */ + msg->msg_target.nid = msg->msg_hdr.dest_nid; + lnet_msg_decommit_tx(msg, -EAGAIN); + msg->msg_sending = 0; + msg->msg_receiving = 0; + msg->msg_target_is_router = 0; + + CDEBUG(D_NET, "%s->%s:%s:%s - queuing msg (%p) for resend\n", + libcfs_nidstr(&msg->msg_hdr.src_nid), + libcfs_nidstr(&msg->msg_hdr.dest_nid), + lnet_msgtyp2str(msg->msg_type), + lnet_health_error2str(msg->msg_health_status), msg); + + list_add_tail(&msg->msg_list, the_lnet.ln_mt_resendqs[msg->msg_tx_cpt]); + + complete(&the_lnet.ln_mt_wait_complete); +} + +int +lnet_check_finalize_recursion_locked(struct lnet_msg *msg, + struct list_head *containerq, + int nworkers, void **workers) +{ + int my_slot = -1; + int i; + + list_add_tail(&msg->msg_list, containerq); + + for (i = 0; i < nworkers; i++) { + if (workers[i] == current) + break; + + if (my_slot < 0 && workers[i] == NULL) + my_slot = i; + } + + if (i < nworkers || my_slot < 0) + return -1; + + workers[my_slot] = current; + + return my_slot; +} + +int +lnet_attempt_msg_resend(struct lnet_msg *msg) +{ + struct lnet_msg_container *container; + int my_slot; + int cpt; + + /* we can only resend tx_committed messages */ + LASSERT(msg->msg_tx_committed); + + /* don't resend recovery messages */ + if (msg->msg_recovery) { + CDEBUG(D_NET, "msg %s->%s is a recovery ping. retry# %d\n", + libcfs_nidstr(&msg->msg_from), + libcfs_nidstr(&msg->msg_target.nid), + msg->msg_retry_count); + return -ENOTRECOVERABLE; + } + + /* + * if we explicitly indicated we don't want to resend then just + * return + */ + if (msg->msg_no_resend) { + CDEBUG(D_NET, "msg %s->%s requested no resend. 
retry# %d\n",
+		       libcfs_nidstr(&msg->msg_from),
+		       libcfs_nidstr(&msg->msg_target.nid),
+		       msg->msg_retry_count);
+		return -ENOTRECOVERABLE;
+	}
+
+	/* check if the message has exceeded the number of retries */
+	if (msg->msg_retry_count >= lnet_retry_count) {
+		CNETERR("msg %s->%s exceeded retry count %d\n",
+			libcfs_nidstr(&msg->msg_from),
+			libcfs_nidstr(&msg->msg_target.nid),
+			msg->msg_retry_count);
+		return -ENOTRECOVERABLE;
+	}
+
+	cpt = msg->msg_tx_cpt;
+	lnet_net_lock(cpt);
+
+	/* check again under lock */
+	if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) {
+		lnet_net_unlock(cpt);
+		return -ESHUTDOWN;
+	}
+
+	container = the_lnet.ln_msg_containers[cpt];
+	my_slot =
+		lnet_check_finalize_recursion_locked(msg,
+						     &container->msc_resending,
+						     container->msc_nfinalizers,
+						     container->msc_resenders);
+
+	/* enough threads are resending */
+	if (my_slot == -1) {
+		lnet_net_unlock(cpt);
+		return 0;
+	}
+
+	while (!list_empty(&container->msc_resending)) {
+		msg = list_entry(container->msc_resending.next,
+				 struct lnet_msg, msg_list);
+		list_del(&msg->msg_list);
+
+		/*
+		 * resending the message will require us to call
+		 * lnet_msg_decommit_tx() which will return the credit
+		 * which this message holds. This could trigger another
+		 * queued message to be sent. If that message fails and
+		 * requires a resend we will recurse.
+		 * But since at this point the slot is taken, the message
+		 * will be queued in the container and dealt with
+		 * later. This breaks the recursion.
+		 */
+		lnet_resend_msg_locked(msg);
+	}
+
+	/*
+	 * msc_resenders is an array of process pointers. Each entry holds
+	 * a pointer to the current process operating on the message. An
+	 * array entry is created per CPT. If the array slot is already
+	 * set, then it means that there is a thread on the CPT currently
+	 * resending a message.
+	 * Once the thread finishes, clear the slot to enable it to take
+	 * on more resend work.
+	 */
+	container->msc_resenders[my_slot] = NULL;
+	lnet_net_unlock(cpt);
+
+	return 0;
+}
+
+/*
+ * Do a health check on the message:
+ * return -1 if we're not going to handle the error or
+ * if we've reached the maximum number of retries.
+ * The success case will return -1 as well.
+ * return 0 if the message is requeued for resend
+ */
+static int
+lnet_health_check(struct lnet_msg *msg)
+{
+	enum lnet_msg_hstatus hstatus = msg->msg_health_status;
+	struct lnet_peer_ni *lpni;
+	struct lnet_ni *ni;
+	bool lo = false;
+	bool attempt_local_resend;
+	bool attempt_remote_resend;
+	bool handle_local_health;
+	bool handle_remote_health;
+
+	/* if we're shutting down there is no point in handling health */
+	if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING)
+		return -1;
+
+	LASSERT(msg->msg_tx_committed || msg->msg_rx_committed);
+
+	/*
+	 * if we're sending to the LOLND then the msg_txpeer will not be
+	 * set. So no need to sanity check it.
+	 */
+	if (msg->msg_tx_committed &&
+	    !nid_is_lo0(&msg->msg_txni->ni_nid))
+		LASSERT(msg->msg_txpeer);
+	else if (msg->msg_tx_committed &&
+		 nid_is_lo0(&msg->msg_txni->ni_nid))
+		lo = true;
+
+	if (hstatus != LNET_MSG_STATUS_OK &&
+	    ktime_compare(ktime_get(), msg->msg_deadline) >= 0)
+		return -1;
+
+	/*
+	 * always prefer txni/txpeer if the message is committed for both
+	 * directions.
+ */ + if (msg->msg_tx_committed) { + ni = msg->msg_txni; + lpni = msg->msg_txpeer; + attempt_local_resend = attempt_remote_resend = true; + } else { + ni = msg->msg_rxni; + lpni = msg->msg_rxpeer; + attempt_local_resend = attempt_remote_resend = false; + } + + if (!lo) + LASSERT(ni && lpni); + else + LASSERT(ni); + + CDEBUG(D_NET, "health check: %s->%s: %s: %s\n", + libcfs_nidstr(&ni->ni_nid), + (lo) ? "self" : libcfs_nidstr(&lpni->lpni_nid), + lnet_msgtyp2str(msg->msg_type), + lnet_health_error2str(hstatus)); + + /* + * stats are only incremented for errors so avoid wasting time + * incrementing statistics if there is no error. Similarly, whether to + * update health values or perform resends is only applicable for + * messages with a health status != OK. + */ + if (hstatus != LNET_MSG_STATUS_OK) { + /* Don't further decrement the health value if a recovery + * message failed. + */ + if (msg->msg_recovery) + handle_local_health = handle_remote_health = false; + else + handle_local_health = handle_remote_health = true; + + /* For local failures, health/recovery/resends are not needed if + * I only have a single (non-lolnd) interface. NB: pb_nnis + * includes the lolnd interface, so a single-rail node would + * have pb_nnis == 2. + */ + if (the_lnet.ln_ping_target->pb_nnis <= 2) { + handle_local_health = false; + attempt_local_resend = false; + } + + lnet_net_lock(0); + lnet_incr_hstats(ni, lpni, hstatus); + /* For remote failures, health/recovery/resends are not needed + * if the peer only has a single interface. Special case for + * routers where we rely on health feature to manage route + * aliveness. NB: unlike pb_nnis above, lp_nnis does _not_ + * include the lolnd, so a single-rail node would have + * lp_nnis == 1. + */ + if (lpni && lpni->lpni_peer_net && + lpni->lpni_peer_net->lpn_peer && + lpni->lpni_peer_net->lpn_peer->lp_nnis <= 1) { + attempt_remote_resend = false; + if (!lnet_isrouter(lpni)) + handle_remote_health = false; + } + /* Do not put my interfaces into peer NI recovery. They should + * be handled with local NI recovery. + */ + if (handle_remote_health && lpni && + lnet_nid_to_ni_locked(&lpni->lpni_nid, 0)) + handle_remote_health = false; + lnet_net_unlock(0); + } + + switch (hstatus) { + case LNET_MSG_STATUS_OK: + /* + * increment the local ni health whether we successfully + * received or sent a message on it. + * + * Ping counts are reset to 0 as appropriate to allow for + * faster recovery. + */ + lnet_inc_healthv(&ni->ni_healthv, lnet_health_sensitivity); + /* + * It's possible msg_txpeer is NULL in the LOLND + * case. Only increment the peer's health if we're + * receiving a message from it. It's the only sure way to + * know that a remote interface is up. + * If this interface is part of a router, then take that + * as indication that the router is fully healthy. + */ + if (lpni && msg->msg_rx_committed) { + lnet_net_lock(0); + lpni->lpni_ping_count = 0; + ni->ni_ping_count = 0; + /* + * If we're receiving a message from the router or + * I'm a router, then set that lpni's health to + * maximum so we can commence communication + */ + if (lnet_isrouter(lpni) || the_lnet.ln_routing) { + lnet_set_lpni_healthv_locked(lpni, + LNET_MAX_HEALTH_VALUE); + } else { + __u32 sensitivity = lpni->lpni_peer_net-> + lpn_peer->lp_health_sensitivity; + + lnet_inc_lpni_healthv_locked(lpni, + (sensitivity) ? sensitivity : + lnet_health_sensitivity); + /* This peer NI may have previously aged out + * of recovery. 
Now that we've received a + * message from it, we can continue recovery + * if its health value is still below the + * maximum. + */ + lnet_peer_ni_add_to_recoveryq_locked(lpni, + &the_lnet.ln_mt_peerNIRecovq, + ktime_get_seconds()); + } + lnet_net_unlock(0); + } + + /* we can finalize this message */ + return -1; + case LNET_MSG_STATUS_LOCAL_INTERRUPT: + case LNET_MSG_STATUS_LOCAL_DROPPED: + case LNET_MSG_STATUS_LOCAL_ABORTED: + case LNET_MSG_STATUS_LOCAL_NO_ROUTE: + case LNET_MSG_STATUS_LOCAL_TIMEOUT: + if (handle_local_health) + lnet_handle_local_failure(ni); + if (attempt_local_resend) + return lnet_attempt_msg_resend(msg); + break; + case LNET_MSG_STATUS_LOCAL_ERROR: + if (handle_local_health) + lnet_handle_local_failure(ni); + return -1; + case LNET_MSG_STATUS_REMOTE_DROPPED: + if (handle_remote_health) + lnet_handle_remote_failure(lpni); + if (attempt_remote_resend) + return lnet_attempt_msg_resend(msg); + break; + case LNET_MSG_STATUS_REMOTE_ERROR: + case LNET_MSG_STATUS_REMOTE_TIMEOUT: + if (handle_remote_health) + lnet_handle_remote_failure(lpni); + return -1; + case LNET_MSG_STATUS_NETWORK_TIMEOUT: + if (handle_remote_health) + lnet_handle_remote_failure(lpni); + if (handle_local_health) + lnet_handle_local_failure(ni); + return -1; + default: + LBUG(); + } + + /* no resend is needed */ + return -1; +} + +static void +lnet_msg_detach_md(struct lnet_msg *msg, int status) +{ + struct lnet_libmd *md = msg->msg_md; + lnet_handler_t handler = NULL; + int cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie); + int unlink; + + lnet_res_lock(cpt); + while (md->md_flags & LNET_MD_FLAG_HANDLING) + /* An event handler is running - wait for it to + * complete to avoid races. + */ + lnet_md_wait_handling(md, cpt); + + /* Now it's safe to drop my caller's ref */ + md->md_refcount--; + LASSERT(md->md_refcount >= 0); + + unlink = lnet_md_unlinkable(md); + if (md->md_handler) { + if ((md->md_flags & LNET_MD_FLAG_ABORTED) && !status) { + msg->msg_ev.status = -ETIMEDOUT; + CDEBUG(D_NET, "md 0x%p already unlinked\n", md); + } else { + msg->msg_ev.status = status; + } + msg->msg_ev.unlinked = unlink; + handler = md->md_handler; + if (!unlink) + md->md_flags |= LNET_MD_FLAG_HANDLING; + } + + if (unlink || (md->md_refcount == 0 && + md->md_threshold == LNET_MD_THRESH_INF)) + lnet_detach_rsp_tracker(md, cpt); + + msg->msg_md = NULL; + if (unlink) + lnet_md_unlink(md); + + lnet_res_unlock(cpt); + + if (handler) { + handler(&msg->msg_ev); + if (!unlink) { + lnet_res_lock(cpt); + md->md_flags &= ~LNET_MD_FLAG_HANDLING; + wake_up_var(md); + lnet_res_unlock(cpt); + } + } +} + +static bool +lnet_is_health_check(struct lnet_msg *msg) +{ + bool hc = true; + int status = msg->msg_ev.status; + + if ((!msg->msg_tx_committed && !msg->msg_rx_committed) || + !msg->msg_onactivelist) { + CDEBUG(D_NET, "msg %p not committed for send or receive\n", + msg); + return false; + } + + if ((msg->msg_tx_committed && !msg->msg_txpeer) || + (msg->msg_rx_committed && !msg->msg_rxpeer)) { + /* The optimized GET case does not set msg_rxpeer, but status + * could be zero. Only print the error message if we have a + * non-zero status. 
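+		 * In either case the message cannot be resent, so health
+		 * handling is skipped and the message is simply finalized.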
+		 */
+		if (status)
+			CDEBUG(D_NET, "msg %p status %d cannot retry\n", msg,
+			       status);
+		return false;
+	}
+
+	/* Check for status inconsistencies */
+	if ((!status && msg->msg_health_status != LNET_MSG_STATUS_OK) ||
+	    (status && msg->msg_health_status == LNET_MSG_STATUS_OK)) {
+		CDEBUG(D_NET,
+		       "Msg %p is in inconsistent state, don't perform health checking (%d, %d)\n",
+		       msg, status, msg->msg_health_status);
+		hc = false;
+	}
+
+	CDEBUG(D_NET, "health check = %d, status = %d, hstatus = %d\n",
+	       hc, status, msg->msg_health_status);
+
+	return hc;
+}
+
+char *
+lnet_health_error2str(enum lnet_msg_hstatus hstatus)
+{
+	switch (hstatus) {
+	case LNET_MSG_STATUS_LOCAL_INTERRUPT:
+		return "LOCAL_INTERRUPT";
+	case LNET_MSG_STATUS_LOCAL_DROPPED:
+		return "LOCAL_DROPPED";
+	case LNET_MSG_STATUS_LOCAL_ABORTED:
+		return "LOCAL_ABORTED";
+	case LNET_MSG_STATUS_LOCAL_NO_ROUTE:
+		return "LOCAL_NO_ROUTE";
+	case LNET_MSG_STATUS_LOCAL_TIMEOUT:
+		return "LOCAL_TIMEOUT";
+	case LNET_MSG_STATUS_LOCAL_ERROR:
+		return "LOCAL_ERROR";
+	case LNET_MSG_STATUS_REMOTE_DROPPED:
+		return "REMOTE_DROPPED";
+	case LNET_MSG_STATUS_REMOTE_ERROR:
+		return "REMOTE_ERROR";
+	case LNET_MSG_STATUS_REMOTE_TIMEOUT:
+		return "REMOTE_TIMEOUT";
+	case LNET_MSG_STATUS_NETWORK_TIMEOUT:
+		return "NETWORK_TIMEOUT";
+	case LNET_MSG_STATUS_OK:
+		return "OK";
+	default:
+		return "";
+	}
+}
+
+bool
+lnet_send_error_simulation(struct lnet_msg *msg,
+			   enum lnet_msg_hstatus *hstatus)
+{
+	if (!msg)
+		return false;
+
+	if (list_empty(&the_lnet.ln_drop_rules))
+		return false;
+
+	/* match only health rules */
+	if (!lnet_drop_rule_match(&msg->msg_hdr, LNET_NID_ANY,
+				  hstatus))
+		return false;
+
+	CDEBUG(D_NET, "src %s(%s)->dst %s: %s simulate health error: %s\n",
+	       libcfs_nidstr(&msg->msg_hdr.src_nid),
+	       libcfs_nidstr(&msg->msg_txni->ni_nid),
+	       libcfs_nidstr(&msg->msg_hdr.dest_nid),
+	       lnet_msgtyp2str(msg->msg_type),
+	       lnet_health_error2str(*hstatus));
+
+	return true;
+}
+EXPORT_SYMBOL(lnet_send_error_simulation);
+
+void
+lnet_finalize(struct lnet_msg *msg, int status)
+{
+	struct lnet_msg_container *container;
+	int my_slot;
+	int cpt;
+	int rc;
+
+	LASSERT(!in_interrupt());
+
+	if (msg == NULL)
+		return;
+
+	msg->msg_ev.status = status;
+
+	if (lnet_is_health_check(msg)) {
+		/*
+		 * Check the health status of the message. If it has one
+		 * of the errors that we're supposed to handle, and it has
+		 * not timed out, then
+		 *	1. Decrement the appropriate health_value
+		 *	2. queue the message on the resend queue
+		 *
+		 * If the message was sent successfully, timed out, or
+		 * failed the health check for any reason, then we'll just
+		 * finalize the message. Otherwise just return, since the
+		 * message has been put on the resend queue.
+		 */
+		if (!lnet_health_check(msg))
+			return;
+	}
+
+	/*
+	 * We're not going to resend this message so detach its MD and invoke
+	 * the appropriate callbacks
+	 */
+	if (msg->msg_md != NULL)
+		lnet_msg_detach_md(msg, status);
+
+again:
+	if (!msg->msg_tx_committed && !msg->msg_rx_committed) {
+		/* not committed to network yet */
+		LASSERT(!msg->msg_onactivelist);
+		lnet_msg_free(msg);
+		return;
+	}
+
+	/*
+	 * NB: routed message can be committed for both receiving and sending,
+	 * we should finalize in LIFO order and keep counters correct.
+	 * (finalize sending first then finalize receiving)
+	 */
+	cpt = msg->msg_tx_committed ? msg->msg_tx_cpt : msg->msg_rx_cpt;
+	lnet_net_lock(cpt);
+
+	container = the_lnet.ln_msg_containers[cpt];
+
+	/* Recursion breaker.
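+	 * (Completing one message can release credits and trigger further
+	 * sends, whose completion would recurse back into lnet_finalize().)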
+	 * Don't complete the message here if I am (or
+	 * enough other threads are) already completing messages */
+	my_slot = lnet_check_finalize_recursion_locked(msg,
+						&container->msc_finalizing,
+						container->msc_nfinalizers,
+						container->msc_finalizers);
+
+	/* enough threads are finalizing */
+	if (my_slot == -1) {
+		lnet_net_unlock(cpt);
+		return;
+	}
+
+	rc = 0;
+	while (!list_empty(&container->msc_finalizing)) {
+		msg = list_entry(container->msc_finalizing.next,
+				 struct lnet_msg, msg_list);
+
+		list_del_init(&msg->msg_list);
+
+		/* NB drops and regains the lnet lock if it actually does
+		 * anything, so my finalizing friends can chomp along too */
+		rc = lnet_complete_msg_locked(msg, cpt);
+		if (rc != 0)
+			break;
+	}
+
+	if (unlikely(!list_empty(&the_lnet.ln_delay_rules))) {
+		lnet_net_unlock(cpt);
+		lnet_delay_rule_check();
+		lnet_net_lock(cpt);
+	}
+
+	container->msc_finalizers[my_slot] = NULL;
+	lnet_net_unlock(cpt);
+
+	if (rc != 0)
+		goto again;
+}
+EXPORT_SYMBOL(lnet_finalize);
+
+void
+lnet_msg_container_cleanup(struct lnet_msg_container *container)
+{
+	int count = 0;
+
+	if (container->msc_init == 0)
+		return;
+
+	while (!list_empty(&container->msc_active)) {
+		struct lnet_msg *msg;
+
+		msg = list_entry(container->msc_active.next,
+				 struct lnet_msg, msg_activelist);
+		LASSERT(msg->msg_onactivelist);
+		msg->msg_onactivelist = 0;
+		list_del_init(&msg->msg_activelist);
+		lnet_msg_free(msg);
+		count++;
+	}
+
+	if (count > 0)
+		CERROR("%d active msgs on exit\n", count);
+
+	if (container->msc_finalizers != NULL) {
+		CFS_FREE_PTR_ARRAY(container->msc_finalizers,
+				   container->msc_nfinalizers);
+		container->msc_finalizers = NULL;
+	}
+
+	if (container->msc_resenders != NULL) {
+		CFS_FREE_PTR_ARRAY(container->msc_resenders,
+				   container->msc_nfinalizers);
+		container->msc_resenders = NULL;
+	}
+	container->msc_init = 0;
+}
+
+int
+lnet_msg_container_setup(struct lnet_msg_container *container, int cpt)
+{
+	int rc = 0;
+
+	container->msc_init = 1;
+
+	INIT_LIST_HEAD(&container->msc_active);
+	INIT_LIST_HEAD(&container->msc_finalizing);
+	INIT_LIST_HEAD(&container->msc_resending);
+
+	/* number of CPUs */
+	container->msc_nfinalizers = cfs_cpt_weight(lnet_cpt_table(), cpt);
+	if (container->msc_nfinalizers == 0)
+		container->msc_nfinalizers = 1;
+
+	LIBCFS_CPT_ALLOC(container->msc_finalizers, lnet_cpt_table(), cpt,
+			 container->msc_nfinalizers *
+			 sizeof(*container->msc_finalizers));
+
+	if (container->msc_finalizers == NULL) {
+		CERROR("Failed to allocate message finalizers\n");
+		lnet_msg_container_cleanup(container);
+		return -ENOMEM;
+	}
+
+	LIBCFS_CPT_ALLOC(container->msc_resenders, lnet_cpt_table(), cpt,
+			 container->msc_nfinalizers *
+			 sizeof(*container->msc_resenders));
+
+	if (container->msc_resenders == NULL) {
+		CERROR("Failed to allocate message resenders\n");
+		lnet_msg_container_cleanup(container);
+		return -ENOMEM;
+	}
+
+	return rc;
+}
+
+void
+lnet_msg_containers_destroy(void)
+{
+	struct lnet_msg_container *container;
+	int i;
+
+	if (the_lnet.ln_msg_containers == NULL)
+		return;
+
+	cfs_percpt_for_each(container, i, the_lnet.ln_msg_containers)
+		lnet_msg_container_cleanup(container);
+
+	cfs_percpt_free(the_lnet.ln_msg_containers);
+	the_lnet.ln_msg_containers = NULL;
+}
+
+int
+lnet_msg_containers_create(void)
+{
+	struct lnet_msg_container *container;
+	int rc;
+	int i;
+
+	the_lnet.ln_msg_containers = cfs_percpt_alloc(lnet_cpt_table(),
+						      sizeof(*container));
+
+	if (the_lnet.ln_msg_containers == NULL) {
+		CERROR("Failed to allocate cpu-partition data for network\n");
return -ENOMEM; + } + + cfs_percpt_for_each(container, i, the_lnet.ln_msg_containers) { + rc = lnet_msg_container_setup(container, i); + if (rc != 0) { + lnet_msg_containers_destroy(); + return rc; + } + } + + return 0; +} diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-ptl.c b/drivers/staging/lustrefsx/lnet/lnet/lib-ptl.c new file mode 100644 index 0000000000000..cbe7a30eb50bd --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-ptl.c @@ -0,0 +1,991 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/lnet/lib-ptl.c + * + * portal & match routines + * + * Author: liang@whamcloud.com + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +/* NB: add /proc interfaces in upcoming patches */ +int portal_rotor = LNET_PTL_ROTOR_HASH_RT; +module_param(portal_rotor, int, 0644); +MODULE_PARM_DESC(portal_rotor, "redirect PUTs to different cpu-partitions"); + +static int +lnet_ptl_match_type(unsigned int index, struct lnet_processid *match_id, + __u64 mbits, __u64 ignore_bits) +{ + struct lnet_portal *ptl = the_lnet.ln_portals[index]; + int unique; + + unique = (ignore_bits == 0 && + !LNET_NID_IS_ANY(&match_id->nid) && + match_id->pid != LNET_PID_ANY); + + LASSERT(!lnet_ptl_is_unique(ptl) || !lnet_ptl_is_wildcard(ptl)); + + /* prefer to check w/o any lock */ + if (likely(lnet_ptl_is_unique(ptl) || lnet_ptl_is_wildcard(ptl))) + goto match; + + /* unset, new portal */ + lnet_ptl_lock(ptl); + /* check again with lock */ + if (unlikely(lnet_ptl_is_unique(ptl) || lnet_ptl_is_wildcard(ptl))) { + lnet_ptl_unlock(ptl); + goto match; + } + + /* still not set */ + if (unique) + lnet_ptl_setopt(ptl, LNET_PTL_MATCH_UNIQUE); + else + lnet_ptl_setopt(ptl, LNET_PTL_MATCH_WILDCARD); + + lnet_ptl_unlock(ptl); + + return 1; + + match: + if ((lnet_ptl_is_unique(ptl) && !unique) || + (lnet_ptl_is_wildcard(ptl) && unique)) + return 0; + return 1; +} + +static void +lnet_ptl_enable_mt(struct lnet_portal *ptl, int cpt) +{ + struct lnet_match_table *mtable = ptl->ptl_mtables[cpt]; + int i; + + /* with hold of both lnet_res_lock(cpt) and lnet_ptl_lock */ + LASSERT(lnet_ptl_is_wildcard(ptl)); + + mtable->mt_enabled = 1; + + ptl->ptl_mt_maps[ptl->ptl_mt_nmaps] = cpt; + for (i = ptl->ptl_mt_nmaps - 1; i >= 0; i--) { + LASSERT(ptl->ptl_mt_maps[i] != cpt); + if (ptl->ptl_mt_maps[i] < cpt) + break; + + /* swap to order */ + ptl->ptl_mt_maps[i + 1] = ptl->ptl_mt_maps[i]; + ptl->ptl_mt_maps[i] = cpt; + } + + ptl->ptl_mt_nmaps++; +} + +static void +lnet_ptl_disable_mt(struct lnet_portal *ptl, int cpt) +{ + struct lnet_match_table *mtable = ptl->ptl_mtables[cpt]; + int i; 
+ + /* with hold of both lnet_res_lock(cpt) and lnet_ptl_lock */ + LASSERT(lnet_ptl_is_wildcard(ptl)); + + if (LNET_CPT_NUMBER == 1) + return; /* never disable the only match-table */ + + mtable->mt_enabled = 0; + + LASSERT(ptl->ptl_mt_nmaps > 0 && + ptl->ptl_mt_nmaps <= LNET_CPT_NUMBER); + + /* remove it from mt_maps */ + ptl->ptl_mt_nmaps--; + for (i = 0; i < ptl->ptl_mt_nmaps; i++) { + if (ptl->ptl_mt_maps[i] >= cpt) /* overwrite it */ + ptl->ptl_mt_maps[i] = ptl->ptl_mt_maps[i + 1]; + } +} + +static int +lnet_try_match_md(struct lnet_libmd *md, + struct lnet_match_info *info, struct lnet_msg *msg) +{ + /* ALWAYS called holding the lnet_res_lock, and can't lnet_res_unlock; + * lnet_match_blocked_msg() relies on this to avoid races */ + unsigned int offset; + unsigned int mlength; + struct lnet_me *me = md->md_me; + + /* MD exhausted */ + if (lnet_md_exhausted(md)) + return LNET_MATCHMD_NONE | LNET_MATCHMD_EXHAUSTED; + + /* mismatched MD op */ + if ((md->md_options & info->mi_opc) == 0) + return LNET_MATCHMD_NONE; + + /* mismatched ME nid/pid? */ + if (!LNET_NID_IS_ANY(&me->me_match_id.nid) && + !nid_same(&me->me_match_id.nid, &info->mi_id.nid)) + return LNET_MATCHMD_NONE; + + if (me->me_match_id.pid != LNET_PID_ANY && + me->me_match_id.pid != info->mi_id.pid) + return LNET_MATCHMD_NONE; + + /* mismatched ME matchbits? */ + if (((me->me_match_bits ^ info->mi_mbits) & ~me->me_ignore_bits) != 0) + return LNET_MATCHMD_NONE; + + /* Hurrah! This _is_ a match; check it out... */ + + if ((md->md_options & LNET_MD_MANAGE_REMOTE) == 0) + offset = md->md_offset; + else + offset = info->mi_roffset; + + if ((md->md_options & LNET_MD_MAX_SIZE) != 0) { + mlength = md->md_max_size; + LASSERT(md->md_offset + mlength <= md->md_length); + } else { + mlength = md->md_length - offset; + } + + if (info->mi_rlength <= mlength) { /* fits in allowed space */ + mlength = info->mi_rlength; + } else if ((md->md_options & LNET_MD_TRUNCATE) == 0) { + /* this packet _really_ is too big */ + CERROR("Matching packet from %s, match %llu" + " length %d too big: %d left, %d allowed\n", + libcfs_idstr(&info->mi_id), info->mi_mbits, + info->mi_rlength, md->md_length - offset, mlength); + + return LNET_MATCHMD_DROP; + } + + /* Commit to this ME/MD */ + CDEBUG(D_NET, "Incoming %s index %x from %s of " + "length %d/%d into md %#llx [%d] + %d\n", + (info->mi_opc == LNET_MD_OP_PUT) ? "put" : "get", + info->mi_portal, libcfs_idstr(&info->mi_id), mlength, + info->mi_rlength, md->md_lh.lh_cookie, md->md_niov, offset); + + lnet_msg_attach_md(msg, md, offset, mlength); + md->md_offset = offset + mlength; + + if (!lnet_md_exhausted(md)) + return LNET_MATCHMD_OK; + + /* Auto-unlink NOW, so the ME gets unlinked if required. + * We bumped md->md_refcount above so the MD just gets flagged + * for unlink when it is finalized. */ + if ((md->md_flags & LNET_MD_FLAG_AUTO_UNLINK) != 0) + lnet_md_unlink(md); + + return LNET_MATCHMD_OK | LNET_MATCHMD_EXHAUSTED; +} + +static struct lnet_match_table * +lnet_match2mt(struct lnet_portal *ptl, struct lnet_processid *id, __u64 mbits) +{ + if (LNET_CPT_NUMBER == 1) + return ptl->ptl_mtables[0]; /* the only one */ + + /* if it's a unique portal, return match-table hashed by NID */ + return lnet_ptl_is_unique(ptl) ? 
+ ptl->ptl_mtables[lnet_nid2cpt(&id->nid, NULL)] : NULL; +} + +struct lnet_match_table * +lnet_mt_of_attach(unsigned int index, struct lnet_processid *id, + __u64 mbits, __u64 ignore_bits, enum lnet_ins_pos pos) +{ + struct lnet_portal *ptl; + struct lnet_match_table *mtable; + + /* NB: called w/o lock */ + LASSERT(index < the_lnet.ln_nportals); + + if (!lnet_ptl_match_type(index, id, mbits, ignore_bits)) + return NULL; + + ptl = the_lnet.ln_portals[index]; + + mtable = lnet_match2mt(ptl, id, mbits); + if (mtable != NULL) /* unique portal or only one match-table */ + return mtable; + + /* it's a wildcard portal */ + switch (pos) { + default: + return NULL; + case LNET_INS_BEFORE: + case LNET_INS_AFTER: + /* posted by no affinity thread, always hash to specific + * match-table to avoid buffer stealing which is heavy */ + return ptl->ptl_mtables[ptl->ptl_index % LNET_CPT_NUMBER]; + case LNET_INS_LOCAL: + /* posted by cpu-affinity thread */ + return ptl->ptl_mtables[lnet_cpt_current()]; + } +} + +static struct lnet_match_table * +lnet_mt_of_match(struct lnet_match_info *info, struct lnet_msg *msg) +{ + struct lnet_match_table *mtable; + struct lnet_portal *ptl; + unsigned int nmaps; + unsigned int rotor; + unsigned int cpt; + bool routed; + + /* NB: called w/o lock */ + LASSERT(info->mi_portal < the_lnet.ln_nportals); + ptl = the_lnet.ln_portals[info->mi_portal]; + + LASSERT(lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl)); + + mtable = lnet_match2mt(ptl, &info->mi_id, info->mi_mbits); + if (mtable != NULL) + return mtable; + + /* it's a wildcard portal */ + routed = LNET_NID_NET(&msg->msg_hdr.src_nid) != + LNET_NID_NET(&msg->msg_hdr.dest_nid); + + if (portal_rotor == LNET_PTL_ROTOR_OFF || + (portal_rotor != LNET_PTL_ROTOR_ON && !routed)) { + cpt = lnet_cpt_current(); + if (ptl->ptl_mtables[cpt]->mt_enabled) + return ptl->ptl_mtables[cpt]; + } + + rotor = ptl->ptl_rotor++; /* get round-robin factor */ + if (portal_rotor == LNET_PTL_ROTOR_HASH_RT && routed) + cpt = info->mi_cpt; + else + cpt = rotor % LNET_CPT_NUMBER; + + if (!ptl->ptl_mtables[cpt]->mt_enabled) { + /* is there any active entry for this portal? 
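+		 * If so, remap onto one of them (recorded in ptl_mt_maps)
+		 * rather than using this disabled table.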
*/ + nmaps = ptl->ptl_mt_nmaps; + /* map to an active mtable to avoid heavy "stealing" */ + if (nmaps != 0) { + /* NB: there is possibility that ptl_mt_maps is being + * changed because we are not under protection of + * lnet_ptl_lock, but it shouldn't hurt anything */ + cpt = ptl->ptl_mt_maps[rotor % nmaps]; + } + } + + return ptl->ptl_mtables[cpt]; +} + +static int +lnet_mt_test_exhausted(struct lnet_match_table *mtable, int pos) +{ + __u64 *bmap; + int i; + + if (!lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal])) + return 0; + + if (pos < 0) { /* check all bits */ + for (i = 0; i < LNET_MT_EXHAUSTED_BMAP; i++) { + if (mtable->mt_exhausted[i] != (__u64)(-1)) + return 0; + } + return 1; + } + + LASSERT(pos <= LNET_MT_HASH_IGNORE); + /* mtable::mt_mhash[pos] is marked as exhausted or not */ + bmap = &mtable->mt_exhausted[pos >> LNET_MT_BITS_U64]; + pos &= (1 << LNET_MT_BITS_U64) - 1; + + return ((*bmap) & (1ULL << pos)) != 0; +} + +static void +lnet_mt_set_exhausted(struct lnet_match_table *mtable, int pos, int exhausted) +{ + __u64 *bmap; + + LASSERT(lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal])); + LASSERT(pos <= LNET_MT_HASH_IGNORE); + + /* set mtable::mt_mhash[pos] as exhausted/non-exhausted */ + bmap = &mtable->mt_exhausted[pos >> LNET_MT_BITS_U64]; + pos &= (1 << LNET_MT_BITS_U64) - 1; + + if (!exhausted) + *bmap &= ~(1ULL << pos); + else + *bmap |= 1ULL << pos; +} + +struct list_head * +lnet_mt_match_head(struct lnet_match_table *mtable, + struct lnet_processid *id, __u64 mbits) +{ + struct lnet_portal *ptl = the_lnet.ln_portals[mtable->mt_portal]; + + if (lnet_ptl_is_wildcard(ptl)) { + return &mtable->mt_mhash[mbits & LNET_MT_HASH_MASK]; + } else { + unsigned long hash = mbits + nidhash(&id->nid) + id->pid; + + LASSERT(lnet_ptl_is_unique(ptl)); + hash = hash_long(hash, LNET_MT_HASH_BITS); + return &mtable->mt_mhash[hash & LNET_MT_HASH_MASK]; + } +} + +int +lnet_mt_match_md(struct lnet_match_table *mtable, + struct lnet_match_info *info, struct lnet_msg *msg) +{ + struct list_head *head; + struct lnet_me *me; + struct lnet_me *tmp; + int exhausted = 0; + int rc; + + /* any ME with ignore bits? 
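+	 * (MEs posted with non-zero ignore bits all live in the extra
+	 * LNET_MT_HASH_IGNORE slot, so that list is checked first.)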
*/ + if (!list_empty(&mtable->mt_mhash[LNET_MT_HASH_IGNORE])) + head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE]; + else + head = lnet_mt_match_head(mtable, &info->mi_id, + info->mi_mbits); + again: + /* NB: only wildcard portal needs to return LNET_MATCHMD_EXHAUSTED */ + if (lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal])) + exhausted = LNET_MATCHMD_EXHAUSTED; + + list_for_each_entry_safe(me, tmp, head, me_list) { + /* ME attached but MD not attached yet */ + if (me->me_md == NULL) + continue; + + LASSERT(me == me->me_md->md_me); + + rc = lnet_try_match_md(me->me_md, info, msg); + if ((rc & LNET_MATCHMD_EXHAUSTED) == 0) + exhausted = 0; /* mlist is not empty */ + + if ((rc & LNET_MATCHMD_FINISH) != 0) { + /* don't return EXHAUSTED bit because we don't know + * whether the mlist is empty or not */ + return rc & ~LNET_MATCHMD_EXHAUSTED; + } + } + + if (exhausted == LNET_MATCHMD_EXHAUSTED) { /* @head is exhausted */ + lnet_mt_set_exhausted(mtable, head - mtable->mt_mhash, 1); + if (!lnet_mt_test_exhausted(mtable, -1)) + exhausted = 0; + } + + if (exhausted == 0 && head == &mtable->mt_mhash[LNET_MT_HASH_IGNORE]) { + head = lnet_mt_match_head(mtable, &info->mi_id, + info->mi_mbits); + goto again; /* re-check MEs w/o ignore-bits */ + } + + if (info->mi_opc == LNET_MD_OP_GET || + !lnet_ptl_is_lazy(the_lnet.ln_portals[info->mi_portal])) + return LNET_MATCHMD_DROP | exhausted; + + return LNET_MATCHMD_NONE | exhausted; +} + +static int +lnet_ptl_match_early(struct lnet_portal *ptl, struct lnet_msg *msg) +{ + int rc; + + /* message arrived before any buffer posting on this portal, + * simply delay or drop this message */ + if (likely(lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl))) + return 0; + + lnet_ptl_lock(ptl); + /* check it again with hold of lock */ + if (lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl)) { + lnet_ptl_unlock(ptl); + return 0; + } + + if (lnet_ptl_is_lazy(ptl)) { + if (msg->msg_rx_ready_delay) { + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, + &ptl->ptl_msg_delayed); + } + rc = LNET_MATCHMD_NONE; + } else { + rc = LNET_MATCHMD_DROP; + } + + lnet_ptl_unlock(ptl); + return rc; +} + +static int +lnet_ptl_match_delay(struct lnet_portal *ptl, + struct lnet_match_info *info, struct lnet_msg *msg) +{ + int first = ptl->ptl_mt_maps[0]; /* read w/o lock */ + int rc = 0; + int i; + + /* + * Steal buffer from other CPTs, and delay msg if nothing to + * steal. This function is more expensive than a regular + * match, but we don't expect it can happen a lot. The return + * code contains one of LNET_MATCHMD_OK, LNET_MATCHMD_DROP, or + * LNET_MATCHMD_NONE. + */ + LASSERT(lnet_ptl_is_wildcard(ptl)); + + for (i = 0; i < LNET_CPT_NUMBER; i++) { + struct lnet_match_table *mtable; + int cpt; + + cpt = (first + i) % LNET_CPT_NUMBER; + mtable = ptl->ptl_mtables[cpt]; + if (i != 0 && i != LNET_CPT_NUMBER - 1 && !mtable->mt_enabled) + continue; + + lnet_res_lock(cpt); + lnet_ptl_lock(ptl); + + if (i == 0) { + /* The first try, add to stealing list. */ + list_add_tail(&msg->msg_list, + &ptl->ptl_msg_stealing); + } + + if (!list_empty(&msg->msg_list)) { + /* On stealing list. */ + rc = lnet_mt_match_md(mtable, info, msg); + + if ((rc & LNET_MATCHMD_EXHAUSTED) != 0 && + mtable->mt_enabled) + lnet_ptl_disable_mt(ptl, cpt); + + if ((rc & LNET_MATCHMD_FINISH) != 0) { + /* Match found, remove from stealing list. 
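+				 * The MD matched on this CPT is now
+				 * committed to the message.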
*/ + list_del_init(&msg->msg_list); + } else if (i == LNET_CPT_NUMBER - 1 || /* (1) */ + ptl->ptl_mt_nmaps == 0 || /* (2) */ + (ptl->ptl_mt_nmaps == 1 && /* (3) */ + ptl->ptl_mt_maps[0] == cpt)) { + /* + * No match found, and this is either + * (1) the last cpt to check, or + * (2) there is no active cpt, or + * (3) this is the only active cpt. + * There is nothing to steal: delay or + * drop the message. + */ + list_del_init(&msg->msg_list); + + if (lnet_ptl_is_lazy(ptl)) { + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, + &ptl->ptl_msg_delayed); + rc = LNET_MATCHMD_NONE; + } else { + rc = LNET_MATCHMD_DROP; + } + } else { + /* Do another iteration. */ + rc = 0; + } + } else { + /* + * No longer on stealing list: another thread + * matched the message in lnet_ptl_attach_md(). + * We are now expected to handle the message. + */ + rc = msg->msg_md == NULL ? + LNET_MATCHMD_DROP : LNET_MATCHMD_OK; + } + + lnet_ptl_unlock(ptl); + lnet_res_unlock(cpt); + + /* + * Note that test (1) above ensures that we always + * exit the loop through this break statement. + * + * LNET_MATCHMD_NONE means msg was added to the + * delayed queue, and we may no longer reference it + * after lnet_ptl_unlock() and lnet_res_unlock(). + */ + if (rc & (LNET_MATCHMD_FINISH | LNET_MATCHMD_NONE)) + break; + } + + return rc; +} + +int +lnet_ptl_match_md(struct lnet_match_info *info, struct lnet_msg *msg) +{ + struct lnet_match_table *mtable; + struct lnet_portal *ptl; + int rc; + + CDEBUG(D_NET, + "Request from %s of length %d into portal %d MB=%#llx\n", + libcfs_idstr(&info->mi_id), + info->mi_rlength, info->mi_portal, info->mi_mbits); + + if (info->mi_portal >= the_lnet.ln_nportals) { + CERROR("Invalid portal %d not in [0-%d]\n", + info->mi_portal, the_lnet.ln_nportals); + return LNET_MATCHMD_DROP; + } + + ptl = the_lnet.ln_portals[info->mi_portal]; + rc = lnet_ptl_match_early(ptl, msg); + if (rc != 0) /* matched or delayed early message */ + return rc; + + mtable = lnet_mt_of_match(info, msg); + lnet_res_lock(mtable->mt_cpt); + + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + rc = LNET_MATCHMD_DROP; + goto out1; + } + + rc = lnet_mt_match_md(mtable, info, msg); + if ((rc & LNET_MATCHMD_EXHAUSTED) != 0 && mtable->mt_enabled) { + lnet_ptl_lock(ptl); + lnet_ptl_disable_mt(ptl, mtable->mt_cpt); + lnet_ptl_unlock(ptl); + } + + if ((rc & LNET_MATCHMD_FINISH) != 0) /* matched or dropping */ + goto out1; + + if (!msg->msg_rx_ready_delay) + goto out1; + + LASSERT(lnet_ptl_is_lazy(ptl)); + LASSERT(!msg->msg_rx_delayed); + + /* NB: we don't expect "delay" can happen a lot */ + if (lnet_ptl_is_unique(ptl) || LNET_CPT_NUMBER == 1) { + lnet_ptl_lock(ptl); + + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, &ptl->ptl_msg_delayed); + + lnet_ptl_unlock(ptl); + lnet_res_unlock(mtable->mt_cpt); + rc = LNET_MATCHMD_NONE; + } else { + lnet_res_unlock(mtable->mt_cpt); + rc = lnet_ptl_match_delay(ptl, info, msg); + } + + /* LNET_MATCHMD_NONE means msg was added to the delay queue */ + if (rc & LNET_MATCHMD_NONE) { + CDEBUG(D_NET, + "Delaying %s from %s ptl %d MB %#llx off %d len %d\n", + info->mi_opc == LNET_MD_OP_PUT ? 
"PUT" : "GET", + libcfs_idstr(&info->mi_id), info->mi_portal, + info->mi_mbits, info->mi_roffset, info->mi_rlength); + } + goto out0; + out1: + lnet_res_unlock(mtable->mt_cpt); + out0: + /* EXHAUSTED bit is only meaningful for internal functions */ + return rc & ~LNET_MATCHMD_EXHAUSTED; +} + +void +lnet_ptl_detach_md(struct lnet_me *me, struct lnet_libmd *md) +{ + LASSERT(me->me_md == md && md->md_me == me); + + me->me_md = NULL; + md->md_me = NULL; +} + +/* called with lnet_res_lock held */ +void +lnet_ptl_attach_md(struct lnet_me *me, struct lnet_libmd *md, + struct list_head *matches, struct list_head *drops) +{ + struct lnet_portal *ptl = the_lnet.ln_portals[me->me_portal]; + struct lnet_match_table *mtable; + struct list_head *head; + struct lnet_msg *tmp; + struct lnet_msg *msg; + int exhausted = 0; + int cpt; + + LASSERT(md->md_refcount == 0); /* a brand new MD */ + + me->me_md = md; + md->md_me = me; + + cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie); + mtable = ptl->ptl_mtables[cpt]; + + if (list_empty(&ptl->ptl_msg_stealing) && + list_empty(&ptl->ptl_msg_delayed) && + !lnet_mt_test_exhausted(mtable, me->me_pos)) + return; + + lnet_ptl_lock(ptl); + head = &ptl->ptl_msg_stealing; + again: + list_for_each_entry_safe(msg, tmp, head, msg_list) { + struct lnet_match_info info; + struct lnet_hdr *hdr; + int rc; + + LASSERT(msg->msg_rx_delayed || head == &ptl->ptl_msg_stealing); + + hdr = &msg->msg_hdr; + /* Multi-Rail: Primary peer NID */ + info.mi_id.nid = msg->msg_initiator; + info.mi_id.pid = hdr->src_pid; + info.mi_opc = LNET_MD_OP_PUT; + info.mi_portal = hdr->msg.put.ptl_index; + info.mi_rlength = hdr->payload_length; + info.mi_roffset = hdr->msg.put.offset; + info.mi_mbits = hdr->msg.put.match_bits; + + rc = lnet_try_match_md(md, &info, msg); + + exhausted = (rc & LNET_MATCHMD_EXHAUSTED) != 0; + if ((rc & LNET_MATCHMD_NONE) != 0) { + if (exhausted) + break; + continue; + } + + /* Hurrah! 
This _is_ a match */ + LASSERT((rc & LNET_MATCHMD_FINISH) != 0); + list_del_init(&msg->msg_list); + + if (head == &ptl->ptl_msg_stealing) { + if (exhausted) + break; + /* stealing thread will handle the message */ + continue; + } + + if ((rc & LNET_MATCHMD_OK) != 0) { + list_add_tail(&msg->msg_list, matches); + + CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d " + "match %llu offset %d length %d.\n", + libcfs_idstr(&info.mi_id), + info.mi_portal, info.mi_mbits, + info.mi_roffset, info.mi_rlength); + } else { + list_add_tail(&msg->msg_list, drops); + } + + if (exhausted) + break; + } + + if (!exhausted && head == &ptl->ptl_msg_stealing) { + head = &ptl->ptl_msg_delayed; + goto again; + } + + if (lnet_ptl_is_wildcard(ptl) && !exhausted) { + lnet_mt_set_exhausted(mtable, me->me_pos, 0); + if (!mtable->mt_enabled) + lnet_ptl_enable_mt(ptl, cpt); + } + + lnet_ptl_unlock(ptl); +} + +static void +lnet_ptl_cleanup(struct lnet_portal *ptl) +{ + struct lnet_match_table *mtable; + int i; + + if (ptl->ptl_mtables == NULL) /* uninitialized portal */ + return; + + LASSERT(list_empty(&ptl->ptl_msg_delayed)); + LASSERT(list_empty(&ptl->ptl_msg_stealing)); + cfs_percpt_for_each(mtable, i, ptl->ptl_mtables) { + struct list_head *mhash; + struct lnet_me *me; + int j; + + if (mtable->mt_mhash == NULL) /* uninitialized match-table */ + continue; + + mhash = mtable->mt_mhash; + /* cleanup ME */ + for (j = 0; j < LNET_MT_HASH_SIZE + 1; j++) { + while (!list_empty(&mhash[j])) { + me = list_entry(mhash[j].next, + struct lnet_me, me_list); + CERROR("Active ME %p on exit\n", me); + list_del(&me->me_list); + CDEBUG(D_MALLOC, + "slab-freed 'me' at %p in cleanup.\n", + me); + kmem_cache_free(lnet_mes_cachep, me); + } + } + /* the extra entry is for MEs with ignore bits */ + CFS_FREE_PTR_ARRAY(mhash, LNET_MT_HASH_SIZE + 1); + } + + cfs_percpt_free(ptl->ptl_mtables); + ptl->ptl_mtables = NULL; +} + +static int +lnet_ptl_setup(struct lnet_portal *ptl, int index) +{ + struct lnet_match_table *mtable; + struct list_head *mhash; + int i; + int j; + + ptl->ptl_mtables = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(struct lnet_match_table)); + if (ptl->ptl_mtables == NULL) { + CERROR("Failed to create match table for portal %d\n", index); + return -ENOMEM; + } + + ptl->ptl_index = index; + INIT_LIST_HEAD(&ptl->ptl_msg_delayed); + INIT_LIST_HEAD(&ptl->ptl_msg_stealing); + spin_lock_init(&ptl->ptl_lock); + cfs_percpt_for_each(mtable, i, ptl->ptl_mtables) { + /* the extra entry is for MEs with ignore bits */ + LIBCFS_CPT_ALLOC(mhash, lnet_cpt_table(), i, + sizeof(*mhash) * (LNET_MT_HASH_SIZE + 1)); + if (mhash == NULL) { + CERROR("Failed to create match hash for portal %d\n", + index); + goto failed; + } + + memset(&mtable->mt_exhausted[0], -1, + sizeof(mtable->mt_exhausted[0]) * + LNET_MT_EXHAUSTED_BMAP); + mtable->mt_mhash = mhash; + for (j = 0; j < LNET_MT_HASH_SIZE + 1; j++) + INIT_LIST_HEAD(&mhash[j]); + + mtable->mt_portal = index; + mtable->mt_cpt = i; + } + + return 0; + failed: + lnet_ptl_cleanup(ptl); + return -ENOMEM; +} + +#define PORTAL_SIZE (offsetof(struct lnet_portal, ptl_mt_maps[LNET_CPT_NUMBER])) +void +lnet_portals_destroy(void) +{ + int i; + + if (the_lnet.ln_portals == NULL) + return; + + for (i = 0; i < the_lnet.ln_nportals; i++) + if (the_lnet.ln_portals[i]) { + lnet_ptl_cleanup(the_lnet.ln_portals[i]); + LIBCFS_FREE(the_lnet.ln_portals[i], PORTAL_SIZE); + } + + CFS_FREE_PTR_ARRAY(the_lnet.ln_portals, the_lnet.ln_nportals); + the_lnet.ln_portals = NULL; +} + +int +lnet_portals_create(void) +{ + int i; 
+ + the_lnet.ln_nportals = MAX_PORTALS; + CFS_ALLOC_PTR_ARRAY(the_lnet.ln_portals, the_lnet.ln_nportals); + if (the_lnet.ln_portals == NULL) { + CERROR("Failed to allocate portals table\n"); + return -ENOMEM; + } + + for (i = 0; i < the_lnet.ln_nportals; i++) { + LIBCFS_ALLOC(the_lnet.ln_portals[i], PORTAL_SIZE); + if (!the_lnet.ln_portals[i] || + lnet_ptl_setup(the_lnet.ln_portals[i], i)) { + lnet_portals_destroy(); + return -ENOMEM; + } + } + + return 0; +} + +/** + * Turn on the lazy portal attribute. Use with caution! + * + * This portal attribute only affects incoming PUT requests to the portal, + * and is off by default. By default, if there's no matching MD for an + * incoming PUT request, it is simply dropped. With the lazy attribute on, + * such requests are queued indefinitely until either a matching MD is + * posted to the portal or the lazy attribute is turned off. + * + * It would prevent dropped requests, however it should be regarded as the + * last line of defense - i.e. users must keep a close watch on active + * buffers on a lazy portal and once it becomes too low post more buffers as + * soon as possible. This is because delayed requests usually have detrimental + * effects on underlying network connections. A few delayed requests often + * suffice to bring an underlying connection to a complete halt, due to flow + * control mechanisms. + * + * There's also a DOS attack risk. If users don't post match-all MDs on a + * lazy portal, a malicious peer can easily stop a service by sending some + * PUT requests with match bits that won't match any MD. A routed server is + * especially vulnerable since the connections to its neighbor routers are + * shared among all clients. + * + * \param portal Index of the portal to enable the lazy attribute on. + * + * \retval 0 On success. + * \retval -EINVAL If \a portal is not a valid index. + */ +int +LNetSetLazyPortal(int portal) +{ + struct lnet_portal *ptl; + + if (portal < 0 || portal >= the_lnet.ln_nportals) + return -EINVAL; + + CDEBUG(D_NET, "Setting portal %d lazy\n", portal); + ptl = the_lnet.ln_portals[portal]; + + lnet_res_lock(LNET_LOCK_EX); + lnet_ptl_lock(ptl); + + lnet_ptl_setopt(ptl, LNET_PTL_LAZY); + + lnet_ptl_unlock(ptl); + lnet_res_unlock(LNET_LOCK_EX); + + return 0; +} +EXPORT_SYMBOL(LNetSetLazyPortal); + +int +lnet_clear_lazy_portal(struct lnet_ni *ni, int portal, char *reason) +{ + struct lnet_portal *ptl; + LIST_HEAD(zombies); + + if (portal < 0 || portal >= the_lnet.ln_nportals) + return -EINVAL; + + ptl = the_lnet.ln_portals[portal]; + + lnet_res_lock(LNET_LOCK_EX); + lnet_ptl_lock(ptl); + + if (!lnet_ptl_is_lazy(ptl)) { + lnet_ptl_unlock(ptl); + lnet_res_unlock(LNET_LOCK_EX); + return 0; + } + + if (ni != NULL) { + struct lnet_msg *msg, *tmp; + + /* grab all messages which are on the NI passed in */ + list_for_each_entry_safe(msg, tmp, &ptl->ptl_msg_delayed, + msg_list) { + if (msg->msg_txni == ni || msg->msg_rxni == ni) + list_move(&msg->msg_list, &zombies); + } + } else { + if (the_lnet.ln_state != LNET_STATE_RUNNING) + CWARN("Active lazy portal %d on exit\n", portal); + else + CDEBUG(D_NET, "clearing portal %d lazy\n", portal); + + /* grab all the blocked messages atomically */ + list_splice_init(&ptl->ptl_msg_delayed, &zombies); + + lnet_ptl_unsetopt(ptl, LNET_PTL_LAZY); + } + + lnet_ptl_unlock(ptl); + lnet_res_unlock(LNET_LOCK_EX); + + lnet_drop_delayed_msg_list(&zombies, reason); + + return 0; +} + +/** + * Turn off the lazy portal attribute. 
Delayed requests on the portal, + * if any, will be all dropped when this function returns. + * + * \param portal Index of the portal to disable the lazy attribute on. + * + * \retval 0 On success. + * \retval -EINVAL If \a portal is not a valid index. + */ +int +LNetClearLazyPortal(int portal) +{ + return lnet_clear_lazy_portal(NULL, portal, + "Clearing lazy portal attr"); +} +EXPORT_SYMBOL(LNetClearLazyPortal); diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c b/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c new file mode 100644 index 0000000000000..90cdc3e2b4dbe --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c @@ -0,0 +1,434 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2015, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include +#include +#include +#include +#include +/* For sys_open & sys_close */ +#include +#include +#include + +#include +#include +#include +#include + +int +lnet_sock_write(struct socket *sock, void *buffer, int nob, int timeout) +{ + int rc; + long jiffies_left = cfs_time_seconds(timeout); + unsigned long then; + + LASSERT(nob > 0); + /* Caller may pass a zero timeout if she thinks the socket buffer is + * empty enough to take the whole message immediately */ + + for (;;) { + struct kvec iov = { + .iov_base = buffer, + .iov_len = nob + }; + struct msghdr msg = { + .msg_flags = (timeout == 0) ? 
MSG_DONTWAIT : 0 + }; + + if (timeout != 0) { + struct sock *sk = sock->sk; + + /* Set send timeout to remaining time */ + lock_sock(sk); + sk->sk_sndtimeo = jiffies_left; + release_sock(sk); + } + + then = jiffies; + rc = kernel_sendmsg(sock, &msg, &iov, 1, nob); + jiffies_left -= jiffies - then; + + if (rc == nob) + return 0; + + if (rc < 0) + return rc; + + if (rc == 0) { + CERROR("Unexpected zero rc\n"); + return -ECONNABORTED; + } + + if (jiffies_left <= 0) + return -EAGAIN; + + buffer = ((char *)buffer) + rc; + nob -= rc; + } + return 0; +} +EXPORT_SYMBOL(lnet_sock_write); + +int +lnet_sock_read(struct socket *sock, void *buffer, int nob, int timeout) +{ + int rc; + long jiffies_left = cfs_time_seconds(timeout); + unsigned long then; + + LASSERT(nob > 0); + LASSERT(jiffies_left > 0); + + for (;;) { + struct kvec iov = { + .iov_base = buffer, + .iov_len = nob + }; + struct msghdr msg = { + .msg_flags = 0 + }; + struct sock *sk = sock->sk; + + /* Set receive timeout to remaining time */ + lock_sock(sk); + sk->sk_rcvtimeo = jiffies_left; + release_sock(sk); + + then = jiffies; + rc = kernel_recvmsg(sock, &msg, &iov, 1, nob, 0); + jiffies_left -= jiffies - then; + + if (rc < 0) + return rc; + + if (rc == 0) + return -ECONNRESET; + + buffer = ((char *)buffer) + rc; + nob -= rc; + + if (nob == 0) + return 0; + + if (jiffies_left <= 0) + return -ETIMEDOUT; + } +} +EXPORT_SYMBOL(lnet_sock_read); + +int choose_ipv4_src(__u32 *ret, int interface, __u32 dst_ipaddr, struct net *ns) +{ + struct net_device *dev; + struct in_device *in_dev; + int err; + DECLARE_CONST_IN_IFADDR(ifa); + + rcu_read_lock(); + dev = dev_get_by_index_rcu(ns, interface); + err = -EINVAL; + if (!dev || !(dev->flags & IFF_UP)) + goto out; + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) + goto out; + err = -ENOENT; + in_dev_for_each_ifa_rcu(ifa, in_dev) { + if (err || + ((dst_ipaddr ^ ntohl(ifa->ifa_local)) + & ntohl(ifa->ifa_mask)) == 0) { + /* This address at least as good as what we + * already have + */ + *ret = ntohl(ifa->ifa_local); + err = 0; + } + } + endfor_ifa(in_dev); +out: + rcu_read_unlock(); + return err; +} +EXPORT_SYMBOL(choose_ipv4_src); + +static struct socket * +lnet_sock_create(int interface, struct sockaddr *remaddr, + int local_port, struct net *ns) +{ + struct socket *sock; + int rc; + int family; + + family = AF_INET6; + if (remaddr) + family = remaddr->sa_family; +retry: +#ifdef HAVE_SOCK_CREATE_KERN_USE_NET + rc = sock_create_kern(ns, family, SOCK_STREAM, 0, &sock); +#else + rc = sock_create_kern(family, SOCK_STREAM, 0, &sock); +#endif + if (rc == -EAFNOSUPPORT && family == AF_INET6 && !remaddr) { + family = AF_INET; + goto retry; + } + + if (rc) { + CERROR("Can't create socket: %d\n", rc); + return ERR_PTR(rc); + } + + sock->sk->sk_reuseport = 1; + + if (interface >= 0 || local_port != 0) { + struct sockaddr_storage locaddr = {}; + + switch (family) { + case AF_INET: { + struct sockaddr_in *sin = (void *)&locaddr; + + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = INADDR_ANY; + if (interface >= 0 && remaddr) { + struct sockaddr_in *rem = (void *)remaddr; + __u32 ip; + + rc = choose_ipv4_src(&ip, + interface, + ntohl(rem->sin_addr.s_addr), + ns); + if (rc) + goto failed; + sin->sin_addr.s_addr = htonl(ip); + } + sin->sin_port = htons(local_port); + break; + } +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: { + struct sockaddr_in6 *sin6 = (void *)&locaddr; + int val = 0; + + sin6->sin6_family = AF_INET6; + sin6->sin6_addr = in6addr_any; + + /* Make sure we get both IPv4 and IPv6 connections. 
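+			 * (An AF_INET6 socket with IPV6_V6ONLY cleared also
+			 * accepts IPv4 peers via v4-mapped addresses.)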
+			 * This is the default, but it can be overridden so we
+			 * force it back.
+			 */
+#ifdef HAVE_KERNEL_SETSOCKOPT
+			kernel_setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY,
+					  (char *) &val, sizeof(val));
+#elif defined(_LINUX_SOCKPTR_H)
+			/* sockptr_t was introduced around
+			 * v5.8-rc4-1952-ga7b75c5a8c41 and allows a
+			 * kernel address to be passed to ->setsockopt
+			 */
+			if (ipv6_only_sock(sock->sk)) {
+				sockptr_t optval = KERNEL_SOCKPTR(&val);
+
+				sock->ops->setsockopt(sock,
+						IPPROTO_IPV6, IPV6_V6ONLY,
+						optval, sizeof(val));
+			}
+#else
+			/* From v5.7-rc6-2614-g5a892ff2facb, when
+			 * kernel_setsockopt() was removed, until
+			 * sockptr_t (above) there is no clean way to
+			 * pass a kernel address to setsockopt. We could
+			 * use get_fs()/set_fs(), but in this particular
+			 * situation there is an easier way. It depends
+			 * on the fact that, at least for these few
+			 * kernels, a NULL address to ipv6_setsockopt()
+			 * is treated like the address of a zero.
+			 */
+			if (ipv6_only_sock(sock->sk) && !val) {
+				void *optval = NULL;
+
+				sock->ops->setsockopt(sock,
+						IPPROTO_IPV6, IPV6_V6ONLY,
+						optval, sizeof(val));
+			}
+#endif /* HAVE_KERNEL_SETSOCKOPT */
+
+			if (interface >= 0 && remaddr) {
+				struct sockaddr_in6 *rem = (void *)remaddr;
+
+				ipv6_dev_get_saddr(ns,
+						   dev_get_by_index(ns,
+								    interface),
+						   &rem->sin6_addr, 0,
+						   &sin6->sin6_addr);
+			}
+			sin6->sin6_port = htons(local_port);
+			break;
+		}
+#endif /* IS_ENABLED(CONFIG_IPV6) */
+		}
+		rc = kernel_bind(sock, (struct sockaddr *)&locaddr,
+				 sizeof(locaddr));
+		if (rc == -EADDRINUSE) {
+			CDEBUG(D_NET, "Port %d already in use\n", local_port);
+			goto failed;
+		}
+		if (rc != 0) {
+			CERROR("Error trying to bind to port %d: %d\n",
+			       local_port, rc);
+			goto failed;
+		}
+	}
+	return sock;
+
+failed:
+	sock_release(sock);
+	return ERR_PTR(rc);
+}
+
+void
+lnet_sock_setbuf(struct socket *sock, int txbufsize, int rxbufsize)
+{
+	struct sock *sk = sock->sk;
+
+	if (txbufsize != 0) {
+		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+		sk->sk_sndbuf = txbufsize;
+		sk->sk_write_space(sk);
+	}
+
+	if (rxbufsize != 0) {
+		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+		sk->sk_rcvbuf = rxbufsize;
+	}
+}
+EXPORT_SYMBOL(lnet_sock_setbuf);
+
+int
+lnet_sock_getaddr(struct socket *sock, bool remote,
+		  struct sockaddr_storage *peer)
+{
+	int rc;
+#ifndef HAVE_KERN_SOCK_GETNAME_2ARGS
+	int len = sizeof(*peer);
+#endif
+
+	if (remote)
+		rc = lnet_kernel_getpeername(sock,
+					     (struct sockaddr *)peer, &len);
+	else
+		rc = lnet_kernel_getsockname(sock,
+					     (struct sockaddr *)peer, &len);
+	if (rc < 0) {
+		CERROR("Error %d getting sock %s IP/port\n",
+		       rc, remote ?
"peer" : "local"); + return rc; + } + if (peer->ss_family == AF_INET6) { + struct sockaddr_in6 *in6 = (void *)peer; + struct sockaddr_in *in = (void *)peer; + short port = in6->sin6_port; + + if (ipv6_addr_v4mapped(&in6->sin6_addr)) { + /* Pretend it is a v4 socket */ + memset(in, 0, sizeof(*in)); + in->sin_family = AF_INET; + in->sin_port = port; + memcpy(&in->sin_addr, &in6->sin6_addr.s6_addr32[3], 4); + } + } + return 0; +} +EXPORT_SYMBOL(lnet_sock_getaddr); + +void lnet_sock_getbuf(struct socket *sock, int *txbufsize, int *rxbufsize) +{ + if (txbufsize != NULL) + *txbufsize = sock->sk->sk_sndbuf; + + if (rxbufsize != NULL) + *rxbufsize = sock->sk->sk_rcvbuf; +} +EXPORT_SYMBOL(lnet_sock_getbuf); + +struct socket * +lnet_sock_listen(int local_port, int backlog, struct net *ns) +{ + struct socket *sock; + int rc; + + sock = lnet_sock_create(-1, NULL, local_port, ns); + if (IS_ERR(sock)) { + rc = PTR_ERR(sock); + if (rc == -EADDRINUSE) + CERROR("Can't create socket: port %d already in use\n", + local_port); + return ERR_PTR(rc); + } + + rc = kernel_listen(sock, backlog); + if (rc == 0) + return sock; + + CERROR("Can't set listen backlog %d: %d\n", backlog, rc); + sock_release(sock); + return ERR_PTR(rc); +} + +struct socket * +lnet_sock_connect(int interface, int local_port, + struct sockaddr *peeraddr, + struct net *ns) +{ + struct socket *sock; + int rc; + + sock = lnet_sock_create(interface, peeraddr, local_port, ns); + if (IS_ERR(sock)) + return sock; + + rc = kernel_connect(sock, peeraddr, sizeof(struct sockaddr_in6), 0); + if (rc == 0) + return sock; + + /* EADDRNOTAVAIL probably means we're already connected to the same + * peer/port on the same local port on a differently typed + * connection. Let our caller retry with a different local + * port... */ + + CDEBUG_LIMIT(rc == -EADDRNOTAVAIL ? 
D_NET : D_NETERROR,
+		     "Error %d connecting %d -> %pISp\n", rc,
+		     local_port, peeraddr);
+
+	sock_release(sock);
+	return ERR_PTR(rc);
+}
diff --git a/drivers/staging/lustrefsx/lnet/lnet/lnet_rdma.c b/drivers/staging/lustrefsx/lnet/lnet/lnet_rdma.c
new file mode 100644
index 0000000000000..c5c9d9ffe8b50
--- /dev/null
+++ b/drivers/staging/lustrefsx/lnet/lnet/lnet_rdma.c
@@ -0,0 +1,208 @@
+#include
+#include
+#include
+
+#define ERROR_PRINT_DEADLINE 3600
+
+atomic_t nvfs_shutdown = ATOMIC_INIT(1);
+struct nvfs_dma_rw_ops *nvfs_ops = NULL;
+struct percpu_counter nvfs_n_ops;
+
+static inline long nvfs_count_ops(void)
+{
+	return percpu_counter_sum(&nvfs_n_ops);
+}
+
+static struct nvfs_dma_rw_ops *nvfs_get_ops(void)
+{
+	if (!nvfs_ops || atomic_read(&nvfs_shutdown))
+		return NULL;
+
+	percpu_counter_inc(&nvfs_n_ops);
+
+	return nvfs_ops;
+}
+
+static inline void nvfs_put_ops(void)
+{
+	percpu_counter_dec(&nvfs_n_ops);
+}
+
+static inline bool nvfs_check_feature_set(struct nvfs_dma_rw_ops *ops)
+{
+	bool supported = true;
+	static time64_t last_printed;
+
+	if (unlikely(!NVIDIA_FS_CHECK_FT_SGLIST_PREP(ops))) {
+		if ((ktime_get_seconds() - last_printed) > ERROR_PRINT_DEADLINE)
+			CDEBUG(D_CONSOLE,
+			       "NVFS sg list preparation callback missing\n");
+		supported = false;
+	}
+	if (unlikely(!NVIDIA_FS_CHECK_FT_SGLIST_DMA(ops))) {
+		if ((ktime_get_seconds() - last_printed) > ERROR_PRINT_DEADLINE)
+			CDEBUG(D_CONSOLE,
+			       "NVFS DMA mapping callbacks missing\n");
+		supported = false;
+	}
+	if (unlikely(!NVIDIA_FS_CHECK_FT_GPU_PAGE(ops))) {
+		if ((ktime_get_seconds() - last_printed) > ERROR_PRINT_DEADLINE)
+			CDEBUG(D_CONSOLE,
+			       "NVFS page identification callback missing\n");
+		supported = false;
+	}
+	if (unlikely(!NVIDIA_FS_CHECK_FT_DEVICE_PRIORITY(ops))) {
+		if ((ktime_get_seconds() - last_printed) > ERROR_PRINT_DEADLINE)
+			CDEBUG(D_CONSOLE,
+			       "NVFS device priority callback missing\n");
+		supported = false;
+	}
+
+	if (unlikely(!supported &&
+		     ((ktime_get_seconds() - last_printed) > ERROR_PRINT_DEADLINE)))
+		last_printed = ktime_get_seconds();
+	else if (supported)
+		last_printed = 0;
+
+	return supported;
+}
+
+int REGISTER_FUNC(struct nvfs_dma_rw_ops *ops)
+{
+	if (!ops || !nvfs_check_feature_set(ops))
+		return -EINVAL;
+
+	nvfs_ops = ops;
+	(void)percpu_counter_init(&nvfs_n_ops, 0, GFP_KERNEL);
+	atomic_set(&nvfs_shutdown, 0);
+	CDEBUG(D_NET, "registering nvfs %p\n", ops);
+	return 0;
+}
+EXPORT_SYMBOL(REGISTER_FUNC);
+
+void UNREGISTER_FUNC(void)
+{
+	(void)atomic_cmpxchg(&nvfs_shutdown, 0, 1);
+	do {
+		CDEBUG(D_NET, "Attempting to de-register nvfs: %ld\n",
+		       nvfs_count_ops());
+		msleep(NVFS_HOLD_TIME_MS);
+	} while (nvfs_count_ops());
+	nvfs_ops = NULL;
+	percpu_counter_destroy(&nvfs_n_ops);
+}
+EXPORT_SYMBOL(UNREGISTER_FUNC);
+
+unsigned int
+lnet_get_dev_prio(struct device *dev, unsigned int dev_idx)
+{
+	unsigned int dev_prio = UINT_MAX;
+	struct nvfs_dma_rw_ops *nvfs_ops;
+
+	if (!dev)
+		return dev_prio;
+
+	nvfs_ops = nvfs_get_ops();
+	if (!nvfs_ops)
+		return dev_prio;
+
+	dev_prio = nvfs_ops->nvfs_device_priority(dev, dev_idx);
+
+	nvfs_put_ops();
+	return dev_prio;
+}
+EXPORT_SYMBOL(lnet_get_dev_prio);
+
+int lnet_rdma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
+			   int nents, enum dma_data_direction direction)
+{
+	struct nvfs_dma_rw_ops *nvfs_ops = nvfs_get_ops();
+
+	if (nvfs_ops) {
+		int count;
+
+		count = nvfs_ops->nvfs_dma_map_sg_attrs(dev,
+				sg, nents, direction,
+				DMA_ATTR_NO_WARN);
+
+		if (unlikely(count == NVFS_IO_ERR)) {
+			nvfs_put_ops();
+			return -EIO;
+		}
+
+		if
(unlikely(count == NVFS_CPU_REQ)) + nvfs_put_ops(); + else + return count; + } + + return 0; +} +EXPORT_SYMBOL(lnet_rdma_map_sg_attrs); + +int lnet_rdma_unmap_sg(struct device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + struct nvfs_dma_rw_ops *nvfs_ops = nvfs_get_ops(); + + if (nvfs_ops) { + int count; + + count = nvfs_ops->nvfs_dma_unmap_sg(dev, sg, + nents, direction); + + /* drop the count we got by calling nvfs_get_ops() */ + nvfs_put_ops(); + + if (count) { + nvfs_put_ops(); + return count; + } + } + + return 0; +} +EXPORT_SYMBOL(lnet_rdma_unmap_sg); + +bool +lnet_is_rdma_only_page(struct page *page) +{ + bool found = false; + struct nvfs_dma_rw_ops *nvfs_ops; + + if (!page) + return found; + + nvfs_ops = nvfs_get_ops(); + if (!nvfs_ops) + return found; + + if (!nvfs_ops->nvfs_is_gpu_page(page)) + goto out; + + found = true; + +out: + nvfs_put_ops(); + return found; +} +EXPORT_SYMBOL(lnet_is_rdma_only_page); + +unsigned int +lnet_get_dev_idx(struct page *page) +{ + unsigned int dev_idx = UINT_MAX; + struct nvfs_dma_rw_ops *nvfs_ops; + + nvfs_ops = nvfs_get_ops(); + if (!nvfs_ops) + return dev_idx; + + dev_idx = nvfs_ops->nvfs_gpu_index(page); + + nvfs_put_ops(); + return dev_idx; +} +EXPORT_SYMBOL(lnet_get_dev_idx); + diff --git a/drivers/staging/lustrefsx/lnet/lnet/lo.c b/drivers/staging/lustrefsx/lnet/lnet/lo.c new file mode 100644 index 0000000000000..d4c9ed101e803 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/lo.c @@ -0,0 +1,92 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include + +static int +lolnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) +{ + LASSERT(!lntmsg->msg_routing); + LASSERT(!lntmsg->msg_target_is_router); + + return lnet_parse(ni, &lntmsg->msg_hdr, &ni->ni_nid, lntmsg, 0); +} + +static int +lolnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, + int delayed, unsigned int niov, + struct bio_vec *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen) +{ + struct lnet_msg *sendmsg = private; + + if (lntmsg) { /* not discarding */ + lnet_copy_kiov2kiov(niov, kiov, offset, + sendmsg->msg_niov, + sendmsg->msg_kiov, + sendmsg->msg_offset, mlen); + + lnet_finalize(lntmsg, 0); + } + + lnet_finalize(sendmsg, 0); + return 0; +} + +static int lolnd_instanced; + +static void +lolnd_shutdown(struct lnet_ni *ni) +{ + CDEBUG (D_NET, "shutdown\n"); + LASSERT(lolnd_instanced); + + lolnd_instanced = 0; +} + +static int +lolnd_startup(struct lnet_ni *ni) +{ + LASSERT (ni->ni_net->net_lnd == &the_lolnd); + LASSERT (!lolnd_instanced); + lolnd_instanced = 1; + + return (0); +} + +const struct lnet_lnd the_lolnd = { + .lnd_type = LOLND, + .lnd_startup = lolnd_startup, + .lnd_shutdown = lolnd_shutdown, + .lnd_send = lolnd_send, + .lnd_recv = lolnd_recv +}; diff --git a/drivers/staging/lustrefsx/lnet/lnet/module.c b/drivers/staging/lustrefsx/lnet/lnet/module.c new file mode 100644 index 0000000000000..e4fe3f8aa2381 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/module.c @@ -0,0 +1,277 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include + +static int config_on_load = 0; +module_param(config_on_load, int, 0444); +MODULE_PARM_DESC(config_on_load, "configure network at module load"); + +static DEFINE_MUTEX(lnet_config_mutex); + +static int +lnet_configure(void *arg) +{ + /* 'arg' only there so I can be passed to cfs_create_thread() */ + int rc = 0; + + mutex_lock(&lnet_config_mutex); + + if (!the_lnet.ln_niinit_self) { + rc = try_module_get(THIS_MODULE); + + if (rc != 1) + goto out; + + rc = LNetNIInit(LNET_PID_LUSTRE); + if (rc >= 0) { + the_lnet.ln_niinit_self = 1; + rc = 0; + } else { + module_put(THIS_MODULE); + } + } + +out: + mutex_unlock(&lnet_config_mutex); + return rc; +} + +static int +lnet_unconfigure (void) +{ + int refcount; + + mutex_lock(&lnet_config_mutex); + + if (the_lnet.ln_niinit_self) { + the_lnet.ln_niinit_self = 0; + LNetNIFini(); + module_put(THIS_MODULE); + } + + mutex_lock(&the_lnet.ln_api_mutex); + refcount = the_lnet.ln_refcount; + mutex_unlock(&the_lnet.ln_api_mutex); + + mutex_unlock(&lnet_config_mutex); + + return (refcount == 0) ? 0 : -EBUSY; +} + +static int +lnet_dyn_configure_net(struct libcfs_ioctl_hdr *hdr) +{ + struct lnet_ioctl_config_data *conf = + (struct lnet_ioctl_config_data *)hdr; + int rc; + + if (conf->cfg_hdr.ioc_len < sizeof(*conf)) + return -EINVAL; + + mutex_lock(&lnet_config_mutex); + if (the_lnet.ln_niinit_self) + rc = lnet_dyn_add_net(conf); + else + rc = -EINVAL; + mutex_unlock(&lnet_config_mutex); + + return rc; +} + +static int +lnet_dyn_unconfigure_net(struct libcfs_ioctl_hdr *hdr) +{ + struct lnet_ioctl_config_data *conf = + (struct lnet_ioctl_config_data *) hdr; + int rc; + + if (conf->cfg_hdr.ioc_len < sizeof(*conf)) + return -EINVAL; + + mutex_lock(&lnet_config_mutex); + if (the_lnet.ln_niinit_self) + rc = lnet_dyn_del_net(conf->cfg_net); + else + rc = -EINVAL; + mutex_unlock(&lnet_config_mutex); + + return rc; +} + +static int +lnet_dyn_configure_ni(struct libcfs_ioctl_hdr *hdr) +{ + struct lnet_ioctl_config_ni *conf = + (struct lnet_ioctl_config_ni *)hdr; + int rc; + + if (conf->lic_cfg_hdr.ioc_len < sizeof(*conf)) + return -EINVAL; + + mutex_lock(&lnet_config_mutex); + if (the_lnet.ln_niinit_self) + rc = lnet_dyn_add_ni(conf); + else + rc = -EINVAL; + mutex_unlock(&lnet_config_mutex); + + return rc; +} + +static int +lnet_dyn_unconfigure_ni(struct libcfs_ioctl_hdr *hdr) +{ + struct lnet_ioctl_config_ni *conf = + (struct lnet_ioctl_config_ni *) hdr; + int rc; + + if (conf->lic_cfg_hdr.ioc_len < sizeof(*conf)) + return -EINVAL; + + mutex_lock(&lnet_config_mutex); + if (the_lnet.ln_niinit_self) + rc = lnet_dyn_del_ni(conf); + else + rc = -EINVAL; + mutex_unlock(&lnet_config_mutex); + + return rc; +} + +static int +lnet_ioctl(struct notifier_block *nb, + unsigned long cmd, void *vdata) +{ + struct libcfs_ioctl_hdr *hdr = vdata; + int rc; + + switch (cmd) { + case IOC_LIBCFS_CONFIGURE: { + struct libcfs_ioctl_data *data = + (struct libcfs_ioctl_data *)hdr; + + if (data->ioc_hdr.ioc_len < sizeof(*data)) { + rc = -EINVAL; + } else { + the_lnet.ln_nis_from_mod_params = data->ioc_flags; + rc = lnet_configure(NULL); + } + break; + } + + case IOC_LIBCFS_UNCONFIGURE: + rc = lnet_unconfigure(); + break; + + case IOC_LIBCFS_ADD_NET: + rc = lnet_dyn_configure_net(hdr); + break; + + case IOC_LIBCFS_DEL_NET: + rc = lnet_dyn_unconfigure_net(hdr); + break; + + case IOC_LIBCFS_ADD_LOCAL_NI: + rc = lnet_dyn_configure_ni(hdr); + break; + + case 
IOC_LIBCFS_DEL_LOCAL_NI:
+		rc = lnet_dyn_unconfigure_ni(hdr);
+		break;
+
+	default:
+		/* Passing LNET_PID_ANY only gives me a ref if the net is up
+		 * already; I'll need it to ensure the net can't go down while
+		 * I'm called into it */
+		rc = LNetNIInit(LNET_PID_ANY);
+		if (rc >= 0) {
+			rc = LNetCtl(cmd, hdr);
+			LNetNIFini();
+		}
+		break;
+	}
+	return notifier_from_ioctl_errno(rc);
+}
+
+static struct notifier_block lnet_ioctl_handler = {
+	.notifier_call = lnet_ioctl,
+};
+
+static int __init lnet_init(void)
+{
+	int rc;
+	ENTRY;
+
+	rc = lnet_lib_init();
+	if (rc != 0) {
+		CERROR("lnet_lib_init: error %d\n", rc);
+		RETURN(rc);
+	}
+
+	if (live_router_check_interval != INT_MIN ||
+	    dead_router_check_interval != INT_MIN)
+		LCONSOLE_WARN("live_router_check_interval and dead_router_check_interval have been deprecated. Use alive_router_check_interval instead. Ignoring these deprecated parameters.\n");
+
+	rc = blocking_notifier_chain_register(&libcfs_ioctl_list,
+					      &lnet_ioctl_handler);
+	LASSERT(rc == 0);
+
+	if (config_on_load) {
+		/* Have to schedule a separate thread to avoid deadlocking
+		 * in modload */
+		(void)kthread_run(lnet_configure, NULL, "lnet_initd");
+	}
+
+	RETURN(0);
+}
+
+static void __exit lnet_exit(void)
+{
+	int rc;
+
+	rc = blocking_notifier_chain_unregister(&libcfs_ioctl_list,
+						&lnet_ioctl_handler);
+	LASSERT(rc == 0);
+
+	lnet_lib_exit();
+}
+
+MODULE_AUTHOR("OpenSFS, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Networking layer");
+MODULE_VERSION(LNET_VERSION);
+MODULE_LICENSE("GPL");
+
+module_init(lnet_init);
+module_exit(lnet_exit);
diff --git a/drivers/staging/lustrefsx/lnet/lnet/net_fault.c b/drivers/staging/lustrefsx/lnet/lnet/net_fault.c
new file mode 100644
index 0000000000000..c2f81fb150887
--- /dev/null
+++ b/drivers/staging/lustrefsx/lnet/lnet/net_fault.c
@@ -0,0 +1,1114 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2014, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lnet/lnet/net_fault.c
+ *
+ * Lustre network fault simulation
+ *
+ * Author: liang.zhen@intel.com
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/random.h>
+#include <lnet/lib-lnet.h>
+#include <uapi/linux/lnet/lnetctl.h>
+
+#define LNET_MSG_MASK		(LNET_PUT_BIT | LNET_ACK_BIT | \
+				 LNET_GET_BIT | LNET_REPLY_BIT)
+
+struct lnet_drop_rule {
+	/** link chain on the_lnet.ln_drop_rules */
+	struct list_head	dr_link;
+	/** attributes of this rule */
+	struct lnet_fault_attr	dr_attr;
+	/** lock to protect \a dr_drop_at and \a dr_stat */
+	spinlock_t		dr_lock;
+	/**
+	 * the message sequence to drop, which means the message is dropped
+	 * when dr_stat.fs_count == dr_drop_at
+	 */
+	unsigned long		dr_drop_at;
+	/**
+	 * seconds to drop the next message, mutually exclusive with
+	 * dr_drop_at
+	 */
+	time64_t		dr_drop_time;
+	/** baseline to calculate dr_drop_time */
+	time64_t		dr_time_base;
+	/** statistic of dropped messages */
+	struct lnet_fault_stat	dr_stat;
+};
+
+static bool
+lnet_fault_nid_match(lnet_nid_t nid, lnet_nid_t msg_nid)
+{
+	if (nid == msg_nid || nid == LNET_NID_ANY)
+		return true;
+
+	if (LNET_NIDNET(nid) != LNET_NIDNET(msg_nid))
+		return false;
+
+	/* 255.255.255.255@net is wildcard for all addresses in a network */
+	return LNET_NIDADDR(nid) == LNET_NIDADDR(LNET_NID_ANY);
+}
+
+static bool
+lnet_fault_attr_match(struct lnet_fault_attr *attr, lnet_nid_t src,
+		      lnet_nid_t local_nid, lnet_nid_t dst,
+		      unsigned int type, unsigned int portal)
+{
+	if (!lnet_fault_nid_match(attr->fa_src, src) ||
+	    !lnet_fault_nid_match(attr->fa_dst, dst) ||
+	    !lnet_fault_nid_match(attr->fa_local_nid, local_nid))
+		return false;
+
+	if (!(attr->fa_msg_mask & BIT(type)))
+		return false;
+
+	/* NB: ACK and REPLY have no portal, but they should have been
+	 * rejected by message mask */
+	if (attr->fa_ptl_mask != 0 && /* has portal filter */
+	    !(attr->fa_ptl_mask & (1ULL << portal)))
+		return false;
+
+	return true;
+}
+
+static int
+lnet_fault_attr_validate(struct lnet_fault_attr *attr)
+{
+	if (attr->fa_msg_mask == 0)
+		attr->fa_msg_mask = LNET_MSG_MASK; /* all message types */
+
+	if (attr->fa_ptl_mask == 0) /* no portal filter */
+		return 0;
+
+	/* NB: only PUT and GET can be filtered if portal filter has been set */
+	attr->fa_msg_mask &= LNET_GET_BIT | LNET_PUT_BIT;
+	if (attr->fa_msg_mask == 0) {
+		CDEBUG(D_NET, "can't find valid message type bits %x\n",
+		       attr->fa_msg_mask);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static void
+lnet_fault_stat_inc(struct lnet_fault_stat *stat, unsigned int type)
+{
+	/* NB: fs_counter is NOT updated by this function */
+	switch (type) {
+	case LNET_MSG_PUT:
+		stat->fs_put++;
+		return;
+	case LNET_MSG_ACK:
+		stat->fs_ack++;
+		return;
+	case LNET_MSG_GET:
+		stat->fs_get++;
+		return;
+	case LNET_MSG_REPLY:
+		stat->fs_reply++;
+		return;
+	}
+}
+
+/**
+ * LNet message drop simulation
+ */
+
+/**
+ * Add a new drop rule to LNet
+ * There is no check for duplicate rules; every rule is checked against each
+ * incoming message.
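+ * A rule is either rate based (\a da_rate: drop one message, picked at a
+ * random position, in every window of da_rate messages) or time based
+ * (\a da_interval: drop the first message to arrive after a random point
+ * in every da_interval-second window); exactly one of the two must be set.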
+ */
+static int
+lnet_drop_rule_add(struct lnet_fault_attr *attr)
+{
+	struct lnet_drop_rule *rule;
+	ENTRY;
+
+	if (!((attr->u.drop.da_rate == 0) ^ (attr->u.drop.da_interval == 0))) {
+		CDEBUG(D_NET,
+		       "please provide either drop rate or drop interval, "
+		       "but not both at the same time %d/%d\n",
+		       attr->u.drop.da_rate, attr->u.drop.da_interval);
+		RETURN(-EINVAL);
+	}
+
+	if (lnet_fault_attr_validate(attr) != 0)
+		RETURN(-EINVAL);
+
+	CFS_ALLOC_PTR(rule);
+	if (rule == NULL)
+		RETURN(-ENOMEM);
+
+	spin_lock_init(&rule->dr_lock);
+
+	rule->dr_attr = *attr;
+	if (attr->u.drop.da_interval != 0) {
+		rule->dr_time_base = ktime_get_seconds() + attr->u.drop.da_interval;
+		rule->dr_drop_time = ktime_get_seconds() +
+				     get_random_u32_below(attr->u.drop.da_interval);
+	} else {
+		rule->dr_drop_at = get_random_u32_below(attr->u.drop.da_rate);
+	}
+
+	lnet_net_lock(LNET_LOCK_EX);
+	list_add(&rule->dr_link, &the_lnet.ln_drop_rules);
+	lnet_net_unlock(LNET_LOCK_EX);
+
+	CDEBUG(D_NET, "Added drop rule: src %s, dst %s, rate %d, interval %d\n",
+	       libcfs_nid2str(attr->fa_src), libcfs_nid2str(attr->fa_dst),
+	       attr->u.drop.da_rate, attr->u.drop.da_interval);
+	RETURN(0);
+}
+
+/**
+ * Remove matched drop rules from LNet; all rules that match \a src and
+ * \a dst will be removed.
+ * If \a src is zero, then all rules that have \a dst as destination will be
+ * removed.
+ * If \a dst is zero, then all rules that have \a src as source will be
+ * removed.
+ * If both of them are zero, all rules will be removed.
+ */
+static int
+lnet_drop_rule_del(lnet_nid_t src, lnet_nid_t dst)
+{
+	struct lnet_drop_rule *rule;
+	struct lnet_drop_rule *tmp;
+	LIST_HEAD(zombies);
+	int n = 0;
+	ENTRY;
+
+	lnet_net_lock(LNET_LOCK_EX);
+	list_for_each_entry_safe(rule, tmp, &the_lnet.ln_drop_rules, dr_link) {
+		if (rule->dr_attr.fa_src != src && src != 0)
+			continue;
+
+		if (rule->dr_attr.fa_dst != dst && dst != 0)
+			continue;
+
+		list_move(&rule->dr_link, &zombies);
+	}
+	lnet_net_unlock(LNET_LOCK_EX);
+
+	list_for_each_entry_safe(rule, tmp, &zombies, dr_link) {
+		CDEBUG(D_NET, "Remove drop rule: src %s->dst: %s (1/%d, %d)\n",
+		       libcfs_nid2str(rule->dr_attr.fa_src),
+		       libcfs_nid2str(rule->dr_attr.fa_dst),
+		       rule->dr_attr.u.drop.da_rate,
+		       rule->dr_attr.u.drop.da_interval);
+
+		list_del(&rule->dr_link);
+		CFS_FREE_PTR(rule);
+		n++;
+	}
+
+	RETURN(n);
+}
+
+/**
+ * List the drop rule at position \a pos
+ */
+static int
+lnet_drop_rule_list(int pos, struct lnet_fault_attr *attr,
+		    struct lnet_fault_stat *stat)
+{
+	struct lnet_drop_rule *rule;
+	int cpt;
+	int i = 0;
+	int rc = -ENOENT;
+	ENTRY;
+
+	cpt = lnet_net_lock_current();
+	list_for_each_entry(rule, &the_lnet.ln_drop_rules, dr_link) {
+		if (i++ < pos)
+			continue;
+
+		spin_lock(&rule->dr_lock);
+		*attr = rule->dr_attr;
+		*stat = rule->dr_stat;
+		spin_unlock(&rule->dr_lock);
+		rc = 0;
+		break;
+	}
+
+	lnet_net_unlock(cpt);
+	RETURN(rc);
+}
+
+/**
+ * reset counters for all drop rules
+ */
+static void
+lnet_drop_rule_reset(void)
+{
+	struct lnet_drop_rule *rule;
+	int cpt;
+	ENTRY;
+
+	cpt = lnet_net_lock_current();
+
+	list_for_each_entry(rule, &the_lnet.ln_drop_rules, dr_link) {
+		struct lnet_fault_attr *attr = &rule->dr_attr;
+
+		spin_lock(&rule->dr_lock);
+
+		memset(&rule->dr_stat, 0, sizeof(rule->dr_stat));
+		if (attr->u.drop.da_rate != 0) {
+			rule->dr_drop_at = get_random_u32_below(attr->u.drop.da_rate);
+		} else {
+			rule->dr_drop_time = ktime_get_seconds() +
+					     get_random_u32_below(attr->u.drop.da_interval);
+			rule->dr_time_base = ktime_get_seconds() + attr->u.drop.da_interval;
+		}
+		
spin_unlock(&rule->dr_lock);
+	}
+
+	lnet_net_unlock(cpt);
+	EXIT;
+}
+
+static void
+lnet_fault_match_health(enum lnet_msg_hstatus *hstatus, __u32 mask)
+{
+	int choice;
+	int delta;
+	int best_delta;
+	int i;
+
+	/* assign a random failure */
+	choice = get_random_u32_below(LNET_MSG_STATUS_END - LNET_MSG_STATUS_OK);
+	if (choice == 0)
+		choice++;
+
+	if (mask == HSTATUS_RANDOM) {
+		*hstatus = choice;
+		return;
+	}
+
+	if (mask & BIT(choice)) {
+		*hstatus = choice;
+		return;
+	}
+
+	/* round to the closest ON bit */
+	i = HSTATUS_END;
+	best_delta = HSTATUS_END;
+	while (i > 0) {
+		if (mask & BIT(i)) {
+			delta = choice - i;
+			if (delta < 0)
+				delta *= -1;
+			if (delta < best_delta) {
+				best_delta = delta;
+				choice = i;
+			}
+		}
+		i--;
+	}
+
+	*hstatus = choice;
+}
+
+/**
+ * check source/destination NID, portal, message type and drop rate,
+ * decide whether this message should be dropped or not
+ */
+static bool
+drop_rule_match(struct lnet_drop_rule *rule, lnet_nid_t src,
+		lnet_nid_t local_nid, lnet_nid_t dst,
+		unsigned int type, unsigned int portal,
+		enum lnet_msg_hstatus *hstatus)
+{
+	struct lnet_fault_attr *attr = &rule->dr_attr;
+	bool drop;
+
+	if (!lnet_fault_attr_match(attr, src, local_nid, dst, type, portal))
+		return false;
+
+	if (attr->u.drop.da_drop_all) {
+		CDEBUG(D_NET, "set to drop all messages\n");
+		drop = true;
+		goto drop_matched;
+	}
+
+	/*
+	 * if we're trying to match a health status error but it hasn't
+	 * been set in the rule, then don't match
+	 */
+	if ((hstatus && !attr->u.drop.da_health_error_mask) ||
+	    (!hstatus && attr->u.drop.da_health_error_mask))
+		return false;
+
+	/* match this rule, check drop rate now */
+	spin_lock(&rule->dr_lock);
+	if (attr->u.drop.da_random) {
+		int value = get_random_u32_below(attr->u.drop.da_interval);
+
+		drop = value >= (attr->u.drop.da_interval / 2);
+	} else if (rule->dr_drop_time != 0) { /* time based drop */
+		time64_t now = ktime_get_seconds();
+
+		rule->dr_stat.fs_count++;
+		drop = now >= rule->dr_drop_time;
+		if (drop) {
+			if (now > rule->dr_time_base)
+				rule->dr_time_base = now;
+
+			rule->dr_drop_time = rule->dr_time_base +
+					     get_random_u32_below(attr->u.drop.da_interval);
+			rule->dr_time_base += attr->u.drop.da_interval;
+
+			CDEBUG(D_NET, "Drop Rule %s->%s: next drop : %lld\n",
+			       libcfs_nid2str(attr->fa_src),
+			       libcfs_nid2str(attr->fa_dst),
+			       rule->dr_drop_time);
+		}
+
+	} else { /* rate based drop */
+		__u64 count;
+
+		drop = rule->dr_stat.fs_count++ == rule->dr_drop_at;
+		count = rule->dr_stat.fs_count;
+		if (do_div(count, attr->u.drop.da_rate) == 0) {
+			rule->dr_drop_at = rule->dr_stat.fs_count +
+					   get_random_u32_below(attr->u.drop.da_rate);
+			CDEBUG(D_NET, "Drop Rule %s->%s: next drop: %lu\n",
+			       libcfs_nid2str(attr->fa_src),
+			       libcfs_nid2str(attr->fa_dst), rule->dr_drop_at);
+		}
+	}
+
+drop_matched:
+
+	if (drop) { /* drop this message, update counters */
+		if (hstatus)
+			lnet_fault_match_health(hstatus,
+				attr->u.drop.da_health_error_mask);
+		lnet_fault_stat_inc(&rule->dr_stat, type);
+		rule->dr_stat.u.drop.ds_dropped++;
+	}
+
+	spin_unlock(&rule->dr_lock);
+	return drop;
+}
+
+/**
+ * Check if a message from \a src to \a dst matches any existing drop rule
+ */
+bool
+lnet_drop_rule_match(struct lnet_hdr *hdr,
+		     lnet_nid_t local_nid,
+		     enum lnet_msg_hstatus *hstatus)
+{
+	lnet_nid_t src = lnet_nid_to_nid4(&hdr->src_nid);
+	lnet_nid_t dst = lnet_nid_to_nid4(&hdr->dest_nid);
+	unsigned int typ = hdr->type;
+	struct lnet_drop_rule *rule;
+	unsigned int ptl = -1;
+	bool drop = false;
+	int 
cpt;
+
+	/* NB: if Portal is specified, then only PUT and GET will be
+	 * filtered by drop rule */
+	if (typ == LNET_MSG_PUT)
+		ptl = le32_to_cpu(hdr->msg.put.ptl_index);
+	else if (typ == LNET_MSG_GET)
+		ptl = le32_to_cpu(hdr->msg.get.ptl_index);
+
+	cpt = lnet_net_lock_current();
+	list_for_each_entry(rule, &the_lnet.ln_drop_rules, dr_link) {
+		drop = drop_rule_match(rule, src, local_nid, dst, typ, ptl,
+				       hstatus);
+		if (drop)
+			break;
+	}
+	lnet_net_unlock(cpt);
+
+	return drop;
+}
+
+/**
+ * LNet Delay Simulation
+ */
+/** timestamp (in seconds) at which to send the delayed message */
+#define msg_delay_send		msg_ev.hdr_data
+
+struct lnet_delay_rule {
+	/** link chain on the_lnet.ln_delay_rules */
+	struct list_head	dl_link;
+	/** link chain on delay_dd.dd_sched_rules */
+	struct list_head	dl_sched_link;
+	/** attributes of this rule */
+	struct lnet_fault_attr	dl_attr;
+	/** lock to protect the members below */
+	spinlock_t		dl_lock;
+	/** refcount of delay rule */
+	atomic_t		dl_refcount;
+	/**
+	 * the message sequence to delay, which means the message is delayed
+	 * when dl_stat.fs_count == dl_delay_at
+	 */
+	unsigned long		dl_delay_at;
+	/**
+	 * seconds to delay the next message, mutually exclusive with
+	 * dl_delay_at
+	 */
+	time64_t		dl_delay_time;
+	/** baseline to calculate dl_delay_time */
+	time64_t		dl_time_base;
+	/** seconds until we send the next delayed message */
+	time64_t		dl_msg_send;
+	/** delayed message list */
+	struct list_head	dl_msg_list;
+	/** statistic of delayed messages */
+	struct lnet_fault_stat	dl_stat;
+	/** timer to wakeup delay_daemon */
+	struct timer_list	dl_timer;
+};
+
+struct delay_daemon_data {
+	/** serialise rule add/remove */
+	struct mutex		dd_mutex;
+	/** protect rules on \a dd_sched_rules */
+	spinlock_t		dd_lock;
+	/** scheduled delay rules (by timer) */
+	struct list_head	dd_sched_rules;
+	/** daemon thread sleeps here */
+	wait_queue_head_t	dd_waitq;
+	/** controller (lctl command) waits here */
+	wait_queue_head_t	dd_ctl_waitq;
+	/** daemon is running */
+	unsigned int		dd_running;
+	/** daemon stopped */
+	unsigned int		dd_stopped;
+};
+
+static struct delay_daemon_data	delay_dd;
+
+static void
+delay_rule_decref(struct lnet_delay_rule *rule)
+{
+	if (atomic_dec_and_test(&rule->dl_refcount)) {
+		LASSERT(list_empty(&rule->dl_sched_link));
+		LASSERT(list_empty(&rule->dl_msg_list));
+		LASSERT(list_empty(&rule->dl_link));
+
+		CFS_FREE_PTR(rule);
+	}
+}
+
+/**
+ * check source/destination NID, portal, message type and delay rate,
+ * decide whether this message should be delayed or not
+ */
+static bool
+delay_rule_match(struct lnet_delay_rule *rule, lnet_nid_t src,
+		 lnet_nid_t dst, unsigned int type, unsigned int portal,
+		 struct lnet_msg *msg)
+{
+	struct lnet_fault_attr *attr = &rule->dl_attr;
+	bool delay;
+	time64_t now = ktime_get_seconds();
+
+	if (!lnet_fault_attr_match(attr, src, LNET_NID_ANY,
+				   dst, type, portal))
+		return false;
+
+	/* match this rule, check delay rate now */
+	spin_lock(&rule->dl_lock);
+	if (rule->dl_delay_time != 0) { /* time based delay */
+		rule->dl_stat.fs_count++;
+		delay = now >= rule->dl_delay_time;
+		if (delay) {
+			if (now > rule->dl_time_base)
+				rule->dl_time_base = now;
+
+			rule->dl_delay_time = rule->dl_time_base +
+					      get_random_u32_below(attr->u.delay.la_interval);
+			rule->dl_time_base += attr->u.delay.la_interval;
+
+			CDEBUG(D_NET, "Delay Rule %s->%s: next delay : %lld\n",
+			       libcfs_nid2str(attr->fa_src),
+			       libcfs_nid2str(attr->fa_dst),
+			       rule->dl_delay_time);
+		}
+
+	} else { /* rate based delay */
+		__u64 count;
+
+		delay = 
rule->dl_stat.fs_count++ == rule->dl_delay_at;
+		/* generate the next random rate sequence */
+		count = rule->dl_stat.fs_count;
+		if (do_div(count, attr->u.delay.la_rate) == 0) {
+			rule->dl_delay_at = rule->dl_stat.fs_count +
+					    get_random_u32_below(attr->u.delay.la_rate);
+			CDEBUG(D_NET, "Delay Rule %s->%s: next delay: %lu\n",
+			       libcfs_nid2str(attr->fa_src),
+			       libcfs_nid2str(attr->fa_dst), rule->dl_delay_at);
+		}
+	}
+
+	if (!delay) {
+		spin_unlock(&rule->dl_lock);
+		return false;
+	}
+
+	/* delay this message, update counters */
+	lnet_fault_stat_inc(&rule->dl_stat, type);
+	rule->dl_stat.u.delay.ls_delayed++;
+
+	list_add_tail(&msg->msg_list, &rule->dl_msg_list);
+	msg->msg_delay_send = now + attr->u.delay.la_latency;
+	if (rule->dl_msg_send == -1) {
+		rule->dl_msg_send = msg->msg_delay_send;
+		mod_timer(&rule->dl_timer,
+			  jiffies + cfs_time_seconds(attr->u.delay.la_latency));
+	}
+
+	spin_unlock(&rule->dl_lock);
+	return true;
+}
+
+/**
+ * check if \a msg matches any delay rule; reception of this message
+ * will be delayed if there is a match.
+ */
+bool
+lnet_delay_rule_match_locked(struct lnet_hdr *hdr, struct lnet_msg *msg)
+{
+	struct lnet_delay_rule *rule;
+	lnet_nid_t src = lnet_nid_to_nid4(&hdr->src_nid);
+	lnet_nid_t dst = lnet_nid_to_nid4(&hdr->dest_nid);
+	unsigned int typ = hdr->type;
+	unsigned int ptl = -1;
+
+	/* NB: called with hold of lnet_net_lock */
+
+	/* NB: if Portal is specified, then only PUT and GET will be
+	 * filtered by delay rule */
+	if (typ == LNET_MSG_PUT)
+		ptl = le32_to_cpu(hdr->msg.put.ptl_index);
+	else if (typ == LNET_MSG_GET)
+		ptl = le32_to_cpu(hdr->msg.get.ptl_index);
+
+	list_for_each_entry(rule, &the_lnet.ln_delay_rules, dl_link) {
+		if (delay_rule_match(rule, src, dst, typ, ptl, msg))
+			return true;
+	}
+
+	return false;
+}
+
+/** check for delayed messages that are ready to send */
+static void
+delayed_msg_check(struct lnet_delay_rule *rule, bool all,
+		  struct list_head *msg_list)
+{
+	struct lnet_msg *msg;
+	struct lnet_msg *tmp;
+	time64_t now = ktime_get_seconds();
+
+	if (!all && rule->dl_msg_send > now)
+		return;
+
+	spin_lock(&rule->dl_lock);
+	list_for_each_entry_safe(msg, tmp, &rule->dl_msg_list, msg_list) {
+		if (!all && msg->msg_delay_send > now)
+			break;
+
+		msg->msg_delay_send = 0;
+		list_move_tail(&msg->msg_list, msg_list);
+	}
+
+	if (list_empty(&rule->dl_msg_list)) {
+		del_timer(&rule->dl_timer);
+		rule->dl_msg_send = -1;
+
+	} else if (!list_empty(msg_list)) {
+		/* dequeued some timed-out messages; update the timer for the
+		 * next delayed message on the rule */
+		msg = list_entry(rule->dl_msg_list.next,
+				 struct lnet_msg, msg_list);
+		rule->dl_msg_send = msg->msg_delay_send;
+		mod_timer(&rule->dl_timer,
+			  jiffies +
+			  cfs_time_seconds(msg->msg_delay_send - now));
+	}
+	spin_unlock(&rule->dl_lock);
+}
+
+static void
+delayed_msg_process(struct list_head *msg_list, bool drop)
+{
+	struct lnet_msg *msg;
+
+	while (!list_empty(msg_list)) {
+		struct lnet_ni *ni;
+		int cpt;
+		int rc;
+
+		msg = list_entry(msg_list->next, struct lnet_msg, msg_list);
+
+		if (msg->msg_sending) {
+			/* Delayed send */
+			list_del_init(&msg->msg_list);
+			ni = msg->msg_txni;
+			CDEBUG(D_NET, "TRACE: msg %p %s -> %s : %s\n", msg,
+			       libcfs_nidstr(&ni->ni_nid),
+			       libcfs_nidstr(&msg->msg_txpeer->lpni_nid),
+			       lnet_msgtyp2str(msg->msg_type));
+			lnet_ni_send(ni, msg);
+			continue;
+		}
+
+		/* Delayed receive */
+		LASSERT(msg->msg_rxpeer != NULL);
+		LASSERT(msg->msg_rxni != NULL);
+
+		ni = msg->msg_rxni;
+		cpt = msg->msg_rx_cpt;
+
+		list_del_init(&msg->msg_list);
+		if (drop) {
+			rc = 
-ECANCELED;
+
+		} else if (!msg->msg_routing) {
+			rc = lnet_parse_local(ni, msg);
+			if (rc == 0)
+				continue;
+
+		} else {
+			lnet_net_lock(cpt);
+			rc = lnet_parse_forward_locked(ni, msg);
+			lnet_net_unlock(cpt);
+
+			switch (rc) {
+			case LNET_CREDIT_OK:
+				lnet_ni_recv(ni, msg->msg_private, msg, 0,
+					     0, msg->msg_len, msg->msg_len);
+				fallthrough;
+			case LNET_CREDIT_WAIT:
+				continue;
+			default: /* failures */
+				break;
+			}
+		}
+
+		lnet_drop_message(ni, cpt, msg->msg_private, msg->msg_len,
+				  msg->msg_type);
+		lnet_finalize(msg, rc);
+	}
+}
+
+/**
+ * Process delayed messages for scheduled rules
+ * This function can be called either by delay_rule_daemon or by
+ * lnet_finalize()
+ */
+void
+lnet_delay_rule_check(void)
+{
+	struct lnet_delay_rule *rule;
+	LIST_HEAD(msgs);
+
+	while (1) {
+		if (list_empty(&delay_dd.dd_sched_rules))
+			break;
+
+		spin_lock_bh(&delay_dd.dd_lock);
+		if (list_empty(&delay_dd.dd_sched_rules)) {
+			spin_unlock_bh(&delay_dd.dd_lock);
+			break;
+		}
+
+		rule = list_entry(delay_dd.dd_sched_rules.next,
+				  struct lnet_delay_rule, dl_sched_link);
+		list_del_init(&rule->dl_sched_link);
+		spin_unlock_bh(&delay_dd.dd_lock);
+
+		delayed_msg_check(rule, false, &msgs);
+		delay_rule_decref(rule); /* -1 for delay_dd.dd_sched_rules */
+	}
+
+	if (!list_empty(&msgs))
+		delayed_msg_process(&msgs, false);
+}
+
+/** daemon thread to handle delayed messages */
+static int
+lnet_delay_rule_daemon(void *arg)
+{
+	delay_dd.dd_running = 1;
+	wake_up(&delay_dd.dd_ctl_waitq);
+
+	while (delay_dd.dd_running) {
+		wait_event_interruptible(delay_dd.dd_waitq,
+					 !delay_dd.dd_running ||
+					 !list_empty(&delay_dd.dd_sched_rules));
+		lnet_delay_rule_check();
+	}
+
+	/* in case more rules have been enqueued after my last check */
+	lnet_delay_rule_check();
+	delay_dd.dd_stopped = 1;
+	wake_up(&delay_dd.dd_ctl_waitq);
+
+	return 0;
+}
+
+static void
+delay_timer_cb(cfs_timer_cb_arg_t data)
+{
+	struct lnet_delay_rule *rule = cfs_from_timer(rule, data, dl_timer);
+
+	spin_lock_bh(&delay_dd.dd_lock);
+	if (list_empty(&rule->dl_sched_link) && delay_dd.dd_running) {
+		atomic_inc(&rule->dl_refcount);
+		list_add_tail(&rule->dl_sched_link, &delay_dd.dd_sched_rules);
+		wake_up(&delay_dd.dd_waitq);
+	}
+	spin_unlock_bh(&delay_dd.dd_lock);
+}
+
+/**
+ * Add a new delay rule to LNet
+ * There is no check for duplicate rules; every rule is checked against each
+ * incoming message.
+ */
+int
+lnet_delay_rule_add(struct lnet_fault_attr *attr)
+{
+	struct lnet_delay_rule *rule;
+	int rc = 0;
+	ENTRY;
+
+	if (!((attr->u.delay.la_rate == 0) ^
+	      (attr->u.delay.la_interval == 0))) {
+		CDEBUG(D_NET,
+		       "please provide either delay rate or delay interval, "
+		       "but not both at the same time %d/%d\n",
+		       attr->u.delay.la_rate, attr->u.delay.la_interval);
+		RETURN(-EINVAL);
+	}
+
+	if (attr->u.delay.la_latency == 0) {
+		CDEBUG(D_NET, "delay latency cannot be zero\n");
+		RETURN(-EINVAL);
+	}
+
+	if (lnet_fault_attr_validate(attr) != 0)
+		RETURN(-EINVAL);
+
+	CFS_ALLOC_PTR(rule);
+	if (rule == NULL)
+		RETURN(-ENOMEM);
+
+	mutex_lock(&delay_dd.dd_mutex);
+	if (!delay_dd.dd_running) {
+		struct task_struct *task;
+
+		/* NB: LND threads will process delayed messages in
+		 * lnet_finalize(), but there is no guarantee that they will
+		 * be woken up if no other message needs to be handled.
+		 * Only one daemon thread is used; performance is not a
+		 * concern for this simulation module.
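+		 * The daemon is stopped again from lnet_delay_rule_del()
+		 * once the last delay rule has been removed.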
+		 */
+		task = kthread_run(lnet_delay_rule_daemon, NULL, "lnet_dd");
+		if (IS_ERR(task)) {
+			rc = PTR_ERR(task);
+			GOTO(failed, rc);
+		}
+		wait_event(delay_dd.dd_ctl_waitq, delay_dd.dd_running);
+	}
+
+	cfs_timer_setup(&rule->dl_timer, delay_timer_cb,
+			(unsigned long)rule, 0);
+
+	spin_lock_init(&rule->dl_lock);
+	INIT_LIST_HEAD(&rule->dl_msg_list);
+	INIT_LIST_HEAD(&rule->dl_sched_link);
+
+	rule->dl_attr = *attr;
+	if (attr->u.delay.la_interval != 0) {
+		rule->dl_time_base = ktime_get_seconds() +
+				     attr->u.delay.la_interval;
+		rule->dl_delay_time = ktime_get_seconds() +
+				      get_random_u32_below(attr->u.delay.la_interval);
+	} else {
+		rule->dl_delay_at = get_random_u32_below(attr->u.delay.la_rate);
+	}
+
+	rule->dl_msg_send = -1;
+
+	lnet_net_lock(LNET_LOCK_EX);
+	atomic_set(&rule->dl_refcount, 1);
+	list_add(&rule->dl_link, &the_lnet.ln_delay_rules);
+	lnet_net_unlock(LNET_LOCK_EX);
+
+	CDEBUG(D_NET, "Added delay rule: src %s, dst %s, rate %d\n",
+	       libcfs_nid2str(attr->fa_src), libcfs_nid2str(attr->fa_dst),
+	       attr->u.delay.la_rate);
+
+	mutex_unlock(&delay_dd.dd_mutex);
+	RETURN(0);
+failed:
+	mutex_unlock(&delay_dd.dd_mutex);
+	CFS_FREE_PTR(rule);
+	return rc;
+}
+
+/**
+ * Remove matched delay rules from LNet; if \a shutdown is true or both \a src
+ * and \a dst are zero, all rules will be removed, otherwise only matched
+ * rules will be removed.
+ * If \a src is zero, then all rules that have \a dst as destination will be
+ * removed.
+ * If \a dst is zero, then all rules that have \a src as source will be
+ * removed.
+ *
+ * When a delay rule is removed, all delayed messages of this rule will be
+ * processed immediately.
+ */
+int
+lnet_delay_rule_del(lnet_nid_t src, lnet_nid_t dst, bool shutdown)
+{
+	struct lnet_delay_rule *rule;
+	struct lnet_delay_rule *tmp;
+	LIST_HEAD(rule_list);
+	LIST_HEAD(msg_list);
+	int n = 0;
+	bool cleanup;
+	ENTRY;
+
+	if (shutdown)
+		src = dst = 0;
+
+	mutex_lock(&delay_dd.dd_mutex);
+	lnet_net_lock(LNET_LOCK_EX);
+
+	list_for_each_entry_safe(rule, tmp, &the_lnet.ln_delay_rules, dl_link) {
+		if (rule->dl_attr.fa_src != src && src != 0)
+			continue;
+
+		if (rule->dl_attr.fa_dst != dst && dst != 0)
+			continue;
+
+		CDEBUG(D_NET, "Remove delay rule: src %s->dst: %s (1/%d, %d)\n",
+		       libcfs_nid2str(rule->dl_attr.fa_src),
+		       libcfs_nid2str(rule->dl_attr.fa_dst),
+		       rule->dl_attr.u.delay.la_rate,
+		       rule->dl_attr.u.delay.la_interval);
+		/* refcount is taken over by rule_list */
+		list_move(&rule->dl_link, &rule_list);
+	}
+
+	/* check if we need to shutdown delay_daemon */
+	cleanup = list_empty(&the_lnet.ln_delay_rules) &&
+		  !list_empty(&rule_list);
+	lnet_net_unlock(LNET_LOCK_EX);
+
+	list_for_each_entry_safe(rule, tmp, &rule_list, dl_link) {
+		list_del_init(&rule->dl_link);
+
+		del_timer_sync(&rule->dl_timer);
+		delayed_msg_check(rule, true, &msg_list);
+		delay_rule_decref(rule); /* -1 for the_lnet.ln_delay_rules */
+		n++;
+	}
+
+	if (cleanup) { /* no more delay rules, shutdown delay_daemon */
+		LASSERT(delay_dd.dd_running);
+		delay_dd.dd_running = 0;
+		wake_up(&delay_dd.dd_waitq);
+
+		while (!delay_dd.dd_stopped)
+			wait_event(delay_dd.dd_ctl_waitq, delay_dd.dd_stopped);
+	}
+	mutex_unlock(&delay_dd.dd_mutex);
+
+	if (!list_empty(&msg_list))
+		delayed_msg_process(&msg_list, shutdown);
+
+	RETURN(n);
+}
+
+/**
+ * List the delay rule at position \a pos
+ */
+int
+lnet_delay_rule_list(int pos, struct lnet_fault_attr *attr,
+		     struct lnet_fault_stat *stat)
+{
+	struct lnet_delay_rule *rule;
+	int cpt;
+	int i = 0;
+	int rc = -ENOENT;
+	ENTRY;
+
+	cpt = lnet_net_lock_current();
+	
list_for_each_entry(rule, &the_lnet.ln_delay_rules, dl_link) { + if (i++ < pos) + continue; + + spin_lock(&rule->dl_lock); + *attr = rule->dl_attr; + *stat = rule->dl_stat; + spin_unlock(&rule->dl_lock); + rc = 0; + break; + } + + lnet_net_unlock(cpt); + RETURN(rc); +} + +/** + * reset counters for all Delay Rules + */ +void +lnet_delay_rule_reset(void) +{ + struct lnet_delay_rule *rule; + int cpt; + ENTRY; + + cpt = lnet_net_lock_current(); + + list_for_each_entry(rule, &the_lnet.ln_delay_rules, dl_link) { + struct lnet_fault_attr *attr = &rule->dl_attr; + + spin_lock(&rule->dl_lock); + + memset(&rule->dl_stat, 0, sizeof(rule->dl_stat)); + if (attr->u.delay.la_rate != 0) { + rule->dl_delay_at = get_random_u32_below(attr->u.delay.la_rate); + } else { + rule->dl_delay_time = ktime_get_seconds() + + get_random_u32_below(attr->u.delay.la_interval); + rule->dl_time_base = ktime_get_seconds() + + attr->u.delay.la_interval; + } + spin_unlock(&rule->dl_lock); + } + + lnet_net_unlock(cpt); + EXIT; +} + +int +lnet_fault_ctl(int opc, struct libcfs_ioctl_data *data) +{ + struct lnet_fault_attr *attr; + struct lnet_fault_stat *stat; + + attr = (struct lnet_fault_attr *)data->ioc_inlbuf1; + + switch (opc) { + default: + return -EINVAL; + + case LNET_CTL_DROP_ADD: + if (attr == NULL) + return -EINVAL; + + return lnet_drop_rule_add(attr); + + case LNET_CTL_DROP_DEL: + if (attr == NULL) + return -EINVAL; + + data->ioc_count = lnet_drop_rule_del(attr->fa_src, + attr->fa_dst); + return 0; + + case LNET_CTL_DROP_RESET: + lnet_drop_rule_reset(); + return 0; + + case LNET_CTL_DROP_LIST: + stat = (struct lnet_fault_stat *)data->ioc_inlbuf2; + if (attr == NULL || stat == NULL) + return -EINVAL; + + return lnet_drop_rule_list(data->ioc_count, attr, stat); + + case LNET_CTL_DELAY_ADD: + if (attr == NULL) + return -EINVAL; + + return lnet_delay_rule_add(attr); + + case LNET_CTL_DELAY_DEL: + if (attr == NULL) + return -EINVAL; + + data->ioc_count = lnet_delay_rule_del(attr->fa_src, + attr->fa_dst, false); + return 0; + + case LNET_CTL_DELAY_RESET: + lnet_delay_rule_reset(); + return 0; + + case LNET_CTL_DELAY_LIST: + stat = (struct lnet_fault_stat *)data->ioc_inlbuf2; + if (attr == NULL || stat == NULL) + return -EINVAL; + + return lnet_delay_rule_list(data->ioc_count, attr, stat); + } +} + +int +lnet_fault_init(void) +{ + BUILD_BUG_ON(LNET_PUT_BIT != BIT(LNET_MSG_PUT)); + BUILD_BUG_ON(LNET_ACK_BIT != BIT(LNET_MSG_ACK)); + BUILD_BUG_ON(LNET_GET_BIT != BIT(LNET_MSG_GET)); + BUILD_BUG_ON(LNET_REPLY_BIT != BIT(LNET_MSG_REPLY)); + + mutex_init(&delay_dd.dd_mutex); + spin_lock_init(&delay_dd.dd_lock); + init_waitqueue_head(&delay_dd.dd_waitq); + init_waitqueue_head(&delay_dd.dd_ctl_waitq); + INIT_LIST_HEAD(&delay_dd.dd_sched_rules); + + return 0; +} + +void +lnet_fault_fini(void) +{ + lnet_drop_rule_del(0, 0); + lnet_delay_rule_del(0, 0, true); + + LASSERT(list_empty(&the_lnet.ln_drop_rules)); + LASSERT(list_empty(&the_lnet.ln_delay_rules)); + LASSERT(list_empty(&delay_dd.dd_sched_rules)); +} diff --git a/drivers/staging/lustrefsx/lnet/lnet/nidstrings.c b/drivers/staging/lustrefsx/lnet/lnet/nidstrings.c new file mode 100644 index 0000000000000..16e16f6360adb --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/nidstrings.c @@ -0,0 +1,1190 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2015, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lnet/lnet/nidstrings.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/sunrpc/addr.h>
+#include <libcfs/libcfs.h>
+#include <uapi/linux/lnet/nidstr.h>
+#include <lnet/lib-types.h>
+
+/* max value for numeric network address */
+#define MAX_NUMERIC_VALUE 0xffffffff
+
+#define IPSTRING_LENGTH 16
+
+/* CAVEAT VENDITOR! Keep the canonical string representation of nets/nids
+ * consistent in all conversion functions. Some code fragments are copied
+ * around for the sake of clarity...
+ */
+
+/* CAVEAT EMPTOR! Racey temporary buffer allocation!
+ * Choose the number of nidstrings to support the MAXIMUM expected number of
+ * concurrent users. If there are more, the returned string will be volatile.
+ * NB this number must allow for a process to be descheduled for a timeslice
+ * between getting its string and using it.
+ */
+
+static char	libcfs_nidstrings[LNET_NIDSTR_COUNT][LNET_NIDSTR_SIZE];
+static int	libcfs_nidstring_idx;
+
+static DEFINE_SPINLOCK(libcfs_nidstring_lock);
+
+static struct netstrfns *libcfs_namenum2netstrfns(const char *name);
+
+char *
+libcfs_next_nidstring(void)
+{
+	char *str;
+	unsigned long flags;
+
+	spin_lock_irqsave(&libcfs_nidstring_lock, flags);
+
+	str = libcfs_nidstrings[libcfs_nidstring_idx++];
+	if (libcfs_nidstring_idx == ARRAY_SIZE(libcfs_nidstrings))
+		libcfs_nidstring_idx = 0;
+
+	spin_unlock_irqrestore(&libcfs_nidstring_lock, flags);
+	return str;
+}
+EXPORT_SYMBOL(libcfs_next_nidstring);
+
+/**
+ * Nid range list syntax.
+ * \verbatim
+ *
+ * <nidlist>         :== <nidrange> [ ' ' <nidrange> ]
+ * <nidrange>        :== <addrrange> '@' <net>
+ * <addrrange>       :== '*' |
+ *                       <ipaddr_range> |
+ *                       <cfs_expr_list>
+ * <ipaddr_range>    :== <cfs_expr_list>.<cfs_expr_list>.<cfs_expr_list>.
+ *                       <cfs_expr_list>
+ * <cfs_expr_list>   :== <number> |
+ *                       <expr_list>
+ * <expr_list>       :== '[' <range_expr> [ ',' <range_expr>] ']'
+ * <range_expr>      :== <number> |
+ *                       <number> '-' <number> |
+ *                       <number> '-' <number> '/' <number>
+ * <net>             :== <netname> | <netname><number>
+ * <netname>         :== "lo" | "tcp" | "o2ib" | "cib" | "openib" | "iib" |
+ *                       "vib" | "ra" | "elan" | "mx" | "ptl"
+ * \endverbatim
+ */
+
+/**
+ * Structure to represent \<nidrange> token of the syntax.
+ *
+ * One of these is created for each \<net> parsed.
+ */
+struct nidrange {
+	/**
+	 * Link to the list of these structures, which is built during
+	 * nid range list parsing.
+	 */
+	struct list_head nr_link;
+	/**
+	 * List head for addrrange::ar_link.
+	 */
+	struct list_head nr_addrranges;
+	/**
+	 * Flag indicating that *@<net> is found.
+	 */
+	int nr_all;
+	/**
+	 * Pointer to corresponding element of libcfs_netstrfns.
+	 */
+	struct netstrfns *nr_netstrfns;
+	/**
+	 * Number of network. E.g. 5 if \<net> is "elan5".
+	 */
+	int nr_netnum;
+};
+
+/**
+ * Structure to represent \<addrrange> token of the syntax.
+ */
+struct addrrange {
+	/**
+	 * Link to nidrange::nr_addrranges.
+	 */
+	struct list_head ar_link;
+	/**
+	 * List head for cfs_expr_list::el_list.
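+	 * For IP networks this holds four expression lists, one per address
+	 * octet; numeric networks use a single list.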
+	 */
+	struct list_head ar_numaddr_ranges;
+};
+
+/**
+ * Parses \<addrrange> token of the syntax.
+ *
+ * Allocates struct addrrange and links to \a nidrange via
+ * (nidrange::nr_addrranges)
+ *
+ * \retval 0 if \a src parses to '*' | \<ipaddr_range> | \<cfs_expr_list>
+ * \retval -errno otherwise
+ */
+static int
+parse_addrange(const struct cfs_lstr *src, struct nidrange *nidrange)
+{
+	struct addrrange *addrrange;
+
+	if (src->ls_len == 1 && src->ls_str[0] == '*') {
+		nidrange->nr_all = 1;
+		return 0;
+	}
+
+	CFS_ALLOC_PTR(addrrange);
+	if (addrrange == NULL)
+		return -ENOMEM;
+	list_add_tail(&addrrange->ar_link, &nidrange->nr_addrranges);
+	INIT_LIST_HEAD(&addrrange->ar_numaddr_ranges);
+
+	return nidrange->nr_netstrfns->nf_parse_addrlist(src->ls_str,
+						src->ls_len,
+						&addrrange->ar_numaddr_ranges);
+}
+
+/**
+ * Finds or creates struct nidrange.
+ *
+ * Checks if \a src is a valid network name, looks for corresponding
+ * nidrange on the list of nidranges (\a nidlist), creates new struct
+ * nidrange if it is not found.
+ *
+ * \retval pointer to struct nidrange matching network specified via \a src
+ * \retval NULL if \a src does not match any network
+ */
+static struct nidrange *
+add_nidrange(const struct cfs_lstr *src,
+	     struct list_head *nidlist)
+{
+	struct netstrfns *nf;
+	struct nidrange *nr;
+	int endlen;
+	unsigned netnum;
+
+	if (src->ls_len >= LNET_NIDSTR_SIZE)
+		return NULL;
+
+	nf = libcfs_namenum2netstrfns(src->ls_str);
+	if (nf == NULL)
+		return NULL;
+	endlen = src->ls_len - strlen(nf->nf_name);
+	if (endlen == 0)
+		/* network name only, e.g. "elan" or "tcp" */
+		netnum = 0;
+	else {
+		/* e.g. "elan25" or "tcp23", refuse to parse if
+		 * network name is not appended with decimal or
+		 * hexadecimal number */
+		if (!cfs_str2num_check(src->ls_str + strlen(nf->nf_name),
+				       endlen, &netnum, 0, MAX_NUMERIC_VALUE))
+			return NULL;
+	}
+
+	list_for_each_entry(nr, nidlist, nr_link) {
+		if (nr->nr_netstrfns != nf)
+			continue;
+		if (nr->nr_netnum != netnum)
+			continue;
+		return nr;
+	}
+
+	CFS_ALLOC_PTR(nr);
+	if (nr == NULL)
+		return NULL;
+	list_add_tail(&nr->nr_link, nidlist);
+	INIT_LIST_HEAD(&nr->nr_addrranges);
+	nr->nr_netstrfns = nf;
+	nr->nr_all = 0;
+	nr->nr_netnum = netnum;
+
+	return nr;
+}
+
+/**
+ * Parses \<nidrange> token of the syntax.
+ *
+ * \retval 1 if \a src parses to \<addrrange> '@' \<net>
+ * \retval 0 otherwise
+ */
+static int
+parse_nidrange(struct cfs_lstr *src, struct list_head *nidlist)
+{
+	struct cfs_lstr addrrange;
+	struct cfs_lstr net;
+	struct nidrange *nr;
+
+	if (cfs_gettok(src, '@', &addrrange) == 0)
+		goto failed;
+
+	if (cfs_gettok(src, '@', &net) == 0 || src->ls_str != NULL)
+		goto failed;
+
+	nr = add_nidrange(&net, nidlist);
+	if (nr == NULL)
+		goto failed;
+
+	if (parse_addrange(&addrrange, nr) != 0)
+		goto failed;
+
+	return 1;
+failed:
+	return 0;
+}
+
+/**
+ * Frees addrrange structures of \a list.
+ *
+ * For each struct addrrange structure found on \a list it frees
+ * cfs_expr_list list attached to it and frees the addrrange itself.
+ *
+ * \retval none
+ */
+static void
+free_addrranges(struct list_head *list)
+{
+	while (!list_empty(list)) {
+		struct addrrange *ar;
+
+		ar = list_entry(list->next, struct addrrange, ar_link);
+
+		cfs_expr_list_free_list(&ar->ar_numaddr_ranges);
+		list_del(&ar->ar_link);
+		CFS_FREE_PTR(ar);
+	}
+}
+
+/**
+ * Frees nidrange structures of \a list.
+ *
+ * For each struct nidrange structure found on \a list it frees
+ * addrrange list attached to it and frees the nidrange itself.
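+ * The list head itself is emptied but not freed.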
+ *
+ * \retval none
+ */
+void
+cfs_free_nidlist(struct list_head *list)
+{
+	struct list_head *pos, *next;
+	struct nidrange *nr;
+
+	list_for_each_safe(pos, next, list) {
+		nr = list_entry(pos, struct nidrange, nr_link);
+		free_addrranges(&nr->nr_addrranges);
+		list_del(pos);
+		CFS_FREE_PTR(nr);
+	}
+}
+EXPORT_SYMBOL(cfs_free_nidlist);
+
+/**
+ * Parses nid range list.
+ *
+ * Parses with rigorous syntax and overflow checking \a str into
+ * \<nidrange> [ ' ' \<nidrange> ], compiles \a str into a set of
+ * structures and links those structures to \a nidlist. The resulting
+ * list can be used to match a NID against the set of NIDs defined by \a
+ * str.
+ * \see cfs_match_nid
+ *
+ * \retval 1 on success
+ * \retval 0 otherwise
+ */
+int
+cfs_parse_nidlist(char *str, int len, struct list_head *nidlist)
+{
+	struct cfs_lstr src;
+	struct cfs_lstr res;
+	int rc;
+
+	src.ls_str = str;
+	src.ls_len = len;
+	INIT_LIST_HEAD(nidlist);
+	while (src.ls_str) {
+		rc = cfs_gettok(&src, ' ', &res);
+		if (rc == 0) {
+			cfs_free_nidlist(nidlist);
+			return 0;
+		}
+		rc = parse_nidrange(&res, nidlist);
+		if (rc == 0) {
+			cfs_free_nidlist(nidlist);
+			return 0;
+		}
+	}
+	return 1;
+}
+EXPORT_SYMBOL(cfs_parse_nidlist);
+
+/**
+ * Matches a nid (\a nid) against the compiled list of nidranges (\a nidlist).
+ *
+ * \see cfs_parse_nidlist()
+ *
+ * \retval 1 on match
+ * \retval 0 otherwise
+ */
+int cfs_match_nid(lnet_nid_t nid, struct list_head *nidlist)
+{
+	struct nidrange *nr;
+	struct addrrange *ar;
+
+	list_for_each_entry(nr, nidlist, nr_link) {
+		if (nr->nr_netstrfns->nf_type != LNET_NETTYP(LNET_NIDNET(nid)))
+			continue;
+		if (nr->nr_netnum != LNET_NETNUM(LNET_NIDNET(nid)))
+			continue;
+		if (nr->nr_all)
+			return 1;
+		list_for_each_entry(ar, &nr->nr_addrranges, ar_link)
+			if (nr->nr_netstrfns->nf_match_addr(LNET_NIDADDR(nid),
+							&ar->ar_numaddr_ranges))
+				return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(cfs_match_nid);
+
+/**
+ * Print the network part of the nidrange \a nr into the specified \a buffer.
+ *
+ * \retval number of characters written
+ */
+static int
+cfs_print_network(char *buffer, int count, struct nidrange *nr)
+{
+	struct netstrfns *nf = nr->nr_netstrfns;
+
+	if (nr->nr_netnum == 0)
+		return scnprintf(buffer, count, "@%s", nf->nf_name);
+	else
+		return scnprintf(buffer, count, "@%s%u",
+				 nf->nf_name, nr->nr_netnum);
+}
+
+/**
+ * Print a list of addrranges (\a addrranges) into the specified \a buffer.
+ * At most \a count characters can be printed into \a buffer.
+ *
+ * \retval number of characters written
+ */
+static int
+cfs_print_addrranges(char *buffer, int count, struct list_head *addrranges,
+		     struct nidrange *nr)
+{
+	int i = 0;
+	struct addrrange *ar;
+	struct netstrfns *nf = nr->nr_netstrfns;
+
+	list_for_each_entry(ar, addrranges, ar_link) {
+		if (i != 0)
+			i += scnprintf(buffer + i, count - i, " ");
+		i += nf->nf_print_addrlist(buffer + i, count - i,
+					   &ar->ar_numaddr_ranges);
+		i += cfs_print_network(buffer + i, count - i, nr);
+	}
+	return i;
+}
+
+/**
+ * Print a list of nidranges (\a nidlist) into the specified \a buffer.
+ * At most \a count characters can be printed into \a buffer.
+ * Nidranges are separated by a space character.
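+ * The output uses the canonical representation, so it is intended to be
+ * parseable by cfs_parse_nidlist().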
+ *
+ * \retval number of characters written
+ */
+int cfs_print_nidlist(char *buffer, int count, struct list_head *nidlist)
+{
+	int i = 0;
+	struct nidrange *nr;
+
+	if (count <= 0)
+		return 0;
+
+	list_for_each_entry(nr, nidlist, nr_link) {
+		if (i != 0)
+			i += scnprintf(buffer + i, count - i, " ");
+
+		if (nr->nr_all != 0) {
+			LASSERT(list_empty(&nr->nr_addrranges));
+			i += scnprintf(buffer + i, count - i, "*");
+			i += cfs_print_network(buffer + i, count - i, nr);
+		} else {
+			i += cfs_print_addrranges(buffer + i, count - i,
+						  &nr->nr_addrranges, nr);
+		}
+	}
+	return i;
+}
+EXPORT_SYMBOL(cfs_print_nidlist);
+
+static int
+libcfs_lo_str2addr(const char *str, int nob, __u32 *addr)
+{
+	*addr = 0;
+	return 1;
+}
+
+static void
+libcfs_ip_addr2str(__u32 addr, char *str, size_t size)
+{
+	snprintf(str, size, "%u.%u.%u.%u",
+		 (addr >> 24) & 0xff, (addr >> 16) & 0xff,
+		 (addr >> 8) & 0xff, addr & 0xff);
+}
+
+static void
+libcfs_ip_addr2str_size(const __be32 *addr, size_t asize,
+			char *str, size_t size)
+{
+	struct sockaddr_storage sa = {};
+
+	switch (asize) {
+	case 4:
+		sa.ss_family = AF_INET;
+		memcpy(&((struct sockaddr_in *)(&sa))->sin_addr.s_addr,
+		       addr, asize);
+		break;
+	case 16:
+		sa.ss_family = AF_INET6;
+		memcpy(&((struct sockaddr_in6 *)(&sa))->sin6_addr.s6_addr,
+		       addr, asize);
+		break;
+	default:
+		return;
+	}
+
+	rpc_ntop((struct sockaddr *)&sa, str, size);
+}
+
+/* CAVEAT EMPTOR XscanfX
+ * I use "%n" at the end of a sscanf format to detect trailing junk. However
+ * sscanf may return immediately if it sees the terminating '\0' in a string,
+ * so I initialise the %n variable to the expected length. If sscanf sets it,
+ * fine; if it doesn't, then the scan ended at the end of the string, which is
+ * fine too :) */
+static int
+libcfs_ip_str2addr(const char *str, int nob, __u32 *addr)
+{
+	unsigned int a;
+	unsigned int b;
+	unsigned int c;
+	unsigned int d;
+	int n = nob; /* XscanfX */
+
+	/* numeric IP? */
+	if (sscanf(str, "%u.%u.%u.%u%n", &a, &b, &c, &d, &n) >= 4 &&
+	    n == nob &&
+	    (a & ~0xff) == 0 && (b & ~0xff) == 0 &&
+	    (c & ~0xff) == 0 && (d & ~0xff) == 0) {
+		*addr = ((a << 24) | (b << 16) | (c << 8) | d);
+		return 1;
+	}
+	return 0;
+}
+
+static int
+libcfs_ip_str2addr_size(const char *str, int nob,
+			__be32 *addr, size_t *alen)
+{
+	struct sockaddr_storage sa;
+
+	/* Note: 'net' arg to rpc_pton is only needed for link-local
+	 * addresses. Such addresses would not work with LNet routing,
+	 * so we can assume they aren't used. So it doesn't matter
+	 * which net namespace is passed.
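+	 * init_net below is therefore just a placeholder.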
+	 */
+	if (rpc_pton(&init_net, str, nob,
+		     (struct sockaddr *)&sa, sizeof(sa)) == 0)
+		return 0;
+	if (sa.ss_family == AF_INET6) {
+		memcpy(addr,
+		       &((struct sockaddr_in6 *)(&sa))->sin6_addr.s6_addr,
+		       16);
+		*alen = 16;
+		return 1;
+	}
+	if (sa.ss_family == AF_INET) {
+		memcpy(addr,
+		       &((struct sockaddr_in *)(&sa))->sin_addr.s_addr,
+		       4);
+		*alen = 4;
+		return 1;
+	}
+	return 0;
+}
+
+
+/* Used by lnet/config.c so it can't be static */
+int
+cfs_ip_addr_parse(char *str, int len, struct list_head *list)
+{
+	struct cfs_expr_list *el;
+	struct cfs_lstr src;
+	int rc;
+	int i;
+
+	src.ls_str = str;
+	src.ls_len = len;
+	i = 0;
+
+	while (src.ls_str != NULL) {
+		struct cfs_lstr res;
+
+		if (!cfs_gettok(&src, '.', &res)) {
+			rc = -EINVAL;
+			goto out;
+		}
+
+		rc = cfs_expr_list_parse(res.ls_str, res.ls_len, 0, 255, &el);
+		if (rc != 0)
+			goto out;
+
+		list_add_tail(&el->el_link, list);
+		i++;
+	}
+
+	if (i == 4)
+		return 0;
+
+	rc = -EINVAL;
+out:
+	cfs_expr_list_free_list(list);
+
+	return rc;
+}
+
+static int
+libcfs_ip_addr_range_print(char *buffer, int count, struct list_head *list)
+{
+	int i = 0, j = 0;
+	struct cfs_expr_list *el;
+
+	list_for_each_entry(el, list, el_link) {
+		LASSERT(j++ < 4);
+		if (i != 0)
+			i += scnprintf(buffer + i, count - i, ".");
+		i += cfs_expr_list_print(buffer + i, count - i, el);
+	}
+	return i;
+}
+
+/**
+ * Matches address (\a addr) against address set encoded in \a list.
+ *
+ * \retval 1 if \a addr matches
+ * \retval 0 otherwise
+ */
+int
+cfs_ip_addr_match(__u32 addr, struct list_head *list)
+{
+	struct cfs_expr_list *el;
+	int i = 0;
+
+	list_for_each_entry_reverse(el, list, el_link) {
+		if (!cfs_expr_list_match(addr & 0xff, el))
+			return 0;
+		addr >>= 8;
+		i++;
+	}
+
+	return i == 4;
+}
+
+static void
+libcfs_decnum_addr2str(__u32 addr, char *str, size_t size)
+{
+	snprintf(str, size, "%u", addr);
+}
+
+static int
+libcfs_num_str2addr(const char *str, int nob, __u32 *addr)
+{
+	int n;
+
+	n = nob;
+	if (sscanf(str, "0x%x%n", addr, &n) >= 1 && n == nob)
+		return 1;
+
+	n = nob;
+	if (sscanf(str, "0X%x%n", addr, &n) >= 1 && n == nob)
+		return 1;
+
+	n = nob;
+	if (sscanf(str, "%u%n", addr, &n) >= 1 && n == nob)
+		return 1;
+
+	return 0;
+}
+
+/**
+ * Nf_parse_addrlist method for networks using numeric addresses.
+ *
+ * Examples of such networks are gm and elan.
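+ * The string is compiled by cfs_expr_list_parse() into a single expression
+ * list covering values 0..MAX_NUMERIC_VALUE.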
+ * + * \retval 0 if \a str parsed to numeric address + * \retval errno otherwise + */ +int +libcfs_num_parse(char *str, int len, struct list_head *list) +{ + struct cfs_expr_list *el; + int rc; + + rc = cfs_expr_list_parse(str, len, 0, MAX_NUMERIC_VALUE, &el); + if (rc == 0) + list_add_tail(&el->el_link, list); + + return rc; +} + +static int +libcfs_num_addr_range_print(char *buffer, int count, struct list_head *list) +{ + int i = 0, j = 0; + struct cfs_expr_list *el; + + list_for_each_entry(el, list, el_link) { + LASSERT(j++ < 1); + i += cfs_expr_list_print(buffer + i, count - i, el); + } + return i; +} + +/* + * Nf_match_addr method for networks using numeric addresses + * + * \retval 1 on match + * \retval 0 otherwise + */ +static int +libcfs_num_match(__u32 addr, struct list_head *numaddr) +{ + struct cfs_expr_list *el; + + LASSERT(!list_empty(numaddr)); + el = list_entry(numaddr->next, struct cfs_expr_list, el_link); + + return cfs_expr_list_match(addr, el); +} + +static struct netstrfns libcfs_netstrfns[] = { + { .nf_type = LOLND, + .nf_name = "lo", + .nf_modname = "klolnd", + .nf_addr2str = libcfs_decnum_addr2str, + .nf_str2addr = libcfs_lo_str2addr, + .nf_parse_addrlist = libcfs_num_parse, + .nf_print_addrlist = libcfs_num_addr_range_print, + .nf_match_addr = libcfs_num_match + }, + { .nf_type = SOCKLND, + .nf_name = "tcp", + .nf_modname = "ksocklnd", + .nf_addr2str = libcfs_ip_addr2str, + .nf_addr2str_size = libcfs_ip_addr2str_size, + .nf_str2addr = libcfs_ip_str2addr, + .nf_str2addr_size = libcfs_ip_str2addr_size, + .nf_parse_addrlist = cfs_ip_addr_parse, + .nf_print_addrlist = libcfs_ip_addr_range_print, + .nf_match_addr = cfs_ip_addr_match + }, + { .nf_type = O2IBLND, + .nf_name = "o2ib", + .nf_modname = "ko2iblnd", + .nf_addr2str = libcfs_ip_addr2str, + .nf_str2addr = libcfs_ip_str2addr, + .nf_parse_addrlist = cfs_ip_addr_parse, + .nf_print_addrlist = libcfs_ip_addr_range_print, + .nf_match_addr = cfs_ip_addr_match + }, + { .nf_type = GNILND, + .nf_name = "gni", + .nf_modname = "kgnilnd", + .nf_addr2str = libcfs_decnum_addr2str, + .nf_str2addr = libcfs_num_str2addr, + .nf_parse_addrlist = libcfs_num_parse, + .nf_print_addrlist = libcfs_num_addr_range_print, + .nf_match_addr = libcfs_num_match + }, + { .nf_type = GNIIPLND, + .nf_name = "gip", + .nf_modname = "kgnilnd", + .nf_addr2str = libcfs_ip_addr2str, + .nf_str2addr = libcfs_ip_str2addr, + .nf_parse_addrlist = cfs_ip_addr_parse, + .nf_print_addrlist = libcfs_ip_addr_range_print, + .nf_match_addr = cfs_ip_addr_match + }, + { .nf_type = PTL4LND, + .nf_name = "ptlf", + .nf_modname = "kptl4lnd", + .nf_addr2str = libcfs_decnum_addr2str, + .nf_str2addr = libcfs_num_str2addr, + .nf_parse_addrlist = libcfs_num_parse, + .nf_print_addrlist = libcfs_num_addr_range_print, + .nf_match_addr = libcfs_num_match + }, + { + .nf_type = KFILND, + .nf_name = "kfi", + .nf_modname = "kkfilnd", + .nf_addr2str = libcfs_decnum_addr2str, + .nf_str2addr = libcfs_num_str2addr, + .nf_parse_addrlist = libcfs_num_parse, + .nf_print_addrlist = libcfs_num_addr_range_print, + .nf_match_addr = libcfs_num_match + }, +}; + +static const size_t libcfs_nnetstrfns = ARRAY_SIZE(libcfs_netstrfns); + +static struct netstrfns * +type2net_info(__u32 net_type) +{ + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) { + if (libcfs_netstrfns[i].nf_type == net_type) + return &libcfs_netstrfns[i]; + } + + return NULL; +} + +int +cfs_match_net(__u32 net_id, __u32 net_type, struct list_head *net_num_list) +{ + __u32 net_num; + + if (!net_num_list) + return 0; + + if 
(net_type != LNET_NETTYP(net_id))
+		return 0;
+
+	net_num = LNET_NETNUM(net_id);
+
+	/* a net without a number matches an empty list; a net with a
+	 * number requires a non-empty list.
+	 */
+	if (!net_num && list_empty(net_num_list))
+		return 1;
+	else if (list_empty(net_num_list))
+		return 0;
+
+	if (!libcfs_num_match(net_num, net_num_list))
+		return 0;
+
+	return 1;
+}
+
+int
+cfs_match_nid_net(struct lnet_nid *nid, __u32 net_type,
+		  struct list_head *net_num_list,
+		  struct list_head *addr)
+{
+	__u32 address;
+	struct netstrfns *nf;
+
+	if (!addr || !net_num_list)
+		return 0;
+
+	nf = type2net_info(LNET_NETTYP(LNET_NID_NET(nid)));
+	if (!nf)
+		return 0;
+
+	/* FIXME handle long-addr nid */
+	address = LNET_NIDADDR(lnet_nid_to_nid4(nid));
+
+	/* if either the address or net number don't match then no match */
+	if (!nf->nf_match_addr(address, addr) ||
+	    !cfs_match_net(LNET_NID_NET(nid), net_type, net_num_list))
+		return 0;
+
+	return 1;
+}
+EXPORT_SYMBOL(cfs_match_nid_net);
+
+static struct netstrfns *
+libcfs_lnd2netstrfns(__u32 lnd)
+{
+	int i;
+
+	for (i = 0; i < libcfs_nnetstrfns; i++)
+		if (lnd == libcfs_netstrfns[i].nf_type)
+			return &libcfs_netstrfns[i];
+
+	return NULL;
+}
+
+static struct netstrfns *
+libcfs_namenum2netstrfns(const char *name)
+{
+	struct netstrfns *nf;
+	int i;
+
+	for (i = 0; i < libcfs_nnetstrfns; i++) {
+		nf = &libcfs_netstrfns[i];
+		if (!strncmp(name, nf->nf_name, strlen(nf->nf_name)))
+			return nf;
+	}
+	return NULL;
+}
+
+static struct netstrfns *
+libcfs_name2netstrfns(const char *name)
+{
+	int i;
+
+	for (i = 0; i < libcfs_nnetstrfns; i++)
+		if (!strcmp(libcfs_netstrfns[i].nf_name, name))
+			return &libcfs_netstrfns[i];
+
+	return NULL;
+}
+
+int
+libcfs_isknown_lnd(__u32 lnd)
+{
+	return libcfs_lnd2netstrfns(lnd) != NULL;
+}
+EXPORT_SYMBOL(libcfs_isknown_lnd);
+
+char *
+libcfs_lnd2modname(__u32 lnd)
+{
+	struct netstrfns *nf = libcfs_lnd2netstrfns(lnd);
+
+	return (nf == NULL) ? 
NULL : nf->nf_modname;
+}
+EXPORT_SYMBOL(libcfs_lnd2modname);
+
+int
+libcfs_str2lnd(const char *str)
+{
+	struct netstrfns *nf = libcfs_name2netstrfns(str);
+
+	if (nf != NULL)
+		return nf->nf_type;
+
+	return -ENXIO;
+}
+EXPORT_SYMBOL(libcfs_str2lnd);
+
+char *
+libcfs_lnd2str_r(__u32 lnd, char *buf, size_t buf_size)
+{
+	struct netstrfns *nf;
+
+	nf = libcfs_lnd2netstrfns(lnd);
+	if (nf == NULL)
+		snprintf(buf, buf_size, "?%u?", lnd);
+	else
+		snprintf(buf, buf_size, "%s", nf->nf_name);
+
+	return buf;
+}
+EXPORT_SYMBOL(libcfs_lnd2str_r);
+
+char *
+libcfs_net2str_r(__u32 net, char *buf, size_t buf_size)
+{
+	__u32 nnum = LNET_NETNUM(net);
+	__u32 lnd = LNET_NETTYP(net);
+	struct netstrfns *nf;
+
+	nf = libcfs_lnd2netstrfns(lnd);
+	if (nf == NULL)
+		snprintf(buf, buf_size, "<%u:%u>", lnd, nnum);
+	else if (nnum == 0)
+		snprintf(buf, buf_size, "%s", nf->nf_name);
+	else
+		snprintf(buf, buf_size, "%s%u", nf->nf_name, nnum);
+
+	return buf;
+}
+EXPORT_SYMBOL(libcfs_net2str_r);
+
+char *
+libcfs_nid2str_r(lnet_nid_t nid, char *buf, size_t buf_size)
+{
+	__u32 addr = LNET_NIDADDR(nid);
+	__u32 net = LNET_NIDNET(nid);
+	__u32 nnum = LNET_NETNUM(net);
+	__u32 lnd = LNET_NETTYP(net);
+	struct netstrfns *nf;
+
+	if (nid == LNET_NID_ANY) {
+		strncpy(buf, "<?>", buf_size);
+		buf[buf_size - 1] = '\0';
+		return buf;
+	}
+
+	nf = libcfs_lnd2netstrfns(lnd);
+	if (nf == NULL) {
+		snprintf(buf, buf_size, "%x@<%u:%u>", addr, lnd, nnum);
+	} else {
+		size_t addr_len;
+
+		nf->nf_addr2str(addr, buf, buf_size);
+		addr_len = strlen(buf);
+		if (nnum == 0)
+			snprintf(buf + addr_len, buf_size - addr_len, "@%s",
+				 nf->nf_name);
+		else
+			snprintf(buf + addr_len, buf_size - addr_len, "@%s%u",
+				 nf->nf_name, nnum);
+	}
+
+	return buf;
+}
+EXPORT_SYMBOL(libcfs_nid2str_r);
+
+char *
+libcfs_nidstr_r(const struct lnet_nid *nid, char *buf, size_t buf_size)
+{
+	__u32 nnum;
+	__u32 lnd;
+	struct netstrfns *nf;
+
+	if (LNET_NID_IS_ANY(nid)) {
+		strncpy(buf, "<?>", buf_size);
+		buf[buf_size - 1] = '\0';
+		return buf;
+	}
+
+	nnum = be16_to_cpu(nid->nid_num);
+	lnd = nid->nid_type;
+	nf = libcfs_lnd2netstrfns(lnd);
+	if (nf) {
+		size_t addr_len;
+
+		if (nf->nf_addr2str_size)
+			nf->nf_addr2str_size(nid->nid_addr, NID_ADDR_BYTES(nid),
+					     buf, buf_size);
+		else
+			nf->nf_addr2str(ntohl(nid->nid_addr[0]), buf, buf_size);
+		addr_len = strlen(buf);
+		if (nnum == 0)
+			snprintf(buf + addr_len, buf_size - addr_len, "@%s",
+				 nf->nf_name);
+		else
+			snprintf(buf + addr_len, buf_size - addr_len, "@%s%u",
+				 nf->nf_name, nnum);
+	} else {
+		int l = 0;
+		int words = DIV_ROUND_UP(NID_ADDR_BYTES(nid), 4);
+		int i;
+
+		for (i = 0; i < words && i < 4; i++)
+			l = snprintf(buf+l, buf_size-l, "%s%x",
+				     i ? 
":" : "", ntohl(nid->nid_addr[i])); + snprintf(buf+l, buf_size-l, "@<%u:%u>", lnd, nnum); + } + + return buf; +} +EXPORT_SYMBOL(libcfs_nidstr_r); + +static struct netstrfns * +libcfs_str2net_internal(const char *str, __u32 *net) +{ + struct netstrfns *nf = NULL; + int nob; + unsigned int netnum; + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) { + nf = &libcfs_netstrfns[i]; + if (!strncmp(str, nf->nf_name, strlen(nf->nf_name))) + break; + } + + if (i == libcfs_nnetstrfns) + return NULL; + + nob = strlen(nf->nf_name); + + if (strlen(str) == (unsigned int)nob) { + netnum = 0; + } else { + if (nf->nf_type == LOLND) /* net number not allowed */ + return NULL; + + str += nob; + i = strlen(str); + if (sscanf(str, "%u%n", &netnum, &i) < 1 || + i != (int)strlen(str)) + return NULL; + } + + *net = LNET_MKNET(nf->nf_type, netnum); + return nf; +} + +__u32 +libcfs_str2net(const char *str) +{ + __u32 net; + + if (libcfs_str2net_internal(str, &net) != NULL) + return net; + + return LNET_NET_ANY; +} +EXPORT_SYMBOL(libcfs_str2net); + +lnet_nid_t +libcfs_str2nid(const char *str) +{ + const char *sep = strchr(str, '@'); + struct netstrfns *nf; + __u32 net; + __u32 addr; + + if (sep != NULL) { + nf = libcfs_str2net_internal(sep + 1, &net); + if (nf == NULL) + return LNET_NID_ANY; + } else { + sep = str + strlen(str); + net = LNET_MKNET(SOCKLND, 0); + nf = libcfs_lnd2netstrfns(SOCKLND); + LASSERT(nf != NULL); + } + + if (!nf->nf_str2addr(str, (int)(sep - str), &addr)) + return LNET_NID_ANY; + + return LNET_MKNID(net, addr); +} +EXPORT_SYMBOL(libcfs_str2nid); + +int +libcfs_strnid(struct lnet_nid *nid, const char *str) +{ + const char *sep = strchr(str, '@'); + struct netstrfns *nf; + __u32 net; + + if (sep != NULL) { + nf = libcfs_str2net_internal(sep + 1, &net); + if (nf == NULL) + return -EINVAL; + } else { + sep = str + strlen(str); + net = LNET_MKNET(SOCKLND, 0); + nf = libcfs_lnd2netstrfns(SOCKLND); + LASSERT(nf != NULL); + } + + memset(nid, 0, sizeof(*nid)); + nid->nid_type = LNET_NETTYP(net); + nid->nid_num = htons(LNET_NETNUM(net)); + if (nf->nf_str2addr_size) { + size_t asize = 0; + + if (!nf->nf_str2addr_size(str, (int)(sep - str), + nid->nid_addr, &asize)) + return -EINVAL; + nid->nid_size = asize - 4; + } else { + __u32 addr; + + if (!nf->nf_str2addr(str, (int)(sep - str), &addr)) + return -EINVAL; + nid->nid_addr[0] = htonl(addr); + nid->nid_size = 0; + } + return 0; +} +EXPORT_SYMBOL(libcfs_strnid); + +char * +libcfs_id2str(struct lnet_process_id id) +{ + char *str = libcfs_next_nidstring(); + + if (id.pid == LNET_PID_ANY) { + snprintf(str, LNET_NIDSTR_SIZE, + "LNET_PID_ANY-%s", libcfs_nid2str(id.nid)); + return str; + } + + snprintf(str, LNET_NIDSTR_SIZE, "%s%u-%s", + ((id.pid & LNET_PID_USERFLAG) != 0) ? "U" : "", + (id.pid & ~LNET_PID_USERFLAG), libcfs_nid2str(id.nid)); + return str; +} +EXPORT_SYMBOL(libcfs_id2str); + +char * +libcfs_idstr(struct lnet_processid *id) +{ + char *str = libcfs_next_nidstring(); + + if (id->pid == LNET_PID_ANY) { + snprintf(str, LNET_NIDSTR_SIZE, + "LNET_PID_ANY-%s", libcfs_nidstr(&id->nid)); + return str; + } + + snprintf(str, LNET_NIDSTR_SIZE, "%s%u-%s", + ((id->pid & LNET_PID_USERFLAG) != 0) ? 
"U" : "", + (id->pid & ~LNET_PID_USERFLAG), libcfs_nidstr(&id->nid)); + return str; +} +EXPORT_SYMBOL(libcfs_idstr); + +int +libcfs_str2anynid(lnet_nid_t *nidp, const char *str) +{ + if (!strcmp(str, "*")) { + *nidp = LNET_NID_ANY; + return 1; + } + + *nidp = libcfs_str2nid(str); + return *nidp != LNET_NID_ANY; +} +EXPORT_SYMBOL(libcfs_str2anynid); diff --git a/drivers/staging/lustrefsx/lnet/lnet/peer.c b/drivers/staging/lustrefsx/lnet/lnet/peer.c new file mode 100644 index 0000000000000..7a438ea086c4e --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/peer.c @@ -0,0 +1,4314 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/lnet/peer.c + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#ifdef HAVE_SCHED_HEADERS +#include +#endif +#include + +#include +#include +#include + +/* Value indicating that recovery needs to re-check a peer immediately. 
*/ +#define LNET_REDISCOVER_PEER (1) + +static int lnet_peer_queue_for_discovery(struct lnet_peer *lp); + +static void +lnet_peer_remove_from_remote_list(struct lnet_peer_ni *lpni) +{ + if (!list_empty(&lpni->lpni_on_remote_peer_ni_list)) { + list_del_init(&lpni->lpni_on_remote_peer_ni_list); + lnet_peer_ni_decref_locked(lpni); + } +} + +void +lnet_peer_net_added(struct lnet_net *net) +{ + struct lnet_peer_ni *lpni, *tmp; + + list_for_each_entry_safe(lpni, tmp, &the_lnet.ln_remote_peer_ni_list, + lpni_on_remote_peer_ni_list) { + + if (LNET_NID_NET(&lpni->lpni_nid) == net->net_id) { + lpni->lpni_net = net; + + spin_lock(&lpni->lpni_lock); + lpni->lpni_txcredits = + lpni->lpni_net->net_tunables.lct_peer_tx_credits; + lpni->lpni_mintxcredits = lpni->lpni_txcredits; + lpni->lpni_rtrcredits = + lnet_peer_buffer_credits(lpni->lpni_net); + lpni->lpni_minrtrcredits = lpni->lpni_rtrcredits; + spin_unlock(&lpni->lpni_lock); + + lnet_peer_remove_from_remote_list(lpni); + } + } +} + +static void +lnet_peer_tables_destroy(void) +{ + struct lnet_peer_table *ptable; + struct list_head *hash; + int i; + int j; + + if (!the_lnet.ln_peer_tables) + return; + + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + hash = ptable->pt_hash; + if (!hash) /* not intialized */ + break; + + LASSERT(list_empty(&ptable->pt_zombie_list)); + + ptable->pt_hash = NULL; + for (j = 0; j < LNET_PEER_HASH_SIZE; j++) + LASSERT(list_empty(&hash[j])); + + CFS_FREE_PTR_ARRAY(hash, LNET_PEER_HASH_SIZE); + } + + cfs_percpt_free(the_lnet.ln_peer_tables); + the_lnet.ln_peer_tables = NULL; +} + +int +lnet_peer_tables_create(void) +{ + struct lnet_peer_table *ptable; + struct list_head *hash; + int i; + int j; + + the_lnet.ln_peer_tables = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*ptable)); + if (the_lnet.ln_peer_tables == NULL) { + CERROR("Failed to allocate cpu-partition peer tables\n"); + return -ENOMEM; + } + + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + LIBCFS_CPT_ALLOC(hash, lnet_cpt_table(), i, + LNET_PEER_HASH_SIZE * sizeof(*hash)); + if (hash == NULL) { + CERROR("Failed to create peer hash table\n"); + lnet_peer_tables_destroy(); + return -ENOMEM; + } + + spin_lock_init(&ptable->pt_zombie_lock); + INIT_LIST_HEAD(&ptable->pt_zombie_list); + + INIT_LIST_HEAD(&ptable->pt_peer_list); + + for (j = 0; j < LNET_PEER_HASH_SIZE; j++) + INIT_LIST_HEAD(&hash[j]); + ptable->pt_hash = hash; /* sign of initialization */ + } + + return 0; +} + +static struct lnet_peer_ni * +lnet_peer_ni_alloc(struct lnet_nid *nid) +{ + struct lnet_peer_ni *lpni; + struct lnet_net *net; + int cpt; + + cpt = lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER); + + LIBCFS_CPT_ALLOC(lpni, lnet_cpt_table(), cpt, sizeof(*lpni)); + if (!lpni) + return NULL; + + INIT_LIST_HEAD(&lpni->lpni_txq); + INIT_LIST_HEAD(&lpni->lpni_hashlist); + INIT_LIST_HEAD(&lpni->lpni_peer_nis); + INIT_LIST_HEAD(&lpni->lpni_recovery); + INIT_LIST_HEAD(&lpni->lpni_on_remote_peer_ni_list); + INIT_LIST_HEAD(&lpni->lpni_rtr_pref_nids); + LNetInvalidateMDHandle(&lpni->lpni_recovery_ping_mdh); + kref_init(&lpni->lpni_kref); + lpni->lpni_sel_priority = LNET_MAX_SELECTION_PRIORITY; + + spin_lock_init(&lpni->lpni_lock); + + if (lnet_peers_start_down()) + lpni->lpni_ns_status = LNET_NI_STATUS_DOWN; + else + lpni->lpni_ns_status = LNET_NI_STATUS_UP; + lpni->lpni_ping_feats = LNET_PING_FEAT_INVAL; + lpni->lpni_nid = *nid; + lpni->lpni_cpt = cpt; + atomic_set(&lpni->lpni_healthv, LNET_MAX_HEALTH_VALUE); + + net = lnet_get_net_locked(LNET_NID_NET(nid)); + lpni->lpni_net = net; + if (net) { 
+ lpni->lpni_txcredits = net->net_tunables.lct_peer_tx_credits; + lpni->lpni_mintxcredits = lpni->lpni_txcredits; + lpni->lpni_rtrcredits = lnet_peer_buffer_credits(net); + lpni->lpni_minrtrcredits = lpni->lpni_rtrcredits; + } else { + /* + * This peer_ni is not on a local network, so we + * cannot add the credits here. In case the net is + * added later, add the peer_ni to the remote peer ni + * list so it can be easily found and revisited. + */ + /* FIXME: per-net implementation instead? */ + lnet_peer_ni_addref_locked(lpni); + list_add_tail(&lpni->lpni_on_remote_peer_ni_list, + &the_lnet.ln_remote_peer_ni_list); + } + + CDEBUG(D_NET, "%p nid %s\n", lpni, libcfs_nidstr(&lpni->lpni_nid)); + + return lpni; +} + +static struct lnet_peer_net * +lnet_peer_net_alloc(__u32 net_id) +{ + struct lnet_peer_net *lpn; + + LIBCFS_CPT_ALLOC(lpn, lnet_cpt_table(), CFS_CPT_ANY, sizeof(*lpn)); + if (!lpn) + return NULL; + + INIT_LIST_HEAD(&lpn->lpn_peer_nets); + INIT_LIST_HEAD(&lpn->lpn_peer_nis); + lpn->lpn_net_id = net_id; + lpn->lpn_sel_priority = LNET_MAX_SELECTION_PRIORITY; + + CDEBUG(D_NET, "%p net %s\n", lpn, libcfs_net2str(lpn->lpn_net_id)); + + return lpn; +} + +void +lnet_destroy_peer_net_locked(struct lnet_peer_net *lpn) +{ + struct lnet_peer *lp; + + CDEBUG(D_NET, "%p net %s\n", lpn, libcfs_net2str(lpn->lpn_net_id)); + + LASSERT(atomic_read(&lpn->lpn_refcount) == 0); + LASSERT(list_empty(&lpn->lpn_peer_nis)); + LASSERT(list_empty(&lpn->lpn_peer_nets)); + lp = lpn->lpn_peer; + lpn->lpn_peer = NULL; + LIBCFS_FREE(lpn, sizeof(*lpn)); + + lnet_peer_decref_locked(lp); +} + +static struct lnet_peer * +lnet_peer_alloc(struct lnet_nid *nid) +{ + struct lnet_peer *lp; + + LIBCFS_CPT_ALLOC(lp, lnet_cpt_table(), CFS_CPT_ANY, sizeof(*lp)); + if (!lp) + return NULL; + + INIT_LIST_HEAD(&lp->lp_rtrq); + INIT_LIST_HEAD(&lp->lp_routes); + INIT_LIST_HEAD(&lp->lp_peer_list); + INIT_LIST_HEAD(&lp->lp_peer_nets); + INIT_LIST_HEAD(&lp->lp_dc_list); + INIT_LIST_HEAD(&lp->lp_dc_pendq); + INIT_LIST_HEAD(&lp->lp_rtr_list); + init_waitqueue_head(&lp->lp_dc_waitq); + spin_lock_init(&lp->lp_lock); + lp->lp_primary_nid = *nid; + lp->lp_disc_src_nid = LNET_ANY_NID; + lp->lp_disc_dst_nid = LNET_ANY_NID; + if (lnet_peers_start_down()) + lp->lp_alive = false; + else + lp->lp_alive = true; + + /* + * all peers created on a router should have health on + * if it's not already on. + */ + if (the_lnet.ln_routing && !lnet_health_sensitivity) + lp->lp_health_sensitivity = 1; + + /* + * Turn off discovery for loopback peer. If you're creating a peer + * for the loopback interface then that was initiated when we + * attempted to send a message over the loopback. There is no need + * to ever use a different interface when sending messages to + * myself. 
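+	 * The LNET_PEER_NO_DISCOVERY flag set just below also makes
+	 * lnet_is_discovery_disabled_locked() treat the loopback peer as
+	 * undiscoverable from the moment it is created.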
+ */ + if (nid_is_lo0(nid)) + lp->lp_state = LNET_PEER_NO_DISCOVERY; + lp->lp_cpt = lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER); + + CDEBUG(D_NET, "%p nid %s\n", lp, libcfs_nidstr(&lp->lp_primary_nid)); + + return lp; +} + +void +lnet_destroy_peer_locked(struct lnet_peer *lp) +{ + CDEBUG(D_NET, "%p nid %s\n", lp, libcfs_nidstr(&lp->lp_primary_nid)); + + LASSERT(atomic_read(&lp->lp_refcount) == 0); + LASSERT(lp->lp_rtr_refcount == 0); + LASSERT(list_empty(&lp->lp_peer_nets)); + LASSERT(list_empty(&lp->lp_peer_list)); + LASSERT(list_empty(&lp->lp_dc_list)); + + if (lp->lp_data) + lnet_ping_buffer_decref(lp->lp_data); + + /* + * if there are messages still on the pending queue, then make + * sure to queue them on the ln_msg_resend list so they can be + * resent at a later point if the discovery thread is still + * running. + * If the discovery thread has stopped, then the wakeup will be a + * no-op, and it is expected the lnet_shutdown_lndnets() will + * eventually be called, which will traverse this list and + * finalize the messages on the list. + * We can not resend them now because we're holding the cpt lock. + * Releasing the lock can cause an inconsistent state + */ + spin_lock(&the_lnet.ln_msg_resend_lock); + spin_lock(&lp->lp_lock); + list_splice(&lp->lp_dc_pendq, &the_lnet.ln_msg_resend); + spin_unlock(&lp->lp_lock); + spin_unlock(&the_lnet.ln_msg_resend_lock); + wake_up(&the_lnet.ln_dc_waitq); + + LIBCFS_FREE(lp, sizeof(*lp)); +} + +/* + * Detach a peer_ni from its peer_net. If this was the last peer_ni on + * that peer_net, detach the peer_net from the peer. + * + * Call with lnet_net_lock/EX held + */ +static void +lnet_peer_detach_peer_ni_locked(struct lnet_peer_ni *lpni) +{ + struct lnet_peer_table *ptable; + struct lnet_peer_net *lpn; + struct lnet_peer *lp; + + /* + * Belts and suspenders: gracefully handle teardown of a + * partially connected peer_ni. + */ + lpn = lpni->lpni_peer_net; + + list_del_init(&lpni->lpni_peer_nis); + /* + * If there are no lpni's left, we detach lpn from + * lp_peer_nets, so it cannot be found anymore. + */ + if (list_empty(&lpn->lpn_peer_nis)) + list_del_init(&lpn->lpn_peer_nets); + + /* Update peer NID count. */ + lp = lpn->lpn_peer; + lp->lp_nnis--; + + /* + * If there are no more peer nets, make the peer unfindable + * via the peer_tables. + * + * Otherwise, if the peer is DISCOVERED, tell discovery to + * take another look at it. This is a no-op if discovery for + * this peer did the detaching. + */ + if (list_empty(&lp->lp_peer_nets)) { + list_del_init(&lp->lp_peer_list); + ptable = the_lnet.ln_peer_tables[lp->lp_cpt]; + ptable->pt_peers--; + } else if (the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING) { + /* Discovery isn't running, nothing to do here. */ + } else if (lp->lp_state & LNET_PEER_DISCOVERED) { + lnet_peer_queue_for_discovery(lp); + wake_up(&the_lnet.ln_dc_waitq); + } + CDEBUG(D_NET, "peer %s NID %s\n", + libcfs_nidstr(&lp->lp_primary_nid), + libcfs_nidstr(&lpni->lpni_nid)); +} + +/* called with lnet_net_lock LNET_LOCK_EX held */ +static int +lnet_peer_ni_del_locked(struct lnet_peer_ni *lpni, bool force) +{ + struct lnet_peer_table *ptable = NULL; + + /* don't remove a peer_ni if it's also a gateway */ + if (lnet_isrouter(lpni) && !force) { + CERROR("Peer NI %s is a gateway. Can not delete it\n", + libcfs_nidstr(&lpni->lpni_nid)); + return -EBUSY; + } + + lnet_peer_remove_from_remote_list(lpni); + + /* remove peer ni from the hash list. 
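+	 * Note that lpni_hashlist is reused below to park the peer_ni on
+	 * the per-table zombie list until its refcount drops to zero.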
*/ + list_del_init(&lpni->lpni_hashlist); + + /* + * indicate the peer is being deleted so the monitor thread can + * remove it from the recovery queue. + */ + spin_lock(&lpni->lpni_lock); + lpni->lpni_state |= LNET_PEER_NI_DELETING; + spin_unlock(&lpni->lpni_lock); + + /* decrement the ref count on the peer table */ + ptable = the_lnet.ln_peer_tables[lpni->lpni_cpt]; + + /* + * The peer_ni can no longer be found with a lookup. But there + * can be current users, so keep track of it on the zombie + * list until the reference count has gone to zero. + * + * The last reference may be lost in a place where the + * lnet_net_lock locks only a single cpt, and that cpt may not + * be lpni->lpni_cpt. So the zombie list of lnet_peer_table + * has its own lock. + */ + spin_lock(&ptable->pt_zombie_lock); + list_add(&lpni->lpni_hashlist, &ptable->pt_zombie_list); + ptable->pt_zombies++; + spin_unlock(&ptable->pt_zombie_lock); + + /* no need to keep this peer_ni on the hierarchy anymore */ + lnet_peer_detach_peer_ni_locked(lpni); + + /* remove hashlist reference on peer_ni */ + lnet_peer_ni_decref_locked(lpni); + + return 0; +} + +void lnet_peer_uninit(void) +{ + struct lnet_peer_ni *lpni, *tmp; + + lnet_net_lock(LNET_LOCK_EX); + + /* remove all peer_nis from the remote peer and the hash list */ + list_for_each_entry_safe(lpni, tmp, &the_lnet.ln_remote_peer_ni_list, + lpni_on_remote_peer_ni_list) + lnet_peer_ni_del_locked(lpni, false); + + lnet_peer_tables_destroy(); + + lnet_net_unlock(LNET_LOCK_EX); +} + +static int +lnet_peer_del_locked(struct lnet_peer *peer) +{ + struct lnet_peer_ni *lpni = NULL, *lpni2; + int rc = 0, rc2 = 0; + + CDEBUG(D_NET, "peer %s\n", libcfs_nidstr(&peer->lp_primary_nid)); + + spin_lock(&peer->lp_lock); + peer->lp_state |= LNET_PEER_MARK_DELETED; + spin_unlock(&peer->lp_lock); + + lpni = lnet_get_next_peer_ni_locked(peer, NULL, lpni); + while (lpni != NULL) { + lpni2 = lnet_get_next_peer_ni_locked(peer, NULL, lpni); + rc = lnet_peer_ni_del_locked(lpni, false); + if (rc != 0) + rc2 = rc; + lpni = lpni2; + } + + return rc2; +} + +/* + * Discovering this peer is taking too long. Cancel any Ping or Push + * that discovery is waiting on by unlinking the relevant MDs. The + * lnet_discovery_event_handler() will proceed from here and complete + * the cleanup. + */ +static void lnet_peer_cancel_discovery(struct lnet_peer *lp) +{ + struct lnet_handle_md ping_mdh; + struct lnet_handle_md push_mdh; + + LNetInvalidateMDHandle(&ping_mdh); + LNetInvalidateMDHandle(&push_mdh); + + spin_lock(&lp->lp_lock); + if (lp->lp_state & LNET_PEER_PING_SENT) { + ping_mdh = lp->lp_ping_mdh; + LNetInvalidateMDHandle(&lp->lp_ping_mdh); + } + if (lp->lp_state & LNET_PEER_PUSH_SENT) { + push_mdh = lp->lp_push_mdh; + LNetInvalidateMDHandle(&lp->lp_push_mdh); + } + spin_unlock(&lp->lp_lock); + + if (!LNetMDHandleIsInvalid(ping_mdh)) + LNetMDUnlink(ping_mdh); + if (!LNetMDHandleIsInvalid(push_mdh)) + LNetMDUnlink(push_mdh); +} + +static int +lnet_peer_del(struct lnet_peer *peer) +{ + int rc; + + lnet_peer_cancel_discovery(peer); + lnet_net_lock(LNET_LOCK_EX); + rc = lnet_peer_del_locked(peer); + lnet_net_unlock(LNET_LOCK_EX); + + return rc; +} + +/* + * Delete a NID from a peer. Call with ln_api_mutex held. + * + * Error codes: + * -EPERM: Non-DLC deletion from DLC-configured peer. + * -ENOENT: No lnet_peer_ni corresponding to the nid. + * -ECHILD: The lnet_peer_ni isn't connected to the peer. + * -EBUSY: The lnet_peer_ni is the primary, and not the only peer_ni. 
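+ * Passing LNET_PEER_RTR_NI_FORCE_DEL in flags relaxes the -EBUSY case:
+ * the next peer_ni is promoted to primary and the deletion proceeds.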
+ */ +static int +lnet_peer_del_nid(struct lnet_peer *lp, lnet_nid_t nid4, unsigned int flags) +{ + struct lnet_peer_ni *lpni; + struct lnet_nid primary_nid = lp->lp_primary_nid; + struct lnet_nid nid; + int rc = 0; + bool force = (flags & LNET_PEER_RTR_NI_FORCE_DEL) ? true : false; + + lnet_nid4_to_nid(nid4, &nid); + if (!(flags & LNET_PEER_CONFIGURED)) { + if (lp->lp_state & LNET_PEER_CONFIGURED) { + rc = -EPERM; + goto out; + } + } + + lpni = lnet_peer_ni_find_locked(&nid); + if (!lpni) { + rc = -ENOENT; + goto out; + } + lnet_peer_ni_decref_locked(lpni); + if (lp != lpni->lpni_peer_net->lpn_peer) { + rc = -ECHILD; + goto out; + } + + /* + * This function only allows deletion of the primary NID if it + * is the only NID. + */ + if (nid_same(&nid, &lp->lp_primary_nid) && lp->lp_nnis != 1 && !force) { + rc = -EBUSY; + goto out; + } + + lnet_net_lock(LNET_LOCK_EX); + + if (nid_same(&nid, &lp->lp_primary_nid) && lp->lp_nnis != 1 && force) { + struct lnet_peer_ni *lpni2; + /* assign the next peer_ni to be the primary */ + lpni2 = lnet_get_next_peer_ni_locked(lp, NULL, lpni); + LASSERT(lpni2); + lp->lp_primary_nid = lpni2->lpni_nid; + } + rc = lnet_peer_ni_del_locked(lpni, force); + + lnet_net_unlock(LNET_LOCK_EX); + +out: + CDEBUG(D_NET, "peer %s NID %s flags %#x: %d\n", + libcfs_nidstr(&primary_nid), libcfs_nidstr(&nid), + flags, rc); + + return rc; +} + +static void +lnet_peer_table_cleanup_locked(struct lnet_net *net, + struct lnet_peer_table *ptable) +{ + int i; + struct lnet_peer_ni *next; + struct lnet_peer_ni *lpni; + struct lnet_peer *peer; + + for (i = 0; i < LNET_PEER_HASH_SIZE; i++) { + list_for_each_entry_safe(lpni, next, &ptable->pt_hash[i], + lpni_hashlist) { + if (net != NULL && net != lpni->lpni_net) + continue; + + peer = lpni->lpni_peer_net->lpn_peer; + if (!nid_same(&peer->lp_primary_nid, + &lpni->lpni_nid)) { + lnet_peer_ni_del_locked(lpni, false); + continue; + } + /* + * Removing the primary NID implies removing + * the entire peer. Advance next beyond any + * peer_ni that belongs to the same peer. + */ + list_for_each_entry_from(next, &ptable->pt_hash[i], + lpni_hashlist) { + if (next->lpni_peer_net->lpn_peer != peer) + break; + } + lnet_peer_del_locked(peer); + } + } +} + +static void +lnet_peer_ni_finalize_wait(struct lnet_peer_table *ptable) +{ + wait_var_event_warning(&ptable->pt_zombies, + ptable->pt_zombies == 0, + "Waiting for %d zombies on peer table\n", + ptable->pt_zombies); +} + +static void +lnet_peer_table_del_rtrs_locked(struct lnet_net *net, + struct lnet_peer_table *ptable) +{ + struct lnet_peer_ni *lp; + struct lnet_peer_ni *tmp; + struct lnet_nid gw_nid; + int i; + + for (i = 0; i < LNET_PEER_HASH_SIZE; i++) { + list_for_each_entry_safe(lp, tmp, &ptable->pt_hash[i], + lpni_hashlist) { + if (net != lp->lpni_net) + continue; + + if (!lnet_isrouter(lp)) + continue; + + gw_nid = lp->lpni_peer_net->lpn_peer->lp_primary_nid; + + lnet_net_unlock(LNET_LOCK_EX); + lnet_del_route(LNET_NET_ANY, &gw_nid); + lnet_net_lock(LNET_LOCK_EX); + } + } +} + +void +lnet_peer_tables_cleanup(struct lnet_net *net) +{ + int i; + struct lnet_peer_table *ptable; + + LASSERT(the_lnet.ln_state != LNET_STATE_SHUTDOWN || net != NULL); + /* If just deleting the peers for a NI, get rid of any routes these + * peers are gateways for. 
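+	 * Note that lnet_peer_table_del_rtrs_locked() drops and retakes
+	 * LNET_LOCK_EX around each lnet_del_route() call.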
*/ + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + lnet_net_lock(LNET_LOCK_EX); + lnet_peer_table_del_rtrs_locked(net, ptable); + lnet_net_unlock(LNET_LOCK_EX); + } + + /* Start the cleanup process */ + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + lnet_net_lock(LNET_LOCK_EX); + lnet_peer_table_cleanup_locked(net, ptable); + lnet_net_unlock(LNET_LOCK_EX); + } + + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) + lnet_peer_ni_finalize_wait(ptable); +} + +static struct lnet_peer_ni * +lnet_get_peer_ni_locked(struct lnet_peer_table *ptable, struct lnet_nid *nid) +{ + struct list_head *peers; + struct lnet_peer_ni *lp; + + if (the_lnet.ln_state != LNET_STATE_RUNNING) + return NULL; + + peers = &ptable->pt_hash[lnet_nid2peerhash(nid)]; + list_for_each_entry(lp, peers, lpni_hashlist) { + if (nid_same(&lp->lpni_nid, nid)) { + lnet_peer_ni_addref_locked(lp); + return lp; + } + } + + return NULL; +} + +struct lnet_peer_ni * +lnet_find_peer_ni_locked(lnet_nid_t nid4) +{ + struct lnet_peer_ni *lpni; + struct lnet_peer_table *ptable; + int cpt; + struct lnet_nid nid; + + lnet_nid4_to_nid(nid4, &nid); + + cpt = lnet_nid_cpt_hash(&nid, LNET_CPT_NUMBER); + + ptable = the_lnet.ln_peer_tables[cpt]; + lpni = lnet_get_peer_ni_locked(ptable, &nid); + + return lpni; +} + +struct lnet_peer_ni * +lnet_peer_ni_find_locked(struct lnet_nid *nid) +{ + struct lnet_peer_ni *lpni; + struct lnet_peer_table *ptable; + int cpt; + + cpt = lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER); + + ptable = the_lnet.ln_peer_tables[cpt]; + lpni = lnet_get_peer_ni_locked(ptable, nid); + + return lpni; +} + +struct lnet_peer_ni * +lnet_peer_get_ni_locked(struct lnet_peer *lp, lnet_nid_t nid) +{ + struct lnet_peer_net *lpn; + struct lnet_peer_ni *lpni; + + lpn = lnet_peer_get_net_locked(lp, LNET_NIDNET(nid)); + if (!lpn) + return NULL; + + list_for_each_entry(lpni, &lpn->lpn_peer_nis, lpni_peer_nis) { + if (lnet_nid_to_nid4(&lpni->lpni_nid) == nid) + return lpni; + } + + return NULL; +} + +struct lnet_peer_ni * +lnet_peer_ni_get_locked(struct lnet_peer *lp, struct lnet_nid *nid) +{ + struct lnet_peer_net *lpn; + struct lnet_peer_ni *lpni; + + lpn = lnet_peer_get_net_locked(lp, LNET_NID_NET(nid)); + if (!lpn) + return NULL; + + list_for_each_entry(lpni, &lpn->lpn_peer_nis, lpni_peer_nis) { + if (nid_same(&lpni->lpni_nid, nid)) + return lpni; + } + + return NULL; +} + +struct lnet_peer * +lnet_find_peer4(lnet_nid_t nid) +{ + struct lnet_peer_ni *lpni; + struct lnet_peer *lp = NULL; + int cpt; + + cpt = lnet_net_lock_current(); + lpni = lnet_find_peer_ni_locked(nid); + if (lpni) { + lp = lpni->lpni_peer_net->lpn_peer; + lnet_peer_addref_locked(lp); + lnet_peer_ni_decref_locked(lpni); + } + lnet_net_unlock(cpt); + + return lp; +} + +struct lnet_peer * +lnet_find_peer(struct lnet_nid *nid) +{ + struct lnet_peer_ni *lpni; + struct lnet_peer *lp = NULL; + int cpt; + + cpt = lnet_net_lock_current(); + lpni = lnet_peer_ni_find_locked(nid); + if (lpni) { + lp = lpni->lpni_peer_net->lpn_peer; + lnet_peer_addref_locked(lp); + lnet_peer_ni_decref_locked(lpni); + } + lnet_net_unlock(cpt); + + return lp; +} + +struct lnet_peer_net * +lnet_get_next_peer_net_locked(struct lnet_peer *lp, __u32 prev_lpn_id) +{ + struct lnet_peer_net *net; + + if (!prev_lpn_id) { + /* no net id provided return the first net */ + net = list_first_entry_or_null(&lp->lp_peer_nets, + struct lnet_peer_net, + lpn_peer_nets); + + return net; + } + + /* find the net after the one provided */ + list_for_each_entry(net, &lp->lp_peer_nets, lpn_peer_nets) 
{ + if (net->lpn_net_id == prev_lpn_id) { + /* + * if we reached the end of the list loop to the + * beginning. + */ + if (net->lpn_peer_nets.next == &lp->lp_peer_nets) + return list_first_entry_or_null(&lp->lp_peer_nets, + struct lnet_peer_net, + lpn_peer_nets); + else + return list_next_entry(net, lpn_peer_nets); + } + } + + return NULL; +} + +struct lnet_peer_ni * +lnet_get_next_peer_ni_locked(struct lnet_peer *peer, + struct lnet_peer_net *peer_net, + struct lnet_peer_ni *prev) +{ + struct lnet_peer_ni *lpni; + struct lnet_peer_net *net = peer_net; + + if (!prev) { + if (!net) { + if (list_empty(&peer->lp_peer_nets)) + return NULL; + + net = list_entry(peer->lp_peer_nets.next, + struct lnet_peer_net, + lpn_peer_nets); + } + lpni = list_entry(net->lpn_peer_nis.next, struct lnet_peer_ni, + lpni_peer_nis); + + return lpni; + } + + if (prev->lpni_peer_nis.next == &prev->lpni_peer_net->lpn_peer_nis) { + /* + * if you reached the end of the peer ni list and the peer + * net is specified then there are no more peer nis in that + * net. + */ + if (net) + return NULL; + + /* + * we reached the end of this net ni list. move to the + * next net + */ + if (prev->lpni_peer_net->lpn_peer_nets.next == + &peer->lp_peer_nets) + /* no more nets and no more NIs. */ + return NULL; + + /* get the next net */ + net = list_entry(prev->lpni_peer_net->lpn_peer_nets.next, + struct lnet_peer_net, + lpn_peer_nets); + /* get the ni on it */ + lpni = list_entry(net->lpn_peer_nis.next, struct lnet_peer_ni, + lpni_peer_nis); + + return lpni; + } + + /* there are more nis left */ + lpni = list_entry(prev->lpni_peer_nis.next, + struct lnet_peer_ni, lpni_peer_nis); + + return lpni; +} + +/* Call with the ln_api_mutex held */ +int lnet_get_peer_list(u32 *countp, u32 *sizep, struct lnet_process_id __user *ids) +{ + struct lnet_process_id id; + struct lnet_peer_table *ptable; + struct lnet_peer *lp; + __u32 count = 0; + __u32 size = 0; + int lncpt; + int cpt; + __u32 i; + int rc; + + rc = -ESHUTDOWN; + if (the_lnet.ln_state != LNET_STATE_RUNNING) + goto done; + + lncpt = cfs_percpt_number(the_lnet.ln_peer_tables); + + /* + * Count the number of peers, and return E2BIG if the buffer + * is too small. We'll also return the desired size. + */ + rc = -E2BIG; + for (cpt = 0; cpt < lncpt; cpt++) { + ptable = the_lnet.ln_peer_tables[cpt]; + count += ptable->pt_peers; + } + size = count * sizeof(*ids); + if (size > *sizep) + goto done; + + /* + * Walk the peer lists and copy out the primary nids. + * This is safe because the peer lists are only modified + * while the ln_api_mutex is held. So we don't need to + * hold the lnet_net_lock as well, and can therefore + * directly call copy_to_user(). + */ + rc = -EFAULT; + memset(&id, 0, sizeof(id)); + id.pid = LNET_PID_LUSTRE; + i = 0; + for (cpt = 0; cpt < lncpt; cpt++) { + ptable = the_lnet.ln_peer_tables[cpt]; + list_for_each_entry(lp, &ptable->pt_peer_list, lp_peer_list) { + if (!nid_is_nid4(&lp->lp_primary_nid)) + continue; + if (i >= count) + goto done; + id.nid = lnet_nid_to_nid4(&lp->lp_primary_nid); + if (copy_to_user(&ids[i], &id, sizeof(id))) + goto done; + i++; + } + } + rc = 0; +done: + *countp = count; + *sizep = size; + return rc; +} + +/* + * Start pushes to peers that need to be updated for a configuration + * change on this node. 
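+ * When force is set (and discovery is not disabled, which clears it),
+ * every Multi-Rail peer gets LNET_PEER_FORCE_PUSH set before the
+ * needs-push check below.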
+ */
+void
+lnet_push_update_to_peers(int force)
+{
+	struct lnet_peer_table *ptable;
+	struct lnet_peer *lp;
+	int lncpt;
+	int cpt;
+
+	lnet_net_lock(LNET_LOCK_EX);
+	if (lnet_peer_discovery_disabled)
+		force = 0;
+	lncpt = cfs_percpt_number(the_lnet.ln_peer_tables);
+	for (cpt = 0; cpt < lncpt; cpt++) {
+		ptable = the_lnet.ln_peer_tables[cpt];
+		list_for_each_entry(lp, &ptable->pt_peer_list, lp_peer_list) {
+			if (force) {
+				spin_lock(&lp->lp_lock);
+				if (lp->lp_state & LNET_PEER_MULTI_RAIL)
+					lp->lp_state |= LNET_PEER_FORCE_PUSH;
+				spin_unlock(&lp->lp_lock);
+			}
+			if (lnet_peer_needs_push(lp))
+				lnet_peer_queue_for_discovery(lp);
+		}
+	}
+	lnet_net_unlock(LNET_LOCK_EX);
+	wake_up(&the_lnet.ln_dc_waitq);
+}
+
+/* find the NID in the preferred gateways for the remote peer
+ * return:
+ *	false: list is not empty and NID is not preferred
+ *	false: list is empty
+ *	true: nid is found in the list
+ */
+bool
+lnet_peer_is_pref_rtr_locked(struct lnet_peer_ni *lpni,
+			     struct lnet_nid *gw_nid)
+{
+	struct lnet_nid_list *ne;
+
+	CDEBUG(D_NET, "%s: rtr pref empty: %d\n",
+	       libcfs_nidstr(&lpni->lpni_nid),
+	       list_empty(&lpni->lpni_rtr_pref_nids));
+
+	if (list_empty(&lpni->lpni_rtr_pref_nids))
+		return false;
+
+	/* iterate through all the preferred NIDs and see if any of them
+	 * matches the provided gw_nid
+	 */
+	list_for_each_entry(ne, &lpni->lpni_rtr_pref_nids, nl_list) {
+		CDEBUG(D_NET, "Comparing pref %s with gw %s\n",
+		       libcfs_nidstr(&ne->nl_nid),
+		       libcfs_nidstr(gw_nid));
+		if (nid_same(&ne->nl_nid, gw_nid))
+			return true;
+	}
+
+	return false;
+}
+
+void
+lnet_peer_clr_pref_rtrs(struct lnet_peer_ni *lpni)
+{
+	struct list_head zombies;
+	struct lnet_nid_list *ne;
+	struct lnet_nid_list *tmp;
+	int cpt = lpni->lpni_cpt;
+
+	INIT_LIST_HEAD(&zombies);
+
+	lnet_net_lock(cpt);
+	list_splice_init(&lpni->lpni_rtr_pref_nids, &zombies);
+	lnet_net_unlock(cpt);
+
+	list_for_each_entry_safe(ne, tmp, &zombies, nl_list) {
+		list_del(&ne->nl_list);
+		LIBCFS_FREE(ne, sizeof(*ne));
+	}
+}
+
+int
+lnet_peer_add_pref_rtr(struct lnet_peer_ni *lpni,
+		       struct lnet_nid *gw_nid)
+{
+	int cpt = lpni->lpni_cpt;
+	struct lnet_nid_list *ne = NULL;
+
+	/* This function is called with api_mutex held. When the api_mutex
+	 * is held the list can not be modified, as it is only modified as
+	 * a result of applying a UDSP and that happens under api_mutex
+	 * lock.
+	 */
+	__must_hold(&the_lnet.ln_api_mutex);
+
+	list_for_each_entry(ne, &lpni->lpni_rtr_pref_nids, nl_list) {
+		if (nid_same(&ne->nl_nid, gw_nid))
+			return -EEXIST;
+	}
+
+	LIBCFS_CPT_ALLOC(ne, lnet_cpt_table(), cpt, sizeof(*ne));
+	if (!ne)
+		return -ENOMEM;
+
+	ne->nl_nid = *gw_nid;
+
+	/* Lock the cpt to protect against addition and checks in the
+	 * selection algorithm
+	 */
+	lnet_net_lock(cpt);
+	list_add(&ne->nl_list, &lpni->lpni_rtr_pref_nids);
+	lnet_net_unlock(cpt);
+
+	return 0;
+}
+
+/*
+ * Test whether a ni is a preferred ni for this peer_ni, e.g., whether
+ * this is a preferred point-to-point path. Call with lnet_net_lock in
+ * shared mode.
+ */
+bool
+lnet_peer_is_pref_nid_locked(struct lnet_peer_ni *lpni, struct lnet_nid *nid)
+{
+	struct lnet_nid_list *ne;
+
+	if (lpni->lpni_pref_nnids == 0)
+		return false;
+	if (lpni->lpni_pref_nnids == 1)
+		return nid_same(&lpni->lpni_pref.nid, nid);
+	list_for_each_entry(ne, &lpni->lpni_pref.nids, nl_list) {
+		if (nid_same(&ne->nl_nid, nid))
+			return true;
+	}
+	return false;
+}
+
+/*
+ * Set a single ni as preferred, provided no preferred ni is already
+ * defined.
Only to be used for non-multi-rail peer_ni. + */ +int +lnet_peer_ni_set_non_mr_pref_nid(struct lnet_peer_ni *lpni, + struct lnet_nid *nid) +{ + int rc = 0; + + if (!nid) + return -EINVAL; + spin_lock(&lpni->lpni_lock); + if (LNET_NID_IS_ANY(nid)) { + rc = -EINVAL; + } else if (lpni->lpni_pref_nnids > 0) { + rc = -EPERM; + } else if (lpni->lpni_pref_nnids == 0) { + lpni->lpni_pref.nid = *nid; + lpni->lpni_pref_nnids = 1; + lpni->lpni_state |= LNET_PEER_NI_NON_MR_PREF; + } + spin_unlock(&lpni->lpni_lock); + + CDEBUG(D_NET, "peer %s nid %s: %d\n", + libcfs_nidstr(&lpni->lpni_nid), libcfs_nidstr(nid), rc); + return rc; +} + +/* + * Clear the preferred NID from a non-multi-rail peer_ni, provided + * this preference was set by lnet_peer_ni_set_non_mr_pref_nid(). + */ +int +lnet_peer_ni_clr_non_mr_pref_nid(struct lnet_peer_ni *lpni) +{ + int rc = 0; + + spin_lock(&lpni->lpni_lock); + if (lpni->lpni_state & LNET_PEER_NI_NON_MR_PREF) { + lpni->lpni_pref_nnids = 0; + lpni->lpni_state &= ~LNET_PEER_NI_NON_MR_PREF; + } else if (lpni->lpni_pref_nnids == 0) { + rc = -ENOENT; + } else { + rc = -EPERM; + } + spin_unlock(&lpni->lpni_lock); + + CDEBUG(D_NET, "peer %s: %d\n", + libcfs_nidstr(&lpni->lpni_nid), rc); + return rc; +} + +void +lnet_peer_ni_set_selection_priority(struct lnet_peer_ni *lpni, __u32 priority) +{ + lpni->lpni_sel_priority = priority; +} + +/* + * Clear the preferred NIDs from a non-multi-rail peer. + */ +void +lnet_peer_clr_non_mr_pref_nids(struct lnet_peer *lp) +{ + struct lnet_peer_ni *lpni = NULL; + + while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) + lnet_peer_ni_clr_non_mr_pref_nid(lpni); +} + +int +lnet_peer_add_pref_nid(struct lnet_peer_ni *lpni, struct lnet_nid *nid) +{ + struct lnet_peer *lp = lpni->lpni_peer_net->lpn_peer; + struct lnet_nid_list *ne1 = NULL; + struct lnet_nid_list *ne2 = NULL; + struct lnet_nid *tmp_nid = NULL; + int rc = 0; + + if (LNET_NID_IS_ANY(nid)) { + rc = -EINVAL; + goto out; + } + + if (lpni->lpni_pref_nnids == 1 && + nid_same(&lpni->lpni_pref.nid, nid)) { + rc = -EEXIST; + goto out; + } + + /* A non-MR node may have only one preferred NI per peer_ni */ + if (lpni->lpni_pref_nnids > 0 && + !(lp->lp_state & LNET_PEER_MULTI_RAIL)) { + rc = -EPERM; + goto out; + } + + /* add the new preferred nid to the list of preferred nids */ + if (lpni->lpni_pref_nnids != 0) { + size_t alloc_size = sizeof(*ne1); + + if (lpni->lpni_pref_nnids == 1) { + tmp_nid = &lpni->lpni_pref.nid; + INIT_LIST_HEAD(&lpni->lpni_pref.nids); + } + + list_for_each_entry(ne1, &lpni->lpni_pref.nids, nl_list) { + if (nid_same(&ne1->nl_nid, nid)) { + rc = -EEXIST; + goto out; + } + } + + LIBCFS_CPT_ALLOC(ne1, lnet_cpt_table(), lpni->lpni_cpt, + alloc_size); + if (!ne1) { + rc = -ENOMEM; + goto out; + } + + /* move the originally stored nid to the list */ + if (lpni->lpni_pref_nnids == 1) { + LIBCFS_CPT_ALLOC(ne2, lnet_cpt_table(), + lpni->lpni_cpt, alloc_size); + if (!ne2) { + rc = -ENOMEM; + goto out; + } + INIT_LIST_HEAD(&ne2->nl_list); + ne2->nl_nid = *tmp_nid; + } + ne1->nl_nid = *nid; + } + + lnet_net_lock(LNET_LOCK_EX); + spin_lock(&lpni->lpni_lock); + if (lpni->lpni_pref_nnids == 0) { + lpni->lpni_pref.nid = *nid; + } else { + if (ne2) + list_add_tail(&ne2->nl_list, &lpni->lpni_pref.nids); + list_add_tail(&ne1->nl_list, &lpni->lpni_pref.nids); + } + lpni->lpni_pref_nnids++; + lpni->lpni_state &= ~LNET_PEER_NI_NON_MR_PREF; + spin_unlock(&lpni->lpni_lock); + lnet_net_unlock(LNET_LOCK_EX); + +out: + if (rc == -EEXIST && (lpni->lpni_state & 
LNET_PEER_NI_NON_MR_PREF)) { + spin_lock(&lpni->lpni_lock); + lpni->lpni_state &= ~LNET_PEER_NI_NON_MR_PREF; + spin_unlock(&lpni->lpni_lock); + } + CDEBUG(D_NET, "peer %s nid %s: %d\n", + libcfs_nidstr(&lp->lp_primary_nid), libcfs_nidstr(nid), rc); + return rc; +} + +int +lnet_peer_del_pref_nid(struct lnet_peer_ni *lpni, struct lnet_nid *nid) +{ + struct lnet_peer *lp = lpni->lpni_peer_net->lpn_peer; + struct lnet_nid_list *ne = NULL; + int rc = 0; + + if (lpni->lpni_pref_nnids == 0) { + rc = -ENOENT; + goto out; + } + + if (lpni->lpni_pref_nnids == 1) { + if (!nid_same(&lpni->lpni_pref.nid, nid)) { + rc = -ENOENT; + goto out; + } + } else { + list_for_each_entry(ne, &lpni->lpni_pref.nids, nl_list) { + if (nid_same(&ne->nl_nid, nid)) + goto remove_nid_entry; + } + rc = -ENOENT; + ne = NULL; + goto out; + } + +remove_nid_entry: + lnet_net_lock(LNET_LOCK_EX); + spin_lock(&lpni->lpni_lock); + if (lpni->lpni_pref_nnids == 1) + lpni->lpni_pref.nid = LNET_ANY_NID; + else { + list_del_init(&ne->nl_list); + if (lpni->lpni_pref_nnids == 2) { + struct lnet_nid_list *ne, *tmp; + + list_for_each_entry_safe(ne, tmp, + &lpni->lpni_pref.nids, + nl_list) { + lpni->lpni_pref.nid = ne->nl_nid; + list_del_init(&ne->nl_list); + LIBCFS_FREE(ne, sizeof(*ne)); + } + } + } + lpni->lpni_pref_nnids--; + lpni->lpni_state &= ~LNET_PEER_NI_NON_MR_PREF; + spin_unlock(&lpni->lpni_lock); + lnet_net_unlock(LNET_LOCK_EX); + + if (ne) + LIBCFS_FREE(ne, sizeof(*ne)); +out: + CDEBUG(D_NET, "peer %s nid %s: %d\n", + libcfs_nidstr(&lp->lp_primary_nid), libcfs_nidstr(nid), rc); + return rc; +} + +void +lnet_peer_clr_pref_nids(struct lnet_peer_ni *lpni) +{ + struct list_head zombies; + struct lnet_nid_list *ne; + struct lnet_nid_list *tmp; + + INIT_LIST_HEAD(&zombies); + + lnet_net_lock(LNET_LOCK_EX); + if (lpni->lpni_pref_nnids == 1) + lpni->lpni_pref.nid = LNET_ANY_NID; + else if (lpni->lpni_pref_nnids > 1) + list_splice_init(&lpni->lpni_pref.nids, &zombies); + lpni->lpni_pref_nnids = 0; + lnet_net_unlock(LNET_LOCK_EX); + + list_for_each_entry_safe(ne, tmp, &zombies, nl_list) { + list_del_init(&ne->nl_list); + LIBCFS_FREE(ne, sizeof(*ne)); + } +} + +void +lnet_peer_primary_nid_locked(struct lnet_nid *nid, struct lnet_nid *result) +{ + struct lnet_peer_ni *lpni; + + *result = *nid; + lpni = lnet_peer_ni_find_locked(nid); + if (lpni) { + *result = lpni->lpni_peer_net->lpn_peer->lp_primary_nid; + lnet_peer_ni_decref_locked(lpni); + } +} + +bool +lnet_is_discovery_disabled_locked(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + if (lnet_peer_discovery_disabled) + return true; + + if (!(lp->lp_state & LNET_PEER_MULTI_RAIL) || + (lp->lp_state & LNET_PEER_NO_DISCOVERY)) { + return true; + } + + return false; +} + +/* + * Peer Discovery + */ +bool +lnet_is_discovery_disabled(struct lnet_peer *lp) +{ + bool rc = false; + + spin_lock(&lp->lp_lock); + rc = lnet_is_discovery_disabled_locked(lp); + spin_unlock(&lp->lp_lock); + + return rc; +} + +int +LNetAddPeer(lnet_nid_t *nids, __u32 num_nids) +{ + lnet_nid_t pnid = 0; + bool mr; + int i, rc; + + if (!nids || num_nids < 1) + return -EINVAL; + + rc = LNetNIInit(LNET_PID_ANY); + if (rc < 0) + return rc; + + mutex_lock(&the_lnet.ln_api_mutex); + + mr = lnet_peer_discovery_disabled == 0; + + rc = 0; + for (i = 0; i < num_nids; i++) { + if (nids[i] == LNET_NID_LO_0) + continue; + + if (!pnid) { + pnid = nids[i]; + rc = lnet_add_peer_ni(pnid, LNET_NID_ANY, mr, true); + } else if (lnet_peer_discovery_disabled) { + rc = lnet_add_peer_ni(nids[i], LNET_NID_ANY, mr, true); + } else { + rc = 
lnet_add_peer_ni(pnid, nids[i], mr, true); + } + + if (rc && rc != -EEXIST) + goto unlock; + } + +unlock: + mutex_unlock(&the_lnet.ln_api_mutex); + + LNetNIFini(); + + return rc == -EEXIST ? 0 : rc; +} +EXPORT_SYMBOL(LNetAddPeer); + +/* FIXME support large-addr nid */ +lnet_nid_t +LNetPrimaryNID(lnet_nid_t nid) +{ + struct lnet_peer *lp; + struct lnet_peer_ni *lpni; + lnet_nid_t primary_nid = nid; + int rc = 0; + int cpt; + + if (nid == LNET_NID_LO_0) + return LNET_NID_LO_0; + + cpt = lnet_net_lock_current(); + lpni = lnet_nid2peerni_locked(nid, LNET_NID_ANY, cpt); + if (IS_ERR(lpni)) { + rc = PTR_ERR(lpni); + goto out_unlock; + } + lp = lpni->lpni_peer_net->lpn_peer; + + /* If discovery is disabled locally then we needn't bother running + * discovery here because discovery will not modify whatever + * primary NID is currently set for this peer. If the specified peer is + * down then this discovery can introduce long delays into the mount + * process, so skip it if it isn't necessary. + */ + while (!lnet_peer_discovery_disabled && !lnet_peer_is_uptodate(lp)) { + spin_lock(&lp->lp_lock); + /* force a full discovery cycle */ + lp->lp_state |= LNET_PEER_FORCE_PING | LNET_PEER_FORCE_PUSH; + spin_unlock(&lp->lp_lock); + + rc = lnet_discover_peer_locked(lpni, cpt, true); + if (rc) + goto out_decref; + /* The lpni (or lp) for this NID may have changed and our ref is + * the only thing keeping the old one around. Release the ref + * and lookup the lpni again + */ + lnet_peer_ni_decref_locked(lpni); + lpni = lnet_find_peer_ni_locked(nid); + if (!lpni) { + rc = -ENOENT; + goto out_unlock; + } + lp = lpni->lpni_peer_net->lpn_peer; + + /* If we find that the peer has discovery disabled then we will + * not modify whatever primary NID is currently set for this + * peer. Thus, we can break out of this loop even if the peer + * is not fully up to date. + */ + if (lnet_is_discovery_disabled(lp)) + break; + } + primary_nid = lnet_nid_to_nid4(&lp->lp_primary_nid); +out_decref: + lnet_peer_ni_decref_locked(lpni); +out_unlock: + lnet_net_unlock(cpt); + + CDEBUG(D_NET, "NID %s primary NID %s rc %d\n", libcfs_nid2str(nid), + libcfs_nid2str(primary_nid), rc); + return primary_nid; +} +EXPORT_SYMBOL(LNetPrimaryNID); + +struct lnet_peer_net * +lnet_peer_get_net_locked(struct lnet_peer *peer, __u32 net_id) +{ + struct lnet_peer_net *peer_net; + list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_peer_nets) { + if (peer_net->lpn_net_id == net_id) + return peer_net; + } + return NULL; +} + +/* + * Attach a peer_ni to a peer_net and peer. This function assumes + * peer_ni is not already attached to the peer_net/peer. The peer_ni + * may be attached to a different peer, in which case it will be + * properly detached first. The whole operation is done atomically. + * + * This function consumes the reference on lpni and Always returns 0. + * This is the last function called from functions that do return an + * int, so returning 0 here allows the compiler to do a tail call. + */ +static int +lnet_peer_attach_peer_ni(struct lnet_peer *lp, + struct lnet_peer_net *lpn, + struct lnet_peer_ni *lpni, + unsigned flags) +{ + struct lnet_peer_table *ptable; + bool new_lpn = false; + int rc; + + /* Install the new peer_ni */ + lnet_net_lock(LNET_LOCK_EX); + /* Add peer_ni to global peer table hash, if necessary. 
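+	 * The hash table takes its own reference on the peer_ni below.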
*/ + if (list_empty(&lpni->lpni_hashlist)) { + int hash = lnet_nid2peerhash(&lpni->lpni_nid); + + ptable = the_lnet.ln_peer_tables[lpni->lpni_cpt]; + list_add_tail(&lpni->lpni_hashlist, &ptable->pt_hash[hash]); + ptable->pt_version++; + lnet_peer_ni_addref_locked(lpni); + } + + /* Detach the peer_ni from an existing peer, if necessary. */ + if (lpni->lpni_peer_net) { + LASSERT(lpni->lpni_peer_net != lpn); + LASSERT(lpni->lpni_peer_net->lpn_peer != lp); + lnet_peer_detach_peer_ni_locked(lpni); + lnet_peer_net_decref_locked(lpni->lpni_peer_net); + lpni->lpni_peer_net = NULL; + } + + /* Add peer_ni to peer_net */ + lpni->lpni_peer_net = lpn; + if (nid_same(&lp->lp_primary_nid, &lpni->lpni_nid)) + list_add(&lpni->lpni_peer_nis, &lpn->lpn_peer_nis); + else + list_add_tail(&lpni->lpni_peer_nis, &lpn->lpn_peer_nis); + lnet_update_peer_net_healthv(lpni); + lnet_peer_net_addref_locked(lpn); + + /* Add peer_net to peer */ + if (!lpn->lpn_peer) { + new_lpn = true; + lpn->lpn_peer = lp; + if (nid_same(&lp->lp_primary_nid, &lpni->lpni_nid)) + list_add(&lpn->lpn_peer_nets, &lp->lp_peer_nets); + else + list_add_tail(&lpn->lpn_peer_nets, &lp->lp_peer_nets); + lnet_peer_addref_locked(lp); + } + + /* Add peer to global peer list, if necessary */ + ptable = the_lnet.ln_peer_tables[lp->lp_cpt]; + if (list_empty(&lp->lp_peer_list)) { + list_add_tail(&lp->lp_peer_list, &ptable->pt_peer_list); + ptable->pt_peers++; + } + + + /* Update peer state */ + spin_lock(&lp->lp_lock); + if (flags & LNET_PEER_CONFIGURED) { + if (!(lp->lp_state & LNET_PEER_CONFIGURED)) + lp->lp_state |= LNET_PEER_CONFIGURED; + } + if (flags & LNET_PEER_MULTI_RAIL) { + if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) { + lp->lp_state |= LNET_PEER_MULTI_RAIL; + lnet_peer_clr_non_mr_pref_nids(lp); + } + } + spin_unlock(&lp->lp_lock); + + lp->lp_nnis++; + + /* apply UDSPs */ + if (new_lpn) { + rc = lnet_udsp_apply_policies_on_lpn(lpn); + if (rc) + CERROR("Failed to apply UDSPs on lpn %s\n", + libcfs_net2str(lpn->lpn_net_id)); + } + rc = lnet_udsp_apply_policies_on_lpni(lpni); + if (rc) + CERROR("Failed to apply UDSPs on lpni %s\n", + libcfs_nidstr(&lpni->lpni_nid)); + + CDEBUG(D_NET, "peer %s NID %s flags %#x\n", + libcfs_nidstr(&lp->lp_primary_nid), + libcfs_nidstr(&lpni->lpni_nid), flags); + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(LNET_LOCK_EX); + + return 0; +} + +/* + * Create a new peer, with nid as its primary nid. + * + * Call with the lnet_api_mutex held. + */ +static int +lnet_peer_add(lnet_nid_t nid4, unsigned int flags) +{ + struct lnet_nid nid; + struct lnet_peer *lp; + struct lnet_peer_net *lpn; + struct lnet_peer_ni *lpni; + int rc = 0; + + LASSERT(nid4 != LNET_NID_ANY); + + /* + * No need for the lnet_net_lock here, because the + * lnet_api_mutex is held. + */ + lpni = lnet_find_peer_ni_locked(nid4); + if (lpni) { + /* A peer with this NID already exists. */ + lp = lpni->lpni_peer_net->lpn_peer; + lnet_peer_ni_decref_locked(lpni); + /* + * This is an error if the peer was configured and the + * primary NID differs or an attempt is made to change + * the Multi-Rail flag. Otherwise the assumption is + * that an existing peer is being modified. 
+ */ + if (lp->lp_state & LNET_PEER_CONFIGURED) { + if (lnet_nid_to_nid4(&lp->lp_primary_nid) != nid4) + rc = -EEXIST; + else if ((lp->lp_state ^ flags) & LNET_PEER_MULTI_RAIL) + rc = -EPERM; + goto out; + } else if (!(flags & LNET_PEER_CONFIGURED)) { + if (lnet_nid_to_nid4(&lp->lp_primary_nid) == nid4) { + rc = -EEXIST; + goto out; + } + } + /* Delete and recreate as a configured peer. */ + rc = lnet_peer_del(lp); + if (rc) + goto out; + } + + /* Create peer, peer_net, and peer_ni. */ + rc = -ENOMEM; + lnet_nid4_to_nid(nid4, &nid); + lp = lnet_peer_alloc(&nid); + if (!lp) + goto out; + lpn = lnet_peer_net_alloc(LNET_NID_NET(&nid)); + if (!lpn) + goto out_free_lp; + lpni = lnet_peer_ni_alloc(&nid); + if (!lpni) + goto out_free_lpn; + + return lnet_peer_attach_peer_ni(lp, lpn, lpni, flags); + +out_free_lpn: + LIBCFS_FREE(lpn, sizeof(*lpn)); +out_free_lp: + LIBCFS_FREE(lp, sizeof(*lp)); +out: + CDEBUG(D_NET, "peer %s NID flags %#x: %d\n", + libcfs_nid2str(nid4), flags, rc); + return rc; +} + +/* + * Add a NID to a peer. Call with ln_api_mutex held. + * + * Error codes: + * -EPERM: Non-DLC addition to a DLC-configured peer. + * -EEXIST: The NID was configured by DLC for a different peer. + * -ENOMEM: Out of memory. + * -ENOTUNIQ: Adding a second peer NID on a single network on a + * non-multi-rail peer. + */ +static int +lnet_peer_add_nid(struct lnet_peer *lp, lnet_nid_t nid4, unsigned int flags) +{ + struct lnet_peer_net *lpn; + struct lnet_peer_ni *lpni; + struct lnet_nid nid; + int rc = 0; + + LASSERT(lp); + LASSERT(nid4 != LNET_NID_ANY); + + lnet_nid4_to_nid(nid4, &nid); + + /* A configured peer can only be updated through configuration. */ + if (!(flags & LNET_PEER_CONFIGURED)) { + if (lp->lp_state & LNET_PEER_CONFIGURED) { + rc = -EPERM; + goto out; + } + } + + /* + * The MULTI_RAIL flag can be set but not cleared, because + * that would leave the peer struct in an invalid state. + */ + if (flags & LNET_PEER_MULTI_RAIL) { + spin_lock(&lp->lp_lock); + if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) { + lp->lp_state |= LNET_PEER_MULTI_RAIL; + lnet_peer_clr_non_mr_pref_nids(lp); + } + spin_unlock(&lp->lp_lock); + } else if (lp->lp_state & LNET_PEER_MULTI_RAIL) { + rc = -EPERM; + goto out; + } + + lpni = lnet_find_peer_ni_locked(nid4); + if (lpni) { + /* + * A peer_ni already exists. This is only a problem if + * it is not connected to this peer and was configured + * by DLC. + */ + if (lpni->lpni_peer_net->lpn_peer == lp) + goto out_free_lpni; + if (lnet_peer_ni_is_configured(lpni)) { + rc = -EEXIST; + goto out_free_lpni; + } + /* If this is the primary NID, destroy the peer. */ + if (lnet_peer_ni_is_primary(lpni)) { + struct lnet_peer *rtr_lp = + lpni->lpni_peer_net->lpn_peer; + int rtr_refcount = rtr_lp->lp_rtr_refcount; + /* + * if we're trying to delete a router it means + * we're moving this peer NI to a new peer so must + * transfer router properties to the new peer + */ + if (rtr_refcount > 0) { + flags |= LNET_PEER_RTR_NI_FORCE_DEL; + lnet_rtr_transfer_to_peer(rtr_lp, lp); + } + lnet_peer_del(lpni->lpni_peer_net->lpn_peer); + lnet_peer_ni_decref_locked(lpni); + lpni = lnet_peer_ni_alloc(&nid); + if (!lpni) { + rc = -ENOMEM; + goto out_free_lpni; + } + } + } else { + lpni = lnet_peer_ni_alloc(&nid); + if (!lpni) { + rc = -ENOMEM; + goto out_free_lpni; + } + } + + /* + * Get the peer_net. Check that we're not adding a second + * peer_ni on a peer_net of a non-multi-rail peer. 
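+	 * That case fails with -ENOTUNIQ, as documented above.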
+ */ + lpn = lnet_peer_get_net_locked(lp, LNET_NIDNET(nid4)); + if (!lpn) { + lpn = lnet_peer_net_alloc(LNET_NIDNET(nid4)); + if (!lpn) { + rc = -ENOMEM; + goto out_free_lpni; + } + } else if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) { + rc = -ENOTUNIQ; + goto out_free_lpni; + } + + return lnet_peer_attach_peer_ni(lp, lpn, lpni, flags); + +out_free_lpni: + lnet_peer_ni_decref_locked(lpni); +out: + CDEBUG(D_NET, "peer %s NID %s flags %#x: %d\n", + libcfs_nidstr(&lp->lp_primary_nid), libcfs_nid2str(nid4), + flags, rc); + return rc; +} + +/* + * Update the primary NID of a peer, if possible. + * + * Call with the lnet_api_mutex held. + */ +static int +lnet_peer_set_primary_nid(struct lnet_peer *lp, lnet_nid_t nid, + unsigned int flags) +{ + struct lnet_nid old = lp->lp_primary_nid; + int rc = 0; + + if (lnet_nid_to_nid4(&lp->lp_primary_nid) == nid) + goto out; + + lnet_nid4_to_nid(nid, &lp->lp_primary_nid); + + rc = lnet_peer_add_nid(lp, nid, flags); + if (rc) { + lp->lp_primary_nid = old; + goto out; + } +out: + CDEBUG(D_NET, "peer %s NID %s: %d\n", + libcfs_nidstr(&old), libcfs_nid2str(nid), rc); + + return rc; +} + +/* + * lpni creation initiated due to traffic either sending or receiving. + */ +static int +lnet_peer_ni_traffic_add(struct lnet_nid *nid, struct lnet_nid *pref) +{ + struct lnet_peer *lp; + struct lnet_peer_net *lpn; + struct lnet_peer_ni *lpni; + unsigned flags = 0; + int rc = 0; + + if (LNET_NID_IS_ANY(nid)) { + rc = -EINVAL; + goto out; + } + + /* lnet_net_lock is not needed here because ln_api_lock is held */ + lpni = lnet_peer_ni_find_locked(nid); + if (lpni) { + /* + * We must have raced with another thread. Since we + * know next to nothing about a peer_ni created by + * traffic, we just assume everything is ok and + * return. + */ + lnet_peer_ni_decref_locked(lpni); + goto out; + } + + /* Create peer, peer_net, and peer_ni. */ + rc = -ENOMEM; + lp = lnet_peer_alloc(nid); + if (!lp) + goto out; + lpn = lnet_peer_net_alloc(LNET_NID_NET(nid)); + if (!lpn) + goto out_free_lp; + lpni = lnet_peer_ni_alloc(nid); + if (!lpni) + goto out_free_lpn; + lnet_peer_ni_set_non_mr_pref_nid(lpni, pref); + + return lnet_peer_attach_peer_ni(lp, lpn, lpni, flags); + +out_free_lpn: + LIBCFS_FREE(lpn, sizeof(*lpn)); +out_free_lp: + LIBCFS_FREE(lp, sizeof(*lp)); +out: + CDEBUG(D_NET, "peer %s: %d\n", libcfs_nidstr(nid), rc); + return rc; +} + +/* + * Implementation of IOC_LIBCFS_ADD_PEER_NI. + * + * This API handles the following combinations: + * Create a peer with its primary NI if only the prim_nid is provided + * Add a NID to a peer identified by the prim_nid. The peer identified + * by the prim_nid must already exist. + * The peer being created may be non-MR. + * + * The caller must hold ln_api_mutex. This prevents the peer from + * being created/modified/deleted by a different thread. + */ +int +lnet_add_peer_ni(lnet_nid_t prim_nid, lnet_nid_t nid, bool mr, bool temp) +{ + struct lnet_peer *lp = NULL; + struct lnet_peer_ni *lpni; + unsigned int flags = 0; + + /* The prim_nid must always be specified */ + if (prim_nid == LNET_NID_ANY) + return -EINVAL; + + if (!temp) + flags = LNET_PEER_CONFIGURED; + + if (mr) + flags |= LNET_PEER_MULTI_RAIL; + + /* + * If nid isn't specified, we must create a new peer with + * prim_nid as its primary nid. + */ + if (nid == LNET_NID_ANY) + return lnet_peer_add(prim_nid, flags); + + /* Look up the prim_nid, which must exist. 
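+	 * The lookup reference is dropped immediately; the peer cannot
+	 * disappear because the caller holds ln_api_mutex.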
*/ + lpni = lnet_find_peer_ni_locked(prim_nid); + if (!lpni) + return -ENOENT; + lnet_peer_ni_decref_locked(lpni); + lp = lpni->lpni_peer_net->lpn_peer; + + /* Peer must have been configured. */ + if (!temp && !(lp->lp_state & LNET_PEER_CONFIGURED)) { + CDEBUG(D_NET, "peer %s was not configured\n", + libcfs_nid2str(prim_nid)); + return -ENOENT; + } + + /* Primary NID must match */ + if (lnet_nid_to_nid4(&lp->lp_primary_nid) != prim_nid) { + CDEBUG(D_NET, "prim_nid %s is not primary for peer %s\n", + libcfs_nid2str(prim_nid), + libcfs_nidstr(&lp->lp_primary_nid)); + return -ENODEV; + } + + /* Multi-Rail flag must match. */ + if ((lp->lp_state ^ flags) & LNET_PEER_MULTI_RAIL) { + CDEBUG(D_NET, "multi-rail state mismatch for peer %s\n", + libcfs_nid2str(prim_nid)); + return -EPERM; + } + + return lnet_peer_add_nid(lp, nid, flags); +} + +/* + * Implementation of IOC_LIBCFS_DEL_PEER_NI. + * + * This API handles the following combinations: + * Delete a NI from a peer if both prim_nid and nid are provided. + * Delete a peer if only prim_nid is provided. + * Delete a peer if its primary nid is provided. + * + * The caller must hold ln_api_mutex. This prevents the peer from + * being modified/deleted by a different thread. + */ +int +lnet_del_peer_ni(lnet_nid_t prim_nid, lnet_nid_t nid) +{ + struct lnet_peer *lp; + struct lnet_peer_ni *lpni; + unsigned flags; + + if (prim_nid == LNET_NID_ANY) + return -EINVAL; + + lpni = lnet_find_peer_ni_locked(prim_nid); + if (!lpni) + return -ENOENT; + lnet_peer_ni_decref_locked(lpni); + lp = lpni->lpni_peer_net->lpn_peer; + + if (prim_nid != lnet_nid_to_nid4(&lp->lp_primary_nid)) { + CDEBUG(D_NET, "prim_nid %s is not primary for peer %s\n", + libcfs_nid2str(prim_nid), + libcfs_nidstr(&lp->lp_primary_nid)); + return -ENODEV; + } + + lnet_net_lock(LNET_LOCK_EX); + if (lp->lp_rtr_refcount > 0) { + lnet_net_unlock(LNET_LOCK_EX); + CERROR("%s is a router. 
Can not be deleted\n",
+		       libcfs_nid2str(prim_nid));
+		return -EBUSY;
+	}
+	lnet_net_unlock(LNET_LOCK_EX);
+
+	if (nid == LNET_NID_ANY || nid == lnet_nid_to_nid4(&lp->lp_primary_nid))
+		return lnet_peer_del(lp);
+
+	flags = LNET_PEER_CONFIGURED;
+	if (lp->lp_state & LNET_PEER_MULTI_RAIL)
+		flags |= LNET_PEER_MULTI_RAIL;
+
+	return lnet_peer_del_nid(lp, nid, flags);
+}
+
+void
+lnet_destroy_peer_ni_locked(struct kref *ref)
+{
+	struct lnet_peer_ni *lpni = container_of(ref, struct lnet_peer_ni,
+						 lpni_kref);
+	struct lnet_peer_table *ptable;
+	struct lnet_peer_net *lpn;
+
+	CDEBUG(D_NET, "%p nid %s\n", lpni, libcfs_nidstr(&lpni->lpni_nid));
+
+	LASSERT(kref_read(&lpni->lpni_kref) == 0);
+	LASSERT(list_empty(&lpni->lpni_txq));
+	LASSERT(lpni->lpni_txqnob == 0);
+	LASSERT(list_empty(&lpni->lpni_peer_nis));
+	LASSERT(list_empty(&lpni->lpni_on_remote_peer_ni_list));
+
+	lpn = lpni->lpni_peer_net;
+	lpni->lpni_peer_net = NULL;
+	lpni->lpni_net = NULL;
+
+	if (!list_empty(&lpni->lpni_hashlist)) {
+		/* remove the peer ni from the zombie list */
+		ptable = the_lnet.ln_peer_tables[lpni->lpni_cpt];
+		spin_lock(&ptable->pt_zombie_lock);
+		list_del_init(&lpni->lpni_hashlist);
+		ptable->pt_zombies--;
+		spin_unlock(&ptable->pt_zombie_lock);
+	}
+
+	if (lpni->lpni_pref_nnids > 1) {
+		struct lnet_nid_list *ne, *tmp;
+
+		list_for_each_entry_safe(ne, tmp, &lpni->lpni_pref.nids,
+					 nl_list) {
+			list_del_init(&ne->nl_list);
+			LIBCFS_FREE(ne, sizeof(*ne));
+		}
+	}
+	LIBCFS_FREE(lpni, sizeof(*lpni));
+
+	if (lpn)
+		lnet_peer_net_decref_locked(lpn);
+}
+
+struct lnet_peer_ni *
+lnet_nid2peerni_ex(struct lnet_nid *nid, int cpt)
+{
+	struct lnet_peer_ni *lpni = NULL;
+	int rc;
+
+	if (the_lnet.ln_state != LNET_STATE_RUNNING)
+		return ERR_PTR(-ESHUTDOWN);
+
+	/*
+	 * find if a peer_ni already exists.
+	 * If so then just return that.
+	 */
+	lpni = lnet_peer_ni_find_locked(nid);
+	if (lpni)
+		return lpni;
+
+	lnet_net_unlock(cpt);
+
+	rc = lnet_peer_ni_traffic_add(nid, NULL);
+	if (rc) {
+		lpni = ERR_PTR(rc);
+		goto out_net_relock;
+	}
+
+	lpni = lnet_peer_ni_find_locked(nid);
+	LASSERT(lpni);
+
+out_net_relock:
+	lnet_net_lock(cpt);
+
+	return lpni;
+}
+
+/*
+ * Get a peer_ni for the given nid, create it if necessary. Takes a
+ * hold on the peer_ni.
+ */
+struct lnet_peer_ni *
+lnet_peerni_by_nid_locked(struct lnet_nid *nid,
+			  struct lnet_nid *pref, int cpt)
+{
+	struct lnet_peer_ni *lpni = NULL;
+	int rc;
+
+	if (the_lnet.ln_state != LNET_STATE_RUNNING)
+		return ERR_PTR(-ESHUTDOWN);
+
+	/*
+	 * find if a peer_ni already exists.
+	 * If so then just return that.
+	 */
+	lpni = lnet_peer_ni_find_locked(nid);
+	if (lpni)
+		return lpni;
+
+	/*
+	 * Slow path:
+	 * use the lnet_api_mutex to serialize the creation of the peer_ni
+	 * and the creation/deletion of the local ni/net. When a local ni is
+	 * created, if there exists a set of peer_nis on that network,
+	 * they need to be traversed and updated. When a local NI is
+	 * deleted, which could result in a network being deleted, then
+	 * all peer nis on that network need to be removed as well.
+	 *
+	 * Creation through traffic should also be serialized with
+	 * creation through DLC.
+	 */
+	lnet_net_unlock(cpt);
+	mutex_lock(&the_lnet.ln_api_mutex);
+	/*
+	 * Shutdown is only set under the ln_api_lock, so a single
+	 * check here is sufficient.
+ */ + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + lpni = ERR_PTR(-ESHUTDOWN); + goto out_mutex_unlock; + } + + rc = lnet_peer_ni_traffic_add(nid, pref); + if (rc) { + lpni = ERR_PTR(rc); + goto out_mutex_unlock; + } + + lpni = lnet_peer_ni_find_locked(nid); + LASSERT(lpni); + +out_mutex_unlock: + mutex_unlock(&the_lnet.ln_api_mutex); + lnet_net_lock(cpt); + + /* Lock has been dropped, check again for shutdown. */ + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + if (!IS_ERR(lpni)) + lnet_peer_ni_decref_locked(lpni); + lpni = ERR_PTR(-ESHUTDOWN); + } + + return lpni; +} + +struct lnet_peer_ni * +lnet_nid2peerni_locked(lnet_nid_t nid4, lnet_nid_t pref4, int cpt) +{ + struct lnet_nid nid, pref; + + lnet_nid4_to_nid(nid4, &nid); + lnet_nid4_to_nid(pref4, &pref); + if (pref4 == LNET_NID_ANY) + return lnet_peerni_by_nid_locked(&nid, NULL, cpt); + else + return lnet_peerni_by_nid_locked(&nid, &pref, cpt); +} + +bool +lnet_peer_gw_discovery(struct lnet_peer *lp) +{ + bool rc = false; + + spin_lock(&lp->lp_lock); + if (lp->lp_state & LNET_PEER_RTR_DISCOVERY) + rc = true; + spin_unlock(&lp->lp_lock); + + return rc; +} + +bool +lnet_peer_is_uptodate(struct lnet_peer *lp) +{ + bool rc; + + spin_lock(&lp->lp_lock); + rc = lnet_peer_is_uptodate_locked(lp); + spin_unlock(&lp->lp_lock); + return rc; +} + +/* + * Is a peer uptodate from the point of view of discovery? + * + * If it is currently being processed, obviously not. + * A forced Ping or Push is also handled by the discovery thread. + * + * Otherwise look at whether the peer needs rediscovering. + */ +bool +lnet_peer_is_uptodate_locked(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + bool rc; + + if (lp->lp_state & (LNET_PEER_DISCOVERING | + LNET_PEER_FORCE_PING | + LNET_PEER_FORCE_PUSH)) { + rc = false; + } else if (lp->lp_state & LNET_PEER_REDISCOVER) { + rc = false; + } else if (lnet_peer_needs_push(lp)) { + rc = false; + } else if (lp->lp_state & LNET_PEER_DISCOVERED) { + if (lp->lp_state & LNET_PEER_NIDS_UPTODATE) + rc = true; + else + rc = false; + } else { + rc = false; + } + + return rc; +} + +/* Add the message to the peer's lp_dc_pendq and queue the peer for discovery */ +void +lnet_peer_queue_message(struct lnet_peer *lp, struct lnet_msg *msg) +{ + /* The discovery thread holds net_lock/EX and lp_lock when it splices + * the lp_dc_pendq onto a local list for resending. Thus, we do the same + * when adding to the list and queuing the peer to ensure that we do not + * strand any messages on the lp_dc_pendq. This scheme ensures the + * message will be resent even if the peer is already being discovered. + * Therefore we needn't check the return value of + * lnet_peer_queue_for_discovery(lp). + */ + lnet_net_lock(LNET_LOCK_EX); + spin_lock(&lp->lp_lock); + list_add_tail(&msg->msg_list, &lp->lp_dc_pendq); + spin_unlock(&lp->lp_lock); + lnet_peer_queue_for_discovery(lp); + lnet_net_unlock(LNET_LOCK_EX); +} + +/* + * Queue a peer for the attention of the discovery thread. Call with + * lnet_net_lock/EX held. Returns 0 if the peer was queued, and + * -EALREADY if the peer was already queued. 
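+ *
+ * A minimal caller sketch, mirroring lnet_peer_queue_message() above
+ * (assumes the caller already holds a reference on lp):
+ *
+ *	lnet_net_lock(LNET_LOCK_EX);
+ *	if (lnet_peer_queue_for_discovery(lp) == -EALREADY)
+ *		CDEBUG(D_NET, "peer already queued\n");
+ *	lnet_net_unlock(LNET_LOCK_EX);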
+ */ +static int lnet_peer_queue_for_discovery(struct lnet_peer *lp) +{ + int rc; + + spin_lock(&lp->lp_lock); + if (!(lp->lp_state & LNET_PEER_DISCOVERING)) + lp->lp_state |= LNET_PEER_DISCOVERING; + spin_unlock(&lp->lp_lock); + if (list_empty(&lp->lp_dc_list)) { + lnet_peer_addref_locked(lp); + list_add_tail(&lp->lp_dc_list, &the_lnet.ln_dc_request); + wake_up(&the_lnet.ln_dc_waitq); + rc = 0; + } else { + rc = -EALREADY; + } + + CDEBUG(D_NET, "Queue peer %s: %d\n", + libcfs_nidstr(&lp->lp_primary_nid), rc); + + return rc; +} + +/* + * Discovery of a peer is complete. Wake all waiters on the peer. + * Call with lnet_net_lock/EX held. + */ +static void lnet_peer_discovery_complete(struct lnet_peer *lp, int dc_error) +{ + struct lnet_msg *msg, *tmp; + int rc = 0; + LIST_HEAD(pending_msgs); + + CDEBUG(D_NET, "Discovery complete. Dequeue peer %s\n", + libcfs_nidstr(&lp->lp_primary_nid)); + + spin_lock(&lp->lp_lock); + /* Our caller dropped lp_lock which may have allowed another thread to + * set LNET_PEER_DISCOVERING, or it may be set if dc_error is non-zero. + * Ensure it is cleared. + */ + lp->lp_state &= ~LNET_PEER_DISCOVERING; + if (dc_error) { + lp->lp_dc_error = dc_error; + lp->lp_state |= LNET_PEER_REDISCOVER; + } + list_splice_init(&lp->lp_dc_pendq, &pending_msgs); + spin_unlock(&lp->lp_lock); + list_del_init(&lp->lp_dc_list); + wake_up(&lp->lp_dc_waitq); + + if (lp->lp_rtr_refcount > 0) + lnet_router_discovery_complete(lp); + + lnet_net_unlock(LNET_LOCK_EX); + + /* iterate through all pending messages and send them again */ + list_for_each_entry_safe(msg, tmp, &pending_msgs, msg_list) { + list_del_init(&msg->msg_list); + if (dc_error) { + lnet_finalize(msg, dc_error); + continue; + } + + CDEBUG(D_NET, "sending pending message %s to target %s\n", + lnet_msgtyp2str(msg->msg_type), + libcfs_idstr(&msg->msg_target)); + rc = lnet_send(&msg->msg_src_nid_param, msg, + &msg->msg_rtr_nid_param); + if (rc < 0) { + CNETERR("Error sending %s to %s: %d\n", + lnet_msgtyp2str(msg->msg_type), + libcfs_idstr(&msg->msg_target), rc); + lnet_finalize(msg, rc); + } + } + lnet_net_lock(LNET_LOCK_EX); + lnet_peer_decref_locked(lp); +} + +/* + * Handle inbound push. + * Like any event handler, called with lnet_res_lock/CPT held. + */ +void lnet_peer_push_event(struct lnet_event *ev) +{ + struct lnet_ping_buffer *pbuf; + struct lnet_peer *lp; + + pbuf = LNET_PING_INFO_TO_BUFFER(ev->md_start + ev->offset); + + /* lnet_find_peer() adds a refcount */ + lp = lnet_find_peer(&ev->source.nid); + if (!lp) { + CDEBUG(D_NET, "Push Put from unknown %s (source %s). Ignoring...\n", + libcfs_nidstr(&ev->initiator.nid), + libcfs_nidstr(&ev->source.nid)); + pbuf->pb_needs_post = true; + return; + } + + /* Ensure peer state remains consistent while we modify it. */ + spin_lock(&lp->lp_lock); + + /* + * If some kind of error happened the contents of the message + * cannot be used. Clear the NIDS_UPTODATE and set the + * FORCE_PING flag to trigger a ping. + */ + if (ev->status) { + lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; + lp->lp_state |= LNET_PEER_FORCE_PING; + CDEBUG(D_NET, "Push Put error %d from %s (source %s)\n", + ev->status, + libcfs_nidstr(&lp->lp_primary_nid), + libcfs_nidstr(&ev->source.nid)); + goto out; + } + + /* + * A push with invalid or corrupted info. Clear the UPTODATE + * flag to trigger a ping. 
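+	 * A non-zero return from lnet_ping_info_validate() below means
+	 * the buffer contents cannot be trusted.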
+ */ + if (lnet_ping_info_validate(&pbuf->pb_info)) { + lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; + lp->lp_state |= LNET_PEER_FORCE_PING; + CDEBUG(D_NET, "Corrupted Push from %s\n", + libcfs_nidstr(&lp->lp_primary_nid)); + goto out; + } + + /* + * Make sure we'll allocate the correct size ping buffer when + * pinging the peer. + */ + if (lp->lp_data_nnis < pbuf->pb_info.pi_nnis) + lp->lp_data_nnis = pbuf->pb_info.pi_nnis; + + /* + * A non-Multi-Rail peer is not supposed to be capable of + * sending a push. + */ + if (!(pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL)) { + CERROR("Push from non-Multi-Rail peer %s dropped\n", + libcfs_nidstr(&lp->lp_primary_nid)); + goto out; + } + + /* + * The peer may have discovery disabled at its end. Set + * NO_DISCOVERY as appropriate. + */ + if (!(pbuf->pb_info.pi_features & LNET_PING_FEAT_DISCOVERY)) { + CDEBUG(D_NET, "Peer %s has discovery disabled\n", + libcfs_nidstr(&lp->lp_primary_nid)); + /* + * Mark the peer for deletion if we already know about it + * and it's going from discovery set to no discovery set + */ + if (!(lp->lp_state & (LNET_PEER_NO_DISCOVERY | + LNET_PEER_DISCOVERING)) && + lp->lp_state & LNET_PEER_DISCOVERED) { + CDEBUG(D_NET, "Marking %s:0x%x for deletion\n", + libcfs_nidstr(&lp->lp_primary_nid), + lp->lp_state); + lp->lp_state |= LNET_PEER_MARK_DELETION; + } + lp->lp_state |= LNET_PEER_NO_DISCOVERY; + } else if (lp->lp_state & LNET_PEER_NO_DISCOVERY) { + CDEBUG(D_NET, "Peer %s has discovery enabled\n", + libcfs_nidstr(&lp->lp_primary_nid)); + lp->lp_state &= ~LNET_PEER_NO_DISCOVERY; + } + + /* + * Update the MULTI_RAIL flag based on the push. If the peer + * was configured with DLC then the setting should match what + * DLC put in. + * NB: We verified above that the MR feature bit is set in pi_features + */ + if (lp->lp_state & LNET_PEER_MULTI_RAIL) { + CDEBUG(D_NET, "peer %s(%p) is MR\n", + libcfs_nidstr(&lp->lp_primary_nid), lp); + } else if (lp->lp_state & LNET_PEER_CONFIGURED) { + CWARN("Push says %s is Multi-Rail, DLC says not\n", + libcfs_nidstr(&lp->lp_primary_nid)); + } else if (lnet_peer_discovery_disabled) { + CDEBUG(D_NET, "peer %s(%p) not MR: DD disabled locally\n", + libcfs_nidstr(&lp->lp_primary_nid), lp); + } else if (lp->lp_state & LNET_PEER_NO_DISCOVERY) { + CDEBUG(D_NET, "peer %s(%p) not MR: DD disabled remotely\n", + libcfs_nidstr(&lp->lp_primary_nid), lp); + } else { + CDEBUG(D_NET, "peer %s(%p) is MR capable\n", + libcfs_nidstr(&lp->lp_primary_nid), lp); + lp->lp_state |= LNET_PEER_MULTI_RAIL; + lnet_peer_clr_non_mr_pref_nids(lp); + } + + /* + * Check for truncation of the Put message. Clear the + * NIDS_UPTODATE flag and set FORCE_PING to trigger a ping, + * and tell discovery to allocate a bigger buffer. + */ + if (ev->mlength < ev->rlength) { + if (the_lnet.ln_push_target_nnis < pbuf->pb_info.pi_nnis) + the_lnet.ln_push_target_nnis = pbuf->pb_info.pi_nnis; + lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; + lp->lp_state |= LNET_PEER_FORCE_PING; + CDEBUG(D_NET, "Truncated Push from %s (%d nids)\n", + libcfs_nidstr(&lp->lp_primary_nid), + pbuf->pb_info.pi_nnis); + goto out; + } + + /* always assume new data */ + lp->lp_peer_seqno = LNET_PING_BUFFER_SEQNO(pbuf); + lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; + + /* + * If there is data present that hasn't been processed yet, + * we'll replace it if the Put contained newer data and it + * fits. We're racing with a Ping or earlier Push in this + * case. 
+ */ + if (lp->lp_state & LNET_PEER_DATA_PRESENT) { + if (LNET_PING_BUFFER_SEQNO(pbuf) > + LNET_PING_BUFFER_SEQNO(lp->lp_data) && + pbuf->pb_info.pi_nnis <= lp->lp_data->pb_nnis) { + memcpy(&lp->lp_data->pb_info, &pbuf->pb_info, + LNET_PING_INFO_SIZE(pbuf->pb_info.pi_nnis)); + CDEBUG(D_NET, "Ping/Push race from %s: %u vs %u\n", + libcfs_nidstr(&lp->lp_primary_nid), + LNET_PING_BUFFER_SEQNO(pbuf), + LNET_PING_BUFFER_SEQNO(lp->lp_data)); + } + goto out; + } + + /* + * Allocate a buffer to copy the data. On a failure we drop + * the Push and set FORCE_PING to force the discovery + * thread to fix the problem by pinging the peer. + */ + lp->lp_data = lnet_ping_buffer_alloc(lp->lp_data_nnis, GFP_ATOMIC); + if (!lp->lp_data) { + lp->lp_state |= LNET_PEER_FORCE_PING; + CDEBUG(D_NET, "Cannot allocate Push buffer for %s %u\n", + libcfs_nidstr(&lp->lp_primary_nid), + LNET_PING_BUFFER_SEQNO(pbuf)); + goto out; + } + + /* Success */ + unsafe_memcpy(&lp->lp_data->pb_info, &pbuf->pb_info, + LNET_PING_INFO_SIZE(pbuf->pb_info.pi_nnis), FLEXIBLE_OBJECT); + lp->lp_state |= LNET_PEER_DATA_PRESENT; + CDEBUG(D_NET, "Received Push %s %u\n", + libcfs_nidstr(&lp->lp_primary_nid), + LNET_PING_BUFFER_SEQNO(pbuf)); + +out: + /* We've processed this buffer. It can be reposted */ + pbuf->pb_needs_post = true; + + /* + * Queue the peer for discovery if not done, force it on the request + * queue and wake the discovery thread if the peer was already queued, + * because its status changed. + */ + spin_unlock(&lp->lp_lock); + lnet_net_lock(LNET_LOCK_EX); + if (!lnet_peer_is_uptodate(lp) && lnet_peer_queue_for_discovery(lp)) { + list_move(&lp->lp_dc_list, &the_lnet.ln_dc_request); + wake_up(&the_lnet.ln_dc_waitq); + } + /* Drop refcount from lookup */ + lnet_peer_decref_locked(lp); + lnet_net_unlock(LNET_LOCK_EX); +} + +/* + * Clear the discovery error state, unless we're already discovering + * this peer, in which case the error is current. + */ +static void lnet_peer_clear_discovery_error(struct lnet_peer *lp) +{ + spin_lock(&lp->lp_lock); + if (!(lp->lp_state & LNET_PEER_DISCOVERING)) + lp->lp_dc_error = 0; + spin_unlock(&lp->lp_lock); +} + +/* + * Peer discovery slow path. The ln_api_mutex is held on entry, and + * dropped/retaken within this function. An lnet_peer_ni is passed in + * because discovery could tear down an lnet_peer. + */ +int +lnet_discover_peer_locked(struct lnet_peer_ni *lpni, int cpt, bool block) +{ + DEFINE_WAIT(wait); + struct lnet_peer *lp; + int rc = 0; + int count = 0; + +again: + lnet_net_unlock(cpt); + lnet_net_lock(LNET_LOCK_EX); + lp = lpni->lpni_peer_net->lpn_peer; + lnet_peer_clear_discovery_error(lp); + + /* + * We're willing to be interrupted. The lpni can become a + * zombie if we race with DLC, so we must check for that. + */ + for (;;) { + /* Keep lp alive when the lnet_net_lock is unlocked */ + lnet_peer_addref_locked(lp); + prepare_to_wait(&lp->lp_dc_waitq, &wait, TASK_INTERRUPTIBLE); + if (signal_pending(current)) + break; + if (the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING) + break; + /* + * Don't repeat discovery if discovery is disabled. 
This is + * done to ensure we can use discovery as a standard ping as + * well for backwards compatibility with routers which do not + * have discovery or have discovery disabled + */ + if (lnet_is_discovery_disabled(lp) && count > 0) + break; + if (lp->lp_dc_error) + break; + if (lnet_peer_is_uptodate(lp)) + break; + if (lp->lp_state & LNET_PEER_MARK_DELETED) + break; + lnet_peer_queue_for_discovery(lp); + count++; + CDEBUG(D_NET, "Discovery attempt # %d\n", count); + + /* + * If caller requested a non-blocking operation then + * return immediately. Once discovery is complete any + * pending messages that were stopped due to discovery + * will be transmitted. + */ + if (!block) + break; + + lnet_net_unlock(LNET_LOCK_EX); + schedule(); + finish_wait(&lp->lp_dc_waitq, &wait); + lnet_net_lock(LNET_LOCK_EX); + lnet_peer_decref_locked(lp); + /* Peer may have changed */ + lp = lpni->lpni_peer_net->lpn_peer; + } + finish_wait(&lp->lp_dc_waitq, &wait); + + lnet_net_unlock(LNET_LOCK_EX); + lnet_net_lock(cpt); + lnet_peer_decref_locked(lp); + /* + * The peer may have changed, so re-check and rediscover if that turns + * out to have been the case. The reference count on lp ensured that + * even if it was unlinked from lpni the memory could not be recycled. + * Thus the check below is sufficient to determine whether the peer + * changed. If the peer changed, then lp must not be dereferenced. + */ + if (lp != lpni->lpni_peer_net->lpn_peer) + goto again; + + if (signal_pending(current)) + rc = -EINTR; + else if (the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING) + rc = -ESHUTDOWN; + else if (lp->lp_dc_error) + rc = lp->lp_dc_error; + else if (!block) + CDEBUG(D_NET, "non-blocking discovery\n"); + else if (!lnet_peer_is_uptodate(lp) && + !(lnet_is_discovery_disabled(lp) || + (lp->lp_state & LNET_PEER_MARK_DELETED))) + goto again; + + CDEBUG(D_NET, "peer %s NID %s: %d. %s\n", + (lp ? libcfs_nidstr(&lp->lp_primary_nid) : "(none)"), + libcfs_nidstr(&lpni->lpni_nid), rc, + (!block) ? "pending discovery" : "discovery complete"); + + return rc; +} + +/* Handle an incoming ack for a push. */ +static void +lnet_discovery_event_ack(struct lnet_peer *lp, struct lnet_event *ev) +{ + struct lnet_ping_buffer *pbuf; + + pbuf = LNET_PING_INFO_TO_BUFFER(ev->md_start); + spin_lock(&lp->lp_lock); + lp->lp_state &= ~LNET_PEER_PUSH_SENT; + lp->lp_push_error = ev->status; + if (ev->status) + lp->lp_state |= LNET_PEER_PUSH_FAILED; + else + lp->lp_node_seqno = LNET_PING_BUFFER_SEQNO(pbuf); + spin_unlock(&lp->lp_lock); + + CDEBUG(D_NET, "peer %s ev->status %d\n", + libcfs_nidstr(&lp->lp_primary_nid), ev->status); +} + +/* Handle a Reply message. This is the reply to a Ping message. */ +static void +lnet_discovery_event_reply(struct lnet_peer *lp, struct lnet_event *ev) +{ + struct lnet_ping_buffer *pbuf; + int rc; + + spin_lock(&lp->lp_lock); + + lp->lp_disc_src_nid = ev->target.nid; + lp->lp_disc_dst_nid = ev->source.nid; + + /* + * If some kind of error happened the contents of message + * cannot be used. Set PING_FAILED to trigger a retry. + */ + if (ev->status) { + lp->lp_state |= LNET_PEER_PING_FAILED; + lp->lp_ping_error = ev->status; + CDEBUG(D_NET, "Ping Reply error %d from %s (source %s)\n", + ev->status, + libcfs_nidstr(&lp->lp_primary_nid), + libcfs_nidstr(&ev->source.nid)); + goto out; + } + + pbuf = LNET_PING_INFO_TO_BUFFER(ev->md_start); + if (pbuf->pb_info.pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) + lnet_swap_pinginfo(pbuf); + + /* + * A reply with invalid or corrupted info. 
Set PING_FAILED to + * trigger a retry. + */ + rc = lnet_ping_info_validate(&pbuf->pb_info); + if (rc) { + lp->lp_state |= LNET_PEER_PING_FAILED; + lp->lp_ping_error = 0; + CDEBUG(D_NET, "Corrupted Ping Reply from %s: %d\n", + libcfs_nidstr(&lp->lp_primary_nid), rc); + goto out; + } + + /* + * The peer may have discovery disabled at its end. Set + * NO_DISCOVERY as appropriate. + */ + if (!(pbuf->pb_info.pi_features & LNET_PING_FEAT_DISCOVERY) || + lnet_peer_discovery_disabled) { + CDEBUG(D_NET, "Peer %s has discovery disabled\n", + libcfs_nidstr(&lp->lp_primary_nid)); + + /* Detect whether this peer has toggled discovery from on to + * off and whether we can delete and re-create the peer. Peers + * that were manually configured cannot be deleted by discovery. + * We need to delete this peer and re-create it if the peer was + * not configured manually, is currently considered DD capable, + * and either: + * 1. We've already discovered the peer (the peer has toggled + * the discovery feature from on to off), or + * 2. The peer is considered MR, but it was not user configured + * (this was a "temporary" peer created via the kernel APIs + * that we're discovering for the first time) + */ + if (!(lp->lp_state & (LNET_PEER_CONFIGURED | + LNET_PEER_NO_DISCOVERY)) && + (lp->lp_state & (LNET_PEER_DISCOVERED | + LNET_PEER_MULTI_RAIL))) { + CDEBUG(D_NET, "Marking %s:0x%x for deletion\n", + libcfs_nidstr(&lp->lp_primary_nid), + lp->lp_state); + lp->lp_state |= LNET_PEER_MARK_DELETION; + } + lp->lp_state |= LNET_PEER_NO_DISCOVERY; + } else { + CDEBUG(D_NET, "Peer %s has discovery enabled\n", + libcfs_nidstr(&lp->lp_primary_nid)); + lp->lp_state &= ~LNET_PEER_NO_DISCOVERY; + } + + /* + * Update the MULTI_RAIL flag based on the reply. If the peer + * was configured with DLC then the setting should match what + * DLC put in. + */ + if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL) { + if (lp->lp_state & LNET_PEER_MULTI_RAIL) { + CDEBUG(D_NET, "peer %s(%p) is MR\n", + libcfs_nidstr(&lp->lp_primary_nid), lp); + } else if (lp->lp_state & LNET_PEER_CONFIGURED) { + CWARN("Reply says %s is Multi-Rail, DLC says not\n", + libcfs_nidstr(&lp->lp_primary_nid)); + } else if (lnet_peer_discovery_disabled) { + CDEBUG(D_NET, + "peer %s(%p) not MR: DD disabled locally\n", + libcfs_nidstr(&lp->lp_primary_nid), lp); + } else if (lp->lp_state & LNET_PEER_NO_DISCOVERY) { + CDEBUG(D_NET, + "peer %s(%p) not MR: DD disabled remotely\n", + libcfs_nidstr(&lp->lp_primary_nid), lp); + } else { + CDEBUG(D_NET, "peer %s(%p) is MR capable\n", + libcfs_nidstr(&lp->lp_primary_nid), lp); + lp->lp_state |= LNET_PEER_MULTI_RAIL; + lnet_peer_clr_non_mr_pref_nids(lp); + } + } else if (lp->lp_state & LNET_PEER_MULTI_RAIL) { + if (lp->lp_state & LNET_PEER_CONFIGURED) { + CWARN("DLC says %s is Multi-Rail, Reply says not\n", + libcfs_nidstr(&lp->lp_primary_nid)); + } else { + CERROR("Multi-Rail state vanished from %s\n", + libcfs_nidstr(&lp->lp_primary_nid)); + lp->lp_state &= ~LNET_PEER_MULTI_RAIL; + } + } + + /* + * Make sure we'll allocate the correct size ping buffer when + * pinging the peer. + */ + if (lp->lp_data_nnis < pbuf->pb_info.pi_nnis) + lp->lp_data_nnis = pbuf->pb_info.pi_nnis; + + /* + * Check for truncation of the Reply. Clear PING_SENT and set + * PING_FAILED to trigger a retry. 
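+ * (Illustrative numbers only: a reply buffer sized for pb_nnis == 16 + * receiving pi_nnis == 32 is truncated; lp_data_nnis was already raised + * above, so the retried ping will allocate room for all 32 entries.)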
+ */ + if (pbuf->pb_nnis < pbuf->pb_info.pi_nnis) { + if (the_lnet.ln_push_target_nnis < pbuf->pb_info.pi_nnis) + the_lnet.ln_push_target_nnis = pbuf->pb_info.pi_nnis; + lp->lp_state |= LNET_PEER_PING_FAILED; + lp->lp_ping_error = 0; + CDEBUG(D_NET, "Truncated Reply from %s (%d nids)\n", + libcfs_nidstr(&lp->lp_primary_nid), + pbuf->pb_info.pi_nnis); + goto out; + } + + /* + * Check the sequence numbers in the reply. These are only + * available if the reply came from a Multi-Rail peer. + */ + if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL && + pbuf->pb_info.pi_nnis > 1 && + lnet_nid_to_nid4(&lp->lp_primary_nid) == + pbuf->pb_info.pi_ni[1].ns_nid) { + if (LNET_PING_BUFFER_SEQNO(pbuf) < lp->lp_peer_seqno) + CDEBUG(D_NET, "peer %s: seq# got %u have %u. peer rebooted?\n", + libcfs_nidstr(&lp->lp_primary_nid), + LNET_PING_BUFFER_SEQNO(pbuf), + lp->lp_peer_seqno); + + lp->lp_peer_seqno = LNET_PING_BUFFER_SEQNO(pbuf); + } + + /* We're happy with the state of the data in the buffer. */ + CDEBUG(D_NET, "peer %s data present %u. state = 0x%x\n", + libcfs_nidstr(&lp->lp_primary_nid), lp->lp_peer_seqno, + lp->lp_state); + if (lp->lp_state & LNET_PEER_DATA_PRESENT) + lnet_ping_buffer_decref(lp->lp_data); + else + lp->lp_state |= LNET_PEER_DATA_PRESENT; + lnet_ping_buffer_addref(pbuf); + lp->lp_data = pbuf; +out: + lp->lp_state &= ~LNET_PEER_PING_SENT; + spin_unlock(&lp->lp_lock); + + lnet_net_lock(LNET_LOCK_EX); + /* + * If this peer is a gateway, call the routing callback to + * handle the ping reply + */ + if (lp->lp_rtr_refcount > 0) + lnet_router_discovery_ping_reply(lp); + lnet_net_unlock(LNET_LOCK_EX); +} + +/* + * Send event handling. Only matters for error cases, where we clean + * up state on the peer and peer_ni that would otherwise be updated in + * the REPLY event handler for a successful Ping, and the ACK event + * handler for a successful Push. + */ +static int +lnet_discovery_event_send(struct lnet_peer *lp, struct lnet_event *ev) +{ + int rc = 0; + + if (!ev->status) + goto out; + + spin_lock(&lp->lp_lock); + if (ev->msg_type == LNET_MSG_GET) { + lp->lp_state &= ~LNET_PEER_PING_SENT; + lp->lp_state |= LNET_PEER_PING_FAILED; + lp->lp_ping_error = ev->status; + } else { /* ev->msg_type == LNET_MSG_PUT */ + lp->lp_state &= ~LNET_PEER_PUSH_SENT; + lp->lp_state |= LNET_PEER_PUSH_FAILED; + lp->lp_push_error = ev->status; + } + spin_unlock(&lp->lp_lock); + rc = LNET_REDISCOVER_PEER; +out: + CDEBUG(D_NET, "%s Send to %s: %d\n", + (ev->msg_type == LNET_MSG_GET ? "Ping" : "Push"), + libcfs_nidstr(&ev->target.nid), rc); + return rc; +} + +/* + * Unlink event handling. This event is only seen if a call to + * LNetMDUnlink() caused the event to be unlinked. If this call was + * made after the event was set up in LNetGet() or LNetPut() then we + * assume the Ping or Push timed out. 
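+ * (This can happen, for example, when discovery is cancelled during + * shutdown or peer deletion while a Ping or Push is still in flight; + * the handler below then records -ETIMEDOUT for the retry logic.)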
+ */ +static void +lnet_discovery_event_unlink(struct lnet_peer *lp, struct lnet_event *ev) +{ + spin_lock(&lp->lp_lock); + /* We've passed through LNetGet() */ + if (lp->lp_state & LNET_PEER_PING_SENT) { + lp->lp_state &= ~LNET_PEER_PING_SENT; + lp->lp_state |= LNET_PEER_PING_FAILED; + lp->lp_ping_error = -ETIMEDOUT; + CDEBUG(D_NET, "Ping Unlink for message to peer %s\n", + libcfs_nidstr(&lp->lp_primary_nid)); + } + /* We've passed through LNetPut() */ + if (lp->lp_state & LNET_PEER_PUSH_SENT) { + lp->lp_state &= ~LNET_PEER_PUSH_SENT; + lp->lp_state |= LNET_PEER_PUSH_FAILED; + lp->lp_push_error = -ETIMEDOUT; + CDEBUG(D_NET, "Push Unlink for message to peer %s\n", + libcfs_nidstr(&lp->lp_primary_nid)); + } + spin_unlock(&lp->lp_lock); +} + +/* + * Event handler for the discovery EQ. + * + * Called with lnet_res_lock(cpt) held. The cpt is the + * lnet_cpt_of_cookie() of the md handle cookie. + */ +static void lnet_discovery_event_handler(struct lnet_event *event) +{ + struct lnet_peer *lp = event->md_user_ptr; + struct lnet_ping_buffer *pbuf; + int rc; + + /* discovery needs to take another look */ + rc = LNET_REDISCOVER_PEER; + + CDEBUG(D_NET, "Received event: %d\n", event->type); + + switch (event->type) { + case LNET_EVENT_ACK: + lnet_discovery_event_ack(lp, event); + break; + case LNET_EVENT_REPLY: + lnet_discovery_event_reply(lp, event); + break; + case LNET_EVENT_SEND: + /* Only send failure triggers a retry. */ + rc = lnet_discovery_event_send(lp, event); + break; + case LNET_EVENT_UNLINK: + /* LNetMDUnlink() was called */ + lnet_discovery_event_unlink(lp, event); + break; + default: + /* Invalid events. */ + LBUG(); + } + lnet_net_lock(LNET_LOCK_EX); + if (event->unlinked) { + pbuf = LNET_PING_INFO_TO_BUFFER(event->md_start); + lnet_ping_buffer_decref(pbuf); + lnet_peer_decref_locked(lp); + } + + /* put peer back at end of request queue, if discovery not already + * done */ + if (rc == LNET_REDISCOVER_PEER && !lnet_peer_is_uptodate(lp) && + lnet_peer_queue_for_discovery(lp)) { + list_move_tail(&lp->lp_dc_list, &the_lnet.ln_dc_request); + wake_up(&the_lnet.ln_dc_waitq); + } + lnet_net_unlock(LNET_LOCK_EX); +} + +/* + * Build a peer from incoming data. + * + * The NIDs in the incoming data are supposed to be structured as follows: + * - loopback + * - primary NID + * - other NIDs in same net + * - NIDs in second net + * - NIDs in third net + * - ... + * This is due to the way the list of NIDs in the data is created. + * + * Note that this function will mark the peer uptodate unless an + * ENOMEM is encountered. All other errors are due to a conflict + * between the DLC configuration and what discovery sees. We treat DLC + * as binding, and therefore set the NIDS_UPTODATE flag to prevent the + * peer from becoming stuck in discovery. + */ +static int lnet_peer_merge_data(struct lnet_peer *lp, + struct lnet_ping_buffer *pbuf) +{ + struct lnet_peer_net *lpn; + struct lnet_peer_ni *lpni; + lnet_nid_t *curnis = NULL; + struct lnet_ni_status *addnis = NULL; + lnet_nid_t *delnis = NULL; + unsigned flags; + int ncurnis; + int naddnis; + int ndelnis; + int nnis = 0; + int i; + int j; + int rc; + + flags = LNET_PEER_DISCOVERED; + if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL) + flags |= LNET_PEER_MULTI_RAIL; + + /* + * Cache the routing feature for the peer; whether it is enabled + * or disabled as reported by the remote peer. 
+ */ + spin_lock(&lp->lp_lock); + if (!(pbuf->pb_info.pi_features & LNET_PING_FEAT_RTE_DISABLED)) + lp->lp_state |= LNET_PEER_ROUTER_ENABLED; + else + lp->lp_state &= ~LNET_PEER_ROUTER_ENABLED; + spin_unlock(&lp->lp_lock); + + nnis = max_t(int, lp->lp_nnis, pbuf->pb_info.pi_nnis); + CFS_ALLOC_PTR_ARRAY(curnis, nnis); + CFS_ALLOC_PTR_ARRAY(addnis, nnis); + CFS_ALLOC_PTR_ARRAY(delnis, nnis); + if (!curnis || !addnis || !delnis) { + rc = -ENOMEM; + goto out; + } + ncurnis = 0; + naddnis = 0; + ndelnis = 0; + + /* Construct the list of NIDs present in peer. */ + lpni = NULL; + while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) + curnis[ncurnis++] = lnet_nid_to_nid4(&lpni->lpni_nid); + + /* + * Check for NIDs in pbuf not present in curnis[]. + * The loop starts at 1 to skip the loopback NID. + */ + for (i = 1; i < pbuf->pb_info.pi_nnis; i++) { + for (j = 0; j < ncurnis; j++) + if (pbuf->pb_info.pi_ni[i].ns_nid == curnis[j]) + break; + if (j == ncurnis) + addnis[naddnis++] = pbuf->pb_info.pi_ni[i]; + } + /* + * Check for NIDs in curnis[] not present in pbuf. + * The nested loop starts at 1 to skip the loopback NID. + * + * But never add the loopback NID to delnis[]: if it is + * present in curnis[] then this peer is for this node. + */ + for (i = 0; i < ncurnis; i++) { + if (curnis[i] == LNET_NID_LO_0) + continue; + for (j = 1; j < pbuf->pb_info.pi_nnis; j++) { + if (curnis[i] == pbuf->pb_info.pi_ni[j].ns_nid) { + /* + * update the information we cache for the + * peer with the latest information we + * received + */ + lpni = lnet_find_peer_ni_locked(curnis[i]); + if (lpni) { + lpni->lpni_ns_status = pbuf->pb_info.pi_ni[j].ns_status; + lnet_peer_ni_decref_locked(lpni); + } + break; + } + } + if (j == pbuf->pb_info.pi_nnis) + delnis[ndelnis++] = curnis[i]; + } + + /* + * If we get here and the discovery is disabled then we don't want + * to add or delete any NIs. We just updated the ones we have some + * information on, and call it a day + */ + rc = 0; + if (lnet_is_discovery_disabled(lp)) + goto out; + + for (i = 0; i < naddnis; i++) { + rc = lnet_peer_add_nid(lp, addnis[i].ns_nid, flags); + if (rc) { + CERROR("Error adding NID %s to peer %s: %d\n", + libcfs_nid2str(addnis[i].ns_nid), + libcfs_nidstr(&lp->lp_primary_nid), rc); + if (rc == -ENOMEM) + goto out; + } + lpni = lnet_find_peer_ni_locked(addnis[i].ns_nid); + if (lpni) { + lpni->lpni_ns_status = addnis[i].ns_status; + lnet_peer_ni_decref_locked(lpni); + } + } + + for (i = 0; i < ndelnis; i++) { + /* + * for routers it's okay to delete the primary_nid because + * the upper layers don't really rely on it. So if we're + * being told that the router changed its primary_nid + * then it's okay to delete it. + */ + if (lp->lp_rtr_refcount > 0) + flags |= LNET_PEER_RTR_NI_FORCE_DEL; + rc = lnet_peer_del_nid(lp, delnis[i], flags); + if (rc) { + CERROR("Error deleting NID %s from peer %s: %d\n", + libcfs_nid2str(delnis[i]), + libcfs_nidstr(&lp->lp_primary_nid), rc); + if (rc == -ENOMEM) + goto out; + } + } + + /* The peer net for the primary NID should be the first entry in the + * peer's lp_peer_nets list, and the peer NI for the primary NID should + * be the first entry in its peer net's lpn_peer_nis list. 
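+ * + * For example, with hypothetical NIDs, after the list_move() calls below + * a peer whose primary NID is 10.0.0.1@tcp would be ordered as: + * + * lp_peer_nets: the tcp net first, then any other nets + * lpn_peer_nis: 10.0.0.1@tcp first, then 10.0.0.2@tcp, ...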
+ */ + lpni = lnet_find_peer_ni_locked(pbuf->pb_info.pi_ni[1].ns_nid); + if (!lpni) { + CERROR("Internal error: Failed to lookup peer NI for primary NID: %s\n", + libcfs_nid2str(pbuf->pb_info.pi_ni[1].ns_nid)); + goto out; + } + + lnet_peer_ni_decref_locked(lpni); + + lpn = lpni->lpni_peer_net; + if (lpn->lpn_peer_nets.prev != &lp->lp_peer_nets) + list_move(&lpn->lpn_peer_nets, &lp->lp_peer_nets); + + if (lpni->lpni_peer_nis.prev != &lpni->lpni_peer_net->lpn_peer_nis) + list_move(&lpni->lpni_peer_nis, + &lpni->lpni_peer_net->lpn_peer_nis); + + /* + * Errors other than -ENOMEM are due to peers having been + * configured with DLC. Ignore these because DLC overrides + * Discovery. + */ + rc = 0; +out: + CFS_FREE_PTR_ARRAY(curnis, nnis); + CFS_FREE_PTR_ARRAY(addnis, nnis); + CFS_FREE_PTR_ARRAY(delnis, nnis); + lnet_ping_buffer_decref(pbuf); + CDEBUG(D_NET, "peer %s (%p): %d\n", + libcfs_nidstr(&lp->lp_primary_nid), lp, rc); + + if (rc) { + spin_lock(&lp->lp_lock); + lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; + lp->lp_state |= LNET_PEER_FORCE_PING; + spin_unlock(&lp->lp_lock); + } + return rc; +} + +/* + * The data in pbuf says lp is its primary peer, but the data was + * received by a different peer. Try to update lp with the data. + */ +static int +lnet_peer_set_primary_data(struct lnet_peer *lp, struct lnet_ping_buffer *pbuf) +{ + struct lnet_handle_md mdh; + + /* Queue lp for discovery, and force it on the request queue. */ + lnet_net_lock(LNET_LOCK_EX); + if (lnet_peer_queue_for_discovery(lp)) + list_move(&lp->lp_dc_list, &the_lnet.ln_dc_request); + lnet_net_unlock(LNET_LOCK_EX); + + LNetInvalidateMDHandle(&mdh); + + /* + * Decide whether we can move the peer to the DATA_PRESENT state. + * + * We replace stale data for a multi-rail peer, repair PING_FAILED + * status, and preempt FORCE_PING. + * + * If after that we have DATA_PRESENT, we merge it into this peer. + */ + spin_lock(&lp->lp_lock); + if (lp->lp_state & LNET_PEER_MULTI_RAIL) { + if (lp->lp_peer_seqno < LNET_PING_BUFFER_SEQNO(pbuf)) { + lp->lp_peer_seqno = LNET_PING_BUFFER_SEQNO(pbuf); + } else if (lp->lp_state & LNET_PEER_DATA_PRESENT) { + lp->lp_state &= ~LNET_PEER_DATA_PRESENT; + lnet_ping_buffer_decref(pbuf); + pbuf = lp->lp_data; + lp->lp_data = NULL; + } + } + if (lp->lp_state & LNET_PEER_DATA_PRESENT) { + lnet_ping_buffer_decref(lp->lp_data); + lp->lp_data = NULL; + lp->lp_state &= ~LNET_PEER_DATA_PRESENT; + } + if (lp->lp_state & LNET_PEER_PING_FAILED) { + mdh = lp->lp_ping_mdh; + LNetInvalidateMDHandle(&lp->lp_ping_mdh); + lp->lp_state &= ~LNET_PEER_PING_FAILED; + lp->lp_ping_error = 0; + } + if (lp->lp_state & LNET_PEER_FORCE_PING) + lp->lp_state &= ~LNET_PEER_FORCE_PING; + lp->lp_state |= LNET_PEER_NIDS_UPTODATE; + spin_unlock(&lp->lp_lock); + + if (!LNetMDHandleIsInvalid(mdh)) + LNetMDUnlink(mdh); + + if (pbuf) + return lnet_peer_merge_data(lp, pbuf); + + CDEBUG(D_NET, "peer %s\n", libcfs_nidstr(&lp->lp_primary_nid)); + return 0; +} + +static bool lnet_is_nid_in_ping_info(lnet_nid_t nid, struct lnet_ping_info *pinfo) +{ + int i; + + for (i = 0; i < pinfo->pi_nnis; i++) { + if (pinfo->pi_ni[i].ns_nid == nid) + return true; + } + + return false; +} + +/* Delete a peer that has been marked for deletion. NB: when this peer was added + * to the discovery queue a reference was taken that will prevent the peer from + * actually being freed by this function. 
After this function exits the + * discovery thread should call lnet_peer_discovery_complete() which will + * drop that reference as well as wake any waiters that may also be holding a + * ref on the peer. + */ +static int lnet_peer_deletion(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + struct list_head rlist; + struct lnet_route *route, *tmp; + int sensitivity = lp->lp_health_sensitivity; + int rc = 0; + + INIT_LIST_HEAD(&rlist); + + CDEBUG(D_NET, "peer %s(%p) state %#x\n", + libcfs_nidstr(&lp->lp_primary_nid), lp, lp->lp_state); + + /* no-op if lnet_peer_del() has already been called on this peer */ + if (lp->lp_state & LNET_PEER_MARK_DELETED) + goto clear_discovering; + + spin_unlock(&lp->lp_lock); + + mutex_lock(&the_lnet.ln_api_mutex); + if (the_lnet.ln_state != LNET_STATE_RUNNING || + the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING) { + mutex_unlock(&the_lnet.ln_api_mutex); + spin_lock(&lp->lp_lock); + rc = -ESHUTDOWN; + goto clear_discovering; + } + + lnet_peer_cancel_discovery(lp); + lnet_net_lock(LNET_LOCK_EX); + list_for_each_entry_safe(route, tmp, + &lp->lp_routes, + lr_gwlist) + lnet_move_route(route, NULL, &rlist); + + /* lnet_peer_del_locked() deletes all the peer NIs owned by this peer */ + rc = lnet_peer_del_locked(lp); + if (rc) + CNETERR("Internal error: Unable to delete peer %s rc %d\n", + libcfs_nidstr(&lp->lp_primary_nid), rc); + + lnet_net_unlock(LNET_LOCK_EX); + + list_for_each_entry_safe(route, tmp, + &rlist, lr_list) { + /* re-add these routes */ + lnet_add_route(route->lr_net, + route->lr_hops, + &route->lr_nid, + route->lr_priority, + sensitivity); + LIBCFS_FREE(route, sizeof(*route)); + } + + mutex_unlock(&the_lnet.ln_api_mutex); + + spin_lock(&lp->lp_lock); + + rc = 0; + +clear_discovering: + lp->lp_state &= ~(LNET_PEER_DISCOVERING | LNET_PEER_FORCE_PING | + LNET_PEER_FORCE_PUSH); + + return rc; +} + +/* + * Update a peer using the data received. + */ +static int lnet_peer_data_present(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + struct lnet_ping_buffer *pbuf; + struct lnet_peer_ni *lpni; + lnet_nid_t nid = LNET_NID_ANY; + unsigned flags; + int rc = 0; + + pbuf = lp->lp_data; + lp->lp_data = NULL; + lp->lp_state &= ~LNET_PEER_DATA_PRESENT; + lp->lp_state |= LNET_PEER_NIDS_UPTODATE; + spin_unlock(&lp->lp_lock); + + /* + * Modifications of peer structures are done while holding the + * ln_api_mutex. A global lock is required because we may be + * modifying multiple peer structures, and a mutex greatly + * simplifies memory management. + * + * The actual changes to the data structures must also protect + * against concurrent lookups, for which the lnet_net_lock in + * LNET_LOCK_EX mode is used. + */ + mutex_lock(&the_lnet.ln_api_mutex); + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + rc = -ESHUTDOWN; + goto out; + } + + /* + * If this peer is not on the peer list then it is being torn + * down, and our reference count may be all that is keeping it + * alive. Don't do any work on it. + */ + if (list_empty(&lp->lp_peer_list)) + goto out; + + flags = LNET_PEER_DISCOVERED; + if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL) + flags |= LNET_PEER_MULTI_RAIL; + + /* + * Check whether the primary NID in the message matches the + * primary NID of the peer. If it does, update the peer, if + * it does not, check whether there is already a peer with + * that primary NID. If no such peer exists, try to update + * the primary NID of the current peer (allowed if it was + * created due to message traffic) and complete the update. 
+ * If the peer did exist, hand off the data to it. + * + * The peer for the loopback interface is a special case: this + * is the peer for the local node, and we want to set its + * primary NID to the correct value here. Moreover, this peer + * can show up with only the loopback NID in the ping buffer. + */ + if (pbuf->pb_info.pi_nnis <= 1) { + lnet_ping_buffer_decref(pbuf); + goto out; + } + nid = pbuf->pb_info.pi_ni[1].ns_nid; + if (nid_is_lo0(&lp->lp_primary_nid)) { + rc = lnet_peer_set_primary_nid(lp, nid, flags); + if (!rc) + rc = lnet_peer_merge_data(lp, pbuf); + /* + * if the primary nid of the peer is present in the ping info returned + * from the peer, but it's not the local primary peer we have + * cached and discovery is disabled, then we don't want to update + * our local peer info, by adding or removing NIDs, we just want + * to update the status of the nids that we currently have + * recorded in that peer. + */ + } else if (lnet_nid_to_nid4(&lp->lp_primary_nid) == nid || + (lnet_is_nid_in_ping_info(lnet_nid_to_nid4(&lp->lp_primary_nid), + &pbuf->pb_info) && + lnet_is_discovery_disabled(lp))) { + rc = lnet_peer_merge_data(lp, pbuf); + } else { + lpni = lnet_find_peer_ni_locked(nid); + if (!lpni || lp == lpni->lpni_peer_net->lpn_peer) { + rc = lnet_peer_set_primary_nid(lp, nid, flags); + if (rc) { + CERROR("Primary NID error %s versus %s: %d\n", + libcfs_nidstr(&lp->lp_primary_nid), + libcfs_nid2str(nid), rc); + } else { + rc = lnet_peer_merge_data(lp, pbuf); + } + if (lpni) + lnet_peer_ni_decref_locked(lpni); + } else { + struct lnet_peer *new_lp; + new_lp = lpni->lpni_peer_net->lpn_peer; + /* + * if lp has discovery/MR enabled that means new_lp + * should have discovery/MR enabled as well, since + * it's the same peer, which we're about to merge + */ + spin_lock(&lp->lp_lock); + spin_lock(&new_lp->lp_lock); + if (!(lp->lp_state & LNET_PEER_NO_DISCOVERY)) + new_lp->lp_state &= ~LNET_PEER_NO_DISCOVERY; + if (lp->lp_state & LNET_PEER_MULTI_RAIL) + new_lp->lp_state |= LNET_PEER_MULTI_RAIL; + /* If we're processing a ping reply then we may be + * about to send a push to the peer that we ping'd. + * Since the ping reply that we're processing was + * received by lp, we need to set the discovery source + * NID for new_lp to the NID stored in lp. + */ + if (!LNET_NID_IS_ANY(&lp->lp_disc_src_nid)) { + new_lp->lp_disc_src_nid = lp->lp_disc_src_nid; + new_lp->lp_disc_dst_nid = lp->lp_disc_dst_nid; + } + spin_unlock(&new_lp->lp_lock); + spin_unlock(&lp->lp_lock); + + rc = lnet_peer_set_primary_data(new_lp, pbuf); + lnet_consolidate_routes_locked(lp, new_lp); + lnet_peer_ni_decref_locked(lpni); + } + } +out: + CDEBUG(D_NET, "peer %s(%p): %d. state = 0x%x\n", + libcfs_nidstr(&lp->lp_primary_nid), lp, rc, + lp->lp_state); + mutex_unlock(&the_lnet.ln_api_mutex); + + spin_lock(&lp->lp_lock); + /* Tell discovery to re-check the peer immediately. */ + if (!rc) + rc = LNET_REDISCOVER_PEER; + return rc; +} + +/* + * A ping failed. Clear the PING_FAILED state and set the + * FORCE_PING state, to ensure a retry even if discovery is + * disabled. This avoids being left with incorrect state. 
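+ * + * (Flow, for reference: the discovery loop sees PING_FAILED and calls + * this function, which converts it to FORCE_PING, so a later pass + * selects lnet_peer_send_ping() again.)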
+ */ +static int lnet_peer_ping_failed(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + struct lnet_handle_md mdh; + int rc; + + mdh = lp->lp_ping_mdh; + LNetInvalidateMDHandle(&lp->lp_ping_mdh); + lp->lp_state &= ~LNET_PEER_PING_FAILED; + lp->lp_state |= LNET_PEER_FORCE_PING; + rc = lp->lp_ping_error; + lp->lp_ping_error = 0; + spin_unlock(&lp->lp_lock); + + if (!LNetMDHandleIsInvalid(mdh)) + LNetMDUnlink(mdh); + + CDEBUG(D_NET, "peer %s:%d\n", + libcfs_nidstr(&lp->lp_primary_nid), rc); + + spin_lock(&lp->lp_lock); + return rc ? rc : LNET_REDISCOVER_PEER; +} + +/* Active side of ping. */ +static int lnet_peer_send_ping(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + int nnis; + int rc; + int cpt; + + lp->lp_state |= LNET_PEER_PING_SENT; + lp->lp_state &= ~LNET_PEER_FORCE_PING; + spin_unlock(&lp->lp_lock); + + cpt = lnet_net_lock_current(); + /* Refcount for MD. */ + lnet_peer_addref_locked(lp); + lnet_net_unlock(cpt); + + nnis = max(lp->lp_data_nnis, LNET_INTERFACES_MIN); + + rc = lnet_send_ping(&lp->lp_primary_nid, &lp->lp_ping_mdh, nnis, lp, + the_lnet.ln_dc_handler, false); + + /* + * if LNetMDBind in lnet_send_ping fails we need to decrement the + * refcount on the peer, otherwise LNetMDUnlink will be called + * which will eventually do that. + */ + if (rc > 0) { + lnet_net_lock(cpt); + lnet_peer_decref_locked(lp); + lnet_net_unlock(cpt); + rc = -rc; /* change the rc to negative value */ + goto fail_error; + } else if (rc < 0) { + goto fail_error; + } + + CDEBUG(D_NET, "peer %s\n", libcfs_nidstr(&lp->lp_primary_nid)); + + spin_lock(&lp->lp_lock); + return 0; + +fail_error: + CDEBUG(D_NET, "peer %s: %d\n", libcfs_nidstr(&lp->lp_primary_nid), rc); + /* + * The errors that get us here are considered hard errors and + * cause Discovery to terminate. So we clear PING_SENT, but do + * not set either PING_FAILED or FORCE_PING. In fact we need + * to clear PING_FAILED, because the unlink event handler will + * have set it if we called LNetMDUnlink() above. + */ + spin_lock(&lp->lp_lock); + lp->lp_state &= ~(LNET_PEER_PING_SENT | LNET_PEER_PING_FAILED); + return rc; +} + +/* + * This function exists because you cannot call LNetMDUnlink() from an + * event handler. + */ +static int lnet_peer_push_failed(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + struct lnet_handle_md mdh; + int rc; + + mdh = lp->lp_push_mdh; + LNetInvalidateMDHandle(&lp->lp_push_mdh); + lp->lp_state &= ~LNET_PEER_PUSH_FAILED; + rc = lp->lp_push_error; + lp->lp_push_error = 0; + spin_unlock(&lp->lp_lock); + + if (!LNetMDHandleIsInvalid(mdh)) + LNetMDUnlink(mdh); + + CDEBUG(D_NET, "peer %s\n", libcfs_nidstr(&lp->lp_primary_nid)); + spin_lock(&lp->lp_lock); + return rc ? rc : LNET_REDISCOVER_PEER; +} + +/* + * Mark the peer as discovered. + */ +static int lnet_peer_discovered(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + lp->lp_state |= LNET_PEER_DISCOVERED; + lp->lp_state &= ~(LNET_PEER_DISCOVERING | + LNET_PEER_REDISCOVER); + + lp->lp_dc_error = 0; + + CDEBUG(D_NET, "peer %s\n", libcfs_nidstr(&lp->lp_primary_nid)); + + return 0; +} + +/* Active side of push. */ +static int lnet_peer_send_push(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + struct lnet_ping_buffer *pbuf; + struct lnet_process_id id; + struct lnet_md md; + int cpt; + int rc; + + /* Don't push to a non-multi-rail peer. 
*/ + if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) { + lp->lp_state &= ~LNET_PEER_FORCE_PUSH; + /* if peer's NIDs are uptodate then peer is discovered */ + if (lp->lp_state & LNET_PEER_NIDS_UPTODATE) { + rc = lnet_peer_discovered(lp); + return rc; + } + + return 0; + } + + lp->lp_state |= LNET_PEER_PUSH_SENT; + lp->lp_state &= ~LNET_PEER_FORCE_PUSH; + spin_unlock(&lp->lp_lock); + + cpt = lnet_net_lock_current(); + pbuf = the_lnet.ln_ping_target; + lnet_ping_buffer_addref(pbuf); + lnet_net_unlock(cpt); + + /* Push source MD */ + md.start = &pbuf->pb_info; + md.length = LNET_PING_INFO_SIZE(pbuf->pb_nnis); + md.threshold = 2; /* Put/Ack */ + md.max_size = 0; + md.options = LNET_MD_TRACK_RESPONSE; + md.handler = the_lnet.ln_dc_handler; + md.user_ptr = lp; + + rc = LNetMDBind(&md, LNET_UNLINK, &lp->lp_push_mdh); + if (rc) { + lnet_ping_buffer_decref(pbuf); + CERROR("Can't bind push source MD: %d\n", rc); + goto fail_error; + } + + cpt = lnet_net_lock_current(); + /* Refcount for MD. */ + lnet_peer_addref_locked(lp); + id.pid = LNET_PID_LUSTRE; + if (!LNET_NID_IS_ANY(&lp->lp_disc_dst_nid)) + id.nid = lnet_nid_to_nid4(&lp->lp_disc_dst_nid); + else + id.nid = lnet_nid_to_nid4(&lp->lp_primary_nid); + lnet_net_unlock(cpt); + + rc = LNetPut(lnet_nid_to_nid4(&lp->lp_disc_src_nid), lp->lp_push_mdh, + LNET_ACK_REQ, id, LNET_RESERVED_PORTAL, + LNET_PROTO_PING_MATCHBITS, 0, 0); + + /* + * reset the discovery NIDs. There is no need to restrict sending + * from that source if we call lnet_push_update_to_peers(). It'll + * get set to a specific NID if we initiate discovery from + * scratch + */ + lp->lp_disc_src_nid = LNET_ANY_NID; + lp->lp_disc_dst_nid = LNET_ANY_NID; + + if (rc) + goto fail_unlink; + + CDEBUG(D_NET, "peer %s\n", libcfs_nidstr(&lp->lp_primary_nid)); + + spin_lock(&lp->lp_lock); + return 0; + +fail_unlink: + LNetMDUnlink(lp->lp_push_mdh); + LNetInvalidateMDHandle(&lp->lp_push_mdh); +fail_error: + CDEBUG(D_NET, "peer %s(%p): %d\n", libcfs_nidstr(&lp->lp_primary_nid), + lp, rc); + /* + * The errors that get us here are considered hard errors and + * cause Discovery to terminate. So we clear PUSH_SENT, but do + * not set PUSH_FAILED. In fact we need to clear PUSH_FAILED, + * because the unlink event handler will have set it if we + * called LNetMDUnlink() above. + */ + spin_lock(&lp->lp_lock); + lp->lp_state &= ~(LNET_PEER_PUSH_SENT | LNET_PEER_PUSH_FAILED); + return rc; +} + +/* + * Wait for work to be queued or some other change that must be + * attended to. Returns non-zero if the discovery thread should shut + * down. + */ +static int lnet_peer_discovery_wait_for_work(void) +{ + int cpt; + int rc = 0; + + DEFINE_WAIT(wait); + + cpt = lnet_net_lock_current(); + for (;;) { + prepare_to_wait(&the_lnet.ln_dc_waitq, &wait, + TASK_INTERRUPTIBLE); + if (the_lnet.ln_dc_state == LNET_DC_STATE_STOPPING) + break; + if (lnet_push_target_resize_needed() || + the_lnet.ln_push_target->pb_needs_post) + break; + if (!list_empty(&the_lnet.ln_dc_request)) + break; + if (!list_empty(&the_lnet.ln_msg_resend)) + break; + lnet_net_unlock(cpt); + + /* + * Wake up at most once per second to check for peers that + * have been stuck on the working queue for longer than + * the peer timeout. 
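+ * (The one-second cap is the schedule_timeout(cfs_time_seconds(1)) + * call below.)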
+ */ + schedule_timeout(cfs_time_seconds(1)); + finish_wait(&the_lnet.ln_dc_waitq, &wait); + cpt = lnet_net_lock_current(); + } + finish_wait(&the_lnet.ln_dc_waitq, &wait); + + if (the_lnet.ln_dc_state == LNET_DC_STATE_STOPPING) + rc = -ESHUTDOWN; + + lnet_net_unlock(cpt); + + CDEBUG(D_NET, "woken: %d\n", rc); + + return rc; +} + +/* + * Messages that were pending on a destroyed peer will be put on a global + * resend list. The message resend list will be checked by + * the discovery thread when it wakes up, and will resend messages. These + * messages can still be sendable in the case where the lpni which was the + * initial cause of the message re-queue was transferred to another peer. + * + * It is possible that LNet could be shut down while we're iterating + * through the list. lnet_shutdown_lndnets() will attempt to access the + * resend list, but will have to wait until the spinlock is released, by + * which time there shouldn't be any more messages on the resend list. + * During shutdown lnet_send() will fail and lnet_finalize() will be called + * for the messages so they can be released. The other case is that + * lnet_shutdown_lndnets() can finalize all the messages before this + * function can visit the resend list, in which case this function will be + * a no-op. + */ +static void lnet_resend_msgs(void) +{ + struct lnet_msg *msg, *tmp; + LIST_HEAD(resend); + int rc; + + spin_lock(&the_lnet.ln_msg_resend_lock); + list_splice(&the_lnet.ln_msg_resend, &resend); + spin_unlock(&the_lnet.ln_msg_resend_lock); + + list_for_each_entry_safe(msg, tmp, &resend, msg_list) { + list_del_init(&msg->msg_list); + rc = lnet_send(&msg->msg_src_nid_param, msg, + &msg->msg_rtr_nid_param); + if (rc < 0) { + CNETERR("Error sending %s to %s: %d\n", + lnet_msgtyp2str(msg->msg_type), + libcfs_idstr(&msg->msg_target), rc); + lnet_finalize(msg, rc); + } + } +} + +/* The discovery thread. */ +static int lnet_peer_discovery(void *arg) +{ + struct lnet_peer *lp; + int rc; + + wait_for_completion(&the_lnet.ln_started); + + CDEBUG(D_NET, "started\n"); + + for (;;) { + if (lnet_peer_discovery_wait_for_work()) + break; + + if (lnet_push_target_resize_needed()) + lnet_push_target_resize(); + else if (the_lnet.ln_push_target->pb_needs_post) + lnet_push_target_post(the_lnet.ln_push_target, + &the_lnet.ln_push_target_md); + + lnet_resend_msgs(); + + lnet_net_lock(LNET_LOCK_EX); + if (the_lnet.ln_dc_state == LNET_DC_STATE_STOPPING) { + lnet_net_unlock(LNET_LOCK_EX); + break; + } + + /* + * Process all incoming discovery work requests. When + * discovery must wait on a peer to change state, it + * is added to the tail of the ln_dc_working queue. A + * timestamp keeps track of when the peer was added, + * so we can time out discovery requests that take too + * long. + */ + while (!list_empty(&the_lnet.ln_dc_request)) { + lp = list_first_entry(&the_lnet.ln_dc_request, + struct lnet_peer, lp_dc_list); + list_move(&lp->lp_dc_list, &the_lnet.ln_dc_working); + /* + * set the time the peer was put on the dc_working + * queue. It shouldn't remain on the queue + * forever, in case the GET message (for ping) + * doesn't get a REPLY or the PUT message (for + * push) doesn't get an ACK. 
+ */ + lp->lp_last_queued = ktime_get_real_seconds(); + lnet_net_unlock(LNET_LOCK_EX); + + if (lnet_push_target_resize_needed()) + lnet_push_target_resize(); + else if (the_lnet.ln_push_target->pb_needs_post) + lnet_push_target_post(the_lnet.ln_push_target, + &the_lnet.ln_push_target_md); + + /* + * Select an action depending on the state of + * the peer and whether discovery is disabled. + * The check whether discovery is disabled is + * done after the code that handles processing + * for arrived data, cleanup for failures, and + * forcing a Ping or Push. + */ + spin_lock(&lp->lp_lock); + CDEBUG(D_NET, "peer %s(%p) state %#x\n", + libcfs_nidstr(&lp->lp_primary_nid), lp, + lp->lp_state); + if (lp->lp_state & (LNET_PEER_MARK_DELETION | + LNET_PEER_MARK_DELETED)) + rc = lnet_peer_deletion(lp); + else if (lp->lp_state & LNET_PEER_DATA_PRESENT) + rc = lnet_peer_data_present(lp); + else if (lp->lp_state & LNET_PEER_PING_FAILED) + rc = lnet_peer_ping_failed(lp); + else if (lp->lp_state & LNET_PEER_PUSH_FAILED) + rc = lnet_peer_push_failed(lp); + else if (lp->lp_state & LNET_PEER_FORCE_PING) + rc = lnet_peer_send_ping(lp); + else if (lp->lp_state & LNET_PEER_FORCE_PUSH) + rc = lnet_peer_send_push(lp); + else if (!(lp->lp_state & LNET_PEER_NIDS_UPTODATE)) + rc = lnet_peer_send_ping(lp); + else if (lnet_peer_needs_push(lp)) + rc = lnet_peer_send_push(lp); + else + rc = lnet_peer_discovered(lp); + CDEBUG(D_NET, "peer %s(%p) state %#x rc %d\n", + libcfs_nidstr(&lp->lp_primary_nid), lp, + lp->lp_state, rc); + + if (rc == LNET_REDISCOVER_PEER) { + spin_unlock(&lp->lp_lock); + lnet_net_lock(LNET_LOCK_EX); + list_move(&lp->lp_dc_list, + &the_lnet.ln_dc_request); + } else if (rc || + !(lp->lp_state & LNET_PEER_DISCOVERING)) { + spin_unlock(&lp->lp_lock); + lnet_net_lock(LNET_LOCK_EX); + lnet_peer_discovery_complete(lp, rc); + } else { + spin_unlock(&lp->lp_lock); + lnet_net_lock(LNET_LOCK_EX); + } + + if (the_lnet.ln_dc_state == LNET_DC_STATE_STOPPING) + break; + + } + + lnet_net_unlock(LNET_LOCK_EX); + } + + CDEBUG(D_NET, "stopping\n"); + /* + * Clean up before telling lnet_peer_discovery_stop() that + * we're done. Use wake_up() below to somewhat reduce the + * size of the thundering herd if there are multiple threads + * waiting on discovery of a single peer. + */ + + /* Queue cleanup 1: stop all pending pings and pushes. */ + lnet_net_lock(LNET_LOCK_EX); + while (!list_empty(&the_lnet.ln_dc_working)) { + lp = list_first_entry(&the_lnet.ln_dc_working, + struct lnet_peer, lp_dc_list); + list_move(&lp->lp_dc_list, &the_lnet.ln_dc_expired); + lnet_net_unlock(LNET_LOCK_EX); + lnet_peer_cancel_discovery(lp); + lnet_net_lock(LNET_LOCK_EX); + } + lnet_net_unlock(LNET_LOCK_EX); + + /* Queue cleanup 2: wait for the expired queue to clear. */ + while (!list_empty(&the_lnet.ln_dc_expired)) + schedule_timeout_uninterruptible(cfs_time_seconds(1)); + + /* Queue cleanup 3: clear the request queue. */ + lnet_net_lock(LNET_LOCK_EX); + while (!list_empty(&the_lnet.ln_dc_request)) { + lp = list_first_entry(&the_lnet.ln_dc_request, + struct lnet_peer, lp_dc_list); + lnet_peer_discovery_complete(lp, -ESHUTDOWN); + } + lnet_net_unlock(LNET_LOCK_EX); + + lnet_assert_handler_unused(the_lnet.ln_dc_handler); + the_lnet.ln_dc_handler = NULL; + + the_lnet.ln_dc_state = LNET_DC_STATE_SHUTDOWN; + wake_up(&the_lnet.ln_dc_waitq); + + CDEBUG(D_NET, "stopped\n"); + + return 0; +} + +/* ln_api_mutex is held on entry. 
*/ +int lnet_peer_discovery_start(void) +{ + struct task_struct *task; + int rc = 0; + + if (the_lnet.ln_dc_state != LNET_DC_STATE_SHUTDOWN) + return -EALREADY; + + the_lnet.ln_dc_handler = lnet_discovery_event_handler; + the_lnet.ln_dc_state = LNET_DC_STATE_RUNNING; + task = kthread_run(lnet_peer_discovery, NULL, "lnet_discovery"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("Can't start peer discovery thread: %d\n", rc); + + the_lnet.ln_dc_handler = NULL; + + the_lnet.ln_dc_state = LNET_DC_STATE_SHUTDOWN; + } + + CDEBUG(D_NET, "discovery start: %d\n", rc); + + return rc; +} + +/* ln_api_mutex is held on entry. */ +void lnet_peer_discovery_stop(void) +{ + if (the_lnet.ln_dc_state == LNET_DC_STATE_SHUTDOWN) + return; + + LASSERT(the_lnet.ln_dc_state == LNET_DC_STATE_RUNNING); + the_lnet.ln_dc_state = LNET_DC_STATE_STOPPING; + + /* In the LNetNIInit() path we may be stopping discovery before it + * entered its work loop + */ + if (!completion_done(&the_lnet.ln_started)) + complete(&the_lnet.ln_started); + else + wake_up(&the_lnet.ln_dc_waitq); + + mutex_unlock(&the_lnet.ln_api_mutex); + wait_event(the_lnet.ln_dc_waitq, + the_lnet.ln_dc_state == LNET_DC_STATE_SHUTDOWN); + mutex_lock(&the_lnet.ln_api_mutex); + + LASSERT(list_empty(&the_lnet.ln_dc_request)); + LASSERT(list_empty(&the_lnet.ln_dc_working)); + LASSERT(list_empty(&the_lnet.ln_dc_expired)); + + CDEBUG(D_NET, "discovery stopped\n"); +} + +/* Debugging */ + +void +lnet_debug_peer(lnet_nid_t nid) +{ + char *aliveness = "NA"; + struct lnet_peer_ni *lp; + int cpt; + + cpt = lnet_cpt_of_nid(nid, NULL); + lnet_net_lock(cpt); + + lp = lnet_nid2peerni_locked(nid, LNET_NID_ANY, cpt); + if (IS_ERR(lp)) { + lnet_net_unlock(cpt); + CDEBUG(D_WARNING, "No peer %s\n", libcfs_nid2str(nid)); + return; + } + + if (lnet_isrouter(lp) || lnet_peer_aliveness_enabled(lp)) + aliveness = (lnet_is_peer_ni_alive(lp)) ? "up" : "down"; + + CDEBUG(D_WARNING, "%-24s %4d %5s %5d %5d %5d %5d %5d %ld\n", + libcfs_nidstr(&lp->lpni_nid), kref_read(&lp->lpni_kref), + aliveness, lp->lpni_net->net_tunables.lct_peer_tx_credits, + lp->lpni_rtrcredits, lp->lpni_minrtrcredits, + lp->lpni_txcredits, lp->lpni_mintxcredits, lp->lpni_txqnob); + + lnet_peer_ni_decref_locked(lp); + + lnet_net_unlock(cpt); +} + +/* Gathering information for userspace. 
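+ * + * (A hypothetical usage sketch: LNetCtl() dispatches an ioctl such as + * IOC_LIBCFS_GET_PEER to lnet_get_peer_ni_info(), and userspace + * re-issues the call with an increasing peer index until -ENOENT + * indicates the tables are exhausted.)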
*/ + +int lnet_get_peer_ni_info(__u32 peer_index, __u64 *nid, + char aliveness[LNET_MAX_STR_LEN], + __u32 *cpt_iter, __u32 *refcount, + __u32 *ni_peer_tx_credits, __u32 *peer_tx_credits, + __u32 *peer_rtr_credits, __u32 *peer_min_rtr_credits, + __u32 *peer_tx_qnob) +{ + struct lnet_peer_table *peer_table; + struct lnet_peer_ni *lp; + int j; + int lncpt; + bool found = false; + + /* get the number of CPTs */ + lncpt = cfs_percpt_number(the_lnet.ln_peer_tables); + + /* if the cpt number to be examined is >= the number of cpts in + * the system then indicate that there are no more cpts to examine + */ + if (*cpt_iter >= lncpt) + return -ENOENT; + + /* get the current table */ + peer_table = the_lnet.ln_peer_tables[*cpt_iter]; + /* if the ptable is NULL then there are no more cpts to examine */ + if (peer_table == NULL) + return -ENOENT; + + lnet_net_lock(*cpt_iter); + + for (j = 0; j < LNET_PEER_HASH_SIZE && !found; j++) { + struct list_head *peers = &peer_table->pt_hash[j]; + + list_for_each_entry(lp, peers, lpni_hashlist) { + if (!nid_is_nid4(&lp->lpni_nid)) + continue; + if (peer_index-- > 0) + continue; + + snprintf(aliveness, LNET_MAX_STR_LEN, "NA"); + if (lnet_isrouter(lp) || + lnet_peer_aliveness_enabled(lp)) + snprintf(aliveness, LNET_MAX_STR_LEN, + lnet_is_peer_ni_alive(lp) ? "up" : "down"); + + *nid = lnet_nid_to_nid4(&lp->lpni_nid); + *refcount = kref_read(&lp->lpni_kref); + *ni_peer_tx_credits = + lp->lpni_net->net_tunables.lct_peer_tx_credits; + *peer_tx_credits = lp->lpni_txcredits; + *peer_rtr_credits = lp->lpni_rtrcredits; + *peer_min_rtr_credits = lp->lpni_mintxcredits; + *peer_tx_qnob = lp->lpni_txqnob; + + found = true; + } + + } + lnet_net_unlock(*cpt_iter); + + *cpt_iter = lncpt; + + return found ? 0 : -ENOENT; +} + +/* ln_api_mutex is held, which keeps the peer list stable */ +int lnet_get_peer_info(struct lnet_ioctl_peer_cfg *cfg, void __user *bulk) +{ + struct lnet_ioctl_element_stats *lpni_stats; + struct lnet_ioctl_element_msg_stats *lpni_msg_stats; + struct lnet_ioctl_peer_ni_hstats *lpni_hstats; + struct lnet_peer_ni_credit_info *lpni_info; + struct lnet_peer_ni *lpni; + struct lnet_peer *lp; + lnet_nid_t nid; + __u32 size; + int rc; + + lp = lnet_find_peer4(cfg->prcfg_prim_nid); + + if (!lp) { + rc = -ENOENT; + goto out; + } + + size = sizeof(nid) + sizeof(*lpni_info) + sizeof(*lpni_stats) + + sizeof(*lpni_msg_stats) + sizeof(*lpni_hstats); + size *= lp->lp_nnis; + if (size > cfg->prcfg_size) { + cfg->prcfg_size = size; + rc = -E2BIG; + goto out_lp_decref; + } + + cfg->prcfg_prim_nid = lnet_nid_to_nid4(&lp->lp_primary_nid); + cfg->prcfg_mr = lnet_peer_is_multi_rail(lp); + cfg->prcfg_cfg_nid = lnet_nid_to_nid4(&lp->lp_primary_nid); + cfg->prcfg_count = lp->lp_nnis; + cfg->prcfg_size = size; + cfg->prcfg_state = lp->lp_state; + + /* Allocate helper buffers. 
*/ + rc = -ENOMEM; + LIBCFS_ALLOC(lpni_info, sizeof(*lpni_info)); + if (!lpni_info) + goto out_lp_decref; + LIBCFS_ALLOC(lpni_stats, sizeof(*lpni_stats)); + if (!lpni_stats) + goto out_free_info; + LIBCFS_ALLOC(lpni_msg_stats, sizeof(*lpni_msg_stats)); + if (!lpni_msg_stats) + goto out_free_stats; + LIBCFS_ALLOC(lpni_hstats, sizeof(*lpni_hstats)); + if (!lpni_hstats) + goto out_free_msg_stats; + + + lpni = NULL; + rc = -EFAULT; + while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) { + if (!nid_is_nid4(&lpni->lpni_nid)) + continue; + nid = lnet_nid_to_nid4(&lpni->lpni_nid); + if (copy_to_user(bulk, &nid, sizeof(nid))) + goto out_free_hstats; + bulk += sizeof(nid); + + memset(lpni_info, 0, sizeof(*lpni_info)); + snprintf(lpni_info->cr_aliveness, LNET_MAX_STR_LEN, "NA"); + if (lnet_isrouter(lpni) || + lnet_peer_aliveness_enabled(lpni)) + snprintf(lpni_info->cr_aliveness, LNET_MAX_STR_LEN, + lnet_is_peer_ni_alive(lpni) ? "up" : "down"); + + lpni_info->cr_refcount = kref_read(&lpni->lpni_kref); + lpni_info->cr_ni_peer_tx_credits = (lpni->lpni_net != NULL) ? + lpni->lpni_net->net_tunables.lct_peer_tx_credits : 0; + lpni_info->cr_peer_tx_credits = lpni->lpni_txcredits; + lpni_info->cr_peer_rtr_credits = lpni->lpni_rtrcredits; + lpni_info->cr_peer_min_rtr_credits = lpni->lpni_minrtrcredits; + lpni_info->cr_peer_min_tx_credits = lpni->lpni_mintxcredits; + lpni_info->cr_peer_tx_qnob = lpni->lpni_txqnob; + if (copy_to_user(bulk, lpni_info, sizeof(*lpni_info))) + goto out_free_hstats; + bulk += sizeof(*lpni_info); + + memset(lpni_stats, 0, sizeof(*lpni_stats)); + lpni_stats->iel_send_count = lnet_sum_stats(&lpni->lpni_stats, + LNET_STATS_TYPE_SEND); + lpni_stats->iel_recv_count = lnet_sum_stats(&lpni->lpni_stats, + LNET_STATS_TYPE_RECV); + lpni_stats->iel_drop_count = lnet_sum_stats(&lpni->lpni_stats, + LNET_STATS_TYPE_DROP); + if (copy_to_user(bulk, lpni_stats, sizeof(*lpni_stats))) + goto out_free_hstats; + bulk += sizeof(*lpni_stats); + lnet_usr_translate_stats(lpni_msg_stats, &lpni->lpni_stats); + if (copy_to_user(bulk, lpni_msg_stats, sizeof(*lpni_msg_stats))) + goto out_free_hstats; + bulk += sizeof(*lpni_msg_stats); + lpni_hstats->hlpni_network_timeout = + atomic_read(&lpni->lpni_hstats.hlt_network_timeout); + lpni_hstats->hlpni_remote_dropped = + atomic_read(&lpni->lpni_hstats.hlt_remote_dropped); + lpni_hstats->hlpni_remote_timeout = + atomic_read(&lpni->lpni_hstats.hlt_remote_timeout); + lpni_hstats->hlpni_remote_error = + atomic_read(&lpni->lpni_hstats.hlt_remote_error); + lpni_hstats->hlpni_health_value = + atomic_read(&lpni->lpni_healthv); + lpni_hstats->hlpni_ping_count = lpni->lpni_ping_count; + lpni_hstats->hlpni_next_ping = lpni->lpni_next_ping; + if (copy_to_user(bulk, lpni_hstats, sizeof(*lpni_hstats))) + goto out_free_hstats; + bulk += sizeof(*lpni_hstats); + } + rc = 0; + +out_free_hstats: + LIBCFS_FREE(lpni_hstats, sizeof(*lpni_hstats)); +out_free_msg_stats: + LIBCFS_FREE(lpni_msg_stats, sizeof(*lpni_msg_stats)); +out_free_stats: + LIBCFS_FREE(lpni_stats, sizeof(*lpni_stats)); +out_free_info: + LIBCFS_FREE(lpni_info, sizeof(*lpni_info)); +out_lp_decref: + lnet_peer_decref_locked(lp); +out: + return rc; +} + +/* must hold net_lock/0 */ +void +lnet_peer_ni_add_to_recoveryq_locked(struct lnet_peer_ni *lpni, + struct list_head *recovery_queue, + time64_t now) +{ + /* the mt could've shutdown and cleaned up the queues */ + if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) + return; + + if (!list_empty(&lpni->lpni_recovery)) + return; + + if 
(atomic_read(&lpni->lpni_healthv) == LNET_MAX_HEALTH_VALUE) + return; + + if (!lpni->lpni_last_alive) { + CDEBUG(D_NET, + "lpni %s(%p) not eligible for recovery last alive %lld\n", + libcfs_nidstr(&lpni->lpni_nid), lpni, + lpni->lpni_last_alive); + return; + } + + if (lnet_recovery_limit && + now > lpni->lpni_last_alive + lnet_recovery_limit) { + CDEBUG(D_NET, "lpni %s aged out last alive %lld\n", + libcfs_nidstr(&lpni->lpni_nid), + lpni->lpni_last_alive); + /* Reset the ping count so that if this peer NI is added back to + * the recovery queue we will send the first ping right away. + */ + lpni->lpni_ping_count = 0; + return; + } + + /* This peer NI is going on the recovery queue, so take a ref on it */ + lnet_peer_ni_addref_locked(lpni); + + lnet_peer_ni_set_next_ping(lpni, now); + + CDEBUG(D_NET, "%s added to recovery queue. ping count: %u next ping: %lld last alive: %lld health: %d\n", + libcfs_nidstr(&lpni->lpni_nid), + lpni->lpni_ping_count, + lpni->lpni_next_ping, + lpni->lpni_last_alive, + atomic_read(&lpni->lpni_healthv)); + + list_add_tail(&lpni->lpni_recovery, recovery_queue); +} + +/* Call with the ln_api_mutex held */ +void +lnet_peer_ni_set_healthv(lnet_nid_t nid, int value, bool all) +{ + struct lnet_peer_table *ptable; + struct lnet_peer *lp; + struct lnet_peer_net *lpn; + struct lnet_peer_ni *lpni; + int lncpt; + int cpt; + time64_t now; + + if (the_lnet.ln_state != LNET_STATE_RUNNING) + return; + + now = ktime_get_seconds(); + + if (!all) { + lnet_net_lock(LNET_LOCK_EX); + lpni = lnet_find_peer_ni_locked(nid); + if (!lpni) { + lnet_net_unlock(LNET_LOCK_EX); + return; + } + lnet_set_lpni_healthv_locked(lpni, value); + lnet_peer_ni_add_to_recoveryq_locked(lpni, + &the_lnet.ln_mt_peerNIRecovq, now); + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(LNET_LOCK_EX); + return; + } + + lncpt = cfs_percpt_number(the_lnet.ln_peer_tables); + + /* + * Walk all the peers and reset the health value for each one to the + * specified value. + */ + lnet_net_lock(LNET_LOCK_EX); + for (cpt = 0; cpt < lncpt; cpt++) { + ptable = the_lnet.ln_peer_tables[cpt]; + list_for_each_entry(lp, &ptable->pt_peer_list, lp_peer_list) { + list_for_each_entry(lpn, &lp->lp_peer_nets, lpn_peer_nets) { + list_for_each_entry(lpni, &lpn->lpn_peer_nis, + lpni_peer_nis) { + lnet_set_lpni_healthv_locked(lpni, + value); + lnet_peer_ni_add_to_recoveryq_locked(lpni, + &the_lnet.ln_mt_peerNIRecovq, now); + } + } + } + } + lnet_net_unlock(LNET_LOCK_EX); +} + diff --git a/drivers/staging/lustrefsx/lnet/lnet/router.c b/drivers/staging/lustrefsx/lnet/lnet/router.c new file mode 100644 index 0000000000000..9002cf0bcbe89 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/router.c @@ -0,0 +1,1835 @@ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2017, Intel Corporation. + * + * This file is part of Lustre, https://wiki.whamcloud.com/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/random.h>
+#include <lnet/lib-lnet.h>
+
+#define LNET_NRB_TINY_MIN	512	/* min value for each CPT */
+#define LNET_NRB_TINY		(LNET_NRB_TINY_MIN * 4)
+#define LNET_NRB_SMALL_MIN	4096	/* min value for each CPT */
+#define LNET_NRB_SMALL		(LNET_NRB_SMALL_MIN * 4)
+#define LNET_NRB_SMALL_PAGES	1
+#define LNET_NRB_LARGE_MIN	256	/* min value for each CPT */
+#define LNET_NRB_LARGE		(LNET_NRB_LARGE_MIN * 4)
+#define LNET_NRB_LARGE_PAGES	((LNET_MTU + PAGE_SIZE - 1) >> \
+				 PAGE_SHIFT)
+
+static char *forwarding = "";
+module_param(forwarding, charp, 0444);
+MODULE_PARM_DESC(forwarding, "Explicitly enable/disable forwarding between networks");
+
+static int tiny_router_buffers;
+module_param(tiny_router_buffers, int, 0444);
+MODULE_PARM_DESC(tiny_router_buffers, "# of 0 payload messages to buffer in the router");
+static int small_router_buffers;
+module_param(small_router_buffers, int, 0444);
+MODULE_PARM_DESC(small_router_buffers, "# of small (1 page) messages to buffer in the router");
+static int large_router_buffers;
+module_param(large_router_buffers, int, 0444);
+MODULE_PARM_DESC(large_router_buffers, "# of large messages to buffer in the router");
+static int peer_buffer_credits;
+module_param(peer_buffer_credits, int, 0444);
+MODULE_PARM_DESC(peer_buffer_credits, "# router buffer credits per peer");
+
+static int auto_down = 1;
+module_param(auto_down, int, 0444);
+MODULE_PARM_DESC(auto_down, "Automatically mark peers down on comms error");
+
+int
+lnet_peer_buffer_credits(struct lnet_net *net)
+{
+	/* NI option overrides LNet default */
+	if (net->net_tunables.lct_peer_rtr_credits > 0)
+		return net->net_tunables.lct_peer_rtr_credits;
+	if (peer_buffer_credits > 0)
+		return peer_buffer_credits;
+
+	/* As an approximation, allow this peer the same number of router
+	 * buffers as it is allowed outstanding sends */
+	return net->net_tunables.lct_peer_tx_credits;
+}
+
+static int check_routers_before_use;
+module_param(check_routers_before_use, int, 0444);
+MODULE_PARM_DESC(check_routers_before_use, "Assume routers are down and ping them before use");
+
+int avoid_asym_router_failure = 1;
+module_param(avoid_asym_router_failure, int, 0644);
+MODULE_PARM_DESC(avoid_asym_router_failure, "Avoid asymmetrical router failures (0 to disable)");
+
+int dead_router_check_interval = INT_MIN;
+module_param(dead_router_check_interval, int, 0444);
+MODULE_PARM_DESC(dead_router_check_interval, "(DEPRECATED - Use alive_router_check_interval)");
+
+int live_router_check_interval = INT_MIN;
+module_param(live_router_check_interval, int, 0444);
+MODULE_PARM_DESC(live_router_check_interval, "(DEPRECATED - Use alive_router_check_interval)");
+
+int alive_router_check_interval = 60;
+module_param(alive_router_check_interval, int, 0644);
+MODULE_PARM_DESC(alive_router_check_interval, "Seconds between live router health checks (<= 0 to disable)");
+
+static int router_ping_timeout = 50;
+module_param(router_ping_timeout, int, 0644);
+MODULE_PARM_DESC(router_ping_timeout, "Seconds to wait for the reply to a router health query");
+
+/*
+ * A value between 0 and 100. 0 means that even if the router's interfaces
+ * have the worst possible health, the gateway is still considered usable.
+ * 100 means that at least one interface on the route's remote net must be
+ * 100% healthy for the route to be considered alive.
+ * The default is set to 100 to ensure we maintain the original behavior.
+ */
+unsigned int router_sensitivity_percentage = 100;
+static int rtr_sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp);
+static struct kernel_param_ops param_ops_rtr_sensitivity = {
+	.set = rtr_sensitivity_set,
+	.get = param_get_int,
+};
+#define param_check_rtr_sensitivity(name, p) \
+		__param_check(name, p, int)
+#ifdef HAVE_KERNEL_PARAM_OPS
+module_param(router_sensitivity_percentage, rtr_sensitivity, S_IRUGO|S_IWUSR);
+#else
+module_param_call(router_sensitivity_percentage, rtr_sensitivity_set, param_get_int,
+		  &router_sensitivity_percentage, S_IRUGO|S_IWUSR);
+#endif
+MODULE_PARM_DESC(router_sensitivity_percentage,
+		"How healthy a gateway should be to be used in percent");
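Since the percentage is applied against LNet's internal health scale, it may help to see the comparison this parameter implies. A minimal userspace sketch, assuming a health ceiling of 1000 (the value of LNET_MAX_HEALTH_VALUE in current LNet sources); ni_meets_sensitivity() is an illustrative helper, not LNet API:

#include <stdbool.h>
#include <stdio.h>

#define MAX_HEALTH 1000	/* assumed: LNET_MAX_HEALTH_VALUE */

/* Hypothetical helper: does an interface's health clear the bar? */
static bool ni_meets_sensitivity(int healthv, unsigned int sensitivity_pct)
{
	return healthv >= (MAX_HEALTH * (int)sensitivity_pct) / 100;
}

int main(void)
{
	/* sensitivity 100 accepts only a fully healthy NI (healthv == 1000);
	 * sensitivity 90 accepts healthv >= 900 */
	printf("%d %d %d\n",
	       ni_meets_sensitivity(1000, 100),	/* 1 */
	       ni_meets_sensitivity(999, 100),	/* 0 */
	       ni_meets_sensitivity(900, 90));	/* 1 */
	return 0;
}

The parameter itself is writable at runtime through the usual module-parameter sysfs path, which is why the setter below serializes updates under the API mutex.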
+
+static void lnet_add_route_to_rnet(struct lnet_remotenet *rnet,
+				   struct lnet_route *route);
+static void lnet_del_route_from_rnet(struct lnet_nid *gw_nid,
+				     struct list_head *route_list,
+				     struct list_head *zombies);
+
+static int
+rtr_sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp)
+{
+	int rc;
+	unsigned *sen = (unsigned *)kp->arg;
+	unsigned long value;
+
+	rc = kstrtoul(val, 0, &value);
+	if (rc) {
+		CERROR("Invalid module parameter value for 'router_sensitivity_percentage'\n");
+		return rc;
+	}
+
+	/* value is unsigned, so only the upper bound needs checking */
+	if (value > 100) {
+		CERROR("Invalid value: %lu for 'router_sensitivity_percentage'\n", value);
+		return -EINVAL;
+	}
+
+	/*
+	 * The purpose of locking the api_mutex here is to ensure that
+	 * the correct value ends up stored properly.
+	 */
+	mutex_lock(&the_lnet.ln_api_mutex);
+
+	*sen = value;
+
+	mutex_unlock(&the_lnet.ln_api_mutex);
+
+	return 0;
+}
+
+void
+lnet_move_route(struct lnet_route *route, struct lnet_peer *lp,
+		struct list_head *rt_list)
+{
+	struct lnet_remotenet *rnet;
+	struct list_head zombies;
+	struct list_head *l;
+
+	INIT_LIST_HEAD(&zombies);
+
+	if (rt_list)
+		l = rt_list;
+	else
+		l = &zombies;
+
+	rnet = lnet_find_rnet_locked(route->lr_net);
+	LASSERT(rnet);
+
+	CDEBUG(D_NET, "deleting route %s->%s\n",
+	       libcfs_net2str(route->lr_net),
+	       libcfs_nidstr(&route->lr_nid));
+
+	/*
+	 * use the gateway's lp_primary_nid to delete the route as the
+	 * lr_nid can be a constituent NID of the peer
+	 */
+	lnet_del_route_from_rnet(
+		&route->lr_gateway->lp_primary_nid,
+		&rnet->lrn_routes, l);
+
+	if (lp) {
+		route = list_first_entry(l, struct lnet_route,
+					 lr_list);
+		route->lr_gateway = lp;
+		lnet_add_route_to_rnet(rnet, route);
+	} else {
+		while (!list_empty(l) && !rt_list) {
+			route = list_first_entry(l, struct lnet_route,
+						 lr_list);
+			list_del(&route->lr_list);
+			LIBCFS_FREE(route, sizeof(*route));
+		}
+	}
+}
+
+void
+lnet_rtr_transfer_to_peer(struct lnet_peer *src, struct lnet_peer *target)
+{
+	struct lnet_route *route;
+	struct lnet_route *tmp, *tmp2;
+
+	lnet_net_lock(LNET_LOCK_EX);
+	CDEBUG(D_NET, "transferring routes from %s -> %s\n",
+	       libcfs_nidstr(&src->lp_primary_nid),
+	       libcfs_nidstr(&target->lp_primary_nid));
+	list_for_each_entry(route, &src->lp_routes, lr_gwlist) {
+		CDEBUG(D_NET, "%s: %s->%s\n",
+		       libcfs_nidstr(&src->lp_primary_nid),
+		       libcfs_net2str(route->lr_net),
+		       libcfs_nidstr(&route->lr_nid));
+	}
+	list_splice_init(&src->lp_rtrq, &target->lp_rtrq);
+	list_for_each_entry_safe(route, tmp, &src->lp_routes, lr_gwlist) {
+		struct lnet_route *r2;
+		bool present = false;
+		list_for_each_entry_safe(r2, tmp2, &target->lp_routes, lr_gwlist) {
+			if (route->lr_net == r2->lr_net) {
+				if (route->lr_priority >= r2->lr_priority)
+					present = true;
+				else if (route->lr_hops >= r2->lr_hops)
+					present = true;
+				else
+					lnet_move_route(r2, NULL, NULL);
+			}
+		}
+		if (present)
+			lnet_move_route(route, NULL, NULL);
+		else
+			lnet_move_route(route, target, NULL);
+	}
+
+	if (list_empty(&target->lp_rtr_list)) {
+		lnet_peer_addref_locked(target);
+		list_add_tail(&target->lp_rtr_list, &the_lnet.ln_routers);
+	}
+
+	the_lnet.ln_routers_version++;
+	lnet_net_unlock(LNET_LOCK_EX);
+}
+
+int
+lnet_peers_start_down(void)
+{
+	return check_routers_before_use;
+}
+
+/*
+ * The peer_net of a gateway is alive if at least one of the peer_ni's on
+ * that peer_net is alive.
+ */
+static bool
+lnet_is_gateway_net_alive(struct lnet_peer_net *lpn)
+{
+	struct lnet_peer_ni *lpni;
+
+	list_for_each_entry(lpni, &lpn->lpn_peer_nis, lpni_peer_nis) {
+		if (lnet_is_peer_ni_alive(lpni))
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * a gateway is alive only if all its nets are alive
+ * called with cpt lock held
+ */
+bool lnet_is_gateway_alive(struct lnet_peer *gw)
+{
+	struct lnet_peer_net *lpn;
+
+	if (!gw->lp_alive)
+		return false;
+
+	list_for_each_entry(lpn, &gw->lp_peer_nets, lpn_peer_nets) {
+		if (!lnet_is_gateway_net_alive(lpn))
+			return false;
+	}
+
+	return true;
+}
+
+/*
+ * lnet_is_route_alive() needs to be called with cpt lock held
+ * A route is alive if the gateway can route between the local network and
+ * the remote network of the route.
+ * This means at least one NI is alive on each of the local and remote
+ * networks of the gateway.
+ */
+bool lnet_is_route_alive(struct lnet_route *route)
+{
+	struct lnet_peer *gw = route->lr_gateway;
+	struct lnet_peer_net *llpn;
+	struct lnet_peer_net *rlpn;
+
+	/* If the gateway is down then all routes are considered down */
+	if (!gw->lp_alive)
+		return false;
+
+	/*
+	 * if discovery is disabled then rely on the cached aliveness
+	 * information. This is limited information which we log when
+	 * we receive the discovery ping response. The most up-to-date
+	 * aliveness information can only be obtained when discovery is
+	 * enabled.
+	 */
+	if (lnet_is_discovery_disabled(gw))
+		return atomic_read(&route->lr_alive) == 1;
+
+	/*
+	 * check the gateway's interfaces on the local network
+	 */
+	llpn = lnet_peer_get_net_locked(gw, route->lr_lnet);
+	if (!llpn)
+		return false;
+
+	if (!lnet_is_gateway_net_alive(llpn))
+		return false;
+
+	/*
+	 * For single hop routes avoid_asym_router_failure dictates
+	 * that the remote net must exist on the gateway. For multi-hop
+	 * routes the next-hop will not have the remote net.
+	 */
+	if (avoid_asym_router_failure &&
+	    (route->lr_hops == 1 || route->lr_single_hop)) {
+		rlpn = lnet_peer_get_net_locked(gw, route->lr_net);
+		if (!rlpn)
+			return false;
+		if (!lnet_is_gateway_net_alive(rlpn))
+			return false;
+	}
+
+	spin_lock(&gw->lp_lock);
+	if (!(gw->lp_state & LNET_PEER_ROUTER_ENABLED)) {
+		spin_unlock(&gw->lp_lock);
+		if (gw->lp_rtr_refcount > 0)
+			CERROR("peer %s is being used as a gateway but routing feature is not turned on\n",
+			       libcfs_nidstr(&gw->lp_primary_nid));
+		return false;
+	}
+	spin_unlock(&gw->lp_lock);
+
+	return true;
+}
+
+void
+lnet_consolidate_routes_locked(struct lnet_peer *orig_lp,
+			       struct lnet_peer *new_lp)
+{
+	struct lnet_peer_ni *lpni;
+	struct lnet_route *route;
+
+	/*
+	 * Although a route is correlated with a peer, when it is added
+	 * a specific NID is used. That NID refers to a peer_ni within
+	 * a peer. There could be other peer_nis on the same net, which
+	 * can be used to send to that gateway.
However when we are + * consolidating gateways because of discovery, the nid used to + * add the route might've moved between gateway peers. In this + * case we want to move the route to the new gateway as well. The + * intent here is not to confuse the user who added the route. + */ + list_for_each_entry(route, &orig_lp->lp_routes, lr_gwlist) { + lpni = lnet_peer_ni_get_locked(orig_lp, &route->lr_nid); + if (!lpni) { + lnet_net_lock(LNET_LOCK_EX); + list_move(&route->lr_gwlist, &new_lp->lp_routes); + lnet_net_unlock(LNET_LOCK_EX); + } + } +} + +static inline void +lnet_check_route_inconsistency(struct lnet_route *route) +{ + if (!route->lr_single_hop && + (route->lr_hops == 1 || route->lr_hops == LNET_UNDEFINED_HOPS)) { + CWARN("route %s->%s is detected to be multi-hop but hop count is set to %d\n", + libcfs_net2str(route->lr_net), + libcfs_nidstr(&route->lr_gateway->lp_primary_nid), + (int) route->lr_hops); + } +} + +static void +lnet_set_route_hop_type(struct lnet_peer *gw, struct lnet_route *route) +{ + struct lnet_peer_net *lpn; + bool single_hop = false; + + list_for_each_entry(lpn, &gw->lp_peer_nets, lpn_peer_nets) { + if (route->lr_net == lpn->lpn_net_id) { + single_hop = true; + break; + } + } + route->lr_single_hop = single_hop; + lnet_check_route_inconsistency(route); +} + +/* Must hold net_lock/EX */ +void +lnet_router_discovery_ping_reply(struct lnet_peer *lp) +{ + struct lnet_ping_buffer *pbuf = lp->lp_data; + struct lnet_peer_net *llpn; + struct lnet_route *route; + bool single_hop = false; + bool net_up = false; + unsigned lp_state; + __u32 net; + int i; + + + spin_lock(&lp->lp_lock); + lp_state = lp->lp_state; + + /* only handle replies if discovery is disabled. */ + if (!lnet_is_discovery_disabled_locked(lp)) { + spin_unlock(&lp->lp_lock); + return; + } + + spin_unlock(&lp->lp_lock); + + if (lp_state & LNET_PEER_PING_FAILED || + pbuf->pb_info.pi_features & LNET_PING_FEAT_RTE_DISABLED) { + CDEBUG(D_NET, "Set routes down for gw %s because %s %d\n", + libcfs_nidstr(&lp->lp_primary_nid), + lp_state & LNET_PEER_PING_FAILED ? "ping failed" : + "route feature is disabled", lp->lp_ping_error); + /* If the ping failed or the peer has routing disabled then + * mark the routes served by this peer down + */ + list_for_each_entry(route, &lp->lp_routes, lr_gwlist) + lnet_set_route_aliveness(route, false); + return; + } + + CDEBUG(D_NET, "Discovery is disabled. Processing reply for gw: %s:%d\n", + libcfs_nidstr(&lp->lp_primary_nid), pbuf->pb_info.pi_nnis); + + /* + * examine the ping response to determine if the routes on that + * gateway should be declared alive. + * The route is alive if: + * 1. local network to reach the route is alive and + * 2. 
route is single hop, avoid_asym_router_failure is set and
+ *    there exists at least one NI on the route's remote net
+ */
+	list_for_each_entry(route, &lp->lp_routes, lr_gwlist) {
+		llpn = lnet_peer_get_net_locked(lp, route->lr_lnet);
+		if (!llpn) {
+			lnet_set_route_aliveness(route, false);
+			continue;
+		}
+
+		if (!lnet_is_gateway_net_alive(llpn)) {
+			lnet_set_route_aliveness(route, false);
+			continue;
+		}
+
+		single_hop = net_up = false;
+		for (i = 1; i < pbuf->pb_info.pi_nnis; i++) {
+			net = LNET_NIDNET(pbuf->pb_info.pi_ni[i].ns_nid);
+
+			if (route->lr_net == net) {
+				single_hop = true;
+				if (pbuf->pb_info.pi_ni[i].ns_status ==
+				    LNET_NI_STATUS_UP) {
+					net_up = true;
+					break;
+				}
+			}
+		}
+
+		route->lr_single_hop = single_hop;
+		if (avoid_asym_router_failure &&
+		    (route->lr_hops == 1 || route->lr_single_hop))
+			lnet_set_route_aliveness(route, net_up);
+		else
+			lnet_set_route_aliveness(route, true);
+
+		/*
+		 * warn that the route is configured as single-hop but it
+		 * really is multi-hop as far as we can tell.
+		 */
+		lnet_check_route_inconsistency(route);
+	}
+}
+
+void
+lnet_router_discovery_complete(struct lnet_peer *lp)
+{
+	struct lnet_peer_ni *lpni = NULL;
+	struct lnet_route *route;
+
+	spin_lock(&lp->lp_lock);
+	lp->lp_state &= ~LNET_PEER_RTR_DISCOVERY;
+	lp->lp_state |= LNET_PEER_RTR_DISCOVERED;
+	lp->lp_alive = lp->lp_dc_error == 0;
+	spin_unlock(&lp->lp_lock);
+
+	if (!lp->lp_dc_error) {
+		/* ping replies are being handled when discovery is disabled */
+		if (lnet_is_discovery_disabled_locked(lp))
+			return;
+
+		/*
+		 * mark single-hop routes. If the remote net is not configured on
+		 * the gateway we assume this is intentional and we mark the
+		 * gateway as multi-hop
+		 */
+		list_for_each_entry(route, &lp->lp_routes, lr_gwlist) {
+			lnet_set_route_aliveness(route, true);
+			lnet_set_route_hop_type(lp, route);
+		}
+
+		return;
+	}
+
+	/*
+	 * We do not send messages directly to the remote interfaces
+	 * of an LNet router. As such, we rely on the PING response
+	 * to determine the up/down status of these interfaces. If
+	 * a PING response is not received, or some other problem with
+	 * discovery occurs that prevents us from getting this status,
+	 * we assume all interfaces are down until we're able to
+	 * determine otherwise.
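The reply-processing loop above boils down to a short scan over the ping buffer's NI status entries. A pared-down userspace model of that decision, with simplified types standing in for struct lnet_ping_info and struct lnet_ni_status (the skipped first entry is assumed, from the loop above, to describe the node itself rather than a remote-net NI):

#include <stdbool.h>
#include <stdint.h>

/* Illustrative stand-ins, not the real LNet wire structures */
struct ni_status_ex { uint32_t net; uint32_t status; };
#define EX_NI_STATUS_UP 1

bool remote_net_up(const struct ni_status_ex *ni, int nnis,
		   uint32_t route_net, bool *single_hop)
{
	*single_hop = false;
	for (int i = 1; i < nnis; i++) {	/* start at 1, as above */
		if (ni[i].net != route_net)
			continue;
		*single_hop = true;	/* gateway has an NI on the remote net */
		if (ni[i].status == EX_NI_STATUS_UP)
			return true;	/* at least one interface is up */
	}
	return false;
}

The single_hop flag falls out of the same scan for free: if the gateway advertises no NI on the route's remote net, the route must be multi-hop and the remote-net liveness test is skipped.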
+ */ + CDEBUG(D_NET, "%s: Router discovery failed %d\n", + libcfs_nidstr(&lp->lp_primary_nid), lp->lp_dc_error); + while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) + lpni->lpni_ns_status = LNET_NI_STATUS_DOWN; + + list_for_each_entry(route, &lp->lp_routes, lr_gwlist) + lnet_set_route_aliveness(route, false); +} + +static void +lnet_rtr_addref_locked(struct lnet_peer *lp) +{ + LASSERT(lp->lp_rtr_refcount >= 0); + + /* lnet_net_lock must be exclusively locked */ + lp->lp_rtr_refcount++; + if (lp->lp_rtr_refcount == 1) { + list_add_tail(&lp->lp_rtr_list, &the_lnet.ln_routers); + /* addref for the_lnet.ln_routers */ + lnet_peer_addref_locked(lp); + the_lnet.ln_routers_version++; + } +} + +static void +lnet_rtr_decref_locked(struct lnet_peer *lp) +{ + LASSERT(atomic_read(&lp->lp_refcount) > 0); + LASSERT(lp->lp_rtr_refcount > 0); + + /* lnet_net_lock must be exclusively locked */ + lp->lp_rtr_refcount--; + if (lp->lp_rtr_refcount == 0) { + LASSERT(list_empty(&lp->lp_routes)); + + list_del(&lp->lp_rtr_list); + /* decref for the_lnet.ln_routers */ + lnet_peer_decref_locked(lp); + the_lnet.ln_routers_version++; + } +} + +struct lnet_remotenet * +lnet_find_rnet_locked(__u32 net) +{ + struct lnet_remotenet *rnet; + struct list_head *tmp; + struct list_head *rn_list; + + LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING); + + rn_list = lnet_net2rnethash(net); + list_for_each(tmp, rn_list) { + rnet = list_entry(tmp, struct lnet_remotenet, lrn_list); + + if (rnet->lrn_net == net) + return rnet; + } + return NULL; +} + +static void lnet_shuffle_seed(void) +{ + static int seeded; + struct lnet_ni *ni = NULL; + + if (seeded) + return; + + /* Nodes with small feet have little entropy + * the NID for this node gives the most entropy in the low bits */ + while ((ni = lnet_get_next_ni_locked(NULL, ni))) + add_device_randomness(&ni->ni_nid, sizeof(ni->ni_nid)); + + seeded = 1; +} + +/* NB expects LNET_LOCK held */ +static void +lnet_add_route_to_rnet(struct lnet_remotenet *rnet, struct lnet_route *route) +{ + struct lnet_peer_net *lpn; + unsigned int offset = 0; + unsigned int len = 0; + struct list_head *e; + time64_t now; + + lnet_shuffle_seed(); + + list_for_each(e, &rnet->lrn_routes) + len++; + + /* + * Randomly adding routes to the list is done to ensure that when + * different nodes are using the same list of routers, they end up + * preferring different routers. 
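The randomized placement described in this comment can be modelled in a few lines. A userspace sketch, where rand() stands in for the kernel's get_random_u32_below() and the singly linked list is illustrative:

#include <stdlib.h>

struct route_ex { struct route_ex *next; int id; };

/* Insert r at a uniformly random position in a list of len nodes.
 * Seed with srand() before first use in a real program. */
void insert_random(struct route_ex **head, struct route_ex *r, int len)
{
	int offset = rand() % (len + 1);	/* 0..len inclusive */
	struct route_ex **p = head;

	while (offset-- > 0)
		p = &(*p)->next;	/* walk to the chosen link slot */
	r->next = *p;
	*p = r;
}

Because each of the len + 1 link slots is equally likely, nodes configured with an identical router list still end up preferring different routers, which spreads load without any coordination.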
+ */ + offset = get_random_u32_below(len + 1); + list_for_each(e, &rnet->lrn_routes) { + if (offset == 0) + break; + offset--; + } + list_add(&route->lr_list, e); + /* + * force a router check on the gateway to make sure the route is + * alive + */ + now = ktime_get_real_seconds(); + list_for_each_entry(lpn, &route->lr_gateway->lp_peer_nets, + lpn_peer_nets) { + lpn->lpn_next_ping = now; + } + + the_lnet.ln_remote_nets_version++; + + /* add the route on the gateway list */ + list_add(&route->lr_gwlist, &route->lr_gateway->lp_routes); + + /* take a router reference count on the gateway */ + lnet_rtr_addref_locked(route->lr_gateway); +} + +int +lnet_add_route(__u32 net, __u32 hops, struct lnet_nid *gateway, + __u32 priority, __u32 sensitivity) +{ + struct list_head *route_entry; + struct lnet_remotenet *rnet; + struct lnet_remotenet *rnet2; + struct lnet_route *route; + struct lnet_peer_ni *lpni; + struct lnet_peer *gw; + int add_route; + int rc; + + CDEBUG(D_NET, "Add route: remote net %s hops %d priority %u gw %s\n", + libcfs_net2str(net), hops, priority, libcfs_nidstr(gateway)); + + if (LNET_NID_IS_ANY(gateway) || + nid_is_lo0(gateway) || + net == LNET_NET_ANY || + LNET_NETTYP(net) == LOLND || + LNET_NID_NET(gateway) == net || + (hops != LNET_UNDEFINED_HOPS && (hops < 1 || hops > 255))) + return -EINVAL; + + /* it's a local network */ + if (lnet_islocalnet(net)) + return -EEXIST; + + if (!lnet_islocalnet(LNET_NID_NET(gateway))) { + CERROR("Cannot add route with gateway %s. There is no local interface configured on LNet %s\n", + libcfs_nidstr(gateway), + libcfs_net2str(LNET_NID_NET(gateway))); + return -EHOSTUNREACH; + } + + /* Assume net, route, all new */ + LIBCFS_ALLOC(route, sizeof(*route)); + LIBCFS_ALLOC(rnet, sizeof(*rnet)); + if (route == NULL || rnet == NULL) { + CERROR("Out of memory creating route %s %d %s\n", + libcfs_net2str(net), hops, libcfs_nidstr(gateway)); + if (route != NULL) + LIBCFS_FREE(route, sizeof(*route)); + if (rnet != NULL) + LIBCFS_FREE(rnet, sizeof(*rnet)); + return -ENOMEM; + } + + INIT_LIST_HEAD(&rnet->lrn_routes); + rnet->lrn_net = net; + /* store the local and remote net that the route represents */ + route->lr_lnet = LNET_NID_NET(gateway); + route->lr_net = net; + route->lr_nid = *gateway; + route->lr_priority = priority; + route->lr_hops = hops; + if (lnet_peers_start_down()) + atomic_set(&route->lr_alive, 0); + else + atomic_set(&route->lr_alive, 1); + + lnet_net_lock(LNET_LOCK_EX); + + /* + * lnet_nid2peerni_ex() grabs a ref on the lpni. 
We will need to + * lose that once we're done + */ + lpni = lnet_nid2peerni_ex(gateway, LNET_LOCK_EX); + if (IS_ERR(lpni)) { + lnet_net_unlock(LNET_LOCK_EX); + + LIBCFS_FREE(route, sizeof(*route)); + LIBCFS_FREE(rnet, sizeof(*rnet)); + + rc = PTR_ERR(lpni); + CERROR("Error %d creating route %s %d %s\n", rc, + libcfs_net2str(net), hops, + libcfs_nidstr(gateway)); + return rc; + } + + LASSERT(lpni->lpni_peer_net && lpni->lpni_peer_net->lpn_peer); + gw = lpni->lpni_peer_net->lpn_peer; + + route->lr_gateway = gw; + + rnet2 = lnet_find_rnet_locked(net); + if (rnet2 == NULL) { + /* new network */ + list_add_tail(&rnet->lrn_list, lnet_net2rnethash(net)); + rnet2 = rnet; + } + + /* Search for a duplicate route (it's a NOOP if it is) */ + add_route = 1; + list_for_each(route_entry, &rnet2->lrn_routes) { + struct lnet_route *route2; + + route2 = list_entry(route_entry, struct lnet_route, lr_list); + if (route2->lr_gateway == route->lr_gateway) { + add_route = 0; + break; + } + + /* our lookups must be true */ + LASSERT(!nid_same(&route2->lr_gateway->lp_primary_nid, + gateway)); + } + + /* + * It is possible to add multiple routes through the same peer, + * but it'll be using a different NID of that peer. When the + * gateway is discovered, discovery will consolidate the different + * peers into one peer. In this case the discovery code will have + * to move the routes from the peer that's being deleted to the + * consolidated peer lp_routes list + */ + if (add_route) { + gw->lp_health_sensitivity = sensitivity; + lnet_add_route_to_rnet(rnet2, route); + if (lnet_peer_discovery_disabled) + CWARN("Consider turning discovery on to enable full Multi-Rail routing functionality\n"); + } + + /* + * get rid of the reference on the lpni. + */ + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(LNET_LOCK_EX); + + /* If avoid_asym_router_failure is enabled and hop count is not + * set to 1 for a route that is actually single-hop, then the + * feature will fail to prevent the router from being selected + * if it is missing a NI on the remote network due to misconfiguration. + */ + if (avoid_asym_router_failure && hops == LNET_UNDEFINED_HOPS) + CWARN("Use hops = 1 for a single-hop route when avoid_asym_router_failure feature is enabled\n"); + + rc = 0; + + if (!add_route) { + rc = -EEXIST; + LIBCFS_FREE(route, sizeof(*route)); + } + + if (rnet != rnet2) + LIBCFS_FREE(rnet, sizeof(*rnet)); + + /* kick start the monitor thread to handle the added route */ + complete(&the_lnet.ln_mt_wait_complete); + + return rc; +} + +void +lnet_del_route_from_rnet(struct lnet_nid *gw_nid, + struct list_head *route_list, + struct list_head *zombies) +{ + struct lnet_peer *gateway; + struct lnet_route *route; + struct lnet_route *tmp; + + list_for_each_entry_safe(route, tmp, route_list, lr_list) { + gateway = route->lr_gateway; + if (gw_nid && !nid_same(gw_nid, &gateway->lp_primary_nid)) + continue; + + /* + * move to zombie to delete outside the lock + * Note that this function is called with the + * ln_api_mutex held as well as the exclusive net + * lock. Adding to the remote net list happens + * under the same conditions. 
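The deferred deletion mentioned in this comment is a common kernel pattern: detach entries onto a private list while holding the lock, then free them after unlocking so the free path never runs under the lock. A kernel-style sketch of the shape (purge() and struct item are hypothetical, not LNet code):

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct item { struct list_head link; };

static void purge(struct list_head *live, spinlock_t *lock)
{
	LIST_HEAD(zombies);
	struct item *it, *tmp;

	spin_lock(lock);
	list_splice_init(live, &zombies);	/* detach under the lock */
	spin_unlock(lock);

	list_for_each_entry_safe(it, tmp, &zombies, link) {
		list_del(&it->link);
		kfree(it);			/* free outside the lock */
	}
}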
Same goes for the
+		 * gateway router list.
+		 */
+		list_move(&route->lr_list, zombies);
+		the_lnet.ln_remote_nets_version++;
+
+		list_del(&route->lr_gwlist);
+		lnet_rtr_decref_locked(gateway);
+	}
+}
+
+int
+lnet_del_route(__u32 net, struct lnet_nid *gw)
+{
+	LIST_HEAD(rnet_zombies);
+	struct lnet_remotenet *rnet;
+	struct lnet_remotenet *tmp;
+	struct list_head *rn_list;
+	struct lnet_peer_ni *lpni;
+	struct lnet_route *route;
+	struct lnet_nid gw_nid;
+	LIST_HEAD(zombies);
+	struct lnet_peer *lp = NULL;
+	int i = 0;
+
+	CDEBUG(D_NET, "Del route: net %s : gw %s\n",
+	       libcfs_net2str(net), libcfs_nidstr(gw));
+
+	/* NB Caller may specify either all routes via the given gateway
+	 * or a specific route entry (actual NIDs) */
+
+	lnet_net_lock(LNET_LOCK_EX);
+
+	if (gw)
+		lpni = lnet_peer_ni_find_locked(gw);
+	else
+		lpni = NULL;
+	if (lpni) {
+		lp = lpni->lpni_peer_net->lpn_peer;
+		LASSERT(lp);
+		gw_nid = lp->lp_primary_nid;
+		gw = &gw_nid;
+		lnet_peer_ni_decref_locked(lpni);
+	}
+
+	if (net != LNET_NET_ANY) {
+		rnet = lnet_find_rnet_locked(net);
+		if (!rnet) {
+			lnet_net_unlock(LNET_LOCK_EX);
+			return -ENOENT;
+		}
+		lnet_del_route_from_rnet(gw, &rnet->lrn_routes,
+					 &zombies);
+		if (list_empty(&rnet->lrn_routes))
+			list_move(&rnet->lrn_list, &rnet_zombies);
+		goto delete_zombies;
+	}
+
+	for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) {
+		rn_list = &the_lnet.ln_remote_nets_hash[i];
+
+		list_for_each_entry_safe(rnet, tmp, rn_list, lrn_list) {
+			lnet_del_route_from_rnet(gw, &rnet->lrn_routes,
+						 &zombies);
+			if (list_empty(&rnet->lrn_routes))
+				list_move(&rnet->lrn_list, &rnet_zombies);
+		}
+	}
+
+delete_zombies:
+	/*
+	 * Check whether any routes remain on the gateway. If none are
+	 * left, set the peer's lp_disc_net_id to 0 (invalid) so that if
+	 * more routes are added to that gateway later, the discovery
+	 * process starts from scratch.
+	 */
+	if (lpni) {
+		if (list_empty(&lp->lp_routes))
+			lp->lp_disc_net_id = 0;
+	}
+
+	lnet_net_unlock(LNET_LOCK_EX);
+
+	while (!list_empty(&zombies)) {
+		route = list_first_entry(&zombies, struct lnet_route, lr_list);
+		list_del(&route->lr_list);
+		LIBCFS_FREE(route, sizeof(*route));
+	}
+
+	while (!list_empty(&rnet_zombies)) {
+		rnet = list_first_entry(&rnet_zombies, struct lnet_remotenet,
+					lrn_list);
+		list_del(&rnet->lrn_list);
+		LIBCFS_FREE(rnet, sizeof(*rnet));
+	}
+
+	return 0;
+}
+
+void
+lnet_destroy_routes(void)
+{
+	lnet_del_route(LNET_NET_ANY, NULL);
+}
+
+int lnet_get_rtr_pool_cfg(int cpt, struct lnet_ioctl_pool_cfg *pool_cfg)
+{
+	struct lnet_rtrbufpool *rbp;
+	int i, rc = -ENOENT, j;
+
+	if (the_lnet.ln_rtrpools == NULL)
+		return rc;
+
+	cfs_percpt_for_each(rbp, i, the_lnet.ln_rtrpools) {
+		if (i != cpt)
+			continue;
+
+		lnet_net_lock(i);
+		for (j = 0; j < LNET_NRBPOOLS; j++) {
+			pool_cfg->pl_pools[j].pl_npages = rbp[j].rbp_npages;
+			pool_cfg->pl_pools[j].pl_nbuffers = rbp[j].rbp_nbuffers;
+			pool_cfg->pl_pools[j].pl_credits = rbp[j].rbp_credits;
+			pool_cfg->pl_pools[j].pl_mincredits = rbp[j].rbp_mincredits;
+		}
+		lnet_net_unlock(i);
+		rc = 0;
+		break;
+	}
+
+	lnet_net_lock(LNET_LOCK_EX);
+	pool_cfg->pl_routing = the_lnet.ln_routing;
+	lnet_net_unlock(LNET_LOCK_EX);
+
+	return rc;
+}
+
+int
+lnet_get_route(int idx, __u32 *net, __u32 *hops, lnet_nid_t *gateway,
+	       __u32 *flags, __u32 *priority, __u32 *sensitivity)
+{
+	struct lnet_remotenet *rnet;
+	struct list_head *rn_list;
+	struct lnet_route *route;
+	struct list_head *e1;
+	struct list_head *e2;
+	int cpt;
+	int i;
+
+	cpt = lnet_net_lock_current();
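lnet_get_route(), whose body follows, hands out one route per call by re-walking the hash buckets until an index counter hits zero; callers appear to dump the table by passing idx = 0, 1, 2, ... until -ENOENT comes back. A compact model of that walk over a flat table:

#include <errno.h>

/* Illustrative only: return the idx-th element, the way the bucket walk
 * below returns the idx-th route. O(n) per call, so a full dump is
 * O(n^2), but no cursor state is kept between calls. */
int get_nth(const int *table, int n, int idx, int *out)
{
	for (int i = 0; i < n; i++) {
		if (idx-- == 0) {
			*out = table[i];
			return 0;
		}
	}
	return -ENOENT;	/* index past the end: caller stops iterating */
}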
+ + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) { + rn_list = &the_lnet.ln_remote_nets_hash[i]; + list_for_each(e1, rn_list) { + rnet = list_entry(e1, struct lnet_remotenet, lrn_list); + + list_for_each(e2, &rnet->lrn_routes) { + route = list_entry(e2, struct lnet_route, + lr_list); + + if (idx-- == 0) { + *net = rnet->lrn_net; + *gateway = lnet_nid_to_nid4(&route->lr_nid); + *hops = route->lr_hops; + *priority = route->lr_priority; + *sensitivity = route->lr_gateway-> + lp_health_sensitivity; + if (lnet_is_route_alive(route)) + *flags |= LNET_RT_ALIVE; + else + *flags &= ~LNET_RT_ALIVE; + if (route->lr_single_hop) + *flags &= ~LNET_RT_MULTI_HOP; + else + *flags |= LNET_RT_MULTI_HOP; + lnet_net_unlock(cpt); + return 0; + } + } + } + } + + lnet_net_unlock(cpt); + return -ENOENT; +} + +static void +lnet_wait_known_routerstate(void) +{ + struct lnet_peer *rtr; + struct list_head *entry; + int all_known; + + LASSERT(the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING); + + for (;;) { + int cpt = lnet_net_lock_current(); + + all_known = 1; + list_for_each(entry, &the_lnet.ln_routers) { + rtr = list_entry(entry, struct lnet_peer, + lp_rtr_list); + + spin_lock(&rtr->lp_lock); + + if ((rtr->lp_state & LNET_PEER_RTR_DISCOVERED) == 0) { + all_known = 0; + spin_unlock(&rtr->lp_lock); + break; + } + spin_unlock(&rtr->lp_lock); + } + + lnet_net_unlock(cpt); + + if (all_known) + return; + + schedule_timeout_uninterruptible(cfs_time_seconds(1)); + } +} + +static inline bool +lnet_net_set_status_locked(struct lnet_net *net, __u32 status) +{ + struct lnet_ni *ni; + bool update = false; + + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) + if (lnet_ni_set_status(ni, status)) + update = true; + + return update; +} + +static bool +lnet_update_ni_status_locked(void) +{ + struct lnet_net *net; + struct lnet_ni *ni; + bool push = false; + time64_t now; + time64_t timeout; + + LASSERT(the_lnet.ln_routing); + + timeout = router_ping_timeout + alive_router_check_interval; + + now = ktime_get_seconds(); + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + if (net->net_lnd->lnd_type == LOLND) + continue; + + if (now < net->net_last_alive + timeout) + goto check_ni_fatal; + + spin_lock(&net->net_lock); + /* re-check with lock */ + if (now < net->net_last_alive + timeout) { + spin_unlock(&net->net_lock); + goto check_ni_fatal; + } + spin_unlock(&net->net_lock); + + /* + * if the net didn't receive any traffic for past the + * timeout on any of its constituent NIs, then mark all + * the NIs down. + */ + if (lnet_net_set_status_locked(net, LNET_NI_STATUS_DOWN)) { + push = true; + continue; + } + +check_ni_fatal: + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + /* lnet_ni_set_status() will perform the same check of + * ni_status while holding the ni lock. We can safely + * check ni_status without that lock because it is only + * written to under net_lock/EX and our caller is + * holding a net lock. + */ + if (atomic_read(&ni->ni_fatal_error_on) && + ni->ni_status && + ni->ni_status->ns_status != LNET_NI_STATUS_DOWN && + lnet_ni_set_status(ni, LNET_NI_STATUS_DOWN)) + push = true; + } + } + + return push; +} + +void lnet_wait_router_start(void) +{ + if (check_routers_before_use) { + /* Note that a helpful side-effect of pinging all known routers + * at startup is that it makes them drop stale connections they + * may have to a previous instance of me. 
 */
+		lnet_wait_known_routerstate();
+	}
+}
+
+/*
+ * This function is called from the monitor thread to check if there are
+ * any active routers that need to be checked.
+ */
+bool lnet_router_checker_active(void)
+{
+	/* Router Checker thread needs to run when routing is enabled in
+	 * order to call lnet_update_ni_status_locked() */
+	if (the_lnet.ln_routing)
+		return true;
+
+	return !list_empty(&the_lnet.ln_routers) &&
+		alive_router_check_interval > 0;
+}
+
+void
+lnet_check_routers(void)
+{
+	struct lnet_peer_net *first_lpn;
+	struct lnet_peer_net *lpn;
+	struct lnet_peer_ni *lpni;
+	struct list_head *entry;
+	struct lnet_peer *rtr;
+	bool push = false;
+	bool needs_ping;
+	bool found_lpn;
+	__u64 version;
+	__u32 net_id;
+	time64_t now;
+	int cpt;
+	int rc;
+
+	cpt = lnet_net_lock_current();
+rescan:
+	version = the_lnet.ln_routers_version;
+
+	list_for_each(entry, &the_lnet.ln_routers) {
+		rtr = list_entry(entry, struct lnet_peer,
+				 lp_rtr_list);
+
+		/* If we're currently discovering the peer then don't
+		 * issue another discovery
+		 */
+		if (rtr->lp_state & LNET_PEER_RTR_DISCOVERY)
+			continue;
+
+		now = ktime_get_real_seconds();
+
+		/* find the next local peer net which needs to be ping'd */
+		needs_ping = false;
+		first_lpn = NULL;
+		found_lpn = false;
+		net_id = rtr->lp_disc_net_id;
+		do {
+			lpn = lnet_get_next_peer_net_locked(rtr, net_id);
+			if (!lpn) {
+				CERROR("gateway %s has no networks\n",
+				       libcfs_nidstr(&rtr->lp_primary_nid));
+				break;
+			}
+
+			/* We looped back to the first peer net */
+			if (first_lpn == lpn)
+				break;
+			if (!first_lpn)
+				first_lpn = lpn;
+
+			net_id = lpn->lpn_net_id;
+			if (!lnet_islocalnet_locked(net_id))
+				continue;
+
+			found_lpn = true;
+
+			CDEBUG(D_NET, "rtr %s(%p) %s(%p) next ping %lld\n",
+			       libcfs_nidstr(&rtr->lp_primary_nid), rtr,
+			       libcfs_net2str(net_id), lpn,
+			       lpn->lpn_next_ping);
+
+			needs_ping = now >= lpn->lpn_next_ping;
+
+		} while (!needs_ping);
+
+		if (!found_lpn || !lpn) {
+			CERROR("no local network found for gateway %s\n",
+			       libcfs_nidstr(&rtr->lp_primary_nid));
+			continue;
+		}
+
+		if (!needs_ping)
+			continue;
+
+		spin_lock(&rtr->lp_lock);
+		/* make sure we fully discover the router */
+		rtr->lp_state &= ~LNET_PEER_NIDS_UPTODATE;
+		rtr->lp_state |= LNET_PEER_FORCE_PING | LNET_PEER_FORCE_PUSH |
+			LNET_PEER_RTR_DISCOVERY;
+		spin_unlock(&rtr->lp_lock);
+
+		/* find the peer_ni associated with the primary NID */
+		lpni = lnet_peer_get_ni_locked(
+			rtr, lnet_nid_to_nid4(&rtr->lp_primary_nid));
+		if (!lpni) {
+			CDEBUG(D_NET, "Expected to find an lpni for %s, but none found\n",
+			       libcfs_nidstr(&rtr->lp_primary_nid));
+			continue;
+		}
+		lnet_peer_ni_addref_locked(lpni);
+
+		/* specify the net to use */
+		rtr->lp_disc_net_id = lpn->lpn_net_id;
+
+		/* discover the router */
+		CDEBUG(D_NET, "discover %s, cpt = %d\n",
+		       libcfs_nidstr(&lpni->lpni_nid), cpt);
+		rc = lnet_discover_peer_locked(lpni, cpt, false);
+
+		/* drop ref taken above */
+		lnet_peer_ni_decref_locked(lpni);
+
+		if (!rc)
+			lpn->lpn_next_ping = now + alive_router_check_interval;
+		else
+			CERROR("Failed to discover router %s\n",
+			       libcfs_nidstr(&rtr->lp_primary_nid));
+
+		/* NB cpt lock was dropped in lnet_discover_peer_locked() */
+		if (version != the_lnet.ln_routers_version) {
+			/* the routers list has changed */
+			goto rescan;
+		}
+	}
+
+	if (the_lnet.ln_routing)
+		push = lnet_update_ni_status_locked();
+
+	lnet_net_unlock(cpt);
+
+	/* if the status of the ni changed update the peers */
+	if (push)
+		lnet_push_update_to_peers(1);
+}
+
+void
+lnet_destroy_rtrbuf(struct lnet_rtrbuf *rb,
int npages) +{ + int sz = offsetof(struct lnet_rtrbuf, rb_kiov[npages]); + + while (--npages >= 0) + __free_page(rb->rb_kiov[npages].bv_page); + + LIBCFS_FREE(rb, sz); +} + +static struct lnet_rtrbuf * +lnet_new_rtrbuf(struct lnet_rtrbufpool *rbp, int cpt) +{ + int npages = rbp->rbp_npages; + int sz = offsetof(struct lnet_rtrbuf, rb_kiov[npages]); + struct page *page; + struct lnet_rtrbuf *rb; + int i; + + LIBCFS_CPT_ALLOC(rb, lnet_cpt_table(), cpt, sz); + if (rb == NULL) + return NULL; + + rb->rb_pool = rbp; + + for (i = 0; i < npages; i++) { + page = cfs_page_cpt_alloc(lnet_cpt_table(), cpt, GFP_KERNEL | + __GFP_ZERO | __GFP_NORETRY); + if (page == NULL) { + while (--i >= 0) + __free_page(rb->rb_kiov[i].bv_page); + + LIBCFS_FREE(rb, sz); + return NULL; + } + + rb->rb_kiov[i].bv_len = PAGE_SIZE; + rb->rb_kiov[i].bv_offset = 0; + rb->rb_kiov[i].bv_page = page; + } + + return rb; +} + +static void +lnet_rtrpool_free_bufs(struct lnet_rtrbufpool *rbp, int cpt) +{ + int npages = rbp->rbp_npages; + struct lnet_rtrbuf *rb; + LIST_HEAD(tmp); + + if (rbp->rbp_nbuffers == 0) /* not initialized or already freed */ + return; + + lnet_net_lock(cpt); + list_splice_init(&rbp->rbp_msgs, &tmp); + lnet_drop_routed_msgs_locked(&tmp, cpt); + list_splice_init(&rbp->rbp_bufs, &tmp); + rbp->rbp_req_nbuffers = 0; + rbp->rbp_nbuffers = rbp->rbp_credits = 0; + rbp->rbp_mincredits = 0; + lnet_net_unlock(cpt); + + /* Free buffers on the free list. */ + while (!list_empty(&tmp)) { + rb = list_entry(tmp.next, struct lnet_rtrbuf, rb_list); + list_del(&rb->rb_list); + lnet_destroy_rtrbuf(rb, npages); + } +} + +static int +lnet_rtrpool_adjust_bufs(struct lnet_rtrbufpool *rbp, int nbufs, int cpt) +{ + LIST_HEAD(rb_list); + struct lnet_rtrbuf *rb; + int num_rb; + int num_buffers = 0; + int old_req_nbufs; + int npages = rbp->rbp_npages; + + lnet_net_lock(cpt); + /* If we are called for less buffers than already in the pool, we + * just lower the req_nbuffers number and excess buffers will be + * thrown away as they are returned to the free list. Credits + * then get adjusted as well. + * If we already have enough buffers allocated to serve the + * increase requested, then we can treat that the same way as we + * do the decrease. */ + num_rb = nbufs - rbp->rbp_nbuffers; + if (nbufs <= rbp->rbp_req_nbuffers || num_rb <= 0) { + rbp->rbp_req_nbuffers = nbufs; + lnet_net_unlock(cpt); + return 0; + } + /* store the older value of rbp_req_nbuffers and then set it to + * the new request to prevent lnet_return_rx_credits_locked() from + * freeing buffers that we need to keep around */ + old_req_nbufs = rbp->rbp_req_nbuffers; + rbp->rbp_req_nbuffers = nbufs; + lnet_net_unlock(cpt); + + /* allocate the buffers on a local list first. If all buffers are + * allocated successfully then join this list to the rbp buffer + * list. If not then free all allocated buffers. */ + while (num_rb-- > 0) { + rb = lnet_new_rtrbuf(rbp, cpt); + if (rb == NULL) { + CERROR("lnet: error allocating %ux%u page router buffers on CPT %u: rc = %d\n", + nbufs, npages, cpt, -ENOMEM); + + lnet_net_lock(cpt); + rbp->rbp_req_nbuffers = old_req_nbufs; + lnet_net_unlock(cpt); + + goto failed; + } + + list_add(&rb->rb_list, &rb_list); + num_buffers++; + } + + lnet_net_lock(cpt); + + list_splice_tail(&rb_list, &rbp->rbp_bufs); + rbp->rbp_nbuffers += num_buffers; + rbp->rbp_credits += num_buffers; + rbp->rbp_mincredits = rbp->rbp_credits; + /* We need to schedule blocked msg using the newly + * added buffers. 
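The growth path above builds the new buffers on a private list and only splices them into the pool when the whole batch has allocated; otherwise the partial batch is freed and the request counter rolled back. A kernel-style sketch of that all-or-nothing shape (grow_pool() and struct buf are hypothetical, not LNet code):

#include <linux/list.h>
#include <linux/slab.h>

struct buf { struct list_head link; };

/* Allocate count buffers; commit all of them to pool or none at all. */
static int grow_pool(struct list_head *pool, int count)
{
	LIST_HEAD(batch);
	struct buf *b, *tmp;

	while (count-- > 0) {
		b = kzalloc(sizeof(*b), GFP_KERNEL);
		if (!b)
			goto undo;		/* free the partial batch */
		list_add(&b->link, &batch);
	}
	list_splice_tail(&batch, pool);		/* success: publish everything */
	return 0;

undo:
	list_for_each_entry_safe(b, tmp, &batch, link) {
		list_del(&b->link);
		kfree(b);
	}
	return -ENOMEM;
}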
*/ + while (!list_empty(&rbp->rbp_bufs) && + !list_empty(&rbp->rbp_msgs)) + lnet_schedule_blocked_locked(rbp); + + lnet_net_unlock(cpt); + + return 0; + +failed: + while (!list_empty(&rb_list)) { + rb = list_entry(rb_list.next, struct lnet_rtrbuf, rb_list); + list_del(&rb->rb_list); + lnet_destroy_rtrbuf(rb, npages); + } + + return -ENOMEM; +} + +static void +lnet_rtrpool_init(struct lnet_rtrbufpool *rbp, int npages) +{ + INIT_LIST_HEAD(&rbp->rbp_msgs); + INIT_LIST_HEAD(&rbp->rbp_bufs); + + rbp->rbp_npages = npages; + rbp->rbp_credits = 0; + rbp->rbp_mincredits = 0; +} + +void +lnet_rtrpools_free(int keep_pools) +{ + struct lnet_rtrbufpool *rtrp; + int i; + + if (the_lnet.ln_rtrpools == NULL) /* uninitialized or freed */ + return; + + cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) { + lnet_rtrpool_free_bufs(&rtrp[LNET_TINY_BUF_IDX], i); + lnet_rtrpool_free_bufs(&rtrp[LNET_SMALL_BUF_IDX], i); + lnet_rtrpool_free_bufs(&rtrp[LNET_LARGE_BUF_IDX], i); + } + + if (!keep_pools) { + cfs_percpt_free(the_lnet.ln_rtrpools); + the_lnet.ln_rtrpools = NULL; + } +} + +static int +lnet_nrb_tiny_calculate(void) +{ + int nrbs = LNET_NRB_TINY; + + if (tiny_router_buffers < 0) { + LCONSOLE_ERROR_MSG(0x10c, + "tiny_router_buffers=%d invalid when " + "routing enabled\n", tiny_router_buffers); + return -EINVAL; + } + + if (tiny_router_buffers > 0) + nrbs = tiny_router_buffers; + + nrbs /= LNET_CPT_NUMBER; + return max(nrbs, LNET_NRB_TINY_MIN); +} + +static int +lnet_nrb_small_calculate(void) +{ + int nrbs = LNET_NRB_SMALL; + + if (small_router_buffers < 0) { + LCONSOLE_ERROR_MSG(0x10c, + "small_router_buffers=%d invalid when " + "routing enabled\n", small_router_buffers); + return -EINVAL; + } + + if (small_router_buffers > 0) + nrbs = small_router_buffers; + + nrbs /= LNET_CPT_NUMBER; + return max(nrbs, LNET_NRB_SMALL_MIN); +} + +static int +lnet_nrb_large_calculate(void) +{ + int nrbs = LNET_NRB_LARGE; + + if (large_router_buffers < 0) { + LCONSOLE_ERROR_MSG(0x10c, + "large_router_buffers=%d invalid when " + "routing enabled\n", large_router_buffers); + return -EINVAL; + } + + if (large_router_buffers > 0) + nrbs = large_router_buffers; + + nrbs /= LNET_CPT_NUMBER; + return max(nrbs, LNET_NRB_LARGE_MIN); +} + +int +lnet_rtrpools_alloc(int im_a_router) +{ + struct lnet_rtrbufpool *rtrp; + int nrb_tiny; + int nrb_small; + int nrb_large; + int rc; + int i; + + if (!strcmp(forwarding, "")) { + /* not set either way */ + if (!im_a_router) + return 0; + } else if (!strcmp(forwarding, "disabled")) { + /* explicitly disabled */ + return 0; + } else if (!strcmp(forwarding, "enabled")) { + /* explicitly enabled */ + } else { + rc = -EINVAL; + LCONSOLE_ERROR_MSG(0x10b, + "lnet: forwarding='%s' not set to either 'enabled' or 'disabled': rc = %d\n", + forwarding, rc); + return rc; + } + + nrb_tiny = lnet_nrb_tiny_calculate(); + if (nrb_tiny < 0) + return -EINVAL; + + nrb_small = lnet_nrb_small_calculate(); + if (nrb_small < 0) + return -EINVAL; + + nrb_large = lnet_nrb_large_calculate(); + if (nrb_large < 0) + return -EINVAL; + + the_lnet.ln_rtrpools = cfs_percpt_alloc(lnet_cpt_table(), + LNET_NRBPOOLS * + sizeof(struct lnet_rtrbufpool)); + if (the_lnet.ln_rtrpools == NULL) { + rc = -ENOMEM; + LCONSOLE_ERROR_MSG(0x10c, + "lnet: error allocating router buffer pool: rc = %d\n", + rc); + return rc; + } + + cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) { + lnet_rtrpool_init(&rtrp[LNET_TINY_BUF_IDX], 0); + rc = lnet_rtrpool_adjust_bufs(&rtrp[LNET_TINY_BUF_IDX], + nrb_tiny, i); + if (rc) + goto failed; + + 
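For a feel for the numbers the *_calculate() functions above produce, here is the arithmetic spelled out, assuming an LNET_MTU of 1 MiB and 4 KiB pages (both match current sources, but are stated here as assumptions):

#include <stdio.h>

#define EX_PAGE_SIZE 4096	/* assumed */
#define EX_LNET_MTU (1 << 20)	/* assumed: 1 MiB */

int main(void)
{
	int large_pages = (EX_LNET_MTU + EX_PAGE_SIZE - 1) / EX_PAGE_SIZE;
	int cpts = 4;		/* example CPT count */

	/* default totals are min * 4, split evenly across CPTs and
	 * floored at the per-CPT minimum */
	printf("tiny:  %d total -> %d per CPT (min 512)\n",
	       512 * 4, 512 * 4 / cpts);
	printf("small: %d total -> %d per CPT (min 4096)\n",
	       4096 * 4, 4096 * 4 / cpts);
	printf("large: %d total -> %d per CPT (min 256), %d pages each\n",
	       256 * 4, 256 * 4 / cpts, large_pages);	/* 256 pages */
	return 0;
}

With four CPTs the defaults land exactly on the per-CPT minimums; fewer CPTs get proportionally more buffers per CPT, and more CPTs are clamped up to the minimum.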
		lnet_rtrpool_init(&rtrp[LNET_SMALL_BUF_IDX],
+				  LNET_NRB_SMALL_PAGES);
+		rc = lnet_rtrpool_adjust_bufs(&rtrp[LNET_SMALL_BUF_IDX],
+					      nrb_small, i);
+		if (rc)
+			goto failed;
+
+		lnet_rtrpool_init(&rtrp[LNET_LARGE_BUF_IDX],
+				  LNET_NRB_LARGE_PAGES);
+		rc = lnet_rtrpool_adjust_bufs(&rtrp[LNET_LARGE_BUF_IDX],
+					      nrb_large, i);
+		if (rc)
+			goto failed;
+	}
+
+	lnet_net_lock(LNET_LOCK_EX);
+	the_lnet.ln_routing = 1;
+	lnet_net_unlock(LNET_LOCK_EX);
+	complete(&the_lnet.ln_mt_wait_complete);
+	return 0;
+
+ failed:
+	lnet_rtrpools_free(0);
+	return rc;
+}
+
+static int
+lnet_rtrpools_adjust_helper(int tiny, int small, int large)
+{
+	int nrb = 0;
+	int rc = 0;
+	int i;
+	struct lnet_rtrbufpool *rtrp;
+
+	/* If the provided values for each buffer pool are different from the
+	 * configured values, we need to take action. */
+	if (tiny >= 0) {
+		tiny_router_buffers = tiny;
+		nrb = lnet_nrb_tiny_calculate();
+		cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) {
+			rc = lnet_rtrpool_adjust_bufs(&rtrp[LNET_TINY_BUF_IDX],
+						      nrb, i);
+			if (rc != 0)
+				return rc;
+		}
+	}
+	if (small >= 0) {
+		small_router_buffers = small;
+		nrb = lnet_nrb_small_calculate();
+		cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) {
+			rc = lnet_rtrpool_adjust_bufs(&rtrp[LNET_SMALL_BUF_IDX],
+						      nrb, i);
+			if (rc != 0)
+				return rc;
+		}
+	}
+	if (large >= 0) {
+		large_router_buffers = large;
+		nrb = lnet_nrb_large_calculate();
+		cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) {
+			rc = lnet_rtrpool_adjust_bufs(&rtrp[LNET_LARGE_BUF_IDX],
+						      nrb, i);
+			if (rc != 0)
+				return rc;
+		}
+	}
+
+	return 0;
+}
+
+int
+lnet_rtrpools_adjust(int tiny, int small, int large)
+{
+	/* this function doesn't revert the changes if adding new buffers
+	 * failed. It's up to the user space caller to revert the
+	 * changes. */
+
+	if (!the_lnet.ln_routing)
+		return 0;
+
+	return lnet_rtrpools_adjust_helper(tiny, small, large);
+}
+
+int
+lnet_rtrpools_enable(void)
+{
+	int rc = 0;
+
+	if (the_lnet.ln_routing)
+		return 0;
+
+	if (the_lnet.ln_rtrpools == NULL)
+		/* If routing is turned off, and we have never
+		 * initialized the pools before, just call the
+		 * standard buffer pool allocation routine as
+		 * if we are just configuring this for the first
+		 * time. */
+		rc = lnet_rtrpools_alloc(1);
+	else
+		rc = lnet_rtrpools_adjust_helper(0, 0, 0);
+	if (rc != 0)
+		return rc;
+
+	lnet_net_lock(LNET_LOCK_EX);
+	the_lnet.ln_routing = 1;
+
+	the_lnet.ln_ping_target->pb_info.pi_features &=
+		~LNET_PING_FEAT_RTE_DISABLED;
+	lnet_net_unlock(LNET_LOCK_EX);
+
+	if (lnet_peer_discovery_disabled)
+		CWARN("Consider turning discovery on to enable full Multi-Rail routing functionality\n");
+
+	return rc;
+}
+
+void
+lnet_rtrpools_disable(void)
+{
+	if (!the_lnet.ln_routing)
+		return;
+
+	lnet_net_lock(LNET_LOCK_EX);
+	the_lnet.ln_routing = 0;
+	the_lnet.ln_ping_target->pb_info.pi_features |=
+		LNET_PING_FEAT_RTE_DISABLED;
+
+	tiny_router_buffers = 0;
+	small_router_buffers = 0;
+	large_router_buffers = 0;
+	lnet_net_unlock(LNET_LOCK_EX);
+	lnet_rtrpools_free(1);
+}
+
+static inline void
+lnet_notify_peer_down(struct lnet_ni *ni, struct lnet_nid *nid)
+{
+	if (ni->ni_net->net_lnd->lnd_notify_peer_down != NULL)
+		(ni->ni_net->net_lnd->lnd_notify_peer_down)(nid);
+}
+
+/*
+ * ni: local NI used to communicate with the peer
+ * nid: peer NID
+ * alive: true if peer is alive, false otherwise
+ * reset: reset health value. This is requested by the LND.
+ * when: notification time.
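Given the argument contract spelled out in this comment for lnet_notify(), which is defined just below, an LND's death notification might look like the following sketch. my_lnd_peer_died() is hypothetical; the lnet_notify() signature is the one in this file:

#include <linux/ktime.h>

/* Hypothetical LND callback: report a peer as down without forcing its
 * health value, so recovery follows the normal sensitivity machinery. */
static void my_lnd_peer_died(struct lnet_ni *ni, lnet_nid_t nid)
{
	lnet_notify(ni, nid, /* alive */ false, /* reset */ false,
		    ktime_get_seconds());
}

Passing reset = true instead would pin the peer NI's status directly (up with maximum health, or down), which the code below reserves for cases where the LND knows the interface state authoritatively.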
+ */ +int +lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, bool alive, bool reset, + time64_t when) +{ + struct lnet_peer_ni *lpni = NULL; + struct lnet_route *route; + struct lnet_peer *lp; + time64_t now = ktime_get_seconds(); + int cpt; + + LASSERT(!in_interrupt()); + + CDEBUG(D_NET, "%s notifying %s: %s\n", + (ni == NULL) ? "userspace" : libcfs_nidstr(&ni->ni_nid), + libcfs_nid2str(nid), alive ? "up" : "down"); + + if (ni != NULL && + LNET_NID_NET(&ni->ni_nid) != LNET_NIDNET(nid)) { + CWARN("Ignoring notification of %s %s by %s (different net)\n", + libcfs_nid2str(nid), alive ? "birth" : "death", + libcfs_nidstr(&ni->ni_nid)); + return -EINVAL; + } + + /* can't do predictions... */ + if (when > now) { + CWARN("Ignoring prediction from %s of %s %s %lld seconds in the future\n", + ni ? libcfs_nidstr(&ni->ni_nid) : "userspace", + libcfs_nid2str(nid), alive ? "up" : "down", when - now); + return -EINVAL; + } + + if (ni != NULL && !alive && /* LND telling me she's down */ + !auto_down) { /* auto-down disabled */ + CDEBUG(D_NET, "Auto-down disabled\n"); + return 0; + } + + /* must lock 0 since this is used for synchronization */ + lnet_net_lock(0); + + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + lnet_net_unlock(0); + return -ESHUTDOWN; + } + + lpni = lnet_find_peer_ni_locked(nid); + if (lpni == NULL) { + /* nid not found */ + lnet_net_unlock(0); + CDEBUG(D_NET, "%s not found\n", libcfs_nid2str(nid)); + return 0; + } + + if (alive) { + if (reset) { + lpni->lpni_ns_status = LNET_NI_STATUS_UP; + lnet_set_lpni_healthv_locked(lpni, + LNET_MAX_HEALTH_VALUE); + } else { + __u32 sensitivity = lpni->lpni_peer_net-> + lpn_peer->lp_health_sensitivity; + + lnet_inc_lpni_healthv_locked(lpni, + (sensitivity) ? sensitivity : + lnet_health_sensitivity); + } + } else if (reset) { + lpni->lpni_ns_status = LNET_NI_STATUS_DOWN; + } + + /* recalculate aliveness */ + alive = lnet_is_peer_ni_alive(lpni); + + lp = lpni->lpni_peer_net->lpn_peer; + /* If this is an LNet router then update route aliveness */ + if (lp->lp_rtr_refcount) { + if (reset) + /* reset flag indicates gateway peer went up or down */ + lp->lp_alive = alive; + + /* If discovery is disabled, locally or on the gateway, then + * any routes using lpni as next-hop need to be updated + * + * NB: We can get many notifications while a route is down, so + * we try and avoid the expensive net_lock/EX here for the + * common case of receiving duplicate lnet_notify() calls (i.e. + * only grab EX lock when we actually need to update the route + * aliveness). + */ + if (lnet_is_discovery_disabled(lp)) { + list_for_each_entry(route, &lp->lp_routes, lr_gwlist) { + if (nid_same(&route->lr_nid, &lpni->lpni_nid)) + lnet_set_route_aliveness(route, alive); + } + } + } + + lnet_net_unlock(0); + + if (ni != NULL && !alive) + lnet_notify_peer_down(ni, &lpni->lpni_nid); + + cpt = lpni->lpni_cpt; + lnet_net_lock(cpt); + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(cpt); + + return 0; +} +EXPORT_SYMBOL(lnet_notify); diff --git a/drivers/staging/lustrefsx/lnet/lnet/router_proc.c b/drivers/staging/lustrefsx/lnet/lnet/router_proc.c new file mode 100644 index 0000000000000..926891481d641 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/router_proc.c @@ -0,0 +1,902 @@ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ *
+ * This file is part of Lustre, https://wiki.whamcloud.com/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/uaccess.h>
+
+#include <libcfs/libcfs.h>
+#include <lnet/lib-lnet.h>
+
+/* This is really lnet_proc.c. You might need to update sanity test 215
+ * if any file format is changed. */
+
+#define LNET_LOFFT_BITS		(sizeof(loff_t) * 8)
+/*
+ * NB: max allowed LNET_CPT_BITS is 8 on 64-bit system and 2 on 32-bit system
+ */
+#define LNET_PROC_CPT_BITS	(LNET_CPT_BITS + 1)
+/* change version, 16 bits or 8 bits */
+#define LNET_PROC_VER_BITS	\
+	clamp_t(int, LNET_LOFFT_BITS / 4, 8, 16)
+
+#define LNET_PROC_HASH_BITS	LNET_PEER_HASH_BITS
+/*
+ * bits for peer hash offset
+ * NB: we don't use the highest bit of *ppos because it's signed
+ */
+#define LNET_PROC_HOFF_BITS	(LNET_LOFFT_BITS -	\
+				 LNET_PROC_CPT_BITS -	\
+				 LNET_PROC_VER_BITS -	\
+				 LNET_PROC_HASH_BITS - 1)
+/* bits for hash index + position */
+#define LNET_PROC_HPOS_BITS	(LNET_PROC_HASH_BITS + LNET_PROC_HOFF_BITS)
+/* bits for peer hash table + hash version */
+#define LNET_PROC_VPOS_BITS	(LNET_PROC_HPOS_BITS + LNET_PROC_VER_BITS)
+
+#define LNET_PROC_CPT_MASK	((1ULL << LNET_PROC_CPT_BITS) - 1)
+#define LNET_PROC_VER_MASK	((1ULL << LNET_PROC_VER_BITS) - 1)
+#define LNET_PROC_HASH_MASK	((1ULL << LNET_PROC_HASH_BITS) - 1)
+#define LNET_PROC_HOFF_MASK	((1ULL << LNET_PROC_HOFF_BITS) - 1)
+
+#define LNET_PROC_CPT_GET(pos)	\
+	(int)(((pos) >> LNET_PROC_VPOS_BITS) & LNET_PROC_CPT_MASK)
+
+#define LNET_PROC_VER_GET(pos)	\
+	(int)(((pos) >> LNET_PROC_HPOS_BITS) & LNET_PROC_VER_MASK)
+
+#define LNET_PROC_HASH_GET(pos)	\
+	(int)(((pos) >> LNET_PROC_HOFF_BITS) & LNET_PROC_HASH_MASK)
+
+#define LNET_PROC_HOFF_GET(pos)	\
+	(int)((pos) & LNET_PROC_HOFF_MASK)
+
+#define LNET_PROC_POS_MAKE(cpt, ver, hash, off)				     \
+	(((((loff_t)(cpt)) & LNET_PROC_CPT_MASK) << LNET_PROC_VPOS_BITS) |   \
+	 ((((loff_t)(ver)) & LNET_PROC_VER_MASK) << LNET_PROC_HPOS_BITS) |   \
+	 ((((loff_t)(hash)) & LNET_PROC_HASH_MASK) << LNET_PROC_HOFF_BITS) | \
+	 ((off) & LNET_PROC_HOFF_MASK))
+
+#define LNET_PROC_VERSION(v)	((unsigned int)((v) & LNET_PROC_VER_MASK))
+
+static int proc_lnet_stats(struct ctl_table *table, int write,
+			   void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int rc;
+	struct lnet_counters *ctrs;
+	struct lnet_counters_common common;
+	size_t nob = *lenp;
+	loff_t pos = *ppos;
+	int len;
+	char tmpstr[256];	/* 7 %u and 4 u64 */
+
+	if (write) {
+		lnet_counters_reset();
+		return 0;
+	}
+
+	/* read */
+
+	LIBCFS_ALLOC(ctrs, sizeof(*ctrs));
+	if (ctrs == NULL)
+		return -ENOMEM;
+
+	rc = lnet_counters_get(ctrs);
+	if (rc)
+		goto out_no_ctrs;
+
+	common = ctrs->lct_common;
+
+	len = scnprintf(tmpstr, sizeof(tmpstr),
+			"%u %u %u %u %u %u %u %llu %llu %llu %llu",
+			common.lcc_msgs_alloc, common.lcc_msgs_max,
+			common.lcc_errors,
+			common.lcc_send_count, common.lcc_recv_count,
+			common.lcc_route_count, common.lcc_drop_count,
+			common.lcc_send_length, common.lcc_recv_length,
common.lcc_route_length, common.lcc_drop_length); + + if (pos >= len) + rc = 0; + else + rc = cfs_trace_copyout_string(buffer, nob, + tmpstr + pos, "\n"); +out_no_ctrs: + LIBCFS_FREE(ctrs, sizeof(*ctrs)); + return rc; +} + +static int +proc_lnet_routes(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + const int tmpsiz = 256; + char *tmpstr; + char *s; + int rc = 0; + int len; + int ver; + int off; + + BUILD_BUG_ON(sizeof(loff_t) < 4); + + off = LNET_PROC_HOFF_GET(*ppos); + ver = LNET_PROC_VER_GET(*ppos); + + LASSERT(!write); + + if (*lenp == 0) + return 0; + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + if (*ppos == 0) { + s += scnprintf(s, tmpstr + tmpsiz - s, "Routing %s\n", + the_lnet.ln_routing ? "enabled" : "disabled"); + LASSERT(tmpstr + tmpsiz - s > 0); + + s += scnprintf(s, tmpstr + tmpsiz - s, "%-8s %4s %8s %7s %s\n", + "net", "hops", "priority", "state", "router"); + LASSERT(tmpstr + tmpsiz - s > 0); + + lnet_net_lock(0); + ver = (unsigned int)the_lnet.ln_remote_nets_version; + lnet_net_unlock(0); + *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); + } else { + struct list_head *n; + struct list_head *r; + struct lnet_route *route = NULL; + struct lnet_remotenet *rnet = NULL; + int skip = off - 1; + struct list_head *rn_list; + int i; + + lnet_net_lock(0); + + if (ver != LNET_PROC_VERSION(the_lnet.ln_remote_nets_version)) { + lnet_net_unlock(0); + LIBCFS_FREE(tmpstr, tmpsiz); + return -ESTALE; + } + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE && route == NULL; + i++) { + rn_list = &the_lnet.ln_remote_nets_hash[i]; + + n = rn_list->next; + + while (n != rn_list && route == NULL) { + rnet = list_entry(n, struct lnet_remotenet, + lrn_list); + + r = rnet->lrn_routes.next; + + while (r != &rnet->lrn_routes) { + struct lnet_route *re = + list_entry(r, struct lnet_route, + lr_list); + if (skip == 0) { + route = re; + break; + } + + skip--; + r = r->next; + } + + n = n->next; + } + } + + if (route != NULL) { + __u32 net = rnet->lrn_net; + __u32 hops = route->lr_hops; + unsigned int priority = route->lr_priority; + int alive = lnet_is_route_alive(route); + + s += scnprintf(s, tmpstr + tmpsiz - s, + "%-8s %4d %8u %7s %s\n", + libcfs_net2str(net), hops, + priority, + alive ? 
"up" : "down", + libcfs_nidstr(&route->lr_nid)); + LASSERT(tmpstr + tmpsiz - s > 0); + } + + lnet_net_unlock(0); + } + + len = s - tmpstr; /* how many bytes was written */ + + if (len > *lenp) { /* linux-supplied buffer is too small */ + rc = -EINVAL; + } else if (len > 0) { /* wrote something */ + if (copy_to_user(buffer, tmpstr, len)) + rc = -EFAULT; + else { + off += 1; + *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); + } + } + + LIBCFS_FREE(tmpstr, tmpsiz); + + if (rc == 0) + *lenp = len; + + return rc; +} + +static int +proc_lnet_routers(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int rc = 0; + char *tmpstr; + char *s; + const int tmpsiz = 256; + int len; + int ver; + int off; + + off = LNET_PROC_HOFF_GET(*ppos); + ver = LNET_PROC_VER_GET(*ppos); + + LASSERT(!write); + + if (*lenp == 0) + return 0; + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + if (*ppos == 0) { + s += scnprintf(s, tmpstr + tmpsiz - s, + "%-4s %7s %5s %s\n", + "ref", "rtr_ref", "alive", "router"); + LASSERT(tmpstr + tmpsiz - s > 0); + + lnet_net_lock(0); + ver = (unsigned int)the_lnet.ln_routers_version; + lnet_net_unlock(0); + *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); + } else { + struct list_head *r; + struct lnet_peer *peer = NULL; + int skip = off - 1; + + lnet_net_lock(0); + + if (ver != LNET_PROC_VERSION(the_lnet.ln_routers_version)) { + lnet_net_unlock(0); + + LIBCFS_FREE(tmpstr, tmpsiz); + return -ESTALE; + } + + r = the_lnet.ln_routers.next; + + while (r != &the_lnet.ln_routers) { + struct lnet_peer *lp = + list_entry(r, struct lnet_peer, + lp_rtr_list); + + if (skip == 0) { + peer = lp; + break; + } + + skip--; + r = r->next; + } + + if (peer != NULL) { + struct lnet_nid *nid = &peer->lp_primary_nid; + int nrefs = atomic_read(&peer->lp_refcount); + int nrtrrefs = peer->lp_rtr_refcount; + int alive = lnet_is_gateway_alive(peer); + + s += scnprintf(s, tmpstr + tmpsiz - s, + "%-4d %7d %5s %s\n", + nrefs, nrtrrefs, + alive ? "up" : "down", + libcfs_nidstr(nid)); + } + + lnet_net_unlock(0); + } + + len = s - tmpstr; /* how many bytes was written */ + + if (len > *lenp) { /* linux-supplied buffer is too small */ + rc = -EINVAL; + } else if (len > 0) { /* wrote something */ + if (copy_to_user(buffer, tmpstr, len)) + rc = -EFAULT; + else { + off += 1; + *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); + } + } + + LIBCFS_FREE(tmpstr, tmpsiz); + + if (rc == 0) + *lenp = len; + + return rc; +} + +/* TODO: there should be no direct access to ptable. 
We should add a set + * of APIs that give access to the ptable and its members */ +static int +proc_lnet_peers(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + const int tmpsiz = 256; + struct lnet_peer_table *ptable; + char *tmpstr = NULL; + char *s; + int cpt = LNET_PROC_CPT_GET(*ppos); + int ver = LNET_PROC_VER_GET(*ppos); + int hash = LNET_PROC_HASH_GET(*ppos); + int hoff = LNET_PROC_HOFF_GET(*ppos); + int rc = 0; + int len; + + if (write) { + int i; + struct lnet_peer_ni *peer; + + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + lnet_net_lock(i); + for (hash = 0; hash < LNET_PEER_HASH_SIZE; hash++) { + list_for_each_entry(peer, + &ptable->pt_hash[hash], + lpni_hashlist) { + peer->lpni_mintxcredits = + peer->lpni_txcredits; + peer->lpni_minrtrcredits = + peer->lpni_rtrcredits; + } + } + lnet_net_unlock(i); + } + *ppos += *lenp; + return 0; + } + + if (*lenp == 0) + return 0; + + BUILD_BUG_ON(LNET_PROC_HASH_BITS < LNET_PEER_HASH_BITS); + + if (cpt >= LNET_CPT_NUMBER) { + *lenp = 0; + return 0; + } + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + if (*ppos == 0) { + s += scnprintf(s, tmpstr + tmpsiz - s, + "%-24s %4s %5s %5s %5s %5s %5s %5s %5s %s\n", + "nid", "refs", "state", "last", "max", + "rtr", "min", "tx", "min", "queue"); + LASSERT(tmpstr + tmpsiz - s > 0); + + hoff++; + } else { + struct lnet_peer_ni *peer; + struct list_head *p; + int skip; + + again: + p = NULL; + peer = NULL; + skip = hoff - 1; + + lnet_net_lock(cpt); + ptable = the_lnet.ln_peer_tables[cpt]; + if (hoff == 1) + ver = LNET_PROC_VERSION(ptable->pt_version); + + if (ver != LNET_PROC_VERSION(ptable->pt_version)) { + lnet_net_unlock(cpt); + LIBCFS_FREE(tmpstr, tmpsiz); + return -ESTALE; + } + + while (hash < LNET_PEER_HASH_SIZE) { + if (p == NULL) + p = ptable->pt_hash[hash].next; + + while (p != &ptable->pt_hash[hash]) { + struct lnet_peer_ni *lp = + list_entry(p, struct lnet_peer_ni, + lpni_hashlist); + if (skip == 0) { + peer = lp; + + /* minor optimization: start from idx+1 + * on next iteration if we've just + * drained lpni_hashlist */ + if (lp->lpni_hashlist.next == + &ptable->pt_hash[hash]) { + hoff = 1; + hash++; + } else { + hoff++; + } + + break; + } + + skip--; + p = lp->lpni_hashlist.next; + } + + if (peer != NULL) + break; + + p = NULL; + hoff = 1; + hash++; + } + + if (peer != NULL) { + struct lnet_nid nid = peer->lpni_nid; + int nrefs = kref_read(&peer->lpni_kref); + time64_t lastalive = -1; + char *aliveness = "NA"; + int maxcr = (peer->lpni_net) ? + peer->lpni_net->net_tunables.lct_peer_tx_credits : 0; + int txcr = peer->lpni_txcredits; + int mintxcr = peer->lpni_mintxcredits; + int rtrcr = peer->lpni_rtrcredits; + int minrtrcr = peer->lpni_minrtrcredits; + int txqnob = peer->lpni_txqnob; + + if (lnet_isrouter(peer) || + lnet_peer_aliveness_enabled(peer)) + aliveness = lnet_is_peer_ni_alive(peer) ? 
+ "up" : "down"; + + lnet_net_unlock(cpt); + + s += scnprintf(s, tmpstr + tmpsiz - s, + "%-24s %4d %5s %5lld %5d %5d %5d %5d %5d %d\n", + libcfs_nidstr(&nid), nrefs, aliveness, + lastalive, maxcr, rtrcr, minrtrcr, txcr, + mintxcr, txqnob); + LASSERT(tmpstr + tmpsiz - s > 0); + + } else { /* peer is NULL */ + lnet_net_unlock(cpt); + } + + if (hash == LNET_PEER_HASH_SIZE) { + cpt++; + hash = 0; + hoff = 1; + if (peer == NULL && cpt < LNET_CPT_NUMBER) + goto again; + } + } + + len = s - tmpstr; /* how many bytes was written */ + + if (len > *lenp) { /* linux-supplied buffer is too small */ + rc = -EINVAL; + } else if (len > 0) { /* wrote something */ + if (copy_to_user(buffer, tmpstr, len)) + rc = -EFAULT; + else + *ppos = LNET_PROC_POS_MAKE(cpt, ver, hash, hoff); + } + + LIBCFS_FREE(tmpstr, tmpsiz); + + if (rc == 0) + *lenp = len; + + return rc; +} + +static int proc_lnet_buffers(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + size_t nob = *lenp; + loff_t pos = *ppos; + char *s; + char *tmpstr; + int tmpsiz; + int idx; + int len; + int rc; + int i; + + LASSERT(!write); + + /* (4 %d) * 4 * LNET_CPT_NUMBER */ + tmpsiz = 64 * (LNET_NRBPOOLS + 1) * LNET_CPT_NUMBER; + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + s += scnprintf(s, tmpstr + tmpsiz - s, + "%5s %5s %7s %7s\n", + "pages", "count", "credits", "min"); + LASSERT(tmpstr + tmpsiz - s > 0); + + if (the_lnet.ln_rtrpools == NULL) + goto out; /* I'm not a router */ + + for (idx = 0; idx < LNET_NRBPOOLS; idx++) { + struct lnet_rtrbufpool *rbp; + + lnet_net_lock(LNET_LOCK_EX); + cfs_percpt_for_each(rbp, i, the_lnet.ln_rtrpools) { + s += scnprintf(s, tmpstr + tmpsiz - s, + "%5d %5d %7d %7d\n", + rbp[idx].rbp_npages, + rbp[idx].rbp_nbuffers, + rbp[idx].rbp_credits, + rbp[idx].rbp_mincredits); + LASSERT(tmpstr + tmpsiz - s > 0); + } + lnet_net_unlock(LNET_LOCK_EX); + } + + out: + len = s - tmpstr; + + if (pos >= min_t(int, len, strlen(tmpstr))) + rc = 0; + else + rc = cfs_trace_copyout_string(buffer, nob, + tmpstr + pos, NULL); + + LIBCFS_FREE(tmpstr, tmpsiz); + return rc; +} + +static int +proc_lnet_nis(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int tmpsiz = 128 * LNET_CPT_NUMBER; + int rc = 0; + char *tmpstr; + char *s; + int len; + + if (*lenp == 0) + return 0; + + if (write) { + /* Just reset the min stat. 
*/ + struct lnet_ni *ni; + struct lnet_net *net; + + lnet_net_lock(0); + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + struct lnet_tx_queue *tq; + int i; + int j; + + cfs_percpt_for_each(tq, i, ni->ni_tx_queues) { + for (j = 0; ni->ni_cpts != NULL && + j < ni->ni_ncpts; j++) { + if (i == ni->ni_cpts[j]) + break; + } + + if (j == ni->ni_ncpts) + continue; + + if (i != 0) + lnet_net_lock(i); + tq->tq_credits_min = tq->tq_credits; + if (i != 0) + lnet_net_unlock(i); + } + } + } + lnet_net_unlock(0); + *ppos += *lenp; + return 0; + } + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + if (*ppos == 0) { + s += scnprintf(s, tmpstr + tmpsiz - s, + "%-24s %6s %5s %4s %4s %4s %5s %5s %5s\n", + "nid", "status", "alive", "refs", "peer", + "rtr", "max", "tx", "min"); + LASSERT (tmpstr + tmpsiz - s > 0); + } else { + struct lnet_ni *ni = NULL; + int skip = *ppos - 1; + + lnet_net_lock(0); + + ni = lnet_get_ni_idx_locked(skip); + + if (ni != NULL) { + struct lnet_tx_queue *tq; + char *stat; + time64_t now = ktime_get_seconds(); + time64_t last_alive = -1; + int i; + int j; + + if (the_lnet.ln_routing) + last_alive = now - ni->ni_net->net_last_alive; + + lnet_ni_lock(ni); + LASSERT(ni->ni_status != NULL); + stat = (lnet_ni_get_status_locked(ni) == + LNET_NI_STATUS_UP) ? "up" : "down"; + lnet_ni_unlock(ni); + + /* @lo forever alive */ + if (ni->ni_net->net_lnd->lnd_type == LOLND) { + last_alive = 0; + stat = "up"; + } + + /* we actually output credits information for + * TX queue of each partition */ + cfs_percpt_for_each(tq, i, ni->ni_tx_queues) { + for (j = 0; ni->ni_cpts != NULL && + j < ni->ni_ncpts; j++) { + if (i == ni->ni_cpts[j]) + break; + } + + if (j == ni->ni_ncpts) + continue; + + if (i != 0) + lnet_net_lock(i); + + s += scnprintf(s, tmpstr + tmpsiz - s, + "%-24s %6s %5lld %4d %4d %4d %5d %5d %5d\n", + libcfs_nidstr(&ni->ni_nid), stat, + last_alive, *ni->ni_refs[i], + ni->ni_net->net_tunables.lct_peer_tx_credits, + ni->ni_net->net_tunables.lct_peer_rtr_credits, + tq->tq_credits_max, + tq->tq_credits, tq->tq_credits_min); + if (i != 0) + lnet_net_unlock(i); + } + LASSERT(tmpstr + tmpsiz - s > 0); + } + + lnet_net_unlock(0); + } + + len = s - tmpstr; /* how many bytes was written */ + + if (len > *lenp) { /* linux-supplied buffer is too small */ + rc = -EINVAL; + } else if (len > 0) { /* wrote something */ + if (copy_to_user(buffer, tmpstr, len)) + rc = -EFAULT; + else + *ppos += 1; + } + + LIBCFS_FREE(tmpstr, tmpsiz); + + if (rc == 0) + *lenp = len; + + return rc; +} + +struct lnet_portal_rotors { + int pr_value; + const char *pr_name; + const char *pr_desc; +}; + +static struct lnet_portal_rotors portal_rotors[] = { + { + .pr_value = LNET_PTL_ROTOR_OFF, + .pr_name = "OFF", + .pr_desc = "Turn off message rotor for wildcard portals" + }, + { + .pr_value = LNET_PTL_ROTOR_ON, + .pr_name = "ON", + .pr_desc = "round-robin dispatch all PUT messages for " + "wildcard portals" + }, + { + .pr_value = LNET_PTL_ROTOR_RR_RT, + .pr_name = "RR_RT", + .pr_desc = "round-robin dispatch routed PUT message for " + "wildcard portals" + }, + { + .pr_value = LNET_PTL_ROTOR_HASH_RT, + .pr_name = "HASH_RT", + .pr_desc = "dispatch routed PUT message by hashing source " + "NID for wildcard portals" + }, + { + .pr_value = -1, + .pr_name = NULL, + .pr_desc = NULL + }, +}; + +static int proc_lnet_portal_rotor(struct ctl_table *table, int write, + void __user 
*buffer, size_t *lenp, + loff_t *ppos) +{ + const int buf_len = 128; + size_t nob = *lenp; + loff_t pos = *ppos; + char *buf; + char *tmp; + int rc; + int i; + + if (!write) { + LIBCFS_ALLOC(buf, buf_len); + if (buf == NULL) + return -ENOMEM; + + lnet_res_lock(0); + + for (i = 0; portal_rotors[i].pr_value >= 0; i++) { + if (portal_rotors[i].pr_value == portal_rotor) + break; + } + + LASSERT(portal_rotors[i].pr_value == portal_rotor); + lnet_res_unlock(0); + + rc = scnprintf(buf, buf_len, + "{\n\tportals: all\n" + "\trotor: %s\n\tdescription: %s\n}", + portal_rotors[i].pr_name, + portal_rotors[i].pr_desc); + + if (pos >= min_t(int, rc, buf_len)) { + rc = 0; + } else { + rc = cfs_trace_copyout_string(buffer, nob, + buf + pos, "\n"); + } + LIBCFS_FREE(buf, buf_len); + + return rc; + } + + buf = memdup_user_nul(buffer, nob); + if (IS_ERR(buf)) + return PTR_ERR(buf); + + tmp = strim(buf); + + rc = -EINVAL; + lnet_res_lock(0); + for (i = 0; portal_rotors[i].pr_name != NULL; i++) { + if (strncasecmp(portal_rotors[i].pr_name, tmp, + strlen(portal_rotors[i].pr_name)) == 0) { + portal_rotor = portal_rotors[i].pr_value; + rc = 0; + break; + } + } + lnet_res_unlock(0); + kfree(buf); + + return rc; +} + +static struct ctl_table lnet_table[] = { + /* + * NB No .strategy entries have been provided since sysctl(8) prefers + * to go via /proc for portability. + */ + { + .procname = "stats", + .mode = 0644, + .proc_handler = &proc_lnet_stats, + }, + { + .procname = "routes", + .mode = 0444, + .proc_handler = &proc_lnet_routes, + }, + { + .procname = "routers", + .mode = 0444, + .proc_handler = &proc_lnet_routers, + }, + { + .procname = "peers", + .mode = 0644, + .proc_handler = &proc_lnet_peers, + }, + { + .procname = "buffers", + .mode = 0444, + .proc_handler = &proc_lnet_buffers, + }, + { + .procname = "nis", + .mode = 0644, + .proc_handler = &proc_lnet_nis, + }, + { + .procname = "portal_rotor", + .mode = 0644, + .proc_handler = &proc_lnet_portal_rotor, + }, + { + .procname = "lnet_lnd_timeout", + .data = &lnet_lnd_timeout, + .maxlen = sizeof(lnet_lnd_timeout), + .mode = 0444, + .proc_handler = &debugfs_doint, + }, + { .procname = NULL } +}; + +void lnet_router_debugfs_init(void) +{ + lnet_insert_debugfs(lnet_table); +} + +void lnet_router_debugfs_fini(void) +{ + lnet_remove_debugfs(lnet_table); +} diff --git a/drivers/staging/lustrefsx/lnet/lnet/udsp.c b/drivers/staging/lustrefsx/lnet/lnet/udsp.c new file mode 100644 index 0000000000000..08c1a7fcccc0d --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/udsp.c @@ -0,0 +1,1557 @@ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2017, Intel Corporation. + * + * Copyright (c) 2018-2020 Data Direct Networks. + * + * This file is part of Lustre, https://wiki.whamcloud.com/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * lnet/lnet/udsp.c
+ *
+ * User Defined Selection Policies (UDSP) are introduced to add the
+ * ability to finely control traffic. The policies are instantiated
+ * on LNet constructs and allow preferring some constructs
+ * over others as an extension of the selection algorithm.
+ * The order of operation is defined by the selection algorithm logical flow:
+ *
+ * 1. Iterate over all the networks that a peer can be reached on
+ *    and select the best local network
+ *    - The remote network with the highest priority is examined
+ *      (Network Rule)
+ *    - The local network with the highest priority is selected
+ *      (Network Rule)
+ *    - The local NI with the highest priority is selected
+ *      (NID Rule)
+ * 2. If the peer is a remote peer and has no local networks,
+ *    - then select the remote peer network with the highest priority
+ *      (Network Rule)
+ *    - Select the highest priority remote peer_ni on the network selected
+ *      (NID Rule)
+ *    - Now that the peer's network and NI are decided, select the router
+ *      in round robin from the peer NI's preferred router list.
+ *      (Router Rule)
+ *    - Select the highest priority local NI on the local net of the
+ *      selected route.
+ *      (NID Rule)
+ * 3. Otherwise for local peers, select the peer_ni from the peer.
+ *    - The highest priority peer NI is selected
+ *      (NID Rule)
+ *    - Select the peer NI which has the local NI selected on its
+ *      preferred list.
+ *      (NID Pair Rule)
+ *
+ * Accordingly, the User Interface allows for the following:
+ *  - Adding a local network udsp: if multiple local networks are
+ *    available, each one can have a priority.
+ *  - Adding a local NID udsp: after a local network is chosen,
+ *    if there are multiple NIs, each one can have a priority.
+ *  - Adding a remote NID udsp: assign a priority to a peer NID.
+ *  - Adding a NID pair udsp: allows specifying local NIDs
+ *    to be added to the list on the specified peer NIs.
+ *    When selecting a peer NI, the one with the
+ *    local NID being used on its list is preferred.
+ *  - Adding a Router udsp: similar to the NID pair udsp.
+ *    Specified router NIDs are added to the list on the specified peer NIs.
+ *    When sending to a remote peer, the remote net is selected and the peer
+ *    NID is selected. The router which has its NID on the peer NI's list
+ *    is preferred.
+ *  - Deleting a udsp: use the specified policy index to remove it
+ *    from the policy list.
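+ *
+ * As an illustrative example (the option syntax is described below,
+ * and the exact client invocation may differ), a site with both an
+ * o2ib and a tcp local network could prefer the o2ib side by giving
+ * it the smaller priority value:
+ *   lnetctl policy add --src o2ib --priority 0
+ *   lnetctl policy add --src tcp --priority 1
+ * since a lower numeric priority is preferred by the selection
+ * algorithm, and -1 (the default) means no preference.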
+ * + * Generally, the syntax is as follows + * lnetctl policy + * --src: ip2nets syntax specifying the local NID to match + * --dst: ip2nets syntax specifying the remote NID to match + * --rte: ip2nets syntax specifying the router NID to match + * --priority: Priority to apply to rule matches + * --idx: Index of where to insert or delete the rule + * By default add appends to the end of the rule list + * + * Author: Amir Shehata + */ + +#include + +#include +#include + +struct udsp_info { + struct lnet_peer_ni *udi_lpni; + struct lnet_peer_net *udi_lpn; + struct lnet_ni *udi_ni; + struct lnet_net *udi_net; + struct lnet_ud_nid_descr *udi_match; + struct lnet_ud_nid_descr *udi_action; + __u32 udi_priority; + enum lnet_udsp_action_type udi_type; + bool udi_local; + bool udi_revert; +}; + +typedef int (*udsp_apply_rule)(struct udsp_info *); + +enum udsp_apply { + UDSP_APPLY_ON_PEERS = 0, + UDSP_APPLY_PRIO_ON_NIS = 1, + UDSP_APPLY_RTE_ON_NETS = 2, + UDSP_APPLY_MAX_ENUM = 3, +}; + +#define RULE_NOT_APPLICABLE -1 + +static inline bool +lnet_udsp_is_net_rule(struct lnet_ud_nid_descr *match) +{ + return list_empty(&match->ud_addr_range); +} + +static bool +lnet_udsp_expr_list_equal(struct list_head *e1, + struct list_head *e2) +{ + struct cfs_expr_list *expr1; + struct cfs_expr_list *expr2; + struct cfs_range_expr *range1, *range2; + + if (list_empty(e1) && list_empty(e2)) + return true; + + if (lnet_get_list_len(e1) != lnet_get_list_len(e2)) + return false; + + expr2 = list_first_entry(e2, struct cfs_expr_list, el_link); + + list_for_each_entry(expr1, e1, el_link) { + if (lnet_get_list_len(&expr1->el_exprs) != + lnet_get_list_len(&expr2->el_exprs)) + return false; + + range2 = list_first_entry(&expr2->el_exprs, + struct cfs_range_expr, + re_link); + + list_for_each_entry(range1, &expr1->el_exprs, re_link) { + if (range1->re_lo != range2->re_lo || + range1->re_hi != range2->re_hi || + range1->re_stride != range2->re_stride) + return false; + range2 = list_next_entry(range2, re_link); + } + expr2 = list_next_entry(expr2, el_link); + } + + return true; +} + +static bool +lnet_udsp_nid_descr_equal(struct lnet_ud_nid_descr *e1, + struct lnet_ud_nid_descr *e2) +{ + if (e1->ud_net_id.udn_net_type != e2->ud_net_id.udn_net_type || + !lnet_udsp_expr_list_equal(&e1->ud_net_id.udn_net_num_range, + &e2->ud_net_id.udn_net_num_range) || + !lnet_udsp_expr_list_equal(&e1->ud_addr_range, &e2->ud_addr_range)) + return false; + + return true; +} + +static bool +lnet_udsp_action_equal(struct lnet_udsp *e1, struct lnet_udsp *e2) +{ + if (e1->udsp_action_type != e2->udsp_action_type) + return false; + + if (e1->udsp_action_type == EN_LNET_UDSP_ACTION_PRIORITY && + e1->udsp_action.udsp_priority != e2->udsp_action.udsp_priority) + return false; + + return true; +} + +static bool +lnet_udsp_equal(struct lnet_udsp *e1, struct lnet_udsp *e2) +{ + /* check each NID descr */ + if (!lnet_udsp_nid_descr_equal(&e1->udsp_src, &e2->udsp_src) || + !lnet_udsp_nid_descr_equal(&e1->udsp_dst, &e2->udsp_dst) || + !lnet_udsp_nid_descr_equal(&e1->udsp_rte, &e2->udsp_rte)) + return false; + + return true; +} + +/* it is enough to look at the net type of the descriptor. 
If the criteria + * is present the net must be specified + */ +static inline bool +lnet_udsp_criteria_present(struct lnet_ud_nid_descr *descr) +{ + return (descr->ud_net_id.udn_net_type != 0); +} + +static int +lnet_udsp_apply_rule_on_ni(struct udsp_info *udi) +{ + int rc; + struct lnet_ni *ni = udi->udi_ni; + struct lnet_ud_nid_descr *ni_match = udi->udi_match; + __u32 priority = (udi->udi_revert) ? -1 : udi->udi_priority; + + rc = cfs_match_nid_net( + &ni->ni_nid, + ni_match->ud_net_id.udn_net_type, + &ni_match->ud_net_id.udn_net_num_range, + &ni_match->ud_addr_range); + if (!rc) + return 0; + + CDEBUG(D_NET, "apply udsp on ni %s\n", + libcfs_nidstr(&ni->ni_nid)); + + /* Detected match. Set NIDs priority */ + lnet_ni_set_sel_priority_locked(ni, priority); + + return 0; +} + +static int +lnet_udsp_apply_rte_list_on_net(struct lnet_net *net, + struct lnet_ud_nid_descr *rte_action, + bool revert) +{ + struct lnet_remotenet *rnet; + struct list_head *rn_list; + struct lnet_route *route; + struct lnet_peer_ni *lpni; + bool cleared = false; + struct lnet_nid *gw_nid, *gw_prim_nid; + int rc = 0; + int i; + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) { + rn_list = &the_lnet.ln_remote_nets_hash[i]; + list_for_each_entry(rnet, rn_list, lrn_list) { + list_for_each_entry(route, &rnet->lrn_routes, lr_list) { + /* look if gw nid on the same net matches */ + gw_prim_nid = + &route->lr_gateway->lp_primary_nid; + lpni = NULL; + while ((lpni = lnet_get_next_peer_ni_locked(route->lr_gateway, + NULL, + lpni)) != NULL) { + if (!lnet_get_net_locked(lpni->lpni_peer_net->lpn_net_id)) + continue; + gw_nid = &lpni->lpni_nid; + rc = cfs_match_nid_net( + gw_nid, + rte_action->ud_net_id.udn_net_type, + &rte_action->ud_net_id.udn_net_num_range, + &rte_action->ud_addr_range); + if (rc) + break; + } + /* match gw primary nid on a remote network */ + if (!rc) { + gw_nid = gw_prim_nid; + rc = cfs_match_nid_net( + gw_nid, + rte_action->ud_net_id.udn_net_type, + &rte_action->ud_net_id.udn_net_num_range, + &rte_action->ud_addr_range); + } + if (!rc) + continue; + lnet_net_unlock(LNET_LOCK_EX); + if (!cleared || revert) { + lnet_net_clr_pref_rtrs(net); + cleared = true; + if (revert) { + lnet_net_lock(LNET_LOCK_EX); + continue; + } + } + /* match. 
Add to pref NIDs */ + CDEBUG(D_NET, "udsp net->gw: %s->%s\n", + libcfs_net2str(net->net_id), + libcfs_nidstr(gw_prim_nid)); + rc = lnet_net_add_pref_rtr(net, gw_prim_nid); + lnet_net_lock(LNET_LOCK_EX); + /* success if EEXIST return */ + if (rc && rc != -EEXIST) { + CERROR("Failed to add %s to %s pref rtr list\n", + libcfs_nidstr(gw_prim_nid), + libcfs_net2str(net->net_id)); + return rc; + } + } + } + } + + return rc; +} + +static int +lnet_udsp_apply_rte_rule_on_nets(struct udsp_info *udi) +{ + int rc = 0; + int last_failure = 0; + struct lnet_net *net; + struct lnet_ud_nid_descr *match = udi->udi_match; + struct lnet_ud_nid_descr *rte_action = udi->udi_action; + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + if (LNET_NETTYP(net->net_id) != match->ud_net_id.udn_net_type) + continue; + + rc = cfs_match_net(net->net_id, + match->ud_net_id.udn_net_type, + &match->ud_net_id.udn_net_num_range); + if (!rc) + continue; + + CDEBUG(D_NET, "apply rule on %s\n", + libcfs_net2str(net->net_id)); + rc = lnet_udsp_apply_rte_list_on_net(net, rte_action, + udi->udi_revert); + if (rc) + last_failure = rc; + } + + return last_failure; +} + +static int +lnet_udsp_apply_rte_rule_on_net(struct udsp_info *udi) +{ + int rc = 0; + struct lnet_net *net = udi->udi_net; + struct lnet_ud_nid_descr *match = udi->udi_match; + struct lnet_ud_nid_descr *rte_action = udi->udi_action; + + rc = cfs_match_net(net->net_id, + match->ud_net_id.udn_net_type, + &match->ud_net_id.udn_net_num_range); + if (!rc) + return 0; + + CDEBUG(D_NET, "apply rule on %s\n", + libcfs_net2str(net->net_id)); + rc = lnet_udsp_apply_rte_list_on_net(net, rte_action, + udi->udi_revert); + + return rc; +} + +static int +lnet_udsp_apply_prio_rule_on_net(struct udsp_info *udi) +{ + int rc; + struct lnet_ud_nid_descr *match = udi->udi_match; + struct lnet_net *net = udi->udi_net; + __u32 priority = (udi->udi_revert) ? 
-1 : udi->udi_priority; + + if (!lnet_udsp_is_net_rule(match)) + return RULE_NOT_APPLICABLE; + + rc = cfs_match_net(net->net_id, + match->ud_net_id.udn_net_type, + &match->ud_net_id.udn_net_num_range); + if (!rc) + return 0; + + CDEBUG(D_NET, "apply rule on %s\n", + libcfs_net2str(net->net_id)); + + lnet_net_set_sel_priority_locked(net, priority); + + return 0; +} + +static int +lnet_udsp_apply_rule_on_nis(struct udsp_info *udi) +{ + int rc = 0; + struct lnet_ni *ni; + struct lnet_net *net; + struct lnet_ud_nid_descr *ni_match = udi->udi_match; + int last_failure = 0; + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + if (LNET_NETTYP(net->net_id) != ni_match->ud_net_id.udn_net_type) + continue; + + udi->udi_net = net; + if (!lnet_udsp_apply_prio_rule_on_net(udi)) + continue; + + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + udi->udi_ni = ni; + rc = lnet_udsp_apply_rule_on_ni(udi); + if (rc) + last_failure = rc; + } + } + + return last_failure; +} + +static int +lnet_udsp_apply_rte_list_on_lpni(struct lnet_peer_ni *lpni, + struct lnet_ud_nid_descr *rte_action, + bool revert) +{ + struct lnet_remotenet *rnet; + struct list_head *rn_list; + struct lnet_route *route; + bool cleared = false; + struct lnet_nid *gw_nid; + int rc = 0; + int i; + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) { + rn_list = &the_lnet.ln_remote_nets_hash[i]; + list_for_each_entry(rnet, rn_list, lrn_list) { + list_for_each_entry(route, &rnet->lrn_routes, lr_list) { + gw_nid = &route->lr_gateway->lp_primary_nid; + rc = cfs_match_nid_net( + gw_nid, + rte_action->ud_net_id.udn_net_type, + &rte_action->ud_net_id.udn_net_num_range, + &rte_action->ud_addr_range); + if (!rc) + continue; + lnet_net_unlock(LNET_LOCK_EX); + if (!cleared || revert) { + CDEBUG(D_NET, "%spref rtr nids from lpni %s\n", + (revert) ? "revert " : "clear ", + libcfs_nidstr(&lpni->lpni_nid)); + lnet_peer_clr_pref_rtrs(lpni); + cleared = true; + if (revert) { + lnet_net_lock(LNET_LOCK_EX); + continue; + } + } + CDEBUG(D_NET, "add gw nid %s as preferred for peer %s\n", + libcfs_nidstr(gw_nid), + libcfs_nidstr(&lpni->lpni_nid)); + /* match. Add to pref NIDs */ + rc = lnet_peer_add_pref_rtr(lpni, gw_nid); + lnet_net_lock(LNET_LOCK_EX); + /* success if EEXIST return */ + if (rc && rc != -EEXIST) { + CERROR("Failed to add %s to %s pref rtr list\n", + libcfs_nidstr(gw_nid), + libcfs_nidstr(&lpni->lpni_nid)); + return rc; + } + } + } + } + + return rc; +} + +static int +lnet_udsp_apply_ni_list(struct lnet_peer_ni *lpni, + struct lnet_ud_nid_descr *ni_action, + bool revert) +{ + int rc = 0; + struct lnet_ni *ni; + struct lnet_net *net; + bool cleared = false; + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + if (LNET_NETTYP(net->net_id) != ni_action->ud_net_id.udn_net_type) + continue; + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + rc = cfs_match_nid_net( + &ni->ni_nid, + ni_action->ud_net_id.udn_net_type, + &ni_action->ud_net_id.udn_net_num_range, + &ni_action->ud_addr_range); + if (!rc) + continue; + lnet_net_unlock(LNET_LOCK_EX); + if (!cleared || revert) { + lnet_peer_clr_pref_nids(lpni); + CDEBUG(D_NET, "%spref nids from lpni %s\n", + (revert) ? "revert " : "clear ", + libcfs_nidstr(&lpni->lpni_nid)); + cleared = true; + if (revert) { + lnet_net_lock(LNET_LOCK_EX); + continue; + } + } + CDEBUG(D_NET, "add nid %s as preferred for peer %s\n", + libcfs_nidstr(&ni->ni_nid), + libcfs_nidstr(&lpni->lpni_nid)); + /* match. 
Add to pref NIDs */ + rc = lnet_peer_add_pref_nid(lpni, &ni->ni_nid); + lnet_net_lock(LNET_LOCK_EX); + /* success if EEXIST return */ + if (rc && rc != -EEXIST) { + CERROR("Failed to add %s to %s pref nid list\n", + libcfs_nidstr(&ni->ni_nid), + libcfs_nidstr(&lpni->lpni_nid)); + return rc; + } + } + } + + return rc; +} + +static int +lnet_udsp_apply_rule_on_lpni(struct udsp_info *udi) +{ + int rc; + struct lnet_peer_ni *lpni = udi->udi_lpni; + struct lnet_ud_nid_descr *lp_match = udi->udi_match; + struct lnet_ud_nid_descr *action = udi->udi_action; + __u32 priority = (udi->udi_revert) ? -1 : udi->udi_priority; + bool local = udi->udi_local; + enum lnet_udsp_action_type type = udi->udi_type; + + rc = cfs_match_nid_net( + &lpni->lpni_nid, + lp_match->ud_net_id.udn_net_type, + &lp_match->ud_net_id.udn_net_num_range, + &lp_match->ud_addr_range); + + /* check if looking for a net match */ + if (!rc && + (lnet_get_list_len(&lp_match->ud_addr_range) || + !cfs_match_net(udi->udi_lpn->lpn_net_id, + lp_match->ud_net_id.udn_net_type, + &lp_match->ud_net_id.udn_net_num_range))) { + return 0; + } + + if (type == EN_LNET_UDSP_ACTION_PREFERRED_LIST && local) { + rc = lnet_udsp_apply_ni_list(lpni, action, + udi->udi_revert); + if (rc) + return rc; + } else if (type == EN_LNET_UDSP_ACTION_PREFERRED_LIST && + !local) { + rc = lnet_udsp_apply_rte_list_on_lpni(lpni, action, + udi->udi_revert); + if (rc) + return rc; + } else { + lnet_peer_ni_set_selection_priority(lpni, priority); + } + + return 0; +} + +static int +lnet_udsp_apply_rule_on_lpn(struct udsp_info *udi) +{ + int rc; + struct lnet_ud_nid_descr *match = udi->udi_match; + struct lnet_peer_net *lpn = udi->udi_lpn; + __u32 priority = (udi->udi_revert) ? -1 : udi->udi_priority; + + if (udi->udi_type == EN_LNET_UDSP_ACTION_PREFERRED_LIST || + !lnet_udsp_is_net_rule(match)) + return RULE_NOT_APPLICABLE; + + rc = cfs_match_net(lpn->lpn_net_id, + match->ud_net_id.udn_net_type, + &match->ud_net_id.udn_net_num_range); + if (!rc) + return 0; + + CDEBUG(D_NET, "apply rule on lpn %s\n", + libcfs_net2str(lpn->lpn_net_id)); + lnet_peer_net_set_sel_priority_locked(lpn, priority); + + return 0; +} + +static int +lnet_udsp_apply_rule_on_lpnis(struct udsp_info *udi) +{ + /* iterate over all the peers in the system and find if any of the + * peers match the criteria. 
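The walk descends from peer to peer net to peer NI, since a rule can match at either the network or the NID level.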
If they do, clear the preferred list + * and add the new list + */ + int lncpt = cfs_percpt_number(the_lnet.ln_peer_tables); + struct lnet_ud_nid_descr *lp_match = udi->udi_match; + struct lnet_peer_table *ptable; + struct lnet_peer_net *lpn; + struct lnet_peer_ni *lpni; + struct lnet_peer *lp; + int last_failure = 0; + int cpt; + int rc; + + for (cpt = 0; cpt < lncpt; cpt++) { + ptable = the_lnet.ln_peer_tables[cpt]; + list_for_each_entry(lp, &ptable->pt_peer_list, lp_peer_list) { + CDEBUG(D_NET, "udsp examining lp %s\n", + libcfs_nidstr(&lp->lp_primary_nid)); + list_for_each_entry(lpn, + &lp->lp_peer_nets, + lpn_peer_nets) { + CDEBUG(D_NET, "udsp examining lpn %s\n", + libcfs_net2str(lpn->lpn_net_id)); + + if (LNET_NETTYP(lpn->lpn_net_id) != + lp_match->ud_net_id.udn_net_type) + continue; + + udi->udi_lpn = lpn; + + if (!lnet_udsp_apply_rule_on_lpn(udi)) + continue; + + list_for_each_entry(lpni, + &lpn->lpn_peer_nis, + lpni_peer_nis) { + CDEBUG(D_NET, "udsp examining lpni %s\n", + libcfs_nidstr(&lpni->lpni_nid)); + udi->udi_lpni = lpni; + rc = lnet_udsp_apply_rule_on_lpni(udi); + if (rc) + last_failure = rc; + } + } + } + } + + return last_failure; +} + +static int +lnet_udsp_apply_single_policy(struct lnet_udsp *udsp, struct udsp_info *udi, + udsp_apply_rule *cbs) +{ + int rc; + + if (lnet_udsp_criteria_present(&udsp->udsp_dst) && + lnet_udsp_criteria_present(&udsp->udsp_src)) { + /* NID Pair rule */ + if (!cbs[UDSP_APPLY_ON_PEERS]) + return 0; + + if (udsp->udsp_action_type != + EN_LNET_UDSP_ACTION_PREFERRED_LIST) { + CERROR("Bad action type. Expected %d got %d\n", + EN_LNET_UDSP_ACTION_PREFERRED_LIST, + udsp->udsp_action_type); + return 0; + } + udi->udi_match = &udsp->udsp_dst; + udi->udi_action = &udsp->udsp_src; + udi->udi_type = EN_LNET_UDSP_ACTION_PREFERRED_LIST; + udi->udi_local = true; + + CDEBUG(D_NET, "applying udsp (%p) dst->src\n", + udsp); + rc = cbs[UDSP_APPLY_ON_PEERS](udi); + if (rc) + return rc; + } else if (lnet_udsp_criteria_present(&udsp->udsp_dst) && + lnet_udsp_criteria_present(&udsp->udsp_rte)) { + /* Router rule */ + if (!cbs[UDSP_APPLY_ON_PEERS]) + return 0; + + if (udsp->udsp_action_type != + EN_LNET_UDSP_ACTION_PREFERRED_LIST) { + CERROR("Bad action type. Expected %d got %d\n", + EN_LNET_UDSP_ACTION_PREFERRED_LIST, + udsp->udsp_action_type); + return 0; + } + + if (lnet_udsp_criteria_present(&udsp->udsp_src)) { + CERROR("only one of src or dst can be specified\n"); + return 0; + } + udi->udi_match = &udsp->udsp_dst; + udi->udi_action = &udsp->udsp_rte; + udi->udi_type = EN_LNET_UDSP_ACTION_PREFERRED_LIST; + udi->udi_local = false; + + CDEBUG(D_NET, "applying udsp (%p) dst->rte\n", + udsp); + rc = cbs[UDSP_APPLY_ON_PEERS](udi); + if (rc) + return rc; + } else if (lnet_udsp_criteria_present(&udsp->udsp_dst)) { + /* destination priority rule */ + if (!cbs[UDSP_APPLY_ON_PEERS]) + return 0; + + if (udsp->udsp_action_type != + EN_LNET_UDSP_ACTION_PRIORITY) { + CERROR("Bad action type. 
Expected %d got %d\n", + EN_LNET_UDSP_ACTION_PRIORITY, + udsp->udsp_action_type); + return 0; + } + udi->udi_match = &udsp->udsp_dst; + udi->udi_type = EN_LNET_UDSP_ACTION_PRIORITY; + if (udsp->udsp_action_type != + EN_LNET_UDSP_ACTION_PRIORITY) { + udi->udi_priority = 0; + } else { + udi->udi_priority = udsp->udsp_action.udsp_priority; + } + udi->udi_local = true; + + CDEBUG(D_NET, "applying udsp (%p) on destination\n", + udsp); + rc = cbs[UDSP_APPLY_ON_PEERS](udi); + if (rc) + return rc; + } else if (lnet_udsp_criteria_present(&udsp->udsp_src)) { + /* source priority rule */ + if (!cbs[UDSP_APPLY_PRIO_ON_NIS]) + return 0; + + if (udsp->udsp_action_type != + EN_LNET_UDSP_ACTION_PRIORITY) { + CERROR("Bad action type. Expected %d got %d\n", + EN_LNET_UDSP_ACTION_PRIORITY, + udsp->udsp_action_type); + return 0; + } + udi->udi_match = &udsp->udsp_src; + udi->udi_type = EN_LNET_UDSP_ACTION_PRIORITY; + if (udsp->udsp_action_type != + EN_LNET_UDSP_ACTION_PRIORITY) { + udi->udi_priority = 0; + } else { + udi->udi_priority = udsp->udsp_action.udsp_priority; + } + udi->udi_local = true; + + CDEBUG(D_NET, "applying udsp (%p) on source\n", + udsp); + rc = cbs[UDSP_APPLY_PRIO_ON_NIS](udi); + } else { + CERROR("Bad UDSP policy\n"); + return 0; + } + + return 0; +} + +static int +lnet_udsp_apply_policies_helper(struct lnet_udsp *udsp, struct udsp_info *udi, + udsp_apply_rule *cbs) +{ + int rc; + int last_failure = 0; + + if (udsp) + return lnet_udsp_apply_single_policy(udsp, udi, cbs); + + list_for_each_entry_reverse(udsp, + &the_lnet.ln_udsp_list, + udsp_on_list) { + rc = lnet_udsp_apply_single_policy(udsp, udi, cbs); + if (rc) + last_failure = rc; + } + + return last_failure; +} + +int +lnet_udsp_apply_policies_on_ni(struct lnet_ni *ni) +{ + struct udsp_info udi; + udsp_apply_rule cbs[UDSP_APPLY_MAX_ENUM] = {NULL}; + + memset(&udi, 0, sizeof(udi)); + + udi.udi_ni = ni; + + cbs[UDSP_APPLY_PRIO_ON_NIS] = lnet_udsp_apply_rule_on_ni; + + return lnet_udsp_apply_policies_helper(NULL, &udi, cbs); +} + +int +lnet_udsp_apply_policies_on_net(struct lnet_net *net) +{ + struct udsp_info udi; + udsp_apply_rule cbs[UDSP_APPLY_MAX_ENUM] = {NULL}; + + memset(&udi, 0, sizeof(udi)); + + udi.udi_net = net; + + cbs[UDSP_APPLY_PRIO_ON_NIS] = lnet_udsp_apply_prio_rule_on_net; + cbs[UDSP_APPLY_RTE_ON_NETS] = lnet_udsp_apply_rte_rule_on_net; + + return lnet_udsp_apply_policies_helper(NULL, &udi, cbs); +} + +int +lnet_udsp_apply_policies_on_lpni(struct lnet_peer_ni *lpni) +{ + struct udsp_info udi; + udsp_apply_rule cbs[UDSP_APPLY_MAX_ENUM] = {NULL}; + + memset(&udi, 0, sizeof(udi)); + + udi.udi_lpni = lpni; + + cbs[UDSP_APPLY_ON_PEERS] = lnet_udsp_apply_rule_on_lpni; + + return lnet_udsp_apply_policies_helper(NULL, &udi, cbs); +} + +int +lnet_udsp_apply_policies_on_lpn(struct lnet_peer_net *lpn) +{ + struct udsp_info udi; + udsp_apply_rule cbs[UDSP_APPLY_MAX_ENUM] = {NULL}; + + memset(&udi, 0, sizeof(udi)); + + udi.udi_lpn = lpn; + + cbs[UDSP_APPLY_ON_PEERS] = lnet_udsp_apply_rule_on_lpn; + + return lnet_udsp_apply_policies_helper(NULL, &udi, cbs); +} + +int +lnet_udsp_apply_policies(struct lnet_udsp *udsp, bool revert) +{ + int rc; + struct udsp_info udi; + udsp_apply_rule cbs[UDSP_APPLY_MAX_ENUM] = {NULL}; + + memset(&udi, 0, sizeof(udi)); + + cbs[UDSP_APPLY_ON_PEERS] = lnet_udsp_apply_rule_on_lpnis; + cbs[UDSP_APPLY_PRIO_ON_NIS] = lnet_udsp_apply_rule_on_nis; + cbs[UDSP_APPLY_RTE_ON_NETS] = lnet_udsp_apply_rte_rule_on_nets; + + udi.udi_revert = revert; + + lnet_net_lock(LNET_LOCK_EX); + rc = 
lnet_udsp_apply_policies_helper(udsp, &udi, cbs); + lnet_net_unlock(LNET_LOCK_EX); + + return rc; +} + +struct lnet_udsp * +lnet_udsp_get_policy(int idx) +{ + int i = 0; + struct lnet_udsp *udsp = NULL; + bool found = false; + + CDEBUG(D_NET, "Get UDSP at idx = %d\n", idx); + + if (idx < 0) + return NULL; + + list_for_each_entry(udsp, &the_lnet.ln_udsp_list, udsp_on_list) { + CDEBUG(D_NET, "iterating over upsp %d:%d:%d\n", + udsp->udsp_idx, i, idx); + if (i == idx) { + found = true; + break; + } + i++; + } + + CDEBUG(D_NET, "Found UDSP (%p)\n", udsp); + + if (!found) + return NULL; + + return udsp; +} + +int +lnet_udsp_add_policy(struct lnet_udsp *new, int idx) +{ + struct lnet_udsp *udsp; + struct lnet_udsp *insert = NULL; + int i = 0; + + list_for_each_entry(udsp, &the_lnet.ln_udsp_list, udsp_on_list) { + CDEBUG(D_NET, "found udsp i = %d:%d, idx = %d\n", + i, udsp->udsp_idx, idx); + if (i == idx) { + insert = udsp; + new->udsp_idx = idx; + } + i++; + if (lnet_udsp_equal(udsp, new)) { + if (!lnet_udsp_action_equal(udsp, new) && + udsp->udsp_action_type == EN_LNET_UDSP_ACTION_PRIORITY && + new->udsp_action_type == EN_LNET_UDSP_ACTION_PRIORITY) { + udsp->udsp_action.udsp_priority = new->udsp_action.udsp_priority; + CDEBUG(D_NET, "udsp: %p index %d updated priority to %d\n", + udsp, + udsp->udsp_idx, + udsp->udsp_action.udsp_priority); + return 0; + } + return -EALREADY; + } + } + + if (insert) { + list_add(&new->udsp_on_list, insert->udsp_on_list.prev); + i = 0; + list_for_each_entry(udsp, + &the_lnet.ln_udsp_list, + udsp_on_list) { + if (i <= idx) { + i++; + continue; + } + udsp->udsp_idx++; + } + } else { + list_add_tail(&new->udsp_on_list, &the_lnet.ln_udsp_list); + new->udsp_idx = i; + } + + CDEBUG(D_NET, "udsp: %p added at index %d\n", new, new->udsp_idx); + + CDEBUG(D_NET, "udsp list:\n"); + list_for_each_entry(udsp, &the_lnet.ln_udsp_list, udsp_on_list) + CDEBUG(D_NET, "udsp %p:%d\n", udsp, udsp->udsp_idx); + + return 0; +} + +int +lnet_udsp_del_policy(int idx) +{ + struct lnet_udsp *udsp; + struct lnet_udsp *tmp; + bool removed = false; + + if (idx < 0) { + lnet_udsp_destroy(false); + return 0; + } + + CDEBUG(D_NET, "del udsp at idx = %d\n", idx); + + list_for_each_entry_safe(udsp, + tmp, + &the_lnet.ln_udsp_list, + udsp_on_list) { + if (removed) + udsp->udsp_idx--; + if (udsp->udsp_idx == idx && !removed) { + list_del_init(&udsp->udsp_on_list); + lnet_udsp_apply_policies(udsp, true); + lnet_udsp_free(udsp); + removed = true; + } + } + + return 0; +} + +static void +lnet_udsp_get_ni_info(struct lnet_ioctl_construct_udsp_info *info, + struct lnet_ni *ni) +{ + struct lnet_nid_list *ne; + struct lnet_net *net = ni->ni_net; + int i = 0; + + LASSERT(ni); + + info->cud_nid_priority = ni->ni_sel_priority; + if (net) { + info->cud_net_priority = ni->ni_net->net_sel_priority; + list_for_each_entry(ne, &net->net_rtr_pref_nids, nl_list) { + if (i < LNET_MAX_SHOW_NUM_NID) + info->cud_pref_rtr_nid[i] = + lnet_nid_to_nid4(&ne->nl_nid); + else + break; + i++; + } + } +} + +static void +lnet_udsp_get_peer_info(struct lnet_ioctl_construct_udsp_info *info, + struct lnet_peer_ni *lpni) +{ + struct lnet_nid_list *ne; + int i = 0; + + /* peer tree structure needs to be in existence */ + LASSERT(lpni && lpni->lpni_peer_net && + lpni->lpni_peer_net->lpn_peer); + + info->cud_nid_priority = lpni->lpni_sel_priority; + CDEBUG(D_NET, "lpni %s has %d pref nids\n", + libcfs_nidstr(&lpni->lpni_nid), + lpni->lpni_pref_nnids); + if (lpni->lpni_pref_nnids == 1) { + info->cud_pref_nid[0] = 
lnet_nid_to_nid4(&lpni->lpni_pref.nid); + } else if (lpni->lpni_pref_nnids > 1) { + struct list_head *list = &lpni->lpni_pref.nids; + + list_for_each_entry(ne, list, nl_list) { + if (i < LNET_MAX_SHOW_NUM_NID) + info->cud_pref_nid[i] = + lnet_nid_to_nid4(&ne->nl_nid); + else + break; + i++; + } + } + + i = 0; + list_for_each_entry(ne, &lpni->lpni_rtr_pref_nids, nl_list) { + if (i < LNET_MAX_SHOW_NUM_NID) + info->cud_pref_rtr_nid[i] = + lnet_nid_to_nid4(&ne->nl_nid); + else + break; + i++; + } + + info->cud_net_priority = lpni->lpni_peer_net->lpn_sel_priority; +} + +void +lnet_udsp_get_construct_info(struct lnet_ioctl_construct_udsp_info *info) +{ + struct lnet_ni *ni; + struct lnet_peer_ni *lpni; + + lnet_net_lock(0); + if (!info->cud_peer) { + ni = lnet_nid2ni_locked(info->cud_nid, 0); + if (ni) + lnet_udsp_get_ni_info(info, ni); + } else { + lpni = lnet_find_peer_ni_locked(info->cud_nid); + if (!lpni) { + CDEBUG(D_NET, "nid %s is not found\n", + libcfs_nid2str(info->cud_nid)); + } else { + lnet_udsp_get_peer_info(info, lpni); + lnet_peer_ni_decref_locked(lpni); + } + } + lnet_net_unlock(0); +} + +struct lnet_udsp * +lnet_udsp_alloc(void) +{ + struct lnet_udsp *udsp; + + udsp = kmem_cache_alloc(lnet_udsp_cachep, GFP_NOFS | __GFP_ZERO); + + if (!udsp) + return NULL; + + INIT_LIST_HEAD(&udsp->udsp_on_list); + INIT_LIST_HEAD(&udsp->udsp_src.ud_addr_range); + INIT_LIST_HEAD(&udsp->udsp_src.ud_net_id.udn_net_num_range); + INIT_LIST_HEAD(&udsp->udsp_dst.ud_addr_range); + INIT_LIST_HEAD(&udsp->udsp_dst.ud_net_id.udn_net_num_range); + INIT_LIST_HEAD(&udsp->udsp_rte.ud_addr_range); + INIT_LIST_HEAD(&udsp->udsp_rte.ud_net_id.udn_net_num_range); + + CDEBUG(D_MALLOC, "udsp alloc %p\n", udsp); + return udsp; +} + +static void +lnet_udsp_nid_descr_free(struct lnet_ud_nid_descr *nid_descr) +{ + struct list_head *net_range = &nid_descr->ud_net_id.udn_net_num_range; + + if (!lnet_udsp_criteria_present(nid_descr)) + return; + + /* memory management is a bit tricky here. When we allocate the + * memory to store the NID descriptor we allocate a large buffer + * for all the data, so we need to free the entire buffer at + * once. 
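That single buffer is allocated in copy_ioc_udsp_descr(), which records its size in ud_mem_size.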
If the net is present the net_range->next points to that + * buffer otherwise if the ud_addr_range is present then it's the + * ud_addr_range.next + */ + if (!list_empty(net_range)) + LIBCFS_FREE(net_range->next, nid_descr->ud_mem_size); + else if (!list_empty(&nid_descr->ud_addr_range)) + LIBCFS_FREE(nid_descr->ud_addr_range.next, + nid_descr->ud_mem_size); +} + +void +lnet_udsp_free(struct lnet_udsp *udsp) +{ + lnet_udsp_nid_descr_free(&udsp->udsp_src); + lnet_udsp_nid_descr_free(&udsp->udsp_dst); + lnet_udsp_nid_descr_free(&udsp->udsp_rte); + + CDEBUG(D_MALLOC, "udsp free %p\n", udsp); + kmem_cache_free(lnet_udsp_cachep, udsp); +} + +void +lnet_udsp_destroy(bool shutdown) +{ + struct lnet_udsp *udsp, *tmp; + + CDEBUG(D_NET, "Destroying UDSPs in the system\n"); + + list_for_each_entry_safe(udsp, tmp, &the_lnet.ln_udsp_list, + udsp_on_list) { + list_del(&udsp->udsp_on_list); + if (!shutdown) + lnet_udsp_apply_policies(udsp, true); + lnet_udsp_free(udsp); + } +} + +static size_t +lnet_size_marshaled_nid_descr(struct lnet_ud_nid_descr *descr) +{ + struct cfs_expr_list *expr; + int expr_count = 0; + int range_count = 0; + size_t size = sizeof(struct lnet_ioctl_udsp_descr); + + if (!lnet_udsp_criteria_present(descr)) + return size; + + /* we always have one net expression */ + if (!list_empty(&descr->ud_net_id.udn_net_num_range)) { + expr = list_first_entry(&descr->ud_net_id.udn_net_num_range, + struct cfs_expr_list, el_link); + + /* count the number of cfs_range_expr in the net expression */ + range_count = lnet_get_list_len(&expr->el_exprs); + } + + /* count the number of cfs_range_expr in the address expressions */ + list_for_each_entry(expr, &descr->ud_addr_range, el_link) { + expr_count++; + range_count += lnet_get_list_len(&expr->el_exprs); + } + + size += (sizeof(struct lnet_expressions) * expr_count); + size += (sizeof(struct lnet_range_expr) * range_count); + + return size; +} + +size_t +lnet_get_udsp_size(struct lnet_udsp *udsp) +{ + size_t size = sizeof(struct lnet_ioctl_udsp); + + size += lnet_size_marshaled_nid_descr(&udsp->udsp_src); + size += lnet_size_marshaled_nid_descr(&udsp->udsp_dst); + size += lnet_size_marshaled_nid_descr(&udsp->udsp_rte); + + CDEBUG(D_NET, "get udsp (%p) size: %d\n", udsp, (int)size); + + return size; +} + +static int +copy_exprs(struct cfs_expr_list *expr, void __user **bulk, + __u32 *bulk_size) +{ + struct cfs_range_expr *range; + struct lnet_range_expr range_expr; + + /* copy over the net range expressions to the bulk */ + list_for_each_entry(range, &expr->el_exprs, re_link) { + range_expr.re_lo = range->re_lo; + range_expr.re_hi = range->re_hi; + range_expr.re_stride = range->re_stride; + CDEBUG(D_NET, "Copy Range %u:%u:%u\n", + range_expr.re_lo, range_expr.re_hi, + range_expr.re_stride); + if (copy_to_user(*bulk, &range_expr, sizeof(range_expr))) { + CDEBUG(D_NET, "Failed to copy range_expr\n"); + return -EFAULT; + } + *bulk += sizeof(range_expr); + *bulk_size -= sizeof(range_expr); + } + + return 0; +} + +static int +copy_nid_range(struct lnet_ud_nid_descr *nid_descr, char *type, + void __user **bulk, __u32 *bulk_size) +{ + struct lnet_ioctl_udsp_descr ioc_udsp_descr; + struct cfs_expr_list *expr; + struct lnet_expressions ioc_expr; + int expr_count; + int net_expr_count; + int rc; + + memset(&ioc_udsp_descr, 0, sizeof(ioc_udsp_descr)); + ioc_udsp_descr.iud_src_hdr.ud_descr_type = *(__u32 *)type; + + /* if criteria not present, copy over the static part of the NID + * descriptor + */ + if (!lnet_udsp_criteria_present(nid_descr)) { + CDEBUG(D_NET, 
"Descriptor %u:%u:%u:%u\n", + ioc_udsp_descr.iud_src_hdr.ud_descr_type, + ioc_udsp_descr.iud_src_hdr.ud_descr_count, + ioc_udsp_descr.iud_net.ud_net_type, + ioc_udsp_descr.iud_net.ud_net_num_expr.le_count); + if (copy_to_user(*bulk, &ioc_udsp_descr, + sizeof(ioc_udsp_descr))) { + CDEBUG(D_NET, "failed to copy ioc_udsp_descr\n"); + return -EFAULT; + } + *bulk += sizeof(ioc_udsp_descr); + *bulk_size -= sizeof(ioc_udsp_descr); + return 0; + } + + expr_count = lnet_get_list_len(&nid_descr->ud_addr_range); + + /* copy the net information */ + if (!list_empty(&nid_descr->ud_net_id.udn_net_num_range)) { + expr = list_first_entry(&nid_descr->ud_net_id.udn_net_num_range, + struct cfs_expr_list, el_link); + net_expr_count = lnet_get_list_len(&expr->el_exprs); + } else { + net_expr_count = 0; + } + + /* set the total expression count */ + ioc_udsp_descr.iud_src_hdr.ud_descr_count = expr_count; + ioc_udsp_descr.iud_net.ud_net_type = + nid_descr->ud_net_id.udn_net_type; + ioc_udsp_descr.iud_net.ud_net_num_expr.le_count = net_expr_count; + + CDEBUG(D_NET, "Descriptor %u:%u:%u:%u\n", + ioc_udsp_descr.iud_src_hdr.ud_descr_type, + ioc_udsp_descr.iud_src_hdr.ud_descr_count, + ioc_udsp_descr.iud_net.ud_net_type, + ioc_udsp_descr.iud_net.ud_net_num_expr.le_count); + + /* copy over the header info to the bulk */ + if (copy_to_user(*bulk, &ioc_udsp_descr, sizeof(ioc_udsp_descr))) { + CDEBUG(D_NET, "Failed to copy data\n"); + return -EFAULT; + } + *bulk += sizeof(ioc_udsp_descr); + *bulk_size -= sizeof(ioc_udsp_descr); + + /* copy over the net num expression if it exists */ + if (net_expr_count) { + rc = copy_exprs(expr, bulk, bulk_size); + if (rc) + return rc; + } + + /* copy the address range */ + list_for_each_entry(expr, &nid_descr->ud_addr_range, el_link) { + ioc_expr.le_count = lnet_get_list_len(&expr->el_exprs); + if (copy_to_user(*bulk, &ioc_expr, sizeof(ioc_expr))) { + CDEBUG(D_NET, "failex to copy ioc_expr\n"); + return -EFAULT; + } + *bulk += sizeof(ioc_expr); + *bulk_size -= sizeof(ioc_expr); + + rc = copy_exprs(expr, bulk, bulk_size); + if (rc) + return rc; + } + + return 0; +} + +int +lnet_udsp_marshal(struct lnet_udsp *udsp, struct lnet_ioctl_udsp *ioc_udsp) +{ + int rc = -ENOMEM; + void __user *bulk; + __u32 bulk_size; + + if (!ioc_udsp) + return -EINVAL; + + bulk = ioc_udsp->iou_bulk; + bulk_size = ioc_udsp->iou_hdr.ioc_len + + ioc_udsp->iou_bulk_size; + + CDEBUG(D_NET, "marshal udsp (%p)\n", udsp); + CDEBUG(D_NET, "MEM -----> bulk: %p:0x%x\n", bulk, bulk_size); + /* make sure user space allocated enough buffer to marshal the + * udsp + */ + if (bulk_size != lnet_get_udsp_size(udsp)) { + rc = -ENOSPC; + goto fail; + } + + ioc_udsp->iou_idx = udsp->udsp_idx; + ioc_udsp->iou_action_type = udsp->udsp_action_type; + ioc_udsp->iou_action.priority = udsp->udsp_action.udsp_priority; + + bulk_size -= sizeof(*ioc_udsp); + + rc = copy_nid_range(&udsp->udsp_src, "SRC", &bulk, &bulk_size); + if (rc) + goto fail; + + rc = copy_nid_range(&udsp->udsp_dst, "DST", &bulk, &bulk_size); + if (rc) + goto fail; + + rc = copy_nid_range(&udsp->udsp_rte, "RTE", &bulk, &bulk_size); + if (rc) + goto fail; + + CDEBUG(D_NET, "MEM <----- bulk: %p\n", bulk); + + /* we should've consumed the entire buffer */ + LASSERT(bulk_size == 0); + return 0; + +fail: + CERROR("Failed to marshal udsp: %d\n", rc); + return rc; +} + +static void +copy_range_info(void **bulk, void **buf, struct list_head *list, + int count) +{ + struct lnet_range_expr *range_expr; + struct cfs_range_expr *range; + struct cfs_expr_list *exprs; + int 
+
+static void
+copy_range_info(void **bulk, void **buf, struct list_head *list,
+ int count)
+{
+ struct lnet_range_expr *range_expr;
+ struct cfs_range_expr *range;
+ struct cfs_expr_list *exprs;
+ int range_count = count;
+ int i;
+
+ if (range_count == 0)
+ return;
+
+ if (range_count == -1) {
+ struct lnet_expressions *e;
+
+ e = *bulk;
+ range_count = e->le_count;
+ *bulk += sizeof(*e);
+ }
+
+ exprs = *buf;
+ INIT_LIST_HEAD(&exprs->el_link);
+ INIT_LIST_HEAD(&exprs->el_exprs);
+ list_add_tail(&exprs->el_link, list);
+ *buf += sizeof(*exprs);
+
+ for (i = 0; i < range_count; i++) {
+ range_expr = *bulk;
+ range = *buf;
+ INIT_LIST_HEAD(&range->re_link);
+ range->re_lo = range_expr->re_lo;
+ range->re_hi = range_expr->re_hi;
+ range->re_stride = range_expr->re_stride;
+ CDEBUG(D_NET, "Copy Range %u:%u:%u\n",
+ range->re_lo,
+ range->re_hi,
+ range->re_stride);
+ list_add_tail(&range->re_link, &exprs->el_exprs);
+ *bulk += sizeof(*range_expr);
+ *buf += sizeof(*range);
+ }
+}
+
+static int
+copy_ioc_udsp_descr(struct lnet_ud_nid_descr *nid_descr, char *type,
+ void **bulk, __u32 *bulk_size)
+{
+ struct lnet_ioctl_udsp_descr *ioc_nid = *bulk;
+ struct lnet_expressions *exprs;
+ __u32 descr_type;
+ int expr_count = 0;
+ int range_count = 0;
+ int i;
+ __u32 size;
+ int remaining_size = *bulk_size;
+ void *tmp = *bulk;
+ __u32 alloc_size;
+ void *buf;
+ size_t range_expr_s = sizeof(struct lnet_range_expr);
+ size_t lnet_exprs_s = sizeof(struct lnet_expressions);
+
+ CDEBUG(D_NET, "%s: bulk = %p:%u\n", type, *bulk, *bulk_size);
+
+ /* criteria not present, skip over the static part of the
+ * bulk, which is included for each NID descriptor
+ */
+ if (ioc_nid->iud_net.ud_net_type == 0) {
+ remaining_size -= sizeof(*ioc_nid);
+ if (remaining_size < 0) {
+ CERROR("Truncated userspace udsp buffer given\n");
+ return -EINVAL;
+ }
+ *bulk += sizeof(*ioc_nid);
+ *bulk_size = remaining_size;
+ return 0;
+ }
+
+ descr_type = ioc_nid->iud_src_hdr.ud_descr_type;
+ if (descr_type != *(__u32 *)type) {
+ CERROR("Bad NID descriptor type. Expected %s, given %c%c%c\n",
+ type, (__u8)descr_type, (__u8)(descr_type << 4),
+ (__u8)(descr_type << 8));
+ return -EINVAL;
+ }
+
+ /* calculate the total size to verify we have enough buffer.
+ * Start off by finding how many ranges there are for the net
+ * expression.
+ */
+ range_count = ioc_nid->iud_net.ud_net_num_expr.le_count;
+ size = sizeof(*ioc_nid) + (range_count * range_expr_s);
+ remaining_size -= size;
+ if (remaining_size < 0) {
+ CERROR("Truncated userspace udsp buffer given\n");
+ return -EINVAL;
+ }
+
+ CDEBUG(D_NET, "Total net num ranges in %s: %d:%u\n", type,
+ range_count, size);
+ /* the number of expressions for the NID. 
IE 4 for IP, 1 for GNI */ + expr_count = ioc_nid->iud_src_hdr.ud_descr_count; + CDEBUG(D_NET, "addr as %d exprs\n", expr_count); + /* point tmp to the beginning of the NID expressions */ + tmp += size; + for (i = 0; i < expr_count; i++) { + /* get the number of ranges per expression */ + exprs = tmp; + range_count += exprs->le_count; + size = (range_expr_s * exprs->le_count) + lnet_exprs_s; + remaining_size -= size; + CDEBUG(D_NET, "expr %d:%d:%u:%d:%d\n", i, exprs->le_count, + size, remaining_size, range_count); + if (remaining_size < 0) { + CERROR("Truncated userspace udsp buffer given\n"); + return -EINVAL; + } + tmp += size; + } + + *bulk_size = remaining_size; + + /* copy over the net type */ + nid_descr->ud_net_id.udn_net_type = ioc_nid->iud_net.ud_net_type; + + CDEBUG(D_NET, "%u\n", nid_descr->ud_net_id.udn_net_type); + + /* allocate the total memory required to copy this NID descriptor */ + alloc_size = (sizeof(struct cfs_expr_list) * (expr_count + 1)) + + (sizeof(struct cfs_range_expr) * (range_count)); + LIBCFS_ALLOC(buf, alloc_size); + if (!buf) + return -ENOMEM; + + /* store the amount of memory allocated so we can free it later on */ + nid_descr->ud_mem_size = alloc_size; + + /* copy over the net number range */ + range_count = ioc_nid->iud_net.ud_net_num_expr.le_count; + *bulk += sizeof(*ioc_nid); + CDEBUG(D_NET, "bulk = %p\n", *bulk); + copy_range_info(bulk, &buf, &nid_descr->ud_net_id.udn_net_num_range, + range_count); + CDEBUG(D_NET, "bulk = %p\n", *bulk); + + /* copy over the NID descriptor */ + for (i = 0; i < expr_count; i++) { + copy_range_info(bulk, &buf, &nid_descr->ud_addr_range, -1); + CDEBUG(D_NET, "bulk = %p\n", *bulk); + } + + return 0; +} + +int +lnet_udsp_demarshal_add(void *bulk, __u32 bulk_size) +{ + struct lnet_ioctl_udsp *ioc_udsp; + struct lnet_udsp *udsp; + int rc = -ENOMEM; + int idx; + + if (bulk_size < sizeof(*ioc_udsp)) + return -ENOSPC; + + udsp = lnet_udsp_alloc(); + if (!udsp) + return rc; + + ioc_udsp = bulk; + + udsp->udsp_action_type = ioc_udsp->iou_action_type; + udsp->udsp_action.udsp_priority = ioc_udsp->iou_action.priority; + idx = ioc_udsp->iou_idx; + + CDEBUG(D_NET, "demarshal descr %u:%u:%d:%u\n", udsp->udsp_action_type, + udsp->udsp_action.udsp_priority, idx, bulk_size); + + bulk += sizeof(*ioc_udsp); + bulk_size -= sizeof(*ioc_udsp); + + rc = copy_ioc_udsp_descr(&udsp->udsp_src, "SRC", &bulk, &bulk_size); + if (rc < 0) + goto free_udsp; + + rc = copy_ioc_udsp_descr(&udsp->udsp_dst, "DST", &bulk, &bulk_size); + if (rc < 0) + goto free_udsp; + + rc = copy_ioc_udsp_descr(&udsp->udsp_rte, "RTE", &bulk, &bulk_size); + if (rc < 0) + goto free_udsp; + + return lnet_udsp_add_policy(udsp, idx); + +free_udsp: + lnet_udsp_free(udsp); + return rc; +} diff --git a/drivers/staging/lustrefsx/lnet/selftest/brw_test.c b/drivers/staging/lustrefsx/lnet/selftest/brw_test.c new file mode 100644 index 0000000000000..2e77d8fa6d6b6 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/brw_test.c @@ -0,0 +1,524 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lnet/selftest/brw_test.c
+ *
+ * Author: Isaac Huang
+ */
+
+#include "selftest.h"
+
+static int brw_srv_workitems = SFW_TEST_WI_MAX;
+module_param(brw_srv_workitems, int, 0644);
+MODULE_PARM_DESC(brw_srv_workitems, "# BRW server workitems");
+
+static int brw_inject_errors;
+module_param(brw_inject_errors, int, 0644);
+MODULE_PARM_DESC(brw_inject_errors, "# data errors to inject randomly, zero by default");
+
+#define BRW_POISON 0xbeefbeefbeefbeefULL
+#define BRW_MAGIC 0xeeb0eeb1eeb2eeb3ULL
+#define BRW_MSIZE sizeof(__u64)
+
+static void
+brw_client_fini(struct sfw_test_instance *tsi)
+{
+ struct srpc_bulk *bulk;
+ struct sfw_test_unit *tsu;
+
+ LASSERT(tsi->tsi_is_client);
+
+ list_for_each_entry(tsu, &tsi->tsi_units, tsu_list) {
+ bulk = tsu->tsu_private;
+ if (bulk == NULL)
+ continue;
+
+ srpc_free_bulk(bulk);
+ tsu->tsu_private = NULL;
+ }
+}
+
+static int
+brw_client_init(struct sfw_test_instance *tsi)
+{
+ struct sfw_session *sn = tsi->tsi_batch->bat_session;
+ int flags;
+ int off;
+ int npg;
+ int len;
+ int opc;
+ struct srpc_bulk *bulk;
+ struct sfw_test_unit *tsu;
+
+ LASSERT(sn != NULL);
+ LASSERT(tsi->tsi_is_client);
+
+ if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) {
+ struct test_bulk_req *breq = &tsi->tsi_u.bulk_v0;
+
+ opc = breq->blk_opc;
+ flags = breq->blk_flags;
+ npg = breq->blk_npg;
+ /* NB: this is not going to work for variable page size,
+ * but we have to keep it for compatibility */
+ len = npg * PAGE_SIZE;
+ off = 0;
+
+ } else {
+ struct test_bulk_req_v1 *breq = &tsi->tsi_u.bulk_v1;
+
+ /* we should never get to this step with an unknown feature
+ * because make_session will reject unknown features */
+ LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0);
+
+ opc = breq->blk_opc;
+ flags = breq->blk_flags;
+ len = breq->blk_len;
+ off = breq->blk_offset & ~PAGE_MASK;
+ npg = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ }
+
+ if (off % BRW_MSIZE != 0)
+ return -EINVAL;
+
+ if (npg > LNET_MAX_IOV || npg <= 0)
+ return -EINVAL;
+
+ if (opc != LST_BRW_READ && opc != LST_BRW_WRITE)
+ return -EINVAL;
+
+ if (flags != LST_BRW_CHECK_NONE &&
+ flags != LST_BRW_CHECK_FULL && flags != LST_BRW_CHECK_SIMPLE)
+ return -EINVAL;
+
+ list_for_each_entry(tsu, &tsi->tsi_units, tsu_list) {
+ bulk = srpc_alloc_bulk(lnet_cpt_of_nid(tsu->tsu_dest.nid, NULL),
+ off, npg, len, opc == LST_BRW_READ);
+ if (bulk == NULL) {
+ brw_client_fini(tsi);
+ return -ENOMEM;
+ }
+
+ tsu->tsu_private = bulk;
+ }
+
+ return 0;
+}
+
+static int brw_inject_one_error(void)
+{
+ struct timespec64 ts;
+
+ if (brw_inject_errors <= 0)
+ return 0;
+
+ ktime_get_ts64(&ts);
+
+ if (((ts.tv_nsec / NSEC_PER_USEC) & 1) == 0)
+ return 0;
+
+ return brw_inject_errors--;
+}
+
+static void
+brw_fill_page(struct page *pg, int off, int len, int pattern, __u64 magic)
+{
+ char *addr = page_address(pg) + off;
+ int i;
+
+ LASSERT(addr 
!= NULL); + LASSERT(off % BRW_MSIZE == 0 && len % BRW_MSIZE == 0); + + if (pattern == LST_BRW_CHECK_NONE) + return; + + if (magic == BRW_MAGIC) + magic += brw_inject_one_error(); + + if (pattern == LST_BRW_CHECK_SIMPLE) { + memcpy(addr, &magic, BRW_MSIZE); + if (len > BRW_MSIZE) { + addr += len - BRW_MSIZE; + memcpy(addr, &magic, BRW_MSIZE); + } + return; + } + + if (pattern == LST_BRW_CHECK_FULL) { + for (i = 0; i < len; i += BRW_MSIZE) + memcpy(addr + i, &magic, BRW_MSIZE); + return; + } + LBUG(); +} + +static int +brw_check_page(struct page *pg, int off, int len, int pattern, __u64 magic) +{ + char *addr = page_address(pg) + off; + __u64 data = 0; /* make compiler happy */ + int i; + + LASSERT(addr != NULL); + LASSERT(off % BRW_MSIZE == 0 && len % BRW_MSIZE == 0); + + if (pattern == LST_BRW_CHECK_NONE) + return 0; + + if (pattern == LST_BRW_CHECK_SIMPLE) { + data = *((__u64 *) addr); + if (data != magic) + goto bad_data; + + if (len > BRW_MSIZE) { + addr += len - BRW_MSIZE; + data = *((__u64 *) addr); + if (data != magic) + goto bad_data; + } + return 0; + } + + if (pattern == LST_BRW_CHECK_FULL) { + for (i = 0; i < len; i += BRW_MSIZE) { + data = *(__u64 *)(addr + i); + if (data != magic) + goto bad_data; + } + return 0; + } + + LBUG(); + +bad_data: + CERROR ("Bad data in page %p: %#llx, %#llx expected\n", + pg, data, magic); + return 1; +} + +static void +brw_fill_bulk(struct srpc_bulk *bk, int pattern, __u64 magic) +{ + int i; + struct page *pg; + + for (i = 0; i < bk->bk_niov; i++) { + int off; + int len; + + pg = bk->bk_iovs[i].bv_page; + off = bk->bk_iovs[i].bv_offset; + len = bk->bk_iovs[i].bv_len; + brw_fill_page(pg, off, len, pattern, magic); + } +} + +static int +brw_check_bulk(struct srpc_bulk *bk, int pattern, __u64 magic) +{ + int i; + struct page *pg; + + for (i = 0; i < bk->bk_niov; i++) { + int off; + int len; + + pg = bk->bk_iovs[i].bv_page; + off = bk->bk_iovs[i].bv_offset; + len = bk->bk_iovs[i].bv_len; + if (brw_check_page(pg, off, len, pattern, magic) != 0) { + CERROR("Bulk page %p (%d/%d) is corrupted!\n", + pg, i, bk->bk_niov); + return 1; + } + } + + return 0; +} + +static int +brw_client_prep_rpc(struct sfw_test_unit *tsu, struct lnet_process_id dest, + struct srpc_client_rpc **rpcpp) +{ + struct srpc_bulk *bulk = tsu->tsu_private; + struct sfw_test_instance *tsi = tsu->tsu_instance; + struct sfw_session *sn = tsi->tsi_batch->bat_session; + struct srpc_client_rpc *rpc; + struct srpc_brw_reqst *req; + int flags; + int npg; + int len; + int opc; + int rc; + + LASSERT(sn != NULL); + LASSERT(bulk != NULL); + + if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) { + struct test_bulk_req *breq = &tsi->tsi_u.bulk_v0; + + opc = breq->blk_opc; + flags = breq->blk_flags; + npg = breq->blk_npg; + len = npg * PAGE_SIZE; + + } else { + struct test_bulk_req_v1 *breq = &tsi->tsi_u.bulk_v1; + int off; + + /* I should never get this step if it's unknown feature + * because make_session will reject unknown feature */ + LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0); + + opc = breq->blk_opc; + flags = breq->blk_flags; + len = breq->blk_len; + off = breq->blk_offset; + npg = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + } + + rc = sfw_create_test_rpc(tsu, dest, sn->sn_features, npg, len, &rpc); + if (rc != 0) + return rc; + + memcpy(&rpc->crpc_bulk, bulk, offsetof(struct srpc_bulk, bk_iovs[npg])); + if (opc == LST_BRW_WRITE) + brw_fill_bulk(&rpc->crpc_bulk, flags, BRW_MAGIC); + else + brw_fill_bulk(&rpc->crpc_bulk, flags, BRW_POISON); + + req = &rpc->crpc_reqstmsg.msg_body.brw_reqst; 
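+ /* describe the transfer so brw_server_handle() can allocate and
+ * verify the matching bulk on the remote side */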
+ req->brw_flags = flags; + req->brw_rw = opc; + req->brw_len = len; + + *rpcpp = rpc; + return 0; +} + +static void +brw_client_done_rpc(struct sfw_test_unit *tsu, struct srpc_client_rpc *rpc) +{ + __u64 magic = BRW_MAGIC; + struct sfw_test_instance *tsi = tsu->tsu_instance; + struct sfw_session *sn = tsi->tsi_batch->bat_session; + struct srpc_msg *msg = &rpc->crpc_replymsg; + struct srpc_brw_reply *reply = &msg->msg_body.brw_reply; + struct srpc_brw_reqst *reqst = &rpc->crpc_reqstmsg.msg_body.brw_reqst; + + LASSERT(sn != NULL); + + if (rpc->crpc_status != 0) { + CERROR("BRW RPC to %s failed with %d\n", + libcfs_id2str(rpc->crpc_dest), rpc->crpc_status); + if (!tsi->tsi_stopping) /* rpc could have been aborted */ + atomic_inc(&sn->sn_brw_errors); + return; + } + + if (msg->msg_magic != SRPC_MSG_MAGIC) { + __swab64s(&magic); + __swab32s(&reply->brw_status); + } + + CDEBUG(reply->brw_status ? D_WARNING : D_NET, + "BRW RPC to %s finished with brw_status: %d\n", + libcfs_id2str(rpc->crpc_dest), reply->brw_status); + + if (reply->brw_status != 0) { + atomic_inc(&sn->sn_brw_errors); + rpc->crpc_status = -(int)reply->brw_status; + return; + } + + if (reqst->brw_rw == LST_BRW_WRITE) + return; + + if (brw_check_bulk(&rpc->crpc_bulk, reqst->brw_flags, magic) != 0) { + CERROR("Bulk data from %s is corrupted!\n", + libcfs_id2str(rpc->crpc_dest)); + atomic_inc(&sn->sn_brw_errors); + rpc->crpc_status = -EBADMSG; + } +} + +static void +brw_server_rpc_done(struct srpc_server_rpc *rpc) +{ + struct srpc_bulk *blk = rpc->srpc_bulk; + + if (blk == NULL) + return; + + if (rpc->srpc_status != 0) + CERROR("Bulk transfer %s %s has failed: %d\n", + blk->bk_sink ? "from" : "to", + libcfs_id2str(rpc->srpc_peer), rpc->srpc_status); + else + CDEBUG(D_NET, "Transferred %d pages bulk data %s %s\n", + blk->bk_niov, blk->bk_sink ? "from" : "to", + libcfs_id2str(rpc->srpc_peer)); + + sfw_free_pages(rpc); +} + +static int +brw_bulk_ready(struct srpc_server_rpc *rpc, int status) +{ + __u64 magic = BRW_MAGIC; + struct srpc_brw_reply *reply = &rpc->srpc_replymsg.msg_body.brw_reply; + struct srpc_brw_reqst *reqst; + struct srpc_msg *reqstmsg; + + LASSERT (rpc->srpc_bulk != NULL); + LASSERT (rpc->srpc_reqstbuf != NULL); + + reqstmsg = &rpc->srpc_reqstbuf->buf_msg; + reqst = &reqstmsg->msg_body.brw_reqst; + + if (status != 0) { + CERROR ("BRW bulk %s failed for RPC from %s: %d\n", + reqst->brw_rw == LST_BRW_READ ? 
"READ" : "WRITE", + libcfs_id2str(rpc->srpc_peer), status); + return -EIO; + } + + if (reqst->brw_rw == LST_BRW_READ) + return 0; + + if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) + __swab64s(&magic); + + if (brw_check_bulk(rpc->srpc_bulk, reqst->brw_flags, magic) != 0) { + CERROR ("Bulk data from %s is corrupted!\n", + libcfs_id2str(rpc->srpc_peer)); + reply->brw_status = EBADMSG; + } + + return 0; +} + +static int +brw_server_handle(struct srpc_server_rpc *rpc) +{ + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + struct srpc_msg *replymsg = &rpc->srpc_replymsg; + struct srpc_msg *reqstmsg = &rpc->srpc_reqstbuf->buf_msg; + struct srpc_brw_reply *reply = &replymsg->msg_body.brw_reply; + struct srpc_brw_reqst *reqst = &reqstmsg->msg_body.brw_reqst; + int npg; + int rc; + + LASSERT (sv->sv_id == SRPC_SERVICE_BRW); + + if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) { + LASSERT (reqstmsg->msg_magic == __swab32(SRPC_MSG_MAGIC)); + + __swab32s(&reqst->brw_rw); + __swab32s(&reqst->brw_len); + __swab32s(&reqst->brw_flags); + __swab64s(&reqst->brw_rpyid); + __swab64s(&reqst->brw_bulkid); + } + LASSERT (reqstmsg->msg_type == (__u32)srpc_service2request(sv->sv_id)); + + reply->brw_status = 0; + rpc->srpc_done = brw_server_rpc_done; + + if ((reqst->brw_rw != LST_BRW_READ && reqst->brw_rw != LST_BRW_WRITE) || + (reqst->brw_flags != LST_BRW_CHECK_NONE && + reqst->brw_flags != LST_BRW_CHECK_FULL && + reqst->brw_flags != LST_BRW_CHECK_SIMPLE)) { + reply->brw_status = EINVAL; + return 0; + } + + if ((reqstmsg->msg_ses_feats & ~LST_FEATS_MASK) != 0) { + replymsg->msg_ses_feats = LST_FEATS_MASK; + reply->brw_status = EPROTO; + return 0; + } + + if ((reqstmsg->msg_ses_feats & LST_FEAT_BULK_LEN) == 0) { + /* compat with old version */ + if ((reqst->brw_len & ~PAGE_MASK) != 0) { + reply->brw_status = EINVAL; + return 0; + } + npg = reqst->brw_len >> PAGE_SHIFT; + + } else { + npg = (reqst->brw_len + PAGE_SIZE - 1) >> PAGE_SHIFT; + } + + replymsg->msg_ses_feats = reqstmsg->msg_ses_feats; + + if (reqst->brw_len == 0 || npg > LNET_MAX_IOV) { + reply->brw_status = EINVAL; + return 0; + } + + rc = sfw_alloc_pages(rpc, rpc->srpc_scd->scd_cpt, npg, + reqst->brw_len, + reqst->brw_rw == LST_BRW_WRITE); + if (rc != 0) + return rc; + + if (reqst->brw_rw == LST_BRW_READ) + brw_fill_bulk(rpc->srpc_bulk, reqst->brw_flags, BRW_MAGIC); + else + brw_fill_bulk(rpc->srpc_bulk, reqst->brw_flags, BRW_POISON); + + return 0; +} + +struct sfw_test_client_ops brw_test_client; + +void brw_init_test_client(void) +{ + brw_test_client.tso_init = brw_client_init; + brw_test_client.tso_fini = brw_client_fini; + brw_test_client.tso_prep_rpc = brw_client_prep_rpc; + brw_test_client.tso_done_rpc = brw_client_done_rpc; +}; + +struct srpc_service brw_test_service; + +void brw_init_test_service(void) +{ + brw_test_service.sv_id = SRPC_SERVICE_BRW; + brw_test_service.sv_name = "brw_test"; + brw_test_service.sv_handler = brw_server_handle; + brw_test_service.sv_bulk_ready = brw_bulk_ready; + brw_test_service.sv_wi_total = brw_srv_workitems; +} diff --git a/drivers/staging/lustrefsx/lnet/selftest/conctl.c b/drivers/staging/lustrefsx/lnet/selftest/conctl.c new file mode 100644 index 0000000000000..9afbdae89d398 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/conctl.c @@ -0,0 +1,929 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2014, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lnet/selftest/conctl.c
+ *
+ * ioctl handling in the kernel
+ *
+ * Author: Liang Zhen
+ */
+
+#include <libcfs/libcfs.h>
+#include <lnet/lib-lnet.h>
+#include "console.h"
+
+static int
+lst_session_new_ioctl(struct lstio_session_new_args *args)
+{
+ char *name;
+ int rc;
+
+ if (args->lstio_ses_idp == NULL || /* address for output sid */
+ args->lstio_ses_key == 0 || /* no key is specified */
+ args->lstio_ses_namep == NULL || /* session name */
+ args->lstio_ses_nmlen <= 0 ||
+ args->lstio_ses_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ LIBCFS_ALLOC(name, args->lstio_ses_nmlen + 1);
+ if (name == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(name, args->lstio_ses_namep,
+ args->lstio_ses_nmlen)) {
+ LIBCFS_FREE(name, args->lstio_ses_nmlen + 1);
+ return -EFAULT;
+ }
+
+ name[args->lstio_ses_nmlen] = 0;
+
+ rc = lstcon_session_new(name,
+ args->lstio_ses_key,
+ args->lstio_ses_feats,
+ args->lstio_ses_timeout,
+ args->lstio_ses_force,
+ args->lstio_ses_idp);
+
+ LIBCFS_FREE(name, args->lstio_ses_nmlen + 1);
+ return rc;
+}
+
+static int
+lst_session_end_ioctl(struct lstio_session_end_args *args)
+{
+ if (args->lstio_ses_key != console_session.ses_key)
+ return -EACCES;
+
+ return lstcon_session_end();
+}
+
+static int
+lst_session_info_ioctl(struct lstio_session_info_args *args)
+{
+ /* no checking of key */
+
+ if (args->lstio_ses_idp == NULL || /* address for output sid */
+ args->lstio_ses_keyp == NULL || /* address for output key */
+ args->lstio_ses_featp == NULL || /* address for output features */
+ args->lstio_ses_ndinfo == NULL || /* address for output ndinfo */
+ args->lstio_ses_namep == NULL || /* address for output name */
+ args->lstio_ses_nmlen <= 0 ||
+ args->lstio_ses_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ return lstcon_session_info(args->lstio_ses_idp,
+ args->lstio_ses_keyp,
+ args->lstio_ses_featp,
+ args->lstio_ses_ndinfo,
+ args->lstio_ses_namep,
+ args->lstio_ses_nmlen);
+}
+
+static int
+lst_debug_ioctl(struct lstio_debug_args *args)
+{
+ char *name = NULL;
+ int client = 1;
+ int rc;
+
+ if (args->lstio_dbg_key != console_session.ses_key)
+ return -EACCES;
+
+ if (args->lstio_dbg_resultp == NULL)
+ return -EINVAL;
+
+ if (args->lstio_dbg_namep != NULL && /* name of batch/group */
+ (args->lstio_dbg_nmlen <= 0 ||
+ args->lstio_dbg_nmlen > LST_NAME_SIZE))
+ return -EINVAL;
+
+ if (args->lstio_dbg_namep != NULL) {
+ LIBCFS_ALLOC(name, args->lstio_dbg_nmlen + 1);
+ if (name == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(name, args->lstio_dbg_namep,
+ args->lstio_dbg_nmlen)) {
+ LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1);
+
+ return -EFAULT;
+ }
+
+ name[args->lstio_dbg_nmlen] = 0;
+ }
+
+ rc = -EINVAL;
+
+ switch (args->lstio_dbg_type) {
+ case LST_OPC_SESSION:
+ rc = lstcon_session_debug(args->lstio_dbg_timeout,
+ args->lstio_dbg_resultp);
+ break;
+
+ case LST_OPC_BATCHSRV:
+ client = 0;
+ fallthrough;
+ case LST_OPC_BATCHCLI:
+ if (name == NULL)
+ goto out;
+
+ rc = lstcon_batch_debug(args->lstio_dbg_timeout,
+ name, client, args->lstio_dbg_resultp);
+ break;
+
+ case LST_OPC_GROUP:
+ if (name == NULL)
+ goto out;
+
+ rc = lstcon_group_debug(args->lstio_dbg_timeout,
+ name, args->lstio_dbg_resultp);
+ break;
+
+ case LST_OPC_NODES:
+ if (args->lstio_dbg_count <= 0 ||
+ args->lstio_dbg_idsp == NULL)
+ goto out;
+
+ rc = lstcon_nodes_debug(args->lstio_dbg_timeout,
+ args->lstio_dbg_count,
+ args->lstio_dbg_idsp,
+ args->lstio_dbg_resultp);
+ break;
+
+ default:
+ break;
+ }
+
+out:
+ if (name != NULL)
+ LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1);
+
+ return rc;
+}
+
+static int
+lst_group_add_ioctl(struct lstio_group_add_args *args)
+{
+ char *name;
+ int rc;
+
+ if (args->lstio_grp_key != console_session.ses_key)
+ return -EACCES;
+
+ if (args->lstio_grp_namep == NULL ||
+ args->lstio_grp_nmlen <= 0 ||
+ args->lstio_grp_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+ if (name == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(name, args->lstio_grp_namep,
+ args->lstio_grp_nmlen)) {
+ LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+ return -EFAULT;
+ }
+
+ name[args->lstio_grp_nmlen] = 0;
+
+ rc = lstcon_group_add(name);
+
+ LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+ return rc;
+}
+
+static int
+lst_group_del_ioctl(struct lstio_group_del_args *args)
+{
+ int rc;
+ char *name;
+
+ if (args->lstio_grp_key != console_session.ses_key)
+ return -EACCES;
+
+ if (args->lstio_grp_namep == NULL ||
+ args->lstio_grp_nmlen <= 0 ||
+ args->lstio_grp_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+ if (name == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(name, args->lstio_grp_namep,
+ args->lstio_grp_nmlen)) {
+ LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+ return -EFAULT;
+ }
+
+ name[args->lstio_grp_nmlen] = 0;
+
+ rc = lstcon_group_del(name);
+
+ LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+ return rc;
+}
+
+static int
+lst_group_update_ioctl(struct lstio_group_update_args *args)
+{
+ int rc;
+ char *name;
+
+ if (args->lstio_grp_key != console_session.ses_key)
+ return -EACCES;
+
+ if (args->lstio_grp_resultp == NULL ||
+ args->lstio_grp_namep == NULL ||
+ args->lstio_grp_nmlen <= 0 ||
+ args->lstio_grp_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+ if (name == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(name, args->lstio_grp_namep,
+ args->lstio_grp_nmlen)) {
+ LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+ return -EFAULT;
+ }
+
+ name[args->lstio_grp_nmlen] = 0;
+
+ switch (args->lstio_grp_opc) {
+ case LST_GROUP_CLEAN:
+ rc = lstcon_group_clean(name, args->lstio_grp_args);
+ break;
+
+ case LST_GROUP_REFRESH:
+ rc = lstcon_group_refresh(name, args->lstio_grp_resultp);
+ break;
+
+ case LST_GROUP_RMND:
+ if (args->lstio_grp_count <= 0 ||
+ args->lstio_grp_idsp == NULL) {
+ rc = -EINVAL;
+ break;
+ }
+ rc = lstcon_nodes_remove(name, args->lstio_grp_count,
+ args->lstio_grp_idsp,
+ args->lstio_grp_resultp);
+ break;
+
+ default:
+ rc = -EINVAL;
+ break;
+ }
+
+ LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+ return rc;
+}
+
+static int
+lst_nodes_add_ioctl(struct
lstio_group_nodes_args *args) +{ + unsigned int feats; + int rc; + char *name; + + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_idsp == NULL || /* array of ids */ + args->lstio_grp_count <= 0 || + args->lstio_grp_resultp == NULL || + args->lstio_grp_featp == NULL || + args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_grp_namep, + args->lstio_grp_nmlen)) { + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + + return -EFAULT; + } + + name[args->lstio_grp_nmlen] = 0; + + rc = lstcon_nodes_add(name, args->lstio_grp_count, + args->lstio_grp_idsp, &feats, + args->lstio_grp_resultp); + + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + if (rc == 0 && + copy_to_user(args->lstio_grp_featp, &feats, sizeof(feats))) { + return -EINVAL; + } + + return rc; +} + +static int +lst_group_list_ioctl(struct lstio_group_list_args *args) +{ + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_idx < 0 || + args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + return lstcon_group_list(args->lstio_grp_idx, + args->lstio_grp_nmlen, + args->lstio_grp_namep); +} + +static int +lst_group_info_ioctl(struct lstio_group_info_args *args) +{ + char *name; + int ndent; + int index; + int rc; + + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + if (args->lstio_grp_entp == NULL && /* output: group entry */ + args->lstio_grp_dentsp == NULL) /* output: node entry */ + return -EINVAL; + + if (args->lstio_grp_dentsp != NULL) { /* have node entry */ + if (args->lstio_grp_idxp == NULL || /* node index */ + args->lstio_grp_ndentp == NULL) /* # of node entry */ + return -EINVAL; + + if (copy_from_user(&ndent, args->lstio_grp_ndentp, + sizeof(ndent)) || + copy_from_user(&index, args->lstio_grp_idxp, + sizeof(index))) + return -EFAULT; + + if (ndent <= 0 || index < 0) + return -EINVAL; + } + + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_grp_namep, + args->lstio_grp_nmlen)) { + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_grp_nmlen] = 0; + + rc = lstcon_group_info(name, args->lstio_grp_entp, + &index, &ndent, args->lstio_grp_dentsp); + + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + + if (rc != 0) + return rc; + + if (args->lstio_grp_dentsp != NULL && + (copy_to_user(args->lstio_grp_idxp, &index, sizeof(index)) || + copy_to_user(args->lstio_grp_ndentp, &ndent, sizeof(ndent)))) + return -EFAULT; + + return 0; +} + +static int +lst_batch_add_ioctl(struct lstio_batch_add_args *args) +{ + int rc; + char *name; + + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_bat_namep, + args->lstio_bat_nmlen)) { + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_bat_nmlen] = 0; + + rc = 
lstcon_batch_add(name); + + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + + return rc; +} + +static int +lst_batch_run_ioctl(struct lstio_batch_run_args *args) +{ + int rc; + char *name; + + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_bat_namep, + args->lstio_bat_nmlen)) { + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_bat_nmlen] = 0; + + rc = lstcon_batch_run(name, args->lstio_bat_timeout, + args->lstio_bat_resultp); + + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + + return rc; +} + +static int +lst_batch_stop_ioctl(struct lstio_batch_stop_args *args) +{ + int rc; + char *name; + + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_resultp == NULL || + args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_bat_namep, + args->lstio_bat_nmlen)) { + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_bat_nmlen] = 0; + + rc = lstcon_batch_stop(name, args->lstio_bat_force, + args->lstio_bat_resultp); + + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + + return rc; +} + +static int +lst_batch_query_ioctl(struct lstio_batch_query_args *args) +{ + char *name; + int rc; + + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_resultp == NULL || + args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + if (args->lstio_bat_testidx < 0) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_bat_namep, + args->lstio_bat_nmlen)) { + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_bat_nmlen] = 0; + + rc = lstcon_test_batch_query(name, + args->lstio_bat_testidx, + args->lstio_bat_client, + args->lstio_bat_timeout, + args->lstio_bat_resultp); + + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + + return rc; +} + +static int +lst_batch_list_ioctl(struct lstio_batch_list_args *args) +{ + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_idx < 0 || + args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + return lstcon_batch_list(args->lstio_bat_idx, + args->lstio_bat_nmlen, + args->lstio_bat_namep); +} + +static int +lst_batch_info_ioctl(struct lstio_batch_info_args *args) +{ + char *name; + int rc; + int index; + int ndent; + + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_namep == NULL || /* batch name */ + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + if (args->lstio_bat_entp == NULL && /* output: batch entry */ + args->lstio_bat_dentsp == NULL) /* output: node entry */ + return -EINVAL; + + if (args->lstio_bat_dentsp != NULL) { /* have node entry */ + if (args->lstio_bat_idxp == NULL || /* node index */ + args->lstio_bat_ndentp == NULL) /* # of node 
entry */ + return -EINVAL; + + if (copy_from_user(&index, args->lstio_bat_idxp, + sizeof(index)) || + copy_from_user(&ndent, args->lstio_bat_ndentp, + sizeof(ndent))) + return -EFAULT; + + if (ndent <= 0 || index < 0) + return -EINVAL; + } + + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_bat_namep, + args->lstio_bat_nmlen)) { + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_bat_nmlen] = 0; + + rc = lstcon_batch_info(name, + args->lstio_bat_entp, args->lstio_bat_server, + args->lstio_bat_testidx, &index, &ndent, + args->lstio_bat_dentsp); + + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + + if (rc != 0) + return rc; + + if (args->lstio_bat_dentsp != NULL && + (copy_to_user(args->lstio_bat_idxp, &index, sizeof(index)) || + copy_to_user(args->lstio_bat_ndentp, &ndent, sizeof(ndent)))) + rc = -EFAULT; + + return rc; +} + +static int +lst_stat_query_ioctl(struct lstio_stat_args *args) +{ + int rc; + char *name = NULL; + + /* TODO: not finished */ + if (args->lstio_sta_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_sta_resultp == NULL) + return -EINVAL; + + if (args->lstio_sta_idsp != NULL) { + if (args->lstio_sta_count <= 0) + return -EINVAL; + + rc = lstcon_nodes_stat(args->lstio_sta_count, + args->lstio_sta_idsp, + args->lstio_sta_timeout, + args->lstio_sta_resultp); + } else if (args->lstio_sta_namep != NULL) { + if (args->lstio_sta_nmlen <= 0 || + args->lstio_sta_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_sta_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + rc = copy_from_user(name, args->lstio_sta_namep, + args->lstio_sta_nmlen); + if (rc == 0) + rc = lstcon_group_stat(name, args->lstio_sta_timeout, + args->lstio_sta_resultp); + else + rc = -EFAULT; + + } else { + rc = -EINVAL; + } + + if (name != NULL) + LIBCFS_FREE(name, args->lstio_sta_nmlen + 1); + return rc; +} + +static int lst_test_add_ioctl(struct lstio_test_args *args) +{ + char *batch_name; + char *src_name = NULL; + char *dst_name = NULL; + void *param = NULL; + int ret = 0; + int rc = -ENOMEM; + + if (args->lstio_tes_resultp == NULL || + args->lstio_tes_retp == NULL || + args->lstio_tes_bat_name == NULL || /* no specified batch */ + args->lstio_tes_bat_nmlen <= 0 || + args->lstio_tes_bat_nmlen > LST_NAME_SIZE || + args->lstio_tes_sgrp_name == NULL || /* no source group */ + args->lstio_tes_sgrp_nmlen <= 0 || + args->lstio_tes_sgrp_nmlen > LST_NAME_SIZE || + args->lstio_tes_dgrp_name == NULL || /* no target group */ + args->lstio_tes_dgrp_nmlen <= 0 || + args->lstio_tes_dgrp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + if (args->lstio_tes_loop == 0 || /* negative is infinite */ + args->lstio_tes_concur <= 0 || + args->lstio_tes_dist <= 0 || + args->lstio_tes_span <= 0) + return -EINVAL; + + /* have parameter, check if parameter length is valid */ + if (args->lstio_tes_param != NULL && + (args->lstio_tes_param_len <= 0 || + args->lstio_tes_param_len > + PAGE_SIZE - sizeof(struct lstcon_test))) + return -EINVAL; + + LIBCFS_ALLOC(batch_name, args->lstio_tes_bat_nmlen + 1); + if (batch_name == NULL) + return rc; + + LIBCFS_ALLOC(src_name, args->lstio_tes_sgrp_nmlen + 1); + if (src_name == NULL) + goto out; + + LIBCFS_ALLOC(dst_name, args->lstio_tes_dgrp_nmlen + 1); + if (dst_name == NULL) + goto out; + + if (args->lstio_tes_param != NULL) { + LIBCFS_ALLOC(param, args->lstio_tes_param_len); + if (param == NULL) + goto out; + if (copy_from_user(param, 
args->lstio_tes_param, + args->lstio_tes_param_len)) { + rc = -EFAULT; + goto out; + } + } + + rc = -EFAULT; + if (copy_from_user(batch_name, args->lstio_tes_bat_name, + args->lstio_tes_bat_nmlen) || + copy_from_user(src_name, args->lstio_tes_sgrp_name, + args->lstio_tes_sgrp_nmlen) || + copy_from_user(dst_name, args->lstio_tes_dgrp_name, + args->lstio_tes_dgrp_nmlen)) + goto out; + + rc = lstcon_test_add(batch_name, + args->lstio_tes_type, + args->lstio_tes_loop, + args->lstio_tes_concur, + args->lstio_tes_dist, args->lstio_tes_span, + src_name, dst_name, param, + args->lstio_tes_param_len, + &ret, args->lstio_tes_resultp); + + if (ret != 0) + rc = (copy_to_user(args->lstio_tes_retp, &ret, + sizeof(ret))) ? -EFAULT : 0; +out: + if (batch_name != NULL) + LIBCFS_FREE(batch_name, args->lstio_tes_bat_nmlen + 1); + + if (src_name != NULL) + LIBCFS_FREE(src_name, args->lstio_tes_sgrp_nmlen + 1); + + if (dst_name != NULL) + LIBCFS_FREE(dst_name, args->lstio_tes_dgrp_nmlen + 1); + + if (param != NULL) + LIBCFS_FREE(param, args->lstio_tes_param_len); + + return rc; +} + +int +lstcon_ioctl_entry(struct notifier_block *nb, + unsigned long cmd, void *vdata) +{ + struct libcfs_ioctl_hdr *hdr = vdata; + struct libcfs_ioctl_data *data; + char *buf = NULL; + int rc = -EINVAL; + int opc; + + if (cmd != IOC_LIBCFS_LNETST) + goto err; + + data = container_of(hdr, struct libcfs_ioctl_data, ioc_hdr); + + opc = data->ioc_u32[0]; + + if (data->ioc_plen1 > PAGE_SIZE) + goto err; + + LIBCFS_ALLOC(buf, data->ioc_plen1); + if (buf == NULL) { + rc = -ENOMEM; + goto err; + } + + /* copy in parameter */ + if (copy_from_user(buf, data->ioc_pbuf1, data->ioc_plen1)) { + rc = -EFAULT; + goto out_free_buf; + } + + mutex_lock(&console_session.ses_mutex); + + console_session.ses_laststamp = ktime_get_real_seconds(); + + if (console_session.ses_shutdown) { + rc = -ESHUTDOWN; + goto out; + } + + if (console_session.ses_expired) + lstcon_session_end(); + + if (opc != LSTIO_SESSION_NEW && + console_session.ses_state == LST_SESSION_NONE) { + CDEBUG(D_NET, "LST no active session\n"); + rc = -ESRCH; + goto out; + } + + memset(&console_session.ses_trans_stat, 0, + sizeof(struct lstcon_trans_stat)); + + switch (opc) { + case LSTIO_SESSION_NEW: + rc = lst_session_new_ioctl((struct lstio_session_new_args *)buf); + break; + case LSTIO_SESSION_END: + rc = lst_session_end_ioctl((struct lstio_session_end_args *)buf); + break; + case LSTIO_SESSION_INFO: + rc = lst_session_info_ioctl((struct lstio_session_info_args *)buf); + break; + case LSTIO_DEBUG: + rc = lst_debug_ioctl((struct lstio_debug_args *)buf); + break; + case LSTIO_GROUP_ADD: + rc = lst_group_add_ioctl((struct lstio_group_add_args *)buf); + break; + case LSTIO_GROUP_DEL: + rc = lst_group_del_ioctl((struct lstio_group_del_args *)buf); + break; + case LSTIO_GROUP_UPDATE: + rc = lst_group_update_ioctl((struct lstio_group_update_args *)buf); + break; + case LSTIO_NODES_ADD: + rc = lst_nodes_add_ioctl((struct lstio_group_nodes_args *)buf); + break; + case LSTIO_GROUP_LIST: + rc = lst_group_list_ioctl((struct lstio_group_list_args *)buf); + break; + case LSTIO_GROUP_INFO: + rc = lst_group_info_ioctl((struct lstio_group_info_args *)buf); + break; + case LSTIO_BATCH_ADD: + rc = lst_batch_add_ioctl((struct lstio_batch_add_args *)buf); + break; + case LSTIO_BATCH_START: + rc = lst_batch_run_ioctl((struct lstio_batch_run_args *)buf); + break; + case LSTIO_BATCH_STOP: + rc = lst_batch_stop_ioctl((struct lstio_batch_stop_args *)buf); + break; + case LSTIO_BATCH_QUERY: + rc = 
lst_batch_query_ioctl((struct lstio_batch_query_args *)buf); + break; + case LSTIO_BATCH_LIST: + rc = lst_batch_list_ioctl((struct lstio_batch_list_args *)buf); + break; + case LSTIO_BATCH_INFO: + rc = lst_batch_info_ioctl((struct lstio_batch_info_args *)buf); + break; + case LSTIO_TEST_ADD: + rc = lst_test_add_ioctl((struct lstio_test_args *)buf); + break; + case LSTIO_STAT_QUERY: + rc = lst_stat_query_ioctl((struct lstio_stat_args *)buf); + break; + default: + rc = -EINVAL; + goto out; + } + + if (copy_to_user(data->ioc_pbuf2, &console_session.ses_trans_stat, + sizeof(struct lstcon_trans_stat))) + rc = -EFAULT; +out: + mutex_unlock(&console_session.ses_mutex); +out_free_buf: + LIBCFS_FREE(buf, data->ioc_plen1); +err: + return notifier_from_ioctl_errno(rc); +} diff --git a/drivers/staging/lustrefsx/lnet/selftest/conrpc.c b/drivers/staging/lustrefsx/lnet/selftest/conrpc.c new file mode 100644 index 0000000000000..d2147e6bb8b44 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/conrpc.c @@ -0,0 +1,1398 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. 
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lnet/selftest/conrpc.c
+ *
+ * Console framework rpcs
+ *
+ * Author: Liang Zhen
+ */
+
+
+#include <libcfs/libcfs.h>
+#include <lnet/lib-lnet.h>
+#include "timer.h"
+#include "conrpc.h"
+#include "console.h"
+
+void lstcon_rpc_stat_reply(struct lstcon_rpc_trans *, struct srpc_msg *,
+ struct lstcon_node *, struct lstcon_trans_stat *);
+
+static void
+lstcon_rpc_done(struct srpc_client_rpc *rpc)
+{
+ struct lstcon_rpc *crpc = rpc->crpc_priv;
+
+ LASSERT(crpc != NULL && rpc == crpc->crp_rpc);
+ LASSERT(crpc->crp_posted && !crpc->crp_finished);
+
+ spin_lock(&rpc->crpc_lock);
+
+ if (crpc->crp_trans == NULL) {
+ /* Orphan RPC is not in any transaction,
+ * I'm just a poor body and nobody loves me */
+ spin_unlock(&rpc->crpc_lock);
+
+ /* release it */
+ lstcon_rpc_put(crpc);
+ return;
+ }
+
+ /* not an orphan RPC */
+ crpc->crp_finished = 1;
+
+ if (crpc->crp_stamp_ns == 0) {
+ /* not aborted */
+ LASSERT(crpc->crp_status == 0);
+
+ crpc->crp_stamp_ns = ktime_get_ns();
+ crpc->crp_status = rpc->crpc_status;
+ }
+
+ /* wake up the (transaction) thread if I'm the last RPC in the transaction */
+ if (atomic_dec_and_test(&crpc->crp_trans->tas_remaining))
+ wake_up(&crpc->crp_trans->tas_waitq);
+
+ spin_unlock(&rpc->crpc_lock);
+}
+
+static int
+lstcon_rpc_init(struct lstcon_node *nd, int service, unsigned int feats,
+ int bulk_npg, int bulk_len, int embedded,
+ struct lstcon_rpc *crpc)
+{
+ memset(crpc, 0, sizeof(*crpc));
+
+ crpc->crp_rpc = sfw_create_rpc(nd->nd_id, service,
+ feats, bulk_npg, bulk_len,
+ lstcon_rpc_done, (void *)crpc);
+ if (crpc->crp_rpc == NULL)
+ return -ENOMEM;
+
+ crpc->crp_node = nd;
+ crpc->crp_embedded = embedded;
+ INIT_LIST_HEAD(&crpc->crp_link);
+
+ atomic_inc(&console_session.ses_rpc_counter);
+
+ return 0;
+}
+
+static int
+lstcon_rpc_prep(struct lstcon_node *nd, int service, unsigned int feats,
+ int bulk_npg, int bulk_len, struct lstcon_rpc **crpcpp)
+{
+ struct lstcon_rpc *crpc = NULL;
+ int rc;
+
+ spin_lock(&console_session.ses_rpc_lock);
+
+ if (!list_empty(&console_session.ses_rpc_freelist)) {
+ crpc = list_entry(console_session.ses_rpc_freelist.next,
+ struct lstcon_rpc, crp_link);
+ list_del_init(&crpc->crp_link);
+ }
+
+ spin_unlock(&console_session.ses_rpc_lock);
+
+ if (crpc == NULL) {
+ LIBCFS_ALLOC(crpc, sizeof(*crpc));
+ if (crpc == NULL)
+ return -ENOMEM;
+ }
+
+ rc = lstcon_rpc_init(nd, service, feats, bulk_npg, bulk_len, 0, crpc);
+ if (rc == 0) {
+ *crpcpp = crpc;
+ return 0;
+ }
+
+ LIBCFS_FREE(crpc, sizeof(*crpc));
+
+ return rc;
+}
+
+void
+lstcon_rpc_put(struct lstcon_rpc *crpc)
+{
+ struct srpc_bulk *bulk = &crpc->crp_rpc->crpc_bulk;
+ int i;
+
+ LASSERT(list_empty(&crpc->crp_link));
+
+ for (i = 0; i < bulk->bk_niov; i++) {
+ if (bulk->bk_iovs[i].bv_page == NULL)
+ continue;
+
+ __free_page(bulk->bk_iovs[i].bv_page);
+ }
+
+ srpc_client_rpc_decref(crpc->crp_rpc);
+
+ if (crpc->crp_embedded) {
+ /* embedded RPC, don't recycle it */
+ memset(crpc, 0, sizeof(*crpc));
+ crpc->crp_embedded = 1;
+
+ } else {
+ spin_lock(&console_session.ses_rpc_lock);
+
+ list_add(&crpc->crp_link,
+ &console_session.ses_rpc_freelist);
+
+ spin_unlock(&console_session.ses_rpc_lock);
+ }
+
+ /* RPC is not alive now */
+ atomic_dec(&console_session.ses_rpc_counter);
+}
+
+static void
+lstcon_rpc_post(struct lstcon_rpc *crpc)
+{
+ struct lstcon_rpc_trans *trans = crpc->crp_trans;
+
+ LASSERT(trans != NULL);
+
+ atomic_inc(&trans->tas_remaining);
+ crpc->crp_posted = 1;
+
+ sfw_post_rpc(crpc->crp_rpc);
+}
+
+static char *
+lstcon_rpc_trans_name(int transop) +{ + if (transop == LST_TRANS_SESNEW) + return "SESNEW"; + + if (transop == LST_TRANS_SESEND) + return "SESEND"; + + if (transop == LST_TRANS_SESQRY) + return "SESQRY"; + + if (transop == LST_TRANS_SESPING) + return "SESPING"; + + if (transop == LST_TRANS_TSBCLIADD) + return "TSBCLIADD"; + + if (transop == LST_TRANS_TSBSRVADD) + return "TSBSRVADD"; + + if (transop == LST_TRANS_TSBRUN) + return "TSBRUN"; + + if (transop == LST_TRANS_TSBSTOP) + return "TSBSTOP"; + + if (transop == LST_TRANS_TSBCLIQRY) + return "TSBCLIQRY"; + + if (transop == LST_TRANS_TSBSRVQRY) + return "TSBSRVQRY"; + + if (transop == LST_TRANS_STATQRY) + return "STATQRY"; + + return "Unknown"; +} + +int +lstcon_rpc_trans_prep(struct list_head *translist, int transop, + struct lstcon_rpc_trans **transpp) +{ + struct lstcon_rpc_trans *trans; + + if (translist != NULL) { + list_for_each_entry(trans, translist, tas_link) { + /* Can't enqueue two private transaction on + * the same object */ + if ((trans->tas_opc & transop) == LST_TRANS_PRIVATE) + return -EPERM; + } + } + + /* create a trans group */ + LIBCFS_ALLOC(trans, sizeof(*trans)); + if (trans == NULL) + return -ENOMEM; + + trans->tas_opc = transop; + + if (translist == NULL) + INIT_LIST_HEAD(&trans->tas_olink); + else + list_add_tail(&trans->tas_olink, translist); + + list_add_tail(&trans->tas_link, &console_session.ses_trans_list); + + INIT_LIST_HEAD(&trans->tas_rpcs_list); + atomic_set(&trans->tas_remaining, 0); + init_waitqueue_head(&trans->tas_waitq); + + spin_lock(&console_session.ses_rpc_lock); + trans->tas_features = console_session.ses_features; + spin_unlock(&console_session.ses_rpc_lock); + + *transpp = trans; + return 0; +} + +void +lstcon_rpc_trans_addreq(struct lstcon_rpc_trans *trans, struct lstcon_rpc *crpc) +{ + list_add_tail(&crpc->crp_link, &trans->tas_rpcs_list); + crpc->crp_trans = trans; +} + +void +lstcon_rpc_trans_abort(struct lstcon_rpc_trans *trans, int error) +{ + struct srpc_client_rpc *rpc; + struct lstcon_rpc *crpc; + struct lstcon_node *nd; + + list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { + rpc = crpc->crp_rpc; + + spin_lock(&rpc->crpc_lock); + + if (!crpc->crp_posted || /* not posted */ + crpc->crp_stamp_ns != 0) { /* rpc done or aborted already */ + if (crpc->crp_stamp_ns == 0) { + crpc->crp_stamp_ns = ktime_get_ns(); + crpc->crp_status = -EINTR; + } + spin_unlock(&rpc->crpc_lock); + continue; + } + + crpc->crp_stamp_ns = ktime_get_ns(); + crpc->crp_status = error; + + spin_unlock(&rpc->crpc_lock); + + sfw_abort_rpc(rpc); + + if (error != -ETIMEDOUT) + continue; + + nd = crpc->crp_node; + if (ktime_to_ns(nd->nd_stamp) > crpc->crp_stamp_ns) + continue; + + nd->nd_stamp = ktime_set(0, crpc->crp_stamp_ns); + nd->nd_state = LST_NODE_DOWN; + } +} + +static int +lstcon_rpc_trans_check(struct lstcon_rpc_trans *trans) +{ + if (console_session.ses_shutdown && + !list_empty(&trans->tas_olink)) /* Not an end session RPC */ + return 1; + + return (atomic_read(&trans->tas_remaining) == 0) ? 
1: 0; +} + +int +lstcon_rpc_trans_postwait(struct lstcon_rpc_trans *trans, int timeout) +{ + struct lstcon_rpc *crpc; + int rc; + + if (list_empty(&trans->tas_rpcs_list)) + return 0; + + if (timeout < LST_TRANS_MIN_TIMEOUT) + timeout = LST_TRANS_MIN_TIMEOUT; + + CDEBUG(D_NET, "Transaction %s started\n", + lstcon_rpc_trans_name(trans->tas_opc)); + + /* post all requests */ + list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { + LASSERT(!crpc->crp_posted); + + lstcon_rpc_post(crpc); + } + + mutex_unlock(&console_session.ses_mutex); + + rc = wait_event_interruptible_timeout(trans->tas_waitq, + lstcon_rpc_trans_check(trans), + cfs_time_seconds(timeout)); + + rc = (rc > 0)? 0: ((rc < 0)? -EINTR: -ETIMEDOUT); + + mutex_lock(&console_session.ses_mutex); + + if (console_session.ses_shutdown) + rc = -ESHUTDOWN; + + if (rc != 0 || atomic_read(&trans->tas_remaining) != 0) { + /* treat short timeout as canceled */ + if (rc == -ETIMEDOUT && timeout < LST_TRANS_MIN_TIMEOUT * 2) + rc = -EINTR; + + lstcon_rpc_trans_abort(trans, rc); + } + + CDEBUG(D_NET, "Transaction %s stopped: %d\n", + lstcon_rpc_trans_name(trans->tas_opc), rc); + + lstcon_rpc_trans_stat(trans, lstcon_trans_stat()); + + return rc; +} + +static int +lstcon_rpc_get_reply(struct lstcon_rpc *crpc, struct srpc_msg **msgpp) +{ + struct lstcon_node *nd = crpc->crp_node; + struct srpc_client_rpc *rpc = crpc->crp_rpc; + struct srpc_generic_reply *rep; + + LASSERT(nd != NULL && rpc != NULL); + LASSERT(crpc->crp_stamp_ns != 0); + + if (crpc->crp_status != 0) { + *msgpp = NULL; + return crpc->crp_status; + } + + *msgpp = &rpc->crpc_replymsg; + if (!crpc->crp_unpacked) { + sfw_unpack_message(*msgpp); + crpc->crp_unpacked = 1; + } + + if (ktime_to_ns(nd->nd_stamp) > crpc->crp_stamp_ns) + return 0; + + nd->nd_stamp = ktime_set(0, crpc->crp_stamp_ns); + rep = &(*msgpp)->msg_body.reply; + + if (rep->sid.ses_nid == LNET_NID_ANY) + nd->nd_state = LST_NODE_UNKNOWN; + else if (lstcon_session_match(rep->sid)) + nd->nd_state = LST_NODE_ACTIVE; + else + nd->nd_state = LST_NODE_BUSY; + + return 0; +} + +void +lstcon_rpc_trans_stat(struct lstcon_rpc_trans *trans, + struct lstcon_trans_stat *stat) +{ + struct lstcon_rpc *crpc; + struct srpc_msg *rep; + int error; + + LASSERT(stat != NULL); + + memset(stat, 0, sizeof(*stat)); + + list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { + lstcon_rpc_stat_total(stat, 1); + + LASSERT(crpc->crp_stamp_ns != 0); + + error = lstcon_rpc_get_reply(crpc, &rep); + if (error != 0) { + lstcon_rpc_stat_failure(stat, 1); + if (stat->trs_rpc_errno == 0) + stat->trs_rpc_errno = -error; + + continue; + } + + lstcon_rpc_stat_success(stat, 1); + + lstcon_rpc_stat_reply(trans, rep, crpc->crp_node, stat); + } + + if (trans->tas_opc == LST_TRANS_SESNEW && stat->trs_fwk_errno == 0) { + stat->trs_fwk_errno = + lstcon_session_feats_check(trans->tas_features); + } + + CDEBUG(D_NET, "transaction %s : success %d, failure %d, total %d, " + "RPC error(%d), Framework error(%d)\n", + lstcon_rpc_trans_name(trans->tas_opc), + lstcon_rpc_stat_success(stat, 0), + lstcon_rpc_stat_failure(stat, 0), + lstcon_rpc_stat_total(stat, 0), + stat->trs_rpc_errno, stat->trs_fwk_errno); +} + +int +lstcon_rpc_trans_interpreter(struct lstcon_rpc_trans *trans, + struct list_head __user *head_up, + lstcon_rpc_readent_func_t readent) +{ + struct list_head tmp; + struct list_head __user *next; + struct lstcon_rpc_ent *ent; + struct srpc_generic_reply *rep; + struct lstcon_rpc *crpc; + struct srpc_msg *msg; + struct lstcon_node *nd; + struct timespec64 ts; 
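+ /* ts: RPC completion time reported to user space, relative to
+ * the session start stamp (ses_stamp is in milliseconds) */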
+ int error; + s64 dur; + + LASSERT(head_up != NULL); + + next = head_up; + + list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { + if (copy_from_user(&tmp, next, + sizeof(struct list_head))) + return -EFAULT; + + if (tmp.next == head_up) + return 0; + + next = tmp.next; + + ent = list_entry(next, struct lstcon_rpc_ent, rpe_link); + + LASSERT(crpc->crp_stamp_ns != 0); + + error = lstcon_rpc_get_reply(crpc, &msg); + + nd = crpc->crp_node; + + dur = crpc->crp_stamp_ns - + console_session.ses_id.ses_stamp * NSEC_PER_MSEC; + ts = ns_to_timespec64(dur); + + if (copy_to_user(&ent->rpe_peer, + &nd->nd_id, sizeof(struct lnet_process_id)) || + copy_to_user(&ent->rpe_stamp, &ts, sizeof(ts)) || + copy_to_user(&ent->rpe_state, + &nd->nd_state, sizeof(nd->nd_state)) || + copy_to_user(&ent->rpe_rpc_errno, &error, + sizeof(error))) + return -EFAULT; + + if (error != 0) + continue; + + /* RPC is done */ + rep = (struct srpc_generic_reply *)&msg->msg_body.reply; + + if (copy_to_user(&ent->rpe_sid, + &rep->sid, sizeof(rep->sid)) || + copy_to_user(&ent->rpe_fwk_errno, + &rep->status, sizeof(rep->status))) + return -EFAULT; + + if (readent == NULL) + continue; + + error = readent(trans->tas_opc, msg, ent); + if (error != 0) + return error; + } + + return 0; +} + +void +lstcon_rpc_trans_destroy(struct lstcon_rpc_trans *trans) +{ + struct srpc_client_rpc *rpc; + struct lstcon_rpc *crpc; + struct lstcon_rpc *tmp; + int count = 0; + + list_for_each_entry_safe(crpc, tmp, &trans->tas_rpcs_list, crp_link) { + rpc = crpc->crp_rpc; + + spin_lock(&rpc->crpc_lock); + + /* free it if not posted or finished already */ + if (!crpc->crp_posted || crpc->crp_finished) { + spin_unlock(&rpc->crpc_lock); + + list_del_init(&crpc->crp_link); + lstcon_rpc_put(crpc); + + continue; + } + + /* rpcs can be still not callbacked (even LNetMDUnlink is + * called) because huge timeout for inaccessible network, + * don't make user wait for them, just abandon them, they + * will be recycled in callback */ + + LASSERT(crpc->crp_status != 0); + + crpc->crp_node = NULL; + crpc->crp_trans = NULL; + list_del_init(&crpc->crp_link); + count++; + + spin_unlock(&rpc->crpc_lock); + + atomic_dec(&trans->tas_remaining); + } + + LASSERT(atomic_read(&trans->tas_remaining) == 0); + + list_del(&trans->tas_link); + if (!list_empty(&trans->tas_olink)) + list_del(&trans->tas_olink); + + CDEBUG(D_NET, "Transaction %s destroyed with %d pending RPCs\n", + lstcon_rpc_trans_name(trans->tas_opc), count); + + LIBCFS_FREE(trans, sizeof(*trans)); +} + +int +lstcon_sesrpc_prep(struct lstcon_node *nd, int transop, + unsigned int feats, struct lstcon_rpc **crpc) +{ + struct srpc_mksn_reqst *msrq; + struct srpc_rmsn_reqst *rsrq; + int rc; + + switch (transop) { + case LST_TRANS_SESNEW: + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_MAKE_SESSION, + feats, 0, 0, crpc); + if (rc != 0) + return rc; + + msrq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.mksn_reqst; + msrq->mksn_sid = console_session.ses_id; + msrq->mksn_force = console_session.ses_force; + strlcpy(msrq->mksn_name, console_session.ses_name, + sizeof(msrq->mksn_name)); + break; + + case LST_TRANS_SESEND: + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_REMOVE_SESSION, + feats, 0, 0, crpc); + if (rc != 0) + return rc; + + rsrq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.rmsn_reqst; + rsrq->rmsn_sid = console_session.ses_id; + break; + + default: + LBUG(); + } + + return 0; +} + +int +lstcon_dbgrpc_prep(struct lstcon_node *nd, unsigned int feats, + struct lstcon_rpc **crpc) +{ + struct srpc_debug_reqst *drq; + int rc; + + 
rc = lstcon_rpc_prep(nd, SRPC_SERVICE_DEBUG, feats, 0, 0, crpc); + if (rc != 0) + return rc; + + drq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.dbg_reqst; + + drq->dbg_sid = console_session.ses_id; + drq->dbg_flags = 0; + + return rc; +} + +int +lstcon_batrpc_prep(struct lstcon_node *nd, int transop, unsigned int feats, + struct lstcon_tsb_hdr *tsb, struct lstcon_rpc **crpc) +{ + struct lstcon_batch *batch; + struct srpc_batch_reqst *brq; + int rc; + + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_BATCH, feats, 0, 0, crpc); + if (rc != 0) + return rc; + + brq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.bat_reqst; + + brq->bar_sid = console_session.ses_id; + brq->bar_bid = tsb->tsb_id; + brq->bar_testidx = tsb->tsb_index; + brq->bar_opc = transop == LST_TRANS_TSBRUN ? SRPC_BATCH_OPC_RUN : + (transop == LST_TRANS_TSBSTOP ? SRPC_BATCH_OPC_STOP: + SRPC_BATCH_OPC_QUERY); + + if (transop != LST_TRANS_TSBRUN && + transop != LST_TRANS_TSBSTOP) + return 0; + + LASSERT (tsb->tsb_index == 0); + + batch = (struct lstcon_batch *)tsb; + brq->bar_arg = batch->bat_arg; + + return 0; +} + +int +lstcon_statrpc_prep(struct lstcon_node *nd, unsigned int feats, + struct lstcon_rpc **crpc) +{ + struct srpc_stat_reqst *srq; + int rc; + + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_QUERY_STAT, feats, 0, 0, crpc); + if (rc != 0) + return rc; + + srq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.stat_reqst; + + srq->str_sid = console_session.ses_id; + srq->str_type = 0; /* XXX remove it */ + + return 0; +} + +static struct lnet_process_id_packed * +lstcon_next_id(int idx, int nkiov, struct bio_vec *kiov) +{ + struct lnet_process_id_packed *pid; + int i; + + i = idx / SFW_ID_PER_PAGE; + + LASSERT (i < nkiov); + + pid = (struct lnet_process_id_packed *)page_address(kiov[i].bv_page); + + return &pid[idx % SFW_ID_PER_PAGE]; +} + +static int +lstcon_dstnodes_prep(struct lstcon_group *grp, int idx, + int dist, int span, int nkiov, struct bio_vec *kiov) +{ + struct lnet_process_id_packed *pid; + struct lstcon_ndlink *ndl; + struct lstcon_node *nd; + int start; + int end; + int i = 0; + + LASSERT (dist >= 1); + LASSERT (span >= 1); + LASSERT (grp->grp_nnode >= 1); + + if (span > grp->grp_nnode) + return -EINVAL; + + start = ((idx / dist) * span) % grp->grp_nnode; + end = ((idx / dist) * span + span - 1) % grp->grp_nnode; + + list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) { + nd = ndl->ndl_node; + if (i < start) { + i++; + continue; + } + + if (i > (end >= start ? 
end : grp->grp_nnode)) + break; + + pid = lstcon_next_id((i - start), nkiov, kiov); + pid->nid = nd->nd_id.nid; + pid->pid = nd->nd_id.pid; + i++; + } + + if (start <= end) /* done */ + return 0; + + list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) { + if (i > grp->grp_nnode + end) + break; + + nd = ndl->ndl_node; + pid = lstcon_next_id((i - start), nkiov, kiov); + pid->nid = nd->nd_id.nid; + pid->pid = nd->nd_id.pid; + i++; + } + + return 0; +} + +static int +lstcon_pingrpc_prep(struct lst_test_ping_param *param, + struct srpc_test_reqst *req) +{ + struct test_ping_req *prq = &req->tsr_u.ping; + + prq->png_size = param->png_size; + prq->png_flags = param->png_flags; + /* TODO dest */ + return 0; +} + +static int +lstcon_bulkrpc_v0_prep(struct lst_test_bulk_param *param, + struct srpc_test_reqst *req) +{ + struct test_bulk_req *brq = &req->tsr_u.bulk_v0; + + brq->blk_opc = param->blk_opc; + brq->blk_npg = (param->blk_size + PAGE_SIZE - 1) / + PAGE_SIZE; + brq->blk_flags = param->blk_flags; + + return 0; +} + +static int +lstcon_bulkrpc_v1_prep(struct lst_test_bulk_param *param, bool is_client, + struct srpc_test_reqst *req) +{ + struct test_bulk_req_v1 *brq = &req->tsr_u.bulk_v1; + + brq->blk_opc = param->blk_opc; + brq->blk_flags = param->blk_flags; + brq->blk_len = param->blk_size; + brq->blk_offset = is_client ? param->blk_cli_off : param->blk_srv_off; + + return 0; +} + +int +lstcon_testrpc_prep(struct lstcon_node *nd, int transop, unsigned int feats, + struct lstcon_test *test, struct lstcon_rpc **crpc) +{ + struct lstcon_group *sgrp = test->tes_src_grp; + struct lstcon_group *dgrp = test->tes_dst_grp; + struct srpc_test_reqst *trq; + struct srpc_bulk *bulk; + int i; + int npg = 0; + int nob = 0; + int rc = 0; + + if (transop == LST_TRANS_TSBCLIADD) { + npg = sfw_id_pages(test->tes_span); + nob = (feats & LST_FEAT_BULK_LEN) == 0 ? + npg * PAGE_SIZE : + sizeof(struct lnet_process_id_packed) * test->tes_span; + } + + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_TEST, feats, npg, nob, crpc); + if (rc != 0) + return rc; + + trq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.tes_reqst; + + if (transop == LST_TRANS_TSBSRVADD) { + int ndist = (sgrp->grp_nnode + test->tes_dist - 1) / test->tes_dist; + int nspan = (dgrp->grp_nnode + test->tes_span - 1) / test->tes_span; + int nmax = (ndist + nspan - 1) / nspan; + + trq->tsr_ndest = 0; + trq->tsr_loop = nmax * test->tes_dist * test->tes_concur; + + } else { + bulk = &(*crpc)->crp_rpc->crpc_bulk; + + for (i = 0; i < npg; i++) { + int len; + + LASSERT(nob > 0); + + len = (feats & LST_FEAT_BULK_LEN) == 0 ? + PAGE_SIZE : min_t(int, nob, PAGE_SIZE); + nob -= len; + + bulk->bk_iovs[i].bv_offset = 0; + bulk->bk_iovs[i].bv_len = len; + bulk->bk_iovs[i].bv_page = + alloc_page(GFP_KERNEL); + + if (bulk->bk_iovs[i].bv_page == NULL) { + lstcon_rpc_put(*crpc); + return -ENOMEM; + } + } + + bulk->bk_sink = 0; + + LASSERT (transop == LST_TRANS_TSBCLIADD); + + rc = lstcon_dstnodes_prep(test->tes_dst_grp, + test->tes_cliidx++, + test->tes_dist, + test->tes_span, + npg, &bulk->bk_iovs[0]); + if (rc != 0) { + lstcon_rpc_put(*crpc); + return rc; + } + + trq->tsr_ndest = test->tes_span; + trq->tsr_loop = test->tes_loop; + } + + trq->tsr_sid = console_session.ses_id; + trq->tsr_bid = test->tes_hdr.tsb_id; + trq->tsr_concur = test->tes_concur; + trq->tsr_is_client = (transop == LST_TRANS_TSBCLIADD) ? 
1 : 0; + trq->tsr_stop_onerr = !!test->tes_stop_onerr; + + switch (test->tes_type) { + case LST_TEST_PING: + trq->tsr_service = SRPC_SERVICE_PING; + rc = lstcon_pingrpc_prep((struct lst_test_ping_param *) + &test->tes_param[0], trq); + break; + + case LST_TEST_BULK: + trq->tsr_service = SRPC_SERVICE_BRW; + if ((feats & LST_FEAT_BULK_LEN) == 0) { + rc = lstcon_bulkrpc_v0_prep((struct lst_test_bulk_param *) + &test->tes_param[0], trq); + } else { + rc = lstcon_bulkrpc_v1_prep((struct lst_test_bulk_param *) + &test->tes_param[0], + trq->tsr_is_client, trq); + } + + break; + default: + LBUG(); + break; + } + + return rc; +} + +static int +lstcon_sesnew_stat_reply(struct lstcon_rpc_trans *trans, + struct lstcon_node *nd, struct srpc_msg *reply) +{ + struct srpc_mksn_reply *mksn_rep = &reply->msg_body.mksn_reply; + int status = mksn_rep->mksn_status; + + if (status == 0 && + (reply->msg_ses_feats & ~LST_FEATS_MASK) != 0) { + mksn_rep->mksn_status = EPROTO; + status = EPROTO; + } + + if (status == EPROTO) { + CNETERR("session protocol error from %s: %u\n", + libcfs_nid2str(nd->nd_id.nid), + reply->msg_ses_feats); + } + + if (status != 0) + return status; + + if (!trans->tas_feats_updated) { + spin_lock(&console_session.ses_rpc_lock); + if (!trans->tas_feats_updated) { /* recheck with lock */ + trans->tas_feats_updated = 1; + trans->tas_features = reply->msg_ses_feats; + } + spin_unlock(&console_session.ses_rpc_lock); + } + + if (reply->msg_ses_feats != trans->tas_features) { + CNETERR("Framework features %x from %s is different with " + "features on this transaction: %x\n", + reply->msg_ses_feats, libcfs_nid2str(nd->nd_id.nid), + trans->tas_features); + status = mksn_rep->mksn_status = EPROTO; + } + + if (status == 0) { + /* session timeout on remote node */ + nd->nd_timeout = mksn_rep->mksn_timeout; + } + + return status; +} + +void +lstcon_rpc_stat_reply(struct lstcon_rpc_trans *trans, struct srpc_msg *msg, + struct lstcon_node *nd, struct lstcon_trans_stat *stat) +{ + struct srpc_rmsn_reply *rmsn_rep; + struct srpc_debug_reply *dbg_rep; + struct srpc_batch_reply *bat_rep; + struct srpc_test_reply *test_rep; + struct srpc_stat_reply *stat_rep; + int rc = 0; + + switch (trans->tas_opc) { + case LST_TRANS_SESNEW: + rc = lstcon_sesnew_stat_reply(trans, nd, msg); + if (rc == 0) { + lstcon_sesop_stat_success(stat, 1); + return; + } + + lstcon_sesop_stat_failure(stat, 1); + break; + + case LST_TRANS_SESEND: + rmsn_rep = &msg->msg_body.rmsn_reply; + /* ESRCH is not an error for end session */ + if (rmsn_rep->rmsn_status == 0 || + rmsn_rep->rmsn_status == ESRCH) { + lstcon_sesop_stat_success(stat, 1); + return; + } + + lstcon_sesop_stat_failure(stat, 1); + rc = rmsn_rep->rmsn_status; + break; + + case LST_TRANS_SESQRY: + case LST_TRANS_SESPING: + dbg_rep = &msg->msg_body.dbg_reply; + + if (dbg_rep->dbg_status == ESRCH) { + lstcon_sesqry_stat_unknown(stat, 1); + return; + } + + if (lstcon_session_match(dbg_rep->dbg_sid)) + lstcon_sesqry_stat_active(stat, 1); + else + lstcon_sesqry_stat_busy(stat, 1); + return; + + case LST_TRANS_TSBRUN: + case LST_TRANS_TSBSTOP: + bat_rep = &msg->msg_body.bat_reply; + + if (bat_rep->bar_status == 0) { + lstcon_tsbop_stat_success(stat, 1); + return; + } + + if (bat_rep->bar_status == EPERM && + trans->tas_opc == LST_TRANS_TSBSTOP) { + lstcon_tsbop_stat_success(stat, 1); + return; + } + + lstcon_tsbop_stat_failure(stat, 1); + rc = bat_rep->bar_status; + break; + + case LST_TRANS_TSBCLIQRY: + case LST_TRANS_TSBSRVQRY: + bat_rep = &msg->msg_body.bat_reply; + + if 
(bat_rep->bar_active != 0)
+ lstcon_tsbqry_stat_run(stat, 1);
+ else
+ lstcon_tsbqry_stat_idle(stat, 1);
+
+ if (bat_rep->bar_status == 0)
+ return;
+
+ lstcon_tsbqry_stat_failure(stat, 1);
+ rc = bat_rep->bar_status;
+ break;
+
+ case LST_TRANS_TSBCLIADD:
+ case LST_TRANS_TSBSRVADD:
+ test_rep = &msg->msg_body.tes_reply;
+
+ if (test_rep->tsr_status == 0) {
+ lstcon_tsbop_stat_success(stat, 1);
+ return;
+ }
+
+ lstcon_tsbop_stat_failure(stat, 1);
+ rc = test_rep->tsr_status;
+ break;
+
+ case LST_TRANS_STATQRY:
+ stat_rep = &msg->msg_body.stat_reply;
+
+ if (stat_rep->str_status == 0) {
+ lstcon_statqry_stat_success(stat, 1);
+ return;
+ }
+
+ lstcon_statqry_stat_failure(stat, 1);
+ rc = stat_rep->str_status;
+ break;
+
+ default:
+ LBUG();
+ }
+
+ if (stat->trs_fwk_errno == 0)
+ stat->trs_fwk_errno = rc;
+}
+
+int
+lstcon_rpc_trans_ndlist(struct list_head *ndlist,
+ struct list_head *translist, int transop,
+ void *arg, lstcon_rpc_cond_func_t condition,
+ struct lstcon_rpc_trans **transpp)
+{
+ struct lstcon_rpc_trans *trans;
+ struct lstcon_ndlink *ndl;
+ struct lstcon_node *nd;
+ struct lstcon_rpc *rpc;
+ unsigned int feats;
+ int rc;
+
+ /* Creating session RPC for list of nodes */
+
+ rc = lstcon_rpc_trans_prep(translist, transop, &trans);
+ if (rc != 0) {
+ CERROR("Can't create transaction %d: %d\n", transop, rc);
+ return rc;
+ }
+
+ feats = trans->tas_features;
+ list_for_each_entry(ndl, ndlist, ndl_link) {
+ rc = condition == NULL ? 1 :
+ condition(transop, ndl->ndl_node, arg);
+
+ if (rc == 0)
+ continue;
+
+ if (rc < 0) {
+ CDEBUG(D_NET, "Condition error while creating RPC "
+ "for transaction %d: %d\n", transop, rc);
+ break;
+ }
+
+ nd = ndl->ndl_node;
+
+ switch (transop) {
+ case LST_TRANS_SESNEW:
+ case LST_TRANS_SESEND:
+ rc = lstcon_sesrpc_prep(nd, transop, feats, &rpc);
+ break;
+ case LST_TRANS_SESQRY:
+ case LST_TRANS_SESPING:
+ rc = lstcon_dbgrpc_prep(nd, feats, &rpc);
+ break;
+ case LST_TRANS_TSBCLIADD:
+ case LST_TRANS_TSBSRVADD:
+ rc = lstcon_testrpc_prep(nd, transop, feats,
+ (struct lstcon_test *)arg,
+ &rpc);
+ break;
+ case LST_TRANS_TSBRUN:
+ case LST_TRANS_TSBSTOP:
+ case LST_TRANS_TSBCLIQRY:
+ case LST_TRANS_TSBSRVQRY:
+ rc = lstcon_batrpc_prep(nd, transop, feats,
+ (struct lstcon_tsb_hdr *)arg,
+ &rpc);
+ break;
+ case LST_TRANS_STATQRY:
+ rc = lstcon_statrpc_prep(nd, feats, &rpc);
+ break;
+ default:
+ rc = -EINVAL;
+ break;
+ }
+
+ if (rc != 0) {
+ CERROR("Failed to create RPC for transaction %s: %d\n",
+ lstcon_rpc_trans_name(transop), rc);
+ break;
+ }
+
+ lstcon_rpc_trans_addreq(trans, rpc);
+ }
+
+ if (rc == 0) {
+ *transpp = trans;
+ return 0;
+ }
+
+ lstcon_rpc_trans_destroy(trans);
+
+ return rc;
+}
+
+static void
+lstcon_rpc_pinger(void *arg)
+{
+ struct stt_timer *ptimer = arg;
+ struct lstcon_rpc_trans *trans;
+ struct lstcon_rpc *crpc;
+ struct srpc_msg *rep;
+ struct srpc_debug_reqst *drq;
+ struct lstcon_ndlink *ndl;
+ struct lstcon_node *nd;
+ int intv;
+ int count = 0;
+ int rc;
+
+ /* RPC pinger is a special case of transaction,
+ * it is called by the timer at an 8-second interval.
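+ * When the console session has been idle past its timeout, the
+ * pinger marks it expired and sends session-end requests to all
+ * active nodes instead of pinging them.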
+ */ + mutex_lock(&console_session.ses_mutex); + + if (console_session.ses_shutdown || console_session.ses_expired) { + mutex_unlock(&console_session.ses_mutex); + return; + } + + if (!console_session.ses_expired && + ktime_get_real_seconds() - console_session.ses_laststamp > + (time64_t)console_session.ses_timeout) + console_session.ses_expired = 1; + + trans = console_session.ses_ping; + + LASSERT(trans != NULL); + + list_for_each_entry(ndl, &console_session.ses_ndl_list, ndl_link) { + nd = ndl->ndl_node; + + if (console_session.ses_expired) { + /* idle console, end session on all nodes */ + if (nd->nd_state != LST_NODE_ACTIVE) + continue; + + rc = lstcon_sesrpc_prep(nd, LST_TRANS_SESEND, + trans->tas_features, &crpc); + if (rc != 0) { + CERROR("Out of memory\n"); + break; + } + + lstcon_rpc_trans_addreq(trans, crpc); + lstcon_rpc_post(crpc); + + continue; + } + + crpc = &nd->nd_ping; + + if (crpc->crp_rpc != NULL) { + LASSERT(crpc->crp_trans == trans); + LASSERT(!list_empty(&crpc->crp_link)); + + spin_lock(&crpc->crp_rpc->crpc_lock); + + LASSERT(crpc->crp_posted); + + if (!crpc->crp_finished) { + /* in flight */ + spin_unlock(&crpc->crp_rpc->crpc_lock); + continue; + } + + spin_unlock(&crpc->crp_rpc->crpc_lock); + + lstcon_rpc_get_reply(crpc, &rep); + + list_del_init(&crpc->crp_link); + + lstcon_rpc_put(crpc); + } + + if (nd->nd_state != LST_NODE_ACTIVE) + continue; + + intv = div_u64(ktime_ms_delta(ktime_get(), nd->nd_stamp), + MSEC_PER_SEC); + if (intv < nd->nd_timeout / 2) + continue; + + rc = lstcon_rpc_init(nd, SRPC_SERVICE_DEBUG, + trans->tas_features, 0, 0, 1, crpc); + if (rc != 0) { + CERROR("Out of memory\n"); + break; + } + + drq = &crpc->crp_rpc->crpc_reqstmsg.msg_body.dbg_reqst; + + drq->dbg_sid = console_session.ses_id; + drq->dbg_flags = 0; + + lstcon_rpc_trans_addreq(trans, crpc); + lstcon_rpc_post(crpc); + + count++; + } + + if (console_session.ses_expired) { + mutex_unlock(&console_session.ses_mutex); + return; + } + + CDEBUG(D_NET, "Ping %d nodes in session\n", count); + + ptimer->stt_expires = ktime_get_real_seconds() + LST_PING_INTERVAL; + stt_add_timer(ptimer); + + mutex_unlock(&console_session.ses_mutex); +} + +int +lstcon_rpc_pinger_start(void) +{ + struct stt_timer *ptimer; + int rc; + + LASSERT(list_empty(&console_session.ses_rpc_freelist)); + LASSERT(atomic_read(&console_session.ses_rpc_counter) == 0); + + rc = lstcon_rpc_trans_prep(NULL, LST_TRANS_SESPING, + &console_session.ses_ping); + if (rc != 0) { + CERROR("Failed to create console pinger\n"); + return rc; + } + + ptimer = &console_session.ses_ping_timer; + ptimer->stt_expires = ktime_get_real_seconds() + LST_PING_INTERVAL; + + stt_add_timer(ptimer); + + return 0; +} + +void +lstcon_rpc_pinger_stop(void) +{ + LASSERT (console_session.ses_shutdown); + + stt_del_timer(&console_session.ses_ping_timer); + + lstcon_rpc_trans_abort(console_session.ses_ping, -ESHUTDOWN); + lstcon_rpc_trans_stat(console_session.ses_ping, lstcon_trans_stat()); + lstcon_rpc_trans_destroy(console_session.ses_ping); + + memset(lstcon_trans_stat(), 0, sizeof(struct lstcon_trans_stat)); + + console_session.ses_ping = NULL; +} + +void +lstcon_rpc_cleanup_wait(void) +{ + struct lstcon_rpc_trans *trans; + struct lstcon_rpc *crpc; + struct list_head *pacer; + LIST_HEAD(zlist); + + /* Called with hold of global mutex */ + + LASSERT(console_session.ses_shutdown); + + while (!list_empty(&console_session.ses_trans_list)) { + list_for_each(pacer, &console_session.ses_trans_list) { + trans = list_entry(pacer, struct lstcon_rpc_trans, + 
tas_link);
+
+			CDEBUG(D_NET, "Session closed, wakeup transaction %s\n",
+			       lstcon_rpc_trans_name(trans->tas_opc));
+
+			wake_up(&trans->tas_waitq);
+		}
+
+		mutex_unlock(&console_session.ses_mutex);
+
+		CWARN("Session is shutting down, "
+		      "waiting for termination of transactions\n");
+		schedule_timeout_uninterruptible(cfs_time_seconds(1));
+
+		mutex_lock(&console_session.ses_mutex);
+	}
+
+	spin_lock(&console_session.ses_rpc_lock);
+
+	lst_wait_until((atomic_read(&console_session.ses_rpc_counter) == 0),
+		       console_session.ses_rpc_lock,
+		       "Network is not accessible or target is down, "
+		       "waiting for %d console RPCs to be recycled\n",
+		       atomic_read(&console_session.ses_rpc_counter));
+
+	list_splice_init(&console_session.ses_rpc_freelist, &zlist);
+
+	spin_unlock(&console_session.ses_rpc_lock);
+
+	while (!list_empty(&zlist)) {
+		crpc = list_entry(zlist.next, struct lstcon_rpc, crp_link);
+
+		list_del(&crpc->crp_link);
+		LIBCFS_FREE(crpc, sizeof(*crpc));
+	}
+}
+
+int
+lstcon_rpc_module_init(void)
+{
+	INIT_LIST_HEAD(&console_session.ses_ping_timer.stt_list);
+	console_session.ses_ping_timer.stt_func = lstcon_rpc_pinger;
+	console_session.ses_ping_timer.stt_data = &console_session.ses_ping_timer;
+
+	console_session.ses_ping = NULL;
+
+	spin_lock_init(&console_session.ses_rpc_lock);
+	atomic_set(&console_session.ses_rpc_counter, 0);
+	INIT_LIST_HEAD(&console_session.ses_rpc_freelist);
+
+	return 0;
+}
+
+void
+lstcon_rpc_module_fini(void)
+{
+	LASSERT(list_empty(&console_session.ses_rpc_freelist));
+	LASSERT(atomic_read(&console_session.ses_rpc_counter) == 0);
+}
+
diff --git a/drivers/staging/lustrefsx/lnet/selftest/conrpc.h b/drivers/staging/lustrefsx/lnet/selftest/conrpc.h
new file mode 100644
index 0000000000000..4defb121497fc
--- /dev/null
+++ b/drivers/staging/lustrefsx/lnet/selftest/conrpc.h
@@ -0,0 +1,145 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2013, Intel Corporation.
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * /lnet/selftest/conrpc.h + * + * Console rpc + * + * Author: Liang Zhen + */ + +#ifndef __LST_CONRPC_H__ +#define __LST_CONRPC_H__ + +#include +#include +#include "rpc.h" +#include "selftest.h" + +/* Console rpc and rpc transaction */ +#define LST_TRANS_TIMEOUT 30 +#define LST_TRANS_MIN_TIMEOUT 3 + +#define LST_VALIDATE_TIMEOUT(t) \ + clamp_t(int, t, LST_TRANS_MIN_TIMEOUT, LST_TRANS_TIMEOUT) + +#define LST_PING_INTERVAL 8 + +struct lstcon_rpc_trans; +struct lstcon_tsb_hdr; +struct lstcon_test; +struct lstcon_node; + +struct lstcon_rpc { + struct list_head crp_link; /* chain on rpc transaction */ + struct srpc_client_rpc *crp_rpc; /* client rpc */ + struct lstcon_node *crp_node; /* destination node */ + struct lstcon_rpc_trans *crp_trans; /* conrpc transaction */ + + unsigned int crp_posted:1; /* rpc is posted */ + unsigned int crp_finished:1; /* rpc is finished */ + unsigned int crp_unpacked:1; /* reply is unpacked */ + /** RPC is embedded in other structure and can't free it */ + unsigned int crp_embedded:1; + int crp_status; /* console rpc errors */ + s64 crp_stamp_ns; /* replied time stamp */ +}; + +struct lstcon_rpc_trans { + /* link chain on owner list */ + struct list_head tas_olink; + /* link chain on global list */ + struct list_head tas_link; + /* operation code of transaction */ + int tas_opc; + /* features mask is uptodate */ + unsigned tas_feats_updated; + /* test features mask */ + unsigned tas_features; + wait_queue_head_t tas_waitq; /* wait queue head */ + atomic_t tas_remaining; /* # of un-scheduled rpcs */ + struct list_head tas_rpcs_list; /* queued requests */ +}; + +#define LST_TRANS_PRIVATE 0x1000 + +#define LST_TRANS_SESNEW (LST_TRANS_PRIVATE | 0x01) +#define LST_TRANS_SESEND (LST_TRANS_PRIVATE | 0x02) +#define LST_TRANS_SESQRY 0x03 +#define LST_TRANS_SESPING 0x04 + +#define LST_TRANS_TSBCLIADD (LST_TRANS_PRIVATE | 0x11) +#define LST_TRANS_TSBSRVADD (LST_TRANS_PRIVATE | 0x12) +#define LST_TRANS_TSBRUN (LST_TRANS_PRIVATE | 0x13) +#define LST_TRANS_TSBSTOP (LST_TRANS_PRIVATE | 0x14) +#define LST_TRANS_TSBCLIQRY 0x15 +#define LST_TRANS_TSBSRVQRY 0x16 + +#define LST_TRANS_STATQRY 0x21 + +typedef int (*lstcon_rpc_cond_func_t)(int, struct lstcon_node *, void *); +typedef int (*lstcon_rpc_readent_func_t)(int, struct srpc_msg *, + struct lstcon_rpc_ent __user *); + +int lstcon_sesrpc_prep(struct lstcon_node *nd, int transop, + unsigned int version, struct lstcon_rpc **crpc); +int lstcon_dbgrpc_prep(struct lstcon_node *nd, + unsigned int version, struct lstcon_rpc **crpc); +int lstcon_batrpc_prep(struct lstcon_node *nd, int transop, unsigned version, + struct lstcon_tsb_hdr *tsb, struct lstcon_rpc **crpc); +int lstcon_testrpc_prep(struct lstcon_node *nd, int transop, unsigned version, + struct lstcon_test *test, struct lstcon_rpc **crpc); +int lstcon_statrpc_prep(struct lstcon_node *nd, unsigned version, + struct lstcon_rpc **crpc); +void lstcon_rpc_put(struct lstcon_rpc *crpc); +int lstcon_rpc_trans_prep(struct list_head *translist, + int transop, struct lstcon_rpc_trans **transpp); +int lstcon_rpc_trans_ndlist(struct list_head *ndlist, + struct list_head *translist, int transop, + void *arg, lstcon_rpc_cond_func_t condition, + struct lstcon_rpc_trans **transpp); +void lstcon_rpc_trans_stat(struct lstcon_rpc_trans *trans, + struct lstcon_trans_stat *stat); +int lstcon_rpc_trans_interpreter(struct lstcon_rpc_trans *trans, + struct list_head __user *head_up, + lstcon_rpc_readent_func_t readent); +void 
lstcon_rpc_trans_abort(struct lstcon_rpc_trans *trans, int error);
+void lstcon_rpc_trans_destroy(struct lstcon_rpc_trans *trans);
+void lstcon_rpc_trans_addreq(struct lstcon_rpc_trans *trans,
+			     struct lstcon_rpc *req);
+int lstcon_rpc_trans_postwait(struct lstcon_rpc_trans *trans, int timeout);
+int lstcon_rpc_pinger_start(void);
+void lstcon_rpc_pinger_stop(void);
+void lstcon_rpc_cleanup_wait(void);
+int lstcon_rpc_module_init(void);
+void lstcon_rpc_module_fini(void);
+
+
+#endif
diff --git a/drivers/staging/lustrefsx/lnet/selftest/console.c b/drivers/staging/lustrefsx/lnet/selftest/console.c
new file mode 100644
index 0000000000000..25de1f25242e0
--- /dev/null
+++ b/drivers/staging/lustrefsx/lnet/selftest/console.c
@@ -0,0 +1,2105 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2014, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lnet/selftest/console.c
+ *
+ * Infrastructure of LST console
+ *
+ * Author: Liang Zhen
+ */
+
+#include
+#include
+#include "console.h"
+#include "conrpc.h"
+
+#define LST_NODE_STATE_COUNTER(nd, p)			\
+do {							\
+	if ((nd)->nd_state == LST_NODE_ACTIVE)		\
+		(p)->nle_nactive++;			\
+	else if ((nd)->nd_state == LST_NODE_BUSY)	\
+		(p)->nle_nbusy++;			\
+	else if ((nd)->nd_state == LST_NODE_DOWN)	\
+		(p)->nle_ndown++;			\
+	else						\
+		(p)->nle_nunknown++;			\
+	(p)->nle_nnode++;				\
+} while (0)
+
+struct lstcon_session console_session;
+
+static void
+lstcon_node_get(struct lstcon_node *nd)
+{
+	LASSERT(nd->nd_ref >= 1);
+
+	nd->nd_ref++;
+}
+
+static int
+lstcon_node_find(struct lnet_process_id id, struct lstcon_node **ndpp,
+		 int create)
+{
+	struct lstcon_ndlink *ndl;
+	unsigned int idx = LNET_NIDADDR(id.nid) % LST_GLOBAL_HASHSIZE;
+
+	LASSERT(id.nid != LNET_NID_ANY);
+
+	list_for_each_entry(ndl, &console_session.ses_ndl_hash[idx],
+			    ndl_hlink) {
+		if (ndl->ndl_node->nd_id.nid != id.nid ||
+		    ndl->ndl_node->nd_id.pid != id.pid)
+			continue;
+
+		lstcon_node_get(ndl->ndl_node);
+		*ndpp = ndl->ndl_node;
+		return 0;
+	}
+
+	if (!create)
+		return -ENOENT;
+
+	LIBCFS_ALLOC(*ndpp, sizeof(**ndpp) + sizeof(*ndl));
+	if (*ndpp == NULL)
+		return -ENOMEM;
+
+	ndl = (struct lstcon_ndlink *)(*ndpp + 1);
+
+	ndl->ndl_node = *ndpp;
+
+	ndl->ndl_node->nd_ref = 1;
+	ndl->ndl_node->nd_id = id;
+	ndl->ndl_node->nd_stamp = ktime_get();
+	ndl->ndl_node->nd_state = LST_NODE_UNKNOWN;
+	ndl->ndl_node->nd_timeout = 0;
+	memset(&ndl->ndl_node->nd_ping, 0, sizeof(ndl->ndl_node->nd_ping));
+
+	/* queued in global hash & list, no refcount is taken by
+	 * global hash & list, if caller releases its refcount,
+	 * node will be released */
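+	/* Note: the node and its ndlink come from a single allocation
+	 * (the ndlink sits immediately behind the node, see the
+	 * LIBCFS_ALLOC() above), so lstcon_node_put() can recover the
+	 * ndlink as (nd + 1) and free both with one LIBCFS_FREE(). */
+	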
list_add_tail(&ndl->ndl_hlink, &console_session.ses_ndl_hash[idx]); + list_add_tail(&ndl->ndl_link, &console_session.ses_ndl_list); + + return 0; +} + +static void +lstcon_node_put(struct lstcon_node *nd) +{ + struct lstcon_ndlink *ndl; + + LASSERT(nd->nd_ref > 0); + + if (--nd->nd_ref > 0) + return; + + ndl = (struct lstcon_ndlink *)(nd + 1); + + LASSERT(!list_empty(&ndl->ndl_link)); + LASSERT(!list_empty(&ndl->ndl_hlink)); + + /* remove from session */ + list_del(&ndl->ndl_link); + list_del(&ndl->ndl_hlink); + + LIBCFS_FREE(nd, sizeof(*nd) + sizeof(*ndl)); +} + +static int +lstcon_ndlink_find(struct list_head *hash, struct lnet_process_id id, + struct lstcon_ndlink **ndlpp, int create) +{ + unsigned int idx = LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE; + struct lstcon_ndlink *ndl; + struct lstcon_node *nd; + int rc; + + if (id.nid == LNET_NID_ANY) + return -EINVAL; + + /* search in hash */ + list_for_each_entry(ndl, &hash[idx], ndl_hlink) { + if (ndl->ndl_node->nd_id.nid != id.nid || + ndl->ndl_node->nd_id.pid != id.pid) + continue; + + *ndlpp = ndl; + return 0; + } + + if (create == 0) + return -ENOENT; + + /* find or create in session hash */ + rc = lstcon_node_find(id, &nd, (create == 1) ? 1 : 0); + if (rc != 0) + return rc; + + LIBCFS_ALLOC(ndl, sizeof(*ndl)); + if (ndl == NULL) { + lstcon_node_put(nd); + return -ENOMEM; + } + + *ndlpp = ndl; + + ndl->ndl_node = nd; + INIT_LIST_HEAD(&ndl->ndl_link); + list_add_tail(&ndl->ndl_hlink, &hash[idx]); + + return 0; +} + +static void +lstcon_ndlink_release(struct lstcon_ndlink *ndl) +{ + LASSERT(list_empty(&ndl->ndl_link)); + LASSERT(!list_empty(&ndl->ndl_hlink)); + + list_del(&ndl->ndl_hlink); /* delete from hash */ + lstcon_node_put(ndl->ndl_node); + + LIBCFS_FREE(ndl, sizeof(*ndl)); +} + +static int +lstcon_group_alloc(char *name, struct lstcon_group **grpp) +{ + struct lstcon_group *grp; + int i; + + LIBCFS_ALLOC(grp, offsetof(struct lstcon_group, + grp_ndl_hash[LST_NODE_HASHSIZE])); + if (grp == NULL) + return -ENOMEM; + + grp->grp_ref = 1; + if (name != NULL) { + if (strlen(name) > sizeof(grp->grp_name)-1) { + LIBCFS_FREE(grp, offsetof(struct lstcon_group, + grp_ndl_hash[LST_NODE_HASHSIZE])); + return -E2BIG; + } + strncpy(grp->grp_name, name, sizeof(grp->grp_name)); + } + + INIT_LIST_HEAD(&grp->grp_link); + INIT_LIST_HEAD(&grp->grp_ndl_list); + INIT_LIST_HEAD(&grp->grp_trans_list); + + for (i = 0; i < LST_NODE_HASHSIZE; i++) + INIT_LIST_HEAD(&grp->grp_ndl_hash[i]); + + *grpp = grp; + + return 0; +} + +static void +lstcon_group_addref(struct lstcon_group *grp) +{ + grp->grp_ref++; +} + +static void lstcon_group_ndlink_release(struct lstcon_group *, + struct lstcon_ndlink *); + +static void +lstcon_group_drain(struct lstcon_group *grp, int keep) +{ + struct lstcon_ndlink *ndl; + struct lstcon_ndlink *tmp; + + list_for_each_entry_safe(ndl, tmp, &grp->grp_ndl_list, ndl_link) { + if ((ndl->ndl_node->nd_state & keep) == 0) + lstcon_group_ndlink_release(grp, ndl); + } +} + +static void +lstcon_group_decref(struct lstcon_group *grp) +{ + int i; + + if (--grp->grp_ref > 0) + return; + + if (!list_empty(&grp->grp_link)) + list_del(&grp->grp_link); + + lstcon_group_drain(grp, 0); + + for (i = 0; i < LST_NODE_HASHSIZE; i++) + LASSERT(list_empty(&grp->grp_ndl_hash[i])); + + LIBCFS_FREE(grp, offsetof(struct lstcon_group, + grp_ndl_hash[LST_NODE_HASHSIZE])); +} + +static int +lstcon_group_find(const char *name, struct lstcon_group **grpp) +{ + struct lstcon_group *grp; + + list_for_each_entry(grp, &console_session.ses_grp_list, grp_link) { + if 
(strncmp(grp->grp_name, name, LST_NAME_SIZE) != 0) + continue; + + lstcon_group_addref(grp); /* +1 ref for caller */ + *grpp = grp; + return 0; + } + + return -ENOENT; +} + +static int +lstcon_group_ndlink_find(struct lstcon_group *grp, struct lnet_process_id id, + struct lstcon_ndlink **ndlpp, int create) +{ + int rc; + + rc = lstcon_ndlink_find(&grp->grp_ndl_hash[0], id, ndlpp, create); + if (rc != 0) + return rc; + + if (!list_empty(&(*ndlpp)->ndl_link)) + return 0; + + list_add_tail(&(*ndlpp)->ndl_link, &grp->grp_ndl_list); + grp->grp_nnode++; + + return 0; +} + +static void +lstcon_group_ndlink_release(struct lstcon_group *grp, struct lstcon_ndlink *ndl) +{ + list_del_init(&ndl->ndl_link); + lstcon_ndlink_release(ndl); + grp->grp_nnode--; +} + +static void +lstcon_group_ndlink_move(struct lstcon_group *old, + struct lstcon_group *new, struct lstcon_ndlink *ndl) +{ + unsigned int idx = LNET_NIDADDR(ndl->ndl_node->nd_id.nid) % + LST_NODE_HASHSIZE; + + old->grp_nnode--; + + list_move_tail(&ndl->ndl_hlink, &new->grp_ndl_hash[idx]); + list_move_tail(&ndl->ndl_link, &new->grp_ndl_list); + new->grp_nnode++; +} + +static void +lstcon_group_move(struct lstcon_group *old, struct lstcon_group *new) +{ + struct lstcon_ndlink *ndl; + + while (!list_empty(&old->grp_ndl_list)) { + ndl = list_entry(old->grp_ndl_list.next, + struct lstcon_ndlink, ndl_link); + lstcon_group_ndlink_move(old, new, ndl); + } +} + +static int +lstcon_sesrpc_condition(int transop, struct lstcon_node *nd, void *arg) +{ + struct lstcon_group *grp = arg; + + switch (transop) { + case LST_TRANS_SESNEW: + if (nd->nd_state == LST_NODE_ACTIVE) + return 0; + break; + + case LST_TRANS_SESEND: + if (nd->nd_state != LST_NODE_ACTIVE) + return 0; + + if (grp != NULL && nd->nd_ref > 1) + return 0; + break; + + case LST_TRANS_SESQRY: + break; + + default: + LBUG(); + } + + return 1; +} + +static int +lstcon_sesrpc_readent(int transop, struct srpc_msg *msg, + struct lstcon_rpc_ent __user *ent_up) +{ + struct srpc_debug_reply *rep; + + switch (transop) { + case LST_TRANS_SESNEW: + case LST_TRANS_SESEND: + return 0; + + case LST_TRANS_SESQRY: + rep = &msg->msg_body.dbg_reply; + + if (copy_to_user(&ent_up->rpe_priv[0], + &rep->dbg_timeout, sizeof(int)) || + copy_to_user(&ent_up->rpe_payload[0], + &rep->dbg_name, LST_NAME_SIZE)) + return -EFAULT; + + return 0; + + default: + LBUG(); + } + + return 0; +} + +static int +lstcon_group_nodes_add(struct lstcon_group *grp, + int count, struct lnet_process_id __user *ids_up, + unsigned int *featp, + struct list_head __user *result_up) +{ + struct lstcon_rpc_trans *trans; + struct lstcon_ndlink *ndl; + struct lstcon_group *tmp; + struct lnet_process_id id; + int i; + int rc; + + rc = lstcon_group_alloc(NULL, &tmp); + if (rc != 0) { + CERROR("Out of memory\n"); + return -ENOMEM; + } + + for (i = 0 ; i < count; i++) { + if (copy_from_user(&id, &ids_up[i], sizeof(id))) { + rc = -EFAULT; + break; + } + + /* skip if it's in this group already */ + rc = lstcon_group_ndlink_find(grp, id, &ndl, 0); + if (rc == 0) + continue; + + /* add to tmp group */ + rc = lstcon_group_ndlink_find(tmp, id, &ndl, 1); + if (rc != 0) { + CERROR("Can't create ndlink, out of memory\n"); + break; + } + } + + if (rc != 0) { + lstcon_group_decref(tmp); + return rc; + } + + rc = lstcon_rpc_trans_ndlist(&tmp->grp_ndl_list, + &tmp->grp_trans_list, LST_TRANS_SESNEW, + tmp, lstcon_sesrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + lstcon_group_decref(tmp); + return rc; + } + + /* post all RPCs */ 
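+	/* lstcon_rpc_trans_postwait() below sends every RPC queued on
+	 * the transaction and then waits, bounded by LST_TRANS_TIMEOUT
+	 * seconds, for the replies; the per-node results are copied
+	 * back to userspace by the interpreter call that follows. */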
+	lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+	rc = lstcon_rpc_trans_interpreter(trans, result_up,
+					  lstcon_sesrpc_readent);
+	*featp = trans->tas_features;
+
+	/* destroy all RPCs */
+	lstcon_rpc_trans_destroy(trans);
+
+	lstcon_group_move(tmp, grp);
+	lstcon_group_decref(tmp);
+
+	return rc;
+}
+
+static int
+lstcon_group_nodes_remove(struct lstcon_group *grp,
+			  int count, struct lnet_process_id __user *ids_up,
+			  struct list_head __user *result_up)
+{
+	struct lstcon_rpc_trans *trans;
+	struct lstcon_ndlink *ndl;
+	struct lstcon_group *tmp;
+	struct lnet_process_id id;
+	int rc;
+	int i;
+
+	/* End session and remove the nodes from the group */
+
+	rc = lstcon_group_alloc(NULL, &tmp);
+	if (rc != 0) {
+		CERROR("Out of memory\n");
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < count; i++) {
+		if (copy_from_user(&id, &ids_up[i], sizeof(id))) {
+			rc = -EFAULT;
+			goto error;
+		}
+
+		/* move node to tmp group */
+		if (lstcon_group_ndlink_find(grp, id, &ndl, 0) == 0)
+			lstcon_group_ndlink_move(grp, tmp, ndl);
+	}
+
+	rc = lstcon_rpc_trans_ndlist(&tmp->grp_ndl_list,
+				     &tmp->grp_trans_list, LST_TRANS_SESEND,
+				     tmp, lstcon_sesrpc_condition, &trans);
+	if (rc != 0) {
+		CERROR("Can't create transaction: %d\n", rc);
+		goto error;
+	}
+
+	lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+	rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL);
+
+	lstcon_rpc_trans_destroy(trans);
+	/* release nodes anyway, because we can't rollback status */
+	lstcon_group_decref(tmp);
+
+	return rc;
+error:
+	lstcon_group_move(tmp, grp);
+	lstcon_group_decref(tmp);
+
+	return rc;
+}
+
+int
+lstcon_group_add(char *name)
+{
+	struct lstcon_group *grp;
+	int rc;
+
+	rc = (lstcon_group_find(name, &grp) == 0) ? -EEXIST : 0;
+	if (rc != 0) {
+		/* found a group with the same name */
+		lstcon_group_decref(grp);
+		return rc;
+	}
+
+	rc = lstcon_group_alloc(name, &grp);
+	if (rc != 0) {
+		CERROR("Can't allocate descriptor for group %s\n", name);
+		return -ENOMEM;
+	}
+
+	list_add_tail(&grp->grp_link, &console_session.ses_grp_list);
+
+	return rc;
+}
+
+int
+lstcon_nodes_add(char *name, int count, struct lnet_process_id __user *ids_up,
+		 unsigned *featp, struct list_head __user *result_up)
+{
+	struct lstcon_group *grp;
+	int rc;
+
+	LASSERT(count > 0);
+	LASSERT(ids_up != NULL);
+
+	rc = lstcon_group_find(name, &grp);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find group %s\n", name);
+		return rc;
+	}
+
+	if (grp->grp_ref > 2) {
+		/* referred by other threads or tests */
+		CDEBUG(D_NET, "Group %s is busy\n", name);
+		lstcon_group_decref(grp);
+
+		return -EBUSY;
+	}
+
+	rc = lstcon_group_nodes_add(grp, count, ids_up, featp, result_up);
+
+	lstcon_group_decref(grp);
+
+	return rc;
+}
+
+int
+lstcon_group_del(char *name)
+{
+	struct lstcon_rpc_trans *trans;
+	struct lstcon_group *grp;
+	int rc;
+
+	rc = lstcon_group_find(name, &grp);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find group: %s\n", name);
+		return rc;
+	}
+
+	if (grp->grp_ref > 2) {
+		/* referred by other threads or tests */
+		CDEBUG(D_NET, "Group %s is busy\n", name);
+		lstcon_group_decref(grp);
+		return -EBUSY;
+	}
+
+	rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list,
+				     &grp->grp_trans_list, LST_TRANS_SESEND,
+				     grp, lstcon_sesrpc_condition, &trans);
+	if (rc != 0) {
+		CERROR("Can't create transaction: %d\n", rc);
+		lstcon_group_decref(grp);
+		return rc;
+	}
+
+	lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+	lstcon_rpc_trans_destroy(trans);
+
+	lstcon_group_decref(grp);
+	/* -ref for session, it's destroyed,
+	 * status can't be rolled back, destroy group anyway */
+	lstcon_group_decref(grp);
+
+	return rc;
+}
+
+int
+lstcon_group_clean(char *name, int args)
+{
+	struct lstcon_group *grp = NULL;
+	int rc;
+
+	rc = lstcon_group_find(name, &grp);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find group %s\n", name);
+		return rc;
+	}
+
+	if (grp->grp_ref > 2) {
+		/* referred by test */
+		CDEBUG(D_NET, "Group %s is busy\n", name);
+		lstcon_group_decref(grp);
+		return -EBUSY;
+	}
+
+	args = (LST_NODE_ACTIVE | LST_NODE_BUSY |
+		LST_NODE_DOWN | LST_NODE_UNKNOWN) & ~args;
+
+	lstcon_group_drain(grp, args);
+
+	lstcon_group_decref(grp);
+	/* release empty group */
+	if (list_empty(&grp->grp_ndl_list))
+		lstcon_group_decref(grp);
+
+	return 0;
+}
+
+int
+lstcon_nodes_remove(char *name, int count,
+		    struct lnet_process_id __user *ids_up,
+		    struct list_head __user *result_up)
+{
+	struct lstcon_group *grp = NULL;
+	int rc;
+
+	rc = lstcon_group_find(name, &grp);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find group: %s\n", name);
+		return rc;
+	}
+
+	if (grp->grp_ref > 2) {
+		/* referred by test */
+		CDEBUG(D_NET, "Group %s is busy\n", name);
+		lstcon_group_decref(grp);
+		return -EBUSY;
+	}
+
+	rc = lstcon_group_nodes_remove(grp, count, ids_up, result_up);
+
+	lstcon_group_decref(grp);
+	/* release empty group */
+	if (list_empty(&grp->grp_ndl_list))
+		lstcon_group_decref(grp);
+
+	return rc;
+}
+
+int
+lstcon_group_refresh(char *name, struct list_head __user *result_up)
+{
+	struct lstcon_rpc_trans *trans;
+	struct lstcon_group *grp;
+	int rc;
+
+	rc = lstcon_group_find(name, &grp);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find group: %s\n", name);
+		return rc;
+	}
+
+	if (grp->grp_ref > 2) {
+		/* referred by test */
+		CDEBUG(D_NET, "Group %s is busy\n", name);
+		lstcon_group_decref(grp);
+		return -EBUSY;
+	}
+
+	/* re-invite all inactive nodes into the group */
+	rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list,
+				     &grp->grp_trans_list, LST_TRANS_SESNEW,
+				     grp, lstcon_sesrpc_condition, &trans);
+	if (rc != 0) {
+		/* local error, return */
+		CDEBUG(D_NET, "Can't create transaction: %d\n", rc);
+		lstcon_group_decref(grp);
+		return rc;
+	}
+
+	lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+	rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL);
+
+	lstcon_rpc_trans_destroy(trans);
+	/* -ref for me */
+	lstcon_group_decref(grp);
+
+	return rc;
+}
+
+int
+lstcon_group_list(int index, int len, char __user *name_up)
+{
+	struct lstcon_group *grp;
+
+	LASSERT(index >= 0);
+	LASSERT(name_up != NULL);
+
+	list_for_each_entry(grp, &console_session.ses_grp_list, grp_link) {
+		if (index-- == 0) {
+			return copy_to_user(name_up, grp->grp_name, len) ?
+			       -EFAULT : 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+static int
+lstcon_nodes_getent(struct list_head *head, int *index_p,
+		    int *count_p, struct lstcon_node_ent __user *dents_up)
+{
+	struct lstcon_ndlink *ndl;
+	struct lstcon_node *nd;
+	int count = 0;
+	int index = 0;
+
+	LASSERT(index_p != NULL && count_p != NULL);
+	LASSERT(dents_up != NULL);
+	LASSERT(*index_p >= 0);
+	LASSERT(*count_p > 0);
+
+	list_for_each_entry(ndl, head, ndl_link) {
+		if (index++ < *index_p)
+			continue;
+
+		if (count >= *count_p)
+			break;
+
+		nd = ndl->ndl_node;
+		if (copy_to_user(&dents_up[count].nde_id,
+				 &nd->nd_id, sizeof(nd->nd_id)) ||
+		    copy_to_user(&dents_up[count].nde_state,
+				 &nd->nd_state, sizeof(nd->nd_state)))
+			return -EFAULT;
+
+		count++;
+	}
+
+	if (index <= *index_p)
+		return -ENOENT;
+
+	*count_p = count;
+	*index_p = index;
+
+	return 0;
+}
+
+int
+lstcon_group_info(char *name, struct lstcon_ndlist_ent __user *gents_p,
+		  int *index_p, int *count_p,
+		  struct lstcon_node_ent __user *dents_up)
+{
+	struct lstcon_ndlist_ent *gentp;
+	struct lstcon_group *grp;
+	struct lstcon_ndlink *ndl;
+	int rc;
+
+	rc = lstcon_group_find(name, &grp);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find group %s\n", name);
+		return rc;
+	}
+
+	if (dents_up != NULL) {
+		/* verbose query */
+		rc = lstcon_nodes_getent(&grp->grp_ndl_list,
+					 index_p, count_p, dents_up);
+		lstcon_group_decref(grp);
+
+		return rc;
+	}
+
+	/* non-verbose query */
+	CFS_ALLOC_PTR(gentp);
+	if (gentp == NULL) {
+		CERROR("Can't allocate ndlist_ent\n");
+		lstcon_group_decref(grp);
+
+		return -ENOMEM;
+	}
+
+	list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link)
+		LST_NODE_STATE_COUNTER(ndl->ndl_node, gentp);
+
+	rc = copy_to_user(gents_p, gentp,
+			  sizeof(struct lstcon_ndlist_ent)) ? -EFAULT : 0;
+
+	CFS_FREE_PTR(gentp);
+
+	lstcon_group_decref(grp);
+
+	return rc;
+}
+
+static int
+lstcon_batch_find(const char *name, struct lstcon_batch **batpp)
+{
+	struct lstcon_batch *bat;
+
+	list_for_each_entry(bat, &console_session.ses_bat_list, bat_link) {
+		if (strncmp(bat->bat_name, name, LST_NAME_SIZE) == 0) {
+			*batpp = bat;
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+int
+lstcon_batch_add(char *name)
+{
+	struct lstcon_batch *bat;
+	int i;
+	int rc;
+
+	rc = (lstcon_batch_find(name, &bat) == 0) ? -EEXIST : 0;
+	if (rc != 0) {
+		CDEBUG(D_NET, "Batch %s already exists\n", name);
+		return rc;
+	}
+
+	LIBCFS_ALLOC(bat, sizeof(*bat));
+	if (bat == NULL) {
+		CERROR("Can't allocate descriptor for batch %s\n", name);
+		return -ENOMEM;
+	}
+
+	CFS_ALLOC_PTR_ARRAY(bat->bat_cli_hash, LST_NODE_HASHSIZE);
+	if (bat->bat_cli_hash == NULL) {
+		CERROR("Can't allocate hash for batch %s\n", name);
+		LIBCFS_FREE(bat, sizeof(*bat));
+
+		return -ENOMEM;
+	}
+
+	CFS_ALLOC_PTR_ARRAY(bat->bat_srv_hash, LST_NODE_HASHSIZE);
+	if (bat->bat_srv_hash == NULL) {
+		CERROR("Can't allocate hash for batch %s\n", name);
+		CFS_FREE_PTR_ARRAY(bat->bat_cli_hash, LST_NODE_HASHSIZE);
+		LIBCFS_FREE(bat, sizeof(*bat));
+
+		return -ENOMEM;
+	}
+
+	if (strlen(name) > sizeof(bat->bat_name) - 1) {
+		CFS_FREE_PTR_ARRAY(bat->bat_srv_hash, LST_NODE_HASHSIZE);
+		CFS_FREE_PTR_ARRAY(bat->bat_cli_hash, LST_NODE_HASHSIZE);
+		LIBCFS_FREE(bat, sizeof(*bat));
+		return -E2BIG;
+	}
+	strncpy(bat->bat_name, name, sizeof(bat->bat_name));
+	bat->bat_hdr.tsb_index = 0;
+	bat->bat_hdr.tsb_id.bat_id = ++console_session.ses_id_cookie;
+
+	bat->bat_ntest = 0;
+	bat->bat_state = LST_BATCH_IDLE;
+
+	INIT_LIST_HEAD(&bat->bat_cli_list);
+	INIT_LIST_HEAD(&bat->bat_srv_list);
+	INIT_LIST_HEAD(&bat->bat_test_list);
+	INIT_LIST_HEAD(&bat->bat_trans_list);
+
+	for (i = 0; i < LST_NODE_HASHSIZE; i++) {
+		INIT_LIST_HEAD(&bat->bat_cli_hash[i]);
+		INIT_LIST_HEAD(&bat->bat_srv_hash[i]);
+	}
+
+	list_add_tail(&bat->bat_link, &console_session.ses_bat_list);
+
+	return rc;
+}
+
+int
+lstcon_batch_list(int index, int len, char __user *name_up)
+{
+	struct lstcon_batch *bat;
+
+	LASSERT(name_up != NULL);
+	LASSERT(index >= 0);
+
+	list_for_each_entry(bat, &console_session.ses_bat_list, bat_link) {
+		if (index-- == 0) {
+			return copy_to_user(name_up, bat->bat_name, len) ?
+			       -EFAULT : 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+int
+lstcon_batch_info(char *name, struct lstcon_test_batch_ent __user *ent_up,
+		  int server, int testidx, int *index_p, int *ndent_p,
+		  struct lstcon_node_ent __user *dents_up)
+{
+	struct lstcon_test_batch_ent *entp;
+	struct list_head *clilst;
+	struct list_head *srvlst;
+	struct lstcon_test *test = NULL;
+	struct lstcon_batch *bat;
+	struct lstcon_ndlink *ndl;
+	int rc;
+
+	rc = lstcon_batch_find(name, &bat);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find batch %s\n", name);
+		return -ENOENT;
+	}
+
+	if (testidx > 0) {
+		/* query test, test index starts from 1 */
+		list_for_each_entry(test, &bat->bat_test_list, tes_link) {
+			if (testidx-- == 1)
+				break;
+		}
+
+		if (testidx > 0) {
+			CDEBUG(D_NET, "Can't find specified test in batch\n");
+			return -ENOENT;
+		}
+	}
+
+	clilst = (test == NULL) ? &bat->bat_cli_list :
+				  &test->tes_src_grp->grp_ndl_list;
+	srvlst = (test == NULL) ? &bat->bat_srv_list :
+				  &test->tes_dst_grp->grp_ndl_list;
+
+	if (dents_up != NULL) {
+		rc = lstcon_nodes_getent(server ? srvlst : clilst,
+					 index_p, ndent_p, dents_up);
+		return rc;
+	}
+
+	/* non-verbose query */
+	CFS_ALLOC_PTR(entp);
+	if (entp == NULL)
+		return -ENOMEM;
+
+	if (test == NULL) {
+		entp->u.tbe_batch.bae_ntest = bat->bat_ntest;
+		entp->u.tbe_batch.bae_state = bat->bat_state;
+	} else {
+		entp->u.tbe_test.tse_type = test->tes_type;
+		entp->u.tbe_test.tse_loop = test->tes_loop;
+		entp->u.tbe_test.tse_concur = test->tes_concur;
+	}
+
+	list_for_each_entry(ndl, clilst, ndl_link)
+		LST_NODE_STATE_COUNTER(ndl->ndl_node, &entp->tbe_cli_nle);
+
+	list_for_each_entry(ndl, srvlst, ndl_link)
+		LST_NODE_STATE_COUNTER(ndl->ndl_node, &entp->tbe_srv_nle);
+
+	rc = copy_to_user(ent_up, entp,
+			  sizeof(struct lstcon_test_batch_ent)) ? -EFAULT : 0;
+
+	CFS_FREE_PTR(entp);
+
+	return rc;
+}
+
+static int
+lstcon_batrpc_condition(int transop, struct lstcon_node *nd, void *arg)
+{
+	switch (transop) {
+	case LST_TRANS_TSBRUN:
+		if (nd->nd_state != LST_NODE_ACTIVE)
+			return -ENETDOWN;
+		break;
+
+	case LST_TRANS_TSBSTOP:
+		if (nd->nd_state != LST_NODE_ACTIVE)
+			return 0;
+		break;
+
+	case LST_TRANS_TSBCLIQRY:
+	case LST_TRANS_TSBSRVQRY:
+		break;
+	}
+
+	return 1;
+}
+
+static int
+lstcon_batch_op(struct lstcon_batch *bat, int transop,
+		struct list_head __user *result_up)
+{
+	struct lstcon_rpc_trans *trans;
+	int rc;
+
+	rc = lstcon_rpc_trans_ndlist(&bat->bat_cli_list,
+				     &bat->bat_trans_list, transop,
+				     bat, lstcon_batrpc_condition, &trans);
+	if (rc != 0) {
+		CERROR("Can't create transaction: %d\n", rc);
+		return rc;
+	}
+
+	lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+	rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL);
+
+	lstcon_rpc_trans_destroy(trans);
+
+	return rc;
+}
+
+int
+lstcon_batch_run(char *name, int timeout, struct list_head __user *result_up)
+{
+	struct lstcon_batch *bat;
+	int rc;
+
+	if (lstcon_batch_find(name, &bat) != 0) {
+		CDEBUG(D_NET, "Can't find batch %s\n", name);
+		return -ENOENT;
+	}
+
+	bat->bat_arg = timeout;
+
+	rc = lstcon_batch_op(bat, LST_TRANS_TSBRUN, result_up);
+
+	/* mark batch as running if it's started on any node */
+	if (lstcon_tsbop_stat_success(lstcon_trans_stat(), 0) != 0)
+		bat->bat_state = LST_BATCH_RUNNING;
+
+	return rc;
+}
+
+int
+lstcon_batch_stop(char *name, int force, struct list_head __user *result_up)
+{
+	struct lstcon_batch *bat;
+	int rc;
+
+	if (lstcon_batch_find(name, &bat) != 0) {
+		CDEBUG(D_NET, "Can't find batch %s\n", name);
+		return -ENOENT;
+	}
+
+	bat->bat_arg = force;
+
+	rc = lstcon_batch_op(bat, LST_TRANS_TSBSTOP, result_up);
+
+	/* mark batch as stopped if all RPCs finished */
+	if (lstcon_tsbop_stat_failure(lstcon_trans_stat(), 0) == 0)
+		bat->bat_state = LST_BATCH_IDLE;
+
+	return rc;
+}
+
+static void
+lstcon_batch_destroy(struct lstcon_batch *bat)
+{
+	struct lstcon_ndlink *ndl;
+	struct lstcon_test *test;
+	int i;
+
+	list_del(&bat->bat_link);
+
+	while (!list_empty(&bat->bat_test_list)) {
+		test = list_entry(bat->bat_test_list.next,
+				  struct lstcon_test, tes_link);
+		LASSERT(list_empty(&test->tes_trans_list));
+
+		list_del(&test->tes_link);
+
+		lstcon_group_decref(test->tes_src_grp);
+		lstcon_group_decref(test->tes_dst_grp);
+
+		LIBCFS_FREE(test, offsetof(struct lstcon_test,
+					   tes_param[test->tes_paramlen]));
+	}
+
+	LASSERT(list_empty(&bat->bat_trans_list));
+
+	while (!list_empty(&bat->bat_cli_list)) {
+		ndl = list_entry(bat->bat_cli_list.next,
+				 struct lstcon_ndlink, ndl_link);
+		list_del_init(&ndl->ndl_link);
+
+		lstcon_ndlink_release(ndl);
+	}
+
+	while (!list_empty(&bat->bat_srv_list)) {
+		ndl =
list_entry(bat->bat_srv_list.next, + struct lstcon_ndlink, ndl_link); + list_del_init(&ndl->ndl_link); + + lstcon_ndlink_release(ndl); + } + + for (i = 0; i < LST_NODE_HASHSIZE; i++) { + LASSERT(list_empty(&bat->bat_cli_hash[i])); + LASSERT(list_empty(&bat->bat_srv_hash[i])); + } + + LIBCFS_FREE(bat->bat_cli_hash, + sizeof(struct list_head) * LST_NODE_HASHSIZE); + LIBCFS_FREE(bat->bat_srv_hash, + sizeof(struct list_head) * LST_NODE_HASHSIZE); + LIBCFS_FREE(bat, sizeof(*bat)); +} + +static int +lstcon_testrpc_condition(int transop, struct lstcon_node *nd, void *arg) +{ + struct lstcon_test *test = arg; + struct lstcon_batch *batch; + struct lstcon_ndlink *ndl; + struct list_head *hash; + struct list_head *head; + + LASSERT(test != NULL); + + batch = test->tes_batch; + LASSERT(batch != NULL); + + if (test->tes_oneside && + transop == LST_TRANS_TSBSRVADD) + return 0; + + if (nd->nd_state != LST_NODE_ACTIVE) + return -ENETDOWN; + + if (transop == LST_TRANS_TSBCLIADD) { + hash = batch->bat_cli_hash; + head = &batch->bat_cli_list; + + } else { + LASSERT (transop == LST_TRANS_TSBSRVADD); + + hash = batch->bat_srv_hash; + head = &batch->bat_srv_list; + } + + LASSERT (nd->nd_id.nid != LNET_NID_ANY); + + if (lstcon_ndlink_find(hash, nd->nd_id, &ndl, 1) != 0) + return -ENOMEM; + + if (list_empty(&ndl->ndl_link)) + list_add_tail(&ndl->ndl_link, head); + + return 1; +} + +static int +lstcon_test_nodes_add(struct lstcon_test *test, + struct list_head __user *result_up) +{ + struct lstcon_rpc_trans *trans; + struct lstcon_group *grp; + int transop; + int rc; + + LASSERT (test->tes_src_grp != NULL); + LASSERT (test->tes_dst_grp != NULL); + + transop = LST_TRANS_TSBSRVADD; + grp = test->tes_dst_grp; +again: + rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list, + &test->tes_trans_list, transop, + test, lstcon_testrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + if (lstcon_trans_stat()->trs_rpc_errno != 0 || + lstcon_trans_stat()->trs_fwk_errno != 0) { + lstcon_rpc_trans_interpreter(trans, result_up, NULL); + + lstcon_rpc_trans_destroy(trans); + /* return if any error */ + CDEBUG(D_NET, "Failed to add test %s, " + "RPC error %d, framework error %d\n", + transop == LST_TRANS_TSBCLIADD ? 
"client" : "server", + lstcon_trans_stat()->trs_rpc_errno, + lstcon_trans_stat()->trs_fwk_errno); + + return rc; + } + + lstcon_rpc_trans_destroy(trans); + + if (transop == LST_TRANS_TSBCLIADD) + return rc; + + transop = LST_TRANS_TSBCLIADD; + grp = test->tes_src_grp; + test->tes_cliidx = 0; + + /* requests to test clients */ + goto again; +} + +static int +lstcon_verify_batch(const char *name, struct lstcon_batch **batch) +{ + int rc; + + rc = lstcon_batch_find(name, batch); + if (rc != 0) { + CDEBUG(D_NET, "Can't find batch %s\n", name); + return rc; + } + + if ((*batch)->bat_state != LST_BATCH_IDLE) { + CDEBUG(D_NET, "Can't change running batch %s\n", name); + return -EINVAL; + } + + return 0; +} + +static int +lstcon_verify_group(const char *name, struct lstcon_group **grp) +{ + int rc; + struct lstcon_ndlink *ndl; + + rc = lstcon_group_find(name, grp); + if (rc != 0) { + CDEBUG(D_NET, "can't find group %s\n", name); + return rc; + } + + list_for_each_entry(ndl, &(*grp)->grp_ndl_list, ndl_link) { + if (ndl->ndl_node->nd_state == LST_NODE_ACTIVE) { + return 0; + } + } + + CDEBUG(D_NET, "Group %s has no ACTIVE nodes\n", name); + + return -EINVAL; +} + +int +lstcon_test_add(char *batch_name, int type, int loop, + int concur, int dist, int span, + char *src_name, char *dst_name, + void *param, int paramlen, int *retp, + struct list_head __user *result_up) +{ + struct lstcon_test *test = NULL; + int rc; + struct lstcon_group *src_grp = NULL; + struct lstcon_group *dst_grp = NULL; + struct lstcon_batch *batch = NULL; + + /* + * verify that a batch of the given name exists, and the groups + * that will be part of the batch exist and have at least one + * active node + */ + rc = lstcon_verify_batch(batch_name, &batch); + if (rc != 0) + goto out; + + rc = lstcon_verify_group(src_name, &src_grp); + if (rc != 0) + goto out; + + rc = lstcon_verify_group(dst_name, &dst_grp); + if (rc != 0) + goto out; + + if (dst_grp->grp_userland) + *retp = 1; + + LIBCFS_ALLOC(test, offsetof(struct lstcon_test, tes_param[paramlen])); + if (!test) { + CERROR("Can't allocate test descriptor\n"); + rc = -ENOMEM; + + goto out; + } + + test->tes_hdr.tsb_id = batch->bat_hdr.tsb_id; + test->tes_batch = batch; + test->tes_type = type; + test->tes_oneside = 0; /* TODO */ + test->tes_loop = loop; + test->tes_concur = concur; + test->tes_stop_onerr = 1; /* TODO */ + test->tes_span = span; + test->tes_dist = dist; + test->tes_cliidx = 0; /* just used for creating RPC */ + test->tes_src_grp = src_grp; + test->tes_dst_grp = dst_grp; + INIT_LIST_HEAD(&test->tes_trans_list); + + if (param != NULL) { + test->tes_paramlen = paramlen; + memcpy(&test->tes_param[0], param, paramlen); + } + + rc = lstcon_test_nodes_add(test, result_up); + + if (rc != 0) + goto out; + + if (lstcon_trans_stat()->trs_rpc_errno != 0 || + lstcon_trans_stat()->trs_fwk_errno != 0) + CDEBUG(D_NET, "Failed to add test %d to batch %s\n", type, + batch_name); + + /* add to test list anyway, so user can check what's going on */ + list_add_tail(&test->tes_link, &batch->bat_test_list); + + batch->bat_ntest++; + test->tes_hdr.tsb_index = batch->bat_ntest; + + /* hold groups so nobody can change them */ + return rc; +out: + if (test != NULL) + LIBCFS_FREE(test, offsetof(struct lstcon_test, + tes_param[paramlen])); + + if (dst_grp != NULL) + lstcon_group_decref(dst_grp); + + if (src_grp != NULL) + lstcon_group_decref(src_grp); + + return rc; +} + +static int +lstcon_test_find(struct lstcon_batch *batch, int idx, + struct lstcon_test **testpp) +{ + struct lstcon_test 
*test; + + list_for_each_entry(test, &batch->bat_test_list, tes_link) { + if (idx == test->tes_hdr.tsb_index) { + *testpp = test; + return 0; + } + } + + return -ENOENT; +} + +static int +lstcon_tsbrpc_readent(int transop, struct srpc_msg *msg, + struct lstcon_rpc_ent __user *ent_up) +{ + struct srpc_batch_reply *rep = &msg->msg_body.bat_reply; + + LASSERT (transop == LST_TRANS_TSBCLIQRY || + transop == LST_TRANS_TSBSRVQRY); + + /* positive errno, framework error code */ + if (copy_to_user(&ent_up->rpe_priv[0], + &rep->bar_active, sizeof(rep->bar_active))) + return -EFAULT; + + return 0; +} + +int +lstcon_test_batch_query(char *name, int testidx, int client, + int timeout, struct list_head __user *result_up) +{ + struct lstcon_rpc_trans *trans; + struct list_head *translist; + struct list_head *ndlist; + struct lstcon_tsb_hdr *hdr; + struct lstcon_batch *batch; + struct lstcon_test *test = NULL; + int transop; + int rc; + + rc = lstcon_batch_find(name, &batch); + if (rc != 0) { + CDEBUG(D_NET, "Can't find batch: %s\n", name); + return rc; + } + + if (testidx == 0) { + translist = &batch->bat_trans_list; + ndlist = &batch->bat_cli_list; + hdr = &batch->bat_hdr; + + } else { + /* query specified test only */ + rc = lstcon_test_find(batch, testidx, &test); + if (rc != 0) { + CDEBUG(D_NET, "Can't find test: %d\n", testidx); + return rc; + } + + translist = &test->tes_trans_list; + ndlist = &test->tes_src_grp->grp_ndl_list; + hdr = &test->tes_hdr; + } + + transop = client ? LST_TRANS_TSBCLIQRY : LST_TRANS_TSBSRVQRY; + + rc = lstcon_rpc_trans_ndlist(ndlist, translist, transop, hdr, + lstcon_batrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + lstcon_rpc_trans_postwait(trans, timeout); + + if (testidx == 0 && /* query a batch, not a test */ + lstcon_rpc_stat_failure(lstcon_trans_stat(), 0) == 0 && + lstcon_tsbqry_stat_run(lstcon_trans_stat(), 0) == 0) { + /* all RPCs finished, and no active test */ + batch->bat_state = LST_BATCH_IDLE; + } + + rc = lstcon_rpc_trans_interpreter(trans, result_up, + lstcon_tsbrpc_readent); + lstcon_rpc_trans_destroy(trans); + + return rc; +} + +static int +lstcon_statrpc_readent(int transop, struct srpc_msg *msg, + struct lstcon_rpc_ent __user *ent_up) +{ + struct srpc_stat_reply *rep = &msg->msg_body.stat_reply; + struct sfw_counters __user *sfwk_stat; + struct srpc_counters __user *srpc_stat; + struct lnet_counters_common __user *lnet_stat; + + if (rep->str_status != 0) + return 0; + + sfwk_stat = (struct sfw_counters __user *)&ent_up->rpe_payload[0]; + srpc_stat = (struct srpc_counters __user *) + ((char __user *)sfwk_stat + sizeof(*sfwk_stat)); + lnet_stat = (struct lnet_counters_common __user *) + ((char __user *)srpc_stat + sizeof(*srpc_stat)); + + if (copy_to_user(sfwk_stat, &rep->str_fw, sizeof(*sfwk_stat)) || + copy_to_user(srpc_stat, &rep->str_rpc, sizeof(*srpc_stat)) || + copy_to_user(lnet_stat, &rep->str_lnet, sizeof(*lnet_stat))) + return -EFAULT; + + return 0; +} + +static int +lstcon_ndlist_stat(struct list_head *ndlist, + int timeout, struct list_head __user *result_up) +{ + LIST_HEAD(head); + struct lstcon_rpc_trans *trans; + int rc; + + rc = lstcon_rpc_trans_ndlist(ndlist, &head, + LST_TRANS_STATQRY, NULL, NULL, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_VALIDATE_TIMEOUT(timeout)); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, + lstcon_statrpc_readent); + lstcon_rpc_trans_destroy(trans); + + 
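+	/* Layout note: lstcon_statrpc_readent() above packs three
+	 * counter blocks back to back into each entry's rpe_payload:
+	 * sfw_counters, then srpc_counters, then lnet_counters_common.
+	 * Userspace is expected to unpack them in the same order. */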
return rc; +} + +int +lstcon_group_stat(char *grp_name, int timeout, + struct list_head __user *result_up) +{ + struct lstcon_group *grp; + int rc; + + rc = lstcon_group_find(grp_name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group %s\n", grp_name); + return rc; + } + + rc = lstcon_ndlist_stat(&grp->grp_ndl_list, timeout, result_up); + + lstcon_group_decref(grp); + + return rc; +} + +int +lstcon_nodes_stat(int count, struct lnet_process_id __user *ids_up, + int timeout, struct list_head __user *result_up) +{ + struct lstcon_ndlink *ndl; + struct lstcon_group *tmp; + struct lnet_process_id id; + int i; + int rc; + + rc = lstcon_group_alloc(NULL, &tmp); + if (rc != 0) { + CERROR("Out of memory\n"); + return -ENOMEM; + } + + for (i = 0 ; i < count; i++) { + if (copy_from_user(&id, &ids_up[i], sizeof(id))) { + rc = -EFAULT; + break; + } + + /* add to tmp group */ + rc = lstcon_group_ndlink_find(tmp, id, &ndl, 2); + if (rc != 0) { + CDEBUG((rc == -ENOMEM) ? D_ERROR : D_NET, + "Failed to find or create %s: %d\n", + libcfs_id2str(id), rc); + break; + } + } + + if (rc != 0) { + lstcon_group_decref(tmp); + return rc; + } + + rc = lstcon_ndlist_stat(&tmp->grp_ndl_list, timeout, result_up); + + lstcon_group_decref(tmp); + + return rc; +} + +static int +lstcon_debug_ndlist(struct list_head *ndlist, + struct list_head *translist, + int timeout, struct list_head __user *result_up) +{ + struct lstcon_rpc_trans *trans; + int rc; + + rc = lstcon_rpc_trans_ndlist(ndlist, translist, LST_TRANS_SESQRY, + NULL, lstcon_sesrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_VALIDATE_TIMEOUT(timeout)); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, + lstcon_sesrpc_readent); + lstcon_rpc_trans_destroy(trans); + + return rc; +} + +int +lstcon_session_debug(int timeout, struct list_head __user *result_up) +{ + return lstcon_debug_ndlist(&console_session.ses_ndl_list, + NULL, timeout, result_up); +} + +int +lstcon_batch_debug(int timeout, char *name, + int client, struct list_head __user *result_up) +{ + struct lstcon_batch *bat; + int rc; + + rc = lstcon_batch_find(name, &bat); + if (rc != 0) + return -ENOENT; + + rc = lstcon_debug_ndlist(client ? 
&bat->bat_cli_list : + &bat->bat_srv_list, + NULL, timeout, result_up); + + return rc; +} + +int +lstcon_group_debug(int timeout, char *name, + struct list_head __user *result_up) +{ + struct lstcon_group *grp; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) + return -ENOENT; + + rc = lstcon_debug_ndlist(&grp->grp_ndl_list, NULL, + timeout, result_up); + lstcon_group_decref(grp); + + return rc; +} + +int +lstcon_nodes_debug(int timeout, int count, + struct lnet_process_id __user *ids_up, + struct list_head __user *result_up) +{ + struct lnet_process_id id; + struct lstcon_ndlink *ndl; + struct lstcon_group *grp; + int i; + int rc; + + rc = lstcon_group_alloc(NULL, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Out of memory\n"); + return rc; + } + + for (i = 0; i < count; i++) { + if (copy_from_user(&id, &ids_up[i], sizeof(id))) { + rc = -EFAULT; + break; + } + + /* node is added to tmp group */ + rc = lstcon_group_ndlink_find(grp, id, &ndl, 1); + if (rc != 0) { + CERROR("Can't create node link\n"); + break; + } + } + + if (rc != 0) { + lstcon_group_decref(grp); + return rc; + } + + rc = lstcon_debug_ndlist(&grp->grp_ndl_list, NULL, + timeout, result_up); + + lstcon_group_decref(grp); + + return rc; +} + +int +lstcon_session_match(struct lst_sid sid) +{ + return (console_session.ses_id.ses_nid == sid.ses_nid && + console_session.ses_id.ses_stamp == sid.ses_stamp) ? 1: 0; +} + +static void +lstcon_new_session_id(struct lst_sid *sid) +{ + struct lnet_processid id; + + LASSERT(console_session.ses_state == LST_SESSION_NONE); + + LNetGetId(1, &id); + sid->ses_nid = lnet_nid_to_nid4(&id.nid); + sid->ses_stamp = div_u64(ktime_get_ns(), NSEC_PER_MSEC); +} + +int +lstcon_session_new(char *name, int key, unsigned feats, + int timeout, int force, struct lst_sid __user *sid_up) +{ + int rc = 0; + int i; + + if (console_session.ses_state != LST_SESSION_NONE) { + /* session exists */ + if (!force) { + CNETERR("Session %s already exists\n", + console_session.ses_name); + return -EEXIST; + } + + rc = lstcon_session_end(); + + /* lstcon_session_end() only return local error */ + if (rc != 0) + return rc; + } + + if ((feats & ~LST_FEATS_MASK) != 0) { + CNETERR("Unknown session features %x\n", + (feats & ~LST_FEATS_MASK)); + return -EINVAL; + } + + for (i = 0; i < LST_GLOBAL_HASHSIZE; i++) + LASSERT(list_empty(&console_session.ses_ndl_hash[i])); + + lstcon_new_session_id(&console_session.ses_id); + + console_session.ses_key = key; + console_session.ses_state = LST_SESSION_ACTIVE; + console_session.ses_force = !!force; + console_session.ses_features = feats; + console_session.ses_feats_updated = 0; + console_session.ses_timeout = (timeout <= 0) ? 
+ LST_CONSOLE_TIMEOUT : timeout; + + if (strlen(name) > sizeof(console_session.ses_name)-1) + return -E2BIG; + strlcpy(console_session.ses_name, name, + sizeof(console_session.ses_name)); + + rc = lstcon_batch_add(LST_DEFAULT_BATCH); + if (rc != 0) + return rc; + + rc = lstcon_rpc_pinger_start(); + if (rc != 0) { + struct lstcon_batch *bat = NULL; + + lstcon_batch_find(LST_DEFAULT_BATCH, &bat); + lstcon_batch_destroy(bat); + + return rc; + } + + if (copy_to_user(sid_up, &console_session.ses_id, + sizeof(struct lst_sid)) == 0) + return rc; + + lstcon_session_end(); + + return -EFAULT; +} + +int +lstcon_session_info(struct lst_sid __user *sid_up, int __user *key_up, + unsigned __user *featp, + struct lstcon_ndlist_ent __user *ndinfo_up, + char __user *name_up, int len) +{ + struct lstcon_ndlist_ent *entp; + struct lstcon_ndlink *ndl; + int rc = 0; + + if (console_session.ses_state != LST_SESSION_ACTIVE) + return -ESRCH; + + LIBCFS_ALLOC(entp, sizeof(*entp)); + if (entp == NULL) + return -ENOMEM; + + list_for_each_entry(ndl, &console_session.ses_ndl_list, ndl_link) + LST_NODE_STATE_COUNTER(ndl->ndl_node, entp); + + if (copy_to_user(sid_up, &console_session.ses_id, + sizeof(struct lst_sid)) || + copy_to_user(key_up, &console_session.ses_key, + sizeof(*key_up)) || + copy_to_user(featp, &console_session.ses_features, + sizeof(*featp)) || + copy_to_user(ndinfo_up, entp, sizeof(*entp)) || + copy_to_user(name_up, console_session.ses_name, len)) + rc = -EFAULT; + + LIBCFS_FREE(entp, sizeof(*entp)); + + return rc; +} + +int +lstcon_session_end(void) +{ + struct lstcon_rpc_trans *trans; + struct lstcon_group *grp; + struct lstcon_batch *bat; + int rc = 0; + + LASSERT (console_session.ses_state == LST_SESSION_ACTIVE); + + rc = lstcon_rpc_trans_ndlist(&console_session.ses_ndl_list, + NULL, LST_TRANS_SESEND, NULL, + lstcon_sesrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + console_session.ses_shutdown = 1; + + lstcon_rpc_pinger_stop(); + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + lstcon_rpc_trans_destroy(trans); + /* User can do nothing even rpc failed, so go on */ + + /* waiting for orphan rpcs to die */ + lstcon_rpc_cleanup_wait(); + + console_session.ses_id = LST_INVALID_SID; + console_session.ses_state = LST_SESSION_NONE; + console_session.ses_key = 0; + console_session.ses_force = 0; + console_session.ses_feats_updated = 0; + + /* destroy all batches */ + while (!list_empty(&console_session.ses_bat_list)) { + bat = list_entry(console_session.ses_bat_list.next, + struct lstcon_batch, bat_link); + + lstcon_batch_destroy(bat); + } + + /* destroy all groups */ + while (!list_empty(&console_session.ses_grp_list)) { + grp = list_entry(console_session.ses_grp_list.next, + struct lstcon_group, grp_link); + LASSERT(grp->grp_ref == 1); + + lstcon_group_decref(grp); + } + + /* all nodes should be released */ + LASSERT(list_empty(&console_session.ses_ndl_list)); + + console_session.ses_shutdown = 0; + console_session.ses_expired = 0; + + return rc; +} + +int +lstcon_session_feats_check(unsigned feats) +{ + int rc = 0; + + if ((feats & ~LST_FEATS_MASK) != 0) { + CERROR("Can't support these features: %x\n", + (feats & ~LST_FEATS_MASK)); + return -EPROTO; + } + + spin_lock(&console_session.ses_rpc_lock); + + if (!console_session.ses_feats_updated) { + console_session.ses_feats_updated = 1; + console_session.ses_features = feats; + } + + if (console_session.ses_features != feats) + rc = -EPROTO; + + 
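+	/* The first feature mask reported after the session is created
+	 * fixes ses_features for the whole session; any node reporting
+	 * a different mask afterwards is rejected with -EPROTO rather
+	 * than renegotiated per node. */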
spin_unlock(&console_session.ses_rpc_lock); + + if (rc != 0) { + CERROR("remote features %x do not match with " + "session features %x of console\n", + feats, console_session.ses_features); + } + + return rc; +} + +static int +lstcon_acceptor_handle(struct srpc_server_rpc *rpc) +{ + struct srpc_msg *rep = &rpc->srpc_replymsg; + struct srpc_msg *req = &rpc->srpc_reqstbuf->buf_msg; + struct srpc_join_reqst *jreq = &req->msg_body.join_reqst; + struct srpc_join_reply *jrep = &rep->msg_body.join_reply; + struct lstcon_group *grp = NULL; + struct lstcon_ndlink *ndl; + int rc = 0; + + sfw_unpack_message(req); + + mutex_lock(&console_session.ses_mutex); + + jrep->join_sid = console_session.ses_id; + + if (console_session.ses_id.ses_nid == LNET_NID_ANY) { + jrep->join_status = ESRCH; + goto out; + } + + if (lstcon_session_feats_check(req->msg_ses_feats) != 0) { + jrep->join_status = EPROTO; + goto out; + } + + if (jreq->join_sid.ses_nid != LNET_NID_ANY && + !lstcon_session_match(jreq->join_sid)) { + jrep->join_status = EBUSY; + goto out; + } + + if (lstcon_group_find(jreq->join_group, &grp) != 0) { + rc = lstcon_group_alloc(jreq->join_group, &grp); + if (rc != 0) { + CERROR("Out of memory\n"); + goto out; + } + + list_add_tail(&grp->grp_link, + &console_session.ses_grp_list); + lstcon_group_addref(grp); + } + + if (grp->grp_ref > 2) { + /* Group in using */ + jrep->join_status = EBUSY; + goto out; + } + + rc = lstcon_group_ndlink_find(grp, rpc->srpc_peer, &ndl, 0); + if (rc == 0) { + jrep->join_status = EEXIST; + goto out; + } + + rc = lstcon_group_ndlink_find(grp, rpc->srpc_peer, &ndl, 1); + if (rc != 0) { + CERROR("Out of memory\n"); + goto out; + } + + ndl->ndl_node->nd_state = LST_NODE_ACTIVE; + ndl->ndl_node->nd_timeout = console_session.ses_timeout; + + if (grp->grp_userland == 0) + grp->grp_userland = 1; + + strlcpy(jrep->join_session, console_session.ses_name, + sizeof(jrep->join_session)); + jrep->join_timeout = console_session.ses_timeout; + jrep->join_status = 0; + +out: + rep->msg_ses_feats = console_session.ses_features; + if (grp != NULL) + lstcon_group_decref(grp); + + mutex_unlock(&console_session.ses_mutex); + + return rc; +} + +static struct srpc_service lstcon_acceptor_service; + +static void lstcon_init_acceptor_service(void) +{ + /* initialize selftest console acceptor service table */ + lstcon_acceptor_service.sv_name = "join session"; + lstcon_acceptor_service.sv_handler = lstcon_acceptor_handle; + lstcon_acceptor_service.sv_id = SRPC_SERVICE_JOIN; + lstcon_acceptor_service.sv_wi_total = SFW_FRWK_WI_MAX; +} + +static struct notifier_block lstcon_ioctl_handler = { + .notifier_call = lstcon_ioctl_entry, +}; + +/* initialize console */ +int +lstcon_console_init(void) +{ + int i; + int rc; + + console_session.ses_id = LST_INVALID_SID; + console_session.ses_state = LST_SESSION_NONE; + console_session.ses_timeout = 0; + console_session.ses_force = 0; + console_session.ses_expired = 0; + console_session.ses_feats_updated = 0; + console_session.ses_features = LST_FEATS_MASK; + console_session.ses_laststamp = ktime_get_real_seconds(); + + mutex_init(&console_session.ses_mutex); + + INIT_LIST_HEAD(&console_session.ses_ndl_list); + INIT_LIST_HEAD(&console_session.ses_grp_list); + INIT_LIST_HEAD(&console_session.ses_bat_list); + INIT_LIST_HEAD(&console_session.ses_trans_list); + + CFS_ALLOC_PTR_ARRAY(console_session.ses_ndl_hash, + LST_GLOBAL_HASHSIZE); + if (console_session.ses_ndl_hash == NULL) + return -ENOMEM; + + for (i = 0; i < LST_GLOBAL_HASHSIZE; i++) + 
INIT_LIST_HEAD(&console_session.ses_ndl_hash[i]); + + + /* initialize acceptor service table */ + lstcon_init_acceptor_service(); + + rc = srpc_add_service(&lstcon_acceptor_service); + LASSERT(rc != -EBUSY); + if (rc != 0) { + CFS_FREE_PTR_ARRAY(console_session.ses_ndl_hash, + LST_GLOBAL_HASHSIZE); + return rc; + } + + rc = srpc_service_add_buffers(&lstcon_acceptor_service, + lstcon_acceptor_service.sv_wi_total); + if (rc != 0) { + rc = -ENOMEM; + goto out; + } + + rc = blocking_notifier_chain_register(&libcfs_ioctl_list, + &lstcon_ioctl_handler); + if (rc == 0) { + lstcon_rpc_module_init(); + return 0; + } + +out: + srpc_shutdown_service(&lstcon_acceptor_service); + srpc_remove_service(&lstcon_acceptor_service); + + CFS_FREE_PTR_ARRAY(console_session.ses_ndl_hash, LST_GLOBAL_HASHSIZE); + + srpc_wait_service_shutdown(&lstcon_acceptor_service); + + return rc; +} + +int +lstcon_console_fini(void) +{ + int i; + + blocking_notifier_chain_unregister(&libcfs_ioctl_list, + &lstcon_ioctl_handler); + + mutex_lock(&console_session.ses_mutex); + + srpc_shutdown_service(&lstcon_acceptor_service); + srpc_remove_service(&lstcon_acceptor_service); + + if (console_session.ses_state != LST_SESSION_NONE) + lstcon_session_end(); + + lstcon_rpc_module_fini(); + + mutex_unlock(&console_session.ses_mutex); + + LASSERT(list_empty(&console_session.ses_ndl_list)); + LASSERT(list_empty(&console_session.ses_grp_list)); + LASSERT(list_empty(&console_session.ses_bat_list)); + LASSERT(list_empty(&console_session.ses_trans_list)); + + for (i = 0; i < LST_NODE_HASHSIZE; i++) + LASSERT(list_empty(&console_session.ses_ndl_hash[i])); + + CFS_FREE_PTR_ARRAY(console_session.ses_ndl_hash, + LST_GLOBAL_HASHSIZE); + + srpc_wait_service_shutdown(&lstcon_acceptor_service); + + return 0; +} + diff --git a/drivers/staging/lustrefsx/lnet/selftest/console.h b/drivers/staging/lustrefsx/lnet/selftest/console.h new file mode 100644 index 0000000000000..13144cc8dd4cd --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/console.h @@ -0,0 +1,262 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/selftest/console.h + * + * kernel structure for LST console + * + * Author: Liang Zhen + */ + +#ifndef __LST_CONSOLE_H__ +#define __LST_CONSOLE_H__ + +#include + +#include +#include +#include "selftest.h" +#include "conrpc.h" + +/* node descriptor */ +struct lstcon_node { + struct lnet_process_id nd_id; /* id of the node */ + int nd_ref; /* reference count */ + int nd_state; /* state of the node */ + int nd_timeout; /* session timeout */ + ktime_t nd_stamp; /* last RPC reply timestamp */ + struct lstcon_rpc nd_ping; /* ping rpc */ +}; + +/* node link descriptor */ +struct lstcon_ndlink { + struct list_head ndl_link; /* chain on list */ + struct list_head ndl_hlink; /* chain on hash */ + struct lstcon_node *ndl_node; /* pointer to node */ +}; + +/* (alias of nodes) group descriptor */ +struct lstcon_group { + struct list_head grp_link; /* chain on global group list */ + int grp_ref; /* reference count */ + int grp_userland; /* has userland nodes */ + int grp_nnode; /* # of nodes */ + char grp_name[LST_NAME_SIZE]; /* group name */ + + struct list_head grp_trans_list; /* transaction list */ + struct list_head grp_ndl_list; /* nodes list */ + struct list_head grp_ndl_hash[0];/* hash table for nodes */ +}; + +#define LST_BATCH_IDLE 0xB0 /* idle batch */ +#define LST_BATCH_RUNNING 0xB1 /* running batch */ + +struct lstcon_tsb_hdr { + struct lst_bid tsb_id; /* batch ID */ + int tsb_index; /* test index */ +}; + +/* (tests ) batch descriptor */ +struct lstcon_batch { + /* test_batch header */ + struct lstcon_tsb_hdr bat_hdr; + /* chain on session's batches list */ + struct list_head bat_link; + /* # of test */ + int bat_ntest; + /* state of the batch */ + int bat_state; + /* parameter for run|stop, timeout for run, force for stop */ + int bat_arg; + /* name of batch */ + char bat_name[LST_NAME_SIZE]; + + /* list head of tests (lstcon_test_t) */ + struct list_head bat_test_list; + /* list head of transaction */ + struct list_head bat_trans_list; + /* list head of client nodes (struct lstcon_node) */ + struct list_head bat_cli_list; + /* hash table of client nodes */ + struct list_head *bat_cli_hash; + /* list head of server nodes */ + struct list_head bat_srv_list; + /* hash table of server nodes */ + struct list_head *bat_srv_hash; +}; + +/* a single test descriptor */ +struct lstcon_test { + /* test batch header */ + struct lstcon_tsb_hdr tes_hdr; + /* chain on batch's tests list */ + struct list_head tes_link; + /* pointer to batch */ + struct lstcon_batch *tes_batch; + + int tes_type; /* type of the test, i.e: bulk, ping */ + int tes_stop_onerr; /* stop on error */ + int tes_oneside; /* one-sided test */ + int tes_concur; /* concurrency */ + int tes_loop; /* loop count */ + int tes_dist; /* nodes distribution of target group */ + int tes_span; /* nodes span of target group */ + int tes_cliidx; /* client index, used for RPC creating */ + + struct list_head tes_trans_list; /* transaction list */ + struct lstcon_group *tes_src_grp; /* group run the test */ + struct lstcon_group *tes_dst_grp; /* target group */ + + int tes_paramlen; /* test parameter length */ + char tes_param[0]; /* test parameter */ +}; + +#define LST_GLOBAL_HASHSIZE 503 /* global nodes hash table size */ +#define LST_NODE_HASHSIZE 239 /* node hash table (for batch or group) */ + +#define LST_SESSION_NONE 0x0 /* no session */ +#define LST_SESSION_ACTIVE 0x1 /* working session */ + +#define LST_CONSOLE_TIMEOUT 300 /* default console timeout */ + +struct 
lstcon_session { + struct mutex ses_mutex; /* only 1 thread in session */ + struct lst_sid ses_id; /* global session id */ + int ses_key; /* local session key */ + int ses_state; /* state of session */ + int ses_timeout; /* timeout in seconds */ + time64_t ses_laststamp; /* last operation stamp (seconds) */ + /** tests features of the session */ + unsigned ses_features; + /** features are synced with remote test nodes */ + unsigned ses_feats_updated:1; + /** force creating */ + unsigned ses_force:1; + /** session is shutting down */ + unsigned ses_shutdown:1; + /** console is timedout */ + unsigned ses_expired:1; + __u64 ses_id_cookie; /* batch id cookie */ + char ses_name[LST_NAME_SIZE]; /* session name */ + struct lstcon_rpc_trans *ses_ping; /* session pinger */ + struct stt_timer ses_ping_timer; /* timer for pinger */ + struct lstcon_trans_stat ses_trans_stat;/* transaction stats */ + + struct list_head ses_trans_list; /* global list of transaction */ + struct list_head ses_grp_list; /* global list of groups */ + struct list_head ses_bat_list; /* global list of batches */ + struct list_head ses_ndl_list; /* global list of nodes */ + struct list_head *ses_ndl_hash; /* hash table of nodes */ + + spinlock_t ses_rpc_lock; /* serialize */ + atomic_t ses_rpc_counter;/* # of initialized RPCs */ + struct list_head ses_rpc_freelist;/* idle console rpc */ +}; /* session descriptor */ + +extern struct lstcon_session console_session; + +static inline struct lstcon_trans_stat * +lstcon_trans_stat(void) +{ + return &console_session.ses_trans_stat; +} + +static inline struct list_head * +lstcon_id2hash(struct lnet_process_id id, struct list_head *hash) +{ + unsigned int idx = LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE; + + return &hash[idx]; +} + +extern int lstcon_session_match(struct lst_sid sid); +extern int lstcon_session_new(char *name, int key, unsigned version, + int timeout, int flags, struct lst_sid __user *sid_up); +extern int lstcon_session_info(struct lst_sid __user *sid_up, int __user *key, + unsigned __user *verp, + struct lstcon_ndlist_ent __user *entp, + char __user *name_up, int len); +extern int lstcon_session_end(void); +extern int lstcon_session_debug(int timeout, + struct list_head __user *result_up); +extern int lstcon_session_feats_check(unsigned feats); +extern int lstcon_batch_debug(int timeout, char *name, + int client, struct list_head __user *result_up); +extern int lstcon_group_debug(int timeout, char *name, + struct list_head __user *result_up); +extern int lstcon_nodes_debug(int timeout, int nnd, + struct lnet_process_id __user *nds_up, + struct list_head __user *result_up); +extern int lstcon_group_add(char *name); +extern int lstcon_group_del(char *name); +extern int lstcon_group_clean(char *name, int args); +extern int lstcon_group_refresh(char *name, struct list_head __user *result_up); +extern int lstcon_nodes_add(char *name, int nnd, + struct lnet_process_id __user *nds_up, + unsigned *featp, + struct list_head __user *result_up); +extern int lstcon_nodes_remove(char *name, int nnd, + struct lnet_process_id __user *nds_up, + struct list_head __user *result_up); +extern int lstcon_group_info(char *name, struct lstcon_ndlist_ent __user *gent_up, + int *index_p, int *ndent_p, + struct lstcon_node_ent __user *ndents_up); +extern int lstcon_group_list(int idx, int len, char __user *name_up); +extern int lstcon_batch_add(char *name); +extern int lstcon_batch_run(char *name, int timeout, + struct list_head __user *result_up); +extern int lstcon_batch_stop(char *name, int 
force, + struct list_head __user *result_up); +extern int lstcon_test_batch_query(char *name, int testidx, + int client, int timeout, + struct list_head __user *result_up); +extern int lstcon_batch_del(char *name); +extern int lstcon_batch_list(int idx, int namelen, char __user *name_up); +extern int lstcon_batch_info(char *name, + struct lstcon_test_batch_ent __user *ent_up, + int server, int testidx, int *index_p, + int *ndent_p, + struct lstcon_node_ent __user *dents_up); +extern int lstcon_group_stat(char *grp_name, int timeout, + struct list_head __user *result_up); +extern int lstcon_nodes_stat(int count, struct lnet_process_id __user *ids_up, + int timeout, struct list_head __user *result_up); +extern int lstcon_test_add(char *batch_name, int type, int loop, + int concur, int dist, int span, + char *src_name, char *dst_name, + void *param, int paramlen, int *retp, + struct list_head __user *result_up); + +int lstcon_ioctl_entry(struct notifier_block *nb, + unsigned long cmd, void *vdata); +int lstcon_console_init(void); +int lstcon_console_fini(void); + +#endif diff --git a/drivers/staging/lustrefsx/lnet/selftest/framework.c b/drivers/staging/lustrefsx/lnet/selftest/framework.c new file mode 100644 index 0000000000000..7e048ad4595e4 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/framework.c @@ -0,0 +1,1766 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/selftest/framework.c + * + * Author: Isaac Huang + * Author: Liang Zhen + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "selftest.h" + +struct lst_sid LST_INVALID_SID = { .ses_nid = LNET_NID_ANY, .ses_stamp = -1}; + +static int session_timeout = 100; +module_param(session_timeout, int, 0444); +MODULE_PARM_DESC(session_timeout, "test session timeout in seconds (100 by default, 0 == never)"); + +static int rpc_timeout = 64; +module_param(rpc_timeout, int, 0644); +MODULE_PARM_DESC(rpc_timeout, "rpc timeout in seconds (64 by default, 0 == never)"); + +#define sfw_unpack_id(id) \ +do { \ + __swab64s(&(id).nid); \ + __swab32s(&(id).pid); \ +} while (0) + +#define sfw_unpack_sid(sid) \ +do { \ + __swab64s(&(sid).ses_nid); \ + __swab64s(&(sid).ses_stamp); \ +} while (0) + +#define sfw_unpack_fw_counters(fc) \ +do { \ + __swab32s(&(fc).running_ms); \ + __swab32s(&(fc).active_batches); \ + __swab32s(&(fc).zombie_sessions); \ + __swab32s(&(fc).brw_errors); \ + __swab32s(&(fc).ping_errors); \ +} while (0) + +#define sfw_unpack_rpc_counters(rc) \ +do { \ + __swab32s(&(rc).errors); \ + __swab32s(&(rc).rpcs_sent); \ + __swab32s(&(rc).rpcs_rcvd); \ + __swab32s(&(rc).rpcs_dropped); \ + __swab32s(&(rc).rpcs_expired); \ + __swab64s(&(rc).bulk_get); \ + __swab64s(&(rc).bulk_put); \ +} while (0) + +#define sfw_unpack_lnet_counters(lc) \ +do { \ + __swab32s(&(lc).lcc_errors); \ + __swab32s(&(lc).lcc_msgs_max); \ + __swab32s(&(lc).lcc_msgs_alloc); \ + __swab32s(&(lc).lcc_send_count); \ + __swab32s(&(lc).lcc_recv_count); \ + __swab32s(&(lc).lcc_drop_count); \ + __swab32s(&(lc).lcc_route_count); \ + __swab64s(&(lc).lcc_send_length); \ + __swab64s(&(lc).lcc_recv_length); \ + __swab64s(&(lc).lcc_drop_length); \ + __swab64s(&(lc).lcc_route_length); \ +} while (0) + +#define sfw_test_active(t) (atomic_read(&(t)->tsi_nactive) != 0) +#define sfw_batch_active(b) (atomic_read(&(b)->bat_nactive) != 0) + +static struct smoketest_framework { + /* RPCs to be recycled */ + struct list_head fw_zombie_rpcs; + /* stopping sessions */ + struct list_head fw_zombie_sessions; + /* registered test cases */ + struct list_head fw_tests; + /* # zombie sessions */ + atomic_t fw_nzombies; + /* serialise */ + spinlock_t fw_lock; + /* _the_ session */ + struct sfw_session *fw_session; + /* shutdown in progress */ + int fw_shuttingdown; + /* running RPC */ + struct srpc_server_rpc *fw_active_srpc; +} sfw_data; + +/* forward ref's */ +static int sfw_stop_batch(struct sfw_batch *tsb, int force); +static void sfw_destroy_session(struct sfw_session *sn); + +static inline struct sfw_test_case * +sfw_find_test_case(int id) +{ + struct sfw_test_case *tsc; + + LASSERT(id <= SRPC_SERVICE_MAX_ID); + LASSERT(id > SRPC_FRAMEWORK_SERVICE_MAX_ID); + + list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) { + if (tsc->tsc_srv_service->sv_id == id) + return tsc; + } + + return NULL; +} + +static int +sfw_register_test(struct srpc_service *service, + struct sfw_test_client_ops *cliops) +{ + struct sfw_test_case *tsc; + + if (sfw_find_test_case(service->sv_id) != NULL) { + CERROR ("Failed to register test %s (%d)\n", + service->sv_name, service->sv_id); + return -EEXIST; + } + + LIBCFS_ALLOC(tsc, sizeof(*tsc)); + if (tsc == NULL) + return -ENOMEM; + + tsc->tsc_cli_ops = cliops; + tsc->tsc_srv_service = service; + + list_add_tail(&tsc->tsc_list, &sfw_data.fw_tests); + return 0; +} + +static void +sfw_add_session_timer (void) +{ + struct sfw_session *sn = sfw_data.fw_session; + 
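/* per-session expiry timer; fires sfw_session_expired() (set up in sfw_init_session()) */ 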
struct stt_timer *timer = &sn->sn_timer; + + LASSERT (!sfw_data.fw_shuttingdown); + + if (sn == NULL || sn->sn_timeout == 0) + return; + + LASSERT (!sn->sn_timer_active); + + sn->sn_timer_active = 1; + timer->stt_expires = ktime_get_real_seconds()+ sn->sn_timeout; + stt_add_timer(timer); +} + +static int +sfw_del_session_timer (void) +{ + struct sfw_session *sn = sfw_data.fw_session; + + if (sn == NULL || !sn->sn_timer_active) + return 0; + + LASSERT (sn->sn_timeout != 0); + + if (stt_del_timer(&sn->sn_timer)) { /* timer defused */ + sn->sn_timer_active = 0; + return 0; + } + + return EBUSY; /* racing with sfw_session_expired() */ +} + +/* called with sfw_data.fw_lock held */ +static void +sfw_deactivate_session (void) +__must_hold(&sfw_data.fw_lock) +{ + struct sfw_session *sn = sfw_data.fw_session; + int nactive = 0; + struct sfw_batch *tsb; + struct sfw_test_case *tsc; + + if (sn == NULL) return; + + LASSERT(!sn->sn_timer_active); + + sfw_data.fw_session = NULL; + atomic_inc(&sfw_data.fw_nzombies); + list_add(&sn->sn_list, &sfw_data.fw_zombie_sessions); + + spin_unlock(&sfw_data.fw_lock); + + list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) { + srpc_abort_service(tsc->tsc_srv_service); + } + + spin_lock(&sfw_data.fw_lock); + + list_for_each_entry(tsb, &sn->sn_batches, bat_list) { + if (sfw_batch_active(tsb)) { + nactive++; + sfw_stop_batch(tsb, 1); + } + } + + if (nactive != 0) + return; /* wait for active batches to stop */ + + list_del_init(&sn->sn_list); + spin_unlock(&sfw_data.fw_lock); + + sfw_destroy_session(sn); + + spin_lock(&sfw_data.fw_lock); +} + + +static void +sfw_session_expired (void *data) +{ + struct sfw_session *sn = data; + + spin_lock(&sfw_data.fw_lock); + + LASSERT (sn->sn_timer_active); + LASSERT (sn == sfw_data.fw_session); + + CWARN ("Session expired! 
sid: %s-%llu, name: %s\n", + libcfs_nid2str(sn->sn_id.ses_nid), + sn->sn_id.ses_stamp, &sn->sn_name[0]); + + sn->sn_timer_active = 0; + sfw_deactivate_session(); + + spin_unlock(&sfw_data.fw_lock); +} + +static inline void +sfw_init_session(struct sfw_session *sn, struct lst_sid sid, + unsigned features, const char *name) +{ + struct stt_timer *timer = &sn->sn_timer; + + memset(sn, 0, sizeof(struct sfw_session)); + INIT_LIST_HEAD(&sn->sn_list); + INIT_LIST_HEAD(&sn->sn_batches); + atomic_set(&sn->sn_refcount, 1); /* +1 for caller */ + atomic_set(&sn->sn_brw_errors, 0); + atomic_set(&sn->sn_ping_errors, 0); + strlcpy(&sn->sn_name[0], name, sizeof(sn->sn_name)); + + sn->sn_timer_active = 0; + sn->sn_id = sid; + sn->sn_features = features; + sn->sn_timeout = session_timeout; + sn->sn_started = ktime_get(); + + timer->stt_data = sn; + timer->stt_func = sfw_session_expired; + INIT_LIST_HEAD(&timer->stt_list); +} + +/* completion handler for incoming framework RPCs */ +static void +sfw_server_rpc_done(struct srpc_server_rpc *rpc) +{ + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + int status = rpc->srpc_status; + + CDEBUG (D_NET, + "Incoming framework RPC done: " + "service %s, peer %s, status %s:%d\n", + sv->sv_name, libcfs_id2str(rpc->srpc_peer), + swi_state2str(rpc->srpc_wi.swi_state), + status); + + if (rpc->srpc_bulk != NULL) + sfw_free_pages(rpc); +} + +static void +sfw_client_rpc_fini(struct srpc_client_rpc *rpc) +{ + LASSERT(rpc->crpc_bulk.bk_niov == 0); + LASSERT(list_empty(&rpc->crpc_list)); + LASSERT(atomic_read(&rpc->crpc_refcount) == 0); + + CDEBUG(D_NET, "Outgoing framework RPC done: " + "service %d, peer %s, status %s:%d:%d\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + swi_state2str(rpc->crpc_wi.swi_state), + rpc->crpc_aborted, rpc->crpc_status); + + spin_lock(&sfw_data.fw_lock); + + /* my callers must finish all RPCs before shutting me down */ + LASSERT(!sfw_data.fw_shuttingdown); + list_add(&rpc->crpc_list, &sfw_data.fw_zombie_rpcs); + + spin_unlock(&sfw_data.fw_lock); +} + +static struct sfw_batch * +sfw_find_batch(struct lst_bid bid) +{ + struct sfw_session *sn = sfw_data.fw_session; + struct sfw_batch *bat; + + LASSERT(sn != NULL); + + list_for_each_entry(bat, &sn->sn_batches, bat_list) { + if (bat->bat_id.bat_id == bid.bat_id) + return bat; + } + + return NULL; +} + +static struct sfw_batch * +sfw_bid2batch(struct lst_bid bid) +{ + struct sfw_session *sn = sfw_data.fw_session; + struct sfw_batch *bat; + + LASSERT (sn != NULL); + + bat = sfw_find_batch(bid); + if (bat != NULL) + return bat; + + LIBCFS_ALLOC(bat, sizeof(*bat)); + if (bat == NULL) + return NULL; + + bat->bat_error = 0; + bat->bat_session = sn; + bat->bat_id = bid; + atomic_set(&bat->bat_nactive, 0); + INIT_LIST_HEAD(&bat->bat_tests); + + list_add_tail(&bat->bat_list, &sn->sn_batches); + return bat; +} + +static int +sfw_get_stats(struct srpc_stat_reqst *request, struct srpc_stat_reply *reply) +{ + struct sfw_session *sn = sfw_data.fw_session; + struct sfw_counters *cnt = &reply->str_fw; + struct sfw_batch *bat; + + reply->str_sid = (sn == NULL) ? 
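/* no session yet */ 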
LST_INVALID_SID : sn->sn_id; + + if (request->str_sid.ses_nid == LNET_NID_ANY) { + reply->str_status = EINVAL; + return 0; + } + + if (sn == NULL || !sfw_sid_equal(request->str_sid, sn->sn_id)) { + reply->str_status = ESRCH; + return 0; + } + + lnet_counters_get_common(&reply->str_lnet); + srpc_get_counters(&reply->str_rpc); + + /* send over the msecs since the session was started + - with 32 bits to send, this is ~49 days */ + cnt->running_ms = ktime_ms_delta(ktime_get(), sn->sn_started); + cnt->brw_errors = atomic_read(&sn->sn_brw_errors); + cnt->ping_errors = atomic_read(&sn->sn_ping_errors); + cnt->zombie_sessions = atomic_read(&sfw_data.fw_nzombies); + + cnt->active_batches = 0; + list_for_each_entry(bat, &sn->sn_batches, bat_list) { + if (atomic_read(&bat->bat_nactive) > 0) + cnt->active_batches++; + } + + reply->str_status = 0; + return 0; +} + +int +sfw_make_session(struct srpc_mksn_reqst *request, struct srpc_mksn_reply *reply) +{ + struct sfw_session *sn = sfw_data.fw_session; + struct srpc_msg *msg = container_of(request, struct srpc_msg, + msg_body.mksn_reqst); + int cplen = 0; + + if (request->mksn_sid.ses_nid == LNET_NID_ANY) { + reply->mksn_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; + reply->mksn_status = EINVAL; + return 0; + } + + if (sn != NULL) { + reply->mksn_status = 0; + reply->mksn_sid = sn->sn_id; + reply->mksn_timeout = sn->sn_timeout; + + if (sfw_sid_equal(request->mksn_sid, sn->sn_id)) { + atomic_inc(&sn->sn_refcount); + return 0; + } + + if (!request->mksn_force) { + reply->mksn_status = EBUSY; + cplen = strlcpy(&reply->mksn_name[0], &sn->sn_name[0], + sizeof(reply->mksn_name)); + if (cplen >= sizeof(reply->mksn_name)) + return -E2BIG; + return 0; + } + } + + /* reject the request if it requires unknown features + * NB: old version will always accept all features because it's not + * aware of struct srpc_msg::msg_ses_feats, it's a defect but it's also + * harmless because it will return zero feature to console, and it's + * console's responsibility to make sure all nodes in a session have + * same feature mask. */ + if ((msg->msg_ses_feats & ~LST_FEATS_MASK) != 0) { + reply->mksn_status = EPROTO; + return 0; + } + + /* brand new or create by force */ + LIBCFS_ALLOC(sn, sizeof(*sn)); + if (sn == NULL) { + CERROR("dropping RPC mksn under memory pressure\n"); + return -ENOMEM; + } + + sfw_init_session(sn, request->mksn_sid, + msg->msg_ses_feats, &request->mksn_name[0]); + + spin_lock(&sfw_data.fw_lock); + + sfw_deactivate_session(); + LASSERT(sfw_data.fw_session == NULL); + sfw_data.fw_session = sn; + + spin_unlock(&sfw_data.fw_lock); + + reply->mksn_status = 0; + reply->mksn_sid = sn->sn_id; + reply->mksn_timeout = sn->sn_timeout; + return 0; +} + +static int +sfw_remove_session(struct srpc_rmsn_reqst *request, + struct srpc_rmsn_reply *reply) +{ + struct sfw_session *sn = sfw_data.fw_session; + + reply->rmsn_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; + + if (request->rmsn_sid.ses_nid == LNET_NID_ANY) { + reply->rmsn_status = EINVAL; + return 0; + } + + if (sn == NULL || !sfw_sid_equal(request->rmsn_sid, sn->sn_id)) { + reply->rmsn_status = (sn == NULL) ? 
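/* ESRCH: no session; EBUSY: sid mismatch */ 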
ESRCH : EBUSY; + return 0; + } + + if (!atomic_dec_and_test(&sn->sn_refcount)) { + reply->rmsn_status = 0; + return 0; + } + + spin_lock(&sfw_data.fw_lock); + sfw_deactivate_session(); + spin_unlock(&sfw_data.fw_lock); + + reply->rmsn_status = 0; + reply->rmsn_sid = LST_INVALID_SID; + LASSERT(sfw_data.fw_session == NULL); + return 0; +} + +static int +sfw_debug_session(struct srpc_debug_reqst *request, + struct srpc_debug_reply *reply) +{ + struct sfw_session *sn = sfw_data.fw_session; + + if (sn == NULL) { + reply->dbg_status = ESRCH; + reply->dbg_sid = LST_INVALID_SID; + return 0; + } + + reply->dbg_status = 0; + reply->dbg_sid = sn->sn_id; + reply->dbg_timeout = sn->sn_timeout; + if (strlcpy(reply->dbg_name, &sn->sn_name[0], sizeof(reply->dbg_name)) + >= sizeof(reply->dbg_name)) + return -E2BIG; + + return 0; +} + +static void +sfw_test_rpc_fini(struct srpc_client_rpc *rpc) +{ + struct sfw_test_unit *tsu = rpc->crpc_priv; + struct sfw_test_instance *tsi = tsu->tsu_instance; + + /* Called with hold of tsi->tsi_lock */ + LASSERT(list_empty(&rpc->crpc_list)); + list_add(&rpc->crpc_list, &tsi->tsi_free_rpcs); +} + +static inline int +sfw_test_buffers(struct sfw_test_instance *tsi) +{ + struct sfw_test_case *tsc; + struct srpc_service *svc; + int nbuf; + + LASSERT(tsi != NULL); + tsc = sfw_find_test_case(tsi->tsi_service); + LASSERT(tsc != NULL); + svc = tsc->tsc_srv_service; + LASSERT(svc != NULL); + + nbuf = min(svc->sv_wi_total, tsi->tsi_loop) / svc->sv_ncpts; + return max(SFW_TEST_WI_MIN, nbuf + SFW_TEST_WI_EXTRA); +} + +static int +sfw_load_test(struct sfw_test_instance *tsi) +{ + struct sfw_test_case *tsc; + struct srpc_service *svc; + int nbuf; + int rc; + + LASSERT(tsi != NULL); + tsc = sfw_find_test_case(tsi->tsi_service); + nbuf = sfw_test_buffers(tsi); + LASSERT(tsc != NULL); + svc = tsc->tsc_srv_service; + + if (tsi->tsi_is_client) { + tsi->tsi_ops = tsc->tsc_cli_ops; + return 0; + } + + rc = srpc_service_add_buffers(svc, nbuf); + if (rc != 0) { + CWARN("Failed to reserve enough buffers: " + "service %s, %d needed: %d\n", svc->sv_name, nbuf, rc); + /* NB: this error handler is not strictly correct, because + * it may release more buffers than already allocated, + * but it doesn't matter because request portal should + * be lazy portal and will grow buffers if necessary. */ + srpc_service_remove_buffers(svc, nbuf); + return -ENOMEM; + } + + CDEBUG(D_NET, "Reserved %d buffers for test %s\n", + nbuf * (srpc_serv_is_framework(svc) ? + 1 : cfs_cpt_number(cfs_cpt_tab)), svc->sv_name); + return 0; +} + +static void +sfw_unload_test(struct sfw_test_instance *tsi) +{ + struct sfw_test_case *tsc; + + LASSERT(tsi != NULL); + tsc = sfw_find_test_case(tsi->tsi_service); + LASSERT(tsc != NULL); + + if (tsi->tsi_is_client) + return; + + /* shrink buffers, because request portal is lazy portal + * which can grow buffers at runtime so we may leave + * some buffers behind, but never mind... 
*/ + srpc_service_remove_buffers(tsc->tsc_srv_service, + sfw_test_buffers(tsi)); +} + +static void +sfw_destroy_test_instance(struct sfw_test_instance *tsi) +{ + struct srpc_client_rpc *rpc; + struct sfw_test_unit *tsu; + + if (!tsi->tsi_is_client) goto clean; + + tsi->tsi_ops->tso_fini(tsi); + + LASSERT(!tsi->tsi_stopping); + LASSERT(list_empty(&tsi->tsi_active_rpcs)); + LASSERT(!sfw_test_active(tsi)); + + while (!list_empty(&tsi->tsi_units)) { + tsu = list_entry(tsi->tsi_units.next, + struct sfw_test_unit, tsu_list); + list_del(&tsu->tsu_list); + LIBCFS_FREE(tsu, sizeof(*tsu)); + } + + while (!list_empty(&tsi->tsi_free_rpcs)) { + rpc = list_entry(tsi->tsi_free_rpcs.next, + struct srpc_client_rpc, crpc_list); + list_del(&rpc->crpc_list); + LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc)); + } + +clean: + sfw_unload_test(tsi); + LIBCFS_FREE(tsi, sizeof(*tsi)); +} + +static void +sfw_destroy_batch(struct sfw_batch *tsb) +{ + struct sfw_test_instance *tsi; + + LASSERT(!sfw_batch_active(tsb)); + LASSERT(list_empty(&tsb->bat_list)); + + while (!list_empty(&tsb->bat_tests)) { + tsi = list_entry(tsb->bat_tests.next, + struct sfw_test_instance, tsi_list); + list_del_init(&tsi->tsi_list); + sfw_destroy_test_instance(tsi); + } + + LIBCFS_FREE(tsb, sizeof(*tsb)); +} + +static void +sfw_destroy_session(struct sfw_session *sn) +{ + struct sfw_batch *batch; + + LASSERT(list_empty(&sn->sn_list)); + LASSERT(sn != sfw_data.fw_session); + + while (!list_empty(&sn->sn_batches)) { + batch = list_entry(sn->sn_batches.next, + struct sfw_batch, bat_list); + list_del_init(&batch->bat_list); + sfw_destroy_batch(batch); + } + + LIBCFS_FREE(sn, sizeof(*sn)); + atomic_dec(&sfw_data.fw_nzombies); +} + +static void +sfw_unpack_addtest_req(struct srpc_msg *msg) +{ + struct srpc_test_reqst *req = &msg->msg_body.tes_reqst; + + LASSERT (msg->msg_type == SRPC_MSG_TEST_REQST); + LASSERT (req->tsr_is_client); + + if (msg->msg_magic == SRPC_MSG_MAGIC) + return; /* no flipping needed */ + + LASSERT (msg->msg_magic == __swab32(SRPC_MSG_MAGIC)); + + if (req->tsr_service == SRPC_SERVICE_BRW) { + if ((msg->msg_ses_feats & LST_FEAT_BULK_LEN) == 0) { + struct test_bulk_req *bulk = &req->tsr_u.bulk_v0; + + __swab32s(&bulk->blk_opc); + __swab32s(&bulk->blk_npg); + __swab32s(&bulk->blk_flags); + + } else { + struct test_bulk_req_v1 *bulk = &req->tsr_u.bulk_v1; + + __swab16s(&bulk->blk_opc); + __swab16s(&bulk->blk_flags); + __swab32s(&bulk->blk_offset); + __swab32s(&bulk->blk_len); + } + + return; + } + + if (req->tsr_service == SRPC_SERVICE_PING) { + struct test_ping_req *ping = &req->tsr_u.ping; + + __swab32s(&ping->png_size); + __swab32s(&ping->png_flags); + return; + } + + LBUG(); +} + +static int +sfw_add_test_instance(struct sfw_batch *tsb, struct srpc_server_rpc *rpc) +{ + struct srpc_msg *msg = &rpc->srpc_reqstbuf->buf_msg; + struct srpc_test_reqst *req = &msg->msg_body.tes_reqst; + struct srpc_bulk *bk = rpc->srpc_bulk; + int ndest = req->tsr_ndest; + struct sfw_test_unit *tsu; + struct sfw_test_instance *tsi; + int i; + int rc; + + LIBCFS_ALLOC(tsi, sizeof(*tsi)); + if (tsi == NULL) { + CERROR ("Can't allocate test instance for batch: %llu\n", + tsb->bat_id.bat_id); + return -ENOMEM; + } + + spin_lock_init(&tsi->tsi_lock); + atomic_set(&tsi->tsi_nactive, 0); + INIT_LIST_HEAD(&tsi->tsi_units); + INIT_LIST_HEAD(&tsi->tsi_free_rpcs); + INIT_LIST_HEAD(&tsi->tsi_active_rpcs); + + tsi->tsi_stopping = 0; + tsi->tsi_batch = tsb; + tsi->tsi_loop = req->tsr_loop; + tsi->tsi_concur = req->tsr_concur; + tsi->tsi_service = req->tsr_service; + 
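/* !! normalizes the wire flag to 0/1 */ 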
tsi->tsi_is_client = !!(req->tsr_is_client); + tsi->tsi_stoptsu_onerr = !!(req->tsr_stop_onerr); + + rc = sfw_load_test(tsi); + if (rc != 0) { + LIBCFS_FREE(tsi, sizeof(*tsi)); + return rc; + } + + LASSERT (!sfw_batch_active(tsb)); + + if (!tsi->tsi_is_client) { + /* it's test server, just add it to tsb */ + list_add_tail(&tsi->tsi_list, &tsb->bat_tests); + return 0; + } + + LASSERT (bk != NULL); + LASSERT (bk->bk_niov * SFW_ID_PER_PAGE >= (unsigned int)ndest); + LASSERT((unsigned int)bk->bk_len >= + sizeof(struct lnet_process_id_packed) * ndest); + + sfw_unpack_addtest_req(msg); + memcpy(&tsi->tsi_u, &req->tsr_u, sizeof(tsi->tsi_u)); + + for (i = 0; i < ndest; i++) { + struct lnet_process_id_packed *dests; + struct lnet_process_id_packed id; + int j; + + dests = page_address(bk->bk_iovs[i / SFW_ID_PER_PAGE].bv_page); + LASSERT (dests != NULL); /* my pages are within KVM always */ + id = dests[i % SFW_ID_PER_PAGE]; + if (msg->msg_magic != SRPC_MSG_MAGIC) + sfw_unpack_id(id); + + for (j = 0; j < tsi->tsi_concur; j++) { + LIBCFS_ALLOC(tsu, sizeof(*tsu)); + if (tsu == NULL) { + rc = -ENOMEM; + CERROR ("Can't allocate tsu for %d\n", + tsi->tsi_service); + goto error; + } + + tsu->tsu_dest.nid = id.nid; + tsu->tsu_dest.pid = id.pid; + tsu->tsu_instance = tsi; + tsu->tsu_private = NULL; + list_add_tail(&tsu->tsu_list, &tsi->tsi_units); + } + } + + rc = tsi->tsi_ops->tso_init(tsi); + if (rc == 0) { + list_add_tail(&tsi->tsi_list, &tsb->bat_tests); + return 0; + } + +error: + LASSERT(rc != 0); + sfw_destroy_test_instance(tsi); + return rc; +} + +static void +sfw_test_unit_done(struct sfw_test_unit *tsu) +{ + struct sfw_test_instance *tsi = tsu->tsu_instance; + struct sfw_batch *tsb = tsi->tsi_batch; + struct sfw_session *sn = tsb->bat_session; + + LASSERT (sfw_test_active(tsi)); + + if (!atomic_dec_and_test(&tsi->tsi_nactive)) + return; + + /* the test instance is done */ + spin_lock(&tsi->tsi_lock); + + tsi->tsi_stopping = 0; + + spin_unlock(&tsi->tsi_lock); + + spin_lock(&sfw_data.fw_lock); + + if (!atomic_dec_and_test(&tsb->bat_nactive) ||/* tsb still active */ + sn == sfw_data.fw_session) { /* sn also active */ + spin_unlock(&sfw_data.fw_lock); + return; + } + + LASSERT(!list_empty(&sn->sn_list)); /* I'm a zombie! 
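(queued on fw_zombie_sessions) 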
*/ + + list_for_each_entry(tsb, &sn->sn_batches, bat_list) { + if (sfw_batch_active(tsb)) { + spin_unlock(&sfw_data.fw_lock); + return; + } + } + + list_del_init(&sn->sn_list); + spin_unlock(&sfw_data.fw_lock); + + sfw_destroy_session(sn); +} + +static void +sfw_test_rpc_done(struct srpc_client_rpc *rpc) +{ + struct sfw_test_unit *tsu = rpc->crpc_priv; + struct sfw_test_instance *tsi = tsu->tsu_instance; + int done = 0; + + tsi->tsi_ops->tso_done_rpc(tsu, rpc); + + spin_lock(&tsi->tsi_lock); + + LASSERT(sfw_test_active(tsi)); + LASSERT(!list_empty(&rpc->crpc_list)); + + list_del_init(&rpc->crpc_list); + + /* batch is stopping or loop is done or get error */ + if (tsi->tsi_stopping || + tsu->tsu_loop == 0 || + (rpc->crpc_status != 0 && tsi->tsi_stoptsu_onerr)) + done = 1; + + /* dec ref for poster */ + srpc_client_rpc_decref(rpc); + + spin_unlock(&tsi->tsi_lock); + + if (!done) { + swi_schedule_workitem(&tsu->tsu_worker); + return; + } + + sfw_test_unit_done(tsu); +} + +int +sfw_create_test_rpc(struct sfw_test_unit *tsu, struct lnet_process_id peer, + unsigned features, int nblk, int blklen, + struct srpc_client_rpc **rpcpp) +{ + struct srpc_client_rpc *rpc = NULL; + struct sfw_test_instance *tsi = tsu->tsu_instance; + + spin_lock(&tsi->tsi_lock); + + LASSERT (sfw_test_active(tsi)); + + if (!list_empty(&tsi->tsi_free_rpcs)) { + /* pick request from buffer */ + rpc = list_entry(tsi->tsi_free_rpcs.next, + struct srpc_client_rpc, crpc_list); + LASSERT(nblk == rpc->crpc_bulk.bk_niov); + list_del_init(&rpc->crpc_list); + } + + spin_unlock(&tsi->tsi_lock); + + if (rpc == NULL) { + rpc = srpc_create_client_rpc(peer, tsi->tsi_service, nblk, + blklen, sfw_test_rpc_done, + sfw_test_rpc_fini, tsu); + } else { + srpc_init_client_rpc(rpc, peer, tsi->tsi_service, nblk, + blklen, sfw_test_rpc_done, + sfw_test_rpc_fini, tsu); + } + + if (rpc == NULL) { + CERROR("Can't create rpc for test %d\n", tsi->tsi_service); + return -ENOMEM; + } + + rpc->crpc_reqstmsg.msg_ses_feats = features; + *rpcpp = rpc; + + return 0; +} + +static int +sfw_run_test(struct swi_workitem *wi) +{ + struct sfw_test_unit *tsu = container_of(wi, struct sfw_test_unit, tsu_worker); + struct sfw_test_instance *tsi = tsu->tsu_instance; + struct srpc_client_rpc *rpc = NULL; + + LASSERT (wi == &tsu->tsu_worker); + + if (tsi->tsi_ops->tso_prep_rpc(tsu, tsu->tsu_dest, &rpc) != 0) { + LASSERT (rpc == NULL); + goto test_done; + } + + LASSERT (rpc != NULL); + + spin_lock(&tsi->tsi_lock); + + if (tsi->tsi_stopping) { + list_add(&rpc->crpc_list, &tsi->tsi_free_rpcs); + spin_unlock(&tsi->tsi_lock); + goto test_done; + } + + if (tsu->tsu_loop > 0) + tsu->tsu_loop--; + + list_add_tail(&rpc->crpc_list, &tsi->tsi_active_rpcs); + spin_unlock(&tsi->tsi_lock); + + spin_lock(&rpc->crpc_lock); + rpc->crpc_timeout = rpc_timeout; + srpc_post_rpc(rpc); + spin_unlock(&rpc->crpc_lock); + return 0; + +test_done: + /* + * No one can schedule me now since: + * - previous RPC, if any, has done and + * - no new RPC is initiated. + * - my batch is still active; no one can run it again now. 
+ * Cancel pending schedules and prevent future schedule attempts: + */ + swi_exit_workitem(wi); + sfw_test_unit_done(tsu); + return 1; +} + +static int +sfw_run_batch(struct sfw_batch *tsb) +{ + struct swi_workitem *wi; + struct sfw_test_unit *tsu; + struct sfw_test_instance *tsi; + + if (sfw_batch_active(tsb)) { + CDEBUG(D_NET, "Batch already active: %llu (%d)\n", + tsb->bat_id.bat_id, atomic_read(&tsb->bat_nactive)); + return 0; + } + + list_for_each_entry(tsi, &tsb->bat_tests, tsi_list) { + if (!tsi->tsi_is_client) /* skip server instances */ + continue; + + LASSERT(!tsi->tsi_stopping); + LASSERT(!sfw_test_active(tsi)); + + atomic_inc(&tsb->bat_nactive); + + list_for_each_entry(tsu, &tsi->tsi_units, tsu_list) { + atomic_inc(&tsi->tsi_nactive); + tsu->tsu_loop = tsi->tsi_loop; + wi = &tsu->tsu_worker; + swi_init_workitem(wi, sfw_run_test, + lst_sched_test[lnet_cpt_of_nid(tsu->tsu_dest.nid, NULL)]); + swi_schedule_workitem(wi); + } + } + + return 0; +} + +static int +sfw_stop_batch(struct sfw_batch *tsb, int force) +{ + struct sfw_test_instance *tsi; + struct srpc_client_rpc *rpc; + + if (!sfw_batch_active(tsb)) { + CDEBUG(D_NET, "Batch %llu inactive\n", tsb->bat_id.bat_id); + return 0; + } + + list_for_each_entry(tsi, &tsb->bat_tests, tsi_list) { + spin_lock(&tsi->tsi_lock); + + if (!tsi->tsi_is_client || + !sfw_test_active(tsi) || tsi->tsi_stopping) { + spin_unlock(&tsi->tsi_lock); + continue; + } + + tsi->tsi_stopping = 1; + + if (!force) { + spin_unlock(&tsi->tsi_lock); + continue; + } + + /* abort launched rpcs in the test */ + list_for_each_entry(rpc, &tsi->tsi_active_rpcs, crpc_list) { + spin_lock(&rpc->crpc_lock); + + srpc_abort_rpc(rpc, -EINTR); + + spin_unlock(&rpc->crpc_lock); + } + + spin_unlock(&tsi->tsi_lock); + } + + return 0; +} + +static int +sfw_query_batch(struct sfw_batch *tsb, int testidx, + struct srpc_batch_reply *reply) +{ + struct sfw_test_instance *tsi; + + if (testidx < 0) + return -EINVAL; + + if (testidx == 0) { + reply->bar_active = atomic_read(&tsb->bat_nactive); + return 0; + } + + list_for_each_entry(tsi, &tsb->bat_tests, tsi_list) { + if (testidx-- > 1) + continue; + + reply->bar_active = atomic_read(&tsi->tsi_nactive); + return 0; + } + + return -ENOENT; +} + +void +sfw_free_pages(struct srpc_server_rpc *rpc) +{ + srpc_free_bulk(rpc->srpc_bulk); + rpc->srpc_bulk = NULL; +} + +int +sfw_alloc_pages(struct srpc_server_rpc *rpc, int cpt, int npages, int len, + int sink) +{ + LASSERT(rpc->srpc_bulk == NULL); + LASSERT(npages > 0 && npages <= LNET_MAX_IOV); + + rpc->srpc_bulk = srpc_alloc_bulk(cpt, 0, npages, len, sink); + if (rpc->srpc_bulk == NULL) + return -ENOMEM; + + return 0; +} + +static int +sfw_add_test(struct srpc_server_rpc *rpc) +{ + struct sfw_session *sn = sfw_data.fw_session; + struct srpc_test_reply *reply = &rpc->srpc_replymsg.msg_body.tes_reply; + struct srpc_test_reqst *request; + int rc; + struct sfw_batch *bat; + + request = &rpc->srpc_reqstbuf->buf_msg.msg_body.tes_reqst; + reply->tsr_sid = (sn == NULL) ? 
LST_INVALID_SID : sn->sn_id; + + if (request->tsr_loop == 0 || + request->tsr_concur == 0 || + request->tsr_sid.ses_nid == LNET_NID_ANY || + request->tsr_ndest > SFW_MAX_NDESTS || + (request->tsr_is_client && request->tsr_ndest == 0) || + request->tsr_concur > SFW_MAX_CONCUR || + request->tsr_service > SRPC_SERVICE_MAX_ID || + request->tsr_service <= SRPC_FRAMEWORK_SERVICE_MAX_ID) { + reply->tsr_status = EINVAL; + return 0; + } + + if (sn == NULL || !sfw_sid_equal(request->tsr_sid, sn->sn_id) || + sfw_find_test_case(request->tsr_service) == NULL) { + reply->tsr_status = ENOENT; + return 0; + } + + bat = sfw_bid2batch(request->tsr_bid); + if (bat == NULL) { + CERROR("dropping RPC %s from %s under memory pressure\n", + rpc->srpc_scd->scd_svc->sv_name, + libcfs_id2str(rpc->srpc_peer)); + return -ENOMEM; + } + + if (sfw_batch_active(bat)) { + reply->tsr_status = EBUSY; + return 0; + } + + if (request->tsr_is_client && rpc->srpc_bulk == NULL) { + /* rpc will be resumed later in sfw_bulk_ready */ + int npg = sfw_id_pages(request->tsr_ndest); + int len; + + if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) { + len = npg * PAGE_SIZE; + + } else { + len = sizeof(struct lnet_process_id_packed) * + request->tsr_ndest; + } + + return sfw_alloc_pages(rpc, CFS_CPT_ANY, npg, len, 1); + } + + rc = sfw_add_test_instance(bat, rpc); + CDEBUG (rc == 0 ? D_NET : D_WARNING, + "%s test: sv %d %s, loop %d, concur %d, ndest %d\n", + rc == 0 ? "Added" : "Failed to add", request->tsr_service, + request->tsr_is_client ? "client" : "server", + request->tsr_loop, request->tsr_concur, request->tsr_ndest); + + reply->tsr_status = (rc < 0) ? -rc : rc; + return 0; +} + +static int +sfw_control_batch(struct srpc_batch_reqst *request, + struct srpc_batch_reply *reply) +{ + struct sfw_session *sn = sfw_data.fw_session; + int rc = 0; + struct sfw_batch *bat; + + reply->bar_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; + + if (sn == NULL || !sfw_sid_equal(request->bar_sid, sn->sn_id)) { + reply->bar_status = ESRCH; + return 0; + } + + bat = sfw_find_batch(request->bar_bid); + if (bat == NULL) { + reply->bar_status = ENOENT; + return 0; + } + + switch (request->bar_opc) { + case SRPC_BATCH_OPC_RUN: + rc = sfw_run_batch(bat); + break; + + case SRPC_BATCH_OPC_STOP: + rc = sfw_stop_batch(bat, request->bar_arg); + break; + + case SRPC_BATCH_OPC_QUERY: + rc = sfw_query_batch(bat, request->bar_testidx, reply); + break; + + default: + return -EINVAL; /* drop it */ + } + + reply->bar_status = (rc < 0) ? 
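/* errnos travel positive on the wire */ 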
-rc : rc; + return 0; +} + +static int +sfw_handle_server_rpc(struct srpc_server_rpc *rpc) +{ + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + struct srpc_msg *reply = &rpc->srpc_replymsg; + struct srpc_msg *request = &rpc->srpc_reqstbuf->buf_msg; + unsigned features = LST_FEATS_MASK; + int rc = 0; + + LASSERT(sfw_data.fw_active_srpc == NULL); + LASSERT(sv->sv_id <= SRPC_FRAMEWORK_SERVICE_MAX_ID); + + spin_lock(&sfw_data.fw_lock); + + if (sfw_data.fw_shuttingdown) { + spin_unlock(&sfw_data.fw_lock); + return -ESHUTDOWN; + } + + /* Remove timer to avoid racing with it or expiring active session */ + if (sfw_del_session_timer() != 0) { + CERROR("dropping RPC %s from %s: racing with expiry timer\n", + sv->sv_name, libcfs_id2str(rpc->srpc_peer)); + spin_unlock(&sfw_data.fw_lock); + return -EAGAIN; + } + + sfw_data.fw_active_srpc = rpc; + spin_unlock(&sfw_data.fw_lock); + + sfw_unpack_message(request); + LASSERT(request->msg_type == srpc_service2request(sv->sv_id)); + + /* rpc module should have checked this */ + LASSERT(request->msg_version == SRPC_MSG_VERSION); + + if (sv->sv_id != SRPC_SERVICE_MAKE_SESSION && + sv->sv_id != SRPC_SERVICE_DEBUG) { + struct sfw_session *sn = sfw_data.fw_session; + + if (sn != NULL && + sn->sn_features != request->msg_ses_feats) { + CNETERR("Features of framework RPC don't match " + "features of current session: %x/%x\n", + request->msg_ses_feats, sn->sn_features); + reply->msg_body.reply.status = EPROTO; + reply->msg_body.reply.sid = sn->sn_id; + goto out; + } + + } else if ((request->msg_ses_feats & ~LST_FEATS_MASK) != 0) { + /* NB: at this point, old version will ignore features and + * create new session anyway, so console should be able + * to handle this */ + reply->msg_body.reply.status = EPROTO; + goto out; + } + + switch(sv->sv_id) { + default: + LBUG (); + case SRPC_SERVICE_TEST: + rc = sfw_add_test(rpc); + break; + + case SRPC_SERVICE_BATCH: + rc = sfw_control_batch(&request->msg_body.bat_reqst, + &reply->msg_body.bat_reply); + break; + + case SRPC_SERVICE_QUERY_STAT: + rc = sfw_get_stats(&request->msg_body.stat_reqst, + &reply->msg_body.stat_reply); + break; + + case SRPC_SERVICE_DEBUG: + rc = sfw_debug_session(&request->msg_body.dbg_reqst, + &reply->msg_body.dbg_reply); + break; + + case SRPC_SERVICE_MAKE_SESSION: + rc = sfw_make_session(&request->msg_body.mksn_reqst, + &reply->msg_body.mksn_reply); + break; + + case SRPC_SERVICE_REMOVE_SESSION: + rc = sfw_remove_session(&request->msg_body.rmsn_reqst, + &reply->msg_body.rmsn_reply); + break; + } + + if (sfw_data.fw_session != NULL) + features = sfw_data.fw_session->sn_features; + out: + reply->msg_ses_feats = features; + rpc->srpc_done = sfw_server_rpc_done; + spin_lock(&sfw_data.fw_lock); + + if (!sfw_data.fw_shuttingdown) + sfw_add_session_timer(); + + sfw_data.fw_active_srpc = NULL; + spin_unlock(&sfw_data.fw_lock); + return rc; +} + +static int +sfw_bulk_ready(struct srpc_server_rpc *rpc, int status) +{ + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + int rc; + + LASSERT(rpc->srpc_bulk != NULL); + LASSERT(sv->sv_id == SRPC_SERVICE_TEST); + LASSERT(sfw_data.fw_active_srpc == NULL); + LASSERT(rpc->srpc_reqstbuf->buf_msg.msg_body.tes_reqst.tsr_is_client); + + spin_lock(&sfw_data.fw_lock); + + if (status != 0) { + CERROR("Bulk transfer failed for RPC: " + "service %s, peer %s, status %d\n", + sv->sv_name, libcfs_id2str(rpc->srpc_peer), status); + spin_unlock(&sfw_data.fw_lock); + return -EIO; + } + + if (sfw_data.fw_shuttingdown) { + spin_unlock(&sfw_data.fw_lock); + return -ESHUTDOWN; + } 
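/* defuse the expiry timer next, as in sfw_handle_server_rpc() */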
+ + if (sfw_del_session_timer() != 0) { + CERROR("dropping RPC %s from %s: racing with expiry timer\n", + sv->sv_name, libcfs_id2str(rpc->srpc_peer)); + spin_unlock(&sfw_data.fw_lock); + return -EAGAIN; + } + + sfw_data.fw_active_srpc = rpc; + spin_unlock(&sfw_data.fw_lock); + + rc = sfw_add_test(rpc); + + spin_lock(&sfw_data.fw_lock); + + if (!sfw_data.fw_shuttingdown) + sfw_add_session_timer(); + + sfw_data.fw_active_srpc = NULL; + spin_unlock(&sfw_data.fw_lock); + return rc; +} + +struct srpc_client_rpc * +sfw_create_rpc(struct lnet_process_id peer, int service, + unsigned features, int nbulkiov, int bulklen, + void (*done)(struct srpc_client_rpc *), void *priv) +{ + struct srpc_client_rpc *rpc = NULL; + + spin_lock(&sfw_data.fw_lock); + + LASSERT (!sfw_data.fw_shuttingdown); + LASSERT (service <= SRPC_FRAMEWORK_SERVICE_MAX_ID); + + if (nbulkiov == 0 && !list_empty(&sfw_data.fw_zombie_rpcs)) { + rpc = list_entry(sfw_data.fw_zombie_rpcs.next, + struct srpc_client_rpc, crpc_list); + list_del(&rpc->crpc_list); + + srpc_init_client_rpc(rpc, peer, service, 0, 0, + done, sfw_client_rpc_fini, priv); + } + + spin_unlock(&sfw_data.fw_lock); + + if (rpc == NULL) { + rpc = srpc_create_client_rpc(peer, service, + nbulkiov, bulklen, done, + nbulkiov != 0 ? NULL : + sfw_client_rpc_fini, + priv); + } + + if (rpc != NULL) /* "session" is concept in framework */ + rpc->crpc_reqstmsg.msg_ses_feats = features; + + return rpc; +} + +void +sfw_unpack_message(struct srpc_msg *msg) +{ + if (msg->msg_magic == SRPC_MSG_MAGIC) + return; /* no flipping needed */ + + /* srpc module should guarantee I wouldn't get crap */ + LASSERT (msg->msg_magic == __swab32(SRPC_MSG_MAGIC)); + + if (msg->msg_type == SRPC_MSG_STAT_REQST) { + struct srpc_stat_reqst *req = &msg->msg_body.stat_reqst; + + __swab32s(&req->str_type); + __swab64s(&req->str_rpyid); + sfw_unpack_sid(req->str_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_STAT_REPLY) { + struct srpc_stat_reply *rep = &msg->msg_body.stat_reply; + + __swab32s(&rep->str_status); + sfw_unpack_sid(rep->str_sid); + sfw_unpack_fw_counters(rep->str_fw); + sfw_unpack_rpc_counters(rep->str_rpc); + sfw_unpack_lnet_counters(rep->str_lnet); + return; + } + + if (msg->msg_type == SRPC_MSG_MKSN_REQST) { + struct srpc_mksn_reqst *req = &msg->msg_body.mksn_reqst; + + __swab64s(&req->mksn_rpyid); + __swab32s(&req->mksn_force); + sfw_unpack_sid(req->mksn_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_MKSN_REPLY) { + struct srpc_mksn_reply *rep = &msg->msg_body.mksn_reply; + + __swab32s(&rep->mksn_status); + __swab32s(&rep->mksn_timeout); + sfw_unpack_sid(rep->mksn_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_RMSN_REQST) { + struct srpc_rmsn_reqst *req = &msg->msg_body.rmsn_reqst; + + __swab64s(&req->rmsn_rpyid); + sfw_unpack_sid(req->rmsn_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_RMSN_REPLY) { + struct srpc_rmsn_reply *rep = &msg->msg_body.rmsn_reply; + + __swab32s(&rep->rmsn_status); + sfw_unpack_sid(rep->rmsn_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_DEBUG_REQST) { + struct srpc_debug_reqst *req = &msg->msg_body.dbg_reqst; + + __swab64s(&req->dbg_rpyid); + __swab32s(&req->dbg_flags); + sfw_unpack_sid(req->dbg_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_DEBUG_REPLY) { + struct srpc_debug_reply *rep = &msg->msg_body.dbg_reply; + + __swab32s(&rep->dbg_nbatch); + __swab32s(&rep->dbg_timeout); + sfw_unpack_sid(rep->dbg_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_BATCH_REQST) { + struct srpc_batch_reqst *req = 
&msg->msg_body.bat_reqst; + + __swab32s(&req->bar_opc); + __swab64s(&req->bar_rpyid); + __swab32s(&req->bar_testidx); + __swab32s(&req->bar_arg); + sfw_unpack_sid(req->bar_sid); + __swab64s(&req->bar_bid.bat_id); + return; + } + + if (msg->msg_type == SRPC_MSG_BATCH_REPLY) { + struct srpc_batch_reply *rep = &msg->msg_body.bat_reply; + + __swab32s(&rep->bar_status); + sfw_unpack_sid(rep->bar_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_TEST_REQST) { + struct srpc_test_reqst *req = &msg->msg_body.tes_reqst; + + __swab64s(&req->tsr_rpyid); + __swab64s(&req->tsr_bulkid); + __swab32s(&req->tsr_loop); + __swab32s(&req->tsr_ndest); + __swab32s(&req->tsr_concur); + __swab32s(&req->tsr_service); + sfw_unpack_sid(req->tsr_sid); + __swab64s(&req->tsr_bid.bat_id); + return; + } + + if (msg->msg_type == SRPC_MSG_TEST_REPLY) { + struct srpc_test_reply *rep = &msg->msg_body.tes_reply; + + __swab32s(&rep->tsr_status); + sfw_unpack_sid(rep->tsr_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_JOIN_REQST) { + struct srpc_join_reqst *req = &msg->msg_body.join_reqst; + + __swab64s(&req->join_rpyid); + sfw_unpack_sid(req->join_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_JOIN_REPLY) { + struct srpc_join_reply *rep = &msg->msg_body.join_reply; + + __swab32s(&rep->join_status); + __swab32s(&rep->join_timeout); + sfw_unpack_sid(rep->join_sid); + return; + } + + LBUG (); +} + +void +sfw_abort_rpc(struct srpc_client_rpc *rpc) +{ + LASSERT(atomic_read(&rpc->crpc_refcount) > 0); + LASSERT(rpc->crpc_service <= SRPC_FRAMEWORK_SERVICE_MAX_ID); + + spin_lock(&rpc->crpc_lock); + srpc_abort_rpc(rpc, -EINTR); + spin_unlock(&rpc->crpc_lock); +} + +void +sfw_post_rpc(struct srpc_client_rpc *rpc) +{ + spin_lock(&rpc->crpc_lock); + + LASSERT(!rpc->crpc_closed); + LASSERT(!rpc->crpc_aborted); + LASSERT(list_empty(&rpc->crpc_list)); + LASSERT(!sfw_data.fw_shuttingdown); + + rpc->crpc_timeout = rpc_timeout; + srpc_post_rpc(rpc); + + spin_unlock(&rpc->crpc_lock); +} + +static struct srpc_service sfw_services[] = { + { .sv_id = SRPC_SERVICE_DEBUG, .sv_name = "debug", }, + { .sv_id = SRPC_SERVICE_QUERY_STAT, .sv_name = "query stats", }, + { .sv_id = SRPC_SERVICE_MAKE_SESSION, .sv_name = "make session", }, + { .sv_id = SRPC_SERVICE_REMOVE_SESSION, .sv_name = "remove session", }, + { .sv_id = SRPC_SERVICE_BATCH, .sv_name = "batch service", }, + { .sv_id = SRPC_SERVICE_TEST, .sv_name = "test service", }, + { .sv_id = 0, } }; + +int +sfw_startup (void) +{ + int i; + int rc; + int error; + struct srpc_service *sv; + struct sfw_test_case *tsc; + + + if (session_timeout < 0) { + CERROR ("Session timeout must be non-negative: %d\n", + session_timeout); + return -EINVAL; + } + + if (rpc_timeout < 0) { + CERROR ("RPC timeout must be non-negative: %d\n", + rpc_timeout); + return -EINVAL; + } + + if (session_timeout == 0) + CWARN ("Zero session_timeout specified " + "- test sessions never expire.\n"); + + if (rpc_timeout == 0) + CWARN ("Zero rpc_timeout specified " + "- test RPC never expire.\n"); + + memset(&sfw_data, 0, sizeof(struct smoketest_framework)); + + sfw_data.fw_session = NULL; + sfw_data.fw_active_srpc = NULL; + spin_lock_init(&sfw_data.fw_lock); + atomic_set(&sfw_data.fw_nzombies, 0); + INIT_LIST_HEAD(&sfw_data.fw_tests); + INIT_LIST_HEAD(&sfw_data.fw_zombie_rpcs); + INIT_LIST_HEAD(&sfw_data.fw_zombie_sessions); + + brw_init_test_client(); + brw_init_test_service(); + rc = sfw_register_test(&brw_test_service, &brw_test_client); + LASSERT (rc == 0); + + ping_init_test_client(); + ping_init_test_service(); + rc 
= sfw_register_test(&ping_test_service, &ping_test_client); + LASSERT (rc == 0); + + error = 0; + list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) { + sv = tsc->tsc_srv_service; + + rc = srpc_add_service(sv); + LASSERT(rc != -EBUSY); + if (rc != 0) { + CWARN("Failed to add %s service: %d\n", + sv->sv_name, rc); + error = rc; + } + } + + for (i = 0; ; i++) { + sv = &sfw_services[i]; + if (sv->sv_name == NULL) break; + + sv->sv_bulk_ready = NULL; + sv->sv_handler = sfw_handle_server_rpc; + sv->sv_wi_total = SFW_FRWK_WI_MAX; + if (sv->sv_id == SRPC_SERVICE_TEST) + sv->sv_bulk_ready = sfw_bulk_ready; + + rc = srpc_add_service(sv); + LASSERT (rc != -EBUSY); + if (rc != 0) { + CWARN ("Failed to add %s service: %d\n", + sv->sv_name, rc); + error = rc; + } + + /* about to sfw_shutdown, no need to add buffer */ + if (error) continue; + + rc = srpc_service_add_buffers(sv, sv->sv_wi_total); + if (rc != 0) { + CWARN("Failed to reserve enough buffers: " + "service %s, %d needed: %d\n", + sv->sv_name, sv->sv_wi_total, rc); + error = -ENOMEM; + } + } + + if (error != 0) + sfw_shutdown(); + return error; +} + +void +sfw_shutdown (void) +{ + struct srpc_service *sv; + struct sfw_test_case *tsc; + int i; + + spin_lock(&sfw_data.fw_lock); + + sfw_data.fw_shuttingdown = 1; + lst_wait_until(sfw_data.fw_active_srpc == NULL, sfw_data.fw_lock, + "waiting for active RPC to finish.\n"); + + if (sfw_del_session_timer() != 0) + lst_wait_until(sfw_data.fw_session == NULL, sfw_data.fw_lock, + "waiting for session timer to explode.\n"); + + sfw_deactivate_session(); + lst_wait_until(atomic_read(&sfw_data.fw_nzombies) == 0, + sfw_data.fw_lock, + "waiting for %d zombie sessions to die.\n", + atomic_read(&sfw_data.fw_nzombies)); + + spin_unlock(&sfw_data.fw_lock); + + for (i = 0; ; i++) { + sv = &sfw_services[i]; + if (sv->sv_name == NULL) + break; + + srpc_shutdown_service(sv); + srpc_remove_service(sv); + } + + list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) { + sv = tsc->tsc_srv_service; + srpc_shutdown_service(sv); + srpc_remove_service(sv); + } + + while (!list_empty(&sfw_data.fw_zombie_rpcs)) { + struct srpc_client_rpc *rpc; + + rpc = list_entry(sfw_data.fw_zombie_rpcs.next, + struct srpc_client_rpc, crpc_list); + list_del(&rpc->crpc_list); + + LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc)); + } + + for (i = 0; ; i++) { + sv = &sfw_services[i]; + if (sv->sv_name == NULL) + break; + + srpc_wait_service_shutdown(sv); + } + + while (!list_empty(&sfw_data.fw_tests)) { + tsc = list_entry(sfw_data.fw_tests.next, + struct sfw_test_case, tsc_list); + + srpc_wait_service_shutdown(tsc->tsc_srv_service); + + list_del(&tsc->tsc_list); + LIBCFS_FREE(tsc, sizeof(*tsc)); + } +} diff --git a/drivers/staging/lustrefsx/lnet/selftest/module.c b/drivers/staging/lustrefsx/lnet/selftest/module.c new file mode 100644 index 0000000000000..1441600e1a327 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/module.c @@ -0,0 +1,170 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "selftest.h" +#include "console.h" + +enum { + LST_INIT_NONE = 0, + LST_INIT_WI_SERIAL, + LST_INIT_WI_TEST, + LST_INIT_RPC, + LST_INIT_FW, + LST_INIT_CONSOLE +}; + +static int lst_init_step = LST_INIT_NONE; + +struct cfs_wi_sched *lst_sched_serial; +struct cfs_wi_sched **lst_sched_test; + +static void +lnet_selftest_exit(void) +{ + int i; + + switch (lst_init_step) { + case LST_INIT_CONSOLE: + lstcon_console_fini(); + fallthrough; + case LST_INIT_FW: + sfw_shutdown(); + fallthrough; + case LST_INIT_RPC: + srpc_shutdown(); + fallthrough; + case LST_INIT_WI_TEST: + for (i = 0; + i < cfs_cpt_number(lnet_cpt_table()); i++) { + if (lst_sched_test[i] == NULL) + continue; + cfs_wi_sched_destroy(lst_sched_test[i]); + } + CFS_FREE_PTR_ARRAY(lst_sched_test, + cfs_cpt_number(lnet_cpt_table())); + lst_sched_test = NULL; + fallthrough; + case LST_INIT_WI_SERIAL: + cfs_wi_sched_destroy(lst_sched_serial); + lst_sched_serial = NULL; + fallthrough; + case LST_INIT_NONE: + break; + default: + LBUG(); + } +} + +void +lnet_selftest_structure_assertion(void) +{ + BUILD_BUG_ON(sizeof(struct srpc_msg) != 160); + BUILD_BUG_ON(sizeof(struct srpc_test_reqst) != 70); + BUILD_BUG_ON(offsetof(struct srpc_msg, msg_body.tes_reqst.tsr_concur) != + 72); + BUILD_BUG_ON(offsetof(struct srpc_msg, msg_body.tes_reqst.tsr_ndest) != + 78); + BUILD_BUG_ON(sizeof(struct srpc_stat_reply) != 136); + BUILD_BUG_ON(sizeof(struct srpc_stat_reqst) != 28); +} + +static int __init +lnet_selftest_init(void) +{ + int nscheds; + int rc; + int i; + + rc = cfs_wi_sched_create("lst_s", lnet_cpt_table(), CFS_CPT_ANY, + 1, &lst_sched_serial); + if (rc != 0) { + CERROR("Failed to create serial WI scheduler for LST\n"); + return rc; + } + lst_init_step = LST_INIT_WI_SERIAL; + + nscheds = cfs_cpt_number(lnet_cpt_table()); + CFS_ALLOC_PTR_ARRAY(lst_sched_test, nscheds); + if (lst_sched_test == NULL) { + rc = -ENOMEM; + goto error; + } + + lst_init_step = LST_INIT_WI_TEST; + for (i = 0; i < nscheds; i++) { + int nthrs = cfs_cpt_weight(lnet_cpt_table(), i); + + /* reserve at least one CPU for LND */ + nthrs = max(nthrs - 1, 1); + rc = cfs_wi_sched_create("lst_t", lnet_cpt_table(), i, + nthrs, &lst_sched_test[i]); + if (rc != 0) { + CERROR("Failed to create CPU partition affinity WI scheduler %d for LST\n", + i); + goto error; + } + } + + rc = srpc_startup(); + if (rc != 0) { + CERROR("LST can't startup rpc\n"); + goto error; + } + lst_init_step = LST_INIT_RPC; + + rc = sfw_startup(); + if (rc != 0) { + CERROR("LST can't startup framework\n"); + goto error; + } + lst_init_step = LST_INIT_FW; + + rc = lstcon_console_init(); + if (rc != 0) { + CERROR("LST can't startup console\n"); + goto error; + } + lst_init_step = LST_INIT_CONSOLE; + return 0; +error: + lnet_selftest_exit(); + return rc; +} + +MODULE_AUTHOR("OpenSFS, Inc. 
"); +MODULE_DESCRIPTION("LNet Selftest"); +MODULE_VERSION("2.8.0"); +MODULE_LICENSE("GPL"); + +module_init(lnet_selftest_init); +module_exit(lnet_selftest_exit); diff --git a/drivers/staging/lustrefsx/lnet/selftest/ping_test.c b/drivers/staging/lustrefsx/lnet/selftest/ping_test.c new file mode 100644 index 0000000000000..021cb431108dd --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/ping_test.c @@ -0,0 +1,226 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/selftest/conctl.c + * + * Test client & Server + * + * Author: Liang Zhen + */ + +#include "selftest.h" + +#define LST_PING_TEST_MAGIC 0xbabeface + +static int ping_srv_workitems = SFW_TEST_WI_MAX; +module_param(ping_srv_workitems, int, 0644); +MODULE_PARM_DESC(ping_srv_workitems, "# PING server workitems"); + +struct lst_ping_data { + spinlock_t pnd_lock; /* serialize */ + int pnd_counter; /* sequence counter */ +}; + +static struct lst_ping_data lst_ping_data; + +static int +ping_client_init(struct sfw_test_instance *tsi) +{ + struct sfw_session *sn = tsi->tsi_batch->bat_session; + + LASSERT(tsi->tsi_is_client); + LASSERT(sn != NULL && (sn->sn_features & ~LST_FEATS_MASK) == 0); + + spin_lock_init(&lst_ping_data.pnd_lock); + lst_ping_data.pnd_counter = 0; + + return 0; +} + +static void +ping_client_fini(struct sfw_test_instance *tsi) +{ + struct sfw_session *sn = tsi->tsi_batch->bat_session; + int errors; + + LASSERT (sn != NULL); + LASSERT (tsi->tsi_is_client); + + errors = atomic_read(&sn->sn_ping_errors); + if (errors) + CWARN ("%d pings have failed.\n", errors); + else + CDEBUG (D_NET, "Ping test finished OK.\n"); +} + +static int +ping_client_prep_rpc(struct sfw_test_unit *tsu, struct lnet_process_id dest, + struct srpc_client_rpc **rpc) +{ + struct srpc_ping_reqst *req; + struct sfw_test_instance *tsi = tsu->tsu_instance; + struct sfw_session *sn = tsi->tsi_batch->bat_session; + struct timespec64 ts; + int rc; + + LASSERT(sn != NULL); + LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0); + + rc = sfw_create_test_rpc(tsu, dest, sn->sn_features, 0, 0, rpc); + if (rc != 0) + return rc; + + req = &(*rpc)->crpc_reqstmsg.msg_body.ping_reqst; + + req->pnr_magic = LST_PING_TEST_MAGIC; + + spin_lock(&lst_ping_data.pnd_lock); + req->pnr_seq = lst_ping_data.pnd_counter++; + spin_unlock(&lst_ping_data.pnd_lock); + + ktime_get_real_ts64(&ts); + req->pnr_time_sec = ts.tv_sec; + req->pnr_time_nsec = ts.tv_nsec; + + return rc; +} + +static void +ping_client_done_rpc(struct sfw_test_unit *tsu, struct srpc_client_rpc *rpc) +{ + 
struct sfw_test_instance *tsi = tsu->tsu_instance; + struct sfw_session *sn = tsi->tsi_batch->bat_session; + struct srpc_ping_reqst *reqst = &rpc->crpc_reqstmsg.msg_body.ping_reqst; + struct srpc_ping_reply *reply = &rpc->crpc_replymsg.msg_body.ping_reply; + struct timespec64 ts; + + LASSERT(sn != NULL); + + if (rpc->crpc_status != 0) { + if (!tsi->tsi_stopping) /* rpc could have been aborted */ + atomic_inc(&sn->sn_ping_errors); + CERROR ("Unable to ping %s (%d): %d\n", + libcfs_id2str(rpc->crpc_dest), + reqst->pnr_seq, rpc->crpc_status); + return; + } + + if (rpc->crpc_replymsg.msg_magic != SRPC_MSG_MAGIC) { + __swab32s(&reply->pnr_seq); + __swab32s(&reply->pnr_magic); + __swab32s(&reply->pnr_status); + } + + if (reply->pnr_magic != LST_PING_TEST_MAGIC) { + rpc->crpc_status = -EBADMSG; + atomic_inc(&sn->sn_ping_errors); + CERROR ("Bad magic %u from %s, %u expected.\n", + reply->pnr_magic, libcfs_id2str(rpc->crpc_dest), + LST_PING_TEST_MAGIC); + return; + } + + if (reply->pnr_seq != reqst->pnr_seq) { + rpc->crpc_status = -EBADMSG; + atomic_inc(&sn->sn_ping_errors); + CERROR ("Bad seq %u from %s, %u expected.\n", + reply->pnr_seq, libcfs_id2str(rpc->crpc_dest), + reqst->pnr_seq); + return; + } + + ktime_get_real_ts64(&ts); + CDEBUG(D_NET, "%d reply in %llu nsec\n", reply->pnr_seq, + (u64)((ts.tv_sec - reqst->pnr_time_sec) * NSEC_PER_SEC + + (ts.tv_nsec - reqst->pnr_time_nsec))); +} + +static int +ping_server_handle(struct srpc_server_rpc *rpc) +{ + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + struct srpc_msg *reqstmsg = &rpc->srpc_reqstbuf->buf_msg; + struct srpc_msg *replymsg = &rpc->srpc_replymsg; + struct srpc_ping_reqst *req = &reqstmsg->msg_body.ping_reqst; + struct srpc_ping_reply *rep = &rpc->srpc_replymsg.msg_body.ping_reply; + + LASSERT (sv->sv_id == SRPC_SERVICE_PING); + + if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) { + LASSERT (reqstmsg->msg_magic == __swab32(SRPC_MSG_MAGIC)); + + __swab32s(&req->pnr_seq); + __swab32s(&req->pnr_magic); + __swab64s(&req->pnr_time_sec); + __swab64s(&req->pnr_time_nsec); + } + LASSERT (reqstmsg->msg_type == srpc_service2request(sv->sv_id)); + + if (req->pnr_magic != LST_PING_TEST_MAGIC) { + CERROR ("Unexpect magic %08x from %s\n", + req->pnr_magic, libcfs_id2str(rpc->srpc_peer)); + return -EINVAL; + } + + rep->pnr_seq = req->pnr_seq; + rep->pnr_magic = LST_PING_TEST_MAGIC; + + if ((reqstmsg->msg_ses_feats & ~LST_FEATS_MASK) != 0) { + replymsg->msg_ses_feats = LST_FEATS_MASK; + rep->pnr_status = EPROTO; + return 0; + } + + replymsg->msg_ses_feats = reqstmsg->msg_ses_feats; + + CDEBUG(D_NET, "Get ping %d from %s\n", + req->pnr_seq, libcfs_id2str(rpc->srpc_peer)); + return 0; +} + +struct sfw_test_client_ops ping_test_client; + +void ping_init_test_client(void) +{ + ping_test_client.tso_init = ping_client_init; + ping_test_client.tso_fini = ping_client_fini; + ping_test_client.tso_prep_rpc = ping_client_prep_rpc; + ping_test_client.tso_done_rpc = ping_client_done_rpc; +} + +struct srpc_service ping_test_service; + +void ping_init_test_service(void) +{ + ping_test_service.sv_id = SRPC_SERVICE_PING; + ping_test_service.sv_name = "ping_test"; + ping_test_service.sv_handler = ping_server_handle; + ping_test_service.sv_wi_total = ping_srv_workitems; +} diff --git a/drivers/staging/lustrefsx/lnet/selftest/rpc.c b/drivers/staging/lustrefsx/lnet/selftest/rpc.c new file mode 100644 index 0000000000000..d0bdd019e47e6 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/rpc.c @@ -0,0 +1,1685 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE 
COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/selftest/rpc.c + * + * Author: Isaac Huang + * + * 2012-05-13: Liang Zhen + * - percpt data for service to improve smp performance + * - code cleanup + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "selftest.h" + +enum srpc_state { + SRPC_STATE_NONE, + SRPC_STATE_NI_INIT, + SRPC_STATE_EQ_INIT, + SRPC_STATE_RUNNING, + SRPC_STATE_STOPPING, +}; + +static struct smoketest_rpc { + spinlock_t rpc_glock; /* global lock */ + struct srpc_service *rpc_services[SRPC_SERVICE_MAX_ID + 1]; + lnet_handler_t rpc_lnet_handler;/* _the_ LNet event handler */ + enum srpc_state rpc_state; + struct srpc_counters rpc_counters; + __u64 rpc_matchbits; /* matchbits counter */ +} srpc_data; + +static inline int +srpc_serv_portal(int svc_id) +{ + return svc_id < SRPC_FRAMEWORK_SERVICE_MAX_ID ? 
+ SRPC_FRAMEWORK_REQUEST_PORTAL : SRPC_REQUEST_PORTAL; +} + +/* forward ref's */ +static int srpc_handle_rpc(struct swi_workitem *wi); + +void srpc_get_counters(struct srpc_counters *cnt) +{ + spin_lock(&srpc_data.rpc_glock); + *cnt = srpc_data.rpc_counters; + spin_unlock(&srpc_data.rpc_glock); +} + +void srpc_set_counters(const struct srpc_counters *cnt) +{ + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters = *cnt; + spin_unlock(&srpc_data.rpc_glock); +} + +static int +srpc_add_bulk_page(struct srpc_bulk *bk, struct page *pg, int i, int off, + int nob) +{ + LASSERT(off < PAGE_SIZE); + LASSERT(nob > 0 && nob <= PAGE_SIZE); + + bk->bk_iovs[i].bv_offset = off; + bk->bk_iovs[i].bv_page = pg; + bk->bk_iovs[i].bv_len = nob; + return nob; +} + +void +srpc_free_bulk(struct srpc_bulk *bk) +{ + int i; + struct page *pg; + + LASSERT(bk != NULL); + + for (i = 0; i < bk->bk_niov; i++) { + pg = bk->bk_iovs[i].bv_page; + if (pg == NULL) + break; + + __free_page(pg); + } + + LIBCFS_FREE(bk, offsetof(struct srpc_bulk, bk_iovs[bk->bk_niov])); +} + +struct srpc_bulk * +srpc_alloc_bulk(int cpt, unsigned bulk_off, unsigned bulk_npg, + unsigned bulk_len, int sink) +{ + struct srpc_bulk *bk; + int i; + + LASSERT(bulk_npg > 0 && bulk_npg <= LNET_MAX_IOV); + + LIBCFS_CPT_ALLOC(bk, lnet_cpt_table(), cpt, + offsetof(struct srpc_bulk, bk_iovs[bulk_npg])); + if (bk == NULL) { + CERROR("Can't allocate descriptor for %d pages\n", bulk_npg); + return NULL; + } + + memset(bk, 0, offsetof(struct srpc_bulk, bk_iovs[bulk_npg])); + bk->bk_sink = sink; + bk->bk_len = bulk_len; + bk->bk_niov = bulk_npg; + + for (i = 0; i < bulk_npg; i++) { + struct page *pg; + int nob; + + pg = cfs_page_cpt_alloc(lnet_cpt_table(), cpt, GFP_KERNEL); + if (pg == NULL) { + CERROR("Can't allocate page %d of %d\n", i, bulk_npg); + srpc_free_bulk(bk); + return NULL; + } + + nob = min_t(unsigned, bulk_off + bulk_len, PAGE_SIZE) - + bulk_off; + + srpc_add_bulk_page(bk, pg, i, bulk_off, nob); + bulk_len -= nob; + bulk_off = 0; + } + + return bk; +} + +static inline __u64 +srpc_next_id (void) +{ + __u64 id; + + spin_lock(&srpc_data.rpc_glock); + id = srpc_data.rpc_matchbits++; + spin_unlock(&srpc_data.rpc_glock); + return id; +} + +static void +srpc_init_server_rpc(struct srpc_server_rpc *rpc, + struct srpc_service_cd *scd, + struct srpc_buffer *buffer) +{ + memset(rpc, 0, sizeof(*rpc)); + swi_init_workitem(&rpc->srpc_wi, srpc_handle_rpc, + srpc_serv_is_framework(scd->scd_svc) ? 
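+ /* framework RPCs are handled on the single serial
+ * scheduler; test RPCs run on the per-CPT test
+ * schedulers */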
+ lst_sched_serial : lst_sched_test[scd->scd_cpt]); + + rpc->srpc_ev.ev_fired = 1; /* no event expected now */ + + rpc->srpc_scd = scd; + rpc->srpc_reqstbuf = buffer; + rpc->srpc_peer = buffer->buf_peer; + rpc->srpc_self = buffer->buf_self; + LNetInvalidateMDHandle(&rpc->srpc_replymdh); +} + +static void +srpc_service_fini(struct srpc_service *svc) +{ + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + struct srpc_buffer *buf; + struct list_head *q; + int i; + + if (svc->sv_cpt_data == NULL) + return; + + cfs_percpt_for_each(scd, i, svc->sv_cpt_data) { + while (1) { + if (!list_empty(&scd->scd_buf_posted)) + q = &scd->scd_buf_posted; + else if (!list_empty(&scd->scd_buf_blocked)) + q = &scd->scd_buf_blocked; + else + break; + + while (!list_empty(q)) { + buf = list_entry(q->next, + struct srpc_buffer, + buf_list); + list_del(&buf->buf_list); + LIBCFS_FREE(buf, sizeof(*buf)); + } + } + + LASSERT(list_empty(&scd->scd_rpc_active)); + + while (!list_empty(&scd->scd_rpc_free)) { + rpc = list_entry(scd->scd_rpc_free.next, + struct srpc_server_rpc, + srpc_list); + list_del(&rpc->srpc_list); + LIBCFS_FREE(rpc, sizeof(*rpc)); + } + } + + cfs_percpt_free(svc->sv_cpt_data); + svc->sv_cpt_data = NULL; +} + +static int +srpc_service_nrpcs(struct srpc_service *svc) +{ + int nrpcs = svc->sv_wi_total / svc->sv_ncpts; + + return srpc_serv_is_framework(svc) ? + max(nrpcs, SFW_FRWK_WI_MIN) : max(nrpcs, SFW_TEST_WI_MIN); +} + +int srpc_add_buffer(struct swi_workitem *wi); + +static int +srpc_service_init(struct srpc_service *svc) +{ + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + int nrpcs; + int i; + int j; + + svc->sv_shuttingdown = 0; + + svc->sv_cpt_data = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(struct srpc_service_cd)); + if (svc->sv_cpt_data == NULL) + return -ENOMEM; + + svc->sv_ncpts = srpc_serv_is_framework(svc) ? 
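+ /* a framework service only needs one CPU partition;
+ * test services use every partition of the LNet
+ * CPT table */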
+ 1 : cfs_cpt_number(lnet_cpt_table()); + nrpcs = srpc_service_nrpcs(svc); + + cfs_percpt_for_each(scd, i, svc->sv_cpt_data) { + scd->scd_cpt = i; + scd->scd_svc = svc; + spin_lock_init(&scd->scd_lock); + INIT_LIST_HEAD(&scd->scd_rpc_free); + INIT_LIST_HEAD(&scd->scd_rpc_active); + INIT_LIST_HEAD(&scd->scd_buf_posted); + INIT_LIST_HEAD(&scd->scd_buf_blocked); + + scd->scd_ev.ev_data = scd; + scd->scd_ev.ev_type = SRPC_REQUEST_RCVD; + + /* NB: don't use lst_sched_serial for adding buffer, + * see details in srpc_service_add_buffers() */ + swi_init_workitem(&scd->scd_buf_wi, + srpc_add_buffer, lst_sched_test[i]); + + if (i != 0 && srpc_serv_is_framework(svc)) { + /* NB: framework service only needs srpc_service_cd for + * one partition, but we allocate for all to make + * it easier to implement, it will waste a little + * memory but nobody should care about this */ + continue; + } + + for (j = 0; j < nrpcs; j++) { + LIBCFS_CPT_ALLOC(rpc, lnet_cpt_table(), + i, sizeof(*rpc)); + if (rpc == NULL) { + srpc_service_fini(svc); + return -ENOMEM; + } + list_add(&rpc->srpc_list, &scd->scd_rpc_free); + } + } + + return 0; +} + +int +srpc_add_service(struct srpc_service *sv) +{ + int id = sv->sv_id; + + LASSERT(0 <= id && id <= SRPC_SERVICE_MAX_ID); + + if (srpc_service_init(sv) != 0) + return -ENOMEM; + + spin_lock(&srpc_data.rpc_glock); + + LASSERT(srpc_data.rpc_state == SRPC_STATE_RUNNING); + + if (srpc_data.rpc_services[id] != NULL) { + spin_unlock(&srpc_data.rpc_glock); + goto failed; + } + + srpc_data.rpc_services[id] = sv; + spin_unlock(&srpc_data.rpc_glock); + + CDEBUG(D_NET, "Adding service: id %d, name %s\n", id, sv->sv_name); + return 0; + +failed: + srpc_service_fini(sv); + return -EBUSY; +} + +int +srpc_remove_service(struct srpc_service *sv) +{ + int id = sv->sv_id; + + spin_lock(&srpc_data.rpc_glock); + + if (srpc_data.rpc_services[id] != sv) { + spin_unlock(&srpc_data.rpc_glock); + return -ENOENT; + } + + srpc_data.rpc_services[id] = NULL; + spin_unlock(&srpc_data.rpc_glock); + return 0; +} + +static int +srpc_post_passive_rdma(int portal, int local, __u64 matchbits, void *buf, + int len, int options, struct lnet_process_id peer4, + struct lnet_handle_md *mdh, struct srpc_event *ev) +{ + int rc; + struct lnet_md md; + struct lnet_me *me; + struct lnet_processid peer; + + peer.pid = peer4.pid; + lnet_nid4_to_nid(peer4.nid, &peer.nid); + + me = LNetMEAttach(portal, &peer, matchbits, 0, LNET_UNLINK, + local ? LNET_INS_LOCAL : LNET_INS_AFTER); + if (IS_ERR(me)) { + rc = PTR_ERR(me); + CERROR("LNetMEAttach failed: %d\n", rc); + LASSERT(rc == -ENOMEM); + return -ENOMEM; + } + + md.threshold = 1; + md.user_ptr = ev; + md.start = buf; + md.length = len; + md.options = options; + md.handler = srpc_data.rpc_lnet_handler; + + rc = LNetMDAttach(me, &md, LNET_UNLINK, mdh); + if (rc != 0) { + CERROR("LNetMDAttach failed: %d\n", rc); + LASSERT(rc == -ENOMEM); + + return -ENOMEM; + } + + CDEBUG(D_NET, + "Posted passive RDMA: peer %s, portal %d, matchbits %#llx\n", + libcfs_id2str(peer4), portal, matchbits); + return 0; +} + +static int +srpc_post_active_rdma(int portal, __u64 matchbits, void *buf, int len, + int options, struct lnet_process_id peer, + lnet_nid_t self, struct lnet_handle_md *mdh, + struct srpc_event *ev) +{ + int rc; + struct lnet_md md; + + md.user_ptr = ev; + md.start = buf; + md.length = len; + md.handler = srpc_data.rpc_lnet_handler; + md.threshold = ((options & LNET_MD_OP_GET) != 0) ? 
2 : 1;
+ md.options = options & ~(LNET_MD_OP_PUT | LNET_MD_OP_GET);
+
+ rc = LNetMDBind(&md, LNET_UNLINK, mdh);
+ if (rc != 0) {
+ CERROR("LNetMDBind failed: %d\n", rc);
+ LASSERT(rc == -ENOMEM);
+ return -ENOMEM;
+ }
+
+ /* This is kind of an abuse of the LNET_MD_OP_{PUT,GET} options;
+ * they're only meaningful for MDs attached to an ME (i.e. passive
+ * buffers).
+ */
+ if ((options & LNET_MD_OP_PUT) != 0) {
+ rc = LNetPut(self, *mdh, LNET_NOACK_REQ, peer,
+ portal, matchbits, 0, 0);
+ } else {
+ LASSERT((options & LNET_MD_OP_GET) != 0);
+
+ rc = LNetGet(self, *mdh, peer, portal, matchbits, 0, false);
+ }
+
+ if (rc != 0) {
+ CERROR("LNet%s(%s, %d, %lld) failed: %d\n",
+ ((options & LNET_MD_OP_PUT) != 0) ? "Put" : "Get",
+ libcfs_id2str(peer), portal, matchbits, rc);
+
+ /* The forthcoming unlink event will complete this operation
+ * with failure, so fall through and return success here.
+ */
+ rc = LNetMDUnlink(*mdh);
+ LASSERT(rc == 0);
+ } else {
+ CDEBUG(D_NET,
+ "Posted active RDMA: peer %s, portal %u, matchbits %#llx\n",
+ libcfs_id2str(peer), portal, matchbits);
+ }
+ return 0;
+}
+
+static int
+srpc_post_passive_rqtbuf(int service, int local, void *buf, int len,
+ struct lnet_handle_md *mdh, struct srpc_event *ev)
+{
+ struct lnet_process_id any = {0};
+
+ any.nid = LNET_NID_ANY;
+ any.pid = LNET_PID_ANY;
+
+ return srpc_post_passive_rdma(srpc_serv_portal(service),
+ local, service, buf, len,
+ LNET_MD_OP_PUT, any, mdh, ev);
+}
+
+static int
+srpc_service_post_buffer(struct srpc_service_cd *scd, struct srpc_buffer *buf)
+__must_hold(&scd->scd_lock)
+{
+ struct srpc_service *sv = scd->scd_svc;
+ struct srpc_msg *msg = &buf->buf_msg;
+ int rc;
+
+ LNetInvalidateMDHandle(&buf->buf_mdh);
+ list_add(&buf->buf_list, &scd->scd_buf_posted);
+ scd->scd_buf_nposted++;
+ spin_unlock(&scd->scd_lock);
+
+ rc = srpc_post_passive_rqtbuf(sv->sv_id,
+ !srpc_serv_is_framework(sv),
+ msg, sizeof(*msg), &buf->buf_mdh,
+ &scd->scd_ev);
+
+ /* At this point, an RPC (new or delayed) may have arrived in
+ * msg and its event handler has been called. 
So we must add
+ * buf to scd_buf_posted _before_ dropping scd_lock */
+
+ spin_lock(&scd->scd_lock);
+
+ if (rc == 0) {
+ if (!sv->sv_shuttingdown)
+ return 0;
+
+ spin_unlock(&scd->scd_lock);
+ /* srpc_shutdown_service might have tried to unlink me
+ * when my buf_mdh was still invalid */
+ LNetMDUnlink(buf->buf_mdh);
+ spin_lock(&scd->scd_lock);
+ return 0;
+ }
+
+ scd->scd_buf_nposted--;
+ if (sv->sv_shuttingdown)
+ return rc; /* don't allow changes to scd_buf_posted */
+
+ list_del(&buf->buf_list);
+ spin_unlock(&scd->scd_lock);
+
+ LIBCFS_FREE(buf, sizeof(*buf));
+
+ spin_lock(&scd->scd_lock);
+ return rc;
+}
+
+int
+srpc_add_buffer(struct swi_workitem *wi)
+{
+ struct srpc_service_cd *scd = container_of(wi, struct srpc_service_cd,
+ scd_buf_wi);
+ struct srpc_buffer *buf;
+ int rc = 0;
+
+ /* this is called by workitem scheduler threads; these threads
+ * have their CPT affinity set, so buffers will be posted on the
+ * CPT-local list of the portal */
+ spin_lock(&scd->scd_lock);
+
+ while (scd->scd_buf_adjust > 0 &&
+ !scd->scd_svc->sv_shuttingdown) {
+ scd->scd_buf_adjust--; /* consume it */
+ scd->scd_buf_posting++;
+
+ spin_unlock(&scd->scd_lock);
+
+ LIBCFS_ALLOC(buf, sizeof(*buf));
+ if (buf == NULL) {
+ CERROR("Failed to add new buf to service: %s\n",
+ scd->scd_svc->sv_name);
+ spin_lock(&scd->scd_lock);
+ rc = -ENOMEM;
+ break;
+ }
+
+ spin_lock(&scd->scd_lock);
+ if (scd->scd_svc->sv_shuttingdown) {
+ spin_unlock(&scd->scd_lock);
+ LIBCFS_FREE(buf, sizeof(*buf));
+
+ spin_lock(&scd->scd_lock);
+ rc = -ESHUTDOWN;
+ break;
+ }
+
+ rc = srpc_service_post_buffer(scd, buf);
+ if (rc != 0)
+ break; /* buf has been freed inside */
+
+ LASSERT(scd->scd_buf_posting > 0);
+ scd->scd_buf_posting--;
+ scd->scd_buf_total++;
+ scd->scd_buf_low = max(2, scd->scd_buf_total / 4);
+ }
+
+ if (rc != 0) {
+ scd->scd_buf_err_stamp = ktime_get_real_seconds();
+ scd->scd_buf_err = rc;
+
+ LASSERT(scd->scd_buf_posting > 0);
+ scd->scd_buf_posting--;
+ }
+
+ spin_unlock(&scd->scd_lock);
+ return 0;
+}
+
+int
+srpc_service_add_buffers(struct srpc_service *sv, int nbuffer)
+{
+ struct srpc_service_cd *scd;
+ int rc = 0;
+ int i;
+
+ LASSERTF(nbuffer > 0, "nbuffer must be positive: %d\n", nbuffer);
+
+ cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+ spin_lock(&scd->scd_lock);
+
+ scd->scd_buf_err = 0;
+ scd->scd_buf_err_stamp = 0;
+ scd->scd_buf_posting = 0;
+ scd->scd_buf_adjust = nbuffer;
+ /* start to post buffers */
+ swi_schedule_workitem(&scd->scd_buf_wi);
+ spin_unlock(&scd->scd_lock);
+
+ /* a framework service only posts buffers for one partition */
+ if (srpc_serv_is_framework(sv))
+ break;
+ }
+
+ cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+ spin_lock(&scd->scd_lock);
+ /*
+ * NB: srpc_service_add_buffers() can be called inside the
+ * thread context of lst_sched_serial, and we don't normally
+ * allow sleeping inside the thread context of a WI scheduler
+ * because it blocks the current scheduler thread from doing
+ * anything else; even worse, it could deadlock if it waits
+ * on the result of another WI of the same scheduler.
+ * However, it's safe here because scd_buf_wi is scheduled
+ * by a thread in a different WI scheduler (lst_sched_test),
+ * so there is no risk of deadlock, though this can block
+ * all WIs pending on lst_sched_serial for a moment,
+ * which is not good but not fatal. 
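+ * For example, if this wait itself ran on lst_sched_serial
+ * while scd_buf_wi were queued on the same serial scheduler,
+ * the buffer posting could never run and lst_wait_until()
+ * below would never return; scheduling scd_buf_wi on
+ * lst_sched_test avoids exactly that dependency.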
+ */
+ lst_wait_until(scd->scd_buf_err != 0 ||
+ (scd->scd_buf_adjust == 0 &&
+ scd->scd_buf_posting == 0),
+ scd->scd_lock, "waiting for adding buffer\n");
+
+ if (scd->scd_buf_err != 0 && rc == 0)
+ rc = scd->scd_buf_err;
+
+ spin_unlock(&scd->scd_lock);
+ }
+
+ return rc;
+}
+
+void
+srpc_service_remove_buffers(struct srpc_service *sv, int nbuffer)
+{
+ struct srpc_service_cd *scd;
+ int num;
+ int i;
+
+ LASSERT(!sv->sv_shuttingdown);
+
+ cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+ spin_lock(&scd->scd_lock);
+
+ num = scd->scd_buf_total + scd->scd_buf_posting;
+ scd->scd_buf_adjust -= min(nbuffer, num);
+
+ spin_unlock(&scd->scd_lock);
+ }
+}
+
+/* returns 1 if sv has finished, otherwise 0 */
+int
+srpc_finish_service(struct srpc_service *sv)
+{
+ struct srpc_service_cd *scd;
+ struct srpc_server_rpc *rpc;
+ int i;
+
+ LASSERT(sv->sv_shuttingdown); /* srpc_shutdown_service called */
+
+ cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+ spin_lock(&scd->scd_lock);
+ if (!swi_deschedule_workitem(&scd->scd_buf_wi)) {
+ spin_unlock(&scd->scd_lock);
+ return 0;
+ }
+
+ if (scd->scd_buf_nposted > 0) {
+ CDEBUG(D_NET, "waiting for %d posted buffers to unlink\n",
+ scd->scd_buf_nposted);
+ spin_unlock(&scd->scd_lock);
+ return 0;
+ }
+
+ if (list_empty(&scd->scd_rpc_active)) {
+ spin_unlock(&scd->scd_lock);
+ continue;
+ }
+
+ rpc = list_entry(scd->scd_rpc_active.next,
+ struct srpc_server_rpc, srpc_list);
+ CNETERR("Active RPC %p on shutdown: sv %s, peer %s, wi %s scheduled %d running %d, ev fired %d type %d status %d lnet %d\n",
+ rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer),
+ swi_state2str(rpc->srpc_wi.swi_state),
+ rpc->srpc_wi.swi_workitem.wi_scheduled,
+ rpc->srpc_wi.swi_workitem.wi_running,
+ rpc->srpc_ev.ev_fired, rpc->srpc_ev.ev_type,
+ rpc->srpc_ev.ev_status, rpc->srpc_ev.ev_lnet);
+ spin_unlock(&scd->scd_lock);
+ return 0;
+ }
+
+ /* no lock needed from now on */
+ srpc_service_fini(sv);
+ return 1;
+}
+
+/* called with scd->scd_lock held */
+static void
+srpc_service_recycle_buffer(struct srpc_service_cd *scd,
+ struct srpc_buffer *buf)
+__must_hold(&scd->scd_lock)
+{
+ if (!scd->scd_svc->sv_shuttingdown && scd->scd_buf_adjust >= 0) {
+ if (srpc_service_post_buffer(scd, buf) != 0) {
+ CWARN("Failed to post %s buffer\n",
+ scd->scd_svc->sv_name);
+ }
+ return;
+ }
+
+ /* service is shutting down, or we want to recycle some buffers */
+ scd->scd_buf_total--;
+
+ if (scd->scd_buf_adjust < 0) {
+ scd->scd_buf_adjust++;
+ if (scd->scd_buf_adjust < 0 &&
+ scd->scd_buf_total == 0 && scd->scd_buf_posting == 0) {
+ CDEBUG(D_INFO,
+ "Try to recycle %d buffers but nothing left\n",
+ scd->scd_buf_adjust);
+ scd->scd_buf_adjust = 0;
+ }
+ }
+
+ spin_unlock(&scd->scd_lock);
+ LIBCFS_FREE(buf, sizeof(*buf));
+ spin_lock(&scd->scd_lock);
+}
+
+void
+srpc_abort_service(struct srpc_service *sv)
+{
+ struct srpc_service_cd *scd;
+ struct srpc_server_rpc *rpc;
+ int i;
+
+ CDEBUG(D_NET, "Aborting service: id %d, name %s\n",
+ sv->sv_id, sv->sv_name);
+
+ cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+ spin_lock(&scd->scd_lock);
+
+ /* schedule in-flight RPCs to notice the abort, NB:
+ * racing with incoming RPCs; a complete fix should make test
+ * RPCs carry the session ID in their headers
+ */
+ list_for_each_entry(rpc, &scd->scd_rpc_active, srpc_list) {
+ rpc->srpc_aborted = 1;
+ swi_schedule_workitem(&rpc->srpc_wi);
+ }
+
+ spin_unlock(&scd->scd_lock);
+ }
+}
+
+void
+srpc_shutdown_service(struct srpc_service *sv)
+{
+ struct srpc_service_cd *scd;
+ struct srpc_server_rpc *rpc;
+ struct srpc_buffer 
*buf; + int i; + + CDEBUG(D_NET, "Shutting down service: id %d, name %s\n", + sv->sv_id, sv->sv_name); + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) + spin_lock(&scd->scd_lock); + + sv->sv_shuttingdown = 1; /* i.e. no new active RPC */ + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) + spin_unlock(&scd->scd_lock); + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + + /* schedule in-flight RPCs to notice the shutdown */ + list_for_each_entry(rpc, &scd->scd_rpc_active, srpc_list) + swi_schedule_workitem(&rpc->srpc_wi); + + spin_unlock(&scd->scd_lock); + + /* OK to traverse scd_buf_posted without lock, since no one + * touches scd_buf_posted now + */ + list_for_each_entry(buf, &scd->scd_buf_posted, buf_list) + LNetMDUnlink(buf->buf_mdh); + } +} + +static int +srpc_send_request(struct srpc_client_rpc *rpc) +{ + struct srpc_event *ev = &rpc->crpc_reqstev; + int rc; + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_REQUEST_SENT; + + rc = srpc_post_active_rdma(srpc_serv_portal(rpc->crpc_service), + rpc->crpc_service, &rpc->crpc_reqstmsg, + sizeof(struct srpc_msg), LNET_MD_OP_PUT, + rpc->crpc_dest, LNET_NID_ANY, + &rpc->crpc_reqstmdh, ev); + if (rc != 0) { + LASSERT(rc == -ENOMEM); + ev->ev_fired = 1; /* no more event expected */ + } + return rc; +} + +static int +srpc_prepare_reply(struct srpc_client_rpc *rpc) +{ + struct srpc_event *ev = &rpc->crpc_replyev; + u64 *id = &rpc->crpc_reqstmsg.msg_body.reqst.rpyid; + int rc; + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_REPLY_RCVD; + + *id = srpc_next_id(); + + rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *id, + &rpc->crpc_replymsg, + sizeof(struct srpc_msg), + LNET_MD_OP_PUT, rpc->crpc_dest, + &rpc->crpc_replymdh, ev); + if (rc != 0) { + LASSERT(rc == -ENOMEM); + ev->ev_fired = 1; /* no more event expected */ + } + return rc; +} + +static int +srpc_prepare_bulk(struct srpc_client_rpc *rpc) +{ + struct srpc_bulk *bk = &rpc->crpc_bulk; + struct srpc_event *ev = &rpc->crpc_bulkev; + __u64 *id = &rpc->crpc_reqstmsg.msg_body.reqst.bulkid; + int rc; + int opt; + + LASSERT(bk->bk_niov <= LNET_MAX_IOV); + + /* nothing to do */ + if (bk->bk_niov == 0) + return 0; + + opt = bk->bk_sink ? LNET_MD_OP_PUT : LNET_MD_OP_GET; + opt |= LNET_MD_KIOV; + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_BULK_REQ_RCVD; + + *id = srpc_next_id(); + + rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *id, + &bk->bk_iovs[0], bk->bk_niov, opt, + rpc->crpc_dest, &bk->bk_mdh, ev); + if (rc != 0) { + LASSERT(rc == -ENOMEM); + ev->ev_fired = 1; /* no more event expected */ + } + return rc; +} + +static int +srpc_do_bulk(struct srpc_server_rpc *rpc) +{ + struct srpc_event *ev = &rpc->srpc_ev; + struct srpc_bulk *bk = rpc->srpc_bulk; + __u64 id = rpc->srpc_reqstbuf->buf_msg.msg_body.reqst.bulkid; + int rc; + int opt; + + LASSERT(bk != NULL); + + opt = bk->bk_sink ? LNET_MD_OP_GET : LNET_MD_OP_PUT; + opt |= LNET_MD_KIOV; + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = bk->bk_sink ? 
SRPC_BULK_GET_RPLD : SRPC_BULK_PUT_SENT;
+
+ rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, id,
+ &bk->bk_iovs[0], bk->bk_niov, opt,
+ rpc->srpc_peer, rpc->srpc_self,
+ &bk->bk_mdh, ev);
+ if (rc != 0)
+ ev->ev_fired = 1; /* no more event expected */
+ return rc;
+}
+
+/* only called from srpc_handle_rpc */
+static void
+srpc_server_rpc_done(struct srpc_server_rpc *rpc, int status)
+{
+ struct srpc_service_cd *scd = rpc->srpc_scd;
+ struct srpc_service *sv = scd->scd_svc;
+ struct srpc_buffer *buffer;
+
+ LASSERT(status != 0 || rpc->srpc_wi.swi_state == SWI_STATE_DONE);
+
+ rpc->srpc_status = status;
+
+ CDEBUG_LIMIT(status == 0 ? D_NET : D_NETERROR,
+ "Server RPC %p done: service %s, peer %s, status %s:%d\n",
+ rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer),
+ swi_state2str(rpc->srpc_wi.swi_state), status);
+
+ if (status != 0) {
+ spin_lock(&srpc_data.rpc_glock);
+ srpc_data.rpc_counters.rpcs_dropped++;
+ spin_unlock(&srpc_data.rpc_glock);
+ }
+
+ if (rpc->srpc_done != NULL)
+ (*rpc->srpc_done)(rpc);
+ LASSERT(rpc->srpc_bulk == NULL);
+
+ spin_lock(&scd->scd_lock);
+
+ if (rpc->srpc_reqstbuf != NULL) {
+ /* NB srpc_service_recycle_buffer might drop scd_lock, but
+ * sv won't go away since scd_rpc_active must not be empty
+ */
+ srpc_service_recycle_buffer(scd, rpc->srpc_reqstbuf);
+ rpc->srpc_reqstbuf = NULL;
+ }
+
+ list_del(&rpc->srpc_list); /* from scd->scd_rpc_active */
+
+ /*
+ * No one can schedule me now since:
+ * - I'm not on scd_rpc_active.
+ * - all LNet events have been fired.
+ * Cancel pending schedules and prevent future schedule attempts:
+ */
+ LASSERT(rpc->srpc_ev.ev_fired);
+ swi_exit_workitem(&rpc->srpc_wi);
+
+ if (!sv->sv_shuttingdown && !list_empty(&scd->scd_buf_blocked)) {
+ buffer = list_entry(scd->scd_buf_blocked.next,
+ struct srpc_buffer, buf_list);
+ list_del(&buffer->buf_list);
+
+ srpc_init_server_rpc(rpc, scd, buffer);
+ list_add_tail(&rpc->srpc_list, &scd->scd_rpc_active);
+ swi_schedule_workitem(&rpc->srpc_wi);
+ } else {
+ list_add(&rpc->srpc_list, &scd->scd_rpc_free);
+ }
+
+ spin_unlock(&scd->scd_lock);
+}
+
+/* handles an incoming RPC */
+static int srpc_handle_rpc(struct swi_workitem *wi)
+{
+ struct srpc_server_rpc *rpc = container_of(wi, struct srpc_server_rpc,
+ srpc_wi);
+ struct srpc_service_cd *scd = rpc->srpc_scd;
+ struct srpc_service *sv = scd->scd_svc;
+ struct srpc_event *ev = &rpc->srpc_ev;
+ int rc = 0;
+
+ LASSERT(wi == &rpc->srpc_wi);
+
+ spin_lock(&scd->scd_lock);
+
+ if (sv->sv_shuttingdown || rpc->srpc_aborted) {
+ spin_unlock(&scd->scd_lock);
+
+ if (rpc->srpc_bulk != NULL)
+ LNetMDUnlink(rpc->srpc_bulk->bk_mdh);
+ LNetMDUnlink(rpc->srpc_replymdh);
+
+ if (ev->ev_fired) { /* no more event, OK to finish */
+ srpc_server_rpc_done(rpc, -ESHUTDOWN);
+ return 1;
+ }
+ return 0;
+ }
+
+ spin_unlock(&scd->scd_lock);
+
+ switch (wi->swi_state) {
+ default:
+ LBUG();
+ fallthrough;
+ case SWI_STATE_NEWBORN: {
+ struct srpc_msg *msg;
+ struct srpc_generic_reply *reply;
+
+ msg = &rpc->srpc_reqstbuf->buf_msg;
+ reply = &rpc->srpc_replymsg.msg_body.reply;
+
+ if (msg->msg_magic == 0) {
+ /* moaned already in srpc_lnet_ev_handler */
+ srpc_server_rpc_done(rpc, EBADMSG);
+ return 1;
+ }
+
+ srpc_unpack_msg_hdr(msg);
+ if (msg->msg_version != SRPC_MSG_VERSION) {
+ CWARN("Version mismatch: %u, %u expected, from %s\n",
+ msg->msg_version, SRPC_MSG_VERSION,
+ libcfs_id2str(rpc->srpc_peer));
+ reply->status = EPROTO;
+ /* drop through and send reply */
+ } else {
+ reply->status = 0;
+ rc = (*sv->sv_handler)(rpc);
+ LASSERT(reply->status == 0 || 
!rpc->srpc_bulk); + if (rc != 0) { + srpc_server_rpc_done(rpc, rc); + return 1; + } + } + + wi->swi_state = SWI_STATE_BULK_STARTED; + + if (rpc->srpc_bulk != NULL) { + rc = srpc_do_bulk(rpc); + if (rc == 0) + return 0; /* wait for bulk */ + + LASSERT(ev->ev_fired); + ev->ev_status = rc; + } + } + fallthrough; + case SWI_STATE_BULK_STARTED: + LASSERT(rpc->srpc_bulk == NULL || ev->ev_fired); + + if (rpc->srpc_bulk != NULL) { + rc = ev->ev_status; + + if (sv->sv_bulk_ready != NULL) + rc = (*sv->sv_bulk_ready) (rpc, rc); + + if (rc != 0) { + srpc_server_rpc_done(rpc, rc); + return 1; + } + } + + wi->swi_state = SWI_STATE_REPLY_SUBMITTED; + rc = srpc_send_reply(rpc); + if (rc == 0) + return 0; /* wait for reply */ + srpc_server_rpc_done(rpc, rc); + return 1; + + case SWI_STATE_REPLY_SUBMITTED: + if (!ev->ev_fired) { + CERROR("RPC %p: bulk %p, service %d\n", + rpc, rpc->srpc_bulk, sv->sv_id); + CERROR("Event: status %d, type %d, lnet %d\n", + ev->ev_status, ev->ev_type, ev->ev_lnet); + LASSERT(ev->ev_fired); + } + + wi->swi_state = SWI_STATE_DONE; + srpc_server_rpc_done(rpc, ev->ev_status); + return 1; + } + + return 0; +} + +static void +srpc_client_rpc_expired (void *data) +{ + struct srpc_client_rpc *rpc = data; + + CWARN("Client RPC expired: service %d, peer %s, timeout %d.\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + rpc->crpc_timeout); + + spin_lock(&rpc->crpc_lock); + + rpc->crpc_timeout = 0; + srpc_abort_rpc(rpc, -ETIMEDOUT); + + spin_unlock(&rpc->crpc_lock); + + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters.rpcs_expired++; + spin_unlock(&srpc_data.rpc_glock); +} + +static void +srpc_add_client_rpc_timer(struct srpc_client_rpc *rpc) +{ + struct stt_timer *timer = &rpc->crpc_timer; + + if (rpc->crpc_timeout == 0) + return; + + INIT_LIST_HEAD(&timer->stt_list); + timer->stt_data = rpc; + timer->stt_func = srpc_client_rpc_expired; + timer->stt_expires = ktime_get_real_seconds() + rpc->crpc_timeout; + stt_add_timer(timer); +} + +/* + * Called with rpc->crpc_lock held. + * + * Upon exit the RPC expiry timer is not queued and the handler is not + * running on any CPU. + */ +static void +srpc_del_client_rpc_timer(struct srpc_client_rpc *rpc) +{ + /* timer not planted or already exploded */ + if (rpc->crpc_timeout == 0) + return; + + /* timer successfully defused */ + if (stt_del_timer(&rpc->crpc_timer)) + return; + + /* timer detonated, wait for it to explode */ + while (rpc->crpc_timeout != 0) { + spin_unlock(&rpc->crpc_lock); + + schedule(); + + spin_lock(&rpc->crpc_lock); + } +} + +static void +srpc_client_rpc_done(struct srpc_client_rpc *rpc, int status) +{ + struct swi_workitem *wi = &rpc->crpc_wi; + + LASSERT(status != 0 || wi->swi_state == SWI_STATE_DONE); + + spin_lock(&rpc->crpc_lock); + + rpc->crpc_closed = 1; + if (rpc->crpc_status == 0) + rpc->crpc_status = status; + + srpc_del_client_rpc_timer(rpc); + + CDEBUG_LIMIT((status == 0) ? D_NET : D_NETERROR, + "Client RPC done: service %d, peer %s, status %s:%d:%d\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + swi_state2str(wi->swi_state), rpc->crpc_aborted, status); + + /* + * No one can schedule me now since: + * - RPC timer has been defused. + * - all LNet events have been fired. + * - crpc_closed has been set, preventing srpc_abort_rpc from + * scheduling me. 
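+ * (The expiry timer, the LNet event handler and srpc_abort_rpc
+ * are the only paths that could schedule this workitem again,
+ * and each of them is quiescent by now.)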
+ * Cancel pending schedules and prevent future schedule attempts: + */ + LASSERT(!srpc_event_pending(rpc)); + swi_exit_workitem(wi); + + spin_unlock(&rpc->crpc_lock); + + (*rpc->crpc_done)(rpc); +} + +/* sends an outgoing RPC */ +int +srpc_send_rpc(struct swi_workitem *wi) +{ + int rc = 0; + struct srpc_client_rpc *rpc; + struct srpc_msg *reply; + int do_bulk; + + LASSERT(wi != NULL); + + rpc = container_of(wi, struct srpc_client_rpc, crpc_wi); + + LASSERT(rpc != NULL); + LASSERT(wi == &rpc->crpc_wi); + + reply = &rpc->crpc_replymsg; + do_bulk = rpc->crpc_bulk.bk_niov > 0; + + spin_lock(&rpc->crpc_lock); + + if (rpc->crpc_aborted) { + spin_unlock(&rpc->crpc_lock); + goto abort; + } + + spin_unlock(&rpc->crpc_lock); + + switch (wi->swi_state) { + default: + LBUG(); + case SWI_STATE_NEWBORN: + LASSERT(!srpc_event_pending(rpc)); + + rc = srpc_prepare_reply(rpc); + if (rc != 0) { + srpc_client_rpc_done(rpc, rc); + return 1; + } + + rc = srpc_prepare_bulk(rpc); + if (rc != 0) + break; + + wi->swi_state = SWI_STATE_REQUEST_SUBMITTED; + rc = srpc_send_request(rpc); + break; + + case SWI_STATE_REQUEST_SUBMITTED: + /* CAVEAT EMPTOR: rqtev, rpyev, and bulkev may come in any + * order; however, they're processed in a strict order: + * rqt, rpy, and bulk. + */ + if (!rpc->crpc_reqstev.ev_fired) + break; + + rc = rpc->crpc_reqstev.ev_status; + if (rc != 0) + break; + + wi->swi_state = SWI_STATE_REQUEST_SENT; + fallthrough; + case SWI_STATE_REQUEST_SENT: { + enum srpc_msg_type type; + + type = srpc_service2reply(rpc->crpc_service); + + if (!rpc->crpc_replyev.ev_fired) + break; + + rc = rpc->crpc_replyev.ev_status; + if (rc != 0) + break; + + srpc_unpack_msg_hdr(reply); + if (reply->msg_type != type || + (reply->msg_magic != SRPC_MSG_MAGIC && + reply->msg_magic != __swab32(SRPC_MSG_MAGIC))) { + CWARN("Bad message from %s: type %u (%d expected), magic %u (%d expected).\n", + libcfs_id2str(rpc->crpc_dest), + reply->msg_type, type, + reply->msg_magic, SRPC_MSG_MAGIC); + rc = -EBADMSG; + break; + } + + if (do_bulk && reply->msg_body.reply.status != 0) { + CWARN("Remote error %d at %s, unlink bulk buffer in case peer didn't initiate bulk transfer\n", + reply->msg_body.reply.status, + libcfs_id2str(rpc->crpc_dest)); + LNetMDUnlink(rpc->crpc_bulk.bk_mdh); + } + + wi->swi_state = SWI_STATE_REPLY_RECEIVED; + } + fallthrough; + case SWI_STATE_REPLY_RECEIVED: + if (do_bulk && !rpc->crpc_bulkev.ev_fired) + break; + + rc = do_bulk ? rpc->crpc_bulkev.ev_status : 0; + + /* Bulk buffer was unlinked due to remote error. Clear error + * since reply buffer still contains valid data. + * NB rpc->crpc_done shouldn't look into bulk data in case of + * remote error. 
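+ * The unlink is reported as an LNET_EVENT_UNLINK bulk event,
+ * which is why the check below treats that combination as
+ * success rather than propagating the event status.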
+ */ + if (do_bulk && rpc->crpc_bulkev.ev_lnet == LNET_EVENT_UNLINK && + rpc->crpc_status == 0 && reply->msg_body.reply.status != 0) + rc = 0; + + wi->swi_state = SWI_STATE_DONE; + srpc_client_rpc_done(rpc, rc); + return 1; + } + + if (rc != 0) { + spin_lock(&rpc->crpc_lock); + srpc_abort_rpc(rpc, rc); + spin_unlock(&rpc->crpc_lock); + } + +abort: + if (rpc->crpc_aborted) { + LNetMDUnlink(rpc->crpc_reqstmdh); + LNetMDUnlink(rpc->crpc_replymdh); + LNetMDUnlink(rpc->crpc_bulk.bk_mdh); + + if (!srpc_event_pending(rpc)) { + srpc_client_rpc_done(rpc, -EINTR); + return 1; + } + } + return 0; +} + +struct srpc_client_rpc * +srpc_create_client_rpc(struct lnet_process_id peer, int service, + int nbulkiov, int bulklen, + void (*rpc_done)(struct srpc_client_rpc *), + void (*rpc_fini)(struct srpc_client_rpc *), void *priv) +{ + struct srpc_client_rpc *rpc; + + LIBCFS_ALLOC(rpc, offsetof(struct srpc_client_rpc, + crpc_bulk.bk_iovs[nbulkiov])); + if (rpc == NULL) + return NULL; + + srpc_init_client_rpc(rpc, peer, service, nbulkiov, + bulklen, rpc_done, rpc_fini, priv); + return rpc; +} + +/* called with rpc->crpc_lock held */ +void +srpc_abort_rpc(struct srpc_client_rpc *rpc, int why) +{ + LASSERT(why != 0); + + if (rpc->crpc_aborted || /* already aborted */ + rpc->crpc_closed) /* callback imminent */ + return; + + CDEBUG(D_NET, + "Aborting RPC: service %d, peer %s, state %s, why %d\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + swi_state2str(rpc->crpc_wi.swi_state), why); + + rpc->crpc_aborted = 1; + rpc->crpc_status = why; + swi_schedule_workitem(&rpc->crpc_wi); +} + +/* called with rpc->crpc_lock held */ +void +srpc_post_rpc(struct srpc_client_rpc *rpc) +{ + LASSERT(!rpc->crpc_aborted); + LASSERT(srpc_data.rpc_state == SRPC_STATE_RUNNING); + + CDEBUG(D_NET, "Posting RPC: peer %s, service %d, timeout %d\n", + libcfs_id2str(rpc->crpc_dest), rpc->crpc_service, + rpc->crpc_timeout); + + srpc_add_client_rpc_timer(rpc); + swi_schedule_workitem(&rpc->crpc_wi); +} + + +int +srpc_send_reply(struct srpc_server_rpc *rpc) +{ + struct srpc_event *ev = &rpc->srpc_ev; + struct srpc_msg *msg = &rpc->srpc_replymsg; + struct srpc_buffer *buffer = rpc->srpc_reqstbuf; + struct srpc_service_cd *scd = rpc->srpc_scd; + struct srpc_service *sv = scd->scd_svc; + __u64 rpyid; + int rc; + + LASSERT(buffer != NULL); + rpyid = buffer->buf_msg.msg_body.reqst.rpyid; + + spin_lock(&scd->scd_lock); + + if (!sv->sv_shuttingdown && !srpc_serv_is_framework(sv)) { + /* Repost buffer before replying since test client + * might send me another RPC once it gets the reply + */ + if (srpc_service_post_buffer(scd, buffer) != 0) + CWARN("Failed to repost %s buffer\n", sv->sv_name); + rpc->srpc_reqstbuf = NULL; + } + + spin_unlock(&scd->scd_lock); + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_REPLY_SENT; + + msg->msg_magic = SRPC_MSG_MAGIC; + msg->msg_version = SRPC_MSG_VERSION; + msg->msg_type = srpc_service2reply(sv->sv_id); + + rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, rpyid, msg, + sizeof(*msg), LNET_MD_OP_PUT, + rpc->srpc_peer, rpc->srpc_self, + &rpc->srpc_replymdh, ev); + if (rc != 0) + ev->ev_fired = 1; /* no more event expected */ + return rc; +} + +/* when in kernel always called with LNET_LOCK() held, and in thread context */ +static void +srpc_lnet_ev_handler(struct lnet_event *ev) +{ + struct srpc_service_cd *scd; + struct srpc_event *rpcev = ev->md_user_ptr; + struct srpc_client_rpc *crpc; + struct srpc_server_rpc *srpc; + struct srpc_buffer *buffer; + struct srpc_service *sv; + struct srpc_msg *msg; 
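+ /* rpcev->ev_type was set when the MD was posted; it says whether
+ * ev_data points at a client RPC, a server RPC or the per-CPT
+ * data of a service waiting for requests */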
+ enum srpc_msg_type type; + + LASSERT(!in_interrupt()); + + if (ev->status != 0) { + __u32 errors; + + spin_lock(&srpc_data.rpc_glock); + if (ev->status != -ECANCELED) /* cancellation is not error */ + srpc_data.rpc_counters.errors++; + errors = srpc_data.rpc_counters.errors; + spin_unlock(&srpc_data.rpc_glock); + + CNETERR("LNet event status %d type %d, RPC errors %u\n", + ev->status, ev->type, errors); + } + + rpcev->ev_lnet = ev->type; + + switch (rpcev->ev_type) { + default: + CERROR("Unknown event: status %d, type %d, lnet %d\n", + rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet); + LBUG(); + fallthrough; + case SRPC_REQUEST_SENT: + if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) { + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters.rpcs_sent++; + spin_unlock(&srpc_data.rpc_glock); + } + fallthrough; + case SRPC_REPLY_RCVD: + case SRPC_BULK_REQ_RCVD: + crpc = rpcev->ev_data; + + if (rpcev != &crpc->crpc_reqstev && + rpcev != &crpc->crpc_replyev && + rpcev != &crpc->crpc_bulkev) { + CERROR("rpcev %p, crpc %p, reqstev %p, replyev %p, bulkev %p\n", + rpcev, crpc, &crpc->crpc_reqstev, + &crpc->crpc_replyev, &crpc->crpc_bulkev); + CERROR("Bad event: status %d, type %d, lnet %d\n", + rpcev->ev_status, rpcev->ev_type, + rpcev->ev_lnet); + LBUG(); + } + + spin_lock(&crpc->crpc_lock); + + LASSERT(rpcev->ev_fired == 0); + rpcev->ev_fired = 1; + rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ? + -EINTR : ev->status; + swi_schedule_workitem(&crpc->crpc_wi); + + spin_unlock(&crpc->crpc_lock); + break; + + case SRPC_REQUEST_RCVD: + scd = rpcev->ev_data; + sv = scd->scd_svc; + + LASSERT(rpcev == &scd->scd_ev); + + spin_lock(&scd->scd_lock); + + LASSERT(ev->unlinked); + LASSERT(ev->type == LNET_EVENT_PUT || + ev->type == LNET_EVENT_UNLINK); + LASSERT(ev->type != LNET_EVENT_UNLINK || + sv->sv_shuttingdown); + + buffer = container_of(ev->md_start, struct srpc_buffer, + buf_msg); + buffer->buf_peer = lnet_pid_to_pid4(&ev->source); + buffer->buf_self = lnet_nid_to_nid4(&ev->target.nid); + + LASSERT(scd->scd_buf_nposted > 0); + scd->scd_buf_nposted--; + + if (sv->sv_shuttingdown) { + /* Leave buffer on scd->scd_buf_nposted since + * srpc_finish_service needs to traverse it. + */ + spin_unlock(&scd->scd_lock); + break; + } + + if (scd->scd_buf_err_stamp != 0 && + scd->scd_buf_err_stamp < ktime_get_real_seconds()) { + /* re-enable adding buffer */ + scd->scd_buf_err_stamp = 0; + scd->scd_buf_err = 0; + } + + if (scd->scd_buf_err == 0 && /* adding buffer is enabled */ + scd->scd_buf_adjust == 0 && + scd->scd_buf_nposted < scd->scd_buf_low) { + scd->scd_buf_adjust = max(scd->scd_buf_total / 2, + SFW_TEST_WI_MIN); + swi_schedule_workitem(&scd->scd_buf_wi); + } + + list_del(&buffer->buf_list); /* from scd->scd_buf_posted */ + msg = &buffer->buf_msg; + type = srpc_service2request(sv->sv_id); + + if (ev->status != 0 || ev->mlength != sizeof(*msg) || + (msg->msg_type != type && + msg->msg_type != __swab32(type)) || + (msg->msg_magic != SRPC_MSG_MAGIC && + msg->msg_magic != __swab32(SRPC_MSG_MAGIC))) { + CERROR("Dropping RPC (%s) from %s: status %d mlength %d type %u magic %u.\n", + sv->sv_name, libcfs_idstr(&ev->initiator), + ev->status, ev->mlength, + msg->msg_type, msg->msg_magic); + + /* NB can't call srpc_service_recycle_buffer here since + * it may call LNetM[DE]Attach. 
The invalid magic tells + * srpc_handle_rpc to drop this RPC + */ + msg->msg_magic = 0; + } + + if (!list_empty(&scd->scd_rpc_free)) { + srpc = list_entry(scd->scd_rpc_free.next, + struct srpc_server_rpc, + srpc_list); + list_del(&srpc->srpc_list); + + srpc_init_server_rpc(srpc, scd, buffer); + list_add_tail(&srpc->srpc_list, + &scd->scd_rpc_active); + swi_schedule_workitem(&srpc->srpc_wi); + } else { + list_add_tail(&buffer->buf_list, + &scd->scd_buf_blocked); + } + + spin_unlock(&scd->scd_lock); + + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters.rpcs_rcvd++; + spin_unlock(&srpc_data.rpc_glock); + break; + + case SRPC_BULK_GET_RPLD: + LASSERT(ev->type == LNET_EVENT_SEND || + ev->type == LNET_EVENT_REPLY || + ev->type == LNET_EVENT_UNLINK); + + if (!ev->unlinked) + break; /* wait for final event */ + fallthrough; + case SRPC_BULK_PUT_SENT: + if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) { + spin_lock(&srpc_data.rpc_glock); + + if (rpcev->ev_type == SRPC_BULK_GET_RPLD) + srpc_data.rpc_counters.bulk_get += ev->mlength; + else + srpc_data.rpc_counters.bulk_put += ev->mlength; + + spin_unlock(&srpc_data.rpc_glock); + } + fallthrough; + case SRPC_REPLY_SENT: + srpc = rpcev->ev_data; + scd = srpc->srpc_scd; + + LASSERT(rpcev == &srpc->srpc_ev); + + spin_lock(&scd->scd_lock); + + rpcev->ev_fired = 1; + rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ? + -EINTR : ev->status; + swi_schedule_workitem(&srpc->srpc_wi); + + spin_unlock(&scd->scd_lock); + break; + } +} + + +int +srpc_startup (void) +{ + int rc; + + memset(&srpc_data, 0, sizeof(struct smoketest_rpc)); + spin_lock_init(&srpc_data.rpc_glock); + + /* 1 second pause to avoid timestamp reuse */ + schedule_timeout_uninterruptible(cfs_time_seconds(1)); + srpc_data.rpc_matchbits = ((__u64) ktime_get_real_seconds()) << 48; + + srpc_data.rpc_state = SRPC_STATE_NONE; + + rc = LNetNIInit(LNET_PID_LUSTRE); + if (rc < 0) { + CERROR("LNetNIInit() has failed: %d\n", rc); + return rc; + } + + srpc_data.rpc_state = SRPC_STATE_NI_INIT; + + srpc_data.rpc_lnet_handler = srpc_lnet_ev_handler; + + rc = LNetSetLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL); + LASSERT(rc == 0); + rc = LNetSetLazyPortal(SRPC_REQUEST_PORTAL); + LASSERT(rc == 0); + + srpc_data.rpc_state = SRPC_STATE_EQ_INIT; + + rc = stt_startup(); + + if (rc != 0) + srpc_shutdown(); + else + srpc_data.rpc_state = SRPC_STATE_RUNNING; + + return rc; +} + +void +srpc_shutdown (void) +{ + int i; + int rc; + int state; + + state = srpc_data.rpc_state; + srpc_data.rpc_state = SRPC_STATE_STOPPING; + + switch (state) { + default: + LBUG(); + fallthrough; + case SRPC_STATE_RUNNING: + spin_lock(&srpc_data.rpc_glock); + + for (i = 0; i <= SRPC_SERVICE_MAX_ID; i++) { + struct srpc_service *sv = srpc_data.rpc_services[i]; + + LASSERTF(sv == NULL, + "service not empty: id %d, name %s\n", + i, sv->sv_name); + } + + spin_unlock(&srpc_data.rpc_glock); + + stt_shutdown(); + fallthrough; + + case SRPC_STATE_EQ_INIT: + rc = LNetClearLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL); + rc = LNetClearLazyPortal(SRPC_REQUEST_PORTAL); + LASSERT(rc == 0); + lnet_assert_handler_unused(srpc_data.rpc_lnet_handler); + fallthrough; + + case SRPC_STATE_NI_INIT: + LNetNIFini(); + } +} diff --git a/drivers/staging/lustrefsx/lnet/selftest/rpc.h b/drivers/staging/lustrefsx/lnet/selftest/rpc.h new file mode 100644 index 0000000000000..7b0b786cce324 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/rpc.h @@ -0,0 +1,296 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef __SELFTEST_RPC_H__ +#define __SELFTEST_RPC_H__ + +#include + +/* + * LST wired structures + * + * XXX: *REPLY == *REQST + 1 + */ +enum srpc_msg_type { + SRPC_MSG_MKSN_REQST = 0, + SRPC_MSG_MKSN_REPLY = 1, + SRPC_MSG_RMSN_REQST = 2, + SRPC_MSG_RMSN_REPLY = 3, + SRPC_MSG_BATCH_REQST = 4, + SRPC_MSG_BATCH_REPLY = 5, + SRPC_MSG_STAT_REQST = 6, + SRPC_MSG_STAT_REPLY = 7, + SRPC_MSG_TEST_REQST = 8, + SRPC_MSG_TEST_REPLY = 9, + SRPC_MSG_DEBUG_REQST = 10, + SRPC_MSG_DEBUG_REPLY = 11, + SRPC_MSG_BRW_REQST = 12, + SRPC_MSG_BRW_REPLY = 13, + SRPC_MSG_PING_REQST = 14, + SRPC_MSG_PING_REPLY = 15, + SRPC_MSG_JOIN_REQST = 16, + SRPC_MSG_JOIN_REPLY = 17, +}; + +/* CAVEAT EMPTOR: + * All struct srpc_*_reqst's 1st field must be matchbits of reply buffer, + * and 2nd field matchbits of bulk buffer if any. + * + * All struct srpc_*_reply's 1st field must be a __u32 status, and 2nd field + * session id if needed. 
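+ *
+ * For example, srpc_ping_reqst below begins with pnr_rpyid and
+ * srpc_ping_reply begins with pnr_status, so the framework can
+ * access any request/reply through srpc_generic_reqst/reply.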
+ */ +struct srpc_generic_reqst { + __u64 rpyid; /* reply buffer matchbits */ + __u64 bulkid; /* bulk buffer matchbits */ +} __packed; + +struct srpc_generic_reply { + __u32 status; + struct lst_sid sid; +} __packed; + +/* FRAMEWORK RPCs */ +struct srpc_mksn_reqst { + __u64 mksn_rpyid; /* reply buffer matchbits */ + struct lst_sid mksn_sid; /* session id */ + __u32 mksn_force; /* use brute force */ + char mksn_name[LST_NAME_SIZE]; +} __packed; /* make session request */ + +struct srpc_mksn_reply { + __u32 mksn_status; /* session status */ + struct lst_sid mksn_sid; /* session id */ + __u32 mksn_timeout; /* session timeout */ + char mksn_name[LST_NAME_SIZE]; +} __packed; /* make session reply */ + +struct srpc_rmsn_reqst { + __u64 rmsn_rpyid; /* reply buffer matchbits */ + struct lst_sid rmsn_sid; /* session id */ +} __packed; /* remove session request */ + +struct srpc_rmsn_reply { + __u32 rmsn_status; + struct lst_sid rmsn_sid; /* session id */ +} __packed; /* remove session reply */ + +struct srpc_join_reqst { + __u64 join_rpyid; /* reply buffer matchbits */ + struct lst_sid join_sid; /* session id to join */ + char join_group[LST_NAME_SIZE]; /* group name */ +} __packed; + +struct srpc_join_reply { + __u32 join_status; /* returned status */ + struct lst_sid join_sid; /* session id */ + __u32 join_timeout; /* # seconds' inactivity to expire */ + char join_session[LST_NAME_SIZE]; /* session name */ +} __packed; + +struct srpc_debug_reqst { + __u64 dbg_rpyid; /* reply buffer matchbits */ + struct lst_sid dbg_sid; /* session id */ + __u32 dbg_flags; /* bitmap of debug */ +} __packed; + +struct srpc_debug_reply { + __u32 dbg_status; /* returned code */ + struct lst_sid dbg_sid; /* session id */ + __u32 dbg_timeout; /* session timeout */ + __u32 dbg_nbatch; /* # of batches in the node */ + char dbg_name[LST_NAME_SIZE]; /* session name */ +} __packed; + +#define SRPC_BATCH_OPC_RUN 1 +#define SRPC_BATCH_OPC_STOP 2 +#define SRPC_BATCH_OPC_QUERY 3 + +struct srpc_batch_reqst { + __u64 bar_rpyid; /* reply buffer matchbits */ + struct lst_sid bar_sid; /* session id */ + struct lst_bid bar_bid; /* batch id */ + __u32 bar_opc; /* create/start/stop batch */ + __u32 bar_testidx; /* index of test */ + __u32 bar_arg; /* parameters */ +} __packed; + +struct srpc_batch_reply { + __u32 bar_status; /* status of request */ + struct lst_sid bar_sid; /* session id */ + __u32 bar_active; /* # of active tests in batch/test */ + __u32 bar_time; /* remained time */ +} __packed; + +struct srpc_stat_reqst { + __u64 str_rpyid; /* reply buffer matchbits */ + struct lst_sid str_sid; /* session id */ + __u32 str_type; /* type of stat */ +} __packed; + +struct srpc_stat_reply { + __u32 str_status; + struct lst_sid str_sid; + struct sfw_counters str_fw; + struct srpc_counters str_rpc; + struct lnet_counters_common str_lnet; +} __packed; + +struct test_bulk_req { + __u32 blk_opc; /* bulk operation code */ + __u32 blk_npg; /* # of pages */ + __u32 blk_flags; /* reserved flags */ +} __packed; + +struct test_bulk_req_v1 { + /** bulk operation code */ + __u16 blk_opc; + /** data check flags */ + __u16 blk_flags; + /** data length */ + __u32 blk_len; + /** bulk offset */ + __u32 blk_offset; +} __packed; + +struct test_ping_req { + __u32 png_size; /* size of ping message */ + __u32 png_flags; /* reserved flags */ +} __packed; + +struct srpc_test_reqst { + __u64 tsr_rpyid; /* reply buffer matchbits */ + __u64 tsr_bulkid; /* bulk buffer matchbits */ + struct lst_sid tsr_sid; /* session id */ + struct lst_bid tsr_bid; /* batch id */ 
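+ /* tsr_service selects which member of the tsr_u union below
+ * is meaningful (ping, bulk_v0 or bulk_v1) */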
+ __u32 tsr_service; /* test type: bulk|ping|... */ + /* test client loop count or # server buffers needed */ + __u32 tsr_loop; + __u32 tsr_concur; /* concurrency of test */ + __u8 tsr_is_client; /* is test client or not */ + __u8 tsr_stop_onerr; /* stop on error */ + __u32 tsr_ndest; /* # of dest nodes */ + + union { + struct test_ping_req ping; + struct test_bulk_req bulk_v0; + struct test_bulk_req_v1 bulk_v1; + } tsr_u; +} __packed; + +struct srpc_test_reply { + __u32 tsr_status; /* returned code */ + struct lst_sid tsr_sid; +} __packed; + +/* TEST RPCs */ +struct srpc_ping_reqst { + __u64 pnr_rpyid; + __u32 pnr_magic; + __u32 pnr_seq; + __u64 pnr_time_sec; + __u64 pnr_time_nsec; +} __packed; + +struct srpc_ping_reply { + __u32 pnr_status; + __u32 pnr_magic; + __u32 pnr_seq; +} __packed; + +struct srpc_brw_reqst { + __u64 brw_rpyid; /* reply buffer matchbits */ + __u64 brw_bulkid; /* bulk buffer matchbits */ + __u32 brw_rw; /* read or write */ + __u32 brw_len; /* bulk data len */ + __u32 brw_flags; /* bulk data patterns */ +} __packed; /* bulk r/w request */ + +struct srpc_brw_reply { + __u32 brw_status; +} __packed; /* bulk r/w reply */ + +#define SRPC_MSG_MAGIC 0xeeb0f00d +#define SRPC_MSG_VERSION 1 + +struct srpc_msg { + /** magic number */ + __u32 msg_magic; + /** message version number */ + __u32 msg_version; + /** type of message body: enum srpc_msg_type */ + __u32 msg_type; + __u32 msg_reserved0; + __u32 msg_reserved1; + /** test session features */ + __u32 msg_ses_feats; + union { + struct srpc_generic_reqst reqst; + struct srpc_generic_reply reply; + + struct srpc_mksn_reqst mksn_reqst; + struct srpc_mksn_reply mksn_reply; + struct srpc_rmsn_reqst rmsn_reqst; + struct srpc_rmsn_reply rmsn_reply; + struct srpc_debug_reqst dbg_reqst; + struct srpc_debug_reply dbg_reply; + struct srpc_batch_reqst bat_reqst; + struct srpc_batch_reply bat_reply; + struct srpc_stat_reqst stat_reqst; + struct srpc_stat_reply stat_reply; + struct srpc_test_reqst tes_reqst; + struct srpc_test_reply tes_reply; + struct srpc_join_reqst join_reqst; + struct srpc_join_reply join_reply; + + struct srpc_ping_reqst ping_reqst; + struct srpc_ping_reply ping_reply; + struct srpc_brw_reqst brw_reqst; + struct srpc_brw_reply brw_reply; + } msg_body; +} __packed; + +static inline void +srpc_unpack_msg_hdr(struct srpc_msg *msg) +{ + if (msg->msg_magic == SRPC_MSG_MAGIC) + return; /* no flipping needed */ + + /* We do not swap the magic number here as it is needed to + determine whether the body needs to be swapped. */ + /* __swab32s(&msg->msg_magic); */ + __swab32s(&msg->msg_type); + __swab32s(&msg->msg_version); + __swab32s(&msg->msg_ses_feats); + __swab32s(&msg->msg_reserved0); + __swab32s(&msg->msg_reserved1); +} + +#endif /* __SELFTEST_RPC_H__ */ diff --git a/drivers/staging/lustrefsx/lnet/selftest/selftest.h b/drivers/staging/lustrefsx/lnet/selftest/selftest.h new file mode 100644 index 0000000000000..27126be9cb086 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/selftest.h @@ -0,0 +1,613 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/selftest/selftest.h + * + * Author: Isaac Huang + */ +#ifndef __SELFTEST_SELFTEST_H__ +#define __SELFTEST_SELFTEST_H__ + +#define LNET_ONLY + +#include +#include +#include +#include +#include + +#include "rpc.h" +#include "timer.h" + +#ifndef MADE_WITHOUT_COMPROMISE +#define MADE_WITHOUT_COMPROMISE +#endif + + +#define SWI_STATE_NEWBORN 0 +#define SWI_STATE_REPLY_SUBMITTED 1 +#define SWI_STATE_REPLY_SENT 2 +#define SWI_STATE_REQUEST_SUBMITTED 3 +#define SWI_STATE_REQUEST_SENT 4 +#define SWI_STATE_REPLY_RECEIVED 5 +#define SWI_STATE_BULK_STARTED 6 +#define SWI_STATE_DONE 10 + +/* forward refs */ +struct srpc_service; +struct srpc_service_cd; +struct sfw_test_unit; +struct sfw_test_instance; + +/* services below SRPC_FRAMEWORK_SERVICE_MAX_ID are framework + * services, e.g. create/modify session. + */ +#define SRPC_SERVICE_DEBUG 0 +#define SRPC_SERVICE_MAKE_SESSION 1 +#define SRPC_SERVICE_REMOVE_SESSION 2 +#define SRPC_SERVICE_BATCH 3 +#define SRPC_SERVICE_TEST 4 +#define SRPC_SERVICE_QUERY_STAT 5 +#define SRPC_SERVICE_JOIN 6 +#define SRPC_FRAMEWORK_SERVICE_MAX_ID 10 +/* other services start from SRPC_FRAMEWORK_SERVICE_MAX_ID+1 */ +#define SRPC_SERVICE_BRW 11 +#define SRPC_SERVICE_PING 12 +#define SRPC_SERVICE_MAX_ID 12 + +#define SRPC_REQUEST_PORTAL 50 +/* a lazy portal for framework RPC requests */ +#define SRPC_FRAMEWORK_REQUEST_PORTAL 51 +/* all reply/bulk RDMAs go to this portal */ +#define SRPC_RDMA_PORTAL 52 + +static inline enum srpc_msg_type +srpc_service2request (int service) +{ + switch (service) { + default: + LBUG (); + case SRPC_SERVICE_DEBUG: + return SRPC_MSG_DEBUG_REQST; + + case SRPC_SERVICE_MAKE_SESSION: + return SRPC_MSG_MKSN_REQST; + + case SRPC_SERVICE_REMOVE_SESSION: + return SRPC_MSG_RMSN_REQST; + + case SRPC_SERVICE_BATCH: + return SRPC_MSG_BATCH_REQST; + + case SRPC_SERVICE_TEST: + return SRPC_MSG_TEST_REQST; + + case SRPC_SERVICE_QUERY_STAT: + return SRPC_MSG_STAT_REQST; + + case SRPC_SERVICE_BRW: + return SRPC_MSG_BRW_REQST; + + case SRPC_SERVICE_PING: + return SRPC_MSG_PING_REQST; + + case SRPC_SERVICE_JOIN: + return SRPC_MSG_JOIN_REQST; + } +} + +static inline enum srpc_msg_type +srpc_service2reply (int service) +{ + return srpc_service2request(service) + 1; +} + +enum srpc_event_type { + SRPC_BULK_REQ_RCVD = 1, /* passive bulk request(PUT sink/GET source) received */ + SRPC_BULK_PUT_SENT = 2, /* active bulk PUT sent (source) */ + SRPC_BULK_GET_RPLD = 3, /* active bulk GET replied (sink) */ + SRPC_REPLY_RCVD = 4, /* incoming reply received */ + SRPC_REPLY_SENT = 5, /* outgoing reply sent */ + SRPC_REQUEST_RCVD = 6, /* incoming request received */ + SRPC_REQUEST_SENT = 7, /* outgoing request sent */ +}; + +/* RPC event */ +struct srpc_event { + enum srpc_event_type ev_type; /* what's up */ + enum lnet_event_kind ev_lnet; /* LNet event type */ + int ev_fired; /* LNet event fired? 
*/ + int ev_status; /* LNet event status */ + void *ev_data; /* owning server/client RPC */ +}; + +/* bulk descriptor */ +struct srpc_bulk { + int bk_len; /* len of bulk data */ + struct lnet_handle_md bk_mdh; + int bk_sink; /* sink/source */ + int bk_niov; /* # iov in bk_iovs */ + struct bio_vec bk_iovs[0]; +}; + +/* message buffer descriptor */ +struct srpc_buffer { + struct list_head buf_list; /* chain on srpc_service::*_msgq */ + struct srpc_msg buf_msg; + struct lnet_handle_md buf_mdh; + lnet_nid_t buf_self; + struct lnet_process_id buf_peer; +}; + +struct swi_workitem; +typedef int (*swi_action_t)(struct swi_workitem *); + +struct swi_workitem { + struct cfs_wi_sched *swi_sched; + struct cfs_workitem swi_workitem; + swi_action_t swi_action; + int swi_state; +}; + +/* server-side state of a RPC */ +struct srpc_server_rpc { + /* chain on srpc_service::*_rpcq */ + struct list_head srpc_list; + struct srpc_service_cd *srpc_scd; + struct swi_workitem srpc_wi; + struct srpc_event srpc_ev; /* bulk/reply event */ + lnet_nid_t srpc_self; + struct lnet_process_id srpc_peer; + struct srpc_msg srpc_replymsg; + struct lnet_handle_md srpc_replymdh; + struct srpc_buffer *srpc_reqstbuf; + struct srpc_bulk *srpc_bulk; + + unsigned int srpc_aborted; /* being given up */ + int srpc_status; + void (*srpc_done)(struct srpc_server_rpc *); +}; + +/* client-side state of a RPC */ +struct srpc_client_rpc { + struct list_head crpc_list; /* chain on user's lists */ + spinlock_t crpc_lock; /* serialize */ + int crpc_service; + atomic_t crpc_refcount; + /* # seconds to wait for reply */ + int crpc_timeout; + struct stt_timer crpc_timer; + struct swi_workitem crpc_wi; + struct lnet_process_id crpc_dest; + + void (*crpc_done)(struct srpc_client_rpc *); + void (*crpc_fini)(struct srpc_client_rpc *); + int crpc_status; /* completion status */ + void *crpc_priv; /* caller data */ + + /* state flags */ + unsigned int crpc_aborted:1; /* being given up */ + unsigned int crpc_closed:1; /* completed */ + + /* RPC events */ + struct srpc_event crpc_bulkev; /* bulk event */ + struct srpc_event crpc_reqstev; /* request event */ + struct srpc_event crpc_replyev; /* reply event */ + + /* bulk, request(reqst), and reply exchanged on wire */ + struct srpc_msg crpc_reqstmsg; + struct srpc_msg crpc_replymsg; + struct lnet_handle_md crpc_reqstmdh; + struct lnet_handle_md crpc_replymdh; + struct srpc_bulk crpc_bulk; +}; + +#define srpc_client_rpc_size(rpc) \ +offsetof(struct srpc_client_rpc, crpc_bulk.bk_iovs[(rpc)->crpc_bulk.bk_niov]) + +#define srpc_client_rpc_addref(rpc) \ +do { \ + CDEBUG(D_NET, "RPC[%p] -> %s (%d)++\n", \ + (rpc), libcfs_id2str((rpc)->crpc_dest), \ + atomic_read(&(rpc)->crpc_refcount)); \ + LASSERT(atomic_read(&(rpc)->crpc_refcount) > 0); \ + atomic_inc(&(rpc)->crpc_refcount); \ +} while (0) + +#define srpc_client_rpc_decref(rpc) \ +do { \ + CDEBUG(D_NET, "RPC[%p] -> %s (%d)--\n", \ + (rpc), libcfs_id2str((rpc)->crpc_dest), \ + atomic_read(&(rpc)->crpc_refcount)); \ + LASSERT(atomic_read(&(rpc)->crpc_refcount) > 0); \ + if (atomic_dec_and_test(&(rpc)->crpc_refcount)) \ + srpc_destroy_client_rpc(rpc); \ +} while (0) + +#define srpc_event_pending(rpc) ((rpc)->crpc_bulkev.ev_fired == 0 || \ + (rpc)->crpc_reqstev.ev_fired == 0 || \ + (rpc)->crpc_replyev.ev_fired == 0) + +/* CPU partition data of srpc service */ +struct srpc_service_cd { + /** serialize */ + spinlock_t scd_lock; + /** backref to service */ + struct srpc_service *scd_svc; + /** event buffer */ + struct srpc_event scd_ev; + /** free RPC descriptors */ 
+    struct list_head scd_rpc_free;
+    /** in-flight RPCs */
+    struct list_head scd_rpc_active;
+    /** workitem for posting buffer */
+    struct swi_workitem scd_buf_wi;
+    /** CPT id */
+    int scd_cpt;
+    /** error code for scd_buf_wi */
+    int scd_buf_err;
+    /** timestamp for scd_buf_err */
+    time64_t scd_buf_err_stamp;
+    /** total # request buffers */
+    int scd_buf_total;
+    /** # posted request buffers */
+    int scd_buf_nposted;
+    /** in progress of buffer posting */
+    int scd_buf_posting;
+    /** allocate more buffers if scd_buf_nposted < scd_buf_low */
+    int scd_buf_low;
+    /** increase/decrease some buffers */
+    int scd_buf_adjust;
+    /** posted message buffers */
+    struct list_head scd_buf_posted;
+    /** blocked for RPC descriptor */
+    struct list_head scd_buf_blocked;
+};
+
+/* number of server workitems (mini-threads) for a testing service */
+#define SFW_TEST_WI_MIN 256
+#define SFW_TEST_WI_MAX 2048
+/* extra buffers for tolerating buggy peers, or an unbalanced number
+ * of peers between partitions */
+#define SFW_TEST_WI_EXTRA 64
+
+/* number of server workitems (mini-threads) for the framework service */
+#define SFW_FRWK_WI_MIN 16
+#define SFW_FRWK_WI_MAX 256
+
+struct srpc_service {
+    int sv_id; /* service id */
+    const char *sv_name; /* human readable name */
+    int sv_wi_total; /* total server workitems */
+    int sv_shuttingdown;
+    int sv_ncpts;
+    /* percpt data for srpc_service */
+    struct srpc_service_cd **sv_cpt_data;
+    /* Service callbacks:
+     * - sv_handler: process incoming RPC request
+     * - sv_bulk_ready: notify bulk data
+     */
+    int (*sv_handler)(struct srpc_server_rpc *);
+    int (*sv_bulk_ready)(struct srpc_server_rpc *, int);
+};
+
+struct sfw_session {
+    /* chain on fw_zombie_sessions */
+    struct list_head sn_list;
+    struct lst_sid sn_id; /* unique identifier */
+    /* # seconds' inactivity to expire */
+    unsigned int sn_timeout;
+    int sn_timer_active;
+    unsigned int sn_features;
+    struct stt_timer sn_timer;
+    struct list_head sn_batches; /* list of batches */
+    char sn_name[LST_NAME_SIZE];
+    atomic_t sn_refcount;
+    atomic_t sn_brw_errors;
+    atomic_t sn_ping_errors;
+    ktime_t sn_started;
+};
+
+#define sfw_sid_equal(sid0, sid1) ((sid0).ses_nid == (sid1).ses_nid && \
+                                   (sid0).ses_stamp == (sid1).ses_stamp)
+
+struct sfw_batch {
+    struct list_head bat_list; /* chain on sn_batches */
+    struct lst_bid bat_id; /* batch id */
+    int bat_error; /* error code of batch */
+    struct sfw_session *bat_session; /* batch's session */
+    atomic_t bat_nactive; /* # of active tests */
+    struct list_head bat_tests; /* test instances */
+};
+
+struct sfw_test_client_ops {
+    int (*tso_init)(struct sfw_test_instance *tsi); /* initialize test client */
+    void (*tso_fini)(struct sfw_test_instance *tsi); /* finalize test client */
+    int (*tso_prep_rpc)(struct sfw_test_unit *tsu,
+                        struct lnet_process_id dest,
+                        struct srpc_client_rpc **rpc); /* prepare a test RPC */
+    void (*tso_done_rpc)(struct sfw_test_unit *tsu,
+                         struct srpc_client_rpc *rpc); /* finish a test RPC */
+};
+
+struct sfw_test_instance {
+    struct list_head tsi_list; /* chain on batch */
+    int tsi_service; /* test type */
+    struct sfw_batch *tsi_batch; /* batch */
+    struct sfw_test_client_ops *tsi_ops; /* test client operations */
+
+    /* public parameters for all test units */
+    unsigned int tsi_is_client:1; /* is test client */
+    unsigned int tsi_stoptsu_onerr:1; /* stop tsu on error */
+    int tsi_concur; /* concurrency */
+    int tsi_loop; /* loop count */
+
+    /* status of test instance */
+    spinlock_t tsi_lock; /* serialize */
+    unsigned int tsi_stopping:1; /* 
test is stopping */ + atomic_t tsi_nactive; /* # of active test unit */ + struct list_head tsi_units; /* test units */ + struct list_head tsi_free_rpcs; /* free rpcs */ + struct list_head tsi_active_rpcs;/* active rpcs */ + + union { + struct test_ping_req ping; /* ping parameter */ + struct test_bulk_req bulk_v0; /* bulk parameter */ + struct test_bulk_req_v1 bulk_v1; /* bulk v1 parameter */ + } tsi_u; +}; + +/* XXX: trailing (PAGE_SIZE % sizeof(struct lnet_process_id)) bytes at + * the end of pages are not used */ +#define SFW_MAX_CONCUR LST_MAX_CONCUR +#define SFW_ID_PER_PAGE (PAGE_SIZE / sizeof(struct lnet_process_id_packed)) +#define SFW_MAX_NDESTS (LNET_MAX_IOV * SFW_ID_PER_PAGE) +#define sfw_id_pages(n) (((n) + SFW_ID_PER_PAGE - 1) / SFW_ID_PER_PAGE) + +struct sfw_test_unit { + struct list_head tsu_list; /* chain on lst_test_instance */ + struct lnet_process_id tsu_dest; /* id of dest node */ + int tsu_loop; /* loop count of the test */ + struct sfw_test_instance *tsu_instance; /* pointer to test instance */ + void *tsu_private; /* private data */ + struct swi_workitem tsu_worker; /* workitem of the test unit */ +}; + +struct sfw_test_case { + struct list_head tsc_list; /* chain on fw_tests */ + struct srpc_service *tsc_srv_service; /* test service */ + struct sfw_test_client_ops *tsc_cli_ops; /* ops of test client */ +}; + +struct srpc_client_rpc * +sfw_create_rpc(struct lnet_process_id peer, int service, + unsigned features, int nbulkiov, int bulklen, + void (*done)(struct srpc_client_rpc *), void *priv); +int sfw_create_test_rpc(struct sfw_test_unit *tsu, + struct lnet_process_id peer, unsigned int features, + int nblk, int blklen, struct srpc_client_rpc **rpc); +void sfw_abort_rpc(struct srpc_client_rpc *rpc); +void sfw_post_rpc(struct srpc_client_rpc *rpc); +void sfw_client_rpc_done(struct srpc_client_rpc *rpc); +void sfw_unpack_message(struct srpc_msg *msg); +void sfw_free_pages(struct srpc_server_rpc *rpc); +void sfw_add_bulk_page(struct srpc_bulk *bk, struct page *pg, int i); +int sfw_alloc_pages(struct srpc_server_rpc *rpc, int cpt, int npages, int len, + int sink); +int sfw_make_session(struct srpc_mksn_reqst *request, + struct srpc_mksn_reply *reply); + +struct srpc_client_rpc * +srpc_create_client_rpc(struct lnet_process_id peer, int service, + int nbulkiov, int bulklen, + void (*rpc_done)(struct srpc_client_rpc *), + void (*rpc_fini)(struct srpc_client_rpc *), void *priv); +void srpc_post_rpc(struct srpc_client_rpc *rpc); +void srpc_abort_rpc(struct srpc_client_rpc *rpc, int why); +void srpc_free_bulk(struct srpc_bulk *bk); +struct srpc_bulk *srpc_alloc_bulk(int cpt, unsigned int off, + unsigned int bulk_npg, unsigned int bulk_len, + int sink); +int srpc_send_rpc(struct swi_workitem *wi); +int srpc_send_reply(struct srpc_server_rpc *rpc); +int srpc_add_service(struct srpc_service *sv); +int srpc_remove_service(struct srpc_service *sv); +void srpc_shutdown_service(struct srpc_service *sv); +void srpc_abort_service(struct srpc_service *sv); +int srpc_finish_service(struct srpc_service *sv); +int srpc_service_add_buffers(struct srpc_service *sv, int nbuffer); +void srpc_service_remove_buffers(struct srpc_service *sv, int nbuffer); +void srpc_get_counters(struct srpc_counters *cnt); +void srpc_set_counters(const struct srpc_counters *cnt); + +extern struct cfs_wi_sched *lst_sched_serial; +extern struct cfs_wi_sched **lst_sched_test; + +static inline int +srpc_serv_is_framework(struct srpc_service *svc) +{ + return svc->sv_id < SRPC_FRAMEWORK_SERVICE_MAX_ID; +} + +static 
inline int +swi_wi_action(struct cfs_workitem *wi) +{ + struct swi_workitem *swi; + + swi = container_of(wi, struct swi_workitem, swi_workitem); + return swi->swi_action(swi); +} + +static inline void +swi_init_workitem(struct swi_workitem *swi, + swi_action_t action, struct cfs_wi_sched *sched) +{ + swi->swi_sched = sched; + swi->swi_action = action; + swi->swi_state = SWI_STATE_NEWBORN; + cfs_wi_init(&swi->swi_workitem, swi_wi_action); +} + +static inline void +swi_schedule_workitem(struct swi_workitem *wi) +{ + cfs_wi_schedule(wi->swi_sched, &wi->swi_workitem); +} + +static inline void +swi_exit_workitem(struct swi_workitem *swi) +{ + cfs_wi_exit(swi->swi_sched, &swi->swi_workitem); +} + +static inline int +swi_deschedule_workitem(struct swi_workitem *swi) +{ + return cfs_wi_deschedule(swi->swi_sched, &swi->swi_workitem); +} + +int sfw_startup(void); +int srpc_startup(void); +void sfw_shutdown(void); +void srpc_shutdown(void); + +static inline void +srpc_destroy_client_rpc(struct srpc_client_rpc *rpc) +{ + LASSERT (rpc != NULL); + LASSERT (!srpc_event_pending(rpc)); + LASSERT (atomic_read(&rpc->crpc_refcount) == 0); + + if (rpc->crpc_fini == NULL) { + LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc)); + } else { + (*rpc->crpc_fini) (rpc); + } +} + +static inline void +srpc_init_client_rpc(struct srpc_client_rpc *rpc, struct lnet_process_id peer, + int service, int nbulkiov, int bulklen, + void (*rpc_done)(struct srpc_client_rpc *), + void (*rpc_fini)(struct srpc_client_rpc *), void *priv) +{ + LASSERT(nbulkiov <= LNET_MAX_IOV); + + memset(rpc, 0, offsetof(struct srpc_client_rpc, + crpc_bulk.bk_iovs[nbulkiov])); + + INIT_LIST_HEAD(&rpc->crpc_list); + swi_init_workitem(&rpc->crpc_wi, srpc_send_rpc, + lst_sched_test[lnet_cpt_of_nid(peer.nid, NULL)]); + spin_lock_init(&rpc->crpc_lock); + atomic_set(&rpc->crpc_refcount, 1); /* 1 ref for caller */ + + rpc->crpc_dest = peer; + rpc->crpc_priv = priv; + rpc->crpc_service = service; + rpc->crpc_bulk.bk_len = bulklen; + rpc->crpc_bulk.bk_niov = nbulkiov; + rpc->crpc_done = rpc_done; + rpc->crpc_fini = rpc_fini; + LNetInvalidateMDHandle(&rpc->crpc_reqstmdh); + LNetInvalidateMDHandle(&rpc->crpc_replymdh); + LNetInvalidateMDHandle(&rpc->crpc_bulk.bk_mdh); + + /* no event is expected at this point */ + rpc->crpc_bulkev.ev_fired = + rpc->crpc_reqstev.ev_fired = + rpc->crpc_replyev.ev_fired = 1; + + rpc->crpc_reqstmsg.msg_magic = SRPC_MSG_MAGIC; + rpc->crpc_reqstmsg.msg_version = SRPC_MSG_VERSION; + rpc->crpc_reqstmsg.msg_type = srpc_service2request(service); +} + +static inline const char * +swi_state2str (int state) +{ +#define STATE2STR(x) case x: return #x + switch(state) { + default: + LBUG(); + STATE2STR(SWI_STATE_NEWBORN); + STATE2STR(SWI_STATE_REPLY_SUBMITTED); + STATE2STR(SWI_STATE_REPLY_SENT); + STATE2STR(SWI_STATE_REQUEST_SUBMITTED); + STATE2STR(SWI_STATE_REQUEST_SENT); + STATE2STR(SWI_STATE_REPLY_RECEIVED); + STATE2STR(SWI_STATE_BULK_STARTED); + STATE2STR(SWI_STATE_DONE); + } +#undef STATE2STR +} + +#define lst_wait_until(cond, lock, fmt, ...) \ +do { \ + int __I = 2; \ + while (!(cond)) { \ + CDEBUG(is_power_of_2(++__I) ? D_WARNING : D_NET, \ + fmt, ## __VA_ARGS__); \ + spin_unlock(&(lock)); \ + \ + schedule_timeout_uninterruptible( \ + cfs_time_seconds(1) / 10); \ + \ + spin_lock(&(lock)); \ + } \ +} while (0) + +static inline void +srpc_wait_service_shutdown(struct srpc_service *sv) +{ + int i = 2; + + LASSERT(sv->sv_shuttingdown); + + while (srpc_finish_service(sv) == 0) { + i++; + CDEBUG(((i & -i) == i) ? 
D_WARNING : D_NET, + "Waiting for %s service to shutdown...\n", + sv->sv_name); + schedule_timeout_uninterruptible(cfs_time_seconds(1) / 10); + } +} + +extern struct sfw_test_client_ops ping_test_client; +extern struct srpc_service ping_test_service; +void ping_init_test_client(void); +void ping_init_test_service(void); + +extern struct sfw_test_client_ops brw_test_client; +extern struct srpc_service brw_test_service; +void brw_init_test_client(void); +void brw_init_test_service(void); + +#endif /* __SELFTEST_SELFTEST_H__ */ diff --git a/drivers/staging/lustrefsx/lnet/selftest/timer.c b/drivers/staging/lustrefsx/lnet/selftest/timer.c new file mode 100644 index 0000000000000..8a35334b065cd --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/timer.c @@ -0,0 +1,244 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/selftest/timer.c + * + * Author: Isaac Huang + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "selftest.h" + + +/* + * Timers are implemented as a sorted queue of expiry times. The queue + * is slotted, with each slot holding timers which expire in a + * 2**STTIMER_MINPOLL (8) second period. The timers in each slot are + * sorted by increasing expiry time. The number of slots is 2**7 (128), + * to cover a time period of 1024 seconds into the future before wrapping. 
+ */ +#define STTIMER_MINPOLL 3 /* log2 min poll interval (8 s) */ +#define STTIMER_SLOTTIME (1 << STTIMER_MINPOLL) +#define STTIMER_SLOTTIMEMASK (~(STTIMER_SLOTTIME - 1)) +#define STTIMER_NSLOTS (1 << 7) +#define STTIMER_SLOT(t) (&stt_data.stt_hash[(((t) >> STTIMER_MINPOLL) & \ + (STTIMER_NSLOTS - 1))]) + +static struct st_timer_data { + spinlock_t stt_lock; + /* start time of the slot processed previously */ + time64_t stt_prev_slot; + struct list_head stt_hash[STTIMER_NSLOTS]; + int stt_shuttingdown; + wait_queue_head_t stt_waitq; + int stt_nthreads; +} stt_data; + +void +stt_add_timer(struct stt_timer *timer) +{ + struct list_head *pos; + + spin_lock(&stt_data.stt_lock); + + LASSERT(stt_data.stt_nthreads > 0); + LASSERT(!stt_data.stt_shuttingdown); + LASSERT(timer->stt_func != NULL); + LASSERT(list_empty(&timer->stt_list)); + LASSERT(timer->stt_expires > ktime_get_real_seconds()); + + /* a simple insertion sort */ + list_for_each_prev(pos, STTIMER_SLOT(timer->stt_expires)) { + struct stt_timer *old = list_entry(pos, struct stt_timer, + stt_list); + + if (timer->stt_expires >= old->stt_expires) + break; + } + list_add(&timer->stt_list, pos); + + spin_unlock(&stt_data.stt_lock); +} + +/* + * The function returns whether it has deactivated a pending timer or not. + * (ie. del_timer() of an inactive timer returns 0, del_timer() of an + * active timer returns 1.) + * + * CAVEAT EMPTOR: + * When 0 is returned, it is possible that timer->stt_func _is_ running on + * another CPU. + */ +int +stt_del_timer(struct stt_timer *timer) +{ + int ret = 0; + + spin_lock(&stt_data.stt_lock); + + LASSERT(stt_data.stt_nthreads > 0); + LASSERT(!stt_data.stt_shuttingdown); + + if (!list_empty(&timer->stt_list)) { + ret = 1; + list_del_init(&timer->stt_list); + } + + spin_unlock(&stt_data.stt_lock); + return ret; +} + +/* called with stt_data.stt_lock held */ +static int +stt_expire_list(struct list_head *slot, time64_t now) +{ + int expired = 0; + struct stt_timer *timer; + + while (!list_empty(slot)) { + timer = list_entry(slot->next, struct stt_timer, stt_list); + + if (timer->stt_expires > now) + break; + + list_del_init(&timer->stt_list); + spin_unlock(&stt_data.stt_lock); + + expired++; + (*timer->stt_func) (timer->stt_data); + + spin_lock(&stt_data.stt_lock); + } + + return expired; +} + +static int +stt_check_timers(time64_t *last) +{ + int expired = 0; + time64_t now; + time64_t this_slot; + + now = ktime_get_real_seconds(); + this_slot = now & STTIMER_SLOTTIMEMASK; + + spin_lock(&stt_data.stt_lock); + + while (this_slot >= *last) { + expired += stt_expire_list(STTIMER_SLOT(this_slot), now); + this_slot = this_slot - STTIMER_SLOTTIME; + } + + *last = now & STTIMER_SLOTTIMEMASK; + spin_unlock(&stt_data.stt_lock); + return expired; +} + + +static int +stt_timer_main (void *arg) +{ + int rc = 0; + + while (!stt_data.stt_shuttingdown) { + stt_check_timers(&stt_data.stt_prev_slot); + + rc = wait_event_timeout(stt_data.stt_waitq, + stt_data.stt_shuttingdown, + cfs_time_seconds(STTIMER_SLOTTIME)); + } + + spin_lock(&stt_data.stt_lock); + stt_data.stt_nthreads--; + spin_unlock(&stt_data.stt_lock); + return rc; +} + +static int +stt_start_timer_thread (void) +{ + struct task_struct *task; + + LASSERT(!stt_data.stt_shuttingdown); + + task = kthread_run(stt_timer_main, NULL, "st_timer"); + if (IS_ERR(task)) + return PTR_ERR(task); + + spin_lock(&stt_data.stt_lock); + stt_data.stt_nthreads++; + spin_unlock(&stt_data.stt_lock); + return 0; +} + + +int +stt_startup (void) +{ + int rc = 0; + int i; + + 
stt_data.stt_shuttingdown = 0; + stt_data.stt_prev_slot = ktime_get_real_seconds() & STTIMER_SLOTTIMEMASK; + + spin_lock_init(&stt_data.stt_lock); + for (i = 0; i < STTIMER_NSLOTS; i++) + INIT_LIST_HEAD(&stt_data.stt_hash[i]); + + stt_data.stt_nthreads = 0; + init_waitqueue_head(&stt_data.stt_waitq); + rc = stt_start_timer_thread(); + if (rc != 0) + CERROR ("Can't spawn timer thread: %d\n", rc); + + return rc; +} + +void +stt_shutdown(void) +{ + int i; + + spin_lock(&stt_data.stt_lock); + + for (i = 0; i < STTIMER_NSLOTS; i++) + LASSERT(list_empty(&stt_data.stt_hash[i])); + + stt_data.stt_shuttingdown = 1; + + wake_up(&stt_data.stt_waitq); + lst_wait_until(stt_data.stt_nthreads == 0, stt_data.stt_lock, + "waiting for %d threads to terminate\n", + stt_data.stt_nthreads); + + spin_unlock(&stt_data.stt_lock); +} diff --git a/drivers/staging/lustrefsx/lnet/selftest/timer.h b/drivers/staging/lustrefsx/lnet/selftest/timer.h new file mode 100644 index 0000000000000..bd90553e2d942 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/timer.h @@ -0,0 +1,48 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lnet/selftest/timer.h + * + * Author: Isaac Huang + */ +#ifndef __SELFTEST_TIMER_H__ +#define __SELFTEST_TIMER_H__ + +struct stt_timer { + struct list_head stt_list; + time64_t stt_expires; + void (*stt_func)(void *); + void *stt_data; +}; + +void stt_add_timer(struct stt_timer *timer); +int stt_del_timer(struct stt_timer *timer); +int stt_startup(void); +void stt_shutdown(void); + +#endif /* __SELFTEST_TIMER_H__ */ diff --git a/drivers/staging/lustrefsx/lustre/LICENSE b/drivers/staging/lustrefsx/lustre/LICENSE new file mode 100644 index 0000000000000..edb73cdedca6a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/LICENSE @@ -0,0 +1,372 @@ +Each file in this distribution contains a header stating the copyright +owner(s), and the licensing terms for that file. Some files are not +eligible for copyright protection, and contain neither. + +There are many files which may be covered by a separate license that +you signed or otherwise agreed to before downloading this software. +If you did not agree to such an agreement, or if the file does not +mention that license, then you can redistribute and/or modify it under +the terms of version 2 of the GNU General Public License. Each file +is very clear about which license is applicable. + +In any case, Lustre is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the license +text for more details. + +Reproduced below is the GNU General Public License version 2, and +Linus's clarifying statement from the Linux kernel source code: + +---------------------------------------- + + NOTE! This copyright does *not* cover user programs that use kernel + services by normal system calls - this is merely considered normal use + of the kernel, and does *not* fall under the heading of "derived work". + Also note that the GPL below is copyrighted by the Free Software + Foundation, but the instance of code that it refers to (the Linux + kernel) is copyrighted by me and others who actually wrote it. + + Linus Torvalds + +---------------------------------------- + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. 
+ + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. 
But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. 
These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. 
If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) 19yy + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. 
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) 19yy <name of author>
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Library General
+Public License instead of this License.
diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_handler.c b/drivers/staging/lustrefsx/lustre/fid/fid_handler.c
new file mode 100644
index 0000000000000..06196e66b971e
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/fid/fid_handler.c
@@ -0,0 +1,616 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lustre/fid/fid_handler.c
+ *
+ * Lustre Sequence Manager
+ *
+ * Author: Yury Umanets
+ */
+
+#define DEBUG_SUBSYSTEM S_FID
+
+#include <libcfs/libcfs.h>
+#include <linux/module.h>
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_fid.h>
+#include <lustre_fld.h>
+#include <lu_target.h>
+#include "fid_internal.h"
+
+/* Assigns client to sequence controller node. */
+int seq_server_set_cli(const struct lu_env *env, struct lu_server_seq *seq,
+                       struct lu_client_seq *cli)
+{
+    int rc = 0;
+    ENTRY;
+
+    /*
+     * Asking the client for a new range, assigning that range to
+     * ->seq_space and writing the seq state to the backing store
+     * should be atomic.
+     */
+    mutex_lock(&seq->lss_mutex);
+
+    if (!cli) {
+        CDEBUG(D_INFO, "%s: Detached sequence client\n", seq->lss_name);
+        seq->lss_cli = NULL;
+        GOTO(out_up, rc = 0);
+    }
+
+    if (seq->lss_cli) {
+        CDEBUG(D_HA, "%s: Sequence controller is already assigned\n",
+               seq->lss_name);
+        GOTO(out_up, rc = -EEXIST);
+    }
+
+    CDEBUG(D_INFO, "%s: Attached sequence controller %s\n",
+           seq->lss_name, cli->lcs_name);
+
+    seq->lss_cli = cli;
+    cli->lcs_space.lsr_index = seq->lss_site->ss_node_id;
+    EXIT;
+out_up:
+    mutex_unlock(&seq->lss_mutex);
+    return rc;
+}
+EXPORT_SYMBOL(seq_server_set_cli);
+/*
+ * allocate \a width units of sequence from range \a from.
+ */
+static inline void range_alloc(struct lu_seq_range *to,
+                               struct lu_seq_range *from,
+                               __u64 width)
+{
+    width = min(lu_seq_range_space(from), width);
+    to->lsr_start = from->lsr_start;
+    to->lsr_end = from->lsr_start + width;
+    from->lsr_start += width;
+}
+
+/**
+ * On the controller node, allocate a new super sequence for a regular
+ * sequence server.  As the super sequence controller, this node is
+ * supposed to maintain the FLD and update the index.
+ * The \a out range always carries the correct MDT index of the requester.
+ */
+static int __seq_server_alloc_super(struct lu_server_seq *seq,
+                                    struct lu_seq_range *out,
+                                    const struct lu_env *env)
+{
+    struct lu_seq_range *space = &seq->lss_space;
+    int rc;
+    ENTRY;
+
+    LASSERT(lu_seq_range_is_sane(space));
+
+    if (lu_seq_range_is_exhausted(space)) {
+        CERROR("%s: Sequence space is exhausted\n",
+               seq->lss_name);
+        RETURN(-ENOSPC);
+    } else {
+        range_alloc(out, space, seq->lss_width);
+    }
+
+    rc = seq_store_update(env, seq, out, 1 /* sync */);
+
+    LCONSOLE_INFO("%s: super-sequence allocation rc = %d " DRANGE"\n",
+                  seq->lss_name, rc, PRANGE(out));
+
+    RETURN(rc);
+}
+
+int seq_server_alloc_super(struct lu_server_seq *seq,
+                           struct lu_seq_range *out,
+                           const struct lu_env *env)
+{
+    int rc;
+    ENTRY;
+
+    mutex_lock(&seq->lss_mutex);
+    rc = __seq_server_alloc_super(seq, out, env);
+    mutex_unlock(&seq->lss_mutex);
+
+    RETURN(rc);
+}
+
+int seq_server_alloc_spec(struct lu_server_seq *seq,
+                          struct lu_seq_range *spec,
+                          const struct lu_env *env)
+{
+    struct lu_seq_range *space = &seq->lss_space;
+    int rc = -ENOSPC;
+    ENTRY;
+
+    /*
+     * In some cases (like recovery after a disaster)
+     * we may need to allocate sequences manually.
+     * Notice some sequences can be lost if the requested
+     * range doesn't start at the beginning of the current
+     * free space.  Also notice it's not possible now
+     * to allocate sequences out of natural order.
+     */
+    if (spec->lsr_start >= spec->lsr_end)
+        RETURN(-EINVAL);
+    if (spec->lsr_flags != LU_SEQ_RANGE_MDT &&
+        spec->lsr_flags != LU_SEQ_RANGE_OST)
+        RETURN(-EINVAL);
+
+    mutex_lock(&seq->lss_mutex);
+    if (spec->lsr_start >= space->lsr_start) {
+        space->lsr_start = spec->lsr_end;
+        rc = seq_store_update(env, seq, spec, 1 /* sync */);
+
+        LCONSOLE_INFO("%s: "DRANGE" sequences allocated: rc = %d\n",
+                      seq->lss_name, PRANGE(spec), rc);
+    }
+    mutex_unlock(&seq->lss_mutex);
+
+    RETURN(rc);
+}
+
+static int __seq_set_init(const struct lu_env *env,
+                          struct lu_server_seq *seq)
+{
+    struct lu_seq_range *space = &seq->lss_space;
+    int rc;
+
+    range_alloc(&seq->lss_lowater_set, space, seq->lss_set_width);
+    range_alloc(&seq->lss_hiwater_set, space, seq->lss_set_width);
+
+    rc = seq_store_update(env, seq, NULL, 1);
+
+    return rc;
+}
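+/*
+ * Illustration (hypothetical numbers, not taken from this code): with
+ * lss_set_width == 10000 and lss_width == 1000, __seq_set_init() carves
+ * [0, 10000) into the low-water set and [10000, 20000) into the high-water
+ * set.  range_alloc_set() below then hands out 1000-unit chunks from the
+ * low-water set; once it is exhausted, the high-water set takes its place
+ * and a fresh high-water set is carved out of lss_space, so only every
+ * tenth allocation has to touch the on-disk state.
+ */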
+/*
+ * This function implements the new seq allocation algorithm using async
+ * updates to the seq file on disk (ref bug 18857 for details).
+ * There are three variables to keep track of this process:
+ *
+ * lss_space;       - available sequence space
+ * lss_lowater_set; - lu_seq_range for all seqs before barrier, i.e. safe to use
+ * lss_hiwater_set; - lu_seq_range after barrier, i.e. allocated but may be
+ *                    not yet committed
+ *
+ * When lss_lowater_set reaches its end, it is replaced with the hiwater one
+ * and a write operation is initiated to allocate a new hiwater range.
+ * If the last seq write operation is still not committed, the current
+ * operation is flagged as a sync write op.
+ */
+static int range_alloc_set(const struct lu_env *env,
+                           struct lu_seq_range *out,
+                           struct lu_server_seq *seq)
+{
+    struct lu_seq_range *space = &seq->lss_space;
+    struct lu_seq_range *loset = &seq->lss_lowater_set;
+    struct lu_seq_range *hiset = &seq->lss_hiwater_set;
+    int rc = 0;
+
+    if (lu_seq_range_is_zero(loset))
+        __seq_set_init(env, seq);
+
+    if (OBD_FAIL_CHECK(OBD_FAIL_SEQ_ALLOC)) /* exhaust set */
+        loset->lsr_start = loset->lsr_end;
+
+    if (lu_seq_range_is_exhausted(loset)) {
+        /* reached high water mark. */
+        struct lu_device *dev = seq->lss_site->ss_lu->ls_top_dev;
+        int obd_num_clients = dev->ld_obd->obd_num_exports;
+        __u64 set_sz;
+
+        /* calculate new seq width based on number of clients */
+        set_sz = max(seq->lss_set_width,
+                     obd_num_clients * seq->lss_width);
+        set_sz = min(lu_seq_range_space(space), set_sz);
+
+        /* Switch to hiwater range now */
+        *loset = *hiset;
+        /* allocate new hiwater range */
+        range_alloc(hiset, space, set_sz);
+
+        /* update ondisk seq with new *space */
+        rc = seq_store_update(env, seq, NULL, seq->lss_need_sync);
+    }
+
+    LASSERTF(!lu_seq_range_is_exhausted(loset) ||
+             lu_seq_range_is_sane(loset),
+             DRANGE"\n", PRANGE(loset));
+
+    if (rc == 0)
+        range_alloc(out, loset, seq->lss_width);
+
+    RETURN(rc);
+}
+
+/**
+ * Check if the sequence server has sequences available
+ *
+ * Check if the sequence server has sequences available; if not, allocate a
+ * super sequence from the sequence manager (MDT0).
+ *
+ * \param[in] env	execution environment
+ * \param[in] seq	server sequence
+ *
+ * \retval	negative errno if allocating a new sequence fails
+ * \retval	0 if there is enough sequence space or allocating
+ *		a new sequence succeeds
+ */
+int seq_server_check_and_alloc_super(const struct lu_env *env,
+                                     struct lu_server_seq *seq)
+{
+    struct lu_seq_range *space = &seq->lss_space;
+    int rc = 0;
+
+    ENTRY;
+
+    /* Check if available space ends and allocate new super seq */
+    if (lu_seq_range_is_exhausted(space)) {
+        if (!seq->lss_cli) {
+            CERROR("%s: No sequence controller is attached.\n",
+                   seq->lss_name);
+            RETURN(-ENODEV);
+        }
+
+        rc = seq_client_alloc_super(seq->lss_cli, env);
+        if (rc) {
+            CDEBUG(D_HA,
+                   "%s: Can't allocate super-sequence: rc = %d\n",
+                   seq->lss_name, rc);
+            RETURN(rc);
+        }
+
+        /* Saving new range to allocation space. 
*/ + *space = seq->lss_cli->lcs_space; + LASSERT(lu_seq_range_is_sane(space)); + if (!seq->lss_cli->lcs_srv) { + struct lu_server_fld *fld; + + /* Insert it to the local FLDB */ + fld = seq->lss_site->ss_server_fld; + mutex_lock(&fld->lsf_lock); + rc = fld_insert_entry(env, fld, space); + mutex_unlock(&fld->lsf_lock); + } + } + + if (lu_seq_range_is_zero(&seq->lss_lowater_set)) + __seq_set_init(env, seq); + + RETURN(rc); +} +EXPORT_SYMBOL(seq_server_check_and_alloc_super); + +static int __seq_server_alloc_meta(struct lu_server_seq *seq, + struct lu_seq_range *out, + const struct lu_env *env) +{ + struct lu_seq_range *space = &seq->lss_space; + int rc = 0; + + ENTRY; + + LASSERT(lu_seq_range_is_sane(space)); + + rc = seq_server_check_and_alloc_super(env, seq); + if (rc < 0) { + if (rc == -EINPROGRESS) { + static int printed; + + if (printed++ % 8 == 0) + LCONSOLE_INFO("%s: Waiting to contact MDT0000 to allocate super-sequence: rc = %d\n", + seq->lss_name, rc); + } else { + CERROR("%s: Allocated super-sequence failed: rc = %d\n", + seq->lss_name, rc); + } + RETURN(rc); + } + + rc = range_alloc_set(env, out, seq); + if (rc != 0) { + CERROR("%s: Allocated meta-sequence failed: rc = %d\n", + seq->lss_name, rc); + RETURN(rc); + } + + CDEBUG(D_INFO, "%s: Allocated meta-sequence " DRANGE"\n", + seq->lss_name, PRANGE(out)); + + RETURN(rc); +} + +int seq_server_alloc_meta(struct lu_server_seq *seq, + struct lu_seq_range *out, + const struct lu_env *env) +{ + int rc; + ENTRY; + + mutex_lock(&seq->lss_mutex); + rc = __seq_server_alloc_meta(seq, out, env); + mutex_unlock(&seq->lss_mutex); + + RETURN(rc); +} +EXPORT_SYMBOL(seq_server_alloc_meta); + +static int seq_server_handle(struct lu_site *site, + const struct lu_env *env, + __u32 opc, struct lu_seq_range *out) +{ + int rc; + struct seq_server_site *ss_site; + struct dt_device *dev; + ENTRY; + + ss_site = lu_site2seq(site); + + switch (opc) { + case SEQ_ALLOC_META: + if (!ss_site->ss_server_seq) { + rc = -EINVAL; + CERROR("Sequence server is not initialized: rc = %d\n", + rc); + RETURN(rc); + } + + dev = lu2dt_dev(ss_site->ss_server_seq->lss_obj->do_lu.lo_dev); + if (dev->dd_rdonly) + RETURN(-EROFS); + + rc = seq_server_alloc_meta(ss_site->ss_server_seq, out, env); + break; + case SEQ_ALLOC_SUPER: + if (!ss_site->ss_control_seq) { + rc = -EINVAL; + CERROR("Sequence controller is not initialized: rc = %d\n", + rc); + RETURN(rc); + } + + dev = lu2dt_dev(ss_site->ss_control_seq->lss_obj->do_lu.lo_dev); + if (dev->dd_rdonly) + RETURN(-EROFS); + + rc = seq_server_alloc_super(ss_site->ss_control_seq, out, env); + break; + default: + rc = -EINVAL; + break; + } + + RETURN(rc); +} + +static int seq_handler(struct tgt_session_info *tsi) +{ + struct lu_seq_range *out, *tmp; + struct lu_site *site; + int rc; + __u32 *opc; + + ENTRY; + + LASSERT(!(lustre_msg_get_flags(tgt_ses_req(tsi)->rq_reqmsg) & MSG_REPLAY)); + site = tsi->tsi_exp->exp_obd->obd_lu_dev->ld_site; + LASSERT(site != NULL); + + opc = req_capsule_client_get(tsi->tsi_pill, &RMF_SEQ_OPC); + if (opc) { + out = req_capsule_server_get(tsi->tsi_pill, &RMF_SEQ_RANGE); + if (!out) + RETURN(err_serious(-EPROTO)); + + tmp = req_capsule_client_get(tsi->tsi_pill, &RMF_SEQ_RANGE); + + /* + * seq client passed mdt id, we need to pass that using out + * range parameter + */ + out->lsr_index = tmp->lsr_index; + out->lsr_flags = tmp->lsr_flags; + rc = seq_server_handle(site, tsi->tsi_env, *opc, out); + } else { + rc = err_serious(-EPROTO); + } + + RETURN(rc); +} + +struct tgt_handler seq_handlers[] = { 
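+    /* SEQ_QUERY is the only operation served by the sequence target */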
+TGT_SEQ_HDL(HAS_REPLY, SEQ_QUERY, seq_handler), +}; +EXPORT_SYMBOL(seq_handlers); + +/* context key constructor/destructor: seq_key_init, seq_key_fini */ +LU_KEY_INIT_FINI(seq, struct seq_thread_info); + +/* context key: seq_thread_key */ +LU_CONTEXT_KEY_DEFINE(seq, LCT_MD_THREAD | LCT_DT_THREAD); + +static void seq_server_debugfs_fini(struct lu_server_seq *seq) +{ + debugfs_remove_recursive(seq->lss_debugfs_entry); +} + +static void seq_server_debugfs_init(struct lu_server_seq *seq) +{ + ENTRY; + + seq->lss_debugfs_entry = debugfs_create_dir(seq->lss_name, + seq_debugfs_dir); + + ldebugfs_add_vars(seq->lss_debugfs_entry, + seq_server_debugfs_list, seq); + + if (seq->lss_type == LUSTRE_SEQ_CONTROLLER) + debugfs_create_file("fldb", 0644, seq->lss_debugfs_entry, + seq, &seq_fld_debugfs_seq_fops); +} + +int seq_server_init(const struct lu_env *env, + struct lu_server_seq *seq, + struct dt_device *dev, + const char *prefix, + enum lu_mgr_type type, + struct seq_server_site *ss) +{ + int rc, is_srv = (type == LUSTRE_SEQ_SERVER); + ENTRY; + + LASSERT(dev != NULL); + LASSERT(prefix != NULL); + LASSERT(ss != NULL); + LASSERT(ss->ss_lu != NULL); + + /* + * Check all lu_fid fields are converted in fid_cpu_to_le() and friends + * and that there is no padding added by compiler to the struct. + */ + { + struct lu_fid tst; + + BUILD_BUG_ON(sizeof(tst) != sizeof(tst.f_seq) + + sizeof(tst.f_oid) + sizeof(tst.f_ver)); + } + + seq->lss_cli = NULL; + seq->lss_type = type; + seq->lss_site = ss; + lu_seq_range_init(&seq->lss_space); + + lu_seq_range_init(&seq->lss_lowater_set); + lu_seq_range_init(&seq->lss_hiwater_set); + seq->lss_set_width = LUSTRE_SEQ_BATCH_WIDTH; + + mutex_init(&seq->lss_mutex); + + seq->lss_width = is_srv ? + LUSTRE_SEQ_META_WIDTH : LUSTRE_SEQ_SUPER_WIDTH; + + snprintf(seq->lss_name, sizeof(seq->lss_name), + "%s-%s", (is_srv ? "srv" : "ctl"), prefix); + + rc = seq_store_init(seq, env, dev); + if (rc) + GOTO(out, rc); + /* Request backing store for saved sequence info. */ + rc = seq_store_read(seq, env); + if (rc == -ENODATA) { + + /* Nothing is read, init by default value. */ + seq->lss_space = is_srv ? + LUSTRE_SEQ_ZERO_RANGE : + LUSTRE_SEQ_SPACE_RANGE; + + seq->lss_space.lsr_index = ss->ss_node_id; + LCONSOLE_INFO("%s: No data found on store. 
Initialize space: rc = %d\n", + seq->lss_name, rc); + + rc = seq_store_update(env, seq, NULL, 0); + if (rc) { + CERROR("%s: Can't write space data: rc = %d\n", + seq->lss_name, rc); + } + } else if (rc) { + CERROR("%s: Can't read space data: rc = %d\n", + seq->lss_name, rc); + GOTO(out, rc); + } + + if (is_srv) { + LASSERT(lu_seq_range_is_sane(&seq->lss_space)); + } else { + LASSERT(!lu_seq_range_is_zero(&seq->lss_space) && + lu_seq_range_is_sane(&seq->lss_space)); + } + + seq_server_debugfs_init(seq); + + EXIT; +out: + if (rc) + seq_server_fini(seq, env); + return rc; +} +EXPORT_SYMBOL(seq_server_init); + +void seq_server_fini(struct lu_server_seq *seq, + const struct lu_env *env) +{ + ENTRY; + + seq_server_debugfs_fini(seq); + seq_store_fini(seq, env); + + EXIT; +} +EXPORT_SYMBOL(seq_server_fini); + +int seq_site_fini(const struct lu_env *env, struct seq_server_site *ss) +{ + if (!ss) + RETURN(0); + + if (ss->ss_server_seq) { + seq_server_fini(ss->ss_server_seq, env); + OBD_FREE_PTR(ss->ss_server_seq); + ss->ss_server_seq = NULL; + } + + if (ss->ss_control_seq) { + seq_server_fini(ss->ss_control_seq, env); + OBD_FREE_PTR(ss->ss_control_seq); + ss->ss_control_seq = NULL; + } + + if (ss->ss_client_seq) { + seq_client_fini(ss->ss_client_seq); + OBD_FREE_PTR(ss->ss_client_seq); + ss->ss_client_seq = NULL; + } + + RETURN(0); +} +EXPORT_SYMBOL(seq_site_fini); + +int fid_server_mod_init(void) +{ + LU_CONTEXT_KEY_INIT(&seq_thread_key); + return lu_context_key_register(&seq_thread_key); +} + +void fid_server_mod_exit(void) +{ + lu_context_key_degister(&seq_thread_key); +} diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_internal.h b/drivers/staging/lustrefsx/lustre/fid/fid_internal.h new file mode 100644 index 0000000000000..c2b0f5f688f1d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fid/fid_internal.h @@ -0,0 +1,95 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/fid/fid_internal.h + * + * Author: Yury Umanets + */ +#ifndef __FID_INTERNAL_H +#define __FID_INTERNAL_H + +#include + +#ifdef HAVE_SERVER_SUPPORT +# define HAVE_SEQ_SERVER + +struct req_capsule; + +struct seq_thread_info { + struct req_capsule *sti_pill; + struct lu_seq_range sti_space; + struct lu_buf sti_buf; +}; + +enum { + SEQ_TXN_STORE_CREDITS = 20 +}; + +extern struct lu_context_key seq_thread_key; + +extern struct ldebugfs_vars seq_server_debugfs_list[]; + +/* Store API functions. 
*/
+struct dt_device;
+
+int seq_store_init(struct lu_server_seq *seq,
+                   const struct lu_env *env,
+                   struct dt_device *dt);
+
+void seq_store_fini(struct lu_server_seq *seq,
+                    const struct lu_env *env);
+
+int seq_store_read(struct lu_server_seq *seq,
+                   const struct lu_env *env);
+
+int seq_store_update(const struct lu_env *env, struct lu_server_seq *seq,
+                     struct lu_seq_range *out, int sync);
+
+int seq_server_alloc_spec(struct lu_server_seq *seq,
+                          struct lu_seq_range *spec,
+                          const struct lu_env *env);
+
+int fid_server_mod_init(void);
+
+void fid_server_mod_exit(void);
+
+# endif /* HAVE_SERVER_SUPPORT */
+
+/* Functions used internally in module. */
+int seq_client_alloc_super(struct lu_client_seq *seq,
+                           const struct lu_env *env);
+
+extern struct dentry *seq_debugfs_dir;
+
+extern struct ldebugfs_vars seq_client_debugfs_list[];
+
+extern const struct file_operations seq_fld_debugfs_seq_fops;
+
+#endif /* __FID_INTERNAL_H */
diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_lib.c b/drivers/staging/lustrefsx/lustre/fid/fid_lib.c
new file mode 100644
index 0000000000000..4bd05526fe283
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/fid/fid_lib.c
@@ -0,0 +1,98 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2014, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lustre/fid/fid_lib.c
+ *
+ * Miscellaneous fid functions.
+ *
+ * Author: Nikita Danilov
+ * Author: Yury Umanets
+ */
+
+#define DEBUG_SUBSYSTEM S_FID
+
+#include <libcfs/libcfs.h>
+#include <uapi/linux/lustre/lustre_idl.h>
+#include <lustre_fid.h>
+
+/**
+ * A cluster-wide range from which fid-sequences are granted to servers and
+ * then clients.
+ *
+ * Fid namespace:
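+ * (seq is the sequence number, oid the object index within that
+ *  sequence, and ver the object version)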
+ * <pre>
+ * Normal FID:        seq:64 [2^33,2^64-1]      oid:32          ver:32
+ * IGIF      :        0:32, ino:32              gen:32          0:32
+ * IDIF      :        0:31, 1:1, ost-index:16,  objd:48         0:32
+ * </pre>
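+ *
+ * For example, a normal FID printed with DFID looks like
+ * [0x200000400:0x1:0x0] (seq:oid:ver; a made-up value at the start of
+ * the normal sequence space).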
+ *
+ * The first 0x400 sequences of normal FID are reserved for special purposes.
+ * FID_SEQ_START + 1 is for local file id generation.
+ * FID_SEQ_START + 2 is for the .lustre directory and its objects.
+ */
+const struct lu_seq_range LUSTRE_SEQ_SPACE_RANGE = {
+	.lsr_start = FID_SEQ_NORMAL,
+	.lsr_end = (__u64)~0ULL,
+};
+
+/* Zero range, used for init and other purposes. */
+const struct lu_seq_range LUSTRE_SEQ_ZERO_RANGE = {
+	.lsr_start = 0,
+};
+
+/* Lustre Big Fs Lock fid. */
+const struct lu_fid LUSTRE_BFL_FID = { .f_seq = FID_SEQ_SPECIAL,
+				       .f_oid = FID_OID_SPECIAL_BFL,
+				       .f_ver = 0x0000000000000000 };
+EXPORT_SYMBOL(LUSTRE_BFL_FID);
+
+/** Special fid for ".lustre" directory */
+const struct lu_fid LU_DOT_LUSTRE_FID = { .f_seq = FID_SEQ_DOT_LUSTRE,
+					  .f_oid = FID_OID_DOT_LUSTRE,
+					  .f_ver = 0x0000000000000000 };
+EXPORT_SYMBOL(LU_DOT_LUSTRE_FID);
+
+/** Special fid for "fid" special object in .lustre */
+const struct lu_fid LU_OBF_FID = { .f_seq = FID_SEQ_DOT_LUSTRE,
+				   .f_oid = FID_OID_DOT_LUSTRE_OBF,
+				   .f_ver = 0x0000000000000000 };
+EXPORT_SYMBOL(LU_OBF_FID);
+
+/** Special fid for "lost+found" special object in .lustre */
+const struct lu_fid LU_LPF_FID = { .f_seq = FID_SEQ_DOT_LUSTRE,
+				   .f_oid = FID_OID_DOT_LUSTRE_LPF,
+				   .f_ver = 0x0000000000000000 };
+EXPORT_SYMBOL(LU_LPF_FID);
+
+/** "/lost+found" - special FID for ldiskfs backend, invisible to clients. */
+const struct lu_fid LU_BACKEND_LPF_FID = { .f_seq = FID_SEQ_LOCAL_FILE,
+					   .f_oid = OSD_LPF_OID,
+					   .f_ver = 0x0000000000000000 };
+EXPORT_SYMBOL(LU_BACKEND_LPF_FID);
diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_request.c b/drivers/staging/lustrefsx/lustre/fid/fid_request.c
new file mode 100644
index 0000000000000..2fa8590506c0f
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/fid/fid_request.c
@@ -0,0 +1,523 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/fid/fid_request.c + * + * Lustre Sequence Manager + * + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FID + +#include +#include +#include +#include +#include +#include +#include +/* mdc RPC locks */ +#include +#include "fid_internal.h" + +struct dentry *seq_debugfs_dir; + +static int seq_client_rpc(struct lu_client_seq *seq, + struct lu_seq_range *output, __u32 opc, + const char *opcname) +{ + struct obd_export *exp = seq->lcs_exp; + struct ptlrpc_request *req; + struct lu_seq_range *out, *in; + __u32 *op; + unsigned int debug_mask; + int rc; + ENTRY; + + LASSERT(exp != NULL && !IS_ERR(exp)); + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), &RQF_SEQ_QUERY, + LUSTRE_MDS_VERSION, SEQ_QUERY); + if (!req) + RETURN(-ENOMEM); + + /* Init operation code */ + op = req_capsule_client_get(&req->rq_pill, &RMF_SEQ_OPC); + *op = opc; + + /* Zero out input range, this is not recovery yet. */ + in = req_capsule_client_get(&req->rq_pill, &RMF_SEQ_RANGE); + lu_seq_range_init(in); + + ptlrpc_request_set_replen(req); + + in->lsr_index = seq->lcs_space.lsr_index; + if (seq->lcs_type == LUSTRE_SEQ_METADATA) + fld_range_set_mdt(in); + else + fld_range_set_ost(in); + + if (opc == SEQ_ALLOC_SUPER) { + req->rq_request_portal = SEQ_CONTROLLER_PORTAL; + req->rq_reply_portal = MDC_REPLY_PORTAL; + /* + * During allocating super sequence for data object, + * the current thread might hold the export of MDT0(MDT0 + * precreating objects on this OST), and it will send the + * request to MDT0 here, so we can not keep resending the + * request here, otherwise if MDT0 is failed(umounted), + * it can not release the export of MDT0 + */ + if (seq->lcs_type == LUSTRE_SEQ_DATA) { + req->rq_no_resend = 1; + req->rq_no_delay = 1; + } + debug_mask = D_CONSOLE; + } else { + if (seq->lcs_type == LUSTRE_SEQ_METADATA) { + req->rq_reply_portal = MDC_REPLY_PORTAL; + req->rq_request_portal = SEQ_METADATA_PORTAL; + } else { + req->rq_reply_portal = OSC_REPLY_PORTAL; + req->rq_request_portal = SEQ_DATA_PORTAL; + } + + debug_mask = D_INFO; + } + + /* Allow seq client RPC during recovery time. */ + req->rq_allow_replay = 1; + + ptlrpc_at_set_req_timeout(req); + + rc = ptlrpc_queue_wait(req); + + if (rc) + GOTO(out_req, rc); + + out = req_capsule_server_get(&req->rq_pill, &RMF_SEQ_RANGE); + *output = *out; + + if (!lu_seq_range_is_sane(output)) { + CERROR("%s: Invalid range received from server: " + DRANGE"\n", seq->lcs_name, PRANGE(output)); + GOTO(out_req, rc = -EINVAL); + } + + if (lu_seq_range_is_exhausted(output)) { + CERROR("%s: Range received from server is exhausted: " + DRANGE"]\n", seq->lcs_name, PRANGE(output)); + GOTO(out_req, rc = -EINVAL); + } + + CDEBUG_LIMIT(debug_mask, "%s: Allocated %s-sequence "DRANGE"]\n", + seq->lcs_name, opcname, PRANGE(output)); + + EXIT; +out_req: + ptlrpc_req_finished(req); + return rc; +} + +/* Request sequence-controller node to allocate new super-sequence. 
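+ *
+ * A rough caller sketch (retry_later is a hypothetical label); a
+ * return of -EINPROGRESS means the connection to the sequence
+ * controller (lcs_exp) is not established yet, so the caller may
+ * simply try again later:
+ *
+ *	rc = seq_client_alloc_super(seq, env);
+ *	if (rc == -EINPROGRESS)
+ *		goto retry_later;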
*/ +int seq_client_alloc_super(struct lu_client_seq *seq, + const struct lu_env *env) +{ + int rc; + ENTRY; + + mutex_lock(&seq->lcs_mutex); + + if (seq->lcs_srv) { +#ifdef HAVE_SEQ_SERVER + LASSERT(env != NULL); + rc = seq_server_alloc_super(seq->lcs_srv, &seq->lcs_space, env); +#else + rc = 0; +#endif + } else { + /* + * Check whether the connection to seq controller has been + * setup (lcs_exp != NULL) + */ + if (!seq->lcs_exp) { + mutex_unlock(&seq->lcs_mutex); + RETURN(-EINPROGRESS); + } + + rc = seq_client_rpc(seq, &seq->lcs_space, + SEQ_ALLOC_SUPER, "super"); + } + mutex_unlock(&seq->lcs_mutex); + RETURN(rc); +} + +/* Request sequence-controller node to allocate new meta-sequence. */ +static int seq_client_alloc_meta(const struct lu_env *env, + struct lu_client_seq *seq) +{ + int rc; + ENTRY; + + if (seq->lcs_srv) { +#ifdef HAVE_SEQ_SERVER + LASSERT(env); + rc = seq_server_alloc_meta(seq->lcs_srv, &seq->lcs_space, env); +#else + rc = 0; +#endif + } else { + do { + /* + * If meta server return -EINPROGRESS or EAGAIN, + * it means meta server might not be ready to + * allocate super sequence from sequence controller + * (MDT0)yet + */ + rc = seq_client_rpc(seq, &seq->lcs_space, + SEQ_ALLOC_META, "meta"); + if (rc == -EINPROGRESS || rc == -EAGAIN) + /* + * MDT0 is not ready, let's wait for 2 + * seconds and retry. + */ + ssleep(2); + + } while (rc == -EINPROGRESS || rc == -EAGAIN); + } + + RETURN(rc); +} + +/* Allocate new sequence for client. */ +static int seq_client_alloc_seq(const struct lu_env *env, + struct lu_client_seq *seq, u64 *seqnr) +{ + int rc; + ENTRY; + + LASSERT(lu_seq_range_is_sane(&seq->lcs_space)); + + if (lu_seq_range_is_exhausted(&seq->lcs_space)) { + rc = seq_client_alloc_meta(env, seq); + if (rc) { + if (rc != -EINPROGRESS) + CERROR("%s: Cannot allocate new meta-sequence: rc = %d\n", + seq->lcs_name, rc); + RETURN(rc); + } else { + CDEBUG(D_INFO, "%s: New range - "DRANGE"\n", + seq->lcs_name, PRANGE(&seq->lcs_space)); + } + } else { + rc = 0; + } + + LASSERT(!lu_seq_range_is_exhausted(&seq->lcs_space)); + *seqnr = seq->lcs_space.lsr_start; + seq->lcs_space.lsr_start += 1; + + CDEBUG(D_INFO, "%s: Allocated sequence [%#llx]\n", seq->lcs_name, + *seqnr); + + RETURN(rc); +} + +/** + * Allocate the whole non-used seq to the caller. + * + * \param[in] env pointer to the thread context + * \param[in,out] seq pointer to the client sequence manager + * \param[out] seqnr to hold the new allocated sequence + * + * \retval 0 for new sequence allocated. + * \retval Negative error number on failure. + */ +int seq_client_get_seq(const struct lu_env *env, + struct lu_client_seq *seq, u64 *seqnr) +{ + int rc; + + LASSERT(seqnr != NULL); + + mutex_lock(&seq->lcs_mutex); + + rc = seq_client_alloc_seq(env, seq, seqnr); + if (rc) { + CERROR("%s: Can't allocate new sequence: rc = %d\n", + seq->lcs_name, rc); + } else { + CDEBUG(D_INFO, "%s: New sequence [0x%16.16llx]\n", + seq->lcs_name, *seqnr); + seq->lcs_fid.f_seq = *seqnr; + seq->lcs_fid.f_ver = 0; + /* + * The caller require the whole seq, + * so marked this seq to be used + */ + if (seq->lcs_type == LUSTRE_SEQ_METADATA) + seq->lcs_fid.f_oid = + LUSTRE_METADATA_SEQ_MAX_WIDTH; + else + seq->lcs_fid.f_oid = LUSTRE_DATA_SEQ_MAX_WIDTH; + } + mutex_unlock(&seq->lcs_mutex); + + return rc; +} +EXPORT_SYMBOL(seq_client_get_seq); + +/** + * Allocate new fid on passed client @seq and save it to @fid. 
+ * + * \param[in] env pointer to the thread context + * \param[in,out] seq pointer to the client sequence manager + * \param[out] fid to hold the new allocated fid + * + * \retval 1 for notify the caller that sequence switch + * is performed to allow it to setup FLD for it. + * \retval 0 for new FID allocated in current sequence. + * \retval Negative error number on failure. + */ +int seq_client_alloc_fid(const struct lu_env *env, + struct lu_client_seq *seq, struct lu_fid *fid) +{ + int rc; + ENTRY; + + LASSERT(seq != NULL); + LASSERT(fid != NULL); + + mutex_lock(&seq->lcs_mutex); + + if (OBD_FAIL_CHECK(OBD_FAIL_SEQ_EXHAUST)) + seq->lcs_fid.f_oid = seq->lcs_width; + + if (unlikely(!fid_is_zero(&seq->lcs_fid) && + fid_oid(&seq->lcs_fid) < seq->lcs_width)) { + /* Just bump last allocated fid and return to caller. */ + seq->lcs_fid.f_oid++; + rc = 0; + } else { + u64 seqnr; + + rc = seq_client_alloc_seq(env, seq, &seqnr); + if (rc) { + if (rc != -EINPROGRESS) + CERROR("%s: Can't allocate new sequence: rc = %d\n", + seq->lcs_name, rc); + } else { + CDEBUG(D_INFO, "%s: New sequence [0x%16.16llx]\n", + seq->lcs_name, seqnr); + + seq->lcs_fid.f_seq = seqnr; + seq->lcs_fid.f_oid = LUSTRE_FID_INIT_OID; + seq->lcs_fid.f_ver = 0; + rc = 1; + } + } + + if (rc >= 0) { + *fid = seq->lcs_fid; + CDEBUG(D_INFO, "%s: Allocated FID "DFID"\n", seq->lcs_name, + PFID(fid)); + } + mutex_unlock(&seq->lcs_mutex); + + RETURN(rc); +} +EXPORT_SYMBOL(seq_client_alloc_fid); + +/* + * Finish the current sequence due to disconnect. + * See mdc_import_event() + */ +void seq_client_flush(struct lu_client_seq *seq) +{ + LASSERT(seq != NULL); + mutex_lock(&seq->lcs_mutex); + + fid_zero(&seq->lcs_fid); + /** + * this id shld not be used for seq range allocation. + * set to -1 for dgb check. + */ + seq->lcs_space.lsr_index = -1; + + lu_seq_range_init(&seq->lcs_space); + mutex_unlock(&seq->lcs_mutex); +} +EXPORT_SYMBOL(seq_client_flush); + +static void seq_client_debugfs_fini(struct lu_client_seq *seq) +{ + debugfs_remove_recursive(seq->lcs_debugfs_entry); +} + +static void seq_client_debugfs_init(struct lu_client_seq *seq) +{ + seq->lcs_debugfs_entry = debugfs_create_dir(seq->lcs_name, + seq_debugfs_dir); + + ldebugfs_add_vars(seq->lcs_debugfs_entry, + seq_client_debugfs_list, seq); +} + +void seq_client_fini(struct lu_client_seq *seq) +{ + ENTRY; + + seq_client_debugfs_fini(seq); + + if (seq->lcs_exp) { + class_export_put(seq->lcs_exp); + seq->lcs_exp = NULL; + } + + seq->lcs_srv = NULL; + EXIT; +} +EXPORT_SYMBOL(seq_client_fini); + +void seq_client_init(struct lu_client_seq *seq, + struct obd_export *exp, + enum lu_cli_type type, + const char *prefix, + struct lu_server_seq *srv) +{ + ENTRY; + + LASSERT(seq != NULL); + LASSERT(prefix != NULL); + + seq->lcs_srv = srv; + seq->lcs_type = type; + + mutex_init(&seq->lcs_mutex); + if (type == LUSTRE_SEQ_METADATA) + seq->lcs_width = LUSTRE_METADATA_SEQ_MAX_WIDTH; + else + seq->lcs_width = LUSTRE_DATA_SEQ_MAX_WIDTH; + + /* Make sure that things are clear before work is started. 
*/ + seq_client_flush(seq); + + if (exp) + seq->lcs_exp = class_export_get(exp); + + snprintf(seq->lcs_name, sizeof(seq->lcs_name), + "cli-%s", prefix); + + seq_client_debugfs_init(seq); +} +EXPORT_SYMBOL(seq_client_init); + +int client_fid_init(struct obd_device *obd, + struct obd_export *exp, enum lu_cli_type type) +{ + struct client_obd *cli = &obd->u.cli; + char *prefix; + int rc = 0; + ENTRY; + + down_write(&cli->cl_seq_rwsem); + OBD_ALLOC_PTR(cli->cl_seq); + if (!cli->cl_seq) + GOTO(out, rc = -ENOMEM); + + OBD_ALLOC(prefix, MAX_OBD_NAME + 5); + if (!prefix) + GOTO(out, rc = -ENOMEM); + + snprintf(prefix, MAX_OBD_NAME + 5, "cli-%s", obd->obd_name); + + /* Init client side sequence-manager */ + seq_client_init(cli->cl_seq, exp, type, prefix, NULL); + OBD_FREE(prefix, MAX_OBD_NAME + 5); + +out: + if (rc && cli->cl_seq) { + OBD_FREE_PTR(cli->cl_seq); + cli->cl_seq = NULL; + } + up_write(&cli->cl_seq_rwsem); + + RETURN(rc); +} +EXPORT_SYMBOL(client_fid_init); + +int client_fid_fini(struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + ENTRY; + + down_write(&cli->cl_seq_rwsem); + if (cli->cl_seq) { + seq_client_fini(cli->cl_seq); + OBD_FREE_PTR(cli->cl_seq); + cli->cl_seq = NULL; + } + up_write(&cli->cl_seq_rwsem); + + RETURN(0); +} +EXPORT_SYMBOL(client_fid_fini); + +static int __init fid_init(void) +{ + struct dentry *de; +#ifdef HAVE_SERVER_SUPPORT + int rc = fid_server_mod_init(); + + if (rc) + return rc; +#endif + de = debugfs_create_dir(LUSTRE_SEQ_NAME, + debugfs_lustre_root); + if (!IS_ERR(de)) + seq_debugfs_dir = de; + return PTR_ERR_OR_ZERO(de); +} + +static void __exit fid_exit(void) +{ +# ifdef HAVE_SERVER_SUPPORT + fid_server_mod_exit(); +# endif + debugfs_remove_recursive(seq_debugfs_dir); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre File IDentifier"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(fid_init); +module_exit(fid_exit); diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_store.c b/drivers/staging/lustrefsx/lustre/fid/fid_store.c new file mode 100644 index 0000000000000..e73e8498ece59 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fid/fid_store.c @@ -0,0 +1,249 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/fid/fid_store.c + * + * Lustre Sequence Manager + * + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FID + +#include +#include +#include +#include +#include +#include "fid_internal.h" + +static struct lu_buf *seq_store_buf(struct seq_thread_info *info) +{ + struct lu_buf *buf; + + buf = &info->sti_buf; + buf->lb_buf = &info->sti_space; + buf->lb_len = sizeof(info->sti_space); + return buf; +} + +struct seq_update_callback { + struct dt_txn_commit_cb suc_cb; + struct lu_server_seq *suc_seq; +}; + +void seq_update_cb(struct lu_env *env, struct thandle *th, + struct dt_txn_commit_cb *cb, int err) +{ + struct seq_update_callback *ccb; + + ccb = container_of(cb, struct seq_update_callback, suc_cb); + + LASSERT(ccb->suc_seq != NULL); + + ccb->suc_seq->lss_need_sync = 0; + OBD_FREE_PTR(ccb); +} + +int seq_update_cb_add(struct thandle *th, struct lu_server_seq *seq) +{ + struct seq_update_callback *ccb; + struct dt_txn_commit_cb *dcb; + int rc; + + OBD_ALLOC_PTR(ccb); + if (!ccb) + return -ENOMEM; + + ccb->suc_seq = seq; + seq->lss_need_sync = 1; + + dcb = &ccb->suc_cb; + dcb->dcb_func = seq_update_cb; + INIT_LIST_HEAD(&dcb->dcb_linkage); + strlcpy(dcb->dcb_name, "seq_update_cb", sizeof(dcb->dcb_name)); + + rc = dt_trans_cb_add(th, dcb); + if (rc) + OBD_FREE_PTR(ccb); + return rc; +} + +/* This function implies that caller takes care about locking. */ +int seq_store_update(const struct lu_env *env, struct lu_server_seq *seq, + struct lu_seq_range *out, int sync) +{ + struct dt_device *dt_dev = lu2dt_dev(seq->lss_obj->do_lu.lo_dev); + struct seq_thread_info *info; + struct thandle *th; + loff_t pos = 0; + int rc; + + if (dt_dev->dd_rdonly) + RETURN(0); + + info = lu_context_key_get(&env->le_ctx, &seq_thread_key); + LASSERT(info != NULL); + + th = dt_trans_create(env, dt_dev); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + /* Store ranges in le format. */ + range_cpu_to_le(&info->sti_space, &seq->lss_space); + + rc = dt_declare_record_write(env, seq->lss_obj, + seq_store_buf(info), 0, th); + if (rc) + GOTO(exit, rc); + + if (out) { + rc = fld_declare_server_create(env, + seq->lss_site->ss_server_fld, + out, th); + if (rc) + GOTO(exit, rc); + } + + rc = dt_trans_start_local(env, dt_dev, th); + if (rc) + GOTO(exit, rc); + + rc = dt_record_write(env, seq->lss_obj, seq_store_buf(info), &pos, th); + if (rc) { + CERROR("%s: Can't write space data, rc %d\n", + seq->lss_name, rc); + GOTO(exit, rc); + } else if (out) { + rc = fld_server_create(env, seq->lss_site->ss_server_fld, out, + th); + if (rc) { + CERROR("%s: Can't Update fld database, rc %d\n", + seq->lss_name, rc); + GOTO(exit, rc); + } + } + /* + * next sequence update will need sync until this update is committed + * in case of sync operation this is not needed obviously + */ + if (!sync) + /* if callback can't be added then sync always */ + sync = !!seq_update_cb_add(th, seq); + + th->th_sync |= sync; +exit: + dt_trans_stop(env, dt_dev, th); + return rc; +} + +/* + * This function implies that caller takes care about locking or locking is not + * needed (init time). 
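+ *
+ * Return contract, as implemented below: 0 when a complete record was
+ * read, -ENODATA when the store is empty, -EIO on a short read, and a
+ * negative errno propagated from dt_read() otherwise.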
+ */ +int seq_store_read(struct lu_server_seq *seq, + const struct lu_env *env) +{ + struct seq_thread_info *info; + loff_t pos = 0; + int rc; + ENTRY; + + info = lu_context_key_get(&env->le_ctx, &seq_thread_key); + LASSERT(info != NULL); + + rc = dt_read(env, seq->lss_obj, seq_store_buf(info), &pos); + + if (rc == sizeof(info->sti_space)) { + range_le_to_cpu(&seq->lss_space, &info->sti_space); + CDEBUG(D_INFO, "%s: Space - "DRANGE"\n", + seq->lss_name, PRANGE(&seq->lss_space)); + rc = 0; + } else if (rc == 0) { + rc = -ENODATA; + } else if (rc > 0) { + CERROR("%s: Read only %d bytes of %d\n", seq->lss_name, + rc, (int)sizeof(info->sti_space)); + rc = -EIO; + } + + RETURN(rc); +} + +int seq_store_init(struct lu_server_seq *seq, + const struct lu_env *env, + struct dt_device *dt) +{ + struct dt_object *dt_obj; + struct lu_fid fid; + struct lu_attr attr; + struct dt_object_format dof; + const char *name; + int rc; + ENTRY; + + name = seq->lss_type == LUSTRE_SEQ_SERVER ? + LUSTRE_SEQ_SRV_NAME : LUSTRE_SEQ_CTL_NAME; + + if (seq->lss_type == LUSTRE_SEQ_SERVER) + lu_local_obj_fid(&fid, FID_SEQ_SRV_OID); + else + lu_local_obj_fid(&fid, FID_SEQ_CTL_OID); + + memset(&attr, 0, sizeof(attr)); + attr.la_valid = LA_MODE; + attr.la_mode = S_IFREG | 0666; + dof.dof_type = DFT_REGULAR; + + dt_obj = dt_find_or_create(env, dt, &fid, &dof, &attr); + if (!IS_ERR(dt_obj)) { + seq->lss_obj = dt_obj; + rc = 0; + } else { + CERROR("%s: Can't find \"%s\" obj %d\n", + seq->lss_name, name, (int)PTR_ERR(dt_obj)); + rc = PTR_ERR(dt_obj); + } + + RETURN(rc); +} + +void seq_store_fini(struct lu_server_seq *seq, const struct lu_env *env) +{ + ENTRY; + + if (seq->lss_obj) { + if (!IS_ERR(seq->lss_obj)) + dt_object_put(env, seq->lss_obj); + seq->lss_obj = NULL; + } + + EXIT; +} diff --git a/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c b/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c new file mode 100644 index 0000000000000..f4d9b6a8e0861 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c @@ -0,0 +1,635 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/fid/lproc_fid.c + * + * Lustre Sequence Manager + * + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FID + +#include +#include +#include +#include +#include +#include +#include +#include "fid_internal.h" + +/* Format: [0x64BIT_INT - 0x64BIT_INT] + 32 bytes just in case */ +#define MAX_FID_RANGE_STRLEN (32 + 2 * 2 * sizeof(__u64)) +/** + * Reduce the SEQ range allocated to a node to a strict subset of the range + * currently-allocated SEQ range. If the specified range is "clear", then + * drop all allocated sequences and request a new one from the master. + * + * Note: this function should only be used for testing, it is not necessarily + * safe for production use. + */ +static int +ldebugfs_fid_write_common(const char __user *buffer, size_t count, + struct lu_seq_range *range) +{ + char kernbuf[MAX_FID_RANGE_STRLEN]; + struct lu_seq_range tmp = { + .lsr_start = 0, + }; + int rc; + + ENTRY; + LASSERT(range); + + if (count >= sizeof(kernbuf)) + RETURN(-EINVAL); + + if (copy_from_user(kernbuf, buffer, count)) + RETURN(-EFAULT); + + kernbuf[count] = 0; + + if (count == 5 && strcmp(kernbuf, "clear") == 0) { + memset(range, 0, sizeof(*range)); + RETURN(count); + } + + /* of the form "[0x0000000240000400 - 0x000000028000400]" */ + rc = sscanf(kernbuf, "[%llx - %llx]\n", + (unsigned long long *)&tmp.lsr_start, + (unsigned long long *)&tmp.lsr_end); + if (rc != 2) + RETURN(-EINVAL); + if (!lu_seq_range_is_sane(&tmp) || lu_seq_range_is_zero(&tmp) || + tmp.lsr_start < range->lsr_start || tmp.lsr_end > range->lsr_end) + RETURN(-EINVAL); + *range = tmp; + RETURN(0); +} + +#ifdef HAVE_SERVER_SUPPORT +/* + * Server side debugfs stuff. + */ +static ssize_t +ldebugfs_server_fid_space_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct lu_server_seq *seq = m->private; + int rc; + + ENTRY; + + mutex_lock(&seq->lss_mutex); + rc = ldebugfs_fid_write_common(buffer, count, &seq->lss_space); + if (rc == 0) { + CDEBUG(D_INFO, "%s: Space: " DRANGE "\n", + seq->lss_name, PRANGE(&seq->lss_space)); + } + mutex_unlock(&seq->lss_mutex); + + RETURN(count); +} + +static int +ldebugfs_server_fid_space_seq_show(struct seq_file *m, void *unused) +{ + struct lu_server_seq *seq = (struct lu_server_seq *)m->private; + ENTRY; + + mutex_lock(&seq->lss_mutex); + seq_printf(m, "[%#llx - %#llx]:%x:%s\n", PRANGE(&seq->lss_space)); + mutex_unlock(&seq->lss_mutex); + + RETURN(0); +} + +static int +ldebugfs_server_fid_server_seq_show(struct seq_file *m, void *unused) +{ + struct lu_server_seq *seq = (struct lu_server_seq *)m->private; + struct client_obd *cli; + ENTRY; + + if (seq->lss_cli) { + if (seq->lss_cli->lcs_exp != NULL) { + cli = &seq->lss_cli->lcs_exp->exp_obd->u.cli; + seq_printf(m, "%s\n", cli->cl_target_uuid.uuid); + } else { + seq_printf(m, "%s\n", seq->lss_cli->lcs_srv->lss_name); + } + } else { + seq_puts(m, "\n"); + } + + RETURN(0); +} + +static ssize_t ldebugfs_server_fid_width_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct lu_server_seq *seq = m->private; + int rc; + + ENTRY; + mutex_lock(&seq->lss_mutex); + + rc = kstrtoull_from_user(buffer, count, 0, &seq->lss_width); + if (rc) { + CERROR("%s: invalid FID sequence width: rc = %d\n", + seq->lss_name, rc); + GOTO(out_unlock, count = rc); + } + + CDEBUG(D_INFO, "%s: Width: %llu\n", + seq->lss_name, 
seq->lss_width); +out_unlock: + mutex_unlock(&seq->lss_mutex); + + RETURN(count); +} + +static int +ldebugfs_server_fid_width_seq_show(struct seq_file *m, void *unused) +{ + struct lu_server_seq *seq = (struct lu_server_seq *)m->private; + + ENTRY; + mutex_lock(&seq->lss_mutex); + seq_printf(m, "%llu\n", seq->lss_width); + mutex_unlock(&seq->lss_mutex); + + RETURN(0); +} + +LDEBUGFS_SEQ_FOPS(ldebugfs_server_fid_space); +LDEBUGFS_SEQ_FOPS(ldebugfs_server_fid_width); +LDEBUGFS_SEQ_FOPS_RO(ldebugfs_server_fid_server); + +struct ldebugfs_vars seq_server_debugfs_list[] = { + { .name = "space", + .fops = &ldebugfs_server_fid_space_fops }, + { .name = "width", + .fops = &ldebugfs_server_fid_width_fops }, + { .name = "server", + .fops = &ldebugfs_server_fid_server_fops}, + { NULL } +}; + +struct fld_seq_param { + struct lu_env fsp_env; + struct dt_it *fsp_it; + struct lu_server_fld *fsp_fld; + struct lu_server_seq *fsp_seq; + unsigned int fsp_stop:1; +}; + +/* + * XXX: below is a copy of the functions in lustre/fld/lproc_fld.c. + * we want to avoid this duplication either by exporting the + * functions or merging fid and fld into a single module. + */ +static void *fldb_seq_start(struct seq_file *p, loff_t *pos) +{ + struct fld_seq_param *param = p->private; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + struct dt_key *key; + int rc; + + if (param == NULL || param->fsp_stop) + return NULL; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + rc = iops->load(¶m->fsp_env, param->fsp_it, *pos); + if (rc <= 0) + return NULL; + + key = iops->key(¶m->fsp_env, param->fsp_it); + if (IS_ERR(key)) + return NULL; + + *pos = be64_to_cpu(*(__u64 *)key); + + return param; +} + +static void fldb_seq_stop(struct seq_file *p, void *v) +{ + struct fld_seq_param *param = p->private; + const struct dt_it_ops *iops; + struct lu_server_fld *fld; + struct dt_object *obj; + + if (param == NULL) + return; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + iops->put(¶m->fsp_env, param->fsp_it); +} + +static void *fldb_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct fld_seq_param *param = p->private; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + int rc; + + ++*pos; + if (param == NULL || param->fsp_stop) + return NULL; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + rc = iops->next(¶m->fsp_env, param->fsp_it); + if (rc > 0) { + param->fsp_stop = 1; + return NULL; + } + + *pos = be64_to_cpu(*(__u64 *)iops->key(¶m->fsp_env, param->fsp_it)); + return param; +} + +static int fldb_seq_show(struct seq_file *p, void *v) +{ + struct fld_seq_param *param = p->private; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + struct lu_seq_range fld_rec; + int rc; + + if (param == NULL || param->fsp_stop) + return 0; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + rc = iops->rec(¶m->fsp_env, param->fsp_it, + (struct dt_rec *)&fld_rec, 0); + if (rc != 0) { + CERROR("%s: read record error: rc = %d\n", + fld->lsf_name, rc); + } else if (fld_rec.lsr_start != 0) { + range_be_to_cpu(&fld_rec, &fld_rec); + seq_printf(p, DRANGE"\n", PRANGE(&fld_rec)); + } + + return rc; +} + +static const struct seq_operations fldb_sops = { + .start = fldb_seq_start, + .stop = 
fldb_seq_stop, + .next = fldb_seq_next, + .show = fldb_seq_show, +}; + +static int fldb_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + struct lu_server_seq *ss = inode->i_private; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + struct fld_seq_param *param = NULL; + int env_init = 0; + int rc; + + fld = ss->lss_site->ss_server_fld; + LASSERT(fld != NULL); + + rc = seq_open(file, &fldb_sops); + if (rc) + return rc; + + obj = fld->lsf_obj; + if (obj == NULL) { + seq = file->private_data; + seq->private = NULL; + return 0; + } + + OBD_ALLOC_PTR(param); + if (param == NULL) + GOTO(out, rc = -ENOMEM); + + rc = lu_env_init(¶m->fsp_env, LCT_MD_THREAD); + if (rc != 0) + GOTO(out, rc); + + env_init = 1; + iops = &obj->do_index_ops->dio_it; + param->fsp_it = iops->init(¶m->fsp_env, obj, 0); + if (IS_ERR(param->fsp_it)) + GOTO(out, rc = PTR_ERR(param->fsp_it)); + + param->fsp_fld = fld; + param->fsp_seq = ss; + param->fsp_stop = 0; + + seq = file->private_data; + seq->private = param; +out: + if (rc != 0) { + if (env_init == 1) + lu_env_fini(¶m->fsp_env); + if (param != NULL) + OBD_FREE_PTR(param); + } + return rc; +} + +static int fldb_seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct fld_seq_param *param; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + + param = seq->private; + if (param == NULL) { + seq_release(inode, file); + return 0; + } + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + LASSERT(iops != NULL); + LASSERT(param->fsp_it != NULL); + iops->fini(¶m->fsp_env, param->fsp_it); + lu_env_fini(¶m->fsp_env); + OBD_FREE_PTR(param); + seq_release(inode, file); + + return 0; +} + +static ssize_t fldb_seq_write(struct file *file, const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct fld_seq_param *param; + struct lu_seq_range range; + int rc = 0; + char _buffer[MAX_FID_RANGE_STRLEN]; + char *buffer = _buffer; + char *tmp; + ENTRY; + + param = seq->private; + if (param == NULL) + RETURN(-EINVAL); + + if (len >= sizeof(_buffer)) + RETURN(-EINVAL); + + if (copy_from_user(buffer, buf, len)) + GOTO(out, rc = -EFAULT); + buffer[len] = 0; + + /* + * format - [0x0000000200000007-0x0000000200000008):0:mdt + */ + if (*buffer != '[') + GOTO(out, rc = -EINVAL); + buffer++; + + tmp = strchr(buffer, '-'); + if (!tmp) + GOTO(out, rc = -EINVAL); + *tmp++ = '\0'; + rc = kstrtoull(buffer, 0, &range.lsr_start); + if (rc) + GOTO(out, rc); + buffer = tmp; + + tmp = strchr(buffer, ')'); + if (!tmp) + GOTO(out, rc = -EINVAL); + *tmp++ = '\0'; + rc = kstrtoull(buffer, 0, &range.lsr_end); + if (rc) + GOTO(out, rc); + buffer = tmp; + + if (*buffer != ':') + GOTO(out, rc = -EINVAL); + buffer++; + + tmp = strchr(buffer, ':'); + if (!tmp) + GOTO(out, rc = -EINVAL); + *tmp++ = '\0'; + rc = kstrtouint(buffer, 0, &range.lsr_index); + if (rc) + GOTO(out, rc); + buffer = tmp; + + if (strncmp(buffer, "mdt", 3) == 0) + range.lsr_flags = LU_SEQ_RANGE_MDT; + else if (strncmp(buffer, "ost", 3) == 0) + range.lsr_flags = LU_SEQ_RANGE_OST; + else + GOTO(out, rc = -EINVAL); + + rc = seq_server_alloc_spec(param->fsp_seq->lss_site->ss_control_seq, + &range, ¶m->fsp_env); + +out: + RETURN(rc < 0 ? 
rc : len); +} + +const struct file_operations seq_fld_debugfs_seq_fops = { + .owner = THIS_MODULE, + .open = fldb_seq_open, + .read = seq_read, + .write = fldb_seq_write, + .release = fldb_seq_release, +}; + +#endif /* HAVE_SERVER_SUPPORT */ + +/* Client side debugfs stuff */ +static ssize_t +ldebugfs_client_fid_space_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct lu_client_seq *seq = m->private; + int rc; + + ENTRY; + + mutex_lock(&seq->lcs_mutex); + rc = ldebugfs_fid_write_common(buffer, count, &seq->lcs_space); + if (rc == 0) { + CDEBUG(D_INFO, "%s: Space: " DRANGE "\n", + seq->lcs_name, PRANGE(&seq->lcs_space)); + } + + mutex_unlock(&seq->lcs_mutex); + + RETURN(count); +} + +static int ldebugfs_client_fid_space_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_seq *seq = (struct lu_client_seq *)m->private; + + ENTRY; + mutex_lock(&seq->lcs_mutex); + seq_printf(m, "[%#llx - %#llx]:%x:%s\n", PRANGE(&seq->lcs_space)); + mutex_unlock(&seq->lcs_mutex); + + RETURN(0); +} + +static ssize_t ldebugfs_client_fid_width_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct lu_client_seq *seq = m->private; + u64 val; + u64 max; + int rc; + + ENTRY; + rc = kstrtoull_from_user(buffer, count, 0, &val); + if (rc) + return rc; + + mutex_lock(&seq->lcs_mutex); + if (seq->lcs_type == LUSTRE_SEQ_DATA) + max = LUSTRE_DATA_SEQ_MAX_WIDTH; + else + max = LUSTRE_METADATA_SEQ_MAX_WIDTH; + + if (val <= max) { + seq->lcs_width = val; + + CDEBUG(D_INFO, "%s: Sequence size: %llu\n", seq->lcs_name, + seq->lcs_width); + } else { + count = -ERANGE; + } + + mutex_unlock(&seq->lcs_mutex); + RETURN(count); +} + +static int +ldebugfs_client_fid_width_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_seq *seq = (struct lu_client_seq *)m->private; + + ENTRY; + mutex_lock(&seq->lcs_mutex); + seq_printf(m, "%llu\n", seq->lcs_width); + mutex_unlock(&seq->lcs_mutex); + + RETURN(0); +} + +static int +ldebugfs_client_fid_fid_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_seq *seq = (struct lu_client_seq *)m->private; + + ENTRY; + mutex_lock(&seq->lcs_mutex); + seq_printf(m, DFID"\n", PFID(&seq->lcs_fid)); + mutex_unlock(&seq->lcs_mutex); + + RETURN(0); +} + +static int +ldebugfs_client_fid_server_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_seq *seq = (struct lu_client_seq *)m->private; + struct client_obd *cli; + ENTRY; + + if (seq->lcs_exp) { + cli = &seq->lcs_exp->exp_obd->u.cli; + seq_printf(m, "%s\n", cli->cl_target_uuid.uuid); +#ifdef HAVE_SERVER_SUPPORT + } else { + seq_printf(m, "%s\n", seq->lcs_srv->lss_name); +#endif /* HAVE_SERVER_SUPPORT */ + } + + RETURN(0); +} + +LDEBUGFS_SEQ_FOPS(ldebugfs_client_fid_space); +LDEBUGFS_SEQ_FOPS(ldebugfs_client_fid_width); +LDEBUGFS_SEQ_FOPS_RO(ldebugfs_client_fid_server); +LDEBUGFS_SEQ_FOPS_RO(ldebugfs_client_fid_fid); + +struct ldebugfs_vars seq_client_debugfs_list[] = { + { .name = "space", + .fops = &ldebugfs_client_fid_space_fops }, + { .name = "width", + .fops = &ldebugfs_client_fid_width_fops }, + { .name = "server", + .fops = &ldebugfs_client_fid_server_fops}, + { .name = "fid", + .fops = &ldebugfs_client_fid_fid_fops }, + { NULL } +}; diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_cache.c b/drivers/staging/lustrefsx/lustre/fld/fld_cache.c new file mode 100644 index 0000000000000..e77df9652141e --- /dev/null +++ 
b/drivers/staging/lustrefsx/lustre/fld/fld_cache.c @@ -0,0 +1,492 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/fld/fld_cache.c + * + * FLD (Fids Location Database) + * + * Author: Pravin Shelar + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FLD + +#include +#include +#include +#include +#include +#include "fld_internal.h" + +/** + * create fld cache. + */ +struct fld_cache *fld_cache_init(const char *name, int cache_size, + int cache_threshold) +{ + struct fld_cache *cache; + + ENTRY; + + LASSERT(name != NULL); + LASSERT(cache_threshold < cache_size); + + OBD_ALLOC_PTR(cache); + if (cache == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + INIT_LIST_HEAD(&cache->fci_entries_head); + INIT_LIST_HEAD(&cache->fci_lru); + + cache->fci_cache_count = 0; + rwlock_init(&cache->fci_lock); + + strlcpy(cache->fci_name, name, sizeof(cache->fci_name)); + + cache->fci_cache_size = cache_size; + cache->fci_threshold = cache_threshold; + + /* Init fld cache info. */ + memset(&cache->fci_stat, 0, sizeof(cache->fci_stat)); + + CDEBUG(D_INFO, "%s: FLD cache - Size: %d, Threshold: %d\n", + cache->fci_name, cache_size, cache_threshold); + + RETURN(cache); +} + +/** + * destroy fld cache. + */ +void fld_cache_fini(struct fld_cache *cache) +{ + LASSERT(cache != NULL); + fld_cache_flush(cache); + + CDEBUG(D_INFO, "FLD cache statistics (%s):\n", cache->fci_name); + CDEBUG(D_INFO, " Cache reqs: %llu\n", cache->fci_stat.fst_cache); + CDEBUG(D_INFO, " Total reqs: %llu\n", cache->fci_stat.fst_count); + + OBD_FREE_PTR(cache); +} + +/** + * delete given node from list. + */ +static void fld_cache_entry_delete(struct fld_cache *cache, + struct fld_cache_entry *node) +{ + list_del(&node->fce_list); + list_del(&node->fce_lru); + cache->fci_cache_count--; + OBD_FREE_PTR(node); +} + +/** + * fix list by checking new entry with NEXT entry in order. 
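+ *
+ * For example (made-up ranges on the same index with the same flags):
+ * adjacent entries [0x200-0x300) and [0x300-0x400) are merged into a
+ * single [0x200-0x400) entry, overlaps are trimmed or collapsed, and
+ * exact duplicates are deleted.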
+ */ +static void fld_fix_new_list(struct fld_cache *cache) +{ + struct fld_cache_entry *f_curr; + struct fld_cache_entry *f_next; + struct lu_seq_range *c_range; + struct lu_seq_range *n_range; + struct list_head *head = &cache->fci_entries_head; + + ENTRY; + +restart_fixup: + + list_for_each_entry_safe(f_curr, f_next, head, fce_list) { + c_range = &f_curr->fce_range; + n_range = &f_next->fce_range; + + LASSERT(lu_seq_range_is_sane(c_range)); + if (&f_next->fce_list == head) + break; + + if (c_range->lsr_flags != n_range->lsr_flags) + continue; + + LASSERTF(c_range->lsr_start <= n_range->lsr_start, + "cur lsr_start "DRANGE" next lsr_start "DRANGE"\n", + PRANGE(c_range), PRANGE(n_range)); + + /* check merge possibility with next range */ + if (c_range->lsr_end == n_range->lsr_start) { + if (c_range->lsr_index != n_range->lsr_index) + continue; + n_range->lsr_start = c_range->lsr_start; + fld_cache_entry_delete(cache, f_curr); + continue; + } + + /* check if current range overlaps with next range. */ + if (n_range->lsr_start < c_range->lsr_end) { + if (c_range->lsr_index == n_range->lsr_index) { + n_range->lsr_start = c_range->lsr_start; + n_range->lsr_end = max(c_range->lsr_end, + n_range->lsr_end); + fld_cache_entry_delete(cache, f_curr); + } else { + if (n_range->lsr_end <= c_range->lsr_end) { + *n_range = *c_range; + fld_cache_entry_delete(cache, f_curr); + } else + n_range->lsr_start = c_range->lsr_end; + } + + /* we could have overlap over next + * range too. better restart. + */ + goto restart_fixup; + } + + /* kill duplicates */ + if (c_range->lsr_start == n_range->lsr_start && + c_range->lsr_end == n_range->lsr_end) + fld_cache_entry_delete(cache, f_curr); + } + + EXIT; +} + +/** + * add node to fld cache + */ +static inline void fld_cache_entry_add(struct fld_cache *cache, + struct fld_cache_entry *f_new, + struct list_head *pos) +{ + list_add(&f_new->fce_list, pos); + list_add(&f_new->fce_lru, &cache->fci_lru); + + cache->fci_cache_count++; + fld_fix_new_list(cache); +} + +/** + * Check if cache needs to be shrunk. If so - do it. + * Remove one entry in list and so on until cache is shrunk enough. + */ +static int fld_cache_shrink(struct fld_cache *cache) +{ + int num = 0; + + ENTRY; + + LASSERT(cache != NULL); + + if (cache->fci_cache_count < cache->fci_cache_size) + RETURN(0); + + while (cache->fci_cache_count + cache->fci_threshold > + cache->fci_cache_size && + !list_empty(&cache->fci_lru)) { + struct fld_cache_entry *flde = + list_last_entry(&cache->fci_lru, struct fld_cache_entry, + fce_lru); + + fld_cache_entry_delete(cache, flde); + num++; + } + + CDEBUG(D_INFO, "%s: FLD cache - Shrunk by %d entries\n", + cache->fci_name, num); + + RETURN(0); +} + +/** + * kill all fld cache entries. + */ +void fld_cache_flush(struct fld_cache *cache) +{ + ENTRY; + + write_lock(&cache->fci_lock); + cache->fci_cache_size = 0; + fld_cache_shrink(cache); + write_unlock(&cache->fci_lock); + + EXIT; +} + +/** + * punch hole in existing range. divide this range and add new + * entry accordingly. + */ + +static void fld_cache_punch_hole(struct fld_cache *cache, + struct fld_cache_entry *f_curr, + struct fld_cache_entry *f_new) +{ + const struct lu_seq_range *range = &f_new->fce_range; + const u64 new_start = range->lsr_start; + const u64 new_end = range->lsr_end; + struct fld_cache_entry *fldt; + + ENTRY; + OBD_ALLOC_GFP(fldt, sizeof(*fldt), GFP_ATOMIC); + if (!fldt) { + OBD_FREE_PTR(f_new); + EXIT; + /* overlap is not allowed, so dont mess up list. 
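+		 * On allocation failure f_new was already freed above,
+		 * so the existing cache entry is left unmodified.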
*/ + return; + } + /* break f_curr RANGE into three RANGES: + * f_curr, f_new , fldt + */ + + /* fldt */ + fldt->fce_range.lsr_start = new_end; + fldt->fce_range.lsr_end = f_curr->fce_range.lsr_end; + fldt->fce_range.lsr_index = f_curr->fce_range.lsr_index; + + /* f_curr */ + f_curr->fce_range.lsr_end = new_start; + + /* add these two entries to list */ + fld_cache_entry_add(cache, f_new, &f_curr->fce_list); + fld_cache_entry_add(cache, fldt, &f_new->fce_list); + + /* no need to fixup */ + EXIT; +} + +/** + * handle range overlap in fld cache. + */ +static void fld_cache_overlap_handle(struct fld_cache *cache, + struct fld_cache_entry *f_curr, + struct fld_cache_entry *f_new) +{ + const struct lu_seq_range *range = &f_new->fce_range; + const u64 new_start = range->lsr_start; + const u64 new_end = range->lsr_end; + const u32 mdt = range->lsr_index; + + /* this is overlap case, these case are checking overlapping with + * prev range only. fixup will handle overlaping with next range. + */ + + if (f_curr->fce_range.lsr_index == mdt) { + f_curr->fce_range.lsr_start = min(f_curr->fce_range.lsr_start, + new_start); + + f_curr->fce_range.lsr_end = max(f_curr->fce_range.lsr_end, + new_end); + + OBD_FREE_PTR(f_new); + fld_fix_new_list(cache); + + } else if (new_start <= f_curr->fce_range.lsr_start && + f_curr->fce_range.lsr_end <= new_end) { + /* case 1: new range completely overshadowed existing range. + * e.g. whole range migrated. update fld cache entry + */ + + f_curr->fce_range = *range; + OBD_FREE_PTR(f_new); + fld_fix_new_list(cache); + + } else if (f_curr->fce_range.lsr_start < new_start && + new_end < f_curr->fce_range.lsr_end) { + /* case 2: new range fit within existing range. */ + + fld_cache_punch_hole(cache, f_curr, f_new); + + } else if (new_end <= f_curr->fce_range.lsr_end) { + /* case 3: overlap: + * [new_start [c_start new_end) c_end) + */ + + LASSERT(new_start <= f_curr->fce_range.lsr_start); + + f_curr->fce_range.lsr_start = new_end; + fld_cache_entry_add(cache, f_new, f_curr->fce_list.prev); + + } else if (f_curr->fce_range.lsr_start <= new_start) { + /* case 4: overlap: + * [c_start [new_start c_end) new_end) + */ + + LASSERT(f_curr->fce_range.lsr_end <= new_end); + + f_curr->fce_range.lsr_end = new_start; + fld_cache_entry_add(cache, f_new, &f_curr->fce_list); + } else + CERROR("NEW range ="DRANGE" curr = "DRANGE"\n", + PRANGE(range), PRANGE(&f_curr->fce_range)); +} + +struct fld_cache_entry +*fld_cache_entry_create(const struct lu_seq_range *range) +{ + struct fld_cache_entry *f_new; + + LASSERT(lu_seq_range_is_sane(range)); + + OBD_ALLOC_PTR(f_new); + if (!f_new) + RETURN(ERR_PTR(-ENOMEM)); + + f_new->fce_range = *range; + RETURN(f_new); +} + +/** + * Insert FLD entry in FLD cache. + * + * This function handles all cases of merging and breaking up of + * ranges. + */ +int fld_cache_insert_nolock(struct fld_cache *cache, + struct fld_cache_entry *f_new) +{ + struct fld_cache_entry *f_curr; + struct fld_cache_entry *n; + struct list_head *head; + struct list_head *prev = NULL; + const u64 new_start = f_new->fce_range.lsr_start; + const u64 new_end = f_new->fce_range.lsr_end; + __u32 new_flags = f_new->fce_range.lsr_flags; + + ENTRY; + + /* + * Duplicate entries are eliminated in insert op. + * So we don't need to search new entry before starting + * insertion loop. 
+ */ + + fld_cache_shrink(cache); + + head = &cache->fci_entries_head; + + list_for_each_entry_safe(f_curr, n, head, fce_list) { + /* add list if next is end of list */ + if (new_end < f_curr->fce_range.lsr_start || + (new_end == f_curr->fce_range.lsr_start && + new_flags != f_curr->fce_range.lsr_flags)) + break; + + prev = &f_curr->fce_list; + /* check if this range is to left of new range. */ + if (new_start < f_curr->fce_range.lsr_end && + new_flags == f_curr->fce_range.lsr_flags) { + fld_cache_overlap_handle(cache, f_curr, f_new); + goto out; + } + } + + if (prev == NULL) + prev = head; + + CDEBUG(D_INFO, "insert range "DRANGE"\n", PRANGE(&f_new->fce_range)); + /* Add new entry to cache and lru list. */ + fld_cache_entry_add(cache, f_new, prev); +out: + RETURN(0); +} + +int fld_cache_insert(struct fld_cache *cache, + const struct lu_seq_range *range) +{ + struct fld_cache_entry *flde; + int rc; + + flde = fld_cache_entry_create(range); + if (IS_ERR(flde)) + RETURN(PTR_ERR(flde)); + + write_lock(&cache->fci_lock); + rc = fld_cache_insert_nolock(cache, flde); + write_unlock(&cache->fci_lock); + if (rc) + OBD_FREE_PTR(flde); + + RETURN(rc); +} + +void fld_cache_delete_nolock(struct fld_cache *cache, + const struct lu_seq_range *range) +{ + struct fld_cache_entry *flde; + struct fld_cache_entry *tmp; + struct list_head *head; + + head = &cache->fci_entries_head; + list_for_each_entry_safe(flde, tmp, head, fce_list) { + /* add list if next is end of list */ + if (range->lsr_start == flde->fce_range.lsr_start || + (range->lsr_end == flde->fce_range.lsr_end && + range->lsr_flags == flde->fce_range.lsr_flags)) { + fld_cache_entry_delete(cache, flde); + break; + } + } +} + +/** + * lookup \a seq sequence for range in fld cache. + */ +int fld_cache_lookup(struct fld_cache *cache, + const u64 seq, struct lu_seq_range *range) +{ + struct fld_cache_entry *flde; + struct fld_cache_entry *prev = NULL; + struct list_head *head; + + ENTRY; + + read_lock(&cache->fci_lock); + head = &cache->fci_entries_head; + + cache->fci_stat.fst_count++; + list_for_each_entry(flde, head, fce_list) { + if (flde->fce_range.lsr_start > seq) { + if (prev != NULL) + *range = prev->fce_range; + break; + } + + prev = flde; + if (lu_seq_range_within(&flde->fce_range, seq)) { + *range = flde->fce_range; + + cache->fci_stat.fst_cache++; + read_unlock(&cache->fci_lock); + RETURN(0); + } + } + read_unlock(&cache->fci_lock); + RETURN(-ENOENT); +} diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_handler.c b/drivers/staging/lustrefsx/lustre/fld/fld_handler.c new file mode 100644 index 0000000000000..6f01007c59e8c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fld/fld_handler.c @@ -0,0 +1,485 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/fld/fld_handler.c + * + * FLD (Fids Location Database) + * + * Author: Yury Umanets + * Author: WangDi + * Author: Pravin Shelar + */ + +#define DEBUG_SUBSYSTEM S_FLD + +#include +#include + +#include +#include +#include +#include +#include +#include +#include "fld_internal.h" + +/* context key constructor/destructor: fld_key_init, fld_key_fini */ +LU_KEY_INIT_FINI(fld, struct fld_thread_info); + +/* context key: fld_thread_key */ +/* MGS thread may create llog file causing FLD lookup */ +LU_CONTEXT_KEY_DEFINE(fld, LCT_MD_THREAD | LCT_DT_THREAD | LCT_MG_THREAD); + +int fld_server_mod_init(void) +{ + LU_CONTEXT_KEY_INIT(&fld_thread_key); + return lu_context_key_register(&fld_thread_key); +} + +void fld_server_mod_exit(void) +{ + lu_context_key_degister(&fld_thread_key); +} + +int fld_declare_server_create(const struct lu_env *env, + struct lu_server_fld *fld, + const struct lu_seq_range *range, + struct thandle *th) +{ + int rc; + + rc = fld_declare_index_create(env, fld, range, th); + RETURN(rc); +} +EXPORT_SYMBOL(fld_declare_server_create); + +/** + * Insert FLD index entry and update FLD cache. + * + * This function is called from the sequence allocator when a super-sequence + * is granted to a server. + */ +int fld_server_create(const struct lu_env *env, struct lu_server_fld *fld, + const struct lu_seq_range *range, struct thandle *th) +{ + int rc; + + mutex_lock(&fld->lsf_lock); + rc = fld_index_create(env, fld, range, th); + mutex_unlock(&fld->lsf_lock); + + RETURN(rc); +} +EXPORT_SYMBOL(fld_server_create); + +/** + * Extract index information from fld name like srv-fsname-MDT0000 + **/ +int fld_name_to_index(const char *name, u32 *index) +{ + char *dash; + int rc; + + ENTRY; + + CDEBUG(D_INFO, "get index from %s\n", name); + dash = strrchr(name, '-'); + if (!dash) + RETURN(-EINVAL); + dash++; + rc = target_name2index(dash, index, NULL); + RETURN(rc); +} + +/** + * Retrieve fldb entry from MDT0 and add to local FLDB and cache. + **/ +int fld_update_from_controller(const struct lu_env *env, + struct lu_server_fld *fld) +{ + struct fld_thread_info *info; + struct lu_seq_range *range; + struct lu_seq_range_array *lsra; + u32 index; + struct ptlrpc_request *req; + int rc; + int i; + + ENTRY; + + /* + * Update only happens during initalization, i.e. 
local FLDB + * does not exist yet + */ + if (!fld->lsf_new) + RETURN(0); + + rc = fld_name_to_index(fld->lsf_name, &index); + if (rc < 0) + RETURN(rc); + + /* No need update fldb for MDT0 */ + if (index == 0) + RETURN(0); + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + LASSERT(info != NULL); + range = &info->fti_lrange; + memset(range, 0, sizeof(*range)); + range->lsr_index = index; + fld_range_set_mdt(range); + + do { + rc = fld_client_rpc(fld->lsf_control_exp, range, FLD_READ, + &req); + if (rc != 0) + GOTO(out, rc); + + LASSERT(req != NULL); + lsra = (struct lu_seq_range_array *)req_capsule_server_get( + &req->rq_pill, &RMF_GENERIC_DATA); + if (!lsra) + GOTO(out, rc = -EPROTO); + + range_array_le_to_cpu(lsra, lsra); + for (i = 0; i < lsra->lsra_count; i++) { + int rc1; + + if (lsra->lsra_lsr[i].lsr_flags != LU_SEQ_RANGE_MDT) + GOTO(out, rc = -EINVAL); + + if (lsra->lsra_lsr[i].lsr_index != index) + GOTO(out, rc = -EINVAL); + + mutex_lock(&fld->lsf_lock); + rc1 = fld_insert_entry(env, fld, &lsra->lsra_lsr[i]); + mutex_unlock(&fld->lsf_lock); + + if (rc1 != 0) + GOTO(out, rc = rc1); + } + if (rc == -EAGAIN) + *range = lsra->lsra_lsr[lsra->lsra_count - 1]; + } while (rc == -EAGAIN); + + fld->lsf_new = 1; +out: + if (req) + ptlrpc_req_finished(req); + + RETURN(rc); +} +EXPORT_SYMBOL(fld_update_from_controller); + +/** + * Lookup sequece in local cache/fldb. + **/ +int fld_local_lookup(const struct lu_env *env, struct lu_server_fld *fld, + u64 seq, struct lu_seq_range *range) +{ + struct lu_seq_range *erange; + struct fld_thread_info *info; + int rc; + + ENTRY; + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + LASSERT(info != NULL); + erange = &info->fti_lrange; + + /* Lookup it in the cache. */ + rc = fld_cache_lookup(fld->lsf_cache, seq, erange); + if (rc == 0) { + if (unlikely(fld_range_type(erange) != fld_range_type(range) && + !fld_range_is_any(range))) { + CERROR("%s: FLD cache range "DRANGE" does not match requested flag %x: rc = %d\n", + fld->lsf_name, PRANGE(erange), range->lsr_flags, + -EIO); + RETURN(-EIO); + } + *range = *erange; + RETURN(0); + } + RETURN(rc); +} +EXPORT_SYMBOL(fld_local_lookup); + +/** + * Lookup MDT/OST by seq, returns a range for given seq. + * + * If that entry is not cached in fld cache, request is sent to super + * sequence controller node (MDT0). All other MDT[1...N] and client + * cache fld entries, but this cache is not persistent. + */ +int fld_server_lookup(const struct lu_env *env, struct lu_server_fld *fld, + u64 seq, struct lu_seq_range *range) +{ + u32 index; + int rc; + + ENTRY; + + rc = fld_local_lookup(env, fld, seq, range); + if (likely(rc == 0)) + RETURN(rc); + + rc = fld_name_to_index(fld->lsf_name, &index); + if (rc < 0) + RETURN(rc); + + if (index == 0 && rc == LDD_F_SV_TYPE_MDT) { + /* + * On server side, all entries should be in cache. + * If we can not find it in cache, just return error + */ + CERROR("%s: Cannot find sequence %#llx: rc = %d\n", + fld->lsf_name, seq, -ENOENT); + RETURN(-ENOENT); + } else { + int i; + + if (!fld->lsf_control_exp) { + CERROR("%s: lookup %#llx, but not connects to MDT0 yet: rc = %d.\n", + fld->lsf_name, seq, -EIO); + RETURN(-EIO); + } + /* + * send request to mdt0 i.e. super seq. controller. + * This is temporary solution, long term solution is fld + * replication on all mdt servers. 
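+		 *
+		 * The loop below retries the FLD_QUERY RPC up to five
+		 * times, sleeping one second between -EAGAIN replies,
+		 * and caches a successful answer in lsf_cache.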
+ */ + range->lsr_start = seq; + for (i = 0; i < 5; i++) { + rc = fld_client_rpc(fld->lsf_control_exp, + range, FLD_QUERY, NULL); + if (rc != -EAGAIN) + break; + schedule_timeout_interruptible(cfs_time_seconds(1)); + } + if (rc == 0) + fld_cache_insert(fld->lsf_cache, range); + } + RETURN(rc); +} +EXPORT_SYMBOL(fld_server_lookup); + +/** + * All MDT server handle fld lookup operation. But only MDT0 has fld index. + * if entry is not found in cache we need to forward lookup request to MDT0 + */ +static int fld_handle_lookup(struct tgt_session_info *tsi) +{ + struct obd_export *exp = tsi->tsi_exp; + struct lu_site *site = exp->exp_obd->obd_lu_dev->ld_site; + struct lu_server_fld *fld; + struct lu_seq_range *in; + struct lu_seq_range *out; + int rc; + + ENTRY; + + in = req_capsule_client_get(tsi->tsi_pill, &RMF_FLD_MDFLD); + if (!in) + RETURN(err_serious(-EPROTO)); + + rc = req_capsule_server_pack(tsi->tsi_pill); + if (unlikely(rc != 0)) + RETURN(err_serious(rc)); + + out = req_capsule_server_get(tsi->tsi_pill, &RMF_FLD_MDFLD); + if (!out) + RETURN(err_serious(-EPROTO)); + *out = *in; + + fld = lu_site2seq(site)->ss_server_fld; + + rc = fld_server_lookup(tsi->tsi_env, fld, in->lsr_start, out); + + CDEBUG(D_INFO, "%s: FLD req handle: error %d (range: "DRANGE")\n", + fld->lsf_name, rc, PRANGE(out)); + + RETURN(rc); +} + +static int fld_handle_read(struct tgt_session_info *tsi) +{ + struct obd_export *exp = tsi->tsi_exp; + struct lu_site *site = exp->exp_obd->obd_lu_dev->ld_site; + struct lu_seq_range *in; + void *data; + int rc; + + ENTRY; + + req_capsule_set(tsi->tsi_pill, &RQF_FLD_READ); + + in = req_capsule_client_get(tsi->tsi_pill, &RMF_FLD_MDFLD); + if (!in) + RETURN(err_serious(-EPROTO)); + + req_capsule_set_size(tsi->tsi_pill, &RMF_GENERIC_DATA, RCL_SERVER, + PAGE_SIZE); + + rc = req_capsule_server_pack(tsi->tsi_pill); + if (unlikely(rc != 0)) + RETURN(err_serious(rc)); + + data = req_capsule_server_get(tsi->tsi_pill, &RMF_GENERIC_DATA); + + rc = fld_server_read(tsi->tsi_env, lu_site2seq(site)->ss_server_fld, + in, data, PAGE_SIZE); + RETURN(rc); +} + +static int fld_handle_query(struct tgt_session_info *tsi) +{ + int rc; + + ENTRY; + + req_capsule_set(tsi->tsi_pill, &RQF_FLD_QUERY); + + rc = fld_handle_lookup(tsi); + + RETURN(rc); +} + +/* + * Returns true, if fid is local to this server node. + * + * WARNING: this function is *not* guaranteed to return false if fid is + * remote: it makes an educated conservative guess only. + * + * fid_is_local() is supposed to be used in assertion checks only. 
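+ *
+ * A sketch of the intended (assertion-only) use:
+ *
+ *	LASSERT(fid_is_local(env, site, fid));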
+ */
+int fid_is_local(const struct lu_env *env,
+ struct lu_site *site, const struct lu_fid *fid)
+{
+ int result;
+ struct seq_server_site *ss_site;
+ struct lu_seq_range *range;
+ struct fld_thread_info *info;
+
+ ENTRY;
+
+ info = lu_context_key_get(&env->le_ctx, &fld_thread_key);
+ range = &info->fti_lrange;
+
+ result = 1; /* conservatively assume fid is local */
+ ss_site = lu_site2seq(site);
+ if (ss_site->ss_client_fld) {
+ int rc;
+
+ rc = fld_cache_lookup(ss_site->ss_client_fld->lcf_cache,
+ fid_seq(fid), range);
+ if (rc == 0)
+ result = (range->lsr_index == ss_site->ss_node_id);
+ }
+ return result;
+}
+EXPORT_SYMBOL(fid_is_local);
+
+static void fld_server_debugfs_fini(struct lu_server_fld *fld)
+{
+ debugfs_remove_recursive(fld->lsf_debugfs_entry);
+}
+
+static void fld_server_debugfs_init(struct lu_server_fld *fld)
+{
+ ENTRY;
+ fld->lsf_debugfs_entry = debugfs_create_dir(fld->lsf_name,
+ fld_debugfs_dir);
+
+ debugfs_create_file("fldb", 0444, fld->lsf_debugfs_entry, fld,
+ &fld_debugfs_seq_fops);
+}
+
+int fld_server_init(const struct lu_env *env, struct lu_server_fld *fld,
+ struct dt_device *dt, const char *prefix, int type)
+{
+ int cache_size, cache_threshold;
+ int rc;
+
+ ENTRY;
+
+ snprintf(fld->lsf_name, sizeof(fld->lsf_name), "srv-%s", prefix);
+
+ cache_size = FLD_SERVER_CACHE_SIZE / sizeof(struct fld_cache_entry);
+
+ cache_threshold = cache_size * FLD_SERVER_CACHE_THRESHOLD / 100;
+
+ mutex_init(&fld->lsf_lock);
+ fld->lsf_cache = fld_cache_init(fld->lsf_name, cache_size,
+ cache_threshold);
+ if (IS_ERR(fld->lsf_cache)) {
+ rc = PTR_ERR(fld->lsf_cache);
+ fld->lsf_cache = NULL;
+ RETURN(rc);
+ }
+
+ rc = fld_index_init(env, fld, dt, type);
+ if (rc)
+ GOTO(out_cache, rc);
+
+ fld_server_debugfs_init(fld);
+
+ fld->lsf_control_exp = NULL;
+ fld->lsf_seq_lookup = fld_server_lookup;
+
+ RETURN(0);
+out_cache:
+ fld_cache_fini(fld->lsf_cache);
+ return rc;
+}
+EXPORT_SYMBOL(fld_server_init);
+
+void fld_server_fini(const struct lu_env *env, struct lu_server_fld *fld)
+{
+ ENTRY;
+
+ fld_server_debugfs_fini(fld);
+ fld_index_fini(env, fld);
+
+ if (fld->lsf_cache) {
+ if (!IS_ERR(fld->lsf_cache))
+ fld_cache_fini(fld->lsf_cache);
+ fld->lsf_cache = NULL;
+ }
+
+ EXIT;
+}
+EXPORT_SYMBOL(fld_server_fini);
+
+struct tgt_handler fld_handlers[] = {
+TGT_FLD_HDL_VAR(0, FLD_QUERY, fld_handle_query),
+TGT_FLD_HDL_VAR(0, FLD_READ, fld_handle_read),
+};
+EXPORT_SYMBOL(fld_handlers);
diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_index.c b/drivers/staging/lustrefsx/lustre/fld/fld_index.c
new file mode 100644
index 0000000000000..7188f45b95869
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/fld/fld_index.c
@@ -0,0 +1,531 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lustre/fld/fld_index.c
+ *
+ * Author: WangDi
+ * Author: Yury Umanets
+ */
+
+#define DEBUG_SUBSYSTEM S_FLD
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "fld_internal.h"
+
+static const char fld_index_name[] = "fld";
+
+static const struct lu_seq_range IGIF_FLD_RANGE = {
+ .lsr_start = FID_SEQ_IGIF,
+ .lsr_end = FID_SEQ_IGIF_MAX + 1,
+ .lsr_index = 0,
+ .lsr_flags = LU_SEQ_RANGE_MDT
+};
+
+static const struct lu_seq_range DOT_LUSTRE_FLD_RANGE = {
+ .lsr_start = FID_SEQ_DOT_LUSTRE,
+ .lsr_end = FID_SEQ_DOT_LUSTRE + 1,
+ .lsr_index = 0,
+ .lsr_flags = LU_SEQ_RANGE_MDT
+};
+
+static const struct lu_seq_range ROOT_FLD_RANGE = {
+ .lsr_start = FID_SEQ_ROOT,
+ .lsr_end = FID_SEQ_ROOT + 1,
+ .lsr_index = 0,
+ .lsr_flags = LU_SEQ_RANGE_MDT
+};
+
+static const struct dt_index_features fld_index_features = {
+ .dif_flags = DT_IND_UPDATE,
+ .dif_keysize_min = sizeof(u64),
+ .dif_keysize_max = sizeof(u64),
+ .dif_recsize_min = sizeof(struct lu_seq_range),
+ .dif_recsize_max = sizeof(struct lu_seq_range),
+ .dif_ptrsize = 4
+};
+
+int fld_declare_index_create(const struct lu_env *env,
+ struct lu_server_fld *fld,
+ const struct lu_seq_range *new_range,
+ struct thandle *th)
+{
+ struct lu_seq_range *tmp;
+ struct lu_seq_range *range;
+ struct fld_thread_info *info;
+ int rc = 0;
+
+ ENTRY;
+
+ info = lu_context_key_get(&env->le_ctx, &fld_thread_key);
+ range = &info->fti_lrange;
+ tmp = &info->fti_irange;
+ memset(range, 0, sizeof(*range));
+
+ rc = fld_index_lookup(env, fld, new_range->lsr_start, range);
+ if (rc == 0) {
+ /* In case of duplicate entry, the location must be same */
+ LASSERT((lu_seq_range_compare_loc(new_range, range) == 0));
+ GOTO(out, rc = -EEXIST);
+ }
+
+ if (rc != -ENOENT) {
+ CERROR("%s: lookup range "DRANGE" error: rc = %d\n",
+ fld->lsf_name, PRANGE(range), rc);
+ GOTO(out, rc);
+ }
+
+ /*
+ * Check for the merge case: since fld entries can only grow
+ * incrementally, we only check whether the new range can be
+ * merged from the left.
+ */
+ if (new_range->lsr_start == range->lsr_end && range->lsr_end != 0 &&
+ lu_seq_range_compare_loc(new_range, range) == 0) {
+ range_cpu_to_be(tmp, range);
+ rc = dt_declare_delete(env, fld->lsf_obj,
+ (struct dt_key *)&tmp->lsr_start, th);
+ if (rc) {
+ CERROR("%s: declare record "DRANGE" failed: rc = %d\n",
+ fld->lsf_name, PRANGE(range), rc);
+ GOTO(out, rc);
+ }
+ *tmp = *new_range;
+ tmp->lsr_start = range->lsr_start;
+ } else {
+ *tmp = *new_range;
+ }
+
+ range_cpu_to_be(tmp, tmp);
+ rc = dt_declare_insert(env, fld->lsf_obj, (struct dt_rec *)tmp,
+ (struct dt_key *)&tmp->lsr_start, th);
+out:
+ RETURN(rc);
+}
+
+/**
+ * Insert range in fld store.
+ *
+ * \param range range to be inserted
+ * \param th transaction for this operation, as it could be part of a
+ * compound transaction.
+ *
+ * \retval 0 success
+ * \retval -ve error
+ *
+ * The whole fld index insertion is protected by seq->lss_mutex (see
+ * seq_server_alloc_super), i.e. only one thread will access fldb each
+ * time, so we do not need to worry about the fld file and cache being
+ * changed between declare and create.
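+ *
+ * fld_declare_index_create() above only declares the needed
+ * dt_declare_delete()/dt_declare_insert() operations against the
+ * transaction handle; the matching dt_delete()/dt_insert() calls are
+ * issued here, within what was declared.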
+ * Because fld entries can only grow incrementally, we only check
+ * whether the new range can be merged from the left.
+ *
+ * Caller must hold fld->lsf_lock
+ **/
+int fld_index_create(const struct lu_env *env, struct lu_server_fld *fld,
+ const struct lu_seq_range *new_range, struct thandle *th)
+{
+ struct lu_seq_range *range;
+ struct lu_seq_range *tmp;
+ struct fld_thread_info *info;
+ int rc = 0;
+ int deleted = 0;
+ struct fld_cache_entry *flde;
+
+ ENTRY;
+
+ info = lu_context_key_get(&env->le_ctx, &fld_thread_key);
+
+ LASSERT(mutex_is_locked(&fld->lsf_lock));
+
+ range = &info->fti_lrange;
+ memset(range, 0, sizeof(*range));
+ tmp = &info->fti_irange;
+ rc = fld_index_lookup(env, fld, new_range->lsr_start, range);
+ if (rc != -ENOENT) {
+ rc = rc == 0 ? -EEXIST : rc;
+ GOTO(out, rc);
+ }
+
+ if (new_range->lsr_start == range->lsr_end && range->lsr_end != 0 &&
+ lu_seq_range_compare_loc(new_range, range) == 0) {
+ range_cpu_to_be(tmp, range);
+ rc = dt_delete(env, fld->lsf_obj,
+ (struct dt_key *)&tmp->lsr_start, th);
+ if (rc != 0)
+ GOTO(out, rc);
+ *tmp = *new_range;
+ tmp->lsr_start = range->lsr_start;
+ deleted = 1;
+ } else {
+ *tmp = *new_range;
+ }
+
+ range_cpu_to_be(tmp, tmp);
+ rc = dt_insert(env, fld->lsf_obj, (struct dt_rec *)tmp,
+ (struct dt_key *)&tmp->lsr_start, th);
+ if (rc != 0) {
+ CERROR("%s: insert range "DRANGE" failed: rc = %d\n",
+ fld->lsf_name, PRANGE(new_range), rc);
+ GOTO(out, rc);
+ }
+
+ flde = fld_cache_entry_create(new_range);
+ if (IS_ERR(flde))
+ GOTO(out, rc = PTR_ERR(flde));
+
+ write_lock(&fld->lsf_cache->fci_lock);
+ if (deleted)
+ fld_cache_delete_nolock(fld->lsf_cache, new_range);
+ rc = fld_cache_insert_nolock(fld->lsf_cache, flde);
+ write_unlock(&fld->lsf_cache->fci_lock);
+ if (rc)
+ OBD_FREE_PTR(flde);
+out:
+ RETURN(rc);
+}
+
+/**
+ * Lookup the range for a given seq. Note that here we only care about
+ * the start/end; the caller should handle the attached location data
+ * (flags, index).
+ *
+ * \param seq seq for lookup.
+ * \param range result of lookup.
+ *
+ * \retval 0 found, \a range is the matched range;
+ * \retval -ENOENT not found, \a range is the left-side range;
+ * \retval -ve other error;
+ */
+int fld_index_lookup(const struct lu_env *env, struct lu_server_fld *fld,
+ u64 seq, struct lu_seq_range *range)
+{
+ struct lu_seq_range *fld_rec;
+ struct fld_thread_info *info;
+ int rc;
+
+ ENTRY;
+
+ info = lu_context_key_get(&env->le_ctx, &fld_thread_key);
+ fld_rec = &info->fti_rec;
+
+ rc = fld_cache_lookup(fld->lsf_cache, seq, fld_rec);
+ if (rc == 0) {
+ *range = *fld_rec;
+ if (lu_seq_range_within(range, seq))
+ rc = 0;
+ else
+ rc = -ENOENT;
+ }
+
+ CDEBUG(D_INFO, "%s: lookup seq = %#llx range : "DRANGE" rc = %d\n",
+ fld->lsf_name, seq, PRANGE(range), rc);
+
+ RETURN(rc);
+}
+
+/**
+ * Insert entry in fld store.
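+ *
+ * This covers the full transaction cycle: create the handle, declare
+ * the change, start the local transaction, execute it, and stop the
+ * handle; -EEXIST from either the declare or the create phase is
+ * treated as success because the entry is already present.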
+ * + * \param env relevant lu_env + * \param fld fld store + * \param range range to be inserted + * + * \retval 0 success + * \retval -ve error + * + * Caller must hold fld->lsf_lock + **/ + +int fld_insert_entry(const struct lu_env *env, + struct lu_server_fld *fld, + const struct lu_seq_range *range) +{ + struct thandle *th; + struct dt_device *dt = lu2dt_dev(fld->lsf_obj->do_lu.lo_dev); + int rc; + + ENTRY; + + LASSERT(mutex_is_locked(&fld->lsf_lock)); + + if (dt->dd_rdonly) + RETURN(0); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = fld_declare_index_create(env, fld, range, th); + if (rc != 0) { + if (rc == -EEXIST) + rc = 0; + GOTO(out, rc); + } + + rc = dt_trans_start_local(env, dt, th); + if (rc) + GOTO(out, rc); + + rc = fld_index_create(env, fld, range, th); + if (rc == -EEXIST) + rc = 0; +out: + dt_trans_stop(env, dt, th); + RETURN(rc); +} +EXPORT_SYMBOL(fld_insert_entry); + +static int fld_insert_special_entries(const struct lu_env *env, + struct lu_server_fld *fld) +{ + int rc; + + rc = fld_insert_entry(env, fld, &IGIF_FLD_RANGE); + if (rc != 0) + RETURN(rc); + + rc = fld_insert_entry(env, fld, &DOT_LUSTRE_FLD_RANGE); + if (rc != 0) + RETURN(rc); + + rc = fld_insert_entry(env, fld, &ROOT_FLD_RANGE); + + RETURN(rc); +} + +int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld, + struct dt_device *dt, int type) +{ + struct dt_object *dt_obj = NULL; + struct lu_fid fid; + struct lu_attr *attr = NULL; + struct lu_seq_range *range = NULL; + struct fld_thread_info *info; + struct dt_object_format dof; + struct dt_it *it; + const struct dt_it_ops *iops; + int rc; + u32 index; + int range_count = 0; + + ENTRY; + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + LASSERT(info != NULL); + + lu_local_obj_fid(&fid, FLD_INDEX_OID); + OBD_ALLOC_PTR(attr); + if (!attr) + RETURN(-ENOMEM); + + memset(attr, 0, sizeof(*attr)); + attr->la_valid = LA_MODE; + attr->la_mode = S_IFREG | 0666; + dof.dof_type = DFT_INDEX; + dof.u.dof_idx.di_feat = &fld_index_features; + + dt_obj = dt_locate(env, dt, &fid); + if (IS_ERR(dt_obj)) { + rc = PTR_ERR(dt_obj); + dt_obj = NULL; + GOTO(out, rc); + } + + LASSERT(dt_obj != NULL); + if (!dt_object_exists(dt_obj)) { + dt_object_put(env, dt_obj); + dt_obj = dt_find_or_create(env, dt, &fid, &dof, attr); + fld->lsf_new = 1; + if (IS_ERR(dt_obj)) { + rc = PTR_ERR(dt_obj); + CERROR("%s: Can't find \"%s\" obj %d\n", fld->lsf_name, + fld_index_name, rc); + dt_obj = NULL; + GOTO(out, rc); + } + } + + fld->lsf_obj = dt_obj; + rc = dt_obj->do_ops->do_index_try(env, dt_obj, &fld_index_features); + if (rc != 0) { + CERROR("%s: File \"%s\" is not an index: rc = %d!\n", + fld->lsf_name, fld_index_name, rc); + GOTO(out, rc); + } + + range = &info->fti_rec; + /* Load fld entry to cache */ + iops = &dt_obj->do_index_ops->dio_it; + it = iops->init(env, dt_obj, 0); + if (IS_ERR(it)) + GOTO(out, rc = PTR_ERR(it)); + + rc = iops->load(env, it, 0); + if (rc > 0) + rc = 0; + else if (rc == 0) + rc = iops->next(env, it); + + if (rc < 0) + GOTO(out_it_fini, rc); + + while (rc == 0) { + rc = iops->rec(env, it, (struct dt_rec *)range, 0); + if (rc != 0) + GOTO(out_it_put, rc); + + range_be_to_cpu(range, range); + + /* + * Newly created ldiskfs IAM indexes may include a + * zeroed-out key and record. Ignore it here. 
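+ * Such an entry shows up with lsr_start == lsr_end == 0, so the
+ * lsr_start < lsr_end check below filters it out and it is never
+ * inserted into the cache.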
+ */ + if (range->lsr_start < range->lsr_end) { + rc = fld_cache_insert(fld->lsf_cache, range); + if (rc != 0) + GOTO(out_it_put, rc); + + range_count++; + } + + rc = iops->next(env, it); + if (rc < 0) + GOTO(out_it_fini, rc); + } + + if (range_count == 0) + fld->lsf_new = 1; + + rc = fld_name_to_index(fld->lsf_name, &index); + if (rc < 0) + GOTO(out_it_put, rc); + else + rc = 0; + + if (index == 0 && type == LU_SEQ_RANGE_MDT) { + /* + * Note: fld_insert_entry will detect whether these + * special entries already exist inside FLDB + */ + mutex_lock(&fld->lsf_lock); + rc = fld_insert_special_entries(env, fld); + mutex_unlock(&fld->lsf_lock); + if (rc != 0) { + CERROR("%s: insert special entries failed!: rc = %d\n", + fld->lsf_name, rc); + GOTO(out_it_put, rc); + } + } +out_it_put: + iops->put(env, it); +out_it_fini: + iops->fini(env, it); +out: + if (attr) + OBD_FREE_PTR(attr); + + if (rc < 0) { + if (dt_obj) + dt_object_put(env, dt_obj); + fld->lsf_obj = NULL; + } + RETURN(rc); +} + +void fld_index_fini(const struct lu_env *env, struct lu_server_fld *fld) +{ + ENTRY; + if (fld->lsf_obj) { + if (!IS_ERR(fld->lsf_obj)) + dt_object_put(env, fld->lsf_obj); + fld->lsf_obj = NULL; + } + EXIT; +} + +int fld_server_read(const struct lu_env *env, struct lu_server_fld *fld, + struct lu_seq_range *range, void *data, int data_len) +{ + struct lu_seq_range_array *lsra = data; + struct fld_thread_info *info; + struct dt_object *dt_obj = fld->lsf_obj; + struct lu_seq_range *entry; + struct dt_it *it; + const struct dt_it_ops *iops; + int rc; + + ENTRY; + + lsra->lsra_count = 0; + iops = &dt_obj->do_index_ops->dio_it; + it = iops->init(env, dt_obj, 0); + if (IS_ERR(it)) + RETURN(PTR_ERR(it)); + + rc = iops->load(env, it, range->lsr_end); + if (rc <= 0) + GOTO(out_it_fini, rc); + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + LASSERT(info != NULL); + entry = &info->fti_rec; + do { + rc = iops->rec(env, it, (struct dt_rec *)entry, 0); + if (rc != 0) + GOTO(out_it_put, rc); + + if (offsetof(typeof(*lsra), lsra_lsr[lsra->lsra_count + 1]) > + data_len) + GOTO(out, rc = -EAGAIN); + + range_be_to_cpu(entry, entry); + if (entry->lsr_index == range->lsr_index && + entry->lsr_flags == range->lsr_flags && + entry->lsr_start > range->lsr_start) { + lsra->lsra_lsr[lsra->lsra_count] = *entry; + lsra->lsra_count++; + } + + rc = iops->next(env, it); + } while (rc == 0); + if (rc > 0) + rc = 0; +out: + range_array_cpu_to_le(lsra, lsra); +out_it_put: + iops->put(env, it); +out_it_fini: + iops->fini(env, it); + + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_internal.h b/drivers/staging/lustrefsx/lustre/fld/fld_internal.h new file mode 100644 index 0000000000000..84c0f92ac21f4 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fld/fld_internal.h @@ -0,0 +1,214 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2015, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lustre/fld/fld_internal.h
+ *
+ * Subsystem Description:
+ * FLD is the FID Location Database, which stores where (i.e., on which
+ * MDT) FIDs are located.
+ * The database is basically a record file; each record consists of a FID
+ * sequence range, an MDT/OST index, and flags. The FLD for the whole FS
+ * is only stored on the sequence controller (MDT0) right now, but each
+ * target also has its local FLD, which only stores the local sequences.
+ *
+ * The FLD subsystem usually has two tasks:
+ * 1. Maintain the database, i.e. when the sequence controller allocates
+ * new sequence ranges to some nodes, it will call the FLD API to insert the
+ * location information in the FLDB.
+ *
+ * 2. Handle requests from other nodes, i.e. if a client needs to know where
+ * a FID is located and cannot find the information in its local cache, it
+ * will send a FLD lookup RPC to the FLD service, and the FLD service will
+ * look up the FLDB entry and return the location information to the client.
+ *
+ * Author: Yury Umanets
+ * Author: Tom WangDi
+ */
+#ifndef __FLD_INTERNAL_H
+#define __FLD_INTERNAL_H
+
+#include
+#include
+#include
+
+struct fld_stats {
+ __u64 fst_count;
+ __u64 fst_cache;
+};
+
+struct lu_fld_hash {
+ const char *fh_name;
+ int (*fh_hash_func)(struct lu_client_fld *fld,
+ __u64 seq);
+ struct lu_fld_target * (*fh_scan_func)(struct lu_client_fld *fld,
+ __u64 seq);
+};
+
+struct fld_cache_entry {
+ struct list_head fce_lru;
+ struct list_head fce_list;
+ /**
+ * fld cache entries are sorted on range->lsr_start field. */
+ struct lu_seq_range fce_range;
+};
+
+struct fld_cache {
+ /**
+ * Cache guard; protects the mutable list and counter fields, as
+ * the other fields are immutable after init is finished.
+ */
+ rwlock_t fci_lock;
+
+ /**
+ * Cache shrink threshold */
+ int fci_threshold;
+
+ /**
+ * Preferred number of cached entries */
+ int fci_cache_size;
+
+ /**
+ * Current number of cached entries. Protected by \a fci_lock */
+ int fci_cache_count;
+
+ /**
+ * LRU list fld entries. */
+ struct list_head fci_lru;
+
+ /**
+ * sorted fld entries. */
+ struct list_head fci_entries_head;
+
+ /**
+ * Cache statistics.
+ */
+ struct fld_stats fci_stat;
+
+ /**
+ * Cache name used for debug and messages.
+ */
+ char fci_name[LUSTRE_MDT_MAXNAMELEN];
+};
+
+enum {
+ /* 4M of FLD cache will not hurt client a lot. */
+ FLD_SERVER_CACHE_SIZE = (4 * 0x100000),
+
+ /* 1M of FLD cache will not hurt client a lot. */
+ FLD_CLIENT_CACHE_SIZE = (1 * 0x100000)
+};
+
+enum {
+ /* Cache threshold is 10 percent of size. */
+ FLD_SERVER_CACHE_THRESHOLD = 10,
+
+ /* Cache threshold is 10 percent of size.
*/ + FLD_CLIENT_CACHE_THRESHOLD = 10 +}; + +extern struct lu_fld_hash fld_hash[]; + +# ifdef HAVE_SERVER_SUPPORT +struct fld_thread_info { + struct lu_seq_range fti_rec; + struct lu_seq_range fti_lrange; + struct lu_seq_range fti_irange; +}; + +extern struct lu_context_key fld_thread_key; + +struct dt_device; +int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld, + struct dt_device *dt, int type); + +void fld_index_fini(const struct lu_env *env, struct lu_server_fld *fld); + +int fld_declare_index_create(const struct lu_env *env, + struct lu_server_fld *fld, + const struct lu_seq_range *new_range, + struct thandle *th); + +int fld_index_create(const struct lu_env *env, struct lu_server_fld *fld, + const struct lu_seq_range *new_range, struct thandle *th); + +int fld_index_lookup(const struct lu_env *env, struct lu_server_fld *fld, + u64 seq, struct lu_seq_range *range); + +int fld_name_to_index(const char *name, __u32 *index); + +int fld_server_mod_init(void); +void fld_server_mod_exit(void); + +int fld_server_read(const struct lu_env *env, struct lu_server_fld *fld, + struct lu_seq_range *range, void *data, int data_len); + +extern const struct file_operations fld_debugfs_seq_fops; +extern struct dentry *fld_debugfs_dir; + +# endif /* HAVE_SERVER_SUPPORT */ + +int fld_client_rpc(struct obd_export *exp, + struct lu_seq_range *range, __u32 fld_op, + struct ptlrpc_request **reqp); + +extern struct ldebugfs_vars fld_client_debugfs_list[]; + +struct fld_cache *fld_cache_init(const char *name, + int cache_size, int cache_threshold); + +void fld_cache_fini(struct fld_cache *cache); + +void fld_cache_flush(struct fld_cache *cache); + +int fld_cache_insert(struct fld_cache *cache, + const struct lu_seq_range *range); + +struct fld_cache_entry +*fld_cache_entry_create(const struct lu_seq_range *range); + +int fld_cache_insert_nolock(struct fld_cache *cache, + struct fld_cache_entry *f_new); +void fld_cache_delete_nolock(struct fld_cache *cache, + const struct lu_seq_range *range); +int fld_cache_lookup(struct fld_cache *cache, + const u64 seq, struct lu_seq_range *range); + +static inline const char * +fld_target_name(const struct lu_fld_target *tar) +{ +#ifdef HAVE_SERVER_SUPPORT + if (tar->ft_srv != NULL) + return tar->ft_srv->lsf_name; +#endif + + return tar->ft_exp->exp_obd->obd_name; +} + +#endif /* __FLD_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_request.c b/drivers/staging/lustrefsx/lustre/fld/fld_request.c new file mode 100644 index 0000000000000..e381eb87634fc --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fld/fld_request.c @@ -0,0 +1,544 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. 
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lustre/fld/fld_request.c
+ *
+ * FLD (Fids Location Database)
+ *
+ * Author: Yury Umanets
+ */
+
+#define DEBUG_SUBSYSTEM S_FLD
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "fld_internal.h"
+
+static int fld_rrb_hash(struct lu_client_fld *fld, u64 seq)
+{
+ LASSERT(fld->lcf_count > 0);
+ return do_div(seq, fld->lcf_count);
+}
+
+static struct lu_fld_target *
+fld_rrb_scan(struct lu_client_fld *fld, u64 seq)
+{
+ struct lu_fld_target *target;
+ int hash;
+
+ ENTRY;
+
+ /*
+ * Because almost all special sequences are located on MDT0,
+ * they should go to index 0 directly instead of through the
+ * hash calculation; this way, fld lookup requests (for
+ * sequences on MDT0) are not blocked when other MDTs are not
+ * yet connected.
+ */
+ if (fid_seq_is_norm(seq))
+ hash = fld_rrb_hash(fld, seq);
+ else
+ hash = 0;
+
+again:
+ list_for_each_entry(target, &fld->lcf_targets, ft_chain) {
+ if (target->ft_idx == hash)
+ RETURN(target);
+ }
+
+ if (hash != 0) {
+ /*
+ * It is possible that the remote target (MDT) is not
+ * connected to the client yet, so refer this to MDT0,
+ * which should have been connected during mount.
+ */
+ hash = 0;
+ goto again;
+ }
+
+ CERROR("%s: Can't find target by hash %d (seq %#llx). Targets (%d):\n",
+ fld->lcf_name, hash, seq, fld->lcf_count);
+
+ list_for_each_entry(target, &fld->lcf_targets, ft_chain) {
+ const char *srv_name = target->ft_srv != NULL ?
+ target->ft_srv->lsf_name : "";
+ const char *exp_name = target->ft_exp != NULL ?
+ (char *)target->ft_exp->exp_obd->obd_uuid.uuid :
+ "";
+
+ CERROR(" exp: 0x%p (%s), srv: 0x%p (%s), idx: %llu\n",
+ target->ft_exp, exp_name, target->ft_srv,
+ srv_name, target->ft_idx);
+ }
+
+ /*
+ * If the target is not found, there is a logical error anyway,
+ * so LBUG() here to catch this situation.
+ */
+ LBUG();
+ RETURN(NULL);
+}
+
+struct lu_fld_hash fld_hash[] = {
+ {
+ .fh_name = "RRB",
+ .fh_hash_func = fld_rrb_hash,
+ .fh_scan_func = fld_rrb_scan
+ },
+ {
+ NULL,
+ }
+};
+
+static struct lu_fld_target *
+fld_client_get_target(struct lu_client_fld *fld, u64 seq)
+{
+ struct lu_fld_target *target;
+
+ ENTRY;
+
+ LASSERT(fld->lcf_hash != NULL);
+
+ spin_lock(&fld->lcf_lock);
+ target = fld->lcf_hash->fh_scan_func(fld, seq);
+ spin_unlock(&fld->lcf_lock);
+
+ if (target) {
+ CDEBUG(D_INFO, "%s: Found target (idx %llu) by seq %#llx\n",
+ fld->lcf_name, target->ft_idx, seq);
+ }
+
+ RETURN(target);
+}
+
+/*
+ * Add an export to FLD. This is usually done by CMM and LMV, as they are
+ * the main users of the FLD module.
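+ *
+ * Adding a target whose index is already registered fails with
+ * -EEXIST; the export reference is only taken once the new target is
+ * actually linked into lcf_targets.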
+ */ +int fld_client_add_target(struct lu_client_fld *fld, + struct lu_fld_target *tar) +{ + const char *name; + struct lu_fld_target *target, *tmp; + + ENTRY; + + LASSERT(tar != NULL); + name = fld_target_name(tar); + LASSERT(name != NULL); + LASSERT(tar->ft_srv != NULL || tar->ft_exp != NULL); + + CDEBUG(D_INFO, "%s: Adding target %s (idx %llu)\n", fld->lcf_name, + name, tar->ft_idx); + + OBD_ALLOC_PTR(target); + if (!target) + RETURN(-ENOMEM); + + spin_lock(&fld->lcf_lock); + list_for_each_entry(tmp, &fld->lcf_targets, ft_chain) { + if (tmp->ft_idx == tar->ft_idx) { + spin_unlock(&fld->lcf_lock); + OBD_FREE_PTR(target); + CERROR("Target %s exists in FLD and known as %s:#%llu\n", + name, fld_target_name(tmp), tmp->ft_idx); + RETURN(-EEXIST); + } + } + + target->ft_exp = tar->ft_exp; + if (target->ft_exp) + class_export_get(target->ft_exp); + target->ft_srv = tar->ft_srv; + target->ft_idx = tar->ft_idx; + + list_add_tail(&target->ft_chain, &fld->lcf_targets); + + fld->lcf_count++; + spin_unlock(&fld->lcf_lock); + + RETURN(0); +} +EXPORT_SYMBOL(fld_client_add_target); + +/* Remove export from FLD */ +int fld_client_del_target(struct lu_client_fld *fld, u64 idx) +{ + struct lu_fld_target *target, *tmp; + + ENTRY; + + spin_lock(&fld->lcf_lock); + list_for_each_entry_safe(target, tmp, &fld->lcf_targets, ft_chain) { + if (target->ft_idx == idx) { + fld->lcf_count--; + list_del(&target->ft_chain); + spin_unlock(&fld->lcf_lock); + + if (target->ft_exp) + class_export_put(target->ft_exp); + + OBD_FREE_PTR(target); + RETURN(0); + } + } + spin_unlock(&fld->lcf_lock); + RETURN(-ENOENT); +} + +struct dentry *fld_debugfs_dir; + +static void fld_client_debugfs_init(struct lu_client_fld *fld) +{ + ENTRY; + fld->lcf_debugfs_entry = debugfs_create_dir(fld->lcf_name, + fld_debugfs_dir); + ldebugfs_add_vars(fld->lcf_debugfs_entry, + fld_client_debugfs_list, + fld); +} + +void fld_client_debugfs_fini(struct lu_client_fld *fld) +{ + debugfs_remove_recursive(fld->lcf_debugfs_entry); +} +EXPORT_SYMBOL(fld_client_debugfs_fini); + +static inline int hash_is_sane(int hash) +{ + return (hash >= 0 && hash < ARRAY_SIZE(fld_hash)); +} + +int fld_client_init(struct lu_client_fld *fld, + const char *prefix, int hash) +{ + int cache_size, cache_threshold; + int rc = 0; + + ENTRY; + snprintf(fld->lcf_name, sizeof(fld->lcf_name), + "cli-%s", prefix); + + if (!hash_is_sane(hash)) { + CERROR("%s: Wrong hash function %#x\n", + fld->lcf_name, hash); + RETURN(-EINVAL); + } + + fld->lcf_count = 0; + spin_lock_init(&fld->lcf_lock); + fld->lcf_hash = &fld_hash[hash]; + INIT_LIST_HEAD(&fld->lcf_targets); + + cache_size = FLD_CLIENT_CACHE_SIZE / + sizeof(struct fld_cache_entry); + + cache_threshold = cache_size * + FLD_CLIENT_CACHE_THRESHOLD / 100; + + fld->lcf_cache = fld_cache_init(fld->lcf_name, + cache_size, cache_threshold); + if (IS_ERR(fld->lcf_cache)) { + rc = PTR_ERR(fld->lcf_cache); + fld->lcf_cache = NULL; + GOTO(out, rc); + } + + fld_client_debugfs_init(fld); + EXIT; +out: + if (rc) + fld_client_fini(fld); + else + CDEBUG(D_INFO, "%s: Using \"%s\" hash\n", + fld->lcf_name, fld->lcf_hash->fh_name); + return rc; +} +EXPORT_SYMBOL(fld_client_init); + +void fld_client_fini(struct lu_client_fld *fld) +{ + struct lu_fld_target *target, *tmp; + + ENTRY; + + spin_lock(&fld->lcf_lock); + list_for_each_entry_safe(target, tmp, &fld->lcf_targets, ft_chain) { + fld->lcf_count--; + list_del(&target->ft_chain); + if (target->ft_exp) + class_export_put(target->ft_exp); + OBD_FREE_PTR(target); + } + spin_unlock(&fld->lcf_lock); + + if 
(fld->lcf_cache) {
+ if (!IS_ERR(fld->lcf_cache))
+ fld_cache_fini(fld->lcf_cache);
+ fld->lcf_cache = NULL;
+ }
+
+ EXIT;
+}
+EXPORT_SYMBOL(fld_client_fini);
+
+int fld_client_rpc(struct obd_export *exp,
+ struct lu_seq_range *range, u32 fld_op,
+ struct ptlrpc_request **reqp)
+{
+ struct ptlrpc_request *req = NULL;
+ struct lu_seq_range *prange;
+ u32 *op;
+ int rc = 0;
+ struct obd_import *imp;
+
+ ENTRY;
+
+ LASSERT(exp != NULL);
+
+ imp = class_exp2cliimp(exp);
+again:
+ switch (fld_op) {
+ case FLD_QUERY:
+ req = ptlrpc_request_alloc_pack(imp, &RQF_FLD_QUERY,
+ LUSTRE_MDS_VERSION, FLD_QUERY);
+ if (!req)
+ RETURN(-ENOMEM);
+
+ /*
+ * XXX: only needed when talking to an old server (< 2.6);
+ * it should be removed once servers < 2.6 are no longer
+ * supported.
+ */
+ op = req_capsule_client_get(&req->rq_pill, &RMF_FLD_OPC);
+ *op = FLD_LOOKUP;
+
+ /*
+ * An MDS_MDS seq lookup always uses the LWP connection, but
+ * the LWP will be evicted after a restart, which causes the
+ * request to fail. So set no_delay for the seq lookup
+ * request; once the request fails because of the eviction,
+ * always retry here.
+ */
+ if (imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS) {
+ req->rq_allow_replay = 1;
+ req->rq_no_delay = 1;
+ }
+ break;
+ case FLD_READ:
+ req = ptlrpc_request_alloc_pack(imp, &RQF_FLD_READ,
+ LUSTRE_MDS_VERSION, FLD_READ);
+ if (!req)
+ RETURN(-ENOMEM);
+
+ req_capsule_set_size(&req->rq_pill, &RMF_GENERIC_DATA,
+ RCL_SERVER, PAGE_SIZE);
+ break;
+ default:
+ rc = -EINVAL;
+ break;
+ }
+
+ if (rc != 0)
+ RETURN(rc);
+
+ prange = req_capsule_client_get(&req->rq_pill, &RMF_FLD_MDFLD);
+ *prange = *range;
+ ptlrpc_request_set_replen(req);
+ req->rq_request_portal = FLD_REQUEST_PORTAL;
+ req->rq_reply_portal = MDC_REPLY_PORTAL;
+ ptlrpc_at_set_req_timeout(req);
+
+ if (OBD_FAIL_CHECK(OBD_FAIL_FLD_QUERY_REQ) && req->rq_no_delay) {
+ /* the same error as returned by ptlrpc_import_delay_req() */
+ rc = -EAGAIN;
+ req->rq_status = rc;
+ } else {
+ rc = ptlrpc_queue_wait(req);
+ }
+
+ if (rc == -ENOENT) {
+ /* Don't loop forever on non-existing FID sequences.
*/
+ GOTO(out_req, rc);
+ }
+
+ if (rc != 0) {
+ if (imp->imp_state != LUSTRE_IMP_CLOSED &&
+ !imp->imp_deactive &&
+ imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS &&
+ OCD_HAS_FLAG(&imp->imp_connect_data, LIGHTWEIGHT) &&
+ rc != -ENOTSUPP) {
+ /* LWP is not replayable, retry after a while */
+ rc = -EAGAIN;
+ }
+ if (rc == -EAGAIN) {
+ ptlrpc_req_finished(req);
+ if (msleep_interruptible(2 * MSEC_PER_SEC))
+ GOTO(out_req, rc = -EINTR);
+ rc = 0;
+ goto again;
+ }
+ GOTO(out_req, rc);
+ }
+
+ if (fld_op == FLD_QUERY) {
+ prange = req_capsule_server_get(&req->rq_pill,
+ &RMF_FLD_MDFLD);
+ if (!prange)
+ GOTO(out_req, rc = -EFAULT);
+ *range = *prange;
+ }
+
+ EXIT;
+out_req:
+ if (rc != 0 || !reqp) {
+ ptlrpc_req_finished(req);
+ req = NULL;
+ }
+
+ if (reqp)
+ *reqp = req;
+
+ return rc;
+}
+
+int fld_client_lookup(struct lu_client_fld *fld, u64 seq, u32 *mds,
+ u32 flags, const struct lu_env *env)
+{
+ struct lu_seq_range res = { 0 };
+ struct lu_fld_target *target;
+ struct lu_fld_target *origin;
+ int rc;
+
+ ENTRY;
+
+ rc = fld_cache_lookup(fld->lcf_cache, seq, &res);
+ if (rc == 0) {
+ *mds = res.lsr_index;
+ RETURN(0);
+ }
+
+ /* Can not find it in the cache */
+ target = fld_client_get_target(fld, seq);
+ LASSERT(target != NULL);
+ origin = target;
+again:
+ CDEBUG(D_INFO, "%s: Lookup fld entry (seq: %#llx) on target %s (idx %llu)\n",
+ fld->lcf_name, seq, fld_target_name(target), target->ft_idx);
+
+ res.lsr_start = seq;
+ fld_range_set_type(&res, flags);
+
+#ifdef HAVE_SERVER_SUPPORT
+ if (target->ft_srv) {
+ LASSERT(env != NULL);
+ rc = fld_server_lookup(env, target->ft_srv, seq, &res);
+ } else
+#endif /* HAVE_SERVER_SUPPORT */
+ {
+ rc = fld_client_rpc(target->ft_exp, &res, FLD_QUERY, NULL);
+ }
+
+ if (rc == -ESHUTDOWN) {
+ /*
+ * If the fld lookup failed because the target has been
+ * shut down, try the next target in the list, until all
+ * targets have been tried or the lookup succeeds.
+ */
+ spin_lock(&fld->lcf_lock);
+ /*
+ * If the next entry in the list is the head of the list,
+ * move to the entry after the head and retrieve the
+ * target. Otherwise, retrieve the next target entry.
+ */
+ if (target->ft_chain.next == &fld->lcf_targets)
+ target = list_entry(target->ft_chain.next->next,
+ struct lu_fld_target, ft_chain);
+ else
+ target = list_entry(target->ft_chain.next,
+ struct lu_fld_target,
+ ft_chain);
+ spin_unlock(&fld->lcf_lock);
+ if (target != origin)
+ goto again;
+ }
+ if (rc == 0) {
+ *mds = res.lsr_index;
+ fld_cache_insert(fld->lcf_cache, &res);
+ }
+
+ RETURN(rc);
+}
+EXPORT_SYMBOL(fld_client_lookup);
+
+void fld_client_flush(struct lu_client_fld *fld)
+{
+ fld_cache_flush(fld->lcf_cache);
+}
+
+static int __init fld_init(void)
+{
+#ifdef HAVE_SERVER_SUPPORT
+ int rc;
+
+ rc = fld_server_mod_init();
+ if (rc)
+ return rc;
+#endif /* HAVE_SERVER_SUPPORT */
+
+ fld_debugfs_dir = debugfs_create_dir(LUSTRE_FLD_NAME,
+ debugfs_lustre_root);
+ return PTR_ERR_OR_ZERO(fld_debugfs_dir);
+}
+
+static void __exit fld_exit(void)
+{
+#ifdef HAVE_SERVER_SUPPORT
+ fld_server_mod_exit();
+#endif /* HAVE_SERVER_SUPPORT */
+
+ debugfs_remove_recursive(fld_debugfs_dir);
+}
+
+MODULE_AUTHOR("OpenSFS, Inc.
"); +MODULE_DESCRIPTION("Lustre FID Location Database"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(fld_init); +module_exit(fld_exit); diff --git a/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c b/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c new file mode 100644 index 0000000000000..91641015c94bd --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c @@ -0,0 +1,357 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/fld/lproc_fld.c + * + * FLD (FIDs Location Database) + * + * Author: Yury Umanets + * Di Wang + */ + +#define DEBUG_SUBSYSTEM S_FLD + +#include +#include + +#ifdef HAVE_SERVER_SUPPORT +#include +#endif +#include +#include +#include +#include "fld_internal.h" + +static int +fld_debugfs_targets_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_fld *fld = (struct lu_client_fld *)m->private; + struct lu_fld_target *target; + + ENTRY; + spin_lock(&fld->lcf_lock); + list_for_each_entry(target, &fld->lcf_targets, ft_chain) + seq_printf(m, "%s\n", fld_target_name(target)); + spin_unlock(&fld->lcf_lock); + + RETURN(0); +} + +static int +fld_debugfs_hash_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_fld *fld = (struct lu_client_fld *)m->private; + + ENTRY; + spin_lock(&fld->lcf_lock); + seq_printf(m, "%s\n", fld->lcf_hash->fh_name); + spin_unlock(&fld->lcf_lock); + + RETURN(0); +} + +static ssize_t +fld_debugfs_hash_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct lu_client_fld *fld = m->private; + struct lu_fld_hash *hash = NULL; + char fh_name[8]; + int i; + + if (count > sizeof(fh_name)) + return -ENAMETOOLONG; + + if (copy_from_user(fh_name, buffer, count) != 0) + return -EFAULT; + + for (i = 0; fld_hash[i].fh_name; i++) { + if (count != strlen(fld_hash[i].fh_name)) + continue; + + if (!strncmp(fld_hash[i].fh_name, fh_name, count)) { + hash = &fld_hash[i]; + break; + } + } + + if (hash) { + spin_lock(&fld->lcf_lock); + fld->lcf_hash = hash; + spin_unlock(&fld->lcf_lock); + + CDEBUG(D_INFO, "%s: Changed hash to \"%s\"\n", + fld->lcf_name, hash->fh_name); + } + + return count; +} + +static ssize_t ldebugfs_cache_flush_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *pos) +{ + struct seq_file *m = file->private_data; + struct lu_client_fld *fld = m->private; + + ENTRY; + fld_cache_flush(fld->lcf_cache); + + CDEBUG(D_INFO, "%s: Lookup cache is flushed\n", 
fld->lcf_name); + + RETURN(count); +} + +LDEBUGFS_SEQ_FOPS_RO(fld_debugfs_targets); +LDEBUGFS_SEQ_FOPS(fld_debugfs_hash); +LDEBUGFS_FOPS_WR_ONLY(fld, cache_flush); + +struct ldebugfs_vars fld_client_debugfs_list[] = { + { .name = "targets", + .fops = &fld_debugfs_targets_fops }, + { .name = "hash", + .fops = &fld_debugfs_hash_fops }, + { .name = "cache_flush", + .fops = &fld_cache_flush_fops }, + { NULL } +}; + +#ifdef HAVE_SERVER_SUPPORT +struct fld_seq_param { + struct lu_env fsp_env; + struct dt_it *fsp_it; + struct lu_server_fld *fsp_fld; + unsigned int fsp_stop:1; +}; + +static void *fldb_seq_start(struct seq_file *p, loff_t *pos) +{ + struct fld_seq_param *param = p->private; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + struct dt_key *key; + int rc; + + if (param == NULL || param->fsp_stop) + return NULL; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + rc = iops->load(¶m->fsp_env, param->fsp_it, *pos); + if (rc <= 0) + return NULL; + + key = iops->key(¶m->fsp_env, param->fsp_it); + if (IS_ERR(key)) + return NULL; + + *pos = be64_to_cpu(*(__u64 *)key); + + return param; +} + +static void fldb_seq_stop(struct seq_file *p, void *v) +{ + struct fld_seq_param *param = p->private; + const struct dt_it_ops *iops; + struct lu_server_fld *fld; + struct dt_object *obj; + + if (param == NULL) + return; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + iops->put(¶m->fsp_env, param->fsp_it); +} + +static void *fldb_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct fld_seq_param *param = p->private; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + int rc; + + ++*pos; + if (param == NULL || param->fsp_stop) + return NULL; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + rc = iops->next(¶m->fsp_env, param->fsp_it); + if (rc > 0) { + param->fsp_stop = 1; + return NULL; + } + + *pos = be64_to_cpu(*(__u64 *)iops->key(¶m->fsp_env, param->fsp_it)); + return param; +} + +static int fldb_seq_show(struct seq_file *p, void *v) +{ + struct fld_seq_param *param = p->private; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + struct fld_thread_info *info; + struct lu_seq_range *fld_rec; + int rc; + + if (param == NULL || param->fsp_stop) + return 0; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + info = lu_context_key_get(¶m->fsp_env.le_ctx, + &fld_thread_key); + fld_rec = &info->fti_rec; + rc = iops->rec(¶m->fsp_env, param->fsp_it, + (struct dt_rec *)fld_rec, 0); + if (rc != 0) { + CERROR("%s:read record error: rc %d\n", + fld->lsf_name, rc); + } else if (fld_rec->lsr_start != 0) { + range_be_to_cpu(fld_rec, fld_rec); + seq_printf(p, DRANGE"\n", PRANGE(fld_rec)); + } + + return rc; +} + +static const struct seq_operations fldb_sops = { + .start = fldb_seq_start, + .stop = fldb_seq_stop, + .next = fldb_seq_next, + .show = fldb_seq_show, +}; + +static int fldb_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + struct lu_server_fld *fld = inode->i_private; + struct dt_object *obj; + const struct dt_it_ops *iops; + struct fld_seq_param *param = NULL; + int env_init = 0; + int rc; + + rc = seq_open(file, &fldb_sops); + if (rc) + GOTO(out, rc); + + obj = fld->lsf_obj; + if (obj == NULL) { + seq = 
file->private_data; + seq->private = NULL; + return 0; + } + + OBD_ALLOC_PTR(param); + if (param == NULL) + GOTO(out, rc = -ENOMEM); + + rc = lu_env_init(¶m->fsp_env, LCT_MD_THREAD); + if (rc != 0) + GOTO(out, rc); + + env_init = 1; + iops = &obj->do_index_ops->dio_it; + param->fsp_it = iops->init(¶m->fsp_env, obj, 0); + if (IS_ERR(param->fsp_it)) + GOTO(out, rc = PTR_ERR(param->fsp_it)); + + param->fsp_fld = fld; + param->fsp_stop = 0; + + seq = file->private_data; + seq->private = param; +out: + if (rc != 0) { + if (env_init == 1) + lu_env_fini(¶m->fsp_env); + if (param != NULL) + OBD_FREE_PTR(param); + } + return rc; +} + +static int fldb_seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct fld_seq_param *param; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + + param = seq->private; + if (param == NULL) { + lprocfs_seq_release(inode, file); + return 0; + } + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + LASSERT(iops != NULL); + LASSERT(param->fsp_it != NULL); + iops->fini(¶m->fsp_env, param->fsp_it); + lu_env_fini(¶m->fsp_env); + OBD_FREE_PTR(param); + lprocfs_seq_release(inode, file); + + return 0; +} + +const struct file_operations fld_debugfs_seq_fops = { + .owner = THIS_MODULE, + .open = fldb_seq_open, + .read = seq_read, + .release = fldb_seq_release, +}; + +# endif /* HAVE_SERVER_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/include/cl_object.h b/drivers/staging/lustrefsx/lustre/include/cl_object.h new file mode 100644 index 0000000000000..4d2d059e6243f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/cl_object.h @@ -0,0 +1,2710 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ +#ifndef _LUSTRE_CL_OBJECT_H +#define _LUSTRE_CL_OBJECT_H + +/** \defgroup clio clio + * + * Client objects implement io operations and cache pages. + * + * Examples: lov and osc are implementations of cl interface. + * + * Big Theory Statement. + * + * Layered objects. + * + * Client implementation is based on the following data-types: + * + * - cl_object + * + * - cl_page + * + * - cl_lock represents an extent lock on an object. + * + * - cl_io represents high-level i/o activity such as whole read/write + * system call, or write-out of pages from under the lock being + * canceled. cl_io has sub-ios that can be stopped and resumed + * independently, thus achieving high degree of transfer + * parallelism. 
Single cl_io can be advanced forward by + * the multiple threads (although in the most usual case of + * read/write system call it is associated with the single user + * thread, that issued the system call). + * + * Terminology + * + * - to avoid confusion high-level I/O operation like read or write system + * call is referred to as "an io", whereas low-level I/O operation, like + * RPC, is referred to as "a transfer" + * + * - "generic code" means generic (not file system specific) code in the + * hosting environment. "cl-code" means code (mostly in cl_*.c files) that + * is not layer specific. + * + * Locking. + * + * - i_mutex + * - PG_locked + * - cl_object_header::coh_page_guard + * - lu_site::ls_guard + * + * See the top comment in cl_object.c for the description of overall locking and + * reference-counting design. + * + * See comments below for the description of i/o, page, and dlm-locking + * design. + * + * @{ + */ + +/* + * super-class definitions. + */ +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct obd_info; +struct inode; + +struct cl_device; + +struct cl_object; + +struct cl_page; +struct cl_page_slice; +struct cl_lock; +struct cl_lock_slice; + +struct cl_lock_operations; +struct cl_page_operations; + +struct cl_io; +struct cl_io_slice; + +struct cl_req_attr; + +/** + * Device in the client stack. + * + * \see vvp_device, lov_device, lovsub_device, osc_device + */ +struct cl_device { + /** Super-class. */ + struct lu_device cd_lu_dev; +}; + +/** \addtogroup cl_object cl_object + * @{ */ +/** + * "Data attributes" of cl_object. Data attributes can be updated + * independently for a sub-object, and top-object's attributes are calculated + * from sub-objects' ones. + */ +struct cl_attr { + /** Object size, in bytes */ + loff_t cat_size; + /** + * Known minimal size, in bytes. + * + * This is only valid when at least one DLM lock is held. + */ + loff_t cat_kms; + /** Modification time. Measured in seconds since epoch. */ + time64_t cat_mtime; + /** Access time. Measured in seconds since epoch. */ + time64_t cat_atime; + /** Change time. Measured in seconds since epoch. */ + time64_t cat_ctime; + /** + * Blocks allocated to this cl_object on the server file system. + * + * \todo XXX An interface for block size is needed. + */ + __u64 cat_blocks; + /** + * User identifier for quota purposes. + */ + uid_t cat_uid; + /** + * Group identifier for quota purposes. + */ + gid_t cat_gid; + + /* nlink of the directory */ + __u64 cat_nlink; + + /* Project identifier for quota purpose. */ + __u32 cat_projid; +}; + +/** + * Fields in cl_attr that are being set. + */ +enum cl_attr_valid { + CAT_SIZE = BIT(0), + CAT_KMS = BIT(1), + CAT_MTIME = BIT(3), + CAT_ATIME = BIT(4), + CAT_CTIME = BIT(5), + CAT_BLOCKS = BIT(6), + CAT_UID = BIT(7), + CAT_GID = BIT(8), + CAT_PROJID = BIT(9), +}; + +/** + * Sub-class of lu_object with methods common for objects on the client + * stacks. + * + * cl_object: represents a regular file system object, both a file and a + * stripe. cl_object is based on lu_object: it is identified by a fid, + * layered, cached, hashed, and lrued. Important distinction with the server + * side, where md_object and dt_object are used, is that cl_object "fans out" + * at the lov/sns level: depending on the file layout, single file is + * represented as a set of "sub-objects" (stripes). At the implementation + * level, struct lov_object contains an array of cl_objects. 
Each sub-object + * is a full-fledged cl_object, having its fid, living in the lru and hash + * table. + * + * This leads to the next important difference with the server side: on the + * client, it's quite usual to have objects with the different sequence of + * layers. For example, typical top-object is composed of the following + * layers: + * + * - vvp + * - lov + * + * whereas its sub-objects are composed of + * + * - lovsub + * - osc + * + * layers. Here "lovsub" is a mostly dummy layer, whose purpose is to keep + * track of the object-subobject relationship. + * + * Sub-objects are not cached independently: when top-object is about to + * be discarded from the memory, all its sub-objects are torn-down and + * destroyed too. + * + * \see vvp_object, lov_object, lovsub_object, osc_object + */ +struct cl_object { + /** super class */ + struct lu_object co_lu; + /** per-object-layer operations */ + const struct cl_object_operations *co_ops; + /** offset of page slice in cl_page buffer */ + int co_slice_off; +}; + +/** + * Description of the client object configuration. This is used for the + * creation of a new client object that is identified by a more state than + * fid. + */ +struct cl_object_conf { + /** Super-class. */ + struct lu_object_conf coc_lu; + union { + /** + * Object layout. This is consumed by lov. + */ + struct lu_buf coc_layout; + /** + * Description of particular stripe location in the + * cluster. This is consumed by osc. + */ + struct lov_oinfo *coc_oinfo; + } u; + /** + * VFS inode. This is consumed by vvp. + */ + struct inode *coc_inode; + /** + * Layout lock handle. + */ + struct ldlm_lock *coc_lock; + /** + * Operation to handle layout, OBJECT_CONF_XYZ. + */ + int coc_opc; +}; + +enum { + /** configure layout, set up a new stripe, must be called while + * holding layout lock. */ + OBJECT_CONF_SET = 0, + /** invalidate the current stripe configuration due to losing + * layout lock. */ + OBJECT_CONF_INVALIDATE = 1, + /** wait for old layout to go away so that new layout can be + * set up. */ + OBJECT_CONF_WAIT = 2 +}; + +enum { + CL_LAYOUT_GEN_NONE = (u32)-2, /* layout lock was cancelled */ + CL_LAYOUT_GEN_EMPTY = (u32)-1, /* for empty layout */ +}; + +struct cl_layout { + /** the buffer to return the layout in lov_mds_md format. */ + struct lu_buf cl_buf; + /** size of layout in lov_mds_md format. */ + size_t cl_size; + /** Layout generation. */ + u32 cl_layout_gen; + /** whether layout is a composite one */ + bool cl_is_composite; + /** Whether layout is a HSM released one */ + bool cl_is_released; +}; + +/** + * Operations implemented for each cl object layer. + * + * \see vvp_ops, lov_ops, lovsub_ops, osc_ops + */ +struct cl_object_operations { + /** + * Initialize page slice for this layer. Called top-to-bottom through + * every object layer when a new cl_page is instantiated. Layer + * keeping private per-page data, or requiring its own page operations + * vector should allocate these data here, and attach then to the page + * by calling cl_page_slice_add(). \a vmpage is locked (in the VM + * sense). Optional. + * + * \retval NULL success. + * + * \retval ERR_PTR(errno) failure code. + * + * \retval valid-pointer pointer to already existing referenced page + * to be used instead of newly created. + */ + int (*coo_page_init)(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index); + /** + * Initialize lock slice for this layer. Called top-to-bottom through + * every object layer when a new cl_lock is instantiated. 
Layer + * keeping private per-lock data, or requiring its own lock operations + * vector should allocate these data here, and attach then to the lock + * by calling cl_lock_slice_add(). Mandatory. + */ + int (*coo_lock_init)(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *io); + /** + * Initialize io state for a given layer. + * + * called top-to-bottom once per io existence to initialize io + * state. If layer wants to keep some state for this type of io, it + * has to embed struct cl_io_slice in lu_env::le_ses, and register + * slice with cl_io_slice_add(). It is guaranteed that all threads + * participating in this io share the same session. + */ + int (*coo_io_init)(const struct lu_env *env, + struct cl_object *obj, struct cl_io *io); + /** + * Fill portion of \a attr that this layer controls. This method is + * called top-to-bottom through all object layers. + * + * \pre cl_object_header::coh_attr_guard of the top-object is locked. + * + * \return 0: to continue + * \return +ve: to stop iterating through layers (but 0 is returned + * from enclosing cl_object_attr_get()) + * \return -ve: to signal error + */ + int (*coo_attr_get)(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr); + /** + * Update attributes. + * + * \a valid is a bitmask composed from enum #cl_attr_valid, and + * indicating what attributes are to be set. + * + * \pre cl_object_header::coh_attr_guard of the top-object is locked. + * + * \return the same convention as for + * cl_object_operations::coo_attr_get() is used. + */ + int (*coo_attr_update)(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid); + /** + * Update object configuration. Called top-to-bottom to modify object + * configuration. + * + * XXX error conditions and handling. + */ + int (*coo_conf_set)(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf); + /** + * Glimpse ast. Executed when glimpse ast arrives for a lock on this + * object. Layers are supposed to fill parts of \a lvb that will be + * shipped to the glimpse originator as a glimpse result. + * + * \see vvp_object_glimpse(), lovsub_object_glimpse(), + * \see osc_object_glimpse() + */ + int (*coo_glimpse)(const struct lu_env *env, + const struct cl_object *obj, struct ost_lvb *lvb); + /** + * Object prune method. Called when the layout is going to change on + * this object, therefore each layer has to clean up their cache, + * mainly pages and locks. + */ + int (*coo_prune)(const struct lu_env *env, struct cl_object *obj); + /** + * Object getstripe method. + */ + int (*coo_getstripe)(const struct lu_env *env, struct cl_object *obj, + struct lov_user_md __user *lum, size_t size); + /** + * Get FIEMAP mapping from the object. + */ + int (*coo_fiemap)(const struct lu_env *env, struct cl_object *obj, + struct ll_fiemap_info_key *fmkey, + struct fiemap *fiemap, size_t *buflen); + /** + * Get layout and generation of the object. + */ + int (*coo_layout_get)(const struct lu_env *env, struct cl_object *obj, + struct cl_layout *layout); + /** + * Get maximum size of the object. + */ + loff_t (*coo_maxbytes)(struct cl_object *obj); + /** + * Set request attributes. + */ + void (*coo_req_attr_set)(const struct lu_env *env, + struct cl_object *obj, + struct cl_req_attr *attr); + /** + * Flush \a obj data corresponding to \a lock. Used for DoM + * locks in llite's cancelling blocking ast callback. 
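+ * DoM (Data-on-MDT) locks protect file data that lives on the MDT,
+ * so that data has to be flushed before the lock is cancelled.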
+	 */
+	int (*coo_object_flush)(const struct lu_env *env,
+				struct cl_object *obj,
+				struct ldlm_lock *lock);
+};
+
+/**
+ * Extended header for client object.
+ */
+struct cl_object_header {
+	/** Standard lu_object_header. cl_object::co_lu::lo_header points
+	 * here. */
+	struct lu_object_header	 coh_lu;
+
+	/**
+	 * Parent object. It is assumed that an object has a well-defined
+	 * parent, but not a well-defined child (there may be multiple
+	 * sub-objects, for the same top-object). cl_object_header::coh_parent
+	 * field allows certain code to be written generically, without
+	 * limiting possible cl_object layouts unduly.
+	 */
+	struct cl_object_header	*coh_parent;
+	/**
+	 * Protects consistency between cl_attr of the parent object and the
+	 * attributes of sub-objects, from which the former is calculated
+	 * ("merged").
+	 *
+	 * \todo XXX this can be a read/write lock if needed.
+	 */
+	spinlock_t		 coh_attr_guard;
+	/**
+	 * Size of cl_page + page slices
+	 */
+	unsigned short		 coh_page_bufsize;
+	/**
+	 * Number of objects above this one: 0 for a top-object, 1 for its
+	 * sub-object, etc.
+	 */
+	unsigned char		 coh_nesting;
+};
+
+/**
+ * Helper macro: iterate over all layers of the object \a obj, assigning every
+ * layer top-to-bottom to \a slice.
+ */
+#define cl_object_for_each(slice, obj)				\
+	list_for_each_entry((slice),				\
+			    &(obj)->co_lu.lo_header->loh_layers,\
+			    co_lu.lo_linkage)
+
+/**
+ * Helper macro: iterate over all layers of the object \a obj, assigning every
+ * layer bottom-to-top to \a slice.
+ */
+#define cl_object_for_each_reverse(slice, obj)			\
+	list_for_each_entry_reverse((slice),			\
+				    &(obj)->co_lu.lo_header->loh_layers,\
+				    co_lu.lo_linkage)
+
+/** @} cl_object */
+
+#define CL_PAGE_EOF ((pgoff_t)~0ull)
+
+/** \addtogroup cl_page cl_page
+ * @{ */
+
+/** \struct cl_page
+ * Layered client page.
+
+ * cl_page: represents a portion of a file, cached in memory. All pages
+ * of the given file are of the same size, and are kept in the radix tree
+ * hanging off the cl_object. cl_page doesn't fan out, but as sub-objects
+ * of the top-level file object are first class cl_objects, they have their
+ * own radix trees of pages and hence a page is implemented as a sequence of
+ * struct cl_page's, linked into a double-linked list through
+ * cl_page::cp_parent and cl_page::cp_child pointers, each residing in the
+ * corresponding radix tree at the corresponding logical offset.
+ *
+ * cl_page is associated with a VM page of the hosting environment (struct
+ * page in the Linux kernel, for example). It is assumed that this
+ * association is implemented by one of cl_page layers (the top layer in the
+ * current design) that
+ *
+ * - intercepts per-VM-page call-backs made by the environment (e.g.,
+ *   memory pressure),
+ *
+ * - translates state (page flag bits) and locking between lustre and
+ *   the environment.
+ *
+ * The association between cl_page and struct page is immutable and
+ * established when cl_page is created.
+ *
+ * cl_page can be "owned" by a particular cl_io (see below), guaranteeing
+ * this io exclusive access to this page w.r.t. other io attempts and
+ * various events changing page state (such as transfer completion, or
+ * eviction of the page from memory). Note that, in general, cl_io
+ * cannot be identified with a particular thread, and page ownership is not
+ * exactly equal to the current thread holding a lock on the page. A layer
+ * implementing the association between cl_page and struct page has to
+ * implement ownership on top of available synchronization mechanisms.
+ *
+ * While the lustre client maintains the notion of page ownership by io,
+ * the hosting MM/VM usually has its own page concurrency control
+ * mechanisms. For example, in Linux, page access is synchronized by the
+ * per-page PG_locked bit-lock, and generic kernel code (generic_file_*())
+ * takes care to acquire and release such locks as necessary around the
+ * calls to the file system methods (->readpage(), ->prepare_write(),
+ * ->commit_write(), etc.). This leads to the situation when there are two
+ * different ways to own a page in the client:
+ *
+ * - client code explicitly and voluntarily owns the page (cl_page_own());
+ *
+ * - VM locks a page and then calls the client, which has "to assume"
+ *   the ownership from the VM (cl_page_assume()).
+ *
+ * Dual methods to release ownership are cl_page_disown() and
+ * cl_page_unassume().
+ *
+ * cl_page is reference counted (cl_page::cp_ref). When the reference counter
+ * drops to 0, the page is returned to the cache, unless it is in
+ * cl_page_state::CPS_FREEING state, in which case it is immediately
+ * destroyed.
+ *
+ * The general logic guaranteeing the absence of "existential races" for
+ * pages is the following:
+ *
+ * - there are fixed known ways for a thread to obtain a new reference
+ *   to a page:
+ *
+ *     - by doing a lookup in the cl_object radix tree, protected by the
+ *       spin-lock;
+ *
+ *     - by starting from VM-locked struct page and following some
+ *       hosting environment method (e.g., following ->private pointer in
+ *       the case of Linux kernel), see cl_vmpage_page();
+ *
+ * - when the page enters cl_page_state::CPS_FREEING state, all these
+ *   ways are severed with the proper synchronization
+ *   (cl_page_delete());
+ *
+ * - entry into cl_page_state::CPS_FREEING is serialized by the VM page
+ *   lock;
+ *
+ * - no new references to the page in cl_page_state::CPS_FREEING state
+ *   are allowed (checked in cl_page_get()).
+ *
+ * Together this guarantees that when the last reference to a
+ * cl_page_state::CPS_FREEING page is released, it is safe to destroy the
+ * page, as no new references to it can be acquired at that point, and
+ * none remain.
+ *
+ * cl_page is a state machine. States are enumerated in enum
+ * cl_page_state. Possible state transitions are enumerated in
+ * cl_page_state_set(). The state transition process (i.e., actual changing
+ * of cl_page::cp_state field) is protected by the lock on the underlying VM
+ * page.
+ *
+ * Linux Kernel implementation.
+ *
+ * Binding between cl_page and struct page is implemented in the vvp layer.
+ * cl_page is attached to the ->private pointer of the struct page, together
+ * with the setting of the PG_private bit in page->flags, and acquiring an
+ * additional reference on the struct page (much like struct buffer_head, or
+ * any similar file system private data structures).
+ *
+ * The PG_locked lock is used to implement both ownership and transfer
+ * synchronization, that is, the page is VM-locked in CPS_{OWNED,PAGE{IN,OUT}}
+ * states. No additional references are acquired for the duration of the
+ * transfer.
+ *
+ * \warning *THIS IS NOT* the behavior expected by the Linux kernel, where
+ *          write-out is "protected" by the special PG_writeback bit.
+ */
+
+/**
+ * States of cl_page. cl_page.c assumes particular order here.
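+ *
+ * A typical life-cycle, for illustration only (a sketch assembled from the
+ * per-state comments below, not an exhaustive transition list):
+ *
+ *	CPS_CACHED -> CPS_OWNED -> CPS_PAGEOUT -> CPS_CACHED -> CPS_FREEING
+ *
+ * i.e., a cached page is owned by an io, written out as a part of a
+ * transfer, returned to the cache on completion, and eventually deleted.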
+ *
+ * The page state machine is rather crude, as it doesn't recognize finer page
+ * states like "dirty" or "up to date". This is because such states are not
+ * always well defined for the whole stack (see, for example, the
+ * implementation of the read-ahead, which hides page up-to-dateness to track
+ * cache hits accurately). Such sub-states are maintained by the layers that
+ * are interested in them.
+ */
+enum cl_page_state {
+	/**
+	 * Page is in the cache, un-owned. Page leaves cached state in the
+	 * following cases:
+	 *
+	 *     - [cl_page_state::CPS_OWNED] io comes across the page and
+	 *     owns it;
+	 *
+	 *     - [cl_page_state::CPS_PAGEOUT] page is dirty, the
+	 *     req-formation engine decides that it wants to include this page
+	 *     into an RPC being constructed, and yanks it from the cache;
+	 *
+	 *     - [cl_page_state::CPS_FREEING] VM callback is executed to
+	 *     evict the page from memory;
+	 *
+	 * \invariant cl_page::cp_owner == NULL && cl_page::cp_req == NULL
+	 */
+	CPS_CACHED = 1,
+	/**
+	 * Page is exclusively owned by some cl_io. Page may end up in this
+	 * state as a result of
+	 *
+	 *     - io creating new page and immediately owning it;
+	 *
+	 *     - [cl_page_state::CPS_CACHED] io finding existing cached page
+	 *     and owning it;
+	 *
+	 *     - [cl_page_state::CPS_OWNED] io finding existing owned page
+	 *     and waiting for owner to release the page;
+	 *
+	 * Page leaves owned state in the following cases:
+	 *
+	 *     - [cl_page_state::CPS_CACHED] io decides to leave the page in
+	 *     the cache, doing nothing;
+	 *
+	 *     - [cl_page_state::CPS_PAGEIN] io starts read transfer for
+	 *     this page;
+	 *
+	 *     - [cl_page_state::CPS_PAGEOUT] io starts immediate write
+	 *     transfer for this page;
+	 *
+	 *     - [cl_page_state::CPS_FREEING] io decides to destroy this
+	 *     page (e.g., as part of truncate or extent lock cancellation).
+	 *
+	 * \invariant cl_page::cp_owner != NULL && cl_page::cp_req == NULL
+	 */
+	CPS_OWNED,
+	/**
+	 * Page is being written out, as a part of a transfer. This state is
+	 * entered when req-formation logic decided that it wants this page to
+	 * be sent through the wire _now_. Specifically, it means that once
+	 * this state is achieved, transfer completion handler (with either
+	 * success or failure indication) is guaranteed to be executed against
+	 * this page independently of any locks and any scheduling decisions
+	 * made by the hosting environment (that effectively means that the
+	 * page is never put into cl_page_state::CPS_PAGEOUT state "in
+	 * advance". This property is mentioned because it is important when
+	 * reasoning about possible dead-locks in the system). The page can
+	 * enter this state as a result of
+	 *
+	 *     - [cl_page_state::CPS_OWNED] an io requesting an immediate
+	 *     write-out of this page, or
+	 *
+	 *     - [cl_page_state::CPS_CACHED] req-forming engine deciding
+	 *     that it has enough dirty pages cached to issue a "good"
+	 *     transfer.
+	 *
+	 * The page leaves cl_page_state::CPS_PAGEOUT state when the transfer
+	 * is completed---it is moved into cl_page_state::CPS_CACHED state.
+	 *
+	 * Underlying VM page is locked for the duration of transfer.
+	 *
+	 * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL
+	 */
+	CPS_PAGEOUT,
+	/**
+	 * Page is being read in, as a part of a transfer. This is quite
+	 * similar to the cl_page_state::CPS_PAGEOUT state, except that
+	 * read-in is always "immediate"---there is no such thing as a sudden
+	 * construction of a read request from cached, presumably not up to
+	 * date, pages.
+	 *
+	 * Underlying VM page is locked for the duration of transfer.
+	 *
+	 * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL
+	 */
+	CPS_PAGEIN,
+	/**
+	 * Page is being destroyed. This state is entered when the client
+	 * decides that the page has to be deleted from its host object, as,
+	 * e.g., a part of truncate.
+	 *
+	 * Once this state is reached, there is no way to escape it.
+	 *
+	 * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req == NULL
+	 */
+	CPS_FREEING,
+	CPS_NR
+};
+
+enum cl_page_type {
+	/** Host page, the page is from the host inode which the cl_page
+	 * belongs to. */
+	CPT_CACHEABLE = 1,
+
+	/** Transient page, the transient cl_page is used to bind a cl_page
+	 * to a vmpage which does not belong to the same object as the
+	 * cl_page. It is used in DirectIO and lockless IO. */
+	CPT_TRANSIENT,
+	CPT_NR
+};
+
+#define	CP_STATE_BITS	4
+#define	CP_TYPE_BITS	2
+#define	CP_MAX_LAYER	3
+
+/**
+ * Fields are protected by the lock on struct page, except for atomics and
+ * immutables.
+ *
+ * \invariant Data type invariants are in cl_page_invariant(). Basically:
+ * cl_page::cp_parent and cl_page::cp_child are a well-formed double-linked
+ * list, consistent with the parent/child pointers in the cl_page::cp_obj and
+ * cl_page::cp_owner (when set).
+ */
+struct cl_page {
+	/** Reference counter. */
+	atomic_t		 cp_ref;
+	/** layout_entry + stripe index, composed using lov_comp_index() */
+	unsigned int		 cp_lov_index;
+	/** page->index of the page within the whole file */
+	pgoff_t			 cp_page_index;
+	/** An object this page is a part of. Immutable after creation. */
+	struct cl_object	*cp_obj;
+	/** vmpage */
+	struct page		*cp_vmpage;
+	/**
+	 * Assigned if doing direct IO, because in this case cp_vmpage is not
+	 * a valid page cache page, hence the inode cannot be inferred from
+	 * cp_vmpage->mapping->host.
+	 */
+	struct inode		*cp_inode;
+	/** Linkage of pages within group. Pages must be owned */
+	struct list_head	 cp_batch;
+	/** array of slice offsets. Immutable after creation. */
+	unsigned char		 cp_layer_offset[CP_MAX_LAYER]; /* 24 bits */
+	/** current slice index */
+	unsigned char		 cp_layer_count:2; /* 26 bits */
+	/**
+	 * Page state. This field is const to avoid accidental update, it is
+	 * modified only internally within cl_page.c. Protected by a VM lock.
+	 */
+	enum cl_page_state	 cp_state:CP_STATE_BITS; /* 30 bits */
+	/**
+	 * Page type. Only CPT_TRANSIENT is used so far. Immutable after
+	 * creation.
+	 */
+	enum cl_page_type	 cp_type:CP_TYPE_BITS; /* 32 bits */
+	/* which slab kmem index this memory was allocated from */
+	short int		 cp_kmem_index; /* 48 bits */
+	unsigned int		 cp_unused1:16; /* 64 bits */
+
+	/**
+	 * Owning IO in cl_page_state::CPS_OWNED state. Sub-page can be owned
+	 * by a sub-io. Protected by a VM lock.
+	 */
+	struct cl_io		*cp_owner;
+	/** List of references to this page, for debugging. */
+	struct lu_ref		 cp_reference;
+	/** Link to an object, for debugging. */
+	struct lu_ref_link	 cp_obj_ref;
+	/** Link to a queue, for debugging. */
+	struct lu_ref_link	 cp_queue_ref;
+	/** Assigned if doing a sync_io */
+	struct cl_sync_io	*cp_sync_io;
+};
+
+/**
+ * Per-layer part of cl_page.
+ *
+ * \see vvp_page, lov_page, osc_page
+ */
+struct cl_page_slice {
+	struct cl_page			*cpl_page;
+	/**
+	 * Object slice corresponding to this page slice. Immutable after
+	 * creation.
+	 */
+	struct cl_object		*cpl_obj;
+	const struct cl_page_operations *cpl_ops;
+};
+
+/**
+ * Lock mode. For the client extent locks.
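+ *
+ * For illustration only (a rule of thumb, not an exhaustive mapping):
+ * read(2) is typically covered by a CLM_READ extent lock, write(2) and
+ * truncate by a CLM_WRITE lock, and grouplock users by a CLM_GROUP lock
+ * matched through cl_lock_descr::cld_gid.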
+ *
+ * \ingroup cl_lock
+ */
+enum cl_lock_mode {
+	CLM_READ,
+	CLM_WRITE,
+	CLM_GROUP,
+	CLM_MAX,
+};
+
+/**
+ * Requested transfer type.
+ */
+enum cl_req_type {
+	CRT_READ,
+	CRT_WRITE,
+	CRT_NR
+};
+
+/**
+ * Per-layer page operations.
+ *
+ * Methods taking an \a io argument are for the activity happening in the
+ * context of a given \a io. Page is assumed to be owned by that io, except
+ * for the obvious cases (like cl_page_operations::cpo_own()).
+ *
+ * \see vvp_page_ops, lov_page_ops, osc_page_ops
+ */
+struct cl_page_operations {
+	/**
+	 * cl_page<->struct page methods. Only one layer in the stack has to
+	 * implement these. Current code assumes that this functionality is
+	 * provided by the topmost layer, see cl_page_disown0() as an example.
+	 */
+
+	/**
+	 * Called when \a io acquires this page into the exclusive
+	 * ownership. When this method returns, it is guaranteed that the page
+	 * is not owned by another io, and no transfer is going on against
+	 * it. Optional.
+	 *
+	 * \see cl_page_own()
+	 * \see vvp_page_own(), lov_page_own()
+	 */
+	int  (*cpo_own)(const struct lu_env *env,
+			const struct cl_page_slice *slice,
+			struct cl_io *io, int nonblock);
+	/** Called when ownership is yielded. Optional.
+	 *
+	 * \see cl_page_disown()
+	 * \see vvp_page_disown()
+	 */
+	void (*cpo_disown)(const struct lu_env *env,
+			   const struct cl_page_slice *slice, struct cl_io *io);
+	/**
+	 * Called for a page that is already "owned" by \a io from VM point of
+	 * view. Optional.
+	 *
+	 * \see cl_page_assume()
+	 * \see vvp_page_assume(), lov_page_assume()
+	 */
+	void (*cpo_assume)(const struct lu_env *env,
+			   const struct cl_page_slice *slice, struct cl_io *io);
+	/** Dual to cl_page_operations::cpo_assume(). Optional. Called
+	 * bottom-to-top when IO releases a page without actually unlocking
+	 * it.
+	 *
+	 * \see cl_page_unassume()
+	 * \see vvp_page_unassume()
+	 */
+	void (*cpo_unassume)(const struct lu_env *env,
+			     const struct cl_page_slice *slice,
+			     struct cl_io *io);
+	/**
+	 * Announces whether the page contains valid data or not by
+	 * \a uptodate.
+	 *
+	 * \see cl_page_export()
+	 * \see vvp_page_export()
+	 */
+	void  (*cpo_export)(const struct lu_env *env,
+			    const struct cl_page_slice *slice, int uptodate);
+	/**
+	 * Checks whether underlying VM page is locked (in the suitable
+	 * sense). Used for assertions.
+	 *
+	 * \retval -EBUSY:   page is protected by a lock of a given mode;
+	 * \retval -ENODATA: page is not protected by a lock;
+	 * \retval 0:	     this layer cannot decide. (Should never happen.)
+	 */
+	int (*cpo_is_vmlocked)(const struct lu_env *env,
+			       const struct cl_page_slice *slice);
+
+	/**
+	 * Update file attributes when all we have is this page. Used for tiny
+	 * writes to update attributes when we don't have a full cl_io.
+	 */
+	void (*cpo_page_touch)(const struct lu_env *env,
+			       const struct cl_page_slice *slice, size_t to);
+	/**
+	 * Page destruction.
+	 */
+
+	/**
+	 * Called when the page is truncated from the object. Optional.
+	 *
+	 * \see cl_page_discard()
+	 * \see vvp_page_discard(), osc_page_discard()
+	 */
+	void (*cpo_discard)(const struct lu_env *env,
+			    const struct cl_page_slice *slice,
+			    struct cl_io *io);
+	/**
+	 * Called when the page is removed from the cache, and is about to be
+	 * destroyed. Optional.
+	 *
+	 * \see cl_page_delete()
+	 * \see vvp_page_delete(), osc_page_delete()
+	 */
+	void (*cpo_delete)(const struct lu_env *env,
+			   const struct cl_page_slice *slice);
+	/** Destructor. Frees resources and the slice itself.
+	 */
+	void (*cpo_fini)(const struct lu_env *env,
+			 struct cl_page_slice *slice,
+			 struct pagevec *pvec);
+	/**
+	 * Optional debugging helper. Prints given page slice.
+	 *
+	 * \see cl_page_print()
+	 */
+	int (*cpo_print)(const struct lu_env *env,
+			 const struct cl_page_slice *slice,
+			 void *cookie, lu_printer_t p);
+	/**
+	 * \name transfer
+	 *
+	 * Transfer methods.
+	 *
+	 * @{
+	 */
+	/**
+	 * Request type dependent vector of operations.
+	 *
+	 * Transfer operations depend on transfer mode (cl_req_type). To avoid
+	 * passing transfer mode to each and every of these methods, and to
+	 * avoid branching on request type inside of the methods, separate
+	 * methods for cl_req_type:CRT_READ and cl_req_type:CRT_WRITE are
+	 * provided. That is, method invocation usually looks like
+	 *
+	 *	   slice->cp_ops.io[req->crq_type].cpo_method(env, slice, ...);
+	 */
+	struct {
+		/**
+		 * Called when a page is submitted for a transfer as a part of
+		 * cl_page_list.
+		 *
+		 * \return    0	        : page is eligible for submission;
+		 * \return    -EALREADY : skip this page;
+		 * \return    -ve	: error.
+		 *
+		 * \see cl_page_prep()
+		 */
+		int  (*cpo_prep)(const struct lu_env *env,
+				 const struct cl_page_slice *slice,
+				 struct cl_io *io);
+		/**
+		 * Completion handler. This is guaranteed to be eventually
+		 * fired after cl_page_operations::cpo_prep() or
+		 * cl_page_operations::cpo_make_ready() call.
+		 *
+		 * This method can be called in a non-blocking context. It is
+		 * guaranteed however, that the page involved and its object
+		 * are pinned in memory (and, hence, calling cl_page_put() is
+		 * safe).
+		 *
+		 * \see cl_page_completion()
+		 */
+		void (*cpo_completion)(const struct lu_env *env,
+				       const struct cl_page_slice *slice,
+				       int ioret);
+		/**
+		 * Called when a cached page is about to be added to the
+		 * ptlrpc request as a part of req formation.
+		 *
+		 * \return    0	      : proceed with this page;
+		 * \return    -EAGAIN : skip this page;
+		 * \return    -ve     : error.
+		 *
+		 * \see cl_page_make_ready()
+		 */
+		int  (*cpo_make_ready)(const struct lu_env *env,
+				       const struct cl_page_slice *slice);
+	} io[CRT_NR];
+	/**
+	 * Tell transfer engine that only [from, to] part of a page should be
+	 * transmitted.
+	 *
+	 * This is used for immediate transfers.
+	 *
+	 * \todo XXX this is not a very good interface. It would be much
+	 * better if all transfer parameters were supplied as arguments to
+	 * cl_io_operations::cio_submit() call, but it is not clear how to do
+	 * this for page queues.
+	 *
+	 * \see cl_page_clip()
+	 */
+	void (*cpo_clip)(const struct lu_env *env,
+			 const struct cl_page_slice *slice,
+			 int from, int to);
+	/**
+	 * Write out a page by the kernel. This is only called by ll_writepage
+	 * right now.
+	 *
+	 * \see cl_page_flush()
+	 */
+	int (*cpo_flush)(const struct lu_env *env,
+			 const struct cl_page_slice *slice,
+			 struct cl_io *io);
+	/** @} transfer */
+};
+
+/**
+ * Helper macro, dumping detailed information about \a page into a log.
+ */
+#define CL_PAGE_DEBUG(mask, env, page, format, ...)		      \
+do {								      \
+	if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {		      \
+		LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);      \
+		cl_page_print(env, &msgdata, lu_cdebug_printer, page); \
+		CDEBUG(mask, format , ## __VA_ARGS__);		      \
+	}							      \
+} while (0)
+
+/**
+ * Helper macro, dumping shorter information about \a page into a log.
+ */
+#define CL_PAGE_HEADER(mask, env, page, format, ...)		      \
+do {								      \
+	if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {		      \
+		LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);      \
+		cl_page_header_print(env, &msgdata, lu_cdebug_printer, page); \
+		CDEBUG(mask, format , ## __VA_ARGS__);		      \
+	}							      \
+} while (0)
+
+static inline struct page *cl_page_vmpage(const struct cl_page *page)
+{
+	LASSERT(page->cp_vmpage != NULL);
+	return page->cp_vmpage;
+}
+
+/**
+ * Check if a cl_page is in use.
+ *
+ * Client cache holds a refcount, this refcount will be dropped when
+ * the page is taken out of cache, see vvp_page_delete().
+ */
+static inline bool __page_in_use(const struct cl_page *page, int refc)
+{
+	return (atomic_read(&page->cp_ref) > refc + 1);
+}
+
+/**
+ * The caller itself holds a refcount of cl_page.
+ */
+#define cl_page_in_use(pg)	 __page_in_use(pg, 1)
+/**
+ * The caller doesn't hold a refcount.
+ */
+#define cl_page_in_use_noref(pg) __page_in_use(pg, 0)
+
+/** @} cl_page */
+
+/** \addtogroup cl_lock cl_lock
+ * @{ */
+/** \struct cl_lock
+ *
+ * Extent locking on the client.
+ *
+ * LAYERING
+ *
+ * The locking model of the new client code is built around
+ *
+ *	struct cl_lock
+ *
+ * data-type representing an extent lock on a regular file. cl_lock is a
+ * layered object (much like cl_object and cl_page), it consists of a header
+ * (struct cl_lock) and a list of layers (struct cl_lock_slice), linked to
+ * cl_lock::cll_layers list through cl_lock_slice::cls_linkage.
+ *
+ * Typical cl_lock consists of one layer:
+ *
+ *	- lov_lock (lov specific data).
+ *
+ * lov_lock contains an array of sub-locks. Each of these sub-locks is a
+ * normal cl_lock: it has a header (struct cl_lock) and a list of layers:
+ *
+ *	- osc_lock
+ *
+ * Each sub-lock is associated with a cl_object (representing a stripe
+ * sub-object or the file with which the top-level cl_lock is associated),
+ * and is linked into that cl_object::coh_locks. In this respect cl_lock is
+ * similar to cl_object (which at the lov layer also fans out into multiple
+ * sub-objects), and is different from cl_page, which doesn't fan out (there
+ * is usually exactly one osc_page for every vvp_page). We shall call the
+ * vvp-lov portion of the lock a "top-lock" and its lovsub-osc portion a
+ * "sub-lock".
+ *
+ * LIFE CYCLE
+ *
+ * cl_lock is a cacheless data container for the requirements of locks to
+ * complete the IO. cl_lock is created before I/O starts and destroyed when
+ * the I/O is complete.
+ *
+ * cl_lock depends on an LDLM lock to fulfill lock semantics. The LDLM lock
+ * is attached to the cl_lock at the OSC layer. The LDLM lock is still
+ * cacheable.
+ *
+ * INTERFACE AND USAGE
+ *
+ * Two major methods are supported for cl_lock: clo_enqueue and clo_cancel. A
+ * cl_lock is enqueued by cl_lock_request(), which will call clo_enqueue()
+ * methods for each layer to enqueue the lock. At the LOV layer, if a cl_lock
+ * consists of multiple sub cl_locks, each sub-lock will be enqueued
+ * correspondingly. At the OSC layer, the lock enqueue request will tend to
+ * reuse a cached LDLM lock; otherwise a new LDLM lock will have to be
+ * requested from the OST side.
+ *
+ * cl_lock_cancel() must be called to release a cl_lock after use. The
+ * clo_cancel() method will be called for each layer to release the resource
+ * held by this lock. At the OSC layer, the reference count of the LDLM lock,
+ * which is held at clo_enqueue time, is released.
+ *
+ * An LDLM lock can only be canceled if there is no cl_lock using it.
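+ *
+ * A caller-side sketch, for illustration only (error handling omitted; see
+ * cl_glimpse_lock() and other real callers for the exact pattern):
+ *
+ *	lock->cll_descr = ...;               // object, extent, mode, flags
+ *	rc = cl_lock_request(env, io, lock); // clo_enqueue() on each layer
+ *	... perform IO covered by the lock ...
+ *	cl_lock_release(env, lock);          // clo_cancel()/clo_fini()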
+ *
+ * Overall process of the locking during IO operation is as follows:
+ *
+ *     - once parameters for IO are set up in cl_io, cl_io_operations::cio_lock()
+ *     is called on each layer. The responsibility of this method is to add
+ *     the locks needed by a given layer into cl_io.ci_lockset.
+ *
+ *     - once locks for all layers have been collected, they are sorted to
+ *     avoid dead-locks (cl_io_locks_sort()), and enqueued.
+ *
+ *     - when all locks are acquired, IO is performed;
+ *
+ *     - locks are released after IO is complete.
+ *
+ * Striping introduces major additional complexity into locking. The
+ * fundamental problem is that it is generally unsafe to actively use (hold)
+ * two locks on the different OST servers at the same time, as this introduces
+ * inter-server dependency and can lead to cascading evictions.
+ *
+ * Basic solution is to sub-divide large read/write IOs into smaller pieces so
+ * that no multi-stripe locks are taken (note that this design abandons POSIX
+ * read/write semantics). Such pieces ideally can be executed concurrently. At
+ * the same time, certain types of IO cannot be sub-divided, without
+ * sacrificing correctness. This includes:
+ *
+ *  - O_APPEND write, where [0, EOF] lock has to be taken, to guarantee
+ *  atomicity;
+ *
+ *  - ftruncate(fd, offset), where [offset, EOF] lock has to be taken.
+ *
+ * Also, in the case of read(fd, buf, count) or write(fd, buf, count), where
+ * buf is a part of memory mapped Lustre file, a lock or locks protecting buf
+ * have to be held together with the usual lock on [offset, offset + count].
+ *
+ * Interaction with DLM
+ *
+ * In the expected setup, cl_lock is ultimately backed up by a collection of
+ * DLM locks (struct ldlm_lock). Association between cl_lock and DLM lock is
+ * implemented in the osc layer, which also maps DLM events (ASTs,
+ * cancellation, etc.) onto cl_lock_operations calls. See struct osc_lock for
+ * a more detailed description of interaction with DLM.
+ */
+
+/**
+ * Lock description.
+ */
+struct cl_lock_descr {
+	/** Object this lock is granted for. */
+	struct cl_object *cld_obj;
+	/** Index of the first page protected by this lock. */
+	pgoff_t		  cld_start;
+	/** Index of the last page (inclusive) protected by this lock. */
+	pgoff_t		  cld_end;
+	/** Group ID, for group lock */
+	__u64		  cld_gid;
+	/** Lock mode. */
+	enum cl_lock_mode cld_mode;
+	/**
+	 * flags to enqueue lock. A combination of bit-flags from
+	 * enum cl_enq_flags.
+	 */
+	__u32		  cld_enq_flags;
+};
+
+#define DDESCR "%s(%d):[%lu, %lu]:%x"
+#define PDESCR(descr)							\
+	cl_lock_mode_name((descr)->cld_mode), (descr)->cld_mode,	\
+	(descr)->cld_start, (descr)->cld_end, (descr)->cld_enq_flags
+
+const char *cl_lock_mode_name(const enum cl_lock_mode mode);
+
+/**
+ * Layered client lock.
+ */
+struct cl_lock {
+	/** List of slices. Immutable after creation. */
+	struct list_head      cll_layers;
+	/** lock attribute, extent, cl_object, etc. */
+	struct cl_lock_descr  cll_descr;
+};
+
+/**
+ * Per-layer part of cl_lock
+ *
+ * \see lov_lock, osc_lock
+ */
+struct cl_lock_slice {
+	struct cl_lock		*cls_lock;
+	/** Object slice corresponding to this lock slice. Immutable after
+	 * creation. */
+	struct cl_object	*cls_obj;
+	const struct cl_lock_operations *cls_ops;
+	/** Linkage into cl_lock::cll_layers. Immutable after creation. */
+	struct list_head	 cls_linkage;
+};
+
+/**
+ *
+ * \see lov_lock_ops, osc_lock_ops
+ */
+struct cl_lock_operations {
+	/** @{ */
+	/**
+	 * Attempts to enqueue the lock. Called top-to-bottom.
+	 *
+	 * \retval 0	this layer has enqueued the lock successfully
+	 * \retval >0	this layer has enqueued the lock, but needs to wait on
+	 *		@anchor for resources
+	 * \retval -ve	failure
+	 *
+	 * \see lov_lock_enqueue(), osc_lock_enqueue()
+	 */
+	int  (*clo_enqueue)(const struct lu_env *env,
+			    const struct cl_lock_slice *slice,
+			    struct cl_io *io, struct cl_sync_io *anchor);
+	/**
+	 * Cancel a lock: release its DLM lock ref, while not cancelling the
+	 * DLM lock itself.
+	 */
+	void (*clo_cancel)(const struct lu_env *env,
+			   const struct cl_lock_slice *slice);
+	/** @} */
+	/**
+	 * Destructor. Frees resources and the slice.
+	 *
+	 * \see lov_lock_fini(), osc_lock_fini()
+	 */
+	void (*clo_fini)(const struct lu_env *env, struct cl_lock_slice *slice);
+	/**
+	 * Optional debugging helper. Prints given lock slice.
+	 */
+	int (*clo_print)(const struct lu_env *env,
+			 void *cookie, lu_printer_t p,
+			 const struct cl_lock_slice *slice);
+};
+
+#define CL_LOCK_DEBUG(mask, env, lock, format, ...)			\
+do {									\
+	if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {			\
+		LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);	\
+		cl_lock_print(env, &msgdata, lu_cdebug_printer, lock);	\
+		CDEBUG(mask, format , ## __VA_ARGS__);			\
+	}								\
+} while (0)
+
+#define CL_LOCK_ASSERT(expr, env, lock) do {				\
+	if (likely(expr))						\
+		break;							\
+									\
+	CL_LOCK_DEBUG(D_ERROR, env, lock, "failed at %s.\n", #expr);	\
+	LBUG();								\
+} while (0)
+
+/** @} cl_lock */
+
+/** \addtogroup cl_page_list cl_page_list
+ * Page list used to perform collective operations on a group of pages.
+ *
+ * Pages are added to the list one by one. cl_page_list acquires a reference
+ * for every page in it. Page list is used to perform collective operations on
+ * pages:
+ *
+ *     - submit pages for an immediate transfer,
+ *
+ *     - own pages on behalf of a certain io (waiting for each page in turn),
+ *
+ *     - discard pages.
+ *
+ * When the list is finalized, it releases references on all pages it still
+ * has.
+ *
+ * \todo XXX concurrency control.
+ *
+ * @{
+ */
+struct cl_page_list {
+	unsigned		 pl_nr;
+	struct list_head	 pl_pages;
+};
+
+/**
+ * A 2-queue of pages. A convenience data-type for the common use case: a
+ * 2-queue contains an incoming page list and an outgoing page list.
+ */
+struct cl_2queue {
+	struct cl_page_list c2_qin;
+	struct cl_page_list c2_qout;
+};
+
+/** @} cl_page_list */
+
+/** \addtogroup cl_io cl_io
+ * @{ */
+/** \struct cl_io
+ * I/O
+ *
+ * cl_io represents a high level I/O activity like
+ * read(2)/write(2)/truncate(2) system call, or cancellation of an extent
+ * lock.
+ *
+ * cl_io is a layered object, much like cl_{object,page,lock} but with one
+ * important distinction. We want to minimize the number of calls to the
+ * allocator in the fast path, e.g., in the case of read(2) when everything is
+ * cached: the client already owns the lock over the region being read, and
+ * data are cached due to read-ahead. To avoid allocation of cl_io layers in
+ * such situations, per-layer io state is stored in the session, associated
+ * with the io, see struct {vvp,lov,osc}_io for example. Session allocation is
+ * amortized by using free-lists, see cl_env_get().
+ *
+ * There is a small predefined number of possible io types, enumerated in enum
+ * cl_io_type.
+ *
+ * cl_io is a state machine that can be advanced concurrently by multiple
+ * threads. It is up to these threads to control the concurrency and,
+ * specifically, to detect when io is done, and its state can be safely
+ * released.
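+ *
+ * A top-level sketch of driving an io, for illustration only (roughly what
+ * callers of cl_io_loop() do; simplified, error handling omitted):
+ *
+ *	rc = cl_io_rw_init(env, io, CIT_READ, pos, count);
+ *	if (rc == 0)
+ *		rc = cl_io_loop(env, io); // iterate: lock, start, end, unlock
+ *	cl_io_fini(env, io);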
+ *
+ * For read/write io the overall execution plan is as follows:
+ *
+ *     (0) initialize io state through all layers;
+ *
+ *     (1) loop: prepare chunk of work to do
+ *
+ *     (2) call all layers to collect locks they need to process current chunk
+ *
+ *     (3) sort all locks to avoid dead-locks, and acquire them
+ *
+ *     (4) process the chunk: call per-page methods
+ *         (cl_io_operations::cio_prepare_write() and
+ *         cl_io_operations::cio_commit_write() for write)
+ *
+ *     (5) release locks
+ *
+ *     (6) repeat loop.
+ *
+ * To implement the "parallel IO mode", the lov layer creates sub-io's (lazily
+ * to address allocation efficiency issues mentioned above), and returns with
+ * the special error condition from a per-page method when the current sub-io
+ * has to block. This causes the io loop to be repeated, and lov switches to
+ * the next sub-io in its cl_io_operations::cio_iter_init() implementation.
+ */
+
+/** IO types */
+enum cl_io_type {
+	/** read system call */
+	CIT_READ = 1,
+	/** write system call */
+	CIT_WRITE,
+	/** truncate, utime system calls */
+	CIT_SETATTR,
+	/** get data version */
+	CIT_DATA_VERSION,
+	/**
+	 * page fault handling
+	 */
+	CIT_FAULT,
+	/**
+	 * fsync system call handling
+	 * to write out a range of a file
+	 */
+	CIT_FSYNC,
+	/**
+	 * glimpse. An io context to acquire a glimpse lock.
+	 */
+	CIT_GLIMPSE,
+	/**
+	 * Miscellaneous io. This is used for occasional io activity that
+	 * doesn't fit into other types. Currently this is used for:
+	 *
+	 *     - cancellation of an extent lock. This io exists as a context
+	 *     to write dirty pages from under the lock being canceled back
+	 *     to the server;
+	 *
+	 *     - VM induced page write-out. An io context for writing a page
+	 *     out for memory cleansing;
+	 *
+	 *     - grouplock. An io context to acquire a group lock.
+	 *
+	 * CIT_MISC io is used simply as a context in which locks and pages
+	 * are manipulated. Such io has no internal "process", that is,
+	 * cl_io_loop() is never called for it.
+	 */
+	CIT_MISC,
+	/**
+	 * ladvise handling
+	 * to give advice about access to a file
+	 */
+	CIT_LADVISE,
+	/**
+	 * SEEK_HOLE/SEEK_DATA handling to search holes or data
+	 * across all file objects
+	 */
+	CIT_LSEEK,
+	CIT_OP_NR
+};
+
+/**
+ * States of cl_io state machine
+ */
+enum cl_io_state {
+	/** Not initialized. */
+	CIS_ZERO,
+	/** Initialized. */
+	CIS_INIT,
+	/** IO iteration started. */
+	CIS_IT_STARTED,
+	/** Locks taken. */
+	CIS_LOCKED,
+	/** Actual IO is in progress. */
+	CIS_IO_GOING,
+	/** IO for the current iteration finished. */
+	CIS_IO_FINISHED,
+	/** Locks released. */
+	CIS_UNLOCKED,
+	/** Iteration completed. */
+	CIS_IT_ENDED,
+	/** cl_io finalized. */
+	CIS_FINI
+};
+
+/**
+ * IO state private for a layer.
+ *
+ * This is usually embedded into layer session data, rather than allocated
+ * dynamically.
+ *
+ * \see vvp_io, lov_io, osc_io
+ */
+struct cl_io_slice {
+	struct cl_io			*cis_io;
+	/** corresponding object slice. Immutable after creation. */
+	struct cl_object		*cis_obj;
+	/** io operations. Immutable after creation. */
+	const struct cl_io_operations	*cis_iop;
+	/**
+	 * linkage into a list of all slices for a given cl_io, hanging off
+	 * cl_io::ci_layers. Immutable after creation.
+	 */
+	struct list_head		 cis_linkage;
+};
+
+typedef void (*cl_commit_cbt)(const struct lu_env *, struct cl_io *,
+			      struct pagevec *);
+
+struct cl_read_ahead {
+	/* Maximum page index at which the readahead window will end.
+	 * This is determined by DLM lock coverage, RPC and stripe boundaries.
+	 * cra_end_idx is included. */
+	pgoff_t		cra_end_idx;
+	/* optimal RPC size for this read, by pages */
+	unsigned long	cra_rpc_pages;
+	/* Release callback. If readahead holds resources underneath, this
+	 * function should be called to release it. */
+	void (*cra_release)(const struct lu_env *env,
+			    struct cl_read_ahead *ra);
+
+	/* Callback data for cra_release routine */
+	void		*cra_dlmlock;
+	void		*cra_oio;
+
+	/* whether lock is in contention */
+	bool		cra_contention;
+};
+
+static inline void cl_read_ahead_release(const struct lu_env *env,
+					 struct cl_read_ahead *ra)
+{
+	if (ra->cra_release != NULL)
+		ra->cra_release(env, ra);
+	memset(ra, 0, sizeof(*ra));
+}
+
+
+/**
+ * Per-layer io operations.
+ * \see vvp_io_ops, lov_io_ops, lovsub_io_ops, osc_io_ops
+ */
+struct cl_io_operations {
+	/**
+	 * Vector of io state transition methods for every io type.
+	 *
+	 * \see cl_page_operations::io
+	 */
+	struct {
+		/**
+		 * Prepare io iteration at a given layer.
+		 *
+		 * Called top-to-bottom at the beginning of each iteration of
+		 * "io loop" (if it makes sense for this type of io). Here
+		 * layer selects what work it will do during this iteration.
+		 *
+		 * \see cl_io_operations::cio_iter_fini()
+		 */
+		int (*cio_iter_init)(const struct lu_env *env,
+				     const struct cl_io_slice *slice);
+		/**
+		 * Finalize io iteration.
+		 *
+		 * Called bottom-to-top at the end of each iteration of "io
+		 * loop". Here layers can decide whether IO has to be
+		 * continued.
+		 *
+		 * \see cl_io_operations::cio_iter_init()
+		 */
+		void (*cio_iter_fini)(const struct lu_env *env,
+				      const struct cl_io_slice *slice);
+		/**
+		 * Collect locks for the current iteration of io.
+		 *
+		 * Called top-to-bottom to collect all locks necessary for
+		 * this iteration. This method shouldn't actually enqueue
+		 * anything, instead it should post a lock through
+		 * cl_io_lock_add(). Once all locks are collected, they are
+		 * sorted and enqueued in the proper order.
+		 */
+		int  (*cio_lock)(const struct lu_env *env,
+				 const struct cl_io_slice *slice);
+		/**
+		 * Finalize unlocking.
+		 *
+		 * Called bottom-to-top to finish layer specific unlocking
+		 * functionality, after generic code released all locks
+		 * acquired by cl_io_operations::cio_lock().
+		 */
+		void  (*cio_unlock)(const struct lu_env *env,
+				    const struct cl_io_slice *slice);
+		/**
+		 * Start io iteration.
+		 *
+		 * Once all locks are acquired, called top-to-bottom to
+		 * commence actual IO. In the current implementation,
+		 * top-level vvp_io_{read,write}_start() does all the work
+		 * synchronously by calling generic_file_*(), so other layers
+		 * are called when everything is done.
+		 */
+		int  (*cio_start)(const struct lu_env *env,
+				  const struct cl_io_slice *slice);
+		/**
+		 * Called top-to-bottom at the end of io loop. Here layer
+		 * might wait for an unfinished asynchronous io.
+		 */
+		void (*cio_end)(const struct lu_env *env,
+				const struct cl_io_slice *slice);
+		/**
+		 * Called bottom-to-top to notify layers that read/write IO
+		 * iteration finished, with \a nob bytes transferred.
+		 */
+		void (*cio_advance)(const struct lu_env *env,
+				    const struct cl_io_slice *slice,
+				    size_t nob);
+		/**
+		 * Called once per io, bottom-to-top to release io resources.
+		 */
+		void (*cio_fini)(const struct lu_env *env,
+				 const struct cl_io_slice *slice);
+	} op[CIT_OP_NR];
+
+	/**
+	 * Submit pages from \a queue->c2_qin for IO, and move
+	 * successfully submitted pages into \a queue->c2_qout. Return
+	 * non-zero if failed to submit even a single page. If
+	 * submission failed after some pages were moved into \a
+	 * queue->c2_qout, the completion callback with a non-zero ioret is
+	 * executed on them.
+	 */
+	int  (*cio_submit)(const struct lu_env *env,
+			   const struct cl_io_slice *slice,
+			   enum cl_req_type crt,
+			   struct cl_2queue *queue);
+	/**
+	 * Queue async page for write.
+	 * The difference between cio_submit and cio_commit_async is that
+	 * cio_submit is for an urgent request.
+	 */
+	int  (*cio_commit_async)(const struct lu_env *env,
+				 const struct cl_io_slice *slice,
+				 struct cl_page_list *queue, int from, int to,
+				 cl_commit_cbt cb);
+	/**
+	 * Release active extent.
+	 */
+	void (*cio_extent_release)(const struct lu_env *env,
+				   const struct cl_io_slice *slice);
+	/**
+	 * Decide maximum read ahead extent
+	 *
+	 * \pre io->ci_type == CIT_READ
+	 */
+	int (*cio_read_ahead)(const struct lu_env *env,
+			      const struct cl_io_slice *slice,
+			      pgoff_t start, struct cl_read_ahead *ra);
+	/**
+	 * Reserve LRU slots before IO.
+	 */
+	int (*cio_lru_reserve)(const struct lu_env *env,
+			       const struct cl_io_slice *slice,
+			       loff_t pos, size_t bytes);
+	/**
+	 * Optional debugging helper. Print given io slice.
+	 */
+	int (*cio_print)(const struct lu_env *env, void *cookie,
+			 lu_printer_t p, const struct cl_io_slice *slice);
+};
+
+/**
+ * Flags to lock enqueue procedure.
+ * \ingroup cl_lock
+ */
+enum cl_enq_flags {
+	/**
+	 * instruct the server not to block if a conflicting lock is found;
+	 * instead, -EAGAIN is returned immediately.
+	 */
+	CEF_NONBLOCK		= 0x00000001,
+	/**
+	 * Tell lower layers this is a glimpse request, translated to
+	 * LDLM_FL_HAS_INTENT at the LDLM layer.
+	 *
+	 * Also, because glimpse locks never block other locks, we count this
+	 * as automatically compatible with other osc locks.
+	 * (see osc_lock_compatible)
+	 */
+	CEF_GLIMPSE		= 0x00000002,
+	/**
+	 * tell the server to instruct (through a flag in the blocking ast) an
+	 * owner of the conflicting lock, that it can drop dirty pages
+	 * protected by this lock, without sending them to the server.
+	 */
+	CEF_DISCARD_DATA	= 0x00000004,
+	/**
+	 * tell the sub layers that it must be a `real' lock. This is used for
+	 * mmapped-buffer locks, glimpse locks, manually requested locks
+	 * (LU_LADVISE_LOCKAHEAD) that must never be converted into lockless
+	 * mode.
+	 *
+	 * \see vvp_mmap_locks(), cl_glimpse_lock, cl_request_lock().
+	 */
+	CEF_MUST		= 0x00000008,
+	/**
+	 * tell the sub layers to never request a `real' lock. This flag is
+	 * not used currently.
+	 *
+	 * cl_io::ci_lockreq and CEF_{MUST,NEVER} flags specify lockless
+	 * conversion policy: ci_lockreq describes generic information of lock
+	 * requirement for this IO, especially for locks which belong to the
+	 * object doing IO; however, the lock itself may have precise
+	 * requirements that are described by the enqueue flags.
+	 */
+	CEF_NEVER		= 0x00000010,
+	/**
+	 * tell the dlm layer this is a speculative lock request.
+	 * Speculative lock requests are locks which are not requested as part
+	 * of an I/O operation. Instead, they are requested because we expect
+	 * to use them in the future. They are requested asynchronously at the
+	 * ptlrpc layer.
+	 *
+	 * Currently used for asynchronous glimpse locks and manually requested
+	 * locks (LU_LADVISE_LOCKAHEAD).
+	 */
+	CEF_SPECULATIVE		= 0x00000020,
+	/**
+	 * enqueue a lock to test DLM lock existence.
+	 */
+	CEF_PEEK		= 0x00000040,
+	/**
+	 * Lock match only. Used by group lock in I/O as the group lock
+	 * is known to exist.
+	 */
+	CEF_LOCK_MATCH		= 0x00000080,
+	/**
+	 * tell the DLM layer to lock only the requested range
+	 */
+	CEF_LOCK_NO_EXPAND	= 0x00000100,
+	/**
+	 * mask of enq_flags.
+	 */
+	CEF_MASK		= 0x000001ff,
+};
+
+/**
+ * Link between lock and io. Intermediate structure is needed, because the
+ * same lock can be part of multiple io's simultaneously.
+ */
+struct cl_io_lock_link {
+	/** linkage into one of cl_lockset lists. */
+	struct list_head	cill_linkage;
+	struct cl_lock		cill_lock;
+	/** optional destructor */
+	void (*cill_fini)(const struct lu_env *env,
+			  struct cl_io_lock_link *link);
+};
+#define cill_descr	cill_lock.cll_descr
+
+/**
+ * Lock-set represents a collection of locks that an io needs at a
+ * time. Generally speaking, client tries to avoid holding multiple locks when
+ * possible, because
+ *
+ *      - holding extent locks over multiple ost's introduces the danger of
+ *        "cascading timeouts";
+ *
+ *      - holding multiple locks over the same ost is still dead-lock prone,
+ *        see comment in osc_lock_enqueue(),
+ *
+ * but there are certain situations where this is unavoidable:
+ *
+ *      - O_APPEND writes have to take [0, EOF] lock for correctness;
+ *
+ *      - truncate has to take [new-size, EOF] lock for correctness;
+ *
+ *      - SNS has to take locks across full stripe for correctness;
+ *
+ *      - in the case when user level buffer, supplied to {read,write}(file0),
+ *        is a part of a memory mapped lustre file, client has to take dlm
+ *        locks on file0, and all files that back up the buffer (or the part
+ *        of the buffer that is being processed in the current chunk); in any
+ *        case, there are situations where at least 2 locks are necessary.
+ *
+ * In such cases we at least try to take locks in the same consistent
+ * order. To this end, all locks are first collected, then sorted, and then
+ * enqueued.
+ */
+struct cl_lockset {
+	/** locks to be acquired. */
+	struct list_head  cls_todo;
+	/** locks acquired. */
+	struct list_head  cls_done;
+};
+
+/**
+ * Lock requirements (demand) for IO. It should be cl_io_lock_req,
+ * but 'req' is always to be thought of as 'request' :-)
+ */
+enum cl_io_lock_dmd {
+	/** Always lock data (e.g., O_APPEND). */
+	CILR_MANDATORY = 0,
+	/** Layers are free to decide between local and global locking. */
+	CILR_MAYBE,
+	/** Never lock: there is no cache (e.g., liblustre). */
+	CILR_NEVER
+};
+
+enum cl_fsync_mode {
+	/** start writeback, do not wait for them to finish */
+	CL_FSYNC_NONE  = 0,
+	/** start writeback and wait for them to finish */
+	CL_FSYNC_LOCAL = 1,
+	/** discard all dirty pages in a specific file range */
+	CL_FSYNC_DISCARD = 2,
+	/** start writeback and make sure they have reached storage before
+	 * returning. OST_SYNC RPC must be issued and finished */
+	CL_FSYNC_ALL   = 3
+};
+
+struct cl_io_rw_common {
+	loff_t	crw_pos;
+	size_t	crw_count;
+	int	crw_nonblock;
+};
+enum cl_setattr_subtype {
+	/** regular setattr **/
+	CL_SETATTR_REG = 1,
+	/** truncate(2) **/
+	CL_SETATTR_TRUNC,
+	/** fallocate(2) - mode preallocate **/
+	CL_SETATTR_FALLOCATE
+};
+
+struct cl_io_range {
+	loff_t cir_pos;
+	size_t cir_count;
+};
+
+struct cl_io_pt {
+	struct cl_io_pt *cip_next;
+	struct kiocb cip_iocb;
+	struct iov_iter cip_iter;
+	struct file *cip_file;
+	enum cl_io_type cip_iot;
+	unsigned int cip_need_restart:1;
+	loff_t cip_pos;
+	size_t cip_count;
+	ssize_t cip_result;
+};
+
+/**
+ * State for io.
+ *
+ * cl_io is shared by all threads participating in this IO (in the current
+ * implementation only one thread advances IO, but parallel IO design and
+ * concurrent copy_*_user() require multiple threads acting on the same IO).
+ * It is up to these threads to serialize their activities, including updates
+ * to mutable cl_io fields.
+ */
+struct cl_io {
+	/** type of this IO. Immutable after creation. */
+	enum cl_io_type			ci_type;
+	/** current state of cl_io state machine. */
+	enum cl_io_state		ci_state;
+	/** main object this io is against. Immutable after creation. */
+	struct cl_object		*ci_obj;
+	/** top level dio_aio */
+	struct cl_dio_aio		*ci_dio_aio;
+	/**
+	 * Upper layer io, of which this io is a part. Immutable after
+	 * creation.
+	 */
+	struct cl_io			*ci_parent;
+	/** List of slices. Immutable after creation. */
+	struct list_head		ci_layers;
+	/** list of locks (to be) acquired by this io. */
+	struct cl_lockset		ci_lockset;
+	/** lock requirements, this is just a help info for sublayers. */
+	enum cl_io_lock_dmd		ci_lockreq;
+	/** layout version when this IO occurs */
+	__u32				ci_layout_version;
+	union {
+		struct cl_rd_io {
+			struct cl_io_rw_common rd;
+		} ci_rd;
+		struct cl_wr_io {
+			struct cl_io_rw_common wr;
+			int wr_append;
+			int wr_sync;
+		} ci_wr;
+		struct cl_io_rw_common ci_rw;
+		struct cl_setattr_io {
+			struct ost_lvb		 sa_attr;
+			unsigned int		 sa_attr_flags;
+			unsigned int		 sa_avalid; /* ATTR_* */
+			unsigned int		 sa_xvalid; /* OP_XVALID */
+			int			 sa_stripe_index;
+			struct ost_layout	 sa_layout;
+			const struct lu_fid	*sa_parent_fid;
+			/* SETATTR interface is used for regular setattr, */
+			/* truncate(2) and fallocate(2) subtypes */
+			enum cl_setattr_subtype	 sa_subtype;
+			/* The following are used for fallocate(2) */
+			int			 sa_falloc_mode;
+			loff_t			 sa_falloc_offset;
+			loff_t			 sa_falloc_end;
+			uid_t			 sa_falloc_uid;
+			gid_t			 sa_falloc_gid;
+		} ci_setattr;
+		struct cl_data_version_io {
+			u64 dv_data_version;
+			u32 dv_layout_version;
+			int dv_flags;
+		} ci_data_version;
+		struct cl_fault_io {
+			/** page index within file. */
+			pgoff_t		 ft_index;
+			/** number of valid bytes on a faulted page. */
+			size_t		 ft_nob;
+			/** writable page? for nopage() only */
+			int		 ft_writable;
+			/** page of an executable? */
+			int		 ft_executable;
+			/** page_mkwrite() */
+			int		 ft_mkwrite;
+			/** resulting page */
+			struct cl_page	*ft_page;
+		} ci_fault;
+		struct cl_fsync_io {
+			loff_t			 fi_start;
+			loff_t			 fi_end;
+			/** file system level fid */
+			struct lu_fid		*fi_fid;
+			enum cl_fsync_mode	 fi_mode;
+			/* how many pages were written/discarded */
+			unsigned int		 fi_nr_written;
+		} ci_fsync;
+		struct cl_ladvise_io {
+			__u64			 li_start;
+			__u64			 li_end;
+			/** file system level fid */
+			struct lu_fid		*li_fid;
+			enum lu_ladvise_type	 li_advice;
+			__u64			 li_flags;
+		} ci_ladvise;
+		struct cl_lseek_io {
+			loff_t			 ls_start;
+			loff_t			 ls_result;
+			int			 ls_whence;
+		} ci_lseek;
+		struct cl_misc_io {
+			time64_t		 lm_next_rpc_time;
+		} ci_misc;
+	} u;
+	struct cl_2queue		ci_queue;
+	size_t				ci_nob;
+	int				ci_result;
+	unsigned int			ci_continue:1,
+	/**
+	 * This io holds a grouplock, to inform sublayers that they must
+	 * not do lockless i/o.
+	 */
+					ci_no_srvlock:1,
+	/**
+	 * The whole IO needs to be restarted because the layout has changed
+	 */
+					ci_need_restart:1,
+	/**
+	 * to not refresh the layout - the IO issuer knows that the layout
+	 * won't change (page operations; a layout change causes all pages
+	 * to be discarded), or it doesn't matter if it changes (sync).
+	 */
+					ci_ignore_layout:1,
+	/**
+	 * Need MDS intervention to complete a write.
+	 * Write intent is required for the following cases:
+	 * 1. component being written is not initialized, or
+	 * 2. the mirrored files are NOT in WRITE_PENDING state.
+	 */
+					ci_need_write_intent:1,
+	/**
+	 * Check if the layout changed after the IO finishes. Mainly for HSM
+	 * requirement. If IO occurs on open files, it doesn't need to verify
+	 * the layout because HSM won't release open files.
+	 * Right now, only two operations need to verify the layout: glimpse
+	 * and setattr.
+	 */
+					ci_verify_layout:1,
+	/**
+	 * file is released, restore has to be triggered by the vvp layer
+	 */
+					ci_restore_needed:1,
+	/**
+	 * O_NOATIME
+	 */
+					ci_noatime:1,
+	/* Tell sublayers not to expand LDLM locks requested for this IO */
+					ci_lock_no_expand:1,
+	/**
+	 * Set if non-delay RPC should be used for this IO.
+	 *
+	 * If this file has multiple mirrors, and if the OSTs of the current
+	 * mirror are inaccessible, a non-delay RPC would error out quickly
+	 * so that the upper layer can try to access the next mirror.
+	 */
+					ci_ndelay:1,
+	/**
+	 * Set if IO is triggered by async workqueue readahead.
+	 */
+					ci_async_readahead:1,
+	/**
+	 * Ignore lockless and do normal locking for this io.
+	 */
+					ci_dio_lock:1,
+	/**
+	 * Set if we've tried all mirrors for this read IO; if it's not set,
+	 * the read IO will check the status of the to-be-read OSCs and
+	 * fast-switch to another mirror if some of the OSTs are not healthy.
+	 */
+					ci_tried_all_mirrors:1,
+	/**
+	 * Random read hints; readahead will be disabled.
+	 */
+					ci_rand_read:1,
+	/**
+	 * Sequential read hints.
+	 */
+					ci_seq_read:1,
+	/**
+	 * Do parallel (async) submission of DIO RPCs. Note DIO is still sync
+	 * to userspace, only the RPCs are submitted async, then waited for at
+	 * the llite layer before returning.
+	 */
+					ci_parallel_dio:1;
+	/**
+	 * Bypass quota check
+	 */
+	unsigned			ci_noquota:1,
+	/**
+	 * The filesystem must exclusively acquire invalidate_lock before
+	 * invalidating page cache in truncate / hole punch / DLM extent
+	 * lock blocking AST path (and thus calling into ->invalidatepage)
+	 * to block races between page cache invalidation and page cache
+	 * filling functions (fault, read, ...)
+	 */
+					ci_invalidate_page_cache:1;
+
+	/**
+	 * How many times the read has been retried before this one.
+	 * Set by the top level and consumed by the LOV.
+	 */
+	unsigned			ci_ndelay_tried;
+	/**
+	 * Designated mirror index for this I/O.
+	 */
+	unsigned			ci_designated_mirror;
+	/**
+	 * Number of pages owned by this IO. For invariant checking.
+	 */
+	unsigned			ci_owned_nr;
+	/**
+	 * Range of write intent. Valid if ci_need_write_intent is set.
+	 */
+	struct lu_extent		ci_write_intent;
+};
+
+/** @} cl_io */
+
+/**
+ * Per-transfer attributes.
+ */
+struct cl_req_attr {
+	enum cl_req_type cra_type;
+	u64		 cra_flags;
+	struct cl_page	*cra_page;
+	/** Generic attributes for the server consumption. */
+	struct obdo	*cra_oa;
+	/** Jobid */
+	char		 cra_jobid[LUSTRE_JOBID_SIZE];
+};
+
+enum cache_stats_item {
+	/** how many cache lookups were performed */
+	CS_lookup = 0,
+	/** how many times cache lookup resulted in a hit */
+	CS_hit,
+	/** how many entities are in the cache right now */
+	CS_total,
+	/** how many entities in the cache are actively used (and cannot be
+	 * evicted) right now */
+	CS_busy,
+	/** how many entities were created at all */
+	CS_create,
+	CS_NR
+};
+
+#define CS_NAMES { "lookup", "hit", "total", "busy", "create" }
+
+/**
+ * Stats for a generic cache (similar to inode, lu_object, etc. caches).
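+ *
+ * For example, a hit ratio for /proc output could be derived from the
+ * counters as (a sketch only; the two reads are not atomic w.r.t. each
+ * other):
+ *
+ *	hits    = atomic_read(&cs->cs_stats[CS_hit]);
+ *	lookups = atomic_read(&cs->cs_stats[CS_lookup]);
+ *	pct     = lookups ? 100 * hits / lookups : 0;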
+ */ +struct cache_stats { + const char *cs_name; + atomic_t cs_stats[CS_NR]; +}; + +/** These are not exported so far */ +void cache_stats_init (struct cache_stats *cs, const char *name); + +/** + * Client-side site. This represents particular client stack. "Global" + * variables should (directly or indirectly) be added here to allow multiple + * clients to co-exist in the single address space. + */ +struct cl_site { + struct lu_site cs_lu; + /** + * Statistical counters. Atomics do not scale, something better like + * per-cpu counters is needed. + * + * These are exported as /proc/fs/lustre/llite/.../site + * + * When interpreting keep in mind that both sub-locks (and sub-pages) + * and top-locks (and top-pages) are accounted here. + */ + struct cache_stats cs_pages; + atomic_t cs_pages_state[CPS_NR]; +}; + +int cl_site_init(struct cl_site *s, struct cl_device *top); +void cl_site_fini(struct cl_site *s); +void cl_stack_fini(const struct lu_env *env, struct cl_device *cl); + +/** + * Output client site statistical counters into a buffer. Suitable for + * ll_rd_*()-style functions. + */ +int cl_site_stats_print(const struct cl_site *site, struct seq_file *m); + +/** + * \name helpers + * + * Type conversion and accessory functions. + */ +/** @{ */ + +static inline struct cl_site *lu2cl_site(const struct lu_site *site) +{ + return container_of(site, struct cl_site, cs_lu); +} + +static inline struct cl_device *lu2cl_dev(const struct lu_device *d) +{ + LASSERT(d == NULL || IS_ERR(d) || lu_device_is_cl(d)); + return container_of_safe(d, struct cl_device, cd_lu_dev); +} + +static inline struct lu_device *cl2lu_dev(struct cl_device *d) +{ + return &d->cd_lu_dev; +} + +static inline struct cl_object *lu2cl(const struct lu_object *o) +{ + LASSERT(o == NULL || IS_ERR(o) || lu_device_is_cl(o->lo_dev)); + return container_of_safe(o, struct cl_object, co_lu); +} + +static inline const struct cl_object_conf * +lu2cl_conf(const struct lu_object_conf *conf) +{ + return container_of_safe(conf, struct cl_object_conf, coc_lu); +} + +static inline struct cl_object *cl_object_next(const struct cl_object *obj) +{ + return obj ? 
lu2cl(lu_object_next(&obj->co_lu)) : NULL; +} + +static inline struct cl_object_header *luh2coh(const struct lu_object_header *h) +{ + return container_of_safe(h, struct cl_object_header, coh_lu); +} + +static inline struct cl_site *cl_object_site(const struct cl_object *obj) +{ + return lu2cl_site(obj->co_lu.lo_dev->ld_site); +} + +static inline +struct cl_object_header *cl_object_header(const struct cl_object *obj) +{ + return luh2coh(obj->co_lu.lo_header); +} + +static inline int cl_device_init(struct cl_device *d, struct lu_device_type *t) +{ + return lu_device_init(&d->cd_lu_dev, t); +} + +static inline void cl_device_fini(struct cl_device *d) +{ + lu_device_fini(&d->cd_lu_dev); +} + +void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice, + struct cl_object *obj, + const struct cl_page_operations *ops); +void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice, + struct cl_object *obj, + const struct cl_lock_operations *ops); +void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice, + struct cl_object *obj, const struct cl_io_operations *ops); +/** @} helpers */ + +/** \defgroup cl_object cl_object + * @{ */ +struct cl_object *cl_object_top (struct cl_object *o); +struct cl_object *cl_object_find(const struct lu_env *env, struct cl_device *cd, + const struct lu_fid *fid, + const struct cl_object_conf *c); + +int cl_object_header_init(struct cl_object_header *h); +void cl_object_header_fini(struct cl_object_header *h); +void cl_object_put (const struct lu_env *env, struct cl_object *o); +void cl_object_get (struct cl_object *o); +void cl_object_attr_lock (struct cl_object *o); +void cl_object_attr_unlock(struct cl_object *o); +int cl_object_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr); +int cl_object_attr_update(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid); +int cl_object_glimpse (const struct lu_env *env, struct cl_object *obj, + struct ost_lvb *lvb); +int cl_conf_set (const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf); +int cl_object_prune (const struct lu_env *env, struct cl_object *obj); +void cl_object_kill (const struct lu_env *env, struct cl_object *obj); +int cl_object_getstripe(const struct lu_env *env, struct cl_object *obj, + struct lov_user_md __user *lum, size_t size); +int cl_object_fiemap(const struct lu_env *env, struct cl_object *obj, + struct ll_fiemap_info_key *fmkey, struct fiemap *fiemap, + size_t *buflen); +int cl_object_layout_get(const struct lu_env *env, struct cl_object *obj, + struct cl_layout *cl); +loff_t cl_object_maxbytes(struct cl_object *obj); +int cl_object_flush(const struct lu_env *env, struct cl_object *obj, + struct ldlm_lock *lock); + + +/** + * Returns true, iff \a o0 and \a o1 are slices of the same object. + */ +static inline int cl_object_same(struct cl_object *o0, struct cl_object *o1) +{ + return cl_object_header(o0) == cl_object_header(o1); +} + +static inline void cl_object_page_init(struct cl_object *clob, int size) +{ + clob->co_slice_off = cl_object_header(clob)->coh_page_bufsize; + cl_object_header(clob)->coh_page_bufsize += cfs_size_round(size); + WARN_ON(cl_object_header(clob)->coh_page_bufsize > 512); +} + +static inline void *cl_object_page_slice(struct cl_object *clob, + struct cl_page *page) +{ + return (void *)((char *)page + clob->co_slice_off); +} + +/** + * Return refcount of cl_object. 
+ */ +static inline int cl_object_refc(struct cl_object *clob) +{ + struct lu_object_header *header = clob->co_lu.lo_header; + return atomic_read(&header->loh_ref); +} + +/** @} cl_object */ + +/** \defgroup cl_page cl_page + * @{ */ +struct cl_page *cl_page_find (const struct lu_env *env, + struct cl_object *obj, + pgoff_t idx, struct page *vmpage, + enum cl_page_type type); +struct cl_page *cl_page_alloc (const struct lu_env *env, + struct cl_object *o, pgoff_t ind, + struct page *vmpage, + enum cl_page_type type); +void cl_page_get (struct cl_page *page); +void cl_page_put (const struct lu_env *env, + struct cl_page *page); +void cl_pagevec_put (const struct lu_env *env, + struct cl_page *page, + struct pagevec *pvec); +void cl_page_print (const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct cl_page *pg); +void cl_page_header_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct cl_page *pg); +struct cl_page *cl_vmpage_page (struct page *vmpage, struct cl_object *obj); +struct cl_page *cl_page_top (struct cl_page *page); + +const struct cl_page_slice *cl_page_at(const struct cl_page *page, + const struct lu_device_type *dtype); + +/** + * \name ownership + * + * Functions dealing with the ownership of page by io. + */ +/** @{ */ + +int cl_page_own (const struct lu_env *env, + struct cl_io *io, struct cl_page *page); +int cl_page_own_try (const struct lu_env *env, + struct cl_io *io, struct cl_page *page); +void cl_page_assume (const struct lu_env *env, + struct cl_io *io, struct cl_page *page); +void cl_page_unassume (const struct lu_env *env, + struct cl_io *io, struct cl_page *pg); +void cl_page_disown (const struct lu_env *env, + struct cl_io *io, struct cl_page *page); +int cl_page_is_owned (const struct cl_page *pg, const struct cl_io *io); + +/** @} ownership */ + +/** + * \name transfer + * + * Functions dealing with the preparation of a page for a transfer, and + * tracking transfer state. + */ +/** @{ */ +int cl_page_prep (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, enum cl_req_type crt); +void cl_page_completion (const struct lu_env *env, + struct cl_page *pg, enum cl_req_type crt, int ioret); +int cl_page_make_ready (const struct lu_env *env, struct cl_page *pg, + enum cl_req_type crt); +int cl_page_cache_add (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, enum cl_req_type crt); +void cl_page_clip (const struct lu_env *env, struct cl_page *pg, + int from, int to); +int cl_page_flush (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg); + +/** @} transfer */ + + +/** + * \name helper routines + * Functions to discard, delete and export a cl_page. 
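+ *
+ * The cl_offset()/cl_index() declarations below convert between byte
+ * offsets and page indices within an object. For illustration only
+ * (assuming page-size granularity, with cl_index() rounding down):
+ *
+ *	pgoff_t idx = cl_index(obj, pos);    // page covering offset pos
+ *	LASSERT(cl_offset(obj, idx) <= pos); // start offset of that page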
+ */
+/** @{ */
+void cl_page_discard(const struct lu_env *env, struct cl_io *io,
+		     struct cl_page *pg);
+void cl_page_delete(const struct lu_env *env, struct cl_page *pg);
+int cl_page_is_vmlocked(const struct lu_env *env,
+			const struct cl_page *pg);
+void cl_page_touch(const struct lu_env *env, const struct cl_page *pg,
+		   size_t to);
+void cl_page_export(const struct lu_env *env,
+		    struct cl_page *pg, int uptodate);
+loff_t cl_offset(const struct cl_object *obj, pgoff_t idx);
+pgoff_t cl_index(const struct cl_object *obj, loff_t offset);
+size_t cl_page_size(const struct cl_object *obj);
+
+void cl_lock_print(const struct lu_env *env, void *cookie,
+		   lu_printer_t printer, const struct cl_lock *lock);
+void cl_lock_descr_print(const struct lu_env *env, void *cookie,
+			 lu_printer_t printer,
+			 const struct cl_lock_descr *descr);
+/* @} helper */
+
+/**
+ * Data structure managing a client's cached pages. A count of
+ * "unstable" pages and an LRU of clean pages are maintained.
+ * "unstable" pages are pages pinned by the ptlrpc layer for recovery
+ * purposes.
+ */
+struct cl_client_cache {
+	/**
+	 * Client cache refcount:
+	 * # of users (OSCs) + 2 (held by llite and lov)
+	 */
+	atomic_t		 ccc_users;
+	/**
+	 * # of threads doing shrinking
+	 */
+	unsigned int		 ccc_lru_shrinkers;
+	/**
+	 * # of LRU entries available
+	 */
+	atomic_long_t		 ccc_lru_left;
+	/**
+	 * List of entities (OSCs) for this LRU cache
+	 */
+	struct list_head	 ccc_lru;
+	/**
+	 * Max # of LRU entries
+	 */
+	unsigned long		 ccc_lru_max;
+	/**
+	 * Lock to protect ccc_lru list
+	 */
+	spinlock_t		 ccc_lru_lock;
+	/**
+	 * Set if unstable check is enabled
+	 */
+	unsigned int		 ccc_unstable_check:1;
+	/**
+	 * # of unstable pages for this mount point
+	 */
+	atomic_long_t		 ccc_unstable_nr;
+	/**
+	 * Waitq for awaiting unstable pages to reach zero.
+ * Used at umounting time and signaled on BRW commit + */ + wait_queue_head_t ccc_unstable_waitq; + /** + * Serialize max_cache_mb write operation + */ + struct mutex ccc_max_cache_mb_lock; +}; +/** + * cl_cache functions + */ +struct cl_client_cache *cl_cache_init(unsigned long lru_page_max); +void cl_cache_incref(struct cl_client_cache *cache); +void cl_cache_decref(struct cl_client_cache *cache); + +/** @} cl_page */ + +/** \defgroup cl_lock cl_lock + * @{ */ +int cl_lock_request(const struct lu_env *env, struct cl_io *io, + struct cl_lock *lock); +int cl_lock_init(const struct lu_env *env, struct cl_lock *lock, + const struct cl_io *io); +void cl_lock_fini(const struct lu_env *env, struct cl_lock *lock); +const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock, + const struct lu_device_type *dtype); +void cl_lock_release(const struct lu_env *env, struct cl_lock *lock); + +int cl_lock_enqueue(const struct lu_env *env, struct cl_io *io, + struct cl_lock *lock, struct cl_sync_io *anchor); +void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock); + +/** @} cl_lock */ + +/** \defgroup cl_io cl_io + * @{ */ + +int cl_io_init (const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj); +int cl_io_sub_init (const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj); +int cl_io_rw_init (const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, loff_t pos, size_t count); +int cl_io_loop (const struct lu_env *env, struct cl_io *io); + +void cl_io_fini (const struct lu_env *env, struct cl_io *io); +int cl_io_iter_init (const struct lu_env *env, struct cl_io *io); +void cl_io_iter_fini (const struct lu_env *env, struct cl_io *io); +int cl_io_lock (const struct lu_env *env, struct cl_io *io); +void cl_io_unlock (const struct lu_env *env, struct cl_io *io); +int cl_io_start (const struct lu_env *env, struct cl_io *io); +void cl_io_end (const struct lu_env *env, struct cl_io *io); +int cl_io_lock_add (const struct lu_env *env, struct cl_io *io, + struct cl_io_lock_link *link); +int cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io, + struct cl_lock_descr *descr); +int cl_io_submit_rw (const struct lu_env *env, struct cl_io *io, + enum cl_req_type iot, struct cl_2queue *queue); +int cl_io_submit_sync (const struct lu_env *env, struct cl_io *io, + enum cl_req_type iot, struct cl_2queue *queue, + long timeout); +int cl_io_commit_async (const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue, int from, int to, + cl_commit_cbt cb); +void cl_io_extent_release (const struct lu_env *env, struct cl_io *io); +int cl_io_lru_reserve(const struct lu_env *env, struct cl_io *io, + loff_t pos, size_t bytes); +int cl_io_read_ahead (const struct lu_env *env, struct cl_io *io, + pgoff_t start, struct cl_read_ahead *ra); +void cl_io_rw_advance (const struct lu_env *env, struct cl_io *io, + size_t nob); + +/** + * True, iff \a io is an O_APPEND write(2). + */ +static inline int cl_io_is_append(const struct cl_io *io) +{ + return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_append; +} + +static inline int cl_io_is_sync_write(const struct cl_io *io) +{ + return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_sync; +} + +static inline int cl_io_is_mkwrite(const struct cl_io *io) +{ + return io->ci_type == CIT_FAULT && io->u.ci_fault.ft_mkwrite; +} + +/** + * True, iff \a io is a truncate(2). 
+ */ +static inline int cl_io_is_trunc(const struct cl_io *io) +{ + return io->ci_type == CIT_SETATTR && + (io->u.ci_setattr.sa_avalid & ATTR_SIZE) && + (io->u.ci_setattr.sa_subtype != CL_SETATTR_FALLOCATE); +} + +static inline int cl_io_is_fallocate(const struct cl_io *io) +{ + return (io->ci_type == CIT_SETATTR) && + (io->u.ci_setattr.sa_subtype == CL_SETATTR_FALLOCATE); +} + +struct cl_io *cl_io_top(struct cl_io *io); + +void cl_io_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_io *io); + +#define CL_IO_SLICE_CLEAN(obj, base) memset_startat(obj, 0, base) + +/** @} cl_io */ + +/** \defgroup cl_page_list cl_page_list + * @{ */ + +/** + * Last page in the page list. + */ +static inline struct cl_page *cl_page_list_last(struct cl_page_list *plist) +{ + LASSERT(plist->pl_nr > 0); + return list_entry(plist->pl_pages.prev, struct cl_page, cp_batch); +} + +static inline struct cl_page *cl_page_list_first(struct cl_page_list *plist) +{ + LASSERT(plist->pl_nr > 0); + return list_first_entry(&plist->pl_pages, struct cl_page, cp_batch); +} + +/** + * Iterate over pages in a page list. + */ +#define cl_page_list_for_each(page, list) \ + list_for_each_entry((page), &(list)->pl_pages, cp_batch) + +/** + * Iterate over pages in a page list, taking possible removals into account. + */ +#define cl_page_list_for_each_safe(page, temp, list) \ + list_for_each_entry_safe((page), (temp), &(list)->pl_pages, cp_batch) + +void cl_page_list_init(struct cl_page_list *plist); +void cl_page_list_add(struct cl_page_list *plist, struct cl_page *page, + bool get_ref); +void cl_page_list_move(struct cl_page_list *dst, struct cl_page_list *src, + struct cl_page *page); +void cl_page_list_move_head(struct cl_page_list *dst, struct cl_page_list *src, + struct cl_page *page); +void cl_page_list_splice(struct cl_page_list *list, + struct cl_page_list *head); +void cl_page_list_del(const struct lu_env *env, + struct cl_page_list *plist, struct cl_page *page); +void cl_page_list_disown(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +void cl_page_list_assume(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +void cl_page_list_discard(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +void cl_page_list_fini(const struct lu_env *env, struct cl_page_list *plist); + +void cl_2queue_init(struct cl_2queue *queue); +void cl_2queue_add(struct cl_2queue *queue, struct cl_page *page, + bool get_ref); +void cl_2queue_disown(const struct lu_env *env, struct cl_io *io, + struct cl_2queue *queue); +void cl_2queue_assume(const struct lu_env *env, struct cl_io *io, + struct cl_2queue *queue); +void cl_2queue_discard(const struct lu_env *env, struct cl_io *io, + struct cl_2queue *queue); +void cl_2queue_fini(const struct lu_env *env, struct cl_2queue *queue); +void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page); + +/** @} cl_page_list */ + +void cl_req_attr_set(const struct lu_env *env, struct cl_object *obj, + struct cl_req_attr *attr); + +/** \defgroup cl_sync_io cl_sync_io + * @{ */ + +struct cl_sync_io; +struct cl_dio_aio; +struct cl_sub_dio; + +typedef void (cl_sync_io_end_t)(const struct lu_env *, struct cl_sync_io *); + +void cl_sync_io_init_notify(struct cl_sync_io *anchor, int nr, void *dio_aio, + cl_sync_io_end_t *end); + +int cl_sync_io_wait(const struct lu_env *env, struct cl_sync_io *anchor, + long timeout); +void cl_sync_io_note(const struct lu_env *env, struct cl_sync_io *anchor, + int ioret); 
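+/*
+ * Illustrative sketch only: the usual cl_sync_io anchor pattern built from
+ * the cl_sync_io declarations here. The anchor is initialized for the
+ * number of pages being submitted, each transfer completion calls
+ * cl_sync_io_note(), and the submitter waits on the anchor. The submission
+ * step and error handling are elided; "nr_pages" is a hypothetical count.
+ *
+ *	struct cl_sync_io anchor;
+ *
+ *	cl_sync_io_init(&anchor, nr_pages);
+ *	// submit the pages; each completion does:
+ *	//	cl_sync_io_note(env, &anchor, ioret);
+ *	rc = cl_sync_io_wait(env, &anchor, timeout);
+ */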
+int cl_sync_io_wait_recycle(const struct lu_env *env, struct cl_sync_io *anchor,
+ long timeout, int ioret);
+struct cl_dio_aio *cl_dio_aio_alloc(struct kiocb *iocb, struct cl_object *obj,
+ bool is_aio);
+struct cl_sub_dio *cl_sub_dio_alloc(struct cl_dio_aio *ll_aio, bool sync);
+void cl_dio_aio_free(const struct lu_env *env, struct cl_dio_aio *aio);
+void cl_sub_dio_free(struct cl_sub_dio *sdio);
+static inline void cl_sync_io_init(struct cl_sync_io *anchor, int nr)
+{
+ cl_sync_io_init_notify(anchor, nr, NULL, NULL);
+}
+
+/**
+ * Anchor for synchronous transfer. This is allocated on a stack by the
+ * thread doing a synchronous transfer, and a pointer to this structure is
+ * set up in every page submitted for transfer. The transfer completion
+ * routine updates the anchor and wakes up the waiting thread when the
+ * transfer is complete.
+ */
+struct cl_sync_io {
+ /** number of pages yet to be transferred. */
+ atomic_t csi_sync_nr;
+ /** error code. */
+ int csi_sync_rc;
+ /** completion to be signaled when transfer is complete. */
+ wait_queue_head_t csi_waitq;
+ /** callback to invoke when this IO is finished */
+ cl_sync_io_end_t *csi_end_io;
+ /* private pointer for an associated DIO/AIO */
+ void *csi_dio_aio;
+};
+
+/** direct IO pages */
+struct ll_dio_pages {
+ /*
+ * page array to be written. We don't support
+ * partial pages except the last one.
+ */
+ struct page **ldp_pages;
+ /** # of pages in the array. */
+ size_t ldp_count;
+ /* the file offset of the first page. */
+ loff_t ldp_file_offset;
+};
+
+/* Top level struct used for AIO and DIO */
+struct cl_dio_aio {
+ struct cl_sync_io cda_sync;
+ struct cl_object *cda_obj;
+ struct kiocb *cda_iocb;
+ ssize_t cda_bytes;
+ unsigned cda_no_aio_complete:1,
+ cda_creator_free:1;
+};
+
+/* Sub-dio used for splitting DIO (and AIO, because AIO is DIO) according to
+ * the layout/striping, so we can do parallel submit of DIO RPCs
+ */
+struct cl_sub_dio {
+ struct cl_sync_io csd_sync;
+ struct cl_page_list csd_pages;
+ ssize_t csd_bytes;
+ struct cl_dio_aio *csd_ll_aio;
+ struct ll_dio_pages csd_dio_pages;
+ unsigned csd_creator_free:1;
+};
+#if defined(HAVE_DIRECTIO_ITER) || defined(HAVE_IOV_ITER_RW) || \
+ defined(HAVE_DIRECTIO_2ARGS)
+#define HAVE_DIO_ITER 1
+#endif
+
+void ll_release_user_pages(struct page **pages, int npages);
+
+/** @} cl_sync_io */
+
+/** \defgroup cl_env cl_env
+ *
+ * lu_env handling for a client.
+ *
+ * lu_env is an environment within which lustre code executes. Its major part
+ * is lu_context---a fast memory allocation mechanism that is used to conserve
+ * precious kernel stack space. Originally lu_env was designed for a server,
+ * where
+ *
+ * - there is a (mostly) fixed number of threads, and
+ *
+ * - call chains have no non-lustre portions inserted between lustre code.
+ *
+ * On a client, both of these assumptions fail, because every user thread can
+ * potentially execute lustre code as part of a system call, and lustre calls
+ * into VFS or MM that call back into lustre.
+ * + * To deal with that, cl_env wrapper functions implement the following + * optimizations: + * + * - allocation and destruction of environment is amortized by caching no + * longer used environments instead of destroying them; + * + * \see lu_env, lu_context, lu_context_key + * @{ */ + +struct lu_env *cl_env_get(__u16 *refcheck); +struct lu_env *cl_env_alloc(__u16 *refcheck, __u32 tags); +void cl_env_put(struct lu_env *env, __u16 *refcheck); +unsigned cl_env_cache_purge(unsigned nr); +struct lu_env *cl_env_percpu_get(void); +void cl_env_percpu_put(struct lu_env *env); + +/** @} cl_env */ + +/* + * Misc + */ +void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr); +void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb); + +struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site, + struct lu_device_type *ldt, + struct lu_device *next); +/** @} clio */ + +int cl_global_init(void); +void cl_global_fini(void); + +#endif /* _LINUX_CL_OBJECT_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/dt_object.h b/drivers/staging/lustrefsx/lustre/include/dt_object.h new file mode 100644 index 0000000000000..f24d7d359453a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/dt_object.h @@ -0,0 +1,3054 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef __LUSTRE_DT_OBJECT_H +#define __LUSTRE_DT_OBJECT_H + +/** \defgroup dt dt + * Sub-class of lu_object with methods common for "data" objects in OST stack. + * + * Data objects behave like regular files: you can read/write them, get and + * set their attributes. Implementation of dt interface is supposed to + * implement some form of garbage collection, normally reference counting + * (nlink) based one. + * + * Examples: osd (lustre/osd) is an implementation of dt interface. + * @{ + */ + +#include +/* + * super-class definitions. 
+ */
+#include
+
+#include
+
+struct seq_file;
+struct proc_dir_entry;
+struct lustre_cfg;
+
+struct thandle;
+struct dt_device;
+struct dt_object;
+struct dt_index_features;
+struct niobuf_local;
+struct niobuf_remote;
+struct ldlm_enqueue_info;
+
+typedef enum {
+ MNTOPT_USERXATTR = 0x00000001,
+ MNTOPT_ACL = 0x00000002,
+} mntopt_t;
+
+struct dt_device_param {
+ unsigned ddp_max_name_len;
+ unsigned ddp_max_nlink;
+ unsigned ddp_symlink_max;
+ mntopt_t ddp_mntopts;
+ unsigned ddp_max_ea_size;
+ unsigned ddp_mount_type;
+ unsigned long long ddp_maxbytes;
+ /* per-inode space consumption */
+ short ddp_inodespace;
+ /* maximum number of blocks in an extent */
+ unsigned ddp_max_extent_blks;
+ /* per-extent insertion overhead to be used by client for grant
+ * calculation */
+ unsigned int ddp_extent_tax;
+ unsigned int ddp_brw_size; /* optimal RPC size */
+ /* T10PI checksum type, zero if not supported */
+ enum cksum_types ddp_t10_cksum_type;
+ bool ddp_has_lseek_data_hole;
+};
+
+/**
+ * Per-transaction commit callback function
+ */
+struct dt_txn_commit_cb;
+typedef void (*dt_cb_t)(struct lu_env *env, struct thandle *th,
+ struct dt_txn_commit_cb *cb, int err);
+/**
+ * Special per-transaction callback for cases when just a commit callback
+ * is needed and per-device callbacks are not convenient to use
+ */
+#define TRANS_COMMIT_CB_MAGIC 0xa0a00a0a
+#define MAX_COMMIT_CB_STR_LEN 32
+
+#define DCB_TRANS_STOP 0x1
+struct dt_txn_commit_cb {
+ struct list_head dcb_linkage;
+ dt_cb_t dcb_func;
+ void *dcb_data;
+ __u32 dcb_magic;
+ __u32 dcb_flags;
+ char dcb_name[MAX_COMMIT_CB_STR_LEN];
+};
+
+/**
+ * Operations on dt device.
+ */
+struct dt_device_operations {
+ /**
+ * Return device-wide statistics.
+ *
+ * Return device-wide stats including block size, total and
+ * free blocks, total and free objects, etc. See struct obd_statfs
+ * for the details.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dev dt device
+ * \param[out] osfs stats information
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*dt_statfs)(const struct lu_env *env,
+ struct dt_device *dev,
+ struct obd_statfs *osfs,
+ struct obd_statfs_info *info);
+
+ /**
+ * Create transaction.
+ *
+ * Create an in-memory structure representing the transaction for the
+ * caller. The structure returned will be used by the calling thread
+ * to specify the transaction the updates belong to. Once created
+ * successfully, ->dt_trans_stop() must be called in any case (whether
+ * ->dt_trans_start() and updates follow or not) so that the transaction
+ * handle and other resources can be released by the layers below.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dev dt device
+ *
+ * \retval pointer to handle if creation succeeds
+ * \retval ERR_PTR(errno) if creation fails
+ */
+ struct thandle *(*dt_trans_create)(const struct lu_env *env,
+ struct dt_device *dev);
+
+ /**
+ * Start transaction.
+ *
+ * Start the transaction. The transaction described by \a th can be
+ * started only once; another start is considered an error.
+ * A thread is not supposed to start a new transaction until it has
+ * closed its previous transaction (though multiple handles can be
+ * created). The caller should start the transaction once
+ * all possible updates are declared (see the ->do_declare_* methods
+ * below) and all the needed resources are reserved.
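+ *
+ * Purely as an illustration of that calling order (not a fixed API
+ * contract; the dd_ops vector field name is an assumption here, and
+ * error handling is elided):
+ * \code
+ *	th = dev->dd_ops->dt_trans_create(env, dev);
+ *	// declare all intended updates first, e.g. ->do_declare_attr_set()
+ *	rc = dev->dd_ops->dt_trans_start(env, dev, th);
+ *	// apply the declared updates, e.g. ->do_attr_set()
+ *	rc = dev->dd_ops->dt_trans_stop(env, dev, th);
+ * \endcode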
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dev dt device
+ * \param[in] th transaction handle
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*dt_trans_start)(const struct lu_env *env,
+ struct dt_device *dev,
+ struct thandle *th);
+
+ /**
+ * Stop transaction.
+ *
+ * Once stopped, the transaction described by \a th is complete (all
+ * the needed updates are applied) and further processing such as
+ * flushing to disk, sending to another target, etc, is handled by
+ * lower layers. The caller can't access this transaction by the
+ * handle anymore (except from the commit callbacks, see below).
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dev dt device
+ * \param[in] th transaction handle
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*dt_trans_stop)(const struct lu_env *env,
+ struct dt_device *dev,
+ struct thandle *th);
+
+ /**
+ * Add commit callback to the transaction.
+ *
+ * Add a commit callback to the given transaction handle. The callback
+ * will be called when the associated transaction is stored, i.e. if
+ * the callback has run, the transaction is guaranteed to survive an
+ * event like power off. The number of callbacks isn't limited, but you
+ * should note that some disk filesystems do handle the commit callbacks
+ * in the thread handling commit/flush of all the transactions, meaning
+ * that new transactions are blocked from commit and flush until all the
+ * callbacks are done. Also, note multiple callbacks can be running
+ * concurrently using multiple CPU cores. The callbacks will be running
+ * in a special environment which can not be used to pass data around.
+ *
+ * \param[in] th transaction handle
+ * \param[in] dcb commit callback description
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*dt_trans_cb_add)(struct thandle *th,
+ struct dt_txn_commit_cb *dcb);
+
+ /**
+ * Return FID of root index object.
+ *
+ * Return the FID of the root object in the filesystem. This object
+ * is usually provided as a bootstrap point by a disk filesystem.
+ * Which FID to use is up to the implementation, though
+ * [FID_SEQ_ROOT:1:0] is reserved for this purpose.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dev dt device
+ * \param[out] fid FID of the root object
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*dt_root_get)(const struct lu_env *env,
+ struct dt_device *dev,
+ struct lu_fid *f);
+
+ /**
+ * Return device configuration data.
+ *
+ * Return device (disk fs, actually) specific configuration.
+ * The configuration isn't subject to change at runtime.
+ * See struct dt_device_param for the details.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dev dt device
+ * \param[out] param configuration parameters
+ */
+ void (*dt_conf_get)(const struct lu_env *env,
+ const struct dt_device *dev,
+ struct dt_device_param *param);
+
+ /**
+ * Return device's super block.
+ *
+ * \param[in] dev dt device
+ */
+ struct super_block *(*dt_mnt_sb_get)(const struct dt_device *dev);
+
+ /**
+ * Sync the device.
+ *
+ * Sync all the cached state (dirty buffers, pages, etc) to the
+ * persistent storage. The method returns control once the sync is
+ * complete. This operation may incur significant I/O to disk and
+ * should be reserved for cases where a global sync is strictly
+ * necessary.
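+ *
+ * A minimal hypothetical invocation through this vector (the dd_ops
+ * field name is an assumption here):
+ * \code
+ *	rc = dev->dd_ops->dt_sync(env, dev);
+ * \endcode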
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dev dt device
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*dt_sync)(const struct lu_env *env,
+ struct dt_device *dev);
+
+ /**
+ * Make device read-only.
+ *
+ * Prevent new modifications to the device. This is a very specific
+ * state where all the changes are accepted successfully and the
+ * commit callbacks are called, but persistent state never changes.
+ * Used only in tests to simulate a power-off scenario.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dev dt device
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*dt_ro)(const struct lu_env *env,
+ struct dt_device *dev);
+
+ /**
+ * Start transaction commit asynchronously.
+ *
+ * Provide a hint to the underlying filesystem that it should start
+ * committing soon. The control returns immediately. It's up to the
+ * layer implementing the method how soon to start committing. Usually
+ * this should be throttled to some extent, otherwise the number of
+ * aggregated transactions grows too high, causing a performance drop.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dev dt device
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*dt_commit_async)(const struct lu_env *env,
+ struct dt_device *dev);
+
+ /**
+ * The unit of \a count is bytes for block quota or inodes for
+ * metadata quota.
+ *
+ * If \a count > 0, reserve quota in advance of an operation that
+ * changes the quota assignment, such as chgrp() or rename() into
+ * a directory with a different group ID.
+ *
+ * If \a count < 0, free previously reserved quota.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dev the bottom OSD device to reserve quota
+ * \param[in] type quota type (LQUOTA_RES_DT or LQUOTA_RES_MD)
+ * \param[in] uid quota uid
+ * \param[in] gid quota gid
+ * \param[in] count space (bytes or inodes) to reserve or free
+ * \param[in] md true for inode, false for block
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*dt_reserve_or_free_quota)(const struct lu_env *env,
+ struct dt_device *dev,
+ enum quota_type type, __u64 uid,
+ __u64 gid, __s64 count, bool md);
+};
+
+struct dt_index_features {
+ /** required feature flags from enum dt_index_flags */
+ __u32 dif_flags;
+ /** minimal required key size */
+ size_t dif_keysize_min;
+ /** maximal required key size, 0 if no limit */
+ size_t dif_keysize_max;
+ /** minimal required record size */
+ size_t dif_recsize_min;
+ /** maximal required record size, 0 if no limit */
+ size_t dif_recsize_max;
+ /** pointer size for record */
+ size_t dif_ptrsize;
+};
+
+enum dt_index_flags {
+ /** index supports variable sized keys */
+ DT_IND_VARKEY = BIT(0),
+ /** index supports variable sized records */
+ DT_IND_VARREC = BIT(1),
+ /** index can be modified */
+ DT_IND_UPDATE = BIT(2),
+ /** index supports records with non-unique (duplicate) keys */
+ DT_IND_NONUNQ = BIT(3),
+ /**
+ * index supports fixed-size keys sorted in natural numerical order
+ * and is able to return the left-side value if no exact match is found
+ */
+ DT_IND_RANGE = BIT(4),
+};
+
+/* for dt_read_lock() and dt_write_lock() object lock rule */
+enum dt_object_role {
+ DT_SRC_PARENT,
+ DT_SRC_CHILD,
+ DT_TGT_PARENT,
+ DT_TGT_CHILD,
+ DT_TGT_ORPHAN,
+ DT_LASTID,
+};
+
+/**
+ * Features required from an index to support file system directories (mapping
+ * names to fids).
+ */
+extern const struct dt_index_features dt_directory_features;
+extern const struct dt_index_features dt_otable_features;
+extern const struct dt_index_features dt_lfsck_layout_orphan_features;
+extern const struct dt_index_features dt_lfsck_layout_dangling_features;
+extern const struct dt_index_features dt_lfsck_namespace_features;
+
+/* index features supported by the accounting objects */
+extern const struct dt_index_features dt_acct_features;
+
+/* index features supported by the quota global indexes */
+extern const struct dt_index_features dt_quota_glb_features;
+
+/* index features supported by the quota slave indexes */
+extern const struct dt_index_features dt_quota_slv_features;
+
+/* index features supported by the nodemap index */
+extern const struct dt_index_features dt_nodemap_features;
+
+/**
+ * A general purpose dt allocation hint.
+ * Currently it carries only the parent object,
+ * but it can be extended with other allocation hints in the future.
+ */
+struct dt_allocation_hint {
+ struct dt_object *dah_parent;
+ const void *dah_eadata;
+ int dah_eadata_len;
+ int dah_acl_len;
+ __u32 dah_mode;
+ int dah_append_stripes;
+ bool dah_can_block;
+ char *dah_append_pool;
+};
+
+/**
+ * object type specifier.
+ */
+
+enum dt_format_type {
+ DFT_REGULAR,
+ DFT_DIR,
+ /** for mknod */
+ DFT_NODE,
+ /** for special index */
+ DFT_INDEX,
+ /** for symbolic link */
+ DFT_SYM,
+};
+
+/**
+ * object format specifier.
+ */
+struct dt_object_format {
+ /** type for dt object */
+ enum dt_format_type dof_type;
+ union {
+ struct dof_regular {
+ int striped;
+ } dof_reg;
+ struct dof_dir {
+ } dof_dir;
+ struct dof_node {
+ } dof_node;
+ /**
+ * a special index needs the feature set as a parameter
+ * at creation time
+ */
+ struct dof_index {
+ const struct dt_index_features *di_feat;
+ } dof_idx;
+ } u;
+};
+
+enum dt_format_type dt_mode_to_dft(__u32 mode);
+
+typedef __u64 dt_obj_version_t;
+
+union ldlm_policy_data;
+
+struct md_layout_change;
+
+/**
+ * A dt_object provides common operations to create and destroy
+ * objects and to manage regular and extended attributes.
+ */
+struct dt_object_operations {
+ /**
+ * Get read lock on object.
+ *
+ * Read lock is compatible with other read locks, so it's shared.
+ * Read lock is not compatible with write lock which is exclusive.
+ * The lock is blocking and can't be used from an interrupt context.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dt object to lock for reading
+ * \param[in] role a hint to debug locks (see kernel's mutexes)
+ */
+ void (*do_read_lock)(const struct lu_env *env,
+ struct dt_object *dt,
+ unsigned role);
+
+ /**
+ * Get write lock on object.
+ *
+ * Write lock is exclusive and cannot be shared. The lock is blocking
+ * and can't be used from an interrupt context.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dt object to lock for writing
+ * \param[in] role a hint to debug locks (see kernel's mutexes)
+ */
+ void (*do_write_lock)(const struct lu_env *env,
+ struct dt_object *dt,
+ unsigned role);
+
+ /**
+ * Release read lock.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dt object
+ */
+ void (*do_read_unlock)(const struct lu_env *env,
+ struct dt_object *dt);
+
+ /**
+ * Release write lock.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dt object
+ */
+ void (*do_write_unlock)(const struct lu_env *env,
+ struct dt_object *dt);
+
+ /**
+ * Check whether write lock is held.
+ *
+ * The caller can learn whether write lock is held on the object
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dt object
+ *
+ * \retval 0 no write lock
+ * \retval 1 write lock is held
+ */
+ int (*do_write_locked)(const struct lu_env *env,
+ struct dt_object *dt);
+
+ /**
+ * Declare intention to request regular attributes.
+ *
+ * Notify the underlying filesystem that the caller may request regular
+ * attributes with ->do_attr_get() soon. This allows OSD to implement
+ * prefetching logic in an object-oriented manner. The implementation
+ * can be a noop. This method should avoid expensive delays such as
+ * waiting on disk I/O, otherwise the goal of enabling a performance
+ * optimization would be defeated.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dt object
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*do_declare_attr_get)(const struct lu_env *env,
+ struct dt_object *dt);
+
+ /**
+ * Return regular attributes.
+ *
+ * The object must exist. Currently all the attributes should be
+ * returned, but in the future this can be improved so that only
+ * a selected set is returned. This can improve performance as in
+ * some cases attributes are stored in different places and
+ * getting them all can be an iterative and expensive process.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dt object
+ * \param[out] attr attributes to fill
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*do_attr_get)(const struct lu_env *env,
+ struct dt_object *dt,
+ struct lu_attr *attr);
+
+ /**
+ * Declare intention to change regular object's attributes.
+ *
+ * Notify the underlying filesystem that the regular attributes may
+ * change in this transaction. This enables the layer below to prepare
+ * resources (e.g. journal credits in ext4). This method should be
+ * called between creating the transaction and starting it. Note that
+ * the la_valid field of \a attr specifies which attributes will change.
+ * The object need not exist.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dt object
+ * \param[in] attr attributes to change specified in attr.la_valid
+ * \param[in] th transaction handle
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*do_declare_attr_set)(const struct lu_env *env,
+ struct dt_object *dt,
+ const struct lu_attr *attr,
+ struct thandle *th);
+
+ /**
+ * Change regular attributes.
+ *
+ * Change regular attributes in the given transaction. Note only
+ * attributes flagged by attr.la_valid change. The object must
+ * exist. If the layer implementing this method is responsible for
+ * quota, then the method should maintain object accounting for the
+ * given credentials when la_uid/la_gid changes.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dt object
+ * \param[in] attr new attributes to apply
+ * \param[in] th transaction handle
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*do_attr_set)(const struct lu_env *env,
+ struct dt_object *dt,
+ const struct lu_attr *attr,
+ struct thandle *th);
+
+ /**
+ * Declare intention to request an extended attribute.
+ *
+ * Notify the underlying filesystem that the caller may request an
+ * extended attribute with ->do_xattr_get() soon. This allows OSD to
+ * implement prefetching logic in an object-oriented manner. The
+ * implementation can be a noop. 
This method should avoid expensive delays such as + * waiting on disk I/O, otherwise the goal of enabling a performance + * optimization would be defeated. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] buf unused, may be removed in the future + * \param[in] name name of the extended attribute + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_declare_xattr_get)(const struct lu_env *env, + struct dt_object *dt, + struct lu_buf *buf, + const char *name); + + /** + * Return a value of an extended attribute. + * + * The object must exist. If the buffer is NULL, then the method + * must return the size of the value. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[out] buf buffer in which to store the value + * \param[in] name name of the extended attribute + * + * \retval 0 on success + * \retval -ERANGE if \a buf is too small + * \retval negative negated errno on error + * \retval positive value's size if \a buf is NULL or has zero size + */ + int (*do_xattr_get)(const struct lu_env *env, + struct dt_object *dt, + struct lu_buf *buf, + const char *name); + + /** + * Declare intention to change an extended attribute. + * + * Notify the underlying filesystem that the extended attribute may + * change in this transaction. This enables the layer below to prepare + * resources (e.g. journal credits in ext4). This method should be + * called between creating the transaction and starting it. The object + * need not exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] buf buffer storing new value of the attribute + * \param[in] name name of the attribute + * \param[in] fl LU_XATTR_CREATE - fail if EA exists + * LU_XATTR_REPLACE - fail if EA doesn't exist + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_declare_xattr_set)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, + const char *name, + int fl, + struct thandle *th); + + /** + * Set an extended attribute. + * + * Change or replace the specified extended attribute (EA). + * The flags passed in \a fl dictate whether the EA is to be + * created or replaced, as follows. + * LU_XATTR_CREATE - fail if EA exists + * LU_XATTR_REPLACE - fail if EA doesn't exist + * The object must exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] buf buffer storing new value of the attribute + * \param[in] name name of the attribute + * \param[in] fl flags indicating EA creation or replacement + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_xattr_set)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, + const char *name, + int fl, + struct thandle *th); + + /** + * Declare intention to delete an extended attribute. + * + * Notify the underlying filesystem that the extended attribute may + * be deleted in this transaction. This enables the layer below to + * prepare resources (e.g. journal credits in ext4). This method + * should be called between creating the transaction and starting it. + * The object need not exist. 
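+ *
+ * Sketch of the declare/execute pairing described above (the do_ops
+ * vector field name is an assumption here; error handling elided):
+ * \code
+ *	rc = dt->do_ops->do_declare_xattr_del(env, dt, name, th);
+ *	// ... dt_trans_start() ...
+ *	rc = dt->do_ops->do_xattr_del(env, dt, name, th);
+ * \endcode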
+ * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] name name of the attribute + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_declare_xattr_del)(const struct lu_env *env, + struct dt_object *dt, + const char *name, + struct thandle *th); + + /** + * Delete an extended attribute. + * + * This method deletes the specified extended attribute. The object + * must exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] name name of the attribute + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_xattr_del)(const struct lu_env *env, + struct dt_object *dt, + const char *name, + struct thandle *th); + + /** + * Return a list of the extended attributes. + * + * Fills the passed buffer with a list of the extended attributes + * found in the object. The names are separated with '\0'. + * The object must exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[out] buf buffer to put the list in + * + * \retval positive bytes used/required in the buffer + * \retval negative negated errno on error + */ + int (*do_xattr_list)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf); + + /** + * Prepare allocation hint for a new object. + * + * This method is used by the caller to inform OSD of the parent-child + * relationship between two objects and enable efficient object + * allocation. Filled allocation hint will be passed to ->do_create() + * later. + * + * \param[in] env execution environment for this thread + * \param[out] ah allocation hint + * \param[in] parent parent object (can be NULL) + * \param[in] child child object + * \param[in] _mode type of the child object + */ + void (*do_ah_init)(const struct lu_env *env, + struct dt_allocation_hint *ah, + struct dt_object *parent, + struct dt_object *child, + umode_t mode); + + /** + * Declare intention to create a new object. + * + * Notify the underlying filesystem that the object may be created + * in this transaction. This enables the layer below to prepare + * resources (e.g. journal credits in ext4). This method should be + * called between creating the transaction and starting it. + * + * If the layer implementing this method is responsible for quota, + * then the method should reserve an object for the given credentials + * and return an error if quota is over. If object creation later + * fails for some reason, then the reservation should be released + * properly (usually in ->dt_trans_stop()). + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] attr attributes of the new object + * \param[in] hint allocation hint + * \param[in] dof object format + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_declare_create)(const struct lu_env *env, + struct dt_object *dt, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th); + + /** + * Create new object. + * + * The method creates the object passed with the specified attributes + * and object format. Object allocation procedure can use information + * stored in the allocation hint. 
Different object formats are supported
+ * (see enum dt_format_type and struct dt_object_format) depending on
+ * the device. If creation succeeds, then the LOHA_EXISTS flag must be
+ * set in the LU-object header attributes.
+ *
+ * If the layer implementing this method is responsible for quota,
+ * then the method should maintain object accounting for the given
+ * credentials.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dt object
+ * \param[in] attr attributes of the new object
+ * \param[in] hint allocation hint
+ * \param[in] dof object format
+ * \param[in] th transaction handle
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*do_create)(const struct lu_env *env,
+ struct dt_object *dt,
+ struct lu_attr *attr,
+ struct dt_allocation_hint *hint,
+ struct dt_object_format *dof,
+ struct thandle *th);
+
+ /**
+ * Declare intention to destroy an object.
+ *
+ * Notify the underlying filesystem that the object may be destroyed
+ * in this transaction. This enables the layer below to prepare
+ * resources (e.g. journal credits in ext4). This method should be
+ * called between creating the transaction and starting it. The object
+ * need not exist.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dt object
+ * \param[in] th transaction handle
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*do_declare_destroy)(const struct lu_env *env,
+ struct dt_object *dt,
+ struct thandle *th);
+
+ /**
+ * Destroy an object.
+ *
+ * This method destroys the object and all the resources associated
+ * with the object (data, key/value pairs, extended attributes, etc).
+ * The object must exist. If destroy is successful, then the flag
+ * LU_OBJECT_HEARD_BANSHEE should be set to forbid access to this
+ * instance of the in-core object. Any subsequent access to the same
+ * FID should get another instance with no LOHA_EXIST flag set.
+ *
+ * If the layer implementing this method is responsible for quota,
+ * then the method should maintain object accounting for the given
+ * credentials.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dt object
+ * \param[in] th transaction handle
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*do_destroy)(const struct lu_env *env,
+ struct dt_object *dt,
+ struct thandle *th);
+
+ /**
+ * Try object as an index.
+ *
+ * Announce that this object is going to be used as an index. This
+ * operation checks that the object supports indexing operations and
+ * installs the appropriate dt_index_operations vector on success.
+ * Also probes for features. The operation is successful if all
+ * required features are supported. It's not possible to access the
+ * object with index methods before ->do_index_try() returns success.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dt object
+ * \param[in] feat index features
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*do_index_try)(const struct lu_env *env,
+ struct dt_object *dt,
+ const struct dt_index_features *feat);
+
+ /**
+ * Declare intention to increment nlink count.
+ *
+ * Notify the underlying filesystem that the nlink regular attribute
+ * may be changed in this transaction. This enables the layer below to
+ * prepare resources (e.g. journal credits in ext4). This method
+ * should be called between creating the transaction and starting it.
+ * The object need not exist.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dt object
+ * \param[in] th transaction handle
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*do_declare_ref_add)(const struct lu_env *env,
+ struct dt_object *dt,
+ struct thandle *th);
+
+ /**
+ * Increment nlink.
+ *
+ * Increment nlink (from the regular attributes set) in the given
+ * transaction. Note the absolute limit for nlink should be learnt
+ * from struct dt_device_param::ddp_max_nlink. The object must exist.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dt object
+ * \param[in] th transaction handle
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*do_ref_add)(const struct lu_env *env,
+ struct dt_object *dt, struct thandle *th);
+
+ /**
+ * Declare intention to decrement nlink count.
+ *
+ * Notify the underlying filesystem that the nlink regular attribute
+ * may be changed in this transaction. This enables the layer below to
+ * prepare resources (e.g. journal credits in ext4). This method
+ * should be called between creating the transaction and starting it.
+ * The object need not exist.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dt object
+ * \param[in] th transaction handle
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*do_declare_ref_del)(const struct lu_env *env,
+ struct dt_object *dt,
+ struct thandle *th);
+
+ /**
+ * Decrement nlink.
+ *
+ * Decrement nlink (from the regular attributes set) in the given
+ * transaction. The object must exist.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dt object
+ * \param[in] th transaction handle
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*do_ref_del)(const struct lu_env *env,
+ struct dt_object *dt,
+ struct thandle *th);
+
+ /**
+ * Sync object.
+ *
+ * The method is called to sync a specified range of the object to
+ * persistent storage. The control is returned once the operation is
+ * complete. The difference from ->dt_sync() is that the object can
+ * be in-sync with the persistent storage (nothing to flush), then
+ * the method returns quickly with no I/O overhead. So, this method
+ * should be preferred over ->dt_sync() where possible. Also note that
+ * if the object isn't clean, then some disk filesystems will call
+ * ->dt_sync() to maintain overall consistency, in which case it's
+ * still very expensive.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dt object
+ * \param[in] start start of the range to sync
+ * \param[in] end end of the range to sync
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*do_object_sync)(const struct lu_env *env, struct dt_object *obj,
+ __u64 start, __u64 end);
+
+ /**
+ * Lock object.
+ *
+ * Lock object(s) using the Distributed Lock Manager (LDLM).
+ *
+ * Get LDLM locks for the object. Currently used to lock "remote"
+ * objects in DNE configuration - a service running on MDTx needs
+ * to lock an object on MDTy.
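+ *
+ * A hedged sketch of the expected lock/unlock pairing (the do_ops
+ * vector field name is an assumption; LDLM setup of einfo/policy is
+ * not shown):
+ * \code
+ *	rc = dt->do_ops->do_object_lock(env, dt, &lh, &einfo, &policy);
+ *	// ... einfo carries the lock state back to:
+ *	rc = dt->do_ops->do_object_unlock(env, dt, &einfo, &policy);
+ * \endcode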
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dt object
+ * \param[out] lh lock handle, sometimes used, sometimes not
+ * \param[in] einfo ldlm callbacks, locking type and mode
+ * \param[out] einfo private data to be passed to unlock later
+ * \param[in] policy inodebits data
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*do_object_lock)(const struct lu_env *env, struct dt_object *dt,
+ struct lustre_handle *lh,
+ struct ldlm_enqueue_info *einfo,
+ union ldlm_policy_data *policy);
+
+ /**
+ * Unlock object.
+ *
+ * Release LDLM lock(s) granted with ->do_object_lock().
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dt object
+ * \param[in] einfo lock handles, from ->do_object_lock()
+ * \param[in] policy inodebits data
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*do_object_unlock)(const struct lu_env *env,
+ struct dt_object *dt,
+ struct ldlm_enqueue_info *einfo,
+ union ldlm_policy_data *policy);
+
+ /**
+ * Invalidate attribute cache.
+ *
+ * This method invalidates the attribute cache of the object; it is
+ * implemented on OSP only.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dt object
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*do_invalidate)(const struct lu_env *env, struct dt_object *dt);
+
+ /**
+ * Check object stale state.
+ *
+ * OSP only.
+ *
+ * \param[in] dt object
+ *
+ * \retval true for a stale object
+ * \retval false for a non-stale object
+ */
+ bool (*do_check_stale)(struct dt_object *dt);
+
+ /**
+ * Declare intention to instantiate an extended layout component.
+ *
+ * \param[in] env execution environment
+ * \param[in] dt DT object
+ * \param[in] mlc data structure describing the changes to
+ * the DT object's layout
+ * \param[in] th transaction handle
+ *
+ * \retval 0 success
+ * \retval negative negated errno on error
+ */
+ int (*do_declare_layout_change)(const struct lu_env *env,
+ struct dt_object *dt,
+ struct md_layout_change *mlc,
+ struct thandle *th);
+
+ /**
+ * Client is trying to write to an un-instantiated layout component.
+ *
+ * \param[in] env execution environment
+ * \param[in] dt DT object
+ * \param[in] mlc data structure describing the changes to
+ * the DT object's layout
+ * \param[in] th transaction handle
+ *
+ * \retval 0 success
+ * \retval negative negated errno on error
+ */
+ int (*do_layout_change)(const struct lu_env *env, struct dt_object *dt,
+ struct md_layout_change *mlc,
+ struct thandle *th);
+};
+
+enum dt_bufs_type {
+ DT_BUFS_TYPE_READ = 0x0000,
+ DT_BUFS_TYPE_WRITE = 0x0001,
+ DT_BUFS_TYPE_READAHEAD = 0x0002,
+ DT_BUFS_TYPE_LOCAL = 0x0004,
+};
+
+/**
+ * Per-dt-object operations on "file body" - unstructured raw data.
+ */
+struct dt_body_operations {
+ /**
+ * Read data.
+ *
+ * Read unstructured data from an existing regular object.
+ * Only data before attr.la_size is returned.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dt object
+ * \param[out] buf buffer (including size) to copy data in
+ * \param[in] pos position in the object to start
+ * \param[out] pos original value of \a pos + bytes returned
+ *
+ * \retval positive bytes read on success
+ * \retval negative negated errno on error
+ */
+ ssize_t (*dbo_read)(const struct lu_env *env,
+ struct dt_object *dt,
+ struct lu_buf *buf,
+ loff_t *pos);
+
+ /**
+ * Declare intention to write data to object.
+ * + * Notify the underlying filesystem that data may be written in + * this transaction. This enables the layer below to prepare resources + * (e.g. journal credits in ext4). This method should be called + * between creating the transaction and starting it. The object need + * not exist. If the layer implementing this method is responsible for + * quota, then the method should reserve space for the given credentials + * and return an error if quota is over. If the write later fails + * for some reason, then the reserve should be released properly + * (usually in ->dt_trans_stop()). + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] buf buffer (including size) to copy data from + * \param[in] pos position in the object to start + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + ssize_t (*dbo_declare_write)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, + loff_t pos, + struct thandle *th); + + /** + * Write unstructured data to regular existing object. + * + * The method allocates space and puts data in. Also, the method should + * maintain attr.la_size properly. Partial writes are possible. + * + * If the layer implementing this method is responsible for quota, + * then the method should maintain space accounting for the given + * credentials. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] buf buffer (including size) to copy data from + * \param[in] pos position in the object to start + * \param[out] pos \a pos + bytes written + * \param[in] th transaction handle + * + * \retval positive bytes written on success + * \retval negative negated errno on error + */ + ssize_t (*dbo_write)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, + loff_t *pos, + struct thandle *th); + + /** + * Return buffers for data. + * + * This method is used to access data with no copying. It's so-called + * zero-copy I/O. The method returns the descriptors for the internal + * buffers where data are managed by the disk filesystem. For example, + * pagecache in case of ext4 or ARC with ZFS. Then other components + * (e.g. networking) can transfer data from or to the buffers with no + * additional copying. + * + * The method should fill an array of struct niobuf_local, where + * each element describes a full or partial page for data at specific + * offset. The caller should use page/lnb_page_offset/len to find data + * at object's offset lnb_file_offset. + * + * The memory referenced by the descriptors can't change its purpose + * until the complementary ->dbo_bufs_put() is called. The caller should + * specify if the buffers are used to read or modify data so that OSD + * can decide how to initialize the buffers: bring all the data for + * reads or just bring partial buffers for write. Note: the method does + * not check whether output array is large enough. 
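+ *
+ * A sketch of the zero-copy read path built from these methods
+ * (hypothetical caller; the do_body_ops vector field name and error
+ * handling are assumptions):
+ * \code
+ *	nr = dt->do_body_ops->dbo_bufs_get(env, dt, pos, len, lnb,
+ *					   maxlnb, DT_BUFS_TYPE_READ);
+ *	rc = dt->do_body_ops->dbo_read_prep(env, dt, lnb, nr);
+ *	// ... transfer data out of the returned buffers ...
+ *	rc = dt->do_body_ops->dbo_bufs_put(env, dt, lnb, nr);
+ * \endcode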
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dt object
+ * \param[in] pos position in the object to start
+ * \param[in] len size of region in bytes
+ * \param[out] lb array of descriptors to fill
+ * \param[in] maxlnb max slots in \a lb array
+ * \param[in] rw 0 if used to read, 1 if used for write
+ *
+ * \retval positive number of descriptors on success
+ * \retval negative negated errno on error
+ */
+ int (*dbo_bufs_get)(const struct lu_env *env,
+ struct dt_object *dt,
+ loff_t pos,
+ ssize_t len,
+ struct niobuf_local *lb,
+ int maxlnb,
+ enum dt_bufs_type rw);
+
+ /**
+ * Release reference granted by ->dbo_bufs_get().
+ *
+ * Release the reference granted by the previous ->dbo_bufs_get().
+ * Note the references are counted.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dt object
+ * \param[in] lb array of descriptors to release
+ * \param[in] nr size of the array
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*dbo_bufs_put)(const struct lu_env *env,
+ struct dt_object *dt,
+ struct niobuf_local *lb,
+ int nr);
+
+ /**
+ * Prepare buffers for reading.
+ *
+ * The method is called on the given buffers to fill them with data
+ * if that wasn't done in ->dbo_bufs_get(). The idea is that the
+ * caller should be able to get a few buffers for discontiguous
+ * regions using a few calls to ->dbo_bufs_get() and then request
+ * them all for the preparation with a single call, so that OSD can
+ * fire many I/Os to run concurrently. It's up to the specific OSD
+ * whether to implement this logic in ->dbo_read_prep() or just use
+ * ->dbo_bufs_get() to prepare data for every requested region
+ * individually.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dt object
+ * \param[in] lnb array of buffer descriptors
+ * \param[in] nr size of the array
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*dbo_read_prep)(const struct lu_env *env,
+ struct dt_object *dt,
+ struct niobuf_local *lnb,
+ int nr);
+
+ /**
+ * Prepare buffers for write.
+ *
+ * This method is called on the given buffers to ensure the partial
+ * buffers contain correct data. The underlying idea is the same as
+ * in ->dbo_read_prep().
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dt object
+ * \param[in] lb array of buffer descriptors
+ * \param[in] nr size of the array
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*dbo_write_prep)(const struct lu_env *env,
+ struct dt_object *dt,
+ struct niobuf_local *lb,
+ int nr);
+
+ /**
+ * Declare intention to write data stored in the buffers.
+ *
+ * Notify the underlying filesystem that data may be written in
+ * this transaction. This enables the layer below to prepare resources
+ * (e.g. journal credits in ext4). This method should be called
+ * between creating the transaction and starting it.
+ *
+ * If the layer implementing this method is responsible for quota,
+ * then the method should reserve space for the given credentials
+ * and return an error if quota is exceeded. If the write later fails
+ * for some reason, then the reserve should be released properly
+ * (usually in ->dt_trans_stop()).
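+ *
+ * For illustration, the write-side counterpart of the read path
+ * (assumed do_body_ops vector field name; transaction calls
+ * abbreviated, error handling elided):
+ * \code
+ *	nr = dt->do_body_ops->dbo_bufs_get(env, dt, pos, len, lnb,
+ *					   maxlnb, DT_BUFS_TYPE_WRITE);
+ *	rc = dt->do_body_ops->dbo_write_prep(env, dt, lnb, nr);
+ *	rc = dt->do_body_ops->dbo_declare_write_commit(env, dt, lnb, nr, th);
+ *	// ... dt_trans_start(), fill the buffers, then:
+ *	rc = dt->do_body_ops->dbo_write_commit(env, dt, lnb, nr, th, size);
+ *	rc = dt->do_body_ops->dbo_bufs_put(env, dt, lnb, nr);
+ * \endcode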
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dt object
+ * \param[in] lb array of descriptors
+ * \param[in] nr size of the array
+ * \param[in] th transaction handle
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*dbo_declare_write_commit)(const struct lu_env *env,
+ struct dt_object *dt,
+ struct niobuf_local *lb,
+ int nr,
+ struct thandle *th);
+
+ /**
+ * Write to existing object.
+ *
+ * This method is used to write data to a persistent storage using
+ * the buffers returned by ->dbo_bufs_get(). The caller puts new
+ * data into the buffers using its own mechanisms (e.g. direct
+ * transfer from a NIC). The method should maintain attr.la_size.
+ * Also, attr.la_blocks should be maintained, but this can be done
+ * in a lazy manner, when actual allocation happens.
+ *
+ * If the layer implementing this method is responsible for quota,
+ * then the method should maintain space accounting for the given
+ * credentials.
+ *
+ * The user_size parameter is the apparent size of the file, i.e. the
+ * size of the clear text version of the file. It can differ from the
+ * actual amount of valuable data received when a file is encrypted,
+ * because encrypted pages always contain PAGE_SIZE bytes of data,
+ * even if clear text data is only a few bytes.
+ * In the case of an encrypted file, the apparent size is stored as
+ * the inode size, so that servers return to clients an object size
+ * they can use to determine the clear text size.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dt object
+ * \param[in] lb array of descriptors for the buffers
+ * \param[in] nr size of the array
+ * \param[in] th transaction handle
+ * \param[in] user_size apparent size
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*dbo_write_commit)(const struct lu_env *env,
+ struct dt_object *dt,
+ struct niobuf_local *lb,
+ int nr,
+ struct thandle *th,
+ __u64 user_size);
+
+ /**
+ * Return logical to physical block mapping for a given extent
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dt object
+ * \param[in] fm describe the region to map and the output buffer
+ * see the details in include/linux/fiemap.h
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*dbo_fiemap_get)(const struct lu_env *env,
+ struct dt_object *dt,
+ struct fiemap *fm);
+
+ /**
+ * Declare intention to deallocate space from an object.
+ *
+ * Notify the underlying filesystem that space may be deallocated in
+ * this transaction. This enables the layer below to prepare resources
+ * (e.g. journal credits in ext4). This method should be called between
+ * creating the transaction and starting it. The object need not exist.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] dt object
+ * \param[in] start the start of the region to deallocate
+ * \param[in] end the end of the region to deallocate
+ * \param[in] th transaction handle
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
+ int (*dbo_declare_punch)(const struct lu_env *env,
+ struct dt_object *dt,
+ __u64 start,
+ __u64 end,
+ struct thandle *th);
+
+ /**
+ * Deallocate a specified region in an object.
+ *
+ * This method is used to deallocate (release) space possibly consumed
+ * by the given region of the object. If the layer implementing this
+ * method is responsible for quota, then the method should maintain
+ * space accounting for the given credentials.
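+ *
+ * As with the other updating methods, punch follows the declare/execute
+ * split (sketch only; the do_body_ops vector field name is an
+ * assumption):
+ * \code
+ *	rc = dt->do_body_ops->dbo_declare_punch(env, dt, start, end, th);
+ *	// ... dt_trans_start() ...
+ *	rc = dt->do_body_ops->dbo_punch(env, dt, start, end, th);
+ * \endcode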
+	 *
+	 * \param[in] env	execution environment for this thread
+	 * \param[in] dt	object
+	 * \param[in] start	the start of the region to deallocate
+	 * \param[in] end	the end of the region to deallocate
+	 * \param[in] th	transaction handle
+	 *
+	 * \retval 0		on success
+	 * \retval negative	negated errno on error
+	 */
+	int (*dbo_punch)(const struct lu_env *env,
+			 struct dt_object *dt,
+			 __u64 start,
+			 __u64 end,
+			 struct thandle *th);
+	/**
+	 * Give advice on a specified region in an object.
+	 *
+	 * This method is used to give advice about the access pattern on a
+	 * given region of the object. The disk filesystem understands
+	 * the advice and tunes cache/read-ahead policies.
+	 *
+	 * \param[in] env	execution environment for this thread
+	 * \param[in] dt	object
+	 * \param[in] start	the start of the region affected
+	 * \param[in] end	the end of the region affected
+	 * \param[in] advice	advice type
+	 *
+	 * \retval 0		on success
+	 * \retval negative	negated errno on error
+	 */
+	int (*dbo_ladvise)(const struct lu_env *env,
+			   struct dt_object *dt,
+			   __u64 start,
+			   __u64 end,
+			   enum lu_ladvise_type advice);
+
+	/**
+	 * Declare intention to preallocate space for an object.
+	 *
+	 * \param[in] env	execution environment for this thread
+	 * \param[in] dt	object
+	 * \param[in] th	transaction handle
+	 *
+	 * \retval 0		on success
+	 * \retval negative	negated errno on error
+	 */
+	int (*dbo_declare_fallocate)(const struct lu_env *env,
+				     struct dt_object *dt, __u64 start,
+				     __u64 end, int mode, struct thandle *th);
+	/**
+	 * Allocate specified region for an object.
+	 *
+	 * \param[in] env	execution environment for this thread
+	 * \param[in] dt	object
+	 * \param[in] start	the start of the region to allocate
+	 * \param[in] end	the end of the region to allocate
+	 * \param[in] mode	fallocate mode
+	 * \param[in] th	transaction handle
+	 *
+	 * \retval 0		on success
+	 * \retval negative	negated errno on error
+	 */
+	int (*dbo_fallocate)(const struct lu_env *env,
+			     struct dt_object *dt,
+			     __u64 start,
+			     __u64 end,
+			     int mode,
+			     struct thandle *th);
+	/**
+	 * Do a SEEK_HOLE/SEEK_DATA request on an object.
+	 *
+	 * \param[in] env	execution environment for this thread
+	 * \param[in] dt	object
+	 * \param[in] offset	the offset to start the seek from
+	 * \param[in] whence	seek mode, SEEK_HOLE or SEEK_DATA
+	 *
+	 * \retval hole/data offset	on success
+	 * \retval negative	negated errno on error
+	 */
+	loff_t (*dbo_lseek)(const struct lu_env *env, struct dt_object *dt,
+			    loff_t offset, int whence);
+};
+
+/**
+ * Incomplete type of index record.
+ */
+struct dt_rec;
+
+/**
+ * Incomplete type of index key.
+ */
+struct dt_key;
+
+/**
+ * Incomplete type of dt iterator.
+ */
+struct dt_it;
+
+/**
+ * Per-dt-object operations on object as index. Index is a set of key/value
+ * pairs abstracted from an on-disk representation. An index supports a
+ * number of operations, including lookup by key, insert and delete. Also,
+ * an index can be iterated to find the pairs one by one, from a beginning
+ * or a specified point.
+ */
+struct dt_index_operations {
+	/**
+	 * Lookup in an index by key.
+	 *
+	 * The method returns a value for the given key. Key/value format
+	 * and size should have been negotiated with ->do_index_try() before.
+	 * Thus it's the caller's responsibility to provide the method with
+	 * a proper key and a big enough buffer. No external locking is
+	 * required, all the internal consistency should be implemented by
+	 * the method or lower layers. The object should have been created
+	 * with type DFT_INDEX or DFT_DIR.
+	 *
+	 * \param[in] env	execution environment for this thread
+	 * \param[in] dt	object
+	 * \param[out] rec	buffer where value will be stored
+	 * \param[in] key	key
+	 *
+	 * \retval 0		on success
+	 * \retval -ENOENT	if key isn't found
+	 * \retval negative	negated errno on error
+	 */
+	int (*dio_lookup)(const struct lu_env *env,
+			  struct dt_object *dt,
+			  struct dt_rec *rec,
+			  const struct dt_key *key);
+
+	/**
+	 * Declare intention to insert a key/value into an index.
+	 *
+	 * Notify the underlying filesystem that a new key/value may be
+	 * inserted in this transaction. This enables the layer below to
+	 * prepare resources (e.g. journal credits in ext4). This method
+	 * should be called between creating the transaction and starting it.
+	 * Key/value format and size are subject to ->do_index_try().
+	 *
+	 * \param[in] env	execution environment for this thread
+	 * \param[in] dt	object
+	 * \param[in] rec	buffer storing value
+	 * \param[in] key	key
+	 * \param[in] th	transaction handle
+	 *
+	 * \retval 0		on success
+	 * \retval negative	negated errno on error
+	 */
+	int (*dio_declare_insert)(const struct lu_env *env,
+				  struct dt_object *dt,
+				  const struct dt_rec *rec,
+				  const struct dt_key *key,
+				  struct thandle *th);
+
+	/**
+	 * Insert a new key/value pair into an index.
+	 *
+	 * The method inserts the specified key/value pair into the given
+	 * index object. The internal consistency is maintained by the method
+	 * or the functionality below. The format and size of key/value
+	 * should have been negotiated before using ->do_index_try(), no
+	 * additional information can be specified to the method. The keys
+	 * are unique in a given index.
+	 *
+	 * \param[in] env	execution environment for this thread
+	 * \param[in] dt	object
+	 * \param[in] rec	buffer storing value
+	 * \param[in] key	key
+	 * \param[in] th	transaction handle
+	 *
+	 * \retval 0		on success
+	 * \retval negative	negated errno on error
+	 */
+	int (*dio_insert)(const struct lu_env *env,
+			  struct dt_object *dt,
+			  const struct dt_rec *rec,
+			  const struct dt_key *key,
+			  struct thandle *th);
+
+	/**
+	 * Declare intention to delete a key/value from an index.
+	 *
+	 * Notify the underlying filesystem that a key/value may be deleted
+	 * in this transaction. This enables the layer below to prepare
+	 * resources (e.g. journal credits in ext4). This method should be
+	 * called between creating the transaction and starting it. Key/value
+	 * format and size are subject to ->do_index_try(). The object need
+	 * not exist.
+	 *
+	 * \param[in] env	execution environment for this thread
+	 * \param[in] dt	object
+	 * \param[in] key	key
+	 * \param[in] th	transaction handle
+	 *
+	 * \retval 0		on success
+	 * \retval negative	negated errno on error
+	 */
+	int (*dio_declare_delete)(const struct lu_env *env,
+				  struct dt_object *dt,
+				  const struct dt_key *key,
+				  struct thandle *th);
+
+	/**
+	 * Delete a key/value pair from an index.
+	 *
+	 * The method deletes the specified key and the corresponding value
+	 * from the given index object. The internal consistency is maintained
+	 * by the method or the functionality below. The format and size of
+	 * the key should have been negotiated before using ->do_index_try(),
+	 * no additional information can be specified to the method.
+	 *
+	 * \param[in] env	execution environment for this thread
+	 * \param[in] dt	object
+	 * \param[in] key	key
+	 * \param[in] th	transaction handle
+	 *
+	 * \retval 0		on success
+	 * \retval negative	negated errno on error
+	 */
+	int (*dio_delete)(const struct lu_env *env,
+			  struct dt_object *dt,
+			  const struct dt_key *key,
+			  struct thandle *th);
+
+	/**
+	 * Iterator interface.
+	 *
+	 * Methods to iterate over an existing index, list the keys stored and
+	 * associated values, get key/value size, etc.
+	 */
+	struct dt_it_ops {
+		/**
+		 * Allocate and initialize a new iterator.
+		 *
+		 * The iterator is a handler to be used in the subsequent
+		 * methods to access the index's content. Note the position is
+		 * not defined at this point and should be initialized with
+		 * the ->get() or ->load() method.
+		 *
+		 * \param[in] env	execution environment for this thread
+		 * \param[in] dt	object
+		 * \param[in] attr	ask the iterator to return part of
+		 *			the records, see LUDA_* for details
+		 *
+		 * \retval pointer	iterator pointer on success
+		 * \retval ERR_PTR(errno)	on error
+		 */
+		struct dt_it *(*init)(const struct lu_env *env,
+				      struct dt_object *dt,
+				      __u32 attr);
+
+		/**
+		 * Release iterator.
+		 *
+		 * Release the specified iterator and all the associated
+		 * resources (e.g. the object, index cache, etc).
+		 *
+		 * \param[in] env	execution environment for this thread
+		 * \param[in] di	iterator to release
+		 */
+		void (*fini)(const struct lu_env *env,
+			     struct dt_it *di);
+
+		/**
+		 * Move position of iterator.
+		 *
+		 * Move the position of the specified iterator to the
+		 * specified key.
+		 *
+		 * \param[in] env	execution environment for this thread
+		 * \param[in] di	iterator
+		 * \param[in] key	key to position to
+		 *
+		 * \retval 0		if exact key is found
+		 * \retval 1		if at the record with least key
+		 *			not larger than the key
+		 * \retval negative	negated errno on error
+		 */
+		int (*get)(const struct lu_env *env,
+			   struct dt_it *di,
+			   const struct dt_key *key);
+
+		/**
+		 * Release position.
+		 *
+		 * Complementary method for dt_it_ops::get() above. Some
+		 * implementations can increase a reference on the iterator in
+		 * dt_it_ops::get(). So the caller should be able to release
+		 * it with dt_it_ops::put().
+		 *
+		 * \param[in] env	execution environment for this thread
+		 * \param[in] di	iterator
+		 */
+		void (*put)(const struct lu_env *env,
+			    struct dt_it *di);
+
+		/**
+		 * Move to next record.
+		 *
+		 * Moves the position of the iterator to the next record.
+		 *
+		 * \param[in] env	execution environment for this thread
+		 * \param[in] di	iterator
+		 *
+		 * \retval 1		if no more records
+		 * \retval 0		on success, the next record is found
+		 * \retval negative	negated errno on error
+		 */
+		int (*next)(const struct lu_env *env,
+			    struct dt_it *di);
+
+		/**
+		 * Return key.
+		 *
+		 * Returns a pointer to a buffer containing the key of the
+		 * record at the current position. The pointer is valid and
+		 * retains data until the ->get(), ->load() and ->fini()
+		 * methods are called.
+		 *
+		 * \param[in] env	execution environment for this thread
+		 * \param[in] di	iterator
+		 *
+		 * \retval pointer to key	on success
+		 * \retval ERR_PTR(errno)	on error
+		 */
+		struct dt_key *(*key)(const struct lu_env *env,
+				      const struct dt_it *di);
+
+		/**
+		 * Return key size.
+		 *
+		 * Returns size of the key at the current position.
+		 *
+		 * \param[in] env	execution environment for this thread
+		 * \param[in] di	iterator
+		 *
+		 * \retval key's size	on success
+		 * \retval negative	negated errno on error
+		 */
+		int (*key_size)(const struct lu_env *env,
+				const struct dt_it *di);
+
+		/**
+		 * Return record.
+		 *
+		 * Stores the value of the record at the current position. The
+		 * buffer must be big enough (as negotiated with
+		 * ->do_index_try() or ->rec_size()). The caller can specify
+		 * that it is interested only in part of the record, using the
+		 * attr argument (see LUDA_* definitions for the details).
+		 *
+		 * \param[in] env	execution environment for this thread
+		 * \param[in] di	iterator
+		 * \param[out] rec	buffer to store value in
+		 * \param[in] attr	specify part of the value to copy
+		 *
+		 * \retval 0		on success
+		 * \retval negative	negated errno on error
+		 */
+		int (*rec)(const struct lu_env *env,
+			   const struct dt_it *di,
+			   struct dt_rec *rec,
+			   __u32 attr);
+
+		/**
+		 * Return record size.
+		 *
+		 * Returns size of the record at the current position. The
+		 * \a attr can be used to specify only the parts of the record
+		 * needed to be returned (see LUDA_* definitions for the
+		 * details).
+		 *
+		 * \param[in] env	execution environment for this thread
+		 * \param[in] di	iterator
+		 * \param[in] attr	part of the record to return
+		 *
+		 * \retval record's size	on success
+		 * \retval negative	negated errno on error
+		 */
+		int (*rec_size)(const struct lu_env *env,
+				const struct dt_it *di,
+				__u32 attr);
+
+		/**
+		 * Return a cookie (hash).
+		 *
+		 * Returns the cookie (usually hash) of the key at the current
+		 * position. This allows the caller to resume iteration at
+		 * this position later. The exact value is specific to the
+		 * implementation and should not be interpreted by the caller.
+		 *
+		 * \param[in] env	execution environment for this thread
+		 * \param[in] di	iterator
+		 *
+		 * \retval cookie/hash of the key
+		 */
+		__u64 (*store)(const struct lu_env *env,
+			       const struct dt_it *di);
+
+		/**
+		 * Initialize position using cookie/hash.
+		 *
+		 * Initializes the current position of the iterator to one
+		 * described by the cookie/hash as returned by ->store()
+		 * previously.
+		 *
+		 * \param[in] env	execution environment for this thread
+		 * \param[in] di	iterator
+		 * \param[in] hash	cookie/hash value
+		 *
+		 * \retval positive	if current position points to
+		 *			record with least cookie not larger
+		 *			than cookie
+		 * \retval 0		if current position matches cookie
+		 * \retval negative	negated errno on error
+		 */
+		int (*load)(const struct lu_env *env,
+			    const struct dt_it *di,
+			    __u64 hash);
+
+		/**
+		 * Not used
+		 */
+		int (*key_rec)(const struct lu_env *env,
+			       const struct dt_it *di,
+			       void *key_rec);
+	} dio_it;
+};
+
+enum dt_otable_it_valid {
+	DOIV_ERROR_HANDLE	= 0x0001,
+	DOIV_DRYRUN		= 0x0002,
+};
+
+enum dt_otable_it_flags {
+	/* Exit on failure. */
+	DOIF_FAILOUT	= 0x0001,
+
+	/* Reset iteration position to the device beginning. */
+	DOIF_RESET	= 0x0002,
+
+	/* An upper-layer component is using the iteration. */
+	DOIF_OUTUSED	= 0x0004,
+
+	/* Check only without repairing. */
+	DOIF_DRYRUN	= 0x0008,
+};
+
+/* otable based iteration needs to use the common DT iteration APIs.
+ * To initialize the iteration, it needs to call dio_it::init() first.
+ * Here is how the otable based iteration should prepare arguments to
+ * call dt_it_ops::init().
+ *
+ * For otable based iteration, the 32-bit 'attr' for dt_it_ops::init()
+ * is composed of two parts:
+ * the low 16 bits are for valid bits, the high 16 bits are for flag bits.
+ */
+#define DT_OTABLE_IT_FLAGS_SHIFT	16
+#define DT_OTABLE_IT_FLAGS_MASK		0xffff0000
+
+struct dt_device {
+	struct lu_device			dd_lu_dev;
+	const struct dt_device_operations	*dd_ops;
+
+	/**
+	 * List of dt_txn_callback (see below). This is not protected in any
+	 * way, because callbacks are supposed to be added/deleted only during
+	 * single-threaded start-up/shut-down procedures.
+	 */
+	struct list_head			dd_txn_callbacks;
+	unsigned int				dd_record_fid_accessed:1,
+						dd_rdonly:1;
+
+	/* sysfs and debugfs handling */
+	struct dentry				*dd_debugfs_entry;
+
+	const struct attribute			**dd_def_attrs;
+	struct kobject				dd_kobj;
+	struct kobj_type			dd_ktype;
+	struct completion			dd_kobj_unregister;
+};
+
+int dt_device_init(struct dt_device *dev, struct lu_device_type *t);
+void dt_device_fini(struct dt_device *dev);
+
+static inline int lu_device_is_dt(const struct lu_device *d)
+{
+	return ergo(d != NULL, d->ld_type->ldt_tags & LU_DEVICE_DT);
+}
+
+static inline struct dt_device *lu2dt_dev(struct lu_device *l)
+{
+	LASSERT(lu_device_is_dt(l));
+	return container_of_safe(l, struct dt_device, dd_lu_dev);
+}
+
+struct dt_object {
+	struct lu_object			do_lu;
+	const struct dt_object_operations	*do_ops;
+	const struct dt_body_operations		*do_body_ops;
+	const struct dt_index_operations	*do_index_ops;
+};
+
+/*
+ * In-core representation of per-device local object OID storage
+ */
+struct local_oid_storage {
+	/* all initialized llog systems on this node linked by this */
+	struct list_head	los_list;
+
+	/* how many handles reference this los */
+	atomic_t		los_refcount;
+	struct dt_device	*los_dev;
+	struct dt_object	*los_obj;
+
+	/* data used to generate new fids */
+	struct mutex		los_id_lock;
+	__u64			los_seq;
+	__u32			los_last_oid;
+};
+
+static inline struct lu_device *dt2lu_dev(struct dt_device *d)
+{
+	return &d->dd_lu_dev;
+}
+
+static inline struct dt_object *lu2dt(struct lu_object *l)
+{
+	LASSERT(l == NULL || IS_ERR(l) || lu_device_is_dt(l->lo_dev));
+	return container_of_safe(l, struct dt_object, do_lu);
+}
+
+int dt_object_init(struct dt_object *obj,
+		   struct lu_object_header *h, struct lu_device *d);
+
+void dt_object_fini(struct dt_object *obj);
+
+static inline int dt_object_exists(const struct dt_object *dt)
+{
+	return lu_object_exists(&dt->do_lu);
+}
+
+static inline int dt_object_remote(const struct dt_object *dt)
+{
+	return lu_object_remote(&dt->do_lu);
+}
+
+static inline struct dt_object *lu2dt_obj(struct lu_object *o)
+{
+	LASSERT(ergo(o != NULL, lu_device_is_dt(o->lo_dev)));
+	return container_of_safe(o, struct dt_object, do_lu);
+}
+
+static inline struct dt_object *dt_object_child(struct dt_object *o)
+{
+	return container_of(lu_object_next(&(o)->do_lu),
+			    struct dt_object, do_lu);
+}
+
+struct dt_quota_reserve_rec {
+	enum quota_type		qrr_type;
+	union lquota_id		qrr_id;
+	__u64			qrr_count;
+};
+
+/**
+ * This is the general purpose transaction handle.
+ * 1. Transaction Life Cycle
+ * This transaction handle is allocated upon starting a new transaction,
+ * and deallocated after this transaction is committed.
+ * 2. Transaction Nesting
+ * We do _NOT_ support nested transactions. So, every thread should only
+ * have one active transaction, and a transaction only belongs to one
+ * thread. Due to this, a transaction handle needs no reference count.
+ * 3. Transaction & dt_object locking
+ * dt_object locks should be taken inside a transaction.
+ * 4. Transaction & RPC
+ * No RPC request should be issued inside a transaction.
+ */
+struct thandle {
+	/** the dt device on which the transactions are executed */
+	struct dt_device *th_dev;
+
+	/* point to the top thandle, XXX this is a bit hacky right now,
+	 * but normal device trans callback triggered by the bottom
+	 * device (OSP/OSD == sub thandle layer) needs to get the
+	 * top_thandle (see dt_txn_hook_start/stop()), so we put the
+	 * top thandle here for now, will fix it when we have a better
+	 * callback mechanism */
+	struct thandle *th_top;
+
+	/* reserved quota for this handle */
+	struct dt_quota_reserve_rec th_reserved_quota;
+
+	/** the last operation result in this transaction.
+	 * this value is used in recovery */
+	__s32 th_result;
+
+	/** whether we need sync commit */
+	unsigned int th_sync:1,
+	/* local transaction, no need to inform other layers */
+		     th_local:1,
+	/* whether we need to wait for the transaction to be submitted
+	 * (sent to the remote target) */
+		     th_wait_submit:1,
+	/* complex transaction which will track updates on all targets,
+	 * including OSTs */
+		     th_complex:1,
+	/* whether to ignore quota */
+		     th_ignore_quota:1,
+	/* whether to restart the transaction */
+		     th_restart_tran:1;
+};
+
+/**
+ * Transaction call-backs.
+ *
+ * These are invoked by osd (or the underlying transaction engine) when
+ * a transaction changes state.
+ *
+ * Call-backs are used by upper layers to modify transaction parameters and to
+ * perform some actions for each transaction state transition. A typical
+ * example is mdt registering a call-back to write into the last-received file
+ * before each transaction commit.
+ */
+struct dt_txn_callback {
+	int (*dtc_txn_start)(const struct lu_env *env,
+			     struct thandle *txn, void *cookie);
+	int (*dtc_txn_stop)(const struct lu_env *env,
+			    struct thandle *txn, void *cookie);
+	void			*dtc_cookie;
+	__u32			dtc_tag;
+	struct list_head	dtc_linkage;
+};
+
+void dt_txn_callback_add(struct dt_device *dev, struct dt_txn_callback *cb);
+void dt_txn_callback_del(struct dt_device *dev, struct dt_txn_callback *cb);
+
+int dt_txn_hook_start(const struct lu_env *env,
+		      struct dt_device *dev, struct thandle *txn);
+int dt_txn_hook_stop(const struct lu_env *env, struct thandle *txn);
+
+int dt_try_as_dir(const struct lu_env *env, struct dt_object *obj);
+
+/**
+ * Callback function used for parsing path.
+ * \see llo_store_resolve + */ +typedef int (*dt_entry_func_t)(const struct lu_env *env, + const char *name, + void *pvt); + +#define DT_MAX_PATH 1024 + +int dt_path_parser(const struct lu_env *env, + char *local, dt_entry_func_t entry_func, + void *data); + +struct dt_object * +dt_store_resolve(const struct lu_env *env, struct dt_device *dt, + const char *path, struct lu_fid *fid); + +struct dt_object *dt_store_open(const struct lu_env *env, + struct dt_device *dt, + const char *dirname, + const char *filename, + struct lu_fid *fid); + +struct dt_object *dt_find_or_create(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object_format *dof, + struct lu_attr *attr); + +struct dt_object *dt_locate_at(const struct lu_env *env, + struct dt_device *dev, + const struct lu_fid *fid, + struct lu_device *top_dev, + const struct lu_object_conf *conf); + +static inline struct dt_object * +dt_locate(const struct lu_env *env, struct dt_device *dev, + const struct lu_fid *fid) +{ + return dt_locate_at(env, dev, fid, + dev->dd_lu_dev.ld_site->ls_top_dev, NULL); +} + +static inline struct dt_object * +dt_object_locate(struct dt_object *dto, struct dt_device *dt_dev) +{ + struct lu_object *lo; + + list_for_each_entry(lo, &dto->do_lu.lo_header->loh_layers, lo_linkage) { + if (lo->lo_dev == &dt_dev->dd_lu_dev) + return container_of(lo, struct dt_object, do_lu); + } + return NULL; +} + +static inline void dt_object_put(const struct lu_env *env, + struct dt_object *dto) +{ + lu_object_put(env, &dto->do_lu); +} + +static inline void dt_object_put_nocache(const struct lu_env *env, + struct dt_object *dto) +{ + lu_object_put_nocache(env, &dto->do_lu); +} + +int local_oid_storage_init(const struct lu_env *env, struct dt_device *dev, + const struct lu_fid *first_fid, + struct local_oid_storage **los); +void local_oid_storage_fini(const struct lu_env *env, + struct local_oid_storage *los); +int local_object_fid_generate(const struct lu_env *env, + struct local_oid_storage *los, + struct lu_fid *fid); +int local_object_declare_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, + struct lu_attr *attr, + struct dt_object_format *dof, + struct thandle *th); +int local_object_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, + struct lu_attr *attr, struct dt_object_format *dof, + struct thandle *th); +struct dt_object *local_file_find(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name); +struct dt_object *local_file_find_or_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name, __u32 mode); +struct dt_object *local_file_find_or_create_with_fid(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object *parent, + const char *name, + __u32 mode); +struct dt_object * +local_index_find_or_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name, __u32 mode, + const struct dt_index_features *ft); +struct dt_object * +local_index_find_or_create_with_fid(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object *parent, + const char *name, __u32 mode, + const struct dt_index_features *ft); +int local_object_unlink(const struct lu_env *env, struct dt_device *dt, + struct dt_object *parent, const char *name); + +static inline int dt_object_lock(const struct lu_env *env, 
+ struct dt_object *o, struct lustre_handle *lh, + struct ldlm_enqueue_info *einfo, + union ldlm_policy_data *policy) +{ + LASSERT(o != NULL); + LASSERT(o->do_ops != NULL); + LASSERT(o->do_ops->do_object_lock != NULL); + return o->do_ops->do_object_lock(env, o, lh, einfo, policy); +} + +static inline int dt_object_unlock(const struct lu_env *env, + struct dt_object *o, + struct ldlm_enqueue_info *einfo, + union ldlm_policy_data *policy) +{ + LASSERT(o != NULL); + LASSERT(o->do_ops != NULL); + LASSERT(o->do_ops->do_object_unlock != NULL); + return o->do_ops->do_object_unlock(env, o, einfo, policy); +} + +int dt_lookup_dir(const struct lu_env *env, struct dt_object *dir, + const char *name, struct lu_fid *fid); + +static inline int dt_object_sync(const struct lu_env *env, struct dt_object *o, + __u64 start, __u64 end) +{ + LASSERT(o); + LASSERT(o->do_ops); + LASSERT(o->do_ops->do_object_sync); + return o->do_ops->do_object_sync(env, o, start, end); +} + +static inline int dt_fid_alloc(const struct lu_env *env, + struct dt_device *d, + struct lu_fid *fid, + struct lu_object *parent, + const struct lu_name *name) +{ + struct lu_device *l = dt2lu_dev(d); + + return l->ld_ops->ldo_fid_alloc(env, l, fid, parent, name); +} + +int dt_declare_version_set(const struct lu_env *env, struct dt_object *o, + struct thandle *th); +void dt_version_set(const struct lu_env *env, struct dt_object *o, + dt_obj_version_t version, struct thandle *th); +dt_obj_version_t dt_version_get(const struct lu_env *env, struct dt_object *o); + + +int dt_read(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos); +int dt_record_read(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos); +int dt_record_write(const struct lu_env *env, struct dt_object *dt, + const struct lu_buf *buf, loff_t *pos, struct thandle *th); +typedef int (*dt_index_page_build_t)(const struct lu_env *env, + union lu_page *lp, size_t nob, + const struct dt_it_ops *iops, + struct dt_it *it, __u32 attr, void *arg); +int dt_index_walk(const struct lu_env *env, struct dt_object *obj, + const struct lu_rdpg *rdpg, dt_index_page_build_t filler, + void *arg); +int dt_index_read(const struct lu_env *env, struct dt_device *dev, + struct idx_info *ii, const struct lu_rdpg *rdpg); + +static inline struct thandle *dt_trans_create(const struct lu_env *env, + struct dt_device *d) +{ + LASSERT(d->dd_ops->dt_trans_create); + return d->dd_ops->dt_trans_create(env, d); +} + +static inline int dt_trans_start(const struct lu_env *env, + struct dt_device *d, struct thandle *th) +{ + LASSERT(d->dd_ops->dt_trans_start); + return d->dd_ops->dt_trans_start(env, d, th); +} + +/* for this transaction hooks shouldn't be called */ +static inline int dt_trans_start_local(const struct lu_env *env, + struct dt_device *d, struct thandle *th) +{ + LASSERT(d->dd_ops->dt_trans_start); + th->th_local = 1; + return d->dd_ops->dt_trans_start(env, d, th); +} + +static inline int dt_trans_stop(const struct lu_env *env, + struct dt_device *d, struct thandle *th) +{ + LASSERT(d->dd_ops->dt_trans_stop); + return d->dd_ops->dt_trans_stop(env, d, th); +} + +static inline int dt_trans_cb_add(struct thandle *th, + struct dt_txn_commit_cb *dcb) +{ + LASSERT(th->th_dev->dd_ops->dt_trans_cb_add); + dcb->dcb_magic = TRANS_COMMIT_CB_MAGIC; + return th->th_dev->dd_ops->dt_trans_cb_add(th, dcb); +} +/** @} dt */ + + +static inline int dt_declare_record_write(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, + loff_t pos, + 
struct thandle *th)
+{
+	int rc;
+
+	LASSERTF(dt != NULL, "dt is NULL when we want to write record\n");
+	LASSERT(th != NULL);
+	LASSERTF(dt->do_body_ops, DFID" doesn't exist\n",
+		 PFID(lu_object_fid(&dt->do_lu)));
+	LASSERT(dt->do_body_ops->dbo_declare_write);
+	rc = dt->do_body_ops->dbo_declare_write(env, dt, buf, pos, th);
+	return rc;
+}
+
+static inline int dt_declare_create(const struct lu_env *env,
+				    struct dt_object *dt,
+				    struct lu_attr *attr,
+				    struct dt_allocation_hint *hint,
+				    struct dt_object_format *dof,
+				    struct thandle *th)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_declare_create);
+
+	if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_CREATE))
+		return cfs_fail_err;
+
+	return dt->do_ops->do_declare_create(env, dt, attr, hint, dof, th);
+}
+
+static inline int dt_create(const struct lu_env *env,
+			    struct dt_object *dt,
+			    struct lu_attr *attr,
+			    struct dt_allocation_hint *hint,
+			    struct dt_object_format *dof,
+			    struct thandle *th)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_create);
+
+	if (CFS_FAULT_CHECK(OBD_FAIL_DT_CREATE))
+		return cfs_fail_err;
+
+	return dt->do_ops->do_create(env, dt, attr, hint, dof, th);
+}
+
+static inline int dt_declare_destroy(const struct lu_env *env,
+				     struct dt_object *dt,
+				     struct thandle *th)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_declare_destroy);
+
+	if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_DESTROY))
+		return cfs_fail_err;
+
+	return dt->do_ops->do_declare_destroy(env, dt, th);
+}
+
+static inline int dt_destroy(const struct lu_env *env,
+			     struct dt_object *dt,
+			     struct thandle *th)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_destroy);
+
+	if (CFS_FAULT_CHECK(OBD_FAIL_DT_DESTROY))
+		return cfs_fail_err;
+
+	return dt->do_ops->do_destroy(env, dt, th);
+}
+
+static inline void dt_read_lock(const struct lu_env *env,
+				struct dt_object *dt,
+				unsigned role)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_read_lock);
+	dt->do_ops->do_read_lock(env, dt, role);
+}
+
+static inline void dt_write_lock(const struct lu_env *env,
+				 struct dt_object *dt,
+				 unsigned role)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_write_lock);
+	dt->do_ops->do_write_lock(env, dt, role);
+}
+
+static inline void dt_read_unlock(const struct lu_env *env,
+				  struct dt_object *dt)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_read_unlock);
+	dt->do_ops->do_read_unlock(env, dt);
+}
+
+static inline void dt_write_unlock(const struct lu_env *env,
+				   struct dt_object *dt)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_write_unlock);
+	dt->do_ops->do_write_unlock(env, dt);
+}
+
+static inline int dt_write_locked(const struct lu_env *env,
+				  struct dt_object *dt)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_write_locked);
+	return dt->do_ops->do_write_locked(env, dt);
+}
+
+static inline bool dt_object_stale(struct dt_object *dt)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_check_stale);
+
+	return dt->do_ops->do_check_stale(dt);
+}
+
+static inline int dt_declare_attr_get(const struct lu_env *env,
+				      struct dt_object *dt)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_declare_attr_get);
+
+	if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_ATTR_GET))
+		return cfs_fail_err;
+
+	return dt->do_ops->do_declare_attr_get(env, dt);
+}
+
+static inline int dt_attr_get(const struct lu_env *env, struct dt_object *dt,
+			      struct lu_attr *la)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+
LASSERT(dt->do_ops->do_attr_get); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_ATTR_GET)) + return cfs_fail_err; + + return dt->do_ops->do_attr_get(env, dt, la); +} + +static inline int dt_declare_attr_set(const struct lu_env *env, + struct dt_object *dt, + const struct lu_attr *la, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_attr_set); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_ATTR_SET)) + return cfs_fail_err; + + return dt->do_ops->do_declare_attr_set(env, dt, la, th); +} + +static inline int dt_attr_set(const struct lu_env *env, struct dt_object *dt, + const struct lu_attr *la, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_attr_set); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_ATTR_SET)) + return cfs_fail_err; + + return dt->do_ops->do_attr_set(env, dt, la, th); +} + +static inline int dt_declare_ref_add(const struct lu_env *env, + struct dt_object *dt, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_ref_add); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_REF_ADD)) + return cfs_fail_err; + + return dt->do_ops->do_declare_ref_add(env, dt, th); +} + +static inline int dt_ref_add(const struct lu_env *env, + struct dt_object *dt, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_ref_add); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_REF_ADD)) + return cfs_fail_err; + + return dt->do_ops->do_ref_add(env, dt, th); +} + +static inline int dt_declare_ref_del(const struct lu_env *env, + struct dt_object *dt, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_ref_del); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_REF_DEL)) + return cfs_fail_err; + + return dt->do_ops->do_declare_ref_del(env, dt, th); +} + +static inline int dt_ref_del(const struct lu_env *env, + struct dt_object *dt, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_ref_del); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_REF_DEL)) + return cfs_fail_err; + + return dt->do_ops->do_ref_del(env, dt, th); +} + +static inline int dt_bufs_get(const struct lu_env *env, struct dt_object *d, + struct niobuf_remote *rnb, + struct niobuf_local *lnb, int maxlnb, + enum dt_bufs_type rw) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_bufs_get); + return d->do_body_ops->dbo_bufs_get(env, d, rnb->rnb_offset, + rnb->rnb_len, lnb, maxlnb, rw); +} + +static inline int dt_bufs_put(const struct lu_env *env, struct dt_object *d, + struct niobuf_local *lnb, int n) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_bufs_put); + return d->do_body_ops->dbo_bufs_put(env, d, lnb, n); +} + +static inline int dt_write_prep(const struct lu_env *env, struct dt_object *d, + struct niobuf_local *lnb, int n) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_write_prep); + return d->do_body_ops->dbo_write_prep(env, d, lnb, n); +} + +static inline int dt_declare_write_commit(const struct lu_env *env, + struct dt_object *d, + struct niobuf_local *lnb, + int n, struct thandle *th) +{ + LASSERTF(d != NULL, "dt is NULL when we want to declare write\n"); + LASSERT(th != NULL); + return d->do_body_ops->dbo_declare_write_commit(env, d, lnb, n, th); +} + + +static inline int dt_write_commit(const struct lu_env *env, + struct dt_object *d, struct niobuf_local *lnb, + int n, struct thandle *th, __u64 size) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + 
LASSERT(d->do_body_ops->dbo_write_commit); + return d->do_body_ops->dbo_write_commit(env, d, lnb, n, th, size); +} + +static inline int dt_read_prep(const struct lu_env *env, struct dt_object *d, + struct niobuf_local *lnb, int n) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_read_prep); + return d->do_body_ops->dbo_read_prep(env, d, lnb, n); +} + +static inline int dt_declare_write(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, loff_t pos, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_declare_write); + return dt->do_body_ops->dbo_declare_write(env, dt, buf, pos, th); +} + +static inline ssize_t dt_write(const struct lu_env *env, struct dt_object *dt, + const struct lu_buf *buf, loff_t *pos, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_write); + return dt->do_body_ops->dbo_write(env, dt, buf, pos, th); +} + +static inline int dt_declare_punch(const struct lu_env *env, + struct dt_object *dt, __u64 start, + __u64 end, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_declare_punch); + return dt->do_body_ops->dbo_declare_punch(env, dt, start, end, th); +} + +static inline int dt_punch(const struct lu_env *env, struct dt_object *dt, + __u64 start, __u64 end, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_punch); + return dt->do_body_ops->dbo_punch(env, dt, start, end, th); +} + +static inline int dt_ladvise(const struct lu_env *env, struct dt_object *dt, + __u64 start, __u64 end, int advice) +{ + LASSERT(dt); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_ladvise); + return dt->do_body_ops->dbo_ladvise(env, dt, start, end, advice); +} + +static inline int dt_declare_fallocate(const struct lu_env *env, + struct dt_object *dt, __u64 start, + __u64 end, int mode, struct thandle *th) +{ + LASSERT(dt); + if (!dt->do_body_ops) + return -EOPNOTSUPP; + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_declare_fallocate); + return dt->do_body_ops->dbo_declare_fallocate(env, dt, start, end, + mode, th); +} + +static inline int dt_falloc(const struct lu_env *env, struct dt_object *dt, + __u64 start, __u64 end, int mode, + struct thandle *th) +{ + LASSERT(dt); + if (!dt->do_body_ops) + return -EOPNOTSUPP; + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_fallocate); + return dt->do_body_ops->dbo_fallocate(env, dt, start, end, mode, th); +} + +static inline int dt_fiemap_get(const struct lu_env *env, struct dt_object *d, + struct fiemap *fm) +{ + LASSERT(d); + if (d->do_body_ops == NULL) + return -EPROTO; + if (d->do_body_ops->dbo_fiemap_get == NULL) + return -EOPNOTSUPP; + return d->do_body_ops->dbo_fiemap_get(env, d, fm); +} + +static inline loff_t dt_lseek(const struct lu_env *env, struct dt_object *d, + loff_t offset, int whence) +{ + LASSERT(d); + if (d->do_body_ops == NULL) + return -EPROTO; + if (d->do_body_ops->dbo_lseek == NULL) + return -EOPNOTSUPP; + return d->do_body_ops->dbo_lseek(env, d, offset, whence); +} + +static inline int dt_statfs_info(const struct lu_env *env, + struct dt_device *dev, + struct obd_statfs *osfs, + struct obd_statfs_info *info) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_statfs); + return dev->dd_ops->dt_statfs(env, dev, osfs, info); +} + +static inline int dt_statfs(const struct lu_env *env, struct dt_device *dev, + struct obd_statfs *osfs) +{ + return 
dt_statfs_info(env, dev, osfs, NULL); +} + +static inline int dt_root_get(const struct lu_env *env, struct dt_device *dev, + struct lu_fid *f) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_root_get); + return dev->dd_ops->dt_root_get(env, dev, f); +} + +static inline void dt_conf_get(const struct lu_env *env, + const struct dt_device *dev, + struct dt_device_param *param) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_conf_get); + return dev->dd_ops->dt_conf_get(env, dev, param); +} + +static inline struct super_block *dt_mnt_sb_get(const struct dt_device *dev) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + if (dev->dd_ops->dt_mnt_sb_get) + return dev->dd_ops->dt_mnt_sb_get(dev); + + return ERR_PTR(-EOPNOTSUPP); +} + +static inline int dt_sync(const struct lu_env *env, struct dt_device *dev) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_sync); + return dev->dd_ops->dt_sync(env, dev); +} + +static inline int dt_ro(const struct lu_env *env, struct dt_device *dev) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_ro); + return dev->dd_ops->dt_ro(env, dev); +} + +static inline int dt_declare_insert(const struct lu_env *env, + struct dt_object *dt, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_declare_insert); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_INSERT)) + return cfs_fail_err; + + return dt->do_index_ops->dio_declare_insert(env, dt, rec, key, th); +} + +static inline int dt_insert(const struct lu_env *env, + struct dt_object *dt, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_insert); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_INSERT)) + return cfs_fail_err; + + return dt->do_index_ops->dio_insert(env, dt, rec, key, th); +} + +static inline int dt_declare_xattr_del(const struct lu_env *env, + struct dt_object *dt, + const char *name, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_xattr_del); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_XATTR_DEL)) + return cfs_fail_err; + + return dt->do_ops->do_declare_xattr_del(env, dt, name, th); +} + +static inline int dt_xattr_del(const struct lu_env *env, + struct dt_object *dt, const char *name, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_xattr_del); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_XATTR_DEL)) + return cfs_fail_err; + + return dt->do_ops->do_xattr_del(env, dt, name, th); +} + +static inline int dt_declare_xattr_set(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, + const char *name, int fl, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_xattr_set); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_XATTR_SET)) + return cfs_fail_err; + + return dt->do_ops->do_declare_xattr_set(env, dt, buf, name, fl, th); +} + +static inline int dt_xattr_set(const struct lu_env *env, + struct dt_object *dt, const struct lu_buf *buf, + const char *name, int fl, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_xattr_set); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_XATTR_SET)) + return cfs_fail_err; + + return dt->do_ops->do_xattr_set(env, dt, buf, name, fl, th); +} + +static inline int dt_declare_xattr_get(const struct lu_env *env, + struct dt_object *dt, + struct lu_buf *buf, + 
const char *name) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_xattr_get); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_XATTR_GET)) + return cfs_fail_err; + + return dt->do_ops->do_declare_xattr_get(env, dt, buf, name); +} + +static inline int dt_xattr_get(const struct lu_env *env, + struct dt_object *dt, struct lu_buf *buf, + const char *name) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_xattr_get); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_XATTR_GET)) + return cfs_fail_err; + + return dt->do_ops->do_xattr_get(env, dt, buf, name); +} + +static inline int dt_xattr_list(const struct lu_env *env, struct dt_object *dt, + const struct lu_buf *buf) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_xattr_list); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_XATTR_LIST)) + return cfs_fail_err; + + return dt->do_ops->do_xattr_list(env, dt, buf); +} + +static inline int dt_invalidate(const struct lu_env *env, struct dt_object *dt) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_invalidate); + + return dt->do_ops->do_invalidate(env, dt); +} + +static inline int dt_declare_delete(const struct lu_env *env, + struct dt_object *dt, + const struct dt_key *key, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_declare_delete); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_DELETE)) + return cfs_fail_err; + + return dt->do_index_ops->dio_declare_delete(env, dt, key, th); +} + +static inline int dt_delete(const struct lu_env *env, + struct dt_object *dt, + const struct dt_key *key, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_delete); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DELETE)) + return cfs_fail_err; + + return dt->do_index_ops->dio_delete(env, dt, key, th); +} + +static inline int dt_commit_async(const struct lu_env *env, + struct dt_device *dev) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_commit_async); + return dev->dd_ops->dt_commit_async(env, dev); +} + +static inline int dt_reserve_or_free_quota(const struct lu_env *env, + struct dt_device *dev, + enum quota_type type, __u64 uid, + __u64 gid, int count, bool is_md) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_reserve_or_free_quota); + return dev->dd_ops->dt_reserve_or_free_quota(env, dev, type, uid, gid, + count, is_md); +} + +static inline int dt_lookup(const struct lu_env *env, + struct dt_object *dt, + struct dt_rec *rec, + const struct dt_key *key) +{ + int ret; + + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_lookup); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_LOOKUP)) + return cfs_fail_err; + + ret = dt->do_index_ops->dio_lookup(env, dt, rec, key); + if (ret > 0) + ret = 0; + else if (ret == 0) + ret = -ENOENT; + return ret; +} + +static inline int dt_declare_layout_change(const struct lu_env *env, + struct dt_object *o, + struct md_layout_change *mlc, + struct thandle *th) +{ + LASSERT(o); + LASSERT(o->do_ops); + LASSERT(o->do_ops->do_declare_layout_change); + return o->do_ops->do_declare_layout_change(env, o, mlc, th); +} + +static inline int dt_layout_change(const struct lu_env *env, + struct dt_object *o, + struct md_layout_change *mlc, + struct thandle *th) +{ + LASSERT(o); + LASSERT(o->do_ops); + LASSERT(o->do_ops->do_layout_change); + return o->do_ops->do_layout_change(env, o, mlc, th); +} + +struct dt_find_hint { + struct lu_fid *dfh_fid; + struct dt_device *dfh_dt; + struct dt_object *dfh_o; +}; + 
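+/*
+ * Usage sketch for the transaction wrappers above (illustrative only:
+ * the variables env, dev, obj, buf and pos are placeholders, not part
+ * of this header). All updates must be declared between dt_trans_create()
+ * and dt_trans_start() so the backend can reserve journal credits, and
+ * dt_trans_stop() must be called even when an intermediate step fails:
+ *
+ *	struct thandle *th;
+ *	int rc, rc2;
+ *
+ *	th = dt_trans_create(env, dev);
+ *	if (IS_ERR(th))
+ *		return PTR_ERR(th);
+ *	rc = dt_declare_record_write(env, obj, buf, pos, th);
+ *	if (rc == 0)
+ *		rc = dt_trans_start(env, dev, th);
+ *	if (rc == 0)
+ *		rc = dt_record_write(env, obj, buf, &pos, th);
+ *	rc2 = dt_trans_stop(env, dev, th);
+ *	return rc ? rc : rc2;
+ */
+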
+struct dt_insert_rec { + union { + const struct lu_fid *rec_fid; + void *rec_data; + }; + union { + struct { + __u32 rec_type; + __u32 rec_padding; + }; + __u64 rec_misc; + }; +}; + +struct dt_thread_info { + char dti_buf[DT_MAX_PATH]; + struct dt_find_hint dti_dfh; + struct lu_attr dti_attr; + struct lu_fid dti_fid; + struct dt_object_format dti_dof; + struct lustre_mdt_attrs dti_lma; + struct lu_buf dti_lb; + struct lu_object_conf dti_conf; + loff_t dti_off; + struct dt_insert_rec dti_dt_rec; +}; + +extern struct lu_context_key dt_key; + +static inline struct dt_thread_info *dt_info(const struct lu_env *env) +{ + struct dt_thread_info *dti; + + dti = lu_context_key_get(&env->le_ctx, &dt_key); + LASSERT(dti); + return dti; +} + +int dt_global_init(void); +void dt_global_fini(void); +int dt_tunables_init(struct dt_device *dt, struct obd_type *type, + const char *name, struct ldebugfs_vars *list); +int dt_tunables_fini(struct dt_device *dt); + +# ifdef CONFIG_PROC_FS +int lprocfs_dt_blksize_seq_show(struct seq_file *m, void *v); +int lprocfs_dt_kbytestotal_seq_show(struct seq_file *m, void *v); +int lprocfs_dt_kbytesfree_seq_show(struct seq_file *m, void *v); +int lprocfs_dt_kbytesavail_seq_show(struct seq_file *m, void *v); +int lprocfs_dt_filestotal_seq_show(struct seq_file *m, void *v); +int lprocfs_dt_filesfree_seq_show(struct seq_file *m, void *v); +# endif /* CONFIG_PROC_FS */ + +#endif /* __LUSTRE_DT_OBJECT_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/interval_tree.h b/drivers/staging/lustrefsx/lustre/include/interval_tree.h new file mode 100644 index 0000000000000..9d6f3031b4293 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/interval_tree.h @@ -0,0 +1,130 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. 
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lustre/include/interval_tree.h
+ *
+ * Author: Huang Wei
+ * Author: Jay Xiong
+ */
+
+#ifndef _INTERVAL_H__
+#define _INTERVAL_H__
+
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+struct interval_node {
+	struct interval_node	*in_left;
+	struct interval_node	*in_right;
+	struct interval_node	*in_parent;
+	unsigned		in_color:1,
+				in_intree:1, /** set if the node is in the tree */
+				in_res1:30;
+	__u8			in_res2[4];  /** tags, 8-byte aligned */
+	__u64			in_max_high;
+	struct interval_node_extent {
+		__u64 start;
+		__u64 end;
+	} in_extent;
+};
+
+enum interval_iter {
+	INTERVAL_ITER_CONT	= 1,
+	INTERVAL_ITER_STOP	= 2
+};
+
+static inline int interval_is_intree(struct interval_node *node)
+{
+	return node->in_intree == 1;
+}
+
+static inline __u64 interval_low(struct interval_node *node)
+{
+	return node->in_extent.start;
+}
+
+static inline __u64 interval_high(struct interval_node *node)
+{
+	return node->in_extent.end;
+}
+
+static inline int interval_set(struct interval_node *node,
+			       __u64 start, __u64 end)
+{
+	if (start > end)
+		return -ERANGE;
+	node->in_extent.start = start;
+	node->in_extent.end = end;
+	node->in_max_high = end;
+	return 0;
+}
+
+static inline void interval_init(struct interval_node *node)
+{
+	memset(node, 0, sizeof(*node));
+}
+
+int node_equal(struct interval_node *n1, struct interval_node *n2);
+
+/* Rules to write an interval callback.
+ * - the callback returns INTERVAL_ITER_STOP when it thinks the iteration
+ *   should be stopped. It will then cause the iteration function to return
+ *   immediately with return value INTERVAL_ITER_STOP.
+ * - callbacks for interval_iterate and interval_iterate_reverse: every
+ *   node in the tree will be set to @node before the callback is called
+ * - callback for interval_search: only overlapping nodes will be set to
+ *   @node before the callback is called.
+ */
+typedef enum interval_iter (*interval_callback_t)(struct interval_node *node,
+						  void *args);
+
+struct interval_node *interval_insert(struct interval_node *node,
+				      struct interval_node **root);
+void interval_erase(struct interval_node *node, struct interval_node **root);
+
+/* Search the extents in the tree and call @func for each overlapping
+ * extent. */
+enum interval_iter interval_search(struct interval_node *root,
+				   struct interval_node_extent *ex,
+				   interval_callback_t func, void *data);
+
+/* Iterate every node in the tree - by reverse order or regular order. */
+enum interval_iter interval_iterate(struct interval_node *root,
+				    interval_callback_t func, void *data);
+enum interval_iter interval_iterate_reverse(struct interval_node *root,
+					    interval_callback_t func,
+					    void *data);
+
+void interval_expand(struct interval_node *root,
+		     struct interval_node_extent *ext,
+		     struct interval_node_extent *limiter);
+int interval_is_overlapped(struct interval_node *root,
+			   struct interval_node_extent *ex);
+struct interval_node *interval_find(struct interval_node *root,
+				    struct interval_node_extent *ex);
+#endif
diff --git a/drivers/staging/lustrefsx/lustre/include/llog_swab.h b/drivers/staging/lustrefsx/lustre/include/llog_swab.h
new file mode 100644
index 0000000000000..cf48167ff8042
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/include/llog_swab.h
@@ -0,0 +1,70 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2014, Intel Corporation.
+ *
+ * Copyright 2015 Cray Inc, all rights reserved.
+ * Author: Ben Evans.
+ *
+ * We assume all nodes are either little-endian or big-endian, and we
+ * always send messages in the sender's native format. The receiver
+ * detects the message format by checking the 'magic' field of the message
+ * (see lustre_msg_swabbed() below).
+ *
+ * Each type has corresponding 'lustre_swab_xxxtypexxx()' routines, which
+ * are implemented in ptlrpc/pack_generic.c. These 'swabbers' convert the
+ * type from "other" endian, in-place in the message buffer.
+ *
+ * A swabber takes a single pointer argument. The caller must already have
+ * verified that the length of the message buffer >= sizeof(type).
+ *
+ * For variable length types, a second 'lustre_swab_v_xxxtypexxx()' routine
+ * may be defined that swabs just the variable part, after the caller has
+ * verified that the message buffer is large enough.
+ */
+
+#ifndef _LLOG_SWAB_H_
+#define _LLOG_SWAB_H_
+
+#include <uapi/linux/lustre/lustre_idl.h>
+struct lustre_cfg;
+
+void lustre_swab_lu_fid(struct lu_fid *fid);
+void lustre_swab_ost_id(struct ost_id *oid);
+void lustre_swab_ll_fid(struct ll_fid *fid);
+void lustre_swab_llogd_body(struct llogd_body *d);
+void lustre_swab_llog_hdr(struct llog_log_hdr *h);
+void lustre_swab_llogd_conn_body(struct llogd_conn_body *d);
+void lustre_swab_llog_rec(struct llog_rec_hdr *rec);
+void lustre_swab_llog_id(struct llog_logid *lid);
+void lustre_swab_lu_seq_range(struct lu_seq_range *range);
+#ifdef HAVE_SERVER_SUPPORT
+void lustre_swab_update_ops(struct update_ops *uops, unsigned int op_count);
+#endif
+void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg);
+void lustre_swab_cfg_marker(struct cfg_marker *marker,
+			    int swab, int size);
+
+#endif
diff --git a/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h b/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h
new file mode 100644
index 0000000000000..d5fc7da4fbda2
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h
@@ -0,0 +1,1124 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lustre/include/lprocfs_status.h
+ *
+ * Top level header file for LProc
+ *
+ * Author: Hariharan Thantry <thantry@users.sourceforge.net>
+ */
+#ifndef _LPROCFS_STATUS_H
+#define _LPROCFS_STATUS_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+/*
+ * Linux 5.6 introduced proc_ops with v5.5-8862-gd56c0d45f0e2.
+ * Now that proc and debugfs use separate operation vector types,
+ * separate containers are also needed.
+ */
+struct lprocfs_vars {
+	const char		*name;
+	const struct proc_ops	*fops;
+	void			*data;
+	/** /proc file mode. */
+	mode_t			proc_mode;
+};
+
+/** Provide a debugfs container */
+struct ldebugfs_vars {
+	const char			*name;
+	const struct file_operations	*fops;
+	void				*data;
+	/** debugfs file mode. */
+	mode_t				proc_mode;
+};
+
+static inline unsigned int pct(unsigned long a, unsigned long b)
+{
+	return b ? a * 100 / b : 0;
+}
+
+#define PAGES_TO_MiB(pages)	((pages) >> (20 - PAGE_SHIFT))
+#define MiB_TO_PAGES(mb)	((mb) << (20 - PAGE_SHIFT))
+
+/**
+ * Append a space-separated list of currently set flags to str.
+ */
+#define flag2str(port, flag)						\
+	do {								\
+		if ((port)->port##_##flag) {				\
+			seq_printf(m, "%s" #flag, first ? "" : ", ");	\
+			first = false;					\
+		}							\
+	} while (0)
+
+void obd_connect_seq_flags2str(struct seq_file *m, __u64 flags, __u64 flags2,
+			       const char *sep);
+void obd_connect_data_seqprint(struct seq_file *m,
+			       struct obd_connect_data *ocd);
+
+/* if we find more consumers this could be generalized */
+#define OBD_HIST_MAX 32
+struct obd_histogram {
+	spinlock_t	oh_lock;
+	unsigned long	oh_buckets[OBD_HIST_MAX];
+};
+
+struct obd_hist_pcpu {
+	struct percpu_counter	oh_pc_buckets[OBD_HIST_MAX];
+	bool			oh_initialized;
+};
+
+enum {
+	RENAME_SAMEDIR_SIZE = 0,
+	RENAME_CROSSDIR_SRC_SIZE,
+	RENAME_CROSSDIR_TGT_SIZE,
+	RENAME_LAST,
+};
+
+struct rename_stats {
+	ktime_t			rs_init;
+	struct obd_histogram	rs_hist[RENAME_LAST];
+};
+
+/* An lprocfs counter can be configured using the enum bit masks below.
+ *
+ * LPROCFS_CNTR_EXTERNALLOCK indicates that an external lock already
+ * protects this counter from concurrent updates. If not specified,
+ * lprocfs uses an internal per-counter lock variable. External locks
+ * are not used to protect counter increments, but are used to protect
+ * counter readout and resets.
+ *
+ * LPROCFS_CNTR_AVGMINMAX indicates multi-valued counter samples
+ * (i.e. the counter can be incremented by more than "1"). When specified,
+ * the counter maintains min, max and sum in addition to a simple
+ * invocation count. This allows averages to be computed.
+ * If not specified, the counter is an increment-by-1 counter;
+ * min, max, sum, etc. are not maintained.
+ *
+ * LPROCFS_CNTR_STDDEV indicates that the counter should track sum of
+ * squares (for multi-valued counter samples only). This allows
+ * external computation of standard deviation, but involves a 64-bit
+ * multiply per counter increment.
+ */ + +enum { + LPROCFS_CNTR_EXTERNALLOCK = 0x0001, + LPROCFS_CNTR_AVGMINMAX = 0x0002, + LPROCFS_CNTR_STDDEV = 0x0004, + + /* counter data type */ + LPROCFS_TYPE_REQS = 0x0100, + LPROCFS_TYPE_BYTES = 0x0200, + LPROCFS_TYPE_PAGES = 0x0400, + LPROCFS_TYPE_USEC = 0x0800, + + LPROCFS_TYPE_LATENCY = LPROCFS_TYPE_USEC | + LPROCFS_CNTR_AVGMINMAX | + LPROCFS_CNTR_STDDEV, + LPROCFS_TYPE_BYTES_FULL = LPROCFS_TYPE_BYTES | + LPROCFS_CNTR_AVGMINMAX | + LPROCFS_CNTR_STDDEV, +}; +#define LC_MIN_INIT ((~(__u64)0) >> 1) + +struct lprocfs_counter_header { + unsigned int lc_config; + const char *lc_name; /* must be static */ + const char *lc_units; /* must be static */ +}; + +struct lprocfs_counter { + __s64 lc_count; + __s64 lc_min; + __s64 lc_max; + __s64 lc_sumsquare; + /* + * Every counter has lc_array_sum[0], while lc_array_sum[1] is only + * for irq context counter, i.e. stats with + * LPROCFS_STATS_FLAG_IRQ_SAFE flag, its counter need + * lc_array_sum[1] + */ + __s64 lc_array_sum[1]; +}; +#define lc_sum lc_array_sum[0] +#define lc_sum_irq lc_array_sum[1] + +struct lprocfs_percpu { + struct lprocfs_counter lp_cntr[0]; +}; + +enum lprocfs_stats_lock_ops { + LPROCFS_GET_NUM_CPU = 0x0001, /* number allocated per-CPU stats */ + LPROCFS_GET_SMP_ID = 0x0002, /* current stat to be updated */ +}; + +enum lprocfs_stats_flags { + LPROCFS_STATS_FLAG_NONE = 0x0000, /* per cpu counter */ + LPROCFS_STATS_FLAG_NOPERCPU = 0x0001, /* stats have no percpu + * area and need locking */ + LPROCFS_STATS_FLAG_IRQ_SAFE = 0x0002, /* alloc need irq safe */ +}; + +enum lprocfs_fields_flags { + LPROCFS_FIELDS_FLAGS_CONFIG = 0x0001, + LPROCFS_FIELDS_FLAGS_SUM = 0x0002, + LPROCFS_FIELDS_FLAGS_MIN = 0x0003, + LPROCFS_FIELDS_FLAGS_MAX = 0x0004, + LPROCFS_FIELDS_FLAGS_AVG = 0x0005, + LPROCFS_FIELDS_FLAGS_SUMSQUARE = 0x0006, + LPROCFS_FIELDS_FLAGS_COUNT = 0x0007, +}; + +struct lprocfs_stats { + /* # of counters */ + unsigned short ls_num; + /* 1 + the biggest cpu # whose ls_percpu slot has been allocated */ + unsigned short ls_biggest_alloc_num; + enum lprocfs_stats_flags ls_flags; + ktime_t ls_init; + /* Lock used when there are no percpu stats areas; For percpu stats, + * it is used to protect ls_biggest_alloc_num change */ + spinlock_t ls_lock; + + /* has ls_num of counter headers */ + struct lprocfs_counter_header *ls_cnt_header; + struct lprocfs_percpu *ls_percpu[0]; +}; + +#define OPC_RANGE(seg) (seg ## _LAST_OPC - seg ## _FIRST_OPC) + +/* Pack all opcodes down into a single monotonically increasing index */ +static inline int opcode_offset(__u32 opc) { + if (opc < OST_LAST_OPC) { + /* OST opcode */ + return (opc - OST_FIRST_OPC); + } else if (opc < MDS_LAST_OPC) { + /* MDS opcode */ + return (opc - MDS_FIRST_OPC + + OPC_RANGE(OST)); + } else if (opc < LDLM_LAST_OPC) { + /* LDLM Opcode */ + return (opc - LDLM_FIRST_OPC + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < MGS_LAST_OPC) { + /* MGS Opcode */ + return (opc - MGS_FIRST_OPC + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < OBD_LAST_OPC) { + /* OBD Ping */ + return (opc - OBD_FIRST_OPC + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < LLOG_LAST_OPC) { + /* LLOG Opcode */ + return (opc - LLOG_FIRST_OPC + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < QUOTA_LAST_OPC) { + /* LQUOTA Opcode */ + return (opc - QUOTA_FIRST_OPC + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + 
OPC_RANGE(OST)); + } else if (opc < SEQ_LAST_OPC) { + /* SEQ opcode */ + return (opc - SEQ_FIRST_OPC + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < SEC_LAST_OPC) { + /* SEC opcode */ + return (opc - SEC_FIRST_OPC + + OPC_RANGE(SEQ) + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < FLD_LAST_OPC) { + /* FLD opcode */ + return (opc - FLD_FIRST_OPC + + OPC_RANGE(SEC) + + OPC_RANGE(SEQ) + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); +#ifdef HAVE_SERVER_SUPPORT + } else if (opc < OUT_UPDATE_LAST_OPC) { + /* update opcode */ + return (opc - OUT_UPDATE_FIRST_OPC + + OPC_RANGE(FLD) + + OPC_RANGE(SEC) + + OPC_RANGE(SEQ) + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < LFSCK_LAST_OPC) { + /* LFSCK opcode */ + return (opc - LFSCK_FIRST_OPC + + OPC_RANGE(OUT_UPDATE) + + OPC_RANGE(FLD) + + OPC_RANGE(SEC) + + OPC_RANGE(SEQ) + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); +#endif /* HAVE_SERVER_SUPPORT */ + } else { + /* Unknown Opcode */ + return -1; + } +} + +#define LUSTRE_MAX_OPCODES_CLIENT (OPC_RANGE(OST) + \ + OPC_RANGE(MDS) + \ + OPC_RANGE(LDLM) + \ + OPC_RANGE(MGS) + \ + OPC_RANGE(OBD) + \ + OPC_RANGE(LLOG) + \ + OPC_RANGE(SEC) + \ + OPC_RANGE(SEQ) + \ + OPC_RANGE(SEC) + \ + OPC_RANGE(FLD)) + +#ifdef HAVE_SERVER_SUPPORT +#define LUSTRE_MAX_OPCODES (LUSTRE_MAX_OPCODES_CLIENT + \ + OPC_RANGE(OUT_UPDATE) + \ + OPC_RANGE(LFSCK)) +#else +#define LUSTRE_MAX_OPCODES LUSTRE_MAX_OPCODES_CLIENT +#endif + +#define EXTRA_MAX_OPCODES ((PTLRPC_LAST_CNTR - PTLRPC_FIRST_CNTR) + \ + OPC_RANGE(EXTRA)) + +enum { + PTLRPC_REQWAIT_CNTR = 0, + PTLRPC_REQQDEPTH_CNTR, + PTLRPC_REQACTIVE_CNTR, + PTLRPC_TIMEOUT, + PTLRPC_REQBUF_AVAIL_CNTR, + PTLRPC_LAST_CNTR +}; + +#define PTLRPC_FIRST_CNTR PTLRPC_REQWAIT_CNTR + +enum lprocfs_extra_opc { + LDLM_GLIMPSE_ENQUEUE = 0, + LDLM_PLAIN_ENQUEUE, + LDLM_EXTENT_ENQUEUE, + LDLM_FLOCK_ENQUEUE, + LDLM_IBITS_ENQUEUE, + MDS_REINT_SETATTR, + MDS_REINT_CREATE, + MDS_REINT_LINK, + MDS_REINT_UNLINK, + MDS_REINT_RENAME, + MDS_REINT_OPEN, + MDS_REINT_SETXATTR, + MDS_REINT_RESYNC, + BRW_READ_BYTES, + BRW_WRITE_BYTES, + EXTRA_LAST_OPC +}; + +#ifdef HAVE_SERVER_SUPPORT +enum brw_rw_stats { + BRW_R_PAGES = 0, + BRW_W_PAGES, + BRW_R_DISCONT_PAGES, + BRW_W_DISCONT_PAGES, + BRW_R_DISCONT_BLOCKS, + BRW_W_DISCONT_BLOCKS, + BRW_R_DIO_FRAGS, + BRW_W_DIO_FRAGS, + BRW_R_RPC_HIST, + BRW_W_RPC_HIST, + BRW_R_IO_TIME, + BRW_W_IO_TIME, + BRW_R_DISK_IOSIZE, + BRW_W_DISK_IOSIZE, + BRW_RW_STATS_NUM, +}; + +struct brw_stats_props { + const char *bsp_name; + const char *bsp_units; + bool bsp_scale; +}; + +struct brw_stats { + ktime_t bs_init; + struct obd_hist_pcpu bs_hist[BRW_RW_STATS_NUM]; + struct brw_stats_props bs_props[BRW_RW_STATS_NUM / 2]; +}; + +int lprocfs_init_brw_stats(struct brw_stats *brw_stats); +void lprocfs_fini_brw_stats(struct brw_stats *brw_stats); + +void ldebugfs_register_osd_stats(struct dentry *parent, + struct brw_stats *brw_stats, + struct lprocfs_stats *stats); +#endif /* HAVE_SERVER_SUPPORT */ + +#define EXTRA_FIRST_OPC LDLM_GLIMPSE_ENQUEUE +/* class_obd.c */ +extern struct proc_dir_entry 
*proc_lustre_root;
+extern struct dentry *debugfs_lustre_root;
+extern struct kset *lustre_kset;
+
+struct obd_device;
+
+#define JOBSTATS_JOBID_VAR_MAX_LEN      20
+#define JOBSTATS_DISABLE                "disable"
+#define JOBSTATS_PROCNAME_UID           "procname_uid"
+#define JOBSTATS_NODELOCAL              "nodelocal"
+#define JOBSTATS_SESSION                "session"
+
+typedef void (*cntr_init_callback)(struct lprocfs_stats *stats,
+                                   unsigned int offset);
+
+struct obd_job_stats {
+        struct cfs_hash *ojs_hash;       /* hash of jobids */
+        struct list_head ojs_list;       /* list of job_stat structs */
+        rwlock_t ojs_lock;               /* protect ojs_list/js_list */
+        ktime_t ojs_cleanup_interval;    /* 1/2 expiry seconds */
+        ktime_t ojs_cleanup_last;        /* previous cleanup time */
+        cntr_init_callback ojs_cntr_init_fn; /* lprocfs_stats initializer */
+        unsigned short ojs_cntr_num;     /* number of stats in struct */
+        bool ojs_cleaning;               /* currently expiring stats */
+};
+
+#ifdef CONFIG_PROC_FS
+
+int lprocfs_stats_alloc_one(struct lprocfs_stats *stats,
+                            unsigned int cpuid);
+int lprocfs_stats_lock(struct lprocfs_stats *stats,
+                       enum lprocfs_stats_lock_ops opc,
+                       unsigned long *flags);
+void lprocfs_stats_unlock(struct lprocfs_stats *stats,
+                          enum lprocfs_stats_lock_ops opc,
+                          unsigned long *flags);
+
+static inline unsigned int
+lprocfs_stats_counter_size(struct lprocfs_stats *stats)
+{
+        unsigned int percpusize;
+
+        percpusize = offsetof(struct lprocfs_percpu, lp_cntr[stats->ls_num]);
+
+        /* irq safe stats need lc_array_sum[1] */
+        if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0)
+                percpusize += stats->ls_num * sizeof(__s64);
+
+        if ((stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) == 0)
+                percpusize = L1_CACHE_ALIGN(percpusize);
+
+        return percpusize;
+}
+
+static inline struct lprocfs_counter *
+lprocfs_stats_counter_get(struct lprocfs_stats *stats, unsigned int cpuid,
+                          int index)
+{
+        struct lprocfs_counter *cntr;
+
+        cntr = &stats->ls_percpu[cpuid]->lp_cntr[index];
+
+        if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0)
+                cntr = (void *)cntr + index * sizeof(__s64);
+
+        return cntr;
+}
+
+/* Two optimized LPROCFS counter increment functions are provided:
+ * lprocfs_counter_incr(stats, idx) - optimized for by-one counters
+ * lprocfs_counter_add(stats, idx, amount) - used for multi-valued counters
+ * Counter data layout allows config flag, counter lock and the
+ * count itself to reside within a single cache line.
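+ *
+ * Illustrative use (a sketch; "stats" and "idx" are assumed to have
+ * been set up via lprocfs_alloc_stats() and lprocfs_counter_init()):
+ *
+ *      lprocfs_counter_incr(stats, idx);         - one more event
+ *      lprocfs_counter_add(stats, idx, nbytes);  - multi-valued sample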
+ */ + +extern void lprocfs_counter_add(struct lprocfs_stats *stats, int idx, + long amount); +extern void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx, + long amount); + +#define lprocfs_counter_incr(stats, idx) \ + lprocfs_counter_add(stats, idx, 1) +#define lprocfs_counter_decr(stats, idx) \ + lprocfs_counter_sub(stats, idx, 1) + +extern __s64 lprocfs_read_helper(struct lprocfs_counter *lc, + struct lprocfs_counter_header *header, + enum lprocfs_stats_flags flags, + enum lprocfs_fields_flags field); +u64 lprocfs_stats_collector(struct lprocfs_stats *stats, int idx, + enum lprocfs_fields_flags field); + +extern struct lprocfs_stats * +lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags); +extern void lprocfs_clear_stats(struct lprocfs_stats *stats); +extern void lprocfs_free_stats(struct lprocfs_stats **stats); +extern void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats); +extern int lprocfs_alloc_obd_stats(struct obd_device *obd, + unsigned int num_stats); +extern int lprocfs_alloc_md_stats(struct obd_device *obd, + unsigned int num_private_stats); +extern void lprocfs_counter_init(struct lprocfs_stats *stats, int index, + unsigned conf, const char *name, + const char *units); +extern void lprocfs_free_obd_stats(struct obd_device *obd); +extern void lprocfs_free_md_stats(struct obd_device *obd); +struct obd_export; +struct nid_stat; +extern int lprocfs_add_clear_entry(struct obd_device *obd, + struct proc_dir_entry *entry); +#ifdef HAVE_SERVER_SUPPORT +extern int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *peer_nid); +extern int lprocfs_exp_cleanup(struct obd_export *exp); +struct dentry *ldebugfs_add_symlink(const char *name, const char *target, + const char *format, ...); +#else +static inline int lprocfs_exp_cleanup(struct obd_export *exp) +{ return 0; } +#endif +extern struct proc_dir_entry * +lprocfs_add_simple(struct proc_dir_entry *root, char *name, + void *data, const struct proc_ops *ops); +extern struct proc_dir_entry * +lprocfs_add_symlink(const char *name, struct proc_dir_entry *parent, + const char *format, ...); +extern void lprocfs_free_per_client_stats(struct obd_device *obd); +#ifdef HAVE_SERVER_SUPPORT +extern ssize_t +lprocfs_nid_stats_clear_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off); +extern int lprocfs_nid_stats_clear_seq_show(struct seq_file *file, void *data); +#endif +extern int lprocfs_register_stats(struct proc_dir_entry *root, const char *name, + struct lprocfs_stats *stats); +extern const struct file_operations ldebugfs_stats_seq_fops; + +/* lprocfs_status.c */ +extern void ldebugfs_add_vars(struct dentry *parent, struct ldebugfs_vars *var, + void *data); +extern struct dentry *ldebugfs_register(const char *name, + struct dentry *parent, + struct ldebugfs_vars *list, + void *data); +extern int lprocfs_add_vars(struct proc_dir_entry *root, + struct lprocfs_vars *var, void *data); + +extern struct proc_dir_entry * +lprocfs_register(const char *name, struct proc_dir_entry *parent, + struct lprocfs_vars *list, void *data); + +extern void lprocfs_remove(struct proc_dir_entry **root); +extern void lprocfs_remove_proc_entry(const char *name, + struct proc_dir_entry *parent); +extern int lprocfs_obd_setup(struct obd_device *obd, bool uuid_only); +extern int lprocfs_obd_cleanup(struct obd_device *obd); + +extern int lprocfs_seq_create(struct proc_dir_entry *parent, const char *name, + mode_t mode, const struct proc_ops *seq_fops, + void *data); +extern int 
lprocfs_obd_seq_create(struct obd_device *obd, const char *name, + mode_t mode, const struct proc_ops *seq_fops, + void *data); +extern void lprocfs_stats_header(struct seq_file *seq, ktime_t now, + ktime_t ts_init, int width, const char *colon, + bool show_units, const char *prefix); + +/* Generic callbacks */ +extern int lprocfs_uuid_seq_show(struct seq_file *m, void *data); +extern int lprocfs_server_uuid_seq_show(struct seq_file *m, void *data); +ssize_t conn_uuid_show(struct kobject *kobj, struct attribute *attr, char *buf); +extern int lprocfs_import_seq_show(struct seq_file *m, void *data); +extern int lprocfs_state_seq_show(struct seq_file *m, void *data); +extern int lprocfs_connect_flags_seq_show(struct seq_file *m, void *data); +#ifdef HAVE_SERVER_SUPPORT +ssize_t num_exports_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t grant_check_threshold_show(struct kobject *kobj, + struct attribute *attr, char *buf); +ssize_t grant_check_threshold_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count); +#endif +struct adaptive_timeout; +extern int lprocfs_at_hist_helper(struct seq_file *m, + struct adaptive_timeout *at); +extern int lprocfs_timeouts_seq_show(struct seq_file *m, void *data); +extern ssize_t +lprocfs_timeouts_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off); +#ifdef HAVE_SERVER_SUPPORT +extern ssize_t +lprocfs_evict_client_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off); +#endif +ssize_t ping_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count); +ssize_t ping_show(struct kobject *kobj, struct attribute *attr, + char *buffer); + +extern ssize_t +ldebugfs_import_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off); +static inline ssize_t +lprocfs_import_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + return ldebugfs_import_seq_write(file, buffer, count, off); +} + +extern int lprocfs_pinger_recov_seq_show(struct seq_file *m, void *data); +extern ssize_t +lprocfs_pinger_recov_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off); + +int string_to_size(u64 *size, const char *buffer, size_t count); +int sysfs_memparse(const char *buffer, size_t count, u64 *val, + const char *defunit); +char *lprocfs_strnstr(const char *s1, const char *s2, size_t len); +char *lprocfs_find_named_value(const char *buffer, const char *name, + size_t *count); +void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value); +void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value); +void lprocfs_oh_clear(struct obd_histogram *oh); +unsigned long lprocfs_oh_sum(struct obd_histogram *oh); + +void lprocfs_oh_tally_pcpu(struct obd_hist_pcpu *oh, unsigned int value); +void lprocfs_oh_tally_log2_pcpu(struct obd_hist_pcpu *oh, unsigned int value); +int lprocfs_oh_alloc_pcpu(struct obd_hist_pcpu *oh); +void lprocfs_oh_clear_pcpu(struct obd_hist_pcpu *oh); +void lprocfs_oh_release_pcpu(struct obd_hist_pcpu *oh); +unsigned long lprocfs_oh_sum_pcpu(struct obd_hist_pcpu *oh); +unsigned long lprocfs_oh_counter_pcpu(struct obd_hist_pcpu *oh, + unsigned int value); + +void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, + struct lprocfs_counter *cnt); + +#ifdef HAVE_SERVER_SUPPORT +/* lprocfs_status.c: recovery status */ +int lprocfs_recovery_status_seq_show(struct seq_file *m, void *data); + +/* lprocfs_status.c: 
hash statistics */
+int lprocfs_hash_seq_show(struct seq_file *m, void *data);
+
+/* lprocfs_status.c: IR factor */
+ssize_t ir_factor_show(struct kobject *kobj, struct attribute *attr,
+                       char *buf);
+ssize_t ir_factor_store(struct kobject *kobj, struct attribute *attr,
+                        const char *buffer, size_t count);
+#endif
+
+/* lprocfs_status.c: dump pages on cksum error */
+int lprocfs_checksum_dump_seq_show(struct seq_file *m, void *data);
+ssize_t
+lprocfs_checksum_dump_seq_write(struct file *file, const char __user *buffer,
+                                size_t count, loff_t *off);
+
+extern int lprocfs_single_release(struct inode *, struct file *);
+extern int lprocfs_seq_release(struct inode *, struct file *);
+
+/* You must use these macros when you want to refer to
+ * the import in a client obd_device for an lprocfs entry.
+ * Note that it is not safe to 'goto', 'return' or 'break'
+ * out of the body of this statement. It *IS* safe to
+ * 'goto' a label inside the statement, or to 'continue'
+ * to get out of the statement.
+ */
+
+#define with_imp_locked_nested(__obd, __imp, __rc, __nest)              \
+        for (down_read_nested(&(__obd)->u.cli.cl_sem, __nest),          \
+             __imp = (__obd)->u.cli.cl_import,                          \
+             __rc = __imp ? 0 : -ENODEV;                                \
+             __imp ? 1 : (up_read(&(__obd)->u.cli.cl_sem), 0);          \
+             __imp = NULL)
+
+#define with_imp_locked(__obd, __imp, __rc)                             \
+        with_imp_locked_nested(__obd, __imp, __rc, 0)
+
+/* write the name##_seq_show function, call LDEBUGFS_SEQ_FOPS_RO for read-only
+ * debugfs entries; otherwise, you will define name##_seq_write function also
+ * for a read-write debugfs entry, and then call LDEBUGFS_SEQ_FOPS instead.
+ * Finally, call debugfs_create_file(filename, 0444, obd, data, &name##_fops);
+ */
+#define __LDEBUGFS_SEQ_FOPS(name, custom_seq_write)                     \
+static int name##_single_open(struct inode *inode, struct file *file)  \
+{                                                                       \
+        return single_open(file, name##_seq_show, inode->i_private);   \
+}                                                                       \
+static const struct file_operations name##_fops = {                    \
+        .owner   = THIS_MODULE,                                         \
+        .open    = name##_single_open,                                  \
+        .read    = seq_read,                                            \
+        .write   = custom_seq_write,                                    \
+        .llseek  = seq_lseek,                                           \
+        .release = single_release,                                      \
+}
+
+#define LDEBUGFS_SEQ_FOPS_RO(name) __LDEBUGFS_SEQ_FOPS(name, NULL)
+#define LDEBUGFS_SEQ_FOPS(name)    __LDEBUGFS_SEQ_FOPS(name,            \
+                                                       name##_seq_write)
+
+#define LDEBUGFS_SEQ_FOPS_RO_TYPE(name, type)                           \
+        static int name##_##type##_seq_show(struct seq_file *m, void *v)\
+        {                                                               \
+                if (!m->private)                                        \
+                        return -ENODEV;                                 \
+                return lprocfs_##type##_seq_show(m, m->private);        \
+        }                                                               \
+        LDEBUGFS_SEQ_FOPS_RO(name##_##type)
+
+#define LDEBUGFS_SEQ_FOPS_RW_TYPE(name, type)                           \
+        static int name##_##type##_seq_show(struct seq_file *m, void *v)\
+        {                                                               \
+                if (!m->private)                                        \
+                        return -ENODEV;                                 \
+                return lprocfs_##type##_seq_show(m, m->private);        \
+        }                                                               \
+        static ssize_t name##_##type##_seq_write(struct file *file,    \
+                        const char __user *buffer, size_t count,        \
+                        loff_t *off)                                    \
+        {                                                               \
+                struct seq_file *seq = file->private_data;              \
+                                                                        \
+                if (!seq->private)                                      \
+                        return -ENODEV;                                 \
+                return ldebugfs_##type##_seq_write(file, buffer, count, \
+                                                   seq->private);       \
+        }                                                               \
+        LDEBUGFS_SEQ_FOPS(name##_##type);
+
+#define LDEBUGFS_FOPS_WR_ONLY(name, type)                               \
+        static ssize_t name##_##type##_write(struct file *file,        \
+                        const char __user *buffer, size_t count,        \
+                        loff_t *off)                                    \
+        {                                                               \
+                return ldebugfs_##type##_seq_write(file, buffer, count, \
+                                                   off);                \
+        }                                                               \
+        static int name##_##type##_open(struct inode *inode,            \
+                                        struct file *file)              \
+        {                                                               \
+                return single_open(file, NULL, inode->i_private);       \
+        }                                                               \
+        static const struct file_operations name##_##type##_fops = {   \
+                .open    = name##_##type##_open,                        \
+                .write   = name##_##type##_write,                       \
+                .release = single_release,                              \
+        };
+
+/* write the name##_seq_show function, call LPROC_SEQ_FOPS_RO for read-only
+ * proc entries; otherwise, you will define name##_seq_write function also for
+ * a read-write proc entry, and then call LPROC_SEQ_FOPS instead. Finally,
+ * call lprocfs_obd_seq_create(obd, filename, 0444, &name##_fops, data);
+ */
+#define __LPROC_SEQ_FOPS(name, custom_seq_write)                        \
+static int name##_single_open(struct inode *inode, struct file *file)  \
+{                                                                       \
+        return single_open(file, name##_seq_show,                      \
+                           inode->i_private ? inode->i_private :        \
+                                              pde_data(inode));         \
+}                                                                       \
+static const struct proc_ops name##_fops = {                            \
+        PROC_OWNER(THIS_MODULE)                                         \
+        .proc_open = name##_single_open,                                \
+        .proc_read = seq_read,                                          \
+        .proc_write = custom_seq_write,                                 \
+        .proc_lseek = seq_lseek,                                        \
+        .proc_release = lprocfs_single_release,                         \
+}
+
+#define LPROC_SEQ_FOPS_RO(name) __LPROC_SEQ_FOPS(name, NULL)
+#define LPROC_SEQ_FOPS(name)    __LPROC_SEQ_FOPS(name, name##_seq_write)
+
+#define LPROC_SEQ_FOPS_RO_TYPE(name, type)                              \
+        static int name##_##type##_seq_show(struct seq_file *m, void *v)\
+        {                                                               \
+                return lprocfs_##type##_seq_show(m, m->private);        \
+        }                                                               \
+        LPROC_SEQ_FOPS_RO(name##_##type)
+
+#define LPROC_SEQ_FOPS_RW_TYPE(name, type)                              \
+        static int name##_##type##_seq_show(struct seq_file *m, void *v)\
+        {                                                               \
+                return lprocfs_##type##_seq_show(m, m->private);        \
+        }                                                               \
+        static ssize_t name##_##type##_seq_write(struct file *file,    \
+                        const char __user *buffer, size_t count,        \
+                        loff_t *off)                                    \
+        {                                                               \
+                struct seq_file *seq = file->private_data;              \
+                return lprocfs_##type##_seq_write(file, buffer,         \
+                                                  count, seq->private); \
+        }                                                               \
+        LPROC_SEQ_FOPS(name##_##type);
+
+#define LPROC_SEQ_FOPS_WR_ONLY(name, type)                              \
+        static ssize_t name##_##type##_write(struct file *file,        \
+                        const char __user *buffer, size_t count,        \
+                        loff_t *off)                                    \
+        {                                                               \
+                return lprocfs_##type##_seq_write(file, buffer, count, off);\
+        }                                                               \
+        static int name##_##type##_open(struct inode *inode, struct file *file)\
+        {                                                               \
+                return single_open(file, NULL,                          \
+                                   inode->i_private ?
inode->i_private : \ + pde_data(inode)); \ + } \ + static const struct proc_ops name##_##type##_fops = { \ + .proc_open = name##_##type##_open, \ + .proc_write = name##_##type##_write, \ + .proc_release = lprocfs_single_release, \ + }; + +struct lustre_attr { + struct attribute attr; + ssize_t (*show)(struct kobject *kobj, struct attribute *attr, + char *buf); + ssize_t (*store)(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len); +}; + +#define LUSTRE_ATTR(name, mode, show, store) \ +static struct lustre_attr lustre_attr_##name = __ATTR(name, mode, show, store) + +#define LUSTRE_WO_ATTR(name) LUSTRE_ATTR(name, 0200, NULL, name##_store) +#define LUSTRE_RO_ATTR(name) LUSTRE_ATTR(name, 0444, name##_show, NULL) +#define LUSTRE_RW_ATTR(name) LUSTRE_ATTR(name, 0644, name##_show, name##_store) + +ssize_t lustre_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t lustre_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len); + +extern const struct sysfs_ops lustre_sysfs_ops; + +/* lproc_ptlrpc.c */ +struct ptlrpc_request; +extern void target_print_req(void *seq_file, struct ptlrpc_request *req); + +#ifdef HAVE_SERVER_SUPPORT +/* lprocfs_jobstats.c */ +int lprocfs_job_stats_log(struct obd_device *obd, char *jobid, + int event, long amount); +void lprocfs_job_stats_fini(struct obd_device *obd); +int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num, + cntr_init_callback fn); +ssize_t job_cleanup_interval_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t job_cleanup_interval_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count); +/* lproc_status_server.c */ +ssize_t recovery_time_soft_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t recovery_time_soft_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count); +ssize_t recovery_time_hard_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t recovery_time_hard_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count); +ssize_t instance_show(struct kobject *kobj, struct attribute *attr, + char *buf); +#endif +/* lproc_status.c */ +int lprocfs_obd_max_pages_per_rpc_seq_show(struct seq_file *m, void *data); +ssize_t lprocfs_obd_max_pages_per_rpc_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off); +ssize_t short_io_bytes_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t short_io_bytes_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count); + +struct root_squash_info; +int lprocfs_wr_root_squash(const char __user *buffer, unsigned long count, + struct root_squash_info *squash, char *name); +int lprocfs_wr_nosquash_nids(const char __user *buffer, unsigned long count, + struct root_squash_info *squash, char *name); + +#else /* !CONFIG_PROC_FS */ + +#define proc_lustre_root NULL + +static inline void lprocfs_counter_add(struct lprocfs_stats *stats, + int index, long amount) +{ return; } +static inline void lprocfs_counter_incr(struct lprocfs_stats *stats, + int index) +{ return; } +static inline void lprocfs_counter_sub(struct lprocfs_stats *stats, + int index, long amount) +{ return; } +static inline void lprocfs_counter_decr(struct lprocfs_stats *stats, + int index) +{ return; } +static inline void lprocfs_counter_init(struct lprocfs_stats *stats, + int index, unsigned conf, + const char *name, const char 
*units) +{ return; } + +static inline __u64 lc_read_helper(struct lprocfs_counter *lc, + enum lprocfs_fields_flags field) +{ return 0; } + +/* NB: we return !NULL to satisfy error checker */ +static inline struct lprocfs_stats * +lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags) +{ return (struct lprocfs_stats *)1; } +static inline void lprocfs_clear_stats(struct lprocfs_stats *stats) +{ return; } +static inline void lprocfs_free_stats(struct lprocfs_stats **stats) +{ return; } +static inline int lprocfs_register_stats(struct proc_dir_entry *root, + const char *name, + struct lprocfs_stats *stats) +{ return 0; } +static inline void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats) +{ return; } +static inline int lprocfs_alloc_obd_stats(struct obd_device *obd, + unsigned int num_stats) +{ return 0; } +static inline int lprocfs_alloc_md_stats(struct obd_device *obd, + unsigned int num_private_stats) +{ return 0; } +static inline void lprocfs_free_obd_stats(struct obd_device *obd) +{ return; } +static inline void lprocfs_free_md_stats(struct obd_device *obd) +{ return; } + +struct obd_export; +static inline int lprocfs_add_clear_entry(struct obd_export *exp) +{ return 0; } +static inline void lprocfs_free_per_client_stats(struct obd_device *obd) +{ return; } +#ifdef HAVE_SERVER_SUPPORT +static inline +ssize_t lprocfs_nid_stats_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{return 0;} +static inline +int lprocfs_nid_stats_clear_seq_show(struct seq_file *m, void *data) +{return 0;} +static inline int lprocfs_exp_setup(struct obd_export *exp,lnet_nid_t *peer_nid) +{ return 0; } +#endif +static inline int lprocfs_exp_cleanup(struct obd_export *exp) +{ return 0; } +static inline struct proc_dir_entry * +lprocfs_add_simple(struct proc_dir_entry *root, char *name, + void *data, const struct file_operations *fops) +{return 0; } +static inline struct proc_dir_entry * +lprocfs_add_symlink(const char *name, struct proc_dir_entry *parent, + const char *format, ...) 
+{return NULL; } +static inline int lprocfs_add_vars(struct proc_dir_entry *root, + struct lprocfs_vars *var, void *data) +{ return 0; } +static inline struct proc_dir_entry * +lprocfs_register(const char *name, struct proc_dir_entry *parent, + struct lprocfs_vars *list, void *data) +{ return NULL; } +static inline void lprocfs_remove(struct proc_dir_entry **root) +{ return; } +static inline void lprocfs_remove_proc_entry(const char *name, + struct proc_dir_entry *parent) +{ return; } +static inline int lprocfs_obd_setup(struct obd_device *obd, bool uuid_only) +{ return 0; } +static inline int lprocfs_obd_cleanup(struct obd_device *obd) +{ return 0; } +static inline int lprocfs_uuid_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_server_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_import_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_state_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_connect_flags_seq_show(struct seq_file *m, void *data) +{ return 0; } +#ifdef HAVE_SERVER_SUPPORT +static inline int lprocfs_num_exports_seq_show(struct seq_file *m, void *data) +{ return 0; } +#endif +struct adaptive_timeout; +static inline int lprocfs_at_hist_helper(struct seq_file *m, + struct adaptive_timeout *at) +{ return 0; } +static inline int lprocfs_timeouts_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline ssize_t +lprocfs_timeouts_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ return 0; } +#ifdef HAVE_SERVER_SUPPORT +static inline ssize_t +lprocfs_evict_client_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ return 0; } +#endif +static inline ssize_t +lprocfs_ping_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ return 0; } +static inline ssize_t +ldebugfs_import_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ return 0; } +static inline ssize_t +lprocfs_import_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ return 0; } +static inline int +lprocfs_pinger_recov_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline ssize_t +lprocfs_pinger_recov_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ return 0; } + +/* Statfs helpers */ +static inline +int lprocfs_blksize_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline +int lprocfs_kbytestotal_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline +int lprocfs_kbytesfree_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline +int lprocfs_kbytesavail_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline +int lprocfs_filestotal_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline +int lprocfs_filesfree_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline +void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value) +{ return; } +static inline +void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value) +{ return; } +static inline +void lprocfs_oh_clear(struct obd_histogram *oh) +{ return; } +static inline +unsigned long lprocfs_oh_sum(struct obd_histogram *oh) +{ return 0; } +static inline +void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, + struct lprocfs_counter *cnt) +{ return; } +static inline +u64 
lprocfs_stats_collector(struct lprocfs_stats *stats, int idx,
+                        enum lprocfs_fields_flags field)
+{ return (__u64)0; }
+
+#define LPROC_SEQ_FOPS_RO(name)
+#define LPROC_SEQ_FOPS(name)
+#define LPROC_SEQ_FOPS_RO_TYPE(name, type)
+#define LPROC_SEQ_FOPS_RW_TYPE(name, type)
+#define LPROC_SEQ_FOPS_WR_ONLY(name, type)
+
+/* lprocfs_jobstats.c */
+static inline
+int lprocfs_job_stats_log(struct obd_device *obd, char *jobid, int event,
+                          long amount)
+{ return 0; }
+static inline
+void lprocfs_job_stats_fini(struct obd_device *obd)
+{ return; }
+static inline
+int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num,
+                           cntr_init_callback fn)
+{ return 0; }
+
+
+/* lproc_ptlrpc.c */
+#define target_print_req NULL
+
+#endif /* CONFIG_PROC_FS */
+
+#endif /* LPROCFS_STATUS_H */
diff --git a/drivers/staging/lustrefsx/lustre/include/lu_object.h b/drivers/staging/lustrefsx/lustre/include/lu_object.h
new file mode 100644
index 0000000000000..9ceabafb2636f
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/include/lu_object.h
@@ -0,0 +1,1760 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ */
+
+#ifndef __LUSTRE_LU_OBJECT_H
+#define __LUSTRE_LU_OBJECT_H
+
+#ifdef HAVE_LINUX_STDARG_HEADER
+#include <linux/stdarg.h>
+#else
+#include <stdarg.h>
+#endif
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+struct seq_file;
+struct proc_dir_entry;
+struct lustre_cfg;
+struct lprocfs_stats;
+struct obd_type;
+
+/** \defgroup lu lu
+ * lu_* data-types represent server-side entities shared by data and meta-data
+ * stacks.
+ *
+ * Design goals:
+ *
+ * -# support for layering.
+ *
+ *     Server side object is split into layers, one per device in the
+ * corresponding device stack. Individual layer is represented by struct
+ * lu_object. Compound layered object --- by struct lu_object_header. Most
+ * interface functions take lu_object as an argument and operate on the
+ * whole compound object. This decision was made due to the following
+ * reasons:
+ *
+ *     - it's envisaged that lu_object will be used much more often than
+ *     lu_object_header;
+ *
+ *     - we want lower (non-top) layers to be able to initiate operations
+ *     on the whole object.
+ *
+ *     Generic code supports layering more complex than simple stacking, e.g.,
+ * it is possible that at some layer object "spawns" multiple sub-objects
+ * on the lower layer.
+ *
+ * -# fid-based identification.
+ *
+ *     Compound object is uniquely identified by its fid. Objects are indexed
+ * by their fids (hash table is used for index).
+ *
+ * -# caching and life-cycle management.
+ *
+ *     Object's life-time is controlled by reference counting. When reference
+ * count drops to 0, object is returned to cache. Cached objects still
+ * retain their identity (i.e., fid), and can be recovered from cache.
+ *
+ *     Objects are kept in the global LRU list, and lu_site_purge() function
+ * can be used to reclaim given number of unused objects from the tail of
+ * the LRU.
+ *
+ * -# avoiding recursion.
+ *
+ *     Generic code tries to replace recursion through layers by iterations
+ * where possible. Additionally, to reduce stack consumption, data are,
+ * when practically possible, allocated through the lu_context_key
+ * interface rather than on the stack.
+ * @{
+ */
+
+struct lu_site;
+struct lu_object;
+struct lu_device;
+struct lu_object_header;
+struct lu_context;
+struct lu_env;
+struct lu_name;
+
+/**
+ * Operations common for data and meta-data devices.
+ */
+struct lu_device_operations {
+        /**
+         * Allocate object for the given device (without lower-layer
+         * parts). This is called by lu_object_operations::loo_object_init()
+         * from the parent layer, and should setup at least lu_object::lo_dev
+         * and lu_object::lo_ops fields of resulting lu_object.
+         *
+         * Object creation protocol.
+         *
+         * Due to design goal of avoiding recursion, object creation (see
+         * lu_object_alloc()) is somewhat involved:
+         *
+         *  - first, lu_device_operations::ldo_object_alloc() method of the
+         *  top-level device in the stack is called. It should allocate top
+         *  level object (including lu_object_header), but without any
+         *  lower-layer sub-object(s).
+         *
+         *  - then lu_object_alloc() sets fid in the header of newly created
+         *  object.
+         *
+         *  - then lu_object_operations::loo_object_init() is called. It has
+         *  to allocate lower-layer object(s). To do this,
+         *  lu_object_operations::loo_object_init() calls ldo_object_alloc()
+         *  of the lower-layer device(s).
+         *
+         *  - for all new objects allocated by
+         *  lu_object_operations::loo_object_init() (and inserted into object
+         *  stack), lu_object_operations::loo_object_init() is called again
+         *  repeatedly, until no new objects are created.
+         *
+         * \post ergo(!IS_ERR(result), result->lo_dev == d &&
+         *                             result->lo_ops != NULL);
+         */
+        struct lu_object *(*ldo_object_alloc)(const struct lu_env *env,
+                                              const struct lu_object_header *h,
+                                              struct lu_device *d);
+        /**
+         * process config specific for device.
+         */
+        int (*ldo_process_config)(const struct lu_env *env,
+                                  struct lu_device *, struct lustre_cfg *);
+        int (*ldo_recovery_complete)(const struct lu_env *,
+                                     struct lu_device *);
+
+        /**
+         * Initialize local objects for the device. This method is called
+         * after the layer has been initialized (after the LCFG_SETUP stage)
+         * and before it starts serving user requests.
+         */
+
+        int (*ldo_prepare)(const struct lu_env *,
+                           struct lu_device *parent,
+                           struct lu_device *dev);
+
+
+        /**
+         * Allocate new FID for file with @name under @parent
+         *
+         * \param[in] env       execution environment for this thread
+         * \param[in] dev       dt device
+         * \param[out] fid      new FID allocated
+         * \param[in] parent    parent object
+         * \param[in] name      lu_name
+         *
+         * \retval      0       0 FID allocated successfully.
+         * \retval      1       1 FID allocated successfully and new sequence
+         *                      requested from seq meta server
+         * \retval      negative        negative errno if FID allocation failed.
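+         *
+         * An illustrative call through the operation vector (a sketch;
+         * the variable names are assumed, not part of this header):
+         *
+         *      rc = dev->ld_ops->ldo_fid_alloc(env, dev, &fid, parent, &name);
+         *      if (rc < 0)
+         *              ... no FID was allocated, handle the error ...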
+         */
+        int (*ldo_fid_alloc)(const struct lu_env *env,
+                             struct lu_device *dev,
+                             struct lu_fid *fid,
+                             struct lu_object *parent,
+                             const struct lu_name *name);
+};
+
+/**
+ * For lu_object_conf flags
+ */
+typedef enum {
+        /* This is a new object to be allocated, or the file
+         * corresponding to the object does not exist. */
+        LOC_F_NEW       = 0x00000001,
+} loc_flags_t;
+
+/**
+ * Object configuration, describing particulars of object being created. On
+ * the server this is not used, as server objects are fully identified by
+ * fid. On the client, the configuration contains struct lustre_md.
+ */
+struct lu_object_conf {
+        /**
+         * Some hints for obj find and alloc.
+         */
+        loc_flags_t loc_flags;
+};
+
+/**
+ * Type of "printer" function used by lu_object_operations::loo_object_print()
+ * method.
+ *
+ * Printer function is needed to provide some flexibility in (semi-)debugging
+ * output; possible implementations: printk, CDEBUG, sysfs/seq_file.
+ */
+typedef int (*lu_printer_t)(const struct lu_env *env,
+                            void *cookie, const char *format, ...)
+        __attribute__ ((format (printf, 3, 4)));
+
+/**
+ * Operations specific for particular lu_object.
+ */
+struct lu_object_operations {
+
+        /**
+         * Allocate lower-layer parts of the object by calling
+         * lu_device_operations::ldo_object_alloc() of the corresponding
+         * underlying device.
+         *
+         * This method is called once for each object inserted into object
+         * stack. It is the responsibility of this method to insert the
+         * lower-layer object(s) it creates into the appropriate places of
+         * the object stack.
+         */
+        int (*loo_object_init)(const struct lu_env *env,
+                               struct lu_object *o,
+                               const struct lu_object_conf *conf);
+        /**
+         * Called (in top-to-bottom order) during object allocation after all
+         * layers were allocated and initialized. Can be used to perform
+         * initialization depending on lower layers.
+         */
+        int (*loo_object_start)(const struct lu_env *env,
+                                struct lu_object *o);
+        /**
+         * Called before lu_object_operations::loo_object_free() to signal
+         * that object is being destroyed. Dual to
+         * lu_object_operations::loo_object_init().
+         */
+        void (*loo_object_delete)(const struct lu_env *env,
+                                  struct lu_object *o);
+        /**
+         * Dual to lu_device_operations::ldo_object_alloc(). Called when
+         * object is removed from memory. Must use call_rcu or kfree_rcu
+         * if the object contains an lu_object_header.
+         */
+        void (*loo_object_free)(const struct lu_env *env,
+                                struct lu_object *o);
+        /**
+         * Called when last active reference to the object is released (and
+         * object returns to the cache). This method is optional.
+         */
+        void (*loo_object_release)(const struct lu_env *env,
+                                   struct lu_object *o);
+        /**
+         * Optional debugging helper. Print given object.
+         */
+        int (*loo_object_print)(const struct lu_env *env, void *cookie,
+                                lu_printer_t p, const struct lu_object *o);
+        /**
+         * Optional debugging method. Returns true iff the object is
+         * internally consistent.
+         */
+        int (*loo_object_invariant)(const struct lu_object *o);
+};
+
+/**
+ * Type of lu_device.
+ */
+struct lu_device_type;
+
+/**
+ * Device: a layer in the server side abstraction stacking.
+ */
+struct lu_device {
+        /**
+         * reference count. This is incremented, in particular, on each object
+         * created at this layer.
+         *
+         * \todo XXX which means that atomic_t is probably too small.
+         */
+        atomic_t ld_ref;
+        /**
+         * Pointer to device type. Never modified once set.
+         */
+        struct lu_device_type *ld_type;
+        /**
+         * Operation vector for this device.
+ */ + const struct lu_device_operations *ld_ops; + /** + * Stack this device belongs to. + */ + struct lu_site *ld_site; + struct proc_dir_entry *ld_proc_entry; + + /** \todo XXX: temporary back pointer into obd. */ + struct obd_device *ld_obd; + /** + * A list of references to this object, for debugging. + */ + struct lu_ref ld_reference; + /** + * Link the device to the site. + **/ + struct list_head ld_linkage; +}; + +struct lu_device_type_operations; + +/** + * Tag bits for device type. They are used to distinguish certain groups of + * device types. + */ +enum lu_device_tag { + /** this is meta-data device */ + LU_DEVICE_MD = BIT(0), + /** this is data device */ + LU_DEVICE_DT = BIT(1), + /** data device in the client stack */ + LU_DEVICE_CL = BIT(2) +}; + +/** + * Type of device. + */ +struct lu_device_type { + /** + * Tag bits. Taken from enum lu_device_tag. Never modified once set. + */ + __u32 ldt_tags; + /** + * Name of this class. Unique system-wide. Never modified once set. + */ + char *ldt_name; + /** + * Operations for this type. + */ + const struct lu_device_type_operations *ldt_ops; + /** + * \todo XXX: temporary: context tags used by obd_*() calls. + */ + __u32 ldt_ctx_tags; + /** + * Number of existing device type instances. + */ + atomic_t ldt_device_nr; +}; + +/** + * Operations on a device type. + */ +struct lu_device_type_operations { + /** + * Allocate new device. + */ + struct lu_device *(*ldto_device_alloc)(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *lcfg); + /** + * Free device. Dual to + * lu_device_type_operations::ldto_device_alloc(). Returns pointer to + * the next device in the stack. + */ + struct lu_device *(*ldto_device_free)(const struct lu_env *, + struct lu_device *); + + /** + * Initialize the devices after allocation + */ + int (*ldto_device_init)(const struct lu_env *env, + struct lu_device *, const char *, + struct lu_device *); + /** + * Finalize device. Dual to + * lu_device_type_operations::ldto_device_init(). Returns pointer to + * the next device in the stack. + */ + struct lu_device *(*ldto_device_fini)(const struct lu_env *env, + struct lu_device *); + /** + * Initialize device type. This is called on module load. + */ + int (*ldto_init)(struct lu_device_type *t); + /** + * Finalize device type. Dual to + * lu_device_type_operations::ldto_init(). Called on module unload. + */ + void (*ldto_fini)(struct lu_device_type *t); + /** + * Called when the first device is created. + */ + void (*ldto_start)(struct lu_device_type *t); + /** + * Called when number of devices drops to 0. + */ + void (*ldto_stop)(struct lu_device_type *t); +}; + +static inline int lu_device_is_md(const struct lu_device *d) +{ + return ergo(d != NULL, d->ld_type->ldt_tags & LU_DEVICE_MD); +} + +/** + * Common object attributes. 
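+ *
+ * A caller typically fills only the fields it intends to change and
+ * flags them in la_valid, e.g. (a sketch; the LA_* bits come from
+ * enum la_valid, referenced below, and are assumed here):
+ *
+ *      attr->la_size  = 0;
+ *      attr->la_mtime = ktime_get_real_seconds();
+ *      attr->la_valid = LA_SIZE | LA_MTIME;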
+ */
+struct lu_attr {
+        /**
+         * valid bits
+         *
+         * \see enum la_valid
+         */
+        __u64 la_valid;
+        /** size in bytes */
+        __u64 la_size;
+        /** modification time in seconds since Epoch */
+        s64 la_mtime;
+        /** access time in seconds since Epoch */
+        s64 la_atime;
+        /** change time in seconds since Epoch */
+        s64 la_ctime;
+        /** create time in seconds since Epoch */
+        s64 la_btime;
+        /** 512-byte blocks allocated to object */
+        __u64 la_blocks;
+        /** permission bits and file type */
+        __u32 la_mode;
+        /** owner id */
+        __u32 la_uid;
+        /** group id */
+        __u32 la_gid;
+        /** object flags */
+        __u32 la_flags;
+        /** number of persistent references to this object */
+        __u32 la_nlink;
+        /** blk bits of the object */
+        __u32 la_blkbits;
+        /** blk size of the object */
+        __u32 la_blksize;
+        /** real device */
+        __u32 la_rdev;
+        /** project id */
+        __u32 la_projid;
+        /** set layout version to OST objects. */
+        __u32 la_layout_version;
+        /** dirent count */
+        __u64 la_dirent_count;
+};
+
+#define LU_DIRENT_COUNT_UNSET ~0ULL
+
+/**
+ * Layer in the layered object.
+ */
+struct lu_object {
+        /**
+         * Header for this object.
+         */
+        struct lu_object_header *lo_header;
+        /**
+         * Device for this layer.
+         */
+        struct lu_device *lo_dev;
+        /**
+         * Operations for this object.
+         */
+        const struct lu_object_operations *lo_ops;
+        /**
+         * Linkage into list of all layers.
+         */
+        struct list_head lo_linkage;
+        /**
+         * Link to the device, for debugging.
+         */
+        struct lu_ref_link lo_dev_ref;
+};
+
+enum lu_object_header_flags {
+        /**
+         * Don't keep this object in cache. Object will be destroyed as soon
+         * as last reference to it is released. This flag cannot be cleared
+         * once set.
+         */
+        LU_OBJECT_HEARD_BANSHEE = 0,
+        /**
+         * Mark that this object has already been taken out of cache.
+         */
+        LU_OBJECT_UNHASHED = 1,
+        /**
+         * Object is initialized. When an object is found in cache, it may
+         * not be initialized yet; the object allocator will initialize it.
+         */
+        LU_OBJECT_INITED = 2,
+};
+
+enum lu_object_header_attr {
+        LOHA_EXISTS             = BIT(0),
+        LOHA_REMOTE             = BIT(1),
+        LOHA_HAS_AGENT_ENTRY    = BIT(2),
+        /**
+         * UNIX file type is stored in S_IFMT bits.
+         */
+        LOHA_FT_START           = 001 << 12, /**< S_IFIFO */
+        LOHA_FT_END             = 017 << 12, /**< S_IFMT */
+};
+
+/**
+ * "Compound" object, consisting of multiple layers.
+ *
+ * Compound object with given fid is unique with given lu_site.
+ *
+ * Note that the object does *not* necessarily correspond to a real object in
+ * persistent storage: object is an anchor for locking and method calling, so
+ * it is created for things like not-yet-existing child created by mkdir or
+ * create calls. lu_object_operations::loo_exists() can be used to check
+ * whether object is backed by persistent storage entity.
+ * Any object containing this structure which might be placed in an
+ * rhashtable via loh_hash MUST be freed using call_rcu() or kfree_rcu().
+ */
+struct lu_object_header {
+        /**
+         * Fid, uniquely identifying this object.
+         */
+        struct lu_fid loh_fid;
+        /**
+         * Object flags from enum lu_object_header_flags. Set and checked
+         * atomically.
+         */
+        unsigned long loh_flags;
+        /**
+         * Object reference count. Protected by lu_site::ls_guard.
+         */
+        atomic_t loh_ref;
+        /**
+         * Common object attributes, cached for efficiency. From enum
+         * lu_object_header_attr.
+         */
+        __u32 loh_attr;
+        /**
+         * Linkage into per-site hash table.
+         */
+        struct rhash_head loh_hash;
+        /**
+         * Linkage into per-site LRU list. Protected by lu_site::ls_guard.
+         */
+        struct list_head loh_lru;
+        /**
+         * Linkage into list of layers.
Never modified once set (except lately + * during object destruction). No locking is necessary. + */ + struct list_head loh_layers; + /** + * A list of references to this object, for debugging. + */ + struct lu_ref loh_reference; + /* + * Handle used for kfree_rcu() or similar. + */ + struct rcu_head loh_rcu; +}; + +struct fld; + +enum { + LU_SS_CREATED = 0, + LU_SS_CACHE_HIT, + LU_SS_CACHE_MISS, + LU_SS_CACHE_RACE, + LU_SS_CACHE_DEATH_RACE, + LU_SS_LRU_PURGED, + LU_SS_LAST_STAT +}; + +/** + * lu_site is a "compartment" within which objects are unique, and LRU + * discipline is maintained. + * + * lu_site exists so that multiple layered stacks can co-exist in the same + * address space. + * + * lu_site has the same relation to lu_device as lu_object_header to + * lu_object. + */ +struct lu_site { + /** + * objects hash table + */ + struct rhashtable ls_obj_hash; + /* + * buckets for summary data + */ + struct lu_site_bkt_data *ls_bkts; + int ls_bkt_cnt; + u32 ls_bkt_seed; + /** + * index of bucket on hash table while purging + */ + unsigned int ls_purge_start; + /** + * Top-level device for this stack. + */ + struct lu_device *ls_top_dev; + /** + * Bottom-level device for this stack + */ + struct lu_device *ls_bottom_dev; + /** + * Linkage into global list of sites. + */ + struct list_head ls_linkage; + /** + * List for lu device for this site, protected + * by ls_ld_lock. + **/ + struct list_head ls_ld_linkage; + spinlock_t ls_ld_lock; + /** + * Lock to serialize site purge. + */ + struct mutex ls_purge_mutex; + /** + * lu_site stats + */ + struct lprocfs_stats *ls_stats; + /** + * XXX: a hack! fld has to find md_site via site, remove when possible + */ + struct seq_server_site *ld_seq_site; + /** + * Pointer to the lu_target for this site. + */ + struct lu_target *ls_tgt; + + /** + * Number of objects in lsb_lru_lists - used for shrinking + */ + struct percpu_counter ls_lru_len_counter; +}; + +wait_queue_head_t * +lu_site_wq_from_fid(struct lu_site *site, struct lu_fid *fid); + +static inline struct seq_server_site *lu_site2seq(const struct lu_site *s) +{ + return s->ld_seq_site; +} + +/** \name ctors + * Constructors/destructors. + * @{ + */ + +int lu_site_init (struct lu_site *s, struct lu_device *d); +void lu_site_fini (struct lu_site *s); +int lu_site_init_finish (struct lu_site *s); +void lu_stack_fini (const struct lu_env *env, struct lu_device *top); +void lu_device_get (struct lu_device *d); +void lu_device_put (struct lu_device *d); +int lu_device_init (struct lu_device *d, struct lu_device_type *t); +void lu_device_fini (struct lu_device *d); +int lu_object_header_init(struct lu_object_header *h); +void lu_object_header_fini(struct lu_object_header *h); +void lu_object_header_free(struct lu_object_header *h); +int lu_object_init (struct lu_object *o, + struct lu_object_header *h, struct lu_device *d); +void lu_object_fini (struct lu_object *o); +void lu_object_add_top (struct lu_object_header *h, struct lu_object *o); +void lu_object_add (struct lu_object *before, struct lu_object *o); +struct lu_object *lu_object_get_first(struct lu_object_header *h, + struct lu_device *dev); +void lu_dev_add_linkage(struct lu_site *s, struct lu_device *d); +void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d); + +/** + * Helpers to initialize and finalize device types. + */ + +int lu_device_type_init(struct lu_device_type *ldt); +void lu_device_type_fini(struct lu_device_type *ldt); + +/** @} ctors */ + +/** \name caching + * Caching and reference counting. 
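+ *
+ * A typical lookup/release pattern is (a sketch; error handling is
+ * abbreviated and the variable names are assumed):
+ *
+ *      o = lu_object_find(env, dev, fid, conf);
+ *      if (!IS_ERR(o)) {
+ *              ... use the object ...
+ *              lu_object_put(env, o);
+ *      }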
+ * @{ + */ + +/** + * Acquire additional reference to the given object. This function is used to + * attain additional reference. To acquire initial reference use + * lu_object_find(). + */ +static inline void lu_object_get(struct lu_object *o) +{ + LASSERT(atomic_read(&o->lo_header->loh_ref) > 0); + atomic_inc(&o->lo_header->loh_ref); +} + +/** + * Return true if object will not be cached after last reference to it is + * released. + */ +static inline int lu_object_is_dying(const struct lu_object_header *h) +{ + return test_bit(LU_OBJECT_HEARD_BANSHEE, &h->loh_flags); +} + +/** + * Return true if object is initialized. + */ +static inline int lu_object_is_inited(const struct lu_object_header *h) +{ + return test_bit(LU_OBJECT_INITED, &h->loh_flags); +} + +void lu_object_put(const struct lu_env *env, struct lu_object *o); +void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o); +void lu_object_unhash(const struct lu_env *env, struct lu_object *o); +int lu_site_purge_objects(const struct lu_env *env, struct lu_site *s, int nr, + int canblock); + +static inline int lu_site_purge(const struct lu_env *env, struct lu_site *s, + int nr) +{ + return lu_site_purge_objects(env, s, nr, 1); +} + +void lu_site_print(const struct lu_env *env, struct lu_site *s, atomic_t *ref, + int msg_flags, lu_printer_t printer); +struct lu_object *lu_object_find(const struct lu_env *env, + struct lu_device *dev, const struct lu_fid *f, + const struct lu_object_conf *conf); +struct lu_object *lu_object_find_at(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf); +struct lu_object *lu_object_find_slice(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf); +/** @} caching */ + +/** \name helpers + * Helpers. + * @{ + */ + +/** + * First (topmost) sub-object of given compound object + */ +static inline struct lu_object *lu_object_top(struct lu_object_header *h) +{ + LASSERT(!list_empty(&h->loh_layers)); + return container_of(h->loh_layers.next, struct lu_object, lo_linkage); +} + +/** + * Next sub-object in the layering + */ +static inline struct lu_object *lu_object_next(const struct lu_object *o) +{ + return container_of(o->lo_linkage.next, struct lu_object, lo_linkage); +} + +/** + * Pointer to the fid of this object. + */ +static inline const struct lu_fid *lu_object_fid(const struct lu_object *o) +{ + return &o->lo_header->loh_fid; +} + +/** + * return device operations vector for this object + */ +static const inline struct lu_device_operations * +lu_object_ops(const struct lu_object *o) +{ + return o->lo_dev->ld_ops; +} + +/** + * Given a compound object, find its slice, corresponding to the device type + * \a dtype. + */ +struct lu_object *lu_object_locate(struct lu_object_header *h, + const struct lu_device_type *dtype); + +/** + * Printer function emitting messages through libcfs_debug_msg(). + */ +int lu_cdebug_printer(const struct lu_env *env, + void *cookie, const char *format, ...); + +/** + * Print object description followed by a user-supplied message. + */ +#define LU_OBJECT_DEBUG(mask, env, object, format, ...) \ +do { \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + lu_object_print(env, &msgdata, lu_cdebug_printer, object);\ + CDEBUG(mask, format "\n", ## __VA_ARGS__); \ + } \ +} while (0) + +/** + * Print short object description followed by a user-supplied message. 
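+ * For example (a sketch): LU_OBJECT_HEADER(D_INFO, env, o, "found: rc=%d", rc);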
+ */
+#define LU_OBJECT_HEADER(mask, env, object, format, ...)                \
+do {                                                                    \
+        if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {                   \
+                LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);        \
+                lu_object_header_print(env, &msgdata, lu_cdebug_printer,\
+                                       (object)->lo_header);            \
+                lu_cdebug_printer(env, &msgdata, "\n");                 \
+                CDEBUG(mask, format, ## __VA_ARGS__);                   \
+        }                                                               \
+} while (0)
+
+void lu_object_print(const struct lu_env *env, void *cookie,
+                     lu_printer_t printer, const struct lu_object *o);
+void lu_object_header_print(const struct lu_env *env, void *cookie,
+                            lu_printer_t printer,
+                            const struct lu_object_header *hdr);
+
+/**
+ * Check object consistency.
+ */
+int lu_object_invariant(const struct lu_object *o);
+
+
+/**
+ * Check whether the object exists, whether on local or remote storage.
+ * Note: LOHA_EXISTS will be set once someone has created the object,
+ * and it does not need to be committed to storage.
+ */
+#define lu_object_exists(o) ((o)->lo_header->loh_attr & LOHA_EXISTS)
+
+/**
+ * Check whether the object is on remote storage.
+ */
+#define lu_object_remote(o) unlikely((o)->lo_header->loh_attr & LOHA_REMOTE)
+
+/**
+ * Check whether the object has an agent entry on the current target.
+ */
+#define lu_object_has_agent_entry(o) \
+        unlikely((o)->lo_header->loh_attr & LOHA_HAS_AGENT_ENTRY)
+
+static inline void lu_object_set_agent_entry(struct lu_object *o)
+{
+        o->lo_header->loh_attr |= LOHA_HAS_AGENT_ENTRY;
+}
+
+static inline void lu_object_clear_agent_entry(struct lu_object *o)
+{
+        o->lo_header->loh_attr &= ~LOHA_HAS_AGENT_ENTRY;
+}
+
+static inline int lu_object_assert_exists(const struct lu_object *o)
+{
+        return lu_object_exists(o);
+}
+
+static inline int lu_object_assert_not_exists(const struct lu_object *o)
+{
+        return !lu_object_exists(o);
+}
+
+/**
+ * Attr of this object.
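+ * Returns the S_IFMT file-type bits of the cached attributes, so e.g.
+ * S_ISDIR(lu_object_attr(o)) tests whether the object is a directory
+ * (illustrative use).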
+ */
+static inline __u32 lu_object_attr(const struct lu_object *o)
+{
+	LASSERT(lu_object_exists(o) != 0);
+
+	return o->lo_header->loh_attr & S_IFMT;
+}
+
+static inline void lu_object_ref_add_atomic(struct lu_object *o,
+					    const char *scope,
+					    const void *source)
+{
+	lu_ref_add_atomic(&o->lo_header->loh_reference, scope, source);
+}
+
+static inline void lu_object_ref_add(struct lu_object *o,
+				     const char *scope,
+				     const void *source)
+{
+	lu_ref_add(&o->lo_header->loh_reference, scope, source);
+}
+
+static inline void lu_object_ref_add_at(struct lu_object *o,
+					struct lu_ref_link *link,
+					const char *scope,
+					const void *source)
+{
+	lu_ref_add_at(&o->lo_header->loh_reference, link, scope, source);
+}
+
+static inline void lu_object_ref_del(struct lu_object *o,
+				     const char *scope, const void *source)
+{
+	lu_ref_del(&o->lo_header->loh_reference, scope, source);
+}
+
+static inline void lu_object_ref_del_at(struct lu_object *o,
+					struct lu_ref_link *link,
+					const char *scope, const void *source)
+{
+	lu_ref_del_at(&o->lo_header->loh_reference, link, scope, source);
+}
+
+/** input params, should be filled out by mdt */
+struct lu_rdpg {
+	/** hash */
+	__u64		rp_hash;
+	/** count in bytes */
+	unsigned int	rp_count;
+	/** number of pages */
+	unsigned int	rp_npages;
+	/** requested attr */
+	__u32		rp_attrs;
+	/** pointers to pages */
+	struct page	**rp_pages;
+};
+
+enum lu_xattr_flags {
+	LU_XATTR_REPLACE = BIT(0),
+	LU_XATTR_CREATE  = BIT(1),
+	LU_XATTR_MERGE   = BIT(2),
+	LU_XATTR_SPLIT   = BIT(3),
+	LU_XATTR_PURGE   = BIT(4),
+};
+
+/** @} helpers */
+
+/** \name lu_context
+ * @{ */
+
+/** For lu_context health-checks */
+enum lu_context_state {
+	LCS_INITIALIZED = 1,
+	LCS_ENTERED,
+	LCS_LEAVING,
+	LCS_LEFT,
+	LCS_FINALIZED
+};
+
+/**
+ * lu_context. Execution context for lu_object methods. Currently associated
+ * with thread.
+ *
+ * All lu_object methods, except device and device type methods (called during
+ * system initialization and shutdown) are executed "within" some
+ * lu_context. This means, that pointer to some "current" lu_context is passed
+ * as an argument to all methods.
+ *
+ * All service ptlrpc threads create lu_context as part of their
+ * initialization. It is possible to create "stand-alone" context for other
+ * execution environments (like system calls).
+ *
+ * lu_object methods mainly use lu_context through lu_context_key interface
+ * that allows each layer to associate arbitrary pieces of data with each
+ * context (see pthread_key_create(3) for similar interface).
+ *
+ * On a client, lu_context is bound to a thread, see cl_env_get().
+ *
+ * \see lu_context_key
+ */
+struct lu_context {
+	/**
+	 * lu_context is used on the client side too. Yet we don't want to
+	 * allocate values of server-side keys for the client contexts and
+	 * vice versa.
+	 *
+	 * To achieve this, a set of tags is introduced. Contexts and keys
+	 * are marked with tags. Key values are created only for contexts
+	 * whose set of tags has a non-empty intersection with that of the
+	 * key. Tags are taken from enum lu_context_tag.
+	 */
+	__u32			lc_tags;
+	enum lu_context_state	lc_state;
+	/**
+	 * Pointer to the home service thread. NULL for other execution
+	 * contexts.
+	 */
+	struct ptlrpc_thread	*lc_thread;
+	/**
+	 * Pointer to an array with key values. Internal implementation
+	 * detail.
+	 */
+	void			**lc_value;
+	/**
+	 * Linkage into a list of all remembered contexts. Only
+	 * `non-transient' contexts, i.e., ones created for service threads
+	 * are placed here.
+ */ + struct list_head lc_remember; + /** + * Version counter used to skip calls to lu_context_refill() when no + * keys were registered. + */ + unsigned lc_version; + /** + * Debugging cookie. + */ + unsigned lc_cookie; +}; + +/** + * lu_context_key interface. Similar to pthread_key. + */ + +enum lu_context_tag { + /** + * Thread on md server + */ + LCT_MD_THREAD = BIT(0), + /** + * Thread on dt server + */ + LCT_DT_THREAD = BIT(1), + /** + * Thread on client + */ + LCT_CL_THREAD = BIT(3), + /** + * A per-request session on a server, and a per-system-call session on + * a client. + */ + LCT_SESSION = BIT(4), + /** + * A per-request data on OSP device + */ + LCT_OSP_THREAD = BIT(5), + /** + * MGS device thread + */ + LCT_MG_THREAD = BIT(6), + /** + * Context for local operations + */ + LCT_LOCAL = BIT(7), + /** + * session for server thread + **/ + LCT_SERVER_SESSION = BIT(8), + /** + * Set when at least one of keys, having values in this context has + * non-NULL lu_context_key::lct_exit() method. This is used to + * optimize lu_context_exit() call. + */ + LCT_HAS_EXIT = BIT(28), + /** + * Don't add references for modules creating key values in that context. + * This is only for contexts used internally by lu_object framework. + */ + LCT_NOREF = BIT(29), + /** + * Key is being prepared for retiring, don't create new values for it. + */ + LCT_QUIESCENT = BIT(30), + /** + * Context should be remembered. + */ + LCT_REMEMBER = BIT(31), + /** + * Contexts usable in cache shrinker thread. + */ + LCT_SHRINKER = LCT_MD_THREAD|LCT_DT_THREAD|LCT_CL_THREAD|LCT_NOREF, +}; + +/** + * Key. Represents per-context value slot. + * + * Keys are usually registered when module owning the key is initialized, and + * de-registered when module is unloaded. Once key is registered, all new + * contexts with matching tags, will get key value. "Old" contexts, already + * initialized at the time of key registration, can be forced to get key value + * by calling lu_context_refill(). + * + * Every key value is counted in lu_context_key::lct_used and acquires a + * reference on an owning module. This means, that all key values have to be + * destroyed before module can be unloaded. This is usually achieved by + * stopping threads started by the module, that created contexts in their + * entry functions. Situation is complicated by the threads shared by multiple + * modules, like ptlrpcd daemon on a client. To work around this problem, + * contexts, created in such threads, are `remembered' (see + * LCT_REMEMBER)---i.e., added into a global list. When module is preparing + * for unloading it does the following: + * + * - marks its keys as `quiescent' (lu_context_tag::LCT_QUIESCENT) + * preventing new key values from being allocated in the new contexts, + * and + * + * - scans a list of remembered contexts, destroying values of module + * keys, thus releasing references to the module. + * + * This is done by lu_context_key_quiesce(). If module is re-activated + * before key has been de-registered, lu_context_key_revive() call clears + * `quiescent' marker. + * + * lu_context code doesn't provide any internal synchronization for these + * activities---it's assumed that startup (including threads start-up) and + * shutdown are serialized by some external means. + * + * \see lu_context + */ +struct lu_context_key { + /** + * Set of tags for which values of this key are to be instantiated. + */ + __u32 lct_tags; + /** + * Value constructor. This is called when new value is created for a + * context. 
Returns pointer to new value or an error pointer.
+	 */
+	void *(*lct_init)(const struct lu_context *ctx,
+			  struct lu_context_key *key);
+	/**
+	 * Value destructor. Called when context with previously allocated
+	 * value of this slot is destroyed. \a data is a value that was returned
+	 * by a matching call to lu_context_key::lct_init().
+	 */
+	void (*lct_fini)(const struct lu_context *ctx,
+			 struct lu_context_key *key, void *data);
+	/**
+	 * Optional method called on lu_context_exit() for all allocated
+	 * keys. Can be used by debugging code checking that locks are
+	 * released, etc.
+	 */
+	void (*lct_exit)(const struct lu_context *ctx,
+			 struct lu_context_key *key, void *data);
+	/**
+	 * Internal implementation detail: index within lu_context::lc_value[]
+	 * reserved for this key.
+	 */
+	int lct_index;
+	/**
+	 * Internal implementation detail: number of values created for this
+	 * key.
+	 */
+	atomic_t lct_used;
+	/**
+	 * Internal implementation detail: module for this key.
+	 */
+	struct module *lct_owner;
+	/**
+	 * References to this key. For debugging.
+	 */
+	struct lu_ref lct_reference;
+};
+
+#define LU_KEY_INIT(mod, type)					\
+	static void *mod##_key_init(const struct lu_context *ctx, \
+				    struct lu_context_key *key)	\
+	{							\
+		type *value;					\
+								\
+		BUILD_BUG_ON(PAGE_SIZE < sizeof(*value));	\
+								\
+		OBD_ALLOC_PTR(value);				\
+		if (value == NULL)				\
+			value = ERR_PTR(-ENOMEM);		\
+								\
+		return value;					\
+	}							\
+	struct __##mod##__dummy_init { ; } /* semicolon catcher */
+
+#define LU_KEY_FINI(mod, type)					\
+	static void mod##_key_fini(const struct lu_context *ctx, \
+				   struct lu_context_key *key, void *data) \
+	{							\
+		type *info = data;				\
+								\
+		OBD_FREE_PTR(info);				\
+	}							\
+	struct __##mod##__dummy_fini {;} /* semicolon catcher */
+
+#define LU_KEY_INIT_FINI(mod, type)	\
+	LU_KEY_INIT(mod, type);		\
+	LU_KEY_FINI(mod, type)
+
+#define LU_CONTEXT_KEY_DEFINE(mod, tags)		\
+	struct lu_context_key mod##_thread_key = {	\
+		.lct_tags = tags,			\
+		.lct_init = mod##_key_init,		\
+		.lct_fini = mod##_key_fini		\
+	}
+
+#define LU_CONTEXT_KEY_INIT(key)	\
+do {					\
+	(key)->lct_owner = THIS_MODULE;	\
+} while (0)
+
+int lu_context_key_register(struct lu_context_key *key);
+void lu_context_key_degister(struct lu_context_key *key);
+void *lu_context_key_get (const struct lu_context *ctx,
+			  const struct lu_context_key *key);
+void lu_context_key_quiesce(struct lu_device_type *t,
+			    struct lu_context_key *key);
+void lu_context_key_revive(struct lu_context_key *key);
+
+
+/*
+ * LU_KEY_INIT_GENERIC() has to be a macro to correctly determine an
+ * owning module.
+ */
+
+#define LU_KEY_INIT_GENERIC(mod)					\
+	static void mod##_key_init_generic(struct lu_context_key *k, ...) \
+	{								\
+		struct lu_context_key *key = k;				\
+		va_list args;						\
+									\
+		va_start(args, k);					\
+		do {							\
+			LU_CONTEXT_KEY_INIT(key);			\
+			key = va_arg(args, struct lu_context_key *);	\
+		} while (key != NULL);					\
+		va_end(args);						\
+	}
+
+#define LU_TYPE_INIT(mod, ...)						\
+	LU_KEY_INIT_GENERIC(mod)					\
+	static int mod##_type_init(struct lu_device_type *t)		\
+	{								\
+		mod##_key_init_generic(__VA_ARGS__, NULL);		\
+		return lu_context_key_register_many(__VA_ARGS__, NULL);	\
+	}								\
+	struct __##mod##_dummy_type_init {;}
+
+#define LU_TYPE_FINI(mod, ...)						\
+	static void mod##_type_fini(struct lu_device_type *t)		\
+	{								\
+		lu_context_key_degister_many(__VA_ARGS__, NULL);	\
+	}								\
+	struct __##mod##_dummy_type_fini {;}
+
+#define LU_TYPE_START(mod, ...)
\ + static void mod##_type_start(struct lu_device_type *t) \ + { \ + lu_context_key_revive_many(__VA_ARGS__, NULL); \ + } \ + struct __##mod##_dummy_type_start {;} + +#define LU_TYPE_STOP(mod, ...) \ + static void mod##_type_stop(struct lu_device_type *t) \ + { \ + lu_context_key_quiesce_many(t, __VA_ARGS__, NULL); \ + } \ + struct __##mod##_dummy_type_stop { } + + + +#define LU_TYPE_INIT_FINI(mod, ...) \ + LU_TYPE_INIT(mod, __VA_ARGS__); \ + LU_TYPE_FINI(mod, __VA_ARGS__); \ + LU_TYPE_START(mod, __VA_ARGS__); \ + LU_TYPE_STOP(mod, __VA_ARGS__) + +int lu_context_init (struct lu_context *ctx, __u32 tags); +void lu_context_fini (struct lu_context *ctx); +void lu_context_enter (struct lu_context *ctx); +void lu_context_exit (struct lu_context *ctx); +int lu_context_refill(struct lu_context *ctx); + +/* + * Helper functions to operate on multiple keys. These are used by the default + * device type operations, defined by LU_TYPE_INIT_FINI(). + */ + +int lu_context_key_register_many(struct lu_context_key *k, ...); +void lu_context_key_degister_many(struct lu_context_key *k, ...); +void lu_context_key_revive_many (struct lu_context_key *k, ...); +void lu_context_key_quiesce_many(struct lu_device_type *t, + struct lu_context_key *k, ...); + +/* + * update/clear ctx/ses tags. + */ +void lu_context_tags_update(__u32 tags); +void lu_context_tags_clear(__u32 tags); +void lu_session_tags_update(__u32 tags); +void lu_session_tags_clear(__u32 tags); + +/** + * Environment. + */ +struct lu_env { + /** + * "Local" context, used to store data instead of stack. + */ + struct lu_context le_ctx; + /** + * "Session" context for per-request data. + */ + struct lu_context *le_ses; +}; + +int lu_env_init (struct lu_env *env, __u32 tags); +void lu_env_fini (struct lu_env *env); +int lu_env_refill(struct lu_env *env); +int lu_env_refill_by_tags(struct lu_env *env, __u32 ctags, __u32 stags); + +static inline void* lu_env_info(const struct lu_env *env, + const struct lu_context_key *key) +{ + void *info; + info = lu_context_key_get(&env->le_ctx, key); + if (!info) { + if (!lu_env_refill((struct lu_env *)env)) + info = lu_context_key_get(&env->le_ctx, key); + } + LASSERT(info); + return info; +} + +struct lu_env *lu_env_find(void); +int lu_env_add(struct lu_env *env); +int lu_env_add_task(struct lu_env *env, struct task_struct *task); +void lu_env_remove(struct lu_env *env); + +/** @} lu_context */ + +/** + * Output site statistical counters into a buffer. Suitable for + * ll_rd_*()-style functions. + */ +int lu_site_stats_seq_print(const struct lu_site *s, struct seq_file *m); + +/** + * Common name structure to be passed around for various name related methods. + */ +struct lu_name { + const char *ln_name; + int ln_namelen; +}; + +static inline bool name_is_dot_or_dotdot(const char *name, int namelen) +{ + return name[0] == '.' 
&& + (namelen == 1 || (namelen == 2 && name[1] == '.')); +} + +static inline bool lu_name_is_dot_or_dotdot(const struct lu_name *lname) +{ + return name_is_dot_or_dotdot(lname->ln_name, lname->ln_namelen); +} + +static inline bool lu_name_is_temp_file(const char *name, int namelen, + bool dot_prefix, int suffixlen) +{ + int lower = 0; + int upper = 0; + int digit = 0; + int len = suffixlen; + + if (dot_prefix && name[0] != '.') + return false; + + if (namelen < dot_prefix + suffixlen + 2 || + name[namelen - suffixlen - 1] != '.') + return false; + + while (len) { + lower += islower(name[namelen - len]); + upper += isupper(name[namelen - len]); + digit += isdigit(name[namelen - len]); + len--; + } + /* mktemp() filename suffixes will have a mix of upper- and lower-case + * letters and/or numbers, not all numbers, or all upper or lower-case. + * About 0.07% of randomly-generated names will slip through, + * but this avoids 99.93% of cross-MDT renames for those files. + */ + if ((digit >= suffixlen - 1 && !isdigit(name[namelen - suffixlen])) || + upper == suffixlen || lower == suffixlen) + return false; + + return true; +} + +static inline bool lu_name_is_backup_file(const char *name, int namelen, + int *suffixlen) +{ + if (namelen > 1 && + name[namelen - 2] != '.' && name[namelen - 1] == '~') { + if (suffixlen) + *suffixlen = 1; + return true; + } + + if (namelen > 4 && name[namelen - 4] == '.' && + (!strncasecmp(name + namelen - 3, "bak", 3) || + !strncasecmp(name + namelen - 3, "sav", 3))) { + if (suffixlen) + *suffixlen = 4; + return true; + } + + if (namelen > 5 && name[namelen - 5] == '.' && + !strncasecmp(name + namelen - 4, "orig", 4)) { + if (suffixlen) + *suffixlen = 5; + return true; + } + + return false; +} + +static inline bool lu_name_is_valid_len(const char *name, size_t name_len) +{ + return name != NULL && + name_len > 0 && + name_len < INT_MAX && + strlen(name) == name_len && + memchr(name, '/', name_len) == NULL; +} + +/** + * Validate names (path components) + * + * To be valid \a name must be non-empty, '\0' terminated of length \a + * name_len, and not contain '/'. The maximum length of a name (before + * say -ENAMETOOLONG will be returned) is really controlled by llite + * and the server. We only check for something insane coming from bad + * integer handling here. + */ +static inline bool lu_name_is_valid_2(const char *name, size_t name_len) +{ + return lu_name_is_valid_len(name, name_len) && name[name_len] == '\0'; +} + +static inline bool lu_name_is_valid(const struct lu_name *ln) +{ + return lu_name_is_valid_2(ln->ln_name, ln->ln_namelen); +} + +#define DNAME "%.*s" +#define PNAME(ln) \ + (lu_name_is_valid(ln) ? (ln)->ln_namelen : 0), \ + (lu_name_is_valid(ln) ? (ln)->ln_name : "") + +/** + * Common buffer structure to be passed around for various xattr_{s,g}et() + * methods. + */ +struct lu_buf { + void *lb_buf; + size_t lb_len; +}; + +#define DLUBUF "(%p %zu)" +#define PLUBUF(buf) (buf)->lb_buf, (buf)->lb_len + +/* read buffer params, should be filled out by out */ +struct lu_rdbuf { + /** number of buffers */ + unsigned int rb_nbufs; + /** pointers to buffers */ + struct lu_buf rb_bufs[]; +}; + +/** + * One-time initializers, called at obdclass module initialization, not + * exported. + */ + +/** + * Initialization of global lu_* data. + */ +int lu_global_init(void); + +/** + * Dual to lu_global_init(). 
+ */ +void lu_global_fini(void); + +struct lu_kmem_descr { + struct kmem_cache **ckd_cache; + const char *ckd_name; + const size_t ckd_size; +}; + +int lu_kmem_init(struct lu_kmem_descr *caches); +void lu_kmem_fini(struct lu_kmem_descr *caches); + +void lu_object_assign_fid(const struct lu_env *env, struct lu_object *o, + const struct lu_fid *fid); +struct lu_object *lu_object_anon(const struct lu_env *env, + struct lu_device *dev, + const struct lu_object_conf *conf); + +/** null buffer */ +extern struct lu_buf LU_BUF_NULL; + +void lu_buf_free(struct lu_buf *buf); +void lu_buf_alloc(struct lu_buf *buf, size_t size); +void lu_buf_realloc(struct lu_buf *buf, size_t size); + +int lu_buf_check_and_grow(struct lu_buf *buf, size_t len); +struct lu_buf *lu_buf_check_and_alloc(struct lu_buf *buf, size_t len); + +extern __u32 lu_context_tags_default; +extern __u32 lu_session_tags_default; + +static inline bool lu_device_is_cl(const struct lu_device *d) +{ + return d->ld_type->ldt_tags & LU_DEVICE_CL; +} + +static inline bool lu_object_is_cl(const struct lu_object *o) +{ + return lu_device_is_cl(o->lo_dev); +} + +/* Generic subset of tgts */ +struct lu_tgt_pool { + __u32 *op_array; /* array of index of + * lov_obd->lov_tgts + */ + unsigned int op_count; /* number of tgts in the array */ + unsigned int op_size; /* allocated size of op_array */ + struct rw_semaphore op_rw_sem; /* to protect lu_tgt_pool use */ +}; + +int lu_tgt_pool_init(struct lu_tgt_pool *op, unsigned int count); +int lu_tgt_pool_add(struct lu_tgt_pool *op, __u32 idx, unsigned int min_count); +int lu_tgt_pool_remove(struct lu_tgt_pool *op, __u32 idx); +void lu_tgt_pool_free(struct lu_tgt_pool *op); +int lu_tgt_check_index(int idx, struct lu_tgt_pool *osts); +int lu_tgt_pool_extend(struct lu_tgt_pool *op, unsigned int min_count); + +/* bitflags used in rr / qos allocation */ +enum lq_flag { + LQ_DIRTY = 0, /* recalc qos data */ + LQ_SAME_SPACE, /* the OSTs all have approx. 
+ * the same space avail */ + LQ_RESET, /* zero current penalties */ + LQ_SF_PROGRESS, /* statfs op in progress */ +}; + +#ifdef HAVE_SERVER_SUPPORT +/* round-robin QoS data for LOD/LMV */ +struct lu_qos_rr { + spinlock_t lqr_alloc; /* protect allocation index */ + atomic_t lqr_start_idx; /* start index of new inode */ + __u32 lqr_offset_idx;/* aliasing for start_idx */ + int lqr_start_count;/* reseed counter */ + struct lu_tgt_pool lqr_pool; /* round-robin optimized list */ + unsigned long lqr_flags; +}; + +static inline void lu_qos_rr_init(struct lu_qos_rr *lqr) +{ + spin_lock_init(&lqr->lqr_alloc); + set_bit(LQ_DIRTY, &lqr->lqr_flags); +} + +#endif /* HAVE_SERVER_SUPPORT */ + +/* QoS data per MDS/OSS */ +struct lu_svr_qos { + struct obd_uuid lsq_uuid; /* ptlrpc's c_remote_uuid */ + struct list_head lsq_svr_list; /* link to lq_svr_list */ + __u64 lsq_bavail; /* total bytes avail on svr */ + __u64 lsq_iavail; /* total inode avail on svr */ + __u64 lsq_penalty; /* current penalty */ + __u64 lsq_penalty_per_obj; /* penalty decrease + * every obj*/ + time64_t lsq_used; /* last used time, seconds */ + __u32 lsq_tgt_count; /* number of tgts on this svr */ + __u32 lsq_id; /* unique svr id */ +}; + +/* QoS data per MDT/OST */ +struct lu_tgt_qos { + struct lu_svr_qos *ltq_svr; /* svr info */ + __u64 ltq_penalty; /* current penalty */ + __u64 ltq_penalty_per_obj; /* penalty decrease + * every obj*/ + __u64 ltq_avail; /* bytes/inode avail */ + __u64 ltq_weight; /* net weighting */ + time64_t ltq_used; /* last used time, seconds */ + bool ltq_usable:1; /* usable for striping */ +}; + +/* target descriptor */ +#define LOV_QOS_DEF_THRESHOLD_RR_PCT 17 +#define LMV_QOS_DEF_THRESHOLD_RR_PCT 5 + +#define LOV_QOS_DEF_PRIO_FREE 90 +#define LMV_QOS_DEF_PRIO_FREE 90 + +struct lu_tgt_desc { + union { + struct dt_device *ltd_tgt; + struct obd_device *ltd_obd; + }; + struct obd_export *ltd_exp; + struct obd_uuid ltd_uuid; + __u32 ltd_index; + __u32 ltd_gen; + struct list_head ltd_kill; + struct task_struct *ltd_recovery_task; + struct mutex ltd_fid_mutex; + struct lu_tgt_qos ltd_qos; /* qos info per target */ + struct obd_statfs ltd_statfs; + time64_t ltd_statfs_age; + unsigned long ltd_active:1,/* is this target up for requests */ + ltd_activate:1,/* should target be activated */ + ltd_reap:1, /* should this target be deleted */ + ltd_got_update_log:1, /* Already got update log */ + ltd_connecting:1; /* target is connecting */ +}; + +/* number of pointers at 2nd level */ +#define TGT_PTRS_PER_BLOCK (PAGE_SIZE / sizeof(void *)) +/* number of pointers at 1st level - only need as many as max OST/MDT count */ +#define TGT_PTRS ((LOV_ALL_STRIPES + 1) / TGT_PTRS_PER_BLOCK) + +struct lu_tgt_desc_idx { + struct lu_tgt_desc *ldi_tgt[TGT_PTRS_PER_BLOCK]; +}; + + +/* QoS data for LOD/LMV */ +#define QOS_THRESHOLD_MAX 256 /* should be power of two */ +struct lu_qos { + struct list_head lq_svr_list; /* lu_svr_qos list */ + struct rw_semaphore lq_rw_sem; + __u32 lq_active_svr_count; + unsigned int lq_prio_free; /* priority for free space */ + unsigned int lq_threshold_rr;/* priority for rr */ +#ifdef HAVE_SERVER_SUPPORT + struct lu_qos_rr lq_rr; /* round robin qos data */ +#endif + unsigned long lq_flags; +#if 0 + unsigned long lq_dirty:1, /* recalc qos data */ + lq_same_space:1,/* the servers all have approx. 
+ * the same space avail */ + lq_reset:1; /* zero current penalties */ +#endif +}; + +struct lu_tgt_descs { + union { + struct lov_desc ltd_lov_desc; + struct lmv_desc ltd_lmv_desc; + }; + /* list of known TGTs */ + struct lu_tgt_desc_idx *ltd_tgt_idx[TGT_PTRS]; + /* Size of the lu_tgts array, granted to be a power of 2 */ + __u32 ltd_tgts_size; + /* bitmap of TGTs available */ + unsigned long *ltd_tgt_bitmap; + /* TGTs scheduled to be deleted */ + __u32 ltd_death_row; + /* Table refcount used for delayed deletion */ + int ltd_refcount; + /* mutex to serialize concurrent updates to the tgt table */ + struct mutex ltd_mutex; + /* read/write semaphore used for array relocation */ + struct rw_semaphore ltd_rw_sem; + /* QoS */ + struct lu_qos ltd_qos; + /* all tgts in a packed array */ + struct lu_tgt_pool ltd_tgt_pool; + /* true if tgt is MDT */ + bool ltd_is_mdt; +}; + +#define LTD_TGT(ltd, index) \ + (ltd)->ltd_tgt_idx[(index) / TGT_PTRS_PER_BLOCK]-> \ + ldi_tgt[(index) % TGT_PTRS_PER_BLOCK] + +u64 lu_prandom_u64_max(u64 ep_ro); +int lu_qos_add_tgt(struct lu_qos *qos, struct lu_tgt_desc *ltd); +void lu_tgt_qos_weight_calc(struct lu_tgt_desc *tgt); + +int lu_tgt_descs_init(struct lu_tgt_descs *ltd, bool is_mdt); +void lu_tgt_descs_fini(struct lu_tgt_descs *ltd); +int ltd_add_tgt(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt); +void ltd_del_tgt(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt); +int ltd_qos_penalties_calc(struct lu_tgt_descs *ltd); +int ltd_qos_update(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt, + __u64 *total_wt); + +/** + * Whether MDT inode and space usages are balanced. + */ +static inline bool ltd_qos_is_balanced(struct lu_tgt_descs *ltd) +{ + return !test_bit(LQ_DIRTY, <d->ltd_qos.lq_flags) && + test_bit(LQ_SAME_SPACE, <d->ltd_qos.lq_flags); +} + +/** + * Whether QoS data is up-to-date and QoS can be applied. + */ +static inline bool ltd_qos_is_usable(struct lu_tgt_descs *ltd) +{ + if (ltd_qos_is_balanced(ltd)) + return false; + + if (ltd->ltd_lov_desc.ld_active_tgt_count < 2) + return false; + + return true; +} + +static inline struct lu_tgt_desc *ltd_first_tgt(struct lu_tgt_descs *ltd) +{ + int index; + + index = find_first_bit(ltd->ltd_tgt_bitmap, + ltd->ltd_tgts_size); + return (index < ltd->ltd_tgts_size) ? LTD_TGT(ltd, index) : NULL; +} + +static inline struct lu_tgt_desc *ltd_next_tgt(struct lu_tgt_descs *ltd, + struct lu_tgt_desc *tgt) +{ + int index; + + if (!tgt) + return NULL; + + index = tgt->ltd_index; + LASSERT(index < ltd->ltd_tgts_size); + index = find_next_bit(ltd->ltd_tgt_bitmap, + ltd->ltd_tgts_size, index + 1); + return (index < ltd->ltd_tgts_size) ? LTD_TGT(ltd, index) : NULL; +} + +#define ltd_foreach_tgt(ltd, tgt) \ + for (tgt = ltd_first_tgt(ltd); tgt; tgt = ltd_next_tgt(ltd, tgt)) + +#define ltd_foreach_tgt_safe(ltd, tgt, tmp) \ + for (tgt = ltd_first_tgt(ltd), tmp = ltd_next_tgt(ltd, tgt); tgt; \ + tgt = tmp, tmp = ltd_next_tgt(ltd, tgt)) + +/** @} lu */ +#endif /* __LUSTRE_LU_OBJECT_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lu_ref.h b/drivers/staging/lustrefsx/lustre/include/lu_ref.h new file mode 100644 index 0000000000000..7b368c297ff13 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lu_ref.h @@ -0,0 +1,256 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation. 
+ * + * Author: Nikita Danilov + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifndef __LUSTRE_LU_REF_H +#define __LUSTRE_LU_REF_H + +#include + +/** \defgroup lu_ref lu_ref + * + * An interface to track references between objects. Mostly for debugging. + * + * Suppose there is a reference counted data-structure struct foo. To track + * who acquired references to instance of struct foo, add lu_ref field to it: + * + * \code + * struct foo { + * atomic_t foo_refcount; + * struct lu_ref foo_reference; + * ... + * }; + * \endcode + * + * foo::foo_reference has to be initialized by calling + * lu_ref_init(). Typically there will be functions or macros to increment and + * decrement foo::foo_refcount, let's say they are foo_get(struct foo *foo) + * and foo_put(struct foo *foo), respectively. + * + * Whenever foo_get() is called to acquire a reference on a foo, lu_ref_add() + * has to be called to insert into foo::foo_reference a record, describing + * acquired reference. Dually, lu_ref_del() removes matching record. Typical + * usages are: + * + * \code + * struct bar *bar; + * + * // bar owns a reference to foo. + * bar->bar_foo = foo_get(foo); + * lu_ref_add(&foo->foo_reference, "bar", bar); + * + * ... + * + * // reference from bar to foo is released. + * lu_ref_del(&foo->foo_reference, "bar", bar); + * foo_put(bar->bar_foo); + * + * + * // current thread acquired a temporary reference to foo. + * foo_get(foo); + * lu_ref_add(&foo->reference, __func__, current); + * + * ... + * + * // temporary reference is released. + * lu_ref_del(&foo->reference, __func__, current); + * foo_put(foo); + * \endcode + * + * \e Et \e cetera. Often it makes sense to include lu_ref_add() and + * lu_ref_del() calls into foo_get() and foo_put(). When an instance of struct + * foo is destroyed, lu_ref_fini() has to be called that checks that no + * pending references remain. lu_ref_print() can be used to dump a list of + * pending references, while hunting down a leak. + * + * For objects to which a large number of references can be acquired, + * lu_ref_del() can become cpu consuming, as it has to scan the list of + * references. To work around this, remember result of lu_ref_add() (usually + * in the same place where pointer to struct foo is stored), and use + * lu_ref_del_at(): + * + * \code + * // There is a large number of bar's for a single foo. + * bar->bar_foo = foo_get(foo); + * bar->bar_foo_ref = lu_ref_add(&foo->foo_reference, "bar", bar); + * + * ... + * + * // reference from bar to foo is released. + * lu_ref_del_at(&foo->foo_reference, bar->bar_foo_ref, "bar", bar); + * foo_put(bar->bar_foo); + * \endcode + * + * lu_ref interface degrades gracefully in case of memory shortages. + * + * @{ + */ + +#ifdef CONFIG_LUSTRE_DEBUG_LU_REF + +/** + * Data-structure to keep track of references to a given object. This is used + * for debugging. + * + * lu_ref is embedded into an object which other entities (objects, threads, + * etc.) refer to. + */ +struct lu_ref { + /** + * Spin-lock protecting lu_ref::lf_list. 
+ */ + spinlock_t lf_guard; + /** + * List of all outstanding references (each represented by struct + * lu_ref_link), pointing to this object. + */ + struct list_head lf_list; + /** + * # of links. + */ + short lf_refs; + /** + * Flag set when lu_ref_add() failed to allocate lu_ref_link. It is + * used to mask spurious failure of the following lu_ref_del(). + */ + short lf_failed; + /** + * flags - attribute for the lu_ref, for pad and future use. + */ + short lf_flags; + /** + * Where was I initialized? + */ + short lf_line; + const char *lf_func; + /** + * Linkage into a global list of all lu_ref's (lu_ref_refs). + */ + struct list_head lf_linkage; +}; + +struct lu_ref_link { + struct lu_ref *ll_ref; + struct list_head ll_linkage; + const char *ll_scope; + const void *ll_source; +}; + +void lu_ref_init_loc(struct lu_ref *ref, const char *func, const int line); +void lu_ref_fini(struct lu_ref *ref); +#define lu_ref_init(ref) lu_ref_init_loc(ref, __func__, __LINE__) + +void lu_ref_add(struct lu_ref *ref, const char *scope, const void *source); + +void lu_ref_add_atomic(struct lu_ref *ref, const char *scope, + const void *source); + +void lu_ref_add_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source); + +void lu_ref_del(struct lu_ref *ref, const char *scope, const void *source); + +void lu_ref_set_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source0, const void *source1); + +void lu_ref_del_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source); + +void lu_ref_print(const struct lu_ref *ref); + +void lu_ref_print_all(void); + +int lu_ref_global_init(void); + +void lu_ref_global_fini(void); + +#else /* !CONFIG_LUSTRE_DEBUG_LU_REF */ + +struct lu_ref { +}; + +struct lu_ref_link { +}; + +static inline void lu_ref_init(struct lu_ref *ref) +{ +} + +static inline void lu_ref_fini(struct lu_ref *ref) +{ +} + +static inline void lu_ref_add(struct lu_ref *ref, + const char *scope, + const void *source) +{ +} + +static inline void lu_ref_add_atomic(struct lu_ref *ref, + const char *scope, + const void *source) +{ +} + +static inline void lu_ref_add_at(struct lu_ref *ref, + struct lu_ref_link *link, + const char *scope, + const void *source) +{ +} + +static inline void lu_ref_del(struct lu_ref *ref, const char *scope, + const void *source) +{ +} + +static inline void lu_ref_set_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source0, + const void *source1) +{ +} + +static inline void lu_ref_del_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source) +{ +} + +static inline int lu_ref_global_init(void) +{ + return 0; +} + +static inline void lu_ref_global_fini(void) +{ +} + +static inline void lu_ref_print(const struct lu_ref *ref) +{ +} + +static inline void lu_ref_print_all(void) +{ +} +#endif /* CONFIG_LUSTRE_DEBUG_LU_REF */ + +/** @} lu */ + +#endif /* __LUSTRE_LU_REF_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lu_target.h b/drivers/staging/lustrefsx/lustre/include/lu_target.h new file mode 100644 index 0000000000000..d061244d22322 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lu_target.h @@ -0,0 +1,740 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ */
+
+#ifndef _LUSTRE_LU_TARGET_H
+#define _LUSTRE_LU_TARGET_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+/* Each one represents a distribute transaction replay
+ * operation, and updates on each MDT are linked to
+ * dtr_sub_list */
+struct distribute_txn_replay_req {
+	/* update record, may be vmalloc'd */
+	struct llog_update_record *dtrq_lur;
+	int			dtrq_lur_size;
+
+	/* linked to the distribute transaction replay
+	 * list (tdtd_replay_list) */
+	struct list_head	dtrq_list;
+	__u64			dtrq_master_transno;
+	__u64			dtrq_batchid;
+	__u64			dtrq_xid;
+
+	/* all of sub updates are linked here */
+	struct list_head	dtrq_sub_list;
+	spinlock_t		dtrq_sub_list_lock;
+
+	/* If the local update has been executed during replay */
+	__u32			dtrq_local_update_executed:1;
+};
+
+/* Each one represents a sub replay item under a distribute
+ * transaction.
A distribute transaction will be operated in + * two or more MDTs, and updates on each MDT will be represented + * by this structure */ +struct distribute_txn_replay_req_sub { + __u32 dtrqs_mdt_index; + + /* All of cookies for the update will be linked here */ + spinlock_t dtrqs_cookie_list_lock; + struct list_head dtrqs_cookie_list; + struct list_head dtrqs_list; +}; + +struct target_distribute_txn_data; +typedef int (*distribute_txn_replay_handler_t)(struct lu_env *env, + struct target_distribute_txn_data *tdtd, + struct distribute_txn_replay_req *dtrq); +typedef char *(*target_show_update_logs_retrievers_t)(void *data, int *size, + int *count); +struct target_distribute_txn_data { + /* Distribution ID is used to identify updates log on different + * MDTs for one operation */ + spinlock_t tdtd_batchid_lock; + __u64 tdtd_batchid; + struct lu_target *tdtd_lut; + struct dt_object *tdtd_batchid_obj; + struct dt_device *tdtd_dt; + + /* Committed batchid for distribute transaction */ + __u64 tdtd_committed_batchid; + + /* List for distribute transaction */ + struct list_head tdtd_list; + + /* Threads to manage distribute transaction */ + struct task_struct *tdtd_commit_task; + atomic_t tdtd_refcount; + struct lu_env tdtd_env; + + /* recovery update */ + distribute_txn_replay_handler_t tdtd_replay_handler; + struct list_head tdtd_replay_list; + struct list_head tdtd_replay_finish_list; + spinlock_t tdtd_replay_list_lock; + /* last replay update transno */ + __u32 tdtd_replay_ready:1; + + /* Manage the llog recovery threads */ + atomic_t tdtd_recovery_threads_count; + wait_queue_head_t tdtd_recovery_threads_waitq; + target_show_update_logs_retrievers_t + tdtd_show_update_logs_retrievers; + void *tdtd_show_retrievers_cbdata; +}; + +struct tg_grants_data { + /* grants: all values in bytes */ + /* grant lock to protect all grant counters */ + spinlock_t tgd_grant_lock; + /* total amount of dirty data reported by clients in incoming obdo */ + u64 tgd_tot_dirty; + /* sum of filesystem space granted to clients for async writes */ + u64 tgd_tot_granted; + /* grant used by I/Os in progress (between prepare and commit) */ + u64 tgd_tot_pending; + /* amount of available space in percentage that is never used for + * grants, used on MDT to always keep space for metadata. */ + u64 tgd_reserved_pcnt; + /* number of clients using grants */ + int tgd_tot_granted_clients; + /* shall we grant space to clients not + * supporting OBD_CONNECT_GRANT_PARAM? */ + int tgd_grant_compat_disable; + /* protect all statfs-related counters */ + spinlock_t tgd_osfs_lock; + time64_t tgd_osfs_age; + int tgd_blockbits; + /* counters used during statfs update, protected by ofd_osfs_lock. + * record when some statfs refresh are in progress */ + int tgd_statfs_inflight; + /* writes between prep & commit which might be accounted twice in + * ofd_osfs.os_bavail */ + u64 tgd_osfs_unstable; + /* track writes completed while statfs refresh is underway. 
+ * tracking is only effective when ofd_statfs_inflight > 1 */ + u64 tgd_osfs_inflight; + /* statfs optimization: we cache a bit */ + struct obd_statfs tgd_osfs; +}; + +struct lu_target { + struct obd_device *lut_obd; + struct dt_device *lut_bottom; + struct dt_device_param lut_dt_conf; + + struct target_distribute_txn_data *lut_tdtd; + + /* supported opcodes and handlers for this target */ + struct tgt_opc_slice *lut_slice; + __u32 lut_reply_fail_id; + __u32 lut_request_fail_id; + + /* sptlrpc rules */ + rwlock_t lut_sptlrpc_lock; + struct sptlrpc_rule_set lut_sptlrpc_rset; + spinlock_t lut_flags_lock; + unsigned int lut_syncjournal:1, + lut_sync_lock_cancel:2, + /* e.g. OST node */ + lut_no_reconstruct:1, + /* enforce recovery for local clients */ + lut_local_recovery:1, + lut_cksum_t10pi_enforce:1; + /* checksum types supported on this node */ + enum cksum_types lut_cksum_types_supported; + /** last_rcvd file */ + struct dt_object *lut_last_rcvd; + /* transaction callbacks */ + struct dt_txn_callback lut_txn_cb; + /** server data in last_rcvd file */ + struct lr_server_data lut_lsd; + /** Server last transaction number */ + __u64 lut_last_transno; + /** Lock protecting last transaction number */ + spinlock_t lut_translock; + /** Lock protecting client bitmap */ + spinlock_t lut_client_bitmap_lock; + /** Bitmap of known clients */ + unsigned long *lut_client_bitmap; + /* Number of clients supporting multiple modify RPCs + * recorded in the last_rcvd file + */ + atomic_t lut_num_clients; + /* Client generation to identify client slot reuse */ + atomic_t lut_client_generation; + /** reply_data file */ + struct dt_object *lut_reply_data; + /** Bitmap of used slots in the reply data file */ + unsigned long **lut_reply_bitmap; + /** target sync count, used for debug & test */ + atomic_t lut_sync_count; + + /** cross MDT locks which should trigger Sync-on-Lock-Cancel */ + spinlock_t lut_slc_locks_guard; + struct list_head lut_slc_locks; + + /* target grants fields */ + struct tg_grants_data lut_tgd; + + /* target tunables */ + const struct attribute **lut_attrs; + + /* FMD (file modification data) values */ + int lut_fmd_max_num; + time64_t lut_fmd_max_age; +}; + +#define LUT_FMD_MAX_NUM_DEFAULT 128 +#define LUT_FMD_MAX_AGE_DEFAULT (obd_timeout + 10) + +/* number of slots in reply bitmap */ +#define LUT_REPLY_SLOTS_PER_CHUNK (1<<20) +#define LUT_REPLY_SLOTS_MAX_CHUNKS 16 + +#define TRD_INDEX_MEMORY -1 + +/** + * Target reply data + */ +struct tg_reply_data { + /** chain of reply data anchored in tg_export_data */ + struct list_head trd_list; + /** copy of on-disk reply data */ + struct lsd_reply_data trd_reply; + /** versions for Version Based Recovery */ + __u64 trd_pre_versions[4]; + /** slot index in reply_data file */ + int trd_index; + /** tag the client used */ + __u16 trd_tag; + /** child fid to reconstruct open */ + struct lu_fid trd_object; +}; + +extern struct lu_context_key tgt_session_key; + +struct tgt_session_info { + /* + * The following members will be filled explicitly + * with specific data in tgt_ses_init(). + */ + struct req_capsule *tsi_pill; + + /* + * Lock request for "habeo clavis" operations. + */ + struct ldlm_request *tsi_dlm_req; + + /* although we have export in req, there are cases when it is not + * available, e.g. 
closing files upon export destroy */ + struct obd_export *tsi_exp; + const struct lu_env *tsi_env; + struct lu_target *tsi_tgt; + + const struct mdt_body *tsi_mdt_body; + struct ost_body *tsi_ost_body; + struct lu_object *tsi_corpus; + + struct lu_fid tsi_fid; + struct ldlm_res_id tsi_resid; + + /* object affected by VBR, for last_rcvd_update */ + struct dt_object *tsi_vbr_obj; + /* open child object, for last_rcvd_update */ + struct dt_object *tsi_open_obj; + /* opdata for mdt_reint_open(), has the same value as + * ldlm_reply:lock_policy_res1. The tgt_update_last_rcvd() stores + * this value onto disk for recovery when tgt_txn_stop_cb() is called. + */ + __u64 tsi_opdata; + + /* + * Additional fail id that can be set by handler. + */ + int tsi_reply_fail_id; + bool tsi_preprocessed; + /* request JobID */ + char *tsi_jobid; + + /* update replay */ + __u64 tsi_xid; + __u32 tsi_result; + __u32 tsi_client_gen; +}; + +static inline struct tgt_session_info *tgt_ses_info(const struct lu_env *env) +{ + struct tgt_session_info *tsi; + + LASSERT(env->le_ses != NULL); + tsi = lu_context_key_get(env->le_ses, &tgt_session_key); + LASSERT(tsi); + return tsi; +} + +static inline void tgt_vbr_obj_set(const struct lu_env *env, + struct dt_object *obj) +{ + struct tgt_session_info *tsi; + + if (env->le_ses != NULL) { + tsi = tgt_ses_info(env); + tsi->tsi_vbr_obj = obj; + } +} + +static inline void tgt_open_obj_set(const struct lu_env *env, + struct dt_object *obj) +{ + struct tgt_session_info *tsi; + + if (env->le_ses != NULL) { + tsi = tgt_ses_info(env); + tsi->tsi_open_obj = obj; + } +} + +static inline void tgt_opdata_set(const struct lu_env *env, __u64 flags) +{ + struct tgt_session_info *tsi; + + if (env->le_ses != NULL) { + tsi = tgt_ses_info(env); + tsi->tsi_opdata |= flags; + } +} + +static inline void tgt_opdata_clear(const struct lu_env *env, __u64 flags) +{ + struct tgt_session_info *tsi; + + if (env->le_ses != NULL) { + tsi = tgt_ses_info(env); + tsi->tsi_opdata &= ~flags; + } +} + +/* + * Generic unified target support. + */ +enum tgt_handler_flags { + /* + * struct *_body is passed in the incoming message, and object + * identified by this fid exists on disk. + */ + HAS_BODY = BIT(0), + /* + * struct ldlm_request is passed in the incoming message. + */ + HAS_KEY = BIT(1), + /* + * this request has fixed reply format, so that reply message can be + * packed by generic code. + */ + HAS_REPLY = BIT(2), + /* + * this request will modify something, so check whether the file system + * is readonly or not, then return -EROFS to client asap if necessary. + */ + IS_MUTABLE = BIT(3) +}; + +struct tgt_handler { + /* The name of this handler. */ + const char *th_name; + /* Fail id, check at the beginning */ + int th_fail_id; + /* Operation code */ + __u32 th_opc; + /* Flags in enum tgt_handler_flags */ + __u32 th_flags; + /* Request version for this opcode */ + enum lustre_msg_version th_version; + /* Handler function */ + int (*th_act)(struct tgt_session_info *tsi); + /* Handler function for high priority requests */ + void (*th_hp)(struct tgt_session_info *tsi); + /* Request format for this request */ + const struct req_format *th_fmt; +}; + +struct tgt_opc_slice { + __u32 tos_opc_start; /* First op code */ + __u32 tos_opc_end; /* Last op code */ + struct tgt_handler *tos_hs; /* Registered handler */ +}; + +static inline struct ptlrpc_request *tgt_ses_req(struct tgt_session_info *tsi) +{ + return tsi->tsi_pill ? 
tsi->tsi_pill->rc_req : NULL; +} + +static inline __u64 tgt_conn_flags(struct tgt_session_info *tsi) +{ + LASSERT(tsi->tsi_exp); + return exp_connect_flags(tsi->tsi_exp); +} + +static inline int req_is_replay(struct ptlrpc_request *req) +{ + LASSERT(req->rq_reqmsg); + return !!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY); +} + +static inline bool tgt_is_multimodrpcs_client(struct obd_export *exp) +{ + return exp_connect_flags(exp) & OBD_CONNECT_MULTIMODRPCS; +} + +static inline bool tgt_is_increasing_xid_client(struct obd_export *exp) +{ + return exp_connect_flags2(exp) & OBD_CONNECT2_INC_XID; +} + +/* target/tgt_handler.c */ +int tgt_request_handle(struct ptlrpc_request *req); +char *tgt_name(struct lu_target *tgt); +void tgt_counter_incr(struct obd_export *exp, int opcode); +int tgt_connect_check_sptlrpc(struct ptlrpc_request *req, + struct obd_export *exp); +int tgt_adapt_sptlrpc_conf(struct lu_target *tgt); +int tgt_connect(struct tgt_session_info *tsi); +int tgt_disconnect(struct tgt_session_info *uti); +int tgt_obd_ping(struct tgt_session_info *tsi); +int tgt_enqueue(struct tgt_session_info *tsi); +int tgt_convert(struct tgt_session_info *tsi); +int tgt_bl_callback(struct tgt_session_info *tsi); +int tgt_cp_callback(struct tgt_session_info *tsi); +int tgt_llog_open(struct tgt_session_info *tsi); +int tgt_llog_read_header(struct tgt_session_info *tsi); +int tgt_llog_next_block(struct tgt_session_info *tsi); +int tgt_llog_prev_block(struct tgt_session_info *tsi); +int tgt_sec_ctx_init(struct tgt_session_info *tsi); +int tgt_sec_ctx_init_cont(struct tgt_session_info *tsi); +int tgt_sec_ctx_fini(struct tgt_session_info *tsi); +int tgt_sendpage(struct tgt_session_info *tsi, struct lu_rdpg *rdpg, int nob); +int tgt_send_buffer(struct tgt_session_info *tsi, struct lu_rdbuf *rdbuf); +int tgt_validate_obdo(struct tgt_session_info *tsi, struct obdo *oa); +int tgt_sync(const struct lu_env *env, struct lu_target *tgt, + struct dt_object *obj, __u64 start, __u64 end); + +int tgt_io_thread_init(struct ptlrpc_thread *thread); +void tgt_io_thread_done(struct ptlrpc_thread *thread); + +int tgt_mdt_data_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id, + struct lustre_handle *lh, int mode, __u64 *flags); +int tgt_extent_lock(const struct lu_env *env, struct ldlm_namespace *ns, + struct ldlm_res_id *res_id, __u64 start, __u64 end, + struct lustre_handle *lh, int mode, __u64 *flags); +void tgt_data_unlock(struct lustre_handle *lh, enum ldlm_mode mode); +int tgt_brw_read(struct tgt_session_info *tsi); +int tgt_brw_write(struct tgt_session_info *tsi); +int tgt_lseek(struct tgt_session_info *tsi); +int tgt_hpreq_handler(struct ptlrpc_request *req); +void tgt_register_lfsck_in_notify_local(int (*notify)(const struct lu_env *, + struct dt_device *, + struct lfsck_req_local *, + struct thandle *)); +void tgt_register_lfsck_in_notify(int (*notify)(const struct lu_env *, + struct dt_device *, + struct lfsck_request *)); +void tgt_register_lfsck_query(int (*query)(const struct lu_env *, + struct dt_device *, + struct lfsck_request *, + struct lfsck_reply *, + struct lfsck_query *)); +int req_can_reconstruct(struct ptlrpc_request *req, struct tg_reply_data *trd); + +extern struct tgt_handler tgt_sec_ctx_handlers[]; +extern struct tgt_handler tgt_lfsck_handlers[]; +extern struct tgt_handler tgt_obd_handlers[]; +extern struct tgt_handler tgt_dlm_handlers[]; +extern struct tgt_handler tgt_llog_handlers[]; +extern struct tgt_handler tgt_out_handlers[]; +extern struct tgt_handler fld_handlers[]; 
+extern struct tgt_handler seq_handlers[]; + +typedef void (*tgt_cb_t)(struct lu_target *lut, __u64 transno, + void *data, int err); +struct tgt_commit_cb { + tgt_cb_t tgt_cb_func; + void *tgt_cb_data; +}; + +int tgt_hpreq_handler(struct ptlrpc_request *req); + +/* target/tgt_main.c */ +void tgt_boot_epoch_update(struct lu_target *lut); +void tgt_save_slc_lock(struct lu_target *lut, struct ldlm_lock *lock, + __u64 transno); +void tgt_discard_slc_lock(struct lu_target *lut, struct ldlm_lock *lock); +int tgt_init(const struct lu_env *env, struct lu_target *lut, + struct obd_device *obd, struct dt_device *dt, + struct tgt_opc_slice *slice, + int request_fail_id, int reply_fail_id); +void tgt_fini(const struct lu_env *env, struct lu_target *lut); +int tgt_client_alloc(struct obd_export *exp); +void tgt_client_free(struct obd_export *exp); +int tgt_client_del(const struct lu_env *env, struct obd_export *exp); +int tgt_client_add(const struct lu_env *env, struct obd_export *exp, int); +int tgt_client_new(const struct lu_env *env, struct obd_export *exp); +int tgt_server_data_update(const struct lu_env *env, struct lu_target *tg, + int sync); +int tgt_reply_data_init(const struct lu_env *env, struct lu_target *tgt); +int tgt_lookup_reply(struct ptlrpc_request *req, struct tg_reply_data *trd); +int tgt_mk_reply_data(const struct lu_env *env, struct lu_target *tgt, + struct tg_export_data *ted, struct ptlrpc_request *req, + __u64 opdata, struct thandle *th, bool write_update, + __u64 transno); +struct tg_reply_data *tgt_lookup_reply_by_xid(struct tg_export_data *ted, + __u64 xid); +int tgt_tunables_init(struct lu_target *lut); +void tgt_tunables_fini(struct lu_target *lut); +void tgt_mask_cksum_types(struct lu_target *lut, enum cksum_types *cksum_types); + +/* target/tgt_grant.c */ +static inline int exp_grant_param_supp(struct obd_export *exp) +{ + return !!(exp_connect_flags(exp) & OBD_CONNECT_GRANT_PARAM); +} + +/* Blocksize used for client not supporting OBD_CONNECT_GRANT_PARAM. + * That's 4KB=2^12 which is the biggest block size known to work whatever + * the client's page size is. 
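+ * As a quick worked check: 1 << COMPAT_BSIZE_SHIFT == 4096 bytes, so
+ * grant arithmetic for such clients assumes 4096-byte blocks regardless
+ * of the backend filesystem block size (our reading of the comment
+ * above, not a guarantee stated elsewhere in this header).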
*/ +#define COMPAT_BSIZE_SHIFT 12 + +void tgt_grant_sanity_check(struct obd_device *obd, const char *func); +void tgt_grant_connect(const struct lu_env *env, struct obd_export *exp, + struct obd_connect_data *data, bool new_conn); +void tgt_grant_discard(struct obd_export *exp); +void tgt_grant_prepare_read(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa); +void tgt_grant_prepare_write(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, struct niobuf_remote *rnb, + int niocount); +void tgt_grant_commit(struct obd_export *exp, unsigned long grant_used, int rc); +int tgt_grant_commit_cb_add(struct thandle *th, struct obd_export *exp, + unsigned long grant); +long tgt_grant_create(const struct lu_env *env, struct obd_export *exp, + s64 *nr); +int tgt_statfs_internal(const struct lu_env *env, struct lu_target *lut, + struct obd_statfs *osfs, time64_t max_age, + int *from_cache); +ssize_t tot_dirty_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t tot_granted_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t tot_pending_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t grant_compat_disable_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t grant_compat_disable_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count); +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 16, 53, 0) +ssize_t sync_lock_cancel_show(struct kobject *kobj, + struct attribute *attr, char *buf); +ssize_t sync_lock_cancel_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count); +#endif + +/* FMD */ +void tgt_fmd_update(struct obd_export *exp, const struct lu_fid *fid, + __u64 xid); +bool tgt_fmd_check(struct obd_export *exp, const struct lu_fid *fid, + __u64 xid); +#ifdef DO_FMD_DROP +void tgt_fmd_drop(struct obd_export *exp, const struct lu_fid *fid); +#else +#define tgt_fmd_drop(exp, fid) do {} while (0) +#endif + +/* target/update_trans.c */ +int distribute_txn_init(const struct lu_env *env, + struct lu_target *lut, + struct target_distribute_txn_data *tdtd, + __u32 index); +void distribute_txn_fini(const struct lu_env *env, + struct target_distribute_txn_data *tdtd); + +/* target/update_recovery.c */ +int insert_update_records_to_replay_list(struct target_distribute_txn_data *, + struct llog_update_record *, + struct llog_cookie *, __u32); +void dtrq_list_dump(struct target_distribute_txn_data *tdtd, + unsigned int mask); +void dtrq_list_destroy(struct target_distribute_txn_data *tdtd); +int distribute_txn_replay_handle(struct lu_env *env, + struct target_distribute_txn_data *tdtd, + struct distribute_txn_replay_req *dtrq); +__u64 distribute_txn_get_next_transno(struct target_distribute_txn_data *tdtd); +struct distribute_txn_replay_req * +distribute_txn_get_next_req(struct target_distribute_txn_data *tdtd); +void dtrq_destroy(struct distribute_txn_replay_req *dtrq); +struct distribute_txn_replay_req_sub * +dtrq_sub_lookup(struct distribute_txn_replay_req *dtrq, __u32 mdt_index); +struct distribute_txn_replay_req * +distribute_txn_lookup_finish_list(struct target_distribute_txn_data *tdtd, + __u64 transno); +bool is_req_replayed_by_update(struct ptlrpc_request *req); +enum { + ESERIOUS = 0x0001000 +}; + +static inline int err_serious(int rc) +{ + LASSERT(rc < 0); + return -(-rc | ESERIOUS); +} + +static inline int clear_serious(int rc) +{ + if (rc < 0) + rc = -(-rc & ~ESERIOUS); + return rc; +} + +static inline int is_serious(int 
rc)
+{
+	return (rc < 0 && -rc & ESERIOUS);
+}
+
+/*
+ * Unified target generic handler macros and generic functions.
+ */
+#define TGT_RPC_HANDLER_HP(base, flags, opc, fn, hp, fmt, version)	\
+[opc - base] = {							\
+	.th_name	= #opc,						\
+	.th_fail_id	= OBD_FAIL_ ## opc ## _NET,			\
+	.th_opc		= opc,						\
+	.th_flags	= flags,					\
+	.th_act		= fn,						\
+	.th_fmt		= fmt,						\
+	.th_version	= version,					\
+	.th_hp		= hp,						\
+}
+#define TGT_RPC_HANDLER(base, flags, opc, fn, fmt, version)		\
+	TGT_RPC_HANDLER_HP(base, flags, opc, fn, NULL, fmt, version)
+
+/* MDT Request with a format known in advance */
+#define TGT_MDT_HDL(flags, name, fn)					\
+	TGT_RPC_HANDLER(MDS_FIRST_OPC, flags, name, fn, &RQF_ ## name,	\
+			LUSTRE_MDS_VERSION)
+/* Request with a format we do not yet know */
+#define TGT_MDT_HDL_VAR(flags, name, fn)				\
+	TGT_RPC_HANDLER(MDS_FIRST_OPC, flags, name, fn, NULL,		\
+			LUSTRE_MDS_VERSION)
+
+/* OST Request with a format known in advance */
+#define TGT_OST_HDL(flags, name, fn)					\
+	TGT_RPC_HANDLER(OST_FIRST_OPC, flags, name, fn, &RQF_ ## name,	\
+			LUSTRE_OST_VERSION)
+#define TGT_OST_HDL_HP(flags, name, fn, hp)				\
+	TGT_RPC_HANDLER_HP(OST_FIRST_OPC, flags, name, fn, hp,		\
+			   &RQF_ ## name, LUSTRE_OST_VERSION)
+
+/* MGS request with a format known in advance */
+#define TGT_MGS_HDL(flags, name, fn)					\
+	TGT_RPC_HANDLER(MGS_FIRST_OPC, flags, name, fn, &RQF_ ## name,	\
+			LUSTRE_MGS_VERSION)
+#define TGT_MGS_HDL_VAR(flags, name, fn)				\
+	TGT_RPC_HANDLER(MGS_FIRST_OPC, flags, name, fn, NULL,		\
+			LUSTRE_MGS_VERSION)
+
+/*
+ * OBD handler macros and generic functions.
+ */
+#define TGT_OBD_HDL(flags, name, fn)					\
+	TGT_RPC_HANDLER(OBD_FIRST_OPC, flags, name, fn, &RQF_ ## name,	\
+			LUSTRE_OBD_VERSION)
+#define TGT_OBD_HDL_VAR(flags, name, fn)				\
+	TGT_RPC_HANDLER(OBD_FIRST_OPC, flags, name, fn, NULL,		\
+			LUSTRE_OBD_VERSION)
+
+/*
+ * DLM handler macros and generic functions.
+ */
+#define TGT_DLM_HDL_VAR(flags, name, fn)				\
+	TGT_RPC_HANDLER(LDLM_FIRST_OPC, flags, name, fn, NULL,		\
+			LUSTRE_DLM_VERSION)
+#define TGT_DLM_HDL(flags, name, fn)					\
+	TGT_RPC_HANDLER(LDLM_FIRST_OPC, flags, name, fn, &RQF_ ## name,	\
+			LUSTRE_DLM_VERSION)
+
+/*
+ * LLOG handler macros and generic functions.
+ */
+#define TGT_LLOG_HDL_VAR(flags, name, fn)				\
+	TGT_RPC_HANDLER(LLOG_FIRST_OPC, flags, name, fn, NULL,		\
+			LUSTRE_LOG_VERSION)
+#define TGT_LLOG_HDL(flags, name, fn)					\
+	TGT_RPC_HANDLER(LLOG_FIRST_OPC, flags, name, fn, &RQF_ ## name,	\
+			LUSTRE_LOG_VERSION)
+
+/*
+ * Sec context handler macros and generic functions.
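+ *
+ * A sketch of how these handler macros are typically used to populate a
+ * tgt_handler table (the pairing of opcode and handler below is
+ * illustrative; the real tables live in the target .c files):
+ *
+ *	struct tgt_handler tgt_sec_ctx_handlers[] = {
+ *		TGT_SEC_HDL_VAR(0, SEC_CTX_INIT, tgt_sec_ctx_init),
+ *	};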
+ */
+#define TGT_SEC_HDL_VAR(flags, name, fn)				\
+	TGT_RPC_HANDLER(SEC_FIRST_OPC, flags, name, fn, NULL,		\
+			LUSTRE_OBD_VERSION)
+
+#define TGT_QUOTA_HDL(flags, name, fn)					\
+	TGT_RPC_HANDLER(QUOTA_DQACQ, flags, name, fn, &RQF_ ## name,	\
+			LUSTRE_MDS_VERSION)
+
+/* Sequence service handlers */
+#define TGT_SEQ_HDL(flags, name, fn)					\
+	TGT_RPC_HANDLER(SEQ_QUERY, flags, name, fn, &RQF_ ## name,	\
+			LUSTRE_MDS_VERSION)
+
+/* FID Location Database handlers */
+#define TGT_FLD_HDL_VAR(flags, name, fn)				\
+	TGT_RPC_HANDLER(FLD_QUERY, flags, name, fn, NULL,		\
+			LUSTRE_MDS_VERSION)
+
+/* LFSCK handlers */
+#define TGT_LFSCK_HDL(flags, name, fn)					\
+	TGT_RPC_HANDLER(LFSCK_FIRST_OPC, flags, name, fn,		\
+			&RQF_ ## name, LUSTRE_OBD_VERSION)
+
+/* Request with a format known in advance */
+#define TGT_UPDATE_HDL(flags, name, fn)					\
+	TGT_RPC_HANDLER(OUT_UPDATE, flags, name, fn, &RQF_ ## name,	\
+			LUSTRE_MDS_VERSION)
+
+#endif /* __LUSTRE_LU_TARGET_H */
diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/libiam.h b/drivers/staging/lustrefsx/lustre/include/lustre/libiam.h
new file mode 100644
index 0000000000000..b0f20f7b0a483
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/include/lustre/libiam.h
@@ -0,0 +1,140 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lustre/include/lustre/libiam.h
+ *
+ * iam user level library
+ *
+ * Author: Wang Di
+ * Author: Nikita Danilov
+ * Author: Fan Yong
+ */
+
+/*
+ * lustre/libiam.h
+ */
+
+#ifndef __IAM_ULIB_H__
+#define __IAM_ULIB_H__
+
+/** \defgroup libiam libiam
+ *
+ * @{
+ */
+
+
+#define DX_FMT_NAME_LEN 16
+
+enum iam_fmt_t {
+	FMT_LFIX,
+	FMT_LVAR
+};
+
+struct iam_uapi_info {
+	__u16 iui_keysize;
+	__u16 iui_recsize;
+	__u16 iui_ptrsize;
+	__u16 iui_height;
+	char  iui_fmt_name[DX_FMT_NAME_LEN];
+};
+
+/*
+ * Create an iam file, but do NOT open it.
+ * Return 0 on success, else -1.
+ */
+int iam_creat(char *filename, enum iam_fmt_t fmt,
+	      int blocksize, int keysize, int recsize, int ptrsize);
+
+/*
+ * Open an iam file, but do NOT create it if the file doesn't exist.
+ * Please use iam_creat for creating the file before using iam_open.
+ * Return the file descriptor (fd) on success, else -1.
+ */
+int iam_open(char *filename, struct iam_uapi_info *ua);
+
+/*
+ * Close file opened by iam_open.
+ */
+int iam_close(int fd);
+
+/*
+ * Please use iam_open before using this function.
+ */
+int iam_insert(int fd, struct iam_uapi_info *ua,
+	       int key_need_convert, char *keybuf,
+	       int rec_need_convert, char *recbuf);
+
+/*
+ * Please use iam_open before using this function.
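+ *
+ * A typical call sequence, sketched with a made-up file name and
+ * geometry (error handling omitted; keybuf/recbuf stand for
+ * caller-supplied buffers):
+ *
+ *	struct iam_uapi_info ua;
+ *	int fd;
+ *
+ *	iam_creat("/mnt/lustre/iam0", FMT_LFIX, 4096, 8, 16, 4);
+ *	fd = iam_open("/mnt/lustre/iam0", &ua);
+ *	iam_insert(fd, &ua, 0, keybuf, 0, recbuf);
+ *	iam_close(fd);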
diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/libiam.h b/drivers/staging/lustrefsx/lustre/include/lustre/libiam.h
new file mode 100644
index 0000000000000..b0f20f7b0a483
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/include/lustre/libiam.h
@@ -0,0 +1,140 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lustre/include/lustre/libiam.h
+ *
+ * iam user level library
+ *
+ * Author: Wang Di
+ * Author: Nikita Danilov
+ * Author: Fan Yong
+ */
+
+/*
+ * lustre/libiam.h
+ */
+
+#ifndef __IAM_ULIB_H__
+#define __IAM_ULIB_H__
+
+/** \defgroup libiam libiam
+ *
+ * @{
+ */
+
+
+#define DX_FMT_NAME_LEN 16
+
+enum iam_fmt_t {
+	FMT_LFIX,
+	FMT_LVAR
+};
+
+struct iam_uapi_info {
+	__u16 iui_keysize;
+	__u16 iui_recsize;
+	__u16 iui_ptrsize;
+	__u16 iui_height;
+	char iui_fmt_name[DX_FMT_NAME_LEN];
+};
+
+/*
+ * Create an iam file, but do NOT open it.
+ * Returns 0 on success, else -1.
+ */
+int iam_creat(char *filename, enum iam_fmt_t fmt,
+	      int blocksize, int keysize, int recsize, int ptrsize);
+
+/*
+ * Open an iam file, but do NOT create it if the file doesn't exist.
+ * Use iam_creat() to create the file before opening it with iam_open().
+ * Returns a file descriptor (fd) on success, else -1.
+ */
+int iam_open(char *filename, struct iam_uapi_info *ua);
+
+/*
+ * Close a file opened by iam_open().
+ */
+int iam_close(int fd);
+
+/*
+ * Open the file with iam_open() before using this function.
+ */
+int iam_insert(int fd, struct iam_uapi_info *ua,
+	       int key_need_convert, char *keybuf,
+	       int rec_need_convert, char *recbuf);
+
+/*
+ * Open the file with iam_open() before using this function.
+ */
+int iam_lookup(int fd, struct iam_uapi_info *ua,
+	       int key_need_convert, char *key_buf,
+	       int *keysize, char *save_key,
+	       int rec_need_convert, char *rec_buf,
+	       int *recsize, char *save_rec);
+
+/*
+ * Open the file with iam_open() before using this function.
+ */
+int iam_delete(int fd, struct iam_uapi_info *ua,
+	       int key_need_convert, char *keybuf,
+	       int rec_need_convert, char *recbuf);
+
+/*
+ * Open the file with iam_open() before using this function.
+ */
+int iam_it_start(int fd, struct iam_uapi_info *ua,
+		 int key_need_convert, char *key_buf,
+		 int *keysize, char *save_key,
+		 int rec_need_convert, char *rec_buf,
+		 int *recsize, char *save_rec);
+
+/*
+ * Open the file with iam_open() before using this function.
+ */
+int iam_it_next(int fd, struct iam_uapi_info *ua,
+		int key_need_convert, char *key_buf,
+		int *keysize, char *save_key,
+		int rec_need_convert, char *rec_buf,
+		int *recsize, char *save_rec);
+
+/*
+ * Open the file with iam_open() before using this function.
+ */
+int iam_it_stop(int fd, struct iam_uapi_info *ua,
+		int key_need_convert, char *keybuf,
+		int rec_need_convert, char *recbuf);
+
+/*
+ * Change the iam file mode.
+ */
+int iam_polymorph(char *filename, unsigned long mode);
+
+/** @} libiam */
+
+#endif
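A usage sketch for the libiam calls declared above, assuming the header is reachable as <lustre/libiam.h> and the target path lives on a suitable backing filesystem; the block, key, and record sizes are illustrative:

	#include <lustre/libiam.h>

	/* Illustrative round trip: create a fixed-size-key (LFIX) iam
	 * file, open it, insert one key/record pair, then close it. */
	static int example_iam_roundtrip(char *path)
	{
		struct iam_uapi_info ua;
		char key[8] = "key0001";
		char rec[16] = "record-0001";
		int fd;

		/* 4 KiB blocks, 8-byte keys, 16-byte records,
		 * 4-byte block pointers */
		if (iam_creat(path, FMT_LFIX, 4096, 8, 16, 4) != 0)
			return -1;

		fd = iam_open(path, &ua);
		if (fd < 0)
			return -1;

		/* keys/records passed as-is; no endianness conversion */
		if (iam_insert(fd, &ua, 0, key, 0, rec) != 0) {
			iam_close(fd);
			return -1;
		}

		return iam_close(fd);
	}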
diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/liblustreapi.h b/drivers/staging/lustrefsx/lustre/include/lustre/liblustreapi.h
new file mode 100644
index 0000000000000..2778a34149bfa
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/include/lustre/liblustreapi.h
@@ -0,0 +1,38 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ */
+
+/*
+ * NOTE: This file is DEPRECATED! Please include lustreapi.h directly
+ * instead of this file. This file will be removed from a future version
+ * of lustre!
+ */
+
+#ifndef _LIBLUSTREAPI_H_
+#define _LIBLUSTREAPI_H_
+
+#include <lustre/lustreapi.h>
+#warning "Including liblustreapi.h is deprecated. Include lustreapi.h directly."
+
+#endif
diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/ll_fiemap.h b/drivers/staging/lustrefsx/lustre/include/lustre/ll_fiemap.h
new file mode 100644
index 0000000000000..083187b461269
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/include/lustre/ll_fiemap.h
@@ -0,0 +1,48 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2014, 2015, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lustre/include/lustre/ll_fiemap.h
+ *
+ * FIEMAP data structures and flags. This header file will be used until
+ * fiemap.h is available in the upstream kernel.
+ *
+ * Author: Kalpak Shah
+ * Author: Andreas Dilger
+ */
+
+/*
+ * NOTE: This file is DEPRECATED! Please include linux/lustre/lustre_fiemap.h
+ * directly instead of this file. This file will be removed from a
+ * future version of lustre!
+ */
+
+#include <linux/lustre/lustre_fiemap.h>
+
+#warning "Including ll_fiemap.h is deprecated. Include linux/lustre/lustre_fiemap.h directly."
diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_barrier_user.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_barrier_user.h
new file mode 100644
index 0000000000000..f8489d55a3b44
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_barrier_user.h
@@ -0,0 +1,40 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License version 2 for more details. A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2017, Intel Corporation.
+ *
+ * lustre/include/lustre/lustre_barrier_user.h
+ *
+ * Lustre write barrier (on MDT) userspace interfaces.
+ *
+ * Author: Fan, Yong
+ */
+
+/*
+ * NOTE: This file is DEPRECATED! Please include linux/lustre/lustre_barrier_user.h
+ * directly instead of this file. This file will be removed from a
+ * future version of lustre!
+ */
+
+#include <linux/lustre/lustre_barrier_user.h>
+
+#warning "Including lustre_barrier_user.h is deprecated. Include linux/lustre/lustre_barrier_user.h directly."
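All of these compatibility headers follow the same shape: keep the old include path compiling, forward to the header's new location, and emit a compile-time nudge toward it. The generic idiom, with placeholder paths:

	/* Generic deprecated-wrapper header: forward to the new location
	 * and warn at compile time.  Paths here are placeholders. */
	#include <new/location/of_header.h>
	#warning "Including old_header.h is deprecated. Include new/location/of_header.h directly."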
diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_lfsck_user.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_lfsck_user.h
new file mode 100644
index 0000000000000..7b84426fa2750
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_lfsck_user.h
@@ -0,0 +1,40 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License version 2 for more details. A copy is
+ * included in the COPYING file that accompanied this code.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012, 2017, Intel Corporation.
+ */
+/*
+ * lustre/include/lustre/lustre_lfsck_user.h
+ *
+ * Lustre LFSCK userspace interfaces.
+ *
+ * Author: Fan, Yong
+ */
+
+/*
+ * NOTE: This file is DEPRECATED! Please include linux/lustre/lustre_lfsck_user.h
+ * directly instead of this file. This file will be removed from a
+ * future version of lustre!
+ */
+
+#include <linux/lustre/lustre_lfsck_user.h>
+#warning "Including lustre_lfsck_user.h is deprecated. Include linux/lustre/lustre_lfsck_user.h directly."
diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_user.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_user.h
new file mode 100644
index 0000000000000..81bcf6dc6697e
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_user.h
@@ -0,0 +1,47 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lustre/include/lustre/lustre_user.h
+ *
+ * Lustre public user-space interface definitions.
+ */
+
+/*
+ * NOTE: This file is DEPRECATED! Please include linux/lustre/lustre_user.h
+ * directly instead of this file. This file will be removed from a
+ * future version of lustre!
+ */
+
+#include <linux/lustre/lustre_user.h>
+
+/* Disable warning until 2.16 or 3.0, until new header is widely available.
+ * This gives apps time to move to the new header without spurious warnings.
+#warning "Including lustre/lustre_user.h is deprecated. Include linux/lustre/lustre_user.h instead."
+*/
diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustreapi.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustreapi.h
new file mode 100644
index 0000000000000..02317112226a5
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/include/lustre/lustreapi.h
@@ -0,0 +1,1245 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ */
+
+#ifndef _LUSTREAPI_H_
+#define _LUSTREAPI_H_
+
+/** \defgroup llapi llapi
+ *
+ * @{
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#ifndef LL_MAXQUOTAS
+#define LL_MAXQUOTAS 3
+#endif
+
+#ifndef SEL_UNIT_SIZE
+#define SEL_UNIT_SIZE 1024llu
+#endif
+
+#ifndef LOV_PATTERN_DEFAULT
+#define LOV_PATTERN_DEFAULT 0xffffffff
+#endif
+
+#ifndef fallthrough
+# if defined(__GNUC__) && __GNUC__ >= 7
+#  define fallthrough __attribute__((fallthrough)) /* fallthrough */
+# else
+#  define fallthrough do {} while (0) /* fallthrough */
+# endif
+#endif
+
+typedef struct statx lstatx_t;
+
+#define lustre_fid struct lu_fid
+
+/*
+ * BUILD_BUG_ON() is a compile-time check that verifies correctness at
+ * compile time rather than at runtime. If "cond" is true, (1 - 2*!!(cond))
+ * is negative, giving the array a negative size and causing the compiler
+ * to complain.
+ */
+#ifndef BUILD_BUG_ON
+#define BUILD_BUG_ON(cond) ((void)sizeof(char[1 - 2*!!(cond)]))
+#endif
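Concretely, when the condition is false the macro evaluates sizeof(char[1]) and compiles away; when it is true the array type becomes char[-1] and compilation fails. For example, with the macro defined above:

	/* Compiles: the condition is false, so sizeof(char[1]) is
	 * evaluated and discarded. */
	BUILD_BUG_ON(sizeof(__u16) != 2);

	/* Would fail to compile: the condition is true, so the macro
	 * yields sizeof(char[-1]), an invalid negative-size array. */
	/* BUILD_BUG_ON(sizeof(__u32) != 2); */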
+
+/* Currently external applications can access this but in the
+ * future this will no longer be exposed for the user. Instead
+ * if you want to know if the library is initialized just call
+ * llapi_liblustreapi_initialized() which is now available.
+ */
+extern bool liblustreapi_initialized;
+
+typedef void (*llapi_cb_t)(char *obd_type_name, char *obd_name, char *obd_uuid,
+			   void *args);
+
+/* lustreapi message severity level */
+enum llapi_message_level {
+	LLAPI_MSG_OFF    = 0,
+	LLAPI_MSG_FATAL  = 1,
+	LLAPI_MSG_ERROR  = 2,
+	LLAPI_MSG_WARN   = 3,
+	LLAPI_MSG_NORMAL = 4,
+	LLAPI_MSG_INFO   = 5,
+	LLAPI_MSG_DEBUG  = 6,
+	LLAPI_MSG_MAX
+};
+
+typedef void (*llapi_log_callback_t)(enum llapi_message_level level, int err,
+				     const char *fmt, va_list ap);
+
+static inline bool llapi_liblustreapi_initialized(void)
+{
+	return liblustreapi_initialized;
+}
+
+/* the bottom three bits reserved for llapi_message_level */
+#define LLAPI_MSG_MASK		0x00000007
+#define LLAPI_MSG_NO_ERRNO	0x00000010
+
+static inline const char *llapi_msg_level2str(enum llapi_message_level level)
+{
+	static const char *levels[LLAPI_MSG_MAX] = {"OFF", "FATAL", "ERROR",
+						    "WARNING", "NORMAL",
+						    "INFO", "DEBUG"};
+
+	if (level >= LLAPI_MSG_MAX)
+		return NULL;
+
+	return levels[level];
+}
+
+void llapi_msg_set_level(int level);
+int llapi_msg_get_level(void);
+llapi_log_callback_t llapi_error_callback_set(llapi_log_callback_t cb);
+llapi_log_callback_t llapi_info_callback_set(llapi_log_callback_t cb);
+
+void llapi_error(enum llapi_message_level level, int err, const char *fmt, ...)
+	__attribute__((__format__(__printf__, 3, 4)));
+#define llapi_err_noerrno(level, fmt, a...)			\
+	llapi_error((level) | LLAPI_MSG_NO_ERRNO, 0, fmt, ## a)
+void llapi_printf(enum llapi_message_level level, const char *fmt, ...)
+	__attribute__((__format__(__printf__, 2, 3)));
+
+struct llapi_stripe_param {
+	unsigned long long	lsp_stripe_size;
+	char			*lsp_pool;
+	int			lsp_stripe_offset;
+	int			lsp_stripe_pattern;
+	/* Number of stripes. Size of lsp_osts[] if lsp_is_specific is true. */
+	int			lsp_stripe_count;
+	bool			lsp_is_specific;
+	bool			lsp_is_create;
+	__u8			lsp_max_inherit;
+	__u8			lsp_max_inherit_rr;
+	__u32			lsp_osts[0];
+};
+
+#define lsp_tgts	lsp_osts
+
+enum {
+	LLAPI_MIGRATION_NONBLOCK	= 0x0001,
+	LLAPI_MIGRATION_MIRROR		= 0x0002,
+	LLAPI_MIGRATION_NONDIRECT	= 0x0004,
+	LLAPI_MIGRATION_VERBOSE		= 0x0008,
+};
+
+__u32 llapi_pattern_to_lov(uint64_t pattern);
+
+int llapi_file_open_param(const char *name, int flags, mode_t mode,
+			  const struct llapi_stripe_param *param);
+int llapi_file_is_encrypted(int fd);
+int llapi_file_create_foreign(const char *name, mode_t mode, __u32 type,
+			      __u32 flags, char *foreign_lov);
+int llapi_file_create(const char *name, unsigned long long stripe_size,
+		      int stripe_offset, int stripe_count, int stripe_pattern);
+int llapi_file_open(const char *name, int flags, int mode,
+		    unsigned long long stripe_size, int stripe_offset,
+		    int stripe_count, int stripe_pattern);
+int llapi_file_create_pool(const char *name, unsigned long long stripe_size,
+			   int stripe_offset, int stripe_count,
+			   int stripe_pattern, char *pool_name);
+int llapi_file_open_pool(const char *name, int flags, int mode,
+			 unsigned long long stripe_size, int stripe_offset,
+			 int stripe_count, int stripe_pattern, char *pool_name);
+int llapi_poollist(const char *name);
+int llapi_get_poolbuf(const char *name, char **buf,
+		      char ***poolist, int *poolcount);
+int llapi_get_poollist(const char *name, char **poollist, int list_size,
+		       char *buffer, int buffer_size);
+int llapi_get_poolmembers(const char *poolname, char **members, int list_size,
+			  char *buffer, int buffer_size);
+int llapi_file_get_stripe(const char *path, struct lov_user_md *lum);
+int llapi_file_lookup(int dirfd, const char *name);
+void
llapi_set_command_name(const char *cmd); +void llapi_clear_command_name(void); + +enum llapi_layout_verbose { + VERBOSE_STRIPE_COUNT = 0x1, + VERBOSE_STRIPE_SIZE = 0x2, + VERBOSE_STRIPE_OFFSET = 0x4, + VERBOSE_POOL = 0x8, + VERBOSE_DETAIL = 0x10, + VERBOSE_OBJID = 0x20, + VERBOSE_GENERATION = 0x40, + VERBOSE_MDTINDEX = 0x80, + VERBOSE_PATTERN = 0x100, + VERBOSE_COMP_COUNT = 0x200, + VERBOSE_COMP_FLAGS = 0x400, + VERBOSE_COMP_START = 0x800, + VERBOSE_COMP_END = 0x1000, + VERBOSE_COMP_ID = 0x2000, + VERBOSE_DFID = 0x4000, + VERBOSE_HASH_TYPE = 0x8000, + VERBOSE_MIRROR_COUNT = 0x10000, + VERBOSE_MIRROR_ID = 0x20000, + VERBOSE_EXT_SIZE = 0x40000, + VERBOSE_INHERIT = 0x80000, + VERBOSE_INHERIT_RR = 0x100000, + VERBOSE_DEFAULT = VERBOSE_STRIPE_COUNT | VERBOSE_STRIPE_SIZE | + VERBOSE_STRIPE_OFFSET | VERBOSE_POOL | + VERBOSE_OBJID | VERBOSE_GENERATION | + VERBOSE_PATTERN | VERBOSE_HASH_TYPE | + VERBOSE_COMP_COUNT | VERBOSE_COMP_FLAGS | + VERBOSE_COMP_START | VERBOSE_COMP_END | + VERBOSE_COMP_ID | VERBOSE_MIRROR_COUNT | + VERBOSE_MIRROR_ID | VERBOSE_EXT_SIZE | + VERBOSE_INHERIT | VERBOSE_INHERIT_RR +}; +/* Compatibility with original names */ +#define VERBOSE_SIZE VERBOSE_STRIPE_SIZE +#define VERBOSE_COUNT VERBOSE_STRIPE_COUNT +#define VERBOSE_OFFSET VERBOSE_STRIPE_OFFSET +#define VERBOSE_LAYOUT VERBOSE_PATTERN + +enum { + NEWERXY_ATIME = 0, /* neweraY */ + NEWERXY_MTIME = 1, /* newermY */ + NEWERXY_CTIME = 2, /* newercY */ + NEWERXY_BTIME = 3, /* newerbY | newerBY */ + NEWERXY_MAX, +}; + +enum lfs_find_perm { + LFS_FIND_PERM_EXACT = -2, + LFS_FIND_PERM_ANY = -1, + LFS_FIND_PERM_OFF = 0, + LFS_FIND_PERM_ALL = 1, +}; + +struct find_param { + unsigned int fp_max_depth; + dev_t fp_dev; + mode_t fp_type; /* S_IFIFO,... */ + uid_t fp_uid; + gid_t fp_gid; + mode_t fp_perm; + time_t fp_atime; + time_t fp_mtime; + time_t fp_ctime; + /* {a,m,c,b}sign cannot be bitfields due to using pointers to + * access them during argument parsing. */ + int fp_asign; + int fp_msign; + int fp_csign; + /* these need to be signed values */ + int fp_size_sign:2, + fp_stripe_size_sign:2, + fp_stripe_count_sign:2, + fp_comp_start_sign:2, + fp_comp_end_sign:2, + fp_comp_count_sign:2, + fp_mirror_count_sign:2, + fp_mirror_index_sign:2, + fp_mirror_id_sign:2, + fp_mdt_count_sign:2, + fp_blocks_sign:2, + fp_ext_size_sign:2, + fp_perm_sign:2, + fp_unused2_sign:2, /* Once used we must add */ + fp_unused3_sign:2, /* a separate flag field */ + fp_unused4_sign:2; /* at end of the struct. 
*/ + unsigned long long fp_size; + unsigned long long fp_size_units; + + unsigned long long fp_zero_end:1, + fp_recursive:1, + fp_exclude_pattern:1, + fp_exclude_type:1, + fp_exclude_obd:1, + fp_exclude_mdt:1, + fp_exclude_gid:1, + fp_exclude_uid:1, + fp_check_gid:1, + fp_check_uid:1, + fp_check_pool:1, /* LOV pool name */ + fp_check_size:1, /* file size */ + fp_exclude_pool:1, + fp_exclude_size:1, + fp_exclude_atime:1, + fp_exclude_mtime:1, + fp_exclude_ctime:1, + fp_get_lmv:1, /* get MDT list from LMV */ + fp_raw:1, /* do not fill in defaults */ + fp_check_stripe_size:1, /* LOV stripe size */ + fp_exclude_stripe_size:1, + fp_check_stripe_count:1, /* LOV stripe count */ + fp_exclude_stripe_count:1, + fp_check_layout:1, + fp_exclude_layout:1, + fp_get_default_lmv:1, /* Get default LMV */ + fp_migrate:1, + fp_check_projid:1, + fp_exclude_projid:1, + fp_check_comp_count:1, + fp_exclude_comp_count:1, + fp_check_mirror_count:1, + fp_exclude_mirror_count:1, + fp_check_comp_flags:1, + fp_check_mirror_state:1, + fp_check_comp_start:1, + fp_exclude_comp_start:1, + fp_check_comp_end:1, + fp_exclude_comp_end:1, + fp_check_comp_id:1, + fp_exclude_comp_id:1, + fp_check_mirror_id:1, + fp_exclude_mirror_id:1, + fp_check_mirror_index:1, + fp_exclude_mirror_index:1, + fp_check_mdt_count:1, + fp_exclude_mdt_count:1, + fp_check_hash_flag:1, + fp_exclude_hash_type:1, + fp_yaml:1, /* output layout in YAML */ + fp_check_blocks:1, + fp_exclude_blocks:1, + fp_check_foreign:1, + fp_exclude_foreign:1, + fp_check_ext_size:1, /* extension size */ + fp_exclude_ext_size:1, + fp_lazy:1, + fp_newerxy:1, + fp_exclude_btime:1, + fp_exclude_perm:1, + fp_unused_bit4:1, /* Once all unused fields */ + fp_unused_bit5:1, /* are used we need to add */ + fp_unused_bit6:1, /* a separate flag field at*/ + fp_unused_bit7:1; /* the end of the struct. */ + + enum llapi_layout_verbose fp_verbose; + int fp_quiet; + + /* regular expression */ + char *fp_pattern; + + struct obd_uuid *fp_obd_uuid; + int fp_num_obds; + int fp_num_alloc_obds; + int fp_obd_index; + int *fp_obd_indexes; + + struct obd_uuid *fp_mdt_uuid; + int fp_num_mdts; + int fp_num_alloc_mdts; + int fp_mdt_index; + int *fp_mdt_indexes; + int fp_file_mdt_index; + + size_t fp_lum_size; + struct lov_user_mds_data *fp_lmd; + + char fp_poolname[LOV_MAXPOOLNAME + 1]; + + __u32 fp_lmv_stripe_count; + struct lmv_user_md *fp_lmv_md; + + unsigned long long fp_stripe_size; + unsigned long long fp_stripe_size_units; + unsigned long long fp_stripe_count; + __u32 fp_layout; + + __u32 fp_comp_count; + __u32 fp_mirror_count; + __u32 fp_comp_flags; + __u32 fp_comp_neg_flags; + __u16 fp_mirror_state; + __u16 fp_mirror_neg_state; + __u32 fp_comp_id; + __u16 fp_mirror_id; + __u16 fp_mirror_index; + unsigned long long fp_comp_start; + unsigned long long fp_comp_start_units; + unsigned long long fp_comp_end; + unsigned long long fp_comp_end_units; + unsigned long long fp_mdt_count; + unsigned int fp_projid; + unsigned long long fp_blocks; + unsigned long long fp_blocks_units; + + unsigned long fp_got_uuids:1, + fp_obds_printed:1; + unsigned int fp_depth; + unsigned int fp_hash_type; + unsigned int fp_time_margin; /* time margin in seconds */ + __u32 fp_foreign_type; + unsigned long long fp_ext_size; + unsigned long long fp_ext_size_units; + + /* + * fp_newery[NEWERXY_MAX][0]: --newerXY reference + * fp_newery[NEWERXY_MAX][1]: ! 
-- newerXY reference + */ + time_t fp_newery[NEWERXY_MAX][2]; + + time_t fp_btime; + int fp_bsign; + unsigned int fp_hash_inflags; + unsigned int fp_hash_exflags; + /* Print all information (lfs find only) */ + char *fp_format_printf_str; +}; + +int llapi_ostlist(char *path, struct find_param *param); +int llapi_uuid_match(char *real_uuid, char *search_uuid); +int llapi_getstripe(char *path, struct find_param *param); +int llapi_find(char *path, struct find_param *param); + +int llapi_file_fget_mdtidx(int fd, int *mdtidx); +int llapi_dir_set_default_lmv(const char *name, + const struct llapi_stripe_param *param); +int llapi_dir_set_default_lmv_stripe(const char *name, int stripe_offset, + int stripe_count, int stripe_pattern, + const char *pool_name); +int llapi_dir_create(const char *name, mode_t mode, + const struct llapi_stripe_param *param); +int llapi_dir_create_foreign(const char *name, mode_t mode, __u32 type, + __u32 flags, const char *value); +int llapi_dir_create_pool(const char *name, int flags, int stripe_offset, + int stripe_count, int stripe_pattern, + const char *poolname); +int llapi_direntry_remove(char *dname); +int llapi_unlink_foreign(char *dname); + +int llapi_obd_fstatfs(int fd, __u32 type, __u32 index, + struct obd_statfs *stat_buf, struct obd_uuid *uuid_buf); +int llapi_obd_statfs(char *path, __u32 type, __u32 index, + struct obd_statfs *stat_buf, struct obd_uuid *uuid_buf); +int llapi_ping(char *obd_type, char *obd_name); +int llapi_target_check(int num_types, char **obd_types, char *dir); +int llapi_file_get_lov_uuid(const char *path, struct obd_uuid *lov_uuid); +int llapi_file_get_lmv_uuid(const char *path, struct obd_uuid *lmv_uuid); +int llapi_file_fget_lov_uuid(int fd, struct obd_uuid *lov_uuid); +int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count); +int llapi_lmv_get_uuids(int fd, struct obd_uuid *uuidp, int *mdt_count); +int llapi_is_lustre_mnttype(const char *type); +int llapi_search_tgt(const char *fsname, const char *poolname, + const char *tgtname, bool is_mdt); +int llapi_search_mdt(const char *fsname, const char *poolname, + const char *mdtname); +int llapi_search_ost(const char *fsname, const char *poolname, + const char *ostname); +int llapi_get_obd_count(char *mnt, int *count, int is_mdt); +int llapi_parse_size(const char *optarg, unsigned long long *size, + unsigned long long *size_units, int bytes_spec); +int llapi_search_mounts(const char *pathname, int index, char *mntdir, + char *fsname); +int llapi_search_fsname(const char *pathname, char *fsname); +int llapi_get_fsname_instance(const char *path, char *fsname, size_t fsname_len, + char *instance, size_t instance_len); +int llapi_get_instance(const char *path, char *instance, size_t instance_len); +int llapi_get_fsname(const char *path, char *fsname, size_t fsname_len); +int llapi_getname(const char *path, char *name, size_t namelen); +int llapi_search_fileset(const char *pathname, char *fileset); + +int llapi_search_rootpath(char *pathname, const char *fsname); +int llapi_search_rootpath_by_dev(char *pathname, dev_t dev); +int llapi_nodemap_exists(const char *name); +int llapi_migrate_mdt(char *path, struct find_param *param); +int llapi_mv(char *path, struct find_param *param); + +struct mntent; + +#define HAVE_LLAPI_IS_LUSTRE_MNT +int llapi_is_lustre_mnt(struct mntent *mnt); +int llapi_quotactl(char *mnt, struct if_quotactl *qctl); +int llapi_target_iterate(int type_num, char **obd_type, void *args, + llapi_cb_t cb); +int llapi_get_connect_flags(const char *mnt, __u64 
*flags); +int llapi_cp(int argc, char *argv[]); +int llapi_ls(int argc, char *argv[]); +int llapi_fid_parse(const char *fidstr, struct lu_fid *fid, char **endptr); +int llapi_fid2path_at(int mnt_fd, const struct lu_fid *fid, char *path, + int pathlen, long long *recno, int *linkno); +int llapi_fid2path(const char *device, const char *fidstr, char *path, + int pathlen, long long *recno, int *linkno); +int llapi_path2fid(const char *path, struct lu_fid *fid); +int llapi_get_mdt_index_by_fid(int fd, const struct lu_fid *fid, + int *mdt_index); +int llapi_get_lum_file(const char *path, __u64 *valid, lstatx_t *statx, + struct lov_user_md *lum, size_t lumsize); +int llapi_get_lum_dir(const char *path, __u64 *valid, lstatx_t *statx, + struct lov_user_md *lum, size_t lumsize); +int llapi_get_lum_file_fd(int dir_fd, const char *fname, __u64 *valid, + lstatx_t *statx, struct lov_user_md *lum, + size_t lumsize); +int llapi_get_lum_dir_fd(int dir_fd, __u64 *valid, lstatx_t *statx, + struct lov_user_md *lum, size_t lumsize); + +int llapi_fd2fid(int fd, struct lu_fid *fid); +/* get FID of parent dir + the related name of entry in this parent dir */ +int llapi_path2parent(const char *path, unsigned int linkno, + struct lu_fid *parent_fid, char *name, size_t name_size); +int llapi_fd2parent(int fd, unsigned int linkno, struct lu_fid *parent_fid, + char *name, size_t name_size); +int llapi_rmfid(const char *path, struct fid_array *fa); +int llapi_chomp_string(char *buf); +int llapi_open_by_fid(const char *dir, const struct lu_fid *fid, + int open_flags); +int llapi_get_version_string(char *version, unsigned int version_size); +/* llapi_get_version() is deprecated, use llapi_get_version_string() instead */ +int llapi_get_version(char *buffer, int buffer_size, char **version) + __attribute__((deprecated)); +int llapi_get_data_version(int fd, __u64 *data_version, __u64 flags); +int llapi_file_flush(int fd); +extern int llapi_get_ost_layout_version(int fd, __u32 *layout_version); +int llapi_hsm_state_get_fd(int fd, struct hsm_user_state *hus); +int llapi_hsm_state_get(const char *path, struct hsm_user_state *hus); +int llapi_hsm_state_set_fd(int fd, __u64 setmask, __u64 clearmask, + __u32 archive_id); +int llapi_hsm_state_set(const char *path, __u64 setmask, __u64 clearmask, + __u32 archive_id); +int llapi_hsm_register_event_fifo(const char *path); +int llapi_hsm_unregister_event_fifo(const char *path); +void llapi_hsm_log_error(enum llapi_message_level level, int _rc, + const char *fmt, va_list args); + +int llapi_get_agent_uuid(char *path, char *buf, size_t bufsize); +int llapi_create_volatile_idx(const char *directory, int mdt_idx, + int open_flags); +int llapi_create_volatile_param(const char *directory, int mdt_idx, + int open_flags, mode_t mode, + const struct llapi_stripe_param *stripe_param); + +static inline int llapi_create_volatile(char *directory, int open_flags) +{ + return llapi_create_volatile_idx(directory, -1, open_flags); +} + + +int llapi_fswap_layouts_grouplock(int fd1, int fd2, __u64 dv1, __u64 dv2, + int gid, __u64 flags); +int llapi_fswap_layouts(int fd1, int fd2, __u64 dv1, __u64 dv2, __u64 flags); +int llapi_swap_layouts(const char *path1, const char *path2, __u64 dv1, + __u64 dv2, __u64 flags); + +/* Changelog interface. 
priv is private state, managed internally by these + * functions */ + +/* Records received are in extended format now, though most of them are still + * written in disk in changelog_rec format (to save space and time), it's + * converted to extended format in the lustre api to ease changelog analysis. + */ +#define HAVE_CHANGELOG_EXTEND_REC 1 + +int llapi_changelog_start(void **priv, enum changelog_send_flag flags, + const char *mdtname, long long startrec); +int llapi_changelog_fini(void **priv); +int llapi_changelog_recv(void *priv, struct changelog_rec **rech); +int llapi_changelog_in_buf(void *priv); +int llapi_changelog_free(struct changelog_rec **rech); +int llapi_changelog_get_fd(void *priv); +/* Allow records up to endrec to be destroyed; requires registered id. */ +int llapi_changelog_clear(const char *mdtname, const char *idstr, + long long endrec); +extern int llapi_changelog_set_xflags(void *priv, + enum changelog_send_extra_flag extra_flags); + +/* HSM copytool interface. + * priv is private state, managed internally by these functions + */ +struct hsm_copytool_private; +struct hsm_copyaction_private; + +int llapi_hsm_copytool_register(struct hsm_copytool_private **priv, + const char *mnt, int archive_count, + int *archives, int rfd_flags); +int llapi_hsm_copytool_unregister(struct hsm_copytool_private **priv); +int llapi_hsm_copytool_get_fd(struct hsm_copytool_private *ct); +int llapi_hsm_copytool_recv(struct hsm_copytool_private *priv, + struct hsm_action_list **hal, int *msgsize); +int llapi_hsm_action_begin(struct hsm_copyaction_private **phcp, + const struct hsm_copytool_private *ct, + const struct hsm_action_item *hai, + int restore_mdt_index, int restore_open_flags, + bool is_error); +int llapi_hsm_action_end(struct hsm_copyaction_private **phcp, + const struct hsm_extent *he, int hp_flags, int errval); +int llapi_hsm_action_progress(struct hsm_copyaction_private *hcp, + const struct hsm_extent *he, __u64 total, + int hp_flags); +int llapi_hsm_action_get_dfid(const struct hsm_copyaction_private *hcp, + struct lu_fid *fid); +int llapi_hsm_action_get_fd(const struct hsm_copyaction_private *hcp); +int llapi_hsm_import(const char *dst, int archive, const struct stat *st, + unsigned long long stripe_size, int stripe_offset, + int stripe_count, int stripe_pattern, char *pool_name, + struct lu_fid *newfid); + +/* HSM user interface */ +struct hsm_user_request *llapi_hsm_user_request_alloc(int itemcount, + int data_len); +int llapi_hsm_request(const char *path, const struct hsm_user_request *request); +int llapi_hsm_current_action(const char *path, struct hsm_current_action *hca); + +/* JSON handling */ +enum llapi_json_types { + LLAPI_JSON_INTEGER = 1, + LLAPI_JSON_BIGNUM, + LLAPI_JSON_REAL, + LLAPI_JSON_STRING +}; + +struct llapi_json_item { + char *lji_key; + __u32 lji_type; + union { + int lji_integer; + __u64 lji_u64; + double lji_real; + char *lji_string; + }; + struct llapi_json_item *lji_next; +}; + +struct llapi_json_item_list { + int ljil_item_count; + struct llapi_json_item *ljil_items; +}; + +int llapi_json_init_list(struct llapi_json_item_list **item_list); +int llapi_json_destroy_list(struct llapi_json_item_list **item_list); +int llapi_json_add_item(struct llapi_json_item_list **item_list, char *key, + __u32 type, void *val); +int llapi_json_write_list(struct llapi_json_item_list **item_list, FILE *fp); + +/* File lease */ +int llapi_lease_acquire(int fd, enum ll_lease_mode mode); +int llapi_lease_release(int fd); +int llapi_lease_set(int fd, const struct 
ll_ioc_lease *data); +int llapi_lease_check(int fd); +int llapi_lease_get(int fd, int mode); /* obsoleted */ +int llapi_lease_put(int fd); /* obsoleted */ + +/* Group lock */ +int llapi_group_lock(int fd, int gid); +int llapi_group_unlock(int fd, int gid); +int llapi_group_lock64(int fd, __u64 gid); +int llapi_group_unlock64(int fd, __u64 gid); + +bool llapi_file_is_sparse(int fd); +off_t llapi_data_seek(int src_fd, off_t offset, size_t *length); +int llapi_hole_punch(int fd, off_t start, size_t length); + +/* Ladvise */ +int llapi_ladvise(int fd, unsigned long long flags, int num_advise, + struct llapi_lu_ladvise *ladvise); + +/* PCC */ +int llapi_pcc_attach(const char *path, __u32 id, enum lu_pcc_type type); +int llapi_pcc_attach_fid(const char *mntpath, const struct lu_fid *fid, + __u32 id, enum lu_pcc_type type); +int llapi_pcc_attach_fid_str(const char *mntpath, const char *fidstr, + __u32 id, enum lu_pcc_type type); +int llapi_pcc_detach_fd(int fd, __u32 option); +int llapi_pcc_detach_fid(const char *mntpath, const struct lu_fid *fid, + __u32 option); +int llapi_pcc_detach_fid_str(const char *mntpath, const char *fidstr, + __u32 option); +int llapi_pcc_detach_file(const char *path, __u32 option); +int llapi_pcc_state_get_fd(int fd, struct lu_pcc_state *state); +int llapi_pcc_state_get(const char *path, struct lu_pcc_state *state); +int llapi_pccdev_set(const char *mntpath, const char *cmd); +int llapi_pccdev_get(const char *mntpath); +/** @} llapi */ + +/* llapi_layout user interface */ + +/** + * An array element storing component info to be resynced during mirror + * resynchronization. + */ +struct llapi_resync_comp { + uint64_t lrc_start; + uint64_t lrc_end; + uint32_t lrc_mirror_id; + uint32_t lrc_id; /* component id */ + bool lrc_synced; +}; + +/** Opaque data type abstracting the layout of a Lustre file. */ +struct llapi_layout; + +int llapi_mirror_truncate(int fd, unsigned int id, off_t length); +ssize_t llapi_mirror_write(int fd, unsigned int id, const void *buf, + size_t count, off_t pos); +int llapi_mirror_find(struct llapi_layout *layout, uint64_t file_start, + uint64_t file_end, uint64_t *endp); +int llapi_layout_get_last_init_comp(struct llapi_layout *layout); +int llapi_layout_mirror_inherit(struct llapi_layout *f_layout, + struct llapi_layout *m_layout); +int llapi_mirror_find_stale(struct llapi_layout *layout, + struct llapi_resync_comp *comp, size_t comp_size, + __u16 *mirror_ids, int ids_nr); +int llapi_mirror_resync_many(int fd, struct llapi_layout *layout, + struct llapi_resync_comp *comp_array, + int comp_size, uint64_t start, uint64_t end); +/* + * Flags to control how layouts are retrieved. + */ + +enum llapi_layout_get_flags { + /** Replace non-specified values with expected inherited values. */ + LLAPI_LAYOUT_GET_EXPECTED = 0x0001, + /** Use a temporary buffer to swab and return xattrs. */ + LLAPI_LAYOUT_GET_COPY = 0x0002, + /** Verify xattr contains sane layout values. */ + LLAPI_LAYOUT_GET_CHECK = 0x0004, +}; +/* compatibility macros for old interfaces */ +#define LAYOUT_GET_EXPECTED LLAPI_LAYOUT_GET_EXPECTED +#define LLAPI_LXF_COPY LLAPI_LAYOUT_GET_COPY +#define LLAPI_LXF_CHECK LLAPI_LAYOUT_GET_CHECK + +/** + * Return a pointer to a newly-allocated opaque data structure containing + * the layout for the file at \a path. The pointer should be freed with + * llapi_layout_free() when it is no longer needed. Failure is indicated + * by a NULL return value and an appropriate error code stored in errno. 
+ */ +struct llapi_layout *llapi_layout_get_by_path(const char *path, + enum llapi_layout_get_flags flags); + +/** + * Return a pointer to a newly-allocated opaque data type containing the + * layout for the file referenced by open file descriptor \a fd. The + * pointer should be freed with llapi_layout_free() when it is no longer + * needed. Failure is indicated by a NULL return value and an + * appropriate error code stored in errno. + */ +struct llapi_layout *llapi_layout_get_by_fd(int fd, + enum llapi_layout_get_flags flags); + +/** + * Return a pointer to a newly-allocated opaque data type containing the + * layout for the file associated with Lustre file identifier + * \a fid. The string \a path must name a path within the + * filesystem that contains the file being looked up, such as the + * filesystem root. The returned pointer should be freed with + * llapi_layout_free() when it is no longer needed. Failure is + * indicated with a NULL return value and an appropriate error code + * stored in errno. + */ +struct llapi_layout *llapi_layout_get_by_fid(const char *path, + const struct lu_fid *fid, + enum llapi_layout_get_flags flags); + +/** + * Return a pointer to a newly-allocated opaque data type containing the + * layout for the file associated with extended attribute \a lov_xattr. The + * length of the extended attribute is \a lov_xattr_size. The \a lov_xattr + * should be raw xattr without being swapped, since this function will swap it + * properly. Thus, \a lov_xattr will be modified during the process. If the + * \a LLAPI_LXF_CHECK flag of \a flags is set, this function will check whether + * the objects count in lum is consistent with the stripe count in lum. This + * check only apply to regular file, so \a LLAPI_LAYOUT_GET_CHECK flag should + * be cleared if the xattr belongs to a directory. If the flag \a + * LLAPI_LAYOUT_GET_COPY is set, this function will use a temporary buffer for + * byte swapping when necessary, leaving \a lov_xattr untouched. Otherwise, the + * byte swapping will be done to the \a lov_xattr buffer directly. The returned + * pointer should be freed with llapi_layout_free() when it is no longer + * needed. Failure is * indicated with a NULL return value and an appropriate + * error code stored in errno. + */ +struct llapi_layout *llapi_layout_get_by_xattr(void *lov_xattr, + ssize_t lov_xattr_size, + enum llapi_layout_get_flags flags); + +/** + * Allocate a new layout. Use this when creating a new file with + * llapi_layout_file_create(). + */ +struct llapi_layout *llapi_layout_alloc(void); + +/** + * Free memory allocated for \a layout. + */ +void llapi_layout_free(struct llapi_layout *layout); + +/** + * llapi_layout_merge() - Merge a composite layout into another one. + * @dst_layout: Destination composite layout. + * @src_layout: Source composite layout. + * + * This function copies all of the components from @src_layout and + * appends them to @dst_layout. + * + * Return: 0 on success or -1 on failure. + */ +int llapi_layout_merge(struct llapi_layout **dst_layout, + const struct llapi_layout *src_layout); + +/** Not a valid stripe size, offset, or RAID pattern. */ +#define LLAPI_LAYOUT_INVALID 0x1000000000000001ULL + +/** + * When specified or returned as the value for stripe count, + * stripe size, offset, or RAID pattern, the filesystem-wide + * default behavior will apply. + */ +#define LLAPI_LAYOUT_DEFAULT (LLAPI_LAYOUT_INVALID + 1) + +/** + * When specified or returned as the value for stripe count, all + * available OSTs will be used. 
+ */ +#define LLAPI_LAYOUT_WIDE (LLAPI_LAYOUT_INVALID + 2) + +/** + * When specified as the value for layout pattern, file objects will be + * stored using RAID0. That is, data will be split evenly and without + * redundancy across all OSTs in the layout. + */ +#define LLAPI_LAYOUT_RAID0 0ULL +#define LLAPI_LAYOUT_MDT 2ULL +#define LLAPI_LAYOUT_OVERSTRIPING 4ULL + +/** + * The layout includes a specific set of OSTs on which to allocate. + */ +#define LLAPI_LAYOUT_SPECIFIC 0x2000000000000000ULL + +/** + * A valid ost index should be less than maximum valid OST index (UINT_MAX). + */ +#define LLAPI_LAYOUT_IDX_MAX 0x00000000FFFFFFFFULL + +/** + * Flags to modify how layouts are retrieved. + */ +/******************** Stripe Count ********************/ + +/** + * Store the stripe count of \a layout in \a count. + * + * \retval 0 Success + * \retval -1 Error with status code in errno. + */ +int llapi_layout_stripe_count_get(const struct llapi_layout *layout, + uint64_t *count); + +/** + * Set the stripe count of \a layout to \a count. + * + * \retval 0 Success. + * \retval -1 Invalid argument, errno set to EINVAL. + */ +int llapi_layout_stripe_count_set(struct llapi_layout *layout, uint64_t count); + +/** + * Check if the stripe count \a stripe_count \a is valid. + */ +bool llapi_layout_stripe_count_is_valid(int64_t stripe_count); +/******************** Stripe Size ********************/ + +/** + * Store the stripe size of \a layout in \a size. + * + * \retval 0 Success. + * \retval -1 Invalid argument, errno set to EINVAL. + */ +int llapi_layout_stripe_size_get(const struct llapi_layout *layout, + uint64_t *size); + +/** + * Set the stripe size of \a layout to \a stripe_size. + * + * \retval 0 Success. + * \retval -1 Invalid argument, errno set to EINVAL. + */ +int llapi_layout_stripe_size_set(struct llapi_layout *layout, uint64_t size); + + +/******************** Extension Size ********************/ + +/** + * Store the extension size of \a layout in \a size. + * + * \retval 0 Success. + * \retval -1 Invalid argument, errno set to EINVAL. + */ +int llapi_layout_extension_size_get(const struct llapi_layout *layout, + uint64_t *size); + +/** + * Set the extension size of \a layout to \a stripe_size. + * + * \retval 0 Success. + * \retval -1 Invalid argument, errno set to EINVAL. + */ +int llapi_layout_extension_size_set(struct llapi_layout *layout, uint64_t size); + + +/******************** Stripe Pattern ********************/ + +/** + * Store the stripe pattern of \a layout in \a pattern. + * + * \retval 0 Success. + * \retval -1 Error with status code in errno. + */ +int llapi_layout_pattern_get(const struct llapi_layout *layout, + uint64_t *pattern); + +/** + * Set the stripe pattern of \a layout to \a pattern. + * + * \retval 0 Success. + * \retval -1 Invalid argument, errno set to EINVAL. + */ +int llapi_layout_pattern_set(struct llapi_layout *layout, uint64_t pattern); + +/******************** OST Index ********************/ + +/** + * Store the index of the OST where stripe number \a stripe_number is stored + * in \a index. + * + * An error return value will result from a NULL layout, if \a + * stripe_number is out of range, or if \a layout was not initialized + * with llapi_layout_lookup_by{path,fd,fid}(). + * + * \retval 0 Success + * \retval -1 Invalid argument, errno set to EINVAL. 
+ */ +int llapi_layout_ost_index_get(const struct llapi_layout *layout, + uint64_t stripe_number, uint64_t *index); + +/** + * Set the OST index associated with stripe number \a stripe_number to + * \a ost_index. + * NB: This is currently supported only for \a stripe_number = 0 and + * other usage will return ENOTSUPP in errno. A NULL \a layout or + * out-of-range \a stripe_number will return EINVAL in errno. + * + * \retval 0 Success. + * \retval -1 Error with errno set to non-zero value. + */ +int llapi_layout_ost_index_set(struct llapi_layout *layout, int stripe_number, + uint64_t index); + +/******************** Pool Name ********************/ + +/** + * Store up to \a pool_name_len characters of the name of the pool of + * OSTs associated with \a layout into the buffer pointed to by + * \a pool_name. + * + * The correct calling form is: + * + * llapi_layout_pool_name_get(layout, pool_name, sizeof(pool_name)); + * + * A pool defines a set of OSTs from which file objects may be + * allocated for a file using \a layout. + * + * On success, the number of bytes stored is returned, excluding the + * terminating '\0' character (zero indicates that \a layout does not + * have an associated OST pool). On error, -1 is returned and errno is + * set appropriately. Possible sources of error include a NULL pointer + * argument or insufficient space in \a dest to store the pool name, + * in which cases errno will be set to EINVAL. + * + * \retval 0+ The number of bytes stored in \a dest. + * \retval -1 Invalid argument, errno set to EINVAL. + */ +int llapi_layout_pool_name_get(const struct llapi_layout *layout, + char *pool_name, size_t pool_name_len); + +/** + * Set the name of the pool of OSTs from which file objects will be + * allocated to \a pool_name. + * + * If the pool name uses "fsname.pool" notation to qualify the pool name + * with a filesystem name, the "fsname." portion will be silently + * discarded before storing the value. No validation that \a pool_name + * is an existing non-empty pool in filesystem \a fsname will be + * performed. Such validation can be performed by the application if + * desired using the llapi_search_ost() function. The maximum length of + * the stored value is defined by the constant LOV_MAXPOOLNAME. + * + * \retval 0 Success. + * \retval -1 Invalid argument, errno set to EINVAL. + */ +int llapi_layout_pool_name_set(struct llapi_layout *layout, + const char *pool_name); + +/******************** File Creation ********************/ + +/** + * Open an existing file at \a path, or create it with the specified + * \a layout and \a mode. + * + * One access mode and zero or more file creation flags and file status + * flags May be bitwise-or'd in \a open_flags (see open(2)). Return an + * open file descriptor for the file. If \a layout is non-NULL and + * \a path is not on a Lustre filesystem this function will fail and set + * errno to ENOTTY. + * + * An already existing file may be opened with this function, but + * \a layout and \a mode will not be applied to it. Callers requiring a + * guarantee that the opened file is created with the specified + * \a layout and \a mode should use llapi_layout_file_create(). + * + * A NULL \a layout may be specified, in which case the standard Lustre + * behavior for assigning layouts to newly-created files will apply. + * + * \retval 0+ An open file descriptor. + * \retval -1 Error with status code in errno. 
+ */ +int llapi_layout_file_open(const char *path, int open_flags, mode_t mode, + const struct llapi_layout *layout); + +/** + * Create a new file at \a path with the specified \a layout and \a mode. + * + * One access mode and zero or more file creation flags and file status + * flags May be bitwise-or'd in \a open_flags (see open(2)). Return an + * open file descriptor for the file. If \a layout is non-NULL and + * \a path is not on a Lustre filesystem this function will fail and set + * errno to ENOTTY. + * + * The function call + * + * llapi_layout_file_create(path, open_flags, mode, layout) + * + * shall be equivalent to: + * + * llapi_layout_file_open(path, open_flags|O_CREAT|O_EXCL, mode, layout) + * + * It is an error if \a path specifies an existing file. + * + * A NULL \a layout may be specified, in which the standard Lustre + * behavior for assigning layouts to newly-created files will apply. + * + * \retval 0+ An open file descriptor. + * \retval -1 Error with status code in errno. + */ +int llapi_layout_file_create(const char *path, int open_flags, int mode, + const struct llapi_layout *layout); + +/** + * Set flags to the header of component layout. + */ +int llapi_layout_flags_set(struct llapi_layout *layout, uint32_t flags); +int llapi_layout_flags_get(struct llapi_layout *layout, uint32_t *flags); +const char *llapi_layout_flags_string(uint32_t flags); +__u16 llapi_layout_string_flags(char *string); + +/** + * llapi_layout_mirror_count_get() - Get mirror count from the header of + * a layout. + * @layout: Layout to get mirror count from. + * @count: Returned mirror count value. + * + * This function gets mirror count from the header of a layout. + * + * Return: 0 on success or -1 on failure. + */ +int llapi_layout_mirror_count_get(struct llapi_layout *layout, + uint16_t *count); + +/** + * llapi_layout_mirror_count_set() - Set mirror count to the header of a layout. + * @layout: Layout to set mirror count in. + * @count: Mirror count value to be set. + * + * This function sets mirror count to the header of a layout. + * + * Return: 0 on success or -1 on failure. + */ +int llapi_layout_mirror_count_set(struct llapi_layout *layout, + uint16_t count); + +/** + * Fetch the start and end offset of the current layout component. + */ +int llapi_layout_comp_extent_get(const struct llapi_layout *layout, + uint64_t *start, uint64_t *end); +/** + * Set the extent of current layout component. + */ +int llapi_layout_comp_extent_set(struct llapi_layout *layout, + uint64_t start, uint64_t end); + +/* PFL component flags table */ +static const struct comp_flag_name { + enum lov_comp_md_entry_flags cfn_flag; + const char *cfn_name; +} comp_flags_table[] = { + { LCME_FL_INIT, "init" }, + { LCME_FL_STALE, "stale" }, + { LCME_FL_PREF_RW, "prefer" }, + { LCME_FL_OFFLINE, "offline" }, + { LCME_FL_NOSYNC, "nosync" }, + { LCME_FL_EXTENSION, "extension" }, +}; + +/** + * Gets the attribute flags of the current component. + */ +int llapi_layout_comp_flags_get(const struct llapi_layout *layout, + uint32_t *flags); +/** + * Sets the specified flags of the current component leaving other flags as-is. + */ +int llapi_layout_comp_flags_set(struct llapi_layout *layout, uint32_t flags); +/** + * Clears the flags specified in the flags leaving other flags as-is. + */ +int llapi_layout_comp_flags_clear(struct llapi_layout *layout, uint32_t flags); +/** + * Fetches the file-unique component ID of the current layout component. 
+ */ +int llapi_layout_comp_id_get(const struct llapi_layout *layout, uint32_t *id); +/** + * Fetches the mirror ID of the current layout component. + */ +int llapi_layout_mirror_id_get(const struct llapi_layout *layout, uint32_t *id); +/** + * Adds one component to the existing composite or plain layout. + */ +int llapi_layout_comp_add(struct llapi_layout *layout); +/** + * Adds a first component of a mirror to the existing composite layout. + */ +int llapi_layout_add_first_comp(struct llapi_layout *layout); +/** + * Deletes the current layout component from the composite layout. + */ +int llapi_layout_comp_del(struct llapi_layout *layout); + +enum llapi_layout_comp_use { + LLAPI_LAYOUT_COMP_USE_FIRST = 1, + LLAPI_LAYOUT_COMP_USE_LAST = 2, + LLAPI_LAYOUT_COMP_USE_NEXT = 3, + LLAPI_LAYOUT_COMP_USE_PREV = 4, +}; + +/** + * Set the currently active component to the specified component ID. + */ +int llapi_layout_comp_use_id(struct llapi_layout *layout, uint32_t id); +/** + * Select the currently active component at the specified position. + */ +int llapi_layout_comp_use(struct llapi_layout *layout, uint32_t pos); +/** + * Add layout components to an existing file. + */ +int llapi_layout_file_comp_add(const char *path, + const struct llapi_layout *layout); +/** + * Delete component(s) by the specified component id or flags. + */ +int llapi_layout_file_comp_del(const char *path, uint32_t id, uint32_t flags); +/** + * Change flags or other parameters of the component(s) by component ID of an + * existing file. The component to be modified is specified by the + * comp->lcme_id value, which must be an unique component ID. The new + * attributes are passed in by @comp and @valid is used to specify which + * attributes in the component are going to be changed. + */ +int llapi_layout_file_comp_set(const char *path, uint32_t *ids, uint32_t *flags, + size_t count); +/** + * Check if the file layout is composite. + */ +bool llapi_layout_is_composite(struct llapi_layout *layout); + +enum { + LLAPI_LAYOUT_ITER_CONT = 0, + LLAPI_LAYOUT_ITER_STOP = 1, +}; + +/** + * Iteration callback function. 
+ * + * \retval LLAPI_LAYOUT_ITER_CONT Iteration proceeds + * \retval LLAPI_LAYOUT_ITER_STOP Stop iteration + * \retval < 0 error code + */ +typedef int (*llapi_layout_iter_cb)(struct llapi_layout *layout, void *cbdata); + +/** + * Iterate all components in the corresponding layout + */ +int llapi_layout_comp_iterate(struct llapi_layout *layout, + llapi_layout_iter_cb cb, void *cbdata); + +/** + * FLR: mirror operation APIs + */ +int llapi_mirror_set(int fd, unsigned int id); +int llapi_mirror_clear(int fd); +ssize_t llapi_mirror_read(int fd, unsigned int id, + void *buf, size_t count, off_t pos); +ssize_t llapi_mirror_copy_many(int fd, __u16 src, __u16 *dst, size_t count); +int llapi_mirror_copy(int fd, unsigned int src, unsigned int dst, + off_t pos, size_t count); +off_t llapi_mirror_data_seek(int fd, unsigned int id, off_t pos, size_t *size); +int llapi_mirror_punch(int fd, unsigned int id, off_t start, size_t length); + +int llapi_heat_get(int fd, struct lu_heat *heat); +int llapi_heat_set(int fd, __u64 flags); + +int llapi_layout_sanity(struct llapi_layout *layout, bool incomplete, bool flr); +void llapi_layout_sanity_perror(int error); +int llapi_layout_dom_size(struct llapi_layout *layout, uint64_t *size); + +int llapi_param_get_paths(const char *pattern, glob_t *paths); +int llapi_param_get_value(const char *path, char **buf, size_t *buflen); +void llapi_param_paths_free(glob_t *paths); + +/* MDLL */ +int llapi_dir_open_pool(const char *name, int flags, int mode, + unsigned long long stripe_size, int stripe_offset, + int stripe_count, int stripe_pattern, char *pool_name); + +void llapi_hsm_action_begin_restore_dir(struct hsm_copytool_private *ct); + +/** @} llapi */ + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_acl.h b/drivers/staging/lustrefsx/lustre/include/lustre_acl.h new file mode 100644 index 0000000000000..166e1bd10994a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_acl.h @@ -0,0 +1,51 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/include/lustre_acl.h + */ + +#ifndef _LUSTRE_ACL_H +#define _LUSTRE_ACL_H + +#include +#include +#ifdef CONFIG_LUSTRE_FS_POSIX_ACL +# include +# define LUSTRE_POSIX_ACL_MAX_ENTRIES 32 +# define LUSTRE_POSIX_ACL_MAX_SIZE_OLD \ + (sizeof(posix_acl_xattr_header) + \ + LUSTRE_POSIX_ACL_MAX_ENTRIES * sizeof(posix_acl_xattr_entry)) +#endif /* CONFIG_LUSTRE_FS_POSIX_ACL */ + +#ifndef LUSTRE_POSIX_ACL_MAX_SIZE_OLD +# define LUSTRE_POSIX_ACL_MAX_SIZE_OLD 0 +#endif /* LUSTRE_POSIX_ACL_MAX_SIZE */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_barrier.h b/drivers/staging/lustrefsx/lustre/include/lustre_barrier.h new file mode 100644 index 0000000000000..df6f78bb4b29b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_barrier.h @@ -0,0 +1,44 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, Intel Corporation. + * + * lustre/include/lustre_barrier.h + * + * Lustre write barrier (on MDT) exported functions. + * + * Author: Fan, Yong + */ + +#ifndef _LUSTRE_BARRIER_H +# define _LUSTRE_BARRIER_H + +#include +#include + +bool barrier_entry(struct dt_device *key); +void barrier_exit(struct dt_device *key); +int barrier_handler(struct dt_device *key, struct ptlrpc_request *req); +int barrier_register(struct dt_device *key, struct dt_device *next); +void barrier_deregister(struct dt_device *key); + +#endif /* _LUSTRE_BARRIER_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_compat.h b/drivers/staging/lustrefsx/lustre/include/lustre_compat.h new file mode 100644 index 0000000000000..3b269d4fd1d33 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_compat.h @@ -0,0 +1,637 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. 
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ */
+
+#ifndef _LUSTRE_COMPAT_H
+#define _LUSTRE_COMPAT_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifdef HAVE_4ARGS_VFS_SYMLINK
+#define ll_vfs_symlink(dir, dentry, mnt, path, mode) \
+ vfs_symlink(dir, dentry, path, mode)
+#else
+#define ll_vfs_symlink(dir, dentry, mnt, path, mode) \
+ vfs_symlink(dir, dentry, path)
+#endif
+
+#ifdef HAVE_BVEC_ITER
+#define bio_idx(bio) (bio->bi_iter.bi_idx)
+#define bio_set_sector(bio, sector) (bio->bi_iter.bi_sector = sector)
+#define bvl_to_page(bvl) (bvl->bv_page)
+#else
+#define bio_idx(bio) (bio->bi_idx)
+#define bio_set_sector(bio, sector) (bio->bi_sector = sector)
+#define bio_sectors(bio) ((bio)->bi_size >> 9)
+#define bvl_to_page(bvl) (bvl->bv_page)
+#endif
+
+#ifdef HAVE_BVEC_ITER
+#define bio_start_sector(bio) (bio->bi_iter.bi_sector)
+#else
+#define bio_start_sector(bio) (bio->bi_sector)
+#endif
+
+#ifndef HAVE_DENTRY_D_CHILD
+#define d_child d_u.d_child
+#endif
+
+#ifdef HAVE_DENTRY_D_U_D_ALIAS
+#define d_alias d_u.d_alias
+#endif
+
+#ifndef HAVE_D_IN_LOOKUP
+static inline int d_in_lookup(struct dentry *dentry)
+{
+ return false;
+}
+#endif
+
+#ifndef HAVE_VM_FAULT_T
+#define vm_fault_t int
+#endif
+
+#ifndef HAVE_FOP_ITERATE_SHARED
+#define iterate_shared iterate
+#endif
+
+#ifdef HAVE_OLDSIZE_TRUNCATE_PAGECACHE
+#define ll_truncate_pagecache(inode, size) truncate_pagecache(inode, 0, size)
+#else
+#define ll_truncate_pagecache(inode, size) truncate_pagecache(inode, size)
+#endif
+
+#ifdef HAVE_VFS_RENAME_5ARGS
+#define ll_vfs_rename(a, b, c, d) vfs_rename(a, b, c, d, NULL)
+#elif defined HAVE_VFS_RENAME_6ARGS
+#define ll_vfs_rename(a, b, c, d) vfs_rename(a, b, c, d, NULL, 0)
+#else
+#define ll_vfs_rename(a, b, c, d) vfs_rename(a, b, c, d)
+#endif
+
+#ifdef HAVE_USER_NAMESPACE_ARG
+#define vfs_unlink(ns, dir, de) vfs_unlink(ns, dir, de, NULL)
+#elif defined HAVE_VFS_UNLINK_3ARGS
+#define vfs_unlink(ns, dir, de) vfs_unlink(dir, de, NULL)
+#else
+#define vfs_unlink(ns, dir, de) vfs_unlink(dir, de)
+#endif
+
+static inline int ll_vfs_getattr(struct path *path, struct kstat *st,
+ u32 request_mask, unsigned int flags)
+{
+ int rc;
+
+#if defined(HAVE_USER_NAMESPACE_ARG) || defined(HAVE_INODEOPS_ENHANCED_GETATTR)
+ rc = vfs_getattr(path, st, request_mask, flags);
+#else
+ rc = vfs_getattr(path, st);
+#endif
+ return rc;
+}
+
+#ifndef HAVE_D_IS_POSITIVE
+static inline bool d_is_positive(const struct dentry *dentry)
+{
+ return dentry->d_inode != NULL;
+}
+#endif
+
+#ifndef HAVE_INODE_LOCK
+# define inode_lock(inode) mutex_lock(&(inode)->i_mutex)
+# define inode_unlock(inode) mutex_unlock(&(inode)->i_mutex)
+# define inode_trylock(inode) mutex_trylock(&(inode)->i_mutex)
+#endif
+
+/* Old kernels lacked both Xarray support and the page cache
+ * using Xarrays. Our backported Xarray support introduces
+ * the real xa_is_value() but we need a wrapper as well for
+ * the page cache interaction. Let's keep xa_is_value() separate
+ * in old kernels for Xarray support and page cache handling.
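+ * For example, the shadow entries that reclaim leaves in the page cache
+ * are stored as value entries, so ll_xa_is_value() detects them through
+ * xa_is_value() on new kernels and radix_tree_exceptional_entry() on old
+ * ones.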
+ */ +#ifndef HAVE_XARRAY_SUPPORT +static inline bool ll_xa_is_value(void *entry) +{ + return radix_tree_exceptional_entry(entry); +} +#else +#define ll_xa_is_value xa_is_value +#endif + +#ifndef HAVE_TRUNCATE_INODE_PAGES_FINAL +static inline void truncate_inode_pages_final(struct address_space *map) +{ + truncate_inode_pages(map, 0); +} +#endif + +#ifndef HAVE_PTR_ERR_OR_ZERO +static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr) +{ + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + else + return 0; +} +#endif + +#ifdef HAVE_PID_NS_FOR_CHILDREN +# define ll_task_pid_ns(task) \ + ((task)->nsproxy ? ((task)->nsproxy->pid_ns_for_children) : NULL) +#else +# define ll_task_pid_ns(task) \ + ((task)->nsproxy ? ((task)->nsproxy->pid_ns) : NULL) +#endif + +#ifdef HAVE_FULL_NAME_HASH_3ARGS +# define ll_full_name_hash(salt, name, len) full_name_hash(salt, name, len) +#else +# define ll_full_name_hash(salt, name, len) full_name_hash(name, len) +#endif + +#ifdef HAVE_STRUCT_POSIX_ACL_XATTR +# define posix_acl_xattr_header struct posix_acl_xattr_header +# define posix_acl_xattr_entry struct posix_acl_xattr_entry +# define GET_POSIX_ACL_XATTR_ENTRY(head) ((void *)((head) + 1)) +#else +# define GET_POSIX_ACL_XATTR_ENTRY(head) ((head)->a_entries) +#endif + +#ifdef HAVE_IOP_XATTR +#define ll_setxattr generic_setxattr +#define ll_getxattr generic_getxattr +#define ll_removexattr generic_removexattr +#endif /* HAVE_IOP_XATTR */ + +#ifndef HAVE_POSIX_ACL_VALID_USER_NS +#define posix_acl_valid(a,b) posix_acl_valid(b) +#endif + +#ifdef HAVE_IOP_SET_ACL +#ifdef CONFIG_LUSTRE_FS_POSIX_ACL +#if !defined(HAVE_USER_NAMESPACE_ARG) && !defined(HAVE_POSIX_ACL_UPDATE_MODE) +static inline int posix_acl_update_mode(struct inode *inode, umode_t *mode_p, + struct posix_acl **acl) +{ + umode_t mode = inode->i_mode; + int error; + + error = posix_acl_equiv_mode(*acl, &mode); + if (error < 0) + return error; + if (error == 0) + *acl = NULL; + if (!in_group_p(inode->i_gid) && + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) + mode &= ~S_ISGID; + *mode_p = mode; + return 0; +} +#endif /* HAVE_POSIX_ACL_UPDATE_MODE */ +#endif +#endif + +#ifndef HAVE_IOV_ITER_TRUNCATE +static inline void iov_iter_truncate(struct iov_iter *i, u64 count) +{ + if (i->count > count) + i->count = count; +} +#endif + +/* + * mount MS_* flags split from superblock SB_* flags + * if the SB_* flags are not available use the MS_* flags + */ +#if !defined(SB_RDONLY) && defined(MS_RDONLY) +# define SB_RDONLY MS_RDONLY +#endif +#if !defined(SB_ACTIVE) && defined(MS_ACTIVE) +# define SB_ACTIVE MS_ACTIVE +#endif +#if !defined(SB_NOSEC) && defined(MS_NOSEC) +# define SB_NOSEC MS_NOSEC +#endif +#if !defined(SB_POSIXACL) && defined(MS_POSIXACL) +# define SB_POSIXACL MS_POSIXACL +#endif +#if !defined(SB_NODIRATIME) && defined(MS_NODIRATIME) +# define SB_NODIRATIME MS_NODIRATIME +#endif + +#ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER +static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) +{ + i->count = count; +} + +static inline struct iovec iov_iter_iovec(const struct iov_iter *iter) +{ + return (struct iovec) { + .iov_base = iter->iov->iov_base + iter->iov_offset, + .iov_len = min(iter->count, + iter->iov->iov_len - iter->iov_offset), + }; +} + +#define iov_for_each(iov, iter, start) \ + for (iter = (start); \ + (iter).count && ((iov = iov_iter_iovec(&(iter))), 1); \ + iov_iter_advance(&(iter), (iov).iov_len)) + +static inline ssize_t +generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct iovec iov; + struct 
iov_iter i;
+ ssize_t bytes = 0;
+
+ iov_for_each(iov, i, *iter) {
+ ssize_t res;
+
+ res = generic_file_aio_read(iocb, &iov, 1, iocb->ki_pos);
+ if (res <= 0) {
+ if (bytes == 0)
+ bytes = res;
+ break;
+ }
+
+ bytes += res;
+ if (res < iov.iov_len)
+ break;
+ }
+
+ if (bytes > 0)
+ iov_iter_advance(iter, bytes);
+ return bytes;
+}
+
+static inline ssize_t
+__generic_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct iovec iov;
+ struct iov_iter i;
+ ssize_t bytes = 0;
+
+ /* Since LLITE updates the file size at the end of I/O in
+ * vvp_io_commit_write(), an append write has to be done atomically when
+ * there are multiple segments, because otherwise each iteration of
+ * __generic_file_aio_write() would see the original file size */
+ if (unlikely(iocb->ki_filp->f_flags & O_APPEND && iter->nr_segs > 1)) {
+ struct iovec *iov_copy;
+ int count = 0;
+
+ OBD_ALLOC_PTR_ARRAY(iov_copy, iter->nr_segs);
+ if (!iov_copy)
+ return -ENOMEM;
+
+ iov_for_each(iov, i, *iter)
+ iov_copy[count++] = iov;
+
+ bytes = __generic_file_aio_write(iocb, iov_copy, count,
+ &iocb->ki_pos);
+ OBD_FREE_PTR_ARRAY(iov_copy, iter->nr_segs);
+
+ if (bytes > 0)
+ iov_iter_advance(iter, bytes);
+ return bytes;
+ }
+
+ iov_for_each(iov, i, *iter) {
+ ssize_t res;
+
+ res = __generic_file_aio_write(iocb, &iov, 1, &iocb->ki_pos);
+ if (res <= 0) {
+ if (bytes == 0)
+ bytes = res;
+ break;
+ }
+
+ bytes += res;
+ if (res < iov.iov_len)
+ break;
+ }
+
+ if (bytes > 0)
+ iov_iter_advance(iter, bytes);
+ return bytes;
+}
+#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
+
+static inline void __user *get_vmf_address(struct vm_fault *vmf)
+{
+#ifdef HAVE_VM_FAULT_ADDRESS
+ return (void __user *)vmf->address;
+#else
+ return vmf->virtual_address;
+#endif
+}
+
+#ifdef HAVE_VM_OPS_USE_VM_FAULT_ONLY
+# define __ll_filemap_fault(vma, vmf) filemap_fault(vmf)
+#else
+# define __ll_filemap_fault(vma, vmf) filemap_fault(vma, vmf)
+#endif
+
+#ifndef HAVE_CURRENT_TIME
+static inline struct timespec current_time(struct inode *inode)
+{
+ return CURRENT_TIME;
+}
+#endif
+
+#ifndef time_after32
+/**
+ * time_after32 - compare two 32-bit relative times
+ * @a: the time which may be after @b
+ * @b: the time which may be before @a
+ *
+ * Needed for kernels earlier than v4.14-rc1~134^2
+ *
+ * time_after32(a, b) returns true if the time @a is after time @b.
+ * time_before32(b, a) returns true if the time @b is before time @a.
+ *
+ * Similar to time_after(), compare two 32-bit timestamps for relative
+ * times. This is useful for comparing 32-bit seconds values that can't
+ * be converted to 64-bit values (e.g. due to disk format or wire protocol
+ * issues) when it is known that the times are less than 68 years apart.
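+ *
+ * For example, just after the 32-bit clock wraps, time_after32(1, 0xffffffff)
+ * computes (s32)((u32)0xffffffff - (u32)1) == -2, which is negative, so
+ * timestamp 1 is correctly treated as being after timestamp 0xffffffff.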
+ */ +#define time_after32(a, b) ((s32)((u32)(b) - (u32)(a)) < 0) +#define time_before32(b, a) time_after32(a, b) + +#endif + +#ifndef smp_store_mb +#define smp_store_mb(var, value) set_mb(var, value) +#endif + +#ifdef HAVE_PAGEVEC_INIT_ONE_PARAM +#define ll_pagevec_init(pvec, n) pagevec_init(pvec) +#else +#define ll_pagevec_init(pvec, n) pagevec_init(pvec, n) +#endif + +#ifdef HAVE_D_COUNT +# define ll_d_count(d) d_count(d) +#else +# define ll_d_count(d) ((d)->d_count) +#endif /* HAVE_D_COUNT */ + +#ifndef HAVE_IN_COMPAT_SYSCALL +#define in_compat_syscall is_compat_task +#endif + +#ifdef HAVE_I_PAGES +#define page_tree i_pages +#define ll_xa_lock_irqsave(lockp, flags) xa_lock_irqsave(lockp, flags) +#define ll_xa_unlock_irqrestore(lockp, flags) xa_unlock_irqrestore(lockp, flags) +#else +#define i_pages tree_lock +#define ll_xa_lock_irqsave(lockp, flags) spin_lock_irqsave(lockp, flags) +#define ll_xa_unlock_irqrestore(lockp, flags) spin_unlock_irqrestore(lockp, flags) +#endif + +/* Linux commit v5.15-12273-gab2f9d2d3626 + * mm: unexport {,un}lock_page_memcg + * + * Note that the functions are still defined or declared breaking + * the simple approach of just defining the missing functions here + */ +#ifdef HAVE_LOCK_PAGE_MEMCG +#define vvp_lock_page_memcg(page) lock_page_memcg((page)) +#define vvp_unlock_page_memcg(page) unlock_page_memcg((page)) +#else +#define vvp_lock_page_memcg(page) +#define vvp_unlock_page_memcg(page) +#endif + +#ifndef KMEM_CACHE_USERCOPY +#define kmem_cache_create_usercopy(name, size, align, flags, useroffset, \ + usersize, ctor) \ + kmem_cache_create(name, size, align, flags, ctor) +#endif + +static inline bool ll_security_xattr_wanted(struct inode *in) +{ +#ifdef CONFIG_SECURITY + return in->i_security && in->i_sb->s_security; +#else + return false; +#endif +} + +static inline int ll_vfs_getxattr(struct dentry *dentry, struct inode *inode, + const char *name, + void *value, size_t size) +{ +#ifdef HAVE_USER_NAMESPACE_ARG + return vfs_getxattr(&init_user_ns, dentry, name, value, size); +#elif defined(HAVE_VFS_SETXATTR) + return __vfs_getxattr(dentry, inode, name, value, size); +#else + if (unlikely(!inode->i_op->getxattr)) + return -ENODATA; + + return inode->i_op->getxattr(dentry, name, value, size); +#endif +} + +static inline int ll_vfs_setxattr(struct dentry *dentry, struct inode *inode, + const char *name, + const void *value, size_t size, int flags) +{ +#ifdef HAVE_USER_NAMESPACE_ARG + return vfs_setxattr(&init_user_ns, dentry, name, + VFS_SETXATTR_VALUE(value), size, flags); +#elif defined(HAVE_VFS_SETXATTR) + return __vfs_setxattr(dentry, inode, name, value, size, flags); +#else + if (unlikely(!inode->i_op->setxattr)) + return -EOPNOTSUPP; + + return inode->i_op->setxattr(dentry, name, value, size, flags); +#endif +} + +static inline int ll_vfs_removexattr(struct dentry *dentry, struct inode *inode, + const char *name) +{ +#ifdef HAVE_USER_NAMESPACE_ARG + return vfs_removexattr(&init_user_ns, dentry, name); +#elif defined(HAVE_VFS_SETXATTR) + return __vfs_removexattr(dentry, name); +#else + if (unlikely(!inode->i_op->setxattr)) + return -EOPNOTSUPP; + + return inode->i_op->removexattr(dentry, name); +#endif +} + +#ifndef FALLOC_FL_COLLAPSE_RANGE +#define FALLOC_FL_COLLAPSE_RANGE 0x08 /* remove a range of a file */ +#endif + +#ifndef FALLOC_FL_ZERO_RANGE +#define FALLOC_FL_ZERO_RANGE 0x10 /* convert range to zeros */ +#endif + +#ifndef FALLOC_FL_INSERT_RANGE +#define FALLOC_FL_INSERT_RANGE 0x20 /* insert space within file */ +#endif + +#ifndef raw_cpu_ptr 
+#define raw_cpu_ptr(p) __this_cpu_ptr(p) +#endif + +#ifndef HAVE_IS_ROOT_INODE +static inline bool is_root_inode(struct inode *inode) +{ + return inode == inode->i_sb->s_root->d_inode; +} +#endif + +#ifndef HAVE_IOV_ITER_GET_PAGES_ALLOC2 +#define iov_iter_get_pages_alloc2(i, p, m, s) \ + iov_iter_get_pages_alloc((i), (p), (m), (s)) +#endif + +#ifdef HAVE_AOPS_MIGRATE_FOLIO +#define folio_migr folio +#else +#define folio_migr page +#define migrate_folio migratepage +#endif + +#ifdef HAVE_REGISTER_SHRINKER_FORMAT_NAMED +#define register_shrinker(_s) register_shrinker((_s), "%ps", (_s)) +#elif !defined(HAVE_REGISTER_SHRINKER_RET) +#define register_shrinker(_s) (register_shrinker(_s), 0) +#endif + +#ifndef fallthrough +# if defined(__GNUC__) && __GNUC__ >= 7 +# define fallthrough __attribute__((fallthrough)) /* fallthrough */ +# else +# define fallthrough do {} while (0) /* fallthrough */ +# endif +#endif + +#ifdef HAVE_SEC_RELEASE_SECCTX_1ARG +#ifndef HAVE_LSMCONTEXT_INIT +/* Ubuntu 5.19 */ +static inline void lsmcontext_init(struct lsmcontext *cp, char *context, + u32 size, int slot) +{ + cp->slot = slot; + cp->context = context; + cp->len = size; +} +#endif +#endif + +static inline void ll_security_release_secctx(char *secdata, u32 seclen, + int slot) +{ +#ifdef HAVE_SEC_RELEASE_SECCTX_1ARG + struct lsmcontext context = { }; + + lsmcontext_init(&context, secdata, seclen, slot); + return security_release_secctx(&context); +#else + return security_release_secctx(secdata, seclen); +#endif +} + +#ifndef HAVE_USER_NAMESPACE_ARG +#define posix_acl_update_mode(ns, inode, mode, acl) \ + posix_acl_update_mode(inode, mode, acl) +#define notify_change(ns, de, attr, inode) notify_change(de, attr, inode) +#define inode_owner_or_capable(ns, inode) inode_owner_or_capable(inode) +#define vfs_create(ns, dir, de, mode, ex) vfs_create(dir, de, mode, ex) +#define vfs_mkdir(ns, dir, de, mode) vfs_mkdir(dir, de, mode) +#define ll_set_acl(ns, inode, acl, type) ll_set_acl(inode, acl, type) +#endif + +/** + * delete_from_page_cache is not exported anymore + */ +#ifdef HAVE_DELETE_FROM_PAGE_CACHE +#define cfs_delete_from_page_cache(page) delete_from_page_cache((page)) +#else +static inline void cfs_delete_from_page_cache(struct page *page) +{ + if (!page->mapping) + return; + LASSERT(PageLocked(page)); + get_page(page); + unlock_page(page); + /* on entry page is locked */ + if (S_ISREG(page->mapping->host->i_mode)) { + generic_error_remove_page(page->mapping, page); + } else { + loff_t lstart = page->index << PAGE_SHIFT; + loff_t lend = lstart + PAGE_SIZE - 1; + + truncate_inode_pages_range(page->mapping, lstart, lend); + } + lock_page(page); + put_page(page); +} +#endif + +static inline struct page *ll_read_cache_page(struct address_space *mapping, + pgoff_t index, filler_t *filler, + void *data) +{ +#ifdef HAVE_READ_CACHE_PAGE_WANTS_FILE + struct file dummy_file; + + dummy_file.f_ra.ra_pages = 32; /* unused, modified on ra error */ + dummy_file.private_data = data; + return read_cache_page(mapping, index, filler, &dummy_file); +#else + return read_cache_page(mapping, index, filler, data); +#endif /* HAVE_READ_CACHE_PAGE_WANTS_FILE */ +} + +#endif /* _LUSTRE_COMPAT_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_crypto.h b/drivers/staging/lustrefsx/lustre/include/lustre_crypto.h new file mode 100644 index 0000000000000..d048470691a4b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_crypto.h @@ -0,0 +1,230 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT 
NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2019, 2020, Whamcloud. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef _LUSTRE_CRYPTO_H_ +#define _LUSTRE_CRYPTO_H_ + +#if defined(HAVE_LUSTRE_CRYPTO) && !defined(CONFIG_LL_ENCRYPTION) +#define __FS_HAS_ENCRYPTION 1 +#include + +#define LL_CRYPTO_BLOCK_SIZE FS_CRYPTO_BLOCK_SIZE +#define llcrypt_name fscrypt_name +#define llcrypt_str fscrypt_str +#define LLTR_INIT FSTR_INIT +#define llcrypt_operations fscrypt_operations +#define llcrypt_symlink_data fscrypt_symlink_data +#define llcrypt_init() 0 +#define llcrypt_exit() {} +#ifndef HAVE_FSCRYPT_DUMMY_CONTEXT_ENABLED +#define llcrypt_context fscrypt_context +#define llcrypt_dummy_context fscrypt_dummy_context +#endif +#define llcrypt_require_key(inode) \ + fscrypt_require_key(inode) +#define llcrypt_has_encryption_key(inode) fscrypt_has_encryption_key(inode) +#define llcrypt_encrypt_pagecache_blocks(page, len, offs, gfp_flags) \ + fscrypt_encrypt_pagecache_blocks(page, len, offs, gfp_flags) +#define llcrypt_decrypt_pagecache_blocks(page, len, offs) \ + fscrypt_decrypt_pagecache_blocks(page, len, offs) +#define llcrypt_decrypt_block_inplace(inode, page, len, offs, lblk_num) \ + fscrypt_decrypt_block_inplace(inode, page, len, offs, lblk_num) +#define llcrypt_inherit_context(parent, child, fs_data, preload) \ + fscrypt_inherit_context(parent, child, fs_data, preload) +#define llcrypt_get_encryption_info(inode) fscrypt_get_encryption_info(inode) +#define llcrypt_put_encryption_info(inode) fscrypt_put_encryption_info(inode) +#define llcrypt_free_inode(inode) fscrypt_free_inode(inode) +#define llcrypt_finalize_bounce_page(pagep) fscrypt_finalize_bounce_page(pagep) +#define llcrypt_file_open(inode, filp) fscrypt_file_open(inode, filp) +#define llcrypt_ioctl_set_policy(filp, arg) fscrypt_ioctl_set_policy(filp, arg) +#define llcrypt_ioctl_get_policy_ex(filp, arg) \ + fscrypt_ioctl_get_policy_ex(filp, arg) +#define llcrypt_policy_has_filename_enc(inode) true +#define llcrypt_ioctl_add_key(filp, arg) fscrypt_ioctl_add_key(filp, arg) +#define llcrypt_ioctl_remove_key(filp, arg) fscrypt_ioctl_remove_key(filp, arg) +#define llcrypt_ioctl_remove_key_all_users(filp, arg) \ + fscrypt_ioctl_remove_key_all_users(filp, arg) +#define llcrypt_ioctl_get_key_status(filp, arg) \ + fscrypt_ioctl_get_key_status(filp, arg) +#define llcrypt_drop_inode(inode) fscrypt_drop_inode(inode) +#define llcrypt_prepare_rename(olddir, olddentry, newdir, newdentry, flags) \ + fscrypt_prepare_rename(olddir, olddentry, newdir, newdentry, flags) +#define llcrypt_prepare_link(old_dentry, dir, dentry) \ + fscrypt_prepare_link(old_dentry, dir, dentry) +#define llcrypt_prepare_setattr(dentry, attr) \ + fscrypt_prepare_setattr(dentry, attr) +#define __llcrypt_prepare_lookup(inode, dentry, 
fname) \ + __fscrypt_prepare_lookup(inode, dentry, fname) +#define llcrypt_set_ops(sb, cop) fscrypt_set_ops(sb, cop) +#define llcrypt_sb_free(sb) {} +#define llcrypt_fname_alloc_buffer(inode, max_encrypted_len, crypto_str) \ + fscrypt_fname_alloc_buffer(inode, max_encrypted_len, crypto_str) +#define llcrypt_fname_disk_to_usr(inode, hash, minor_hash, iname, oname) \ + fscrypt_fname_disk_to_usr(inode, hash, minor_hash, iname, oname) +#define llcrypt_fname_free_buffer(crypto_str) \ + fscrypt_fname_free_buffer(crypto_str) +#define llcrypt_setup_filename(dir, iname, lookup, fname) \ + fscrypt_setup_filename(dir, iname, lookup, fname) +#define llcrypt_free_filename(fname) \ + fscrypt_free_filename(fname) +#define llcrypt_match_name(fname, de_name, name_len) \ + fscrypt_match_name(fname, de_name, name_len) +#define llcrypt_prepare_lookup(dir, dentry, fname) \ + fscrypt_prepare_lookup(dir, dentry, fname) +#define llcrypt_encrypt_symlink(inode, target, len, disk_link) \ + fscrypt_encrypt_symlink(inode, target, len, disk_link) +#define __llcrypt_encrypt_symlink(inode, target, len, disk_link) \ + __fscrypt_encrypt_symlink(inode, target, len, disk_link) +#define llcrypt_prepare_symlink(dir, target, len, max_len, disk_link) \ + fscrypt_prepare_symlink(dir, target, len, max_len, disk_link) +#define llcrypt_get_symlink(inode, caddr, max_size, done) \ + fscrypt_get_symlink(inode, caddr, max_size, done) + +#define LL_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY +#define LL_IOC_GET_ENCRYPTION_POLICY_EX FS_IOC_GET_ENCRYPTION_POLICY_EX +#define LL_IOC_ADD_ENCRYPTION_KEY FS_IOC_ADD_ENCRYPTION_KEY +#define LL_IOC_REMOVE_ENCRYPTION_KEY FS_IOC_REMOVE_ENCRYPTION_KEY +#define LL_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS \ + FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS +#define LL_IOC_GET_ENCRYPTION_KEY_STATUS FS_IOC_GET_ENCRYPTION_KEY_STATUS + +#else /* HAVE_LUSTRE_CRYPTO && !CONFIG_LL_ENCRYPTION */ +#include +#endif /* !HAVE_LUSTRE_CRYPTO || CONFIG_LL_ENCRYPTION */ + +#ifndef DCACHE_NOKEY_NAME +#define DCACHE_NOKEY_NAME 0x02000000 /* Enc name without key */ +#endif + +#if !defined(HAVE_FSCRYPT_IS_NOKEY_NAME) || defined(CONFIG_LL_ENCRYPTION) + +static inline bool llcrypt_is_nokey_name(const struct dentry *dentry) +{ + return dentry->d_flags & DCACHE_NOKEY_NAME; +} +#else +#define llcrypt_is_nokey_name(dentry) \ + fscrypt_is_nokey_name(dentry) +#endif + +#if defined(HAVE_LUSTRE_CRYPTO) && !defined(HAVE_FSCRYPT_DUMMY_CONTEXT_ENABLED) +#define llcrypt_show_test_dummy_encryption(seq, sep, sb) \ + fscrypt_show_test_dummy_encryption(seq, sep, sb) +#define llcrypt_set_test_dummy_encryption(sb, arg, ctx) \ + fscrypt_set_test_dummy_encryption(sb, arg, ctx) +#define llcrypt_free_dummy_context(ctx) \ + fscrypt_free_dummy_context(ctx) +#else +#define llcrypt_show_test_dummy_encryption(seq, sep, sb) {} +#define llcrypt_free_dummy_context(ctx) {} +#endif + +/* Macro to extract digest from Lustre specific structures */ +#if defined(HAVE_FSCRYPT_DIGESTED_NAME) && !defined(CONFIG_LL_ENCRYPTION) +#define LLCRYPT_EXTRACT_DIGEST FSCRYPT_FNAME_DIGEST +#else +#define LLCRYPT_EXTRACT_DIGEST(name, len) \ + ((name) + round_down((len) - LL_CRYPTO_BLOCK_SIZE - 1, \ + LL_CRYPTO_BLOCK_SIZE)) +#endif + +struct ll_sb_info; +int ll_set_encflags(struct inode *inode, void *encctx, __u32 encctxlen, + bool preload); +void llcrypt_free_ctx(void *encctx, __u32 size); +bool ll_sb_has_test_dummy_encryption(struct super_block *sb); +bool ll_sbi_has_encrypt(struct ll_sb_info *sbi); +void ll_sbi_set_encrypt(struct ll_sb_info *sbi, bool set); +bool 
ll_sbi_has_name_encrypt(struct ll_sb_info *sbi);
+void ll_sbi_set_name_encrypt(struct ll_sb_info *sbi, bool set);
+/* sizeof(struct fscrypt_context_v2) = 40 */
+#define LLCRYPT_ENC_CTX_SIZE 40
+
+/* Encoding/decoding routines inspired from yEnc principles.
+ * We just take care of a few critical characters:
+ * NULL, LF, CR, /, DEL and =.
+ * If such a char is found, it is replaced with '=' followed by
+ * the char value + 64.
+ * All other chars are left untouched.
+ * For example, the two input bytes 0x0a 0x41 ("\nA") encode to the three
+ * chars '=' 'J' 'A', because 0x0a + 64 == 0x4a == 'J' while 'A' passes
+ * through untouched.
+ * Efficiency of this encoding depends on the occurrences of the
+ * critical chars, but statistically on binary data it can be much higher
+ * than base64 for instance.
+ */
+static inline int critical_encode(const u8 *src, int len, char *dst)
+{
+ u8 *p = (u8 *)src, *q = dst;
+
+ while (p - src < len) {
+ /* escape NULL, LF, CR, /, DEL and = */
+ if (unlikely(*p == 0x0 || *p == 0xA || *p == 0xD ||
+ *p == '/' || *p == 0x7F || *p == '=')) {
+ *(q++) = '=';
+ *(q++) = *(p++) + 64;
+ } else {
+ *(q++) = *(p++);
+ }
+ }
+
+ return (char *)q - dst;
+}
+
+/* returns the number of chars encoding would produce */
+static inline int critical_chars(const u8 *src, int len)
+{
+ u8 *p = (u8 *)src;
+ int newlen = len;
+
+ while (p - src < len) {
+ /* NULL, LF, CR, /, DEL and = cost an additional '=' */
+ if (unlikely(*p == 0x0 || *p == 0xA || *p == 0xD ||
+ *p == '/' || *p == 0x7F || *p == '='))
+ newlen++;
+ p++;
+ }
+
+ return newlen;
+}
+
+/* decoding routine - returns the number of chars in output */
+static inline int critical_decode(const u8 *src, int len, char *dst)
+{
+ u8 *p = (u8 *)src, *q = dst;
+
+ while (p - src < len) {
+ if (unlikely(*p == '=')) {
+ *(q++) = *(++p) - 64;
+ p++;
+ } else {
+ *(q++) = *(p++);
+ }
+ }
+
+ return (char *)q - dst;
+}
+
+#endif /* _LUSTRE_CRYPTO_H_ */
diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_disk.h b/drivers/staging/lustrefsx/lustre/include/lustre_disk.h
new file mode 100644
index 0000000000000..23fe796728c8f
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/include/lustre_disk.h
@@ -0,0 +1,383 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lustre/include/lustre_disk.h
+ *
+ * Lustre disk format definitions.
+ *
+ * Author: Nathan Rutman
+ */
+
+#ifndef _LUSTRE_DISK_H
+#define _LUSTRE_DISK_H
+
+/** \defgroup disk disk
+ *
+ * @{
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#if !defined(CONFIG_LL_ENCRYPTION) && defined(HAVE_LUSTRE_CRYPTO)
+#include
+#endif
+
+#define IS_MDT(data) ((data)->lsi_flags & LDD_F_SV_TYPE_MDT)
+#define IS_OST(data) ((data)->lsi_flags & LDD_F_SV_TYPE_OST)
+#define IS_MGS(data) ((data)->lsi_flags & LDD_F_SV_TYPE_MGS)
+#define IS_SERVER(data) ((data)->lsi_flags & (LDD_F_SV_TYPE_MGS | \
+ LDD_F_SV_TYPE_MDT | \
+ LDD_F_SV_TYPE_OST))
+#define MT_STR(data) mt_str((data)->ldd_mount_type)
+
+/****************** mount command *********************/
+
+/* The lmd is only used internally by Lustre; mount simply passes
+ * everything as string options
+ */
+#define LMD_MAGIC 0xbdacbd03
+#define LMD_PARAMS_MAXLEN 4096
+
+/* gleaned from the mount command - no persistent info here */
+struct lustre_mount_data {
+ u32 lmd_magic;
+ u32 lmd_flags; /* lustre mount flags */
+ int lmd_mgs_failnodes; /* mgs failover node count */
+ int lmd_exclude_count;
+ int lmd_recovery_time_soft;
+ int lmd_recovery_time_hard;
+ char *lmd_dev; /* device name */
+ char *lmd_profile; /* client only */
+ char *lmd_fileset; /* mount fileset */
+ char *lmd_mgssec; /* sptlrpc flavor to mgs */
+ char *lmd_opts; /* lustre mount options (as opposed to
+ * device_ mount options) */
+ char *lmd_params; /* lustre params */
+ u32 *lmd_exclude; /* array of OSTs to ignore */
+ char *lmd_mgs; /* MGS nid */
+ char *lmd_osd_type; /* OSD type */
+ char *lmd_nidnet; /* network to restrict this client to */
+};
+
+#define LMD_FLG_SERVER 0x0001 /* Mounting a server */
+#define LMD_FLG_CLIENT 0x0002 /* Mounting a client */
+#define LMD_FLG_SKIP_LFSCK 0x0004 /* NOT auto resume LFSCK when mount */
+#define LMD_FLG_ABORT_RECOV 0x0008 /* Abort recovery */
+#define LMD_FLG_NOSVC 0x0010 /* Only start MGS/MGC for servers,
+ no other services */
+#define LMD_FLG_NOMGS 0x0020 /* Only start target for servers, reusing
+ existing MGS services */
+#define LMD_FLG_WRITECONF 0x0040 /* Rewrite config log */
+#define LMD_FLG_NOIR 0x0080 /* NO imperative recovery */
+#define LMD_FLG_NOSCRUB 0x0100 /* Do not trigger scrub automatically */
+#define LMD_FLG_MGS 0x0200 /* Also start MGS along with server */
+#define LMD_FLG_IAM 0x0400 /* IAM dir */
+#define LMD_FLG_NO_PRIMNODE 0x0800 /* all nodes are service nodes */
+#define LMD_FLG_VIRGIN 0x1000 /* the service registers first time */
+#define LMD_FLG_UPDATE 0x2000 /* update parameters */
+#define LMD_FLG_HSM 0x4000 /* Start coordinator */
+#define LMD_FLG_DEV_RDONLY 0x8000 /* discard modification quietly */
+#define LMD_FLG_NO_PRECREATE 0x10000 /* do not allow OST object creation */
+#define LMD_FLG_LOCAL_RECOV 0x20000 /* force recovery for local clients */
+#define LMD_FLG_ABORT_RECOV_MDT 0x40000 /* Abort recovery between MDTs */
+#define LMD_FLG_NO_LOCAL_LOGS 0x80000 /* Use config logs from MGS */
+
+#define lmd_is_client(x) ((x)->lmd_flags & LMD_FLG_CLIENT)
+
+/****************** superblock additional info *********************/
+struct ll_sb_info;
+struct kobject;
+
+struct lustre_sb_info {
+ int lsi_flags;
+ struct obd_device *lsi_mgc; /* mgc obd */
+ struct lustre_mount_data *lsi_lmd; /* mount command info */
+ struct ll_sb_info *lsi_llsbi; /* add'l client sbi info */
+ struct dt_device *lsi_dt_dev; /* dt device to access disk fs*/
+ atomic_t lsi_mounts; /* references to the srv_mnt */
+ struct kobject *lsi_kobj;
+ char lsi_svname[MTI_NAME_MAXLEN];
+ /*
lsi_osd_obdname format = 'lsi->ls_svname'-osd */ + char lsi_osd_obdname[MTI_NAME_MAXLEN + 4]; + /* lsi_osd_uuid format = 'lsi->ls_osd_obdname'_UUID */ + char lsi_osd_uuid[MTI_NAME_MAXLEN + 9]; + struct obd_export *lsi_osd_exp; + char lsi_osd_type[16]; + char lsi_fstype[16]; + struct backing_dev_info lsi_bdi; /* each client mountpoint needs + own backing_dev_info */ + /* protect lsi_lwp_list */ + struct mutex lsi_lwp_mutex; + struct list_head lsi_lwp_list; + unsigned long lsi_lwp_started:1, + lsi_server_started:1; +#ifdef CONFIG_LL_ENCRYPTION + const struct llcrypt_operations *lsi_cop; + struct key *lsi_master_keys; /* master crypto keys used */ +#elif defined(HAVE_LUSTRE_CRYPTO) && !defined(HAVE_FSCRYPT_DUMMY_CONTEXT_ENABLED) + /* Encryption context for '-o test_dummy_encryption' */ + struct llcrypt_dummy_context lsi_dummy_enc_ctx; +#endif +}; + +#define LSI_UMOUNT_FAILOVER 0x00200000 +#ifndef HAVE_SUPER_SETUP_BDI_NAME +#define LSI_BDI_INITIALIZED 0x00400000 +#endif +#define LSI_FILENAME_ENC 0x00800000 /* enable name encryption */ +#define LSI_FILENAME_ENC_B64_OLD_CLI 0x01000000 /* use old style base64 */ + +#define s2lsi(sb) ((struct lustre_sb_info *)((sb)->s_fs_info)) +#define s2lsi_nocast(sb) ((sb)->s_fs_info) + +#define get_profile_name(sb) (s2lsi(sb)->lsi_lmd->lmd_profile) +#define get_mount_fileset(sb) (s2lsi(sb)->lsi_lmd->lmd_fileset) + +# ifdef HAVE_SERVER_SUPPORT +/* opc for target register */ +#define LDD_F_OPC_REG 0x10000000 +#define LDD_F_OPC_UNREG 0x20000000 +#define LDD_F_OPC_READY 0x40000000 +#define LDD_F_OPC_MASK 0xf0000000 + +#define LDD_F_MASK 0xFFFF + +/* + * This limit is arbitrary (131072 clients on x86), but it is convenient to use + * 2^n * PAGE_SIZE * 8 for the number of bits that fit an order-n allocation. + * If we need more than 131072 clients (order-2 allocation on x86) then this + * should become an array of single-page pointers that are allocated on demand. + */ +#if (128 * 1024UL) > (PAGE_SIZE * 8) +#define LR_MAX_CLIENTS (128 * 1024UL) +#else +#define LR_MAX_CLIENTS (PAGE_SIZE * 8) +#endif + +/** COMPAT_146: this is an OST (temporary) */ +#define OBD_COMPAT_OST 0x00000002 +/** COMPAT_146: this is an MDT (temporary) */ +#define OBD_COMPAT_MDT 0x00000004 +/** 2.0 server, interop flag to show server version is changed */ +#define OBD_COMPAT_20 0x00000008 + +/** MDS handles LOV_OBJID file */ +#define OBD_ROCOMPAT_LOVOBJID 0x00000001 +/** store OST index in the IDIF */ +#define OBD_ROCOMPAT_IDX_IN_IDIF 0x00000002 + +/** OST handles group subdirs */ +#define OBD_INCOMPAT_GROUPS 0x00000001 +/** this is an OST */ +#define OBD_INCOMPAT_OST 0x00000002 +/** this is an MDT */ +#define OBD_INCOMPAT_MDT 0x00000004 +/** common last_rvcd format */ +#define OBD_INCOMPAT_COMMON_LR 0x00000008 +/** FID is enabled */ +#define OBD_INCOMPAT_FID 0x00000010 +/** Size-on-MDS is enabled */ +#define OBD_INCOMPAT_SOM 0x00000020 +/** filesystem using iam format to store directory entries */ +#define OBD_INCOMPAT_IAM_DIR 0x00000040 +/** LMA attribute contains per-inode incompatible flags */ +#define OBD_INCOMPAT_LMA 0x00000080 +/** lmm_stripe_count has been shrunk from u32 to u16 and the remaining 16 + * bits are now used to store a generation. 
Once we start changing the layout + * and bumping the generation, old versions expecting a 32-bit lmm_stripe_count + * will be confused by interpreting stripe_count | gen << 16 as the actual + * stripe count */ +#define OBD_INCOMPAT_LMM_VER 0x00000100 +/** multiple OI files for MDT */ +#define OBD_INCOMPAT_MULTI_OI 0x00000200 +/** multiple RPCs in flight */ +#define OBD_INCOMPAT_MULTI_RPCS 0x00000400 + +/* last_rcvd handling */ +static inline void lsd_le_to_cpu(struct lr_server_data *buf, + struct lr_server_data *lsd) +{ + int i; + + memcpy(lsd->lsd_uuid, buf->lsd_uuid, sizeof(lsd->lsd_uuid)); + lsd->lsd_last_transno = le64_to_cpu(buf->lsd_last_transno); + lsd->lsd_compat14 = le64_to_cpu(buf->lsd_compat14); + lsd->lsd_mount_count = le64_to_cpu(buf->lsd_mount_count); + lsd->lsd_feature_compat = le32_to_cpu(buf->lsd_feature_compat); + lsd->lsd_feature_rocompat = le32_to_cpu(buf->lsd_feature_rocompat); + lsd->lsd_feature_incompat = le32_to_cpu(buf->lsd_feature_incompat); + lsd->lsd_server_size = le32_to_cpu(buf->lsd_server_size); + lsd->lsd_client_start = le32_to_cpu(buf->lsd_client_start); + lsd->lsd_client_size = le16_to_cpu(buf->lsd_client_size); + lsd->lsd_subdir_count = le16_to_cpu(buf->lsd_subdir_count); + lsd->lsd_catalog_oid = le64_to_cpu(buf->lsd_catalog_oid); + lsd->lsd_catalog_ogen = le32_to_cpu(buf->lsd_catalog_ogen); + memcpy(lsd->lsd_peeruuid, buf->lsd_peeruuid, sizeof(lsd->lsd_peeruuid)); + lsd->lsd_osd_index = le32_to_cpu(buf->lsd_osd_index); + lsd->lsd_padding1 = le32_to_cpu(buf->lsd_padding1); + lsd->lsd_start_epoch = le32_to_cpu(buf->lsd_start_epoch); + for (i = 0; i < LR_EXPIRE_INTERVALS; i++) + lsd->lsd_trans_table[i] = le64_to_cpu(buf->lsd_trans_table[i]); + lsd->lsd_trans_table_time = le32_to_cpu(buf->lsd_trans_table_time); + lsd->lsd_expire_intervals = le32_to_cpu(buf->lsd_expire_intervals); +} + +static inline void lsd_cpu_to_le(struct lr_server_data *lsd, + struct lr_server_data *buf) +{ + int i; + + memcpy(buf->lsd_uuid, lsd->lsd_uuid, sizeof(buf->lsd_uuid)); + buf->lsd_last_transno = cpu_to_le64(lsd->lsd_last_transno); + buf->lsd_compat14 = cpu_to_le64(lsd->lsd_compat14); + buf->lsd_mount_count = cpu_to_le64(lsd->lsd_mount_count); + buf->lsd_feature_compat = cpu_to_le32(lsd->lsd_feature_compat); + buf->lsd_feature_rocompat = cpu_to_le32(lsd->lsd_feature_rocompat); + buf->lsd_feature_incompat = cpu_to_le32(lsd->lsd_feature_incompat); + buf->lsd_server_size = cpu_to_le32(lsd->lsd_server_size); + buf->lsd_client_start = cpu_to_le32(lsd->lsd_client_start); + buf->lsd_client_size = cpu_to_le16(lsd->lsd_client_size); + buf->lsd_subdir_count = cpu_to_le16(lsd->lsd_subdir_count); + buf->lsd_catalog_oid = cpu_to_le64(lsd->lsd_catalog_oid); + buf->lsd_catalog_ogen = cpu_to_le32(lsd->lsd_catalog_ogen); + memcpy(buf->lsd_peeruuid, lsd->lsd_peeruuid, sizeof(buf->lsd_peeruuid)); + buf->lsd_osd_index = cpu_to_le32(lsd->lsd_osd_index); + buf->lsd_padding1 = cpu_to_le32(lsd->lsd_padding1); + buf->lsd_start_epoch = cpu_to_le32(lsd->lsd_start_epoch); + for (i = 0; i < LR_EXPIRE_INTERVALS; i++) + buf->lsd_trans_table[i] = cpu_to_le64(lsd->lsd_trans_table[i]); + buf->lsd_trans_table_time = cpu_to_le32(lsd->lsd_trans_table_time); + buf->lsd_expire_intervals = cpu_to_le32(lsd->lsd_expire_intervals); +} + +static inline void lcd_le_to_cpu(struct lsd_client_data *buf, + struct lsd_client_data *lcd) +{ + memcpy(lcd->lcd_uuid, buf->lcd_uuid, sizeof (lcd->lcd_uuid)); + lcd->lcd_last_transno = le64_to_cpu(buf->lcd_last_transno); + lcd->lcd_last_xid = le64_to_cpu(buf->lcd_last_xid); + 
lcd->lcd_last_result = le32_to_cpu(buf->lcd_last_result); + lcd->lcd_last_data = le32_to_cpu(buf->lcd_last_data); + lcd->lcd_last_close_transno = le64_to_cpu(buf->lcd_last_close_transno); + lcd->lcd_last_close_xid = le64_to_cpu(buf->lcd_last_close_xid); + lcd->lcd_last_close_result = le32_to_cpu(buf->lcd_last_close_result); + lcd->lcd_last_close_data = le32_to_cpu(buf->lcd_last_close_data); + lcd->lcd_pre_versions[0] = le64_to_cpu(buf->lcd_pre_versions[0]); + lcd->lcd_pre_versions[1] = le64_to_cpu(buf->lcd_pre_versions[1]); + lcd->lcd_pre_versions[2] = le64_to_cpu(buf->lcd_pre_versions[2]); + lcd->lcd_pre_versions[3] = le64_to_cpu(buf->lcd_pre_versions[3]); + lcd->lcd_last_epoch = le32_to_cpu(buf->lcd_last_epoch); + lcd->lcd_generation = le32_to_cpu(buf->lcd_generation); +} + +static inline void lcd_cpu_to_le(struct lsd_client_data *lcd, + struct lsd_client_data *buf) +{ + memcpy(buf->lcd_uuid, lcd->lcd_uuid, sizeof (lcd->lcd_uuid)); + buf->lcd_last_transno = cpu_to_le64(lcd->lcd_last_transno); + buf->lcd_last_xid = cpu_to_le64(lcd->lcd_last_xid); + buf->lcd_last_result = cpu_to_le32(lcd->lcd_last_result); + buf->lcd_last_data = cpu_to_le32(lcd->lcd_last_data); + buf->lcd_last_close_transno = cpu_to_le64(lcd->lcd_last_close_transno); + buf->lcd_last_close_xid = cpu_to_le64(lcd->lcd_last_close_xid); + buf->lcd_last_close_result = cpu_to_le32(lcd->lcd_last_close_result); + buf->lcd_last_close_data = cpu_to_le32(lcd->lcd_last_close_data); + buf->lcd_pre_versions[0] = cpu_to_le64(lcd->lcd_pre_versions[0]); + buf->lcd_pre_versions[1] = cpu_to_le64(lcd->lcd_pre_versions[1]); + buf->lcd_pre_versions[2] = cpu_to_le64(lcd->lcd_pre_versions[2]); + buf->lcd_pre_versions[3] = cpu_to_le64(lcd->lcd_pre_versions[3]); + buf->lcd_last_epoch = cpu_to_le32(lcd->lcd_last_epoch); + buf->lcd_generation = cpu_to_le32(lcd->lcd_generation); +} + +static inline u64 lcd_last_transno(struct lsd_client_data *lcd) +{ + return (lcd->lcd_last_transno > lcd->lcd_last_close_transno ? + lcd->lcd_last_transno : lcd->lcd_last_close_transno); +} + +static inline u64 lcd_last_xid(struct lsd_client_data *lcd) +{ + return (lcd->lcd_last_xid > lcd->lcd_last_close_xid ? 
+ lcd->lcd_last_xid : lcd->lcd_last_close_xid); +} + +/****************** mount lookup info *********************/ + +struct lustre_mount_info { + char *lmi_name; + struct super_block *lmi_sb; + struct list_head lmi_list_chain; +}; + +/****************** prototypes *********************/ + +/* obd_mount_server.c */ +int server_fill_super(struct super_block *sb); +struct lustre_mount_info *server_get_mount(const char *name); +int server_put_mount(const char *name, bool dereg_mnt); +struct mgs_target_info; +int server_mti_print(const char *title, struct mgs_target_info *mti); +void server_calc_timeout(struct lustre_sb_info *lsi, struct obd_device *obd); + +/* obd_mount.c */ +int server_name2svname(const char *label, char *svname, const char **endptr, + size_t svsize); + +int server_name_is_ost(const char *svname); +int target_name2index(const char *svname, u32 *idx, const char **endptr); + +int lustre_put_lsi(struct super_block *sb); +int lustre_start_simple(char *obdname, char *type, char *uuid, + char *s1, char *s2, char *s3, char *s4); +#endif /* HAVE_SERVER_SUPPORT */ +int server_name2fsname(const char *svname, char *fsname, const char **endptr); +void obdname2fsname(const char *tgt, char *fsname, size_t fslen); + +int lustre_start_mgc(struct super_block *sb); +int lustre_common_put_super(struct super_block *sb); + +struct lustre_sb_info *lustre_init_lsi(struct super_block *sb); +int lustre_put_lsi(struct super_block *sb); +int lmd_parse(char *options, struct lustre_mount_data *lmd); + +/* mgc_request.c */ +int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id, + enum mgs_cfg_type type); +int mgc_logname2resid(char *fsname, struct ldlm_res_id *res_id, + enum mgs_cfg_type type); + +/** @} disk */ + +#endif /* _LUSTRE_DISK_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_dlm.h b/drivers/staging/lustrefsx/lustre/include/lustre_dlm.h new file mode 100644 index 0000000000000..dc738f4184c29 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_dlm.h @@ -0,0 +1,1865 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +/** \defgroup LDLM Lustre Distributed Lock Manager + * + * Lustre DLM is based on VAX DLM. + * Its two main roles are: + * - To provide locking assuring consistency of data on all Lustre nodes. + * - To allow clients to cache state protected by a lock by holding the + * lock until a conflicting lock is requested or it is expired by the LRU. 
+ *
+ * @{
+ */
+
+#ifndef _LUSTRE_DLM_H__
+#define _LUSTRE_DLM_H__
+
+#include
+#include
+#include
+#include
+#include /* for interval_node{}, ldlm_extent */
+#include
+
+#include "lustre_dlm_flags.h"
+
+struct obd_ops;
+struct obd_device;
+
+extern struct kset *ldlm_ns_kset;
+extern struct kset *ldlm_svc_kset;
+
+#define OBD_LDLM_DEVICENAME "ldlm"
+
+#define LDLM_DEFAULT_LRU_SIZE (100 * num_online_cpus())
+#define LDLM_DEFAULT_MAX_ALIVE 3900 /* 3900 seconds ~65 min */
+#define LDLM_CTIME_AGE_LIMIT (10)
+/* if a client lock is unused for that long it can be cancelled if any other
+ * client shows interest in that lock, e.g. when a glimpse occurs. */
+#define LDLM_DIRTY_AGE_LIMIT (10)
+#define LDLM_DEFAULT_PARALLEL_AST_LIMIT 1024
+#define LDLM_DEFAULT_LRU_SHRINK_BATCH (16)
+#define LDLM_DEFAULT_SLV_RECALC_PCT (10)
+
+/**
+ * LDLM non-error return states
+ */
+enum ldlm_error {
+ ELDLM_OK = 0,
+ ELDLM_LOCK_MATCHED = 1,
+
+ ELDLM_LOCK_CHANGED = 300,
+ ELDLM_LOCK_ABORTED = 301,
+ ELDLM_LOCK_REPLACED = 302,
+ ELDLM_NO_LOCK_DATA = 303,
+ ELDLM_LOCK_WOULDBLOCK = 304,
+
+ ELDLM_NAMESPACE_EXISTS = 400,
+ ELDLM_BAD_NAMESPACE = 401,
+};
+
+/**
+ * LDLM namespace type.
+ * The "client" type is actually an indication that this is a narrow local view
+ * into complete namespace on the server. Such namespaces cannot make any
+ * decisions about lack of conflicts or do any autonomous lock granting without
+ * first speaking to a server.
+ */
+enum ldlm_side {
+ LDLM_NAMESPACE_SERVER = 0x01,
+ LDLM_NAMESPACE_CLIENT = 0x02
+};
+
+/**
+ * The blocking callback is overloaded to perform two functions. These flags
+ * indicate which operation should be performed.
+ */
+#define LDLM_CB_BLOCKING 1
+#define LDLM_CB_CANCELING 2
+
+/**
+ * \name Lock Compatibility Matrix.
+ *
+ * A lock has both a type (extent, flock, inode bits, or plain) and a mode.
+ * Lock types are described in their respective implementation files:
+ * ldlm_{extent,flock,inodebits,plain}.c.
+ *
+ * There are six lock modes along with a compatibility matrix to indicate if
+ * two locks are compatible.
+ *
+ * - EX: Exclusive mode. Before a new file is created, MDS requests EX lock
+ * on the parent.
+ * - PW: Protective Write (normal write) mode. When a client requests a write
+ * lock from an OST, a lock with PW mode will be issued.
+ * - PR: Protective Read (normal read) mode. When a client requests a read from
+ * an OST, a lock with PR mode will be issued. Also, if the client opens a
+ * file for execution, it is granted a lock with PR mode.
+ * - CW: Concurrent Write mode. The type of lock that the MDS grants if a client
+ * requests a write lock during a file open operation.
+ * - CR: Concurrent Read mode. When a client performs a path lookup, MDS grants
+ * an inodebit lock with the CR mode on the intermediate path component.
+ * - NL: Null mode.
+ *
+ * <PRE>
+ *       NL  CR  CW  PR  PW  EX
+ *  NL    1   1   1   1   1   1
+ *  CR    1   1   1   1   1   0
+ *  CW    1   1   1   0   0   0
+ *  PR    1   1   0   1   0   0
+ *  PW    1   1   0   0   0   0
+ *  EX    1   0   0   0   0   0
+ * 
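+ *
+ * Reading the matrix above: for example, a granted PR lock is compatible
+ * with another PR request (lockmode_compat(LCK_PR, LCK_PR) != 0), but not
+ * with a PW request (lockmode_compat(LCK_PR, LCK_PW) == 0).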
+ * </PRE>
+ */
+/** @{ */
+#define LCK_COMPAT_EX LCK_NL
+#define LCK_COMPAT_PW (LCK_COMPAT_EX | LCK_CR)
+#define LCK_COMPAT_PR (LCK_COMPAT_PW | LCK_PR)
+#define LCK_COMPAT_CW (LCK_COMPAT_PW | LCK_CW)
+#define LCK_COMPAT_CR (LCK_COMPAT_CW | LCK_PR | LCK_PW)
+#define LCK_COMPAT_NL (LCK_COMPAT_CR | LCK_EX | LCK_GROUP)
+#define LCK_COMPAT_GROUP (LCK_GROUP | LCK_NL)
+#define LCK_COMPAT_COS (LCK_COS)
+/** @} Lock Compatibility Matrix */
+
+extern enum ldlm_mode lck_compat_array[];
+
+static inline void lockmode_verify(enum ldlm_mode mode)
+{
+ LASSERT(mode > LCK_MINMODE && mode < LCK_MAXMODE);
+}
+
+static inline int lockmode_compat(enum ldlm_mode exist_mode,
+ enum ldlm_mode new_mode)
+{
+ return lck_compat_array[exist_mode] & new_mode;
+}
+
+/*
+ *
+ * cluster name spaces
+ *
+ */
+
+#define DLM_OST_NAMESPACE 1
+#define DLM_MDS_NAMESPACE 2
+
+/* XXX
+ - do we just separate this by security domains and use a prefix for
+ multiple namespaces in the same domain?
+ -
+*/
+
+/**
+ * Locking rules for LDLM:
+ *
+ * lr_lock
+ *
+ * lr_lock
+ * waiting_locks_spinlock
+ *
+ * lr_lock
+ * led_lock
+ *
+ * lr_lock
+ * ns_lock
+ *
+ * lr_lvb_mutex
+ * lr_lock
+ *
+ */
+
+/* Cancel lru flag, it indicates we cancel aged locks. */
+enum ldlm_lru_flags {
+ LDLM_LRU_FLAG_NO_WAIT = 0x1, /* Cancel locks w/o blocking (neither
+ * sending nor waiting for any RPCs) */
+ LDLM_LRU_FLAG_CLEANUP = 0x2, /* Used when clearing lru, tells
+ * prepare_lru_list to set discard flag
+ * on PR extent locks so we don't waste
+ * time saving pages that will be
+ * discarded momentarily */
+};
+
+struct ldlm_pool;
+struct ldlm_lock;
+struct ldlm_resource;
+struct ldlm_namespace;
+
+/**
+ * Operations on LDLM pools.
+ * LDLM pool is a pool of locks in the namespace without any implicitly
+ * specified limits.
+ * Locks in the pool are organized in LRU.
+ * Local memory pressure or server instructions (e.g. mempressure on server)
+ * can trigger freeing of locks from the pool
+ */
+struct ldlm_pool_ops {
+ /** Recalculate pool \a pl usage */
+ int (*po_recalc)(struct ldlm_pool *pl, bool force);
+ /** Cancel at least \a nr locks from pool \a pl */
+ int (*po_shrink)(struct ldlm_pool *pl, int nr, gfp_t gfp_mask);
+ int (*po_setup)(struct ldlm_pool *pl, int limit);
+};
+
+/** One second for pools thread check interval. Each pool has its own period. */
+#define LDLM_POOLS_THREAD_PERIOD (1)
+
+/** ~6% margin for modest pools. See ldlm_pool.c for details. */
+#define LDLM_POOLS_MODEST_MARGIN_SHIFT (4)
+
+/** Default recalc period for server side pools in sec. */
+#define LDLM_POOL_SRV_DEF_RECALC_PERIOD (1)
+
+/** Default recalc period for client side pools in sec. */
+#define LDLM_POOL_CLI_DEF_RECALC_PERIOD (10)
+
+/**
+ * LDLM pool structure to track granted locks.
+ * For purposes of determining when to release locks on e.g. memory pressure.
+ * This feature is commonly referred to as lru_resize.
+ */
+struct ldlm_pool {
+ /** Pool debugfs directory. */
+ struct dentry *pl_debugfs_entry;
+ /** Pool name, must be long enough to hold compound proc entry name. */
+ char pl_name[100];
+ /** Lock for protecting SLV/CLV updates. */
+ spinlock_t pl_lock;
+ /** Number of allowed locks in the pool, on both client and server side. */
+ atomic_t pl_limit;
+ /** Number of granted locks in the pool. */
+ atomic_t pl_granted;
+ /** Grant rate per T. */
+ atomic_t pl_grant_rate;
+ /** Cancel rate per T. */
+ atomic_t pl_cancel_rate;
+ /** Server lock volume (SLV). Protected by pl_lock. */
+ __u64 pl_server_lock_volume;
+ /** Current biggest client lock volume.
Protected by pl_lock. */ + __u64 pl_client_lock_volume; + /** Lock volume factor, shown in percents in procfs, but internally + * Client SLV calculated as: server_slv * lock_volume_factor >> 8. + */ + atomic_t pl_lock_volume_factor; + /** Time when last SLV from server was obtained. */ + time64_t pl_recalc_time; + /** Recalculation period for pool. */ + time64_t pl_recalc_period; + /** Recalculation and shrink operations. */ + struct ldlm_pool_ops *pl_ops; + /** Number of planned locks for next period. */ + int pl_grant_plan; + /** Pool statistics. */ + struct lprocfs_stats *pl_stats; + + /* sysfs object */ + struct kobject pl_kobj; + struct completion pl_kobj_unregister; +}; + +typedef int (*ldlm_res_policy)(const struct lu_env *env, + struct ldlm_namespace *, + struct ldlm_lock **, void *req_cookie, + enum ldlm_mode mode, __u64 flags, void *data); + +typedef int (*ldlm_cancel_cbt)(struct ldlm_lock *lock); + +/** + * LVB operations. + * LVB is Lock Value Block. This is a special opaque (to LDLM) value that could + * be associated with an LDLM lock and transferred from client to server and + * back. + * + * Currently LVBs are used by: + * - OSC-OST code to maintain current object size/times + * - layout lock code to return the layout when the layout lock is granted + * + * To ensure delayed LVB initialization, it is highly recommended to use the set + * of ldlm_[res_]lvbo_[init,update,fill]() functions. + */ +struct ldlm_valblock_ops { + int (*lvbo_init)(struct ldlm_resource *res); + int (*lvbo_update)(struct ldlm_resource *res, struct ldlm_lock *lock, + struct ptlrpc_request *r, int increase); + int (*lvbo_free)(struct ldlm_resource *res); + /* Return size of lvb data appropriate RPC size can be reserved */ + int (*lvbo_size)(struct ldlm_lock *lock); + /* Called to fill in lvb data to RPC buffer @buf */ + int (*lvbo_fill)(struct ldlm_lock *lock, void *buf, int *buflen); +}; + +/** + * LDLM pools related, type of lock pool in the namespace. + * Greedy means release cached locks aggressively + */ +enum ldlm_appetite { + LDLM_NAMESPACE_GREEDY = BIT(0), + LDLM_NAMESPACE_MODEST = BIT(1), +}; + +/** + * Default values for the "max_nolock_size", "contention_time" and + * "contended_locks" namespace tunables. + */ +#define NS_DEFAULT_MAX_NOLOCK_BYTES 0 +#define NS_DEFAULT_CONTENTION_SECONDS 2 +#define NS_DEFAULT_CONTENDED_LOCKS 32 + +struct ldlm_ns_bucket { + /** back pointer to namespace */ + struct ldlm_namespace *nsb_namespace; + /** + * Estimated lock callback time. Used by adaptive timeout code to + * avoid spurious client evictions due to unresponsiveness when in + * fact the network or overall system load is at fault + */ + struct adaptive_timeout nsb_at_estimate; + /** + * Which res in the bucket should we start with the reclaim. + */ + int nsb_reclaim_start; + /* counter of entries in this bucket */ + atomic_t nsb_count; +}; + +enum { + /** LDLM namespace lock stats */ + LDLM_NSS_LOCKS = 0, + LDLM_NSS_LAST +}; + +enum ldlm_ns_type { + LDLM_NS_TYPE_UNKNOWN = 0, /**< invalid type */ + LDLM_NS_TYPE_MDC, /**< MDC namespace */ + LDLM_NS_TYPE_MDT, /**< MDT namespace */ + LDLM_NS_TYPE_OSC, /**< OSC namespace */ + LDLM_NS_TYPE_OST, /**< OST namespace */ + LDLM_NS_TYPE_MGC, /**< MGC namespace */ + LDLM_NS_TYPE_MGT, /**< MGT namespace */ +}; + +enum ldlm_namespace_flags { + /** + * Flag to indicate the LRU cancel is in progress. + * Used to limit the process by 1 thread only. + */ + LDLM_LRU_CANCEL = 0 +}; + +/** + * LDLM Namespace. 
+ *
+ * Namespace serves to contain locks related to a particular service.
+ * There are two kinds of namespaces:
+ * - Server namespace has knowledge of all locks and is therefore authoritative
+ * to make decisions like what locks could be granted and what conflicts
+ * exist during new lock enqueue.
+ * - Client namespace only has limited knowledge about locks in the namespace,
+ * only seeing locks held by the client.
+ *
+ * Every Lustre service has one server namespace present on the server serving
+ * that service. Every client connected to the service has a client namespace
+ * for it.
+ * Every lock obtained by a client in that namespace is actually represented by
+ * two in-memory locks. One on the server and one on the client. The locks are
+ * linked by a special cookie by which one node can tell the other which lock
+ * it actually means during communications. Such locks are called remote locks.
+ * Locks held only by the server, without any reference to a client, are called
+ * local locks.
+ */
+struct ldlm_namespace {
+ /** Backward link to OBD, required for LDLM pool to store new SLV. */
+ struct obd_device *ns_obd;
+
+ /** Flag indicating if namespace is on client instead of server */
+ enum ldlm_side ns_client;
+
+ /** name of this namespace */
+ char *ns_name;
+
+ /** Resource hash table for namespace. */
+ struct cfs_hash *ns_rs_hash;
+ struct ldlm_ns_bucket *ns_rs_buckets;
+ unsigned int ns_bucket_bits;
+
+ /** serialize */
+ spinlock_t ns_lock;
+
+ /** big refcount (by bucket) */
+ atomic_t ns_bref;
+
+ /**
+ * Namespace connect flags supported by server (may be changed via
+ * /proc, LRU resize may be disabled/enabled).
+ */
+ __u64 ns_connect_flags;
+
+ /** Client side original connect flags supported by server. */
+ __u64 ns_orig_connect_flags;
+
+ /* namespace debugfs dir entry */
+ struct dentry *ns_debugfs_entry;
+
+ /**
+ * Position in global namespace list linking all namespaces on
+ * the node.
+ */
+ struct list_head ns_list_chain;
+
+ /**
+ * List of unused locks for this namespace. This list is also called
+ * LRU lock list.
+ * Unused locks are locks with zero reader/writer reference counts.
+ * This list is only used on clients for lock caching purposes.
+ * When we want to release some locks voluntarily or if server wants
+ * us to release some locks due to e.g. memory pressure, we take locks
+ * to release from the head of this list.
+ * Locks are linked via l_lru field in \see struct ldlm_lock.
+ */
+ struct list_head ns_unused_list;
+ /** Number of locks in the LRU list above */
+ int ns_nr_unused;
+ struct list_head *ns_last_pos;
+
+ /**
+ * Maximum number of locks permitted in the LRU. If 0, means locks
+ * are managed by pools and there is no preset limit, rather it is all
+ * controlled by available memory on this client and on server.
+ */
+ unsigned int ns_max_unused;
+
+ /**
+ * Cancel batch, if the unused lock count exceeds lru_size.
+ * Only used if LRU resize is disabled.
+ */
+ unsigned int ns_cancel_batch;
+
+ /**
+ * How much the SLV should decrease, in %%, to trigger LRU cancel
+ * urgently.
+ */
+ unsigned int ns_recalc_pct;
+
+ /** Maximum allowed age (last used time) for locks in the LRU. Set in
+ * seconds from userspace, but stored in ns to avoid repeat conversions.
+ */
+ ktime_t ns_max_age;
+
+ /**
+ * Server only: number of times we evicted clients due to lack of reply
+ * to ASTs.
+ */
+ unsigned int ns_timeouts;
+ /**
+ * Number of seconds since the file change time after which
+ * the MDT will return an UPDATE lock along with a LOOKUP lock.
+ * This allows the client to start caching negative dentries
+ * for a directory and may save an RPC for a later stat.
+ */
+ timeout_t ns_ctime_age_limit;
+ /**
+ * Number of (nano)seconds since the lock was last used. The client
+ * may cancel a lock older than this age and flush related data if
+ * another client shows interest in this lock by doing a glimpse request.
+ * This allows stat data for such files to be cached locally early. Set in
+ * seconds from userspace, but stored in ns to avoid repeat conversions.
+ */
+ ktime_t ns_dirty_age_limit;
+ /**
+ * Used to rate-limit ldlm_namespace_dump calls.
+ * \see ldlm_namespace_dump. Increased by 10 seconds every time
+ * it is called.
+ */
+ time64_t ns_next_dump;
+
+ /** "policy" function that does actual lock conflict determination */
+ ldlm_res_policy ns_policy;
+
+ /**
+ * LVB operations for this namespace.
+ * \see struct ldlm_valblock_ops
+ */
+ struct ldlm_valblock_ops *ns_lvbo;
+
+ /**
+ * Used by filter code to store pointer to OBD of the service.
+ * Should be dropped in favor of \a ns_obd
+ */
+ void *ns_lvbp;
+
+ /**
+ * Wait queue used by __ldlm_namespace_free. Gets woken up every time
+ * a resource is removed.
+ */
+ wait_queue_head_t ns_waitq;
+ /** LDLM pool structure for this namespace */
+ struct ldlm_pool ns_pool;
+ /** Definition of how eagerly unused locks will be released from LRU */
+ enum ldlm_appetite ns_appetite;
+
+ /**
+ * If more than \a ns_contended_locks are found, the resource is
+ * considered to be contended. Lock enqueues might specify that no
+ * contended locks should be granted
+ */
+ unsigned ns_contended_locks;
+
+ /**
+ * The resources in this namespace remember contended state during
+ * \a ns_contention_time, in seconds.
+ */
+ timeout_t ns_contention_time;
+
+ /**
+ * Limit size of contended extent locks, in bytes.
+ * If an extent lock is requested for more than this many bytes and
+ * the caller instructs us not to grant contended locks, we would
+ * disregard such a request.
+ */
+ unsigned ns_max_nolock_size;
+
+ /** Limit of parallel AST RPC count. */
+ unsigned ns_max_parallel_ast;
+
+ /**
+ * Callback to check if a lock is good to be canceled by ELC or
+ * during recovery.
+ */
+ ldlm_cancel_cbt ns_cancel;
+
+ /** LDLM lock stats */
+ struct lprocfs_stats *ns_stats;
+
+ /**
+ * Flag to indicate namespace is being freed. Used to determine if
+ * recalculation of LDLM pool statistics should be skipped.
+ */
+ unsigned ns_stopping:1,
+
+ /**
+ * Flag to indicate the LRU recalc on RPC reply is in progress.
+ * Used to limit the process by 1 thread only.
+ */
+ ns_rpc_recalc:1;
+
+ /**
+ * Which bucket should we start with the lock reclaim.
+ */
+ int ns_reclaim_start;
+
+ struct kobject ns_kobj; /* sysfs object */
+ struct completion ns_kobj_unregister;
+
+ /**
+ * To avoid another ns_lock usage, a separate bitops field.
+ */
+ unsigned long ns_flags;
+};
+
+/**
+ * Returns 1 if namespace \a ns is a client namespace.
+ */
+static inline int ns_is_client(struct ldlm_namespace *ns)
+{
+ LASSERT(ns != NULL);
+ LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT ||
+ ns->ns_client == LDLM_NAMESPACE_SERVER);
+ return ns->ns_client == LDLM_NAMESPACE_CLIENT;
+}
+
+/**
+ * Returns 1 if namespace \a ns is a server namespace.
+ */ +static inline int ns_is_server(struct ldlm_namespace *ns) +{ + LASSERT(ns != NULL); + LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT || + ns->ns_client == LDLM_NAMESPACE_SERVER); + return ns->ns_client == LDLM_NAMESPACE_SERVER; +} + +/** + * Returns 1 if namespace \a ns supports early lock cancel (ELC). + */ +static inline int ns_connect_cancelset(struct ldlm_namespace *ns) +{ + LASSERT(ns != NULL); + return !!(ns->ns_connect_flags & OBD_CONNECT_CANCELSET); +} + +/** + * Returns 1 if this namespace supports lru_resize. + */ +static inline int ns_connect_lru_resize(struct ldlm_namespace *ns) +{ + LASSERT(ns != NULL); + return !!(ns->ns_connect_flags & OBD_CONNECT_LRU_RESIZE); +} + +static inline void ns_register_cancel(struct ldlm_namespace *ns, + ldlm_cancel_cbt arg) +{ + LASSERT(ns != NULL); + ns->ns_cancel = arg; +} + +struct ldlm_lock; + +/** Type for blocking callback function of a lock. */ +typedef int (*ldlm_blocking_callback)(struct ldlm_lock *lock, + struct ldlm_lock_desc *new, void *data, + int flag); +/** Type for completion callback function of a lock. */ +typedef int (*ldlm_completion_callback)(struct ldlm_lock *lock, __u64 flags, + void *data); +/** Type for glimpse callback function of a lock. */ +typedef int (*ldlm_glimpse_callback)(struct ldlm_lock *lock, void *data); + +/** Type for created callback function of a lock. */ +typedef void (*ldlm_created_callback)(struct ldlm_lock *lock); + +/** Work list for sending GL ASTs to multiple locks. */ +struct ldlm_glimpse_work { + struct ldlm_lock *gl_lock; /* lock to glimpse */ + struct list_head gl_list; /* linkage to other gl work structs */ + __u32 gl_flags;/* see LDLM_GL_WORK_* below */ + union ldlm_gl_desc *gl_desc; /* glimpse descriptor to be packed in + * glimpse callback request */ + ptlrpc_interpterer_t gl_interpret_reply; + void *gl_interpret_data; +}; + +struct ldlm_bl_desc { + unsigned int bl_same_client:1, + bl_cos_incompat:1; +}; + +struct ldlm_cb_set_arg { + struct ptlrpc_request_set *set; + int type; /* LDLM_{CP,BL,GL}_CALLBACK */ + atomic_t restart; + struct list_head *list; + union ldlm_gl_desc *gl_desc; /* glimpse AST descriptor */ + ptlrpc_interpterer_t gl_interpret_reply; + void *gl_interpret_data; + struct ldlm_bl_desc *bl_desc; +}; + +struct ldlm_cb_async_args { + struct ldlm_cb_set_arg *ca_set_arg; + struct ldlm_lock *ca_lock; +}; + +/** The ldlm_glimpse_work was slab allocated & must be freed accordingly.*/ +#define LDLM_GL_WORK_SLAB_ALLOCATED 0x1 + +/** Interval node data for each LDLM_EXTENT lock. */ +struct ldlm_interval { + struct interval_node li_node; /* node for tree management */ + struct list_head li_group; /* the locks which have the same + * policy - group of the policy */ +}; +#define to_ldlm_interval(n) container_of(n, struct ldlm_interval, li_node) + +/** + * Interval tree for extent locks. + * The interval tree must be accessed under the resource lock. + * Interval trees are used for granted extent locks to speed up conflicts + * lookup. See ldlm/interval_tree.c for more details. + */ +struct ldlm_interval_tree { + /** Tree size. */ + int lit_size; + enum ldlm_mode lit_mode; /* lock mode */ + struct interval_node *lit_root; /* actual ldlm_interval */ +}; + +/** + * Lists of waiting locks for each inodebit type. + * A lock can be in several liq_waiting lists and it remains in lr_waiting. 
+ */
+struct ldlm_ibits_queues {
+ struct list_head liq_waiting[MDS_INODELOCK_NUMBITS];
+};
+
+struct ldlm_ibits_node {
+ struct list_head lin_link[MDS_INODELOCK_NUMBITS];
+ struct ldlm_lock *lock;
+};
+
+/** Whether to track references to exports by LDLM locks. */
+#define LUSTRE_TRACKS_LOCK_EXP_REFS (0)
+
+/** Cancel flags. */
+enum ldlm_cancel_flags {
+ LCF_ASYNC = 0x1, /* Cancel locks asynchronously. */
+ LCF_LOCAL = 0x2, /* Cancel locks locally, not notifying server */
+ LCF_BL_AST = 0x4, /* Cancel LDLM_FL_BL_AST locks in the same RPC */
+};
+
+struct ldlm_flock {
+ __u64 start;
+ __u64 end;
+ __u64 owner;
+ __u64 blocking_owner;
+ struct obd_export *blocking_export;
+ atomic_t blocking_refs;
+ __u32 pid;
+};
+
+union ldlm_policy_data {
+ struct ldlm_extent l_extent;
+ struct ldlm_flock l_flock;
+ struct ldlm_inodebits l_inodebits;
+};
+
+void ldlm_convert_policy_to_wire(enum ldlm_type type,
+ const union ldlm_policy_data *lpolicy,
+ union ldlm_wire_policy_data *wpolicy);
+void ldlm_convert_policy_to_local(struct obd_export *exp, enum ldlm_type type,
+ const union ldlm_wire_policy_data *wpolicy,
+ union ldlm_policy_data *lpolicy);
+
+enum lvb_type {
+ LVB_T_NONE = 0,
+ LVB_T_OST = 1,
+ LVB_T_LQUOTA = 2,
+ LVB_T_LAYOUT = 3,
+};
+
+/**
+ * LDLM_GID_ANY is used to match any group id in ldlm_lock_match().
+ */
+#define LDLM_GID_ANY ((__u64)-1)
+
+/**
+ * LDLM lock structure
+ *
+ * Represents a single LDLM lock and its state in memory. Each lock is
+ * associated with a single ldlm_resource, the object which is being
+ * locked. There may be multiple ldlm_locks on a single resource,
+ * depending on the lock type and whether the locks are conflicting or
+ * not.
+ */
+struct ldlm_lock {
+ /**
+ * Local lock handle.
+ * When remote side wants to tell us about a lock, they address
+ * it by this opaque handle. The handle does not hold a
+ * reference on the ldlm_lock, so it can be safely passed to
+ * other threads or nodes. When the lock needs to be accessed
+ * from the handle, it is looked up again in the lock table, and
+ * may no longer exist.
+ *
+ * Must be first in the structure.
+ */
+ struct portals_handle l_handle;
+ /**
+ * Pointer to actual resource this lock is in.
+ * ldlm_lock_change_resource() can change this on the client.
+ * When this is possible, RCU must be used to stabilise
+ * the resource while we lock and check it hasn't been changed.
+ */
+ struct ldlm_resource *l_resource;
+ /**
+ * List item for client side LRU list.
+ * Protected by ns_lock in struct ldlm_namespace.
+ */
+ struct list_head l_lru;
+ /**
+ * Linkage to resource's lock queues according to current lock state.
+ * (could be granted or waiting)
+ * Protected by lr_lock in struct ldlm_resource.
+ */
+ struct list_head l_res_link;
+ /**
+ * Internal structures per lock type.
+ */
+ union {
+ struct ldlm_interval *l_tree_node;
+ struct ldlm_ibits_node *l_ibits_node;
+ };
+ /**
+ * Per export hash of locks.
+ * Protected by per-bucket exp->exp_lock_hash locks.
+ */
+ struct hlist_node l_exp_hash;
+ /**
+ * Per export hash of flock locks.
+ * Protected by per-bucket exp->exp_flock_hash locks.
+ */
+ struct hlist_node l_exp_flock_hash;
+ /**
+ * Requested mode.
+ * Protected by lr_lock.
+ */
+ enum ldlm_mode l_req_mode;
+ /**
+ * Granted mode, also protected by lr_lock.
+ */
+ enum ldlm_mode l_granted_mode;
+ /** Lock completion handler pointer. Called when lock is granted. */
+ ldlm_completion_callback l_completion_ast;
+ /**
+ * Lock blocking AST handler pointer. 
+ * It plays two roles:
+ * - as a notification of an attempt to queue a conflicting lock (once)
+ * - as a notification when the lock is being cancelled.
+ *
+ * As such it's typically called twice: once for the initial conflict
+ * and then once more when the last user went away and the lock is
+ * cancelled (could happen recursively).
+ */
+ ldlm_blocking_callback l_blocking_ast;
+ /**
+ * Lock glimpse handler.
+ * Glimpse handler is used to obtain LVB updates from a client by
+ * the server
+ */
+ ldlm_glimpse_callback l_glimpse_ast;
+
+ /**
+ * Lock export.
+ * This is a pointer to actual client export for locks that were granted
+ * to clients. Used server-side.
+ */
+ struct obd_export *l_export;
+ /**
+ * Lock connection export.
+ * Pointer to server export on a client.
+ */
+ struct obd_export *l_conn_export;
+
+ /**
+ * Remote lock handle.
+ * If the lock is remote, this is the handle of the other side lock
+ * (l_handle)
+ */
+ struct lustre_handle l_remote_handle;
+
+ /**
+ * Representation of private data specific for a lock type.
+ * Examples are: extent range for extent lock or bitmask for ibits locks
+ */
+ union ldlm_policy_data l_policy_data;
+
+ /**
+ * Lock state flags. Protected by lr_lock.
+ * \see lustre_dlm_flags.h where the bits are defined.
+ */
+ __u64 l_flags;
+
+ /**
+ * Lock r/w usage counters.
+ * Protected by lr_lock.
+ */
+ __u32 l_readers;
+ __u32 l_writers;
+ /**
+ * If the lock is granted, a process sleeps on this waitq to learn when
+ * it's no longer in use. If the lock is not granted, a process sleeps
+ * on this waitq to learn when it becomes granted.
+ */
+ wait_queue_head_t l_waitq;
+
+ /**
+ * Time, in nanoseconds, when the lock was last used, e.g. when it was
+ * last matched by a lock match.
+ */
+ ktime_t l_last_used;
+
+ /** Originally requested extent for the extent lock. */
+ struct ldlm_extent l_req_extent;
+
+ /*
+ * Client-side-only members.
+ */
+
+ enum lvb_type l_lvb_type;
+
+ /**
+ * Temporary storage for a LVB received during an enqueue operation.
+ * May be vmalloc'd, so needs to be freed with OBD_FREE_LARGE().
+ */
+ __u32 l_lvb_len;
+ void *l_lvb_data;
+
+ /** Private storage for lock user. Opaque to LDLM. */
+ void *l_ast_data;
+
+ union {
+ /**
+ * Seconds. It is updated on any activity related to the lock
+ * at the client, e.g. enqueueing the lock. For the server it is
+ * the time when the blocking AST was sent.
+ */
+ time64_t l_activity;
+ time64_t l_blast_sent;
+ };
+
+ /* separate ost_lvb used mostly by Data-on-MDT for now.
+ * It is introduced so as not to mix with layout lock data. */
+ struct ost_lvb l_ost_lvb;
+ /*
+ * Server-side-only members.
+ */
+
+ /**
+ * Connection cookie for the client originating the operation.
+ * Used by Commit on Share (COS) code. Currently only used for
+ * inodebits locks on MDS.
+ */
+ __u64 l_client_cookie;
+
+ /**
+ * List item for locks waiting for cancellation from clients.
+ * The lists this could be linked into are:
+ * waiting_locks_list (protected by waiting_locks_spinlock),
+ * then if the lock timed out, it is moved to
+ * expired_lock_list for further processing.
+ */
+ struct list_head l_pending_chain;
+
+ /**
+ * Set when lock is sent a blocking AST. Time in seconds when timeout
+ * is reached and client holding this lock could be evicted.
+ * This timeout could be further extended by e.g. certain IO activity
+ * under this lock.
+ * \see ost_rw_prolong_locks
+ */
+ time64_t l_callback_timestamp;
+
+ /** Local PID of process which created this lock. 
*/
+ __u32 l_pid;
+
+ /**
+ * Number of times blocking AST was sent for this lock.
+ * This is for debugging. Valid values are 0 and 1; if there is an
+ * attempt to send blocking AST more than once, an assertion would be
+ * hit. \see ldlm_work_bl_ast_lock
+ */
+ int l_bl_ast_run;
+ /** List item ldlm_add_ast_work_item() for case of blocking ASTs. */
+ struct list_head l_bl_ast;
+ /** List item ldlm_add_ast_work_item() for case of completion ASTs. */
+ struct list_head l_cp_ast;
+ /** For ldlm_add_ast_work_item() for "revoke" AST used in COS. */
+ struct list_head l_rk_ast;
+
+ /**
+ * Pointer to a conflicting lock that caused blocking AST to be sent
+ * for this lock
+ */
+ struct ldlm_lock *l_blocking_lock;
+
+ /**
+ * Protected by lr_lock, linkages to "skip lists".
+ * For more explanations of skip lists see ldlm/ldlm_inodebits.c
+ */
+ struct list_head l_sl_mode;
+ struct list_head l_sl_policy;
+
+ /** Reference tracking structure to debug leaked locks. */
+ struct lu_ref l_reference;
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+ /* Debugging stuff for bug 20498, for tracking export references. */
+ /** number of export references taken */
+ int l_exp_refs_nr;
+ /** link all locks referencing one export */
+ struct list_head l_exp_refs_link;
+ /** referenced export object */
+ struct obd_export *l_exp_refs_target;
+#endif
+ /**
+ * export blocking dlm lock list, protected by
+ * l_export->exp_bl_list_lock.
+ * Lock order of waiting_locks_spinlock, exp_bl_list_lock and res lock
+ * is: res lock -> exp_bl_list_lock -> waiting_locks_spinlock.
+ */
+ struct list_head l_exp_list;
+};
+
+enum ldlm_match_flags {
+ LDLM_MATCH_UNREF = BIT(0),
+ LDLM_MATCH_AST = BIT(1),
+ LDLM_MATCH_AST_ANY = BIT(2),
+ LDLM_MATCH_RIGHT = BIT(3),
+};
+
+/**
+ * Describe the overlap between two locks. itree_overlap_cb data.
+ */
+struct ldlm_match_data {
+ struct ldlm_lock *lmd_old;
+ struct ldlm_lock *lmd_lock;
+ enum ldlm_mode *lmd_mode;
+ union ldlm_policy_data *lmd_policy;
+ __u64 lmd_flags;
+ __u64 lmd_skip_flags;
+ enum ldlm_match_flags lmd_match;
+};
+
+/** For uncommitted cross-MDT lock, store transno this lock belongs to */
+#define l_transno l_client_cookie
+
+/** For uncommitted cross-MDT lock, which is a client lock, share with
+ * l_rk_ast, which is for the server. */
+#define l_slc_link l_rk_ast
+
+struct lustre_handle_array {
+ unsigned int ha_count;
+ /* ha_map is used as bit flag to indicate handle is remote or local */
+ DECLARE_BITMAP(ha_map, LMV_MAX_STRIPE_COUNT);
+ struct lustre_handle ha_handles[0];
+};
+
+/**
+ * LDLM resource description.
+ * Basically, resource is a representation for a single object.
+ * Object has a name which is currently 4 64-bit integers. LDLM user is
+ * responsible for creation of a mapping between objects it wants to be
+ * protected and resource names.
+ *
+ * A resource can only hold locks of a single lock type, though there may be
+ * multiple ldlm_locks on a single resource, depending on the lock type and
+ * whether the locks are conflicting or not.
+ */
+struct ldlm_resource {
+ struct ldlm_ns_bucket *lr_ns_bucket;
+
+ /**
+ * List item for list in namespace hash.
+ * protected by ns_lock.
+ * Shared with linkage for RCU-delayed free.
+ */
+ union {
+ struct hlist_node lr_hash;
+ struct rcu_head lr_rcu;
+ };
+
+ /** Reference count for this resource */
+ atomic_t lr_refcount;
+
+ /** Spinlock to protect locks under this resource. 
*/ + spinlock_t lr_lock; + + /** + * protected by lr_lock + * @{ */ + /** List of locks in granted state */ + struct list_head lr_granted; + /** + * List of locks that could not be granted due to conflicts and + * that are waiting for conflicts to go away */ + struct list_head lr_waiting; + /** @} */ + + /** Resource name */ + struct ldlm_res_id lr_name; + + union { + /** + * Interval trees (only for extent locks) for all modes of + * this resource + */ + struct ldlm_interval_tree *lr_itree; + struct ldlm_ibits_queues *lr_ibits_queues; + }; + + union { + /** + * When the resource was considered as contended, + * used only on server side. + */ + time64_t lr_contention_time; + /** + * Associated inode, used only on client side. + */ + struct inode *lr_lvb_inode; + }; + + /** Type of locks this resource can hold. Only one type per resource. */ + enum ldlm_type lr_type; /* LDLM_{PLAIN,EXTENT,FLOCK,IBITS} */ + + /** + * Server-side-only lock value block elements. + * To serialize lvbo_init. + */ + int lr_lvb_len; + struct mutex lr_lvb_mutex; + /** protected by lr_lock */ + void *lr_lvb_data; + /** is lvb initialized ? */ + bool lr_lvb_initialized; + + /** List of references to this resource. For debugging. */ + struct lu_ref lr_reference; +}; + +static inline int ldlm_is_granted(struct ldlm_lock *lock) +{ + return lock->l_req_mode == lock->l_granted_mode; +} + +static inline bool ldlm_has_layout(struct ldlm_lock *lock) +{ + return lock->l_resource->lr_type == LDLM_IBITS && + lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_LAYOUT; +} + +static inline bool ldlm_has_dom(struct ldlm_lock *lock) +{ + return lock->l_resource->lr_type == LDLM_IBITS && + lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_DOM; +} + +static inline char * +ldlm_ns_name(struct ldlm_namespace *ns) +{ + return ns->ns_name; +} + +static inline struct ldlm_namespace * +ldlm_res_to_ns(struct ldlm_resource *res) +{ + return res->lr_ns_bucket->nsb_namespace; +} + +static inline struct ldlm_namespace * +ldlm_lock_to_ns(struct ldlm_lock *lock) +{ + return ldlm_res_to_ns(lock->l_resource); +} + +static inline char * +ldlm_lock_to_ns_name(struct ldlm_lock *lock) +{ + return ldlm_ns_name(ldlm_lock_to_ns(lock)); +} + +static inline struct adaptive_timeout * +ldlm_lock_to_ns_at(struct ldlm_lock *lock) +{ + return &lock->l_resource->lr_ns_bucket->nsb_at_estimate; +} + +static inline int ldlm_lvbo_init(struct ldlm_resource *res) +{ + struct ldlm_namespace *ns = ldlm_res_to_ns(res); + int rc = 0; + + if (ns->ns_lvbo == NULL || ns->ns_lvbo->lvbo_init == NULL || + res->lr_lvb_initialized) + return 0; + + mutex_lock(&res->lr_lvb_mutex); + /* Did we lose the race? 
*/ + if (res->lr_lvb_initialized) { + mutex_unlock(&res->lr_lvb_mutex); + return 0; + } + rc = ns->ns_lvbo->lvbo_init(res); + if (rc < 0) { + CDEBUG(D_DLMTRACE, "lvbo_init failed for resource : rc = %d\n", + rc); + if (res->lr_lvb_data != NULL) { + OBD_FREE(res->lr_lvb_data, res->lr_lvb_len); + res->lr_lvb_data = NULL; + } + res->lr_lvb_len = rc; + } else { + res->lr_lvb_initialized = true; + } + mutex_unlock(&res->lr_lvb_mutex); + return rc; +} + +static inline int ldlm_lvbo_size(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + if (ns->ns_lvbo != NULL && ns->ns_lvbo->lvbo_size != NULL) + return ns->ns_lvbo->lvbo_size(lock); + + return 0; +} + +static inline int ldlm_lvbo_fill(struct ldlm_lock *lock, void *buf, int *len) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + int rc; + + if (ns->ns_lvbo != NULL) { + LASSERT(ns->ns_lvbo->lvbo_fill != NULL); + /* init lvb now if not already */ + rc = ldlm_lvbo_init(lock->l_resource); + if (rc < 0) { + CERROR("lock %p: delayed lvb init failed (rc %d)\n", + lock, rc); + return rc; + } + return ns->ns_lvbo->lvbo_fill(lock, buf, len); + } + return 0; +} + +struct ldlm_ast_work { + struct ldlm_lock *w_lock; + int w_blocking; + struct ldlm_lock_desc w_desc; + struct list_head w_list; + int w_flags; + void *w_data; + int w_datalen; +}; + +/** + * Common ldlm_enqueue parameters + */ +struct ldlm_enqueue_info { + enum ldlm_type ei_type; /** Type of the lock being enqueued. */ + enum ldlm_mode ei_mode; /** Mode of the lock being enqueued. */ + void *ei_cb_bl; /** blocking lock callback */ + void *ei_cb_local_bl; /** blocking local lock callback */ + void *ei_cb_cp; /** lock completion callback */ + void *ei_cb_gl; /** lock glimpse callback */ + ldlm_created_callback ei_cb_created; /** lock created callback */ + void *ei_cbdata; /** Data to be passed into callbacks. */ + void *ei_namespace; /** lock namespace **/ + u64 ei_inodebits; /** lock inode bits **/ + unsigned int ei_enq_slave:1; /** whether enqueue slave stripes */ + unsigned int ei_req_slot:1; /** whether acquire rpc slot */ + unsigned int ei_mod_slot:1; /** whether acquire mod rpc slot */ +}; + +#define ei_res_id ei_cb_gl + +extern char *ldlm_lockname[]; +extern char *ldlm_typename[]; +extern const char *ldlm_it2str(enum ldlm_intent_flags it); + +/** + * Just a fancy CDEBUG call with log level preset to LDLM_DEBUG. + * For the cases where we do not have actual lock to print along + * with a debugging message that is ldlm-related + */ +#define LDLM_DEBUG_NOLOCK(format, a...) \ + CDEBUG(D_DLMTRACE, "### " format "\n" , ##a) + +/** + * Support function for lock information printing into debug logs. + * \see LDLM_DEBUG + */ +#ifdef LIBCFS_DEBUG +#define ldlm_lock_debug(msgdata, mask, cdls, lock, fmt, a...) do { \ + CFS_CHECK_STACK(msgdata, mask, cdls); \ + \ + if (((mask) & D_CANTMASK) != 0 || \ + ((libcfs_debug & (mask)) != 0 && \ + (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) \ + _ldlm_lock_debug(lock, msgdata, fmt, ##a); \ +} while(0) + +void _ldlm_lock_debug(struct ldlm_lock *lock, + struct libcfs_debug_msg_data *data, + const char *fmt, ...) + __attribute__ ((format (printf, 3, 4))); + +/** + * Rate-limited version of lock printing function. + */ +#define LDLM_DEBUG_LIMIT(mask, lock, fmt, a...) do { \ + static struct cfs_debug_limit_state _ldlm_cdls; \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, &_ldlm_cdls); \ + ldlm_lock_debug(&msgdata, mask, &_ldlm_cdls, lock, "### " fmt , ##a);\ +} while (0) + +#define LDLM_ERROR(lock, fmt, a...) 
LDLM_DEBUG_LIMIT(D_ERROR, lock, fmt, ## a)
+#define LDLM_WARN(lock, fmt, a...) LDLM_DEBUG_LIMIT(D_WARNING, lock, fmt, ## a)
+
+/** Non-rate-limited lock printing function for debugging purposes. */
+#define LDLM_DEBUG(lock, fmt, a...) do { \
+ if (likely(lock != NULL)) { \
+ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_DLMTRACE, NULL); \
+ ldlm_lock_debug(&msgdata, D_DLMTRACE, NULL, lock, \
+ "### " fmt , ##a); \
+ } else { \
+ LDLM_DEBUG_NOLOCK("no dlm lock: " fmt, ##a); \
+ } \
+} while (0)
+#else /* !LIBCFS_DEBUG */
+# define LDLM_DEBUG_LIMIT(mask, lock, fmt, a...) ((void)0)
+# define LDLM_DEBUG(lock, fmt, a...) ((void)0)
+# define LDLM_ERROR(lock, fmt, a...) ((void)0)
+#endif
+
+/*
+ * Three intentions can be used for the policy functions in
+ * ldlm_processing_policy.
+ *
+ * LDLM_PROCESS_RESCAN:
+ *
+ * It's used when policy functions are called from ldlm_reprocess_queue() to
+ * reprocess the wait list and try to grant locks. Blocking ASTs
+ * have already been sent in this situation; completion ASTs need to be sent
+ * for the locks being granted.
+ *
+ * LDLM_PROCESS_ENQUEUE:
+ *
+ * It's used when policy functions are called from ldlm_lock_enqueue() to
+ * process the wait list for handling an enqueue request. Blocking
+ * ASTs have not been sent yet, so the list of conflicting locks is
+ * collected and ASTs are sent.
+ *
+ * LDLM_PROCESS_RECOVERY:
+ *
+ * It's used when policy functions are called from ldlm_reprocess_queue() to
+ * reprocess the wait list when recovery is done. In case blocking
+ * ASTs were lost before recovery, it needs not only to grant locks if
+ * available, but also to send blocking ASTs to locks that don't have the
+ * AST-sent flag. Completion ASTs need to be sent for the locks being granted.
+ */
+enum ldlm_process_intention {
+ LDLM_PROCESS_RESCAN = 0,
+ LDLM_PROCESS_ENQUEUE = 1,
+ LDLM_PROCESS_RECOVERY = 2,
+};
+
+typedef int (*ldlm_processing_policy)(struct ldlm_lock *lock, __u64 *flags,
+ enum ldlm_process_intention intention,
+ enum ldlm_error *err,
+ struct list_head *work_list);
+
+typedef int (*ldlm_reprocessing_policy)(struct ldlm_resource *res,
+ struct list_head *queue,
+ struct list_head *work_list,
+ enum ldlm_process_intention intention,
+ __u64 hint);
+
+/**
+ * Return values for lock iterators.
+ * Also used when deciding on lock grants and cancellations.
+ */
+#define LDLM_ITER_CONTINUE 1 /* keep iterating */
+#define LDLM_ITER_STOP 2 /* stop iterating */
+
+typedef int (*ldlm_iterator_t)(struct ldlm_lock *, void *);
+typedef int (*ldlm_res_iterator_t)(struct ldlm_resource *, void *);
+
+/** \defgroup ldlm_iterator Lock iterators
+ *
+ * LDLM provides for a way to iterate through every lock on a resource or
+ * namespace or every resource in a namespace. 
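+ *
+ * As an illustration (a hypothetical callback, not part of this header),
+ * an iterator that counts granted locks on a resource could look like:
+ *
+ *	static int count_granted(struct ldlm_lock *lock, void *closure)
+ *	{
+ *		if (ldlm_is_granted(lock))
+ *			(*(int *)closure)++;
+ *		return LDLM_ITER_CONTINUE;
+ *	}
+ *
+ * and would be passed as, e.g., ldlm_resource_foreach(res, count_granted,
+ * &count).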
+ * @{ */ +int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter, + void *closure); +void ldlm_namespace_foreach(struct ldlm_namespace *ns, ldlm_iterator_t iter, + void *closure); +int ldlm_resource_iterate(struct ldlm_namespace *, const struct ldlm_res_id *, + ldlm_iterator_t iter, void *data); +/** @} ldlm_iterator */ + +int ldlm_replay_locks(struct obd_import *imp); + +/* ldlm_flock.c */ +int ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data); + +/* ldlm_extent.c */ +__u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms); + +struct ldlm_prolong_args { + struct obd_export *lpa_export; + struct ldlm_res_id lpa_resid; + struct ldlm_extent lpa_extent; + enum ldlm_mode lpa_mode; + timeout_t lpa_timeout; + int lpa_locks_cnt; + int lpa_blocks_cnt; +}; +void ldlm_lock_prolong_one(struct ldlm_lock *lock, + struct ldlm_prolong_args *arg); +void ldlm_resource_prolong(struct ldlm_prolong_args *arg); + +struct ldlm_callback_suite { + ldlm_completion_callback lcs_completion; + ldlm_blocking_callback lcs_blocking; + ldlm_glimpse_callback lcs_glimpse; +}; + +/* ldlm_lockd.c */ +#ifdef HAVE_SERVER_SUPPORT +/** \defgroup ldlm_srv_ast Server AST handlers + * These are AST handlers used by server code. + * Their property is that they are just preparing RPCs to be sent to clients. + * @{ + */ +int ldlm_server_blocking_ast(struct ldlm_lock *, struct ldlm_lock_desc *, + void *data, int flag); +int ldlm_server_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data); +int ldlm_server_glimpse_ast(struct ldlm_lock *lock, void *data); +int ldlm_glimpse_locks(struct ldlm_resource *res, + struct list_head *gl_work_list); +/** @} ldlm_srv_ast */ + +/** \defgroup ldlm_handlers Server LDLM handlers + * These are handler functions that should be called by "frontends" such as + * MDT or OST to pass through LDLM requests to LDLM for handling + * @{ + */ +int ldlm_handle_enqueue0(struct ldlm_namespace *ns, struct ptlrpc_request *req, + const struct ldlm_request *dlm_req, + const struct ldlm_callback_suite *cbs); +int ldlm_handle_convert0(struct ptlrpc_request *req, + const struct ldlm_request *dlm_req); +int ldlm_handle_cancel(struct ptlrpc_request *req); +int ldlm_request_cancel(struct ptlrpc_request *req, + const struct ldlm_request *dlm_req, + int first, enum lustre_at_flags flags); +/** @} ldlm_handlers */ + +void ldlm_revoke_export_locks(struct obd_export *exp); +timeout_t ldlm_bl_timeout(struct ldlm_lock *lock); +#endif +int ldlm_del_waiting_lock(struct ldlm_lock *lock); +int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, timeout_t timeout); +int ldlm_get_ref(void); +void ldlm_put_ref(void); +int ldlm_init_export(struct obd_export *exp); +void ldlm_destroy_export(struct obd_export *exp); +struct ldlm_lock *ldlm_request_lock(struct ptlrpc_request *req); + +/* ldlm_lock.c */ +#ifdef HAVE_SERVER_SUPPORT +ldlm_processing_policy ldlm_get_processing_policy(struct ldlm_resource *res); +ldlm_reprocessing_policy +ldlm_get_reprocessing_policy(struct ldlm_resource *res); +#endif +void ldlm_register_intent(struct ldlm_namespace *ns, ldlm_res_policy arg); +void ldlm_lock2handle(const struct ldlm_lock *lock, + struct lustre_handle *lockh); +struct ldlm_lock *__ldlm_handle2lock(const struct lustre_handle *, __u64 flags); +void ldlm_cancel_callback(struct ldlm_lock *); +int ldlm_lock_remove_from_lru(struct ldlm_lock *); +int ldlm_lock_set_data(const struct lustre_handle *lockh, void *data); + +/** + * Obtain a lock reference by its handle. 
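+ *
+ * A minimal usage sketch (note the lookup can fail if the lock has since
+ * gone away):
+ *
+ *	struct ldlm_lock *lock = ldlm_handle2lock(lockh);
+ *
+ *	if (lock != NULL) {
+ *		... use lock ...
+ *		LDLM_LOCK_PUT(lock);
+ *	}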
+ */ +static inline struct ldlm_lock *ldlm_handle2lock(const struct lustre_handle *h) +{ + return __ldlm_handle2lock(h, 0); +} + +#define LDLM_LOCK_REF_DEL(lock) \ + lu_ref_del(&lock->l_reference, "handle", lock) + +static inline struct ldlm_lock * +ldlm_handle2lock_long(const struct lustre_handle *h, __u64 flags) +{ + struct ldlm_lock *lock; + + lock = __ldlm_handle2lock(h, flags); + if (lock != NULL) + LDLM_LOCK_REF_DEL(lock); + return lock; +} + +/** + * Update Lock Value Block Operations (LVBO) on a resource taking into account + * data from request \a r + */ +static inline int ldlm_lvbo_update(struct ldlm_resource *res, + struct ldlm_lock *lock, + struct ptlrpc_request *req, int increase) +{ + struct ldlm_namespace *ns = ldlm_res_to_ns(res); + int rc; + + /* delayed lvb init may be required */ + rc = ldlm_lvbo_init(res); + if (rc < 0) { + CERROR("delayed lvb init failed (rc %d)\n", rc); + return rc; + } + + if (ns->ns_lvbo && ns->ns_lvbo->lvbo_update) + return ns->ns_lvbo->lvbo_update(res, lock, req, increase); + + return 0; +} + +static inline int ldlm_res_lvbo_update(struct ldlm_resource *res, + struct ptlrpc_request *req, + int increase) +{ + return ldlm_lvbo_update(res, NULL, req, increase); +} + +int is_granted_or_cancelled_nolock(struct ldlm_lock *lock); + +int ldlm_error2errno(enum ldlm_error error); +enum ldlm_error ldlm_errno2error(int err_no); /* don't call it `errno': this + * confuses user-space. */ +#if LUSTRE_TRACKS_LOCK_EXP_REFS +void ldlm_dump_export_locks(struct obd_export *exp); +#endif + +/** + * Release a temporary lock reference obtained by ldlm_handle2lock() or + * __ldlm_handle2lock(). + */ +#define LDLM_LOCK_PUT(lock) \ +do { \ + LDLM_LOCK_REF_DEL(lock); \ + /*LDLM_DEBUG((lock), "put");*/ \ + ldlm_lock_put(lock); \ +} while (0) + +/** + * Release a lock reference obtained by some other means (see + * LDLM_LOCK_PUT()). 
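+ *
+ * Typically paired with LDLM_LOCK_GET(), e.g. (illustrative only):
+ *
+ *	lock = LDLM_LOCK_GET(lock);
+ *	... hand the reference over to another context ...
+ *	LDLM_LOCK_RELEASE(lock);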
+ */ +#define LDLM_LOCK_RELEASE(lock) \ +do { \ + /*LDLM_DEBUG((lock), "put");*/ \ + ldlm_lock_put(lock); \ +} while (0) + +#define LDLM_LOCK_GET(lock) \ +({ \ + ldlm_lock_get(lock); \ + /*LDLM_DEBUG((lock), "get");*/ \ + lock; \ +}) + +#define ldlm_lock_list_put(head, member, count) \ +({ \ + struct ldlm_lock *_lock, *_next; \ + int c = count; \ + list_for_each_entry_safe(_lock, _next, head, member) { \ + if (c-- == 0) \ + break; \ + list_del_init(&_lock->member); \ + LDLM_LOCK_RELEASE(_lock); \ + } \ + LASSERT(c <= 0); \ +}) + +struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock); +void ldlm_lock_put(struct ldlm_lock *lock); +void ldlm_lock_destroy(struct ldlm_lock *lock); +void ldlm_lock2desc(struct ldlm_lock *lock, struct ldlm_lock_desc *desc); +void ldlm_lock_addref(const struct lustre_handle *lockh, enum ldlm_mode mode); +int ldlm_lock_addref_try(const struct lustre_handle *lockh, + enum ldlm_mode mode); +void ldlm_lock_decref(const struct lustre_handle *lockh, enum ldlm_mode mode); +void ldlm_lock_decref_and_cancel(const struct lustre_handle *lockh, + enum ldlm_mode mode); +void ldlm_lock_fail_match_locked(struct ldlm_lock *lock); +void ldlm_lock_fail_match(struct ldlm_lock *lock); +void ldlm_lock_allow_match(struct ldlm_lock *lock); +void ldlm_lock_allow_match_locked(struct ldlm_lock *lock); + +enum ldlm_mode ldlm_lock_match_with_skip(struct ldlm_namespace *ns, + __u64 flags, __u64 skip_flags, + const struct ldlm_res_id *res_id, + enum ldlm_type type, + union ldlm_policy_data *policy, + enum ldlm_mode mode, + struct lustre_handle *lh, + enum ldlm_match_flags match_flags); +static inline enum ldlm_mode ldlm_lock_match(struct ldlm_namespace *ns, + __u64 flags, + const struct ldlm_res_id *res_id, + enum ldlm_type type, + union ldlm_policy_data *policy, + enum ldlm_mode mode, + struct lustre_handle *lh) +{ + return ldlm_lock_match_with_skip(ns, flags, 0, res_id, type, policy, + mode, lh, 0); +} +struct ldlm_lock *search_itree(struct ldlm_resource *res, + struct ldlm_match_data *data); +enum ldlm_mode ldlm_revalidate_lock_handle(const struct lustre_handle *lockh, + __u64 *bits); +void ldlm_lock_mode_downgrade(struct ldlm_lock *lock, enum ldlm_mode new_mode); +void ldlm_lock_cancel(struct ldlm_lock *lock); +void ldlm_reprocess_all(struct ldlm_resource *res, __u64 hint); +void ldlm_reprocess_recovery_done(struct ldlm_namespace *ns); +void ldlm_lock_dump_handle(int level, const struct lustre_handle *lockh); +void ldlm_unlink_lock_skiplist(struct ldlm_lock *req); + +/* resource.c */ +struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name, + enum ldlm_side client, + enum ldlm_appetite apt, + enum ldlm_ns_type ns_type); +int ldlm_namespace_cleanup(struct ldlm_namespace *ns, __u64 flags); +void ldlm_namespace_free_prior(struct ldlm_namespace *ns, + struct obd_import *imp, + int force); +void ldlm_namespace_free_post(struct ldlm_namespace *ns); +void ldlm_namespace_free(struct ldlm_namespace *ns, + struct obd_import *imp, int force); +void ldlm_namespace_register(struct ldlm_namespace *ns, enum ldlm_side client); +void ldlm_namespace_unregister(struct ldlm_namespace *ns, + enum ldlm_side client); +void ldlm_namespace_get(struct ldlm_namespace *ns); +void ldlm_namespace_put(struct ldlm_namespace *ns); + +int ldlm_debugfs_setup(void); +void ldlm_debugfs_cleanup(void); + +static inline void ldlm_svc_get_eopc(const struct ldlm_request *dlm_req, + struct lprocfs_stats *srv_stats) +{ + int lock_type = 0, op = 0; + + lock_type = dlm_req->lock_desc.l_resource.lr_type; + + 
switch (lock_type) { + case LDLM_PLAIN: + op = PTLRPC_LAST_CNTR + LDLM_PLAIN_ENQUEUE; + break; + case LDLM_EXTENT: + op = PTLRPC_LAST_CNTR + LDLM_EXTENT_ENQUEUE; + break; + case LDLM_FLOCK: + op = PTLRPC_LAST_CNTR + LDLM_FLOCK_ENQUEUE; + break; + case LDLM_IBITS: + op = PTLRPC_LAST_CNTR + LDLM_IBITS_ENQUEUE; + break; + default: + op = 0; + break; + } + + if (op != 0) + lprocfs_counter_incr(srv_stats, op); +} + +/* resource.c - internal */ +struct ldlm_resource *ldlm_resource_get(struct ldlm_namespace *ns, + struct ldlm_resource *parent, + const struct ldlm_res_id *, + enum ldlm_type type, int create); +struct ldlm_resource *ldlm_resource_getref(struct ldlm_resource *res); +int ldlm_resource_putref(struct ldlm_resource *res); +void ldlm_resource_add_lock(struct ldlm_resource *res, + struct list_head *head, + struct ldlm_lock *lock); +void ldlm_resource_unlink_lock(struct ldlm_lock *lock); +void ldlm_res2desc(struct ldlm_resource *res, struct ldlm_resource_desc *desc); +void ldlm_dump_all_namespaces(enum ldlm_side client, int level); +void ldlm_namespace_dump(int level, struct ldlm_namespace *); +void ldlm_resource_dump(int level, struct ldlm_resource *); +int ldlm_lock_change_resource(struct ldlm_namespace *, struct ldlm_lock *, + const struct ldlm_res_id *); + +#define LDLM_RESOURCE_ADDREF(res) do { \ + lu_ref_add_atomic(&(res)->lr_reference, __FUNCTION__, current); \ +} while (0) + +#define LDLM_RESOURCE_DELREF(res) do { \ + lu_ref_del(&(res)->lr_reference, __FUNCTION__, current); \ +} while (0) + +/* ldlm_request.c */ +/** \defgroup ldlm_local_ast Default AST handlers for local locks + * These AST handlers are typically used for server-side local locks and are + * also used by client-side lock handlers to perform minimum level base + * processing. + * @{ */ +int ldlm_blocking_ast_nocheck(struct ldlm_lock *lock); +int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag); +int ldlm_glimpse_ast(struct ldlm_lock *lock, void *reqp); +int ldlm_completion_ast_async(struct ldlm_lock *lock, __u64 flags, void *data); +int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data); +/** @} ldlm_local_ast */ + +/** \defgroup ldlm_cli_api API to operate on locks from actual LDLM users. + * These are typically used by client and server (*_local versions) + * to obtain and release locks. 
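+ *
+ * A rough client-side sequence (illustrative only; a real caller fills in
+ * einfo, policy and flags appropriate to its lock type and checks errors):
+ *
+ *	rc = ldlm_cli_enqueue(exp, &req, &einfo, &res_id, &policy, &flags,
+ *			      lvb, lvb_len, LVB_T_NONE, &lockh, 0);
+ *	...
+ *	rc = ldlm_cli_cancel(&lockh, 0);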
+ * @{ */ +int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, + struct ldlm_enqueue_info *einfo, + const struct ldlm_res_id *res_id, + union ldlm_policy_data const *policy, __u64 *flags, + void *lvb, __u32 lvb_len, enum lvb_type lvb_type, + struct lustre_handle *lockh, int async); +int ldlm_prep_enqueue_req(struct obd_export *exp, + struct ptlrpc_request *req, + struct list_head *cancels, + int count); +int ldlm_prep_elc_req(struct obd_export *exp, struct ptlrpc_request *req, + int version, int opc, int canceloff, + struct list_head *cancels, int count); + +struct ptlrpc_request *ldlm_enqueue_pack(struct obd_export *exp, int lvb_len); +int ldlm_handle_enqueue0(struct ldlm_namespace *ns, struct ptlrpc_request *req, + const struct ldlm_request *dlm_req, + const struct ldlm_callback_suite *cbs); +int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req, + struct ldlm_enqueue_info *einfo, __u8 with_policy, + __u64 *flags, void *lvb, __u32 lvb_len, + const struct lustre_handle *lockh, int rc, + bool request_slot); +int ldlm_cli_enqueue_local(const struct lu_env *env, + struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + enum ldlm_type type, union ldlm_policy_data *policy, + enum ldlm_mode mode, __u64 *flags, + ldlm_blocking_callback blocking, + ldlm_completion_callback completion, + ldlm_glimpse_callback glimpse, + void *data, __u32 lvb_len, enum lvb_type lvb_type, + const __u64 *client_cookie, + struct lustre_handle *lockh); +int ldlm_cli_convert_req(struct ldlm_lock *lock, __u32 *flags, __u64 new_bits); +int ldlm_cli_convert(struct ldlm_lock *lock, + enum ldlm_cancel_flags cancel_flags); +int ldlm_cli_update_pool(struct ptlrpc_request *req); +int ldlm_cli_cancel(const struct lustre_handle *lockh, + enum ldlm_cancel_flags cancel_flags); +int ldlm_cli_cancel_unused(struct ldlm_namespace *, const struct ldlm_res_id *, + enum ldlm_cancel_flags flags, void *opaque); +int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + union ldlm_policy_data *policy, + enum ldlm_mode mode, + enum ldlm_cancel_flags flags, void *opaque); +int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *head, + int count, enum ldlm_cancel_flags flags); +int ldlm_cancel_resource_local(struct ldlm_resource *res, + struct list_head *cancels, + union ldlm_policy_data *policy, + enum ldlm_mode mode, __u64 lock_flags, + enum ldlm_cancel_flags cancel_flags, + void *opaque); +int ldlm_cli_cancel_list_local(struct list_head *cancels, int count, + enum ldlm_cancel_flags flags); +int ldlm_cli_cancel_list(struct list_head *head, int count, + struct ptlrpc_request *req, + enum ldlm_cancel_flags flags); + +int ldlm_inodebits_drop(struct ldlm_lock *lock, __u64 to_drop); +int ldlm_cli_inodebits_convert(struct ldlm_lock *lock, + enum ldlm_cancel_flags cancel_flags); + +/** @} ldlm_cli_api */ + +extern unsigned int ldlm_enqueue_min; + +/* mds/handler.c */ +/* This has to be here because recursive inclusion sucks. */ +int intent_disposition(struct ldlm_reply *rep, int flag); +void intent_set_disposition(struct ldlm_reply *rep, int flag); + +/** + * "Modes" of acquiring lock_res, necessary to tell lockdep that taking more + * than one lock_res is dead-lock safe. + */ +enum lock_res_type { + LRT_NORMAL, + LRT_NEW +}; + +/** Lock resource. */ +static inline void lock_res(struct ldlm_resource *res) +{ + spin_lock(&res->lr_lock); +} + +/** Lock resource with a way to instruct lockdep code about nestedness-safe. 
*/
+static inline void lock_res_nested(struct ldlm_resource *res,
+ enum lock_res_type mode)
+{
+ spin_lock_nested(&res->lr_lock, mode);
+}
+
+/** Unlock resource. */
+static inline void unlock_res(struct ldlm_resource *res)
+{
+ spin_unlock(&res->lr_lock);
+}
+
+/** Check if resource is already locked, assert if not. */
+static inline void check_res_locked(struct ldlm_resource *res)
+{
+ assert_spin_locked(&res->lr_lock);
+}
+
+struct ldlm_resource * lock_res_and_lock(struct ldlm_lock *lock);
+void unlock_res_and_lock(struct ldlm_lock *lock);
+
+/* ldlm_pool.c */
+/** \defgroup ldlm_pools Various LDLM pool related functions
+ * These are not used outside of ldlm.
+ * @{
+ */
+int ldlm_pools_init(void);
+void ldlm_pools_fini(void);
+
+int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns,
+ int idx, enum ldlm_side client);
+int ldlm_pool_shrink(struct ldlm_pool *pl, int nr, gfp_t gfp_mask);
+void ldlm_pool_fini(struct ldlm_pool *pl);
+int ldlm_pool_setup(struct ldlm_pool *pl, int limit);
+time64_t ldlm_pool_recalc(struct ldlm_pool *pl, bool force);
+__u32 ldlm_pool_get_lvf(struct ldlm_pool *pl);
+__u64 ldlm_pool_get_slv(struct ldlm_pool *pl);
+__u64 ldlm_pool_get_clv(struct ldlm_pool *pl);
+__u32 ldlm_pool_get_limit(struct ldlm_pool *pl);
+void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv);
+void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv);
+void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit);
+void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock);
+void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock);
+/** @} */
+
+static inline int ldlm_extent_overlap(const struct ldlm_extent *ex1,
+ const struct ldlm_extent *ex2)
+{
+ return ex1->start <= ex2->end && ex2->start <= ex1->end;
+}
+
+/* check if @ex1 contains @ex2 */
+static inline int ldlm_extent_contain(const struct ldlm_extent *ex1,
+ const struct ldlm_extent *ex2)
+{
+ return ex1->start <= ex2->start && ex1->end >= ex2->end;
+}
+
+int ldlm_inodebits_drop(struct ldlm_lock *lock, __u64 to_drop);
+
+#endif
+/** @} LDLM */
diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_dlm_flags.h b/drivers/staging/lustrefsx/lustre/include/lustre_dlm_flags.h
new file mode 100644
index 0000000000000..1fa049de2f567
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/include/lustre_dlm_flags.h
@@ -0,0 +1,444 @@
+/* -*- buffer-read-only: t -*- vi: set ro:
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+/**
+ * \file lustre_dlm_flags.h
+ * The flags and collections of flags (masks) for \see struct ldlm_lock.
+ *
+ * \addtogroup LDLM Lustre Distributed Lock Manager
+ * @{
+ *
+ * \name flags
+ * The flags and collections of flags (masks) for \see struct ldlm_lock. 
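+ *
+ * Each flag LDLM_FL_FOO below typically comes with ldlm_is_foo(),
+ * ldlm_set_foo() and ldlm_clear_foo() accessors, built on the
+ * LDLM_TEST_FLAG()/LDLM_SET_FLAG()/LDLM_CLEAR_FLAG() macros defined at
+ * the end of this file; e.g. ldlm_is_ast_sent(lock) tests bit 5 of
+ * lock->l_flags.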
+ * @{
+ */
+#ifndef LDLM_ALL_FLAGS_MASK
+
+/** l_flags bits marked as "all_flags" bits */
+#define LDLM_FL_ALL_FLAGS_MASK 0x00FFFFFFC28F932FULL
+
+/** extent, mode, or resource changed */
+#define LDLM_FL_LOCK_CHANGED 0x0000000000000001ULL // bit 0
+#define ldlm_is_lock_changed(_l) LDLM_TEST_FLAG(( _l), 1ULL << 0)
+#define ldlm_set_lock_changed(_l) LDLM_SET_FLAG(( _l), 1ULL << 0)
+#define ldlm_clear_lock_changed(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 0)
+
+/**
+ * Server placed lock on granted list, or a recovering client wants the
+ * lock added to the granted list, no questions asked. */
+#define LDLM_FL_BLOCK_GRANTED 0x0000000000000002ULL // bit 1
+#define ldlm_is_block_granted(_l) LDLM_TEST_FLAG(( _l), 1ULL << 1)
+#define ldlm_set_block_granted(_l) LDLM_SET_FLAG(( _l), 1ULL << 1)
+#define ldlm_clear_block_granted(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 1)
+
+/**
+ * Server placed lock on conv list, or a recovering client wants the lock
+ * added to the conv list, no questions asked. (obsoleted) */
+#define LDLM_FL_BLOCK_CONV 0x0000000000000004ULL // bit 2
+#define ldlm_is_block_conv(_l) LDLM_TEST_FLAG(( _l), 1ULL << 2)
+#define ldlm_set_block_conv(_l) LDLM_SET_FLAG(( _l), 1ULL << 2)
+#define ldlm_clear_block_conv(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 2)
+
+/**
+ * Server placed lock on wait list, or a recovering client wants the lock
+ * added to the wait list, no questions asked. */
+#define LDLM_FL_BLOCK_WAIT 0x0000000000000008ULL // bit 3
+#define ldlm_is_block_wait(_l) LDLM_TEST_FLAG(( _l), 1ULL << 3)
+#define ldlm_set_block_wait(_l) LDLM_SET_FLAG(( _l), 1ULL << 3)
+#define ldlm_clear_block_wait(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 3)
+
+/**
+ * Lock request is speculative/asynchronous, and cannot wait for any reason.
+ * Fail the lock request if any blocking locks are encountered.
+ */
+#define LDLM_FL_SPECULATIVE 0x0000000000000010ULL /* bit 4 */
+#define ldlm_is_speculative(_l) LDLM_TEST_FLAG((_l), 1ULL << 4)
+#define ldlm_set_speculative(_l) LDLM_SET_FLAG((_l), 1ULL << 4)
+#define ldlm_clear_speculative(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 4)
+
+/** blocking or cancel packet was queued for sending. */
+#define LDLM_FL_AST_SENT 0x0000000000000020ULL // bit 5
+#define ldlm_is_ast_sent(_l) LDLM_TEST_FLAG(( _l), 1ULL << 5)
+#define ldlm_set_ast_sent(_l) LDLM_SET_FLAG(( _l), 1ULL << 5)
+#define ldlm_clear_ast_sent(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 5)
+
+/**
+ * Lock is being replayed. This could probably be implied by the fact that
+ * one of BLOCK_{GRANTED,CONV,WAIT} is set, but that is pretty dangerous. */
+#define LDLM_FL_REPLAY 0x0000000000000100ULL // bit 8
+#define ldlm_is_replay(_l) LDLM_TEST_FLAG(( _l), 1ULL << 8)
+#define ldlm_set_replay(_l) LDLM_SET_FLAG(( _l), 1ULL << 8)
+#define ldlm_clear_replay(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 8)
+
+/** Don't grant lock, just do intent. 
*/
+#define LDLM_FL_INTENT_ONLY 0x0000000000000200ULL // bit 9
+#define ldlm_is_intent_only(_l) LDLM_TEST_FLAG(( _l), 1ULL << 9)
+#define ldlm_set_intent_only(_l) LDLM_SET_FLAG(( _l), 1ULL << 9)
+#define ldlm_clear_intent_only(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 9)
+
+/** lock request has intent */
+#define LDLM_FL_HAS_INTENT 0x0000000000001000ULL // bit 12
+#define ldlm_is_has_intent(_l) LDLM_TEST_FLAG(( _l), 1ULL << 12)
+#define ldlm_set_has_intent(_l) LDLM_SET_FLAG(( _l), 1ULL << 12)
+#define ldlm_clear_has_intent(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 12)
+
+/** flock deadlock detected */
+#define LDLM_FL_FLOCK_DEADLOCK 0x0000000000008000ULL // bit 15
+#define ldlm_is_flock_deadlock(_l) LDLM_TEST_FLAG(( _l), 1ULL << 15)
+#define ldlm_set_flock_deadlock(_l) LDLM_SET_FLAG(( _l), 1ULL << 15)
+#define ldlm_clear_flock_deadlock(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 15)
+
+/** discard (no writeback (PW locks) or page retention (PR locks)) on cancel */
+#define LDLM_FL_DISCARD_DATA 0x0000000000010000ULL // bit 16
+#define ldlm_is_discard_data(_l) LDLM_TEST_FLAG(( _l), 1ULL << 16)
+#define ldlm_set_discard_data(_l) LDLM_SET_FLAG(( _l), 1ULL << 16)
+#define ldlm_clear_discard_data(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 16)
+
+/** Blocked by group lock - wait indefinitely */
+#define LDLM_FL_NO_TIMEOUT 0x0000000000020000ULL // bit 17
+#define ldlm_is_no_timeout(_l) LDLM_TEST_FLAG(( _l), 1ULL << 17)
+#define ldlm_set_no_timeout(_l) LDLM_SET_FLAG(( _l), 1ULL << 17)
+#define ldlm_clear_no_timeout(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 17)
+
+/**
+ * Server told not to wait if blocked. For AGL, OST will not send glimpse
+ * callback. */
+#define LDLM_FL_BLOCK_NOWAIT 0x0000000000040000ULL // bit 18
+#define ldlm_is_block_nowait(_l) LDLM_TEST_FLAG(( _l), 1ULL << 18)
+#define ldlm_set_block_nowait(_l) LDLM_SET_FLAG(( _l), 1ULL << 18)
+#define ldlm_clear_block_nowait(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 18)
+
+/** return blocking lock */
+#define LDLM_FL_TEST_LOCK 0x0000000000080000ULL // bit 19
+#define ldlm_is_test_lock(_l) LDLM_TEST_FLAG(( _l), 1ULL << 19)
+#define ldlm_set_test_lock(_l) LDLM_SET_FLAG(( _l), 1ULL << 19)
+#define ldlm_clear_test_lock(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 19)
+
+/** match lock only */
+#define LDLM_FL_MATCH_LOCK 0x0000000000100000ULL // bit 20
+
+/**
+ * Immediately cancel such locks when they block some other locks. Send
+ * cancel notification to original lock holder, but expect no reply. This
+ * is for clients (like liblustre) that cannot be expected to reliably
+ * respond to blocking ASTs. */
+#define LDLM_FL_CANCEL_ON_BLOCK 0x0000000000800000ULL // bit 23
+#define ldlm_is_cancel_on_block(_l) LDLM_TEST_FLAG(( _l), 1ULL << 23)
+#define ldlm_set_cancel_on_block(_l) LDLM_SET_FLAG(( _l), 1ULL << 23)
+#define ldlm_clear_cancel_on_block(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 23)
+
+/** Flag whether a lock is enqueued from a distributed transaction, and the
+ * requested lock mode is PW/EX. If so, it will check compatibility with COS
+ * locks, and, differing from the original COS semantics, transactions from
+ * the same client are also treated as a lock conflict. */
+#define LDLM_FL_COS_INCOMPAT 0x0000000001000000ULL /* bit 24 */
+#define ldlm_is_cos_incompat(_l) LDLM_TEST_FLAG((_l), 1ULL << 24)
+#define ldlm_set_cos_incompat(_l) LDLM_SET_FLAG((_l), 1ULL << 24)
+#define ldlm_clear_cos_incompat(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 24)
+
+/*
+ * Flag indicates that lock is being converted (downgraded) during the blocking
+ * AST instead of cancelling. 
Used for IBITS locks now; it drops conflicting
+ * bits only, keeping the others.
+ */
+#define LDLM_FL_CONVERTING 0x0000000002000000ULL /* bit 25 */
+#define ldlm_is_converting(_l) LDLM_TEST_FLAG((_l), 1ULL << 25)
+#define ldlm_set_converting(_l) LDLM_SET_FLAG((_l), 1ULL << 25)
+#define ldlm_clear_converting(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 25)
+
+/**
+ * Do not expand this lock. Grant it only on the extent requested.
+ * Used for manually requested locks from the client (LU_LADVISE_LOCKAHEAD).
+ */
+#define LDLM_FL_NO_EXPANSION 0x0000000020000000ULL /* bit 29 */
+#define ldlm_is_do_not_expand(_l) LDLM_TEST_FLAG((_l), 1ULL << 29)
+#define ldlm_set_do_not_expand(_l) LDLM_SET_FLAG((_l), 1ULL << 29)
+#define ldlm_clear_do_not_expand(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 29)
+
+/**
+ * measure lock contention and return -EUSERS if locking contention is high */
+#define LDLM_FL_DENY_ON_CONTENTION 0x0000000040000000ULL // bit 30
+#define ldlm_is_deny_on_contention(_l) LDLM_TEST_FLAG(( _l), 1ULL << 30)
+#define ldlm_set_deny_on_contention(_l) LDLM_SET_FLAG(( _l), 1ULL << 30)
+#define ldlm_clear_deny_on_contention(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 30)
+
+/**
+ * These are flags that are mapped into the flags and ASTs of blocking
+ * locks. Add FL_DISCARD to blocking ASTs. */
+#define LDLM_FL_AST_DISCARD_DATA 0x0000000080000000ULL // bit 31
+#define ldlm_is_ast_discard_data(_l) LDLM_TEST_FLAG(( _l), 1ULL << 31)
+#define ldlm_set_ast_discard_data(_l) LDLM_SET_FLAG(( _l), 1ULL << 31)
+#define ldlm_clear_ast_discard_data(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 31)
+
+/**
+ * Used for marking lock as a target for -EINTR while cp_ast sleep emulation
+ * + race with upcoming bl_ast. */
+#define LDLM_FL_FAIL_LOC 0x0000000100000000ULL // bit 32
+#define ldlm_is_fail_loc(_l) LDLM_TEST_FLAG(( _l), 1ULL << 32)
+#define ldlm_set_fail_loc(_l) LDLM_SET_FLAG(( _l), 1ULL << 32)
+#define ldlm_clear_fail_loc(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 32)
+
+/** this lock is being destroyed */
+#define LDLM_FL_CBPENDING 0x0000000400000000ULL // bit 34
+#define ldlm_is_cbpending(_l) LDLM_TEST_FLAG(( _l), 1ULL << 34)
+#define ldlm_set_cbpending(_l) LDLM_SET_FLAG(( _l), 1ULL << 34)
+#define ldlm_clear_cbpending(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 34)
+
+/** not a real flag, not saved in lock */
+#define LDLM_FL_WAIT_NOREPROC 0x0000000800000000ULL // bit 35
+#define ldlm_is_wait_noreproc(_l) LDLM_TEST_FLAG(( _l), 1ULL << 35)
+#define ldlm_set_wait_noreproc(_l) LDLM_SET_FLAG(( _l), 1ULL << 35)
+#define ldlm_clear_wait_noreproc(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 35)
+
+/** cancellation callback already run */
+#define LDLM_FL_CANCEL 0x0000001000000000ULL // bit 36
+#define ldlm_is_cancel(_l) LDLM_TEST_FLAG(( _l), 1ULL << 36)
+#define ldlm_set_cancel(_l) LDLM_SET_FLAG(( _l), 1ULL << 36)
+#define ldlm_clear_cancel(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 36)
+
+/** whatever it might mean -- never transmitted? 
*/ +#define LDLM_FL_LOCAL_ONLY 0x0000002000000000ULL // bit 37 +#define ldlm_is_local_only(_l) LDLM_TEST_FLAG(( _l), 1ULL << 37) +#define ldlm_set_local_only(_l) LDLM_SET_FLAG(( _l), 1ULL << 37) +#define ldlm_clear_local_only(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 37) + +/** don't run the cancel callback under ldlm_cli_cancel_unused */ +#define LDLM_FL_FAILED 0x0000004000000000ULL // bit 38 +#define ldlm_is_failed(_l) LDLM_TEST_FLAG(( _l), 1ULL << 38) +#define ldlm_set_failed(_l) LDLM_SET_FLAG(( _l), 1ULL << 38) +#define ldlm_clear_failed(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 38) + +/** lock cancel has already been sent */ +#define LDLM_FL_CANCELING 0x0000008000000000ULL // bit 39 +#define ldlm_is_canceling(_l) LDLM_TEST_FLAG(( _l), 1ULL << 39) +#define ldlm_set_canceling(_l) LDLM_SET_FLAG(( _l), 1ULL << 39) +#define ldlm_clear_canceling(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 39) + +/** local lock (ie, no srv/cli split) */ +#define LDLM_FL_LOCAL 0x0000010000000000ULL // bit 40 +#define ldlm_is_local(_l) LDLM_TEST_FLAG(( _l), 1ULL << 40) +#define ldlm_set_local(_l) LDLM_SET_FLAG(( _l), 1ULL << 40) +#define ldlm_clear_local(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 40) + +/** + * XXX FIXME: This is being added to b_size as a low-risk fix to the + * fact that the LVB filling happens _after_ the lock has been granted, + * so another thread can match it before the LVB has been updated. As a + * dirty hack, we set LDLM_FL_LVB_READY only after we've done the LVB poop. + * this is only needed on LOV/OSC now, where LVB is actually used and + * callers must set it in input flags. + * + * The proper fix is to do the granting inside of the completion AST, + * which can be replaced with a LVB-aware wrapping function for OSC locks. + * That change is pretty high-risk, though, and would need a lot more + * testing. */ +#define LDLM_FL_LVB_READY 0x0000020000000000ULL // bit 41 +#define ldlm_is_lvb_ready(_l) LDLM_TEST_FLAG(( _l), 1ULL << 41) +#define ldlm_set_lvb_ready(_l) LDLM_SET_FLAG(( _l), 1ULL << 41) +#define ldlm_clear_lvb_ready(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 41) + +/** + * A lock contributes to the known minimum size (KMS) calculation until it + * has finished the part of its cancelation that performs write back on its + * dirty pages. It can remain on the granted list during this whole time. + * Threads racing to update the KMS after performing their writeback need + * to know to exclude each other's locks from the calculation as they walk + * the granted list. */ +#define LDLM_FL_KMS_IGNORE 0x0000040000000000ULL // bit 42 +#define ldlm_is_kms_ignore(_l) LDLM_TEST_FLAG(( _l), 1ULL << 42) +#define ldlm_set_kms_ignore(_l) LDLM_SET_FLAG(( _l), 1ULL << 42) +#define ldlm_clear_kms_ignore(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 42) + +/** completion AST to be executed */ +#define LDLM_FL_CP_REQD 0x0000080000000000ULL // bit 43 +#define ldlm_is_cp_reqd(_l) LDLM_TEST_FLAG(( _l), 1ULL << 43) +#define ldlm_set_cp_reqd(_l) LDLM_SET_FLAG(( _l), 1ULL << 43) +#define ldlm_clear_cp_reqd(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 43) + +/** cleanup_resource has already handled the lock */ +#define LDLM_FL_CLEANED 0x0000100000000000ULL // bit 44 +#define ldlm_is_cleaned(_l) LDLM_TEST_FLAG(( _l), 1ULL << 44) +#define ldlm_set_cleaned(_l) LDLM_SET_FLAG(( _l), 1ULL << 44) +#define ldlm_clear_cleaned(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 44) + +/** + * optimization hint: LDLM can run blocking callback from current context + * w/o involving separate thread. 
in order to decrease cs rate */ +#define LDLM_FL_ATOMIC_CB 0x0000200000000000ULL // bit 45 +#define ldlm_is_atomic_cb(_l) LDLM_TEST_FLAG(( _l), 1ULL << 45) +#define ldlm_set_atomic_cb(_l) LDLM_SET_FLAG(( _l), 1ULL << 45) +#define ldlm_clear_atomic_cb(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 45) + +/** + * It may happen that a client initiates two operations, e.g. unlink and + * mkdir, such that the server sends a blocking AST for conflicting locks + * to this client for the first operation, whereas the second operation + * has canceled this lock and is waiting for rpc_lock which is taken by + * the first operation. LDLM_FL_BL_AST is set by ldlm_callback_handler() in + * the lock to prevent the Early Lock Cancel (ELC) code from cancelling it. */ +#define LDLM_FL_BL_AST 0x0000400000000000ULL // bit 46 +#define ldlm_is_bl_ast(_l) LDLM_TEST_FLAG(( _l), 1ULL << 46) +#define ldlm_set_bl_ast(_l) LDLM_SET_FLAG(( _l), 1ULL << 46) +#define ldlm_clear_bl_ast(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 46) + +/** + * Set by ldlm_cancel_callback() when lock cache is dropped to let + * ldlm_callback_handler() return EINVAL to the server. It is used when + * ELC RPC is already prepared and is waiting for rpc_lock, too late to + * send a separate CANCEL RPC. */ +#define LDLM_FL_BL_DONE 0x0000800000000000ULL // bit 47 +#define ldlm_is_bl_done(_l) LDLM_TEST_FLAG(( _l), 1ULL << 47) +#define ldlm_set_bl_done(_l) LDLM_SET_FLAG(( _l), 1ULL << 47) +#define ldlm_clear_bl_done(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 47) + +/** + * Don't put lock into the LRU list, so that it is not canceled due + * to aging. Used by MGC locks, they are cancelled only at unmount or + * by callback. */ +#define LDLM_FL_NO_LRU 0x0001000000000000ULL // bit 48 +#define ldlm_is_no_lru(_l) LDLM_TEST_FLAG(( _l), 1ULL << 48) +#define ldlm_set_no_lru(_l) LDLM_SET_FLAG(( _l), 1ULL << 48) +#define ldlm_clear_no_lru(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 48) + +/** + * Set for locks that failed and where the server has been notified. + * + * Protected by lock and resource locks. */ +#define LDLM_FL_FAIL_NOTIFIED 0x0002000000000000ULL // bit 49 +#define ldlm_is_fail_notified(_l) LDLM_TEST_FLAG(( _l), 1ULL << 49) +#define ldlm_set_fail_notified(_l) LDLM_SET_FLAG(( _l), 1ULL << 49) +#define ldlm_clear_fail_notified(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 49) + +/** + * Set for locks that were removed from class hash table and will + * be destroyed when last reference to them is released. Set by + * ldlm_lock_destroy_internal(). + * + * Protected by lock and resource locks. */ +#define LDLM_FL_DESTROYED 0x0004000000000000ULL // bit 50 +#define ldlm_is_destroyed(_l) LDLM_TEST_FLAG(( _l), 1ULL << 50) +#define ldlm_set_destroyed(_l) LDLM_SET_FLAG(( _l), 1ULL << 50) +#define ldlm_clear_destroyed(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 50) + +/** flag whether this is a server namespace lock */ +#define LDLM_FL_SERVER_LOCK 0x0008000000000000ULL // bit 51 +#define ldlm_is_server_lock(_l) LDLM_TEST_FLAG(( _l), 1ULL << 51) +#define ldlm_set_server_lock(_l) LDLM_SET_FLAG(( _l), 1ULL << 51) +#define ldlm_clear_server_lock(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 51) + +/** + * It's set in lock_res_and_lock() and unset in unlock_res_and_lock(). + * + * NB: compared with check_res_locked(), checking this bit is cheaper. + * Also, spin_is_locked() is deprecated for kernel code; one reason is + * because it works only for SMP so user needs to add extra macros like + * LASSERT_SPIN_LOCKED for uniprocessor kernels. 
+#define LDLM_FL_RES_LOCKED 0x0010000000000000ULL // bit 52
+#define ldlm_is_res_locked(_l) LDLM_TEST_FLAG(( _l), 1ULL << 52)
+#define ldlm_set_res_locked(_l) LDLM_SET_FLAG(( _l), 1ULL << 52)
+#define ldlm_clear_res_locked(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 52)
+
+/**
+ * It's set once we call ldlm_add_waiting_lock_res_locked() to start the
+ * lock-timeout timer, and it will never be reset.
+ *
+ * Protected by lock and resource locks. */
+#define LDLM_FL_WAITED 0x0020000000000000ULL // bit 53
+#define ldlm_is_waited(_l) LDLM_TEST_FLAG(( _l), 1ULL << 53)
+#define ldlm_set_waited(_l) LDLM_SET_FLAG(( _l), 1ULL << 53)
+#define ldlm_clear_waited(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 53)
+
+/** Flag whether this is a server namespace lock. */
+#define LDLM_FL_NS_SRV 0x0040000000000000ULL // bit 54
+#define ldlm_is_ns_srv(_l) LDLM_TEST_FLAG(( _l), 1ULL << 54)
+#define ldlm_set_ns_srv(_l) LDLM_SET_FLAG(( _l), 1ULL << 54)
+#define ldlm_clear_ns_srv(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 54)
+
+/** Flag whether this lock can be reused. Used by exclusive open. */
+#define LDLM_FL_EXCL 0x0080000000000000ULL // bit 55
+#define ldlm_is_excl(_l) LDLM_TEST_FLAG(( _l), 1ULL << 55)
+#define ldlm_set_excl(_l) LDLM_SET_FLAG(( _l), 1ULL << 55)
+#define ldlm_clear_excl(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 55)
+
+/** Flag whether a lock is found on the server for a re-sent RPC. */
+#define LDLM_FL_RESENT 0x0100000000000000ULL // bit 56
+
+/** Flag whether Commit-on-Sharing is enabled. If LDLM_FL_COS_INCOMPAT is
+ * set, this flag may not be set, because once the former is set this flag
+ * is no longer checked; for cross-MDT locks COS_INCOMPAT is always set,
+ * but the AST is handled in LDLM context, which doesn't know whether COS
+ * is enabled or not. */
+#define LDLM_FL_COS_ENABLED 0x0200000000000000ULL /* bit 57 */
+#define ldlm_is_cos_enabled(_l) LDLM_TEST_FLAG((_l), 1ULL << 57)
+#define ldlm_set_cos_enabled(_l) LDLM_SET_FLAG((_l), 1ULL << 57)
+
+/**
+ * This flag means to use a non-delay RPC to send the DLM request.
+ */
+#define LDLM_FL_NDELAY 0x0400000000000000ULL /* bit 58 */
+#define ldlm_is_ndelay(_l) LDLM_TEST_FLAG((_l), 1ULL << 58)
+#define ldlm_set_ndelay(_l) LDLM_SET_FLAG((_l), 1ULL << 58)
+
+/**
+ * The LVB from this lock is cached in the osc object.
+ */
+#define LDLM_FL_LVB_CACHED 0x0800000000000000ULL /* bit 59 */
+#define ldlm_is_lvb_cached(_l) LDLM_TEST_FLAG((_l), 1ULL << 59)
+#define ldlm_set_lvb_cached(_l) LDLM_SET_FLAG((_l), 1ULL << 59)
+#define ldlm_clear_lvb_cached(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 59)
+
+/** l_flags bits marked as "ast" bits */
+#define LDLM_FL_AST_MASK (LDLM_FL_FLOCK_DEADLOCK |\
+ LDLM_FL_DISCARD_DATA)
+
+/** l_flags bits marked as "blocked" bits */
+#define LDLM_FL_BLOCKED_MASK (LDLM_FL_BLOCK_GRANTED |\
+ LDLM_FL_BLOCK_WAIT)
+
+/** l_flags bits marked as "gone" bits */
+#define LDLM_FL_GONE_MASK (LDLM_FL_DESTROYED |\
+ LDLM_FL_FAILED)
+
+/** l_flags bits marked as "inherit" bits
+ * Flags inherited from wire on enqueue/reply between client/server.
+ * CANCEL_ON_BLOCK so server will not grant if a blocking lock is found
+ * NO_TIMEOUT flag to force ldlm_lock_match() to wait with no timeout.
+ * TEST_LOCK flag to not let a TEST lock be granted.
+ * NO_EXPANSION to tell server not to expand extent of lock request */ +#define LDLM_FL_INHERIT_MASK (LDLM_FL_CANCEL_ON_BLOCK |\ + LDLM_FL_NO_TIMEOUT |\ + LDLM_FL_TEST_LOCK |\ + LDLM_FL_NO_EXPANSION) + +/** flags returned in @flags parameter on ldlm_lock_enqueue, + * to be re-constructed on re-send */ +#define LDLM_FL_SRV_ENQ_MASK (LDLM_FL_LOCK_CHANGED |\ + LDLM_FL_BLOCKED_MASK |\ + LDLM_FL_NO_TIMEOUT) + +/** test for ldlm_lock flag bit set */ +#define LDLM_TEST_FLAG(_l, _b) (((_l)->l_flags & (_b)) != 0) + +/** multi-bit test: are any of mask bits set? */ +#define LDLM_HAVE_MASK(_l, _m) (((_l)->l_flags & LDLM_FL_##_m##_MASK) != 0) + +/** set a ldlm_lock flag bit */ +#define LDLM_SET_FLAG(_l, _b) ((_l)->l_flags |= (_b)) + +/** clear a ldlm_lock flag bit */ +#define LDLM_CLEAR_FLAG(_l, _b) ((_l)->l_flags &= ~(_b)) + +/** @} subgroup */ +/** @} group */ +#endif /* LDLM_ALL_FLAGS_MASK */ + diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_errno.h b/drivers/staging/lustrefsx/lustre/include/lustre_errno.h new file mode 100644 index 0000000000000..fe9ccd2e07a82 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_errno.h @@ -0,0 +1,218 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.txt + * + * GPL HEADER END + */ +/* + * Copyright (C) 2011 FUJITSU LIMITED. All rights reserved. + * + * Copyright (c) 2013, Intel Corporation. + */ + +#ifndef LUSTRE_ERRNO_H +#define LUSTRE_ERRNO_H + +/* + * Only "network" errnos, which are defined below, are allowed on wire (or on + * disk). Generic routines exist to help translate between these and a subset + * of the "host" errnos. Some host errnos (e.g., EDEADLOCK) are intentionally + * left out. See also the comment on lustre_errno_hton_mapping[]. + * + * To maintain compatibility with existing x86 clients and servers, each of + * these network errnos has the same numerical value as its corresponding host + * errno on x86. 
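+ *
+ * Usage sketch (illustrative only, not part of the original header): a
+ * non-x86 sender converts a host errno to its wire value with the helpers
+ * declared at the end of this header, and the receiver converts it back:
+ *
+ *	wire = lustre_errno_hton(EPROTO);	// LUSTRE_EPROTO == 71
+ *	host = lustre_errno_ntoh(wire);		// back to the host's EPROTO
+ *
+ * On x86 both helpers reduce to the identity mapping, since the wire
+ * values match the host values by construction.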
+ */ +#define LUSTRE_EPERM 1 /* Operation not permitted */ +#define LUSTRE_ENOENT 2 /* No such file or directory */ +#define LUSTRE_ESRCH 3 /* No such process */ +#define LUSTRE_EINTR 4 /* Interrupted system call */ +#define LUSTRE_EIO 5 /* I/O error */ +#define LUSTRE_ENXIO 6 /* No such device or address */ +#define LUSTRE_E2BIG 7 /* Argument list too long */ +#define LUSTRE_ENOEXEC 8 /* Exec format error */ +#define LUSTRE_EBADF 9 /* Bad file number */ +#define LUSTRE_ECHILD 10 /* No child processes */ +#define LUSTRE_EAGAIN 11 /* Try again */ +#define LUSTRE_ENOMEM 12 /* Out of memory */ +#define LUSTRE_EACCES 13 /* Permission denied */ +#define LUSTRE_EFAULT 14 /* Bad address */ +#define LUSTRE_ENOTBLK 15 /* Block device required */ +#define LUSTRE_EBUSY 16 /* Device or resource busy */ +#define LUSTRE_EEXIST 17 /* File exists */ +#define LUSTRE_EXDEV 18 /* Cross-device link */ +#define LUSTRE_ENODEV 19 /* No such device */ +#define LUSTRE_ENOTDIR 20 /* Not a directory */ +#define LUSTRE_EISDIR 21 /* Is a directory */ +#define LUSTRE_EINVAL 22 /* Invalid argument */ +#define LUSTRE_ENFILE 23 /* File table overflow */ +#define LUSTRE_EMFILE 24 /* Too many open files */ +#define LUSTRE_ENOTTY 25 /* Not a typewriter */ +#define LUSTRE_ETXTBSY 26 /* Text file busy */ +#define LUSTRE_EFBIG 27 /* File too large */ +#define LUSTRE_ENOSPC 28 /* No space left on device */ +#define LUSTRE_ESPIPE 29 /* Illegal seek */ +#define LUSTRE_EROFS 30 /* Read-only file system */ +#define LUSTRE_EMLINK 31 /* Too many links */ +#define LUSTRE_EPIPE 32 /* Broken pipe */ +#define LUSTRE_EDOM 33 /* Math argument out of domain of + func */ +#define LUSTRE_ERANGE 34 /* Math result not representable */ +#define LUSTRE_EDEADLK 35 /* Resource deadlock would occur */ +#define LUSTRE_ENAMETOOLONG 36 /* File name too long */ +#define LUSTRE_ENOLCK 37 /* No record locks available */ +#define LUSTRE_ENOSYS 38 /* Function not implemented */ +#define LUSTRE_ENOTEMPTY 39 /* Directory not empty */ +#define LUSTRE_ELOOP 40 /* Too many symbolic links + encountered */ +#define LUSTRE_ENOMSG 42 /* No message of desired type */ +#define LUSTRE_EIDRM 43 /* Identifier removed */ +#define LUSTRE_ECHRNG 44 /* Channel number out of range */ +#define LUSTRE_EL2NSYNC 45 /* Level 2 not synchronized */ +#define LUSTRE_EL3HLT 46 /* Level 3 halted */ +#define LUSTRE_EL3RST 47 /* Level 3 reset */ +#define LUSTRE_ELNRNG 48 /* Link number out of range */ +#define LUSTRE_EUNATCH 49 /* Protocol driver not attached */ +#define LUSTRE_ENOCSI 50 /* No CSI structure available */ +#define LUSTRE_EL2HLT 51 /* Level 2 halted */ +#define LUSTRE_EBADE 52 /* Invalid exchange */ +#define LUSTRE_EBADR 53 /* Invalid request descriptor */ +#define LUSTRE_EXFULL 54 /* Exchange full */ +#define LUSTRE_ENOANO 55 /* No anode */ +#define LUSTRE_EBADRQC 56 /* Invalid request code */ +#define LUSTRE_EBADSLT 57 /* Invalid slot */ +#define LUSTRE_EBFONT 59 /* Bad font file format */ +#define LUSTRE_ENOSTR 60 /* Device not a stream */ +#define LUSTRE_ENODATA 61 /* No data available */ +#define LUSTRE_ETIME 62 /* Timer expired */ +#define LUSTRE_ENOSR 63 /* Out of streams resources */ +#define LUSTRE_ENONET 64 /* Machine is not on the network */ +#define LUSTRE_ENOPKG 65 /* Package not installed */ +#define LUSTRE_EREMOTE 66 /* Object is remote */ +#define LUSTRE_ENOLINK 67 /* Link has been severed */ +#define LUSTRE_EADV 68 /* Advertise error */ +#define LUSTRE_ESRMNT 69 /* Srmount error */ +#define LUSTRE_ECOMM 70 /* Communication error on send */ +#define 
LUSTRE_EPROTO 71 /* Protocol error */ +#define LUSTRE_EMULTIHOP 72 /* Multihop attempted */ +#define LUSTRE_EDOTDOT 73 /* RFS specific error */ +#define LUSTRE_EBADMSG 74 /* Not a data message */ +#define LUSTRE_EOVERFLOW 75 /* Value too large for defined data + type */ +#define LUSTRE_ENOTUNIQ 76 /* Name not unique on network */ +#define LUSTRE_EBADFD 77 /* File descriptor in bad state */ +#define LUSTRE_EREMCHG 78 /* Remote address changed */ +#define LUSTRE_ELIBACC 79 /* Can not access a needed shared + library */ +#define LUSTRE_ELIBBAD 80 /* Accessing a corrupted shared + library */ +#define LUSTRE_ELIBSCN 81 /* .lib section in a.out corrupted */ +#define LUSTRE_ELIBMAX 82 /* Attempting to link in too many shared + libraries */ +#define LUSTRE_ELIBEXEC 83 /* Cannot exec a shared library + directly */ +#define LUSTRE_EILSEQ 84 /* Illegal byte sequence */ +#define LUSTRE_ERESTART 85 /* Interrupted system call should be + restarted */ +#define LUSTRE_ESTRPIPE 86 /* Streams pipe error */ +#define LUSTRE_EUSERS 87 /* Too many users */ +#define LUSTRE_ENOTSOCK 88 /* Socket operation on non-socket */ +#define LUSTRE_EDESTADDRREQ 89 /* Destination address required */ +#define LUSTRE_EMSGSIZE 90 /* Message too long */ +#define LUSTRE_EPROTOTYPE 91 /* Protocol wrong type for socket */ +#define LUSTRE_ENOPROTOOPT 92 /* Protocol not available */ +#define LUSTRE_EPROTONOSUPPORT 93 /* Protocol not supported */ +#define LUSTRE_ESOCKTNOSUPPORT 94 /* Socket type not supported */ +#define LUSTRE_EOPNOTSUPP 95 /* Operation not supported on transport + endpoint */ +#define LUSTRE_EPFNOSUPPORT 96 /* Protocol family not supported */ +#define LUSTRE_EAFNOSUPPORT 97 /* Address family not supported by + protocol */ +#define LUSTRE_EADDRINUSE 98 /* Address already in use */ +#define LUSTRE_EADDRNOTAVAIL 99 /* Cannot assign requested address */ +#define LUSTRE_ENETDOWN 100 /* Network is down */ +#define LUSTRE_ENETUNREACH 101 /* Network is unreachable */ +#define LUSTRE_ENETRESET 102 /* Network dropped connection because of + reset */ +#define LUSTRE_ECONNABORTED 103 /* Software caused connection abort */ +#define LUSTRE_ECONNRESET 104 /* Connection reset by peer */ +#define LUSTRE_ENOBUFS 105 /* No buffer space available */ +#define LUSTRE_EISCONN 106 /* Transport endpoint is already + connected */ +#define LUSTRE_ENOTCONN 107 /* Transport endpoint is not + connected */ +#define LUSTRE_ESHUTDOWN 108 /* Cannot send after transport endpoint + shutdown */ +#define LUSTRE_ETOOMANYREFS 109 /* Too many references: cannot splice */ +#define LUSTRE_ETIMEDOUT 110 /* Connection timed out */ +#define LUSTRE_ECONNREFUSED 111 /* Connection refused */ +#define LUSTRE_EHOSTDOWN 112 /* Host is down */ +#define LUSTRE_EHOSTUNREACH 113 /* No route to host */ +#define LUSTRE_EALREADY 114 /* Operation already in progress */ +#define LUSTRE_EINPROGRESS 115 /* Operation now in progress */ +#define LUSTRE_ESTALE 116 /* Stale NFS file handle */ +#define LUSTRE_EUCLEAN 117 /* Structure needs cleaning */ +#define LUSTRE_ENOTNAM 118 /* Not a XENIX named type file */ +#define LUSTRE_ENAVAIL 119 /* No XENIX semaphores available */ +#define LUSTRE_EISNAM 120 /* Is a named type file */ +#define LUSTRE_EREMOTEIO 121 /* Remote I/O error */ +#define LUSTRE_EDQUOT 122 /* Quota exceeded */ +#define LUSTRE_ENOMEDIUM 123 /* No medium found */ +#define LUSTRE_EMEDIUMTYPE 124 /* Wrong medium type */ +#define LUSTRE_ECANCELED 125 /* Operation Canceled */ +#define LUSTRE_ENOKEY 126 /* Required key not available */ +#define LUSTRE_EKEYEXPIRED 127 /* 
Key has expired */ +#define LUSTRE_EKEYREVOKED 128 /* Key has been revoked */ +#define LUSTRE_EKEYREJECTED 129 /* Key was rejected by service */ +#define LUSTRE_EOWNERDEAD 130 /* Owner died */ +#define LUSTRE_ENOTRECOVERABLE 131 /* State not recoverable */ +#define LUSTRE_ERESTARTSYS 512 +#define LUSTRE_ERESTARTNOINTR 513 +#define LUSTRE_ERESTARTNOHAND 514 /* restart if no handler.. */ +#define LUSTRE_ENOIOCTLCMD 515 /* No ioctl command */ +#define LUSTRE_ERESTART_RESTARTBLOCK 516 /* restart by calling + sys_restart_syscall */ +#define LUSTRE_EBADHANDLE 521 /* Illegal NFS file handle */ +#define LUSTRE_ENOTSYNC 522 /* Update synchronization mismatch */ +#define LUSTRE_EBADCOOKIE 523 /* Cookie is stale */ +#define LUSTRE_ENOTSUPP 524 /* Operation is not supported */ +#define LUSTRE_ETOOSMALL 525 /* Buffer or request is too small */ +#define LUSTRE_ESERVERFAULT 526 /* An untranslatable error occurred */ +#define LUSTRE_EBADTYPE 527 /* Type not supported by server */ +#define LUSTRE_EJUKEBOX 528 /* Request initiated, but will not + complete before timeout */ +#define LUSTRE_EIOCBQUEUED 529 /* iocb queued, will get completion + event */ + +/* + * Translations are optimized away on x86. Host errnos that shouldn't be put + * on wire could leak through as a result. Do not count on this side effect. + */ +#if !defined(__x86_64__) && !defined(__i386__) +#define LUSTRE_TRANSLATE_ERRNOS +#endif + +#ifdef LUSTRE_TRANSLATE_ERRNOS +unsigned int lustre_errno_hton(unsigned int h); +unsigned int lustre_errno_ntoh(unsigned int n); +#else +#define lustre_errno_hton(h) (h) +#define lustre_errno_ntoh(n) (n) +#endif + +#endif /* LUSTRE_ERRNO_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_export.h b/drivers/staging/lustrefsx/lustre/include/lustre_export.h new file mode 100644 index 0000000000000..a0682d85620c4 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_export.h @@ -0,0 +1,519 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ +/** \defgroup obd_export PortalRPC export definitions + * + * @{ + */ + +#ifndef __EXPORT_H +#define __EXPORT_H + +/** \defgroup export export + * + * @{ + */ + +#include +#include + +#include +#include +#include + +struct mds_client_data; +struct mdt_client_data; +struct mds_idmap_table; +struct mdt_idmap_table; + +/** + * Target-specific export data + */ +struct tg_export_data { + /** Protects ted_lcd, ted_reply_* and + * ted_release_* fields below */ + struct mutex ted_lcd_lock; + /** Per-client data for each export */ + struct lsd_client_data *ted_lcd; + /** Offset of record in last_rcvd file */ + loff_t ted_lr_off; + /** Client index in last_rcvd file */ + int ted_lr_idx; + + /** + * ted_nodemap_lock is used to ensure that the nodemap is not destroyed + * between the time that ted_nodemap is checked for NULL, and a + * reference is taken. Modifications to ted_nodemap require that the + * active_config_lock and the nodemap(s)'s nm_member_list_lock be + * taken, as well as ted_nodemap_lock, so the export can be properly + * added to or removed from the nodemap's member list. When an export + * is added to a nodemap, a reference on that nodemap must be taken. + * That reference can be put only after ted_nodemap no longer refers to + * it. + */ + spinlock_t ted_nodemap_lock; + struct lu_nodemap *ted_nodemap; + struct list_head ted_nodemap_member; + + /** last version of nodemap config sent to client */ + __u64 ted_nodemap_version; + + /* Every reply data fields below are + * protected by ted_lcd_lock */ + /** List of reply data */ + struct list_head ted_reply_list; + int ted_reply_cnt; + /** Reply data with highest transno is retained */ + struct tg_reply_data *ted_reply_last; + /* Statistics */ + int ted_reply_max; /* high water mark */ + int ted_release_xid; + int ted_release_tag; + /* grants */ + long ted_dirty; /* in bytes */ + long ted_grant; /* in bytes */ + long ted_pending; /* bytes just being written */ + __u8 ted_pagebits; /* log2 of client page size */ + + /** + * File Modification Data (FMD) tracking + */ + spinlock_t ted_fmd_lock; /* protects ted_fmd_list */ + struct list_head ted_fmd_list; /* FIDs being modified */ + int ted_fmd_count;/* items in ted_fmd_list */ +}; + +/** + * MDT-specific export data + */ +struct mdt_export_data { + struct tg_export_data med_ted; + /** List of all files opened by client on this MDT */ + struct list_head med_open_head; + spinlock_t med_open_lock; /* med_open_head, mfd_list */ +}; + +struct ec_export_data { /* echo client */ + struct list_head eced_locks; +}; + +/* In-memory access to client data from OST struct */ +/** Filter (oss-side) specific import data */ +struct filter_export_data { + struct tg_export_data fed_ted; + __u64 fed_lastid_gen; + /* count of SOFT_SYNC RPCs, which will be reset after + * ofd_soft_sync_limit number of RPCs, and trigger a sync. */ + atomic_t fed_soft_sync_count; + __u32 fed_group; +}; + +struct mgs_export_data { + struct list_head med_clients; /* mgc fs client via this exp */ + spinlock_t med_lock; /* protect med_clients */ +}; + +/** + * per-NID statistics structure. 
+ * It tracks access patterns to this export on a per-client-NID basis
+ */
+struct nid_stat {
+	struct lnet_nid nid;
+	struct hlist_node nid_hash;
+	struct list_head nid_list;
+	struct obd_device *nid_obd;
+	struct proc_dir_entry *nid_proc;
+	struct lprocfs_stats *nid_stats;
+	struct lprocfs_stats *nid_ldlm_stats;
+	atomic_t nid_exp_ref_count; /* for obd_nid_stats_hash exp_nid_stats */
+};
+
+#define nidstat_getref(nidstat) \
+do { \
+	atomic_inc(&(nidstat)->nid_exp_ref_count); \
+} while(0)
+
+#define nidstat_putref(nidstat) \
+do { \
+	atomic_dec(&(nidstat)->nid_exp_ref_count); \
+	LASSERTF(atomic_read(&(nidstat)->nid_exp_ref_count) >= 0, \
+		 "stat %p nid_exp_ref_count < 0\n", nidstat); \
+} while(0)
+
+enum obd_option {
+	OBD_OPT_FORCE = 0x0001,
+	OBD_OPT_FAILOVER = 0x0002,
+	OBD_OPT_ABORT_RECOV = 0x0004,
+};
+
+/**
+ * Export structure. Represents the target side of a connection in portals.
+ * Also used in Lustre to connect between layers on the same node when
+ * there is no network connection in between.
+ * For every connected client there is an export structure on the server
+ * attached to the same obd device.
+ */
+struct obd_export {
+	/**
+	 * Export handle; its id is provided to the client on connect.
+	 * Subsequent client RPCs contain this handle id to identify
+	 * what export they are talking to.
+	 */
+	struct portals_handle exp_handle;
+	/**
+	 * The set of counters below tracks where export references are
+	 * kept. The exp_rpc_count is also used for reconnect handling;
+	 * the cb_count and locks_count are for debug purposes only for now.
+	 * The sum of them should be less than exp_handle.href by 3.
+	 */
+	atomic_t exp_rpc_count; /* RPC references */
+	atomic_t exp_cb_count; /* Commit callback references */
+	/** Number of queued replay requests to be processed */
+	atomic_t exp_replay_count;
+	atomic_t exp_locks_count; /** Lock references */
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+	struct list_head exp_locks_list;
+	spinlock_t exp_locks_list_guard;
+#endif
+	/** UUID of the client connected to this export */
+	struct obd_uuid exp_client_uuid;
+	/** To link all exports on an obd device */
+	struct list_head exp_obd_chain;
+	/** work_struct for destruction of the export */
+	struct work_struct exp_zombie_work;
+	/* Unlinked export list */
+	struct list_head exp_stale_list;
+	struct rhash_head exp_uuid_hash; /** uuid-export hash */
+	struct rhlist_head exp_nid_hash; /** nid-export hash */
+	struct hlist_node exp_gen_hash; /** last_rcvd clt gen hash */
+	/**
+	 * All exports eligible for the ping evictor are linked into a list
+	 * through this field in "most time since last request on this
+	 * export" order; protected by obd_dev_lock.
+	 */
+	struct list_head exp_obd_chain_timed;
+	/** Obd device of this export */
+	struct obd_device *exp_obd;
+	/**
+	 * "reverse" import to send requests (e.g. from ldlm) back to the
+	 * client; exp_lock protects its change
+	 */
+	struct obd_import *exp_imp_reverse;
+	struct nid_stat *exp_nid_stats;
+	/** Active connection */
+	struct ptlrpc_connection *exp_connection;
+	/** Connection count value from the last successful reconnect RPC */
+	__u32 exp_conn_cnt;
+	/** Hash list of all ldlm locks granted on this export */
+	struct cfs_hash *exp_lock_hash;
+	/**
+	 * Hash list for POSIX lock deadlock detection, added with
+	 * ldlm_lock::l_exp_flock_hash.
+ */ + struct cfs_hash *exp_flock_hash; + struct list_head exp_outstanding_replies; + struct list_head exp_uncommitted_replies; + spinlock_t exp_uncommitted_replies_lock; + /** Last committed transno for this export */ + __u64 exp_last_committed; + /** When was last request received */ + time64_t exp_last_request_time; + /** On replay all requests waiting for replay are linked here */ + struct list_head exp_req_replay_queue; + /** + * protects exp_flags, exp_outstanding_replies and the change + * of exp_imp_reverse + */ + spinlock_t exp_lock; + /** Compatibility flags for this export are embedded into + * exp_connect_data */ + struct obd_connect_data exp_connect_data; + enum obd_option exp_flags; + unsigned long exp_failed:1, + exp_in_recovery:1, + exp_disconnected:1, + exp_connecting:1, + /** VBR: export missed recovery */ + exp_delayed:1, + /** VBR: failed version checking */ + exp_vbr_failed:1, + exp_req_replay_needed:1, + exp_lock_replay_needed:1, + exp_need_sync:1, + exp_flvr_changed:1, + exp_flvr_adapt:1, + /* if to swap nidtbl entries for 2.2 clients. + * Only used by the MGS to fix LU-1644. */ + exp_need_mne_swab:1, + /* The export already got final replay ping + * request. */ + exp_replay_done:1, + /* local client with recovery disabled */ + exp_no_recovery:1, + exp_hashed:1; + /* also protected by exp_lock */ + enum lustre_sec_part exp_sp_peer; + struct sptlrpc_flavor exp_flvr; /* current */ + struct sptlrpc_flavor exp_flvr_old[2]; /* about-to-expire */ + time64_t exp_flvr_expire[2]; /* seconds */ + + /** protects exp_hp_rpcs */ + spinlock_t exp_rpc_lock; + struct list_head exp_hp_rpcs; /* (potential) HP RPCs */ + struct list_head exp_reg_rpcs; /* RPC being handled */ + + /** blocking dlm lock list, protected by exp_bl_list_lock */ + struct list_head exp_bl_list; + spinlock_t exp_bl_list_lock; + + /** Target specific data */ + union { + struct tg_export_data eu_target_data; + struct mdt_export_data eu_mdt_data; + struct filter_export_data eu_filter_data; + struct ec_export_data eu_ec_data; + struct mgs_export_data eu_mgs_data; + } u; + + struct adaptive_timeout exp_bl_lock_at; + + /** highest XID received by export client that has no + * unreceived lower-numbered XID + */ + __u64 exp_last_xid; + long *exp_used_slots; +}; + +#define exp_target_data u.eu_target_data +#define exp_mdt_data u.eu_mdt_data +#define exp_filter_data u.eu_filter_data +#define exp_ec_data u.eu_ec_data + +static inline __u64 *exp_connect_flags_ptr(struct obd_export *exp) +{ + return &exp->exp_connect_data.ocd_connect_flags; +} + +static inline __u64 exp_connect_flags(struct obd_export *exp) +{ + return *exp_connect_flags_ptr(exp); +} + +static inline __u64 *exp_connect_flags2_ptr(struct obd_export *exp) +{ + return &exp->exp_connect_data.ocd_connect_flags2; +} + +static inline __u64 exp_connect_flags2(struct obd_export *exp) +{ + if (exp_connect_flags(exp) & OBD_CONNECT_FLAGS2) + return *exp_connect_flags2_ptr(exp); + return 0; +} + +static inline int exp_max_brw_size(struct obd_export *exp) +{ + LASSERT(exp != NULL); + if (exp_connect_flags(exp) & OBD_CONNECT_BRW_SIZE) + return exp->exp_connect_data.ocd_brw_size; + + return ONE_MB_BRW_SIZE; +} + +static inline int exp_connect_multibulk(struct obd_export *exp) +{ + return exp_max_brw_size(exp) > ONE_MB_BRW_SIZE; +} + +static inline int exp_connect_cancelset(struct obd_export *exp) +{ + LASSERT(exp != NULL); + return !!(exp_connect_flags(exp) & OBD_CONNECT_CANCELSET); +} + +static inline int exp_connect_lru_resize(struct obd_export *exp) +{ + LASSERT(exp != 
NULL); + return !!(exp_connect_flags(exp) & OBD_CONNECT_LRU_RESIZE); +} + +static inline int exp_connect_vbr(struct obd_export *exp) +{ + LASSERT(exp != NULL); + LASSERT(exp->exp_connection); + return !!(exp_connect_flags(exp) & OBD_CONNECT_VBR); +} + +static inline int exp_connect_umask(struct obd_export *exp) +{ + return !!(exp_connect_flags(exp) & OBD_CONNECT_UMASK); +} + +static inline int imp_connect_lru_resize(struct obd_import *imp) +{ + struct obd_connect_data *ocd; + + LASSERT(imp != NULL); + ocd = &imp->imp_connect_data; + return !!(ocd->ocd_connect_flags & OBD_CONNECT_LRU_RESIZE); +} + +static inline int exp_connect_layout(struct obd_export *exp) +{ + return !!(exp_connect_flags(exp) & OBD_CONNECT_LAYOUTLOCK); +} + +static inline bool exp_connect_lvb_type(struct obd_export *exp) +{ + LASSERT(exp != NULL); + if (exp_connect_flags(exp) & OBD_CONNECT_LVB_TYPE) + return true; + else + return false; +} + +static inline bool imp_connect_lvb_type(struct obd_import *imp) +{ + struct obd_connect_data *ocd; + + LASSERT(imp != NULL); + ocd = &imp->imp_connect_data; + if (ocd->ocd_connect_flags & OBD_CONNECT_LVB_TYPE) + return true; + else + return false; +} + +static inline bool imp_connect_disp_stripe(struct obd_import *imp) +{ + struct obd_connect_data *ocd; + + LASSERT(imp != NULL); + ocd = &imp->imp_connect_data; + return ocd->ocd_connect_flags & OBD_CONNECT_DISP_STRIPE; +} + +static inline bool imp_connect_shortio(struct obd_import *imp) +{ + struct obd_connect_data *ocd = &imp->imp_connect_data; + + return ocd->ocd_connect_flags & OBD_CONNECT_SHORTIO; +} + +static inline __u64 exp_connect_ibits(struct obd_export *exp) +{ + struct obd_connect_data *ocd; + + ocd = &exp->exp_connect_data; + return ocd->ocd_ibits_known; +} + +static inline int exp_connect_large_acl(struct obd_export *exp) +{ + return !!(exp_connect_flags(exp) & OBD_CONNECT_LARGE_ACL); +} + +static inline int exp_connect_lockahead(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_LOCKAHEAD); +} + +static inline int exp_connect_overstriping(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_OVERSTRIPING); +} + +static inline int exp_connect_flr(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_FLR); +} + +static inline int exp_bypass_mdll(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_MDLL_BYPASS); +} + +static inline int exp_mdll(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_MDLL); +} + +static inline int exp_connect_lock_convert(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_LOCK_CONVERT); +} + +extern struct obd_export *class_conn2export(struct lustre_handle *conn); + +static inline int exp_connect_archive_id_array(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_ARCHIVE_ID_ARRAY); +} + +static inline int exp_connect_sepol(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_SELINUX_POLICY); +} + +static inline int exp_connect_encrypt(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_ENCRYPT); +} + +static inline int exp_connect_lseek(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_LSEEK); +} + +static inline int exp_connect_dom_lvb(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_DOM_LVB); +} + +enum { + /* archive_ids in array format */ + KKUC_CT_DATA_ARRAY_MAGIC = 0x092013cea, + /* archive_ids in bitmap 
format */ + KKUC_CT_DATA_BITMAP_MAGIC = 0x082018cea, +}; + + +struct kkuc_ct_data { + __u32 kcd_magic; + __u32 kcd_nr_archives; + __u32 kcd_archives[0]; +}; + +/** @} export */ + +#endif /* __EXPORT_H */ +/** @} obd_export */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_fid.h b/drivers/staging/lustrefsx/lustre/include/lustre_fid.h new file mode 100644 index 0000000000000..e82b847885eac --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_fid.h @@ -0,0 +1,953 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/include/lustre_fid.h + * + * Author: Yury Umanets + */ + +#ifndef __LUSTRE_FID_H +#define __LUSTRE_FID_H + +/** \defgroup fid fid + * + * @{ + * + * http://wiki.lustre.org/index.php/Architecture_-_Interoperability_fids_zfs + * describes the FID namespace and interoperability requirements for FIDs. + * The important parts of that document are included here for reference. + * + * FID + * File IDentifier generated by client from range allocated by the SEQuence + * service and stored in struct lu_fid. The FID is composed of three parts: + * SEQuence, ObjectID, and VERsion. The SEQ component is a filesystem + * unique 64-bit integer, and only one client is ever assigned any SEQ value. + * The first 0x400 FID_SEQ_NORMAL [2^33, 2^33 + 0x400] values are reserved + * for system use. The OID component is a 32-bit value generated by the + * client on a per-SEQ basis to allow creating many unique FIDs without + * communication with the server. The VER component is a 32-bit value that + * distinguishes between different FID instantiations, such as snapshots or + * separate subtrees within the filesystem. FIDs with the same VER field + * are considered part of the same namespace. + * + * OLD filesystems are those upgraded from Lustre 1.x that predate FIDs, and + * MDTs use 32-bit ldiskfs internal inode/generation numbers (IGIFs), while + * OSTs use 64-bit Lustre object IDs and generation numbers. + * + * NEW filesystems are those formatted since the introduction of FIDs. + * + * IGIF + * Inode and Generation In FID, a surrogate FID used to globally identify + * an existing object on OLD formatted MDT file system. This would only be + * used on MDT0 in a DNE filesystem, because there cannot be more than one + * MDT in an OLD formatted filesystem. Belongs to sequence in [12, 2^32 - 1] + * range, where inode number is stored in SEQ, and inode generation is in OID. 
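+ *
+ *    For example (illustrative values, not from the original text): inode
+ *    0x2f5a with generation 7 on an OLD-format MDT0 maps to the IGIF FID
+ *    [0x2f5a:0x7:0x0], i.e.
+ *
+ *	fid->f_seq = 0x2f5a;	// inode number
+ *	fid->f_oid = 0x7;	// inode generation
+ *	fid->f_ver = 0;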
+ * NOTE: This assumes no more than 2^32-1 inodes exist in the MDT filesystem,
+ * which is the maximum possible for an ldiskfs backend. It also assumes
+ * that the reserved ext3/ext4/ldiskfs inode numbers [0-11] are never visible
+ * to clients, which has always been true.
+ *
+ * IDIF
+ * object ID In FID, a surrogate FID used to globally identify an existing
+ * OST object on an OLD formatted OST file system. Belongs to a sequence in
+ * [2^32, 2^33 - 1]. The sequence number is calculated as:
+ *
+ * 1 << 32 | (ost_index << 16) | ((objid >> 32) & 0xffff)
+ *
+ * that is, SEQ consists of the 16-bit OST index and the high 16 bits of the
+ * object ID. The generation of unique SEQ values per OST allows the IDIF
+ * FIDs to be identified in the FLD correctly. The OID field is calculated as:
+ *
+ * objid & 0xffffffff
+ *
+ * that is, it consists of the lower 32 bits of the object ID. For objects
+ * within the IDIF range, object ID extraction will be:
+ *
+ * o_id = (fid->f_seq & 0x7fff) << 16 | fid->f_oid;
+ * o_seq = 0; // formerly group number
+ *
+ * NOTE: This assumes that no more than 2^48-1 objects have ever been created
+ * on any OST, and that no more than 65535 OSTs are in use. Both are very
+ * reasonable assumptions, i.e. an IDIF can uniquely map all objects assuming
+ * a maximum creation rate of 1M objects per second for a maximum of 9 years,
+ * or combinations thereof.
+ *
+ * OST_MDT0
+ * Surrogate FID used to identify an existing object on an OLD formatted OST
+ * filesystem. Belongs to the reserved SEQuence 0, and is used prior to
+ * the introduction of FID-on-OST, at which point IDIF will be used to
+ * identify objects as residing on a specific OST.
+ *
+ * LLOG
+ * For Lustre Log objects the object sequence 1 is used. This is compatible
+ * with both OLD and NEW namespaces, as this SEQ number is in the
+ * ext3/ldiskfs reserved inode range and does not conflict with IGIF
+ * sequence numbers.
+ *
+ * ECHO
+ * For testing OST IO performance the object sequence 2 is used. This is
+ * compatible with both OLD and NEW namespaces, as this SEQ number is in
+ * the ext3/ldiskfs reserved inode range and does not conflict with IGIF
+ * sequence numbers.
+ *
+ * OST_MDT1 .. OST_MAX
+ * For testing with multiple MDTs the object sequences 3 through 9 are used,
+ * allowing direct mapping of MDTs 1 through 7 respectively, for a total
+ * of 8 MDTs including OST_MDT0. This matches the legacy CMD project "group"
+ * mappings. However, this SEQ range is only for testing prior to any
+ * production DNE release, as the objects in this range conflict across all
+ * OSTs, as the OST index is not part of the FID. For production DNE usage,
+ * OST objects created by MDT1+ will use FID_SEQ_NORMAL FIDs.
+ *
+ * DLM OST objid to IDIF mapping
+ * For compatibility with existing OLD OST network protocol structures, the
+ * FID must map onto the o_id and o_seq in a manner that ensures existing
+ * objects are identified consistently for IO, as well as onto the LDLM
+ * namespace, to ensure that for IDIFs there is only a single resource name
+ * for any object in the DLM.
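+ *
+ *    To make the IDIF mapping above concrete (illustrative values, not
+ *    from the original text): object 0x123456789 on OST index 5 yields
+ *
+ *	SEQ = 1 << 32 | (5 << 16) | ((0x123456789ULL >> 32) & 0xffff)
+ *	    = 0x100050001
+ *	OID = 0x123456789ULL & 0xffffffff
+ *	    = 0x23456789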
+ * The OLD OST object DLM resource mapping is:
+ *
+ * resource[] = {o_id, o_seq, 0, 0}; // o_seq == 0 for production releases
+ *
+ * The NEW OST object DLM resource mapping is the same for both MDT and OST:
+ *
+ * resource[] = {SEQ, OID, VER, HASH};
+ *
+ * NOTE: for mapping IDIF values to DLM resource names the o_id may be
+ * larger than the 2^33 reserved sequence numbers for IDIF, so it is possible
+ * for the o_id numbers to overlap FID SEQ numbers in the resource. However,
+ * in all production releases the OLD o_seq field is always zero, and all
+ * valid FID OID values are non-zero, so the lock resources will not collide.
+ * Even so, the MDT and OST resources are also in different LDLM namespaces.
+ */
+
+#include
+#include
+#include
+#include
+
+/* Lustre service names follow the format:
+ * service name + MDT + seq name
+ */
+#define LUSTRE_MDT_MAXNAMELEN 80
+
+struct lu_env;
+struct lu_site;
+struct lu_context;
+struct obd_device;
+struct obd_export;
+
+/* Whole sequence space range and zero range definitions */
+extern const struct lu_seq_range LUSTRE_SEQ_SPACE_RANGE;
+extern const struct lu_seq_range LUSTRE_SEQ_ZERO_RANGE;
+extern const struct lu_fid LUSTRE_BFL_FID;
+extern const struct lu_fid LU_OBF_FID;
+extern const struct lu_fid LU_LPF_FID;
+extern const struct lu_fid LU_DOT_LUSTRE_FID;
+extern const struct lu_fid LU_BACKEND_LPF_FID;
+
+enum {
+	/*
+	 * This is how many metadata FIDs may be allocated in one sequence (128k).
+	 */
+	LUSTRE_METADATA_SEQ_MAX_WIDTH = 0x0000000000020000ULL,
+
+	/*
+	 * This is how many data FIDs could be allocated in one sequence (4B - 1).
+	 */
+	LUSTRE_DATA_SEQ_MAX_WIDTH = 0x00000000FFFFFFFFULL,
+
+	/*
+	 * How many sequences to allocate to a client at once.
+	 */
+	LUSTRE_SEQ_META_WIDTH = 0x0000000000000001ULL,
+
+	/*
+	 * Seq allocation pool size.
+	 */
+	LUSTRE_SEQ_BATCH_WIDTH = LUSTRE_SEQ_META_WIDTH * 1000,
+
+	/*
+	 * This is how many sequences may be in one super-sequence allocated
+	 * to MDTs.
+ */ + LUSTRE_SEQ_SUPER_WIDTH = ((1ULL << 30ULL) * LUSTRE_SEQ_META_WIDTH) +}; + +/** special OID for local objects */ +enum local_oid { + /** \see fld_mod_init */ + FLD_INDEX_OID = 3UL, + /** \see fid_mod_init */ + FID_SEQ_CTL_OID = 4UL, + FID_SEQ_SRV_OID = 5UL, + /** \see mdd_mod_init */ + MDD_ROOT_INDEX_OID = 6UL, /* deprecated in 2.4 */ + MDD_ORPHAN_OID = 7UL, /* deprecated in 2.4 */ + MDD_LOV_OBJ_OID = 8UL, + MDD_CAPA_KEYS_OID = 9UL, + /** \see mdt_mod_init */ + LAST_RECV_OID = 11UL, + OSD_FS_ROOT_OID = 13UL, + ACCT_USER_OID = 15UL, + ACCT_GROUP_OID = 16UL, + LFSCK_BOOKMARK_OID = 17UL, + OTABLE_IT_OID = 18UL, + OSD_LPF_OID = 19UL, + REPLY_DATA_OID = 21UL, + ACCT_PROJECT_OID = 22UL, + INDEX_BACKUP_OID = 4116UL, + OFD_LAST_GROUP_OID = 4117UL, + LLOG_CATALOGS_OID = 4118UL, + MGS_CONFIGS_OID = 4119UL, + OFD_HEALTH_CHECK_OID = 4120UL, + MDD_LOV_OBJ_OSEQ = 4121UL, + LFSCK_NAMESPACE_OID = 4122UL, + REMOTE_PARENT_DIR_OID = 4123UL, + /* This definition is obsolete + * SLAVE_LLOG_CATALOGS_OID = 4124UL, + */ + BATCHID_COMMITTED_OID = 4125UL, +}; + +static inline void lu_local_obj_fid(struct lu_fid *fid, __u32 oid) +{ + fid->f_seq = FID_SEQ_LOCAL_FILE; + fid->f_oid = oid; + fid->f_ver = 0; +} + +static inline void lu_local_name_obj_fid(struct lu_fid *fid, __u32 oid) +{ + fid->f_seq = FID_SEQ_LOCAL_NAME; + fid->f_oid = oid; + fid->f_ver = 0; +} + +/* For new FS (>= 2.4), the root FID will be changed to + * [FID_SEQ_ROOT:1:0], for existing FS, (upgraded to 2.4), + * the root FID will still be IGIF */ +static inline int fid_is_root(const struct lu_fid *fid) +{ + return unlikely((fid_seq(fid) == FID_SEQ_ROOT && + fid_oid(fid) == FID_OID_ROOT)); +} + +static inline int fid_is_dot_lustre(const struct lu_fid *fid) +{ + return unlikely(fid_seq(fid) == FID_SEQ_DOT_LUSTRE && + fid_oid(fid) == FID_OID_DOT_LUSTRE); +} + +static inline int fid_is_obf(const struct lu_fid *fid) +{ + return unlikely(fid_seq(fid) == FID_SEQ_DOT_LUSTRE && + fid_oid(fid) == FID_OID_DOT_LUSTRE_OBF); +} + +static inline int fid_is_otable_it(const struct lu_fid *fid) +{ + return unlikely(fid_seq(fid) == FID_SEQ_LOCAL_FILE && + fid_oid(fid) == OTABLE_IT_OID); +} + +static inline int fid_oid_is_quota(const struct lu_fid *fid) +{ + switch (fid_oid(fid)) { + case ACCT_USER_OID: + case ACCT_GROUP_OID: + case ACCT_PROJECT_OID: + return 1; + default: + return 0; + } +} + +static inline int fid_is_acct(const struct lu_fid *fid) +{ + return fid_seq(fid) == FID_SEQ_LOCAL_FILE && + fid_oid_is_quota(fid); +} + +static inline int fid_is_quota(const struct lu_fid *fid) +{ + return fid_seq(fid) == FID_SEQ_QUOTA || + fid_seq(fid) == FID_SEQ_QUOTA_GLB; +} + +static inline int fid_is_name_llog(const struct lu_fid *fid) +{ + return fid_seq(fid) == FID_SEQ_LLOG_NAME; +} + +static inline int fid_seq_in_fldb(u64 seq) +{ + return fid_seq_is_igif(seq) || fid_seq_is_norm(seq) || + fid_seq_is_root(seq) || fid_seq_is_dot(seq); +} + +#ifdef HAVE_SERVER_SUPPORT +static inline int fid_is_namespace_visible(const struct lu_fid *fid) +{ + const __u64 seq = fid_seq(fid); + + /* Here, we cannot distinguish whether the normal FID is for OST + * object or not. It is caller's duty to check more if needed. 
 */
+	return (!fid_is_last_id(fid) &&
+		(fid_seq_is_norm(seq) || fid_seq_is_igif(seq))) ||
+	       fid_is_root(fid) || fid_seq_is_dot(seq);
+}
+
+static inline void ost_layout_cpu_to_le(struct ost_layout *dst,
+					const struct ost_layout *src)
+{
+	dst->ol_stripe_size = __cpu_to_le32(src->ol_stripe_size);
+	dst->ol_stripe_count = __cpu_to_le32(src->ol_stripe_count);
+	dst->ol_comp_start = __cpu_to_le64(src->ol_comp_start);
+	dst->ol_comp_end = __cpu_to_le64(src->ol_comp_end);
+	dst->ol_comp_id = __cpu_to_le32(src->ol_comp_id);
+}
+
+static inline void ost_layout_le_to_cpu(struct ost_layout *dst,
+					const struct ost_layout *src)
+{
+	dst->ol_stripe_size = __le32_to_cpu(src->ol_stripe_size);
+	dst->ol_stripe_count = __le32_to_cpu(src->ol_stripe_count);
+	dst->ol_comp_start = __le64_to_cpu(src->ol_comp_start);
+	dst->ol_comp_end = __le64_to_cpu(src->ol_comp_end);
+	dst->ol_comp_id = __le32_to_cpu(src->ol_comp_id);
+}
+
+static inline void filter_fid_cpu_to_le(struct filter_fid *dst,
+					const struct filter_fid *src, int size)
+{
+	fid_cpu_to_le(&dst->ff_parent, &src->ff_parent);
+
+	if (size < sizeof(struct filter_fid)) {
+		memset(&dst->ff_layout, 0, sizeof(dst->ff_layout));
+	} else {
+		ost_layout_cpu_to_le(&dst->ff_layout, &src->ff_layout);
+		dst->ff_layout_version = cpu_to_le32(src->ff_layout_version);
+		dst->ff_range = cpu_to_le32(src->ff_range);
+	}
+
+	/* XXX: Add more if filter_fid is enlarged in the future. */
+}
+
+static inline void filter_fid_le_to_cpu(struct filter_fid *dst,
+					const struct filter_fid *src, int size)
+{
+	fid_le_to_cpu(&dst->ff_parent, &src->ff_parent);
+
+	if (size < sizeof(struct filter_fid)) {
+		memset(&dst->ff_layout, 0, sizeof(dst->ff_layout));
+	} else {
+		ost_layout_le_to_cpu(&dst->ff_layout, &src->ff_layout);
+		dst->ff_layout_version = le32_to_cpu(src->ff_layout_version);
+		dst->ff_range = le32_to_cpu(src->ff_range);
+	}
+
+	/* XXX: Add more if filter_fid is enlarged in the future. */
+}
+#endif /* HAVE_SERVER_SUPPORT */
+
+static inline void lu_last_id_fid(struct lu_fid *fid, __u64 seq, __u32 ost_idx)
+{
+	if (fid_seq_is_mdt0(seq)) {
+		fid->f_seq = fid_idif_seq(0, ost_idx);
+	} else {
+		LASSERTF(fid_seq_is_norm(seq) || fid_seq_is_echo(seq) ||
+			 fid_seq_is_idif(seq), "%#llx\n", seq);
+		fid->f_seq = seq;
+	}
+	fid->f_oid = 0;
+	fid->f_ver = 0;
+}
+
+static inline bool fid_is_md_operative(const struct lu_fid *fid)
+{
+	return fid_is_mdt0(fid) || fid_is_igif(fid) ||
+	       fid_is_norm(fid) || fid_is_root(fid);
+}
+
+/* seq client type */
+enum lu_cli_type {
+	LUSTRE_SEQ_METADATA = 1,
+	LUSTRE_SEQ_DATA
+};
+
+enum lu_mgr_type {
+	LUSTRE_SEQ_SERVER,
+	LUSTRE_SEQ_CONTROLLER
+};
+
+struct lu_server_seq;
+
+/* Client sequence manager interface. */
+struct lu_client_seq {
+	/* Sequence-controller export. */
+	struct obd_export *lcs_exp;
+	struct mutex lcs_mutex;
+
+	/*
+	 * Range of sequences allowed for allocation. When lu_client_seq is
+	 * used on clients, this contains the meta-sequence range; on servers
+	 * it contains the super-sequence range.
+	 */
+	struct lu_seq_range lcs_space;
+
+	/* Seq-related debugfs entry */
+	struct dentry *lcs_debugfs_entry;
+
+	/* Holds the last allocated FID in the last obtained seq */
+	struct lu_fid lcs_fid;
+
+	/* LUSTRE_SEQ_METADATA or LUSTRE_SEQ_DATA */
+	enum lu_cli_type lcs_type;
+
+	/*
+	 * Service UUID, passed from the MDT + seq name, to form a unique seq
+	 * name for use with debugfs.
+	 */
+	char lcs_name[LUSTRE_MDT_MAXNAMELEN];
+
+	/*
+	 * Sequence width, that is, how many objects may be allocated in one
+	 * sequence.
Default value for it is LUSTRE_SEQ_MAX_WIDTH. + */ + __u64 lcs_width; + + /* Seq-server for direct talking */ + struct lu_server_seq *lcs_srv; +}; + +/* server sequence manager interface */ +struct lu_server_seq { + /* Available sequences space */ + struct lu_seq_range lss_space; + + /* keeps highwater in lsr_end for seq allocation algorithm */ + struct lu_seq_range lss_lowater_set; + struct lu_seq_range lss_hiwater_set; + + /* + * Device for server side seq manager needs (saving sequences to backing + * store). + */ + struct dt_device *lss_dev; + + /* /seq file object device */ + struct dt_object *lss_obj; + + /* Seq related debugfs */ + struct dentry *lss_debugfs_entry; + + /* LUSTRE_SEQ_SERVER or LUSTRE_SEQ_CONTROLLER */ + enum lu_mgr_type lss_type; + + /* Client interface to request controller */ + struct lu_client_seq *lss_cli; + + /* Mutex for protecting allocation */ + struct mutex lss_mutex; + + /* + * Service uuid, passed from MDT + seq name to form unique seq name to + * use it with debugfs. + */ + char lss_name[LUSTRE_MDT_MAXNAMELEN]; + + /* + * Allocation chunks for super and meta sequences. Default values are + * LUSTRE_SEQ_SUPER_WIDTH and LUSTRE_SEQ_META_WIDTH. + */ + __u64 lss_width; + + /* + * minimum lss_alloc_set size that should be allocated from + * lss_space + */ + __u64 lss_set_width; + + /* sync is needed for update operation */ + __u32 lss_need_sync; + + /** + * Pointer to site object, required to access site fld. + */ + struct seq_server_site *lss_site; +}; + +struct seq_server_site { + struct lu_site *ss_lu; + /** + * mds number of this site. + */ + u32 ss_node_id; + /** + * Fid location database + */ + struct lu_server_fld *ss_server_fld; + struct lu_client_fld *ss_client_fld; + + /** + * Server Seq Manager + */ + struct lu_server_seq *ss_server_seq; + + /** + * Controller Seq Manager + */ + struct lu_server_seq *ss_control_seq; + struct obd_export *ss_control_exp; + + /** + * Client Seq Manager + */ + struct lu_client_seq *ss_client_seq; +}; + +/* Server methods */ + +int seq_server_init(const struct lu_env *env, + struct lu_server_seq *seq, + struct dt_device *dev, + const char *prefix, + enum lu_mgr_type type, + struct seq_server_site *ss); + +void seq_server_fini(struct lu_server_seq *seq, + const struct lu_env *env); + +int seq_server_alloc_super(struct lu_server_seq *seq, + struct lu_seq_range *out, + const struct lu_env *env); + +int seq_server_alloc_meta(struct lu_server_seq *seq, + struct lu_seq_range *out, + const struct lu_env *env); + +int seq_server_set_cli(const struct lu_env *env, + struct lu_server_seq *seq, + struct lu_client_seq *cli); + +int seq_server_check_and_alloc_super(const struct lu_env *env, + struct lu_server_seq *seq); +/* Client methods */ +void seq_client_init(struct lu_client_seq *seq, + struct obd_export *exp, + enum lu_cli_type type, + const char *prefix, + struct lu_server_seq *srv); + +void seq_client_fini(struct lu_client_seq *seq); + +void seq_client_flush(struct lu_client_seq *seq); + +int seq_client_alloc_fid(const struct lu_env *env, struct lu_client_seq *seq, + struct lu_fid *fid); +int seq_client_get_seq(const struct lu_env *env, struct lu_client_seq *seq, + u64 *seqnr); +int seq_site_fini(const struct lu_env *env, struct seq_server_site *ss); +/* Fids common stuff */ +int fid_is_local(const struct lu_env *env, + struct lu_site *site, const struct lu_fid *fid); + +enum lu_cli_type; +int client_fid_init(struct obd_device *obd, struct obd_export *exp, + enum lu_cli_type type); +int client_fid_fini(struct obd_device *obd); 
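+
+/*
+ * Usage sketch (illustrative only; "env" and "seq" stand for a
+ * caller-provided lu_env and a lu_client_seq set up via client_fid_init()
+ * above):
+ *
+ *	struct lu_fid fid;
+ *	int rc;
+ *
+ *	rc = seq_client_alloc_fid(env, seq, &fid);
+ *	if (rc < 0)
+ *		return rc;
+ *	CDEBUG(D_INFO, "allocated "DFID"\n", PFID(&fid));
+ *
+ * Roughly, the client hands out OIDs from its current sequence until
+ * lcs_width FIDs have been consumed, then obtains a fresh sequence from
+ * the sequence server.
+ */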
+
+/* fid locking */
+
+struct ldlm_namespace;
+
+/*
+ * Build (DLM) resource name from FID.
+ *
+ * NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2],
+ * but was moved into name[1] along with the OID to avoid consuming the
+ * remaining name[2,3] fields that need to be used for the quota identifier.
+ */
+static inline void
+fid_build_reg_res_name(const struct lu_fid *fid, struct ldlm_res_id *res)
+{
+	memset(res, 0, sizeof(*res));
+	res->name[LUSTRE_RES_ID_SEQ_OFF] = fid_seq(fid);
+	res->name[LUSTRE_RES_ID_VER_OID_OFF] = fid_ver_oid(fid);
+}
+
+/*
+ * Return true if the resource is for the object identified by FID.
+ */
+static inline int fid_res_name_eq(const struct lu_fid *fid,
+				  const struct ldlm_res_id *res)
+{
+	return res->name[LUSTRE_RES_ID_SEQ_OFF] == fid_seq(fid) &&
+	       res->name[LUSTRE_RES_ID_VER_OID_OFF] == fid_ver_oid(fid);
+}
+
+/*
+ * Extract FID from LDLM resource. Reverse of fid_build_reg_res_name().
+ */
+static inline void
+fid_extract_from_res_name(struct lu_fid *fid, const struct ldlm_res_id *res)
+{
+	fid->f_seq = res->name[LUSTRE_RES_ID_SEQ_OFF];
+	fid->f_oid = (__u32)(res->name[LUSTRE_RES_ID_VER_OID_OFF]);
+	fid->f_ver = (__u32)(res->name[LUSTRE_RES_ID_VER_OID_OFF] >> 32);
+	LASSERT(fid_res_name_eq(fid, res));
+}
+
+/*
+ * Build (DLM) resource identifier from global quota FID and quota ID.
+ */
+static inline void
+fid_build_quota_res_name(const struct lu_fid *glb_fid, union lquota_id *qid,
+			 struct ldlm_res_id *res)
+{
+	fid_build_reg_res_name(glb_fid, res);
+	res->name[LUSTRE_RES_ID_QUOTA_SEQ_OFF] = fid_seq(&qid->qid_fid);
+	res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF] = fid_ver_oid(&qid->qid_fid);
+}
+
+/*
+ * Extract global FID and quota ID from resource name
+ */
+static inline void fid_extract_from_quota_res(struct lu_fid *glb_fid,
+					      union lquota_id *qid,
+					      const struct ldlm_res_id *res)
+{
+	fid_extract_from_res_name(glb_fid, res);
+	qid->qid_fid.f_seq = res->name[LUSTRE_RES_ID_QUOTA_SEQ_OFF];
+	qid->qid_fid.f_oid = (__u32)res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF];
+	qid->qid_fid.f_ver =
+		(__u32)(res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF] >> 32);
+}
+
+static inline void
+fid_build_pdo_res_name(const struct lu_fid *fid, unsigned int hash,
+		       struct ldlm_res_id *res)
+{
+	fid_build_reg_res_name(fid, res);
+	res->name[LUSTRE_RES_ID_HSH_OFF] = hash;
+}
+
+/**
+ * Build the DLM resource name from object id & seq; this will finally be
+ * removed when we replace ost_id with FID in the data stack.
+ *
+ * Currently, the resid from an old client, where res[0] = object_id and
+ * res[1] = object_seq, is just the opposite of the metadata resid, where
+ * res[0] = fid->f_seq and res[1] = fid->f_oid. To unify resid
+ * identification, we will reverse the data resid to keep it the same as
+ * the metadata resid, i.e.
+ *
+ * For a resid from an old client,
+ * res[0] = objid, res[1] = 0; the original order is kept for
+ * compatibility.
+ *
+ * For a new resid,
+ * res will be built from the normal FID directly, i.e. res[0] = f_seq,
+ * res[1] = f_oid + f_ver.
+ */
+static inline void ostid_build_res_name(const struct ost_id *oi,
+					struct ldlm_res_id *name)
+{
+	memset(name, 0, sizeof(*name));
+	if (fid_seq_is_mdt0(ostid_seq(oi))) {
+		name->name[LUSTRE_RES_ID_SEQ_OFF] = ostid_id(oi);
+		name->name[LUSTRE_RES_ID_VER_OID_OFF] = ostid_seq(oi);
+	} else {
+		fid_build_reg_res_name(&oi->oi_fid, name);
+	}
+}
+
+/**
+ * Return true if the resource is for the object identified by this id & group.
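+ *
+ * For example (illustrative values, not from the original text): the FID
+ * [0x200000400:0x1:0x0] uses the new-style resid
+ *
+ *	name[] = { 0x200000400, 0x1, 0, 0 };
+ *
+ * while an OLD MDT0-style object with objid 0x1234 uses
+ *
+ *	name[] = { 0x1234, 0, 0, 0 };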
+ */
+static inline bool ostid_res_name_eq(const struct ost_id *oi,
+				     const struct ldlm_res_id *name)
+{
+	/* Note: this is just a trick to save some effort; the correct way
+	 * would probably be to turn them into FIDs and compare */
+	if (fid_seq_is_mdt0(ostid_seq(oi))) {
+		return name->name[LUSTRE_RES_ID_SEQ_OFF] == ostid_id(oi) &&
+		       name->name[LUSTRE_RES_ID_VER_OID_OFF] == ostid_seq(oi);
+	} else {
+		return name->name[LUSTRE_RES_ID_SEQ_OFF] == ostid_seq(oi) &&
+		       name->name[LUSTRE_RES_ID_VER_OID_OFF] == ostid_id(oi);
+	}
+}
+
+/**
+ * Note: we need to check oi_seq to decide where to set oi_id,
+ * so oi_seq should always be set ahead of oi_id.
+ */
+static inline int ostid_set_id(struct ost_id *oi, __u64 oid)
+{
+	if (fid_seq_is_mdt0(oi->oi.oi_seq)) {
+		if (oid >= IDIF_MAX_OID)
+			return -E2BIG;
+		oi->oi.oi_id = oid;
+	} else if (fid_is_idif(&oi->oi_fid)) {
+		if (oid >= IDIF_MAX_OID)
+			return -E2BIG;
+		oi->oi_fid.f_seq = fid_idif_seq(oid,
+						fid_idif_ost_idx(&oi->oi_fid));
+		oi->oi_fid.f_oid = oid;
+		oi->oi_fid.f_ver = oid >> 48;
+	} else {
+		if (oid >= OBIF_MAX_OID)
+			return -E2BIG;
+		oi->oi_fid.f_oid = oid;
+	}
+	return 0;
+}
+
+/* pack any OST FID into an ostid (id/seq) for the wire/disk */
+static inline int fid_to_ostid(const struct lu_fid *fid, struct ost_id *ostid)
+{
+	int rc = 0;
+
+	if (fid_seq_is_igif(fid->f_seq))
+		return -EBADF;
+
+	if (fid_is_idif(fid)) {
+		ostid_set_seq_mdt0(ostid);
+		rc = ostid_set_id(ostid, fid_idif_id(fid_seq(fid),
+						     fid_oid(fid), fid_ver(fid)));
+	} else {
+		ostid->oi_fid = *fid;
+	}
+
+	return rc;
+}
+
+/* The same as osc_build_res_name() */
+static inline void ost_fid_build_resid(const struct lu_fid *fid,
+				       struct ldlm_res_id *resname)
+{
+	if (fid_is_mdt0(fid) || fid_is_idif(fid)) {
+		struct ost_id oi;
+
+		oi.oi.oi_id = 0; /* gcc 4.7.2 complains otherwise */
+		if (fid_to_ostid(fid, &oi) != 0)
+			return;
+		ostid_build_res_name(&oi, resname);
+	} else {
+		fid_build_reg_res_name(fid, resname);
+	}
+}
+
+static inline void ost_fid_from_resid(struct lu_fid *fid,
+				      const struct ldlm_res_id *name,
+				      int ost_idx)
+{
+	if (fid_seq_is_mdt0(name->name[LUSTRE_RES_ID_VER_OID_OFF])) {
+		/* old resid */
+		struct ost_id oi;
+
+		memset(&oi, 0, sizeof(oi));
+		ostid_set_seq(&oi, name->name[LUSTRE_RES_ID_VER_OID_OFF]);
+		if (ostid_set_id(&oi, name->name[LUSTRE_RES_ID_SEQ_OFF])) {
+			CERROR("Bad %llu to set " DOSTID "\n",
+			       name->name[LUSTRE_RES_ID_SEQ_OFF], POSTID(&oi));
+		}
+		ostid_to_fid(fid, &oi, ost_idx);
+	} else {
+		/* new resid */
+		fid_extract_from_res_name(fid, name);
+	}
+}
+
+/**
+ * Flatten 128-bit FID values into a 64-bit value for use as an inode number.
+ * For non-IGIF FIDs this starts just over 2^32, and continues without
+ * conflict until 2^64, at which point we wrap the high 24 bits of the SEQ
+ * into the range where there may not be many OID values in use, to minimize
+ * the risk of conflict.
+ *
+ * Assuming LUSTRE_SEQ_MAX_WIDTH is less than (1 << 24), which is currently
+ * true, the time between re-used inode numbers is very long: 2^40 SEQ
+ * numbers, or about 2^40 client mounts, if clients create fewer than 2^24
+ * files/mount.
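+ *
+ * Worked example (illustrative values, not from the original text): for
+ * the FID [0x200000400:0x1:0x0] the code below computes
+ *
+ *	ino = (0x200000400ULL << 24)				// 0x0200000400000000
+ *	    + ((0x200000400ULL >> 24) & 0xffffff0000ULL)	// 0
+ *	    + 0x1						// OID
+ *	    = 0x0200000400000001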
+ */ +static inline __u64 fid_flatten(const struct lu_fid *fid) +{ + __u64 ino; + __u64 seq; + + if (fid_is_igif(fid)) { + ino = lu_igif_ino(fid); + return ino; + } + + seq = fid_seq(fid); + + ino = (seq << 24) + ((seq >> 24) & 0xffffff0000ULL) + fid_oid(fid); + + return ino ?: fid_oid(fid); +} + +static inline __u32 fid_hash(const struct lu_fid *f, int bits) +{ + /* all objects with same id and different versions will belong to same + * collisions list. */ + return hash_long(fid_flatten(f), bits); +} + +/** + * map fid to 32 bit value for ino on 32bit systems. */ +static inline __u32 fid_flatten32(const struct lu_fid *fid) +{ + __u32 ino; + __u64 seq; + + if (fid_is_igif(fid)) { + ino = lu_igif_ino(fid); + return ino; + } + + seq = fid_seq(fid) - FID_SEQ_START; + + /* Map the high bits of the OID into higher bits of the inode number so + * that inodes generated at about the same time have a reduced chance + * of collisions. This will give a period of 2^12 = 1024 unique clients + * (from SEQ) and up to min(LUSTRE_SEQ_MAX_WIDTH, 2^20) = 128k objects + * (from OID), or up to 128M inodes without collisions for new files. */ + ino = ((seq & 0x000fffffULL) << 12) + ((seq >> 8) & 0xfffff000) + + (seq >> (64 - (40-8)) & 0xffffff00) + + (fid_oid(fid) & 0xff000fff) + ((fid_oid(fid) & 0x00fff000) << 8); + + return ino ?: fid_oid(fid); +} + +static inline int +lu_fid_diff(const struct lu_fid *fid1, const struct lu_fid *fid2) +{ + LASSERTF(fid_seq(fid1) == fid_seq(fid2), "fid1:"DFID", fid2:"DFID"\n", + PFID(fid1), PFID(fid2)); + + if (fid_is_idif(fid1) && fid_is_idif(fid2)) + return fid_idif_id(fid1->f_seq, fid1->f_oid, fid1->f_ver) - + fid_idif_id(fid2->f_seq, fid2->f_oid, fid2->f_ver); + + return fid_oid(fid1) - fid_oid(fid2); +} + +static inline int fid_set_id(struct lu_fid *fid, u64 oid) +{ + if (unlikely(fid_seq_is_igif(fid->f_seq))) { + CERROR("bad IGIF, "DFID"\n", PFID(fid)); + return -EBADF; + } + + if (fid_is_idif(fid)) { + if (oid >= IDIF_MAX_OID) { + CERROR("Too large OID %#llx to set IDIF "DFID"\n", + (unsigned long long)oid, PFID(fid)); + return -EBADF; + } + fid->f_seq = fid_idif_seq(oid, fid_idif_ost_idx(fid)); + fid->f_oid = oid; + fid->f_ver = oid >> 48; + } else { + if (oid > OBIF_MAX_OID) { + CERROR("Too large OID %#llx to set REG "DFID"\n", + (unsigned long long)oid, PFID(fid)); + return -EBADF; + } + fid->f_oid = oid; + } + return 0; +} + +#define LUSTRE_SEQ_SRV_NAME "seq_srv" +#define LUSTRE_SEQ_CTL_NAME "seq_ctl" + +/* Range common stuff */ +static inline void +range_cpu_to_le(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = cpu_to_le64(src->lsr_start); + dst->lsr_end = cpu_to_le64(src->lsr_end); + dst->lsr_index = cpu_to_le32(src->lsr_index); + dst->lsr_flags = cpu_to_le32(src->lsr_flags); +} + +static inline void +range_le_to_cpu(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = le64_to_cpu(src->lsr_start); + dst->lsr_end = le64_to_cpu(src->lsr_end); + dst->lsr_index = le32_to_cpu(src->lsr_index); + dst->lsr_flags = le32_to_cpu(src->lsr_flags); +} + +static inline void +range_cpu_to_be(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = cpu_to_be64(src->lsr_start); + dst->lsr_end = cpu_to_be64(src->lsr_end); + dst->lsr_index = cpu_to_be32(src->lsr_index); + dst->lsr_flags = cpu_to_be32(src->lsr_flags); +} + +static inline void +range_be_to_cpu(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = be64_to_cpu(src->lsr_start); + dst->lsr_end = 
be64_to_cpu(src->lsr_end); + dst->lsr_index = be32_to_cpu(src->lsr_index); + dst->lsr_flags = be32_to_cpu(src->lsr_flags); +} + +static inline void range_array_cpu_to_le(struct lu_seq_range_array *dst, + const struct lu_seq_range_array *src) +{ + __u32 i; + + for (i = 0; i < src->lsra_count; i++) + range_cpu_to_le(&dst->lsra_lsr[i], &src->lsra_lsr[i]); + + dst->lsra_count = cpu_to_le32(src->lsra_count); +} + +static inline void range_array_le_to_cpu(struct lu_seq_range_array *dst, + const struct lu_seq_range_array *src) +{ + __u32 i; + + dst->lsra_count = le32_to_cpu(src->lsra_count); + for (i = 0; i < dst->lsra_count; i++) + range_le_to_cpu(&dst->lsra_lsr[i], &src->lsra_lsr[i]); +} + +/** @} fid */ + +#endif /* __LUSTRE_FID_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_fld.h b/drivers/staging/lustrefsx/lustre/include/lustre_fld.h new file mode 100644 index 0000000000000..cd9036a32d344 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_fld.h @@ -0,0 +1,200 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef __LINUX_FLD_H +#define __LINUX_FLD_H + +/** \defgroup fld fld + * + * @{ + */ + +#include +#include +#include +#include + +struct lu_env; +struct lu_client_fld; +struct lu_server_fld; +struct lu_fld_hash; +struct fld_cache; +struct thandle; +struct dt_device; +struct dt_object; + +/* + * FLD (Fid Location Database) interface. + */ +enum { + LUSTRE_CLI_FLD_HASH_DHT = 0, + LUSTRE_CLI_FLD_HASH_RRB +}; + +struct lu_fld_target { + struct list_head ft_chain; + struct obd_export *ft_exp; + struct lu_server_fld *ft_srv; + __u64 ft_idx; +}; + +struct lu_server_fld { + /** + * Fld dir debugfs entry. + */ + struct dentry *lsf_debugfs_entry; + + /** + * /fld file object device */ + struct dt_object *lsf_obj; + + /** + * super sequence controller export, needed to forward fld + * lookup request. */ + struct obd_export *lsf_control_exp; + + /** + * Client FLD cache. */ + struct fld_cache *lsf_cache; + + /** + * Protect index modifications */ + struct mutex lsf_lock; + + /** + * Fld service name in form "fld-srv-lustre-MDTXXX" + */ + char lsf_name[LUSTRE_MDT_MAXNAMELEN]; + + int (*lsf_seq_lookup)(const struct lu_env *env, + struct lu_server_fld *fld, u64 seq, + struct lu_seq_range *range); + + /** + * Just reformatted or upgraded, and this flag is being + * used to check whether the local FLDB is needs to be + * synced with global FLDB(in MDT0), and it is only needed + * if the MDT is upgraded from < 2.6 to 2.6, i.e. 
when the + * local FLDB is being invited */ + unsigned int lsf_new:1; + +}; + +struct lu_client_fld { + /** + * Client side debugfs entry. + */ + struct dentry *lcf_debugfs_entry; + + /** + * List of exports client FLD knows about. */ + struct list_head lcf_targets; + + /** + * Current hash to be used to chose an export. */ + struct lu_fld_hash *lcf_hash; + + /** + * Exports count. */ + int lcf_count; + + /** + * Lock protecting exports list and fld_hash. */ + spinlock_t lcf_lock; + + /** + * Client FLD cache. */ + struct fld_cache *lcf_cache; + + /** + * Client fld debugfs entry name. + */ + char lcf_name[LUSTRE_MDT_MAXNAMELEN]; +}; + +/* Server methods */ +int fld_server_init(const struct lu_env *env, struct lu_server_fld *fld, + struct dt_device *dt, const char *prefix, int type); + +void fld_server_fini(const struct lu_env *env, struct lu_server_fld *fld); + +int fld_declare_server_create(const struct lu_env *env, + struct lu_server_fld *fld, + const struct lu_seq_range *range, + struct thandle *th); + +int fld_server_create(const struct lu_env *env, + struct lu_server_fld *fld, + const struct lu_seq_range *add_range, + struct thandle *th); + +int fld_insert_entry(const struct lu_env *env, + struct lu_server_fld *fld, + const struct lu_seq_range *range); + +int fld_server_lookup(const struct lu_env *env, struct lu_server_fld *fld, + u64 seq, struct lu_seq_range *range); + +int fld_local_lookup(const struct lu_env *env, struct lu_server_fld *fld, + u64 seq, struct lu_seq_range *range); + +int fld_update_from_controller(const struct lu_env *env, + struct lu_server_fld *fld); + +/* Client methods */ +int fld_client_init(struct lu_client_fld *fld, + const char *prefix, int hash); + +void fld_client_fini(struct lu_client_fld *fld); + +void fld_client_flush(struct lu_client_fld *fld); + +int fld_client_lookup(struct lu_client_fld *fld, u64 seq, u32 *mds, + __u32 flags, const struct lu_env *env); + +int fld_client_create(struct lu_client_fld *fld, + struct lu_seq_range *range, + const struct lu_env *env); + +int fld_client_delete(struct lu_client_fld *fld, u64 seq, + const struct lu_env *env); + +int fld_client_add_target(struct lu_client_fld *fld, + struct lu_fld_target *tar); + +int fld_client_del_target(struct lu_client_fld *fld, + __u64 idx); + +void fld_client_debugfs_fini(struct lu_client_fld *fld); + +/** @} fld */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_ha.h b/drivers/staging/lustrefsx/lustre/include/lustre_ha.h new file mode 100644 index 0000000000000..282115ef67550 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_ha.h @@ -0,0 +1,59 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. 
+ * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef _LUSTRE_HA_H +#define _LUSTRE_HA_H + +/** \defgroup ha ha + * + * @{ + */ + +struct obd_import; +struct obd_export; +struct obd_device; +struct ptlrpc_request; + + +int ptlrpc_replay(struct obd_import *imp); +int ptlrpc_resend(struct obd_import *imp); +void ptlrpc_free_committed(struct obd_import *imp); +void ptlrpc_wake_delayed(struct obd_import *imp); +int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async); +int ptlrpc_set_import_active(struct obd_import *imp, int active); +void ptlrpc_activate_import(struct obd_import *imp, bool set_state_full); +void ptlrpc_deactivate_import(struct obd_import *imp); +void ptlrpc_invalidate_import(struct obd_import *imp); +void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt); +void ptlrpc_pinger_force(struct obd_import *imp); +/** @} ha */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_handles.h b/drivers/staging/lustrefsx/lustre/include/lustre_handles.h new file mode 100644 index 0000000000000..538f427683cbd --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_handles.h @@ -0,0 +1,75 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef __LUSTRE_HANDLES_H_ +#define __LUSTRE_HANDLES_H_ + +/** \defgroup handles handles + * + * @{ + */ + +#include +#include +#include +#include + +/* These handles are most easily used by having them appear at the very top of + * whatever object that you want to make handles for. ie: + * + * struct ldlm_lock { + * struct portals_handle handle; + * ... + * }; + * + * Now you're able to assign the results of cookie2handle directly to an + * ldlm_lock. If it's not at the top, you'll want to use container_of() + * to compute the start of the structure based on the handle field. 
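+ *
+ * A lookup sketch (editor's illustration only; "ldlm_lock" is a made-up
+ * owner string, which must match the one passed to class_handle_hash()
+ * below when the handle was hashed):
+ *
+ *	struct ldlm_lock *lock = class_handle2object(cookie, "ldlm_lock");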
*/ +struct portals_handle { + struct hlist_node h_link; + __u64 h_cookie; + const char *h_owner; + refcount_t h_ref; + struct rcu_head h_rcu; +}; + +/* handles.c */ + +/* Add a handle to the hash table */ +void class_handle_hash(struct portals_handle *, const char *h_owner); +void class_handle_unhash(struct portals_handle *); +void *class_handle2object(u64 cookie, const char *h_owner); +int class_handle_init(void); +void class_handle_cleanup(void); + +/** @} handles */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_idmap.h b/drivers/staging/lustrefsx/lustre/include/lustre_idmap.h new file mode 100644 index 0000000000000..35f55eb755707 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_idmap.h @@ -0,0 +1,70 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/lustre/include/lustre_idmap.h + * + * MDS data structures. + * See also lustre_idl.h for wire formats of requests. + */ + +#ifndef _LUSTRE_IDMAP_H +#define _LUSTRE_IDMAP_H + +/** \defgroup idmap idmap + * + * @{ + */ + +#include + +#ifdef HAVE_GROUP_INFO_GID + +#define CFS_GROUP_AT(gi, i) ((gi)->gid[(i)]) + +#else /* !HAVE_GROUP_INFO_GID */ + +#define CFS_NGROUPS_PER_BLOCK ((int)(PAGE_SIZE / sizeof(gid_t))) + +#define CFS_GROUP_AT(gi, i) \ + ((gi)->blocks[(i) / CFS_NGROUPS_PER_BLOCK][(i) % CFS_NGROUPS_PER_BLOCK]) + +#endif /* HAVE_GROUP_INFO_GID */ + +#include + +struct lu_ucred; + +extern void lustre_groups_from_list(struct group_info *ginfo, gid_t *glist); +extern void lustre_groups_sort(struct group_info *group_info); +extern int lustre_in_group_p(struct lu_ucred *mu, gid_t grp); + +/** @} idmap */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_import.h b/drivers/staging/lustrefsx/lustre/include/lustre_import.h new file mode 100644 index 0000000000000..7b97c5555c327 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_import.h @@ -0,0 +1,430 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ +/** \defgroup obd_import PtlRPC import definitions + * Imports are client-side representation of remote obd target. + * + * @{ + */ + +#ifndef __IMPORT_H +#define __IMPORT_H + +/** \defgroup export export + * + * @{ + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * Adaptive Timeout stuff + * + * @{ + */ +#define D_ADAPTTO D_OTHER +#define AT_BINS 4 /* "bin" means "N seconds of history" */ +#define AT_FLG_NOHIST 0x1 /* use last reported value only */ + +struct adaptive_timeout { + time64_t at_binstart; /* bin start time */ + unsigned int at_hist[AT_BINS]; /* timeout history bins */ + unsigned int at_flags; + timeout_t at_current_timeout; /* current timeout value */ + timeout_t at_worst_timeout_ever; /* worst-ever timeout delta + * value + */ + time64_t at_worst_timestamp; /* worst-ever timeout + * timestamp + */ + spinlock_t at_lock; +}; + +enum lustre_at_flags { + LATF_SKIP = 0x0, + LATF_STATS = 0x1, +}; + +struct ptlrpc_at_array { + struct list_head *paa_reqs_array; /** array to hold requests */ + __u32 paa_size; /** the size of array */ + __u32 paa_count; /** the total count of reqs */ + time64_t paa_deadline; /** the earliest deadline of reqs */ + __u32 *paa_reqs_count; /** the count of reqs in each entry */ +}; + +#define IMP_AT_MAX_PORTALS 8 +struct imp_at { + int iat_portal[IMP_AT_MAX_PORTALS]; + struct adaptive_timeout iat_net_latency; + struct adaptive_timeout iat_service_estimate[IMP_AT_MAX_PORTALS]; +}; + + +/** @} */ + +/** Possible import states */ +enum lustre_imp_state { + LUSTRE_IMP_CLOSED = 1, + LUSTRE_IMP_NEW = 2, + LUSTRE_IMP_DISCON = 3, + LUSTRE_IMP_CONNECTING = 4, + LUSTRE_IMP_REPLAY = 5, + LUSTRE_IMP_REPLAY_LOCKS = 6, + LUSTRE_IMP_REPLAY_WAIT = 7, + LUSTRE_IMP_RECOVER = 8, + LUSTRE_IMP_FULL = 9, + LUSTRE_IMP_EVICTED = 10, + LUSTRE_IMP_IDLE = 11, + LUSTRE_IMP_LAST +}; + +/** Returns test string representation of numeric import state \a state */ +static inline const char *ptlrpc_import_state_name(enum lustre_imp_state state) +{ + static const char * const import_state_names[] = { + "", "CLOSED", "NEW", "DISCONN", + "CONNECTING", "REPLAY", "REPLAY_LOCKS", "REPLAY_WAIT", + "RECOVER", "FULL", "EVICTED", "IDLE", + }; + + LASSERT(state < LUSTRE_IMP_LAST); + return import_state_names[state]; +} + +/** + * List of import event types + */ +enum obd_import_event { + IMP_EVENT_DISCON = 0x808001, + IMP_EVENT_INACTIVE = 0x808002, + IMP_EVENT_INVALIDATE = 0x808003, + IMP_EVENT_ACTIVE = 0x808004, + IMP_EVENT_OCD = 0x808005, + IMP_EVENT_DEACTIVATE = 0x808006, + IMP_EVENT_ACTIVATE = 0x808007, +}; + +/** + * Definition of import connection structure + */ +struct obd_import_conn { + /** Item for linking connections together */ + struct list_head oic_item; + /** Pointer to actual PortalRPC connection */ + struct ptlrpc_connection *oic_conn; + /** uuid of remote side */ + struct obd_uuid oic_uuid; + /** + * Time (64 bit seconds) of last connection 
attempt on this connection
+	 */
+	time64_t		oic_last_attempt;
+};
+
+/* state history */
+#define IMP_STATE_HIST_LEN 16
+struct import_state_hist {
+	enum lustre_imp_state	ish_state;
+	time64_t		ish_time;
+};
+
+/**
+ * Definition of the PortalRPC import structure.
+ * Imports represent the client-side view of a remote target.
+ */
+struct obd_import {
+	/** Reference counter */
+	refcount_t		imp_refcount;
+	struct lustre_handle	imp_dlm_handle; /* client's ldlm export */
+	/** Currently active connection */
+	struct ptlrpc_connection	*imp_connection;
+	/** PortalRPC client structure for this import */
+	struct ptlrpc_client	*imp_client;
+	/** List element for linking into pinger chain */
+	struct list_head	imp_pinger_chain;
+	/** work struct for destruction of import */
+	struct work_struct	imp_zombie_work;
+
+	/**
+	 * Lists of requests that are retained for replay, waiting for a reply,
+	 * or waiting for recovery to complete, respectively.
+	 * @{
+	 */
+	struct list_head	imp_replay_list;
+	struct list_head	imp_sending_list;
+	struct list_head	imp_delayed_list;
+	/** @} */
+
+	/**
+	 * List of requests that are retained for committed open replay. Once
+	 * open is committed, open replay request will be moved from the
+	 * imp_replay_list into the imp_committed_list.
+	 * The imp_replay_cursor is for accelerating searching during replay.
+	 * @{
+	 */
+	struct list_head	imp_committed_list;
+	struct list_head	*imp_replay_cursor;
+	/** @} */
+
+	/** List of not replied requests */
+	struct list_head	imp_unreplied_list;
+	/** Known maximal replied XID */
+	__u64			imp_known_replied_xid;
+
+	/** obd device for this import */
+	struct obd_device	*imp_obd;
+
+	/**
+	 * some security-related fields
+	 * @{
+	 */
+	struct ptlrpc_sec	*imp_sec;
+	rwlock_t		imp_sec_lock;
+	time64_t		imp_sec_expire;
+	pid_t			imp_sec_refpid;
+	/** @} */
+
+	/** Wait queue for those who need to wait for recovery completion */
+	wait_queue_head_t	imp_recovery_waitq;
+
+	/** Number of requests allocated */
+	atomic_t		imp_reqs;
+	/** Number of requests currently in-flight */
+	atomic_t		imp_inflight;
+	/** Number of requests currently unregistering */
+	atomic_t		imp_unregistering;
+	/** Number of replay requests inflight */
+	atomic_t		imp_replay_inflight;
+	/** In-flight replays rate control */
+	wait_queue_head_t	imp_replay_waitq;
+
+	/** Number of currently happening import invalidations */
+	atomic_t		imp_inval_count;
+	/** Number of request timeouts */
+	atomic_t		imp_timeouts;
+	/** Current import state */
+	enum lustre_imp_state	imp_state;
+	/** Last replay state */
+	enum lustre_imp_state	imp_replay_state;
+	/** History of import states */
+	struct import_state_hist imp_state_hist[IMP_STATE_HIST_LEN];
+	int			imp_state_hist_idx;
+	/** Current import generation. Incremented on every reconnect */
+	int			imp_generation;
+	/** Idle connection initiated at this generation */
+	int			imp_initiated_at;
+	/** Incremented every time we send reconnection request */
+	__u32			imp_conn_cnt;
+	/**
+	 * \see ptlrpc_free_committed remembers imp_generation value here
+	 * after a check to save on unnecessary replay list iterations
+	 */
+	int			imp_last_generation_checked;
+	/** Last transno we replayed */
+	__u64			imp_last_replay_transno;
+	/** Last transno committed on remote side */
+	__u64			imp_peer_committed_transno;
+	/**
+	 * \see ptlrpc_free_committed remembers last_transno since its last
+	 * check here and if last_transno did not change since last run of
+	 * ptlrpc_free_committed and import generation is the same, we can
+	 * skip looking for requests to remove from replay list as optimisation
+	 */
+	__u64			imp_last_transno_checked;
+	/**
+	 * Remote export handle. This is how remote side knows what export
+	 * we are talking to. Filled from response to connect request
+	 */
+	struct lustre_handle	imp_remote_handle;
+	/** When to perform next ping. time in jiffies. */
+	time64_t		imp_next_ping;
+	/** When we last successfully connected. time in 64bit jiffies */
+	time64_t		imp_last_success_conn;
+
+	/** List of all possible connections for this import. */
+	struct list_head	imp_conn_list;
+	/**
+	 * Current connection. \a imp_connection is imp_conn_current->oic_conn
+	 */
+	struct obd_import_conn	*imp_conn_current;
+
+	/** Protects flags, level, generation, conn_cnt, *_list */
+	spinlock_t		imp_lock;
+
+	/**
+	 * A "sentinel" value used to check if there are other threads
+	 * waiting on the imp_lock.
+	 */
+	atomic_t		imp_waiting;
+
+	/* flags */
+	unsigned long		imp_invalid:1,	/* evicted */
+				/* administratively disabled */
+				imp_deactive:1,
+				/* try to recover the import */
+				imp_replayable:1,
+				/* don't run recovery (timeout instead) */
+				imp_dlm_fake:1,
+				/* use 1/2 timeout on MDS' OSCs */
+				imp_server_timeout:1,
+				/* VBR: imp in delayed recovery */
+				imp_delayed_recovery:1,
+				/* recovery by versions failed */
+				imp_vbr_failed:1,
+				/* force an immediate ping */
+				imp_force_verify:1,
+				/* force a scheduled ping */
+				imp_force_next_verify:1,
+				/* pingable */
+				imp_pingable:1,
+				/* resend for replay */
+				imp_resend_replay:1,
+				/* disable normal recovery, for test only. */
+				imp_no_pinger_recover:1,
+				/* import must be reconnected instead of
+				 * choosing a new connection */
+				imp_force_reconnect:1,
+				/* import has tried to connect with server */
+				imp_connect_tried:1,
+				/* connected but not FULL yet */
+				imp_connected:1,
+				/* grant shrink disabled */
+				imp_grant_shrink_disabled:1,
+				/* to suppress LCONSOLE() at conn.restore */
+				imp_was_idle:1;
+	u32			imp_connect_op;
+	u32			imp_idle_timeout;
+	u32			imp_idle_debug;
+	struct obd_connect_data imp_connect_data;
+	__u64			imp_connect_flags_orig;
+	__u64			imp_connect_flags2_orig;
+	int			imp_connect_error;
+
+	enum lustre_msg_magic	imp_msg_magic;
+	/* adjusted based on server capability */
+	enum lustre_msghdr	imp_msghdr_flags;
+
+	/* adaptive timeout data */
+	struct imp_at		imp_at;
+	time64_t		imp_last_reply_time;	/* for health check */
+	__u32			imp_conn_restricted_net;
+};
+
+/* import.c : adaptive timeout handling.
+ *
+ * Lustre tracks how long RPCs take to complete. This information is reported
+ * back to clients who utilize the information to estimate the time needed
+ * for future requests and set appropriate RPC timeouts.
Minimum and maximum + * service times can be configured via the at_min and at_max kernel module + * parameters, respectively. + * + * Since this information is transmitted between nodes the timeouts are in + * seconds not jiffies which can vary from node to node. To avoid confusion + * the timeout is handled in timeout_t (s32) instead of time64_t or + * long (jiffies). + */ +static inline timeout_t at_est2timeout(timeout_t timeout) +{ + /* add an arbitrary minimum: 125% +5 sec */ + return timeout + (timeout >> 2) + 5; +} + +static inline timeout_t at_timeout2est(timeout_t timeout) +{ + /* restore estimate value from timeout: e=4/5(t-5) */ + LASSERT(timeout > 0); + return max((timeout << 2) / 5, 5) - 4; +} + +static inline void at_reset_nolock(struct adaptive_timeout *at, + timeout_t timeout) +{ + at->at_current_timeout = timeout; + at->at_worst_timeout_ever = timeout; + at->at_worst_timestamp = ktime_get_real_seconds(); +} + +static inline void at_reset(struct adaptive_timeout *at, timeout_t timeout) +{ + spin_lock(&at->at_lock); + at_reset_nolock(at, timeout); + spin_unlock(&at->at_lock); +} + +static inline void at_init(struct adaptive_timeout *at, timeout_t timeout, + int flags) +{ + memset(at, 0, sizeof(*at)); + spin_lock_init(&at->at_lock); + at->at_flags = flags; + at_reset(at, timeout); +} + +static inline void at_reinit(struct adaptive_timeout *at, timeout_t timeout, + int flags) +{ + spin_lock(&at->at_lock); + at->at_binstart = 0; + memset(at->at_hist, 0, sizeof(at->at_hist)); + at->at_flags = flags; + at_reset_nolock(at, timeout); + spin_unlock(&at->at_lock); +} + +extern unsigned int at_min; +extern unsigned int at_max; +#define AT_OFF (at_max == 0) + +static inline timeout_t at_get(struct adaptive_timeout *at) +{ + return (at->at_current_timeout > at_min) ? + at->at_current_timeout : at_min; +} + +timeout_t at_measured(struct adaptive_timeout *at, timeout_t timeout); +int import_at_get_index(struct obd_import *imp, int portal); + +/* genops.c */ +struct obd_export; +extern struct obd_import *class_exp2cliimp(struct obd_export *); + +/** @} import */ + +#endif /* __IMPORT_H */ + +/** @} obd_import */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_intent.h b/drivers/staging/lustrefsx/lustre/include/lustre_intent.h new file mode 100644 index 0000000000000..5f3a717c9590b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_intent.h @@ -0,0 +1,67 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef LUSTRE_INTENT_H +#define LUSTRE_INTENT_H + +/* intent IT_XXX are defined in lustre/include/obd.h */ + +struct lookup_intent { + int it_op; + int it_create_mode; + __u64 it_flags; + int it_disposition; + int it_status; + __u64 it_lock_handle; + __u64 it_lock_bits; + int it_lock_mode; + int it_remote_lock_mode; + __u64 it_remote_lock_handle; + struct ptlrpc_request *it_request; + unsigned int it_lock_set:1; +}; + +static inline int it_disposition(const struct lookup_intent *it, int flag) +{ + return it->it_disposition & flag; +} + +static inline void it_set_disposition(struct lookup_intent *it, int flag) +{ + it->it_disposition |= flag; +} + +static inline void it_clear_disposition(struct lookup_intent *it, int flag) +{ + it->it_disposition &= ~flag; +} + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_kernelcomm.h b/drivers/staging/lustrefsx/lustre/include/lustre_kernelcomm.h new file mode 100644 index 0000000000000..4af88af0edf87 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_kernelcomm.h @@ -0,0 +1,57 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2013, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Author: Nathan Rutman + * + * Kernel <-> userspace communication routines. + * The definitions below are used in the kernel and userspace. + */ + +#ifndef __LUSTRE_KERNELCOMM_H__ +#define __LUSTRE_KERNELCOMM_H__ + +/* For declarations shared with userspace */ +#include + +/* prototype for callback function on kuc groups */ +typedef int (*libcfs_kkuc_cb_t)(void *data, void *cb_arg); + +/* Kernel methods */ +void libcfs_kkuc_init(void); +int libcfs_kkuc_msg_put(struct file *fp, void *payload); +int libcfs_kkuc_group_put(const struct obd_uuid *uuid, int group, void *data); +int libcfs_kkuc_group_add(struct file *fp, const struct obd_uuid *uuid, int uid, + int group, void *data, size_t data_len); +int libcfs_kkuc_group_rem(const struct obd_uuid *uuid, int uid, int group); +int libcfs_kkuc_group_foreach(const struct obd_uuid *uuid, int group, + libcfs_kkuc_cb_t cb_func, void *cb_arg); + +#endif /* __LUSTRE_KERNELCOMM_H__ */ + diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_lfsck.h b/drivers/staging/lustrefsx/lustre/include/lustre_lfsck.h new file mode 100644 index 0000000000000..64b0d55921897 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_lfsck.h @@ -0,0 +1,130 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, 2017, Intel Corporation. + */ +/* + * lustre/include/lustre_lfsck.h + * + * Lustre LFSCK exported functions. + * + * Author: Fan, Yong + */ + +#ifndef _LUSTRE_LFSCK_H +# define _LUSTRE_LFSCK_H + +#include +#include +#include +#include + +struct lfsck_start_param { + struct lfsck_start *lsp_start; + __u32 lsp_index; + unsigned int lsp_index_valid:1; +}; + +/* For LE_PAIRS_VERIFY returned status */ +enum lfsck_pv_status { + LPVS_INIT = 0, + LPVS_INCONSISTENT = 1, + LPVS_INCONSISTENT_TOFIX = 2, +}; + +enum lfsck_events_local { + LEL_FID_ACCESSED = 1, + LEL_PAIRS_VERIFY_LOCAL = 2, +}; + +struct lfsck_req_local { + __u32 lrl_event; + __u32 lrl_status; + __u16 lrl_active; + __u16 lrl_padding0; + __u32 lrl_padding1; + struct lu_fid lrl_fid; + struct filter_fid lrl_ff_client; + struct filter_fid lrl_ff_local; +}; + +struct lfsck_layout_dangling_key { + struct lu_fid lldk_fid; + __u32 lldk_comp_id; + __u32 lldk_ea_off; +}; + +typedef int (*lfsck_out_notify)(const struct lu_env *env, void *data, + enum lfsck_events event); + +int lfsck_register_namespace(const struct lu_env *env, struct dt_device *key, + struct ldlm_namespace *ns); +int lfsck_register(const struct lu_env *env, struct dt_device *key, + struct dt_device *next, struct obd_device *obd, + lfsck_out_notify notify, void *notify_data, bool master); +void lfsck_degister(const struct lu_env *env, struct dt_device *key); + +int lfsck_add_target(const struct lu_env *env, struct dt_device *key, + struct dt_device *tgt, struct obd_export *exp, + __u32 index, bool for_ost); +void lfsck_del_target(const struct lu_env *env, struct dt_device *key, + struct dt_device *tgt, __u32 index, bool for_ost); + +int lfsck_start(const struct lu_env *env, struct dt_device *key, + struct lfsck_start_param *lsp); +int lfsck_stop(const struct lu_env *env, struct dt_device *key, + struct lfsck_stop *stop); +int lfsck_in_notify_local(const struct lu_env *env, struct dt_device *key, + struct lfsck_req_local *lrl, struct thandle *th); +int lfsck_in_notify(const struct lu_env *env, struct dt_device *key, + struct lfsck_request *lr); +int lfsck_query(const struct lu_env *env, struct dt_device *key, + struct lfsck_request *req, struct lfsck_reply *rep, + struct lfsck_query *que); + +int lfsck_get_speed(char *buf, struct dt_device *key); +int lfsck_set_speed(struct dt_device *key, __u32 val); +int lfsck_get_windows(char *buf, struct dt_device *key); +int lfsck_set_windows(struct dt_device *key, unsigned int val); + +int lfsck_dump(struct seq_file *m, struct dt_device *key, enum lfsck_type type); + +static inline void lfsck_pack_rfa(struct lfsck_req_local *lrl, + const struct lu_fid *fid, + enum lfsck_events_local event, __u16 com) +{ + memset(lrl, 0, sizeof(*lrl)); + lrl->lrl_fid = *fid; 
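+	/* everything else (lrl_status, the padding and filter_fid fields)
+	 * stays zero from the memset above */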
+ lrl->lrl_event = event; + lrl->lrl_active = com; +} + +static inline bool lovea_slot_is_dummy(const struct lov_ost_data_v1 *obj) +{ + /* zero area does not care about the bytes-order. */ + if (obj->l_ost_oi.oi.oi_id == 0 && obj->l_ost_oi.oi.oi_seq == 0 && + obj->l_ost_idx == 0 && obj->l_ost_gen == 0) + return true; + + return false; +} +#endif /* _LUSTRE_LFSCK_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_lib.h b/drivers/staging/lustrefsx/lustre/include/lustre_lib.h new file mode 100644 index 0000000000000..4d36fcfb3c000 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_lib.h @@ -0,0 +1,99 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/include/lustre_lib.h + * + * Basic Lustre library routines. + */ + +#ifndef _LUSTRE_LIB_H +#define _LUSTRE_LIB_H + +/** \defgroup lib lib + * + * @{ + */ + +#ifdef HAVE_SCHED_HEADERS +#include +#include +#endif + +#include +#include +#include +#include + +/* target.c */ +struct ptlrpc_request; +struct obd_export; +struct lu_target; +#include +#include + +#define LI_POISON 0x5a5a5a5a +#if BITS_PER_LONG > 32 +# define LL_POISON 0x5a5a5a5a5a5a5a5aL +#else +# define LL_POISON 0x5a5a5a5aL +#endif +#define LP_POISON ((void *)LL_POISON) + +#ifdef HAVE_SERVER_SUPPORT +int rev_import_init(struct obd_export *exp); +int target_handle_connect(struct ptlrpc_request *req); +int target_handle_disconnect(struct ptlrpc_request *req); +void target_destroy_export(struct obd_export *exp); +void target_committed_to_req(struct ptlrpc_request *req); +void target_cancel_recovery_timer(struct obd_device *obd); +void target_stop_recovery_thread(struct obd_device *obd); +void target_cleanup_recovery(struct obd_device *obd); +int target_queue_recovery_request(struct ptlrpc_request *req, + struct obd_device *obd); +int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc); +#endif + +int target_pack_pool_reply(struct ptlrpc_request *req); +int do_set_info_async(struct obd_import *imp, + int opcode, int version, + size_t keylen, void *key, + size_t vallen, void *val, + struct ptlrpc_request_set *set); + +void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id); + +#define LL_CDEBUG_PAGE(mask, page, fmt, arg...) 
\
+	CDEBUG(mask, "page %p map %p index %lu flags %lx count %u priv %0lx: " \
+	       fmt, page, page->mapping, page->index, (long)page->flags,	\
+	       page_count(page), page_private(page), ## arg)
+
+/** @} lib */
+
+#endif /* _LUSTRE_LIB_H */
diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_linkea.h b/drivers/staging/lustrefsx/lustre/include/lustre_linkea.h
new file mode 100644
index 0000000000000..f9deb4d28a4df
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/include/lustre_linkea.h
@@ -0,0 +1,98 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2013, 2017, Intel Corporation.
+ * Use is subject to license terms.
+ *
+ * Author: di wang
+ */
+
+/* There are several reasons to restrict the linkEA size:
+ *
+ * 1. Under DNE mode, if the linkEA size is not restricted, too many
+ *    cross-MDT hard links to the same object can cause llog overflow.
+ *
+ * 2. Some backends have a limited EA size. For example, without the
+ *    large EA feature enabled, ldiskfs makes all EAs share a single
+ *    (4K) EA block.
+ *
+ * 3. Too many entries in the linkEA seriously hurt performance, because
+ *    linkEA entries can only be located by scanning them consecutively. */
+#define MAX_LINKEA_SIZE	4096
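+
+/* Usage sketch (editor's illustration only, not part of this patch): walk
+ * all parent entries of an object's linkEA with the helpers declared below,
+ * given a struct lu_buf *buf already holding the link xattr:
+ *
+ *	struct linkea_data ldata = { .ld_buf = buf };
+ *	struct lu_name lname;
+ *	struct lu_fid pfid;
+ *
+ *	if (linkea_init(&ldata) == 0) {
+ *		linkea_first_entry(&ldata);
+ *		while (ldata.ld_lee != NULL) {
+ *			linkea_entry_unpack(ldata.ld_lee, &ldata.ld_reclen,
+ *					    &lname, &pfid);
+ *			(each (lname, pfid) pair names one hard-link parent)
+ *			linkea_next_entry(&ldata);
+ *		}
+ *	}
+ */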
+
+struct linkea_data {
+	/**
+	 * Buffer to keep link EA body.
+	 */
+	struct lu_buf		*ld_buf;
+	/**
+	 * The matched header, entry and its length in the EA
+	 */
+	struct link_ea_header	*ld_leh;
+	struct link_ea_entry	*ld_lee;
+	int			ld_reclen;
+};
+
+int linkea_data_new(struct linkea_data *ldata, struct lu_buf *buf);
+int linkea_init(struct linkea_data *ldata);
+int linkea_init_with_rec(struct linkea_data *ldata);
+void linkea_entry_unpack(const struct link_ea_entry *lee, int *reclen,
+			 struct lu_name *lname, struct lu_fid *pfid);
+int linkea_entry_pack(struct link_ea_entry *lee, const struct lu_name *lname,
+		      const struct lu_fid *pfid);
+bool linkea_will_overflow(struct linkea_data *ldata,
+			  const struct lu_name *lname);
+int linkea_add_buf(struct linkea_data *ldata, const struct lu_name *lname,
+		   const struct lu_fid *pfid, bool err_on_overflow);
+void linkea_del_buf(struct linkea_data *ldata, const struct lu_name *lname,
+		    bool is_encrypted);
+int linkea_links_new(struct linkea_data *ldata, struct lu_buf *buf,
+		     const struct lu_name *cname, const struct lu_fid *pfid);
+int linkea_overflow_shrink(struct linkea_data *ldata);
+int linkea_links_find(struct linkea_data *ldata, const struct lu_name *lname,
+		      const struct lu_fid *pfid);
+
+static inline void linkea_first_entry(struct linkea_data *ldata)
+{
+	LASSERT(ldata != NULL);
+	LASSERT(ldata->ld_leh != NULL);
+
+	if (ldata->ld_leh->leh_reccount == 0)
+		ldata->ld_lee = NULL;
+	else
+		ldata->ld_lee = (struct link_ea_entry *)(ldata->ld_leh + 1);
+}
+
+static inline void linkea_next_entry(struct linkea_data *ldata)
+{
+	LASSERT(ldata != NULL);
+	LASSERT(ldata->ld_leh != NULL);
+
+	if (ldata->ld_lee != NULL) {
+		ldata->ld_lee = (struct link_ea_entry *)((char *)ldata->ld_lee +
+							 ldata->ld_reclen);
+		if ((char *)ldata->ld_lee >= ((char *)ldata->ld_leh +
+					      ldata->ld_leh->leh_len))
+			ldata->ld_lee = NULL;
+	}
+}
diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_lmv.h b/drivers/staging/lustrefsx/lustre/include/lustre_lmv.h
new file mode 100644
index 0000000000000..2f11379003e12
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/include/lustre_lmv.h
@@ -0,0 +1,539 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License version 2 for more details. A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2014, 2016, Intel Corporation.
+ */
+/*
+ * lustre/include/lustre_lmv.h
+ *
+ * Lustre LMV structures and functions.
+ * + * Author: Di Wang + */ + +#ifndef _LUSTRE_LMV_H +#define _LUSTRE_LMV_H +#include + +struct lmv_oinfo { + struct lu_fid lmo_fid; + u32 lmo_mds; + struct inode *lmo_root; +}; + +struct lmv_stripe_md { + __u32 lsm_md_magic; + __u32 lsm_md_stripe_count; + __u32 lsm_md_master_mdt_index; + __u32 lsm_md_hash_type; + __u8 lsm_md_max_inherit; + __u8 lsm_md_max_inherit_rr; + __u32 lsm_md_layout_version; + __u32 lsm_md_migrate_offset; + __u32 lsm_md_migrate_hash; + char lsm_md_pool_name[LOV_MAXPOOLNAME + 1]; + struct lmv_oinfo lsm_md_oinfo[0]; +}; + +static inline bool lmv_dir_striped(const struct lmv_stripe_md *lsm) +{ + return lsm && lsm->lsm_md_magic == LMV_MAGIC; +} + +static inline bool lmv_dir_foreign(const struct lmv_stripe_md *lsm) +{ + return lsm && lsm->lsm_md_magic == LMV_MAGIC_FOREIGN; +} + +static inline bool lmv_dir_layout_changing(const struct lmv_stripe_md *lsm) +{ + return lmv_dir_striped(lsm) && + lmv_hash_is_layout_changing(lsm->lsm_md_hash_type); +} + +static inline bool lmv_dir_bad_hash(const struct lmv_stripe_md *lsm) +{ + if (!lmv_dir_striped(lsm)) + return false; + + if (lsm->lsm_md_hash_type & LMV_HASH_FLAG_BAD_TYPE) + return true; + + return !lmv_is_known_hash_type(lsm->lsm_md_hash_type); +} + +static inline bool +lsm_md_eq(const struct lmv_stripe_md *lsm1, const struct lmv_stripe_md *lsm2) +{ + __u32 idx; + + if (lsm1->lsm_md_magic != lsm2->lsm_md_magic || + lsm1->lsm_md_stripe_count != lsm2->lsm_md_stripe_count || + lsm1->lsm_md_master_mdt_index != + lsm2->lsm_md_master_mdt_index || + lsm1->lsm_md_hash_type != lsm2->lsm_md_hash_type || + lsm1->lsm_md_max_inherit != lsm2->lsm_md_max_inherit || + lsm1->lsm_md_max_inherit_rr != lsm2->lsm_md_max_inherit_rr || + lsm1->lsm_md_layout_version != + lsm2->lsm_md_layout_version || + lsm1->lsm_md_migrate_offset != + lsm2->lsm_md_migrate_offset || + lsm1->lsm_md_migrate_hash != + lsm2->lsm_md_migrate_hash || + strncmp(lsm1->lsm_md_pool_name, lsm2->lsm_md_pool_name, + sizeof(lsm1->lsm_md_pool_name)) != 0) + return false; + + if (lmv_dir_striped(lsm1)) { + for (idx = 0; idx < lsm1->lsm_md_stripe_count; idx++) { + if (!lu_fid_eq(&lsm1->lsm_md_oinfo[idx].lmo_fid, + &lsm2->lsm_md_oinfo[idx].lmo_fid)) + return false; + } + } else if (lsm1->lsm_md_magic == LMV_USER_MAGIC_SPECIFIC) { + for (idx = 0; idx < lsm1->lsm_md_stripe_count; idx++) { + if (lsm1->lsm_md_oinfo[idx].lmo_mds != + lsm2->lsm_md_oinfo[idx].lmo_mds) + return false; + } + } + + return true; +} + +static inline void lsm_md_dump(int mask, const struct lmv_stripe_md *lsm) +{ + bool valid_hash = lmv_dir_bad_hash(lsm); + int i; + + /* If lsm_md_magic == LMV_MAGIC_FOREIGN pool_name may not be a null + * terminated string so only print LOV_MAXPOOLNAME bytes. + */ + CDEBUG(mask, + "magic %#x stripe count %d master mdt %d hash type %s:%#x max-inherit %hhu max-inherit-rr %hhu version %d migrate offset %d migrate hash %#x pool %.*s\n", + lsm->lsm_md_magic, lsm->lsm_md_stripe_count, + lsm->lsm_md_master_mdt_index, + valid_hash ? 
"invalid hash" : + mdt_hash_name[lsm->lsm_md_hash_type & (LMV_HASH_TYPE_MAX - 1)], + lsm->lsm_md_hash_type, lsm->lsm_md_max_inherit, + lsm->lsm_md_max_inherit_rr, lsm->lsm_md_layout_version, + lsm->lsm_md_migrate_offset, lsm->lsm_md_migrate_hash, + LOV_MAXPOOLNAME, lsm->lsm_md_pool_name); + + if (!lmv_dir_striped(lsm)) + return; + + for (i = 0; i < lsm->lsm_md_stripe_count; i++) + CDEBUG(mask, "stripe[%d] "DFID"\n", + i, PFID(&lsm->lsm_md_oinfo[i].lmo_fid)); +} + +union lmv_mds_md; + +void lmv_free_memmd(struct lmv_stripe_md *lsm); + +static inline void lmv1_le_to_cpu(struct lmv_mds_md_v1 *lmv_dst, + const struct lmv_mds_md_v1 *lmv_src) +{ + __u32 i; + + lmv_dst->lmv_magic = le32_to_cpu(lmv_src->lmv_magic); + lmv_dst->lmv_stripe_count = le32_to_cpu(lmv_src->lmv_stripe_count); + lmv_dst->lmv_master_mdt_index = + le32_to_cpu(lmv_src->lmv_master_mdt_index); + lmv_dst->lmv_hash_type = le32_to_cpu(lmv_src->lmv_hash_type); + lmv_dst->lmv_layout_version = le32_to_cpu(lmv_src->lmv_layout_version); + if (lmv_src->lmv_stripe_count > LMV_MAX_STRIPE_COUNT) + return; + for (i = 0; i < lmv_src->lmv_stripe_count; i++) + fid_le_to_cpu(&lmv_dst->lmv_stripe_fids[i], + &lmv_src->lmv_stripe_fids[i]); +} + +static inline void lmv_le_to_cpu(union lmv_mds_md *lmv_dst, + const union lmv_mds_md *lmv_src) +{ + switch (le32_to_cpu(lmv_src->lmv_magic)) { + case LMV_MAGIC_V1: + lmv1_le_to_cpu(&lmv_dst->lmv_md_v1, &lmv_src->lmv_md_v1); + break; + default: + break; + } +} + +/* This hash is only for testing purpose */ +static inline unsigned int +lmv_hash_all_chars(unsigned int count, const char *name, int namelen) +{ + unsigned int c = 0; + const unsigned char *p = (const unsigned char *)name; + + while (--namelen >= 0) + c += p[namelen]; + + c = c % count; + + return c; +} + +static inline unsigned int +lmv_hash_fnv1a(unsigned int count, const char *name, int namelen) +{ + __u64 hash; + + hash = lustre_hash_fnv_1a_64(name, namelen); + + return do_div(hash, count); +} + +/* + * Robert Jenkins' function for mixing 32-bit values + * http://burtleburtle.net/bob/hash/evahash.html + * a, b = random bits, c = input and output + * + * Mixing inputs to generate an evenly distributed hash. + */ +#define crush_hashmix(a, b, c) \ +do { \ + a = a - b; a = a - c; a = a ^ (c >> 13); \ + b = b - c; b = b - a; b = b ^ (a << 8); \ + c = c - a; c = c - b; c = c ^ (b >> 13); \ + a = a - b; a = a - c; a = a ^ (c >> 12); \ + b = b - c; b = b - a; b = b ^ (a << 16); \ + c = c - a; c = c - b; c = c ^ (b >> 5); \ + a = a - b; a = a - c; a = a ^ (c >> 3); \ + b = b - c; b = b - a; b = b ^ (a << 10); \ + c = c - a; c = c - b; c = c ^ (b >> 15); \ +} while (0) + +#define crush_hash_seed 1315423911 + +static inline __u32 crush_hash(__u32 a, __u32 b) +{ + __u32 hash = crush_hash_seed ^ a ^ b; + __u32 x = 231232; + __u32 y = 1232; + + crush_hashmix(a, b, hash); + crush_hashmix(x, a, hash); + crush_hashmix(b, y, hash); + + return hash; +} + +/* refer to https://github.com/ceph/ceph/blob/master/src/crush/hash.c and + * https://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf for details of CRUSH + * algorithm. + */ +static inline unsigned int +lmv_hash_crush(unsigned int count, const char *name, int namelen) +{ + unsigned long long straw; + unsigned long long highest_straw = 0; + unsigned int pg_id; + unsigned int idx = 0; + int i; + + /* put temp and backup file on the same MDT where target is located. + * temporary file naming rule: + * 1. rsync: ..XXXXXX + * 2. 
dstripe: .XXXXXXXX + */ + if (lu_name_is_temp_file(name, namelen, true, 6)) { + name++; + namelen -= 8; + } else if (lu_name_is_temp_file(name, namelen, false, 8)) { + namelen -= 9; + } else if (lu_name_is_backup_file(name, namelen, &i)) { + LASSERT(i < namelen); + namelen -= i; + } + + pg_id = lmv_hash_fnv1a(LMV_CRUSH_PG_COUNT, name, namelen); + + /* distribute PG among all stripes pseudo-randomly, so they are almost + * evenly distributed, and when stripe count changes, only (delta / + * total) sub files need to be moved, herein 'delta' is added or removed + * stripe count, 'total' is total stripe count before change for + * removal, or count after change for addition. + */ + for (i = 0; i < count; i++) { + straw = crush_hash(pg_id, i); + if (straw > highest_straw) { + highest_straw = straw; + idx = i; + } + } + LASSERT(idx < count); + + return idx; +} + +/* directory layout may change in three ways: + * 1. directory migration, in its LMV source stripes are appended after + * target stripes, \a migrate_hash is source hash type, \a migrate_offset is + * target stripe count, + * 2. directory split, \a migrate_hash is hash type before split, + * \a migrate_offset is stripe count before split. + * 3. directory merge, \a migrate_hash is hash type after merge, + * \a migrate_offset is stripe count after merge. + */ +static inline int +__lmv_name_to_stripe_index(__u32 hash_type, __u32 stripe_count, + __u32 migrate_hash, __u32 migrate_offset, + const char *name, int namelen, bool new_layout) +{ + __u32 saved_hash = hash_type; + __u32 saved_count = stripe_count; + int stripe_index = 0; + + LASSERT(namelen > 0); + LASSERT(stripe_count > 0); + + if (lmv_hash_is_splitting(hash_type)) { + if (!new_layout) { + hash_type = migrate_hash; + stripe_count = migrate_offset; + } + } else if (lmv_hash_is_merging(hash_type)) { + if (new_layout) { + hash_type = migrate_hash; + stripe_count = migrate_offset; + } + } else if (lmv_hash_is_migrating(hash_type)) { + if (new_layout) { + stripe_count = migrate_offset; + } else { + hash_type = migrate_hash; + stripe_count -= migrate_offset; + } + } + + if (stripe_count > 1) { + switch (hash_type & LMV_HASH_TYPE_MASK) { + case LMV_HASH_TYPE_ALL_CHARS: + stripe_index = lmv_hash_all_chars(stripe_count, name, + namelen); + break; + case LMV_HASH_TYPE_FNV_1A_64: + stripe_index = lmv_hash_fnv1a(stripe_count, name, + namelen); + break; + case LMV_HASH_TYPE_CRUSH: + stripe_index = lmv_hash_crush(stripe_count, name, + namelen); + break; + default: + return -EBADFD; + } + } + + LASSERT(stripe_index < stripe_count); + + if (!new_layout && lmv_hash_is_migrating(saved_hash)) + stripe_index += migrate_offset; + + LASSERT(stripe_index < saved_count); + + CDEBUG(D_INFO, "name %.*s hash=%#x/%#x idx=%d/%u/%u under %s layout\n", + namelen, name, saved_hash, migrate_hash, stripe_index, + saved_count, migrate_offset, new_layout ? 
"new" : "old"); + + return stripe_index; +} + +static inline int lmv_name_to_stripe_index(struct lmv_mds_md_v1 *lmv, + const char *name, int namelen) +{ + if (lmv->lmv_magic == LMV_MAGIC_V1) + return __lmv_name_to_stripe_index(lmv->lmv_hash_type, + lmv->lmv_stripe_count, + lmv->lmv_migrate_hash, + lmv->lmv_migrate_offset, + name, namelen, true); + + if (lmv->lmv_magic == cpu_to_le32(LMV_MAGIC_V1)) + return __lmv_name_to_stripe_index( + le32_to_cpu(lmv->lmv_hash_type), + le32_to_cpu(lmv->lmv_stripe_count), + le32_to_cpu(lmv->lmv_migrate_hash), + le32_to_cpu(lmv->lmv_migrate_offset), + name, namelen, true); + + return -EINVAL; +} + +static inline int lmv_name_to_stripe_index_old(struct lmv_mds_md_v1 *lmv, + const char *name, int namelen) +{ + if (lmv->lmv_magic == LMV_MAGIC_V1 || + lmv->lmv_magic == LMV_MAGIC_STRIPE) + return __lmv_name_to_stripe_index(lmv->lmv_hash_type, + lmv->lmv_stripe_count, + lmv->lmv_migrate_hash, + lmv->lmv_migrate_offset, + name, namelen, false); + + if (lmv->lmv_magic == cpu_to_le32(LMV_MAGIC_V1) || + lmv->lmv_magic == cpu_to_le32(LMV_MAGIC_STRIPE)) + return __lmv_name_to_stripe_index( + le32_to_cpu(lmv->lmv_hash_type), + le32_to_cpu(lmv->lmv_stripe_count), + le32_to_cpu(lmv->lmv_migrate_hash), + le32_to_cpu(lmv->lmv_migrate_offset), + name, namelen, false); + + return -EINVAL; +} + +static inline bool lmv_user_magic_supported(__u32 lum_magic) +{ + return lum_magic == LMV_USER_MAGIC || + lum_magic == LMV_USER_MAGIC_SPECIFIC || + lum_magic == LMV_MAGIC_FOREIGN; +} + +#define LMV_DEBUG(mask, lmv, msg) \ + CDEBUG(mask, \ + "%s LMV: magic=%#x count=%u index=%u hash=%s:%#x version=%u migrate offset=%u migrate hash=%s:%u.\n",\ + msg, (lmv)->lmv_magic, (lmv)->lmv_stripe_count, \ + (lmv)->lmv_master_mdt_index, \ + mdt_hash_name[(lmv)->lmv_hash_type & (LMV_HASH_TYPE_MAX - 1)],\ + (lmv)->lmv_hash_type, (lmv)->lmv_layout_version, \ + (lmv)->lmv_migrate_offset, \ + mdt_hash_name[(lmv)->lmv_migrate_hash & (LMV_HASH_TYPE_MAX - 1)],\ + (lmv)->lmv_migrate_hash) + +/* master LMV is sane */ +static inline bool lmv_is_sane(const struct lmv_mds_md_v1 *lmv) +{ + if (!lmv) + return false; + + if (le32_to_cpu(lmv->lmv_magic) != LMV_MAGIC_V1) + goto insane; + + if (le32_to_cpu(lmv->lmv_stripe_count) == 0) + goto insane; + + if (!lmv_is_known_hash_type(le32_to_cpu(lmv->lmv_hash_type))) + goto insane; + + return true; +insane: + LMV_DEBUG(D_ERROR, lmv, "insane"); + return false; +} + +/* LMV can be either master or stripe LMV */ +static inline bool lmv_is_sane2(const struct lmv_mds_md_v1 *lmv) +{ + if (!lmv) + return false; + + if (le32_to_cpu(lmv->lmv_magic) != LMV_MAGIC_V1 && + le32_to_cpu(lmv->lmv_magic) != LMV_MAGIC_STRIPE) + goto insane; + + if (le32_to_cpu(lmv->lmv_stripe_count) == 0) + goto insane; + + if (!lmv_is_known_hash_type(le32_to_cpu(lmv->lmv_hash_type))) + goto insane; + + return true; +insane: + LMV_DEBUG(D_ERROR, lmv, "insane"); + return false; +} + +static inline bool lmv_is_splitting(const struct lmv_mds_md_v1 *lmv) +{ + if (!lmv_is_sane2(lmv)) + return false; + + return lmv_hash_is_splitting(cpu_to_le32(lmv->lmv_hash_type)); +} + +static inline bool lmv_is_merging(const struct lmv_mds_md_v1 *lmv) +{ + if (!lmv_is_sane2(lmv)) + return false; + + return lmv_hash_is_merging(cpu_to_le32(lmv->lmv_hash_type)); +} + +static inline bool lmv_is_migrating(const struct lmv_mds_md_v1 *lmv) +{ + if (!lmv_is_sane(lmv)) + return false; + + return lmv_hash_is_migrating(cpu_to_le32(lmv->lmv_hash_type)); +} + +static inline bool lmv_is_restriping(const struct lmv_mds_md_v1 *lmv) +{ + if 
(!lmv_is_sane2(lmv)) + return false; + + return lmv_hash_is_splitting(cpu_to_le32(lmv->lmv_hash_type)) || + lmv_hash_is_merging(cpu_to_le32(lmv->lmv_hash_type)); +} + +static inline bool lmv_is_layout_changing(const struct lmv_mds_md_v1 *lmv) +{ + if (!lmv_is_sane2(lmv)) + return false; + + return lmv_hash_is_splitting(cpu_to_le32(lmv->lmv_hash_type)) || + lmv_hash_is_merging(cpu_to_le32(lmv->lmv_hash_type)) || + lmv_hash_is_migrating(cpu_to_le32(lmv->lmv_hash_type)); +} + +static inline bool lmv_is_fixed(const struct lmv_mds_md_v1 *lmv) +{ + return cpu_to_le32(lmv->lmv_hash_type) & LMV_HASH_FLAG_FIXED; +} + +static inline __u8 lmv_inherit_next(__u8 inherit) +{ + if (inherit == LMV_INHERIT_END || inherit == LMV_INHERIT_NONE) + return LMV_INHERIT_NONE; + + if (inherit == LMV_INHERIT_UNLIMITED || inherit > LMV_INHERIT_MAX) + return inherit; + + return inherit - 1; +} + +static inline __u8 lmv_inherit_rr_next(__u8 inherit_rr) +{ + if (inherit_rr == LMV_INHERIT_RR_NONE || + inherit_rr == LMV_INHERIT_RR_UNLIMITED || + inherit_rr > LMV_INHERIT_RR_MAX) + return inherit_rr; + + return inherit_rr - 1; +} + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_log.h b/drivers/staging/lustrefsx/lustre/include/lustre_log.h new file mode 100644 index 0000000000000..360ba26dd52c8 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_log.h @@ -0,0 +1,572 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/include/lustre_log.h + * + * Generic infrastructure for managing a collection of logs. 
+ * These logs are used for: + * + * - orphan recovery: OST adds record on create + * - mtime/size consistency: the OST adds a record on first write + * - open/unlinked objects: OST adds a record on destroy + * + * - mds unlink log: the MDS adds an entry upon delete + * + * - raid1 replication log between OST's + * - MDS replication logs + */ + +#ifndef _LUSTRE_LOG_H +#define _LUSTRE_LOG_H + +/** \defgroup log log + * + * @{ + */ + +#include +#include +#include +#include + +#define LOG_NAME_LIMIT(logname, name) \ + snprintf(logname, sizeof(logname), "LOGS/%s", name) +#define LLOG_EEMPTY 4711 + +enum llog_open_param { + LLOG_OPEN_EXISTS = 0x0000, + LLOG_OPEN_NEW = 0x0001, +}; + +struct plain_handle_data { + struct list_head phd_entry; + struct llog_handle *phd_cat_handle; + /* cookie of this log in its cat */ + struct llog_cookie phd_cookie; +}; + +struct cat_handle_data { + struct list_head chd_head; + struct llog_handle *chd_current_log;/* currently open log */ + struct llog_handle *chd_next_log; /* llog to be used next */ +}; + +struct llog_handle; + +/* llog.c - general API */ +int llog_init_handle(const struct lu_env *env, struct llog_handle *handle, + int flags, struct obd_uuid *uuid); +int llog_copy_handler(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data); +int llog_process(const struct lu_env *env, struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata); +int llog_process_or_fork(const struct lu_env *env, + struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata, bool fork); +int llog_reverse_process(const struct lu_env *env, + struct llog_handle *loghandle, llog_cb_t cb, + void *data, void *catdata); +int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle, + int index); +int llog_cancel_arr_rec(const struct lu_env *env, struct llog_handle *loghandle, + int num, int *index); +int llog_open(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_handle **lgh, struct llog_logid *logid, + char *name, enum llog_open_param open_param); +int llog_close(const struct lu_env *env, struct llog_handle *cathandle); +int llog_is_empty(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name); +int llog_backup(const struct lu_env *env, struct obd_device *obd, + struct llog_ctxt *ctxt, struct llog_ctxt *bak_ctxt, + char *name, char *backup); +int llog_read_header(const struct lu_env *env, struct llog_handle *handle, + const struct obd_uuid *uuid); +__u64 llog_size(const struct lu_env *env, struct llog_handle *llh); + +/* llog_process flags */ +#define LLOG_FLAG_NODEAMON 0x0001 + +/* llog read mode, LLOG_READ_MODE_RAW will process llog canceled records */ +enum llog_read_mode { + LLOG_READ_MODE_NORMAL = 0x0000, + LLOG_READ_MODE_RAW = 0x0001, +}; + + +/* llog_cat.c - catalog api */ +struct llog_process_data { + /** + * Any useful data needed while processing catalog. This is + * passed later to process callback. + */ + void *lpd_data; + /** + * Catalog process callback function, called for each record + * in catalog. + */ + llog_cb_t lpd_cb; + /** + * Start processing the catalog from startcat/startidx + */ + int lpd_startcat; + int lpd_startidx; +}; + +struct llog_process_cat_data { + /** + * Temporary stored first_idx while scanning log. + */ + int lpcd_first_idx; + /** + * Temporary stored last_idx while scanning log. 
+ */ + int lpcd_last_idx; + /** + * llog read mode + */ + enum llog_read_mode lpcd_read_mode; +}; + +int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle); +int llog_cat_add_rec(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie, + struct thandle *th); +int llog_cat_declare_add_rec(const struct lu_env *env, + struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct thandle *th); +int llog_cat_add(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie); +int llog_cat_cancel_arr_rec(const struct lu_env *env, + struct llog_handle *cathandle, + struct llog_logid *lgl, int count, int *index); +int llog_cat_cancel_records(const struct lu_env *env, + struct llog_handle *cathandle, int count, + struct llog_cookie *cookies); +int llog_cat_process_or_fork(const struct lu_env *env, + struct llog_handle *cat_llh, llog_cb_t cat_cb, + llog_cb_t cb, void *data, int startcat, + int startidx, bool fork); +int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh, + llog_cb_t cb, void *data, int startcat, int startidx); +__u64 llog_cat_size(const struct lu_env *env, struct llog_handle *cat_llh); +__u32 llog_cat_free_space(struct llog_handle *cat_llh); +int llog_cat_reverse_process(const struct lu_env *env, + struct llog_handle *cat_llh, llog_cb_t cb, + void *data); +/* llog_obd.c */ +int llog_setup(const struct lu_env *env, struct obd_device *obd, + struct obd_llog_group *olg, int index, + struct obd_device *disk_obd, const struct llog_operations *op); +int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt); +int llog_cleanup(const struct lu_env *env, struct llog_ctxt *); +int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp, int flags); + +/* llog_ioctl.c */ +struct obd_ioctl_data; +int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd, + struct obd_ioctl_data *data); +int llog_catalog_list(const struct lu_env *env, struct dt_device *d, + int count, struct obd_ioctl_data *data, + const struct lu_fid *fid); + +/* llog_net.c */ +int llog_initiator_connect(struct llog_ctxt *ctxt); + +struct llog_operations { + int (*lop_declare_destroy)(const struct lu_env *env, + struct llog_handle *handle, struct thandle *th); + int (*lop_destroy)(const struct lu_env *env, + struct llog_handle *handle, struct thandle *th); + int (*lop_next_block)(const struct lu_env *env, struct llog_handle *h, + int *curr_idx, int next_idx, __u64 *offset, + void *buf, int len); + int (*lop_prev_block)(const struct lu_env *env, struct llog_handle *h, + int prev_idx, void *buf, int len); + int (*lop_read_header)(const struct lu_env *env, + struct llog_handle *handle); + int (*lop_setup)(const struct lu_env *env, struct obd_device *obd, + struct obd_llog_group *olg, int ctxt_idx, + struct obd_device *disk_obd); + int (*lop_sync)(struct llog_ctxt *ctxt, struct obd_export *exp, + int flags); + int (*lop_cleanup)(const struct lu_env *env, struct llog_ctxt *ctxt); + int (*lop_connect)(struct llog_ctxt *ctxt, struct llog_logid *logid, + struct llog_gen *gen, struct obd_uuid *uuid); + /** + * Any llog file must be opened first using llog_open(). Llog can be + * opened by name, logid or without both, in last case the new logid + * will be generated. 
+ */ + int (*lop_open)(const struct lu_env *env, struct llog_handle *lgh, + struct llog_logid *logid, char *name, + enum llog_open_param); + /** + * Opened llog may not exist and this must be checked where needed using + * the llog_exist() call. + */ + int (*lop_exist)(struct llog_handle *lgh); + /** + * Close llog file and calls llog_free_handle() implicitly. + * Any opened llog must be closed by llog_close() call. + */ + int (*lop_close)(const struct lu_env *env, struct llog_handle *handle); + /** + * Create new llog file. The llog must be opened. + * Must be used only for local llog operations. + */ + int (*lop_declare_create)(const struct lu_env *env, + struct llog_handle *handle, + struct thandle *th); + int (*lop_create)(const struct lu_env *env, struct llog_handle *handle, + struct thandle *th); + /** + * write new record in llog. It appends records usually but can edit + * existing records too. + */ + int (*lop_declare_write_rec)(const struct lu_env *env, + struct llog_handle *lgh, + struct llog_rec_hdr *rec, + int idx, struct thandle *th); + int (*lop_write_rec)(const struct lu_env *env, + struct llog_handle *loghandle, + struct llog_rec_hdr *rec, + struct llog_cookie *cookie, + int idx, struct thandle *th); + /** + * Add new record in llog catalog. Does the same as llog_write_rec() + * but using llog catalog. + */ + int (*lop_declare_add)(const struct lu_env *env, + struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct thandle *th); + int (*lop_add)(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct llog_cookie *cookie, + struct thandle *th); +}; + +/* In-memory descriptor for a log object or log catalog */ +struct llog_handle { + struct rw_semaphore lgh_lock; + struct mutex lgh_hdr_mutex; /* protect lgh_hdr data */ + struct llog_logid lgh_id; /* id of this log */ + struct llog_log_hdr *lgh_hdr; /* may be vmalloc'd */ + size_t lgh_hdr_size; + struct dt_object *lgh_obj; + /* For a Catalog, is the last/newest used index for a plain slot. + * Used in conjunction with llh_cat_idx to handle Catalog wrap-around + * case, after it will have reached LLOG_HDR_BITMAP_SIZE, llh_cat_idx + * will become its upper limit */ + int lgh_last_idx; + struct rw_semaphore lgh_last_sem; + __u64 lgh_cur_offset; /* used for test only */ + struct llog_ctxt *lgh_ctxt; + union { + struct plain_handle_data phd; + struct cat_handle_data chd; + } u; + char *lgh_name; + void *private_data; + const struct llog_operations *lgh_logops; + refcount_t lgh_refcount; + + int lgh_max_size; + bool lgh_destroyed; +}; + +/* llog_osd.c */ +extern const struct llog_operations llog_osd_ops; +extern const struct llog_operations llog_common_cat_ops; +int llog_osd_get_cat_list(const struct lu_env *env, struct dt_device *d, + int idx, int count, struct llog_catid *idarray, + const struct lu_fid *fid); +int llog_osd_put_cat_list(const struct lu_env *env, struct dt_device *d, + int idx, int count, struct llog_catid *idarray, + const struct lu_fid *fid); + +#define LLOG_CTXT_FLAG_UNINITIALIZED 0x00000001 +#define LLOG_CTXT_FLAG_STOP 0x00000002 + +/* Indicate the llog objects under this context are normal FID objects, + * instead of objects with local FID. */ +#define LLOG_CTXT_FLAG_NORMAL_FID 0x00000004 + +struct llog_ctxt { + int loc_idx; /* my index the obd array of ctxt's */ + struct obd_device *loc_obd; /* points back to the containing obd*/ + struct obd_llog_group *loc_olg; /* group containing that ctxt */ + struct obd_export *loc_exp; /* parent "disk" export (e.g. 
MDS) */ + struct obd_import *loc_imp; /* to use in RPC's: can be backward + * pointing import */ + const struct llog_operations *loc_logops; + struct llog_handle *loc_handle; + struct mutex loc_mutex; /* protect loc_imp */ + atomic_t loc_refcount; + long loc_flags; /* flags, see above defines */ + struct dt_object *loc_dir; + struct local_oid_storage *loc_los_nameless; + struct local_oid_storage *loc_los_named; + /* llog chunk size, and llog record size can not be bigger than + * loc_chunk_size */ + __u32 loc_chunk_size; +}; + +#define LLOG_PROC_BREAK 0x0001 +#define LLOG_DEL_RECORD 0x0002 +#define LLOG_DEL_PLAIN 0x0003 + +static inline int llog_obd2ops(struct llog_ctxt *ctxt, + const struct llog_operations **lop) +{ + if (ctxt == NULL) + return -ENOTCONN; + + *lop = ctxt->loc_logops; + if (*lop == NULL) + return -EOPNOTSUPP; + + return 0; +} + +static inline int llog_handle2ops(struct llog_handle *loghandle, + const struct llog_operations **lop) +{ + if (loghandle == NULL || loghandle->lgh_logops == NULL) + return -EINVAL; + + *lop = loghandle->lgh_logops; + return 0; +} + +static inline int llog_data_len(int len) +{ + return cfs_size_round(len); +} + +static inline int llog_get_size(struct llog_handle *loghandle) +{ + if (loghandle && loghandle->lgh_hdr) + return loghandle->lgh_hdr->llh_count; + return 0; +} + +static inline struct llog_ctxt *llog_ctxt_get(struct llog_ctxt *ctxt) +{ + atomic_inc(&ctxt->loc_refcount); + CDEBUG(D_INFO, "GETting ctxt %p : new refcount %d\n", ctxt, + atomic_read(&ctxt->loc_refcount)); + return ctxt; +} + +static inline void llog_ctxt_put(struct llog_ctxt *ctxt) +{ + if (ctxt == NULL) + return; + LASSERT_ATOMIC_GT_LT(&ctxt->loc_refcount, 0, LI_POISON); + CDEBUG(D_INFO, "PUTting ctxt %p : new refcount %d\n", ctxt, + atomic_read(&ctxt->loc_refcount) - 1); + __llog_ctxt_put(NULL, ctxt); +} + +static inline void llog_group_init(struct obd_llog_group *olg) +{ + init_waitqueue_head(&olg->olg_waitq); + spin_lock_init(&olg->olg_lock); +} + +static inline int llog_group_set_ctxt(struct obd_llog_group *olg, + struct llog_ctxt *ctxt, int index) +{ + LASSERT(index >= 0 && index < LLOG_MAX_CTXTS); + + spin_lock(&olg->olg_lock); + if (olg->olg_ctxts[index] != NULL) { + spin_unlock(&olg->olg_lock); + return -EEXIST; + } + olg->olg_ctxts[index] = ctxt; + spin_unlock(&olg->olg_lock); + return 0; +} + +static inline struct llog_ctxt *llog_group_get_ctxt(struct obd_llog_group *olg, + int index) +{ + struct llog_ctxt *ctxt; + + LASSERT(index >= 0 && index < LLOG_MAX_CTXTS); + + spin_lock(&olg->olg_lock); + if (olg->olg_ctxts[index] == NULL) + ctxt = NULL; + else + ctxt = llog_ctxt_get(olg->olg_ctxts[index]); + spin_unlock(&olg->olg_lock); + return ctxt; +} + +static inline void llog_group_clear_ctxt(struct obd_llog_group *olg, int index) +{ + LASSERT(index >= 0 && index < LLOG_MAX_CTXTS); + spin_lock(&olg->olg_lock); + olg->olg_ctxts[index] = NULL; + spin_unlock(&olg->olg_lock); +} + +static inline struct llog_ctxt *llog_get_context(struct obd_device *obd, + int index) +{ + return llog_group_get_ctxt(&obd->obd_olg, index); +} + +static inline int llog_group_ctxt_null(struct obd_llog_group *olg, int index) +{ + return (olg->olg_ctxts[index] == NULL); +} + +static inline int llog_ctxt_null(struct obd_device *obd, int index) +{ + return (llog_group_ctxt_null(&obd->obd_olg, index)); +} + +static inline int llog_next_block(const struct lu_env *env, + struct llog_handle *loghandle, int *cur_idx, + int next_idx, __u64 *cur_offset, void *buf, + int len) +{ + const struct llog_operations 
*lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_next_block == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_next_block(env, loghandle, cur_idx, next_idx, + cur_offset, buf, len); + RETURN(rc); +} + +static inline int llog_prev_block(const struct lu_env *env, + struct llog_handle *loghandle, + int prev_idx, void *buf, int len) +{ + const struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_prev_block == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_prev_block(env, loghandle, prev_idx, buf, len); + RETURN(rc); +} + +static inline int llog_connect(struct llog_ctxt *ctxt, + struct llog_logid *logid, struct llog_gen *gen, + struct obd_uuid *uuid) +{ + const struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_obd2ops(ctxt, &lop); + if (rc) + RETURN(rc); + if (lop->lop_connect == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_connect(ctxt, logid, gen, uuid); + RETURN(rc); +} + +static inline int llog_is_full(struct llog_handle *llh) +{ + return llh->lgh_last_idx >= LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr) - 1; +} + +struct llog_cfg_rec { + struct llog_rec_hdr lcr_hdr; + struct lustre_cfg lcr_cfg; + struct llog_rec_tail lcr_tail; +}; + +struct llog_cfg_rec *lustre_cfg_rec_new(int cmd, struct lustre_cfg_bufs *bufs); +void lustre_cfg_rec_free(struct llog_cfg_rec *lcr); + +enum { + LLOG_NEXT_IDX = -1, + LLOG_HEADER_IDX = 0, +}; + +/* llog.c */ +int llog_exist(struct llog_handle *loghandle); +int llog_declare_create(const struct lu_env *env, + struct llog_handle *loghandle, struct thandle *th); +int llog_create(const struct lu_env *env, struct llog_handle *handle, + struct thandle *th); +int llog_trans_destroy(const struct lu_env *env, struct llog_handle *handle, + struct thandle *th); +int llog_destroy(const struct lu_env *env, struct llog_handle *handle); + +int llog_declare_write_rec(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, int idx, + struct thandle *th); +int llog_write_rec(const struct lu_env *env, struct llog_handle *handle, + struct llog_rec_hdr *rec, struct llog_cookie *logcookies, + int idx, struct thandle *th); +int llog_add(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct llog_cookie *logcookies, + struct thandle *th); +int llog_declare_add(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct thandle *th); +int lustre_process_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg); +int lustre_end_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg); +int llog_open_create(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_handle **res, struct llog_logid *logid, + char *name); +int llog_erase(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_logid *logid, char *name); +int llog_write(const struct lu_env *env, struct llog_handle *loghandle, + struct llog_rec_hdr *rec, int idx); + +/** @} log */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_mdc.h b/drivers/staging/lustrefsx/lustre/include/lustre_mdc.h new file mode 100644 index 0000000000000..0f3d7592fc154 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_mdc.h @@ -0,0 +1,126 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/include/lustre_mdc.h + * + * MDS data structures. + * See also lustre_idl.h for wire formats of requests. + */ + +#ifndef _LUSTRE_MDC_H +#define _LUSTRE_MDC_H + +/** \defgroup mdc mdc + * + * @{ + */ + +#include +#include +#ifdef CONFIG_LUSTRE_FS_POSIX_ACL +# include +#endif /* CONFIG_LUSTRE_FS_POSIX_ACL */ +#include +#include +#include +#include +#include +#include +#include +#include + +struct ptlrpc_client; +struct obd_export; +struct ptlrpc_request; +struct obd_device; + +/** + * Update the maximum possible easize. + * + * This value is learned from ptlrpc replies sent by the MDT. The + * default easize is initialized to the minimum value but allowed to + * grow up to a single page in size if required to handle the common + * case. + * + * \see client_obd::cl_default_mds_easize + * + * \param[in] exp export for MDC device + * \param[in] body body of ptlrpc reply from MDT + * + */ +static inline void mdc_update_max_ea_from_body(struct obd_export *exp, + struct mdt_body *body) +{ + if (body->mbo_valid & OBD_MD_FLMODEASIZE) { + struct client_obd *cli = &exp->exp_obd->u.cli; + __u32 def_easize; + + if (cli->cl_max_mds_easize < body->mbo_max_mdsize) + cli->cl_max_mds_easize = body->mbo_max_mdsize; + + def_easize = min_t(__u32, body->mbo_max_mdsize, + OBD_MAX_DEFAULT_EA_SIZE); + cli->cl_default_mds_easize = def_easize; + } +} + + +/* mdc/mdc_locks.c */ +int it_open_error(int phase, struct lookup_intent *it); + +static inline bool cl_is_lov_delay_create(unsigned int flags) +{ + return (flags & O_LOV_DELAY_CREATE_1_8) != 0 || + (flags & O_LOV_DELAY_CREATE_MASK) == O_LOV_DELAY_CREATE_MASK; +} + +static inline void cl_lov_delay_create_clear(unsigned int *flags) +{ + if ((*flags & O_LOV_DELAY_CREATE_1_8) != 0) + *flags &= ~O_LOV_DELAY_CREATE_1_8; + if ((*flags & O_LOV_DELAY_CREATE_MASK) == O_LOV_DELAY_CREATE_MASK) + *flags &= ~O_LOV_DELAY_CREATE_MASK; +} + +static inline bool cl_is_lu_noimport(unsigned int flags) +{ + return (flags & O_LU_NOIMPORT_MASK) == O_LU_NOIMPORT_MASK; +} + +static inline void cl_lu_noimport_clear(unsigned int *flags) +{ + if (cl_is_lu_noimport(*flags)) + *flags &= ~O_LU_NOIMPORT_MASK; +} + +/** @} mdc */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_mds.h b/drivers/staging/lustrefsx/lustre/include/lustre_mds.h new file mode 100644 index 0000000000000..8c3c010c8c49a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_mds.h @@ -0,0 +1,84 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/include/lustre_mds.h + * + * MDS data structures. + * See also lustre_idl.h for wire formats of requests. + */ + +#ifndef _LUSTRE_MDS_H +#define _LUSTRE_MDS_H + +/** \defgroup mds mds + * + * @{ + */ + +#include +#include +#include +#include +#include +#include + +struct md_rejig_data { + struct md_object *mrd_obj; + __u16 mrd_mirror_id; +}; + +#define MDD_OBD_NAME "mdd_obd" +#define MDD_OBD_UUID "mdd_obd_uuid" + +static inline int md_should_create(u64 open_flags) +{ + return !(open_flags & MDS_OPEN_DELAY_CREATE) && + (open_flags & MDS_FMODE_WRITE) && + !(open_flags & MDS_OPEN_LEASE); +} + +/* do NOT or the MAY_*'s, you'll get the weakest */ +static inline int mds_accmode(u64 open_flags) +{ + unsigned int may_mask = 0; + + if (open_flags & MDS_FMODE_READ) + may_mask |= MAY_READ; + if (open_flags & (MDS_FMODE_WRITE | MDS_OPEN_TRUNC | MDS_OPEN_APPEND)) + may_mask |= MAY_WRITE; + if (open_flags & MDS_FMODE_EXEC) + may_mask = MAY_EXEC; + + return may_mask; +} + +/** @} mds */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_net.h b/drivers/staging/lustrefsx/lustre/include/lustre_net.h new file mode 100644 index 0000000000000..7fde30cfe18b3 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_net.h @@ -0,0 +1,2673 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ +/** \defgroup PtlRPC Portal RPC and networking module. 
+ * + * PortalRPC is the layer used by rest of lustre code to achieve network + * communications: establish connections with corresponding export and import + * states, listen for a service, send and receive RPCs. + * PortalRPC also includes base recovery framework: packet resending and + * replaying, reconnections, pinger. + * + * PortalRPC utilizes LNet as its transport layer. + * + * @{ + */ + + +#ifndef _LUSTRE_NET_H +#define _LUSTRE_NET_H + +/** \defgroup net net + * + * @{ + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* MD flags we _always_ use */ +#define PTLRPC_MD_OPTIONS 0 + +/** + * log2 max # of bulk operations in one request: 2=4MB/RPC, 5=32MB/RPC, ... + * In order for the client and server to properly negotiate the maximum + * possible transfer size, PTLRPC_BULK_OPS_COUNT must be a power-of-two + * value. The client is free to limit the actual RPC size for any bulk + * transfer via cl_max_pages_per_rpc to some non-power-of-two value. + * NOTE: This is limited to 16 (=64GB RPCs) by IOOBJ_MAX_BRW_BITS. */ +#define PTLRPC_BULK_OPS_BITS 6 +#if PTLRPC_BULK_OPS_BITS > 16 +#error "More than 65536 BRW RPCs not allowed by IOOBJ_MAX_BRW_BITS." +#endif +#define PTLRPC_BULK_OPS_COUNT (1U << PTLRPC_BULK_OPS_BITS) +/** + * PTLRPC_BULK_OPS_MASK is for the convenience of the client only, and + * should not be used on the server at all. Otherwise, it imposes a + * protocol limitation on the maximum RPC size that can be used by any + * RPC sent to that server in the future. Instead, the server should + * use the negotiated per-client ocd_brw_size to determine the bulk + * RPC count. */ +#define PTLRPC_BULK_OPS_MASK (~((__u64)PTLRPC_BULK_OPS_COUNT - 1)) + +/** + * Define maxima for bulk I/O. + * + * A single PTLRPC BRW request is sent via up to PTLRPC_BULK_OPS_COUNT + * of LNET_MTU sized RDMA transfers. Clients and servers negotiate the + * currently supported maximum between peers at connect via ocd_brw_size. + */ +#define PTLRPC_MAX_BRW_BITS (LNET_MTU_BITS + PTLRPC_BULK_OPS_BITS) +#define PTLRPC_MAX_BRW_SIZE (1U << PTLRPC_MAX_BRW_BITS) +#define PTLRPC_MAX_BRW_PAGES (PTLRPC_MAX_BRW_SIZE >> PAGE_SHIFT) + +#define ONE_MB_BRW_SIZE (1U << LNET_MTU_BITS) +#define MD_MAX_BRW_SIZE (1U << LNET_MTU_BITS) +#define MD_MAX_BRW_PAGES (MD_MAX_BRW_SIZE >> PAGE_SHIFT) +#define DT_MAX_BRW_SIZE PTLRPC_MAX_BRW_SIZE +#define DT_DEF_BRW_SIZE (4 * ONE_MB_BRW_SIZE) +#define DT_MAX_BRW_PAGES (DT_MAX_BRW_SIZE >> PAGE_SHIFT) +#define OFD_MAX_BRW_SIZE (1U << LNET_MTU_BITS) + +/* When PAGE_SIZE is a constant, we can check our arithmetic here with cpp! */ +#if ((PTLRPC_MAX_BRW_PAGES & (PTLRPC_MAX_BRW_PAGES - 1)) != 0) +# error "PTLRPC_MAX_BRW_PAGES isn't a power of two" +#endif +#if (PTLRPC_MAX_BRW_SIZE != (PTLRPC_MAX_BRW_PAGES * PAGE_SIZE)) +# error "PTLRPC_MAX_BRW_SIZE isn't PTLRPC_MAX_BRW_PAGES * PAGE_SIZE" +#endif +#if (PTLRPC_MAX_BRW_SIZE > LNET_MTU * PTLRPC_BULK_OPS_COUNT) +# error "PTLRPC_MAX_BRW_SIZE too big" +#endif +#if (PTLRPC_MAX_BRW_PAGES > LNET_MAX_IOV * PTLRPC_BULK_OPS_COUNT) +# error "PTLRPC_MAX_BRW_PAGES too big" +#endif + +#define PTLRPC_NTHRS_INIT 2 + +/** + * Buffer Constants + * + * Constants determine how memory is used to buffer incoming service requests. 
+ *
+ * ?_NBUFS            # buffers to allocate when growing the pool
+ * ?_BUFSIZE          # bytes in a single request buffer
+ * ?_MAXREQSIZE       # maximum request service will receive
+ *
+ * When fewer than ?_NBUFS/2 buffers are posted for receive, another chunk
+ * of ?_NBUFS is added to the pool.
+ *
+ * Messages larger than ?_MAXREQSIZE are dropped.  Request buffers are
+ * considered full when less than ?_MAXREQSIZE is left in them.
+ */
+/**
+ * Thread Constants
+ *
+ * Constants determine how threads are created for ptlrpc service.
+ *
+ * ?_NTHRS_INIT		# threads to create for each service partition on
+ *			  initializing. If it's non-affinity service and
+ *			  there is only one partition, it's the overall #
+ *			  threads for the service while initializing.
+ * ?_NTHRS_BASE		# threads should be created at least for each
+ *			  ptlrpc partition to keep the service healthy.
+ *			  It's the low-water mark of threads upper-limit
+ *			  for each partition.
+ * ?_THR_FACTOR		# threads can be added on threads upper-limit for
+ *			  each CPU core. This factor is only for reference,
+ *			  we might decrease value of factor if number of cores
+ *			  per CPT is above a limit.
+ * ?_NTHRS_MAX		# overall threads can be created for a service,
+ *			  it's a soft limit because if service is running
+ *			  on machine with hundreds of cores and tens of
+ *			  CPU partitions, we need to guarantee each partition
+ *			  has ?_NTHRS_BASE threads, which means total threads
+ *			  will be ?_NTHRS_BASE * number_of_cpts which can
+ *			  exceed ?_NTHRS_MAX.
+ *
+ * Examples
+ *
+ * #define MDS_NTHRS_INIT	2
+ * #define MDS_NTHRS_BASE	64
+ * #define MDS_NTHRS_FACTOR	8
+ * #define MDS_NTHRS_MAX	1024
+ *
+ * Example 1):
+ * ---------------------------------------------------------------------
+ * Server(A) has 16 cores, user configured it to 4 partitions so each
+ * partition has 4 cores, then actual number of service threads on each
+ * partition is:
+ *     MDS_NTHRS_BASE(64) + cores(4) * MDS_NTHRS_FACTOR(8) = 96
+ *
+ * Total number of threads for the service is:
+ *     96 * partitions(4) = 384
+ *
+ * Example 2):
+ * ---------------------------------------------------------------------
+ * Server(B) has 32 cores, user configured it to 4 partitions so each
+ * partition has 8 cores, then actual number of service threads on each
+ * partition is:
+ *     MDS_NTHRS_BASE(64) + cores(8) * MDS_NTHRS_FACTOR(8) = 128
+ *
+ * Total number of threads for the service is:
+ *     128 * partitions(4) = 512
+ *
+ * Example 3):
+ * ---------------------------------------------------------------------
+ * Server(B) has 96 cores, user configured it to 8 partitions so each
+ * partition has 12 cores, then actual number of service threads on each
+ * partition is:
+ *     MDS_NTHRS_BASE(64) + cores(12) * MDS_NTHRS_FACTOR(8) = 160
+ *
+ * Total number of threads for the service is:
+ *     160 * partitions(8) = 1280
+ *
+ * However, it's above the soft limit MDS_NTHRS_MAX, so we choose this number
+ * as upper limit of threads number for each partition:
+ *     MDS_NTHRS_MAX(1024) / partitions(8) = 128
+ *
+ * Example 4):
+ * ---------------------------------------------------------------------
+ * Server(C) has a thousand cores and the user configured it to 32 partitions
+ *     MDS_NTHRS_BASE(64) * 32 = 2048
+ *
+ * which is already above soft limit MDS_NTHRS_MAX(1024), but we still need
+ * to guarantee that each partition has at least MDS_NTHRS_BASE(64) threads
+ * to keep service healthy, so total number of threads will just be 2048.
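+ *
+ * All four examples follow from one sizing rule. As a rough sketch only
+ * (NTHRS_BASE/THR_FACTOR/NTHRS_MAX stand in for the per-service ?_NTHRS_*
+ * constants above; per the NB below, the authoritative and more complex
+ * logic lives in ptlrpc_server_nthreads_check()):
+ *
+ *	nthrs = NTHRS_BASE + cores_per_partition * THR_FACTOR;
+ *	nthrs = min(nthrs, NTHRS_MAX / nr_partitions);
+ *	nthrs = max(nthrs, NTHRS_BASE);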
+ *
+ * NB: we don't suggest choosing a server with that many cores because the
+ * backend filesystem itself, buffer cache, or underlying network stack might
+ * have some SMP scalability issues at that large scale.
+ *
+ * If the user already has a fat machine with hundreds or thousands of cores,
+ * there are two choices for configuration:
+ * a) create CPU table from subset of all CPUs and run Lustre on
+ *    top of this subset
+ * b) bind service threads on a few partitions, see module parameters of
+ *    MDS and OSS for details
+ *
+ * NB: these calculations (and examples below) are simplified to help
+ * understanding, the real implementation is a little more complex,
+ * please see ptlrpc_server_nthreads_check() for details.
+ *
+ */
+
+ /*
+  * LDLM threads constants:
+  *
+  * Given 8 as factor and 24 as base threads number
+  *
+  * example 1)
+  * On 4-core machine we will have 24 + 8 * 4 = 56 threads.
+  *
+  * example 2)
+  * On 8-core machine with 2 partitions we will have 24 + 4 * 8 = 56
+  * threads for each partition and total threads number will be 112.
+  *
+  * example 3)
+  * On 64-core machine with 8 partitions we will need LDLM_NTHRS_BASE(24)
+  * threads for each partition to keep service healthy, so total threads
+  * number should be 24 * 8 = 192.
+  *
+  * So with these constants, threads number will be at a similar level to
+  * old versions, unless the target machine has over a hundred cores
+  */
+#define LDLM_THR_FACTOR		8
+#define LDLM_NTHRS_INIT		PTLRPC_NTHRS_INIT
+#define LDLM_NTHRS_BASE		24
+#define LDLM_NTHRS_MAX		(num_online_cpus() == 1 ? 64 : 128)
+
+#define LDLM_BL_THREADS		LDLM_NTHRS_AUTO_INIT
+#define LDLM_CLIENT_NBUFS	1
+#define LDLM_SERVER_NBUFS	64
+#define LDLM_BUFSIZE		(8 * 1024)
+#define LDLM_MAXREQSIZE		(5 * 1024)
+#define LDLM_MAXREPSIZE		(1024)
+
+ /*
+  * MDS threads constants:
+  *
+  * Please see examples in "Thread Constants", MDS threads number will be at
+  * the comparable level of old versions, unless the server has many cores.
+  */
+#ifndef MDS_MAX_THREADS
+#define MDS_MAX_THREADS		1024
+#define MDS_MAX_OTHR_THREADS	256
+
+#else /* MDS_MAX_THREADS */
+#if MDS_MAX_THREADS < PTLRPC_NTHRS_INIT
+#undef MDS_MAX_THREADS
+#define MDS_MAX_THREADS	PTLRPC_NTHRS_INIT
+#endif
+#define MDS_MAX_OTHR_THREADS	max(PTLRPC_NTHRS_INIT, MDS_MAX_THREADS / 2)
+#endif
+
+/* default service */
+#define MDS_THR_FACTOR		8
+#define MDS_NTHRS_INIT		PTLRPC_NTHRS_INIT
+#define MDS_NTHRS_MAX		MDS_MAX_THREADS
+#define MDS_NTHRS_BASE		min(64, MDS_NTHRS_MAX)
+
+/* read-page service */
+#define MDS_RDPG_THR_FACTOR	4
+#define MDS_RDPG_NTHRS_INIT	PTLRPC_NTHRS_INIT
+#define MDS_RDPG_NTHRS_MAX	MDS_MAX_OTHR_THREADS
+#define MDS_RDPG_NTHRS_BASE	min(48, MDS_RDPG_NTHRS_MAX)
+
+/* these should be removed when we remove setattr service in the future */
+#define MDS_SETA_THR_FACTOR	4
+#define MDS_SETA_NTHRS_INIT	PTLRPC_NTHRS_INIT
+#define MDS_SETA_NTHRS_MAX	MDS_MAX_OTHR_THREADS
+#define MDS_SETA_NTHRS_BASE	min(48, MDS_SETA_NTHRS_MAX)
+
+/* non-affinity threads */
+#define MDS_OTHR_NTHRS_INIT	PTLRPC_NTHRS_INIT
+#define MDS_OTHR_NTHRS_MAX	MDS_MAX_OTHR_THREADS
+
+#define MDS_NBUFS		64
+
+/**
+ * Assume file name length = FNAME_MAX = 256 (true for ext3).
+ * path name length = PATH_MAX = 4096
+ * LOV MD size max  = EA_MAX = 24 * 2000
+ *	  (NB: 24 is size of lov_ost_data)
+ * LOV LOGCOOKIE size max = 32 * 2000
+ *	  (NB: 32 is size of llog_cookie)
+ * symlink:  FNAME_MAX + PATH_MAX  <- largest
+ * link:     FNAME_MAX + PATH_MAX  (mds_rec_link < mds_rec_create)
+ * rename:   FNAME_MAX + FNAME_MAX
+ * open:     FNAME_MAX + EA_MAX
+ *
+ * MDS_MAXREQSIZE ~= 4736 bytes =
+ * lustre_msg + ldlm_request + mdt_body + mds_rec_create + FNAME_MAX + PATH_MAX
+ * MDS_MAXREPSIZE ~= 8300 bytes = lustre_msg + llog_header
+ *
+ * Realistic size is about 512 bytes (20 character name + 128 char symlink),
+ * except in the open case where there are a large number of OSTs in a LOV.
+ */
+#define MDS_MAXREQSIZE		(5 * 1024)	/* >= 4736 */
+#define MDS_MAXREPSIZE		(9 * 1024)	/* >= 8300 */
+
+/**
+ * MDS incoming request with LOV EA
+ * 24 = sizeof(struct lov_ost_data), i.e: replay of opencreate
+ */
+#define MDS_LOV_MAXREQSIZE	max(MDS_MAXREQSIZE, \
+				    362 + LOV_MAX_STRIPE_COUNT * 24)
+/**
+ * MDS outgoing reply with LOV EA
+ *
+ * NB: max reply size Lustre 2.4+ client can get from old MDS is:
+ * LOV_MAX_STRIPE_COUNT * (llog_cookie + lov_ost_data) + extra bytes
+ *
+ * but 2.4 or later MDS will never send reply with llog_cookie to any
+ * version client. This macro is defined for server side reply buffer size.
+ */
+#define MDS_LOV_MAXREPSIZE	MDS_LOV_MAXREQSIZE
+
+/**
+ * This is the size of a maximum REINT_SETXATTR request:
+ *
+ *   lustre_msg		 56 (32 + 4 x 5 + 4)
+ *   ptlrpc_body	184
+ *   mdt_rec_setxattr	136
+ *   lustre_capa	120
+ *   name		256 (XATTR_NAME_MAX)
+ *   value	      65536 (XATTR_SIZE_MAX)
+ */
+#define MDS_EA_MAXREQSIZE	66288
+
+/**
+ * These are the maximum request and reply sizes (rounded up to 1 KB
+ * boundaries) for the "regular" MDS_REQUEST_PORTAL and MDS_REPLY_PORTAL.
+ */
+#define MDS_REG_MAXREQSIZE	(((max(MDS_EA_MAXREQSIZE, \
+				       MDS_LOV_MAXREQSIZE) + 1023) >> 10) << 10)
+#define MDS_REG_MAXREPSIZE	MDS_REG_MAXREQSIZE
+
+/**
+ * The update request includes all of the updates from the create, which might
+ * include linkea (4K maximum), together with other updates; we set it to
+ * 1000K: lustre_msg + ptlrpc_body + OUT_UPDATE_BUFFER_SIZE_MAX
+ */
+#define OUT_MAXREQSIZE	(1000 * 1024)
+#define OUT_MAXREPSIZE	MDS_MAXREPSIZE
+
+/** MDS_BUFSIZE = max_reqsize (w/o LOV EA) + max sptlrpc payload size */
+#define MDS_BUFSIZE	max(MDS_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \
+			    8 * 1024)
+
+/**
+ * MDS_REG_BUFSIZE should at least be MDS_REG_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD.
+ * However, we need to allocate a much larger buffer for it because LNet
+ * requires each MD(rqbd) has at least MDS_REG_MAXREQSIZE bytes left to avoid
+ * dropping of maximum-sized incoming request. So if MDS_REG_BUFSIZE is only a
+ * little larger than MDS_REG_MAXREQSIZE, then it can only fit in one request
+ * even if there are about MDS_REG_MAXREQSIZE bytes left in a rqbd, and memory
+ * utilization is very low.
+ *
+ * Meanwhile, the size of a rqbd can't be too large, because a rqbd can't be
+ * reused until all requests that fit in it have been processed and released,
+ * which means one long blocked request can prevent the rqbd from being reused.
+ * Now we set request buffer size to 160 KB, so even if each rqbd is unlinked
+ * from LNet with unused 65 KB, buffer utilization will be about 59%.
+ * Please check LU-2432 for details.
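+ * (The 59% figure is plain arithmetic: if a 160 KB rqbd is unlinked with
+ * 65 KB still unused, (160 - 65) / 160 is roughly 59% utilization.)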
+ */
+#define MDS_REG_BUFSIZE	max(MDS_REG_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \
+			    160 * 1024)
+
+/**
+ * OUT_BUFSIZE = max_out_reqsize + max sptlrpc payload (~1K) which is
+ * about 10K, for the same reason as MDS_REG_BUFSIZE, we also give some
+ * extra bytes to each request buffer to improve buffer utilization rate.
+ */
+#define OUT_BUFSIZE	max(OUT_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \
+			    24 * 1024)
+
+/** FLD_MAXREQSIZE == lustre_msg + __u32 padding + ptlrpc_body + opc */
+#define FLD_MAXREQSIZE	(160)
+
+/** FLD_MAXREPSIZE == lustre_msg + ptlrpc_body */
+#define FLD_MAXREPSIZE	(152)
+#define FLD_BUFSIZE	(1 << 12)
+
+/**
+ * SEQ_MAXREQSIZE == lustre_msg + __u32 padding + ptlrpc_body + opc + lu_range +
+ * __u32 padding */
+#define SEQ_MAXREQSIZE	(160)
+
+/** SEQ_MAXREPSIZE == lustre_msg + ptlrpc_body + lu_range */
+#define SEQ_MAXREPSIZE	(152)
+#define SEQ_BUFSIZE	(1 << 12)
+
+/** MGS threads must be >= 3, see bug 22458 comment #28 */
+#define MGS_NTHRS_INIT	(PTLRPC_NTHRS_INIT + 1)
+#define MGS_NTHRS_MAX	32
+
+#define MGS_NBUFS	64
+#define MGS_BUFSIZE	(8 * 1024)
+#define MGS_MAXREQSIZE	(7 * 1024)
+#define MGS_MAXREPSIZE	(9 * 1024)
+
+ /*
+  * OSS threads constants:
+  *
+  * Given 8 as factor and 64 as base threads number
+  *
+  * example 1):
+  * On 8-core server configured to 2 partitions, we will have
+  * 64 + 8 * 4 = 96 threads for each partition, 192 total threads.
+  *
+  * example 2):
+  * On 32-core machine configured to 4 partitions, we will have
+  * 64 + 8 * 8 = 128 threads for each partition, so total threads number
+  * will be 128 * 4 = 512.
+  *
+  * example 3):
+  * On 64-core machine configured to 4 partitions, we will have
+  * 64 + 16 * 8 = 192 threads for each partition, so total threads number
+  * will be 192 * 4 = 768 which is above limit OSS_NTHRS_MAX(512), so we
+  * cut off the value to OSS_NTHRS_MAX(512) / 4 which is 128 threads
+  * for each partition.
+  *
+  * So we can see that with these constants, threads number will be at a
+  * similar level to old versions, unless the server has many cores.
+  */
+ /* depress threads factor for VM with small memory size */
+#define OSS_THR_FACTOR		min_t(int, 8, \
+				NUM_CACHEPAGES >> (28 - PAGE_SHIFT))
+#define OSS_NTHRS_INIT		(PTLRPC_NTHRS_INIT + 1)
+#define OSS_NTHRS_BASE		64
+
+/* threads for handling "create" request */
+#define OSS_CR_THR_FACTOR	1
+#define OSS_CR_NTHRS_INIT	PTLRPC_NTHRS_INIT
+#define OSS_CR_NTHRS_BASE	8
+#define OSS_CR_NTHRS_MAX	64
+
+/**
+ * OST_IO_MAXREQSIZE ~=
+ *	lustre_msg + ptlrpc_body + obdo + obd_ioobj +
+ *	DT_MAX_BRW_PAGES * niobuf_remote
+ *
+ * - single object with 16 pages is 512 bytes
+ * - OST_IO_MAXREQSIZE must be at least 1 niobuf per page of data
+ * - Must be a multiple of 1024
+ * - should allow a reasonably large SHORT_IO_BYTES size (64KB)
+ */
+#define _OST_MAXREQSIZE_BASE ((unsigned long)(sizeof(struct lustre_msg) + \
+			     /* lm_buflens */ sizeof(__u32) * 4 + \
+			     sizeof(struct ptlrpc_body) + \
+			     sizeof(struct obdo) + \
+			     sizeof(struct obd_ioobj) + \
+			     sizeof(struct niobuf_remote)))
+#define _OST_MAXREQSIZE_SUM ((unsigned long)(_OST_MAXREQSIZE_BASE + \
+			    sizeof(struct niobuf_remote) * \
+			    DT_MAX_BRW_PAGES))
+/**
+ * FIEMAP request can be 4K+ for now
+ */
+#define OST_MAXREQSIZE		(16UL * 1024UL)
+#define OST_IO_MAXREQSIZE	max(OST_MAXREQSIZE, \
+				    ((_OST_MAXREQSIZE_SUM - 1) | \
+				     (1024UL - 1)) + 1)
+/* Safe estimate of free space in standard RPC, provides upper limit for # of
+ * bytes of i/o to pack in RPC (skipping bulk transfer).
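+ * In other words, the macro below is just the slack that remains in a
+ * maximum-sized OST_IO request once the fixed per-request headers
+ * (_OST_MAXREQSIZE_BASE) are accounted for, rounded down to a page boundary.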
+ */
+#define OST_MAX_SHORT_IO_BYTES	((OST_IO_MAXREQSIZE - _OST_MAXREQSIZE_BASE) & \
+				 PAGE_MASK)
+
+/* Actual size used for short i/o buffer.  Calculation means this:
+ * At least one page (for large PAGE_SIZE), or 16 KiB, but not more
+ * than the available space aligned to a page boundary. */
+#define OBD_DEF_SHORT_IO_BYTES	min(max(PAGE_SIZE, 16UL * 1024UL), \
+				    OST_MAX_SHORT_IO_BYTES)
+
+#define OST_MAXREPSIZE		(9 * 1024)
+#define OST_IO_MAXREPSIZE	OST_MAXREPSIZE
+
+#define OST_NBUFS		64
+/** OST_BUFSIZE = max_reqsize + max sptlrpc payload size */
+#define OST_BUFSIZE		max_t(int, OST_MAXREQSIZE + 1024, 32 * 1024)
+/**
+ * OST_IO_MAXREQSIZE is 18K, giving an extra 46K can increase the buffer
+ * utilization rate of the request buffer; please check the comment of
+ * MDS_REG_BUFSIZE for details.
+ */
+#define OST_IO_BUFSIZE		max_t(int, OST_IO_MAXREQSIZE + 1024, 64 * 1024)
+
+/* Macro to hide a typecast and BUILD_BUG. */
+#define ptlrpc_req_async_args(_var, req) ({				\
+		BUILD_BUG_ON(sizeof(*_var) > sizeof(req->rq_async_args)); \
+		(typeof(_var))&req->rq_async_args;			\
+	})
+
+struct ptlrpc_replay_async_args {
+	int	praa_old_state;
+	int	praa_old_status;
+};
+
+/**
+ * Structure defining a single portal connection.
+ */
+struct ptlrpc_connection {
+	/** linkage for connections hash table */
+	struct rhash_head	c_hash;
+	/** Our own lnet nid for this connection */
+	struct lnet_nid		c_self;
+	/** Remote side nid for this connection */
+	struct lnet_processid	c_peer;
+	/** UUID of the other side */
+	struct obd_uuid		c_remote_uuid;
+	/** reference counter for this connection */
+	atomic_t		c_refcount;
+};
+
+/** Client definition for PortalRPC */
+struct ptlrpc_client {
+	/** What lnet portal does this client send messages to by default */
+	__u32			cli_request_portal;
+	/** What portal do we expect replies on */
+	__u32			cli_reply_portal;
+	/** Name of the client */
+	const char		*cli_name;
+};
+
+/** state flags of requests */
+/* XXX only ones left are those used by the bulk descs as well! */
+#define PTL_RPC_FL_INTR		BIT(0)	/* reply wait was interrupted by user */
+#define PTL_RPC_FL_TIMEOUT	BIT(7)	/* request timed out waiting for reply */
+
+#define REQ_MAX_ACK_LOCKS 8
+
+union ptlrpc_async_args {
+	/**
+	 * Scratchpad for passing args to completion interpreter. Users
+	 * cast to the struct of their choosing, and BUILD_BUG_ON that this is
+	 * big enough.  For _tons_ of context, OBD_ALLOC a struct and store
+	 * a pointer to it here.  The pointer_arg ensures this struct is at
+	 * least big enough for that.
+	 */
+	void	*pointer_arg[11];
+	__u64	space[7];
+};
+
+struct ptlrpc_request_set;
+typedef int (*set_producer_func)(struct ptlrpc_request_set *, void *);
+
+/**
+ * Definition of request set structure.
+ * A request set is a list of requests (not necessarily to the same target)
+ * that, once populated with RPCs, can be sent in parallel.
+ * There are two kinds of request sets: general purpose, and sets with a
+ * dedicated serving thread, e.g. the ptlrpcd set.
+ * For general purpose sets, once the set has started sending it is impossible
+ * to add new requests to it.
+ * Provides a way to call "completion callbacks" when all requests in the set
+ * returned.
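+ *
+ * A typical client-side flow, sketched here for illustration only and
+ * assuming the usual helpers this layer exports (ptlrpc_prep_set(),
+ * ptlrpc_set_add_req(), ptlrpc_set_wait(), ptlrpc_set_destroy()):
+ *
+ *	set = ptlrpc_prep_set();
+ *	if (set == NULL)
+ *		return -ENOMEM;
+ *	ptlrpc_set_add_req(set, req);	 (repeat for each prepared request)
+ *	rc = ptlrpc_set_wait(env, set);	 (sends everything, waits for replies)
+ *	ptlrpc_set_destroy(set);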
+ */
+struct ptlrpc_request_set {
+	atomic_t		set_refcount;
+	/** number of new, not yet sent requests */
+	atomic_t		set_new_count;
+	/** number of uncompleted requests */
+	atomic_t		set_remaining;
+	/** wait queue to wait on for request events */
+	wait_queue_head_t	set_waitq;
+	/** List of requests in the set */
+	struct list_head	set_requests;
+	/**
+	 * Lock for \a set_new_requests manipulations
+	 * locked so that any old caller can communicate requests to
+	 * the set holder who can then fold them into the lock-free set
+	 */
+	spinlock_t		set_new_req_lock;
+	/** List of new yet unsent requests. Only used with ptlrpcd now. */
+	struct list_head	set_new_requests;
+
+	/** rq_status of requests that have been freed already */
+	int			set_rc;
+	/** Additional fields used by the flow control extension */
+	/** Maximum number of RPCs in flight */
+	int			set_max_inflight;
+	/** Callback function used to generate RPCs */
+	set_producer_func	set_producer;
+	/** opaque argument passed to the producer callback */
+	void			*set_producer_arg;
+	unsigned int		set_allow_intr:1;
+};
+
+struct ptlrpc_bulk_desc;
+struct ptlrpc_service_part;
+struct ptlrpc_service;
+
+/**
+ * ptlrpc callback & work item stuff
+ */
+struct ptlrpc_cb_id {
+	void (*cbid_fn)(struct lnet_event *ev);	/* specific callback fn */
+	void *cbid_arg;				/* additional arg */
+};
+
+/** Maximum number of locks to fit into reply state */
+#define RS_MAX_LOCKS 8
+#define RS_DEBUG 0
+
+/**
+ * Structure to define reply state on the server
+ * Reply state holds various reply message information. Also for "difficult"
+ * replies (rep-ack case) we store the state after sending reply and wait
+ * for the client to acknowledge the reception. In these cases locks could be
+ * added to the state for replay/failover consistency guarantees.
+ */
+struct ptlrpc_reply_state {
+	/** Callback description */
+	struct ptlrpc_cb_id	rs_cb_id;
+	/** Linkage for list of all reply states in a system */
+	struct list_head	rs_list;
+	/** Linkage for list of all reply states on same export */
+	struct list_head	rs_exp_list;
+	/** Linkage for list of all reply states for same obd */
+	struct list_head	rs_obd_list;
+#if RS_DEBUG
+	struct list_head	rs_debug_list;
+#endif
+	/** A spinlock to protect the reply state flags */
+	spinlock_t		rs_lock;
+	/** Reply state flags */
+	unsigned long		rs_difficult:1;	/* ACK/commit stuff */
+	unsigned long		rs_no_ack:1;	/* no ACK, even for
+						   difficult requests */
+	unsigned long		rs_scheduled:1;	/* being handled? */
+	unsigned long		rs_scheduled_ever:1;/* any schedule attempts? */
+	unsigned long		rs_handled:1;	/* been handled yet? */
+	unsigned long		rs_sent:1;	/* Got LNET_EVENT_SEND? */
+	unsigned long		rs_unlinked:1;	/* Reply MD unlinked? */
+	unsigned long		rs_prealloc:1;	/* rs from prealloc list */
+	unsigned long		rs_committed:1;	/* the transaction was committed
+						   and the rs was dispatched
+						   by ptlrpc_commit_replies */
+	unsigned long		rs_convert_lock:1; /* need to convert saved
+						    * locks to COS mode */
+	atomic_t		rs_refcount;	/* number of users */
+	/** Number of locks awaiting client ACK */
+	int			rs_nlocks;
+
+	/** Size of the state */
+	int			rs_size;
+	/** opcode */
+	__u32			rs_opc;
+	/** Transaction number */
+	__u64			rs_transno;
+	/** xid */
+	__u64			rs_xid;
+	struct obd_export	*rs_export;
+	struct ptlrpc_service_part *rs_svcpt;
+	/** Lnet metadata handle for the reply */
+	struct lnet_handle_md	rs_md_h;
+
+	/** Context for the service thread */
+	struct ptlrpc_svc_ctx	*rs_svc_ctx;
+	/** Reply buffer (actually sent to the client), encoded if needed */
+	struct lustre_msg	*rs_repbuf;	/* wrapper */
+	/** Size of the reply buffer */
+	int			rs_repbuf_len;	/* wrapper buf length */
+	/** Size of the reply message */
+	int			rs_repdata_len;	/* wrapper msg length */
+	/**
+	 * Actual reply message. Its content is encrypted (if needed) to
+	 * produce the reply buffer for actual sending. In the simple case
+	 * of no network encryption we just set \a rs_repbuf to \a rs_msg
+	 */
+	struct lustre_msg	*rs_msg;	/* reply message */
+
+	/** Handles of locks awaiting client reply ACK */
+	struct lustre_handle	rs_locks[RS_MAX_LOCKS];
+	/** Lock modes of locks in \a rs_locks */
+	enum ldlm_mode		rs_modes[RS_MAX_LOCKS];
+};
+
+struct ptlrpc_thread;
+
+/** RPC stages */
+enum rq_phase {
+	RQ_PHASE_NEW		= 0xebc0de00,
+	RQ_PHASE_RPC		= 0xebc0de01,
+	RQ_PHASE_BULK		= 0xebc0de02,
+	RQ_PHASE_INTERPRET	= 0xebc0de03,
+	RQ_PHASE_COMPLETE	= 0xebc0de04,
+	RQ_PHASE_UNREG_RPC	= 0xebc0de05,
+	RQ_PHASE_UNREG_BULK	= 0xebc0de06,
+	RQ_PHASE_UNDEFINED	= 0xebc0de07
+};
+
+/** Type of request interpreter call-back */
+typedef int (*ptlrpc_interpterer_t)(const struct lu_env *env,
+				    struct ptlrpc_request *req,
+				    void *arg, int rc);
+/** Type of request resend call-back */
+typedef void (*ptlrpc_resend_cb_t)(struct ptlrpc_request *req,
+				   void *arg);
+
+/**
+ * Definition of request pool structure.
+ * The pool is used to store empty preallocated requests for the case
+ * when we would actually need to send something without performing
+ * any allocations (to avoid e.g. OOM).
+ */
+struct ptlrpc_request_pool {
+	/** Locks the list */
+	spinlock_t		prp_lock;
+	/** list of ptlrpc_request structs */
+	struct list_head	prp_req_list;
+	/** Maximum message size that would fit into a request from this pool */
+	int			prp_rq_size;
+	/** Function to allocate more requests for this pool */
+	int (*prp_populate)(struct ptlrpc_request_pool *, int);
+};
+
+struct lu_context;
+struct lu_env;
+
+struct ldlm_lock;
+
+#include <lustre_nrs.h>
+
+/**
+ * Basic request prioritization operations structure.
+ * The whole idea is centered around locks and RPCs that might affect locks.
+ * When a lock is contended we try to give priority to RPCs that might lead
+ * to the fastest release of that lock.
+ * Currently only implemented for OSTs, in a way that makes all
+ * IO and truncate RPCs that are coming from a locked region where a lock is
+ * contended a priority over other requests.
+ */
+struct ptlrpc_hpreq_ops {
+	/**
+	 * Check if the lock handle of the given lock is the same as
+	 * taken from the request.
+	 */
+	int  (*hpreq_lock_match)(struct ptlrpc_request *, struct ldlm_lock *);
+	/**
+	 * Check if the request is a high priority one.
+	 */
+	int  (*hpreq_check)(struct ptlrpc_request *);
+	/**
+	 * Called after the request has been handled.
+	 */
+	void (*hpreq_fini)(struct ptlrpc_request *);
+};
+
+struct ptlrpc_cli_req {
+	/** For bulk requests on client only: bulk descriptor */
+	struct ptlrpc_bulk_desc		*cr_bulk;
+	/** optional time limit for send attempts. This is a timeout
+	 *  not a timestamp so timeout_t (s32) is used instead of time64_t
+	 */
+	timeout_t			 cr_delay_limit;
+	/** time request was first queued */
+	time64_t			 cr_queued_time;
+	/** request sent in nanoseconds */
+	ktime_t				 cr_sent_ns;
+	/** time for request really sent out */
+	time64_t			 cr_sent_out;
+	/** when req reply unlink must finish. */
+	time64_t			 cr_reply_deadline;
+	/** when req bulk unlink must finish. */
+	time64_t			 cr_bulk_deadline;
+	/** when req unlink must finish. */
+	time64_t			 cr_req_deadline;
+	/** Portal to which this request would be sent */
+	short				 cr_req_ptl;
+	/** Portal where to wait for reply and where reply would be sent */
+	short				 cr_rep_ptl;
+	/** request resending number */
+	unsigned int			 cr_resend_nr;
+	/** What was import generation when this request was sent */
+	int				 cr_imp_gen;
+	enum lustre_imp_state		 cr_send_state;
+	/** Per-request waitq introduced by bug 21938 for recovery waiting */
+	wait_queue_head_t		 cr_set_waitq;
+	/** Link item for request set lists */
+	struct list_head		 cr_set_chain;
+	/** link to waited ctx */
+	struct list_head		 cr_ctx_chain;
+
+	/** client's half ctx */
+	struct ptlrpc_cli_ctx		*cr_cli_ctx;
+	/** Link back to the request set */
+	struct ptlrpc_request_set	*cr_set;
+	/** outgoing request MD handle */
+	struct lnet_handle_md		 cr_req_md_h;
+	/** request-out callback parameter */
+	struct ptlrpc_cb_id		 cr_req_cbid;
+	/** incoming reply MD handle */
+	struct lnet_handle_md		 cr_reply_md_h;
+	wait_queue_head_t		 cr_reply_waitq;
+	/** reply callback parameter */
+	struct ptlrpc_cb_id		 cr_reply_cbid;
+	/** Async completion handler, called when reply is received */
+	ptlrpc_interpterer_t		 cr_reply_interp;
+	/** Resend handler, called when the request is resent, to update RPC data */
+	ptlrpc_resend_cb_t		 cr_resend_cb;
+	/** Async completion context */
+	union ptlrpc_async_args		 cr_async_args;
+	/** Opaque data for replay and commit callbacks. */
+	void				*cr_cb_data;
+	/** Link to the imp->imp_unreplied_list */
+	struct list_head		 cr_unreplied_list;
+	/**
+	 * Commit callback, called when request is committed and about to be
+	 * freed.
+	 */
+	void (*cr_commit_cb)(struct ptlrpc_request *);
+	/** Replay callback, called after request is replayed at recovery */
+	void (*cr_replay_cb)(struct ptlrpc_request *);
+};
+
+/** client request member aliases */
+/* NB: these aliases should NOT be used by any new code, instead they should
+ * be removed step by step to avoid potential abuse */
+#define rq_bulk			rq_cli.cr_bulk
+#define rq_delay_limit		rq_cli.cr_delay_limit
+#define rq_queued_time		rq_cli.cr_queued_time
+#define rq_sent_ns		rq_cli.cr_sent_ns
+#define rq_real_sent		rq_cli.cr_sent_out
+#define rq_reply_deadline	rq_cli.cr_reply_deadline
+#define rq_bulk_deadline	rq_cli.cr_bulk_deadline
+#define rq_req_deadline		rq_cli.cr_req_deadline
+#define rq_nr_resend		rq_cli.cr_resend_nr
+#define rq_request_portal	rq_cli.cr_req_ptl
+#define rq_reply_portal		rq_cli.cr_rep_ptl
+#define rq_import_generation	rq_cli.cr_imp_gen
+#define rq_send_state		rq_cli.cr_send_state
+#define rq_set_chain		rq_cli.cr_set_chain
+#define rq_ctx_chain		rq_cli.cr_ctx_chain
+#define rq_set			rq_cli.cr_set
+#define rq_set_waitq		rq_cli.cr_set_waitq
+#define rq_cli_ctx		rq_cli.cr_cli_ctx
+#define rq_req_md_h		rq_cli.cr_req_md_h
+#define rq_req_cbid		rq_cli.cr_req_cbid
+#define rq_reply_md_h		rq_cli.cr_reply_md_h
+#define rq_reply_waitq		rq_cli.cr_reply_waitq
+#define rq_reply_cbid		rq_cli.cr_reply_cbid
+#define rq_interpret_reply	rq_cli.cr_reply_interp
+#define rq_resend_cb		rq_cli.cr_resend_cb
+#define rq_async_args		rq_cli.cr_async_args
+#define rq_cb_data		rq_cli.cr_cb_data
+#define rq_unreplied_list	rq_cli.cr_unreplied_list
+#define rq_commit_cb		rq_cli.cr_commit_cb
+#define rq_replay_cb		rq_cli.cr_replay_cb
+
+struct ptlrpc_srv_req {
+	/** initial thread servicing this request */
+	struct ptlrpc_thread		*sr_svc_thread;
+	/**
+	 * Server side list of incoming unserved requests sorted by arrival
+	 * time.  Traversed from time to time to notice about-to-expire
+	 * requests and send back "early replies" to clients to let them
+	 * know the server is alive and well, just too busy to service their
+	 * requests in time
+	 */
+	struct list_head		 sr_timed_list;
+	/** server-side per-export list */
+	struct list_head		 sr_exp_list;
+	/** server-side history, used for debugging purposes. */
+	struct list_head		 sr_hist_list;
+	/** history sequence # */
+	__u64				 sr_hist_seq;
+	/** the index of service's srv_at_array into which request is linked */
+	__u32				 sr_at_index;
+	/** authed uid */
+	uid_t				 sr_auth_uid;
+	/** authed uid mapped to */
+	uid_t				 sr_auth_mapped_uid;
+	/** RPC is generated from what part of Lustre */
+	enum lustre_sec_part		 sr_sp_from;
+	/** request session context */
+	struct lu_context		 sr_ses;
+	/** \addtogroup nrs
+	 * @{
+	 */
+	/** stub for NRS request */
+	struct ptlrpc_nrs_request	 sr_nrq;
+	/** @} nrs */
+	/** request arrival time */
+	struct timespec64		 sr_arrival_time;
+	/** server's half ctx */
+	struct ptlrpc_svc_ctx		*sr_svc_ctx;
+	/** (server side), pointed directly into req buffer */
+	struct ptlrpc_user_desc		*sr_user_desc;
+	/** separated reply state, may be vmalloc'd */
+	struct ptlrpc_reply_state	*sr_reply_state;
+	/** server-side hp handlers */
+	struct ptlrpc_hpreq_ops		*sr_ops;
+	/** incoming request buffer */
+	struct ptlrpc_request_buffer_desc *sr_rqbd;
+};
+
+/** server request member aliases */
+/* NB: these aliases should NOT be used by any new code, instead they should
+ * be removed step by step to avoid potential abuse */
+#define rq_svc_thread		rq_srv.sr_svc_thread
+#define rq_timed_list		rq_srv.sr_timed_list
+#define rq_exp_list		rq_srv.sr_exp_list
+#define rq_history_list		rq_srv.sr_hist_list
+#define rq_history_seq		rq_srv.sr_hist_seq
+#define rq_at_index		rq_srv.sr_at_index
+#define rq_auth_uid		rq_srv.sr_auth_uid
+#define rq_auth_mapped_uid	rq_srv.sr_auth_mapped_uid
+#define rq_sp_from		rq_srv.sr_sp_from
+#define rq_session		rq_srv.sr_ses
+#define rq_nrq			rq_srv.sr_nrq
+#define rq_arrival_time		rq_srv.sr_arrival_time
+#define rq_reply_state		rq_srv.sr_reply_state
+#define rq_svc_ctx		rq_srv.sr_svc_ctx
+#define rq_user_desc		rq_srv.sr_user_desc
+#define rq_ops			rq_srv.sr_ops
+#define rq_rqbd			rq_srv.sr_rqbd
+#define rq_reqmsg		rq_pill.rc_reqmsg
+#define rq_repmsg		rq_pill.rc_repmsg
+#define rq_req_swab_mask	rq_pill.rc_req_swab_mask
+#define rq_rep_swab_mask	rq_pill.rc_rep_swab_mask
+
+/**
+ * Represents a remote procedure call.
+ *
+ * This is a staple structure used by everybody wanting to send a request
+ * in Lustre.
+ */
+struct ptlrpc_request {
+	/* Request type: one of PTL_RPC_MSG_* */
+	int				 rq_type;
+	/** Result of request processing */
+	int				 rq_status;
+	/**
+	 * Linkage item through which this request is included into
+	 * sending/delayed lists on client and into rqbd list on server
+	 */
+	struct list_head		 rq_list;
+	/** Lock to protect request flags and some other important bits, like
+	 * rq_list
+	 */
+	spinlock_t			 rq_lock;
+	spinlock_t			 rq_early_free_lock;
+	/** client-side flags are serialized by rq_lock @{ */
+	unsigned int rq_intr:1, rq_replied:1, rq_err:1,
+		rq_timedout:1, rq_resend:1, rq_restart:1,
+		/**
+		 * when ->rq_replay is set, request is kept by the client even
+		 * after server commits corresponding transaction. This is
+		 * used for operations that require sequence of multiple
+		 * requests to be replayed. The only example currently is file
+		 * open/close. When last request in such a sequence is
+		 * committed, ->rq_replay is cleared on all requests in the
+		 * sequence.
+	 */
+	rq_replay:1,
+	rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1,
+	rq_no_delay:1, rq_net_err:1, rq_wait_ctx:1,
+	rq_early:1,
+	rq_req_unlinked:1,	/* unlinked request buffer from lnet */
+	rq_reply_unlinked:1,	/* unlinked reply buffer from lnet */
+	rq_memalloc:1,		/* req originated from "kswapd" */
+	rq_committed:1,
+	rq_reply_truncated:1,
+	/** whether the "rq_set" is a valid one */
+	rq_invalid_rqset:1,
+	rq_generation_set:1,
+	/** do not resend request on -EINPROGRESS */
+	rq_no_retry_einprogress:1,
+	/* allow the req to be sent if the import is in recovery
+	 * status */
+	rq_allow_replay:1,
+	/* bulk request, sent to server, but uncommitted */
+	rq_unstable:1,
+	rq_early_free_repbuf:1,	/* free reply buffer in advance */
+	rq_allow_intr:1;
+	/** @} */
+
+	/** server-side flags @{ */
+	unsigned int
+		rq_hp:1,		/**< high priority RPC */
+		rq_at_linked:1,		/**< link into service's srv_at_array */
+		rq_packed_final:1,	/**< packed final reply */
+		rq_obsolete:1;		/* aborted by a signal on a client */
+	/** @} */
+
+	/** one of RQ_PHASE_* */
+	enum rq_phase		 rq_phase;
+	/** one of RQ_PHASE_* to be used next */
+	enum rq_phase		 rq_next_phase;
+	/**
+	 * client-side refcount for SENT race, server-side refcount
+	 * for multiple replies
+	 */
+	atomic_t		 rq_refcount;
+	/**
+	 * client-side:
+	 * !rq_truncate : # reply bytes actually received,
+	 * rq_truncate : required repbuf_len for resend
+	 */
+	int			 rq_nob_received;
+	/** Request length */
+	int			 rq_reqlen;
+	/** Reply length */
+	int			 rq_replen;
+	/** Pool if request is from preallocated list */
+	struct ptlrpc_request_pool *rq_pool;
+	/** Transaction number */
+	__u64			 rq_transno;
+	/** xid */
+	__u64			 rq_xid;
+	/** bulk match bits */
+	__u64			 rq_mbits;
+	/** reply match bits */
+	__u64			 rq_rep_mbits;
+	/**
+	 * List item for the replay list. Not yet committed requests get linked
+	 * there.
+	 * Also see \a rq_replay comment above.
+	 * It's also a link chain on obd_export::exp_req_replay_queue
+	 */
+	struct list_head	 rq_replay_list;
+	/** non-shared members for client & server request */
+	union {
+		struct ptlrpc_cli_req	 rq_cli;
+		struct ptlrpc_srv_req	 rq_srv;
+	};
+	/**
+	 * security and encryption data
+	 * @{ */
+	/** description of flavors for client & server */
+	struct sptlrpc_flavor	 rq_flvr;
+
+	/**
+	 * SELinux policy info at the time of the request
+	 * sepol string format is:
+	 * <mode>:<policy name>:<policy version>:<policy hash>
+	 */
+	char rq_sepol[LUSTRE_NODEMAP_SEPOL_LENGTH + 1];
+
+	/* client/server security flags */
+	unsigned int
+		rq_ctx_init:1,		/* context initiation */
+		rq_ctx_fini:1,		/* context destroy */
+		rq_bulk_read:1,		/* request bulk read */
+		rq_bulk_write:1,	/* request bulk write */
+		/* server authentication flags */
+		rq_auth_gss:1,		/* authenticated by gss */
+		rq_auth_usr_root:1,	/* authed as root */
+		rq_auth_usr_mdt:1,	/* authed as mdt */
+		rq_auth_usr_ost:1,	/* authed as ost */
+		/* security tfm flags */
+		rq_pack_udesc:1,
+		rq_pack_bulk:1,
+		/* doesn't expect reply FIXME */
+		rq_no_reply:1,
+		rq_pill_init:1,		/* pill initialized */
+		rq_srv_req:1;		/* server request */
+
+
+	/** various buffer pointers */
+	struct lustre_msg	*rq_reqbuf;	/**< req wrapper, vmalloc */
+	char			*rq_repbuf;	/**< rep buffer, vmalloc */
+	struct lustre_msg	*rq_repdata;	/**< rep wrapper msg */
+	/** only in priv mode */
+	struct lustre_msg	*rq_clrbuf;
+	int			 rq_reqbuf_len;	 /* req wrapper buf len */
+	int			 rq_reqdata_len; /* req wrapper msg len */
+	int			 rq_repbuf_len;	 /* rep buffer len */
+	int			 rq_repdata_len; /* rep wrapper msg len */
+	int			 rq_clrbuf_len;	 /* only in priv mode */
+	int			 rq_clrdata_len; /* only in priv mode */
+
+	/** early replies go to offset 0, regular replies go after that */
+	unsigned int		 rq_reply_off;
+	/** @} */
+
+	/** how many early replies (for stats) */
+	int			 rq_early_count;
+	/** Server-side, export on which request was received */
+	struct obd_export	*rq_export;
+	/** import where request is being sent */
+	struct obd_import	*rq_import;
+	/** our LNet NID */
+	lnet_nid_t		 rq_self;
+	/** Peer description (the other side) */
+	struct lnet_process_id	 rq_peer;
+	/** Descriptor for the NID from which the peer sent the request. */
+	struct lnet_process_id	 rq_source;
+	/**
+	 * service time estimate (secs)
+	 * If the request is not served by this time, it is marked as timed out.
+	 * Do not change to time64_t since this is transmitted over the wire.
+	 *
+	 * The linux kernel handles timestamps with time64_t and timeouts
+	 * are normally done with jiffies. Lustre shares the rq_timeout between
+	 * nodes. Since jiffies can vary from node to node Lustre instead
+	 * will express the timeout value in seconds. To avoid confusion with
+	 * timestamps (time64_t) and jiffy timeouts (long) Lustre timeouts
+	 * are expressed in s32 (timeout_t). Also what is transmitted over
+	 * the wire is 32 bits.
+	 */
+	timeout_t		 rq_timeout;
+	/**
+	 * when request/reply sent (secs), or time when request should be sent
+	 */
+	time64_t		 rq_sent;
+	/** when request must finish. */
+	time64_t		 rq_deadline;
+	/** request format description */
+	struct req_capsule	 rq_pill;
+};
+
+/**
+ * Call the completion handler for the rpc, if any, and return its status or
+ * the original rc if there was no handler defined for this request.
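+ *
+ * A minimal usage sketch, illustrative only: "my_interpret" and the
+ * surrounding code are hypothetical, not part of this header:
+ *
+ *	static int my_interpret(const struct lu_env *env,
+ *				struct ptlrpc_request *req,
+ *				void *args, int rc)
+ *	{
+ *		return rc;
+ *	}
+ *	...
+ *	req->rq_interpret_reply = my_interpret;
+ *	ptlrpcd_add_req(req);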
+ */
+static inline int ptlrpc_req_interpret(const struct lu_env *env,
+				       struct ptlrpc_request *req, int rc)
+{
+	if (req->rq_interpret_reply != NULL) {
+		req->rq_status = req->rq_interpret_reply(env, req,
+							 &req->rq_async_args,
+							 rc);
+		return req->rq_status;
+	}
+
+	return rc;
+}
+
+/** \addtogroup nrs
+ * @{
+ */
+void ptlrpc_nrs_req_hp_move(struct ptlrpc_request *req);
+
+/*
+ * Can the request be moved from the regular NRS head to the high-priority NRS
+ * head (of the same PTLRPC service partition), if any?
+ *
+ * For a reliable result, this should be checked under svcpt->scp_req lock.
+ */
+static inline bool ptlrpc_nrs_req_can_move(struct ptlrpc_request *req)
+{
+	struct ptlrpc_nrs_request *nrq = &req->rq_nrq;
+
+	/**
+	 * LU-898: Check ptlrpc_nrs_request::nr_enqueued to make sure the
+	 * request has been enqueued first, and ptlrpc_nrs_request::nr_started
+	 * to make sure it has not been scheduled yet (analogous to previous
+	 * (non-NRS) checking of !list_empty(&ptlrpc_request::rq_list)).
+	 */
+	return nrq->nr_enqueued && !nrq->nr_started && !req->rq_hp;
+}
+/** @} nrs */
+
+/**
+ * Convert numerical request phase value \a phase into text string description
+ */
+static inline const char *
+ptlrpc_phase2str(enum rq_phase phase)
+{
+	switch (phase) {
+	case RQ_PHASE_NEW:
+		return "New";
+	case RQ_PHASE_RPC:
+		return "Rpc";
+	case RQ_PHASE_BULK:
+		return "Bulk";
+	case RQ_PHASE_INTERPRET:
+		return "Interpret";
+	case RQ_PHASE_COMPLETE:
+		return "Complete";
+	case RQ_PHASE_UNREG_RPC:
+		return "UnregRPC";
+	case RQ_PHASE_UNREG_BULK:
+		return "UnregBULK";
+	default:
+		return "?Phase?";
+	}
+}
+
+/**
+ * Convert the numerical request phase of the request \a req into a text
+ * string description
+ */
+static inline const char *
+ptlrpc_rqphase2str(struct ptlrpc_request *req)
+{
+	return ptlrpc_phase2str(req->rq_phase);
+}
+
+/**
+ * Debugging functions and helpers to print the request structure into the
+ * debug log
+ * @{
+ */
+/* Spare the preprocessor, spoil the bugs. */
+#define FLAG(field, str) (field ? str : "")
+
+/** Convert bit flags into a string */
+#define DEBUG_REQ_FLAGS(req)						      \
+	ptlrpc_rqphase2str(req),					      \
+	FLAG(req->rq_intr, "I"), FLAG(req->rq_replied, "R"),		      \
+	FLAG(req->rq_err, "E"), FLAG(req->rq_net_err, "e"),		      \
+	FLAG(req->rq_timedout, "X") /* eXpired */, FLAG(req->rq_resend, "S"), \
+	FLAG(req->rq_restart, "T"), FLAG(req->rq_replay, "P"),		      \
+	FLAG(req->rq_no_resend, "N"), FLAG(req->rq_no_reply, "n"),	      \
+	FLAG(req->rq_waiting, "W"),					      \
+	FLAG(req->rq_wait_ctx, "C"), FLAG(req->rq_hp, "H"),		      \
+	FLAG(req->rq_committed, "M"),					      \
+	FLAG(req->rq_req_unlinked, "Q"),				      \
+	FLAG(req->rq_reply_unlinked, "U"),				      \
+	FLAG(req->rq_receiving_reply, "r")
+
+#define REQ_FLAGS_FMT "%s:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s"
+
+void _debug_req(struct ptlrpc_request *req,
+		struct libcfs_debug_msg_data *data, const char *fmt, ...)
+	__attribute__ ((format (printf, 3, 4)));
+
+/**
+ * Helper that decides if we need to print the request according to current
+ * debug level settings
+ */
+#define debug_req(msgdata, mask, cdls, req, fmt, a...)			\
+do {									\
+	CFS_CHECK_STACK(msgdata, mask, cdls);				\
+									\
+	if (((mask) & D_CANTMASK) != 0 ||				\
+	    ((libcfs_debug & (mask)) != 0 &&				\
+	     (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0))		\
+		_debug_req((req), msgdata, fmt, ##a);			\
+} while (0)
+
+/**
+ * This is the debug print function you need to use to print the request
+ * structure content into the lustre debug log.
+ * for most callers (level is a constant) this is resolved at compile time */
+#define DEBUG_REQ(level, req, fmt, args...)				      \
+do {									      \
+	if ((level) & (D_ERROR | D_WARNING)) {				      \
+		static struct cfs_debug_limit_state cdls;		      \
+		LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls);	      \
+		debug_req(&msgdata, level, &cdls, req, "@@@ "fmt" ", ## args);\
+	} else {							      \
+		LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL);	      \
+		debug_req(&msgdata, level, NULL, req, "@@@ "fmt" ", ## args); \
+	}								      \
+} while (0)
+/** @} */
+
+enum ptlrpc_bulk_op_type {
+	PTLRPC_BULK_OP_ACTIVE	= 0x00000001,
+	PTLRPC_BULK_OP_PASSIVE	= 0x00000002,
+	PTLRPC_BULK_OP_PUT	= 0x00000004,
+	PTLRPC_BULK_OP_GET	= 0x00000008,
+	PTLRPC_BULK_GET_SOURCE	= PTLRPC_BULK_OP_PASSIVE | PTLRPC_BULK_OP_GET,
+	PTLRPC_BULK_PUT_SINK	= PTLRPC_BULK_OP_PASSIVE | PTLRPC_BULK_OP_PUT,
+	PTLRPC_BULK_GET_SINK	= PTLRPC_BULK_OP_ACTIVE | PTLRPC_BULK_OP_GET,
+	PTLRPC_BULK_PUT_SOURCE	= PTLRPC_BULK_OP_ACTIVE | PTLRPC_BULK_OP_PUT,
+};
+
+static inline bool ptlrpc_is_bulk_op_get(enum ptlrpc_bulk_op_type type)
+{
+	return (type & PTLRPC_BULK_OP_GET) == PTLRPC_BULK_OP_GET;
+}
+
+static inline bool ptlrpc_is_bulk_get_source(enum ptlrpc_bulk_op_type type)
+{
+	return (type & PTLRPC_BULK_GET_SOURCE) == PTLRPC_BULK_GET_SOURCE;
+}
+
+static inline bool ptlrpc_is_bulk_put_sink(enum ptlrpc_bulk_op_type type)
+{
+	return (type & PTLRPC_BULK_PUT_SINK) == PTLRPC_BULK_PUT_SINK;
+}
+
+static inline bool ptlrpc_is_bulk_get_sink(enum ptlrpc_bulk_op_type type)
+{
+	return (type & PTLRPC_BULK_GET_SINK) == PTLRPC_BULK_GET_SINK;
+}
+
+static inline bool ptlrpc_is_bulk_put_source(enum ptlrpc_bulk_op_type type)
+{
+	return (type & PTLRPC_BULK_PUT_SOURCE) == PTLRPC_BULK_PUT_SOURCE;
+}
+
+static inline bool ptlrpc_is_bulk_op_active(enum ptlrpc_bulk_op_type type)
+{
+	return ((type & PTLRPC_BULK_OP_ACTIVE) |
+		(type & PTLRPC_BULK_OP_PASSIVE))
+		== PTLRPC_BULK_OP_ACTIVE;
+}
+
+static inline bool ptlrpc_is_bulk_op_passive(enum ptlrpc_bulk_op_type type)
+{
+	return ((type & PTLRPC_BULK_OP_ACTIVE) |
+		(type & PTLRPC_BULK_OP_PASSIVE))
+		== PTLRPC_BULK_OP_PASSIVE;
+}
+
+struct ptlrpc_bulk_frag_ops {
+	/**
+	 * Add a page \a page to the bulk descriptor \a desc
+	 * Data to transfer in the page starts at offset \a pageoffset and
+	 * amount of data to transfer from the page is \a len
+	 */
+	void (*add_kiov_frag)(struct ptlrpc_bulk_desc *desc,
+			      struct page *page, int pageoffset, int len);
+
+	/*
+	 * Add a \a fragment to the bulk descriptor \a desc.
+	 * Data to transfer in the fragment is pointed to by \a frag
+	 * The size of the fragment is \a len
+	 */
+	int (*add_iov_frag)(struct ptlrpc_bulk_desc *desc, void *frag, int len);
+
+	/**
+	 * Uninitialize and free bulk descriptor \a desc.
+	 * Works on bulk descriptors both from server and client side.
+	 */
+	void (*release_frags)(struct ptlrpc_bulk_desc *desc);
+};
+
+extern const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kiov_pin_ops;
+extern const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kiov_nopin_ops;
+
+/*
+ * Definition of bulk descriptor.
+ * Bulks are special "two phase" RPCs where the initial request message
+ * is sent first and is followed by a transfer (or receiving) of a large
+ * amount of data to be settled into pages referenced from the bulk descriptors.
+ * Bulk transfers (the actual data following the small requests) are done
+ * on separate LNet portals.
+ * In lustre we use bulk transfers for READ and WRITE transfers from/to OSTs.
+ * Another user is readpage for MDT.
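+ *
+ * A hedged client-side setup sketch: the enclosing code, "npages" and
+ * "pages" are hypothetical, and OST_BULK_PORTAL is assumed from the wire
+ * definitions rather than this header:
+ *
+ *	desc = ptlrpc_prep_bulk_imp(req, npages, 1, PTLRPC_BULK_GET_SOURCE,
+ *				    OST_BULK_PORTAL,
+ *				    &ptlrpc_bulk_kiov_pin_ops);
+ *	for (i = 0; i < npages; i++)
+ *		desc->bd_frag_ops->add_kiov_frag(desc, pages[i], 0,
+ *						 PAGE_SIZE);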
+ */ +struct ptlrpc_bulk_desc { + unsigned int bd_refs; /* number MD's assigned including zero-sends */ + /** completed with failure */ + unsigned long bd_failure:1; + /** client side */ + unsigned long bd_registered:1; + /** For serialization with callback */ + spinlock_t bd_lock; + /** {put,get}{source,sink}{kvec,kiov} */ + enum ptlrpc_bulk_op_type bd_type; + /** LNet portal for this bulk */ + __u32 bd_portal; + /** Server side - export this bulk created for */ + struct obd_export *bd_export; + /** Client side - import this bulk was sent on */ + struct obd_import *bd_import; + /** Back pointer to the request */ + struct ptlrpc_request *bd_req; + const struct ptlrpc_bulk_frag_ops *bd_frag_ops; + wait_queue_head_t bd_waitq; /* server side only WQ */ + int bd_iov_count; /* # entries in bd_iov */ + int bd_max_iov; /* allocated size of bd_iov */ + int bd_nob; /* # bytes covered */ + int bd_nob_transferred; /* # bytes GOT/PUT */ + unsigned int bd_nob_last; /* # bytes in last MD */ + + __u64 bd_last_mbits; + + struct ptlrpc_cb_id bd_cbid; /* network callback info */ + lnet_nid_t bd_sender; /* stash event::sender */ + int bd_md_count; /* # valid entries in bd_mds */ + int bd_md_max_brw; /* max entries in bd_mds */ + + /** array of offsets for each MD */ + unsigned int bd_mds_off[PTLRPC_BULK_OPS_COUNT]; + /** array of associated MDs */ + struct lnet_handle_md bd_mds[PTLRPC_BULK_OPS_COUNT]; + + /* encrypted iov, size is either 0 or bd_iov_count. */ + struct bio_vec *bd_enc_vec; + struct bio_vec *bd_vec; +}; + +enum { + SVC_INIT = 0, + SVC_STOPPED = BIT(0), + SVC_STOPPING = BIT(1), + SVC_STARTING = BIT(2), + SVC_RUNNING = BIT(3), +}; + +#define PTLRPC_THR_NAME_LEN 32 +/** + * Definition of server service thread structure + */ +struct ptlrpc_thread { + /** + * List of active threads in svcpt->scp_threads + */ + struct list_head t_link; + /** + * thread-private data (preallocated vmalloc'd memory) + */ + void *t_data; + __u32 t_flags; + /** + * service thread index, from ptlrpc_start_threads + */ + unsigned int t_id; + /** + * service thread + */ + struct task_struct *t_task; + pid_t t_pid; + ktime_t t_touched; + /** + * put watchdog in the structure per thread b=14840 + */ + struct delayed_work t_watchdog; + /** + * the svc this thread belonged to b=18582 + */ + struct ptlrpc_service_part *t_svcpt; + wait_queue_head_t t_ctl_waitq; + struct lu_env *t_env; + char t_name[PTLRPC_THR_NAME_LEN]; +}; + +static inline int thread_is_init(struct ptlrpc_thread *thread) +{ + return thread->t_flags == 0; +} + +static inline int thread_is_stopped(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_STOPPED); +} + +static inline int thread_is_stopping(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_STOPPING); +} + +static inline int thread_is_starting(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_STARTING); +} + +static inline int thread_is_running(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_RUNNING); +} + +static inline void thread_clear_flags(struct ptlrpc_thread *thread, __u32 flags) +{ + thread->t_flags &= ~flags; +} + +static inline void thread_set_flags(struct ptlrpc_thread *thread, __u32 flags) +{ + thread->t_flags = flags; +} + +static inline void thread_add_flags(struct ptlrpc_thread *thread, __u32 flags) +{ + thread->t_flags |= flags; +} + +static inline int thread_test_and_clear_flags(struct ptlrpc_thread *thread, + __u32 flags) +{ + if (thread->t_flags & flags) { + thread->t_flags &= ~flags; + return 1; + } + return 0; +} + 
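+/*
+ * Illustrative lifecycle of the thread-flag helpers above (a sketch only;
+ * the request-processing call is hypothetical, not part of this header):
+ *
+ *	thread_add_flags(thread, SVC_STARTING);
+ *	...
+ *	thread_clear_flags(thread, SVC_STARTING);
+ *	thread_add_flags(thread, SVC_RUNNING);
+ *	while (!thread_is_stopping(thread))
+ *		process_one_incoming_request(thread);
+ *	thread_set_flags(thread, SVC_STOPPED);
+ */
+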
+/**
+ * Request buffer descriptor structure.
+ * This is a structure that contains one posted request buffer for service.
+ * Once data land into a buffer, the event callback creates the actual request
+ * and wakes one of the service threads to process the new incoming request.
+ * More than one request can fit into the buffer.
+ */
+struct ptlrpc_request_buffer_desc {
+	/** Link item for rqbds on a service */
+	struct list_head		rqbd_list;
+	/** History of requests for this buffer */
+	struct list_head		rqbd_reqs;
+	/** Back pointer to service for which this buffer is registered */
+	struct ptlrpc_service_part	*rqbd_svcpt;
+	/** LNet descriptor */
+	struct lnet_handle_md		rqbd_md_h;
+	int				rqbd_refcount;
+	/** The buffer itself */
+	char				*rqbd_buffer;
+	struct ptlrpc_cb_id		rqbd_cbid;
+	/**
+	 * This "embedded" request structure is only used for the
+	 * last request to fit into the buffer
+	 */
+	struct ptlrpc_request		rqbd_req;
+};
+
+typedef int (*svc_handler_t)(struct ptlrpc_request *req);
+
+struct ptlrpc_service_ops {
+	/**
+	 * if non-NULL called during thread creation (ptlrpc_start_thread())
+	 * to initialize service specific per-thread state.
+	 */
+	int		(*so_thr_init)(struct ptlrpc_thread *thr);
+	/**
+	 * if non-NULL called during thread shutdown (ptlrpc_main()) to
+	 * destruct state created by ->so_thr_init().
+	 */
+	void		(*so_thr_done)(struct ptlrpc_thread *thr);
+	/**
+	 * Handler function for incoming requests for this service
+	 */
+	int		(*so_req_handler)(struct ptlrpc_request *req);
+	/**
+	 * function to determine priority of the request, it's called
+	 * on every new request
+	 */
+	int		(*so_hpreq_handler)(struct ptlrpc_request *);
+	/**
+	 * service-specific print fn
+	 */
+	void		(*so_req_printer)(void *, struct ptlrpc_request *);
+};
+
+#ifndef __cfs_cacheline_aligned
+/* NB: put it here for reducing patch dependence */
+# define __cfs_cacheline_aligned
+#endif
+
+/**
+ * How many high priority requests to serve before serving one normal
+ * priority request
+ */
+#define PTLRPC_SVC_HP_RATIO 10
+
+/**
+ * Definition of PortalRPC service.
+ * The service is listening on a particular portal (like a tcp port)
+ * and performs actions for a specific server like IO service for OST
+ * or general metadata service for MDS.
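+ *
+ * A hedged registration sketch (the service name, handler and the caller's
+ * variables are hypothetical; only the entry points exist in this header):
+ *
+ *	struct ptlrpc_service_conf conf = {
+ *		.psc_name	= "my_svc",
+ *		.psc_ops	= {
+ *			.so_req_handler	= my_req_handler,
+ *		},
+ *	};
+ *	svc = ptlrpc_register_service(&conf, parent_kset, debugfs_entry);
+ *	...
+ *	ptlrpc_unregister_service(svc);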
+ */
+struct ptlrpc_service {
+	/** serialize /proc operations */
+	spinlock_t			srv_lock;
+	/** most often accessed fields */
+	/** chain thru all services */
+	struct list_head		srv_list;
+	/** service operations table */
+	struct ptlrpc_service_ops	srv_ops;
+	/** only statically allocated strings here; we don't clean them */
+	char				*srv_name;
+	/** only statically allocated strings here; we don't clean them */
+	char				*srv_thread_name;
+	/** # of threads to create on each partition when initializing */
+	int				srv_nthrs_cpt_init;
+	/** limit on the number of threads for each partition */
+	int				srv_nthrs_cpt_limit;
+	/** Root of debugfs dir tree for this service */
+	struct dentry			*srv_debugfs_entry;
+	/** Pointer to statistic data for this service */
+	struct lprocfs_stats		*srv_stats;
+	/** # hp per lp reqs to handle */
+	int				srv_hpreq_ratio;
+	/** biggest request to receive */
+	int				srv_max_req_size;
+	/** biggest reply to send */
+	int				srv_max_reply_size;
+	/** size of individual buffers */
+	int				srv_buf_size;
+	/** # buffers to allocate in 1 group */
+	int				srv_nbuf_per_group;
+	/** Local portal on which to receive requests */
+	__u32				srv_req_portal;
+	/** Portal on the client to send replies to */
+	__u32				srv_rep_portal;
+	/**
+	 * Tags for lu_context associated with this thread, see struct
+	 * lu_context.
+	 */
+	__u32				srv_ctx_tags;
+	/** soft watchdog timeout multiplier */
+	int				srv_watchdog_factor;
+	/** under unregister_service */
+	unsigned			srv_is_stopping:1;
+	/** Whether or not to restrict service threads to CPUs in this CPT */
+	unsigned			srv_cpt_bind:1;
+
+	/** max # request buffers */
+	int				srv_nrqbds_max;
+	/** max # request buffers in history per partition */
+	int				srv_hist_nrqbds_cpt_max;
+	/** number of CPTs this service is associated with */
+	int				srv_ncpts;
+	/** CPTs array this service is associated with */
+	__u32				*srv_cpts;
+	/** 2^srv_cpt_bits >= cfs_cpt_number(srv_cptable) */
+	int				srv_cpt_bits;
+	/** CPT table this service is running over */
+	struct cfs_cpt_table		*srv_cptable;
+
+	/* sysfs object */
+	struct kobject			srv_kobj;
+	struct completion		srv_kobj_unregister;
+	/**
+	 * partition data for ptlrpc service
+	 */
+	struct ptlrpc_service_part	*srv_parts[0];
+};
+
+/**
+ * Definition of PortalRPC service partition data.
+ * Although a service only has one instance of it right now, we will have
+ * multiple instances very soon (one instance per CPT).
+ *
+ * it has four locks:
+ * \a scp_lock
+ *	serialize operations on rqbd and requests waiting for preprocess
+ * \a scp_req_lock
+ *	serialize operations on active requests sent to this portal
+ * \a scp_at_lock
+ *	serialize adaptive timeout stuff
+ * \a scp_rep_lock
+ *	serialize operations on the RS list (reply states)
+ *
+ * We don't have any use-case to take two or more locks at the same time
+ * for now, so there is no lock order issue.
+ */
+struct ptlrpc_service_part {
+	/** back reference to owner */
+	struct ptlrpc_service		*scp_service __cfs_cacheline_aligned;
+	/* CPT id, reserved */
+	int				scp_cpt;
+	/** always increasing number */
+	int				scp_thr_nextid;
+	/** # of starting threads */
+	int				scp_nthrs_starting;
+	/** # running threads */
+	int				scp_nthrs_running;
+	/** service threads list */
+	struct list_head		scp_threads;
+
+	/**
+	 * serialize the following fields, used for protecting
+	 * rqbd list and incoming requests waiting for preprocess,
+	 * threads starting & stopping are also protected by this lock.
+ */ + spinlock_t scp_lock __cfs_cacheline_aligned; + /** userland serialization */ + struct mutex scp_mutex; + /** total # req buffer descs allocated */ + int scp_nrqbds_total; + /** # posted request buffers for receiving */ + int scp_nrqbds_posted; + /** in progress of allocating rqbd */ + int scp_rqbd_allocating; + /** # incoming reqs */ + int scp_nreqs_incoming; + /** request buffers to be reposted */ + struct list_head scp_rqbd_idle; + /** req buffers receiving */ + struct list_head scp_rqbd_posted; + /** incoming reqs */ + struct list_head scp_req_incoming; + /** timeout before re-posting reqs, in jiffies */ + long scp_rqbd_timeout; + /** + * all threads sleep on this. This wait-queue is signalled when new + * incoming request arrives and when difficult reply has to be handled. + */ + wait_queue_head_t scp_waitq; + + /** request history */ + struct list_head scp_hist_reqs; + /** request buffer history */ + struct list_head scp_hist_rqbds; + /** # request buffers in history */ + int scp_hist_nrqbds; + /** sequence number for request */ + __u64 scp_hist_seq; + /** highest seq culled from history */ + __u64 scp_hist_seq_culled; + + /** + * serialize the following fields, used for processing requests + * sent to this portal + */ + spinlock_t scp_req_lock __cfs_cacheline_aligned; + /** # reqs in either of the NRS heads below */ + /** # reqs being served */ + int scp_nreqs_active; + /** # HPreqs being served */ + int scp_nhreqs_active; + /** # hp requests handled */ + int scp_hreq_count; + + /** NRS head for regular requests */ + struct ptlrpc_nrs scp_nrs_reg; + /** NRS head for HP requests; this is only valid for services that can + * handle HP requests */ + struct ptlrpc_nrs *scp_nrs_hp; + + /** AT stuff */ + /** @{ */ + /** + * serialize the following fields, used for changes on + * adaptive timeout + */ + spinlock_t scp_at_lock __cfs_cacheline_aligned; + /** estimated rpc service time */ + struct adaptive_timeout scp_at_estimate; + /** reqs waiting for replies */ + struct ptlrpc_at_array scp_at_array; + /** early reply timer */ + struct timer_list scp_at_timer; + /** debug */ + ktime_t scp_at_checktime; + /** check early replies */ + unsigned scp_at_check; + /** @} */ + + /** + * serialize the following fields, used for processing + * replies for this portal + */ + spinlock_t scp_rep_lock __cfs_cacheline_aligned; + /** all the active replies */ + struct list_head scp_rep_active; + /** List of free reply_states */ + struct list_head scp_rep_idle; + /** waitq to run, when adding stuff to srv_free_rs_list */ + wait_queue_head_t scp_rep_waitq; + /** # 'difficult' replies */ + atomic_t scp_nreps_difficult; +}; + +#define ptlrpc_service_for_each_part(part, i, svc) \ + for (i = 0; \ + i < (svc)->srv_ncpts && \ + (svc)->srv_parts != NULL && \ + ((part) = (svc)->srv_parts[i]) != NULL; i++) + +/** + * Declaration of ptlrpcd control structure + */ +struct ptlrpcd_ctl { + /** + * Ptlrpc thread control flags (LIOD_START, LIOD_STOP, LIOD_FORCE) + */ + unsigned long pc_flags; + /** + * Thread lock protecting structure fields. + */ + spinlock_t pc_lock; + /** + * Start completion. + */ + struct completion pc_starting; + /** + * Stop completion. + */ + struct completion pc_finishing; + /** + * Thread requests set. + */ + struct ptlrpc_request_set *pc_set; + /** + * Thread name used in kthread_run() + */ + char pc_name[16]; + /** + * CPT the thread is bound on. + */ + int pc_cpt; + /** + * Index of ptlrpcd thread in the array. 
+	 */
+	int			 pc_index;
+	/**
+	 * Pointer to the array of partners' ptlrpcd_ctl structure.
+	 */
+	struct ptlrpcd_ctl	**pc_partners;
+	/**
+	 * Number of the ptlrpcd's partners.
+	 */
+	int			 pc_npartners;
+	/**
+	 * Record the partner index to be processed next.
+	 */
+	int			 pc_cursor;
+	/**
+	 * Error code if the thread failed to fully start.
+	 */
+	int			 pc_error;
+};
+
+/* Bits for pc_flags */
+enum ptlrpcd_ctl_flags {
+	/**
+	 * Ptlrpc thread start flag.
+	 */
+	LIOD_START	= BIT(0),
+	/**
+	 * Ptlrpc thread stop flag.
+	 */
+	LIOD_STOP	= BIT(1),
+	/**
+	 * Ptlrpc thread force flag (only stop force so far).
+	 * This will cause any in-flight rpcs handled by the thread
+	 * to be aborted if LIOD_STOP is specified.
+	 */
+	LIOD_FORCE	= BIT(2),
+	/**
+	 * This is a recovery ptlrpc thread.
+	 */
+	LIOD_RECOVERY	= BIT(3),
+};
+
+/**
+ * \addtogroup nrs
+ * @{
+ *
+ * Service compatibility function; the policy is compatible with all services.
+ *
+ * \param[in] svc  The service the policy is attempting to register with.
+ * \param[in] desc The policy descriptor
+ *
+ * \retval true The policy is compatible with the service
+ *
+ * \see ptlrpc_nrs_pol_desc::pd_compat()
+ */
+static inline bool nrs_policy_compat_all(const struct ptlrpc_service *svc,
+					 const struct ptlrpc_nrs_pol_desc *desc)
+{
+	return true;
+}
+
+/**
+ * Service compatibility function; the policy is compatible only with a
+ * specific service which is identified by its human-readable name at
+ * ptlrpc_service::srv_name.
+ *
+ * \param[in] svc  The service the policy is attempting to register with.
+ * \param[in] desc The policy descriptor
+ *
+ * \retval false The policy is not compatible with the service
+ * \retval true  The policy is compatible with the service
+ *
+ * \see ptlrpc_nrs_pol_desc::pd_compat()
+ */
+static inline bool nrs_policy_compat_one(const struct ptlrpc_service *svc,
+					 const struct ptlrpc_nrs_pol_desc *desc)
+{
+	LASSERT(desc->pd_compat_svc_name != NULL);
+	return strcmp(svc->srv_name, desc->pd_compat_svc_name) == 0;
+}
+
+/** @} nrs */
+
+/* ptlrpc/events.c */
+extern int ptlrpc_uuid_to_peer(struct obd_uuid *uuid,
+			       struct lnet_process_id *peer, lnet_nid_t *self);
+/**
+ * These callbacks are invoked by LNet when something happens to the
+ * underlying buffer
+ * @{
+ */
+extern void request_out_callback(struct lnet_event *ev);
+extern void reply_in_callback(struct lnet_event *ev);
+extern void client_bulk_callback(struct lnet_event *ev);
+extern void request_in_callback(struct lnet_event *ev);
+extern void reply_out_callback(struct lnet_event *ev);
+#ifdef HAVE_SERVER_SUPPORT
+extern void server_bulk_callback(struct lnet_event *ev);
+#endif
+/** @} */
+
+/* ptlrpc/connection.c */
+struct ptlrpc_connection *ptlrpc_connection_get(struct lnet_process_id peer,
+						lnet_nid_t self,
+						struct obd_uuid *uuid);
+
+static inline void ptlrpc_connection_put(struct ptlrpc_connection *conn)
+{
+	if (!conn)
+		return;
+
+	LASSERT(atomic_read(&conn->c_refcount) > 0);
+
+	/*
+	 * We do not remove the connection from the hashtable and
+	 * do not free it even if the last caller released its ref,
+	 * as we want to have it cached for the case it is
+	 * needed again.
+	 *
+	 * Deallocating it and later creating a new connection
+	 * again would be wasteful. This way we also avoid
+	 * expensive locking to protect things from a get/put
+	 * race when a found cached connection is freed by
+	 * ptlrpc_connection_put().
+	 *
+	 * It will be freed later at module unload time,
+	 * when the ptlrpc_connection_fini()->lh_exit->conn_exit()
+	 * path is called.
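+	 *
+	 * Typical pairing, sketched (the "peer", "self" and "uuid" values
+	 * are hypothetical):
+	 *
+	 *	conn = ptlrpc_connection_get(peer, self, uuid);
+	 *	...
+	 *	ptlrpc_connection_put(conn);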
+ */ + atomic_dec(&conn->c_refcount); + + CDEBUG(D_INFO, "PUT conn=%p refcount %d to %s\n", + conn, atomic_read(&conn->c_refcount), + libcfs_nidstr(&conn->c_peer.nid)); +} + +struct ptlrpc_connection *ptlrpc_connection_addref(struct ptlrpc_connection *); +int ptlrpc_connection_init(void); +void ptlrpc_connection_fini(void); +extern lnet_pid_t ptl_get_pid(void); + +/* + * Check if the peer connection is on the local node. We need to use GFP_NOFS + * for requests from a local client to avoid recursing into the filesystem + * as we might end up waiting on a page sent in the request we're serving. + * + * Use __GFP_HIGHMEM so that the pages can use all of the available memory + * on 32-bit machines. Use more aggressive GFP_HIGHUSER flags from non-local + * clients to be able to generate more memory pressure on the OSS and allow + * inactive pages to be reclaimed, since it doesn't have any other processes + * or allocations that generate memory reclaim pressure. + * + * See b=17576 (bdf50dc9) and b=19529 (3dcf18d3) for details. + */ +static inline bool ptlrpc_connection_is_local(struct ptlrpc_connection *conn) +{ + if (!conn) + return false; + + if (nid_same(&conn->c_peer.nid, &conn->c_self)) + return true; + + RETURN(LNetIsPeerLocal(lnet_nid_to_nid4(&conn->c_peer.nid))); +} + +/* ptlrpc/niobuf.c */ +/** + * Actual interfacing with LNet to put/get/register/unregister stuff + * @{ + */ +#ifdef HAVE_SERVER_SUPPORT +struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_exp(struct ptlrpc_request *req, + unsigned nfrags, unsigned max_brw, + unsigned int type, + unsigned portal, + const struct ptlrpc_bulk_frag_ops + *ops); +int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc); +void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *desc); + +static inline int ptlrpc_server_bulk_active(struct ptlrpc_bulk_desc *desc) +{ + int rc; + + LASSERT(desc != NULL); + + spin_lock(&desc->bd_lock); + rc = desc->bd_refs; + spin_unlock(&desc->bd_lock); + return rc; +} +#endif + +int ptlrpc_register_bulk(struct ptlrpc_request *req); +int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async); + +static inline int ptlrpc_client_bulk_active(struct ptlrpc_request *req) +{ + struct ptlrpc_bulk_desc *desc; + int rc; + + LASSERT(req != NULL); + desc = req->rq_bulk; + + if (!desc) + return 0; + + if (req->rq_bulk_deadline > ktime_get_real_seconds()) + return 1; + + + spin_lock(&desc->bd_lock); + rc = desc->bd_refs; + spin_unlock(&desc->bd_lock); + return rc; +} + +#define PTLRPC_REPLY_MAYBE_DIFFICULT 0x01 +#define PTLRPC_REPLY_EARLY 0x02 +int ptlrpc_send_reply(struct ptlrpc_request *req, int flags); +int ptlrpc_reply(struct ptlrpc_request *req); +int ptlrpc_send_error(struct ptlrpc_request *req, int difficult); +int ptlrpc_error(struct ptlrpc_request *req); +int ptlrpc_at_get_net_latency(struct ptlrpc_request *req); +int ptl_send_rpc(struct ptlrpc_request *request, int noreply); +int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd); +/** @} */ + +/* ptlrpc/client.c */ +/** + * Client-side portals API. Everything to send requests, receive replies, + * request queues, request management, etc. 
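+ *
+ * A minimal synchronous client sketch (illustrative; the import "imp" is
+ * hypothetical, RQF_OBD_PING, LUSTRE_OBD_VERSION and OBD_PING come from
+ * other Lustre headers, and error handling is elided):
+ *
+ *	req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING,
+ *					LUSTRE_OBD_VERSION, OBD_PING);
+ *	if (req == NULL)
+ *		return -ENOMEM;
+ *	ptlrpc_request_set_replen(req);
+ *	rc = ptlrpc_queue_wait(req);
+ *	ptlrpc_req_finished(req);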
+ * @{ + */ +void ptlrpc_request_committed(struct ptlrpc_request *req, int force); + +void ptlrpc_init_client(int req_portal, int rep_portal, const char *name, + struct ptlrpc_client *); +void ptlrpc_cleanup_client(struct obd_import *imp); +struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid, + lnet_nid_t nid4refnet); + +int ptlrpc_queue_wait(struct ptlrpc_request *req); +int ptlrpc_replay_req(struct ptlrpc_request *req); +void ptlrpc_restart_req(struct ptlrpc_request *req); +void ptlrpc_abort_inflight(struct obd_import *imp); +void ptlrpc_cleanup_imp(struct obd_import *imp); +void ptlrpc_abort_set(struct ptlrpc_request_set *set); + +struct ptlrpc_request_set *ptlrpc_prep_set(void); +struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func, + void *arg); +int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set); +int ptlrpc_set_wait(const struct lu_env *env, struct ptlrpc_request_set *); +void ptlrpc_set_destroy(struct ptlrpc_request_set *); +void ptlrpc_set_add_req(struct ptlrpc_request_set *, struct ptlrpc_request *); +#define PTLRPCD_SET ((struct ptlrpc_request_set *)1) + +void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool); +int ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq); + +struct ptlrpc_request_pool * +ptlrpc_init_rq_pool(int, int, + int (*populate_pool)(struct ptlrpc_request_pool *, int)); + +void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req); +struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp, + const struct req_format *format); +struct ptlrpc_request *ptlrpc_request_alloc_pool(struct obd_import *imp, + struct ptlrpc_request_pool *, + const struct req_format *format); +void ptlrpc_request_free(struct ptlrpc_request *request); +int ptlrpc_request_pack(struct ptlrpc_request *request, + __u32 version, int opcode); +struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp, + const struct req_format *format, + __u32 version, int opcode); +int ptlrpc_request_bufs_pack(struct ptlrpc_request *request, + __u32 version, int opcode, char **bufs, + struct ptlrpc_cli_ctx *ctx); +void ptlrpc_req_finished(struct ptlrpc_request *request); +void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request); +struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req); +struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req, + unsigned nfrags, unsigned max_brw, + unsigned int type, + unsigned portal, + const struct ptlrpc_bulk_frag_ops + *ops); + +void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc, + struct page *page, int pageoffset, int len, + int pin); + +void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *bulk); + +static inline void ptlrpc_release_bulk_noop(struct ptlrpc_bulk_desc *desc) +{ +} + +void ptlrpc_retain_replayable_request(struct ptlrpc_request *req, + struct obd_import *imp); +__u64 ptlrpc_next_xid(void); +__u64 ptlrpc_sample_next_xid(void); +__u64 ptlrpc_req_xid(struct ptlrpc_request *request); +void ptlrpc_get_mod_rpc_slot(struct ptlrpc_request *req); +void ptlrpc_put_mod_rpc_slot(struct ptlrpc_request *req); + +/* Set of routines to run a function in ptlrpcd context */ +void *ptlrpcd_alloc_work(struct obd_import *imp, + int (*cb)(const struct lu_env *, void *), void *data); +void ptlrpcd_destroy_work(void *handler); +int ptlrpcd_queue_work(void *handler); + +/** @} */ +struct ptlrpc_service_buf_conf { + /* nbufs is buffers # to allocate when growing the pool */ + unsigned int bc_nbufs; + /* buffer size to 
post */
+	unsigned int			bc_buf_size;
+	/* portal to listen for requests on */
+	unsigned int			bc_req_portal;
+	/* portal of where to send replies to */
+	unsigned int			bc_rep_portal;
+	/* maximum request size to be accepted for this service */
+	unsigned int			bc_req_max_size;
+	/* maximum reply size this service can ever send */
+	unsigned int			bc_rep_max_size;
+};
+
+struct ptlrpc_service_thr_conf {
+	/* threadname should be 8 characters or less - 6 will be added on */
+	char				*tc_thr_name;
+	/* threads increasing factor for each CPU */
+	unsigned int			tc_thr_factor;
+	/* service threads # to start on each partition while initializing */
+	unsigned int			tc_nthrs_init;
+	/*
+	 * low water of threads # upper-limit on each partition while running,
+	 * service availability may be impacted if the number of threads is
+	 * lower than this value. It can be ZERO if the service doesn't require
+	 * CPU affinity or there is only one partition.
+	 */
+	unsigned int			tc_nthrs_base;
+	/* "soft" limit for total threads number */
+	unsigned int			tc_nthrs_max;
+	/* user specified threads number, it will be validated against
+	 * the other members of this structure. */
+	unsigned int			tc_nthrs_user;
+	/* bind service threads to only CPUs in their associated CPT */
+	unsigned int			tc_cpu_bind;
+	/* Tags for lu_context associated with service thread */
+	__u32				tc_ctx_tags;
+};
+
+struct ptlrpc_service_cpt_conf {
+	struct cfs_cpt_table		*cc_cptable;
+	/* string pattern to describe CPTs for a service */
+	char				*cc_pattern;
+	/* whether or not to have per-CPT service partitions */
+	bool				cc_affinity;
+};
+
+struct ptlrpc_service_conf {
+	/* service name */
+	char				*psc_name;
+	/* soft watchdog timeout multiplier to print stuck service traces */
+	unsigned int			psc_watchdog_factor;
+	/* buffer information */
+	struct ptlrpc_service_buf_conf	psc_buf;
+	/* thread information */
+	struct ptlrpc_service_thr_conf	psc_thr;
+	/* CPU partition information */
+	struct ptlrpc_service_cpt_conf	psc_cpt;
+	/* function table */
+	struct ptlrpc_service_ops	psc_ops;
+};
+
+/* ptlrpc/service.c */
+/**
+ * Server-side services API.
Register/unregister service, request state + * management, service thread management + * + * @{ + */ +void ptlrpc_save_lock(struct ptlrpc_request *req, struct lustre_handle *lock, + int mode, bool no_ack, bool convert_lock); +void ptlrpc_commit_replies(struct obd_export *exp); +void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs); +void ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs); +int ptlrpc_hpreq_handler(struct ptlrpc_request *req); +struct ptlrpc_service *ptlrpc_register_service( + struct ptlrpc_service_conf *conf, + struct kset *parent, + struct dentry *debugfs_entry); + +int ptlrpc_unregister_service(struct ptlrpc_service *service); +int ptlrpc_service_health_check(struct ptlrpc_service *); +void ptlrpc_server_drop_request(struct ptlrpc_request *req); +void ptlrpc_request_change_export(struct ptlrpc_request *req, + struct obd_export *export); +void ptlrpc_update_export_timer(struct obd_export *exp, + time64_t extra_delay); + +int ptlrpc_hr_init(void); +void ptlrpc_hr_fini(void); + +void ptlrpc_watchdog_init(struct delayed_work *work, timeout_t timeout); +void ptlrpc_watchdog_disable(struct delayed_work *work); +void ptlrpc_watchdog_touch(struct delayed_work *work, timeout_t timeout); + +/** @} */ + +/* ptlrpc/import.c */ +/** + * Import API + * @{ + */ +int ptlrpc_connect_import(struct obd_import *imp); +int ptlrpc_connect_import_locked(struct obd_import *imp); +int ptlrpc_init_import(struct obd_import *imp); +int ptlrpc_disconnect_import(struct obd_import *imp, int noclose); +int ptlrpc_disconnect_and_idle_import(struct obd_import *imp); +int ptlrpc_import_recovery_state_machine(struct obd_import *imp); +void deuuidify(char *uuid, const char *prefix, char **uuid_start, + int *uuid_len); +void ptlrpc_import_enter_resend(struct obd_import *imp); +/* ptlrpc/pack_generic.c */ +int ptlrpc_reconnect_import(struct obd_import *imp); +/** @} */ + +/** + * ptlrpc msg buffer and swab interface + * + * @{ + */ +#define PTLRPC_MAX_BUFCOUNT \ + (sizeof(((struct ptlrpc_request *)0)->rq_req_swab_mask) * 8) +#define MD_MAX_BUFLEN (MDS_REG_MAXREQSIZE > OUT_MAXREQSIZE ? \ + MDS_REG_MAXREQSIZE : OUT_MAXREQSIZE) +#define PTLRPC_MAX_BUFLEN (OST_IO_MAXREQSIZE > MD_MAX_BUFLEN ? 
\ + OST_IO_MAXREQSIZE : MD_MAX_BUFLEN) +int ptlrpc_unpack_rep_msg(struct ptlrpc_request *req, int len); +int ptlrpc_unpack_req_msg(struct ptlrpc_request *req, int len); + +int lustre_msg_check_version(struct lustre_msg *msg, __u32 version); +void lustre_init_msg_v2(struct lustre_msg_v2 *msg, int count, __u32 *lens, + char **bufs); +int lustre_pack_request(struct ptlrpc_request *, __u32 magic, int count, + __u32 *lens, char **bufs); +int lustre_pack_reply(struct ptlrpc_request *, int count, __u32 *lens, + char **bufs); +int lustre_pack_reply_v2(struct ptlrpc_request *req, int count, + __u32 *lens, char **bufs, int flags); +#define LPRFL_EARLY_REPLY 1 +int lustre_pack_reply_flags(struct ptlrpc_request *, int count, __u32 *lens, + char **bufs, int flags); +int lustre_shrink_msg(struct lustre_msg *msg, int segment, + unsigned int newlen, int move_data); +int lustre_grow_msg(struct lustre_msg *msg, int segment, unsigned int newlen); +void lustre_free_reply_state(struct ptlrpc_reply_state *rs); +int __lustre_unpack_msg(struct lustre_msg *m, int len); +__u32 lustre_msg_hdr_size(__u32 magic, __u32 count); +__u32 lustre_msg_size(__u32 magic, int count, __u32 *lengths); +__u32 lustre_msg_size_v2(int count, __u32 *lengths); +__u32 lustre_packed_msg_size(struct lustre_msg *msg); +extern __u32 lustre_msg_early_size; +void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, __u32 n, __u32 min_size); +void *lustre_msg_buf(struct lustre_msg *m, __u32 n, __u32 minlen); +__u32 lustre_msg_buflen(struct lustre_msg *m, __u32 n); +void lustre_msg_set_buflen(struct lustre_msg *m, __u32 n, __u32 len); +__u32 lustre_msg_bufcount(struct lustre_msg *m); +char *lustre_msg_string(struct lustre_msg *m, __u32 n, __u32 max_len); +__u32 lustre_msghdr_get_flags(struct lustre_msg *msg); +void lustre_msghdr_set_flags(struct lustre_msg *msg, __u32 flags); +__u32 lustre_msg_get_flags(struct lustre_msg *msg); +void lustre_msg_add_flags(struct lustre_msg *msg, __u32 flags); +void lustre_msg_set_flags(struct lustre_msg *msg, __u32 flags); +void lustre_msg_clear_flags(struct lustre_msg *msg, __u32 flags); +__u32 lustre_msg_get_op_flags(struct lustre_msg *msg); +void lustre_msg_add_op_flags(struct lustre_msg *msg, __u32 flags); +struct lustre_handle *lustre_msg_get_handle(struct lustre_msg *msg); +__u32 lustre_msg_get_type(struct lustre_msg *msg); +enum lustre_msg_version lustre_msg_get_version(struct lustre_msg *msg); +void lustre_msg_add_version(struct lustre_msg *msg, __u32 version); +__u32 lustre_msg_get_opc(struct lustre_msg *msg); +__u64 lustre_msg_get_last_xid(struct lustre_msg *msg); +__u16 lustre_msg_get_tag(struct lustre_msg *msg); +__u64 lustre_msg_get_last_committed(struct lustre_msg *msg); +__u64 *lustre_msg_get_versions(struct lustre_msg *msg); +__u64 lustre_msg_get_transno(struct lustre_msg *msg); +__u64 lustre_msg_get_slv(struct lustre_msg *msg); +__u32 lustre_msg_get_limit(struct lustre_msg *msg); +void lustre_msg_set_slv(struct lustre_msg *msg, __u64 slv); +void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit); +int lustre_msg_get_status(struct lustre_msg *msg); +__u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg); +__u32 lustre_msg_get_magic(struct lustre_msg *msg); +timeout_t lustre_msg_get_timeout(struct lustre_msg *msg); +timeout_t lustre_msg_get_service_timeout(struct lustre_msg *msg); +char *lustre_msg_get_jobid(struct lustre_msg *msg); +__u32 lustre_msg_get_cksum(struct lustre_msg *msg); +__u64 lustre_msg_get_mbits(struct lustre_msg *msg); +__u32 lustre_msg_calc_cksum(struct lustre_msg *msg, __u32 buf); 
+void lustre_msg_set_handle(struct lustre_msg *msg,struct lustre_handle *handle); +void lustre_msg_set_type(struct lustre_msg *msg, __u32 type); +void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc); +void lustre_msg_set_last_xid(struct lustre_msg *msg, __u64 last_xid); +void lustre_msg_set_tag(struct lustre_msg *msg, __u16 tag); +void lustre_msg_set_last_committed(struct lustre_msg *msg,__u64 last_committed); +void lustre_msg_set_versions(struct lustre_msg *msg, __u64 *versions); +void lustre_msg_set_transno(struct lustre_msg *msg, __u64 transno); +void lustre_msg_set_status(struct lustre_msg *msg, __u32 status); +void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt); +void ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, __u32 *sizes); +void ptlrpc_request_set_replen(struct ptlrpc_request *req); +void lustre_msg_set_timeout(struct lustre_msg *msg, timeout_t timeout); +void lustre_msg_set_service_timeout(struct lustre_msg *msg, + timeout_t service_timeout); +void lustre_msg_set_jobid(struct lustre_msg *msg, char *jobid); +void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum); +void lustre_msg_set_mbits(struct lustre_msg *msg, __u64 mbits); + +static inline void +lustre_shrink_reply(struct ptlrpc_request *req, int segment, + unsigned int newlen, int move_data) +{ + LASSERT(req->rq_reply_state); + LASSERT(req->rq_repmsg); + req->rq_replen = lustre_shrink_msg(req->rq_repmsg, segment, + newlen, move_data); +} + +#ifdef LUSTRE_TRANSLATE_ERRNOS + +static inline int ptlrpc_status_hton(int h) +{ + /* + * Positive errnos must be network errnos, such as LUSTRE_EDEADLK, + * ELDLM_LOCK_ABORTED, etc. + */ + if (h < 0) + return -lustre_errno_hton(-h); + else + return h; +} + +static inline int ptlrpc_status_ntoh(int n) +{ + /* + * See the comment in ptlrpc_status_hton(). 
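+	 *
+	 * For example (a sketch, not taken from this header), a status
+	 * received off the wire is converted before being stored locally:
+	 *
+	 *	req->rq_status = ptlrpc_status_ntoh(lustre_msg_get_status(msg));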
+ */ + if (n < 0) + return -lustre_errno_ntoh(-n); + else + return n; +} + +#else + +#define ptlrpc_status_hton(h) (h) +#define ptlrpc_status_ntoh(n) (n) + +#endif +/** @} */ + +/** Change request phase of \a req to \a new_phase */ +static inline void +ptlrpc_rqphase_move(struct ptlrpc_request *req, enum rq_phase new_phase) +{ + if (req->rq_phase == new_phase) + return; + + if (new_phase == RQ_PHASE_UNREG_RPC || + new_phase == RQ_PHASE_UNREG_BULK) { + /* No embedded unregistering phases */ + if (req->rq_phase == RQ_PHASE_UNREG_RPC || + req->rq_phase == RQ_PHASE_UNREG_BULK) + return; + + req->rq_next_phase = req->rq_phase; + if (req->rq_import) + atomic_inc(&req->rq_import->imp_unregistering); + } + + if (req->rq_phase == RQ_PHASE_UNREG_RPC || + req->rq_phase == RQ_PHASE_UNREG_BULK) { + if (req->rq_import) + atomic_dec(&req->rq_import->imp_unregistering); + } + + DEBUG_REQ(D_INFO, req, "move request phase from %s to %s", + ptlrpc_rqphase2str(req), ptlrpc_phase2str(new_phase)); + + req->rq_phase = new_phase; +} + +/** + * Returns true if request \a req got early reply and hard deadline is not met + */ +static inline int +ptlrpc_client_early(struct ptlrpc_request *req) +{ + return req->rq_early; +} + +/** + * Returns true if we got real reply from server for this request + */ +static inline int +ptlrpc_client_replied(struct ptlrpc_request *req) +{ + if (req->rq_reply_deadline > ktime_get_real_seconds()) + return 0; + return req->rq_replied; +} + +/** Returns true if request \a req is in process of receiving server reply */ +static inline int +ptlrpc_client_recv(struct ptlrpc_request *req) +{ + if (req->rq_reply_deadline > ktime_get_real_seconds()) + return 1; + return req->rq_receiving_reply; +} + +#define ptlrpc_cli_wait_unlink(req) __ptlrpc_cli_wait_unlink(req, NULL) + +static inline int +__ptlrpc_cli_wait_unlink(struct ptlrpc_request *req, bool *discard) +{ + int rc; + + spin_lock(&req->rq_lock); + if (req->rq_reply_deadline > ktime_get_real_seconds()) { + spin_unlock(&req->rq_lock); + return 1; + } + if (req->rq_req_deadline > ktime_get_real_seconds()) { + spin_unlock(&req->rq_lock); + return 1; + } + + if (discard) { + *discard = false; + if (req->rq_reply_unlinked && req->rq_req_unlinked == 0) { + *discard = true; + spin_unlock(&req->rq_lock); + return 1; /* Should call again after LNetMDUnlink */ + } + } + + rc = !req->rq_req_unlinked || !req->rq_reply_unlinked || + req->rq_receiving_reply; + spin_unlock(&req->rq_lock); + return rc; +} + +static inline void +ptlrpc_client_wake_req(struct ptlrpc_request *req) +{ + smp_mb(); + if (req->rq_set == NULL) + wake_up(&req->rq_reply_waitq); + else + wake_up(&req->rq_set->set_waitq); +} + +static inline void +ptlrpc_rs_addref(struct ptlrpc_reply_state *rs) +{ + LASSERT(atomic_read(&rs->rs_refcount) > 0); + atomic_inc(&rs->rs_refcount); +} + +static inline void +ptlrpc_rs_decref(struct ptlrpc_reply_state *rs) +{ + LASSERT(atomic_read(&rs->rs_refcount) > 0); + if (atomic_dec_and_test(&rs->rs_refcount)) + lustre_free_reply_state(rs); +} + +/* Should only be called once per req */ +static inline void ptlrpc_req_drop_rs(struct ptlrpc_request *req) +{ + if (req->rq_reply_state == NULL) + return; /* shouldn't occur */ + + /* req_repmsg equals rq_reply_state->rs_msg, + * so set it to NULL before rq_reply_state is possibly freed + */ + spin_lock(&req->rq_early_free_lock); + req->rq_repmsg = NULL; + spin_unlock(&req->rq_early_free_lock); + + ptlrpc_rs_decref(req->rq_reply_state); + req->rq_reply_state = NULL; +} + +static inline __u32 
lustre_request_magic(struct ptlrpc_request *req) +{ + return lustre_msg_get_magic(req->rq_reqmsg); +} + +static inline int ptlrpc_req_get_repsize(struct ptlrpc_request *req) +{ + switch (req->rq_reqmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return req->rq_reqmsg->lm_repsize; + default: + LASSERTF(0, "incorrect message magic: %08x\n", + req->rq_reqmsg->lm_magic); + return -EFAULT; + } +} + +static inline int ptlrpc_send_limit_expired(struct ptlrpc_request *req) +{ + if (req->rq_delay_limit != 0 && + req->rq_queued_time + req->rq_delay_limit < ktime_get_seconds()) + return 1; + return 0; +} + +static inline int ptlrpc_no_resend(struct ptlrpc_request *req) +{ + if (!req->rq_no_resend && ptlrpc_send_limit_expired(req)) { + spin_lock(&req->rq_lock); + req->rq_no_resend = 1; + spin_unlock(&req->rq_lock); + } + return req->rq_no_resend; +} + +static inline int +ptlrpc_server_get_timeout(struct ptlrpc_service_part *svcpt) +{ + int at = AT_OFF ? 0 : at_get(&svcpt->scp_at_estimate); + + return svcpt->scp_service->srv_watchdog_factor * + max_t(int, at, obd_timeout); +} + +/** + * Calculate the amount of time for lock prolongation. + * + * This is helper function to get the timeout extra time. + * + * @req current request + * + * Return: amount of time to extend the timeout with + */ +static inline timeout_t prolong_timeout(struct ptlrpc_request *req) +{ + struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; + timeout_t req_timeout = 0; + + if (AT_OFF) + return obd_timeout / 2; + + if (req->rq_deadline > req->rq_arrival_time.tv_sec) + req_timeout = req->rq_deadline - req->rq_arrival_time.tv_sec; + + return max(req_timeout, + at_est2timeout(at_get(&svcpt->scp_at_estimate))); +} + +static inline struct ptlrpc_service * +ptlrpc_req2svc(struct ptlrpc_request *req) +{ + LASSERT(req->rq_rqbd != NULL); + return req->rq_rqbd->rqbd_svcpt->scp_service; +} + +/* ldlm/ldlm_lib.c */ +/** + * Target client logic + * @{ + */ +int client_obd_setup(struct obd_device *obd, struct lustre_cfg *lcfg); +int client_obd_cleanup(struct obd_device *obd); +int client_connect_import(const struct lu_env *env, + struct obd_export **exp, struct obd_device *obd, + struct obd_uuid *cluuid, struct obd_connect_data *, + void *localdata); +int client_disconnect_export(struct obd_export *exp); +int client_import_add_conn(struct obd_import *imp, struct obd_uuid *uuid, + int priority); +int client_import_dyn_add_conn(struct obd_import *imp, struct obd_uuid *uuid, + lnet_nid_t prim_nid, int priority); +int client_import_add_nids_to_conn(struct obd_import *imp, lnet_nid_t *nids, + int nid_count, struct obd_uuid *uuid); +int client_import_del_conn(struct obd_import *imp, struct obd_uuid *uuid); +int client_import_find_conn(struct obd_import *imp, lnet_nid_t peer, + struct obd_uuid *uuid); +int import_set_conn_priority(struct obd_import *imp, struct obd_uuid *uuid); +void client_destroy_import(struct obd_import *imp); +/** @} */ + +#ifdef HAVE_SERVER_SUPPORT +int server_disconnect_export(struct obd_export *exp); +#endif + +/* ptlrpc/pinger.c */ +/** + * Pinger API (client side only) + * @{ + */ +enum timeout_event { + TIMEOUT_GRANT = 1 +}; +struct timeout_item; +typedef int (*timeout_cb_t)(struct timeout_item *, void *); +int ptlrpc_pinger_add_import(struct obd_import *imp); +int ptlrpc_pinger_del_import(struct obd_import *imp); +struct ptlrpc_request * ptlrpc_prep_ping(struct obd_import *imp); +int ptlrpc_obd_ping(struct obd_device *obd); +void ping_evictor_start(void); +void ping_evictor_stop(void); +void 
ptlrpc_pinger_ir_up(void); +void ptlrpc_pinger_ir_down(void); +/** @} */ +int ptlrpc_pinger_suppress_pings(void); + +/* ptlrpc/ptlrpcd.c */ +void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force); +void ptlrpcd_free(struct ptlrpcd_ctl *pc); +void ptlrpcd_wake(struct ptlrpc_request *req); +void ptlrpcd_add_req(struct ptlrpc_request *req); +void ptlrpcd_add_rqset(struct ptlrpc_request_set *set); +int ptlrpcd_addref(void); +void ptlrpcd_decref(void); + +/* ptlrpc/lproc_ptlrpc.c */ +/** + * procfs output related functions + * @{ + */ +const char* ll_opcode2str(__u32 opcode); +const int ll_str2opcode(const char *ops); +#ifdef CONFIG_PROC_FS +void ptlrpc_lprocfs_register_obd(struct obd_device *obd); +void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd); +void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes); +#else +static inline void ptlrpc_lprocfs_register_obd(struct obd_device *obd) {} +static inline void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd) {} +static inline void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes) {} +#endif +/** @} */ + +/* ptlrpc/llog_server.c */ +int llog_origin_handle_open(struct ptlrpc_request *req); +int llog_origin_handle_prev_block(struct ptlrpc_request *req); +int llog_origin_handle_next_block(struct ptlrpc_request *req); +int llog_origin_handle_read_header(struct ptlrpc_request *req); + +/* ptlrpc/llog_client.c */ +extern const struct llog_operations llog_client_ops; +/** @} net */ + +#endif +/** @} PtlRPC */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_nodemap.h b/drivers/staging/lustrefsx/lustre/include/lustre_nodemap.h new file mode 100644 index 0000000000000..80f123a8e2277 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_nodemap.h @@ -0,0 +1,241 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (C) 2013, Trustees of Indiana University + * + * Copyright (c) 2017, Intel Corporation. 
+ *
+ * Author: Joshua Walgenbach
+ */
+
+#ifndef _LUSTRE_NODEMAP_H
+#define _LUSTRE_NODEMAP_H
+
+#include
+
+#define LUSTRE_NODEMAP_NAME		"nodemap"
+
+#define LUSTRE_NODEMAP_DEFAULT_ID	0
+
+/** enums containing the types of ids contained in a nodemap
+ * kept so other modules (mgs, mdt, etc) can define the type
+ * of search easily
+ */
+
+enum nodemap_id_type {
+	NODEMAP_UID,
+	NODEMAP_GID,
+	NODEMAP_PROJID,
+};
+
+enum nodemap_tree_type {
+	NODEMAP_FS_TO_CLIENT,
+	NODEMAP_CLIENT_TO_FS,
+};
+
+enum nodemap_mapping_modes {
+	NODEMAP_MAP_BOTH_LEGACY	= 0x0,	/* for compatibility */
+	NODEMAP_MAP_UID		= 0x01,
+	NODEMAP_MAP_GID		= 0x02,
+	NODEMAP_MAP_BOTH	= 0x03,	/* for compatibility */
+	NODEMAP_MAP_PROJID	= 0x04,
+	NODEMAP_MAP_ALL		= NODEMAP_MAP_UID |
+				  NODEMAP_MAP_GID |
+				  NODEMAP_MAP_PROJID,
+};
+
+struct nodemap_pde {
+	char			 npe_name[LUSTRE_NODEMAP_NAME_LENGTH + 1];
+	struct proc_dir_entry	*npe_proc_entry;
+	struct list_head	 npe_list_member;
+};
+
+/** The nodemap id 0 will be the default nodemap. It will have a configuration
+ * set by the MGS, but no ranges will be allowed as all NIDs that do not map
+ * will be added to the default nodemap
+ */
+
+struct lu_nodemap {
+	/* human readable ID */
+	char			nm_name[LUSTRE_NODEMAP_NAME_LENGTH + 1];
+	/* flags to govern nodemap behavior */
+	bool			nmf_trust_client_ids:1,
+				nmf_deny_unknown:1,
+				nmf_allow_root_access:1,
+				nmf_enable_audit:1,
+				nmf_forbid_encryption:1;
+	/* bitmap for mapping type */
+	enum nodemap_mapping_modes
+				nmf_map_mode;
+	/* unique ID set by MGS */
+	unsigned int		nm_id;
+	/* nodemap ref counter */
+	atomic_t		nm_refcount;
+	/* UID to squash unmapped UIDs */
+	uid_t			nm_squash_uid;
+	/* GID to squash unmapped GIDs */
+	gid_t			nm_squash_gid;
+	/* PROJID to squash unmapped PROJIDs */
+	projid_t		nm_squash_projid;
+	/* NID range list */
+	struct list_head	nm_ranges;
+	/* lock for idmap red/black trees */
+	struct rw_semaphore	nm_idmap_lock;
+	/* UID map keyed by local UID */
+	struct rb_root		nm_fs_to_client_uidmap;
+	/* UID map keyed by remote UID */
+	struct rb_root		nm_client_to_fs_uidmap;
+	/* GID map keyed by local GID */
+	struct rb_root		nm_fs_to_client_gidmap;
+	/* GID map keyed by remote GID */
+	struct rb_root		nm_client_to_fs_gidmap;
+	/* PROJID map keyed by local PROJID */
+	struct rb_root		nm_fs_to_client_projidmap;
+	/* PROJID map keyed by remote PROJID */
+	struct rb_root		nm_client_to_fs_projidmap;
+	/* attached client members of this nodemap */
+	struct mutex		nm_member_list_lock;
+	struct list_head	nm_member_list;
+	/* access by nodemap name */
+	struct hlist_node	nm_hash;
+	struct nodemap_pde	*nm_pde_data;
+	/* fileset the nodes of this nodemap are restricted to */
+	char			nm_fileset[PATH_MAX+1];
+	/* information about the expected SELinux policy on the nodes */
+	char			nm_sepol[LUSTRE_NODEMAP_SEPOL_LENGTH + 1];
+
+	/* used when loading/unloading nodemaps */
+	struct list_head	nm_list;
+};
+
+/* Store handles to local MGC storage to save config locally. In future
+ * versions of nodemap, mgc will receive the config directly and so this might
+ * not be needed.
+ */ +struct nm_config_file { + struct local_oid_storage *ncf_los; + struct dt_object *ncf_obj; + struct list_head ncf_list; +}; + +void nodemap_activate(const bool value); +int nodemap_add(const char *nodemap_name); +int nodemap_del(const char *nodemap_name); +int nodemap_add_member(lnet_nid_t nid, struct obd_export *exp); +void nodemap_del_member(struct obd_export *exp); +int nodemap_parse_range(const char *range_string, lnet_nid_t range[2]); +int nodemap_parse_idmap(char *idmap_string, __u32 idmap[2]); +int nodemap_add_range(const char *name, const lnet_nid_t nid[2]); +int nodemap_del_range(const char *name, const lnet_nid_t nid[2]); +int nodemap_set_allow_root(const char *name, bool allow_root); +int nodemap_set_trust_client_ids(const char *name, bool trust_client_ids); +int nodemap_set_deny_unknown(const char *name, bool deny_unknown); +int nodemap_set_mapping_mode(const char *name, + enum nodemap_mapping_modes map_mode); +int nodemap_set_squash_uid(const char *name, uid_t uid); +int nodemap_set_squash_gid(const char *name, gid_t gid); +int nodemap_set_squash_projid(const char *name, projid_t projid); +int nodemap_set_audit_mode(const char *name, bool enable_audit); +int nodemap_set_forbid_encryption(const char *name, bool forbid_encryption); +bool nodemap_can_setquota(struct lu_nodemap *nodemap, __u32 qc_type, __u32 id); +int nodemap_add_idmap(const char *name, enum nodemap_id_type id_type, + const __u32 map[2]); +int nodemap_del_idmap(const char *name, enum nodemap_id_type id_type, + const __u32 map[2]); +int nodemap_set_fileset(const char *name, const char *fileset); +char *nodemap_get_fileset(const struct lu_nodemap *nodemap); +int nodemap_set_sepol(const char *name, const char *sepol); +const char *nodemap_get_sepol(const struct lu_nodemap *nodemap); +__u32 nodemap_map_id(struct lu_nodemap *nodemap, + enum nodemap_id_type id_type, + enum nodemap_tree_type tree_type, __u32 id); +ssize_t nodemap_map_acl(struct lu_nodemap *nodemap, void *buf, size_t size, + enum nodemap_tree_type tree_type); +#ifdef HAVE_SERVER_SUPPORT +void nodemap_test_nid(lnet_nid_t nid, char *name_buf, size_t name_len); +#else +#define nodemap_test_nid(nid, name_buf, name_len) do {} while(0) +#endif +int nodemap_test_id(lnet_nid_t nid, enum nodemap_id_type idtype, + __u32 client_id, __u32 *fs_id); + +struct nm_config_file *nm_config_file_register_mgs(const struct lu_env *env, + struct dt_object *obj, + struct local_oid_storage *los); +struct dt_device; +struct nm_config_file *nm_config_file_register_tgt(const struct lu_env *env, + struct dt_device *dev, + struct local_oid_storage *los); +void nm_config_file_deregister_mgs(const struct lu_env *env, + struct nm_config_file *ncf); +void nm_config_file_deregister_tgt(const struct lu_env *env, + struct nm_config_file *ncf); +struct lu_nodemap *nodemap_get_from_exp(struct obd_export *exp); +void nodemap_putref(struct lu_nodemap *nodemap); + +#ifdef HAVE_SERVER_SUPPORT + +struct nodemap_range_tree { + struct interval_tree_root nmrt_range_interval_root; + unsigned int nmrt_range_highest_id; +}; + +struct nodemap_config { + /* Highest numerical lu_nodemap.nm_id defined */ + unsigned int nmc_nodemap_highest_id; + + /* Simple flag to determine if nodemaps are active */ + bool nmc_nodemap_is_active; + + /* Pointer to default nodemap as it is needed more often */ + struct lu_nodemap *nmc_default_nodemap; + + /** + * Lock required to access the range tree. 
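+ * Taken shared while a client NID is resolved against the tree, and
+ * exclusive while ranges are inserted or removed, e.g. on behalf of
+ * nodemap_add_range()/nodemap_del_range() above (an illustrative
+ * reading; the exact call sites live in the nodemap implementation).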
+ */ + struct rw_semaphore nmc_range_tree_lock; + struct nodemap_range_tree nmc_range_tree; + + /** + * Hash keyed on nodemap name containing all + * nodemaps + */ + struct cfs_hash *nmc_nodemap_hash; +}; + +struct nodemap_config *nodemap_config_alloc(void); +void nodemap_config_dealloc(struct nodemap_config *config); +void nodemap_config_set_active_mgc(struct nodemap_config *config); + +int nodemap_process_idx_pages(struct nodemap_config *config, union lu_page *lip, + struct lu_nodemap **recent_nodemap); + +#else /* disable nodemap processing in MGC of non-servers */ +static inline int nodemap_process_idx_pages(void *config, + union lu_page *lip, + struct lu_nodemap **recent_nodemap) +{ return 0; } +#endif /* HAVE_SERVER_SUPPORT */ + +int nodemap_get_config_req(struct obd_device *mgs_obd, + struct ptlrpc_request *req); +#endif /* _LUSTRE_NODEMAP_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_nrs.h b/drivers/staging/lustrefsx/lustre/include/lustre_nrs.h new file mode 100644 index 0000000000000..80f9f6b4a2b3a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_nrs.h @@ -0,0 +1,752 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2014, Intel Corporation. + * + * Copyright 2012 Xyratex Technology Limited + */ +/* + * + * Network Request Scheduler (NRS) + * + */ + +#ifndef _LUSTRE_NRS_H +#define _LUSTRE_NRS_H + +/** + * \defgroup nrs Network Request Scheduler + * @{ + */ +struct ptlrpc_nrs_policy; +struct ptlrpc_nrs_resource; +struct ptlrpc_nrs_request; + +/** + * NRS control operations. + * + * These are common for all policies. + */ +enum ptlrpc_nrs_ctl { + /** + * Not a valid opcode. + */ + PTLRPC_NRS_CTL_INVALID, + /** + * Activate the policy. + */ + PTLRPC_NRS_CTL_START, + /** + * Reserved for multiple primary policies, which may be a possibility + * in the future. + */ + PTLRPC_NRS_CTL_STOP, + /** + * Policies can start using opcodes from this value and onwards for + * their own purposes; the assigned value itself is arbitrary. + */ + PTLRPC_NRS_CTL_1ST_POL_SPEC = 0x20, +}; + +/** + * NRS policy operations. + * + * These determine the behaviour of a policy, and are called in response to + * NRS core events. + */ +struct ptlrpc_nrs_pol_ops { + /** + * Called during policy registration; this operation is optional. + * + * \param[in,out] policy The policy being initialized + */ + int (*op_policy_init) (struct ptlrpc_nrs_policy *policy); + /** + * Called during policy unregistration; this operation is optional. + * + * \param[in,out] policy The policy being unregistered/finalized + */ + void (*op_policy_fini) (struct ptlrpc_nrs_policy *policy); + /** + * Called when activating a policy via lprocfs; policies allocate and + * initialize their resources here; this operation is optional. 
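+ *
+ * For instance, activation is typically driven from userspace through
+ * lprocfs; an administrator command such as (illustrative)
+ *
+ *   lctl set_param ost.OSS.ost_io.nrs_policies="crrn"
+ *
+ * would start the CRR-N instance on the ost_io service, at which point
+ * this hook runs.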
+ *
+ * \param[in,out] policy The policy being started
+ * \param[in,out] arg A generic char buffer
+ *
+ * \see nrs_policy_start_locked()
+ */
+ int (*op_policy_start) (struct ptlrpc_nrs_policy *policy,
+ char *arg);
+ /**
+ * Called when deactivating a policy via lprocfs; policies deallocate
+ * their resources here; this operation is optional
+ *
+ * \param[in,out] policy The policy being stopped
+ *
+ * \see nrs_policy_stop0()
+ */
+ void (*op_policy_stop) (struct ptlrpc_nrs_policy *policy);
+ /**
+ * Used for policy-specific operations; i.e. not generic ones like
+ * \e PTLRPC_NRS_CTL_START and \e PTLRPC_NRS_CTL_GET_INFO; analogous
+ * to an ioctl; this operation is optional.
+ *
+ * \param[in,out] policy The policy carrying out operation \a opc
+ * \param[in] opc The command operation being carried out
+ * \param[in,out] arg A generic buffer for communication between the
+ * user and the control operation
+ *
+ * \retval -ve error
+ * \retval 0 success
+ *
+ * \see ptlrpc_nrs_policy_control()
+ */
+ int (*op_policy_ctl) (struct ptlrpc_nrs_policy *policy,
+ enum ptlrpc_nrs_ctl opc, void *arg);
+
+ /**
+ * Called when obtaining references to the resources of the resource
+ * hierarchy for a request that has arrived for handling at the PTLRPC
+ * service. Policies should return -ve for requests they do not wish
+ * to handle. This operation is mandatory.
+ *
+ * \param[in,out] policy The policy we're getting resources for.
+ * \param[in,out] nrq The request we are getting resources for.
+ * \param[in] parent The parent resource of the resource being
+ * requested; set to NULL if none.
+ * \param[out] resp The resource is to be returned here; the
+ * fallback policy in an NRS head should
+ * \e always return a non-NULL pointer value.
+ * \param[in] moving_req When set, signifies that this is an attempt
+ * to obtain resources for a request being moved
+ * to the high-priority NRS head by
+ * ldlm_lock_reorder_req().
+ * This implies two things:
+ * 1. We are under obd_export::exp_rpc_lock and
+ * so should not sleep.
+ * 2. We should not perform non-idempotent
+ * operations, and may skip idempotent
+ * operations that were already carried out
+ * when resources were first taken for the
+ * request, i.e. when it was initialized in
+ * ptlrpc_nrs_req_initialize().
+ *
+ * \retval 0, +ve The level of the returned resource in the resource
+ * hierarchy; currently only 0 (for a non-leaf resource)
+ * and 1 (for a leaf resource) are supported by the
+ * framework.
+ * \retval -ve error
+ *
+ * \see ptlrpc_nrs_req_initialize()
+ * \see ptlrpc_nrs_hpreq_add_nolock()
+ * \see ptlrpc_nrs_req_hp_move()
+ */
+ int (*op_res_get) (struct ptlrpc_nrs_policy *policy,
+ struct ptlrpc_nrs_request *nrq,
+ const struct ptlrpc_nrs_resource *parent,
+ struct ptlrpc_nrs_resource **resp,
+ bool moving_req);
+ /**
+ * Called when releasing references taken for resources in the resource
+ * hierarchy for the request; this operation is optional.
+ *
+ * \param[in,out] policy The policy the resource belongs to
+ * \param[in] res The resource to be freed
+ *
+ * \see ptlrpc_nrs_req_finalize()
+ * \see ptlrpc_nrs_hpreq_add_nolock()
+ * \see ptlrpc_nrs_req_hp_move()
+ */
+ void (*op_res_put) (struct ptlrpc_nrs_policy *policy,
+ const struct ptlrpc_nrs_resource *res);
+
+ /**
+ * Obtains a request for handling from the policy, and optionally
+ * removes the request from the policy; this operation is mandatory.
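+ *
+ * A hedged sketch, using only the FIFO types added in this patch set
+ * (fifo_req_get is an illustrative name, not necessarily the real
+ * implementation):
+ *
+ *   static struct ptlrpc_nrs_request *
+ *   fifo_req_get(struct ptlrpc_nrs_policy *policy, bool peek, bool force)
+ *   {
+ *           struct nrs_fifo_head *head = policy->pol_private;
+ *           struct ptlrpc_nrs_request *nrq;
+ *
+ *           nrq = list_first_entry_or_null(&head->fh_list,
+ *                                          struct ptlrpc_nrs_request,
+ *                                          nr_u.fifo.fr_list);
+ *           if (nrq != NULL && !peek)
+ *                   list_del_init(&nrq->nr_u.fifo.fr_list);
+ *           return nrq;
+ *   }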
+ *
+ * \param[in,out] policy The policy to poll
+ * \param[in] peek When set, signifies that we just want to
+ * examine the request, and not handle it, so the
+ * request is not removed from the policy.
+ * \param[in] force When set, it will force a policy to return a
+ * request if it has one queued.
+ *
+ * \retval NULL No request available for handling
+ * \retval valid-pointer The request polled for handling
+ *
+ * \see ptlrpc_nrs_req_get_nolock()
+ */
+ struct ptlrpc_nrs_request *
+ (*op_req_get) (struct ptlrpc_nrs_policy *policy, bool peek,
+ bool force);
+ /**
+ * Called when attempting to add a request to a policy for later
+ * handling; this operation is mandatory.
+ *
+ * \param[in,out] policy The policy on which to enqueue \a nrq
+ * \param[in,out] nrq The request to enqueue
+ *
+ * \retval 0 success
+ * \retval != 0 error
+ *
+ * \see ptlrpc_nrs_req_add_nolock()
+ */
+ int (*op_req_enqueue) (struct ptlrpc_nrs_policy *policy,
+ struct ptlrpc_nrs_request *nrq);
+ /**
+ * Removes a request from the policy's set of pending requests. Normally
+ * called after a request has been polled successfully from the policy
+ * for handling; this operation is mandatory.
+ *
+ * \param[in,out] policy The policy the request \a nrq belongs to
+ * \param[in,out] nrq The request to dequeue
+ *
+ * \see ptlrpc_nrs_req_del_nolock()
+ */
+ void (*op_req_dequeue) (struct ptlrpc_nrs_policy *policy,
+ struct ptlrpc_nrs_request *nrq);
+ /**
+ * Called after a request has been handled. Can be used for
+ * job/resource control; this operation is optional.
+ *
+ * \param[in,out] policy The policy that has finished handling request
+ * \a nrq
+ * \param[in,out] nrq The request
+ *
+ * \pre assert_spin_locked(&svcpt->scp_req_lock)
+ *
+ * \see ptlrpc_nrs_req_stop_nolock()
+ */
+ void (*op_req_stop) (struct ptlrpc_nrs_policy *policy,
+ struct ptlrpc_nrs_request *nrq);
+ /**
+ * Registers the policy's lprocfs interface with a PTLRPC service.
+ *
+ * \param[in] svc The service
+ *
+ * \retval 0 success
+ * \retval != 0 error
+ */
+ int (*op_lprocfs_init) (struct ptlrpc_service *svc);
+ /**
+ * Unregisters the policy's lprocfs interface with a PTLRPC service.
+ *
+ * In cases of failed policy registration in
+ * \e ptlrpc_nrs_policy_register(), this function may be called for a
+ * service which has not registered the policy successfully, so
+ * implementations of this method should make sure their operations are
+ * safe in such cases.
+ *
+ * \param[in] svc The service
+ */
+ void (*op_lprocfs_fini) (struct ptlrpc_service *svc);
+};
+
+/**
+ * Policy flags
+ */
+enum nrs_policy_flags {
+ /**
+ * Fallback policy, use this flag only on a single supported policy per
+ * service. The flag cannot be used on policies that use
+ * \e PTLRPC_NRS_FL_REG_EXTERN
+ */
+ PTLRPC_NRS_FL_FALLBACK = BIT(0),
+ /**
+ * Start policy immediately after registering.
+ */
+ PTLRPC_NRS_FL_REG_START = BIT(1),
+ /**
+ * This is a policy registering from a module different to the one NRS
+ * core ships in (currently ptlrpc).
+ */
+ PTLRPC_NRS_FL_REG_EXTERN = BIT(2),
+};
+
+/**
+ * NRS queue type.
+ *
+ * Denotes whether an NRS instance is for handling normal or high-priority
+ * RPCs, or whether an operation pertains to one or both of the NRS instances
+ * in a service.
+ */
+enum ptlrpc_nrs_queue_type {
+ PTLRPC_NRS_QUEUE_REG = BIT(0),
+ PTLRPC_NRS_QUEUE_HP = BIT(1),
+ PTLRPC_NRS_QUEUE_BOTH = (PTLRPC_NRS_QUEUE_REG | PTLRPC_NRS_QUEUE_HP)
+};
+
+/**
+ * NRS head
+ *
+ * A PTLRPC service has at least one NRS head instance for handling normal
+ * priority RPCs, and may optionally have a second NRS head instance for
+ * handling high-priority RPCs. Each NRS head maintains a list of available
+ * policies, of which one and only one policy is acting as the fallback policy,
+ * and optionally a different policy may be acting as the primary policy. For
+ * all RPCs handled by this NRS head instance, NRS core will first attempt to
+ * enqueue the RPC using the primary policy (if any). The fallback policy is
+ * used in the following cases:
+ * - when there was no primary policy in the
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the request
+ * was initialized.
+ * - when the primary policy that was at the
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the
+ * RPC was initialized indicated that it did not wish, or for some other
+ * reason was not able to handle the request, by returning a non-valid NRS
+ * resource reference.
+ * - when the primary policy that was at the
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the
+ * RPC was initialized fails later during the request enqueueing stage.
+ *
+ * \see nrs_resource_get_safe()
+ * \see nrs_request_enqueue()
+ */
+struct ptlrpc_nrs {
+ spinlock_t nrs_lock;
+ /** XXX Possibly replace svcpt->scp_req_lock with another lock here. */
+ /**
+ * List of registered policies
+ */
+ struct list_head nrs_policy_list;
+ /**
+ * List of policies with queued requests. Policies that have any
+ * outstanding requests are queued here, and this list is queried
+ * in a round-robin manner from NRS core when obtaining a request
+ * for handling. This ensures that requests from policies that at some
+ * point transition away from the
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state are drained.
+ */
+ struct list_head nrs_policy_queued;
+ /**
+ * Service partition for this NRS head
+ */
+ struct ptlrpc_service_part *nrs_svcpt;
+ /**
+ * Primary policy, which is the preferred policy for handling RPCs
+ */
+ struct ptlrpc_nrs_policy *nrs_policy_primary;
+ /**
+ * Fallback policy, which is the backup policy for handling RPCs
+ */
+ struct ptlrpc_nrs_policy *nrs_policy_fallback;
+ /**
+ * This NRS head handles either HP or regular requests
+ */
+ enum ptlrpc_nrs_queue_type nrs_queue_type;
+ /**
+ * # queued requests from all policies in this NRS head
+ */
+ unsigned long nrs_req_queued;
+ /**
+ * # started requests from all policies in this NRS head
+ */
+ unsigned long nrs_req_started;
+ /**
+ * # policies on this NRS
+ */
+ unsigned nrs_num_pols;
+ /**
+ * This NRS head is in progress of starting a policy
+ */
+ unsigned nrs_policy_starting:1;
+ /**
+ * In progress of shutting down the whole NRS head; used during
+ * unregistration
+ */
+ unsigned nrs_stopping:1;
+ /**
+ * NRS policy is throttling requests
+ */
+ unsigned nrs_throttling:1;
+};
+
+#define NRS_POL_NAME_MAX 16
+#define NRS_POL_ARG_MAX 16
+
+struct ptlrpc_nrs_pol_desc;
+
+/**
+ * Service compatibility predicate; this determines whether a policy is adequate
+ * for handling RPCs of a particular PTLRPC service.
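+ *
+ * A minimal sketch of the two common shapes of predicate (bodies are
+ * illustrative; only nrs_policy_compat_one() is referenced by name below):
+ *
+ *   static bool nrs_policy_compat_all(const struct ptlrpc_service *svc,
+ *                                     const struct ptlrpc_nrs_pol_desc *desc)
+ *   {
+ *           return true; /* usable by every service */
+ *   }
+ *
+ *   static bool nrs_policy_compat_one(const struct ptlrpc_service *svc,
+ *                                     const struct ptlrpc_nrs_pol_desc *desc)
+ *   {
+ *           /* usable only by the one service named in the descriptor */
+ *           return strcmp(svc->srv_name, desc->pd_compat_svc_name) == 0;
+ *   }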
+ *
+ * XXX: This should give the same result during policy registration and
+ * unregistration, and for all partitions of a service; so the result should not
+ * depend on temporal service or other properties that may influence the
+ * result.
+ */
+typedef bool (*nrs_pol_desc_compat_t) (const struct ptlrpc_service *svc,
+ const struct ptlrpc_nrs_pol_desc *desc);
+
+struct ptlrpc_nrs_pol_conf {
+ /**
+ * Human-readable policy name
+ */
+ char nc_name[NRS_POL_NAME_MAX];
+ /**
+ * NRS operations for this policy
+ */
+ const struct ptlrpc_nrs_pol_ops *nc_ops;
+ /**
+ * Service compatibility predicate
+ */
+ nrs_pol_desc_compat_t nc_compat;
+ /**
+ * Set for policies that support a single ptlrpc service, i.e. ones that
+ * have \a pd_compat set to nrs_policy_compat_one(). The variable value
+ * holds the name of the single service that such policies are
+ * compatible with.
+ */
+ const char *nc_compat_svc_name;
+ /**
+ * Owner module for this policy descriptor; policies registering from a
+ * different module to the one the NRS framework is held within
+ * (currently ptlrpc), should set this field to THIS_MODULE.
+ */
+ struct module *nc_owner;
+ /**
+ * Policy registration flags; a bitmask of \e nrs_policy_flags
+ */
+ unsigned nc_flags;
+};
+
+/**
+ * NRS policy registering descriptor
+ *
+ * Is used to hold a description of a policy that can be passed to NRS core in
+ * order to register the policy with NRS heads in different PTLRPC services.
+ */
+struct ptlrpc_nrs_pol_desc {
+ /**
+ * Human-readable policy name
+ */
+ char pd_name[NRS_POL_NAME_MAX];
+ /**
+ * Link into nrs_core::nrs_policies
+ */
+ struct list_head pd_list;
+ /**
+ * NRS operations for this policy
+ */
+ const struct ptlrpc_nrs_pol_ops *pd_ops;
+ /**
+ * Service compatibility predicate
+ */
+ nrs_pol_desc_compat_t pd_compat;
+ /**
+ * Set for policies that are compatible with only one PTLRPC service.
+ *
+ * \see ptlrpc_nrs_pol_conf::nc_compat_svc_name
+ */
+ const char *pd_compat_svc_name;
+ /**
+ * Owner module for this policy descriptor.
+ *
+ * We need to hold a reference to the module whenever we might make use
+ * of any of the module's contents, i.e.
+ * - If one or more instances of the policy are at a state where they
+ * might be handling a request, i.e.
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED or
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING as we will have to
+ * call into the policy's ptlrpc_nrs_pol_ops() handlers. A reference
+ * is taken on the module when
+ * \e ptlrpc_nrs_pol_desc::pd_refs becomes 1, and released when it
+ * becomes 0, so that we hold only one reference to the module maximum
+ * at any time.
+ *
+ * We do not need to hold a reference to the module, even though we
+ * might use code and data from the module, in the following cases:
+ * - During external policy registration, because this should happen in
+ * the module's init() function, in which case the module is safe from
+ * removal because a reference is being held on the module by the
+ * kernel, and iirc kmod (and I guess module-init-tools also) will
+ * serialize any racing processes properly anyway.
+ * - During external policy unregistration, because this should happen
+ * in a module's exit() function, and any attempts to start a policy
+ * instance would need to take a reference on the module, and this is
+ * not possible once we have reached the point where the exit()
+ * handler is called.
+ * - During service registration and unregistration, as service setup
+ * and cleanup, and policy registration, unregistration and policy
+ * instance starting, are serialized by \e nrs_core::nrs_mutex, so
+ * as long as users adhere to the convention of registering policies
+ * in init() and unregistering them in module exit() functions, there
+ * should not be a race between these operations.
+ * - During any policy-specific lprocfs operations, because a reference
+ * is held by the kernel on a proc entry that has been entered by a
+ * syscall, so as long as proc entries are removed during
+ * unregistration time, then unregistration and lprocfs operations
+ * will be properly serialized.
+ */
+ struct module *pd_owner;
+ /**
+ * Bitmask of \e nrs_policy_flags
+ */
+ unsigned pd_flags;
+ /**
+ * # of references on this descriptor
+ */
+ atomic_t pd_refs;
+};
+
+/**
+ * NRS policy state
+ *
+ * Policies transition from one state to the other during their lifetime
+ */
+enum ptlrpc_nrs_pol_state {
+ /**
+ * Not a valid policy state.
+ */
+ NRS_POL_STATE_INVALID,
+ /**
+ * Policies are at this state either at the start of their life, or
+ * transition here when the user selects a different policy to act
+ * as the primary one.
+ */
+ NRS_POL_STATE_STOPPED,
+ /**
+ * Policy is in progress of stopping
+ */
+ NRS_POL_STATE_STOPPING,
+ /**
+ * Policy is in progress of starting
+ */
+ NRS_POL_STATE_STARTING,
+ /**
+ * A policy is in this state in two cases:
+ * - it is the fallback policy, which is always in this state.
+ * - it has been activated by the user; i.e. it is the primary policy.
+ */
+ NRS_POL_STATE_STARTED,
+};
+
+/**
+ * NRS policy information
+ *
+ * Used for obtaining information for the status of a policy via lprocfs
+ */
+struct ptlrpc_nrs_pol_info {
+ /**
+ * Policy name
+ */
+ char pi_name[NRS_POL_NAME_MAX];
+ /**
+ * Policy argument
+ */
+ char pi_arg[NRS_POL_ARG_MAX];
+ /**
+ * Current policy state
+ */
+ enum ptlrpc_nrs_pol_state pi_state;
+ /**
+ * # RPCs enqueued for later dispatching by the policy
+ */
+ long pi_req_queued;
+ /**
+ * # RPCs started for dispatch by the policy
+ */
+ long pi_req_started;
+ /**
+ * Is this a fallback policy?
+ */
+ unsigned pi_fallback:1;
+};
+
+/**
+ * NRS policy
+ *
+ * There is one instance of this for each policy in each NRS head of each
+ * PTLRPC service partition.
+ */
+struct ptlrpc_nrs_policy {
+ /**
+ * Linkage into the NRS head's list of policies,
+ * ptlrpc_nrs:nrs_policy_list
+ */
+ struct list_head pol_list;
+ /**
+ * Linkage into the NRS head's list of policies with enqueued
+ * requests ptlrpc_nrs:nrs_policy_queued
+ */
+ struct list_head pol_list_queued;
+ /**
+ * Current state of this policy
+ */
+ enum ptlrpc_nrs_pol_state pol_state;
+ /**
+ * Bitmask of nrs_policy_flags
+ */
+ unsigned pol_flags;
+ /**
+ * # RPCs enqueued for later dispatching by the policy
+ */
+ long pol_req_queued;
+ /**
+ * # RPCs started for dispatch by the policy
+ */
+ long pol_req_started;
+ /**
+ * Usage reference count taken on the policy instance
+ */
+ long pol_ref;
+ /**
+ * Human-readable policy argument
+ */
+ char pol_arg[NRS_POL_ARG_MAX];
+ /**
+ * The NRS head this policy has been created at
+ */
+ struct ptlrpc_nrs *pol_nrs;
+ /**
+ * Private policy data; varies by policy type
+ */
+ void *pol_private;
+ /**
+ * Policy descriptor for this policy instance.
+ */
+ struct ptlrpc_nrs_pol_desc *pol_desc;
+};
+
+/**
+ * NRS resource
+ *
+ * Resources are embedded into two types of NRS entities:
+ * - Inside NRS policies, in the policy's private data in
+ * ptlrpc_nrs_policy::pol_private
+ * - In objects that act as prime-level scheduling entities in different NRS
+ * policies; e.g. on a policy that performs round robin or similar order
+ * scheduling across client NIDs, there would be one NRS resource per unique
+ * client NID. On a policy which performs round robin scheduling across
+ * backend filesystem objects, there would be one resource associated with
+ * each of the backend filesystem objects partaking in the scheduling
+ * performed by the policy.
+ *
+ * NRS resources share a parent-child relationship, in which resources embedded
+ * in policy instances are the parent entities, with all scheduling entities
+ * a policy schedules across being the children, thus forming a simple resource
+ * hierarchy. This hierarchy may be extended with one or more levels in the
+ * future if the ability to have more than one primary policy is added.
+ *
+ * Upon request initialization, references to the then active NRS policies are
+ * taken and used to later handle the dispatching of the request with one of
+ * these policies.
+ *
+ * \see nrs_resource_get_safe()
+ * \see ptlrpc_nrs_req_add()
+ */
+struct ptlrpc_nrs_resource {
+ /**
+ * This NRS resource's parent; is NULL for resources embedded in NRS
+ * policy instances; i.e. those are top-level ones.
+ */
+ struct ptlrpc_nrs_resource *res_parent;
+ /**
+ * The policy associated with this resource.
+ */
+ struct ptlrpc_nrs_policy *res_policy;
+};
+
+enum {
+ NRS_RES_FALLBACK,
+ NRS_RES_PRIMARY,
+ NRS_RES_MAX
+};
+
+#include <lustre_nrs_fifo.h>
+/**
+ * Binary heap node.
+ *
+ * Objects of this type are embedded into objects of the ordered set that is to
+ * be maintained by a \e struct binheap instance.
+ */
+struct binheap_node {
+ /** Index into the binary tree */
+ unsigned int chn_index;
+};
+#ifdef HAVE_SERVER_SUPPORT
+#include <lustre_nrs_tbf.h>
+#include <lustre_nrs_crr.h>
+#include <lustre_nrs_orr.h>
+#endif /* HAVE_SERVER_SUPPORT */
+#include <lustre_nrs_delay.h>
+
+/**
+ * NRS request
+ *
+ * Instances of this object exist embedded within ptlrpc_request; the main
+ * purpose of this object is to hold references to the request's resources
+ * for the lifetime of the request, and to hold properties that policies
+ * use for determining the request's scheduling priority.
+ */
+struct ptlrpc_nrs_request {
+ /**
+ * The request's resource hierarchy.
+ */
+ struct ptlrpc_nrs_resource *nr_res_ptrs[NRS_RES_MAX];
+ /**
+ * Index into ptlrpc_nrs_request::nr_res_ptrs of the resource of the
+ * policy that was used to enqueue the request.
+ *
+ * \see nrs_request_enqueue()
+ */
+ unsigned nr_res_idx;
+ unsigned nr_initialized:1;
+ unsigned nr_enqueued:1;
+ unsigned nr_started:1;
+ unsigned nr_finalized:1;
+ struct binheap_node nr_node;
+
+ /**
+ * Policy-specific fields, used for determining a request's scheduling
+ * priority, and other supporting functionality.
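+ * Only the member that belongs to the policy which enqueued the request
+ * is meaningful at any given time; e.g. the FIFO policy links requests
+ * through nr_u.fifo.fr_list, while TBF uses nr_u.tbf.tr_list (an
+ * illustrative reading of the union below).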
+ */
+ union {
+ /**
+ * Fields for the FIFO policy
+ */
+ struct nrs_fifo_req fifo;
+#ifdef HAVE_SERVER_SUPPORT
+ /**
+ * CRR-N request definition
+ */
+ struct nrs_crrn_req crr;
+ /** ORR and TRR share the same request definition */
+ struct nrs_orr_req orr;
+ /**
+ * TBF request definition
+ */
+ struct nrs_tbf_req tbf;
+#endif /* HAVE_SERVER_SUPPORT */
+ /**
+ * Fields for the delay policy
+ */
+ struct nrs_delay_req delay;
+ } nr_u;
+ /**
+ * Externally-registering policies may want to use this to allocate
+ * their own request properties.
+ */
+ void *ext;
+};
+
+/** @} nrs */
+#endif
diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_nrs_crr.h b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_crr.h
new file mode 100644
index 0000000000000..c4c217bd52679
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_crr.h
@@ -0,0 +1,128 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License version 2 for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2014, Intel Corporation.
+ *
+ * Copyright 2012 Xyratex Technology Limited
+ */
+/*
+ *
+ * Network Request Scheduler (NRS) Client Round Robin over NIDs (CRR-N) policy
+ *
+ */
+
+#ifndef _LUSTRE_NRS_CRR_H
+#define _LUSTRE_NRS_CRR_H
+
+/**
+ * \name CRR-N
+ *
+ * CRR-N, Client Round Robin over NIDs
+ * @{
+ */
+#include <linux/rhashtable.h>
+
+/**
+ * private data structure for CRR-N NRS
+ */
+struct nrs_crrn_net {
+ struct ptlrpc_nrs_resource cn_res;
+ struct binheap *cn_binheap;
+ /* CRR-N NRS - NID hash body */
+ struct rhashtable cn_cli_hash;
+ /**
+ * Used when a new scheduling round commences, in order to synchronize
+ * all clients with the new round number.
+ */
+ __u64 cn_round;
+ /**
+ * Determines the relevant ordering amongst request batches within a
+ * scheduling round.
+ */
+ __u64 cn_sequence;
+ /**
+ * Round Robin quantum; the maximum number of RPCs that each request
+ * batch for each client can have in a scheduling round.
+ */
+ __u16 cn_quantum;
+};
+
+/**
+ * Object representing a client in CRR-N, as identified by its NID
+ */
+struct nrs_crrn_client {
+ struct ptlrpc_nrs_resource cc_res;
+ struct rhash_head cc_rhead;
+ lnet_nid_t cc_nid;
+ /**
+ * The round number against which this client is currently scheduling
+ * requests.
+ */
+ __u64 cc_round;
+ /**
+ * The sequence number used for requests scheduled by this client during
+ * the current round number.
+ */
+ __u64 cc_sequence;
+ atomic_t cc_ref;
+ /**
+ * Round Robin quantum; the maximum number of RPCs the client is allowed
+ * to schedule in a single batch of each round.
+ */
+ __u16 cc_quantum;
+ /**
+ * # of pending requests for this client, on all existing rounds
+ */
+ __u16 cc_active;
+};
+
+/**
+ * CRR-N NRS request definition
+ */
+struct nrs_crrn_req {
+ /**
+ * Round number for this request; shared with all other requests in the
+ * same batch.
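+ *
+ * Batches are dispatched in ascending (round, sequence) order; a hedged
+ * sketch of the comparison a binary heap of such requests could use
+ * (crrn_req_compare is an illustrative name):
+ *
+ *   static int crrn_req_compare(struct binheap_node *e1,
+ *                               struct binheap_node *e2)
+ *   {
+ *           struct ptlrpc_nrs_request *r1, *r2;
+ *
+ *           r1 = container_of(e1, struct ptlrpc_nrs_request, nr_node);
+ *           r2 = container_of(e2, struct ptlrpc_nrs_request, nr_node);
+ *
+ *           if (r1->nr_u.crr.cr_round != r2->nr_u.crr.cr_round)
+ *                   return r1->nr_u.crr.cr_round < r2->nr_u.crr.cr_round;
+ *           return r1->nr_u.crr.cr_sequence < r2->nr_u.crr.cr_sequence;
+ *   }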
+ */ + __u64 cr_round; + /** + * Sequence number for this request; shared with all other requests in + * the same batch. + */ + __u64 cr_sequence; +}; + +/** + * CRR-N policy operations. + */ +enum nrs_ctl_crr { + /** + * Read the RR quantum size of a CRR-N policy. + */ + NRS_CTL_CRRN_RD_QUANTUM = PTLRPC_NRS_CTL_1ST_POL_SPEC, + /** + * Write the RR quantum size of a CRR-N policy. + */ + NRS_CTL_CRRN_WR_QUANTUM, +}; + +/** @} CRR-N */ +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_nrs_delay.h b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_delay.h new file mode 100644 index 0000000000000..9ffbc51b988c0 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_delay.h @@ -0,0 +1,87 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2015, Cray Inc. All Rights Reserved. + * + * Copyright (c) 2015, Intel Corporation. + */ +/* + * + * Network Request Scheduler (NRS) Delay policy + * + */ + +#ifndef _LUSTRE_NRS_DELAY_H +#define _LUSTRE_NRS_DELAY_H + +/* \name delay + * + * Delay policy + * @{ + */ + +/** + * Private data structure for the delay policy + */ +struct nrs_delay_data { + struct ptlrpc_nrs_resource delay_res; + + /** + * Delayed requests are stored in this binheap until they are + * removed for handling. + */ + struct binheap *delay_binheap; + + /** + * Minimum service time + */ + __u32 min_delay; + + /** + * Maximum service time + */ + __u32 max_delay; + + /** + * We'll delay this percent of requests + */ + __u32 delay_pct; +}; + +struct nrs_delay_req { + /** + * This is the time at which a request becomes eligible for handling + */ + time64_t req_start_time; +}; + +enum nrs_ctl_delay { + NRS_CTL_DELAY_RD_MIN = PTLRPC_NRS_CTL_1ST_POL_SPEC, + NRS_CTL_DELAY_WR_MIN, + NRS_CTL_DELAY_RD_MAX, + NRS_CTL_DELAY_WR_MAX, + NRS_CTL_DELAY_RD_PCT, + NRS_CTL_DELAY_WR_PCT, +}; + +/** @} delay */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_nrs_fifo.h b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_fifo.h new file mode 100644 index 0000000000000..3b5418eac6c44 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_fifo.h @@ -0,0 +1,70 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2014, Intel Corporation. + * + * Copyright 2012 Xyratex Technology Limited + */ +/* + * + * Network Request Scheduler (NRS) First-in First-out (FIFO) policy + * + */ + +#ifndef _LUSTRE_NRS_FIFO_H +#define _LUSTRE_NRS_FIFO_H + +/* \name fifo + * + * FIFO policy + * + * This policy is a logical wrapper around previous, non-NRS functionality. + * It dispatches RPCs in the same order as they arrive from the network. This + * policy is currently used as the fallback policy, and the only enabled policy + * on all NRS heads of all PTLRPC service partitions. + * @{ + */ + +/** + * Private data structure for the FIFO policy + */ +struct nrs_fifo_head { + /** + * Resource object for policy instance. + */ + struct ptlrpc_nrs_resource fh_res; + /** + * List of queued requests. + */ + struct list_head fh_list; + /** + * For debugging purposes. + */ + __u64 fh_sequence; +}; + +struct nrs_fifo_req { + struct list_head fr_list; + __u64 fr_sequence; +}; + +/** @} fifo */ +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_nrs_orr.h b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_orr.h new file mode 100644 index 0000000000000..df3d16ab8b1c9 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_orr.h @@ -0,0 +1,225 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2014, Intel Corporation. + * + * Copyright 2012 Xyratex Technology Limited + */ +/* + * + * Network Request Scheduler (NRS) Object-based Round Robin and Target-based + * Round Robin (ORR and TRR) policies + * + */ + +#ifndef _LUSTRE_NRS_ORR_H +#define _LUSTRE_NRS_ORR_H + +/** + * ORR policy operations + */ +enum nrs_ctl_orr { + NRS_CTL_ORR_RD_QUANTUM = PTLRPC_NRS_CTL_1ST_POL_SPEC, + NRS_CTL_ORR_WR_QUANTUM, + NRS_CTL_ORR_RD_OFF_TYPE, + NRS_CTL_ORR_WR_OFF_TYPE, + NRS_CTL_ORR_RD_SUPP_REQ, + NRS_CTL_ORR_WR_SUPP_REQ, +}; + +/** + * \name ORR/TRR + * + * ORR/TRR (Object-based Round Robin/Target-based Round Robin) NRS policies + * @{ + */ + +/** + * Lower and upper byte offsets of a brw RPC + */ +struct nrs_orr_req_range { + __u64 or_start; + __u64 or_end; +}; + +/** + * RPC types supported by the ORR/TRR policies + */ +enum nrs_orr_supp { + NOS_OST_READ = BIT(0), + NOS_OST_WRITE = BIT(1), + NOS_OST_RW = (NOS_OST_READ | NOS_OST_WRITE), + /** + * Default value for policies. + */ + NOS_DFLT = NOS_OST_READ +}; + +/** + * As unique keys for grouping RPCs together, we use the object's OST FID for + * the ORR policy, and the OST index for the TRR policy. 
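+ *
+ * For example (illustrative values): two reads of the same object always
+ * share one ORR batch via ok_fid, regardless of which service thread
+ * handles them, while under TRR every RPC destined for, say, OST index 3
+ * falls into the ok_idx == 3 batch, grouping all objects on that target.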
+ *
+ * XXX: We waste some space for TRR policy instances by using a union, but it
+ * allows us to consolidate some of the code between ORR and TRR, and these
+ * policies will probably eventually merge into one anyway.
+ */
+struct nrs_orr_key {
+ union {
+ /** object FID for ORR */
+ struct lu_fid ok_fid;
+ /** OST index for TRR */
+ __u32 ok_idx;
+ };
+};
+
+/**
+ * The largest base string for unique hash/slab object names is
+ * "nrs_orr_reg_", so 13 characters. We add 3 to this to be used for the CPT
+ * id number, so this _should_ be more than enough for the maximum number of
+ * CPTs on any system. If it does happen that this statement is incorrect,
+ * nrs_orr_genobjname() will inevitably yield a non-unique name and cause
+ * kmem_cache_create() to complain (on Linux), so the erroneous situation
+ * will hopefully not go unnoticed.
+ */
+#define NRS_ORR_OBJ_NAME_MAX (sizeof("nrs_orr_reg_") + 3)
+
+/**
+ * private data structure for ORR and TRR NRS
+ */
+struct nrs_orr_data {
+ struct ptlrpc_nrs_resource od_res;
+ struct binheap *od_binheap;
+ struct cfs_hash *od_obj_hash;
+ struct kmem_cache *od_cache;
+ /**
+ * Used when a new scheduling round commences, in order to synchronize
+ * all object or OST batches with the new round number.
+ */
+ __u64 od_round;
+ /**
+ * Determines the relevant ordering amongst request batches within a
+ * scheduling round.
+ */
+ __u64 od_sequence;
+ /**
+ * RPC types that are currently supported.
+ */
+ enum nrs_orr_supp od_supp;
+ /**
+ * Round Robin quantum; the maximum number of RPCs that each request
+ * batch for each object or OST can have in a scheduling round.
+ */
+ __u16 od_quantum;
+ /**
+ * Whether to use physical disk offsets or logical file offsets.
+ */
+ bool od_physical;
+ /**
+ * XXX: We need to provide a persistently allocated string to hold
+ * unique object names for this policy, since in currently supported
+ * versions of Linux by Lustre, kmem_cache_create() just sets a pointer
+ * to the name string provided. kstrdup() is used in the version of
+ * kmem_cache_create() in current Linux mainline, so we may be able to
+ * remove this in the future.
+ */
+ char od_objname[NRS_ORR_OBJ_NAME_MAX];
+};
+
+/**
+ * Represents a backend-fs object or OST in the ORR and TRR policies
+ * respectively
+ */
+struct nrs_orr_object {
+ struct ptlrpc_nrs_resource oo_res;
+ struct hlist_node oo_hnode;
+ /**
+ * The round number against which requests are being scheduled for this
+ * object or OST
+ */
+ __u64 oo_round;
+ /**
+ * The sequence number used for requests scheduled for this object or
+ * OST during the current round number.
+ */
+ __u64 oo_sequence;
+ /**
+ * The key of the object or OST for which this structure instance is
+ * scheduling RPCs
+ */
+ struct nrs_orr_key oo_key;
+ long oo_ref;
+ /**
+ * Round Robin quantum; the maximum number of RPCs that are allowed to
+ * be scheduled for the object or OST in a single batch of each round.
+ */
+ __u16 oo_quantum;
+ /**
+ * # of pending requests for this object or OST, on all existing rounds
+ */
+ __u16 oo_active;
+};
+
+/**
+ * ORR/TRR NRS request definition
+ */
+struct nrs_orr_req {
+ /**
+ * The offset range this request covers
+ */
+ struct nrs_orr_req_range or_range;
+ /**
+ * Round number for this request; shared with all other requests in the
+ * same batch.
+ */
+ __u64 or_round;
+ /**
+ * Sequence number for this request; shared with all other requests in
+ * the same batch.
+ */
+ __u64 or_sequence;
+ /**
+ * For debugging purposes.
+ */
+ struct nrs_orr_key or_key;
+ /**
+ * An ORR policy instance has filled in request information while
+ * enqueueing the request on the service partition's regular NRS head.
+ */
+ unsigned int or_orr_set:1;
+ /**
+ * A TRR policy instance has filled in request information while
+ * enqueueing the request on the service partition's regular NRS head.
+ */
+ unsigned int or_trr_set:1;
+ /**
+ * Request offset ranges have been filled in with logical offset
+ * values.
+ */
+ unsigned int or_logical_set:1;
+ /**
+ * Request offset ranges have been filled in with physical offset
+ * values.
+ */
+ unsigned int or_physical_set:1;
+};
+
+/** @} ORR/TRR */
+#endif
diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_nrs_tbf.h b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_tbf.h
new file mode 100644
index 0000000000000..feffa4eecee63
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_tbf.h
@@ -0,0 +1,380 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License version 2 for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (C) 2013 DataDirect Networks, Inc.
+ *
+ * Copyright (c) 2014, Intel Corporation.
+ */
+/*
+ *
+ * Network Request Scheduler (NRS) Token Bucket Filter (TBF) policy
+ *
+ */
+
+#ifndef _LUSTRE_NRS_TBF_H
+#define _LUSTRE_NRS_TBF_H
+
+/* \name tbf
+ *
+ * TBF policy
+ *
+ * @{
+ */
+
+struct nrs_tbf_head;
+struct nrs_tbf_cmd;
+
+#define NRS_TBF_MATCH_FULL 0x0000001
+#define NRS_TBF_MATCH_WILDCARD 0x0000002
+
+struct nrs_tbf_jobid {
+ char *tj_id;
+ __u32 tj_match_flag;
+ struct list_head tj_linkage;
+};
+
+#define MAX_U32_STR_LEN 10
+#define NRS_TBF_KEY_LEN (LNET_NIDSTR_SIZE + LUSTRE_JOBID_SIZE + \
+ MAX_U32_STR_LEN + MAX_U32_STR_LEN + 3 + 2)
+
+enum nrs_tbf_flag {
+ NRS_TBF_FLAG_INVALID = 0x0000000,
+ NRS_TBF_FLAG_JOBID = 0x0000001,
+ NRS_TBF_FLAG_NID = 0x0000002,
+ NRS_TBF_FLAG_OPCODE = 0x0000004,
+ NRS_TBF_FLAG_GENERIC = 0x0000008,
+ NRS_TBF_FLAG_UID = 0x0000010,
+ NRS_TBF_FLAG_GID = 0x0000020,
+};
+
+struct tbf_id {
+ enum nrs_tbf_flag ti_type;
+ u32 ti_uid;
+ u32 ti_gid;
+};
+
+struct nrs_tbf_id {
+ struct tbf_id nti_id;
+ struct list_head nti_linkage;
+};
+
+struct nrs_tbf_client {
+ /** Resource object for policy instance. */
+ struct ptlrpc_nrs_resource tc_res;
+ /** Node in the hash table. */
+ struct hlist_node tc_hnode;
+ /** NID of the client. */
+ lnet_nid_t tc_nid;
+ /** Jobid of the client. */
+ char tc_jobid[LUSTRE_JOBID_SIZE];
+ /** Opcode of the client. */
+ __u32 tc_opcode;
+ /** UID or GID of the client. */
+ struct tbf_id tc_id;
+ /** Hash key of the client. */
+ char tc_key[NRS_TBF_KEY_LEN];
+ /** Reference count of the client. */
+ atomic_t tc_ref;
+ /** Lock to protect rule and linkage. */
+ spinlock_t tc_rule_lock;
+ /** Linkage to rule. */
+ struct list_head tc_linkage;
+ /** Pointer to rule. */
+ struct nrs_tbf_rule *tc_rule;
+ /** Generation of the rule matched.
*/
+ __u64 tc_rule_generation;
+ /** Limit of RPC rate. */
+ __u64 tc_rpc_rate;
+ /** Time to wait for next token. */
+ __u64 tc_nsecs;
+ /** RPC token number. */
+ __u64 tc_ntoken;
+ /** Token bucket depth. */
+ __u64 tc_depth;
+ /** Time check-point. */
+ __u64 tc_check_time;
+ /** Deadline of a class */
+ __u64 tc_deadline;
+ /**
+ * Time residue: the remainder of elapsed time
+ * divided by nsecs when dequeue a request.
+ */
+ __u64 tc_nsecs_resid;
+ /** List of queued requests. */
+ struct list_head tc_list;
+ /** Node in binary heap. */
+ struct binheap_node tc_node;
+ /** Whether the client is in heap. */
+ bool tc_in_heap;
+ /** Sequence of the newest rule. */
+ __u32 tc_rule_sequence;
+ /**
+ * Linkage into LRU list. Protected by bucket lock of
+ * nrs_tbf_head::th_cli_hash.
+ */
+ struct list_head tc_lru;
+};
+
+#define MAX_TBF_NAME (16)
+
+enum nrs_rule_flags {
+ NTRS_STOPPING = 0x00000001,
+ NTRS_DEFAULT = 0x00000002,
+ NTRS_REALTIME = 0x00000004,
+};
+
+struct nrs_tbf_rule {
+ /** Name of the rule. */
+ char tr_name[MAX_TBF_NAME];
+ /** Head belongs to. */
+ struct nrs_tbf_head *tr_head;
+ /** Linkage to head. */
+ struct list_head tr_linkage;
+ /** NID list of the rule. */
+ struct list_head tr_nids;
+ /** NID list string of the rule. */
+ char *tr_nids_str;
+ /** Jobid list of the rule. */
+ struct list_head tr_jobids;
+ /** Jobid list string of the rule. */
+ char *tr_jobids_str;
+ /** UID/GID list of the rule. */
+ struct list_head tr_ids;
+ /** UID/GID list string of the rule. */
+ char *tr_ids_str;
+ /** Opcode bitmap of the rule. */
+ struct cfs_bitmap *tr_opcodes;
+ /** Opcode list string of the rule. */
+ char *tr_opcodes_str;
+ /** Condition list of the rule. */
+ struct list_head tr_conds;
+ /** Generic condition string of the rule. */
+ char *tr_conds_str;
+ /** RPC/s limit. */
+ __u64 tr_rpc_rate;
+ /** Time to wait for next token. */
+ u64 tr_nsecs_per_rpc;
+ /** Token bucket depth. */
+ __u64 tr_depth;
+ /** Lock to protect the list of clients. */
+ spinlock_t tr_rule_lock;
+ /** List of client. */
+ struct list_head tr_cli_list;
+ /** Flags of the rule. */
+ enum nrs_rule_flags tr_flags;
+ /** Usage reference count taken on the rule. */
+ atomic_t tr_ref;
+ /** Generation of the rule. */
+ __u64 tr_generation;
+};
+
+struct nrs_tbf_ops {
+ char *o_name;
+ int (*o_startup)(struct ptlrpc_nrs_policy *, struct nrs_tbf_head *);
+ struct nrs_tbf_client *(*o_cli_find)(struct nrs_tbf_head *,
+ struct ptlrpc_request *);
+ struct nrs_tbf_client *(*o_cli_findadd)(struct nrs_tbf_head *,
+ struct nrs_tbf_client *);
+ void (*o_cli_put)(struct nrs_tbf_head *, struct nrs_tbf_client *);
+ void (*o_cli_init)(struct nrs_tbf_client *, struct ptlrpc_request *);
+ int (*o_rule_init)(struct ptlrpc_nrs_policy *,
+ struct nrs_tbf_rule *,
+ struct nrs_tbf_cmd *);
+ int (*o_rule_dump)(struct nrs_tbf_rule *, struct seq_file *);
+ int (*o_rule_match)(struct nrs_tbf_rule *,
+ struct nrs_tbf_client *);
+ void (*o_rule_fini)(struct nrs_tbf_rule *);
+};
+
+#define NRS_TBF_TYPE_JOBID "jobid"
+#define NRS_TBF_TYPE_NID "nid"
+#define NRS_TBF_TYPE_OPCODE "opcode"
+#define NRS_TBF_TYPE_GENERIC "generic"
+#define NRS_TBF_TYPE_UID "uid"
+#define NRS_TBF_TYPE_GID "gid"
+#define NRS_TBF_TYPE_MAX_LEN 20
+
+struct nrs_tbf_type {
+ const char *ntt_name;
+ enum nrs_tbf_flag ntt_flag;
+ struct nrs_tbf_ops *ntt_ops;
+};
+
+struct nrs_tbf_bucket {
+ /**
+ * LRU list, updated on each access to client. Protected by
+ * bucket lock of nrs_tbf_head::th_cli_hash.
+ */
+ struct list_head ntb_lru;
+};
+
+/**
+ * Private data structure for the TBF policy
+ */
+struct nrs_tbf_head {
+ /**
+ * Resource object for policy instance.
+ */
+ struct ptlrpc_nrs_resource th_res;
+ /**
+ * List of rules.
+ */
+ struct list_head th_list;
+ /**
+ * Lock to protect the list of rules.
+ */
+ spinlock_t th_rule_lock;
+ /**
+ * Generation of rules.
+ */
+ atomic_t th_rule_sequence;
+ /**
+ * Default rule.
+ */
+ struct nrs_tbf_rule *th_rule;
+ /**
+ * Timer for next token.
+ */
+ struct hrtimer th_timer;
+ /**
+ * Deadline of the timer.
+ */
+ __u64 th_deadline;
+ /**
+ * Sequence of requests.
+ */
+ __u64 th_sequence;
+ /**
+ * Heap of queues.
+ */
+ struct binheap *th_binheap;
+ /**
+ * Hash of clients.
+ */
+ struct cfs_hash *th_cli_hash;
+ /**
+ * Type of TBF policy.
+ */
+ char th_type[NRS_TBF_TYPE_MAX_LEN + 1];
+ /**
+ * Rule operations.
+ */
+ struct nrs_tbf_ops *th_ops;
+ /**
+ * Flag of type.
+ */
+ __u32 th_type_flag;
+ /**
+ * Index of bucket on hash table while purging.
+ */
+ int th_purge_start;
+};
+
+enum nrs_tbf_cmd_type {
+ NRS_CTL_TBF_START_RULE = 0,
+ NRS_CTL_TBF_STOP_RULE,
+ NRS_CTL_TBF_CHANGE_RULE,
+};
+
+struct nrs_tbf_cmd {
+ enum nrs_tbf_cmd_type tc_cmd;
+ char *tc_name;
+ union {
+ struct nrs_tbf_cmd_start {
+ __u64 ts_rpc_rate;
+ struct list_head ts_nids;
+ char *ts_nids_str;
+ struct list_head ts_jobids;
+ char *ts_jobids_str;
+ struct list_head ts_ids;
+ char *ts_ids_str;
+ char *ts_opcodes_str;
+ struct list_head ts_conds;
+ char *ts_conds_str;
+ __u32 ts_valid_type;
+ enum nrs_rule_flags ts_rule_flags;
+ char *ts_next_name;
+ } tc_start;
+ struct nrs_tbf_cmd_change {
+ __u64 tc_rpc_rate;
+ char *tc_next_name;
+ } tc_change;
+ } u;
+};
+
+enum nrs_tbf_field {
+ NRS_TBF_FIELD_NID,
+ NRS_TBF_FIELD_JOBID,
+ NRS_TBF_FIELD_OPCODE,
+ NRS_TBF_FIELD_UID,
+ NRS_TBF_FIELD_GID,
+ NRS_TBF_FIELD_MAX
+};
+
+struct nrs_tbf_expression {
+ enum nrs_tbf_field te_field;
+ struct list_head te_cond;
+ struct cfs_bitmap *te_opcodes;
+ struct list_head te_linkage;
+};
+
+struct nrs_tbf_conjunction {
+ /**
+ * link to disjunction.
+ */
+ struct list_head tc_linkage;
+ /**
+ * list of logical conjunction
+ */
+ struct list_head tc_expressions;
+};
+
+struct nrs_tbf_req {
+ /**
+ * Linkage to queue.
+ */
+ struct list_head tr_list;
+ /**
+ * Sequence of the request.
+ */
+ __u64 tr_sequence;
+};
+
+/**
+ * TBF policy operations.
+ */
+enum nrs_ctl_tbf {
+ /**
+ * Read the data of a TBF policy.
+ */
+ NRS_CTL_TBF_RD_RULE = PTLRPC_NRS_CTL_1ST_POL_SPEC,
+ /**
+ * Write the data of a TBF policy.
+ */
+ NRS_CTL_TBF_WR_RULE,
+ /**
+ * Read the TBF policy type preset by proc entry "nrs_policies".
+ */
+ NRS_CTL_TBF_RD_TYPE_FLAG,
+};
+
+/** @} tbf */
+#endif
diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_obdo.h b/drivers/staging/lustrefsx/lustre/include/lustre_obdo.h
new file mode 100644
index 0000000000000..dd99eee5af714
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/include/lustre_obdo.h
@@ -0,0 +1,53 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2014, Intel Corporation.
+ *
+ * Copyright 2015 Cray Inc, all rights reserved.
+ * Author: Ben Evans.
+ *
+ * Define obdo associated functions
+ * obdo: OBject Device o...
+ */
+
+#ifndef _LUSTRE_OBDO_H_
+#define _LUSTRE_OBDO_H_
+
+#include 
+
+/**
+ * Create an obdo to send over the wire
+ */
+void lustre_set_wire_obdo(const struct obd_connect_data *ocd,
+ struct obdo *wobdo,
+ const struct obdo *lobdo);
+
+/**
+ * Create a local obdo from a wire based obdo
+ */
+void lustre_get_wire_obdo(const struct obd_connect_data *ocd,
+ struct obdo *lobdo,
+ const struct obdo *wobdo);
+#endif
diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_osc.h b/drivers/staging/lustrefsx/lustre/include/lustre_osc.h
new file mode 100644
index 0000000000000..60300fa2b970b
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/include/lustre_osc.h
@@ -0,0 +1,983 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ */
+/*
+ * lustre/include/lustre_osc.h
+ *
+ * OSC layer structures and methods common for both OSC and MDC.
+ *
+ * This file contains OSC interfaces used by OSC and MDC. Most of them
+ * were just moved from lustre/osc/osc_cl_internal.h for Data-on-MDT
+ * purposes.
+ * + * Author: Nikita Danilov + * Author: Jinshan Xiong + * Author: Mikhail Pershin + */ + +#ifndef LUSTRE_OSC_H +#define LUSTRE_OSC_H + +#include +#include +#include +#include + +/** \defgroup osc osc + * @{ + */ + +struct osc_quota_info { + /** linkage for quota hash table */ + struct hlist_node oqi_hash; + __u32 oqi_id; +}; + +enum async_flags { + ASYNC_READY = 0x1, /* ap_make_ready will not be called before this + page is added to an rpc */ + ASYNC_URGENT = 0x2, /* page must be put into an RPC before return */ + ASYNC_COUNT_STABLE = 0x4, /* ap_refresh_count will not be called + to give the caller a chance to update + or cancel the size of the io */ + ASYNC_HP = 0x10, +}; + +struct osc_async_page { + int oap_magic; + unsigned short oap_cmd; + + struct list_head oap_pending_item; + struct list_head oap_rpc_item; + + loff_t oap_obj_off; + unsigned oap_page_off; + enum async_flags oap_async_flags; + + struct brw_page oap_brw_page; + + struct ptlrpc_request *oap_request; + struct client_obd *oap_cli; + struct osc_object *oap_obj; + + spinlock_t oap_lock; +}; + +#define oap_page oap_brw_page.pg +#define oap_count oap_brw_page.count +#define oap_brw_flags oap_brw_page.flag + +static inline struct osc_async_page *brw_page2oap(struct brw_page *pga) +{ + return container_of(pga, struct osc_async_page, oap_brw_page); +} + +struct osc_device { + struct cl_device osc_cl; + struct obd_export *osc_exp; + + /* Write stats is actually protected by client_obd's lock. */ + struct osc_stats { + ktime_t os_init; + uint64_t os_lockless_writes; /* by bytes */ + uint64_t os_lockless_reads; /* by bytes */ + } osc_stats; + + /* configuration item(s) */ + time64_t osc_contention_time; +}; + +struct osc_extent; + +/** + * State maintained by osc layer for each IO context. + */ +struct osc_io { + /** super class */ + struct cl_io_slice oi_cl; + /** true if this io is lockless. */ + unsigned int oi_lockless:1, + /** true if this io is counted as active IO */ + oi_is_active:1, + /** true if this io has CAP_SYS_RESOURCE */ + oi_cap_sys_resource:1, + /** true if this io issued by readahead */ + oi_is_readahead:1; + /** how many LRU pages are reserved for this IO */ + unsigned long oi_lru_reserved; + + /** active extents, we know how many bytes is going to be written, + * so having an active extent will prevent it from being fragmented */ + struct osc_extent *oi_active; + /** partially truncated extent, we need to hold this extent to prevent + * page writeback from happening. */ + struct osc_extent *oi_trunc; + /** write osc_lock for this IO, used by osc_extent_find(). */ + struct osc_lock *oi_write_osclock; + struct obdo oi_oa; + struct osc_async_cbargs { + bool opc_rpc_sent; + int opc_rc; + struct completion opc_sync; + } oi_cbarg; +}; + +/** + * State maintained by osc layer for the duration of a system call. + */ +struct osc_session { + struct osc_io os_io; +}; + +#define OTI_PVEC_SIZE 256 +struct osc_thread_info { + struct ldlm_res_id oti_resname; + union ldlm_policy_data oti_policy; + struct cl_attr oti_attr; + struct cl_io oti_io; + struct pagevec oti_pagevec; + void *oti_pvec[OTI_PVEC_SIZE]; + /** + * Fields used by cl_lock_discard_pages(). 
+ */ + pgoff_t oti_next_index; + pgoff_t oti_fn_index; /* first non-overlapped index */ + pgoff_t oti_ng_index; /* negative lock caching */ + struct cl_sync_io oti_anchor; + struct cl_req_attr oti_req_attr; + struct lu_buf oti_ladvise_buf; +}; + +static inline __u64 osc_enq2ldlm_flags(__u32 enqflags) +{ + __u64 result = 0; + + CDEBUG(D_DLMTRACE, "flags: %x\n", enqflags); + + LASSERT((enqflags & ~CEF_MASK) == 0); + + if (enqflags & CEF_NONBLOCK) + result |= LDLM_FL_BLOCK_NOWAIT; + if (enqflags & CEF_GLIMPSE) + result |= LDLM_FL_HAS_INTENT|LDLM_FL_CBPENDING; + if (enqflags & CEF_DISCARD_DATA) + result |= LDLM_FL_AST_DISCARD_DATA; + if (enqflags & CEF_PEEK) + result |= LDLM_FL_TEST_LOCK; + if (enqflags & CEF_LOCK_MATCH) + result |= LDLM_FL_MATCH_LOCK; + if (enqflags & CEF_LOCK_NO_EXPAND) + result |= LDLM_FL_NO_EXPANSION; + if (enqflags & CEF_SPECULATIVE) + result |= LDLM_FL_SPECULATIVE; + return result; +} + +typedef int (*osc_enqueue_upcall_f)(void *cookie, struct lustre_handle *lockh, + int rc); + +struct osc_enqueue_args { + struct obd_export *oa_exp; + enum ldlm_type oa_type; + enum ldlm_mode oa_mode; + __u64 *oa_flags; + osc_enqueue_upcall_f oa_upcall; + void *oa_cookie; + struct ost_lvb *oa_lvb; + struct lustre_handle oa_lockh; + bool oa_speculative; +}; + +/** + * Bit flags for osc_dlm_lock_at_pageoff(). + */ +enum osc_dap_flags { + /** + * Just check if the desired lock exists, it won't hold reference + * count on lock. + */ + OSC_DAP_FL_TEST_LOCK = BIT(0), + /** + * Return the lock even if it is being canceled. + */ + OSC_DAP_FL_CANCELING = BIT(1), + /** + * check ast data is present, requested to cancel cb + */ + OSC_DAP_FL_AST = BIT(2), + /** + * look at right region for the desired lock + */ + OSC_DAP_FL_RIGHT = BIT(3), +}; + +/* + * The set of operations which are different for MDC and OSC objects + */ +struct osc_object_operations { + void (*oto_build_res_name)(struct osc_object *osc, + struct ldlm_res_id *resname); + struct ldlm_lock* (*oto_dlmlock_at_pgoff)(const struct lu_env *env, + struct osc_object *obj, + pgoff_t index, + enum osc_dap_flags dap_flags); +}; + +struct osc_object { + struct cl_object oo_cl; + struct lov_oinfo *oo_oinfo; + /** + * True if locking against this stripe got -EUSERS. + */ + int oo_contended; + ktime_t oo_contention_time; +#ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK + /** + * IO context used for invariant checks in osc_lock_has_pages(). + */ + struct cl_io oo_debug_io; + /** Serialization object for osc_object::oo_debug_io. */ + struct mutex oo_debug_mutex; +#endif + /** + * used by the osc to keep track of what objects to build into rpcs. + * Protected by client_obd->cli_loi_list_lock. + */ + struct list_head oo_ready_item; + struct list_head oo_hp_ready_item; + struct list_head oo_write_item; + struct list_head oo_read_item; + + /** + * extent is a red black tree to manage (async) dirty pages. + */ + struct rb_root oo_root; + /** + * Manage write(dirty) extents. + */ + struct list_head oo_hp_exts; /* list of hp extents */ + struct list_head oo_urgent_exts; /* list of writeback extents */ + struct list_head oo_full_exts; + + struct list_head oo_reading_exts; + + atomic_t oo_nr_reads; + atomic_t oo_nr_writes; + + /** Protect extent tree. Will be used to protect + * oo_{read|write}_pages soon. 
*/
+ spinlock_t oo_lock;
+
+ /**
+ * Radix tree for caching pages
+ */
+ spinlock_t oo_tree_lock;
+ struct radix_tree_root oo_tree;
+ unsigned long oo_npages;
+
+ /* Protect osc_lock this osc_object has */
+ struct list_head oo_ol_list;
+ spinlock_t oo_ol_spin;
+
+ /** number of active IOs of this object */
+ atomic_t oo_nr_ios;
+ wait_queue_head_t oo_io_waitq;
+
+ const struct osc_object_operations *oo_obj_ops;
+ bool oo_initialized;
+};
+
+static inline void osc_build_res_name(struct osc_object *osc,
+ struct ldlm_res_id *resname)
+{
+ return osc->oo_obj_ops->oto_build_res_name(osc, resname);
}
+
+static inline struct ldlm_lock *osc_dlmlock_at_pgoff(const struct lu_env *env,
+ struct osc_object *obj,
+ pgoff_t index,
+ enum osc_dap_flags flags)
+{
+ return obj->oo_obj_ops->oto_dlmlock_at_pgoff(env, obj, index, flags);
+}
+
+static inline void osc_object_lock(struct osc_object *obj)
+{
+ spin_lock(&obj->oo_lock);
+}
+
+static inline int osc_object_trylock(struct osc_object *obj)
+{
+ return spin_trylock(&obj->oo_lock);
+}
+
+static inline void osc_object_unlock(struct osc_object *obj)
+{
+ spin_unlock(&obj->oo_lock);
+}
+
+#define assert_osc_object_is_locked(obj) \
+ assert_spin_locked(&obj->oo_lock)
+
+static inline void osc_object_set_contended(struct osc_object *obj)
+{
+ obj->oo_contention_time = ktime_get();
+ /* mb(); */
+ obj->oo_contended = 1;
+}
+
+static inline void osc_object_clear_contended(struct osc_object *obj)
+{
+ obj->oo_contended = 0;
+}
+
+/*
+ * Lock "micro-states" for osc layer.
+ */
+enum osc_lock_state {
+ OLS_NEW,
+ OLS_ENQUEUED,
+ OLS_UPCALL_RECEIVED,
+ OLS_GRANTED,
+ OLS_CANCELLED
+};
+
+/**
+ * osc-private state of cl_lock.
+ *
+ * Interaction with DLM.
+ *
+ * Once the receive upcall is invoked, osc_lock remembers a handle of DLM lock
+ * in osc_lock::ols_handle and a pointer to that lock in
+ * osc_lock::ols_dlmlock.
+ *
+ * This pointer is protected through a reference, acquired by
+ * osc_lock_upcall0(). Also, an additional reference is acquired by
+ * ldlm_lock_addref() call protecting the lock from cancellation, until
+ * osc_lock_unuse() releases it.
+ *
+ * Below is a description of how lock references are acquired and released
+ * inside of DLM.
+ *
+ * - When a new lock is created and enqueued to the server (ldlm_cli_enqueue())
+ *   - ldlm_lock_create()
+ *     - ldlm_lock_new(): initializes a lock with 2 references. One for
+ *       the caller (released when reply from the server is received, or on
+ *       error), and another for the hash table.
+ *   - ldlm_lock_addref_internal(): protects the lock from cancellation.
+ *
+ * - When a reply is received from the server (osc_enqueue_interpret())
+ *   - ldlm_cli_enqueue_fini()
+ *     - LDLM_LOCK_PUT(): releases caller reference acquired by
+ *       ldlm_lock_new().
+ *     - if (rc != 0)
+ *       ldlm_lock_decref(): error case: matches ldlm_cli_enqueue().
+ *   - ldlm_lock_decref(): for async locks, matches ldlm_cli_enqueue().
+ *
+ * - When the lock is being cancelled (ldlm_lock_cancel())
+ *   - ldlm_lock_destroy()
+ *     - LDLM_LOCK_PUT(): releases hash-table reference acquired by
+ *       ldlm_lock_new().
+ *
+ * osc_lock is detached from ldlm_lock by osc_lock_detach() that is called
+ * either when the lock is cancelled (osc_lock_blocking()), or when a lock is
+ * deleted without cancellation (e.g., from cl_locks_prune()). In the latter
+ * case the ldlm lock remains in memory, and can be re-attached to osc_lock in
+ * the future.
+ */
+struct osc_lock {
+ struct cl_lock_slice ols_cl;
+ /** Internal lock to protect states, etc. */
+ spinlock_t ols_lock;
+ /** Owner sleeps on this channel for state change */
+ struct cl_sync_io *ols_owner;
+ /** waiting list for this lock to be cancelled */
+ struct list_head ols_waiting_list;
+ /** wait entry of ols_waiting_list */
+ struct list_head ols_wait_entry;
+ /** list entry for osc_object::oo_ol_list */
+ struct list_head ols_nextlock_oscobj;
+
+ /** underlying DLM lock */
+ struct ldlm_lock *ols_dlmlock;
+ /** DLM flags with which osc_lock::ols_lock was enqueued */
+ __u64 ols_flags;
+ /** osc_lock::ols_lock handle */
+ struct lustre_handle ols_handle;
+ struct ldlm_enqueue_info ols_einfo;
+ enum osc_lock_state ols_state;
+ /** lock value block */
+ struct ost_lvb ols_lvb;
+ /** Lockless operations to be used by lockless lock */
+ const struct cl_lock_operations *ols_lockless_ops;
+ /**
+ * true, if ldlm_lock_addref() was called against
+ * osc_lock::ols_lock. This is used for sanity checking.
+ *
+ * \see osc_lock::ols_has_ref
+ */
+ unsigned ols_hold :1,
+ /**
+ * this is much like osc_lock::ols_hold, except that this bit is
+ * cleared _after_ the reference is released in osc_lock_unuse(). This
+ * fine distinction is needed because:
+ *
+ * - if ldlm lock still has a reference, osc_ast_data_get() needs
+ *   to return associated cl_lock (so that a flag is needed that is
+ *   cleared after ldlm_lock_decref() returned), and
+ *
+ * - ldlm_lock_decref() can invoke blocking ast (for a
+ *   LDLM_FL_CBPENDING lock), and osc_lock functions like
+ *   osc_lock_cancel() called from there need to know whether to
+ *   release lock reference (so that a flag is needed that is
+ *   cleared before ldlm_lock_decref() is called).
+ */
+ ols_has_ref:1,
+ /**
+ * inherit the lockless attribute from top level cl_io.
+ * If true, osc_lock_enqueue is able to tolerate the -EUSERS error.
+ */
+ ols_locklessable:1,
+ /**
+ * if set, the osc_lock is a glimpse lock. For glimpse locks, we treat
+ * the EVAVAIL error as tolerable; this makes the upper logic happy
+ * to wait for all glimpse locks to all OSTs to be completed.
+ * Glimpse lock converts to normal lock if the server lock is granted.
+ * Glimpse lock should be destroyed immediately after use.
+ */
+ ols_glimpse:1,
+ /**
+ * For async glimpse lock.
+ */
+ ols_agl:1,
+ /**
+ * for speculative locks - asynchronous glimpse locks and ladvise
+ * lockahead manual lock requests
+ *
+ * Used to tell osc layer to not wait for the ldlm reply from the
+ * server, so the osc lock will be short lived - it only exists to
+ * create the ldlm request and is not updated on request completion.
+ */
+ ols_speculative:1;
+};
+
+static inline int osc_lock_is_lockless(const struct osc_lock *ols)
+{
+ return (ols->ols_cl.cls_ops == ols->ols_lockless_ops);
+}
+
+/**
+ * Page state private for osc layer.
+ */
+struct osc_page {
+ struct cl_page_slice ops_cl;
+ /**
+ * Page queues used by osc to detect when RPC can be formed.
+ */
+ struct osc_async_page ops_oap;
+ /**
+ * An offset within page from which next transfer starts. This is used
+ * by cl_page_clip() to submit partial page transfers.
+ */
+ unsigned int ops_from:PAGE_SHIFT,
+ /**
+ * An offset within page at which next transfer ends (inclusive).
+ *
+ * \see osc_page::ops_from.
+ */
+ ops_to:PAGE_SHIFT,
+ /**
+ * Boolean, true iff page is under transfer. Used for sanity checking.
+ */
+ ops_transfer_pinned:1,
+ /**
+ * in LRU?
+ */
+ ops_in_lru:1,
+ /**
+ * Set if the page must be transferred with OBD_BRW_SRVLOCK.
+ */
+ ops_srvlock:1,
+ /**
+ * If the page is in osc_object::oo_tree.
+ */ + ops_intree:1; + /** + * lru page list. See osc_lru_{del|use}() in osc_page.c for usage. + */ + struct list_head ops_lru; + /** + * Submit time - the time when the page is starting RPC. For debugging. + */ + ktime_t ops_submit_time; +}; + +struct osc_brw_async_args { + struct obdo *aa_oa; + int aa_requested_nob; + int aa_nio_count; + u32 aa_page_count; + s32 aa_resends; + struct brw_page **aa_ppga; + struct client_obd *aa_cli; + struct list_head aa_oaps; + struct list_head aa_exts; +}; + +extern struct kmem_cache *osc_lock_kmem; +extern struct kmem_cache *osc_object_kmem; +extern struct kmem_cache *osc_thread_kmem; +extern struct kmem_cache *osc_session_kmem; +extern struct kmem_cache *osc_extent_kmem; +extern struct kmem_cache *osc_quota_kmem; +extern struct kmem_cache *osc_obdo_kmem; + +extern struct lu_context_key osc_key; +extern struct lu_context_key osc_session_key; + +#define OSC_FLAGS (ASYNC_URGENT|ASYNC_READY) + +/* osc_page.c */ +int osc_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t ind); +void osc_index2policy(union ldlm_policy_data *policy, const struct cl_object *obj, + pgoff_t start, pgoff_t end); +void osc_lru_add_batch(struct client_obd *cli, struct list_head *list); +void osc_page_submit(const struct lu_env *env, struct osc_page *opg, + enum cl_req_type crt, int brw_flags, ktime_t submit_time); +int lru_queue_work(const struct lu_env *env, void *data); +long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli, + long target, bool force); + +/* osc_cache.c */ +int osc_set_async_flags(struct osc_object *obj, struct osc_page *opg, + u32 async_flags); +int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, + struct cl_page *page, loff_t offset); +int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops, cl_commit_cbt cb); +int osc_page_cache_add(const struct lu_env *env, struct osc_page *opg, + struct cl_io *io, cl_commit_cbt cb); +int osc_teardown_async_page(const struct lu_env *env, struct osc_object *obj, + struct osc_page *ops); +int osc_flush_async_page(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops); +int osc_queue_sync_pages(const struct lu_env *env, struct cl_io *io, + struct osc_object *obj, struct list_head *list, + int brw_flags); +int osc_cache_truncate_start(const struct lu_env *env, struct osc_object *obj, + __u64 size, struct osc_extent **extp); +void osc_cache_truncate_end(const struct lu_env *env, struct osc_extent *ext); +int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end, int hp, int discard); +int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end); +int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc, int async); +static inline void osc_wake_cache_waiters(struct client_obd *cli) +{ + wake_up(&cli->cl_cache_waiters); +} + +static inline int osc_io_unplug_async(const struct lu_env *env, + struct client_obd *cli, + struct osc_object *osc) +{ + return osc_io_unplug0(env, cli, osc, 1); +} + +static inline void osc_io_unplug(const struct lu_env *env, + struct client_obd *cli, + struct osc_object *osc) +{ + (void)osc_io_unplug0(env, cli, osc, 0); +} + +typedef bool (*osc_page_gang_cbt)(const struct lu_env *, struct cl_io *, + void**, int, void *); +bool osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, + struct osc_object *osc, pgoff_t start, pgoff_t end, + 
osc_page_gang_cbt cb, void *cbdata); +bool osc_discard_cb(const struct lu_env *env, struct cl_io *io, + void**, int, void *cbdata); + +/* osc_dev.c */ +int osc_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next); +struct lu_device *osc_device_fini(const struct lu_env *env, + struct lu_device *d); +struct lu_device *osc_device_free(const struct lu_env *env, + struct lu_device *d); + +/* osc_object.c */ +int osc_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf); +void osc_object_free(const struct lu_env *env, struct lu_object *obj); +int osc_lvb_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct ost_lvb *lvb); +int osc_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *obj); +int osc_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr); +int osc_attr_update(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid); +int osc_object_glimpse(const struct lu_env *env, const struct cl_object *obj, + struct ost_lvb *lvb); +int osc_object_invalidate(const struct lu_env *env, struct osc_object *osc); +int osc_object_find_cbdata(const struct lu_env *env, struct cl_object *obj, + ldlm_iterator_t iter, void *data); +int osc_object_prune(const struct lu_env *env, struct cl_object *obj); + +/* osc_request.c */ +void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd); +int osc_setup_common(struct obd_device *obd, struct lustre_cfg *lcfg); +int osc_precleanup_common(struct obd_device *obd); +int osc_cleanup_common(struct obd_device *obd); +int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, + u32 keylen, void *key, u32 vallen, void *val, + struct ptlrpc_request_set *set); +int osc_ldlm_resource_invalidate(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg); +int osc_reconnect(const struct lu_env *env, struct obd_export *exp, + struct obd_device *obd, struct obd_uuid *cluuid, + struct obd_connect_data *data, void *localdata); +int osc_disconnect(struct obd_export *exp); +int osc_punch_send(struct obd_export *exp, struct obdo *oa, + obd_enqueue_update_f upcall, void *cookie); +int osc_fallocate_base(struct obd_export *exp, struct obdo *oa, + obd_enqueue_update_f upcall, void *cookie, int mode); +void osc_update_next_shrink(struct client_obd *cli); +void osc_schedule_grant_work(void); + +/* osc_io.c */ +int osc_io_submit(const struct lu_env *env, const struct cl_io_slice *ios, + enum cl_req_type crt, struct cl_2queue *queue); +int osc_io_commit_async(const struct lu_env *env, + const struct cl_io_slice *ios, + struct cl_page_list *qin, int from, int to, + cl_commit_cbt cb); +void osc_io_extent_release(const struct lu_env *env, + const struct cl_io_slice *ios); +int osc_io_iter_init(const struct lu_env *env, const struct cl_io_slice *ios); +void osc_io_iter_fini(const struct lu_env *env, + const struct cl_io_slice *ios); +void osc_io_rw_iter_fini(const struct lu_env *env, + const struct cl_io_slice *ios); +int osc_io_fault_start(const struct lu_env *env, const struct cl_io_slice *ios); +void osc_io_setattr_end(const struct lu_env *env, + const struct cl_io_slice *slice); +int osc_io_read_start(const struct lu_env *env, + const struct cl_io_slice *slice); +int osc_io_write_start(const struct lu_env *env, + const struct cl_io_slice *slice); +void osc_io_end(const struct lu_env *env, const struct cl_io_slice 
*slice); +int osc_fsync_ost(const struct lu_env *env, struct osc_object *obj, + struct cl_fsync_io *fio); +void osc_io_fsync_end(const struct lu_env *env, + const struct cl_io_slice *slice); +void osc_read_ahead_release(const struct lu_env *env, struct cl_read_ahead *ra); +int osc_io_lseek_start(const struct lu_env *env, + const struct cl_io_slice *slice); +void osc_io_lseek_end(const struct lu_env *env, + const struct cl_io_slice *slice); +int osc_io_lru_reserve(const struct lu_env *env, const struct cl_io_slice *ios, + loff_t pos, size_t count); +int osc_punch_start(const struct lu_env *env, struct cl_io *io, + struct cl_object *obj); + +/* osc_lock.c */ +void osc_lock_to_lockless(const struct lu_env *env, struct osc_lock *ols, + int force); +void osc_lock_wake_waiters(const struct lu_env *env, struct osc_object *osc, + struct osc_lock *oscl); +int osc_lock_enqueue_wait(const struct lu_env *env, struct osc_object *obj, + struct osc_lock *oscl); +void osc_lock_set_writer(const struct lu_env *env, const struct cl_io *io, + struct cl_object *obj, struct osc_lock *oscl); +int osc_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_lock_slice *slice); +void osc_lock_cancel(const struct lu_env *env, + const struct cl_lock_slice *slice); +void osc_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice); +int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data); +unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock); + +/***************************************************************************** + * + * Accessors and type conversions. + * + */ +static inline struct osc_thread_info *osc_env_info(const struct lu_env *env) +{ + struct osc_thread_info *info; + + info = lu_context_key_get(&env->le_ctx, &osc_key); + LASSERT(info != NULL); + return info; +} + +static inline struct osc_session *osc_env_session(const struct lu_env *env) +{ + struct osc_session *ses; + + ses = lu_context_key_get(env->le_ses, &osc_session_key); + LASSERT(ses != NULL); + return ses; +} + +static inline struct osc_io *osc_env_io(const struct lu_env *env) +{ + return &osc_env_session(env)->os_io; +} + +static inline struct osc_device *lu2osc_dev(const struct lu_device *d) +{ + return container_of_safe(d, struct osc_device, osc_cl.cd_lu_dev); +} + +static inline struct obd_export *osc_export(const struct osc_object *obj) +{ + return lu2osc_dev(obj->oo_cl.co_lu.lo_dev)->osc_exp; +} + +static inline struct client_obd *osc_cli(const struct osc_object *obj) +{ + return &osc_export(obj)->exp_obd->u.cli; +} + +static inline struct osc_object *cl2osc(const struct cl_object *obj) +{ + return container_of_safe(obj, struct osc_object, oo_cl); +} + +static inline struct cl_object *osc2cl(const struct osc_object *obj) +{ + return (struct cl_object *)&obj->oo_cl; +} + +static inline struct osc_device *obd2osc_dev(const struct obd_device *obd) +{ + return container_of_safe(obd->obd_lu_dev, struct osc_device, + osc_cl.cd_lu_dev); +} + +static inline struct lu_device *osc2lu_dev(struct osc_device *osc) +{ + return &osc->osc_cl.cd_lu_dev; +} + +static inline struct lu_object *osc2lu(struct osc_object *osc) +{ + return &osc->oo_cl.co_lu; +} + +static inline struct osc_object *lu2osc(const struct lu_object *obj) +{ + return container_of_safe(obj, struct osc_object, oo_cl.co_lu); +} + +static inline struct osc_io *cl2osc_io(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct osc_io *oio = container_of(slice, struct osc_io, oi_cl); + + LINVRNT(oio == osc_env_io(env)); + 
return oio;
+}
+
+static inline enum ldlm_mode osc_cl_lock2ldlm(enum cl_lock_mode mode)
+{
+ LASSERT(mode == CLM_READ || mode == CLM_WRITE || mode == CLM_GROUP);
+ if (mode == CLM_READ)
+ return LCK_PR;
+ if (mode == CLM_WRITE)
+ return LCK_PW;
+ return LCK_GROUP;
+}
+
+static inline enum cl_lock_mode osc_ldlm2cl_lock(enum ldlm_mode mode)
+{
+ LASSERT(mode == LCK_PR || mode == LCK_PW || mode == LCK_GROUP);
+ if (mode == LCK_PR)
+ return CLM_READ;
+ if (mode == LCK_PW)
+ return CLM_WRITE;
+ return CLM_GROUP;
+}
+
+static inline struct osc_page *cl2osc_page(const struct cl_page_slice *slice)
+{
+ return container_of_safe(slice, struct osc_page, ops_cl);
+}
+
+static inline struct osc_page *oap2osc(struct osc_async_page *oap)
+{
+ return container_of_safe(oap, struct osc_page, ops_oap);
+}
+
+static inline pgoff_t osc_index(struct osc_page *opg)
+{
+ return opg->ops_oap.oap_obj_off >> PAGE_SHIFT;
+}
+
+static inline struct cl_page *oap2cl_page(struct osc_async_page *oap)
+{
+ return oap2osc(oap)->ops_cl.cpl_page;
+}
+
+static inline struct osc_page *oap2osc_page(struct osc_async_page *oap)
+{
+ return (struct osc_page *)container_of(oap, struct osc_page, ops_oap);
+}
+
+static inline struct osc_page *
+osc_cl_page_osc(struct cl_page *page, struct osc_object *osc)
+{
+ const struct cl_page_slice *slice;
+
+ LASSERT(osc != NULL);
+ slice = cl_object_page_slice(&osc->oo_cl, page);
+ return cl2osc_page(slice);
+}
+
+static inline struct osc_lock *cl2osc_lock(const struct cl_lock_slice *slice)
+{
+ return container_of_safe(slice, struct osc_lock, ols_cl);
+}
+
+static inline int osc_io_srvlock(struct osc_io *oio)
+{
+ return (oio->oi_lockless && !oio->oi_cl.cis_io->ci_no_srvlock);
+}
+
+enum osc_extent_state {
+ OES_INV = 0, /** extent is just initialized or destroyed */
+ OES_ACTIVE = 1, /** process is using this extent */
+ OES_CACHE = 2, /** extent is ready for IO */
+ OES_LOCKING = 3, /** locking page to prepare IO */
+ OES_LOCK_DONE = 4, /** locking finished, ready to send */
+ OES_RPC = 5, /** in RPC */
+ OES_TRUNC = 6, /** being truncated */
+ OES_STATE_MAX
+};
+
+/**
+ * osc_extent data to manage dirty pages.
+ * osc_extent has the following attributes:
+ * 1. all pages in the same extent must be in one RPC in write back;
+ * 2. # of pages must be less than max_pages_per_rpc - implied by 1;
+ * 3. must be covered by only 1 osc_lock;
+ * 4. exclusive. It's impossible to have overlapped osc_extent.
+ *
+ * The lifetime of an extent is from when the 1st page is dirtied to when
+ * all pages inside it are written out.
+ *
+ * LOCKING ORDER
+ * =============
+ * page lock -> client_obd_list_lock -> object lock(osc_object::oo_lock)
+ */
+struct osc_extent {
+ /** red-black tree node */
+ struct rb_node oe_node;
+ /** osc_object of this extent */
+ struct osc_object *oe_obj;
+ /** refcount, removed from red-black tree if reaches zero. */
+ struct kref oe_refc;
+ /** busy if non-zero */
+ atomic_t oe_users;
+ /** link list of osc_object's oo_{hp|urgent|locking}_exts. */
+ struct list_head oe_link;
+ /** state of this extent */
+ enum osc_extent_state oe_state;
+ /** flags for this extent. */
+ /** 0 is write, 1 is read */
+ unsigned int oe_rw:1,
+ /** sync extent, queued by osc_queue_sync_pages() */
+ oe_sync:1,
+ /** set if this extent has partial, sync pages.
+ * Extents with partial page(s) can't merge with others in RPC */
+ oe_no_merge:1,
+ oe_srvlock:1,
+ oe_memalloc:1,
+ /** an ACTIVE extent is going to be truncated, so when this extent
+ * is released, it will turn into TRUNC state instead of CACHE. */
+ oe_trunc_pending:1,
+ /** this extent should be written asap and someone may wait for the
+ * write to finish. This bit is usually set along with urgent if
+ * the extent was CACHE state.
+ * fsync_wait extent can't be merged because new extent region may
+ * exceed fsync range. */
+ oe_fsync_wait:1,
+ /** covering lock is being canceled */
+ oe_hp:1,
+ /** this extent should be written back asap. set if one of pages is
+ * called by page WB daemon, or sync write or reading requests. */
+ oe_urgent:1,
+ /** Non-delay RPC should be used for this extent. */
+ oe_ndelay:1,
+ /** direct IO pages */
+ oe_dio:1,
+ /** this extent consists of pages that are not directly accessible
+ * from the CPU */
+ oe_is_rdma_only:1;
+ /** how many grants are allocated for this extent. There is no grant
+ * allocated for reading extents and sync write extents. */
+ unsigned int oe_grants;
+ /** # of dirty pages in this extent */
+ unsigned int oe_nr_pages;
+ /** list of pending oap pages. Pages in this list are NOT sorted. */
+ struct list_head oe_pages;
+ /** start and end index of this extent, include start and end
+ * themselves. Page offset here is the page index of osc_pages.
+ * oe_start is used as keyword for red-black tree. */
+ pgoff_t oe_start;
+ pgoff_t oe_end;
+ /** maximum ending index of this extent, this is limited by
+ * max_pages_per_rpc, lock extent and chunk size. */
+ pgoff_t oe_max_end;
+ /** waitqueue - for those who want to be notified if this extent's
+ * state has changed. */
+ wait_queue_head_t oe_waitq;
+ /** lock covering this extent */
+ struct ldlm_lock *oe_dlmlock;
+ /** terminator of this extent. Must be true if this extent is in IO. */
+ struct task_struct *oe_owner;
+ /** return value of writeback. If somebody is waiting for this extent,
+ * this value can be known by outside world. */
+ int oe_rc;
+ /** max pages per rpc when this extent was created */
+ unsigned int oe_mppr;
+ /** FLR: layout version when this osc_extent is published */
+ __u32 oe_layout_version;
+};
+
+/** @} osc */
+
+#endif /* LUSTRE_OSC_H */
diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_quota.h b/drivers/staging/lustrefsx/lustre/include/lustre_quota.h
new file mode 100644
index 0000000000000..4b674d8b1257b
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/include/lustre_quota.h
@@ -0,0 +1,279 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012, 2017, Intel Corporation.
+ * Use is subject to license terms.
+ */
+
+#ifndef _LUSTRE_QUOTA_H
+#define _LUSTRE_QUOTA_H
+
+/** \defgroup quota quota
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifndef MAX_IQ_TIME
+#define MAX_IQ_TIME 604800 /* (7*24*60*60) 1 week */
+#endif
+
+#ifndef MAX_DQ_TIME
+#define MAX_DQ_TIME 604800 /* (7*24*60*60) 1 week */
+#endif
+
+struct lquota_id_info;
+struct lquota_trans;
+
+/* Gather all quota record types in a union that can be used to read any
+ * record from disk. All fields of these records must be 64-bit aligned,
+ * otherwise the OSD layer may swab them incorrectly. */
+union lquota_rec {
+ struct lquota_glb_rec lqr_glb_rec;
+ struct lquota_slv_rec lqr_slv_rec;
+ struct lquota_acct_rec lqr_acct_rec;
+};
+
+/* flags for inode/block quota accounting */
+enum osd_qid_declare_flags {
+ OSD_QID_INODE = BIT(0),
+ OSD_QID_BLK = BIT(1),
+ OSD_QID_FORCE = BIT(2),
+};
+
+/* Index features supported by the global index objects.
+ * Only used for migration purposes and should be removed once on-disk
+ * migration is no longer needed */
+extern struct dt_index_features dt_quota_iusr_features;
+extern struct dt_index_features dt_quota_busr_features;
+extern struct dt_index_features dt_quota_igrp_features;
+extern struct dt_index_features dt_quota_bgrp_features;
+
+/* Name used in the configuration logs to identify the default metadata pool
+ * (composed of all the MDTs, with pool ID 0) and the default data pool (all
+ * the OSTs, with pool ID 0 too). */
+#define QUOTA_METAPOOL_NAME "mdt="
+#define QUOTA_DATAPOOL_NAME "ost="
+
+/*
+ * Quota Master Target support
+ */
+
+/* Request handlers for quota master operations.
+ * This is used by the MDT to pass quota/lock requests to the quota master
+ * target. This won't be needed any more once the QMT is a real target and
+ * does not rely any more on the MDT service threads and namespace. */
+struct qmt_handlers {
+ /* Handle quotactl request from client. */
+ int (*qmth_quotactl)(const struct lu_env *, struct lu_device *,
+ struct obd_quotactl *);
+
+ /* Handle dqacq/dqrel request from slave. */
+ int (*qmth_dqacq)(const struct lu_env *, struct lu_device *,
+ struct ptlrpc_request *);
+
+ /* LDLM intent policy associated with quota locks */
+ int (*qmth_intent_policy)(const struct lu_env *, struct lu_device *,
+ struct ptlrpc_request *, struct ldlm_lock **,
+ int);
+
+ /* Initialize LVB of ldlm resource associated with quota objects */
+ int (*qmth_lvbo_init)(struct lu_device *, struct ldlm_resource *);
+
+ /* Update LVB of ldlm resource associated with quota objects */
+ int (*qmth_lvbo_update)(struct lu_device *, struct ldlm_resource *,
+ struct ptlrpc_request *, int);
+
+ /* Return size of LVB to be packed in ldlm message */
+ int (*qmth_lvbo_size)(struct lu_device *, struct ldlm_lock *);
+
+ /* Fill request buffer with lvb */
+ int (*qmth_lvbo_fill)(struct lu_device *, struct ldlm_lock *, void *,
+ int);
+
+ /* Free lvb associated with ldlm resource */
+ int (*qmth_lvbo_free)(struct lu_device *, struct ldlm_resource *);
+};
+
+/* actual handlers are defined in lustre/quota/qmt_handler.c */
+extern struct qmt_handlers qmt_hdls;
+
+/*
+ * Quota enforcement support on slaves
+ */
+
+struct qsd_instance;
+
+/* The quota slave feature is implemented under the form of a library.
+ * The API is the following:
+ *
+ * - qsd_init(): the user (mostly the OSD layer) should first allocate a qsd
+ *   instance via qsd_init(). This creates all required structures to manage
+ *   quota enforcement for this target and performs all low-level
+ *   initialization which does not involve any lustre object. qsd_init()
+ *   should typically be called when the OSD is being set up.
+ *
+ * - qsd_prepare(): this sets up on-disk objects associated with the quota
+ *   slave feature and initiates the quota reintegration procedure if needed.
+ *   qsd_prepare() should typically be called when ->ldo_prepare is invoked.
+ *
+ * - qsd_start(): a qsd instance should be started once recovery is completed
+ *   (i.e. when ->ldo_recovery_complete is called). This is used to notify
+ *   the qsd layer that quota should now be enforced again via the
+ *   qsd_op_begin/end functions. The last step of the reintegration procedure
+ *   (namely usage reconciliation) will be completed during start.
+ *
+ * - qsd_fini(): is used to release a qsd_instance structure allocated with
+ *   qsd_init(). This releases all quota slave objects and frees the
+ *   structures associated with the qsd_instance.
+ *
+ * - qsd_op_begin(): is used to enforce quota; it must be called in the
+ *   declaration of each operation. qsd_op_end() should then be invoked later
+ *   once all operations have been completed in order to release/adjust the
+ *   quota space.
+ *   Running qsd_op_begin() before qsd_start() isn't fatal and will return
+ *   success.
+ *   Once qsd_start() has been run, qsd_op_begin() will block until the
+ *   reintegration procedure is completed.
+ *
+ * - qsd_op_end(): performs the post operation quota processing. This must be
+ *   called after the operation transaction stopped. While qsd_op_begin()
+ *   must be invoked each time a new operation is declared, qsd_op_end()
+ *   should be called only once for the whole transaction.
+ *
+ * - qsd_op_adjust(): triggers pre-acquire/release if necessary.
+ *
+ * Below are the function prototypes to be used by the OSD layer to manage
+ * quota enforcement. Arguments are documented where each function is defined.
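+ *
+ * A minimal call sequence, sketched only from the prototypes declared
+ * below; the caller-side names (svname, dt_dev, proc_entry, trans, qi,
+ * local_flags) are hypothetical placeholders, not part of this header:
+ *
+ *	qsd = qsd_init(env, svname, dt_dev, proc_entry, is_md, false);
+ *	rc = qsd_prepare(env, qsd);          (from ->ldo_prepare)
+ *	rc = qsd_start(env, qsd);            (from ->ldo_recovery_complete)
+ *	...
+ *	rc = qsd_op_begin(env, qsd, &trans, &qi, &local_flags);
+ *	... execute the operation and stop its transaction ...
+ *	qsd_op_end(env, qsd, &trans);
+ *	...
+ *	qsd_fini(env, qsd);                  (at umount)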
+ */
+
+/* flags for quota local enforcement */
+enum osd_quota_local_flags {
+ QUOTA_FL_OVER_USRQUOTA = BIT(0),
+ QUOTA_FL_OVER_GRPQUOTA = BIT(1),
+ QUOTA_FL_SYNC = BIT(2),
+ QUOTA_FL_OVER_PRJQUOTA = BIT(3),
+};
+
+struct qsd_instance *qsd_init(const struct lu_env *, char *, struct dt_device *,
+ struct proc_dir_entry *, bool is_md, bool excl);
+int qsd_prepare(const struct lu_env *, struct qsd_instance *);
+int qsd_start(const struct lu_env *, struct qsd_instance *);
+void qsd_fini(const struct lu_env *, struct qsd_instance *);
+int qsd_op_begin(const struct lu_env *, struct qsd_instance *,
+ struct lquota_trans *, struct lquota_id_info *,
+ enum osd_quota_local_flags *);
+void qsd_op_end(const struct lu_env *, struct qsd_instance *,
+ struct lquota_trans *);
+void qsd_op_adjust(const struct lu_env *, struct qsd_instance *,
+ union lquota_id *, int);
+int qsd_transfer(const struct lu_env *env, struct qsd_instance *qsd,
+ struct lquota_trans *trans, unsigned int qtype,
+ u64 orig_id, u64 new_id, u64 bspace,
+ struct lquota_id_info *qi);
+int qsd_reserve_or_free_quota(const struct lu_env *env,
+ struct qsd_instance *qsd,
+ struct lquota_id_info *qi);
+
+/*
+ * Quota information attached to a transaction
+ */
+
+struct lquota_entry;
+
+struct lquota_id_info {
+ /* quota identifier */
+ union lquota_id lqi_id;
+
+ /* USRQUOTA or GRPQUOTA for now, could be expanded for
+ * directory quota or other types later. */
+ int lqi_type;
+
+ /* inodes or kbytes to be consumed or released, it could
+ * be negative when releasing space. */
+ long long lqi_space;
+
+ /* quota slave entry structure associated with this ID */
+ struct lquota_entry *lqi_qentry;
+
+ /* whether we are reporting blocks or inodes */
+ bool lqi_is_blk;
+};
+
+/* With the DoM, both inode quota in the meta pool and block quota in the data
+ * pool will be enforced at the MDT. There are at most 4 quota ids being
+ * enforced in a single transaction for inode and block quota, which is the
+ * chown transaction: original uid and gid, new uid and gid.
+ *
+ * This value might need to be revised when directory quota is added.
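+ *
+ * Each of those (at most 4) ids may need both an inode entry enforced
+ * in the metadata pool and a block entry enforced in the data pool,
+ * which presumably is where the limit of 4 * 2 = 8 below comes from.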
+ */
+#define QUOTA_MAX_TRANSIDS 8
+
+/* all qids involved in a single transaction */
+struct lquota_trans {
+ unsigned short lqt_id_cnt;
+ struct lquota_id_info lqt_ids[QUOTA_MAX_TRANSIDS];
+};
+
+#define IS_LQUOTA_RES(res) \
+ (res->lr_name.name[LUSTRE_RES_ID_SEQ_OFF] == FID_SEQ_QUOTA || \
+ res->lr_name.name[LUSTRE_RES_ID_SEQ_OFF] == FID_SEQ_QUOTA_GLB)
+
+/* helper function used by MDT & OFD to retrieve quota accounting information
+ * on slave */
+int lquotactl_slv(const struct lu_env *, struct dt_device *,
+ struct obd_quotactl *);
+
+static inline int quota_reserve_or_free(const struct lu_env *env,
+ struct qsd_instance *qsd,
+ struct lquota_id_info *qi,
+ enum quota_type type, __u64 uid,
+ __u64 gid, __s64 count, bool is_md)
+{
+ qi->lqi_type = type;
+ if (count > 0)
+ qi->lqi_space = toqb(count);
+ else
+ qi->lqi_space = -toqb(-count);
+
+ if (is_md)
+ qi->lqi_is_blk = false;
+ else
+ qi->lqi_is_blk = true;
+
+ qi->lqi_id.qid_uid = uid;
+ qi->lqi_id.qid_gid = gid;
+
+ return qsd_reserve_or_free_quota(env, qsd, qi);
+}
+
+/** @} quota */
+#endif /* _LUSTRE_QUOTA_H */
diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_req_layout.h b/drivers/staging/lustrefsx/lustre/include/lustre_req_layout.h
new file mode 100644
index 0000000000000..57c74aa322fe1
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/include/lustre_req_layout.h
@@ -0,0 +1,428 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lustre/include/lustre_req_layout.h
+ *
+ * Lustre Metadata Target (mdt) request handler
+ *
+ * Author: Nikita Danilov
+ */
+
+#ifndef _LUSTRE_REQ_LAYOUT_H__
+#define _LUSTRE_REQ_LAYOUT_H__
+
+#include
+
+/** \defgroup req_layout req_layout
+ *
+ * @{
+ */
+
+struct req_msg_field;
+struct req_format;
+struct req_capsule;
+
+struct ptlrpc_request;
+
+enum req_location {
+ RCL_CLIENT,
+ RCL_SERVER,
+ RCL_NR
+};
+
+/* Maximal number of fields (buffers) in a request message.
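+ * It must stay below 32: the per-field swab state is tracked in the
+ * __u32 rc_req_swab_mask/rc_rep_swab_mask bitmasks of struct
+ * req_capsule below, one bit per field.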
+ */
+#define REQ_MAX_FIELD_NR 12
+
+struct req_capsule {
+ struct ptlrpc_request *rc_req;
+ /** Request message - what client sent */
+ struct lustre_msg *rc_reqmsg;
+ /** Reply message - server response */
+ struct lustre_msg *rc_repmsg;
+ /** Fields that help to see if request and reply were swabbed or not */
+ __u32 rc_req_swab_mask;
+ __u32 rc_rep_swab_mask;
+ const struct req_format *rc_fmt;
+ enum req_location rc_loc;
+ __u32 rc_area[RCL_NR][REQ_MAX_FIELD_NR];
+};
+
+void req_capsule_init(struct req_capsule *pill, struct ptlrpc_request *req,
+ enum req_location location);
+void req_capsule_fini(struct req_capsule *pill);
+
+void req_capsule_set(struct req_capsule *pill, const struct req_format *fmt);
+void req_capsule_client_dump(struct req_capsule *pill);
+void req_capsule_server_dump(struct req_capsule *pill);
+void req_capsule_init_area(struct req_capsule *pill);
+size_t req_capsule_filled_sizes(struct req_capsule *pill,
+ enum req_location loc);
+int req_capsule_server_pack(struct req_capsule *pill);
+
+void *req_capsule_client_get(struct req_capsule *pill,
+ const struct req_msg_field *field);
+void *req_capsule_client_swab_get(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ void *swabber);
+void *req_capsule_client_sized_get(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ __u32 len);
+void *req_capsule_server_get(struct req_capsule *pill,
+ const struct req_msg_field *field);
+void *req_capsule_server_sized_get(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ __u32 len);
+void *req_capsule_server_swab_get(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ void *swabber);
+void *req_capsule_server_sized_swab_get(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ __u32 len, void *swabber);
+const void *req_capsule_other_get(struct req_capsule *pill,
+ const struct req_msg_field *field);
+
+void req_capsule_set_size(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ enum req_location loc, __u32 size);
+__u32 req_capsule_get_size(const struct req_capsule *pill,
+ const struct req_msg_field *field,
+ enum req_location loc);
+__u32 req_capsule_msg_size(struct req_capsule *pill, enum req_location loc);
+__u32 req_capsule_fmt_size(__u32 magic, const struct req_format *fmt,
+ enum req_location loc);
+void req_capsule_extend(struct req_capsule *pill, const struct req_format *fmt);
+
+int req_capsule_has_field(const struct req_capsule *pill,
+ const struct req_msg_field *field,
+ enum req_location loc);
+int req_capsule_field_present(const struct req_capsule *pill,
+ const struct req_msg_field *field,
+ enum req_location loc);
+void req_capsule_shrink(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ __u32 newlen,
+ enum req_location loc);
+int req_capsule_server_grow(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ __u32 newlen);
+bool req_capsule_need_swab(struct req_capsule *pill, enum req_location loc,
+ __u32 index);
+void req_capsule_set_swabbed(struct req_capsule *pill, enum req_location loc,
+ __u32 index);
+
+/**
+ * Returns true if request buffer at offset \a index was already swabbed
+ */
+static inline bool req_capsule_req_swabbed(struct req_capsule *pill,
+ size_t index)
+{
+ LASSERT(index < sizeof(pill->rc_req_swab_mask) * 8);
+ return pill->rc_req_swab_mask & BIT(index);
+}
+
+/**
+ * Returns true if request reply buffer at offset \a index was already swabbed
+ */
+static inline bool req_capsule_rep_swabbed(struct req_capsule *pill,
+ size_t index)
+{
+ 
LASSERT(index < sizeof(pill->rc_rep_swab_mask) * 8); + return pill->rc_rep_swab_mask & BIT(index); +} + +/** + * Returns true if request needs to be swabbed into local cpu byteorder + */ +static inline bool req_capsule_req_need_swab(struct req_capsule *pill) +{ + return req_capsule_req_swabbed(pill, MSG_PTLRPC_HEADER_OFF); +} + +/** + * Returns true if request reply needs to be swabbed into local cpu byteorder + */ +static inline bool req_capsule_rep_need_swab(struct req_capsule *pill) +{ + return req_capsule_rep_swabbed(pill, MSG_PTLRPC_HEADER_OFF); +} + +/** + * Mark request buffer at offset \a index that it was already swabbed + */ +static inline void req_capsule_set_req_swabbed(struct req_capsule *pill, + size_t index) +{ + LASSERT(index < sizeof(pill->rc_req_swab_mask) * 8); + LASSERT((pill->rc_req_swab_mask & BIT(index)) == 0); + pill->rc_req_swab_mask |= BIT(index); +} + +/** + * Mark request reply buffer at offset \a index that it was already swabbed + */ +static inline void req_capsule_set_rep_swabbed(struct req_capsule *pill, + size_t index) +{ + LASSERT(index < sizeof(pill->rc_rep_swab_mask) * 8); + LASSERT((pill->rc_rep_swab_mask & BIT(index)) == 0); + pill->rc_rep_swab_mask |= BIT(index); +} + +int req_layout_init(void); +void req_layout_fini(void); +#ifdef HAVE_SERVER_SUPPORT +int req_check_sepol(struct req_capsule *pill); +#else +static inline int req_check_sepol(struct req_capsule *pill) +{ + return 0; +} +#endif + +extern struct req_format RQF_OBD_PING; +extern struct req_format RQF_OBD_SET_INFO; +extern struct req_format RQF_MDT_SET_INFO; +extern struct req_format RQF_SEC_CTX; +extern struct req_format RQF_OBD_IDX_READ; +/* MGS req_format */ +extern struct req_format RQF_MGS_TARGET_REG; +extern struct req_format RQF_MGS_SET_INFO; +extern struct req_format RQF_MGS_CONFIG_READ; +/* fid/fld req_format */ +extern struct req_format RQF_SEQ_QUERY; +extern struct req_format RQF_FLD_QUERY; +extern struct req_format RQF_FLD_READ; +/* MDS req_format */ +extern struct req_format RQF_MDS_CONNECT; +extern struct req_format RQF_MDS_DISCONNECT; +extern struct req_format RQF_MDS_STATFS; +extern struct req_format RQF_MDS_STATFS_NEW; +extern struct req_format RQF_MDS_GET_ROOT; +extern struct req_format RQF_MDS_SYNC; +extern struct req_format RQF_MDS_GETXATTR; +extern struct req_format RQF_MDS_GETATTR; +extern struct req_format RQF_OUT_UPDATE; + +/* + * This is format of direct (non-intent) MDS_GETATTR_NAME request. 
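+ * (In contrast to RQF_LDLM_INTENT_GETATTR below, where the getattr
+ * is embedded in an LDLM intent lock enqueue.)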
+ */ +extern struct req_format RQF_MDS_GETATTR_NAME; +extern struct req_format RQF_MDS_CLOSE; +extern struct req_format RQF_MDS_CLOSE_INTENT; +extern struct req_format RQF_MDS_CONNECT; +extern struct req_format RQF_MDS_DISCONNECT; +extern struct req_format RQF_MDS_GET_INFO; +extern struct req_format RQF_MDS_READPAGE; +extern struct req_format RQF_MDS_REINT; +extern struct req_format RQF_MDS_REINT_CREATE; +extern struct req_format RQF_MDS_REINT_CREATE_ACL; +extern struct req_format RQF_MDS_REINT_CREATE_SLAVE; +extern struct req_format RQF_MDS_REINT_CREATE_SYM; +extern struct req_format RQF_MDS_REINT_OPEN; +extern struct req_format RQF_MDS_REINT_UNLINK; +extern struct req_format RQF_MDS_REINT_LINK; +extern struct req_format RQF_MDS_REINT_RENAME; +extern struct req_format RQF_MDS_REINT_SETATTR; +extern struct req_format RQF_MDS_REINT_SETXATTR; +extern struct req_format RQF_MDS_QUOTACTL; +extern struct req_format RQF_QUOTA_DQACQ; +extern struct req_format RQF_MDS_SWAP_LAYOUTS; +extern struct req_format RQF_MDS_REINT_MIGRATE; +extern struct req_format RQF_MDS_REINT_RESYNC; +extern struct req_format RQF_MDS_RMFID; +/* MDS hsm formats */ +extern struct req_format RQF_MDS_HSM_STATE_GET; +extern struct req_format RQF_MDS_HSM_STATE_SET; +extern struct req_format RQF_MDS_HSM_ACTION; +extern struct req_format RQF_MDS_HSM_PROGRESS; +extern struct req_format RQF_MDS_HSM_CT_REGISTER; +extern struct req_format RQF_MDS_HSM_CT_UNREGISTER; +extern struct req_format RQF_MDS_HSM_REQUEST; +/* OST req_format */ +extern struct req_format RQF_OST_CONNECT; +extern struct req_format RQF_OST_DISCONNECT; +extern struct req_format RQF_OST_QUOTACTL; +extern struct req_format RQF_OST_GETATTR; +extern struct req_format RQF_OST_SETATTR; +extern struct req_format RQF_OST_CREATE; +extern struct req_format RQF_OST_PUNCH; +extern struct req_format RQF_OST_FALLOCATE; +extern struct req_format RQF_OST_SYNC; +extern struct req_format RQF_OST_DESTROY; +extern struct req_format RQF_OST_BRW_READ; +extern struct req_format RQF_OST_BRW_WRITE; +extern struct req_format RQF_OST_STATFS; +extern struct req_format RQF_OST_SET_GRANT_INFO; +extern struct req_format RQF_OST_GET_INFO; +extern struct req_format RQF_OST_GET_INFO_LAST_ID; +extern struct req_format RQF_OST_GET_INFO_LAST_FID; +extern struct req_format RQF_OST_SET_INFO_LAST_FID; +extern struct req_format RQF_OST_GET_INFO_FIEMAP; +extern struct req_format RQF_OST_LADVISE; +extern struct req_format RQF_OST_SEEK; + +/* LDLM req_format */ +extern struct req_format RQF_LDLM_ENQUEUE; +extern struct req_format RQF_LDLM_ENQUEUE_LVB; +extern struct req_format RQF_LDLM_CONVERT; +extern struct req_format RQF_LDLM_INTENT; +extern struct req_format RQF_LDLM_INTENT_BASIC; +extern struct req_format RQF_LDLM_INTENT_LAYOUT; +extern struct req_format RQF_LDLM_INTENT_GETATTR; +extern struct req_format RQF_LDLM_INTENT_OPEN; +extern struct req_format RQF_LDLM_INTENT_CREATE; +extern struct req_format RQF_LDLM_INTENT_GETXATTR; +extern struct req_format RQF_LDLM_INTENT_QUOTA; +extern struct req_format RQF_LDLM_CANCEL; +extern struct req_format RQF_LDLM_CALLBACK; +extern struct req_format RQF_LDLM_CP_CALLBACK; +extern struct req_format RQF_LDLM_BL_CALLBACK; +extern struct req_format RQF_LDLM_GL_CALLBACK; +extern struct req_format RQF_LDLM_GL_CALLBACK_DESC; +/* LOG req_format */ +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_CREATE; +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK; +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK; +extern struct req_format 
RQF_LLOG_ORIGIN_HANDLE_READ_HEADER; + +extern struct req_format RQF_CONNECT; + +/* LFSCK req_format */ +extern struct req_format RQF_LFSCK_NOTIFY; +extern struct req_format RQF_LFSCK_QUERY; + +extern struct req_msg_field RMF_GENERIC_DATA; +extern struct req_msg_field RMF_PTLRPC_BODY; +extern struct req_msg_field RMF_MDT_BODY; +extern struct req_msg_field RMF_MDT_EPOCH; +extern struct req_msg_field RMF_OBD_STATFS; +extern struct req_msg_field RMF_NAME; +extern struct req_msg_field RMF_SYMTGT; +extern struct req_msg_field RMF_TGTUUID; +extern struct req_msg_field RMF_CLUUID; +extern struct req_msg_field RMF_SETINFO_VAL; +extern struct req_msg_field RMF_SETINFO_KEY; +extern struct req_msg_field RMF_GETINFO_VAL; +extern struct req_msg_field RMF_GETINFO_VALLEN; +extern struct req_msg_field RMF_GETINFO_KEY; +extern struct req_msg_field RMF_IDX_INFO; +extern struct req_msg_field RMF_CLOSE_DATA; +extern struct req_msg_field RMF_FILE_SECCTX_NAME; +extern struct req_msg_field RMF_FILE_SECCTX; +extern struct req_msg_field RMF_FID_ARRAY; +extern struct req_msg_field RMF_FILE_ENCCTX; + +/* + * connection handle received in MDS_CONNECT request. + */ +extern struct req_msg_field RMF_CONN; +extern struct req_msg_field RMF_CONNECT_DATA; +extern struct req_msg_field RMF_DLM_REQ; +extern struct req_msg_field RMF_DLM_REP; +extern struct req_msg_field RMF_DLM_LVB; +extern struct req_msg_field RMF_DLM_GL_DESC; +extern struct req_msg_field RMF_LDLM_INTENT; +extern struct req_msg_field RMF_LAYOUT_INTENT; +extern struct req_msg_field RMF_MDT_MD; +extern struct req_msg_field RMF_DEFAULT_MDT_MD; +extern struct req_msg_field RMF_REC_REINT; +extern struct req_msg_field RMF_EADATA; +extern struct req_msg_field RMF_EAVALS; +extern struct req_msg_field RMF_EAVALS_LENS; +extern struct req_msg_field RMF_ACL; +extern struct req_msg_field RMF_LOGCOOKIES; +extern struct req_msg_field RMF_CAPA1; +extern struct req_msg_field RMF_CAPA2; +extern struct req_msg_field RMF_OBD_QUOTACHECK; +extern struct req_msg_field RMF_OBD_QUOTACTL; +extern struct req_msg_field RMF_OBD_QUOTACTL_POOL; +extern struct req_msg_field RMF_QUOTA_BODY; +extern struct req_msg_field RMF_STRING; +extern struct req_msg_field RMF_SWAP_LAYOUTS; +extern struct req_msg_field RMF_MDS_HSM_PROGRESS; +extern struct req_msg_field RMF_MDS_HSM_REQUEST; +extern struct req_msg_field RMF_MDS_HSM_USER_ITEM; +extern struct req_msg_field RMF_MDS_HSM_ARCHIVE; +extern struct req_msg_field RMF_HSM_USER_STATE; +extern struct req_msg_field RMF_HSM_STATE_SET; +extern struct req_msg_field RMF_MDS_HSM_CURRENT_ACTION; +extern struct req_msg_field RMF_MDS_HSM_REQUEST; +extern struct req_msg_field RMF_SELINUX_POL; + +/* seq-mgr fields */ +extern struct req_msg_field RMF_SEQ_OPC; +extern struct req_msg_field RMF_SEQ_RANGE; +extern struct req_msg_field RMF_FID_SPACE; + +/* FLD fields */ +extern struct req_msg_field RMF_FLD_OPC; +extern struct req_msg_field RMF_FLD_MDFLD; + +extern struct req_msg_field RMF_LLOGD_BODY; +extern struct req_msg_field RMF_LLOG_LOG_HDR; +extern struct req_msg_field RMF_LLOGD_CONN_BODY; + +extern struct req_msg_field RMF_MGS_TARGET_INFO; +extern struct req_msg_field RMF_MGS_SEND_PARAM; + +extern struct req_msg_field RMF_OST_BODY; +extern struct req_msg_field RMF_OBD_IOOBJ; +extern struct req_msg_field RMF_OBD_ID; +extern struct req_msg_field RMF_FID; +extern struct req_msg_field RMF_NIOBUF_REMOTE; +extern struct req_msg_field RMF_NIOBUF_INLINE; +extern struct req_msg_field RMF_RCS; +extern struct req_msg_field RMF_FIEMAP_KEY; +extern struct req_msg_field 
RMF_FIEMAP_VAL; +extern struct req_msg_field RMF_OST_ID; +extern struct req_msg_field RMF_SHORT_IO; + +/* MGS config read message format */ +extern struct req_msg_field RMF_MGS_CONFIG_BODY; +extern struct req_msg_field RMF_MGS_CONFIG_RES; + +/* generic uint32 */ +extern struct req_msg_field RMF_U32; + +/* OBJ update format */ +extern struct req_msg_field RMF_OUT_UPDATE; +extern struct req_msg_field RMF_OUT_UPDATE_REPLY; +extern struct req_msg_field RMF_OUT_UPDATE_HEADER; +extern struct req_msg_field RMF_OUT_UPDATE_BUF; + +/* LFSCK format */ +extern struct req_msg_field RMF_LFSCK_REQUEST; +extern struct req_msg_field RMF_LFSCK_REPLY; + +extern struct req_msg_field RMF_OST_LADVISE_HDR; +extern struct req_msg_field RMF_OST_LADVISE; +/** @} req_layout */ + +#endif /* _LUSTRE_REQ_LAYOUT_H__ */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_scrub.h b/drivers/staging/lustrefsx/lustre/include/lustre_scrub.h new file mode 100644 index 0000000000000..16249a3a65f2e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_scrub.h @@ -0,0 +1,392 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, Intel Corporation. + */ +/* + * lustre/include/lustre_scrub.h + * + * Shared definitions and declarations for Lustre OI scrub. 
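+ *
+ * The OI (Object Index) maps each Lustre FID to the corresponding
+ * backend inode; OI scrub iterates a device in the background to
+ * verify those mappings and rebuild any found to be inconsistent.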
+ *
+ * Author: Fan Yong
+ */
+
+#ifndef _LUSTRE_SCRUB_H
+# define _LUSTRE_SCRUB_H
+
+#include
+#include
+#include
+
+#define OSD_OI_FID_OID_BITS_MAX 10
+#define OSD_OI_FID_NR_MAX (1UL << OSD_OI_FID_OID_BITS_MAX)
+#define SCRUB_OI_BITMAP_SIZE (OSD_OI_FID_NR_MAX >> 3)
+#define PFID_STRIPE_IDX_BITS 16
+#define PFID_STRIPE_COUNT_MASK ((1 << PFID_STRIPE_IDX_BITS) - 1)
+
+#define SCRUB_MAGIC_V1 0x4C5FD252
+#define SCRUB_MAGIC_V2 0x4C5FE253
+#define SCRUB_CHECKPOINT_INTERVAL 60
+#define SCRUB_WINDOW_SIZE 1024
+
+enum scrub_next_status {
+ /* exit current loop and process next group */
+ SCRUB_NEXT_BREAK = 1,
+
+ /* skip current object and process next bit */
+ SCRUB_NEXT_CONTINUE = 2,
+
+ /* exit all the loops */
+ SCRUB_NEXT_EXIT = 3,
+
+ /* wait for free cache slot */
+ SCRUB_NEXT_WAIT = 4,
+
+ /* simulate system crash during OI scrub */
+ SCRUB_NEXT_CRASH = 5,
+
+ /* simulate failure during OI scrub */
+ SCRUB_NEXT_FATAL = 6,
+
+ /* newly created object, no scrub on it */
+ SCRUB_NEXT_NOSCRUB = 7,
+
+ /* the object has no FID-in-LMA */
+ SCRUB_NEXT_NOLMA = 8,
+
+ /* for OST-object */
+ SCRUB_NEXT_OSTOBJ = 9,
+
+ /* old OST-object, no LMA or no FID-on-OST flags in LMA */
+ SCRUB_NEXT_OSTOBJ_OLD = 10,
+};
+
+enum scrub_local_file_flags {
+ SLFF_SCAN_SUBITEMS = 0x0001,
+ SLFF_HIDE_FID = 0x0002,
+ SLFF_SHOW_NAME = 0x0004,
+ SLFF_NO_OI = 0x0008,
+ SLFF_IDX_IN_FID = 0x0010,
+};
+
+enum scrub_status {
+ /* The scrub file is newly created, for a new MDT, upgrading from an
+ * old disk, or re-creating the scrub file manually. */
+ SS_INIT = 0,
+
+ /* The scrub is checking/repairing the OI files. */
+ SS_SCANNING = 1,
+
+ /* The scrub checked/repaired the OI files successfully. */
+ SS_COMPLETED = 2,
+
+ /* The scrub failed to check/repair the OI files. */
+ SS_FAILED = 3,
+
+ /* The scrub is stopped manually, the OI files may be inconsistent. */
+ SS_STOPPED = 4,
+
+ /* The scrub is paused automatically when umount. */
+ SS_PAUSED = 5,
+
+ /* The scrub crashed during the scanning, should be restarted. */
+ SS_CRASHED = 6,
+};
+
+enum scrub_flags {
+ /* OI files have been recreated, OI mappings should be re-inserted. */
+ SF_RECREATED = 0x0000000000000001ULL,
+
+ /* OI files are invalid, should be rebuilt ASAP */
+ SF_INCONSISTENT = 0x0000000000000002ULL,
+
+ /* OI scrub is triggered automatically. */
+ SF_AUTO = 0x0000000000000004ULL,
+
+ /* The device is upgraded from 1.8 format. */
+ SF_UPGRADE = 0x0000000000000008ULL,
+};
+
+enum scrub_param {
+ /* Exit on failure. */
+ SP_FAILOUT = 0x0001,
+
+ /* Check only without repairing. */
+ SP_DRYRUN = 0x0002,
+};
+
+enum scrub_start {
+ /* Set failout flag. */
+ SS_SET_FAILOUT = 0x00000001,
+
+ /* Clear failout flag. */
+ SS_CLEAR_FAILOUT = 0x00000002,
+
+ /* Reset scrub start position. */
+ SS_RESET = 0x00000004,
+
+ /* Trigger full scrub automatically. */
+ SS_AUTO_FULL = 0x00000008,
+
+ /* Trigger partial scrub automatically. */
+ SS_AUTO_PARTIAL = 0x00000010,
+
+ /* Set dryrun flag. */
+ SS_SET_DRYRUN = 0x00000020,
+
+ /* Clear dryrun flag. */
+ SS_CLEAR_DRYRUN = 0x00000040,
+};
+
+enum osd_lf_flags {
+ OLF_SCAN_SUBITEMS = 0x0001,
+ OLF_HIDE_FID = 0x0002,
+ OLF_SHOW_NAME = 0x0004,
+ OLF_NO_OI = 0x0008,
+ OLF_IDX_IN_FID = 0x0010,
+ OLF_NOT_BACKUP = 0x0020,
+};
+
+/* There is some overhead in detecting OI inconsistency automatically
+ * during normal RPC handling. We do not want to always auto detect
+ * OI inconsistency, especially when an OI scrub completed recently.
+ * + * The 'auto_scrub' defines the time (united as second) interval to + * enable auto detect OI inconsistency since last OI scurb done. */ +enum auto_scrub { + /* Disable auto scrub. */ + AS_NEVER = 0, + + /* 1 second is too short interval, it is almost equal to always auto + * detect inconsistent OI, usually used for test. */ + AS_ALWAYS = 1, + + /* Enable auto detect OI inconsistency one month (60 * 60 * 24 * 30) + * after last OI scrub. */ + AS_DEFAULT = 2592000LL, +}; + +struct scrub_file { + /* 128-bit uuid for volume. */ + uuid_t sf_uuid; + + /* See 'enum scrub_flags'. */ + __u64 sf_flags; + + /* The scrub magic. */ + __u32 sf_magic; + + /* See 'enum scrub_status'. */ + __u16 sf_status; + + /* See 'enum scrub_param'. */ + __u16 sf_param; + + /* The time for the last OI scrub completed. */ + time64_t sf_time_last_complete; + + /* The ttime for the latest OI scrub ran. */ + time64_t sf_time_latest_start; + + /* The time for the last OI scrub checkpoint. */ + time64_t sf_time_last_checkpoint; + + /* The position for the latest OI scrub started from. */ + __u64 sf_pos_latest_start; + + /* The position for the last OI scrub checkpoint. */ + __u64 sf_pos_last_checkpoint; + + /* The position for the first should be updated object. */ + __u64 sf_pos_first_inconsistent; + + /* How many objects have been checked. */ + __u64 sf_items_checked; + + /* How many objects have been updated. */ + __u64 sf_items_updated; + + /* How many objects failed to be processed. */ + __u64 sf_items_failed; + + /* How many prior objects have been updated during scanning. */ + __u64 sf_items_updated_prior; + + /* How many objects marked as LDISKFS_STATE_LUSTRE_NOSCRUB. */ + __u64 sf_items_noscrub; + + /* How many IGIF objects. */ + __u64 sf_items_igif; + + /* How long the OI scrub has run in seconds. Do NOT change + * to time64_t since this breaks backwards compatibility. + * It shouldn't take more than 136 years to complete :-) + */ + s32 sf_run_time; + + /* How many completed OI scrub ran on the device. */ + __u32 sf_success_count; + + /* How many OI files. */ + __u16 sf_oi_count; + + /* Keep the flags after scrub reset. See 'enum scrub_internal_flags' */ + __u16 sf_internal_flags; + + __u32 sf_reserved_1; + __u64 sf_reserved_2[16]; + + /* Bitmap for OI files recreated case. */ + __u8 sf_oi_bitmap[SCRUB_OI_BITMAP_SIZE]; +}; + +struct lustre_scrub { + /* Object for the scrub file. */ + struct dt_object *os_obj; + + struct task_struct *os_task; + struct list_head os_inconsistent_items; + + /* write lock for scrub prep/update/post/checkpoint, + * read lock for scrub dump. */ + struct rw_semaphore os_rwsem; + spinlock_t os_lock; + + /* Scrub file in memory. */ + struct scrub_file os_file; + + /* Buffer for scrub file load/store. */ + struct scrub_file os_file_disk; + + const char *os_name; + + /* The time for last checkpoint, seconds */ + time64_t os_time_last_checkpoint; + + /* The time for next checkpoint, seconds */ + time64_t os_time_next_checkpoint; + + /* How long to wait to start scrubbing */ + time64_t os_auto_scrub_interval; + + /* How many objects have been checked since last checkpoint. */ + __u64 os_new_checked; + __u64 os_pos_current; + __u32 os_start_flags; + /* Some of these bits can be set by different threads so + * all updates must be protected by ->os_lock to avoid + * racing read-modify-write cycles causing corruption. + */ + unsigned int os_in_prior:1, /* process inconsistent item + * found by RPC prior */ + os_waiting:1, /* Waiting for scan window. 
*/ + os_full_speed:1, /* run w/o speed limit */ + os_paused:1, /* The scrub is paused. */ + os_convert_igif:1, + os_partial_scan:1, + os_in_join:1, + os_running:1, /* scrub thread is running */ + os_full_scrub:1, + os_has_ml_file:1; +}; + +#define INDEX_BACKUP_MAGIC_V1 0x1E41F208 +#define INDEX_BACKUP_BUFSIZE (4096 * 4) + +enum lustre_index_backup_policy { + /* By default, do not backup the index */ + LIBP_NONE = 0, + + /* Backup the dirty index objects when umount */ + LIBP_AUTO = 1, +}; + +struct lustre_index_backup_header { + __u32 libh_magic; + __u32 libh_count; + __u32 libh_keysize; + __u32 libh_recsize; + struct lu_fid libh_owner; + __u64 libh_pad[60]; /* keep header 512 bytes aligned */ +}; + +struct lustre_index_backup_unit { + struct list_head libu_link; + struct lu_fid libu_fid; + __u32 libu_keysize; + __u32 libu_recsize; +}; + +struct lustre_index_restore_unit { + struct list_head liru_link; + struct lu_fid liru_pfid; + struct lu_fid liru_cfid; + __u64 liru_clid; + int liru_len; + char liru_name[0]; +}; + +void scrub_file_init(struct lustre_scrub *scrub, uuid_t uuid); +void scrub_file_reset(struct lustre_scrub *scrub, uuid_t uuid, u64 flags); +int scrub_file_load(const struct lu_env *env, struct lustre_scrub *scrub); +int scrub_file_store(const struct lu_env *env, struct lustre_scrub *scrub); +bool scrub_needs_check(struct lustre_scrub *scrub, const struct lu_fid *fid, + u64 index); +int scrub_checkpoint(const struct lu_env *env, struct lustre_scrub *scrub); +int scrub_thread_prep(const struct lu_env *env, struct lustre_scrub *scrub, + uuid_t uuid, u64 start); +int scrub_thread_post(const struct lu_env *env, struct lustre_scrub *scrub, + int result); +int scrub_start(int (*threadfn)(void *data), struct lustre_scrub *scrub, + void *data, __u32 flags); +void scrub_stop(struct lustre_scrub *scrub); +void scrub_dump(struct seq_file *m, struct lustre_scrub *scrub); + +int lustre_liru_new(struct list_head *head, const struct lu_fid *pfid, + const struct lu_fid *cfid, __u64 child, + const char *name, int namelen); + +int lustre_index_register(struct dt_device *dev, const char *devname, + struct list_head *head, spinlock_t *lock, int *guard, + const struct lu_fid *fid, + __u32 keysize, __u32 recsize); + +void lustre_index_backup(const struct lu_env *env, struct dt_device *dev, + const char *devname, struct list_head *head, + spinlock_t *lock, int *guard, bool backup); +int lustre_index_restore(const struct lu_env *env, struct dt_device *dev, + const struct lu_fid *parent_fid, + const struct lu_fid *tgt_fid, + const struct lu_fid *bak_fid, const char *name, + struct list_head *head, spinlock_t *lock, + char *buf, int bufsize); + +static inline void lustre_fid2lbx(char *buf, const struct lu_fid *fid, int len) +{ + snprintf(buf, len, DFID_NOBRACE".lbx", PFID(fid)); +} + +static inline const char *osd_scrub2name(struct lustre_scrub *scrub) +{ + return scrub->os_name; +} +#endif /* _LUSTRE_SCRUB_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_sec.h b/drivers/staging/lustrefsx/lustre/include/lustre_sec.h new file mode 100644 index 0000000000000..831d35183247f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_sec.h @@ -0,0 +1,1208 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef _LUSTRE_SEC_H_ +#define _LUSTRE_SEC_H_ + +/** \defgroup sptlrpc sptlrpc + * + * @{ + */ + +/* + * to avoid include + */ +struct obd_import; +struct obd_export; +struct ptlrpc_request; +struct ptlrpc_reply_state; +struct ptlrpc_bulk_desc; +struct brw_page; +struct lu_env; +/* Linux specific */ +struct key; +struct seq_file; +struct lustre_cfg; + +/* + * forward declaration + */ +struct ptlrpc_sec_policy; +struct ptlrpc_sec_cops; +struct ptlrpc_sec_sops; +struct ptlrpc_sec; +struct ptlrpc_svc_ctx; +struct ptlrpc_cli_ctx; +struct ptlrpc_ctx_ops; +struct req_msg_field; + +/** + * \addtogroup flavor flavor + * + * RPC flavor is represented by a 32 bits integer. Currently the high 12 bits + * are unused, must be set to 0 for future expansion. + *
+ * ------------------------------------------------------------------------
+ * | 4b (bulk svc) | 4b (bulk type) | 4b (svc) | 4b (mech)  | 4b (policy) |
+ * ------------------------------------------------------------------------
+ * 
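+ * For example, the krb5i flavor defined below is
+ * MAKE_FLVR(SPTLRPC_POLICY_GSS, SPTLRPC_MECH_GSS_KRB5,
+ * SPTLRPC_SVC_INTG, SPTLRPC_BULK_DEFAULT, SPTLRPC_BULK_SVC_INTG),
+ * i.e. (2 << 0) | (1 << 4) | (2 << 8) | (0 << 12) | (2 << 16)
+ * == 0x00020212, and SPTLRPC_FLVR_SVC(0x00020212) extracts
+ * SPTLRPC_SVC_INTG (2) again.
+ *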
+ * + * @{ + */ + +/* + * flavor constants + */ +enum sptlrpc_policy { + SPTLRPC_POLICY_NULL = 0, + SPTLRPC_POLICY_PLAIN = 1, + SPTLRPC_POLICY_GSS = 2, + SPTLRPC_POLICY_MAX, +}; + +enum sptlrpc_mech_null { + SPTLRPC_MECH_NULL = 0, + SPTLRPC_MECH_NULL_MAX, +}; + +enum sptlrpc_mech_plain { + SPTLRPC_MECH_PLAIN = 0, + SPTLRPC_MECH_PLAIN_MAX, +}; + +enum sptlrpc_mech_gss { + SPTLRPC_MECH_GSS_NULL = 0, + SPTLRPC_MECH_GSS_KRB5 = 1, + SPTLRPC_MECH_GSS_SK = 2, + SPTLRPC_MECH_GSS_MAX, +}; + +enum sptlrpc_service_type { + SPTLRPC_SVC_NULL = 0, /**< no security */ + SPTLRPC_SVC_AUTH = 1, /**< authentication only */ + SPTLRPC_SVC_INTG = 2, /**< integrity */ + SPTLRPC_SVC_PRIV = 3, /**< privacy */ + SPTLRPC_SVC_MAX, +}; + +enum sptlrpc_bulk_type { + SPTLRPC_BULK_DEFAULT = 0, /**< follow rpc flavor */ + SPTLRPC_BULK_HASH = 1, /**< hash integrity */ + SPTLRPC_BULK_MAX, +}; + +enum sptlrpc_bulk_service { + SPTLRPC_BULK_SVC_NULL = 0, /**< no security */ + SPTLRPC_BULK_SVC_AUTH = 1, /**< authentication only */ + SPTLRPC_BULK_SVC_INTG = 2, /**< integrity */ + SPTLRPC_BULK_SVC_PRIV = 3, /**< privacy */ + SPTLRPC_BULK_SVC_MAX, +}; + +/* + * compose/extract macros + */ +#define FLVR_POLICY_OFFSET (0) +#define FLVR_MECH_OFFSET (4) +#define FLVR_SVC_OFFSET (8) +#define FLVR_BULK_TYPE_OFFSET (12) +#define FLVR_BULK_SVC_OFFSET (16) + +#define MAKE_FLVR(policy, mech, svc, btype, bsvc) \ + (((__u32)(policy) << FLVR_POLICY_OFFSET) | \ + ((__u32)(mech) << FLVR_MECH_OFFSET) | \ + ((__u32)(svc) << FLVR_SVC_OFFSET) | \ + ((__u32)(btype) << FLVR_BULK_TYPE_OFFSET) | \ + ((__u32)(bsvc) << FLVR_BULK_SVC_OFFSET)) + +/* + * extraction + */ +#define SPTLRPC_FLVR_POLICY(flavor) \ + ((((__u32)(flavor)) >> FLVR_POLICY_OFFSET) & 0xF) +#define SPTLRPC_FLVR_MECH(flavor) \ + ((((__u32)(flavor)) >> FLVR_MECH_OFFSET) & 0xF) +#define SPTLRPC_FLVR_SVC(flavor) \ + ((((__u32)(flavor)) >> FLVR_SVC_OFFSET) & 0xF) +#define SPTLRPC_FLVR_BULK_TYPE(flavor) \ + ((((__u32)(flavor)) >> FLVR_BULK_TYPE_OFFSET) & 0xF) +#define SPTLRPC_FLVR_BULK_SVC(flavor) \ + ((((__u32)(flavor)) >> FLVR_BULK_SVC_OFFSET) & 0xF) + +#define SPTLRPC_FLVR_BASE(flavor) \ + ((((__u32)(flavor)) >> FLVR_POLICY_OFFSET) & 0xFFF) +#define SPTLRPC_FLVR_BASE_SUB(flavor) \ + ((((__u32)(flavor)) >> FLVR_MECH_OFFSET) & 0xFF) + +/* + * gss subflavors + */ +#define MAKE_BASE_SUBFLVR(mech, svc) \ + ((__u32)(mech) | \ + ((__u32)(svc) << (FLVR_SVC_OFFSET - FLVR_MECH_OFFSET))) + +#define SPTLRPC_SUBFLVR_GSSNULL \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_NULL, SPTLRPC_SVC_NULL) +#define SPTLRPC_SUBFLVR_KRB5N \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_NULL) +#define SPTLRPC_SUBFLVR_KRB5A \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_AUTH) +#define SPTLRPC_SUBFLVR_KRB5I \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_INTG) +#define SPTLRPC_SUBFLVR_KRB5P \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_PRIV) +#define SPTLRPC_SUBFLVR_SKN \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_SK, SPTLRPC_SVC_NULL) +#define SPTLRPC_SUBFLVR_SKA \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_SK, SPTLRPC_SVC_AUTH) +#define SPTLRPC_SUBFLVR_SKI \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_SK, SPTLRPC_SVC_INTG) +#define SPTLRPC_SUBFLVR_SKPI \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_SK, SPTLRPC_SVC_PRIV) + +/* + * "end user" flavors + */ +#define SPTLRPC_FLVR_NULL \ + MAKE_FLVR(SPTLRPC_POLICY_NULL, \ + SPTLRPC_MECH_NULL, \ + SPTLRPC_SVC_NULL, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_PLAIN \ + MAKE_FLVR(SPTLRPC_POLICY_PLAIN, \ + SPTLRPC_MECH_PLAIN, \ + 
SPTLRPC_SVC_NULL, \ + SPTLRPC_BULK_HASH, \ + SPTLRPC_BULK_SVC_INTG) +#define SPTLRPC_FLVR_GSSNULL \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_NULL, \ + SPTLRPC_SVC_NULL, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_KRB5N \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + SPTLRPC_SVC_NULL, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_KRB5A \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + SPTLRPC_SVC_AUTH, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_KRB5I \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + SPTLRPC_SVC_INTG, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_INTG) +#define SPTLRPC_FLVR_KRB5P \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + SPTLRPC_SVC_PRIV, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_PRIV) +#define SPTLRPC_FLVR_SKN \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_SK, \ + SPTLRPC_SVC_NULL, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_SKA \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_SK, \ + SPTLRPC_SVC_AUTH, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_SKI \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_SK, \ + SPTLRPC_SVC_INTG, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_INTG) +#define SPTLRPC_FLVR_SKPI \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_SK, \ + SPTLRPC_SVC_PRIV, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_PRIV) + +#define SPTLRPC_FLVR_DEFAULT SPTLRPC_FLVR_NULL + +#define SPTLRPC_FLVR_INVALID ((__u32) 0xFFFFFFFF) +#define SPTLRPC_FLVR_ANY ((__u32) 0xFFF00000) + +/** + * extract the useful part from wire flavor + */ +#define WIRE_FLVR(wflvr) (((__u32) (wflvr)) & 0x000FFFFF) + +/** @} flavor */ + +static inline void flvr_set_svc(__u32 *flvr, __u32 svc) +{ + LASSERT(svc < SPTLRPC_SVC_MAX); + *flvr = MAKE_FLVR(SPTLRPC_FLVR_POLICY(*flvr), + SPTLRPC_FLVR_MECH(*flvr), + svc, + SPTLRPC_FLVR_BULK_TYPE(*flvr), + SPTLRPC_FLVR_BULK_SVC(*flvr)); +} + +static inline void flvr_set_bulk_svc(__u32 *flvr, __u32 svc) +{ + LASSERT(svc < SPTLRPC_BULK_SVC_MAX); + *flvr = MAKE_FLVR(SPTLRPC_FLVR_POLICY(*flvr), + SPTLRPC_FLVR_MECH(*flvr), + SPTLRPC_FLVR_SVC(*flvr), + SPTLRPC_FLVR_BULK_TYPE(*flvr), + svc); +} + +struct bulk_spec_hash { + __u8 hash_alg; +}; + +/** + * Full description of flavors being used on a ptlrpc connection, include + * both regular RPC and bulk transfer parts. + */ +struct sptlrpc_flavor { + /** + * wire flavor, should be renamed to sf_wire. + */ + __u32 sf_rpc; + /** + * general flags of PTLRPC_SEC_FL_* + */ + __u32 sf_flags; + /** + * rpc flavor specification + */ + union { + /* nothing for now */ + } u_rpc; + /** + * bulk flavor specification + */ + union { + struct bulk_spec_hash hash; + } u_bulk; +}; + +/** + * identify the RPC is generated from what part of Lustre. It's encoded into + * RPC requests and to be checked by ptlrpc service. + */ +enum lustre_sec_part { + LUSTRE_SP_CLI = 0, + LUSTRE_SP_MDT, + LUSTRE_SP_OST, + LUSTRE_SP_MGC, + LUSTRE_SP_MGS, + LUSTRE_SP_ANY = 0xFF +}; + +const char *sptlrpc_part2name(enum lustre_sec_part sp); +enum lustre_sec_part sptlrpc_target_sec_part(struct obd_device *obd); + +/** + * A rule specifies a flavor to be used by a ptlrpc connection between + * two Lustre parts. 
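+ *
+ * For example, a rule string such as "tcp0.cli2mdt=krb5i" (a sketch of
+ * the syntax accepted by sptlrpc_parse_rule() below; see the parser for
+ * the exact grammar) yields sr_netid naming the tcp0 network,
+ * sr_from == LUSTRE_SP_CLI, sr_to == LUSTRE_SP_MDT, and sr_flvr
+ * describing the krb5i flavor.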
+ */ +struct sptlrpc_rule { + __u32 sr_netid; /* LNET network ID */ + __u8 sr_from; /* sec_part */ + __u8 sr_to; /* sec_part */ + __u16 sr_padding; + struct sptlrpc_flavor sr_flvr; +}; + +/** + * A set of rules in memory. + * + * Rules are generated and stored on MGS, and propagated to MDT, OST, + * and client when needed. + */ +struct sptlrpc_rule_set { + int srs_nslot; + int srs_nrule; + struct sptlrpc_rule *srs_rules; +}; + +int sptlrpc_parse_flavor(const char *str, struct sptlrpc_flavor *flvr); +int sptlrpc_flavor_has_bulk(struct sptlrpc_flavor *flvr); + +static inline void sptlrpc_rule_set_init(struct sptlrpc_rule_set *set) +{ + memset(set, 0, sizeof(*set)); +} + +void sptlrpc_rule_set_free(struct sptlrpc_rule_set *set); +int sptlrpc_rule_set_expand(struct sptlrpc_rule_set *set); +int sptlrpc_rule_set_merge(struct sptlrpc_rule_set *set, + struct sptlrpc_rule *rule); +int sptlrpc_rule_set_choose(struct sptlrpc_rule_set *rset, + enum lustre_sec_part from, + enum lustre_sec_part to, + lnet_nid_t nid, + struct sptlrpc_flavor *sf); +void sptlrpc_rule_set_dump(struct sptlrpc_rule_set *set); + +int sptlrpc_process_config(struct lustre_cfg *lcfg); +void sptlrpc_conf_log_start(const char *logname); +void sptlrpc_conf_log_stop(const char *logname); +void sptlrpc_conf_log_update_begin(const char *logname); +void sptlrpc_conf_log_update_end(const char *logname); +void sptlrpc_conf_client_adapt(struct obd_device *obd); +int sptlrpc_conf_target_get_rules(struct obd_device *obd, + struct sptlrpc_rule_set *rset); +void sptlrpc_target_choose_flavor(struct sptlrpc_rule_set *rset, + enum lustre_sec_part from, + lnet_nid_t nid, + struct sptlrpc_flavor *flavor); + +/* The maximum length of security payload. 1024 is enough for Kerberos 5, + * and should be enough for other future mechanisms but not sure. + * Only used by pre-allocated request/reply pool. + */ +#define SPTLRPC_MAX_PAYLOAD (1024) + + +struct vfs_cred { + uint32_t vc_uid; + uint32_t vc_gid; +}; + +struct ptlrpc_ctx_ops { + /** + * To determine whether it's suitable to use the \a ctx for \a vcred. + */ + int (*match) (struct ptlrpc_cli_ctx *ctx, + struct vfs_cred *vcred); + + /** + * To bring the \a ctx uptodate. + */ + int (*refresh) (struct ptlrpc_cli_ctx *ctx); + + /** + * Validate the \a ctx. + */ + int (*validate) (struct ptlrpc_cli_ctx *ctx); + + /** + * Force the \a ctx to die. + */ + void (*die) (struct ptlrpc_cli_ctx *ctx, + int grace); + int (*display) (struct ptlrpc_cli_ctx *ctx, + char *buf, int bufsize); + + /** + * Sign the request message using \a ctx. + * + * \pre req->rq_reqmsg point to request message. + * \pre req->rq_reqlen is the request message length. + * \post req->rq_reqbuf point to request message with signature. + * \post req->rq_reqdata_len is set to the final request message size. + * + * \see null_ctx_sign(), plain_ctx_sign(), gss_cli_ctx_sign(). + */ + int (*sign) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req); + + /** + * Verify the reply message using \a ctx. + * + * \pre req->rq_repdata point to reply message with signature. + * \pre req->rq_repdata_len is the total reply message length. + * \post req->rq_repmsg point to reply message without signature. + * \post req->rq_replen is the reply message length. + * + * \see null_ctx_verify(), plain_ctx_verify(), gss_cli_ctx_verify(). + */ + int (*verify) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req); + + /** + * Encrypt the request message using \a ctx. + * + * \pre req->rq_reqmsg point to request message in clear text. 
+ * \pre req->rq_reqlen is the request message length. + * \post req->rq_reqbuf point to request message. + * \post req->rq_reqdata_len is set to the final request message size. + * + * \see gss_cli_ctx_seal(). + */ + int (*seal) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req); + + /** + * Decrypt the reply message using \a ctx. + * + * \pre req->rq_repdata point to encrypted reply message. + * \pre req->rq_repdata_len is the total cipher text length. + * \post req->rq_repmsg point to reply message in clear text. + * \post req->rq_replen is the reply message length in clear text. + * + * \see gss_cli_ctx_unseal(). + */ + int (*unseal) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req); + + /** + * Wrap bulk request data. This is called before wrapping RPC + * request message. + * + * \pre bulk buffer is descripted by desc->bd_iov and + * desc->bd_iov_count. note for read it's just buffer, no data + * need to be sent; for write it contains data in clear text. + * \post when necessary, ptlrpc_bulk_sec_desc was properly prepared + * (usually inside of RPC request message). + * - encryption: cipher text bulk buffer is descripted by + * desc->bd_enc_iov and desc->bd_iov_count (currently assume iov + * count remains the same). + * - otherwise: bulk buffer is still desc->bd_iov and + * desc->bd_iov_count. + * + * \return 0: success. + * \return -ev: error code. + * + * \see plain_cli_wrap_bulk(), gss_cli_ctx_wrap_bulk(). + */ + int (*wrap_bulk) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); + + /** + * Unwrap bulk reply data. This is called after wrapping RPC + * reply message. + * + * \pre bulk buffer is descripted by desc->bd_iov/desc->bd_enc_iov and + * desc->bd_iov_count, according to wrap_bulk(). + * \post final bulk data in clear text is placed in buffer described + * by desc->bd_iov and desc->bd_iov_count. + * \return +ve nob of actual bulk data in clear text. + * \return -ve error code. + * + * \see plain_cli_unwrap_bulk(), gss_cli_ctx_unwrap_bulk(). + */ + int (*unwrap_bulk) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +}; + +#define PTLRPC_CTX_NEW_BIT (0) /* newly created */ +#define PTLRPC_CTX_UPTODATE_BIT (1) /* uptodate */ +#define PTLRPC_CTX_DEAD_BIT (2) /* mark expired gracefully */ +#define PTLRPC_CTX_ERROR_BIT (3) /* fatal error (refresh, etc.) */ +#define PTLRPC_CTX_CACHED_BIT (8) /* in ctx cache (hash etc.) */ +#define PTLRPC_CTX_ETERNAL_BIT (9) /* always valid */ + +#define PTLRPC_CTX_NEW BIT(PTLRPC_CTX_NEW_BIT) +#define PTLRPC_CTX_UPTODATE BIT(PTLRPC_CTX_UPTODATE_BIT) +#define PTLRPC_CTX_DEAD BIT(PTLRPC_CTX_DEAD_BIT) +#define PTLRPC_CTX_ERROR BIT(PTLRPC_CTX_ERROR_BIT) +#define PTLRPC_CTX_CACHED BIT(PTLRPC_CTX_CACHED_BIT) +#define PTLRPC_CTX_ETERNAL BIT(PTLRPC_CTX_ETERNAL_BIT) + +#define PTLRPC_CTX_STATUS_MASK (PTLRPC_CTX_NEW_BIT | \ + PTLRPC_CTX_UPTODATE | \ + PTLRPC_CTX_DEAD | \ + PTLRPC_CTX_ERROR) + +struct ptlrpc_cli_ctx { + struct hlist_node cc_cache; /* linked into ctx cache */ + atomic_t cc_refcount; + struct ptlrpc_sec *cc_sec; + struct ptlrpc_ctx_ops *cc_ops; + time64_t cc_expire; /* in seconds */ + unsigned int cc_early_expire:1; + unsigned long cc_flags; + struct vfs_cred cc_vcred; + spinlock_t cc_lock; + struct list_head cc_req_list; /* waiting reqs linked here */ + struct list_head cc_gc_chain; /* linked to gc chain */ +}; + +/** + * client side policy operation vector. 
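+ *
+ * A policy module provides one of these (plus a struct ptlrpc_sec_sops
+ * for the server side), wires both into a struct ptlrpc_sec_policy
+ * (defined further below) and registers it with
+ * sptlrpc_register_policy(). A minimal sketch, with illustrative names;
+ * sp_policy must be one of enum sptlrpc_policy:
+ *
+ *	static struct ptlrpc_sec_policy my_policy = {
+ *		.sp_owner  = THIS_MODULE,
+ *		.sp_name   = "mypolicy",
+ *		.sp_policy = SPTLRPC_POLICY_PLAIN,
+ *		.sp_cops   = &my_cops,
+ *		.sp_sops   = &my_sops,
+ *	};
+ *
+ *	rc = sptlrpc_register_policy(&my_policy);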
+ */ +struct ptlrpc_sec_cops { + /** + * Given an \a imp, create and initialize a ptlrpc_sec structure. + * \param ctx service context: + * - regular import: \a ctx should be NULL; + * - reverse import: \a ctx is obtained from incoming request. + * \param flavor specify what flavor to use. + * + * When necessary, policy module is responsible for taking reference + * on the import. + * + * \see null_create_sec(), plain_create_sec(), gss_sec_create_kr(). + */ + struct ptlrpc_sec * (*create_sec) (struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx, + struct sptlrpc_flavor *flavor); + + /** + * Destructor of ptlrpc_sec. When called, refcount has been dropped + * to 0 and all contexts has been destroyed. + * + * \see null_destroy_sec(), plain_destroy_sec(), gss_sec_destroy_kr(). + */ + void (*destroy_sec) (struct ptlrpc_sec *sec); + + /** + * Notify that this ptlrpc_sec is going to die. Optionally, policy + * module is supposed to set sec->ps_dying and whatever necessary + * actions. + * + * \see plain_kill_sec(), gss_sec_kill(). + */ + void (*kill_sec) (struct ptlrpc_sec *sec); + + /** + * Given \a vcred, lookup and/or create its context. The policy module + * is supposed to maintain its own context cache. + * XXX currently \a create and \a remove_dead is always 1, perhaps + * should be removed completely. + * + * \see null_lookup_ctx(), plain_lookup_ctx(), gss_sec_lookup_ctx_kr(). + */ + struct ptlrpc_cli_ctx * (*lookup_ctx) (struct ptlrpc_sec *sec, + struct vfs_cred *vcred, + int create, + int remove_dead); + + /** + * Called then the reference of \a ctx dropped to 0. The policy module + * is supposed to destroy this context or whatever else according to + * its cache maintainance mechamism. + * + * \param sync if zero, we shouldn't wait for the context being + * destroyed completely. + * + * \see plain_release_ctx(), gss_sec_release_ctx_kr(). + */ + void (*release_ctx) (struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx, + int sync); + + /** + * Flush the context cache. + * + * \param uid context of which user, -1 means all contexts. + * \param grace if zero, the PTLRPC_CTX_UPTODATE_BIT of affected + * contexts should be cleared immediately. + * \param force if zero, only idle contexts will be flushed. + * + * \see plain_flush_ctx_cache(), gss_sec_flush_ctx_cache_kr(). + */ + int (*flush_ctx_cache) + (struct ptlrpc_sec *sec, + uid_t uid, + int grace, + int force); + + /** + * Called periodically by garbage collector to remove dead contexts + * from cache. + * + * \see gss_sec_gc_ctx_kr(). + */ + void (*gc_ctx) (struct ptlrpc_sec *sec); + + /** + * Given an context \a ctx, install a corresponding reverse service + * context on client side. + * XXX currently it's only used by GSS module, maybe we should remove + * this from general API. + */ + int (*install_rctx)(struct obd_import *imp, + struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx); + + /** + * To allocate request buffer for \a req. + * + * \pre req->rq_reqmsg == NULL. + * \pre req->rq_reqbuf == NULL, otherwise it must be pre-allocated, + * we are not supposed to free it. + * \post if success, req->rq_reqmsg point to a buffer with size + * at least \a lustre_msg_size. + * + * \see null_alloc_reqbuf(), plain_alloc_reqbuf(), gss_alloc_reqbuf(). + */ + int (*alloc_reqbuf)(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int lustre_msg_size); + + /** + * To free request buffer for \a req. + * + * \pre req->rq_reqbuf != NULL. + * + * \see null_free_reqbuf(), plain_free_reqbuf(), gss_free_reqbuf(). 
+ */ + void (*free_reqbuf) (struct ptlrpc_sec *sec, + struct ptlrpc_request *req); + + /** + * To allocate reply buffer for \a req. + * + * \pre req->rq_repbuf == NULL. + * \post if success, req->rq_repbuf point to a buffer with size + * req->rq_repbuf_len, the size should be large enough to receive + * reply which be transformed from \a lustre_msg_size of clear text. + * + * \see null_alloc_repbuf(), plain_alloc_repbuf(), gss_alloc_repbuf(). + */ + int (*alloc_repbuf)(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int lustre_msg_size); + + /** + * To free reply buffer for \a req. + * + * \pre req->rq_repbuf != NULL. + * \post req->rq_repbuf == NULL. + * \post req->rq_repbuf_len == 0. + * + * \see null_free_repbuf(), plain_free_repbuf(), gss_free_repbuf(). + */ + void (*free_repbuf) (struct ptlrpc_sec *sec, + struct ptlrpc_request *req); + + /** + * To expand the request buffer of \a req, thus the \a segment in + * the request message pointed by req->rq_reqmsg can accommodate + * at least \a newsize of data. + * + * \pre req->rq_reqmsg->lm_buflens[segment] < newsize. + * + * \see null_enlarge_reqbuf(), plain_enlarge_reqbuf(), + * gss_enlarge_reqbuf(). + */ + int (*enlarge_reqbuf) + (struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int segment, int newsize); + /* + * misc + */ + int (*display) (struct ptlrpc_sec *sec, + struct seq_file *seq); +}; + +/** + * server side policy operation vector. + */ +struct ptlrpc_sec_sops { + /** + * verify an incoming request. + * + * \pre request message is pointed by req->rq_reqbuf, size is + * req->rq_reqdata_len; and the message has been unpacked to + * host byte order. + * + * \retval SECSVC_OK success, req->rq_reqmsg point to request message + * in clear text, size is req->rq_reqlen; req->rq_svc_ctx is set; + * req->rq_sp_from is decoded from request. + * \retval SECSVC_COMPLETE success, the request has been fully + * processed, and reply message has been prepared; req->rq_sp_from is + * decoded from request. + * \retval SECSVC_DROP failed, this request should be dropped. + * + * \see null_accept(), plain_accept(), gss_svc_accept_kr(). + */ + int (*accept) (struct ptlrpc_request *req); + + /** + * Perform security transformation upon reply message. + * + * \pre reply message is pointed by req->rq_reply_state->rs_msg, size + * is req->rq_replen. + * \post req->rs_repdata_len is the final message size. + * \post req->rq_reply_off is set. + * + * \see null_authorize(), plain_authorize(), gss_svc_authorize(). + */ + int (*authorize) (struct ptlrpc_request *req); + + /** + * Invalidate server context \a ctx. + * + * \see gss_svc_invalidate_ctx(). + */ + void (*invalidate_ctx) + (struct ptlrpc_svc_ctx *ctx); + + /** + * Allocate a ptlrpc_reply_state. + * + * \param msgsize size of the reply message in clear text. + * \pre if req->rq_reply_state != NULL, then it's pre-allocated, we + * should simply use it; otherwise we'll responsible for allocating + * a new one. + * \post req->rq_reply_state != NULL; + * \post req->rq_reply_state->rs_msg != NULL; + * + * \see null_alloc_rs(), plain_alloc_rs(), gss_svc_alloc_rs(). + */ + int (*alloc_rs) (struct ptlrpc_request *req, + int msgsize); + + /** + * Free a ptlrpc_reply_state. + */ + void (*free_rs) (struct ptlrpc_reply_state *rs); + + /** + * Release the server context \a ctx. + * + * \see gss_svc_free_ctx(). + */ + void (*free_ctx) (struct ptlrpc_svc_ctx *ctx); + + /** + * Install a reverse context based on the server context \a ctx. + * + * \see gss_svc_install_rctx_kr(). 
+ */ + int (*install_rctx)(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx); + + /** + * Prepare buffer for incoming bulk write. + * + * \pre desc->bd_iov and desc->bd_iov_count describes the buffer + * intended to receive the write. + * + * \see gss_svc_prep_bulk(). + */ + int (*prep_bulk) (struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); + + /** + * Unwrap the bulk write data. + * + * \see plain_svc_unwrap_bulk(), gss_svc_unwrap_bulk(). + */ + int (*unwrap_bulk) (struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); + + /** + * Wrap the bulk read data. + * + * \see plain_svc_wrap_bulk(), gss_svc_wrap_bulk(). + */ + int (*wrap_bulk) (struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +}; + +struct ptlrpc_sec_policy { + struct module *sp_owner; + char *sp_name; + __u16 sp_policy; /* policy number */ + struct ptlrpc_sec_cops *sp_cops; /* client ops */ + struct ptlrpc_sec_sops *sp_sops; /* server ops */ +}; + +#define PTLRPC_SEC_FL_REVERSE 0x0001 /* reverse sec */ +#define PTLRPC_SEC_FL_ROOTONLY 0x0002 /* treat everyone as root */ +#define PTLRPC_SEC_FL_UDESC 0x0004 /* ship udesc */ +#define PTLRPC_SEC_FL_BULK 0x0008 /* intensive bulk i/o expected */ +#define PTLRPC_SEC_FL_PAG 0x0010 /* PAG mode */ + +/** + * The ptlrpc_sec represents the client side ptlrpc security facilities, + * each obd_import (both regular and reverse import) must associate with + * a ptlrpc_sec. + * + * \see sptlrpc_import_sec_adapt(). + */ +struct ptlrpc_sec { + struct ptlrpc_sec_policy *ps_policy; + atomic_t ps_refcount; + /** statistic only */ + atomic_t ps_nctx; + /** unique identifier */ + int ps_id; + struct sptlrpc_flavor ps_flvr; + enum lustre_sec_part ps_part; + /** after set, no more new context will be created */ + unsigned int ps_dying:1; + /** owning import */ + struct obd_import *ps_import; + spinlock_t ps_lock; + /** mtime of SELinux policy file */ + ktime_t ps_sepol_mtime; + /** next check time of SELinux policy file */ + ktime_t ps_sepol_checknext; + /** + * SELinux policy info + * sepol string format is: + * ::: + */ + char ps_sepol[LUSTRE_NODEMAP_SEPOL_LENGTH + + 1]; + + /* + * garbage collection + */ + struct list_head ps_gc_list; + time64_t ps_gc_interval; /* in seconds */ + time64_t ps_gc_next; /* in seconds */ +}; + +static inline int flvr_is_rootonly(__u32 flavor) +{ + return (SPTLRPC_FLVR_POLICY(flavor) == SPTLRPC_POLICY_GSS && + (SPTLRPC_FLVR_MECH(flavor) == SPTLRPC_MECH_GSS_NULL || + SPTLRPC_FLVR_MECH(flavor) == SPTLRPC_MECH_GSS_SK)); +} + +static inline int flvr_allows_user_desc(__u32 flavor) +{ + return (SPTLRPC_FLVR_POLICY(flavor) == SPTLRPC_POLICY_GSS && + (SPTLRPC_FLVR_MECH(flavor) == SPTLRPC_MECH_GSS_NULL || + SPTLRPC_FLVR_MECH(flavor) == SPTLRPC_MECH_GSS_SK)); +} + +static inline int sec_is_reverse(struct ptlrpc_sec *sec) +{ + return (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_REVERSE); +} + +static inline int sec_is_rootonly(struct ptlrpc_sec *sec) +{ + return (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_ROOTONLY); +} + + +struct ptlrpc_svc_ctx { + atomic_t sc_refcount; + struct ptlrpc_sec_policy *sc_policy; +}; + +/* + * user identity descriptor + */ +#define LUSTRE_MAX_GROUPS (128) + +struct ptlrpc_user_desc { + __u32 pud_uid; + __u32 pud_gid; + __u32 pud_fsuid; + __u32 pud_fsgid; + __u32 pud_cap; + __u32 pud_ngroups; + __u32 pud_groups[0]; +}; + +/* + * bulk flavors + */ +enum sptlrpc_bulk_hash_alg { + BULK_HASH_ALG_NULL = 0, + BULK_HASH_ALG_ADLER32, + BULK_HASH_ALG_CRC32, + BULK_HASH_ALG_MD5, + BULK_HASH_ALG_SHA1, + BULK_HASH_ALG_SHA256, + 
BULK_HASH_ALG_SHA384, + BULK_HASH_ALG_SHA512, + BULK_HASH_ALG_MAX +}; + +const char * sptlrpc_get_hash_name(__u8 hash_alg); +__u8 sptlrpc_get_hash_alg(const char *algname); + +enum { + BSD_FL_ERR = 1, +}; + +struct ptlrpc_bulk_sec_desc { + __u8 bsd_version; /* 0 */ + __u8 bsd_type; /* SPTLRPC_BULK_XXX */ + __u8 bsd_svc; /* SPTLRPC_BULK_SVC_XXXX */ + __u8 bsd_flags; /* flags */ + __u32 bsd_nob; /* nob of bulk data */ + __u8 bsd_data[0]; /* policy-specific token */ +}; + +extern struct dentry *sptlrpc_debugfs_dir; +extern struct proc_dir_entry *sptlrpc_lprocfs_dir; + +/* + * round size up to next power of 2, for slab allocation. + * @size must be sane (can't overflow after round up) + */ +static inline int size_roundup_power2(int size) +{ + size--; + size |= size >> 1; + size |= size >> 2; + size |= size >> 4; + size |= size >> 8; + size |= size >> 16; + size++; + return size; +} + +/* + * internal support libraries + */ +void _sptlrpc_enlarge_msg_inplace(struct lustre_msg *msg, + int segment, int newsize); + +/* + * security policies + */ +int sptlrpc_register_policy(struct ptlrpc_sec_policy *policy); +int sptlrpc_unregister_policy(struct ptlrpc_sec_policy *policy); + +__u32 sptlrpc_name2flavor_base(const char *name); +const char *sptlrpc_flavor2name_base(__u32 flvr); +char *sptlrpc_flavor2name_bulk(struct sptlrpc_flavor *sf, + char *buf, int bufsize); +char *sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize); +char *sptlrpc_secflags2str(__u32 flags, char *buf, int bufsize); + +static inline struct ptlrpc_sec_policy * +sptlrpc_policy_get(struct ptlrpc_sec_policy *policy) +{ + __module_get(policy->sp_owner); + return policy; +} + +static inline void +sptlrpc_policy_put(struct ptlrpc_sec_policy *policy) +{ + module_put(policy->sp_owner); +} + +/* + * client credential + */ +static inline +unsigned long cli_ctx_status(struct ptlrpc_cli_ctx *ctx) +{ + return (ctx->cc_flags & PTLRPC_CTX_STATUS_MASK); +} + +static inline +int cli_ctx_is_ready(struct ptlrpc_cli_ctx *ctx) +{ + return (cli_ctx_status(ctx) == PTLRPC_CTX_UPTODATE); +} + +static inline +int cli_ctx_is_refreshed(struct ptlrpc_cli_ctx *ctx) +{ + return (cli_ctx_status(ctx) != 0); +} + +static inline +int cli_ctx_is_uptodate(struct ptlrpc_cli_ctx *ctx) +{ + return ((ctx->cc_flags & PTLRPC_CTX_UPTODATE) != 0); +} + +static inline +int cli_ctx_is_error(struct ptlrpc_cli_ctx *ctx) +{ + return ((ctx->cc_flags & PTLRPC_CTX_ERROR) != 0); +} + +static inline +int cli_ctx_is_dead(struct ptlrpc_cli_ctx *ctx) +{ + return ((ctx->cc_flags & (PTLRPC_CTX_DEAD | PTLRPC_CTX_ERROR)) != 0); +} + +static inline +int cli_ctx_is_eternal(struct ptlrpc_cli_ctx *ctx) +{ + return ((ctx->cc_flags & PTLRPC_CTX_ETERNAL) != 0); +} + +/* + * sec get/put + */ +struct ptlrpc_sec *sptlrpc_sec_get(struct ptlrpc_sec *sec); +void sptlrpc_sec_put(struct ptlrpc_sec *sec); + +/* + * internal apis which only used by policy impelentation + */ +int sptlrpc_get_next_secid(void); +void sptlrpc_sec_destroy(struct ptlrpc_sec *sec); + +/* + * exported client context api + */ +struct ptlrpc_cli_ctx *sptlrpc_cli_ctx_get(struct ptlrpc_cli_ctx *ctx); +void sptlrpc_cli_ctx_put(struct ptlrpc_cli_ctx *ctx, int sync); +void sptlrpc_cli_ctx_expire(struct ptlrpc_cli_ctx *ctx); +void sptlrpc_cli_ctx_wakeup(struct ptlrpc_cli_ctx *ctx); +int sptlrpc_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize); + +/* + * exported client context wrap/buffers + */ +int sptlrpc_cli_wrap_request(struct ptlrpc_request *req); +int sptlrpc_cli_unwrap_reply(struct ptlrpc_request 
*req); +int sptlrpc_cli_alloc_reqbuf(struct ptlrpc_request *req, int msgsize); +void sptlrpc_cli_free_reqbuf(struct ptlrpc_request *req); +int sptlrpc_cli_alloc_repbuf(struct ptlrpc_request *req, int msgsize); +void sptlrpc_cli_free_repbuf(struct ptlrpc_request *req); +int sptlrpc_cli_enlarge_reqbuf(struct ptlrpc_request *req, + const struct req_msg_field *field, + int newsize); +int sptlrpc_cli_unwrap_early_reply(struct ptlrpc_request *req, + struct ptlrpc_request **req_ret); +void sptlrpc_cli_finish_early_reply(struct ptlrpc_request *early_req); + +void sptlrpc_request_out_callback(struct ptlrpc_request *req); +int sptlrpc_get_sepol(struct ptlrpc_request *req); + +/* + * exported higher interface of import & request + */ +int sptlrpc_import_sec_adapt(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx, + struct sptlrpc_flavor *flvr); +struct ptlrpc_sec *sptlrpc_import_sec_ref(struct obd_import *imp); +void sptlrpc_import_sec_put(struct obd_import *imp); + +int sptlrpc_import_check_ctx(struct obd_import *imp); +void sptlrpc_import_flush_root_ctx(struct obd_import *imp); +void sptlrpc_import_flush_my_ctx(struct obd_import *imp); +void sptlrpc_import_flush_all_ctx(struct obd_import *imp); +int sptlrpc_req_get_ctx(struct ptlrpc_request *req); +void sptlrpc_req_put_ctx(struct ptlrpc_request *req, int sync); +int sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout); +int sptlrpc_export_update_ctx(struct obd_export *exp); +int sptlrpc_req_replace_dead_ctx(struct ptlrpc_request *req); +void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode); + +int sptlrpc_parse_rule(char *param, struct sptlrpc_rule *rule); + +/* gc */ +void sptlrpc_gc_add_sec(struct ptlrpc_sec *sec); +void sptlrpc_gc_del_sec(struct ptlrpc_sec *sec); +void sptlrpc_gc_add_ctx(struct ptlrpc_cli_ctx *ctx); + +/* misc */ +const char * sec2target_str(struct ptlrpc_sec *sec); +int sptlrpc_lprocfs_cliobd_attach(struct obd_device *obd); + +/* + * server side + */ +enum secsvc_accept_res { + SECSVC_OK = 0, + SECSVC_COMPLETE, + SECSVC_DROP, +}; + +int sptlrpc_svc_unwrap_request(struct ptlrpc_request *req); +int sptlrpc_svc_alloc_rs(struct ptlrpc_request *req, int msglen); +int sptlrpc_svc_wrap_reply(struct ptlrpc_request *req); +void sptlrpc_svc_free_rs(struct ptlrpc_reply_state *rs); +void sptlrpc_svc_ctx_addref(struct ptlrpc_request *req); +void sptlrpc_svc_ctx_decref(struct ptlrpc_request *req); +void sptlrpc_svc_ctx_invalidate(struct ptlrpc_request *req); + +int sptlrpc_target_export_check(struct obd_export *exp, + struct ptlrpc_request *req); +void sptlrpc_target_update_exp_flavor(struct obd_device *obd, + struct sptlrpc_rule_set *rset); + +/* + * reverse context + */ +int sptlrpc_svc_install_rvs_ctx(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx); +int sptlrpc_cli_install_rvs_ctx(struct obd_import *imp, + struct ptlrpc_cli_ctx *ctx); + +/* bulk security api */ +int sptlrpc_enc_pool_add_user(void); +int sptlrpc_enc_pool_del_user(void); +int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc); +int sptlrpc_enc_pool_get_pages_array(struct page **pa, unsigned int count); +void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc); +void sptlrpc_enc_pool_put_pages_array(struct page **pa, unsigned int count); +int get_free_pages_in_pool(void); +int pool_is_at_full_capacity(void); + +int sptlrpc_cli_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int sptlrpc_cli_unwrap_bulk_read(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc, + int nob); +int 
sptlrpc_cli_unwrap_bulk_write(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +#ifdef HAVE_SERVER_SUPPORT +int sptlrpc_svc_prep_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int sptlrpc_svc_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int sptlrpc_svc_unwrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +#endif + +/* bulk helpers (internal use only by policies) */ +int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, + void *buf, int buflen); + +int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset, int swabbed); + +/* user descriptor helpers */ +static inline int sptlrpc_user_desc_size(int ngroups) +{ + return sizeof(struct ptlrpc_user_desc) + ngroups * sizeof(__u32); +} + +int sptlrpc_current_user_desc_size(void); +int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset); +int sptlrpc_unpack_user_desc(struct lustre_msg *req, int offset, int swabbed); + +/** @} sptlrpc */ + +#endif /* _LUSTRE_SEC_H_ */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_swab.h b/drivers/staging/lustrefsx/lustre/include/lustre_swab.h new file mode 100644 index 0000000000000..2e9d9f5cdbb99 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_swab.h @@ -0,0 +1,139 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2017, Intel Corporation. + * + * Copyright 2015 Cray Inc, all rights reserved. + * Author: Ben Evans. + * + * We assume all nodes are either little-endian or big-endian, and we + * always send messages in the sender's native format. The receiver + * detects the message format by checking the 'magic' field of the message + * (see lustre_msg_swabbed() below). + * + * Each wire type has corresponding 'lustre_swab_xxxtypexxx()' routines + * are implemented in ptlrpc/lustre_swab.c. These 'swabbers' convert the + * type from "other" endian, in-place in the message buffer. + * + * A swabber takes a single pointer argument. The caller must already have + * verified that the length of the message buffer >= sizeof (type). + * + * For variable length types, a second 'lustre_swab_v_xxxtypexxx()' routine + * may be defined that swabs just the variable part, after the caller has + * verified that the message buffer is large enough. 
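+ *
+ * As a minimal sketch of the pattern (the real routines declared below
+ * swab every field of their type in place the same way, using the
+ * kernel's __swab16s/__swab32s/__swab64s helpers):
+ *
+ *	void lustre_swab_ost_last_id(__u64 *id)
+ *	{
+ *		__swab64s(id);
+ *	}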
+ */ + +#ifndef _LUSTRE_SWAB_H_ +#define _LUSTRE_SWAB_H_ + +#include + +#ifdef HAVE_SERVER_SUPPORT +void lustre_swab_orphan_ent(struct lu_orphan_ent *ent); +void lustre_swab_orphan_ent_v2(struct lu_orphan_ent_v2 *ent); +void lustre_swab_orphan_ent_v3(struct lu_orphan_ent_v3 *ent); +void lustre_swab_gl_lquota_desc(struct ldlm_gl_lquota_desc *desc); +void lustre_swab_gl_barrier_desc(struct ldlm_gl_barrier_desc *desc); +void lustre_swab_object_update(struct object_update *ou); +int lustre_swab_object_update_request(struct object_update_request *our, + __u32 len); +void lustre_swab_out_update_header(struct out_update_header *ouh); +void lustre_swab_out_update_buffer(struct out_update_buffer *oub); +void lustre_swab_object_update_result(struct object_update_result *our); +int lustre_swab_object_update_reply(struct object_update_reply *our, __u32 len); +#endif /* HAVE_SERVER_SUPPORT */ +void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); +void lustre_swab_connect(struct obd_connect_data *ocd); +void lustre_swab_hsm_user_state(struct hsm_user_state *hus); +void lustre_swab_hsm_state_set(struct hsm_state_set *hss); +void lustre_swab_obd_statfs(struct obd_statfs *os); +void lustre_swab_obd_ioobj(struct obd_ioobj *ioo); +void lustre_swab_niobuf_remote(struct niobuf_remote *nbr); +void lustre_swab_ost_lvb_v1(struct ost_lvb_v1 *lvb); +void lustre_swab_ost_lvb(struct ost_lvb *lvb); +int lustre_swab_obd_quotactl(struct obd_quotactl *q, __u32 len); +void lustre_swab_quota_body(struct quota_body *b); +void lustre_swab_lquota_lvb(struct lquota_lvb *lvb); +void lustre_swab_barrier_lvb(struct barrier_lvb *lvb); +void lustre_swab_generic_32s(__u32 *val); +void lustre_swab_mdt_body(struct mdt_body *b); +void lustre_swab_mdt_ioepoch(struct mdt_ioepoch *b); +void lustre_swab_mdt_rec_setattr(struct mdt_rec_setattr *sa); +void lustre_swab_mdt_rec_reint(struct mdt_rec_reint *rr); +void lustre_swab_lmv_desc(struct lmv_desc *ld); +void lustre_swab_lmv_mds_md(union lmv_mds_md *lmm); +void lustre_swab_lov_desc(struct lov_desc *ld); +void lustre_swab_ldlm_res_id(struct ldlm_res_id *id); +void lustre_swab_ldlm_policy_data(union ldlm_wire_policy_data *d); +void lustre_swab_ldlm_intent(struct ldlm_intent *i); +void lustre_swab_ldlm_resource_desc(struct ldlm_resource_desc *r); +void lustre_swab_ldlm_lock_desc(struct ldlm_lock_desc *l); +void lustre_swab_ldlm_request(struct ldlm_request *rq); +void lustre_swab_ldlm_reply(struct ldlm_reply *r); +void lustre_swab_mgs_target_info(struct mgs_target_info *oinfo); +void lustre_swab_mgs_nidtbl_entry(struct mgs_nidtbl_entry *oinfo); +void lustre_swab_mgs_config_body(struct mgs_config_body *body); +void lustre_swab_mgs_config_res(struct mgs_config_res *body); +void lustre_swab_lfsck_request(struct lfsck_request *lr); +void lustre_swab_lfsck_reply(struct lfsck_reply *lr); +void lustre_swab_obdo(struct obdo *o); +void lustre_swab_ost_body(struct ost_body *b); +void lustre_swab_ost_last_id(__u64 *id); +int lustre_swab_fiemap(struct fiemap *fiemap, __u32 len); +void lustre_swab_fiemap_info_key(struct ll_fiemap_info_key *fiemap_info); +void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum); +void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum); +void lustre_swab_lov_comp_md_v1(struct lov_comp_md_v1 *lum); +void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod, + int stripe_count); +void lustre_swab_lov_user_md(struct lov_user_md *lum, size_t size); +void lustre_swab_lov_mds_md(struct lov_mds_md *lmm); +void lustre_swab_idx_info(struct idx_info *ii); +void 
lustre_swab_lip_header(struct lu_idxpage *lip); +void lustre_swab_fid2path(struct getinfo_fid2path *gf); +void lustre_swab_layout_intent(struct layout_intent *li); +void lustre_swab_hsm_user_state(struct hsm_user_state *hus); +void lustre_swab_hsm_current_action(struct hsm_current_action *action); +void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk); +void lustre_swab_hsm_user_state(struct hsm_user_state *hus); +void lustre_swab_hsm_user_item(struct hsm_user_item *hui); +void lustre_swab_hsm_request(struct hsm_request *hr); +void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl); +void lustre_swab_close_data(struct close_data *data); +void lustre_swab_close_data_resync_done(struct close_data_resync_done *resync); +void lustre_swab_lmv_user_md(struct lmv_user_md *lum); +void lustre_swab_ladvise(struct lu_ladvise *ladvise); +void lustre_swab_ladvise_hdr(struct ladvise_hdr *ladvise_hdr); + +/* Functions for dumping PTLRPC fields */ +void dump_rniobuf(struct niobuf_remote *rnb); +void dump_ioo(struct obd_ioobj *nb); +void dump_ost_body(struct ost_body *ob); +void dump_rcs(__u32 *rc); + +void lustre_print_user_md(unsigned int level, struct lov_user_md *lum, + const char *msg); + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_update.h b/drivers/staging/lustrefsx/lustre/include/lustre_update.h new file mode 100644 index 0000000000000..78cd3d4bfdd51 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_update.h @@ -0,0 +1,709 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.htm + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, 2017, Intel Corporation. 
+ */ +/* + * lustre/include/lustre_update.h + * + * Author: Di Wang + */ + +#ifndef _LUSTRE_UPDATE_H +#define _LUSTRE_UPDATE_H +#include +#include +#include + +#define OUT_UPDATE_REPLY_SIZE 4096 +#define OUT_BULK_BUFFER_SIZE 4096 + +struct dt_key; +struct dt_rec; +struct object_update_param; +struct llog_update_record; + +static inline size_t update_params_size(const struct update_params *params, + unsigned int param_count) +{ + struct object_update_param *param; + size_t total_size = sizeof(*params); + unsigned int i; + + param = (struct object_update_param *)¶ms->up_params[0]; + for (i = 0; i < param_count; i++) { + size_t size = object_update_param_size(param); + + param = (struct object_update_param *)((char *)param + size); + total_size += size; + } + + return total_size; +} + +static inline struct object_update_param * +update_params_get_param(const struct update_params *params, + unsigned int index, unsigned int param_count) +{ + struct object_update_param *param; + unsigned int i; + + if (index > param_count) + return NULL; + + param = (struct object_update_param *)¶ms->up_params[0]; + for (i = 0; i < index; i++) + param = (struct object_update_param *)((char *)param + + object_update_param_size(param)); + + return param; +} + +static inline void* +update_params_get_param_buf(const struct update_params *params, __u16 index, + unsigned int param_count, __u16 *size) +{ + struct object_update_param *param; + + param = update_params_get_param(params, (unsigned int)index, + param_count); + if (param == NULL) + return NULL; + + if (size != NULL) + *size = param->oup_len; + + return param->oup_buf; +} + +static inline size_t +update_op_size(unsigned int param_count) +{ + return offsetof(struct update_op, uop_params_off[param_count]); +} + +static inline struct update_op * +update_op_next_op(const struct update_op *uop) +{ + return (struct update_op *)((char *)uop + + update_op_size(uop->uop_param_count)); +} + +static inline size_t update_ops_size(const struct update_ops *ops, + unsigned int update_count) +{ + struct update_op *op; + size_t total_size = sizeof(*ops); + unsigned int i; + + op = (struct update_op *)&ops->uops_op[0]; + for (i = 0; i < update_count; i++, op = update_op_next_op(op)) + total_size += update_op_size(op->uop_param_count); + + return total_size; +} + +static inline struct update_params * +update_records_get_params(const struct update_records *record) +{ + return (struct update_params *)((char *)record + + offsetof(struct update_records, ur_ops) + + update_ops_size(&record->ur_ops, record->ur_update_count)); +} + +static inline struct update_param * +update_param_next_param(const struct update_param *param) +{ + return (struct update_param *)((char *)param + + object_update_param_size( + (struct object_update_param *)param)); +} + +static inline size_t +__update_records_size(size_t raw_size) +{ + return cfs_size_round(offsetof(struct update_records, ur_ops) + + raw_size); +} + +static inline size_t +update_records_size(const struct update_records *record) +{ + size_t op_size = 0; + size_t param_size = 0; + + if (record->ur_update_count > 0) + op_size = update_ops_size(&record->ur_ops, + record->ur_update_count); + if (record->ur_param_count > 0) { + struct update_params *params; + + params = update_records_get_params(record); + param_size = update_params_size(params, record->ur_param_count); + } + + return __update_records_size(op_size + param_size); +} + +static inline size_t +__llog_update_record_size(size_t records_size) +{ + return cfs_size_round(sizeof(struct 
llog_rec_hdr) + records_size + + sizeof(struct llog_rec_tail)); +} + +static inline size_t +llog_update_record_size(const struct llog_update_record *lur) +{ + return __llog_update_record_size( + update_records_size(&lur->lur_update_rec)); +} + +static inline struct update_op * +update_ops_get_op(const struct update_ops *ops, unsigned int index, + unsigned int update_count) +{ + struct update_op *op; + unsigned int i; + + if (index > update_count) + return NULL; + + op = (struct update_op *)&ops->uops_op[0]; + for (i = 0; i < index; i++) + op = update_op_next_op(op); + + return op; +} + +static inline void +*object_update_param_get(const struct object_update *update, size_t index, + size_t *size) +{ + const struct object_update_param *param; + size_t i; + + if (index >= update->ou_params_count) + return ERR_PTR(-EINVAL); + + param = &update->ou_params[0]; + for (i = 0; i < index; i++) + param = (struct object_update_param *)((char *)param + + object_update_param_size(param)); + + if (size != NULL) + *size = param->oup_len; + + if (param->oup_len == 0) + return ERR_PTR(-ENODATA); + + return (void *)¶m->oup_buf[0]; +} + +static inline unsigned long +object_update_request_size(const struct object_update_request *our) +{ + unsigned long size; + size_t i = 0; + + size = offsetof(struct object_update_request, ourq_updates[0]); + for (i = 0; i < our->ourq_count; i++) { + struct object_update *update; + + update = (struct object_update *)((char *)our + size); + size += object_update_size(update); + } + return size; +} + +static inline void +object_update_result_insert(struct object_update_reply *reply, + void *data, size_t data_len, size_t index, + int rc) +{ + struct object_update_result *update_result; + + update_result = object_update_result_get(reply, index, NULL); + LASSERT(update_result); + + update_result->our_rc = ptlrpc_status_hton(rc); + if (rc >= 0) { + if (data_len > 0 && data) + memcpy(update_result->our_data, data, data_len); + update_result->our_datalen = data_len; + } + + reply->ourp_lens[index] = cfs_size_round(data_len + + sizeof(struct object_update_result)); +} + +static inline int +object_update_result_data_get(const struct object_update_reply *reply, + struct lu_buf *lbuf, size_t index) +{ + struct object_update_result *update_result; + size_t size = 0; + int result; + + LASSERT(lbuf != NULL); + update_result = object_update_result_get(reply, index, &size); + if (update_result == NULL || + size < cfs_size_round(sizeof(struct object_update_reply)) || + update_result->our_datalen > size) + RETURN(-EFAULT); + + result = ptlrpc_status_ntoh(update_result->our_rc); + if (result < 0) + return result; + + lbuf->lb_buf = update_result->our_data; + lbuf->lb_len = update_result->our_datalen; + + return result; +} + +/** + * Attached in the thandle to record the updates for distribute + * distribution. + */ +struct thandle_update_records { + /* All of updates for the cross-MDT operation, vmalloc'd. */ + struct llog_update_record *tur_update_records; + size_t tur_update_records_buf_size; + + /* All of parameters for the cross-MDT operation, vmalloc'd */ + struct update_params *tur_update_params; + unsigned int tur_update_param_count; + size_t tur_update_params_buf_size; +}; + +#define TOP_THANDLE_MAGIC 0x20140917 +struct top_multiple_thandle { + struct dt_device *tmt_master_sub_dt; + atomic_t tmt_refcount; + /* Other sub transactions will be listed here. 
*/ + struct list_head tmt_sub_thandle_list; + spinlock_t tmt_sub_lock; + + struct list_head tmt_commit_list; + /* All of update records will packed here */ + struct thandle_update_records *tmt_update_records; + + wait_queue_head_t tmt_stop_waitq; + __u64 tmt_batchid; + int tmt_result; + __u32 tmt_magic; + size_t tmt_record_size; + __u32 tmt_committed:1; +}; + +/* {top,sub}_thandle are used to manage distributed transactions which + * include updates on several nodes. A top_handle represents the + * whole operation, and sub_thandle represents updates on each node. */ +struct top_thandle { + struct thandle tt_super; + /* The master sub transaction. */ + struct thandle *tt_master_sub_thandle; + + struct top_multiple_thandle *tt_multiple_thandle; +}; + +struct sub_thandle_cookie { + struct llog_cookie stc_cookie; + struct list_head stc_list; +}; + +/* Sub thandle is used to track multiple sub thandles under one parent + * thandle */ +struct sub_thandle { + struct thandle *st_sub_th; + struct dt_device *st_dt; + struct list_head st_cookie_list; + struct dt_txn_commit_cb st_commit_dcb; + struct dt_txn_commit_cb st_stop_dcb; + int st_result; + + /* linked to top_thandle */ + struct list_head st_sub_list; + + /* If this sub thandle is committed */ + bool st_committed:1, + st_stopped:1, + st_started:1; +}; + +struct tx_arg; +typedef int (*tx_exec_func_t)(const struct lu_env *env, struct thandle *th, + struct tx_arg *ta); + +/* Structure for holding one update execution */ +struct tx_arg { + tx_exec_func_t exec_fn; + tx_exec_func_t undo_fn; + struct dt_object *object; + const char *file; + struct object_update_reply *reply; + int line; + int index; + union { + struct { + struct dt_insert_rec rec; + const struct dt_key *key; + } insert; + struct { + } ref; + struct { + struct lu_attr attr; + } attr_set; + struct { + struct lu_buf buf; + const char *name; + int flags; + __u32 csum; + } xattr_set; + struct { + struct lu_attr attr; + struct dt_allocation_hint hint; + struct dt_object_format dof; + struct lu_fid fid; + } create; + struct { + struct lu_buf buf; + loff_t pos; + } write; + struct { + struct ost_body *body; + } destroy; + } u; +}; + +/* Structure for holding all update executations of one transaction */ +struct thandle_exec_args { + struct thandle *ta_handle; + int ta_argno; /* used args */ + int ta_alloc_args; /* allocated args count */ + struct tx_arg **ta_args; +}; + +/* target/out_lib.c */ +int out_update_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, enum update_type op, + const struct lu_fid *fid, unsigned int params_count, + __u16 *param_sizes, const void **param_bufs, + __u32 reply_size); +int out_create_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const struct lu_attr *attr, struct dt_allocation_hint *hint, + struct dt_object_format *dof); +int out_destroy_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid); +int out_index_delete_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid, const struct dt_key *key); +int out_index_insert_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid, const struct dt_rec *rec, + const struct dt_key *key); +int out_xattr_set_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid, const struct lu_buf 
*buf, + const char *name, __u32 flag); +int out_xattr_del_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid, const char *name); +int out_attr_set_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid, const struct lu_attr *attr); +int out_ref_add_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid); +int out_ref_del_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid); +int out_write_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid, const struct lu_buf *buf, + __u64 pos); +int out_attr_get_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid); +int out_index_lookup_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid, struct dt_rec *rec, + const struct dt_key *key); +int out_xattr_get_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid, const char *name, + const int bufsize); +int out_xattr_list_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const int bufsize); +int out_read_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_length, const struct lu_fid *fid, + size_t size, loff_t pos); + +const char *update_op_str(__u16 opcode); + +/* target/update_trans.c */ +struct thandle *thandle_get_sub_by_dt(const struct lu_env *env, + struct thandle *th, + struct dt_device *sub_dt); + +static inline struct thandle * +thandle_get_sub(const struct lu_env *env, struct thandle *th, + const struct dt_object *sub_obj) +{ + return thandle_get_sub_by_dt(env, th, lu2dt_dev(sub_obj->do_lu.lo_dev)); +} + +struct thandle * +top_trans_create(const struct lu_env *env, struct dt_device *master_dev); +int top_trans_start(const struct lu_env *env, struct dt_device *master_dev, + struct thandle *th); +int top_trans_stop(const struct lu_env *env, struct dt_device *master_dev, + struct thandle *th); +void top_multiple_thandle_destroy(struct top_multiple_thandle *tmt); + +static inline void top_multiple_thandle_get(struct top_multiple_thandle *tmt) +{ + atomic_inc(&tmt->tmt_refcount); +} + +static inline void top_multiple_thandle_put(struct top_multiple_thandle *tmt) +{ + if (atomic_dec_and_test(&tmt->tmt_refcount)) + top_multiple_thandle_destroy(tmt); +} + +struct sub_thandle *lookup_sub_thandle(struct top_multiple_thandle *tmt, + struct dt_device *dt_dev); +int sub_thandle_trans_create(const struct lu_env *env, + struct top_thandle *top_th, + struct sub_thandle *st); + +/* update_records.c */ +size_t update_records_create_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct lu_attr *attr, + const struct dt_allocation_hint *hint, + struct dt_object_format *dof); +size_t update_records_attr_set_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct lu_attr *attr); +size_t update_records_ref_add_size(const struct lu_env *env, + const struct lu_fid *fid); +size_t update_records_ref_del_size(const struct lu_env *env, + const struct lu_fid *fid); +size_t update_records_destroy_size(const struct lu_env *env, + const struct lu_fid *fid); +size_t update_records_index_insert_size(const struct 
lu_env *env, + const struct lu_fid *fid, + const struct dt_rec *rec, + const struct dt_key *key); +size_t update_records_index_delete_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct dt_key *key); +size_t update_records_xattr_set_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct lu_buf *buf, + const char *name, + __u32 flag); +size_t update_records_xattr_del_size(const struct lu_env *env, + const struct lu_fid *fid, + const char *name); +size_t update_records_write_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct lu_buf *buf, + __u64 pos); +size_t update_records_punch_size(const struct lu_env *env, + const struct lu_fid *fid, + __u64 start, __u64 end); + +int update_records_create_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct lu_attr *attr, + const struct dt_allocation_hint *hint, + struct dt_object_format *dof); +int update_records_attr_set_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct lu_attr *attr); +int update_records_ref_add_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid); +int update_records_ref_del_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid); +int update_records_destroy_pack(const struct lu_env *env, + struct update_ops *ops, unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid); +int update_records_index_insert_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct dt_rec *rec, + const struct dt_key *key); +int update_records_index_delete_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct dt_key *key); +int update_records_xattr_set_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct lu_buf *buf, const char *name, + __u32 flag); +int update_records_xattr_del_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const char *name); +int update_records_write_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + 
const struct lu_buf *buf, + __u64 pos); +int update_records_punch_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + __u64 start, __u64 end); +int update_records_noop_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid); + +int tur_update_records_extend(struct thandle_update_records *tur, + size_t new_size); +int tur_update_params_extend(struct thandle_update_records *tur, + size_t new_size); +int tur_update_extend(struct thandle_update_records *tur, + size_t new_op_size, size_t new_param_size); + +#define update_record_pack(name, th, ...) \ +({ \ + struct top_thandle *top_th; \ + struct top_multiple_thandle *tmt; \ + struct thandle_update_records *tur; \ + struct llog_update_record *lur; \ + size_t avail_param_size; \ + size_t avail_op_size; \ + int ret; \ + \ + while (1) { \ + top_th = container_of(th, struct top_thandle, tt_super);\ + tmt = top_th->tt_multiple_thandle; \ + tur = tmt->tmt_update_records; \ + lur = tur->tur_update_records; \ + avail_param_size = tur->tur_update_params_buf_size - \ + update_params_size(tur->tur_update_params, \ + tur->tur_update_param_count); \ + avail_op_size = tur->tur_update_records_buf_size - \ + llog_update_record_size(lur); \ + ret = update_records_##name##_pack(env, \ + &lur->lur_update_rec.ur_ops, \ + &lur->lur_update_rec.ur_update_count, \ + &avail_op_size, \ + tur->tur_update_params, \ + &tur->tur_update_param_count, \ + &avail_param_size, __VA_ARGS__); \ + if (ret == -E2BIG) { \ + ret = tur_update_extend(tur, avail_op_size, \ + avail_param_size); \ + if (ret != 0) \ + break; \ + continue; \ + } else { \ + break; \ + } \ + } \ + ret; \ +}) + +#define update_record_size(env, name, th, ...) \ +({ \ + struct top_thandle *top_th; \ + struct top_multiple_thandle *tmt; \ + \ + top_th = container_of(th, struct top_thandle, tt_super); \ + \ + LASSERT(top_th->tt_multiple_thandle != NULL); \ + tmt = top_th->tt_multiple_thandle; \ + tmt->tmt_record_size += \ + update_records_##name##_size(env, __VA_ARGS__); \ +}) +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lvfs.h b/drivers/staging/lustrefsx/lustre/include/lvfs.h new file mode 100644 index 0000000000000..2ca2f19bab7b3 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lvfs.h @@ -0,0 +1,72 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. 
+ * + * Copyright (c) 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/include/lvfs.h + * + * lustre VFS/process permission interface + */ + +#ifndef __LVFS_H__ +#define __LVFS_H__ + +#include +#include +#include +#include +#include +#include + +#define OBD_RUN_CTXT_MAGIC 0xC0FFEEAA +#define OBD_CTXT_DEBUG /* development-only debugging */ + +struct dt_device; + +struct lvfs_run_ctxt { + struct vfsmount *pwdmnt; + struct dentry *pwd; + int umask; + struct dt_device *dt; +#ifdef OBD_CTXT_DEBUG + unsigned int magic; +#endif +}; + +static inline void OBD_SET_CTXT_MAGIC(struct lvfs_run_ctxt *ctxt) +{ +#ifdef OBD_CTXT_DEBUG + ctxt->magic = OBD_RUN_CTXT_MAGIC; +#endif +} + +/* ptlrpc_sec_ctx.c */ +void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx); +void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx); + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/md_object.h b/drivers/staging/lustrefsx/lustre/include/md_object.h new file mode 100644 index 0000000000000..daa62d86f9f29 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/md_object.h @@ -0,0 +1,733 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/include/md_object.h + * + * Extension of lu_object.h for metadata objects + */ + +#ifndef _LUSTRE_MD_OBJECT_H +#define _LUSTRE_MD_OBJECT_H + +#ifndef HAVE_SERVER_SUPPORT +# error "client code should not depend on md_object.h" +#endif /* !HAVE_SERVER_SUPPORT */ + +/** \defgroup md md + * Sub-class of lu_object with methods common for "meta-data" objects in MDT + * stack. + * + * Meta-data objects implement namespace operations: you can link, unlink + * them, and treat them as directories. + * + * Examples: mdd, cmm, and mdt are implementations of md interface. + * @{ + */ + + +/* + * super-class definitions. 
+ */ +#include + +struct md_device; +struct md_device_operations; +struct md_object; +struct obd_export; + +/** metadata attributes */ +enum ma_valid { + MA_INODE = BIT(0), + MA_LOV = BIT(1), + MA_FLAGS = BIT(2), + MA_LMV = BIT(3), + MA_ACL_DEF = BIT(4), + MA_LOV_DEF = BIT(5), + MA_HSM = BIT(6), + MA_PFID = BIT(7), + MA_LMV_DEF = BIT(8), + MA_SOM = BIT(9), + MA_FORCE_LOG = BIT(10), /* forced close logged in mdt_mfd_close */ +}; + +typedef enum { + MDL_MINMODE = 0, + MDL_EX = 1, + MDL_PW = 2, + MDL_PR = 4, + MDL_CW = 8, + MDL_CR = 16, + MDL_NL = 32, + MDL_GROUP = 64, + MDL_MAXMODE +} mdl_mode_t; + +typedef enum { + MDT_NUL_LOCK = 0, + MDT_REG_LOCK = BIT(0), + MDT_PDO_LOCK = BIT(1), +} mdl_type_t; + +/* lfs rgetfacl permission check */ +#define MAY_RGETFACL BIT(14) + +/* memory structure for hsm attributes + * for a description of the fields see the on-disk structure hsm_attrs + * which is defined in lustre_idl.h + */ +struct md_hsm { + __u32 mh_compat; + __u32 mh_flags; + __u64 mh_arch_id; + __u64 mh_arch_ver; +}; + + +/* memory structure for SOM attributes + * for a description of the fields see the on-disk structure som_attrs + * which is defined in lustre_idl.h + */ +struct md_som { + __u16 ms_valid; + __u64 ms_size; + __u64 ms_blocks; +}; + +struct md_attr { + __u64 ma_valid; + __u64 ma_need; + __u64 ma_attr_flags; + struct lu_attr ma_attr; + struct lu_fid ma_pfid; + struct md_hsm ma_hsm; + struct md_som ma_som; + struct lov_mds_md *ma_lmm; + union lmv_mds_md *ma_lmv; + struct lmv_user_md *ma_default_lmv; + void *ma_acl; + int ma_lmm_size; + int ma_lmv_size; + int ma_default_lmv_size; + int ma_acl_size; + int ma_enable_chprojid_gid; +}; + +/** Additional parameters for create */ +struct md_op_spec { + union { + /** symlink target */ + struct lu_name sp_symname; + /** eadata for regular files */ + struct md_spec_reg { + void *eadata; + int eadatalen; + } sp_ea; + } u; + + /** Open flags from the client, such as MDS_OPEN_CREAT and others. */ + __u64 sp_cr_flags; + + /* File security context for creates. */ + const char *sp_cr_file_secctx_name; /* (security) xattr name */ + void *sp_cr_file_secctx; /* xattr value */ + size_t sp_cr_file_secctx_size; /* xattr value size */ + + /* File encryption context for creates. */ + void *sp_cr_file_encctx; /* enc ctx value */ + size_t sp_cr_file_encctx_size; /* enc ctx size */ + + /* Archive ID used for auto PCC attach when creating new files. */ + __u32 sp_archive_id; + + /** don't create lov objects or llog cookie - this is a replay */ + unsigned int no_create:1, + sp_cr_lookup:1, /* do lookup sanity check or not. */ + sp_rm_entry:1, /* only remove name entry */ + sp_permitted:1, /* do not check permission */ + sp_migrate_close:1, /* close the file during migrate */ + sp_migrate_nsonly:1; /* migrate dirent only */ + + /** to create directory */ + const struct dt_index_features *sp_feat; +}; + +enum md_layout_opc { + MD_LAYOUT_NOP = 0, + MD_LAYOUT_WRITE, /* FLR: write the file */ + MD_LAYOUT_RESYNC, /* FLR: resync starts */ + MD_LAYOUT_RESYNC_DONE, /* FLR: resync done */ + MD_LAYOUT_ATTACH, /* attach stripes */ + MD_LAYOUT_DETACH, /* detach stripes */ + MD_LAYOUT_SHRINK, /* shrink striped directory (destroy stripes) */ + MD_LAYOUT_SPLIT, /* split directory (allocate new stripes) */ + MD_LAYOUT_MAX, +}; + +/** + * Parameters for layout change API. 
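+ * + * The mlc_opc opcode determines how the anonymous union below is + * interpreted: the file variant carries the FLR mirror/resync state + * used by MD_LAYOUT_WRITE/RESYNC/RESYNC_DONE, while the dir variant + * carries the objects, attributes and spec used by the directory + * split operations.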
+ */ +struct md_layout_change { + enum md_layout_opc mlc_opc; + struct lu_buf mlc_buf; + union { + struct { + __u16 mlc_mirror_id; + struct layout_intent *mlc_intent; + struct lustre_som_attrs mlc_som; + size_t mlc_resync_count; + __u32 *mlc_resync_ids; + }; /* file */ + struct { + /* parent obj in plain dir split */ + struct md_object *mlc_parent; + /* target obj in plain dir split */ + struct md_object *mlc_target; + /* target attr in plain dir split */ + struct lu_attr *mlc_attr; + /* target name in plain dir split */ + const struct lu_name *mlc_name; + /* dir split spec */ + struct md_op_spec *mlc_spec; + }; /* dir */ + }; +}; + +union ldlm_policy_data; +/** + * Operations implemented for each md object (both directory and leaf). + */ +struct md_object_operations { + int (*moo_permission)(const struct lu_env *env, + struct md_object *pobj, struct md_object *cobj, + struct md_attr *attr, unsigned int may_mask); + + int (*moo_attr_get)(const struct lu_env *env, struct md_object *obj, + struct md_attr *attr); + + int (*moo_attr_set)(const struct lu_env *env, struct md_object *obj, + const struct md_attr *attr); + + int (*moo_xattr_get)(const struct lu_env *env, struct md_object *obj, + struct lu_buf *buf, const char *name); + + int (*moo_xattr_list)(const struct lu_env *env, struct md_object *obj, + struct lu_buf *buf); + + int (*moo_xattr_set)(const struct lu_env *env, struct md_object *obj, + const struct lu_buf *buf, const char *name, + int fl); + + int (*moo_xattr_del)(const struct lu_env *env, struct md_object *obj, + const char *name); + + /** This method is used to swap the layouts between 2 objects */ + int (*moo_swap_layouts)(const struct lu_env *env, + struct md_object *obj1, struct md_object *obj2, + __u64 flags); + + /** \retval number of bytes actually read upon success */ + int (*moo_readpage)(const struct lu_env *env, struct md_object *obj, + const struct lu_rdpg *rdpg); + + int (*moo_readlink)(const struct lu_env *env, struct md_object *obj, + struct lu_buf *buf); + + int (*moo_changelog)(const struct lu_env *env, + enum changelog_rec_type type, + enum changelog_rec_flags clf_flags, + struct md_device *m, const struct lu_fid *fid); + + int (*moo_open)(const struct lu_env *env, struct md_object *obj, + u64 open_flags, struct md_op_spec*); + + int (*moo_close)(const struct lu_env *env, struct md_object *obj, + struct md_attr *ma, u64 open_flags); + + int (*moo_object_sync)(const struct lu_env *, struct md_object *); + + int (*moo_object_lock)(const struct lu_env *env, struct md_object *obj, + struct lustre_handle *lh, + struct ldlm_enqueue_info *einfo, + union ldlm_policy_data *policy); + int (*moo_object_unlock)(const struct lu_env *env, + struct md_object *obj, + struct ldlm_enqueue_info *einfo, + union ldlm_policy_data *policy); + + int (*moo_invalidate)(const struct lu_env *env, struct md_object *obj); + /** + * Trying to write to an un-instantiated layout component. + * + * The caller should have held the layout lock. + * + * This API can be extended to support all other layout-changing + * operations, such as component {add,del,change}, layout swap, + * layout merge, etc. One of the benefits of doing this is that the MDT + * no longer needs to understand layout. + * + * However, layout creation, removal, and fetch should still use + * xattr_{get,set}() because they don't interpret layout on the + * MDT layer. 
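+ * + * (The mo_layout_change() wrapper later in this header simply asserts + * that this method is implemented before invoking it.)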
+ * + * \param[in] env execution environment + * \param[in] obj MD object + * \param[in] layout data structure to describe the changes to + * the MD object's layout + * + * \retval 0 success + * \retval -ne error code + */ + int (*moo_layout_change)(const struct lu_env *env, + struct md_object *obj, + struct md_layout_change *layout); +}; + +/** + * Operations implemented for each directory object. + */ +struct md_dir_operations { + int (*mdo_is_subdir)(const struct lu_env *env, struct md_object *obj, + const struct lu_fid *fid); + + int (*mdo_lookup)(const struct lu_env *env, struct md_object *obj, + const struct lu_name *lname, struct lu_fid *fid, + struct md_op_spec *spec); + + mdl_mode_t (*mdo_lock_mode)(const struct lu_env *env, + struct md_object *obj, + mdl_mode_t mode); + + int (*mdo_create)(const struct lu_env *env, struct md_object *pobj, + const struct lu_name *lname, struct md_object *child, + struct md_op_spec *spec, + struct md_attr *ma); + + /** This method is used for creating the data object for this meta + * object */ + int (*mdo_create_data)(const struct lu_env *env, struct md_object *p, + struct md_object *o, + const struct md_op_spec *spec, + struct md_attr *ma); + + int (*mdo_rename)(const struct lu_env *env, struct md_object *spobj, + struct md_object *tpobj, const struct lu_fid *lf, + const struct lu_name *lsname, struct md_object *tobj, + const struct lu_name *ltname, struct md_attr *ma); + + int (*mdo_link)(const struct lu_env *env, struct md_object *tgt_obj, + struct md_object *src_obj, const struct lu_name *lname, + struct md_attr *ma); + + int (*mdo_unlink)(const struct lu_env *env, struct md_object *pobj, + struct md_object *cobj, const struct lu_name *lname, + struct md_attr *ma, int no_name); + + int (*mdo_migrate)(const struct lu_env *env, struct md_object *pobj, + struct md_object *sobj, const struct lu_name *lname, + struct md_object *tobj, struct md_op_spec *spec, + struct md_attr *ma); +}; + +struct md_device_operations { + /** meta-data device related handlers. 
*/ + int (*mdo_root_get)(const struct lu_env *env, struct md_device *m, + struct lu_fid *f); + + const struct dt_device_param *(*mdo_dtconf_get)(const struct lu_env *e, + struct md_device *m); + + int (*mdo_statfs)(const struct lu_env *env, struct md_device *m, + struct obd_statfs *sfs); + + int (*mdo_llog_ctxt_get)(const struct lu_env *env, + struct md_device *m, int idx, void **h); + + int (*mdo_iocontrol)(const struct lu_env *env, struct md_device *m, + unsigned int cmd, int len, void *data); +}; + +struct md_device { + struct lu_device md_lu_dev; + const struct md_device_operations *md_ops; +}; + +struct md_object { + struct lu_object mo_lu; + const struct md_object_operations *mo_ops; + const struct md_dir_operations *mo_dir_ops; +}; + +static inline struct md_device *lu2md_dev(const struct lu_device *d) +{ + LASSERT(IS_ERR(d) || lu_device_is_md(d)); + return container_of_safe(d, struct md_device, md_lu_dev); +} + +static inline struct lu_device *md2lu_dev(struct md_device *d) +{ + return &d->md_lu_dev; +} + +static inline struct md_object *lu2md(const struct lu_object *o) +{ + LASSERT(o == NULL || IS_ERR(o) || lu_device_is_md(o->lo_dev)); + return container_of_safe(o, struct md_object, mo_lu); +} + +static inline int md_device_init(struct md_device *md, struct lu_device_type *t) +{ + return lu_device_init(&md->md_lu_dev, t); +} + +static inline void md_device_fini(struct md_device *md) +{ + lu_device_fini(&md->md_lu_dev); +} + +static inline struct md_object *md_object_find_slice(const struct lu_env *env, + struct md_device *md, + const struct lu_fid *f) +{ + return lu2md(lu_object_find_slice(env, md2lu_dev(md), f, NULL)); +} + + +/** md operations */ +static inline int mo_permission(const struct lu_env *env, struct md_object *p, + struct md_object *c, struct md_attr *at, + unsigned int may_mask) +{ + LASSERT(c->mo_ops->moo_permission); + return c->mo_ops->moo_permission(env, p, c, at, may_mask); +} + +static inline int mo_attr_get(const struct lu_env *env, struct md_object *m, + struct md_attr *at) +{ + LASSERT(m->mo_ops->moo_attr_get); + return m->mo_ops->moo_attr_get(env, m, at); +} + +static inline int mo_readlink(const struct lu_env *env, + struct md_object *m, + struct lu_buf *buf) +{ + LASSERT(m->mo_ops->moo_readlink); + return m->mo_ops->moo_readlink(env, m, buf); +} + +static inline int mo_changelog(const struct lu_env *env, + enum changelog_rec_type type, + enum changelog_rec_flags clf_flags, + struct md_device *m, const struct lu_fid *fid) +{ + struct lu_fid rootfid; + struct md_object *root; + int rc; + + rc = m->md_ops->mdo_root_get(env, m, &rootfid); + if (rc) + return rc; + + root = md_object_find_slice(env, m, &rootfid); + if (IS_ERR(root)) + RETURN(PTR_ERR(root)); + + LASSERT(root->mo_ops->moo_changelog); + rc = root->mo_ops->moo_changelog(env, type, clf_flags, m, fid); + + lu_object_put(env, &root->mo_lu); + + return rc; +} + +static inline int mo_attr_set(const struct lu_env *env, + struct md_object *m, + const struct md_attr *at) +{ + LASSERT(m->mo_ops->moo_attr_set); + return m->mo_ops->moo_attr_set(env, m, at); +} + +static inline int mo_xattr_get(const struct lu_env *env, + struct md_object *m, + struct lu_buf *buf, + const char *name) +{ + LASSERT(m->mo_ops->moo_xattr_get); + return m->mo_ops->moo_xattr_get(env, m, buf, name); +} + +static inline int mo_xattr_del(const struct lu_env *env, + struct md_object *m, + const char *name) +{ + LASSERT(m->mo_ops->moo_xattr_del); + return m->mo_ops->moo_xattr_del(env, m, name); +} + +static inline int mo_xattr_set(const 
struct lu_env *env, + struct md_object *m, + const struct lu_buf *buf, + const char *name, + int flags) +{ + LASSERT(m->mo_ops->moo_xattr_set); + return m->mo_ops->moo_xattr_set(env, m, buf, name, flags); +} + +static inline int mo_xattr_list(const struct lu_env *env, + struct md_object *m, + struct lu_buf *buf) +{ + LASSERT(m->mo_ops->moo_xattr_list); + return m->mo_ops->moo_xattr_list(env, m, buf); +} + +static inline int mo_invalidate(const struct lu_env *env, struct md_object *m) +{ + LASSERT(m->mo_ops->moo_invalidate); + return m->mo_ops->moo_invalidate(env, m); +} + +static inline int mo_layout_change(const struct lu_env *env, + struct md_object *m, + struct md_layout_change *layout) +{ + /* need to instantiate objects which are in the access range */ + LASSERT(m->mo_ops->moo_layout_change); + return m->mo_ops->moo_layout_change(env, m, layout); +} + +static inline int mo_swap_layouts(const struct lu_env *env, + struct md_object *o1, + struct md_object *o2, __u64 flags) +{ + LASSERT(o1->mo_ops->moo_swap_layouts); + LASSERT(o2->mo_ops->moo_swap_layouts); + if (o1->mo_ops->moo_swap_layouts != o2->mo_ops->moo_swap_layouts) + return -EPERM; + return o1->mo_ops->moo_swap_layouts(env, o1, o2, flags); +} + +static inline int mo_open(const struct lu_env *env, struct md_object *m, + u64 open_flags, struct md_op_spec *spec) +{ + LASSERT(m->mo_ops->moo_open); + return m->mo_ops->moo_open(env, m, open_flags, spec); +} + +static inline int mo_close(const struct lu_env *env, struct md_object *m, + struct md_attr *ma, u64 open_flags) +{ + LASSERT(m->mo_ops->moo_close); + return m->mo_ops->moo_close(env, m, ma, open_flags); +} + +static inline int mo_readpage(const struct lu_env *env, + struct md_object *m, + const struct lu_rdpg *rdpg) +{ + LASSERT(m->mo_ops->moo_readpage); + return m->mo_ops->moo_readpage(env, m, rdpg); +} + +static inline int mo_object_sync(const struct lu_env *env, struct md_object *m) +{ + LASSERT(m->mo_ops->moo_object_sync); + return m->mo_ops->moo_object_sync(env, m); +} + +static inline int mo_object_lock(const struct lu_env *env, + struct md_object *m, + struct lustre_handle *lh, + struct ldlm_enqueue_info *einfo, + union ldlm_policy_data *policy) +{ + LASSERT(m->mo_ops->moo_object_lock); + return m->mo_ops->moo_object_lock(env, m, lh, einfo, policy); +} + +static inline int mo_object_unlock(const struct lu_env *env, + struct md_object *m, + struct ldlm_enqueue_info *einfo, + union ldlm_policy_data *policy) +{ + LASSERT(m->mo_ops->moo_object_unlock); + return m->mo_ops->moo_object_unlock(env, m, einfo, policy); +} + +static inline int mdo_lookup(const struct lu_env *env, + struct md_object *p, + const struct lu_name *lname, + struct lu_fid *f, + struct md_op_spec *spec) +{ + LASSERT(p->mo_dir_ops->mdo_lookup); + return p->mo_dir_ops->mdo_lookup(env, p, lname, f, spec); +} + +static inline mdl_mode_t mdo_lock_mode(const struct lu_env *env, + struct md_object *mo, + mdl_mode_t lm) +{ + if (mo->mo_dir_ops->mdo_lock_mode == NULL) + return MDL_MINMODE; + return mo->mo_dir_ops->mdo_lock_mode(env, mo, lm); +} + +static inline int mdo_create(const struct lu_env *env, + struct md_object *p, + const struct lu_name *lchild_name, + struct md_object *c, + struct md_op_spec *spc, + struct md_attr *at) +{ + LASSERT(p->mo_dir_ops->mdo_create); + return p->mo_dir_ops->mdo_create(env, p, lchild_name, c, spc, at); +} + +static inline int mdo_create_data(const struct lu_env *env, + struct md_object *p, + struct md_object *c, + const struct md_op_spec *spec, + struct md_attr *ma) +{ + 
LASSERT(c->mo_dir_ops->mdo_create_data); + return c->mo_dir_ops->mdo_create_data(env, p, c, spec, ma); +} + +static inline int mdo_rename(const struct lu_env *env, + struct md_object *sp, + struct md_object *tp, + const struct lu_fid *lf, + const struct lu_name *lsname, + struct md_object *t, + const struct lu_name *ltname, + struct md_attr *ma) +{ + LASSERT(tp->mo_dir_ops->mdo_rename); + return tp->mo_dir_ops->mdo_rename(env, sp, tp, lf, lsname, t, ltname, + ma); +} + +static inline int mdo_migrate(const struct lu_env *env, + struct md_object *pobj, + struct md_object *sobj, + const struct lu_name *lname, + struct md_object *tobj, + struct md_op_spec *spec, + struct md_attr *ma) +{ + LASSERT(pobj->mo_dir_ops->mdo_migrate); + return pobj->mo_dir_ops->mdo_migrate(env, pobj, sobj, lname, tobj, spec, + ma); +} + +static inline int mdo_is_subdir(const struct lu_env *env, + struct md_object *mo, + const struct lu_fid *fid) +{ + LASSERT(mo->mo_dir_ops->mdo_is_subdir); + return mo->mo_dir_ops->mdo_is_subdir(env, mo, fid); +} + +static inline int mdo_link(const struct lu_env *env, + struct md_object *p, + struct md_object *s, + const struct lu_name *lname, + struct md_attr *ma) +{ + LASSERT(s->mo_dir_ops->mdo_link); + return s->mo_dir_ops->mdo_link(env, p, s, lname, ma); +} + +static inline int mdo_unlink(const struct lu_env *env, + struct md_object *p, + struct md_object *c, + const struct lu_name *lname, + struct md_attr *ma, int no_name) +{ + LASSERT(p->mo_dir_ops->mdo_unlink); + return p->mo_dir_ops->mdo_unlink(env, p, c, lname, ma, no_name); +} + +static inline int mdo_statfs(const struct lu_env *env, + struct md_device *m, + struct obd_statfs *sfs) +{ + LASSERT(m->md_ops->mdo_statfs); + return m->md_ops->mdo_statfs(env, m, sfs); +} + +struct dt_device; + +void lustre_som_swab(struct lustre_som_attrs *attrs); +int lustre_buf2hsm(void *buf, int rc, struct md_hsm *mh); +void lustre_hsm2buf(void *buf, const struct md_hsm *mh); + +enum { + UCRED_INVALID = -1, + UCRED_INIT = 0, + UCRED_OLD = 1, + UCRED_NEW = 2, +}; + +struct lu_ucred { + __u32 uc_valid; + __u32 uc_o_uid; + __u32 uc_o_gid; + __u32 uc_o_fsuid; + __u32 uc_o_fsgid; + __u32 uc_uid; + __u32 uc_gid; + __u32 uc_fsuid; + __u32 uc_fsgid; + __u32 uc_suppgids[2]; + kernel_cap_t uc_cap; + __u32 uc_umask; + struct group_info *uc_ginfo; + struct md_identity *uc_identity; + char uc_jobid[LUSTRE_JOBID_SIZE]; + lnet_nid_t uc_nid; + bool uc_enable_audit; +}; + +struct lu_ucred *lu_ucred(const struct lu_env *env); + +struct lu_ucred *lu_ucred_check(const struct lu_env *env); + +struct lu_ucred *lu_ucred_assert(const struct lu_env *env); + +int lu_ucred_global_init(void); + +void lu_ucred_global_fini(void); + +/** @} md */ +#endif /* _LINUX_MD_OBJECT_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/obd.h b/drivers/staging/lustrefsx/lustre/include/obd.h new file mode 100644 index 0000000000000..4a42feb690f35 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/obd.h @@ -0,0 +1,1376 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef __OBD_H +#define __OBD_H + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#ifdef HAVE_SERVER_SUPPORT +# include +# include +# include +#endif +#include +#include +#include +#include +#include +#include +#include + +#define MAX_OBD_DEVICES 8192 + +struct osc_async_rc { + int ar_rc; + int ar_force_sync; + __u64 ar_min_xid; +}; + +struct lov_oinfo { /* per-stripe data structure */ + struct ost_id loi_oi; /* object ID/Sequence on the target OST */ + int loi_ost_idx; /* OST stripe index in lov_tgt_desc->tgts */ + int loi_ost_gen; /* generation of this loi_ost_idx */ + + unsigned long loi_kms_valid:1; + __u64 loi_kms; /* known minimum size */ + struct ost_lvb loi_lvb; + struct osc_async_rc loi_ar; +}; + +void lov_fix_ea_for_replay(void *lovea); + +static inline void loi_kms_set(struct lov_oinfo *oinfo, __u64 kms) +{ + oinfo->loi_kms = kms; + oinfo->loi_kms_valid = 1; +} + +struct lov_stripe_md; +struct obd_info; + +typedef int (*obd_enqueue_update_f)(void *cookie, int rc); + +/* obd info for a particular level (lov, osc). */ +struct obd_info { + /* OBD_STATFS_* flags */ + __u64 oi_flags; + struct obd_device *oi_obd; + struct lu_tgt_desc *oi_tgt; + /* statfs data specific for every OSC, if needed at all. */ + struct obd_statfs *oi_osfs; + /* An update callback which is called to update some data on the upper + * level. E.g. it is used to update lsm->lsm_oinfo at every received + * request at the osc level for enqueue requests. It is also possible + * to update some caller data from the LOV layer if needed. */ + obd_enqueue_update_f oi_cb_up; +}; + +struct obd_type { + const struct obd_ops *typ_dt_ops; + const struct md_ops *typ_md_ops; + struct proc_dir_entry *typ_procroot; + struct dentry *typ_debugfs_entry; +#ifdef HAVE_SERVER_SUPPORT + bool typ_sym_filter; +#endif + atomic_t typ_refcnt; + struct lu_device_type *typ_lu; + struct kobject typ_kobj; +}; +#define typ_name typ_kobj.name +#define OBD_LU_TYPE_SETUP ((void *)0x01UL) + +struct brw_page { + u64 off; + struct page *pg; + u32 count; + u32 flag; + /* used for encryption: difference with offset in clear text page */ + u16 bp_off_diff; + /* used for encryption: difference with count in clear text page */ + u16 bp_count_diff; + u32 bp_padding; +}; + +struct timeout_item { + enum timeout_event ti_event; + time64_t ti_timeout; + timeout_cb_t ti_cb; + void *ti_cb_data; + struct list_head ti_obd_list; + struct list_head ti_chain; +}; + +#define OBD_MAX_RIF_DEFAULT 8 +#define OBD_MAX_RIF_MAX 512 +#define OSC_MAX_RIF_MAX 256 +#define OSC_MAX_DIRTY_DEFAULT 64 +#define OSC_MAX_DIRTY_MB_MAX 2048 /* arbitrary, but < MAX_LONG bytes */ +#define OSC_DEFAULT_RESENDS 10 + +/* possible values for lut_sync_lock_cancel */ +enum tgt_sync_lock_cancel { + SYNC_LOCK_CANCEL_NEVER = 0, + SYNC_LOCK_CANCEL_BLOCKING = 1, + SYNC_LOCK_CANCEL_ALWAYS = 2, +}; + +/* + * Limit reply buffer size for striping data to one x86_64 page. 
This + * value is chosen to fit the striping data for common use cases while + * staying well below the limit at which the buffer must be backed by + * vmalloc(). Excessive use of vmalloc() may cause spinlock contention + * on the MDS. + */ +#define OBD_MAX_DEFAULT_EA_SIZE 4096 + +/* + * Lustre can handle larger xattrs internally, but we must respect the Linux + * VFS limitation or tools like tar cannot interact with Lustre volumes + * correctly. + */ +#define OBD_MAX_EA_SIZE XATTR_SIZE_MAX + + +enum obd_cl_sem_lock_class { + OBD_CLI_SEM_NORMAL, + OBD_CLI_SEM_MGC, + OBD_CLI_SEM_MDCOSC, +}; + +struct mdc_rpc_lock; +struct obd_import; +struct client_obd { + struct rw_semaphore cl_sem; + struct obd_uuid cl_target_uuid; + struct obd_import *cl_import; /* ptlrpc connection state */ + size_t cl_conn_count; + + /* Cache maximum and default values for easize. This is + * strictly a performance optimization to minimize calls to + * obd_size_diskmd(). The default values are used to calculate the + * initial size of a request buffer. The ptlrpc layer will resize the + * buffer as needed to accommodate a larger reply from the + * server. The default values should be small enough to avoid wasted + * memory and excessive use of vmalloc(), yet large enough to avoid + * reallocating the buffer in the common use case. */ + + /* Default EA size for striping attributes. It is initialized at + * mount-time based on the default stripe width of the filesystem, + * then it tracks the largest observed EA size advertised by + * the MDT, up to a maximum value of OBD_MAX_DEFAULT_EA_SIZE. */ + __u32 cl_default_mds_easize; + + /* Maximum possible EA size computed at mount-time based on + * the number of OSTs in the filesystem. May be increased at + * run-time if a larger observed size is advertised by the MDT. */ + __u32 cl_max_mds_easize; + + /* Data-on-MDT specific value to set larger reply buffer for possible + * data read along with open/stat requests. By default it tries to use + * unused space in reply buffer. + * This value is used to ensure that reply buffer has at least as + * much free space as value indicates. That free space is gained from + * LOV EA buffer which is small for DoM files and on big systems can + * provide up to 32KB of extra space in reply buffer. + * Default value is 8K now. + */ + __u32 cl_dom_min_inline_repsize; + + unsigned int cl_checksum:1, /* 0 = disabled, 1 = enabled */ + cl_checksum_dump:1, /* same */ + cl_ocd_grant_param:1, + cl_lsom_update:1; /* send LSOM updates */ + enum lustre_sec_part cl_sp_me; + enum lustre_sec_part cl_sp_to; + struct sptlrpc_flavor cl_flvr_mgc; /* fixed flavor of mgc->mgs */ + + /* the grant values are protected by loi_list_lock below */ + unsigned long cl_dirty_pages; /* all _dirty_ in pages */ + unsigned long cl_dirty_max_pages; /* allowed w/o rpc */ + unsigned long cl_avail_grant; /* bytes of credit for ost */ + unsigned long cl_lost_grant; /* lost credits (trunc) */ + /* grant consumed for dirty pages */ + unsigned long cl_dirty_grant; + + /* since we allocate grant by blocks, we don't know how much grant will + * be used to add a page into cache. As a solution, we reserve maximum + * grant before trying to dirty a page and unreserve the rest. + * See osc_{reserve|unreserve}_grant for details. 
*/ + long cl_reserved_grant; + wait_queue_head_t cl_cache_waiters; /* waiting for cache/grant */ + time64_t cl_next_shrink_grant; /* seconds */ + struct list_head cl_grant_chain; + time64_t cl_grant_shrink_interval; /* seconds */ + + int cl_root_squash; /* if root squash is enabled */ + + /* A chunk is an optimal size used by osc_extent to determine + * the extent size. A chunk is max(PAGE_SIZE, OST block size) */ + int cl_chunkbits; + /* extent insertion metadata overhead to be accounted in grant, + * in bytes */ + unsigned int cl_grant_extent_tax; + /* maximum extent size, in number of pages */ + unsigned int cl_max_extent_pages; + + /* keep track of objects that have lois that contain pages which + * have been queued for async brw. This lock also protects the + * lists of osc_client_pages that hang off of the loi */ + /* + * ->cl_loi_list_lock protects consistency of + * ->cl_loi_{ready,read,write}_list. ->ap_make_ready() and + * ->ap_completion() call-backs are executed under this lock. As we + * cannot guarantee that these call-backs never block on all platforms + * (as a matter of fact they do block on Mac OS X), type of + * ->cl_loi_list_lock is platform dependent: it's a spin-lock on Linux + * and blocking mutex on Mac OS X. (Alternative is to make this lock + * blocking everywhere, but we don't want to slow down fast-path of + * our main platform.) + * + * NB by Jinshan: though the field names are still _loi_, actually + * osc_object{}s are in the list. + */ + spinlock_t cl_loi_list_lock; + struct list_head cl_loi_ready_list; + struct list_head cl_loi_hp_ready_list; + struct list_head cl_loi_write_list; + struct list_head cl_loi_read_list; + __u32 cl_r_in_flight; + __u32 cl_w_in_flight; + /* just a sum of the loi/lop pending numbers to be exported by /proc */ + atomic_t cl_pending_w_pages; + atomic_t cl_pending_r_pages; + u32 cl_max_pages_per_rpc; + u32 cl_max_rpcs_in_flight; + u32 cl_max_short_io_bytes; + ktime_t cl_stats_init; + struct obd_histogram cl_read_rpc_hist; + struct obd_histogram cl_write_rpc_hist; + struct obd_histogram cl_read_page_hist; + struct obd_histogram cl_write_page_hist; + struct obd_histogram cl_read_offset_hist; + struct obd_histogram cl_write_offset_hist; + + /** LRU for osc caching pages */ + struct cl_client_cache *cl_cache; + /** member of cl_cache->ccc_lru */ + struct list_head cl_lru_osc; + /** # of available LRU slots left in the per-OSC cache. + * Available LRU slots are shared by all OSCs of the same file system, + * therefore this is a pointer to cl_client_cache::ccc_lru_left. */ + atomic_long_t *cl_lru_left; + /** # of busy LRU pages. A page is considered busy if it's in writeback + * queue, or in transfer. Busy pages can't be discarded so they are not + * in LRU cache. */ + atomic_long_t cl_lru_busy; + /** # of LRU pages in the cache for this client_obd */ + atomic_long_t cl_lru_in_list; + /** # of threads shrinking the LRU cache. To avoid contention, it's + * not allowed to have multiple threads shrinking the LRU cache. */ + atomic_t cl_lru_shrinkers; + /** The time when this LRU cache was last used. */ + time64_t cl_lru_last_used; + /** stats: how many reclaims have happened for this client_obd. + * reclaim and shrink - shrink is async, voluntary rebalancing; + * reclaim is sync, initiated by IO thread when the LRU slots are + * in shortage. 
*/ + __u64 cl_lru_reclaim; + /** List of LRU pages for this client_obd */ + struct list_head cl_lru_list; + /** Lock for LRU page list */ + spinlock_t cl_lru_list_lock; + /** # of unstable pages in this client_obd. + * An unstable page is one whose WRITE RPC has finished but whose + * transaction has NOT yet committed. */ + atomic_long_t cl_unstable_count; + /** Link to osc_shrinker_list */ + struct list_head cl_shrink_list; + + /* number of in flight destroy rpcs is limited to max_rpcs_in_flight */ + atomic_t cl_destroy_in_flight; + wait_queue_head_t cl_destroy_waitq; + + /* modify rpcs in flight + * currently used for metadata only */ + spinlock_t cl_mod_rpcs_lock; + __u16 cl_max_mod_rpcs_in_flight; + __u16 cl_mod_rpcs_in_flight; + __u16 cl_close_rpcs_in_flight; + wait_queue_head_t cl_mod_rpcs_waitq; + unsigned long *cl_mod_tag_bitmap; + ktime_t cl_mod_rpcs_init; + struct obd_histogram cl_mod_rpcs_hist; + + /* mgc datastruct */ + struct mutex cl_mgc_mutex; + struct local_oid_storage *cl_mgc_los; + struct dt_object *cl_mgc_configs_dir; + struct obd_export *cl_mgc_mgsexp; + atomic_t cl_mgc_refcount; + /* in-flight control list and total RPCs counter */ + struct list_head cl_flight_waiters; + __u32 cl_rpcs_in_flight; + + /* supported checksum types that are worked out at connect time */ + __u32 cl_supp_cksum_types; + /* checksum algorithm to be used */ + enum cksum_types cl_cksum_type; + /* preferred checksum algorithm to be used */ + enum cksum_types cl_preferred_cksum_type; + + /* also protected by the poorly named _loi_list_lock lock above */ + struct osc_async_rc cl_ar; + + /* sequence manager */ + struct lu_client_seq *cl_seq; + struct rw_semaphore cl_seq_rwsem; + + atomic_t cl_resends; /* resend count */ + + /* ptlrpc work for writeback in ptlrpcd context */ + void *cl_writeback_work; + void *cl_lru_work; + struct mutex cl_quota_mutex; + /* hash tables for osc_quota_info */ + struct cfs_hash *cl_quota_hash[LL_MAXQUOTAS]; + /* the xid of the request updating the hash tables */ + __u64 cl_quota_last_xid; + /* Links to the global list of registered changelog devices */ + struct list_head cl_chg_dev_linkage; +}; +#define obd2cli_tgt(obd) ((char *)(obd)->u.cli.cl_target_uuid.uuid) + +struct obd_id_info { + u32 idx; + u64 *data; +}; + +struct echo_client_obd { + struct obd_export *ec_exp; /* the local connection to osc/lov */ + spinlock_t ec_lock; + struct list_head ec_objects; + struct list_head ec_locks; + __u64 ec_unique; +}; + +/* allow statfs data caching for 1 second */ +#define OBD_STATFS_CACHE_SECONDS 1 +/* arbitrary maximum. 
Larger would be useless; allows catching bogus input */ +#define OBD_STATFS_CACHE_MAX_AGE 3600 /* seconds */ +/* By default, don't do time based negative cache invalidation */ +#define OBD_NEG_CACHE_TIMEOUT_DEFAULT_SECS (-1) /* seconds */ + +#define lov_tgt_desc lu_tgt_desc + +struct lov_md_tgt_desc { + struct obd_device *lmtd_mdc; + __u32 lmtd_index; +}; + +struct lov_obd { + struct lov_desc desc; + struct lov_tgt_desc **lov_tgts; /* sparse array */ + struct lu_tgt_pool lov_packed; /* all OSTs in a packed + array */ + struct mutex lov_lock; + struct obd_connect_data lov_ocd; + atomic_t lov_refcount; + __u32 lov_death_row; /* tgts scheduled to be deleted */ + __u32 lov_tgt_size; /* size of tgts array */ + int lov_connects; + int lov_pool_count; + struct rhashtable lov_pools_hash_body; /* used for key access */ + struct list_head lov_pool_list; /* used for sequential access */ + struct proc_dir_entry *lov_pool_proc_entry; + enum lustre_sec_part lov_sp_me; + + /* Cached LRU and unstable data from upper layer */ + struct cl_client_cache *lov_cache; + + struct rw_semaphore lov_notify_lock; + /* Data-on-MDT: MDC array */ + struct lov_md_tgt_desc *lov_mdc_tgts; + + struct kobject *lov_tgts_kobj; +}; + +#define lmv_tgt_desc lu_tgt_desc + +struct lmv_obd { + struct lu_client_fld lmv_fld; + spinlock_t lmv_lock; + + int connected; + int max_easize; + int max_def_easize; + u32 lmv_statfs_start; + + struct lu_tgt_descs lmv_mdt_descs; + + struct obd_connect_data conn_data; + struct kobject *lmv_tgts_kobj; + void *lmv_cache; + + __u32 lmv_qos_rr_index; +}; + +#define lmv_mdt_count lmv_mdt_descs.ltd_lmv_desc.ld_tgt_count +#define lmv_qos lmv_mdt_descs.ltd_qos + +/* Minimum sector size is 512 */ +#define MAX_GUARD_NUMBER (PAGE_SIZE / 512) + +struct niobuf_local { + __u64 lnb_file_offset; + __u32 lnb_page_offset; + __u32 lnb_len; + __u32 lnb_flags; + int lnb_rc; + struct page *lnb_page; + void *lnb_data; + __be16 lnb_guards[MAX_GUARD_NUMBER]; + __u16 lnb_guard_rpc:1; + __u16 lnb_guard_disk:1; + /* separate unlock for read path to allow shared access */ + __u16 lnb_locked:1; +}; + +struct tgt_thread_big_cache { + struct niobuf_local local[PTLRPC_MAX_BRW_PAGES]; +}; + +#define LUSTRE_FLD_NAME "fld" +#define LUSTRE_SEQ_NAME "seq" + +#define LUSTRE_MDD_NAME "mdd" +#define LUSTRE_OSD_LDISKFS_NAME "osd-ldiskfs" +#define LUSTRE_OSD_ZFS_NAME "osd-zfs" +#define LUSTRE_VVP_NAME "vvp" +#define LUSTRE_LMV_NAME "lmv" +#define LUSTRE_SLP_NAME "slp" +#define LUSTRE_LOD_NAME "lod" +#define LUSTRE_OSP_NAME "osp" +#define LUSTRE_LWP_NAME "lwp" + +/* obd device type names */ + /* FIXME all the references to LUSTRE_MDS_NAME should be swapped with LUSTRE_MDT_NAME */ +#define LUSTRE_MDS_NAME "mds" +#define LUSTRE_MDT_NAME "mdt" +#define LUSTRE_MDC_NAME "mdc" +#define LUSTRE_OSS_NAME "ost" /* FIXME change name to oss */ +#define LUSTRE_OST_NAME "obdfilter" /* FIXME change name to ost */ +#define LUSTRE_OSC_NAME "osc" +#define LUSTRE_LOV_NAME "lov" +#define LUSTRE_MGS_NAME "mgs" +#define LUSTRE_MGC_NAME "mgc" + +#define LUSTRE_ECHO_NAME "obdecho" +#define LUSTRE_ECHO_CLIENT_NAME "echo_client" +#define LUSTRE_QMT_NAME "qmt" + +/* Constant obd names (post-rename) */ +#define LUSTRE_MDS_OBDNAME "MDS" +#define LUSTRE_OSS_OBDNAME "OSS" +#define LUSTRE_MGS_OBDNAME "MGS" +#define LUSTRE_MGC_OBDNAME "MGC" + +static inline int is_lwp_on_mdt(char *name) +{ + char *ptr; + + ptr = strrchr(name, '-'); + if (ptr == NULL) { + CERROR("%s is not an obdname\n", name); + return 0; + } + + /* LWP name on MDT is fsname-MDTxxxx-lwp-MDTxxxx */ + + if 
(strncmp(ptr + 1, "MDT", 3) != 0) + return 0; + + while (*(--ptr) != '-' && ptr != name); + + if (ptr == name) + return 0; + + if (strncmp(ptr + 1, LUSTRE_LWP_NAME, strlen(LUSTRE_LWP_NAME)) != 0) + return 0; + + return 1; +} + +static inline int is_lwp_on_ost(char *name) +{ + char *ptr; + + ptr = strrchr(name, '-'); + if (ptr == NULL) { + CERROR("%s is not a obdname\n", name); + return 0; + } + + /* LWP name on OST is fsname-MDTxxxx-lwp-OSTxxxx */ + + if (strncmp(ptr + 1, "OST", 3) != 0) + return 0; + + while (*(--ptr) != '-' && ptr != name); + + if (ptr == name) + return 0; + + if (strncmp(ptr + 1, LUSTRE_LWP_NAME, strlen(LUSTRE_LWP_NAME)) != 0) + return 0; + + return 1; +} + +/* + * Events signalled through obd_notify() upcall-chain. + */ +enum obd_notify_event { + /* Device connect start */ + OBD_NOTIFY_CONNECT, + /* Device activated */ + OBD_NOTIFY_ACTIVE, + /* Device deactivated */ + OBD_NOTIFY_INACTIVE, + /* Connect data for import were changed */ + OBD_NOTIFY_OCD, + /* Administratively deactivate/activate event */ + OBD_NOTIFY_DEACTIVATE, + OBD_NOTIFY_ACTIVATE +}; + +/* + * Data structure used to pass obd_notify()-event to non-obd listeners (llite + * being main example). + */ +struct obd_notify_upcall { + int (*onu_upcall)(struct obd_device *host, struct obd_device *watched, + enum obd_notify_event ev, void *owner); + /* Opaque datum supplied by upper layer listener */ + void *onu_owner; +}; + +struct target_recovery_data { + svc_handler_t trd_recovery_handler; + pid_t trd_processing_task; + struct completion trd_starting; + struct completion trd_finishing; +}; + +struct obd_llog_group { + struct llog_ctxt *olg_ctxts[LLOG_MAX_CTXTS]; + wait_queue_head_t olg_waitq; + spinlock_t olg_lock; +}; + +/* corresponds to one of the obd's */ +#define OBD_DEVICE_MAGIC 0XAB5CD6EF + +struct obd_device { + struct obd_type *obd_type; + __u32 obd_magic; /* OBD_DEVICE_MAGIC */ + int obd_minor; /* device number: lctl dl */ + struct lu_device *obd_lu_dev; + + /* common and UUID name of this device */ + struct obd_uuid obd_uuid; + char obd_name[MAX_OBD_NAME]; + + /* bitfield modification is protected by obd_dev_lock */ + unsigned long + obd_attached:1, /* finished attach */ + obd_set_up:1, /* finished setup */ + obd_recovering:1, /* there are recoverable clients */ + obd_abort_recovery:1, /* recovery expired */ + obd_abort_recov_mdt:1, /* only abort recovery between MDTs */ + obd_version_recov:1, /* obd uses version checking */ + obd_replayable:1, /* recovery enabled; inform clients */ + obd_no_recov:1, /* fail instead of retry messages */ + obd_stopping:1, /* started cleanup */ + obd_starting:1, /* started setup */ + obd_force:1, /* cleanup with > 0 obd refcount */ + obd_fail:1, /* cleanup with failover */ + obd_no_conn:1, /* deny new connections */ + obd_inactive:1, /* device active/inactive + * (for /proc/status only!!) */ + obd_no_ir:1, /* no imperative recovery. 
*/ + obd_process_conf:1, /* device is processing mgs config */ + obd_checksum_dump:1, /* dump pages upon cksum error */ + obd_dynamic_nids:1; /* Allow dynamic NIDs on device */ +#ifdef HAVE_SERVER_SUPPORT + /* no committed-transno notification */ + unsigned long obd_no_transno:1; +#endif + + /* use a separate field as it is set in interrupt context, to avoid + * messing with the protection of the other bits using the _bh lock */ + unsigned long obd_recovery_expired:1; + /* uuid-export hash body */ + struct rhashtable obd_uuid_hash; + /* nid-export hash body */ + struct rhltable obd_nid_hash; + /* nid stats body */ + struct cfs_hash *obd_nid_stats_hash; + /* client_generation-export hash body */ + struct cfs_hash *obd_gen_hash; + struct list_head obd_nid_stats; + struct list_head obd_exports; + struct list_head obd_unlinked_exports; + struct list_head obd_delayed_exports; + struct list_head obd_lwp_list; + atomic_t obd_refcount; + int obd_num_exports; + int obd_grant_check_threshold; + spinlock_t obd_nid_lock; + struct ldlm_namespace *obd_namespace; + struct ptlrpc_client obd_ldlm_client; /* XXX OST/MDS only */ + /* a spinlock is OK for what we do now, may need a semaphore later */ + spinlock_t obd_dev_lock; /* protect OBD bitfield above */ + spinlock_t obd_osfs_lock; + struct obd_statfs obd_osfs; /* locked by obd_osfs_lock */ + time64_t obd_osfs_age; + __u64 obd_last_committed; + struct mutex obd_dev_mutex; + struct lvfs_run_ctxt obd_lvfs_ctxt; + struct obd_llog_group obd_olg; /* default llog group */ + struct obd_device *obd_observer; + struct rw_semaphore obd_observer_link_sem; + struct obd_notify_upcall obd_upcall; + struct obd_export *obd_self_export; + struct obd_export *obd_lwp_export; + /* list of exports in LRU order, for ping evictor, with obd_dev_lock */ + struct list_head obd_exports_timed; + time64_t obd_eviction_timer; /* for ping evictor */ + + atomic_t obd_max_recoverable_clients; + atomic_t obd_connected_clients; + int obd_stale_clients; + /* this lock protects all recovery list_heads, timer and + * obd_next_recovery_transno value */ + spinlock_t obd_recovery_task_lock; + __u64 obd_next_recovery_transno; + int obd_replayed_requests; + int obd_requests_queued_for_recovery; + wait_queue_head_t obd_next_transno_waitq; + /* protected by obd_recovery_task_lock */ + struct hrtimer obd_recovery_timer; + /* seconds */ + time64_t obd_recovery_start; + /* seconds, for lprocfs_status */ + time64_t obd_recovery_end; + /* To tell timeouts from time stamps, Lustre uses timeout_t + * instead of time64_t. 
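+ * Here obd_recovery_timeout is the current recovery timeout and + * obd_recovery_time_hard the upper bound to which it may be extended, + * so both are declared as timeout_t.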
+ */ + timeout_t obd_recovery_time_hard; + timeout_t obd_recovery_timeout; + int obd_recovery_ir_factor; + + /* new recovery stuff from CMD2 */ + int obd_replayed_locks; + atomic_t obd_req_replay_clients; + atomic_t obd_lock_replay_clients; + struct target_recovery_data obd_recovery_data; + + /* all lists are protected by obd_recovery_task_lock */ + struct list_head obd_req_replay_queue; + struct list_head obd_lock_replay_queue; + struct list_head obd_final_req_queue; + + union { +#ifdef HAVE_SERVER_SUPPORT + struct obd_device_target obt; + struct filter_obd filter; + struct ost_obd ost; + struct echo_obd echo; +#endif + struct client_obd cli; + struct echo_client_obd echo_client; + struct lov_obd lov; + struct lmv_obd lmv; + } u; + + /* Fields used by LProcFS */ + struct lprocfs_stats *obd_stats; + + struct lprocfs_stats *obd_md_stats; + + struct dentry *obd_debugfs_entry; + struct proc_dir_entry *obd_proc_entry; + struct proc_dir_entry *obd_proc_exports_entry; + struct dentry *obd_svc_debugfs_entry; + struct lprocfs_stats *obd_svc_stats; + const struct attribute **obd_attrs; + struct lprocfs_vars *obd_vars; + struct ldebugfs_vars *obd_debugfs_vars; + atomic_t obd_evict_inprogress; + wait_queue_head_t obd_evict_inprogress_waitq; + struct list_head obd_evict_list; /* protected with pet_lock */ + + /** + * LDLM pool part. Save last calculated SLV and Limit. + */ + rwlock_t obd_pool_lock; + __u64 obd_pool_slv; + int obd_pool_limit; + + int obd_conn_inprogress; + + /** + * List of outstanding class_incref()'s for this OBD. For debugging. */ + struct lu_ref obd_reference; + + struct kset obd_kset; /* sysfs object collection */ + struct kobj_type obd_ktype; + struct completion obd_kobj_unregister; +}; + +int obd_uuid_add(struct obd_device *obd, struct obd_export *export); +void obd_uuid_del(struct obd_device *obd, struct obd_export *export); +#ifdef HAVE_SERVER_SUPPORT +struct obd_export *obd_uuid_lookup(struct obd_device *obd, + struct obd_uuid *uuid); + +int obd_nid_export_for_each(struct obd_device *obd, struct lnet_nid *nid, + int cb(struct obd_export *exp, void *data), + void *data); +int obd_nid_add(struct obd_device *obd, struct obd_export *exp); +void obd_nid_del(struct obd_device *obd, struct obd_export *exp); +#endif + +/* get/set_info keys */ +#define KEY_ASYNC "async" +#define KEY_CHANGELOG_CLEAR "changelog_clear" +#define KEY_FID2PATH "fid2path" +#define KEY_CHECKSUM "checksum" +#define KEY_CLEAR_FS "clear_fs" +#define KEY_CONN_DATA "conn_data" +#define KEY_EVICT_BY_NID "evict_by_nid" +#define KEY_FIEMAP "fiemap" +#define KEY_FLUSH_CTX "flush_ctx" +#define KEY_GRANT_SHRINK "grant_shrink" +#define KEY_HSM_COPYTOOL_SEND "hsm_send" +#define KEY_INIT_RECOV_BACKUP "init_recov_bk" +#define KEY_INTERMDS "inter_mds" +#define KEY_LAST_ID "last_id" +#define KEY_LAST_FID "last_fid" +#define KEY_MAX_EASIZE "max_easize" +#define KEY_DEFAULT_EASIZE "default_easize" +#define KEY_MGSSEC "mgssec" +#define KEY_READ_ONLY "read-only" +#define KEY_REGISTER_TARGET "register_target" +#define KEY_SET_FS "set_fs" +#define KEY_TGT_COUNT "tgt_count" +/* KEY_SET_INFO in lustre_idl.h */ +#define KEY_SPTLRPC_CONF "sptlrpc_conf" + +#define KEY_CACHE_LRU_SHRINK "cache_lru_shrink" +#define KEY_OSP_CONNECTED "osp_connected" + +/* Flags for op_xvalid */ +enum op_xvalid { + OP_XVALID_CTIME_SET = BIT(0), /* 0x0001 */ + OP_XVALID_BLOCKS = BIT(1), /* 0x0002 */ + OP_XVALID_OWNEROVERRIDE = BIT(2), /* 0x0004 */ + OP_XVALID_FLAGS = BIT(3), /* 0x0008 */ + OP_XVALID_PROJID = BIT(4), /* 0x0010 */ + OP_XVALID_LAZYSIZE = BIT(5), 
/* 0x0020 */
+	OP_XVALID_LAZYBLOCKS	= BIT(6),	/* 0x0040 */
+};
+
+struct lu_context;
+
+static inline int it_to_lock_mode(struct lookup_intent *it)
+{
+	/* CREAT needs to be tested before open (both could be set) */
+	if (it->it_op & IT_CREAT)
+		return LCK_CW;
+	else if (it->it_op & (IT_GETATTR | IT_OPEN | IT_LOOKUP))
+		return LCK_CR;
+	else if (it->it_op & IT_LAYOUT)
+		return (it->it_flags & FMODE_WRITE) ? LCK_EX : LCK_CR;
+	else if (it->it_op & IT_READDIR)
+		return LCK_PR;
+	else if (it->it_op & IT_GETXATTR)
+		return LCK_PR;
+
+	LASSERTF(0, "Invalid it_op: %d\n", it->it_op);
+	return -EINVAL;
+}
+
+enum md_op_flags {
+	MF_MDC_CANCEL_FID1	= BIT(0),
+	MF_MDC_CANCEL_FID2	= BIT(1),
+	MF_MDC_CANCEL_FID3	= BIT(2),
+	MF_MDC_CANCEL_FID4	= BIT(3),
+	MF_GET_MDT_IDX		= BIT(4),
+	MF_GETATTR_BY_FID	= BIT(5),
+	MF_QOS_MKDIR		= BIT(6),
+	MF_RR_MKDIR		= BIT(7),
+	MF_OPNAME_KMALLOCED	= BIT(8),
+};
+
+enum md_cli_flags {
+	CLI_SET_MEA	= BIT(0),
+	CLI_RM_ENTRY	= BIT(1),
+	CLI_HASH64	= BIT(2),
+	CLI_API32	= BIT(3),
+	CLI_MIGRATE	= BIT(4),
+	CLI_DIRTY_DATA	= BIT(5),
+	CLI_NO_SLOT	= BIT(6),
+};
+
+enum md_op_code {
+	LUSTRE_OPC_MKDIR = 1,
+	LUSTRE_OPC_SYMLINK,
+	LUSTRE_OPC_MKNOD,
+	LUSTRE_OPC_CREATE,
+	LUSTRE_OPC_ANY,
+	LUSTRE_OPC_LOOKUP,
+	LUSTRE_OPC_OPEN,
+};
+
+/**
+ * GETXATTR is not included as only a couple of fields in the reply body
+ * are filled, but not the FID, which is needed for common intent handling in
+ * mdc_finish_intent_lock()
+ */
+static inline bool it_has_reply_body(const struct lookup_intent *it)
+{
+	return it->it_op & (IT_OPEN | IT_LOOKUP | IT_GETATTR);
+}
+
+struct md_op_data {
+	struct lu_fid		op_fid1; /* operation fid1 (usually parent) */
+	struct lu_fid		op_fid2; /* operation fid2 (usually child) */
+	struct lu_fid		op_fid3; /* 2 extra fids to find conflicting */
+	struct lu_fid		op_fid4; /* to the operation locks. */
+	u32			op_mds;  /* what mds server open will go to */
+	__u32			op_mode;
+	enum md_op_code		op_code;
+	struct lustre_handle	op_open_handle;
+	s64			op_mod_time;
+	const char		*op_name;
+	size_t			op_namelen;
+	struct rw_semaphore	*op_mea1_sem;
+	struct rw_semaphore	*op_mea2_sem;
+	struct lmv_stripe_md	*op_mea1;
+	struct lmv_stripe_md	*op_mea2;
+	struct lmv_stripe_md	*op_default_mea1;	/* default LMV */
+	__u32			op_suppgids[2];
+	__u32			op_fsuid;
+	__u32			op_fsgid;
+	kernel_cap_t		op_cap;
+	void			*op_data;
+	size_t			op_data_size;
+
+	/* iattr fields and blocks. */
+	struct iattr		op_attr;
+	enum op_xvalid		op_xvalid;	/* eXtra validity flags */
+	loff_t			op_attr_blocks;
+	u64			op_valid;	/* OBD_MD_* */
+	unsigned int		op_attr_flags;	/* LUSTRE_{SYNC,..}_FL */
+
+	enum md_op_flags	op_flags;
+
+	/* Various operation flags. */
+	enum mds_op_bias	op_bias;
+
+	/* used to transfer info between the stacks of MD client
+	 * see enum op_cli_flags */
+	enum md_cli_flags	op_cli_flags;
+
+	/* File object data version for HSM release, on client */
+	__u64			op_data_version;
+	struct lustre_handle	op_lease_handle;
+
+	/* File security context, for creates/metadata ops */
+	const char		*op_file_secctx_name;
+	__u32			op_file_secctx_name_size;
+	void			*op_file_secctx;
+	__u32			op_file_secctx_size;
+	int			op_file_secctx_slot;
+
+	/* File encryption context, for creates/metadata ops */
+	void			*op_file_encctx;
+	__u32			op_file_encctx_size;
+
+	__u32			op_projid;
+
+	union {
+		/* Used by readdir */
+		unsigned int	op_max_pages;
+		/* mkdir */
+		unsigned short	op_dir_depth;
+	};
+
+	__u16			op_mirror_id;
+
+	/*
+	 * used to access dir that is changing layout: if it's set, access
+	 * dir by new layout, otherwise old layout.
+ * By default it's not set, because new files are created under new + * layout, if we can't find file with name under both old and new + * layout, we are sure file with name doesn't exist, but in reverse + * order there may be a race with creation by others. + */ + bool op_new_layout; + /* used to access dir with bash hash */ + __u32 op_stripe_index; + /* Archive ID for PCC attach */ + __u32 op_archive_id; +}; + +struct md_readdir_info { + int (*mr_blocking_ast)(struct ldlm_lock *lock, + struct ldlm_lock_desc *desc, + void *data, int flag); + /* if striped directory is partially read, the result is stored here */ + int mr_partial_readdir_rc; +}; + +struct md_enqueue_info; +/* metadata stat-ahead */ +typedef int (* md_enqueue_cb_t)(struct ptlrpc_request *req, + struct md_enqueue_info *minfo, + int rc); + +struct md_enqueue_info { + struct md_op_data mi_data; + struct lookup_intent mi_it; + struct lustre_handle mi_lockh; + struct inode *mi_dir; + struct ldlm_enqueue_info mi_einfo; + md_enqueue_cb_t mi_cb; + void *mi_cbdata; +}; + +struct obd_ops { + struct module *o_owner; + int (*o_iocontrol)(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void __user *uarg); + int (*o_get_info)(const struct lu_env *env, struct obd_export *, + __u32 keylen, void *key, __u32 *vallen, void *val); + int (*o_set_info_async)(const struct lu_env *, struct obd_export *, + __u32 keylen, void *key, + __u32 vallen, void *val, + struct ptlrpc_request_set *set); + int (*o_setup) (struct obd_device *obd, struct lustre_cfg *cfg); + int (*o_precleanup)(struct obd_device *obd); + int (*o_cleanup)(struct obd_device *obd); + int (*o_process_config)(struct obd_device *obd, size_t len, void *data); + int (*o_postrecov)(struct obd_device *obd); + int (*o_add_conn)(struct obd_import *imp, struct obd_uuid *uuid, + int priority); + int (*o_del_conn)(struct obd_import *imp, struct obd_uuid *uuid); + /* connect to the target device with given connection + * data. @ocd->ocd_connect_flags is modified to reflect flags actually + * granted by the target, which are guaranteed to be a subset of flags + * asked for. If @ocd == NULL, use default parameters. */ + int (*o_connect)(const struct lu_env *env, + struct obd_export **exp, struct obd_device *src, + struct obd_uuid *cluuid, struct obd_connect_data *ocd, + void *localdata); + int (*o_reconnect)(const struct lu_env *env, + struct obd_export *exp, struct obd_device *src, + struct obd_uuid *cluuid, + struct obd_connect_data *ocd, + void *localdata); + int (*o_disconnect)(struct obd_export *exp); + + /* Initialize/finalize fids infrastructure. */ + int (*o_fid_init)(struct obd_device *obd, + struct obd_export *exp, enum lu_cli_type type); + int (*o_fid_fini)(struct obd_device *obd); + + /* Allocate new fid according to passed @hint. */ + int (*o_fid_alloc)(const struct lu_env *env, struct obd_export *exp, + struct lu_fid *fid, struct md_op_data *op_data); + + /* + * Object with @fid is getting deleted, we may want to do something + * about this. 
+ */ + int (*o_statfs)(const struct lu_env *, struct obd_export *exp, + struct obd_statfs *osfs, time64_t max_age, __u32 flags); + int (*o_statfs_async)(struct obd_export *exp, struct obd_info *oinfo, + time64_t max_age, struct ptlrpc_request_set *set); + int (*o_create)(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa); + int (*o_destroy)(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa); + int (*o_setattr)(const struct lu_env *, struct obd_export *exp, + struct obdo *oa); + int (*o_getattr)(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa); + int (*o_preprw)(const struct lu_env *env, int cmd, + struct obd_export *exp, struct obdo *oa, int objcount, + struct obd_ioobj *obj, struct niobuf_remote *remote, + int *nr_pages, struct niobuf_local *local); + int (*o_commitrw)(const struct lu_env *env, int cmd, + struct obd_export *exp, struct obdo *oa, + int objcount, struct obd_ioobj *obj, + struct niobuf_remote *remote, int pages, + struct niobuf_local *local, int rc, int nob, + ktime_t kstart); + int (*o_init_export)(struct obd_export *exp); + int (*o_destroy_export)(struct obd_export *exp); + + int (*o_import_event)(struct obd_device *, struct obd_import *, + enum obd_import_event); + + int (*o_notify)(struct obd_device *obd, struct obd_device *watched, + enum obd_notify_event ev); + + int (*o_health_check)(const struct lu_env *env, struct obd_device *); + struct obd_uuid *(*o_get_uuid) (struct obd_export *exp); + + /* quota methods */ + int (*o_quotactl)(struct obd_device *, struct obd_export *, + struct obd_quotactl *); + + /* pools methods */ + int (*o_pool_new)(struct obd_device *obd, char *poolname); + int (*o_pool_del)(struct obd_device *obd, char *poolname); + int (*o_pool_add)(struct obd_device *obd, char *poolname, + char *ostname); + int (*o_pool_rem)(struct obd_device *obd, char *poolname, + char *ostname); +}; + +/* lmv structures */ +struct lustre_md { + struct mdt_body *body; + struct lu_buf layout; + union { + struct lmv_stripe_md *lmv; + struct lmv_foreign_md *lfm; + }; + struct lmv_stripe_md *default_lmv; +#ifdef CONFIG_LUSTRE_FS_POSIX_ACL + struct posix_acl *posix_acl; +#endif +}; + +#ifdef CONFIG_LUSTRE_FS_POSIX_ACL +static inline void lmd_clear_acl(struct lustre_md *md) +{ + if (md->posix_acl) { + posix_acl_release(md->posix_acl); + md->posix_acl = NULL; + } +} + +#define OBD_CONNECT_ACL_FLAGS \ + (OBD_CONNECT_ACL | OBD_CONNECT_UMASK | OBD_CONNECT_LARGE_ACL) +#else +static inline void lmd_clear_acl(struct lustre_md *md) +{ +} + +#define OBD_CONNECT_ACL_FLAGS (0) +#endif + +struct md_open_data { + struct obd_client_handle *mod_och; + struct ptlrpc_request *mod_open_req; + struct ptlrpc_request *mod_close_req; + atomic_t mod_refcount; + bool mod_is_create; +}; + +struct obd_client_handle { + struct lustre_handle och_open_handle; + struct lu_fid och_fid; + struct md_open_data *och_mod; + struct lustre_handle och_lease_handle; /* open lock for lease */ + __u32 och_magic; + int och_flags; +}; + +#define OBD_CLIENT_HANDLE_MAGIC 0xd15ea5ed + +struct lookup_intent; +struct cl_attr; + +struct md_ops { + int (*m_close)(struct obd_export *, struct md_op_data *, + struct md_open_data *, struct ptlrpc_request **); + + int (*m_create)(struct obd_export *, struct md_op_data *, + const void *, size_t, umode_t, uid_t, gid_t, + kernel_cap_t, __u64, struct ptlrpc_request **); + + int (*m_enqueue)(struct obd_export *, struct ldlm_enqueue_info *, + const union ldlm_policy_data *, struct md_op_data *, + struct lustre_handle *, __u64); + + 
int (*m_getattr)(struct obd_export *, struct md_op_data *, + struct ptlrpc_request **); + + int (*m_intent_lock)(struct obd_export *, struct md_op_data *, + struct lookup_intent *, + struct ptlrpc_request **, + ldlm_blocking_callback, __u64); + + int (*m_link)(struct obd_export *, struct md_op_data *, + struct ptlrpc_request **); + + int (*m_rename)(struct obd_export *, struct md_op_data *, + const char *, size_t, const char *, size_t, + struct ptlrpc_request **); + + int (*m_setattr)(struct obd_export *, struct md_op_data *, void *, + size_t , struct ptlrpc_request **); + + int (*m_fsync)(struct obd_export *, const struct lu_fid *, + struct ptlrpc_request **); + + int (*m_read_page)(struct obd_export *, struct md_op_data *, + struct md_readdir_info *mrinfo, __u64 hash_offset, + struct page **ppage); + + int (*m_unlink)(struct obd_export *, struct md_op_data *, + struct ptlrpc_request **); + + int (*m_setxattr)(struct obd_export *, const struct lu_fid *, + u64, const char *, const void *, size_t, unsigned int, + u32, struct ptlrpc_request **); + + int (*m_getxattr)(struct obd_export *, const struct lu_fid *, + u64, const char *, size_t, struct ptlrpc_request **); + + int (*m_intent_getattr_async)(struct obd_export *, + struct md_enqueue_info *); + + int (*m_revalidate_lock)(struct obd_export *, struct lookup_intent *, + struct lu_fid *, __u64 *bits); + + int (*m_file_resync)(struct obd_export *, struct md_op_data *); + + int (*m_get_root)(struct obd_export *, const char *, struct lu_fid *); + int (*m_null_inode)(struct obd_export *, const struct lu_fid *); + + int (*m_getattr_name)(struct obd_export *, struct md_op_data *, + struct ptlrpc_request **); + + int (*m_init_ea_size)(struct obd_export *, __u32, __u32); + + int (*m_get_lustre_md)(struct obd_export *, struct req_capsule *, + struct obd_export *, struct obd_export *, + struct lustre_md *); + + int (*m_free_lustre_md)(struct obd_export *, struct lustre_md *); + + int (*m_merge_attr)(struct obd_export *, + const struct lmv_stripe_md *lsm, + struct cl_attr *attr, ldlm_blocking_callback); + + int (*m_set_open_replay_data)(struct obd_export *, + struct obd_client_handle *, + struct lookup_intent *); + + int (*m_clear_open_replay_data)(struct obd_export *, + struct obd_client_handle *); + + int (*m_set_lock_data)(struct obd_export *, + const struct lustre_handle *, void *, __u64 *); + + enum ldlm_mode (*m_lock_match)(struct obd_export *, __u64, + const struct lu_fid *, enum ldlm_type, + union ldlm_policy_data *, enum ldlm_mode, + struct lustre_handle *); + + int (*m_cancel_unused)(struct obd_export *, const struct lu_fid *, + union ldlm_policy_data *, enum ldlm_mode, + enum ldlm_cancel_flags flags, void *opaque); + + int (*m_get_fid_from_lsm)(struct obd_export *, + const struct lmv_stripe_md *, + const char *name, int namelen, + struct lu_fid *fid); + int (*m_unpackmd)(struct obd_export *exp, struct lmv_stripe_md **plsm, + const union lmv_mds_md *lmv, size_t lmv_size); + int (*m_rmfid)(struct obd_export *exp, struct fid_array *fa, int *rcs, + struct ptlrpc_request_set *set); +}; + +static inline struct md_open_data *obd_mod_alloc(void) +{ + struct md_open_data *mod; + OBD_ALLOC_PTR(mod); + if (mod == NULL) + return NULL; + atomic_set(&mod->mod_refcount, 1); + return mod; +} + +#define obd_mod_get(mod) atomic_inc(&(mod)->mod_refcount) +#define obd_mod_put(mod) \ +({ \ + if (atomic_dec_and_test(&(mod)->mod_refcount)) { \ + if ((mod)->mod_open_req) \ + ptlrpc_req_finished((mod)->mod_open_req); \ + OBD_FREE_PTR(mod); \ + } \ +}) + +void 
obdo_from_inode(struct obdo *dst, struct inode *src, u64 valid);
+void obdo_set_parent_fid(struct obdo *dst, const struct lu_fid *parent);
+void obdo_set_o_projid(struct obdo *dst, u32 projid);
+
+/* return 1 if the client should resend the request */
+static inline int client_should_resend(int resend, struct client_obd *cli)
+{
+	return atomic_read(&cli->cl_resends) ?
+	       atomic_read(&cli->cl_resends) > resend : 1;
+}
+
+/**
+ * Return device name for this device
+ *
+ * XXX: lu_device is declared before obd_device, but lu_device holds a
+ * pointer back to its obd_device, so this helper is defined here
+ * instead of in lu_object.h
+ */
+static inline const char *lu_dev_name(const struct lu_device *lu_dev)
+{
+	return lu_dev->ld_obd->obd_name;
+}
+
+static inline bool filename_is_volatile(const char *name, size_t namelen,
+					int *idx)
+{
+	const char *start;
+	char *end;
+
+	if (strncmp(name, LUSTRE_VOLATILE_HDR, LUSTRE_VOLATILE_HDR_LEN) != 0)
+		return false;
+
+	/* caller does not care about idx */
+	if (idx == NULL)
+		return true;
+
+	/* volatile file, the MDT can be set from name */
+	/* name format is LUSTRE_VOLATILE_HDR:[idx]: */
+	/* if no MDT is specified, use the standard way */
+	if (namelen < LUSTRE_VOLATILE_HDR_LEN + 2)
+		goto bad_format;
+	/* test for no MDT idx case */
+	if ((*(name + LUSTRE_VOLATILE_HDR_LEN) == ':') &&
+	    (*(name + LUSTRE_VOLATILE_HDR_LEN + 1) == ':')) {
+		*idx = -1;
+		return true;
+	}
+	/* we have an idx, read it */
+	start = name + LUSTRE_VOLATILE_HDR_LEN + 1;
+	*idx = simple_strtoul(start, &end, 16);
+	/* error cases:
+	 * no digit, no trailing :, negative value
+	 */
+	if (((*idx == 0) && (end == start)) ||
+	    (*end != ':') || (*idx < 0))
+		goto bad_format;
+
+	return true;
+bad_format:
+	/* bad format of the MDT idx; we cannot return an error to the
+	 * caller, so fall back to the hash algorithm */
+	CERROR("Bad volatile file name format: %s\n",
+	       name + LUSTRE_VOLATILE_HDR_LEN);
+	return false;
+}
+
+static inline int cli_brw_size(struct obd_device *obd)
+{
+	LASSERT(obd != NULL);
+	return obd->u.cli.cl_max_pages_per_rpc << PAGE_SHIFT;
+}
+
+/*
+ * When the RPC size or the max number of RPCs in flight is increased, the
+ * client's max dirty pages should be increased accordingly, so that the
+ * client does not run out of dirty space and send fragmented RPCs over the
+ * network while many RPCs are being generated.
+ */
+static inline void client_adjust_max_dirty(struct client_obd *cli)
+{
+	/* initializing */
+	if (cli->cl_dirty_max_pages <= 0) {
+		cli->cl_dirty_max_pages =
+			(OSC_MAX_DIRTY_DEFAULT * 1024 * 1024) >> PAGE_SHIFT;
+	} else {
+		unsigned long dirty_max = cli->cl_max_rpcs_in_flight *
+					  cli->cl_max_pages_per_rpc;
+
+		if (dirty_max > cli->cl_dirty_max_pages)
+			cli->cl_dirty_max_pages = dirty_max;
+	}
+
+	if (cli->cl_dirty_max_pages > cfs_totalram_pages() / 8)
+		cli->cl_dirty_max_pages = cfs_totalram_pages() / 8;
+
+	/* This value is exported to userspace through the max_dirty_mb
+	 * parameter. So we round up the number of pages to make it a round
+	 * number of MBs. */
+	cli->cl_dirty_max_pages = round_up(cli->cl_dirty_max_pages,
+					   1 << (20 - PAGE_SHIFT));
+}
+
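The dirty-limit rule encoded in client_adjust_max_dirty() above is plain arithmetic: the limit tracks cl_max_rpcs_in_flight * cl_max_pages_per_rpc, is capped at 1/8 of total RAM, and is rounded up to a whole number of megabytes. A minimal user-space sketch of that rule follows; PAGE_SHIFT_SKETCH, the sample tunables, and round_up_pages() are invented for illustration and assume 4 KiB pages.

/*
 * Standalone sketch of the sizing rule above, runnable in user space.
 * PAGE_SHIFT_SKETCH and the sample tunables are assumptions, not values
 * taken from this header.
 */
#include <stdio.h>

#define PAGE_SHIFT_SKETCH	12			/* assume 4 KiB pages */
#define MB_PAGES		(1UL << (20 - PAGE_SHIFT_SKETCH))

/* round up to a whole number of MBs worth of pages, as the header does */
static unsigned long round_up_pages(unsigned long pages)
{
	return (pages + MB_PAGES - 1) / MB_PAGES * MB_PAGES;
}

int main(void)
{
	unsigned long max_rpcs_in_flight = 8;	/* sample tunables */
	unsigned long max_pages_per_rpc = 256;	/* 1 MB RPCs at 4 KiB pages */
	unsigned long dirty_max = max_rpcs_in_flight * max_pages_per_rpc;

	printf("dirty limit: %lu pages (%lu MB)\n",
	       round_up_pages(dirty_max), round_up_pages(dirty_max) / MB_PAGES);
	return 0;
}

With 8 RPCs of 256 pages each, the sketch prints a limit of 2048 pages (8 MB), already MB-aligned.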
+/* Must be used for page cache pages only,
+ * not safe otherwise (e.g. direct IO pages)
+ */
+static inline struct inode *page2inode(struct page *page)
+{
+	if (page->mapping) {
+		if (PageAnon(page))
+			return NULL;
+		else
+			return page->mapping->host;
+	} else {
+		return NULL;
+	}
+}
+
+#endif /* __OBD_H */
diff --git a/drivers/staging/lustrefsx/lustre/include/obd_cache.h b/drivers/staging/lustrefsx/lustre/include/obd_cache.h
new file mode 100644
index 0000000000000..128fad781edcb
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/include/obd_cache.h
@@ -0,0 +1,34 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ */
+
+#ifndef _OBD_CACHE_H__
+#define _OBD_CACHE_H__
+
+
+#endif
diff --git a/drivers/staging/lustrefsx/lustre/include/obd_cksum.h b/drivers/staging/lustrefsx/lustre/include/obd_cksum.h
new file mode 100644
index 0000000000000..1f9f0b14a5975
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/include/obd_cksum.h
@@ -0,0 +1,193 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2014, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ */
+
+#ifndef __OBD_CKSUM
+#define __OBD_CKSUM
+#include
+#include
+#include
+
+int obd_t10_cksum_speed(const char *obd_name,
+			enum cksum_types cksum_type);
+
+static inline unsigned char cksum_obd2cfs(enum cksum_types cksum_type)
+{
+	switch (cksum_type) {
+	case OBD_CKSUM_CRC32:
+		return CFS_HASH_ALG_CRC32;
+	case OBD_CKSUM_ADLER:
+		return CFS_HASH_ALG_ADLER32;
+	case OBD_CKSUM_CRC32C:
+		return CFS_HASH_ALG_CRC32C;
+	default:
+		CERROR("Unknown checksum type (%x)!!!\n", cksum_type);
+		LBUG();
+	}
+	return 0;
+}
+
+u32 obd_cksum_type_pack(const char *obd_name, enum cksum_types cksum_type);
+
+static inline enum cksum_types obd_cksum_type_unpack(u32 o_flags)
+{
+	switch (o_flags & OBD_FL_CKSUM_ALL) {
+	case OBD_FL_CKSUM_CRC32C:
+		return OBD_CKSUM_CRC32C;
+	case OBD_FL_CKSUM_CRC32:
+		return OBD_CKSUM_CRC32;
+	case OBD_FL_CKSUM_T10IP512:
+		return OBD_CKSUM_T10IP512;
+	case OBD_FL_CKSUM_T10IP4K:
+		return OBD_CKSUM_T10IP4K;
+	case OBD_FL_CKSUM_T10CRC512:
+		return OBD_CKSUM_T10CRC512;
+	case OBD_FL_CKSUM_T10CRC4K:
+		return OBD_CKSUM_T10CRC4K;
+	default:
+		break;
+	}
+
+	return OBD_CKSUM_ADLER;
+}
+
+/* Return a bitmask of the checksum types supported on this system.
+ * ADLER has been supported since 1.8; it is the baseline and does not
+ * depend on hardware. The client uses all locally available algorithms.
+ */
+static inline enum cksum_types obd_cksum_types_supported_client(void)
+{
+	enum cksum_types ret = OBD_CKSUM_ADLER;
+
+	CDEBUG(D_INFO, "Crypto hash speed: crc %d, crc32c %d, adler %d\n",
+	       cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)),
+	       cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)),
+	       cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)));
+
+	if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)) > 0)
+		ret |= OBD_CKSUM_CRC32C;
+	if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) > 0)
+		ret |= OBD_CKSUM_CRC32;
+
+	/* The client supports all kinds of T10 checksums */
+	ret |= OBD_CKSUM_T10_ALL;
+
+	return ret;
+}
+
+enum cksum_types obd_cksum_types_supported_server(const char *obd_name);
+
+/* Select the best checksum algorithm among those supplied in the cksum_types
+ * input.
+ *
+ * Currently, calling cksum_type_pack() with a mask will return the fastest
+ * checksum type due to its benchmarking at libcfs module load.
+ * Caution is advised, however, since what is fastest on a single client may
+ * not be the fastest or most efficient algorithm on the server. */
+static inline
+enum cksum_types obd_cksum_type_select(const char *obd_name,
+				       enum cksum_types cksum_types,
+				       enum cksum_types preferred)
+{
+	u32 flag;
+
+	if (preferred & cksum_types)
+		return preferred;
+
+	/*
+	 * Server reporting a single T10 checksum type
+	 * means the target actually supports T10-PI.
+	 */
+	if (hweight32(cksum_types & OBD_CKSUM_T10_ALL) == 1)
+		return cksum_types & OBD_CKSUM_T10_ALL;
+
+	flag = obd_cksum_type_pack(obd_name, cksum_types);
+
+	return obd_cksum_type_unpack(flag);
+}
+
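The precedence implemented by obd_cksum_type_select() is: honor the preferred type when the peer supports it, treat a lone advertised T10 type as a sign of real T10-PI support, and otherwise fall back to the fastest packed type. Below is a self-contained user-space sketch of that order; the bit values and the assumed "fastest" fallback are illustrative, not the kernel's benchmarked choices.

/*
 * Minimal sketch of the selection order above. CK_* values, popcount(),
 * and cksum_select() are invented for this sketch.
 */
#include <stdio.h>

enum {
	CK_ADLER  = 1 << 0,
	CK_CRC32  = 1 << 1,
	CK_CRC32C = 1 << 2,
	CK_T10ALL = 3 << 3,	/* two example T10 types */
};

static int popcount(unsigned v) { int n = 0; for (; v; v &= v - 1) n++; return n; }

static unsigned cksum_select(unsigned supported, unsigned preferred)
{
	if (preferred & supported)
		return preferred;
	if (popcount(supported & CK_T10ALL) == 1)	/* lone T10 => T10-PI */
		return supported & CK_T10ALL;
	return (CK_CRC32C & supported) ? CK_CRC32C : CK_ADLER; /* assumed fastest */
}

int main(void)
{
	/* peer supports adler+crc32c, we prefer crc32: falls back to crc32c */
	printf("selected: %#x\n", cksum_select(CK_ADLER | CK_CRC32C, CK_CRC32));
	return 0;
}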
+/* Checksum algorithm names. Must be defined in the same order as the
+ * OBD_CKSUM_* flags. */
+#define DECLARE_CKSUM_NAME const char *const cksum_name[] = {"crc32", "adler", \
+	"crc32c", "reserved", "t10ip512", "t10ip4K", "t10crc512", "t10crc4K"}
+
+typedef __be16 (obd_dif_csum_fn) (void *, unsigned int);
+
+__be16 obd_dif_crc_fn(void *data, unsigned int len);
+__be16 obd_dif_ip_fn(void *data, unsigned int len);
+int obd_page_dif_generate_buffer(const char *obd_name, struct page *page,
+				 __u32 offset, __u32 length,
+				 __be16 *guard_start, int guard_number,
+				 int *used_number, int sector_size,
+				 obd_dif_csum_fn *fn);
+/*
+ * If the checksum type is one of the T10 checksum types, init the csum_fn
+ * and sector size. Otherwise, init them to NULL/zero.
+ */
+static inline void obd_t10_cksum2dif(enum cksum_types cksum_type,
+				     obd_dif_csum_fn **fn, int *sector_size)
+{
+	*fn = NULL;
+	*sector_size = 0;
+
+#if IS_ENABLED(CONFIG_CRC_T10DIF)
+	switch (cksum_type) {
+	case OBD_CKSUM_T10IP512:
+		*fn = obd_dif_ip_fn;
+		*sector_size = 512;
+		break;
+	case OBD_CKSUM_T10IP4K:
+		*fn = obd_dif_ip_fn;
+		*sector_size = 4096;
+		break;
+	case OBD_CKSUM_T10CRC512:
+		*fn = obd_dif_crc_fn;
+		*sector_size = 512;
+		break;
+	case OBD_CKSUM_T10CRC4K:
+		*fn = obd_dif_crc_fn;
+		*sector_size = 4096;
+		break;
+	default:
+		break;
+	}
+#endif /* CONFIG_CRC_T10DIF */
+}
+
+enum obd_t10_cksum_type {
+	OBD_T10_CKSUM_UNKNOWN = 0,
+	OBD_T10_CKSUM_IP512,
+	OBD_T10_CKSUM_IP4K,
+	OBD_T10_CKSUM_CRC512,
+	OBD_T10_CKSUM_CRC4K,
+	OBD_T10_CKSUM_MAX
+};
+
+#endif /* __OBD_CKSUM */
diff --git a/drivers/staging/lustrefsx/lustre/include/obd_class.h b/drivers/staging/lustrefsx/lustre/include/obd_class.h
new file mode 100644
index 0000000000000..8d93466d61b5b
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/include/obd_class.h
@@ -0,0 +1,1954 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ */
+#ifndef __CLASS_OBD_H
+#define __CLASS_OBD_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define OBD_STATFS_NODELAY	0x0001	/* requests should be sent without
+					 * delay and resent to avoid
+					 * deadlocks */
+#define OBD_STATFS_FROM_CACHE	0x0002	/* the statfs callback should not
+					 * update obd_osfs_age */
+#define OBD_STATFS_FOR_MDT0	0x0004	/* The statfs is only for retrieving
+					 * information from MDT0. */
+#define OBD_STATFS_SUM		0x0008	/* get aggregated statfs from MDT */
+#define OBD_STATFS_NESTED	0x0010	/* Call while already holding
+					 * obd_dev_mutex of a different
+					 * device.
+ */ + +extern rwlock_t obd_dev_lock; + +/* OBD Operations Declarations */ +extern struct obd_device *class_exp2obd(struct obd_export *); +extern int class_handle_ioctl(unsigned int cmd, unsigned long arg); +int lustre_get_jobid(char *jobid, size_t len); +void lustre_jobid_clear(const char *jobid); +void jobid_cache_fini(void); +int jobid_cache_init(void); +char *jobid_current(void); +int jobid_set_current(char *jobid); + +struct lu_device_type; + +/* genops.c */ +struct obd_export *class_conn2export(struct lustre_handle *); +#ifdef HAVE_SERVER_SUPPORT +struct obd_type *class_add_symlinks(const char *name, bool enable_proc); +#endif +int class_register_type(const struct obd_ops *dt_ops, + const struct md_ops *md_ops, bool enable_proc, + const char *nm, struct lu_device_type *ldt); +int class_unregister_type(const char *nm); + +struct obd_device *class_newdev(const char *type_name, const char *name, + const char *uuid); +int class_register_device(struct obd_device *obd); +void class_unregister_device(struct obd_device *obd); +void class_free_dev(struct obd_device *obd); + +struct obd_device *class_dev_by_str(const char *str); +int class_name2dev(const char *name); +struct obd_device *class_name2obd(const char *name); +int class_uuid2dev(struct obd_uuid *uuid); +struct obd_device *class_uuid2obd(struct obd_uuid *uuid); +void class_obd_list(void); +struct obd_device *class_find_client_obd(struct obd_uuid *tgt_uuid, + const char *type_name, + struct obd_uuid *grp_uuid); +struct obd_device *class_devices_in_group(struct obd_uuid *grp_uuid, + int *next); +struct obd_device *class_num2obd(int num); +int get_devices_count(void); + +int class_notify_sptlrpc_conf(const char *fsname, int namelen); + +static inline char *obd_export_nid2str(struct obd_export *exp) +{ + return exp->exp_connection == NULL ? + "" : libcfs_nidstr(&exp->exp_connection->c_peer.nid); +} + +static inline char *obd_import_nid2str(struct obd_import *imp) +{ + return imp->imp_connection == NULL ? 
+ "" : libcfs_nidstr(&imp->imp_connection->c_peer.nid); +} + +int obd_export_evict_by_nid(struct obd_device *obd, const char *nid); +int obd_export_evict_by_uuid(struct obd_device *obd, const char *uuid); +int obd_connect_flags2str(char *page, int count, __u64 flags, __u64 flags2, + const char *sep); + +int obd_zombie_impexp_init(void); +void obd_zombie_impexp_stop(void); +void obd_zombie_impexp_cull(void); +void obd_zombie_barrier(void); +void obd_exports_barrier(struct obd_device *obd); +int kuc_len(int payload_len); +struct kuc_hdr * kuc_ptr(void *p); +void *kuc_alloc(int payload_len, int transport, int type); +void kuc_free(void *p, int payload_len); +int obd_get_request_slot(struct client_obd *cli); +void obd_put_request_slot(struct client_obd *cli); +__u32 obd_get_max_rpcs_in_flight(struct client_obd *cli); +int obd_set_max_rpcs_in_flight(struct client_obd *cli, __u32 max); +__u16 obd_get_max_mod_rpcs_in_flight(struct client_obd *cli); +int obd_set_max_mod_rpcs_in_flight(struct client_obd *cli, __u16 max); +int obd_mod_rpc_stats_seq_show(struct client_obd *cli, struct seq_file *seq); + +__u16 obd_get_mod_rpc_slot(struct client_obd *cli, __u32 opc); +void obd_put_mod_rpc_slot(struct client_obd *cli, __u32 opc, __u16 tag); + +struct llog_handle; +struct llog_rec_hdr; +typedef int (*llog_cb_t)(const struct lu_env *, struct llog_handle *, + struct llog_rec_hdr *, void *); + +struct obd_export *obd_stale_export_get(void); +void obd_stale_export_put(struct obd_export *exp); +void obd_stale_export_adjust(struct obd_export *exp); + +/* obd_config.c */ +/* For interoperability */ +struct cfg_interop_param { + char *old_param; + char *new_param; +}; + +char *lustre_cfg_string(struct lustre_cfg *lcfg, u32 index); +struct lustre_cfg *lustre_cfg_rename(struct lustre_cfg *cfg, + const char *new_name); +void print_lustre_cfg(struct lustre_cfg *lcfg); +int class_process_config(struct lustre_cfg *lcfg); +ssize_t class_set_global(const char *param); +ssize_t class_modify_config(struct lustre_cfg *lcfg, const char *prefix, + struct kobject *kobj); +int class_attach(struct lustre_cfg *lcfg); +int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg); +int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg); +int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg); + +int class_find_param(char *buf, char *key, char **valp); +struct cfg_interop_param *class_find_old_param(const char *param, + struct cfg_interop_param *ptr); +int class_get_next_param(char **params, char *copy); +int class_match_param(char *buf, const char *key, char **valp); +int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh); +int class_parse_nid_quiet(char *buf, lnet_nid_t *nid, char **endh); +int class_parse_net(char *buf, u32 *net, char **endh); +int class_match_nid(char *buf, char *key, lnet_nid_t nid); +int class_match_net(char *buf, char *key, u32 net); + +struct obd_device *class_incref(struct obd_device *obd, + const char *scope, const void *source); +void class_decref(struct obd_device *obd, + const char *scope, const void *source); +void dump_exports(struct obd_device *obd, int locks, int debug_level); +int class_config_llog_handler(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data); +int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg); + +#define CFG_F_START 0x01 /* Set when we start updating from a log */ +#define CFG_F_MARKER 0x02 /* We are within a maker */ +#define CFG_F_SKIP 0x04 /* We should ignore this cfg command */ 
+#define CFG_F_EXCLUDE 0x10 /* OST exclusion list */ + +/* Passed as data param to class_config_parse_llog */ +struct config_llog_instance { + unsigned long cfg_instance; + struct super_block *cfg_sb; + struct obd_uuid cfg_uuid; + llog_cb_t cfg_callback; + int cfg_last_idx; /* for partial llog processing */ + int cfg_flags; + __u32 cfg_lwp_idx; + __u32 cfg_sub_clds; +}; +int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name, struct config_llog_instance *cfg); + +/** + * Generate a unique configuration instance for this mount + * + * Temporary hack to bypass ASLR in 4.15+ kernels, a better fix soon. + * For now, use the same value as before - the superblock pointer value. + * + * Using the client UUID would be an option, but it needs more testing. + */ +static inline unsigned long ll_get_cfg_instance(struct super_block *sb) +{ + return (unsigned long)sb; +} + +#define CONFIG_SUB_SPTLRPC 0x01 +#define CONFIG_SUB_RECOVER 0x02 +#define CONFIG_SUB_PARAMS 0x04 +#define CONFIG_SUB_NODEMAP 0x08 +#define CONFIG_SUB_BARRIER 0x10 + +/* Sub clds should be attached to the config_llog_data when processing + * config log for client or server target. */ +#define CONFIG_SUB_CLIENT (CONFIG_SUB_SPTLRPC | CONFIG_SUB_RECOVER | \ + CONFIG_SUB_PARAMS) +#define CONFIG_SUB_SERVER (CONFIG_SUB_CLIENT | CONFIG_SUB_NODEMAP | \ + CONFIG_SUB_BARRIER) + +#define PARAMS_FILENAME "params" +#define BARRIER_FILENAME "barrier" +#define LCTL_UPCALL "lctl" + +static inline bool logname_is_barrier(const char *logname) +{ + char *ptr; + + /* logname for barrier is "fsname-barrier" */ + ptr = strstr(logname, BARRIER_FILENAME); + if (ptr && (ptr - logname) >= 2 && + *(ptr - 1) == '-' && *(ptr + 7) == '\0') + return true; + + return false; +} + +/* list of active configuration logs */ +struct config_llog_data { + struct ldlm_res_id cld_resid; + struct lustre_handle cld_lockh; + struct config_llog_instance cld_cfg; + struct list_head cld_list_chain;/* on config_llog_list */ + atomic_t cld_refcount; + struct config_llog_data *cld_sptlrpc;/* depended sptlrpc log */ + struct config_llog_data *cld_params; /* common parameters log */ + struct config_llog_data *cld_recover;/* imperative recover log */ + struct config_llog_data *cld_nodemap;/* nodemap log */ + struct config_llog_data *cld_barrier;/* barrier log (for MDT only) */ + struct obd_export *cld_mgcexp; + struct mutex cld_lock; + enum mgs_cfg_type cld_type; + unsigned int cld_stopping:1, /* we were told to stop + * watching */ + cld_lostlock:1; /* lock not requeued */ + char cld_logname[0]; +}; + +struct lustre_profile { + struct list_head lp_list; + char *lp_profile; + char *lp_dt; + char *lp_md; + int lp_refs; + bool lp_list_deleted; +}; + +struct lustre_profile *class_get_profile(const char * prof); +void class_del_profile(const char *prof); +void class_put_profile(struct lustre_profile *lprof); +void class_del_profiles(void); + + +#if LUSTRE_TRACKS_LOCK_EXP_REFS + +void __class_export_add_lock_ref(struct obd_export *, struct ldlm_lock *); +void __class_export_del_lock_ref(struct obd_export *, struct ldlm_lock *); +extern void (*class_export_dump_hook)(struct obd_export *); + +#else + +#define __class_export_add_lock_ref(exp, lock) do {} while(0) +#define __class_export_del_lock_ref(exp, lock) do {} while(0) + +#endif + +#define class_export_rpc_inc(exp) \ +({ \ + atomic_inc(&(exp)->exp_rpc_count); \ + CDEBUG(D_INFO, "RPC GETting export %p : new rpc_count %d\n", \ + (exp), atomic_read(&(exp)->exp_rpc_count)); \ +}) + +#define 
class_export_rpc_dec(exp) \ +({ \ + LASSERT_ATOMIC_POS(&exp->exp_rpc_count); \ + atomic_dec(&(exp)->exp_rpc_count); \ + CDEBUG(D_INFO, "RPC PUTting export %p : new rpc_count %d\n", \ + (exp), atomic_read(&(exp)->exp_rpc_count)); \ +}) + +#define class_export_lock_get(exp, lock) \ +({ \ + atomic_inc(&(exp)->exp_locks_count); \ + __class_export_add_lock_ref(exp, lock); \ + CDEBUG(D_INFO, "lock GETting export %p : new locks_count %d\n", \ + (exp), atomic_read(&(exp)->exp_locks_count)); \ + class_export_get(exp); \ +}) + +#define class_export_lock_put(exp, lock) \ +({ \ + LASSERT_ATOMIC_POS(&exp->exp_locks_count); \ + atomic_dec(&(exp)->exp_locks_count); \ + __class_export_del_lock_ref(exp, lock); \ + CDEBUG(D_INFO, "lock PUTting export %p : new locks_count %d\n", \ + (exp), atomic_read(&(exp)->exp_locks_count)); \ + class_export_put(exp); \ +}) + +#define class_export_cb_get(exp) \ +({ \ + atomic_inc(&(exp)->exp_cb_count); \ + CDEBUG(D_INFO, "callback GETting export %p : new cb_count %d\n",\ + (exp), atomic_read(&(exp)->exp_cb_count)); \ + class_export_get(exp); \ +}) + +#define class_export_cb_put(exp) \ +({ \ + LASSERT_ATOMIC_POS(&exp->exp_cb_count); \ + atomic_dec(&(exp)->exp_cb_count); \ + CDEBUG(D_INFO, "callback PUTting export %p : new cb_count %d\n",\ + (exp), atomic_read(&(exp)->exp_cb_count)); \ + class_export_put(exp); \ +}) + +/* genops.c */ +struct obd_export *class_export_get(struct obd_export *exp); +void class_export_put(struct obd_export *exp); +struct obd_export *class_new_export(struct obd_device *obd, + struct obd_uuid *cluuid); +struct obd_export *class_new_export_self(struct obd_device *obd, + struct obd_uuid *uuid); +void class_unlink_export(struct obd_export *exp); + +struct obd_import *class_import_get(struct obd_import *); +void class_import_put(struct obd_import *); +struct obd_import *class_new_import(struct obd_device *obd); +void class_destroy_import(struct obd_import *exp); + +#ifdef HAVE_SERVER_SUPPORT +struct obd_type *class_search_type(const char *name); +struct obd_type *class_get_type(const char *name); +#endif +void class_put_type(struct obd_type *type); +int class_connect(struct lustre_handle *conn, struct obd_device *obd, + struct obd_uuid *cluuid); +int class_disconnect(struct obd_export *exp); +void class_fail_export(struct obd_export *exp); +int class_connected_export(struct obd_export *exp); +void class_disconnect_exports(struct obd_device *obd); +int class_manual_cleanup(struct obd_device *obd); +void class_disconnect_stale_exports(struct obd_device *, + int (*test_export)(struct obd_export *)); + +static inline enum obd_option exp_flags_from_obd(struct obd_device *obd) +{ + return ((obd->obd_fail ? OBD_OPT_FAILOVER : 0) | + (obd->obd_force ? OBD_OPT_FORCE : 0) | + (obd->obd_abort_recovery ? 
OBD_OPT_ABORT_RECOV : 0) |
+		0);
+}
+
+#ifdef HAVE_SERVER_SUPPORT
+static inline struct lu_target *class_exp2tgt(struct obd_export *exp)
+{
+	LASSERT(exp->exp_obd);
+	if (exp->exp_obd->u.obt.obt_magic != OBT_MAGIC)
+		return NULL;
+	return exp->exp_obd->u.obt.obt_lut;
+}
+
+static inline struct lr_server_data *class_server_data(struct obd_device *obd)
+{
+	LASSERT(obd->u.obt.obt_lut);
+	return &obd->u.obt.obt_lut->lut_lsd;
+}
+#endif
+
+/* obdo.c */
+struct lu_attr;
+struct inode;
+
+void obdo_from_la(struct obdo *dst, const struct lu_attr *la, u64 valid);
+void la_from_obdo(struct lu_attr *la, const struct obdo *dst, u64 valid);
+
+void obdo_cpy_md(struct obdo *dst, const struct obdo *src, u64 valid);
+void obdo_to_ioobj(const struct obdo *oa, struct obd_ioobj *ioobj);
+
+#define OBP(dev, op)	(dev)->obd_type->typ_dt_ops->o_ ## op
+#define MDP(dev, op)	(dev)->obd_type->typ_md_ops->m_ ## op
+
+static inline int obd_check_dev(struct obd_device *obd)
+{
+	if (!obd) {
+		CERROR("NULL device\n");
+		return -ENODEV;
+	}
+	return 0;
+}
+
+/* ensure obd_setup and !obd_stopping */
+#define OBD_CHECK_DEV_ACTIVE(obd)				\
+do {								\
+	rc = obd_check_dev(obd);				\
+	if (rc)							\
+		return rc;					\
+								\
+	if (!(obd)->obd_set_up || (obd)->obd_stopping) {	\
+		CERROR("Device %d not setup\n",			\
+		       (obd)->obd_minor);			\
+		RETURN(-ENODEV);				\
+	}							\
+} while (0)
+
+
+static inline int lprocfs_nid_ldlm_stats_init(struct nid_stat *tmp)
+{
+	/* Always add in ldlm_stats */
+	tmp->nid_ldlm_stats =
+		lprocfs_alloc_stats(LDLM_LAST_OPC - LDLM_FIRST_OPC,
+				    LPROCFS_STATS_FLAG_NOPERCPU);
+	if (tmp->nid_ldlm_stats == NULL)
+		return -ENOMEM;
+
+	lprocfs_init_ldlm_stats(tmp->nid_ldlm_stats);
+
+	return lprocfs_register_stats(tmp->nid_proc, "ldlm_stats",
+				      tmp->nid_ldlm_stats);
+}
+
+static inline int exp_check_ops(struct obd_export *exp)
+{
+	if (exp == NULL) {
+		RETURN(-ENODEV);
+	}
+	if (exp->exp_obd == NULL || !exp->exp_obd->obd_type) {
+		RETURN(-EOPNOTSUPP);
+	}
+	RETURN(0);
+}
+
+static inline int class_devno_max(void)
+{
+	return MAX_OBD_DEVICES;
+}
+
+static inline int obd_get_info(const struct lu_env *env, struct obd_export *exp,
+			       __u32 keylen, void *key,
+			       __u32 *vallen, void *val)
+{
+	int rc;
+	ENTRY;
+
+	rc = exp_check_ops(exp);
+	if (rc)
+		RETURN(rc);
+
+	if (!exp->exp_obd->obd_type->typ_dt_ops->o_get_info) {
+		CERROR("%s: no %s operation\n",
+		       (exp)->exp_obd->obd_name, __func__);
+		RETURN(-ENOTSUPP);
+	}
+
+	rc = OBP(exp->exp_obd, get_info)(env, exp, keylen, key, vallen, val);
+	RETURN(rc);
+}
+
+static inline int obd_set_info_async(const struct lu_env *env,
+				     struct obd_export *exp,
+				     __u32 keylen, void *key,
+				     __u32 vallen, void *val,
+				     struct ptlrpc_request_set *set)
+{
+	int rc;
+	ENTRY;
+
+	rc = exp_check_ops(exp);
+	if (rc)
+		RETURN(rc);
+
+	if (!exp->exp_obd->obd_type->typ_dt_ops->o_set_info_async) {
+		CERROR("%s: no %s operation\n",
+		       (exp)->exp_obd->obd_name, __func__);
+		RETURN(-ENOTSUPP);
+	}
+
+	rc = OBP(exp->exp_obd, set_info_async)(env, exp, keylen, key, vallen,
+					       val, set);
+	RETURN(rc);
+}
+
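obd_get_info() and obd_set_info_async() above follow the one dispatch pattern used throughout this header: validate the export, check that the ops table actually provides the method, then call through OBP(). A stripped-down user-space sketch of the same pattern follows; struct dev, dev_ops, and demo_get_info() are invented names, not part of this header.

/*
 * Sketch of the null-checked method-table dispatch used by the obd_*
 * wrappers. All identifiers here are invented for illustration.
 */
#include <stdio.h>
#include <errno.h>

struct dev;
struct dev_ops {
	int (*o_get_info)(struct dev *d, const char *key);
};
struct dev {
	const char *name;
	const struct dev_ops *ops;
};

static int dev_get_info(struct dev *d, const char *key)
{
	if (!d || !d->ops)			/* exp_check_ops() analogue */
		return -ENODEV;
	if (!d->ops->o_get_info) {		/* missing method: unsupported */
		fprintf(stderr, "%s: no get_info operation\n", d->name);
		return -EOPNOTSUPP;
	}
	return d->ops->o_get_info(d, key);	/* OBP()-style indirect call */
}

static int demo_get_info(struct dev *d, const char *key)
{
	printf("%s: get_info(%s)\n", d->name, key);
	return 0;
}

int main(void)
{
	const struct dev_ops ops = { .o_get_info = demo_get_info };
	struct dev d = { .name = "demo0", .ops = &ops };

	return dev_get_info(&d, "max_easize");
}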
+/*
+ * obd-lu integration.
+ *
+ * Functionality is being moved into the new lu_device-based layering, but
+ * some pieces of the configuration process are still based on obd devices.
+ *
+ * Specifically, lu_device_type_operations::ldto_device_alloc() methods fully
+ * subsume the ->o_setup() methods of the obd devices they replace. The same
+ * holds for lu_device_operations::ldo_process_config() and
+ * ->o_process_config(). As a result, obd_setup() and obd_process_config()
+ * branch and call exactly one of the two.
+ *
+ * Yet neither lu_device_type_operations::ldto_device_fini() nor
+ * lu_device_type_operations::ldto_device_free() fully implements the
+ * functionality of the ->o_precleanup() and ->o_cleanup() they override.
+ * Hence, obd_precleanup() and obd_cleanup() call both lu_device and obd
+ * operations.
+ */
+static inline int obd_setup(struct obd_device *obd, struct lustre_cfg *cfg)
+{
+	int rc;
+	struct obd_type *type = obd->obd_type;
+	struct lu_device_type *ldt;
+
+	ENTRY;
+
+	wait_var_event(&type->typ_lu,
+		       smp_load_acquire(&type->typ_lu) != OBD_LU_TYPE_SETUP);
+	ldt = type->typ_lu;
+	if (ldt != NULL) {
+		struct lu_context session_ctx;
+		struct lu_env env;
+
+		lu_context_init(&session_ctx, LCT_SESSION | LCT_SERVER_SESSION);
+		session_ctx.lc_thread = NULL;
+		lu_context_enter(&session_ctx);
+
+		rc = lu_env_init(&env, ldt->ldt_ctx_tags);
+		if (rc == 0) {
+			struct lu_device *dev;
+			env.le_ses = &session_ctx;
+			dev = ldt->ldt_ops->ldto_device_alloc(&env, ldt, cfg);
+			lu_env_fini(&env);
+			if (!IS_ERR(dev)) {
+				obd->obd_lu_dev = dev;
+				dev->ld_obd = obd;
+				rc = 0;
+			} else
+				rc = PTR_ERR(dev);
+		}
+		lu_context_exit(&session_ctx);
+		lu_context_fini(&session_ctx);
+	} else {
+		if (!obd->obd_type->typ_dt_ops->o_setup) {
+			CERROR("%s: no %s operation\n", obd->obd_name,
+			       __func__);
+			RETURN(-EOPNOTSUPP);
+		}
+		rc = OBP(obd, setup)(obd, cfg);
+	}
+	RETURN(rc);
+}
+
+static inline int obd_precleanup(struct obd_device *obd)
+{
+	int rc;
+	struct lu_device_type *ldt = obd->obd_type->typ_lu;
+	struct lu_device *d = obd->obd_lu_dev;
+
+	ENTRY;
+
+	if (ldt != NULL && d != NULL) {
+		struct lu_env *env = lu_env_find();
+		struct lu_env _env;
+
+		if (!env) {
+			env = &_env;
+			rc = lu_env_init(env, ldt->ldt_ctx_tags);
+			LASSERT(rc == 0);
+			lu_env_add(env);
+		}
+		ldt->ldt_ops->ldto_device_fini(env, d);
+		if (env == &_env) {
+			lu_env_remove(env);
+			lu_env_fini(env);
+		}
+	}
+
+	if (!obd->obd_type->typ_dt_ops->o_precleanup)
+		RETURN(0);
+
+	rc = OBP(obd, precleanup)(obd);
+	RETURN(rc);
+}
+
+static inline int obd_cleanup(struct obd_device *obd)
+{
+	int rc;
+	struct lu_device_type *ldt = obd->obd_type->typ_lu;
+	struct lu_device *d = obd->obd_lu_dev;
+
+	ENTRY;
+	if (ldt != NULL && d != NULL) {
+		struct lu_env env;
+
+		rc = lu_env_init(&env, ldt->ldt_ctx_tags);
+		if (rc == 0) {
+			ldt->ldt_ops->ldto_device_free(&env, d);
+			lu_env_fini(&env);
+			obd->obd_lu_dev = NULL;
+		}
+	}
+	if (!obd->obd_type->typ_dt_ops->o_cleanup)
+		RETURN(0);
+
+	rc = OBP(obd, cleanup)(obd);
+	RETURN(rc);
+}
+
+static inline void obd_cleanup_client_import(struct obd_device *obd)
+{
+	ENTRY;
+
+	/* If we set up but never connected, the client import will not
+	 * have been cleaned.
+ */ + down_write(&obd->u.cli.cl_sem); + if (obd->u.cli.cl_import) { + struct obd_import *imp; + + imp = obd->u.cli.cl_import; + CDEBUG(D_CONFIG, "%s: client import never connected\n", + obd->obd_name); + ptlrpc_invalidate_import(imp); + client_destroy_import(imp); + obd->u.cli.cl_import = NULL; + } + up_write(&obd->u.cli.cl_sem); + + EXIT; +} + +static inline int obd_process_config(struct obd_device *obd, int datalen, + void *data) +{ + int rc; + struct lu_device_type *ldt = obd->obd_type->typ_lu; + struct lu_device *d = obd->obd_lu_dev; + + ENTRY; + + obd->obd_process_conf = 1; + if (ldt != NULL && d != NULL) { + struct lu_env env; + + rc = lu_env_init(&env, ldt->ldt_ctx_tags); + if (rc == 0) { + rc = d->ld_ops->ldo_process_config(&env, d, data); + lu_env_fini(&env); + } + } else { + if (!obd->obd_type->typ_dt_ops->o_process_config) { + CERROR("%s: no %s operation\n", + obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } + rc = OBP(obd, process_config)(obd, datalen, data); + } + + obd->obd_process_conf = 0; + + RETURN(rc); +} + +static inline int obd_create(const struct lu_env *env, struct obd_export *exp, + struct obdo *obdo) +{ + int rc; + ENTRY; + + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_create) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + + rc = OBP(exp->exp_obd, create)(env, exp, obdo); + RETURN(rc); +} + +static inline int obd_destroy(const struct lu_env *env, struct obd_export *exp, + struct obdo *obdo) +{ + int rc; + ENTRY; + + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_destroy) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + + rc = OBP(exp->exp_obd, destroy)(env, exp, obdo); + RETURN(rc); +} + +static inline int obd_getattr(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa) +{ + int rc; + + ENTRY; + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_getattr) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + + rc = OBP(exp->exp_obd, getattr)(env, exp, oa); + + RETURN(rc); +} + +static inline int obd_setattr(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa) +{ + int rc; + + ENTRY; + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_setattr) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + + rc = OBP(exp->exp_obd, setattr)(env, exp, oa); + + RETURN(rc); +} + +static inline int obd_add_conn(struct obd_import *imp, struct obd_uuid *uuid, + int priority) +{ + struct obd_device *obd = imp->imp_obd; + int rc; + ENTRY; + + OBD_CHECK_DEV_ACTIVE(obd); + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_add_conn) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } + + rc = OBP(obd, add_conn)(imp, uuid, priority); + RETURN(rc); +} + +static inline int obd_del_conn(struct obd_import *imp, struct obd_uuid *uuid) +{ + struct obd_device *obd = imp->imp_obd; + int rc; + ENTRY; + + OBD_CHECK_DEV_ACTIVE(obd); + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_del_conn) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } + + rc = OBP(obd, del_conn)(imp, uuid); + RETURN(rc); +} + +static inline struct obd_uuid *obd_get_uuid(struct obd_export *exp) +{ + struct obd_uuid *uuid; + 
ENTRY; + + if (!exp->exp_obd->obd_type || + !exp->exp_obd->obd_type->typ_dt_ops->o_get_uuid) + RETURN(NULL); + + uuid = OBP(exp->exp_obd, get_uuid)(exp); + RETURN(uuid); +} + +/** Create a new /a exp on device /a obd for the uuid /a cluuid + * @param exp New export handle + * @param d Connect data, supported flags are set, flags also understood + * by obd are returned. + */ +static inline int obd_connect(const struct lu_env *env, + struct obd_export **exp,struct obd_device *obd, + struct obd_uuid *cluuid, + struct obd_connect_data *data, + void *localdata) +{ + int rc; + __u64 ocf = data ? data->ocd_connect_flags : 0; /* for post-condition + * check */ + ENTRY; + + OBD_CHECK_DEV_ACTIVE(obd); + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_connect) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } + + rc = OBP(obd, connect)(env, exp, obd, cluuid, data, localdata); + /* check that only subset is granted */ + LASSERT(ergo(data != NULL, (data->ocd_connect_flags & ocf) == + data->ocd_connect_flags)); + RETURN(rc); +} + +static inline int obd_reconnect(const struct lu_env *env, + struct obd_export *exp, + struct obd_device *obd, + struct obd_uuid *cluuid, + struct obd_connect_data *d, + void *localdata) +{ + int rc; + __u64 ocf = d ? d->ocd_connect_flags : 0; /* for post-condition + * check */ + + ENTRY; + + OBD_CHECK_DEV_ACTIVE(obd); + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_reconnect) + RETURN(0); + + rc = OBP(obd, reconnect)(env, exp, obd, cluuid, d, localdata); + /* check that only subset is granted */ + LASSERT(ergo(d != NULL, + (d->ocd_connect_flags & ocf) == d->ocd_connect_flags)); + RETURN(rc); +} + +static inline int obd_disconnect(struct obd_export *exp) +{ + int rc; + ENTRY; + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_disconnect) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + + rc = OBP(exp->exp_obd, disconnect)(exp); + RETURN(rc); +} + +static inline int obd_fid_init(struct obd_device *obd, struct obd_export *exp, + enum lu_cli_type type) +{ + int rc; + ENTRY; + + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_fid_init) + RETURN(0); + + rc = OBP(obd, fid_init)(obd, exp, type); + RETURN(rc); +} + +static inline int obd_fid_fini(struct obd_device *obd) +{ + int rc; + ENTRY; + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_fid_fini) + RETURN(0); + + rc = OBP(obd, fid_fini)(obd); + RETURN(rc); +} + +static inline int obd_fid_alloc(const struct lu_env *env, + struct obd_export *exp, + struct lu_fid *fid, + struct md_op_data *op_data) +{ + int rc; + ENTRY; + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_fid_alloc) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + + rc = OBP(exp->exp_obd, fid_alloc)(env, exp, fid, op_data); + RETURN(rc); +} + +static inline int obd_pool_new(struct obd_device *obd, char *poolname) +{ + int rc; + ENTRY; + + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_pool_new) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } + + rc = OBP(obd, pool_new)(obd, poolname); + RETURN(rc); +} + +static inline int obd_pool_del(struct obd_device *obd, char *poolname) +{ + int rc; + ENTRY; + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_pool_del) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } + + rc = OBP(obd, 
pool_del)(obd, poolname); + RETURN(rc); +} + +static inline int obd_pool_add(struct obd_device *obd, char *poolname, + char *ostname) +{ + int rc; + ENTRY; + + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_pool_add) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } + + rc = OBP(obd, pool_add)(obd, poolname, ostname); + RETURN(rc); +} + +static inline int obd_pool_rem(struct obd_device *obd, char *poolname, + char *ostname) +{ + int rc; + + ENTRY; + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_pool_rem) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } + + rc = OBP(obd, pool_rem)(obd, poolname, ostname); + RETURN(rc); +} + +static inline int obd_init_export(struct obd_export *exp) +{ + int rc = 0; + + ENTRY; + if (exp->exp_obd != NULL && exp->exp_obd->obd_type && + OBP((exp)->exp_obd, init_export)) + rc = OBP(exp->exp_obd, init_export)(exp); + RETURN(rc); +} + +static inline int obd_destroy_export(struct obd_export *exp) +{ + ENTRY; + if (exp->exp_obd != NULL && exp->exp_obd->obd_type && + OBP(exp->exp_obd, destroy_export)) + OBP(exp->exp_obd, destroy_export)(exp); + RETURN(0); +} + +/* @max_age is the oldest time in seconds that we accept using a cached data. + * If the cache is older than @max_age we will get a new value from the + * target. Use a value of 'ktime_get_seconds() + X' to guarantee freshness. + */ +static inline int obd_statfs_async(struct obd_export *exp, + struct obd_info *oinfo, + time64_t max_age, + struct ptlrpc_request_set *rqset) +{ + struct obd_device *obd; + int rc = 0; + + ENTRY; + + if (exp == NULL || exp->exp_obd == NULL) + RETURN(-EINVAL); + + obd = exp->exp_obd; + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_statfs) { + rc = -EOPNOTSUPP; + CERROR("%s: no statfs operation: rc = %d\n", obd->obd_name, rc); + RETURN(rc); + } + + CDEBUG(D_SUPER, "%s: age %lld, max_age %lld\n", + obd->obd_name, obd->obd_osfs_age, max_age); + rc = OBP(obd, statfs_async)(exp, oinfo, max_age, rqset); + + RETURN(rc); +} + +/* @max_age is the oldest time in seconds that we accept using a cached data. + * If the cache is older than @max_age we will get a new value from the + * target. Use a value of 'ktime_get_seconds() + X' to guarantee freshness. + */ +static inline int obd_statfs(const struct lu_env *env, struct obd_export *exp, + struct obd_statfs *osfs, time64_t max_age, + __u32 flags) +{ + struct obd_device *obd; + int rc = 0; + + ENTRY; + if (unlikely(exp == NULL || exp->exp_obd == NULL)) + RETURN(-EINVAL); + + obd = exp->exp_obd; + OBD_CHECK_DEV_ACTIVE(obd); + + if (unlikely(!obd->obd_type || !obd->obd_type->typ_dt_ops->o_statfs)) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } + + CDEBUG(D_SUPER, "%s: age %lld, max_age %lld\n", + obd->obd_name, obd->obd_osfs_age, max_age); + /* ignore cache if aggregated isn't expected */ + if (obd->obd_osfs_age < max_age || + ((obd->obd_osfs.os_state & OS_STATFS_SUM) && + !(flags & OBD_STATFS_SUM))) { + /* the RPC will block anyway, so avoid sending many at once */ + rc = mutex_lock_interruptible_nested(&obd->obd_dev_mutex, + (flags & OBD_STATFS_NESTED) + ? 
SINGLE_DEPTH_NESTING : 0); + if (rc) + RETURN(rc); + if (obd->obd_osfs_age < max_age || + ((obd->obd_osfs.os_state & OS_STATFS_SUM) && + !(flags & OBD_STATFS_SUM))) { + rc = OBP(obd, statfs)(env, exp, osfs, max_age, flags); + } else { + mutex_unlock(&obd->obd_dev_mutex); + GOTO(cached, rc = 0); + } + if (rc == 0) { + CDEBUG(D_SUPER, + "%s: update %p cache blocks %llu/%llu objects %llu/%llu\n", + obd->obd_name, &obd->obd_osfs, + osfs->os_bavail, osfs->os_blocks, + osfs->os_ffree, osfs->os_files); + + spin_lock(&obd->obd_osfs_lock); + memcpy(&obd->obd_osfs, osfs, sizeof(obd->obd_osfs)); + obd->obd_osfs_age = ktime_get_seconds(); + spin_unlock(&obd->obd_osfs_lock); + } + mutex_unlock(&obd->obd_dev_mutex); + } else { +cached: + CDEBUG(D_SUPER, + "%s: use %p cache blocks %llu/%llu objects %llu/%llu\n", + obd->obd_name, &obd->obd_osfs, + obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks, + obd->obd_osfs.os_ffree, obd->obd_osfs.os_files); + spin_lock(&obd->obd_osfs_lock); + memcpy(osfs, &obd->obd_osfs, sizeof(*osfs)); + spin_unlock(&obd->obd_osfs_lock); + } + RETURN(rc); +} + +static inline int obd_preprw(const struct lu_env *env, int cmd, + struct obd_export *exp, struct obdo *oa, + int objcount, struct obd_ioobj *obj, + struct niobuf_remote *remote, int *pages, + struct niobuf_local *local) +{ + int rc; + + ENTRY; + + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_preprw) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + + rc = OBP(exp->exp_obd, preprw)(env, cmd, exp, oa, objcount, obj, remote, + pages, local); + + RETURN(rc); +} + +static inline int obd_commitrw(const struct lu_env *env, int cmd, + struct obd_export *exp, struct obdo *oa, + int objcount, struct obd_ioobj *obj, + struct niobuf_remote *rnb, int pages, + struct niobuf_local *local, const int orig_rc, + int nob, ktime_t kstart) +{ + int rc; + ENTRY; + + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_commitrw) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + + rc = OBP(exp->exp_obd, commitrw)(env, cmd, exp, oa, objcount, obj, + rnb, pages, local, orig_rc, nob, + kstart); + + RETURN(rc); +} + +static inline int obd_iocontrol(unsigned int cmd, struct obd_export *exp, + int len, void *karg, void __user *uarg) +{ + int rc; + ENTRY; + + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_iocontrol) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + + rc = OBP(exp->exp_obd, iocontrol)(cmd, exp, len, karg, uarg); + RETURN(rc); +} + +static inline void obd_import_event(struct obd_device *obd, + struct obd_import *imp, + enum obd_import_event event) +{ + ENTRY; + if (!obd) { + CERROR("NULL device\n"); + EXIT; + return; + } + + if (obd->obd_set_up && OBP(obd, import_event)) + OBP(obd, import_event)(obd, imp, event); + + EXIT; +} + +static inline int obd_notify(struct obd_device *obd, + struct obd_device *watched, + enum obd_notify_event ev) +{ + int rc; + ENTRY; + + rc = obd_check_dev(obd); + if (rc) + return rc; + + if (!obd->obd_set_up) { + CDEBUG(D_HA, "obd %s not set up\n", obd->obd_name); + RETURN(-EINVAL); + } + + if (!OBP(obd, notify)) { + CDEBUG(D_HA, "obd %s has no notify handler\n", obd->obd_name); + RETURN(-ENOSYS); + } + + rc = OBP(obd, notify)(obd, watched, ev); + + RETURN(rc); +} + +static inline int obd_notify_observer(struct 
obd_device *observer, + struct obd_device *observed, + enum obd_notify_event ev) +{ + int rc = 0; + int rc2 = 0; + struct obd_notify_upcall *onu; + + if (observer->obd_observer) + rc = obd_notify(observer->obd_observer, observed, ev); + + /* + * Also, call non-obd listener, if any + */ + onu = &observer->obd_upcall; + if (onu->onu_upcall != NULL) + rc2 = onu->onu_upcall(observer, observed, ev, onu->onu_owner); + + return rc ? rc : rc2; +} + +static inline int obd_quotactl(struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + int rc; + ENTRY; + + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_quotactl) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + + rc = OBP(exp->exp_obd, quotactl)(exp->exp_obd, exp, oqctl); + RETURN(rc); +} + +static inline int obd_health_check(const struct lu_env *env, + struct obd_device *obd) +{ + /* returns: 0 on healthy + * >0 on unhealthy + reason code/flag + * however the only suppored reason == 1 right now + * We'll need to define some better reasons + * or flags in the future. + * <0 on error + */ + int rc; + + ENTRY; + + /* NULL method is normal here */ + if (obd == NULL || !obd->obd_type) { + CERROR("cleaned up obd\n"); + RETURN(-EOPNOTSUPP); + } + if (!obd->obd_set_up || obd->obd_stopping) + RETURN(0); + if (!OBP(obd, health_check)) + RETURN(0); + + rc = OBP(obd, health_check)(env, obd); + RETURN(rc); +} + +static inline int obd_register_observer(struct obd_device *obd, + struct obd_device *observer) +{ + int rc; + ENTRY; + + rc = obd_check_dev(obd); + if (rc) + return rc; + + down_write(&obd->obd_observer_link_sem); + if (obd->obd_observer && observer) { + up_write(&obd->obd_observer_link_sem); + RETURN(-EALREADY); + } + obd->obd_observer = observer; + up_write(&obd->obd_observer_link_sem); + RETURN(0); +} + +/* metadata helpers */ +enum mps_stat_idx { + LPROC_MD_CLOSE, + LPROC_MD_CREATE, + LPROC_MD_ENQUEUE, + LPROC_MD_GETATTR, + LPROC_MD_INTENT_LOCK, + LPROC_MD_LINK, + LPROC_MD_RENAME, + LPROC_MD_SETATTR, + LPROC_MD_FSYNC, + LPROC_MD_READ_PAGE, + LPROC_MD_UNLINK, + LPROC_MD_SETXATTR, + LPROC_MD_GETXATTR, + LPROC_MD_INTENT_GETATTR_ASYNC, + LPROC_MD_REVALIDATE_LOCK, + LPROC_MD_LAST_OPC, +}; + +static inline int md_get_root(struct obd_export *exp, const char *fileset, + struct lu_fid *fid) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, get_root)(exp, fileset, fid); +} + +static inline int md_getattr(struct obd_export *exp, + struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_GETATTR); + + return MDP(exp->exp_obd, getattr)(exp, op_data, request); +} + +static inline int md_null_inode(struct obd_export *exp, + const struct lu_fid *fid) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, null_inode)(exp, fid); +} + +static inline int md_close(struct obd_export *exp, struct md_op_data *op_data, + struct md_open_data *mod, + struct ptlrpc_request **request) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_CLOSE); + + return MDP(exp->exp_obd, close)(exp, op_data, mod, request); +} + +static inline int md_create(struct obd_export *exp, struct md_op_data *op_data, + const void *data, size_t datalen, umode_t mode, + uid_t uid, gid_t gid, kernel_cap_t 
cap_effective, + __u64 rdev, struct ptlrpc_request **request) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_CREATE); + + return MDP(exp->exp_obd, create)(exp, op_data, data, datalen, mode, + uid, gid, cap_effective, rdev, + request); +} + +static inline int md_enqueue(struct obd_export *exp, + struct ldlm_enqueue_info *einfo, + const union ldlm_policy_data *policy, + struct md_op_data *op_data, + struct lustre_handle *lockh, + __u64 extra_lock_flags) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_ENQUEUE); + + return MDP(exp->exp_obd, enqueue)(exp, einfo, policy, op_data, lockh, + extra_lock_flags); +} + +static inline int md_getattr_name(struct obd_export *exp, + struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, getattr_name)(exp, op_data, request); +} + +static inline int md_intent_lock(struct obd_export *exp, + struct md_op_data *op_data, + struct lookup_intent *it, + struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_INTENT_LOCK); + + return MDP(exp->exp_obd, intent_lock)(exp, op_data, it, reqp, + cb_blocking, extra_lock_flags); +} + +static inline int md_link(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_LINK); + + return MDP(exp->exp_obd, link)(exp, op_data, request); +} + +static inline int md_rename(struct obd_export *exp, struct md_op_data *op_data, + const char *old_name, size_t oldlen, + const char *new_name, size_t newlen, + struct ptlrpc_request **request) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_RENAME); + + return MDP(exp->exp_obd, rename)(exp, op_data, old_name, oldlen, + new_name, newlen, request); +} + +static inline int md_setattr(struct obd_export *exp, struct md_op_data *op_data, + void *ea, size_t ealen, + struct ptlrpc_request **request) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_SETATTR); + + return MDP(exp->exp_obd, setattr)(exp, op_data, ea, ealen, request); +} + +static inline int md_fsync(struct obd_export *exp, const struct lu_fid *fid, + struct ptlrpc_request **request) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_FSYNC); + + return MDP(exp->exp_obd, fsync)(exp, fid, request); +} + +/* FLR: resync mirrored files. 
*/ +static inline int md_file_resync(struct obd_export *exp, + struct md_op_data *data) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, file_resync)(exp, data); +} + +static inline int md_read_page(struct obd_export *exp, + struct md_op_data *op_data, + struct md_readdir_info *mrinfo, + __u64 hash_offset, struct page **ppage) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_READ_PAGE); + + return MDP(exp->exp_obd, read_page)(exp, op_data, mrinfo, hash_offset, + ppage); +} + +static inline int md_unlink(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_UNLINK); + + return MDP(exp->exp_obd, unlink)(exp, op_data, request); +} + +static inline int md_get_lustre_md(struct obd_export *exp, + struct req_capsule *pill, + struct obd_export *dt_exp, + struct obd_export *md_exp, + struct lustre_md *md) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, get_lustre_md)(exp, pill, dt_exp, md_exp, md); +} + +static inline int md_free_lustre_md(struct obd_export *exp, + struct lustre_md *md) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, free_lustre_md)(exp, md); +} + +static inline int md_merge_attr(struct obd_export *exp, + const struct lmv_stripe_md *lsm, + struct cl_attr *attr, + ldlm_blocking_callback cb) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, merge_attr)(exp, lsm, attr, cb); +} + +static inline int md_setxattr(struct obd_export *exp, const struct lu_fid *fid, + u64 obd_md_valid, const char *name, + const void *value, size_t value_size, + unsigned int xattr_flags, u32 suppgid, + struct ptlrpc_request **req) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_SETXATTR); + + return MDP(exp->exp_obd, setxattr)(exp, fid, obd_md_valid, name, + value, value_size, xattr_flags, + suppgid, req); +} + +static inline int md_getxattr(struct obd_export *exp, const struct lu_fid *fid, + u64 obd_md_valid, const char *name, + size_t buf_size, struct ptlrpc_request **req) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_GETXATTR); + + return MDP(exp->exp_obd, getxattr)(exp, fid, obd_md_valid, name, + buf_size, req); +} + +static inline int md_set_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och, + struct lookup_intent *it) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, set_open_replay_data)(exp, och, it); +} + +static inline int md_clear_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, clear_open_replay_data)(exp, och); +} + +static inline int md_set_lock_data(struct obd_export *exp, + const struct lustre_handle *lockh, + void *data, __u64 *bits) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, set_lock_data)(exp, lockh, data, bits); +} + +static inline +int md_cancel_unused(struct obd_export *exp, const struct lu_fid *fid, + union ldlm_policy_data *policy, enum ldlm_mode mode, + enum ldlm_cancel_flags 
cancel_flags, void *opaque) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, cancel_unused)(exp, fid, policy, mode, + cancel_flags, opaque); +} + +static inline enum ldlm_mode md_lock_match(struct obd_export *exp, __u64 flags, + const struct lu_fid *fid, + enum ldlm_type type, + union ldlm_policy_data *policy, + enum ldlm_mode mode, + struct lustre_handle *lockh) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, lock_match)(exp, flags, fid, type, + policy, mode, lockh); +} + +static inline int md_init_ea_size(struct obd_export *exp, __u32 ea_size, + __u32 def_ea_size) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, init_ea_size)(exp, ea_size, def_ea_size); +} + +static inline int md_intent_getattr_async(struct obd_export *exp, + struct md_enqueue_info *minfo) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_INTENT_GETATTR_ASYNC); + + return MDP(exp->exp_obd, intent_getattr_async)(exp, minfo); +} + +static inline int md_revalidate_lock(struct obd_export *exp, + struct lookup_intent *it, + struct lu_fid *fid, __u64 *bits) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_REVALIDATE_LOCK); + + return MDP(exp->exp_obd, revalidate_lock)(exp, it, fid, bits); +} + +static inline int md_get_fid_from_lsm(struct obd_export *exp, + const struct lmv_stripe_md *lsm, + const char *name, int namelen, + struct lu_fid *fid) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, get_fid_from_lsm)(exp, lsm, name, namelen, + fid); +} + +/* Unpack an MD struct from disk to in-memory format. + * Returns +ve size of unpacked MD (0 for free), or -ve error. + * + * If *plsm != NULL and lmm == NULL then *lsm will be freed. + * If *plsm == NULL then it will be allocated. 
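+ *
+ * Illustrative call sequence (hypothetical caller, shown only to make
+ * the ownership rules above concrete; not part of this patch):
+ *
+ *	struct lmv_stripe_md *lsm = NULL;
+ *	int rc;
+ *
+ *	rc = md_unpackmd(exp, &lsm, lmm, lmm_size);   (allocates and unpacks)
+ *	if (rc < 0)
+ *		return rc;
+ *	... use lsm ...
+ *	md_unpackmd(exp, &lsm, NULL, 0);              (frees *plsm)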
+ */ +static inline int md_unpackmd(struct obd_export *exp, + struct lmv_stripe_md **plsm, + const union lmv_mds_md *lmm, size_t lmm_size) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, unpackmd)(exp, plsm, lmm, lmm_size); +} + +static inline int md_rmfid(struct obd_export *exp, struct fid_array *fa, + int *rcs, struct ptlrpc_request_set *set) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, rmfid)(exp, fa, rcs, set); +} + +/* OBD Metadata Support */ + +extern int obd_init_caches(void); +extern void obd_cleanup_caches(void); + +typedef int (*register_lwp_cb)(void *data); + +struct lwp_register_item { + struct obd_export **lri_exp; + register_lwp_cb lri_cb_func; + void *lri_cb_data; + struct list_head lri_list; + atomic_t lri_ref; + char lri_name[MTI_NAME_MAXLEN]; +}; + +/* obd_mount.c */ +#ifdef HAVE_SERVER_SUPPORT +int lustre_register_lwp_item(const char *lwpname, struct obd_export **exp, + register_lwp_cb cb_func, void *cb_data); +void lustre_deregister_lwp_item(struct obd_export **exp); +struct obd_export *lustre_find_lwp_by_index(const char *dev, __u32 idx); +void lustre_notify_lwp_list(struct obd_export *exp); +int tgt_name2lwp_name(const char *tgt_name, char *lwp_name, int len, __u32 idx); +int lustre_tgt_register_fs(void); +void lustre_tgt_unregister_fs(void); +#endif /* HAVE_SERVER_SUPPORT */ +int lustre_check_exclusion(struct super_block *sb, char *svname); + +/* lustre_peer.c */ +int lustre_uuid_to_peer(const char *uuid, lnet_nid_t *peer_nid, int index); +int class_add_uuid(const char *uuid, __u64 nid); +int class_del_uuid (const char *uuid); +int class_add_nids_to_uuid(struct obd_uuid *uuid, lnet_nid_t *nids, + int nid_count); +int class_check_uuid(struct obd_uuid *uuid, __u64 nid); + +/* class_obd.c */ +extern char obd_jobid_name[]; + +extern unsigned int obd_lbug_on_eviction; +extern unsigned int obd_dump_on_eviction; + +static inline bool do_dump_on_eviction(struct obd_device *exp_obd) +{ + if (obd_lbug_on_eviction && + strncmp(exp_obd->obd_type->typ_name, LUSTRE_MGC_NAME, + strlen(LUSTRE_MGC_NAME))) { + CERROR("LBUG upon eviction\n"); + LBUG(); + } + + return obd_dump_on_eviction; +} + +/* statfs_pack.c */ +struct kstatfs; +void statfs_pack(struct obd_statfs *osfs, struct kstatfs *sfs); +void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs); + +/* root squash info */ +struct root_squash_info { + uid_t rsi_uid; + gid_t rsi_gid; + struct list_head rsi_nosquash_nids; + spinlock_t rsi_lock; +}; + +int server_name2index(const char *svname, __u32 *idx, const char **endptr); + +/* linux-module.c */ +struct obd_ioctl_data; +int obd_ioctl_getdata(struct obd_ioctl_data **data, int *len, void __user *arg); +int class_procfs_init(void); +int class_procfs_clean(void); + +extern void obd_heat_add(struct obd_heat_instance *instance, + unsigned int time_second, __u64 count, + unsigned int weight, unsigned int period_second); +extern void obd_heat_decay(struct obd_heat_instance *instance, + __u64 time_second, unsigned int weight, + unsigned int period_second); +extern __u64 obd_heat_get(struct obd_heat_instance *instance, + unsigned int time_second, unsigned int weight, + unsigned int period_second); +extern void obd_heat_clear(struct obd_heat_instance *instance, int count); + +/* struct kobj_type */ +static inline +struct attribute *_get_attr_matches(const struct kobj_type *typ, + const char *key, size_t keylen, + int (*is_match)(const char *, const char *, + size_t)) +{ + int i; + +#ifdef 
HAVE_KOBJ_TYPE_DEFAULT_GROUPS + for (i = 0; typ->default_groups[i]; i++) { + int k; + struct attribute **attrs; + + attrs = (struct attribute **)typ->default_groups[i]->attrs; + for (k = 0; attrs[k]; k++) { + if (is_match(attrs[k]->name, key, keylen)) + return (struct attribute *)attrs[k]; + } + } +#else + for (i = 0; typ->default_attrs[i]; i++) { + if (is_match(typ->default_attrs[i]->name, key, keylen)) + return typ->default_attrs[i]; + } +#endif + return NULL; +} + +static inline +int _attr_name_exact(const char *attr_name, const char *key, size_t len) +{ + return !strcmp(attr_name, key); +} + +static inline +struct attribute *get_attr_by_name(const struct kobj_type *typ, + const char *name) +{ + return _get_attr_matches(typ, name, 0, _attr_name_exact); +} + +static inline +int _attr_name_starts_with(const char *attr_name, const char *name, size_t len) +{ + return !strncmp(attr_name, name, len); +} + +static inline +struct attribute *get_attr_starts_with(const struct kobj_type *typ, + const char *name, + size_t len) +{ + return _get_attr_matches(typ, name, len, _attr_name_starts_with); +} + +#endif /* __LINUX_OBD_CLASS_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/obd_support.h b/drivers/staging/lustrefsx/lustre/include/obd_support.h new file mode 100644 index 0000000000000..28d2650e11b06 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/obd_support.h @@ -0,0 +1,1055 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef _OBD_SUPPORT +#define _OBD_SUPPORT + +#include +#include +#include +#include +#include + +#include +#include +#include + +/* global variables */ +extern struct lprocfs_stats *obd_memory; +enum { + OBD_MEMORY_STAT = 0, + OBD_STATS_NUM, +}; + +extern unsigned int obd_debug_peer_on_timeout; +extern unsigned int obd_dump_on_timeout; +extern unsigned int obd_dump_on_eviction; +extern unsigned int obd_lbug_on_eviction; +/* obd_timeout should only be used for recovery, not for + networking / disk / timings affected by load (use Adaptive Timeouts) */ +extern unsigned int obd_timeout; /* seconds */ +extern unsigned int ldlm_timeout; /* seconds */ +extern unsigned int obd_timeout_set; +extern unsigned int ldlm_timeout_set; +extern unsigned int bulk_timeout; +extern unsigned int at_min; +extern unsigned int at_max; +extern unsigned int at_history; +extern int at_early_margin; +extern int at_extra; +extern unsigned long obd_max_dirty_pages; +extern atomic_long_t obd_dirty_pages; +extern char obd_jobid_var[]; + +/* Some hash init argument constants */ +#define HASH_NID_STATS_BKT_BITS 5 +#define HASH_NID_STATS_CUR_BITS 7 +#define HASH_NID_STATS_MAX_BITS 12 +#define HASH_GEN_BKT_BITS 5 +#define HASH_GEN_CUR_BITS 7 +#define HASH_GEN_MAX_BITS 12 +#define HASH_LQE_BKT_BITS 5 +#define HASH_LQE_CUR_BITS 7 +#define HASH_LQE_MAX_BITS 12 +#define HASH_EXP_LOCK_BKT_BITS 5 +#define HASH_EXP_LOCK_CUR_BITS 7 +#define HASH_EXP_LOCK_MAX_BITS 16 +#define HASH_JOB_STATS_BKT_BITS 5 +#define HASH_JOB_STATS_CUR_BITS 7 +#define HASH_JOB_STATS_MAX_BITS 12 + +/* Timeout definitions */ +#define OBD_TIMEOUT_DEFAULT 100 +#define LDLM_TIMEOUT_DEFAULT 20 +#define MDS_LDLM_TIMEOUT_DEFAULT 6 +/* Time to wait for all clients to reconnect during recovery (hard limit) */ +#define OBD_RECOVERY_TIME_HARD (obd_timeout * 9) +/* Time to wait for all clients to reconnect during recovery (soft limit) */ +/* Should be very conservative; must catch the first reconnect after reboot */ +#define OBD_RECOVERY_TIME_SOFT (obd_timeout * 3) +/* Change recovery-small 26b time if you change this */ +#define PING_INTERVAL max(obd_timeout / 4, 1U) +/* a bit more than maximal journal commit time in seconds */ +#define PING_INTERVAL_SHORT min(PING_INTERVAL, 7U) +/* Client may skip 1 ping; we must wait at least 2.5. But for multiple + * failover targets the client only pings one server at a time, and pings + * can be lost on a loaded network. Since eviction has serious consequences, + * and there's no urgent need to evict a client just because it's idle, we + * should be very conservative here. */ +#define PING_EVICT_TIMEOUT (PING_INTERVAL * 6) +#define DISK_TIMEOUT 50 /* Beyond this we warn about disk speed */ +#define CONNECTION_SWITCH_MIN 5U /* Connection switching rate limiter */ + /* Max connect interval for nonresponsive servers; ~50s to avoid building up + connect requests in the LND queues, but within obd_timeout so we don't + miss the recovery window */ +#define CONNECTION_SWITCH_MAX min(50U, max(CONNECTION_SWITCH_MIN,obd_timeout)) +#define CONNECTION_SWITCH_INC 5 /* Connection timeout backoff */ +/* In general this should be low to have quick detection of a system + running on a backup server. (If it's too low, import_select_connection + will increase the timeout anyhow.) 
*/ +#define INITIAL_CONNECT_TIMEOUT max(CONNECTION_SWITCH_MIN,obd_timeout/20) +/* The max delay between connects is SWITCH_MAX + SWITCH_INC + INITIAL */ +#define RECONNECT_DELAY_MAX (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC + \ + INITIAL_CONNECT_TIMEOUT) +/* The min time a target should wait for clients to reconnect in recovery */ +#define OBD_RECOVERY_TIME_MIN (2*RECONNECT_DELAY_MAX) +#define OBD_IR_FACTOR_MIN 1 +#define OBD_IR_FACTOR_MAX 10 +#define OBD_IR_FACTOR_DEFAULT (OBD_IR_FACTOR_MAX/2) +/* default timeout for the MGS to become IR_FULL */ +#define OBD_IR_MGS_TIMEOUT (4*obd_timeout) +/* Unlink should happen within this many seconds. */ +#define PTLRPC_REQ_LONG_UNLINK 300 + +/** + * Time interval of shrink, if the client is "idle" more than this interval, + * then the ll_grant thread will return the requested grant space to filter + */ +#define GRANT_SHRINK_INTERVAL 1200/*20 minutes*/ + +#define OBD_FAIL_MDS 0x100 +#define OBD_FAIL_MDS_HANDLE_UNPACK 0x101 +#define OBD_FAIL_MDS_GETATTR_NET 0x102 +#define OBD_FAIL_MDS_GETATTR_PACK 0x103 +#define OBD_FAIL_MDS_READPAGE_NET 0x104 +#define OBD_FAIL_MDS_READPAGE_PACK 0x105 +#define OBD_FAIL_MDS_SENDPAGE 0x106 +#define OBD_FAIL_MDS_REINT_NET 0x107 +#define OBD_FAIL_MDS_REINT_UNPACK 0x108 +#define OBD_FAIL_MDS_REINT_SETATTR 0x109 +#define OBD_FAIL_MDS_REINT_SETATTR_WRITE 0x10a +#define OBD_FAIL_MDS_REINT_CREATE 0x10b +#define OBD_FAIL_MDS_REINT_CREATE_WRITE 0x10c +#define OBD_FAIL_MDS_REINT_UNLINK 0x10d +#define OBD_FAIL_MDS_REINT_UNLINK_WRITE 0x10e +#define OBD_FAIL_MDS_REINT_LINK 0x10f +#define OBD_FAIL_MDS_REINT_LINK_WRITE 0x110 +#define OBD_FAIL_MDS_REINT_RENAME 0x111 +#define OBD_FAIL_MDS_REINT_RENAME_WRITE 0x112 +#define OBD_FAIL_MDS_OPEN_NET 0x113 +#define OBD_FAIL_MDS_OPEN_PACK 0x114 +#define OBD_FAIL_MDS_CLOSE_NET 0x115 +#define OBD_FAIL_MDS_CLOSE_PACK 0x116 +#define OBD_FAIL_MDS_CONNECT_NET 0x117 +#define OBD_FAIL_MDS_CONNECT_PACK 0x118 +#define OBD_FAIL_MDS_REINT_NET_REP 0x119 +#define OBD_FAIL_MDS_DISCONNECT_NET 0x11a +#define OBD_FAIL_MDS_GET_ROOT_NET 0x11b +#define OBD_FAIL_MDS_GET_ROOT_PACK 0x11c +#define OBD_FAIL_MDS_STATFS_PACK 0x11d +#define OBD_FAIL_MDS_STATFS_SUM_PACK 0x11d +#define OBD_FAIL_MDS_STATFS_NET 0x11e +#define OBD_FAIL_MDS_STATFS_SUM_NET 0x11e +#define OBD_FAIL_MDS_GETATTR_NAME_NET 0x11f +#define OBD_FAIL_MDS_PIN_NET 0x120 +#define OBD_FAIL_MDS_UNPIN_NET 0x121 +#define OBD_FAIL_MDS_ALL_REPLY_NET 0x122 +#define OBD_FAIL_MDS_ALL_REQUEST_NET 0x123 +#define OBD_FAIL_MDS_SYNC_NET 0x124 +#define OBD_FAIL_MDS_SYNC_PACK 0x125 +/* OBD_FAIL_MDS_DONE_WRITING_NET 0x126 obsolete since 2.8.0 */ +/* OBD_FAIL_MDS_DONE_WRITING_PACK 0x127 obsolete since 2.8.0 */ +#define OBD_FAIL_MDS_ALLOC_OBDO 0x128 +#define OBD_FAIL_MDS_PAUSE_OPEN 0x129 +#define OBD_FAIL_MDS_STATFS_LCW_SLEEP 0x12a +#define OBD_FAIL_MDS_OPEN_CREATE 0x12b +#define OBD_FAIL_MDS_OST_SETATTR 0x12c +/* OBD_FAIL_MDS_QUOTACHECK_NET 0x12d obsolete since 2.4 */ +#define OBD_FAIL_MDS_QUOTACTL_NET 0x12e +#define OBD_FAIL_MDS_CLIENT_ADD 0x12f +#define OBD_FAIL_MDS_GETXATTR_NET 0x130 +#define OBD_FAIL_MDS_GETXATTR_PACK 0x131 +#define OBD_FAIL_MDS_SETXATTR_NET 0x132 +#define OBD_FAIL_MDS_SETXATTR 0x133 +#define OBD_FAIL_MDS_SETXATTR_WRITE 0x134 +#define OBD_FAIL_MDS_FS_SETUP 0x135 +#define OBD_FAIL_MDS_RESEND 0x136 +#define OBD_FAIL_MDS_LLOG_CREATE_FAILED 0x137 +#define OBD_FAIL_MDS_LOV_SYNC_RACE 0x138 +#define OBD_FAIL_MDS_OSC_PRECREATE 0x139 +#define OBD_FAIL_MDS_LLOG_SYNC_TIMEOUT 0x13a +#define OBD_FAIL_MDS_CLOSE_NET_REP 0x13b +#define OBD_FAIL_MDS_BLOCK_QUOTA_REQ 
0x13c +#define OBD_FAIL_MDS_DROP_QUOTA_REQ 0x13d +#define OBD_FAIL_MDS_REMOVE_COMMON_EA 0x13e +#define OBD_FAIL_MDS_ALLOW_COMMON_EA_SETTING 0x13f +#define OBD_FAIL_MDS_FAIL_LOV_LOG_ADD 0x140 +#define OBD_FAIL_MDS_LOV_PREP_CREATE 0x141 +#define OBD_FAIL_MDS_REINT_DELAY 0x142 +#define OBD_FAIL_MDS_READLINK_EPROTO 0x143 +#define OBD_FAIL_MDS_OPEN_WAIT_CREATE 0x144 +#define OBD_FAIL_MDS_PDO_LOCK 0x145 +#define OBD_FAIL_MDS_PDO_LOCK2 0x146 +#define OBD_FAIL_MDS_OSC_CREATE_FAIL 0x147 +#define OBD_FAIL_MDS_NEGATIVE_POSITIVE 0x148 +#define OBD_FAIL_MDS_HSM_STATE_GET_NET 0x149 +#define OBD_FAIL_MDS_HSM_STATE_SET_NET 0x14a +#define OBD_FAIL_MDS_HSM_PROGRESS_NET 0x14b +#define OBD_FAIL_MDS_HSM_REQUEST_NET 0x14c +#define OBD_FAIL_MDS_HSM_CT_REGISTER_NET 0x14d +#define OBD_FAIL_MDS_HSM_CT_UNREGISTER_NET 0x14e +#define OBD_FAIL_MDS_SWAP_LAYOUTS_NET 0x14f +#define OBD_FAIL_MDS_HSM_ACTION_NET 0x150 +#define OBD_FAIL_MDS_CHANGELOG_INIT 0x151 +#define OBD_FAIL_MDS_HSM_SWAP_LAYOUTS 0x152 +#define OBD_FAIL_MDS_RENAME 0x153 +#define OBD_FAIL_MDS_RENAME2 0x154 +#define OBD_FAIL_MDS_RENAME3 0x155 +#define OBD_FAIL_MDS_RENAME4 0x156 +#define OBD_FAIL_MDS_LDLM_REPLY_NET 0x157 +#define OBD_FAIL_MDS_STALE_DIR_LAYOUT 0x158 +#define OBD_FAIL_MDS_REINT_MULTI_NET 0x159 +#define OBD_FAIL_MDS_REINT_MULTI_NET_REP 0x15a +#define OBD_FAIL_MDS_LLOG_CREATE_FAILED2 0x15b +#define OBD_FAIL_MDS_FLD_LOOKUP 0x15c +#define OBD_FAIL_MDS_CHANGELOG_REORDER 0x15d +#define OBD_FAIL_MDS_LLOG_UMOUNT_RACE 0x15e +#define OBD_FAIL_MDS_CHANGELOG_RACE 0x15f +#define OBD_FAIL_MDS_INTENT_DELAY 0x160 +#define OBD_FAIL_MDS_XATTR_REP 0x161 +#define OBD_FAIL_MDS_TRACK_OVERFLOW 0x162 +#define OBD_FAIL_MDS_LOV_CREATE_RACE 0x163 +#define OBD_FAIL_MDS_HSM_CDT_DELAY 0x164 +#define OBD_FAIL_MDS_ORPHAN_DELETE 0x165 +#define OBD_FAIL_MDS_RMFID_NET 0x166 +#define OBD_FAIL_MDS_CREATE_RACE 0x167 +#define OBD_FAIL_MDS_STATFS_SPOOF 0x168 +#define OBD_FAIL_MDS_REINT_OPEN 0x169 +#define OBD_FAIL_MDS_REINT_OPEN2 0x16a +#define OBD_FAIL_MDS_COMMITRW_DELAY 0x16b +#define OBD_FAIL_MDS_CHANGELOG_DEL 0x16c +#define OBD_FAIL_MDS_CHANGELOG_IDX_PUMP 0x16d +#define OBD_FAIL_MDS_DELAY_DELORPHAN 0x16e + +/* layout lock */ +#define OBD_FAIL_MDS_NO_LL_GETATTR 0x170 +#define OBD_FAIL_MDS_NO_LL_OPEN 0x171 +#define OBD_FAIL_MDS_LL_BLOCK 0x172 +#define OBD_FAIL_MDS_LOD_CREATE_PAUSE 0x173 + +/* CMD */ +#define OBD_FAIL_MDS_IS_SUBDIR_NET 0x180 +#define OBD_FAIL_MDS_IS_SUBDIR_PACK 0x181 +#define OBD_FAIL_MDS_SET_INFO_NET 0x182 +#define OBD_FAIL_MDS_WRITEPAGE_NET 0x183 +#define OBD_FAIL_MDS_WRITEPAGE_PACK 0x184 +#define OBD_FAIL_MDS_RECOVERY_ACCEPTS_GAPS 0x185 +#define OBD_FAIL_MDS_GET_INFO_NET 0x186 +#define OBD_FAIL_MDS_DQACQ_NET 0x187 +#define OBD_FAIL_MDS_STRIPE_CREATE 0x188 +#define OBD_FAIL_MDS_STRIPE_FID 0x189 +#define OBD_FAIL_MDS_LINK_RENAME_RACE 0x18a + +/* OI scrub */ +#define OBD_FAIL_OSD_SCRUB_DELAY 0x190 +#define OBD_FAIL_OSD_SCRUB_CRASH 0x191 +#define OBD_FAIL_OSD_SCRUB_FATAL 0x192 +#define OBD_FAIL_OSD_FID_MAPPING 0x193 +#define OBD_FAIL_OSD_LMA_INCOMPAT 0x194 +#define OBD_FAIL_OSD_COMPAT_INVALID_ENTRY 0x195 +#define OBD_FAIL_OSD_COMPAT_NO_ENTRY 0x196 +#define OBD_FAIL_OSD_OST_EA_FID_SET 0x197 +#define OBD_FAIL_OSD_NO_OI_ENTRY 0x198 +#define OBD_FAIL_OSD_INDEX_CRASH 0x199 +#define OBD_FAIL_OSD_TXN_START 0x19a +#define OBD_FAIL_OSD_DUPLICATE_MAP 0x19b +#define OBD_FAIL_OSD_REF_DEL 0x19c +#define OBD_FAIL_OSD_OI_ENOSPC 0x19d +#define OBD_FAIL_OSD_DOTDOT_ENOSPC 0x19e + +#define OBD_FAIL_OFD_SET_OID 0x1e0 + +#define OBD_FAIL_OST 0x200 +#define OBD_FAIL_OST_CONNECT_NET 0x201 
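+/* Example of how these codes are consumed (illustrative only; the
+ * OBD_FAIL_CHECK()/OBD_FAIL_TIMEOUT() wrappers are defined near the
+ * end of this header):
+ *
+ *	if (OBD_FAIL_CHECK(OBD_FAIL_OST_ENOSPC))
+ *		RETURN(-ENOSPC);
+ *
+ * A fail point is normally armed from user space by writing the code
+ * to the fail_loc tunable, e.g. "lctl set_param fail_loc=0x215".
+ */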
+#define OBD_FAIL_OST_DISCONNECT_NET 0x202 +#define OBD_FAIL_OST_GET_INFO_NET 0x203 +#define OBD_FAIL_OST_CREATE_NET 0x204 +#define OBD_FAIL_OST_DESTROY_NET 0x205 +#define OBD_FAIL_OST_GETATTR_NET 0x206 +#define OBD_FAIL_OST_SETATTR_NET 0x207 +#define OBD_FAIL_OST_OPEN_NET 0x208 +#define OBD_FAIL_OST_CLOSE_NET 0x209 +#define OBD_FAIL_OST_BRW_NET 0x20a +#define OBD_FAIL_OST_PUNCH_NET 0x20b +#define OBD_FAIL_OST_STATFS_NET 0x20c +#define OBD_FAIL_OST_HANDLE_UNPACK 0x20d +#define OBD_FAIL_OST_BRW_WRITE_BULK 0x20e +#define OBD_FAIL_OST_BRW_READ_BULK 0x20f +#define OBD_FAIL_OST_SYNC_NET 0x210 +#define OBD_FAIL_OST_ALL_REPLY_NET 0x211 +#define OBD_FAIL_OST_ALL_REQUEST_NET 0x212 +#define OBD_FAIL_OST_LDLM_REPLY_NET 0x213 +#define OBD_FAIL_OST_BRW_PAUSE_BULK 0x214 +#define OBD_FAIL_OST_ENOSPC 0x215 +#define OBD_FAIL_OST_EROFS 0x216 +#define OBD_FAIL_SRV_ENOENT 0x217 +/* OBD_FAIL_OST_QUOTACHECK_NET 0x218 obsolete since 2.4 */ +#define OBD_FAIL_OST_QUOTACTL_NET 0x219 +#define OBD_FAIL_OST_CHECKSUM_RECEIVE 0x21a +#define OBD_FAIL_OST_CHECKSUM_SEND 0x21b +#define OBD_FAIL_OST_BRW_SIZE 0x21c +#define OBD_FAIL_OST_DROP_REQ 0x21d +#define OBD_FAIL_OST_SETATTR_CREDITS 0x21e +#define OBD_FAIL_OST_HOLD_WRITE_RPC 0x21f +#define OBD_FAIL_OST_BRW_WRITE_BULK2 0x220 +#define OBD_FAIL_OST_LLOG_RECOVERY_TIMEOUT 0x221 +#define OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT 0x222 +#define OBD_FAIL_OST_PAUSE_CREATE 0x223 +#define OBD_FAIL_OST_BRW_PAUSE_PACK 0x224 +#define OBD_FAIL_OST_CONNECT_NET2 0x225 +#define OBD_FAIL_OST_NOMEM 0x226 +#define OBD_FAIL_OST_BRW_PAUSE_BULK2 0x227 +#define OBD_FAIL_OST_MAPBLK_ENOSPC 0x228 +#define OBD_FAIL_OST_ENOINO 0x229 +#define OBD_FAIL_OST_DQACQ_NET 0x230 +#define OBD_FAIL_OST_STATFS_EINPROGRESS 0x231 +#define OBD_FAIL_OST_SET_INFO_NET 0x232 +#define OBD_FAIL_OST_NODESTROY 0x233 +/* OBD_FAIL_OST_READ_SIZE 0x234 obsolete since 2.14 */ +#define OBD_FAIL_OST_LADVISE_NET 0x235 +#define OBD_FAIL_OST_PAUSE_PUNCH 0x236 +#define OBD_FAIL_OST_LADVISE_PAUSE 0x237 +#define OBD_FAIL_OST_FAKE_RW 0x238 +#define OBD_FAIL_OST_LIST_ASSERT 0x239 +#define OBD_FAIL_OST_GL_WORK_ALLOC 0x240 +#define OBD_FAIL_OST_SKIP_LV_CHECK 0x241 +#define OBD_FAIL_OST_STATFS_DELAY 0x242 +#define OBD_FAIL_OST_INTEGRITY_FAULT 0x243 +#define OBD_FAIL_OST_INTEGRITY_CMP 0x244 +#define OBD_FAIL_OST_DISCONNECT_DELAY 0x245 +#define OBD_FAIL_OST_PREPARE_DELAY 0x247 +#define OBD_FAIL_OST_2BIG_NIOBUF 0x248 +#define OBD_FAIL_OST_FALLOCATE_NET 0x249 +#define OBD_FAIL_OST_SEEK_NET 0x24a +#define OBD_FAIL_OST_WR_ATTR_DELAY 0x250 +#define OBD_FAIL_OST_RESTART_IO 0x251 +#define OBD_FAIL_OST_GET_LAST_FID 0x252 + +#define OBD_FAIL_LDLM 0x300 +#define OBD_FAIL_LDLM_NAMESPACE_NEW 0x301 +#define OBD_FAIL_LDLM_ENQUEUE_NET 0x302 +#define OBD_FAIL_LDLM_CONVERT_NET 0x303 +#define OBD_FAIL_LDLM_CANCEL_NET 0x304 +#define OBD_FAIL_LDLM_BL_CALLBACK_NET 0x305 +#define OBD_FAIL_LDLM_CP_CALLBACK_NET 0x306 +#define OBD_FAIL_LDLM_GL_CALLBACK_NET 0x307 +#define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308 +#define OBD_FAIL_LDLM_ENQUEUE_INTENT_ERR 0x309 +#define OBD_FAIL_LDLM_CREATE_RESOURCE 0x30a +#define OBD_FAIL_LDLM_ENQUEUE_BLOCKED 0x30b +#define OBD_FAIL_LDLM_REPLY 0x30c +#define OBD_FAIL_LDLM_RECOV_CLIENTS 0x30d +#define OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT 0x30e +#define OBD_FAIL_LDLM_GLIMPSE 0x30f +#define OBD_FAIL_LDLM_CANCEL_RACE 0x310 +#define OBD_FAIL_LDLM_CANCEL_EVICT_RACE 0x311 +#define OBD_FAIL_LDLM_PAUSE_CANCEL 0x312 +#define OBD_FAIL_LDLM_CLOSE_THREAD 0x313 +#define OBD_FAIL_LDLM_CANCEL_BL_CB_RACE 0x314 +#define OBD_FAIL_LDLM_CP_CB_WAIT 0x315 +#define 
OBD_FAIL_LDLM_OST_FAIL_RACE 0x316 +#define OBD_FAIL_LDLM_INTR_CP_AST 0x317 +#define OBD_FAIL_LDLM_CP_BL_RACE 0x318 +#define OBD_FAIL_LDLM_NEW_LOCK 0x319 +#define OBD_FAIL_LDLM_AGL_DELAY 0x31a +#define OBD_FAIL_LDLM_AGL_NOLOCK 0x31b +#define OBD_FAIL_LDLM_OST_LVB 0x31c +#define OBD_FAIL_LDLM_ENQUEUE_HANG 0x31d +#define OBD_FAIL_LDLM_BL_EVICT 0x31e +#define OBD_FAIL_LDLM_PAUSE_CANCEL2 0x31f +#define OBD_FAIL_LDLM_CP_CB_WAIT2 0x320 +#define OBD_FAIL_LDLM_CP_CB_WAIT3 0x321 +#define OBD_FAIL_LDLM_CP_CB_WAIT4 0x322 +#define OBD_FAIL_LDLM_CP_CB_WAIT5 0x323 +#define OBD_FAIL_LDLM_SRV_BL_AST 0x324 +#define OBD_FAIL_LDLM_SRV_CP_AST 0x325 +#define OBD_FAIL_LDLM_SRV_GL_AST 0x326 +#define OBD_FAIL_LDLM_WATERMARK_LOW 0x327 +#define OBD_FAIL_LDLM_WATERMARK_HIGH 0x328 +#define OBD_FAIL_LDLM_PAUSE_CANCEL_LOCAL 0x329 + +#define OBD_FAIL_LDLM_GRANT_CHECK 0x32a +#define OBD_FAIL_LDLM_PROLONG_PAUSE 0x32b +#define OBD_FAIL_LDLM_LOCAL_CANCEL_PAUSE 0x32c +#define OBD_FAIL_LDLM_LOCK_REPLAY 0x32d +#define OBD_FAIL_LDLM_REPLAY_PAUSE 0x32e + +/* LOCKLESS IO */ +#define OBD_FAIL_LDLM_SET_CONTENTION 0x385 + +#define OBD_FAIL_OSC 0x400 +#define OBD_FAIL_OSC_BRW_READ_BULK 0x401 +#define OBD_FAIL_OSC_BRW_WRITE_BULK 0x402 +#define OBD_FAIL_OSC_LOCK_BL_AST 0x403 +#define OBD_FAIL_OSC_LOCK_CP_AST 0x404 +#define OBD_FAIL_OSC_MATCH 0x405 +#define OBD_FAIL_OSC_BRW_PREP_REQ 0x406 +#define OBD_FAIL_OSC_SHUTDOWN 0x407 +#define OBD_FAIL_OSC_CHECKSUM_RECEIVE 0x408 +#define OBD_FAIL_OSC_CHECKSUM_SEND 0x409 +#define OBD_FAIL_OSC_BRW_PREP_REQ2 0x40a +/* #define OBD_FAIL_OSC_CONNECT_CKSUM 0x40b Obsolete since 2.9 */ +#define OBD_FAIL_OSC_CKSUM_ADLER_ONLY 0x40c +#define OBD_FAIL_OSC_DIO_PAUSE 0x40d +#define OBD_FAIL_OSC_OBJECT_CONTENTION 0x40e +#define OBD_FAIL_OSC_CP_CANCEL_RACE 0x40f +#define OBD_FAIL_OSC_CP_ENQ_RACE 0x410 +#define OBD_FAIL_OSC_NO_GRANT 0x411 +#define OBD_FAIL_OSC_DELAY_SETTIME 0x412 +#define OBD_FAIL_OSC_CONNECT_GRANT_PARAM 0x413 +#define OBD_FAIL_OSC_DELAY_IO 0x414 +#define OBD_FAIL_OSC_NO_SIZE_DATA 0x415 +#define OBD_FAIL_OSC_DELAY_CANCEL 0x416 +#define OBD_FAIL_OSC_SLOW_PAGE_EVICT 0x417 + +#define OBD_FAIL_PTLRPC 0x500 +#define OBD_FAIL_PTLRPC_ACK 0x501 +#define OBD_FAIL_PTLRPC_RQBD 0x502 +#define OBD_FAIL_PTLRPC_BULK_GET_NET 0x503 +#define OBD_FAIL_PTLRPC_BULK_PUT_NET 0x504 +#define OBD_FAIL_PTLRPC_DROP_RPC 0x505 +#define OBD_FAIL_PTLRPC_DELAY_SEND 0x506 +#define OBD_FAIL_PTLRPC_DELAY_RECOV 0x507 +#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB 0x508 +#define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a +#define OBD_FAIL_PTLRPC_PAUSE_REP 0x50c +#define OBD_FAIL_PTLRPC_IMP_DEACTIVE 0x50d +#define OBD_FAIL_PTLRPC_DUMP_LOG 0x50e +#define OBD_FAIL_PTLRPC_LONG_REPL_UNLINK 0x50f +#define OBD_FAIL_PTLRPC_LONG_BULK_UNLINK 0x510 +#define OBD_FAIL_PTLRPC_HPREQ_TIMEOUT 0x511 +#define OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT 0x512 +#define OBD_FAIL_PTLRPC_DROP_REQ_OPC 0x513 +#define OBD_FAIL_PTLRPC_FINISH_REPLAY 0x514 +#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB2 0x515 +#define OBD_FAIL_PTLRPC_DELAY_IMP_FULL 0x516 +#define OBD_FAIL_PTLRPC_CANCEL_RESEND 0x517 +#define OBD_FAIL_PTLRPC_DROP_BULK 0x51a +#define OBD_FAIL_PTLRPC_LONG_REQ_UNLINK 0x51b +#define OBD_FAIL_PTLRPC_LONG_BOTH_UNLINK 0x51c +#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB3 0x520 +#define OBD_FAIL_PTLRPC_BULK_ATTACH 0x521 +#define OBD_FAIL_PTLRPC_BULK_REPLY_ATTACH 0x522 +#define OBD_FAIL_PTLRPC_RESEND_RACE 0x525 +#define OBD_FAIL_PTLRPC_ROUND_XID 0x530 +#define OBD_FAIL_PTLRPC_CONNECT_RACE 0x531 +#define OBD_FAIL_NET_ERROR_RPC 0x532 +#define OBD_FAIL_PTLRPC_IDLE_RACE 0x533 +#define 
OBD_FAIL_PTLRPC_ENQ_RESEND 0x534 + +#define OBD_FAIL_OBD_PING_NET 0x600 +/* OBD_FAIL_OBD_LOG_CANCEL_NET 0x601 obsolete since 1.5 */ +#define OBD_FAIL_OBD_LOGD_NET 0x602 +/* OBD_FAIL_OBD_QC_CALLBACK_NET 0x603 obsolete since 2.4 */ +#define OBD_FAIL_OBD_DQACQ 0x604 +#define OBD_FAIL_OBD_LLOG_SETUP 0x605 +/* OBD_FAIL_OBD_LOG_CANCEL_REP 0x606 obsolete since 1.5 */ +#define OBD_FAIL_OBD_IDX_READ_NET 0x607 +#define OBD_FAIL_OBD_IDX_READ_BREAK 0x608 +#define OBD_FAIL_OBD_NO_LRU 0x609 +#define OBD_FAIL_OBDCLASS_MODULE_LOAD 0x60a +#define OBD_FAIL_OBD_ZERO_NLINK_RACE 0x60b +#define OBD_FAIL_OBD_STOP_MDS_RACE 0x60c +#define OBD_FAIL_OBD_SETUP 0x60d + +#define OBD_FAIL_TGT_REPLY_NET 0x700 +#define OBD_FAIL_TGT_CONN_RACE 0x701 +#define OBD_FAIL_TGT_FORCE_RECONNECT 0x702 +#define OBD_FAIL_TGT_DELAY_CONNECT 0x703 +#define OBD_FAIL_TGT_DELAY_RECONNECT 0x704 +#define OBD_FAIL_TGT_DELAY_PRECREATE 0x705 +#define OBD_FAIL_TGT_TOOMANY_THREADS 0x706 +#define OBD_FAIL_TGT_REPLAY_DROP 0x707 +#define OBD_FAIL_TGT_FAKE_EXP 0x708 +#define OBD_FAIL_TGT_REPLAY_DELAY 0x709 +/* #define OBD_FAIL_TGT_LAST_REPLAY 0x710 (obsoleted) */ +#define OBD_FAIL_TGT_CLIENT_ADD 0x711 +#define OBD_FAIL_TGT_RCVG_FLAG 0x712 +#define OBD_FAIL_TGT_DELAY_CONDITIONAL 0x713 +#define OBD_FAIL_TGT_REPLAY_DELAY2 0x714 +#define OBD_FAIL_TGT_REPLAY_RECONNECT 0x715 +#define OBD_FAIL_TGT_MOUNT_RACE 0x716 +#define OBD_FAIL_TGT_REPLAY_TIMEOUT 0x717 +#define OBD_FAIL_TGT_CLIENT_DEL 0x718 +#define OBD_FAIL_TGT_SLUGGISH_NET 0x719 +#define OBD_FAIL_TGT_RCVD_EIO 0x720 +#define OBD_FAIL_TGT_RECOVERY_REQ_RACE 0x721 +#define OBD_FAIL_TGT_REPLY_DATA_RACE 0x722 +#define OBD_FAIL_TGT_RECOVERY_CONNECT 0x724 +#define OBD_FAIL_TGT_NO_GRANT 0x725 +#define OBD_FAIL_TGT_TXN_NO_CANCEL 0x726 + +#define OBD_FAIL_MDC_REVALIDATE_PAUSE 0x800 +#define OBD_FAIL_MDC_ENQUEUE_PAUSE 0x801 +#define OBD_FAIL_MDC_OLD_EXT_FLAGS 0x802 +#define OBD_FAIL_MDC_GETATTR_ENQUEUE 0x803 +#define OBD_FAIL_MDC_RPCS_SEM 0x804 /* deprecated */ +#define OBD_FAIL_MDC_LIGHTWEIGHT 0x805 +#define OBD_FAIL_MDC_CLOSE 0x806 +#define OBD_FAIL_MDC_MERGE 0x807 +#define OBD_FAIL_MDC_GLIMPSE_DDOS 0x808 + +#define OBD_FAIL_MGS 0x900 +#define OBD_FAIL_MGS_ALL_REQUEST_NET 0x901 +#define OBD_FAIL_MGS_ALL_REPLY_NET 0x902 +#define OBD_FAIL_MGC_PAUSE_PROCESS_LOG 0x903 +#define OBD_FAIL_MGS_PAUSE_REQ 0x904 +#define OBD_FAIL_MGS_PAUSE_TARGET_REG 0x905 +#define OBD_FAIL_MGS_CONNECT_NET 0x906 +#define OBD_FAIL_MGS_DISCONNECT_NET 0x907 +#define OBD_FAIL_MGS_SET_INFO_NET 0x908 +#define OBD_FAIL_MGS_EXCEPTION_NET 0x909 +#define OBD_FAIL_MGS_TARGET_REG_NET 0x90a +#define OBD_FAIL_MGS_TARGET_DEL_NET 0x90b +#define OBD_FAIL_MGS_CONFIG_READ_NET 0x90c +#define OBD_FAIL_MGS_LDLM_REPLY_NET 0x90d +#define OBD_FAIL_MGS_WRITE_TARGET_DELAY 0x90e + +#define OBD_FAIL_QUOTA_DQACQ_NET 0xA01 +#define OBD_FAIL_QUOTA_EDQUOT 0xA02 +#define OBD_FAIL_QUOTA_DELAY_REINT 0xA03 +#define OBD_FAIL_QUOTA_RECOVERABLE_ERR 0xA04 +#define OBD_FAIL_QUOTA_INIT 0xA05 +#define OBD_FAIL_QUOTA_PREACQ 0xA06 +#define OBD_FAIL_QUOTA_RECALC 0xA07 + +#define OBD_FAIL_LPROC_REMOVE 0xB00 + +#define OBD_FAIL_SEQ 0x1000 +#define OBD_FAIL_SEQ_QUERY_NET 0x1001 +#define OBD_FAIL_SEQ_EXHAUST 0x1002 + +#define OBD_FAIL_FLD 0x1100 +#define OBD_FAIL_FLD_QUERY_NET 0x1101 +#define OBD_FAIL_FLD_READ_NET 0x1102 +#define OBD_FAIL_FLD_QUERY_REQ 0x1103 + +#define OBD_FAIL_SEC_CTX 0x1200 +#define OBD_FAIL_SEC_CTX_INIT_NET 0x1201 +#define OBD_FAIL_SEC_CTX_INIT_CONT_NET 0x1202 +#define OBD_FAIL_SEC_CTX_FINI_NET 0x1203 +#define OBD_FAIL_SEC_CTX_HDL_PAUSE 0x1204 + +#define OBD_FAIL_LLOG 
0x1300 +/* was OBD_FAIL_LLOG_ORIGIN_CONNECT_NET 0x1301 until 2.4 */ +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_CREATE_NET 0x1302 +/* was OBD_FAIL_LLOG_ORIGIN_HANDLE_DESTROY_NET 0x1303 until 2.11 */ +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_READ_HEADER_NET 0x1304 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_NEXT_BLOCK_NET 0x1305 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_PREV_BLOCK_NET 0x1306 +/* was OBD_FAIL_LLOG_ORIGIN_HANDLE_WRITE_REC_NET 0x1307 until 2.1 */ +/* was OBD_FAIL_LLOG_ORIGIN_HANDLE_CLOSE_NET 0x1308 until 1.8 */ +/* was OBD_FAIL_LLOG_CATINFO_NET 0x1309 until 2.3 */ +#define OBD_FAIL_MDS_SYNC_CAPA_SL 0x1310 +#define OBD_FAIL_SEQ_ALLOC 0x1311 +#define OBD_FAIL_CAT_RECORDS 0x1312 +#define OBD_FAIL_CAT_FREE_RECORDS 0x1313 +#define OBD_FAIL_TIME_IN_CHLOG_USER 0x1314 +#define CFS_FAIL_CHLOG_USER_REG_UNREG_RACE 0x1315 +#define OBD_FAIL_FORCE_GC_THREAD 0x1316 +#define OBD_FAIL_LLOG_PROCESS_TIMEOUT 0x1317 +#define OBD_FAIL_LLOG_PURGE_DELAY 0x1318 +#define OBD_FAIL_PLAIN_RECORDS 0x1319 +#define OBD_FAIL_CATALOG_FULL_CHECK 0x131a +#define OBD_FAIL_CATLIST 0x131b +#define OBD_FAIL_LLOG_PAUSE_AFTER_PAD 0x131c +#define OBD_FAIL_LLOG_ADD_GAP 0x131d + +#define OBD_FAIL_LLITE 0x1400 +#define OBD_FAIL_LLITE_FAULT_TRUNC_RACE 0x1401 +#define OBD_FAIL_LOCK_STATE_WAIT_INTR 0x1402 +#define OBD_FAIL_LOV_INIT 0x1403 +#define OBD_FAIL_GLIMPSE_DELAY 0x1404 +#define OBD_FAIL_LLITE_XATTR_ENOMEM 0x1405 +#define OBD_FAIL_MAKE_LOVEA_HOLE 0x1406 +#define OBD_FAIL_LLITE_LOST_LAYOUT 0x1407 +#define OBD_FAIL_LLITE_NO_CHECK_DEAD 0x1408 +#define OBD_FAIL_GETATTR_DELAY 0x1409 +#define OBD_FAIL_LLITE_CREATE_FILE_PAUSE 0x1409 +#define OBD_FAIL_LLITE_NEWNODE_PAUSE 0x140a +#define OBD_FAIL_LLITE_SETDIRSTRIPE_PAUSE 0x140b +#define OBD_FAIL_LLITE_CREATE_NODE_PAUSE 0x140c +#define OBD_FAIL_LLITE_IMUTEX_SEC 0x140e +#define OBD_FAIL_LLITE_IMUTEX_NOSEC 0x140f +#define OBD_FAIL_LLITE_OPEN_BY_NAME 0x1410 +#define OBD_FAIL_LLITE_PCC_FAKE_ERROR 0x1411 +#define OBD_FAIL_LLITE_PCC_DETACH_MKWRITE 0x1412 +#define OBD_FAIL_LLITE_PCC_MKWRITE_PAUSE 0x1413 +#define OBD_FAIL_LLITE_PCC_ATTACH_PAUSE 0x1414 +#define OBD_FAIL_LLITE_SHORT_COMMIT 0x1415 +#define OBD_FAIL_LLITE_CREATE_FILE_PAUSE2 0x1416 +#define OBD_FAIL_LLITE_RACE_MOUNT 0x1417 +#define OBD_FAIL_LLITE_PAGE_ALLOC 0x1418 +#define OBD_FAIL_LLITE_OPEN_DELAY 0x1419 +#define OBD_FAIL_LLITE_XATTR_PAUSE 0x1420 +#define OBD_FAIL_LLITE_PAGE_INVALIDATE_PAUSE 0x1421 +#define OBD_FAIL_LLITE_READPAGE_PAUSE 0x1422 +#define OBD_FAIL_LLITE_READPAGE_PAUSE2 0x1424 + +#define OBD_FAIL_FID_INDIR 0x1501 +#define OBD_FAIL_FID_INLMA 0x1502 +#define OBD_FAIL_FID_IGIF 0x1504 +#define OBD_FAIL_FID_LOOKUP 0x1505 +#define OBD_FAIL_FID_NOLMA 0x1506 + +/* LFSCK */ +#define OBD_FAIL_LFSCK_DELAY1 0x1600 +#define OBD_FAIL_LFSCK_DELAY2 0x1601 +#define OBD_FAIL_LFSCK_DELAY3 0x1602 +#define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603 +#define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604 +#define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605 +#define OBD_FAIL_LFSCK_FATAL1 0x1608 +#define OBD_FAIL_LFSCK_FATAL2 0x1609 +#define OBD_FAIL_LFSCK_CRASH 0x160a +#define OBD_FAIL_LFSCK_NO_AUTO 0x160b +#define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c +#define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d +#define OBD_FAIL_LFSCK_DELAY4 0x160e +#define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f +#define OBD_FAIL_LFSCK_DANGLING 0x1610 +#define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611 +#define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612 +#define OBD_FAIL_LFSCK_BAD_OWNER 0x1613 +#define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614 +#define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615 +#define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616 +#define 
OBD_FAIL_LFSCK_NOPFID 0x1617 +#define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618 +#define OBD_FAIL_LFSCK_INVALID_PFID 0x1619 +#define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a +#define OBD_FAIL_LFSCK_DELAY5 0x161b +#define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c +#define OBD_FAIL_LFSCK_NO_LINKEA 0x161d +#define OBD_FAIL_LFSCK_BAD_PARENT 0x161e +#define OBD_FAIL_LFSCK_DANGLING2 0x1620 +#define OBD_FAIL_LFSCK_DANGLING3 0x1621 +#define OBD_FAIL_LFSCK_MUL_REF 0x1622 +#define OBD_FAIL_LFSCK_BAD_TYPE 0x1623 +#define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624 +#define OBD_FAIL_LFSCK_LESS_NLINK 0x1626 +#define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628 +#define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629 +#define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a +#define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b +#define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c +#define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d +#define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e +#define OBD_FAIL_LFSCK_BAD_PFL_RANGE 0x162f +#define OBD_FAIL_LFSCK_NO_AGENTOBJ 0x1630 +#define OBD_FAIL_LFSCK_NO_AGENTENT 0x1631 +#define OBD_FAIL_LFSCK_NO_ENCFLAG 0x1632 + +#define OBD_FAIL_LFSCK_NOTIFY_NET 0x16f0 +#define OBD_FAIL_LFSCK_QUERY_NET 0x16f1 + +/* UPDATE */ +#define OBD_FAIL_OUT_UPDATE_NET 0x1700 +#define OBD_FAIL_OUT_UPDATE_NET_REP 0x1701 +#define OBD_FAIL_SPLIT_UPDATE_REC 0x1702 +#define OBD_FAIL_LARGE_STRIPE 0x1703 +#define OBD_FAIL_OUT_ENOSPC 0x1704 +#define OBD_FAIL_INVALIDATE_UPDATE 0x1705 +#define OBD_FAIL_OUT_UPDATE_DROP 0x1707 +#define OBD_FAIL_OUT_OBJECT_MISS 0x1708 + +/* MIGRATE */ +#define OBD_FAIL_MIGRATE_ENTRIES 0x1801 + +/* LMV */ +#define OBD_FAIL_UNKNOWN_LMV_STRIPE 0x1901 + +/* FLR */ +#define OBD_FAIL_FLR_GLIMPSE_IMMUTABLE 0x1A00 +#define OBD_FAIL_FLR_LV_DELAY 0x1A01 +#define OBD_FAIL_FLR_LV_INC 0x1A02 +#define OBD_FAIL_FLR_RANDOM_PICK_MIRROR 0x1A03 + +/* DT */ +#define OBD_FAIL_DT_DECLARE_ATTR_GET 0x2000 +#define OBD_FAIL_DT_ATTR_GET 0x2001 +#define OBD_FAIL_DT_DECLARE_ATTR_SET 0x2002 +#define OBD_FAIL_DT_ATTR_SET 0x2003 +#define OBD_FAIL_DT_DECLARE_XATTR_GET 0x2004 +#define OBD_FAIL_DT_XATTR_GET 0x2005 +#define OBD_FAIL_DT_DECLARE_XATTR_SET 0x2006 +#define OBD_FAIL_DT_XATTR_SET 0x2007 +#define OBD_FAIL_DT_DECLARE_XATTR_DEL 0x2008 +#define OBD_FAIL_DT_XATTR_DEL 0x2009 +#define OBD_FAIL_DT_XATTR_LIST 0x200a +#define OBD_FAIL_DT_DECLARE_CREATE 0x200b +#define OBD_FAIL_DT_CREATE 0x200c +#define OBD_FAIL_DT_DECLARE_DESTROY 0x200d +#define OBD_FAIL_DT_DESTROY 0x200e +#define OBD_FAIL_DT_INDEX_TRY 0x200f +#define OBD_FAIL_DT_DECLARE_REF_ADD 0x2010 +#define OBD_FAIL_DT_REF_ADD 0x2011 +#define OBD_FAIL_DT_DECLARE_REF_DEL 0x2012 +#define OBD_FAIL_DT_REF_DEL 0x2013 +#define OBD_FAIL_DT_DECLARE_INSERT 0x2014 +#define OBD_FAIL_DT_INSERT 0x2015 +#define OBD_FAIL_DT_DECLARE_DELETE 0x2016 +#define OBD_FAIL_DT_DELETE 0x2017 +#define OBD_FAIL_DT_LOOKUP 0x2018 +#define OBD_FAIL_DT_TXN_STOP 0x2019 + +#define OBD_FAIL_OSP_CHECK_INVALID_REC 0x2100 +#define OBD_FAIL_OSP_CHECK_ENOMEM 0x2101 +#define OBD_FAIL_OSP_FAKE_PRECREATE 0x2102 +#define OBD_FAIL_OSP_RPCS_SEM 0x2104 +#define OBD_FAIL_OSP_CANT_PROCESS_LLOG 0x2105 +#define OBD_FAIL_OSP_INVALID_LOGID 0x2106 +#define OBD_FAIL_OSP_CON_EVENT_DELAY 0x2107 +#define OBD_FAIL_OSP_PRECREATE_PAUSE 0x2108 +#define OBD_FAIL_OSP_GET_LAST_FID 0x2109 + +/* barrier */ +#define OBD_FAIL_MGS_BARRIER_READ_NET 0x2200 +#define OBD_FAIL_MGS_BARRIER_NOTIFY_NET 0x2201 + +#define OBD_FAIL_BARRIER_DELAY 0x2202 +#define OBD_FAIL_BARRIER_FAILURE 0x2203 + +#define OBD_FAIL_OSD_FAIL_AT_TRUNCATE 0x2301 + +/* LNet is allocated failure locations 0xe000 to 0xffff */ +/* Assign 
references to moved code to reduce code changes */ +#define OBD_FAIL_PRECHECK(id) (unlikely(CFS_FAIL_PRECHECK(id))) +#define OBD_FAIL_CHECK(id) CFS_FAIL_CHECK(id) +#define OBD_FAIL_CHECK_QUIET(id) CFS_FAIL_CHECK_QUIET(id) +#define OBD_FAIL_CHECK_VALUE(id, value) CFS_FAIL_CHECK_VALUE(id, value) +#define OBD_FAIL_CHECK_ORSET(id, value) CFS_FAIL_CHECK_ORSET(id, value) +#define OBD_FAIL_CHECK_RESET(id, value) CFS_FAIL_CHECK_RESET(id, value) +#define OBD_FAIL_RETURN(id, ret) CFS_FAIL_RETURN(id, ret) +#define OBD_FAIL_TIMEOUT(id, secs) CFS_FAIL_TIMEOUT(id, secs) +#define OBD_FAIL_TIMEOUT_MS(id, ms) CFS_FAIL_TIMEOUT_MS(id, ms) +#define OBD_FAIL_TIMEOUT_ORSET(id, value, secs) CFS_FAIL_TIMEOUT_ORSET(id, value, secs) +#define OBD_RACE(id) CFS_RACE(id) +#define OBD_FAIL_ONCE CFS_FAIL_ONCE +#define OBD_FAILED CFS_FAILED + +#define LUT_FAIL_CLASS(fail_id) (((fail_id) >> 8) << 16) +#define LUT_FAIL_MGT LUT_FAIL_CLASS(OBD_FAIL_MGS) +#define LUT_FAIL_MDT LUT_FAIL_CLASS(OBD_FAIL_MDS) +#define LUT_FAIL_OST LUT_FAIL_CLASS(OBD_FAIL_OST) + +extern atomic64_t libcfs_kmem; + +#ifdef CONFIG_PROC_FS +#define obd_memory_add(size) \ + lprocfs_counter_add(obd_memory, OBD_MEMORY_STAT, (long)(size)) +#define obd_memory_sub(size) \ + lprocfs_counter_sub(obd_memory, OBD_MEMORY_STAT, (long)(size)) +#define obd_memory_sum() \ + lprocfs_stats_collector(obd_memory, OBD_MEMORY_STAT, \ + LPROCFS_FIELDS_FLAGS_SUM) + +extern void obd_update_maxusage(void); +extern __u64 obd_memory_max(void); + +#else /* CONFIG_PROC_FS */ + +extern __u64 obd_alloc; + +extern __u64 obd_max_alloc; + +static inline void obd_memory_add(long size) +{ + obd_alloc += size; + if (obd_alloc > obd_max_alloc) + obd_max_alloc = obd_alloc; +} + +static inline void obd_memory_sub(long size) +{ + obd_alloc -= size; +} + +#define obd_memory_sum() (obd_alloc) + +#define obd_memory_max() (obd_max_alloc) + +#endif /* !CONFIG_PROC_FS */ + +#define OBD_DEBUG_MEMUSAGE (1) + +#if OBD_DEBUG_MEMUSAGE +#define OBD_ALLOC_POST(ptr, size, name) \ + obd_memory_add(size); \ + CDEBUG(D_MALLOC, name " '" #ptr "': %d at %p.\n", \ + (int)(size), ptr) + +#define OBD_FREE_PRE(ptr, size, name) \ + LASSERT(ptr); \ + obd_memory_sub(size); \ + CDEBUG(D_MALLOC, name " '" #ptr "': %d at %p.\n", \ + (int)(size), ptr); + +#else /* !OBD_DEBUG_MEMUSAGE */ + +#define OBD_ALLOC_POST(ptr, size, name) ((void)0) +#define OBD_FREE_PRE(ptr, size, name) ((void)0) + +#endif /* !OBD_DEBUG_MEMUSAGE */ + +#define __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, flags) \ +do { \ + if (cptab) \ + ptr = cfs_cpt_malloc((cptab), (cpt), (size), \ + (flags) | __GFP_ZERO | __GFP_NOWARN); \ + if (!(cptab) || unlikely(!(ptr))) /* retry without CPT if failure */ \ + ptr = kmalloc(size, (flags) | __GFP_ZERO); \ + if (likely((ptr) != NULL)) \ + OBD_ALLOC_POST((ptr), (size), "kmalloced"); \ +} while (0) + +#define OBD_ALLOC_GFP(ptr, size, gfp_mask) \ + __OBD_MALLOC_VERBOSE(ptr, NULL, 0, size, gfp_mask) + +#define OBD_ALLOC(ptr, size) OBD_ALLOC_GFP(ptr, size, GFP_NOFS) +#define OBD_ALLOC_WAIT(ptr, size) OBD_ALLOC_GFP(ptr, size, GFP_KERNEL) +#define OBD_ALLOC_PTR(ptr) OBD_ALLOC(ptr, sizeof(*(ptr))) +#define OBD_ALLOC_PTR_WAIT(ptr) OBD_ALLOC_WAIT(ptr, sizeof(*(ptr))) +#define OBD_ALLOC_PTR_ARRAY(ptr, n) OBD_ALLOC(ptr, (n) * sizeof(*(ptr))) +#define OBD_ALLOC_PTR_ARRAY_WAIT(ptr, n) \ + OBD_ALLOC_WAIT(ptr, (n) * sizeof(*(ptr))) + +#define OBD_CPT_ALLOC_GFP(ptr, cptab, cpt, size, gfp_mask) \ + __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, gfp_mask) + +#define OBD_CPT_ALLOC(ptr, cptab, cpt, size) \ + OBD_CPT_ALLOC_GFP(ptr, cptab, 
cpt, size, GFP_NOFS) + +#define OBD_CPT_ALLOC_PTR(ptr, cptab, cpt) \ + OBD_CPT_ALLOC(ptr, cptab, cpt, sizeof(*(ptr))) + +/* Direct use of __vmalloc() allows for protection flag specification + * (and particularly to not set __GFP_FS, which is likely to cause some + * deadlock situations in our code). + */ +#define __OBD_VMALLOC_VERBOSE(ptr, cptab, cpt, size) \ +do { \ + (ptr) = cptab == NULL ? \ + __ll_vmalloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO) : \ + cfs_cpt_vzalloc(cptab, cpt, size); \ + if (unlikely((ptr) == NULL)) { \ + CERROR("vmalloc of '" #ptr "' (%d bytes) failed\n", \ + (int)(size)); \ + CERROR("%llu total bytes allocated by Lustre, %lld by LNET\n",\ + obd_memory_sum(), libcfs_kmem_read());\ + } else { \ + OBD_ALLOC_POST(ptr, size, "vmalloced"); \ + } \ +} while(0) + +#define OBD_VMALLOC(ptr, size) \ + __OBD_VMALLOC_VERBOSE(ptr, NULL, 0, size) +#define OBD_CPT_VMALLOC(ptr, cptab, cpt, size) \ + __OBD_VMALLOC_VERBOSE(ptr, cptab, cpt, size) + +#define OBD_ALLOC_LARGE(ptr, size) \ +do { \ + /* LU-8196 - force large allocations to use vmalloc, not kmalloc */ \ + if ((size) > KMALLOC_MAX_SIZE) \ + ptr = NULL; \ + else \ + OBD_ALLOC_GFP(ptr, size, GFP_NOFS | __GFP_NOWARN); \ + if (ptr == NULL) \ + OBD_VMALLOC(ptr, size); \ +} while (0) + +#define OBD_ALLOC_PTR_ARRAY_LARGE(ptr, n) \ + OBD_ALLOC_LARGE(ptr, (n) * sizeof(*(ptr))) + +#define OBD_CPT_ALLOC_LARGE(ptr, cptab, cpt, size) \ +do { \ + OBD_CPT_ALLOC_GFP(ptr, cptab, cpt, size, GFP_NOFS | __GFP_NOWARN); \ + if (ptr == NULL) \ + OBD_CPT_VMALLOC(ptr, cptab, cpt, size); \ +} while (0) + +#ifdef CONFIG_DEBUG_SLAB +#define POISON(ptr, c, s) do {} while (0) +#define POISON_PTR(ptr) ((void)0) +#else +#define POISON(ptr, c, s) memset(ptr, c, s) +#define POISON_PTR(ptr) (ptr) = (void *)0xdeadbeef +#endif + +#ifdef POISON_BULK +#define POISON_PAGE(page, val) do { memset(kmap(page), val, PAGE_SIZE); \ + kunmap(page); } while (0) +#else +#define POISON_PAGE(page, val) do { } while (0) +#endif + +#define OBD_FREE(ptr, size) \ +do { \ + OBD_FREE_PRE(ptr, size, "kfreed"); \ + POISON(ptr, 0x5a, size); \ + kfree(ptr); \ + POISON_PTR(ptr); \ +} while (0) + +#define OBD_FREE_LARGE(ptr, size) \ +do { \ + if (is_vmalloc_addr(ptr)) { \ + OBD_FREE_PRE(ptr, size, "vfreed"); \ + POISON(ptr, 0x5a, size); \ + libcfs_vfree_atomic(ptr); \ + POISON_PTR(ptr); \ + } else { \ + OBD_FREE(ptr, size); \ + } \ +} while (0) + +#define OBD_FREE_PTR_ARRAY_LARGE(ptr, n) \ + OBD_FREE_LARGE(ptr, (n) * sizeof(*(ptr))) + +/* we memset() the slab object to 0 when allocation succeeds, so DO NOT + * HAVE A CTOR THAT DOES ANYTHING. its work will be cleared here. we'd + * love to assert on that, but slab.c keeps kmem_cache_s all to itself. */ +#define OBD_SLAB_FREE_RTN0(ptr, slab) \ +({ \ + kmem_cache_free((slab), (ptr)); \ + (ptr) = NULL; \ + 0; \ +}) + +#define __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, cptab, cpt, size, type) \ +do { \ + LASSERT(ergo((type) != GFP_ATOMIC, !in_interrupt())); \ + (ptr) = (cptab) == NULL ? 
\ + kmem_cache_zalloc(slab, (type)) : \ + cfs_mem_cache_cpt_alloc(slab, cptab, cpt, (type) | __GFP_ZERO); \ + if (likely((ptr))) \ + OBD_ALLOC_POST(ptr, size, "slab-alloced"); \ +} while(0) + +#define OBD_SLAB_ALLOC_GFP(ptr, slab, size, flags) \ + __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, NULL, 0, size, flags) +#define OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, size, flags) \ + __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, cptab, cpt, size, flags) + +#define OBD_FREE_PTR(ptr) OBD_FREE(ptr, sizeof(*(ptr))) +#define OBD_FREE_PTR_ARRAY(ptr, n) OBD_FREE(ptr, (n) * sizeof(*(ptr))) + +#define OBD_SLAB_FREE(ptr, slab, size) \ +do { \ + OBD_FREE_PRE(ptr, size, "slab-freed"); \ + POISON(ptr, 0x5a, size); \ + kmem_cache_free(slab, ptr); \ + POISON_PTR(ptr); \ +} while(0) + +#define OBD_SLAB_ALLOC(ptr, slab, size) \ + OBD_SLAB_ALLOC_GFP(ptr, slab, size, GFP_NOFS) + +#define OBD_SLAB_CPT_ALLOC(ptr, slab, cptab, cpt, size) \ + OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, size, GFP_NOFS) + +#define OBD_SLAB_ALLOC_PTR(ptr, slab) \ + OBD_SLAB_ALLOC(ptr, slab, sizeof(*(ptr))) + +#define OBD_SLAB_CPT_ALLOC_PTR(ptr, slab, cptab, cpt) \ + OBD_SLAB_CPT_ALLOC(ptr, slab, cptab, cpt, sizeof(*(ptr))) + +#define OBD_SLAB_ALLOC_PTR_GFP(ptr, slab, flags) \ + OBD_SLAB_ALLOC_GFP(ptr, slab, sizeof(*(ptr)), flags) + +#define OBD_SLAB_CPT_ALLOC_PTR_GFP(ptr, slab, cptab, cpt, flags) \ + OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, sizeof(*(ptr)), flags) + +#define OBD_SLAB_FREE_PTR(ptr, slab) \ + OBD_SLAB_FREE((ptr), (slab), sizeof(*(ptr))) + +#define KEY_IS(str) \ + (keylen >= (sizeof(str)-1) && memcmp(key, str, (sizeof(str)-1)) == 0) + +#ifdef HAVE_SERVER_SUPPORT +/* LUSTRE_LMA_FL_MASKS defines which flags will be stored in LMA */ + +static inline int lma_to_lustre_flags(__u32 lma_flags) +{ + return (((lma_flags & LMAI_ORPHAN) ? LUSTRE_ORPHAN_FL : 0) | + ((lma_flags & LMAI_ENCRYPT) ? LUSTRE_ENCRYPT_FL : 0)); +} + +static inline int lustre_to_lma_flags(__u32 la_flags) +{ + return (((la_flags & LUSTRE_ORPHAN_FL) ? LMAI_ORPHAN : 0) | + ((la_flags & LUSTRE_ENCRYPT_FL) ? LMAI_ENCRYPT : 0)); +} +#endif /* HAVE_SERVER_SUPPORT */ + +/* Convert wire LUSTRE_*_FL to corresponding client local VFS S_* values + * for the client inode i_flags. The LUSTRE_*_FL are the Lustre wire + * protocol equivalents of LDISKFS_*_FL values stored on disk, while + * the S_* flags are kernel-internal values that change between kernel + * versions. These flags are set/cleared via FSFILT_IOC_{GET,SET}_FLAGS. + * See b=16526 for a full history. + */ +static inline int ll_ext_to_inode_flags(int ext_flags) +{ + return (((ext_flags & LUSTRE_SYNC_FL) ? S_SYNC : 0) | + ((ext_flags & LUSTRE_NOATIME_FL) ? S_NOATIME : 0) | + ((ext_flags & LUSTRE_APPEND_FL) ? S_APPEND : 0) | + ((ext_flags & LUSTRE_DIRSYNC_FL) ? S_DIRSYNC : 0) | +#if defined(S_ENCRYPTED) + ((ext_flags & LUSTRE_ENCRYPT_FL) ? S_ENCRYPTED : 0) | +#endif + ((ext_flags & LUSTRE_IMMUTABLE_FL) ? S_IMMUTABLE : 0)); +} + +static inline int ll_inode_to_ext_flags(int inode_flags) +{ + return (((inode_flags & S_SYNC) ? LUSTRE_SYNC_FL : 0) | + ((inode_flags & S_NOATIME) ? LUSTRE_NOATIME_FL : 0) | + ((inode_flags & S_APPEND) ? LUSTRE_APPEND_FL : 0) | + ((inode_flags & S_DIRSYNC) ? LUSTRE_DIRSYNC_FL : 0) | +#if defined(S_ENCRYPTED) + ((inode_flags & S_ENCRYPTED) ? LUSTRE_ENCRYPT_FL : 0) | +#endif + ((inode_flags & S_IMMUTABLE) ? 
LUSTRE_IMMUTABLE_FL : 0)); +} + +struct obd_heat_instance { + __u64 ohi_heat; + __u64 ohi_time_second; + __u64 ohi_count; +}; + +/* Define a fixed 4096-byte encryption unit size */ +#define LUSTRE_ENCRYPTION_BLOCKBITS 12 +#define LUSTRE_ENCRYPTION_UNIT_SIZE ((size_t)1 << LUSTRE_ENCRYPTION_BLOCKBITS) +#define LUSTRE_ENCRYPTION_MASK (~(LUSTRE_ENCRYPTION_UNIT_SIZE - 1)) + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/obd_target.h b/drivers/staging/lustrefsx/lustre/include/obd_target.h new file mode 100644 index 0000000000000..60337ca659ba2 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/obd_target.h @@ -0,0 +1,73 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef __OBD_TARGET_H +#define __OBD_TARGET_H +#include + +/* server-side individual type definitions */ + +#define OBT_MAGIC 0xBDDECEAE +/* hold common fields for "target" device */ +struct obd_device_target { + __u32 obt_magic; + __u32 obt_instance; + struct lu_target *obt_lut; + __u64 obt_mount_count; + struct obd_job_stats obt_jobstats; + struct nm_config_file *obt_nodemap_config_file; +}; + +#define OBJ_SUBDIR_COUNT 32 /* set to zero for no subdirs */ + +struct filter_obd { + /* NB this field MUST be first */ + struct obd_device_target fo_obt; +}; + +struct echo_obd { + struct obd_device_target eo_obt; + struct obdo eo_oa; + spinlock_t eo_lock; + u64 eo_lastino; + struct lustre_handle eo_nl_lock; + atomic_t eo_prep; +}; + +struct ost_obd { + struct ptlrpc_service *ost_service; + struct ptlrpc_service *ost_create_service; + struct ptlrpc_service *ost_io_service; + struct ptlrpc_service *ost_seq_service; + struct ptlrpc_service *ost_out_service; + struct mutex ost_health_mutex; +}; + +#endif /* __OBD_TARGET_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/obj_update.h b/drivers/staging/lustrefsx/lustre/include/obj_update.h new file mode 100644 index 0000000000000..8c88de86005ea --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/obj_update.h @@ -0,0 +1,115 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + * + * Selection of object_update and object_update_param handling functions + */ + +#ifndef _OBJ_UPDATE_H_ +#define _OBJ_UPDATE_H_ + +#include + +static inline size_t +object_update_param_size(const struct object_update_param *param) +{ + return cfs_size_round(sizeof(*param) + param->oup_len); +} + +static inline size_t +object_update_params_size(const struct object_update *update) +{ + const struct object_update_param *param; + size_t total_size = 0; + unsigned int i; + + param = &update->ou_params[0]; + for (i = 0; i < update->ou_params_count; i++) { + size_t size = object_update_param_size(param); + + param = (struct object_update_param *)((char *)param + size); + total_size += size; + } + + return total_size; +} + +static inline size_t +object_update_size(const struct object_update *update) +{ + return offsetof(struct object_update, ou_params[0]) + + object_update_params_size(update); +} + +static inline struct object_update * +object_update_request_get(const struct object_update_request *our, + unsigned int index, size_t *size) +{ + void *ptr; + unsigned int i; + + if (index >= our->ourq_count) + return NULL; + + ptr = (void *)&our->ourq_updates[0]; + for (i = 0; i < index; i++) + ptr += object_update_size(ptr); + + if (size != NULL) + *size = object_update_size(ptr); + + return ptr; +} + + + +static inline struct object_update_result * +object_update_result_get(const struct object_update_reply *reply, + unsigned int index, size_t *size) +{ + __u16 count = reply->ourp_count; + unsigned int i; + void *ptr; + + if (index >= count) + return NULL; + + ptr = (char *)reply + + cfs_size_round(offsetof(struct object_update_reply, + ourp_lens[count])); + for (i = 0; i < index; i++) { + if (reply->ourp_lens[i] == 0) + return NULL; + + ptr += cfs_size_round(reply->ourp_lens[i]); + } + + if (size != NULL) + *size = reply->ourp_lens[index]; + + return ptr; +} +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/range_lock.h b/drivers/staging/lustrefsx/lustre/include/range_lock.h new file mode 100644 index 0000000000000..674b27d52be75 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/range_lock.h @@ -0,0 +1,77 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * A range lock allows multiple threads to write to a single shared
+ * file concurrently, provided each thread writes to a non-overlapping
+ * portion of the file.
+ *
+ * Refer to the range lock implementation proposed upstream by
+ * Jan Kara : https://lkml.org/lkml/2013/1/31/480
+ *
+ * This file could later be replaced by the upstream kernel version.
+ */
+/*
+ * Author: Prakash Surya
+ * Author: Bobi Jam
+ */
+#ifndef _RANGE_LOCK_H
+#define _RANGE_LOCK_H
+
+#include
+
+#define RL_FMT "[%llu, %llu]"
+#define RL_PARA(range)					\
+	(unsigned long long)(range)->rl_start,		\
+	(unsigned long long)(range)->rl_end
+
+struct range_lock {
+	__u64				rl_start,
+					rl_end,
+					rl_subtree_last;
+	struct rb_node			rl_rb;
+	/**
+	 * The task that is enqueuing this lock.
+	 */
+	struct task_struct		*rl_task;
+	/**
+	 * Number of ranges which are blocking acquisition of the lock.
+	 */
+	unsigned int			rl_blocking_ranges;
+	/**
+	 * Sequence number of the range lock. This number is used to
+	 * determine the order in which locks were queued. One lock can only
+	 * block another if it has a higher rl_sequence.
+	 */
+	__u64				rl_sequence;
+};
+
+struct range_lock_tree {
+	struct interval_tree_root	rlt_root;
+	spinlock_t			rlt_lock;
+	__u64				rlt_sequence;
+};
+
+void range_lock_tree_init(struct range_lock_tree *tree);
+void range_lock_init(struct range_lock *lock, __u64 start, __u64 end);
+int range_lock(struct range_lock_tree *tree, struct range_lock *lock);
+void range_unlock(struct range_lock_tree *tree, struct range_lock *lock);
+#endif
diff --git a/drivers/staging/lustrefsx/lustre/include/seq_range.h b/drivers/staging/lustrefsx/lustre/include/seq_range.h
new file mode 100644
index 0000000000000..374d1932f0bdf
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/include/seq_range.h
@@ -0,0 +1,192 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2014, Intel Corporation.
+ *
+ * Copyright 2015 Cray Inc, all rights reserved.
+ * Author: Ben Evans.
+ *
+ * Define lu_seq_range associated functions
+ */
+
+#ifndef _SEQ_RANGE_H_
+#define _SEQ_RANGE_H_
+
+#include
+
+/**
+ * computes the sequence range type \a range
+ */
+
+static inline unsigned fld_range_type(const struct lu_seq_range *range)
+{
+	return range->lsr_flags & LU_SEQ_RANGE_MASK;
+}
+
+/**
+ * Is this sequence range an OST?
\a range + */ + +static inline bool fld_range_is_ost(const struct lu_seq_range *range) +{ + return fld_range_type(range) == LU_SEQ_RANGE_OST; +} + +/** + * Is this sequence range an MDT? \a range + */ + +static inline bool fld_range_is_mdt(const struct lu_seq_range *range) +{ + return fld_range_type(range) == LU_SEQ_RANGE_MDT; +} + +/** + * ANY range is only used when the fld client sends a fld query request, + * but it does not know whether the seq is an MDT or OST, so it will send the + * request with ANY type, which means any seq type from the lookup can be + * expected. /a range + */ +static inline unsigned fld_range_is_any(const struct lu_seq_range *range) +{ + return fld_range_type(range) == LU_SEQ_RANGE_ANY; +} + +/** + * Apply flags to range \a range \a flags + */ + +static inline void fld_range_set_type(struct lu_seq_range *range, + unsigned flags) +{ + range->lsr_flags |= flags; +} + +/** + * Add MDT to range type \a range + */ + +static inline void fld_range_set_mdt(struct lu_seq_range *range) +{ + fld_range_set_type(range, LU_SEQ_RANGE_MDT); +} + +/** + * Add OST to range type \a range + */ + +static inline void fld_range_set_ost(struct lu_seq_range *range) +{ + fld_range_set_type(range, LU_SEQ_RANGE_OST); +} + +/** + * Add ANY to range type \a range + */ + +static inline void fld_range_set_any(struct lu_seq_range *range) +{ + fld_range_set_type(range, LU_SEQ_RANGE_ANY); +} + +/** + * computes width of given sequence range \a range + */ + +static inline __u64 lu_seq_range_space(const struct lu_seq_range *range) +{ + return range->lsr_end - range->lsr_start; +} + +/** + * initialize range to zero \a range + */ + +static inline void lu_seq_range_init(struct lu_seq_range *range) +{ + memset(range, 0, sizeof(*range)); +} + +/** + * check if given seq id \a s is within given range \a range + */ + +static inline bool lu_seq_range_within(const struct lu_seq_range *range, + __u64 seq) +{ + return seq >= range->lsr_start && seq < range->lsr_end; +} + +/** + * Is the range sane? Is the end after the beginning? \a range + */ + +static inline bool lu_seq_range_is_sane(const struct lu_seq_range *range) +{ + return range->lsr_end >= range->lsr_start; +} + +/** + * Is the range 0? \a range + */ + +static inline bool lu_seq_range_is_zero(const struct lu_seq_range *range) +{ + return range->lsr_start == 0 && range->lsr_end == 0; +} + +/** + * Is the range out of space? \a range + */ + +static inline bool lu_seq_range_is_exhausted(const struct lu_seq_range *range) +{ + return lu_seq_range_space(range) == 0; +} + +/** + * return 0 if two ranges have the same location, nonzero if they are + * different \a r1 \a r2 + */ + +static inline int lu_seq_range_compare_loc(const struct lu_seq_range *r1, + const struct lu_seq_range *r2) +{ + return r1->lsr_index != r2->lsr_index || + r1->lsr_flags != r2->lsr_flags; +} + +/** + * printf string and argument list for sequence range + */ +#define DRANGE "[%#16.16llx-%#16.16llx]:%x:%s" + +#define PRANGE(range) \ + (unsigned long long)(range)->lsr_start, \ + (unsigned long long)(range)->lsr_end, \ + (range)->lsr_index, \ + fld_range_is_mdt(range) ? 
"mdt" : "ost" + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lgss.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lgss.h new file mode 100644 index 0000000000000..52c67fe981f60 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lgss.h @@ -0,0 +1,58 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2022, Whamcloud. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef _LGSS_H +#define _LGSS_H + +#include + +/* + * sparse kernel source annotations + */ +#ifndef __user +#define __user +#endif + +struct lgssd_ioctl_param { + /* in */ + __u32 version; + __u32 secid; + char __user *uuid; + __u32 lustre_svc; + __kernel_uid_t uid; + __kernel_gid_t gid; + __u64 send_token_size; + char __user *send_token; + __u64 reply_buf_size; + char __user *reply_buf; + /* out */ + __u64 status; + __u64 reply_length; +}; + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_access_log.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_access_log.h new file mode 100644 index 0000000000000..4972976725ced --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_access_log.h @@ -0,0 +1,85 @@ +#ifndef _LUSTRE_ACCESS_LOG_H +# define _LUSTRE_ACCESS_LOG_H + +#include +#include +/* + * This is due to us being out of kernel and the way the OpenSFS branch + * handles CFLAGS. + */ +#ifdef __KERNEL__ +# include +#else +# include +#endif + +enum ofd_access_flags { + OFD_ACCESS_READ = 0x1, + OFD_ACCESS_WRITE = 0x2, +}; + +struct ofd_access_entry_v1 { + struct lu_fid oae_parent_fid; /* 16 */ + __u64 oae_begin; /* 24 */ + __u64 oae_end; /* 32 */ + __u64 oae_time; /* 40 */ + __u32 oae_size; /* 44 */ + __u32 oae_segment_count; /* 48 */ + __u32 oae_flags; /* 52 enum ofd_access_flags */ + __u32 oae_reserved1; /* 56 */ + __u32 oae_reserved2; /* 60 */ + __u32 oae_reserved3; /* 64 */ +}; + +/* The name of the subdirectory of devtmpfs (/dev) containing the + * control and access log char devices. */ +#define LUSTRE_ACCESS_LOG_DIR_NAME "lustre-access-log" + +enum { + LUSTRE_ACCESS_LOG_VERSION_1 = 0x00010000, + LUSTRE_ACCESS_LOG_TYPE_OFD = 0x1, + LUSTRE_ACCESS_LOG_NAME_SIZE = 128, +}; + +struct lustre_access_log_info_v1 { + __u32 lali_version; /* LUSTRE_ACCESS_LOG_VERSION_1 */ + __u32 lali_type; /* LUSTRE_ACCESS_LOG_TYPE_OFD */ + char lali_name[LUSTRE_ACCESS_LOG_NAME_SIZE]; /* obd_name */ + __u32 lali_log_size; + __u32 lali_entry_size; + /* Underscore prefixed members are intended for test and debug + * purposes only. 
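+	 * As a rough illustration only (derived from the documented fields
+	 * above, not a stable ABI): the log behaves as a ring buffer that
+	 * can hold about lali_log_size / lali_entry_size entries at a time.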
 */
+	__u32 _lali_head;
+	__u32 _lali_tail;
+	__u32 _lali_entry_space;
+	__u32 _lali_entry_count;
+	__u32 _lali_drop_count;
+	__u32 _lali_is_closed;
+};
+
+enum {
+	/* /dev/lustre-access-log/control ioctl: return lustre access log
+	 * interface version. */
+	LUSTRE_ACCESS_LOG_IOCTL_VERSION = _IO('O', 0x81),
+
+	/* /dev/lustre-access-log/control ioctl: return device major
+	 * used for access log devices. (The major is dynamically
+	 * allocated during ofd module initialization.) */
+	LUSTRE_ACCESS_LOG_IOCTL_MAJOR = _IO('O', 0x82),
+
+	/* /dev/lustre-access-log/control ioctl: get global control event
+	 * count and store it into file private_data. */
+	LUSTRE_ACCESS_LOG_IOCTL_PRESCAN = _IO('O', 0x83),
+
+	/* /dev/lustre-access-log/OBDNAME ioctl: populate struct
+	 * lustre_access_log_info_v1 for the current device. */
+	LUSTRE_ACCESS_LOG_IOCTL_INFO = _IOR('O', 0x84, struct lustre_access_log_info_v1),
+
+	/* /dev/lustre-access-log/OBDNAME ioctl: only entries whose
+	 * PFID MDT index is equal to arg will be added to the log. A
+	 * value of 0xffffffff ((__u32)-1) will disable filtering,
+	 * which is the default. Added in V2. */
+	LUSTRE_ACCESS_LOG_IOCTL_FILTER = _IOW('O', 0x85, __u32),
+};
+
+#endif /* _LUSTRE_ACCESS_LOG_H */
diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_barrier_user.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_barrier_user.h
new file mode 100644
index 0000000000000..38084241d8998
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_barrier_user.h
@@ -0,0 +1,74 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License version 2 for more details. A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2017, Intel Corporation.
+ *
+ * lustre/include/lustre/lustre_barrier_user.h
+ *
+ * Lustre write barrier (on MDT) userspace interfaces.
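+ *
+ * For example (an illustrative sketch of the structures defined below,
+ * not a normative sequence): freezing a filesystem amounts to filling in
+ * a struct barrier_ctl with bc_version = BARRIER_VERSION_V1,
+ * bc_cmd = BC_FREEZE and bc_timeout = BARRIER_TIMEOUT_DEFAULT, and
+ * handing it to the server, e.g. through the lctl barrier_freeze command.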
+ * + * Author: Fan, Yong + */ +#ifndef _LUSTRE_BARRIER_USER_H +# define _LUSTRE_BARRIER_USER_H + +#include +#include + +#define BARRIER_VERSION_V1 1 +#define BARRIER_TIMEOUT_DEFAULT 30 + +enum barrier_commands { + BC_FREEZE = 1, + BC_THAW = 2, + BC_STAT = 3, + BC_RESCAN = 4, +}; + +enum barrier_status { + BS_INIT = 0, + BS_FREEZING_P1 = 1, + BS_FREEZING_P2 = 2, + BS_FROZEN = 3, + BS_THAWING = 4, + BS_THAWED = 5, + BS_FAILED = 6, + BS_EXPIRED = 7, + BS_RESCAN = 8, +}; + +struct barrier_ctl { + __u32 bc_version; + __u32 bc_cmd; + union { + __s32 bc_timeout; + __u32 bc_total; + }; + union { + __u32 bc_status; + __u32 bc_absence; + }; + char bc_name[12]; + __u32 bc_padding; +}; + +#endif /* _LUSTRE_BARRIER_USER_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_cfg.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_cfg.h new file mode 100644 index 0000000000000..97bd28f188380 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_cfg.h @@ -0,0 +1,346 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef _UAPI_LUSTRE_CFG_H +#define _UAPI_LUSTRE_CFG_H + +#include +#include +#include +#include + +/** \defgroup cfg cfg + * + * @{ + */ + +/* + * 1cf6 + * lcfG + */ +#define LUSTRE_CFG_VERSION 0x1cf60001 +#define LUSTRE_CFG_MAX_BUFCOUNT 8 + +#define LCFG_HDR_SIZE(count) \ + __ALIGN_KERNEL(offsetof(struct lustre_cfg, lcfg_buflens[(count)]), 8) + +/** If the LCFG_REQUIRED bit is set in a configuration command, + * then the client is required to understand this parameter + * in order to mount the filesystem. If it does not understand + * a REQUIRED command the client mount will fail. 
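+ *
+ * For example, a record whose command is (LCFG_SETUP | LCFG_REQUIRED)
+ * means "perform setup, and fail the mount if this record cannot be
+ * understood", while an unknown record without the bit set can simply
+ * be skipped.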
+ */
+#define LCFG_REQUIRED	0x0001000
+
+enum lcfg_command_type {
+	LCFG_ATTACH		  = 0x00cf001, /**< create a new obd instance */
+	LCFG_DETACH		  = 0x00cf002, /**< destroy obd instance */
+	LCFG_SETUP		  = 0x00cf003, /**< call type-specific setup */
+	LCFG_CLEANUP		  = 0x00cf004, /**< call type-specific cleanup
+						*/
+	LCFG_ADD_UUID		  = 0x00cf005, /**< add a nid to a niduuid */
+	LCFG_DEL_UUID		  = 0x00cf006, /**< remove a nid from
+						* a niduuid
+						*/
+	LCFG_MOUNTOPT		  = 0x00cf007, /**< create a profile
+						* (mdc, osc)
+						*/
+	LCFG_DEL_MOUNTOPT	  = 0x00cf008, /**< destroy a profile */
+	LCFG_SET_TIMEOUT	  = 0x00cf009, /**< set obd_timeout */
+	LCFG_SET_UPCALL		  = 0x00cf00a, /**< deprecated */
+	LCFG_ADD_CONN		  = 0x00cf00b, /**< add a failover niduuid to
+						* an obd
+						*/
+	LCFG_DEL_CONN		  = 0x00cf00c, /**< remove a failover niduuid */
+	LCFG_LOV_ADD_OBD	  = 0x00cf00d, /**< add an osc to a lov */
+	LCFG_LOV_DEL_OBD	  = 0x00cf00e, /**< remove an osc from a lov */
+	LCFG_PARAM		  = 0x00cf00f, /**< set a proc parameter */
+	LCFG_MARKER		  = 0x00cf010, /**< metadata about next
+						* cfg rec
+						*/
+	LCFG_LOG_START		  = 0x00ce011, /**< mgc only, process a
+						* cfg log
+						*/
+	LCFG_LOG_END		  = 0x00ce012, /**< stop processing updates */
+	LCFG_LOV_ADD_INA	  = 0x00ce013, /**< like LOV_ADD_OBD,
+						* inactive
+						*/
+	LCFG_ADD_MDC		  = 0x00cf014, /**< add an mdc to a lmv */
+	LCFG_DEL_MDC		  = 0x00cf015, /**< remove an mdc from a lmv */
+	LCFG_SPTLRPC_CONF	  = 0x00ce016, /**< security */
+	LCFG_POOL_NEW		  = 0x00ce020, /**< create an ost pool name */
+	LCFG_POOL_ADD		  = 0x00ce021, /**< add an ost to a pool */
+	LCFG_POOL_REM		  = 0x00ce022, /**< remove an ost from a pool */
+	LCFG_POOL_DEL		  = 0x00ce023, /**< destroy an ost pool name */
+	LCFG_SET_LDLM_TIMEOUT	  = 0x00ce030, /**< set ldlm_timeout */
+	LCFG_PRE_CLEANUP	  = 0x00cf031, /**< call type-specific
+						* pre-cleanup
+						*/
+	LCFG_SET_PARAM		  = 0x00ce032, /**< use set_param syntax to set
+						* a proc parameter
+						*/
+	LCFG_NODEMAP_ADD	  = 0x00ce040, /**< create a cluster */
+	LCFG_NODEMAP_DEL	  = 0x00ce041, /**< destroy a cluster */
+	LCFG_NODEMAP_ADD_RANGE	  = 0x00ce042, /**< add a nid range */
+	LCFG_NODEMAP_DEL_RANGE	  = 0x00ce043, /**< delete a nid range */
+	LCFG_NODEMAP_ADD_UIDMAP	  = 0x00ce044, /**< add a uidmap */
+	LCFG_NODEMAP_DEL_UIDMAP	  = 0x00ce045, /**< delete a uidmap */
+	LCFG_NODEMAP_ADD_GIDMAP	  = 0x00ce046, /**< add a gidmap */
+	LCFG_NODEMAP_DEL_GIDMAP	  = 0x00ce047, /**< delete a gidmap */
+	LCFG_NODEMAP_ACTIVATE	  = 0x00ce048, /**< activate cluster
+						* id mapping
+						*/
+	LCFG_NODEMAP_ADMIN	  = 0x00ce049, /**< allow cluster to use id 0 */
+	LCFG_NODEMAP_ADD_PROJIDMAP = 0x00ce04a, /**< add a projidmap */
+	LCFG_NODEMAP_DEL_PROJIDMAP = 0x00ce04b, /**< delete a projidmap */
+	LCFG_NODEMAP_TRUSTED	  = 0x00ce050, /**< trust a cluster's ids */
+	LCFG_NODEMAP_SQUASH_UID	  = 0x00ce051, /**< default map uid */
+	LCFG_NODEMAP_SQUASH_GID	  = 0x00ce052, /**< default map gid */
+	LCFG_NODEMAP_ADD_SHKEY	  = 0x00ce053, /**< add shared key to cluster */
+	LCFG_NODEMAP_DEL_SHKEY	  = 0x00ce054, /**< delete shared key from
+						* cluster
+						*/
+	LCFG_NODEMAP_TEST_NID	  = 0x00ce055, /**< test for nodemap
+						* membership
+						*/
+	LCFG_NODEMAP_TEST_ID	  = 0x00ce056, /**< test uid/gid mapping */
+	LCFG_NODEMAP_SET_FILESET  = 0x00ce057, /**< set fileset */
+	LCFG_NODEMAP_DENY_UNKNOWN = 0x00ce058, /**< deny squashed nodemap
+						* users
+						*/
+	LCFG_NODEMAP_MAP_MODE	  = 0x00ce059, /**< set the mapping mode */
+	LCFG_NODEMAP_AUDIT_MODE	  = 0x00ce05a, /**< set the audit mode */
+	LCFG_NODEMAP_SET_SEPOL	  = 0x00ce05b, /**< set SELinux policy */
+	LCFG_NODEMAP_FORBID_ENCRYPT =
0x00ce05c, /**< forbid encryption */ + LCFG_NODEMAP_SQUASH_PROJID = 0x00ce05d, /**< default map projid */ +}; + +struct lustre_cfg_bufs { + void *lcfg_buf[LUSTRE_CFG_MAX_BUFCOUNT]; + __u32 lcfg_buflen[LUSTRE_CFG_MAX_BUFCOUNT]; + __u32 lcfg_bufcount; +}; + +struct lustre_cfg { + __u32 lcfg_version; + __u32 lcfg_command; + + __u32 lcfg_num; + __u32 lcfg_flags; + __u64 lcfg_nid; + __u32 lcfg_nal; /* not used any more */ + + __u32 lcfg_bufcount; + __u32 lcfg_buflens[0]; +}; + +struct lcfg_type_data { + __u32 ltd_type; + char *ltd_name; + char *ltd_bufs[4]; +}; + +static struct lcfg_type_data lcfg_data_table[] = { + { LCFG_ATTACH, "attach", { "type", "UUID", "3", "4" } }, + { LCFG_DETACH, "detach", { "1", "2", "3", "4" } }, + { LCFG_SETUP, "setup", { "UUID", "node", "options", "failout" } }, + { LCFG_CLEANUP, "cleanup", { "1", "2", "3", "4" } }, + { LCFG_ADD_UUID, "add_uuid", { "node", "2", "3", "4" } }, + { LCFG_DEL_UUID, "del_uuid", { "1", "2", "3", "4" } }, + { LCFG_MOUNTOPT, "new_profile", { "name", "lov", "lmv", "4" } }, + { LCFG_DEL_MOUNTOPT, "del_mountopt", { "1", "2", "3", "4" } }, + { LCFG_SET_TIMEOUT, "set_timeout", { "parameter", "2", "3", "4" } }, + { LCFG_SET_UPCALL, "set_upcall", { "1", "2", "3", "4" } }, + { LCFG_ADD_CONN, "add_conn", { "node", "2", "3", "4" } }, + { LCFG_DEL_CONN, "del_conn", { "1", "2", "3", "4" } }, + { LCFG_LOV_ADD_OBD, "add_osc", { "ost", "index", "gen", "UUID" } }, + { LCFG_LOV_DEL_OBD, "del_osc", { "1", "2", "3", "4" } }, + { LCFG_PARAM, "conf_param", { "parameter", "value", "3", "4" } }, + { LCFG_MARKER, "marker", { "1", "2", "3", "4" } }, + { LCFG_LOG_START, "log_start", { "1", "2", "3", "4" } }, + { LCFG_LOG_END, "log_end", { "1", "2", "3", "4" } }, + { LCFG_LOV_ADD_INA, "add_osc_inactive", { "1", "2", "3", "4" } }, + { LCFG_ADD_MDC, "add_mdc", { "mdt", "index", "gen", "UUID" } }, + { LCFG_DEL_MDC, "del_mdc", { "1", "2", "3", "4" } }, + { LCFG_SPTLRPC_CONF, "security", { "parameter", "2", "3", "4" } }, + { LCFG_POOL_NEW, "new_pool", { "fsname", "pool", "3", "4" } }, + { LCFG_POOL_ADD, "add_pool", { "fsname", "pool", "ost", "4" } }, + { LCFG_POOL_REM, "remove_pool", { "fsname", "pool", "ost", "4" } }, + { LCFG_POOL_DEL, "del_pool", { "fsname", "pool", "3", "4" } }, + { LCFG_SET_LDLM_TIMEOUT, "set_ldlm_timeout", + { "parameter", "2", "3", "4" } }, + { LCFG_SET_PARAM, "set_param", { "parameter", "value", "3", "4" } }, + { 0, NULL, { NULL, NULL, NULL, NULL } } +}; + +static inline struct lcfg_type_data *lcfg_cmd2data(__u32 cmd) +{ + int i = 0; + + while (lcfg_data_table[i].ltd_type != 0) { + if (lcfg_data_table[i].ltd_type == cmd) + return &lcfg_data_table[i]; + i++; + } + return NULL; +} + +enum cfg_record_type { + PORTALS_CFG_TYPE = 1, + LUSTRE_CFG_TYPE = 123, +}; + +#define LUSTRE_CFG_BUFLEN(lcfg, idx) \ + ((lcfg)->lcfg_bufcount <= (idx) ? 0 : (lcfg)->lcfg_buflens[(idx)]) + +static inline void lustre_cfg_bufs_set(struct lustre_cfg_bufs *bufs, + __u32 index, void *buf, __u32 buflen) +{ + if (index >= LUSTRE_CFG_MAX_BUFCOUNT) + return; + + if (!bufs) + return; + + if (bufs->lcfg_bufcount <= index) + bufs->lcfg_bufcount = index + 1; + + bufs->lcfg_buf[index] = buf; + bufs->lcfg_buflen[index] = buflen; +} + +static inline void lustre_cfg_bufs_set_string(struct lustre_cfg_bufs *bufs, + __u32 index, char *str) +{ + lustre_cfg_bufs_set(bufs, index, str, str ? 
strlen(str) + 1 : 0); +} + +static inline void lustre_cfg_bufs_reset(struct lustre_cfg_bufs *bufs, + char *name) +{ + memset((bufs), 0, sizeof(*bufs)); + if (name) + lustre_cfg_bufs_set_string(bufs, 0, name); +} + +static inline void *lustre_cfg_buf(struct lustre_cfg *lcfg, __u32 index) +{ + __u32 i; + __kernel_size_t offset; + __u32 bufcount; + + if (!lcfg) + return NULL; + + bufcount = lcfg->lcfg_bufcount; + if (index >= bufcount) + return NULL; + + offset = LCFG_HDR_SIZE(lcfg->lcfg_bufcount); + for (i = 0; i < index; i++) + offset += __ALIGN_KERNEL(lcfg->lcfg_buflens[i], 8); + return (char *)lcfg + offset; +} + +static inline void lustre_cfg_bufs_init(struct lustre_cfg_bufs *bufs, + struct lustre_cfg *lcfg) +{ + __u32 i; + + bufs->lcfg_bufcount = lcfg->lcfg_bufcount; + for (i = 0; i < bufs->lcfg_bufcount; i++) { + bufs->lcfg_buflen[i] = lcfg->lcfg_buflens[i]; + bufs->lcfg_buf[i] = lustre_cfg_buf(lcfg, i); + } +} + +static inline __u32 lustre_cfg_len(__u32 bufcount, __u32 *buflens) +{ + __u32 i; + __u32 len; + + len = LCFG_HDR_SIZE(bufcount); + for (i = 0; i < bufcount; i++) + len += __ALIGN_KERNEL(buflens[i], 8); + + return __ALIGN_KERNEL(len, 8); +} + +static inline void lustre_cfg_init(struct lustre_cfg *lcfg, int cmd, + struct lustre_cfg_bufs *bufs) +{ + char *ptr; + __u32 i; + + lcfg->lcfg_version = LUSTRE_CFG_VERSION; + lcfg->lcfg_command = cmd; + lcfg->lcfg_bufcount = bufs->lcfg_bufcount; + + ptr = (char *)lcfg + LCFG_HDR_SIZE(lcfg->lcfg_bufcount); + for (i = 0; i < lcfg->lcfg_bufcount; i++) { + lcfg->lcfg_buflens[i] = bufs->lcfg_buflen[i]; + if (bufs->lcfg_buf[i]) { + memcpy(ptr, bufs->lcfg_buf[i], bufs->lcfg_buflen[i]); + ptr += __ALIGN_KERNEL(bufs->lcfg_buflen[i], 8); + } + } +} + +static inline int lustre_cfg_sanity_check(void *buf, __kernel_size_t len) +{ + struct lustre_cfg *lcfg = (struct lustre_cfg *)buf; + + if (!lcfg) + return -EINVAL; + + /* check that the first bits of the struct are valid */ + if (len < LCFG_HDR_SIZE(0)) + return -EINVAL; + + if (lcfg->lcfg_version != LUSTRE_CFG_VERSION) + return -EINVAL; + + if (lcfg->lcfg_bufcount >= LUSTRE_CFG_MAX_BUFCOUNT) + return -EINVAL; + + /* check that the buflens are valid */ + if (len < LCFG_HDR_SIZE(lcfg->lcfg_bufcount)) + return -EINVAL; + + /* make sure all the pointers point inside the data */ + if (len < lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens)) + return -EINVAL; + + return 0; +} + +/** @} cfg */ + +#endif /* _UAPI_LUSTRE_CFG_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_disk.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_disk.h new file mode 100644 index 0000000000000..54f73fdcca9ce --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_disk.h @@ -0,0 +1,231 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Lustre disk format definitions. + * + * Author: Nathan Rutman + */ + +#ifndef _UAPI_LUSTRE_DISK_H +#define _UAPI_LUSTRE_DISK_H + +/** \defgroup disk disk + * + * @{ + */ +#include + +/****************** on-disk files ********************/ + +#define MDT_LOGS_DIR "LOGS" /* COMPAT_146 */ +#define MOUNT_CONFIGS_DIR "CONFIGS" +#define CONFIGS_FILE "mountdata" +/** Persistent mount data are stored on the disk in this file. */ +#define MOUNT_DATA_FILE MOUNT_CONFIGS_DIR"/"CONFIGS_FILE +#define LAST_RCVD "last_rcvd" +#define REPLY_DATA "reply_data" +#define LOV_OBJID "lov_objid" +#define LOV_OBJSEQ "lov_objseq" +#define HEALTH_CHECK "health_check" +#define CAPA_KEYS "capa_keys" +#define CHANGELOG_USERS "changelog_users" +#define MGS_NIDTBL_DIR "NIDTBL_VERSIONS" +#define QMT_DIR "quota_master" +#define QSD_DIR "quota_slave" +#define QSD_DIR_DT "quota_slave_dt" +#define QSD_DIR_MD "quota_slave_md" +#define HSM_ACTIONS "hsm_actions" +#define LFSCK_DIR "LFSCK" +#define LFSCK_BOOKMARK "lfsck_bookmark" +#define LFSCK_LAYOUT "lfsck_layout" +#define LFSCK_NAMESPACE "lfsck_namespace" +#define REMOTE_PARENT_DIR "REMOTE_PARENT_DIR" +#define INDEX_BACKUP_DIR "index_backup" +#define MDT_ORPHAN_DIR "PENDING" + +/* On-disk configuration file. In host-endian order. */ +struct lustre_disk_data { + __u32 ldd_magic; + __u32 ldd_feature_compat; /* compatible feature flags */ + __u32 ldd_feature_rocompat; /* read-only compatible feature flags */ + __u32 ldd_feature_incompat; /* incompatible feature flags */ + + __u32 ldd_config_ver; /* config rewrite count - not used */ + __u32 ldd_flags; /* LDD_SV_TYPE */ + __u32 ldd_svindex; /* server index (0001), must match + * svname + */ + __u32 ldd_mount_type; /* target fs type LDD_MT_* */ + char ldd_fsname[64]; /* filesystem this server is part of, + * MTI_NAME_MAXLEN + */ + char ldd_svname[64]; /* this server's name (lustre-mdt0001)*/ + __u8 ldd_uuid[40]; /* server UUID (COMPAT_146) */ + + char ldd_userdata[1024 - 200]; /* arbitrary user string '200' */ + __u8 ldd_padding[4096 - 1024]; /* 1024 */ + char ldd_mount_opts[4096]; /* target fs mount opts '4096' */ + char ldd_params[4096]; /* key=value pairs '8192' */ +}; + +/****************** persistent mount data *********************/ + +#define LDD_F_SV_TYPE_MDT 0x0001 +#define LDD_F_SV_TYPE_OST 0x0002 +#define LDD_F_SV_TYPE_MGS 0x0004 +#define LDD_F_SV_TYPE_MASK (LDD_F_SV_TYPE_MDT | \ + LDD_F_SV_TYPE_OST | \ + LDD_F_SV_TYPE_MGS) +#define LDD_F_SV_ALL 0x0008 +/** need an index assignment */ +#define LDD_F_NEED_INDEX 0x0010 +/** never registered */ +#define LDD_F_VIRGIN 0x0020 +/** update the config logs for this server */ +#define LDD_F_UPDATE 0x0040 +/** rewrite the LDD */ +#define LDD_F_REWRITE_LDD 0x0080 +/** regenerate config logs for this fs or server */ +#define LDD_F_WRITECONF 0x0100 +/** COMPAT_14 */ +/*#define LDD_F_UPGRADE14 0x0200 deprecated since 1.8 */ +/** process as lctl conf_param */ +#define LDD_F_PARAM 0x0400 +/** all nodes are specified as service nodes */ +#define LDD_F_NO_PRIMNODE 0x1000 +/** IR enable flag */ +#define LDD_F_IR_CAPABLE 0x2000 +/** the MGS refused to 
register the target. */ +#define LDD_F_ERROR 0x4000 +/** process at lctl conf_param */ +#define LDD_F_PARAM2 0x8000 +/** the target shouldn't use local logs */ +#define LDD_F_NO_LOCAL_LOGS 0x10000 + +#define LDD_MAGIC 0x1dd00001 + +#define XATTR_TARGET_RENAME "trusted.rename_tgt" + +enum ldd_mount_type { + LDD_MT_EXT3 = 0, + LDD_MT_LDISKFS, + LDD_MT_SMFS, + LDD_MT_REISERFS, + LDD_MT_LDISKFS2, + LDD_MT_ZFS, + LDD_MT_LAST +}; + +/****************** last_rcvd file *********************/ + +#define LR_EXPIRE_INTERVALS 16 /**< number of intervals to track transno */ +#define LR_SERVER_SIZE 512 +#define LR_CLIENT_START 8192 +#define LR_CLIENT_SIZE 128 +#if LR_CLIENT_START < LR_SERVER_SIZE +#error "Can't have LR_CLIENT_START < LR_SERVER_SIZE" +#endif + +/* + * Data stored per server at the head of the last_rcvd file. In le32 order. + */ +struct lr_server_data { + __u8 lsd_uuid[40]; /* server UUID */ + __u64 lsd_last_transno; /* last completed transaction ID */ + __u64 lsd_compat14; /* reserved - compat with old last_rcvd */ + __u64 lsd_mount_count; /* incarnation number */ + __u32 lsd_feature_compat; /* compatible feature flags */ + __u32 lsd_feature_rocompat;/* read-only compatible feature flags */ + __u32 lsd_feature_incompat;/* incompatible feature flags */ + __u32 lsd_server_size; /* size of server data area */ + __u32 lsd_client_start; /* start of per-client data area */ + __u16 lsd_client_size; /* size of per-client data area */ + __u16 lsd_subdir_count; /* number of subdirectories for objects */ + __u64 lsd_catalog_oid; /* recovery catalog object id */ + __u32 lsd_catalog_ogen; /* recovery catalog inode generation */ + __u8 lsd_peeruuid[40]; /* UUID of MDS associated with this OST */ + __u32 lsd_osd_index; /* index number of OST in LOV */ + __u32 lsd_padding1; /* was lsd_mdt_index, unused in 2.4.0 */ + __u32 lsd_start_epoch; /* VBR: start epoch from last boot */ + /** transaction values since lsd_trans_table_time */ + __u64 lsd_trans_table[LR_EXPIRE_INTERVALS]; + /** start point of transno table below */ + __u32 lsd_trans_table_time; /* time of first slot in table above */ + __u32 lsd_expire_intervals; /* LR_EXPIRE_INTERVALS */ + __u8 lsd_padding[LR_SERVER_SIZE - 288]; +}; + +/* Data stored per client in the last_rcvd file. In le32 order. */ +struct lsd_client_data { + __u8 lcd_uuid[40]; /* client UUID */ + __u64 lcd_last_transno; /* last completed transaction ID */ + __u64 lcd_last_xid; /* xid for the last transaction */ + __u32 lcd_last_result; /* result from last RPC */ + __u32 lcd_last_data; /* per-op data (disposition for + * open &c.) + */ + /* for MDS_CLOSE requests */ + __u64 lcd_last_close_transno; /* last completed transaction ID */ + __u64 lcd_last_close_xid; /* xid for the last transaction */ + __u32 lcd_last_close_result; /* result from last RPC */ + __u32 lcd_last_close_data; /* per-op data */ + /* VBR: last versions */ + __u64 lcd_pre_versions[4]; + __u32 lcd_last_epoch; + /* generation counter of client slot in last_rcvd */ + __u32 lcd_generation; + __u8 lcd_padding[LR_CLIENT_SIZE - 128]; +}; + +/* Data stored in each slot of the reply_data file. + * + * The lrd_client_gen field is assigned with lcd_generation value + * to allow identify which client the reply data belongs to. 
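+ * A slot whose lrd_client_gen does not match the lcd_generation of the
+ * client currently occupying the corresponding last_rcvd slot can thus
+ * be recognized as left over from an earlier client incarnation.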
+ */
+struct lsd_reply_data {
+	__u64	lrd_transno;	/* transaction number */
+	__u64	lrd_xid;	/* transmission id */
+	__u64	lrd_data;	/* per-operation data */
+	__u32	lrd_result;	/* request result */
+	__u32	lrd_client_gen;	/* client generation */
+};
+
+/* Header of the reply_data file */
+#define LRH_MAGIC 0xbdabda01
+struct lsd_reply_header {
+	__u32	lrh_magic;
+	__u32	lrh_header_size;
+	__u32	lrh_reply_size;
+	__u8	lrh_pad[sizeof(struct lsd_reply_data) - 12];
+};
+
+/** @} disk */
+
+#endif /* _UAPI_LUSTRE_DISK_H */
diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fid.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fid.h
new file mode 100644
index 0000000000000..f11ad3b3b2115
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fid.h
@@ -0,0 +1,364 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ *
+ * Copyright 2016 Cray Inc, all rights reserved.
+ * Author: Ben Evans.
+ *
+ * All FID manipulation functions go here.
+ *
+ * FIDs are globally unique within a Lustre filesystem, and are made up
+ * of three parts: sequence, Object ID, and version.
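+ *
+ * For example, a FID printed in the usual [seq:oid:ver] notation as
+ * [0x200000400:0x1:0x0] has sequence 0x200000400 (FID_SEQ_NORMAL, the
+ * first non-reserved normal sequence), object ID 1 and version 0.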
+ * + */ +#ifndef _UAPI_LUSTRE_FID_H_ +#define _UAPI_LUSTRE_FID_H_ + +#include +#include + +/** returns fid object sequence */ +static inline __u64 fid_seq(const struct lu_fid *fid) +{ + return fid->f_seq; +} + +/** returns fid object id */ +static inline __u32 fid_oid(const struct lu_fid *fid) +{ + return fid->f_oid; +} + +/** returns fid object version */ +static inline __u32 fid_ver(const struct lu_fid *fid) +{ + return fid->f_ver; +} + +static inline void fid_zero(struct lu_fid *fid) +{ + memset(fid, 0, sizeof(*fid)); +} + +static inline __u64 fid_ver_oid(const struct lu_fid *fid) +{ + return (__u64)fid_ver(fid) << 32 | fid_oid(fid); +} + +static inline bool fid_seq_is_mdt0(__u64 seq) +{ + return seq == FID_SEQ_OST_MDT0; +} + +static inline bool fid_seq_is_mdt(__u64 seq) +{ + return seq == FID_SEQ_OST_MDT0 || seq >= FID_SEQ_NORMAL; +}; + +static inline bool fid_seq_is_echo(__u64 seq) +{ + return seq == FID_SEQ_ECHO; +} + +static inline bool fid_is_echo(const struct lu_fid *fid) +{ + return fid_seq_is_echo(fid_seq(fid)); +} + +static inline bool fid_seq_is_llog(__u64 seq) +{ + return seq == FID_SEQ_LLOG; +} + +static inline bool fid_is_llog(const struct lu_fid *fid) +{ + /* file with OID == 0 is not llog but contains last oid */ + return fid_seq_is_llog(fid_seq(fid)) && fid_oid(fid) > 0; +} + +static inline bool fid_seq_is_rsvd(__u64 seq) +{ + return seq > FID_SEQ_OST_MDT0 && seq <= FID_SEQ_RSVD; +}; + +static inline bool fid_seq_is_special(__u64 seq) +{ + return seq == FID_SEQ_SPECIAL; +}; + +static inline bool fid_seq_is_local_file(__u64 seq) +{ + return seq == FID_SEQ_LOCAL_FILE || + seq == FID_SEQ_LOCAL_NAME; +}; + +static inline bool fid_seq_is_root(__u64 seq) +{ + return seq == FID_SEQ_ROOT; +} + +static inline bool fid_seq_is_dot(__u64 seq) +{ + return seq == FID_SEQ_DOT_LUSTRE; +} + +static inline bool fid_seq_is_default(__u64 seq) +{ + return seq == FID_SEQ_LOV_DEFAULT; +} + +static inline bool fid_is_mdt0(const struct lu_fid *fid) +{ + return fid_seq_is_mdt0(fid_seq(fid)); +} + +static inline void lu_root_fid(struct lu_fid *fid) +{ + fid->f_seq = FID_SEQ_ROOT; + fid->f_oid = FID_OID_ROOT; + fid->f_ver = 0; +} + +static inline void lu_echo_root_fid(struct lu_fid *fid) +{ + fid->f_seq = FID_SEQ_ROOT; + fid->f_oid = FID_OID_ECHO_ROOT; + fid->f_ver = 0; +} + +static inline void lu_update_log_fid(struct lu_fid *fid, __u32 index) +{ + fid->f_seq = FID_SEQ_UPDATE_LOG; + fid->f_oid = index; + fid->f_ver = 0; +} + +static inline void lu_update_log_dir_fid(struct lu_fid *fid, __u32 index) +{ + fid->f_seq = FID_SEQ_UPDATE_LOG_DIR; + fid->f_oid = index; + fid->f_ver = 0; +} + +/** + * Check if a fid is igif or not. + * \param fid the fid to be tested. + * \return true if the fid is an igif; otherwise false. + */ +static inline bool fid_seq_is_igif(__u64 seq) +{ + return seq >= FID_SEQ_IGIF && seq <= FID_SEQ_IGIF_MAX; +} + +static inline bool fid_is_igif(const struct lu_fid *fid) +{ + return fid_seq_is_igif(fid_seq(fid)); +} + +/** + * Check if a fid is idif or not. + * \param fid the fid to be tested. + * \return true if the fid is an idif; otherwise false. 
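+ *
+ * For example (illustrative): OST object id 0x100000005 on OST index 2
+ * maps to the IDIF sequence 0x100000000 | (2 << 16) | 1 = 0x100020001
+ * with OID 5; see fid_idif_seq() and fid_idif_id() below.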
+ */ +static inline bool fid_seq_is_idif(__u64 seq) +{ + return seq >= FID_SEQ_IDIF && seq <= FID_SEQ_IDIF_MAX; +} + +static inline bool fid_is_idif(const struct lu_fid *fid) +{ + return fid_seq_is_idif(fid_seq(fid)); +} + +static inline bool fid_is_local_file(const struct lu_fid *fid) +{ + return fid_seq_is_local_file(fid_seq(fid)); +} + +static inline bool fid_seq_is_norm(__u64 seq) +{ + return (seq >= FID_SEQ_NORMAL); +} + +static inline bool fid_is_norm(const struct lu_fid *fid) +{ + return fid_seq_is_norm(fid_seq(fid)); +} + +static inline int fid_is_layout_rbtree(const struct lu_fid *fid) +{ + return fid_seq(fid) == FID_SEQ_LAYOUT_RBTREE; +} + +static inline bool fid_seq_is_update_log(__u64 seq) +{ + return seq == FID_SEQ_UPDATE_LOG; +} + +static inline bool fid_is_update_log(const struct lu_fid *fid) +{ + return fid_seq_is_update_log(fid_seq(fid)); +} + +static inline bool fid_seq_is_update_log_dir(__u64 seq) +{ + return seq == FID_SEQ_UPDATE_LOG_DIR; +} + +static inline bool fid_is_update_log_dir(const struct lu_fid *fid) +{ + return fid_seq_is_update_log_dir(fid_seq(fid)); +} + +/* convert an OST objid into an IDIF FID SEQ number */ +static inline __u64 fid_idif_seq(__u64 id, __u32 ost_idx) +{ + return FID_SEQ_IDIF | (ost_idx << 16) | ((id >> 32) & 0xffff); +} + +/* convert a packed IDIF FID into an OST objid */ +static inline __u64 fid_idif_id(__u64 seq, __u32 oid, __u32 ver) +{ + return ((__u64)ver << 48) | ((seq & 0xffff) << 32) | oid; +} + +static inline __u32 idif_ost_idx(__u64 seq) +{ + return (seq >> 16) & 0xffff; +} + +/* extract ost index from IDIF FID */ +static inline __u32 fid_idif_ost_idx(const struct lu_fid *fid) +{ + return idif_ost_idx(fid_seq(fid)); +} + +/* Check whether the fid is for LAST_ID */ +static inline bool fid_is_last_id(const struct lu_fid *fid) +{ + if (fid_oid(fid) != 0) + return false; + + if (fid_is_idif(fid) && ((fid_seq(fid) & 0xFFFF) != 0)) + return false; + + if (fid_seq(fid) == FID_SEQ_UPDATE_LOG || + fid_seq(fid) == FID_SEQ_UPDATE_LOG_DIR || + fid_seq_is_igif(fid_seq(fid))) + return false; + + return true; +} + +/** + * Get inode number from an igif. + * \param fid an igif to get inode number from. + * \return inode number for the igif. + */ +static inline __kernel_ino_t lu_igif_ino(const struct lu_fid *fid) +{ + return fid_seq(fid); +} + +/** + * Get inode generation from an igif. + * \param fid an igif to get inode generation from. + * \return inode generation for the igif. + */ +static inline __u32 lu_igif_gen(const struct lu_fid *fid) +{ + return fid_oid(fid); +} + +/** + * Build igif from the inode number/generation. + */ +static inline void lu_igif_build(struct lu_fid *fid, __u32 ino, __u32 gen) +{ + fid->f_seq = ino; + fid->f_oid = gen; + fid->f_ver = 0; +} + +/* + * Fids are transmitted across network (in the sender byte-ordering), + * and stored on disk in big-endian order. 
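+ * A FID read back from disk must therefore pass through fid_be_to_cpu()
+ * before use, and through fid_cpu_to_be() before being written out.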
+ */ +static inline void fid_cpu_to_le(struct lu_fid *dst, const struct lu_fid *src) +{ + dst->f_seq = __cpu_to_le64(fid_seq(src)); + dst->f_oid = __cpu_to_le32(fid_oid(src)); + dst->f_ver = __cpu_to_le32(fid_ver(src)); +} + +static inline void fid_le_to_cpu(struct lu_fid *dst, const struct lu_fid *src) +{ + dst->f_seq = __le64_to_cpu(fid_seq(src)); + dst->f_oid = __le32_to_cpu(fid_oid(src)); + dst->f_ver = __le32_to_cpu(fid_ver(src)); +} + +static inline void fid_cpu_to_be(struct lu_fid *dst, const struct lu_fid *src) +{ + dst->f_seq = __cpu_to_be64(fid_seq(src)); + dst->f_oid = __cpu_to_be32(fid_oid(src)); + dst->f_ver = __cpu_to_be32(fid_ver(src)); +} + +static inline void fid_be_to_cpu(struct lu_fid *dst, const struct lu_fid *src) +{ + dst->f_seq = __be64_to_cpu(fid_seq(src)); + dst->f_oid = __be32_to_cpu(fid_oid(src)); + dst->f_ver = __be32_to_cpu(fid_ver(src)); +} + +static inline bool fid_is_sane(const struct lu_fid *fid) +{ + return fid && ((fid_seq(fid) >= FID_SEQ_START && !fid_ver(fid)) || + fid_is_igif(fid) || fid_is_idif(fid) || + fid_seq_is_rsvd(fid_seq(fid))); +} + +static inline bool lu_fid_eq(const struct lu_fid *f0, const struct lu_fid *f1) +{ + return !memcmp(f0, f1, sizeof(*f0)); +} + +static inline int lu_fid_cmp(const struct lu_fid *f0, + const struct lu_fid *f1) +{ + if (fid_seq(f0) != fid_seq(f1)) + return fid_seq(f0) > fid_seq(f1) ? 1 : -1; + + if (fid_oid(f0) != fid_oid(f1)) + return fid_oid(f0) > fid_oid(f1) ? 1 : -1; + + if (fid_ver(f0) != fid_ver(f1)) + return fid_ver(f0) > fid_ver(f1) ? 1 : -1; + + return 0; +} +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fiemap.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fiemap.h new file mode 100644 index 0000000000000..d530794f4e9cb --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fiemap.h @@ -0,0 +1,99 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * FIEMAP data structures and flags. This header file will be used until + * fiemap.h is available in the upstream kernel. + * + * Author: Kalpak Shah + * Author: Andreas Dilger + */ + +#ifndef _LUSTRE_FIEMAP_H +#define _LUSTRE_FIEMAP_H + +#ifdef __KERNEL__ +# include +#else +# include +#endif +#include +#include + +/** + * XXX: We use fiemap_extent::fe_reserved[0], notice the high 16bits of it + * is used to locate the stripe number starting from the very beginning to + * resume the fiemap call. 
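+ *
+ * For example, set_fe_device_stripenr(fe, 3, 7) below yields
+ * fe->fe_device == 0x00070003: device number 3 in the low 16 bits and
+ * stripe number 7 in the high 16 bits.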
+ */ +#define fe_device fe_reserved[0] + +static inline int get_fe_device(struct fiemap_extent *fe) +{ + return fe->fe_device & 0xffff; +} +static inline void set_fe_device(struct fiemap_extent *fe, int devno) +{ + fe->fe_device = (fe->fe_device & 0xffff0000) | (devno & 0xffff); +} +static inline int get_fe_stripenr(struct fiemap_extent *fe) +{ + return fe->fe_device >> 16; +} +static inline void set_fe_stripenr(struct fiemap_extent *fe, int nr) +{ + fe->fe_device = (fe->fe_device & 0xffff) | (nr << 16); +} +static inline void set_fe_device_stripenr(struct fiemap_extent *fe, int devno, + int nr) +{ + fe->fe_device = (nr << 16) | (devno & 0xffff); +} + +static inline __kernel_size_t fiemap_count_to_size(__kernel_size_t extent_count) +{ + return sizeof(struct fiemap) + extent_count * + sizeof(struct fiemap_extent); +} + +static inline unsigned int fiemap_size_to_count(__kernel_size_t array_size) +{ + return (array_size - sizeof(struct fiemap)) / + sizeof(struct fiemap_extent); +} + +#define FIEMAP_FLAG_DEVICE_ORDER 0x40000000 /* return device ordered mapping */ + +#ifdef FIEMAP_FLAGS_COMPAT +#undef FIEMAP_FLAGS_COMPAT +#endif + +#define FIEMAP_EXTENT_NET 0x80000000 /* Data stored remotely. + * Sets NO_DIRECT flag */ + +#endif /* _LUSTRE_FIEMAP_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_idl.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_idl.h new file mode 100644 index 0000000000000..5edfca121c4df --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_idl.h @@ -0,0 +1,3755 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Lustre wire protocol definitions. + */ + +/** \defgroup lustreidl lustreidl + * + * Lustre wire protocol definitions. + * + * ALL structs passing over the wire should be declared here. Structs + * that are used in interfaces with userspace should go in lustre_user.h. + * + * All structs being declared here should be built from simple fixed-size + * types defined in linux/types.h or be built from other types or + * structs also declared in this file. Similarly, all flags and magic + * values in those structs should also be declared here. This ensures + * that the Lustre wire protocol is not influenced by external dependencies. + * + * The only other acceptable items in this file are VERY SIMPLE accessor + * functions to avoid callers grubbing inside the structures. Nothing that + * depends on external functions or definitions should be in here. 
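+ * (The fid_seq()/fid_oid()/fid_ver() helpers in lustre_fid.h are
+ * examples of such simple accessors.)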
+ * + * Structs must be properly aligned to put 64-bit values on an 8-byte + * boundary. Any structs being added here must also be added to + * utils/wirecheck.c and "make newwiretest" run to regenerate the + * utils/wiretest.c sources. This allows us to verify that wire structs + * have the proper alignment/size on all architectures. + * + * DO NOT CHANGE any of the structs, flags, values declared here and used + * in released Lustre versions. Some structs may have padding fields that + * can be used. Some structs might allow addition at the end (verify this + * in the code to ensure that new/old clients that see this larger struct + * do not fail, otherwise you need to implement protocol compatibility). + * + * @{ + */ + +#ifndef _LUSTRE_IDL_H_ +#define _LUSTRE_IDL_H_ + +#include +#include +#include +#include +#include +#include +#include + +#if defined(__cplusplus) +extern "C" { +#endif + +/* + * GENERAL STUFF + */ +/* FOO_REQUEST_PORTAL is for incoming requests on the FOO + * FOO_REPLY_PORTAL is for incoming replies on the FOO + * FOO_BULK_PORTAL is for incoming bulk on the FOO + */ + +#define CONNMGR_REQUEST_PORTAL 1 +#define CONNMGR_REPLY_PORTAL 2 +#define OSC_REPLY_PORTAL 4 +#define OST_IO_PORTAL 6 +#define OST_CREATE_PORTAL 7 +#define OST_BULK_PORTAL 8 +#define MDC_REPLY_PORTAL 10 +#define MDS_REQUEST_PORTAL 12 +#define MDS_IO_PORTAL 13 +#define MDS_BULK_PORTAL 14 +#define LDLM_CB_REQUEST_PORTAL 15 +#define LDLM_CB_REPLY_PORTAL 16 +#define LDLM_CANCEL_REQUEST_PORTAL 17 +#define LDLM_CANCEL_REPLY_PORTAL 18 +/* #define MDS_SETATTR_PORTAL 22 obsolete after 2.13 */ +#define MDS_READPAGE_PORTAL 23 +#define OUT_PORTAL 24 +#define MGC_REPLY_PORTAL 25 +#define MGS_REQUEST_PORTAL 26 +#define MGS_REPLY_PORTAL 27 +#define OST_REQUEST_PORTAL 28 +#define FLD_REQUEST_PORTAL 29 +#define SEQ_METADATA_PORTAL 30 +#define SEQ_DATA_PORTAL 31 +#define SEQ_CONTROLLER_PORTAL 32 +#define MGS_BULK_PORTAL 33 +/* #define DVS_PORTAL 63 */ +/* reserved for Cray DVS - spitzcor@cray.com, roe@cray.com, n8851@cray.com */ + +/** + * Describes a range of sequence, lsr_start is included but lsr_end is + * not in the range. + * Same structure is used in fld module where lsr_index field holds mdt id + * of the home mdt. + */ +struct lu_seq_range { + __u64 lsr_start; + __u64 lsr_end; + __u32 lsr_index; + __u32 lsr_flags; +}; + +struct lu_seq_range_array { + __u32 lsra_count; + __u32 lsra_padding; + struct lu_seq_range lsra_lsr[0]; +}; + +#define LU_SEQ_RANGE_MDT 0x0 +#define LU_SEQ_RANGE_OST 0x1 +#define LU_SEQ_RANGE_ANY 0x3 + +#define LU_SEQ_RANGE_MASK 0x3 + +/** \defgroup lu_fid lu_fid + * @{ */ + +extern void lustre_lma_swab(struct lustre_mdt_attrs *lma); +extern void lustre_lma_init(struct lustre_mdt_attrs *lma, + const struct lu_fid *fid, + __u32 compat, __u32 incompat); +extern void lustre_loa_swab(struct lustre_ost_attrs *loa, + bool to_cpu); +extern void lustre_loa_init(struct lustre_ost_attrs *loa, + const struct lu_fid *fid, + __u32 compat, __u32 incompat); + +/* copytool can use any nonnegative integer to represent archive-Ids during + * register with MDT thru kuc. + * archive num = 0 => all + * archive num from 1 to MAX_U32 + */ +#define LL_HSM_ORIGIN_MAX_ARCHIVE (sizeof(__u32) * 8) +/* the max count of archive ids that one agent can support */ +#define LL_HSM_MAX_ARCHIVES_PER_AGENT 1024 + +/** + * HSM on-disk attributes stored in a separate xattr. + */ +struct hsm_attrs { + /** Bitfield for supported data in this structure. For future use. 
*/ + __u32 hsm_compat; + + /** HSM flags, see hsm_flags enum below */ + __u32 hsm_flags; + /** backend archive id associated with the file */ + __u64 hsm_arch_id; + /** version associated with the last archiving, if any */ + __u64 hsm_arch_ver; +}; +extern void lustre_hsm_swab(struct hsm_attrs *attrs); + +/** + * fid constants + */ +enum { + /** LASTID file has zero OID */ + LUSTRE_FID_LASTID_OID = 0UL, + /** initial fid id value */ + LUSTRE_FID_INIT_OID = 1UL +}; + +/** + * Different FID Format + * http://arch.lustre.org/index.php?title=Interoperability_fids_zfs#NEW.0 + * + * FID: + * File IDentifier generated by client from range allocated by the seq service. + * First 0x400 sequences [2^33, 2^33 + 0x400] are reserved for system use. Note + * that on ldiskfs MDTs that IGIF FIDs can use inode numbers starting at 12, + * but this is in the IGIF SEQ rangeand does not conflict with assigned FIDs. + * + * IGIF: + * Inode and Generation In FID, a surrogate FID used to globally identify an + * existing object on OLD formatted MDT file system. This would only be used on + * MDT0 in a DNE filesystem, because there are not expected to be any OLD + * formatted DNE filesystems. Belongs to a sequence in [12, 2^32 - 1] range, + * where sequence number is inode number, and inode generation is used as OID. + * NOTE: This assumes no more than 2^32-1 inodes exist in the MDT filesystem, + * which is the maximum possible for an ldiskfs backend. NOTE: This assumes + * that the reserved ext3/ext4/ldiskfs inode numbers [0-11] are never visible + * to clients, which has always been true. + * + * IDIF: + * Object ID in FID, a surrogate FID used to globally identify an existing + * object on OLD formatted OST file system. Belongs to a sequence in + * [2^32, 2^33 - 1]. Sequence number is calculated as: + * 1 << 32 | (ost_index << 16) | ((objid >> 32) & 0xffff) + * that is, SEQ consists of 16-bit OST index, and higher 16 bits of object ID. + * The generation of unique SEQ values per OST allows the IDIF FIDs to be + * identified in the FLD correctly. The OID field is calculated as: + * objid & 0xffffffff + * that is, it consists of lower 32 bits of object ID. NOTE This assumes that + * no more than 2^48-1 objects have ever been created on an OST, and that no + * more than 65535 OSTs are in use. Both are very reasonable assumptions (can + * uniquely map all objects on an OST that created 1M objects per second for 9 + * years, or combinations thereof). + * + * OST_MDT0: + * Surrogate FID used to identify an existing object on OLD formatted OST + * filesystem. Belongs to the reserved sequence 0, and is used internally prior + * to the introduction of FID-on-OST, at which point IDIF will be used to + * identify objects as residing on a specific OST. + * + * LLOG: + * For Lustre Log objects the object sequence 1 is used. This is compatible with + * both OLD and NEW.1 namespaces, as this SEQ number is in the ext3/ldiskfs + * reserved inode range and does not conflict with IGIF sequence numbers. + * + * ECHO: + * For testing OST IO performance the object sequence 2 is used. This is + * compatible with both OLD and NEW.1 namespaces, as this SEQ number is in the + * ext3/ldiskfs reserved inode range and does not conflict with IGIF sequence + * numbers. + * + * OST_MDT1 .. OST_MAX: + * For testing with multiple MDTs the object sequence 3 through 9 is used, + * allowing direct mapping of MDTs 1 through 7 respectively, for a total of 8 + * MDTs including OST_MDT0. This matches the legacy CMD project "group" + * mappings. 
However, this SEQ range is only for testing prior to any production + * DNE release, as the objects in this range conflict across all OSTs, as the + * OST index is not part of the FID. + * + * + * For compatibility with existing OLD OST network protocol structures, the FID + * must map onto the o_id and o_gr in a manner that ensures existing objects are + * identified consistently for IO, as well as onto the lock namespace to ensure + * both IDIFs map onto the same objects for IO as well as resources in the DLM. + * + * DLM OLD OBIF/IDIF: + * resource[] = {o_id, o_seq, 0, 0}; // o_seq == 0 for production releases + * + * DLM NEW.1 FID (this is the same for both the MDT and OST): + * resource[] = {SEQ, OID, VER, HASH}; + * + * Note that for mapping IDIF values to DLM resource names the o_id may be + * larger than the 2^33 reserved sequence numbers for IDIF, so it is possible + * for the o_id numbers to overlap FID SEQ numbers in the resource. However, in + * all production releases the OLD o_seq field is always zero, and all valid FID + * OID values are non-zero, so the lock resources will not collide. + * + * For objects within the IDIF range, group extraction (non-CMD) will be: + * o_id = (fid->f_seq & 0x7fff) << 16 | fid->f_oid; + * o_seq = 0; // formerly group number + */ + +/** + * Note that reserved SEQ numbers below 12 will conflict with ldiskfs + * inodes in the IGIF namespace, so these reserved SEQ numbers can be + * used for other purposes and not risk collisions with existing inodes. + */ +enum fid_seq { + FID_SEQ_OST_MDT0 = 0, + FID_SEQ_LLOG = 1, /* unnamed llogs */ + FID_SEQ_ECHO = 2, + FID_SEQ_UNUSED_START = 3, /* Unused */ + FID_SEQ_UNUSED_END = 9, /* Unused */ + FID_SEQ_LLOG_NAME = 10, /* named llogs */ + FID_SEQ_RSVD = 11, + FID_SEQ_IGIF = 12, + FID_SEQ_IGIF_MAX = 0x0ffffffffULL, + FID_SEQ_IDIF = 0x100000000ULL, + FID_SEQ_IDIF_MAX = 0x1ffffffffULL, + /* Normal FID sequence starts from this value, i.e. 1<<33 */ + FID_SEQ_START = 0x200000000ULL, + /* sequence for local pre-defined FIDs listed in local_oid */ + FID_SEQ_LOCAL_FILE = 0x200000001ULL, + FID_SEQ_DOT_LUSTRE = 0x200000002ULL, + /* sequence is used for local named objects FIDs generated + * by local_object_storage library */ + FID_SEQ_LOCAL_NAME = 0x200000003ULL, + /* Because current FLD will only cache the fid sequence, instead + * of oid on the client side, if the FID needs to be exposed to + * clients sides, it needs to make sure all of fids under one + * sequence will be located in one MDT. */ + FID_SEQ_SPECIAL = 0x200000004ULL, + FID_SEQ_QUOTA = 0x200000005ULL, + FID_SEQ_QUOTA_GLB = 0x200000006ULL, + FID_SEQ_ROOT = 0x200000007ULL, /* Located on MDT0 */ + FID_SEQ_LAYOUT_RBTREE = 0x200000008ULL, + /* sequence is used for update logs of cross-MDT operation */ + FID_SEQ_UPDATE_LOG = 0x200000009ULL, + /* Sequence is used for the directory under which update logs + * are created. 
*/ + FID_SEQ_UPDATE_LOG_DIR = 0x20000000aULL, + FID_SEQ_NORMAL = 0x200000400ULL, + FID_SEQ_LOV_DEFAULT = 0xffffffffffffffffULL +}; + +#define OBIF_OID_MAX_BITS 32 +#define OBIF_MAX_OID (1ULL << OBIF_OID_MAX_BITS) +#define OBIF_OID_MASK ((1ULL << OBIF_OID_MAX_BITS) - 1) +#define IDIF_OID_MAX_BITS 48 +#define IDIF_MAX_OID (1ULL << IDIF_OID_MAX_BITS) +#define IDIF_OID_MASK ((1ULL << IDIF_OID_MAX_BITS) - 1) + +/** OID for FID_SEQ_SPECIAL */ +enum special_oid { + /* Big Filesystem Lock to serialize rename operations */ + FID_OID_SPECIAL_BFL = 1UL, +}; + +/** OID for FID_SEQ_DOT_LUSTRE */ +enum dot_lustre_oid { + FID_OID_DOT_LUSTRE = 1UL, + FID_OID_DOT_LUSTRE_OBF = 2UL, + FID_OID_DOT_LUSTRE_LPF = 3UL, +}; + +/** OID for FID_SEQ_ROOT */ +enum root_oid { + FID_OID_ROOT = 1UL, + FID_OID_ECHO_ROOT = 2UL, +}; + +struct lu_orphan_rec { + /* The MDT-object's FID referenced by the orphan OST-object */ + struct lu_fid lor_fid; + __u32 lor_uid; + __u32 lor_gid; +}; + +struct lu_orphan_ent { + /* The orphan OST-object's FID */ + struct lu_fid loe_key; + struct lu_orphan_rec loe_rec; +}; + +struct lu_orphan_rec_v2 { + struct lu_orphan_rec lor_rec; + struct ost_layout lor_layout; + __u32 lor_padding; +}; + +struct lu_orphan_ent_v2 { + /* The orphan OST-object's FID */ + struct lu_fid loe_key; + struct lu_orphan_rec_v2 loe_rec; +}; + +struct lu_orphan_rec_v3 { + struct lu_orphan_rec lor_rec; + struct ost_layout lor_layout; + /* The OST-object declared layout version in PFID EA.*/ + __u32 lor_layout_version; + /* The OST-object declared layout range (of version) in PFID EA.*/ + __u32 lor_range; + __u32 lor_padding_1; + __u64 lor_padding_2; +}; + +struct lu_orphan_ent_v3 { + /* The orphan OST-object's FID */ + struct lu_fid loe_key; + struct lu_orphan_rec_v3 loe_rec; +}; + +/** @} lu_fid */ + +/** \defgroup lu_dir lu_dir + * @{ */ + +/** + * Enumeration of possible directory entry attributes. + * + * Attributes follow directory entry header in the order they appear in this + * enumeration. + */ +enum lu_dirent_attrs { + LUDA_FID = 0x0001, + LUDA_TYPE = 0x0002, + LUDA_64BITHASH = 0x0004, + + /* The following attrs are used for MDT internal only, + * not visible to client */ + + /* Something in the record is unknown, to be verified in further. */ + LUDA_UNKNOWN = 0x0400, + /* Ignore this record, go to next directly. */ + LUDA_IGNORE = 0x0800, + /* The system is upgraded, has beed or to be repaired (dryrun). */ + LUDA_UPGRADE = 0x1000, + /* The dirent has been repaired, or to be repaired (dryrun). */ + LUDA_REPAIR = 0x2000, + /* Only check but not repair the dirent inconsistency */ + LUDA_VERIFY_DRYRUN = 0x4000, + /* Verify the dirent consistency */ + LUDA_VERIFY = 0x8000, +}; + +#define LU_DIRENT_ATTRS_MASK 0xff00 + +/** + * Layout of readdir pages, as transmitted on wire. + */ +struct lu_dirent { + /** valid if LUDA_FID is set. */ + struct lu_fid lde_fid; + /** a unique entry identifier: a hash or an offset. */ + __u64 lde_hash; + /** total record length, including all attributes. */ + __u16 lde_reclen; + /** name length */ + __u16 lde_namelen; + /** optional variable size attributes following this entry. + * taken from enum lu_dirent_attrs. + */ + __u32 lde_attrs; + /** name is followed by the attributes indicated in ->ldp_attrs, in + * their natural order. After the last attribute, padding bytes are + * added to make ->lde_reclen a multiple of 8. + */ + char lde_name[0]; +}; + +/* + * Definitions of optional directory entry attributes formats. 
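To make the IDIF mapping described earlier concrete, here is a sketch that packs a legacy (ost_index, objid) pair into IDIF SEQ/OID values using the formulas from the comment above. The helper name is illustrative only; Lustre's own FID helpers live elsewhere:

static inline void idif_pack(__u32 ost_index, __u64 objid,
			     __u64 *seq, __u32 *oid)
{
	/* SEQ = 1 << 32 | (ost_index << 16) | ((objid >> 32) & 0xffff) */
	*seq = (1ULL << 32) | ((__u64)ost_index << 16) |
	       ((objid >> 32) & 0xffff);
	/* OID = objid & 0xffffffff, the low 32 bits of the object id */
	*oid = (__u32)(objid & 0xffffffff);
}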
+ * + * Individual attributes do not have their length encoded in a generic way. It + * is assumed that consumer of an attribute knows its format. This means that + * it is impossible to skip over an unknown attribute, except by skipping over all + * remaining attributes (by using ->lde_reclen), which is not too + * constraining, because new server versions will append new attributes at + * the end of an entry. + */ + +/** + * Fid directory attribute: a fid of an object referenced by the entry. This + * will be almost always requested by the client and supplied by the server. + * + * Aligned to 8 bytes. + */ +/* To have compatibility with 1.8, lets have fid in lu_dirent struct. */ + +/** + * File type. + * + * Aligned to 2 bytes. + */ +struct luda_type { + __u16 lt_type; +}; + +struct lu_dirpage { + __u64 ldp_hash_start; + __u64 ldp_hash_end; + __u32 ldp_flags; + __u32 ldp_pad0; + struct lu_dirent ldp_entries[0]; +}; + +enum lu_dirpage_flags { + /** + * dirpage contains no entry. + */ + LDF_EMPTY = 1 << 0, + /** + * last entry's lde_hash equals ldp_hash_end. + */ + LDF_COLLIDE = 1 << 1 +}; + +static inline struct lu_dirent *lu_dirent_start(struct lu_dirpage *dp) +{ + if (__le32_to_cpu(dp->ldp_flags) & LDF_EMPTY) + return NULL; + else + return dp->ldp_entries; +} + +static inline struct lu_dirent *lu_dirent_next(struct lu_dirent *ent) +{ + struct lu_dirent *next; + + if (__le16_to_cpu(ent->lde_reclen) != 0) + next = ((void *)ent) + __le16_to_cpu(ent->lde_reclen); + else + next = NULL; + + return next; +} + +static inline __kernel_size_t lu_dirent_calc_size(size_t namelen, __u16 attr) +{ + __kernel_size_t size; + + if (attr & LUDA_TYPE) { + const __kernel_size_t align = sizeof(struct luda_type) - 1; + + size = (sizeof(struct lu_dirent) + namelen + 1 + align) & + ~align; + size += sizeof(struct luda_type); + } else { + size = sizeof(struct lu_dirent) + namelen + 1; + } + + return (size + 7) & ~7; +} + +static inline __u16 lu_dirent_type_get(struct lu_dirent *ent) +{ + __u16 type = 0; + struct luda_type *lt; + int len = 0; + + if (__le32_to_cpu(ent->lde_attrs) & LUDA_TYPE) { + const unsigned int align = sizeof(struct luda_type) - 1; + + len = __le16_to_cpu(ent->lde_namelen); + len = (len + align) & ~align; + lt = (void *)ent->lde_name + len; + type = __le16_to_cpu(lt->lt_type); + } + + return type; +} + +#define MDS_DIR_END_OFF 0xfffffffffffffffeULL + +/** + * MDS_READPAGE page size + * + * This is the directory page size packed in MDS_READPAGE RPC. + * It's different than PAGE_SIZE because the client needs to + * access the struct lu_dirpage header packed at the beginning of + * the "page" and without this there isn't any way to know find the + * lu_dirpage header is if client and server PAGE_SIZE differ. + */ +#define LU_PAGE_SHIFT 12 +#define LU_PAGE_SIZE (1UL << LU_PAGE_SHIFT) +#define LU_PAGE_MASK (~(LU_PAGE_SIZE - 1)) + +#define LU_PAGE_COUNT (1 << (PAGE_SHIFT - LU_PAGE_SHIFT)) + +/** @} lu_dir */ + +struct lustre_handle { + __u64 cookie; +}; +#define DEAD_HANDLE_MAGIC 0xdeadbeefcafebabeULL + +static inline bool lustre_handle_is_used(const struct lustre_handle *lh) +{ + return lh->cookie != 0; +} + +static inline bool lustre_handle_equal(const struct lustre_handle *lh1, + const struct lustre_handle *lh2) +{ + return lh1->cookie == lh2->cookie; +} + +static inline void lustre_handle_copy(struct lustre_handle *tgt, + const struct lustre_handle *src) +{ + tgt->cookie = src->cookie; +} + +/* lustre_msg struct magic. DON'T use swabbed values of MAGIC as magic! 
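Putting lu_dirent_start() and lu_dirent_next() together, a reader walks all entries of one little-endian lu_dirpage roughly as in this sketch; the process() callback is hypothetical:

static inline void lu_dirpage_walk(struct lu_dirpage *dp,
				   void (*process)(struct lu_dirent *))
{
	struct lu_dirent *ent;

	/* lu_dirent_start() returns NULL for an LDF_EMPTY page, and
	 * lu_dirent_next() returns NULL once lde_reclen is zero */
	for (ent = lu_dirent_start(dp); ent != NULL;
	     ent = lu_dirent_next(ent))
		process(ent);
}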
*/ +enum lustre_msg_magic { + LUSTRE_MSG_MAGIC_V2 = 0x0BD00BD3, + LUSTRE_MSG_MAGIC_V2_SWABBED = 0xD30BD00B, + LUSTRE_MSG_MAGIC = LUSTRE_MSG_MAGIC_V2 +}; + +/* flags for lm_flags */ +enum lustre_msghdr { + MSGHDR_AT_SUPPORT = 0x1, /* adaptive timeouts, lm_cksum valid + * in early reply messages */ + MSGHDR_CKSUM_INCOMPAT18 = 0x2, /* compat for 1.8, needs to be set well + * beyond 2.8.0 for compatibility */ +}; + +#define lustre_msg lustre_msg_v2 +/* we depend on this structure to be 8-byte aligned */ +/* this type is only endian-adjusted in lustre_unpack_msg() */ +struct lustre_msg_v2 { + __u32 lm_bufcount; /* number of buffers in lm_buflens[] */ + __u32 lm_secflvr; /* 0 = no crypto, or sptlrpc security flavour */ + __u32 lm_magic; /* RPC version magic = LUSTRE_MSG_MAGIC_V2 */ + __u32 lm_repsize; /* size of preallocated reply buffer */ + __u32 lm_cksum; /* CRC32 of ptlrpc_body early reply messages */ + __u32 lm_flags; /* enum lustre_msghdr MSGHDR_* flags */ + __u32 lm_padding_2; /* unused */ + __u32 lm_padding_3; /* unused */ + __u32 lm_buflens[0]; /* length of additional buffers in bytes, + * padded to a multiple of 8 bytes. */ + /* + * message buffers are packed after padded lm_buflens[] array, + * padded to a multiple of 8 bytes each to align contents. + */ +}; + +/* ptlrpc_body packet pb_types */ +#define PTL_RPC_MSG_REQUEST 4711 /* normal RPC request message */ +#define PTL_RPC_MSG_ERR 4712 /* error reply if request unprocessed */ +#define PTL_RPC_MSG_REPLY 4713 /* normal RPC reply message */ + +/* ptlrpc_body pb_version ((target_version << 16) | rpc_version) */ +enum lustre_msg_version { + PTLRPC_MSG_VERSION = 0x00000003, + LUSTRE_VERSION_MASK = 0xffff0000, + LUSTRE_OBD_VERSION = 0x00010000, + LUSTRE_MDS_VERSION = 0x00020000, + LUSTRE_OST_VERSION = 0x00030000, + LUSTRE_DLM_VERSION = 0x00040000, + LUSTRE_LOG_VERSION = 0x00050000, + LUSTRE_MGS_VERSION = 0x00060000, +}; + +/* pb_flags that apply to all request messages */ +/* #define MSG_LAST_REPLAY 0x0001 obsolete 2.0 => {REQ,LOCK}_REPLAY_DONE */ +#define MSG_RESENT 0x0002 /* was previously sent, no reply seen */ +#define MSG_REPLAY 0x0004 /* was processed, got reply, recovery */ +/* #define MSG_AT_SUPPORT 0x0008 obsolete since 1.5, AT always enabled */ +/* #define MSG_DELAY_REPLAY 0x0010 obsolete since 2.0 */ +/* #define MSG_VERSION_REPLAY 0x0020 obsolete since 1.8.2, VBR always on */ +#define MSG_REQ_REPLAY_DONE 0x0040 /* request replay over, locks next */ +#define MSG_LOCK_REPLAY_DONE 0x0080 /* lock replay over, client done */ + +/* pb_op_flags for connect opcodes: MDS_CONNECT, OST_CONNECT, MGS_CONNECT */ +#define MSG_CONNECT_RECOVERING 0x00000001 /* target is in recovery */ +#define MSG_CONNECT_RECONNECT 0x00000002 /* tgt already has client import */ +#define MSG_CONNECT_REPLAYABLE 0x00000004 /* target supports RPC replay */ +/* #define MSG_CONNECT_PEER 0x00000008 obsolete since 1.2, removed in 1.5 */ +#define MSG_CONNECT_LIBCLIENT 0x00000010 /* obsolete since 2.3, removed 2.6 */ +#define MSG_CONNECT_INITIAL 0x00000020 /* first client connection attempt */ +/* #define MSG_CONNECT_ASYNC 0x00000040 obsolete since 1.5 */ +#define MSG_CONNECT_NEXT_VER 0x00000080 /* use next version of lustre_msg */ +#define MSG_CONNECT_TRANSNO 0x00000100 /* client sent transno in replay */ + +/* number of previous object versions in pb_pre_versions[] */ +#define PTLRPC_NUM_VERSIONS 4 +/* without gss, ptlrpc_body is put at the first buffer. 
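Given the packing rules in the struct comment (lm_buflens[] padded to a multiple of 8 bytes, then each message buffer individually padded to 8 bytes), the byte offset of buffer n can be computed as in this sketch. It is illustrative only and assumes a CPU-endian message; the real accessors do additional bounds checking:

static inline __u32 lustre_msg_v2_buf_offset(const struct lustre_msg_v2 *msg,
					     __u32 n)
{
	/* fixed header plus lm_buflens[lm_bufcount], rounded up to 8 */
	__u32 off = (sizeof(*msg) +
		     msg->lm_bufcount * sizeof(__u32) + 7) & ~7U;
	__u32 i;

	/* each preceding buffer is itself padded to a multiple of 8 */
	for (i = 0; i < n; i++)
		off += (msg->lm_buflens[i] + 7) & ~7U;

	return off;
}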
*/ +struct ptlrpc_body_v3 { + struct lustre_handle pb_handle; + __u32 pb_type; /* request/reply/err type: PTL_RPC_MSG_* */ + __u32 pb_version; /* LUSTRE_*_VERSION | PTLRPC_MSG_VERSION */ + __u32 pb_opc; /* RPC opcodes: MDS_*, OST_*, LDLM_, ... */ + __u32 pb_status; /* negative Linux x86 error number */ + __u64 pb_last_xid; /* highest replied XID w/o lower unreplied XID*/ + __u16 pb_tag; /* multiple modifying RPCs virtual slot index */ + __u16 pb_padding0; + __u32 pb_padding1; + __u64 pb_last_committed;/* rep: highest pb_transno committed to disk */ + __u64 pb_transno; /* server-assigned transno for modifying RPCs */ + __u32 pb_flags; /* req: MSG_* flags */ + __u32 pb_op_flags; /* req: MSG_CONNECT_* flags */ + __u32 pb_conn_cnt; /* connect instance of this client on server */ + __u32 pb_timeout; /* req: max wait time; rep: service estimate */ + __u32 pb_service_time; /* rep: server arrival to reply in seconds */ + __u32 pb_limit; /* rep: dynamic DLM LRU lock count limit */ + __u64 pb_slv; /* rep: dynamic DLM LRU server lock volume */ + /* VBR: rep: previous pb_version(s) of objects modified by this RPC */ + __u64 pb_pre_versions[PTLRPC_NUM_VERSIONS]; + __u64 pb_mbits; /**< match bits for bulk request */ + /* padding for future needs - fix lustre_swab_ptlrpc_body() also */ + __u64 pb_padding64_0; + __u64 pb_padding64_1; + __u64 pb_padding64_2; + char pb_jobid[LUSTRE_JOBID_SIZE]; /* req: ASCII jobid from env + NUL */ +}; +#define ptlrpc_body ptlrpc_body_v3 + +struct ptlrpc_body_v2 { + struct lustre_handle pb_handle; + __u32 pb_type; + __u32 pb_version; + __u32 pb_opc; + __u32 pb_status; + __u64 pb_last_xid; /* highest replied XID without lower unreplied XID */ + __u16 pb_tag; /* virtual slot idx for multiple modifying RPCs */ + __u16 pb_padding0; + __u32 pb_padding1; + __u64 pb_last_committed; + __u64 pb_transno; + __u32 pb_flags; + __u32 pb_op_flags; + __u32 pb_conn_cnt; + __u32 pb_timeout; /* for req, the deadline, for rep, the service est */ + __u32 pb_service_time; /* for rep, actual service time, also used for + net_latency of req */ + __u32 pb_limit; + __u64 pb_slv; + /* VBR: pre-versions */ + __u64 pb_pre_versions[PTLRPC_NUM_VERSIONS]; + __u64 pb_mbits; /**< unused in V2 */ + /* padding for future needs */ + __u64 pb_padding64_0; + __u64 pb_padding64_1; + __u64 pb_padding64_2; +}; + +/* message body offset for lustre_msg_v2 */ +/* ptlrpc body offset in all request/reply messages */ +#define MSG_PTLRPC_BODY_OFF 0 + +/* normal request/reply message record offset */ +#define REQ_REC_OFF 1 +#define REPLY_REC_OFF 1 + +/* ldlm request message body offset */ +#define DLM_LOCKREQ_OFF 1 /* lockreq offset */ +#define DLM_REQ_REC_OFF 2 /* normal dlm request record offset */ + +/* ldlm intent lock message body offset */ +#define DLM_INTENT_IT_OFF 2 /* intent lock it offset */ +#define DLM_INTENT_REC_OFF 3 /* intent lock record offset */ + +/* ldlm reply message body offset */ +#define DLM_LOCKREPLY_OFF 1 /* lockrep offset */ +#define DLM_REPLY_REC_OFF 2 /* reply record offset */ + +/** only use in req->rq_{req,rep}_swab_mask */ +#define MSG_PTLRPC_HEADER_OFF 31 + +/* Connect flags */ +#define OBD_CONNECT_RDONLY 0x1ULL /*client has read-only access*/ +#define OBD_CONNECT_INDEX 0x2ULL /*connect specific LOV idx */ +#define OBD_CONNECT_MDS 0x4ULL /*connect from MDT to OST */ +#define OBD_CONNECT_GRANT 0x8ULL /*OSC gets grant at connect */ +#define OBD_CONNECT_SRVLOCK 0x10ULL /*server takes locks for cli */ +#define OBD_CONNECT_VERSION 0x20ULL /*Lustre versions in ocd */ +#define OBD_CONNECT_REQPORTAL 
0x40ULL /*Separate non-IO req portal */ +#define OBD_CONNECT_ACL 0x80ULL /*access control lists */ +#define OBD_CONNECT_XATTR 0x100ULL /*client use extended attr */ +#define OBD_CONNECT_LARGE_ACL 0x200ULL /* more than 32 ACL entries */ +/* was OBD_CONNECT_TRUNCLOCK 0x400ULL *locks on server for punch */ +/* temporary reuse until 2.21.53 to indicate pre-2.15 client, see LU-15478 */ +#define OBD_CONNECT_OLD_FALLOC 0x400ULL /* missing o_valid flags */ +#define OBD_CONNECT_TRANSNO 0x800ULL /*replay sends init transno */ +#define OBD_CONNECT_IBITS 0x1000ULL /* not checked in 2.11+ */ +#define OBD_CONNECT_BARRIER 0x2000ULL /* write barrier. Resevered to + * avoid use on client. + */ +#define OBD_CONNECT_ATTRFID 0x4000ULL /*Server can GetAttr By Fid*/ +#define OBD_CONNECT_NODEVOH 0x8000ULL /*No open hndl on specl nodes*/ +#define OBD_CONNECT_RMT_CLIENT 0x10000ULL /* Remote client, never used + * in production. Removed in + * 2.9. Keep this flag to + * avoid reusing. + */ +#define OBD_CONNECT_RMT_CLIENT_FORCE 0x20000ULL /* Remote client by force, + * never used in production. + * Removed in 2.9. Keep this + * flag to avoid reusing. + */ +#define OBD_CONNECT_BRW_SIZE 0x40000ULL /*Max bytes per rpc */ +#define OBD_CONNECT_QUOTA64 0x80000ULL /*Not used since 2.4 */ +#define OBD_CONNECT_MDS_CAPA 0x100000ULL /*MDS capability */ +#define OBD_CONNECT_OSS_CAPA 0x200000ULL /*OSS capability */ +#define OBD_CONNECT_CANCELSET 0x400000ULL /*Early batched cancels. */ +#define OBD_CONNECT_SOM 0x800000ULL /*Size on MDS */ +#define OBD_CONNECT_AT 0x1000000ULL /*client uses AT */ +#define OBD_CONNECT_LRU_RESIZE 0x2000000ULL /*LRU resize feature. */ +#define OBD_CONNECT_MDS_MDS 0x4000000ULL /*MDS-MDS connection */ +#define OBD_CONNECT_REAL 0x8000000ULL /* obsolete since 2.8 */ +#define OBD_CONNECT_CHANGE_QS 0x10000000ULL /*Not used since 2.4 */ +#define OBD_CONNECT_CKSUM 0x20000000ULL /*support several cksum algos*/ +#define OBD_CONNECT_FID 0x40000000ULL /*FID is supported by server */ +#define OBD_CONNECT_VBR 0x80000000ULL /*version based recovery */ +#define OBD_CONNECT_LOV_V3 0x100000000ULL /*client supports LOV v3 EA */ +#define OBD_CONNECT_GRANT_SHRINK 0x200000000ULL /* support grant shrink */ +#define OBD_CONNECT_SKIP_ORPHAN 0x400000000ULL /* don't reuse orphan objids */ +#define OBD_CONNECT_MAX_EASIZE 0x800000000ULL /* preserved for large EA */ +#define OBD_CONNECT_FULL20 0x1000000000ULL /* it is 2.0 client */ +#define OBD_CONNECT_LAYOUTLOCK 0x2000000000ULL /* client uses layout lock */ +#define OBD_CONNECT_64BITHASH 0x4000000000ULL /* client supports 64-bits + * directory hash */ +#define OBD_CONNECT_MAXBYTES 0x8000000000ULL /* max stripe size */ +#define OBD_CONNECT_IMP_RECOV 0x10000000000ULL /* imp recovery support */ +#define OBD_CONNECT_JOBSTATS 0x20000000000ULL /* jobid in ptlrpc_body */ +#define OBD_CONNECT_UMASK 0x40000000000ULL /* create uses client umask */ +#define OBD_CONNECT_EINPROGRESS 0x80000000000ULL /* client handles -EINPROGRESS + * RPC error properly */ +#define OBD_CONNECT_GRANT_PARAM 0x100000000000ULL/* extra grant params used for + * finer space reservation */ +#define OBD_CONNECT_FLOCK_OWNER 0x200000000000ULL /* for the fixed 1.8 + * policy and 2.x server */ +#define OBD_CONNECT_LVB_TYPE 0x400000000000ULL /* variable type of LVB */ +#define OBD_CONNECT_NANOSEC_TIME 0x800000000000ULL /* nanosecond timestamps */ +#define OBD_CONNECT_LIGHTWEIGHT 0x1000000000000ULL/* lightweight connection */ +#define OBD_CONNECT_SHORTIO 0x2000000000000ULL/* short io */ +#define OBD_CONNECT_PINGLESS 
0x4000000000000ULL/* pings not required */ +#define OBD_CONNECT_FLOCK_DEAD 0x8000000000000ULL/* improved flock deadlock detection */ +#define OBD_CONNECT_DISP_STRIPE 0x10000000000000ULL/* create stripe disposition*/ +#define OBD_CONNECT_OPEN_BY_FID 0x20000000000000ULL /* open by fid won't pack + name in request */ +#define OBD_CONNECT_LFSCK 0x40000000000000ULL/* support online LFSCK */ +#define OBD_CONNECT_UNLINK_CLOSE 0x100000000000000ULL/* close file in unlink */ +#define OBD_CONNECT_MULTIMODRPCS 0x200000000000000ULL /* support multiple modify + RPCs in parallel */ +#define OBD_CONNECT_DIR_STRIPE 0x400000000000000ULL /* striped DNE dir */ +#define OBD_CONNECT_SUBTREE 0x800000000000000ULL /* fileset mount */ +/* was OBD_CONNECT_LOCKAHEAD_OLD 0x1000000000000000ULL old lockahead 2.12-2.13*/ + +/** bulk matchbits is sent within ptlrpc_body */ +#define OBD_CONNECT_BULK_MBITS 0x2000000000000000ULL +#define OBD_CONNECT_OBDOPACK 0x4000000000000000ULL /* compact OUT obdo */ +#define OBD_CONNECT_FLAGS2 0x8000000000000000ULL /* second flags word */ +/* ocd_connect_flags2 flags */ +#define OBD_CONNECT2_FILE_SECCTX 0x1ULL /* set file security context at create */ +#define OBD_CONNECT2_LOCKAHEAD 0x2ULL /* ladvise lockahead v2 */ +#define OBD_CONNECT2_DIR_MIGRATE 0x4ULL /* migrate striped dir */ +#define OBD_CONNECT2_SUM_STATFS 0x8ULL /* MDT return aggregated stats */ +#define OBD_CONNECT2_OVERSTRIPING 0x10ULL /* OST overstriping support */ +#define OBD_CONNECT2_FLR 0x20ULL /* FLR support */ +#define OBD_CONNECT2_WBC_INTENTS 0x40ULL /* create/unlink/... intents for wbc, also operations under client-held parent locks */ +#define OBD_CONNECT2_LOCK_CONVERT 0x80ULL /* IBITS lock convert support */ +#define OBD_CONNECT2_ARCHIVE_ID_ARRAY 0x100ULL /* store HSM archive_id in array */ +#define OBD_CONNECT2_INC_XID 0x200ULL /* Increasing xid */ +#define OBD_CONNECT2_SELINUX_POLICY 0x400ULL /* has client SELinux policy */ +#define OBD_CONNECT2_LSOM 0x800ULL /* LSOM support */ +#define OBD_CONNECT2_PCC 0x1000ULL /* Persistent Client Cache */ +#define OBD_CONNECT2_CRUSH 0x2000ULL /* crush hash striped directory */ +#define OBD_CONNECT2_ASYNC_DISCARD 0x4000ULL /* support async DoM data discard */ +#define OBD_CONNECT2_ENCRYPT 0x8000ULL /* client-to-disk encrypt */ +#define OBD_CONNECT2_FIDMAP 0x10000ULL /* FID map */ +#define OBD_CONNECT2_GETATTR_PFID 0x20000ULL /* pack parent FID in getattr */ +#define OBD_CONNECT2_LSEEK 0x40000ULL /* SEEK_HOLE/DATA RPC */ +#define OBD_CONNECT2_DOM_LVB 0x80000ULL /* pack DOM glimpse data in LVB */ +#define OBD_CONNECT2_REP_MBITS 0x100000ULL /* match reply mbits not xid*/ +#define OBD_CONNECT2_MODE_CONVERT 0x200000ULL /* LDLM mode convert */ +#define OBD_CONNECT2_BATCH_RPC 0x400000ULL /* Multi-RPC batch request */ +#define OBD_CONNECT2_PCCRO 0x800000ULL /* Read-only PCC */ +#define OBD_CONNECT2_ATOMIC_OPEN_LOCK 0x4000000ULL/* request lock on 1st open */ +#define OBD_CONNECT2_ENCRYPT_NAME 0x8000000ULL /* name encrypt */ +/* risk of forwards incompatibility with upstream - use high order bits to mitigate */ +#define OBD_CONNECT2_MDLL_BYPASS 0x800000000000000ULL /* disable metadata lazy load */ +#define OBD_CONNECT2_MDLL 0x1000000000000000ULL /* enable metadata lazy load */ +#define OBD_CONNECT2_MDLL_AUTO_REFRESH 0x2000000000000000ULL /* enable metadata lazy load auto-refresh */ +/* XXX README XXX: + * Please DO NOT add flag values here before first ensuring that this same + * flag value is not in use on some other branch. 
Please clear any such + * changes with senior engineers before starting to use a new flag. Then, + * submit a small patch against EVERY branch that ONLY adds the new flag, + * updates obd_connect_names[], adds the flag to check_obd_connect_data(), + * and updates wiretests accordingly, so it can be approved and landed easily + * to reserve the flag for future use. + */ + +#define OCD_HAS_FLAG(ocd, flg) \ + (!!((ocd)->ocd_connect_flags & OBD_CONNECT_##flg)) + + +#ifdef HAVE_LRU_RESIZE_SUPPORT +#define LRU_RESIZE_CONNECT_FLAG OBD_CONNECT_LRU_RESIZE +#else +#define LRU_RESIZE_CONNECT_FLAG 0 +#endif + +#define MDT_CONNECT_SUPPORTED (OBD_CONNECT_RDONLY | OBD_CONNECT_VERSION | \ + OBD_CONNECT_ACL | OBD_CONNECT_XATTR | \ + OBD_CONNECT_IBITS | OBD_CONNECT_NODEVOH | \ + OBD_CONNECT_ATTRFID | OBD_CONNECT_CANCELSET | \ + OBD_CONNECT_AT | OBD_CONNECT_BRW_SIZE | \ + OBD_CONNECT_MDS_MDS | OBD_CONNECT_FID | \ + LRU_RESIZE_CONNECT_FLAG | OBD_CONNECT_VBR | \ + OBD_CONNECT_LOV_V3 | OBD_CONNECT_FULL20 | \ + OBD_CONNECT_64BITHASH | OBD_CONNECT_JOBSTATS | \ + OBD_CONNECT_EINPROGRESS | \ + OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_UMASK | \ + OBD_CONNECT_LVB_TYPE | OBD_CONNECT_LAYOUTLOCK |\ + OBD_CONNECT_PINGLESS | OBD_CONNECT_MAX_EASIZE |\ + OBD_CONNECT_FLOCK_DEAD | \ + OBD_CONNECT_DISP_STRIPE | OBD_CONNECT_LFSCK | \ + OBD_CONNECT_OPEN_BY_FID | \ + OBD_CONNECT_DIR_STRIPE | OBD_CONNECT_GRANT | \ + OBD_CONNECT_SRVLOCK | OBD_CONNECT_BULK_MBITS |\ + OBD_CONNECT_CKSUM |\ + OBD_CONNECT_MULTIMODRPCS |\ + OBD_CONNECT_SUBTREE | OBD_CONNECT_LARGE_ACL |\ + OBD_CONNECT_GRANT_PARAM | \ + OBD_CONNECT_GRANT_SHRINK | \ + OBD_CONNECT_SHORTIO | OBD_CONNECT_FLAGS2) + +#define MDT_CONNECT_SUPPORTED2 (OBD_CONNECT2_FILE_SECCTX | \ + OBD_CONNECT2_DIR_MIGRATE | \ + OBD_CONNECT2_SUM_STATFS | \ + OBD_CONNECT2_OVERSTRIPING | \ + OBD_CONNECT2_FLR |\ + OBD_CONNECT2_LOCK_CONVERT | \ + OBD_CONNECT2_ARCHIVE_ID_ARRAY | \ + OBD_CONNECT2_INC_XID | \ + OBD_CONNECT2_SELINUX_POLICY | \ + OBD_CONNECT2_LSOM | \ + OBD_CONNECT2_ASYNC_DISCARD | \ + OBD_CONNECT2_PCC | \ + OBD_CONNECT2_MDLL_BYPASS | \ + OBD_CONNECT2_MDLL | \ + OBD_CONNECT2_MDLL_AUTO_REFRESH | \ + OBD_CONNECT2_CRUSH | \ + OBD_CONNECT2_ENCRYPT | \ + OBD_CONNECT2_GETATTR_PFID |\ + OBD_CONNECT2_LSEEK | OBD_CONNECT2_DOM_LVB |\ + OBD_CONNECT2_REP_MBITS | \ + OBD_CONNECT2_ATOMIC_OPEN_LOCK | \ + OBD_CONNECT2_ENCRYPT_NAME) + +#define OST_CONNECT_SUPPORTED (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \ + OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \ + OBD_CONNECT_INDEX | \ + OBD_CONNECT_BRW_SIZE | OBD_CONNECT_CANCELSET | \ + OBD_CONNECT_AT | LRU_RESIZE_CONNECT_FLAG | \ + OBD_CONNECT_CKSUM | OBD_CONNECT_VBR | \ + OBD_CONNECT_MDS | OBD_CONNECT_SKIP_ORPHAN | \ + OBD_CONNECT_GRANT_SHRINK | OBD_CONNECT_FULL20 |\ + OBD_CONNECT_64BITHASH | OBD_CONNECT_MAXBYTES | \ + OBD_CONNECT_MAX_EASIZE | \ + OBD_CONNECT_EINPROGRESS | \ + OBD_CONNECT_JOBSTATS | \ + OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_LVB_TYPE|\ + OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_FID | \ + OBD_CONNECT_PINGLESS | OBD_CONNECT_LFSCK | \ + OBD_CONNECT_BULK_MBITS | \ + OBD_CONNECT_GRANT_PARAM | \ + OBD_CONNECT_SHORTIO | OBD_CONNECT_FLAGS2) + +#define OST_CONNECT_SUPPORTED2 (OBD_CONNECT2_LOCKAHEAD | OBD_CONNECT2_INC_XID |\ + OBD_CONNECT2_ENCRYPT | OBD_CONNECT2_LSEEK |\ + OBD_CONNECT2_REP_MBITS) + +#define ECHO_CONNECT_SUPPORTED (OBD_CONNECT_FID | OBD_CONNECT_FLAGS2) +#define ECHO_CONNECT_SUPPORTED2 OBD_CONNECT2_REP_MBITS + +#define MGS_CONNECT_SUPPORTED (OBD_CONNECT_VERSION | OBD_CONNECT_AT | \ + OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV | \ + 
OBD_CONNECT_PINGLESS |\ + OBD_CONNECT_BULK_MBITS | OBD_CONNECT_BARRIER | \ + OBD_CONNECT_FLAGS2) + +#define MGS_CONNECT_SUPPORTED2 OBD_CONNECT2_REP_MBITS + +/* Features required for this version of the client to work with server */ +#define CLIENT_CONNECT_MDT_REQD (OBD_CONNECT_FID | \ + OBD_CONNECT_ATTRFID | \ + OBD_CONNECT_FULL20) + +/* This structure is used for both request and reply. + * + * If we eventually have separate connect data for different types, which we + * almost certainly will, then perhaps we stick a union in here. */ +struct obd_connect_data { + __u64 ocd_connect_flags; /* OBD_CONNECT_* per above */ + __u32 ocd_version; /* lustre release version number */ + __u32 ocd_grant; /* initial cache grant amount (bytes) */ + __u32 ocd_index; /* LOV index to connect to */ + __u32 ocd_brw_size; /* Maximum BRW size in bytes */ + __u64 ocd_ibits_known; /* inode bits this client understands */ + __u8 ocd_grant_blkbits; /* log2 of the backend filesystem blocksize */ + __u8 ocd_grant_inobits; /* log2 of the per-inode space consumption */ + __u16 ocd_grant_tax_kb; /* extent insertion overhead, in 1K blocks */ + __u32 ocd_grant_max_blks;/* maximum number of blocks per extent */ + __u64 ocd_transno; /* first transno from client to be replayed */ + __u32 ocd_group; /* MDS group on OST */ + __u32 ocd_cksum_types; /* supported checksum algorithms */ + __u32 ocd_max_easize; /* How big LOV EA can be on MDS */ + __u32 ocd_instance; /* instance # of this target */ + __u64 ocd_maxbytes; /* Maximum stripe size in bytes */ + /* Fields after ocd_maxbytes are only accessible by the receiver + * if the corresponding flag in ocd_connect_flags is set. Accessing + * any field after ocd_maxbytes on the receiver without a valid flag + * may result in out-of-bound memory access and kernel oops. */ + __u16 ocd_maxmodrpcs; /* Maximum modify RPCs in parallel */ + __u16 padding0; /* added 2.1.0. also fix lustre_swab_connect */ + __u32 padding1; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 ocd_connect_flags2; + __u64 padding3; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding4; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding5; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding6; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding7; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding8; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding9; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingA; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingB; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingC; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingD; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingE; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingF; /* added 2.1.0. also fix lustre_swab_connect */ +}; +/* XXX README XXX: + * Please DO NOT use any fields here before first ensuring that this same + * field is not in use on some other branch. Please clear any such changes + * with senior engineers before starting to use a new field. Then, submit + * a small patch against EVERY branch that ONLY adds the new field along with + * the matching OBD_CONNECT flag, so that can be approved and landed easily to + * reserve the flag for future use. */ + +/* + * Supported checksum algorithms. Up to 32 checksum types are supported. 
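The warning above about fields past ocd_maxbytes deserves a concrete shape. A sketch of a safe feature test on a received obd_connect_data (the helper name is illustrative):

static inline bool ocd_supports_lseek(const struct obd_connect_data *ocd)
{
	/* ocd_connect_flags2 is only meaningful on the receiver once
	 * OBD_CONNECT_FLAGS2 is seen in ocd_connect_flags */
	if (!OCD_HAS_FLAG(ocd, FLAGS2))
		return false;

	return !!(ocd->ocd_connect_flags2 & OBD_CONNECT2_LSEEK);
}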
+ * (32-bit mask stored in obd_connect_data::ocd_cksum_types) + * Please update DECLARE_CKSUM_NAME in obd_cksum.h when adding a new + * algorithm and also the OBD_FL_CKSUM* flags, OBD_CKSUM_ALL flag, + * OBD_FL_CKSUM_ALL flag and potentially OBD_CKSUM_T10_ALL flag. + */ +enum cksum_types { + OBD_CKSUM_CRC32 = 0x00000001, + OBD_CKSUM_ADLER = 0x00000002, + OBD_CKSUM_CRC32C = 0x00000004, + OBD_CKSUM_RESERVED = 0x00000008, + OBD_CKSUM_T10IP512 = 0x00000010, + OBD_CKSUM_T10IP4K = 0x00000020, + OBD_CKSUM_T10CRC512 = 0x00000040, + OBD_CKSUM_T10CRC4K = 0x00000080, +}; + +#define OBD_CKSUM_T10_ALL (OBD_CKSUM_T10IP512 | OBD_CKSUM_T10IP4K | \ + OBD_CKSUM_T10CRC512 | OBD_CKSUM_T10CRC4K) + +#define OBD_CKSUM_ALL (OBD_CKSUM_CRC32 | OBD_CKSUM_ADLER | OBD_CKSUM_CRC32C | \ + OBD_CKSUM_T10_ALL) + +/* + * The default checksum algorithm used on top of T10PI GRD tags for RPC. + * Considering that the checksum-of-checksums is only computing CRC32 on a + * 4KB chunk of GRD tags for a 1MB RPC for 512B sectors, or 16KB of GRD + * tags for 16MB of 4KB sectors, this is only 1/256 or 1/1024 of the + * total data being checksummed, so the checksum type used here should not + * affect overall system performance noticeably. + */ +#define OBD_CKSUM_T10_TOP OBD_CKSUM_ADLER + +/* + * OST requests: OBDO & OBD request records + */ + +/* opcodes */ +enum ost_cmd { + OST_REPLY = 0, /* reply ? */ + OST_GETATTR = 1, + OST_SETATTR = 2, + OST_READ = 3, + OST_WRITE = 4, + OST_CREATE = 5, + OST_DESTROY = 6, + OST_GET_INFO = 7, + OST_CONNECT = 8, + OST_DISCONNECT = 9, + OST_PUNCH = 10, + OST_OPEN = 11, + OST_CLOSE = 12, + OST_STATFS = 13, + OST_SYNC = 16, + OST_SET_INFO = 17, + OST_QUOTACHECK = 18, /* not used since 2.4 */ + OST_QUOTACTL = 19, + OST_QUOTA_ADJUST_QUNIT = 20, /* not used since 2.4 */ + OST_LADVISE = 21, + OST_FALLOCATE = 22, + OST_SEEK = 23, + OST_LAST_OPC /* must be < 33 to avoid MDS_GETATTR */ +}; +#define OST_FIRST_OPC OST_REPLY + +enum obdo_flags { + OBD_FL_INLINEDATA = 0x00000001, + OBD_FL_OBDMDEXISTS = 0x00000002, + OBD_FL_DELORPHAN = 0x00000004, /* if set in o_flags delete orphans */ + OBD_FL_NORPC = 0x00000008, /* set in o_flags do in OSC not OST */ + OBD_FL_IDONLY = 0x00000010, /* set in o_flags only adjust obj id*/ + OBD_FL_RECREATE_OBJS= 0x00000020, /* recreate missing obj */ + OBD_FL_DEBUG_CHECK = 0x00000040, /* echo client/server debug check */ + OBD_FL_NO_PRJQUOTA = 0x00000080, /* the object's project is over + * quota */ + OBD_FL_NO_USRQUOTA = 0x00000100, /* the object's owner is over quota */ + OBD_FL_NO_GRPQUOTA = 0x00000200, /* the object's group is over quota */ + OBD_FL_CREATE_CROW = 0x00000400, /* object should be create on write */ + OBD_FL_SRVLOCK = 0x00000800, /* delegate DLM locking to server */ + OBD_FL_CKSUM_CRC32 = 0x00001000, /* CRC32 checksum type */ + OBD_FL_CKSUM_ADLER = 0x00002000, /* ADLER checksum type */ + OBD_FL_CKSUM_CRC32C = 0x00004000, /* CRC32C checksum type */ + OBD_FL_CKSUM_T10IP512 = 0x00005000, /* T10PI IP cksum, 512B sector */ + OBD_FL_CKSUM_T10IP4K = 0x00006000, /* T10PI IP cksum, 4KB sector */ + OBD_FL_CKSUM_T10CRC512 = 0x00007000, /* T10PI CRC cksum, 512B sector */ + OBD_FL_CKSUM_T10CRC4K = 0x00008000, /* T10PI CRC cksum, 4KB sector */ + OBD_FL_CKSUM_RSVD3 = 0x00010000, /* for future cksum types */ + OBD_FL_SHRINK_GRANT = 0x00020000, /* object shrink the grant */ + OBD_FL_MMAP = 0x00040000, /* object is mmapped on the client. 
+ * XXX: obsoleted - reserved for old
+ * clients prior to 2.2 */
+ OBD_FL_RECOV_RESEND = 0x00080000, /* recoverable resent */
+ OBD_FL_NOSPC_BLK = 0x00100000, /* no more block space on OST */
+ OBD_FL_FLUSH = 0x00200000, /* flush pages on the OST */
+ OBD_FL_SHORT_IO = 0x00400000, /* short io request */
+ OBD_FL_ROOT_SQUASH = 0x00800000, /* root squash */
+ /* OBD_FL_LOCAL_MASK = 0xF0000000, was local-only flags until 2.10 */
+
+ /*
+ * Note that while the original checksum values were separate bits,
+ * in 2.x we can actually allow all values from 1-31. T10-PI checksum
+ * types already use values which are not separate bits.
+ */
+ OBD_FL_CKSUM_ALL = OBD_FL_CKSUM_CRC32 | OBD_FL_CKSUM_ADLER |
+ OBD_FL_CKSUM_CRC32C | OBD_FL_CKSUM_T10IP512 |
+ OBD_FL_CKSUM_T10IP4K | OBD_FL_CKSUM_T10CRC512 |
+ OBD_FL_CKSUM_T10CRC4K,
+
+ OBD_FL_NO_QUOTA_ALL = OBD_FL_NO_USRQUOTA | OBD_FL_NO_GRPQUOTA |
+ OBD_FL_NO_PRJQUOTA,
+};
+
+/*
+ * All LOV EA magics should have the same postfix. If some new version of
+ * Lustre introduces a new LOV EA magic, then when downgrading to an old
+ * Lustre, even though the old version does not recognize such a new
+ * magic, it can still distinguish the corrupted cases by checking
+ * the magic's postfix.
+ */
+#define LOV_MAGIC_MAGIC 0x0BD0
+#define LOV_MAGIC_MASK 0xFFFF
+
+#define LOV_MAGIC_V1 (0x0BD10000 | LOV_MAGIC_MAGIC)
+#define LOV_MAGIC_JOIN_V1 (0x0BD20000 | LOV_MAGIC_MAGIC)
+#define LOV_MAGIC_V3 (0x0BD30000 | LOV_MAGIC_MAGIC)
+#define LOV_MAGIC_MIGRATE (0x0BD40000 | LOV_MAGIC_MAGIC)
+/* reserved for specifying OSTs */
+#define LOV_MAGIC_SPECIFIC (0x0BD50000 | LOV_MAGIC_MAGIC)
+#define LOV_MAGIC LOV_MAGIC_V1
+#define LOV_MAGIC_COMP_V1 (0x0BD60000 | LOV_MAGIC_MAGIC)
+#define LOV_MAGIC_FOREIGN (0x0BD70000 | LOV_MAGIC_MAGIC)
+#define LOV_MAGIC_SEL (0x0BD80000 | LOV_MAGIC_MAGIC)
+
+/*
+ * magic for fully defined striping
+ * the idea is that we should have different magics for striping "hints"
+ * (struct lov_user_md_v[13]) and defined ready-to-use striping (struct
+ * lov_mds_md_v[13]). at the moment the magics are used in the wire protocol,
+ * so we can't just change them without lengthy preparation, but we still need
+ * a mechanism to allow LOD to differentiate hints from ready striping.
+ * so, at the moment we do a trick: the MDT knows what to expect from a request
+ * depending on the case (replay uses ready striping, non-replay requests use
+ * hints), so the MDT replaces the magic with the appropriate one and LOD can
+ * easily understand what's inside -bzzz
+ *
+ * those *_DEF magics are only used on server side internally, they
+ * won't be put on wire or disk.
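The postfix scheme just described can be tested mechanically. This sketch shows how even an old client can separate "unknown newer layout" from "corrupted EA"; the helper name is illustrative:

static inline bool lov_magic_plausible(__u32 magic)
{
	/* every released LOV EA magic, known to this version or not,
	 * ends in LOV_MAGIC_MAGIC; anything else is corruption */
	return (magic & LOV_MAGIC_MASK) == LOV_MAGIC_MAGIC;
}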
+ */ +#define LOV_MAGIC_DEFINED 0x10000000 +#define LOV_MAGIC_V1_DEFINED (LOV_MAGIC_DEFINED | LOV_MAGIC_V1) +#define LOV_MAGIC_V3_DEFINED (LOV_MAGIC_DEFINED | LOV_MAGIC_V3) +#define LOV_MAGIC_COMP_V1_DEFINED (LOV_MAGIC_DEFINED | LOV_MAGIC_COMP_V1) + +#define lov_pattern(pattern) (pattern & ~LOV_PATTERN_F_MASK) +#define lov_pattern_flags(pattern) (pattern & LOV_PATTERN_F_MASK) + +#define lov_ost_data lov_ost_data_v1 +struct lov_ost_data_v1 { /* per-stripe data structure (little-endian)*/ + struct ost_id l_ost_oi; /* OST object ID */ + __u32 l_ost_gen; /* generation of this l_ost_idx */ + __u32 l_ost_idx; /* OST index in LOV (lov_tgt_desc->tgts) */ +}; + +#define lov_mds_md lov_mds_md_v1 +struct lov_mds_md_v1 { /* LOV EA mds/wire data (little-endian) */ + __u32 lmm_magic; /* magic number = LOV_MAGIC_V1 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* LOV object ID */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + /* lmm_stripe_count used to be __u32 */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + __u16 lmm_layout_gen; /* layout generation number */ + struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +}; + +#define MAX_MD_SIZE_OLD (sizeof(struct lov_mds_md) + \ + 4 * sizeof(struct lov_ost_data)) +#define MAX_MD_SIZE (sizeof(struct lov_comp_md_v1) + \ + 4 * (sizeof(struct lov_comp_md_entry_v1) + \ + MAX_MD_SIZE_OLD)) +#define MIN_MD_SIZE (sizeof(struct lov_mds_md) + 1 * sizeof(struct lov_ost_data)) + +/* This is the default MDT reply size allocated, should the striping be bigger, + * it will be reallocated in mdt_fix_reply. + * 100 stripes is a bit less than 2.5k of data */ +#define DEF_REP_MD_SIZE (sizeof(struct lov_mds_md) + \ + 100 * sizeof(struct lov_ost_data)) + +#define XATTR_NAME_ACL_ACCESS "system.posix_acl_access" +#define XATTR_NAME_ACL_DEFAULT "system.posix_acl_default" +#define XATTR_USER_PREFIX "user." +#define XATTR_TRUSTED_PREFIX "trusted." +#define XATTR_SECURITY_PREFIX "security." +#define XATTR_ENCRYPTION_PREFIX "encryption." + +#define XATTR_NAME_SOM "trusted.som" +#define XATTR_NAME_LOV "trusted.lov" +#define XATTR_NAME_LMA "trusted.lma" +#define XATTR_NAME_LMV "trusted.lmv" +#define XATTR_NAME_DEFAULT_LMV "trusted.dmv" +#define XATTR_NAME_LINK "trusted.link" +#define XATTR_NAME_FID "trusted.fid" +#define XATTR_NAME_VERSION "trusted.version" +#define XATTR_NAME_SOM "trusted.som" +#define XATTR_NAME_HSM "trusted.hsm" +#define XATTR_NAME_LFSCK_BITMAP "trusted.lfsck_bitmap" +#define XATTR_NAME_DUMMY "trusted.dummy" +#define XATTR_NAME_PROJID "trusted.projid" + +#define LL_XATTR_NAME_ENCRYPTION_CONTEXT_OLD XATTR_SECURITY_PREFIX"c" +#define LL_XATTR_NAME_ENCRYPTION_CONTEXT XATTR_ENCRYPTION_PREFIX"c" + +#define XATTR_NAME_LFSCK_NAMESPACE "trusted.lfsck_ns" +#define XATTR_NAME_MAX_LEN 32 /* increase this, if there is longer name. 
*/ + +struct lov_mds_md_v3 { /* LOV EA mds/wire data (little-endian) */ + __u32 lmm_magic; /* magic number = LOV_MAGIC_V3 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* LOV object ID */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + /* lmm_stripe_count used to be __u32 */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + __u16 lmm_layout_gen; /* layout generation number */ + char lmm_pool_name[LOV_MAXPOOLNAME + 1]; /* must be 32bit aligned */ + struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +}; + +static inline __u32 lov_mds_md_size(__u16 stripes, __u32 lmm_magic) +{ + if (stripes == (__u16)-1) + stripes = 0; + + if (lmm_magic == LOV_MAGIC_V3) + return sizeof(struct lov_mds_md_v3) + + stripes * sizeof(struct lov_ost_data_v1); + else + return sizeof(struct lov_mds_md_v1) + + stripes * sizeof(struct lov_ost_data_v1); +} + +static inline __u32 +lov_mds_md_max_stripe_count(__kernel_size_t buf_size, __u32 lmm_magic) +{ + switch (lmm_magic) { + case LOV_MAGIC_V1: { + struct lov_mds_md_v1 lmm; + + if (buf_size < sizeof(lmm)) + return 0; + + return (buf_size - sizeof(lmm)) / sizeof(lmm.lmm_objects[0]); + } + case LOV_MAGIC_V3: { + struct lov_mds_md_v3 lmm; + + if (buf_size < sizeof(lmm)) + return 0; + + return (buf_size - sizeof(lmm)) / sizeof(lmm.lmm_objects[0]); + } + default: + return 0; + } +} + +#define OBD_MD_FLID (0x00000001ULL) /* object ID */ +#define OBD_MD_FLATIME (0x00000002ULL) /* access time */ +#define OBD_MD_FLMTIME (0x00000004ULL) /* data modification time */ +#define OBD_MD_FLCTIME (0x00000008ULL) /* change time */ +#define OBD_MD_FLSIZE (0x00000010ULL) /* size */ +#define OBD_MD_FLBLOCKS (0x00000020ULL) /* allocated blocks count */ +#define OBD_MD_FLBLKSZ (0x00000040ULL) /* block size */ +#define OBD_MD_FLMODE (0x00000080ULL) /* access bits (mode & ~S_IFMT) */ +#define OBD_MD_FLTYPE (0x00000100ULL) /* object type (mode & S_IFMT) */ +#define OBD_MD_FLUID (0x00000200ULL) /* user ID */ +#define OBD_MD_FLGID (0x00000400ULL) /* group ID */ +#define OBD_MD_FLFLAGS (0x00000800ULL) /* flags word */ +#define OBD_MD_DOM_SIZE (0X00001000ULL) /* Data-on-MDT component size */ +#define OBD_MD_FLNLINK (0x00002000ULL) /* link count */ +#define OBD_MD_FLPARENT (0x00004000ULL) /* parent FID */ +#define OBD_MD_LAYOUT_VERSION (0x00008000ULL) /* OST object layout version */ +#define OBD_MD_FLRDEV (0x00010000ULL) /* device number */ +#define OBD_MD_FLEASIZE (0x00020000ULL) /* extended attribute data */ +#define OBD_MD_LINKNAME (0x00040000ULL) /* symbolic link target */ +#define OBD_MD_FLHANDLE (0x00080000ULL) /* file/lock handle */ +#define OBD_MD_FLCKSUM (0x00100000ULL) /* bulk data checksum */ +/* OBD_MD_FLQOS (0x00200000ULL) has never been used */ +/* OBD_MD_FLCOOKIE (0x00800000ULL) obsolete in 2.8 */ +#define OBD_MD_FLPRJQUOTA (0x00400000ULL) /* over quota flags sent from ost */ +#define OBD_MD_FLGROUP (0x01000000ULL) /* group */ +#define OBD_MD_FLFID (0x02000000ULL) /* ->ost write inline fid */ +/* OBD_MD_FLEPOCH (0x04000000ULL) obsolete 2.7.50 */ + /* ->mds if epoch opens or closes */ +#define OBD_MD_FLGRANT (0x08000000ULL) /* ost preallocation space grant */ +#define OBD_MD_FLDIREA (0x10000000ULL) /* dir's extended attribute data */ +#define OBD_MD_FLUSRQUOTA (0x20000000ULL) /* over quota flags sent from ost */ +#define OBD_MD_FLGRPQUOTA (0x40000000ULL) /* over quota flags sent from ost */ +#define OBD_MD_FLMODEASIZE (0x80000000ULL) /* EA size will be changed */ + +#define OBD_MD_MDS 
(0x0000000100000000ULL) /* where an inode lives on */ +/* OBD_MD_REINT (0x0000000200000000ULL) obsolete 1.8 */ +#define OBD_MD_MEA (0x0000000400000000ULL) /* CMD split EA */ +#define OBD_MD_TSTATE (0x0000000800000000ULL) /* transient state field */ + +#define OBD_MD_FLXATTR (0x0000001000000000ULL) /* xattr */ +#define OBD_MD_FLXATTRLS (0x0000002000000000ULL) /* xattr list */ +#define OBD_MD_FLXATTRRM (0x0000004000000000ULL) /* xattr remove */ +#define OBD_MD_FLACL (0x0000008000000000ULL) /* ACL */ +#define OBD_MD_FLAGSTATFS (0x0000010000000000ULL) /* aggregated statfs */ +/* OBD_MD_FLMDSCAPA (0x0000020000000000ULL) obsolete 2.7.54 */ +/* OBD_MD_FLOSSCAPA (0x0000040000000000ULL) obsolete 2.7.54 */ +/* OBD_MD_FLCKSPLIT (0x0000080000000000ULL) obsolete 2.3.58*/ +#define OBD_MD_FLCROSSREF (0x0000100000000000ULL) /* Cross-ref case */ +#define OBD_MD_FLGETATTRLOCK (0x0000200000000000ULL) /* Get IOEpoch attributes + * under lock; for xattr + * requests means the + * client holds the lock */ +#define OBD_MD_FLOBJCOUNT (0x0000400000000000ULL) /* for multiple destroy */ + +#define OBD_MD_FLDATAVERSION (0x0010000000000000ULL) /* iversion sum */ +#define OBD_MD_CLOSE_INTENT_EXECED (0x0020000000000000ULL) /* close intent + executed */ + +#define OBD_MD_DEFAULT_MEA (0x0040000000000000ULL) /* default MEA */ +#define OBD_MD_FLOSTLAYOUT (0x0080000000000000ULL) /* contain ost_layout */ +#define OBD_MD_FLPROJID (0x0100000000000000ULL) /* project ID */ +#define OBD_MD_SECCTX (0x0200000000000000ULL) /* embed security xattr */ +#define OBD_MD_FLLAZYSIZE (0x0400000000000000ULL) /* Lazy size */ +#define OBD_MD_FLLAZYBLOCKS (0x0800000000000000ULL) /* Lazy blocks */ +#define OBD_MD_FLBTIME (0x1000000000000000ULL) /* birth time */ +#define OBD_MD_ENCCTX (0x2000000000000000ULL) /* embed encryption ctx */ +#define OBD_MD_NAMEHASH (0x4000000000000000ULL) /* use hash instead of name + * in case of encryption + */ + +#define OBD_MD_FLALLQUOTA (OBD_MD_FLUSRQUOTA | \ + OBD_MD_FLGRPQUOTA | \ + OBD_MD_FLPRJQUOTA) + +#define OBD_MD_FLGETATTR (OBD_MD_FLID | OBD_MD_FLATIME | OBD_MD_FLMTIME | \ + OBD_MD_FLCTIME | OBD_MD_FLSIZE | OBD_MD_FLBLKSZ | \ + OBD_MD_FLMODE | OBD_MD_FLTYPE | OBD_MD_FLUID | \ + OBD_MD_FLGID | OBD_MD_FLFLAGS | OBD_MD_FLNLINK | \ + OBD_MD_FLPARENT | OBD_MD_FLRDEV | OBD_MD_FLGROUP | \ + OBD_MD_FLPROJID | OBD_MD_FLBTIME) + +#define OBD_MD_FLXATTRALL (OBD_MD_FLXATTR | OBD_MD_FLXATTRLS) + +/* don't forget obdo_fid which is way down at the bottom so it can + * come after the definition of llog_cookie */ + +enum hss_valid { + HSS_SETMASK = 0x01, + HSS_CLEARMASK = 0x02, + HSS_ARCHIVE_ID = 0x04, +}; + +struct hsm_state_set { + __u32 hss_valid; + __u32 hss_archive_id; + __u64 hss_setmask; + __u64 hss_clearmask; +}; + +/* ost_body.data values for OST_BRW */ + +#define OBD_BRW_READ 0x01 +#define OBD_BRW_WRITE 0x02 +#define OBD_BRW_RWMASK (OBD_BRW_READ | OBD_BRW_WRITE) +#define OBD_BRW_NDELAY 0x04 /* Non-delay RPC should be issued for + * this page. Non-delay RPCs have bit + * rq_no_delay set. */ +#define OBD_BRW_SYNC 0x08 /* this page is a part of synchronous + * transfer and is not accounted in + * the grant. 
 */
+#define OBD_BRW_CHECK 0x10
+#define OBD_BRW_FROM_GRANT 0x20 /* the osc manages this under llite */
+#define OBD_BRW_GRANTED 0x40 /* the ost manages this */
+/* OBD_BRW_NOCACHE is currently neither set nor tested */
+#define OBD_BRW_NOCACHE 0x80 /* this page is a part of non-cached IO */
+#define OBD_BRW_NOQUOTA 0x100 /* do not enforce quota */
+#define OBD_BRW_SRVLOCK 0x200 /* Client holds no lock over this page */
+#define OBD_BRW_ASYNC 0x400 /* Server may delay commit to disk */
+#define OBD_BRW_MEMALLOC 0x800 /* Client runs in the "kswapd" context */
+#define OBD_BRW_OVER_USRQUOTA 0x1000 /* Running out of user quota */
+#define OBD_BRW_OVER_GRPQUOTA 0x2000 /* Running out of group quota */
+#define OBD_BRW_SOFT_SYNC 0x4000 /* This flag notifies the server
+ * that the client is running low on
+ * space for unstable pages; asking
+ * it to sync quickly */
+#define OBD_BRW_OVER_PRJQUOTA 0x8000 /* Running out of project quota */
+#define OBD_BRW_RDMA_ONLY 0x20000 /* RPC contains RDMA-only pages */
+#define OBD_BRW_SYS_RESOURCE 0x40000 /* page has CAP_SYS_RESOURCE */
+
+#define OBD_BRW_OVER_ALLQUOTA (OBD_BRW_OVER_USRQUOTA | \
+ OBD_BRW_OVER_GRPQUOTA | \
+ OBD_BRW_OVER_PRJQUOTA)
+
+#define OBD_BRW_DONE 0x40000000UL /*
+ * osd-ldiskfs internal,
+ * IO has been issued before
+ */
+#define OBD_BRW_LOCAL1 0x80000000UL /*
+ * osd-ldiskfs internal,
+ * page mapped to real block
+ */
+
+#define OBD_BRW_LOCALS (OBD_BRW_LOCAL1 | OBD_BRW_DONE)
+
+#define OBD_MAX_GRANT 0x7fffffffUL /* Max grant allowed to one client: 2 GiB */
+
+#define OBD_OBJECT_EOF LUSTRE_EOF
+
+#define OST_MIN_PRECREATE 32
+#define OST_MAX_PRECREATE 20000
+
+struct obd_ioobj {
+ struct ost_id ioo_oid; /* object ID, if multi-obj BRW */
+ __u32 ioo_max_brw; /* low 16 bits were o_mode before 2.4,
+ * now (PTLRPC_BULK_OPS_COUNT - 1) in
+ * high 16 bits in 2.4 and later */
+ __u32 ioo_bufcnt; /* number of niobufs for this object */
+};
+
+/* NOTE: IOOBJ_MAX_BRW_BITS defines the _offset_ of the max_brw field in
+ * ioo_max_brw, NOT the maximum number of bits in PTLRPC_BULK_OPS_BITS.
+ * That said, ioo_max_brw is a 32-bit field so the limit is also 16 bits. */
+#define IOOBJ_MAX_BRW_BITS 16
+#define ioobj_max_brw_get(ioo) (((ioo)->ioo_max_brw >> IOOBJ_MAX_BRW_BITS) + 1)
+#define ioobj_max_brw_set(ioo, num) \
+do { (ioo)->ioo_max_brw = ((num) - 1) << IOOBJ_MAX_BRW_BITS; } while (0)
+
+/* multiple of 8 bytes => can array */
+struct niobuf_remote {
+ __u64 rnb_offset;
+ __u32 rnb_len;
+ __u32 rnb_flags;
+};
+
+/* lock value block communicated between the filter and llite */
+
+/* OST_LVB_ERR_INIT is needed because the return code in rc is
+ * negative, i.e. because ((MASK + rc) & MASK) != MASK.
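Since ioo_max_brw stores the bulk RPC count biased by one (so that a zero-filled field from an old client decodes as a single RPC), the two macros above round-trip as in this sketch; the function is illustrative only:

static inline void ioobj_max_brw_example(struct obd_ioobj *ioo)
{
	ioobj_max_brw_set(ioo, 8);	/* stores (8 - 1) << IOOBJ_MAX_BRW_BITS */
	/* ioobj_max_brw_get(ioo) now returns 8; on a zeroed field
	 * it would return 1 */
}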
*/ +#define OST_LVB_ERR_INIT 0xffbadbad80000000ULL +#define OST_LVB_ERR_MASK 0xffbadbad00000000ULL +#define OST_LVB_IS_ERR(blocks) \ + ((blocks & OST_LVB_ERR_MASK) == OST_LVB_ERR_MASK) +#define OST_LVB_SET_ERR(blocks, rc) \ + do { blocks = OST_LVB_ERR_INIT + rc; } while (0) +#define OST_LVB_GET_ERR(blocks) (int)(blocks - OST_LVB_ERR_INIT) + +struct ost_lvb_v1 { + __u64 lvb_size; + __s64 lvb_mtime; + __s64 lvb_atime; + __s64 lvb_ctime; + __u64 lvb_blocks; +}; + +struct ost_lvb { + __u64 lvb_size; + __s64 lvb_mtime; + __s64 lvb_atime; + __s64 lvb_ctime; + __u64 lvb_blocks; + __u32 lvb_mtime_ns; + __u32 lvb_atime_ns; + __u32 lvb_ctime_ns; + __u32 lvb_padding; +}; + +/* + * lquota data structures + */ + +/* The lquota_id structure is an union of all the possible identifier types that + * can be used with quota, this includes: + * - 64-bit user ID + * - 64-bit group ID + * - a FID which can be used for per-directory quota in the future */ +union lquota_id { + struct lu_fid qid_fid; /* FID for per-directory quota */ + __u64 qid_uid; /* user identifier */ + __u64 qid_gid; /* group identifier */ + __u64 qid_projid; /* project identifier */ +}; + +/* quotactl management */ +struct obd_quotactl { + __u32 qc_cmd; + __u32 qc_type; /* see Q_* flag below */ + __u32 qc_id; + __u32 qc_stat; + struct obd_dqinfo qc_dqinfo; + struct obd_dqblk qc_dqblk; + char qc_poolname[]; +}; + +#define Q_COPY(out, in, member) (out)->member = (in)->member + +/* NOTE: + * - in and out maybe a type of struct if_quotactl or struct obd_quotactl + * - in and out need not be of the same type. + */ +#define __QCTL_COPY(out, in, need_pname) \ +do { \ + Q_COPY(out, in, qc_cmd); \ + Q_COPY(out, in, qc_type); \ + Q_COPY(out, in, qc_id); \ + Q_COPY(out, in, qc_stat); \ + Q_COPY(out, in, qc_dqinfo); \ + Q_COPY(out, in, qc_dqblk); \ + if (need_pname && LUSTRE_Q_CMD_IS_POOL(in->qc_cmd)) { \ + size_t len = strnlen(in->qc_poolname, LOV_MAXPOOLNAME); \ + \ + memcpy(out->qc_poolname, in->qc_poolname, len); \ + out->qc_poolname[len] = '\0'; \ + } \ +} while (0) + +#define QCTL_COPY(out, in) __QCTL_COPY(out, in, true) +#define QCTL_COPY_NO_PNAME(out, in) __QCTL_COPY(out, in, false) + +/* Body of quota request used for quota acquire/release RPCs between quota + * master (aka QMT) and slaves (ak QSD). */ +struct quota_body { + struct lu_fid qb_fid; /* FID of global index packing the pool ID + * and type (data or metadata) as well as + * the quota type (user or group). */ + union lquota_id qb_id; /* uid or gid or directory FID */ + __u32 qb_flags; /* see below */ + __u32 qb_padding; + __u64 qb_count; /* acquire/release count (kbytes/inodes) */ + __u64 qb_usage; /* current slave usage (kbytes/inodes) */ + __u64 qb_slv_ver; /* slave index file version */ + struct lustre_handle qb_lockh; /* per-ID lock handle */ + struct lustre_handle qb_glb_lockh; /* global lock handle */ + __u64 qb_padding1[4]; +}; + +/* When the quota_body is used in the reply of quota global intent + * lock (IT_QUOTA_CONN) reply, qb_fid contains slave index file FID. 
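The OST_LVB_* macros defined earlier let a server smuggle a negative errno through the lvb_blocks field. A round-trip sketch, assuming -ENOENT as the sample error and the usual kernel errno definitions:

static inline int ost_lvb_err_example(struct ost_lvb *lvb)
{
	OST_LVB_SET_ERR(lvb->lvb_blocks, -ENOENT);
	if (OST_LVB_IS_ERR(lvb->lvb_blocks))
		return OST_LVB_GET_ERR(lvb->lvb_blocks);	/* -ENOENT */
	return 0;
}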
*/ +#define qb_slv_fid qb_fid +/* qb_usage is the current qunit (in kbytes/inodes) when quota_body is used in + * quota reply */ +#define qb_qunit qb_usage + +#define QUOTA_DQACQ_FL_ACQ 0x1 /* acquire quota */ +#define QUOTA_DQACQ_FL_PREACQ 0x2 /* pre-acquire */ +#define QUOTA_DQACQ_FL_REL 0x4 /* release quota */ +#define QUOTA_DQACQ_FL_REPORT 0x8 /* report usage */ + +/* Quota types currently supported */ +enum { + LQUOTA_TYPE_USR = 0x00, /* maps to USRQUOTA */ + LQUOTA_TYPE_GRP = 0x01, /* maps to GRPQUOTA */ + LQUOTA_TYPE_PRJ = 0x02, /* maps to PRJQUOTA */ + LQUOTA_TYPE_MAX +}; + +/* There are 2 different resource types on which a quota limit can be enforced: + * - inodes on the MDTs + * - blocks on the OSTs */ +enum { + LQUOTA_RES_MD = 0x01, /* skip 0 to avoid null oid in FID */ + LQUOTA_RES_DT = 0x02, + LQUOTA_LAST_RES, + LQUOTA_FIRST_RES = LQUOTA_RES_MD +}; +#define LQUOTA_NR_RES (LQUOTA_LAST_RES - LQUOTA_FIRST_RES + 1) + +/* + * Space accounting support + * Format of an accounting record, providing disk usage information for a given + * user or group + */ +struct lquota_acct_rec { /* 16 bytes */ + __u64 bspace; /* current space in use */ + __u64 ispace; /* current # inodes in use */ +}; + +/* + * Global quota index support + * Format of a global record, providing global quota settings for a given quota + * identifier + */ +struct lquota_glb_rec { /* 32 bytes */ + __u64 qbr_hardlimit; /* quota hard limit, in #inodes or kbytes */ + __u64 qbr_softlimit; /* quota soft limit, in #inodes or kbytes */ + __u64 qbr_time; /* grace time, in seconds */ + __u64 qbr_granted; /* how much is granted to slaves, in #inodes or + * kbytes */ +}; + +/* + * Slave index support + * Format of a slave record, recording how much space is granted to a given + * slave + */ +struct lquota_slv_rec { /* 8 bytes */ + __u64 qsr_granted; /* space granted to the slave for the key=ID, + * in #inodes or kbytes */ +}; + +/* Data structures associated with the quota locks */ + +/* Glimpse descriptor used for the index & per-ID quota locks */ +struct ldlm_gl_lquota_desc { + union lquota_id gl_id; /* quota ID subject to the glimpse */ + __u64 gl_flags; /* see LQUOTA_FL* below */ + __u64 gl_ver; /* new index version */ + __u64 gl_hardlimit; /* new hardlimit or qunit value */ + __u64 gl_softlimit; /* new softlimit */ + __u64 gl_time; + __u64 gl_pad2; +}; +#define gl_qunit gl_hardlimit /* current qunit value used when + * glimpsing per-ID quota locks */ + +/* quota glimpse flags */ +#define LQUOTA_FL_EDQUOT 0x1 /* user/group out of quota space on QMT */ + +/* LVB used with quota (global and per-ID) locks */ +struct lquota_lvb { + __u64 lvb_flags; /* see LQUOTA_FL* above */ + __u64 lvb_id_may_rel; /* space that might be released later */ + __u64 lvb_id_rel; /* space released by the slave for this ID */ + __u64 lvb_id_qunit; /* current qunit value */ + __u64 lvb_pad1; +}; + +/* LVB used with global quota lock */ +#define lvb_glb_ver lvb_id_may_rel /* current version of the global index */ + +/* op codes */ +enum quota_cmd { + QUOTA_DQACQ = 601, + QUOTA_DQREL = 602, + QUOTA_LAST_OPC +}; +#define QUOTA_FIRST_OPC QUOTA_DQACQ + +/* + * MDS REQ RECORDS + */ + +/* opcodes */ +enum mds_cmd { + MDS_GETATTR = 33, + MDS_GETATTR_NAME = 34, + MDS_CLOSE = 35, + MDS_REINT = 36, + MDS_READPAGE = 37, + MDS_CONNECT = 38, + MDS_DISCONNECT = 39, + MDS_GET_ROOT = 40, + MDS_STATFS = 41, + MDS_PIN = 42, /* obsolete, never used in a release */ + MDS_UNPIN = 43, /* obsolete, never used in a release */ + MDS_SYNC = 44, + MDS_DONE_WRITING = 45, /* 
obsolete since 2.8.0 */ + MDS_SET_INFO = 46, + MDS_QUOTACHECK = 47, /* not used since 2.4 */ + MDS_QUOTACTL = 48, + MDS_GETXATTR = 49, + MDS_SETXATTR = 50, /* obsolete, now it's MDS_REINT op */ + MDS_WRITEPAGE = 51, + MDS_IS_SUBDIR = 52, /* obsolete, never used in a release */ + MDS_GET_INFO = 53, + MDS_HSM_STATE_GET = 54, + MDS_HSM_STATE_SET = 55, + MDS_HSM_ACTION = 56, + MDS_HSM_PROGRESS = 57, + MDS_HSM_REQUEST = 58, + MDS_HSM_CT_REGISTER = 59, + MDS_HSM_CT_UNREGISTER = 60, + MDS_SWAP_LAYOUTS = 61, + MDS_RMFID = 62, + MDS_LAST_OPC +}; + +#define MDS_FIRST_OPC MDS_GETATTR + + +/* opcodes for object update */ +enum update_cmd { + OUT_UPDATE = 1000, + OUT_UPDATE_LAST_OPC +}; + +#define OUT_UPDATE_FIRST_OPC OUT_UPDATE + +/* + * Do not exceed 63 + */ + +enum mds_reint_op { + REINT_SETATTR = 1, + REINT_CREATE = 2, + REINT_LINK = 3, + REINT_UNLINK = 4, + REINT_RENAME = 5, + REINT_OPEN = 6, + REINT_SETXATTR = 7, + REINT_RMENTRY = 8, + REINT_MIGRATE = 9, + REINT_RESYNC = 10, + REINT_MAX +}; + +/* the disposition of the intent outlines what was executed */ +#define DISP_IT_EXECD 0x00000001 +#define DISP_LOOKUP_EXECD 0x00000002 +#define DISP_LOOKUP_NEG 0x00000004 +#define DISP_LOOKUP_POS 0x00000008 +#define DISP_OPEN_CREATE 0x00000010 +#define DISP_OPEN_OPEN 0x00000020 +#define DISP_ENQ_COMPLETE 0x00400000 /* obsolete and unused */ +#define DISP_ENQ_OPEN_REF 0x00800000 +#define DISP_ENQ_CREATE_REF 0x01000000 +#define DISP_OPEN_LOCK 0x02000000 +#define DISP_OPEN_LEASE 0x04000000 +#define DISP_OPEN_STRIPE 0x08000000 +#define DISP_OPEN_DENY 0x10000000 + +/* INODE LOCK PARTS */ +enum mds_ibits_locks { + MDS_INODELOCK_LOOKUP = 0x000001, /* For namespace, dentry etc. Was + * used to protect permission (mode, + * owner, group, etc) before 2.4. */ + MDS_INODELOCK_UPDATE = 0x000002, /* size, links, timestamps */ + MDS_INODELOCK_OPEN = 0x000004, /* For opened files */ + MDS_INODELOCK_LAYOUT = 0x000008, /* for layout */ + + /* The PERM bit is added in 2.4, and is used to protect permission + * (mode, owner, group, ACL, etc.) separate from LOOKUP lock. + * For remote directories (in DNE) these locks will be granted by + * different MDTs (different LDLM namespace). + * + * For local directory, the MDT always grants UPDATE|PERM together. + * For remote directory, master MDT (where remote directory is) grants + * UPDATE|PERM, and remote MDT (where name entry is) grants LOOKUP_LOCK. + */ + MDS_INODELOCK_PERM = 0x000010, + MDS_INODELOCK_XATTR = 0x000020, /* non-permission extended attrs */ + MDS_INODELOCK_DOM = 0x000040, /* Data for Data-on-MDT files */ + /* Do not forget to increase MDS_INODELOCK_NUMBITS when adding bits */ +}; +#define MDS_INODELOCK_NUMBITS 7 +/* This FULL lock is useful to take on unlink sort of operations */ +#define MDS_INODELOCK_FULL ((1 << MDS_INODELOCK_NUMBITS) - 1) +/* DOM lock shouldn't be canceled early, use this macro for ELC */ +#define MDS_INODELOCK_ELC (MDS_INODELOCK_FULL & ~MDS_INODELOCK_DOM) + +/* NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2], + * but was moved into name[1] along with the OID to avoid consuming the + * name[2,3] fields that need to be used for the quota id (also a FID). 
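Spelling out the ELC arithmetic above: with MDS_INODELOCK_NUMBITS == 7, FULL is 0x7f, and masking out MDS_INODELOCK_DOM (0x40) leaves 0x3f, so early lock cancel covers every ibit except DOM. As an illustrative snippet:

/* worked example: MDS_INODELOCK_ELC == 0x7f & ~0x40 == 0x3f,
 * i.e. LOOKUP | UPDATE | OPEN | LAYOUT | PERM | XATTR */
__u64 elc_ibits = MDS_INODELOCK_ELC;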
+ */
+enum {
+	LUSTRE_RES_ID_SEQ_OFF = 0,
+	LUSTRE_RES_ID_VER_OID_OFF = 1,
+	LUSTRE_RES_ID_WAS_VER_OFF = 2, /* see note above */
+	LUSTRE_RES_ID_QUOTA_SEQ_OFF = 2,
+	LUSTRE_RES_ID_QUOTA_VER_OID_OFF = 3,
+	LUSTRE_RES_ID_HSH_OFF = 3
+};
+
+#define MDS_STATUS_CONN 1
+#define MDS_STATUS_LOV 2
+
+enum {
+	/* these should be identical to their EXT4_*_FL counterparts, they are
+	 * redefined here only to avoid dragging in fs/ext4/ext4.h */
+	LUSTRE_SYNC_FL = 0x00000008, /* Synchronous updates */
+	LUSTRE_IMMUTABLE_FL = 0x00000010, /* Immutable file */
+	LUSTRE_APPEND_FL = 0x00000020, /* file writes may only append */
+	LUSTRE_NODUMP_FL = 0x00000040, /* do not dump file */
+	LUSTRE_NOATIME_FL = 0x00000080, /* do not update atime */
+	LUSTRE_INDEX_FL = 0x00001000, /* hash-indexed directory */
+	LUSTRE_DIRSYNC_FL = 0x00010000, /* dirsync behaviour (dir only) */
+	LUSTRE_TOPDIR_FL = 0x00020000, /* Top of directory hierarchies*/
+	LUSTRE_INLINE_DATA_FL = 0x10000000, /* Inode has inline data. */
+	LUSTRE_PROJINHERIT_FL = 0x20000000, /* Create with parents projid */
+
+	/* These flags are not identical to any EXT4_*_FL counterparts and
+	 * are reserved for Lustre use only. Note: these flags may conflict
+	 * with some EXT4 flags, so
+	 * 1. conflicting flags need to be removed when the flag is wired
+	 *    by la_flags, see osd_attr_get().
+	 * 2. if these flags need to be stored in the inode, they are
+	 *    stored in the LMA, see LMAI_XXXX */
+	LUSTRE_ORPHAN_FL = 0x00002000,
+	LUSTRE_SET_SYNC_FL = 0x00040000, /* Synchronous setattr on OSTs */
+	LUSTRE_ENCRYPT_FL = 0x00800000, /* encrypted file */
+
+	LUSTRE_LMA_FL_MASKS = LUSTRE_ENCRYPT_FL | LUSTRE_ORPHAN_FL,
+};
+
+#ifndef FS_XFLAG_SYNC
+#define FS_XFLAG_SYNC 0x00000020 /* all writes synchronous */
+#endif
+#ifndef FS_XFLAG_NOATIME
+#define FS_XFLAG_NOATIME 0x00000040 /* do not update access time */
+#endif
+#ifndef FS_XFLAG_IMMUTABLE
+#define FS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */
+#endif
+#ifndef FS_XFLAG_APPEND
+#define FS_XFLAG_APPEND 0x00000010 /* all writes append */
+#endif
+#ifndef FS_XFLAG_PROJINHERIT
+#define FS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */
+#endif
+
+/* 64 possible states */
+enum md_transient_state {
+	MS_RESTORE = (1 << 0), /* restore is running */
+};
+
+struct mdt_body {
+	struct lu_fid mbo_fid1;
+	struct lu_fid mbo_fid2;
+	struct lustre_handle mbo_open_handle;
+	__u64 mbo_valid;
+	__u64 mbo_size; /* Offset, in the case of MDS_READPAGE */
+	__s64 mbo_mtime;
+	__s64 mbo_atime;
+	__s64 mbo_ctime;
+	__u64 mbo_blocks; /* XID, in the case of MDS_READPAGE */
+	__u64 mbo_version; /* was mbo_ioepoch before 2.11 */
+	__u64 mbo_t_state; /* transient file state defined in
+			    * enum md_transient_state;
+			    * was "ino" until 2.4.0 */
+	__u32 mbo_fsuid;
+	__u32 mbo_fsgid;
+	__u32 mbo_capability;
+	__u32 mbo_mode;
+	__u32 mbo_uid;
+	__u32 mbo_gid;
+	__u32 mbo_flags; /* most replies: LUSTRE_*_FL file attributes,
+			  * data_version: OBD_FL_* flags
+			  */
+	__u32 mbo_rdev;
+	__u32 mbo_nlink; /* #bytes to read in the case of MDS_READPAGE */
+	__u32 mbo_layout_gen; /* was "generation" until 2.4.0 */
+	__u32 mbo_suppgid;
+	__u32 mbo_eadatasize;
+	__u32 mbo_aclsize;
+	__u32 mbo_max_mdsize;
+	__u32 mbo_unused3; /* was max_cookiesize until 2.8 */
+	__u32 mbo_uid_h; /* high 32-bits of uid, for FUID */
+	__u32 mbo_gid_h; /* high 32-bits of gid, for FUID */
+	__u32 mbo_projid;
+	__u64 mbo_dom_size; /* size of DOM component */
+	__u64 mbo_dom_blocks; /* blocks consumed by DOM component */
+	__u64 mbo_btime;
+	__u64
mbo_padding_9; /* also fix lustre_swab_mdt_body */ + __u64 mbo_padding_10; +}; /* 216 */ + +struct mdt_ioepoch { + struct lustre_handle mio_open_handle; + __u64 mio_unused1; /* was ioepoch */ + __u32 mio_unused2; /* was flags */ + __u32 mio_padding; +}; + +/* permissions for md_perm.mp_perm */ +enum { + CFS_SETUID_PERM = 0x01, + CFS_SETGID_PERM = 0x02, + CFS_SETGRP_PERM = 0x04, +}; + +struct mdt_rec_setattr { + __u32 sa_opcode; + __u32 sa_cap; + __u32 sa_fsuid; + __u32 sa_fsuid_h; + __u32 sa_fsgid; + __u32 sa_fsgid_h; + __u32 sa_suppgid; + __u32 sa_suppgid_h; + __u32 sa_padding_1; + __u32 sa_padding_1_h; + struct lu_fid sa_fid; + __u64 sa_valid; + __u32 sa_uid; + __u32 sa_gid; + __u64 sa_size; + __u64 sa_blocks; + __s64 sa_mtime; + __s64 sa_atime; + __s64 sa_ctime; + __u32 sa_attr_flags; + __u32 sa_mode; + __u32 sa_bias; /* some operation flags */ + __u32 sa_projid; + __u32 sa_padding_4; + __u32 sa_padding_5; +}; + +/* + * Attribute flags used in mdt_rec_setattr::sa_valid. + * The kernel's #defines for ATTR_* should not be used over the network + * since the client and MDS may run different kernels (see bug 13828) + * Therefore, we should only use MDS_ATTR_* attributes for sa_valid. + */ +enum mds_attr_flags { + MDS_ATTR_MODE = 0x1ULL, /* = 1 */ + MDS_ATTR_UID = 0x2ULL, /* = 2 */ + MDS_ATTR_GID = 0x4ULL, /* = 4 */ + MDS_ATTR_SIZE = 0x8ULL, /* = 8 */ + MDS_ATTR_ATIME = 0x10ULL, /* = 16 */ + MDS_ATTR_MTIME = 0x20ULL, /* = 32 */ + MDS_ATTR_CTIME = 0x40ULL, /* = 64 */ + MDS_ATTR_ATIME_SET = 0x80ULL, /* = 128 */ + MDS_ATTR_MTIME_SET = 0x100ULL, /* = 256 */ + MDS_ATTR_FORCE = 0x200ULL, /* = 512, change it */ + MDS_ATTR_ATTR_FLAG = 0x400ULL, /* = 1024 */ + MDS_ATTR_KILL_SUID = 0x800ULL, /* = 2048 */ + MDS_ATTR_KILL_SGID = 0x1000ULL, /* = 4096 */ + MDS_ATTR_CTIME_SET = 0x2000ULL, /* = 8192 */ + MDS_ATTR_FROM_OPEN = 0x4000ULL, /* = 16384, from open O_TRUNC */ + MDS_ATTR_BLOCKS = 0x8000ULL, /* = 32768 */ + MDS_ATTR_PROJID = 0x10000ULL, /* = 65536 */ + MDS_ATTR_LSIZE = 0x20000ULL, /* = 131072 */ + MDS_ATTR_LBLOCKS = 0x40000ULL, /* = 262144 */ + MDS_ATTR_OVERRIDE = 0x2000000ULL, /* = 33554432 */ +}; + +enum mds_op_bias { +/* MDS_CHECK_SPLIT = 1 << 0, obsolete before 2.3.58 */ + /* used for remote object getattr/open by name: in the original + * getattr/open request, MDT found the object against name is on another + * MDT, then packed FID and LOOKUP lock in reply and returned -EREMOTE, + * and client knew it's a remote object, then set this flag in + * getattr/open request and sent to the corresponding MDT to finish + * getattr/open, which fetched attributes and UPDATE lock/opened file. 
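To show how mdt_rec_setattr and the MDS_ATTR_* bits fit together, here is a minimal sketch of packing a truncate that comes from open(O_TRUNC); a real client also fills the credential and timestamp fields, and the helper name is illustrative.

	static void mdt_rec_setattr_pack_trunc(struct mdt_rec_setattr *rec,
					       __u64 new_size)
	{
		rec->sa_opcode = REINT_SETATTR;
		rec->sa_size = new_size;
		/* MDS_ATTR_FROM_OPEN marks a size change done via O_TRUNC */
		rec->sa_valid = MDS_ATTR_SIZE | MDS_ATTR_MTIME |
				MDS_ATTR_CTIME | MDS_ATTR_FROM_OPEN;
	}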
+ */ + MDS_CROSS_REF = 1 << 1, +/* MDS_VTX_BYPASS = 1 << 2, obsolete since 2.3.54 */ + MDS_PERM_BYPASS = 1 << 3, +/* MDS_SOM = 1 << 4, obsolete since 2.8.0 */ + MDS_QUOTA_IGNORE = 1 << 5, +/* MDS_CLOSE_CLEANUP = 1 << 6, obsolete since 2.3.51 */ + MDS_KEEP_ORPHAN = 1 << 7, + MDS_RECOV_OPEN = 1 << 8, + MDS_DATA_MODIFIED = 1 << 9, + MDS_CREATE_VOLATILE = 1 << 10, + MDS_OWNEROVERRIDE = 1 << 11, + MDS_HSM_RELEASE = 1 << 12, + MDS_CLOSE_MIGRATE = 1 << 13, + MDS_CLOSE_LAYOUT_SWAP = 1 << 14, + MDS_CLOSE_LAYOUT_MERGE = 1 << 15, + MDS_CLOSE_RESYNC_DONE = 1 << 16, + MDS_CLOSE_LAYOUT_SPLIT = 1 << 17, + MDS_TRUNC_KEEP_LEASE = 1 << 18, + MDS_PCC_ATTACH = 1 << 19, + MDS_CLOSE_UPDATE_TIMES = 1 << 20, + /* setstripe create only, don't restripe if target exists */ + MDS_SETSTRIPE_CREATE = 1 << 21, + MDS_FID_OP = 1 << 22, + /* migrate dirent only */ + MDS_MIGRATE_NSONLY = 1 << 23, +}; + +#define MDS_CLOSE_INTENT (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP | \ + MDS_CLOSE_LAYOUT_MERGE | MDS_CLOSE_LAYOUT_SPLIT | \ + MDS_CLOSE_RESYNC_DONE) + +/* instance of mdt_reint_rec */ +struct mdt_rec_create { + __u32 cr_opcode; + __u32 cr_cap; + __u32 cr_fsuid; + __u32 cr_fsuid_h; + __u32 cr_fsgid; + __u32 cr_fsgid_h; + __u32 cr_suppgid1; + __u32 cr_suppgid1_h; + __u32 cr_suppgid2; + __u32 cr_suppgid2_h; + struct lu_fid cr_fid1; + struct lu_fid cr_fid2; + struct lustre_handle cr_open_handle_old; /* in case of open replay */ + __s64 cr_time; + union { + __u64 cr_rdev; + __u32 cr_archive_id; + }; + __u64 cr_ioepoch; + __u64 cr_padding_1; /* rr_blocks */ + __u32 cr_mode; + __u32 cr_bias; + /* use of helpers set/get_mrc_cr_flags() is needed to access + * 64 bits cr_flags [cr_flags_l, cr_flags_h], this is done to + * extend cr_flags size without breaking 1.8 compat */ + __u32 cr_flags_l; /* for use with open, low 32 bits */ + __u32 cr_flags_h; /* for use with open, high 32 bits */ + __u32 cr_umask; /* umask for create */ + __u32 cr_padding_4; /* rr_padding_4 */ +}; + +/* instance of mdt_reint_rec */ +struct mdt_rec_link { + __u32 lk_opcode; + __u32 lk_cap; + __u32 lk_fsuid; + __u32 lk_fsuid_h; + __u32 lk_fsgid; + __u32 lk_fsgid_h; + __u32 lk_suppgid1; + __u32 lk_suppgid1_h; + __u32 lk_suppgid2; + __u32 lk_suppgid2_h; + struct lu_fid lk_fid1; + struct lu_fid lk_fid2; + __s64 lk_time; + __u64 lk_padding_1; /* rr_atime */ + __u64 lk_padding_2; /* rr_ctime */ + __u64 lk_padding_3; /* rr_size */ + __u64 lk_padding_4; /* rr_blocks */ + __u32 lk_bias; + __u32 lk_padding_5; /* rr_mode */ + __u32 lk_padding_6; /* rr_flags */ + __u32 lk_padding_7; /* rr_padding_2 */ + __u32 lk_padding_8; /* rr_padding_3 */ + __u32 lk_padding_9; /* rr_padding_4 */ +}; + +/* instance of mdt_reint_rec */ +struct mdt_rec_unlink { + __u32 ul_opcode; + __u32 ul_cap; + __u32 ul_fsuid; + __u32 ul_fsuid_h; + __u32 ul_fsgid; + __u32 ul_fsgid_h; + __u32 ul_suppgid1; + __u32 ul_suppgid1_h; + __u32 ul_suppgid2; + __u32 ul_suppgid2_h; + struct lu_fid ul_fid1; + struct lu_fid ul_fid2; + __s64 ul_time; + __u64 ul_padding_2; /* rr_atime */ + __u64 ul_padding_3; /* rr_ctime */ + __u64 ul_padding_4; /* rr_size */ + __u64 ul_padding_5; /* rr_blocks */ + __u32 ul_bias; + __u32 ul_mode; + __u32 ul_padding_6; /* rr_flags */ + __u32 ul_padding_7; /* rr_padding_2 */ + __u32 ul_padding_8; /* rr_padding_3 */ + __u32 ul_padding_9; /* rr_padding_4 */ +}; + +/* instance of mdt_reint_rec */ +struct mdt_rec_rename { + __u32 rn_opcode; + __u32 rn_cap; + __u32 rn_fsuid; + __u32 rn_fsuid_h; + __u32 rn_fsgid; + __u32 rn_fsgid_h; + __u32 rn_suppgid1; + __u32 rn_suppgid1_h; + __u32 
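The mdt_rec_create comment above refers to set/get_mrc_cr_flags() for the 32/64-bit split of cr_flags; a plausible minimal form of those helpers (a sketch, not code taken from this patch):

	static inline void set_mrc_cr_flags(struct mdt_rec_create *mrc,
					    __u64 flags)
	{
		mrc->cr_flags_l = (__u32)(flags & 0xFFFFFFFFULL);
		mrc->cr_flags_h = (__u32)(flags >> 32);
	}

	static inline __u64 get_mrc_cr_flags(const struct mdt_rec_create *mrc)
	{
		return ((__u64)mrc->cr_flags_h << 32) | mrc->cr_flags_l;
	}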
rn_suppgid2; + __u32 rn_suppgid2_h; + struct lu_fid rn_fid1; + struct lu_fid rn_fid2; + __s64 rn_time; + __u64 rn_padding_1; /* rr_atime */ + __u64 rn_padding_2; /* rr_ctime */ + __u64 rn_padding_3; /* rr_size */ + __u64 rn_padding_4; /* rr_blocks */ + __u32 rn_bias; /* some operation flags */ + __u32 rn_mode; /* cross-ref rename has mode */ + __u32 rn_padding_5; /* rr_flags */ + __u32 rn_padding_6; /* rr_padding_2 */ + __u32 rn_padding_7; /* rr_padding_3 */ + __u32 rn_padding_8; /* rr_padding_4 */ +}; + +/* instance of mdt_reint_rec */ +struct mdt_rec_setxattr { + __u32 sx_opcode; + __u32 sx_cap; + __u32 sx_fsuid; + __u32 sx_fsuid_h; + __u32 sx_fsgid; + __u32 sx_fsgid_h; + __u32 sx_suppgid1; + __u32 sx_suppgid1_h; + __u32 sx_suppgid2; + __u32 sx_suppgid2_h; + struct lu_fid sx_fid; + __u64 sx_padding_1; /* These three are rr_fid2 */ + __u32 sx_padding_2; + __u32 sx_padding_3; + __u64 sx_valid; + __s64 sx_time; + __u64 sx_padding_5; /* rr_ctime */ + __u64 sx_padding_6; /* rr_size */ + __u64 sx_padding_7; /* rr_blocks */ + __u32 sx_size; + __u32 sx_flags; + __u32 sx_padding_8; /* rr_flags */ + __u32 sx_padding_9; /* rr_padding_2 */ + __u32 sx_padding_10; /* rr_padding_3 */ + __u32 sx_padding_11; /* rr_padding_4 */ +}; + +/* instance of mdt_reint_rec + * FLR: for file resync MDS_REINT_RESYNC RPC. */ +struct mdt_rec_resync { + __u32 rs_opcode; + __u32 rs_cap; + __u32 rs_fsuid; + __u32 rs_fsuid_h; + __u32 rs_fsgid; + __u32 rs_fsgid_h; + __u32 rs_suppgid1; + __u32 rs_suppgid1_h; + __u32 rs_suppgid2; + __u32 rs_suppgid2_h; + struct lu_fid rs_fid; + __u8 rs_padding0[sizeof(struct lu_fid)]; + struct lustre_handle rs_lease_handle; /* rr_mtime */ + __s64 rs_padding1; /* rr_atime */ + __s64 rs_padding2; /* rr_ctime */ + __u64 rs_padding3; /* rr_size */ + __u64 rs_padding4; /* rr_blocks */ + __u32 rs_bias; + __u32 rs_padding5; /* rr_mode */ + __u32 rs_padding6; /* rr_flags */ + __u32 rs_padding7; /* rr_flags_h */ + __u32 rs_padding8; /* rr_umask */ + __u16 rs_mirror_id; + __u16 rs_padding9; /* rr_padding_4 */ +}; + +/* + * mdt_rec_reint is the template for all mdt_reint_xxx structures. + * Do NOT change the size of various members, otherwise the value + * will be broken in lustre_swab_mdt_rec_reint(). + * + * If you add new members in other mdt_reint_xxx structres and need to use the + * rr_padding_x fields, then update lustre_swab_mdt_rec_reint() also. 
+ */
+struct mdt_rec_reint {
+	__u32 rr_opcode;
+	__u32 rr_cap;
+	__u32 rr_fsuid;
+	__u32 rr_fsuid_h;
+	__u32 rr_fsgid;
+	__u32 rr_fsgid_h;
+	__u32 rr_suppgid1;
+	__u32 rr_suppgid1_h;
+	__u32 rr_suppgid2;
+	__u32 rr_suppgid2_h;
+	struct lu_fid rr_fid1;
+	struct lu_fid rr_fid2;
+	__s64 rr_mtime;
+	__s64 rr_atime;
+	__s64 rr_ctime;
+	__u64 rr_size;
+	__u64 rr_blocks;
+	__u32 rr_bias;
+	__u32 rr_mode;
+	__u32 rr_flags;
+	__u32 rr_flags_h;
+	__u32 rr_umask;
+	__u16 rr_mirror_id;
+	__u16 rr_padding_4; /* also fix lustre_swab_mdt_rec_reint */
+};
+
+#define LMV_DESC_QOS_MAXAGE_DEFAULT 60 /* Seconds */
+
+/* lmv structures */
+struct lmv_desc {
+	__u32 ld_tgt_count; /* how many MDS's */
+	__u32 ld_active_tgt_count; /* how many active */
+	__u32 ld_default_stripe_count; /* how many objects are used */
+	__u32 ld_pattern; /* default hash pattern */
+	__u64 ld_default_hash_size;
+	__u64 ld_padding_1; /* also fix lustre_swab_lmv_desc */
+	__u32 ld_padding_2; /* also fix lustre_swab_lmv_desc */
+	__u32 ld_qos_maxage; /* in seconds */
+	__u32 ld_padding_3; /* also fix lustre_swab_lmv_desc */
+	__u32 ld_padding_4; /* also fix lustre_swab_lmv_desc */
+	struct obd_uuid ld_uuid;
+};
+
+/* LMV layout EA; it is stored in both the master and slave objects */
+struct lmv_mds_md_v1 {
+	__u32 lmv_magic;
+	__u32 lmv_stripe_count;
+	__u32 lmv_master_mdt_index; /* On the master object this is the
+				     * master MDT index; on a slave object
+				     * it is the stripe index of the slave
+				     * object */
+	__u32 lmv_hash_type; /* dir stripe policy, i.e. indicates which
+			      * hash function is used.
+			      * Note: only the lower 16 bits are used for
+			      * now. The higher 16 bits will be used to
+			      * mark the object status, for example
+			      * migrating or dead. */
+	__u32 lmv_layout_version; /* increased each time the layout is
+				   * changed, by directory migration,
+				   * restripe and LFSCK. */
+	__u32 lmv_migrate_offset; /* once this is set, it means this
+				   * directory is being migrated: stripes
+				   * before this offset belong to the
+				   * target, and stripes from this offset
+				   * on belong to the source.
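Because mdt_rec_reint is the template every mdt_rec_* record above must overlay exactly (so lustre_swab_mdt_rec_reint() can swab them all), the invariant can be made explicit at compile time; a sketch using C11 _Static_assert, where the kernel itself would use BUILD_BUG_ON:

	_Static_assert(sizeof(struct mdt_rec_create) ==
		       sizeof(struct mdt_rec_reint), "create vs reint");
	_Static_assert(sizeof(struct mdt_rec_link) ==
		       sizeof(struct mdt_rec_reint), "link vs reint");
	_Static_assert(sizeof(struct mdt_rec_unlink) ==
		       sizeof(struct mdt_rec_reint), "unlink vs reint");
	_Static_assert(sizeof(struct mdt_rec_rename) ==
		       sizeof(struct mdt_rec_reint), "rename vs reint");
	_Static_assert(sizeof(struct mdt_rec_setxattr) ==
		       sizeof(struct mdt_rec_reint), "setxattr vs reint");
	_Static_assert(sizeof(struct mdt_rec_resync) ==
		       sizeof(struct mdt_rec_reint), "resync vs reint");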
*/ + __u32 lmv_migrate_hash; /* hash type of source stripes of + * migrating directory */ + __u32 lmv_padding2; + __u64 lmv_padding3; + char lmv_pool_name[LOV_MAXPOOLNAME + 1]; /* pool name */ + struct lu_fid lmv_stripe_fids[0]; /* FIDs for each stripe */ +}; + +/* stripe count before directory split */ +#define lmv_split_offset lmv_migrate_offset +/* stripe count after directory merge */ +#define lmv_merge_offset lmv_migrate_offset +/* directory hash type after merge */ +#define lmv_merge_hash lmv_migrate_hash + +/* foreign LMV EA */ +struct lmv_foreign_md { + __u32 lfm_magic; /* magic number = LMV_MAGIC_FOREIGN */ + __u32 lfm_length; /* length of lfm_value */ + __u32 lfm_type; /* type, see LU_FOREIGN_TYPE_ */ + __u32 lfm_flags; /* flags, type specific */ + char lfm_value[]; /* free format value */ +}; + +#define LMV_MAGIC_V1 0x0CD20CD0 /* normal stripe lmv magic */ +#define LMV_MAGIC LMV_MAGIC_V1 + +/* #define LMV_USER_MAGIC 0x0CD30CD0 */ +#define LMV_MAGIC_STRIPE 0x0CD40CD0 /* magic for dir sub_stripe */ +#define LMV_MAGIC_FOREIGN 0x0CD50CD0 /* magic for lmv foreign */ + +/** + * The FNV-1a hash algorithm is as follows: + * hash = FNV_offset_basis + * for each octet_of_data to be hashed + * hash = hash XOR octet_of_data + * hash = hash × FNV_prime + * return hash + * http://en.wikipedia.org/wiki/Fowler–Noll–Vo_hash_function#FNV-1a_hash + * + * http://www.isthe.com/chongo/tech/comp/fnv/index.html#FNV-reference-source + * FNV_prime is 2^40 + 2^8 + 0xb3 = 0x100000001b3ULL + **/ +#define LUSTRE_FNV_1A_64_PRIME 0x100000001b3ULL +#define LUSTRE_FNV_1A_64_OFFSET_BIAS 0xcbf29ce484222325ULL +static inline __u64 lustre_hash_fnv_1a_64(const void *buf, __kernel_size_t size) +{ + __u64 hash = LUSTRE_FNV_1A_64_OFFSET_BIAS; + const unsigned char *p = buf; + __kernel_size_t i; + + for (i = 0; i < size; i++) { + hash ^= p[i]; + hash *= LUSTRE_FNV_1A_64_PRIME; + } + + return hash; +} + +/* CRUSH placement group count */ +#define LMV_CRUSH_PG_COUNT 4096 + +union lmv_mds_md { + __u32 lmv_magic; + struct lmv_mds_md_v1 lmv_md_v1; + struct lmv_user_md lmv_user_md; + struct lmv_foreign_md lmv_foreign_md; +}; + +static inline __kernel_ssize_t lmv_mds_md_size(int stripe_count, + unsigned int lmm_magic) +{ + __kernel_ssize_t len = -EINVAL; + + switch (lmm_magic) { + case LMV_MAGIC_V1: { + struct lmv_mds_md_v1 *lmm1; + + len = sizeof(*lmm1); + len += stripe_count * sizeof(lmm1->lmv_stripe_fids[0]); + break; } + default: + break; + } + return len; +} + +static inline int lmv_mds_md_stripe_count_get(const union lmv_mds_md *lmm) +{ + switch (__le32_to_cpu(lmm->lmv_magic)) { + case LMV_MAGIC_V1: + return __le32_to_cpu(lmm->lmv_md_v1.lmv_stripe_count); + case LMV_USER_MAGIC: + return __le32_to_cpu(lmm->lmv_user_md.lum_stripe_count); + default: + return -EINVAL; + } +} + +static inline int lmv_mds_md_hash_type_get(const union lmv_mds_md *lmm) +{ + switch (__le32_to_cpu(lmm->lmv_magic)) { + case LMV_MAGIC_V1: + return __le32_to_cpu(lmm->lmv_md_v1.lmv_hash_type); + case LMV_USER_MAGIC: + return __le32_to_cpu(lmm->lmv_user_md.lum_hash_type); + default: + return -EINVAL; + } +} + +enum fld_rpc_opc { + FLD_QUERY = 900, + FLD_READ = 901, + FLD_LAST_OPC, + FLD_FIRST_OPC = FLD_QUERY +}; + +enum seq_rpc_opc { + SEQ_QUERY = 700, + SEQ_LAST_OPC, + SEQ_FIRST_OPC = SEQ_QUERY +}; + +enum seq_op { + SEQ_ALLOC_SUPER = 0, + SEQ_ALLOC_META = 1 +}; + +enum fld_op { + FLD_CREATE = 0, + FLD_DELETE = 1, + FLD_LOOKUP = 2, +}; + +/* LFSCK opcodes */ +enum lfsck_cmd { + LFSCK_NOTIFY = 1101, + LFSCK_QUERY = 1102, + LFSCK_LAST_OPC, + 
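A short usage example for lustre_hash_fnv_1a_64() above: LMV hashes an entry name to choose a directory stripe. The plain modulo below is a simplification; stripe selection in practice also depends on lmv_hash_type, and the function name is illustrative.

	static unsigned int lmv_name_to_stripe(const char *name,
					       __kernel_size_t namelen,
					       unsigned int stripe_count)
	{
		__u64 hash = lustre_hash_fnv_1a_64(name, namelen);

		return (unsigned int)(hash % stripe_count); /* count > 0 */
	}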
LFSCK_FIRST_OPC = LFSCK_NOTIFY +}; + +/* + * LOV data structures + */ + +#define LOV_MAX_UUID_BUFFER_SIZE 8192 +/* The size of the buffer the lov/mdc reserves for the + * array of UUIDs returned by the MDS. With the current + * protocol, this will limit the max number of OSTs per LOV */ + +#define LOV_DESC_MAGIC 0xB0CCDE5C +#define LOV_DESC_QOS_MAXAGE_DEFAULT 5 /* Seconds */ +#define LOV_DESC_STRIPE_SIZE_DEFAULT (1 << LNET_MTU_BITS) + +/* LOV settings descriptor (should only contain static info) */ +struct lov_desc { + __u32 ld_tgt_count; /* how many OBD's */ + __u32 ld_active_tgt_count; /* how many active */ + __s32 ld_default_stripe_count; /* how many objects are used */ + __u32 ld_pattern; /* default PATTERN_RAID0 */ + __u64 ld_default_stripe_size; /* in bytes */ + __s64 ld_default_stripe_offset; /* starting OST index */ + __u32 ld_padding_0; /* unused */ + __u32 ld_qos_maxage; /* in second */ + __u32 ld_padding_1; /* also fix lustre_swab_lov_desc */ + __u32 ld_padding_2; /* also fix lustre_swab_lov_desc */ + struct obd_uuid ld_uuid; +}; + +#define ld_magic ld_active_tgt_count /* for swabbing from llogs */ + +/* + * LDLM requests: + */ +/* opcodes -- MUST be distinct from OST/MDS opcodes */ +enum ldlm_cmd { + LDLM_ENQUEUE = 101, + LDLM_CONVERT = 102, + LDLM_CANCEL = 103, + LDLM_BL_CALLBACK = 104, + LDLM_CP_CALLBACK = 105, + LDLM_GL_CALLBACK = 106, + LDLM_SET_INFO = 107, + LDLM_LAST_OPC +}; +#define LDLM_FIRST_OPC LDLM_ENQUEUE + +#define RES_NAME_SIZE 4 +struct ldlm_res_id { + __u64 name[RES_NAME_SIZE]; +}; + +#define DLDLMRES "[%#llx:%#llx:%#llx].%#llx" +#define PLDLMRES(res) (unsigned long long)(res)->lr_name.name[0], \ + (unsigned long long)(res)->lr_name.name[1], \ + (unsigned long long)(res)->lr_name.name[2], \ + (unsigned long long)(res)->lr_name.name[3] + +/* lock types */ +enum ldlm_mode { + LCK_MINMODE = 0, + LCK_EX = 1, + LCK_PW = 2, + LCK_PR = 4, + LCK_CW = 8, + LCK_CR = 16, + LCK_NL = 32, + LCK_GROUP = 64, + LCK_COS = 128, + LCK_MAXMODE +}; + +#define LCK_MODE_NUM 8 + +enum ldlm_type { + LDLM_PLAIN = 10, + LDLM_EXTENT = 11, + LDLM_FLOCK = 12, + LDLM_IBITS = 13, + LDLM_MAX_TYPE +}; + +#define LDLM_MIN_TYPE LDLM_PLAIN + +struct ldlm_extent { + __u64 start; + __u64 end; + __u64 gid; +}; + +static inline bool ldlm_extent_equal(const struct ldlm_extent *ex1, + const struct ldlm_extent *ex2) +{ + return ex1->start == ex2->start && ex1->end == ex2->end; +} + +struct ldlm_inodebits { + __u64 bits; + union { + __u64 try_bits; /* optional bits to try */ + __u64 cancel_bits; /* for lock convert */ + }; + __u64 li_gid; +}; + +struct ldlm_flock_wire { + __u64 lfw_start; + __u64 lfw_end; + __u64 lfw_owner; + __u32 lfw_padding; + __u32 lfw_pid; +}; + +/* it's important that the fields of the ldlm_extent structure match + * the first fields of the ldlm_flock structure because there is only + * one ldlm_swab routine to process the ldlm_policy_data_t union. if + * this ever changes we will need to swab the union differently based + * on the resource type. 
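Tying ldlm_res_id back to the LUSTRE_RES_ID_*_OFF offsets defined earlier, here is a sketch of packing a FID into a resource name; the lu_fid field names (f_seq, f_oid, f_ver) come from earlier in this header, and the helper name is illustrative.

	static void fid_pack_res_name(const struct lu_fid *fid,
				      struct ldlm_res_id *name)
	{
		memset(name, 0, sizeof(*name));
		name->name[LUSTRE_RES_ID_SEQ_OFF] = fid->f_seq;
		/* OID in the low 32 bits, version in the high 32 bits */
		name->name[LUSTRE_RES_ID_VER_OID_OFF] =
			((__u64)fid->f_ver << 32) | fid->f_oid;
	}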
*/ + +union ldlm_wire_policy_data { + struct ldlm_extent l_extent; + struct ldlm_flock_wire l_flock; + struct ldlm_inodebits l_inodebits; +}; + +struct barrier_lvb { + __u32 lvb_status; + __u32 lvb_index; + __u64 lvb_padding; +}; + +struct ldlm_gl_barrier_desc { + __u32 lgbd_status; + __u32 lgbd_timeout; + __u64 lgbd_padding; +}; + +union ldlm_gl_desc { + struct ldlm_gl_lquota_desc lquota_desc; + struct ldlm_gl_barrier_desc barrier_desc; +}; + +enum ldlm_intent_flags { + IT_OPEN = 0x00000001, + IT_CREAT = 0x00000002, + IT_OPEN_CREAT = IT_OPEN | IT_CREAT, /* To allow case label. */ + IT_READDIR = 0x00000004, /* Used by mdc, not put on the wire. */ + IT_GETATTR = 0x00000008, + IT_LOOKUP = 0x00000010, +/* IT_UNLINK = 0x00000020, Obsolete. */ +/* IT_TRUNC = 0x00000040, Obsolete. */ + IT_GETXATTR = 0x00000080, +/* IT_EXEC = 0x00000100, Obsolete. */ +/* IT_PIN = 0x00000200, Obsolete. */ + IT_LAYOUT = 0x00000400, + IT_QUOTA_DQACQ = 0x00000800, + IT_QUOTA_CONN = 0x00001000, +/* IT_SETXATTR = 0x00002000, Obsolete. */ + IT_GLIMPSE = 0x00004000, + IT_BRW = 0x00008000, +}; + +struct ldlm_intent { + __u64 opc; +}; + +struct ldlm_resource_desc { + enum ldlm_type lr_type; + __u32 lr_pad; /* also fix lustre_swab_ldlm_resource_desc */ + struct ldlm_res_id lr_name; +}; + +struct ldlm_lock_desc { + struct ldlm_resource_desc l_resource; + enum ldlm_mode l_req_mode; + enum ldlm_mode l_granted_mode; + union ldlm_wire_policy_data l_policy_data; +}; + +#define LDLM_LOCKREQ_HANDLES 2 +#define LDLM_ENQUEUE_CANCEL_OFF 1 + +struct ldlm_request { + __u32 lock_flags; /* LDLM_FL_*, see lustre_dlm_flags.h */ + __u32 lock_count; /* number of locks in lock_handle[] */ + struct ldlm_lock_desc lock_desc;/* lock descriptor */ + struct lustre_handle lock_handle[LDLM_LOCKREQ_HANDLES]; +}; + +struct ldlm_reply { + __u32 lock_flags; + __u32 lock_padding; /* also fix lustre_swab_ldlm_reply */ + struct ldlm_lock_desc lock_desc; + struct lustre_handle lock_handle; + __u64 lock_policy_res1; + __u64 lock_policy_res2; +}; + +#define ldlm_flags_to_wire(flags) ((__u32)(flags)) +#define ldlm_flags_from_wire(flags) ((__u64)(flags)) + +/* + * Opcodes for mountconf (mgs and mgc) + */ +enum mgs_cmd { + MGS_CONNECT = 250, + MGS_DISCONNECT = 251, + MGS_EXCEPTION = 252, /* node died, etc. */ + MGS_TARGET_REG = 253, /* whenever target starts up */ + MGS_TARGET_DEL = 254, + MGS_SET_INFO = 255, + MGS_CONFIG_READ = 256, + MGS_LAST_OPC, + MGS_FIRST_OPC = MGS_CONNECT +}; + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 18, 53, 0) +#define MGS_PARAM_MAXLEN 1024 +#define KEY_SET_INFO "set_info" + +struct mgs_send_param { + char mgs_param[MGS_PARAM_MAXLEN]; +}; +#endif + +/* We pass this info to the MGS so it can write config logs */ +#define MTI_NAME_MAXLEN 64 +#define MTI_PARAM_MAXLEN 4096 +#define MTI_NIDS_MAX 32 +struct mgs_target_info { + __u32 mti_lustre_ver; + __u32 mti_stripe_index; + __u32 mti_config_ver; + __u32 mti_flags; /* LDD_F_* */ + __u32 mti_nid_count; + __u32 mti_instance; /* Running instance of target */ + char mti_fsname[MTI_NAME_MAXLEN]; + char mti_svname[MTI_NAME_MAXLEN]; + char mti_uuid[sizeof(struct obd_uuid)]; + __u64 mti_nids[MTI_NIDS_MAX]; /* host nids (lnet_nid_t) */ + char mti_params[MTI_PARAM_MAXLEN]; +}; + +struct mgs_nidtbl_entry { + __u64 mne_version; /* table version of this entry */ + __u32 mne_instance; /* target instance # */ + __u32 mne_index; /* target index */ + __u32 mne_length; /* length of this entry - by bytes */ + __u8 mne_type; /* target type LDD_F_SV_TYPE_OST/MDT */ + __u8 mne_nid_type; /* type of nid(mbz). 
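To see how the pieces of ldlm_request compose, a minimal sketch of preparing a whole-file PW extent lock enqueue; the all-ones end offset stands in for the EOF constant defined elsewhere, and the function name is illustrative.

	static void ldlm_pack_extent_enqueue(struct ldlm_request *req,
					     const struct ldlm_res_id *res)
	{
		req->lock_count = 1;
		req->lock_desc.l_resource.lr_type = LDLM_EXTENT;
		req->lock_desc.l_resource.lr_name = *res;
		req->lock_desc.l_req_mode = LCK_PW;
		/* whole-file extent [0, EOF] */
		req->lock_desc.l_policy_data.l_extent.start = 0;
		req->lock_desc.l_policy_data.l_extent.end = ~0ULL;
		req->lock_desc.l_policy_data.l_extent.gid = 0;
	}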
for ipv6. */ + __u8 mne_nid_size; /* size of each NID, by bytes */ + __u8 mne_nid_count; /* # of NIDs in buffer */ + union { + lnet_nid_t nids[0]; /* variable size buffer for NIDs. */ + } u; +}; + +enum mgs_cfg_type { + MGS_CFG_T_CONFIG = 0, + MGS_CFG_T_SPTLRPC = 1, + MGS_CFG_T_RECOVER = 2, + MGS_CFG_T_PARAMS = 3, + MGS_CFG_T_NODEMAP = 4, + MGS_CFG_T_BARRIER = 5, + MGS_CFG_T_MAX +}; + +struct mgs_config_body { + char mcb_name[MTI_NAME_MAXLEN]; /* logname */ + __u64 mcb_offset; /* next index of config log to request */ + __u16 mcb_type; /* type of log: MGS_CFG_T_[CONFIG|RECOVER] */ + __u8 mcb_nm_cur_pass; + __u8 mcb_bits; /* bits unit size of config log */ + __u32 mcb_units; /* # of units for bulk transfer */ +}; + +struct mgs_config_res { + __u64 mcr_offset; /* index of last config log */ + union { + __u64 mcr_size; /* size of the log */ + __u64 mcr_nm_cur_pass; /* current nodemap config pass */ + }; +}; + +/* Config marker flags (in config log) */ +#define CM_START 0x01 +#define CM_END 0x02 +#define CM_SKIP 0x04 +#define CM_UPGRADE146 0x08 +#define CM_EXCLUDE 0x10 +#define CM_START_SKIP (CM_START | CM_SKIP) + +struct cfg_marker { + __u32 cm_step; /* aka config version */ + __u32 cm_flags; + __u32 cm_vers; /* lustre release version number */ + __u32 cm_padding; /* 64 bit align */ + __s64 cm_createtime; /*when this record was first created */ + __s64 cm_canceltime; /*when this record is no longer valid*/ + char cm_tgtname[MTI_NAME_MAXLEN]; + char cm_comment[MTI_NAME_MAXLEN]; +}; + +/* + * Opcodes for multiple servers. + */ +enum obd_cmd { + OBD_PING = 400, +/* OBD_LOG_CANCEL = 401, obsolete since 1.5 */ +/* OBD_QC_CALLBACK = 402, obsolete since 2.4 */ + OBD_IDX_READ = 403, + OBD_LAST_OPC, + OBD_FIRST_OPC = OBD_PING +}; + +/** + * llog contexts indices. + * + * There is compatibility problem with indexes below, they are not + * continuous and must keep their numbers for compatibility needs. + * See LU-5218 for details. + */ +enum llog_ctxt_id { + LLOG_CONFIG_ORIG_CTXT = 0, + LLOG_CONFIG_REPL_CTXT = 1, + LLOG_MDS_OST_ORIG_CTXT = 2, + LLOG_MDS_OST_REPL_CTXT = 3, /* kept just to avoid re-assignment */ + LLOG_SIZE_ORIG_CTXT = 4, + LLOG_SIZE_REPL_CTXT = 5, + LLOG_TEST_ORIG_CTXT = 8, + LLOG_TEST_REPL_CTXT = 9, /* kept just to avoid re-assignment */ + LLOG_CHANGELOG_ORIG_CTXT = 12, /**< changelog generation on mdd */ + LLOG_CHANGELOG_REPL_CTXT = 13, /**< changelog access on clients */ + /* for multiple changelog consumers */ + LLOG_CHANGELOG_USER_ORIG_CTXT = 14, + LLOG_AGENT_ORIG_CTXT = 15, /**< agent requests generation on cdt */ + LLOG_UPDATELOG_ORIG_CTXT = 16, /* update log. reserve for the client */ + LLOG_UPDATELOG_REPL_CTXT = 17, /* update log. reserve for the client */ + LLOG_MAX_CTXTS +}; + +/** Identifier for a single log object */ +struct llog_logid { + struct ost_id lgl_oi; + __u32 lgl_ogen; +} __attribute__((packed)); + +/** Records written to the CATALOGS list */ +#define CATLIST "CATALOGS" +struct llog_catid { + struct llog_logid lci_logid; + __u32 lci_padding1; + __u32 lci_padding2; + __u32 lci_padding3; +} __attribute__((packed)); + +/* Log data record types - there is no specific reason that these need to + * be related to the RPC opcodes, but no reason not to (may be handy later?) 
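The mcb_bits/mcb_units pair in mgs_config_body above encodes the bulk transfer size as a unit shift plus a unit count; a sketch of deriving the byte count (illustrative helper, assuming this reading of the two fields):

	static __u64 mgs_cfg_bulk_bytes(const struct mgs_config_body *mcb)
	{
		/* each unit is (1 << mcb_bits) bytes of config log */
		return (__u64)mcb->mcb_units << mcb->mcb_bits;
	}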
+ */ +#define LLOG_OP_MAGIC 0x10600000 +#define LLOG_OP_MASK 0xfff00000 + +enum llog_op_type { + LLOG_PAD_MAGIC = LLOG_OP_MAGIC | 0x00000, + OST_SZ_REC = LLOG_OP_MAGIC | 0x00f00, + /* OST_RAID1_REC = LLOG_OP_MAGIC | 0x01000, never used */ + MDS_UNLINK_REC = LLOG_OP_MAGIC | 0x10000 | (MDS_REINT << 8) | + REINT_UNLINK, /* obsolete after 2.5.0 */ + MDS_UNLINK64_REC = LLOG_OP_MAGIC | 0x90000 | (MDS_REINT << 8) | + REINT_UNLINK, + /* MDS_SETATTR_REC = LLOG_OP_MAGIC | 0x12401, obsolete 1.8.0 */ + MDS_SETATTR64_REC = LLOG_OP_MAGIC | 0x90000 | (MDS_REINT << 8) | + REINT_SETATTR, + OBD_CFG_REC = LLOG_OP_MAGIC | 0x20000, + /* PTL_CFG_REC = LLOG_OP_MAGIC | 0x30000, obsolete 1.4.0 */ + LLOG_GEN_REC = LLOG_OP_MAGIC | 0x40000, + /* LLOG_JOIN_REC = LLOG_OP_MAGIC | 0x50000, obsolete 1.8.0 */ + CHANGELOG_REC = LLOG_OP_MAGIC | 0x60000, + CHANGELOG_USER_REC = LLOG_OP_MAGIC | 0x70000, + CHANGELOG_USER_REC2 = LLOG_OP_MAGIC | 0x70002, + HSM_AGENT_REC = LLOG_OP_MAGIC | 0x80000, + UPDATE_REC = LLOG_OP_MAGIC | 0xa0000, /* Resevered to avoid + * use on client. + */ + LLOG_HDR_MAGIC = LLOG_OP_MAGIC | 0x45539, + LLOG_LOGID_MAGIC = LLOG_OP_MAGIC | 0x4553b, +}; + +#define LLOG_REC_HDR_NEEDS_SWABBING(r) \ + (((r)->lrh_type & __swab32(LLOG_OP_MASK)) == __swab32(LLOG_OP_MAGIC)) + +/** Log record header - stored in little endian order. + * Each record must start with this struct, end with a llog_rec_tail, + * and be a multiple of 256 bits in size. + */ +struct llog_rec_hdr { + __u32 lrh_len; + __u32 lrh_index; + __u32 lrh_type; + __u32 lrh_id; +} __attribute__((packed)); + +struct llog_rec_tail { + __u32 lrt_len; + __u32 lrt_index; +} __attribute__((packed)); + +/* Where data follow just after header */ +#define REC_DATA(ptr) \ + ((void *)((char *)ptr + sizeof(struct llog_rec_hdr))) + +#define REC_DATA_LEN(rec) \ + (rec->lrh_len - sizeof(struct llog_rec_hdr) - \ + sizeof(struct llog_rec_tail)) + +struct llog_logid_rec { + struct llog_rec_hdr lid_hdr; + struct llog_logid lid_id; + __u32 lid_padding1; + __u64 lid_padding2; + __u64 lid_padding3; + struct llog_rec_tail lid_tail; +} __attribute__((packed)); + +struct llog_unlink_rec { + struct llog_rec_hdr lur_hdr; + __u64 lur_oid; + __u32 lur_oseq; + __u32 lur_count; + struct llog_rec_tail lur_tail; +} __attribute__((packed)); + +struct llog_unlink64_rec { + struct llog_rec_hdr lur_hdr; + struct lu_fid lur_fid; + __u32 lur_count; /* to destroy the lost precreated */ + __u32 lur_padding1; + __u64 lur_padding2; + __u64 lur_padding3; + struct llog_rec_tail lur_tail; +} __attribute__((packed)); + +struct llog_setattr64_rec { + struct llog_rec_hdr lsr_hdr; + struct ost_id lsr_oi; + __u32 lsr_uid; + __u32 lsr_uid_h; + __u32 lsr_gid; + __u32 lsr_gid_h; + __u64 lsr_valid; + struct llog_rec_tail lsr_tail; +} __attribute__((packed)); + +/* Extended to support project quota */ +struct llog_setattr64_rec_v2 { + struct llog_rec_hdr lsr_hdr; + struct ost_id lsr_oi; + __u32 lsr_uid; + __u32 lsr_uid_h; + __u32 lsr_gid; + __u32 lsr_gid_h; + __u64 lsr_valid; + __u32 lsr_projid; + __u32 lsr_layout_version; + __u64 lsr_padding2; + __u64 lsr_padding3; + struct llog_rec_tail lsr_tail; +} __attribute__((packed)); + +struct llog_size_change_rec { + struct llog_rec_hdr lsc_hdr; + struct ll_fid lsc_fid; + __u32 lsc_ioepoch; + __u32 lsc_padding1; + __u64 lsc_padding2; + __u64 lsc_padding3; + struct llog_rec_tail lsc_tail; +} __attribute__((packed)); + +#define CHANGELOG_MAGIC 0xca103000 + +/** \a changelog_rec_type's that can't be masked */ +#define CHANGELOG_MINMASK BIT(CL_MARK) +/** bits covering all 
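A brief sketch of how the record-header helpers above are used when walking a log: check the swab marker first, then locate the payload; no swabbing is performed here and the function name is illustrative.

	static void *llog_rec_payload(struct llog_rec_hdr *rec, __u32 *len)
	{
		/* records written by a peer of opposite endianness must be
		 * swabbed before the header fields can be trusted */
		if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
			return NULL;

		*len = REC_DATA_LEN(rec);
		return REC_DATA(rec);
	}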
\a changelog_rec_type's */ +#define CHANGELOG_ALLMASK (BIT(CL_LAST) - 1) +/** default \a changelog_rec_type mask. Allow all of them, except + * CL_ATIME since it can really be time consuming, and not necessary + * under normal use. + * Remove also CL_OPEN, CL_GETXATTR and CL_DN_OPEN from default list as it can + * be costly and only necessary for audit purpose. + */ +#define CHANGELOG_DEFMASK (CHANGELOG_ALLMASK & \ + ~(BIT(CL_ATIME) | BIT(CL_OPEN) | BIT(CL_GETXATTR) | BIT(CL_DN_OPEN))) + +/* changelog llog name, needed by client replicators */ +#define CHANGELOG_CATALOG "changelog_catalog" + +struct changelog_setinfo { + __u64 cs_recno; + __u32 cs_id; +} __attribute__((packed)); + +/** changelog record */ +struct llog_changelog_rec { + struct llog_rec_hdr cr_hdr; + struct changelog_rec cr; /**< Variable length field */ + struct llog_rec_tail cr_do_not_use; /**< for_sizeof_only */ +} __attribute__((packed)); + +#define CHANGELOG_USER_PREFIX "cl" +#define CHANGELOG_USER_NAMELEN 16 /* base name including NUL terminator */ +#define CHANGELOG_USER_NAMELEN_FULL 30 /* basename plus 'cl$ID-' prefix */ + +struct llog_changelog_user_rec { + struct llog_rec_hdr cur_hdr; + __u32 cur_id; + /* only intended to be used in relative time comparisons to + * detect idle users */ + __u32 cur_time; + __u64 cur_endrec; + struct llog_rec_tail cur_tail; +} __attribute__((packed)); + +/* this is twice the size of CHANGELOG_USER_REC */ +struct llog_changelog_user_rec2 { + struct llog_rec_hdr cur_hdr; + __u32 cur_id; + /* only for use in relative time comparisons to detect idle users */ + __u32 cur_time; + __u64 cur_endrec; + __u32 cur_mask; + __u32 cur_padding1; + char cur_name[CHANGELOG_USER_NAMELEN]; + __u64 cur_padding2; + __u64 cur_padding3; + struct llog_rec_tail cur_tail; +} __attribute__((packed)); + +enum agent_req_status { + ARS_WAITING, + ARS_STARTED, + ARS_FAILED, + ARS_CANCELED, + ARS_SUCCEED, +}; + +static inline const char *agent_req_status2name(enum agent_req_status ars) +{ + switch (ars) { + case ARS_WAITING: + return "WAITING"; + case ARS_STARTED: + return "STARTED"; + case ARS_FAILED: + return "FAILED"; + case ARS_CANCELED: + return "CANCELED"; + case ARS_SUCCEED: + return "SUCCEED"; + default: + return "UNKNOWN"; + } +} + +struct llog_agent_req_rec { + struct llog_rec_hdr arr_hdr; /**< record header */ + __u32 arr_status; /**< status of the request */ + /* must match enum + * agent_req_status */ + __u32 arr_archive_id; /**< backend archive number */ + __u64 arr_flags; /**< req flags */ + __u64 arr_compound_id; /**< compound cookie, ignored */ + __u64 arr_req_create; /**< req. creation time */ + __u64 arr_req_change; /**< req. status change time */ + struct hsm_action_item arr_hai; /**< req. 
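A one-line usage sketch for the changelog masks above: testing whether a record type passes a user's mask, with BIT(n) written out as a shift.

	static bool changelog_rec_type_enabled(__u64 mask, unsigned int rtype)
	{
		return (mask & (1ULL << rtype)) != 0; /* BIT(rtype) */
	}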
to the agent */
+	struct llog_rec_tail arr_tail; /**< record tail for_sizeof_only */
+} __attribute__((packed));
+
+/* Old llog gen for compatibility */
+struct llog_gen {
+	__u64 mnt_cnt;
+	__u64 conn_cnt;
+} __attribute__((packed));
+
+struct llog_gen_rec {
+	struct llog_rec_hdr lgr_hdr;
+	struct llog_gen lgr_gen;
+	__u64 padding1;
+	__u64 padding2;
+	__u64 padding3;
+	struct llog_rec_tail lgr_tail;
+};
+
+/* flags for the logs */
+enum llog_flag {
+	LLOG_F_ZAP_WHEN_EMPTY = 0x1,
+	LLOG_F_IS_CAT = 0x2,
+	LLOG_F_IS_PLAIN = 0x4,
+	LLOG_F_EXT_JOBID = 0x8,
+	LLOG_F_IS_FIXSIZE = 0x10,
+	LLOG_F_EXT_EXTRA_FLAGS = 0x20,
+	LLOG_F_EXT_X_UIDGID = 0x40,
+	LLOG_F_EXT_X_NID = 0x80,
+	LLOG_F_EXT_X_OMODE = 0x100,
+	LLOG_F_EXT_X_XATTR = 0x200,
+	LLOG_F_RM_ON_ERR = 0x400,
+
+	/* Note: Flags covered by LLOG_F_EXT_MASK will be inherited from
+	 * catlog to plain log, so do not add LLOG_F_IS_FIXSIZE here,
+	 * because the catlog record is usually fixed size, but its plain
+	 * log record can be variable */
+	LLOG_F_EXT_MASK = LLOG_F_EXT_JOBID | LLOG_F_EXT_EXTRA_FLAGS |
+			  LLOG_F_EXT_X_UIDGID | LLOG_F_EXT_X_NID |
+			  LLOG_F_EXT_X_OMODE | LLOG_F_EXT_X_XATTR,
+};
+
+/* means first record of catalog */
+enum {
+	LLOG_CAT_FIRST = -1,
+};
+
+/* On-disk header structure of each log object, stored in little endian order */
+#define LLOG_MIN_CHUNK_SIZE 8192
+#define LLOG_HEADER_SIZE (96) /* sizeof (llog_log_hdr) + sizeof(llh_tail)
+			       * - sizeof(llh_bitmap) */
+#define LLOG_BITMAP_BYTES (LLOG_MIN_CHUNK_SIZE - LLOG_HEADER_SIZE)
+#define LLOG_MIN_REC_SIZE (24) /* round(llog_rec_hdr + llog_rec_tail) */
+
+struct llog_log_hdr {
+	struct llog_rec_hdr llh_hdr;
+	__s64 llh_timestamp;
+	__u32 llh_count;
+	__u32 llh_bitmap_offset;
+	__u32 llh_size;
+	__u32 llh_flags;
+	/* for a catalog the first/oldest and still in-use plain slot is just
+	 * next to it. It will serve as the upper limit after Catalog has
+	 * wrapped around */
+	__u32 llh_cat_idx;
+	struct obd_uuid llh_tgtuuid;
+	__u32 llh_reserved[LLOG_HEADER_SIZE/sizeof(__u32)-23];
+	/* These fields must always be at the end of the llog_log_hdr.
+	 * Note: llh_bitmap size is variable because the llog chunk size could
+	 * be bigger than LLOG_MIN_CHUNK_SIZE, i.e. sizeof(llog_log_hdr) > 8192
+	 * bytes, and the real size is stored in llh_hdr.lrh_len, which means
+	 * llh_tail should only be referred to via LLOG_HDR_TAIL().
+	 * But this structure is also used by the client/server llog interface
+	 * (see llog_client.c), so it is kept in its original form to avoid
+	 * compatibility issues.
*/ + __u32 llh_bitmap[LLOG_BITMAP_BYTES / sizeof(__u32)]; + struct llog_rec_tail llh_tail; +} __attribute__((packed)); +#undef LLOG_HEADER_SIZE +#undef LLOG_BITMAP_BYTES + +#define LLOG_HDR_BITMAP_SIZE(llh) (__u32)((llh->llh_hdr.lrh_len - \ + llh->llh_bitmap_offset - \ + sizeof(llh->llh_tail)) * 8) +#define LLOG_HDR_BITMAP(llh) (__u32 *)((char *)(llh) + \ + (llh)->llh_bitmap_offset) +#define LLOG_HDR_TAIL(llh) ((struct llog_rec_tail *)((char *)llh + \ + llh->llh_hdr.lrh_len - \ + sizeof(llh->llh_tail))) + +/** log cookies are used to reference a specific log file and a record therein, + and pass record offset from llog_process_thread to llog_write */ +struct llog_cookie { + union { + struct llog_logid lgc_lgl; + __u64 lgc_offset; + }; + __u32 lgc_subsys; + __u32 lgc_index; + __u32 lgc_padding; +} __attribute__((packed)); + +/** llog protocol */ +enum llogd_rpc_ops { + LLOG_ORIGIN_HANDLE_CREATE = 501, + LLOG_ORIGIN_HANDLE_NEXT_BLOCK = 502, + LLOG_ORIGIN_HANDLE_READ_HEADER = 503, +/* LLOG_ORIGIN_HANDLE_WRITE_REC = 504, Obsolete by 2.1. */ +/* LLOG_ORIGIN_HANDLE_CLOSE = 505, Obsolete by 1.8. */ +/* LLOG_ORIGIN_CONNECT = 506, Obsolete by 2.4. */ +/* LLOG_CATINFO = 507, Obsolete by 2.3. */ + LLOG_ORIGIN_HANDLE_PREV_BLOCK = 508, + LLOG_ORIGIN_HANDLE_DESTROY = 509, /* Obsolete by 2.11. */ + LLOG_LAST_OPC, + LLOG_FIRST_OPC = LLOG_ORIGIN_HANDLE_CREATE +}; + +struct llogd_body { + struct llog_logid lgd_logid; + __u32 lgd_ctxt_idx; + __u32 lgd_llh_flags; + __u32 lgd_index; + __u32 lgd_saved_index; + __u32 lgd_len; + __u64 lgd_cur_offset; +} __attribute__((packed)); + +struct llogd_conn_body { + struct llog_gen lgdc_gen; + struct llog_logid lgdc_logid; + __u32 lgdc_ctxt_idx; +} __attribute__((packed)); + +/* Note: 64-bit types are 64-bit aligned in structure */ +struct obdo { + __u64 o_valid; /* hot fields in this obdo */ + struct ost_id o_oi; + __u64 o_parent_seq; + __u64 o_size; /* o_size-o_blocks == ost_lvb */ + __s64 o_mtime; + __s64 o_atime; + __s64 o_ctime; + __u64 o_blocks; /* brw: cli sent cached bytes */ + __u64 o_grant; + + /* 32-bit fields start here: keep an even number of them via padding */ + __u32 o_blksize; /* optimal IO blocksize */ + __u32 o_mode; /* brw: cli sent cache remain */ + __u32 o_uid; + __u32 o_gid; + __u32 o_flags; + __u32 o_nlink; /* brw: checksum */ + __u32 o_parent_oid; + __u32 o_misc; /* brw: o_dropped */ + + __u64 o_ioepoch; /* epoch in ost writes */ + __u32 o_stripe_idx; /* holds stripe idx */ + __u32 o_parent_ver; + struct lustre_handle o_handle; /* brw: lock handle to prolong + * locks */ + /* Originally, the field is llog_cookie for destroy with unlink cookie + * from MDS, it is obsolete in 2.8. Then reuse it by client to transfer + * layout and PFL information in IO, setattr RPCs. Since llog_cookie is + * not used on wire any longer, remove it from the obdo, then it can be + * enlarged freely in the further without affect related RPCs. + * + * sizeof(ost_layout) + sieof(__u32) == sizeof(llog_cookie). */ + struct ost_layout o_layout; + __u32 o_layout_version; + __u32 o_uid_h; + __u32 o_gid_h; + + __u64 o_data_version; /* getattr: sum of iversion for + * each stripe. 
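To illustrate the LLOG_HDR_BITMAP accessors just defined, a sketch of testing whether a record index is still live in an in-memory (CPU-endian) header; real code goes through the kernel bitmap primitives.

	static bool llog_rec_index_is_set(struct llog_log_hdr *llh,
					  __u32 index)
	{
		__u32 *bitmap = LLOG_HDR_BITMAP(llh);

		if (index >= LLOG_HDR_BITMAP_SIZE(llh))
			return false;
		return (bitmap[index / 32] >> (index % 32)) & 1;
	}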
+ * brw: grant space consumed on + * the client for the write */ + __u32 o_projid; + __u32 o_padding_4; /* also fix + * lustre_swab_obdo() */ + __u64 o_padding_5; + __u64 o_padding_6; +}; + +#define o_dirty o_blocks +#define o_undirty o_mode +#define o_dropped o_misc +#define o_cksum o_nlink +#define o_grant_used o_data_version +#define o_falloc_mode o_nlink + +struct lfsck_request { + __u32 lr_event; + __u32 lr_index; + __u32 lr_flags; + __u32 lr_valid; + union { + __u32 lr_speed; + __u32 lr_status; + }; + __u16 lr_version; + __u16 lr_active; + __u16 lr_param; + __u16 lr_async_windows; + __u32 lr_flags2; + struct lu_fid lr_fid; + struct lu_fid lr_fid2; + __u32 lr_comp_id; + __u32 lr_padding_0; + __u64 lr_padding_1; + __u64 lr_padding_2; + __u64 lr_padding_3; +}; + +struct lfsck_reply { + __u32 lr_status; + __u32 lr_padding_1; + __u64 lr_repaired; +}; + +enum lfsck_events { + LE_LASTID_REBUILDING = 1, + LE_LASTID_REBUILT = 2, + LE_PHASE1_DONE = 3, + LE_PHASE2_DONE = 4, + LE_START = 5, + LE_STOP = 6, + LE_QUERY = 7, + /* LE_FID_ACCESSED = 8, moved to lfsck_events_local */ + LE_PEER_EXIT = 9, + LE_CONDITIONAL_DESTROY = 10, + LE_PAIRS_VERIFY = 11, + LE_SET_LMV_MASTER = 15, + LE_SET_LMV_SLAVE = 16, +}; + +enum lfsck_event_flags { + LEF_TO_OST = 0x00000001, + LEF_FROM_OST = 0x00000002, + LEF_SET_LMV_HASH = 0x00000004, + LEF_SET_LMV_ALL = 0x00000008, + LEF_RECHECK_NAME_HASH = 0x00000010, + LEF_QUERY_ALL = 0x00000020, +}; + +/* request structure for OST's */ +struct ost_body { + struct obdo oa; +}; + +/* Key for FIEMAP to be used in get_info calls */ +struct ll_fiemap_info_key { + char lfik_name[8]; + struct obdo lfik_oa; + struct fiemap lfik_fiemap; +}; + +#define IDX_INFO_MAGIC 0x3D37CC37 + +/* Index file transfer through the network. The server serializes the index into + * a byte stream which is sent to the client via a bulk transfer */ +struct idx_info { + __u32 ii_magic; + + /* reply: see idx_info_flags below */ + __u32 ii_flags; + + /* request & reply: number of lu_idxpage (to be) transferred */ + __u16 ii_count; + __u16 ii_pad0; + + /* request: requested attributes passed down to the iterator API */ + __u32 ii_attrs; + + /* request & reply: index file identifier (FID) */ + struct lu_fid ii_fid; + + /* reply: version of the index file before starting to walk the index. 
+ * Please note that the version can be modified at any time during the + * transfer */ + __u64 ii_version; + + /* request: hash to start with: + * reply: hash of the first entry of the first lu_idxpage and hash + * of the entry to read next if any */ + __u64 ii_hash_start; + __u64 ii_hash_end; + + /* reply: size of keys in lu_idxpages, minimal one if II_FL_VARKEY is + * set */ + __u16 ii_keysize; + + /* reply: size of records in lu_idxpages, minimal one if II_FL_VARREC + * is set */ + __u16 ii_recsize; + + __u32 ii_pad1; + __u64 ii_pad2; + __u64 ii_pad3; +}; + +#define II_END_OFF MDS_DIR_END_OFF /* all entries have been read */ + +/* List of flags used in idx_info::ii_flags */ +enum idx_info_flags { + II_FL_NOHASH = 1 << 0, /* client doesn't care about hash value */ + II_FL_VARKEY = 1 << 1, /* keys can be of variable size */ + II_FL_VARREC = 1 << 2, /* records can be of variable size */ + II_FL_NONUNQ = 1 << 3, /* index supports non-unique keys */ + II_FL_NOKEY = 1 << 4, /* client doesn't care about key */ +}; + +#define LIP_MAGIC 0x8A6D6B6C + +/* 4KB (= LU_PAGE_SIZE) container gathering key/record pairs */ +struct lu_idxpage { + /* 16-byte header */ + __u32 lip_magic; + __u16 lip_flags; + __u16 lip_nr; /* number of entries in the container */ + __u64 lip_pad0; /* additional padding for future use */ + + /* key/record pairs are stored in the remaining 4080 bytes. + * depending upon the flags in idx_info::ii_flags, each key/record + * pair might be preceded by: + * - a hash value + * - the key size (II_FL_VARKEY is set) + * - the record size (II_FL_VARREC is set) + * + * For the time being, we only support fixed-size key & record. */ + char lip_entries[0]; +}; + +#define LIP_HDR_SIZE (offsetof(struct lu_idxpage, lip_entries)) + +/* Gather all possible type associated with a 4KB container */ +union lu_page { + struct lu_dirpage lp_dir; /* for MDS_READPAGE */ + struct lu_idxpage lp_idx; /* for OBD_IDX_READ */ + char lp_array[LU_PAGE_SIZE]; +}; + +/* security opcodes */ +enum sec_cmd { + SEC_CTX_INIT = 801, + SEC_CTX_INIT_CONT = 802, + SEC_CTX_FINI = 803, + SEC_LAST_OPC, + SEC_FIRST_OPC = SEC_CTX_INIT +}; + +/** The link ea holds 1 \a link_ea_entry for each hardlink */ +#define LINK_EA_MAGIC 0x11EAF1DFUL +struct link_ea_header { + __u32 leh_magic; + __u32 leh_reccount; + __u64 leh_len; /* total size */ + __u32 leh_overflow_time; + __u32 leh_padding; +}; + +/** Hardlink data is name and parent fid. 
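A usage sketch for lu_idxpage: with the fixed-size layout described above (no II_FL_VARKEY/II_FL_VARREC, and assuming no per-entry hash is included), the i-th key/record pair can be located directly from the sizes carried in idx_info.

	static const char *lip_entry(const struct lu_idxpage *lip,
				     const struct idx_info *ii,
				     unsigned int i)
	{
		size_t pair = (size_t)ii->ii_keysize + ii->ii_recsize;

		if (i >= lip->lip_nr)
			return NULL;
		return lip->lip_entries + (size_t)i * pair;
	}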
+ * Stored in this crazy struct for maximum packing and endian-neutrality
+ */
+struct link_ea_entry {
+	/** __u16 stored big-endian, unaligned */
+	unsigned char lee_reclen[2];
+	unsigned char lee_parent_fid[sizeof(struct lu_fid)];
+	char lee_name[0];
+} __attribute__((packed));
+
+/** fid2path request/reply structure */
+struct getinfo_fid2path {
+	struct lu_fid gf_fid;
+	__u64 gf_recno;
+	__u32 gf_linkno;
+	__u32 gf_pathlen;
+	union {
+		char gf_path[0];
+		struct lu_fid gf_root_fid[0];
+	} gf_u;
+} __attribute__((packed));
+
+/** path2parent request/reply structures */
+struct getparent {
+	struct lu_fid gp_fid;	/**< parent FID */
+	__u32 gp_linkno;	/**< hardlink number */
+	__u32 gp_name_size;	/**< size of the name field */
+	char gp_name[0];	/**< zero-terminated link name */
+} __attribute__((packed));
+
+enum layout_intent_opc {
+	LAYOUT_INTENT_ACCESS = 0,	/** generic access */
+	LAYOUT_INTENT_READ = 1,		/** not used */
+	LAYOUT_INTENT_WRITE = 2,	/** write file, for comp layout */
+	LAYOUT_INTENT_GLIMPSE = 3,	/** not used */
+	LAYOUT_INTENT_TRUNC = 4,	/** truncate file, for comp layout */
+	LAYOUT_INTENT_RELEASE = 5,	/** reserved for HSM release */
+	LAYOUT_INTENT_RESTORE = 6,	/** reserved for HSM restore */
+};
+
+/* enqueue layout lock with intent */
+struct layout_intent {
+	__u32 li_opc;	/* intent operation for enqueue, read, write etc */
+	__u32 li_flags;
+	struct lu_extent li_extent;
+} __attribute__((packed));
+
+/**
+ * On the wire version of hsm_progress structure.
+ *
+ * Contains the userspace hsm_progress and some internal fields.
+ */
+struct hsm_progress_kernel {
+	/* Field taken from struct hsm_progress */
+	struct lu_fid hpk_fid;
+	__u64 hpk_cookie;
+	struct hsm_extent hpk_extent;
+	__u16 hpk_flags;
+	__u16 hpk_errval; /* positive val */
+	__u32 hpk_padding1;
+	/* Additional fields */
+	__u64 hpk_data_version;
+	__u64 hpk_padding2;
+} __attribute__((packed));
+
+/**
+ * OUT_UPDATE RPC Format
+ *
+ * During a cross-ref operation, the master MDT, to which the client sends
+ * the request, disassembles the operation into object updates; OSP then
+ * sends these updates to the remote MDT to be executed.
+ *
+ * An UPDATE_OBJ RPC carries a list of updates. Each update belongs to an
+ * operation and does a type of modification to an object.
+ *
+ * Request Format
+ *
+ *   update_buf
+ *   update (1st)
+ *   update (2nd)
+ *   ...
+ *   update (ub_count-th)
+ *
+ * ub_count must be less than or equal to UPDATE_PER_RPC_MAX.
+ *
+ * Reply Format
+ *
+ *   update_reply
+ *   rc [+ buffers] (1st)
+ *   rc [+ buffers] (2nd)
+ *   ...
+ *   rc [+ buffers] (nr_count-th)
+ *
+ * ur_count must be less than or equal to UPDATE_PER_RPC_MAX and should
+ * usually be equal to ub_count.
+ */
+
+/**
+ * Type of each update, if adding/deleting update, please also update
+ * update_opcode in lustre/target/out_lib.c.
+ */ +enum update_type { + OUT_START = 0, + OUT_CREATE = 1, + OUT_DESTROY = 2, + OUT_REF_ADD = 3, + OUT_REF_DEL = 4, + OUT_ATTR_SET = 5, + OUT_ATTR_GET = 6, + OUT_XATTR_SET = 7, + OUT_XATTR_GET = 8, + OUT_INDEX_LOOKUP = 9, + OUT_INDEX_INSERT = 10, + OUT_INDEX_DELETE = 11, + OUT_WRITE = 12, + OUT_XATTR_DEL = 13, + OUT_PUNCH = 14, + OUT_READ = 15, + OUT_NOOP = 16, + OUT_XATTR_LIST = 17, + OUT_LAST +}; + +enum update_flag { + UPDATE_FL_OST = 0x00000001, /* op from OST (not MDT) */ + UPDATE_FL_SYNC = 0x00000002, /* commit before replying */ + UPDATE_FL_COMMITTED = 0x00000004, /* op committed globally */ + UPDATE_FL_NOLOG = 0x00000008 /* for idempotent updates */ +}; + +struct object_update_param { + __u16 oup_len; /* length of this parameter */ + __u16 oup_padding; + __u32 oup_padding2; + char oup_buf[0]; +} __attribute__((packed)); + +/* object update */ +struct object_update { + __u16 ou_type; /* enum update_type */ + __u16 ou_params_count; /* update parameters count */ + __u32 ou_result_size; /* how many bytes can return */ + __u32 ou_flags; /* enum update_flag */ + __u32 ou_padding1; /* padding 1 */ + __u64 ou_batchid; /* op transno on master */ + struct lu_fid ou_fid; /* object to be updated */ + struct object_update_param ou_params[0]; /* update params */ +}; + +#define UPDATE_REQUEST_MAGIC_V1 0xBDDE0001 +#define UPDATE_REQUEST_MAGIC_V2 0xBDDE0002 +#define UPDATE_REQUEST_MAGIC UPDATE_REQUEST_MAGIC_V2 +/* Hold object_updates sending to the remote OUT in single RPC */ +struct object_update_request { + __u32 ourq_magic; + __u16 ourq_count; /* number of ourq_updates[] */ + __u16 ourq_padding; + struct object_update ourq_updates[0]; +}; + +#define OUT_UPDATE_HEADER_MAGIC 0xBDDF0001 +#define OUT_UPDATE_MAX_INLINE_SIZE 4096 +/* Header for updates request between MDTs */ +struct out_update_header { + __u32 ouh_magic; + __u32 ouh_count; + __u32 ouh_inline_length; + __u32 ouh_reply_size; + __u32 ouh_inline_data[0]; +}; + +struct out_update_buffer { + __u32 oub_size; + __u32 oub_padding; +}; + +/* the result of object update */ +struct object_update_result { + __u32 our_rc; + __u16 our_datalen; + __u16 our_padding; + __u32 our_data[0]; +}; + +#define UPDATE_REPLY_MAGIC_V1 0x00BD0001 +#define UPDATE_REPLY_MAGIC_V2 0x00BD0002 +#define UPDATE_REPLY_MAGIC UPDATE_REPLY_MAGIC_V2 +/* Hold object_update_results being replied from the remote OUT. 
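A sketch of walking the variable-length layout above: each object_update is followed by ou_params_count object_update_param entries, so its size must be computed rather than taken from sizeof. An actual implementation would also round each parameter up to the wire alignment; that detail is omitted here.

	static size_t object_update_size(const struct object_update *ou)
	{
		const char *base = (const char *)ou;
		size_t size = sizeof(*ou);
		unsigned int i;

		for (i = 0; i < ou->ou_params_count; i++) {
			const struct object_update_param *oup =
				(const struct object_update_param *)
				(base + size);

			size += sizeof(*oup) + oup->oup_len;
		}
		return size;
	}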
*/ +struct object_update_reply { + __u32 ourp_magic; + __u16 ourp_count; + __u16 ourp_padding; + __u16 ourp_lens[0]; +}; + +/* read update result */ +struct out_read_reply { + __u32 orr_size; + __u32 orr_padding; + __u64 orr_offset; + char orr_data[0]; +}; + +/** layout swap request structure + * fid1 and fid2 are in mdt_body + */ +struct mdc_swap_layouts { + __u64 msl_flags; +} __attribute__((packed)); + +#define INLINE_RESYNC_ARRAY_SIZE 15 +struct close_data_resync_done { + __u32 resync_count; + __u32 resync_ids_inline[INLINE_RESYNC_ARRAY_SIZE]; +}; + +struct close_data { + struct lustre_handle cd_handle; + struct lu_fid cd_fid; + __u64 cd_data_version; + union { + __u64 cd_reserved[8]; + struct close_data_resync_done cd_resync; + /* split close */ + __u16 cd_mirror_id; + /* PCC release */ + __u32 cd_archive_id; + }; +}; + +/* Update llog format */ +struct update_op { + struct lu_fid uop_fid; + __u16 uop_type; + __u16 uop_param_count; + __u16 uop_params_off[]; +} __attribute__((packed)); + +struct update_ops { + struct update_op uops_op[0]; +}; + +struct update_params { + struct object_update_param up_params[0]; +}; + +enum update_records_flag { + UPDATE_RECORD_CONTINUE = 1 >> 0, +}; +/* + * This is the update record format used to store the updates in + * disk. All updates of the operation will be stored in ur_ops. + * All of parameters for updates of the operation will be stored + * in ur_params. + * To save the space of the record, parameters in ur_ops will only + * remember their offset in ur_params, so to avoid storing duplicate + * parameters in ur_params, which can help us save a lot space for + * operation like creating striped directory. + */ +struct update_records { + __u64 ur_master_transno; + __u64 ur_batchid; + __u32 ur_flags; + /* If the operation includes multiple updates, then ur_index + * means the index of the update inside the whole updates. 
*/ + __u32 ur_index; + __u32 ur_update_count; + __u32 ur_param_count; + struct update_ops ur_ops; + /* Note ur_ops has a variable size, so comment out + * the following ur_params, in case some use it directly + * update_records->ur_params + * + * struct update_params ur_params; + */ +}; + +struct llog_update_record { + struct llog_rec_hdr lur_hdr; + struct update_records lur_update_rec; + /* Note ur_update_rec has a variable size, so comment out + * the following ur_tail, in case someone use it directly + * + * struct llog_rec_tail lur_tail; + */ +}; + +/* sepol string format is: + * <1-digit for SELinux status>::: + */ +/* Max length of the sepol string + * Should be large enough to contain a sha512sum of the policy + */ +#define SELINUX_MODE_LEN 1 +#define SELINUX_POLICY_VER_LEN 3 /* 3 chars to leave room for the future */ +#define SELINUX_POLICY_HASH_LEN 64 +#define LUSTRE_NODEMAP_SEPOL_LENGTH (SELINUX_MODE_LEN + NAME_MAX + \ + SELINUX_POLICY_VER_LEN + \ + SELINUX_POLICY_HASH_LEN + 3) + +/* nodemap records, uses 32 byte record length */ +#define LUSTRE_NODEMAP_NAME_LENGTH 16 +struct nodemap_cluster_rec { + char ncr_name[LUSTRE_NODEMAP_NAME_LENGTH + 1]; + __u8 ncr_flags; + __u16 ncr_padding1; + __u32 ncr_squash_projid; + __u32 ncr_squash_uid; + __u32 ncr_squash_gid; +}; + +/* lnet_nid_t is 8 bytes */ +struct nodemap_range_rec { + lnet_nid_t nrr_start_nid; + lnet_nid_t nrr_end_nid; + __u64 nrr_padding1; + __u64 nrr_padding2; +}; + +struct nodemap_id_rec { + __u32 nir_id_fs; + __u32 nir_padding1; + __u64 nir_padding2; + __u64 nir_padding3; + __u64 nir_padding4; +}; + +struct nodemap_global_rec { + __u8 ngr_is_active; + __u8 ngr_padding1; + __u16 ngr_padding2; + __u32 ngr_padding3; + __u64 ngr_padding4; + __u64 ngr_padding5; + __u64 ngr_padding6; +}; + +union nodemap_rec { + struct nodemap_cluster_rec ncr; + struct nodemap_range_rec nrr; + struct nodemap_id_rec nir; + struct nodemap_global_rec ngr; +}; + +/* + * rawobj stuff for GSS + */ +typedef struct netobj_s { + __u32 len; + __u8 data[0]; +} netobj_t; + +typedef struct rawobj_s { + __u32 len; + __u8 *data; +} rawobj_t; + +/* + * GSS headers + * following 3 headers must have the same sizes and offsets + */ +struct gss_header { + __u8 gh_version; /* gss version */ + __u8 gh_sp; /* sec part */ + __u16 gh_pad0; + __u32 gh_flags; /* wrap flags */ + __u32 gh_proc; /* proc */ + __u32 gh_seq; /* sequence */ + __u32 gh_svc; /* service */ + __u32 gh_pad1; + __u32 gh_pad2; + __u32 gh_pad3; + netobj_t gh_handle; /* context handle */ +}; + +struct gss_rep_header { + __u8 gh_version; + __u8 gh_sp; + __u16 gh_pad0; + __u32 gh_flags; + __u32 gh_proc; + __u32 gh_major; + __u32 gh_minor; + __u32 gh_seqwin; + __u32 gh_pad2; + __u32 gh_pad3; + netobj_t gh_handle; +}; + +struct gss_err_header { + __u8 gh_version; + __u8 gh_sp; + __u16 gh_pad0; + __u32 gh_flags; + __u32 gh_proc; + __u32 gh_major; + __u32 gh_minor; + __u32 gh_pad1; + __u32 gh_pad2; + __u32 gh_pad3; + netobj_t gh_handle; +}; + +/* + * GSS part of wire context information sent from client, saved and + * used later by server. + */ +struct gss_wire_ctx { + __u32 gw_flags; + __u32 gw_proc; + __u32 gw_seq; + __u32 gw_svc; + rawobj_t gw_handle; +}; + +/* This is the lu_ladvise struct which goes out on the wire. + * Corresponds to the userspace arg llapi_lu_ladvise. 
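The update_records note earlier in this section warns that ur_params cannot simply be declared after the variable-size ur_ops; a sketch of how the parameter block is located at runtime instead, where the caller supplies the total size of ur_ops including its uops_op[] payload:

	static struct update_params *
	update_records_params(struct update_records *record, size_t ops_size)
	{
		return (struct update_params *)
			((char *)&record->ur_ops + ops_size);
	}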
+ * value[1-4] are unspecified fields, used differently by different advices */ +struct lu_ladvise { + __u16 lla_advice; /* advice type */ + __u16 lla_value1; /* values for different advice types */ + __u32 lla_value2; + __u64 lla_start; /* first byte of extent for advice */ + __u64 lla_end; /* last byte of extent for advice */ + __u32 lla_value3; + __u32 lla_value4; +}; + +/* This is the ladvise_hdr which goes on the wire, corresponds to the userspace + * arg llapi_ladvise_hdr. + * value[1-3] are unspecified fields, used differently by different advices */ +struct ladvise_hdr { + __u32 lah_magic; /* LADVISE_MAGIC */ + __u32 lah_count; /* number of advices */ + __u64 lah_flags; /* from enum ladvise_flag */ + __u32 lah_value1; /* unused */ + __u32 lah_value2; /* unused */ + __u64 lah_value3; /* unused */ + struct lu_ladvise lah_advise[0]; /* advices in this header */ +}; + +#if defined(__cplusplus) +} +#endif + +#endif +/** @} lustreidl */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ioctl.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ioctl.h new file mode 100644 index 0000000000000..bfd91cf52e6fb --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ioctl.h @@ -0,0 +1,231 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, 2017, Intel Corporation. 
+ */ +#ifndef _UAPI_LUSTRE_IOCTL_H +#define _UAPI_LUSTRE_IOCTL_H + +#include +#include +#include +#include + +/* + * sparse kernel source annotations + */ +#ifndef __user +#define __user +#endif + +enum md_echo_cmd { + ECHO_MD_CREATE = 1, /* Open/Create file on MDT */ + ECHO_MD_MKDIR = 2, /* Mkdir on MDT */ + ECHO_MD_DESTROY = 3, /* Unlink file on MDT */ + ECHO_MD_RMDIR = 4, /* Rmdir on MDT */ + ECHO_MD_LOOKUP = 5, /* Lookup on MDT */ + ECHO_MD_GETATTR = 6, /* Getattr on MDT */ + ECHO_MD_SETATTR = 7, /* Setattr on MDT */ + ECHO_MD_ALLOC_FID = 8, /* Get FIDs from MDT */ +}; + +#define OBD_DEV_ID 1 +#define OBD_DEV_NAME "obd" +#define OBD_DEV_PATH "/dev/" OBD_DEV_NAME + +#define OBD_IOCTL_VERSION 0x00010004 +#define OBD_DEV_BY_DEVNAME 0xffffd0de + +struct obd_ioctl_data { + __u32 ioc_len; + __u32 ioc_version; + + union { + __u64 ioc_cookie; + __u64 ioc_u64_1; + }; + union { + __u32 ioc_conn1; + __u32 ioc_u32_1; + }; + union { + __u32 ioc_conn2; + __u32 ioc_u32_2; + }; + + struct obdo ioc_obdo1; + struct obdo ioc_obdo2; + + __u64 ioc_count; + __u64 ioc_offset; + __u32 ioc_dev; + __u32 ioc_command; + + __u64 ioc_nid; + __u32 ioc_nal; + __u32 ioc_type; + + /* buffers the kernel will treat as user pointers */ + __u32 ioc_plen1; + char __user *ioc_pbuf1; + __u32 ioc_plen2; + char __user *ioc_pbuf2; + + /* inline buffers for various arguments */ + __u32 ioc_inllen1; + char *ioc_inlbuf1; + __u32 ioc_inllen2; + char *ioc_inlbuf2; + __u32 ioc_inllen3; + char *ioc_inlbuf3; + __u32 ioc_inllen4; + char *ioc_inlbuf4; + + char ioc_bulk[0]; +}; + +struct obd_ioctl_hdr { + __u32 ioc_len; + __u32 ioc_version; +}; + +static inline __u32 obd_ioctl_packlen(struct obd_ioctl_data *data) +{ + __u32 len = __ALIGN_KERNEL(sizeof(*data), 8); + + len += __ALIGN_KERNEL(data->ioc_inllen1, 8); + len += __ALIGN_KERNEL(data->ioc_inllen2, 8); + len += __ALIGN_KERNEL(data->ioc_inllen3, 8); + len += __ALIGN_KERNEL(data->ioc_inllen4, 8); + + return len; +} + +/* + * OBD_IOC_DATA_TYPE is only for compatibility reasons with older + * Linux Lustre user tools. New ioctls should NOT use this macro as + * the ioctl "size". Instead the ioctl should get a "size" argument + * which is the actual data type used by the ioctl, to ensure the + * ioctl interface is versioned correctly. 
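+ *
+ * For example, a hypothetical new ioctl (the name and number below are
+ * illustrative, not reserved) would encode its real argument type so the
+ * interface can be versioned by size:
+ *
+ *   struct obd_example_args {
+ *           __u32	oea_version;
+ *           __u32	oea_flags;
+ *   };
+ *   #define OBD_IOC_EXAMPLE	_IOWR('f', 255, struct obd_example_args)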
+ */ +#define OBD_IOC_DATA_TYPE long + +#define OBD_IOC_CREATE _IOWR('f', 101, OBD_IOC_DATA_TYPE) +#define OBD_IOC_DESTROY _IOW('f', 104, OBD_IOC_DATA_TYPE) +/* OBD_IOC_PREALLOCATE _IOWR('f', 105, OBD_IOC_DATA_TYPE) */ + +#define OBD_IOC_SETATTR _IOW('f', 107, OBD_IOC_DATA_TYPE) +#define OBD_IOC_GETATTR _IOWR('f', 108, OBD_IOC_DATA_TYPE) +#define OBD_IOC_READ _IOWR('f', 109, OBD_IOC_DATA_TYPE) +#define OBD_IOC_WRITE _IOWR('f', 110, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_STATFS _IOWR('f', 113, OBD_IOC_DATA_TYPE) +#define OBD_IOC_SYNC _IOW('f', 114, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_BRW_READ _IOWR('f', 125, OBD_IOC_DATA_TYPE) +#define OBD_IOC_BRW_WRITE _IOWR('f', 126, OBD_IOC_DATA_TYPE) +#define OBD_IOC_NAME2DEV _IOWR('f', 127, OBD_IOC_DATA_TYPE) +#define OBD_IOC_GETDTNAME _IOR('f', 127, char[MAX_OBD_NAME]) +/* ioctl codes 128-143 are reserved for fsverity */ +#define OBD_IOC_UUID2DEV _IOWR('f', 130, OBD_IOC_DATA_TYPE) +#define OBD_IOC_GETNAME_OLD _IOWR('f', 131, OBD_IOC_DATA_TYPE) +#define OBD_IOC_GETMDNAME _IOR('f', 131, char[MAX_OBD_NAME]) +/* OBD_IOC_LOV_GET_CONFIG _IOWR('f', 132, OBD_IOC_DATA_TYPE) until 2.14 */ +#define OBD_IOC_CLIENT_RECOVER _IOW('f', 133, OBD_IOC_DATA_TYPE) +/* ioctl codes 128-143 are reserved for fsverity */ +/* FS_IOC_ENABLE_VERITY _IOW('f', 133, struct fsverity_enable_arg) */ +/* FS_IOC_MEASURE_VERITY _IOW('f', 134, struct fsverity_digest) */ +/* was OBD_IOC_NO_TRANSNO _IOW('f', 140, OBD_IOC_DATA_TYPE) until 2.14 */ +#define OBD_IOC_SET_READONLY _IOW('f', 141, OBD_IOC_DATA_TYPE) +#define OBD_IOC_ABORT_RECOVERY _IOR('f', 142, OBD_IOC_DATA_TYPE) +enum obd_abort_recovery_flags { + OBD_FLG_ABORT_RECOV_OST = 0x00008, /* LMD_FLG_ABORT_RECOV */ + OBD_FLG_ABORT_RECOV_MDT = 0x40000, /* LMD_FLG_ABORT_RECOV_MDT */ +}; +/* ioctl codes 128-143 are reserved for fsverity */ +#define OBD_GET_VERSION _IOWR('f', 144, OBD_IOC_DATA_TYPE) +/* OBD_IOC_GSS_SUPPORT _IOWR('f', 145, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_CLOSE_UUID _IOWR('f', 147, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_CHANGELOG_SEND _IOW('f', 148, OBD_IOC_DATA_TYPE) */ +#define OBD_IOC_GETDEVICE _IOWR('f', 149, OBD_IOC_DATA_TYPE) +#define OBD_IOC_FID2PATH _IOWR('f', 150, OBD_IOC_DATA_TYPE) +/* lustre/lustre_user.h 151-153 */ +/* OBD_IOC_LOV_SETSTRIPE 154 LL_IOC_LOV_SETSTRIPE */ +/* OBD_IOC_LOV_GETSTRIPE 155 LL_IOC_LOV_GETSTRIPE */ +/* OBD_IOC_LOV_SETEA 156 LL_IOC_LOV_SETEA */ +/* lustre/lustre_user.h 157-159 */ +/* OBD_IOC_QUOTACHECK _IOW('f', 160, int) */ +/* OBD_IOC_POLL_QUOTACHECK _IOR('f', 161, struct if_quotacheck *) */ +#define OBD_IOC_QUOTACTL _IOWR('f', 162, struct if_quotactl) +/* lustre/lustre_user.h 163-176 */ +#define OBD_IOC_CHANGELOG_REG _IOW('f', 177, struct obd_ioctl_data) +#define OBD_IOC_CHANGELOG_DEREG _IOW('f', 178, struct obd_ioctl_data) +#define OBD_IOC_CHANGELOG_CLEAR _IOW('f', 179, struct obd_ioctl_data) +/* OBD_IOC_RECORD _IOWR('f', 180, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_ENDRECORD _IOWR('f', 181, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_PARSE _IOWR('f', 182, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_DORECORD _IOWR('f', 183, OBD_IOC_DATA_TYPE) */ +#define OBD_IOC_PROCESS_CFG _IOWR('f', 184, OBD_IOC_DATA_TYPE) +/* OBD_IOC_DUMP_LOG _IOWR('f', 185, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_CLEAR_LOG _IOWR('f', 186, OBD_IOC_DATA_TYPE) */ +#define OBD_IOC_PARAM _IOW('f', 187, OBD_IOC_DATA_TYPE) +#define OBD_IOC_POOL _IOWR('f', 188, OBD_IOC_DATA_TYPE) +#define OBD_IOC_REPLACE_NIDS _IOWR('f', 189, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_CATLOGLIST _IOWR('f', 190, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_INFO _IOWR('f', 191, 
OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_PRINT _IOWR('f', 192, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_CANCEL _IOWR('f', 193, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_REMOVE _IOWR('f', 194, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_CHECK _IOWR('f', 195, OBD_IOC_DATA_TYPE) +/* OBD_IOC_LLOG_CATINFO _IOWR('f', 196, OBD_IOC_DATA_TYPE) */ +#define OBD_IOC_NODEMAP _IOWR('f', 197, OBD_IOC_DATA_TYPE) +#define OBD_IOC_CLEAR_CONFIGS _IOWR('f', 198, OBD_IOC_DATA_TYPE) + +/* ECHO_IOC_GET_STRIPE _IOWR('f', 200, OBD_IOC_DATA_TYPE) */ +/* ECHO_IOC_SET_STRIPE _IOWR('f', 201, OBD_IOC_DATA_TYPE) */ +/* ECHO_IOC_ENQUEUE _IOWR('f', 202, OBD_IOC_DATA_TYPE) */ +/* ECHO_IOC_CANCEL _IOWR('f', 203, OBD_IOC_DATA_TYPE) */ + +#define OBD_IOC_LCFG_FORK _IOWR('f', 208, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LCFG_ERASE _IOWR('f', 209, OBD_IOC_DATA_TYPE) +#define OBD_IOC_GET_OBJ_VERSION _IOR('f', 210, OBD_IOC_DATA_TYPE) + +/* lustre/lustre_user.h 211-220 */ +/* was #define OBD_IOC_GET_MNTOPT _IOW('f', 220, mntopt_t) until 2.11 */ +#define OBD_IOC_ECHO_MD _IOR('f', 221, struct obd_ioctl_data) +#define OBD_IOC_ECHO_ALLOC_SEQ _IOWR('f', 222, struct obd_ioctl_data) +#define OBD_IOC_START_LFSCK _IOWR('f', 230, OBD_IOC_DATA_TYPE) +#define OBD_IOC_STOP_LFSCK _IOW('f', 231, OBD_IOC_DATA_TYPE) +#define OBD_IOC_QUERY_LFSCK _IOR('f', 232, struct obd_ioctl_data) +#define OBD_IOC_CHLG_POLL _IOR('f', 233, long) +/* lustre/lustre_user.h 240-249 */ +/* was LIBCFS_IOC_DEBUG_MASK _IOWR('f', 250, long) until 2.11 */ + +#define OBD_IOC_BARRIER _IOWR('f', 261, OBD_IOC_DATA_TYPE) + +#define IOC_OSC_SET_ACTIVE _IOWR('h', 21, void *) + +#endif /* _UAPI_LUSTRE_IOCTL_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_kernelcomm.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_kernelcomm.h new file mode 100644 index 0000000000000..26819ff7995cf --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_kernelcomm.h @@ -0,0 +1,98 @@ +/* + * LGPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. + * + * LGPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Author: Nathan Rutman + * + * Kernel <-> userspace communication routines. + * The definitions below are used in the kernel and userspace. + */ + +#ifndef __UAPI_KERNELCOMM_H__ +#define __UAPI_KERNELCOMM_H__ + +#include + +/* KUC message header. + * All current and future KUC messages should use this header. + * To avoid having to include Lustre headers from libcfs, define this here. 
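+ *
+ * Sketch of how a userspace consumer would walk a buffer of KUC messages
+ * (illustrative only; buf, len and handle_message() are hypothetical):
+ *
+ *   struct kuc_hdr *kh = (struct kuc_hdr *)buf;
+ *   while ((char *)kh + sizeof(*kh) <= buf + len &&
+ *          kh->kuc_magic == KUC_MAGIC) {
+ *           handle_message(kh);
+ *           kh = (struct kuc_hdr *)((char *)kh + kh->kuc_msglen);
+ *   }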
+ */ +struct kuc_hdr { + __u16 kuc_magic; + __u8 kuc_transport; /* Each new Lustre feature should use a different + transport */ + __u8 kuc_flags; + __u16 kuc_msgtype; /* Message type or opcode, transport-specific */ + __u16 kuc_msglen; /* Including header */ +} __attribute__((aligned(sizeof(__u64)))); + + +#define KUC_MAGIC 0x191C /*Lustre9etLinC */ + +/* kuc_msgtype values are defined in each transport */ +enum kuc_transport_type { + KUC_TRANSPORT_GENERIC = 1, + KUC_TRANSPORT_HSM = 2, +}; + +enum kuc_generic_message_type { + KUC_MSG_SHUTDOWN = 1, +}; + +/* KUC Broadcast Groups. This determines which userspace process hears which + * messages. Mutliple transports may be used within a group, or multiple + * groups may use the same transport. Broadcast + * groups need not be used if e.g. a UID is specified instead; + * use group 0 to signify unicast. + */ +#define KUC_GRP_HSM 0x02 +#define KUC_GRP_MAX KUC_GRP_HSM + +enum lk_flags { + LK_FLG_STOP = 0x0001, + LK_FLG_DATANR = 0x0002, +}; +#define LK_NOFD -1U + +/* kernelcomm control structure, passed from userspace to kernel. + * For compatibility with old copytools, users who pass ARCHIVE_IDs + * to kernel using lk_data_count and lk_data should fill lk_flags with + * LK_FLG_DATANR. Otherwise kernel will take lk_data_count as bitmap of + * ARCHIVE IDs. + */ +struct lustre_kernelcomm { + __u32 lk_wfd; + __u32 lk_rfd; + __u32 lk_uid; + __u32 lk_group; + __u32 lk_data_count; + __u32 lk_flags; + __u32 lk_data[0]; +} __attribute__((packed)); + +#endif /* __UAPI_KERNELCOMM_H__ */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_lfsck_user.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_lfsck_user.h new file mode 100644 index 0000000000000..68c8d3a1009c4 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_lfsck_user.h @@ -0,0 +1,238 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * lustre/include/lustre/lustre_lfsck_user.h + * + * Lustre LFSCK userspace interfaces. 
+ *
+ * Author: Fan, Yong
+ */
+
+#ifndef _LUSTRE_LFSCK_USER_H
+# define _LUSTRE_LFSCK_USER_H
+
+#include
+#include
+
+/**
+ * state machine:
+ *
+ *				LS_INIT
+ *				   |
+ *			     (lfsck|start)
+ *				   |
+ *				   v
+ *			  LS_SCANNING_PHASE1
+ *				|	^
+ *				|	:
+ *				| (lfsck:restart)
+ *				|	:
+ *				v	:
+ *	-----------------------------------------------------------------
+ *	|		    |^		|^	   |^	      |^	|^
+ *	|		    |:		|:	   |:	      |:	|:
+ *	v		    v:		v:	   v:	      v:	v:
+ * LS_SCANNING_PHASE2	LS_FAILED  LS_STOPPED  LS_PAUSED LS_CRASHED LS_PARTIAL
+ *			  (CO_)	     (CO_)	 (CO_)
+ *	|	^	    ^:		^:	   ^:	      ^:	^:
+ *	|	:	    |:		|:	   |:	      |:	|:
+ *	| (lfsck:restart)   |:		|:	   |:	      |:	|:
+ *	v	:	    |v		|v	   |v	      |v	|v
+ *	-----------------------------------------------------------------
+ *				   |
+ *				   v
+ *			    LS_COMPLETED
+ */
+enum lfsck_status {
+	/* The lfsck file is newly created, for a new MDT, upgrading from an
+	 * old disk, or re-creating the lfsck file manually. */
+	LS_INIT			= 0,
+
+	/* The first-step system scanning. Which items are checked during
+	 * phase1 scanning depends on the LFSCK type. */
+	LS_SCANNING_PHASE1	= 1,
+
+	/* The second-step system scanning. Which items are checked during
+	 * phase2 scanning depends on the LFSCK type. */
+	LS_SCANNING_PHASE2	= 2,
+
+	/* The LFSCK processing has completed for all objects. */
+	LS_COMPLETED		= 3,
+
+	/* The LFSCK exited automatically because of failure and will not
+	 * auto restart. */
+	LS_FAILED		= 4,
+
+	/* The LFSCK was stopped manually and will not auto restart. */
+	LS_STOPPED		= 5,
+
+	/* LFSCK is paused automatically at umount and will be restarted
+	 * automatically at remount. */
+	LS_PAUSED		= 6,
+
+	/* The system crashed during the LFSCK; it will be restarted
+	 * automatically after recovery. */
+	LS_CRASHED		= 7,
+
+	/* Some OST/MDT failed during the LFSCK, or did not join it. */
+	LS_PARTIAL		= 8,
+
+	/* The LFSCK failed because its controller failed. */
+	LS_CO_FAILED		= 9,
+
+	/* The LFSCK was stopped because its controller was stopped. */
+	LS_CO_STOPPED		= 10,
+
+	/* The LFSCK is paused because its controller is paused. */
+	LS_CO_PAUSED		= 11,
+
+	LS_MAX
+};
+
+static inline const char *lfsck_status2name(int status)
+{
+	static const char * const lfsck_status_names[] = {
+		[LS_INIT]		= "init",
+		[LS_SCANNING_PHASE1]	= "scanning-phase1",
+		[LS_SCANNING_PHASE2]	= "scanning-phase2",
+		[LS_COMPLETED]		= "completed",
+		[LS_FAILED]		= "failed",
+		[LS_STOPPED]		= "stopped",
+		[LS_PAUSED]		= "paused",
+		[LS_CRASHED]		= "crashed",
+		[LS_PARTIAL]		= "partial",
+		[LS_CO_FAILED]		= "co-failed",
+		[LS_CO_STOPPED]		= "co-stopped",
+		[LS_CO_PAUSED]		= "co-paused"
+	};
+
+	if (status < 0 || status >= LS_MAX)
+		return "unknown";
+
+	return lfsck_status_names[status];
+}
+
+enum lfsck_param_flags {
+	/* Reset LFSCK iterator position to the device beginning. */
+	LPF_RESET		= 0x0001,
+
+	/* Exit on failure. */
+	LPF_FAILOUT		= 0x0002,
+
+	/* Dryrun mode, only check without modification */
+	LPF_DRYRUN		= 0x0004,
+
+	/* LFSCK runs on all targets. */
+	LPF_ALL_TGT		= 0x0008,
+
+	/* Broadcast the command to other MDTs. Only valid on the sponsor MDT */
+	LPF_BROADCAST		= 0x0010,
+
+	/* Handle orphan OST-objects. */
+	LPF_OST_ORPHAN		= 0x0020,
+
+	/* Create OST-object for dangling LOV EA. */
+	LPF_CREATE_OSTOBJ	= 0x0040,
+
+	/* Create MDT-object for dangling name entry. */
+	LPF_CREATE_MDTOBJ	= 0x0080,
+
+	/* Do not return until the LFSCK is no longer running. */
+	LPF_WAIT		= 0x0100,
+
+	/* Delay to create OST-object for dangling LOV EA. */
+	LPF_DELAY_CREATE_OSTOBJ	= 0x0200,
+};
+
+enum lfsck_type {
+	/* For MDT and OST internal OSD consistency check/repair.
+	 */
+	LFSCK_TYPE_SCRUB	= 0x0000,
+
+	/* For MDT-OST (layout, object) consistency check/repair. */
+	LFSCK_TYPE_LAYOUT	= 0x0001,
+
+	/* For MDT (FID-in-dirent, linkEA) consistency check/repair. */
+	LFSCK_TYPE_NAMESPACE	= 0x0004,
+	LFSCK_TYPES_SUPPORTED	= (LFSCK_TYPE_SCRUB | LFSCK_TYPE_LAYOUT |
+				   LFSCK_TYPE_NAMESPACE),
+	LFSCK_TYPES_DEF		= LFSCK_TYPES_SUPPORTED,
+	LFSCK_TYPES_ALL		= ((__u16)(~0))
+};
+
+#define LFSCK_VERSION_V1	1
+#define LFSCK_VERSION_V2	2
+
+#define LFSCK_SPEED_NO_LIMIT	0
+#define LFSCK_SPEED_LIMIT_DEF	LFSCK_SPEED_NO_LIMIT
+#define LFSCK_ASYNC_WIN_DEFAULT	1024
+#define LFSCK_ASYNC_WIN_MAX	((__u16)(~0))
+#define LFSCK_TYPE_BITS		16
+
+enum lfsck_start_valid {
+	LSV_SPEED_LIMIT		= 0x00000001,
+	LSV_ERROR_HANDLE	= 0x00000002,
+	LSV_DRYRUN		= 0x00000004,
+	LSV_ASYNC_WINDOWS	= 0x00000008,
+	LSV_CREATE_OSTOBJ	= 0x00000010,
+	LSV_CREATE_MDTOBJ	= 0x00000020,
+	LSV_DELAY_CREATE_OSTOBJ	= 0x00000040,
+};
+
+/* Arguments for starting lfsck. */
+struct lfsck_start {
+	/* Which arguments are valid, see 'enum lfsck_start_valid'. */
+	__u32	ls_valid;
+
+	/* How many items can be scanned at most per second. */
+	__u32	ls_speed_limit;
+
+	/* For compatibility between user space tools and kernel service. */
+	__u16	ls_version;
+
+	/* Which LFSCK components to be (have been) started. */
+	__u16	ls_active;
+
+	/* Flags for the LFSCK, see 'enum lfsck_param_flags'. */
+	__u16	ls_flags;
+
+	/* The window size for the async request pipeline. */
+	__u16	ls_async_windows;
+};
+
+struct lfsck_stop {
+	__u32	ls_status;
+	__u16	ls_flags;
+	__u16	ls_padding_1; /* For 64-bit alignment. */
+	__u64	ls_padding_2;
+};
+
+struct lfsck_query {
+	__u16	lu_types;
+	__u16	lu_flags;
+	__u32	lu_mdts_count[LFSCK_TYPE_BITS][LS_MAX + 1];
+	__u32	lu_osts_count[LFSCK_TYPE_BITS][LS_MAX + 1];
+	__u64	lu_repaired[LFSCK_TYPE_BITS];
+};
+
+#endif /* _LUSTRE_LFSCK_USER_H */
diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_log_user.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_log_user.h
new file mode 100644
index 0000000000000..bcf46eb21e6c2
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_log_user.h
@@ -0,0 +1,80 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2014, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lustre/include/lustre_log_user.h
+ *
+ * Userspace-usable portion of Generic infrastructure for managing
+ * a collection of logs.
+ * See lustre_log.h for more details.
+ */ + +#ifndef _LUSTRE_LOG_USER_H +#define _LUSTRE_LOG_USER_H + +#include +#include + +/* Lustre logs use FIDs constructed from oi_id and oi_seq directly, + * without attempting to use the IGIF and IDIF ranges as is done + * elsewhere, because of compatibility concerns (see lu-2888). + */ + +static inline void logid_to_fid(struct llog_logid *id, struct lu_fid *fid) +{ + /* For compatibility purposes we identify pre-OSD (~< 2.3.51 MDS) + * logid's by non-zero ogen (inode generation) and convert them + * into IGIF */ + if (id->lgl_ogen == 0) { + fid->f_seq = id->lgl_oi.oi.oi_seq; + fid->f_oid = id->lgl_oi.oi.oi_id; + fid->f_ver = 0; + } else { + lu_igif_build(fid, id->lgl_oi.oi.oi_id, id->lgl_ogen); + } +} + +static inline void fid_to_logid(struct lu_fid *fid, struct llog_logid *id) +{ + id->lgl_oi.oi.oi_seq = fid->f_seq; + id->lgl_oi.oi.oi_id = fid->f_oid; + id->lgl_ogen = 0; +} + +static inline void logid_set_id(struct llog_logid *log_id, __u64 id) +{ + log_id->lgl_oi.oi.oi_id = id; +} + +static inline __u64 logid_id(struct llog_logid *log_id) +{ + return log_id->lgl_oi.oi.oi_id; +} + +#endif /* ifndef _LUSTRE_LOG_USER_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ostid.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ostid.h new file mode 100644 index 0000000000000..90fa213f83e90 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ostid.h @@ -0,0 +1,237 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + * + * Copyright 2015 Cray Inc, all rights reserved. + * Author: Ben Evans. 
+ *
+ * Define ost_id associated functions
+ */
+
+#ifndef _UAPI_LUSTRE_OSTID_H_
+#define _UAPI_LUSTRE_OSTID_H_
+
+#include
+#include
+#include
+
+static inline __u64 lmm_oi_id(const struct ost_id *oi)
+{
+	return oi->oi.oi_id;
+}
+
+static inline __u64 lmm_oi_seq(const struct ost_id *oi)
+{
+	return oi->oi.oi_seq;
+}
+
+static inline void lmm_oi_set_seq(struct ost_id *oi, __u64 seq)
+{
+	oi->oi.oi_seq = seq;
+}
+
+static inline void lmm_oi_set_id(struct ost_id *oi, __u64 oid)
+{
+	oi->oi.oi_id = oid;
+}
+
+static inline void lmm_oi_le_to_cpu(struct ost_id *dst_oi,
+				    const struct ost_id *src_oi)
+{
+	dst_oi->oi.oi_id = __le64_to_cpu(src_oi->oi.oi_id);
+	dst_oi->oi.oi_seq = __le64_to_cpu(src_oi->oi.oi_seq);
+}
+
+static inline void lmm_oi_cpu_to_le(struct ost_id *dst_oi,
+				    const struct ost_id *src_oi)
+{
+	dst_oi->oi.oi_id = __cpu_to_le64(src_oi->oi.oi_id);
+	dst_oi->oi.oi_seq = __cpu_to_le64(src_oi->oi.oi_seq);
+}
+
+/* extract OST sequence (group) from a wire ost_id (id/seq) pair */
+static inline __u64 ostid_seq(const struct ost_id *ostid)
+{
+	if (fid_seq_is_mdt0(ostid->oi.oi_seq))
+		return FID_SEQ_OST_MDT0;
+
+	if (fid_seq_is_default(ostid->oi.oi_seq))
+		return FID_SEQ_LOV_DEFAULT;
+
+	if (fid_is_idif(&ostid->oi_fid))
+		return FID_SEQ_OST_MDT0;
+
+	return fid_seq(&ostid->oi_fid);
+}
+
+/* extract OST objid from a wire ost_id (id/seq) pair */
+static inline __u64 ostid_id(const struct ost_id *ostid)
+{
+	if (fid_seq_is_mdt0(ostid->oi.oi_seq))
+		return ostid->oi.oi_id & IDIF_OID_MASK;
+
+	if (fid_seq_is_default(ostid->oi.oi_seq))
+		return ostid->oi.oi_id;
+
+	if (fid_is_idif(&ostid->oi_fid))
+		return fid_idif_id(fid_seq(&ostid->oi_fid),
+				   fid_oid(&ostid->oi_fid), 0);
+
+	return fid_oid(&ostid->oi_fid);
+}
+
+static inline void ostid_set_seq(struct ost_id *oi, __u64 seq)
+{
+	if (fid_seq_is_mdt0(seq) || fid_seq_is_default(seq)) {
+		oi->oi.oi_seq = seq;
+	} else {
+		oi->oi_fid.f_seq = seq;
+		/*
+		 * Note: if f_oid + f_ver is zero, we need to init it to 1,
+		 * otherwise ostid_seq will treat this as an old ostid
+		 * (oi_seq == 0)
+		 */
+		if (!oi->oi_fid.f_oid && !oi->oi_fid.f_ver)
+			oi->oi_fid.f_oid = LUSTRE_FID_INIT_OID;
+	}
+}
+
+static inline void ostid_set_seq_mdt0(struct ost_id *oi)
+{
+	ostid_set_seq(oi, FID_SEQ_OST_MDT0);
+}
+
+static inline void ostid_set_seq_echo(struct ost_id *oi)
+{
+	ostid_set_seq(oi, FID_SEQ_ECHO);
+}
+
+static inline void ostid_set_seq_llog(struct ost_id *oi)
+{
+	ostid_set_seq(oi, FID_SEQ_LLOG);
+}
+
+static inline void ostid_cpu_to_le(const struct ost_id *src_oi,
+				   struct ost_id *dst_oi)
+{
+	if (fid_seq_is_mdt0(src_oi->oi.oi_seq)) {
+		dst_oi->oi.oi_id = __cpu_to_le64(src_oi->oi.oi_id);
+		dst_oi->oi.oi_seq = __cpu_to_le64(src_oi->oi.oi_seq);
+	} else {
+		fid_cpu_to_le(&dst_oi->oi_fid, &src_oi->oi_fid);
+	}
+}
+
+static inline void ostid_le_to_cpu(const struct ost_id *src_oi,
+				   struct ost_id *dst_oi)
+{
+	if (fid_seq_is_mdt0(src_oi->oi.oi_seq)) {
+		dst_oi->oi.oi_id = __le64_to_cpu(src_oi->oi.oi_id);
+		dst_oi->oi.oi_seq = __le64_to_cpu(src_oi->oi.oi_seq);
+	} else {
+		fid_le_to_cpu(&dst_oi->oi_fid, &src_oi->oi_fid);
+	}
+}
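+
+/* Illustrative sketch (not part of this header's API): unpacking an
+ * ost_id received in wire (little-endian) order with the helpers above,
+ * where wire_oi is a hypothetical wire-order variable:
+ *
+ *   struct ost_id cpu;
+ *   __u64 seq, id;
+ *
+ *   ostid_le_to_cpu(&wire_oi, &cpu);
+ *   seq = ostid_seq(&cpu);
+ *   id  = ostid_id(&cpu);
+ */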
+
+/**
+ * Sigh, because pre-2.4 uses
+ * struct lov_mds_md_v1 {
+ *	........
+ *	__u64 lmm_object_id;
+ *	__u64 lmm_object_seq;
+ *	......
+ * }
+ * to identify the LOV(MDT) object, and lmm_object_seq will be normal_fid,
+ * which makes it hard to combine these conversions into a single
+ * ostid_to_fid(), so we do the lmm_oi/FID conversion separately.
+ *
+ * We can tell the lmm_oi apart this way:
+ * 1.8: lmm_object_id = {inode}, lmm_object_gr = 0
+ * 2.1: lmm_object_id = {oid < 128k}, lmm_object_seq = FID_SEQ_NORMAL
+ * 2.4: lmm_oi.f_seq = FID_SEQ_NORMAL, lmm_oi.f_oid = {oid < 128k},
+ *      lmm_oi.f_ver = 0
+ *
+ * But currently lmm_oi/lsm_oi do not have any "real" usages except for
+ * printing some information, and the user can always get the real FID
+ * from the LMA; besides, this multiple-case check would make swab more
+ * complicated. So we keep using id/seq for lmm_oi.
+ */
+
+static inline void fid_to_lmm_oi(const struct lu_fid *fid,
+				 struct ost_id *oi)
+{
+	oi->oi.oi_id = fid_oid(fid);
+	oi->oi.oi_seq = fid_seq(fid);
+}
+
+/**
+ * Unpack an OST object id/seq (group) into a FID.  This is needed for
+ * converting all obdo, lmm, lsm, etc. 64-bit id/seq pairs into proper
+ * FIDs.  Note that if an id/seq is already in FID/IDIF format it will
+ * be passed through unchanged.  Only legacy OST objects in "group 0"
+ * will be mapped into the IDIF namespace so that they can fit into the
+ * struct lu_fid fields without loss.
+ */
+static inline int ostid_to_fid(struct lu_fid *fid, const struct ost_id *ostid,
+			       __u32 ost_idx)
+{
+	__u64 seq = ostid_seq(ostid);
+
+	if (ost_idx > 0xffff)
+		return -EBADF;
+
+	if (fid_seq_is_mdt0(seq)) {
+		__u64 oid = ostid_id(ostid);
+
+		/* This is a "legacy" (old 1.x/2.early) OST object in "group 0"
+		 * that we map into the IDIF namespace.  It allows up to 2^48
+		 * objects per OST, as this is the object namespace that has
+		 * been in production for years.  This can handle create rates
+		 * of 1M objects/s/OST for 9 years, or combinations thereof.
+		 */
+		if (oid >= IDIF_MAX_OID)
+			return -EBADF;
+
+		fid->f_seq = fid_idif_seq(oid, ost_idx);
+		/* truncate to 32 bits by assignment */
+		fid->f_oid = oid;
+		/* in theory, not currently used */
+		fid->f_ver = oid >> 48;
+	} else if (!fid_seq_is_default(seq)) {
+		/* This is either an IDIF object, which identifies objects
+		 * across all OSTs, or a regular FID.  The IDIF namespace
+		 * maps legacy OST objects into the FID namespace.  In both
+		 * cases, we just pass the FID through, no conversion needed.
+		 */
+		if (ostid->oi_fid.f_ver)
+			return -EBADF;
+
+		*fid = ostid->oi_fid;
+	}
+
+	return 0;
+}
+#endif /* _UAPI_LUSTRE_OSTID_H_ */
diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_param.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_param.h
new file mode 100644
index 0000000000000..8b9177046d999
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_param.h
@@ -0,0 +1,95 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * User-settable parameter keys + * + * Author: Nathan Rutman + */ + +#ifndef _UAPI_LUSTRE_PARAM_H +#define _UAPI_LUSTRE_PARAM_H + +/** \defgroup param param + * + * @{ + */ + +/****************** User-settable parameter keys *********************/ +/* e.g. + * tunefs.lustre --param="failover.node=192.168.0.13@tcp0" /dev/sda + * lctl conf_param testfs-OST0000 failover.node=3@elan,192.168.0.3@tcp0 + * ... testfs-MDT0000.lov.stripesize=4M + * ... testfs-OST0000.ost.client_cache_seconds=15 + * ... testfs.sys.timeout= + * ... testfs.llite.max_read_ahead_mb=16 + */ + +/* System global or special params not handled in obd's proc + * See mgs_write_log_sys() + */ +#define PARAM_TIMEOUT "timeout=" /* global */ +#define PARAM_LDLM_TIMEOUT "ldlm_timeout=" /* global */ +#define PARAM_AT_MIN "at_min=" /* global */ +#define PARAM_AT_MAX "at_max=" /* global */ +#define PARAM_AT_EXTRA "at_extra=" /* global */ +#define PARAM_AT_EARLY_MARGIN "at_early_margin=" /* global */ +#define PARAM_AT_HISTORY "at_history=" /* global */ +#define PARAM_JOBID_VAR "jobid_var=" /* global */ +#define PARAM_MGSNODE "mgsnode=" /* only at mounttime */ +#define PARAM_FAILNODE "failover.node=" /* add failover nid */ +#define PARAM_FAILMODE "failover.mode=" /* initial mount only */ +#define PARAM_ACTIVE "active=" /* activate/deactivate */ +#define PARAM_NETWORK "network=" /* bind on nid */ +#define PARAM_ID_UPCALL "identity_upcall=" /* identity upcall */ +#define PARAM_ROOTSQUASH "root_squash=" /* root squash */ +#define PARAM_NOSQUASHNIDS "nosquash_nids=" /* no squash nids */ + +/* Prefixes for parameters handled by obd's proc methods (XXX_process_config) */ +#define PARAM_OST "ost." +#define PARAM_OSD "osd." +#define PARAM_OSC "osc." +#define PARAM_MDT "mdt." +#define PARAM_HSM "mdt.hsm." +#define PARAM_MDD "mdd." +#define PARAM_MDC "mdc." +#define PARAM_LLITE "llite." +#define PARAM_LOV "lov." +#define PARAM_LOD "lod." +#define PARAM_OSP "osp." +#define PARAM_SYS "sys." /* global */ +#define PARAM_SRPC "srpc." +#define PARAM_SRPC_FLVR "srpc.flavor." +#define PARAM_SRPC_UDESC "srpc.udesc.cli2mdt" +#define PARAM_SEC "security." +#define PARAM_QUOTA "quota." /* global */ + +/** @} param */ + +#endif /* _UAPI_LUSTRE_PARAM_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_user.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_user.h new file mode 100644 index 0000000000000..de4fe08aa2eac --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_user.h @@ -0,0 +1,2795 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/include/lustre/lustre_user.h + * + * Lustre public user-space interface definitions. + */ + +#ifndef _LUSTRE_USER_H +#define _LUSTRE_USER_H + +/** \defgroup lustreuser lustreuser + * + * @{ + */ +#ifndef __KERNEL__ +# define __USE_ISOC99 1 +# include +# include /* snprintf() */ +# include + +# define __USE_GNU 1 +# define __USE_XOPEN2K8 1 +# define FILEID_LUSTRE 0x97 /* for name_to_handle_at() (and llapi_fd2fid()) */ +#endif /* !__KERNEL__ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__cplusplus) +extern "C" { +#endif + +#ifdef __STRICT_ANSI__ +#define typeof __typeof__ +#endif + +/* + * This is a temporary solution of adding quota type. + * Should be removed as soon as system header is updated. + */ +#undef LL_MAXQUOTAS +#define LL_MAXQUOTAS 3 +#undef INITQFNAMES +#define INITQFNAMES { \ + "user", /* USRQUOTA */ \ + "group", /* GRPQUOTA */ \ + "project", /* PRJQUOTA */ \ + "undefined", \ +}; +#ifndef USRQUOTA +#define USRQUOTA 0 +#endif +#ifndef GRPQUOTA +#define GRPQUOTA 1 +#endif +#ifndef PRJQUOTA +#define PRJQUOTA 2 +#endif + +/* + * We need to always use 64bit version because the structure + * is shared across entire cluster where 32bit and 64bit machines + * are co-existing. + */ +#if __BITS_PER_LONG != 64 || defined(__ARCH_WANT_STAT64) +typedef struct stat64 lstat_t; +#define lstat_f lstat64 +#define fstat_f fstat64 +#define fstatat_f fstatat64 +#else +typedef struct stat lstat_t; +#define lstat_f lstat +#define fstat_f fstat +#define fstatat_f fstatat +#endif + +#ifndef STATX_BASIC_STATS +/* + * Timestamp structure for the timestamps in struct statx. + * + * tv_sec holds the number of seconds before (negative) or after (positive) + * 00:00:00 1st January 1970 UTC. + * + * tv_nsec holds a number of nanoseconds (0..999,999,999) after the tv_sec time. + * + * __reserved is held in case we need a yet finer resolution. + */ +struct statx_timestamp { + __s64 tv_sec; + __u32 tv_nsec; + __s32 __reserved; +}; + +/* + * Structures for the extended file attribute retrieval system call + * (statx()). + * + * The caller passes a mask of what they're specifically interested in as a + * parameter to statx(). What statx() actually got will be indicated in + * st_mask upon return. + * + * For each bit in the mask argument: + * + * - if the datum is not supported: + * + * - the bit will be cleared, and + * + * - the datum will be set to an appropriate fabricated value if one is + * available (eg. 
CIFS can take a default uid and gid), otherwise + * + * - the field will be cleared; + * + * - otherwise, if explicitly requested: + * + * - the datum will be synchronised to the server if AT_STATX_FORCE_SYNC is + * set or if the datum is considered out of date, and + * + * - the field will be filled in and the bit will be set; + * + * - otherwise, if not requested, but available in approximate form without any + * effort, it will be filled in anyway, and the bit will be set upon return + * (it might not be up to date, however, and no attempt will be made to + * synchronise the internal state first); + * + * - otherwise the field and the bit will be cleared before returning. + * + * Items in STATX_BASIC_STATS may be marked unavailable on return, but they + * will have values installed for compatibility purposes so that stat() and + * co. can be emulated in userspace. + */ +struct statx { + /* 0x00 */ + __u32 stx_mask; /* What results were written [uncond] */ + __u32 stx_blksize; /* Preferred general I/O size [uncond] */ + __u64 stx_attributes; /* Flags conveying information about the file [uncond] */ + /* 0x10 */ + __u32 stx_nlink; /* Number of hard links */ + __u32 stx_uid; /* User ID of owner */ + __u32 stx_gid; /* Group ID of owner */ + __u16 stx_mode; /* File mode */ + __u16 __spare0[1]; + /* 0x20 */ + __u64 stx_ino; /* Inode number */ + __u64 stx_size; /* File size */ + __u64 stx_blocks; /* Number of 512-byte blocks allocated */ + __u64 stx_attributes_mask; /* Mask to show what's supported in stx_attributes */ + /* 0x40 */ + struct statx_timestamp stx_atime; /* Last access time */ + struct statx_timestamp stx_btime; /* File creation time */ + struct statx_timestamp stx_ctime; /* Last attribute change time */ + struct statx_timestamp stx_mtime; /* Last data modification time */ + /* 0x80 */ + __u32 stx_rdev_major; /* Device ID of special file [if bdev/cdev] */ + __u32 stx_rdev_minor; + __u32 stx_dev_major; /* ID of device containing file [uncond] */ + __u32 stx_dev_minor; + /* 0x90 */ + __u64 __spare2[14]; /* Spare space for future expansion */ + /* 0x100 */ +}; + +/* + * Flags to be stx_mask + * + * Query request/result mask for statx() and struct statx::stx_mask. + * + * These bits should be set in the mask argument of statx() to request + * particular items when calling statx(). + */ +#define STATX_TYPE 0x00000001U /* Want/got stx_mode & S_IFMT */ +#define STATX_MODE 0x00000002U /* Want/got stx_mode & ~S_IFMT */ +#define STATX_NLINK 0x00000004U /* Want/got stx_nlink */ +#define STATX_UID 0x00000008U /* Want/got stx_uid */ +#define STATX_GID 0x00000010U /* Want/got stx_gid */ +#define STATX_ATIME 0x00000020U /* Want/got stx_atime */ +#define STATX_MTIME 0x00000040U /* Want/got stx_mtime */ +#define STATX_CTIME 0x00000080U /* Want/got stx_ctime */ +#define STATX_INO 0x00000100U /* Want/got stx_ino */ +#define STATX_SIZE 0x00000200U /* Want/got stx_size */ +#define STATX_BLOCKS 0x00000400U /* Want/got stx_blocks */ +#define STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */ +#define STATX_BTIME 0x00000800U /* Want/got stx_btime */ +#define STATX_ALL 0x00000fffU /* All currently supported flags */ +#define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ + +/* + * Attributes to be found in stx_attributes and masked in stx_attributes_mask. + * + * These give information about the features or the state of a file that might + * be of use to ordinary userspace programs such as GUIs or ls rather than + * specialised tools. 
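+ *
+ * For example (illustrative, generic statx semantics): a caller must
+ * check stx_attributes_mask before trusting any bit in stx_attributes:
+ *
+ *   if ((stx.stx_attributes_mask & STATX_ATTR_ENCRYPTED) &&
+ *       (stx.stx_attributes & STATX_ATTR_ENCRYPTED))
+ *           the file needs a key to be decrypted by the fs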
+ *
+ * Note that the flags marked [I] correspond to generic FS_IOC_FLAGS
+ * semantically.  Where possible, the numerical value is picked to
+ * correspond also.
+ */
+#define STATX_ATTR_COMPRESSED	0x00000004 /* [I] File is compressed by the fs */
+#define STATX_ATTR_IMMUTABLE	0x00000010 /* [I] File is marked immutable */
+#define STATX_ATTR_APPEND	0x00000020 /* [I] File is append-only */
+#define STATX_ATTR_NODUMP	0x00000040 /* [I] File is not to be dumped */
+#define STATX_ATTR_ENCRYPTED	0x00000800 /* [I] File requires key to decrypt in fs */
+
+#define STATX_ATTR_AUTOMOUNT	0x00001000 /* Dir: Automount trigger */
+
+#define AT_STATX_SYNC_TYPE	0x6000 /* Type of synchronisation required from statx() */
+#define AT_STATX_SYNC_AS_STAT	0x0000 /* - Do whatever stat() does */
+#define AT_STATX_FORCE_SYNC	0x2000 /* - Force the attributes to be sync'd with the server */
+#define AT_STATX_DONT_SYNC	0x4000 /* - Don't sync attributes with the server */
+
+#endif /* STATX_BASIC_STATS */
+
+typedef struct statx lstatx_t;
+
+#define LUSTRE_EOF	0xffffffffffffffffULL
+
+/* for statfs() */
+#define LL_SUPER_MAGIC		0x0BD00BD0
+
+#define FSFILT_IOC_GETVERSION	_IOR('f', 3, long)
+
+/* FIEMAP flags supported by Lustre */
+#define LUSTRE_FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_DEVICE_ORDER)
+
+enum obd_statfs_state {
+	OS_STATFS_DEGRADED	= 0x00000001, /**< RAID degraded/rebuilding */
+	OS_STATFS_READONLY	= 0x00000002, /**< filesystem is read-only */
+	OS_STATFS_NOPRECREATE	= 0x00000004, /**< no object precreation */
+	OS_STATFS_UNUSED1	= 0x00000008, /**< obsolete 1.6, was EROFS=30 */
+	OS_STATFS_UNUSED2	= 0x00000010, /**< obsolete 1.6, was EROFS=30 */
+	OS_STATFS_ENOSPC	= 0x00000020, /**< not enough free space */
+	OS_STATFS_ENOINO	= 0x00000040, /**< not enough inodes */
+	OS_STATFS_SUM		= 0x00000100, /**< aggregated for all targets */
+	OS_STATFS_NONROT	= 0x00000200, /**< non-rotational device */
+};
+
+/** filesystem statistics/attributes for target device */
+struct obd_statfs {
+	__u64	os_type;	/* EXT4_SUPER_MAGIC, UBERBLOCK_MAGIC */
+	__u64	os_blocks;	/* total size in #os_bsize blocks */
+	__u64	os_bfree;	/* number of unused blocks */
+	__u64	os_bavail;	/* blocks available for allocation */
+	__u64	os_files;	/* total number of objects */
+	__u64	os_ffree;	/* # objects that could be created */
+	__u8	os_fsid[40];	/* identifier for filesystem */
+	__u32	os_bsize;	/* block size in bytes for os_blocks */
+	__u32	os_namelen;	/* maximum length of filename in bytes */
+	__u64	os_maxbytes;	/* maximum object size in bytes */
+	__u32	os_state;	/**< obd_statfs_state OS_STATFS_* */
+	__u32	os_fprecreated;	/* objs available now to the caller;
+				 * used in QoS code to find preferred
+				 * OSTs */
+	__u32	os_granted;	/* space granted for MDS */
+	__u32	os_spare3;	/* Unused padding fields.  Remember */
+	__u32	os_spare4;	/* to fix lustre_swab_obd_statfs() */
+	__u32	os_spare5;
+	__u32	os_spare6;
+	__u32	os_spare7;
+	__u32	os_spare8;
+	__u32	os_spare9;
+};
+
+/** additional filesystem attributes for target device */
+struct obd_statfs_info {
+	__u32	os_reserved_mb_low;	/* reserved mb low */
+	__u32	os_reserved_mb_high;	/* reserved mb high */
+	bool	os_enable_pre;		/* enable pre create logic */
+};
+
+/**
+ * File IDentifier.
+ *
+ * FID is a cluster-wide unique identifier of a file or an object (stripe).
+ * FIDs are never reused.
+ **/
+struct lu_fid {
+	/**
+	 * FID sequence. Sequence is a unit of migration: all files (objects)
+	 * with FIDs from a given sequence are stored on the same server.
+	 * Lustre should support 2^64 objects, so even if each sequence
+	 * has only a single object we can still enumerate 2^64 objects.
+	 **/
+	__u64 f_seq;
+	/* FID number within sequence. */
+	__u32 f_oid;
+	/**
+	 * FID version, used to distinguish different versions (in the sense
+	 * of snapshots, etc.) of the same file system object. Not currently
+	 * used.
+	 **/
+	__u32 f_ver;
+} __attribute__((packed));
+
+static inline bool fid_is_zero(const struct lu_fid *fid)
+{
+	return fid->f_seq == 0 && fid->f_oid == 0;
+}
+
+/* The data name_to_handle_at() places in a struct file_handle (at f_handle) */
+struct lustre_file_handle {
+	struct lu_fid lfh_child;
+	struct lu_fid lfh_parent;
+};
+
+/* Currently, the filter_fid::ff_parent::f_ver is not the real parent
+ * MDT-object's FID::f_ver, instead it is the OST-object index in its
+ * parent MDT-object's layout EA. */
+#define f_stripe_idx f_ver
+
+struct ost_layout {
+	__u32	ol_stripe_size;
+	__u32	ol_stripe_count;
+	__u64	ol_comp_start;
+	__u64	ol_comp_end;
+	__u32	ol_comp_id;
+} __attribute__((packed));
+
+/* The filter_fid structure has changed several times over its lifetime.
+ * For a long time "trusted.fid" held the MDT inode parent FID/IGIF and
+ * stripe_index and the "self FID" (objid/seq) to be able to recover the
+ * OST objects in case of corruption.  With the move to 2.4 and OSD-API for
+ * the OST, the "trusted.lma" xattr was added to the OST objects to store
+ * the "self FID" to be consistent with the MDT on-disk format, and the
+ * filter_fid only stored the MDT inode parent FID and stripe index.
+ *
+ * In 2.10, the addition of PFL composite layouts required more information
+ * to be stored into the filter_fid in order to be able to identify which
+ * component the OST object belonged to.  As well, the stripe size may vary
+ * between components, so it was no longer safe to assume the stripe size
+ * or stripe_count of a file.  This is also more robust for plain layouts.
+ *
+ * For ldiskfs OSTs that were formatted with 256-byte inodes, there is not
+ * enough space to store both the filter_fid and LMA in the inode, so they
+ * are packed into struct lustre_ost_attrs on disk in trusted.lma to avoid
+ * an extra seek for every OST object access.
+ *
+ * In 2.11, FLR mirror layouts also need to store the layout version and
+ * range so that writes to old versions of the layout are not allowed.
+ * That ensures that mirrored objects are not modified by evicted clients,
+ * and ensures that the components are correctly marked stale on the MDT.
+ */
+struct filter_fid_18_23 {
+	struct lu_fid		ff_parent;	/* stripe_idx in f_ver */
+	__u64			ff_objid;
+	__u64			ff_seq;
+};
+
+struct filter_fid_24_29 {
+	struct lu_fid		ff_parent;	/* stripe_idx in f_ver */
+};
+
+struct filter_fid_210 {
+	struct lu_fid		ff_parent;	/* stripe_idx in f_ver */
+	struct ost_layout	ff_layout;
+};
+
+struct filter_fid {
+	struct lu_fid		ff_parent;	/* stripe_idx in f_ver */
+	struct ost_layout	ff_layout;
+	__u32			ff_layout_version;
+	__u32			ff_range;	/* range of layout versions
+						 * in which writes are
+						 * allowed */
+} __attribute__((packed));
+
+/* Userspace should treat lu_fid as opaque, and only use the following methods
+ * to print or parse them.  Other functions (e.g. compare, swab) could be
+ * moved here from lustre_idl.h if needed.
+ */
+struct lu_fid;
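+
+/* Illustrative only: with the fields above, a FID prints in the usual
+ * [seq:oid:ver] form, e.g. for a hypothetical struct lu_fid fid:
+ *
+ *   printf("[0x%llx:0x%x:0x%x]\n",
+ *          (unsigned long long)fid.f_seq, fid.f_oid, fid.f_ver);
+ */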
+
+enum lma_compat {
+	LMAC_HSM	 = 0x00000001,
+/*	LMAC_SOM	 = 0x00000002, obsolete since 2.8.0 */
+	LMAC_NOT_IN_OI	 = 0x00000004, /* the object does NOT need OI mapping */
+	LMAC_FID_ON_OST	 = 0x00000008, /* For OST-object, its OI mapping is
+					* under /O/<seq>/d<x>. */
+	LMAC_STRIPE_INFO = 0x00000010, /* stripe info in the LMA EA. */
+	LMAC_COMP_INFO	 = 0x00000020, /* Component info in the LMA EA. */
+	LMAC_IDX_BACKUP	 = 0x00000040, /* Has index backup. */
+};
+
+/**
+ * Masks for all features that should be supported by a Lustre version to
+ * access a specific file.
+ * This information is stored in lustre_mdt_attrs::lma_incompat.
+ */
+enum lma_incompat {
+	LMAI_RELEASED		= 0x00000001, /* file is released */
+	LMAI_AGENT		= 0x00000002, /* agent inode */
+	LMAI_REMOTE_PARENT	= 0x00000004, /* the parent of the object
+					       * is on the remote MDT */
+	LMAI_STRIPED		= 0x00000008, /* striped directory inode */
+	LMAI_ORPHAN		= 0x00000010, /* inode is orphan */
+	LMAI_ENCRYPT		= 0x00000020, /* inode is encrypted */
+	LMA_INCOMPAT_SUPP	= (LMAI_AGENT | LMAI_REMOTE_PARENT | \
+				   LMAI_STRIPED | LMAI_ORPHAN | LMAI_ENCRYPT)
+};
+
+
+/**
+ * The following struct holds object attributes that are kept in the inode's
+ * EA.  Introduced in the 2.0 release (see b15993 for details).
+ * Added to all objects since Lustre 2.4 as it contains the self FID.
+ */
+struct lustre_mdt_attrs {
+	/**
+	 * Bitfield for supported data in this structure. From enum lma_compat.
+	 * lma_self_fid and lma_flags are always available.
+	 */
+	__u32	lma_compat;
+	/**
+	 * Per-file incompat feature list. Lustre version should support all
+	 * flags set in this field. The supported feature mask is available in
+	 * LMA_INCOMPAT_SUPP.
+	 */
+	__u32	lma_incompat;
+	/** FID of this inode */
+	struct lu_fid	lma_self_fid;
+};
+
+struct lustre_ost_attrs {
+	/* Use lustre_mdt_attrs directly for now; a common header structure
+	 * is needed if we want to change lustre_mdt_attrs in the future. */
+	struct lustre_mdt_attrs loa_lma;
+
+	/* Below five elements are for OST-object's PFID EA, the
+	 * lma_parent_fid::f_ver is composed of the stripe_count (high 16 bits)
+	 * and the stripe_index (low 16 bits); the size should not exceed
+	 * 5 * sizeof(__u64) to be accessible by old Lustre. If the flag
+	 * LMAC_STRIPE_INFO is set, then loa_parent_fid and loa_stripe_size
+	 * are valid; if the flag LMAC_COMP_INFO is set, then the next three
+	 * loa_comp_* elements are valid. */
+	struct lu_fid	loa_parent_fid;
+	__u32		loa_stripe_size;
+	__u32		loa_comp_id;
+	__u64		loa_comp_start;
+	__u64		loa_comp_end;
+};
+
+/**
+ * Prior to 2.4, the LMA structure also included SOM attributes, which have
+ * since been moved to a dedicated xattr.
+ * lma_flags was also removed because of the lma_compat/incompat fields.
+ */
+#define LMA_OLD_SIZE (sizeof(struct lustre_mdt_attrs) + 5 * sizeof(__u64))
+
+enum lustre_som_flags {
+	/* Unknown or no SoM data, must get size from OSTs. */
+	SOM_FL_UNKNOWN	= 0x0000,
+	/* Known strictly correct, FLR or DoM file (SoM guaranteed). */
+	SOM_FL_STRICT	= 0x0001,
+	/* Known stale - was right at some point in the past, but it is
+	 * known (or likely) to be incorrect now (e.g. opened for write). */
+	SOM_FL_STALE	= 0x0002,
+	/* Approximate, may never have been strictly correct,
+	 * need to sync SOM data to achieve eventual consistency. */
+	SOM_FL_LAZY	= 0x0004,
+};
+
+struct lustre_som_attrs {
+	__u16	lsa_valid;
+	__u16	lsa_reserved[3];
+	__u64	lsa_size;
+	__u64	lsa_blocks;
+};
+
+/**
+ * OST object IDentifier.
+ */ +struct ost_id { + union { + struct { + __u64 oi_id; + __u64 oi_seq; + } oi; + struct lu_fid oi_fid; + }; +} __attribute__((packed)); + +#define DOSTID "%#llx:%llu" +#define POSTID(oi) ((unsigned long long)ostid_seq(oi)), \ + ((unsigned long long)ostid_id(oi)) + +struct ll_futimes_3 { + __u64 lfu_atime_sec; + __u64 lfu_atime_nsec; + __u64 lfu_mtime_sec; + __u64 lfu_mtime_nsec; + __u64 lfu_ctime_sec; + __u64 lfu_ctime_nsec; +}; + +/* + * Maximum number of mirrors currently implemented. + */ +#define LUSTRE_MIRROR_COUNT_MAX 16 + +/* Lease types for use as arg and return of LL_IOC_{GET,SET}_LEASE ioctl. */ +enum ll_lease_mode { + LL_LEASE_RDLCK = 0x01, + LL_LEASE_WRLCK = 0x02, + LL_LEASE_UNLCK = 0x04, +}; + +enum ll_lease_flags { + LL_LEASE_RESYNC = 0x1, + LL_LEASE_RESYNC_DONE = 0x2, + LL_LEASE_LAYOUT_MERGE = 0x4, + LL_LEASE_LAYOUT_SPLIT = 0x8, + LL_LEASE_PCC_ATTACH = 0x10, +}; + +#define IOC_IDS_MAX 4096 +struct ll_ioc_lease { + __u32 lil_mode; + __u32 lil_flags; + __u32 lil_count; + __u32 lil_ids[0]; +}; + +struct ll_ioc_lease_id { + __u32 lil_mode; + __u32 lil_flags; + __u32 lil_count; + __u16 lil_mirror_id; + __u16 lil_padding1; + __u64 lil_padding2; + __u32 lil_ids[0]; +}; + +/* + * The ioctl naming rules: + * LL_* - works on the currently opened filehandle instead of parent dir + * *_OBD_* - gets data for both OSC or MDC (LOV, LMV indirectly) + * *_MDC_* - gets/sets data related to MDC + * *_LOV_* - gets/sets data related to OSC/LOV + * *FILE* - called on parent dir and passes in a filename + * *STRIPE* - set/get lov_user_md + * *INFO - set/get lov_user_mds_data + */ +/* lustre_ioctl.h 101-150 */ +/* ioctl codes 128-143 are reserved for fsverity */ +#define LL_IOC_GETFLAGS _IOR ('f', 151, long) +#define LL_IOC_SETFLAGS _IOW ('f', 152, long) +#define LL_IOC_CLRFLAGS _IOW ('f', 153, long) +#define LL_IOC_LOV_SETSTRIPE _IOW ('f', 154, long) +#define LL_IOC_LOV_SETSTRIPE_NEW _IOWR('f', 154, struct lov_user_md) +#define LL_IOC_LOV_GETSTRIPE _IOW ('f', 155, long) +#define LL_IOC_LOV_GETSTRIPE_NEW _IOR('f', 155, struct lov_user_md) +#define LL_IOC_LOV_SETEA _IOW ('f', 156, long) +/* LL_IOC_RECREATE_OBJ 157 obsolete */ +/* LL_IOC_RECREATE_FID 157 obsolete */ +#define LL_IOC_GROUP_LOCK _IOW ('f', 158, long) +#define LL_IOC_GROUP_UNLOCK _IOW ('f', 159, long) +/* LL_IOC_QUOTACHECK 160 OBD_IOC_QUOTACHECK */ +/* LL_IOC_POLL_QUOTACHECK 161 OBD_IOC_POLL_QUOTACHECK */ +/* LL_IOC_QUOTACTL 162 OBD_IOC_QUOTACTL */ +#define IOC_OBD_STATFS _IOWR('f', 164, struct obd_statfs *) +/* IOC_LOV_GETINFO 165 obsolete */ +#define LL_IOC_FLUSHCTX _IOW ('f', 166, long) +/* LL_IOC_RMTACL 167 obsolete */ +#define LL_IOC_GETOBDCOUNT _IOR ('f', 168, long) +#define LL_IOC_LLOOP_ATTACH _IOWR('f', 169, long) +#define LL_IOC_LLOOP_DETACH _IOWR('f', 170, long) +#define LL_IOC_LLOOP_INFO _IOWR('f', 171, struct lu_fid) +#define LL_IOC_LLOOP_DETACH_BYDEV _IOWR('f', 172, long) +#define LL_IOC_PATH2FID _IOR ('f', 173, long) +#define LL_IOC_GET_CONNECT_FLAGS _IOWR('f', 174, __u64 *) +#define LL_IOC_GET_MDTIDX _IOR ('f', 175, int) +#define LL_IOC_FUTIMES_3 _IOWR('f', 176, struct ll_futimes_3) +#define LL_IOC_FLR_SET_MIRROR _IOW ('f', 177, long) +/* lustre_ioctl.h 177-210 */ +#define LL_IOC_HSM_STATE_GET _IOR('f', 211, struct hsm_user_state) +#define LL_IOC_HSM_STATE_SET _IOW('f', 212, struct hsm_state_set) +#define LL_IOC_HSM_CT_START _IOW('f', 213, struct lustre_kernelcomm) +#define LL_IOC_HSM_COPY_START _IOW('f', 214, struct hsm_copy *) +#define LL_IOC_HSM_COPY_END _IOW('f', 215, struct hsm_copy *) +#define LL_IOC_HSM_PROGRESS 
_IOW('f', 216, struct hsm_user_request) +#define LL_IOC_HSM_REQUEST _IOW('f', 217, struct hsm_user_request) +#define LL_IOC_DATA_VERSION _IOR('f', 218, struct ioc_data_version) +#define LL_IOC_LOV_SWAP_LAYOUTS _IOW('f', 219, \ + struct lustre_swap_layouts) +#define LL_IOC_HSM_ACTION _IOR('f', 220, \ + struct hsm_current_action) +/* lustre_ioctl.h 221-232 */ +#define LL_IOC_LMV_SETSTRIPE _IOWR('f', 240, struct lmv_user_md) +#define LL_IOC_LMV_GETSTRIPE _IOWR('f', 241, struct lmv_user_md) +#define LL_IOC_REMOVE_ENTRY _IOWR('f', 242, __u64) +#define LL_IOC_RMFID _IOR('f', 242, struct fid_array) +#define LL_IOC_UNLOCK_FOREIGN _IO('f', 242) +#define LL_IOC_SET_LEASE _IOWR('f', 243, struct ll_ioc_lease) +#define LL_IOC_SET_LEASE_OLD _IOWR('f', 243, long) +#define LL_IOC_GET_LEASE _IO('f', 244) +#define LL_IOC_HSM_IMPORT _IOWR('f', 245, struct hsm_user_import) +#define LL_IOC_LMV_SET_DEFAULT_STRIPE _IOWR('f', 246, struct lmv_user_md) +#define LL_IOC_MIGRATE _IOR('f', 247, int) +#define LL_IOC_FID2MDTIDX _IOWR('f', 248, struct lu_fid) +#define LL_IOC_GETPARENT _IOWR('f', 249, struct getparent) +#define LL_IOC_LADVISE _IOR('f', 250, struct llapi_lu_ladvise) +#define LL_IOC_HEAT_GET _IOWR('f', 251, struct lu_heat) +#define LL_IOC_HEAT_SET _IOW('f', 251, __u64) +#define LL_IOC_PCC_DETACH _IOW('f', 252, struct lu_pcc_detach) +#define LL_IOC_PCC_DETACH_BY_FID _IOW('f', 252, struct lu_pcc_detach_fid) +#define LL_IOC_PCC_STATE _IOR('f', 252, struct lu_pcc_state) +#define LL_IOC_PROJECT _IOW('f', 253, struct lu_project) + +#ifndef FS_IOC_FSGETXATTR +/* + * Structure for FS_IOC_FSGETXATTR and FS_IOC_FSSETXATTR. +*/ +struct fsxattr { + __u32 fsx_xflags; /* xflags field value (get/set) */ + __u32 fsx_extsize; /* extsize field value (get/set)*/ + __u32 fsx_nextents; /* nextents field value (get) */ + __u32 fsx_projid; /* project identifier (get/set) */ + unsigned char fsx_pad[12]; +}; +#define FS_IOC_FSGETXATTR _IOR('X', 31, struct fsxattr) +#define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr) +#endif +#ifndef FS_XFLAG_PROJINHERIT +#define FS_XFLAG_PROJINHERIT 0x00000200 +#endif + + +#define LL_STATFS_LMV 1 +#define LL_STATFS_LOV 2 +#define LL_STATFS_NODELAY 4 + +#define IOC_MDC_TYPE 'i' +#define IOC_MDC_LOOKUP _IOWR(IOC_MDC_TYPE, 20, struct obd_device *) +#define IOC_MDC_GETFILESTRIPE _IOWR(IOC_MDC_TYPE, 21, struct lov_user_md *) +#define IOC_MDC_GETFILEINFO_V1 _IOWR(IOC_MDC_TYPE, 22, struct lov_user_mds_data_v1 *) +#define IOC_MDC_GETFILEINFO_V2 _IOWR(IOC_MDC_TYPE, 22, struct lov_user_mds_data) +#define LL_IOC_MDC_GETINFO_V1 _IOWR(IOC_MDC_TYPE, 23, struct lov_user_mds_data_v1 *) +#define LL_IOC_MDC_GETINFO_V2 _IOWR(IOC_MDC_TYPE, 23, struct lov_user_mds_data) +#define IOC_MDC_GETFILEINFO IOC_MDC_GETFILEINFO_V1 +#define LL_IOC_MDC_GETINFO LL_IOC_MDC_GETINFO_V1 + +#define MAX_OBD_NAME 128 /* If this changes, a NEW ioctl must be added */ + +/* Define O_LOV_DELAY_CREATE to be a mask that is not useful for regular + * files, but are unlikely to be used in practice and are not harmful if + * used incorrectly. O_NOCTTY and FASYNC are only meaningful for character + * devices and are safe for use on new files. See LU-4209. */ +/* To be compatible with old statically linked binary we keep the check for + * the older 0100000000 flag. This is already removed upstream. LU-812. 
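+ *
+ * Typical (sketched) usage: create the file without instantiating its
+ * layout, set the striping explicitly, then write:
+ *
+ *   fd = open(path, O_CREAT | O_WRONLY | O_LOV_DELAY_CREATE, 0644);
+ *   ioctl(fd, LL_IOC_LOV_SETSTRIPE, &lum);
+ *
+ * where path and lum (a struct lov_user_md) are hypothetical locals.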
+ */
+#define O_LOV_DELAY_CREATE_1_8	0100000000 /* FMODE_NONOTIFY masked in 2.6.36 */
+#ifndef FASYNC
+#define FASYNC			00020000   /* fcntl, for BSD compatibility */
+#endif
+#define O_LOV_DELAY_CREATE_MASK	(O_NOCTTY | FASYNC)
+#define O_LOV_DELAY_CREATE	(O_LOV_DELAY_CREATE_1_8 | \
+				 O_LOV_DELAY_CREATE_MASK)
+/* O_FILE_ENC principle is similar to O_LOV_DELAY_CREATE above,
+ * for access to encrypted files without the encryption key.
+ */
+#define O_FILE_ENC		(O_NOCTTY | O_NDELAY)
+
+#define O_LU_NOIMPORT_MASK	(O_NOCTTY | O_DSYNC | O_DIRECT)
+#define O_LU_NOIMPORT		O_LU_NOIMPORT_MASK
+
+#define LL_FILE_IGNORE_LOCK	0x00000001
+#define LL_FILE_GROUP_LOCKED	0x00000002
+#define LL_FILE_READAHEA	0x00000004
+#define LL_FILE_LOCKED_DIRECTIO	0x00000008 /* client-side locks with dio */
+#define LL_FILE_FLOCK_WARNING	0x00000020 /* warned about disabled flock */
+
+#define LOV_USER_MAGIC_V1	0x0BD10BD0
+#define LOV_USER_MAGIC		LOV_USER_MAGIC_V1
+#define LOV_USER_MAGIC_JOIN_V1	0x0BD20BD0
+#define LOV_USER_MAGIC_V3	0x0BD30BD0
+/* 0x0BD40BD0 is occupied by LOV_MAGIC_MIGRATE */
+#define LOV_USER_MAGIC_SPECIFIC	0x0BD50BD0 /* for specific OSTs */
+#define LOV_USER_MAGIC_COMP_V1	0x0BD60BD0
+#define LOV_USER_MAGIC_FOREIGN	0x0BD70BD0
+#define LOV_USER_MAGIC_SEL	0x0BD80BD0
+
+#define LMV_USER_MAGIC		0x0CD30CD0 /* default lmv magic */
+#define LMV_USER_MAGIC_V0	0x0CD20CD0 /* old default lmv magic */
+#define LMV_USER_MAGIC_SPECIFIC	0x0CD40CD0
+
+#define LOV_PATTERN_NONE	0x000
+#define LOV_PATTERN_RAID0	0x001
+#define LOV_PATTERN_RAID1	0x002
+#define LOV_PATTERN_MDT		0x100
+#define LOV_PATTERN_OVERSTRIPING 0x200
+#define LOV_PATTERN_FOREIGN	0x400
+
+#define LOV_PATTERN_F_MASK	0xffff0000
+#define LOV_PATTERN_F_HOLE	0x40000000 /* there is hole in LOV EA */
+#define LOV_PATTERN_F_RELEASED	0x80000000 /* HSM released file */
+#define LOV_PATTERN_DEFAULT	0xffffffff
+
+#define LOV_OFFSET_DEFAULT	((__u16)-1)
+#define LMV_OFFSET_DEFAULT	((__u32)-1)
+
+static inline bool lov_pattern_supported(__u32 pattern)
+{
+	return (pattern & ~LOV_PATTERN_F_RELEASED) == LOV_PATTERN_RAID0 ||
+	       (pattern & ~LOV_PATTERN_F_RELEASED) ==
+			(LOV_PATTERN_RAID0 | LOV_PATTERN_OVERSTRIPING) ||
+	       (pattern & ~LOV_PATTERN_F_RELEASED) == LOV_PATTERN_MDT;
+}
+
+/* RELEASED and MDT patterns are not valid in many places, so rather than
+ * having many extra checks on lov_pattern_supported, we have this separate
+ * check for non-released, non-DOM components
+ */
+static inline bool lov_pattern_supported_normal_comp(__u32 pattern)
+{
+	return pattern == LOV_PATTERN_RAID0 ||
+	       pattern == (LOV_PATTERN_RAID0 | LOV_PATTERN_OVERSTRIPING);
+}
+
+#define LOV_MAXPOOLNAME 15
+#define LOV_POOLNAMEF "%.15s"
+
+#define LOV_MIN_STRIPE_BITS 16	/* maximum PAGE_SIZE (ia64), power of 2 */
+#define LOV_MIN_STRIPE_SIZE (1 << LOV_MIN_STRIPE_BITS)
+#define LOV_MAX_STRIPE_COUNT_OLD 160
+/* This calculation is crafted so that an input of 4096 will result in 160,
+ * which in turn equals the old maximum stripe count.
+ * XXX: in fact this is too simplified for now; it also needs an ea_type
+ * argument to know exactly how much space each stripe consumes.
+ *
+ * The limit of 12 pages is somewhat arbitrary, but is a reasonably large
+ * allocation that is sufficient for the current generation of systems.
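+ *
+ * Worked out with the values used below: (12 * 4096 - 256) / 24 = 2037,
+ * rounded down to the round figure of 2000.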
+ * + * (max buffer size - lov+rpc header) / sizeof(struct lov_ost_data_v1) */ +#define LOV_MAX_STRIPE_COUNT 2000 /* ~((12 * 4096 - 256) / 24) */ +#define LOV_ALL_STRIPES 0xffff /* only valid for directories */ +#define LOV_V1_INSANE_STRIPE_COUNT 65532 /* maximum stripe count bz13933 */ + +#define XATTR_LUSTRE_PREFIX "lustre." +#define XATTR_LUSTRE_LOV XATTR_LUSTRE_PREFIX"lov" + +/* Please update if XATTR_LUSTRE_LOV".set" groks more flags in the future */ +#define allowed_lustre_lov(att) (strcmp((att), XATTR_LUSTRE_LOV".add") == 0 || \ + strcmp((att), XATTR_LUSTRE_LOV".set") == 0 || \ + strcmp((att), XATTR_LUSTRE_LOV".set.flags") == 0 || \ + strcmp((att), XATTR_LUSTRE_LOV".del") == 0) + +#define lov_user_ost_data lov_user_ost_data_v1 +struct lov_user_ost_data_v1 { /* per-stripe data structure */ + struct ost_id l_ost_oi; /* OST object ID */ + __u32 l_ost_gen; /* generation of this OST index */ + __u32 l_ost_idx; /* OST index in LOV */ +} __attribute__((packed)); + +#define lov_user_md lov_user_md_v1 +struct lov_user_md_v1 { /* LOV EA user data (host-endian) */ + __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V1 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* MDT parent inode id/seq (id/0 for 1.x) */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + union { + __u16 lmm_stripe_offset; /* starting stripe offset in + * lmm_objects, use when writing */ + __u16 lmm_layout_gen; /* layout generation number + * used when reading */ + }; + struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +} __attribute__((packed, __may_alias__)); + +struct lov_user_md_v3 { /* LOV EA user data (host-endian) */ + __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V3 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* MDT parent inode id/seq (id/0 for 1.x) */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + union { + __u16 lmm_stripe_offset; /* starting stripe offset in + * lmm_objects, use when writing */ + __u16 lmm_layout_gen; /* layout generation number + * used when reading */ + }; + char lmm_pool_name[LOV_MAXPOOLNAME + 1]; /* pool name */ + struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +} __attribute__((packed, __may_alias__)); + +struct lov_foreign_md { + __u32 lfm_magic; /* magic number = LOV_MAGIC_FOREIGN */ + __u32 lfm_length; /* length of lfm_value */ + __u32 lfm_type; /* type, see LU_FOREIGN_TYPE_ */ + __u32 lfm_flags; /* flags, type specific */ + char lfm_value[]; +} __attribute__((packed)); + +#define foreign_size(lfm) (((struct lov_foreign_md *)lfm)->lfm_length + \ + offsetof(struct lov_foreign_md, lfm_value)) + +#define foreign_size_le(lfm) \ + (le32_to_cpu(((struct lov_foreign_md *)lfm)->lfm_length) + \ + offsetof(struct lov_foreign_md, lfm_value)) + +/** + * The stripe size fields are shared for the extension size storage, however + * the extension size is stored in KB, not bytes. 
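+ *
+ * e.g. (illustrative): a 64 MiB extension size is stored as
+ * 64 * 1024 * 1024 / SEL_UNIT_SIZE = 65536.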
+ */
+#define SEL_UNIT_SIZE 1024llu
+
+struct lu_extent {
+	__u64 e_start;
+	__u64 e_end;
+} __attribute__((packed));
+
+#define DEXT "[%#llx, %#llx)"
+#define PEXT(ext) (unsigned long long)(ext)->e_start, (unsigned long long)(ext)->e_end
+
+static inline bool lu_extent_is_overlapped(struct lu_extent *e1,
+					   struct lu_extent *e2)
+{
+	return e1->e_start < e2->e_end && e2->e_start < e1->e_end;
+}
+
+static inline bool lu_extent_is_whole(struct lu_extent *e)
+{
+	return e->e_start == 0 && e->e_end == LUSTRE_EOF;
+}
+
+enum lov_comp_md_entry_flags {
+	LCME_FL_STALE = 0x00000001, /* FLR: stale data */
+	LCME_FL_PREF_RD = 0x00000002, /* FLR: preferred for reading */
+	LCME_FL_PREF_WR = 0x00000004, /* FLR: preferred for writing */
+	LCME_FL_PREF_RW = LCME_FL_PREF_RD | LCME_FL_PREF_WR,
+	LCME_FL_OFFLINE = 0x00000008, /* Not used */
+	LCME_FL_INIT = 0x00000010, /* instantiated */
+	LCME_FL_NOSYNC = 0x00000020, /* FLR: no sync for the mirror */
+	LCME_FL_EXTENSION = 0x00000040, /* extension comp, never init */
+	LCME_FL_NEG = 0x80000000 /* used to indicate a negative flag,
+				  * won't be stored on disk
+				  */
+};
+
+#define LCME_KNOWN_FLAGS (LCME_FL_NEG | LCME_FL_INIT | LCME_FL_STALE | \
+			  LCME_FL_PREF_RW | LCME_FL_NOSYNC | \
+			  LCME_FL_EXTENSION)
+
+/* The component flags can be set by users at creation/modification time. */
+#define LCME_USER_COMP_FLAGS (LCME_FL_PREF_RW | LCME_FL_NOSYNC | \
+			      LCME_FL_EXTENSION)
+
+/* The mirror flags can be set by users at creation time. */
+#define LCME_USER_MIRROR_FLAGS (LCME_FL_PREF_RW)
+
+/* The allowed flags obtained from the client at component creation time. */
+#define LCME_CL_COMP_FLAGS (LCME_USER_MIRROR_FLAGS | LCME_FL_EXTENSION)
+
+/* The mirror flags sent by the client */
+#define LCME_MIRROR_FLAGS (LCME_FL_NOSYNC)
+
+/* These flags have meaning when set in a default layout and will be inherited
+ * from the default/template layout set on a directory.
+ */
+#define LCME_TEMPLATE_FLAGS (LCME_FL_PREF_RW | LCME_FL_NOSYNC | \
+			     LCME_FL_EXTENSION)
+
+/* the highest bit in obdo::o_layout_version is used to mark if the file is
+ * being resynced. */
+#define LU_LAYOUT_RESYNC LCME_FL_NEG
+
+/* lcme_id can be specified as certain flags, and the first
+ * bit of lcme_id is used to indicate that the ID is representing
+ * certain LCME_FL_* flags but not a real ID. This implies we can have
+ * at most 31 flags (see LCME_FL_XXX). */
+enum lcme_id {
+	LCME_ID_INVAL = 0x0,
+	LCME_ID_MAX = 0x7FFFFFFF,
+	LCME_ID_ALL = 0xFFFFFFFF,
+	LCME_ID_NOT_ID = LCME_FL_NEG
+};
+
+#define LCME_ID_MASK LCME_ID_MAX
+
+struct lov_comp_md_entry_v1 {
+	__u32 lcme_id; /* unique id of component */
+	__u32 lcme_flags; /* LCME_FL_XXX */
+	struct lu_extent lcme_extent; /* file extent for component */
+	__u32 lcme_offset; /* offset of component blob,
+			    * starting from lov_comp_md_v1 */
+	__u32 lcme_size; /* size of component blob */
+	__u32 lcme_layout_gen;
+	__u64 lcme_timestamp; /* snapshot time if applicable */
+	__u32 lcme_padding_1;
+} __attribute__((packed));
+
+#define SEQ_ID_MAX 0x0000FFFF
+#define SEQ_ID_MASK SEQ_ID_MAX
+/* bits 30:16 of lcme_id are used to store the mirror id */
+#define MIRROR_ID_MASK 0x7FFF0000
+#define MIRROR_ID_NEG 0x8000
+#define MIRROR_ID_SHIFT 16
+
+static inline __u32 pflr_id(__u16 mirror_id, __u16 seqid)
+{
+	return ((mirror_id << MIRROR_ID_SHIFT) & MIRROR_ID_MASK) | seqid;
+}
+
+static inline __u16 mirror_id_of(__u32 id)
+{
+	return (id & MIRROR_ID_MASK) >> MIRROR_ID_SHIFT;
+}
+
+/**
+ * on-disk data for lcm_flags. Valid if lcm_magic is LOV_MAGIC_COMP_V1.
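+ *
+ * e.g. (an illustrative reading of the states below, not a normative
+ * definition): a mirrored file that has been written, leaving other
+ * mirrors stale, carries (lcm_flags & LCM_FL_FLR_MASK) ==
+ * LCM_FL_WRITE_PENDING until resync completes.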
+ */
+enum lov_comp_md_flags {
+	/* the lowest 4 bits are used by FLR to record the file state */
+	LCM_FL_NONE = 0x0,
+	LCM_FL_RDONLY = 0x1,
+	LCM_FL_WRITE_PENDING = 0x2,
+	LCM_FL_SYNC_PENDING = 0x3,
+	LCM_FL_PCC_RDONLY = 0x8,
+	LCM_FL_FLR_MASK = 0xB,
+};
+
+struct lov_comp_md_v1 {
+	__u32 lcm_magic; /* LOV_USER_MAGIC_COMP_V1 */
+	__u32 lcm_size; /* overall size including this struct */
+	__u32 lcm_layout_gen;
+	__u16 lcm_flags;
+	__u16 lcm_entry_count;
+	/* lcm_mirror_count stores the number of actual mirrors minus 1,
+	 * so that non-FLR files will have value 0 meaning 1 mirror. */
+	__u16 lcm_mirror_count;
+	__u16 lcm_padding1[3];
+	__u64 lcm_padding2;
+	struct lov_comp_md_entry_v1 lcm_entries[0];
+} __attribute__((packed));
+
+static inline __u32 lov_user_md_size(__u16 stripes, __u32 lmm_magic)
+{
+	if (stripes == (__u16)-1)
+		stripes = 0;
+
+	if (lmm_magic == LOV_USER_MAGIC_V1)
+		return sizeof(struct lov_user_md_v1) +
+		       stripes * sizeof(struct lov_user_ost_data_v1);
+	return sizeof(struct lov_user_md_v3) +
+	       stripes * sizeof(struct lov_user_ost_data_v1);
+}
+
+/* Compile with -D_LARGEFILE64_SOURCE or -D_GNU_SOURCE (or #define) to
+ * use this. It is unsafe to #define those values in this header as it
+ * is possible the application has already #included <sys/stat.h>. */
+#define lov_user_mds_data lov_user_mds_data_v2
+struct lov_user_mds_data_v1 {
+	lstat_t lmd_st; /* MDS stat struct */
+	struct lov_user_md_v1 lmd_lmm; /* LOV EA V1 user data */
+} __attribute__((packed));
+
+struct lov_user_mds_data_v2 {
+	struct lu_fid lmd_fid; /* Lustre FID */
+	lstatx_t lmd_stx; /* MDS statx struct */
+	__u64 lmd_flags; /* MDS stat flags */
+	__u32 lmd_lmmsize; /* LOV EA size */
+	__u32 lmd_padding; /* unused */
+	struct lov_user_md_v1 lmd_lmm; /* LOV EA user data */
+} __attribute__((packed));
+
+struct lmv_user_mds_data {
+	struct lu_fid lum_fid;
+	__u32 lum_padding;
+	__u32 lum_mds;
+} __attribute__((packed, __may_alias__));
+
+enum lmv_hash_type {
+	LMV_HASH_TYPE_UNKNOWN = 0, /* 0 is reserved for testing purposes */
+	LMV_HASH_TYPE_ALL_CHARS = 1,
+	LMV_HASH_TYPE_FNV_1A_64 = 2,
+	LMV_HASH_TYPE_CRUSH = 3,
+	LMV_HASH_TYPE_MAX,
+};
+
+static __attribute__((unused)) const char *mdt_hash_name[] = {
+	"none",
+	"all_char",
+	"fnv_1a_64",
+	"crush",
+};
+
+#define LMV_HASH_TYPE_DEFAULT LMV_HASH_TYPE_FNV_1A_64
+
+/* Right now only the lower part (bits 0-15) of lmv_hash_type is being used;
+ * the higher part holds flags that indicate the status of the object,
+ * for example that the object is being migrated. The hash function
+ * might be interpreted differently with different flags. */
+#define LMV_HASH_TYPE_MASK 0x0000ffff
+
+static inline bool lmv_is_known_hash_type(__u32 type)
+{
+	return (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_FNV_1A_64 ||
+	       (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_ALL_CHARS ||
+	       (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_CRUSH;
+}
+
+/* fixed layout, such directories won't split automatically */
+/* NB, update LMV_HASH_FLAG_KNOWN when adding a new flag */
+#define LMV_HASH_FLAG_FIXED 0x02000000
+#define LMV_HASH_FLAG_MERGE 0x04000000
+#define LMV_HASH_FLAG_SPLIT 0x08000000
+
+/* A striped directory may have lost its master LMV EA at some point and had
+ * LFSCK re-generate it. This flag is used to indicate such a case. It is an
+ * on-disk flag.
+ */
+#define LMV_HASH_FLAG_LOST_LMV 0x10000000
+
+#define LMV_HASH_FLAG_BAD_TYPE 0x20000000
+#define LMV_HASH_FLAG_MIGRATION 0x80000000
+
+#define LMV_HASH_FLAG_LAYOUT_CHANGE \
+	(LMV_HASH_FLAG_MIGRATION | LMV_HASH_FLAG_SPLIT | LMV_HASH_FLAG_MERGE)
+
+#define LMV_HASH_FLAG_KNOWN 0xbe000000
+
+/* both SPLIT and MIGRATION are set for directory split */
+static inline bool lmv_hash_is_splitting(__u32 hash)
+{
+	return (hash & LMV_HASH_FLAG_LAYOUT_CHANGE) ==
+	       (LMV_HASH_FLAG_SPLIT | LMV_HASH_FLAG_MIGRATION);
+}
+
+/* both MERGE and MIGRATION are set for directory merge */
+static inline bool lmv_hash_is_merging(__u32 hash)
+{
+	return (hash & LMV_HASH_FLAG_LAYOUT_CHANGE) ==
+	       (LMV_HASH_FLAG_MERGE | LMV_HASH_FLAG_MIGRATION);
+}
+
+/* only MIGRATION is set for directory migration */
+static inline bool lmv_hash_is_migrating(__u32 hash)
+{
+	return (hash & LMV_HASH_FLAG_LAYOUT_CHANGE) == LMV_HASH_FLAG_MIGRATION;
+}
+
+static inline bool lmv_hash_is_restriping(__u32 hash)
+{
+	return lmv_hash_is_splitting(hash) || lmv_hash_is_merging(hash);
+}
+
+static inline bool lmv_hash_is_layout_changing(__u32 hash)
+{
+	return lmv_hash_is_splitting(hash) || lmv_hash_is_merging(hash) ||
+	       lmv_hash_is_migrating(hash);
+}
+
+struct lustre_foreign_type {
+	__u32 lft_type;
+	const char *lft_name;
+};
+
+/**
+ * LOV/LMV foreign types
+ **/
+enum lustre_foreign_types {
+	LU_FOREIGN_TYPE_NONE = 0,
+	LU_FOREIGN_TYPE_SYMLINK = 0xda05,
+	/* must be the max/last one */
+	LU_FOREIGN_TYPE_UNKNOWN = 0xffffffff,
+};
+
+extern struct lustre_foreign_type lu_foreign_types[];
+
+/* Derived the same way as LOV_MAX_STRIPE_COUNT above:
+ * (max buffer size - lmv+rpc header) / sizeof(struct lmv_user_mds_data) */
+#define LMV_MAX_STRIPE_COUNT 2000 /* ((12 * 4096 - 256) / 24) */
+#define lmv_user_md lmv_user_md_v1
+struct lmv_user_md_v1 {
+	__u32 lum_magic; /* must be the first field */
+	__u32 lum_stripe_count; /* dirstripe count */
+	__u32 lum_stripe_offset; /* MDT idx for default dirstripe */
+	__u32 lum_hash_type; /* Dir stripe policy */
+	__u32 lum_type; /* LMV type: default */
+	__u8 lum_max_inherit; /* inherit depth of default LMV */
+	__u8 lum_max_inherit_rr; /* inherit depth of default LMV to round-robin mkdir */
+	__u16 lum_padding1;
+	__u32 lum_padding2;
+	__u32 lum_padding3;
+	char lum_pool_name[LOV_MAXPOOLNAME + 1];
+	struct lmv_user_mds_data lum_objects[0];
+} __attribute__((packed));
+
+static inline __u32 lmv_foreign_to_md_stripes(__u32 size)
+{
+	if (size <= sizeof(struct lmv_user_md))
+		return 0;
+
+	size -= sizeof(struct lmv_user_md);
+	return (size + sizeof(struct lmv_user_mds_data) - 1) /
+	       sizeof(struct lmv_user_mds_data);
+}
+
+/*
+ * NB: historically the default layout didn't set a type but used the XATTR
+ * name to distinguish it from a normal layout. For backward compatibility,
+ * define LMV_TYPE_DEFAULT as 0x0 and still use the same method.
+ */
+enum lmv_type {
+	LMV_TYPE_DEFAULT = 0x0000,
+};
+
+/* lum_max_inherit will be decreased by 1 after each inheritance if it's not
+ * LMV_INHERIT_UNLIMITED or > LMV_INHERIT_MAX.
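+ *
+ * e.g. starting from LMV_INHERIT_DEFAULT_STRIPED (3), a new subdirectory
+ * inherits the default with depth 2, its child with depth 1
+ * (LMV_INHERIT_END), below which the default is no longer inherited.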
+ */
+enum {
+	/* for historical reasons, 0 means unlimited inheritance */
+	LMV_INHERIT_UNLIMITED = 0,
+	/* unlimited lum_max_inherit by default for plain stripe (0 or 1) */
+	LMV_INHERIT_DEFAULT_PLAIN = LMV_INHERIT_UNLIMITED,
+	/* do not inherit any more */
+	LMV_INHERIT_END = 1,
+	/* for multiple stripes, the default lum_max_inherit is 3 */
+	LMV_INHERIT_DEFAULT_STRIPED = 3,
+	/* max inherit depth */
+	LMV_INHERIT_MAX = 250,
+	/* [251, 254] are reserved */
+	/* not set, or when the inherit depth goes beyond the end */
+	LMV_INHERIT_NONE = 255,
+};
+
+enum {
+	/* not set, or when the inherit_rr depth goes beyond the end */
+	LMV_INHERIT_RR_NONE = 0,
+	/* disable lum_max_inherit_rr by default */
+	LMV_INHERIT_RR_DEFAULT = LMV_INHERIT_RR_NONE,
+	/* do not inherit any more */
+	LMV_INHERIT_RR_END = 1,
+	/* default inherit_rr of ROOT */
+	LMV_INHERIT_RR_ROOT = 3,
+	/* max inherit depth */
+	LMV_INHERIT_RR_MAX = 250,
+	/* [251, 254] are reserved */
+	/* unlimited inheritance */
+	LMV_INHERIT_RR_UNLIMITED = 255,
+};
+
+static inline int lmv_user_md_size(int stripes, int lmm_magic)
+{
+	int size = sizeof(struct lmv_user_md);
+
+	if (lmm_magic == LMV_USER_MAGIC_SPECIFIC)
+		size += stripes * sizeof(struct lmv_user_mds_data);
+
+	return size;
+}
+
+struct ll_recreate_obj {
+	__u64 lrc_id;
+	__u32 lrc_ost_idx;
+};
+
+struct ll_fid {
+	__u64 id; /* holds object id */
+	__u32 generation; /* holds object generation */
+	__u32 f_type; /* holds object type or stripe idx when passing it to
+		       * OST for saving into EA. */
+};
+
+#define UUID_MAX 40
+struct obd_uuid {
+	char uuid[UUID_MAX];
+};
+
+static inline bool obd_uuid_equals(const struct obd_uuid *u1,
+				   const struct obd_uuid *u2)
+{
+	return strcmp((char *)u1->uuid, (char *)u2->uuid) == 0;
+}
+
+static inline int obd_uuid_empty(struct obd_uuid *uuid)
+{
+	return uuid->uuid[0] == '\0';
+}
+
+static inline void obd_str2uuid(struct obd_uuid *uuid, const char *tmp)
+{
+	strncpy((char *)uuid->uuid, tmp, sizeof(*uuid));
+	uuid->uuid[sizeof(*uuid) - 1] = '\0';
+}
+
+/* For printf's only, make sure uuid is terminated */
+static inline char *obd_uuid2str(const struct obd_uuid *uuid)
+{
+	if (uuid == NULL)
+		return NULL;
+
+	if (uuid->uuid[sizeof(*uuid) - 1] != '\0') {
+		/* Obviously not safe, but for printfs, no real harm done...
+		 * we're always null-terminated, even in a race. */
+		static char temp[sizeof(*uuid)];
+
+		memcpy(temp, uuid->uuid, sizeof(*uuid) - 1);
+		temp[sizeof(*uuid) - 1] = '\0';
+
+		return temp;
+	}
+	return (char *)(uuid->uuid);
+}
+
+#define LUSTRE_MAXFSNAME 8
+#define LUSTRE_MAXINSTANCE 16
+
+/* Extract the fsname from the uuid (or target name) of a target,
+ * e.g. myfs-OST0007_UUID -> myfs; see also deuuidify. */
+static inline void obd_uuid2fsname(char *buf, char *uuid, int buflen)
+{
+	char *p;
+
+	strncpy(buf, uuid, buflen - 1);
+	buf[buflen - 1] = '\0';
+	p = strrchr(buf, '-');
+	if (p != NULL)
+		*p = '\0';
+}
+
+/* printf display format for Lustre FIDs
+ * usage: printf("file FID is "DFID"\n", PFID(fid)); */
+#define FID_NOBRACE_LEN 40
+#define FID_LEN (FID_NOBRACE_LEN + 2)
+#define DFID_NOBRACE "%#llx:0x%x:0x%x"
+#define DFID "[" DFID_NOBRACE "]"
+#define PFID(fid) (unsigned long long)(fid)->f_seq, (fid)->f_oid, (fid)->f_ver
+
+/* scanf input parse format for fids in DFID_NOBRACE format
+ * Need to strip '[' from the DFID format first, or use "["SFID"]" at the caller.
+ * usage: sscanf(fidstr, SFID, RFID(&fid)); */
+#define SFID "0x%llx:0x%x:0x%x"
+#define RFID(fid) (unsigned long long *)&((fid)->f_seq), &((fid)->f_oid), &((fid)->f_ver)
+
+/********* Quotas **********/
+
+/* From linux/fs/quota/quota.c */
+static inline __u64 toqb(__kernel_size_t space)
+{
+	return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS;
+}
+
+#define Q_QUOTACHECK 0x800100 /* deprecated as of 2.4 */
+#define Q_INITQUOTA 0x800101 /* deprecated as of 2.4 */
+#define Q_GETOINFO 0x800102 /* get obd quota info */
+#define Q_GETOQUOTA 0x800103 /* get obd quotas */
+#define Q_FINVALIDATE 0x800104 /* deprecated as of 2.4 */
+
+/* these must be explicitly translated into linux Q_* in ll_dir_ioctl */
+#define LUSTRE_Q_QUOTAON 0x800002 /* deprecated as of 2.4 */
+#define LUSTRE_Q_QUOTAOFF 0x800003 /* deprecated as of 2.4 */
+#define LUSTRE_Q_GETINFO 0x800005 /* get information about quota files */
+#define LUSTRE_Q_SETINFO 0x800006 /* set information about quota files */
+#define LUSTRE_Q_GETQUOTA 0x800007 /* get user quota structure */
+#define LUSTRE_Q_SETQUOTA 0x800008 /* set user quota structure */
+/* lustre-specific control commands */
+#define LUSTRE_Q_INVALIDATE 0x80000b /* deprecated as of 2.4 */
+#define LUSTRE_Q_FINVALIDATE 0x80000c /* deprecated as of 2.4 */
+#define LUSTRE_Q_GETDEFAULT 0x80000d /* get default quota */
+#define LUSTRE_Q_SETDEFAULT 0x80000e /* set default quota */
+#define LUSTRE_Q_GETQUOTAPOOL 0x80000f /* get user pool quota */
+#define LUSTRE_Q_SETQUOTAPOOL 0x800010 /* set user pool quota */
+#define LUSTRE_Q_GETINFOPOOL 0x800011 /* get pool quota info */
+#define LUSTRE_Q_SETINFOPOOL 0x800012 /* set pool quota info */
+#define LUSTRE_Q_GETDEFAULT_POOL 0x800013 /* get default pool quota */
+#define LUSTRE_Q_SETDEFAULT_POOL 0x800014 /* set default pool quota */
+#define LUSTRE_Q_DELETEQID 0x800015 /* delete quota ID */
+/* In the current Lustre implementation, the grace time is either the time
+ * or the timestamp to be used after some quota ID exceeds the soft limit;
+ * 48 bits should be enough, and the high 16 bits can be used as quota flags.
+ */
+#define LQUOTA_GRACE_BITS 48
+#define LQUOTA_GRACE_MASK ((1ULL << LQUOTA_GRACE_BITS) - 1)
+#define LQUOTA_GRACE_MAX LQUOTA_GRACE_MASK
+#define LQUOTA_GRACE(t) (t & LQUOTA_GRACE_MASK)
+#define LQUOTA_FLAG(t) (t >> LQUOTA_GRACE_BITS)
+#define LQUOTA_GRACE_FLAG(t, f) ((__u64)t | (__u64)f << LQUOTA_GRACE_BITS)
+
+/* special grace time: only notify the user when the quota is over the soft
+ * limit, but don't block new writes until the hard limit is reached. */
+#define NOTIFY_GRACE "notify"
+#define NOTIFY_GRACE_TIME LQUOTA_GRACE_MASK
+
+/* different quota flags */
+
+/* the default quota flag: the corresponding quota ID will use the default
+ * quota setting; the hard limit and soft limit of its quota record in the
+ * global quota file will be set to 0, the low 48 bits of the grace will be
+ * set to 0 and the high 16 bits will contain this flag (see the comment
+ * above).
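+ *
+ * e.g. (illustrative) such an ID's packed grace value is
+ * LQUOTA_GRACE_FLAG(0, LQUOTA_FLAG_DEFAULT), i.e. 1ULL << 48.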
+ * */ +#define LQUOTA_FLAG_DEFAULT 0x0001 +#define LQUOTA_FLAG_DELETED 0x0002 + +#define LUSTRE_Q_CMD_IS_POOL(cmd) \ + (cmd == LUSTRE_Q_GETQUOTAPOOL || \ + cmd == LUSTRE_Q_SETQUOTAPOOL || \ + cmd == LUSTRE_Q_SETINFOPOOL || \ + cmd == LUSTRE_Q_GETINFOPOOL || \ + cmd == LUSTRE_Q_SETDEFAULT_POOL || \ + cmd == LUSTRE_Q_GETDEFAULT_POOL) + +#define ALLQUOTA 255 /* set all quota */ +static inline const char *qtype_name(int qtype) +{ + switch (qtype) { + case USRQUOTA: + return "usr"; + case GRPQUOTA: + return "grp"; + case PRJQUOTA: + return "prj"; + } + return "unknown"; +} + +#define IDENTITY_DOWNCALL_MAGIC 0x6d6dd629 + +/* permission */ +#define N_PERMS_MAX 64 + +struct perm_downcall_data { + __u64 pdd_nid; + __u32 pdd_perm; + __u32 pdd_padding; +}; + +struct identity_downcall_data { + __u32 idd_magic; + __u32 idd_err; + __u32 idd_uid; + __u32 idd_gid; + __u32 idd_nperms; + __u32 idd_ngroups; + struct perm_downcall_data idd_perms[N_PERMS_MAX]; + __u32 idd_groups[0]; +}; + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 16, 53, 0) +/* old interface struct is deprecated in 2.14 */ +#define SEPOL_DOWNCALL_MAGIC_OLD 0x8b8bb842 +struct sepol_downcall_data_old { + __u32 sdd_magic; + __s64 sdd_sepol_mtime; + __u16 sdd_sepol_len; + char sdd_sepol[0]; +}; +#endif + +#define SEPOL_DOWNCALL_MAGIC 0x8b8bb843 +struct sepol_downcall_data { + __u32 sdd_magic; + __u16 sdd_sepol_len; + __u16 sdd_padding1; + __s64 sdd_sepol_mtime; + char sdd_sepol[0]; +}; + +#ifdef NEED_QUOTA_DEFS +#ifndef QIF_BLIMITS +#define QIF_BLIMITS 1 +#define QIF_SPACE 2 +#define QIF_ILIMITS 4 +#define QIF_INODES 8 +#define QIF_BTIME 16 +#define QIF_ITIME 32 +#define QIF_LIMITS (QIF_BLIMITS | QIF_ILIMITS) +#define QIF_USAGE (QIF_SPACE | QIF_INODES) +#define QIF_TIMES (QIF_BTIME | QIF_ITIME) +#define QIF_ALL (QIF_LIMITS | QIF_USAGE | QIF_TIMES) +#endif + +#endif /* !__KERNEL__ */ + +/* lustre volatile file support + * file name header: ".^L^S^T^R:volatile" + */ +#define LUSTRE_VOLATILE_HDR ".\x0c\x13\x14\x12:VOLATILE" +#define LUSTRE_VOLATILE_HDR_LEN 14 + +enum lustre_quota_version { + LUSTRE_QUOTA_V2 = 1 +}; + +/* XXX: same as if_dqinfo struct in kernel */ +struct obd_dqinfo { + __u64 dqi_bgrace; + __u64 dqi_igrace; + __u32 dqi_flags; + __u32 dqi_valid; +}; + +/* XXX: same as if_dqblk struct in kernel, plus one padding */ +struct obd_dqblk { + __u64 dqb_bhardlimit; /* kbytes unit */ + __u64 dqb_bsoftlimit; /* kbytes unit */ + __u64 dqb_curspace; /* bytes unit */ + __u64 dqb_ihardlimit; + __u64 dqb_isoftlimit; + __u64 dqb_curinodes; + __u64 dqb_btime; + __u64 dqb_itime; + __u32 dqb_valid; + __u32 dqb_padding; +}; + +enum { + QC_GENERAL = 0, + QC_MDTIDX = 1, + QC_OSTIDX = 2, + QC_UUID = 3 +}; + +struct if_quotactl { + __u32 qc_cmd; + __u32 qc_type; + __u32 qc_id; + __u32 qc_stat; + __u32 qc_valid; + __u32 qc_idx; + struct obd_dqinfo qc_dqinfo; + struct obd_dqblk qc_dqblk; + char obd_type[16]; + struct obd_uuid obd_uuid; + char qc_poolname[]; +}; + +/* swap layout flags */ +#define SWAP_LAYOUTS_CHECK_DV1 (1 << 0) +#define SWAP_LAYOUTS_CHECK_DV2 (1 << 1) +#define SWAP_LAYOUTS_KEEP_MTIME (1 << 2) +#define SWAP_LAYOUTS_KEEP_ATIME (1 << 3) +#define SWAP_LAYOUTS_CLOSE (1 << 4) + +/* Swap XATTR_NAME_HSM as well, only on the MDT so far */ +#define SWAP_LAYOUTS_MDS_HSM (1 << 31) +struct lustre_swap_layouts { + __u64 sl_flags; + __u32 sl_fd; + __u32 sl_gid; + __u64 sl_dv1; + __u64 sl_dv2; +}; + +/** Bit-mask of valid attributes */ +/* The LA_* flags are written to disk as part of the ChangeLog records + * so they are part of the on-disk and network 
protocol, and cannot be changed.
+ * Only the first 12 bits are currently saved.
+ */
+enum la_valid {
+	LA_ATIME = 1 << 0, /* 0x00001 */
+	LA_MTIME = 1 << 1, /* 0x00002 */
+	LA_CTIME = 1 << 2, /* 0x00004 */
+	LA_SIZE = 1 << 3, /* 0x00008 */
+	LA_MODE = 1 << 4, /* 0x00010 */
+	LA_UID = 1 << 5, /* 0x00020 */
+	LA_GID = 1 << 6, /* 0x00040 */
+	LA_BLOCKS = 1 << 7, /* 0x00080 */
+	LA_TYPE = 1 << 8, /* 0x00100 */
+	LA_FLAGS = 1 << 9, /* 0x00200 */
+	LA_NLINK = 1 << 10, /* 0x00400 */
+	LA_RDEV = 1 << 11, /* 0x00800 */
+	LA_BLKSIZE = 1 << 12, /* 0x01000 */
+	LA_KILL_SUID = 1 << 13, /* 0x02000 */
+	LA_KILL_SGID = 1 << 14, /* 0x04000 */
+	LA_PROJID = 1 << 15, /* 0x08000 */
+	LA_LAYOUT_VERSION = 1 << 16, /* 0x10000 */
+	LA_LSIZE = 1 << 17, /* 0x20000 */
+	LA_LBLOCKS = 1 << 18, /* 0x40000 */
+	LA_BTIME = 1 << 19, /* 0x80000 */
+	/**
+	 * Attributes that must be transmitted to OST objects
+	 */
+	LA_REMOTE_ATTR_SET = (LA_UID | LA_GID | LA_PROJID | LA_LAYOUT_VERSION)
+};
+
+#define MDS_FMODE_READ 00000001
+#define MDS_FMODE_WRITE 00000002
+
+#define MDS_FMODE_CLOSED 00000000
+#define MDS_FMODE_EXEC 00000004
+/* MDS_FMODE_EPOCH 01000000 obsolete since 2.8.0 */
+/* MDS_FMODE_TRUNC 02000000 obsolete since 2.8.0 */
+/* MDS_FMODE_SOM 04000000 obsolete since 2.8.0 */
+
+#define MDS_OPEN_CREATED 00000010
+/* MDS_OPEN_CROSS 00000020 obsolete in 2.12, internal use only */
+
+#define MDS_OPEN_CREAT 00000100
+#define MDS_OPEN_EXCL 00000200
+#define MDS_OPEN_TRUNC 00001000
+#define MDS_OPEN_APPEND 00002000
+#define MDS_OPEN_SYNC 00010000
+#define MDS_OPEN_DIRECTORY 00200000
+
+#define MDS_OPEN_NOIMPORT 020000000 /* nocache object create */
+#define MDS_OPEN_BY_FID 040000000 /* open_by_fid for known object */
+#define MDS_OPEN_DELAY_CREATE 0100000000 /* delay initial object create */
+#define MDS_OPEN_OWNEROVERRIDE 0200000000 /* NFSD rw-reopen ro file for owner */
+#define MDS_OPEN_JOIN_FILE 0400000000 /* open for join file.
+					* We do not support JOIN FILE
+					* anymore; reserve this flag
+					* just to prevent the bit from
+					* being reused. */
+
+#define MDS_OPEN_LOCK 04000000000 /* This open requires an open lock */
+#define MDS_OPEN_HAS_EA 010000000000 /* specify object create pattern */
+#define MDS_OPEN_HAS_OBJS 020000000000 /* Just set the EA; the objects exist */
+#define MDS_OPEN_NORESTORE 0100000000000ULL /* Do not restore file at open */
+#define MDS_OPEN_NEWSTRIPE 0200000000000ULL /* New stripe needed (restripe or
+					      * hsm restore) */
+#define MDS_OPEN_VOLATILE 0400000000000ULL /* File is volatile = created + unlinked */
+#define MDS_OPEN_LEASE 01000000000000ULL /* Open the file and grant lease
+					  * delegation; succeeds if it is not
+					  * being opened in a conflicting mode.
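+					  * (Informational, an assumption based
+					  * on the userspace API in this header:
+					  * leases are typically requested via
+					  * the LL_IOC_SET_LEASE ioctl, which
+					  * results in an open carrying this
+					  * flag.)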
+ */ +#define MDS_OPEN_RELEASE 02000000000000ULL /* Open the file for HSM release */ + +#define MDS_OPEN_RESYNC 04000000000000ULL /* FLR: file resync */ +#define MDS_OPEN_PCC 010000000000000ULL /* PCC: auto RW-PCC cache attach + * for newly created file */ +#define MDS_OP_WITH_FID 020000000000000ULL /* operation carried out by FID */ +#define MDS_OPEN_DEFAULT_LMV 040000000000000ULL /* open fetches default LMV */ + +/* lustre internal open flags, which should not be set from user space */ +#define MDS_OPEN_FL_INTERNAL (MDS_OPEN_HAS_EA | MDS_OPEN_HAS_OBJS | \ + MDS_OPEN_OWNEROVERRIDE | MDS_OPEN_LOCK | \ + MDS_OPEN_BY_FID | MDS_OPEN_LEASE | \ + MDS_OPEN_RELEASE | MDS_OPEN_RESYNC | \ + MDS_OPEN_PCC | MDS_OP_WITH_FID | \ + MDS_OPEN_DEFAULT_LMV) + + +/********* Changelogs **********/ +/** Changelog record types */ +enum changelog_rec_type { + CL_NONE = -1, + CL_MARK = 0, + CL_CREATE = 1, /* namespace */ + CL_MKDIR = 2, /* namespace */ + CL_HARDLINK = 3, /* namespace */ + CL_SOFTLINK = 4, /* namespace */ + CL_MKNOD = 5, /* namespace */ + CL_UNLINK = 6, /* namespace */ + CL_RMDIR = 7, /* namespace */ + CL_RENAME = 8, /* namespace */ + CL_EXT = 9, /* namespace extended record (2nd half of rename) */ + CL_OPEN = 10, /* not currently used */ + CL_CLOSE = 11, /* may be written to log only with mtime change */ + CL_LAYOUT = 12, /* file layout/striping modified */ + CL_TRUNC = 13, + CL_SETATTR = 14, + CL_SETXATTR = 15, + CL_XATTR = CL_SETXATTR, /* Deprecated name */ + CL_HSM = 16, /* HSM specific events, see flags */ + CL_MTIME = 17, /* Precedence: setattr > mtime > ctime > atime */ + CL_CTIME = 18, + CL_ATIME = 19, + CL_MIGRATE = 20, + CL_FLRW = 21, /* FLR: file was firstly written */ + CL_RESYNC = 22, /* FLR: file was resync-ed */ + CL_GETXATTR = 23, + CL_DN_OPEN = 24, /* denied open */ + CL_LAST, +}; + +static inline const char *changelog_type2str(int type) { + static const char *const changelog_str[] = { + "MARK", "CREAT", "MKDIR", "HLINK", "SLINK", "MKNOD", "UNLNK", + "RMDIR", "RENME", "RNMTO", "OPEN", "CLOSE", "LYOUT", "TRUNC", + "SATTR", "XATTR", "HSM", "MTIME", "CTIME", "ATIME", "MIGRT", + "FLRW", "RESYNC","GXATR", "NOPEN", + }; + + if (type >= 0 && type < CL_LAST) + return changelog_str[type]; + return NULL; +} + +/* 12 bits of per-record data can be stored in the bottom of the flags */ +#define CLF_FLAGSHIFT 12 +enum changelog_rec_flags { + CLF_VERSION = 0x1000, + CLF_RENAME = 0x2000, + CLF_JOBID = 0x4000, + CLF_EXTRA_FLAGS = 0x8000, + CLF_SUPPORTED = CLF_VERSION | CLF_RENAME | CLF_JOBID | + CLF_EXTRA_FLAGS, + CLF_FLAGMASK = (1U << CLF_FLAGSHIFT) - 1, + CLF_VERMASK = ~CLF_FLAGMASK, +}; + + +/* Anything under the flagmask may be per-type (if desired) */ +/* Flags for unlink */ +#define CLF_UNLINK_LAST 0x0001 /* Unlink of last hardlink */ +#define CLF_UNLINK_HSM_EXISTS 0x0002 /* File has something in HSM */ + /* HSM cleaning needed */ +/* Flags for rename */ +#define CLF_RENAME_LAST 0x0001 /* rename unlink last hardlink + * of target */ +#define CLF_RENAME_LAST_EXISTS 0x0002 /* rename unlink last hardlink of target + * has an archive in backend */ + +/* Flags for HSM */ +/* 12b used (from high weight to low weight): + * 2b for flags + * 3b for event + * 7b for error code + */ +#define CLF_HSM_ERR_L 0 /* HSM return code, 7 bits */ +#define CLF_HSM_ERR_H 6 +#define CLF_HSM_EVENT_L 7 /* HSM event, 3 bits, see enum hsm_event */ +#define CLF_HSM_EVENT_H 9 +#define CLF_HSM_FLAG_L 10 /* HSM flags, 2 bits, 1 used, 1 spare */ +#define CLF_HSM_FLAG_H 11 +#define CLF_HSM_SPARE_L 12 /* 4 spare bits */ 
+#define CLF_HSM_SPARE_H 15
+#define CLF_HSM_LAST 15
+
+/* Remove bits higher than _h, then extract the value
+ * between _h and _l by shifting the lower weight to bit 0. */
+#define CLF_GET_BITS(_b, _h, _l) (((_b << (CLF_HSM_LAST - _h)) & 0xFFFF) \
+				  >> (CLF_HSM_LAST - _h + _l))
+
+#define CLF_HSM_SUCCESS 0x00
+#define CLF_HSM_MAXERROR 0x7E
+#define CLF_HSM_ERROVERFLOW 0x7F
+
+#define CLF_HSM_DIRTY 1 /* file is dirty after HSM request ends */
+
+/* 3-bit field => 8 values allowed */
+enum hsm_event {
+	HE_ARCHIVE = 0,
+	HE_RESTORE = 1,
+	HE_CANCEL = 2,
+	HE_RELEASE = 3,
+	HE_REMOVE = 4,
+	HE_STATE = 5,
+	HE_SPARE1 = 6,
+	HE_SPARE2 = 7,
+	/* Leaving HE_SPARE2 as is; it's referenced in the Lemur code */
+	HE_IMPORT = 7,
+};
+
+static inline enum hsm_event hsm_get_cl_event(__u16 flags)
+{
+	return (enum hsm_event)CLF_GET_BITS(flags, CLF_HSM_EVENT_H,
+					    CLF_HSM_EVENT_L);
+}
+
+static inline void hsm_set_cl_event(enum changelog_rec_flags *clf_flags,
+				    enum hsm_event he)
+{
+	*clf_flags = (enum changelog_rec_flags)
+		(*clf_flags | (he << CLF_HSM_EVENT_L));
+}
+
+static inline __u16 hsm_get_cl_flags(enum changelog_rec_flags clf_flags)
+{
+	return CLF_GET_BITS(clf_flags, CLF_HSM_FLAG_H, CLF_HSM_FLAG_L);
+}
+
+static inline void hsm_set_cl_flags(enum changelog_rec_flags *clf_flags,
+				    unsigned int bits)
+{
+	*clf_flags = (enum changelog_rec_flags)
+		(*clf_flags | (bits << CLF_HSM_FLAG_L));
+}
+
+static inline int hsm_get_cl_error(enum changelog_rec_flags clf_flags)
+{
+	return CLF_GET_BITS(clf_flags, CLF_HSM_ERR_H, CLF_HSM_ERR_L);
+}
+
+static inline void hsm_set_cl_error(enum changelog_rec_flags *clf_flags,
+				    unsigned int error)
+{
+	*clf_flags = (enum changelog_rec_flags)
+		(*clf_flags | (error << CLF_HSM_ERR_L));
+}
+
+enum changelog_rec_extra_flags {
+	CLFE_INVALID = 0,
+	CLFE_UIDGID = 0x0001,
+	CLFE_NID = 0x0002,
+	CLFE_OPEN = 0x0004,
+	CLFE_XATTR = 0x0008,
+	CLFE_SUPPORTED = CLFE_UIDGID | CLFE_NID | CLFE_OPEN | CLFE_XATTR
+};
+
+enum changelog_send_flag {
+	/* Not yet implemented */
+	CHANGELOG_FLAG_FOLLOW = 0x01,
+	/* Blocking IO makes sense in case of slow user parsing of the records,
+	 * but it also prevents us from cleaning up if the records are not
+	 * consumed. */
+	CHANGELOG_FLAG_BLOCK = 0x02,
+	/* Pack jobid into the changelog records if available. */
+	CHANGELOG_FLAG_JOBID = 0x04,
+	/* Pack additional flag bits into the changelog record */
+	CHANGELOG_FLAG_EXTRA_FLAGS = 0x08,
+};
+
+enum changelog_send_extra_flag {
+	/* Pack uid/gid into the changelog record */
+	CHANGELOG_EXTRA_FLAG_UIDGID = 0x01,
+	/* Pack nid into the changelog record */
+	CHANGELOG_EXTRA_FLAG_NID = 0x02,
+	/* Pack open mode into the changelog record */
+	CHANGELOG_EXTRA_FLAG_OMODE = 0x04,
+	/* Pack xattr name into the changelog record */
+	CHANGELOG_EXTRA_FLAG_XATTR = 0x08,
+};
+
+/* unlink/rename/rmdir log the full path, so size the maximum
+ * to allow for PATH_MAX.
+ */
+#define CR_MAXSIZE __ALIGN_KERNEL(NAME_MAX + PATH_MAX + 2 + \
+				  changelog_rec_offset(CLF_SUPPORTED, \
+						       CLFE_SUPPORTED), 8)
+
+/* 31 usable bytes of string + a null terminator. */
+#define LUSTRE_JOBID_SIZE 32
+
+/* This is the minimal changelog record. It can contain extensions
+ * such as rename fields or process jobid. Its exact content is described
+ * by cr_flags and cr_extra_flags.
+ *
+ * Extensions are packed in the same order as their corresponding flags,
+ * then in the same order as their corresponding extra flags.
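+ *
+ * e.g. a record with cr_flags = CLF_VERSION | CLF_RENAME | CLF_JOBID is
+ * laid out as: struct changelog_rec, struct changelog_ext_rename,
+ * struct changelog_ext_jobid, then cr_namelen bytes of name (see
+ * changelog_rec_offset() below).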
+ */ +struct changelog_rec { + __u16 cr_namelen; + __u16 cr_flags; /**< \a changelog_rec_flags */ + __u32 cr_type; /**< \a changelog_rec_type */ + __u64 cr_index; /**< changelog record number */ + __u64 cr_prev; /**< last index for this target fid */ + __u64 cr_time; + union { + struct lu_fid cr_tfid; /**< target fid */ + __u32 cr_markerflags; /**< CL_MARK flags */ + }; + struct lu_fid cr_pfid; /**< parent fid */ +} __attribute__ ((packed)); + +/* Changelog extension for RENAME. */ +struct changelog_ext_rename { + struct lu_fid cr_sfid; /**< source fid, or zero */ + struct lu_fid cr_spfid; /**< source parent fid, or zero */ +}; + +/* Changelog extension to include JOBID. */ +struct changelog_ext_jobid { + char cr_jobid[LUSTRE_JOBID_SIZE]; /**< zero-terminated string. */ +}; + +/* Changelog extension to include additional flags. */ +struct changelog_ext_extra_flags { + __u64 cr_extra_flags; /* Additional CLFE_* flags */ +}; + +/* Changelog extra extension to include UID/GID. */ +struct changelog_ext_uidgid { + __u64 cr_uid; + __u64 cr_gid; +}; + +/* Changelog extra extension to include NID. */ +struct changelog_ext_nid { + /* have __u64 instead of lnet_nid_t type for use by client api */ + __u64 cr_nid; + /* for use when IPv6 support is added */ + __u64 extra; + __u32 padding; +}; + +/* Changelog extra extension to include low 32 bits of MDS_OPEN_* flags. */ +struct changelog_ext_openmode { + __u32 cr_openflags; +}; + +/* Changelog extra extension to include xattr */ +struct changelog_ext_xattr { + char cr_xattr[XATTR_NAME_MAX + 1]; /**< zero-terminated string. */ +}; + +static inline struct changelog_ext_extra_flags *changelog_rec_extra_flags( + const struct changelog_rec *rec); + +static inline __kernel_size_t changelog_rec_offset(enum changelog_rec_flags crf, + enum changelog_rec_extra_flags cref) +{ + __kernel_size_t size = sizeof(struct changelog_rec); + + if (crf & CLF_RENAME) + size += sizeof(struct changelog_ext_rename); + + if (crf & CLF_JOBID) + size += sizeof(struct changelog_ext_jobid); + + if (crf & CLF_EXTRA_FLAGS) { + size += sizeof(struct changelog_ext_extra_flags); + if (cref & CLFE_UIDGID) + size += sizeof(struct changelog_ext_uidgid); + if (cref & CLFE_NID) + size += sizeof(struct changelog_ext_nid); + if (cref & CLFE_OPEN) + size += sizeof(struct changelog_ext_openmode); + if (cref & CLFE_XATTR) + size += sizeof(struct changelog_ext_xattr); + } + + return size; +} + +static inline __kernel_size_t changelog_rec_size(const struct changelog_rec *rec) +{ + enum changelog_rec_extra_flags cref = CLFE_INVALID; + + if (rec->cr_flags & CLF_EXTRA_FLAGS) + cref = (enum changelog_rec_extra_flags) + changelog_rec_extra_flags(rec)->cr_extra_flags; + + return changelog_rec_offset( + (enum changelog_rec_flags)rec->cr_flags, cref); +} + +static inline __kernel_size_t changelog_rec_varsize(const struct changelog_rec *rec) +{ + return changelog_rec_size(rec) - sizeof(*rec) + rec->cr_namelen; +} + +static inline +struct changelog_ext_rename *changelog_rec_rename(const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = (enum changelog_rec_flags) + (rec->cr_flags & CLF_VERSION); + + return (struct changelog_ext_rename *)((char *)rec + + changelog_rec_offset(crf, + CLFE_INVALID)); +} + +/* The jobid follows the rename extension, if present */ +static inline +struct changelog_ext_jobid *changelog_rec_jobid(const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = (enum changelog_rec_flags) + (rec->cr_flags & (CLF_VERSION | CLF_RENAME)); + + return (struct 
changelog_ext_jobid *)((char *)rec + + changelog_rec_offset(crf, + CLFE_INVALID)); +} + +/* The additional flags follow the rename and jobid extensions, if present */ +static inline +struct changelog_ext_extra_flags *changelog_rec_extra_flags( + const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = (enum changelog_rec_flags) + (rec->cr_flags & (CLF_VERSION | CLF_RENAME | CLF_JOBID)); + + return (struct changelog_ext_extra_flags *)((char *)rec + + changelog_rec_offset(crf, + CLFE_INVALID)); +} + +/* The uid/gid is the first extra extension */ +static inline +struct changelog_ext_uidgid *changelog_rec_uidgid( + const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = (enum changelog_rec_flags) + (rec->cr_flags & + (CLF_VERSION | CLF_RENAME | CLF_JOBID | CLF_EXTRA_FLAGS)); + + return (struct changelog_ext_uidgid *)((char *)rec + + changelog_rec_offset(crf, + CLFE_INVALID)); +} + +/* The nid is the second extra extension */ +static inline +struct changelog_ext_nid *changelog_rec_nid(const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = (enum changelog_rec_flags) + (rec->cr_flags & + (CLF_VERSION | CLF_RENAME | CLF_JOBID | CLF_EXTRA_FLAGS)); + enum changelog_rec_extra_flags cref = CLFE_INVALID; + + if (rec->cr_flags & CLF_EXTRA_FLAGS) + cref = (enum changelog_rec_extra_flags) + (changelog_rec_extra_flags(rec)->cr_extra_flags & + CLFE_UIDGID); + + return (struct changelog_ext_nid *)((char *)rec + + changelog_rec_offset(crf, cref)); +} + +/* The OPEN mode is the third extra extension */ +static inline +struct changelog_ext_openmode *changelog_rec_openmode( + const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = (enum changelog_rec_flags) + (rec->cr_flags & + (CLF_VERSION | CLF_RENAME | CLF_JOBID | CLF_EXTRA_FLAGS)); + enum changelog_rec_extra_flags cref = CLFE_INVALID; + + if (rec->cr_flags & CLF_EXTRA_FLAGS) { + cref = (enum changelog_rec_extra_flags) + (changelog_rec_extra_flags(rec)->cr_extra_flags & + (CLFE_UIDGID | CLFE_NID)); + } + + return (struct changelog_ext_openmode *)((char *)rec + + changelog_rec_offset(crf, cref)); +} + +/* The xattr name is the fourth extra extension */ +static inline +struct changelog_ext_xattr *changelog_rec_xattr( + const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = (enum changelog_rec_flags) + (rec->cr_flags & + (CLF_VERSION | CLF_RENAME | CLF_JOBID | CLF_EXTRA_FLAGS)); + enum changelog_rec_extra_flags cref = CLFE_INVALID; + + if (rec->cr_flags & CLF_EXTRA_FLAGS) + cref = (enum changelog_rec_extra_flags) + (changelog_rec_extra_flags(rec)->cr_extra_flags & + (CLFE_UIDGID | CLFE_NID | CLFE_OPEN)); + + return (struct changelog_ext_xattr *)((char *)rec + + changelog_rec_offset(crf, cref)); +} + +/* The name follows the rename, jobid and extra flags extns, if present */ +static inline char *changelog_rec_name(const struct changelog_rec *rec) +{ + enum changelog_rec_extra_flags cref = CLFE_INVALID; + + if (rec->cr_flags & CLF_EXTRA_FLAGS) + cref = (enum changelog_rec_extra_flags) + changelog_rec_extra_flags(rec)->cr_extra_flags; + + return (char *)rec + changelog_rec_offset( + (enum changelog_rec_flags)(rec->cr_flags & CLF_SUPPORTED), + (enum changelog_rec_extra_flags)(cref & CLFE_SUPPORTED)); +} + +static inline char *changelog_rec_sname(const struct changelog_rec *rec) +{ + char *str = changelog_rec_name(rec); + + while (*str != '\0') + str++; + return str + 1; +} + +static inline __kernel_size_t changelog_rec_snamelen(const struct changelog_rec *rec) +{ + return 
strlen(changelog_rec_sname(rec)); +} + +/** + * Remap a record to the desired format as specified by the crf flags. + * The record must be big enough to contain the final remapped version. + * Superfluous extension fields are removed and missing ones are added + * and zeroed. The flags of the record are updated accordingly. + * + * The jobid and rename extensions can be added to a record, to match the + * format an application expects, typically. In this case, the newly added + * fields will be zeroed. + * The Jobid field can be removed, to guarantee compatibility with older + * clients that don't expect this field in the records they process. + * + * The following assumptions are being made: + * - CLF_RENAME will not be removed + * - CLF_JOBID will not be added without CLF_RENAME being added too + * - CLF_EXTRA_FLAGS will not be added without CLF_JOBID being added too + * + * @param[in,out] rec The record to remap. + * @param[in] crf_wanted Flags describing the desired extensions. + * @param[in] cref_want Flags describing the desired extra extensions. + */ +static inline int changelog_remap_rec(struct changelog_rec *rec, + __kernel_size_t rec_size, + enum changelog_rec_flags crf_wanted, + enum changelog_rec_extra_flags cref_want) +{ + char *xattr_mov = NULL; + char *omd_mov = NULL; + char *nid_mov = NULL; + char *uidgid_mov = NULL; + char *ef_mov; + char *jid_mov; + char *rnm_mov; + enum changelog_rec_extra_flags cref = CLFE_INVALID; + + crf_wanted = (enum changelog_rec_flags) + (crf_wanted & CLF_SUPPORTED); + cref_want = (enum changelog_rec_extra_flags) + (cref_want & CLFE_SUPPORTED); + + if ((rec->cr_flags & CLF_SUPPORTED) == crf_wanted) { + if (!(rec->cr_flags & CLF_EXTRA_FLAGS) || + (rec->cr_flags & CLF_EXTRA_FLAGS && + (changelog_rec_extra_flags(rec)->cr_extra_flags & + CLFE_SUPPORTED) == + cref_want)) + return 0; + } + + if ((changelog_rec_offset(crf_wanted, cref_want) + rec->cr_namelen) > + rec_size) + return -EOVERFLOW; + + /* First move the variable-length name field */ + memmove((char *)rec + changelog_rec_offset(crf_wanted, cref_want), + changelog_rec_name(rec), rec->cr_namelen); + + /* Locations of extensions in the remapped record */ + if (rec->cr_flags & CLF_EXTRA_FLAGS) { + xattr_mov = (char *)rec + + changelog_rec_offset( + (enum changelog_rec_flags) + (crf_wanted & CLF_SUPPORTED), + (enum changelog_rec_extra_flags) + (cref_want & ~CLFE_XATTR)); + omd_mov = (char *)rec + + changelog_rec_offset( + (enum changelog_rec_flags) + (crf_wanted & CLF_SUPPORTED), + (enum changelog_rec_extra_flags) + (cref_want & ~(CLFE_OPEN | CLFE_XATTR))); + nid_mov = (char *)rec + + changelog_rec_offset( + (enum changelog_rec_flags) + (crf_wanted & CLF_SUPPORTED), + (enum changelog_rec_extra_flags) + (cref_want & + ~(CLFE_NID | CLFE_OPEN | CLFE_XATTR))); + uidgid_mov = (char *)rec + + changelog_rec_offset( + (enum changelog_rec_flags) + (crf_wanted & CLF_SUPPORTED), + (enum changelog_rec_extra_flags) + (cref_want & ~(CLFE_UIDGID | + CLFE_NID | + CLFE_OPEN | + CLFE_XATTR))); + cref = (enum changelog_rec_extra_flags) + changelog_rec_extra_flags(rec)->cr_extra_flags; + } + + ef_mov = (char *)rec + + changelog_rec_offset( + (enum changelog_rec_flags) + (crf_wanted & ~CLF_EXTRA_FLAGS), CLFE_INVALID); + jid_mov = (char *)rec + + changelog_rec_offset((enum changelog_rec_flags)(crf_wanted & + ~(CLF_EXTRA_FLAGS | CLF_JOBID)), + CLFE_INVALID); + rnm_mov = (char *)rec + + changelog_rec_offset((enum changelog_rec_flags)(crf_wanted & + ~(CLF_EXTRA_FLAGS | + CLF_JOBID | + CLF_RENAME)), + CLFE_INVALID); + + /* 
Move the extension fields to the desired positions */
+	if ((crf_wanted & CLF_EXTRA_FLAGS) &&
+	    (rec->cr_flags & CLF_EXTRA_FLAGS)) {
+		if ((cref_want & CLFE_XATTR) && (cref & CLFE_XATTR))
+			memmove(xattr_mov, changelog_rec_xattr(rec),
+				sizeof(struct changelog_ext_xattr));
+
+		if ((cref_want & CLFE_OPEN) && (cref & CLFE_OPEN))
+			memmove(omd_mov, changelog_rec_openmode(rec),
+				sizeof(struct changelog_ext_openmode));
+
+		if ((cref_want & CLFE_NID) && (cref & CLFE_NID))
+			memmove(nid_mov, changelog_rec_nid(rec),
+				sizeof(struct changelog_ext_nid));
+
+		if ((cref_want & CLFE_UIDGID) && (cref & CLFE_UIDGID))
+			memmove(uidgid_mov, changelog_rec_uidgid(rec),
+				sizeof(struct changelog_ext_uidgid));
+
+		memmove(ef_mov, changelog_rec_extra_flags(rec),
+			sizeof(struct changelog_ext_extra_flags));
+	}
+
+	if ((crf_wanted & CLF_JOBID) && (rec->cr_flags & CLF_JOBID))
+		memmove(jid_mov, changelog_rec_jobid(rec),
+			sizeof(struct changelog_ext_jobid));
+
+	if ((crf_wanted & CLF_RENAME) && (rec->cr_flags & CLF_RENAME))
+		memmove(rnm_mov, changelog_rec_rename(rec),
+			sizeof(struct changelog_ext_rename));
+
+	/* Clear newly added fields */
+	if (xattr_mov && (cref_want & CLFE_XATTR) &&
+	    !(cref & CLFE_XATTR))
+		memset(xattr_mov, 0, sizeof(struct changelog_ext_xattr));
+
+	if (omd_mov && (cref_want & CLFE_OPEN) &&
+	    !(cref & CLFE_OPEN))
+		memset(omd_mov, 0, sizeof(struct changelog_ext_openmode));
+
+	if (nid_mov && (cref_want & CLFE_NID) &&
+	    !(cref & CLFE_NID))
+		memset(nid_mov, 0, sizeof(struct changelog_ext_nid));
+
+	if (uidgid_mov && (cref_want & CLFE_UIDGID) &&
+	    !(cref & CLFE_UIDGID))
+		memset(uidgid_mov, 0, sizeof(struct changelog_ext_uidgid));
+
+	if ((crf_wanted & CLF_EXTRA_FLAGS) &&
+	    !(rec->cr_flags & CLF_EXTRA_FLAGS))
+		memset(ef_mov, 0, sizeof(struct changelog_ext_extra_flags));
+
+	if ((crf_wanted & CLF_JOBID) && !(rec->cr_flags & CLF_JOBID))
+		memset(jid_mov, 0, sizeof(struct changelog_ext_jobid));
+
+	if ((crf_wanted & CLF_RENAME) && !(rec->cr_flags & CLF_RENAME))
+		memset(rnm_mov, 0, sizeof(struct changelog_ext_rename));
+
+	/* Update the record's flags accordingly */
+	rec->cr_flags = (rec->cr_flags & CLF_FLAGMASK) | crf_wanted;
+	if (rec->cr_flags & CLF_EXTRA_FLAGS)
+		changelog_rec_extra_flags(rec)->cr_extra_flags =
+			changelog_rec_extra_flags(rec)->cr_extra_flags |
+			cref_want;
+
+	return 0;
+}
+
+enum changelog_message_type {
+	CL_RECORD = 10, /* message is a changelog_rec */
+	CL_EOF = 11, /* at end of current changelog */
+};
+
+/********* Misc **********/
+
+struct ioc_data_version {
+	__u64 idv_version;
+	__u32 idv_layout_version; /* FLR: layout version for OST objects */
+	__u32 idv_flags; /* enum ioc_data_version_flags */
+};
+
+enum ioc_data_version_flags {
+	LL_DV_RD_FLUSH = (1 << 0), /* Flush dirty pages from clients */
+	LL_DV_WR_FLUSH = (1 << 1), /* Flush all caching pages from clients */
+	LL_DV_SZ_UPDATE = (1 << 2), /* Update the file size on the client */
+};
+
+#ifndef offsetof
+#define offsetof(typ, memb) ((unsigned long)((char *)&(((typ *)0)->memb)))
+#endif
+
+#define dot_lustre_name ".lustre"
+#define dot_fscrypt_name ".fscrypt"
+
+
+/********* HSM **********/
+
+/** HSM per-file state
+ * See HSM_FLAGS below.
+ */
+enum hsm_states {
+	HS_NONE = 0x00000000,
+	HS_EXISTS = 0x00000001,
+	HS_DIRTY = 0x00000002,
+	HS_RELEASED = 0x00000004,
+	HS_ARCHIVED = 0x00000008,
+	HS_NORELEASE = 0x00000010,
+	HS_NOARCHIVE = 0x00000020,
+	HS_LOST = 0x00000040,
+};
+
+/* HSM user-settable flags. */
+#define HSM_USER_MASK (HS_NORELEASE | HS_NOARCHIVE | HS_DIRTY)
+
+/* Other HSM flags.
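+ * e.g. (illustrative) a file that was archived and then released reports
+ * HS_EXISTS | HS_ARCHIVED | HS_RELEASED, all within HSM_STATUS_MASK below.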
+ */
+#define HSM_STATUS_MASK (HS_EXISTS | HS_LOST | HS_RELEASED | HS_ARCHIVED)
+
+/*
+ * All HSM-related possible flags that could be applied to a file.
+ * This should be kept in sync with hsm_states.
+ */
+#define HSM_FLAGS_MASK (HSM_USER_MASK | HSM_STATUS_MASK)
+
+/**
+ * HSM request progress state
+ */
+enum hsm_progress_states {
+	HPS_NONE = 0,
+	HPS_WAITING = 1,
+	HPS_RUNNING = 2,
+	HPS_DONE = 3,
+};
+
+static inline const char *hsm_progress_state2name(enum hsm_progress_states s)
+{
+	switch (s) {
+	case HPS_WAITING: return "waiting";
+	case HPS_RUNNING: return "running";
+	case HPS_DONE: return "done";
+	default: return "unknown";
+	}
+}
+
+struct hsm_extent {
+	__u64 offset;
+	__u64 length;
+} __attribute__((packed));
+
+/**
+ * Current HSM states of a Lustre file.
+ *
+ * This structure's main purpose is to be sent to user space. It describes the
+ * current HSM flags and in-progress action.
+ */
+struct hsm_user_state {
+	/** Current HSM states, from enum hsm_states. */
+	__u32 hus_states;
+	__u32 hus_archive_id;
+	/** The currently undergoing action, if there is one */
+	__u32 hus_in_progress_state;
+	__u32 hus_in_progress_action;
+	struct hsm_extent hus_in_progress_location;
+	char hus_extended_info[];
+};
+
+struct hsm_state_set_ioc {
+	struct lu_fid hssi_fid;
+	__u64 hssi_setmask;
+	__u64 hssi_clearmask;
+};
+
+/*
+ * This structure describes the current in-progress action for a file.
+ * It is returned to user space and sent over the wire.
+ */
+struct hsm_current_action {
+	/** The currently undergoing action, if there is one */
+	/* state is one of hsm_progress_states */
+	__u32 hca_state;
+	/* action is one of hsm_user_action */
+	__u32 hca_action;
+	struct hsm_extent hca_location;
+};
+
+/***** HSM user requests ******/
+/* User-generated (lfs/ioctl) request types */
+enum hsm_user_action {
+	HUA_NONE = 1, /* no action (noop) */
+	HUA_ARCHIVE = 10, /* copy to hsm */
+	HUA_RESTORE = 11, /* prestage */
+	HUA_RELEASE = 12, /* drop ost objects */
+	HUA_REMOVE = 13, /* remove from archive */
+	HUA_CANCEL = 14, /* cancel a request */
+	HUA_IMPORT = 15, /* add a new file */
+};
+
+static inline const char *hsm_user_action2name(enum hsm_user_action a)
+{
+	switch (a) {
+	case HUA_NONE: return "NOOP";
+	case HUA_ARCHIVE: return "ARCHIVE";
+	case HUA_RESTORE: return "RESTORE";
+	case HUA_RELEASE: return "RELEASE";
+	case HUA_REMOVE: return "REMOVE";
+	case HUA_CANCEL: return "CANCEL";
+	case HUA_IMPORT: return "IMPORT";
+	default: return "UNKNOWN";
+	}
+}
+
+/*
+ * List of hr_flags (bit field)
+ */
+#define HSM_FORCE_ACTION 0x0001
+/* used by CT, cannot be set by user */
+#define HSM_GHOST_COPY 0x0002
+
+/**
+ * Contains all the fixed parts of struct hsm_user_request.
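+ *
+ * Illustrative sizing sketch (an assumption drawn from hur_len() and
+ * hur_data() below, not a normative rule): a request carrying n items
+ * and d bytes of opaque data occupies
+ * sizeof(struct hsm_user_request) + n * sizeof(struct hsm_user_item) + d
+ * bytes.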
+ *
+ */
+struct hsm_request {
+	__u32 hr_action; /* enum hsm_user_action */
+	__u32 hr_archive_id; /* archive id, used only with HUA_ARCHIVE */
+	__u64 hr_flags; /* request flags */
+	__u32 hr_itemcount; /* item count in hur_user_item vector */
+	__u32 hr_data_len;
+};
+
+struct hsm_user_item {
+	struct lu_fid hui_fid;
+	struct hsm_extent hui_extent;
+} __attribute__((packed));
+
+struct hsm_user_request {
+	struct hsm_request hur_request;
+	struct hsm_user_item hur_user_item[0];
+	/* extra data blob at the end of the struct (after all
+	 * hur_user_items); only use the helpers to access it
+	 */
+} __attribute__((packed));
+
+/** Return pointer to the data field in an hsm user request */
+static inline void *hur_data(struct hsm_user_request *hur)
+{
+	return &(hur->hur_user_item[hur->hur_request.hr_itemcount]);
+}
+
+/**
+ * Compute the current length of the provided hsm_user_request. This returns -1
+ * instead of an errno because __kernel_ssize_t is defined to be only
+ * [ -1, SSIZE_MAX ].
+ *
+ * Returns -1 on bounds-check error.
+ */
+static inline __kernel_ssize_t hur_len(struct hsm_user_request *hur)
+{
+	__u64 size;
+
+	/* can't overflow a __u64 since hr_itemcount is only __u32 */
+	size = offsetof(struct hsm_user_request, hur_user_item[0]) +
+		(__u64)hur->hur_request.hr_itemcount *
+		sizeof(hur->hur_user_item[0]) + hur->hur_request.hr_data_len;
+
+	if ((__kernel_ssize_t)size < 0)
+		return -1;
+
+	return size;
+}
+
+/****** HSM RPCs to copytool *****/
+/* Message types the copytool may receive */
+enum hsm_message_type {
+	HMT_ACTION_LIST = 100, /* message is a hsm_action_list */
+};
+
+/* Actions the copytool may be instructed to take for a given action_item */
+enum hsm_copytool_action {
+	HSMA_NONE = 10, /* no action */
+	HSMA_ARCHIVE = 20, /* arbitrary offset */
+	HSMA_RESTORE = 21,
+	HSMA_REMOVE = 22,
+	HSMA_CANCEL = 23,
+	HSMA_IMPORT = 24
+};
+
+static inline const char *hsm_copytool_action2name(enum hsm_copytool_action a)
+{
+	switch (a) {
+	case HSMA_NONE: return "NOOP";
+	case HSMA_ARCHIVE: return "ARCHIVE";
+	case HSMA_RESTORE: return "RESTORE";
+	case HSMA_REMOVE: return "REMOVE";
+	case HSMA_CANCEL: return "CANCEL";
+	case HSMA_IMPORT: return "IMPORT";
+	default: return "UNKNOWN";
+	}
+}
+
+/* Copytool item action description */
+struct hsm_action_item {
+	__u32 hai_len; /* valid size of this struct */
+	__u32 hai_action; /* hsm_copytool_action, but use known size */
+	struct lu_fid hai_fid; /* Lustre FID to operate on */
+	struct lu_fid hai_dfid; /* fid used for data access */
+	struct hsm_extent hai_extent; /* byte range to operate on */
+	__u64 hai_cookie; /* action cookie from coordinator */
+	__u64 hai_gid; /* grouplock id */
+	char hai_data[0]; /* variable length */
+} __attribute__((packed));
+
+/**
+ * Helper function which prints in hex the first bytes of the
+ * hai opaque field.
+ *
+ * \param hai [IN] record to print
+ * \param buffer [IN,OUT] buffer to write the hex string to
+ * \param len [IN] max buffer length
+ *
+ * \retval buffer
+ */
+static inline char *hai_dump_data_field(const struct hsm_action_item *hai,
+					char *buffer, __kernel_size_t len)
+{
+	int i;
+	int data_len;
+	char *ptr;
+
+	ptr = buffer;
+	data_len = hai->hai_len - sizeof(*hai);
+	for (i = 0; (i < data_len) && (len > 2); i++) {
+		snprintf(ptr, 3, "%02X", (unsigned char)hai->hai_data[i]);
+		ptr += 2;
+		len -= 2;
+	}
+
+	*ptr = '\0';
+
+	return buffer;
+}
+
+/* Copytool action list */
+#define HAL_VERSION 1
+#define HAL_MAXSIZE LNET_MTU /* bytes, used in userspace only */
+struct hsm_action_list {
+	__u32
hal_version; + __u32 hal_count; /* number of hai's to follow */ + __u64 hal_compound_id; /* returned by coordinator, ignored */ + __u64 hal_flags; + __u32 hal_archive_id; /* which archive backend */ + __u32 padding1; + char hal_fsname[0]; /* null-terminated */ + /* struct hsm_action_item[hal_count] follows, aligned on 8-byte + boundaries. See hai_zero */ +} __attribute__((packed)); + +/* Return pointer to first hai in action list */ +static inline struct hsm_action_item *hai_first(struct hsm_action_list *hal) +{ + __kernel_size_t offset = __ALIGN_KERNEL(strlen(hal->hal_fsname) + 1, 8); + + return (struct hsm_action_item *)(hal->hal_fsname + offset); +} + +/* Return pointer to next hai */ +static inline struct hsm_action_item * hai_next(struct hsm_action_item *hai) +{ + __kernel_size_t offset = __ALIGN_KERNEL(hai->hai_len, 8); + + return (struct hsm_action_item *)((char *)hai + offset); +} + +/* Return size of an hsm_action_list */ +static inline __kernel_size_t hal_size(struct hsm_action_list *hal) +{ + __u32 i; + __kernel_size_t sz; + struct hsm_action_item *hai; + + sz = sizeof(*hal) + __ALIGN_KERNEL(strlen(hal->hal_fsname) + 1, 8); + hai = hai_first(hal); + for (i = 0; i < hal->hal_count ; i++, hai = hai_next(hai)) + sz += __ALIGN_KERNEL(hai->hai_len, 8); + + return sz; +} + +/* HSM file import + * describe the attributes to be set on imported file + */ +struct hsm_user_import { + __u64 hui_size; + __u64 hui_atime; + __u64 hui_mtime; + __u32 hui_atime_ns; + __u32 hui_mtime_ns; + __u32 hui_uid; + __u32 hui_gid; + __u32 hui_mode; + __u32 hui_archive_id; +}; + +/* Copytool progress reporting */ +#define HP_FLAG_COMPLETED 0x01 +#define HP_FLAG_RETRY 0x02 + +struct hsm_progress { + struct lu_fid hp_fid; + __u64 hp_cookie; + struct hsm_extent hp_extent; + __u16 hp_flags; + __u16 hp_errval; /* positive val */ + __u32 padding; +}; + +struct hsm_copy { + __u64 hc_data_version; + __u16 hc_flags; + __u16 hc_errval; /* positive val */ + __u32 padding; + struct hsm_action_item hc_hai; +}; + +enum lu_ladvise_type { + LU_LADVISE_INVALID = 0, + LU_LADVISE_WILLREAD = 1, + LU_LADVISE_DONTNEED = 2, + LU_LADVISE_LOCKNOEXPAND = 3, + LU_LADVISE_LOCKAHEAD = 4, + LU_LADVISE_MAX +}; + +#define LU_LADVISE_NAMES { \ + [LU_LADVISE_WILLREAD] = "willread", \ + [LU_LADVISE_DONTNEED] = "dontneed", \ + [LU_LADVISE_LOCKNOEXPAND] = "locknoexpand", \ + [LU_LADVISE_LOCKAHEAD] = "lockahead", \ +} + +/* This is the userspace argument for ladvise. It is currently the same as + * what goes on the wire (struct lu_ladvise), but is defined separately as we + * may need info which is only used locally. */ +struct llapi_lu_ladvise { + __u16 lla_advice; /* advice type */ + __u16 lla_value1; /* values for different advice types */ + __u32 lla_value2; + __u64 lla_start; /* first byte of extent for advice */ + __u64 lla_end; /* last byte of extent for advice */ + __u32 lla_value3; + __u32 lla_value4; +}; + +enum ladvise_flag { + LF_ASYNC = 0x00000001, + LF_UNSET = 0x00000002, +}; + +#define LADVISE_MAGIC 0x1ADF1CE0 +/* Masks of valid flags for each advice */ +#define LF_LOCKNOEXPAND_MASK LF_UNSET +/* Flags valid for all advices not explicitly specified */ +#define LF_DEFAULT_MASK LF_ASYNC +/* All flags */ +#define LF_MASK (LF_ASYNC | LF_UNSET) + +#define lla_lockahead_mode lla_value1 +#define lla_peradvice_flags lla_value2 +#define lla_lockahead_result lla_value3 + +/* This is the userspace argument for ladvise, corresponds to ladvise_hdr which + * is used on the wire. 
It is defined separately as we may need info which is + * only used locally. */ +struct llapi_ladvise_hdr { + __u32 lah_magic; /* LADVISE_MAGIC */ + __u32 lah_count; /* number of advices */ + __u64 lah_flags; /* from enum ladvise_flag */ + __u32 lah_value1; /* unused */ + __u32 lah_value2; /* unused */ + __u64 lah_value3; /* unused */ + struct llapi_lu_ladvise lah_advise[0]; /* advices in this header */ +}; + +#define LAH_COUNT_MAX (1024) + +/* Shared key */ +enum sk_crypt_alg { + SK_CRYPT_INVALID = -1, + SK_CRYPT_EMPTY = 0, + SK_CRYPT_AES256_CTR = 1, +}; + +enum sk_hmac_alg { + SK_HMAC_INVALID = -1, + SK_HMAC_EMPTY = 0, + SK_HMAC_SHA256 = 1, + SK_HMAC_SHA512 = 2, +}; + +struct sk_crypt_type { + const char *sct_name; + int sct_type; +}; + +struct sk_hmac_type { + const char *sht_name; + int sht_type; +}; + +enum lock_mode_user { + MODE_READ_USER = 1, + MODE_WRITE_USER, + MODE_MAX_USER, +}; + +#define LOCK_MODE_NAMES { \ + [MODE_READ_USER] = "READ",\ + [MODE_WRITE_USER] = "WRITE"\ +} + +enum lockahead_results { + LLA_RESULT_SENT = 0, + LLA_RESULT_DIFFERENT, + LLA_RESULT_SAME, +}; + +enum lu_heat_flag_bit { + LU_HEAT_FLAG_BIT_INVALID = 0, + LU_HEAT_FLAG_BIT_OFF, + LU_HEAT_FLAG_BIT_CLEAR, +}; + +enum lu_heat_flag { + LU_HEAT_FLAG_OFF = 1ULL << LU_HEAT_FLAG_BIT_OFF, + LU_HEAT_FLAG_CLEAR = 1ULL << LU_HEAT_FLAG_BIT_CLEAR, +}; + +enum obd_heat_type { + OBD_HEAT_READSAMPLE = 0, + OBD_HEAT_WRITESAMPLE = 1, + OBD_HEAT_READBYTE = 2, + OBD_HEAT_WRITEBYTE = 3, + OBD_HEAT_COUNT +}; + +#define LU_HEAT_NAMES { \ + [OBD_HEAT_READSAMPLE] = "readsample", \ + [OBD_HEAT_WRITESAMPLE] = "writesample", \ + [OBD_HEAT_READBYTE] = "readbyte", \ + [OBD_HEAT_WRITEBYTE] = "writebyte", \ +} + +struct lu_heat { + __u32 lh_count; + __u32 lh_flags; + __u64 lh_heat[0]; +}; + +enum lu_pcc_type { + LU_PCC_NONE = 0, + LU_PCC_READWRITE, + LU_PCC_MAX +}; + +static inline const char *pcc_type2string(enum lu_pcc_type type) +{ + switch (type) { + case LU_PCC_NONE: + return "none"; + case LU_PCC_READWRITE: + return "readwrite"; + default: + return "fault"; + } +} + +struct lu_pcc_attach { + __u32 pcca_type; /* PCC type */ + __u32 pcca_id; /* archive ID for readwrite, group ID for readonly */ +}; + +enum lu_pcc_detach_opts { + PCC_DETACH_OPT_NONE = 0, /* Detach only, keep the PCC copy */ + PCC_DETACH_OPT_UNCACHE, /* Remove the cached file after detach */ +}; + +struct lu_pcc_detach_fid { + /* fid of the file to detach */ + struct lu_fid pccd_fid; + __u32 pccd_opt; +}; + +struct lu_pcc_detach { + __u32 pccd_opt; +}; + +enum lu_pcc_state_flags { + PCC_STATE_FL_NONE = 0x0, + /* The inode attr is cached locally */ + PCC_STATE_FL_ATTR_VALID = 0x01, + /* The file is being attached into PCC */ + PCC_STATE_FL_ATTACHING = 0x02, +}; + +struct lu_pcc_state { + __u32 pccs_type; /* enum lu_pcc_type */ + __u32 pccs_open_count; + __u32 pccs_flags; /* enum lu_pcc_state_flags */ + __u32 pccs_padding; + char pccs_path[PATH_MAX]; +}; + +enum lu_project_type { + LU_PROJECT_NONE = 0, + LU_PROJECT_SET, + LU_PROJECT_GET, + LU_PROJECT_MAX +}; + +struct lu_project { + __u32 project_type; /* enum lu_project_type */ + __u32 project_id; + __u32 project_xflags; + __u32 project_reserved; + char project_name[NAME_MAX + 1]; +}; + +struct fid_array { + __u32 fa_nr; + /* make header's size equal lu_fid */ + __u32 fa_padding0; + __u64 fa_padding1; + struct lu_fid fa_fids[0]; +}; +#define OBD_MAX_FIDS_IN_ARRAY 4096 + +/* more types could be defined upon need for more complex + * format to be used in foreign symlink LOV/LMV EAs, like + * one to describe a delimiter 
string and occurence number + * of delimited sub-string, ... + */ +enum ll_foreign_symlink_upcall_item_type { + EOB_TYPE = 1, + STRING_TYPE = 2, + POSLEN_TYPE = 3, +}; + +/* may need to be modified to allow for more format items to be defined, and + * like for ll_foreign_symlink_upcall_item_type enum + */ +struct ll_foreign_symlink_upcall_item { + __u32 type; + union { + struct { + __u32 pos; + __u32 len; + }; + struct { + size_t size; + union { + /* internal storage of constant string */ + char *string; + /* upcall stores constant string in a raw */ + char bytestring[0]; + }; + }; + }; +}; + +#define POSLEN_ITEM_SZ (offsetof(struct ll_foreign_symlink_upcall_item, len) + \ + sizeof(((struct ll_foreign_symlink_upcall_item *)0)->len)) +#define STRING_ITEM_SZ(sz) ( \ + offsetof(struct ll_foreign_symlink_upcall_item, bytestring) + \ + (sz + sizeof(__u32) - 1) / sizeof(__u32) * sizeof(__u32)) + +/* presently limited to not cause max stack frame size to be reached + * because of temporary automatic array of + * "struct ll_foreign_symlink_upcall_item" presently used in + * foreign_symlink_upcall_info_store() + */ +#define MAX_NB_UPCALL_ITEMS 32 + +#if defined(__cplusplus) +} +#endif + +/** @} lustreuser */ + +#endif /* _LUSTRE_USER_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ver.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ver.h new file mode 100644 index 0000000000000..5983a5ba1b366 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ver.h @@ -0,0 +1,33 @@ +#ifndef _LUSTRE_VER_H_ +#define _LUSTRE_VER_H_ + +/* + * LUSTRE_VERSION_STRING + * + * Note that some files may seem to include this header unnecessarily. + * If the file uses LUSTRE_VERSION_STRING, it is likely doing the include + * for compatibility with the Lustre code in the Linux kernel. + * In the Linux kernel, they are likely hard coding LUSTRE_VERSION_STRING + * right here in this file. The out-of-kernel Lustre code generates + * LUSTRE_VERSION_STRING in autoconf with AC_DEFINE. + */ + +#define OBD_OCD_VERSION(major, minor, patch, fix) \ + (((major) << 24) + ((minor) << 16) + ((patch) << 8) + (fix)) + +#define OBD_OCD_VERSION_MAJOR(version) ((int)((version) >> 24) & 255) +#define OBD_OCD_VERSION_MINOR(version) ((int)((version) >> 16) & 255) +#define OBD_OCD_VERSION_PATCH(version) ((int)((version) >> 8) & 255) +#define OBD_OCD_VERSION_FIX(version) ((int)((version) >> 0) & 255) + +#define LUSTRE_VERSION_CODE \ + OBD_OCD_VERSION(LUSTRE_MAJOR, LUSTRE_MINOR, LUSTRE_PATCH, LUSTRE_FIX) + +/* If lustre version of client and servers it connects to differs by more + * than this amount, client would issue a warning. + * (set in lustre/autoconf/lustre-version.ac) */ +#define LUSTRE_VERSION_OFFSET_WARN OBD_OCD_VERSION(0, 3, 50, 0) + +extern int allow_version_mismatch; + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/upcall_cache.h b/drivers/staging/lustrefsx/lustre/include/upcall_cache.h new file mode 100644 index 0000000000000..6ac4a40185b5e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/upcall_cache.h @@ -0,0 +1,153 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
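Stepping back to the OBD_OCD_VERSION packing in lustre_ver.h above: each component occupies one byte, so encoding and decoding are simple shifts and masks. A standalone check (the macro is copied from the header; the version numbers are arbitrary examples):

#include <stdio.h>

#define OBD_OCD_VERSION(major, minor, patch, fix) \
	(((major) << 24) + ((minor) << 16) + ((patch) << 8) + (fix))

int main(void)
{
	unsigned int v = OBD_OCD_VERSION(2, 15, 3, 0);

	/* prints: 0x20f0300 -> 2.15.3.0 */
	printf("%#x -> %u.%u.%u.%u\n", v,
	       (v >> 24) & 255, (v >> 16) & 255,
	       (v >> 8) & 255, v & 255);
	return 0;
}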
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef _UPCALL_CACHE_H +#define _UPCALL_CACHE_H + +#include +#include + +/** \defgroup ucache ucache + * + * @{ + */ + +#define UC_CACHE_NEW 0x01 +#define UC_CACHE_ACQUIRING 0x02 +#define UC_CACHE_INVALID 0x04 +#define UC_CACHE_EXPIRED 0x08 + +#define UC_CACHE_IS_NEW(i) ((i)->ue_flags & UC_CACHE_NEW) +#define UC_CACHE_IS_INVALID(i) ((i)->ue_flags & UC_CACHE_INVALID) +#define UC_CACHE_IS_ACQUIRING(i) ((i)->ue_flags & UC_CACHE_ACQUIRING) +#define UC_CACHE_IS_EXPIRED(i) ((i)->ue_flags & UC_CACHE_EXPIRED) +#define UC_CACHE_IS_VALID(i) ((i)->ue_flags == 0) + +#define UC_CACHE_SET_NEW(i) ((i)->ue_flags |= UC_CACHE_NEW) +#define UC_CACHE_SET_INVALID(i) ((i)->ue_flags |= UC_CACHE_INVALID) +#define UC_CACHE_SET_ACQUIRING(i) ((i)->ue_flags |= UC_CACHE_ACQUIRING) +#define UC_CACHE_SET_EXPIRED(i) ((i)->ue_flags |= UC_CACHE_EXPIRED) +#define UC_CACHE_SET_VALID(i) ((i)->ue_flags = 0) + +#define UC_CACHE_CLEAR_NEW(i) ((i)->ue_flags &= ~UC_CACHE_NEW) +#define UC_CACHE_CLEAR_ACQUIRING(i) ((i)->ue_flags &= ~UC_CACHE_ACQUIRING) +#define UC_CACHE_CLEAR_INVALID(i) ((i)->ue_flags &= ~UC_CACHE_INVALID) +#define UC_CACHE_CLEAR_EXPIRED(i) ((i)->ue_flags &= ~UC_CACHE_EXPIRED) + +struct upcall_cache_entry; + +struct md_perm { + lnet_nid_t mp_nid; + uint32_t mp_perm; +}; + +struct md_identity { + struct upcall_cache_entry *mi_uc_entry; + uid_t mi_uid; + gid_t mi_gid; + struct group_info *mi_ginfo; + int mi_nperms; + struct md_perm *mi_perms; +}; + +struct upcall_cache_entry { + struct list_head ue_hash; + uint64_t ue_key; + atomic_t ue_refcount; + int ue_flags; + wait_queue_head_t ue_waitq; + time64_t ue_acquire_expire; + time64_t ue_expire; + union { + struct md_identity identity; + } u; +}; + +#define UC_CACHE_HASH_SIZE (128) +#define UC_CACHE_HASH_INDEX(id) ((id) & (UC_CACHE_HASH_SIZE - 1)) +#define UC_CACHE_UPCALL_MAXPATH (1024UL) + +struct upcall_cache; + +struct upcall_cache_ops { + void (*init_entry)(struct upcall_cache_entry *, void *args); + void (*free_entry)(struct upcall_cache *, + struct upcall_cache_entry *); + int (*upcall_compare)(struct upcall_cache *, + struct upcall_cache_entry *, + __u64 key, void *args); + int (*downcall_compare)(struct upcall_cache *, + struct upcall_cache_entry *, + __u64 key, void *args); + int (*do_upcall)(struct upcall_cache *, + struct upcall_cache_entry *); + int (*parse_downcall)(struct upcall_cache *, + struct upcall_cache_entry *, void *); +}; + +struct upcall_cache { + struct list_head uc_hashtable[UC_CACHE_HASH_SIZE]; + spinlock_t uc_lock; + struct rw_semaphore uc_upcall_rwsem; + + char uc_name[40]; /* for upcall */ + char uc_upcall[UC_CACHE_UPCALL_MAXPATH]; + time64_t uc_acquire_expire; /* seconds */ + time64_t uc_entry_expire; /* seconds */ + struct upcall_cache_ops *uc_ops; +}; + +struct 
upcall_cache_entry *upcall_cache_get_entry(struct upcall_cache *cache, + __u64 key, void *args); +void upcall_cache_put_entry(struct upcall_cache *cache, + struct upcall_cache_entry *entry); +int upcall_cache_downcall(struct upcall_cache *cache, __u32 err, __u64 key, + void *args); +void upcall_cache_flush(struct upcall_cache *cache, int force); + +static inline void upcall_cache_flush_idle(struct upcall_cache *cache) +{ + upcall_cache_flush(cache, 0); +} + +static inline void upcall_cache_flush_all(struct upcall_cache *cache) +{ + upcall_cache_flush(cache, 1); +} + +void upcall_cache_flush_one(struct upcall_cache *cache, __u64 key, void *args); +struct upcall_cache *upcall_cache_init(const char *name, const char *upcall, + struct upcall_cache_ops *ops); +void upcall_cache_cleanup(struct upcall_cache *cache); + +/** @} ucache */ + +#endif /* _UPCALL_CACHE_H */ diff --git a/drivers/staging/lustrefsx/lustre/ldlm/l_lock.c b/drivers/staging/lustrefsx/lustre/ldlm/l_lock.c new file mode 100644 index 0000000000000..0c10b3098276e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/l_lock.c @@ -0,0 +1,73 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_LDLM +#include + +#include +#include + +/** + * Lock a lock and its resource. + * + * LDLM locking uses resource to serialize access to locks + * but there is a case when we change resource of lock upon + * enqueue reply. We rely on rcu_assign_pointer(lock->l_resource, new_res) + * being an atomic operation. + */ +struct ldlm_resource *lock_res_and_lock(struct ldlm_lock *lock) +{ + struct ldlm_resource *res; + + rcu_read_lock(); + while (1) { + res = rcu_dereference(lock->l_resource); + lock_res(res); + if (res == lock->l_resource) { + ldlm_set_res_locked(lock); + rcu_read_unlock(); + return res; + } + unlock_res(res); + } +} +EXPORT_SYMBOL(lock_res_and_lock); + +/** + * Unlock a lock and its resource previously locked with lock_res_and_lock + */ +void unlock_res_and_lock(struct ldlm_lock *lock) +{ + ldlm_clear_res_locked(lock); + + unlock_res(lock->l_resource); +} +EXPORT_SYMBOL(unlock_res_and_lock); diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_extent.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_extent.c new file mode 100644 index 0000000000000..7d8e5e2de885a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_extent.c @@ -0,0 +1,1095 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
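The upcall cache declared above is a fill-on-miss cache: upcall_cache_get_entry() fires ->do_upcall() for a key it does not hold and sleeps on ue_waitq until the userspace helper answers through upcall_cache_downcall(), or the acquire timeout expires. A minimal sketch of the lookup side, assuming the usual ERR_PTR error convention (variable names are illustrative):

	struct upcall_cache_entry *entry;

	/* Look up, or trigger the upcall to populate, the identity
	 * for this uid; may block waiting for the downcall. */
	entry = upcall_cache_get_entry(cache, (__u64)uid, NULL);
	if (!IS_ERR_OR_NULL(entry)) {
		/* ... consume entry->u.identity ... */
		upcall_cache_put_entry(cache, entry);
	}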
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ldlm/ldlm_extent.c + * + * Author: Peter Braam + * Author: Phil Schwan + */ + +/** + * This file contains implementation of EXTENT lock type + * + * EXTENT lock type is for locking a contiguous range of values, represented + * by 64-bit starting and ending offsets (inclusive). There are several extent + * lock modes, some of which may be mutually incompatible. Extent locks are + * considered incompatible if their modes are incompatible and their extents + * intersect. See the lock mode compatibility matrix in lustre_dlm.h. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include +#include +#include +#include + +#include "ldlm_internal.h" + +#ifdef HAVE_SERVER_SUPPORT +# define LDLM_MAX_GROWN_EXTENT (32 * 1024 * 1024 - 1) + +/** + * Fix up the ldlm_extent after expanding it. + * + * After expansion has been done, we might still want to do certain adjusting + * based on overall contention of the resource and the like to avoid granting + * overly wide locks. + */ +static void ldlm_extent_internal_policy_fixup(struct ldlm_lock *req, + struct ldlm_extent *new_ex, + int conflicting) +{ + enum ldlm_mode req_mode = req->l_req_mode; + __u64 req_start = req->l_req_extent.start; + __u64 req_end = req->l_req_extent.end; + __u64 req_align, mask; + + if (conflicting > 32 && (req_mode == LCK_PW || req_mode == LCK_CW)) { + if (req_end < req_start + LDLM_MAX_GROWN_EXTENT) + new_ex->end = min(req_start + LDLM_MAX_GROWN_EXTENT, + new_ex->end); + } + + if (new_ex->start == 0 && new_ex->end == OBD_OBJECT_EOF) { + EXIT; + return; + } + + /* we need to ensure that the lock extent is properly aligned to what + * the client requested. Also we need to make sure it's also server + * page size aligned otherwise a server page can be covered by two + * write locks. */ + mask = PAGE_SIZE; + req_align = (req_end + 1) | req_start; + if (req_align != 0 && (req_align & (mask - 1)) == 0) { + while ((req_align & mask) == 0) + mask <<= 1; + } + mask -= 1; + /* We can only shrink the lock, not grow it. + * This should never cause lock to be smaller than requested, + * since requested lock was already aligned on these boundaries. 
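A worked example of this rounding, assuming PAGE_SIZE == 4096 so that mask ends up as 4095 for a page-aligned request: take a request of [8192, 12287] whose extent was speculatively expanded to [0, 20000]. Then

	start: ((0     - 1) | 4095) + 1 = 0      (relies on unsigned wraparound at 0)
	end:   ((20000 + 1) & ~4095) - 1 = 16383

and the grant becomes [0, 16383]: still page-aligned, still covering the request, shrunk only at the unaligned upper edge.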
*/ + new_ex->start = ((new_ex->start - 1) | mask) + 1; + new_ex->end = ((new_ex->end + 1) & ~mask) - 1; + LASSERTF(new_ex->start <= req_start, + "mask %#llx grant start %llu req start %llu\n", + mask, new_ex->start, req_start); + LASSERTF(new_ex->end >= req_end, + "mask %#llx grant end %llu req end %llu\n", + mask, new_ex->end, req_end); +} + +/** + * Return the maximum extent that: + * - contains the requested extent + * - does not overlap existing conflicting extents outside the requested one + * + * This allows clients to request a small required extent range, but if there + * is no contention on the lock the full lock can be granted to the client. + * This avoids the need for many smaller lock requests to be granted in the + * common (uncontended) case. + * + * Use interval tree to expand the lock extent for granted lock. + */ +static void ldlm_extent_internal_policy_granted(struct ldlm_lock *req, + struct ldlm_extent *new_ex) +{ + struct ldlm_resource *res = req->l_resource; + enum ldlm_mode req_mode = req->l_req_mode; + __u64 req_start = req->l_req_extent.start; + __u64 req_end = req->l_req_extent.end; + struct ldlm_interval_tree *tree; + struct interval_node_extent limiter = { + .start = new_ex->start, + .end = new_ex->end, + }; + int conflicting = 0; + int idx; + ENTRY; + + lockmode_verify(req_mode); + + /* Using interval tree to handle the LDLM extent granted locks. */ + for (idx = 0; idx < LCK_MODE_NUM; idx++) { + struct interval_node_extent ext = { + .start = req_start, + .end = req_end, + }; + + tree = &res->lr_itree[idx]; + if (lockmode_compat(tree->lit_mode, req_mode)) + continue; + + conflicting += tree->lit_size; + if (conflicting > 4) + limiter.start = req_start; + + if (interval_is_overlapped(tree->lit_root, &ext)) + CDEBUG(D_INFO, + "req_mode = %d, tree->lit_mode = %d, " + "tree->lit_size = %d\n", + req_mode, tree->lit_mode, tree->lit_size); + interval_expand(tree->lit_root, &ext, &limiter); + limiter.start = max(limiter.start, ext.start); + limiter.end = min(limiter.end, ext.end); + if (limiter.start == req_start && limiter.end == req_end) + break; + } + + new_ex->start = limiter.start; + new_ex->end = limiter.end; + LASSERT(new_ex->start <= req_start); + LASSERT(new_ex->end >= req_end); + + ldlm_extent_internal_policy_fixup(req, new_ex, conflicting); + EXIT; +} + +/* The purpose of this function is to return: + * - the maximum extent + * - containing the requested extent + * - and not overlapping existing conflicting extents outside the requested one + */ +static void +ldlm_extent_internal_policy_waiting(struct ldlm_lock *req, + struct ldlm_extent *new_ex) +{ + struct ldlm_resource *res = req->l_resource; + enum ldlm_mode req_mode = req->l_req_mode; + __u64 req_start = req->l_req_extent.start; + __u64 req_end = req->l_req_extent.end; + struct ldlm_lock *lock; + int conflicting = 0; + ENTRY; + + lockmode_verify(req_mode); + + /* for waiting locks */ + list_for_each_entry(lock, &res->lr_waiting, l_res_link) { + struct ldlm_extent *l_extent = &lock->l_policy_data.l_extent; + + /* We already hit the minimum requested size, search no more */ + if (new_ex->start == req_start && new_ex->end == req_end) { + EXIT; + return; + } + + /* Don't conflict with ourselves */ + if (req == lock) + continue; + + /* Locks are compatible, overlap doesn't matter */ + /* Until bug 20 is fixed, try to avoid granting overlapping + * locks on one client (they take a long time to cancel) */ + if (lockmode_compat(lock->l_req_mode, req_mode) && + lock->l_export != req->l_export) + continue; + + /* 
If this is a high-traffic lock, don't grow downwards at all + * or grow upwards too much */ + ++conflicting; + if (conflicting > 4) + new_ex->start = req_start; + + /* If lock doesn't overlap new_ex, skip it. */ + if (!ldlm_extent_overlap(l_extent, new_ex)) + continue; + + /* Locks conflicting in requested extents and we can't satisfy + * both locks, so ignore it. Either we will ping-pong this + * extent (we would regardless of what extent we granted) or + * lock is unused and it shouldn't limit our extent growth. */ + if (ldlm_extent_overlap(&lock->l_req_extent,&req->l_req_extent)) + continue; + + /* We grow extents downwards only as far as they don't overlap + * with already-granted locks, on the assumption that clients + * will be writing beyond the initial requested end and would + * then need to enqueue a new lock beyond previous request. + * l_req_extent->end strictly < req_start, checked above. */ + if (l_extent->start < req_start && new_ex->start != req_start) { + if (l_extent->end >= req_start) + new_ex->start = req_start; + else + new_ex->start = min(l_extent->end+1, req_start); + } + + /* If we need to cancel this lock anyways because our request + * overlaps the granted lock, we grow up to its requested + * extent start instead of limiting this extent, assuming that + * clients are writing forwards and the lock had over grown + * its extent downwards before we enqueued our request. */ + if (l_extent->end > req_end) { + if (l_extent->start <= req_end) + new_ex->end = max(lock->l_req_extent.start - 1, + req_end); + else + new_ex->end = max(l_extent->start - 1, req_end); + } + } + + ldlm_extent_internal_policy_fixup(req, new_ex, conflicting); + EXIT; +} + + +/* In order to determine the largest possible extent we can grant, we need + * to scan all of the queues. */ +static void ldlm_extent_policy(struct ldlm_resource *res, + struct ldlm_lock *lock, __u64 *flags) +{ + struct ldlm_extent new_ex = { .start = 0, .end = OBD_OBJECT_EOF }; + + if (lock->l_export == NULL) + /* + * this is a local lock taken by server (e.g., as a part of + * OST-side locking, or unlink handling). Expansion doesn't + * make a lot of sense for local locks, because they are + * dropped immediately on operation completion and would only + * conflict with other threads. + */ + return; + + if (lock->l_policy_data.l_extent.start == 0 && + lock->l_policy_data.l_extent.end == OBD_OBJECT_EOF) + /* fast-path whole file locks */ + return; + + /* Because reprocess_queue zeroes flags and uses it to return + * LDLM_FL_LOCK_CHANGED, we must check for the NO_EXPANSION flag + * in the lock flags rather than the 'flags' argument */ + if (likely(!(lock->l_flags & LDLM_FL_NO_EXPANSION))) { + ldlm_extent_internal_policy_granted(lock, &new_ex); + ldlm_extent_internal_policy_waiting(lock, &new_ex); + } else { + LDLM_DEBUG(lock, "Not expanding manually requested lock.\n"); + new_ex.start = lock->l_policy_data.l_extent.start; + new_ex.end = lock->l_policy_data.l_extent.end; + /* In case the request is not on correct boundaries, we call + * fixup. 
(normally called in ldlm_extent_internal_policy_*) */ + ldlm_extent_internal_policy_fixup(lock, &new_ex, 0); + } + + if (!ldlm_extent_equal(&new_ex, &lock->l_policy_data.l_extent)) { + *flags |= LDLM_FL_LOCK_CHANGED; + lock->l_policy_data.l_extent.start = new_ex.start; + lock->l_policy_data.l_extent.end = new_ex.end; + } +} + +static bool ldlm_check_contention(struct ldlm_lock *lock, int contended_locks) +{ + struct ldlm_resource *res = lock->l_resource; + time64_t now = ktime_get_seconds(); + + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_SET_CONTENTION)) + return true; + + CDEBUG(D_DLMTRACE, "contended locks = %d\n", contended_locks); + if (contended_locks > ldlm_res_to_ns(res)->ns_contended_locks) + res->lr_contention_time = now; + + return now < res->lr_contention_time + + ldlm_res_to_ns(res)->ns_contention_time; +} + +struct ldlm_extent_compat_args { + struct list_head *work_list; + struct ldlm_lock *lock; + enum ldlm_mode mode; + int *locks; + int *compat; +}; + +static enum interval_iter ldlm_extent_compat_cb(struct interval_node *n, + void *data) +{ + struct ldlm_extent_compat_args *priv = data; + struct ldlm_interval *node = to_ldlm_interval(n); + struct ldlm_extent *extent; + struct list_head *work_list = priv->work_list; + struct ldlm_lock *lock, *enq = priv->lock; + enum ldlm_mode mode = priv->mode; + int count = 0; + ENTRY; + + LASSERT(!list_empty(&node->li_group)); + + list_for_each_entry(lock, &node->li_group, l_sl_policy) { + /* interval tree is for granted lock */ + LASSERTF(mode == lock->l_granted_mode, + "mode = %s, lock->l_granted_mode = %s\n", + ldlm_lockname[mode], + ldlm_lockname[lock->l_granted_mode]); + count++; + if (lock->l_blocking_ast && + lock->l_granted_mode != LCK_GROUP) + ldlm_add_ast_work_item(lock, enq, work_list); + } + + /* don't count conflicting glimpse locks */ + extent = ldlm_interval_extent(node); + if (!(mode == LCK_PR && + extent->start == 0 && extent->end == OBD_OBJECT_EOF)) + *priv->locks += count; + + if (priv->compat) + *priv->compat = 0; + + RETURN(INTERVAL_ITER_CONT); +} + +/** + * Determine if the lock is compatible with all locks on the queue. + * + * If \a work_list is provided, conflicting locks are linked there. + * If \a work_list is not provided, we exit this function on first conflict. 
+ * + * \retval 0 if the lock is not compatible + * \retval 1 if the lock is compatible + * \retval 2 if \a req is a group lock and it is compatible and requires + * no further checking + * \retval negative error, such as EAGAIN for group locks + */ +static int +ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req, + __u64 *flags, struct list_head *work_list, + int *contended_locks) +{ + struct ldlm_resource *res = req->l_resource; + enum ldlm_mode req_mode = req->l_req_mode; + __u64 req_start = req->l_req_extent.start; + __u64 req_end = req->l_req_extent.end; + struct ldlm_lock *lock; + int check_contention; + int compat = 1; + ENTRY; + + lockmode_verify(req_mode); + + /* Using interval tree for granted lock */ + if (queue == &res->lr_granted) { + struct ldlm_interval_tree *tree; + struct ldlm_extent_compat_args data = {.work_list = work_list, + .lock = req, + .locks = contended_locks, + .compat = &compat }; + struct interval_node_extent ex = { .start = req_start, + .end = req_end }; + int idx, rc; + + for (idx = 0; idx < LCK_MODE_NUM; idx++) { + tree = &res->lr_itree[idx]; + if (tree->lit_root == NULL) /* empty tree, skipped */ + continue; + + data.mode = tree->lit_mode; + if (lockmode_compat(req_mode, tree->lit_mode)) { + struct ldlm_interval *node; + struct ldlm_extent *extent; + + if (req_mode != LCK_GROUP) + continue; + + /* group lock, grant it immediately if + * compatible */ + node = to_ldlm_interval(tree->lit_root); + extent = ldlm_interval_extent(node); + if (req->l_policy_data.l_extent.gid == + extent->gid) + RETURN(2); + } + + if (tree->lit_mode == LCK_GROUP) { + if (*flags & (LDLM_FL_BLOCK_NOWAIT | + LDLM_FL_SPECULATIVE)) { + compat = -EAGAIN; + goto destroylock; + } + + if (!work_list) + RETURN(0); + + /* if work list is not NULL,add all + locks in the tree to work list */ + compat = 0; + interval_iterate(tree->lit_root, + ldlm_extent_compat_cb, &data); + continue; + } + + /* We've found a potentially blocking lock, check + * compatibility. This handles locks other than GROUP + * locks, which are handled separately above. + * + * Locks with FL_SPECULATIVE are asynchronous requests + * which must never wait behind another lock, so they + * fail if any conflicting lock is found. */ + if (!work_list || (*flags & LDLM_FL_SPECULATIVE)) { + rc = interval_is_overlapped(tree->lit_root, + &ex); + if (rc) { + if (!work_list) { + RETURN(0); + } else { + compat = -EAGAIN; + goto destroylock; + } + } + } else { + interval_search(tree->lit_root, &ex, + ldlm_extent_compat_cb, &data); + if (!list_empty(work_list) && compat) + compat = 0; + } + } + } else { /* for waiting queue */ + list_for_each_entry(lock, queue, l_res_link) { + check_contention = 1; + + /* We stop walking the queue if we hit ourselves so + * we don't take conflicting locks enqueued after us + * into account, or we'd wait forever. */ + if (req == lock) + break; + + /* locks are compatible, overlap doesn't matter */ + if (lockmode_compat(lock->l_req_mode, req_mode)) { + if (req_mode == LCK_PR && + ((lock->l_policy_data.l_extent.start <= + req->l_policy_data.l_extent.start) && + (lock->l_policy_data.l_extent.end >= + req->l_policy_data.l_extent.end))) { + /* If we met a PR lock just like us or + wider, and nobody down the list + conflicted with it, that means we + can skip processing of the rest of + the list and safely place ourselves + at the end of the list, or grant + (dependent if we met an conflicting + locks before in the list). 
In case + of 1st enqueue only we continue + traversing if there is something + conflicting down the list because + we need to make sure that something + is marked as AST_SENT as well, in + cse of empy worklist we would exit + on first conflict met. */ + /* There IS a case where such flag is + not set for a lock, yet it blocks + something. Luckily for us this is + only during destroy, so lock is + exclusive. So here we are safe */ + if (!ldlm_is_ast_sent(lock)) + RETURN(compat); + } + + /* non-group locks are compatible, overlap doesn't + matter */ + if (likely(req_mode != LCK_GROUP)) + continue; + + /* If we are trying to get a GROUP lock and there is + another one of this kind, we need to compare gid */ + if (req->l_policy_data.l_extent.gid == + lock->l_policy_data.l_extent.gid) { + /* If existing lock with matched gid is granted, + we grant new one too. */ + if (ldlm_is_granted(lock)) + RETURN(2); + + /* Otherwise we are scanning queue of waiting + * locks and it means current request would + * block along with existing lock (that is + * already blocked. + * If we are in nonblocking mode - return + * immediately */ + if (*flags & (LDLM_FL_BLOCK_NOWAIT + | LDLM_FL_SPECULATIVE)) { + compat = -EAGAIN; + goto destroylock; + } + /* If this group lock is compatible with another + * group lock on the waiting list, they must be + * together in the list, so they can be granted + * at the same time. Otherwise the later lock + * can get stuck behind another, incompatible, + * lock. */ + ldlm_resource_insert_lock_after(lock, req); + /* Because 'lock' is not granted, we can stop + * processing this queue and return immediately. + * There is no need to check the rest of the + * list. */ + RETURN(0); + } + } + + if (unlikely(req_mode == LCK_GROUP && + !ldlm_is_granted(lock))) { + compat = 0; + if (lock->l_req_mode != LCK_GROUP) { + /* Ok, we hit non-GROUP lock, there should be no + more GROUP locks later on, queue in front of + first non-GROUP lock */ + + ldlm_resource_insert_lock_before(lock, req); + break; + } + LASSERT(req->l_policy_data.l_extent.gid != + lock->l_policy_data.l_extent.gid); + continue; + } + + if (unlikely(lock->l_req_mode == LCK_GROUP)) { + /* If compared lock is GROUP, then requested is + * PR/PW so this is not compatible; extent + * range does not matter */ + if (*flags & (LDLM_FL_BLOCK_NOWAIT + | LDLM_FL_SPECULATIVE)) { + compat = -EAGAIN; + goto destroylock; + } + } else if (lock->l_policy_data.l_extent.end < req_start || + lock->l_policy_data.l_extent.start > req_end) { + /* if a non group lock doesn't overlap skip it */ + continue; + } else if (lock->l_req_extent.end < req_start || + lock->l_req_extent.start > req_end) { + /* false contention, the requests doesn't really overlap */ + check_contention = 0; + } + + if (!work_list) + RETURN(0); + + if (*flags & LDLM_FL_SPECULATIVE) { + compat = -EAGAIN; + goto destroylock; + } + + /* don't count conflicting glimpse locks */ + if (lock->l_req_mode == LCK_PR && + lock->l_policy_data.l_extent.start == 0 && + lock->l_policy_data.l_extent.end == OBD_OBJECT_EOF) + check_contention = 0; + + *contended_locks += check_contention; + + compat = 0; + if (lock->l_blocking_ast && + lock->l_req_mode != LCK_GROUP) + ldlm_add_ast_work_item(lock, req, work_list); + } + } + + if (ldlm_check_contention(req, *contended_locks) && + compat == 0 && + (*flags & LDLM_FL_DENY_ON_CONTENTION) && + req->l_req_mode != LCK_GROUP && + req_end - req_start <= + ldlm_res_to_ns(req->l_resource)->ns_max_nolock_size) + GOTO(destroylock, compat = -EUSERS); + + 
RETURN(compat); +destroylock: + list_del_init(&req->l_res_link); + ldlm_lock_destroy_nolock(req); + RETURN(compat); +} + +/** + * This function refresh eviction timer for cancelled lock. + * \param[in] lock ldlm lock for refresh + * \param[in] arg ldlm prolong arguments, timeout, export, extent + * and counter are used + */ +void ldlm_lock_prolong_one(struct ldlm_lock *lock, + struct ldlm_prolong_args *arg) +{ + timeout_t timeout; + + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_PROLONG_PAUSE, 3); + + if (arg->lpa_export != lock->l_export || + lock->l_flags & LDLM_FL_DESTROYED) + /* ignore unrelated locks */ + return; + + arg->lpa_locks_cnt++; + + if (!(lock->l_flags & LDLM_FL_AST_SENT)) + /* ignore locks not being cancelled */ + return; + + /* We are in the middle of the process - BL AST is sent, CANCEL + * is ahead. Take half of BL AT + IO AT process time. + */ + timeout = arg->lpa_timeout + (ldlm_bl_timeout(lock) >> 1); + + arg->lpa_blocks_cnt++; + + /* OK. this is a possible lock the user holds doing I/O + * let's refresh eviction timer for it. + */ + ldlm_refresh_waiting_lock(lock, timeout); +} +EXPORT_SYMBOL(ldlm_lock_prolong_one); + +static enum interval_iter ldlm_resource_prolong_cb(struct interval_node *n, + void *data) +{ + struct ldlm_prolong_args *arg = data; + struct ldlm_interval *node = to_ldlm_interval(n); + struct ldlm_lock *lock; + + ENTRY; + + LASSERT(!list_empty(&node->li_group)); + + list_for_each_entry(lock, &node->li_group, l_sl_policy) { + ldlm_lock_prolong_one(lock, arg); + } + + RETURN(INTERVAL_ITER_CONT); +} + +/** + * Walk through granted tree and prolong locks if they overlaps extent. + * + * \param[in] arg prolong args + */ +void ldlm_resource_prolong(struct ldlm_prolong_args *arg) +{ + struct ldlm_interval_tree *tree; + struct ldlm_resource *res; + struct interval_node_extent ex = { .start = arg->lpa_extent.start, + .end = arg->lpa_extent.end }; + int idx; + + ENTRY; + + res = ldlm_resource_get(arg->lpa_export->exp_obd->obd_namespace, NULL, + &arg->lpa_resid, LDLM_EXTENT, 0); + if (IS_ERR(res)) { + CDEBUG(D_DLMTRACE, "Failed to get resource for resid %llu/%llu\n", + arg->lpa_resid.name[0], arg->lpa_resid.name[1]); + RETURN_EXIT; + } + + lock_res(res); + for (idx = 0; idx < LCK_MODE_NUM; idx++) { + tree = &res->lr_itree[idx]; + if (tree->lit_root == NULL) /* empty tree, skipped */ + continue; + + /* There is no possibility to check for the groupID + * so all the group locks are considered as valid + * here, especially because the client is supposed + * to check it has such a lock before sending an RPC. + */ + if (!(tree->lit_mode & arg->lpa_mode)) + continue; + + interval_search(tree->lit_root, &ex, + ldlm_resource_prolong_cb, arg); + } + + unlock_res(res); + ldlm_resource_putref(res); + + EXIT; +} +EXPORT_SYMBOL(ldlm_resource_prolong); + +/** + * Process a granting attempt for extent lock. + * Must be called with ns lock held. + * + * This function looks for any conflicts for \a lock in the granted or + * waiting queues. The lock is granted if no conflicts are found in + * either queue. + */ +int ldlm_process_extent_lock(struct ldlm_lock *lock, __u64 *flags, + enum ldlm_process_intention intention, + enum ldlm_error *err, struct list_head *work_list) +{ + struct ldlm_resource *res = lock->l_resource; + int rc, rc2 = 0; + int contended_locks = 0; + struct list_head *grant_work = intention == LDLM_PROCESS_ENQUEUE ? 
+ NULL : work_list; + ENTRY; + + LASSERT(!ldlm_is_granted(lock)); + LASSERT(!(*flags & LDLM_FL_DENY_ON_CONTENTION) || + !ldlm_is_ast_discard_data(lock)); + check_res_locked(res); + *err = ELDLM_OK; + + if (intention == LDLM_PROCESS_RESCAN) { + /* Careful observers will note that we don't handle -EAGAIN + * here, but it's ok for a non-obvious reason -- compat_queue + * can only return -EAGAIN if (flags & BLOCK_NOWAIT | + * SPECULATIVE). flags should always be zero here, and if that + * ever stops being true, we want to find out. */ + LASSERT(*flags == 0); + rc = ldlm_extent_compat_queue(&res->lr_granted, lock, flags, + NULL, &contended_locks); + if (rc == 1) { + rc = ldlm_extent_compat_queue(&res->lr_waiting, lock, + flags, NULL, + &contended_locks); + } + if (rc == 0) + RETURN(LDLM_ITER_STOP); + + ldlm_resource_unlink_lock(lock); + + if (!OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_EVICT_RACE)) + ldlm_extent_policy(res, lock, flags); + ldlm_grant_lock(lock, grant_work); + RETURN(LDLM_ITER_CONTINUE); + } + + contended_locks = 0; + rc = ldlm_extent_compat_queue(&res->lr_granted, lock, flags, + work_list, &contended_locks); + if (rc < 0) + GOTO(out, *err = rc); + + if (rc != 2) { + rc2 = ldlm_extent_compat_queue(&res->lr_waiting, lock, + flags, work_list, + &contended_locks); + if (rc2 < 0) + GOTO(out, *err = rc = rc2); + } + + if (rc + rc2 == 2) { + ldlm_extent_policy(res, lock, flags); + ldlm_resource_unlink_lock(lock); + ldlm_grant_lock(lock, grant_work); + } else { + /* Adding LDLM_FL_NO_TIMEOUT flag to granted lock to + * force client to wait for the lock endlessly once + * the lock is enqueued -bzzz */ + *flags |= LDLM_FL_NO_TIMEOUT; + } + + RETURN(LDLM_ITER_CONTINUE); +out: + return rc; +} +#endif /* HAVE_SERVER_SUPPORT */ + +struct ldlm_kms_shift_args { + __u64 old_kms; + __u64 kms; + bool complete; +}; + +/* Callback for interval_iterate functions, used by ldlm_extent_shift_Kms */ +static enum interval_iter ldlm_kms_shift_cb(struct interval_node *n, + void *args) +{ + struct ldlm_kms_shift_args *arg = args; + struct ldlm_interval *node = to_ldlm_interval(n); + struct ldlm_lock *tmplock; + struct ldlm_lock *lock = NULL; + + ENTRY; + + /* Since all locks in an interval have the same extent, we can just + * use the first lock without kms_ignore set. */ + list_for_each_entry(tmplock, &node->li_group, l_sl_policy) { + if (ldlm_is_kms_ignore(tmplock)) + continue; + + lock = tmplock; + + break; + } + + /* No locks in this interval without kms_ignore set */ + if (!lock) + RETURN(INTERVAL_ITER_CONT); + + /* If we find a lock with a greater or equal kms, we are not the + * highest lock (or we share that distinction with another lock), and + * don't need to update KMS. Return old_kms and stop looking. */ + if (lock->l_policy_data.l_extent.end == OBD_OBJECT_EOF || + lock->l_policy_data.l_extent.end + 1 >= arg->old_kms) { + arg->kms = arg->old_kms; + arg->complete = true; + RETURN(INTERVAL_ITER_STOP); + } + + if (lock->l_policy_data.l_extent.end + 1 > arg->kms) + arg->kms = lock->l_policy_data.l_extent.end + 1; + + /* Since interval_iterate_reverse starts with the highest lock and + * works down, for PW locks, we only need to check if we should update + * the kms, then stop walking the tree. PR locks are not exclusive, so + * the highest start does not imply the highest end and we must + * continue. 
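A concrete case of why the walk may stop for PW but not for PR: suppose PR extents [0, 10000] and [5000, 6000] are granted. The reverse walk visits [5000, 6000] first because it has the higher start, yet the lock with the lower start protects the higher end, so kms can only be computed by walking on to find 10001. Conflicting PW extents can never overlap, so for PW the first lock visited in reverse-start order also has the greatest end and the walk can stop at once.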
(Only one group lock is allowed per resource, so this is + * irrelevant for group locks.)*/ + if (lock->l_granted_mode == LCK_PW) + RETURN(INTERVAL_ITER_STOP); + else + RETURN(INTERVAL_ITER_CONT); +} + +/* When a lock is cancelled by a client, the KMS may undergo change if this + * is the "highest lock". This function returns the new KMS value, updating + * it only if we were the highest lock. + * + * Caller must hold lr_lock already. + * + * NB: A lock on [x,y] protects a KMS of up to y + 1 bytes! */ +__u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms) +{ + struct ldlm_resource *res = lock->l_resource; + struct ldlm_interval_tree *tree; + struct ldlm_kms_shift_args args; + int idx = 0; + + ENTRY; + + args.old_kms = old_kms; + args.kms = 0; + args.complete = false; + + /* don't let another thread in ldlm_extent_shift_kms race in + * just after we finish and take our lock into account in its + * calculation of the kms */ + ldlm_set_kms_ignore(lock); + + /* We iterate over the lock trees, looking for the largest kms smaller + * than the current one. */ + for (idx = 0; idx < LCK_MODE_NUM; idx++) { + tree = &res->lr_itree[idx]; + + /* If our already known kms is >= than the highest 'end' in + * this tree, we don't need to check this tree, because + * the kms from a tree can be lower than in_max_high (due to + * kms_ignore), but it can never be higher. */ + if (!tree->lit_root || args.kms >= tree->lit_root->in_max_high) + continue; + + interval_iterate_reverse(tree->lit_root, ldlm_kms_shift_cb, + &args); + + /* this tells us we're not the highest lock, so we don't need + * to check the remaining trees */ + if (args.complete) + break; + } + + LASSERTF(args.kms <= args.old_kms, "kms %llu old_kms %llu\n", args.kms, + args.old_kms); + + RETURN(args.kms); +} +EXPORT_SYMBOL(ldlm_extent_shift_kms); + +struct kmem_cache *ldlm_interval_slab; +static struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock) +{ + struct ldlm_interval *node; + ENTRY; + + LASSERT(lock->l_resource->lr_type == LDLM_EXTENT); + OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, GFP_NOFS); + if (node == NULL) + RETURN(NULL); + + INIT_LIST_HEAD(&node->li_group); + ldlm_interval_attach(node, lock); + RETURN(node); +} + +void ldlm_interval_free(struct ldlm_interval *node) +{ + if (node) { + LASSERT(list_empty(&node->li_group)); + LASSERT(!interval_is_intree(&node->li_node)); + OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node)); + } +} + +/* interval tree, for LDLM_EXTENT. */ +void ldlm_interval_attach(struct ldlm_interval *n, + struct ldlm_lock *l) +{ + LASSERT(l->l_tree_node == NULL); + LASSERT(l->l_resource->lr_type == LDLM_EXTENT); + + list_add_tail(&l->l_sl_policy, &n->li_group); + l->l_tree_node = n; +} + +struct ldlm_interval *ldlm_interval_detach(struct ldlm_lock *l) +{ + struct ldlm_interval *n = l->l_tree_node; + + if (n == NULL) + return NULL; + + LASSERT(!list_empty(&n->li_group)); + l->l_tree_node = NULL; + list_del_init(&l->l_sl_policy); + + return list_empty(&n->li_group) ? n : NULL; +} + +static inline int ldlm_mode_to_index(enum ldlm_mode mode) +{ + int index; + + LASSERT(mode != 0); + LASSERT(is_power_of_2(mode)); + index = ilog2(mode); + LASSERT(index < LCK_MODE_NUM); + return index; +} + +int ldlm_extent_alloc_lock(struct ldlm_lock *lock) +{ + lock->l_tree_node = NULL; + if (ldlm_interval_alloc(lock) == NULL) + return -ENOMEM; + return 0; +} + +/** Add newly granted lock into interval tree for the resource. 
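As a side note on the tree indexing used throughout this file: lock modes are single-bit values, so ldlm_mode_to_index() is just ilog2(), and a lock granted in mode BIT(n) always lives in lr_itree[n]. A userspace-flavoured sketch of the same mapping (assuming the conventional power-of-two mode values, e.g. LCK_PR == 4 giving index 2):

#include <assert.h>

/* mode -> per-mode interval tree index, mirroring ldlm_mode_to_index() */
static unsigned int mode_index(unsigned int mode)
{
	assert(mode != 0 && (mode & (mode - 1)) == 0);	/* single bit set */
	return (unsigned int)__builtin_ctz(mode);	/* ilog2 of a power of 2 */
}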
*/ +void ldlm_extent_add_lock(struct ldlm_resource *res, + struct ldlm_lock *lock) +{ + struct interval_node *found, **root; + struct ldlm_interval *node; + struct ldlm_extent *extent; + int idx, rc; + + LASSERT(ldlm_is_granted(lock)); + + node = lock->l_tree_node; + LASSERT(node != NULL); + LASSERT(!interval_is_intree(&node->li_node)); + + idx = ldlm_mode_to_index(lock->l_granted_mode); + LASSERT(lock->l_granted_mode == BIT(idx)); + LASSERT(lock->l_granted_mode == res->lr_itree[idx].lit_mode); + + /* node extent initialize */ + extent = &lock->l_policy_data.l_extent; + + rc = interval_set(&node->li_node, extent->start, extent->end); + LASSERT(!rc); + + root = &res->lr_itree[idx].lit_root; + found = interval_insert(&node->li_node, root); + if (found) { /* The policy group found. */ + struct ldlm_interval *tmp = ldlm_interval_detach(lock); + LASSERT(tmp != NULL); + ldlm_interval_free(tmp); + ldlm_interval_attach(to_ldlm_interval(found), lock); + } + res->lr_itree[idx].lit_size++; + + /* even though we use interval tree to manage the extent lock, we also + * add the locks into grant list, for debug purpose, .. */ + ldlm_resource_add_lock(res, &res->lr_granted, lock); + + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_GRANT_CHECK)) { + struct ldlm_lock *lck; + + list_for_each_entry_reverse(lck, &res->lr_granted, + l_res_link) { + if (lck == lock) + continue; + if (lockmode_compat(lck->l_granted_mode, + lock->l_granted_mode)) + continue; + if (ldlm_extent_overlap(&lck->l_req_extent, + &lock->l_req_extent)) { + CDEBUG(D_ERROR, "granting conflicting lock %p " + "%p\n", lck, lock); + ldlm_resource_dump(D_ERROR, res); + LBUG(); + } + } + } +} + +/** Remove cancelled lock from resource interval tree. */ +void ldlm_extent_unlink_lock(struct ldlm_lock *lock) +{ + struct ldlm_resource *res = lock->l_resource; + struct ldlm_interval *node = lock->l_tree_node; + struct ldlm_interval_tree *tree; + int idx; + + if (!node || !interval_is_intree(&node->li_node)) /* duplicate unlink */ + return; + + idx = ldlm_mode_to_index(lock->l_granted_mode); + LASSERT(lock->l_granted_mode == BIT(idx)); + tree = &res->lr_itree[idx]; + + LASSERT(tree->lit_root != NULL); /* assure the tree is not null */ + + tree->lit_size--; + node = ldlm_interval_detach(lock); + if (node) { + interval_erase(&node->li_node, &tree->lit_root); + ldlm_interval_free(node); + } +} + +void ldlm_extent_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy) +{ + lpolicy->l_extent.start = wpolicy->l_extent.start; + lpolicy->l_extent.end = wpolicy->l_extent.end; + lpolicy->l_extent.gid = wpolicy->l_extent.gid; +} + +void ldlm_extent_policy_local_to_wire(const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy) +{ + memset(wpolicy, 0, sizeof(*wpolicy)); + wpolicy->l_extent.start = lpolicy->l_extent.start; + wpolicy->l_extent.end = lpolicy->l_extent.end; + wpolicy->l_extent.gid = lpolicy->l_extent.gid; +} + diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_flock.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_flock.c new file mode 100644 index 0000000000000..745c1ea580fa8 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_flock.c @@ -0,0 +1,958 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003 Hewlett-Packard Development Company LP. + * Developed under the sponsorship of the US Government under + * Subcontract No. B514193 + * + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +/** + * This file implements POSIX lock type for Lustre. + * Its policy properties are start and end of extent and PID. + * + * These locks are only done through MDS due to POSIX semantics requiring + * e.g. that locks could be only partially released and as such split into + * two parts, and also that two adjacent locks from the same process may be + * merged into a single wider lock. + * + * Lock modes are mapped like this: + * PR and PW for READ and WRITE locks + * NL to request a releasing of a portion of the lock + * + * These flock locks never timeout. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include +#include +#include + +#include "ldlm_internal.h" + +int ldlm_flock_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag); + +/** + * list_for_remaining_safe - iterate over the remaining entries in a list + * and safeguard against removal of a list entry. + * \param pos the &struct list_head to use as a loop counter. pos MUST + * have been initialized prior to using it in this macro. + * \param n another &struct list_head to use as temporary storage + * \param head the head for your list. 
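Unlike list_for_each_safe(), which always begins at head->next, the macro defined just below resumes from a cursor the caller has already positioned, which is exactly what the flock code needs after locating the first lock of an owner. A minimal usage shape (illustrative, with assumed surrounding declarations):

	struct list_head *pos = ownlocks;	/* already points into the list */
	struct list_head *n;

	list_for_remaining_safe(pos, n, &res->lr_granted) {
		struct ldlm_lock *lck =
			list_entry(pos, struct ldlm_lock, l_res_link);
		/* pos may be unlinked here; n already holds its successor */
	}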
+ */ +#define list_for_remaining_safe(pos, n, head) \ + for (n = pos->next; pos != (head); pos = n, n = pos->next) + +static inline int +ldlm_same_flock_owner(struct ldlm_lock *lock, struct ldlm_lock *new) +{ + return ((new->l_policy_data.l_flock.owner == + lock->l_policy_data.l_flock.owner) && + (new->l_export == lock->l_export)); +} + +static inline int +ldlm_flocks_overlap(struct ldlm_lock *lock, struct ldlm_lock *new) +{ + return ((new->l_policy_data.l_flock.start <= + lock->l_policy_data.l_flock.end) && + (new->l_policy_data.l_flock.end >= + lock->l_policy_data.l_flock.start)); +} + +static inline void ldlm_flock_blocking_link(struct ldlm_lock *req, + struct ldlm_lock *lock) +{ + /* For server only */ + if (req->l_export == NULL) + return; + + LASSERT(hlist_unhashed(&req->l_exp_flock_hash)); + + req->l_policy_data.l_flock.blocking_owner = + lock->l_policy_data.l_flock.owner; + req->l_policy_data.l_flock.blocking_export = + lock->l_export; + atomic_set(&req->l_policy_data.l_flock.blocking_refs, 0); + + cfs_hash_add(req->l_export->exp_flock_hash, + &req->l_policy_data.l_flock.owner, + &req->l_exp_flock_hash); +} + +static inline void ldlm_flock_blocking_unlink(struct ldlm_lock *req) +{ + /* For server only */ + if (req->l_export == NULL) + return; + + check_res_locked(req->l_resource); + if (req->l_export->exp_flock_hash != NULL && + !hlist_unhashed(&req->l_exp_flock_hash)) + cfs_hash_del(req->l_export->exp_flock_hash, + &req->l_policy_data.l_flock.owner, + &req->l_exp_flock_hash); +} + +static inline void +ldlm_flock_destroy(struct ldlm_lock *lock, enum ldlm_mode mode, __u64 flags) +{ + ENTRY; + + LDLM_DEBUG(lock, "ldlm_flock_destroy(mode: %d, flags: %#llx)", + mode, flags); + + /* Safe to not lock here, since it should be empty anyway */ + LASSERT(hlist_unhashed(&lock->l_exp_flock_hash)); + + list_del_init(&lock->l_res_link); + if (flags == LDLM_FL_WAIT_NOREPROC) { + /* client side - set a flag to prevent sending a CANCEL */ + lock->l_flags |= LDLM_FL_LOCAL_ONLY | LDLM_FL_CBPENDING; + + /* when reaching here, it is under lock_res_and_lock(). Thus, + * need call the nolock version of ldlm_lock_decref_internal + */ + ldlm_lock_decref_internal_nolock(lock, mode); + } + + ldlm_lock_destroy_nolock(lock); + EXIT; +} + +#ifdef HAVE_SERVER_SUPPORT +/** + * POSIX locks deadlock detection code. + * + * Given a new lock \a req and an existing lock \a bl_lock it conflicts + * with, we need to iterate through all blocked POSIX locks for this + * export and see if there is a deadlock condition arising. (i.e. when + * one client holds a lock on something and want a lock on something + * else and at the same time another client has the opposite situation). + */ + +struct ldlm_flock_lookup_cb_data { + __u64 *bl_owner; + struct ldlm_lock *lock; + struct obd_export *exp; +}; + +static int ldlm_flock_lookup_cb(struct obd_export *exp, void *data) +{ + struct ldlm_flock_lookup_cb_data *cb_data = data; + struct ldlm_lock *lock; + + if (exp->exp_failed) + return 0; + + lock = cfs_hash_lookup(exp->exp_flock_hash, cb_data->bl_owner); + if (lock == NULL) + return 0; + + /* Stop on first found lock. 
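To make the cycle this walk detects concrete: process P1 on client A holds flock L1 and blocks waiting for L2, while process P2 on client B holds L2 and blocks waiting for L1. Evaluating P1's request against its conflict L2, the loop looks up P2's blocked request in the per-export flock hash, follows its blocking_owner/blocking_export link back to P1, sees that both the owner and the peer NID match the requester, and returns 1; the enqueue then fails with -EDEADLK instead of sleeping forever.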
Same process can't sleep twice */ + cb_data->lock = lock; + cb_data->exp = class_export_get(exp); + + return 1; +} + +static int +ldlm_flock_deadlock(struct ldlm_lock *req, struct ldlm_lock *bl_lock) +{ + struct obd_export *req_exp = req->l_export; + struct obd_export *bl_exp = bl_lock->l_export; + __u64 req_owner = req->l_policy_data.l_flock.owner; + __u64 bl_owner = bl_lock->l_policy_data.l_flock.owner; + + /* For server only */ + if (req_exp == NULL) + return 0; + + class_export_get(bl_exp); + while (1) { + struct ldlm_flock_lookup_cb_data cb_data = { + .bl_owner = &bl_owner, + .lock = NULL, + .exp = NULL, + }; + struct ptlrpc_connection *bl_exp_conn; + struct obd_export *bl_exp_new; + struct ldlm_lock *lock = NULL; + struct ldlm_flock *flock; + + bl_exp_conn = bl_exp->exp_connection; + if (bl_exp->exp_flock_hash != NULL) { + int found; + + found = obd_nid_export_for_each(bl_exp->exp_obd, + &bl_exp_conn->c_peer.nid, + ldlm_flock_lookup_cb, + &cb_data); + if (found) + lock = cb_data.lock; + } + if (lock == NULL) + break; + + class_export_put(bl_exp); + bl_exp = cb_data.exp; + + LASSERT(req != lock); + flock = &lock->l_policy_data.l_flock; + LASSERT(flock->owner == bl_owner); + bl_owner = flock->blocking_owner; + bl_exp_new = class_export_get(flock->blocking_export); + class_export_put(bl_exp); + + cfs_hash_put(bl_exp->exp_flock_hash, &lock->l_exp_flock_hash); + bl_exp = bl_exp_new; + + if (bl_exp->exp_failed) + break; + + if (bl_owner == req_owner && + nid_same(&bl_exp_conn->c_peer.nid, + &req_exp->exp_connection->c_peer.nid)) { + class_export_put(bl_exp); + return 1; + } + } + class_export_put(bl_exp); + + return 0; +} + +static void ldlm_flock_cancel_on_deadlock(struct ldlm_lock *lock, + struct list_head *work_list) +{ + CDEBUG(D_INFO, "reprocess deadlock req=%p\n", lock); + + if ((exp_connect_flags(lock->l_export) & + OBD_CONNECT_FLOCK_DEAD) == 0) { + CERROR("deadlock found, but client doesn't support flock canceliation\n"); + } else { + LASSERT(lock->l_completion_ast); + LASSERT(!ldlm_is_ast_sent(lock)); + lock->l_flags |= (LDLM_FL_AST_SENT | LDLM_FL_CANCEL_ON_BLOCK | + LDLM_FL_FLOCK_DEADLOCK); + ldlm_flock_blocking_unlink(lock); + ldlm_resource_unlink_lock(lock); + ldlm_add_ast_work_item(lock, NULL, work_list); + } +} +#endif /* HAVE_SERVER_SUPPORT */ + +/** + * Process a granting attempt for flock lock. + * Must be called under ns lock held. + * + * This function looks for any conflicts for \a lock in the granted or + * waiting queues. The lock is granted if no conflicts are found in + * either queue. + */ +int +ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags, + enum ldlm_process_intention intention, + enum ldlm_error *err, struct list_head *work_list) +{ + struct ldlm_resource *res = req->l_resource; + struct ldlm_namespace *ns = ldlm_res_to_ns(res); + struct list_head *tmp; + struct list_head *ownlocks = NULL; + struct ldlm_lock *lock = NULL; + struct ldlm_lock *new = req; + struct ldlm_lock *new2 = NULL; + enum ldlm_mode mode = req->l_req_mode; + int local = ns_is_client(ns); + int added = (mode == LCK_NL); + int overlaps = 0; + int splitted = 0; + const struct ldlm_callback_suite null_cbs = { NULL }; +#ifdef HAVE_SERVER_SUPPORT + struct list_head *grant_work = (intention == LDLM_PROCESS_ENQUEUE ? 
+ NULL : work_list); +#endif + ENTRY; + + CDEBUG(D_DLMTRACE, "flags %#llx owner %llu pid %u mode %u start " + "%llu end %llu\n", *flags, + new->l_policy_data.l_flock.owner, + new->l_policy_data.l_flock.pid, mode, + req->l_policy_data.l_flock.start, + req->l_policy_data.l_flock.end); + + *err = ELDLM_OK; + + if (local) { + /* No blocking ASTs are sent to the clients for + * Posix file & record locks + */ + req->l_blocking_ast = NULL; + } else { + /* Called on the server for lock cancels. */ + req->l_blocking_ast = ldlm_flock_blocking_ast; + } + +reprocess: + if ((*flags == LDLM_FL_WAIT_NOREPROC) || (mode == LCK_NL)) { + /* This loop determines where this processes locks start + * in the resource lr_granted list. + */ + list_for_each(tmp, &res->lr_granted) { + lock = list_entry(tmp, struct ldlm_lock, + l_res_link); + if (ldlm_same_flock_owner(lock, req)) { + ownlocks = tmp; + break; + } + } + } +#ifdef HAVE_SERVER_SUPPORT + else { + int reprocess_failed = 0; + lockmode_verify(mode); + + /* This loop determines if there are existing locks + * that conflict with the new lock request. + */ + list_for_each(tmp, &res->lr_granted) { + lock = list_entry(tmp, struct ldlm_lock, + l_res_link); + + if (ldlm_same_flock_owner(lock, req)) { + if (!ownlocks) + ownlocks = tmp; + continue; + } + + if (req->l_req_mode == LCK_PR && + lock->l_granted_mode == LCK_PR && + lock->l_policy_data.l_flock.start <= + req->l_policy_data.l_flock.start && + lock->l_policy_data.l_flock.end >= + req->l_policy_data.l_flock.end) { + /* there can't be granted WR lock */ + break; + } + /* locks are compatible, overlap doesn't matter */ + if (lockmode_compat(lock->l_granted_mode, mode)) + continue; + + if (!ldlm_flocks_overlap(lock, req)) + continue; + + if (intention != LDLM_PROCESS_ENQUEUE) { + if (ldlm_flock_deadlock(req, lock)) { + ldlm_flock_cancel_on_deadlock( + req, grant_work); + RETURN(LDLM_ITER_CONTINUE); + } + reprocess_failed = 1; + break; + } + + if (*flags & LDLM_FL_BLOCK_NOWAIT) { + ldlm_flock_destroy(req, mode, *flags); + *err = -EAGAIN; + RETURN(LDLM_ITER_STOP); + } + + if (*flags & LDLM_FL_TEST_LOCK) { + ldlm_flock_destroy(req, mode, *flags); + req->l_req_mode = lock->l_granted_mode; + req->l_policy_data.l_flock.pid = + lock->l_policy_data.l_flock.pid; + req->l_policy_data.l_flock.start = + lock->l_policy_data.l_flock.start; + req->l_policy_data.l_flock.end = + lock->l_policy_data.l_flock.end; + *flags |= LDLM_FL_LOCK_CHANGED; + RETURN(LDLM_ITER_STOP); + } + + /* add lock to blocking list before deadlock + * check to prevent race + */ + ldlm_flock_blocking_link(req, lock); + + if (ldlm_flock_deadlock(req, lock)) { + ldlm_flock_blocking_unlink(req); + ldlm_flock_destroy(req, mode, *flags); + *err = -EDEADLK; + RETURN(LDLM_ITER_STOP); + } + + ldlm_resource_add_lock(res, &res->lr_waiting, req); + *flags |= LDLM_FL_BLOCK_GRANTED; + RETURN(LDLM_ITER_STOP); + } + if (reprocess_failed) + RETURN(LDLM_ITER_CONTINUE); + } + + if (*flags & LDLM_FL_TEST_LOCK) { + ldlm_flock_destroy(req, mode, *flags); + req->l_req_mode = LCK_NL; + *flags |= LDLM_FL_LOCK_CHANGED; + RETURN(LDLM_ITER_STOP); + } + + /* In case we had slept on this lock request take it off of the + * deadlock detection hash list. + */ + ldlm_flock_blocking_unlink(req); +#endif /* HAVE_SERVER_SUPPORT */ + + /* Scan the locks owned by this process that overlap this request. + * We may have to merge or split existing locks. 
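Two concrete cases of the merge/split scan that follows (offsets are illustrative). Same mode, adjoining: the owner already holds PR [0, 99] and requests PR [100, 199]; neither skip test fires (100 is not greater than 99 + 1), the ranges are fused, and the existing lock simply widens to PR [0, 199] while the request itself is destroyed. Different mode, nested: the owner holds PW [0, 99] and requests PR [40, 59]; the PW lock must be split, keeping [60, 99], with the freshly allocated new2 taking PW [0, 39] and the PR [40, 59] granted in between, which is the classic POSIX split behaviour.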
+ */ + if (!ownlocks) + ownlocks = &res->lr_granted; + + list_for_remaining_safe(ownlocks, tmp, &res->lr_granted) { + lock = list_entry(ownlocks, struct ldlm_lock, l_res_link); + + if (!ldlm_same_flock_owner(lock, new)) + break; + + if (lock->l_granted_mode == mode) { + /* If the modes are the same then we need to process + * locks that overlap OR adjoin the new lock. The extra + * logic condition is necessary to deal with arithmetic + * overflow and underflow. + */ + if ((new->l_policy_data.l_flock.start > + (lock->l_policy_data.l_flock.end + 1)) + && (lock->l_policy_data.l_flock.end != + OBD_OBJECT_EOF)) + continue; + + if ((new->l_policy_data.l_flock.end < + (lock->l_policy_data.l_flock.start - 1)) + && (lock->l_policy_data.l_flock.start != 0)) + break; + + if (new->l_policy_data.l_flock.start < + lock->l_policy_data.l_flock.start) { + lock->l_policy_data.l_flock.start = + new->l_policy_data.l_flock.start; + } else { + new->l_policy_data.l_flock.start = + lock->l_policy_data.l_flock.start; + } + + if (new->l_policy_data.l_flock.end > + lock->l_policy_data.l_flock.end) { + lock->l_policy_data.l_flock.end = + new->l_policy_data.l_flock.end; + } else { + new->l_policy_data.l_flock.end = + lock->l_policy_data.l_flock.end; + } + + if (added) { + ldlm_flock_destroy(lock, mode, *flags); + } else { + new = lock; + added = 1; + } + continue; + } + + if (new->l_policy_data.l_flock.start > + lock->l_policy_data.l_flock.end) + continue; + + if (new->l_policy_data.l_flock.end < + lock->l_policy_data.l_flock.start) + break; + + ++overlaps; + + if (new->l_policy_data.l_flock.start <= + lock->l_policy_data.l_flock.start) { + if (new->l_policy_data.l_flock.end < + lock->l_policy_data.l_flock.end) { + lock->l_policy_data.l_flock.start = + new->l_policy_data.l_flock.end + 1; + break; + } + ldlm_flock_destroy(lock, lock->l_req_mode, *flags); + continue; + } + if (new->l_policy_data.l_flock.end >= + lock->l_policy_data.l_flock.end) { + lock->l_policy_data.l_flock.end = + new->l_policy_data.l_flock.start - 1; + continue; + } + + /* split the existing lock into two locks */ + + /* if this is an F_UNLCK operation then we could avoid + * allocating a new lock and use the req lock passed in + * with the request but this would complicate the reply + * processing since updates to req get reflected in the + * reply. The client side replays the lock request so + * it must see the original lock data in the reply. + */ + + /* XXX - if ldlm_lock_new() can sleep we should + * release the lr_lock, allocate the new lock, + * and restart processing this lock. 
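The new2 handling just below follows a common kernel shape: lr_lock cannot be held across a sleeping allocation, so the code drops the lock, allocates, re-takes the lock and jumps back to reprocess the whole scan, since the queue may have changed while it was unlocked. The generic form of the pattern (a sketch, not the Lustre API):

	void *prealloc = NULL;

retry:
	spin_lock(&res_lock);
	/* ... scan; suppose an existing lock turns out to need a split ... */
	if (need_split && prealloc == NULL) {
		spin_unlock(&res_lock);
		prealloc = kmalloc(sz, GFP_KERNEL);	/* may sleep */
		if (prealloc == NULL)
			return -ENOMEM;
		goto retry;	/* re-lock and re-scan: state may have changed */
	}
	/* ... consume prealloc under the lock ... */
	spin_unlock(&res_lock);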
+ */ + if (new2 == NULL) { + unlock_res_and_lock(req); + new2 = ldlm_lock_create(ns, &res->lr_name, LDLM_FLOCK, + lock->l_granted_mode, &null_cbs, + NULL, 0, LVB_T_NONE); + lock_res_and_lock(req); + if (IS_ERR(new2)) { + ldlm_flock_destroy(req, lock->l_granted_mode, + *flags); + *err = PTR_ERR(new2); + RETURN(LDLM_ITER_STOP); + } + goto reprocess; + } + + splitted = 1; + + new2->l_granted_mode = lock->l_granted_mode; + new2->l_policy_data.l_flock.pid = + new->l_policy_data.l_flock.pid; + new2->l_policy_data.l_flock.owner = + new->l_policy_data.l_flock.owner; + new2->l_policy_data.l_flock.start = + lock->l_policy_data.l_flock.start; + new2->l_policy_data.l_flock.end = + new->l_policy_data.l_flock.start - 1; + lock->l_policy_data.l_flock.start = + new->l_policy_data.l_flock.end + 1; + new2->l_conn_export = lock->l_conn_export; + if (lock->l_export != NULL) { + new2->l_export = class_export_lock_get(lock->l_export, + new2); + if (new2->l_export->exp_lock_hash && + hlist_unhashed(&new2->l_exp_hash)) + cfs_hash_add(new2->l_export->exp_lock_hash, + &new2->l_remote_handle, + &new2->l_exp_hash); + } + if (*flags == LDLM_FL_WAIT_NOREPROC) + ldlm_lock_addref_internal_nolock(new2, + lock->l_granted_mode); + + /* insert new2 at lock */ + ldlm_resource_add_lock(res, ownlocks, new2); + LDLM_LOCK_RELEASE(new2); + break; + } + + /* if new2 is created but never used, destroy it*/ + if (splitted == 0 && new2 != NULL) + ldlm_lock_destroy_nolock(new2); + + /* At this point we're granting the lock request. */ + req->l_granted_mode = req->l_req_mode; + + /* Add req to the granted queue before calling ldlm_reprocess_all(). */ + if (!added) { + list_del_init(&req->l_res_link); + /* insert new lock before ownlocks in list. */ + ldlm_resource_add_lock(res, ownlocks, req); + } + + if (*flags != LDLM_FL_WAIT_NOREPROC) { +#ifdef HAVE_SERVER_SUPPORT + if (intention == LDLM_PROCESS_ENQUEUE) { + /* If this is an unlock, reprocess the waitq and + * send completions ASTs for locks that can now be + * granted. The only problem with doing this + * reprocessing here is that the completion ASTs for + * newly granted locks will be sent before the unlock + * completion is sent. It shouldn't be an issue. Also + * note that ldlm_process_flock_lock() will recurse, + * but only once because 'intention' won't be + * LDLM_PROCESS_ENQUEUE from ldlm_reprocess_queue. + */ + if ((mode == LCK_NL) && overlaps) { + LIST_HEAD(rpc_list); + int rc; + +restart: + ldlm_reprocess_queue(res, &res->lr_waiting, + &rpc_list, + LDLM_PROCESS_RESCAN, 0); + + unlock_res_and_lock(req); + rc = ldlm_run_ast_work(ns, &rpc_list, + LDLM_WORK_CP_AST); + lock_res_and_lock(req); + if (rc == -ERESTART) + GOTO(restart, rc); + } + } else { + LASSERT(req->l_completion_ast); + ldlm_add_ast_work_item(req, NULL, grant_work); + } +#else /* !HAVE_SERVER_SUPPORT */ + /* The only one possible case for client-side calls flock + * policy function is ldlm_flock_completion_ast inside which + * carries LDLM_FL_WAIT_NOREPROC flag. + */ + CERROR("Illegal parameter for client-side-only module.\n"); + LBUG(); +#endif /* HAVE_SERVER_SUPPORT */ + } + + /* In case we're reprocessing the requested lock we can't destroy + * it until after calling ldlm_add_ast_work_item() above so that laawi() + * can bump the reference count on \a req. Otherwise \a req + * could be freed before the completion AST can be sent. + */ + if (added) + ldlm_flock_destroy(req, mode, *flags); + + ldlm_resource_dump(D_INFO, res); + RETURN(LDLM_ITER_CONTINUE); +} + +/** + * Flock completion callback function. 
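+ * Runs on the client once the enqueue reply or a completion AST
+ * arrives: either the lock is granted directly, or this thread
+ * sleeps until the lock is granted or cancelled.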
+ * + * \param lock [in,out]: A lock to be handled + * \param flags [in]: flags + * \param *data [in]: ldlm_work_cp_ast_lock() will use ldlm_cb_set_arg + * + * \retval 0 : success + * \retval <0 : failure + */ +int +ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) +{ + struct file_lock *getlk = lock->l_ast_data; + struct obd_device *obd; + enum ldlm_error err; + int rc = 0; + ENTRY; + + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT2, 4); + if (OBD_FAIL_PRECHECK(OBD_FAIL_LDLM_CP_CB_WAIT3)) { + lock_res_and_lock(lock); + lock->l_flags |= LDLM_FL_FAIL_LOC; + unlock_res_and_lock(lock); + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT3, 4); + } + CDEBUG(D_DLMTRACE, "flags: %#llx data: %p getlk: %p\n", + flags, data, getlk); + + LASSERT(flags != LDLM_FL_WAIT_NOREPROC); + + if (flags & LDLM_FL_FAILED) + goto granted; + + if (!(flags & LDLM_FL_BLOCKED_MASK)) { + if (NULL == data) + /* mds granted the lock in the reply */ + goto granted; + /* CP AST RPC: lock get granted, wake it up */ + wake_up(&lock->l_waitq); + RETURN(0); + } + + LDLM_DEBUG(lock, + "client-side enqueue returned a blocked lock, sleeping"); + obd = class_exp2obd(lock->l_conn_export); + + /* Go to sleep until the lock is granted. */ + rc = l_wait_event_abortable(lock->l_waitq, + is_granted_or_cancelled(lock)); + if (rc < 0) { + /* take lock off the deadlock detection hash list. */ + lock_res_and_lock(lock); + ldlm_flock_blocking_unlink(lock); + + /* client side - set flag to prevent lock from being + * put on LRU list + */ + ldlm_set_cbpending(lock); + unlock_res_and_lock(lock); + + LDLM_DEBUG(lock, "client-side enqueue waking up: failed (%d)", + rc); + RETURN(rc); + } + +granted: + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT, 10); + + if (OBD_FAIL_PRECHECK(OBD_FAIL_LDLM_CP_CB_WAIT4)) { + lock_res_and_lock(lock); + /* DEADLOCK is always set with CBPENDING */ + lock->l_flags |= LDLM_FL_FLOCK_DEADLOCK | LDLM_FL_CBPENDING; + unlock_res_and_lock(lock); + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT4, 4); + } + if (OBD_FAIL_PRECHECK(OBD_FAIL_LDLM_CP_CB_WAIT5)) { + lock_res_and_lock(lock); + /* DEADLOCK is always set with CBPENDING */ + lock->l_flags |= (LDLM_FL_FAIL_LOC | + LDLM_FL_FLOCK_DEADLOCK | LDLM_FL_CBPENDING); + unlock_res_and_lock(lock); + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT5, 4); + } + + lock_res_and_lock(lock); + + + /* Protect against race where lock could have been just destroyed + * due to overlap in ldlm_process_flock_lock(). + */ + if (ldlm_is_destroyed(lock)) { + unlock_res_and_lock(lock); + LDLM_DEBUG(lock, "client-side enqueue waking up: destroyed"); + + /* An error is still to be returned, to propagate it up to + * ldlm_cli_enqueue_fini() caller. */ + RETURN(-EIO); + } + + /* ldlm_lock_enqueue() has already placed lock on the granted list. */ + ldlm_resource_unlink_lock(lock); + + /* Import invalidation. We need to actually release the lock + * references being held, so that it can go away. No point in + * holding the lock even if app still believes it has it, since + * server already dropped it anyway. Only for granted locks too. + */ + /* Do the same for DEADLOCK'ed locks. 
*/ + if (ldlm_is_failed(lock) || ldlm_is_flock_deadlock(lock)) { + int mode; + + if (flags & LDLM_FL_TEST_LOCK) + LASSERT(ldlm_is_test_lock(lock)); + + if (ldlm_is_test_lock(lock) || ldlm_is_flock_deadlock(lock)) + mode = getlk->fl_type; + else + mode = lock->l_req_mode; + + if (ldlm_is_flock_deadlock(lock)) { + LDLM_DEBUG(lock, "client-side enqueue deadlock " + "received"); + rc = -EDEADLK; + } + ldlm_flock_destroy(lock, mode, LDLM_FL_WAIT_NOREPROC); + unlock_res_and_lock(lock); + + /* Need to wake up the waiter if we were evicted */ + wake_up(&lock->l_waitq); + + /* An error is still to be returned, to propagate it up to + * ldlm_cli_enqueue_fini() caller. + */ + RETURN(rc ? : -EIO); + } + + LDLM_DEBUG(lock, "client-side enqueue granted"); + + if (flags & LDLM_FL_TEST_LOCK) { + /* + * fcntl(F_GETLK) request + * The old mode was saved in getlk->fl_type so that if the mode + * in the lock changes we can decref the appropriate refcount. + */ + LASSERT(ldlm_is_test_lock(lock)); + ldlm_flock_destroy(lock, getlk->fl_type, LDLM_FL_WAIT_NOREPROC); + switch (lock->l_granted_mode) { + case LCK_PR: + getlk->fl_type = F_RDLCK; + break; + case LCK_PW: + getlk->fl_type = F_WRLCK; + break; + default: + getlk->fl_type = F_UNLCK; + } + getlk->fl_pid = (pid_t)lock->l_policy_data.l_flock.pid; + getlk->fl_start = (loff_t)lock->l_policy_data.l_flock.start; + getlk->fl_end = (loff_t)lock->l_policy_data.l_flock.end; + } else { + __u64 noreproc = LDLM_FL_WAIT_NOREPROC; + + /* We need to reprocess the lock to do merges or splits + * with existing locks owned by this process. + */ + ldlm_process_flock_lock(lock, &noreproc, 1, &err, NULL); + } + unlock_res_and_lock(lock); + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_flock_completion_ast); + +int ldlm_flock_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag) +{ + ENTRY; + + LASSERT(lock); + LASSERT(flag == LDLM_CB_CANCELING); + + /* take lock off the deadlock detection hash list. */ + lock_res_and_lock(lock); + ldlm_flock_blocking_unlink(lock); + unlock_res_and_lock(lock); + RETURN(0); +} + +void ldlm_flock_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy) +{ + lpolicy->l_flock.start = wpolicy->l_flock.lfw_start; + lpolicy->l_flock.end = wpolicy->l_flock.lfw_end; + lpolicy->l_flock.pid = wpolicy->l_flock.lfw_pid; + lpolicy->l_flock.owner = wpolicy->l_flock.lfw_owner; +} + +void ldlm_flock_policy_local_to_wire(const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy) +{ + memset(wpolicy, 0, sizeof(*wpolicy)); + wpolicy->l_flock.lfw_start = lpolicy->l_flock.start; + wpolicy->l_flock.lfw_end = lpolicy->l_flock.end; + wpolicy->l_flock.lfw_pid = lpolicy->l_flock.pid; + wpolicy->l_flock.lfw_owner = lpolicy->l_flock.owner; +} + +/* + * Export handle<->flock hash operations. 
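+ *
+ * The hash maps a flock owner id to the lock that owner is blocked
+ * on; the flock deadlock detector walks it when following wait-for
+ * chains.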
+ */ +static unsigned +ldlm_export_flock_hash(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_u64_hash(*(__u64 *)key, mask); +} + +static void * +ldlm_export_flock_key(struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash); + return &lock->l_policy_data.l_flock.owner; +} + +static int +ldlm_export_flock_keycmp(const void *key, struct hlist_node *hnode) +{ + return !memcmp(ldlm_export_flock_key(hnode), key, sizeof(__u64)); +} + +static void * +ldlm_export_flock_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash); +} + +static void +ldlm_export_flock_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + struct ldlm_flock *flock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash); + LDLM_LOCK_GET(lock); + + flock = &lock->l_policy_data.l_flock; + LASSERT(flock->blocking_export != NULL); + class_export_get(flock->blocking_export); + atomic_inc(&flock->blocking_refs); +} + +static void +ldlm_export_flock_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + struct ldlm_flock *flock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash); + + flock = &lock->l_policy_data.l_flock; + LASSERT(flock->blocking_export != NULL); + class_export_put(flock->blocking_export); + if (atomic_dec_and_test(&flock->blocking_refs)) { + flock->blocking_owner = 0; + flock->blocking_export = NULL; + } + LDLM_LOCK_RELEASE(lock); +} + +static struct cfs_hash_ops ldlm_export_flock_ops = { + .hs_hash = ldlm_export_flock_hash, + .hs_key = ldlm_export_flock_key, + .hs_keycmp = ldlm_export_flock_keycmp, + .hs_object = ldlm_export_flock_object, + .hs_get = ldlm_export_flock_get, + .hs_put = ldlm_export_flock_put, + .hs_put_locked = ldlm_export_flock_put, +}; + +int ldlm_init_flock_export(struct obd_export *exp) +{ + if( strcmp(exp->exp_obd->obd_type->typ_name, LUSTRE_MDT_NAME) != 0) + RETURN(0); + + exp->exp_flock_hash = + cfs_hash_create(obd_uuid2str(&exp->exp_client_uuid), + HASH_EXP_LOCK_CUR_BITS, + HASH_EXP_LOCK_MAX_BITS, + HASH_EXP_LOCK_BKT_BITS, 0, + CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA, + &ldlm_export_flock_ops, + CFS_HASH_DEFAULT | CFS_HASH_NBLK_CHANGE); + if (!exp->exp_flock_hash) + RETURN(-ENOMEM); + + RETURN(0); +} + +void ldlm_destroy_flock_export(struct obd_export *exp) +{ + ENTRY; + if (exp->exp_flock_hash) { + cfs_hash_putref(exp->exp_flock_hash); + exp->exp_flock_hash = NULL; + } + EXIT; +} diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_inodebits.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_inodebits.c new file mode 100644 index 0000000000000..41257a97dc571 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_inodebits.c @@ -0,0 +1,667 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lustre/ldlm/ldlm_inodebits.c
+ *
+ * Author: Peter Braam
+ * Author: Phil Schwan
+ */
+
+/**
+ * This file contains the implementation of the IBITS lock type
+ *
+ * An IBITS lock contains a bit mask determining various properties of an
+ * object. The meanings of specific bits are specific to the caller and are
+ * opaque to LDLM code.
+ *
+ * Locks with intersecting bitmasks and conflicting lock modes (e.g. LCK_PW)
+ * are considered conflicting. See the lock mode compatibility matrix
+ * in lustre_dlm.h.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+#include
+#include
+#include
+#include
+
+#include "ldlm_internal.h"
+
+#ifdef HAVE_SERVER_SUPPORT
+
+/**
+ * Iterate through all waiting locks on a given resource queue and
+ * attempt to grant them. An optimization is to check only the head
+ * waiting lock for each inodebit type.
+ *
+ * Must be called with the resource lock held.
+ */
+int ldlm_reprocess_inodebits_queue(struct ldlm_resource *res,
+				   struct list_head *queue,
+				   struct list_head *work_list,
+				   enum ldlm_process_intention intention,
+				   __u64 mask)
+{
+	__u64 flags;
+	int rc = LDLM_ITER_CONTINUE;
+	enum ldlm_error err;
+	LIST_HEAD(bl_ast_list);
+	struct ldlm_ibits_queues *queues = res->lr_ibits_queues;
+	int i;
+
+	ENTRY;
+
+	check_res_locked(res);
+
+	LASSERT(res->lr_type == LDLM_IBITS);
+	LASSERT(intention == LDLM_PROCESS_RESCAN ||
+		intention == LDLM_PROCESS_RECOVERY);
+
+	if (intention == LDLM_PROCESS_RECOVERY)
+		return ldlm_reprocess_queue(res, queue, work_list, intention,
+					    0);
+
+restart:
+	CDEBUG(D_DLMTRACE, "--- Reprocess resource "DLDLMRES" (%p)\n",
+	       PLDLMRES(res), res);
+	if (mask)
+		CDEBUG(D_DLMTRACE, "Hint %llx\n", mask);
+	else
+		mask = MDS_INODELOCK_FULL;
+
+	for (i = 0; i < MDS_INODELOCK_NUMBITS; i++) {
+		LIST_HEAD(rpc_list);
+		struct list_head *head = &queues->liq_waiting[i];
+		struct ldlm_lock *pending;
+		struct ldlm_ibits_node *node;
+
+		if (list_empty(head) || !(mask & (1 << i)))
+			continue;
+
+		node = list_entry(head->next, struct ldlm_ibits_node,
+				  lin_link[i]);
+
+		pending = node->lock;
+		LDLM_DEBUG(pending, "Reprocessing lock from queue %d", i);
+
+		flags = 0;
+		rc = ldlm_process_inodebits_lock(pending, &flags, intention,
+						 &err, &rpc_list);
+		if (ldlm_is_granted(pending)) {
+			list_splice(&rpc_list, work_list);
+			mask |= pending->l_policy_data.l_inodebits.bits;
+			i = ffs(pending->l_policy_data.l_inodebits.bits) - 2;
+		} else {
+			list_splice(&rpc_list, &bl_ast_list);
+		}
+	}
+
+	if (!list_empty(&bl_ast_list)) {
+		unlock_res(res);
+
+		rc = ldlm_run_ast_work(ldlm_res_to_ns(res), &bl_ast_list,
+				       LDLM_WORK_BL_AST);
+
+		lock_res(res);
+		if (rc == -ERESTART) {
+			mask = 0;
+			GOTO(restart, rc);
+		}
+	}
+
+	if (!list_empty(&bl_ast_list))
+		ldlm_discard_bl_list(&bl_ast_list);
+
+	RETURN(rc);
+}
+
+/**
+ * Determine if the lock is compatible with all locks on the queue.
+ *
+ * If \a work_list is provided, conflicting locks are linked there.
+ * If \a work_list is not provided, we exit this function on the first
+ * conflict.
+ *
+ * \retval 0 if there are conflicting locks in the \a queue
+ * \retval 1 if the lock is compatible with all locks in \a queue
+ *
+ * IBITS locks in the granted queue are organized in bunches of
+ * same-mode/same-bits locks called "skip lists". The first lock in a
+ * bunch contains a pointer to the end of the bunch. This allows us to
+ * skip an entire bunch when iterating the list in search of conflicting
+ * locks if the first lock of the bunch does not conflict with us.
+ */
+static int
+ldlm_inodebits_compat_queue(struct list_head *queue, struct ldlm_lock *req,
+			    __u64 *ldlm_flags, struct list_head *work_list)
+{
+	enum ldlm_mode req_mode = req->l_req_mode;
+	struct list_head *tmp;
+	struct ldlm_lock *lock;
+	__u64 req_bits = req->l_policy_data.l_inodebits.bits;
+	__u64 *try_bits = &req->l_policy_data.l_inodebits.try_bits;
+	int compat = 1;
+
+	ENTRY;
+
+	lockmode_verify(req_mode);
+
+	/* There is no sense in a lock with no bits set; such a lock would
+	 * also be compatible with any other bit lock.
+	 * Still, this can happen if there were only try_bits and all of
+	 * them failed, so just exit gracefully and let the caller deal
+	 * with it.
+	 */
+	if ((req_bits | *try_bits) == 0)
+		RETURN(0);
+
+	/* A GROUP lock can only be DOM */
+	if (unlikely(req_mode == LCK_GROUP &&
+		     (req_bits | *try_bits) != MDS_INODELOCK_DOM))
+		RETURN(-EPROTO);
+
+	list_for_each(tmp, queue) {
+		struct list_head *mode_tail;
+
+		lock = list_entry(tmp, struct ldlm_lock, l_res_link);
+
+		/* We stop walking the queue if we hit ourselves so we don't
+		 * take conflicting locks enqueued after us into account,
+		 * or we'd wait forever. */
+		if (req == lock)
+			RETURN(compat);
+
+		/* last lock in mode group */
+		LASSERT(lock->l_sl_mode.prev != NULL);
+		mode_tail = &list_entry(lock->l_sl_mode.prev, struct ldlm_lock,
+					l_sl_mode)->l_res_link;
+
+		/* if the request lock is not COS_INCOMPAT and COS is disabled,
+		 * they are compatible, IOW this request is from a local
+		 * transaction on a DNE system. */
+		if (lock->l_req_mode == LCK_COS && !ldlm_is_cos_incompat(req) &&
+		    !ldlm_is_cos_enabled(req)) {
+			/* jump to last lock in mode group */
+			tmp = mode_tail;
+			continue;
+		}
+
+		if (lockmode_compat(lock->l_req_mode, req_mode)) {
+			/* non-group locks are compatible, bits don't matter */
+			if (likely(req_mode != LCK_GROUP)) {
+				/* jump to last lock in mode group */
+				tmp = mode_tail;
+				continue;
+			}
+
+			if (req->l_policy_data.l_inodebits.li_gid ==
+			    lock->l_policy_data.l_inodebits.li_gid) {
+				if (ldlm_is_granted(lock))
+					RETURN(2);
+
+				if (*ldlm_flags & LDLM_FL_BLOCK_NOWAIT)
+					RETURN(-EWOULDBLOCK);
+
+				/* Place the same group together */
+				ldlm_resource_insert_lock_after(lock, req);
+				RETURN(0);
+			}
+		}
+
+		/* GROUP locks are placed at the head of the waiting list, but
+		 * grouped by gid. */
+		if (unlikely(req_mode == LCK_GROUP && !ldlm_is_granted(lock))) {
+			compat = 0;
+			if (lock->l_req_mode != LCK_GROUP) {
+				/* Already not a GROUP lock, insert before. */
+				ldlm_resource_insert_lock_before(lock, req);
+				break;
+			}
+			/* Still GROUP but a different gid (the same gid would
+			 * be handled above). Keep searching for the same gid */
+			LASSERT(req->l_policy_data.l_inodebits.li_gid !=
+				lock->l_policy_data.l_inodebits.li_gid);
+			continue;
+		}
+
+		for (;;) {
+			struct list_head *head;
+
+			/* Advance loop cursor to last lock in policy group. */
+			tmp = &list_entry(lock->l_sl_policy.prev,
+					  struct ldlm_lock,
+					  l_sl_policy)->l_res_link;
+
+			/* The new lock's try_bits are filtered out by the
+			 * ibits of all locks in both the granted and waiting
+			 * queues.
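+			 * E.g. (with illustrative bit names) a request whose
+			 * try_bits are LOOKUP|UPDATE that scans past a lock
+			 * holding UPDATE keeps only LOOKUP in its try_bits.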
+			 */
+			*try_bits &= ~(lock->l_policy_data.l_inodebits.bits |
+				lock->l_policy_data.l_inodebits.try_bits);
+
+			if ((req_bits | *try_bits) == 0)
+				RETURN(0);
+
+			/* The new lock's ibits are preferable to the try_bits
+			 * of waiting locks, so drop conflicting try_bits in
+			 * the waiting queue.
+			 * Notice that try_bits of granted locks must be zero.
+			 */
+			lock->l_policy_data.l_inodebits.try_bits &= ~req_bits;
+
+			/* Locks with overlapping bits conflict. */
+			if (lock->l_policy_data.l_inodebits.bits & req_bits) {
+				/* COS lock mode has a special compatibility
+				 * requirement: it is only compatible with
+				 * locks from the same client. */
+				if (lock->l_req_mode == LCK_COS &&
+				    !ldlm_is_cos_incompat(req) &&
+				    ldlm_is_cos_enabled(req) &&
+				    lock->l_client_cookie == req->l_client_cookie)
+					goto skip_work_list;
+
+				compat = 0;
+
+				if (unlikely(lock->l_req_mode == LCK_GROUP)) {
+					LASSERT(ldlm_has_dom(lock));
+
+					if (*ldlm_flags & LDLM_FL_BLOCK_NOWAIT)
+						RETURN(-EWOULDBLOCK);
+
+					/* A local combined DOM lock that came
+					 * across a GROUP DOM lock would leave
+					 * the thread blocked for a long time;
+					 * that is not allowed, so the try_bits
+					 * must be used instead.
+					 */
+					if (!req->l_export &&
+					    (req_bits & MDS_INODELOCK_DOM) &&
+					    (req_bits & ~MDS_INODELOCK_DOM))
+						LBUG();
+
+					goto skip_work_list;
+				}
+
+				/* Found a conflicting policy group. */
+				if (!work_list)
+					RETURN(0);
+
+				/* Add locks of the policy group to @work_list
+				 * as blocking locks for @req */
+				if (lock->l_blocking_ast)
+					ldlm_add_ast_work_item(lock, req,
+							       work_list);
+				head = &lock->l_sl_policy;
+				list_for_each_entry(lock, head, l_sl_policy)
+					if (lock->l_blocking_ast)
+						ldlm_add_ast_work_item(lock,
+								req, work_list);
+			}
+skip_work_list:
+			if (tmp == mode_tail)
+				break;
+
+			tmp = tmp->next;
+			lock = list_entry(tmp, struct ldlm_lock, l_res_link);
+		} /* Loop over policy groups within one mode group. */
+	} /* Loop over mode groups within @queue. */
+
+	RETURN(compat);
+}
+
+/**
+ * Process a granting attempt for an IBITS lock.
+ * Must be called with the ns lock held.
+ *
+ * This function looks for any conflicts for \a lock in the granted or
+ * waiting queues. The lock is granted if no conflicts are found in
+ * either queue.
+ */
+int ldlm_process_inodebits_lock(struct ldlm_lock *lock, __u64 *ldlm_flags,
+				enum ldlm_process_intention intention,
+				enum ldlm_error *err,
+				struct list_head *work_list)
+{
+	struct ldlm_resource *res = lock->l_resource;
+	struct list_head *grant_work = intention == LDLM_PROCESS_ENQUEUE ?
+							NULL : work_list;
+	int rc, rc2 = 0;
+	ENTRY;
+
+	*err = ELDLM_LOCK_ABORTED;
+	LASSERT(!ldlm_is_granted(lock));
+	check_res_locked(res);
+
+	if (intention == LDLM_PROCESS_RESCAN) {
+		struct list_head *bl_list =
+			*ldlm_flags & LDLM_FL_BLOCK_NOWAIT ? NULL : work_list;
+
+		LASSERT(lock->l_policy_data.l_inodebits.bits != 0);
+
+		/* It is possible that some granted locks were not canceled
+		 * but converted and are kept in the granted queue. So there
+		 * is a window where a lock with 'ast_sent' might become
+		 * granted again. Meanwhile, a new lock may appear in that
+		 * window and conflict with the converted lock, so the
+		 * following scenario is possible:
+		 *
+		 * 1) lock1 conflicts with lock2
+		 * 2) bl_ast was sent for lock2
+		 * 3) lock3 comes and conflicts with lock2 too
+		 * 4) no bl_ast sent because lock2->l_bl_ast_sent is 1
+		 * 5) lock2 was converted for lock1 but not for lock3
+		 * 6) lock1 granted, lock3 is still waiting for lock2, but
+		 *    there will never be another bl_ast for that
+		 *
+		 * To avoid this scenario the work_list is used below to
+		 * collect any blocked locks from the granted queue during
+		 * every reprocess and a bl_ast will be sent if needed.
+		 */
+		*ldlm_flags = 0;
+		rc = ldlm_inodebits_compat_queue(&res->lr_granted, lock,
+						 ldlm_flags, bl_list);
+		if (!rc)
+			RETURN(LDLM_ITER_STOP);
+		rc = ldlm_inodebits_compat_queue(&res->lr_waiting, lock,
+						 ldlm_flags, NULL);
+		if (!rc)
+			RETURN(LDLM_ITER_STOP);
+
+		/* grant also try_bits if any */
+		if (lock->l_policy_data.l_inodebits.try_bits != 0) {
+			lock->l_policy_data.l_inodebits.bits |=
+				lock->l_policy_data.l_inodebits.try_bits;
+			lock->l_policy_data.l_inodebits.try_bits = 0;
+			*ldlm_flags |= LDLM_FL_LOCK_CHANGED;
+		}
+		ldlm_resource_unlink_lock(lock);
+		ldlm_grant_lock(lock, grant_work);
+
+		*err = ELDLM_OK;
+		RETURN(LDLM_ITER_CONTINUE);
+	}
+
+	rc = ldlm_inodebits_compat_queue(&res->lr_granted, lock,
+					 ldlm_flags, work_list);
+	if (rc < 0)
+		GOTO(out, *err = rc);
+
+	if (rc != 2) {
+		rc2 = ldlm_inodebits_compat_queue(&res->lr_waiting, lock,
+						  ldlm_flags, work_list);
+		if (rc2 < 0)
+			GOTO(out, *err = rc = rc2);
+	}
+
+	if (rc + rc2 != 2) {
+		/* if there were only bits to try and all of them conflict */
+		if ((lock->l_policy_data.l_inodebits.bits |
+		     lock->l_policy_data.l_inodebits.try_bits)) {
+			/* There is no sense in setting LDLM_FL_NO_TIMEOUT in
+			 * @ldlm_flags for a DOM lock, since those are enqueued
+			 * through intents, i.e. @lock here is local and does
+			 * not time out. */
+			*err = ELDLM_OK;
+		}
+	} else {
+		/* grant also all remaining try_bits */
+		if (lock->l_policy_data.l_inodebits.try_bits != 0) {
+			lock->l_policy_data.l_inodebits.bits |=
+				lock->l_policy_data.l_inodebits.try_bits;
+			lock->l_policy_data.l_inodebits.try_bits = 0;
+			*ldlm_flags |= LDLM_FL_LOCK_CHANGED;
+		}
+		LASSERT(lock->l_policy_data.l_inodebits.bits);
+		ldlm_resource_unlink_lock(lock);
+		ldlm_grant_lock(lock, grant_work);
+		*err = ELDLM_OK;
+	}
+
+	RETURN(LDLM_ITER_CONTINUE);
+out:
+	return rc;
+}
+#endif /* HAVE_SERVER_SUPPORT */
+
+void ldlm_ibits_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy,
+				     union ldlm_policy_data *lpolicy)
+{
+	lpolicy->l_inodebits.bits = wpolicy->l_inodebits.bits;
+	/**
+	 * try_bits and li_gid are to be handled outside of generic
+	 * wire_to_local due to different behavior on a server and client.
+	 */
+}
+
+void ldlm_ibits_policy_local_to_wire(const union ldlm_policy_data *lpolicy,
+				     union ldlm_wire_policy_data *wpolicy)
+{
+	memset(wpolicy, 0, sizeof(*wpolicy));
+	wpolicy->l_inodebits.bits = lpolicy->l_inodebits.bits;
+	wpolicy->l_inodebits.try_bits = lpolicy->l_inodebits.try_bits;
+	wpolicy->l_inodebits.li_gid = lpolicy->l_inodebits.li_gid;
+}
+
+/**
+ * Attempt to convert an already granted IBITS lock with several bits set
+ * to a lock with fewer bits (downgrade).
+ *
+ * Such a lock conversion is used to keep a lock with its non-blocking bits
+ * instead of cancelling it; introduced for better support of DoM files.
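+ * For example (illustrative), a lock granted with UPDATE|DOM can drop
+ * just the DOM bit and remain granted with UPDATE only.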
+ */
+int ldlm_inodebits_drop(struct ldlm_lock *lock, __u64 to_drop)
+{
+	ENTRY;
+
+	check_res_locked(lock->l_resource);
+
+	/* Just return if there are no conflicting bits */
+	if ((lock->l_policy_data.l_inodebits.bits & to_drop) == 0) {
+		LDLM_WARN(lock, "try to drop unset bits %#llx/%#llx",
+			  lock->l_policy_data.l_inodebits.bits, to_drop);
+		/* nothing to do */
+		RETURN(0);
+	}
+
+	/* remove the lock from the skiplist and put it in the new place
+	 * according to the new inodebits */
+	ldlm_resource_unlink_lock(lock);
+	lock->l_policy_data.l_inodebits.bits &= ~to_drop;
+	ldlm_grant_lock_with_skiplist(lock);
+	RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_inodebits_drop);
+
+/* convert a single lock */
+int ldlm_cli_inodebits_convert(struct ldlm_lock *lock,
+			       enum ldlm_cancel_flags cancel_flags)
+{
+	struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+	struct ldlm_lock_desc ld = { { 0 } };
+	__u64 drop_bits, new_bits;
+	__u32 flags = 0;
+	int rc;
+
+	ENTRY;
+
+	check_res_locked(lock->l_resource);
+
+	/* Lock is being converted already */
+	if (ldlm_is_converting(lock)) {
+		if (!(cancel_flags & LCF_ASYNC)) {
+			unlock_res_and_lock(lock);
+			wait_event_idle(lock->l_waitq,
+					is_lock_converted(lock));
+			lock_res_and_lock(lock);
+		}
+		RETURN(0);
+	}
+
+	/* lru_cancel may happen in parallel and call ldlm_cli_cancel_list()
+	 * independently.
+	 */
+	if (ldlm_is_canceling(lock))
+		RETURN(-EINVAL);
+
+	/* there is no point in a local-only convert */
+	if (lock->l_flags & (LDLM_FL_LOCAL_ONLY | LDLM_FL_CANCEL_ON_BLOCK))
+		RETURN(-EINVAL);
+
+	drop_bits = lock->l_policy_data.l_inodebits.cancel_bits;
+	/* no cancel bits means the caller needs a full cancel */
+	if (drop_bits == 0)
+		RETURN(-EINVAL);
+
+	new_bits = lock->l_policy_data.l_inodebits.bits & ~drop_bits;
+	/* if all lock bits would be dropped, proceed with a cancel instead */
+	if (!new_bits)
+		RETURN(-EINVAL);
+
+	/* if no bits are actually dropped, consider the convert successful */
+	if (lock->l_policy_data.l_inodebits.bits == new_bits)
+		RETURN(0);
+
+	ldlm_set_converting(lock);
+	/* Now call the cancel callback for the bits being dropped.
+	 * It is important to have the converting flag set during that call
+	 * so the blocking_ast callback can distinguish a convert from a
+	 * cancel.
+	 */
+	ld.l_policy_data.l_inodebits.cancel_bits = drop_bits;
+	unlock_res_and_lock(lock);
+	lock->l_blocking_ast(lock, &ld, lock->l_ast_data, LDLM_CB_CANCELING);
+	/* now notify the server about the convert */
+	rc = ldlm_cli_convert_req(lock, &flags, new_bits);
+	lock_res_and_lock(lock);
+	if (rc)
+		GOTO(full_cancel, rc);
+
+	/* Finally clear these bits in the lock ibits */
+	ldlm_inodebits_drop(lock, drop_bits);
+
+	/* Now that the lock is locked again, check whether it was canceled
+	 * in the meantime; it is important to do this check and not to drop
+	 * cbpending below.
+	 */
+	if (ldlm_is_canceling(lock))
+		GOTO(full_cancel, rc = -EINVAL);
+
+	/* also check again whether more bits to be cancelled have appeared */
+	if (drop_bits != lock->l_policy_data.l_inodebits.cancel_bits)
+		GOTO(clear_converting, rc = -EAGAIN);
+
+	/* clear the cbpending flag early; it is safe to match the lock right
+	 * after a client convert because it is always a downgrade.
+	 */
+	ldlm_clear_cbpending(lock);
+	ldlm_clear_bl_ast(lock);
+	spin_lock(&ns->ns_lock);
+	if (list_empty(&lock->l_lru))
+		ldlm_lock_add_to_lru_nolock(lock);
+	spin_unlock(&ns->ns_lock);
+
+	/* the job is done, zero the cancel_bits. If more conflicts appear,
+	 * it will result in another cycle of ldlm_cli_inodebits_convert().
+	 */
+full_cancel:
+	lock->l_policy_data.l_inodebits.cancel_bits = 0;
+clear_converting:
+	ldlm_clear_converting(lock);
+	RETURN(rc);
+}
+
+int ldlm_inodebits_alloc_lock(struct ldlm_lock *lock)
+{
+	if (ldlm_is_ns_srv(lock)) {
+		int i;
+
+		OBD_SLAB_ALLOC_PTR(lock->l_ibits_node, ldlm_inodebits_slab);
+		if (lock->l_ibits_node == NULL)
+			return -ENOMEM;
+		for (i = 0; i < MDS_INODELOCK_NUMBITS; i++)
+			INIT_LIST_HEAD(&lock->l_ibits_node->lin_link[i]);
+		lock->l_ibits_node->lock = lock;
+	} else {
+		lock->l_ibits_node = NULL;
+	}
+	return 0;
+}
+
+void ldlm_inodebits_add_lock(struct ldlm_resource *res, struct list_head *head,
+			     struct ldlm_lock *lock, bool tail)
+{
+	int i;
+
+	if (!ldlm_is_ns_srv(lock))
+		return;
+
+	if (head == &res->lr_waiting) {
+		for (i = 0; i < MDS_INODELOCK_NUMBITS; i++) {
+			if (!(lock->l_policy_data.l_inodebits.bits & BIT(i)))
+				continue;
+			if (tail)
+				list_add_tail(&lock->l_ibits_node->lin_link[i],
+					      &res->lr_ibits_queues->liq_waiting[i]);
+			else
+				list_add(&lock->l_ibits_node->lin_link[i],
+					 &res->lr_ibits_queues->liq_waiting[i]);
+		}
+	} else if (head == &res->lr_granted && lock->l_ibits_node != NULL) {
+		for (i = 0; i < MDS_INODELOCK_NUMBITS; i++)
+			LASSERT(list_empty(&lock->l_ibits_node->lin_link[i]));
+		OBD_SLAB_FREE_PTR(lock->l_ibits_node, ldlm_inodebits_slab);
+		lock->l_ibits_node = NULL;
+	} else if (head != &res->lr_granted) {
+		/* we are inserting in the middle of a list, after @head */
+		struct ldlm_lock *orig = list_entry(head, struct ldlm_lock,
+						    l_res_link);
+		LASSERT(orig->l_policy_data.l_inodebits.bits ==
+			lock->l_policy_data.l_inodebits.bits);
+		/* There is no use case for inserting before a lock with an
+		 * exactly matching set of bits */
+		LASSERT(tail == false);
+
+		for (i = 0; i < MDS_INODELOCK_NUMBITS; i++) {
+			if (!(lock->l_policy_data.l_inodebits.bits & (1 << i)))
+				continue;
+			list_add(&lock->l_ibits_node->lin_link[i],
+				 &orig->l_ibits_node->lin_link[i]);
+		}
+	}
+}
+
+void ldlm_inodebits_unlink_lock(struct ldlm_lock *lock)
+{
+	int i;
+
+	ldlm_unlink_lock_skiplist(lock);
+	if (!ldlm_is_ns_srv(lock))
+		return;
+
+	for (i = 0; i < MDS_INODELOCK_NUMBITS; i++)
+		list_del_init(&lock->l_ibits_node->lin_link[i]);
+}
diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_internal.h b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_internal.h
new file mode 100644
index 0000000000000..517ab6091de5c
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_internal.h
@@ -0,0 +1,424 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +extern int ldlm_srv_namespace_nr; +extern int ldlm_cli_namespace_nr; +extern struct mutex ldlm_srv_namespace_lock; +extern struct list_head ldlm_srv_namespace_list; +extern struct mutex ldlm_cli_namespace_lock; +extern struct list_head ldlm_cli_active_namespace_list; +extern struct list_head ldlm_cli_inactive_namespace_list; +extern unsigned int ldlm_cancel_unused_locks_before_replay; +extern struct kmem_cache *ldlm_glimpse_work_kmem; + +static inline int ldlm_namespace_nr_read(enum ldlm_side client) +{ + return client == LDLM_NAMESPACE_SERVER ? + ldlm_srv_namespace_nr : ldlm_cli_namespace_nr; +} + +static inline void ldlm_namespace_nr_inc(enum ldlm_side client) +{ + if (client == LDLM_NAMESPACE_SERVER) + ldlm_srv_namespace_nr++; + else + ldlm_cli_namespace_nr++; +} + +static inline void ldlm_namespace_nr_dec(enum ldlm_side client) +{ + if (client == LDLM_NAMESPACE_SERVER) + ldlm_srv_namespace_nr--; + else + ldlm_cli_namespace_nr--; +} + +static inline struct list_head *ldlm_namespace_list(enum ldlm_side client) +{ + return client == LDLM_NAMESPACE_SERVER ? + &ldlm_srv_namespace_list : &ldlm_cli_active_namespace_list; +} + +static inline +struct list_head *ldlm_namespace_inactive_list(enum ldlm_side client) +{ + return client == LDLM_NAMESPACE_SERVER ? + &ldlm_srv_namespace_list : &ldlm_cli_inactive_namespace_list; +} + +static inline struct mutex *ldlm_namespace_lock(enum ldlm_side client) +{ + return client == LDLM_NAMESPACE_SERVER ? + &ldlm_srv_namespace_lock : &ldlm_cli_namespace_lock; +} + +/* ns_bref is the number of resources in this namespace */ +static inline int ldlm_ns_empty(struct ldlm_namespace *ns) +{ + return atomic_read(&ns->ns_bref) == 0; +} + +void ldlm_namespace_move_to_active_locked(struct ldlm_namespace *, + enum ldlm_side); +void ldlm_namespace_move_to_inactive_locked(struct ldlm_namespace *, + enum ldlm_side); +struct ldlm_namespace *ldlm_namespace_first_locked(enum ldlm_side); + +/* ldlm_request.c */ +int ldlm_cancel_lru(struct ldlm_namespace *ns, int min, + enum ldlm_cancel_flags cancel_flags, + enum ldlm_lru_flags lru_flags); +int ldlm_cancel_lru_local(struct ldlm_namespace *ns, + struct list_head *cancels, int min, int max, + enum ldlm_cancel_flags cancel_flags, + enum ldlm_lru_flags lru_flags); +extern unsigned int ldlm_enqueue_min; +/* ldlm_resource.c */ +extern struct kmem_cache *ldlm_resource_slab; +extern struct kmem_cache *ldlm_lock_slab; +extern struct kmem_cache *ldlm_inodebits_slab; +extern struct kmem_cache *ldlm_interval_tree_slab; + +void ldlm_resource_insert_lock_after(struct ldlm_lock *original, + struct ldlm_lock *new); +void ldlm_resource_insert_lock_before(struct ldlm_lock *original, + struct ldlm_lock *new); + +/* ldlm_lock.c */ + +typedef enum { + LDLM_WORK_BL_AST, + LDLM_WORK_CP_AST, + LDLM_WORK_REVOKE_AST, + LDLM_WORK_GL_AST +} ldlm_desc_ast_t; + +void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock); +void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list); +int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill, + enum req_location loc, void *data, int size); +struct ldlm_lock * +ldlm_lock_create(struct ldlm_namespace *ns, const struct ldlm_res_id *, + enum ldlm_type type, enum ldlm_mode mode, + const struct ldlm_callback_suite *cbs, + void *data, __u32 lvb_len, enum lvb_type lvb_type); +enum ldlm_error ldlm_lock_enqueue(const struct lu_env *env, + struct ldlm_namespace *, + struct ldlm_lock **, + void *cookie, __u64 *flags); +void 
ldlm_lock_addref_internal(struct ldlm_lock *, enum ldlm_mode mode); +void ldlm_lock_addref_internal_nolock(struct ldlm_lock *, enum ldlm_mode mode); +void ldlm_lock_decref_internal(struct ldlm_lock *, enum ldlm_mode mode); +void ldlm_lock_decref_internal_nolock(struct ldlm_lock *, enum ldlm_mode mode); +void ldlm_add_ast_work_item(struct ldlm_lock *lock, struct ldlm_lock *new, + struct list_head *work_list); +#ifdef HAVE_SERVER_SUPPORT +int ldlm_reprocess_queue(struct ldlm_resource *res, struct list_head *queue, + struct list_head *work_list, + enum ldlm_process_intention intention, __u64 hint); +int ldlm_handle_conflict_lock(struct ldlm_lock *lock, __u64 *flags, + struct list_head *rpc_list); +void ldlm_discard_bl_list(struct list_head *bl_list); +void ldlm_clear_blocking_lock(struct ldlm_lock *lock); +void ldlm_clear_blocking_data(struct ldlm_lock *lock); +#endif +int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list, + ldlm_desc_ast_t ast_type); +int ldlm_work_gl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq); +int ldlm_lock_remove_from_lru_check(struct ldlm_lock *lock, ktime_t last_use); +#define ldlm_lock_remove_from_lru(lock) \ + ldlm_lock_remove_from_lru_check(lock, ktime_set(0, 0)) +int ldlm_lock_remove_from_lru_nolock(struct ldlm_lock *lock); +void ldlm_lock_add_to_lru_nolock(struct ldlm_lock *lock); +void ldlm_lock_touch_in_lru(struct ldlm_lock *lock); +void ldlm_lock_destroy_nolock(struct ldlm_lock *lock); + +int ldlm_export_cancel_blocked_locks(struct obd_export *exp); +int ldlm_export_cancel_locks(struct obd_export *exp); +void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock); + +/* ldlm_lockd.c */ +int ldlm_bl_to_thread_lock(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld, + struct ldlm_lock *lock); +int ldlm_bl_to_thread_list(struct ldlm_namespace *ns, + struct ldlm_lock_desc *ld, + struct list_head *cancels, int count, + enum ldlm_cancel_flags cancel_flags); +int ldlm_bl_to_thread_ns(struct ldlm_namespace *ns); +int ldlm_bl_thread_wakeup(void); + +void ldlm_handle_bl_callback(struct ldlm_namespace *ns, + struct ldlm_lock_desc *ld, struct ldlm_lock *lock); +void ldlm_bl_desc2lock(const struct ldlm_lock_desc *ld, struct ldlm_lock *lock); + +#ifdef HAVE_SERVER_SUPPORT +/* ldlm_plain.c */ +int ldlm_process_plain_lock(struct ldlm_lock *lock, __u64 *flags, + enum ldlm_process_intention intention, + enum ldlm_error *err, struct list_head *work_list); + +/* ldlm_inodebits.c */ +int ldlm_process_inodebits_lock(struct ldlm_lock *lock, __u64 *flags, + enum ldlm_process_intention intention, + enum ldlm_error *err, + struct list_head *work_list); +int ldlm_reprocess_inodebits_queue(struct ldlm_resource *res, + struct list_head *queue, + struct list_head *work_list, + enum ldlm_process_intention intention, + __u64 hint); +/* ldlm_extent.c */ +int ldlm_process_extent_lock(struct ldlm_lock *lock, __u64 *flags, + enum ldlm_process_intention intention, + enum ldlm_error *err, struct list_head *work_list); +#endif +int ldlm_extent_alloc_lock(struct ldlm_lock *lock); +void ldlm_extent_add_lock(struct ldlm_resource *res, struct ldlm_lock *lock); +void ldlm_extent_unlink_lock(struct ldlm_lock *lock); + +int ldlm_inodebits_alloc_lock(struct ldlm_lock *lock); +void ldlm_inodebits_add_lock(struct ldlm_resource *res, struct list_head *head, + struct ldlm_lock *lock, bool tail); +void ldlm_inodebits_unlink_lock(struct ldlm_lock *lock); + +/* ldlm_flock.c */ +int ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags, + enum ldlm_process_intention 
intention, + enum ldlm_error *err, struct list_head *work_list); +int ldlm_init_flock_export(struct obd_export *exp); +void ldlm_destroy_flock_export(struct obd_export *exp); + +/* l_lock.c */ +void l_check_ns_lock(struct ldlm_namespace *ns); +void l_check_no_ns_lock(struct ldlm_namespace *ns); + +extern struct dentry *ldlm_svc_debugfs_dir; + +struct ldlm_state { + struct ptlrpc_service *ldlm_cb_service; + struct ptlrpc_service *ldlm_cancel_service; + struct ptlrpc_client *ldlm_client; + struct ldlm_bl_pool *ldlm_bl_pool; +}; + +/* interval tree, for LDLM_EXTENT. */ +extern struct kmem_cache *ldlm_interval_slab; /* slab cache for ldlm_interval */ +extern void ldlm_interval_attach(struct ldlm_interval *n, struct ldlm_lock *l); +extern struct ldlm_interval *ldlm_interval_detach(struct ldlm_lock *l); +extern void ldlm_interval_free(struct ldlm_interval *node); +/* this function must be called with res lock held */ +static inline struct ldlm_extent * +ldlm_interval_extent(struct ldlm_interval *node) +{ + struct ldlm_lock *lock; + LASSERT(!list_empty(&node->li_group)); + + lock = list_entry(node->li_group.next, struct ldlm_lock, + l_sl_policy); + return &lock->l_policy_data.l_extent; +} + +int ldlm_init(void); +void ldlm_exit(void); + +enum ldlm_policy_res { + LDLM_POLICY_CANCEL_LOCK, + LDLM_POLICY_KEEP_LOCK, + LDLM_POLICY_SKIP_LOCK +}; + +#define LDLM_POOL_SYSFS_PRINT_int(v) sprintf(buf, "%d\n", v) +#define LDLM_POOL_SYSFS_SET_int(a, b) { a = b; } +#define LDLM_POOL_SYSFS_PRINT_u64(v) sprintf(buf, "%lld\n", v) +#define LDLM_POOL_SYSFS_SET_u64(a, b) { a = b; } +#define LDLM_POOL_SYSFS_PRINT_atomic(v) sprintf(buf, "%d\n", atomic_read(&v)) +#define LDLM_POOL_SYSFS_SET_atomic(a, b) atomic_set(&a, b) + +#define LDLM_POOL_SYSFS_READER_SHOW(var, type) \ + static ssize_t var##_show(struct kobject *kobj, \ + struct attribute *attr, \ + char *buf) \ + { \ + struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool,\ + pl_kobj); \ + type tmp; \ + \ + spin_lock(&pl->pl_lock); \ + tmp = pl->pl_##var; \ + spin_unlock(&pl->pl_lock); \ + \ + return LDLM_POOL_SYSFS_PRINT_##type(tmp); \ + } \ + struct __##var##__dummy_read {;} /* semicolon catcher */ + +#define LDLM_POOL_SYSFS_WRITER_STORE(var, type) \ + static ssize_t var##_store(struct kobject *kobj, \ + struct attribute *attr, \ + const char *buffer, \ + size_t count) \ + { \ + struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool,\ + pl_kobj); \ + unsigned long tmp; \ + int rc; \ + \ + rc = kstrtoul(buffer, 10, &tmp); \ + if (rc < 0) { \ + return rc; \ + } \ + \ + spin_lock(&pl->pl_lock); \ + LDLM_POOL_SYSFS_SET_##type(pl->pl_##var, tmp); \ + spin_unlock(&pl->pl_lock); \ + \ + return count; \ + } \ + struct __##var##__dummy_write {; } /* semicolon catcher */ + +#define LDLM_POOL_SYSFS_READER_NOLOCK_SHOW(var, type) \ + static ssize_t var##_show(struct kobject *kobj, \ + struct attribute *attr, \ + char *buf) \ + { \ + struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool,\ + pl_kobj); \ + \ + return LDLM_POOL_SYSFS_PRINT_##type(pl->pl_##var); \ + } \ + struct __##var##__dummy_read {; } /* semicolon catcher */ + +#define LDLM_POOL_SYSFS_WRITER_NOLOCK_STORE(var, type) \ + static ssize_t var##_store(struct kobject *kobj, \ + struct attribute *attr, \ + const char *buffer, \ + size_t count) \ + { \ + struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool,\ + pl_kobj); \ + unsigned long tmp; \ + int rc; \ + \ + rc = kstrtoul(buffer, 10, &tmp); \ + if (rc < 0) { \ + return rc; \ + } \ + \ + LDLM_POOL_SYSFS_SET_##type(pl->pl_##var, tmp); \ + \ + 
return count; \ + } \ + struct __##var##__dummy_write {; } /* semicolon catcher */ + +static inline void +ldlm_add_var(struct ldebugfs_vars *vars, struct dentry *debugfs_entry, + const char *name, void *data, const struct file_operations *ops) +{ + vars->name = name; + vars->data = data; + vars->fops = ops; + ldebugfs_add_vars(debugfs_entry, vars, NULL); +} + +static inline int is_granted_or_cancelled(struct ldlm_lock *lock) +{ + int ret = 0; + + lock_res_and_lock(lock); + ret = is_granted_or_cancelled_nolock(lock); + unlock_res_and_lock(lock); + + return ret; +} + +static inline bool is_bl_done(struct ldlm_lock *lock) +{ + bool bl_done = true; + + if (!ldlm_is_bl_done(lock)) { + lock_res_and_lock(lock); + bl_done = ldlm_is_bl_done(lock); + unlock_res_and_lock(lock); + } + + return bl_done; +} + +static inline bool is_lock_converted(struct ldlm_lock *lock) +{ + bool ret = 0; + + lock_res_and_lock(lock); + ret = (lock->l_policy_data.l_inodebits.cancel_bits == 0); + unlock_res_and_lock(lock); + + return ret; +} + +typedef void (*ldlm_policy_wire_to_local_t)(const union ldlm_wire_policy_data *, + union ldlm_policy_data *); +typedef void (*ldlm_policy_local_to_wire_t)(const union ldlm_policy_data *, + union ldlm_wire_policy_data *); +void ldlm_plain_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy); +void ldlm_plain_policy_local_to_wire(const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy); +void ldlm_ibits_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy); +void ldlm_ibits_policy_local_to_wire(const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy); +void ldlm_extent_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy); +void ldlm_extent_policy_local_to_wire(const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy); +void ldlm_flock_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy); +void ldlm_flock_policy_local_to_wire(const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy); + +/* ldlm_reclaim.c */ +#ifdef HAVE_SERVER_SUPPORT +extern __u64 ldlm_reclaim_threshold; +extern __u64 ldlm_lock_limit; +extern __u64 ldlm_reclaim_threshold_mb; +extern __u64 ldlm_lock_limit_mb; +extern struct percpu_counter ldlm_granted_total; +#endif +int ldlm_reclaim_setup(void); +void ldlm_reclaim_cleanup(void); +void ldlm_reclaim_add(struct ldlm_lock *lock); +void ldlm_reclaim_del(struct ldlm_lock *lock); +bool ldlm_reclaim_full(void); + +static inline bool ldlm_res_eq(const struct ldlm_res_id *res0, + const struct ldlm_res_id *res1) +{ + return memcmp(res0, res1, sizeof(*res0)) == 0; +} diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c new file mode 100644 index 0000000000000..bf61555c331da --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c @@ -0,0 +1,3569 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +/** + * This file deals with various client/target related logic including recovery. + * + * TODO: This code more logically belongs in the ptlrpc module than in ldlm and + * should be moved. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ldlm_internal.h" + +/* + * @priority: If non-zero, move the selected connection to the list head. + * @create: If zero, only search in existing connections. + */ +static int import_set_conn(struct obd_import *imp, struct obd_uuid *uuid, + int priority, int create) +{ + struct ptlrpc_connection *ptlrpc_conn; + struct obd_import_conn *imp_conn = NULL, *item; + lnet_nid_t nid4refnet = LNET_NID_ANY; + u32 refnet = imp->imp_conn_restricted_net; + int rc = 0; + + ENTRY; + + if (!create && !priority) { + CDEBUG(D_HA, "Nothing to do\n"); + RETURN(-EINVAL); + } + + /* refnet is used to restrict network connections */ + if (refnet != LNET_NIDNET(LNET_NID_ANY)) { + CDEBUG(D_HA, "imp %s: restrict %s to %s net\n", + imp->imp_obd->obd_name, uuid->uuid, + libcfs_net2str(refnet)); + nid4refnet = LNET_MKNID(refnet, 0); + } + + ptlrpc_conn = ptlrpc_uuid_to_connection(uuid, nid4refnet); + if (!ptlrpc_conn) { + CDEBUG(D_HA, "can't find connection %s\n", uuid->uuid); + RETURN(-ENOENT); + } + + if (create) { + OBD_ALLOC(imp_conn, sizeof(*imp_conn)); + if (!imp_conn) + GOTO(out_put, rc = -ENOMEM); + } + + spin_lock(&imp->imp_lock); + list_for_each_entry(item, &imp->imp_conn_list, oic_item) { + if (obd_uuid_equals(uuid, &item->oic_uuid)) { + if (priority) { + list_move(&item->oic_item, + &imp->imp_conn_list); + item->oic_last_attempt = 0; + } + CDEBUG(D_HA, "imp %p@%s: found existing conn %s%s\n", + imp, imp->imp_obd->obd_name, uuid->uuid, + (priority ? ", moved to head" : "")); + spin_unlock(&imp->imp_lock); + GOTO(out_free, rc = 0); + } + } + /* No existing import connection found for \a uuid. */ + if (create) { + imp_conn->oic_conn = ptlrpc_conn; + imp_conn->oic_uuid = *uuid; + imp_conn->oic_last_attempt = 0; + if (priority) + list_add(&imp_conn->oic_item, &imp->imp_conn_list); + else + list_add_tail(&imp_conn->oic_item, + &imp->imp_conn_list); + CDEBUG(D_HA, "imp %p@%s: add connection %s at %s\n", + imp, imp->imp_obd->obd_name, uuid->uuid, + (priority ? 
"head" : "tail")); + } else { + spin_unlock(&imp->imp_lock); + GOTO(out_free, rc = -ENOENT); + } + + spin_unlock(&imp->imp_lock); + RETURN(0); +out_free: + if (imp_conn) + OBD_FREE(imp_conn, sizeof(*imp_conn)); +out_put: + ptlrpc_connection_put(ptlrpc_conn); + RETURN(rc); +} + +int import_set_conn_priority(struct obd_import *imp, struct obd_uuid *uuid) +{ + return import_set_conn(imp, uuid, 1, 0); +} + +int client_import_add_conn(struct obd_import *imp, struct obd_uuid *uuid, + int priority) +{ + return import_set_conn(imp, uuid, priority, 1); +} +EXPORT_SYMBOL(client_import_add_conn); + +int client_import_dyn_add_conn(struct obd_import *imp, struct obd_uuid *uuid, + lnet_nid_t prim_nid, int priority) +{ + struct ptlrpc_connection *ptlrpc_conn; + int rc; + + ptlrpc_conn = ptlrpc_uuid_to_connection(uuid, prim_nid); + if (!ptlrpc_conn) { + const char *str_uuid = obd_uuid2str(uuid); + + rc = class_add_uuid(str_uuid, prim_nid); + if (rc) { + CERROR("%s: failed to add UUID '%s': rc = %d\n", + imp->imp_obd->obd_name, str_uuid, rc); + return rc; + } + } + return import_set_conn(imp, uuid, priority, 1); +} +EXPORT_SYMBOL(client_import_dyn_add_conn); + +int client_import_add_nids_to_conn(struct obd_import *imp, lnet_nid_t *nids, + int nid_count, struct obd_uuid *uuid) +{ + struct obd_import_conn *conn; + int rc = -ENOENT; + + ENTRY; + if (nid_count <= 0 || !nids) + return rc; + + spin_lock(&imp->imp_lock); + list_for_each_entry(conn, &imp->imp_conn_list, oic_item) { + if (class_check_uuid(&conn->oic_uuid, nids[0])) { + *uuid = conn->oic_uuid; + spin_unlock(&imp->imp_lock); + rc = class_add_nids_to_uuid(&conn->oic_uuid, nids, + nid_count); + RETURN(rc); + } + } + spin_unlock(&imp->imp_lock); + RETURN(rc); +} +EXPORT_SYMBOL(client_import_add_nids_to_conn); + +int client_import_del_conn(struct obd_import *imp, struct obd_uuid *uuid) +{ + struct obd_import_conn *imp_conn; + struct obd_export *dlmexp; + int rc = -ENOENT; + + ENTRY; + + spin_lock(&imp->imp_lock); + if (list_empty(&imp->imp_conn_list)) { + LASSERT(!imp->imp_connection); + GOTO(out, rc); + } + + list_for_each_entry(imp_conn, &imp->imp_conn_list, oic_item) { + if (!obd_uuid_equals(uuid, &imp_conn->oic_uuid)) + continue; + LASSERT(imp_conn->oic_conn); + + if (imp_conn == imp->imp_conn_current) { + LASSERT(imp_conn->oic_conn == imp->imp_connection); + + if (imp->imp_state != LUSTRE_IMP_CLOSED && + imp->imp_state != LUSTRE_IMP_DISCON) { + CERROR("can't remove current connection\n"); + GOTO(out, rc = -EBUSY); + } + + ptlrpc_connection_put(imp->imp_connection); + imp->imp_connection = NULL; + + dlmexp = class_conn2export(&imp->imp_dlm_handle); + if (dlmexp && dlmexp->exp_connection) { + LASSERT(dlmexp->exp_connection == + imp_conn->oic_conn); + ptlrpc_connection_put(dlmexp->exp_connection); + dlmexp->exp_connection = NULL; + } + + if (dlmexp != NULL) + class_export_put(dlmexp); + } + + list_del(&imp_conn->oic_item); + ptlrpc_connection_put(imp_conn->oic_conn); + OBD_FREE(imp_conn, sizeof(*imp_conn)); + CDEBUG(D_HA, "imp %p@%s: remove connection %s\n", + imp, imp->imp_obd->obd_name, uuid->uuid); + rc = 0; + break; + } +out: + spin_unlock(&imp->imp_lock); + if (rc == -ENOENT) + CERROR("connection %s not found\n", uuid->uuid); + RETURN(rc); +} +EXPORT_SYMBOL(client_import_del_conn); + +/** + * Find conn UUID by peer NID. \a peer is a server NID. This function is used + * to find a conn uuid of \a imp which can reach \a peer. 
+ */ +int client_import_find_conn(struct obd_import *imp, lnet_nid_t peer, + struct obd_uuid *uuid) +{ + struct obd_import_conn *conn; + int rc = -ENOENT; + + ENTRY; + + spin_lock(&imp->imp_lock); + list_for_each_entry(conn, &imp->imp_conn_list, oic_item) { + /* Check if conn UUID does have this peer NID. */ + if (class_check_uuid(&conn->oic_uuid, peer)) { + *uuid = conn->oic_uuid; + rc = 0; + break; + } + } + spin_unlock(&imp->imp_lock); + RETURN(rc); +} +EXPORT_SYMBOL(client_import_find_conn); + +void client_destroy_import(struct obd_import *imp) +{ + /* + * Drop security policy instance after all RPCs have finished/aborted + * to let all busy contexts be released. + */ + class_import_get(imp); + class_destroy_import(imp); + sptlrpc_import_sec_put(imp); + class_import_put(imp); +} +EXPORT_SYMBOL(client_destroy_import); + +/** + * Check whether or not the OSC is on MDT. + * In the config log, + * osc on MDT + * setup 0:{fsname}-OSTxxxx-osc[-MDTxxxx] 1:lustre-OST0000_UUID 2:NID + * osc on client + * setup 0:{fsname}-OSTxxxx-osc 1:lustre-OST0000_UUID 2:NID + * + **/ +static int osc_on_mdt(char *obdname) +{ + char *ptr; + + ptr = strrchr(obdname, '-'); + if (ptr == NULL) + return 0; + + if (strncmp(ptr + 1, "MDT", 3) == 0) + return 1; + + return 0; +} + +/* + * Configure an RPC client OBD device. + * + * lcfg parameters: + * 1 - client UUID + * 2 - server UUID + * 3 - inactive-on-startup + * 4 - restrictive net + */ +int client_obd_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct client_obd *cli = &obd->u.cli; + struct obd_import *imp; + struct obd_uuid server_uuid; + int rq_portal, rp_portal, connect_op; + const char *name = obd->obd_type->typ_name; + enum ldlm_ns_type ns_type = LDLM_NS_TYPE_UNKNOWN; + char *cli_name = lustre_cfg_buf(lcfg, 0); + int rc; + + ENTRY; + + /* + * In a more perfect world, we would hang a ptlrpc_client off of + * obd_type and just use the values from there. 
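+	 * Instead, the request/reply portals and the connect opcode are
+	 * chosen below based on the obd type name.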
+ */ + if (!strcmp(name, LUSTRE_OSC_NAME)) { + rq_portal = OST_REQUEST_PORTAL; + rp_portal = OSC_REPLY_PORTAL; + connect_op = OST_CONNECT; + cli->cl_sp_me = LUSTRE_SP_CLI; + cli->cl_sp_to = LUSTRE_SP_OST; + ns_type = LDLM_NS_TYPE_OSC; + } else if (!strcmp(name, LUSTRE_MDC_NAME) || + !strcmp(name, LUSTRE_LWP_NAME)) { + rq_portal = MDS_REQUEST_PORTAL; + rp_portal = MDC_REPLY_PORTAL; + connect_op = MDS_CONNECT; + if (is_lwp_on_ost(cli_name)) + cli->cl_sp_me = LUSTRE_SP_OST; + else if (is_lwp_on_mdt(cli_name)) + cli->cl_sp_me = LUSTRE_SP_MDT; + else + cli->cl_sp_me = LUSTRE_SP_CLI; + cli->cl_sp_to = LUSTRE_SP_MDT; + ns_type = LDLM_NS_TYPE_MDC; + } else if (!strcmp(name, LUSTRE_OSP_NAME)) { + if (strstr(lustre_cfg_buf(lcfg, 1), "OST") == NULL) { + /* OSP_on_MDT for other MDTs */ + connect_op = MDS_CONNECT; + cli->cl_sp_to = LUSTRE_SP_MDT; + ns_type = LDLM_NS_TYPE_MDC; + rq_portal = OUT_PORTAL; + } else { + /* OSP on MDT for OST */ + connect_op = OST_CONNECT; + cli->cl_sp_to = LUSTRE_SP_OST; + ns_type = LDLM_NS_TYPE_OSC; + rq_portal = OST_REQUEST_PORTAL; + } + rp_portal = OSC_REPLY_PORTAL; + cli->cl_sp_me = LUSTRE_SP_MDT; + } else if (!strcmp(name, LUSTRE_MGC_NAME)) { + rq_portal = MGS_REQUEST_PORTAL; + rp_portal = MGC_REPLY_PORTAL; + connect_op = MGS_CONNECT; + cli->cl_sp_me = LUSTRE_SP_MGC; + cli->cl_sp_to = LUSTRE_SP_MGS; + cli->cl_flvr_mgc.sf_rpc = SPTLRPC_FLVR_INVALID; + ns_type = LDLM_NS_TYPE_MGC; + } else { + CERROR("unknown client OBD type \"%s\", can't setup\n", + name); + RETURN(-EINVAL); + } + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { + CERROR("requires a TARGET UUID\n"); + RETURN(-EINVAL); + } + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) > 37) { + CERROR("client UUID must be less than 38 characters\n"); + RETURN(-EINVAL); + } + + if (LUSTRE_CFG_BUFLEN(lcfg, 2) < 1) { + CERROR("setup requires a SERVER UUID\n"); + RETURN(-EINVAL); + } + + if (LUSTRE_CFG_BUFLEN(lcfg, 2) > 37) { + CERROR("target UUID must be less than 38 characters\n"); + RETURN(-EINVAL); + } + + init_rwsem(&cli->cl_sem); + mutex_init(&cli->cl_mgc_mutex); + cli->cl_seq = NULL; + init_rwsem(&cli->cl_seq_rwsem); + cli->cl_conn_count = 0; + memcpy(server_uuid.uuid, lustre_cfg_buf(lcfg, 2), + min_t(unsigned int, LUSTRE_CFG_BUFLEN(lcfg, 2), + sizeof(server_uuid))); + + cli->cl_dirty_pages = 0; + cli->cl_dirty_max_pages = 0; + cli->cl_avail_grant = 0; + /* FIXME: Should limit this for the sum of all cl_dirty_max_pages. */ + /* + * cl_dirty_max_pages may be changed at connect time in + * ptlrpc_connect_interpret(). + */ + client_adjust_max_dirty(cli); + init_waitqueue_head(&cli->cl_cache_waiters); + INIT_LIST_HEAD(&cli->cl_loi_ready_list); + INIT_LIST_HEAD(&cli->cl_loi_hp_ready_list); + INIT_LIST_HEAD(&cli->cl_loi_write_list); + INIT_LIST_HEAD(&cli->cl_loi_read_list); + spin_lock_init(&cli->cl_loi_list_lock); + atomic_set(&cli->cl_pending_w_pages, 0); + atomic_set(&cli->cl_pending_r_pages, 0); + cli->cl_r_in_flight = 0; + cli->cl_w_in_flight = 0; + + spin_lock_init(&cli->cl_read_rpc_hist.oh_lock); + spin_lock_init(&cli->cl_write_rpc_hist.oh_lock); + spin_lock_init(&cli->cl_read_page_hist.oh_lock); + spin_lock_init(&cli->cl_write_page_hist.oh_lock); + spin_lock_init(&cli->cl_read_offset_hist.oh_lock); + spin_lock_init(&cli->cl_write_offset_hist.oh_lock); + + /* lru for osc. 
*/ + INIT_LIST_HEAD(&cli->cl_lru_osc); + atomic_set(&cli->cl_lru_shrinkers, 0); + atomic_long_set(&cli->cl_lru_busy, 0); + atomic_long_set(&cli->cl_lru_in_list, 0); + INIT_LIST_HEAD(&cli->cl_lru_list); + spin_lock_init(&cli->cl_lru_list_lock); + atomic_long_set(&cli->cl_unstable_count, 0); + INIT_LIST_HEAD(&cli->cl_shrink_list); + INIT_LIST_HEAD(&cli->cl_grant_chain); + + INIT_LIST_HEAD(&cli->cl_flight_waiters); + cli->cl_rpcs_in_flight = 0; + + init_waitqueue_head(&cli->cl_destroy_waitq); + atomic_set(&cli->cl_destroy_in_flight, 0); + + + cli->cl_supp_cksum_types = OBD_CKSUM_CRC32; + cli->cl_preferred_cksum_type = 0; +#ifdef ENABLE_CHECKSUM + /* Turn on checksumming by default. */ + cli->cl_checksum = 1; + /* + * The supported checksum types will be worked out at connect time + * Set cl_chksum* to CRC32 for now to avoid returning screwed info + * through procfs. + */ + cli->cl_cksum_type = cli->cl_supp_cksum_types; +#endif + atomic_set(&cli->cl_resends, OSC_DEFAULT_RESENDS); + + /* + * Set it to possible maximum size. It may be reduced by ocd_brw_size + * from OFD after connecting. + */ + cli->cl_max_pages_per_rpc = PTLRPC_MAX_BRW_PAGES; + + cli->cl_max_short_io_bytes = OBD_DEF_SHORT_IO_BYTES; + + /* + * set cl_chunkbits default value to PAGE_SHIFT, + * it will be updated at OSC connection time. + */ + cli->cl_chunkbits = PAGE_SHIFT; + + if (!strcmp(name, LUSTRE_MDC_NAME)) { + cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_DEFAULT; + } else if (cfs_totalram_pages() >> (20 - PAGE_SHIFT) <= 128 /* MB */) { + cli->cl_max_rpcs_in_flight = 2; + } else if (cfs_totalram_pages() >> (20 - PAGE_SHIFT) <= 256 /* MB */) { + cli->cl_max_rpcs_in_flight = 3; + } else if (cfs_totalram_pages() >> (20 - PAGE_SHIFT) <= 512 /* MB */) { + cli->cl_max_rpcs_in_flight = 4; + } else { + if (osc_on_mdt(obd->obd_name)) + cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_MAX; + else + cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_DEFAULT; + } + + spin_lock_init(&cli->cl_mod_rpcs_lock); + spin_lock_init(&cli->cl_mod_rpcs_hist.oh_lock); + cli->cl_max_mod_rpcs_in_flight = 0; + cli->cl_mod_rpcs_in_flight = 0; + cli->cl_close_rpcs_in_flight = 0; + init_waitqueue_head(&cli->cl_mod_rpcs_waitq); + cli->cl_mod_rpcs_init = ktime_get_real(); + cli->cl_mod_tag_bitmap = NULL; + + INIT_LIST_HEAD(&cli->cl_chg_dev_linkage); + + if (connect_op == MDS_CONNECT) { + cli->cl_max_mod_rpcs_in_flight = cli->cl_max_rpcs_in_flight - 1; + OBD_ALLOC(cli->cl_mod_tag_bitmap, + BITS_TO_LONGS(OBD_MAX_RIF_MAX) * sizeof(long)); + if (cli->cl_mod_tag_bitmap == NULL) + GOTO(err, rc = -ENOMEM); + } + + rc = ldlm_get_ref(); + if (rc) { + CERROR("ldlm_get_ref failed: %d\n", rc); + GOTO(err, rc); + } + + ptlrpc_init_client(rq_portal, rp_portal, name, + &obd->obd_ldlm_client); + + imp = class_new_import(obd); + if (imp == NULL) + GOTO(err_ldlm, rc = -ENOENT); + imp->imp_client = &obd->obd_ldlm_client; + imp->imp_connect_op = connect_op; + memcpy(cli->cl_target_uuid.uuid, lustre_cfg_buf(lcfg, 1), + LUSTRE_CFG_BUFLEN(lcfg, 1)); + class_import_put(imp); + + if (lustre_cfg_buf(lcfg, 4)) { + __u32 refnet = libcfs_str2net(lustre_cfg_string(lcfg, 4)); + + if (refnet == LNET_NET_ANY) { + rc = -EINVAL; + CERROR("%s: bad mount option 'network=%s': rc = %d\n", + obd->obd_name, lustre_cfg_string(lcfg, 4), + rc); + GOTO(err_import, rc); + } + imp->imp_conn_restricted_net = refnet; + } else { + imp->imp_conn_restricted_net = LNET_NIDNET(LNET_NID_ANY); + } + + rc = client_import_add_conn(imp, &server_uuid, 1); + if (rc) { + CERROR("can't add initial connection\n"); + GOTO(err_import, rc); + 
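The ladder above picks a default RPC concurrency from total RAM. A userspace sketch of the same sizing rule; the values 8 and 512 stand in for OBD_MAX_RIF_DEFAULT and OBD_MAX_RIF_MAX and are assumptions here:

#include <stdio.h>

static int max_rpcs_in_flight(unsigned long total_ram_mb, int is_mdc,
			      int osc_on_mdt)
{
	if (is_mdc)
		return 8;		/* assumed OBD_MAX_RIF_DEFAULT */
	if (total_ram_mb <= 128)
		return 2;
	if (total_ram_mb <= 256)
		return 3;
	if (total_ram_mb <= 512)
		return 4;
	return osc_on_mdt ? 512 /* assumed OBD_MAX_RIF_MAX */ : 8;
}

int main(void)
{
	printf("%d\n", max_rpcs_in_flight(256, 0, 0));	/* 3 */
	printf("%d\n", max_rpcs_in_flight(4096, 0, 0));	/* 8 */
	return 0;
}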
} + imp->imp_connection = NULL; + + cli->cl_import = imp; + /* cli->cl_max_mds_easize updated by mdc_init_ea_size() */ + cli->cl_max_mds_easize = sizeof(struct lov_mds_md_v3); + + if (LUSTRE_CFG_BUFLEN(lcfg, 3) > 0) { + if (!strcmp(lustre_cfg_string(lcfg, 3), "inactive")) { + CDEBUG(D_HA, "marking %s %s->%s as inactive\n", + name, obd->obd_name, + cli->cl_target_uuid.uuid); + spin_lock(&imp->imp_lock); + imp->imp_deactive = 1; + spin_unlock(&imp->imp_lock); + } + } + + obd->obd_namespace = ldlm_namespace_new(obd, obd->obd_name, + LDLM_NAMESPACE_CLIENT, + LDLM_NAMESPACE_GREEDY, + ns_type); + if (IS_ERR(obd->obd_namespace)) { + rc = PTR_ERR(obd->obd_namespace); + CERROR("%s: unable to create client namespace: rc = %d\n", + obd->obd_name, rc); + obd->obd_namespace = NULL; + GOTO(err_import, rc); + } + + RETURN(rc); + +err_import: + class_destroy_import(imp); +err_ldlm: + ldlm_put_ref(); +err: + if (cli->cl_mod_tag_bitmap != NULL) + OBD_FREE(cli->cl_mod_tag_bitmap, + BITS_TO_LONGS(OBD_MAX_RIF_MAX) * sizeof(long)); + cli->cl_mod_tag_bitmap = NULL; + + RETURN(rc); +} +EXPORT_SYMBOL(client_obd_setup); + +int client_obd_cleanup(struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + + ENTRY; + + ldlm_namespace_free_post(obd->obd_namespace); + obd->obd_namespace = NULL; + + obd_cleanup_client_import(obd); + LASSERT(obd->u.cli.cl_import == NULL); + + ldlm_put_ref(); + + if (cli->cl_mod_tag_bitmap != NULL) + OBD_FREE(cli->cl_mod_tag_bitmap, + BITS_TO_LONGS(OBD_MAX_RIF_MAX) * sizeof(long)); + cli->cl_mod_tag_bitmap = NULL; + + RETURN(0); +} +EXPORT_SYMBOL(client_obd_cleanup); + +/* ->o_connect() method for client side (OSC and MDC and MGC) */ +int client_connect_import(const struct lu_env *env, + struct obd_export **exp, + struct obd_device *obd, struct obd_uuid *cluuid, + struct obd_connect_data *data, void *localdata) +{ + struct client_obd *cli = &obd->u.cli; + struct obd_import *imp = cli->cl_import; + struct obd_connect_data *ocd; + struct lustre_handle conn = { 0 }; + int rc; + + ENTRY; + + *exp = NULL; + down_write(&cli->cl_sem); + if (cli->cl_conn_count > 0) + GOTO(out_sem, rc = -EALREADY); + + rc = class_connect(&conn, obd, cluuid); + if (rc) + GOTO(out_sem, rc); + + cli->cl_conn_count++; + *exp = class_conn2export(&conn); + + LASSERT(obd->obd_namespace); + + imp->imp_dlm_handle = conn; + rc = ptlrpc_init_import(imp); + if (rc != 0) + GOTO(out_ldlm, rc); + + ocd = &imp->imp_connect_data; + if (data) { + *ocd = *data; + imp->imp_connect_flags_orig = data->ocd_connect_flags; + imp->imp_connect_flags2_orig = data->ocd_connect_flags2; + } + + rc = ptlrpc_connect_import(imp); + if (rc != 0) { + LASSERT(imp->imp_state == LUSTRE_IMP_DISCON); + GOTO(out_ldlm, rc); + } + LASSERT(*exp != NULL && (*exp)->exp_connection); + + if (data) { + LASSERTF((ocd->ocd_connect_flags & data->ocd_connect_flags) == + ocd->ocd_connect_flags, "old %#llx, new %#llx\n", + data->ocd_connect_flags, ocd->ocd_connect_flags); + data->ocd_connect_flags = ocd->ocd_connect_flags; + data->ocd_connect_flags2 = ocd->ocd_connect_flags2; + } + + ptlrpc_pinger_add_import(imp); + + EXIT; + + if (rc) { +out_ldlm: + cli->cl_conn_count--; + class_disconnect(*exp); + *exp = NULL; + } +out_sem: + up_write(&cli->cl_sem); + + if (!rc && localdata) { + LASSERT(cli->cl_cache == NULL); /* only once */ + cli->cl_cache = (struct cl_client_cache *)localdata; + cl_cache_incref(cli->cl_cache); + cli->cl_lru_left = &cli->cl_cache->ccc_lru_left; + + /* add this osc into entity list */ + LASSERT(list_empty(&cli->cl_lru_osc)); + 
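The LASSERTF in client_connect_import() above encodes the negotiation invariant: the server may clear connect flags the client requested, but must never grant a flag that was not asked for. A minimal sketch of that subset check:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* The granted mask must be a subset of the requested mask. */
static int reply_is_subset(uint64_t requested, uint64_t granted)
{
	return (granted & requested) == granted;
}

int main(void)
{
	assert(reply_is_subset(0x0f, 0x03));	/* server dropped bits: fine */
	assert(!reply_is_subset(0x0f, 0x13));	/* server invented a bit: bug */
	printf("subset invariant holds\n");
	return 0;
}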
spin_lock(&cli->cl_cache->ccc_lru_lock); + list_add(&cli->cl_lru_osc, &cli->cl_cache->ccc_lru); + spin_unlock(&cli->cl_cache->ccc_lru_lock); + } + + return rc; +} +EXPORT_SYMBOL(client_connect_import); + +int client_disconnect_export(struct obd_export *exp) +{ + struct obd_device *obd = class_exp2obd(exp); + struct client_obd *cli; + struct obd_import *imp; + int rc = 0, err; + + ENTRY; + + if (!obd) { + CERROR("invalid export for disconnect: exp %p cookie %#llx\n", + exp, exp ? exp->exp_handle.h_cookie : -1); + RETURN(-EINVAL); + } + + cli = &obd->u.cli; + imp = cli->cl_import; + + down_write(&cli->cl_sem); + CDEBUG(D_INFO, "disconnect %s - %zu\n", obd->obd_name, + cli->cl_conn_count); + + if (cli->cl_conn_count == 0) { + CERROR("disconnecting disconnected device (%s)\n", + obd->obd_name); + GOTO(out_disconnect, rc = -EINVAL); + } + + cli->cl_conn_count--; + if (cli->cl_conn_count != 0) + GOTO(out_disconnect, rc = 0); + + /* + * Mark import deactivated now, so we don't try to reconnect if any + * of the cleanup RPCs fails (e.g. LDLM cancel, etc). We don't + * fully deactivate the import, or that would drop all requests. + */ + spin_lock(&imp->imp_lock); + imp->imp_deactive = 1; + spin_unlock(&imp->imp_lock); + + /* + * Some non-replayable imports (MDS's OSCs) are pinged, so just + * delete it regardless. (It's safe to delete an import that was + * never added.) + */ + (void)ptlrpc_pinger_del_import(imp); + + if (obd->obd_namespace != NULL) { + /* obd_force == local only */ + ldlm_cli_cancel_unused(obd->obd_namespace, NULL, + obd->obd_force ? LCF_LOCAL : 0, NULL); + ldlm_namespace_free_prior(obd->obd_namespace, imp, + obd->obd_force); + } + + /* + * There's no need to hold sem while disconnecting an import, + * and it may actually cause deadlock in GSS. + */ + up_write(&cli->cl_sem); + rc = ptlrpc_disconnect_import(imp, 0); + down_write(&cli->cl_sem); + + ptlrpc_invalidate_import(imp); + + EXIT; + +out_disconnect: + /* + * Use server style - class_disconnect should be always called for + * o_disconnect. + */ + err = class_disconnect(exp); + if (!rc && err) + rc = err; + + up_write(&cli->cl_sem); + + RETURN(rc); +} +EXPORT_SYMBOL(client_disconnect_export); + +#ifdef HAVE_SERVER_SUPPORT +int server_disconnect_export(struct obd_export *exp) +{ + int rc; + + ENTRY; + + /* Disconnect early so that clients can't keep using export. */ + rc = class_disconnect(exp); + /* Close import to avoid sending any requests. 
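client_disconnect_export() above only performs the real teardown when the last connection count is dropped; earlier callers just decrement. A minimal sketch of that counting scheme, with a pthread mutex standing in for cl_sem:

#include <pthread.h>
#include <stdio.h>

struct client_demo {
	pthread_mutex_t sem;		/* stands in for cl_sem */
	unsigned int conn_count;
};

/* Returns 1 when this call was the one that tore the client down. */
static int client_disconnect(struct client_demo *cli)
{
	int last = 0;

	pthread_mutex_lock(&cli->sem);
	if (cli->conn_count == 0)
		fprintf(stderr, "disconnecting disconnected device\n");
	else if (--cli->conn_count == 0)
		last = 1;	/* the real code cancels locks and
				 * disconnects the import here */
	pthread_mutex_unlock(&cli->sem);
	return last;
}

int main(void)
{
	struct client_demo cli = { PTHREAD_MUTEX_INITIALIZER, 2 };
	int a = client_disconnect(&cli);
	int b = client_disconnect(&cli);

	printf("%d %d\n", a, b);	/* prints "0 1" */
	return 0;
}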
*/ + if (exp->exp_imp_reverse) + ptlrpc_cleanup_imp(exp->exp_imp_reverse); + + ldlm_bl_thread_wakeup(); + + /* complete all outstanding replies */ + spin_lock(&exp->exp_lock); + while (!list_empty(&exp->exp_outstanding_replies)) { + struct ptlrpc_reply_state *rs = + list_entry(exp->exp_outstanding_replies.next, + struct ptlrpc_reply_state, rs_exp_list); + struct ptlrpc_service_part *svcpt = rs->rs_svcpt; + + spin_lock(&svcpt->scp_rep_lock); + + list_del_init(&rs->rs_exp_list); + + spin_lock(&rs->rs_lock); + /* clear rs_convert_lock to make sure rs is handled and put */ + rs->rs_convert_lock = 0; + ptlrpc_schedule_difficult_reply(rs); + spin_unlock(&rs->rs_lock); + + spin_unlock(&svcpt->scp_rep_lock); + } + spin_unlock(&exp->exp_lock); + + RETURN(rc); +} +EXPORT_SYMBOL(server_disconnect_export); + +static inline int target_check_recovery_timer(struct obd_device *target) +{ + ktime_t remaining; + s64 timeout; + + if (!target->obd_recovering || target->obd_recovery_start == 0) + return 0; + + remaining = hrtimer_get_remaining(&target->obd_recovery_timer); + timeout = ktime_divns(remaining, NSEC_PER_SEC); + if (timeout > -30) + return 0; + + /* the recovery timer should expire, but it isn't triggered, + * it's better to abort the recovery of this target to speed up + * the recovery of the whole cluster. */ + spin_lock(&target->obd_dev_lock); + if (target->obd_recovering) { + CERROR("%s: Aborting recovery\n", target->obd_name); + target->obd_abort_recovery = 1; + wake_up(&target->obd_next_transno_waitq); + } + spin_unlock(&target->obd_dev_lock); + return 0; +} + +/* + * -------------------------------------------------------------------------- + * from old lib/target.c + * -------------------------------------------------------------------------- + */ +static int target_handle_reconnect(struct lustre_handle *conn, + struct obd_export *exp, + struct obd_uuid *cluuid) +{ + struct obd_device *target; + struct lustre_handle *hdl; + ktime_t remaining; + s64 timeout; + int rc = 0; + + ENTRY; + hdl = &exp->exp_imp_reverse->imp_remote_handle; + if (!exp->exp_connection || !lustre_handle_is_used(hdl)) { + conn->cookie = exp->exp_handle.h_cookie; + CDEBUG(D_HA, + "connect export for UUID '%s' at %p, cookie %#llx\n", + cluuid->uuid, exp, conn->cookie); + RETURN(0); + } + + target = exp->exp_obd; + + /* Might be a re-connect after a partition. */ + if (memcmp(&conn->cookie, &hdl->cookie, sizeof(conn->cookie))) { + LCONSOLE_WARN("%s: already connected client %s (at %s) with handle %#llx. Rejecting client with the same UUID trying to reconnect with handle %#llx\n", + target->obd_name, + obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp), + hdl->cookie, conn->cookie); + memset(conn, 0, sizeof(*conn)); + /* + * target_handle_connect() treats EALREADY and + * -EALREADY differently. -EALREADY is an error + * (same UUID, different handle). 
+ */
+		RETURN(-EALREADY);
+	}
+
+	if (!target->obd_recovering) {
+		LCONSOLE_WARN("%s: Client %s (at %s) reconnecting\n",
+			      target->obd_name,
+			      obd_uuid2str(&exp->exp_client_uuid),
+			      obd_export_nid2str(exp));
+		GOTO(out_already, rc);
+	}
+
+	remaining = hrtimer_get_remaining(&target->obd_recovery_timer);
+	timeout = ktime_divns(remaining, NSEC_PER_SEC);
+	if (timeout > 0) {
+		LCONSOLE_WARN("%s: Client %s (at %s) reconnected, waiting for %d clients in recovery for %lld:%.02lld\n",
+			      target->obd_name,
+			      obd_uuid2str(&exp->exp_client_uuid),
+			      obd_export_nid2str(exp),
+			      atomic_read(&target->obd_max_recoverable_clients),
+			      timeout / 60, timeout % 60);
+	} else {
+		struct target_distribute_txn_data *tdtd;
+		int size = 0;
+		int count = 0;
+		char *buf = NULL;
+
+		target_check_recovery_timer(target);
+
+		tdtd = class_exp2tgt(exp)->lut_tdtd;
+		if (tdtd && tdtd->tdtd_show_update_logs_retrievers)
+			buf = tdtd->tdtd_show_update_logs_retrievers(
+				tdtd->tdtd_show_retrievers_cbdata,
+				&size, &count);
+
+		if (count > 0)
+			LCONSOLE_WARN("%s: Client %s (at %s) reconnecting, waiting for %d MDTs (%s) in recovery for %lld:%.02lld. Please wait until all MDTs have recovered, or you may force MDT eviction via 'lctl --device %s abort_recovery'.\n",
+				      target->obd_name,
+				      obd_uuid2str(&exp->exp_client_uuid),
+				      obd_export_nid2str(exp), count,
+				      buf ? buf : "unknown (not enough RAM)",
+				      (abs(timeout) + target->obd_recovery_timeout) / 60,
+				      (abs(timeout) + target->obd_recovery_timeout) % 60,
+				      target->obd_name);
+		else
+			LCONSOLE_WARN("%s: Recovery already passed deadline %lld:%.02lld. If you do not want to wait longer, you may force target eviction via 'lctl --device %s abort_recovery'.\n",
+				      target->obd_name, abs(timeout) / 60,
+				      abs(timeout) % 60, target->obd_name);
+
+		if (buf != NULL)
+			OBD_FREE(buf, size);
+	}
+
+out_already:
+	conn->cookie = exp->exp_handle.h_cookie;
+	/*
+	 * target_handle_connect() treats EALREADY and
+	 * -EALREADY differently. EALREADY means we are
+	 * doing a valid reconnect from the same client.
+	 */
+	RETURN(EALREADY);
+}
+
+static void
+check_and_start_recovery_timer(struct obd_device *obd,
+			       struct ptlrpc_request *req, int new_client);
+
+/**
+ * Update flags for the reverse import during the reconnect process.
+ */
+static int rev_import_flags_update(struct obd_import *revimp,
+				   struct ptlrpc_request *req)
+{
+	int rc;
+	struct obd_connect_data *data;
+
+	data = req_capsule_client_get(&req->rq_pill, &RMF_CONNECT_DATA);
+
+	if (data->ocd_connect_flags & OBD_CONNECT_AT)
+		revimp->imp_msghdr_flags |= MSGHDR_AT_SUPPORT;
+	else
+		revimp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT;
+
+	revimp->imp_msghdr_flags |= MSGHDR_CKSUM_INCOMPAT18;
+
+	revimp->imp_connect_data = *data;
+	rc = sptlrpc_import_sec_adapt(revimp, req->rq_svc_ctx, &req->rq_flvr);
+	if (rc) {
+		CERROR("%s: cannot get reverse import %s security: rc = %d\n",
+		       revimp->imp_client->cli_name,
+		       libcfs_id2str(req->rq_peer), rc);
+		return rc;
+	}
+
+	return 0;
+}
+
+/**
+ * Allocate a new reverse import for an export.
+ *
+ * \retval -errno in case of error
+ * \retval 0 if the reverse import was correctly initialized
+ **/
+int rev_import_init(struct obd_export *export)
+{
+	struct obd_device *obd = export->exp_obd;
+	struct obd_import *revimp;
+
+	LASSERT(export->exp_imp_reverse == NULL);
+
+	revimp = class_new_import(obd);
+	if (revimp == NULL)
+		return -ENOMEM;
+
+	revimp->imp_remote_handle.cookie = 0ULL;
+	revimp->imp_client = &obd->obd_ldlm_client;
+	revimp->imp_dlm_fake = 1;
+
+	/* it is safe to connect import in new state as no sends possible */
+	spin_lock(&export->exp_lock);
+	export->exp_imp_reverse = revimp;
+	spin_unlock(&export->exp_lock);
+	class_import_put(revimp);
+
+	return 0;
+}
+EXPORT_SYMBOL(rev_import_init);
+
+/**
+ * Handle reconnect for an export.
+ *
+ * \param exp export to handle the reconnect for
+ * \param req client reconnect request
+ *
+ * \retval -rc in case the security flavor can't be changed
+ * \retval 0 in case of no problems
+ */
+static int rev_import_reconnect(struct obd_export *exp,
+				struct ptlrpc_request *req)
+{
+	struct obd_import *revimp = exp->exp_imp_reverse;
+	struct lustre_handle *lh;
+	int rc;
+
+	/* avoid sending a request until import flags are changed */
+	ptlrpc_import_enter_resend(revimp);
+
+	ptlrpc_connection_put(revimp->imp_connection);
+
+	/*
+	 * A client in recovery does not have a handle, so it must be taken
+	 * from the request. This may let the wrong client connect to
+	 * recovery, since only the client UUID is trusted here.
+	 */
+	lh = req_capsule_client_get(&req->rq_pill, &RMF_CONN);
+	revimp->imp_remote_handle = *lh;
+
+	/*
+	 * unknown versions will be caught in
+	 * ptlrpc_handle_server_req_in->lustre_unpack_msg()
+	 */
+	revimp->imp_msg_magic = req->rq_reqmsg->lm_magic;
+
+	revimp->imp_connection = ptlrpc_connection_addref(exp->exp_connection);
+
+	rc = rev_import_flags_update(revimp, req);
+	if (rc != 0) {
+		/*
+		 * It is safe to remain in the RECOVERY phase: without a
+		 * correct security flavor set up, requests cannot be
+		 * delivered anyway.
+		 */
+		return rc;
+	}
+
+	/* resend all RPCs via the new connection */
+	return ptlrpc_import_recovery_state_machine(revimp);
+}
+
+int target_handle_connect(struct ptlrpc_request *req)
+{
+	struct obd_device *target = NULL;
+	struct obd_export *export = NULL;
+	/*
+	 * connect handle - filled from target_handle_reconnect in
+	 * reconnect case
+	 */
+	struct lustre_handle conn;
+	struct lustre_handle *tmp;
+	struct obd_uuid cluuid;
+	char *str;
+	int rc = 0;
+	char *target_start;
+	int target_len;
+	bool mds_conn = false, lw_client = false, initial_conn = false;
+	bool mds_mds_conn = false;
+	bool new_mds_mds_conn = false;
+	struct obd_connect_data *data, *tmpdata;
+	int size, tmpsize;
+	lnet_nid_t *client_nid = NULL;
+	struct ptlrpc_connection *pcon = NULL;
+
+	ENTRY;
+
+	OBD_RACE(OBD_FAIL_TGT_CONN_RACE);
+
+	str = req_capsule_client_get(&req->rq_pill, &RMF_TGTUUID);
+	if (str == NULL) {
+		DEBUG_REQ(D_ERROR, req, "bad target UUID for connect");
+		GOTO(out, rc = -EINVAL);
+	}
+
+	target = class_dev_by_str(str);
+	if (!target) {
+		deuuidify(str, NULL, &target_start, &target_len);
+		LCONSOLE_ERROR_MSG(0x137,
+				   "%s: not available for connect from %s (no target).
If you are running an HA pair check that the target is mounted on the other server.\n", + str, libcfs_nid2str(req->rq_peer.nid)); + GOTO(out, rc = -ENODEV); + } + + spin_lock(&target->obd_dev_lock); + + target->obd_conn_inprogress++; + + if (target->obd_stopping || !target->obd_set_up) { + spin_unlock(&target->obd_dev_lock); + + deuuidify(str, NULL, &target_start, &target_len); + LCONSOLE_INFO("%.*s: Not available for connect from %s (%s)\n", + target_len, target_start, + libcfs_nid2str(req->rq_peer.nid), + (target->obd_stopping ? + "stopping" : "not set up")); + GOTO(out, rc = -ENODEV); + } + + if (target->obd_no_conn) { + spin_unlock(&target->obd_dev_lock); + + CDEBUG(D_INFO, + "%s: Temporarily refusing client connection from %s\n", + target->obd_name, libcfs_nid2str(req->rq_peer.nid)); + GOTO(out, rc = -EAGAIN); + } + + spin_unlock(&target->obd_dev_lock); + + str = req_capsule_client_get(&req->rq_pill, &RMF_CLUUID); + if (str == NULL) { + DEBUG_REQ(D_ERROR, req, "bad client UUID for connect"); + GOTO(out, rc = -EINVAL); + } + + obd_str2uuid(&cluuid, str); + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_CONN); + if (tmp == NULL) + GOTO(out, rc = -EPROTO); + + conn = *tmp; + + size = req_capsule_get_size(&req->rq_pill, &RMF_CONNECT_DATA, + RCL_CLIENT); + if (size < 0 || size > 8 * sizeof(struct obd_connect_data)) + GOTO(out, rc = -EPROTO); + data = req_capsule_client_get(&req->rq_pill, &RMF_CONNECT_DATA); + if (!data) + GOTO(out, rc = -EPROTO); + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + GOTO(out, rc); + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0) + /* + * Don't allow clients to connect that are using old 1.8 format + * protocol conventions (LUSTRE_MSG_MAGIC_v1, !MSGHDR_CKSUM_INCOMPAT18, + * ldlm_flock_policy_wire format, MDT_ATTR_xTIME_SET, etc). The + * FULL20 flag should be set on all connections since 2.0, but no + * longer affects behaviour. + * + * Later this check will be disabled and the flag can be retired + * completely once interop with 3.0 is no longer needed. + */ + if (!(data->ocd_connect_flags & OBD_CONNECT_FULL20)) + GOTO(out, rc = -EPROTO); + + /* + * Don't allow liblustre clients to connect. + * - testing was disabled in v2_2_50_0-61-g6a75d65 + * - building was disabled in v2_5_58_0-28-g7277179 + * - client code was deleted in v2_6_50_0-101-gcdfbc72, + * - clients were refused connect for version difference > 0.0.1.32 + */ + if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) { + DEBUG_REQ(D_WARNING, req, "Refusing libclient connection"); + GOTO(out, rc = -EPROTO); + } +#endif + + /* + * Note: lw_client is needed in MDS-MDS failover during update log + * processing, so we needs to allow lw_client to be connected at + * anytime, instead of only the initial connection + */ + lw_client = OCD_HAS_FLAG(data, LIGHTWEIGHT); + + if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_INITIAL) { + initial_conn = true; + mds_conn = OCD_HAS_FLAG(data, MDS); + mds_mds_conn = OCD_HAS_FLAG(data, MDS_MDS); + + /* + * OBD_CONNECT_MNE_SWAB is removed at 2.14 + * Checking OBD_CONNECT_FID can be removed in the future. + * + * Via check OBD_CONNECT_FID, we can distinguish whether + * the OBD_CONNECT_MDS_MDS/OBD_CONNECT_MNE_SWAB is from + * MGC or MDT, since MGC does not use OBD_CONNECT_FID. 
+ */ + if (!lw_client && + (data->ocd_connect_flags & OBD_CONNECT_MDS_MDS) && + (data->ocd_connect_flags & OBD_CONNECT_FID) && + (data->ocd_connect_flags & OBD_CONNECT_VERSION)) { + __u32 major = OBD_OCD_VERSION_MAJOR(data->ocd_version); + __u32 minor = OBD_OCD_VERSION_MINOR(data->ocd_version); + __u32 patch = OBD_OCD_VERSION_PATCH(data->ocd_version); + + /* + * We do not support the MDT-MDT interoperations with + * different version MDT because of protocol changes. + */ + if (unlikely(major != LUSTRE_MAJOR || + minor != LUSTRE_MINOR || + abs(patch - LUSTRE_PATCH) > 3)) { + LCONSOLE_WARN("%s (%u.%u.%u.%u) refused the connection from different version MDT (%d.%d.%d.%d) %s %s\n", + target->obd_name, LUSTRE_MAJOR, + LUSTRE_MINOR, LUSTRE_PATCH, + LUSTRE_FIX, major, minor, patch, + OBD_OCD_VERSION_FIX(data->ocd_version), + libcfs_nid2str(req->rq_peer.nid), + str); + GOTO(out, rc = -EPROTO); + } + } + } + + /* lctl gets a backstage, all-access pass. */ + if (obd_uuid_equals(&cluuid, &target->obd_uuid)) + goto dont_check_exports; + + export = obd_uuid_lookup(target, &cluuid); + if (!export) + goto no_export; + + /* We've found an export in the hash. */ + + spin_lock(&export->exp_lock); + + if (export->exp_connecting) { /* b=9635, et. al. */ + spin_unlock(&export->exp_lock); + LCONSOLE_WARN("%s: Export %p already connecting from %s\n", + export->exp_obd->obd_name, export, + libcfs_nid2str(req->rq_peer.nid)); + class_export_put(export); + export = NULL; + rc = -EALREADY; + } else if ((mds_conn || (lw_client && initial_conn) || + OCD_HAS_FLAG(data, MDS_MDS)) && export->exp_connection) { + spin_unlock(&export->exp_lock); + if (req->rq_peer.nid != + lnet_nid_to_nid4(&export->exp_connection->c_peer.nid)) { + /* MDS or LWP reconnected after failover. */ + LCONSOLE_WARN("%s: Received %s connection from %s, removing former export from %s\n", + target->obd_name, + lw_client ? "LWP" : "MDS", + libcfs_nid2str(req->rq_peer.nid), + libcfs_nidstr(&export->exp_connection->c_peer.nid)); + } else { + /* New connection from the same NID. */ + LCONSOLE_WARN("%s: Received new %s connection from %s, %s former export from same NID\n", + target->obd_name, + lw_client ? "LWP" : "MDS", + libcfs_nid2str(req->rq_peer.nid), + OCD_HAS_FLAG(data, MDS_MDS) ? + "keep" : "remove"); + } + + if (req->rq_peer.nid == + lnet_nid_to_nid4(&export->exp_connection->c_peer.nid) && + OCD_HAS_FLAG(data, MDS_MDS)) { + /* + * Because exports between MDTs will always be + * kept, let's do not fail such export if they + * come from the same NID, otherwise it might + * cause eviction between MDTs, which might + * cause namespace inconsistency + */ + spin_lock(&export->exp_lock); + export->exp_connecting = 1; + export->exp_conn_cnt = 0; + spin_unlock(&export->exp_lock); + conn.cookie = export->exp_handle.h_cookie; + rc = EALREADY; + } else { + class_fail_export(export); + class_export_put(export); + export = NULL; + rc = 0; + } + } else if (export->exp_connection != NULL && initial_conn && + req->rq_peer.nid != lnet_nid_to_nid4(&export->exp_connection->c_peer.nid)) { + spin_unlock(&export->exp_lock); + /* In MDS failover we have static UUID but NID can change. 
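Of the reconnect cases above, only a same-NID connection carrying OBD_CONNECT_MDS_MDS keeps its existing export; everything else fails the old export and starts over. A compact sketch of that decision, under the assumption that these two inputs are the only ones that matter at this point:

#include <stdio.h>

enum reconn_action { RECONN_KEEP, RECONN_REPLACE };

static enum reconn_action classify(int same_nid, int mds_mds)
{
	/* Failing an MDS-MDS export from the same NID could cause
	 * cross-MDT evictions, so it is kept instead. */
	return (same_nid && mds_mds) ? RECONN_KEEP : RECONN_REPLACE;
}

int main(void)
{
	printf("same NID, MDS_MDS: %s\n",
	       classify(1, 1) == RECONN_KEEP ? "keep" : "replace");
	printf("new NID, MDS_MDS:  %s\n",
	       classify(0, 1) == RECONN_KEEP ? "keep" : "replace");
	return 0;
}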
*/ + LCONSOLE_WARN("%s: Client %s seen on new nid %s when existing nid %s is already connected\n", + target->obd_name, cluuid.uuid, + libcfs_nid2str(req->rq_peer.nid), + libcfs_nidstr( + &export->exp_connection->c_peer.nid)); + rc = -EALREADY; + class_export_put(export); + export = NULL; + } else if (OBD_FAIL_PRECHECK(OBD_FAIL_TGT_RECOVERY_CONNECT) && + !lw_client) { + spin_unlock(&export->exp_lock); + rc = -EAGAIN; + } else { + export->exp_connecting = 1; + spin_unlock(&export->exp_lock); + LASSERT(export->exp_obd == target); + + rc = target_handle_reconnect(&conn, export, &cluuid); + } + + /* If we found an export, we already unlocked. */ + if (!export) { +no_export: + OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_DELAY_CONNECT, 2 * obd_timeout); + } else if (req->rq_export == NULL && + atomic_read(&export->exp_rpc_count) > 0) { + LCONSOLE_WARN("%s: Client %s (at %s) refused connection, still busy with %d references\n", + target->obd_name, cluuid.uuid, + libcfs_nid2str(req->rq_peer.nid), + refcount_read(&export->exp_handle.h_ref)); + GOTO(out, rc = -EBUSY); + } else if (lustre_msg_get_conn_cnt(req->rq_reqmsg) == 1 && + rc != EALREADY) { + if (!strstr(cluuid.uuid, "mdt")) + LCONSOLE_WARN("%s: Rejecting reconnect from the known client %s (at %s) because it is indicating it is a new client\n", + target->obd_name, cluuid.uuid, + libcfs_nid2str(req->rq_peer.nid)); + GOTO(out, rc = -EALREADY); + } else { + OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_DELAY_RECONNECT, 2 * obd_timeout); + } + + if (rc < 0) + GOTO(out, rc); + + CDEBUG(D_HA, "%s: connection from %s@%s %st%llu exp %p cur %lld last %lld\n", + target->obd_name, cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), + target->obd_recovering ? "recovering/" : "", data->ocd_transno, + export, ktime_get_seconds(), + export ? export->exp_last_request_time : 0); + + /* + * If this is the first time a client connects, reset the recovery + * timer. Discard lightweight connections which might be local. + */ + if (!lw_client && rc == 0 && target->obd_recovering) + check_and_start_recovery_timer(target, req, export == NULL); + + /* + * We want to handle EALREADY but *not* -EALREADY from + * target_handle_reconnect(), return reconnection state in a flag. + */ + if (rc == EALREADY) { + lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECONNECT); + rc = 0; + } else { + LASSERT(rc == 0); + } + + /* Tell the client if we support replayable requests. 
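The comment above leans on the sign of EALREADY: the positive value means a valid reconnect (reported to the client as a flag), while -EALREADY is a genuine conflict. A tiny userspace sketch of consuming that convention:

#include <errno.h>
#include <stdio.h>

static const char *interpret(int rc)
{
	if (rc == EALREADY)
		return "valid reconnect: set MSG_CONNECT_RECONNECT in reply";
	if (rc == -EALREADY)
		return "error: same UUID but a different handle";
	return rc < 0 ? "other error" : "fresh connection";
}

int main(void)
{
	printf("%s\n", interpret(EALREADY));
	printf("%s\n", interpret(-EALREADY));
	return 0;
}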
*/ + if (target->obd_replayable) + lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_REPLAYABLE); + client_nid = &req->rq_peer.nid; + + if (export == NULL) { + /* allow lightweight connections during recovery */ + /* + * allow "new" MDT to be connected during recovery, since we + * need retrieve recovery update records from it + */ + if (target->obd_recovering && !lw_client && !mds_mds_conn) { + struct hrtimer *timer = &target->obd_recovery_timer; + ktime_t remaining; + s64 timeout, left; + int in_progress; + int connected; + int known; + int stale; + char *msg; + + connected = atomic_read(&target->obd_connected_clients); + in_progress = atomic_read(&target->obd_lock_replay_clients); + known = + atomic_read(&target->obd_max_recoverable_clients); + stale = target->obd_stale_clients; + remaining = hrtimer_get_remaining(timer); + left = ktime_divns(remaining, NSEC_PER_SEC); + + if (ktime_to_ns(remaining) > 0) { + msg = "to recover in"; + timeout = left; + } else { + msg = "already passed deadline"; + timeout = -left; + + target_check_recovery_timer(target); + } + + LCONSOLE_WARN("%s: Denying connection for new client %s (at %s), waiting for %d known clients (%d recovered, %d in progress, and %d evicted) %s %lld:%.02lld\n", + target->obd_name, cluuid.uuid, + libcfs_nid2str(req->rq_peer.nid), known, + connected - in_progress, in_progress, + stale, msg, timeout / 60, timeout % 60); + rc = -EBUSY; + } else { +dont_check_exports: + rc = obd_connect(req->rq_svc_thread->t_env, + &export, target, &cluuid, data, + client_nid); + if (mds_conn && OBD_FAIL_CHECK(OBD_FAIL_TGT_RCVG_FLAG)) + lustre_msg_add_op_flags(req->rq_repmsg, + MSG_CONNECT_RECOVERING); + if (rc == 0) { + conn.cookie = export->exp_handle.h_cookie; + rc = rev_import_init(export); + } + + if (mds_mds_conn) + new_mds_mds_conn = true; + } + } else { + rc = obd_reconnect(req->rq_svc_thread->t_env, + export, target, &cluuid, data, client_nid); + } + if (rc) + GOTO(out, rc); + + LASSERT(target->u.obt.obt_magic == OBT_MAGIC); + data->ocd_instance = target->u.obt.obt_instance; + + /* + * Return only the parts of obd_connect_data that we understand, so the + * client knows that we don't understand the rest. + */ + if (data) { + tmpsize = req_capsule_get_size(&req->rq_pill, &RMF_CONNECT_DATA, + RCL_SERVER); + tmpdata = req_capsule_server_get(&req->rq_pill, + &RMF_CONNECT_DATA); + /* + * Don't use struct assignment here, because the client reply + * buffer may be smaller/larger than the local struct + * obd_connect_data. + */ + memcpy(tmpdata, data, min(tmpsize, size)); + } + + /* + * If the client and the server are the same node, we will already + * have an export that really points to the client's DLM export, + * because we have a shared handles table. 
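The memcpy() above deliberately avoids struct assignment because the two sides may disagree on sizeof(struct obd_connect_data); copying min(reply buffer, local size) bytes keeps an older peer from reading past the fields it knows. A minimal sketch of why the clamp is safe; the two layouts are purely illustrative:

#include <stdio.h>
#include <string.h>

struct ocd_old { unsigned long long flags; };	/* older peer's view */
struct ocd_new {				/* local, extended view */
	unsigned long long flags;
	unsigned int extra;
};

int main(void)
{
	struct ocd_new server = { 0xff, 1234 };
	struct ocd_old client;
	size_t n = sizeof(client) < sizeof(server) ?
		   sizeof(client) : sizeof(server);

	memcpy(&client, &server, n);	/* never past the smaller struct */
	printf("flags %#llx, copied %zu of %zu bytes\n",
	       client.flags, n, sizeof(server));
	return 0;
}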
+ * + * XXX this will go away when shaver stops sending the "connect" handle + * in the real "remote handle" field of the request --phik 24 Apr 2003 + */ + ptlrpc_request_change_export(req, export); + + pcon = ptlrpc_connection_get(req->rq_peer, req->rq_self, &cluuid); + if (pcon == NULL) + GOTO(out, rc = -ENOTCONN); + + spin_lock(&export->exp_lock); + + if (export->exp_disconnected) { + spin_unlock(&export->exp_lock); + GOTO(out, rc = -ENODEV); + } + if (export->exp_conn_cnt >= lustre_msg_get_conn_cnt(req->rq_reqmsg)) { + spin_unlock(&export->exp_lock); + CDEBUG(D_RPCTRACE, + "%s: %s already connected at greater or equal conn_cnt: %d >= %d\n", + cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), + export->exp_conn_cnt, + lustre_msg_get_conn_cnt(req->rq_reqmsg)); + + GOTO(out, rc = -EALREADY); + } + LASSERT(lustre_msg_get_conn_cnt(req->rq_reqmsg) > 0); + export->exp_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg); + + /* Check to see if connection came from another NID. */ + if (export->exp_connection != NULL && + lnet_nid_to_nid4(&export->exp_connection->c_peer.nid) != + req->rq_peer.nid) { + obd_nid_del(export->exp_obd, export); + ptlrpc_connection_put(export->exp_connection); + export->exp_connection = NULL; + } + + if (export->exp_connection == NULL) { + export->exp_connection = pcon; + pcon = NULL; + } + obd_nid_add(export->exp_obd, export); + + spin_unlock(&export->exp_lock); + + lustre_msg_set_handle(req->rq_repmsg, &conn); + + rc = rev_import_reconnect(export, req); + if (rc != 0) + GOTO(out, rc); + + if (target->obd_recovering && !export->exp_in_recovery && !lw_client) { + int has_transno; + __u64 transno = data->ocd_transno; + + spin_lock(&export->exp_lock); + /* + * possible race with class_disconnect_stale_exports, + * export may be already in the eviction process + */ + if (export->exp_failed) { + spin_unlock(&export->exp_lock); + GOTO(out, rc = -ENODEV); + } + export->exp_in_recovery = 1; + export->exp_req_replay_needed = 1; + export->exp_lock_replay_needed = 1; + spin_unlock(&export->exp_lock); + + has_transno = !!(lustre_msg_get_op_flags(req->rq_reqmsg) & + MSG_CONNECT_TRANSNO); + if (has_transno && transno == 0) + CWARN("Connect with zero transno!\n"); + + if (has_transno && transno > 0 && + transno < target->obd_next_recovery_transno && + transno > target->obd_last_committed) { + /* Another way is to use cmpxchg() to be lock-free. */ + spin_lock(&target->obd_recovery_task_lock); + if (transno < target->obd_next_recovery_transno) + target->obd_next_recovery_transno = transno; + spin_unlock(&target->obd_recovery_task_lock); + } + + atomic_inc(&target->obd_req_replay_clients); + atomic_inc(&target->obd_lock_replay_clients); + /* + * Note: MDS-MDS connection is allowed to be connected during + * recovery, no matter if the exports needs to be recoveried. + * Because we need retrieve updates logs from all other MDTs. + * So if the MDS-MDS export is new, obd_max_recoverable_clients + * also needs to be increased to match other recovery checking + * condition. + */ + if (new_mds_mds_conn) + atomic_inc(&target->obd_max_recoverable_clients); + + if (atomic_inc_return(&target->obd_connected_clients) == + atomic_read(&target->obd_max_recoverable_clients)) + wake_up(&target->obd_next_transno_waitq); + } + + /* Tell the client we're in recovery, when client is involved in it. 
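The window update above rewinds the replay cursor to a reconnecting client's reported transno, but only if it is still uncommitted and older than the current cursor. A standalone sketch of that rule (the real code rechecks under obd_recovery_task_lock, as its comment notes):

#include <stdint.h>
#include <stdio.h>

static void note_client_transno(uint64_t *next_recovery,
				uint64_t last_committed, uint64_t transno)
{
	if (transno > 0 && transno > last_committed &&
	    transno < *next_recovery)
		*next_recovery = transno;	/* rewind the replay cursor */
}

int main(void)
{
	uint64_t next = 100;

	note_client_transno(&next, 50, 80);	/* rewinds to 80 */
	note_client_transno(&next, 50, 40);	/* ignored: committed */
	printf("next replay transno: %llu\n", (unsigned long long)next);
	return 0;
}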
*/ + if (target->obd_recovering && !lw_client) + lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECOVERING); + +out: + if (export) { + spin_lock(&export->exp_lock); + export->exp_connecting = 0; + spin_unlock(&export->exp_lock); + + class_export_put(export); + } + if (target != NULL) { + spin_lock(&target->obd_dev_lock); + target->obd_conn_inprogress--; + spin_unlock(&target->obd_dev_lock); + class_decref(target, "find", current); + } + if (pcon) + ptlrpc_connection_put(pcon); + req->rq_status = rc; + RETURN(rc); +} + +int target_handle_disconnect(struct ptlrpc_request *req) +{ + int rc; + + ENTRY; + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(rc); + + /* In case of target disconnect, updating sec ctx immediately is + * required in order to record latest sequence number used. + * Sequence is normally updated on export destroy, but this event + * can occur too late, ie after a new target connect request has + * been processed. + * Maintaining correct sequence when client connection becomes idle + * ensures that GSS does not erroneously consider requests as replays. + */ + rc = sptlrpc_export_update_ctx(req->rq_export); + if (rc) + RETURN(rc); + + /* Keep the rq_export around so we can send the reply. */ + req->rq_status = obd_disconnect(class_export_get(req->rq_export)); + + RETURN(0); +} + +void target_destroy_export(struct obd_export *exp) +{ + struct obd_import *imp = NULL; + /* + * exports created from last_rcvd data, and "fake" + * exports created by lctl don't have an import + */ + spin_lock(&exp->exp_lock); + if (exp->exp_imp_reverse != NULL) { + imp = exp->exp_imp_reverse; + exp->exp_imp_reverse = NULL; + } + spin_unlock(&exp->exp_lock); + if (imp != NULL) + client_destroy_import(imp); + + LASSERT_ATOMIC_ZERO(&exp->exp_locks_count); + LASSERT_ATOMIC_ZERO(&exp->exp_rpc_count); + LASSERT_ATOMIC_ZERO(&exp->exp_cb_count); + LASSERT_ATOMIC_ZERO(&exp->exp_replay_count); +} +EXPORT_SYMBOL(target_destroy_export); + +/* + * Recovery functions + */ +static void target_request_copy_get(struct ptlrpc_request *req) +{ + class_export_rpc_inc(req->rq_export); + LASSERT(list_empty(&req->rq_list)); + INIT_LIST_HEAD(&req->rq_replay_list); + + /* Increase refcount to keep request in queue. */ + atomic_inc(&req->rq_refcount); + /* Let export know it has replays to be handled. */ + atomic_inc(&req->rq_export->exp_replay_count); +} + +static void target_request_copy_put(struct ptlrpc_request *req) +{ + LASSERT(list_empty(&req->rq_replay_list)); + LASSERT_ATOMIC_POS(&req->rq_export->exp_replay_count); + + atomic_dec(&req->rq_export->exp_replay_count); + class_export_rpc_dec(req->rq_export); + ptlrpc_server_drop_request(req); +} + +static int target_exp_enqueue_req_replay(struct ptlrpc_request *req) +{ + __u64 transno = lustre_msg_get_transno(req->rq_reqmsg); + struct obd_export *exp = req->rq_export; + struct ptlrpc_request *reqiter; + struct ptlrpc_request *dup_req = NULL; + int dup = 0; + + LASSERT(exp); + + spin_lock(&exp->exp_lock); + list_for_each_entry(reqiter, &exp->exp_req_replay_queue, + rq_replay_list) { + if (lustre_msg_get_transno(reqiter->rq_reqmsg) == transno) { + dup_req = reqiter; + dup = 1; + break; + } + } + + if (dup) { + /* We expect it with RESENT and REPLAY flags. 
*/ + if ((lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_RESENT | MSG_REPLAY)) != (MSG_RESENT | MSG_REPLAY)) + CERROR("invalid flags %x of resent replay\n", + lustre_msg_get_flags(req->rq_reqmsg)); + + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) { + __u32 new_conn; + + new_conn = lustre_msg_get_conn_cnt(req->rq_reqmsg); + if (new_conn > + lustre_msg_get_conn_cnt(dup_req->rq_reqmsg)) + lustre_msg_set_conn_cnt(dup_req->rq_reqmsg, + new_conn); + } + } else { + list_add_tail(&req->rq_replay_list, + &exp->exp_req_replay_queue); + } + + spin_unlock(&exp->exp_lock); + return dup; +} + +static void target_exp_dequeue_req_replay(struct ptlrpc_request *req) +{ + LASSERT(!list_empty(&req->rq_replay_list)); + LASSERT(req->rq_export); + + spin_lock(&req->rq_export->exp_lock); + list_del_init(&req->rq_replay_list); + spin_unlock(&req->rq_export->exp_lock); +} + +static void target_finish_recovery(struct lu_target *lut) +{ + struct obd_device *obd = lut->lut_obd; + + ENTRY; + + /* Only log a recovery message when recovery has occurred. */ + if (obd->obd_recovery_start) { + time64_t now = ktime_get_seconds(); + time64_t elapsed_time; + + elapsed_time = max_t(time64_t, now - obd->obd_recovery_start, + 1); + LCONSOLE_INFO("%s: Recovery over after %lld:%.02lld, of %d clients %d recovered and %d %s evicted.\n", + obd->obd_name, elapsed_time / 60, + elapsed_time % 60, + atomic_read(&obd->obd_max_recoverable_clients), + atomic_read(&obd->obd_connected_clients), + obd->obd_stale_clients, + obd->obd_stale_clients == 1 ? "was" : "were"); + if (obd->obd_stale_clients && do_dump_on_eviction(obd)) + libcfs_debug_dumplog(); + } + + ldlm_reprocess_recovery_done(obd->obd_namespace); + spin_lock(&obd->obd_recovery_task_lock); + if (!list_empty(&obd->obd_req_replay_queue) || + !list_empty(&obd->obd_lock_replay_queue) || + !list_empty(&obd->obd_final_req_queue)) { + CERROR("%s: Recovery queues ( %s%s%s) are not empty\n", + obd->obd_name, + list_empty(&obd->obd_req_replay_queue) ? "" : "req ", + list_empty(&obd->obd_lock_replay_queue) ? \ + "" : "lock ", + list_empty(&obd->obd_final_req_queue) ? \ + "" : "final "); + spin_unlock(&obd->obd_recovery_task_lock); + LBUG(); + } + spin_unlock(&obd->obd_recovery_task_lock); + + obd->obd_recovery_end = ktime_get_seconds(); + + /* When recovery finished, cleanup orphans on MDS and OST. 
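The duplicate check in target_exp_enqueue_req_replay() above keys replays on their transno and, for a resent duplicate, only refreshes the stored connection count rather than queueing the request twice. A userspace sketch of that enqueue, with a fixed-size array in place of the per-export list:

#include <stdint.h>
#include <stdio.h>

#define QMAX 16

struct replay_q {
	uint64_t transno[QMAX];
	uint32_t conn_cnt[QMAX];
	int n;
};

/* Returns 1 if the transno was already queued (a duplicate). */
static int enqueue_replay(struct replay_q *q, uint64_t transno,
			  uint32_t conn_cnt)
{
	int i;

	for (i = 0; i < q->n; i++) {
		if (q->transno[i] == transno) {
			if (conn_cnt > q->conn_cnt[i])
				q->conn_cnt[i] = conn_cnt;
			return 1;
		}
	}
	if (q->n < QMAX) {
		q->transno[q->n] = transno;
		q->conn_cnt[q->n] = conn_cnt;
		q->n++;
	}
	return 0;
}

int main(void)
{
	struct replay_q q = { .n = 0 };

	enqueue_replay(&q, 42, 1);
	printf("dup=%d cnt=%u\n", enqueue_replay(&q, 42, 2),
	       q.conn_cnt[0]);	/* dup=1 cnt=2 */
	return 0;
}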
*/ + if (obd->obd_type && OBP(obd, postrecov)) { + int rc = OBP(obd, postrecov)(obd); + + if (rc < 0) + LCONSOLE_WARN("%s: Post recovery failed, rc %d\n", + obd->obd_name, rc); + } + EXIT; +} + +static void abort_req_replay_queue(struct obd_device *obd) +{ + struct ptlrpc_request *req, *n; + LIST_HEAD(abort_list); + + spin_lock(&obd->obd_recovery_task_lock); + list_splice_init(&obd->obd_req_replay_queue, &abort_list); + spin_unlock(&obd->obd_recovery_task_lock); + list_for_each_entry_safe(req, n, &abort_list, rq_list) { + DEBUG_REQ(D_WARNING, req, "aborted:"); + req->rq_status = -ENOTCONN; + if (ptlrpc_error(req)) { + DEBUG_REQ(D_ERROR, req, + "failed abort_req_reply; skipping"); + } + target_exp_dequeue_req_replay(req); + target_request_copy_put(req); + } +} + +static void abort_lock_replay_queue(struct obd_device *obd) +{ + struct ptlrpc_request *req, *n; + LIST_HEAD(abort_list); + + spin_lock(&obd->obd_recovery_task_lock); + list_splice_init(&obd->obd_lock_replay_queue, &abort_list); + spin_unlock(&obd->obd_recovery_task_lock); + list_for_each_entry_safe(req, n, &abort_list, rq_list) { + DEBUG_REQ(D_ERROR, req, "aborted:"); + req->rq_status = -ENOTCONN; + if (ptlrpc_error(req)) { + DEBUG_REQ(D_ERROR, req, + "failed abort_lock_reply; skipping"); + } + target_request_copy_put(req); + } +} + +/* + * Called from a cleanup function if the device is being cleaned up + * forcefully. The exports should all have been disconnected already, + * the only thing left to do is + * - clear the recovery flags + * - cancel the timer + * - free queued requests and replies, but don't send replies + * Because the obd_stopping flag is set, no new requests should be received. + */ +void target_cleanup_recovery(struct obd_device *obd) +{ + struct ptlrpc_request *req, *n; + LIST_HEAD(clean_list); + + spin_lock(&obd->obd_dev_lock); + if (!obd->obd_recovering) { + spin_unlock(&obd->obd_dev_lock); + EXIT; + return; + } + obd->obd_recovering = obd->obd_abort_recovery = 0; + obd->obd_abort_recov_mdt = 0; + spin_unlock(&obd->obd_dev_lock); + + spin_lock(&obd->obd_recovery_task_lock); + target_cancel_recovery_timer(obd); + list_splice_init(&obd->obd_req_replay_queue, &clean_list); + spin_unlock(&obd->obd_recovery_task_lock); + + list_for_each_entry_safe(req, n, &clean_list, rq_list) { + LASSERT(req->rq_reply_state == NULL); + target_exp_dequeue_req_replay(req); + target_request_copy_put(req); + } + + spin_lock(&obd->obd_recovery_task_lock); + list_splice_init(&obd->obd_lock_replay_queue, &clean_list); + list_splice_init(&obd->obd_final_req_queue, &clean_list); + spin_unlock(&obd->obd_recovery_task_lock); + + list_for_each_entry_safe(req, n, &clean_list, rq_list) { + LASSERT(req->rq_reply_state == NULL); + target_request_copy_put(req); + } + + EXIT; +} +EXPORT_SYMBOL(target_cleanup_recovery); + +/* obd_recovery_task_lock should be held */ +void target_cancel_recovery_timer(struct obd_device *obd) +{ + CDEBUG(D_HA, "%s: cancel recovery timer\n", obd->obd_name); + hrtimer_cancel(&obd->obd_recovery_timer); +} + +static void target_start_recovery_timer(struct obd_device *obd) +{ + ktime_t delay; + + if (obd->obd_recovery_start != 0) + return; + + spin_lock(&obd->obd_dev_lock); + if (!obd->obd_recovering || obd->obd_abort_recovery) { + spin_unlock(&obd->obd_dev_lock); + return; + } + + LASSERT(obd->obd_recovery_timeout != 0); + + if (obd->obd_recovery_start != 0) { + spin_unlock(&obd->obd_dev_lock); + return; + } + + obd->obd_recovery_start = ktime_get_seconds(); + delay = ktime_set(obd->obd_recovery_start + + 
obd->obd_recovery_timeout, 0); + hrtimer_start(&obd->obd_recovery_timer, delay, HRTIMER_MODE_ABS); + spin_unlock(&obd->obd_dev_lock); + + LCONSOLE_WARN("%s: Will be in recovery for at least %u:%02u, or until %d client%s reconnect%s\n", + obd->obd_name, + obd->obd_recovery_timeout / 60, + obd->obd_recovery_timeout % 60, + atomic_read(&obd->obd_max_recoverable_clients), + (atomic_read(&obd->obd_max_recoverable_clients) == 1) ? + "" : "s", + (atomic_read(&obd->obd_max_recoverable_clients) == 1) ? + "s" : ""); +} + +/** + * extend recovery window. + * + * if @extend is true, extend recovery window to have @dr_timeout remaining + * at least; otherwise, make sure the recovery timeout value is not less + * than @dr_timeout. + */ +static void extend_recovery_timer(struct obd_device *obd, timeout_t dr_timeout, + bool extend) +{ + ktime_t left_ns; + timeout_t timeout; + timeout_t left; + + spin_lock(&obd->obd_dev_lock); + if (!obd->obd_recovering || obd->obd_abort_recovery || + obd->obd_stopping) { + spin_unlock(&obd->obd_dev_lock); + return; + } + LASSERT(obd->obd_recovery_start != 0); + + left_ns = hrtimer_get_remaining(&obd->obd_recovery_timer); + left = ktime_divns(left_ns, NSEC_PER_SEC); + + if (extend) { + timeout = obd->obd_recovery_timeout; + /* dr_timeout will happen after the hrtimer has expired. + * Add the excess time to the soft recovery timeout without + * exceeding the hard recovery timeout. + */ + if (dr_timeout > left) { + timeout += dr_timeout - left; + timeout = min_t(timeout_t, obd->obd_recovery_time_hard, + timeout); + } + } else { + timeout = clamp_t(timeout_t, dr_timeout, + obd->obd_recovery_timeout, + obd->obd_recovery_time_hard); + } + + if (timeout == obd->obd_recovery_time_hard) + CWARN("%s: extended recovery timer reached hard limit: %d, extend: %d\n", + obd->obd_name, timeout, extend); + + if (obd->obd_recovery_timeout < timeout) { + ktime_t end, now; + + obd->obd_recovery_timeout = timeout; + end = ktime_set(obd->obd_recovery_start + timeout, 0); + now = ktime_set(ktime_get_seconds(), 0); + left_ns = ktime_sub(end, now); + hrtimer_start(&obd->obd_recovery_timer, end, HRTIMER_MODE_ABS); + left = ktime_divns(left_ns, NSEC_PER_SEC); + } + spin_unlock(&obd->obd_dev_lock); + + CDEBUG(D_HA, "%s: recovery timer will expire in %d seconds\n", + obd->obd_name, left); +} + +/* Reset the timer with each new client connection */ +/* + * This timer is actually reconnect_timer, which is for making sure + * the total recovery window is at least as big as my reconnect + * attempt timing. So the initial recovery time_out will be set to + * OBD_RECOVERY_FACTOR * obd_timeout. If the timeout coming + * from client is bigger than this, then the recovery time_out will + * be extended to make sure the client could be reconnected, in the + * process, the timeout from the new client should be ignored. + */ +static void +check_and_start_recovery_timer(struct obd_device *obd, + struct ptlrpc_request *req, + int new_client) +{ + timeout_t service_timeout = lustre_msg_get_service_timeout(req->rq_reqmsg); + struct obd_device_target *obt = &obd->u.obt; + + if (!new_client && service_timeout) + /* + * Teach server about old server's estimates, as first guess + * at how long new requests will take. + */ + at_measured(&req->rq_rqbd->rqbd_svcpt->scp_at_estimate, + service_timeout); + + target_start_recovery_timer(obd); + + /* + * Convert the service time to RPC timeout, + * and reuse service_timeout to limit stack usage. 
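extend_recovery_timer() above has two modes: with extend set it adds the shortfall beyond the time already remaining, otherwise it clamps the requested value into [current soft timeout, hard limit]. A standalone sketch of both computations with sample numbers:

#include <stdio.h>

static int new_timeout(int cur, int left, int hard, int want, int extend)
{
	int t;

	if (extend) {
		t = cur;
		if (want > left) {
			t += want - left;	/* add the shortfall */
			if (t > hard)
				t = hard;
		}
	} else {
		t = want;			/* clamp into [cur, hard] */
		if (t < cur)
			t = cur;
		if (t > hard)
			t = hard;
	}
	return t;
}

int main(void)
{
	/* 300 s soft timeout, 60 s left, 900 s hard cap, client needs 200 s */
	printf("extend: %d\n", new_timeout(300, 60, 900, 200, 1));	/* 440 */
	printf("clamp:  %d\n", new_timeout(300, 60, 900, 200, 0));	/* 300 */
	return 0;
}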
+ */ + service_timeout = at_est2timeout(service_timeout); + + if (OBD_FAIL_CHECK(OBD_FAIL_TGT_SLUGGISH_NET) && + service_timeout < at_extra) + service_timeout = at_extra; + + /* + * We expect other clients to timeout within service_timeout, then try + * to reconnect, then try the failover server. The max delay between + * connect attempts is SWITCH_MAX + SWITCH_INC + INITIAL. + */ + service_timeout += 2 * INITIAL_CONNECT_TIMEOUT; + + LASSERT(obt->obt_magic == OBT_MAGIC); + service_timeout += 2 * (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC); + if (service_timeout > obd->obd_recovery_timeout && !new_client) + extend_recovery_timer(obd, service_timeout, false); +} + +/** Health checking routines */ +static inline int exp_connect_healthy(struct obd_export *exp) +{ + return exp->exp_in_recovery; +} + +/** if export done req_replay or has replay in queue */ +static inline int exp_req_replay_healthy(struct obd_export *exp) +{ + return (!exp->exp_req_replay_needed || + atomic_read(&exp->exp_replay_count) > 0); +} + + +static inline int exp_req_replay_healthy_or_from_mdt(struct obd_export *exp) +{ + return (exp_connect_flags(exp) & OBD_CONNECT_MDS_MDS) || + exp_req_replay_healthy(exp); +} + +/** if export done lock_replay or has replay in queue */ +static inline int exp_lock_replay_healthy(struct obd_export *exp) +{ + return (!exp->exp_lock_replay_needed || + atomic_read(&exp->exp_replay_count) > 0); +} + +static inline int exp_vbr_healthy(struct obd_export *exp) +{ + return !exp->exp_vbr_failed; +} + +static inline int exp_finished(struct obd_export *exp) +{ + return exp->exp_in_recovery && !exp->exp_lock_replay_needed; +} + +static inline int exp_finished_or_from_mdt(struct obd_export *exp) +{ + return (exp_connect_flags(exp) & OBD_CONNECT_MDS_MDS) || + exp_finished(exp); +} + +static int check_for_next_transno(struct lu_target *lut) +{ + struct ptlrpc_request *req = NULL; + struct obd_device *obd = lut->lut_obd; + struct target_distribute_txn_data *tdtd = lut->lut_tdtd; + int wake_up = 0, connected, completed, queue_len; + __u64 req_transno = 0; + __u64 update_transno = 0; + __u64 next_transno = 0; + + ENTRY; + + spin_lock(&obd->obd_recovery_task_lock); + if (!list_empty(&obd->obd_req_replay_queue)) { + req = list_entry(obd->obd_req_replay_queue.next, + struct ptlrpc_request, rq_list); + req_transno = lustre_msg_get_transno(req->rq_reqmsg); + } + + if (!obd->obd_abort_recov_mdt && tdtd) + update_transno = distribute_txn_get_next_transno(tdtd); + + connected = atomic_read(&obd->obd_connected_clients); + completed = connected - atomic_read(&obd->obd_req_replay_clients); + queue_len = obd->obd_requests_queued_for_recovery; + next_transno = obd->obd_next_recovery_transno; + + CDEBUG(D_HA, + "max: %d, connected: %d, completed: %d, queue_len: %d, req_transno: %llu, next_transno: %llu\n", + atomic_read(&obd->obd_max_recoverable_clients), + connected, completed, + queue_len, req_transno, next_transno); + + if (obd->obd_abort_recovery) { + CDEBUG(D_HA, "waking for aborted recovery\n"); + wake_up = 1; + } else if (obd->obd_recovery_expired) { + CDEBUG(D_HA, "waking for expired recovery\n"); + wake_up = 1; + } else if (!obd->obd_abort_recov_mdt && tdtd && req && + is_req_replayed_by_update(req)) { + LASSERTF(req_transno < next_transno, + "req_transno %llu next_transno%llu\n", req_transno, + next_transno); + CDEBUG(D_HA, "waking for duplicate req (%llu)\n", + req_transno); + wake_up = 1; + } else if (req_transno == next_transno || + (update_transno != 0 && update_transno <= next_transno)) { + 
CDEBUG(D_HA, "waking for next (%lld)\n", next_transno); + wake_up = 1; + } else if (queue_len > 0 && + queue_len == atomic_read(&obd->obd_req_replay_clients)) { + /** handle gaps occured due to lost reply or VBR */ + LASSERTF(req_transno >= next_transno, + "req_transno: %llu, next_transno: %llu\n", + req_transno, next_transno); + CDEBUG(D_HA, + "%s: waking for gap in transno, VBR is %s (skip: %lld, ql: %d, comp: %d, conn: %d, next: %lld, next_update %lld last_committed: %lld)\n", + obd->obd_name, obd->obd_version_recov ? "ON" : "OFF", + next_transno, queue_len, completed, connected, + req_transno, update_transno, obd->obd_last_committed); + obd->obd_next_recovery_transno = req_transno; + wake_up = 1; + } else if (atomic_read(&obd->obd_req_replay_clients) == 0) { + CDEBUG(D_HA, "waking for completed recovery\n"); + wake_up = 1; + } else if (OBD_FAIL_CHECK(OBD_FAIL_MDS_RECOVERY_ACCEPTS_GAPS)) { + CDEBUG(D_HA, + "accepting transno gaps is explicitly allowed by fail_lock, waking up (%lld)\n", + next_transno); + obd->obd_next_recovery_transno = req_transno; + wake_up = 1; + } + spin_unlock(&obd->obd_recovery_task_lock); + return wake_up; +} + +static int check_for_next_lock(struct lu_target *lut) +{ + struct obd_device *obd = lut->lut_obd; + int wake_up = 0; + + spin_lock(&obd->obd_recovery_task_lock); + if (!list_empty(&obd->obd_lock_replay_queue)) { + CDEBUG(D_HA, "waking for next lock\n"); + wake_up = 1; + } else if (atomic_read(&obd->obd_lock_replay_clients) == 0) { + CDEBUG(D_HA, "waking for completed lock replay\n"); + wake_up = 1; + } else if (obd->obd_abort_recovery) { + CDEBUG(D_HA, "waking for aborted recovery\n"); + wake_up = 1; + } else if (obd->obd_recovery_expired) { + CDEBUG(D_HA, "waking for expired recovery\n"); + wake_up = 1; + } + spin_unlock(&obd->obd_recovery_task_lock); + + return wake_up; +} + +static int check_update_llog(struct lu_target *lut) +{ + struct obd_device *obd = lut->lut_obd; + struct target_distribute_txn_data *tdtd = lut->lut_tdtd; + + if (obd->obd_abort_recovery) { + CDEBUG(D_HA, "waking for aborted recovery\n"); + return 1; + } + + if (atomic_read(&tdtd->tdtd_recovery_threads_count) == 0) { + CDEBUG(D_HA, "waking for completion of reading update log\n"); + return 1; + } + + return 0; +} + +/** + * wait for recovery events, + * check its status with help of check_routine + * evict dead clients via health_check + */ +static int target_recovery_overseer(struct lu_target *lut, + int (*check_routine)(struct lu_target *), + int (*health_check)(struct obd_export *)) +{ + struct obd_device *obd = lut->lut_obd; + struct target_distribute_txn_data *tdtd; + time64_t last = 0; + time64_t now; +repeat: + if (obd->obd_recovering && obd->obd_recovery_start == 0) { + now = ktime_get_seconds(); + if (now - last > 600) { + LCONSOLE_INFO("%s: in recovery but waiting for the first client to connect\n", + obd->obd_name); + last = now; + } + } + if (obd->obd_recovery_start != 0 && ktime_get_seconds() >= + (obd->obd_recovery_start + obd->obd_recovery_time_hard)) { + __u64 next_update_transno = 0; + + /* + * Only abort the recovery if there are no update recovery + * left in the queue + */ + spin_lock(&obd->obd_recovery_task_lock); + if (!obd->obd_abort_recov_mdt && lut->lut_tdtd) { + next_update_transno = + distribute_txn_get_next_transno(lut->lut_tdtd); + + tdtd = lut->lut_tdtd; + /* + * If next_update_transno == 0, it probably because + * updatelog retrieve threads did not get any records + * yet, let's wait those threads stopped + */ + if (next_update_transno == 0) { + 
spin_unlock(&obd->obd_recovery_task_lock); + + while (wait_event_timeout( + tdtd->tdtd_recovery_threads_waitq, + check_update_llog(lut), + cfs_time_seconds(60)) == 0); + + spin_lock(&obd->obd_recovery_task_lock); + next_update_transno = + distribute_txn_get_next_transno(tdtd); + } + } + + if (next_update_transno != 0 && !obd->obd_abort_recovery) { + obd->obd_next_recovery_transno = next_update_transno; + spin_unlock(&obd->obd_recovery_task_lock); + /* + * Disconnect unfinished exports from clients, and + * keep connection from MDT to make sure the update + * recovery will still keep trying until some one + * manually abort the recovery + */ + class_disconnect_stale_exports(obd, + exp_finished_or_from_mdt); + /* Abort all of replay & replay lock req from clients */ + abort_req_replay_queue(obd); + abort_lock_replay_queue(obd); + CDEBUG(D_HA, + "%s: there are still update replay (%#llx)in the queue.\n", + obd->obd_name, next_update_transno); + } else { + obd->obd_abort_recovery = 1; + spin_unlock(&obd->obd_recovery_task_lock); + CWARN("%s recovery is aborted by hard timeout\n", + obd->obd_name); + } + } + + while (wait_event_timeout(obd->obd_next_transno_waitq, + check_routine(lut), + cfs_time_seconds(60)) == 0) + ; /* wait indefinitely for event, but don't trigger watchdog */ + + if (obd->obd_abort_recovery) { + CWARN("recovery is aborted, evict exports in recovery\n"); + if (lut->lut_tdtd != NULL) { + tdtd = lut->lut_tdtd; + /* + * Let's wait all of the update log recovery thread + * finished + */ + wait_event_idle( + tdtd->tdtd_recovery_threads_waitq, + atomic_read(&tdtd->tdtd_recovery_threads_count) + == 0); + /* Then abort the update recovery list */ + dtrq_list_destroy(lut->lut_tdtd); + } + + /** evict exports which didn't finish recovery yet */ + class_disconnect_stale_exports(obd, exp_finished); + return 1; + } else if (obd->obd_recovery_expired) { + obd->obd_recovery_expired = 0; + + /** If some clients died being recovered, evict them */ + LCONSOLE_WARN("%s: recovery is timed out, evict stale exports\n", + obd->obd_name); + /** evict cexports with no replay in queue, they are stalled */ + class_disconnect_stale_exports(obd, health_check); + + /** continue with VBR */ + spin_lock(&obd->obd_dev_lock); + obd->obd_version_recov = 1; + spin_unlock(&obd->obd_dev_lock); + /** + * reset timer, recovery will proceed with versions now, + * timeout is set just to handle reconnection delays + */ + extend_recovery_timer(obd, RECONNECT_DELAY_MAX, true); + /** + * Wait for recovery events again, after evicting bad clients + */ + goto repeat; + } + return 0; +} + +static struct ptlrpc_request *target_next_replay_lock(struct lu_target *lut) +{ + struct obd_device *obd = lut->lut_obd; + struct ptlrpc_request *req = NULL; + + CDEBUG(D_HA, "Waiting for lock\n"); + if (target_recovery_overseer(lut, check_for_next_lock, + exp_lock_replay_healthy)) + abort_lock_replay_queue(obd); + + spin_lock(&obd->obd_recovery_task_lock); + if (!list_empty(&obd->obd_lock_replay_queue)) { + req = list_entry(obd->obd_lock_replay_queue.next, + struct ptlrpc_request, rq_list); + list_del_init(&req->rq_list); + spin_unlock(&obd->obd_recovery_task_lock); + } else { + spin_unlock(&obd->obd_recovery_task_lock); + LASSERT(list_empty(&obd->obd_lock_replay_queue)); + LASSERT(atomic_read(&obd->obd_lock_replay_clients) == 0); + /** evict exports failed VBR */ + class_disconnect_stale_exports(obd, exp_vbr_healthy); + } + return req; +} + +static struct ptlrpc_request *target_next_final_ping(struct obd_device *obd) +{ + struct 
ptlrpc_request *req = NULL;
+
+	spin_lock(&obd->obd_recovery_task_lock);
+	if (!list_empty(&obd->obd_final_req_queue)) {
+		req = list_entry(obd->obd_final_req_queue.next,
+				 struct ptlrpc_request, rq_list);
+		list_del_init(&req->rq_list);
+		spin_unlock(&obd->obd_recovery_task_lock);
+		if (req->rq_export->exp_in_recovery) {
+			spin_lock(&req->rq_export->exp_lock);
+			req->rq_export->exp_in_recovery = 0;
+			spin_unlock(&req->rq_export->exp_lock);
+		}
+	} else {
+		spin_unlock(&obd->obd_recovery_task_lock);
+	}
+	return req;
+}
+
+static void handle_recovery_req(struct ptlrpc_thread *thread,
+				struct ptlrpc_request *req,
+				svc_handler_t handler)
+{
+	ENTRY;
+
+	/**
+	 * export can be evicted during recovery, no need to handle replays
+	 * for it after that, discard such a request silently
+	 */
+	if (req->rq_export->exp_disconnected)
+		RETURN_EXIT;
+
+	req->rq_session.lc_thread = thread;
+	req->rq_svc_thread = thread;
+	req->rq_svc_thread->t_env->le_ses = &req->rq_session;
+
+	/* thread context */
+	lu_context_enter(&thread->t_env->le_ctx);
+	(void)handler(req);
+	lu_context_exit(&thread->t_env->le_ctx);
+
+	req->rq_svc_thread->t_env->le_ses = NULL;
+
+	/* don't reset timer for final stage */
+	if (!exp_finished(req->rq_export)) {
+		timeout_t timeout = obd_timeout;
+
+		/**
+		 * Add request @timeout to the recovery time, so the next
+		 * request from this client may still arrive in recovery time.
+		 */
+		if (!AT_OFF) {
+			struct ptlrpc_service_part *svcpt;
+			timeout_t est_timeout;
+
+			svcpt = req->rq_rqbd->rqbd_svcpt;
+			/*
+			 * If the server sent an early reply for this request,
+			 * the client will recalculate the timeout according to
+			 * the current server estimate of service time, so use
+			 * the maximum timeout here while waiting for the
+			 * client to send the next request.
+			 */
+			est_timeout = at_get(&svcpt->scp_at_estimate);
+			timeout = max_t(timeout_t, at_est2timeout(est_timeout),
+					lustre_msg_get_timeout(req->rq_reqmsg));
+			/*
+			 * Add 2 net_latency: one to balance rq_deadline
+			 * (see ptl_send_rpc), one for resending the request
+			 * to the server. Note: the client packs net_latency
+			 * in the replay request (see ptlrpc_replay_req).
+			 */
+			timeout += 2 * lustre_msg_get_service_timeout(req->rq_reqmsg);
+		}
+		extend_recovery_timer(class_exp2obd(req->rq_export), timeout,
+				      true);
+	}
+	EXIT;
+}
+
+/** Checking routines for recovery */
+static int check_for_recovery_ready(struct lu_target *lut)
+{
+	struct obd_device *obd = lut->lut_obd;
+	unsigned int clnts = atomic_read(&obd->obd_connected_clients);
+
+	CDEBUG(D_HA,
+	       "connected %d stale %d max_recoverable_clients %d abort %d expired %d\n",
+	       clnts, obd->obd_stale_clients,
+	       atomic_read(&obd->obd_max_recoverable_clients),
+	       obd->obd_abort_recovery, obd->obd_recovery_expired);
+
+	if (!obd->obd_abort_recovery && !obd->obd_recovery_expired) {
+		LASSERT(clnts <=
+			atomic_read(&obd->obd_max_recoverable_clients));
+		if (clnts + obd->obd_stale_clients <
+		    atomic_read(&obd->obd_max_recoverable_clients))
+			return 0;
+	}
+
+	if (!obd->obd_abort_recov_mdt && lut->lut_tdtd != NULL) {
+		if (!lut->lut_tdtd->tdtd_replay_ready &&
+		    !obd->obd_abort_recovery && !obd->obd_stopping) {
+			/*
+			 * Let's extend the recovery timer, in case the
+			 * recovery timer expired and some clients got
+			 * evicted.
+			 */
+			extend_recovery_timer(obd, obd->obd_recovery_timeout,
+					      true);
+			CDEBUG(D_HA,
+			       "%s update recovery is not ready, extend recovery %d\n",
+			       obd->obd_name, obd->obd_recovery_timeout);
+			return 0;
+		}
+	}
+
+	return 1;
+}
+
+enum {
+	REQUEST_RECOVERY = 1,
+	UPDATE_RECOVERY = 2,
+};
+
+static __u64
get_next_replay_req_transno(struct obd_device *obd) +{ + __u64 transno = 0; + + if (!list_empty(&obd->obd_req_replay_queue)) { + struct ptlrpc_request *req; + + req = list_entry(obd->obd_req_replay_queue.next, + struct ptlrpc_request, rq_list); + transno = lustre_msg_get_transno(req->rq_reqmsg); + } + + return transno; +} + +static __u64 get_next_transno(struct lu_target *lut, int *type) +{ + struct obd_device *obd = lut->lut_obd; + struct target_distribute_txn_data *tdtd = lut->lut_tdtd; + __u64 transno = 0; + __u64 update_transno; + + ENTRY; + + transno = get_next_replay_req_transno(obd); + if (type != NULL) + *type = REQUEST_RECOVERY; + + if (!tdtd || obd->obd_abort_recov_mdt) + RETURN(transno); + + update_transno = distribute_txn_get_next_transno(tdtd); + if (transno == 0 || (transno >= update_transno && + update_transno != 0)) { + transno = update_transno; + if (type != NULL) + *type = UPDATE_RECOVERY; + } + + RETURN(transno); +} + +/** + * drop duplicate replay request + * + * Because the operation has been replayed by update recovery, the request + * with the same transno will be dropped, and the client will be notified to + * send its next replay request. + * + * \param[in] env execution environment + * \param[in] obd failover obd device + * \param[in] req request to be dropped + */ +static void drop_duplicate_replay_req(struct lu_env *env, + struct obd_device *obd, + struct ptlrpc_request *req) +{ + DEBUG_REQ(D_HA, req, + "remove t%lld from %s because duplicate update records found", + lustre_msg_get_transno(req->rq_reqmsg), + libcfs_nid2str(req->rq_peer.nid)); + + /* + * Right now, only update replay of an MDS reint operation and a + * normal request replay can have the same transno + */ + if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_REINT) { + req_capsule_set(&req->rq_pill, &RQF_MDS_REINT); + req->rq_status = req_capsule_server_pack(&req->rq_pill); + if (likely(req->rq_export)) + target_committed_to_req(req); + lustre_msg_set_transno(req->rq_repmsg, req->rq_transno); + target_send_reply(req, req->rq_status, 0); + } else { + DEBUG_REQ(D_ERROR, req, "wrong opc from %s", + libcfs_nid2str(req->rq_peer.nid)); + } + target_exp_dequeue_req_replay(req); + target_request_copy_put(req); + obd->obd_replayed_requests++; +} + +#define WATCHDOG_TIMEOUT (obd_timeout * 10) + +static void replay_request_or_update(struct lu_env *env, + struct lu_target *lut, + struct target_recovery_data *trd, + struct ptlrpc_thread *thread) +{ + struct obd_device *obd = lut->lut_obd; + struct ptlrpc_request *req = NULL; + int type; + __u64 transno; + + ENTRY; + + CDEBUG(D_HA, "Waiting for transno %lld\n", + obd->obd_next_recovery_transno); + + /* Replay all requests and updates by transno */ + do { + struct target_distribute_txn_data *tdtd = lut->lut_tdtd; + + CFS_FAIL_TIMEOUT(OBD_FAIL_TGT_REPLAY_DELAY2, cfs_fail_val); + + /** + * The recovery window needs to be extended above + * recovery_time_soft. Extending is possible only near the + * end of the recovery window (see more details in + * handle_recovery_req()). 
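+ * Each successfully handled replay request re-arms the timer via + * extend_recovery_timer(), so steady replay progress keeps the + * window open while an idle window is still allowed to expire. 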
+ */ + CFS_FAIL_TIMEOUT_MS(OBD_FAIL_TGT_REPLAY_DELAY, 300); + + if (target_recovery_overseer(lut, check_for_next_transno, + exp_req_replay_healthy_or_from_mdt)) { + abort_req_replay_queue(obd); + abort_lock_replay_queue(obd); + goto abort; + } + + spin_lock(&obd->obd_recovery_task_lock); + transno = get_next_transno(lut, &type); + if (type == REQUEST_RECOVERY && transno != 0) { + /* + * Drop replay request from client side, if the + * replay has been executed by update with the + * same transno + */ + req = list_entry(obd->obd_req_replay_queue.next, + struct ptlrpc_request, rq_list); + + list_del_init(&req->rq_list); + obd->obd_requests_queued_for_recovery--; + spin_unlock(&obd->obd_recovery_task_lock); + + /* + * Let's check if the request has been redone by + * update replay + */ + if (is_req_replayed_by_update(req)) { + struct distribute_txn_replay_req *dtrq; + + dtrq = distribute_txn_lookup_finish_list(tdtd, + transno); + LASSERT(dtrq != NULL); + spin_lock(&tdtd->tdtd_replay_list_lock); + list_del_init(&dtrq->dtrq_list); + spin_unlock(&tdtd->tdtd_replay_list_lock); + dtrq_destroy(dtrq); + + drop_duplicate_replay_req(env, obd, req); + + continue; + } + + LASSERT(trd->trd_processing_task == current->pid); + DEBUG_REQ(D_HA, req, "processing x%llu t%lld from %s", + req->rq_xid, + lustre_msg_get_transno(req->rq_reqmsg), + libcfs_nid2str(req->rq_peer.nid)); + + ptlrpc_watchdog_init(&thread->t_watchdog, + WATCHDOG_TIMEOUT); + handle_recovery_req(thread, req, + trd->trd_recovery_handler); + ptlrpc_watchdog_disable(&thread->t_watchdog); + + /** + * bz18031: increase next_recovery_transno before + * target_request_copy_put() will drop exp_rpc reference + */ + spin_lock(&obd->obd_recovery_task_lock); + obd->obd_next_recovery_transno++; + spin_unlock(&obd->obd_recovery_task_lock); + target_exp_dequeue_req_replay(req); + target_request_copy_put(req); + obd->obd_replayed_requests++; + } else if (type == UPDATE_RECOVERY && transno != 0) { + struct distribute_txn_replay_req *dtrq; + int rc; + + spin_unlock(&obd->obd_recovery_task_lock); + + LASSERT(tdtd != NULL); + dtrq = distribute_txn_get_next_req(tdtd); + lu_context_enter(&thread->t_env->le_ctx); + ptlrpc_watchdog_init(&thread->t_watchdog, + WATCHDOG_TIMEOUT); + rc = tdtd->tdtd_replay_handler(env, tdtd, dtrq); + ptlrpc_watchdog_disable(&thread->t_watchdog); + lu_context_exit(&thread->t_env->le_ctx); + extend_recovery_timer(obd, obd_timeout, true); + + if (rc == 0 && dtrq->dtrq_xid != 0) { + CDEBUG(D_HA, + "Move x%llu t%llu to finish list\n", + dtrq->dtrq_xid, + dtrq->dtrq_master_transno); + + /* Add it to the replay finish list */ + spin_lock(&tdtd->tdtd_replay_list_lock); + list_add(&dtrq->dtrq_list, + &tdtd->tdtd_replay_finish_list); + spin_unlock(&tdtd->tdtd_replay_list_lock); + + spin_lock(&obd->obd_recovery_task_lock); + if (transno == obd->obd_next_recovery_transno) + obd->obd_next_recovery_transno++; + else if (transno > + obd->obd_next_recovery_transno) + obd->obd_next_recovery_transno = + transno + 1; + spin_unlock(&obd->obd_recovery_task_lock); + } else { + dtrq_destroy(dtrq); + } + } else { + spin_unlock(&obd->obd_recovery_task_lock); +abort: + LASSERT(list_empty(&obd->obd_req_replay_queue)); + LASSERT(atomic_read(&obd->obd_req_replay_clients) == 0); + /** evict exports failed VBR */ + class_disconnect_stale_exports(obd, exp_vbr_healthy); + break; + } + } while (1); +} + +static int target_recovery_thread(void *arg) +{ + struct lu_target *lut = arg; + struct obd_device *obd = lut->lut_obd; + struct ptlrpc_request *req; + struct 
target_recovery_data *trd = &obd->obd_recovery_data; + unsigned long delta; + struct lu_env *env; + struct ptlrpc_thread *thread = NULL; + int rc = 0; + + ENTRY; + unshare_fs_struct(); + OBD_ALLOC_PTR(thread); + if (thread == NULL) + RETURN(-ENOMEM); + + OBD_ALLOC_PTR(env); + if (env == NULL) + GOTO(out_thread, rc = -ENOMEM); + rc = lu_env_add(env); + if (rc) + GOTO(out_env, rc); + + rc = lu_context_init(&env->le_ctx, LCT_MD_THREAD | LCT_DT_THREAD); + if (rc) + GOTO(out_env_remove, rc); + + thread->t_env = env; + thread->t_id = -1; /* force filter_iobuf_get/put to use local buffers */ + thread->t_task = current; + env->le_ctx.lc_thread = thread; + tgt_io_thread_init(thread); /* init thread_big_cache for IO requests */ + + CDEBUG(D_HA, "%s: started recovery thread pid %d\n", obd->obd_name, + current->pid); + trd->trd_processing_task = current->pid; + + spin_lock(&obd->obd_dev_lock); + obd->obd_recovering = 1; + spin_unlock(&obd->obd_dev_lock); + complete(&trd->trd_starting); + + /* first of all, we have to know the first transno to replay */ + if (target_recovery_overseer(lut, check_for_recovery_ready, + exp_connect_healthy)) { + abort_req_replay_queue(obd); + abort_lock_replay_queue(obd); + if (lut->lut_tdtd != NULL) + dtrq_list_destroy(lut->lut_tdtd); + } + + /* next stage: replay requests or update */ + delta = jiffies; + CDEBUG(D_INFO, "1: request replay stage - %d clients from t%llu\n", + atomic_read(&obd->obd_req_replay_clients), + obd->obd_next_recovery_transno); + replay_request_or_update(env, lut, trd, thread); + + /** + * The second stage: replay locks + */ + CDEBUG(D_INFO, "2: lock replay stage - %d clients\n", + atomic_read(&obd->obd_lock_replay_clients)); + while ((req = target_next_replay_lock(lut))) { + LASSERT(trd->trd_processing_task == current->pid); + DEBUG_REQ(D_HA, req, "processing lock from %s:", + libcfs_nid2str(req->rq_peer.nid)); + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_LOCK_REPLAY)) { + req->rq_status = -ENODEV; + target_request_copy_put(req); + continue; + } + handle_recovery_req(thread, req, + trd->trd_recovery_handler); + target_request_copy_put(req); + obd->obd_replayed_locks++; + } + + /** + * The third stage: reply to final pings; at this moment all clients + * must have a request in the final queue + */ + CFS_FAIL_TIMEOUT(OBD_FAIL_TGT_REPLAY_RECONNECT, cfs_fail_val); + CDEBUG(D_INFO, "3: final stage - process recovery completion pings\n"); + /** Update server last boot epoch */ + tgt_boot_epoch_update(lut); + /* + * We drop the recovering flag to forward all new requests + * to the regular mds_handle() from now on + */ + spin_lock(&obd->obd_dev_lock); + obd->obd_recovering = obd->obd_abort_recovery = 0; + obd->obd_abort_recov_mdt = 0; + spin_unlock(&obd->obd_dev_lock); + spin_lock(&obd->obd_recovery_task_lock); + target_cancel_recovery_timer(obd); + spin_unlock(&obd->obd_recovery_task_lock); + while ((req = target_next_final_ping(obd))) { + LASSERT(trd->trd_processing_task == current->pid); + DEBUG_REQ(D_HA, req, "processing final ping from %s:", + libcfs_nid2str(req->rq_peer.nid)); + handle_recovery_req(thread, req, + trd->trd_recovery_handler); + /* + * Because the waiting client cannot send a ping to the + * server, we need to refresh the last_request_time to + * avoid the export being evicted + */ + ptlrpc_update_export_timer(req->rq_export, 0); + target_request_copy_put(req); + } + + delta = jiffies_to_msecs(jiffies - delta) / MSEC_PER_SEC; + CDEBUG(D_INFO, "4: recovery completed in %lus - %d/%d reqs/locks\n", + delta, obd->obd_replayed_requests, obd->obd_replayed_locks); + 
if (delta > OBD_RECOVERY_TIME_SOFT) { + CWARN("too long recovery - read logs\n"); + libcfs_debug_dumplog(); + } + + target_finish_recovery(lut); + lu_context_fini(&env->le_ctx); + trd->trd_processing_task = 0; + complete_all(&trd->trd_finishing); + tgt_io_thread_done(thread); +out_env_remove: + lu_env_remove(env); +out_env: + OBD_FREE_PTR(env); +out_thread: + OBD_FREE_PTR(thread); + RETURN(rc); +} + +static int target_start_recovery_thread(struct lu_target *lut, + svc_handler_t handler) +{ + struct obd_device *obd = lut->lut_obd; + int rc = 0; + struct target_recovery_data *trd = &obd->obd_recovery_data; + int index; + + memset(trd, 0, sizeof(*trd)); + init_completion(&trd->trd_starting); + init_completion(&trd->trd_finishing); + trd->trd_recovery_handler = handler; + + rc = server_name2index(obd->obd_name, &index, NULL); + if (rc < 0) + return rc; + + if (!IS_ERR(kthread_run(target_recovery_thread, + lut, "tgt_recover_%d", index))) { + wait_for_completion(&trd->trd_starting); + LASSERT(obd->obd_recovering != 0); + } else { + rc = -ECHILD; + } + + return rc; +} + +void target_stop_recovery_thread(struct obd_device *obd) +{ + if (obd->obd_recovery_data.trd_processing_task > 0) { + struct target_recovery_data *trd = &obd->obd_recovery_data; + /** recovery can be done but postrecovery is not yet */ + spin_lock(&obd->obd_dev_lock); + if (obd->obd_recovering) { + CERROR("%s: Aborting recovery\n", obd->obd_name); + obd->obd_abort_recovery = 1; + wake_up(&obd->obd_next_transno_waitq); + } + spin_unlock(&obd->obd_dev_lock); + wait_for_completion(&trd->trd_finishing); + } +} +EXPORT_SYMBOL(target_stop_recovery_thread); + +void target_recovery_fini(struct obd_device *obd) +{ + class_disconnect_exports(obd); + target_stop_recovery_thread(obd); + target_cleanup_recovery(obd); +} +EXPORT_SYMBOL(target_recovery_fini); + +static enum hrtimer_restart target_recovery_expired(struct hrtimer *timer) +{ + struct obd_device *obd = container_of(timer, struct obd_device, + obd_recovery_timer); + + CDEBUG(D_HA, + "%s: recovery timed out; %d clients are still in recovery after %llu seconds (%d clients connected)\n", + obd->obd_name, atomic_read(&obd->obd_lock_replay_clients), + ktime_get_seconds() - obd->obd_recovery_start, + atomic_read(&obd->obd_connected_clients)); + + obd->obd_recovery_expired = 1; + wake_up(&obd->obd_next_transno_waitq); + return HRTIMER_NORESTART; +} + +void target_recovery_init(struct lu_target *lut, svc_handler_t handler) +{ + struct obd_device *obd = lut->lut_obd; + + if (lut->lut_bottom->dd_rdonly) + return; + + if (atomic_read(&obd->obd_max_recoverable_clients) == 0) { + /** Update server last boot epoch */ + tgt_boot_epoch_update(lut); + return; + } + + CDEBUG(D_HA, "RECOVERY: service %s, %d recoverable clients, " + "last_transno %llu\n", obd->obd_name, + atomic_read(&obd->obd_max_recoverable_clients), + obd->obd_last_committed); + LASSERT(obd->obd_stopping == 0); + obd->obd_next_recovery_transno = obd->obd_last_committed + 1; + obd->obd_recovery_start = 0; + obd->obd_recovery_end = 0; + + hrtimer_init(&obd->obd_recovery_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); + obd->obd_recovery_timer.function = &target_recovery_expired; + target_start_recovery_thread(lut, handler); +} +EXPORT_SYMBOL(target_recovery_init); + +static int target_process_req_flags(struct obd_device *obd, + struct ptlrpc_request *req) +{ + struct obd_export *exp = req->rq_export; + + LASSERT(exp != NULL); + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REQ_REPLAY_DONE) { + /* client declares he's ready to replay 
locks */ + spin_lock(&exp->exp_lock); + if (exp->exp_req_replay_needed) { + exp->exp_req_replay_needed = 0; + spin_unlock(&exp->exp_lock); + + LASSERT_ATOMIC_POS(&obd->obd_req_replay_clients); + atomic_dec(&obd->obd_req_replay_clients); + } else { + spin_unlock(&exp->exp_lock); + } + } + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_LOCK_REPLAY_DONE) { + /* + * client declares he's ready to complete recovery, + * so we put the request on the final queue + */ + spin_lock(&exp->exp_lock); + if (exp->exp_lock_replay_needed) { + exp->exp_lock_replay_needed = 0; + spin_unlock(&exp->exp_lock); + + LASSERT_ATOMIC_POS(&obd->obd_lock_replay_clients); + atomic_dec(&obd->obd_lock_replay_clients); + } else { + spin_unlock(&exp->exp_lock); + } + } + return 0; +} + +int target_queue_recovery_request(struct ptlrpc_request *req, + struct obd_device *obd) +{ + __u64 transno = lustre_msg_get_transno(req->rq_reqmsg); + struct ptlrpc_request *reqiter; + int inserted = 0; + + ENTRY; + + if (obd->obd_recovery_data.trd_processing_task == current->pid) { + /* Processing the queue right now, don't re-add. */ + RETURN(1); + } + + target_process_req_flags(obd, req); + + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_LOCK_REPLAY_DONE) { + if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_TGT_RECOVERY_REQ_RACE))) { + if (cfs_fail_val == 1) { + cfs_race_state = 1; + cfs_fail_val = 0; + wake_up(&cfs_race_waitq); + + schedule_timeout_interruptible( + cfs_time_seconds(1)); + } + } + + /* + * client declares he's ready to complete recovery, + * so we put the request on the final queue + */ + target_request_copy_get(req); + DEBUG_REQ(D_HA, req, "queue final req"); + wake_up(&obd->obd_next_transno_waitq); + spin_lock(&obd->obd_recovery_task_lock); + if (obd->obd_recovering) { + struct ptlrpc_request *tmp; + struct ptlrpc_request *duplicate = NULL; + + if (likely(!req->rq_export->exp_replay_done)) { + req->rq_export->exp_replay_done = 1; + list_add_tail(&req->rq_list, + &obd->obd_final_req_queue); + spin_unlock(&obd->obd_recovery_task_lock); + RETURN(0); + } + + /* + * XXX O(n), but only happens if the final ping is + * timed out; probably reorganize the list as + * a hash list later + */ + list_for_each_entry_safe(reqiter, tmp, + &obd->obd_final_req_queue, + rq_list) { + if (reqiter->rq_export == req->rq_export) { + list_del_init(&reqiter->rq_list); + duplicate = reqiter; + break; + } + } + + list_add_tail(&req->rq_list, + &obd->obd_final_req_queue); + req->rq_export->exp_replay_done = 1; + spin_unlock(&obd->obd_recovery_task_lock); + + if (duplicate != NULL) { + DEBUG_REQ(D_HA, duplicate, + "put prev final req"); + target_request_copy_put(duplicate); + } + RETURN(0); + } else { + spin_unlock(&obd->obd_recovery_task_lock); + target_request_copy_put(req); + RETURN(obd->obd_stopping ? 
-ENOTCONN : 1); + } + } + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REQ_REPLAY_DONE) { + /* client declares he's ready to replay locks */ + target_request_copy_get(req); + DEBUG_REQ(D_HA, req, "queue lock replay req"); + wake_up(&obd->obd_next_transno_waitq); + spin_lock(&obd->obd_recovery_task_lock); + LASSERT(obd->obd_recovering); + /* usually due to recovery abort */ + if (!req->rq_export->exp_in_recovery) { + spin_unlock(&obd->obd_recovery_task_lock); + target_request_copy_put(req); + RETURN(-ENOTCONN); + } + LASSERT(req->rq_export->exp_lock_replay_needed); + list_add_tail(&req->rq_list, &obd->obd_lock_replay_queue); + spin_unlock(&obd->obd_recovery_task_lock); + RETURN(0); + } + + /* + * CAVEAT EMPTOR: The incoming request message has been swabbed + * (i.e. buflens etc are in my own byte order), but type-dependent + * buffers (eg mdt_body, ost_body etc) have NOT been swabbed. + */ + + if (!transno) { + INIT_LIST_HEAD(&req->rq_list); + DEBUG_REQ(D_HA, req, "not queueing"); + RETURN(1); + } + + /* + * If we're processing the queue, we don't want to queue this + * message. + * + * Also, if this request has a transno less than the one we're waiting + * for, we should process it now. It could (and currently always will) + * be an open request for a descriptor that was opened some time ago. + * + * Also, a resent, replayed request that has already been + * handled will pass through here and be processed immediately. + */ + CDEBUG(D_HA, + "Next recovery transno: %llu, current: %llu, replaying\n", + obd->obd_next_recovery_transno, transno); + + /* + * If the request has been replayed by update replay, then send this + * request to the recovery thread (replay_request_or_update()), where + * it will be handled + */ + spin_lock(&obd->obd_recovery_task_lock); + if (transno < obd->obd_next_recovery_transno && + !is_req_replayed_by_update(req)) { + /* Processing the queue right now, don't re-add. 
*/ + LASSERT(list_empty(&req->rq_list)); + spin_unlock(&obd->obd_recovery_task_lock); + RETURN(1); + } + spin_unlock(&obd->obd_recovery_task_lock); + + if (OBD_FAIL_CHECK(OBD_FAIL_TGT_REPLAY_DROP)) + RETURN(0); + + target_request_copy_get(req); + if (!req->rq_export->exp_in_recovery) { + target_request_copy_put(req); + RETURN(-ENOTCONN); + } + LASSERT(req->rq_export->exp_req_replay_needed); + + if (target_exp_enqueue_req_replay(req)) { + DEBUG_REQ(D_ERROR, req, "dropping resent queued req"); + target_request_copy_put(req); + RETURN(0); + } + + /* XXX O(n^2) */ + spin_lock(&obd->obd_recovery_task_lock); + LASSERT(obd->obd_recovering); + list_for_each_entry(reqiter, &obd->obd_req_replay_queue, rq_list) { + if (lustre_msg_get_transno(reqiter->rq_reqmsg) > transno) { + list_add_tail(&req->rq_list, &reqiter->rq_list); + inserted = 1; + goto added; + } + + if (unlikely(lustre_msg_get_transno(reqiter->rq_reqmsg) == + transno)) { + DEBUG_REQ(D_ERROR, req, + "dropping replay: transno has been claimed by another client"); + spin_unlock(&obd->obd_recovery_task_lock); + target_exp_dequeue_req_replay(req); + target_request_copy_put(req); + RETURN(0); + } + } +added: + if (!inserted) + list_add_tail(&req->rq_list, &obd->obd_req_replay_queue); + + obd->obd_requests_queued_for_recovery++; + spin_unlock(&obd->obd_recovery_task_lock); + wake_up(&obd->obd_next_transno_waitq); + RETURN(0); +} + +void target_committed_to_req(struct ptlrpc_request *req) +{ + struct obd_export *exp = req->rq_export; + + if (!exp->exp_obd->obd_no_transno && req->rq_repmsg != NULL) + lustre_msg_set_last_committed(req->rq_repmsg, + exp->exp_last_committed); + else + DEBUG_REQ(D_IOCTL, req, + "not sending last_committed update (%d/%d)", + exp->exp_obd->obd_no_transno, + req->rq_repmsg == NULL); + + CDEBUG(D_INFO, "last_committed %llu, transno %llu, xid %llu\n", + exp->exp_last_committed, req->rq_transno, req->rq_xid); +} + +#endif /* HAVE_SERVER_SUPPORT */ + +/** + * Packs current SLV and Limit into \a req. + */ +int target_pack_pool_reply(struct ptlrpc_request *req) +{ + struct obd_device *obd; + + ENTRY; + + /* + * Check that we still have all structures alive as this may + * be some late RPC at shutdown time. + */ + if (unlikely(!req->rq_export || !req->rq_export->exp_obd || + !exp_connect_lru_resize(req->rq_export))) { + lustre_msg_set_slv(req->rq_repmsg, 0); + lustre_msg_set_limit(req->rq_repmsg, 0); + RETURN(0); + } + + /* OBD is alive here as export is alive, which we checked above. 
*/ + obd = req->rq_export->exp_obd; + + read_lock(&obd->obd_pool_lock); + lustre_msg_set_slv(req->rq_repmsg, obd->obd_pool_slv); + lustre_msg_set_limit(req->rq_repmsg, obd->obd_pool_limit); + read_unlock(&obd->obd_pool_lock); + + RETURN(0); +} + +static int target_send_reply_msg(struct ptlrpc_request *req, + int rc, int fail_id) +{ + if (OBD_FAIL_CHECK_ORSET(fail_id & ~OBD_FAIL_ONCE, OBD_FAIL_ONCE)) { + DEBUG_REQ(D_ERROR, req, "dropping reply"); + return -ECOMM; + } + /* + * We can have a null rq_reqmsg in the event of bad signature or + * no context when unwrapping + */ + if (req->rq_reqmsg && + unlikely(lustre_msg_get_opc(req->rq_reqmsg) == MDS_REINT && + OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_MULTI_NET_REP))) + return -ECOMM; + + if (unlikely(rc)) { + DEBUG_REQ(D_NET, req, "processing error (%d)", rc); + req->rq_status = rc; + return ptlrpc_send_error(req, 1); + } + DEBUG_REQ(D_NET, req, "sending reply"); + + return ptlrpc_send_reply(req, PTLRPC_REPLY_MAYBE_DIFFICULT); +} + +void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id) +{ + struct ptlrpc_service_part *svcpt; + int netrc; + struct ptlrpc_reply_state *rs; + struct obd_export *exp; + + ENTRY; + + if (req->rq_no_reply) { + EXIT; + return; + } + + svcpt = req->rq_rqbd->rqbd_svcpt; + rs = req->rq_reply_state; + if (rs == NULL || !rs->rs_difficult) { + /* no notifiers */ + target_send_reply_msg(req, rc, fail_id); + EXIT; + return; + } + + /* must be an export if locks saved */ + LASSERT(req->rq_export != NULL); + /* req/reply consistent */ + LASSERT(rs->rs_svcpt == svcpt); + + /* "fresh" reply */ + LASSERT(!rs->rs_scheduled); + LASSERT(!rs->rs_scheduled_ever); + LASSERT(!rs->rs_handled); + LASSERT(!rs->rs_sent); + LASSERT(!rs->rs_unlinked); + LASSERT(rs->rs_export == NULL); + LASSERT(list_empty(&rs->rs_obd_list)); + LASSERT(list_empty(&rs->rs_exp_list)); + + exp = class_export_get(req->rq_export); + + /* disable reply scheduling while I'm setting up */ + rs->rs_scheduled = 1; + rs->rs_sent = 0; + rs->rs_unlinked = 0; + rs->rs_xid = req->rq_xid; + rs->rs_transno = req->rq_transno; + rs->rs_export = exp; + rs->rs_opc = lustre_msg_get_opc(req->rq_reqmsg); + + spin_lock(&exp->exp_uncommitted_replies_lock); + CDEBUG(D_NET, "rs transno = %llu, last committed = %llu\n", + rs->rs_transno, exp->exp_last_committed); + if (rs->rs_transno > exp->exp_last_committed) { + /* not committed already */ + list_add_tail(&rs->rs_obd_list, + &exp->exp_uncommitted_replies); + } + spin_unlock(&exp->exp_uncommitted_replies_lock); + + spin_lock(&exp->exp_lock); + list_add_tail(&rs->rs_exp_list, &exp->exp_outstanding_replies); + spin_unlock(&exp->exp_lock); + + netrc = target_send_reply_msg(req, rc, fail_id); + + spin_lock(&svcpt->scp_rep_lock); + + atomic_inc(&svcpt->scp_nreps_difficult); + + if (netrc != 0) { + /* + * error sending: reply is off the net. 
Also we need +1 + * reply ref until ptlrpc_handle_rs() is done + * with the reply state (if the send was successful, there + * would have been +1 ref for the net, which + * reply_out_callback leaves alone) + */ + rs->rs_sent = 1; + rs->rs_unlinked = 1; + ptlrpc_rs_addref(rs); + } + + spin_lock(&rs->rs_lock); + if (rs->rs_transno <= exp->exp_last_committed || + (rs->rs_unlinked && !rs->rs_no_ack) || + list_empty(&rs->rs_exp_list) || /* completed already */ + list_empty(&rs->rs_obd_list)) { + CDEBUG(D_HA, "Schedule reply immediately\n"); + ptlrpc_dispatch_difficult_reply(rs); + } else { + list_add(&rs->rs_list, &svcpt->scp_rep_active); + rs->rs_scheduled = 0; /* allow notifier to schedule */ + } + spin_unlock(&rs->rs_lock); + spin_unlock(&svcpt->scp_rep_lock); + EXIT; +} + +enum ldlm_mode lck_compat_array[] = { + [LCK_EX] = LCK_COMPAT_EX, + [LCK_PW] = LCK_COMPAT_PW, + [LCK_PR] = LCK_COMPAT_PR, + [LCK_CW] = LCK_COMPAT_CW, + [LCK_CR] = LCK_COMPAT_CR, + [LCK_NL] = LCK_COMPAT_NL, + [LCK_GROUP] = LCK_COMPAT_GROUP, + [LCK_COS] = LCK_COMPAT_COS, +}; + +/** + * Rather arbitrary mapping from LDLM error codes to errno values. This should + * not escape to the user level. + */ +int ldlm_error2errno(enum ldlm_error error) +{ + int result; + + switch (error) { + case ELDLM_OK: + case ELDLM_LOCK_MATCHED: + result = 0; + break; + case ELDLM_LOCK_CHANGED: + result = -ESTALE; + break; + case ELDLM_LOCK_ABORTED: + result = -ENAVAIL; + break; + case ELDLM_LOCK_REPLACED: + result = -ESRCH; + break; + case ELDLM_NO_LOCK_DATA: + result = -ENOENT; + break; + case ELDLM_NAMESPACE_EXISTS: + result = -EEXIST; + break; + case ELDLM_BAD_NAMESPACE: + result = -EBADF; + break; + default: + if (((int)error) < 0) { /* cast to signed type */ + result = error; /* as ldlm_error can be unsigned */ + } else { + CERROR("Invalid DLM result code: %d\n", error); + result = -EPROTO; + } + } + return result; +} +EXPORT_SYMBOL(ldlm_error2errno); + +/** + * Dual to ldlm_error2errno(): maps errno values back to enum ldlm_error. + */ +enum ldlm_error ldlm_errno2error(int err_no) +{ + int error; + + switch (err_no) { + case 0: + error = ELDLM_OK; + break; + case -ESTALE: + error = ELDLM_LOCK_CHANGED; + break; + case -ENAVAIL: + error = ELDLM_LOCK_ABORTED; + break; + case -ESRCH: + error = ELDLM_LOCK_REPLACED; + break; + case -ENOENT: + error = ELDLM_NO_LOCK_DATA; + break; + case -EEXIST: + error = ELDLM_NAMESPACE_EXISTS; + break; + case -EBADF: + error = ELDLM_BAD_NAMESPACE; + break; + default: + error = err_no; + } + return error; +} + +#if LUSTRE_TRACKS_LOCK_EXP_REFS +void ldlm_dump_export_locks(struct obd_export *exp) +{ + spin_lock(&exp->exp_locks_list_guard); + if (!list_empty(&exp->exp_locks_list)) { + struct ldlm_lock *lock; + + CERROR("dumping locks for export %p, ignore if the unmount doesn't hang\n", + exp); + list_for_each_entry(lock, &exp->exp_locks_list, + l_exp_refs_link) + LDLM_ERROR(lock, "lock:"); + } + spin_unlock(&exp->exp_locks_list_guard); +} +#endif + +#ifdef HAVE_SERVER_SUPPORT +static inline const char *bulk2type(struct ptlrpc_request *req) +{ + if (req->rq_bulk_read) + return "READ"; + if (req->rq_bulk_write) + return "WRITE"; + return "UNKNOWN"; +} + +int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_request *req = desc->bd_req; + time64_t start = ktime_get_seconds(); + time64_t deadline; + int rc = 0; + + ENTRY; + + /* If there is eviction in progress, wait for it to finish. 
*/ + wait_event_idle( + exp->exp_obd->obd_evict_inprogress_waitq, + !atomic_read(&exp->exp_obd->obd_evict_inprogress)); + + /* Check if client was evicted or reconnected already. */ + if (exp->exp_failed || + exp->exp_conn_cnt > lustre_msg_get_conn_cnt(req->rq_reqmsg)) { + rc = -ENOTCONN; + } else { + if (req->rq_bulk_read) + rc = sptlrpc_svc_wrap_bulk(req, desc); + + if (OCD_HAS_FLAG(&exp->exp_connect_data, BULK_MBITS)) + req->rq_mbits = lustre_msg_get_mbits(req->rq_reqmsg); + else /* old version, bulk matchbits is rq_xid */ + req->rq_mbits = req->rq_xid; + + if (rc == 0) + rc = ptlrpc_start_bulk_transfer(desc); + } + + if (rc < 0) { + DEBUG_REQ(D_ERROR, req, "bulk %s failed: rc = %d", + bulk2type(req), rc); + RETURN(rc); + } + + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_SENDPAGE)) { + ptlrpc_abort_bulk(desc); + RETURN(0); + } + + /* limit actual bulk transfer to bulk_timeout seconds */ + deadline = start + bulk_timeout; + if (deadline > req->rq_deadline) + deadline = req->rq_deadline; + + do { + time64_t timeoutl = deadline - ktime_get_seconds(); + time64_t rq_deadline; + + while (timeoutl >= 0 && + wait_event_idle_timeout( + desc->bd_waitq, + !ptlrpc_server_bulk_active(desc) || + exp->exp_failed || + exp->exp_conn_cnt > + lustre_msg_get_conn_cnt(req->rq_reqmsg), + timeoutl ? cfs_time_seconds(1) : 1) == 0) + timeoutl -= 1; + rc = timeoutl < 0 ? -ETIMEDOUT : 0; + + /* Wait again if we changed rq_deadline. */ + rq_deadline = READ_ONCE(req->rq_deadline); + deadline = start + bulk_timeout; + if (deadline > rq_deadline) + deadline = rq_deadline; + } while (rc == -ETIMEDOUT && + deadline > ktime_get_seconds()); + + if (rc == -ETIMEDOUT) { + DEBUG_REQ(D_ERROR, req, "timeout on bulk %s after %lld%+llds", + bulk2type(req), deadline - start, + ktime_get_real_seconds() - deadline); + ptlrpc_abort_bulk(desc); + } else if (exp->exp_failed) { + DEBUG_REQ(D_ERROR, req, "Eviction on bulk %s", + bulk2type(req)); + rc = -ENOTCONN; + ptlrpc_abort_bulk(desc); + } else if (exp->exp_conn_cnt > + lustre_msg_get_conn_cnt(req->rq_reqmsg)) { + DEBUG_REQ(D_ERROR, req, "Reconnect on bulk %s", + bulk2type(req)); + /* We don't reply anyway. */ + rc = -ETIMEDOUT; + ptlrpc_abort_bulk(desc); + } else if (desc->bd_failure) { + DEBUG_REQ(D_ERROR, req, "network error on bulk %s", + bulk2type(req)); + /* XXX should this be a different errno? */ + rc = -ETIMEDOUT; + } else { + if (req->rq_bulk_write) + rc = sptlrpc_svc_unwrap_bulk(req, desc); + if (rc == 0 && desc->bd_nob_transferred != desc->bd_nob) { + DEBUG_REQ(D_ERROR, req, "truncated bulk %s %d(%d)", + bulk2type(req), desc->bd_nob_transferred, + desc->bd_nob); + /* XXX should this be a different errno? */ + rc = -ETIMEDOUT; + } + } + + RETURN(rc); +} +EXPORT_SYMBOL(target_bulk_io); + +#endif /* HAVE_SERVER_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lock.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lock.c new file mode 100644 index 0000000000000..3ceadc9086a97 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lock.c @@ -0,0 +1,2898 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ldlm/ldlm_lock.c + * + * Author: Peter Braam + * Author: Phil Schwan + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include + +#include +#include + +#include "ldlm_internal.h" + +struct kmem_cache *ldlm_glimpse_work_kmem; +EXPORT_SYMBOL(ldlm_glimpse_work_kmem); + +/* lock types */ +char *ldlm_lockname[] = { + [0] = "--", + [LCK_EX] = "EX", + [LCK_PW] = "PW", + [LCK_PR] = "PR", + [LCK_CW] = "CW", + [LCK_CR] = "CR", + [LCK_NL] = "NL", + [LCK_GROUP] = "GROUP", + [LCK_COS] = "COS" +}; +EXPORT_SYMBOL(ldlm_lockname); + +char *ldlm_typename[] = { + [LDLM_PLAIN] = "PLN", + [LDLM_EXTENT] = "EXT", + [LDLM_FLOCK] = "FLK", + [LDLM_IBITS] = "IBT", +}; + +static ldlm_policy_wire_to_local_t ldlm_policy_wire_to_local[] = { + [LDLM_PLAIN - LDLM_MIN_TYPE] = ldlm_plain_policy_wire_to_local, + [LDLM_EXTENT - LDLM_MIN_TYPE] = ldlm_extent_policy_wire_to_local, + [LDLM_FLOCK - LDLM_MIN_TYPE] = ldlm_flock_policy_wire_to_local, + [LDLM_IBITS - LDLM_MIN_TYPE] = ldlm_ibits_policy_wire_to_local, +}; + +static ldlm_policy_local_to_wire_t ldlm_policy_local_to_wire[] = { + [LDLM_PLAIN - LDLM_MIN_TYPE] = ldlm_plain_policy_local_to_wire, + [LDLM_EXTENT - LDLM_MIN_TYPE] = ldlm_extent_policy_local_to_wire, + [LDLM_FLOCK - LDLM_MIN_TYPE] = ldlm_flock_policy_local_to_wire, + [LDLM_IBITS - LDLM_MIN_TYPE] = ldlm_ibits_policy_local_to_wire, +}; + +/** + * Converts lock policy from local format to on the wire lock_desc format + */ +void ldlm_convert_policy_to_wire(enum ldlm_type type, + const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy) +{ + ldlm_policy_local_to_wire_t convert; + + convert = ldlm_policy_local_to_wire[type - LDLM_MIN_TYPE]; + + convert(lpolicy, wpolicy); +} + +/** + * Converts lock policy from on the wire lock_desc format to local format + */ +void ldlm_convert_policy_to_local(struct obd_export *exp, enum ldlm_type type, + const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy) +{ + ldlm_policy_wire_to_local_t convert; + + convert = ldlm_policy_wire_to_local[type - LDLM_MIN_TYPE]; + + convert(wpolicy, lpolicy); +} + +const char *ldlm_it2str(enum ldlm_intent_flags it) +{ + switch (it) { + case IT_OPEN: + return "open"; + case IT_CREAT: + return "creat"; + case (IT_OPEN | IT_CREAT): + return "open|creat"; + case IT_READDIR: + return "readdir"; + case IT_GETATTR: + return "getattr"; + case IT_LOOKUP: + return "lookup"; + case IT_GETXATTR: + return "getxattr"; + case IT_LAYOUT: + return "layout"; + default: + CERROR("Unknown intent 0x%08x\n", it); + return "UNKNOWN"; + } +} +EXPORT_SYMBOL(ldlm_it2str); + +#ifdef HAVE_SERVER_SUPPORT +static ldlm_processing_policy ldlm_processing_policy_table[] = { + [LDLM_PLAIN] = ldlm_process_plain_lock, + [LDLM_EXTENT] = ldlm_process_extent_lock, + [LDLM_FLOCK] = ldlm_process_flock_lock, + [LDLM_IBITS] = ldlm_process_inodebits_lock, +}; + +ldlm_processing_policy ldlm_get_processing_policy(struct ldlm_resource *res) +{ + return 
ldlm_processing_policy_table[res->lr_type]; +} +EXPORT_SYMBOL(ldlm_get_processing_policy); + +static ldlm_reprocessing_policy ldlm_reprocessing_policy_table[] = { + [LDLM_PLAIN] = ldlm_reprocess_queue, + [LDLM_EXTENT] = ldlm_reprocess_queue, + [LDLM_FLOCK] = ldlm_reprocess_queue, + [LDLM_IBITS] = ldlm_reprocess_inodebits_queue, +}; + +ldlm_reprocessing_policy ldlm_get_reprocessing_policy(struct ldlm_resource *res) +{ + return ldlm_reprocessing_policy_table[res->lr_type]; +} + +#endif /* HAVE_SERVER_SUPPORT */ + +void ldlm_register_intent(struct ldlm_namespace *ns, ldlm_res_policy arg) +{ + ns->ns_policy = arg; +} +EXPORT_SYMBOL(ldlm_register_intent); + +/* + * REFCOUNTED LOCK OBJECTS + */ + + +/** + * Get a reference on a lock. + * + * Lock refcounts, during creation: + * - one special one for allocation, dec'd only once in destroy + * - one for being a lock that's in-use + * - one for the addref associated with a new lock + */ +struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock) +{ + refcount_inc(&lock->l_handle.h_ref); + return lock; +} +EXPORT_SYMBOL(ldlm_lock_get); + +static void lock_handle_free(struct rcu_head *rcu) +{ + struct ldlm_lock *lock = container_of(rcu, struct ldlm_lock, + l_handle.h_rcu); + + OBD_FREE_PRE(lock, sizeof(*lock), "slab-freed"); + kmem_cache_free(ldlm_lock_slab, lock); +} + +/** + * Release lock reference. + * + * Also frees the lock if it was last reference. + */ +void ldlm_lock_put(struct ldlm_lock *lock) +{ + ENTRY; + + LASSERT(lock->l_resource != LP_POISON); + LASSERT(refcount_read(&lock->l_handle.h_ref) > 0); + if (refcount_dec_and_test(&lock->l_handle.h_ref)) { + struct ldlm_resource *res; + + LDLM_DEBUG(lock, + "final lock_put on destroyed lock, freeing it."); + + res = lock->l_resource; + LASSERT(ldlm_is_destroyed(lock)); + LASSERT(list_empty(&lock->l_exp_list)); + LASSERT(list_empty(&lock->l_res_link)); + LASSERT(list_empty(&lock->l_pending_chain)); + + lprocfs_counter_decr(ldlm_res_to_ns(res)->ns_stats, + LDLM_NSS_LOCKS); + lu_ref_del(&res->lr_reference, "lock", lock); + if (lock->l_export) { + class_export_lock_put(lock->l_export, lock); + lock->l_export = NULL; + } + + if (lock->l_lvb_data != NULL) + OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len); + + if (res->lr_type == LDLM_EXTENT) { + ldlm_interval_free(ldlm_interval_detach(lock)); + } else if (res->lr_type == LDLM_IBITS) { + if (lock->l_ibits_node != NULL) + OBD_SLAB_FREE_PTR(lock->l_ibits_node, + ldlm_inodebits_slab); + } + ldlm_resource_putref(res); + lock->l_resource = NULL; + lu_ref_fini(&lock->l_reference); + call_rcu(&lock->l_handle.h_rcu, lock_handle_free); + } + + EXIT; +} +EXPORT_SYMBOL(ldlm_lock_put); + +/** + * Removes LDLM lock \a lock from LRU. Assumes LRU is already locked. + */ +int ldlm_lock_remove_from_lru_nolock(struct ldlm_lock *lock) +{ + int rc = 0; + if (!list_empty(&lock->l_lru)) { + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + LASSERT(lock->l_resource->lr_type != LDLM_FLOCK); + if (ns->ns_last_pos == &lock->l_lru) + ns->ns_last_pos = lock->l_lru.prev; + list_del_init(&lock->l_lru); + LASSERT(ns->ns_nr_unused > 0); + ns->ns_nr_unused--; + rc = 1; + } + return rc; +} + +/** + * Removes LDLM lock \a lock from LRU. Obtains the LRU lock first. + * + * If \a last_use is non-zero, it will remove the lock from LRU only if + * it matches lock's l_last_used. + * + * \retval 0 if \a last_use is set, the lock is not in LRU list or \a last_use + * doesn't match lock's l_last_used; + * otherwise, the lock hasn't been in the LRU list. 
+ * \retval 1 the lock was in LRU list and removed. + */ +int ldlm_lock_remove_from_lru_check(struct ldlm_lock *lock, ktime_t last_use) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + int rc = 0; + + ENTRY; + if (ldlm_is_ns_srv(lock)) { + LASSERT(list_empty(&lock->l_lru)); + RETURN(0); + } + + spin_lock(&ns->ns_lock); + if (!ktime_compare(last_use, ktime_set(0, 0)) || + !ktime_compare(last_use, lock->l_last_used)) + rc = ldlm_lock_remove_from_lru_nolock(lock); + spin_unlock(&ns->ns_lock); + + RETURN(rc); +} + +/** + * Adds LDLM lock \a lock to namespace LRU. Assumes LRU is already locked. + */ +void ldlm_lock_add_to_lru_nolock(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + lock->l_last_used = ktime_get(); + LASSERT(list_empty(&lock->l_lru)); + LASSERT(lock->l_resource->lr_type != LDLM_FLOCK); + list_add_tail(&lock->l_lru, &ns->ns_unused_list); + LASSERT(ns->ns_nr_unused >= 0); + ns->ns_nr_unused++; +} + +/** + * Adds LDLM lock \a lock to namespace LRU. Obtains necessary LRU locks + * first. + */ +void ldlm_lock_add_to_lru(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + ENTRY; + spin_lock(&ns->ns_lock); + ldlm_lock_add_to_lru_nolock(lock); + spin_unlock(&ns->ns_lock); + EXIT; +} + +/** + * Moves LDLM lock \a lock that is already in namespace LRU to the tail of + * the LRU. Performs necessary LRU locking + */ +void ldlm_lock_touch_in_lru(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + ENTRY; + if (ldlm_is_ns_srv(lock)) { + LASSERT(list_empty(&lock->l_lru)); + EXIT; + return; + } + + spin_lock(&ns->ns_lock); + if (!list_empty(&lock->l_lru)) { + ldlm_lock_remove_from_lru_nolock(lock); + ldlm_lock_add_to_lru_nolock(lock); + } + spin_unlock(&ns->ns_lock); + EXIT; +} + +/** + * Helper to destroy a locked lock. + * + * Used by ldlm_lock_destroy and ldlm_lock_destroy_nolock + * Must be called with l_lock and lr_lock held. + * + * Does not actually free the lock data, but rather marks the lock as + * destroyed by setting l_destroyed field in the lock to 1. Destroys a + * handle->lock association too, so that the lock can no longer be found + * and removes the lock from LRU list. Actual lock freeing occurs when + * last lock reference goes away. + * + * Original comment (of some historical value): + * This used to have a 'strict' flag, which recovery would use to mark an + * in-use lock as needing-to-die. Lest I am ever tempted to put it back, I + * shall explain why it's gone: with the new hash table scheme, once you call + * ldlm_lock_destroy, you can never drop your final references on this lock. + * Because it's not in the hash table anymore. -phil + */ +static int ldlm_lock_destroy_internal(struct ldlm_lock *lock) +{ + ENTRY; + + if (lock->l_readers || lock->l_writers) { + LDLM_ERROR(lock, "lock still has references"); + LBUG(); + } + + if (!list_empty(&lock->l_res_link)) { + LDLM_ERROR(lock, "lock still on resource"); + LBUG(); + } + + if (ldlm_is_destroyed(lock)) { + LASSERT(list_empty(&lock->l_lru)); + EXIT; + return 0; + } + ldlm_set_destroyed(lock); + + if (lock->l_export && lock->l_export->exp_lock_hash) { + /* NB: it's safe to call cfs_hash_del() even lock isn't + * in exp_lock_hash. 
*/ + /* In the function below, .hs_keycmp resolves to + * ldlm_export_lock_keycmp() */ + /* coverity[overrun-buffer-val] */ + cfs_hash_del(lock->l_export->exp_lock_hash, + &lock->l_remote_handle, &lock->l_exp_hash); + } + + ldlm_lock_remove_from_lru(lock); + class_handle_unhash(&lock->l_handle); + + EXIT; + return 1; +} + +/** + * Destroys a LDLM lock \a lock. Performs necessary locking first. + */ +void ldlm_lock_destroy(struct ldlm_lock *lock) +{ + int first; + ENTRY; + lock_res_and_lock(lock); + first = ldlm_lock_destroy_internal(lock); + unlock_res_and_lock(lock); + + /* drop reference from hashtable only for first destroy */ + if (first) { + lu_ref_del(&lock->l_reference, "hash", lock); + LDLM_LOCK_RELEASE(lock); + } + EXIT; +} + +/** + * Destroys a LDLM lock \a lock that is already locked. + */ +void ldlm_lock_destroy_nolock(struct ldlm_lock *lock) +{ + int first; + ENTRY; + first = ldlm_lock_destroy_internal(lock); + /* drop reference from hashtable only for first destroy */ + if (first) { + lu_ref_del(&lock->l_reference, "hash", lock); + LDLM_LOCK_RELEASE(lock); + } + EXIT; +} + +static const char lock_handle_owner[] = "ldlm"; + +/** + * + * Allocate and initialize new lock structure. + * + * usage: pass in a resource on which you have done ldlm_resource_get + * new lock will take over the refcount. + * returns: lock with refcount 2 - one for current caller and one for remote + */ +static struct ldlm_lock *ldlm_lock_new(struct ldlm_resource *resource) +{ + struct ldlm_lock *lock; + ENTRY; + + if (resource == NULL) + LBUG(); + + OBD_SLAB_ALLOC_PTR_GFP(lock, ldlm_lock_slab, GFP_NOFS); + if (lock == NULL) + RETURN(NULL); + + RCU_INIT_POINTER(lock->l_resource, resource); + lu_ref_add(&resource->lr_reference, "lock", lock); + + refcount_set(&lock->l_handle.h_ref, 2); + INIT_LIST_HEAD(&lock->l_res_link); + INIT_LIST_HEAD(&lock->l_lru); + INIT_LIST_HEAD(&lock->l_pending_chain); + INIT_LIST_HEAD(&lock->l_bl_ast); + INIT_LIST_HEAD(&lock->l_cp_ast); + INIT_LIST_HEAD(&lock->l_rk_ast); + init_waitqueue_head(&lock->l_waitq); + lock->l_blocking_lock = NULL; + INIT_LIST_HEAD(&lock->l_sl_mode); + INIT_LIST_HEAD(&lock->l_sl_policy); + INIT_HLIST_NODE(&lock->l_exp_hash); + INIT_HLIST_NODE(&lock->l_exp_flock_hash); + + lprocfs_counter_incr(ldlm_res_to_ns(resource)->ns_stats, + LDLM_NSS_LOCKS); + INIT_HLIST_NODE(&lock->l_handle.h_link); + class_handle_hash(&lock->l_handle, lock_handle_owner); + + lu_ref_init(&lock->l_reference); + lu_ref_add(&lock->l_reference, "hash", lock); + lock->l_callback_timestamp = 0; + lock->l_activity = 0; + +#if LUSTRE_TRACKS_LOCK_EXP_REFS + INIT_LIST_HEAD(&lock->l_exp_refs_link); + lock->l_exp_refs_nr = 0; + lock->l_exp_refs_target = NULL; +#endif + INIT_LIST_HEAD(&lock->l_exp_list); + + RETURN(lock); +} + +/** + * Moves LDLM lock \a lock to another resource. 
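 * The new resource is obtained via ldlm_resource_get() before the switch, + * and both resources are then locked in memory-address order, so the swap + * itself cannot deadlock. 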
+ * This is used on client when server returns some other lock than requested + * (typically as a result of intent operation) + */ +int ldlm_lock_change_resource(struct ldlm_namespace *ns, struct ldlm_lock *lock, + const struct ldlm_res_id *new_resid) +{ + struct ldlm_resource *oldres; + struct ldlm_resource *newres; + int type; + ENTRY; + + LASSERT(ns_is_client(ns)); + + oldres = lock_res_and_lock(lock); + if (memcmp(new_resid, &oldres->lr_name, + sizeof(oldres->lr_name)) == 0) { + /* Nothing to do */ + unlock_res_and_lock(lock); + RETURN(0); + } + + LASSERT(new_resid->name[0] != 0); + + /* This function assumes that the lock isn't on any lists */ + LASSERT(list_empty(&lock->l_res_link)); + + type = oldres->lr_type; + unlock_res_and_lock(lock); + + newres = ldlm_resource_get(ns, NULL, new_resid, type, 1); + if (IS_ERR(newres)) + RETURN(PTR_ERR(newres)); + + lu_ref_add(&newres->lr_reference, "lock", lock); + /* + * To flip the lock from the old to the new resource, oldres + * and newres have to be locked. Resource spin-locks are taken + * in the memory address order to avoid dead-locks. + * As this is the only circumstance where ->l_resource + * can change, and this cannot race with itself, it is safe + * to access lock->l_resource without being careful about locking. + */ + oldres = lock->l_resource; + if (oldres < newres) { + lock_res(oldres); + lock_res_nested(newres, LRT_NEW); + } else { + lock_res(newres); + lock_res_nested(oldres, LRT_NEW); + } + LASSERT(memcmp(new_resid, &oldres->lr_name, + sizeof oldres->lr_name) != 0); + rcu_assign_pointer(lock->l_resource, newres); + unlock_res(oldres); + unlock_res(newres); + + /* ...and the flowers are still standing! */ + lu_ref_del(&oldres->lr_reference, "lock", lock); + ldlm_resource_putref(oldres); + + RETURN(0); +} + +/** \defgroup ldlm_handles LDLM HANDLES + * Ways to get hold of locks without any addresses. + * @{ + */ + +/** + * Fills in handle for LDLM lock \a lock into supplied \a lockh + * Does not take any references. + */ +void ldlm_lock2handle(const struct ldlm_lock *lock, struct lustre_handle *lockh) +{ + lockh->cookie = lock->l_handle.h_cookie; +} +EXPORT_SYMBOL(ldlm_lock2handle); + +/** + * Obtain a lock reference by handle. + * + * if \a flags: atomically get the lock and set the flags. + * Return NULL if flag already set + */ +struct ldlm_lock *__ldlm_handle2lock(const struct lustre_handle *handle, + __u64 flags) +{ + struct ldlm_lock *lock; + ENTRY; + + LASSERT(handle); + + if (!lustre_handle_is_used(handle)) + RETURN(NULL); + + lock = class_handle2object(handle->cookie, lock_handle_owner); + if (lock == NULL) + RETURN(NULL); + + if (lock->l_export != NULL && lock->l_export->exp_failed) { + CDEBUG(D_INFO, "lock export failed: lock %p, exp %p\n", + lock, lock->l_export); + LDLM_LOCK_PUT(lock); + RETURN(NULL); + } + + /* It's unlikely but possible that someone marked the lock as + * destroyed after we did handle2object on it */ + if ((flags == 0) && !ldlm_is_destroyed(lock)) { + lu_ref_add_atomic(&lock->l_reference, "handle", lock); + RETURN(lock); + } + + lock_res_and_lock(lock); + + LASSERT(lock->l_resource != NULL); + + lu_ref_add_atomic(&lock->l_reference, "handle", lock); + if (unlikely(ldlm_is_destroyed(lock))) { + unlock_res_and_lock(lock); + CDEBUG(D_INFO, "lock already destroyed: lock %p\n", lock); + LDLM_LOCK_PUT(lock); + RETURN(NULL); + } + + /* If we're setting flags, make sure none of them are already set. 
*/ + if (flags != 0) { + if ((lock->l_flags & flags) != 0) { + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + RETURN(NULL); + } + + lock->l_flags |= flags; + } + + unlock_res_and_lock(lock); + RETURN(lock); +} +EXPORT_SYMBOL(__ldlm_handle2lock); +/** @} ldlm_handles */ + +/** + * Fill in "on the wire" representation for given LDLM lock into supplied + * lock descriptor \a desc structure. + */ +void ldlm_lock2desc(struct ldlm_lock *lock, struct ldlm_lock_desc *desc) +{ + ldlm_res2desc(lock->l_resource, &desc->l_resource); + desc->l_req_mode = lock->l_req_mode; + desc->l_granted_mode = lock->l_granted_mode; + ldlm_convert_policy_to_wire(lock->l_resource->lr_type, + &lock->l_policy_data, + &desc->l_policy_data); +} + +/** + * Add a lock to list of conflicting locks to send AST to. + * + * Only add if we have not sent a blocking AST to the lock yet. + */ +static void ldlm_add_bl_work_item(struct ldlm_lock *lock, struct ldlm_lock *new, + struct list_head *work_list) +{ + if (!ldlm_is_ast_sent(lock)) { + LDLM_DEBUG(lock, "lock incompatible; sending blocking AST."); + ldlm_set_ast_sent(lock); + /* If the enqueuing client said so, tell the AST recipient to + * discard dirty data, rather than writing back. */ + if (ldlm_is_ast_discard_data(new)) + ldlm_set_discard_data(lock); + + /* Lock can be converted from a blocking state back to granted + * after lock convert or COS downgrade but still be in an + * older bl_list because it is controlled only by + * ldlm_work_bl_ast_lock(), let it be processed there. + */ + if (list_empty(&lock->l_bl_ast)) { + list_add(&lock->l_bl_ast, work_list); + LDLM_LOCK_GET(lock); + } + LASSERT(lock->l_blocking_lock == NULL); + lock->l_blocking_lock = LDLM_LOCK_GET(new); + } +} + +/** + * Add a lock to list of just granted locks to send completion AST to. + */ +static void ldlm_add_cp_work_item(struct ldlm_lock *lock, + struct list_head *work_list) +{ + if (!ldlm_is_cp_reqd(lock)) { + ldlm_set_cp_reqd(lock); + LDLM_DEBUG(lock, "lock granted; sending completion AST."); + LASSERT(list_empty(&lock->l_cp_ast)); + list_add(&lock->l_cp_ast, work_list); + LDLM_LOCK_GET(lock); + } +} + +/** + * Aggregator function to add AST work items into a list. Determines + * what sort of an AST work needs to be done and calls the proper + * adding function. + * Must be called with lr_lock held. + */ +void ldlm_add_ast_work_item(struct ldlm_lock *lock, struct ldlm_lock *new, + struct list_head *work_list) +{ + ENTRY; + check_res_locked(lock->l_resource); + if (new) + ldlm_add_bl_work_item(lock, new, work_list); + else + ldlm_add_cp_work_item(lock, work_list); + EXIT; +} + +/** + * Add specified reader/writer reference to LDLM lock with handle \a lockh. + * r/w reference type is determined by \a mode + * Calls ldlm_lock_addref_internal. + */ +void ldlm_lock_addref(const struct lustre_handle *lockh, enum ldlm_mode mode) +{ + struct ldlm_lock *lock; + + lock = ldlm_handle2lock(lockh); + LASSERTF(lock != NULL, "Non-existing lock: %#llx\n", lockh->cookie); + ldlm_lock_addref_internal(lock, mode); + LDLM_LOCK_PUT(lock); +} +EXPORT_SYMBOL(ldlm_lock_addref); + +/** + * Helper function. + * Add specified reader/writer reference to LDLM lock \a lock. + * r/w reference type is determined by \a mode + * Removes lock from LRU if it is there. + * Assumes the LDLM lock is already locked. 
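 * NL, CR and PR modes take a reader reference; EX, CW, PW, GROUP and COS + * take a writer reference (see the mode masks below). 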
+ */ +void ldlm_lock_addref_internal_nolock(struct ldlm_lock *lock, + enum ldlm_mode mode) +{ + ldlm_lock_remove_from_lru(lock); + if (mode & (LCK_NL | LCK_CR | LCK_PR)) { + lock->l_readers++; + lu_ref_add_atomic(&lock->l_reference, "reader", lock); + } + if (mode & (LCK_EX | LCK_CW | LCK_PW | LCK_GROUP | LCK_COS)) { + lock->l_writers++; + lu_ref_add_atomic(&lock->l_reference, "writer", lock); + } + LDLM_LOCK_GET(lock); + lu_ref_add_atomic(&lock->l_reference, "user", lock); + LDLM_DEBUG(lock, "ldlm_lock_addref(%s)", ldlm_lockname[mode]); +} + +/** + * Attempts to add reader/writer reference to a lock with handle \a lockh, and + * fails if lock is already LDLM_FL_CBPENDING or destroyed. + * + * \retval 0 success, lock was addref-ed + * + * \retval -EAGAIN lock is being canceled. + */ +int ldlm_lock_addref_try(const struct lustre_handle *lockh, enum ldlm_mode mode) +{ + struct ldlm_lock *lock; + int result; + + result = -EAGAIN; + lock = ldlm_handle2lock(lockh); + if (lock != NULL) { + lock_res_and_lock(lock); + if (lock->l_readers != 0 || lock->l_writers != 0 || + !ldlm_is_cbpending(lock)) { + ldlm_lock_addref_internal_nolock(lock, mode); + result = 0; + } + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + } + return result; +} +EXPORT_SYMBOL(ldlm_lock_addref_try); + +/** + * Add specified reader/writer reference to LDLM lock \a lock. + * Locks LDLM lock and calls ldlm_lock_addref_internal_nolock to do the work. + * Only called for local locks. + */ +void ldlm_lock_addref_internal(struct ldlm_lock *lock, enum ldlm_mode mode) +{ + lock_res_and_lock(lock); + ldlm_lock_addref_internal_nolock(lock, mode); + unlock_res_and_lock(lock); +} + +/** + * Removes reader/writer reference for LDLM lock \a lock. + * Assumes LDLM lock is already locked. + * Only called in ldlm_flock_destroy and for local locks. + * Does NOT add lock to LRU if no r/w references are left, to accommodate + * flock locks that cannot be placed in LRU. + */ +void ldlm_lock_decref_internal_nolock(struct ldlm_lock *lock, + enum ldlm_mode mode) +{ + LDLM_DEBUG(lock, "ldlm_lock_decref(%s)", ldlm_lockname[mode]); + if (mode & (LCK_NL | LCK_CR | LCK_PR)) { + LASSERT(lock->l_readers > 0); + lu_ref_del(&lock->l_reference, "reader", lock); + lock->l_readers--; + } + if (mode & (LCK_EX | LCK_CW | LCK_PW | LCK_GROUP | LCK_COS)) { + LASSERT(lock->l_writers > 0); + lu_ref_del(&lock->l_reference, "writer", lock); + lock->l_writers--; + } + + lu_ref_del(&lock->l_reference, "user", lock); + LDLM_LOCK_RELEASE(lock); /* matches the LDLM_LOCK_GET() in addref */ +} + +/** + * Removes reader/writer reference for LDLM lock \a lock. + * Locks LDLM lock first. + * If this is a client lock, the r/w refcount drops to zero and the lock + * is not blocked, the lock is added to the LRU list of the namespace. + * For blocked LDLM locks, if the r/w count drops to zero, the blocking AST + * is called. + */ +void ldlm_lock_decref_internal(struct ldlm_lock *lock, enum ldlm_mode mode) +{ + struct ldlm_namespace *ns; + + ENTRY; + + lock_res_and_lock(lock); + + ns = ldlm_lock_to_ns(lock); + + ldlm_lock_decref_internal_nolock(lock, mode); + + if ((ldlm_is_local(lock) || lock->l_req_mode == LCK_GROUP) && + !lock->l_readers && !lock->l_writers) { + /* If this is a local lock on a server namespace and this was + * the last reference, cancel the lock. + * + * Group locks are special: + * They must not go in LRU, but they are not called back + * like non-group locks, instead they are manually released. 
+ * They have an l_writers reference which they keep until + * they are manually released, so we remove them when they have + * no more reader or writer references. - LU-6368 */ + ldlm_set_cbpending(lock); + } + + if (!lock->l_readers && !lock->l_writers && ldlm_is_cbpending(lock)) { + unsigned int mask = D_DLMTRACE; + + /* If we received a blocked AST and this was the last reference, + * run the callback. */ + if (ldlm_is_ns_srv(lock) && lock->l_export) + mask |= D_WARNING; + LDLM_DEBUG_LIMIT(mask, lock, + "final decref done on %sCBPENDING lock", + mask & D_WARNING ? "non-local " : ""); + + LDLM_LOCK_GET(lock); /* dropped by bl thread */ + ldlm_lock_remove_from_lru(lock); + unlock_res_and_lock(lock); + + if (ldlm_is_fail_loc(lock)) + OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE); + + if (ldlm_is_atomic_cb(lock) || + ldlm_bl_to_thread_lock(ns, NULL, lock) != 0) + ldlm_handle_bl_callback(ns, NULL, lock); + } else if (ns_is_client(ns) && + !lock->l_readers && !lock->l_writers && + !ldlm_is_no_lru(lock) && + !ldlm_is_bl_ast(lock) && + !ldlm_is_converting(lock)) { + + /* If this is a client-side namespace and this was the last + * reference, put it on the LRU. + */ + ldlm_lock_add_to_lru(lock); + unlock_res_and_lock(lock); + LDLM_DEBUG(lock, "add lock into lru list"); + + if (ldlm_is_fail_loc(lock)) + OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE); + + ldlm_pool_recalc(&ns->ns_pool, true); + } else { + LDLM_DEBUG(lock, "do not add lock into lru list"); + unlock_res_and_lock(lock); + } + + EXIT; +} + +/** + * Decrease reader/writer refcount for LDLM lock with handle \a lockh + */ +void ldlm_lock_decref(const struct lustre_handle *lockh, enum ldlm_mode mode) +{ + struct ldlm_lock *lock = __ldlm_handle2lock(lockh, 0); + LASSERTF(lock != NULL, "Non-existing lock: %#llx\n", lockh->cookie); + ldlm_lock_decref_internal(lock, mode); + LDLM_LOCK_PUT(lock); +} +EXPORT_SYMBOL(ldlm_lock_decref); + +/** + * Decrease reader/writer refcount for LDLM lock with handle + * \a lockh and mark it for subsequent cancellation once r/w refcount + * drops to zero instead of putting into LRU. + * + */ +void ldlm_lock_decref_and_cancel(const struct lustre_handle *lockh, + enum ldlm_mode mode) +{ + struct ldlm_lock *lock = __ldlm_handle2lock(lockh, 0); + ENTRY; + + LASSERT(lock != NULL); + + LDLM_DEBUG(lock, "ldlm_lock_decref(%s)", ldlm_lockname[mode]); + lock_res_and_lock(lock); + ldlm_set_cbpending(lock); + unlock_res_and_lock(lock); + ldlm_lock_decref_internal(lock, mode); + LDLM_LOCK_PUT(lock); +} +EXPORT_SYMBOL(ldlm_lock_decref_and_cancel); + +struct sl_insert_point { + struct list_head *res_link; + struct list_head *mode_link; + struct list_head *policy_link; +}; + +/** + * Finds a position to insert the new lock into granted lock list. + * + * Used for locks eligible for skiplist optimization. 
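 * The granted queue is kept as groups of locks of the same mode, and + * within a mode group, IBITS locks with the same policy bits form policy + * groups; the l_sl_mode and l_sl_policy skiplist links let the search + * jump over a whole group at a time. 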
+ * + * Parameters: + * queue [input]: the granted list where search acts on; + * req [input]: the lock whose position to be located; + * prev [output]: positions within 3 lists to insert @req to + * Return Value: + * filled @prev + * NOTE: called by + * - ldlm_grant_lock_with_skiplist + */ +static void search_granted_lock(struct list_head *queue, + struct ldlm_lock *req, + struct sl_insert_point *prev) +{ + struct list_head *tmp; + struct ldlm_lock *lock, *mode_end, *policy_end; + ENTRY; + + list_for_each(tmp, queue) { + lock = list_entry(tmp, struct ldlm_lock, l_res_link); + + mode_end = list_entry(lock->l_sl_mode.prev, + struct ldlm_lock, l_sl_mode); + + if (lock->l_req_mode != req->l_req_mode) { + /* jump to last lock of mode group */ + tmp = &mode_end->l_res_link; + continue; + } + + /* suitable mode group is found */ + if (lock->l_resource->lr_type == LDLM_PLAIN) { + /* insert point is last lock of the mode group */ + prev->res_link = &mode_end->l_res_link; + prev->mode_link = &mode_end->l_sl_mode; + prev->policy_link = &req->l_sl_policy; + EXIT; + return; + } else if (lock->l_resource->lr_type == LDLM_IBITS) { + for (;;) { + policy_end = + list_entry(lock->l_sl_policy.prev, + struct ldlm_lock, + l_sl_policy); + + if (lock->l_policy_data.l_inodebits.bits == + req->l_policy_data.l_inodebits.bits) { + /* insert point is last lock of + * the policy group */ + prev->res_link = + &policy_end->l_res_link; + prev->mode_link = + &policy_end->l_sl_mode; + prev->policy_link = + &policy_end->l_sl_policy; + EXIT; + return; + } + + if (policy_end == mode_end) + /* done with mode group */ + break; + + /* go to next policy group within mode group */ + tmp = policy_end->l_res_link.next; + lock = list_entry(tmp, struct ldlm_lock, + l_res_link); + } /* loop over policy groups within the mode group */ + + /* insert point is last lock of the mode group, + * new policy group is started */ + prev->res_link = &mode_end->l_res_link; + prev->mode_link = &mode_end->l_sl_mode; + prev->policy_link = &req->l_sl_policy; + EXIT; + return; + } else { + LDLM_ERROR(lock,"is not LDLM_PLAIN or LDLM_IBITS lock"); + LBUG(); + } + } + + /* insert point is last lock on the queue, + * new mode group and new policy group are started */ + prev->res_link = queue->prev; + prev->mode_link = &req->l_sl_mode; + prev->policy_link = &req->l_sl_policy; + EXIT; +} + +/** + * Add a lock into resource granted list after a position described by + * \a prev. + */ +static void ldlm_granted_list_add_lock(struct ldlm_lock *lock, + struct sl_insert_point *prev) +{ + struct ldlm_resource *res = lock->l_resource; + ENTRY; + + check_res_locked(res); + + ldlm_resource_dump(D_INFO, res); + LDLM_DEBUG(lock, "About to add lock:"); + + if (ldlm_is_destroyed(lock)) { + CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n"); + return; + } + + LASSERT(list_empty(&lock->l_res_link)); + LASSERT(list_empty(&lock->l_sl_mode)); + LASSERT(list_empty(&lock->l_sl_policy)); + + /* + * lock->link == prev->link means lock is first starting the group. + * Don't re-add to itself to suppress kernel warnings. + */ + if (&lock->l_res_link != prev->res_link) + list_add(&lock->l_res_link, prev->res_link); + if (&lock->l_sl_mode != prev->mode_link) + list_add(&lock->l_sl_mode, prev->mode_link); + if (&lock->l_sl_policy != prev->policy_link) + list_add(&lock->l_sl_policy, prev->policy_link); + + EXIT; +} + +/** + * Add a lock to granted list on a resource maintaining skiplist + * correctness. 
+ */ +void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock) +{ + struct sl_insert_point prev; + + LASSERT(ldlm_is_granted(lock)); + + search_granted_lock(&lock->l_resource->lr_granted, lock, &prev); + ldlm_granted_list_add_lock(lock, &prev); +} + +/** + * Perform lock granting bookkeeping. + * + * Includes putting the lock into granted list and updating lock mode. + * NOTE: called by + * - ldlm_lock_enqueue + * - ldlm_reprocess_queue + * + * must be called with lr_lock held + */ +void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list) +{ + struct ldlm_resource *res = lock->l_resource; + ENTRY; + + check_res_locked(res); + + lock->l_granted_mode = lock->l_req_mode; + + if (work_list && lock->l_completion_ast != NULL) + ldlm_add_ast_work_item(lock, NULL, work_list); + + if (res->lr_type == LDLM_PLAIN || res->lr_type == LDLM_IBITS) + ldlm_grant_lock_with_skiplist(lock); + else if (res->lr_type == LDLM_EXTENT) + ldlm_extent_add_lock(res, lock); + else if (res->lr_type == LDLM_FLOCK) { + /* We should not add locks to granted list in the following + * cases: + * - this is an UNLOCK but not a real lock; + * - this is a TEST lock; + * - this is a F_CANCELLK lock (async flock has req_mode == 0) + * - this is a deadlock (flock cannot be granted) */ + if (lock->l_req_mode == 0 || + lock->l_req_mode == LCK_NL || + ldlm_is_test_lock(lock) || + ldlm_is_flock_deadlock(lock)) + RETURN_EXIT; + ldlm_resource_add_lock(res, &res->lr_granted, lock); + } else { + LBUG(); + } + + ldlm_pool_add(&ldlm_res_to_ns(res)->ns_pool, lock); + EXIT; +} + +/** + * Check if the given @lock meets the criteria for a match. + * A reference on the lock is taken if matched. + * + * @lock test-against this lock + * @data parameters + * + * RETURN returns true if @lock matches @data, false otherwise + */ +static bool lock_matches(struct ldlm_lock *lock, struct ldlm_match_data *data) +{ + union ldlm_policy_data *lpol = &lock->l_policy_data; + enum ldlm_mode match = LCK_MINMODE; + + if (lock == data->lmd_old) + return true; + + /* Check if this lock can be matched. + * Used by LU-2919(exclusive open) for open lease lock */ + if (ldlm_is_excl(lock)) + return false; + + /* llite sometimes wants to match locks that will be + * canceled when their users drop, but we allow it to match + * if it passes in CBPENDING and the lock still has users. + * this is generally only going to be used by children + * whose parents already hold a lock so forward progress + * can still happen. */ + if (ldlm_is_cbpending(lock) && + !(data->lmd_flags & LDLM_FL_CBPENDING)) + return false; + + if (!(data->lmd_match & LDLM_MATCH_UNREF) && ldlm_is_cbpending(lock) && + lock->l_readers == 0 && lock->l_writers == 0) + return false; + + if (!(lock->l_req_mode & *data->lmd_mode)) + return false; + + /* When we search for ast_data, we are not doing a traditional match, + * so we don't worry about IBITS or extent matching. 
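+	 * (As the checks below show: LDLM_MATCH_AST requires l_ast_data
+	 * to be set before the usual mode/policy checks are applied,
+	 * while LDLM_MATCH_AST_ANY accepts any lock with l_ast_data set
+	 * and skips those checks entirely.)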
+ */ + if (data->lmd_match & (LDLM_MATCH_AST | LDLM_MATCH_AST_ANY)) { + if (!lock->l_ast_data) + return false; + + if (data->lmd_match & LDLM_MATCH_AST_ANY) + goto matched; + } + + match = lock->l_req_mode; + + switch (lock->l_resource->lr_type) { + case LDLM_EXTENT: + if (!(data->lmd_match & LDLM_MATCH_RIGHT) && + (lpol->l_extent.start > data->lmd_policy->l_extent.start || + lpol->l_extent.end < data->lmd_policy->l_extent.end)) + return false; + + if (unlikely(match == LCK_GROUP) && + data->lmd_policy->l_extent.gid != LDLM_GID_ANY && + lpol->l_extent.gid != data->lmd_policy->l_extent.gid) + return false; + break; + case LDLM_IBITS: + /* We match if we have existing lock with same or wider set + of bits. */ + if ((lpol->l_inodebits.bits & + data->lmd_policy->l_inodebits.bits) != + data->lmd_policy->l_inodebits.bits) + return false; + + if (unlikely(match == LCK_GROUP) && + data->lmd_policy->l_inodebits.li_gid != LDLM_GID_ANY && + lpol->l_inodebits.li_gid != + data->lmd_policy->l_inodebits.li_gid) + return false; + break; + default: + ; + } + + /* We match if we have existing lock with same or wider set + of bits. */ + if (!(data->lmd_match & LDLM_MATCH_UNREF) && LDLM_HAVE_MASK(lock, GONE)) + return false; + + if (!equi(data->lmd_flags & LDLM_FL_LOCAL_ONLY, ldlm_is_local(lock))) + return false; + + /* Filter locks by skipping flags */ + if (data->lmd_skip_flags & lock->l_flags) + return false; + +matched: + if (data->lmd_flags & LDLM_FL_TEST_LOCK) { + LDLM_LOCK_GET(lock); + ldlm_lock_touch_in_lru(lock); + } else { + ldlm_lock_addref_internal_nolock(lock, match); + } + + *data->lmd_mode = match; + data->lmd_lock = lock; + + return true; +} + +static unsigned int itree_overlap_cb(struct interval_node *in, void *args) +{ + struct ldlm_interval *node = to_ldlm_interval(in); + struct ldlm_match_data *data = args; + struct ldlm_lock *lock; + + list_for_each_entry(lock, &node->li_group, l_sl_policy) { + if (lock_matches(lock, data)) + return INTERVAL_ITER_STOP; + } + return INTERVAL_ITER_CONT; +} + +/** + * Search for a lock with given parameters in interval trees. + * + * \param res search for a lock in this resource + * \param data parameters + * + * \retval a referenced lock or NULL. + */ +struct ldlm_lock *search_itree(struct ldlm_resource *res, + struct ldlm_match_data *data) +{ + struct interval_node_extent ext = { + .start = data->lmd_policy->l_extent.start, + .end = data->lmd_policy->l_extent.end + }; + int idx; + + data->lmd_lock = NULL; + + if (data->lmd_match & LDLM_MATCH_RIGHT) + ext.end = OBD_OBJECT_EOF; + + for (idx = 0; idx < LCK_MODE_NUM; idx++) { + struct ldlm_interval_tree *tree = &res->lr_itree[idx]; + + if (tree->lit_root == NULL) + continue; + + if (!(tree->lit_mode & *data->lmd_mode)) + continue; + + interval_search(tree->lit_root, &ext, + itree_overlap_cb, data); + if (data->lmd_lock) + return data->lmd_lock; + } + + return NULL; +} +EXPORT_SYMBOL(search_itree); + + +/** + * Search for a lock with given properties in a queue. + * + * \param queue search for a lock in this queue + * \param data parameters + * + * \retval a referenced lock or NULL. 
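+ *
+ * Illustrative caller pattern (a sketch mirroring
+ * ldlm_lock_match_with_skip() further below; assumes the resource is
+ * already locked):
+ *
+ *	lock = search_queue(&res->lr_granted, &data);
+ *	if (!lock && !(flags & LDLM_FL_BLOCK_GRANTED))
+ *		lock = search_queue(&res->lr_waiting, &data);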
+ */
+static struct ldlm_lock *search_queue(struct list_head *queue,
+				      struct ldlm_match_data *data)
+{
+	struct ldlm_lock *lock;
+
+	data->lmd_lock = NULL;
+
+	list_for_each_entry(lock, queue, l_res_link)
+		if (lock_matches(lock, data))
+			return data->lmd_lock;
+
+	return NULL;
+}
+
+void ldlm_lock_fail_match_locked(struct ldlm_lock *lock)
+{
+	if ((lock->l_flags & LDLM_FL_FAIL_NOTIFIED) == 0) {
+		lock->l_flags |= LDLM_FL_FAIL_NOTIFIED;
+		wake_up(&lock->l_waitq);
+	}
+}
+EXPORT_SYMBOL(ldlm_lock_fail_match_locked);
+
+void ldlm_lock_fail_match(struct ldlm_lock *lock)
+{
+	lock_res_and_lock(lock);
+	ldlm_lock_fail_match_locked(lock);
+	unlock_res_and_lock(lock);
+}
+
+/**
+ * Mark lock as "matchable" by OST.
+ *
+ * Used to prevent certain races in LOV/OSC where the lock is granted, but LVB
+ * is not yet valid.
+ * Assumes LDLM lock is already locked.
+ */
+void ldlm_lock_allow_match_locked(struct ldlm_lock *lock)
+{
+	ldlm_set_lvb_ready(lock);
+	wake_up(&lock->l_waitq);
+}
+EXPORT_SYMBOL(ldlm_lock_allow_match_locked);
+
+/**
+ * Mark lock as "matchable" by OST.
+ * Locks the lock and then \see ldlm_lock_allow_match_locked
+ */
+void ldlm_lock_allow_match(struct ldlm_lock *lock)
+{
+	lock_res_and_lock(lock);
+	ldlm_lock_allow_match_locked(lock);
+	unlock_res_and_lock(lock);
+}
+EXPORT_SYMBOL(ldlm_lock_allow_match);
+
+/**
+ * Attempt to find a lock with specified properties.
+ *
+ * Typically returns a reference to the matched lock unless LDLM_FL_TEST_LOCK
+ * is set in \a flags
+ *
+ * Can be called in two ways:
+ *
+ * If 'ns' is NULL, then lockh describes an existing lock that we want to look
+ * for a duplicate of.
+ *
+ * Otherwise, all of the fields must be filled in, to match against.
+ *
+ * If 'flags' contains LDLM_FL_LOCAL_ONLY, then only match local locks on the
+ * server (i.e., connh is NULL)
+ * If 'flags' contains LDLM_FL_BLOCK_GRANTED, then only locks on the granted
+ * list will be considered
+ * If 'flags' contains LDLM_FL_CBPENDING, then locks that have been marked
+ * to be canceled can still be matched as long as they still have reader
+ * or writer references
+ * If 'flags' contains LDLM_FL_TEST_LOCK, then don't actually reference a lock,
+ * just tell us if we would have matched.
+ *
+ * \retval 1 if it finds an already-existing lock that is compatible; in this
+ * case, lockh is filled in with an addref()ed lock
+ *
+ * We also check security context, and if that fails we simply return 0 (to
+ * keep caller code unchanged), the context failure will be discovered by
+ * the caller sometime later.
+ */ +enum ldlm_mode ldlm_lock_match_with_skip(struct ldlm_namespace *ns, + __u64 flags, __u64 skip_flags, + const struct ldlm_res_id *res_id, + enum ldlm_type type, + union ldlm_policy_data *policy, + enum ldlm_mode mode, + struct lustre_handle *lockh, + enum ldlm_match_flags match_flags) +{ + struct ldlm_match_data data = { + .lmd_old = NULL, + .lmd_lock = NULL, + .lmd_mode = &mode, + .lmd_policy = policy, + .lmd_flags = flags, + .lmd_skip_flags = skip_flags, + .lmd_match = match_flags, + }; + struct ldlm_resource *res; + struct ldlm_lock *lock; + int matched; + + ENTRY; + + if (ns == NULL) { + data.lmd_old = ldlm_handle2lock(lockh); + LASSERT(data.lmd_old != NULL); + + ns = ldlm_lock_to_ns(data.lmd_old); + res_id = &data.lmd_old->l_resource->lr_name; + type = data.lmd_old->l_resource->lr_type; + *data.lmd_mode = data.lmd_old->l_req_mode; + } + + res = ldlm_resource_get(ns, NULL, res_id, type, 0); + if (IS_ERR(res)) { + LASSERT(data.lmd_old == NULL); + RETURN(0); + } + + LDLM_RESOURCE_ADDREF(res); + lock_res(res); + if (res->lr_type == LDLM_EXTENT) + lock = search_itree(res, &data); + else + lock = search_queue(&res->lr_granted, &data); + if (!lock && !(flags & LDLM_FL_BLOCK_GRANTED)) + lock = search_queue(&res->lr_waiting, &data); + matched = lock ? mode : 0; + unlock_res(res); + LDLM_RESOURCE_DELREF(res); + ldlm_resource_putref(res); + + if (lock) { + ldlm_lock2handle(lock, lockh); + if ((flags & LDLM_FL_LVB_READY) && + (!ldlm_is_lvb_ready(lock))) { + __u64 wait_flags = LDLM_FL_LVB_READY | + LDLM_FL_DESTROYED | LDLM_FL_FAIL_NOTIFIED; + + if (lock->l_completion_ast) { + int err = lock->l_completion_ast(lock, + LDLM_FL_WAIT_NOREPROC, + NULL); + if (err) + GOTO(out_fail_match, matched = 0); + } + + wait_event_idle_timeout( + lock->l_waitq, + lock->l_flags & wait_flags, + cfs_time_seconds(obd_timeout)); + + if (!ldlm_is_lvb_ready(lock)) + GOTO(out_fail_match, matched = 0); + } + + /* check user's security context */ + if (lock->l_conn_export && + sptlrpc_import_check_ctx( + class_exp2cliimp(lock->l_conn_export))) + GOTO(out_fail_match, matched = 0); + + LDLM_DEBUG(lock, "matched (%llu %llu)", + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[2] : policy->l_extent.start, + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[3] : policy->l_extent.end); + +out_fail_match: + if (flags & LDLM_FL_TEST_LOCK) + LDLM_LOCK_RELEASE(lock); + else if (!matched) + ldlm_lock_decref_internal(lock, mode); + } + + /* less verbose for test-only */ + if (!matched && !(flags & LDLM_FL_TEST_LOCK)) { + LDLM_DEBUG_NOLOCK("not matched ns %p type %u mode %u res " + "%llu/%llu (%llu %llu)", ns, + type, mode, res_id->name[0], res_id->name[1], + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[2] : policy->l_extent.start, + (type == LDLM_PLAIN || type == LDLM_IBITS) ? 
+ res_id->name[3] : policy->l_extent.end); + } + if (data.lmd_old != NULL) + LDLM_LOCK_PUT(data.lmd_old); + + return matched; +} +EXPORT_SYMBOL(ldlm_lock_match_with_skip); + +enum ldlm_mode ldlm_revalidate_lock_handle(const struct lustre_handle *lockh, + __u64 *bits) +{ + struct ldlm_lock *lock; + enum ldlm_mode mode = 0; + ENTRY; + + lock = ldlm_handle2lock(lockh); + if (lock != NULL) { + lock_res_and_lock(lock); + if (LDLM_HAVE_MASK(lock, GONE)) + GOTO(out, mode); + + if (ldlm_is_cbpending(lock) && + lock->l_readers == 0 && lock->l_writers == 0) + GOTO(out, mode); + + if (bits) + *bits = lock->l_policy_data.l_inodebits.bits; + mode = lock->l_granted_mode; + ldlm_lock_addref_internal_nolock(lock, mode); + } + + EXIT; + +out: + if (lock != NULL) { + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + } + return mode; +} +EXPORT_SYMBOL(ldlm_revalidate_lock_handle); + +/** The caller must guarantee that the buffer is large enough. */ +int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill, + enum req_location loc, void *data, int size) +{ + void *lvb; + ENTRY; + + LASSERT(data != NULL); + LASSERT(size >= 0); + + switch (lock->l_lvb_type) { + case LVB_T_OST: + if (size == sizeof(struct ost_lvb)) { + if (loc == RCL_CLIENT) + lvb = req_capsule_client_swab_get(pill, + &RMF_DLM_LVB, + lustre_swab_ost_lvb); + else + lvb = req_capsule_server_swab_get(pill, + &RMF_DLM_LVB, + lustre_swab_ost_lvb); + if (unlikely(lvb == NULL)) { + LDLM_ERROR(lock, "no LVB"); + RETURN(-EPROTO); + } + + memcpy(data, lvb, size); + } else if (size == sizeof(struct ost_lvb_v1)) { + struct ost_lvb *olvb = data; + + if (loc == RCL_CLIENT) + lvb = req_capsule_client_swab_get(pill, + &RMF_DLM_LVB, + lustre_swab_ost_lvb_v1); + else + lvb = req_capsule_server_sized_swab_get(pill, + &RMF_DLM_LVB, size, + lustre_swab_ost_lvb_v1); + if (unlikely(lvb == NULL)) { + LDLM_ERROR(lock, "no LVB"); + RETURN(-EPROTO); + } + + memcpy(data, lvb, size); + olvb->lvb_mtime_ns = 0; + olvb->lvb_atime_ns = 0; + olvb->lvb_ctime_ns = 0; + } else { + LDLM_ERROR(lock, "Replied unexpected ost LVB size %d", + size); + RETURN(-EINVAL); + } + break; + case LVB_T_LQUOTA: + if (size == sizeof(struct lquota_lvb)) { + if (loc == RCL_CLIENT) + lvb = req_capsule_client_swab_get(pill, + &RMF_DLM_LVB, + lustre_swab_lquota_lvb); + else + lvb = req_capsule_server_swab_get(pill, + &RMF_DLM_LVB, + lustre_swab_lquota_lvb); + if (unlikely(lvb == NULL)) { + LDLM_ERROR(lock, "no LVB"); + RETURN(-EPROTO); + } + + memcpy(data, lvb, size); + } else { + LDLM_ERROR(lock, "Replied unexpected lquota LVB size %d", + size); + RETURN(-EINVAL); + } + break; + case LVB_T_LAYOUT: + if (size == 0) + break; + + if (loc == RCL_CLIENT) + lvb = req_capsule_client_get(pill, &RMF_DLM_LVB); + else + lvb = req_capsule_server_get(pill, &RMF_DLM_LVB); + if (unlikely(lvb == NULL)) { + LDLM_ERROR(lock, "no LVB"); + RETURN(-EPROTO); + } + + memcpy(data, lvb, size); + break; + default: + LDLM_ERROR(lock, "Unknown LVB type: %d", lock->l_lvb_type); + libcfs_debug_dumpstack(NULL); + RETURN(-EINVAL); + } + + RETURN(0); +} + +/** + * Create and fill in new LDLM lock with specified properties. 
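+ *
+ * An illustrative call (a sketch based only on the signature below;
+ * the argument values are hypothetical):
+ *
+ *	lock = ldlm_lock_create(ns, &res_id, LDLM_PLAIN, LCK_EX,
+ *				&cbs, NULL, 0, LVB_T_NONE);
+ *	if (IS_ERR(lock))
+ *		return PTR_ERR(lock);
+ *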
+ * Returns a referenced lock + */ +struct ldlm_lock *ldlm_lock_create(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + enum ldlm_type type, + enum ldlm_mode mode, + const struct ldlm_callback_suite *cbs, + void *data, __u32 lvb_len, + enum lvb_type lvb_type) +{ + struct ldlm_lock *lock; + struct ldlm_resource *res; + int rc; + ENTRY; + + res = ldlm_resource_get(ns, NULL, res_id, type, 1); + if (IS_ERR(res)) + RETURN(ERR_CAST(res)); + + lock = ldlm_lock_new(res); + if (!lock) { + ldlm_resource_putref(res); + RETURN(ERR_PTR(-ENOMEM)); + } + + lock->l_req_mode = mode; + lock->l_ast_data = data; + lock->l_pid = current->pid; + if (ns_is_server(ns)) + ldlm_set_ns_srv(lock); + if (cbs) { + lock->l_blocking_ast = cbs->lcs_blocking; + lock->l_completion_ast = cbs->lcs_completion; + lock->l_glimpse_ast = cbs->lcs_glimpse; + } + + switch (type) { + case LDLM_EXTENT: + rc = ldlm_extent_alloc_lock(lock); + break; + case LDLM_IBITS: + rc = ldlm_inodebits_alloc_lock(lock); + break; + default: + rc = 0; + } + if (rc) + GOTO(out, rc); + + if (lvb_len) { + lock->l_lvb_len = lvb_len; + OBD_ALLOC_LARGE(lock->l_lvb_data, lvb_len); + if (lock->l_lvb_data == NULL) + GOTO(out, rc = -ENOMEM); + } + + lock->l_lvb_type = lvb_type; + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_NEW_LOCK)) + GOTO(out, rc = -ENOENT); + + RETURN(lock); + +out: + ldlm_lock_destroy(lock); + LDLM_LOCK_RELEASE(lock); + RETURN(ERR_PTR(rc)); +} + +#ifdef HAVE_SERVER_SUPPORT +static enum ldlm_error ldlm_lock_enqueue_helper(struct ldlm_lock *lock, + __u64 *flags) +{ + struct ldlm_resource *res = lock->l_resource; + enum ldlm_error rc = ELDLM_OK; + LIST_HEAD(rpc_list); + ldlm_processing_policy policy; + + ENTRY; + + policy = ldlm_get_processing_policy(res); + policy(lock, flags, LDLM_PROCESS_ENQUEUE, &rc, &rpc_list); + if (rc == ELDLM_OK && lock->l_granted_mode != lock->l_req_mode && + res->lr_type != LDLM_FLOCK) + rc = ldlm_handle_conflict_lock(lock, flags, &rpc_list); + + if (!list_empty(&rpc_list)) + ldlm_discard_bl_list(&rpc_list); + + RETURN(rc); +} +#endif + +/** + * Enqueue (request) a lock. + * + * Does not block. As a result of enqueue the lock would be put + * into granted or waiting list. + * + * If namespace has intent policy sent and the lock has LDLM_FL_HAS_INTENT flag + * set, skip all the enqueueing and delegate lock processing to intent policy + * function. + */ +enum ldlm_error ldlm_lock_enqueue(const struct lu_env *env, + struct ldlm_namespace *ns, + struct ldlm_lock **lockp, + void *cookie, __u64 *flags) +{ + struct ldlm_lock *lock = *lockp; + struct ldlm_resource *res; + int local = ns_is_client(ns); + enum ldlm_error rc = ELDLM_OK; + struct ldlm_interval *node = NULL; +#ifdef HAVE_SERVER_SUPPORT + bool reconstruct = false; +#endif + ENTRY; + + /* policies are not executed on the client or during replay */ + if ((*flags & (LDLM_FL_HAS_INTENT|LDLM_FL_REPLAY)) == LDLM_FL_HAS_INTENT + && !local && ns->ns_policy) { + rc = ns->ns_policy(env, ns, lockp, cookie, lock->l_req_mode, + *flags, NULL); + if (rc == ELDLM_LOCK_REPLACED) { + /* The lock that was returned has already been granted, + * and placed into lockp. If it's not the same as the + * one we passed in, then destroy the old one and our + * work here is done. 
*/ + if (lock != *lockp) { + ldlm_lock_destroy(lock); + LDLM_LOCK_RELEASE(lock); + } + *flags |= LDLM_FL_LOCK_CHANGED; + RETURN(0); + } else if (rc != ELDLM_OK && + ldlm_is_granted(lock)) { + LASSERT(*flags & LDLM_FL_RESENT); + /* It may happen that ns_policy returns an error in + * resend case, object may be unlinked or just some + * error occurs. It is unclear if lock reached the + * client in the original reply, just leave the lock on + * server, not returning it again to client. Due to + * LU-6529, the server will not OOM. */ + RETURN(rc); + } else if (rc != ELDLM_OK || + (rc == ELDLM_OK && (*flags & LDLM_FL_INTENT_ONLY))) { + ldlm_lock_destroy(lock); + RETURN(rc); + } + } + + if (*flags & LDLM_FL_RESENT) { + /* Reconstruct LDLM_FL_SRV_ENQ_MASK @flags for reply. + * Set LOCK_CHANGED always. + * Check if the lock is granted for BLOCK_GRANTED. + * Take NO_TIMEOUT from the lock as it is inherited through + * LDLM_FL_INHERIT_MASK */ + *flags |= LDLM_FL_LOCK_CHANGED; + if (!ldlm_is_granted(lock)) + *flags |= LDLM_FL_BLOCK_GRANTED; + *flags |= lock->l_flags & LDLM_FL_NO_TIMEOUT; + RETURN(ELDLM_OK); + } + +#ifdef HAVE_SERVER_SUPPORT + /* For a replaying lock, it might be already in granted list. So + * unlinking the lock will cause the interval node to be freed, we + * have to allocate the interval node early otherwise we can't regrant + * this lock in the future. - jay + * + * The only time the ldlm_resource changes for the ldlm_lock is when + * ldlm_lock_change_resource() is called and that only happens for + * the Lustre client case. + */ + if (!local && (*flags & LDLM_FL_REPLAY) && + lock->l_resource->lr_type == LDLM_EXTENT) + OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, GFP_NOFS); + + reconstruct = !local && lock->l_resource->lr_type == LDLM_FLOCK && + !(*flags & LDLM_FL_TEST_LOCK); + if (reconstruct) { + rc = req_can_reconstruct(cookie, NULL); + if (rc != 0) { + if (rc == 1) + rc = 0; + RETURN(rc); + } + } +#endif + res = lock_res_and_lock(lock); + if (local && ldlm_is_granted(lock)) { + /* The server returned a blocked lock, but it was granted + * before we got a chance to actually enqueue it. We don't + * need to do anything else. */ + *flags &= ~LDLM_FL_BLOCKED_MASK; + GOTO(out, rc = ELDLM_OK); + } + + ldlm_resource_unlink_lock(lock); + if (res->lr_type == LDLM_EXTENT && lock->l_tree_node == NULL) { + if (node == NULL) { + ldlm_lock_destroy_nolock(lock); + GOTO(out, rc = -ENOMEM); + } + + INIT_LIST_HEAD(&node->li_group); + ldlm_interval_attach(node, lock); + node = NULL; + } + + /* Some flags from the enqueue want to make it into the AST, via the + * lock's l_flags. */ + if (*flags & LDLM_FL_AST_DISCARD_DATA) + ldlm_set_ast_discard_data(lock); + if (*flags & LDLM_FL_TEST_LOCK) + ldlm_set_test_lock(lock); + if (*flags & LDLM_FL_COS_INCOMPAT) + ldlm_set_cos_incompat(lock); + if (*flags & LDLM_FL_COS_ENABLED) + ldlm_set_cos_enabled(lock); + + /* This distinction between local lock trees is very important; a client + * namespace only has information about locks taken by that client, and + * thus doesn't have enough information to decide for itself if it can + * be granted (below). In this case, we do exactly what the server + * tells us to do, as dictated by the 'flags'. + * + * We do exactly the same thing during recovery, when the server is + * more or less trusting the clients not to lie. + * + * FIXME (bug 268): Detect obvious lies by checking compatibility in + * granted queue. 
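+	 *
+	 * Summary of the placement done below (restating the code, not
+	 * new behaviour):
+	 *
+	 *	local + BLOCK_WAIT/BLOCK_GRANTED flag  -> lr_waiting
+	 *	local, no blocking flags               -> granted list
+	 *	replay + BLOCK_WAIT                    -> lr_waiting
+	 *	replay + BLOCK_GRANTED                 -> granted list
+	 *	otherwise (server)                     -> ldlm_lock_enqueue_helper()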
*/
+	if (local) {
+		if (*flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED))
+			ldlm_resource_add_lock(res, &res->lr_waiting, lock);
+		else
+			ldlm_grant_lock(lock, NULL);
+		GOTO(out, rc = ELDLM_OK);
+#ifdef HAVE_SERVER_SUPPORT
+	} else if (*flags & LDLM_FL_REPLAY) {
+		if (*flags & LDLM_FL_BLOCK_WAIT) {
+			ldlm_resource_add_lock(res, &res->lr_waiting, lock);
+			GOTO(out, rc = ELDLM_OK);
+		} else if (*flags & LDLM_FL_BLOCK_GRANTED) {
+			ldlm_grant_lock(lock, NULL);
+			GOTO(out, rc = ELDLM_OK);
+		}
+		/* If no flags, fall through to normal enqueue path. */
+	}
+
+	rc = ldlm_lock_enqueue_helper(lock, flags);
+	GOTO(out, rc);
+#else
+	} else {
+		CERROR("This is client-side-only module, cannot handle "
+		       "LDLM_NAMESPACE_SERVER resource type lock.\n");
+		LBUG();
+	}
+#endif
+
+out:
+	unlock_res_and_lock(lock);
+
+#ifdef HAVE_SERVER_SUPPORT
+	if (reconstruct) {
+		struct ptlrpc_request *req = cookie;
+
+		tgt_mk_reply_data(NULL, NULL,
+				  &req->rq_export->exp_target_data,
+				  req, 0, NULL, false, 0);
+	}
+#endif
+	if (node)
+		OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node));
+	return rc;
+}
+
+#ifdef HAVE_SERVER_SUPPORT
+/**
+ * Iterate through all waiting locks on a given resource queue and attempt to
+ * grant them.
+ *
+ * Must be called with resource lock held.
+ */
+int ldlm_reprocess_queue(struct ldlm_resource *res, struct list_head *queue,
+			 struct list_head *work_list,
+			 enum ldlm_process_intention intention, __u64 hint)
+{
+	struct list_head *tmp, *pos;
+	ldlm_processing_policy policy;
+	__u64 flags;
+	int rc = LDLM_ITER_CONTINUE;
+	enum ldlm_error err;
+	LIST_HEAD(bl_ast_list);
+
+	ENTRY;
+
+	check_res_locked(res);
+
+	policy = ldlm_get_processing_policy(res);
+	LASSERT(policy);
+	LASSERT(intention == LDLM_PROCESS_RESCAN ||
+		intention == LDLM_PROCESS_RECOVERY);
+
+restart:
+	list_for_each_safe(tmp, pos, queue) {
+		struct ldlm_lock *pending;
+		LIST_HEAD(rpc_list);
+
+		pending = list_entry(tmp, struct ldlm_lock, l_res_link);
+
+		CDEBUG(D_INFO, "Reprocessing lock %p\n", pending);
+
+		flags = 0;
+		rc = policy(pending, &flags, intention, &err, &rpc_list);
+		if (pending->l_granted_mode == pending->l_req_mode ||
+		    res->lr_type == LDLM_FLOCK) {
+			list_splice(&rpc_list, work_list);
+		} else {
+			list_splice(&rpc_list, &bl_ast_list);
+		}
+		/*
+		 * When this is called from recovery done, we always want
+		 * to scan the whole list no matter what 'rc' is returned.
+		 */
+		if (rc != LDLM_ITER_CONTINUE &&
+		    intention == LDLM_PROCESS_RESCAN)
+			break;
+	}
+
+	if (!list_empty(&bl_ast_list)) {
+		unlock_res(res);
+
+		rc = ldlm_run_ast_work(ldlm_res_to_ns(res), &bl_ast_list,
+				       LDLM_WORK_BL_AST);
+
+		lock_res(res);
+		if (rc == -ERESTART)
+			GOTO(restart, rc);
+	}
+
+	if (!list_empty(&bl_ast_list))
+		ldlm_discard_bl_list(&bl_ast_list);
+
+	RETURN(intention == LDLM_PROCESS_RESCAN ? rc : LDLM_ITER_CONTINUE);
+}
+
+/**
+ * Called when conflicting locks are detected for a lock being enqueued:
+ * add the lock to the waiting list and send blocking ASTs to the
+ * conflicting locks.
+ *
+ * \param[in] lock		The lock to be enqueued.
+ * \param[out] flags		Lock flags for the lock to be enqueued.
+ * \param[in] rpc_list		Conflicting locks list.
+ *
+ * \retval -ERESTART:	Some lock was instantly canceled while sending
+ *			blocking ASTs, caller needs to re-check conflicting
+ *			locks.
+ * \retval -EAGAIN:	Lock was destroyed, caller should return error.
+ * \retval 0:		Lock is successfully added to the waiting list.
+ */
+int ldlm_handle_conflict_lock(struct ldlm_lock *lock, __u64 *flags,
+			      struct list_head *rpc_list)
+{
+	struct ldlm_resource *res = lock->l_resource;
+	int rc;
+	ENTRY;
+
+	check_res_locked(res);
+
+	/* If either of the compat_queue()s returned failure, then we
+	 * have ASTs to send and must go onto the waiting list.
+	 *
+	 * bug 2322: we used to unlink and re-add here, which was a
+	 * terrible folly -- if we goto restart, we could get
+	 * re-ordered!  Causes deadlock, because ASTs aren't sent! */
+	if (list_empty(&lock->l_res_link))
+		ldlm_resource_add_lock(res, &res->lr_waiting, lock);
+	unlock_res(res);
+
+	rc = ldlm_run_ast_work(ldlm_res_to_ns(res), rpc_list,
+			       LDLM_WORK_BL_AST);
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_OST_FAIL_RACE) &&
+	    !ns_is_client(ldlm_res_to_ns(res)))
+		class_fail_export(lock->l_export);
+
+	if (rc == -ERESTART)
+		ldlm_reprocess_all(res, 0);
+
+	lock_res(res);
+	if (rc == -ERESTART) {
+		/* 15715: The lock was granted and destroyed after
+		 * resource lock was dropped. Interval node was freed
+		 * in ldlm_lock_destroy. Anyway, this always happens
+		 * when a client is being evicted. So it would be
+		 * ok to return an error. -jay */
+		if (ldlm_is_destroyed(lock))
+			RETURN(-EAGAIN);
+
+		/* lock was granted while resource was unlocked. */
+		if (ldlm_is_granted(lock)) {
+			/* bug 11300: if the lock has been granted,
+			 * break earlier because otherwise, we will go
+			 * to restart and ldlm_resource_unlink will be
+			 * called and it causes the interval node to be
+			 * freed. Then we will fail at
+			 * ldlm_extent_add_lock() */
+			*flags &= ~LDLM_FL_BLOCKED_MASK;
+		}
+
+	}
+	*flags |= LDLM_FL_BLOCK_GRANTED;
+
+	RETURN(0);
+}
+
+/**
+ * Discard all AST work items from list.
+ *
+ * If for whatever reason we do not want to send ASTs to conflicting locks
+ * anymore, disassemble the list with this function.
+ */
+void ldlm_discard_bl_list(struct list_head *bl_list)
+{
+	struct ldlm_lock *lock, *tmp;
+
+	ENTRY;
+
+	list_for_each_entry_safe(lock, tmp, bl_list, l_bl_ast) {
+		LASSERT(!list_empty(&lock->l_bl_ast));
+		list_del_init(&lock->l_bl_ast);
+		ldlm_clear_ast_sent(lock);
+		LASSERT(lock->l_bl_ast_run == 0);
+		ldlm_clear_blocking_lock(lock);
+		LDLM_LOCK_RELEASE(lock);
+	}
+	EXIT;
+}
+
+/**
+ * Process a call to blocking AST callback for a lock in ast_work list
+ */
+static int
+ldlm_work_bl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq)
+{
+	struct ldlm_cb_set_arg *arg = opaq;
+	struct ldlm_lock *lock;
+	struct ldlm_lock_desc d;
+	struct ldlm_bl_desc bld;
+	int rc;
+
+	ENTRY;
+
+	if (list_empty(arg->list))
+		RETURN(-ENOENT);
+
+	lock = list_entry(arg->list->next, struct ldlm_lock, l_bl_ast);
+
+	/* nobody should touch l_bl_ast but some locks in the list may become
+	 * granted after lock convert or COS downgrade, these locks should be
+	 * just skipped here and removed from the list.
+	 */
+	lock_res_and_lock(lock);
+	list_del_init(&lock->l_bl_ast);
+
+	/* the lock is no longer a blocking lock, but was kept in the list
+	 * because it can be managed only here.
+ */ + if (!ldlm_is_ast_sent(lock)) { + unlock_res_and_lock(lock); + LDLM_LOCK_RELEASE(lock); + RETURN(0); + } + + LASSERT(lock->l_blocking_lock); + ldlm_lock2desc(lock->l_blocking_lock, &d); + /* copy blocking lock ibits in cancel_bits as well, + * new client may use them for lock convert and it is + * important to use new field to convert locks from + * new servers only + */ + d.l_policy_data.l_inodebits.cancel_bits = + lock->l_blocking_lock->l_policy_data.l_inodebits.bits; + + /* Blocking lock is being destroyed here but some information about it + * may be needed inside l_blocking_ast() function below, + * e.g. in mdt_blocking_ast(). So save needed data in bl_desc. + */ + bld.bl_same_client = lock->l_client_cookie == + lock->l_blocking_lock->l_client_cookie; + bld.bl_cos_incompat = ldlm_is_cos_incompat(lock->l_blocking_lock); + arg->bl_desc = &bld; + + LASSERT(ldlm_is_ast_sent(lock)); + LASSERT(lock->l_bl_ast_run == 0); + lock->l_bl_ast_run++; + ldlm_clear_blocking_lock(lock); + unlock_res_and_lock(lock); + + rc = lock->l_blocking_ast(lock, &d, (void *)arg, LDLM_CB_BLOCKING); + + LDLM_LOCK_RELEASE(lock); + + RETURN(rc); +} + +/** + * Process a call to revocation AST callback for a lock in ast_work list + */ +static int +ldlm_work_revoke_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) +{ + struct ldlm_cb_set_arg *arg = opaq; + struct ldlm_lock_desc desc; + int rc; + struct ldlm_lock *lock; + ENTRY; + + if (list_empty(arg->list)) + RETURN(-ENOENT); + + lock = list_entry(arg->list->next, struct ldlm_lock, l_rk_ast); + list_del_init(&lock->l_rk_ast); + + /* the desc just pretend to exclusive */ + ldlm_lock2desc(lock, &desc); + desc.l_req_mode = LCK_EX; + desc.l_granted_mode = 0; + + rc = lock->l_blocking_ast(lock, &desc, (void*)arg, LDLM_CB_BLOCKING); + LDLM_LOCK_RELEASE(lock); + + RETURN(rc); +} + +/** + * Process a call to glimpse AST callback for a lock in ast_work list + */ +int ldlm_work_gl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) +{ + struct ldlm_cb_set_arg *arg = opaq; + struct ldlm_glimpse_work *gl_work; + struct ldlm_lock *lock; + int rc = 0; + ENTRY; + + if (list_empty(arg->list)) + RETURN(-ENOENT); + + gl_work = list_entry(arg->list->next, struct ldlm_glimpse_work, + gl_list); + list_del_init(&gl_work->gl_list); + + lock = gl_work->gl_lock; + + /* transfer the glimpse descriptor to ldlm_cb_set_arg */ + arg->gl_desc = gl_work->gl_desc; + arg->gl_interpret_reply = gl_work->gl_interpret_reply; + arg->gl_interpret_data = gl_work->gl_interpret_data; + + /* invoke the actual glimpse callback */ + rc = lock->l_glimpse_ast(lock, (void *)arg); + if (rc == 0) + rc = 1; /* update LVB if this is server lock */ + else if (rc == -ELDLM_NO_LOCK_DATA) + ldlm_lvbo_update(lock->l_resource, lock, NULL, 1); + + LDLM_LOCK_RELEASE(lock); + if (gl_work->gl_flags & LDLM_GL_WORK_SLAB_ALLOCATED) + OBD_SLAB_FREE_PTR(gl_work, ldlm_glimpse_work_kmem); + else + OBD_FREE_PTR(gl_work); + + RETURN(rc); +} +#endif + +/** + * Process a call to completion AST callback for a lock in ast_work list + */ +static int +ldlm_work_cp_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) +{ + struct ldlm_cb_set_arg *arg = opaq; + struct ldlm_lock *lock; + ldlm_completion_callback completion_callback; + int rc = 0; + + ENTRY; + + if (list_empty(arg->list)) + RETURN(-ENOENT); + + lock = list_entry(arg->list->next, struct ldlm_lock, l_cp_ast); + + /* It's possible to receive a completion AST before we've set + * the l_completion_ast pointer: either because the AST arrived + * before the reply, or simply 
because there's a small race + * window between receiving the reply and finishing the local + * enqueue. (bug 842) + * + * This can't happen with the blocking_ast, however, because we + * will never call the local blocking_ast until we drop our + * reader/writer reference, which we won't do until we get the + * reply and finish enqueueing. */ + + /* nobody should touch l_cp_ast */ + lock_res_and_lock(lock); + list_del_init(&lock->l_cp_ast); + LASSERT(ldlm_is_cp_reqd(lock)); + /* save l_completion_ast since it can be changed by + * mds_intent_policy(), see bug 14225 */ + completion_callback = lock->l_completion_ast; + ldlm_clear_cp_reqd(lock); + unlock_res_and_lock(lock); + + if (completion_callback != NULL) + rc = completion_callback(lock, 0, (void *)arg); + LDLM_LOCK_RELEASE(lock); + + RETURN(rc); +} + +/** + * Process list of locks in need of ASTs being sent. + * + * Used on server to send multiple ASTs together instead of sending one by + * one. + */ +int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list, + ldlm_desc_ast_t ast_type) +{ + struct ldlm_cb_set_arg *arg; + set_producer_func work_ast_lock; + int rc; + + if (list_empty(rpc_list)) + RETURN(0); + + OBD_ALLOC_PTR(arg); + if (arg == NULL) + RETURN(-ENOMEM); + + atomic_set(&arg->restart, 0); + arg->list = rpc_list; + + switch (ast_type) { + case LDLM_WORK_CP_AST: + arg->type = LDLM_CP_CALLBACK; + work_ast_lock = ldlm_work_cp_ast_lock; + break; +#ifdef HAVE_SERVER_SUPPORT + case LDLM_WORK_BL_AST: + arg->type = LDLM_BL_CALLBACK; + work_ast_lock = ldlm_work_bl_ast_lock; + break; + case LDLM_WORK_REVOKE_AST: + arg->type = LDLM_BL_CALLBACK; + work_ast_lock = ldlm_work_revoke_ast_lock; + break; + case LDLM_WORK_GL_AST: + arg->type = LDLM_GL_CALLBACK; + work_ast_lock = ldlm_work_gl_ast_lock; + break; +#endif + default: + LBUG(); + } + + /* We create a ptlrpc request set with flow control extension. + * This request set will use the work_ast_lock function to produce new + * requests and will send a new request each time one completes in order + * to keep the number of requests in flight to ns_max_parallel_ast */ + arg->set = ptlrpc_prep_fcset(ns->ns_max_parallel_ast ? : UINT_MAX, + work_ast_lock, arg); + if (arg->set == NULL) + GOTO(out, rc = -ENOMEM); + + ptlrpc_set_wait(NULL, arg->set); + ptlrpc_set_destroy(arg->set); + + rc = atomic_read(&arg->restart) ? -ERESTART : 0; + GOTO(out, rc); +out: + OBD_FREE_PTR(arg); + return rc; +} + +/** + * Try to grant all waiting locks on a resource. + * + * Calls ldlm_reprocess_queue on waiting queue. + * + * Typically called after some resource locks are cancelled to see + * if anything could be granted as a result of the cancellation. + */ +static void __ldlm_reprocess_all(struct ldlm_resource *res, + enum ldlm_process_intention intention, + __u64 hint) +{ + LIST_HEAD(rpc_list); +#ifdef HAVE_SERVER_SUPPORT + ldlm_reprocessing_policy reprocess; + struct obd_device *obd; + int rc; + + ENTRY; + + /* Local lock trees don't get reprocessed. */ + if (ns_is_client(ldlm_res_to_ns(res))) { + EXIT; + return; + } + + /* Disable reprocess during lock replay stage but allow during + * request replay stage. 
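+	 * I.e. skip only while obd_recovering is set and no clients
+	 * remain in request replay (see the obd_req_replay_clients
+	 * check below).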
+ */ + obd = ldlm_res_to_ns(res)->ns_obd; + if (obd->obd_recovering && + atomic_read(&obd->obd_req_replay_clients) == 0) + RETURN_EXIT; +restart: + lock_res(res); + reprocess = ldlm_get_reprocessing_policy(res); + reprocess(res, &res->lr_waiting, &rpc_list, intention, hint); + unlock_res(res); + + rc = ldlm_run_ast_work(ldlm_res_to_ns(res), &rpc_list, + LDLM_WORK_CP_AST); + if (rc == -ERESTART) { + LASSERT(list_empty(&rpc_list)); + hint = 0; + goto restart; + } +#else + ENTRY; + + if (!ns_is_client(ldlm_res_to_ns(res))) { + CERROR("This is client-side-only module, cannot handle " + "LDLM_NAMESPACE_SERVER resource type lock.\n"); + LBUG(); + } +#endif + EXIT; +} + +void ldlm_reprocess_all(struct ldlm_resource *res, __u64 hint) +{ + __ldlm_reprocess_all(res, LDLM_PROCESS_RESCAN, hint); +} +EXPORT_SYMBOL(ldlm_reprocess_all); + +static int ldlm_reprocess_res(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + + /* This is only called once after recovery done. LU-8306. */ + __ldlm_reprocess_all(res, LDLM_PROCESS_RECOVERY, 0); + return 0; +} + +/** + * Iterate through all resources on a namespace attempting to grant waiting + * locks. + */ +void ldlm_reprocess_recovery_done(struct ldlm_namespace *ns) +{ + ENTRY; + + if (ns != NULL) { + cfs_hash_for_each_nolock(ns->ns_rs_hash, + ldlm_reprocess_res, NULL, 0); + } + EXIT; +} + +/** + * Helper function to call blocking AST for LDLM lock \a lock in a + * "cancelling" mode. + */ +void ldlm_cancel_callback(struct ldlm_lock *lock) +{ + check_res_locked(lock->l_resource); + if (!ldlm_is_cancel(lock)) { + ldlm_set_cancel(lock); + if (lock->l_blocking_ast) { + unlock_res_and_lock(lock); + lock->l_blocking_ast(lock, NULL, lock->l_ast_data, + LDLM_CB_CANCELING); + lock_res_and_lock(lock); + } else { + LDLM_DEBUG(lock, "no blocking ast"); + } + + /* only canceller can set bl_done bit */ + ldlm_set_bl_done(lock); + wake_up(&lock->l_waitq); + } else if (!ldlm_is_bl_done(lock)) { + /* The lock is guaranteed to have been canceled once + * returning from this function. */ + unlock_res_and_lock(lock); + wait_event_idle(lock->l_waitq, is_bl_done(lock)); + lock_res_and_lock(lock); + } +} + +/** + * Remove skiplist-enabled LDLM lock \a req from granted list + */ +void ldlm_unlink_lock_skiplist(struct ldlm_lock *req) +{ + if (req->l_resource->lr_type != LDLM_PLAIN && + req->l_resource->lr_type != LDLM_IBITS) + return; + + list_del_init(&req->l_sl_policy); + list_del_init(&req->l_sl_mode); +} + +/** + * Attempts to cancel LDLM lock \a lock that has no reader/writer references. + */ +void ldlm_lock_cancel(struct ldlm_lock *lock) +{ + struct ldlm_resource *res; + struct ldlm_namespace *ns; + ENTRY; + + lock_res_and_lock(lock); + + res = lock->l_resource; + ns = ldlm_res_to_ns(res); + + /* Please do not, no matter how tempting, remove this LBUG without + * talking to me first. -phik */ + if (lock->l_readers || lock->l_writers) { + LDLM_ERROR(lock, "lock still has references"); + unlock_res_and_lock(lock); + LBUG(); + } + + if (ldlm_is_waited(lock)) + ldlm_del_waiting_lock(lock); + + /* Releases cancel callback. 
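+	 * I.e. ldlm_cancel_callback() below invokes the blocking AST in
+	 * LDLM_CB_CANCELING mode (see its definition above), or waits
+	 * for an already-running cancel to set bl_done.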
*/ + ldlm_cancel_callback(lock); + + /* Yes, second time, just in case it was added again while we were + * running with no res lock in ldlm_cancel_callback */ + if (ldlm_is_waited(lock)) + ldlm_del_waiting_lock(lock); + + ldlm_resource_unlink_lock(lock); + ldlm_lock_destroy_nolock(lock); + + if (ldlm_is_granted(lock)) + ldlm_pool_del(&ns->ns_pool, lock); + + /* Make sure we will not be called again for same lock what is possible + * if not to zero out lock->l_granted_mode */ + lock->l_granted_mode = LCK_MINMODE; + unlock_res_and_lock(lock); + + EXIT; +} +EXPORT_SYMBOL(ldlm_lock_cancel); + +/** + * Set opaque data into the lock that only makes sense to upper layer. + */ +int ldlm_lock_set_data(const struct lustre_handle *lockh, void *data) +{ + struct ldlm_lock *lock = ldlm_handle2lock(lockh); + int rc = -EINVAL; + ENTRY; + + if (lock) { + if (lock->l_ast_data == NULL) + lock->l_ast_data = data; + if (lock->l_ast_data == data) + rc = 0; + LDLM_LOCK_PUT(lock); + } + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_lock_set_data); + +struct export_cl_data { + const struct lu_env *ecl_env; + struct obd_export *ecl_exp; + int ecl_loop; +}; + +static void ldlm_cancel_lock_for_export(struct obd_export *exp, + struct ldlm_lock *lock, + struct export_cl_data *ecl) +{ + struct ldlm_resource *res; + + res = ldlm_resource_getref(lock->l_resource); + + ldlm_lvbo_update(res, lock, NULL, 1); + ldlm_lock_cancel(lock); + if (!exp->exp_obd->obd_stopping) + ldlm_reprocess_all(res, lock->l_policy_data.l_inodebits.bits); + ldlm_resource_putref(res); + + ecl->ecl_loop++; + if ((ecl->ecl_loop & -ecl->ecl_loop) == ecl->ecl_loop) { + CDEBUG(D_INFO, "Export %p, %d locks cancelled.\n", + exp, ecl->ecl_loop); + } +} + +/** + * Iterator function for ldlm_export_cancel_locks. + * Cancels passed locks. + */ +static int +ldlm_cancel_locks_for_export_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) + +{ + struct export_cl_data *ecl = (struct export_cl_data *)data; + struct obd_export *exp = ecl->ecl_exp; + struct ldlm_lock *lock = cfs_hash_object(hs, hnode); + + LDLM_LOCK_GET(lock); + ldlm_cancel_lock_for_export(exp, lock, ecl); + LDLM_LOCK_RELEASE(lock); + + return 0; +} + +/** + * Cancel all blocked locks for given export. + * + * Typically called on client disconnection/eviction + */ +int ldlm_export_cancel_blocked_locks(struct obd_export *exp) +{ + struct lu_env env; + struct export_cl_data ecl = { + .ecl_exp = exp, + .ecl_loop = 0, + }; + int rc; + + rc = lu_env_init(&env, LCT_DT_THREAD); + if (rc) + RETURN(rc); + ecl.ecl_env = &env; + + while (!list_empty(&exp->exp_bl_list)) { + struct ldlm_lock *lock; + + spin_lock_bh(&exp->exp_bl_list_lock); + if (!list_empty(&exp->exp_bl_list)) { + lock = list_entry(exp->exp_bl_list.next, + struct ldlm_lock, l_exp_list); + LDLM_LOCK_GET(lock); + list_del_init(&lock->l_exp_list); + } else { + lock = NULL; + } + spin_unlock_bh(&exp->exp_bl_list_lock); + + if (lock == NULL) + break; + + ldlm_cancel_lock_for_export(exp, lock, &ecl); + LDLM_LOCK_RELEASE(lock); + } + + lu_env_fini(&env); + + CDEBUG(D_DLMTRACE, "Export %p, canceled %d locks, " + "left on hash table %d.\n", exp, ecl.ecl_loop, + atomic_read(&exp->exp_lock_hash->hs_count)); + + return ecl.ecl_loop; +} + +/** + * Cancel all locks for given export. 
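+ * Unlike ldlm_export_cancel_blocked_locks() above, which drains only
+ * the export's exp_bl_list, this walks the whole exp_lock_hash.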
+ *
+ * Typically called after client disconnection/eviction
+ */
+int ldlm_export_cancel_locks(struct obd_export *exp)
+{
+	struct export_cl_data ecl;
+	struct lu_env env;
+	int rc;
+
+	rc = lu_env_init(&env, LCT_DT_THREAD);
+	if (rc)
+		RETURN(rc);
+	ecl.ecl_env = &env;
+	ecl.ecl_exp = exp;
+	ecl.ecl_loop = 0;
+
+	cfs_hash_for_each_empty(exp->exp_lock_hash,
+				ldlm_cancel_locks_for_export_cb, &ecl);
+
+	CDEBUG(D_DLMTRACE, "Export %p, canceled %d locks, "
+	       "left on hash table %d.\n", exp, ecl.ecl_loop,
+	       atomic_read(&exp->exp_lock_hash->hs_count));
+
+	if (ecl.ecl_loop > 0 &&
+	    atomic_read(&exp->exp_lock_hash->hs_count) == 0 &&
+	    exp->exp_obd->obd_stopping)
+		ldlm_reprocess_recovery_done(exp->exp_obd->obd_namespace);
+
+	lu_env_fini(&env);
+
+	return ecl.ecl_loop;
+}
+
+/**
+ * Downgrade a PW/EX lock to COS | CR mode.
+ *
+ * A lock mode conversion from PW/EX mode to a less conflicting mode. The
+ * conversion may fail if the lock was canceled before the downgrade, but
+ * that does not indicate any problem, because such a lock has no readers
+ * or writers and will be released soon.
+ *
+ * Used by the Commit on Sharing (COS) code to force commit of object changes
+ * in case of conflict. The converted lock is considered a new lock, and all
+ * blocking AST state is cleared, so any pending or new blocked lock on that
+ * lock will cause a new call to blocking_ast and force a resource object
+ * commit.
+ *
+ * Also used by layout_change to replace an EX lock with a CR lock.
+ *
+ * \param lock		A lock to convert
+ * \param new_mode	new lock mode
+ */
+void ldlm_lock_mode_downgrade(struct ldlm_lock *lock, enum ldlm_mode new_mode)
+{
+#ifdef HAVE_SERVER_SUPPORT
+	ENTRY;
+
+	LASSERT(new_mode == LCK_COS || new_mode == LCK_CR);
+
+	lock_res_and_lock(lock);
+
+	if (!(lock->l_granted_mode & (LCK_PW | LCK_EX))) {
+		unlock_res_and_lock(lock);
+
+		LASSERT(lock->l_granted_mode == LCK_MINMODE);
+		LDLM_DEBUG(lock, "lock was canceled before downgrade");
+		RETURN_EXIT;
+	}
+
+	ldlm_resource_unlink_lock(lock);
+	/*
+	 * Remove the lock from pool as it will be added again in
+	 * ldlm_grant_lock() called below.
+	 */
+	ldlm_pool_del(&ldlm_lock_to_ns(lock)->ns_pool, lock);
+
+	/* Consider downgraded lock as a new lock and clear all states
+	 * related to a previous blocking AST processing.
+	 */
+	ldlm_clear_blocking_data(lock);
+
+	lock->l_req_mode = new_mode;
+	ldlm_grant_lock(lock, NULL);
+	unlock_res_and_lock(lock);
+
+	ldlm_reprocess_all(lock->l_resource,
+			   lock->l_policy_data.l_inodebits.bits);
+
+	EXIT;
+#endif
+}
+EXPORT_SYMBOL(ldlm_lock_mode_downgrade);
+
+/**
+ * Print lock with lock handle \a lockh description into debug log.
+ *
+ * Used when printing all locks on a resource for debug purposes.
+ */
+void ldlm_lock_dump_handle(int level, const struct lustre_handle *lockh)
+{
+	struct ldlm_lock *lock;
+
+	if (!((libcfs_debug | D_ERROR) & level))
+		return;
+
+	lock = ldlm_handle2lock(lockh);
+	if (lock == NULL)
+		return;
+
+	LDLM_DEBUG_LIMIT(level, lock, "###");
+
+	LDLM_LOCK_PUT(lock);
+}
+EXPORT_SYMBOL(ldlm_lock_dump_handle);
+
+/**
+ * Print lock information with custom message into debug log.
+ * Helper function.
+ */
+void _ldlm_lock_debug(struct ldlm_lock *lock,
+		      struct libcfs_debug_msg_data *msgdata,
+		      const char *fmt, ...)
+{ + va_list args; + struct obd_export *exp = lock->l_export; + struct ldlm_resource *resource = NULL; + struct va_format vaf; + char *nid = "local"; + + rcu_read_lock(); + resource = rcu_dereference(lock->l_resource); + if (resource && !atomic_inc_not_zero(&resource->lr_refcount)) + resource = NULL; + rcu_read_unlock(); + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + if (exp && exp->exp_connection) { + nid = obd_export_nid2str(exp); + } else if (exp && exp->exp_obd != NULL) { + struct obd_import *imp = exp->exp_obd->u.cli.cl_import; + nid = obd_import_nid2str(imp); + } + + if (resource == NULL) { + libcfs_debug_msg(msgdata, + "%pV ns: \?\? lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: \?\? rrc=\?\? type: \?\?\? flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld lvb_type: %d\n", + &vaf, + lock, + lock->l_handle.h_cookie, + refcount_read(&lock->l_handle.h_ref), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + lock->l_flags, nid, + lock->l_remote_handle.cookie, + exp ? refcount_read(&exp->exp_handle.h_ref) : -99, + lock->l_pid, lock->l_callback_timestamp, + lock->l_lvb_type); + va_end(args); + return; + } + + switch (resource->lr_type) { + case LDLM_EXTENT: + libcfs_debug_msg(msgdata, + "%pV ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s [%llu->%llu] (req %llu->%llu) gid %llu flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld lvb_type: %d\n", + &vaf, + ldlm_lock_to_ns_name(lock), lock, + lock->l_handle.h_cookie, + refcount_read(&lock->l_handle.h_ref), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + PLDLMRES(resource), + atomic_read(&resource->lr_refcount), + ldlm_typename[resource->lr_type], + lock->l_policy_data.l_extent.start, + lock->l_policy_data.l_extent.end, + lock->l_req_extent.start, lock->l_req_extent.end, + lock->l_req_extent.gid, + lock->l_flags, nid, + lock->l_remote_handle.cookie, + exp ? refcount_read(&exp->exp_handle.h_ref) : -99, + lock->l_pid, lock->l_callback_timestamp, + lock->l_lvb_type); + break; + + case LDLM_FLOCK: + libcfs_debug_msg(msgdata, + "%pV ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s pid: %d [%llu->%llu] flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld\n", + &vaf, + ldlm_lock_to_ns_name(lock), lock, + lock->l_handle.h_cookie, + refcount_read(&lock->l_handle.h_ref), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + PLDLMRES(resource), + atomic_read(&resource->lr_refcount), + ldlm_typename[resource->lr_type], + lock->l_policy_data.l_flock.pid, + lock->l_policy_data.l_flock.start, + lock->l_policy_data.l_flock.end, + lock->l_flags, nid, + lock->l_remote_handle.cookie, + exp ? 
refcount_read(&exp->exp_handle.h_ref) : -99, + lock->l_pid, lock->l_callback_timestamp); + break; + + case LDLM_IBITS: + libcfs_debug_msg(msgdata, + "%pV ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " bits %#llx/%#llx rrc: %d type: %s gid %llu flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld lvb_type: %d\n", + &vaf, + ldlm_lock_to_ns_name(lock), + lock, lock->l_handle.h_cookie, + refcount_read(&lock->l_handle.h_ref), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + PLDLMRES(resource), + lock->l_policy_data.l_inodebits.bits, + lock->l_policy_data.l_inodebits.try_bits, + atomic_read(&resource->lr_refcount), + ldlm_typename[resource->lr_type], + lock->l_policy_data.l_inodebits.li_gid, + lock->l_flags, nid, + lock->l_remote_handle.cookie, + exp ? refcount_read(&exp->exp_handle.h_ref) : -99, + lock->l_pid, lock->l_callback_timestamp, + lock->l_lvb_type); + break; + + default: + libcfs_debug_msg(msgdata, + "%pV ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld lvb_type: %d\n", + &vaf, + ldlm_lock_to_ns_name(lock), + lock, lock->l_handle.h_cookie, + refcount_read(&lock->l_handle.h_ref), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + PLDLMRES(resource), + atomic_read(&resource->lr_refcount), + ldlm_typename[resource->lr_type], + lock->l_flags, nid, + lock->l_remote_handle.cookie, + exp ? refcount_read(&exp->exp_handle.h_ref) : -99, + lock->l_pid, lock->l_callback_timestamp, + lock->l_lvb_type); + break; + } + va_end(args); + ldlm_resource_putref(resource); +} +EXPORT_SYMBOL(_ldlm_lock_debug); diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lockd.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lockd.c new file mode 100644 index 0000000000000..f82df7df0e444 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lockd.c @@ -0,0 +1,3488 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ldlm/ldlm_lockd.c + * + * Author: Peter Braam + * Author: Phil Schwan + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include +#include +#include +#include +#include +#include "ldlm_internal.h" + +static int ldlm_num_threads; +module_param(ldlm_num_threads, int, 0444); +MODULE_PARM_DESC(ldlm_num_threads, "number of DLM service threads to start"); + +static unsigned int ldlm_cpu_bind = 1; +module_param(ldlm_cpu_bind, uint, 0444); +MODULE_PARM_DESC(ldlm_cpu_bind, + "bind DLM service threads to particular CPU partitions"); + +static char *ldlm_cpts; +module_param(ldlm_cpts, charp, 0444); +MODULE_PARM_DESC(ldlm_cpts, "CPU partitions ldlm threads should run on"); + +static DEFINE_MUTEX(ldlm_ref_mutex); +static int ldlm_refcount; + +struct kobject *ldlm_kobj; +struct kset *ldlm_ns_kset; +struct kset *ldlm_svc_kset; + +/* LDLM state */ + +static struct ldlm_state *ldlm_state; + +/* + * timeout for initial callback (AST) reply (bz10399) + * Due to having to send a 32 bit time value over the + * wire return it as timeout_t instead of time64_t + */ +static inline timeout_t ldlm_get_rq_timeout(void) +{ + /* Non-AT value */ + timeout_t timeout = min(ldlm_timeout, obd_timeout / 3); + + return timeout < 1 ? 1 : timeout; +} + +struct ldlm_bl_pool { + spinlock_t blp_lock; + + /* + * blp_prio_list is used for callbacks that should be handled + * as a priority. It is used for LDLM_FL_DISCARD_DATA requests. + * see b=13843 + */ + struct list_head blp_prio_list; + + /* + * blp_list is used for all other callbacks which are likely + * to take longer to process. + */ + struct list_head blp_list; + + wait_queue_head_t blp_waitq; + struct completion blp_comp; + atomic_t blp_num_threads; + atomic_t blp_busy_threads; + int blp_min_threads; + int blp_max_threads; +}; + +struct ldlm_bl_work_item { + struct list_head blwi_entry; + struct ldlm_namespace *blwi_ns; + struct ldlm_lock_desc blwi_ld; + struct ldlm_lock *blwi_lock; + struct list_head blwi_head; + int blwi_count; + struct completion blwi_comp; + enum ldlm_cancel_flags blwi_flags; + int blwi_mem_pressure; +}; + +#ifdef HAVE_SERVER_SUPPORT + +/** + * Protects both waiting_locks_list and expired_lock_thread. + */ +static DEFINE_SPINLOCK(waiting_locks_spinlock); /* BH lock (timer) */ + +/** + * List for contended locks. + * + * As soon as a lock is contended, it gets placed on this list and + * expected time to get a response is filled in the lock. A special + * thread walks the list looking for locks that should be released and + * schedules client evictions for those that have not been released in + * time. + * + * All access to it should be under waiting_locks_spinlock. 
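+ *
+ * Life cycle sketch (assembled from the code below, not new
+ * behaviour):
+ *
+ *	blocking AST sent   -> __ldlm_add_waiting_lock() queues the lock
+ *	                       with a deadline
+ *	timer fires         -> waiting_locks_callback() moves expired
+ *	                       locks to expired_lock_list
+ *	expired_lock_main() -> prolongs still-busy locks or evicts the
+ *	                       client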
+ */ +static LIST_HEAD(waiting_locks_list); +static void waiting_locks_callback(TIMER_DATA_TYPE unused); +static CFS_DEFINE_TIMER(waiting_locks_timer, waiting_locks_callback, 0, 0); + +enum elt_state { + ELT_STOPPED, + ELT_READY, + ELT_TERMINATE, +}; + +static DECLARE_WAIT_QUEUE_HEAD(expired_lock_wait_queue); +static enum elt_state expired_lock_thread_state = ELT_STOPPED; +static int expired_lock_dump; +static LIST_HEAD(expired_lock_list); + +static int ldlm_lock_busy(struct ldlm_lock *lock); +static int ldlm_add_waiting_lock(struct ldlm_lock *lock, timeout_t timeout); +static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, timeout_t timeout); + +static inline int have_expired_locks(void) +{ + int need_to_run; + + ENTRY; + spin_lock_bh(&waiting_locks_spinlock); + need_to_run = !list_empty(&expired_lock_list); + spin_unlock_bh(&waiting_locks_spinlock); + + RETURN(need_to_run); +} + +/** + * Check expired lock list for expired locks and time them out. + */ +static int expired_lock_main(void *arg) +{ + struct list_head *expired = &expired_lock_list; + int do_dump; + + ENTRY; + + expired_lock_thread_state = ELT_READY; + wake_up(&expired_lock_wait_queue); + + while (1) { + wait_event_idle(expired_lock_wait_queue, + have_expired_locks() || + expired_lock_thread_state == ELT_TERMINATE); + + spin_lock_bh(&waiting_locks_spinlock); + if (expired_lock_dump) { + spin_unlock_bh(&waiting_locks_spinlock); + + /* from waiting_locks_callback, but not in timer */ + libcfs_debug_dumplog(); + + spin_lock_bh(&waiting_locks_spinlock); + expired_lock_dump = 0; + } + + do_dump = 0; + + while (!list_empty(expired)) { + struct obd_export *export; + struct ldlm_lock *lock; + + lock = list_entry(expired->next, struct ldlm_lock, + l_pending_chain); + if ((void *)lock < LP_POISON + PAGE_SIZE && + (void *)lock >= LP_POISON) { + spin_unlock_bh(&waiting_locks_spinlock); + CERROR("free lock on elt list %p\n", lock); + LBUG(); + } + list_del_init(&lock->l_pending_chain); + if ((void *)lock->l_export < + LP_POISON + PAGE_SIZE && + (void *)lock->l_export >= LP_POISON) { + CERROR("lock with free export on elt list %p\n", + lock->l_export); + lock->l_export = NULL; + LDLM_ERROR(lock, "free export"); + /* + * release extra ref grabbed by + * ldlm_add_waiting_lock() or + * ldlm_failed_ast() + */ + LDLM_LOCK_RELEASE(lock); + continue; + } + + if (ldlm_is_destroyed(lock)) { + /* + * release the lock refcount where + * waiting_locks_callback() founds + */ + LDLM_LOCK_RELEASE(lock); + continue; + } + export = class_export_lock_get(lock->l_export, lock); + spin_unlock_bh(&waiting_locks_spinlock); + + /* Check if we need to prolong timeout */ + if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT) && + lock->l_callback_timestamp != 0 && /* not AST error */ + ldlm_lock_busy(lock)) { + LDLM_DEBUG(lock, "prolong the busy lock"); + lock_res_and_lock(lock); + ldlm_add_waiting_lock(lock, + ldlm_bl_timeout(lock) >> 1); + unlock_res_and_lock(lock); + } else { + spin_lock_bh(&export->exp_bl_list_lock); + list_del_init(&lock->l_exp_list); + spin_unlock_bh(&export->exp_bl_list_lock); + + LDLM_ERROR(lock, + "lock callback timer expired after %llds: evicting client at %s ", + ktime_get_seconds() - + lock->l_blast_sent, + obd_export_nid2str(export)); + ldlm_lock_to_ns(lock)->ns_timeouts++; + if (do_dump_on_eviction(export->exp_obd)) + do_dump++; + class_fail_export(export); + } + class_export_lock_put(export, lock); + /* + * release extra ref grabbed by ldlm_add_waiting_lock() + * or ldlm_failed_ast() + */ + LDLM_LOCK_RELEASE(lock); + + 
spin_lock_bh(&waiting_locks_spinlock); + } + spin_unlock_bh(&waiting_locks_spinlock); + + if (do_dump) { + CERROR("dump the log upon eviction\n"); + libcfs_debug_dumplog(); + } + + if (expired_lock_thread_state == ELT_TERMINATE) + break; + } + + expired_lock_thread_state = ELT_STOPPED; + wake_up(&expired_lock_wait_queue); + RETURN(0); +} + +/** + * Check if there is a request in the export request list + * which prevents the lock canceling. + */ +static int ldlm_lock_busy(struct ldlm_lock *lock) +{ + struct ptlrpc_request *req; + int match = 0; + + ENTRY; + + if (lock->l_export == NULL) + return 0; + + spin_lock(&lock->l_export->exp_rpc_lock); + list_for_each_entry(req, &lock->l_export->exp_hp_rpcs, + rq_exp_list) { + if (req->rq_ops->hpreq_lock_match) { + match = req->rq_ops->hpreq_lock_match(req, lock); + if (match) + break; + } + } + spin_unlock(&lock->l_export->exp_rpc_lock); + RETURN(match); +} + +/* This is called from within a timer interrupt and cannot schedule */ +static void waiting_locks_callback(TIMER_DATA_TYPE unused) +{ + struct ldlm_lock *lock; + int need_dump = 0; + + spin_lock_bh(&waiting_locks_spinlock); + while (!list_empty(&waiting_locks_list)) { + lock = list_entry(waiting_locks_list.next, struct ldlm_lock, + l_pending_chain); + if (lock->l_callback_timestamp > ktime_get_seconds() || + lock->l_req_mode == LCK_GROUP) + break; + + /* + * no needs to take an extra ref on the lock since it was in + * the waiting_locks_list and ldlm_add_waiting_lock() + * already grabbed a ref + */ + list_move(&lock->l_pending_chain, &expired_lock_list); + need_dump = 1; + } + + if (!list_empty(&expired_lock_list)) { + if (obd_dump_on_timeout && need_dump) + expired_lock_dump = __LINE__; + + wake_up(&expired_lock_wait_queue); + } + + /* + * Make sure the timer will fire again if we have any locks + * left. + */ + if (!list_empty(&waiting_locks_list)) { + time64_t now = ktime_get_seconds(); + timeout_t delta = 0; + + lock = list_entry(waiting_locks_list.next, struct ldlm_lock, + l_pending_chain); + if (lock->l_callback_timestamp - now > 0) + delta = lock->l_callback_timestamp - now; + mod_timer(&waiting_locks_timer, + jiffies + cfs_time_seconds(delta)); + } + spin_unlock_bh(&waiting_locks_spinlock); +} + +/** + * Add lock to the list of contended locks. + * + * Indicate that we're waiting for a client to call us back cancelling a given + * lock. We add it to the pending-callback chain, and schedule the lock-timeout + * timer to fire appropriately. (We round up to the next second, to avoid + * floods of timer firings during periods of high lock contention and traffic). + * As done by ldlm_add_waiting_lock(), the caller must grab a lock reference + * if it has been added to the waiting list (1 is returned). + * + * Called with the namespace lock held. 
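waiting_locks_callback() does two separate jobs: it sweeps already-expired entries off the head of the FIFO list, and it re-arms the timer for the earliest deadline that remains, clamping the delta at zero so a just-missed deadline fires immediately. The standalone sketch below models only that arithmetic; it ignores the LCK_GROUP exception and the dump bookkeeping, and next_timer_delta() with its deadline array is a hypothetical stand-in for walking the lock list.

#include <stdio.h>

/* deadline[] plays the role of the FIFO waiting_locks_list */
static long next_timer_delta(const long *deadline, int n, long now,
			     int *nexpired)
{
	int i = 0;

	while (i < n && deadline[i] <= now)	/* move to expired list */
		i++;
	*nexpired = i;
	if (i == n)
		return -1;		/* nothing pending: cancel timer */
	/* clamp at 0, like "if (timestamp - now > 0) delta = ..." */
	return deadline[i] - now > 0 ? deadline[i] - now : 0;
}

int main(void)
{
	long deadlines[] = { 95, 100, 130 };	/* seconds, FIFO order */
	int expired;
	long delta = next_timer_delta(deadlines, 3, 100, &expired);

	/* prints "expired=2 re-arm in 30s" */
	printf("expired=%d re-arm in %lds\n", expired, delta);
	return 0;
}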
+ */ +static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, timeout_t delay) +{ + unsigned long timeout_jiffies = jiffies; + time64_t deadline; + timeout_t timeout; + + lock->l_blast_sent = ktime_get_seconds(); + if (!list_empty(&lock->l_pending_chain)) + return 0; + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT) || + OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT)) + delay = 1; + + deadline = lock->l_blast_sent + delay; + if (likely(deadline > lock->l_callback_timestamp)) + lock->l_callback_timestamp = deadline; + + timeout = clamp_t(timeout_t, + lock->l_callback_timestamp - lock->l_blast_sent, + 0, delay); + timeout_jiffies += cfs_time_seconds(timeout); + + if (time_before(timeout_jiffies, waiting_locks_timer.expires) || + !timer_pending(&waiting_locks_timer)) + mod_timer(&waiting_locks_timer, timeout_jiffies); + + /* + * if the new lock has a shorter timeout than something earlier on + * the list, we'll wait the longer amount of time; no big deal. + */ + /* FIFO */ + list_add_tail(&lock->l_pending_chain, &waiting_locks_list); + return 1; +} + +static void ldlm_add_blocked_lock(struct ldlm_lock *lock) +{ + spin_lock_bh(&lock->l_export->exp_bl_list_lock); + if (list_empty(&lock->l_exp_list)) { + if (!ldlm_is_granted(lock)) + list_add_tail(&lock->l_exp_list, + &lock->l_export->exp_bl_list); + else + list_add(&lock->l_exp_list, + &lock->l_export->exp_bl_list); + } + spin_unlock_bh(&lock->l_export->exp_bl_list_lock); + + /* + * A blocked lock is added. Adjust the position in + * the stale list if the export is in the list. + * If export is stale and not in the list - it is being + * processed and will be placed on the right position + * on obd_stale_export_put(). + */ + if (!list_empty(&lock->l_export->exp_stale_list)) + obd_stale_export_adjust(lock->l_export); +} + +static int ldlm_add_waiting_lock(struct ldlm_lock *lock, timeout_t timeout) +{ + int ret; + + /* NB: must be called with hold of lock_res_and_lock() */ + LASSERT(ldlm_is_res_locked(lock)); + LASSERT(!ldlm_is_cancel_on_block(lock)); + + /* + * Do not put cross-MDT lock in the waiting list, since we + * will not evict it due to timeout for now + */ + if (lock->l_export != NULL && + (exp_connect_flags(lock->l_export) & OBD_CONNECT_MDS_MDS)) + return 0; + + spin_lock_bh(&waiting_locks_spinlock); + if (ldlm_is_cancel(lock)) { + spin_unlock_bh(&waiting_locks_spinlock); + return 0; + } + + if (ldlm_is_destroyed(lock)) { + static time64_t next; + + spin_unlock_bh(&waiting_locks_spinlock); + LDLM_ERROR(lock, "not waiting on destroyed lock (b=5653)"); + if (ktime_get_seconds() > next) { + next = ktime_get_seconds() + 14400; + libcfs_debug_dumpstack(NULL); + } + return 0; + } + + ldlm_set_waited(lock); + ret = __ldlm_add_waiting_lock(lock, timeout); + if (ret) { + /* + * grab ref on the lock if it has been added to the + * waiting list + */ + LDLM_LOCK_GET(lock); + } + spin_unlock_bh(&waiting_locks_spinlock); + + if (ret) + ldlm_add_blocked_lock(lock); + + LDLM_DEBUG(lock, "%sadding to wait list(timeout: %d, AT: %s)", + ret == 0 ? "not re-" : "", timeout, + AT_OFF ? "off" : "on"); + return ret; +} + +/** + * Remove a lock from the pending list, likely because it had its cancellation + * callback arrive without incident. This adjusts the lock-timeout timer if + * needed. Returns 0 if the lock wasn't pending after all, 1 if it was. + * As done by ldlm_del_waiting_lock(), the caller must release the lock + * reference when the lock is removed from any list (1 is returned). + * + * Called with namespace lock held. 
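The deadline bookkeeping in __ldlm_add_waiting_lock() is easy to misread: a lock that is re-added keeps its existing callback timestamp whenever that is already later than blast_sent + delay, and the timer interval is then clamped back into [0, delay]. A worked example of just that arithmetic, assuming one-second granularity and with a local clamp() standing in for clamp_t():

#include <stdio.h>

static long clamp(long v, long lo, long hi)
{
	return v < lo ? lo : v > hi ? hi : v;
}

int main(void)
{
	long blast_sent = 1000;		/* when the BL AST went out */
	long old_timestamp = 1012;	/* deadline already on the lock */
	long delay = 10;		/* timeout being requested now */
	long deadline = blast_sent + delay;		/* 1010 */
	long timestamp = deadline > old_timestamp ?
			 deadline : old_timestamp;	/* keeps 1012 */
	long timeout = clamp(timestamp - blast_sent, 0, delay);

	printf("timer fires %lds after the AST\n", timeout);	/* 10s */
	return 0;
}

Keeping the later deadline means a repeated AST can never shorten the grace period a client was already promised, which is why the comment about a shorter timeout being "no big deal" holds.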
+ */
+static int __ldlm_del_waiting_lock(struct ldlm_lock *lock)
+{
+	struct list_head *list_next;
+
+	if (list_empty(&lock->l_pending_chain))
+		return 0;
+
+	list_next = lock->l_pending_chain.next;
+	if (lock->l_pending_chain.prev == &waiting_locks_list) {
+		/* Removing the head of the list, adjust timer. */
+		if (list_next == &waiting_locks_list) {
+			/* No more, just cancel. */
+			del_timer(&waiting_locks_timer);
+		} else {
+			time64_t now = ktime_get_seconds();
+			struct ldlm_lock *next;
+			timeout_t delta = 0;
+
+			next = list_entry(list_next, struct ldlm_lock,
+					  l_pending_chain);
+			if (next->l_callback_timestamp - now > 0)
+				delta = next->l_callback_timestamp - now;
+
+			mod_timer(&waiting_locks_timer,
+				  jiffies + cfs_time_seconds(delta));
+		}
+	}
+	list_del_init(&lock->l_pending_chain);
+
+	return 1;
+}
+
+int ldlm_del_waiting_lock(struct ldlm_lock *lock)
+{
+	int ret;
+
+	if (lock->l_export == NULL) {
+		/* We don't have a "waiting locks list" on clients. */
+		CDEBUG(D_DLMTRACE, "Client lock %p: no-op\n", lock);
+		return 0;
+	}
+
+	spin_lock_bh(&waiting_locks_spinlock);
+	ret = __ldlm_del_waiting_lock(lock);
+	ldlm_clear_waited(lock);
+	spin_unlock_bh(&waiting_locks_spinlock);
+
+	/* remove the lock out of export blocking list */
+	spin_lock_bh(&lock->l_export->exp_bl_list_lock);
+	list_del_init(&lock->l_exp_list);
+	spin_unlock_bh(&lock->l_export->exp_bl_list_lock);
+
+	if (ret) {
+		/*
+		 * release lock ref if it has indeed been removed
+		 * from a list
+		 */
+		LDLM_LOCK_RELEASE(lock);
+	}
+
+	LDLM_DEBUG(lock, "%s", ret == 0 ? "wasn't waiting" : "removed");
+	return ret;
+}
+
+/**
+ * Prolong the contended lock waiting time.
+ *
+ * Called with namespace lock held.
+ */
+int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, timeout_t timeout)
+{
+	if (lock->l_export == NULL) {
+		/* We don't have a "waiting locks list" on clients. */
+		LDLM_DEBUG(lock, "client lock: no-op");
+		return 0;
+	}
+
+	if (exp_connect_flags(lock->l_export) & OBD_CONNECT_MDS_MDS) {
+		/* We don't have a "waiting locks list" on OSP. */
+		LDLM_DEBUG(lock, "MDS-MDS lock: no-op");
+		return 0;
+	}
+
+	spin_lock_bh(&waiting_locks_spinlock);
+
+	if (list_empty(&lock->l_pending_chain)) {
+		spin_unlock_bh(&waiting_locks_spinlock);
+		LDLM_DEBUG(lock, "wasn't waiting");
+		return 0;
+	}
+
+	/*
+	 * we remove/add the lock to the waiting list, so no need to
+	 * release/take a lock reference
+	 */
+	__ldlm_del_waiting_lock(lock);
+	__ldlm_add_waiting_lock(lock, timeout);
+	spin_unlock_bh(&waiting_locks_spinlock);
+
+	LDLM_DEBUG(lock, "refreshed to %ds", timeout);
+	return 1;
+}
+EXPORT_SYMBOL(ldlm_refresh_waiting_lock);
+
+#else /* HAVE_SERVER_SUPPORT */
+
+int ldlm_del_waiting_lock(struct ldlm_lock *lock)
+{
+	RETURN(0);
+}
+
+int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, timeout_t timeout)
+{
+	RETURN(0);
+}
+
+#endif /* !HAVE_SERVER_SUPPORT */
+
+#ifdef HAVE_SERVER_SUPPORT
+
+/**
+ * Calculate the per-export Blocking timeout (covering BL AST, data flush,
+ * lock cancel, and their replies). Used for lock callback timeout and AST
+ * re-send period.
+ *
+ * \param[in] lock	lock which is getting the blocking callback
+ *
+ * \retval	timeout in seconds to wait for the client reply
+ */
+timeout_t ldlm_bl_timeout(struct ldlm_lock *lock)
+{
+	timeout_t timeout;
+
+	if (AT_OFF)
+		return obd_timeout / 2;
+
+	/*
+	 * Since these are non-updating timeouts, we should be conservative.
+	 * Take more than usual: 150%.
+	 * It would be nice to have some kind of "early reply" mechanism for
+	 * lock callbacks too...
+ */ + timeout = at_get(&lock->l_export->exp_bl_lock_at); + return max_t(timeout_t, timeout + (timeout >> 1), + (timeout_t)ldlm_enqueue_min); +} +EXPORT_SYMBOL(ldlm_bl_timeout); + +/** + * Perform lock cleanup if AST sending failed. + */ +static void ldlm_failed_ast(struct ldlm_lock *lock, int rc, + const char *ast_type) +{ + LCONSOLE_ERROR_MSG(0x138, + "%s: A client on nid %s was evicted due to a lock %s callback time out: rc %d\n", + lock->l_export->exp_obd->obd_name, + obd_export_nid2str(lock->l_export), ast_type, rc); + + if (obd_dump_on_timeout) + libcfs_debug_dumplog(); + spin_lock_bh(&waiting_locks_spinlock); + if (__ldlm_del_waiting_lock(lock) == 0) + /* + * the lock was not in any list, grab an extra ref before adding + * the lock to the expired list + */ + LDLM_LOCK_GET(lock); + /* differentiate it from expired locks */ + lock->l_callback_timestamp = 0; + list_add(&lock->l_pending_chain, &expired_lock_list); + wake_up(&expired_lock_wait_queue); + spin_unlock_bh(&waiting_locks_spinlock); +} + +/** + * Perform lock cleanup if AST reply came with error. + */ +static int ldlm_handle_ast_error(struct ldlm_lock *lock, + struct ptlrpc_request *req, int rc, + const char *ast_type) +{ + struct lnet_processid *peer = &req->rq_import->imp_connection->c_peer; + + if (!req->rq_replied || (rc && rc != -EINVAL)) { + if (ldlm_is_cancel(lock)) { + LDLM_DEBUG(lock, + "%s AST (req@%p x%llu) timeout from nid %s, but cancel was received (AST reply lost?)", + ast_type, req, req->rq_xid, + libcfs_nidstr(&peer->nid)); + ldlm_lock_cancel(lock); + rc = -ERESTART; + } else if (rc == -ENODEV || rc == -ESHUTDOWN || + (rc == -EIO && + req->rq_import->imp_state == LUSTRE_IMP_CLOSED)) { + /* + * Upon umount process the AST fails because cannot be + * sent. This shouldn't lead to the client eviction. + * -ENODEV error is returned by ptl_send_rpc() for + * new request in such import. + * -SHUTDOWN is returned by ptlrpc_import_delay_req() + * if imp_invalid is set or obd_no_recov. + * Meanwhile there is also check for LUSTRE_IMP_CLOSED + * in ptlrpc_import_delay_req() as well with -EIO code. + * In all such cases errors are ignored. + */ + LDLM_DEBUG(lock, + "%s AST can't be sent due to a server %s failure or umount process: rc = %d\n", + ast_type, + req->rq_import->imp_obd->obd_name, rc); + } else { + LDLM_ERROR(lock, + "client (nid %s) %s %s AST (req@%p x%llu status %d rc %d), evict it", + libcfs_nidstr(&peer->nid), + req->rq_replied ? "returned error from" : + "failed to reply to", + ast_type, req, req->rq_xid, + (req->rq_repmsg != NULL) ? + lustre_msg_get_status(req->rq_repmsg) : 0, + rc); + ldlm_failed_ast(lock, rc, ast_type); + } + return rc; + } + + if (rc == -EINVAL) { + struct ldlm_resource *res = lock->l_resource; + + LDLM_DEBUG(lock, + "client (nid %s) returned %d from %s AST (req@%p x%llu) - normal race", + libcfs_nidstr(&peer->nid), + req->rq_repmsg ? + lustre_msg_get_status(req->rq_repmsg) : -1, + ast_type, req, req->rq_xid); + if (res) { + /* + * update lvbo to return proper attributes. 
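The ldlm_bl_timeout() calculation above pads the adaptive-timeout estimate by 50% and floors the result at ldlm_enqueue_min. A two-line model of the arithmetic (the AT_OFF fallback of obd_timeout / 2 is omitted, and 45 is only an example value for the ldlm_enqueue_min module parameter):

#include <stdio.h>

static int ldlm_enqueue_min = 45;	/* example value only */

static int bl_timeout(int at_estimate)
{
	int t = at_estimate + (at_estimate >> 1);	/* 150% */

	return t > ldlm_enqueue_min ? t : ldlm_enqueue_min;
}

int main(void)
{
	/* small AT estimates are floored at ldlm_enqueue_min */
	printf("%d %d\n", bl_timeout(10), bl_timeout(100));	/* 45 150 */
	return 0;
}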
+ * see b=23174 + */ + ldlm_resource_getref(res); + ldlm_lvbo_update(res, lock, NULL, 1); + ldlm_resource_putref(res); + } + ldlm_lock_cancel(lock); + rc = -ERESTART; + } + + return rc; +} + +static int ldlm_cb_interpret(const struct lu_env *env, + struct ptlrpc_request *req, void *args, int rc) +{ + struct ldlm_cb_async_args *ca = args; + struct ldlm_lock *lock = ca->ca_lock; + struct ldlm_cb_set_arg *arg = ca->ca_set_arg; + + ENTRY; + + LASSERT(lock != NULL); + + switch (arg->type) { + case LDLM_GL_CALLBACK: + /* + * Update the LVB from disk if the AST failed + * (this is a legal race) + * + * - Glimpse callback of local lock just returns + * -ELDLM_NO_LOCK_DATA. + * - Glimpse callback of remote lock might return + * -ELDLM_NO_LOCK_DATA when inode is cleared. LU-274 + */ + if (unlikely(arg->gl_interpret_reply)) { + rc = arg->gl_interpret_reply(NULL, req, args, rc); + } else if (rc == -ELDLM_NO_LOCK_DATA) { + LDLM_DEBUG(lock, + "lost race - client has a lock but no inode"); + ldlm_lvbo_update(lock->l_resource, lock, NULL, 1); + } else if (rc != 0) { + rc = ldlm_handle_ast_error(lock, req, rc, "glimpse"); + } else { + rc = ldlm_lvbo_update(lock->l_resource, + lock, req, 1); + } + break; + case LDLM_BL_CALLBACK: + if (rc != 0) + rc = ldlm_handle_ast_error(lock, req, rc, "blocking"); + break; + case LDLM_CP_CALLBACK: + if (rc != 0) + rc = ldlm_handle_ast_error(lock, req, rc, "completion"); + break; + default: + LDLM_ERROR(lock, "invalid opcode for lock callback %d", + arg->type); + LBUG(); + } + + /* release extra reference taken in ldlm_ast_fini() */ + LDLM_LOCK_RELEASE(lock); + + if (rc == -ERESTART) + atomic_inc(&arg->restart); + + RETURN(0); +} + +static void ldlm_update_resend(struct ptlrpc_request *req, void *data) +{ + struct ldlm_cb_async_args *ca = data; + struct ldlm_lock *lock = ca->ca_lock; + + ldlm_refresh_waiting_lock(lock, ldlm_bl_timeout(lock)); +} + +static inline int ldlm_ast_fini(struct ptlrpc_request *req, + struct ldlm_cb_set_arg *arg, + struct ldlm_lock *lock, + int instant_cancel) +{ + int rc = 0; + + ENTRY; + + if (unlikely(instant_cancel)) { + rc = ptl_send_rpc(req, 1); + ptlrpc_req_finished(req); + if (rc == 0) + atomic_inc(&arg->restart); + } else { + LDLM_LOCK_GET(lock); + ptlrpc_set_add_req(arg->set, req); + } + + RETURN(rc); +} + +/** + * Check if there are requests in the export request list which prevent + * the lock canceling and make these requests high priority ones. + */ +static void ldlm_lock_reorder_req(struct ldlm_lock *lock) +{ + struct ptlrpc_request *req; + + ENTRY; + + if (lock->l_export == NULL) { + LDLM_DEBUG(lock, "client lock: no-op"); + RETURN_EXIT; + } + + spin_lock(&lock->l_export->exp_rpc_lock); + list_for_each_entry(req, &lock->l_export->exp_hp_rpcs, + rq_exp_list) { + /* + * Do not process requests that were not yet added to there + * incoming queue or were already removed from there for + * processing. We evaluate ptlrpc_nrs_req_can_move() without + * holding svcpt->scp_req_lock, and then redo the check with + * the lock held once we need to obtain a reliable result. + */ + if (ptlrpc_nrs_req_can_move(req) && + req->rq_ops->hpreq_lock_match && + req->rq_ops->hpreq_lock_match(req, lock)) + ptlrpc_nrs_req_hp_move(req); + } + spin_unlock(&lock->l_export->exp_rpc_lock); + EXIT; +} + +/** + * ->l_blocking_ast() method for server-side locks. This is invoked when newly + * enqueued server lock conflicts with given one. + * + * Sends blocking AST RPC to the client owning that lock; arms timeout timer + * to wait for client response. 
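ldlm_cb_interpret() above relies on the ptlrpc convention that per-request callback state rides in a small scratch area reachable from the request itself, retrieved again when the reply arrives. The userspace sketch below models that idiom only; the rq_-style field names are simplified stand-ins, not the real ptlrpc API, and the real code additionally size-checks the args struct against the scratch area.

#include <stdio.h>

struct request;
typedef int (*interpret_fn)(struct request *req, void *args, int rc);

struct request {
	interpret_fn rq_interpret_reply;
	char rq_async_args[64];		/* per-request scratch space */
};

struct cb_args {
	int lock_id;			/* stands in for ca_lock etc. */
};

static int cb_interpret(struct request *req, void *args, int rc)
{
	struct cb_args *ca = args;

	printf("lock %d: reply arrived, rc=%d\n", ca->lock_id, rc);
	return rc;
}

int main(void)
{
	struct request req = { .rq_interpret_reply = cb_interpret };
	struct cb_args *ca = (struct cb_args *)req.rq_async_args;

	ca->lock_id = 42;		/* stashed at send time */
	/* ...request goes out and the reply comes back... */
	return req.rq_interpret_reply(&req, req.rq_async_args, 0);
}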
+ */ +int ldlm_server_blocking_ast(struct ldlm_lock *lock, + struct ldlm_lock_desc *desc, + void *data, int flag) +{ + struct ldlm_cb_async_args *ca; + struct ldlm_cb_set_arg *arg = data; + struct ldlm_request *body; + struct ptlrpc_request *req; + int instant_cancel = 0; + int rc = 0; + + ENTRY; + + if (flag == LDLM_CB_CANCELING) + /* Don't need to do anything here. */ + RETURN(0); + + if (OBD_FAIL_PRECHECK(OBD_FAIL_LDLM_SRV_BL_AST)) { + LDLM_DEBUG(lock, "dropping BL AST"); + RETURN(0); + } + + LASSERT(lock); + LASSERT(data != NULL); + if (lock->l_export->exp_obd->obd_recovering != 0) + LDLM_ERROR(lock, "BUG 6063: lock collide during recovery"); + + ldlm_lock_reorder_req(lock); + + req = ptlrpc_request_alloc_pack(lock->l_export->exp_imp_reverse, + &RQF_LDLM_BL_CALLBACK, + LUSTRE_DLM_VERSION, LDLM_BL_CALLBACK); + if (req == NULL) + RETURN(-ENOMEM); + + ca = ptlrpc_req_async_args(ca, req); + ca->ca_set_arg = arg; + ca->ca_lock = lock; + + req->rq_interpret_reply = ldlm_cb_interpret; + + lock_res_and_lock(lock); + if (ldlm_is_destroyed(lock)) { + /* What's the point? */ + unlock_res_and_lock(lock); + ptlrpc_req_finished(req); + RETURN(0); + } + + if (!ldlm_is_granted(lock)) { + /* + * this blocking AST will be communicated as part of the + * completion AST instead + */ + ldlm_add_blocked_lock(lock); + ldlm_set_waited(lock); + unlock_res_and_lock(lock); + + ptlrpc_req_finished(req); + LDLM_DEBUG(lock, "lock not granted, not sending blocking AST"); + RETURN(0); + } + + if (ldlm_is_cancel_on_block(lock)) + instant_cancel = 1; + + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + body->lock_handle[0] = lock->l_remote_handle; + body->lock_handle[1].cookie = lock->l_handle.h_cookie; + body->lock_desc = *desc; + body->lock_flags |= ldlm_flags_to_wire(lock->l_flags & LDLM_FL_AST_MASK); + + LDLM_DEBUG(lock, "server preparing blocking AST"); + + ptlrpc_request_set_replen(req); + ldlm_set_cbpending(lock); + if (instant_cancel) { + unlock_res_and_lock(lock); + ldlm_lock_cancel(lock); + + req->rq_no_resend = 1; + } else { + LASSERT(ldlm_is_granted(lock)); + ldlm_add_waiting_lock(lock, ldlm_bl_timeout(lock)); + unlock_res_and_lock(lock); + + /* Do not resend after lock callback timeout */ + req->rq_delay_limit = ldlm_bl_timeout(lock); + req->rq_resend_cb = ldlm_update_resend; + } + + req->rq_send_state = LUSTRE_IMP_FULL; + /* ptlrpc_request_alloc_pack already set timeout */ + if (AT_OFF) + req->rq_timeout = ldlm_get_rq_timeout(); + + if (lock->l_export && lock->l_export->exp_nid_stats && + lock->l_export->exp_nid_stats->nid_ldlm_stats) + lprocfs_counter_incr(lock->l_export->exp_nid_stats->nid_ldlm_stats, + LDLM_BL_CALLBACK - LDLM_FIRST_OPC); + + rc = ldlm_ast_fini(req, arg, lock, instant_cancel); + + RETURN(rc); +} + +/** + * ->l_completion_ast callback for a remote lock in server namespace. + * + * Sends AST to the client notifying it of lock granting. 
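The tail of ldlm_server_blocking_ast() above picks one of two send policies: a cancel-on-block lock is cancelled immediately and its AST is fire-and-forget (no resend), while an ordinary granted lock goes onto the waiting list and the RPC may be resent, but only within the blocking-callback window. A condensed model of that decision, with hypothetical names; this is a sketch of the control flow, not the ptlrpc interface:

#include <stdbool.h>
#include <stdio.h>

struct send_policy {
	bool cancel_now;	/* cancel before the RPC goes out */
	bool no_resend;		/* fire and forget */
	int delay_limit;	/* resend window in seconds, 0 = none */
};

static struct send_policy bl_ast_policy(bool cancel_on_block, int bl_timeout)
{
	if (cancel_on_block)
		return (struct send_policy){ true, true, 0 };
	/* normal path: arm the waiting-lock timer and allow resends
	 * only inside the blocking-callback window */
	return (struct send_policy){ false, false, bl_timeout };
}

int main(void)
{
	struct send_policy p = bl_ast_policy(false, 30);

	printf("cancel_now=%d no_resend=%d delay_limit=%ds\n",
	       p.cancel_now, p.no_resend, p.delay_limit);
	return 0;
}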
If initial + * lock response was not sent yet, instead of sending another RPC, just + * mark the lock as granted and client will understand + */ +int ldlm_server_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) +{ + struct ldlm_cb_set_arg *arg = data; + struct ldlm_request *body; + struct ptlrpc_request *req; + struct ldlm_cb_async_args *ca; + int instant_cancel = 0; + int rc = 0; + int lvb_len; + + ENTRY; + + LASSERT(lock != NULL); + LASSERT(data != NULL); + + if (OBD_FAIL_PRECHECK(OBD_FAIL_LDLM_SRV_CP_AST)) { + LDLM_DEBUG(lock, "dropping CP AST"); + RETURN(0); + } + + req = ptlrpc_request_alloc(lock->l_export->exp_imp_reverse, + &RQF_LDLM_CP_CALLBACK); + if (req == NULL) + RETURN(-ENOMEM); + + /* server namespace, doesn't need lock */ + lvb_len = ldlm_lvbo_size(lock); + /* + * LU-3124 & LU-2187: to not return layout in completion AST because + * it may deadlock for LU-2187, or client may not have enough space + * for large layout. The layout will be returned to client with an + * extra RPC to fetch xattr.lov + */ + if (ldlm_has_layout(lock)) + lvb_len = 0; + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_CLIENT, lvb_len); + rc = ptlrpc_request_pack(req, LUSTRE_DLM_VERSION, LDLM_CP_CALLBACK); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + ca = ptlrpc_req_async_args(ca, req); + ca->ca_set_arg = arg; + ca->ca_lock = lock; + + req->rq_interpret_reply = ldlm_cb_interpret; + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + + body->lock_handle[0] = lock->l_remote_handle; + body->lock_handle[1].cookie = lock->l_handle.h_cookie; + body->lock_flags = ldlm_flags_to_wire(flags); + ldlm_lock2desc(lock, &body->lock_desc); + if (lvb_len > 0) { + void *lvb = req_capsule_client_get(&req->rq_pill, &RMF_DLM_LVB); + lvb_len = ldlm_lvbo_fill(lock, lvb, &lvb_len); + if (lvb_len < 0) { + /* + * We still need to send the RPC to wake up the blocked + * enqueue thread on the client. + * + * Consider old client, there is no better way to notify + * the failure, just zero-sized the LVB, then the client + * will fail out as "-EPROTO". + */ + req_capsule_shrink(&req->rq_pill, &RMF_DLM_LVB, 0, + RCL_CLIENT); + instant_cancel = 1; + } else { + req_capsule_shrink(&req->rq_pill, &RMF_DLM_LVB, lvb_len, + RCL_CLIENT); + } + } + + LDLM_DEBUG(lock, "server preparing completion AST"); + + ptlrpc_request_set_replen(req); + + req->rq_send_state = LUSTRE_IMP_FULL; + /* ptlrpc_request_pack already set timeout */ + if (AT_OFF) + req->rq_timeout = ldlm_get_rq_timeout(); + + /* We only send real blocking ASTs after the lock is granted */ + lock_res_and_lock(lock); + if (ldlm_is_ast_sent(lock)) { + body->lock_flags |= ldlm_flags_to_wire(LDLM_FL_AST_SENT); + /* Copy AST flags like LDLM_FL_DISCARD_DATA. */ + body->lock_flags |= ldlm_flags_to_wire(lock->l_flags & + LDLM_FL_AST_MASK); + + /* + * We might get here prior to ldlm_handle_enqueue setting + * LDLM_FL_CANCEL_ON_BLOCK flag. 
Then we will put this lock + * into waiting list, but this is safe and similar code in + * ldlm_handle_enqueue will call ldlm_lock_cancel() still, + * that would not only cancel the lock, but will also remove + * it from waiting list + */ + if (ldlm_is_cancel_on_block(lock)) { + unlock_res_and_lock(lock); + ldlm_lock_cancel(lock); + + instant_cancel = 1; + req->rq_no_resend = 1; + + lock_res_and_lock(lock); + } else { + /* start the lock-timeout clock */ + ldlm_add_waiting_lock(lock, ldlm_bl_timeout(lock)); + /* Do not resend after lock callback timeout */ + req->rq_delay_limit = ldlm_bl_timeout(lock); + req->rq_resend_cb = ldlm_update_resend; + } + } + unlock_res_and_lock(lock); + + if (lock->l_export && lock->l_export->exp_nid_stats && + lock->l_export->exp_nid_stats->nid_ldlm_stats) + lprocfs_counter_incr(lock->l_export->exp_nid_stats->nid_ldlm_stats, + LDLM_CP_CALLBACK - LDLM_FIRST_OPC); + + rc = ldlm_ast_fini(req, arg, lock, instant_cancel); + + RETURN(lvb_len < 0 ? lvb_len : rc); +} + +/** + * Server side ->l_glimpse_ast handler for client locks. + * + * Sends glimpse AST to the client and waits for reply. Then updates + * lvbo with the result. + */ +int ldlm_server_glimpse_ast(struct ldlm_lock *lock, void *data) +{ + struct ldlm_cb_set_arg *arg = data; + struct ldlm_request *body; + struct ptlrpc_request *req; + struct ldlm_cb_async_args *ca; + int rc; + struct req_format *req_fmt; + + ENTRY; + + LASSERT(lock != NULL); + + if (arg->gl_desc != NULL) + /* There is a glimpse descriptor to pack */ + req_fmt = &RQF_LDLM_GL_CALLBACK_DESC; + else + req_fmt = &RQF_LDLM_GL_CALLBACK; + + req = ptlrpc_request_alloc_pack(lock->l_export->exp_imp_reverse, + req_fmt, LUSTRE_DLM_VERSION, + LDLM_GL_CALLBACK); + + if (req == NULL) + RETURN(-ENOMEM); + + if (arg->gl_desc != NULL) { + /* copy the GL descriptor */ + union ldlm_gl_desc *desc; + + desc = req_capsule_client_get(&req->rq_pill, &RMF_DLM_GL_DESC); + *desc = *arg->gl_desc; + } + + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + body->lock_handle[0] = lock->l_remote_handle; + ldlm_lock2desc(lock, &body->lock_desc); + + ca = ptlrpc_req_async_args(ca, req); + ca->ca_set_arg = arg; + ca->ca_lock = lock; + + /* server namespace, doesn't need lock */ + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, + ldlm_lvbo_size(lock)); + ptlrpc_request_set_replen(req); + + req->rq_send_state = LUSTRE_IMP_FULL; + /* ptlrpc_request_alloc_pack already set timeout */ + if (AT_OFF) + req->rq_timeout = ldlm_get_rq_timeout(); + + req->rq_interpret_reply = ldlm_cb_interpret; + + if (lock->l_export && lock->l_export->exp_nid_stats) { + struct nid_stat *nid_stats = lock->l_export->exp_nid_stats; + + lprocfs_counter_incr(nid_stats->nid_ldlm_stats, + LDLM_GL_CALLBACK - LDLM_FIRST_OPC); + } + + rc = ldlm_ast_fini(req, arg, lock, 0); + + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_server_glimpse_ast); + +int ldlm_glimpse_locks(struct ldlm_resource *res, + struct list_head *gl_work_list) +{ + int rc; + + ENTRY; + + rc = ldlm_run_ast_work(ldlm_res_to_ns(res), gl_work_list, + LDLM_WORK_GL_AST); + if (rc == -ERESTART) + ldlm_reprocess_all(res, 0); + + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_glimpse_locks); + +/* return LDLM lock associated with a lock callback request */ +struct ldlm_lock *ldlm_request_lock(struct ptlrpc_request *req) +{ + struct ldlm_cb_async_args *ca; + struct ldlm_lock *lock; + + ENTRY; + + ca = ptlrpc_req_async_args(ca, req); + lock = ca->ca_lock; + if (lock == NULL) + RETURN(ERR_PTR(-EFAULT)); + + RETURN(lock); +} 
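The completion-AST path above sizes the reply LVB optimistically and then shrinks it to what ldlm_lvbo_fill() actually produced, deliberately sending a zero-sized LVB on failure so an old client fails fast with -EPROTO instead of waiting forever. A minimal userspace model of that shrink-on-fill logic; lvb_fill() and pack_reply_lvb() are hypothetical stand-ins for ldlm_lvbo_fill() and the req_capsule_shrink() calls:

#include <stdio.h>
#include <string.h>

/* reports how many of the reserved bytes were really used, or < 0 */
static int lvb_fill(char *buf, int *len)
{
	if (*len < 16)
		return -1;
	memset(buf, 0, 16);	/* pretend 16 bytes of LVB are valid */
	return 16;
}

static int pack_reply_lvb(char *buf, int reserved)
{
	int len = reserved;
	int rc = lvb_fill(buf, &len);

	if (rc < 0)
		return 0;	/* zero-sized LVB: client sees -EPROTO
				 * instead of blocking indefinitely */
	return rc;		/* shrink the field to the bytes used */
}

int main(void)
{
	char buf[64];

	printf("LVB bytes on the wire: %d\n", pack_reply_lvb(buf, 64));
	return 0;
}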
+EXPORT_SYMBOL(ldlm_request_lock); + +/** + * Main server-side entry point into LDLM for enqueue. This is called by ptlrpc + * service threads to carry out client lock enqueueing requests. + */ +int ldlm_handle_enqueue0(struct ldlm_namespace *ns, + struct ptlrpc_request *req, + const struct ldlm_request *dlm_req, + const struct ldlm_callback_suite *cbs) +{ + struct ldlm_reply *dlm_rep; + __u64 flags; + enum ldlm_error err = ELDLM_OK; + struct ldlm_lock *lock = NULL; + void *cookie = NULL; + int rc = 0; + struct ldlm_resource *res = NULL; + const struct lu_env *env = req->rq_svc_thread->t_env; + + ENTRY; + + LDLM_DEBUG_NOLOCK("server-side enqueue handler START"); + + ldlm_request_cancel(req, dlm_req, LDLM_ENQUEUE_CANCEL_OFF, LATF_SKIP); + flags = ldlm_flags_from_wire(dlm_req->lock_flags); + + LASSERT(req->rq_export); + + /* for intent enqueue the stat will be updated inside intent policy */ + if (ptlrpc_req2svc(req)->srv_stats != NULL && + !(dlm_req->lock_flags & LDLM_FL_HAS_INTENT)) + ldlm_svc_get_eopc(dlm_req, ptlrpc_req2svc(req)->srv_stats); + + if (req->rq_export && req->rq_export->exp_nid_stats && + req->rq_export->exp_nid_stats->nid_ldlm_stats) + lprocfs_counter_incr(req->rq_export->exp_nid_stats->nid_ldlm_stats, + LDLM_ENQUEUE - LDLM_FIRST_OPC); + + if (unlikely(dlm_req->lock_desc.l_resource.lr_type < LDLM_MIN_TYPE || + dlm_req->lock_desc.l_resource.lr_type >= LDLM_MAX_TYPE)) { + DEBUG_REQ(D_ERROR, req, "invalid lock request type %d", + dlm_req->lock_desc.l_resource.lr_type); + GOTO(out, rc = -EFAULT); + } + + if (unlikely(dlm_req->lock_desc.l_req_mode <= LCK_MINMODE || + dlm_req->lock_desc.l_req_mode >= LCK_MAXMODE || + dlm_req->lock_desc.l_req_mode & + (dlm_req->lock_desc.l_req_mode-1))) { + DEBUG_REQ(D_ERROR, req, "invalid lock request mode %d", + dlm_req->lock_desc.l_req_mode); + GOTO(out, rc = -EFAULT); + } + + if (unlikely((flags & LDLM_FL_REPLAY) || + (lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT))) { + /* Find an existing lock in the per-export lock hash */ + /* + * In the function below, .hs_keycmp resolves to + * ldlm_export_lock_keycmp() + */ + /* coverity[overrun-buffer-val] */ + lock = cfs_hash_lookup(req->rq_export->exp_lock_hash, + (void *)&dlm_req->lock_handle[0]); + if (lock != NULL) { + DEBUG_REQ(D_DLMTRACE, req, + "found existing lock cookie %#llx", + lock->l_handle.h_cookie); + flags |= LDLM_FL_RESENT; + GOTO(existing_lock, rc = 0); + } + } else { + if (ldlm_reclaim_full()) { + DEBUG_REQ(D_DLMTRACE, req, + "Too many granted locks, reject current enqueue request and let the client retry later"); + GOTO(out, rc = -EINPROGRESS); + } + } + + /* The lock's callback data might be set in the policy function */ + lock = ldlm_lock_create(ns, &dlm_req->lock_desc.l_resource.lr_name, + dlm_req->lock_desc.l_resource.lr_type, + dlm_req->lock_desc.l_req_mode, + cbs, NULL, 0, LVB_T_NONE); + if (IS_ERR(lock)) { + rc = PTR_ERR(lock); + lock = NULL; + GOTO(out, rc); + } + + lock->l_remote_handle = dlm_req->lock_handle[0]; + LDLM_DEBUG(lock, "server-side enqueue handler, new lock created"); + + /* + * Initialize resource lvb but not for a lock being replayed since + * Client already got lvb sent in this case. + * This must occur early since some policy methods assume resource + * lvb is available (lr_lvb_data != NULL). 
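The enqueue handler above rejects any l_req_mode that is not exactly one bit, using the classic (mode & (mode - 1)) power-of-two test. A tiny demonstration of the predicate it relies on:

#include <stdio.h>

/* a lock mode is a single bit in a mask; anything else is malformed */
static int valid_mode(unsigned int mode)
{
	return mode != 0 && (mode & (mode - 1)) == 0;
}

int main(void)
{
	printf("0x02 -> %d\n", valid_mode(0x02));	/* one bit: ok */
	printf("0x06 -> %d\n", valid_mode(0x06));	/* two bits: reject */
	return 0;
}

Clearing the lowest set bit and checking for zero is cheaper than counting bits, which matters in a path run for every enqueue RPC.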
+ */
+	res = lock->l_resource;
+	if (!(flags & LDLM_FL_REPLAY)) {
+		/* non-replayed lock, delayed lvb init may need to be done */
+		rc = ldlm_lvbo_init(res);
+		if (rc < 0) {
+			LDLM_DEBUG(lock, "delayed lvb init failed (rc %d)", rc);
+			GOTO(out, rc);
+		}
+	}
+
+	OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_ENQUEUE_BLOCKED, obd_timeout * 2);
+	/*
+	 * Don't enqueue a lock onto the export if it has been disconnected
+	 * due to eviction (b=3822) or server umount (b=24324).
+	 * Cancel it now instead.
+	 */
+	if (req->rq_export->exp_disconnected) {
+		LDLM_ERROR(lock, "lock on disconnected export %p",
+			   req->rq_export);
+		GOTO(out, rc = -ENOTCONN);
+	}
+
+	lock->l_export = class_export_lock_get(req->rq_export, lock);
+	if (lock->l_export->exp_lock_hash)
+		cfs_hash_add(lock->l_export->exp_lock_hash,
+			     &lock->l_remote_handle,
+			     &lock->l_exp_hash);
+
+	/*
+	 * Inherit the enqueue flags before the operation, because we do not
+	 * keep the res lock on return and next operations (BL AST) may proceed
+	 * without them.
+	 */
+	lock->l_flags |= ldlm_flags_from_wire(dlm_req->lock_flags &
+					      LDLM_FL_INHERIT_MASK);
+
+	ldlm_convert_policy_to_local(req->rq_export,
+				     dlm_req->lock_desc.l_resource.lr_type,
+				     &dlm_req->lock_desc.l_policy_data,
+				     &lock->l_policy_data);
+	if (dlm_req->lock_desc.l_resource.lr_type == LDLM_EXTENT) {
+		lock->l_req_extent = lock->l_policy_data.l_extent;
+	} else if (dlm_req->lock_desc.l_resource.lr_type == LDLM_IBITS) {
+		lock->l_policy_data.l_inodebits.try_bits =
+			dlm_req->lock_desc.l_policy_data.l_inodebits.try_bits;
+		lock->l_policy_data.l_inodebits.li_gid =
+			dlm_req->lock_desc.l_policy_data.l_inodebits.li_gid;
+	}
+
+existing_lock:
+	cookie = req;
+	if (!(flags & LDLM_FL_HAS_INTENT)) {
+		/* based on the assumption that lvb size never changes during
+		 * resource lifetime; otherwise it needs resource->lr_lock's
+		 * protection */
+		req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB,
+				     RCL_SERVER, ldlm_lvbo_size(lock));
+
+		if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR))
+			GOTO(out, rc = -ENOMEM);
+
+		rc = req_capsule_server_pack(&req->rq_pill);
+		if (rc)
+			GOTO(out, rc);
+	}
+
+	err = ldlm_lock_enqueue(env, ns, &lock, cookie, &flags);
+	if (err) {
+		if ((int)err < 0)
+			rc = (int)err;
+		GOTO(out, err);
+	}
+
+	dlm_rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
+
+	ldlm_lock2desc(lock, &dlm_rep->lock_desc);
+	ldlm_lock2handle(lock, &dlm_rep->lock_handle);
+
+	if (lock && lock->l_resource->lr_type == LDLM_EXTENT)
+		OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_BL_EVICT, 6);
+
+	/*
+	 * We never send a blocking AST until the lock is granted, but
+	 * we can tell it right now
+	 */
+	lock_res_and_lock(lock);
+
+	/*
+	 * Now take into account flags to be inherited from original lock
+	 * request both in reply to client and in our own lock flags.
+	 */
+	dlm_rep->lock_flags = ldlm_flags_to_wire(flags);
+	lock->l_flags |= flags & LDLM_FL_INHERIT_MASK;
+
+	/*
+	 * Don't move a pending lock onto the export if it has already been
+	 * disconnected due to eviction (b=5683) or server umount (b=24324).
+	 * Cancel it now instead.
+ */ + if (unlikely(req->rq_export->exp_disconnected || + OBD_FAIL_CHECK(OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT))) { + LDLM_ERROR(lock, "lock on destroyed export %p", req->rq_export); + rc = -ENOTCONN; + } else if (ldlm_is_ast_sent(lock)) { + /* fill lock desc for possible lock convert */ + if (lock->l_blocking_lock && + lock->l_resource->lr_type == LDLM_IBITS) { + struct ldlm_lock *bl_lock = lock->l_blocking_lock; + struct ldlm_lock_desc *rep_desc = &dlm_rep->lock_desc; + + LDLM_DEBUG(lock, + "save blocking bits %llx in granted lock", + bl_lock->l_policy_data.l_inodebits.bits); + /* + * If lock is blocked then save blocking ibits + * in returned lock policy for the possible lock + * convert on a client. + */ + rep_desc->l_policy_data.l_inodebits.cancel_bits = + bl_lock->l_policy_data.l_inodebits.bits; + } + dlm_rep->lock_flags |= ldlm_flags_to_wire(LDLM_FL_AST_SENT); + if (ldlm_is_granted(lock)) { + /* + * Only cancel lock if it was granted, because it would + * be destroyed immediately and would never be granted + * in the future, causing timeouts on client. Not + * granted lock will be cancelled immediately after + * sending completion AST. + */ + if (ldlm_is_cancel_on_block(lock)) { + unlock_res_and_lock(lock); + ldlm_lock_cancel(lock); + lock_res_and_lock(lock); + } else { + ldlm_add_waiting_lock(lock, + ldlm_bl_timeout(lock)); + } + } + } + unlock_res_and_lock(lock); + + EXIT; +out: + req->rq_status = rc ?: err; /* return either error - b=11190 */ + if (!req->rq_packed_final) { + int rc1 = lustre_pack_reply(req, 1, NULL, NULL); + if (rc == 0) + rc = rc1; + } + + /* + * The LOCK_CHANGED code in ldlm_lock_enqueue depends on this + * ldlm_reprocess_all. If this moves, revisit that code. -phil + */ + if (lock != NULL) { + LDLM_DEBUG(lock, + "server-side enqueue handler, sending reply (err=%d, rc=%d)", + err, rc); + + if (rc == 0 && + req_capsule_has_field(&req->rq_pill, &RMF_DLM_LVB, + RCL_SERVER) && + ldlm_lvbo_size(lock) > 0) { + void *buf; + int buflen; + +retry: + buf = req_capsule_server_get(&req->rq_pill, + &RMF_DLM_LVB); + LASSERTF(buf != NULL, "req %p, lock %p\n", req, lock); + buflen = req_capsule_get_size(&req->rq_pill, + &RMF_DLM_LVB, RCL_SERVER); + /* + * non-replayed lock, delayed lvb init may + * need to be occur now + */ + if ((buflen > 0) && !(flags & LDLM_FL_REPLAY)) { + int rc2; + + rc2 = ldlm_lvbo_fill(lock, buf, &buflen); + if (rc2 >= 0) { + req_capsule_shrink(&req->rq_pill, + &RMF_DLM_LVB, + rc2, RCL_SERVER); + } else if (rc2 == -ERANGE) { + rc2 = req_capsule_server_grow( + &req->rq_pill, + &RMF_DLM_LVB, buflen); + if (!rc2) { + goto retry; + } else { + /* + * if we can't grow the buffer, + * it's ok to return empty lvb + * to client. 
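The retry loop above handles an LVB that turns out to be larger than the space reserved in the reply: ldlm_lvbo_fill() returns -ERANGE with the required size, the capsule is grown once, and the fill is retried. A userspace model of that protocol, with lvbo_fill() as a hypothetical stand-in and realloc() playing the role of req_capsule_server_grow(); error handling for realloc() is elided to keep the sketch short.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define LVB_NEED 40	/* bytes the value block actually takes */

/* on short buffers, report the needed size through *len, like the
 * -ERANGE case above */
static int lvbo_fill(char *buf, int *len)
{
	if (*len < LVB_NEED) {
		*len = LVB_NEED;
		return -ERANGE;
	}
	memset(buf, 'x', LVB_NEED);
	return LVB_NEED;	/* bytes actually written */
}

int main(void)
{
	int len = 16;		/* initial reservation, deliberately small */
	char *buf = malloc(len);
	int rc;

retry:
	rc = lvbo_fill(buf, &len);
	if (rc == -ERANGE) {
		buf = realloc(buf, len);	/* grow the capsule */
		goto retry;
	}
	printf("filled %d bytes after growing to %d\n", rc, len);
	free(buf);
	return 0;
}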
+ */ + req_capsule_shrink( + &req->rq_pill, + &RMF_DLM_LVB, 0, + RCL_SERVER); + } + } else { + rc = rc2; + } + } else if (flags & LDLM_FL_REPLAY) { + /* no LVB resend upon replay */ + if (buflen > 0) + req_capsule_shrink(&req->rq_pill, + &RMF_DLM_LVB, + 0, RCL_SERVER); + else + rc = buflen; + } else { + rc = buflen; + } + } + + if (rc != 0 && !(flags & LDLM_FL_RESENT)) { + if (lock->l_export) { + ldlm_lock_cancel(lock); + } else { + lock_res_and_lock(lock); + ldlm_resource_unlink_lock(lock); + ldlm_lock_destroy_nolock(lock); + unlock_res_and_lock(lock); + } + ldlm_reprocess_all(lock->l_resource, lock->l_policy_data.l_inodebits.bits); + } + + if (!err && !ldlm_is_cbpending(lock) && + dlm_req->lock_desc.l_resource.lr_type != LDLM_FLOCK) + ldlm_reprocess_all(lock->l_resource, + lock->l_policy_data.l_inodebits.bits); + + LDLM_LOCK_RELEASE(lock); + } + + LDLM_DEBUG_NOLOCK("server-side enqueue handler END (lock %p, rc %d)", + lock, rc); + + return rc; +} + +/* + * Clear the blocking lock, the race is possible between ldlm_handle_convert0() + * and ldlm_work_bl_ast_lock(), so this is done under lock with check for NULL. + */ +void ldlm_clear_blocking_lock(struct ldlm_lock *lock) +{ + if (lock->l_blocking_lock) { + LDLM_LOCK_RELEASE(lock->l_blocking_lock); + lock->l_blocking_lock = NULL; + } +} + +/* A lock can be converted to new ibits or mode and should be considered + * as new lock. Clear all states related to a previous blocking AST + * processing so new conflicts will cause new blocking ASTs. + * + * This is used during lock convert below and lock downgrade to COS mode in + * ldlm_lock_mode_downgrade(). + */ +void ldlm_clear_blocking_data(struct ldlm_lock *lock) +{ + ldlm_clear_ast_sent(lock); + lock->l_bl_ast_run = 0; + ldlm_clear_blocking_lock(lock); +} + +/** + * Main LDLM entry point for server code to process lock conversion requests. + */ +int ldlm_handle_convert0(struct ptlrpc_request *req, + const struct ldlm_request *dlm_req) +{ + struct obd_export *exp = req->rq_export; + struct ldlm_reply *dlm_rep; + struct ldlm_lock *lock; + __u64 bits; + __u64 new_bits; + int rc; + + ENTRY; + + if (exp && exp->exp_nid_stats && exp->exp_nid_stats->nid_ldlm_stats) + lprocfs_counter_incr(exp->exp_nid_stats->nid_ldlm_stats, + LDLM_CONVERT - LDLM_FIRST_OPC); + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(rc); + + dlm_rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + dlm_rep->lock_flags = dlm_req->lock_flags; + + lock = ldlm_handle2lock(&dlm_req->lock_handle[0]); + if (!lock) { + LDLM_DEBUG_NOLOCK("server lock is canceled already"); + req->rq_status = ELDLM_NO_LOCK_DATA; + RETURN(0); + } + + LDLM_DEBUG(lock, "server-side convert handler START"); + + lock_res_and_lock(lock); + bits = lock->l_policy_data.l_inodebits.bits; + new_bits = dlm_req->lock_desc.l_policy_data.l_inodebits.bits; + + if (ldlm_is_cancel(lock)) { + LDLM_DEBUG(lock, "convert on canceled lock!"); + unlock_res_and_lock(lock); + GOTO(out_put, rc = ELDLM_NO_LOCK_DATA); + } + + if (dlm_req->lock_desc.l_req_mode != lock->l_granted_mode) { + LDLM_ERROR(lock, "lock mode differs!"); + unlock_res_and_lock(lock); + GOTO(out_put, rc = -EPROTO); + } + + if (bits == new_bits) { + /* + * This can be valid situation if CONVERT RPCs are + * re-ordered. 
Just finish silently + */ + LDLM_DEBUG(lock, "lock is converted already!"); + unlock_res_and_lock(lock); + } else { + if (ldlm_is_waited(lock)) + ldlm_del_waiting_lock(lock); + + ldlm_clear_cbpending(lock); + lock->l_policy_data.l_inodebits.cancel_bits = 0; + ldlm_inodebits_drop(lock, bits & ~new_bits); + + ldlm_clear_blocking_data(lock); + unlock_res_and_lock(lock); + + /* All old bits should be reprocessed to send new BL AST if + * it wasn't sent earlier due to LDLM_FL_AST_SENT bit set. + * */ + ldlm_reprocess_all(lock->l_resource, bits); + } + + dlm_rep->lock_handle = lock->l_remote_handle; + ldlm_ibits_policy_local_to_wire(&lock->l_policy_data, + &dlm_rep->lock_desc.l_policy_data); + rc = ELDLM_OK; + EXIT; +out_put: + LDLM_DEBUG(lock, "server-side convert handler END, rc = %d", rc); + LDLM_LOCK_PUT(lock); + req->rq_status = rc; + return 0; +} + +/** + * Cancel all the locks whose handles are packed into ldlm_request + * + * Called by server code expecting such combined cancel activity + * requests. + */ +int ldlm_request_cancel(struct ptlrpc_request *req, + const struct ldlm_request *dlm_req, + int first, enum lustre_at_flags flags) +{ + struct ldlm_resource *res, *pres = NULL; + struct ldlm_lock *lock; + int i, count, done = 0; + unsigned int size; + + ENTRY; + + size = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT); + if (size <= offsetof(struct ldlm_request, lock_handle) || + (size - offsetof(struct ldlm_request, lock_handle)) / + sizeof(struct lustre_handle) < dlm_req->lock_count) + RETURN(0); + + count = dlm_req->lock_count ? dlm_req->lock_count : 1; + if (first >= count) + RETURN(0); + + if (count == 1 && dlm_req->lock_handle[0].cookie == 0) + RETURN(0); + + /* + * There is no lock on the server at the replay time, + * skip lock cancelling to make replay tests to pass. + */ + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) + RETURN(0); + + LDLM_DEBUG_NOLOCK("server-side cancel handler START: %d locks, starting at %d", + count, first); + + for (i = first; i < count; i++) { + lock = ldlm_handle2lock(&dlm_req->lock_handle[i]); + if (!lock) { + /* below message checked in replay-single.sh test_36 */ + LDLM_DEBUG_NOLOCK("server-side cancel handler stale lock (cookie %llu)", + dlm_req->lock_handle[i].cookie); + continue; + } + + res = lock->l_resource; + done++; + + /* + * This code is an optimization to only attempt lock + * granting on the resource (that could be CPU-expensive) + * after we are done cancelling lock in that resource. + */ + if (res != pres) { + if (pres != NULL) { + ldlm_reprocess_all(pres, 0); + LDLM_RESOURCE_DELREF(pres); + ldlm_resource_putref(pres); + } + if (res != NULL) { + ldlm_resource_getref(res); + LDLM_RESOURCE_ADDREF(res); + + if (!ldlm_is_discard_data(lock)) + ldlm_lvbo_update(res, lock, + NULL, 1); + } + pres = res; + } + + if ((flags & LATF_STATS) && ldlm_is_ast_sent(lock) && + lock->l_blast_sent != 0) { + timeout_t delay = 0; + + if (ktime_get_seconds() > lock->l_blast_sent) + delay = ktime_get_seconds() - + lock->l_blast_sent; + LDLM_DEBUG(lock, + "server cancels blocked lock after %ds", + delay); + at_measured(&lock->l_export->exp_bl_lock_at, delay); + } + ldlm_lock_cancel(lock); + LDLM_LOCK_PUT(lock); + } + if (pres != NULL) { + ldlm_reprocess_all(pres, 0); + LDLM_RESOURCE_DELREF(pres); + ldlm_resource_putref(pres); + } + LDLM_DEBUG_NOLOCK("server-side cancel handler END"); + RETURN(done); +} +EXPORT_SYMBOL(ldlm_request_cancel); + +/** + * Main LDLM entry point for server code to cancel locks. 
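ldlm_request_cancel() above defers the potentially expensive ldlm_reprocess_all() until the resource changes between consecutive handles, so a batch of N cancels on one resource costs a single reprocess pass. A minimal model of that run-length batching over the handle array:

#include <stdio.h>

int main(void)
{
	/* resource id of each lock handle in the cancel RPC */
	int res_of_lock[] = { 7, 7, 7, 3, 3, 9 };
	int n = 6, prev = -1, reprocessed = 0;

	for (int i = 0; i < n; i++) {
		if (res_of_lock[i] != prev) {
			if (prev != -1)
				reprocessed++;	/* flush previous resource */
			prev = res_of_lock[i];
		}
		/* ...cancel lock i... */
	}
	if (prev != -1)
		reprocessed++;			/* final flush */
	printf("%d cancels, %d reprocess passes\n", n, reprocessed);
	return 0;
}

The optimization assumes clients tend to pack cancels for one resource contiguously; a pathological interleaving degrades to one reprocess per lock, which is still correct, just slower.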
+ * + * Typically gets called from service handler on LDLM_CANCEL opc. + */ +int ldlm_handle_cancel(struct ptlrpc_request *req) +{ + struct ldlm_request *dlm_req; + int rc; + + ENTRY; + + dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + if (dlm_req == NULL) { + CDEBUG(D_INFO, "bad request buffer for cancel\n"); + RETURN(-EFAULT); + } + + if (req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT) < + offsetof(struct ldlm_request, lock_handle[1])) + RETURN(-EPROTO); + + if (req->rq_export && req->rq_export->exp_nid_stats && + req->rq_export->exp_nid_stats->nid_ldlm_stats) + lprocfs_counter_incr(req->rq_export->exp_nid_stats->nid_ldlm_stats, + LDLM_CANCEL - LDLM_FIRST_OPC); + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(rc); + + if (!ldlm_request_cancel(req, dlm_req, 0, LATF_STATS)) + req->rq_status = LUSTRE_ESTALE; + + RETURN(ptlrpc_reply(req)); +} +#endif /* HAVE_SERVER_SUPPORT */ + +/** + * Server may pass additional information about blocking lock. + * For IBITS locks it is conflicting bits which can be used for + * lock convert instead of cancel. + */ +void ldlm_bl_desc2lock(const struct ldlm_lock_desc *ld, struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + check_res_locked(lock->l_resource); + if (ns_is_client(ns) && ld && + (lock->l_resource->lr_type == LDLM_IBITS)) { + /* + * Lock description contains policy of blocking lock, + * and its cancel_bits is used to pass conflicting bits. + * NOTE: ld can be NULL or can be not NULL but zeroed if + * passed from ldlm_bl_thread_blwi(), check below used bits + * in ld to make sure it is valid description. + * + * If server may replace lock resource keeping the same cookie, + * never use cancel bits from different resource, full cancel + * is to be used. + */ + if (ld->l_policy_data.l_inodebits.cancel_bits && + ldlm_res_eq(&ld->l_resource.lr_name, + &lock->l_resource->lr_name) && + !(ldlm_is_cbpending(lock) && + lock->l_policy_data.l_inodebits.cancel_bits == 0)) { + /* always combine conflicting ibits */ + lock->l_policy_data.l_inodebits.cancel_bits |= + ld->l_policy_data.l_inodebits.cancel_bits; + } else { + /* If cancel_bits are not obtained or + * if the lock is already CBPENDING and + * has no cancel_bits set + * - the full lock is to be cancelled + */ + lock->l_policy_data.l_inodebits.cancel_bits = 0; + } + } +} + +/** + * Callback handler for receiving incoming blocking ASTs. + * + * This can only happen on client side. 
+ */ +void ldlm_handle_bl_callback(struct ldlm_namespace *ns, + struct ldlm_lock_desc *ld, struct ldlm_lock *lock) +{ + int do_ast; + + ENTRY; + + LDLM_DEBUG(lock, "client blocking AST callback handler"); + + lock_res_and_lock(lock); + + /* get extra information from desc if any */ + ldlm_bl_desc2lock(ld, lock); + ldlm_set_cbpending(lock); + + do_ast = (!lock->l_readers && !lock->l_writers); + unlock_res_and_lock(lock); + + if (do_ast) { + CDEBUG(D_DLMTRACE, + "Lock %p already unused, calling callback (%p)\n", + lock, lock->l_blocking_ast); + if (lock->l_blocking_ast != NULL) + lock->l_blocking_ast(lock, ld, lock->l_ast_data, + LDLM_CB_BLOCKING); + } else { + CDEBUG(D_DLMTRACE, + "Lock %p is referenced, will be cancelled later\n", + lock); + } + + LDLM_DEBUG(lock, "client blocking callback handler END"); + LDLM_LOCK_RELEASE(lock); + EXIT; +} + +static int ldlm_callback_reply(struct ptlrpc_request *req, int rc) +{ + if (req->rq_no_reply) + return 0; + + req->rq_status = rc; + if (!req->rq_packed_final) { + rc = lustre_pack_reply(req, 1, NULL, NULL); + if (rc) + return rc; + } + return ptlrpc_reply(req); +} + +/** + * Callback handler for receiving incoming completion ASTs. + * + * This only can happen on client side. + */ +static int ldlm_handle_cp_callback(struct ptlrpc_request *req, + struct ldlm_namespace *ns, + struct ldlm_request *dlm_req, + struct ldlm_lock *lock) +{ + LIST_HEAD(ast_list); + int lvb_len; + int rc = 0; + + ENTRY; + + LDLM_DEBUG(lock, "client completion callback handler START"); + + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) { + long to = cfs_time_seconds(1); + + ldlm_callback_reply(req, 0); + + while (to > 0) { + to = schedule_timeout_interruptible(to); + if (ldlm_is_granted(lock) || + ldlm_is_destroyed(lock)) + break; + } + } + + lvb_len = req_capsule_get_size(&req->rq_pill, &RMF_DLM_LVB, RCL_CLIENT); + if (lvb_len < 0) { + LDLM_ERROR(lock, "Fail to get lvb_len, rc = %d", lvb_len); + GOTO(out, rc = lvb_len); + } else if (lvb_len > 0) { + if (lock->l_lvb_len > 0) { + /* for extent lock, lvb contains ost_lvb{}. */ + LASSERT(lock->l_lvb_data != NULL); + + if (unlikely(lock->l_lvb_len < lvb_len)) { + LDLM_ERROR(lock, + "Replied LVB is larger than expectation, expected = %d, replied = %d", + lock->l_lvb_len, lvb_len); + GOTO(out, rc = -EINVAL); + } + } + } + + lock_res_and_lock(lock); + + if (!ldlm_res_eq(&dlm_req->lock_desc.l_resource.lr_name, + &lock->l_resource->lr_name)) { + ldlm_resource_unlink_lock(lock); + unlock_res_and_lock(lock); + rc = ldlm_lock_change_resource(ns, lock, + &dlm_req->lock_desc.l_resource.lr_name); + if (rc < 0) { + LDLM_ERROR(lock, "Failed to allocate resource"); + GOTO(out, rc); + } + LDLM_DEBUG(lock, "completion AST, new resource"); + lock_res_and_lock(lock); + } + + if (ldlm_is_failed(lock)) { + unlock_res_and_lock(lock); + LDLM_LOCK_RELEASE(lock); + RETURN(-EINVAL); + } + + if (ldlm_is_destroyed(lock) || + ldlm_is_granted(lock)) { + /* b=11300: the lock has already been granted */ + unlock_res_and_lock(lock); + LDLM_DEBUG(lock, "Double grant race happened"); + GOTO(out, rc = 0); + } + + /* + * If we receive the completion AST before the actual enqueue returned, + * then we might need to switch lock modes, resources, or extents. 
+ */ + if (dlm_req->lock_desc.l_granted_mode != lock->l_req_mode) { + lock->l_req_mode = dlm_req->lock_desc.l_granted_mode; + LDLM_DEBUG(lock, "completion AST, new lock mode"); + } + + if (lock->l_resource->lr_type != LDLM_PLAIN) { + ldlm_convert_policy_to_local(req->rq_export, + dlm_req->lock_desc.l_resource.lr_type, + &dlm_req->lock_desc.l_policy_data, + &lock->l_policy_data); + LDLM_DEBUG(lock, "completion AST, new policy data"); + } + + ldlm_resource_unlink_lock(lock); + + if (dlm_req->lock_flags & LDLM_FL_AST_SENT) { + /* + * BL_AST locks are not needed in LRU. + * Let ldlm_cancel_lru() be fast. + */ + ldlm_lock_remove_from_lru(lock); + ldlm_bl_desc2lock(&dlm_req->lock_desc, lock); + lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_BL_AST; + LDLM_DEBUG(lock, "completion AST includes blocking AST"); + } + + if (lock->l_lvb_len > 0) { + rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_CLIENT, + lock->l_lvb_data, lvb_len); + if (rc < 0) { + unlock_res_and_lock(lock); + GOTO(out, rc); + } + } + + ldlm_grant_lock(lock, &ast_list); + unlock_res_and_lock(lock); + + LDLM_DEBUG(lock, "callback handler finished, about to run_ast_work"); + + /* + * Let Enqueue to call osc_lock_upcall() and initialize + * l_ast_data + */ + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 2); + + ldlm_run_ast_work(ns, &ast_list, LDLM_WORK_CP_AST); + + LDLM_DEBUG_NOLOCK("client completion callback handler END (lock %p)", + lock); + GOTO(out, rc); + +out: + if (rc < 0) { + lock_res_and_lock(lock); + ldlm_set_failed(lock); + unlock_res_and_lock(lock); + wake_up(&lock->l_waitq); + } + LDLM_LOCK_RELEASE(lock); + + return 0; +} + +/** + * Callback handler for receiving incoming glimpse ASTs. + * + * This only can happen on client side. After handling the glimpse AST + * we also consider dropping the lock here if it is unused locally for a + * long time. 
+ */ +static void ldlm_handle_gl_callback(struct ptlrpc_request *req, + struct ldlm_namespace *ns, + struct ldlm_request *dlm_req, + struct ldlm_lock *lock) +{ + struct ldlm_lock_desc *ld = &dlm_req->lock_desc; + int rc = -ENOSYS; + + ENTRY; + + LDLM_DEBUG(lock, "client glimpse AST callback handler"); + + if (lock->l_glimpse_ast != NULL) + rc = lock->l_glimpse_ast(lock, req); + + if (req->rq_repmsg != NULL) { + ptlrpc_reply(req); + } else { + req->rq_status = rc; + ptlrpc_error(req); + } + + lock_res_and_lock(lock); + if (lock->l_granted_mode == LCK_PW && + !lock->l_readers && !lock->l_writers && + ktime_after(ktime_get(), + ktime_add(lock->l_last_used, ns->ns_dirty_age_limit))) { + unlock_res_and_lock(lock); + + /* For MDS glimpse it is always DOM lock, set corresponding + * cancel_bits to perform lock convert if needed + */ + if (lock->l_resource->lr_type == LDLM_IBITS) + ld->l_policy_data.l_inodebits.cancel_bits = + MDS_INODELOCK_DOM; + if (ldlm_bl_to_thread_lock(ns, ld, lock)) + ldlm_handle_bl_callback(ns, ld, lock); + + EXIT; + return; + } + unlock_res_and_lock(lock); + LDLM_LOCK_RELEASE(lock); + EXIT; +} + +static int __ldlm_bl_to_thread(struct ldlm_bl_work_item *blwi, + enum ldlm_cancel_flags cancel_flags) +{ + struct ldlm_bl_pool *blp = ldlm_state->ldlm_bl_pool; + + ENTRY; + + spin_lock(&blp->blp_lock); + if (blwi->blwi_lock && + ldlm_is_discard_data(blwi->blwi_lock)) { + /* add LDLM_FL_DISCARD_DATA requests to the priority list */ + list_add_tail(&blwi->blwi_entry, &blp->blp_prio_list); + } else { + /* other blocking callbacks are added to the regular list */ + list_add_tail(&blwi->blwi_entry, &blp->blp_list); + } + spin_unlock(&blp->blp_lock); + + wake_up(&blp->blp_waitq); + + /* + * can not check blwi->blwi_flags as blwi could be already freed in + * LCF_ASYNC mode + */ + if (!(cancel_flags & LCF_ASYNC)) + wait_for_completion(&blwi->blwi_comp); + + RETURN(0); +} + +static inline void init_blwi(struct ldlm_bl_work_item *blwi, + struct ldlm_namespace *ns, + struct ldlm_lock_desc *ld, + struct list_head *cancels, int count, + struct ldlm_lock *lock, + enum ldlm_cancel_flags cancel_flags) +{ + init_completion(&blwi->blwi_comp); + INIT_LIST_HEAD(&blwi->blwi_head); + + if (current->flags & PF_MEMALLOC) + blwi->blwi_mem_pressure = 1; + + blwi->blwi_ns = ns; + blwi->blwi_flags = cancel_flags; + if (ld != NULL) + blwi->blwi_ld = *ld; + if (count) { + list_splice_init(cancels, &blwi->blwi_head); + blwi->blwi_count = count; + } else { + blwi->blwi_lock = lock; + } +} + +/** + * Queues a list of locks \a cancels containing \a count locks + * for later processing by a blocking thread. If \a count is zero, + * then the lock referenced as \a lock is queued instead. + * + * The blocking thread would then call ->l_blocking_ast callback in the lock. + * If list addition fails an error is returned and caller is supposed to + * call ->l_blocking_ast itself. 
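__ldlm_bl_to_thread() above feeds a pool of worker threads through two queues: LDLM_FL_DISCARD_DATA work goes on a priority list that workers always drain first, and everything else waits on the regular list. A minimal single-threaded model of that two-queue dispatch; the list plumbing is hand-rolled here only to keep the sketch self-contained.

#include <stdio.h>

struct work { struct work *next; int prio; int id; };

static struct work *prio_q, *norm_q;

static void enqueue(struct work *w, struct work **q)
{
	while (*q)
		q = &(*q)->next;	/* append, like list_add_tail */
	w->next = NULL;
	*q = w;
}

static struct work *dequeue(void)
{
	struct work **q = prio_q ? &prio_q : &norm_q;	/* prio first */
	struct work *w = *q;

	if (w)
		*q = w->next;
	return w;
}

int main(void)
{
	struct work a = { .id = 1 }, b = { .prio = 1, .id = 2 };

	enqueue(&a, &norm_q);
	enqueue(&b, &prio_q);
	for (struct work *w; (w = dequeue());)
		printf("run %d\n", w->id);	/* 2 then 1 */
	return 0;
}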
+ */ +static int ldlm_bl_to_thread(struct ldlm_namespace *ns, + struct ldlm_lock_desc *ld, + struct ldlm_lock *lock, + struct list_head *cancels, int count, + enum ldlm_cancel_flags cancel_flags) +{ + ENTRY; + + if (cancels && count == 0) + RETURN(0); + + if (cancel_flags & LCF_ASYNC) { + struct ldlm_bl_work_item *blwi; + + OBD_ALLOC(blwi, sizeof(*blwi)); + if (blwi == NULL) + RETURN(-ENOMEM); + init_blwi(blwi, ns, ld, cancels, count, lock, cancel_flags); + + RETURN(__ldlm_bl_to_thread(blwi, cancel_flags)); + } else { + /* + * if it is synchronous call do minimum mem alloc, as it could + * be triggered from kernel shrinker + */ + struct ldlm_bl_work_item blwi; + + memset(&blwi, 0, sizeof(blwi)); + init_blwi(&blwi, ns, ld, cancels, count, lock, cancel_flags); + RETURN(__ldlm_bl_to_thread(&blwi, cancel_flags)); + } +} + + +int ldlm_bl_to_thread_lock(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld, + struct ldlm_lock *lock) +{ + return ldlm_bl_to_thread(ns, ld, lock, NULL, 0, LCF_ASYNC); +} + +int ldlm_bl_to_thread_list(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld, + struct list_head *cancels, int count, + enum ldlm_cancel_flags cancel_flags) +{ + return ldlm_bl_to_thread(ns, ld, NULL, cancels, count, cancel_flags); +} + +int ldlm_bl_to_thread_ns(struct ldlm_namespace *ns) +{ + return ldlm_bl_to_thread(ns, NULL, NULL, NULL, 0, LCF_ASYNC); +} + +int ldlm_bl_thread_wakeup(void) +{ + wake_up(&ldlm_state->ldlm_bl_pool->blp_waitq); + return 0; +} + +/* Setinfo coming from Server (eg MDT) to Client (eg MDC)! */ +static int ldlm_handle_setinfo(struct ptlrpc_request *req) +{ + struct obd_device *obd = req->rq_export->exp_obd; + char *key; + void *val; + int keylen, vallen; + int rc = -ENOSYS; + + ENTRY; + + DEBUG_REQ(D_HSM, req, "%s: handle setinfo", obd->obd_name); + + req_capsule_set(&req->rq_pill, &RQF_OBD_SET_INFO); + + key = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY); + if (key == NULL) { + DEBUG_REQ(D_IOCTL, req, "no set_info key"); + RETURN(-EFAULT); + } + keylen = req_capsule_get_size(&req->rq_pill, &RMF_SETINFO_KEY, + RCL_CLIENT); + val = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_VAL); + if (val == NULL) { + DEBUG_REQ(D_IOCTL, req, "no set_info val"); + RETURN(-EFAULT); + } + vallen = req_capsule_get_size(&req->rq_pill, &RMF_SETINFO_VAL, + RCL_CLIENT); + + /* We are responsible for swabbing contents of val */ + + if (KEY_IS(KEY_HSM_COPYTOOL_SEND)) + /* Pass it on to mdc (the "export" in this case) */ + rc = obd_set_info_async(req->rq_svc_thread->t_env, + req->rq_export, + sizeof(KEY_HSM_COPYTOOL_SEND), + KEY_HSM_COPYTOOL_SEND, + vallen, val, NULL); + else + DEBUG_REQ(D_WARNING, req, "ignoring unknown key '%s'", key); + + return rc; +} + +static inline void ldlm_callback_errmsg(struct ptlrpc_request *req, + const char *msg, int rc, + const struct lustre_handle *handle) +{ + DEBUG_REQ((req->rq_no_reply || rc) ? D_WARNING : D_DLMTRACE, req, + "%s, NID=%s lock=%#llx: rc = %d", + msg, libcfs_id2str(req->rq_peer), + handle ? handle->cookie : 0, rc); + if (req->rq_no_reply) + CWARN("No reply was sent, maybe cause b=21636.\n"); + else if (rc) + CWARN("Send reply failed, maybe cause b=21636.\n"); +} + +/* TODO: handle requests in a similar way as MDT: see mdt_handle_common() */ +static int ldlm_callback_handler(struct ptlrpc_request *req) +{ + struct ldlm_namespace *ns; + struct ldlm_request *dlm_req; + struct ldlm_lock *lock; + int rc; + + ENTRY; + + /* + * Requests arrive in sender's byte order. 
The ptlrpc service + * handler has already checked and, if necessary, byte-swapped the + * incoming request message body, but I am responsible for the + * message buffers. + */ + + /* do nothing for sec context finalize */ + if (lustre_msg_get_opc(req->rq_reqmsg) == SEC_CTX_FINI) + RETURN(0); + + req_capsule_init(&req->rq_pill, req, RCL_SERVER); + + if (req->rq_export == NULL) { + rc = ldlm_callback_reply(req, -ENOTCONN); + ldlm_callback_errmsg(req, "Operate on unconnected server", + rc, NULL); + RETURN(0); + } + + LASSERT(req->rq_export != NULL); + LASSERT(req->rq_export->exp_obd != NULL); + + switch (lustre_msg_get_opc(req->rq_reqmsg)) { + case LDLM_BL_CALLBACK: + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET)) { + if (cfs_fail_err) + ldlm_callback_reply(req, -(int)cfs_fail_err); + RETURN(0); + } + break; + case LDLM_CP_CALLBACK: + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CP_CALLBACK_NET)) + RETURN(0); + break; + case LDLM_GL_CALLBACK: + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_GL_CALLBACK_NET)) + RETURN(0); + break; + case LDLM_SET_INFO: + rc = ldlm_handle_setinfo(req); + ldlm_callback_reply(req, rc); + RETURN(0); + default: + CERROR("unknown opcode %u\n", + lustre_msg_get_opc(req->rq_reqmsg)); + ldlm_callback_reply(req, -EPROTO); + RETURN(0); + } + + ns = req->rq_export->exp_obd->obd_namespace; + LASSERT(ns != NULL); + + req_capsule_set(&req->rq_pill, &RQF_LDLM_CALLBACK); + + dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + if (dlm_req == NULL) { + rc = ldlm_callback_reply(req, -EPROTO); + ldlm_callback_errmsg(req, "Operate without parameter", rc, + NULL); + RETURN(0); + } + + /* + * Force a known safe race, send a cancel to the server for a lock + * which the server has already started a blocking callback on. + */ + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE) && + lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) { + rc = ldlm_cli_cancel(&dlm_req->lock_handle[0], 0); + if (rc < 0) + CERROR("ldlm_cli_cancel: %d\n", rc); + } + + lock = ldlm_handle2lock_long(&dlm_req->lock_handle[0], 0); + if (!lock) { + CDEBUG(D_DLMTRACE, + "callback on lock %#llx - lock disappeared\n", + dlm_req->lock_handle[0].cookie); + rc = ldlm_callback_reply(req, -EINVAL); + ldlm_callback_errmsg(req, "Operate with invalid parameter", rc, + &dlm_req->lock_handle[0]); + RETURN(0); + } + + if (ldlm_is_fail_loc(lock) && + lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) + OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE); + + /* Copy hints/flags (e.g. LDLM_FL_DISCARD_DATA) from AST. */ + lock_res_and_lock(lock); + lock->l_flags |= ldlm_flags_from_wire(dlm_req->lock_flags & + LDLM_FL_AST_MASK); + if (lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) { + /* + * If somebody cancels lock and cache is already dropped, + * or lock is failed before cp_ast received on client, + * we can tell the server we have no lock. Otherwise, we + * should send cancel after dropping the cache. + */ + if ((ldlm_is_canceling(lock) && ldlm_is_bl_done(lock)) || + ldlm_is_failed(lock)) { + LDLM_DEBUG(lock, + "callback on lock %llx - lock disappeared", + dlm_req->lock_handle[0].cookie); + unlock_res_and_lock(lock); + LDLM_LOCK_RELEASE(lock); + rc = ldlm_callback_reply(req, -EINVAL); + ldlm_callback_errmsg(req, "Operate on stale lock", rc, + &dlm_req->lock_handle[0]); + RETURN(0); + } + /* + * BL_AST locks are not needed in LRU. + * Let ldlm_cancel_lru() be fast. 
+ */ + ldlm_lock_remove_from_lru(lock); + ldlm_set_bl_ast(lock); + } + if (lock->l_remote_handle.cookie == 0) + lock->l_remote_handle = dlm_req->lock_handle[1]; + unlock_res_and_lock(lock); + + /* + * We want the ost thread to get this reply so that it can respond + * to ost requests (write cache writeback) that might be triggered + * in the callback. + * + * But we'd also like to be able to indicate in the reply that we're + * cancelling right now, because it's unused, or have an intent result + * in the reply, so we might have to push the responsibility for sending + * the reply down into the AST handlers, alas. + */ + + switch (lustre_msg_get_opc(req->rq_reqmsg)) { + case LDLM_BL_CALLBACK: + CDEBUG(D_INODE, "blocking ast\n"); + req_capsule_extend(&req->rq_pill, &RQF_LDLM_BL_CALLBACK); + if (!ldlm_is_cancel_on_block(lock)) { + rc = ldlm_callback_reply(req, 0); + if (req->rq_no_reply || rc) + ldlm_callback_errmsg(req, "Normal process", rc, + &dlm_req->lock_handle[0]); + } + if (ldlm_bl_to_thread_lock(ns, &dlm_req->lock_desc, lock)) + ldlm_handle_bl_callback(ns, &dlm_req->lock_desc, lock); + break; + case LDLM_CP_CALLBACK: + CDEBUG(D_INODE, "completion ast\n"); + req_capsule_extend(&req->rq_pill, &RQF_LDLM_CP_CALLBACK); + rc = ldlm_handle_cp_callback(req, ns, dlm_req, lock); + if (!OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) + ldlm_callback_reply(req, rc); + break; + case LDLM_GL_CALLBACK: + CDEBUG(D_INODE, "glimpse ast\n"); + req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK); + ldlm_handle_gl_callback(req, ns, dlm_req, lock); + break; + default: + LBUG(); /* checked above */ + } + + RETURN(0); +} + +#ifdef HAVE_SERVER_SUPPORT +/** + * Main handler for canceld thread. + * + * Separated into its own thread to avoid deadlocks. + */ +static int ldlm_cancel_handler(struct ptlrpc_request *req) +{ + int rc; + + ENTRY; + + /* + * Requests arrive in sender's byte order. The ptlrpc service + * handler has already checked and, if necessary, byte-swapped the + * incoming request message body, but I am responsible for the + * message buffers. 
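+	 *
+	 * In practice (a sketch of the convention, not a guarantee of
+	 * the exact call chain): fixed-format header fields can be used
+	 * directly, while opcode-specific buffers such as struct
+	 * ldlm_request are swabbed when pulled from the capsule, e.g. by
+	 * req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ) below.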
+ */ + + req_capsule_init(&req->rq_pill, req, RCL_SERVER); + + if (req->rq_export == NULL) { + struct ldlm_request *dlm_req; + + CERROR("%s from %s arrived at %llu with bad export cookie %llu\n", + ll_opcode2str(lustre_msg_get_opc(req->rq_reqmsg)), + libcfs_nid2str(req->rq_peer.nid), + (unsigned long long)req->rq_arrival_time.tv_sec, + lustre_msg_get_handle(req->rq_reqmsg)->cookie); + + if (lustre_msg_get_opc(req->rq_reqmsg) == LDLM_CANCEL) { + req_capsule_set(&req->rq_pill, &RQF_LDLM_CALLBACK); + dlm_req = req_capsule_client_get(&req->rq_pill, + &RMF_DLM_REQ); + if (dlm_req != NULL) + ldlm_lock_dump_handle(D_ERROR, + &dlm_req->lock_handle[0]); + } + ldlm_callback_reply(req, -ENOTCONN); + RETURN(0); + } + + switch (lustre_msg_get_opc(req->rq_reqmsg)) { + /* XXX FIXME move this back to mds/handler.c, b=249 */ + case LDLM_CANCEL: + req_capsule_set(&req->rq_pill, &RQF_LDLM_CANCEL); + CDEBUG(D_INODE, "cancel\n"); + if (CFS_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_NET) || + CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND) || + CFS_FAIL_CHECK(OBD_FAIL_LDLM_BL_EVICT)) + RETURN(0); + rc = ldlm_handle_cancel(req); + break; + case LDLM_CONVERT: + { + struct ldlm_request *dlm_req; + + req_capsule_set(&req->rq_pill, &RQF_LDLM_CONVERT); + CDEBUG(D_INODE, "convert\n"); + + dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + if (dlm_req == NULL) { + CDEBUG(D_INFO, "bad request buffer for cancel\n"); + rc = ldlm_callback_reply(req, -EPROTO); + } else { + req->rq_status = ldlm_handle_convert0(req, dlm_req); + rc = ptlrpc_reply(req); + } + break; + } + default: + CERROR("invalid opcode %d\n", + lustre_msg_get_opc(req->rq_reqmsg)); + req_capsule_set(&req->rq_pill, &RQF_LDLM_CALLBACK); + rc = ldlm_callback_reply(req, -EINVAL); + } + + RETURN(rc); +} + +static int ldlm_cancel_hpreq_lock_match(struct ptlrpc_request *req, + struct ldlm_lock *lock) +{ + struct ldlm_request *dlm_req; + struct lustre_handle lockh; + int rc = 0; + int i; + + ENTRY; + + dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + if (dlm_req == NULL) + RETURN(0); + + ldlm_lock2handle(lock, &lockh); + for (i = 0; i < dlm_req->lock_count; i++) { + if (lustre_handle_equal(&dlm_req->lock_handle[i], + &lockh)) { + DEBUG_REQ(D_RPCTRACE, req, + "Prio raised by lock %#llx", lockh.cookie); + rc = 1; + break; + } + } + + RETURN(rc); +} + +static int ldlm_cancel_hpreq_check(struct ptlrpc_request *req) +{ + struct ldlm_request *dlm_req; + int rc = 0; + int i; + unsigned int size; + + ENTRY; + + /* no prolong in recovery */ + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) + RETURN(0); + + dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + if (dlm_req == NULL) + RETURN(-EFAULT); + + size = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT); + if (size <= offsetof(struct ldlm_request, lock_handle) || + (size - offsetof(struct ldlm_request, lock_handle)) / + sizeof(struct lustre_handle) < dlm_req->lock_count) + RETURN(-EPROTO); + + for (i = 0; i < dlm_req->lock_count; i++) { + struct ldlm_lock *lock; + + lock = ldlm_handle2lock(&dlm_req->lock_handle[i]); + if (lock == NULL) + continue; + + rc = ldlm_is_ast_sent(lock) ? 
1 : 0; + if (rc) + LDLM_DEBUG(lock, "hpreq cancel/convert lock"); + LDLM_LOCK_PUT(lock); + + if (rc) + break; + } + + RETURN(rc); +} + +static struct ptlrpc_hpreq_ops ldlm_cancel_hpreq_ops = { + .hpreq_lock_match = ldlm_cancel_hpreq_lock_match, + .hpreq_check = ldlm_cancel_hpreq_check, + .hpreq_fini = NULL, +}; + +static int ldlm_hpreq_handler(struct ptlrpc_request *req) +{ + ENTRY; + + req_capsule_init(&req->rq_pill, req, RCL_SERVER); + + if (req->rq_export == NULL) + RETURN(0); + + if (LDLM_CANCEL == lustre_msg_get_opc(req->rq_reqmsg)) { + req_capsule_set(&req->rq_pill, &RQF_LDLM_CANCEL); + req->rq_ops = &ldlm_cancel_hpreq_ops; + } else if (LDLM_CONVERT == lustre_msg_get_opc(req->rq_reqmsg)) { + req_capsule_set(&req->rq_pill, &RQF_LDLM_CONVERT); + req->rq_ops = &ldlm_cancel_hpreq_ops; + } + RETURN(0); +} + +static int ldlm_revoke_lock_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) + +{ + struct list_head *rpc_list = data; + struct ldlm_lock *lock = cfs_hash_object(hs, hnode); + + lock_res_and_lock(lock); + + if (!ldlm_is_granted(lock)) { + unlock_res_and_lock(lock); + return 0; + } + + LASSERT(lock->l_resource); + if (lock->l_resource->lr_type != LDLM_IBITS && + lock->l_resource->lr_type != LDLM_PLAIN) { + unlock_res_and_lock(lock); + return 0; + } + + if (ldlm_is_ast_sent(lock)) { + unlock_res_and_lock(lock); + return 0; + } + + LASSERT(lock->l_blocking_ast); + LASSERT(!lock->l_blocking_lock); + + ldlm_set_ast_sent(lock); + if (lock->l_export && lock->l_export->exp_lock_hash) { + /* + * NB: it's safe to call cfs_hash_del() even lock isn't + * in exp_lock_hash. + */ + /* + * In the function below, .hs_keycmp resolves to + * ldlm_export_lock_keycmp() + */ + /* coverity[overrun-buffer-val] */ + cfs_hash_del(lock->l_export->exp_lock_hash, + &lock->l_remote_handle, &lock->l_exp_hash); + } + + list_add_tail(&lock->l_rk_ast, rpc_list); + LDLM_LOCK_GET(lock); + + unlock_res_and_lock(lock); + return 0; +} + +void ldlm_revoke_export_locks(struct obd_export *exp) +{ + int rc; + LIST_HEAD(rpc_list); + ENTRY; + + cfs_hash_for_each_nolock(exp->exp_lock_hash, + ldlm_revoke_lock_cb, &rpc_list, 0); + rc = ldlm_run_ast_work(exp->exp_obd->obd_namespace, &rpc_list, + LDLM_WORK_REVOKE_AST); + + if (rc == -ERESTART) + ldlm_reprocess_recovery_done(exp->exp_obd->obd_namespace); + + EXIT; +} +EXPORT_SYMBOL(ldlm_revoke_export_locks); +#endif /* HAVE_SERVER_SUPPORT */ + +static int ldlm_bl_get_work(struct ldlm_bl_pool *blp, + struct ldlm_bl_work_item **p_blwi, + struct obd_export **p_exp) +{ + struct ldlm_bl_work_item *blwi = NULL; + static unsigned int num_bl; + static unsigned int num_stale; + int num_th = atomic_read(&blp->blp_num_threads); + + *p_exp = obd_stale_export_get(); + + spin_lock(&blp->blp_lock); + if (*p_exp != NULL) { + if (num_th == 1 || ++num_stale < num_th) { + spin_unlock(&blp->blp_lock); + return 1; + } + num_stale = 0; + } + + /* process a request from the blp_list at least every blp_num_threads */ + if (!list_empty(&blp->blp_list) && + (list_empty(&blp->blp_prio_list) || num_bl == 0)) + blwi = list_entry(blp->blp_list.next, + struct ldlm_bl_work_item, blwi_entry); + else + if (!list_empty(&blp->blp_prio_list)) + blwi = list_entry(blp->blp_prio_list.next, + struct ldlm_bl_work_item, + blwi_entry); + + if (blwi) { + if (++num_bl >= num_th) + num_bl = 0; + list_del(&blwi->blwi_entry); + } + spin_unlock(&blp->blp_lock); + *p_blwi = blwi; + + if (*p_exp != NULL && *p_blwi != NULL) { + obd_stale_export_put(*p_exp); + *p_exp = NULL; + } + + return (*p_blwi 
!= NULL || *p_exp != NULL) ? 1 : 0; +} + +/* This only contains temporary data until the thread starts */ +struct ldlm_bl_thread_data { + struct ldlm_bl_pool *bltd_blp; + struct completion bltd_comp; + int bltd_num; +}; + +static int ldlm_bl_thread_main(void *arg); + +static int ldlm_bl_thread_start(struct ldlm_bl_pool *blp, bool check_busy) +{ + struct ldlm_bl_thread_data bltd = { .bltd_blp = blp }; + struct task_struct *task; + + init_completion(&bltd.bltd_comp); + + bltd.bltd_num = atomic_inc_return(&blp->blp_num_threads); + if (bltd.bltd_num >= blp->blp_max_threads) { + atomic_dec(&blp->blp_num_threads); + return 0; + } + + LASSERTF(bltd.bltd_num > 0, "thread num:%d\n", bltd.bltd_num); + if (check_busy && + atomic_read(&blp->blp_busy_threads) < (bltd.bltd_num - 1)) { + atomic_dec(&blp->blp_num_threads); + return 0; + } + + task = kthread_run(ldlm_bl_thread_main, &bltd, "ldlm_bl_%02d", + bltd.bltd_num); + if (IS_ERR(task)) { + CERROR("cannot start LDLM thread ldlm_bl_%02d: rc %ld\n", + bltd.bltd_num, PTR_ERR(task)); + atomic_dec(&blp->blp_num_threads); + return PTR_ERR(task); + } + wait_for_completion(&bltd.bltd_comp); + + return 0; +} + +/* Not fatal if racy and have a few too many threads */ +static int ldlm_bl_thread_need_create(struct ldlm_bl_pool *blp, + struct ldlm_bl_work_item *blwi) +{ + if (atomic_read(&blp->blp_num_threads) >= blp->blp_max_threads) + return 0; + + if (atomic_read(&blp->blp_busy_threads) < + atomic_read(&blp->blp_num_threads)) + return 0; + + if (blwi != NULL && (blwi->blwi_ns == NULL || + blwi->blwi_mem_pressure)) + return 0; + + return 1; +} + +static int ldlm_bl_thread_blwi(struct ldlm_bl_pool *blp, + struct ldlm_bl_work_item *blwi) +{ + /* '1' for consistency with code that checks !mpflag to restore */ + unsigned int mpflags = 1; + + ENTRY; + + if (blwi->blwi_ns == NULL) + /* added by ldlm_cleanup() */ + RETURN(LDLM_ITER_STOP); + + if (blwi->blwi_mem_pressure) + mpflags = memalloc_noreclaim_save(); + + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL2, 4); + + if (blwi->blwi_count) { + int count; + /* + * The special case when we cancel locks in lru + * asynchronously, we pass the list of locks here. + * Thus locks are marked LDLM_FL_CANCELING, but NOT + * canceled locally yet. + */ + count = ldlm_cli_cancel_list_local(&blwi->blwi_head, + blwi->blwi_count, + LCF_BL_AST); + ldlm_cli_cancel_list(&blwi->blwi_head, count, NULL, + blwi->blwi_flags); + } else if (blwi->blwi_lock) { + ldlm_handle_bl_callback(blwi->blwi_ns, &blwi->blwi_ld, + blwi->blwi_lock); + } else { + ldlm_pool_recalc(&blwi->blwi_ns->ns_pool, true); + spin_lock(&blwi->blwi_ns->ns_lock); + blwi->blwi_ns->ns_rpc_recalc = 0; + spin_unlock(&blwi->blwi_ns->ns_lock); + ldlm_namespace_put(blwi->blwi_ns); + } + + if (blwi->blwi_mem_pressure) + memalloc_noreclaim_restore(mpflags); + + if (blwi->blwi_flags & LCF_ASYNC) + OBD_FREE(blwi, sizeof(*blwi)); + else + complete(&blwi->blwi_comp); + + RETURN(0); +} + +/** + * Cancel stale locks on export. Cancel blocked locks first. + * If the given export has blocked locks, the next in the list may have + * them too, thus cancel not blocked locks only if the current export has + * no blocked locks. + **/ +static int ldlm_bl_thread_exports(struct ldlm_bl_pool *blp, + struct obd_export *exp) +{ + int num; + + ENTRY; + + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_BL_EVICT, 4); + + num = ldlm_export_cancel_blocked_locks(exp); + if (num == 0) + ldlm_export_cancel_locks(exp); + + obd_stale_export_put(exp); + + RETURN(0); +} + + +/** + * Main blocking requests processing thread. 
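+ *
+ * (Work is dequeued via ldlm_bl_get_work(), which alternates between
+ * blp_prio_list and blp_list so that neither starves: a round-robin
+ * counter forces the regular list to be served at least once every
+ * blp_num_threads dequeues, and stale exports are interleaved with
+ * queued work items in the same fashion.)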
+ * + * Callers put locks into its queue by calling ldlm_bl_to_thread. + * This thread in the end ends up doing actual call to ->l_blocking_ast + * for queued locks. + */ +static int ldlm_bl_thread_main(void *arg) +{ + struct lu_env *env; + struct ldlm_bl_pool *blp; + struct ldlm_bl_thread_data *bltd = arg; + int rc; + + ENTRY; + + OBD_ALLOC_PTR(env); + if (!env) + RETURN(-ENOMEM); + rc = lu_env_init(env, LCT_DT_THREAD); + if (rc) + GOTO(out_env, rc); + rc = lu_env_add(env); + if (rc) + GOTO(out_env_fini, rc); + + blp = bltd->bltd_blp; + + complete(&bltd->bltd_comp); + /* cannot use bltd after this, it is only on caller's stack */ + + while (1) { + struct ldlm_bl_work_item *blwi = NULL; + struct obd_export *exp = NULL; + int rc; + + rc = ldlm_bl_get_work(blp, &blwi, &exp); + + if (rc == 0) + wait_event_idle_exclusive(blp->blp_waitq, + ldlm_bl_get_work(blp, &blwi, + &exp)); + atomic_inc(&blp->blp_busy_threads); + + if (ldlm_bl_thread_need_create(blp, blwi)) + /* discard the return value, we tried */ + ldlm_bl_thread_start(blp, true); + + if (exp) + rc = ldlm_bl_thread_exports(blp, exp); + else if (blwi) + rc = ldlm_bl_thread_blwi(blp, blwi); + + atomic_dec(&blp->blp_busy_threads); + + if (rc == LDLM_ITER_STOP) + break; + + /* + * If there are many namespaces, we will not sleep waiting for + * work, and must do a cond_resched to avoid holding the CPU + * for too long + */ + cond_resched(); + } + + atomic_dec(&blp->blp_num_threads); + complete(&blp->blp_comp); + + lu_env_remove(env); +out_env_fini: + lu_env_fini(env); +out_env: + OBD_FREE_PTR(env); + RETURN(rc); +} + + +static int ldlm_setup(void); +static int ldlm_cleanup(void); + +int ldlm_get_ref(void) +{ + int rc = 0; + + ENTRY; + mutex_lock(&ldlm_ref_mutex); + if (++ldlm_refcount == 1) { + rc = ldlm_setup(); + if (rc) + ldlm_refcount--; + } + mutex_unlock(&ldlm_ref_mutex); + + RETURN(rc); +} + +void ldlm_put_ref(void) +{ + ENTRY; + mutex_lock(&ldlm_ref_mutex); + if (ldlm_refcount == 1) { + int rc = ldlm_cleanup(); + + if (rc) + CERROR("ldlm_cleanup failed: %d\n", rc); + else + ldlm_refcount--; + } else { + ldlm_refcount--; + } + mutex_unlock(&ldlm_ref_mutex); + + EXIT; +} + +/* + * Export handle<->lock hash operations. 
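+ *
+ * Each export keeps a cfs_hash keyed by the peer's lock handle cookie
+ * (l_remote_handle), so a handle arriving in an RPC can be resolved
+ * to the lock it names. Conceptually (a sketch, assuming the generic
+ * cfs_hash lookup API):
+ *
+ *	struct ldlm_lock *lock;
+ *
+ *	lock = cfs_hash_lookup(exp->exp_lock_hash, &remote_handle);
+ *	if (lock != NULL) {
+ *		... use the lock ...
+ *		LDLM_LOCK_RELEASE(lock);  release ref taken by .hs_get
+ *	}
+ *
+ * The .hs_get/.hs_put callbacks below pin and unpin the lock so an
+ * entry cannot vanish while a lookup holds it.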
+ */ +static unsigned +ldlm_export_lock_hash(struct cfs_hash *hs, const void *key, unsigned int mask) +{ + return cfs_hash_u64_hash(((struct lustre_handle *)key)->cookie, mask); +} + +static void * +ldlm_export_lock_key(struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash); + return &lock->l_remote_handle; +} + +static void +ldlm_export_lock_keycpy(struct hlist_node *hnode, void *key) +{ + struct ldlm_lock *lock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash); + lock->l_remote_handle = *(struct lustre_handle *)key; +} + +static int +ldlm_export_lock_keycmp(const void *key, struct hlist_node *hnode) +{ + return lustre_handle_equal(ldlm_export_lock_key(hnode), key); +} + +static void * +ldlm_export_lock_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct ldlm_lock, l_exp_hash); +} + +static void +ldlm_export_lock_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash); + LDLM_LOCK_GET(lock); +} + +static void +ldlm_export_lock_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash); + LDLM_LOCK_RELEASE(lock); +} + +static struct cfs_hash_ops ldlm_export_lock_ops = { + .hs_hash = ldlm_export_lock_hash, + .hs_key = ldlm_export_lock_key, + .hs_keycmp = ldlm_export_lock_keycmp, + .hs_keycpy = ldlm_export_lock_keycpy, + .hs_object = ldlm_export_lock_object, + .hs_get = ldlm_export_lock_get, + .hs_put = ldlm_export_lock_put, + .hs_put_locked = ldlm_export_lock_put, +}; + +int ldlm_init_export(struct obd_export *exp) +{ + int rc; + + ENTRY; + + exp->exp_lock_hash = + cfs_hash_create(obd_uuid2str(&exp->exp_client_uuid), + HASH_EXP_LOCK_CUR_BITS, + HASH_EXP_LOCK_MAX_BITS, + HASH_EXP_LOCK_BKT_BITS, 0, + CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA, + &ldlm_export_lock_ops, + CFS_HASH_DEFAULT | CFS_HASH_REHASH_KEY | + CFS_HASH_NBLK_CHANGE); + + if (!exp->exp_lock_hash) + RETURN(-ENOMEM); + + rc = ldlm_init_flock_export(exp); + if (rc) + GOTO(err, rc); + + RETURN(0); +err: + ldlm_destroy_export(exp); + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_init_export); + +void ldlm_destroy_export(struct obd_export *exp) +{ + ENTRY; + cfs_hash_putref(exp->exp_lock_hash); + exp->exp_lock_hash = NULL; + + ldlm_destroy_flock_export(exp); + EXIT; +} +EXPORT_SYMBOL(ldlm_destroy_export); + +static ssize_t cancel_unused_locks_before_replay_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", ldlm_cancel_unused_locks_before_replay); +} + +static ssize_t cancel_unused_locks_before_replay_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + int rc; + unsigned long val; + + rc = kstrtoul(buffer, 10, &val); + if (rc) + return rc; + + ldlm_cancel_unused_locks_before_replay = val; + + return count; +} +LUSTRE_RW_ATTR(cancel_unused_locks_before_replay); + +static struct attribute *ldlm_attrs[] = { + &lustre_attr_cancel_unused_locks_before_replay.attr, + NULL, +}; + +static struct attribute_group ldlm_attr_group = { + .attrs = ldlm_attrs, +}; + +static int ldlm_setup(void) +{ + static struct ptlrpc_service_conf conf; + struct ldlm_bl_pool *blp = NULL; +#ifdef HAVE_SERVER_SUPPORT + struct task_struct *task; +#endif /* HAVE_SERVER_SUPPORT */ + int i; + int rc = 0; + + ENTRY; + + if (ldlm_state != NULL) + RETURN(-EALREADY); + + OBD_ALLOC(ldlm_state, sizeof(*ldlm_state)); + if (ldlm_state == NULL) + 
RETURN(-ENOMEM); + + ldlm_kobj = kobject_create_and_add("ldlm", &lustre_kset->kobj); + if (!ldlm_kobj) + GOTO(out, -ENOMEM); + + rc = sysfs_create_group(ldlm_kobj, &ldlm_attr_group); + if (rc) + GOTO(out, rc); + + ldlm_ns_kset = kset_create_and_add("namespaces", NULL, ldlm_kobj); + if (!ldlm_ns_kset) + GOTO(out, -ENOMEM); + + ldlm_svc_kset = kset_create_and_add("services", NULL, ldlm_kobj); + if (!ldlm_svc_kset) + GOTO(out, -ENOMEM); + + rc = ldlm_debugfs_setup(); + if (rc != 0) + GOTO(out, rc); + + memset(&conf, 0, sizeof(conf)); + conf = (typeof(conf)) { + .psc_name = "ldlm_cbd", + .psc_watchdog_factor = 2, + .psc_buf = { + .bc_nbufs = LDLM_CLIENT_NBUFS, + .bc_buf_size = LDLM_BUFSIZE, + .bc_req_max_size = LDLM_MAXREQSIZE, + .bc_rep_max_size = LDLM_MAXREPSIZE, + .bc_req_portal = LDLM_CB_REQUEST_PORTAL, + .bc_rep_portal = LDLM_CB_REPLY_PORTAL, + }, + .psc_thr = { + .tc_thr_name = "ldlm_cb", + .tc_thr_factor = LDLM_THR_FACTOR, + .tc_nthrs_init = LDLM_NTHRS_INIT, + .tc_nthrs_base = LDLM_NTHRS_BASE, + .tc_nthrs_max = LDLM_NTHRS_MAX, + .tc_nthrs_user = ldlm_num_threads, + .tc_cpu_bind = ldlm_cpu_bind, + .tc_ctx_tags = LCT_MD_THREAD | LCT_DT_THREAD, + }, + .psc_cpt = { + .cc_pattern = ldlm_cpts, + .cc_affinity = true, + }, + .psc_ops = { + .so_req_handler = ldlm_callback_handler, + }, + }; + ldlm_state->ldlm_cb_service = \ + ptlrpc_register_service(&conf, ldlm_svc_kset, + ldlm_svc_debugfs_dir); + if (IS_ERR(ldlm_state->ldlm_cb_service)) { + CERROR("failed to start service\n"); + rc = PTR_ERR(ldlm_state->ldlm_cb_service); + ldlm_state->ldlm_cb_service = NULL; + GOTO(out, rc); + } + +#ifdef HAVE_SERVER_SUPPORT + memset(&conf, 0, sizeof(conf)); + conf = (typeof(conf)) { + .psc_name = "ldlm_canceld", + .psc_watchdog_factor = 6, + .psc_buf = { + .bc_nbufs = LDLM_SERVER_NBUFS, + .bc_buf_size = LDLM_BUFSIZE, + .bc_req_max_size = LDLM_MAXREQSIZE, + .bc_rep_max_size = LDLM_MAXREPSIZE, + .bc_req_portal = LDLM_CANCEL_REQUEST_PORTAL, + .bc_rep_portal = LDLM_CANCEL_REPLY_PORTAL, + + }, + .psc_thr = { + .tc_thr_name = "ldlm_cn", + .tc_thr_factor = LDLM_THR_FACTOR, + .tc_nthrs_init = LDLM_NTHRS_INIT, + .tc_nthrs_base = LDLM_NTHRS_BASE, + .tc_nthrs_max = LDLM_NTHRS_MAX, + .tc_nthrs_user = ldlm_num_threads, + .tc_cpu_bind = ldlm_cpu_bind, + .tc_ctx_tags = LCT_MD_THREAD | \ + LCT_DT_THREAD | \ + LCT_CL_THREAD, + }, + .psc_cpt = { + .cc_pattern = ldlm_cpts, + .cc_affinity = true, + }, + .psc_ops = { + .so_req_handler = ldlm_cancel_handler, + .so_hpreq_handler = ldlm_hpreq_handler, + }, + }; + ldlm_state->ldlm_cancel_service = \ + ptlrpc_register_service(&conf, ldlm_svc_kset, + ldlm_svc_debugfs_dir); + if (IS_ERR(ldlm_state->ldlm_cancel_service)) { + CERROR("failed to start service\n"); + rc = PTR_ERR(ldlm_state->ldlm_cancel_service); + ldlm_state->ldlm_cancel_service = NULL; + GOTO(out, rc); + } +#endif /* HAVE_SERVER_SUPPORT */ + + OBD_ALLOC(blp, sizeof(*blp)); + if (blp == NULL) + GOTO(out, rc = -ENOMEM); + ldlm_state->ldlm_bl_pool = blp; + + spin_lock_init(&blp->blp_lock); + INIT_LIST_HEAD(&blp->blp_list); + INIT_LIST_HEAD(&blp->blp_prio_list); + init_waitqueue_head(&blp->blp_waitq); + atomic_set(&blp->blp_num_threads, 0); + atomic_set(&blp->blp_busy_threads, 0); + + if (ldlm_num_threads == 0) { + blp->blp_min_threads = LDLM_NTHRS_INIT; + blp->blp_max_threads = LDLM_NTHRS_MAX; + } else { + blp->blp_min_threads = blp->blp_max_threads = \ + min_t(int, LDLM_NTHRS_MAX, max_t(int, LDLM_NTHRS_INIT, + ldlm_num_threads)); + } + + for (i = 0; i < blp->blp_min_threads; i++) { + rc = ldlm_bl_thread_start(blp, false); + 
if (rc < 0) + GOTO(out, rc); + } + +#ifdef HAVE_SERVER_SUPPORT + task = kthread_run(expired_lock_main, NULL, "ldlm_elt"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("Cannot start ldlm expired-lock thread: %d\n", rc); + GOTO(out, rc); + } + + wait_event(expired_lock_wait_queue, + expired_lock_thread_state == ELT_READY); +#endif /* HAVE_SERVER_SUPPORT */ + + rc = ldlm_pools_init(); + if (rc) { + CERROR("Failed to initialize LDLM pools: %d\n", rc); + GOTO(out, rc); + } + + rc = ldlm_reclaim_setup(); + if (rc) { + CERROR("Failed to setup reclaim thread: rc = %d\n", rc); + GOTO(out, rc); + } + RETURN(0); + + out: + ldlm_cleanup(); + RETURN(rc); +} + +static int ldlm_cleanup(void) +{ + ENTRY; + + if (!list_empty(ldlm_namespace_list(LDLM_NAMESPACE_SERVER)) || + !list_empty(ldlm_namespace_list(LDLM_NAMESPACE_CLIENT))) { + CERROR("ldlm still has namespaces; clean these up first.\n"); + ldlm_dump_all_namespaces(LDLM_NAMESPACE_SERVER, D_DLMTRACE); + ldlm_dump_all_namespaces(LDLM_NAMESPACE_CLIENT, D_DLMTRACE); + RETURN(-EBUSY); + } + + ldlm_reclaim_cleanup(); + ldlm_pools_fini(); + + if (ldlm_state->ldlm_bl_pool != NULL) { + struct ldlm_bl_pool *blp = ldlm_state->ldlm_bl_pool; + + while (atomic_read(&blp->blp_num_threads) > 0) { + struct ldlm_bl_work_item blwi = { .blwi_ns = NULL }; + + init_completion(&blp->blp_comp); + + spin_lock(&blp->blp_lock); + list_add_tail(&blwi.blwi_entry, &blp->blp_list); + wake_up(&blp->blp_waitq); + spin_unlock(&blp->blp_lock); + + wait_for_completion(&blp->blp_comp); + } + + OBD_FREE(blp, sizeof(*blp)); + } + + if (ldlm_state->ldlm_cb_service != NULL) + ptlrpc_unregister_service(ldlm_state->ldlm_cb_service); +#ifdef HAVE_SERVER_SUPPORT + if (ldlm_state->ldlm_cancel_service != NULL) + ptlrpc_unregister_service(ldlm_state->ldlm_cancel_service); +#endif + + if (ldlm_ns_kset) + kset_unregister(ldlm_ns_kset); + if (ldlm_svc_kset) + kset_unregister(ldlm_svc_kset); + if (ldlm_kobj) { + sysfs_remove_group(ldlm_kobj, &ldlm_attr_group); + kobject_put(ldlm_kobj); + } + + ldlm_debugfs_cleanup(); + +#ifdef HAVE_SERVER_SUPPORT + if (expired_lock_thread_state != ELT_STOPPED) { + expired_lock_thread_state = ELT_TERMINATE; + wake_up(&expired_lock_wait_queue); + wait_event(expired_lock_wait_queue, + expired_lock_thread_state == ELT_STOPPED); + } +#endif + + OBD_FREE(ldlm_state, sizeof(*ldlm_state)); + ldlm_state = NULL; + + RETURN(0); +} + +int ldlm_init(void) +{ + ldlm_resource_slab = kmem_cache_create("ldlm_resources", + sizeof(struct ldlm_resource), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (ldlm_resource_slab == NULL) + return -ENOMEM; + + ldlm_lock_slab = kmem_cache_create("ldlm_locks", + sizeof(struct ldlm_lock), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (ldlm_lock_slab == NULL) + goto out_resource; + + ldlm_interval_slab = kmem_cache_create("interval_node", + sizeof(struct ldlm_interval), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (ldlm_interval_slab == NULL) + goto out_lock; + + ldlm_interval_tree_slab = kmem_cache_create("interval_tree", + sizeof(struct ldlm_interval_tree) * LCK_MODE_NUM, + 0, SLAB_HWCACHE_ALIGN, NULL); + if (ldlm_interval_tree_slab == NULL) + goto out_interval; + +#ifdef HAVE_SERVER_SUPPORT + ldlm_inodebits_slab = kmem_cache_create("ldlm_ibits_node", + sizeof(struct ldlm_ibits_node), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (ldlm_inodebits_slab == NULL) + goto out_interval_tree; + + ldlm_glimpse_work_kmem = kmem_cache_create("ldlm_glimpse_work_kmem", + sizeof(struct ldlm_glimpse_work), + 0, 0, NULL); + if (ldlm_glimpse_work_kmem == NULL) + goto out_inodebits; +#endif + 
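+	/*
+	 * The error handling above follows the usual reverse-order
+	 * unwind: each cache created so far has a matching
+	 * kmem_cache_destroy() under the out_* labels below, so a
+	 * failure at step N frees exactly the caches from steps
+	 * 1..N-1. A sketch of the pattern (names are illustrative):
+	 *
+	 *	cache = kmem_cache_create("name", size, 0, flags, NULL);
+	 *	if (cache == NULL)
+	 *		goto out_undo_previous_caches;
+	 */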
+#if LUSTRE_TRACKS_LOCK_EXP_REFS + class_export_dump_hook = ldlm_dump_export_locks; +#endif + return 0; +#ifdef HAVE_SERVER_SUPPORT +out_inodebits: + kmem_cache_destroy(ldlm_inodebits_slab); +out_interval_tree: + kmem_cache_destroy(ldlm_interval_tree_slab); +#endif +out_interval: + kmem_cache_destroy(ldlm_interval_slab); +out_lock: + kmem_cache_destroy(ldlm_lock_slab); +out_resource: + kmem_cache_destroy(ldlm_resource_slab); + + return -ENOMEM; +} + +void ldlm_exit(void) +{ + if (ldlm_refcount) + CERROR("ldlm_refcount is %d in ldlm_exit!\n", ldlm_refcount); + synchronize_rcu(); + kmem_cache_destroy(ldlm_resource_slab); + /* + * ldlm_lock_put() use RCU to call ldlm_lock_free, so need call + * rcu_barrier() to wait all outstanding RCU callbacks to complete, + * so that ldlm_lock_free() get a chance to be called. + */ + rcu_barrier(); + kmem_cache_destroy(ldlm_lock_slab); + kmem_cache_destroy(ldlm_interval_slab); + kmem_cache_destroy(ldlm_interval_tree_slab); +#ifdef HAVE_SERVER_SUPPORT + kmem_cache_destroy(ldlm_inodebits_slab); + kmem_cache_destroy(ldlm_glimpse_work_kmem); +#endif +} diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_plain.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_plain.c new file mode 100644 index 0000000000000..38a94b159000a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_plain.c @@ -0,0 +1,180 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ldlm/ldlm_plain.c + * + * Author: Peter Braam + * Author: Phil Schwan + */ + +/** + * This file contains implementation of PLAIN lock type. + * + * PLAIN locks are the simplest form of LDLM locking, and are used when + * there only needs to be a single lock on a resource. This avoids some + * of the complexity of EXTENT and IBITS lock types, but doesn't allow + * different "parts" of a resource to be locked concurrently. Example + * use cases for PLAIN locks include locking of MGS configuration logs + * and (as of Lustre 2.4) quota records. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include + +#include "ldlm_internal.h" + +#ifdef HAVE_SERVER_SUPPORT +/** + * Determine if the lock is compatible with all locks on the queue. + * + * If \a work_list is provided, conflicting locks are linked there. + * If \a work_list is not provided, we exit this function on first conflict. 
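+ *
+ * For example, two PR (protected read) requests on the same resource
+ * are mutually compatible and may both stay granted, while a PW
+ * (protected write) request conflicts with any granted PR lock; in
+ * that case, with \a work_list supplied, a blocking-AST work item is
+ * queued for every lock of the conflicting mode group.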
+ * + * \retval 0 if there are conflicting locks in the \a queue + * \retval 1 if the lock is compatible to all locks in \a queue + */ +static inline int +ldlm_plain_compat_queue(struct list_head *queue, struct ldlm_lock *req, + struct list_head *work_list) +{ + enum ldlm_mode req_mode = req->l_req_mode; + struct ldlm_lock *lock, *next_lock; + int compat = 1; + + ENTRY; + lockmode_verify(req_mode); + + list_for_each_entry_safe(lock, next_lock, queue, l_res_link) { + + /* + * We stop walking the queue if we hit ourselves so we don't + * take conflicting locks enqueued after us into account, + * or we'd wait forever. + */ + if (req == lock) + RETURN(compat); + + /* Advance loop cursor to last lock of mode group. */ + next_lock = list_entry(list_entry(lock->l_sl_mode.prev, + struct ldlm_lock, + l_sl_mode)->l_res_link.next, + struct ldlm_lock, l_res_link); + + if (lockmode_compat(lock->l_req_mode, req_mode)) + continue; + + if (!work_list) + RETURN(0); + + compat = 0; + + /* + * Add locks of the mode group to \a work_list as + * blocking locks for \a req. + */ + if (lock->l_blocking_ast) + ldlm_add_ast_work_item(lock, req, work_list); + + { + struct list_head *head; + + head = &lock->l_sl_mode; + list_for_each_entry(lock, head, l_sl_mode) + if (lock->l_blocking_ast) + ldlm_add_ast_work_item(lock, req, + work_list); + } + } + + RETURN(compat); +} + +/** + * Process a granting attempt for plain lock. + * Must be called with ns lock held. + * + * This function looks for any conflicts for \a lock in the granted or + * waiting queues. The lock is granted if no conflicts are found in + * either queue. + */ +int ldlm_process_plain_lock(struct ldlm_lock *lock, __u64 *flags, + enum ldlm_process_intention intention, + enum ldlm_error *err, struct list_head *work_list) +{ + struct ldlm_resource *res = lock->l_resource; + struct list_head *grant_work = intention == LDLM_PROCESS_ENQUEUE ? + NULL : work_list; + int rc; + + ENTRY; + LASSERT(!ldlm_is_granted(lock)); + check_res_locked(res); + *err = ELDLM_OK; + + if (intention == LDLM_PROCESS_RESCAN) { + LASSERT(work_list != NULL); + rc = ldlm_plain_compat_queue(&res->lr_granted, lock, NULL); + if (!rc) + RETURN(LDLM_ITER_STOP); + rc = ldlm_plain_compat_queue(&res->lr_waiting, lock, NULL); + if (!rc) + RETURN(LDLM_ITER_STOP); + + ldlm_resource_unlink_lock(lock); + ldlm_grant_lock(lock, grant_work); + RETURN(LDLM_ITER_CONTINUE); + } + + rc = ldlm_plain_compat_queue(&res->lr_granted, lock, work_list); + rc += ldlm_plain_compat_queue(&res->lr_waiting, lock, work_list); + + if (rc == 2) { + ldlm_resource_unlink_lock(lock); + ldlm_grant_lock(lock, grant_work); + } + + RETURN(LDLM_ITER_CONTINUE); +} +#endif /* HAVE_SERVER_SUPPORT */ + +void ldlm_plain_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy) +{ + /* No policy for plain locks */ +} + +void ldlm_plain_policy_local_to_wire(const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy) +{ + /* No policy for plain locks */ +} diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_pool.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_pool.c new file mode 100644 index 0000000000000..d23240c7f19c9 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_pool.c @@ -0,0 +1,1568 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ldlm/ldlm_pool.c + * + * Author: Yury Umanets + */ + +/* + * Idea of this code is rather simple. Each second, for each server namespace + * we have SLV - server lock volume which is calculated on current number of + * granted locks, grant speed for past period, etc - that is, locking load. + * This SLV number may be thought as a flow definition for simplicity. It is + * sent to clients with each occasion to let them know what is current load + * situation on the server. By default, at the beginning, SLV on server is + * set max value which is calculated as the following: allow to one client + * have all locks of limit ->pl_limit for 10h. + * + * Next, on clients, number of cached locks is not limited artificially in any + * way as it was before. Instead, client calculates CLV, that is, client lock + * volume for each lock and compares it with last SLV from the server. CLV is + * calculated as the number of locks in LRU * lock live time in seconds. If + * CLV > SLV - lock is canceled. + * + * Client has LVF, that is, lock volume factor which regulates how much + * sensitive client should be about last SLV from server. The higher LVF is the + * more locks will be canceled on client. Default value for it is 1. Setting + * LVF to 2 means that client will cancel locks 2 times faster. + * + * Locks on a client will be canceled more intensively in these cases: + * (1) if SLV is smaller, that is, load is higher on the server; + * (2) client has a lot of locks (the more locks are held by client, the bigger + * chances that some of them should be canceled); + * (3) client has old locks (taken some time ago); + * + * Thus, according to flow paradigm that we use for better understanding SLV, + * CLV is the volume of particle in flow described by SLV. According to this, + * if flow is getting thinner, more and more particles become outside of it and + * as particles are locks, they should be canceled. + * + * General idea of this belongs to Vitaly Fertman (vitaly@clusterfs.com). + * Andreas Dilger(adilger@clusterfs.com) proposed few nice ideas like using LVF + * and many cleanups. Flow definition to allow more easy understanding of the + * logic belongs to Nikita Danilov(nikita@clusterfs.com) as well as many + * cleanups and fixes. And design and implementation are done by Yury Umanets + * (umka@clusterfs.com). + * + * Glossary for terms used: + * + * pl_limit - Number of allowed locks in pool. 
Applies to server and client + * side (tunable); + * + * pl_granted - Number of granted locks (calculated); + * pl_grant_rate - Number of granted locks for last T (calculated); + * pl_cancel_rate - Number of canceled locks for last T (calculated); + * pl_grant_speed - Grant speed (GR - CR) for last T (calculated); + * pl_grant_plan - Planned number of granted locks for next T (calculated); + * pl_server_lock_volume - Current server lock volume (calculated); + * + * As it may be seen from list above, we have few possible tunables which may + * affect behavior much. They all may be modified via sysfs. However, they also + * give a possibility for constructing few pre-defined behavior policies. If + * none of predefines is suitable for a working pattern being used, new one may + * be "constructed" via sysfs tunables. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include +#include +#include +#include +#include "ldlm_internal.h" + +#ifdef HAVE_LRU_RESIZE_SUPPORT + +/* + * 50 ldlm locks for 1MB of RAM. + */ +#define LDLM_POOL_HOST_L ((NUM_CACHEPAGES >> (20 - PAGE_SHIFT)) * 50) + +/* + * Maximal possible grant step plan in %. + */ +#define LDLM_POOL_MAX_GSP (30) + +/* + * Minimal possible grant step plan in %. + */ +#define LDLM_POOL_MIN_GSP (1) + +/* + * This controls the speed of reaching LDLM_POOL_MAX_GSP + * with increasing thread period. + */ +#define LDLM_POOL_GSP_STEP_SHIFT (2) + +/* + * LDLM_POOL_GSP% of all locks is default GP. + */ +#define LDLM_POOL_GP(L) (((L) * LDLM_POOL_MAX_GSP) / 100) + +/* + * Max age for locks on clients. + */ +#define LDLM_POOL_MAX_AGE (36000) + +/* + * The granularity of SLV calculation. + */ +#define LDLM_POOL_SLV_SHIFT (10) + +static inline __u64 dru(__u64 val, __u32 shift, int round_up) +{ + return (val + (round_up ? (1 << shift) - 1 : 0)) >> shift; +} + +static inline __u64 ldlm_pool_slv_max(__u32 L) +{ + /* + * Allow to have all locks for 1 client for 10 hrs. + * Formula is the following: limit * 10h / 1 client. + */ + __u64 lim = (__u64)L * LDLM_POOL_MAX_AGE / 1; + return lim; +} + +static inline __u64 ldlm_pool_slv_min(__u32 L) +{ + return 1; +} + +enum { + LDLM_POOL_FIRST_STAT = 0, + LDLM_POOL_GRANTED_STAT = LDLM_POOL_FIRST_STAT, + LDLM_POOL_GRANT_STAT, + LDLM_POOL_CANCEL_STAT, + LDLM_POOL_GRANT_RATE_STAT, + LDLM_POOL_CANCEL_RATE_STAT, + LDLM_POOL_GRANT_PLAN_STAT, + LDLM_POOL_SLV_STAT, + LDLM_POOL_SHRINK_REQTD_STAT, + LDLM_POOL_SHRINK_FREED_STAT, + LDLM_POOL_RECALC_STAT, + LDLM_POOL_TIMING_STAT, + LDLM_POOL_LAST_STAT +}; + +static inline struct ldlm_namespace *ldlm_pl2ns(struct ldlm_pool *pl) +{ + return container_of(pl, struct ldlm_namespace, ns_pool); +} + +/** + * Calculates suggested grant_step in % of available locks for passed + * \a period. This is later used in grant_plan calculations. + */ +static inline int ldlm_pool_t2gsp(unsigned int t) +{ + /* + * This yields 1% grant step for anything below LDLM_POOL_GSP_STEP + * and up to 30% for anything higher than LDLM_POOL_GSP_STEP. + * + * How this will affect execution is the following: + * + * - for thread period 1s we will have grant_step 1% which good from + * pov of taking some load off from server and push it out to clients. + * This is like that because 1% for grant_step means that server will + * not allow clients to get lots of locks in short period of time and + * keep all old locks in their caches. 
Clients will always have to + * get some locks back if they want to take some new; + * + * - for thread period 10s (which is default) we will have 23% which + * means that clients will have enough of room to take some new locks + * without getting some back. All locks from this 23% which were not + * taken by clients in current period will contribute in SLV growing. + * SLV growing means more locks cached on clients until limit or grant + * plan is reached. + */ + return LDLM_POOL_MAX_GSP - + ((LDLM_POOL_MAX_GSP - LDLM_POOL_MIN_GSP) >> + (t >> LDLM_POOL_GSP_STEP_SHIFT)); +} + +static inline int ldlm_pool_granted(struct ldlm_pool *pl) +{ + return atomic_read(&pl->pl_granted); +} + +/** + * Recalculates next grant limit on passed \a pl. + * + * \pre ->pl_lock is locked. + */ +static void ldlm_pool_recalc_grant_plan(struct ldlm_pool *pl) +{ + int granted, grant_step, limit; + + limit = ldlm_pool_get_limit(pl); + granted = ldlm_pool_granted(pl); + + grant_step = ldlm_pool_t2gsp(pl->pl_recalc_period); + grant_step = ((limit - granted) * grant_step) / 100; + pl->pl_grant_plan = granted + grant_step; + limit = (limit * 5) >> 2; + if (pl->pl_grant_plan > limit) + pl->pl_grant_plan = limit; +} + +/** + * Recalculates next SLV on passed \a pl. + * + * \pre ->pl_lock is locked. + */ +static void ldlm_pool_recalc_slv(struct ldlm_pool *pl) +{ + int granted; + int grant_plan; + int round_up; + __u64 slv; + __u64 slv_factor; + __u64 grant_usage; + __u32 limit; + + slv = pl->pl_server_lock_volume; + grant_plan = pl->pl_grant_plan; + limit = ldlm_pool_get_limit(pl); + granted = ldlm_pool_granted(pl); + round_up = granted < limit; + + grant_usage = max_t(int, limit - (granted - grant_plan), 1); + + /* + * Find out SLV change factor which is the ratio of grant usage + * from limit. SLV changes as fast as the ratio of grant plan + * consumption. The more locks from grant plan are not consumed + * by clients in last interval (idle time), the faster grows + * SLV. And the opposite, the more grant plan is over-consumed + * (load time) the faster drops SLV. + */ + slv_factor = (grant_usage << LDLM_POOL_SLV_SHIFT); + do_div(slv_factor, limit); + slv = slv * slv_factor; + slv = dru(slv, LDLM_POOL_SLV_SHIFT, round_up); + + if (slv > ldlm_pool_slv_max(limit)) + slv = ldlm_pool_slv_max(limit); + else if (slv < ldlm_pool_slv_min(limit)) + slv = ldlm_pool_slv_min(limit); + + pl->pl_server_lock_volume = slv; +} + +/** + * Recalculates next stats on passed \a pl. + * + * \pre ->pl_lock is locked. + */ +static void ldlm_pool_recalc_stats(struct ldlm_pool *pl, timeout_t period) +{ + int grant_plan = pl->pl_grant_plan; + __u64 slv = pl->pl_server_lock_volume; + int granted = ldlm_pool_granted(pl); + int grant_rate = atomic_read(&pl->pl_grant_rate) / period; + int cancel_rate = atomic_read(&pl->pl_cancel_rate) / period; + + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_SLV_STAT, + slv); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANTED_STAT, + granted); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT, + grant_rate); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT, + grant_plan); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT, + cancel_rate); +} + +/** + * Sets current SLV into obd accessible via ldlm_pl2ns(pl)->ns_obd. + */ +static void ldlm_srv_pool_push_slv(struct ldlm_pool *pl) +{ + struct obd_device *obd; + + /* + * Set new SLV in obd field for using it later without accessing the + * pool. 
This is required to avoid race between sending reply to client + * with new SLV and cleanup server stack in which we can't guarantee + * that namespace is still alive. We know only that obd is alive as + * long as valid export is alive. + */ + obd = ldlm_pl2ns(pl)->ns_obd; + LASSERT(obd != NULL); + write_lock(&obd->obd_pool_lock); + obd->obd_pool_slv = pl->pl_server_lock_volume; + write_unlock(&obd->obd_pool_lock); +} + +/** + * Recalculates all pool fields on passed \a pl. + * + * \pre ->pl_lock is not locked. + */ +static int ldlm_srv_pool_recalc(struct ldlm_pool *pl, bool force) +{ + timeout_t recalc_interval_sec; + + ENTRY; + + recalc_interval_sec = ktime_get_seconds() - pl->pl_recalc_time; + if (!force && recalc_interval_sec < pl->pl_recalc_period) + RETURN(0); + + spin_lock(&pl->pl_lock); + recalc_interval_sec = ktime_get_seconds() - pl->pl_recalc_time; + if (!force && recalc_interval_sec < pl->pl_recalc_period) { + spin_unlock(&pl->pl_lock); + RETURN(0); + } + /* + * Recalc SLV after last period. This should be done + * _before_ recalculating new grant plan. + */ + ldlm_pool_recalc_slv(pl); + + /* + * Make sure that pool informed obd of last SLV changes. + */ + ldlm_srv_pool_push_slv(pl); + + /* + * Update grant_plan for new period. + */ + ldlm_pool_recalc_grant_plan(pl); + + pl->pl_recalc_time = ktime_get_seconds(); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT, + recalc_interval_sec); + spin_unlock(&pl->pl_lock); + RETURN(0); +} + +/** + * This function is used on server side as main entry point for memory + * pressure handling. It decreases SLV on \a pl according to passed + * \a nr and \a gfp_mask. + * + * Our goal here is to decrease SLV such a way that clients hold \a nr + * locks smaller in next 10h. + */ +static int ldlm_srv_pool_shrink(struct ldlm_pool *pl, + int nr, gfp_t gfp_mask) +{ + __u32 limit; + + /* + * VM is asking how many entries may be potentially freed. + */ + if (nr == 0) + return ldlm_pool_granted(pl); + + /* + * Client already canceled locks but server is already in shrinker + * and can't cancel anything. Let's catch this race. + */ + if (ldlm_pool_granted(pl) == 0) + RETURN(0); + + spin_lock(&pl->pl_lock); + + /* + * We want shrinker to possibly cause cancellation of @nr locks from + * clients or grant approximately @nr locks smaller next intervals. + * + * This is why we decreased SLV by @nr. This effect will only be as + * long as one re-calc interval (1s these days) and this should be + * enough to pass this decreased SLV to all clients. On next recalc + * interval pool will either increase SLV if locks load is not high + * or will keep on same level or even decrease again, thus, shrinker + * decreased SLV will affect next recalc intervals and this way will + * make locking load lower. + */ + if (nr < pl->pl_server_lock_volume) { + pl->pl_server_lock_volume = pl->pl_server_lock_volume - nr; + } else { + limit = ldlm_pool_get_limit(pl); + pl->pl_server_lock_volume = ldlm_pool_slv_min(limit); + } + + /* + * Make sure that pool informed obd of last SLV changes. + */ + ldlm_srv_pool_push_slv(pl); + spin_unlock(&pl->pl_lock); + + /* + * We did not really free any memory here so far, it only will be + * freed later may be, so that we return 0 to not confuse VM. + */ + return 0; +} + +/** + * Setup server side pool \a pl with passed \a limit. 
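+ *
+ * Besides setting pl_limit, the value is mirrored into
+ * obd->obd_pool_limit under obd_pool_lock so it can be reported to
+ * clients without dereferencing the pool. A hedged usage sketch
+ * (caller and value are illustrative):
+ *
+ *	rc = ldlm_pool_setup(&ns->ns_pool, new_limit);
+ *	if (rc != 0)
+ *		CERROR("%s: cannot set pool limit: rc = %d\n",
+ *		       ldlm_ns_name(ns), rc);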
+ */ +static int ldlm_srv_pool_setup(struct ldlm_pool *pl, int limit) +{ + struct obd_device *obd; + + obd = ldlm_pl2ns(pl)->ns_obd; + LASSERT(obd != NULL && obd != LP_POISON); + LASSERT(obd->obd_type != LP_POISON); + write_lock(&obd->obd_pool_lock); + obd->obd_pool_limit = limit; + write_unlock(&obd->obd_pool_lock); + + ldlm_pool_set_limit(pl, limit); + return 0; +} + +/** + * Sets SLV and Limit from ldlm_pl2ns(pl)->ns_obd tp passed \a pl. + */ +static void ldlm_cli_pool_pop_slv(struct ldlm_pool *pl) +{ + struct obd_device *obd; + + /* + * Get new SLV and Limit from obd which is updated with coming + * RPCs. + */ + obd = ldlm_pl2ns(pl)->ns_obd; + LASSERT(obd != NULL); + read_lock(&obd->obd_pool_lock); + pl->pl_server_lock_volume = obd->obd_pool_slv; + ldlm_pool_set_limit(pl, obd->obd_pool_limit); + read_unlock(&obd->obd_pool_lock); +} + +/** + * Recalculates client size pool \a pl according to current SLV and Limit. + */ +static int ldlm_cli_pool_recalc(struct ldlm_pool *pl, bool force) +{ + timeout_t recalc_interval_sec; + int ret; + + ENTRY; + + recalc_interval_sec = ktime_get_seconds() - pl->pl_recalc_time; + if (!force && recalc_interval_sec < pl->pl_recalc_period) + RETURN(0); + + spin_lock(&pl->pl_lock); + /* + * Check if we need to recalc lists now. + */ + recalc_interval_sec = ktime_get_seconds() - pl->pl_recalc_time; + if (!force && recalc_interval_sec < pl->pl_recalc_period) { + spin_unlock(&pl->pl_lock); + RETURN(0); + } + + /* + * Make sure that pool knows last SLV and Limit from obd. + */ + ldlm_cli_pool_pop_slv(pl); + spin_unlock(&pl->pl_lock); + + /* + * In the time of canceling locks on client we do not need to maintain + * sharp timing, we only want to cancel locks asap according to new SLV. + * It may be called when SLV has changed much, this is why we do not + * take into account pl->pl_recalc_time here. + */ + ret = ldlm_cancel_lru(ldlm_pl2ns(pl), 0, LCF_ASYNC, 0); + + spin_lock(&pl->pl_lock); + /* + * Time of LRU resizing might be longer than period, + * so update after LRU resizing rather than before it. + */ + pl->pl_recalc_time = ktime_get_seconds(); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT, + recalc_interval_sec); + spin_unlock(&pl->pl_lock); + RETURN(ret); +} + +/** + * This function is main entry point for memory pressure handling on client + * side. Main goal of this function is to cancel some number of locks on + * passed \a pl according to \a nr and \a gfp_mask. + */ +static int ldlm_cli_pool_shrink(struct ldlm_pool *pl, + int nr, gfp_t gfp_mask) +{ + struct ldlm_namespace *ns; + int unused; + + ns = ldlm_pl2ns(pl); + + /* + * Do not cancel locks in case lru resize is disabled for this ns. + */ + if (!ns_connect_lru_resize(ns)) + RETURN(0); + + /* + * Make sure that pool knows last SLV and Limit from obd. + */ + spin_lock(&pl->pl_lock); + ldlm_cli_pool_pop_slv(pl); + spin_unlock(&pl->pl_lock); + + spin_lock(&ns->ns_lock); + unused = ns->ns_nr_unused; + spin_unlock(&ns->ns_lock); + + if (nr == 0) + return (unused / 100) * sysctl_vfs_cache_pressure; + else + return ldlm_cancel_lru(ns, nr, LCF_ASYNC, 0); +} + +static struct ldlm_pool_ops ldlm_srv_pool_ops = { + .po_recalc = ldlm_srv_pool_recalc, + .po_shrink = ldlm_srv_pool_shrink, + .po_setup = ldlm_srv_pool_setup +}; + +static struct ldlm_pool_ops ldlm_cli_pool_ops = { + .po_recalc = ldlm_cli_pool_recalc, + .po_shrink = ldlm_cli_pool_shrink +}; + +/** + * Pool recalc wrapper. Will call either client or server pool recalc callback + * depending what pool \a pl is used. 
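+ * It also folds the elapsed period into the pool statistics and
+ * zeroes the per-period grant/cancel rate counters. A sketch of how
+ * a periodic caller might use the result (the helper name is
+ * illustrative):
+ *
+ *	time64_t next = ldlm_pool_recalc(pl, false);
+ *
+ *	schedule_next_recalc(max_t(time64_t, next - ktime_get_seconds(), 1));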
+ * + * \retval time in seconds for the next recalc of this pool + */ +time64_t ldlm_pool_recalc(struct ldlm_pool *pl, bool force) +{ + timeout_t recalc_interval_sec; + int count; + + recalc_interval_sec = ktime_get_seconds() - pl->pl_recalc_time; + if (recalc_interval_sec > 0) { + spin_lock(&pl->pl_lock); + recalc_interval_sec = ktime_get_seconds() - pl->pl_recalc_time; + + if (recalc_interval_sec > 0) { + /* + * Update pool statistics every recalc interval. + */ + ldlm_pool_recalc_stats(pl, recalc_interval_sec); + + /* + * Zero out all rates and speed for the last period. + */ + atomic_set(&pl->pl_grant_rate, 0); + atomic_set(&pl->pl_cancel_rate, 0); + } + spin_unlock(&pl->pl_lock); + } + + if (pl->pl_ops->po_recalc != NULL) { + count = pl->pl_ops->po_recalc(pl, force); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_RECALC_STAT, + count); + } + + return pl->pl_recalc_time + pl->pl_recalc_period; +} + +/** + * Pool shrink wrapper. Will call either client or server pool recalc callback + * depending what pool \a pl is used. + */ +int ldlm_pool_shrink(struct ldlm_pool *pl, int nr, gfp_t gfp_mask) +{ + int cancel = 0; + + if (pl->pl_ops->po_shrink != NULL) { + cancel = pl->pl_ops->po_shrink(pl, nr, gfp_mask); + if (nr > 0) { + lprocfs_counter_add(pl->pl_stats, + LDLM_POOL_SHRINK_REQTD_STAT, + nr); + lprocfs_counter_add(pl->pl_stats, + LDLM_POOL_SHRINK_FREED_STAT, + cancel); + CDEBUG(D_DLMTRACE, + "%s: request to shrink %d locks, shrunk %d\n", + pl->pl_name, nr, cancel); + } + } + return cancel; +} + +/** + * Pool setup wrapper. Will call either client or server pool recalc callback + * depending what pool \a pl is used. + * + * Sets passed \a limit into pool \a pl. + */ +int ldlm_pool_setup(struct ldlm_pool *pl, int limit) +{ + if (pl->pl_ops->po_setup != NULL) + return pl->pl_ops->po_setup(pl, limit); + return 0; +} + +static int lprocfs_pool_state_seq_show(struct seq_file *m, void *unused) +{ + int granted, grant_rate, cancel_rate, grant_step; + int grant_speed, grant_plan, lvf; + struct ldlm_pool *pl = m->private; + timeout_t period; + __u64 slv, clv; + __u32 limit; + + spin_lock(&pl->pl_lock); + slv = pl->pl_server_lock_volume; + clv = pl->pl_client_lock_volume; + limit = ldlm_pool_get_limit(pl); + grant_plan = pl->pl_grant_plan; + granted = ldlm_pool_granted(pl); + period = ktime_get_seconds() - pl->pl_recalc_time; + if (period <= 0) + period = 1; + grant_rate = atomic_read(&pl->pl_grant_rate) / period; + cancel_rate = atomic_read(&pl->pl_cancel_rate) / period; + grant_speed = grant_rate - cancel_rate; + lvf = atomic_read(&pl->pl_lock_volume_factor); + grant_step = ldlm_pool_t2gsp(pl->pl_recalc_period); + spin_unlock(&pl->pl_lock); + + seq_printf(m, "LDLM pool state (%s):\n" + " SLV: %llu\n" + " CLV: %llu\n" + " LVF: %d\n", + pl->pl_name, slv, clv, (lvf * 100) >> 8); + + if (ns_is_server(ldlm_pl2ns(pl))) { + seq_printf(m, " GSP: %d%%\n", grant_step); + seq_printf(m, " GP: %d\n", grant_plan); + } + + seq_printf(m, " GR: %d\n CR: %d\n GS: %d\n G: %d\n L: %d\n", + grant_rate, cancel_rate, grant_speed, + granted, limit); + return 0; +} + +LDEBUGFS_SEQ_FOPS_RO(lprocfs_pool_state); + +static ssize_t grant_speed_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool, + pl_kobj); + int grant_speed; + timeout_t period; + + spin_lock(&pl->pl_lock); + /* serialize with ldlm_pool_recalc */ + period = ktime_get_seconds() - pl->pl_recalc_time; + if (period <= 0) + period = 1; + grant_speed = (atomic_read(&pl->pl_grant_rate) - + 
atomic_read(&pl->pl_cancel_rate)) / period; + spin_unlock(&pl->pl_lock); + return sprintf(buf, "%d\n", grant_speed); +} +LUSTRE_RO_ATTR(grant_speed); + +LDLM_POOL_SYSFS_READER_SHOW(grant_plan, int); +LUSTRE_RO_ATTR(grant_plan); + +LDLM_POOL_SYSFS_READER_SHOW(recalc_period, int); +LDLM_POOL_SYSFS_WRITER_STORE(recalc_period, int); +LUSTRE_RW_ATTR(recalc_period); + +LDLM_POOL_SYSFS_READER_NOLOCK_SHOW(server_lock_volume, u64); +LUSTRE_RO_ATTR(server_lock_volume); + +LDLM_POOL_SYSFS_READER_NOLOCK_SHOW(client_lock_volume, u64); +LUSTRE_RO_ATTR(client_lock_volume); + +LDLM_POOL_SYSFS_READER_NOLOCK_SHOW(limit, atomic); +LDLM_POOL_SYSFS_WRITER_NOLOCK_STORE(limit, atomic); +LUSTRE_RW_ATTR(limit); + +LDLM_POOL_SYSFS_READER_NOLOCK_SHOW(granted, atomic); +LUSTRE_RO_ATTR(granted); + +LDLM_POOL_SYSFS_READER_NOLOCK_SHOW(cancel_rate, atomic); +LUSTRE_RO_ATTR(cancel_rate); + +LDLM_POOL_SYSFS_READER_NOLOCK_SHOW(grant_rate, atomic); +LUSTRE_RO_ATTR(grant_rate); + +static ssize_t lock_volume_factor_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool, pl_kobj); + unsigned long tmp; + + tmp = (atomic_read(&pl->pl_lock_volume_factor) * 100) >> 8; + return sprintf(buf, "%lu\n", tmp); +} + +static ssize_t lock_volume_factor_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool, pl_kobj); + unsigned long tmp; + int rc; + + rc = kstrtoul(buffer, 10, &tmp); + if (rc < 0) { + return rc; + } + + tmp = (tmp << 8) / 100; + atomic_set(&pl->pl_lock_volume_factor, tmp); + + return count; + +} +LUSTRE_RW_ATTR(lock_volume_factor); + +static ssize_t recalc_time_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool, pl_kobj); + + return snprintf(buf, PAGE_SIZE, "%llu\n", + ktime_get_seconds() - pl->pl_recalc_time); +} +LUSTRE_RO_ATTR(recalc_time); + +/* These are for pools in /sys/fs/lustre/ldlm/namespaces/.../pool */ +static struct attribute *ldlm_pl_attrs[] = { + &lustre_attr_grant_speed.attr, + &lustre_attr_grant_plan.attr, + &lustre_attr_recalc_period.attr, + &lustre_attr_server_lock_volume.attr, + &lustre_attr_client_lock_volume.attr, + &lustre_attr_recalc_time.attr, + &lustre_attr_limit.attr, + &lustre_attr_granted.attr, + &lustre_attr_cancel_rate.attr, + &lustre_attr_grant_rate.attr, + &lustre_attr_lock_volume_factor.attr, + NULL, +}; + +KOBJ_ATTRIBUTE_GROUPS(ldlm_pl); + +static void ldlm_pl_release(struct kobject *kobj) +{ + struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool, + pl_kobj); + complete(&pl->pl_kobj_unregister); +} + +static struct kobj_type ldlm_pl_ktype = { + .default_groups = KOBJ_ATTR_GROUPS(ldlm_pl), + .sysfs_ops = &lustre_sysfs_ops, + .release = ldlm_pl_release, +}; + +static int ldlm_pool_sysfs_init(struct ldlm_pool *pl) +{ + struct ldlm_namespace *ns = ldlm_pl2ns(pl); + int err; + + init_completion(&pl->pl_kobj_unregister); + err = kobject_init_and_add(&pl->pl_kobj, &ldlm_pl_ktype, &ns->ns_kobj, + "pool"); + + return err; +} + +static int ldlm_pool_debugfs_init(struct ldlm_pool *pl) +{ + struct ldlm_namespace *ns = ldlm_pl2ns(pl); + struct dentry *debugfs_ns_parent; + struct ldebugfs_vars pool_vars[2]; + int rc = 0; + + ENTRY; + + debugfs_ns_parent = ns->ns_debugfs_entry; + if (IS_ERR_OR_NULL(debugfs_ns_parent)) { + CERROR("%s: debugfs entry is not initialized\n", + ldlm_ns_name(ns)); + GOTO(out, rc = -EINVAL); + } + pl->pl_debugfs_entry = 
debugfs_create_dir("pool", debugfs_ns_parent); + + memset(pool_vars, 0, sizeof(pool_vars)); + + ldlm_add_var(&pool_vars[0], pl->pl_debugfs_entry, "state", pl, + &lprocfs_pool_state_fops); + + pl->pl_stats = lprocfs_alloc_stats(LDLM_POOL_LAST_STAT - + LDLM_POOL_FIRST_STAT, 0); + if (!pl->pl_stats) + GOTO(out, rc = -ENOMEM); + + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANTED_STAT, + LPROCFS_CNTR_AVGMINMAX, "granted", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_STAT, + LPROCFS_CNTR_AVGMINMAX, "grant", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_STAT, + LPROCFS_CNTR_AVGMINMAX, "cancel", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT, + LPROCFS_CNTR_AVGMINMAX, "grant_rate", "locks/s"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT, + LPROCFS_CNTR_AVGMINMAX, "cancel_rate", "locks/s"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT, + LPROCFS_CNTR_AVGMINMAX, "grant_plan", "locks/s"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SLV_STAT, + LPROCFS_CNTR_AVGMINMAX, "slv", "slv"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SHRINK_REQTD_STAT, + LPROCFS_CNTR_AVGMINMAX, "shrink_request", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SHRINK_FREED_STAT, + LPROCFS_CNTR_AVGMINMAX, "shrink_freed", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_RECALC_STAT, + LPROCFS_CNTR_AVGMINMAX, "recalc_freed", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_TIMING_STAT, + LPROCFS_CNTR_AVGMINMAX, "recalc_timing", "sec"); + debugfs_create_file("stats", 0644, pl->pl_debugfs_entry, + pl->pl_stats, &ldebugfs_stats_seq_fops); + + EXIT; +out: + return rc; +} + +static void ldlm_pool_sysfs_fini(struct ldlm_pool *pl) +{ + kobject_put(&pl->pl_kobj); + wait_for_completion(&pl->pl_kobj_unregister); +} + +static void ldlm_pool_debugfs_fini(struct ldlm_pool *pl) +{ + if (pl->pl_stats != NULL) { + lprocfs_free_stats(&pl->pl_stats); + pl->pl_stats = NULL; + } + debugfs_remove_recursive(pl->pl_debugfs_entry); + pl->pl_debugfs_entry = NULL; +} + +int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns, + int idx, enum ldlm_side client) +{ + int rc; + + ENTRY; + + spin_lock_init(&pl->pl_lock); + atomic_set(&pl->pl_granted, 0); + pl->pl_recalc_time = ktime_get_seconds(); + atomic_set(&pl->pl_lock_volume_factor, 1 << 8); + + atomic_set(&pl->pl_grant_rate, 0); + atomic_set(&pl->pl_cancel_rate, 0); + pl->pl_grant_plan = LDLM_POOL_GP(LDLM_POOL_HOST_L); + + snprintf(pl->pl_name, sizeof(pl->pl_name), "ldlm-pool-%s-%d", + ldlm_ns_name(ns), idx); + + if (client == LDLM_NAMESPACE_SERVER) { + pl->pl_ops = &ldlm_srv_pool_ops; + ldlm_pool_set_limit(pl, LDLM_POOL_HOST_L); + pl->pl_recalc_period = LDLM_POOL_SRV_DEF_RECALC_PERIOD; + pl->pl_server_lock_volume = ldlm_pool_slv_max(LDLM_POOL_HOST_L); + } else { + ldlm_pool_set_limit(pl, 1); + pl->pl_server_lock_volume = 0; + pl->pl_ops = &ldlm_cli_pool_ops; + pl->pl_recalc_period = LDLM_POOL_CLI_DEF_RECALC_PERIOD; + } + pl->pl_client_lock_volume = 0; + rc = ldlm_pool_debugfs_init(pl); + if (rc) + RETURN(rc); + + rc = ldlm_pool_sysfs_init(pl); + if (rc) + RETURN(rc); + + CDEBUG(D_DLMTRACE, "Lock pool %s is initialized\n", pl->pl_name); + + RETURN(rc); +} + +void ldlm_pool_fini(struct ldlm_pool *pl) +{ + ENTRY; + ldlm_pool_sysfs_fini(pl); + ldlm_pool_debugfs_fini(pl); + + /* + * Pool should not be used after this point. We can't free it here as + * it lives in struct ldlm_namespace, but still interested in catching + * any abnormal using cases. 
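+	 * Filling the structure with 0x5a makes a use-after-fini stand
+	 * out: a pointer read back from the poisoned pool shows up as
+	 * the recognizable 0x5a5a5a5a5a5a5a5a pattern (on 64-bit, cf.
+	 * LP_POISON) instead of silently corrupting state.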
+ */ + POISON(pl, 0x5a, sizeof(*pl)); + EXIT; +} + +/** + * Add new taken ldlm lock \a lock into pool \a pl accounting. + */ +void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock) +{ + /* + * FLOCK locks are special in a sense that they are almost never + * cancelled, instead special kind of lock is used to drop them. + * also there is no LRU for flock locks, so no point in tracking + * them anyway. + * + * PLAIN locks are used by config and quota, the quantity is small + * and usually they are not in LRU. + */ + if (lock->l_resource->lr_type == LDLM_FLOCK || + lock->l_resource->lr_type == LDLM_PLAIN) + return; + + ldlm_reclaim_add(lock); + + atomic_inc(&pl->pl_granted); + atomic_inc(&pl->pl_grant_rate); + lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_GRANT_STAT); + /* + * Do not do pool recalc for client side as all locks which + * potentially may be canceled has already been packed into + * enqueue/cancel rpc. Also we do not want to run out of stack + * with too long call paths. + */ + if (ns_is_server(ldlm_pl2ns(pl))) + ldlm_pool_recalc(pl, false); +} + +/** + * Remove ldlm lock \a lock from pool \a pl accounting. + */ +void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock) +{ + /* + * Filter out FLOCK & PLAIN locks. Read above comment in + * ldlm_pool_add(). + */ + if (lock->l_resource->lr_type == LDLM_FLOCK || + lock->l_resource->lr_type == LDLM_PLAIN) + return; + + ldlm_reclaim_del(lock); + + LASSERT(atomic_read(&pl->pl_granted) > 0); + atomic_dec(&pl->pl_granted); + atomic_inc(&pl->pl_cancel_rate); + + lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_CANCEL_STAT); + + if (ns_is_server(ldlm_pl2ns(pl))) + ldlm_pool_recalc(pl, false); +} + +/** + * Returns current \a pl SLV. + * + * \pre ->pl_lock is not locked. + */ +__u64 ldlm_pool_get_slv(struct ldlm_pool *pl) +{ + __u64 slv; + + spin_lock(&pl->pl_lock); + slv = pl->pl_server_lock_volume; + spin_unlock(&pl->pl_lock); + return slv; +} + +/** + * Sets passed \a slv to \a pl. + * + * \pre ->pl_lock is not locked. + */ +void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv) +{ + spin_lock(&pl->pl_lock); + pl->pl_server_lock_volume = slv; + spin_unlock(&pl->pl_lock); +} + +/** + * Returns current \a pl CLV. + * + * \pre ->pl_lock is not locked. + */ +__u64 ldlm_pool_get_clv(struct ldlm_pool *pl) +{ + __u64 slv; + + spin_lock(&pl->pl_lock); + slv = pl->pl_client_lock_volume; + spin_unlock(&pl->pl_lock); + return slv; +} + +/** + * Sets passed \a clv to \a pl. + * + * \pre ->pl_lock is not locked. + */ +void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv) +{ + spin_lock(&pl->pl_lock); + pl->pl_client_lock_volume = clv; + spin_unlock(&pl->pl_lock); +} + +/** + * Returns current \a pl limit. + */ +__u32 ldlm_pool_get_limit(struct ldlm_pool *pl) +{ + return atomic_read(&pl->pl_limit); +} + +/** + * Sets passed \a limit to \a pl. + */ +void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit) +{ + atomic_set(&pl->pl_limit, limit); +} + +/** + * Returns current LVF from \a pl. + */ +__u32 ldlm_pool_get_lvf(struct ldlm_pool *pl) +{ + return atomic_read(&pl->pl_lock_volume_factor); +} + +/* + * count locks from all namespaces (if possible). Returns number of + * cached locks. + */ +static unsigned long ldlm_pools_count(enum ldlm_side client, gfp_t gfp_mask) +{ + unsigned long total = 0; + int nr_ns; + struct ldlm_namespace *ns; + struct ldlm_namespace *ns_old = NULL; /* loop detection */ + + if (client == LDLM_NAMESPACE_CLIENT && !(gfp_mask & __GFP_FS)) + return 0; + + /* + * Find out how many resources we may release. 
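+ * The loop walks at most ldlm_namespace_nr_read(client) namespaces,
+ * using @ns_old to detect a completed round-robin pass. Empty
+ * namespaces are parked on the inactive list, and a zero nr-to-scan
+ * ldlm_pool_shrink() call counts freeable locks without cancelling.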
+ */
+	for (nr_ns = ldlm_namespace_nr_read(client);
+	     nr_ns > 0; nr_ns--) {
+		mutex_lock(ldlm_namespace_lock(client));
+		if (list_empty(ldlm_namespace_list(client))) {
+			mutex_unlock(ldlm_namespace_lock(client));
+			return 0;
+		}
+		ns = ldlm_namespace_first_locked(client);
+
+		if (ns == ns_old) {
+			mutex_unlock(ldlm_namespace_lock(client));
+			break;
+		}
+
+		if (ldlm_ns_empty(ns)) {
+			ldlm_namespace_move_to_inactive_locked(ns, client);
+			mutex_unlock(ldlm_namespace_lock(client));
+			continue;
+		}
+
+		if (ns_old == NULL)
+			ns_old = ns;
+
+		ldlm_namespace_get(ns);
+		ldlm_namespace_move_to_active_locked(ns, client);
+		mutex_unlock(ldlm_namespace_lock(client));
+		total += ldlm_pool_shrink(&ns->ns_pool, 0, gfp_mask);
+		ldlm_namespace_put(ns);
+	}
+
+	return total;
+}
+
+static unsigned long ldlm_pools_scan(enum ldlm_side client, int nr,
+				     gfp_t gfp_mask)
+{
+	unsigned long freed = 0;
+	int tmp, nr_ns;
+	struct ldlm_namespace *ns;
+
+	if (client == LDLM_NAMESPACE_CLIENT && !(gfp_mask & __GFP_FS))
+		return -1;
+
+	/*
+	 * Shrink at least ldlm_namespace_nr_read(client) namespaces.
+	 */
+	for (tmp = nr_ns = ldlm_namespace_nr_read(client);
+	     tmp > 0; tmp--) {
+		int cancel, nr_locks;
+
+		/*
+		 * Do not call shrink under ldlm_namespace_lock(client).
+		 */
+		mutex_lock(ldlm_namespace_lock(client));
+		if (list_empty(ldlm_namespace_list(client))) {
+			mutex_unlock(ldlm_namespace_lock(client));
+			break;
+		}
+		ns = ldlm_namespace_first_locked(client);
+		ldlm_namespace_get(ns);
+		ldlm_namespace_move_to_active_locked(ns, client);
+		mutex_unlock(ldlm_namespace_lock(client));
+
+		nr_locks = ldlm_pool_granted(&ns->ns_pool);
+		/*
+		 * We used to shrink proportionally, but with the new
+		 * shrinker API we lost the total number of freeable locks.
+		 */
+		cancel = 1 + min_t(int, nr_locks, nr / nr_ns);
+		freed += ldlm_pool_shrink(&ns->ns_pool, cancel, gfp_mask);
+		ldlm_namespace_put(ns);
+	}
+	/*
+	 * We only decrease the SLV in the server pools shrinker; return
+	 * SHRINK_STOP to the kernel to avoid a needless loop. LU-1128
+	 */
+	return (client == LDLM_NAMESPACE_SERVER) ? SHRINK_STOP : freed;
+}
+
+#ifdef HAVE_SHRINKER_COUNT
+static unsigned long ldlm_pools_srv_count(struct shrinker *s,
+					  struct shrink_control *sc)
+{
+	return ldlm_pools_count(LDLM_NAMESPACE_SERVER, sc->gfp_mask);
+}
+
+static unsigned long ldlm_pools_srv_scan(struct shrinker *s,
+					 struct shrink_control *sc)
+{
+	return ldlm_pools_scan(LDLM_NAMESPACE_SERVER, sc->nr_to_scan,
+			       sc->gfp_mask);
+}
+
+static unsigned long ldlm_pools_cli_count(struct shrinker *s,
+					  struct shrink_control *sc)
+{
+	return ldlm_pools_count(LDLM_NAMESPACE_CLIENT, sc->gfp_mask);
+}
+
+static unsigned long ldlm_pools_cli_scan(struct shrinker *s,
+					 struct shrink_control *sc)
+{
+	return ldlm_pools_scan(LDLM_NAMESPACE_CLIENT, sc->nr_to_scan,
+			       sc->gfp_mask);
+}
+
+static struct shrinker ldlm_pools_srv_shrinker = {
+	.count_objects = ldlm_pools_srv_count,
+	.scan_objects = ldlm_pools_srv_scan,
+	.seeks = DEFAULT_SEEKS,
+};
+
+static struct shrinker ldlm_pools_cli_shrinker = {
+	.count_objects = ldlm_pools_cli_count,
+	.scan_objects = ldlm_pools_cli_scan,
+	.seeks = DEFAULT_SEEKS,
+};
+#else
+/*
+ * Cancel \a nr locks from all namespaces (if possible). Returns number of
+ * cached locks after shrink is finished. All namespaces are asked to
+ * cancel an approximately equal number of locks to keep balancing.
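+ *
+ * Under the legacy single-callback shrinker API this wrapper first
+ * counts cached locks via ldlm_pools_count() and then, when \a nr is
+ * non-zero and something is cached, cancels through ldlm_pools_scan().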
+ */ +static int ldlm_pools_shrink(enum ldlm_side client, int nr, gfp_t gfp_mask) +{ + unsigned long total = 0; + + if (client == LDLM_NAMESPACE_CLIENT && nr != 0 && + !(gfp_mask & __GFP_FS)) + return -1; + + total = ldlm_pools_count(client, gfp_mask); + + if (nr == 0 || total == 0) + return total; + + return ldlm_pools_scan(client, nr, gfp_mask); +} + +static int ldlm_pools_srv_shrink(struct shrinker *shrinker, + struct shrink_control *sc) +{ + return ldlm_pools_shrink(LDLM_NAMESPACE_SERVER, + sc->nr_to_scan, sc->gfp_mask); +} + +static int ldlm_pools_cli_shrink(struct shrinker *shrinker, + struct shrink_control *sc) +{ + return ldlm_pools_shrink(LDLM_NAMESPACE_CLIENT, + sc->nr_to_scan, sc->gfp_mask); +} + +static struct shrinker ldlm_pools_srv_shrinker = { + .shrink = ldlm_pools_srv_shrink, + .seeks = DEFAULT_SEEKS, +}; + +static struct shrinker ldlm_pools_cli_shrinker = { + .shrink = ldlm_pools_cli_shrink, + .seeks = DEFAULT_SEEKS, +}; +#endif /* HAVE_SHRINKER_COUNT */ + +static time64_t ldlm_pools_recalc_delay(enum ldlm_side side) +{ + struct ldlm_namespace *ns; + struct ldlm_namespace *ns_old = NULL; + /* seconds of sleep if no active namespaces */ + time64_t delay = ktime_get_seconds() + + (side == LDLM_NAMESPACE_SERVER ? + LDLM_POOL_SRV_DEF_RECALC_PERIOD : + LDLM_POOL_CLI_DEF_RECALC_PERIOD); + int nr; + + /* Recalc at least ldlm_namespace_nr(side) namespaces. */ + for (nr = ldlm_namespace_nr_read(side); nr > 0; nr--) { + int skip; + /* + * Lock the list, get first @ns in the list, getref, move it + * to the tail, unlock and call pool recalc. This way we avoid + * calling recalc under @ns lock, which is really good as we + * get rid of potential deadlock on side nodes when canceling + * locks synchronously. + */ + mutex_lock(ldlm_namespace_lock(side)); + if (list_empty(ldlm_namespace_list(side))) { + mutex_unlock(ldlm_namespace_lock(side)); + break; + } + ns = ldlm_namespace_first_locked(side); + + if (ns_old == ns) { /* Full pass complete */ + mutex_unlock(ldlm_namespace_lock(side)); + break; + } + + /* We got an empty namespace, need to move it back to inactive + * list. + * The race with parallel resource creation is fine: + * - If they do namespace_get before our check, we fail the + * check and they move this item to the end of the list anyway + * - If we do the check and then they do namespace_get, then + * we move the namespace to inactive and they will move + * it back to active (synchronised by the lock, so no clash + * there). + */ + if (ldlm_ns_empty(ns)) { + ldlm_namespace_move_to_inactive_locked(ns, side); + mutex_unlock(ldlm_namespace_lock(side)); + continue; + } + + if (ns_old == NULL) + ns_old = ns; + + spin_lock(&ns->ns_lock); + /* + * skip ns which is being freed, and we don't want to increase + * its refcount again, not even temporarily. bz21519 & LU-499. + */ + if (ns->ns_stopping) { + skip = 1; + } else { + skip = 0; + ldlm_namespace_get(ns); + } + spin_unlock(&ns->ns_lock); + + ldlm_namespace_move_to_active_locked(ns, side); + mutex_unlock(ldlm_namespace_lock(side)); + + /* + * After setup is done - recalc the pool. 
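+ * The smallest "next recalc" timestamp returned by any pool is
+ * propagated back to the caller, so the delayed work is rescheduled
+ * in time for the earliest pool that needs it.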
+ */ + if (!skip) { + delay = min(delay, + ldlm_pool_recalc(&ns->ns_pool, false)); + ldlm_namespace_put(ns); + } + } + + return delay; +} + +static void ldlm_pools_recalc_task(struct work_struct *ws); +static DECLARE_DELAYED_WORK(ldlm_pools_recalc_work, ldlm_pools_recalc_task); + +static void ldlm_pools_recalc_task(struct work_struct *ws) +{ + /* seconds of sleep if no active namespaces */ + time64_t delay; +#ifdef HAVE_SERVER_SUPPORT + struct ldlm_namespace *ns; + unsigned long nr_l = 0, nr_p = 0, l; + int equal = 0; + + /* Check all modest namespaces first. */ + mutex_lock(ldlm_namespace_lock(LDLM_NAMESPACE_SERVER)); + list_for_each_entry(ns, ldlm_namespace_list(LDLM_NAMESPACE_SERVER), + ns_list_chain) { + if (ns->ns_appetite != LDLM_NAMESPACE_MODEST) + continue; + + l = ldlm_pool_granted(&ns->ns_pool); + if (l == 0) + l = 1; + + /* + * Set the modest pools limit equal to their avg granted + * locks + ~6%. + */ + l += dru(l, LDLM_POOLS_MODEST_MARGIN_SHIFT, 0); + ldlm_pool_setup(&ns->ns_pool, l); + nr_l += l; + nr_p++; + } + + /* + * Make sure than modest namespaces did not eat more that 2/3 + * of limit. + */ + if (nr_l >= 2 * (LDLM_POOL_HOST_L / 3)) { + CWARN("'Modest' pools eat out 2/3 of server locks limit (%lu of %lu). This means that you have too many clients for this amount of server RAM. Upgrade server!\n", + nr_l, LDLM_POOL_HOST_L); + equal = 1; + } + + /* The rest is given to greedy namespaces. */ + list_for_each_entry(ns, ldlm_namespace_list(LDLM_NAMESPACE_SERVER), + ns_list_chain) { + if (!equal && ns->ns_appetite != LDLM_NAMESPACE_GREEDY) + continue; + + if (equal) { + /* + * In the case 2/3 locks are eaten out by + * modest pools, we re-setup equal limit + * for _all_ pools. + */ + l = LDLM_POOL_HOST_L / + ldlm_namespace_nr_read(LDLM_NAMESPACE_SERVER); + } else { + /* + * All the rest of greedy pools will have + * all locks in equal parts. + */ + l = (LDLM_POOL_HOST_L - nr_l) / + (ldlm_namespace_nr_read(LDLM_NAMESPACE_SERVER) - + nr_p); + } + ldlm_pool_setup(&ns->ns_pool, l); + } + mutex_unlock(ldlm_namespace_lock(LDLM_NAMESPACE_SERVER)); + + delay = min(ldlm_pools_recalc_delay(LDLM_NAMESPACE_SERVER), + ldlm_pools_recalc_delay(LDLM_NAMESPACE_CLIENT)); +#else /* !HAVE_SERVER_SUPPORT */ + delay = ldlm_pools_recalc_delay(LDLM_NAMESPACE_CLIENT); +#endif /* HAVE_SERVER_SUPPORT */ + + /* Wake up the blocking threads from time to time. */ + ldlm_bl_thread_wakeup(); + + delay -= ktime_get_seconds(); + if (delay <= 0) { + /* Prevent too frequent recalculation. 
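+		 * A non-positive value here means the recalc pass took
+		 * longer than the configured interval; clamp to 1 second
+		 * so the worker does not busy-loop on rescheduling.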
*/ + CDEBUG(D_DLMTRACE, "Negative interval(%lld)\n", delay); + delay = 1; + } + + schedule_delayed_work(&ldlm_pools_recalc_work, cfs_time_seconds(delay)); +} + +static bool ldlm_pools_init_done; + +int ldlm_pools_init(void) +{ + time64_t delay; + int rc; + +#ifdef HAVE_SERVER_SUPPORT + delay = min(LDLM_POOL_SRV_DEF_RECALC_PERIOD, + LDLM_POOL_CLI_DEF_RECALC_PERIOD); +#else + delay = LDLM_POOL_CLI_DEF_RECALC_PERIOD; +#endif + + rc = register_shrinker(&ldlm_pools_srv_shrinker); + if (rc) + goto out; + + rc = register_shrinker(&ldlm_pools_cli_shrinker); + if (rc) + goto out_shrinker; + + schedule_delayed_work(&ldlm_pools_recalc_work, delay); + ldlm_pools_init_done = true; + return 0; + +out_shrinker: + unregister_shrinker(&ldlm_pools_cli_shrinker); +out: + return rc; +} + +void ldlm_pools_fini(void) +{ + if (ldlm_pools_init_done) { + unregister_shrinker(&ldlm_pools_srv_shrinker); + unregister_shrinker(&ldlm_pools_cli_shrinker); + + cancel_delayed_work_sync(&ldlm_pools_recalc_work); + } + + ldlm_pools_init_done = false; +} + +#else /* !HAVE_LRU_RESIZE_SUPPORT */ +int ldlm_pool_setup(struct ldlm_pool *pl, int limit) +{ + return 0; +} + +time64_t ldlm_pool_recalc(struct ldlm_pool *pl, bool force) +{ + return 0; +} + +int ldlm_pool_shrink(struct ldlm_pool *pl, + int nr, gfp_t gfp_mask) +{ + return 0; +} + +int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns, + int idx, enum ldlm_side client) +{ + return 0; +} + +void ldlm_pool_fini(struct ldlm_pool *pl) +{ + return; +} + +void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock) +{ + return; +} + +void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock) +{ + return; +} + +__u64 ldlm_pool_get_slv(struct ldlm_pool *pl) +{ + return 1; +} + +void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv) +{ + return; +} + +__u64 ldlm_pool_get_clv(struct ldlm_pool *pl) +{ + return 1; +} + +void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv) +{ + return; +} + +__u32 ldlm_pool_get_limit(struct ldlm_pool *pl) +{ + return 0; +} + +void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit) +{ + return; +} + +__u32 ldlm_pool_get_lvf(struct ldlm_pool *pl) +{ + return 0; +} + +int ldlm_pools_init(void) +{ + return 0; +} + +void ldlm_pools_fini(void) +{ + return; +} + +#endif /* HAVE_LRU_RESIZE_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_reclaim.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_reclaim.c new file mode 100644 index 0000000000000..d371dc2cade21 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_reclaim.c @@ -0,0 +1,415 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2015, Intel Corporation. + * Use is subject to license terms. 
+ *
+ * Author: Niu Yawei
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+#include
+#include
+#include
+#include "ldlm_internal.h"
+
+/*
+ * To avoid ldlm locks exhausting server memory, two global parameters,
+ * ldlm_reclaim_threshold & ldlm_lock_limit, are used for reclaiming
+ * granted locks and rejecting incoming enqueue requests defensively.
+ *
+ * ldlm_reclaim_threshold: when the number of granted locks reaches this
+ * threshold, the server starts to revoke locks gradually.
+ *
+ * ldlm_lock_limit: when the number of granted locks reaches this
+ * threshold, the server returns -EINPROGRESS to any incoming enqueue
+ * request until the lock count is shrunk below the threshold again.
+ *
+ * ldlm_reclaim_threshold & ldlm_lock_limit are set to 20% & 30% of the
+ * total memory by default. Both are tunable via proc entries; when set
+ * to 0, the feature is disabled.
+ */
+
+#ifdef HAVE_SERVER_SUPPORT
+
+/* Lock count is stored in ldlm_reclaim_threshold & ldlm_lock_limit */
+__u64 ldlm_reclaim_threshold;
+__u64 ldlm_lock_limit;
+
+/* Represents ldlm_reclaim_threshold & ldlm_lock_limit in MB, used for
+ * the proc interface. */
+__u64 ldlm_reclaim_threshold_mb;
+__u64 ldlm_lock_limit_mb;
+
+struct percpu_counter ldlm_granted_total;
+static atomic_t ldlm_nr_reclaimer;
+static s64 ldlm_last_reclaim_age_ns;
+static ktime_t ldlm_last_reclaim_time;
+
+struct ldlm_reclaim_cb_data {
+	struct list_head rcd_rpc_list;
+	int rcd_added;
+	int rcd_total;
+	int rcd_cursor;
+	int rcd_start;
+	bool rcd_skip;
+	s64 rcd_age_ns;
+	struct cfs_hash_bd *rcd_prev_bd;
+};
+
+static inline bool ldlm_lock_reclaimable(struct ldlm_lock *lock)
+{
+	struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+
+	/* FLOCK & PLAIN locks are not reclaimable: FLOCK is
+	 * explicitly controlled by the application, and PLAIN locks
+	 * are used by the quota global lock and config lock.
+	 */
+	if (ns->ns_client == LDLM_NAMESPACE_SERVER &&
+	    (lock->l_resource->lr_type == LDLM_IBITS ||
+	     lock->l_resource->lr_type == LDLM_EXTENT))
+		return true;
+	return false;
+}
+
+/**
+ * Callback function for revoking locks from a certain resource.
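+ *
+ * Scanning resumes from the per-bucket nsb_reclaim_start cursor when
+ * rcd_skip is set, so successive reclaim passes spread revocation
+ * across resources instead of always hitting the first ones.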
+ * + * \param [in] hs ns_rs_hash + * \param [in] bd current bucket of ns_rsh_hash + * \param [in] hnode hnode of the resource + * \param [in] arg opaque data + * + * \retval 0 continue the scan + * \retval 1 stop the iteration + */ +static int ldlm_reclaim_lock_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) + +{ + struct ldlm_resource *res; + struct ldlm_reclaim_cb_data *data; + struct ldlm_lock *lock; + struct ldlm_ns_bucket *nsb; + int rc = 0; + + data = (struct ldlm_reclaim_cb_data *)arg; + + LASSERTF(data->rcd_added < data->rcd_total, "added:%d >= total:%d\n", + data->rcd_added, data->rcd_total); + + nsb = cfs_hash_bd_extra_get(hs, bd); + res = cfs_hash_object(hs, hnode); + + if (data->rcd_prev_bd != bd) { + if (data->rcd_prev_bd != NULL) + ldlm_res_to_ns(res)->ns_reclaim_start++; + data->rcd_prev_bd = bd; + data->rcd_cursor = 0; + data->rcd_start = nsb->nsb_reclaim_start % + cfs_hash_bd_count_get(bd); + } + + if (data->rcd_skip && data->rcd_cursor < data->rcd_start) { + data->rcd_cursor++; + return 0; + } + + nsb->nsb_reclaim_start++; + + lock_res(res); + list_for_each_entry(lock, &res->lr_granted, l_res_link) { + if (!ldlm_lock_reclaimable(lock)) + continue; + + if (!OBD_FAIL_CHECK(OBD_FAIL_LDLM_WATERMARK_LOW) && + ktime_before(ktime_get(), + ktime_add_ns(lock->l_last_used, + data->rcd_age_ns))) + continue; + + if (!ldlm_is_ast_sent(lock)) { + ldlm_set_ast_sent(lock); + LASSERT(list_empty(&lock->l_rk_ast)); + list_add(&lock->l_rk_ast, &data->rcd_rpc_list); + LDLM_LOCK_GET(lock); + if (++data->rcd_added == data->rcd_total) { + rc = 1; /* stop the iteration */ + break; + } + } + } + unlock_res(res); + + return rc; +} + +/** + * Revoke locks from the resources of a namespace in a roundrobin + * manner. + * + * \param[in] ns namespace to do the lock revoke on + * \param[in] count count of lock to be revoked + * \param[in] age only revoke locks older than the 'age' + * \param[in] skip scan from the first lock on resource if the + * 'skip' is false, otherwise, continue scan + * from the last scanned position + * \param[out] count count of lock still to be revoked + */ +static void ldlm_reclaim_res(struct ldlm_namespace *ns, int *count, + s64 age_ns, bool skip) +{ + struct ldlm_reclaim_cb_data data; + int idx, type, start; + int rc; + ENTRY; + + LASSERT(*count != 0); + + if (ns->ns_obd) { + type = server_name2index(ns->ns_obd->obd_name, &idx, NULL); + if (type != LDD_F_SV_TYPE_MDT && type != LDD_F_SV_TYPE_OST) { + EXIT; + return; + } + } + + if (atomic_read(&ns->ns_bref) == 0) { + EXIT; + return; + } + + INIT_LIST_HEAD(&data.rcd_rpc_list); + data.rcd_added = 0; + data.rcd_total = *count; + data.rcd_age_ns = age_ns; + data.rcd_skip = skip; + data.rcd_prev_bd = NULL; + start = ns->ns_reclaim_start % CFS_HASH_NBKT(ns->ns_rs_hash); + + cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_reclaim_lock_cb, &data, + start); + + CDEBUG(D_DLMTRACE, "NS(%s): %d locks to be reclaimed, found %d/%d " + "locks.\n", ldlm_ns_name(ns), *count, data.rcd_added, + data.rcd_total); + + LASSERTF(*count >= data.rcd_added, "count:%d, added:%d\n", *count, + data.rcd_added); + + rc = ldlm_run_ast_work(ns, &data.rcd_rpc_list, LDLM_WORK_REVOKE_AST); + if (rc == -ERESTART) + ldlm_reprocess_recovery_done(ns); + + *count -= data.rcd_added; + EXIT; +} + +#define LDLM_RECLAIM_BATCH 512 +#define LDLM_RECLAIM_AGE_MIN (300 * NSEC_PER_SEC) +#define LDLM_RECLAIM_AGE_MAX (LDLM_DEFAULT_MAX_ALIVE * NSEC_PER_SEC * 3 / 4) + +static inline s64 ldlm_reclaim_age(void) +{ + s64 age_ns = 
ldlm_last_reclaim_age_ns; + ktime_t now = ktime_get(); + ktime_t diff; + + diff = ktime_sub(now, ldlm_last_reclaim_time); + age_ns += ktime_to_ns(diff); + if (age_ns > LDLM_RECLAIM_AGE_MAX) + age_ns = LDLM_RECLAIM_AGE_MAX; + else if (age_ns < (LDLM_RECLAIM_AGE_MIN * 2)) + age_ns = LDLM_RECLAIM_AGE_MIN; + return age_ns; +} + +/** + * Revoke certain amount of locks from all the server namespaces + * in a roundrobin manner. Lock age is used to avoid reclaim on + * the non-aged locks. + */ +static void ldlm_reclaim_ns(void) +{ + struct ldlm_namespace *ns; + int count = LDLM_RECLAIM_BATCH; + int ns_nr, nr_processed; + enum ldlm_side ns_cli = LDLM_NAMESPACE_SERVER; + s64 age_ns; + bool skip = true; + ENTRY; + + if (!atomic_add_unless(&ldlm_nr_reclaimer, 1, 1)) { + EXIT; + return; + } + + age_ns = ldlm_reclaim_age(); +again: + nr_processed = 0; + ns_nr = ldlm_namespace_nr_read(ns_cli); + while (count > 0 && nr_processed < ns_nr) { + mutex_lock(ldlm_namespace_lock(ns_cli)); + + if (list_empty(ldlm_namespace_list(ns_cli))) { + mutex_unlock(ldlm_namespace_lock(ns_cli)); + goto out; + } + + ns = ldlm_namespace_first_locked(ns_cli); + ldlm_namespace_move_to_active_locked(ns, ns_cli); + mutex_unlock(ldlm_namespace_lock(ns_cli)); + + ldlm_reclaim_res(ns, &count, age_ns, skip); + ldlm_namespace_put(ns); + nr_processed++; + } + + if (count > 0 && age_ns > LDLM_RECLAIM_AGE_MIN) { + age_ns >>= 1; + if (age_ns < (LDLM_RECLAIM_AGE_MIN * 2)) + age_ns = LDLM_RECLAIM_AGE_MIN; + skip = false; + goto again; + } + + ldlm_last_reclaim_age_ns = age_ns; + ldlm_last_reclaim_time = ktime_get(); +out: + atomic_add_unless(&ldlm_nr_reclaimer, -1, 0); + EXIT; +} + +void ldlm_reclaim_add(struct ldlm_lock *lock) +{ + if (!ldlm_lock_reclaimable(lock)) + return; + percpu_counter_add(&ldlm_granted_total, 1); + lock->l_last_used = ktime_get(); +} + +void ldlm_reclaim_del(struct ldlm_lock *lock) +{ + if (!ldlm_lock_reclaimable(lock)) + return; + percpu_counter_sub(&ldlm_granted_total, 1); +} + +/** + * Check on the total granted locks: return true if it reaches the + * high watermark (ldlm_lock_limit), otherwise return false; It also + * triggers lock reclaim if the low watermark (ldlm_reclaim_threshold) + * is reached. + * + * \retval true high watermark reached. + * \retval false high watermark not reached. 
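+ *
+ * A minimal caller sketch (hypothetical; mirrors the -EINPROGRESS
+ * behaviour described at the top of this file):
+ *
+ *	if (ldlm_reclaim_full())
+ *		RETURN(-EINPROGRESS);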
+ */ +bool ldlm_reclaim_full(void) +{ + __u64 high = ldlm_lock_limit; + __u64 low = ldlm_reclaim_threshold; + + if (low != 0 && OBD_FAIL_CHECK(OBD_FAIL_LDLM_WATERMARK_LOW)) + low = cfs_fail_val; + + if (low != 0 && + percpu_counter_sum_positive(&ldlm_granted_total) > low) + ldlm_reclaim_ns(); + + if (high != 0 && OBD_FAIL_CHECK(OBD_FAIL_LDLM_WATERMARK_HIGH)) + high = cfs_fail_val; + + if (high != 0 && + percpu_counter_sum_positive(&ldlm_granted_total) > high) + return true; + + return false; +} + +static inline __u64 ldlm_ratio2locknr(int ratio) +{ + __u64 locknr; + + locknr = ((__u64)NUM_CACHEPAGES << PAGE_SHIFT) * ratio; + do_div(locknr, 100 * sizeof(struct ldlm_lock)); + + return locknr; +} + +static inline __u64 ldlm_locknr2mb(__u64 locknr) +{ + return (locknr * sizeof(struct ldlm_lock) + 512 * 1024) >> 20; +} + +#define LDLM_WM_RATIO_LOW_DEFAULT 20 +#define LDLM_WM_RATIO_HIGH_DEFAULT 30 + +int ldlm_reclaim_setup(void) +{ + atomic_set(&ldlm_nr_reclaimer, 0); + + ldlm_reclaim_threshold = ldlm_ratio2locknr(LDLM_WM_RATIO_LOW_DEFAULT); + ldlm_reclaim_threshold_mb = ldlm_locknr2mb(ldlm_reclaim_threshold); + ldlm_lock_limit = ldlm_ratio2locknr(LDLM_WM_RATIO_HIGH_DEFAULT); + ldlm_lock_limit_mb = ldlm_locknr2mb(ldlm_lock_limit); + + ldlm_last_reclaim_age_ns = LDLM_RECLAIM_AGE_MAX; + ldlm_last_reclaim_time = ktime_get(); + +#ifdef HAVE_PERCPU_COUNTER_INIT_GFP_FLAG + return percpu_counter_init(&ldlm_granted_total, 0, GFP_KERNEL); +#else + return percpu_counter_init(&ldlm_granted_total, 0); +#endif +} + +void ldlm_reclaim_cleanup(void) +{ + percpu_counter_destroy(&ldlm_granted_total); +} + +#else /* HAVE_SERVER_SUPPORT */ + +bool ldlm_reclaim_full(void) +{ + return false; +} + +void ldlm_reclaim_add(struct ldlm_lock *lock) +{ +} + +void ldlm_reclaim_del(struct ldlm_lock *lock) +{ +} + +int ldlm_reclaim_setup(void) +{ + return 0; +} + +void ldlm_reclaim_cleanup(void) +{ +} + +#endif /* HAVE_SERVER_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_request.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_request.c new file mode 100644 index 0000000000000..38ca9ed9caad6 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_request.c @@ -0,0 +1,2650 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ +/** + * This file contains Asynchronous System Trap (AST) handlers and related + * LDLM request-processing routines. + * + * An AST is a callback issued on a lock when its state is changed. 
There are + * several different types of ASTs (callbacks) registered for each lock: + * + * - completion AST: when a lock is enqueued by some process, but cannot be + * granted immediately due to other conflicting locks on the same resource, + * the completion AST is sent to notify the caller when the lock is + * eventually granted + * + * - blocking AST: when a lock is granted to some process, if another process + * enqueues a conflicting (blocking) lock on a resource, a blocking AST is + * sent to notify the holder(s) of the lock(s) of the conflicting lock + * request. The lock holder(s) must release their lock(s) on that resource in + * a timely manner or be evicted by the server. + * + * - glimpse AST: this is used when a process wants information about a lock + * (i.e. the lock value block (LVB)) but does not necessarily require holding + * the lock. If the resource is locked, the lock holder(s) are sent glimpse + * ASTs and the LVB is returned to the caller, and lock holder(s) may CANCEL + * their lock(s) if they are idle. If the resource is not locked, the server + * may grant the lock. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include +#include +#include + +#include "ldlm_internal.h" + +unsigned int ldlm_enqueue_min = OBD_TIMEOUT_DEFAULT; +module_param(ldlm_enqueue_min, uint, 0644); +MODULE_PARM_DESC(ldlm_enqueue_min, "lock enqueue timeout minimum"); +EXPORT_SYMBOL(ldlm_enqueue_min); + +/* in client side, whether the cached locks will be canceled before replay */ +unsigned int ldlm_cancel_unused_locks_before_replay = 1; + +struct lock_wait_data { + struct ldlm_lock *lwd_lock; + __u32 lwd_conn_cnt; +}; + +struct ldlm_async_args { + struct lustre_handle lock_handle; +}; + +/** + * ldlm_request_bufsize + * + * If opcode=LDLM_ENQUEUE, 1 slot is already occupied, + * LDLM_LOCKREQ_HANDLE -1 slots are available. + * Otherwise, LDLM_LOCKREQ_HANDLE slots are available. 
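+ *
+ * Worked example matching the code below: for LDLM_ENQUEUE the buffer
+ * size works out to
+ *	sizeof(struct ldlm_request) +
+ *	max(0, count - (LDLM_LOCKREQ_HANDLES - LDLM_ENQUEUE_CANCEL_OFF)) *
+ *	sizeof(struct lustre_handle)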
+ * + * \param[in] count + * \param[in] type + * + * \retval size of the request buffer + */ +int ldlm_request_bufsize(int count, int type) +{ + int avail = LDLM_LOCKREQ_HANDLES; + + if (type == LDLM_ENQUEUE) + avail -= LDLM_ENQUEUE_CANCEL_OFF; + + if (count > avail) + avail = (count - avail) * sizeof(struct lustre_handle); + else + avail = 0; + + return sizeof(struct ldlm_request) + avail; +} + +void ldlm_expired_completion_wait(struct lock_wait_data *lwd) +{ + struct ldlm_lock *lock = lwd->lwd_lock; + struct obd_import *imp; + struct obd_device *obd; + + ENTRY; + if (lock->l_conn_export == NULL) { + static time64_t next_dump, last_dump; + + LDLM_ERROR(lock, + "lock timed out (enqueued at %lld, %llds ago); not entering recovery in server code, just going back to sleep", + lock->l_activity, + ktime_get_real_seconds() - lock->l_activity); + if (ktime_get_seconds() > next_dump) { + last_dump = next_dump; + next_dump = ktime_get_seconds() + 300; + ldlm_namespace_dump(D_DLMTRACE, + ldlm_lock_to_ns(lock)); + if (last_dump == 0) + libcfs_debug_dumplog(); + } + RETURN_EXIT; + } + + obd = lock->l_conn_export->exp_obd; + imp = obd->u.cli.cl_import; + ptlrpc_fail_import(imp, lwd->lwd_conn_cnt); + LDLM_ERROR(lock, + "lock timed out (enqueued at %lld, %llds ago), entering recovery for %s@%s", + lock->l_activity, + ktime_get_real_seconds() - lock->l_activity, + obd2cli_tgt(obd), imp->imp_connection->c_remote_uuid.uuid); + + EXIT; +} + +int is_granted_or_cancelled_nolock(struct ldlm_lock *lock) +{ + int ret = 0; + + check_res_locked(lock->l_resource); + if (ldlm_is_granted(lock) && !ldlm_is_cp_reqd(lock)) + ret = 1; + else if (ldlm_is_failed(lock) || ldlm_is_cancel(lock)) + ret = 1; + return ret; +} +EXPORT_SYMBOL(is_granted_or_cancelled_nolock); + +/** + * Calculate the Completion timeout (covering enqueue, BL AST, data flush, + * lock cancel, and their replies). Used for lock completion timeout on the + * client side. + * + * \param[in] lock lock which is waiting the completion callback + * + * \retval timeout in seconds to wait for the server reply + */ +/* + * We use the same basis for both server side and client side functions + * from a single node. + */ +static timeout_t ldlm_cp_timeout(struct ldlm_lock *lock) +{ + timeout_t timeout; + + if (AT_OFF) + return obd_timeout; + + /* + * Wait a long time for enqueue - server may have to callback a + * lock from another client. Server will evict the other client if it + * doesn't respond reasonably, and then give us the lock. + */ + timeout = at_get(ldlm_lock_to_ns_at(lock)); + return max(3 * timeout, (timeout_t)ldlm_enqueue_min); +} + +/** + * Helper function for ldlm_completion_ast(), updating timings when lock is + * actually granted. + */ +static int ldlm_completion_tail(struct ldlm_lock *lock, void *data) +{ + int result = 0; + + if (ldlm_is_destroyed(lock) || ldlm_is_failed(lock)) { + LDLM_DEBUG(lock, "client-side enqueue: destroyed"); + result = -EIO; + } else if (data == NULL) { + LDLM_DEBUG(lock, "client-side enqueue: granted"); + } else { + /* Take into AT only CP RPC, not immediately granted locks */ + timeout_t delay = 0; + + /* Discard negative timeouts. 
We should also limit the
+		 * maximum value of the timeout.
+		 */
+		if (ktime_get_real_seconds() > lock->l_activity)
+			delay = ktime_get_real_seconds() - lock->l_activity;
+
+		LDLM_DEBUG(lock, "client-side enqueue: granted after %ds",
+			   delay);
+		/* Update our time estimate */
+		at_measured(ldlm_lock_to_ns_at(lock), delay);
+	}
+	return result;
+}
+
+/**
+ * Implementation of ->l_completion_ast() for a client, that doesn't wait
+ * until lock is granted. Suitable for locks enqueued through ptlrpcd, or
+ * other threads that cannot block for long.
+ */
+int ldlm_completion_ast_async(struct ldlm_lock *lock, __u64 flags, void *data)
+{
+	ENTRY;
+
+	if (flags == LDLM_FL_WAIT_NOREPROC) {
+		LDLM_DEBUG(lock, "client-side enqueue waiting on pending lock");
+		RETURN(0);
+	}
+
+	if (!(flags & LDLM_FL_BLOCKED_MASK)) {
+		wake_up(&lock->l_waitq);
+		RETURN(ldlm_completion_tail(lock, data));
+	}
+
+	LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, going forward");
+	ldlm_reprocess_all(lock->l_resource, 0);
+	RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_completion_ast_async);
+
+/**
+ * Generic LDLM "completion" AST. This is called in several cases:
+ *
+ * - when a reply to an ENQUEUE RPC is received from the server
+ *   (ldlm_cli_enqueue_fini()). Lock might be granted or not granted at
+ *   this point (determined by flags);
+ *
+ * - when LDLM_CP_CALLBACK RPC comes to client to notify it that lock has
+ *   been granted;
+ *
+ * - when ldlm_lock_match(LDLM_FL_LVB_READY) is about to wait until lock
+ *   gets correct lvb;
+ *
+ * - to force all locks when resource is destroyed (cleanup_resource());
+ *
+ * If the lock is not granted in the first case, this function waits until
+ * the second or penultimate case happens in some other thread.
+ */
+int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data)
+{
+	/* XXX ALLOCATE - 160 bytes */
+	struct lock_wait_data lwd;
+	struct obd_device *obd;
+	struct obd_import *imp = NULL;
+	timeout_t timeout;
+	int rc = 0;
+
+	ENTRY;
+
+	if (flags == LDLM_FL_WAIT_NOREPROC) {
+		LDLM_DEBUG(lock, "client-side enqueue waiting on pending lock");
+		goto noreproc;
+	}
+
+	if (!(flags & LDLM_FL_BLOCKED_MASK)) {
+		wake_up(&lock->l_waitq);
+		RETURN(0);
+	}
+
+	LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, sleeping");
+
+noreproc:
+
+	obd = class_exp2obd(lock->l_conn_export);
+
+	/* if this is a local lock, then there is no import */
+	if (obd != NULL)
+		imp = obd->u.cli.cl_import;
+
+	timeout = ldlm_cp_timeout(lock);
+
+	lwd.lwd_lock = lock;
+	lock->l_activity = ktime_get_real_seconds();
+
+	if (imp != NULL) {
+		spin_lock(&imp->imp_lock);
+		lwd.lwd_conn_cnt = imp->imp_conn_cnt;
+		spin_unlock(&imp->imp_lock);
+	}
+
+	if (ns_is_client(ldlm_lock_to_ns(lock)) &&
+	    OBD_FAIL_CHECK_RESET(OBD_FAIL_LDLM_INTR_CP_AST,
+				 OBD_FAIL_LDLM_CP_BL_RACE | OBD_FAIL_ONCE)) {
+		ldlm_set_fail_loc(lock);
+		rc = -EINTR;
+	} else {
+		/* Go to sleep until the lock is granted or cancelled.
*/ + if (ldlm_is_no_timeout(lock)) { + LDLM_DEBUG(lock, "waiting indefinitely because of NO_TIMEOUT"); + rc = l_wait_event_abortable( + lock->l_waitq, + is_granted_or_cancelled(lock)); + } else { + if (wait_event_idle_timeout( + lock->l_waitq, + is_granted_or_cancelled(lock), + cfs_time_seconds(timeout)) == 0) { + ldlm_expired_completion_wait(&lwd); + rc = l_wait_event_abortable( + lock->l_waitq, + is_granted_or_cancelled(lock)); + } + } + } + + if (rc) { + LDLM_DEBUG(lock, "client-side enqueue waking up: failed (%d)", + rc); + RETURN(rc); + } + + RETURN(ldlm_completion_tail(lock, data)); +} +EXPORT_SYMBOL(ldlm_completion_ast); + +/** + * A helper to build a blocking AST function + * + * Perform a common operation for blocking ASTs: + * defferred lock cancellation. + * + * \param lock the lock blocking or canceling AST was called on + * \retval 0 + * \see mdt_blocking_ast + * \see ldlm_blocking_ast + */ +int ldlm_blocking_ast_nocheck(struct ldlm_lock *lock) +{ + int do_ast; + + ENTRY; + + ldlm_set_cbpending(lock); + do_ast = (!lock->l_readers && !lock->l_writers); + unlock_res_and_lock(lock); + + if (do_ast) { + struct lustre_handle lockh; + int rc; + + LDLM_DEBUG(lock, "already unused, calling ldlm_cli_cancel"); + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); + if (rc < 0) + CERROR("ldlm_cli_cancel: %d\n", rc); + } else { + LDLM_DEBUG(lock, + "Lock still has references, will be cancelled later"); + } + RETURN(0); +} +EXPORT_SYMBOL(ldlm_blocking_ast_nocheck); + +/** + * Server blocking AST + * + * ->l_blocking_ast() callback for LDLM locks acquired by server-side + * OBDs. + * + * \param lock the lock which blocks a request or cancelling lock + * \param desc unused + * \param data unused + * \param flag indicates whether this cancelling or blocking callback + * \retval 0 + * \see ldlm_blocking_ast_nocheck + */ +int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag) +{ + ENTRY; + + if (flag == LDLM_CB_CANCELING) { + /* Don't need to do anything here. */ + RETURN(0); + } + + lock_res_and_lock(lock); + /* + * Get this: if ldlm_blocking_ast is racing with intent_policy, such + * that ldlm_blocking_ast is called just before intent_policy method + * takes the lr_lock, then by the time we get the lock, we might not + * be the correct blocking function anymore. So check, and return + * early, if so. + */ + if (lock->l_blocking_ast != ldlm_blocking_ast) { + unlock_res_and_lock(lock); + RETURN(0); + } + RETURN(ldlm_blocking_ast_nocheck(lock)); +} +EXPORT_SYMBOL(ldlm_blocking_ast); + +/** + * Implements ldlm_lock::l_glimpse_ast for extent locks acquired on the server. + * + * Returning -ELDLM_NO_LOCK_DATA actually works, but the reason for that is + * rather subtle: with OST-side locking, it may so happen that _all_ extent + * locks are held by the OST. If client wants to obtain the current file size + * it calls ll_glimpse_size(), and (as all locks are held only on the server), + * this dummy glimpse callback fires and does nothing. The client still + * receives the correct file size due to the following fragment of code in + * ldlm_cb_interpret(): + * + * if (rc == -ELDLM_NO_LOCK_DATA) { + * LDLM_DEBUG(lock, "lost race - client has a lock but no" + * "inode"); + * ldlm_res_lvbo_update(lock->l_resource, NULL, 1); + * } + * + * That is, after the glimpse returns this error, ofd_lvbo_update() is called + * and returns the updated file attributes from the inode to the client. 
+ * + * See also comment in ofd_intent_policy() on why servers must set a non-NULL + * l_glimpse_ast when grabbing DLM locks. Otherwise, the server will assume + * that the object is in the process of being destroyed. + * + * \param[in] lock DLM lock being glimpsed, unused + * \param[in] reqp pointer to ptlrpc_request, unused + * + * \retval -ELDLM_NO_LOCK_DATA to get attributes from disk object + */ +int ldlm_glimpse_ast(struct ldlm_lock *lock, void *reqp) +{ + return -ELDLM_NO_LOCK_DATA; +} + +/** + * Enqueue a local lock (typically on a server). + */ +int ldlm_cli_enqueue_local(const struct lu_env *env, + struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + enum ldlm_type type, union ldlm_policy_data *policy, + enum ldlm_mode mode, __u64 *flags, + ldlm_blocking_callback blocking, + ldlm_completion_callback completion, + ldlm_glimpse_callback glimpse, + void *data, __u32 lvb_len, enum lvb_type lvb_type, + const __u64 *client_cookie, + struct lustre_handle *lockh) +{ + struct ldlm_lock *lock; + int err; + const struct ldlm_callback_suite cbs = { .lcs_completion = completion, + .lcs_blocking = blocking, + .lcs_glimpse = glimpse, + }; + + ENTRY; + + LASSERT(!(*flags & LDLM_FL_REPLAY)); + if (unlikely(ns_is_client(ns))) { + CERROR("Trying to enqueue local lock in a shadow namespace\n"); + LBUG(); + } + + lock = ldlm_lock_create(ns, res_id, type, mode, &cbs, data, lvb_len, + lvb_type); + if (IS_ERR(lock)) + GOTO(out_nolock, err = PTR_ERR(lock)); + + err = ldlm_lvbo_init(lock->l_resource); + if (err < 0) { + LDLM_ERROR(lock, "delayed lvb init failed (rc %d)", err); + ldlm_lock_destroy_nolock(lock); + GOTO(out, err); + } + + ldlm_lock2handle(lock, lockh); + + /* + * NB: we don't have any lock now (lock_res_and_lock) + * because it's a new lock + */ + ldlm_lock_addref_internal_nolock(lock, mode); + ldlm_set_local(lock); + if (*flags & LDLM_FL_ATOMIC_CB) + ldlm_set_atomic_cb(lock); + + if (*flags & LDLM_FL_CANCEL_ON_BLOCK) + ldlm_set_cancel_on_block(lock); + + if (policy != NULL) + lock->l_policy_data = *policy; + if (client_cookie != NULL) + lock->l_client_cookie = *client_cookie; + if (type == LDLM_EXTENT) { + /* extent lock without policy is a bug */ + if (policy == NULL) + LBUG(); + + lock->l_req_extent = policy->l_extent; + } + + err = ldlm_lock_enqueue(env, ns, &lock, policy, flags); + if (unlikely(err != ELDLM_OK)) + GOTO(out, err); + + if (policy != NULL) + *policy = lock->l_policy_data; + + if (lock->l_completion_ast) + lock->l_completion_ast(lock, *flags, NULL); + + LDLM_DEBUG(lock, "client-side local enqueue handler, new lock created"); + EXIT; + out: + LDLM_LOCK_RELEASE(lock); + out_nolock: + return err; +} +EXPORT_SYMBOL(ldlm_cli_enqueue_local); + +static void failed_lock_cleanup(struct ldlm_namespace *ns, + struct ldlm_lock *lock, int mode) +{ + int need_cancel = 0; + + /* Set a flag to prevent us from sending a CANCEL (b=407) */ + lock_res_and_lock(lock); + /* Check that lock is not granted or failed, we might race. */ + if (!ldlm_is_granted(lock) && !ldlm_is_failed(lock)) { + /* + * Make sure that this lock will not be found by raced + * bl_ast and -EINVAL reply is sent to server anyways. 
+ * b=17645 + */ + lock->l_flags |= LDLM_FL_FAILED | + LDLM_FL_ATOMIC_CB | LDLM_FL_CBPENDING; + if (!(ldlm_is_bl_ast(lock) && + lock->l_remote_handle.cookie != 0)) + lock->l_flags |= LDLM_FL_LOCAL_ONLY; + need_cancel = 1; + } + unlock_res_and_lock(lock); + + if (need_cancel) + LDLM_DEBUG(lock, + "setting FL_LOCAL_ONLY | LDLM_FL_FAILED | LDLM_FL_ATOMIC_CB | LDLM_FL_CBPENDING"); + else + LDLM_DEBUG(lock, "lock was granted or failed in race"); + + /* + * XXX - HACK because we shouldn't call ldlm_lock_destroy() + * from llite/file.c/ll_file_flock(). + */ + /* + * This code makes for the fact that we do not have blocking handler on + * a client for flock locks. As such this is the place where we must + * completely kill failed locks. (interrupted and those that + * were waiting to be granted when server evicted us. + */ + if (lock->l_resource->lr_type == LDLM_FLOCK) { + lock_res_and_lock(lock); + if (!ldlm_is_destroyed(lock)) { + ldlm_resource_unlink_lock(lock); + ldlm_lock_decref_internal_nolock(lock, mode); + ldlm_lock_destroy_nolock(lock); + } + unlock_res_and_lock(lock); + } else { + ldlm_lock_decref_internal(lock, mode); + } +} + +static bool ldlm_request_slot_needed(struct ldlm_enqueue_info *einfo) +{ + /* exclude EXTENT locks and DOM-only IBITS locks because they + * are asynchronous and don't wait on server being blocked. + */ + return einfo->ei_req_slot && + (einfo->ei_type == LDLM_FLOCK || + (einfo->ei_type == LDLM_IBITS && + einfo->ei_inodebits != MDS_INODELOCK_DOM)); +} + +/** + * Finishing portion of client lock enqueue code. + * + * Called after receiving reply from server. + */ +int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req, + struct ldlm_enqueue_info *einfo, + __u8 with_policy, __u64 *ldlm_flags, void *lvb, + __u32 lvb_len, const struct lustre_handle *lockh, + int rc, bool request_slot) +{ + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; + const struct lu_env *env = NULL; + int is_replay = *ldlm_flags & LDLM_FL_REPLAY; + struct ldlm_lock *lock; + struct ldlm_reply *reply; + int cleanup_phase = 1; + + ENTRY; + + if (request_slot) + obd_put_request_slot(&req->rq_import->imp_obd->u.cli); + + ptlrpc_put_mod_rpc_slot(req); + + if (req && req->rq_svc_thread) + env = req->rq_svc_thread->t_env; + + lock = ldlm_handle2lock(lockh); + /* ldlm_cli_enqueue is holding a reference on this lock. */ + if (!lock) { + LASSERT(einfo->ei_type == LDLM_FLOCK); + RETURN(-ENOLCK); + } + + LASSERTF(ergo(lvb_len != 0, lvb_len == lock->l_lvb_len), + "lvb_len = %d, l_lvb_len = %d\n", lvb_len, lock->l_lvb_len); + + if (rc != ELDLM_OK) { + LASSERT(!is_replay); + LDLM_DEBUG(lock, "client-side enqueue END (%s)", + rc == ELDLM_LOCK_ABORTED ? "ABORTED" : "FAILED"); + + if (rc != ELDLM_LOCK_ABORTED) + GOTO(cleanup, rc); + } + + /* Before we return, swab the reply */ + reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + if (reply == NULL) + GOTO(cleanup, rc = -EPROTO); + + if (lvb_len > 0) { + int size = 0; + + size = req_capsule_get_size(&req->rq_pill, &RMF_DLM_LVB, + RCL_SERVER); + if (size < 0) { + LDLM_ERROR(lock, "Fail to get lvb_len, rc = %d", size); + GOTO(cleanup, rc = size); + } else if (unlikely(size > lvb_len)) { + LDLM_ERROR(lock, + "Replied LVB is larger than expectation, expected = %d, replied = %d", + lvb_len, size); + GOTO(cleanup, rc = -EINVAL); + } + lvb_len = size; + } + + if (rc == ELDLM_LOCK_ABORTED) { + if (lvb_len > 0 && lvb != NULL) + rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_SERVER, + lvb, lvb_len); + GOTO(cleanup, rc = rc ? 
: ELDLM_LOCK_ABORTED); + } + + /* lock enqueued on the server */ + cleanup_phase = 0; + + lock_res_and_lock(lock); + /* Key change rehash lock in per-export hash with new key */ + if (exp->exp_lock_hash) { + /* + * In the function below, .hs_keycmp resolves to + * ldlm_export_lock_keycmp() + */ + /* coverity[overrun-buffer-val] */ + cfs_hash_rehash_key(exp->exp_lock_hash, + &lock->l_remote_handle, + &reply->lock_handle, + &lock->l_exp_hash); + } else { + lock->l_remote_handle = reply->lock_handle; + } + + *ldlm_flags = ldlm_flags_from_wire(reply->lock_flags); + lock->l_flags |= ldlm_flags_from_wire(reply->lock_flags & + LDLM_FL_INHERIT_MASK); + unlock_res_and_lock(lock); + + CDEBUG(D_INFO, "local: %p, remote cookie: %#llx, flags: %#llx\n", + lock, reply->lock_handle.cookie, *ldlm_flags); + + /* + * If enqueue returned a blocked lock but the completion handler has + * already run, then it fixed up the resource and we don't need to do it + * again. + */ + if ((*ldlm_flags) & LDLM_FL_LOCK_CHANGED) { + int newmode = reply->lock_desc.l_req_mode; + + LASSERT(!is_replay); + if (newmode && newmode != lock->l_req_mode) { + LDLM_DEBUG(lock, "server returned different mode %s", + ldlm_lockname[newmode]); + lock->l_req_mode = newmode; + } + + if (!ldlm_res_eq(&reply->lock_desc.l_resource.lr_name, + &lock->l_resource->lr_name)) { + CDEBUG(D_INFO, + "remote intent success, locking "DLDLMRES", instead of "DLDLMRES"\n", + PLDLMRES(&reply->lock_desc.l_resource), + PLDLMRES(lock->l_resource)); + + rc = ldlm_lock_change_resource(ns, lock, + &reply->lock_desc.l_resource.lr_name); + if (rc || lock->l_resource == NULL) + GOTO(cleanup, rc = -ENOMEM); + LDLM_DEBUG(lock, "client-side enqueue, new resource"); + } + + if (with_policy) { + /* We assume lock type cannot change on server*/ + ldlm_convert_policy_to_local(exp, + lock->l_resource->lr_type, + &reply->lock_desc.l_policy_data, + &lock->l_policy_data); + } + + if (einfo->ei_type != LDLM_PLAIN) + LDLM_DEBUG(lock, + "client-side enqueue, new policy data"); + } + + if ((*ldlm_flags) & LDLM_FL_AST_SENT) { + lock_res_and_lock(lock); + ldlm_bl_desc2lock(&reply->lock_desc, lock); + lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_BL_AST; + unlock_res_and_lock(lock); + LDLM_DEBUG(lock, "enqueue reply includes blocking AST"); + } + + /* + * If the lock has already been granted by a completion AST, don't + * clobber the LVB with an older one. + */ + if (lvb_len > 0) { + /* + * We must lock or a racing completion might update lvb without + * letting us know and we'll clobber the correct value. 
+ * Cannot unlock after the check either, a that still leaves + * a tiny window for completion to get in + */ + lock_res_and_lock(lock); + if (!ldlm_is_granted(lock)) + rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_SERVER, + lock->l_lvb_data, lvb_len); + unlock_res_and_lock(lock); + if (rc < 0) { + cleanup_phase = 1; + GOTO(cleanup, rc); + } + } + + if (!is_replay) { + rc = ldlm_lock_enqueue(env, ns, &lock, NULL, ldlm_flags); + if (lock->l_completion_ast != NULL) { + int err = lock->l_completion_ast(lock, *ldlm_flags, + NULL); + + if (!rc) + rc = err; + if (rc) + cleanup_phase = 1; + } + } + + if (lvb_len > 0 && lvb != NULL) { + /* + * Copy the LVB here, and not earlier, because the completion + * AST (if any) can override what we got in the reply + */ + memcpy(lvb, lock->l_lvb_data, lvb_len); + } + + LDLM_DEBUG(lock, "client-side enqueue END"); + EXIT; +cleanup: + if (cleanup_phase == 1 && rc) + failed_lock_cleanup(ns, lock, einfo->ei_mode); + /* Put lock 2 times, the second reference is held by ldlm_cli_enqueue */ + LDLM_LOCK_PUT(lock); + LDLM_LOCK_RELEASE(lock); + return rc; +} +EXPORT_SYMBOL(ldlm_cli_enqueue_fini); + +/** + * Estimate number of lock handles that would fit into request of given + * size. PAGE_SIZE-512 is to allow TCP/IP and LNET headers to fit into + * a single page on the send/receive side. XXX: 512 should be changed to + * more adequate value. + */ +static inline int ldlm_req_handles_avail(int req_size, int off) +{ + int avail; + + avail = min_t(int, LDLM_MAXREQSIZE, PAGE_SIZE - 512) - req_size; + if (likely(avail >= 0)) + avail /= (int)sizeof(struct lustre_handle); + else + avail = 0; + avail += LDLM_LOCKREQ_HANDLES - off; + + return avail; +} + +static inline int ldlm_capsule_handles_avail(struct req_capsule *pill, + enum req_location loc, + int off) +{ + __u32 size = req_capsule_msg_size(pill, loc); + + return ldlm_req_handles_avail(size, off); +} + +static inline int ldlm_format_handles_avail(struct obd_import *imp, + const struct req_format *fmt, + enum req_location loc, int off) +{ + __u32 size = req_capsule_fmt_size(imp->imp_msg_magic, fmt, loc); + + return ldlm_req_handles_avail(size, off); +} + +/** + * Cancel LRU locks and pack them into the enqueue request. Pack there the given + * \a count locks in \a cancels. + * + * This is to be called by functions preparing their own requests that + * might contain lists of locks to cancel in addition to actual operation + * that needs to be performed. + */ +int ldlm_prep_elc_req(struct obd_export *exp, struct ptlrpc_request *req, + int version, int opc, int canceloff, + struct list_head *cancels, int count) +{ + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; + struct req_capsule *pill = &req->rq_pill; + struct ldlm_request *dlm = NULL; + LIST_HEAD(head); + int avail, to_free = 0, pack = 0; + int rc; + + ENTRY; + + if (cancels == NULL) + cancels = &head; + if (ns_connect_cancelset(ns)) { + /* Estimate the amount of available space in the request. */ + req_capsule_filled_sizes(pill, RCL_CLIENT); + avail = ldlm_capsule_handles_avail(pill, RCL_CLIENT, canceloff); + + /* If we have reached the limit, free +1 slot for the new one */ + if (!ns_connect_lru_resize(ns) && opc == LDLM_ENQUEUE && + ns->ns_nr_unused >= ns->ns_max_unused) + to_free = 1; + + /* + * Cancel LRU locks here _only_ if the server supports + * EARLY_CANCEL. Otherwise we have to send extra CANCEL + * RPC, which will make us slower. 
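+ *
+ * If the estimated space \a avail exceeds \a count, the difference is
+ * topped up with extra LRU locks from ldlm_cancel_lru_local(); only
+ * min(avail, count) handles are packed into this request, and the
+ * remainder is sent as separate cancel RPCs.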
+ */ + if (avail > count) + count += ldlm_cancel_lru_local(ns, cancels, to_free, + avail - count, 0, + LDLM_LRU_FLAG_NO_WAIT); + if (avail > count) + pack = count; + else + pack = avail; + req_capsule_set_size(pill, &RMF_DLM_REQ, RCL_CLIENT, + ldlm_request_bufsize(pack, opc)); + } + + rc = ptlrpc_request_pack(req, version, opc); + if (rc) { + ldlm_lock_list_put(cancels, l_bl_ast, count); + RETURN(rc); + } + + if (ns_connect_cancelset(ns)) { + if (canceloff) { + dlm = req_capsule_client_get(pill, &RMF_DLM_REQ); + LASSERT(dlm); + /* + * Skip first lock handler in ldlm_request_pack(), + * this method will increment @lock_count according + * to the lock handle amount actually written to + * the buffer. + */ + dlm->lock_count = canceloff; + } + /* Pack into the request @pack lock handles. */ + ldlm_cli_cancel_list(cancels, pack, req, 0); + /* Prepare and send separate cancel RPC for others. */ + ldlm_cli_cancel_list(cancels, count - pack, NULL, 0); + } else { + ldlm_lock_list_put(cancels, l_bl_ast, count); + } + RETURN(0); +} +EXPORT_SYMBOL(ldlm_prep_elc_req); + +int ldlm_prep_enqueue_req(struct obd_export *exp, struct ptlrpc_request *req, + struct list_head *cancels, int count) +{ + return ldlm_prep_elc_req(exp, req, LUSTRE_DLM_VERSION, LDLM_ENQUEUE, + LDLM_ENQUEUE_CANCEL_OFF, cancels, count); +} +EXPORT_SYMBOL(ldlm_prep_enqueue_req); + +struct ptlrpc_request *ldlm_enqueue_pack(struct obd_export *exp, int lvb_len) +{ + struct ptlrpc_request *req; + int rc; + + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LDLM_ENQUEUE); + if (req == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, lvb_len); + ptlrpc_request_set_replen(req); + RETURN(req); +} +EXPORT_SYMBOL(ldlm_enqueue_pack); + +/** + * Client-side lock enqueue. + * + * If a request has some specific initialisation it is passed in \a reqp, + * otherwise it is created in ldlm_cli_enqueue. + * + * Supports sync and async requests, pass \a async flag accordingly. If a + * request was created in ldlm_cli_enqueue and it is the async request, + * pass it to the caller in \a reqp. + */ +int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, + struct ldlm_enqueue_info *einfo, + const struct ldlm_res_id *res_id, + union ldlm_policy_data const *policy, __u64 *flags, + void *lvb, __u32 lvb_len, enum lvb_type lvb_type, + struct lustre_handle *lockh, int async) +{ + struct ldlm_namespace *ns; + struct ldlm_lock *lock; + struct ldlm_request *body; + int is_replay = *flags & LDLM_FL_REPLAY; + int req_passed_in = 1; + int rc, err; + bool need_req_slot; + struct ptlrpc_request *req; + + ENTRY; + + LASSERT(exp != NULL); + + ns = exp->exp_obd->obd_namespace; + + /* + * If we're replaying this lock, just check some invariants. + * If we're creating a new lock, get everything all setup nice. 
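+ * Either way this function ends up holding a lock reference; it is
+ * released by ldlm_cli_enqueue_fini() or by the failure paths below.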
+ */ + if (is_replay) { + lock = ldlm_handle2lock_long(lockh, 0); + LASSERT(lock != NULL); + LDLM_DEBUG(lock, "client-side enqueue START"); + LASSERT(exp == lock->l_conn_export); + } else { + const struct ldlm_callback_suite cbs = { + .lcs_completion = einfo->ei_cb_cp, + .lcs_blocking = einfo->ei_cb_bl, + .lcs_glimpse = einfo->ei_cb_gl + }; + lock = ldlm_lock_create(ns, res_id, einfo->ei_type, + einfo->ei_mode, &cbs, einfo->ei_cbdata, + lvb_len, lvb_type); + if (IS_ERR(lock)) + RETURN(PTR_ERR(lock)); + + if (einfo->ei_cb_created) + einfo->ei_cb_created(lock); + + /* for the local lock, add the reference */ + ldlm_lock_addref_internal(lock, einfo->ei_mode); + ldlm_lock2handle(lock, lockh); + if (policy != NULL) + lock->l_policy_data = *policy; + + if (einfo->ei_type == LDLM_EXTENT) { + /* extent lock without policy is a bug */ + if (policy == NULL) + LBUG(); + + lock->l_req_extent = policy->l_extent; + } + LDLM_DEBUG(lock, "client-side enqueue START, flags %#llx", + *flags); + } + + lock->l_conn_export = exp; + lock->l_export = NULL; + lock->l_blocking_ast = einfo->ei_cb_bl; + lock->l_flags |= (*flags & (LDLM_FL_NO_LRU | LDLM_FL_EXCL | + LDLM_FL_ATOMIC_CB)); + lock->l_activity = ktime_get_real_seconds(); + + /* lock not sent to server yet */ + if (reqp == NULL || *reqp == NULL) { + req = ldlm_enqueue_pack(exp, lvb_len); + if (IS_ERR(req)) { + failed_lock_cleanup(ns, lock, einfo->ei_mode); + LDLM_LOCK_RELEASE(lock); + RETURN(PTR_ERR(req)); + } + + req_passed_in = 0; + if (reqp) + *reqp = req; + } else { + int len; + + req = *reqp; + len = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, + RCL_CLIENT); + LASSERTF(len >= sizeof(*body), "buflen[%d] = %d, not %d\n", + DLM_LOCKREQ_OFF, len, (int)sizeof(*body)); + } + + if (*flags & LDLM_FL_NDELAY) { + DEBUG_REQ(D_DLMTRACE, req, "enqueue lock with no delay"); + req->rq_no_resend = req->rq_no_delay = 1; + /* + * probably set a shorter timeout value and handle ETIMEDOUT + * in osc_lock_upcall() correctly + */ + /* lustre_msg_set_timeout(req, req->rq_timeout / 2); */ + } + + /* Dump lock data into the request buffer */ + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + ldlm_lock2desc(lock, &body->lock_desc); + body->lock_flags = ldlm_flags_to_wire(*flags); + body->lock_handle[0] = *lockh; + + /* extended LDLM opcodes in client stats */ + if (exp->exp_obd->obd_svc_stats != NULL) { + /* glimpse is intent with no intent buffer */ + if (*flags & LDLM_FL_HAS_INTENT && + !req_capsule_has_field(&req->rq_pill, &RMF_LDLM_INTENT, + RCL_CLIENT)) + lprocfs_counter_incr(exp->exp_obd->obd_svc_stats, + PTLRPC_LAST_CNTR + + LDLM_GLIMPSE_ENQUEUE); + else + ldlm_svc_get_eopc(body, exp->exp_obd->obd_svc_stats); + } + + /* It is important to obtain modify RPC slot first (if applicable), so + * that threads that are waiting for a modify RPC slot are not polluting + * our rpcs in flight counter. */ + + if (einfo->ei_mod_slot) + ptlrpc_get_mod_rpc_slot(req); + + need_req_slot = ldlm_request_slot_needed(einfo); + + if (need_req_slot) { + rc = obd_get_request_slot(&req->rq_import->imp_obd->u.cli); + if (rc) { + if (einfo->ei_mod_slot) + ptlrpc_put_mod_rpc_slot(req); + failed_lock_cleanup(ns, lock, einfo->ei_mode); + LDLM_LOCK_RELEASE(lock); + if (!req_passed_in) + ptlrpc_req_finished(req); + GOTO(out, rc); + } + } + + if (async) { + LASSERT(reqp != NULL); + RETURN(0); + } + + LDLM_DEBUG(lock, "sending request"); + + rc = ptlrpc_queue_wait(req); + + err = ldlm_cli_enqueue_fini(exp, req, einfo, policy ? 
1 : 0, flags, + lvb, lvb_len, lockh, rc, need_req_slot); + + /* + * If ldlm_cli_enqueue_fini did not find the lock, we need to free + * one reference that we took + */ + if (err == -ENOLCK) + LDLM_LOCK_RELEASE(lock); + else + rc = err; + +out: + if (!req_passed_in && req != NULL) { + ptlrpc_req_finished(req); + if (reqp) + *reqp = NULL; + } + + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_cli_enqueue); + +/** + * Client-side IBITS lock convert. + * + * Inform server that lock has been converted instead of canceling. + * Server finishes convert on own side and does reprocess to grant + * all related waiting locks. + * + * Since convert means only ibits downgrading, client doesn't need to + * wait for server reply to finish local converting process so this request + * is made asynchronous. + * + */ +int ldlm_cli_convert_req(struct ldlm_lock *lock, __u32 *flags, __u64 new_bits) +{ + struct ldlm_request *body; + struct ptlrpc_request *req; + struct obd_export *exp = lock->l_conn_export; + + ENTRY; + + LASSERT(exp != NULL); + + /* + * this is better to check earlier and it is done so already, + * but this check is kept too as final one to issue an error + * if any new code will miss such check. + */ + if (!exp_connect_lock_convert(exp)) { + LDLM_ERROR(lock, "server doesn't support lock convert\n"); + RETURN(-EPROTO); + } + + if (lock->l_resource->lr_type != LDLM_IBITS) { + LDLM_ERROR(lock, "convert works with IBITS locks only."); + RETURN(-EINVAL); + } + + LDLM_DEBUG(lock, "client-side convert"); + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_LDLM_CONVERT, LUSTRE_DLM_VERSION, + LDLM_CONVERT); + if (req == NULL) + RETURN(-ENOMEM); + + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + body->lock_handle[0] = lock->l_remote_handle; + + body->lock_desc.l_req_mode = lock->l_req_mode; + body->lock_desc.l_granted_mode = lock->l_granted_mode; + + body->lock_desc.l_policy_data.l_inodebits.bits = new_bits; + body->lock_desc.l_policy_data.l_inodebits.cancel_bits = 0; + + body->lock_flags = ldlm_flags_to_wire(*flags); + body->lock_count = 1; + + ptlrpc_request_set_replen(req); + + /* + * Use cancel portals for convert as well as high-priority handling. + */ + req->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL; + req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL; + + ptlrpc_at_set_req_timeout(req); + + if (exp->exp_obd->obd_svc_stats != NULL) + lprocfs_counter_incr(exp->exp_obd->obd_svc_stats, + LDLM_CONVERT - LDLM_FIRST_OPC); + + ptlrpcd_add_req(req); + RETURN(0); +} + +/** + * Cancel locks locally. + * Returns: + * \retval LDLM_FL_LOCAL_ONLY if there is no need for a CANCEL RPC to the server + * \retval LDLM_FL_CANCELING otherwise; + * \retval LDLM_FL_BL_AST if there is a need for a separate CANCEL RPC. + */ +static __u64 ldlm_cli_cancel_local(struct ldlm_lock *lock) +{ + __u64 rc = LDLM_FL_LOCAL_ONLY; + + ENTRY; + + if (lock->l_conn_export) { + bool local_only; + + LDLM_DEBUG(lock, "client-side cancel"); + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL_LOCAL, + cfs_fail_val); + + /* Set this flag to prevent others from getting new references*/ + lock_res_and_lock(lock); + ldlm_set_cbpending(lock); + local_only = !!(lock->l_flags & + (LDLM_FL_LOCAL_ONLY|LDLM_FL_CANCEL_ON_BLOCK)); + ldlm_cancel_callback(lock); + rc = (ldlm_is_bl_ast(lock)) ? 
+		LDLM_FL_BL_AST : LDLM_FL_CANCELING;
+		unlock_res_and_lock(lock);
+
+		if (local_only) {
+			CDEBUG(D_DLMTRACE,
+			       "not sending request (at caller's instruction)\n");
+			rc = LDLM_FL_LOCAL_ONLY;
+		}
+		ldlm_lock_cancel(lock);
+	} else {
+		if (ns_is_client(ldlm_lock_to_ns(lock))) {
+			LDLM_ERROR(lock, "Trying to cancel local lock");
+			LBUG();
+		}
+		LDLM_DEBUG(lock, "server-side local cancel");
+		ldlm_lock_cancel(lock);
+		ldlm_reprocess_all(lock->l_resource,
+				   lock->l_policy_data.l_inodebits.bits);
+	}
+
+	RETURN(rc);
+}
+
+/**
+ * Pack \a count locks in \a head into the ldlm_request buffer of request
+ * \a req.
+ */
+static void ldlm_cancel_pack(struct ptlrpc_request *req,
+			     struct list_head *head, int count)
+{
+	struct ldlm_request *dlm;
+	struct ldlm_lock *lock;
+	int max, packed = 0;
+
+	ENTRY;
+
+	dlm = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
+	LASSERT(dlm != NULL);
+
+	/* Check the room in the request buffer. */
+	max = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT) -
+		sizeof(struct ldlm_request);
+	max /= sizeof(struct lustre_handle);
+	max += LDLM_LOCKREQ_HANDLES;
+	LASSERT(max >= dlm->lock_count + count);
+
+	/*
+	 * XXX: it would be better to pack lock handles grouped by resource,
+	 * so that the server-side cancel would call filter_lvbo_update()
+	 * less frequently.
+	 */
+	list_for_each_entry(lock, head, l_bl_ast) {
+		if (!count--)
+			break;
+		LASSERT(lock->l_conn_export);
+		/* Pack the lock handle to the given request buffer. */
+		LDLM_DEBUG(lock, "packing");
+		dlm->lock_handle[dlm->lock_count++] = lock->l_remote_handle;
+		packed++;
+	}
+	CDEBUG(D_DLMTRACE, "%d locks packed\n", packed);
+	EXIT;
+}
+
+/**
+ * Prepare and send a batched cancel RPC. It will include \a count lock
+ * handles of locks given in the \a cancels list.
+ */
+int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *cancels,
+			int count, enum ldlm_cancel_flags flags)
+{
+	struct ptlrpc_request *req = NULL;
+	struct obd_import *imp;
+	int free, sent = 0;
+	int rc = 0;
+
+	ENTRY;
+
+	LASSERT(exp != NULL);
+	LASSERT(count > 0);
+
+	CFS_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL, cfs_fail_val);
+
+	if (CFS_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_RACE))
+		RETURN(count);
+
+	free = ldlm_format_handles_avail(class_exp2cliimp(exp),
+					 &RQF_LDLM_CANCEL, RCL_CLIENT, 0);
+	if (count > free)
+		count = free;
+
+	while (1) {
+		imp = class_exp2cliimp(exp);
+		if (imp == NULL || imp->imp_invalid) {
+			CDEBUG(D_DLMTRACE,
+			       "skipping cancel on invalid import %p\n", imp);
+			RETURN(count);
+		}
+
+		req = ptlrpc_request_alloc(imp, &RQF_LDLM_CANCEL);
+		if (req == NULL)
+			GOTO(out, rc = -ENOMEM);
+
+		req_capsule_filled_sizes(&req->rq_pill, RCL_CLIENT);
+		req_capsule_set_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT,
+				     ldlm_request_bufsize(count, LDLM_CANCEL));
+
+		rc = ptlrpc_request_pack(req, LUSTRE_DLM_VERSION, LDLM_CANCEL);
+		if (rc) {
+			ptlrpc_request_free(req);
+			GOTO(out, rc);
+		}
+
+		/*
+		 * If OSP wants to cancel a cross-MDT lock, do not block it
+		 * during recovery; otherwise the lock would never be
+		 * released. If the remote target is also in recovery and
+		 * needs this lock as well, a deadlock could result.
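+		 * In that case rq_allow_replay is set below, so the CANCEL
+		 * request may still be sent while the import is recovering.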
+ */ + if (exp_connect_flags(exp) & OBD_CONNECT_MDS_MDS && + exp->exp_obd->obd_lu_dev != NULL && + exp->exp_obd->obd_lu_dev->ld_site != NULL) { + struct lu_device *top_dev; + + top_dev = exp->exp_obd->obd_lu_dev->ld_site->ls_top_dev; + if (top_dev != NULL && + top_dev->ld_obd->obd_recovering) + req->rq_allow_replay = 1; + } + + req->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL; + req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL; + ptlrpc_at_set_req_timeout(req); + + ldlm_cancel_pack(req, cancels, count); + + ptlrpc_request_set_replen(req); + if (flags & LCF_ASYNC) { + ptlrpcd_add_req(req); + sent = count; + GOTO(out, 0); + } + + rc = ptlrpc_queue_wait(req); + if (rc == LUSTRE_ESTALE) { + CDEBUG(D_DLMTRACE, + "client/server (nid %s) out of sync -- not fatal\n", + libcfs_nidstr(&req->rq_import->imp_connection->c_peer.nid)); + rc = 0; + } else if (rc == -ETIMEDOUT && /* check there was no reconnect*/ + req->rq_import_generation == imp->imp_generation) { + ptlrpc_req_finished(req); + continue; + } else if (rc != ELDLM_OK) { + /* -ESHUTDOWN is common on umount */ + CDEBUG_LIMIT(rc == -ESHUTDOWN ? D_DLMTRACE : D_ERROR, + "Got rc %d from cancel RPC: canceling anyway\n", + rc); + break; + } + sent = count; + break; + } + + ptlrpc_req_finished(req); + EXIT; +out: + return sent ? sent : rc; +} + +static inline struct ldlm_pool *ldlm_imp2pl(struct obd_import *imp) +{ + LASSERT(imp != NULL); + return &imp->imp_obd->obd_namespace->ns_pool; +} + +/** + * Update client's OBD pool related fields with new SLV and Limit from \a req. + */ +int ldlm_cli_update_pool(struct ptlrpc_request *req) +{ + struct ldlm_namespace *ns; + struct obd_device *obd; + __u64 new_slv, ratio; + __u32 new_limit; + + ENTRY; + if (unlikely(!req->rq_import || !req->rq_import->imp_obd || + !imp_connect_lru_resize(req->rq_import))) + /* Do nothing for corner cases. */ + RETURN(0); + + /* + * In some cases RPC may contain SLV and limit zeroed out. This + * is the case when server does not support LRU resize feature. + * This is also possible in some recovery cases when server-side + * reqs have no reference to the OBD export and thus access to + * server-side namespace is not possible. + */ + if (lustre_msg_get_slv(req->rq_repmsg) == 0 || + lustre_msg_get_limit(req->rq_repmsg) == 0) { + DEBUG_REQ(D_HA, req, + "Zero SLV or limit found (SLV=%llu, limit=%u)", + lustre_msg_get_slv(req->rq_repmsg), + lustre_msg_get_limit(req->rq_repmsg)); + RETURN(0); + } + + new_limit = lustre_msg_get_limit(req->rq_repmsg); + new_slv = lustre_msg_get_slv(req->rq_repmsg); + obd = req->rq_import->imp_obd; + + read_lock(&obd->obd_pool_lock); + if (obd->obd_pool_slv == new_slv && + obd->obd_pool_limit == new_limit) { + read_unlock(&obd->obd_pool_lock); + RETURN(0); + } + read_unlock(&obd->obd_pool_lock); + + /* + * OBD device keeps the new pool attributes before they are handled by + * the pool. + */ + write_lock(&obd->obd_pool_lock); + obd->obd_pool_slv = new_slv; + obd->obd_pool_limit = new_limit; + write_unlock(&obd->obd_pool_lock); + + /* + * Check if an urgent pool recalc is needed, let it to be a change of + * SLV on 10%. It is applicable to LRU resize enabled case only. 
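+	 *
+	 * Worked example (numbers are illustrative only): with a current
+	 * pool SLV of 1000 and new_slv of 850, the ratio computed below is
+	 * 100 * 850 / 1000 = 85, i.e. a 15% drop; with ns_recalc_pct = 10
+	 * this counts as urgent and a recalc is triggered.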
+ */ + ns = obd->obd_namespace; + if (!ns_connect_lru_resize(ns) || + ldlm_pool_get_slv(&ns->ns_pool) < new_slv) + RETURN(0); + + ratio = 100 * new_slv / ldlm_pool_get_slv(&ns->ns_pool); + if (100 - ratio >= ns->ns_recalc_pct && + !ns->ns_stopping && !ns->ns_rpc_recalc) { + bool recalc = false; + + spin_lock(&ns->ns_lock); + if (!ns->ns_stopping && !ns->ns_rpc_recalc) { + ldlm_namespace_get(ns); + recalc = true; + ns->ns_rpc_recalc = 1; + } + spin_unlock(&ns->ns_lock); + if (recalc) + ldlm_bl_to_thread_ns(ns); + } + + RETURN(0); +} + +int ldlm_cli_convert(struct ldlm_lock *lock, + enum ldlm_cancel_flags cancel_flags) +{ + int rc = -EINVAL; + + LASSERT(!lock->l_readers && !lock->l_writers); + LDLM_DEBUG(lock, "client lock convert START"); + + if (lock->l_resource->lr_type == LDLM_IBITS) { + lock_res_and_lock(lock); + do { + rc = ldlm_cli_inodebits_convert(lock, cancel_flags); + } while (rc == -EAGAIN); + unlock_res_and_lock(lock); + } + + LDLM_DEBUG(lock, "client lock convert END"); + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_cli_convert); + +/** + * Client side lock cancel. + * + * Lock must not have any readers or writers by this time. + */ +int ldlm_cli_cancel(const struct lustre_handle *lockh, + enum ldlm_cancel_flags cancel_flags) +{ + struct obd_export *exp; + int avail, count = 1; + __u64 rc = 0; + struct ldlm_namespace *ns; + struct ldlm_lock *lock; + LIST_HEAD(cancels); + + ENTRY; + + lock = ldlm_handle2lock_long(lockh, 0); + if (lock == NULL) { + LDLM_DEBUG_NOLOCK("lock is already being destroyed"); + RETURN(0); + } + + lock_res_and_lock(lock); + LASSERT(!ldlm_is_converting(lock)); + + /* Lock is being canceled and the caller doesn't want to wait */ + if (ldlm_is_canceling(lock)) { + if (cancel_flags & LCF_ASYNC) { + unlock_res_and_lock(lock); + } else { + unlock_res_and_lock(lock); + wait_event_idle(lock->l_waitq, is_bl_done(lock)); + } + LDLM_LOCK_RELEASE(lock); + RETURN(0); + } + + ldlm_set_canceling(lock); + unlock_res_and_lock(lock); + + if (cancel_flags & LCF_LOCAL) + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_LOCAL_CANCEL_PAUSE, + cfs_fail_val); + + rc = ldlm_cli_cancel_local(lock); + if (rc == LDLM_FL_LOCAL_ONLY || cancel_flags & LCF_LOCAL) { + LDLM_LOCK_RELEASE(lock); + RETURN(0); + } + /* + * Even if the lock is marked as LDLM_FL_BL_AST, this is a LDLM_CANCEL + * RPC which goes to canceld portal, so we can cancel other LRU locks + * here and send them all as one LDLM_CANCEL RPC. + */ + LASSERT(list_empty(&lock->l_bl_ast)); + list_add(&lock->l_bl_ast, &cancels); + + exp = lock->l_conn_export; + if (exp_connect_cancelset(exp)) { + avail = ldlm_format_handles_avail(class_exp2cliimp(exp), + &RQF_LDLM_CANCEL, + RCL_CLIENT, 0); + LASSERT(avail > 0); + + ns = ldlm_lock_to_ns(lock); + count += ldlm_cancel_lru_local(ns, &cancels, 0, avail - 1, + LCF_BL_AST, 0); + } + ldlm_cli_cancel_list(&cancels, count, NULL, cancel_flags); + RETURN(0); +} +EXPORT_SYMBOL(ldlm_cli_cancel); + +/** + * Locally cancel up to \a count locks in list \a cancels. + * Return the number of cancelled locks. 
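+ * Unless LCF_BL_AST is passed, locks which turn out to need a CANCEL
+ * RPC of their own (LDLM_FL_BL_AST) are split onto a private list and
+ * handed to ldlm_cli_cancel_list() before returning; such locks are
+ * not included in the returned count.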
+ */
+int ldlm_cli_cancel_list_local(struct list_head *cancels, int count,
+			       enum ldlm_cancel_flags cancel_flags)
+{
+	LIST_HEAD(head);
+	struct ldlm_lock *lock, *next;
+	int left = 0, bl_ast = 0;
+	__u64 rc;
+
+	left = count;
+	list_for_each_entry_safe(lock, next, cancels, l_bl_ast) {
+		if (left-- == 0)
+			break;
+
+		if (cancel_flags & LCF_LOCAL) {
+			rc = LDLM_FL_LOCAL_ONLY;
+			ldlm_lock_cancel(lock);
+		} else {
+			rc = ldlm_cli_cancel_local(lock);
+		}
+		/*
+		 * Until we have compound requests and can send LDLM_CANCEL
+		 * requests batched with generic RPCs, we need to send cancels
+		 * with the LDLM_FL_BL_AST flag in a separate RPC from
+		 * the one being generated now.
+		 */
+		if (!(cancel_flags & LCF_BL_AST) && (rc == LDLM_FL_BL_AST)) {
+			LDLM_DEBUG(lock, "Cancel lock separately");
+			list_move(&lock->l_bl_ast, &head);
+			bl_ast++;
+			continue;
+		}
+		if (rc == LDLM_FL_LOCAL_ONLY) {
+			/* CANCEL RPC should not be sent to server. */
+			list_del_init(&lock->l_bl_ast);
+			LDLM_LOCK_RELEASE(lock);
+			count--;
+		}
+	}
+	if (bl_ast > 0) {
+		count -= bl_ast;
+		ldlm_cli_cancel_list(&head, bl_ast, NULL, 0);
+	}
+
+	RETURN(count);
+}
+
+/**
+ * Cancel as many locks as possible without sending any RPCs (e.g. to write
+ * back dirty data, to close a file, ...) and without waiting for any RPCs
+ * in flight (e.g. readahead requests, ...).
+ */
+static enum ldlm_policy_res
+ldlm_cancel_no_wait_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock,
+			   int added, int min)
+{
+	enum ldlm_policy_res result = LDLM_POLICY_CANCEL_LOCK;
+
+	/*
+	 * Don't check @added & @min since we want to process all locks
+	 * from the unused list.
+	 * It's fine not to take a lock to access lock->l_resource since
+	 * the lock has already been granted, so it won't change.
+	 */
+	switch (lock->l_resource->lr_type) {
+	case LDLM_EXTENT:
+	case LDLM_IBITS:
+		if (ns->ns_cancel != NULL && ns->ns_cancel(lock) != 0)
+			break;
+		fallthrough;
+	default:
+		result = LDLM_POLICY_SKIP_LOCK;
+		break;
+	}
+
+	RETURN(result);
+}
+
+/**
+ * Callback function for LRU-resize policy. Decides whether to keep
+ * \a lock in LRU for \a added in current scan and \a min number of locks
+ * to be preferably canceled.
+ *
+ * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU and stop scanning
+ *
+ * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU
+ */
+static enum ldlm_policy_res ldlm_cancel_lrur_policy(struct ldlm_namespace *ns,
+						    struct ldlm_lock *lock,
+						    int added, int min)
+{
+	ktime_t cur = ktime_get();
+	struct ldlm_pool *pl = &ns->ns_pool;
+	u64 slv, lvf, lv;
+	s64 la;
+
+	if (added < min)
+		return LDLM_POLICY_CANCEL_LOCK;
+
+	/*
+	 * Regardless of the LV, it makes no sense to keep a lock which
+	 * has been unused for ns_max_age.
+	 */
+	if (ktime_after(cur, ktime_add(lock->l_last_used, ns->ns_max_age)))
+		return LDLM_POLICY_CANCEL_LOCK;
+
+	slv = ldlm_pool_get_slv(pl);
+	lvf = ldlm_pool_get_lvf(pl);
+	la = div_u64(ktime_to_ns(ktime_sub(cur, lock->l_last_used)),
+		     NSEC_PER_SEC);
+	lv = lvf * la * ns->ns_nr_unused >> 8;
+
+	/* Inform pool about current CLV to see it via debugfs. */
+	ldlm_pool_set_clv(pl, lv);
+
+	/*
+	 * Stop if the SLV has not yet been received from the server, or if
+	 * lv is smaller than it.
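+	 *
+	 * Worked example (illustrative values): with lvf = 100, a lock
+	 * idle for la = 3600 seconds and ns_nr_unused = 1000 unused locks,
+	 * lv = (100 * 3600 * 1000) >> 8 = 1406250; the lock gets canceled
+	 * once a server-provided SLV is at or below that value.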
+ */ + if (slv == 0 || lv < slv) + return LDLM_POLICY_KEEP_LOCK; + + return LDLM_POLICY_CANCEL_LOCK; +} + +static enum ldlm_policy_res +ldlm_cancel_lrur_no_wait_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int added, int min) +{ + enum ldlm_policy_res result; + + result = ldlm_cancel_lrur_policy(ns, lock, added, min); + if (result == LDLM_POLICY_KEEP_LOCK) + return result; + + return ldlm_cancel_no_wait_policy(ns, lock, added, min); +} + +/** + * Callback function for aged policy. Decides whether to keep + * \a lock in LRU for \a added in current scan and \a min number of locks + * to be preferably canceled. + * + * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning + * + * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU + */ +static enum ldlm_policy_res ldlm_cancel_aged_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int added, int min) +{ + if ((added >= min) && + ktime_before(ktime_get(), + ktime_add(lock->l_last_used, ns->ns_max_age))) + return LDLM_POLICY_KEEP_LOCK; + + return LDLM_POLICY_CANCEL_LOCK; +} + +static enum ldlm_policy_res +ldlm_cancel_aged_no_wait_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int added, int min) +{ + enum ldlm_policy_res result; + + result = ldlm_cancel_aged_policy(ns, lock, added, min); + if (result == LDLM_POLICY_KEEP_LOCK) + return result; + + return ldlm_cancel_no_wait_policy(ns, lock, added, min); +} + +typedef enum ldlm_policy_res +(*ldlm_cancel_lru_policy_t)(struct ldlm_namespace *ns, struct ldlm_lock *lock, + int added, int min); + +static ldlm_cancel_lru_policy_t +ldlm_cancel_lru_policy(struct ldlm_namespace *ns, enum ldlm_lru_flags lru_flags) +{ + if (ns_connect_lru_resize(ns)) { + if (lru_flags & LDLM_LRU_FLAG_NO_WAIT) + return ldlm_cancel_lrur_no_wait_policy; + else + return ldlm_cancel_lrur_policy; + } else { + if (lru_flags & LDLM_LRU_FLAG_NO_WAIT) + return ldlm_cancel_aged_no_wait_policy; + else + return ldlm_cancel_aged_policy; + } +} + +/** + * - Free space in LRU for \a min new locks, + * redundant unused locks are canceled locally; + * - also cancel locally unused aged locks; + * - do not cancel more than \a max locks; + * - if some locks are cancelled, try to cancel at least \a batch locks + * - GET the found locks and add them into the \a cancels list. + * + * A client lock can be added to the l_bl_ast list only when it is + * marked LDLM_FL_CANCELING. Otherwise, somebody is already doing + * CANCEL. There are the following use cases: + * ldlm_cancel_resource_local(), ldlm_cancel_lru_local() and + * ldlm_cli_cancel(), which check and set this flag properly. As any + * attempt to cancel a lock rely on this flag, l_bl_ast list is accessed + * later without any special locking. + * + * Locks are cancelled according to the LRU resize policy (SLV from server) + * if LRU resize is enabled; otherwise, the "aged policy" is used; + * + * LRU flags: + * ---------------------------------------- + * + * flags & LDLM_LRU_FLAG_NO_WAIT - cancel locks w/o sending any RPCs or waiting + * for any outstanding RPC to complete. + * + * flags & LDLM_CANCEL_CLEANUP - when cancelling read locks, do not check for + * other read locks covering the same pages, just + * discard those pages. + */ +static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, + struct list_head *cancels, + int min, int max, int batch, + enum ldlm_lru_flags lru_flags) +{ + ldlm_cancel_lru_policy_t pf; + int added = 0; + int no_wait = lru_flags & LDLM_LRU_FLAG_NO_WAIT; + ENTRY; + + /* + * Let only 1 thread to proceed. 
However, not for those which have the + * @max limit given (ELC), as LRU may be left not cleaned up in full. + */ + if (max == 0) { + if (test_and_set_bit(LDLM_LRU_CANCEL, &ns->ns_flags)) + RETURN(0); + } else if (test_bit(LDLM_LRU_CANCEL, &ns->ns_flags)) + RETURN(0); + + LASSERT(ergo(max, min <= max)); + /* No sense to give @batch for ELC */ + LASSERT(ergo(max, batch == 0)); + + if (!ns_connect_lru_resize(ns)) + min = max_t(int, min, ns->ns_nr_unused - ns->ns_max_unused); + + /* If at least 1 lock is to be cancelled, cancel at least @batch locks */ + if (min && min < batch) + min = batch; + + pf = ldlm_cancel_lru_policy(ns, lru_flags); + LASSERT(pf != NULL); + + /* For any flags, stop scanning if @max is reached. */ + while (!list_empty(&ns->ns_unused_list) && (max == 0 || added < max)) { + struct ldlm_lock *lock; + struct list_head *item, *next; + enum ldlm_policy_res result; + ktime_t last_use = ktime_set(0, 0); + + spin_lock(&ns->ns_lock); + item = no_wait ? ns->ns_last_pos : &ns->ns_unused_list; + for (item = item->next, next = item->next; + item != &ns->ns_unused_list; + item = next, next = item->next) { + lock = list_entry(item, struct ldlm_lock, l_lru); + + /* No locks which got blocking requests. */ + LASSERT(!ldlm_is_bl_ast(lock)); + + if (!ldlm_is_canceling(lock)) + break; + + /* + * Somebody is already doing CANCEL. No need for this + * lock in LRU, do not traverse it again. + */ + ldlm_lock_remove_from_lru_nolock(lock); + } + if (item == &ns->ns_unused_list) { + spin_unlock(&ns->ns_lock); + break; + } + + last_use = lock->l_last_used; + + LDLM_LOCK_GET(lock); + spin_unlock(&ns->ns_lock); + lu_ref_add(&lock->l_reference, __FUNCTION__, current); + + /* + * Pass the lock through the policy filter and see if it + * should stay in LRU. + * + * Even for shrinker policy we stop scanning if + * we find a lock that should stay in the cache. + * We should take into account lock age anyway + * as a new lock is a valuable resource even if + * it has a low weight. + * + * That is, for shrinker policy we drop only + * old locks, but additionally choose them by + * their weight. Big extent locks will stay in + * the cache. + */ + result = pf(ns, lock, added, min); + if (result == LDLM_POLICY_KEEP_LOCK) { + lu_ref_del(&lock->l_reference, __func__, current); + LDLM_LOCK_RELEASE(lock); + break; + } + + if (result == LDLM_POLICY_SKIP_LOCK) { + lu_ref_del(&lock->l_reference, __func__, current); + if (no_wait) { + spin_lock(&ns->ns_lock); + if (!list_empty(&lock->l_lru) && + lock->l_lru.prev == ns->ns_last_pos) + ns->ns_last_pos = &lock->l_lru; + spin_unlock(&ns->ns_lock); + } + + LDLM_LOCK_RELEASE(lock); + continue; + } + + lock_res_and_lock(lock); + /* Check flags again under the lock. */ + if (ldlm_is_canceling(lock) || + ldlm_lock_remove_from_lru_check(lock, last_use) == 0) { + /* + * Another thread is removing lock from LRU, or + * somebody is already doing CANCEL, or there + * is a blocking request which will send cancel + * by itself, or the lock is no longer unused or + * the lock has been used since the pf() call and + * pages could be put under it. + */ + unlock_res_and_lock(lock); + lu_ref_del(&lock->l_reference, __FUNCTION__, current); + LDLM_LOCK_RELEASE(lock); + continue; + } + LASSERT(!lock->l_readers && !lock->l_writers); + + /* + * If we have chosen to cancel this lock voluntarily, we + * better send cancel notification to server, so that it + * frees appropriate state. 
This might lead to a race + * where while we are doing cancel here, server is also + * silently cancelling this lock. + */ + ldlm_clear_cancel_on_block(lock); + + /* + * Setting the CBPENDING flag is a little misleading, + * but prevents an important race; namely, once + * CBPENDING is set, the lock can accumulate no more + * readers/writers. Since readers and writers are + * already zero here, ldlm_lock_decref() won't see + * this flag and call l_blocking_ast + */ + lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING; + + if ((lru_flags & LDLM_LRU_FLAG_CLEANUP) && + (lock->l_resource->lr_type == LDLM_EXTENT || + ldlm_has_dom(lock)) && lock->l_granted_mode == LCK_PR) + ldlm_set_discard_data(lock); + + /* + * We can't re-add to l_lru as it confuses the + * refcounting in ldlm_lock_remove_from_lru() if an AST + * arrives after we drop lr_lock below. We use l_bl_ast + * and can't use l_pending_chain as it is used both on + * server and client nevertheless b=5666 says it is + * used only on server + */ + LASSERT(list_empty(&lock->l_bl_ast)); + list_add(&lock->l_bl_ast, cancels); + unlock_res_and_lock(lock); + lu_ref_del(&lock->l_reference, __FUNCTION__, current); + added++; + /* Once a lock added, batch the requested amount */ + if (min == 0) + min = batch; + } + + if (max == 0) + clear_bit(LDLM_LRU_CANCEL, &ns->ns_flags); + + RETURN(added); +} + +int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels, + int min, int max, + enum ldlm_cancel_flags cancel_flags, + enum ldlm_lru_flags lru_flags) +{ + int added; + + added = ldlm_prepare_lru_list(ns, cancels, min, max, 0, lru_flags); + if (added <= 0) + return added; + + return ldlm_cli_cancel_list_local(cancels, added, cancel_flags); +} + +/** + * Cancel at least \a min locks from given namespace LRU. + * + * When called with LCF_ASYNC the blocking callback will be handled + * in a thread and this function will return after the thread has been + * asked to call the callback. When called with LCF_ASYNC the blocking + * callback will be performed in this function. + */ +int ldlm_cancel_lru(struct ldlm_namespace *ns, int min, + enum ldlm_cancel_flags cancel_flags, + enum ldlm_lru_flags lru_flags) +{ + LIST_HEAD(cancels); + int count, rc; + + ENTRY; + + /* + * Just prepare the list of locks, do not actually cancel them yet. + * Locks are cancelled later in a separate thread. + */ + count = ldlm_prepare_lru_list(ns, &cancels, min, 0, + ns->ns_cancel_batch, lru_flags); + rc = ldlm_bl_to_thread_list(ns, NULL, &cancels, count, cancel_flags); + if (rc == 0) + RETURN(count); + + RETURN(0); +} + +/** + * Find and cancel locally unused locks found on resource, matched to the + * given policy, mode. GET the found locks and add them into the \a cancels + * list. + */ +int ldlm_cancel_resource_local(struct ldlm_resource *res, + struct list_head *cancels, + union ldlm_policy_data *policy, + enum ldlm_mode mode, __u64 lock_flags, + enum ldlm_cancel_flags cancel_flags, + void *opaque) +{ + struct ldlm_lock *lock; + int count = 0; + + ENTRY; + + lock_res(res); + list_for_each_entry(lock, &res->lr_granted, l_res_link) { + if (opaque != NULL && lock->l_ast_data != opaque) { + LDLM_ERROR(lock, "data %p doesn't match opaque %p", + lock->l_ast_data, opaque); + continue; + } + + if (lock->l_readers || lock->l_writers) + continue; + + /* + * If somebody is already doing CANCEL, or blocking AST came + * then skip this lock. 
+ */ + if (ldlm_is_bl_ast(lock) || ldlm_is_canceling(lock)) + continue; + + if (lockmode_compat(lock->l_granted_mode, mode)) + continue; + + /* + * If policy is given and this is IBITS lock, add to list only + * those locks that match by policy. + */ + if (policy && (lock->l_resource->lr_type == LDLM_IBITS)) { + if (!(lock->l_policy_data.l_inodebits.bits & + policy->l_inodebits.bits)) + continue; + /* Skip locks with DoM bit if it is not set in policy + * to don't flush data by side-bits. Lock convert will + * drop those bits separately. + */ + if (ldlm_has_dom(lock) && + !(policy->l_inodebits.bits & MDS_INODELOCK_DOM)) + continue; + } + + /* See CBPENDING comment in ldlm_cancel_lru */ + lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING | + lock_flags; + LASSERT(list_empty(&lock->l_bl_ast)); + list_add(&lock->l_bl_ast, cancels); + LDLM_LOCK_GET(lock); + count++; + } + unlock_res(res); + + RETURN(ldlm_cli_cancel_list_local(cancels, count, cancel_flags)); +} +EXPORT_SYMBOL(ldlm_cancel_resource_local); + +/** + * Cancel client-side locks from a list and send/prepare cancel RPCs to the + * server. + * If \a req is NULL, send CANCEL request to server with handles of locks + * in the \a cancels. If EARLY_CANCEL is not supported, send CANCEL requests + * separately per lock. + * If \a req is not NULL, put handles of locks in \a cancels into the request + * buffer at the offset \a off. + * Destroy \a cancels at the end. + */ +int ldlm_cli_cancel_list(struct list_head *cancels, int count, + struct ptlrpc_request *req, + enum ldlm_cancel_flags flags) +{ + struct ldlm_lock *lock; + int res = 0; + + ENTRY; + + if (list_empty(cancels) || count == 0) + RETURN(0); + + /* + * XXX: requests (both batched and not) could be sent in parallel. + * Usually it is enough to have just 1 RPC, but it is possible that + * there are too many locks to be cancelled in LRU or on a resource. + * It would also speed up the case when the server does not support + * the feature. + */ + while (count > 0) { + LASSERT(!list_empty(cancels)); + lock = list_entry(cancels->next, struct ldlm_lock, + l_bl_ast); + LASSERT(lock->l_conn_export); + + if (exp_connect_cancelset(lock->l_conn_export)) { + res = count; + if (req) + ldlm_cancel_pack(req, cancels, count); + else + res = ldlm_cli_cancel_req(lock->l_conn_export, + cancels, count, + flags); + } else { + res = ldlm_cli_cancel_req(lock->l_conn_export, + cancels, 1, flags); + } + + if (res < 0) { + CDEBUG_LIMIT(res == -ESHUTDOWN ? D_DLMTRACE : D_ERROR, + "ldlm_cli_cancel_list: %d\n", res); + res = count; + } + + count -= res; + ldlm_lock_list_put(cancels, l_bl_ast, res); + } + LASSERT(count == 0); + RETURN(0); +} +EXPORT_SYMBOL(ldlm_cli_cancel_list); + +/** + * Cancel all locks on a resource that have 0 readers/writers. + * + * If flags & LDLM_FL_LOCAL_ONLY, throw the locks away without trying + * to notify the server. + */ +int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + union ldlm_policy_data *policy, + enum ldlm_mode mode, + enum ldlm_cancel_flags flags, void *opaque) +{ + struct ldlm_resource *res; + LIST_HEAD(cancels); + int count; + int rc; + + ENTRY; + + res = ldlm_resource_get(ns, NULL, res_id, 0, 0); + if (IS_ERR(res)) { + /* This is not a problem. 
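+		 * The resource simply no longer exists, so there is nothing
+		 * left to cancel and 0 is returned below.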
*/ + CDEBUG(D_INFO, "No resource %llu\n", res_id->name[0]); + RETURN(0); + } + + LDLM_RESOURCE_ADDREF(res); + count = ldlm_cancel_resource_local(res, &cancels, policy, mode, + 0, flags | LCF_BL_AST, opaque); + rc = ldlm_cli_cancel_list(&cancels, count, NULL, flags); + if (rc != ELDLM_OK) + CERROR("canceling unused lock "DLDLMRES": rc = %d\n", + PLDLMRES(res), rc); + + LDLM_RESOURCE_DELREF(res); + ldlm_resource_putref(res); + RETURN(0); +} +EXPORT_SYMBOL(ldlm_cli_cancel_unused_resource); + +struct ldlm_cli_cancel_arg { + int lc_flags; + void *lc_opaque; +}; + +static int +ldlm_cli_hash_cancel_unused(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + struct ldlm_cli_cancel_arg *lc = arg; + + ldlm_cli_cancel_unused_resource(ldlm_res_to_ns(res), &res->lr_name, + NULL, LCK_MINMODE, lc->lc_flags, + lc->lc_opaque); + /* must return 0 for hash iteration */ + return 0; +} + +/** + * Cancel all locks on a namespace (or a specific resource, if given) + * that have 0 readers/writers. + * + * If flags & LCF_LOCAL, throw the locks away without trying + * to notify the server. + */ +int ldlm_cli_cancel_unused(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + enum ldlm_cancel_flags flags, void *opaque) +{ + struct ldlm_cli_cancel_arg arg = { + .lc_flags = flags, + .lc_opaque = opaque, + }; + + ENTRY; + + if (ns == NULL) + RETURN(ELDLM_OK); + + if (res_id != NULL) { + RETURN(ldlm_cli_cancel_unused_resource(ns, res_id, NULL, + LCK_MINMODE, flags, + opaque)); + } else { + cfs_hash_for_each_nolock(ns->ns_rs_hash, + ldlm_cli_hash_cancel_unused, &arg, 0); + RETURN(ELDLM_OK); + } +} + +/* Lock iterators. */ + +int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter, + void *closure) +{ + struct list_head *tmp, *next; + struct ldlm_lock *lock; + int rc = LDLM_ITER_CONTINUE; + + ENTRY; + + if (!res) + RETURN(LDLM_ITER_CONTINUE); + + lock_res(res); + list_for_each_safe(tmp, next, &res->lr_granted) { + lock = list_entry(tmp, struct ldlm_lock, l_res_link); + + if (iter(lock, closure) == LDLM_ITER_STOP) + GOTO(out, rc = LDLM_ITER_STOP); + } + + list_for_each_safe(tmp, next, &res->lr_waiting) { + lock = list_entry(tmp, struct ldlm_lock, l_res_link); + + if (iter(lock, closure) == LDLM_ITER_STOP) + GOTO(out, rc = LDLM_ITER_STOP); + } +out: + unlock_res(res); + RETURN(rc); +} + +struct iter_helper_data { + ldlm_iterator_t iter; + void *closure; +}; + +static int ldlm_iter_helper(struct ldlm_lock *lock, void *closure) +{ + struct iter_helper_data *helper = closure; + + return helper->iter(lock, helper->closure); +} + +static int ldlm_res_iter_helper(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) + +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + + return ldlm_resource_foreach(res, ldlm_iter_helper, arg) == + LDLM_ITER_STOP; +} + +void ldlm_namespace_foreach(struct ldlm_namespace *ns, + ldlm_iterator_t iter, void *closure) + +{ + struct iter_helper_data helper = { .iter = iter, .closure = closure }; + + cfs_hash_for_each_nolock(ns->ns_rs_hash, + ldlm_res_iter_helper, &helper, 0); + +} + +/* + * non-blocking function to manipulate a lock whose cb_data is being put away. + * return 0: find no resource + * > 0: must be LDLM_ITER_STOP/LDLM_ITER_CONTINUE. 
+ * < 0: errors
+ */
+int ldlm_resource_iterate(struct ldlm_namespace *ns,
+			  const struct ldlm_res_id *res_id,
+			  ldlm_iterator_t iter, void *data)
+{
+	struct ldlm_resource *res;
+	int rc;
+
+	ENTRY;
+
+	LASSERTF(ns != NULL, "must pass in namespace\n");
+
+	res = ldlm_resource_get(ns, NULL, res_id, 0, 0);
+	if (IS_ERR(res))
+		RETURN(0);
+
+	LDLM_RESOURCE_ADDREF(res);
+	rc = ldlm_resource_foreach(res, iter, data);
+	LDLM_RESOURCE_DELREF(res);
+	ldlm_resource_putref(res);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ldlm_resource_iterate);
+
+/* Lock replay */
+static int ldlm_chain_lock_for_replay(struct ldlm_lock *lock, void *closure)
+{
+	struct list_head *list = closure;
+
+	/* We use l_pending_chain here, because it's unused on clients. */
+	LASSERTF(list_empty(&lock->l_pending_chain),
+		 "lock %p next %p prev %p\n",
+		 lock, &lock->l_pending_chain.next,
+		 &lock->l_pending_chain.prev);
+	/*
+	 * b=9573: don't replay locks left after eviction, or
+	 * b=17614: locks being actively cancelled. Get a reference
+	 * on a lock so that it does not disappear under us (e.g. due to
+	 * cancel).
+	 */
+	if (!(lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_BL_DONE))) {
+		list_add(&lock->l_pending_chain, list);
+		LDLM_LOCK_GET(lock);
+	}
+
+	return LDLM_ITER_CONTINUE;
+}
+
+static int replay_lock_interpret(const struct lu_env *env,
+				 struct ptlrpc_request *req, void *args, int rc)
+{
+	struct ldlm_async_args *aa = args;
+	struct ldlm_lock *lock;
+	struct ldlm_reply *reply;
+	struct obd_export *exp;
+
+	ENTRY;
+	atomic_dec(&req->rq_import->imp_replay_inflight);
+	wake_up(&req->rq_import->imp_replay_waitq);
+
+	if (rc != ELDLM_OK)
+		GOTO(out, rc);
+
+	reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
+	if (reply == NULL)
+		GOTO(out, rc = -EPROTO);
+
+	lock = ldlm_handle2lock(&aa->lock_handle);
+	if (!lock) {
+		CERROR("received replay ack for unknown local cookie %#llx remote cookie %#llx from server %s id %s\n",
+		       aa->lock_handle.cookie, reply->lock_handle.cookie,
+		       req->rq_export->exp_client_uuid.uuid,
+		       libcfs_id2str(req->rq_peer));
+		GOTO(out, rc = -ESTALE);
+	}
+
+	/* Key change: rehash the lock in the per-export hash with new key */
+	exp = req->rq_export;
+	if (exp && exp->exp_lock_hash) {
+		/*
+		 * In the function below, .hs_keycmp resolves to
+		 * ldlm_export_lock_keycmp()
+		 */
+		/* coverity[overrun-buffer-val] */
+		cfs_hash_rehash_key(exp->exp_lock_hash,
+				    &lock->l_remote_handle,
+				    &reply->lock_handle,
+				    &lock->l_exp_hash);
+	} else {
+		lock->l_remote_handle = reply->lock_handle;
+	}
+
+	LDLM_DEBUG(lock, "replayed lock:");
+	ptlrpc_import_recovery_state_machine(req->rq_import);
+	LDLM_LOCK_PUT(lock);
+out:
+	if (rc != ELDLM_OK)
+		ptlrpc_connect_import(req->rq_import);
+
+	RETURN(rc);
+}
+
+static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock)
+{
+	struct ptlrpc_request *req;
+	struct ldlm_async_args *aa;
+	struct ldlm_request *body;
+	int flags;
+
+	ENTRY;
+
+	/* b=11974: Do not replay a lock which is actively being canceled */
+	if (ldlm_is_bl_done(lock)) {
+		LDLM_DEBUG(lock, "Not replaying canceled lock:");
+		RETURN(0);
+	}
+
+	/*
+	 * If this is a reply-less callback lock, we cannot replay it, since
+	 * the server might long since have dropped it, with the notification
+	 * of that event lost in the network (and the server may already have
+	 * granted a conflicting lock).
+	 */
+	if (ldlm_is_cancel_on_block(lock)) {
+		LDLM_DEBUG(lock, "Not replaying reply-less lock:");
+		ldlm_lock_cancel(lock);
+		RETURN(0);
+	}
+
+	/*
+	 * If granted mode matches the requested mode, this lock is granted.
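+	 * (LDLM_FL_BLOCK_GRANTED is added to the replay flags below for
+	 * this case.)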
+	 *
+	 * If we haven't been granted anything and are on a resource list,
+	 * then we're blocked/waiting.
+	 *
+	 * If we haven't been granted anything and we're NOT on a resource
+	 * list, then we haven't got a reply yet and don't have a known
+	 * disposition. This happens whenever a lock enqueue is the request
+	 * that triggers recovery.
+	 */
+	if (ldlm_is_granted(lock))
+		flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_GRANTED;
+	else if (!list_empty(&lock->l_res_link))
+		flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_WAIT;
+	else
+		flags = LDLM_FL_REPLAY;
+
+	req = ptlrpc_request_alloc_pack(imp, &RQF_LDLM_ENQUEUE,
+					LUSTRE_DLM_VERSION, LDLM_ENQUEUE);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	/* We're part of recovery, so don't wait for it. */
+	req->rq_send_state = LUSTRE_IMP_REPLAY_LOCKS;
+	/* If the state changed while we were being prepared, don't wait */
+	req->rq_no_delay = 1;
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
+	ldlm_lock2desc(lock, &body->lock_desc);
+	body->lock_flags = ldlm_flags_to_wire(flags);
+
+	ldlm_lock2handle(lock, &body->lock_handle[0]);
+	if (lock->l_lvb_len > 0)
+		req_capsule_extend(&req->rq_pill, &RQF_LDLM_ENQUEUE_LVB);
+	req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
+			     lock->l_lvb_len);
+	ptlrpc_request_set_replen(req);
+	/*
+	 * Notify the server we've replayed all requests.
+	 * Also, mark the request to be put on a dedicated
+	 * queue to be processed after all request replays.
+	 * b=6063
+	 */
+	lustre_msg_set_flags(req->rq_reqmsg, MSG_REQ_REPLAY_DONE);
+
+	LDLM_DEBUG(lock, "replaying lock:");
+
+	atomic_inc(&imp->imp_replay_inflight);
+	aa = ptlrpc_req_async_args(aa, req);
+	aa->lock_handle = body->lock_handle[0];
+	req->rq_interpret_reply = replay_lock_interpret;
+	ptlrpcd_add_req(req);
+
+	RETURN(0);
+}
+
+/**
+ * Cancel as many unused locks as possible before replay. Since we are
+ * in recovery, we can neither wait for any outstanding RPCs to complete
+ * nor send any new RPC to the server.
+ *
+ * Called only in recovery, before replaying locks. There is no need to
+ * replay locks that are unused. Since the clients may hold thousands of
+ * cached unused locks, dropping the unused locks can greatly reduce the
+ * load on the servers at recovery time.
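+ *
+ * The cancellation below is therefore purely local: LCF_LOCAL with
+ * LDLM_LRU_FLAG_NO_WAIT means no CANCEL RPCs are sent and no in-flight
+ * RPCs are waited upon.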
+ */ +static void ldlm_cancel_unused_locks_for_replay(struct ldlm_namespace *ns) +{ + int canceled; + LIST_HEAD(cancels); + + CDEBUG(D_DLMTRACE, + "Dropping as many unused locks as possible before replay for namespace %s (%d)\n", + ldlm_ns_name(ns), ns->ns_nr_unused); + + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_REPLAY_PAUSE, cfs_fail_val); + + /* + * We don't need to care whether or not LRU resize is enabled + * because the LDLM_LRU_FLAG_NO_WAIT policy doesn't use the + * count parameter + */ + canceled = ldlm_cancel_lru_local(ns, &cancels, ns->ns_nr_unused, 0, + LCF_LOCAL, LDLM_LRU_FLAG_NO_WAIT); + + CDEBUG(D_DLMTRACE, "Canceled %d unused locks from namespace %s\n", + canceled, ldlm_ns_name(ns)); +} + +static int lock_can_replay(struct obd_import *imp) +{ + struct client_obd *cli = &imp->imp_obd->u.cli; + + CDEBUG(D_HA, "check lock replay limit, inflights = %u(%u)\n", + atomic_read(&imp->imp_replay_inflight) - 1, + cli->cl_max_rpcs_in_flight); + + /* +1 due to ldlm_lock_replay() increment */ + return atomic_read(&imp->imp_replay_inflight) < + 1 + min_t(u32, cli->cl_max_rpcs_in_flight, 8); +} + +int __ldlm_replay_locks(struct obd_import *imp, bool rate_limit) +{ + struct ldlm_namespace *ns = imp->imp_obd->obd_namespace; + LIST_HEAD(list); + struct ldlm_lock *lock, *next; + int rc = 0; + + ENTRY; + + while (atomic_read(&imp->imp_replay_inflight) != 1) + cond_resched(); + + /* don't replay locks if import failed recovery */ + if (imp->imp_vbr_failed) + RETURN(0); + + if (ldlm_cancel_unused_locks_before_replay) + ldlm_cancel_unused_locks_for_replay(ns); + + ldlm_namespace_foreach(ns, ldlm_chain_lock_for_replay, &list); + + list_for_each_entry_safe(lock, next, &list, l_pending_chain) { + list_del_init(&lock->l_pending_chain); + /* If we disconnected in the middle - cleanup and let + * reconnection to happen again. 
LU-14027 */ + if (rc || (imp->imp_state != LUSTRE_IMP_REPLAY_LOCKS)) { + LDLM_LOCK_RELEASE(lock); + continue; + } + rc = replay_one_lock(imp, lock); + LDLM_LOCK_RELEASE(lock); + + if (rate_limit) + wait_event_idle_exclusive(imp->imp_replay_waitq, + lock_can_replay(imp)); + } + + RETURN(rc); +} + +/** + * Lock replay uses rate control and can sleep waiting so + * must be in separate thread from ptlrpcd itself + */ +static int ldlm_lock_replay_thread(void *data) +{ + struct obd_import *imp = data; + + unshare_fs_struct(); + CDEBUG(D_HA, "lock replay thread %s to %s@%s\n", + imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + + __ldlm_replay_locks(imp, true); + atomic_dec(&imp->imp_replay_inflight); + ptlrpc_import_recovery_state_machine(imp); + class_import_put(imp); + + return 0; +} + +int ldlm_replay_locks(struct obd_import *imp) +{ + struct task_struct *task; + int rc = 0; + + /* ensure this doesn't fall to 0 before all have been queued */ + if (atomic_inc_return(&imp->imp_replay_inflight) > 1) { + atomic_dec(&imp->imp_replay_inflight); + return 0; + } + class_import_get(imp); + + task = kthread_run(ldlm_lock_replay_thread, imp, "ldlm_lock_replay"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CDEBUG(D_HA, "can't start lock replay thread: rc = %d\n", rc); + + /* run lock replay without rate control */ + rc = __ldlm_replay_locks(imp, false); + atomic_dec(&imp->imp_replay_inflight); + class_import_put(imp); + } + + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c new file mode 100644 index 0000000000000..92fef33fc9860 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c @@ -0,0 +1,1809 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ldlm/ldlm_resource.c + * + * Author: Phil Schwan + * Author: Peter Braam + */ + +#define DEBUG_SUBSYSTEM S_LDLM +#include +#include +#include +#include +#include "ldlm_internal.h" + +struct kmem_cache *ldlm_resource_slab, *ldlm_lock_slab; +struct kmem_cache *ldlm_interval_tree_slab; +struct kmem_cache *ldlm_inodebits_slab; + +int ldlm_srv_namespace_nr = 0; +int ldlm_cli_namespace_nr = 0; + +DEFINE_MUTEX(ldlm_srv_namespace_lock); +LIST_HEAD(ldlm_srv_namespace_list); + +DEFINE_MUTEX(ldlm_cli_namespace_lock); +/* Client Namespaces that have active resources in them. 
+ * Once all resources go away, ldlm_poold moves such namespaces to the + * inactive list */ +LIST_HEAD(ldlm_cli_active_namespace_list); +/* Client namespaces that don't have any locks in them */ +LIST_HEAD(ldlm_cli_inactive_namespace_list); + +static struct dentry *ldlm_debugfs_dir; +static struct dentry *ldlm_ns_debugfs_dir; +struct dentry *ldlm_svc_debugfs_dir; + +/* during debug dump certain amount of granted locks for one resource to avoid + * DDOS. */ +static unsigned int ldlm_dump_granted_max = 256; + +static ssize_t ldebugfs_dump_ns_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + ldlm_dump_all_namespaces(LDLM_NAMESPACE_SERVER, D_DLMTRACE); + ldlm_dump_all_namespaces(LDLM_NAMESPACE_CLIENT, D_DLMTRACE); + RETURN(count); +} + +LDEBUGFS_FOPS_WR_ONLY(ldlm, dump_ns); + +static int ldlm_rw_uint_seq_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%u\n", *(unsigned int *)m->private); + return 0; +} + +static ssize_t +ldlm_rw_uint_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *seq = file->private_data; + + if (!count) + return 0; + + return kstrtouint_from_user(buffer, count, 0, + (unsigned int *)seq->private); +} + +LDEBUGFS_SEQ_FOPS(ldlm_rw_uint); + +#ifdef HAVE_SERVER_SUPPORT + +static int seq_watermark_show(struct seq_file *m, void *data) +{ + seq_printf(m, "%llu\n", *(__u64 *)m->private); + return 0; +} + +static ssize_t seq_watermark_write(struct file *file, + const char __user *buffer, size_t count, + loff_t *off) +{ + struct seq_file *m = file->private_data; + u64 value; + __u64 watermark; + __u64 *data = m->private; + bool wm_low = (data == &ldlm_reclaim_threshold_mb) ? true : false; + char kernbuf[22] = ""; + int rc; + + if (count >= sizeof(kernbuf)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + kernbuf[count] = 0; + + rc = sysfs_memparse(kernbuf, count, &value, "MiB"); + if (rc < 0) { + CERROR("Failed to set %s, rc = %d.\n", + wm_low ? "lock_reclaim_threshold_mb" : "lock_limit_mb", + rc); + return rc; + } else if (value != 0 && value < (1 << 20)) { + CERROR("%s should be greater than 1MB.\n", + wm_low ? 
"lock_reclaim_threshold_mb" : "lock_limit_mb"); + return -EINVAL; + } + watermark = value >> 20; + + if (wm_low) { + if (ldlm_lock_limit_mb != 0 && watermark > ldlm_lock_limit_mb) { + CERROR("lock_reclaim_threshold_mb must be smaller than " + "lock_limit_mb.\n"); + return -EINVAL; + } + + *data = watermark; + if (watermark != 0) { + watermark <<= 20; + do_div(watermark, sizeof(struct ldlm_lock)); + } + ldlm_reclaim_threshold = watermark; + } else { + if (ldlm_reclaim_threshold_mb != 0 && + watermark < ldlm_reclaim_threshold_mb) { + CERROR("lock_limit_mb must be greater than " + "lock_reclaim_threshold_mb.\n"); + return -EINVAL; + } + + *data = watermark; + if (watermark != 0) { + watermark <<= 20; + do_div(watermark, sizeof(struct ldlm_lock)); + } + ldlm_lock_limit = watermark; + } + + return count; +} + +static int seq_watermark_open(struct inode *inode, struct file *file) +{ + return single_open(file, seq_watermark_show, inode->i_private); +} + +static const struct file_operations ldlm_watermark_fops = { + .owner = THIS_MODULE, + .open = seq_watermark_open, + .read = seq_read, + .write = seq_watermark_write, + .llseek = seq_lseek, + .release = lprocfs_single_release, +}; + +static int seq_granted_show(struct seq_file *m, void *data) +{ + seq_printf(m, "%llu\n", percpu_counter_sum_positive( + (struct percpu_counter *)m->private)); + return 0; +} + +static int seq_granted_open(struct inode *inode, struct file *file) +{ + return single_open(file, seq_granted_show, inode->i_private); +} + +static const struct file_operations ldlm_granted_fops = { + .owner = THIS_MODULE, + .open = seq_granted_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif /* HAVE_SERVER_SUPPORT */ + +static struct ldebugfs_vars ldlm_debugfs_list[] = { + { .name = "dump_namespaces", + .fops = &ldlm_dump_ns_fops, + .proc_mode = 0222 }, + { .name = "dump_granted_max", + .fops = &ldlm_rw_uint_fops, + .data = &ldlm_dump_granted_max }, +#ifdef HAVE_SERVER_SUPPORT + { .name = "lock_reclaim_threshold_mb", + .fops = &ldlm_watermark_fops, + .data = &ldlm_reclaim_threshold_mb }, + { .name = "lock_limit_mb", + .fops = &ldlm_watermark_fops, + .data = &ldlm_lock_limit_mb }, + { .name = "lock_granted_count", + .fops = &ldlm_granted_fops, + .data = &ldlm_granted_total }, +#endif + { NULL } +}; + +int ldlm_debugfs_setup(void) +{ + ENTRY; + ldlm_debugfs_dir = debugfs_create_dir(OBD_LDLM_DEVICENAME, + debugfs_lustre_root); + ldlm_ns_debugfs_dir = debugfs_create_dir("namespaces", + ldlm_debugfs_dir); + ldlm_svc_debugfs_dir = debugfs_create_dir("services", + ldlm_debugfs_dir); + + ldebugfs_add_vars(ldlm_debugfs_dir, ldlm_debugfs_list, NULL); + + RETURN(0); +} + +void ldlm_debugfs_cleanup(void) +{ + debugfs_remove_recursive(ldlm_debugfs_dir); + + ldlm_svc_debugfs_dir = NULL; + ldlm_ns_debugfs_dir = NULL; + ldlm_debugfs_dir = NULL; +} + +static ssize_t resource_count_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + __u64 res = 0; + int i; + + /* result is not strictly consistant */ + for (i = 0; i < (1 << ns->ns_bucket_bits); i++) + res += atomic_read(&ns->ns_rs_buckets[i].nsb_count); + return sprintf(buf, "%lld\n", res); +} +LUSTRE_RO_ATTR(resource_count); + +static ssize_t lock_count_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + __u64 locks; + + locks = lprocfs_stats_collector(ns->ns_stats, 
LDLM_NSS_LOCKS, + LPROCFS_FIELDS_FLAGS_SUM); + return sprintf(buf, "%lld\n", locks); +} +LUSTRE_RO_ATTR(lock_count); + +static ssize_t lock_unused_count_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return sprintf(buf, "%d\n", ns->ns_nr_unused); +} +LUSTRE_RO_ATTR(lock_unused_count); + +static ssize_t lru_size_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + __u32 *nr = &ns->ns_max_unused; + + if (ns_connect_lru_resize(ns)) + nr = &ns->ns_nr_unused; + return sprintf(buf, "%u\n", *nr); +} + +static ssize_t lru_size_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned long tmp; + int lru_resize; + int err; + + if (strncmp(buffer, "clear", 5) == 0) { + CDEBUG(D_DLMTRACE, + "dropping all unused locks from namespace %s\n", + ldlm_ns_name(ns)); + /* Try to cancel all @ns_nr_unused locks. */ + ldlm_cancel_lru(ns, INT_MAX, 0, LDLM_LRU_FLAG_CLEANUP); + return count; + } + + err = kstrtoul(buffer, 10, &tmp); + if (err != 0) { + CERROR("lru_size: invalid value written\n"); + return -EINVAL; + } + lru_resize = (tmp == 0); + + if (ns_connect_lru_resize(ns)) { + if (!lru_resize) + ns->ns_max_unused = (unsigned int)tmp; + + if (tmp > ns->ns_nr_unused) + tmp = ns->ns_nr_unused; + tmp = ns->ns_nr_unused - tmp; + + CDEBUG(D_DLMTRACE, + "changing namespace %s unused locks from %u to %u\n", + ldlm_ns_name(ns), ns->ns_nr_unused, + (unsigned int)tmp); + + if (!lru_resize) { + CDEBUG(D_DLMTRACE, + "disable lru_resize for namespace %s\n", + ldlm_ns_name(ns)); + ns->ns_connect_flags &= ~OBD_CONNECT_LRU_RESIZE; + } + ldlm_cancel_lru(ns, tmp, LCF_ASYNC, 0); + } else { + CDEBUG(D_DLMTRACE, + "changing namespace %s max_unused from %u to %u\n", + ldlm_ns_name(ns), ns->ns_max_unused, + (unsigned int)tmp); + + /* Make sure that LRU resize was originally supported before + * turning it on here. 
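+		 *
+		 * Illustrative usage from userspace (the namespace path is
+		 * an example):
+		 *
+		 *	# cap the LRU at 400 unused locks (disables resize)
+		 *	echo 400 > /sys/fs/lustre/ldlm/namespaces/<ns>/lru_size
+		 *	# re-enable dynamic LRU resize
+		 *	echo 0 > /sys/fs/lustre/ldlm/namespaces/<ns>/lru_size
+		 *	# drop all unused locks immediately
+		 *	echo clear > /sys/fs/lustre/ldlm/namespaces/<ns>/lru_size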
+ */ + if (lru_resize && + (ns->ns_orig_connect_flags & OBD_CONNECT_LRU_RESIZE)) { + CDEBUG(D_DLMTRACE, + "enable lru_resize for namespace %s\n", + ldlm_ns_name(ns)); + ns->ns_connect_flags |= OBD_CONNECT_LRU_RESIZE; + } + ns->ns_max_unused = (unsigned int)tmp; + ldlm_cancel_lru(ns, 0, LCF_ASYNC, 0); + } + + return count; +} +LUSTRE_RW_ATTR(lru_size); + +static ssize_t lru_cancel_batch_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return snprintf(buf, sizeof(buf) - 1, "%u\n", ns->ns_cancel_batch); +} + +static ssize_t lru_cancel_batch_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned long tmp; + + if (kstrtoul(buffer, 10, &tmp)) + return -EINVAL; + + ns->ns_cancel_batch = (unsigned int)tmp; + + return count; +} +LUSTRE_RW_ATTR(lru_cancel_batch); + +static ssize_t ns_recalc_pct_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return snprintf(buf, sizeof(buf) - 1, "%u\n", ns->ns_recalc_pct); +} + +static ssize_t ns_recalc_pct_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned long tmp; + + if (kstrtoul(buffer, 10, &tmp)) + return -EINVAL; + + if (tmp > 100) + return -ERANGE; + + ns->ns_recalc_pct = (unsigned int)tmp; + + return count; +} +LUSTRE_RW_ATTR(ns_recalc_pct); + +static ssize_t lru_max_age_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return sprintf(buf, "%lld\n", ktime_to_ms(ns->ns_max_age)); +} + +static ssize_t lru_max_age_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + int scale = NSEC_PER_MSEC; + unsigned long long tmp; + char *buf; + + /* Did the user ask in seconds or milliseconds. 
Default is in ms */ + buf = strstr(buffer, "ms"); + if (!buf) { + buf = strchr(buffer, 's'); + if (buf) + scale = NSEC_PER_SEC; + } + + if (buf) + *buf = '\0'; + + if (kstrtoull(buffer, 10, &tmp)) + return -EINVAL; + + ns->ns_max_age = ktime_set(0, tmp * scale); + + return count; +} +LUSTRE_RW_ATTR(lru_max_age); + +static ssize_t early_lock_cancel_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return sprintf(buf, "%d\n", ns_connect_cancelset(ns)); +} + +static ssize_t early_lock_cancel_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned long supp = -1; + int rc; + + rc = kstrtoul(buffer, 10, &supp); + if (rc < 0) + return rc; + + if (supp == 0) + ns->ns_connect_flags &= ~OBD_CONNECT_CANCELSET; + else if (ns->ns_orig_connect_flags & OBD_CONNECT_CANCELSET) + ns->ns_connect_flags |= OBD_CONNECT_CANCELSET; + return count; +} +LUSTRE_RW_ATTR(early_lock_cancel); + +static ssize_t dirty_age_limit_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return scnprintf(buf, PAGE_SIZE, "%llu\n", + ktime_divns(ns->ns_dirty_age_limit, NSEC_PER_SEC)); +} + +static ssize_t dirty_age_limit_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned long long tmp; + + if (kstrtoull(buffer, 10, &tmp)) + return -EINVAL; + + ns->ns_dirty_age_limit = ktime_set(tmp, 0); + + return count; +} +LUSTRE_RW_ATTR(dirty_age_limit); + +#ifdef HAVE_SERVER_SUPPORT +static ssize_t ctime_age_limit_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return scnprintf(buf, PAGE_SIZE, "%u\n", ns->ns_ctime_age_limit); +} + +static ssize_t ctime_age_limit_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned long tmp; + + if (kstrtoul(buffer, 10, &tmp)) + return -EINVAL; + + ns->ns_ctime_age_limit = tmp; + + return count; +} +LUSTRE_RW_ATTR(ctime_age_limit); + +static ssize_t lock_timeouts_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return sprintf(buf, "%d\n", ns->ns_timeouts); +} +LUSTRE_RO_ATTR(lock_timeouts); + +static ssize_t max_nolock_bytes_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return sprintf(buf, "%u\n", ns->ns_max_nolock_size); +} + +static ssize_t max_nolock_bytes_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned long tmp; + int err; + + err = kstrtoul(buffer, 10, &tmp); + if (err != 0) + return -EINVAL; + + ns->ns_max_nolock_size = tmp; + + return count; +} +LUSTRE_RW_ATTR(max_nolock_bytes); + +static ssize_t contention_seconds_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct 
ldlm_namespace, + ns_kobj); + + return scnprintf(buf, PAGE_SIZE, "%d\n", ns->ns_contention_time); +} + +static ssize_t contention_seconds_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned int tmp; + + if (kstrtouint(buffer, 10, &tmp)) + return -EINVAL; + + ns->ns_contention_time = tmp; + + return count; +} +LUSTRE_RW_ATTR(contention_seconds); + +static ssize_t contended_locks_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return sprintf(buf, "%u\n", ns->ns_contended_locks); +} + +static ssize_t contended_locks_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned long tmp; + int err; + + err = kstrtoul(buffer, 10, &tmp); + if (err != 0) + return -EINVAL; + + ns->ns_contended_locks = tmp; + + return count; +} +LUSTRE_RW_ATTR(contended_locks); + +static ssize_t max_parallel_ast_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return sprintf(buf, "%u\n", ns->ns_max_parallel_ast); +} + +static ssize_t max_parallel_ast_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned long tmp; + int err; + + err = kstrtoul(buffer, 10, &tmp); + if (err != 0) + return -EINVAL; + + ns->ns_max_parallel_ast = tmp; + + return count; +} +LUSTRE_RW_ATTR(max_parallel_ast); + +#endif /* HAVE_SERVER_SUPPORT */ + +/* These are for namespaces in /sys/fs/lustre/ldlm/namespaces/ */ +static struct attribute *ldlm_ns_attrs[] = { + &lustre_attr_resource_count.attr, + &lustre_attr_lock_count.attr, + &lustre_attr_lock_unused_count.attr, + &lustre_attr_ns_recalc_pct.attr, + &lustre_attr_lru_size.attr, + &lustre_attr_lru_cancel_batch.attr, + &lustre_attr_lru_max_age.attr, + &lustre_attr_early_lock_cancel.attr, + &lustre_attr_dirty_age_limit.attr, +#ifdef HAVE_SERVER_SUPPORT + &lustre_attr_ctime_age_limit.attr, + &lustre_attr_lock_timeouts.attr, + &lustre_attr_max_nolock_bytes.attr, + &lustre_attr_contention_seconds.attr, + &lustre_attr_contended_locks.attr, + &lustre_attr_max_parallel_ast.attr, +#endif + NULL, +}; + +static void ldlm_ns_release(struct kobject *kobj) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + complete(&ns->ns_kobj_unregister); +} + +KOBJ_ATTRIBUTE_GROUPS(ldlm_ns); + +static struct kobj_type ldlm_ns_ktype = { + .default_groups = KOBJ_ATTR_GROUPS(ldlm_ns), + .sysfs_ops = &lustre_sysfs_ops, + .release = ldlm_ns_release, +}; + +static void ldlm_namespace_debugfs_unregister(struct ldlm_namespace *ns) +{ + if (IS_ERR_OR_NULL(ns->ns_debugfs_entry)) + CERROR("dlm namespace %s has no procfs dir?\n", + ldlm_ns_name(ns)); + else + debugfs_remove_recursive(ns->ns_debugfs_entry); + + if (ns->ns_stats != NULL) + lprocfs_free_stats(&ns->ns_stats); +} + +void ldlm_namespace_sysfs_unregister(struct ldlm_namespace *ns) +{ + kobject_put(&ns->ns_kobj); + wait_for_completion(&ns->ns_kobj_unregister); +} + +int ldlm_namespace_sysfs_register(struct ldlm_namespace *ns) +{ + int err; + + ns->ns_kobj.kset = ldlm_ns_kset; + init_completion(&ns->ns_kobj_unregister); + err = 
kobject_init_and_add(&ns->ns_kobj, &ldlm_ns_ktype, NULL, + "%s", ldlm_ns_name(ns)); + + ns->ns_stats = lprocfs_alloc_stats(LDLM_NSS_LAST, 0); + if (!ns->ns_stats) { + kobject_put(&ns->ns_kobj); + return -ENOMEM; + } + + lprocfs_counter_init(ns->ns_stats, LDLM_NSS_LOCKS, + LPROCFS_CNTR_AVGMINMAX, "locks", "locks"); + + return err; +} + +static int ldlm_namespace_debugfs_register(struct ldlm_namespace *ns) +{ + struct dentry *ns_entry; + + if (!IS_ERR_OR_NULL(ns->ns_debugfs_entry)) { + ns_entry = ns->ns_debugfs_entry; + } else { + ns_entry = debugfs_create_dir(ldlm_ns_name(ns), + ldlm_ns_debugfs_dir); + if (!ns_entry) + return -ENOMEM; + ns->ns_debugfs_entry = ns_entry; + } + + return 0; +} +#undef MAX_STRING_SIZE + +static unsigned ldlm_res_hop_hash(struct cfs_hash *hs, + const void *key, unsigned int mask) +{ + const struct ldlm_res_id *id = key; + unsigned int val = 0; + unsigned int i; + + for (i = 0; i < RES_NAME_SIZE; i++) + val += id->name[i]; + return val & mask; +} + +static unsigned int ldlm_res_hop_fid_hash(const struct ldlm_res_id *id, unsigned int bits) +{ + struct lu_fid fid; + __u32 hash; + __u32 val; + + fid.f_seq = id->name[LUSTRE_RES_ID_SEQ_OFF]; + fid.f_oid = (__u32)id->name[LUSTRE_RES_ID_VER_OID_OFF]; + fid.f_ver = (__u32)(id->name[LUSTRE_RES_ID_VER_OID_OFF] >> 32); + + hash = fid_flatten32(&fid); + hash += (hash >> 4) + (hash << 12); /* mixing oid and seq */ + if (id->name[LUSTRE_RES_ID_HSH_OFF] != 0) { + val = id->name[LUSTRE_RES_ID_HSH_OFF]; + } else { + val = fid_oid(&fid); + } + hash += (val >> 5) + (val << 11); + return cfs_hash_32(hash, bits); +} + +static void *ldlm_res_hop_key(struct hlist_node *hnode) +{ + struct ldlm_resource *res; + + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + return &res->lr_name; +} + +static int ldlm_res_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + struct ldlm_resource *res; + + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + return ldlm_res_eq((const struct ldlm_res_id *)key, + (const struct ldlm_res_id *)&res->lr_name); +} + +static void *ldlm_res_hop_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct ldlm_resource, lr_hash); +} + +static void +ldlm_res_hop_get_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ldlm_resource *res; + + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + ldlm_resource_getref(res); +} + +static void ldlm_res_hop_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ldlm_resource *res; + + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + ldlm_resource_putref(res); +} + +static struct cfs_hash_ops ldlm_ns_hash_ops = { + .hs_hash = ldlm_res_hop_hash, + .hs_key = ldlm_res_hop_key, + .hs_keycmp = ldlm_res_hop_keycmp, + .hs_keycpy = NULL, + .hs_object = ldlm_res_hop_object, + .hs_get = ldlm_res_hop_get_locked, + .hs_put = ldlm_res_hop_put +}; + +static struct { + /** hash bucket bits */ + unsigned nsd_bkt_bits; + /** hash bits */ + unsigned nsd_all_bits; +} ldlm_ns_hash_defs[] = { + [LDLM_NS_TYPE_MDC] = { + .nsd_bkt_bits = 11, + .nsd_all_bits = 16, + }, + [LDLM_NS_TYPE_MDT] = { + .nsd_bkt_bits = 14, + .nsd_all_bits = 21, + }, + [LDLM_NS_TYPE_OSC] = { + .nsd_bkt_bits = 8, + .nsd_all_bits = 12, + }, + [LDLM_NS_TYPE_OST] = { + .nsd_bkt_bits = 11, + .nsd_all_bits = 17, + }, + [LDLM_NS_TYPE_MGC] = { + .nsd_bkt_bits = 3, + .nsd_all_bits = 4, + }, + [LDLM_NS_TYPE_MGT] = { + .nsd_bkt_bits = 3, + .nsd_all_bits = 4, + }, +}; + +/** + * Create and initialize new empty namespace. 
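+ *
+ * The \a ns_type argument selects the hash table geometry from
+ * ldlm_ns_hash_defs[] above; unknown types are rejected with -EINVAL.
+ *
+ * \retval new namespace on success
+ * \retval ERR_PTR(-errno) on failure
+ *
+ * Illustrative call (hypothetical caller; the name and constants are
+ * placeholders):
+ *
+ *	ns = ldlm_namespace_new(obd, "my-ns", LDLM_NAMESPACE_CLIENT,
+ *				LDLM_NAMESPACE_MODEST, LDLM_NS_TYPE_MGC);
+ *	if (IS_ERR(ns))
+ *		return PTR_ERR(ns);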
+ */ +struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name, + enum ldlm_side client, + enum ldlm_appetite apt, + enum ldlm_ns_type ns_type) +{ + struct ldlm_namespace *ns = NULL; + int idx; + int rc; + + ENTRY; + LASSERT(obd != NULL); + + rc = ldlm_get_ref(); + if (rc) { + CERROR("%s: ldlm_get_ref failed: rc = %d\n", name, rc); + RETURN(ERR_PTR(rc)); + } + + if (ns_type >= ARRAY_SIZE(ldlm_ns_hash_defs) || + ldlm_ns_hash_defs[ns_type].nsd_bkt_bits == 0) { + rc = -EINVAL; + CERROR("%s: unknown namespace type %d: rc = %d\n", + name, ns_type, rc); + GOTO(out_ref, rc); + } + + OBD_ALLOC_PTR(ns); + if (!ns) + GOTO(out_ref, rc = -ENOMEM); + + ns->ns_rs_hash = cfs_hash_create(name, + ldlm_ns_hash_defs[ns_type].nsd_all_bits, + ldlm_ns_hash_defs[ns_type].nsd_all_bits, + ldlm_ns_hash_defs[ns_type].nsd_bkt_bits, + 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &ldlm_ns_hash_ops, + CFS_HASH_DEPTH | + CFS_HASH_BIGNAME | + CFS_HASH_SPIN_BKTLOCK | + CFS_HASH_NO_ITEMREF); + if (!ns->ns_rs_hash) + GOTO(out_ns, rc = -ENOMEM); + + ns->ns_bucket_bits = ldlm_ns_hash_defs[ns_type].nsd_all_bits - + ldlm_ns_hash_defs[ns_type].nsd_bkt_bits; + + OBD_ALLOC_PTR_ARRAY_LARGE(ns->ns_rs_buckets, 1 << ns->ns_bucket_bits); + if (!ns->ns_rs_buckets) + GOTO(out_hash, rc = -ENOMEM); + + for (idx = 0; idx < (1 << ns->ns_bucket_bits); idx++) { + struct ldlm_ns_bucket *nsb = &ns->ns_rs_buckets[idx]; + + at_init(&nsb->nsb_at_estimate, ldlm_enqueue_min, 0); + nsb->nsb_namespace = ns; + nsb->nsb_reclaim_start = 0; + atomic_set(&nsb->nsb_count, 0); + } + + ns->ns_obd = obd; + ns->ns_appetite = apt; + ns->ns_client = client; + ns->ns_name = kstrdup(name, GFP_KERNEL); + if (!ns->ns_name) + GOTO(out_hash, rc = -ENOMEM); + + INIT_LIST_HEAD(&ns->ns_list_chain); + INIT_LIST_HEAD(&ns->ns_unused_list); + spin_lock_init(&ns->ns_lock); + atomic_set(&ns->ns_bref, 0); + init_waitqueue_head(&ns->ns_waitq); + + ns->ns_max_nolock_size = NS_DEFAULT_MAX_NOLOCK_BYTES; + ns->ns_contention_time = NS_DEFAULT_CONTENTION_SECONDS; + ns->ns_contended_locks = NS_DEFAULT_CONTENDED_LOCKS; + + ns->ns_max_parallel_ast = LDLM_DEFAULT_PARALLEL_AST_LIMIT; + ns->ns_nr_unused = 0; + ns->ns_max_unused = LDLM_DEFAULT_LRU_SIZE; + ns->ns_cancel_batch = LDLM_DEFAULT_LRU_SHRINK_BATCH; + ns->ns_recalc_pct = LDLM_DEFAULT_SLV_RECALC_PCT; + ns->ns_max_age = ktime_set(LDLM_DEFAULT_MAX_ALIVE, 0); + ns->ns_ctime_age_limit = LDLM_CTIME_AGE_LIMIT; + ns->ns_dirty_age_limit = ktime_set(LDLM_DIRTY_AGE_LIMIT, 0); + ns->ns_timeouts = 0; + ns->ns_orig_connect_flags = 0; + ns->ns_connect_flags = 0; + ns->ns_stopping = 0; + ns->ns_reclaim_start = 0; + ns->ns_last_pos = &ns->ns_unused_list; + ns->ns_flags = 0; + + rc = ldlm_namespace_sysfs_register(ns); + if (rc) { + CERROR("%s: cannot initialize ns sysfs: rc = %d\n", name, rc); + GOTO(out_hash, rc); + } + + rc = ldlm_namespace_debugfs_register(ns); + if (rc) { + CERROR("%s: cannot initialize ns proc: rc = %d\n", name, rc); + GOTO(out_sysfs, rc); + } + + idx = ldlm_namespace_nr_read(client); + rc = ldlm_pool_init(&ns->ns_pool, ns, idx, client); + if (rc) { + CERROR("%s: cannot initialize lock pool, rc = %d\n", name, rc); + GOTO(out_proc, rc); + } + + ldlm_namespace_register(ns, client); + RETURN(ns); +out_proc: + ldlm_namespace_debugfs_unregister(ns); +out_sysfs: + ldlm_namespace_sysfs_unregister(ns); + ldlm_namespace_cleanup(ns, 0); +out_hash: + OBD_FREE_PTR_ARRAY_LARGE(ns->ns_rs_buckets, 1 << ns->ns_bucket_bits); + kfree(ns->ns_name); + cfs_hash_putref(ns->ns_rs_hash); +out_ns: + OBD_FREE_PTR(ns); +out_ref: + 
ldlm_put_ref(); + RETURN(ERR_PTR(rc)); +} +EXPORT_SYMBOL(ldlm_namespace_new); + +/** + * Cancel and destroy all locks on a resource. + * + * If flags contains FL_LOCAL_ONLY, don't try to tell the server, just + * clean up. This is currently only used for recovery, and we make + * certain assumptions as a result--notably, that we shouldn't cancel + * locks with refs. + */ +static void cleanup_resource(struct ldlm_resource *res, struct list_head *q, + __u64 flags) +{ + struct list_head *tmp; + int rc = 0, client = ns_is_client(ldlm_res_to_ns(res)); + bool local_only = !!(flags & LDLM_FL_LOCAL_ONLY); + + do { + struct ldlm_lock *lock = NULL; + + /* First, we look for non-cleaned-yet lock + * all cleaned locks are marked by CLEANED flag. */ + lock_res(res); + list_for_each(tmp, q) { + lock = list_entry(tmp, struct ldlm_lock, + l_res_link); + if (ldlm_is_cleaned(lock)) { + lock = NULL; + continue; + } + LDLM_LOCK_GET(lock); + ldlm_set_cleaned(lock); + break; + } + + if (lock == NULL) { + unlock_res(res); + break; + } + + /* Set CBPENDING so nothing in the cancellation path + * can match this lock. */ + ldlm_set_cbpending(lock); + ldlm_set_failed(lock); + lock->l_flags |= flags; + + /* ... without sending a CANCEL message for local_only. */ + if (local_only) + ldlm_set_local_only(lock); + + if (local_only && (lock->l_readers || lock->l_writers)) { + /* + * This is a little bit gross, but much better than the + * alternative: pretend that we got a blocking AST from + * the server, so that when the lock is decref'd, it + * will go away ... + */ + unlock_res(res); + LDLM_DEBUG(lock, "setting FL_LOCAL_ONLY"); + if (lock->l_flags & LDLM_FL_FAIL_LOC) + schedule_timeout_uninterruptible( + cfs_time_seconds(4)); + + if (lock->l_completion_ast) + lock->l_completion_ast(lock, + LDLM_FL_FAILED, NULL); + LDLM_LOCK_RELEASE(lock); + continue; + } + + if (client) { + struct lustre_handle lockh; + + unlock_res(res); + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_LOCAL); + if (rc) + CERROR("ldlm_cli_cancel: %d\n", rc); + } else { + unlock_res(res); + LDLM_DEBUG(lock, + "Freeing a lock still held by a client node"); + ldlm_lock_cancel(lock); + } + LDLM_LOCK_RELEASE(lock); + } while (1); +} + +static int ldlm_resource_clean(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + __u64 flags = *(__u64 *)arg; + + cleanup_resource(res, &res->lr_granted, flags); + cleanup_resource(res, &res->lr_waiting, flags); + + return 0; +} + +static int ldlm_resource_complain(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + + lock_res(res); + CERROR("%s: namespace resource "DLDLMRES" (%p) refcount nonzero " + "(%d) after lock cleanup; forcing cleanup.\n", + ldlm_ns_name(ldlm_res_to_ns(res)), PLDLMRES(res), res, + atomic_read(&res->lr_refcount) - 1); + + /* Use D_NETERROR since it is in the default mask */ + ldlm_resource_dump(D_NETERROR, res); + unlock_res(res); + return 0; +} + +/** + * Cancel and destroy all locks in the namespace. + * + * Typically used during evictions when server notified client that it was + * evicted and all of its state needs to be destroyed. + * Also used during shutdown. 
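+ *
+ * \a flags is passed down to cleanup_resource(); notably,
+ * LDLM_FL_LOCAL_ONLY performs a purely local cleanup without sending
+ * CANCEL RPCs to the server, which is how the forced shutdown path
+ * uses it.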
+ */ +int ldlm_namespace_cleanup(struct ldlm_namespace *ns, __u64 flags) +{ + if (ns == NULL) { + CDEBUG(D_INFO, "NULL ns, skipping cleanup\n"); + return ELDLM_OK; + } + + cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_resource_clean, + &flags, 0); + cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_resource_complain, + NULL, 0); + return ELDLM_OK; +} +EXPORT_SYMBOL(ldlm_namespace_cleanup); + +/** + * Attempts to free namespace. + * + * Only used when namespace goes away, like during an unmount. + */ +static int __ldlm_namespace_free(struct ldlm_namespace *ns, int force) +{ + ENTRY; + + /* At shutdown time, don't call the cancellation callback */ + ldlm_namespace_cleanup(ns, force ? LDLM_FL_LOCAL_ONLY : 0); + + if (atomic_read(&ns->ns_bref) > 0) { + int rc; + CDEBUG(D_DLMTRACE, + "dlm namespace %s free waiting on refcount %d\n", + ldlm_ns_name(ns), atomic_read(&ns->ns_bref)); +force_wait: + if (force) + rc = wait_event_idle_timeout( + ns->ns_waitq, + atomic_read(&ns->ns_bref) == 0, + cfs_time_seconds(1) / 4); + else + rc = l_wait_event_abortable( + ns->ns_waitq, atomic_read(&ns->ns_bref) == 0); + + /* Forced cleanups should be able to reclaim all references, + * so it's safe to wait forever... we can't leak locks... */ + if (force && rc == 0) { + rc = -ETIMEDOUT; + LCONSOLE_ERROR("Forced cleanup waiting for %s " + "namespace with %d resources in use, " + "(rc=%d)\n", ldlm_ns_name(ns), + atomic_read(&ns->ns_bref), rc); + GOTO(force_wait, rc); + } + + if (atomic_read(&ns->ns_bref)) { + LCONSOLE_ERROR("Cleanup waiting for %s namespace " + "with %d resources in use, (rc=%d)\n", + ldlm_ns_name(ns), + atomic_read(&ns->ns_bref), rc); + RETURN(ELDLM_NAMESPACE_EXISTS); + } + CDEBUG(D_DLMTRACE, "dlm namespace %s free done waiting\n", + ldlm_ns_name(ns)); + } + + RETURN(ELDLM_OK); +} + +/** + * Performs various cleanups for passed \a ns to make it drop refc and be + * ready for freeing. Waits for refc == 0. + * + * The following is done: + * (0) Unregister \a ns from its list to make inaccessible for potential + * users like pools thread and others; + * (1) Clear all locks in \a ns. + */ +void ldlm_namespace_free_prior(struct ldlm_namespace *ns, + struct obd_import *imp, + int force) +{ + int rc; + + ENTRY; + if (!ns) { + EXIT; + return; + } + + spin_lock(&ns->ns_lock); + ns->ns_stopping = 1; + spin_unlock(&ns->ns_lock); + + /* + * Can fail with -EINTR when force == 0 in which case try harder. + */ + rc = __ldlm_namespace_free(ns, force); + if (rc != ELDLM_OK) { + if (imp) { + ptlrpc_disconnect_import(imp, 0); + ptlrpc_invalidate_import(imp); + } + + /* + * With all requests dropped and the import inactive + * we are gaurenteed all reference will be dropped. + */ + rc = __ldlm_namespace_free(ns, 1); + LASSERT(rc == 0); + } + EXIT; +} +EXPORT_SYMBOL(ldlm_namespace_free_prior); + +/** + * Performs freeing memory structures related to \a ns. This is only done + * when ldlm_namespce_free_prior() successfully removed all resources + * referencing \a ns and its refc == 0. + */ +void ldlm_namespace_free_post(struct ldlm_namespace *ns) +{ + ENTRY; + if (!ns) { + EXIT; + return; + } + + /* Make sure that nobody can find this ns in its list. */ + ldlm_namespace_unregister(ns, ns->ns_client); + /* Fini pool _before_ parent proc dir is removed. This is important as + * ldlm_pool_fini() removes own proc dir which is child to @dir. + * Removing it after @dir may cause oops. 
*/ + ldlm_pool_fini(&ns->ns_pool); + + ldlm_namespace_debugfs_unregister(ns); + ldlm_namespace_sysfs_unregister(ns); + cfs_hash_putref(ns->ns_rs_hash); + OBD_FREE_PTR_ARRAY_LARGE(ns->ns_rs_buckets, 1 << ns->ns_bucket_bits); + kfree(ns->ns_name); + /* Namespace \a ns should be not on list at this time, otherwise + * this will cause issues related to using freed \a ns in poold + * thread. + */ + LASSERT(list_empty(&ns->ns_list_chain)); + OBD_FREE_PTR(ns); + ldlm_put_ref(); + EXIT; +} +EXPORT_SYMBOL(ldlm_namespace_free_post); + +/** + * Cleanup the resource, and free namespace. + * bug 12864: + * Deadlock issue: + * proc1: destroy import + * class_disconnect_export(grab cl_sem) -> + * -> ldlm_namespace_free -> + * -> lprocfs_remove(grab _lprocfs_lock). + * proc2: read proc info + * lprocfs_fops_read(grab _lprocfs_lock) -> + * -> osc_rd_active, etc(grab cl_sem). + * + * So that I have to split the ldlm_namespace_free into two parts - the first + * part ldlm_namespace_free_prior is used to cleanup the resource which is + * being used; the 2nd part ldlm_namespace_free_post is used to unregister the + * lprocfs entries, and then free memory. It will be called w/o cli->cl_sem + * held. + */ +void ldlm_namespace_free(struct ldlm_namespace *ns, + struct obd_import *imp, + int force) +{ + ldlm_namespace_free_prior(ns, imp, force); + ldlm_namespace_free_post(ns); +} +EXPORT_SYMBOL(ldlm_namespace_free); + +void ldlm_namespace_get(struct ldlm_namespace *ns) +{ + atomic_inc(&ns->ns_bref); +} + +/* This is only for callers that care about refcount */ +static int ldlm_namespace_get_return(struct ldlm_namespace *ns) +{ + return atomic_inc_return(&ns->ns_bref); +} + +void ldlm_namespace_put(struct ldlm_namespace *ns) +{ + if (atomic_dec_and_lock(&ns->ns_bref, &ns->ns_lock)) { + wake_up(&ns->ns_waitq); + spin_unlock(&ns->ns_lock); + } +} + +/** Register \a ns in the list of namespaces */ +void ldlm_namespace_register(struct ldlm_namespace *ns, enum ldlm_side client) +{ + mutex_lock(ldlm_namespace_lock(client)); + LASSERT(list_empty(&ns->ns_list_chain)); + list_add(&ns->ns_list_chain, ldlm_namespace_inactive_list(client)); + ldlm_namespace_nr_inc(client); + mutex_unlock(ldlm_namespace_lock(client)); +} + +/** Unregister \a ns from the list of namespaces. */ +void ldlm_namespace_unregister(struct ldlm_namespace *ns, enum ldlm_side client) +{ + mutex_lock(ldlm_namespace_lock(client)); + LASSERT(!list_empty(&ns->ns_list_chain)); + /* Some asserts and possibly other parts of the code are still + * using list_empty(&ns->ns_list_chain). This is why it is + * important to use list_del_init() here. */ + list_del_init(&ns->ns_list_chain); + ldlm_namespace_nr_dec(client); + mutex_unlock(ldlm_namespace_lock(client)); +} + +/** Should be called with ldlm_namespace_lock(client) taken. */ +void ldlm_namespace_move_to_active_locked(struct ldlm_namespace *ns, + enum ldlm_side client) +{ + LASSERT(!list_empty(&ns->ns_list_chain)); + LASSERT(mutex_is_locked(ldlm_namespace_lock(client))); + list_move_tail(&ns->ns_list_chain, ldlm_namespace_list(client)); +} + +/** Should be called with ldlm_namespace_lock(client) taken. */ +void ldlm_namespace_move_to_inactive_locked(struct ldlm_namespace *ns, + enum ldlm_side client) +{ + LASSERT(!list_empty(&ns->ns_list_chain)); + LASSERT(mutex_is_locked(ldlm_namespace_lock(client))); + list_move_tail(&ns->ns_list_chain, + ldlm_namespace_inactive_list(client)); +} + +/** Should be called with ldlm_namespace_lock(client) taken. 
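+ * Returns the namespace at the head of the side's active list; the
+ * caller must guarantee the list is non-empty, as the LASSERTs below
+ * check.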
*/ +struct ldlm_namespace *ldlm_namespace_first_locked(enum ldlm_side client) +{ + LASSERT(mutex_is_locked(ldlm_namespace_lock(client))); + LASSERT(!list_empty(ldlm_namespace_list(client))); + return container_of(ldlm_namespace_list(client)->next, + struct ldlm_namespace, ns_list_chain); +} + +static bool ldlm_resource_extent_new(struct ldlm_resource *res) +{ + int idx; + + OBD_SLAB_ALLOC(res->lr_itree, ldlm_interval_tree_slab, + sizeof(*res->lr_itree) * LCK_MODE_NUM); + if (res->lr_itree == NULL) + return false; + /* Initialize interval trees for each lock mode. */ + for (idx = 0; idx < LCK_MODE_NUM; idx++) { + res->lr_itree[idx].lit_size = 0; + res->lr_itree[idx].lit_mode = BIT(idx); + res->lr_itree[idx].lit_root = NULL; + } + return true; +} + +static bool ldlm_resource_inodebits_new(struct ldlm_resource *res) +{ + int i; + + OBD_ALLOC_PTR(res->lr_ibits_queues); + if (res->lr_ibits_queues == NULL) + return false; + for (i = 0; i < MDS_INODELOCK_NUMBITS; i++) + INIT_LIST_HEAD(&res->lr_ibits_queues->liq_waiting[i]); + return true; +} + +/** Create and initialize new resource. */ +static struct ldlm_resource *ldlm_resource_new(enum ldlm_type ldlm_type) +{ + struct ldlm_resource *res; + bool rc; + + OBD_SLAB_ALLOC_PTR_GFP(res, ldlm_resource_slab, GFP_NOFS); + if (res == NULL) + return NULL; + + switch (ldlm_type) { + case LDLM_EXTENT: + rc = ldlm_resource_extent_new(res); + break; + case LDLM_IBITS: + rc = ldlm_resource_inodebits_new(res); + break; + default: + rc = true; + break; + } + if (!rc) { + OBD_SLAB_FREE_PTR(res, ldlm_resource_slab); + return NULL; + } + + INIT_LIST_HEAD(&res->lr_granted); + INIT_LIST_HEAD(&res->lr_waiting); + + atomic_set(&res->lr_refcount, 1); + spin_lock_init(&res->lr_lock); + lu_ref_init(&res->lr_reference); + + /* Since LVB init can be delayed now, there is no longer need to + * immediatelly acquire mutex here. */ + mutex_init(&res->lr_lvb_mutex); + res->lr_lvb_initialized = false; + + return res; +} + +static void __ldlm_resource_free(struct rcu_head *head) +{ + struct ldlm_resource *res = container_of(head, struct ldlm_resource, + lr_rcu); + + OBD_SLAB_FREE_PTR(res, ldlm_resource_slab); +} + +static void ldlm_resource_free(struct ldlm_resource *res) +{ + if (res->lr_type == LDLM_EXTENT) { + if (res->lr_itree != NULL) + OBD_SLAB_FREE(res->lr_itree, ldlm_interval_tree_slab, + sizeof(*res->lr_itree) * LCK_MODE_NUM); + } else if (res->lr_type == LDLM_IBITS) { + if (res->lr_ibits_queues != NULL) + OBD_FREE_PTR(res->lr_ibits_queues); + } + + call_rcu(&res->lr_rcu, __ldlm_resource_free); +} + +/** + * Return a reference to resource with given name, creating it if necessary. 
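+ *
+ * Minimal lookup-only usage sketch (hypothetical caller with an
+ * existing \a ns and \a res_id; error handling elided):
+ *
+ *	res = ldlm_resource_get(ns, NULL, &res_id, LDLM_PLAIN, 0);
+ *	if (IS_ERR(res))
+ *		return PTR_ERR(res);	/* -ENOENT when create == 0 */
+ *	...
+ *	ldlm_resource_putref(res);
+ *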
+ * Args: namespace with ns_lock unlocked + * Locks: takes and releases NS hash-lock and res->lr_lock + * Returns: referenced, unlocked ldlm_resource or ERR_PTR + */ +struct ldlm_resource * +ldlm_resource_get(struct ldlm_namespace *ns, struct ldlm_resource *parent, + const struct ldlm_res_id *name, enum ldlm_type type, + int create) +{ + struct hlist_node *hnode; + struct ldlm_resource *res = NULL; + struct cfs_hash_bd bd; + __u64 version; + int ns_refcount = 0; + int hash; + + LASSERT(ns != NULL); + LASSERT(parent == NULL); + LASSERT(ns->ns_rs_hash != NULL); + LASSERT(name->name[0] != 0); + + cfs_hash_bd_get_and_lock(ns->ns_rs_hash, (void *)name, &bd, 0); + hnode = cfs_hash_bd_lookup_locked(ns->ns_rs_hash, &bd, (void *)name); + if (hnode != NULL) { + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 0); + GOTO(found, res); + } + + version = cfs_hash_bd_version_get(&bd); + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 0); + + if (create == 0) + return ERR_PTR(-ENOENT); + + LASSERTF(type >= LDLM_MIN_TYPE && type < LDLM_MAX_TYPE, + "type: %d\n", type); + res = ldlm_resource_new(type); + if (res == NULL) + return ERR_PTR(-ENOMEM); + + hash = ldlm_res_hop_fid_hash(name, ns->ns_bucket_bits); + res->lr_ns_bucket = &ns->ns_rs_buckets[hash]; + res->lr_name = *name; + res->lr_type = type; + + cfs_hash_bd_lock(ns->ns_rs_hash, &bd, 1); + hnode = (version == cfs_hash_bd_version_get(&bd)) ? NULL : + cfs_hash_bd_lookup_locked(ns->ns_rs_hash, &bd, (void *)name); + + if (hnode != NULL) { + /* Someone won the race and already added the resource. */ + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1); + /* Clean lu_ref for failed resource. */ + lu_ref_fini(&res->lr_reference); + ldlm_resource_free(res); +found: + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + return res; + } + /* We won! Let's add the resource. */ + cfs_hash_bd_add_locked(ns->ns_rs_hash, &bd, &res->lr_hash); + if (atomic_inc_return(&res->lr_ns_bucket->nsb_count) == 1) + ns_refcount = ldlm_namespace_get_return(ns); + + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1); + + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CREATE_RESOURCE, 2); + + /* Let's see if we happened to be the very first resource in this + * namespace. If so, and this is a client namespace, we need to move + * the namespace into the active namespaces list to be patrolled by + * the ldlm_poold. */ + if (ns_is_client(ns) && ns_refcount == 1) { + mutex_lock(ldlm_namespace_lock(LDLM_NAMESPACE_CLIENT)); + ldlm_namespace_move_to_active_locked(ns, LDLM_NAMESPACE_CLIENT); + mutex_unlock(ldlm_namespace_lock(LDLM_NAMESPACE_CLIENT)); + } + + return res; +} +EXPORT_SYMBOL(ldlm_resource_get); + +struct ldlm_resource *ldlm_resource_getref(struct ldlm_resource *res) +{ + LASSERT(res != NULL); + LASSERT(res != LP_POISON); + atomic_inc(&res->lr_refcount); + CDEBUG(D_INFO, "getref res: %p count: %d\n", res, + atomic_read(&res->lr_refcount)); + return res; +} + +static void __ldlm_resource_putref_final(struct cfs_hash_bd *bd, + struct ldlm_resource *res) +{ + struct ldlm_ns_bucket *nsb = res->lr_ns_bucket; + + if (!list_empty(&res->lr_granted)) { + ldlm_resource_dump(D_ERROR, res); + LBUG(); + } + + if (!list_empty(&res->lr_waiting)) { + ldlm_resource_dump(D_ERROR, res); + LBUG(); + } + + cfs_hash_bd_del_locked(nsb->nsb_namespace->ns_rs_hash, + bd, &res->lr_hash); + lu_ref_fini(&res->lr_reference); + if (atomic_dec_and_test(&nsb->nsb_count)) + ldlm_namespace_put(nsb->nsb_namespace); +} + +/* Returns 1 if the resource was freed, 0 if it remains. 
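+ * Dropping the final reference removes the resource from the namespace
+ * hash, frees its LVB via ->lvbo_free() when the namespace defines one,
+ * and releases the namespace busy reference once its bucket becomes
+ * empty.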
*/ +int ldlm_resource_putref(struct ldlm_resource *res) +{ + struct ldlm_namespace *ns = ldlm_res_to_ns(res); + struct cfs_hash_bd bd; + + LASSERT_ATOMIC_GT_LT(&res->lr_refcount, 0, LI_POISON); + CDEBUG(D_INFO, "putref res: %p count: %d\n", + res, atomic_read(&res->lr_refcount) - 1); + + cfs_hash_bd_get(ns->ns_rs_hash, &res->lr_name, &bd); + if (cfs_hash_bd_dec_and_lock(ns->ns_rs_hash, &bd, &res->lr_refcount)) { + __ldlm_resource_putref_final(&bd, res); + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1); + if (ns->ns_lvbo && ns->ns_lvbo->lvbo_free) + ns->ns_lvbo->lvbo_free(res); + ldlm_resource_free(res); + return 1; + } + return 0; +} +EXPORT_SYMBOL(ldlm_resource_putref); + +static void __ldlm_resource_add_lock(struct ldlm_resource *res, + struct list_head *head, + struct ldlm_lock *lock, + bool tail) +{ + check_res_locked(res); + + if (ldlm_is_destroyed(lock)) { + CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n"); + return; + } + + LASSERT(list_empty(&lock->l_res_link)); + + if (tail) + list_add_tail(&lock->l_res_link, head); + else + list_add(&lock->l_res_link, head); + + if (res->lr_type == LDLM_IBITS) + ldlm_inodebits_add_lock(res, head, lock, tail); + + ldlm_resource_dump(D_INFO, res); +} + +/** + * Add a lock into a given resource into specified lock list. + */ +void ldlm_resource_add_lock(struct ldlm_resource *res, struct list_head *head, + struct ldlm_lock *lock) +{ + LDLM_DEBUG(lock, "About to add this lock"); + + __ldlm_resource_add_lock(res, head, lock, true); +} + +/** + * Insert a lock into resource after specified lock. + */ +void ldlm_resource_insert_lock_after(struct ldlm_lock *original, + struct ldlm_lock *new) +{ + LASSERT(!list_empty(&original->l_res_link)); + + LDLM_DEBUG(new, "About to insert this lock after %p: ", original); + __ldlm_resource_add_lock(original->l_resource, + &original->l_res_link, + new, false); +} + +/** + * Insert a lock into resource before the specified lock. + * + * IBITS waiting locks are to be inserted to the ibit lists as well, and only + * the insert-after operation is supported for them, because the set of bits + * of the previous and the new locks must match. Therefore, get the previous + * lock and insert after. + */ +void ldlm_resource_insert_lock_before(struct ldlm_lock *original, + struct ldlm_lock *new) +{ + LASSERT(!list_empty(&original->l_res_link)); + + LDLM_DEBUG(new, "About to insert this lock before %p: ", original); + __ldlm_resource_add_lock(original->l_resource, + original->l_res_link.prev, new, false); +} + +void ldlm_resource_unlink_lock(struct ldlm_lock *lock) +{ + int type = lock->l_resource->lr_type; + + check_res_locked(lock->l_resource); + switch (type) { + case LDLM_PLAIN: + ldlm_unlink_lock_skiplist(lock); + break; + case LDLM_EXTENT: + ldlm_extent_unlink_lock(lock); + break; + case LDLM_IBITS: + ldlm_inodebits_unlink_lock(lock); + break; + } + list_del_init(&lock->l_res_link); +} +EXPORT_SYMBOL(ldlm_resource_unlink_lock); + +void ldlm_res2desc(struct ldlm_resource *res, struct ldlm_resource_desc *desc) +{ + desc->lr_type = res->lr_type; + desc->lr_name = res->lr_name; +} + +/** + * Print information about all locks in all namespaces on this node to debug + * log. 
+ */ +void ldlm_dump_all_namespaces(enum ldlm_side client, int level) +{ + struct list_head *tmp; + + if (!((libcfs_debug | D_ERROR) & level)) + return; + + mutex_lock(ldlm_namespace_lock(client)); + + list_for_each(tmp, ldlm_namespace_list(client)) { + struct ldlm_namespace *ns; + + ns = list_entry(tmp, struct ldlm_namespace, ns_list_chain); + ldlm_namespace_dump(level, ns); + } + + mutex_unlock(ldlm_namespace_lock(client)); +} + +static int ldlm_res_hash_dump(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + int level = (int)(unsigned long)arg; + + lock_res(res); + ldlm_resource_dump(level, res); + unlock_res(res); + + return 0; +} + +/** + * Print information about all locks in this namespace on this node to debug + * log. + */ +void ldlm_namespace_dump(int level, struct ldlm_namespace *ns) +{ + if (!((libcfs_debug | D_ERROR) & level)) + return; + + CDEBUG(level, "--- Namespace: %s (rc: %d, side: %s)\n", + ldlm_ns_name(ns), atomic_read(&ns->ns_bref), + ns_is_client(ns) ? "client" : "server"); + + if (ktime_get_seconds() < ns->ns_next_dump) + return; + + cfs_hash_for_each_nolock(ns->ns_rs_hash, + ldlm_res_hash_dump, + (void *)(unsigned long)level, 0); + spin_lock(&ns->ns_lock); + ns->ns_next_dump = ktime_get_seconds() + 10; + spin_unlock(&ns->ns_lock); +} + +/** + * Print information about all locks in this resource to debug log. + */ +void ldlm_resource_dump(int level, struct ldlm_resource *res) +{ + struct ldlm_lock *lock; + unsigned int granted = 0; + + BUILD_BUG_ON(RES_NAME_SIZE != 4); + + if (!((libcfs_debug | D_ERROR) & level)) + return; + + CDEBUG(level, "--- Resource: "DLDLMRES" (%p) refcount = %d\n", + PLDLMRES(res), res, atomic_read(&res->lr_refcount)); + + if (!list_empty(&res->lr_granted)) { + CDEBUG(level, "Granted locks (in reverse order):\n"); + list_for_each_entry_reverse(lock, &res->lr_granted, + l_res_link) { + LDLM_DEBUG_LIMIT(level, lock, "###"); + if (!(level & D_CANTMASK) && + ++granted > ldlm_dump_granted_max) { + CDEBUG(level, + "only dump %d granted locks to avoid DDOS.\n", + granted); + break; + } + } + } + + if (!list_empty(&res->lr_waiting)) { + CDEBUG(level, "Waiting locks:\n"); + list_for_each_entry(lock, &res->lr_waiting, l_res_link) + LDLM_DEBUG_LIMIT(level, lock, "###"); + } +} +EXPORT_SYMBOL(ldlm_resource_dump); diff --git a/drivers/staging/lustrefsx/lustre/llite/acl.c b/drivers/staging/lustrefsx/lustre/llite/acl.c new file mode 100644 index 0000000000000..bdd6841781409 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/acl.c @@ -0,0 +1,136 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. 
+ * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/llite/acl.c + * + * Author: Peter Braam + * Author: Phil Schwan + * Author: Andreas Dilger + */ + +#include "llite_internal.h" + +struct posix_acl *ll_get_acl( + #ifdef HAVE_ACL_WITH_DENTRY + struct user_namespace *ns, struct dentry *dentry, int type) + #elif defined HAVE_GET_ACL_RCU_ARG + struct inode *inode, int type, bool rcu) + #else + struct inode *inode, int type) + #endif /* HAVE_GET_ACL_RCU_ARG */ +{ +#ifdef HAVE_ACL_WITH_DENTRY + struct inode *inode = dentry->d_inode; +#endif + struct ll_inode_info *lli = ll_i2info(inode); + struct posix_acl *acl = NULL; + ENTRY; + +#ifdef HAVE_GET_ACL_RCU_ARG + if (rcu) + return ERR_PTR(-ECHILD); +#endif + + read_lock(&lli->lli_lock); + /* VFS' acl_permission_check->check_acl will release the refcount */ + acl = posix_acl_dup(lli->lli_posix_acl); + read_unlock(&lli->lli_lock); + + RETURN(acl); +} + +#ifdef HAVE_IOP_SET_ACL +int ll_set_acl(struct user_namespace *mnt_userns, +#ifdef HAVE_ACL_WITH_DENTRY + struct dentry *dentry, +#else + struct inode *inode, +#endif + struct posix_acl *acl, int type) +{ +#ifdef HAVE_ACL_WITH_DENTRY + struct inode *inode = dentry->d_inode; +#endif + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + const char *name = NULL; + char *value = NULL; + size_t value_size = 0; + int rc = 0; + ENTRY; + + switch (type) { + case ACL_TYPE_ACCESS: + name = XATTR_NAME_POSIX_ACL_ACCESS; + if (acl) + rc = posix_acl_update_mode(mnt_userns, inode, + &inode->i_mode, &acl); + break; + + case ACL_TYPE_DEFAULT: + name = XATTR_NAME_POSIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + rc = acl ? -EACCES : 0; + break; + + default: + rc = -EINVAL; + break; + } + if (rc) + return rc; + + if (acl) { + value_size = posix_acl_xattr_size(acl->a_count); + value = kmalloc(value_size, GFP_NOFS); + if (value == NULL) + GOTO(out, rc = -ENOMEM); + + rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size); + if (rc < 0) + GOTO(out_value, rc); + } + + rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode), + value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM, + name, value, value_size, 0, 0, &req); + + ptlrpc_req_finished(req); +out_value: + kfree(value); +out: + if (rc) + forget_cached_acl(inode, type); + else + set_cached_acl(inode, type, acl); + RETURN(rc); +} +#endif /* HAVE_IOP_SET_ACL */ diff --git a/drivers/staging/lustrefsx/lustre/llite/crypto.c b/drivers/staging/lustrefsx/lustre/llite/crypto.c new file mode 100644 index 0000000000000..a832d4d119d6e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/crypto.c @@ -0,0 +1,562 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2019, 2020, Whamcloud. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#include "llite_internal.h" + +#ifdef HAVE_LUSTRE_CRYPTO +#include + +static int ll_get_context(struct inode *inode, void *ctx, size_t len) +{ + int rc; + + /* Get enc context xattr directly instead of going through the VFS, + * as there is no xattr handler for "encryption.". + */ + rc = ll_xattr_list(inode, xattr_for_enc(inode), + XATTR_ENCRYPTION_T, ctx, len, OBD_MD_FLXATTR); + + /* used as encryption unit size */ + if (S_ISREG(inode->i_mode)) + inode->i_blkbits = LUSTRE_ENCRYPTION_BLOCKBITS; + return rc; +} + +int ll_set_encflags(struct inode *inode, void *encctx, __u32 encctxlen, + bool preload) +{ + unsigned int ext_flags; + int rc = 0; + + /* used as encryption unit size */ + if (S_ISREG(inode->i_mode)) + inode->i_blkbits = LUSTRE_ENCRYPTION_BLOCKBITS; + ext_flags = ll_inode_to_ext_flags(inode->i_flags) | LUSTRE_ENCRYPT_FL; + ll_update_inode_flags(inode, ext_flags); + + if (encctx && encctxlen) + rc = ll_xattr_cache_insert(inode, + xattr_for_enc(inode), + encctx, encctxlen); + if (rc) + return rc; + + return preload ? llcrypt_get_encryption_info(inode) : 0; +} + +/* ll_set_context has 2 distinct behaviors, depending on the value of inode + * parameter: + * - inode is NULL: + * passed fs_data is a struct md_op_data *. We need to store enc ctx in + * op_data, so that it will be sent along to the server with the request that + * the caller is preparing, thus saving a setxattr request. + * - inode is not NULL: + * normal case, letting proceed with setxattr operation. + * This use case should only be used when explicitly setting a new encryption + * policy on an existing, empty directory. + */ +static int ll_set_context(struct inode *inode, const void *ctx, size_t len, + void *fs_data) +{ + struct ptlrpc_request *req = NULL; + struct ll_sb_info *sbi; + int rc; + + if (inode == NULL) { + struct md_op_data *op_data = (struct md_op_data *)fs_data; + + if (!op_data) + return -EINVAL; + + OBD_ALLOC(op_data->op_file_encctx, len); + if (op_data->op_file_encctx == NULL) + return -ENOMEM; + op_data->op_file_encctx_size = len; + memcpy(op_data->op_file_encctx, ctx, len); + return 0; + } + + /* Encrypting the root directory is not allowed */ + if (is_root_inode(inode)) + return -EPERM; + + sbi = ll_i2sbi(inode); + /* Send setxattr request to lower layers directly instead of going + * through the VFS, as there is no xattr handler for "encryption.". + */ + rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode), + OBD_MD_FLXATTR, xattr_for_enc(inode), + ctx, len, XATTR_CREATE, ll_i2suppgid(inode), &req); + if (rc) + return rc; + ptlrpc_req_finished(req); + + return ll_set_encflags(inode, (void *)ctx, len, false); +} + +/** + * ll_file_open_encrypt() - overlay to llcrypt_file_open + * @inode: the inode being opened + * @filp: the struct file being set up + * + * This overlay function is necessary to handle encrypted file open without + * the key. We allow this access pattern to applications that know what they + * are doing, by using the specific flag O_FILE_ENC. + * This flag is only compatible with O_DIRECT IOs, to make sure ciphertext + * data is wiped from page cache once IOs are finished. 
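+ *
+ * Return: 0 on success (including a keyless open that uses the
+ * O_FILE_ENC plus O_DIRECT pattern described above), -ENOKEY when the
+ * key is absent and that pattern is not used, or another negative
+ * errno from llcrypt_file_open().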
+ */ +int ll_file_open_encrypt(struct inode *inode, struct file *filp) +{ + int rc; + + rc = llcrypt_file_open(inode, filp); + if (likely(rc != -ENOKEY)) + return rc; + + if (rc == -ENOKEY && + (filp->f_flags & O_FILE_ENC) == O_FILE_ENC && + filp->f_flags & O_DIRECT) + /* allow file open with O_FILE_ENC flag when we have O_DIRECT */ + rc = 0; + + return rc; +} + +void llcrypt_free_ctx(void *encctx, __u32 size) +{ + if (encctx) + OBD_FREE(encctx, size); +} + +#ifdef HAVE_FSCRYPT_DUMMY_CONTEXT_ENABLED +bool ll_sb_has_test_dummy_encryption(struct super_block *sb) +{ + struct ll_sb_info *sbi = s2lsi(sb)->lsi_llsbi; + + return sbi ? + unlikely(test_bit(LL_SBI_TEST_DUMMY_ENCRYPTION, sbi->ll_flags)) : + false; +} + +static bool ll_dummy_context(struct inode *inode) +{ + return ll_sb_has_test_dummy_encryption(inode->i_sb); +} +#else +static const union llcrypt_context * +ll_get_dummy_context(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + + return lsi ? lsi->lsi_dummy_enc_ctx.ctx : NULL; +} + +bool ll_sb_has_test_dummy_encryption(struct super_block *sb) +{ + return ll_get_dummy_context(sb) != NULL; +} +#endif + +bool ll_sbi_has_encrypt(struct ll_sb_info *sbi) +{ + return test_bit(LL_SBI_ENCRYPT, sbi->ll_flags); +} + +void ll_sbi_set_encrypt(struct ll_sb_info *sbi, bool set) +{ + if (set) { + set_bit(LL_SBI_ENCRYPT, sbi->ll_flags); + } else { + clear_bit(LL_SBI_ENCRYPT, sbi->ll_flags); + clear_bit(LL_SBI_TEST_DUMMY_ENCRYPTION, sbi->ll_flags); + } +} + +bool ll_sbi_has_name_encrypt(struct ll_sb_info *sbi) +{ + return test_bit(LL_SBI_ENCRYPT_NAME, sbi->ll_flags); +} + +void ll_sbi_set_name_encrypt(struct ll_sb_info *sbi, bool set) +{ + if (set) + set_bit(LL_SBI_ENCRYPT_NAME, sbi->ll_flags); + else + clear_bit(LL_SBI_ENCRYPT_NAME, sbi->ll_flags); +} + +static bool ll_empty_dir(struct inode *inode) +{ + /* used by llcrypt_ioctl_set_policy(), because a policy can only be set + * on an empty dir. + */ + /* Here we choose to return true, meaning we always call .set_context. + * Then we rely on server side, with mdd_fix_attr() that calls + * mdd_dir_is_empty() when setting encryption flag on directory. + */ + return true; +} + +/** + * ll_setup_filename() - overlay to llcrypt_setup_filename + * @dir: the directory that will be searched + * @iname: the user-provided filename being searched for + * @lookup: 1 if we're allowed to proceed without the key because it's + * ->lookup() or we're finding the dir_entry for deletion; 0 if we cannot + * proceed without the key because we're going to create the dir_entry. + * @fname: the filename information to be filled in + * @fid: fid retrieved from user-provided filename + * + * This overlay function is necessary to properly encode @fname after + * encryption, as it will be sent over the wire. + * This overlay function is also necessary to handle the case of operations + * carried out without the key. Normally llcrypt makes use of digested names in + * that case. Having a digested name works for local file systems that can call + * llcrypt_match_name(), but Lustre server side is not aware of encryption. + * So for keyless @lookup operations on long names, for Lustre we choose to + * present to users the encoded struct ll_digest_filename, instead of a digested + * name. FID and name hash can then easily be extracted and put into the + * requests sent to servers. 
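+ *
+ * Return: 0 on success, or a negative errno (for instance -ENOKEY when
+ * the name cannot be set up without the directory's encryption key).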
+ */ +int ll_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct llcrypt_name *fname, + struct lu_fid *fid) +{ + int digested = 0; + struct qstr dname; + int rc; + + if (fid && IS_ENCRYPTED(dir) && llcrypt_policy_has_filename_enc(dir) && + !llcrypt_has_encryption_key(dir)) { + struct lustre_sb_info *lsi = s2lsi(dir->i_sb); + + if ((!(lsi->lsi_flags & LSI_FILENAME_ENC_B64_OLD_CLI) && + iname->name[0] == LLCRYPT_DIGESTED_CHAR) || + ((lsi->lsi_flags & LSI_FILENAME_ENC_B64_OLD_CLI) && + iname->name[0] == LLCRYPT_DIGESTED_CHAR_OLD)) + digested = 1; + } + + dname.name = iname->name + digested; + dname.len = iname->len - digested; + + if (fid) { + fid->f_seq = 0; + fid->f_oid = 0; + fid->f_ver = 0; + } + rc = llcrypt_setup_filename(dir, &dname, lookup, fname); + if (rc == -ENOENT && lookup) { + if (((is_root_inode(dir) && + iname->len == strlen(dot_fscrypt_name) && + strncmp(iname->name, dot_fscrypt_name, iname->len) == 0) || + (!llcrypt_has_encryption_key(dir) && + unlikely(filename_is_volatile(iname->name, + iname->len, NULL))))) { + /* In case of subdir mount of an encrypted directory, + * we allow lookup of /.fscrypt directory. + */ + /* For purpose of migration or mirroring without enc key + * we allow lookup of volatile file without enc context. + */ + memset(fname, 0, sizeof(struct llcrypt_name)); + fname->disk_name.name = (unsigned char *)iname->name; + fname->disk_name.len = iname->len; + rc = 0; + } else if (!llcrypt_has_encryption_key(dir)) { + rc = -ENOKEY; + } + } + if (rc) + return rc; + + if (digested) { + /* Without the key, for long names user should have struct + * ll_digest_filename representation of the dentry instead of + * the name. So make sure it is valid, return fid and put + * excerpt of cipher text name in disk_name. + */ + struct ll_digest_filename *digest; + + if (fname->crypto_buf.len < sizeof(struct ll_digest_filename)) { + rc = -EINVAL; + goto out_free; + } + digest = (struct ll_digest_filename *)fname->disk_name.name; + *fid = digest->ldf_fid; + if (!fid_is_sane(fid)) { + rc = -EINVAL; + goto out_free; + } + fname->disk_name.name = digest->ldf_excerpt; + fname->disk_name.len = sizeof(digest->ldf_excerpt); + } + if (IS_ENCRYPTED(dir) && + !name_is_dot_or_dotdot(fname->disk_name.name, + fname->disk_name.len)) { + int presented_len = critical_chars(fname->disk_name.name, + fname->disk_name.len); + char *buf; + + buf = kmalloc(presented_len + 1, GFP_NOFS); + if (!buf) { + rc = -ENOMEM; + goto out_free; + } + + if (presented_len == fname->disk_name.len) + memcpy(buf, fname->disk_name.name, presented_len); + else + critical_encode(fname->disk_name.name, + fname->disk_name.len, buf); + buf[presented_len] = '\0'; + kfree(fname->crypto_buf.name); + fname->crypto_buf.name = buf; + fname->crypto_buf.len = presented_len; + fname->disk_name.name = fname->crypto_buf.name; + fname->disk_name.len = fname->crypto_buf.len; + } + + return rc; + +out_free: + llcrypt_free_filename(fname); + return rc; +} + +/** + * ll_fname_disk_to_usr() - overlay to llcrypt_fname_disk_to_usr + * @inode: the inode to convert name + * @hash: major hash for inode + * @minor_hash: minor hash for inode + * @iname: the user-provided filename needing conversion + * @oname: the filename information to be filled in + * @fid: the user-provided fid for filename + * + * The caller must have allocated sufficient memory for the @oname string. + * + * This overlay function is necessary to properly decode @iname before + * decryption, as it comes from the wire. 
+ * This overlay function is also necessary to handle the case of operations + * carried out without the key. Normally llcrypt makes use of digested names in + * that case. Having a digested name works for local file systems that can call + * llcrypt_match_name(), but Lustre server side is not aware of encryption. + * So for keyless @lookup operations on long names, for Lustre we choose to + * present to users the encoded struct ll_digest_filename, instead of a digested + * name. FID and name hash can then easily be extracted and put into the + * requests sent to servers. + */ +int ll_fname_disk_to_usr(struct inode *inode, + u32 hash, u32 minor_hash, + struct llcrypt_str *iname, struct llcrypt_str *oname, + struct lu_fid *fid) +{ + struct llcrypt_str lltr = LLTR_INIT(iname->name, iname->len); + struct ll_digest_filename digest; + int digested = 0; + char *buf = NULL; + int rc; + + if (IS_ENCRYPTED(inode)) { + if (!name_is_dot_or_dotdot(lltr.name, lltr.len) && + strnchr(lltr.name, lltr.len, '=')) { + /* Only proceed to critical decode if + * iname contains espace char '='. + */ + int len = lltr.len; + + buf = kmalloc(len, GFP_NOFS); + if (!buf) + return -ENOMEM; + + len = critical_decode(lltr.name, len, buf); + lltr.name = buf; + lltr.len = len; + } + if (lltr.len > LL_CRYPTO_BLOCK_SIZE * 2 && + !llcrypt_has_encryption_key(inode) && + llcrypt_policy_has_filename_enc(inode)) { + struct lustre_sb_info *lsi = s2lsi(inode->i_sb); + + digested = 1; + /* Without the key for long names, set the dentry name + * to the representing struct ll_digest_filename. It + * will be encoded by llcrypt for display, and will + * enable further lookup requests. + */ + if (!fid) + return -EINVAL; + digest.ldf_fid = *fid; + memcpy(digest.ldf_excerpt, + LLCRYPT_EXTRACT_DIGEST(lltr.name, lltr.len), + sizeof(digest.ldf_excerpt)); + + lltr.name = (char *)&digest; + lltr.len = sizeof(digest); + + if (!(lsi->lsi_flags & LSI_FILENAME_ENC_B64_OLD_CLI)) + oname->name[0] = LLCRYPT_DIGESTED_CHAR; + else + oname->name[0] = LLCRYPT_DIGESTED_CHAR_OLD; + oname->name = oname->name + 1; + oname->len--; + } + } + + rc = llcrypt_fname_disk_to_usr(inode, hash, minor_hash, &lltr, oname); + + kfree(buf); + oname->name = oname->name - digested; + oname->len = oname->len + digested; + + return rc; +} + +/* Copied from llcrypt_d_revalidate, as it is not exported */ +/* + * Validate dentries in encrypted directories to make sure we aren't potentially + * caching stale dentries after a key has been added. + */ +int ll_revalidate_d_crypto(struct dentry *dentry, unsigned int flags) +{ + struct dentry *dir; + int err; + int valid; + + /* + * Plaintext names are always valid, since llcrypt doesn't support + * reverting to ciphertext names without evicting the directory's inode + * -- which implies eviction of the dentries in the directory. + */ + if (!llcrypt_is_nokey_name(dentry)) + return 1; + + /* + * Ciphertext name; valid if the directory's key is still unavailable. + * + * Although llcrypt forbids rename() on ciphertext names, we still must + * use dget_parent() here rather than use ->d_parent directly. That's + * because a corrupted fs image may contain directory hard links, which + * the VFS handles by moving the directory's dentry tree in the dcache + * each time ->lookup() finds the directory and it already has a dentry + * elsewhere. Thus ->d_parent can be changing, and we must safely grab + * a reference to some ->d_parent to prevent it from being freed. 
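+ *
+ * Under LOOKUP_RCU we can neither grab that parent reference nor query
+ * the key state, so the function bails out with -ECHILD below and the
+ * VFS retries the lookup in ref-walk mode.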
+ */ + + if (flags & LOOKUP_RCU) + return -ECHILD; + + dir = dget_parent(dentry); + err = llcrypt_get_encryption_info(d_inode(dir)); + valid = !llcrypt_has_encryption_key(d_inode(dir)); + dput(dir); + + if (err < 0) + return err; + + return valid; +} + +const struct llcrypt_operations lustre_cryptops = { + .key_prefix = "lustre:", + .get_context = ll_get_context, + .set_context = ll_set_context, +#ifdef HAVE_FSCRYPT_DUMMY_CONTEXT_ENABLED + .dummy_context = ll_dummy_context, +#else + .get_dummy_context = ll_get_dummy_context, +#endif + .empty_dir = ll_empty_dir, + .max_namelen = NAME_MAX, +}; +#else /* !HAVE_LUSTRE_CRYPTO */ +int ll_set_encflags(struct inode *inode, void *encctx, __u32 encctxlen, + bool preload) +{ + return 0; +} + +int ll_file_open_encrypt(struct inode *inode, struct file *filp) +{ + return llcrypt_file_open(inode, filp); +} + +void llcrypt_free_ctx(void *encctx, __u32 size) +{ +} + +bool ll_sb_has_test_dummy_encryption(struct super_block *sb) +{ + return false; +} + +bool ll_sbi_has_encrypt(struct ll_sb_info *sbi) +{ + return false; +} + +void ll_sbi_set_encrypt(struct ll_sb_info *sbi, bool set) +{ +} + +bool ll_sbi_has_name_encrypt(struct ll_sb_info *sbi) +{ + return false; +} + +void ll_sbi_set_name_encrypt(struct ll_sb_info *sbi, bool set) +{ +} + +int ll_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct llcrypt_name *fname, + struct lu_fid *fid) +{ + if (fid) { + fid->f_seq = 0; + fid->f_oid = 0; + fid->f_ver = 0; + } + + return llcrypt_setup_filename(dir, iname, lookup, fname); +} + +int ll_fname_disk_to_usr(struct inode *inode, + u32 hash, u32 minor_hash, + struct llcrypt_str *iname, struct llcrypt_str *oname, + struct lu_fid *fid) +{ + return llcrypt_fname_disk_to_usr(inode, hash, minor_hash, iname, oname); +} + +int ll_revalidate_d_crypto(struct dentry *dentry, unsigned int flags) +{ + return 1; +} +#endif + diff --git a/drivers/staging/lustrefsx/lustre/llite/dcache.c b/drivers/staging/lustrefsx/lustre/llite/dcache.c new file mode 100644 index 0000000000000..b736bfc948ede --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/dcache.c @@ -0,0 +1,388 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include + +#include "llite_internal.h" + +static void free_dentry_data(struct rcu_head *head) +{ + struct ll_dentry_data *lld; + + lld = container_of(head, struct ll_dentry_data, lld_rcu_head); + OBD_FREE_PTR(lld); +} + +/* should NOT be called with the dcache lock, see fs/dcache.c */ +static void ll_release(struct dentry *de) +{ + struct ll_dentry_data *lld; + ENTRY; + LASSERT(de != NULL); + lld = ll_d2d(de); + if (lld == NULL) /* NFS copies the de->d_op methods (bug 4655) */ + RETURN_EXIT; + + de->d_fsdata = NULL; + call_rcu(&lld->lld_rcu_head, free_dentry_data); + + EXIT; +} + +/* Compare if two dentries are the same. Don't match if the existing dentry + * is marked invalid. Returns 1 if different, 0 if the same. + * + * This avoids a race where ll_lookup_it() instantiates a dentry, but we get + * an AST before calling d_revalidate_it(). The dentry still exists (marked + * INVALID) so d_lookup() matches it, but we have no lock on it (so + * lock_match() fails) and we spin around real_lookup(). + * + * This race doesn't apply to lookups in d_alloc_parallel(), and for + * those we want to ensure that only one dentry with a given name is + * in ll_lookup_nd() at a time. So allow invalid dentries to match + * while d_in_lookup(). We will be called again when the lookup + * completes, and can give a different answer then. + */ +#if defined(HAVE_D_COMPARE_5ARGS) +static int ll_dcompare(const struct dentry *parent, const struct dentry *dentry, + unsigned int len, const char *str, + const struct qstr *name) +#elif defined(HAVE_D_COMPARE_4ARGS) +static int ll_dcompare(const struct dentry *dentry, unsigned int len, + const char *str, const struct qstr *name) +#endif +{ + ENTRY; + + if (len != name->len) + RETURN(1); + + if (memcmp(str, name->name, len)) + RETURN(1); + + CDEBUG(D_DENTRY, "found name %.*s(%p) flags %#x refc %d\n", + name->len, name->name, dentry, dentry->d_flags, + ll_d_count(dentry)); + + /* mountpoint is always valid */ + if (d_mountpoint((struct dentry *)dentry)) + RETURN(0); + + /* ensure exclusion against parallel lookup of the same name */ + if (d_in_lookup((struct dentry *)dentry)) + return 0; + + if (d_lustre_invalid(dentry)) + RETURN(1); + + RETURN(0); +} + +/** + * Called when last reference to a dentry is dropped and dcache wants to know + * whether or not it should cache it: + * - return 1 to delete the dentry immediately + * - return 0 to cache the dentry + * Should NOT be called with the dcache lock, see fs/dcache.c + */ +static int ll_ddelete(const struct dentry *de) +{ + ENTRY; + LASSERT(de); + + CDEBUG(D_DENTRY, "%s dentry %pd (%p, parent %p, inode %p) %s%s\n", + d_lustre_invalid(de) ? "deleting" : "keeping", + de, de, de->d_parent, de->d_inode, + d_unhashed((struct dentry *)de) ? "" : "hashed,", + list_empty(&de->d_subdirs) ? "" : "subdirs"); + + /* kernel >= 2.6.38 last refcount is decreased after this function. 
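+	 * The dentry therefore still holds exactly one reference at this
+	 * point, which the LASSERT below verifies.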
*/ + LASSERT(ll_d_count(de) == 1); + + if (d_lustre_invalid(de)) + RETURN(1); + RETURN(0); +} + +#ifdef HAVE_D_INIT +static int ll_d_init(struct dentry *de) +{ + struct ll_dentry_data *lld; + + OBD_ALLOC_PTR(lld); + lld->lld_invalid = 1; + de->d_fsdata = lld; + return 0; +} +#else /* !HAVE_D_INIT */ + +bool ll_d_setup(struct dentry *de, bool do_put) +{ + struct ll_dentry_data *lld; + bool success = true; + + if (de->d_fsdata) + return success; + + OBD_ALLOC_PTR(lld); + if (likely(lld)) { + spin_lock(&de->d_lock); + /* Since the first d_fsdata test was not + * done under the spinlock it could have + * changed by time the memory is allocated. + */ + if (!de->d_fsdata) { + lld->lld_invalid = 1; + de->d_fsdata = lld; + } + spin_unlock(&de->d_lock); + /* See if we lost the race to set d_fsdata. */ + if (de->d_fsdata != lld) + OBD_FREE_PTR(lld); + } else { + success = false; + if (do_put) + dput(de); + } + + return success; +} +#endif /* !HAVE_D_INIT */ + +void ll_intent_drop_lock(struct lookup_intent *it) +{ + if (it->it_op && it->it_lock_mode) { + struct lustre_handle handle; + + handle.cookie = it->it_lock_handle; + + CDEBUG(D_DLMTRACE, "releasing lock with cookie %#llx from it %p\n", + handle.cookie, it); + ldlm_lock_decref(&handle, it->it_lock_mode); + + /* bug 494: intent_release may be called multiple times, from + * this thread and we don't want to double-decref this lock */ + it->it_lock_mode = 0; + if (it->it_remote_lock_mode != 0) { + handle.cookie = it->it_remote_lock_handle; + + CDEBUG(D_DLMTRACE, + "releasing remote lock with cookie %#llx from it %p\n", + handle.cookie, it); + ldlm_lock_decref(&handle, + it->it_remote_lock_mode); + it->it_remote_lock_mode = 0; + } + } +} + +void ll_intent_release(struct lookup_intent *it) +{ + ENTRY; + + CDEBUG(D_INFO, "intent %p released\n", it); + ll_intent_drop_lock(it); + /* We are still holding extra reference on a request, need to free it */ + if (it_disposition(it, DISP_ENQ_OPEN_REF)) + ptlrpc_req_finished(it->it_request); /* ll_file_open */ + + if (it_disposition(it, DISP_ENQ_CREATE_REF)) /* create rec */ + ptlrpc_req_finished(it->it_request); + + it->it_disposition = 0; + it->it_request = NULL; + EXIT; +} + +/* mark aliases invalid and prune unused aliases */ +void ll_prune_aliases(struct inode *inode) +{ + struct dentry *dentry; + ENTRY; + + LASSERT(inode != NULL); + + CDEBUG(D_INODE, "marking dentries for inode "DFID"(%p) invalid\n", + PFID(ll_inode2fid(inode)), inode); + + spin_lock(&inode->i_lock); + hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) + d_lustre_invalidate(dentry); + spin_unlock(&inode->i_lock); + + d_prune_aliases(inode); + + EXIT; +} + +int ll_revalidate_it_finish(struct ptlrpc_request *request, + struct lookup_intent *it, + struct dentry *de) +{ + struct inode *inode = de->d_inode; + __u64 bits = 0; + int rc = 0; + + ENTRY; + + if (!request) + RETURN(0); + + if (it_disposition(it, DISP_LOOKUP_NEG)) + RETURN(-ENOENT); + + rc = ll_prep_inode(&inode, &request->rq_pill, NULL, it); + if (rc) + RETURN(rc); + + ll_set_lock_data(ll_i2sbi(inode)->ll_md_exp, inode, it, + &bits); + if (bits & MDS_INODELOCK_LOOKUP) { + ll_update_dir_depth(de->d_parent->d_inode, inode); + if (!ll_d_setup(de, true)) + RETURN(-ENOMEM); + d_lustre_revalidate(de); + } + + RETURN(rc); +} + +void ll_lookup_finish_locks(struct lookup_intent *it, struct dentry *dentry) +{ + LASSERT(it != NULL); + LASSERT(dentry != NULL); + + if (it->it_lock_mode && dentry->d_inode != NULL) { + struct inode *inode = dentry->d_inode; + struct ll_sb_info *sbi = 
ll_i2sbi(inode); + + CDEBUG(D_DLMTRACE, "setting l_data to inode "DFID"(%p)\n", + PFID(ll_inode2fid(inode)), inode); + ll_set_lock_data(sbi->ll_md_exp, inode, it, NULL); + } + + /* drop lookup or getattr locks immediately */ + if (it->it_op == IT_LOOKUP || it->it_op == IT_GETATTR) + ll_intent_drop_lock(it); +} + +static int ll_revalidate_dentry(struct dentry *dentry, + unsigned int lookup_flags) +{ + struct inode *dir = dentry->d_parent->d_inode; + struct ll_dentry_data *lld = dentry->d_fsdata; + struct ll_sb_info *sbi; + int rc; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%s, flags=%u\n", + dentry->d_name.name, lookup_flags); + + rc = ll_revalidate_d_crypto(dentry, lookup_flags); + if (rc != 1) + return rc; + + /* If this is intermediate component path lookup and we were able to get + * to this dentry, then its lock has not been revoked and the + * path component is valid. */ + if (lookup_flags & (LOOKUP_CONTINUE | LOOKUP_PARENT)) + return 1; + + /* Symlink - always valid as long as the dentry was found */ + /* only special case is to prevent ELOOP error from VFS during open + * of a foreign symlink file/dir with O_NOFOLLOW, like it happens for + * real symlinks. This will allow to open foreign symlink file/dir + * for get[dir]stripe/unlock ioctl()s. + */ + if (d_is_symlink(dentry)) { + if (!S_ISLNK(dentry->d_inode->i_mode) && + !(lookup_flags & LOOKUP_FOLLOW)) + return 0; + else + return 1; + } + + /* + * VFS warns us that this is the second go around and previous + * operation failed (most likely open|creat), so this time + * we better talk to the server via the lookup path by name, + * not by fid. + */ + if (lookup_flags & LOOKUP_REVAL) + return 0; + + if (lookup_flags & LOOKUP_RCU) + return -ECHILD; + + /* + * To support metadata lazy load, we want to bypass negative lookup cache + * on the client. A negative dentry cache is a dentry node that does not + * have an inode associated with it. In these cases, return 0 here + * to force a lookup call to the server. + */ + sbi = ll_s2sbi(dentry->d_sb); + if (d_is_negative(dentry) && + sbi->ll_neg_dentry_timeout != OBD_NEG_CACHE_TIMEOUT_DEFAULT_SECS) { + LASSERT(lld != NULL); + if (!lld->lld_neg_cache_timeout) + lld->lld_neg_cache_timeout = jiffies + sbi->ll_neg_dentry_timeout * HZ; + + if (time_after(jiffies, lld->lld_neg_cache_timeout)) { + CDEBUG(D_VFSTRACE, + "negative dentry past timeout - flags: %u\n", lookup_flags); + return 0; + } + CDEBUG(D_VFSTRACE, + "negative dentry within timeout - flags: %u\n", lookup_flags); + } + + if (dentry_may_statahead(dir, dentry)) + ll_revalidate_statahead(dir, &dentry, dentry->d_inode == NULL); + + return 1; +} + +const struct dentry_operations ll_d_ops = { +#ifdef HAVE_D_INIT + .d_init = ll_d_init, +#endif + .d_revalidate = ll_revalidate_dentry, + .d_release = ll_release, + .d_delete = ll_ddelete, + .d_compare = ll_dcompare, +}; diff --git a/drivers/staging/lustrefsx/lustre/llite/dir.c b/drivers/staging/lustrefsx/lustre/llite/dir.c new file mode 100644 index 0000000000000..12125350ae7b0 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/dir.c @@ -0,0 +1,2616 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/llite/dir.c + * + * Directory code for lustre client. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include // for wait_on_buffer +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "llite_internal.h" + +static int ll_check_and_trigger_restore(struct inode *dir) +{ + struct ll_sb_info *sbi = ll_i2sbi(dir); + const int max_retry = atomic_read(&sbi->ll_dir_restore_max_retry_count); + int retry_count = 0; + u32 hus_states; + __u32 gen = 0; + int rc; + + /* Skip restore if server does not support or if disabled */ + if (!exp_mdll(sbi->ll_md_exp) || exp_bypass_mdll(sbi->ll_md_exp)) + return 0; + + /* + * TODO-MDLL: + * use API that does a cached read instead of + * going to the mdt for getting the hsm state. + * Tracked with Simba-21644 + */ +try_again: + rc = ll_get_hsm_state(dir, &hus_states); + if (rc == 0 && (hus_states & HS_RELEASED)) { + CDEBUG(D_HSM, + "MDLL Calling ll_layout_restore for dir "DFID" retry: %d" + "\n", PFID(ll_inode2fid(dir)), retry_count); + rc = ll_layout_restore(dir, 0, OBD_OBJECT_EOF); + if (rc) { + CERROR("MDLL ll_layout_restore ("DFID") error rc: %d\n", + PFID(ll_inode2fid(dir)), rc); + rc = -EAGAIN; + if (max_retry == 0) + goto out_exit; + } else { + CDEBUG(D_HSM, "MDLL Restore triggered for dir "DFID"\n", + PFID(ll_inode2fid(dir))); + ll_layout_refresh(dir, &gen); + CDEBUG(D_HSM, "MDLL Restore done for dir "DFID"\n", + PFID(ll_inode2fid(dir))); + } + /* If the max_retry is set to 0, then the behavior would be + * without a retry. There wont be any check for the hsm state + * after the completed restore. This case would be similar to + * the behaviour without this retry changes. The default + * value of the max_retry would be 1. + * A value of -1 would retry indefinitely. + */ + /* In case of an mdt restart, the ll_layout_refresh would + * return back only after the mdt has restarted and the + * existing network connection gets a reset. When the retry + * happens, the mdt would be up and running. + * Ideally the directory restore would be done with a single + * retry if the mdt does not crash/restart again. 
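+		 *
+		 * Concretely (a worked reading of the branches below, not
+		 * new behaviour): with the default max_retry = 1 a failed
+		 * restore is retried once and then gives up with -EAGAIN
+		 * if the directory is still HS_RELEASED; with
+		 * max_retry = 0 the first failure exits immediately via
+		 * out_exit; with max_retry = -1 control keeps jumping
+		 * back to try_again until the HS_RELEASED flag clears.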
+		 */
+		if ((max_retry < 0) ||
+		    (max_retry >= 0 && retry_count < max_retry)) {
+			retry_count++;
+			goto try_again;
+		} else if (max_retry > 0 && retry_count >= max_retry) {
+			rc = ll_get_hsm_state(dir, &hus_states);
+			if (rc == 0 && (hus_states & HS_RELEASED)) {
+				CDEBUG(D_HSM,
+				       "MDLL reached max retry %d for ("DFID") hsm_state: %d\n",
+				       retry_count, PFID(ll_inode2fid(dir)),
+				       hus_states);
+				rc = -EAGAIN;
+				goto out_exit;
+			}
+		}
+	}
+	if (rc != 0) {
+		CDEBUG(D_HSM,
+		       "MDLL error calling ll_get_hsm_state for dir "DFID" rc: %d\n",
+		       PFID(ll_inode2fid(dir)), rc);
+		rc = -EAGAIN;
+	}
+
+out_exit:
+	return rc;
+}
+
+/*
+ * (new) readdir implementation overview.
+ *
+ * The original Lustre readdir implementation cached an exact copy of raw
+ * directory pages on the client. These pages were indexed in the client
+ * page cache by logical offset in the directory file. This design, while
+ * very simple and intuitive, had some inherent problems:
+ *
+ * . it implies that the byte offset to a directory entry serves as a
+ *   telldir(3)/seekdir(3) cookie, but that offset is not stable: in
+ *   ext3/htree directory entries may move due to splits, and more
+ *   importantly,
+ *
+ * . it is incompatible with the design of split directories for cmd3,
+ *   which assumes that names are distributed across nodes based on their
+ *   hash, and so readdir should be done in hash order.
+ *
+ * The new readdir implementation does readdir in hash order, and uses the
+ * hash of a file name as a telldir/seekdir cookie. This led to a number of
+ * complications:
+ *
+ * . hash is not unique, so it cannot be used to index cached directory
+ *   pages on the client (note that it requires a whole pageful of hash
+ *   collided entries to cause two pages to have identical hashes);
+ *
+ * . hash is not unique, so it cannot, strictly speaking, be used as an
+ *   entry cookie. ext3/htree has the same problem and the lustre
+ *   implementation mimics their solution: seekdir(hash) positions the
+ *   directory at the first entry with the given hash.
+ *
+ * Client side.
+ *
+ * 0. caching
+ *
+ * The client caches directory pages using the hash of the first entry as
+ * an index. As noted above hash is not unique, so this solution doesn't
+ * work as is: special processing is needed for "page hash chains" (i.e.,
+ * sequences of pages filled with entries all having the same hash value).
+ *
+ * First, such chains have to be detected. To this end, the server returns
+ * to the client the hash of the first entry on the page next to the one
+ * returned. When the client detects that this hash is the same as the
+ * hash of the first entry on the returned page, a page hash collision has
+ * to be handled. Pages in the hash chain, except the first one, are
+ * termed "overflow pages".
+ *
+ * The proposed (unimplemented) solution to the index uniqueness problem
+ * is to not cache overflow pages. Instead, when a page hash collision is
+ * detected, all overflow pages from the emerging chain should be
+ * immediately requested from the server and placed in a special data
+ * structure. This data structure can be used by ll_readdir() to
+ * process entries from overflow pages. When the readdir invocation
+ * finishes, overflow pages are discarded. If the page hash collision
+ * chain wasn't completely processed, the next call to readdir will again
+ * detect the page hash collision, again read overflow pages in, process
+ * the next portion of entries and again discard the pages. This is not as
+ * wasteful as it looks, because, given a reasonable hash, page hash
+ * collisions are extremely rare.
+ *
+ * 1.
directory positioning + * + * When seekdir(hash) is called, original + * + * + * + * + * + * + * + * + * Server. + * + * identification of and access to overflow pages + * + * page format + * + * Page in MDS_READPAGE RPC is packed in LU_PAGE_SIZE, and each page contains + * a header lu_dirpage which describes the start/end hash, and whether this + * page is empty (contains no dir entry) or hash collide with next page. + * After client receives reply, several pages will be integrated into dir page + * in PAGE_SIZE (if PAGE_SIZE greater than LU_PAGE_SIZE), and the + * lu_dirpage for this integrated page will be adjusted. See + * mdc_adjust_dirpages(). + * + */ +struct page *ll_get_dir_page(struct inode *dir, struct md_op_data *op_data, + __u64 offset, int *partial_readdir_rc) +{ + struct md_readdir_info mrinfo = { + .mr_blocking_ast = ll_md_blocking_ast }; + struct page *page; + int rc; + + rc = ll_check_and_trigger_restore(dir); + if (rc != 0) + return ERR_PTR(rc); + + rc = md_read_page(ll_i2mdexp(dir), op_data, &mrinfo, offset, &page); + if (rc != 0) + return ERR_PTR(rc); + + if (partial_readdir_rc && mrinfo.mr_partial_readdir_rc) + *partial_readdir_rc = mrinfo.mr_partial_readdir_rc; + + return page; +} + +void ll_release_page(struct inode *inode, struct page *page, + bool remove) +{ + kunmap(page); + + /* Always remove the page for striped dir, because the page is + * built from temporarily in LMV layer */ + if (inode && ll_dir_striped(inode)) { + __free_page(page); + return; + } + + if (remove) { + lock_page(page); + if (likely(page->mapping != NULL)) + cfs_delete_from_page_cache(page); + unlock_page(page); + } + put_page(page); +} + +#ifdef HAVE_DIR_CONTEXT +int ll_dir_read(struct inode *inode, __u64 *ppos, struct md_op_data *op_data, + struct dir_context *ctx, int *partial_readdir_rc) +{ +#else +int ll_dir_read(struct inode *inode, __u64 *ppos, struct md_op_data *op_data, + void *cookie, filldir_t filldir, int *partial_readdir_rc) +{ +#endif + struct ll_sb_info *sbi = ll_i2sbi(inode); + __u64 pos = *ppos; + bool is_api32 = ll_need_32bit_api(sbi); + bool is_hash64 = test_bit(LL_SBI_64BIT_HASH, sbi->ll_flags); + struct page *page; + bool done = false; + struct llcrypt_str lltr = LLTR_INIT(NULL, 0); + int rc = 0; + ENTRY; + + if (IS_ENCRYPTED(inode)) { + rc = llcrypt_fname_alloc_buffer(inode, NAME_MAX, &lltr); + if (rc < 0) + RETURN(rc); + } + + page = ll_get_dir_page(inode, op_data, pos, partial_readdir_rc); + + while (rc == 0 && !done) { + struct lu_dirpage *dp; + struct lu_dirent *ent; + __u64 hash; + __u64 next; + + if (IS_ERR(page)) { + rc = PTR_ERR(page); + break; + } + + hash = MDS_DIR_END_OFF; + dp = page_address(page); + for (ent = lu_dirent_start(dp); ent != NULL && !done; + ent = lu_dirent_next(ent)) { + __u16 type; + int namelen; + struct lu_fid fid; + __u64 lhash; + __u64 ino; + + hash = le64_to_cpu(ent->lde_hash); + if (hash < pos) /* Skip until we find target hash */ + continue; + + namelen = le16_to_cpu(ent->lde_namelen); + if (namelen == 0) /* Skip dummy record */ + continue; + + if (is_api32 && is_hash64) + lhash = hash >> 32; + else + lhash = hash; + fid_le_to_cpu(&fid, &ent->lde_fid); + ino = cl_fid_build_ino(&fid, is_api32); + type = S_DT(lu_dirent_type_get(ent)); + /* For ll_nfs_get_name_filldir(), it will try to access + * 'ent' through 'lde_name', so the parameter 'name' + * for 'filldir()' must be part of the 'ent'. 
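+			 * To make the constraint concrete (an
+			 * illustrative anti-example, not code from this
+			 * patch): emitting a stack copy such as
+			 *
+			 *	char buf[NAME_MAX];
+			 *	memcpy(buf, ent->lde_name, namelen);
+			 *	dir_emit(ctx, buf, namelen, ino, type);
+			 *
+			 * would break ll_nfs_get_name_filldir(), which
+			 * recovers the enclosing lu_dirent from the name
+			 * pointer handed to it; the name must point into
+			 * the mapped directory page itself.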
*/ +#ifdef HAVE_DIR_CONTEXT + ctx->pos = lhash; + if (!IS_ENCRYPTED(inode)) { + done = !dir_emit(ctx, ent->lde_name, namelen, + ino, type); + } else { + /* Directory is encrypted */ + int save_len = lltr.len; + struct llcrypt_str de_name = + LLTR_INIT(ent->lde_name, namelen); + + rc = ll_fname_disk_to_usr(inode, 0, 0, &de_name, + &lltr, &fid); + de_name = lltr; + lltr.len = save_len; + if (rc) { + done = 1; + break; + } + done = !dir_emit(ctx, de_name.name, de_name.len, + ino, type); + } +#else + /* HAVE_DIR_CONTEXT is defined from kernel 3.11, whereas + * IS_ENCRYPTED is brought by kernel 4.14. + * So there is no need to handle encryption case here. + */ + done = filldir(cookie, ent->lde_name, namelen, lhash, + ino, type); +#endif + } + + if (done) { + pos = hash; + ll_release_page(inode, page, false); + break; + } + + next = le64_to_cpu(dp->ldp_hash_end); + pos = next; + if (pos == MDS_DIR_END_OFF) { + /* + * End of directory reached. + */ + done = 1; + ll_release_page(inode, page, false); + } else { + /* + * Normal case: continue to the next + * page. + */ + ll_release_page(inode, page, + le32_to_cpu(dp->ldp_flags) & + LDF_COLLIDE); + next = pos; + page = ll_get_dir_page(inode, op_data, pos, + partial_readdir_rc); + } + } +#ifdef HAVE_DIR_CONTEXT + ctx->pos = pos; +#else + *ppos = pos; +#endif + llcrypt_fname_free_buffer(&lltr); + RETURN(rc); +} + +#ifdef HAVE_DIR_CONTEXT +static int ll_iterate(struct file *filp, struct dir_context *ctx) +#else +static int ll_readdir(struct file *filp, void *cookie, filldir_t filldir) +#endif +{ + struct inode *inode = file_inode(filp); + struct ll_file_data *lfd = filp->private_data; + struct ll_sb_info *sbi = ll_i2sbi(inode); + bool hash64 = test_bit(LL_SBI_64BIT_HASH, sbi->ll_flags); + int api32 = ll_need_32bit_api(sbi); + struct md_op_data *op_data; + struct lu_fid pfid = { 0 }; + ktime_t kstart = ktime_get(); + /* result of possible partial readdir */ + int partial_readdir_rc = 0; + __u64 pos; + int rc; + + ENTRY; + + LASSERT(lfd != NULL); + pos = lfd->lfd_pos; + + CDEBUG(D_VFSTRACE, + "VFS Op:inode="DFID"(%p) pos/size%lu/%llu 32bit_api %d\n", + PFID(ll_inode2fid(inode)), + inode, (unsigned long)pos, i_size_read(inode), api32); + + if (IS_ENCRYPTED(inode)) { + rc = llcrypt_get_encryption_info(inode); + if (rc && rc != -ENOKEY) + GOTO(out, rc); + } + + if (pos == MDS_DIR_END_OFF) + /* + * end-of-file. + */ + GOTO(out, rc = 0); + + if (unlikely(ll_dir_striped(inode))) { + /* + * This is only needed for striped dir to fill .., + * see lmv_read_page() + */ + if (file_dentry(filp)->d_parent != NULL && + file_dentry(filp)->d_parent->d_inode != NULL) { + __u64 ibits = MDS_INODELOCK_LOOKUP; + struct inode *parent = + file_dentry(filp)->d_parent->d_inode; + + if (ll_have_md_lock(parent, &ibits, LCK_MINMODE)) + pfid = *ll_inode2fid(parent); + } + + /* If it can not find in cache, do lookup .. 
on the master + * object */ + if (fid_is_zero(&pfid)) { + rc = ll_dir_get_parent_fid(inode, &pfid); + if (rc != 0) + RETURN(rc); + } + } + + op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0, + LUSTRE_OPC_ANY, inode); + if (IS_ERR(op_data)) + GOTO(out, rc = PTR_ERR(op_data)); + + /* foreign dirs are browsed out of Lustre */ + if (unlikely(op_data->op_mea1 != NULL && + op_data->op_mea1->lsm_md_magic == LMV_MAGIC_FOREIGN)) { + ll_finish_md_op_data(op_data); + RETURN(-ENODATA); + } + + op_data->op_fid3 = pfid; + +#ifdef HAVE_DIR_CONTEXT + ctx->pos = pos; + rc = ll_dir_read(inode, &pos, op_data, ctx, &partial_readdir_rc); + pos = ctx->pos; +#else + rc = ll_dir_read(inode, &pos, op_data, cookie, filldir, + &partial_readdir_rc); +#endif + lfd->lfd_pos = pos; + if (!lfd->fd_partial_readdir_rc) + lfd->fd_partial_readdir_rc = partial_readdir_rc; + + if (pos == MDS_DIR_END_OFF) { + if (api32) + pos = LL_DIR_END_OFF_32BIT; + else + pos = LL_DIR_END_OFF; + } else { + if (api32 && hash64) + pos = pos >> 32; + } +#ifdef HAVE_DIR_CONTEXT + ctx->pos = pos; +#else + filp->f_pos = pos; +#endif + ll_finish_md_op_data(op_data); + +out: + if (!rc) + ll_stats_ops_tally(sbi, LPROC_LL_READDIR, + ktime_us_delta(ktime_get(), kstart)); + + RETURN(rc); +} + +/** + * Create striped directory with specified stripe(@lump) + * + * \param[in] dparent the parent of the directory. + * \param[in] lump the specified stripes. + * \param[in] dirname the name of the directory. + * \param[in] mode the specified mode of the directory. + * + * \retval =0 if striped directory is being created successfully. + * <0 if the creation is failed. + */ +static int ll_dir_setdirstripe(struct dentry *dparent, struct lmv_user_md *lump, + size_t len, const char *dirname, umode_t mode, + bool createonly) +{ + struct inode *parent = dparent->d_inode; + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + struct ll_sb_info *sbi = ll_i2sbi(parent); + struct inode *inode = NULL; + struct dentry dentry = { + .d_parent = dparent, + .d_name = { + .name = dirname, + .len = strlen(dirname), + .hash = ll_full_name_hash(dparent, dirname, + strlen(dirname)), + }, + .d_sb = dparent->d_sb, + }; + bool encrypt = false; + int hash_flags; + int err; + + ENTRY; + if (unlikely(!lmv_user_magic_supported(lump->lum_magic))) + RETURN(-EINVAL); + + if (lump->lum_magic != LMV_MAGIC_FOREIGN) { + CDEBUG(D_VFSTRACE, + "VFS Op:inode="DFID"(%p) name %s stripe_offset %d, stripe_count: %u\n", + PFID(ll_inode2fid(parent)), parent, dirname, + (int)lump->lum_stripe_offset, lump->lum_stripe_count); + } else { + struct lmv_foreign_md *lfm = (struct lmv_foreign_md *)lump; + + CDEBUG(D_VFSTRACE, + "VFS Op:inode="DFID"(%p) name %s foreign, length %u, value '%.*s'\n", + PFID(ll_inode2fid(parent)), parent, dirname, + lfm->lfm_length, lfm->lfm_length, lfm->lfm_value); + } + + if (lump->lum_stripe_count > 1 && + !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_DIR_STRIPE)) + RETURN(-EINVAL); + + if (IS_DEADDIR(parent) && + !OBD_FAIL_CHECK(OBD_FAIL_LLITE_NO_CHECK_DEAD)) + RETURN(-ENOENT); + + /* MDS < 2.14 doesn't support 'crush' hash type, and cannot handle + * unknown hash if client doesn't set a valid one. switch to fnv_1a_64. 
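+	 *
+	 * The expression below only swaps the type while preserving any
+	 * flag bits: since "type" is lum_hash_type masked down to
+	 * LMV_HASH_TYPE_MASK, "lum_hash_type ^ type" zeroes exactly the
+	 * type bits, and OR-ing in LMV_HASH_TYPE_FNV_1A_64 installs the
+	 * fallback, e.g. (flags | crush) becomes (flags | fnv_1a_64).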
+ */ + if (!(exp_connect_flags2(sbi->ll_md_exp) & OBD_CONNECT2_CRUSH)) { + enum lmv_hash_type type = lump->lum_hash_type & + LMV_HASH_TYPE_MASK; + + if (type >= LMV_HASH_TYPE_CRUSH || + type == LMV_HASH_TYPE_UNKNOWN) + lump->lum_hash_type = (lump->lum_hash_type ^ type) | + LMV_HASH_TYPE_FNV_1A_64; + } + + hash_flags = lump->lum_hash_type & ~LMV_HASH_TYPE_MASK; + if (hash_flags & ~LMV_HASH_FLAG_KNOWN) + RETURN(-EINVAL); + + if (unlikely(!lmv_user_magic_supported(cpu_to_le32(lump->lum_magic)))) + lustre_swab_lmv_user_md(lump); + + if (!IS_POSIXACL(parent) || !exp_connect_umask(ll_i2mdexp(parent))) + mode &= ~current_umask(); + mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR; + op_data = ll_prep_md_op_data(NULL, parent, NULL, dirname, + strlen(dirname), mode, LUSTRE_OPC_MKDIR, + lump); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_dir_depth = ll_i2info(parent)->lli_inherit_depth ?: + ll_i2info(parent)->lli_dir_depth; + + if (ll_sbi_has_encrypt(sbi) && + (IS_ENCRYPTED(parent) || + unlikely(ll_sb_has_test_dummy_encryption(parent->i_sb)))) { + err = llcrypt_get_encryption_info(parent); + if (err) + GOTO(out_op_data, err); + if (!llcrypt_has_encryption_key(parent)) + GOTO(out_op_data, err = -ENOKEY); + encrypt = true; + } + + if (test_bit(LL_SBI_FILE_SECCTX, sbi->ll_flags)) { + /* selinux_dentry_init_security() uses dentry->d_parent and name + * to determine the security context for the file. So our fake + * dentry should be real enough for this purpose. */ + err = ll_dentry_init_security(&dentry, mode, &dentry.d_name, + &op_data->op_file_secctx_name, + &op_data->op_file_secctx_name_size, + &op_data->op_file_secctx, + &op_data->op_file_secctx_size, + &op_data->op_file_secctx_slot); + if (err < 0) + GOTO(out_op_data, err); + } + + if (encrypt) { + err = llcrypt_inherit_context(parent, NULL, op_data, false); + if (err) + GOTO(out_op_data, err); + } + + op_data->op_cli_flags |= CLI_SET_MEA; + if (createonly) + op_data->op_bias |= MDS_SETSTRIPE_CREATE; + + err = md_create(sbi->ll_md_exp, op_data, lump, len, mode, + from_kuid(&init_user_ns, current_fsuid()), + from_kgid(&init_user_ns, current_fsgid()), + current_cap(), 0, &request); + if (err) + GOTO(out_request, err); + + CFS_FAIL_TIMEOUT(OBD_FAIL_LLITE_SETDIRSTRIPE_PAUSE, cfs_fail_val); + + err = ll_prep_inode(&inode, &request->rq_pill, parent->i_sb, NULL); + if (err) + GOTO(out_inode, err); + + dentry.d_inode = inode; + + if (test_bit(LL_SBI_FILE_SECCTX, sbi->ll_flags)) + err = ll_inode_notifysecctx(inode, op_data->op_file_secctx, + op_data->op_file_secctx_size); + else + err = ll_inode_init_security(&dentry, inode, parent); + + if (err) + GOTO(out_inode, err); + + if (encrypt) { + err = ll_set_encflags(inode, op_data->op_file_encctx, + op_data->op_file_encctx_size, false); + if (err) + GOTO(out_inode, err); + } + +out_inode: + iput(inode); +out_request: + ptlrpc_req_finished(request); +out_op_data: + ll_finish_md_op_data(op_data); + + return err; +} + +int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, + int set_default) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct md_op_data *op_data; + struct ptlrpc_request *req = NULL; + int rc = 0; + int lum_size; + ENTRY; + + if (lump != NULL) { + switch (lump->lmm_magic) { + case LOV_USER_MAGIC_V1: + lum_size = sizeof(struct lov_user_md_v1); + break; + case LOV_USER_MAGIC_V3: + lum_size = sizeof(struct lov_user_md_v3); + break; + case LOV_USER_MAGIC_COMP_V1: + lum_size = ((struct lov_comp_md_v1 *)lump)->lcm_size; + break; + case LMV_USER_MAGIC: { + struct 
lmv_user_md *lmv = (struct lmv_user_md *)lump; + + /* MDS < 2.14 doesn't support 'crush' hash type, and + * cannot handle unknown hash if client doesn't set a + * valid one. switch to fnv_1a_64. + */ + if (!(exp_connect_flags2(sbi->ll_md_exp) & + OBD_CONNECT2_CRUSH)) { + enum lmv_hash_type type = lmv->lum_hash_type & + LMV_HASH_TYPE_MASK; + + if (type >= LMV_HASH_TYPE_CRUSH || + type == LMV_HASH_TYPE_UNKNOWN) + lmv->lum_hash_type = + (lmv->lum_hash_type ^ type) | + LMV_HASH_TYPE_FNV_1A_64; + } + if (lmv->lum_magic != cpu_to_le32(LMV_USER_MAGIC)) + lustre_swab_lmv_user_md(lmv); + lum_size = sizeof(*lmv); + break; + } + case LOV_USER_MAGIC_SPECIFIC: { + struct lov_user_md_v3 *v3 = + (struct lov_user_md_v3 *)lump; + if (v3->lmm_stripe_count > LOV_MAX_STRIPE_COUNT) + RETURN(-EINVAL); + lum_size = lov_user_md_size(v3->lmm_stripe_count, + LOV_USER_MAGIC_SPECIFIC); + break; + } + default: + CDEBUG(D_IOCTL, "bad userland LOV MAGIC:" + " %#08x != %#08x nor %#08x\n", + lump->lmm_magic, LOV_USER_MAGIC_V1, + LOV_USER_MAGIC_V3); + RETURN(-EINVAL); + } + + /* + * This is coming from userspace, so should be in + * local endian. But the MDS would like it in little + * endian, so we swab it before we send it. + */ + if ((__swab32(lump->lmm_magic) & le32_to_cpu(LOV_MAGIC_MASK)) == + le32_to_cpu(LOV_MAGIC_MAGIC)) + lustre_swab_lov_user_md(lump, 0); + } else { + lum_size = sizeof(struct lov_user_md_v1); + } + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + /* swabbing is done in lov_setstripe() on server side */ + rc = md_setattr(sbi->ll_md_exp, op_data, lump, lum_size, &req); + ll_finish_md_op_data(op_data); + ptlrpc_req_finished(req); + if (rc) + RETURN(rc); + + RETURN(rc); +} + +int ll_dir_get_default_layout(struct inode *inode, void **plmm, int *plmm_size, + struct ptlrpc_request **request, u64 valid, + enum get_default_layout_type type) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct mdt_body *body; + struct lov_mds_md *lmm = NULL; + struct ptlrpc_request *req = NULL; + int lmm_size = OBD_MAX_DEFAULT_EA_SIZE; + struct md_op_data *op_data; + struct lu_fid fid; + int rc; + + ENTRY; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, lmm_size, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_valid = valid | OBD_MD_FLEASIZE | OBD_MD_FLDIREA; + + if (type == GET_DEFAULT_LAYOUT_ROOT) { + lu_root_fid(&op_data->op_fid1); + fid = op_data->op_fid1; + } else { + fid = *ll_inode2fid(inode); + } + + rc = md_getattr(sbi->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc < 0) { + CDEBUG(D_INFO, "md_getattr failed on inode "DFID": rc %d\n", + PFID(&fid), rc); + GOTO(out, rc); + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); + + lmm_size = body->mbo_eadatasize; + + if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) || + lmm_size == 0) { + GOTO(out, rc = -ENODATA); + } + + lmm = req_capsule_server_sized_get(&req->rq_pill, + &RMF_MDT_MD, lmm_size); + LASSERT(lmm != NULL); + + /* + * This is coming from the MDS, so is probably in + * little endian. We convert it to host endian before + * passing it to userspace. 
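+	 *
+	 * The check below detects that case without knowing the host
+	 * endianness: the magic is compared, after a byte swap and under
+	 * LOV_MAGIC_MASK, against the little-endian view of
+	 * LOV_MAGIC_MAGIC, so it matches only on a big-endian host with
+	 * a native-endian buffer, which is exactly when conversion is
+	 * still needed; on little-endian hosts the test fails and the
+	 * buffer is sent unchanged.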
+ */ + /* We don't swab objects for directories */ + switch (le32_to_cpu(lmm->lmm_magic)) { + case LOV_MAGIC_V1: + case LOV_MAGIC_V3: + case LOV_MAGIC_COMP_V1: + case LOV_USER_MAGIC_SPECIFIC: + if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) + lustre_swab_lov_user_md((struct lov_user_md *)lmm, 0); + break; + case LMV_MAGIC_V1: + if (LMV_MAGIC != cpu_to_le32(LMV_MAGIC)) + lustre_swab_lmv_mds_md((union lmv_mds_md *)lmm); + break; + case LMV_USER_MAGIC: + if (LMV_USER_MAGIC != cpu_to_le32(LMV_USER_MAGIC)) + lustre_swab_lmv_user_md((struct lmv_user_md *)lmm); + break; + case LMV_MAGIC_FOREIGN: { + struct lmv_foreign_md *lfm = (struct lmv_foreign_md *)lmm; + + if (LMV_MAGIC_FOREIGN != cpu_to_le32(LMV_MAGIC_FOREIGN)) { + __swab32s(&lfm->lfm_magic); + __swab32s(&lfm->lfm_length); + __swab32s(&lfm->lfm_type); + __swab32s(&lfm->lfm_flags); + } + break; + } + default: + CERROR("unknown magic: %lX\n", (unsigned long)lmm->lmm_magic); + rc = -EPROTO; + } +out: + *plmm = lmm; + *plmm_size = lmm_size; + *request = req; + return rc; +} + +/** + * This function will be used to get default LOV/LMV/Default LMV + * @valid will be used to indicate which stripe it will retrieve. + * If the directory does not have its own default layout, then the + * function will request the default layout from root FID. + * OBD_MD_MEA LMV stripe EA + * OBD_MD_DEFAULT_MEA Default LMV stripe EA + * otherwise Default LOV EA. + * Each time, it can only retrieve 1 stripe EA + **/ +int ll_dir_getstripe_default(struct inode *inode, void **plmm, int *plmm_size, + struct ptlrpc_request **request, + struct ptlrpc_request **root_request, + u64 valid) +{ + struct ptlrpc_request *req = NULL; + struct ptlrpc_request *root_req = NULL; + struct lov_mds_md *lmm = NULL; + int lmm_size = 0; + int rc = 0; + ENTRY; + + rc = ll_dir_get_default_layout(inode, (void **)&lmm, &lmm_size, + &req, valid, 0); + if (rc == -ENODATA && !fid_is_root(ll_inode2fid(inode)) && + !(valid & OBD_MD_MEA) && root_request != NULL) { + int rc2 = ll_dir_get_default_layout(inode, (void **)&lmm, + &lmm_size, &root_req, valid, + GET_DEFAULT_LAYOUT_ROOT); + if (rc2 == 0) + rc = 0; + } + + *plmm = lmm; + *plmm_size = lmm_size; + *request = req; + if (root_request != NULL) + *root_request = root_req; + + RETURN(rc); +} + +/** + * This function will be used to get default LOV/LMV/Default LMV + * @valid will be used to indicate which stripe it will retrieve + * OBD_MD_MEA LMV stripe EA + * OBD_MD_DEFAULT_MEA Default LMV stripe EA + * otherwise Default LOV EA. + * Each time, it can only retrieve 1 stripe EA + **/ +int ll_dir_getstripe(struct inode *inode, void **plmm, int *plmm_size, + struct ptlrpc_request **request, u64 valid) +{ + struct ptlrpc_request *req = NULL; + struct lov_mds_md *lmm = NULL; + int lmm_size = 0; + int rc = 0; + ENTRY; + + rc = ll_dir_get_default_layout(inode, (void **)&lmm, &lmm_size, + &req, valid, 0); + + *plmm = lmm; + *plmm_size = lmm_size; + *request = req; + + RETURN(rc); +} + +int ll_get_mdt_idx_by_fid(struct ll_sb_info *sbi, const struct lu_fid *fid) +{ + struct md_op_data *op_data; + int rc; + int mdt_index; + ENTRY; + + OBD_ALLOC_PTR(op_data); + if (op_data == NULL) + RETURN(-ENOMEM); + + op_data->op_flags |= MF_GET_MDT_IDX; + op_data->op_fid1 = *fid; + rc = md_getattr(sbi->ll_md_exp, op_data, NULL); + mdt_index = op_data->op_mds; + OBD_FREE_PTR(op_data); + if (rc < 0) + RETURN(rc); + + RETURN(mdt_index); +} + +/* + * Get MDT index for the inode. 
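+ *
+ * Usage sketch (callers in this file follow this pattern): a negative
+ * return is an error, anything else is the index, e.g.
+ *
+ *	mdt_index = ll_get_mdt_idx(inode);
+ *	if (mdt_index < 0)
+ *		return mdt_index;
+ *	tmp->lum_stripe_offset = mdt_index;
+ *
+ * as done by the LL_IOC_LMV_GETSTRIPE handler below.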
+ */ +int ll_get_mdt_idx(struct inode *inode) +{ + return ll_get_mdt_idx_by_fid(ll_i2sbi(inode), ll_inode2fid(inode)); +} + +/** + * Generic handler to do any pre-copy work. + * + * It sends a first hsm_progress (with extent length == 0) to coordinator as a + * first information for it that real work has started. + * + * Moreover, for a ARCHIVE request, it will sample the file data version and + * store it in \a copy. + * + * \return 0 on success. + */ +static int ll_ioc_copy_start(struct super_block *sb, struct hsm_copy *copy) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct hsm_progress_kernel hpk; + int rc = 0; + int rc2; + ENTRY; + + /* Forge a hsm_progress based on data from copy. */ + hpk.hpk_fid = copy->hc_hai.hai_fid; + hpk.hpk_cookie = copy->hc_hai.hai_cookie; + hpk.hpk_extent.offset = copy->hc_hai.hai_extent.offset; + hpk.hpk_extent.length = 0; + hpk.hpk_flags = 0; + hpk.hpk_errval = 0; + hpk.hpk_data_version = 0; + + + /* For archive request, we need to read the current file version. */ + if (copy->hc_hai.hai_action == HSMA_ARCHIVE) { + struct inode *inode; + __u64 data_version = 0; + + /* Get inode for this fid */ + inode = search_inode_for_lustre(sb, ©->hc_hai.hai_fid); + if (IS_ERR(inode)) { + hpk.hpk_flags |= HP_FLAG_RETRY; + /* hpk_errval is >= 0 */ + hpk.hpk_errval = -PTR_ERR(inode); + GOTO(progress, rc = PTR_ERR(inode)); + } + + /* Read current file data version */ + rc = ll_data_version(inode, &data_version, LL_DV_RD_FLUSH); + iput(inode); + if (rc != 0) { + CDEBUG(D_HSM, "Could not read file data version of " + DFID" (rc = %d). Archive request (" + "%#llx) could not be done.\n", + PFID(©->hc_hai.hai_fid), rc, + copy->hc_hai.hai_cookie); + hpk.hpk_flags |= HP_FLAG_RETRY; + /* hpk_errval must be >= 0 */ + hpk.hpk_errval = -rc; + GOTO(progress, rc); + } + + /* Store in the hsm_copy for later copytool use. + * Always modified even if no lsm. */ + copy->hc_data_version = data_version; + + } else if (copy->hc_hai.hai_action == HSMA_IMPORT) { + + /* IMPORT sends its progress using alloc fid when possible */ + hpk.hpk_fid = copy->hc_hai.hai_dfid; + } + +progress: + /* On error, the request should be considered as completed */ + if (hpk.hpk_errval > 0) + hpk.hpk_flags |= HP_FLAG_COMPLETED; + + rc2 = obd_iocontrol(LL_IOC_HSM_PROGRESS, sbi->ll_md_exp, sizeof(hpk), + &hpk, NULL); + + /* Return first error */ + RETURN(rc != 0 ? rc : rc2); +} + +/** + * Generic handler to do any post-copy work. + * + * It will send the last hsm_progress update to coordinator to inform it + * that copy is finished and whether it was successful or not. + * + * Moreover, + * - for ARCHIVE request, it will sample the file data version and compare it + * with the version saved in ll_ioc_copy_start(). If they do not match, copy + * will be considered as failed. + * - for RESTORE request, it will sample the file data version and send it to + * coordinator which is useful if the file was imported as 'released'. + * + * \return 0 on success. + */ +static int ll_ioc_copy_end(struct super_block *sb, struct hsm_copy *copy) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct hsm_progress_kernel hpk; + int rc = 0; + int rc2; + ENTRY; + + /* If you modify the logic here, also check llapi_hsm_copy_end(). */ + /* Take care: copy->hc_hai.hai_action, len, gid and data are not + * initialized if copy_end was called with copy == NULL. + */ + + /* Forge a hsm_progress based on data from copy. 
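+	 *
+	 * Note the sign convention kept throughout this function and in
+	 * ll_ioc_copy_start() above: kernel error codes are negative but
+	 * hpk_errval must carry a positive errno, hence the
+	 * "hpk_errval = -rc" style assignments on the failure paths.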
*/ + hpk.hpk_fid = copy->hc_hai.hai_fid; + hpk.hpk_cookie = copy->hc_hai.hai_cookie; + hpk.hpk_extent = copy->hc_hai.hai_extent; + hpk.hpk_flags = copy->hc_flags | HP_FLAG_COMPLETED; + hpk.hpk_errval = copy->hc_errval; + hpk.hpk_data_version = 0; + + /* For archive request, we need to check the file data was not changed. + * + * For restore request, we need to send the file data version, this is + * useful when the file was created using hsm_import. + */ + if (((copy->hc_hai.hai_action == HSMA_ARCHIVE) || + (copy->hc_hai.hai_action == HSMA_RESTORE)) && + (copy->hc_errval == 0)) { + struct inode *inode; + __u64 data_version = 0; + + /* Get lsm for this fid */ + inode = search_inode_for_lustre(sb, ©->hc_hai.hai_fid); + if (IS_ERR(inode)) { + hpk.hpk_flags |= HP_FLAG_RETRY; + /* hpk_errval must be >= 0 */ + hpk.hpk_errval = -PTR_ERR(inode); + GOTO(progress, rc = PTR_ERR(inode)); + } + + rc = ll_data_version(inode, &data_version, LL_DV_RD_FLUSH); + iput(inode); + if (rc) { + CDEBUG(D_HSM, "Could not read file data version. " + "Request could not be confirmed.\n"); + if (hpk.hpk_errval == 0) + hpk.hpk_errval = -rc; + GOTO(progress, rc); + } + + /* Store in the hsm_copy for later copytool use. + * Always modified even if no lsm. */ + hpk.hpk_data_version = data_version; + + /* File could have been stripped during archiving, so we need + * to check anyway. */ + if ((copy->hc_hai.hai_action == HSMA_ARCHIVE) && + (copy->hc_data_version != data_version)) { + CDEBUG(D_HSM, "File data version mismatched. " + "File content was changed during archiving. " + DFID", start:%#llx current:%#llx\n", + PFID(©->hc_hai.hai_fid), + copy->hc_data_version, data_version); + /* File was changed, send error to cdt. Do not ask for + * retry because if a file is modified frequently, + * the cdt will loop on retried archive requests. + * The policy engine will ask for a new archive later + * when the file will not be modified for some tunable + * time */ + hpk.hpk_flags &= ~HP_FLAG_RETRY; + rc = -EBUSY; + /* hpk_errval must be >= 0 */ + hpk.hpk_errval = -rc; + GOTO(progress, rc); + } + + } else if (copy->hc_hai.hai_action == HSMA_IMPORT) { + + /* IMPORT sends its progress using alloc fid when possible */ + hpk.hpk_fid = copy->hc_hai.hai_dfid; + } + +progress: + rc2 = obd_iocontrol(LL_IOC_HSM_PROGRESS, sbi->ll_md_exp, sizeof(hpk), + &hpk, NULL); + + /* Return first error */ + RETURN(rc != 0 ? 
rc : rc2); +} + + +static int copy_and_ct_start(int cmd, struct obd_export *exp, + const struct lustre_kernelcomm __user *data) +{ + struct lustre_kernelcomm *lk; + struct lustre_kernelcomm *tmp; + size_t size = sizeof(*lk); + size_t new_size; + int i; + int rc; + + /* copy data from userspace to get numbers of archive_id */ + OBD_ALLOC(lk, size); + if (lk == NULL) + return -ENOMEM; + + if (copy_from_user(lk, data, size)) + GOTO(out_lk, rc = -EFAULT); + + if (lk->lk_flags & LK_FLG_STOP) + goto do_ioctl; + + if (!(lk->lk_flags & LK_FLG_DATANR)) { + __u32 archive_mask = lk->lk_data_count; + int count; + + /* old hsm agent to old MDS */ + if (!exp_connect_archive_id_array(exp)) + goto do_ioctl; + + /* old hsm agent to new MDS */ + lk->lk_flags |= LK_FLG_DATANR; + + if (archive_mask == 0) + goto do_ioctl; + + count = hweight32(archive_mask); + new_size = offsetof(struct lustre_kernelcomm, lk_data[count]); + OBD_ALLOC(tmp, new_size); + if (tmp == NULL) + GOTO(out_lk, rc = -ENOMEM); + + memcpy(tmp, lk, size); + tmp->lk_data_count = count; + OBD_FREE(lk, size); + lk = tmp; + size = new_size; + + count = 0; + for (i = 0; i < sizeof(archive_mask) * 8; i++) { + if (BIT(i) & archive_mask) { + lk->lk_data[count] = i + 1; + count++; + } + } + goto do_ioctl; + } + + /* new hsm agent to new mds */ + if (lk->lk_data_count > 0) { + new_size = offsetof(struct lustre_kernelcomm, + lk_data[lk->lk_data_count]); + OBD_ALLOC(tmp, new_size); + if (tmp == NULL) + GOTO(out_lk, rc = -ENOMEM); + + OBD_FREE(lk, size); + lk = tmp; + size = new_size; + + if (copy_from_user(lk, data, size)) + GOTO(out_lk, rc = -EFAULT); + } + + /* new hsm agent to old MDS */ + if (!exp_connect_archive_id_array(exp)) { + __u32 archives = 0; + + if (lk->lk_data_count > LL_HSM_ORIGIN_MAX_ARCHIVE) + GOTO(out_lk, rc = -EINVAL); + + for (i = 0; i < lk->lk_data_count; i++) { + if (lk->lk_data[i] > LL_HSM_ORIGIN_MAX_ARCHIVE) { + rc = -EINVAL; + CERROR("%s: archive id %d requested but only " + "[0 - %zu] supported: rc = %d\n", + exp->exp_obd->obd_name, lk->lk_data[i], + LL_HSM_ORIGIN_MAX_ARCHIVE, rc); + GOTO(out_lk, rc); + } + + if (lk->lk_data[i] == 0) { + archives = 0; + break; + } + + archives |= (1 << (lk->lk_data[i] - 1)); + } + lk->lk_flags &= ~LK_FLG_DATANR; + lk->lk_data_count = archives; + } +do_ioctl: + rc = obd_iocontrol(cmd, exp, size, lk, NULL); +out_lk: + OBD_FREE(lk, size); + return rc; +} + +static int check_owner(int type, int id) +{ + switch (type) { + case USRQUOTA: + if (!uid_eq(current_euid(), make_kuid(&init_user_ns, id))) + return -EPERM; + break; + case GRPQUOTA: + if (!in_egroup_p(make_kgid(&init_user_ns, id))) + return -EPERM; + break; + case PRJQUOTA: + break; + } + return 0; +} + +int quotactl_ioctl(struct super_block *sb, struct if_quotactl *qctl) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + int cmd = qctl->qc_cmd; + int type = qctl->qc_type; + int id = qctl->qc_id; + int valid = qctl->qc_valid; + int rc = 0; + + ENTRY; + + switch (cmd) { + case Q_SETQUOTA: + case Q_SETINFO: + case LUSTRE_Q_SETDEFAULT: + case LUSTRE_Q_SETQUOTAPOOL: + case LUSTRE_Q_SETINFOPOOL: + case LUSTRE_Q_SETDEFAULT_POOL: + case LUSTRE_Q_DELETEQID: + if (!capable(CAP_SYS_ADMIN)) + RETURN(-EPERM); + + if (sb->s_flags & SB_RDONLY) + RETURN(-EROFS); + break; + case Q_GETQUOTA: + case LUSTRE_Q_GETDEFAULT: + case LUSTRE_Q_GETQUOTAPOOL: + case LUSTRE_Q_GETDEFAULT_POOL: + if (check_owner(type, id) && + (!capable(CAP_SYS_ADMIN))) + RETURN(-EPERM); + break; + case Q_GETINFO: + case LUSTRE_Q_GETINFOPOOL: + break; + default: + CERROR("unsupported quotactl 
op: %#x\n", cmd); + RETURN(-ENOTSUPP); + } + + if (valid != QC_GENERAL) { + if (cmd == Q_GETINFO) + qctl->qc_cmd = Q_GETOINFO; + else if (cmd == Q_GETQUOTA || + cmd == LUSTRE_Q_GETQUOTAPOOL) + qctl->qc_cmd = Q_GETOQUOTA; + else + RETURN(-EINVAL); + + switch (valid) { + case QC_MDTIDX: + rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp, + sizeof(*qctl), qctl, NULL); + break; + case QC_OSTIDX: + rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_dt_exp, + sizeof(*qctl), qctl, NULL); + break; + case QC_UUID: + rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp, + sizeof(*qctl), qctl, NULL); + if (rc == -EAGAIN) + rc = obd_iocontrol(OBD_IOC_QUOTACTL, + sbi->ll_dt_exp, + sizeof(*qctl), qctl, NULL); + break; + default: + rc = -EINVAL; + break; + } + + if (rc) + RETURN(rc); + + qctl->qc_cmd = cmd; + } else { + struct obd_quotactl *oqctl; + int oqctl_len = sizeof(*oqctl); + + if (LUSTRE_Q_CMD_IS_POOL(cmd)) + oqctl_len += LOV_MAXPOOLNAME + 1; + + OBD_ALLOC(oqctl, oqctl_len); + if (oqctl == NULL) + RETURN(-ENOMEM); + + QCTL_COPY(oqctl, qctl); + rc = obd_quotactl(sbi->ll_md_exp, oqctl); + if (rc) { + OBD_FREE(oqctl, oqctl_len); + RETURN(rc); + } + /* If QIF_SPACE is not set, client should collect the + * space usage from OSSs by itself */ + if ((cmd == Q_GETQUOTA || cmd == LUSTRE_Q_GETQUOTAPOOL) && + !(oqctl->qc_dqblk.dqb_valid & QIF_SPACE) && + !oqctl->qc_dqblk.dqb_curspace) { + struct obd_quotactl *oqctl_tmp; + int qctl_len = sizeof(*oqctl_tmp) + LOV_MAXPOOLNAME + 1; + + OBD_ALLOC(oqctl_tmp, qctl_len); + if (oqctl_tmp == NULL) + GOTO(out, rc = -ENOMEM); + + if (cmd == LUSTRE_Q_GETQUOTAPOOL) { + oqctl_tmp->qc_cmd = LUSTRE_Q_GETQUOTAPOOL; + memcpy(oqctl_tmp->qc_poolname, + qctl->qc_poolname, + LOV_MAXPOOLNAME + 1); + } else { + oqctl_tmp->qc_cmd = Q_GETOQUOTA; + } + oqctl_tmp->qc_id = oqctl->qc_id; + oqctl_tmp->qc_type = oqctl->qc_type; + + /* collect space usage from OSTs */ + oqctl_tmp->qc_dqblk.dqb_curspace = 0; + rc = obd_quotactl(sbi->ll_dt_exp, oqctl_tmp); + if (!rc || rc == -EREMOTEIO) { + oqctl->qc_dqblk.dqb_curspace = + oqctl_tmp->qc_dqblk.dqb_curspace; + oqctl->qc_dqblk.dqb_valid |= QIF_SPACE; + } + + /* collect space & inode usage from MDTs */ + oqctl_tmp->qc_cmd = Q_GETOQUOTA; + oqctl_tmp->qc_dqblk.dqb_curspace = 0; + oqctl_tmp->qc_dqblk.dqb_curinodes = 0; + rc = obd_quotactl(sbi->ll_md_exp, oqctl_tmp); + if (!rc || rc == -EREMOTEIO) { + oqctl->qc_dqblk.dqb_curspace += + oqctl_tmp->qc_dqblk.dqb_curspace; + oqctl->qc_dqblk.dqb_curinodes = + oqctl_tmp->qc_dqblk.dqb_curinodes; + oqctl->qc_dqblk.dqb_valid |= QIF_INODES; + } else { + oqctl->qc_dqblk.dqb_valid &= ~QIF_SPACE; + } + + OBD_FREE(oqctl_tmp, qctl_len); + } +out: + QCTL_COPY(qctl, oqctl); + OBD_FREE(oqctl, oqctl_len); + } + + RETURN(rc); +} + +int ll_rmfid(struct file *file, void __user *arg) +{ + const struct fid_array __user *ufa = arg; + struct inode *inode = file_inode(file); + struct fid_array *lfa = NULL; + size_t size; + unsigned nr; + int i, rc, *rcs = NULL; + ENTRY; + + if (!capable(CAP_DAC_READ_SEARCH) && + !test_bit(LL_SBI_USER_FID2PATH, ll_i2sbi(inode)->ll_flags)) + RETURN(-EPERM); + /* Only need to get the buflen */ + if (get_user(nr, &ufa->fa_nr)) + RETURN(-EFAULT); + /* DoS protection */ + if (nr > OBD_MAX_FIDS_IN_ARRAY) + RETURN(-E2BIG); + + size = offsetof(struct fid_array, fa_fids[nr]); + OBD_ALLOC(lfa, size); + if (!lfa) + RETURN(-ENOMEM); + OBD_ALLOC_PTR_ARRAY(rcs, nr); + if (!rcs) + GOTO(free_lfa, rc = -ENOMEM); + + if (copy_from_user(lfa, arg, size)) + GOTO(free_rcs, rc = -EFAULT); + + /* Call mdc_iocontrol */ + rc 
= md_rmfid(ll_i2mdexp(file_inode(file)), lfa, rcs, NULL); + if (!rc) { + for (i = 0; i < nr; i++) + if (rcs[i]) + lfa->fa_fids[i].f_ver = rcs[i]; + if (copy_to_user(arg, lfa, size)) + rc = -EFAULT; + } + +free_rcs: + OBD_FREE_PTR_ARRAY(rcs, nr); +free_lfa: + OBD_FREE(lfa, size); + + RETURN(rc); +} + +/* This function tries to get a single name component, + * to send to the server. No actual path traversal involved, + * so we limit to NAME_MAX */ +static char *ll_getname(const char __user *filename) +{ + int ret = 0, len; + char *tmp; + + OBD_ALLOC(tmp, NAME_MAX + 1); + + if (!tmp) + return ERR_PTR(-ENOMEM); + + len = strncpy_from_user(tmp, filename, NAME_MAX + 1); + if (len < 0) + ret = -ENOENT; + else if (len > NAME_MAX) + ret = -ENAMETOOLONG; + + if (ret) { + OBD_FREE(tmp, NAME_MAX + 1); + tmp = ERR_PTR(ret); + } + return tmp; +} + +#define ll_putname(filename) OBD_FREE(filename, NAME_MAX + 1); + +static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct dentry *dentry = file_dentry(file); + struct inode *inode = file_inode(file); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct obd_ioctl_data *data = NULL; + int rc = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%#x\n", + PFID(ll_inode2fid(inode)), inode, cmd); + + /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */ + if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */ + return -ENOTTY; + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1); + switch (cmd) { + case FS_IOC_GETFLAGS: + case FS_IOC_SETFLAGS: + RETURN(ll_iocontrol(inode, file, cmd, arg)); + case FSFILT_IOC_GETVERSION: + case FS_IOC_GETVERSION: + RETURN(put_user(inode->i_generation, (int __user *)arg)); + /* We need to special case any other ioctls we want to handle, + * to send them to the MDS/OST as appropriate and to properly + * network encode the arg field. 
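+	 *
+	 * The recurring shape of those handlers (a summary, not a
+	 * template from elsewhere) is: copy the request in with
+	 * copy_from_user(), or obd_ioctl_getdata() for variable-length
+	 * buffers; forward it through obd_iocontrol() or an md_*() call
+	 * on the MDS/OST export; then copy_to_user() the result and
+	 * free the kernel buffer.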
*/ + case FS_IOC_SETVERSION: + RETURN(-ENOTSUPP); + + case LL_IOC_GET_MDTIDX: { + int mdtidx; + + mdtidx = ll_get_mdt_idx(inode); + if (mdtidx < 0) + RETURN(mdtidx); + + if (put_user((int)mdtidx, (int __user *)arg)) + RETURN(-EFAULT); + + return 0; + } + case IOC_MDC_LOOKUP: { + int namelen, len = 0; + char *filename; + + rc = obd_ioctl_getdata(&data, &len, (void __user *)arg); + if (rc != 0) + RETURN(rc); + + filename = data->ioc_inlbuf1; + namelen = strlen(filename); + if (namelen < 1) { + CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n"); + GOTO(out_free, rc = -EINVAL); + } + + rc = ll_get_fid_by_name(inode, filename, namelen, NULL, NULL); + if (rc < 0) { + CERROR("%s: lookup %.*s failed: rc = %d\n", + sbi->ll_fsname, namelen, filename, rc); + GOTO(out_free, rc); + } +out_free: + OBD_FREE_LARGE(data, len); + return rc; + } + case LL_IOC_LMV_SETSTRIPE: { + struct lmv_user_md *lum; + char *filename; + int namelen = 0; + int lumlen = 0; + umode_t mode; + bool createonly = false; + int len; + int rc; + + rc = obd_ioctl_getdata(&data, &len, (void __user *)arg); + if (rc) + RETURN(rc); + + if (data->ioc_inlbuf1 == NULL || data->ioc_inlbuf2 == NULL || + data->ioc_inllen1 == 0 || data->ioc_inllen2 == 0) + GOTO(lmv_out_free, rc = -EINVAL); + + filename = data->ioc_inlbuf1; + namelen = data->ioc_inllen1; + + if (namelen < 1) { + CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n"); + GOTO(lmv_out_free, rc = -EINVAL); + } + lum = (struct lmv_user_md *)data->ioc_inlbuf2; + lumlen = data->ioc_inllen2; + + if (!lmv_user_magic_supported(lum->lum_magic)) { + CERROR("%s: wrong lum magic %x : rc = %d\n", filename, + lum->lum_magic, -EINVAL); + GOTO(lmv_out_free, rc = -EINVAL); + } + + if ((lum->lum_magic == LMV_USER_MAGIC || + lum->lum_magic == LMV_USER_MAGIC_SPECIFIC) && + lumlen < sizeof(*lum)) { + CERROR("%s: wrong lum size %d for magic %x : rc = %d\n", + filename, lumlen, lum->lum_magic, -EINVAL); + GOTO(lmv_out_free, rc = -EINVAL); + } + + if (lum->lum_magic == LMV_MAGIC_FOREIGN && + lumlen < sizeof(struct lmv_foreign_md)) { + CERROR("%s: wrong lum magic %x or size %d: rc = %d\n", + filename, lum->lum_magic, lumlen, -EFAULT); + GOTO(lmv_out_free, rc = -EINVAL); + } + + mode = data->ioc_type; + createonly = data->ioc_obdo1.o_flags & OBD_FL_OBDMDEXISTS; + rc = ll_dir_setdirstripe(dentry, lum, lumlen, filename, mode, + createonly); +lmv_out_free: + OBD_FREE_LARGE(data, len); + RETURN(rc); + + } + case LL_IOC_LMV_SET_DEFAULT_STRIPE: { + struct lmv_user_md lum; + struct lmv_user_md __user *ulump = + (struct lmv_user_md __user *)arg; + int rc; + + if (copy_from_user(&lum, ulump, sizeof(lum))) + RETURN(-EFAULT); + + if (lum.lum_magic != LMV_USER_MAGIC) + RETURN(-EINVAL); + + rc = ll_dir_setstripe(inode, (struct lov_user_md *)&lum, 0); + + RETURN(rc); + } + case LL_IOC_LOV_SETSTRIPE_NEW: + case LL_IOC_LOV_SETSTRIPE: { + struct lov_user_md_v3 *lumv3 = NULL; + struct lov_user_md_v1 lumv1; + struct lov_user_md_v1 *lumv1_ptr = &lumv1; + struct lov_user_md_v1 __user *lumv1p = + (struct lov_user_md_v1 __user *)arg; + struct lov_user_md_v3 __user *lumv3p = + (struct lov_user_md_v3 __user *)arg; + int lum_size = 0; + + int set_default = 0; + + BUILD_BUG_ON(sizeof(struct lov_user_md_v3) <= + sizeof(struct lov_comp_md_v1)); + BUILD_BUG_ON(sizeof(*lumv3) != sizeof(*lumv3p)); + /* first try with v1 which is smaller than v3 */ + if (copy_from_user(&lumv1, lumv1p, sizeof(lumv1))) + RETURN(-EFAULT); + + if (is_root_inode(inode)) + set_default = 1; + + switch (lumv1.lmm_magic) { + case LOV_USER_MAGIC_V3: + case 
LOV_USER_MAGIC_SPECIFIC: + lum_size = ll_lov_user_md_size(&lumv1); + if (lum_size < 0) + RETURN(lum_size); + OBD_ALLOC(lumv3, lum_size); + if (!lumv3) + RETURN(-ENOMEM); + if (copy_from_user(lumv3, lumv3p, lum_size)) + GOTO(out, rc = -EFAULT); + lumv1_ptr = (struct lov_user_md_v1 *)lumv3; + break; + case LOV_USER_MAGIC_V1: + break; + default: + GOTO(out, rc = -ENOTSUPP); + } + + /* in v1 and v3 cases lumv1 points to data */ + rc = ll_dir_setstripe(inode, lumv1_ptr, set_default); +out: + if (lumv3) + OBD_FREE(lumv3, lum_size); + RETURN(rc); + } + case LL_IOC_LMV_GETSTRIPE: { + struct lmv_user_md __user *ulmv = + (struct lmv_user_md __user *)arg; + struct lmv_user_md lum; + struct ptlrpc_request *request = NULL; + struct ptlrpc_request *root_request = NULL; + union lmv_mds_md *lmm = NULL; + int lmmsize; + u64 valid = 0; + struct lmv_user_md *tmp = NULL; + int mdt_index; + int lum_size; + int stripe_count; + int max_stripe_count; + int i; + int rc; + + if (copy_from_user(&lum, ulmv, sizeof(*ulmv))) + RETURN(-EFAULT); + + max_stripe_count = lum.lum_stripe_count; + /* lum_magic will indicate which stripe the ioctl will like + * to get, LMV_MAGIC_V1 is for normal LMV stripe, LMV_USER_MAGIC + * is for default LMV stripe */ + if (lum.lum_magic == LMV_MAGIC_V1) + valid |= OBD_MD_MEA; + else if (lum.lum_magic == LMV_USER_MAGIC) + valid |= OBD_MD_DEFAULT_MEA; + else + RETURN(-EINVAL); + + rc = ll_dir_getstripe_default(inode, (void **)&lmm, &lmmsize, + &request, &root_request, valid); + if (rc != 0) + GOTO(finish_req, rc); + + /* Get default LMV EA */ + if (lum.lum_magic == LMV_USER_MAGIC) { + struct lmv_user_md *lum; + struct ll_inode_info *lli; + + if (lmmsize > sizeof(*ulmv)) + GOTO(finish_req, rc = -EINVAL); + + lum = (struct lmv_user_md *)lmm; + if (lum->lum_max_inherit == LMV_INHERIT_NONE) + GOTO(finish_req, rc = -ENODATA); + + if (root_request != NULL) { + lli = ll_i2info(inode); + if (lum->lum_max_inherit != + LMV_INHERIT_UNLIMITED) { + if (lum->lum_max_inherit < + LMV_INHERIT_END || + lum->lum_max_inherit > + LMV_INHERIT_MAX || + lum->lum_max_inherit <= + lli->lli_dir_depth) + GOTO(finish_req, rc = -ENODATA); + + lum->lum_max_inherit -= + lli->lli_dir_depth; + } + + if (lum->lum_max_inherit_rr != + LMV_INHERIT_RR_UNLIMITED) { + if (lum->lum_max_inherit_rr == + LMV_INHERIT_NONE || + lum->lum_max_inherit_rr < + LMV_INHERIT_RR_END || + lum->lum_max_inherit_rr > + LMV_INHERIT_RR_MAX || + lum->lum_max_inherit_rr <= + lli->lli_dir_depth) { + lum->lum_max_inherit_rr = + LMV_INHERIT_RR_NONE; + goto out_copy; + } + + lum->lum_max_inherit_rr -= + lli->lli_dir_depth; + } + } +out_copy: + if (copy_to_user(ulmv, lmm, lmmsize)) + GOTO(finish_req, rc = -EFAULT); + + GOTO(finish_req, rc); + } + + /* if foreign LMV case, fake stripes number */ + if (lmm->lmv_magic == LMV_MAGIC_FOREIGN) { + struct lmv_foreign_md *lfm; + + lfm = (struct lmv_foreign_md *)lmm; + if (lfm->lfm_length < XATTR_SIZE_MAX - + offsetof(typeof(*lfm), lfm_value)) { + __u32 size = lfm->lfm_length + + offsetof(typeof(*lfm), lfm_value); + + stripe_count = lmv_foreign_to_md_stripes(size); + } else { + CERROR("invalid %d foreign size returned\n", + lfm->lfm_length); + return -EINVAL; + } + } else { + stripe_count = lmv_mds_md_stripe_count_get(lmm); + } + if (max_stripe_count < stripe_count) { + lum.lum_stripe_count = stripe_count; + if (copy_to_user(ulmv, &lum, sizeof(lum))) + GOTO(finish_req, rc = -EFAULT); + GOTO(finish_req, rc = -E2BIG); + } + + /* enough room on user side and foreign case */ + if (lmm->lmv_magic == LMV_MAGIC_FOREIGN) { + 
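+			/*
+			 * A foreign LMV is returned verbatim: the reply
+			 * is the fixed lmv_foreign_md header (lfm_magic,
+			 * lfm_length, lfm_type, lfm_flags) followed by
+			 * lfm_length opaque value bytes, so the copy
+			 * below is simply header plus payload.
+			 */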
struct lmv_foreign_md *lfm; + __u32 size; + + lfm = (struct lmv_foreign_md *)lmm; + size = lfm->lfm_length + + offsetof(struct lmv_foreign_md, lfm_value); + if (copy_to_user(ulmv, lfm, size)) + GOTO(finish_req, rc = -EFAULT); + GOTO(finish_req, rc); + } + + lum_size = lmv_user_md_size(stripe_count, + LMV_USER_MAGIC_SPECIFIC); + OBD_ALLOC(tmp, lum_size); + if (tmp == NULL) + GOTO(finish_req, rc = -ENOMEM); + + mdt_index = ll_get_mdt_idx(inode); + if (mdt_index < 0) + GOTO(out_tmp, rc = -ENOMEM); + + tmp->lum_magic = LMV_MAGIC_V1; + tmp->lum_stripe_count = 0; + tmp->lum_stripe_offset = mdt_index; + tmp->lum_hash_type = lmv_mds_md_hash_type_get(lmm); + for (i = 0; i < stripe_count; i++) { + struct lu_fid fid; + + fid_le_to_cpu(&fid, &lmm->lmv_md_v1.lmv_stripe_fids[i]); + if (fid_is_sane(&fid)) { + mdt_index = ll_get_mdt_idx_by_fid(sbi, &fid); + if (mdt_index < 0) + GOTO(out_tmp, rc = mdt_index); + + tmp->lum_objects[i].lum_mds = mdt_index; + tmp->lum_objects[i].lum_fid = fid; + } + + tmp->lum_stripe_count++; + } + + if (copy_to_user(ulmv, tmp, lum_size)) + GOTO(out_tmp, rc = -EFAULT); +out_tmp: + OBD_FREE(tmp, lum_size); +finish_req: + ptlrpc_req_finished(request); + ptlrpc_req_finished(root_request); + return rc; + } + + case LL_IOC_UNLOCK_FOREIGN: + /* if not a foreign symlink do nothing */ + if (ll_foreign_is_removable(dentry, true)) { + CDEBUG(D_INFO, + "prevent rmdir of non-foreign dir ("DFID")\n", + PFID(ll_inode2fid(inode))); + RETURN(-EOPNOTSUPP); + } + RETURN(0); + + case LL_IOC_REMOVE_ENTRY: { + char *filename = NULL; + int namelen = 0; + int rc; + + /* Here is a little hack to avoid sending REINT_RMENTRY to + * unsupported server, which might crash the server(LU-2730), + * Because both LVB_TYPE and REINT_RMENTRY will be supported + * on 2.4, we use OBD_CONNECT_LVB_TYPE to detect whether the + * server will support REINT_RMENTRY XXX*/ + if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_LVB_TYPE)) + RETURN(-ENOTSUPP); + + filename = ll_getname((const char __user *)arg); + if (IS_ERR(filename)) + RETURN(PTR_ERR(filename)); + + namelen = strlen(filename); + if (namelen < 1) + GOTO(out_rmdir, rc = -EINVAL); + + rc = ll_rmdir_entry(inode, filename, namelen); +out_rmdir: + if (filename) + ll_putname(filename); + RETURN(rc); + } + case LL_IOC_RMFID: + RETURN(ll_rmfid(file, (void __user *)arg)); + case LL_IOC_LOV_SWAP_LAYOUTS: + RETURN(-EPERM); + case IOC_OBD_STATFS: + RETURN(ll_obd_statfs(inode, (void __user *)arg)); + case LL_IOC_LOV_GETSTRIPE: + case LL_IOC_LOV_GETSTRIPE_NEW: + case LL_IOC_MDC_GETINFO_V1: + case LL_IOC_MDC_GETINFO_V2: + case IOC_MDC_GETFILEINFO_V1: + case IOC_MDC_GETFILEINFO_V2: + case IOC_MDC_GETFILESTRIPE: { + struct ptlrpc_request *request = NULL; + struct ptlrpc_request *root_request = NULL; + struct lov_user_md __user *lump; + struct lov_mds_md *lmm = NULL; + struct mdt_body *body; + char *filename = NULL; + lstat_t __user *statp = NULL; + lstatx_t __user *stxp = NULL; + __u64 __user *flagsp = NULL; + __u32 __user *lmmsizep = NULL; + struct lu_fid __user *fidp = NULL; + int lmmsize; + bool api32; + + if (cmd == IOC_MDC_GETFILEINFO_V1 || + cmd == IOC_MDC_GETFILEINFO_V2 || + cmd == IOC_MDC_GETFILESTRIPE) { + filename = ll_getname((const char __user *)arg); + if (IS_ERR(filename)) + RETURN(PTR_ERR(filename)); + + rc = ll_lov_getstripe_ea_info(inode, filename, &lmm, + &lmmsize, &request); + } else { + rc = ll_dir_getstripe_default(inode, (void **)&lmm, + &lmmsize, &request, + &root_request, 0); + } + + if (request) { + body = 
req_capsule_server_get(&request->rq_pill,
+ &RMF_MDT_BODY);
+ LASSERT(body != NULL);
+ } else {
+ GOTO(out_req, rc);
+ }
+
+ if (rc == -ENODATA && (cmd == IOC_MDC_GETFILEINFO_V1 ||
+ cmd == LL_IOC_MDC_GETINFO_V1 ||
+ cmd == IOC_MDC_GETFILEINFO_V2 ||
+ cmd == LL_IOC_MDC_GETINFO_V2)) {
+ lmmsize = 0;
+ rc = 0;
+ }
+
+ if (rc < 0)
+ GOTO(out_req, rc);
+
+ if (cmd == IOC_MDC_GETFILESTRIPE ||
+ cmd == LL_IOC_LOV_GETSTRIPE ||
+ cmd == LL_IOC_LOV_GETSTRIPE_NEW) {
+ lump = (struct lov_user_md __user *)arg;
+ } else if (cmd == IOC_MDC_GETFILEINFO_V1 ||
+ cmd == LL_IOC_MDC_GETINFO_V1) {
+ struct lov_user_mds_data_v1 __user *lmdp;
+
+ lmdp = (struct lov_user_mds_data_v1 __user *)arg;
+ statp = &lmdp->lmd_st;
+ lump = &lmdp->lmd_lmm;
+ } else {
+ struct lov_user_mds_data __user *lmdp;
+
+ lmdp = (struct lov_user_mds_data __user *)arg;
+ fidp = &lmdp->lmd_fid;
+ stxp = &lmdp->lmd_stx;
+ flagsp = &lmdp->lmd_flags;
+ lmmsizep = &lmdp->lmd_lmmsize;
+ lump = &lmdp->lmd_lmm;
+ }
+
+ if (lmmsize == 0) {
+ /* If the file has no striping then zero out *lump so
+ * that the caller isn't confused by garbage. */
+ if (clear_user(lump, sizeof(*lump)))
+ GOTO(out_req, rc = -EFAULT);
+ } else if (copy_to_user(lump, lmm, lmmsize)) {
+ if (copy_to_user(lump, lmm, sizeof(*lump)))
+ GOTO(out_req, rc = -EFAULT);
+ rc = -EOVERFLOW;
+ }
+ api32 = test_bit(LL_SBI_32BIT_API, sbi->ll_flags);
+
+ if (cmd == IOC_MDC_GETFILEINFO_V1 ||
+ cmd == LL_IOC_MDC_GETINFO_V1) {
+ lstat_t st = { 0 };
+
+ st.st_dev = inode->i_sb->s_dev;
+ st.st_mode = body->mbo_mode;
+ st.st_nlink = body->mbo_nlink;
+ st.st_uid = body->mbo_uid;
+ st.st_gid = body->mbo_gid;
+ st.st_rdev = body->mbo_rdev;
+ if (llcrypt_require_key(inode) == -ENOKEY)
+ /* no key: report the rounded-up encrypted size,
+ * not round_up() of the still-zero st.st_size */
+ st.st_size = round_up(body->mbo_size,
+ LUSTRE_ENCRYPTION_UNIT_SIZE);
+ else
+ st.st_size = body->mbo_size;
+ st.st_blksize = PAGE_SIZE;
+ st.st_blocks = body->mbo_blocks;
+ st.st_atime = body->mbo_atime;
+ st.st_mtime = body->mbo_mtime;
+ st.st_ctime = body->mbo_ctime;
+ st.st_ino = cl_fid_build_ino(&body->mbo_fid1,
+ api32);
+
+ if (copy_to_user(statp, &st, sizeof(st)))
+ GOTO(out_req, rc = -EFAULT);
+ } else if (cmd == IOC_MDC_GETFILEINFO_V2 ||
+ cmd == LL_IOC_MDC_GETINFO_V2) {
+ lstatx_t stx = { 0 };
+ __u64 valid = body->mbo_valid;
+
+ stx.stx_blksize = PAGE_SIZE;
+ stx.stx_nlink = body->mbo_nlink;
+ stx.stx_uid = body->mbo_uid;
+ stx.stx_gid = body->mbo_gid;
+ stx.stx_mode = body->mbo_mode;
+ stx.stx_ino = cl_fid_build_ino(&body->mbo_fid1,
+ api32);
+ if (llcrypt_require_key(inode) == -ENOKEY)
+ /* no key: report the rounded-up encrypted size */
+ stx.stx_size = round_up(body->mbo_size,
+ LUSTRE_ENCRYPTION_UNIT_SIZE);
+ else
+ stx.stx_size = body->mbo_size;
+ stx.stx_blocks = body->mbo_blocks;
+ stx.stx_atime.tv_sec = body->mbo_atime;
+ stx.stx_ctime.tv_sec = body->mbo_ctime;
+ stx.stx_mtime.tv_sec = body->mbo_mtime;
+ stx.stx_btime.tv_sec = body->mbo_btime;
+ stx.stx_rdev_major = MAJOR(body->mbo_rdev);
+ stx.stx_rdev_minor = MINOR(body->mbo_rdev);
+ stx.stx_dev_major = MAJOR(inode->i_sb->s_dev);
+ stx.stx_dev_minor = MINOR(inode->i_sb->s_dev);
+ stx.stx_mask |= STATX_BASIC_STATS | STATX_BTIME;
+
+ /*
+ * For a striped directory, the size and blocks returned
+ * from the MDT are not correct.
+ * The size and blocks are aggregated by the client
+ * across all stripes.
+ * Thus for a striped directory, do not return the
+ * FLSIZE and FLBLOCKS valid flags to the caller.
+ * However, this would be better decided by the MDS
+ * instead of the client. 
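+ * (This is why the OBD_MD_FLSIZE / OBD_MD_FLBLOCKS bits, and the
+ * matching STATX_SIZE / STATX_BLOCKS mask bits, are cleared from
+ * the reply just below.)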
+ */ + if (cmd == LL_IOC_MDC_GETINFO_V2 && + ll_i2info(inode)->lli_lsm_md != NULL) + valid &= ~(OBD_MD_FLSIZE | OBD_MD_FLBLOCKS); + + if (flagsp && copy_to_user(flagsp, &valid, + sizeof(*flagsp))) + GOTO(out_req, rc = -EFAULT); + + if (fidp && copy_to_user(fidp, &body->mbo_fid1, + sizeof(*fidp))) + GOTO(out_req, rc = -EFAULT); + + if (!(valid & OBD_MD_FLSIZE)) + stx.stx_mask &= ~STATX_SIZE; + if (!(valid & OBD_MD_FLBLOCKS)) + stx.stx_mask &= ~STATX_BLOCKS; + + if (stxp && copy_to_user(stxp, &stx, sizeof(stx))) + GOTO(out_req, rc = -EFAULT); + + if (lmmsizep && copy_to_user(lmmsizep, &lmmsize, + sizeof(*lmmsizep))) + GOTO(out_req, rc = -EFAULT); + } + + EXIT; +out_req: + ptlrpc_req_finished(request); + ptlrpc_req_finished(root_request); + if (filename) + ll_putname(filename); + return rc; + } + case OBD_IOC_QUOTACTL: { + struct if_quotactl *qctl; + int qctl_len = sizeof(*qctl) + LOV_MAXPOOLNAME + 1; + + OBD_ALLOC(qctl, qctl_len); + if (!qctl) + RETURN(-ENOMEM); + + if (copy_from_user(qctl, (void __user *)arg, sizeof(*qctl))) + GOTO(out_quotactl, rc = -EFAULT); + + if (LUSTRE_Q_CMD_IS_POOL(qctl->qc_cmd)) { + char __user *from = (char __user *)arg + + offsetof(typeof(*qctl), qc_poolname); + if (copy_from_user(qctl->qc_poolname, from, + LOV_MAXPOOLNAME + 1)) + GOTO(out_quotactl, rc = -EFAULT); + } + + rc = quotactl_ioctl(inode->i_sb, qctl); + if (rc == 0 && + copy_to_user((void __user *)arg, qctl, sizeof(*qctl))) + rc = -EFAULT; + +out_quotactl: + OBD_FREE(qctl, qctl_len); + RETURN(rc); + } + case OBD_IOC_GETNAME_OLD: + case OBD_IOC_GETDTNAME: + case OBD_IOC_GETMDNAME: + RETURN(ll_get_obd_name(inode, cmd, arg)); + case LL_IOC_HSM_STATE_GET: { + struct md_op_data *op_data; + struct hsm_user_state *hus; + int rc; + + OBD_ALLOC_PTR(hus); + if (hus == NULL) + RETURN(-ENOMEM); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, hus); + if (IS_ERR(op_data)) { + OBD_FREE_PTR(hus); + RETURN(PTR_ERR(op_data)); + } + + rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data), + op_data, NULL); + + if (copy_to_user((void __user *)arg, hus, sizeof(*hus))) + rc = -EFAULT; + + ll_finish_md_op_data(op_data); + OBD_FREE_PTR(hus); + RETURN(rc); + } + case LL_IOC_HSM_STATE_SET: { + struct hsm_state_set *hss; + int rc; + + OBD_ALLOC_PTR(hss); + if (hss == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) { + OBD_FREE_PTR(hss); + RETURN(-EFAULT); + } + + rc = ll_hsm_state_set(inode, hss); + + OBD_FREE_PTR(hss); + RETURN(rc); + } + case LL_IOC_HSM_ACTION: { + struct md_op_data *op_data; + struct hsm_current_action *hca; + int rc; + + OBD_ALLOC_PTR(hca); + if (hca == NULL) + RETURN(-ENOMEM); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, hca); + if (IS_ERR(op_data)) { + OBD_FREE_PTR(hca); + RETURN(PTR_ERR(op_data)); + } + + rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data), + op_data, NULL); + + if (copy_to_user((char __user *)arg, hca, sizeof(*hca))) + rc = -EFAULT; + + ll_finish_md_op_data(op_data); + OBD_FREE_PTR(hca); + RETURN(rc); + } + case LL_IOC_FLUSHCTX: + RETURN(ll_flush_ctx(inode)); + case LL_IOC_GETOBDCOUNT: { + u32 count, vallen; + struct obd_export *exp; + + if (copy_from_user(&count, (int __user *)arg, sizeof(int))) + RETURN(-EFAULT); + + /* get ost count when count is zero, get mdt count otherwise */ + exp = count ? 
sbi->ll_md_exp : sbi->ll_dt_exp; + vallen = sizeof(count); + rc = obd_get_info(NULL, exp, sizeof(KEY_TGT_COUNT), + KEY_TGT_COUNT, &vallen, &count); + if (rc) { + CERROR("get target count failed: %d\n", rc); + RETURN(rc); + } + + if (copy_to_user((int __user *)arg, &count, sizeof(int))) + RETURN(-EFAULT); + + RETURN(0); + } + case LL_IOC_PATH2FID: + if (copy_to_user((void __user *)arg, ll_inode2fid(inode), + sizeof(struct lu_fid))) + RETURN(-EFAULT); + RETURN(0); + case LL_IOC_GET_CONNECT_FLAGS: { + RETURN(obd_iocontrol(cmd, sbi->ll_md_exp, 0, NULL, + (void __user *)arg)); + } + case OBD_IOC_FID2PATH: + RETURN(ll_fid2path(inode, (void __user *)arg)); + case LL_IOC_GETPARENT: + RETURN(ll_getparent(file, (void __user *)arg)); + case LL_IOC_FID2MDTIDX: { + struct obd_export *exp = ll_i2mdexp(inode); + struct lu_fid fid; + __u32 index; + + if (copy_from_user(&fid, (const struct lu_fid __user *)arg, + sizeof(fid))) + RETURN(-EFAULT); + + /* Call mdc_iocontrol */ + rc = obd_iocontrol(LL_IOC_FID2MDTIDX, exp, sizeof(fid), &fid, + (__u32 __user *)&index); + if (rc != 0) + RETURN(rc); + + RETURN(index); + } + case LL_IOC_HSM_REQUEST: { + struct hsm_user_request *hur; + ssize_t totalsize; + + OBD_ALLOC_PTR(hur); + if (hur == NULL) + RETURN(-ENOMEM); + + /* We don't know the true size yet; copy the fixed-size part */ + if (copy_from_user(hur, (void __user *)arg, sizeof(*hur))) { + OBD_FREE_PTR(hur); + RETURN(-EFAULT); + } + + /* Compute the whole struct size */ + totalsize = hur_len(hur); + OBD_FREE_PTR(hur); + if (totalsize < 0) + RETURN(-E2BIG); + + /* Final size will be more than double totalsize */ + if (totalsize >= MDS_MAXREQSIZE / 3) + RETURN(-E2BIG); + + OBD_ALLOC_LARGE(hur, totalsize); + if (hur == NULL) + RETURN(-ENOMEM); + + /* Copy the whole struct */ + if (copy_from_user(hur, (void __user *)arg, totalsize)) + GOTO(out_hur, rc = -EFAULT); + + if (hur->hur_request.hr_action == HUA_RELEASE) { + const struct lu_fid *fid; + struct inode *f; + int i; + + for (i = 0; i < hur->hur_request.hr_itemcount; i++) { + fid = &hur->hur_user_item[i].hui_fid; + f = search_inode_for_lustre(inode->i_sb, fid); + if (IS_ERR(f)) { + rc = PTR_ERR(f); + break; + } + + rc = ll_hsm_release(f); + iput(f); + if (rc != 0) + break; + } + } else { + rc = obd_iocontrol(cmd, ll_i2mdexp(inode), totalsize, + hur, NULL); + } + +out_hur: + OBD_FREE_LARGE(hur, totalsize); + + RETURN(rc); + } + case LL_IOC_HSM_PROGRESS: { + struct hsm_progress_kernel hpk; + struct hsm_progress hp; + + if (copy_from_user(&hp, (void __user *)arg, sizeof(hp))) + RETURN(-EFAULT); + + hpk.hpk_fid = hp.hp_fid; + hpk.hpk_cookie = hp.hp_cookie; + hpk.hpk_extent = hp.hp_extent; + hpk.hpk_flags = hp.hp_flags; + hpk.hpk_errval = hp.hp_errval; + hpk.hpk_data_version = 0; + + /* File may not exist in Lustre; all progress + * reported to Lustre root */ + rc = obd_iocontrol(cmd, sbi->ll_md_exp, sizeof(hpk), &hpk, + NULL); + RETURN(rc); + } + case LL_IOC_HSM_CT_START: + if (!capable(CAP_SYS_ADMIN)) + RETURN(-EPERM); + + rc = copy_and_ct_start(cmd, sbi->ll_md_exp, + (struct lustre_kernelcomm __user *)arg); + RETURN(rc); + + case LL_IOC_HSM_COPY_START: { + struct hsm_copy *copy; + int rc; + + OBD_ALLOC_PTR(copy); + if (copy == NULL) + RETURN(-ENOMEM); + if (copy_from_user(copy, (char __user *)arg, sizeof(*copy))) { + OBD_FREE_PTR(copy); + RETURN(-EFAULT); + } + + rc = ll_ioc_copy_start(inode->i_sb, copy); + if (copy_to_user((char __user *)arg, copy, sizeof(*copy))) + rc = -EFAULT; + + OBD_FREE_PTR(copy); + RETURN(rc); + } + case LL_IOC_HSM_IMPORT: { + struct 
hsm_user_import *hui;
+
+ OBD_ALLOC_PTR(hui);
+ if (hui == NULL)
+ RETURN(-ENOMEM);
+
+ if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
+ OBD_FREE_PTR(hui);
+ RETURN(-EFAULT);
+ }
+
+ rc = ll_hsm_import(inode, file, hui);
+
+ CDEBUG(D_HSM, "MDLL hsm_state import: %d\n", rc);
+ OBD_FREE_PTR(hui);
+ RETURN(rc);
+ }
+ case LL_IOC_HSM_COPY_END: {
+ struct hsm_copy *copy;
+ int rc;
+
+ OBD_ALLOC_PTR(copy);
+ if (copy == NULL)
+ RETURN(-ENOMEM);
+ if (copy_from_user(copy, (char __user *)arg, sizeof(*copy))) {
+ OBD_FREE_PTR(copy);
+ RETURN(-EFAULT);
+ }
+
+ rc = ll_ioc_copy_end(inode->i_sb, copy);
+ CDEBUG(D_HSM, "MDLL hsm_copy_end: %d\n", rc);
+
+ if (copy_to_user((char __user *)arg, copy, sizeof(*copy)))
+ rc = -EFAULT;
+
+ OBD_FREE_PTR(copy);
+ RETURN(rc);
+ }
+ case LL_IOC_MIGRATE: {
+ struct lmv_user_md *lum;
+ int len;
+ char *filename;
+ int namelen = 0;
+ __u32 flags;
+ int rc;
+
+ rc = obd_ioctl_getdata(&data, &len, (void __user *)arg);
+ if (rc)
+ RETURN(rc);
+
+ if (data->ioc_inlbuf1 == NULL || data->ioc_inlbuf2 == NULL ||
+ data->ioc_inllen1 == 0 || data->ioc_inllen2 == 0)
+ GOTO(migrate_free, rc = -EINVAL);
+
+ filename = data->ioc_inlbuf1;
+ namelen = data->ioc_inllen1;
+ flags = data->ioc_type;
+
+ if (namelen < 1 || namelen != strlen(filename) + 1) {
+ CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n");
+ GOTO(migrate_free, rc = -EINVAL);
+ }
+
+ lum = (struct lmv_user_md *)data->ioc_inlbuf2;
+ if (lum->lum_magic != LMV_USER_MAGIC &&
+ lum->lum_magic != LMV_USER_MAGIC_SPECIFIC) {
+ rc = -EINVAL;
+ CERROR("%s: wrong lum magic %x: rc = %d\n",
+ filename, lum->lum_magic, rc);
+ GOTO(migrate_free, rc);
+ }
+
+ rc = ll_migrate(inode, file, lum, filename, flags);
+migrate_free:
+ OBD_FREE_LARGE(data, len);
+
+ RETURN(rc);
+ }
+ case FS_IOC_FSGETXATTR:
+ RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
+ case FS_IOC_FSSETXATTR:
+ RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
+ case LL_IOC_PROJECT:
+ RETURN(ll_ioctl_project(file, cmd, arg));
+ case LL_IOC_PCC_DETACH_BY_FID: {
+ struct lu_pcc_detach_fid *detach;
+ struct lu_fid *fid;
+ struct inode *inode2;
+ unsigned long ino;
+
+ OBD_ALLOC_PTR(detach);
+ if (detach == NULL)
+ RETURN(-ENOMEM);
+
+ if (copy_from_user(detach,
+ (const struct lu_pcc_detach_fid __user *)arg,
+ sizeof(*detach)))
+ GOTO(out_detach, rc = -EFAULT);
+
+ fid = &detach->pccd_fid;
+ ino = cl_fid_build_ino(fid, ll_need_32bit_api(sbi));
+ inode2 = ilookup5(inode->i_sb, ino, ll_test_inode_by_fid, fid);
+ if (inode2 == NULL)
+ /* Target inode is not in the inode cache, and the PCC
+ * file has already been released; return immediately. 
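+ * (ilookup5() only consults the inode cache and returns NULL on
+ * a miss without taking a reference, so there is nothing to
+ * clean up in that case.)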
+ */ + GOTO(out_detach, rc = 0); + + if (!S_ISREG(inode2->i_mode)) + GOTO(out_iput, rc = -EINVAL); + + if (!inode_owner_or_capable(&init_user_ns, inode2)) + GOTO(out_iput, rc = -EPERM); + + rc = pcc_ioctl_detach(inode2, detach->pccd_opt); +out_iput: + iput(inode2); +out_detach: + OBD_FREE_PTR(detach); + RETURN(rc); + } +#ifdef HAVE_LUSTRE_CRYPTO + case LL_IOC_SET_ENCRYPTION_POLICY: + if (!ll_sbi_has_encrypt(ll_i2sbi(inode))) + return -EOPNOTSUPP; + return llcrypt_ioctl_set_policy(file, (const void __user *)arg); + case LL_IOC_GET_ENCRYPTION_POLICY_EX: + if (!ll_sbi_has_encrypt(ll_i2sbi(inode))) + return -EOPNOTSUPP; + return llcrypt_ioctl_get_policy_ex(file, (void __user *)arg); + case LL_IOC_ADD_ENCRYPTION_KEY: + if (!ll_sbi_has_encrypt(ll_i2sbi(inode))) + return -EOPNOTSUPP; + rc = llcrypt_ioctl_add_key(file, (void __user *)arg); +#ifdef CONFIG_LL_ENCRYPTION + if (!rc) + sptlrpc_enc_pool_add_user(); +#endif + return rc; + case LL_IOC_REMOVE_ENCRYPTION_KEY: + if (!ll_sbi_has_encrypt(ll_i2sbi(inode))) + return -EOPNOTSUPP; + rc = llcrypt_ioctl_remove_key(file, (void __user *)arg); +#ifdef CONFIG_LL_ENCRYPTION + if (!rc) + sptlrpc_enc_pool_del_user(); +#endif + return rc; + case LL_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + if (!ll_sbi_has_encrypt(ll_i2sbi(inode))) + return -EOPNOTSUPP; + rc = llcrypt_ioctl_remove_key_all_users(file, + (void __user *)arg); +#ifdef CONFIG_LL_ENCRYPTION + if (!rc) + sptlrpc_enc_pool_del_user(); +#endif + return rc; + case LL_IOC_GET_ENCRYPTION_KEY_STATUS: + if (!ll_sbi_has_encrypt(ll_i2sbi(inode))) + return -EOPNOTSUPP; + return llcrypt_ioctl_get_key_status(file, (void __user *)arg); +#endif + default: + RETURN(obd_iocontrol(cmd, sbi->ll_dt_exp, 0, NULL, + (void __user *)arg)); + } +} + +static loff_t ll_dir_seek(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file->f_mapping->host; + struct ll_file_data *fd = file->private_data; + struct ll_sb_info *sbi = ll_i2sbi(inode); + int api32 = ll_need_32bit_api(sbi); + loff_t ret = -EINVAL; + ENTRY; + + inode_lock(inode); + switch (origin) { + case SEEK_SET: + break; + case SEEK_CUR: + offset += file->f_pos; + break; + case SEEK_END: + if (offset > 0) + GOTO(out, ret); + if (api32) + offset += LL_DIR_END_OFF_32BIT; + else + offset += LL_DIR_END_OFF; + break; + default: + GOTO(out, ret); + } + + if (offset >= 0 && + ((api32 && offset <= LL_DIR_END_OFF_32BIT) || + (!api32 && offset <= LL_DIR_END_OFF))) { + if (offset != file->f_pos) { + bool hash64; + + hash64 = test_bit(LL_SBI_64BIT_HASH, sbi->ll_flags); + if ((api32 && offset == LL_DIR_END_OFF_32BIT) || + (!api32 && offset == LL_DIR_END_OFF)) + fd->lfd_pos = MDS_DIR_END_OFF; + else if (api32 && hash64) + fd->lfd_pos = offset << 32; + else + fd->lfd_pos = offset; + file->f_pos = offset; + file->f_version = 0; + } + ret = offset; + } + GOTO(out, ret); + +out: + inode_unlock(inode); + return ret; +} + +static int ll_dir_open(struct inode *inode, struct file *file) +{ + ENTRY; + RETURN(ll_file_open(inode, file)); +} + +static int ll_dir_release(struct inode *inode, struct file *file) +{ + ENTRY; + RETURN(ll_file_release(inode, file)); +} + +/* notify error if partially read striped directory */ +static int ll_dir_flush(struct file *file, fl_owner_t id) +{ + struct ll_file_data *lfd = file->private_data; + int rc = lfd->fd_partial_readdir_rc; + + lfd->fd_partial_readdir_rc = 0; + + return rc; +} + +const struct file_operations ll_dir_operations = { + .llseek = ll_dir_seek, + .open = ll_dir_open, + .release = ll_dir_release, + .read = 
generic_read_dir, +#ifdef HAVE_DIR_CONTEXT + .iterate_shared = ll_iterate, +#else + .readdir = ll_readdir, +#endif + .unlocked_ioctl = ll_dir_ioctl, + .fsync = ll_fsync, + .flush = ll_dir_flush, +}; diff --git a/drivers/staging/lustrefsx/lustre/llite/file.c b/drivers/staging/lustrefsx/lustre/llite/file.c new file mode 100644 index 0000000000000..1b3d8d90d193c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/file.c @@ -0,0 +1,6260 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/llite/file.c + * + * Author: Peter Braam + * Author: Phil Schwan + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_LLITE +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "cl_object.h" +#include "llite_internal.h" +#include "vvp_internal.h" + +struct split_param { + struct inode *sp_inode; + __u16 sp_mirror_id; +}; + +struct pcc_param { + __u64 pa_data_version; + __u32 pa_archive_id; + __u32 pa_layout_gen; +}; + +static int +ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg); + +static int ll_lease_close(struct obd_client_handle *och, struct inode *inode, + bool *lease_broken); + +static struct ll_file_data *ll_file_data_get(void) +{ + struct ll_file_data *fd; + + OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS); + if (fd == NULL) + return NULL; + + fd->fd_write_failed = false; + pcc_file_init(&fd->fd_pcc_file); + + return fd; +} + +static void ll_file_data_put(struct ll_file_data *fd) +{ + if (fd != NULL) + OBD_SLAB_FREE_PTR(fd, ll_file_data_slab); +} + +/** + * Packs all the attributes into @op_data for the CLOSE rpc. + */ +static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data, + struct obd_client_handle *och) +{ + ENTRY; + + ll_prep_md_op_data(op_data, inode, NULL, NULL, + 0, 0, LUSTRE_OPC_ANY, NULL); + + op_data->op_attr.ia_mode = inode->i_mode; + op_data->op_attr.ia_atime = inode->i_atime; + op_data->op_attr.ia_mtime = inode->i_mtime; + op_data->op_attr.ia_ctime = inode->i_ctime; + /* In case of encrypted file without the key, visible size was rounded + * up to next LUSTRE_ENCRYPTION_UNIT_SIZE, and clear text size was + * stored into lli_lazysize in ll_merge_attr(), so set proper file size + * now that we are closing. 
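+ * (So ia_size set below is the exact clear-text size, not the
+ * rounded-up size the page cache sees without the key.)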
+ */ + if (llcrypt_require_key(inode) == -ENOKEY && + ll_i2info(inode)->lli_attr_valid & OBD_MD_FLLAZYSIZE) + op_data->op_attr.ia_size = ll_i2info(inode)->lli_lazysize; + else + op_data->op_attr.ia_size = i_size_read(inode); + op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET | + ATTR_MTIME | ATTR_MTIME_SET | + ATTR_CTIME); + op_data->op_xvalid |= OP_XVALID_CTIME_SET; + op_data->op_attr_blocks = inode->i_blocks; + op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags); + if (test_bit(LLIF_PROJECT_INHERIT, &ll_i2info(inode)->lli_flags)) + op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL; + op_data->op_open_handle = och->och_open_handle; + + if (och->och_flags & FMODE_WRITE && + test_and_clear_bit(LLIF_DATA_MODIFIED, &ll_i2info(inode)->lli_flags)) + /* For HSM: if inode data has been modified, pack it so that + * MDT can set data dirty flag in the archive. */ + op_data->op_bias |= MDS_DATA_MODIFIED; + + EXIT; +} + +/** + * Perform a close, possibly with a bias. + * The meaning of "data" depends on the value of "bias". + * + * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version. + * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to + * swap layouts with. + */ +static int ll_close_inode_openhandle(struct inode *inode, + struct obd_client_handle *och, + enum mds_op_bias bias, void *data) +{ + struct obd_export *md_exp = ll_i2mdexp(inode); + const struct ll_inode_info *lli = ll_i2info(inode); + struct md_op_data *op_data; + struct ptlrpc_request *req = NULL; + int rc; + ENTRY; + + if (class_exp2obd(md_exp) == NULL) { + CERROR("%s: invalid MDC connection handle closing "DFID"\n", + ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid)); + GOTO(out, rc = 0); + } + + OBD_ALLOC_PTR(op_data); + /* We leak openhandle and request here on error, but not much to be + * done in OOM case since app won't retry close on error either. 
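+ * A handle leaked this way is eventually reclaimed when the MDS
+ * cleans up the client export (e.g. on eviction).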
*/ + if (op_data == NULL) + GOTO(out, rc = -ENOMEM); + + ll_prepare_close(inode, op_data, och); + switch (bias) { + case MDS_CLOSE_LAYOUT_MERGE: + /* merge blocks from the victim inode */ + op_data->op_attr_blocks += ((struct inode *)data)->i_blocks; + op_data->op_attr.ia_valid |= ATTR_SIZE; + op_data->op_xvalid |= OP_XVALID_BLOCKS; + fallthrough; + case MDS_CLOSE_LAYOUT_SPLIT: + case MDS_CLOSE_LAYOUT_SWAP: { + struct split_param *sp = data; + + LASSERT(data != NULL); + op_data->op_bias |= bias; + op_data->op_data_version = 0; + op_data->op_lease_handle = och->och_lease_handle; + if (bias == MDS_CLOSE_LAYOUT_SPLIT) { + op_data->op_fid2 = *ll_inode2fid(sp->sp_inode); + op_data->op_mirror_id = sp->sp_mirror_id; + } else { + op_data->op_fid2 = *ll_inode2fid(data); + } + break; + } + + case MDS_CLOSE_RESYNC_DONE: { + struct ll_ioc_lease *ioc = data; + + LASSERT(data != NULL); + op_data->op_attr_blocks += + ioc->lil_count * op_data->op_attr_blocks; + op_data->op_attr.ia_valid |= ATTR_SIZE; + op_data->op_xvalid |= OP_XVALID_BLOCKS; + op_data->op_bias |= MDS_CLOSE_RESYNC_DONE; + + op_data->op_lease_handle = och->och_lease_handle; + op_data->op_data = &ioc->lil_ids[0]; + op_data->op_data_size = + ioc->lil_count * sizeof(ioc->lil_ids[0]); + break; + } + + case MDS_PCC_ATTACH: { + struct pcc_param *param = data; + + LASSERT(data != NULL); + op_data->op_bias |= MDS_HSM_RELEASE | MDS_PCC_ATTACH; + op_data->op_archive_id = param->pa_archive_id; + op_data->op_data_version = param->pa_data_version; + op_data->op_lease_handle = och->och_lease_handle; + break; + } + + case MDS_HSM_RELEASE: + LASSERT(data != NULL); + op_data->op_bias |= MDS_HSM_RELEASE; + op_data->op_data_version = *(__u64 *)data; + op_data->op_lease_handle = och->och_lease_handle; + op_data->op_attr.ia_valid |= ATTR_SIZE; + op_data->op_xvalid |= OP_XVALID_BLOCKS; + break; + + default: + LASSERT(data == NULL); + break; + } + + if (!(op_data->op_attr.ia_valid & ATTR_SIZE)) + op_data->op_xvalid |= OP_XVALID_LAZYSIZE; + if (!(op_data->op_xvalid & OP_XVALID_BLOCKS)) + op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS; + + rc = md_close(md_exp, op_data, och->och_mod, &req); + if (rc != 0 && rc != -EINTR) + CERROR("%s: inode "DFID" mdc close failed: rc = %d\n", + md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc); + + if (rc == 0 && op_data->op_bias & bias) { + struct mdt_body *body; + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED)) + rc = -EBUSY; + + if (bias & MDS_PCC_ATTACH) { + struct pcc_param *param = data; + + param->pa_layout_gen = body->mbo_layout_gen; + } + } + + ll_finish_md_op_data(op_data); + EXIT; +out: + + md_clear_open_replay_data(md_exp, och); + och->och_open_handle.cookie = DEAD_HANDLE_MAGIC; + OBD_FREE_PTR(och); + + ptlrpc_req_finished(req); /* This is close request */ + return rc; +} + +int ll_md_real_close(struct inode *inode, fmode_t fmode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_client_handle **och_p; + struct obd_client_handle *och; + __u64 *och_usecount; + int rc = 0; + ENTRY; + + if (fmode & FMODE_WRITE) { + och_p = &lli->lli_mds_write_och; + och_usecount = &lli->lli_open_fd_write_count; + } else if (fmode & FMODE_EXEC) { + och_p = &lli->lli_mds_exec_och; + och_usecount = &lli->lli_open_fd_exec_count; + } else { + LASSERT(fmode & FMODE_READ); + och_p = &lli->lli_mds_read_och; + och_usecount = &lli->lli_open_fd_read_count; + } + + mutex_lock(&lli->lli_och_mutex); + if (*och_usecount > 0) { + /* There are still users of this handle, so 
skip
+ * freeing it. */
+ mutex_unlock(&lli->lli_och_mutex);
+ RETURN(0);
+ }
+
+ och = *och_p;
+ *och_p = NULL;
+ mutex_unlock(&lli->lli_och_mutex);
+
+ if (och != NULL) {
+ /* There might be a race and this handle may already
+ * be closed. */
+ rc = ll_close_inode_openhandle(inode, och, 0, NULL);
+ }
+
+ RETURN(rc);
+}
+
+static int ll_md_close(struct inode *inode, struct file *file)
+{
+ union ldlm_policy_data policy = {
+ .l_inodebits = { MDS_INODELOCK_OPEN },
+ };
+ __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
+ struct ll_file_data *fd = file->private_data;
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct lustre_handle lockh;
+ enum ldlm_mode lockmode;
+ int rc = 0;
+ ENTRY;
+
+ /* clear group lock, if present */
+ if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
+ ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
+
+ mutex_lock(&lli->lli_och_mutex);
+ if (fd->fd_lease_och != NULL) {
+ bool lease_broken;
+ struct obd_client_handle *lease_och;
+
+ lease_och = fd->fd_lease_och;
+ fd->fd_lease_och = NULL;
+ mutex_unlock(&lli->lli_och_mutex);
+
+ /* Usually the lease is not released when the
+ * application has crashed, so release it here. */
+ rc = ll_lease_close(lease_och, inode, &lease_broken);
+
+ mutex_lock(&lli->lli_och_mutex);
+
+ CDEBUG_LIMIT(rc ? D_ERROR : D_INODE,
+ "Clean up lease "DFID" %d/%d\n",
+ PFID(&lli->lli_fid), rc, lease_broken);
+ }
+
+ if (fd->fd_och != NULL) {
+ struct obd_client_handle *och;
+
+ och = fd->fd_och;
+ fd->fd_och = NULL;
+ mutex_unlock(&lli->lli_och_mutex);
+
+ rc = ll_close_inode_openhandle(inode, och, 0, NULL);
+ GOTO(out, rc);
+ }
+
+ /* Let's see if we have a good enough OPEN lock on the file and if
+ * we can skip talking to the MDS */
+ if (fd->fd_omode & FMODE_WRITE) {
+ lockmode = LCK_CW;
+ LASSERT(lli->lli_open_fd_write_count);
+ lli->lli_open_fd_write_count--;
+ } else if (fd->fd_omode & FMODE_EXEC) {
+ lockmode = LCK_PR;
+ LASSERT(lli->lli_open_fd_exec_count);
+ lli->lli_open_fd_exec_count--;
+ } else {
+ lockmode = LCK_CR;
+ LASSERT(lli->lli_open_fd_read_count);
+ lli->lli_open_fd_read_count--;
+ }
+ mutex_unlock(&lli->lli_och_mutex);
+
+ /* LU-4398: do not cache write open lock if the file has exec bit */
+ if ((lockmode == LCK_CW && inode->i_mode & S_IXUGO) ||
+ !md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
+ LDLM_IBITS, &policy, lockmode, &lockh))
+ rc = ll_md_real_close(inode, fd->fd_omode);
+
+out:
+ file->private_data = NULL;
+ ll_file_data_put(fd);
+
+ RETURN(rc);
+}
+
+/* While this returns an error code, the fput() caller ignores it, so we need
+ * to make every effort to clean up all of our state here. Also, applications
+ * rarely check close errors, and even if an error is returned they will not
+ * re-try the close call.
+ */
+int ll_file_release(struct inode *inode, struct file *file)
+{
+ struct ll_file_data *fd;
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct ll_inode_info *lli = ll_i2info(inode);
+ ktime_t kstart = ktime_get();
+ int rc;
+
+ ENTRY;
+
+ CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
+ PFID(ll_inode2fid(inode)), inode);
+
+ fd = file->private_data;
+ LASSERT(fd != NULL);
+
+ /* The last ref on @file may not be held by the owner pid of statahead,
+ * because parent and child processes can share the same file handle. 
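+ * That is why the statahead check below compares lli_opendir_key
+ * against this ll_file_data rather than the current pid.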
*/ + if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd) + ll_deauthorize_statahead(inode, fd); + + if (is_root_inode(inode)) { + file->private_data = NULL; + ll_file_data_put(fd); + GOTO(out, rc = 0); + } + + pcc_file_release(inode, file); + + if (!S_ISDIR(inode->i_mode)) { + if (lli->lli_clob != NULL) + lov_read_and_clear_async_rc(lli->lli_clob); + lli->lli_async_rc = 0; + } + + lli->lli_close_fd_time = ktime_get(); + + rc = ll_md_close(inode, file); + + if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val)) + libcfs_debug_dumplog(); + +out: + if (!rc && !is_root_inode(inode)) + ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, + ktime_us_delta(ktime_get(), kstart)); + RETURN(rc); +} + +static inline int ll_dom_readpage(void *data, struct page *page) +{ + /* since ll_dom_readpage is a page cache helper, it is safe to assume + * mapping and host pointers are set here + */ + struct inode *inode; + struct niobuf_local *lnb = data; + void *kaddr; + int rc = 0; + + inode = page2inode(page); + + kaddr = kmap_atomic(page); + memcpy(kaddr, lnb->lnb_data, lnb->lnb_len); + if (lnb->lnb_len < PAGE_SIZE) + memset(kaddr + lnb->lnb_len, 0, + PAGE_SIZE - lnb->lnb_len); + flush_dcache_page(page); + SetPageUptodate(page); + kunmap_atomic(kaddr); + + if (inode && IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) { + if (!llcrypt_has_encryption_key(inode)) { + CDEBUG(D_SEC, "no enc key for "DFID"\n", + PFID(ll_inode2fid(inode))); + rc = -ENOKEY; + } else { + unsigned int offs = 0; + + while (offs < PAGE_SIZE) { + /* decrypt only if page is not empty */ + if (memcmp(page_address(page) + offs, + page_address(ZERO_PAGE(0)), + LUSTRE_ENCRYPTION_UNIT_SIZE) == 0) + break; + + rc = llcrypt_decrypt_pagecache_blocks(page, + LUSTRE_ENCRYPTION_UNIT_SIZE, + offs); + if (rc) + break; + + offs += LUSTRE_ENCRYPTION_UNIT_SIZE; + } + } + } + unlock_page(page); + + return rc; +} + +#ifdef HAVE_READ_CACHE_PAGE_WANTS_FILE +static inline int ll_dom_read_folio(struct file *file, struct folio *folio0) +{ + return ll_dom_readpage(file->private_data, folio_page(folio0, 0)); +} +#else +#define ll_dom_read_folio ll_dom_readpage +#endif + +void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req) +{ + struct lu_env *env; + struct cl_io *io; + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_object *obj = lli->lli_clob; + struct address_space *mapping = inode->i_mapping; + struct page *vmpage; + struct niobuf_remote *rnb; + struct mdt_body *body; + char *data; + unsigned long index, start; + struct niobuf_local lnb; + __u16 refcheck; + int rc; + + ENTRY; + + if (obj == NULL) + RETURN_EXIT; + + if (!req_capsule_field_present(&req->rq_pill, &RMF_NIOBUF_INLINE, + RCL_SERVER)) + RETURN_EXIT; + + rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE); + if (rnb == NULL || rnb->rnb_len == 0) + RETURN_EXIT; + + /* LU-11595: Server may return whole file and that is OK always or + * it may return just file tail and its offset must be aligned with + * client PAGE_SIZE to be used on that client, if server's PAGE_SIZE is + * smaller then offset may be not aligned and that data is just ignored. + */ + if (rnb->rnb_offset & ~PAGE_MASK) + RETURN_EXIT; + + /* Server returns whole file or just file tail if it fills in reply + * buffer, in both cases total size should be equal to the file size. 
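+ * (Encrypted files are exempted from the size check below because
+ * their apparent size is rounded up to the encryption unit size.)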
+ */ + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (rnb->rnb_offset + rnb->rnb_len != body->mbo_dom_size && + !(inode && IS_ENCRYPTED(inode))) { + CERROR("%s: server returns off/len %llu/%u but size %llu\n", + ll_i2sbi(inode)->ll_fsname, rnb->rnb_offset, + rnb->rnb_len, body->mbo_dom_size); + RETURN_EXIT; + } + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN_EXIT; + io = vvp_env_thread_io(env); + io->ci_obj = obj; + io->ci_ignore_layout = 1; + rc = cl_io_init(env, io, CIT_MISC, obj); + if (rc) + GOTO(out_io, rc); + + CDEBUG(D_INFO, "Get data along with open at %llu len %i, size %llu\n", + rnb->rnb_offset, rnb->rnb_len, body->mbo_dom_size); + + data = (char *)rnb + sizeof(*rnb); + + lnb.lnb_file_offset = rnb->rnb_offset; + start = lnb.lnb_file_offset >> PAGE_SHIFT; + index = 0; + LASSERT((lnb.lnb_file_offset & ~PAGE_MASK) == 0); + lnb.lnb_page_offset = 0; + do { + struct cl_page *page; + + lnb.lnb_data = data + (index << PAGE_SHIFT); + lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT); + if (lnb.lnb_len > PAGE_SIZE) + lnb.lnb_len = PAGE_SIZE; + + vmpage = ll_read_cache_page(mapping, index + start, + ll_dom_read_folio, &lnb); + if (IS_ERR(vmpage)) { + CWARN("%s: cannot fill page %lu for "DFID + " with data: rc = %li\n", + ll_i2sbi(inode)->ll_fsname, index + start, + PFID(lu_object_fid(&obj->co_lu)), + PTR_ERR(vmpage)); + break; + } + lock_page(vmpage); + if (vmpage->mapping == NULL) { + unlock_page(vmpage); + put_page(vmpage); + /* page was truncated */ + break; + } + /* attach VM page to CL page cache */ + page = cl_page_find(env, obj, vmpage->index, vmpage, + CPT_CACHEABLE); + if (IS_ERR(page)) { + ClearPageUptodate(vmpage); + unlock_page(vmpage); + put_page(vmpage); + break; + } + cl_page_export(env, page, 1); + cl_page_put(env, page); + unlock_page(vmpage); + put_page(vmpage); + index++; + } while (rnb->rnb_len > (index << PAGE_SHIFT)); + +out_io: + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + + EXIT; +} + +static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize, + struct lookup_intent *itp) +{ + struct ll_sb_info *sbi = ll_i2sbi(de->d_inode); + struct dentry *parent = de->d_parent; + char *name = NULL; + int len = 0; + struct md_op_data *op_data; + struct ptlrpc_request *req = NULL; + int rc; + ENTRY; + + LASSERT(parent != NULL); + LASSERT(itp->it_flags & MDS_OPEN_BY_FID); + + /* if server supports open-by-fid, or file name is invalid, don't pack + * name in open request */ + if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_OPEN_BY_NAME) || + !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) { +retry: + len = de->d_name.len; + name = kmalloc(len + 1, GFP_NOFS); + if (!name) + RETURN(-ENOMEM); + + /* race here */ + spin_lock(&de->d_lock); + if (len != de->d_name.len) { + spin_unlock(&de->d_lock); + kfree(name); + goto retry; + } + memcpy(name, de->d_name.name, len); + name[len] = '\0'; + spin_unlock(&de->d_lock); + + if (!lu_name_is_valid_2(name, len)) { + kfree(name); + RETURN(-ESTALE); + } + } + + op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode, + name, len, 0, LUSTRE_OPC_OPEN, NULL); + if (IS_ERR(op_data)) { + kfree(name); + RETURN(PTR_ERR(op_data)); + } + op_data->op_data = lmm; + op_data->op_data_size = lmmsize; + + OBD_FAIL_TIMEOUT(OBD_FAIL_LLITE_OPEN_DELAY, cfs_fail_val); + + rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req, + &ll_md_blocking_ast, 0); + kfree(name); + ll_finish_md_op_data(op_data); + if (rc == -ESTALE) { + /* reason for keep own exit path - don`t flood log + * with messages with 
-ESTALE errors. + */ + if (!it_disposition(itp, DISP_OPEN_OPEN) || + it_open_error(DISP_OPEN_OPEN, itp)) + GOTO(out, rc); + ll_release_openhandle(de, itp); + GOTO(out, rc); + } + + if (it_disposition(itp, DISP_LOOKUP_NEG)) + GOTO(out, rc = -ENOENT); + + if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) { + rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp); + CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc); + GOTO(out, rc); + } + + rc = ll_prep_inode(&de->d_inode, &req->rq_pill, NULL, itp); + + if (!rc && itp->it_lock_mode) { + __u64 bits = 0; + + /* If we got a lock back and it has a LOOKUP bit set, + * make sure the dentry is marked as valid so we can find it. + * We don't need to care about actual hashing since other bits + * of kernel will deal with that later. + */ + ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, &bits); + if (bits & MDS_INODELOCK_LOOKUP) { + d_lustre_revalidate(de); + ll_update_dir_depth(parent->d_inode, de->d_inode); + } + + /* if DoM bit returned along with LAYOUT bit then there + * can be read-on-open data returned. + */ + if (bits & MDS_INODELOCK_DOM && bits & MDS_INODELOCK_LAYOUT) + ll_dom_finish_open(de->d_inode, req); + } + +out: + ptlrpc_req_finished(req); + ll_intent_drop_lock(itp); + + /* We did open by fid, but by the time we got to the server, the object + * disappeared. This is possible if the object was unlinked, but it's + * also possible if the object was unlinked by a rename. In the case + * of an object renamed over our existing one, we can't fail this open. + * O_CREAT also goes through this path if we had an existing dentry, + * and it's obviously wrong to return ENOENT for O_CREAT. + * + * Instead let's return -ESTALE, and the VFS will retry the open with + * LOOKUP_REVAL, which we catch in ll_revalidate_dentry and fail to + * revalidate, causing a lookup. This causes extra lookups in the case + * where we had a dentry in cache but the file is being unlinked and we + * lose the race with unlink, but this should be very rare. 
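+ * (The VFS retries an -ESTALE open at most once, with LOOKUP_REVAL
+ * set, so this cannot loop.)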
+ */ + if (rc == -ENOENT) + rc = -ESTALE; + + RETURN(rc); +} + +static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it, + struct obd_client_handle *och) +{ + struct mdt_body *body; + + body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY); + och->och_open_handle = body->mbo_open_handle; + och->och_fid = body->mbo_fid1; + och->och_lease_handle.cookie = it->it_lock_handle; + och->och_magic = OBD_CLIENT_HANDLE_MAGIC; + och->och_flags = it->it_flags; + + return md_set_open_replay_data(md_exp, och, it); +} + +static int ll_local_open(struct file *file, struct lookup_intent *it, + struct ll_file_data *fd, struct obd_client_handle *och) +{ + struct inode *inode = file_inode(file); + ENTRY; + + LASSERT(!file->private_data); + + LASSERT(fd != NULL); + + if (och) { + int rc; + + rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och); + if (rc != 0) + RETURN(rc); + } + + file->private_data = fd; + ll_readahead_init(inode, &fd->fd_ras); + fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC); + + RETURN(0); +} + +void ll_track_file_opens(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + + /* do not skew results with delays from never-opened inodes */ + if (ktime_to_ns(lli->lli_close_fd_time)) + ll_stats_ops_tally(sbi, LPROC_LL_INODE_OPCLTM, + ktime_us_delta(ktime_get(), lli->lli_close_fd_time)); + + if (ktime_after(ktime_get(), + ktime_add_ms(lli->lli_close_fd_time, + sbi->ll_oc_max_ms))) { + lli->lli_open_fd_count = 1; + lli->lli_close_fd_time = ns_to_ktime(0); + } else { + lli->lli_open_fd_count++; + } + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_OCOUNT, + lli->lli_open_fd_count); +} + +/* Open a file, and (for the very first open) create objects on the OSTs at + * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object + * creation or open until ll_lov_setstripe() ioctl is called. + * + * If we already have the stripe MD locally then we don't request it in + * md_open(), by passing a lmm_size = 0. + * + * It is up to the application to ensure no other processes open this file + * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be + * used. We might be able to avoid races of that sort by getting lli_open_sem + * before returning in the O_LOV_DELAY_CREATE case and dropping it here + * or in ll_file_release(), but I'm not sure that is desirable/necessary. 
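+ *
+ * Illustrative userspace sequence for the delayed-create path
+ * (error handling omitted):
+ *
+ *   fd = open(path, O_RDWR | O_CREAT | O_LOV_DELAY_CREATE, 0644);
+ *   ioctl(fd, LL_IOC_LOV_SETSTRIPE, &lum); /* objects created here */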
+ */ +int ll_file_open(struct inode *inode, struct file *file) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct lookup_intent *it, oit = { .it_op = IT_OPEN, + .it_flags = file->f_flags }; + struct obd_client_handle **och_p = NULL; + __u64 *och_usecount = NULL; + struct ll_file_data *fd; + ktime_t kstart = ktime_get(); + int rc = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n", + PFID(ll_inode2fid(inode)), inode, file->f_flags); + + it = file->private_data; /* XXX: compat macro */ + file->private_data = NULL; /* prevent ll_local_open assertion */ + + if (S_ISREG(inode->i_mode)) { + rc = ll_file_open_encrypt(inode, file); + if (rc) { + if (it && it->it_disposition) + ll_release_openhandle(file_dentry(file), it); + GOTO(out_nofiledata, rc); + } + } + + fd = ll_file_data_get(); + if (fd == NULL) + GOTO(out_nofiledata, rc = -ENOMEM); + + fd->fd_file = file; + if (S_ISDIR(inode->i_mode)) + ll_authorize_statahead(inode, fd); + + ll_track_file_opens(inode); + if (is_root_inode(inode)) { + file->private_data = fd; + RETURN(0); + } + + if (!it || !it->it_disposition) { + CDEBUG(D_HSM, "MDLL file->f_flags=0x%x/0%o\n", + file->f_flags, file->f_flags); + /* Convert f_flags into access mode. We cannot use file->f_mode, + * because everything but O_ACCMODE mask was stripped from + * there */ + if ((oit.it_flags + 1) & O_ACCMODE) + oit.it_flags++; + if (file->f_flags & O_TRUNC) + oit.it_flags |= FMODE_WRITE; + + /* kernel only call f_op->open in dentry_open. filp_open calls + * dentry_open after call to open_namei that checks permissions. + * Only nfsd_open call dentry_open directly without checking + * permissions and because of that this code below is safe. + */ + if (oit.it_flags & (FMODE_WRITE | FMODE_READ)) + oit.it_flags |= MDS_OPEN_OWNEROVERRIDE; + + /* We do not want O_EXCL here, presumably we opened the file + * already? XXX - NFS implications? */ + oit.it_flags &= ~O_EXCL; + + /* bug20584, if "it_flags" contains O_CREAT, the file will be + * created if necessary, then "IT_CREAT" should be set to keep + * consistent with it */ + if (oit.it_flags & O_CREAT) + oit.it_op |= IT_CREAT; + + it = &oit; + } + +restart: + /* Let's see if we have file open on MDS already. */ + if (it->it_flags & FMODE_WRITE) { + och_p = &lli->lli_mds_write_och; + och_usecount = &lli->lli_open_fd_write_count; + } else if (it->it_flags & FMODE_EXEC) { + och_p = &lli->lli_mds_exec_och; + och_usecount = &lli->lli_open_fd_exec_count; + } else { + och_p = &lli->lli_mds_read_och; + och_usecount = &lli->lli_open_fd_read_count; + } + + mutex_lock(&lli->lli_och_mutex); + if (*och_p) { /* Open handle is present */ + if (it_disposition(it, DISP_OPEN_OPEN)) { + /* Well, there's extra open request that we do not need, + * let's close it somehow. This will decref request. 
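+ * (ll_release_openhandle() below performs that close against the
+ * MDS.)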
*/ + rc = it_open_error(DISP_OPEN_OPEN, it); + if (rc) { + mutex_unlock(&lli->lli_och_mutex); + GOTO(out_openerr, rc); + } + + ll_release_openhandle(file_dentry(file), it); + } + (*och_usecount)++; + + rc = ll_local_open(file, it, fd, NULL); + if (rc) { + (*och_usecount)--; + mutex_unlock(&lli->lli_och_mutex); + GOTO(out_openerr, rc); + } + } else { + LASSERT(*och_usecount == 0); + if (!it->it_disposition) { + struct dentry *dentry = file_dentry(file); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_dentry_data *ldd; + + /* We cannot just request lock handle now, new ELC code + * means that one of other OPEN locks for this file + * could be cancelled, and since blocking ast handler + * would attempt to grab och_mutex as well, that would + * result in a deadlock + */ + mutex_unlock(&lli->lli_och_mutex); + /* + * Normally called under two situations: + * 1. NFS export. + * 2. A race/condition on MDS resulting in no open + * handle to be returned from LOOKUP|OPEN request, + * for example if the target entry was a symlink. + * + * In NFS path we know there's pathologic behavior + * so we always enable open lock caching when coming + * from there. It's detected by setting a flag in + * ll_iget_for_nfs. + * + * After reaching number of opens of this inode + * we always ask for an open lock on it to handle + * bad userspace actors that open and close files + * in a loop for absolutely no good reason + */ + + ldd = ll_d2d(dentry); + if (filename_is_volatile(dentry->d_name.name, + dentry->d_name.len, + NULL)) { + /* There really is nothing here, but this + * make this more readable I think. + * We do not want openlock for volatile + * files under any circumstances + */ + } else if (ldd && ldd->lld_nfs_dentry) { + /* NFS path. This also happens to catch + * open by fh files I guess + */ + it->it_flags |= MDS_OPEN_LOCK; + /* clear the flag for future lookups */ + ldd->lld_nfs_dentry = 0; + } else if (sbi->ll_oc_thrsh_count > 0) { + /* Take MDS_OPEN_LOCK with many opens */ + if (lli->lli_open_fd_count >= + sbi->ll_oc_thrsh_count) + it->it_flags |= MDS_OPEN_LOCK; + + /* If this is open after we just closed */ + else if (ktime_before(ktime_get(), + ktime_add_ms(lli->lli_close_fd_time, + sbi->ll_oc_thrsh_ms))) + it->it_flags |= MDS_OPEN_LOCK; + } + + /* + * Always specify MDS_OPEN_BY_FID because we don't want + * to get file with different fid. + */ + it->it_flags |= MDS_OPEN_BY_FID; + rc = ll_intent_file_open(dentry, NULL, 0, it); + if (rc) + GOTO(out_openerr, rc); + + goto restart; + } + OBD_ALLOC(*och_p, sizeof(struct obd_client_handle)); + if (!*och_p) + GOTO(out_och_free, rc = -ENOMEM); + + (*och_usecount)++; + + /* md_intent_lock() didn't get a request ref if there was an + * open error, so don't do cleanup on the request here + * (bug 3430) */ + /* XXX (green): Should not we bail out on any error here, not + * just open error? 
*/ + rc = it_open_error(DISP_OPEN_OPEN, it); + if (rc != 0) + GOTO(out_och_free, rc); + + LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF), + "inode %p: disposition %x, status %d\n", inode, + it_disposition(it, ~0), it->it_status); + + rc = ll_local_open(file, it, fd, *och_p); + if (rc) + GOTO(out_och_free, rc); + } + + rc = pcc_file_open(inode, file); + if (rc) + GOTO(out_och_free, rc); + + mutex_unlock(&lli->lli_och_mutex); + + fd = NULL; + + /* Must do this outside lli_och_mutex lock to prevent deadlock where + different kind of OPEN lock for this same inode gets cancelled + by ldlm_cancel_lru */ + if (!S_ISREG(inode->i_mode)) + GOTO(out_och_free, rc); + cl_lov_delay_create_clear(&file->f_flags); + cl_lu_noimport_clear(&file->f_flags); + GOTO(out_och_free, rc); + +out_och_free: + if (rc) { + if (och_p && *och_p) { + OBD_FREE(*och_p, sizeof(struct obd_client_handle)); + *och_p = NULL; /* OBD_FREE writes some magic there */ + (*och_usecount)--; + } + mutex_unlock(&lli->lli_och_mutex); + +out_openerr: + if (lli->lli_opendir_key == fd) + ll_deauthorize_statahead(inode, fd); + + if (fd != NULL) + ll_file_data_put(fd); + } else { + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, + ktime_us_delta(ktime_get(), kstart)); + } + +out_nofiledata: + if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) { + ptlrpc_req_finished(it->it_request); + it_clear_disposition(it, DISP_ENQ_OPEN_REF); + } + + return rc; +} + +static int ll_md_blocking_lease_ast(struct ldlm_lock *lock, + struct ldlm_lock_desc *desc, void *data, int flag) +{ + int rc; + struct lustre_handle lockh; + ENTRY; + + switch (flag) { + case LDLM_CB_BLOCKING: + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); + if (rc < 0) { + CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc); + RETURN(rc); + } + break; + case LDLM_CB_CANCELING: + /* do nothing */ + break; + } + RETURN(0); +} + +/** + * When setting a lease on a file, we take ownership of the lli_mds_*_och + * and save it as fd->fd_och so as to force client to reopen the file even + * if it has an open lock in cache already. + */ +static int ll_lease_och_acquire(struct inode *inode, struct file *file, + struct lustre_handle *old_open_handle) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = file->private_data; + struct obd_client_handle **och_p; + __u64 *och_usecount; + int rc = 0; + ENTRY; + + /* Get the openhandle of the file */ + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_lease_och != NULL) + GOTO(out_unlock, rc = -EBUSY); + + if (fd->fd_och == NULL) { + if (file->f_mode & FMODE_WRITE) { + LASSERT(lli->lli_mds_write_och != NULL); + och_p = &lli->lli_mds_write_och; + och_usecount = &lli->lli_open_fd_write_count; + } else { + LASSERT(lli->lli_mds_read_och != NULL); + och_p = &lli->lli_mds_read_och; + och_usecount = &lli->lli_open_fd_read_count; + } + + if (*och_usecount > 1) + GOTO(out_unlock, rc = -EBUSY); + + fd->fd_och = *och_p; + *och_usecount = 0; + *och_p = NULL; + } + + *old_open_handle = fd->fd_och->och_open_handle; + + EXIT; +out_unlock: + mutex_unlock(&lli->lli_och_mutex); + return rc; +} + +/** + * Release ownership on lli_mds_*_och when putting back a file lease. 
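+ *
+ * If the lease was broken and another open has re-populated
+ * lli_mds_*_och in the meantime, the saved fd_och is closed instead
+ * of being put back (see the *och_p != NULL branch below).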
+ */ +static int ll_lease_och_release(struct inode *inode, struct file *file) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = file->private_data; + struct obd_client_handle **och_p; + struct obd_client_handle *old_och = NULL; + __u64 *och_usecount; + int rc = 0; + ENTRY; + + mutex_lock(&lli->lli_och_mutex); + if (file->f_mode & FMODE_WRITE) { + och_p = &lli->lli_mds_write_och; + och_usecount = &lli->lli_open_fd_write_count; + } else { + och_p = &lli->lli_mds_read_och; + och_usecount = &lli->lli_open_fd_read_count; + } + + /* The file may have been open by another process (broken lease) so + * *och_p is not NULL. In this case we should simply increase usecount + * and close fd_och. + */ + if (*och_p != NULL) { + old_och = fd->fd_och; + (*och_usecount)++; + } else { + *och_p = fd->fd_och; + *och_usecount = 1; + } + fd->fd_och = NULL; + mutex_unlock(&lli->lli_och_mutex); + + if (old_och != NULL) + rc = ll_close_inode_openhandle(inode, old_och, 0, NULL); + + RETURN(rc); +} + +/** + * Acquire a lease and open the file. + */ +static struct obd_client_handle * +ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode, + __u64 open_flags) +{ + struct lookup_intent it = { .it_op = IT_OPEN }; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct md_op_data *op_data; + struct ptlrpc_request *req = NULL; + struct lustre_handle old_open_handle = { 0 }; + struct obd_client_handle *och = NULL; + int rc; + int rc2; + ENTRY; + + if (fmode != FMODE_WRITE && fmode != FMODE_READ) + RETURN(ERR_PTR(-EINVAL)); + + if (file != NULL) { + if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC)) + RETURN(ERR_PTR(-EPERM)); + + rc = ll_lease_och_acquire(inode, file, &old_open_handle); + if (rc) + RETURN(ERR_PTR(rc)); + } + + OBD_ALLOC_PTR(och); + if (och == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + GOTO(out, rc = PTR_ERR(op_data)); + + /* To tell the MDT this openhandle is from the same owner */ + op_data->op_open_handle = old_open_handle; + + it.it_flags = fmode | open_flags; + it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE; + rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req, + &ll_md_blocking_lease_ast, + /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise + * it can be cancelled which may mislead applications that the lease is + * broken; + * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal + * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast + * doesn't deal with openhandle, so normal openhandle will be leaked. */ + LDLM_FL_NO_LRU | LDLM_FL_EXCL); + ll_finish_md_op_data(op_data); + ptlrpc_req_finished(req); + if (rc < 0) + GOTO(out_release_it, rc); + + if (it_disposition(&it, DISP_LOOKUP_NEG)) + GOTO(out_release_it, rc = -ENOENT); + + rc = it_open_error(DISP_OPEN_OPEN, &it); + if (rc) + GOTO(out_release_it, rc); + + LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF)); + rc = ll_och_fill(sbi->ll_md_exp, &it, och); + if (rc) + GOTO(out_release_it, rc); + + if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? 
*/ + GOTO(out_close, rc = -EOPNOTSUPP); + + /* already get lease, handle lease lock */ + ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL); + if (!it.it_lock_mode || + !(it.it_lock_bits & MDS_INODELOCK_OPEN)) { + /* open lock must return for lease */ + CERROR(DFID "lease granted but no open lock, %d/%llu.\n", + PFID(ll_inode2fid(inode)), it.it_lock_mode, + it.it_lock_bits); + GOTO(out_close, rc = -EPROTO); + } + + ll_intent_release(&it); + RETURN(och); + +out_close: + /* Cancel open lock */ + if (it.it_lock_mode != 0) { + ldlm_lock_decref_and_cancel(&och->och_lease_handle, + it.it_lock_mode); + it.it_lock_mode = 0; + och->och_lease_handle.cookie = 0ULL; + } + rc2 = ll_close_inode_openhandle(inode, och, 0, NULL); + if (rc2 < 0) + CERROR("%s: error closing file "DFID": %d\n", + sbi->ll_fsname, PFID(&ll_i2info(inode)->lli_fid), rc2); + och = NULL; /* och has been freed in ll_close_inode_openhandle() */ +out_release_it: + ll_intent_release(&it); +out: + if (och != NULL) + OBD_FREE_PTR(och); + RETURN(ERR_PTR(rc)); +} + +/** + * Check whether a layout swap can be done between two inodes. + * + * \param[in] inode1 First inode to check + * \param[in] inode2 Second inode to check + * + * \retval 0 on success, layout swap can be performed between both inodes + * \retval negative error code if requirements are not met + */ +static int ll_check_swap_layouts_validity(struct inode *inode1, + struct inode *inode2) +{ + if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode)) + return -EINVAL; + + if (inode_permission(&init_user_ns, inode1, MAY_WRITE) || + inode_permission(&init_user_ns, inode2, MAY_WRITE)) + return -EPERM; + + if (inode1->i_sb != inode2->i_sb) + return -EXDEV; + + return 0; +} + +static int ll_swap_layouts_close(struct obd_client_handle *och, + struct inode *inode, struct inode *inode2) +{ + const struct lu_fid *fid1 = ll_inode2fid(inode); + const struct lu_fid *fid2; + int rc; + ENTRY; + + CDEBUG(D_INODE, "%s: biased close of file "DFID"\n", + ll_i2sbi(inode)->ll_fsname, PFID(fid1)); + + rc = ll_check_swap_layouts_validity(inode, inode2); + if (rc < 0) + GOTO(out_free_och, rc); + + /* We now know that inode2 is a lustre inode */ + fid2 = ll_inode2fid(inode2); + + rc = lu_fid_cmp(fid1, fid2); + if (rc == 0) + GOTO(out_free_och, rc = -EINVAL); + + /* Close the file and {swap,merge} layouts between inode & inode2. + * NB: lease lock handle is released in mdc_close_layout_swap_pack() + * because we still need it to pack l_remote_handle to MDT. */ + rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP, + inode2); + + och = NULL; /* freed in ll_close_inode_openhandle() */ + +out_free_och: + if (och != NULL) + OBD_FREE_PTR(och); + + RETURN(rc); +} + +/** + * Release lease and close the file. + * It will check if the lease has ever broken. + */ +static int ll_lease_close_intent(struct obd_client_handle *och, + struct inode *inode, + bool *lease_broken, enum mds_op_bias bias, + void *data) +{ + struct ldlm_lock *lock; + bool cancelled = true; + int rc; + ENTRY; + + lock = ldlm_handle2lock(&och->och_lease_handle); + if (lock != NULL) { + lock_res_and_lock(lock); + cancelled = ldlm_is_cancel(lock); + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + } + + CDEBUG(D_INODE, "lease for "DFID" broken? 
%d, bias: %x\n",
+ PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
+
+ if (lease_broken != NULL)
+ *lease_broken = cancelled;
+
+ if (!cancelled && !bias)
+ ldlm_cli_cancel(&och->och_lease_handle, 0);
+
+ if (cancelled) { /* no need to execute intent */
+ bias = 0;
+ data = NULL;
+ }
+
+ rc = ll_close_inode_openhandle(inode, och, bias, data);
+ RETURN(rc);
+}
+
+static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
+ bool *lease_broken)
+{
+ return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
+}
+
+/**
+ * After a lease is taken, send an MDS_REINT_RESYNC RPC to the MDT.
+ */
+static int ll_lease_file_resync(struct obd_client_handle *och,
+ struct inode *inode, unsigned long arg)
+{
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct md_op_data *op_data;
+ struct ll_ioc_lease_id ioc;
+ __u64 data_version_unused;
+ int rc;
+ ENTRY;
+
+ op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+ LUSTRE_OPC_ANY, NULL);
+ if (IS_ERR(op_data))
+ RETURN(PTR_ERR(op_data));
+
+ if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
+ sizeof(ioc)))
+ /* go through "out" so op_data is not leaked */
+ GOTO(out, rc = -EFAULT);
+
+ /* before starting file resync, it's necessary to clean up the page
+ * cache in client memory, otherwise once the layout version is
+ * increased, writing back cached data will be denied by the OSTs. */
+ rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
+ if (rc)
+ GOTO(out, rc);
+
+ op_data->op_lease_handle = och->och_lease_handle;
+ op_data->op_mirror_id = ioc.lil_mirror_id;
+ rc = md_file_resync(sbi->ll_md_exp, op_data);
+ if (rc)
+ GOTO(out, rc);
+
+ EXIT;
+out:
+ ll_finish_md_op_data(op_data);
+ return rc;
+}
+
+int ll_merge_attr(const struct lu_env *env, struct inode *inode)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct cl_object *obj = lli->lli_clob;
+ struct cl_attr *attr = vvp_env_thread_attr(env);
+ s64 atime;
+ s64 mtime;
+ s64 ctime;
+ int rc = 0;
+
+ ENTRY;
+
+ ll_inode_size_lock(inode);
+
+ /* Merge the timestamps most recently obtained from the MDS with
+ * those obtained from the OSTs.
+ *
+ * Do not overwrite the inode's atime because it may be refreshed
+ * by the file_accessed() function. If a read was served from cached
+ * data, no RPC is sent, so the atime may not be transferred to the
+ * OSTs at all. The MDT only updates atime at close time if it is at
+ * least 'mdd.*.atime_diff' older.
+ * All in all, the atime in Lustre does not strictly comply with
+ * POSIX. Solving this would require an RPC to the MDT for each read,
+ * which would hurt performance.
+ */
+ if (test_and_clear_bit(LLIF_UPDATE_ATIME, &lli->lli_flags) ||
+ inode->i_atime.tv_sec < lli->lli_atime)
+ inode->i_atime.tv_sec = lli->lli_atime;
+
+ inode->i_mtime.tv_sec = lli->lli_mtime;
+ inode->i_ctime.tv_sec = lli->lli_ctime;
+
+ mtime = inode->i_mtime.tv_sec;
+ atime = inode->i_atime.tv_sec;
+ ctime = inode->i_ctime.tv_sec;
+
+ cl_object_attr_lock(obj);
+ if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
+ rc = -EINVAL;
+ else
+ rc = cl_object_attr_get(env, obj, attr);
+ cl_object_attr_unlock(obj);
+
+ if (rc != 0)
+ GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
+
+ if (atime < attr->cat_atime)
+ atime = attr->cat_atime;
+
+ if (ctime < attr->cat_ctime)
+ ctime = attr->cat_ctime;
+
+ if (mtime < attr->cat_mtime)
+ mtime = attr->cat_mtime;
+
+ CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
+ PFID(&lli->lli_fid), attr->cat_size);
+
+ if (llcrypt_require_key(inode) == -ENOKEY) {
+ /* Without the key, round up the encrypted file size to the
+ * next LUSTRE_ENCRYPTION_UNIT_SIZE. 
+	 * Clear text size is put in lli_lazysize for proper file size
+	 * setting at close time.
+	 */
+	lli->lli_attr_valid |= OBD_MD_FLLAZYSIZE;
+	lli->lli_lazysize = attr->cat_size;
+	attr->cat_size = round_up(attr->cat_size,
+				  LUSTRE_ENCRYPTION_UNIT_SIZE);
+	}
+	i_size_write(inode, attr->cat_size);
+	inode->i_blocks = attr->cat_blocks;
+
+	inode->i_mtime.tv_sec = mtime;
+	inode->i_atime.tv_sec = atime;
+	inode->i_ctime.tv_sec = ctime;
+
+out_size_unlock:
+	ll_inode_size_unlock(inode);
+
+	RETURN(rc);
+}
+
+/**
+ * Set designated mirror for I/O.
+ *
+ * So far only read, write, and truncate can issue I/O to a
+ * designated mirror.
+ */
+void ll_io_set_mirror(struct cl_io *io, const struct file *file)
+{
+	struct ll_file_data *fd = file->private_data;
+
+	/* clear layout version for generic (non-resync) I/O in case it
+	 * carries a stale layout version due to I/O restart */
+	io->ci_layout_version = 0;
+
+	/* FLR: disable non-delay for designated mirror I/O because obviously
+	 * only one mirror is available */
+	if (fd->fd_designated_mirror > 0) {
+		io->ci_ndelay = 0;
+		io->ci_designated_mirror = fd->fd_designated_mirror;
+		io->ci_layout_version = fd->fd_layout_version;
+	}
+
+	CDEBUG(D_VFSTRACE, "%s: designated mirror: %d\n",
+	       file->f_path.dentry->d_name.name, io->ci_designated_mirror);
+}
+
+/*
+ * This is relatime_need_update() from Linux 5.17, which is not exported.
+ */
+static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
+				struct timespec64 now)
+{
+	if (!(mnt->mnt_flags & MNT_RELATIME))
+		return 1;
+	/*
+	 * Is mtime younger than atime? If yes, update atime:
+	 */
+	if (timespec64_compare(&inode->i_mtime, &inode->i_atime) >= 0)
+		return 1;
+	/*
+	 * Is ctime younger than atime? If yes, update atime:
+	 */
+	if (timespec64_compare(&inode->i_ctime, &inode->i_atime) >= 0)
+		return 1;
+
+	/*
+	 * Is the previous atime value older than 6 hours?
If yes, + * update atime: + */ + if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 6*60*60) + return 1; + /* + * Good, we can skip the atime update: + */ + return 0; +} + +/* + * Very similar to kernel function: !__atime_needs_update() + */ +static bool file_is_noatime(const struct file *file) +{ + struct vfsmount *mnt = file->f_path.mnt; + struct inode *inode = file_inode((struct file *)file); + struct timespec64 now; + + if (file->f_flags & O_NOATIME) + return true; + + if (inode->i_flags & S_NOATIME) + return true; + + if (IS_NOATIME(inode)) + return true; + + if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY)) + return true; + + if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) + return true; + + if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode)) + return true; + + now = current_time(inode); + + if (!relatime_need_update(mnt, inode, now)) + return true; + + return false; +} + +void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot, + struct vvp_io_args *args) +{ + struct inode *inode = file_inode(file); + struct ll_file_data *fd = file->private_data; + + io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK; + io->ci_lock_no_expand = fd->ll_lock_no_expand; + + if (iot == CIT_WRITE) { + io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND); + io->u.ci_wr.wr_sync = !!(file->f_flags & O_SYNC || + file->f_flags & O_DIRECT || + IS_SYNC(inode)); +#ifdef HAVE_GENERIC_WRITE_SYNC_2ARGS + io->u.ci_wr.wr_sync |= !!(args && + (args->u.normal.via_iocb->ki_flags & + IOCB_DSYNC)); +#endif + } + + io->ci_obj = ll_i2info(inode)->lli_clob; + io->ci_lockreq = CILR_MAYBE; + if (ll_file_nolock(file)) { + io->ci_lockreq = CILR_NEVER; + io->ci_no_srvlock = 1; + } else if (file->f_flags & O_APPEND) { + io->ci_lockreq = CILR_MANDATORY; + } + io->ci_noatime = file_is_noatime(file); + io->ci_async_readahead = false; + + /* FLR: only use non-delay I/O for read as there is only one + * avaliable mirror for write. 
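+	 * (A read that cannot be served by one mirror can then move on
+	 * and retry the next mirror, while a write has a single valid
+	 * target.)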
*/ + io->ci_ndelay = !(iot == CIT_WRITE); + + ll_io_set_mirror(io, file); +} + +static void ll_heat_add(struct inode *inode, enum cl_io_type iot, + __u64 count) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + enum obd_heat_type sample_type; + enum obd_heat_type iobyte_type; + __u64 now = ktime_get_real_seconds(); + + if (!ll_sbi_has_file_heat(sbi) || + lli->lli_heat_flags & LU_HEAT_FLAG_OFF) + return; + + if (iot == CIT_READ) { + sample_type = OBD_HEAT_READSAMPLE; + iobyte_type = OBD_HEAT_READBYTE; + } else if (iot == CIT_WRITE) { + sample_type = OBD_HEAT_WRITESAMPLE; + iobyte_type = OBD_HEAT_WRITEBYTE; + } else { + return; + } + + spin_lock(&lli->lli_heat_lock); + obd_heat_add(&lli->lli_heat_instances[sample_type], now, 1, + sbi->ll_heat_decay_weight, sbi->ll_heat_period_second); + obd_heat_add(&lli->lli_heat_instances[iobyte_type], now, count, + sbi->ll_heat_decay_weight, sbi->ll_heat_period_second); + spin_unlock(&lli->lli_heat_lock); +} + +static ssize_t +ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args, + struct file *file, enum cl_io_type iot, + loff_t *ppos, size_t count) +{ + struct vvp_io *vio = vvp_env_io(env); + struct inode *inode = file_inode(file); + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_file_data *fd = file->private_data; + struct range_lock range; + bool range_locked = false; + struct cl_io *io; + ssize_t result = 0; + int rc = 0; + int rc2 = 0; + unsigned int retried = 0, dio_lock = 0; + bool is_aio = false; + bool is_parallel_dio = false; + struct cl_dio_aio *ci_dio_aio = NULL; + size_t per_bytes; + bool partial_io = false; + size_t max_io_pages, max_cached_pages; + + ENTRY; + + CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n", + file_dentry(file)->d_name.name, + iot == CIT_READ ? "read" : "write", *ppos, count); + + max_io_pages = PTLRPC_MAX_BRW_PAGES * OBD_MAX_RIF_DEFAULT; + max_cached_pages = sbi->ll_cache->ccc_lru_max; + if (max_io_pages > (max_cached_pages >> 2)) + max_io_pages = max_cached_pages >> 2; + + io = vvp_env_thread_io(env); + if (file->f_flags & O_DIRECT) { + if (file->f_flags & O_APPEND) + dio_lock = 1; + if (!is_sync_kiocb(args->u.normal.via_iocb)) + is_aio = true; + + /* the kernel does not support AIO on pipes, and parallel DIO + * uses part of the AIO path, so we must not do parallel dio + * to pipes + */ + is_parallel_dio = !iov_iter_is_pipe(args->u.normal.via_iter) && + !is_aio; + + if (!ll_sbi_has_parallel_dio(sbi)) + is_parallel_dio = false; + + ci_dio_aio = cl_dio_aio_alloc(args->u.normal.via_iocb, + ll_i2info(inode)->lli_clob, is_aio); + if (!ci_dio_aio) + GOTO(out, rc = -ENOMEM); + } + +restart: + /** + * IO block size need be aware of cached page limit, otherwise + * if we have small max_cached_mb but large block IO issued, io + * could not be finished and blocked whole client. 
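+	 * For example, with max_cached_mb set to 128M, each round of IO
+	 * is capped at a quarter of that (see the max_io_pages clamp
+	 * above).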
+	 */
+	if (file->f_flags & O_DIRECT)
+		per_bytes = count;
+	else
+		per_bytes = min(max_io_pages << PAGE_SHIFT, count);
+	partial_io = per_bytes < count;
+	io = vvp_env_thread_io(env);
+	ll_io_init(io, file, iot, args);
+	io->ci_dio_aio = ci_dio_aio;
+	io->ci_dio_lock = dio_lock;
+	io->ci_ndelay_tried = retried;
+	io->ci_parallel_dio = is_parallel_dio;
+
+	if (cl_io_rw_init(env, io, iot, *ppos, per_bytes) == 0) {
+		if (file->f_flags & O_APPEND)
+			range_lock_init(&range, 0, LUSTRE_EOF);
+		else
+			range_lock_init(&range, *ppos, *ppos + per_bytes - 1);
+
+		vio->vui_fd = file->private_data;
+		vio->vui_iter = args->u.normal.via_iter;
+		vio->vui_iocb = args->u.normal.via_iocb;
+		/* Direct IO reads must also take range lock,
+		 * or multiple reads will try to work on the same pages
+		 * See LU-6227 for details.
+		 */
+		if (((iot == CIT_WRITE) ||
+		     (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
+		    !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
+			CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
+			       RL_PARA(&range));
+			rc = range_lock(&lli->lli_write_tree, &range);
+			if (rc < 0)
+				GOTO(out, rc);
+
+			range_locked = true;
+		}
+
+		ll_cl_add(inode, env, io, LCC_RW);
+		rc = cl_io_loop(env, io);
+		ll_cl_remove(inode, env);
+	} else {
+		/* cl_io_rw_init() handled IO */
+		rc = io->ci_result;
+	}
+
+	if (io->ci_dio_aio && !is_aio) {
+		struct cl_sync_io *anchor = &io->ci_dio_aio->cda_sync;
+
+		/* for dio, EIOCBQUEUED is an implementation detail,
+		 * and we don't return it to userspace
+		 */
+		if (rc == -EIOCBQUEUED)
+			rc = 0;
+
+		/* NB: parallel DIO may be disabled during I/O submission;
+		 * if that occurs, the I/O shifts to sync, so it is all
+		 * resolved before we get here, and this wait call completes
+		 * immediately.
+		 */
+		rc2 = cl_sync_io_wait_recycle(env, anchor, 0, 0);
+		if (rc2 < 0)
+			rc = rc2;
+	}
+
+	if (range_locked) {
+		CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
+		       RL_PARA(&range));
+		range_unlock(&lli->lli_write_tree, &range);
+		range_locked = false;
+	}
+
+	/*
+	 * For AIO, ci_nob is increased in order to move the IO forward,
+	 * but that does not mean the IO has finished, only that it has
+	 * been submitted; we always return EIOCBQUEUED to the caller in
+	 * that case, so we can only return the number of bytes in the
+	 * non-AIO case.
+	 */
+	if (io->ci_nob > 0) {
+		if (!is_aio) {
+			if (rc2 == 0) {
+				result += io->ci_nob;
+				*ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
+			} else if (rc2) {
+				result = 0;
+			}
+		}
+		count -= io->ci_nob;
+
+		/* prepare IO restart */
+		if (count > 0)
+			args->u.normal.via_iter = vio->vui_iter;
+
+		if (partial_io) {
+			/**
+			 * Re-expand the iov count because it was zeroed
+			 * when the IO finished.
+			 */
+			iov_iter_reexpand(vio->vui_iter, count);
+			if (per_bytes == io->ci_nob)
+				io->ci_need_restart = 1;
+		}
+	}
+out:
+	cl_io_fini(env, io);
+
+	CDEBUG(D_VFSTRACE,
+	       "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
+	       file->f_path.dentry->d_name.name,
+	       iot, rc, result, io->ci_need_restart);
+
+	if ((rc == 0 || rc == -ENODATA || rc == -ENOLCK) &&
+	    count > 0 && io->ci_need_restart) {
+		CDEBUG(D_VFSTRACE,
+		       "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
+		       file_dentry(file)->d_name.name,
+		       iot == CIT_READ ? "read" : "write",
+		       *ppos, count, result, rc);
+		/* preserve the tried count for FLR */
+		retried = io->ci_ndelay_tried;
+		dio_lock = io->ci_dio_lock;
+		goto restart;
+	}
+
+	if (io->ci_dio_aio) {
+		/*
+		 * The VFS calls aio_complete() itself if -EIOCBQUEUED is
+		 * not returned for AIO, so we must not call aio_complete()
+		 * in our end_io().
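+		 * (cda_no_aio_complete below suppresses the duplicate
+		 * completion for that sync case.)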
+		 *
+		 * NB: This is safe because the atomic_dec_and_lock in
+		 * cl_sync_io_init has implicit memory barriers, so this will
+		 * be seen by whichever thread completes the DIO/AIO, even if
+		 * it's not this one.
+		 */
+		if (rc != -EIOCBQUEUED)
+			io->ci_dio_aio->cda_no_aio_complete = 1;
+		/**
+		 * Drop one extra reference so that end_io() can be
+		 * called for this IO context; we call it once we are
+		 * sure all AIO requests have been processed.
+		 */
+		cl_sync_io_note(env, &io->ci_dio_aio->cda_sync,
+				rc == -EIOCBQUEUED ? 0 : rc);
+		if (!is_aio) {
+			LASSERT(io->ci_dio_aio->cda_creator_free);
+			cl_dio_aio_free(env, io->ci_dio_aio);
+			io->ci_dio_aio = NULL;
+		}
+	}
+
+	if (iot == CIT_READ) {
+		if (result > 0)
+			ll_stats_ops_tally(ll_i2sbi(inode),
+					   LPROC_LL_READ_BYTES, result);
+	} else if (iot == CIT_WRITE) {
+		if (result > 0) {
+			ll_stats_ops_tally(ll_i2sbi(inode),
+					   LPROC_LL_WRITE_BYTES, result);
+			fd->fd_write_failed = false;
+		} else if (result == 0 && rc == 0) {
+			rc = io->ci_result;
+			if (rc < 0)
+				fd->fd_write_failed = true;
+			else
+				fd->fd_write_failed = false;
+		} else if (rc != -ERESTARTSYS) {
+			fd->fd_write_failed = true;
+		}
+	}
+
+	CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
+	if (result > 0)
+		ll_heat_add(inode, iot, result);
+
+	RETURN(result > 0 ? result : rc);
+}
+
+/**
+ * The purpose of fast read is to overcome the per-I/O overhead and improve
+ * IOPS, especially for small I/O.
+ *
+ * To serve a read request, CLIO has to create and initialize a cl_io and
+ * then request a DLM lock. This has turned out to have significant overhead
+ * and affects the performance of small I/O dramatically.
+ *
+ * It's not necessary to create a cl_io for each I/O. With the help of read
+ * ahead, most of the pages being read are already in memory cache and we can
+ * read those pages directly because if the pages exist, the corresponding DLM
+ * lock must exist, so the page content is guaranteed to be valid.
+ *
+ * In the fast read implementation, llite speculatively finds and reads pages
+ * in memory cache. There are three scenarios for fast read:
+ *   - If the page exists and is uptodate, kernel VM will provide the data and
+ *     CLIO won't intervene;
+ *   - If the page was brought into memory by read ahead, it will be exported
+ *     and read ahead parameters will be updated;
+ *   - Otherwise the page is not in memory and we can't do fast read.
+ *     Therefore, it will go back and invoke normal read, i.e., a cl_io will
+ *     be created and a DLM lock will be requested.
+ *
+ * POSIX compliance: the POSIX standard states that read is intended to be
+ * atomic. The Lustre read implementation is in line with the Linux kernel
+ * read implementation and neither of them complies with the POSIX standard
+ * in this matter. Fast read doesn't make the situation worse on a single
+ * node but it may interleave write results from multiple nodes due to the
+ * short read handling in ll_file_aio_read().
+ *
+ * \param iocb - kiocb from kernel
+ * \param iter - user space buffers where the data will be copied
+ *
+ * \retval - number of bytes read, or error code if an error occurred.
+ */
+static ssize_t
+ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
+{
+	ssize_t result;
+
+	if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
+		return 0;
+
+	/* NB: we can't do direct IO for fast read because it will need a lock
+	 * to make the IO engine happy.
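+	 * (A direct read bypasses the page cache, so there is no cached
+	 * page whose presence would guarantee a DLM lock is already held.)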
+	 */
+	if (iocb->ki_filp->f_flags & O_DIRECT)
+		return 0;
+
+	result = generic_file_read_iter(iocb, iter);
+
+	/* If the first page is not in cache, generic_file_aio_read() will
+	 * return -ENODATA. Fall back to the full read path.
+	 * See the corresponding code in ll_readpage().
+	 *
+	 * If we raced with page deletion, we might get EIO. Rather than add
+	 * locking to the fast path for this rare case, fall back to the full
+	 * read path. (See vvp_io_read_start() for the rest of the handling.)
+	 */
+	if (result == -ENODATA || result == -EIO)
+		result = 0;
+
+	if (result > 0) {
+		ll_heat_add(file_inode(iocb->ki_filp), CIT_READ, result);
+		ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
+				   LPROC_LL_READ_BYTES, result);
+	}
+
+	return result;
+}
+
+/**
+ * Confine the read iter so it does not read beyond the EOF
+ *
+ * \param env [in] lu_env
+ * \param iocb [in] kernel iocb
+ * \param to [in] reader iov_iter
+ *
+ * \retval <0 failure
+ * \retval 0 success
+ * \retval >0 @iocb->ki_pos has passed the EOF
+ */
+static int file_read_confine_iter(struct lu_env *env, struct kiocb *iocb,
+				  struct iov_iter *to)
+{
+	struct cl_attr *attr = vvp_env_thread_attr(env);
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file_inode(file);
+	struct ll_inode_info *lli = ll_i2info(inode);
+	loff_t read_end = iocb->ki_pos + iov_iter_count(to);
+	loff_t kms;
+	loff_t size;
+	int rc;
+
+	cl_object_attr_lock(lli->lli_clob);
+	rc = cl_object_attr_get(env, lli->lli_clob, attr);
+	cl_object_attr_unlock(lli->lli_clob);
+	if (rc != 0)
+		return rc;
+
+	kms = attr->cat_kms;
+	/* if read beyond end-of-file, adjust read count */
+	if (kms > 0 && (iocb->ki_pos >= kms || read_end > kms)) {
+		rc = ll_glimpse_size(inode);
+		if (rc != 0)
+			return rc;
+
+		size = i_size_read(inode);
+		if (iocb->ki_pos >= size || read_end > size) {
+			CDEBUG(D_VFSTRACE,
+			       "%s: read [%llu, %llu] over eof, kms %llu, file_size %llu.\n",
+			       file_dentry(file)->d_name.name,
+			       iocb->ki_pos, read_end, kms, size);
+
+			if (iocb->ki_pos >= size)
+				return 1;
+
+			if (read_end > size)
+				iov_iter_truncate(to, size - iocb->ki_pos);
+		}
+	}
+
+	return rc;
+}
+
+/*
+ * Read from a file (through the page cache).
+ */
+static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct lu_env *env;
+	struct vvp_io_args *args;
+	struct file *file = iocb->ki_filp;
+	ssize_t result;
+	ssize_t rc2;
+	__u16 refcheck;
+	ktime_t kstart = ktime_get();
+	bool cached;
+	bool stale_data = false;
+
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE|D_IOTRACE, "file %s:"DFID", ppos: %lld, count: %zu\n",
+	       file_dentry(file)->d_name.name,
+	       PFID(ll_inode2fid(file_inode(file))), iocb->ki_pos,
+	       iov_iter_count(to));
+
+	if (!iov_iter_count(to))
+		RETURN(0);
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		RETURN(PTR_ERR(env));
+
+	result = file_read_confine_iter(env, iocb, to);
+	if (result < 0)
+		GOTO(out, result);
+	else if (result > 0)
+		stale_data = true;
+
+	/**
+	 * Currently, when a PCC read fails, we do not fall back to the
+	 * normal read path, we just return the error.
+	 * The reason is that for RW-PCC the file data may have been
+	 * modified in the PCC and be inconsistent with the data on the
+	 * OSTs (or the file data may have been removed from the Lustre
+	 * file system), so falling back to the normal read path could
+	 * read the wrong data.
+	 * TODO: for RO-PCC (readonly PCC), fall back to the normal read
+	 * path: read data from the data copy on the OSTs.
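+	 * The cached flag set by pcc_file_read_iter() below tells whether
+	 * PCC actually served the read; only then is its result returned
+	 * directly.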
+	 */
+	result = pcc_file_read_iter(iocb, to, &cached);
+	if (cached)
+		GOTO(out, result);
+
+	ll_ras_enter(file, iocb->ki_pos, iov_iter_count(to));
+
+	result = ll_do_fast_read(iocb, to);
+	if (result < 0 || iov_iter_count(to) == 0)
+		GOTO(out, result);
+
+	args = ll_env_args(env);
+	args->u.normal.via_iter = to;
+	args->u.normal.via_iocb = iocb;
+
+	rc2 = ll_file_io_generic(env, args, file, CIT_READ,
+				 &iocb->ki_pos, iov_iter_count(to));
+	if (rc2 > 0)
+		result += rc2;
+	else if (result == 0)
+		result = rc2;
+
+out:
+	cl_env_put(env, &refcheck);
+
+	if (stale_data && result > 0) {
+		/**
+		 * We had already passed EOF before the read, so the data
+		 * read is stale cached data.
+		 */
+		iov_iter_truncate(to, 0);
+		result = 0;
+	}
+
+	if (result > 0) {
+		ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid,
+				  file->private_data, iocb->ki_pos, result,
+				  READ);
+		ll_stats_ops_tally(ll_i2sbi(file_inode(file)), LPROC_LL_READ,
+				   ktime_us_delta(ktime_get(), kstart));
+	}
+
+	RETURN(result);
+}
+
+/**
+ * Similar trick to ll_do_fast_read, this improves write speed for tiny
+ * writes. If a page is already in the page cache and dirty (and some other
+ * things - see ll_tiny_write_begin for the instantiation of these rules),
+ * then we can write to it without doing a full I/O, because Lustre already
+ * knows about it and will write it out. This saves a lot of processing time.
+ *
+ * All writes here are within one page, so exclusion is handled by the page
+ * lock on the vm page. We do not do tiny writes for writes which touch
+ * multiple pages because it's very unlikely multiple sequential pages are
+ * already dirty.
+ *
+ * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively
+ * common and are unlikely to be to already dirty pages.
+ *
+ * Attribute updates are important here, we do them in ll_tiny_write_end.
+ */
+static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
+{
+	ssize_t count = iov_iter_count(iter);
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file_inode(file);
+	bool lock_inode = !IS_NOSEC(inode);
+	ssize_t result = 0;
+
+	ENTRY;
+
+	/* Restrict writes to single page and < PAGE_SIZE. See comment at top
+	 * of function for why.
+	 */
+	if (count >= PAGE_SIZE ||
+	    (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
+		RETURN(0);
+
+	if (unlikely(lock_inode))
+		inode_lock(inode);
+	result = __generic_file_write_iter(iocb, iter);
+
+	if (unlikely(lock_inode))
+		inode_unlock(inode);
+
+	/* If the page is not already dirty, ll_tiny_write_begin returns
+	 * -ENODATA. We continue on to normal write.
+	 */
+	if (result == -ENODATA)
+		result = 0;
+
+	if (result > 0) {
+		ll_heat_add(inode, CIT_WRITE, result);
+		set_bit(LLIF_DATA_MODIFIED, &ll_i2info(inode)->lli_flags);
+	}
+
+	CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
+
+	RETURN(result);
+}
+
+/*
+ * Write to a file (through the page cache).
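+ * Tiny writes are attempted first (see ll_do_tiny_write() above); any
+ * remaining bytes then go through the normal CLIO path.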
+ */ +static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct vvp_io_args *args; + struct lu_env *env; + ssize_t rc_tiny = 0, rc_normal; + struct file *file = iocb->ki_filp; + __u16 refcheck; + bool cached; + ktime_t kstart = ktime_get(); + int result; + + ENTRY; + + CDEBUG(D_VFSTRACE|D_IOTRACE, "file %s:"DFID", ppos: %lld, count: %zu\n", + file_dentry(file)->d_name.name, + PFID(ll_inode2fid(file_inode(file))), iocb->ki_pos, + iov_iter_count(from)); + + if (!iov_iter_count(from)) + GOTO(out, rc_normal = 0); + + /** + * When PCC write failed, we usually do not fall back to the normal + * write path, just return the error. But there is a special case when + * returned error code is -ENOSPC due to running out of space on PCC HSM + * bakcend. At this time, it will fall back to normal I/O path and + * retry the I/O. As the file is in HSM released state, it will restore + * the file data to OSTs first and redo the write again. And the + * restore process will revoke the layout lock and detach the file + * from PCC cache automatically. + */ + result = pcc_file_write_iter(iocb, from, &cached); + if (cached && result != -ENOSPC && result != -EDQUOT) + GOTO(out, rc_normal = result); + + /* NB: we can't do direct IO for tiny writes because they use the page + * cache, we can't do sync writes because tiny writes can't flush + * pages, and we can't do append writes because we can't guarantee the + * required DLM locks are held to protect file size. + */ + if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(file))) && + !(file->f_flags & (O_DIRECT | O_SYNC | O_APPEND))) + rc_tiny = ll_do_tiny_write(iocb, from); + + /* In case of error, go on and try normal write - Only stop if tiny + * write completed I/O. + */ + if (iov_iter_count(from) == 0) + GOTO(out, rc_normal = rc_tiny); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + args = ll_env_args(env); + args->u.normal.via_iter = from; + args->u.normal.via_iocb = iocb; + + rc_normal = ll_file_io_generic(env, args, file, CIT_WRITE, + &iocb->ki_pos, iov_iter_count(from)); + + /* On success, combine bytes written. */ + if (rc_tiny >= 0 && rc_normal > 0) + rc_normal += rc_tiny; + /* On error, only return error from normal write if tiny write did not + * write any bytes. Otherwise return bytes written by tiny write. + */ + else if (rc_tiny > 0) + rc_normal = rc_tiny; + + cl_env_put(env, &refcheck); +out: + if (rc_normal > 0) { + ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid, + file->private_data, iocb->ki_pos, + rc_normal, WRITE); + ll_stats_ops_tally(ll_i2sbi(file_inode(file)), LPROC_LL_WRITE, + ktime_us_delta(ktime_get(), kstart)); + } + + RETURN(rc_normal); +} + +#ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER +/* + * XXX: exact copy from kernel code (__generic_file_aio_write_nolock) + */ +static int ll_file_get_iov_count(const struct iovec *iov, + unsigned long *nr_segs, size_t *count, + int access_flags) +{ + size_t cnt = 0; + unsigned long seg; + + for (seg = 0; seg < *nr_segs; seg++) { + const struct iovec *iv = &iov[seg]; + + /* + * If any segment has a negative length, or the cumulative + * length ever wraps negative then return -EINVAL. 
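+		 * The (ssize_t)(cnt | iov_len) test below catches both at
+		 * once: either value with its sign bit set makes the OR
+		 * negative.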
+		 */
+		cnt += iv->iov_len;
+		if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
+			return -EINVAL;
+		if (access_ok(access_flags, iv->iov_base, iv->iov_len))
+			continue;
+		if (seg == 0)
+			return -EFAULT;
+		*nr_segs = seg;
+		cnt -= iv->iov_len;	/* This segment is no good */
+		break;
+	}
+	*count = cnt;
+	return 0;
+}
+
+static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
+				unsigned long nr_segs, loff_t pos)
+{
+	struct iov_iter to;
+	size_t iov_count;
+	ssize_t result;
+	ENTRY;
+
+	result = ll_file_get_iov_count(iov, &nr_segs, &iov_count, VERIFY_READ);
+	if (result)
+		RETURN(result);
+
+	if (!iov_count)
+		RETURN(0);
+
+# ifdef HAVE_IOV_ITER_INIT_DIRECTION
+	iov_iter_init(&to, READ, iov, nr_segs, iov_count);
+# else /* !HAVE_IOV_ITER_INIT_DIRECTION */
+	iov_iter_init(&to, iov, nr_segs, iov_count, 0);
+# endif /* HAVE_IOV_ITER_INIT_DIRECTION */
+
+	result = ll_file_read_iter(iocb, &to);
+
+	RETURN(result);
+}
+
+static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
+			    loff_t *ppos)
+{
+	struct iovec iov = { .iov_base = buf, .iov_len = count };
+	struct kiocb kiocb;
+	ssize_t result;
+
+	ENTRY;
+
+	if (!count)
+		RETURN(0);
+
+	init_sync_kiocb(&kiocb, file);
+	kiocb.ki_pos = *ppos;
+#ifdef HAVE_KIOCB_KI_LEFT
+	kiocb.ki_left = count;
+#elif defined(HAVE_KI_NBYTES)
+	kiocb.ki_nbytes = count;
+#endif
+
+	result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
+	*ppos = kiocb.ki_pos;
+
+	RETURN(result);
+}
+
+/*
+ * Write to a file (through the page cache).
+ * AIO stuff
+ */
+static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+				 unsigned long nr_segs, loff_t pos)
+{
+	struct iov_iter from;
+	size_t iov_count;
+	ssize_t result;
+	ENTRY;
+
+	result = ll_file_get_iov_count(iov, &nr_segs, &iov_count, VERIFY_WRITE);
+	if (result)
+		RETURN(result);
+
+	if (!iov_count)
+		RETURN(0);
+
+# ifdef HAVE_IOV_ITER_INIT_DIRECTION
+	iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
+# else /* !HAVE_IOV_ITER_INIT_DIRECTION */
+	iov_iter_init(&from, iov, nr_segs, iov_count, 0);
+# endif /* HAVE_IOV_ITER_INIT_DIRECTION */
+
+	result = ll_file_write_iter(iocb, &from);
+
+	RETURN(result);
+}
+
+static ssize_t ll_file_write(struct file *file, const char __user *buf,
+			     size_t count, loff_t *ppos)
+{
+	struct iovec iov = { .iov_base = (void __user *)buf,
+			     .iov_len = count };
+	struct kiocb kiocb;
+	ssize_t result;
+
+	ENTRY;
+
+	if (!count)
+		RETURN(0);
+
+	init_sync_kiocb(&kiocb, file);
+	kiocb.ki_pos = *ppos;
+#ifdef HAVE_KIOCB_KI_LEFT
+	kiocb.ki_left = count;
+#elif defined(HAVE_KI_NBYTES)
+	kiocb.ki_nbytes = count;
+#endif
+
+	result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
+	*ppos = kiocb.ki_pos;
+
+	RETURN(result);
+}
+#endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
+
+int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
+			     __u64 flags, struct lov_user_md *lum, int lum_size)
+{
+	struct lookup_intent oit = {
+		.it_op = IT_OPEN,
+		.it_flags = flags | MDS_OPEN_BY_FID,
+	};
+	int rc;
+	ENTRY;
+
+	if ((__swab32(lum->lmm_magic) & le32_to_cpu(LOV_MAGIC_MASK)) ==
+	    le32_to_cpu(LOV_MAGIC_MAGIC)) {
+		/* this code will only exist for big-endian systems */
+		lustre_swab_lov_user_md(lum, 0);
+	}
+
+	ll_inode_size_lock(inode);
+	rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
+	if (rc < 0)
+		GOTO(out_unlock, rc);
+
+	ll_release_openhandle(dentry, &oit);
+
+out_unlock:
+	ll_inode_size_unlock(inode);
+	ll_intent_release(&oit);
+
+	RETURN(rc);
+}
+
+int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
+ struct lov_mds_md **lmmp, int *lmm_size, + struct ptlrpc_request **request) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct mdt_body *body; + struct lov_mds_md *lmm = NULL; + struct ptlrpc_request *req = NULL; + struct md_op_data *op_data; + int rc, lmmsize; + + ENTRY; + + rc = ll_get_default_mdsize(sbi, &lmmsize); + if (rc) + RETURN(rc); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, filename, + strlen(filename), lmmsize, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA; + rc = md_getattr_name(sbi->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc < 0) { + CDEBUG(D_INFO, "md_getattr_name failed on %s: rc %d\n", + filename, rc); + GOTO(out, rc); + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); /* checked by mdc_getattr_name */ + + lmmsize = body->mbo_eadatasize; + + if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) || + lmmsize == 0) + GOTO(out, rc = -ENODATA); + + lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize); + LASSERT(lmm != NULL); + + if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) && + lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) && + lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1) && + lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_FOREIGN)) + GOTO(out, rc = -EPROTO); + + /* + * This is coming from the MDS, so is probably in + * little endian. We convert it to host endian before + * passing it to userspace. + */ + if (cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) { + int stripe_count = 0; + + if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) || + lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) { + stripe_count = le16_to_cpu(lmm->lmm_stripe_count); + if (le32_to_cpu(lmm->lmm_pattern) & + LOV_PATTERN_F_RELEASED) + stripe_count = 0; + lustre_swab_lov_user_md((struct lov_user_md *)lmm, 0); + + /* if function called for directory - we should + * avoid swab not existent lsm objects + */ + if (lmm->lmm_magic == LOV_MAGIC_V1 && + S_ISREG(body->mbo_mode)) + lustre_swab_lov_user_md_objects( + ((struct lov_user_md_v1 *)lmm)->lmm_objects, + stripe_count); + else if (lmm->lmm_magic == LOV_MAGIC_V3 && + S_ISREG(body->mbo_mode)) + lustre_swab_lov_user_md_objects( + ((struct lov_user_md_v3 *)lmm)->lmm_objects, + stripe_count); + } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_COMP_V1)) { + lustre_swab_lov_comp_md_v1( + (struct lov_comp_md_v1 *)lmm); + } + } + + if (lmm->lmm_magic == LOV_MAGIC_COMP_V1) { + struct lov_comp_md_v1 *comp_v1 = NULL; + struct lov_comp_md_entry_v1 *ent; + struct lov_user_md_v1 *v1; + __u32 off; + int i = 0; + + comp_v1 = (struct lov_comp_md_v1 *)lmm; + /* Dump the striping information */ + for (; i < comp_v1->lcm_entry_count; i++) { + ent = &comp_v1->lcm_entries[i]; + off = ent->lcme_offset; + v1 = (struct lov_user_md_v1 *)((char *)lmm + off); + CDEBUG(D_INFO, + "comp[%d]: stripe_count=%u, stripe_size=%u\n", + i, v1->lmm_stripe_count, v1->lmm_stripe_size); + } + + /** + * Return valid stripe_count and stripe_size instead of 0 for + * DoM files to avoid divide-by-zero for older userspace that + * calls this ioctl, e.g. lustre ADIO driver. + */ + if (lmm->lmm_stripe_count == 0) + lmm->lmm_stripe_count = 1; + if (lmm->lmm_stripe_size == 0) { + /* Since the first component of the file data is placed + * on the MDT for faster access, the stripe_size of the + * second one is always that applications which are + * doing large IOs. 
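+			 * (That is component index 1 for a DoM layout,
+			 * otherwise the last component of the file, matching
+			 * the selection below.)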
+ */ + if (lmm->lmm_pattern == LOV_PATTERN_MDT) + i = comp_v1->lcm_entry_count > 1 ? 1 : 0; + else + i = comp_v1->lcm_entry_count > 1 ? + comp_v1->lcm_entry_count - 1 : 0; + ent = &comp_v1->lcm_entries[i]; + off = ent->lcme_offset; + v1 = (struct lov_user_md_v1 *)((char *)lmm + off); + lmm->lmm_stripe_size = v1->lmm_stripe_size; + } + } +out: + *lmmp = lmm; + *lmm_size = lmmsize; + *request = req; + RETURN(rc); +} + +static int ll_lov_setea(struct inode *inode, struct file *file, + void __user *arg) +{ + __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE; + struct lov_user_md *lump; + int lum_size = sizeof(struct lov_user_md) + + sizeof(struct lov_user_ost_data); + int rc; + ENTRY; + + if (!capable(CAP_SYS_ADMIN)) + RETURN(-EPERM); + + OBD_ALLOC_LARGE(lump, lum_size); + if (lump == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(lump, arg, lum_size)) + GOTO(out_lump, rc = -EFAULT); + + rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump, + lum_size); + cl_lov_delay_create_clear(&file->f_flags); + +out_lump: + OBD_FREE_LARGE(lump, lum_size); + RETURN(rc); +} + +static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size) +{ + struct lu_env *env; + __u16 refcheck; + int rc; + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size); + cl_env_put(env, &refcheck); + RETURN(rc); +} + +static int ll_lov_setstripe(struct inode *inode, struct file *file, + void __user *arg) +{ + struct lov_user_md __user *lum = (struct lov_user_md __user *)arg; + struct lov_user_md *klum; + int lum_size, rc; + __u64 flags = FMODE_WRITE; + ENTRY; + + rc = ll_copy_user_md(lum, &klum); + if (rc < 0) + RETURN(rc); + + lum_size = rc; + rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum, + lum_size); + if (!rc) { + __u32 gen; + + rc = put_user(0, &lum->lmm_stripe_count); + if (rc) + GOTO(out, rc); + + rc = ll_layout_refresh(inode, &gen); + if (rc) + GOTO(out, rc); + + rc = ll_file_getstripe(inode, arg, lum_size); + if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode) && + ll_i2info(inode)->lli_clob) { + struct iattr attr = { 0 }; + + rc = cl_setattr_ost(ll_i2info(inode)->lli_clob, &attr, + OP_XVALID_FLAGS, LUSTRE_ENCRYPT_FL); + } + } + cl_lov_delay_create_clear(&file->f_flags); + +out: + OBD_FREE_LARGE(klum, lum_size); + RETURN(rc); +} + + +static int +ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_object *obj = lli->lli_clob; + struct ll_file_data *fd = file->private_data; + struct ll_grouplock grouplock; + int rc; + ENTRY; + + if (arg == 0) { + CWARN("group id for group lock must not be 0\n"); + RETURN(-EINVAL); + } + + if (ll_file_nolock(file)) + RETURN(-EOPNOTSUPP); +retry: + if (file->f_flags & O_NONBLOCK) { + if (!mutex_trylock(&lli->lli_group_mutex)) + RETURN(-EAGAIN); + } else + mutex_lock(&lli->lli_group_mutex); + + if (fd->fd_flags & LL_FILE_GROUP_LOCKED) { + CWARN("group lock already existed with gid %lu\n", + fd->fd_grouplock.lg_gid); + GOTO(out, rc = -EINVAL); + } + if (arg != lli->lli_group_gid && lli->lli_group_users != 0) { + if (file->f_flags & O_NONBLOCK) + GOTO(out, rc = -EAGAIN); + mutex_unlock(&lli->lli_group_mutex); + wait_var_event(&lli->lli_group_users, !lli->lli_group_users); + GOTO(retry, rc = 0); + } + LASSERT(fd->fd_grouplock.lg_lock == NULL); + + /** + * XXX: group lock needs to protect all OST objects while PFL + * can add new OST objects during the IO, so 
we'd instantiate + * all OST objects before getting its group lock. + */ + if (obj) { + struct lu_env *env; + __u16 refcheck; + struct cl_layout cl = { + .cl_is_composite = false, + }; + struct lu_extent ext = { + .e_start = 0, + .e_end = OBD_OBJECT_EOF, + }; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + GOTO(out, rc = PTR_ERR(env)); + + rc = cl_object_layout_get(env, obj, &cl); + if (rc >= 0 && cl.cl_is_composite) + rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE, + &ext); + + cl_env_put(env, &refcheck); + if (rc < 0) + GOTO(out, rc); + } + + rc = cl_get_grouplock(ll_i2info(inode)->lli_clob, + arg, (file->f_flags & O_NONBLOCK), &grouplock); + + if (rc) + GOTO(out, rc); + + fd->fd_flags |= LL_FILE_GROUP_LOCKED; + fd->fd_grouplock = grouplock; + if (lli->lli_group_users == 0) + lli->lli_group_gid = grouplock.lg_gid; + lli->lli_group_users++; + + CDEBUG(D_INFO, "group lock %lu obtained\n", arg); +out: + mutex_unlock(&lli->lli_group_mutex); + + RETURN(rc); +} + +static int ll_put_grouplock(struct inode *inode, struct file *file, + unsigned long arg) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = file->private_data; + struct ll_grouplock grouplock; + int rc; + ENTRY; + + mutex_lock(&lli->lli_group_mutex); + if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) { + CWARN("no group lock held\n"); + GOTO(out, rc = -EINVAL); + } + + LASSERT(fd->fd_grouplock.lg_lock != NULL); + + if (fd->fd_grouplock.lg_gid != arg) { + CWARN("group lock %lu doesn't match current id %lu\n", + arg, fd->fd_grouplock.lg_gid); + GOTO(out, rc = -EINVAL); + } + + grouplock = fd->fd_grouplock; + memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock)); + fd->fd_flags &= ~LL_FILE_GROUP_LOCKED; + + cl_put_grouplock(&grouplock); + + lli->lli_group_users--; + if (lli->lli_group_users == 0) { + lli->lli_group_gid = 0; + wake_up_var(&lli->lli_group_users); + } + CDEBUG(D_INFO, "group lock %lu released\n", arg); + GOTO(out, rc = 0); +out: + mutex_unlock(&lli->lli_group_mutex); + + RETURN(rc); +} + +/** + * Close inode open handle + * + * \param dentry [in] dentry which contains the inode + * \param it [in,out] intent which contains open info and result + * + * \retval 0 success + * \retval <0 failure + */ +int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it) +{ + struct inode *inode = dentry->d_inode; + struct obd_client_handle *och; + int rc; + ENTRY; + + LASSERT(inode); + + /* Root ? Do nothing. */ + if (is_root_inode(inode)) + RETURN(0); + + /* No open handle to close? Move away */ + if (!it_disposition(it, DISP_OPEN_OPEN)) + RETURN(0); + + LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0); + + OBD_ALLOC(och, sizeof(*och)); + if (!och) + GOTO(out, rc = -ENOMEM); + + rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och); + if (rc) + GOTO(out, rc); + + rc = ll_close_inode_openhandle(inode, och, 0, NULL); +out: + /* this one is in place of ll_file_open */ + if (it_disposition(it, DISP_ENQ_OPEN_REF)) { + ptlrpc_req_finished(it->it_request); + it_clear_disposition(it, DISP_ENQ_OPEN_REF); + } + RETURN(rc); +} + +/** + * Get size for inode for which FIEMAP mapping is requested. + * Make the FIEMAP get_info call and returns the result. 
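+ * \param inode inode for which the mapping is requested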
+ * \param fiemap kernel buffer to hold extens + * \param num_bytes kernel buffer size + */ +static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap, + size_t num_bytes) +{ + struct lu_env *env; + __u16 refcheck; + int rc = 0; + struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, }; + ENTRY; + + /* Checks for fiemap flags */ + if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) { + fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT; + return -EBADR; + } + + /* Check for FIEMAP_FLAG_SYNC */ + if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) { + rc = filemap_fdatawrite(inode->i_mapping); + if (rc) + return rc; + } + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + if (i_size_read(inode) == 0) { + rc = ll_glimpse_size(inode); + if (rc) + GOTO(out, rc); + } + + fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; + obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE); + obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid); + + /* If filesize is 0, then there would be no objects for mapping */ + if (fmkey.lfik_oa.o_size == 0) { + fiemap->fm_mapped_extents = 0; + GOTO(out, rc = 0); + } + + fmkey.lfik_fiemap = *fiemap; + + rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob, + &fmkey, fiemap, &num_bytes); +out: + cl_env_put(env, &refcheck); + RETURN(rc); +} + +int ll_fid2path(struct inode *inode, void __user *arg) +{ + struct obd_export *exp = ll_i2mdexp(inode); + const struct getinfo_fid2path __user *gfin = arg; + __u32 pathlen; + struct getinfo_fid2path *gfout; + size_t outsize; + int rc; + + ENTRY; + + if (!capable(CAP_DAC_READ_SEARCH) && + !test_bit(LL_SBI_USER_FID2PATH, ll_i2sbi(inode)->ll_flags)) + RETURN(-EPERM); + + /* Only need to get the buflen */ + if (get_user(pathlen, &gfin->gf_pathlen)) + RETURN(-EFAULT); + + if (pathlen > PATH_MAX) + RETURN(-EINVAL); + + outsize = sizeof(*gfout) + pathlen; + OBD_ALLOC(gfout, outsize); + if (gfout == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(gfout, arg, sizeof(*gfout))) + GOTO(gf_free, rc = -EFAULT); + /* append root FID after gfout to let MDT know the root FID so that it + * can lookup the correct path, this is mainly for fileset. + * old server without fileset mount support will ignore this. */ + *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode); + + /* Call mdc_iocontrol */ + rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL); + if (rc != 0) + GOTO(gf_free, rc); + + if (copy_to_user(arg, gfout, outsize)) + rc = -EFAULT; + +gf_free: + OBD_FREE(gfout, outsize); + RETURN(rc); +} + +static int +ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc) +{ + struct cl_object *obj = ll_i2info(inode)->lli_clob; + struct lu_env *env; + struct cl_io *io; + __u16 refcheck; + int result; + + ENTRY; + + ioc->idv_version = 0; + ioc->idv_layout_version = UINT_MAX; + + /* If no file object initialized, we consider its version is 0. 
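+	 * (e.g. a newly created file that has no OST objects yet).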
*/ + if (obj == NULL) + RETURN(0); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = vvp_env_thread_io(env); + io->ci_obj = obj; + io->u.ci_data_version.dv_data_version = 0; + io->u.ci_data_version.dv_layout_version = UINT_MAX; + io->u.ci_data_version.dv_flags = ioc->idv_flags; + +restart: + if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0) + result = cl_io_loop(env, io); + else + result = io->ci_result; + + ioc->idv_version = io->u.ci_data_version.dv_data_version; + ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version; + + cl_io_fini(env, io); + + if (unlikely(io->ci_need_restart)) + goto restart; + + cl_env_put(env, &refcheck); + + RETURN(result); +} + +/* + * Read the data_version for inode. + * + * This value is computed using stripe object version on OST. + * Version is computed using server side locking. + * + * @param flags if do sync on the OST side; + * 0: no sync + * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs + * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs + */ +int ll_data_version(struct inode *inode, __u64 *data_version, int flags) +{ + struct ioc_data_version ioc = { .idv_flags = flags }; + int rc; + + rc = ll_ioc_data_version(inode, &ioc); + if (!rc) + *data_version = ioc.idv_version; + + return rc; +} + +/* + * Trigger a HSM release request for the provided inode. + */ +int ll_hsm_release(struct inode *inode) +{ + struct lu_env *env; + struct obd_client_handle *och = NULL; + __u64 data_version = 0; + __u16 refcheck; + int rc; + + ENTRY; + + CDEBUG(D_INODE, "%s: Releasing file "DFID".\n", + ll_i2sbi(inode)->ll_fsname, + PFID(&ll_i2info(inode)->lli_fid)); + + /* + * For directory, this is not the right + * way to do the release. Ideally this should clean + * up the directory without triggering update to the backend. + * Right now, this just sets the RELEASED bit for the + * directory. This is left as is so as to have a way to set + * the RELEASED bit as a deug/recovery method + * instead of doing a rm on the directory. + * TODO-MDLL: Tracking SIM - Simba-21969 + */ + if (S_ISDIR(inode->i_mode)) + och = ll_lease_open(inode, NULL, FMODE_READ, MDS_OPEN_RELEASE); + else + och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE); + if (IS_ERR(och)) + GOTO(out, rc = PTR_ERR(och)); + + /* Grab latest data_version and [am]time values */ + rc = ll_data_version(inode, &data_version, + LL_DV_WR_FLUSH | LL_DV_SZ_UPDATE); + if (rc != 0) + GOTO(out, rc); + + /* Don't need to merge these attrs for directories */ + if (!S_ISDIR(inode->i_mode)) { + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + GOTO(out, rc = PTR_ERR(env)); + + rc = ll_merge_attr(env, inode); + cl_env_put(env, &refcheck); + + /* If error happen, we have the wrong size for a file. + * Don't release it. + */ + if (rc != 0) + GOTO(out, rc); + } + + /* Release the file. + * NB: lease lock handle is released in mdc_hsm_release_pack() because + * we still need it to pack l_remote_handle to MDT. 
*/ + rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE, + &data_version); + och = NULL; + + EXIT; +out: + if (och != NULL && !IS_ERR(och)) /* close the file */ + ll_lease_close(och, inode, NULL); + + return rc; +} + +struct ll_swap_stack { + __u64 dv1; + __u64 dv2; + struct inode *inode1; + struct inode *inode2; + bool check_dv1; + bool check_dv2; +}; + +static int ll_swap_layouts(struct file *file1, struct file *file2, + struct lustre_swap_layouts *lsl) +{ + struct mdc_swap_layouts msl; + struct md_op_data *op_data; + __u32 gid; + __u64 dv; + struct ll_swap_stack *llss = NULL; + int rc; + + OBD_ALLOC_PTR(llss); + if (llss == NULL) + RETURN(-ENOMEM); + + llss->inode1 = file_inode(file1); + llss->inode2 = file_inode(file2); + + rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2); + if (rc < 0) + GOTO(free, rc); + + /* we use 2 bool because it is easier to swap than 2 bits */ + if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1) + llss->check_dv1 = true; + + if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2) + llss->check_dv2 = true; + + /* we cannot use lsl->sl_dvX directly because we may swap them */ + llss->dv1 = lsl->sl_dv1; + llss->dv2 = lsl->sl_dv2; + + rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2)); + if (rc == 0) /* same file, done! */ + GOTO(free, rc); + + if (rc < 0) { /* sequentialize it */ + swap(llss->inode1, llss->inode2); + swap(file1, file2); + swap(llss->dv1, llss->dv2); + swap(llss->check_dv1, llss->check_dv2); + } + + gid = lsl->sl_gid; + if (gid != 0) { /* application asks to flush dirty cache */ + rc = ll_get_grouplock(llss->inode1, file1, gid); + if (rc < 0) + GOTO(free, rc); + + rc = ll_get_grouplock(llss->inode2, file2, gid); + if (rc < 0) { + ll_put_grouplock(llss->inode1, file1, gid); + GOTO(free, rc); + } + } + + /* ultimate check, before swaping the layouts we check if + * dataversion has changed (if requested) */ + if (llss->check_dv1) { + rc = ll_data_version(llss->inode1, &dv, 0); + if (rc) + GOTO(putgl, rc); + if (dv != llss->dv1) + GOTO(putgl, rc = -EAGAIN); + } + + if (llss->check_dv2) { + rc = ll_data_version(llss->inode2, &dv, 0); + if (rc) + GOTO(putgl, rc); + if (dv != llss->dv2) + GOTO(putgl, rc = -EAGAIN); + } + + /* struct md_op_data is used to send the swap args to the mdt + * only flags is missing, so we use struct mdc_swap_layouts + * through the md_op_data->op_data */ + /* flags from user space have to be converted before they are send to + * server, no flag is sent today, they are only used on the client */ + msl.msl_flags = 0; + rc = -ENOMEM; + op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0, + 0, LUSTRE_OPC_ANY, &msl); + if (IS_ERR(op_data)) + GOTO(free, rc = PTR_ERR(op_data)); + + rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1), + sizeof(*op_data), op_data, NULL); + ll_finish_md_op_data(op_data); + + if (rc < 0) + GOTO(putgl, rc); + +putgl: + if (gid != 0) { + ll_put_grouplock(llss->inode2, file2, gid); + ll_put_grouplock(llss->inode1, file1, gid); + } + +free: + if (llss != NULL) + OBD_FREE_PTR(llss); + + RETURN(rc); +} + +int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss) +{ + struct obd_export *exp = ll_i2mdexp(inode); + struct md_op_data *op_data; + int rc; + ENTRY; + + /* Detect out-of range masks */ + if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK) + RETURN(-EINVAL); + + /* Non-root users are forbidden to set or clear flags which are + * NOT defined in HSM_USER_MASK. 
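+	 * (e.g. HS_DIRTY can be toggled by a regular user, while HS_RELEASED
+	 * requires CAP_SYS_ADMIN.)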
*/ + if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) && + !capable(CAP_SYS_ADMIN)) + RETURN(-EPERM); + + if (!exp_connect_archive_id_array(exp)) { + /* Detect out-of range archive id */ + if ((hss->hss_valid & HSS_ARCHIVE_ID) && + (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE)) + RETURN(-EINVAL); + } + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, hss); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data), + op_data, NULL); + + ll_finish_md_op_data(op_data); + + RETURN(rc); +} + +int ll_hsm_import(struct inode *inode, struct file *file, + struct hsm_user_import *hui) +{ + struct hsm_state_set *hss = NULL; + struct iattr *attr = NULL; + int rc; + ENTRY; + + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + RETURN(-EINVAL); + + /* set HSM flags */ + OBD_ALLOC_PTR(hss); + if (hss == NULL) + GOTO(out, rc = -ENOMEM); + + hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID; + hss->hss_archive_id = hui->hui_archive_id; + hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED; + rc = ll_hsm_state_set(inode, hss); + if (rc != 0) + GOTO(out, rc); + + OBD_ALLOC_PTR(attr); + if (attr == NULL) + GOTO(out, rc = -ENOMEM); + + attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO); + + if (S_ISDIR(inode->i_mode)) + attr->ia_mode |= S_IFDIR; + else + attr->ia_mode |= S_IFREG; + + attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid); + attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid); + attr->ia_size = hui->hui_size; + attr->ia_mtime.tv_sec = hui->hui_mtime; + attr->ia_mtime.tv_nsec = hui->hui_mtime_ns; + attr->ia_atime.tv_sec = hui->hui_atime; + attr->ia_atime.tv_nsec = hui->hui_atime_ns; + + attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE | + ATTR_UID | ATTR_GID | + ATTR_MTIME | ATTR_MTIME_SET | + ATTR_ATIME | ATTR_ATIME_SET; + + /* + * TODO-MDLL check if this needs to be done here + * or in ll_setattr_raw(). The ll_setattr_raw does a + * unlock() before it calls the ll_md_setattr() for + * regular files using S_ISREG(). Calling this for + * inodes other than files might result in a deadlock. + * Tracked with Simba-20393. + */ + if (S_ISREG(inode->i_mode)) + inode_lock(inode); + + rc = ll_setattr_raw(file_dentry(file), attr, 0, true); + if (rc == -ENODATA) + rc = 0; + + if (S_ISREG(inode->i_mode)) + inode_unlock(inode); + +out: + if (hss != NULL) + OBD_FREE_PTR(hss); + + if (attr != NULL) + OBD_FREE_PTR(attr); + + RETURN(rc); +} + +static inline long ll_lease_type_from_fmode(fmode_t fmode) +{ + return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) | + ((fmode & FMODE_WRITE) ? 
LL_LEASE_WRLCK : 0); +} + +static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu) +{ + struct inode *inode = file_inode(file); + struct iattr ia = { + .ia_valid = ATTR_ATIME | ATTR_ATIME_SET | + ATTR_MTIME | ATTR_MTIME_SET | + ATTR_CTIME, + .ia_atime = { + .tv_sec = lfu->lfu_atime_sec, + .tv_nsec = lfu->lfu_atime_nsec, + }, + .ia_mtime = { + .tv_sec = lfu->lfu_mtime_sec, + .tv_nsec = lfu->lfu_mtime_nsec, + }, + .ia_ctime = { + .tv_sec = lfu->lfu_ctime_sec, + .tv_nsec = lfu->lfu_ctime_nsec, + }, + }; + int rc; + ENTRY; + + if (!capable(CAP_SYS_ADMIN)) + RETURN(-EPERM); + + if (!S_ISREG(inode->i_mode)) + RETURN(-EINVAL); + + inode_lock(inode); + rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET, + false); + inode_unlock(inode); + + RETURN(rc); +} + +static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode) +{ + switch (mode) { + case MODE_READ_USER: + return CLM_READ; + case MODE_WRITE_USER: + return CLM_WRITE; + default: + return -EINVAL; + } +} + +static const char *const user_lockname[] = LOCK_MODE_NAMES; + +/* Used to allow the upper layers of the client to request an LDLM lock + * without doing an actual read or write. + * + * Used for ladvise lockahead to manually request specific locks. + * + * \param[in] file file this ladvise lock request is on + * \param[in] ladvise ladvise struct describing this lock request + * + * \retval 0 success, no detailed result available (sync requests + * and requests sent to the server [not handled locally] + * cannot return detailed results) + * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request, + * see definitions for details. + * \retval negative negative errno on error + */ +int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise) +{ + struct lu_env *env = NULL; + struct cl_io *io = NULL; + struct cl_lock *lock = NULL; + struct cl_lock_descr *descr = NULL; + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = dentry->d_inode; + enum cl_lock_mode cl_mode; + off_t start = ladvise->lla_start; + off_t end = ladvise->lla_end; + int result; + __u16 refcheck; + + ENTRY; + + CDEBUG(D_VFSTRACE, + "Lock request: file=%pd, inode=%p, mode=%s start=%llu, end=%llu\n", + dentry, dentry->d_inode, + user_lockname[ladvise->lla_lockahead_mode], (__u64) start, + (__u64) end); + + cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode); + if (cl_mode < 0) + GOTO(out, result = cl_mode); + + /* Get IO environment */ + result = cl_io_get(inode, &env, &io, &refcheck); + if (result <= 0) + GOTO(out, result); + + result = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (result > 0) { + /* + * nothing to do for this io. This currently happens when + * stripe sub-object's are not yet created. 
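+		 * (cl_io_init() returned > 0 and ci_result carries the
+		 * outcome for the caller.)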
+ */ + result = io->ci_result; + } else if (result == 0) { + lock = vvp_env_lock(env); + descr = &lock->cll_descr; + + descr->cld_obj = io->ci_obj; + /* Convert byte offsets to pages */ + descr->cld_start = cl_index(io->ci_obj, start); + descr->cld_end = cl_index(io->ci_obj, end); + descr->cld_mode = cl_mode; + /* CEF_MUST is used because we do not want to convert a + * lockahead request to a lockless lock */ + descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND; + + if (ladvise->lla_peradvice_flags & LF_ASYNC) + descr->cld_enq_flags |= CEF_SPECULATIVE; + + result = cl_lock_request(env, io, lock); + + /* On success, we need to release the lock */ + if (result >= 0) + cl_lock_release(env, lock); + } + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + + /* -ECANCELED indicates a matching lock with a different extent + * was already present, and -EEXIST indicates a matching lock + * on exactly the same extent was already present. + * We convert them to positive values for userspace to make + * recognizing true errors easier. + * Note we can only return these detailed results on async requests, + * as sync requests look the same as i/o requests for locking. */ + if (result == -ECANCELED) + result = LLA_RESULT_DIFFERENT; + else if (result == -EEXIST) + result = LLA_RESULT_SAME; + +out: + RETURN(result); +} +static const char *const ladvise_names[] = LU_LADVISE_NAMES; + +static int ll_ladvise_sanity(struct inode *inode, + struct llapi_lu_ladvise *ladvise) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + enum lu_ladvise_type advice = ladvise->lla_advice; + /* Note the peradvice flags is a 32 bit field, so per advice flags must + * be in the first 32 bits of enum ladvise_flags */ + __u32 flags = ladvise->lla_peradvice_flags; + /* 3 lines at 80 characters per line, should be plenty */ + int rc = 0; + + if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) { + rc = -EINVAL; + CDEBUG(D_VFSTRACE, + "%s: advice with value '%d' not recognized, last supported advice is %s (value '%d'): rc = %d\n", + sbi->ll_fsname, advice, + ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc); + GOTO(out, rc); + } + + /* Per-advice checks */ + switch (advice) { + case LU_LADVISE_LOCKNOEXPAND: + if (flags & ~LF_LOCKNOEXPAND_MASK) { + rc = -EINVAL; + CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: " + "rc = %d\n", sbi->ll_fsname, flags, + ladvise_names[advice], rc); + GOTO(out, rc); + } + break; + case LU_LADVISE_LOCKAHEAD: + /* Currently only READ and WRITE modes can be requested */ + if (ladvise->lla_lockahead_mode >= MODE_MAX_USER || + ladvise->lla_lockahead_mode == 0) { + rc = -EINVAL; + CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: " + "rc = %d\n", sbi->ll_fsname, + ladvise->lla_lockahead_mode, + ladvise_names[advice], rc); + GOTO(out, rc); + } + fallthrough; + case LU_LADVISE_WILLREAD: + case LU_LADVISE_DONTNEED: + default: + /* Note fall through above - These checks apply to all advices + * except LOCKNOEXPAND */ + if (flags & ~LF_DEFAULT_MASK) { + rc = -EINVAL; + CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: " + "rc = %d\n", sbi->ll_fsname, flags, + ladvise_names[advice], rc); + GOTO(out, rc); + } + if (ladvise->lla_start >= ladvise->lla_end) { + rc = -EINVAL; + CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) " + "for %s: rc = %d\n", sbi->ll_fsname, + ladvise->lla_start, ladvise->lla_end, + ladvise_names[advice], rc); + GOTO(out, rc); + } + break; + } + +out: + return rc; +} +#undef ERRSIZE + +/* + * Give file access advices + * + * The ladvise interface is similar to Linux fadvise() 
system call, except it + * forwards the advices directly from Lustre client to server. The server side + * codes will apply appropriate read-ahead and caching techniques for the + * corresponding files. + * + * A typical workload for ladvise is e.g. a bunch of different clients are + * doing small random reads of a file, so prefetching pages into OSS cache + * with big linear reads before the random IO is a net benefit. Fetching + * all that data into each client cache with fadvise() may not be, due to + * much more data being sent to the client. + */ +static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags, + struct llapi_lu_ladvise *ladvise) +{ + struct lu_env *env; + struct cl_io *io; + struct cl_ladvise_io *lio; + int rc; + __u16 refcheck; + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = vvp_env_thread_io(env); + io->ci_obj = ll_i2info(inode)->lli_clob; + + /* initialize parameters for ladvise */ + lio = &io->u.ci_ladvise; + lio->li_start = ladvise->lla_start; + lio->li_end = ladvise->lla_end; + lio->li_fid = ll_inode2fid(inode); + lio->li_advice = ladvise->lla_advice; + lio->li_flags = flags; + + if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0) + rc = cl_io_loop(env, io); + else + rc = io->ci_result; + + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + RETURN(rc); +} + +static int ll_lock_noexpand(struct file *file, int flags) +{ + struct ll_file_data *fd = file->private_data; + + fd->ll_lock_no_expand = !(flags & LF_UNSET); + + return 0; +} + +int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd, + unsigned long arg) +{ + struct fsxattr fsxattr; + + if (copy_from_user(&fsxattr, + (const struct fsxattr __user *)arg, + sizeof(fsxattr))) + RETURN(-EFAULT); + + fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags); + if (test_bit(LLIF_PROJECT_INHERIT, &ll_i2info(inode)->lli_flags)) + fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT; + fsxattr.fsx_projid = ll_i2info(inode)->lli_projid; + if (copy_to_user((struct fsxattr __user *)arg, + &fsxattr, sizeof(fsxattr))) + RETURN(-EFAULT); + + RETURN(0); +} + +int ll_ioctl_check_project(struct inode *inode, __u32 xflags, + __u32 projid) +{ + /* + * Project Quota ID state is only allowed to change from within the init + * namespace. Enforce that restriction only if we are trying to change + * the quota ID state. Everything else is allowed in user namespaces. + */ + if (current_user_ns() == &init_user_ns) { + /* + * Caller is allowed to change the project ID. if it is being + * changed, make sure that the new value is valid. 
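+		 * (projid_valid() below rejects the reserved invalid
+		 * projid of (__u32)-1.)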
+ */ + if (ll_i2info(inode)->lli_projid != projid && + !projid_valid(make_kprojid(&init_user_ns, projid))) + return -EINVAL; + + return 0; + } + + if (ll_i2info(inode)->lli_projid != projid) + return -EINVAL; + + if (test_bit(LLIF_PROJECT_INHERIT, &ll_i2info(inode)->lli_flags)) { + if (!(xflags & FS_XFLAG_PROJINHERIT)) + return -EINVAL; + } else { + if (xflags & FS_XFLAG_PROJINHERIT) + return -EINVAL; + } + + return 0; +} + +static int ll_set_project(struct inode *inode, __u32 xflags, __u32 projid) +{ + struct md_op_data *op_data; + struct ptlrpc_request *req = NULL; + struct cl_object *obj; + unsigned int inode_flags; + int rc = 0; + + rc = ll_ioctl_check_project(inode, xflags, projid); + if (rc) + RETURN(rc); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + inode_flags = ll_xflags_to_inode_flags(xflags); + op_data->op_attr_flags = ll_inode_to_ext_flags(inode_flags); + if (xflags & FS_XFLAG_PROJINHERIT) + op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL; + op_data->op_projid = projid; + op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS; + rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL, 0, &req); + ptlrpc_req_finished(req); + if (rc) + GOTO(out_fsxattr, rc); + ll_update_inode_flags(inode, op_data->op_attr_flags); + + /* Avoid OST RPC if this is only ioctl setting project inherit flag */ + if (xflags == 0 || xflags == FS_XFLAG_PROJINHERIT) + GOTO(out_fsxattr, rc); + + obj = ll_i2info(inode)->lli_clob; + if (obj) { + struct iattr attr = { 0 }; + + rc = cl_setattr_ost(obj, &attr, OP_XVALID_FLAGS, xflags); + } + +out_fsxattr: + ll_finish_md_op_data(op_data); + RETURN(rc); +} + +int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd, + unsigned long arg) +{ + struct fsxattr fsxattr; + + ENTRY; + + if (copy_from_user(&fsxattr, + (const struct fsxattr __user *)arg, + sizeof(fsxattr))) + RETURN(-EFAULT); + + RETURN(ll_set_project(inode, fsxattr.fsx_xflags, + fsxattr.fsx_projid)); +} + +int ll_ioctl_project(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct lu_project lu_project; + struct dentry *dentry = file_dentry(file); + struct inode *inode = file_inode(file); + struct dentry *child_dentry = NULL; + int rc = 0, name_len; + + if (copy_from_user(&lu_project, + (const struct lu_project __user *)arg, + sizeof(lu_project))) + RETURN(-EFAULT); + + /* apply child dentry if name is valid */ + name_len = strnlen(lu_project.project_name, NAME_MAX); + if (name_len > 0 && name_len <= NAME_MAX) { + inode_lock(inode); + child_dentry = lookup_one_len(lu_project.project_name, + dentry, name_len); + inode_unlock(inode); + if (IS_ERR(child_dentry)) { + rc = PTR_ERR(child_dentry); + goto out; + } + inode = child_dentry->d_inode; + if (!inode) { + rc = -ENOENT; + goto out; + } + } else if (name_len > NAME_MAX) { + rc = -EINVAL; + goto out; + } + + switch (lu_project.project_type) { + case LU_PROJECT_SET: + rc = ll_set_project(inode, lu_project.project_xflags, + lu_project.project_id); + break; + case LU_PROJECT_GET: + lu_project.project_xflags = + ll_inode_flags_to_xflags(inode->i_flags); + if (test_bit(LLIF_PROJECT_INHERIT, + &ll_i2info(inode)->lli_flags)) + lu_project.project_xflags |= FS_XFLAG_PROJINHERIT; + lu_project.project_id = ll_i2info(inode)->lli_projid; + if (copy_to_user((struct lu_project __user *)arg, + &lu_project, sizeof(lu_project))) { + rc = -EFAULT; + goto out; + } + break; + default: + rc = -EINVAL; + break; + } +out: + if (!IS_ERR_OR_NULL(child_dentry)) + 
dput(child_dentry); + RETURN(rc); +} + +static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc, + unsigned long arg) +{ + struct inode *inode = file_inode(file); + struct ll_file_data *fd = file->private_data; + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_client_handle *och = NULL; + struct split_param sp; + struct pcc_param param; + bool lease_broken = false; + fmode_t fmode = 0; + enum mds_op_bias bias = 0; + struct file *layout_file = NULL; + void *data = NULL; + size_t data_size = 0; + bool attached = false; + long rc, rc2 = 0; + + ENTRY; + + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_lease_och != NULL) { + och = fd->fd_lease_och; + fd->fd_lease_och = NULL; + } + mutex_unlock(&lli->lli_och_mutex); + + if (och == NULL) + RETURN(-ENOLCK); + + fmode = och->och_flags; + + switch (ioc->lil_flags) { + case LL_LEASE_RESYNC_DONE: + if (ioc->lil_count > IOC_IDS_MAX) + GOTO(out_lease_close, rc = -EINVAL); + + data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]); + OBD_ALLOC(data, data_size); + if (!data) + GOTO(out_lease_close, rc = -ENOMEM); + + if (copy_from_user(data, (void __user *)arg, data_size)) + GOTO(out_lease_close, rc = -EFAULT); + + bias = MDS_CLOSE_RESYNC_DONE; + break; + case LL_LEASE_LAYOUT_MERGE: { + int fd; + + if (ioc->lil_count != 1) + GOTO(out_lease_close, rc = -EINVAL); + + arg += sizeof(*ioc); + if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32))) + GOTO(out_lease_close, rc = -EFAULT); + + layout_file = fget(fd); + if (!layout_file) + GOTO(out_lease_close, rc = -EBADF); + + if ((file->f_flags & O_ACCMODE) == O_RDONLY || + (layout_file->f_flags & O_ACCMODE) == O_RDONLY) + GOTO(out_lease_close, rc = -EPERM); + + data = file_inode(layout_file); + bias = MDS_CLOSE_LAYOUT_MERGE; + break; + } + case LL_LEASE_LAYOUT_SPLIT: { + int fdv; + int mirror_id; + + if (ioc->lil_count != 2) + GOTO(out_lease_close, rc = -EINVAL); + + arg += sizeof(*ioc); + if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32))) + GOTO(out_lease_close, rc = -EFAULT); + + arg += sizeof(__u32); + if (copy_from_user(&mirror_id, (void __user *)arg, + sizeof(__u32))) + GOTO(out_lease_close, rc = -EFAULT); + + layout_file = fget(fdv); + if (!layout_file) + GOTO(out_lease_close, rc = -EBADF); + + /* if layout_file == file, it means to destroy the mirror */ + sp.sp_inode = file_inode(layout_file); + sp.sp_mirror_id = (__u16)mirror_id; + data = &sp; + bias = MDS_CLOSE_LAYOUT_SPLIT; + break; + } + case LL_LEASE_PCC_ATTACH: + if (ioc->lil_count != 1) + RETURN(-EINVAL); + + if (IS_ENCRYPTED(inode)) + RETURN(-EOPNOTSUPP); + + arg += sizeof(*ioc); + if (copy_from_user(¶m.pa_archive_id, (void __user *)arg, + sizeof(__u32))) + GOTO(out_lease_close, rc2 = -EFAULT); + + rc2 = pcc_readwrite_attach(file, inode, param.pa_archive_id); + if (rc2) + GOTO(out_lease_close, rc2); + + attached = true; + /* Grab latest data version */ + rc2 = ll_data_version(inode, ¶m.pa_data_version, + LL_DV_WR_FLUSH); + if (rc2) + GOTO(out_lease_close, rc2); + + data = ¶m; + bias = MDS_PCC_ATTACH; + break; + default: + /* without close intent */ + break; + } + +out_lease_close: + rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data); + if (rc < 0) + GOTO(out, rc); + + rc = ll_lease_och_release(inode, file); + if (rc < 0) + GOTO(out, rc); + + if (lease_broken) + fmode = 0; + EXIT; + +out: + switch (ioc->lil_flags) { + case LL_LEASE_RESYNC_DONE: + if (data) + OBD_FREE(data, data_size); + break; + case LL_LEASE_LAYOUT_MERGE: + case LL_LEASE_LAYOUT_SPLIT: + if (layout_file) + 
fput(layout_file); + + ll_layout_refresh(inode, &fd->fd_layout_version); + break; + case LL_LEASE_PCC_ATTACH: + if (!rc) + rc = rc2; + rc = pcc_readwrite_attach_fini(file, inode, + param.pa_layout_gen, + lease_broken, rc, + attached); + break; + } + + if (!rc) + rc = ll_lease_type_from_fmode(fmode); + RETURN(rc); +} + +static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc, + unsigned long arg) +{ + struct inode *inode = file_inode(file); + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = file->private_data; + struct obd_client_handle *och = NULL; + __u64 open_flags = 0; + bool lease_broken; + fmode_t fmode; + long rc; + ENTRY; + + switch (ioc->lil_mode) { + case LL_LEASE_WRLCK: + if (!(file->f_mode & FMODE_WRITE)) + RETURN(-EPERM); + fmode = FMODE_WRITE; + break; + case LL_LEASE_RDLCK: + if (!(file->f_mode & FMODE_READ)) + RETURN(-EPERM); + fmode = FMODE_READ; + break; + case LL_LEASE_UNLCK: + RETURN(ll_file_unlock_lease(file, ioc, arg)); + default: + RETURN(-EINVAL); + } + + CDEBUG(D_INODE, "Set lease with mode %u\n", fmode); + + /* apply for lease */ + if (ioc->lil_flags & LL_LEASE_RESYNC) + open_flags = MDS_OPEN_RESYNC; + och = ll_lease_open(inode, file, fmode, open_flags); + if (IS_ERR(och)) + RETURN(PTR_ERR(och)); + + if (ioc->lil_flags & LL_LEASE_RESYNC) { + rc = ll_lease_file_resync(och, inode, arg); + if (rc) { + ll_lease_close(och, inode, NULL); + RETURN(rc); + } + rc = ll_layout_refresh(inode, &fd->fd_layout_version); + if (rc) { + ll_lease_close(och, inode, NULL); + RETURN(rc); + } + } + + rc = 0; + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_lease_och == NULL) { + fd->fd_lease_och = och; + och = NULL; + } + mutex_unlock(&lli->lli_och_mutex); + if (och != NULL) { + /* impossible now that only excl is supported for now */ + ll_lease_close(och, inode, &lease_broken); + rc = -EBUSY; + } + RETURN(rc); +} + +static void ll_heat_get(struct inode *inode, struct lu_heat *heat) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + __u64 now = ktime_get_real_seconds(); + int i; + + spin_lock(&lli->lli_heat_lock); + heat->lh_flags = lli->lli_heat_flags; + for (i = 0; i < heat->lh_count; i++) + heat->lh_heat[i] = obd_heat_get(&lli->lli_heat_instances[i], + now, sbi->ll_heat_decay_weight, + sbi->ll_heat_period_second); + spin_unlock(&lli->lli_heat_lock); +} + +static int ll_heat_set(struct inode *inode, enum lu_heat_flag flags) +{ + struct ll_inode_info *lli = ll_i2info(inode); + int rc = 0; + + spin_lock(&lli->lli_heat_lock); + if (flags & LU_HEAT_FLAG_CLEAR) + obd_heat_clear(lli->lli_heat_instances, OBD_HEAT_COUNT); + + if (flags & LU_HEAT_FLAG_OFF) + lli->lli_heat_flags |= LU_HEAT_FLAG_OFF; + else + lli->lli_heat_flags &= ~LU_HEAT_FLAG_OFF; + + spin_unlock(&lli->lli_heat_lock); + + RETURN(rc); +} + +static long +ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(file); + struct ll_file_data *fd = file->private_data; + int flags, rc; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n", + PFID(ll_inode2fid(inode)), inode, cmd); + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1); + + /* asm-ppc{,64} declares TCGETS, et. al. 
as type 't' not 'T' */ + if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */ + RETURN(-ENOTTY); + + switch (cmd) { + case LL_IOC_GETFLAGS: + /* Get the current value of the file flags */ + return put_user(fd->fd_flags, (int __user *)arg); + case LL_IOC_SETFLAGS: + case LL_IOC_CLRFLAGS: + /* Set or clear specific file flags */ + /* XXX This probably needs checks to ensure the flags are + * not abused, and to handle any flag side effects. + */ + if (get_user(flags, (int __user *) arg)) + RETURN(-EFAULT); + + if (cmd == LL_IOC_SETFLAGS) { + if ((flags & LL_FILE_IGNORE_LOCK) && + !(file->f_flags & O_DIRECT)) { + CERROR("%s: unable to disable locking on " + "non-O_DIRECT file\n", current->comm); + RETURN(-EINVAL); + } + + fd->fd_flags |= flags; + } else { + fd->fd_flags &= ~flags; + } + RETURN(0); + case LL_IOC_LOV_SETSTRIPE: + case LL_IOC_LOV_SETSTRIPE_NEW: + RETURN(ll_lov_setstripe(inode, file, (void __user *)arg)); + case LL_IOC_LOV_SETEA: + RETURN(ll_lov_setea(inode, file, (void __user *)arg)); + case LL_IOC_LOV_SWAP_LAYOUTS: { + struct file *file2; + struct lustre_swap_layouts lsl; + + if (copy_from_user(&lsl, (char __user *)arg, + sizeof(struct lustre_swap_layouts))) + RETURN(-EFAULT); + + if ((file->f_flags & O_ACCMODE) == O_RDONLY) + RETURN(-EPERM); + + file2 = fget(lsl.sl_fd); + if (file2 == NULL) + RETURN(-EBADF); + + /* O_WRONLY or O_RDWR */ + if ((file2->f_flags & O_ACCMODE) == O_RDONLY) + GOTO(out, rc = -EPERM); + + if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) { + struct inode *inode2; + struct ll_inode_info *lli; + struct obd_client_handle *och = NULL; + + lli = ll_i2info(inode); + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_lease_och != NULL) { + och = fd->fd_lease_och; + fd->fd_lease_och = NULL; + } + mutex_unlock(&lli->lli_och_mutex); + if (och == NULL) + GOTO(out, rc = -ENOLCK); + inode2 = file_inode(file2); + rc = ll_swap_layouts_close(och, inode, inode2); + } else { + rc = ll_swap_layouts(file, file2, &lsl); + } +out: + fput(file2); + RETURN(rc); + } + case LL_IOC_LOV_GETSTRIPE: + case LL_IOC_LOV_GETSTRIPE_NEW: + RETURN(ll_file_getstripe(inode, (void __user *)arg, 0)); + case FS_IOC_GETFLAGS: + case FS_IOC_SETFLAGS: + RETURN(ll_iocontrol(inode, file, cmd, arg)); + case FSFILT_IOC_GETVERSION: + case FS_IOC_GETVERSION: + RETURN(put_user(inode->i_generation, (int __user *)arg)); + /* We need to special case any other ioctls we want to handle, + * to send them to the MDS/OST as appropriate and to properly + * network encode the arg field. 
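+	 *
+	 * For reference, the LL_IOC_DATA_VERSION case above is what backs
+	 * liblustreapi's llapi_get_data_version(); a minimal userspace
+	 * sketch (hedged, assuming <lustre/lustreapi.h> and an open fd):
+	 *
+	 *	__u64 dv;
+	 *
+	 *	if (llapi_get_data_version(fd, &dv, LL_DV_RD_FLUSH) == 0)
+	 *		printf("data version: %llu\n", (unsigned long long)dv);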
+	 */
+	case FS_IOC_SETVERSION:
+		RETURN(-ENOTSUPP);
+
+	case LL_IOC_GROUP_LOCK:
+		RETURN(ll_get_grouplock(inode, file, arg));
+	case LL_IOC_GROUP_UNLOCK:
+		RETURN(ll_put_grouplock(inode, file, arg));
+	case IOC_OBD_STATFS:
+		RETURN(ll_obd_statfs(inode, (void __user *)arg));
+
+	case LL_IOC_FLUSHCTX:
+		RETURN(ll_flush_ctx(inode));
+	case LL_IOC_PATH2FID: {
+		if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
+				 sizeof(struct lu_fid)))
+			RETURN(-EFAULT);
+
+		RETURN(0);
+	}
+	case LL_IOC_GETPARENT:
+		RETURN(ll_getparent(file, (struct getparent __user *)arg));
+
+	case OBD_IOC_FID2PATH:
+		RETURN(ll_fid2path(inode, (void __user *)arg));
+	case LL_IOC_DATA_VERSION: {
+		struct ioc_data_version idv;
+		int rc;
+
+		if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
+			RETURN(-EFAULT);
+
+		idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
+		rc = ll_ioc_data_version(inode, &idv);
+
+		if (rc == 0 &&
+		    copy_to_user((char __user *)arg, &idv, sizeof(idv)))
+			RETURN(-EFAULT);
+
+		RETURN(rc);
+	}
+
+	case LL_IOC_GET_MDTIDX: {
+		int mdtidx;
+
+		mdtidx = ll_get_mdt_idx(inode);
+		if (mdtidx < 0)
+			RETURN(mdtidx);
+
+		if (put_user((int)mdtidx, (int __user *)arg))
+			RETURN(-EFAULT);
+
+		RETURN(0);
+	}
+	case OBD_IOC_GETNAME_OLD:
+	case OBD_IOC_GETDTNAME:
+	case OBD_IOC_GETMDNAME:
+		RETURN(ll_get_obd_name(inode, cmd, arg));
+	case LL_IOC_HSM_STATE_GET: {
+		struct md_op_data *op_data;
+		struct hsm_user_state *hus;
+		int rc;
+
+		OBD_ALLOC_PTR(hus);
+		if (hus == NULL)
+			RETURN(-ENOMEM);
+
+		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+					     LUSTRE_OPC_ANY, hus);
+		if (IS_ERR(op_data)) {
+			OBD_FREE_PTR(hus);
+			RETURN(PTR_ERR(op_data));
+		}
+
+		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
+				   op_data, NULL);
+
+		if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
+			rc = -EFAULT;
+
+		ll_finish_md_op_data(op_data);
+		OBD_FREE_PTR(hus);
+		RETURN(rc);
+	}
+	case LL_IOC_HSM_STATE_SET: {
+		struct hsm_state_set *hss;
+		int rc;
+
+		OBD_ALLOC_PTR(hss);
+		if (hss == NULL)
+			RETURN(-ENOMEM);
+
+		if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
+			OBD_FREE_PTR(hss);
+			RETURN(-EFAULT);
+		}
+
+		rc = ll_hsm_state_set(inode, hss);
+
+		OBD_FREE_PTR(hss);
+		RETURN(rc);
+	}
+	case LL_IOC_HSM_ACTION: {
+		struct md_op_data *op_data;
+		struct hsm_current_action *hca;
+		const char *action;
+		int rc;
+
+		OBD_ALLOC_PTR(hca);
+		if (hca == NULL)
+			RETURN(-ENOMEM);
+
+		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+					     LUSTRE_OPC_ANY, hca);
+		if (IS_ERR(op_data)) {
+			OBD_FREE_PTR(hca);
+			RETURN(PTR_ERR(op_data));
+		}
+
+		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
+				   op_data, NULL);
+		if (rc < 0)
+			GOTO(skip_copy, rc);
+
+		/* The hsm_current_action retrieved from the server could
+		 * contain corrupt information. If it is incorrect, collect
+		 * debug information. We still send the data to userland to
+		 * handle, even if incorrect.
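+		 *
+		 * Userspace normally reaches this handler through
+		 * llapi_hsm_current_action(); a minimal sketch (hedged,
+		 * assuming <lustre/lustreapi.h>):
+		 *
+		 *	struct hsm_current_action hca;
+		 *
+		 *	if (llapi_hsm_current_action(path, &hca) == 0 &&
+		 *	    hca.hca_state <= HPS_DONE)
+		 *		printf("HSM state: %d\n", hca.hca_state);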
+ */ + action = hsm_user_action2name(hca->hca_action); + if (strcmp(action, "UNKNOWN") == 0 || + hca->hca_state > HPS_DONE) { + CDEBUG(D_HSM, + "HSM current state %s action %s, offset = %llu, length %llu\n", + hsm_progress_state2name(hca->hca_state), action, + hca->hca_location.offset, hca->hca_location.length); + } + + if (copy_to_user((char __user *)arg, hca, sizeof(*hca))) + rc = -EFAULT; +skip_copy: + ll_finish_md_op_data(op_data); + OBD_FREE_PTR(hca); + RETURN(rc); + } + case LL_IOC_SET_LEASE_OLD: { + struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg }; + + RETURN(ll_file_set_lease(file, &ioc, 0)); + } + case LL_IOC_SET_LEASE: { + struct ll_ioc_lease ioc; + + if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc))) + RETURN(-EFAULT); + + RETURN(ll_file_set_lease(file, &ioc, arg)); + } + case LL_IOC_GET_LEASE: { + struct ll_inode_info *lli = ll_i2info(inode); + struct ldlm_lock *lock = NULL; + fmode_t fmode = 0; + + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_lease_och != NULL) { + struct obd_client_handle *och = fd->fd_lease_och; + + lock = ldlm_handle2lock(&och->och_lease_handle); + if (lock != NULL) { + lock_res_and_lock(lock); + if (!ldlm_is_cancel(lock)) + fmode = och->och_flags; + + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + } + } + mutex_unlock(&lli->lli_och_mutex); + + RETURN(ll_lease_type_from_fmode(fmode)); + } + case LL_IOC_HSM_IMPORT: { + struct hsm_user_import *hui; + + OBD_ALLOC_PTR(hui); + if (hui == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) { + OBD_FREE_PTR(hui); + RETURN(-EFAULT); + } + + rc = ll_hsm_import(inode, file, hui); + + OBD_FREE_PTR(hui); + RETURN(rc); + } + case LL_IOC_FUTIMES_3: { + struct ll_futimes_3 lfu; + + if (copy_from_user(&lfu, + (const struct ll_futimes_3 __user *)arg, + sizeof(lfu))) + RETURN(-EFAULT); + + RETURN(ll_file_futimes_3(file, &lfu)); + } + case LL_IOC_LADVISE: { + struct llapi_ladvise_hdr *k_ladvise_hdr; + struct llapi_ladvise_hdr __user *u_ladvise_hdr; + int i; + int num_advise; + int alloc_size = sizeof(*k_ladvise_hdr); + + rc = 0; + u_ladvise_hdr = (void __user *)arg; + OBD_ALLOC_PTR(k_ladvise_hdr); + if (k_ladvise_hdr == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size)) + GOTO(out_ladvise, rc = -EFAULT); + + if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC || + k_ladvise_hdr->lah_count < 1) + GOTO(out_ladvise, rc = -EINVAL); + + num_advise = k_ladvise_hdr->lah_count; + if (num_advise >= LAH_COUNT_MAX) + GOTO(out_ladvise, rc = -EFBIG); + + OBD_FREE_PTR(k_ladvise_hdr); + alloc_size = offsetof(typeof(*k_ladvise_hdr), + lah_advise[num_advise]); + OBD_ALLOC(k_ladvise_hdr, alloc_size); + if (k_ladvise_hdr == NULL) + RETURN(-ENOMEM); + + /* + * TODO: submit multiple advices to one server in a single RPC + */ + if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size)) + GOTO(out_ladvise, rc = -EFAULT); + + for (i = 0; i < num_advise; i++) { + struct llapi_lu_ladvise *k_ladvise = + &k_ladvise_hdr->lah_advise[i]; + struct llapi_lu_ladvise __user *u_ladvise = + &u_ladvise_hdr->lah_advise[i]; + + rc = ll_ladvise_sanity(inode, k_ladvise); + if (rc) + GOTO(out_ladvise, rc); + + switch (k_ladvise->lla_advice) { + case LU_LADVISE_LOCKNOEXPAND: + rc = ll_lock_noexpand(file, + k_ladvise->lla_peradvice_flags); + GOTO(out_ladvise, rc); + case LU_LADVISE_LOCKAHEAD: + + rc = ll_file_lock_ahead(file, k_ladvise); + + if (rc < 0) + GOTO(out_ladvise, rc); + + if (put_user(rc, + &u_ladvise->lla_lockahead_result)) + GOTO(out_ladvise, rc = -EFAULT); + break; + 
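+			/*
+			 * For reference, a userspace sketch that drives the
+			 * LU_LADVISE_LOCKAHEAD arm above through
+			 * llapi_ladvise() (hedged, assuming
+			 * <lustre/lustreapi.h>):
+			 *
+			 *	struct llapi_lu_ladvise adv = {
+			 *		.lla_advice = LU_LADVISE_LOCKAHEAD,
+			 *		.lla_lockahead_mode = MODE_READ_USER,
+			 *		.lla_peradvice_flags = LF_ASYNC,
+			 *		.lla_start = 0,
+			 *		.lla_end = 1024 * 1024,
+			 *	};
+			 *
+			 *	llapi_ladvise(fd, 0, 1, &adv);
+			 */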
default: + rc = ll_ladvise(inode, file, + k_ladvise_hdr->lah_flags, + k_ladvise); + if (rc) + GOTO(out_ladvise, rc); + break; + } + + } + +out_ladvise: + OBD_FREE(k_ladvise_hdr, alloc_size); + RETURN(rc); + } + case LL_IOC_FLR_SET_MIRROR: { + /* mirror I/O must be direct to avoid polluting page cache + * by stale data. */ + if (!(file->f_flags & O_DIRECT)) + RETURN(-EINVAL); + + fd->fd_designated_mirror = (__u32)arg; + RETURN(0); + } + case FS_IOC_FSGETXATTR: + RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg)); + case FS_IOC_FSSETXATTR: + RETURN(ll_ioctl_fssetxattr(inode, cmd, arg)); + case LL_IOC_PROJECT: + RETURN(ll_ioctl_project(file, cmd, arg)); + case BLKSSZGET: + RETURN(put_user(PAGE_SIZE, (int __user *)arg)); + case LL_IOC_HEAT_GET: { + struct lu_heat uheat; + struct lu_heat *heat; + int size; + + if (copy_from_user(&uheat, (void __user *)arg, sizeof(uheat))) + RETURN(-EFAULT); + + if (uheat.lh_count > OBD_HEAT_COUNT) + uheat.lh_count = OBD_HEAT_COUNT; + + size = offsetof(typeof(uheat), lh_heat[uheat.lh_count]); + OBD_ALLOC(heat, size); + if (heat == NULL) + RETURN(-ENOMEM); + + heat->lh_count = uheat.lh_count; + ll_heat_get(inode, heat); + rc = copy_to_user((char __user *)arg, heat, size); + OBD_FREE(heat, size); + RETURN(rc ? -EFAULT : 0); + } + case LL_IOC_HEAT_SET: { + __u64 flags; + + if (copy_from_user(&flags, (void __user *)arg, sizeof(flags))) + RETURN(-EFAULT); + + rc = ll_heat_set(inode, flags); + RETURN(rc); + } + case LL_IOC_PCC_DETACH: { + struct lu_pcc_detach *detach; + + OBD_ALLOC_PTR(detach); + if (detach == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(detach, + (const struct lu_pcc_detach __user *)arg, + sizeof(*detach))) + GOTO(out_detach_free, rc = -EFAULT); + + if (!S_ISREG(inode->i_mode)) + GOTO(out_detach_free, rc = -EINVAL); + + if (!inode_owner_or_capable(&init_user_ns, inode)) + GOTO(out_detach_free, rc = -EPERM); + + rc = pcc_ioctl_detach(inode, detach->pccd_opt); +out_detach_free: + OBD_FREE_PTR(detach); + RETURN(rc); + } + case LL_IOC_PCC_STATE: { + struct lu_pcc_state __user *ustate = + (struct lu_pcc_state __user *)arg; + struct lu_pcc_state *state; + + OBD_ALLOC_PTR(state); + if (state == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(state, ustate, sizeof(*state))) + GOTO(out_state, rc = -EFAULT); + + rc = pcc_ioctl_state(file, inode, state); + if (rc) + GOTO(out_state, rc); + + if (copy_to_user(ustate, state, sizeof(*state))) + GOTO(out_state, rc = -EFAULT); + +out_state: + OBD_FREE_PTR(state); + RETURN(rc); + } +#ifdef HAVE_LUSTRE_CRYPTO + case LL_IOC_SET_ENCRYPTION_POLICY: + if (!ll_sbi_has_encrypt(ll_i2sbi(inode))) + return -EOPNOTSUPP; + return llcrypt_ioctl_set_policy(file, (const void __user *)arg); + case LL_IOC_GET_ENCRYPTION_POLICY_EX: + if (!ll_sbi_has_encrypt(ll_i2sbi(inode))) + return -EOPNOTSUPP; + return llcrypt_ioctl_get_policy_ex(file, (void __user *)arg); + case LL_IOC_ADD_ENCRYPTION_KEY: + if (!ll_sbi_has_encrypt(ll_i2sbi(inode))) + return -EOPNOTSUPP; + return llcrypt_ioctl_add_key(file, (void __user *)arg); + case LL_IOC_REMOVE_ENCRYPTION_KEY: + if (!ll_sbi_has_encrypt(ll_i2sbi(inode))) + return -EOPNOTSUPP; + return llcrypt_ioctl_remove_key(file, (void __user *)arg); + case LL_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + if (!ll_sbi_has_encrypt(ll_i2sbi(inode))) + return -EOPNOTSUPP; + return llcrypt_ioctl_remove_key_all_users(file, + (void __user *)arg); + case LL_IOC_GET_ENCRYPTION_KEY_STATUS: + if (!ll_sbi_has_encrypt(ll_i2sbi(inode))) + return -EOPNOTSUPP; + return llcrypt_ioctl_get_key_status(file, (void __user *)arg); +#endif + + 
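+	/*
+	 * The encryption cases above mirror the upstream fscrypt ioctls;
+	 * a hedged userspace sketch (struct and mode names as in
+	 * <linux/fscrypt.h>; the exact policy argument accepted here is
+	 * an assumption):
+	 *
+	 *	struct fscrypt_policy_v1 pol = {
+	 *		.version = 0,
+	 *		.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS,
+	 *		.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS,
+	 *	};
+	 *
+	 *	ioctl(dirfd, LL_IOC_SET_ENCRYPTION_POLICY, &pol);
+	 */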
case LL_IOC_UNLOCK_FOREIGN: { + struct dentry *dentry = file_dentry(file); + + /* if not a foreign symlink do nothing */ + if (ll_foreign_is_removable(dentry, true)) { + CDEBUG(D_INFO, + "prevent unlink of non-foreign file ("DFID")\n", + PFID(ll_inode2fid(inode))); + RETURN(-EOPNOTSUPP); + } + RETURN(0); + } + + default: + RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL, + (void __user *)arg)); + } +} + +loff_t ll_lseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file_inode(file); + struct lu_env *env; + struct cl_io *io; + struct cl_lseek_io *lsio; + __u16 refcheck; + int rc; + loff_t retval; + + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = vvp_env_thread_io(env); + io->ci_obj = ll_i2info(inode)->lli_clob; + ll_io_set_mirror(io, file); + + lsio = &io->u.ci_lseek; + lsio->ls_start = offset; + lsio->ls_whence = whence; + lsio->ls_result = -ENXIO; + + do { + rc = cl_io_init(env, io, CIT_LSEEK, io->ci_obj); + if (!rc) { + struct vvp_io *vio = vvp_env_io(env); + + vio->vui_fd = file->private_data; + rc = cl_io_loop(env, io); + } else { + rc = io->ci_result; + } + retval = rc ? : lsio->ls_result; + cl_io_fini(env, io); + } while (unlikely(io->ci_need_restart)); + + cl_env_put(env, &refcheck); + + /* Without the key, SEEK_HOLE return value has to be + * rounded up to next LUSTRE_ENCRYPTION_UNIT_SIZE. + */ + if (llcrypt_require_key(inode) == -ENOKEY && whence == SEEK_HOLE) + retval = round_up(retval, LUSTRE_ENCRYPTION_UNIT_SIZE); + + RETURN(retval); +} + +static loff_t ll_file_seek(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file_inode(file); + loff_t retval = offset, eof = 0; + ktime_t kstart = ktime_get(); + + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n", + PFID(ll_inode2fid(inode)), inode, retval, retval, + origin); + + if (origin == SEEK_END) { + retval = ll_glimpse_size(inode); + if (retval != 0) + RETURN(retval); + eof = i_size_read(inode); + } + + if (origin == SEEK_HOLE || origin == SEEK_DATA) { + if (offset < 0) + return -ENXIO; + + /* flush local cache first if any */ + cl_sync_file_range(inode, offset, OBD_OBJECT_EOF, + CL_FSYNC_LOCAL, 0); + + retval = ll_lseek(file, offset, origin); + if (retval < 0) + return retval; + retval = vfs_setpos(file, retval, ll_file_maxbytes(inode)); + } else { + retval = generic_file_llseek_size(file, offset, origin, + ll_file_maxbytes(inode), eof); + } + if (retval >= 0) + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, + ktime_us_delta(ktime_get(), kstart)); + RETURN(retval); +} + +static int ll_flush(struct file *file, fl_owner_t id) +{ + struct inode *inode = file_inode(file); + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = file->private_data; + int rc, err; + + LASSERT(!S_ISDIR(inode->i_mode)); + + /* catch async errors that were recorded back when async writeback + * failed for pages in this mapping. */ + rc = lli->lli_async_rc; + lli->lli_async_rc = 0; + if (lli->lli_clob != NULL) { + err = lov_read_and_clear_async_rc(lli->lli_clob); + if (rc == 0) + rc = err; + } + + /* The application has been told write failure already. + * Do not report failure again. */ + if (fd->fd_write_failed) + return 0; + return rc ? -EIO : 0; +} + +/** + * Called to make sure a portion of file has been written out. + * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST. + * + * Return how many pages have been written. 
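+ *
+ * A typical in-kernel use, mirroring the SEEK_HOLE path above, flushes
+ * the local cache for the whole file (sketch):
+ *
+ *	int nr = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
+ *				    CL_FSYNC_LOCAL, 0);
+ *	if (nr < 0)
+ *		return nr;
+ *	(a non-negative return is the number of pages written)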
+ */ +int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end, + enum cl_fsync_mode mode, int ignore_layout) +{ + struct lu_env *env; + struct cl_io *io; + struct cl_fsync_io *fio; + int result; + __u16 refcheck; + ENTRY; + + if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL && + mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL) + RETURN(-EINVAL); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = vvp_env_thread_io(env); + io->ci_obj = ll_i2info(inode)->lli_clob; + io->ci_ignore_layout = ignore_layout; + + /* initialize parameters for sync */ + fio = &io->u.ci_fsync; + fio->fi_start = start; + fio->fi_end = end; + fio->fi_fid = ll_inode2fid(inode); + fio->fi_mode = mode; + fio->fi_nr_written = 0; + + if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0) + result = cl_io_loop(env, io); + else + result = io->ci_result; + if (result == 0) + result = fio->fi_nr_written; + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + + RETURN(result); +} + +/* + * When dentry is provided (the 'else' case), file_dentry() may be + * null and dentry must be used directly rather than pulled from + * file_dentry() as is done otherwise. + */ + +int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct dentry *dentry = file_dentry(file); + struct inode *inode = dentry->d_inode; + struct ll_inode_info *lli = ll_i2info(inode); + struct ptlrpc_request *req; + ktime_t kstart = ktime_get(); + int rc, err; + + ENTRY; + + CDEBUG(D_VFSTRACE, + "VFS Op:inode="DFID"(%p), start %lld, end %lld, datasync %d\n", + PFID(ll_inode2fid(inode)), inode, start, end, datasync); + + /* fsync's caller has already called _fdata{sync,write}, we want + * that IO to finish before calling the osc and mdc sync methods */ + rc = filemap_write_and_wait_range(inode->i_mapping, start, end); + + /* catch async errors that were recorded back when async writeback + * failed for pages in this mapping. */ + if (!S_ISDIR(inode->i_mode)) { + err = lli->lli_async_rc; + lli->lli_async_rc = 0; + if (rc == 0) + rc = err; + if (lli->lli_clob != NULL) { + err = lov_read_and_clear_async_rc(lli->lli_clob); + if (rc == 0) + rc = err; + } + } + + err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req); + if (!rc) + rc = err; + if (!err) + ptlrpc_req_finished(req); + + if (S_ISREG(inode->i_mode)) { + struct ll_file_data *fd = file->private_data; + bool cached; + + /* Sync metadata on MDT first, and then sync the cached data + * on PCC. 
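+		 * If the file is not cached on PCC (pcc_fsync() sets
+		 * *cached to false), fall back to a CL_FSYNC_ALL range sync
+		 * so the OST objects are flushed as well.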
+		 */
+		err = pcc_fsync(file, start, end, datasync, &cached);
+		if (!cached)
+			err = cl_sync_file_range(inode, start, end,
+						 CL_FSYNC_ALL, 0);
+		if (rc == 0 && err < 0)
+			rc = err;
+		if (rc < 0)
+			fd->fd_write_failed = true;
+		else
+			fd->fd_write_failed = false;
+	}
+
+	if (!rc)
+		ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC,
+				   ktime_us_delta(ktime_get(), kstart));
+	RETURN(rc);
+}
+
+static int
+ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
+{
+	struct inode *inode = file_inode(file);
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	struct ldlm_enqueue_info einfo = {
+		.ei_type	= LDLM_FLOCK,
+		.ei_cb_cp	= ldlm_flock_completion_ast,
+		.ei_cbdata	= file_lock,
+	};
+	struct md_op_data *op_data;
+	struct lustre_handle lockh = { 0 };
+	union ldlm_policy_data flock = { { 0 } };
+	int fl_type = file_lock->fl_type;
+	ktime_t kstart = ktime_get();
+	__u64 flags = 0;
+	int rc;
+	int rc2 = 0;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
+	       PFID(ll_inode2fid(inode)), file_lock);
+
+	if (file_lock->fl_flags & FL_FLOCK) {
+		LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
+		/* flocks are whole-file locks */
+		flock.l_flock.end = OFFSET_MAX;
+		/* For flocks the owner is determined by the local file
+		 * descriptor */
+		flock.l_flock.owner = (unsigned long)file_lock->fl_file;
+	} else if (file_lock->fl_flags & FL_POSIX) {
+		flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
+		flock.l_flock.start = file_lock->fl_start;
+		flock.l_flock.end = file_lock->fl_end;
+	} else {
+		RETURN(-EINVAL);
+	}
+	flock.l_flock.pid = file_lock->fl_pid;
+
+#if defined(HAVE_LM_COMPARE_OWNER) || defined(lm_compare_owner)
+	/* Somewhat ugly workaround for svc lockd.
+	 * lockd installs custom fl_lmops->lm_compare_owner that checks
+	 * for the fl_owner to be the same (which it always is on local node
+	 * I guess between lockd processes) and then compares pid.
+	 * As such we assign pid to the owner field to make it all work,
+	 * conflict with normal locks is unlikely since pid space and
+	 * pointer space for current->files are not intersecting */
+	if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
+		flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
+#endif
+
+	switch (fl_type) {
+	case F_RDLCK:
+		einfo.ei_mode = LCK_PR;
+		break;
+	case F_UNLCK:
+		/* An unlock request may or may not have any relation to
+		 * existing locks so we may not be able to pass a lock handle
+		 * via a normal ldlm_lock_cancel() request. The request may even
+		 * unlock a byte range in the middle of an existing lock. In
+		 * order to process an unlock request we need all of the same
+		 * information that is given with a normal read or write record
+		 * lock request. To avoid creating another ldlm unlock (cancel)
+		 * message we'll treat a LCK_NL flock request as an unlock. */
+		einfo.ei_mode = LCK_NL;
+		break;
+	case F_WRLCK:
+		einfo.ei_mode = LCK_PW;
+		break;
+	default:
+		CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
+		RETURN(-ENOTSUPP);
+	}
+
+	switch (cmd) {
+	case F_SETLKW:
+#ifdef F_SETLKW64
+	case F_SETLKW64:
+#endif
+		flags = 0;
+		break;
+	case F_SETLK:
+#ifdef F_SETLK64
+	case F_SETLK64:
+#endif
+		flags = LDLM_FL_BLOCK_NOWAIT;
+		break;
+	case F_GETLK:
+#ifdef F_GETLK64
+	case F_GETLK64:
+#endif
+		flags = LDLM_FL_TEST_LOCK;
+		break;
+	default:
+		CERROR("unknown fcntl lock command: %d\n", cmd);
+		RETURN(-EINVAL);
+	}
+
+	/* Save the old mode so that if the mode in the lock changes we
+	 * can decrement the appropriate reader or writer refcount.
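+	 * (e.g. an F_RDLCK request is sent to the server as LCK_PR below
+	 * and fl_type is restored right after the enqueue, unless this
+	 * was a TEST lock).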
*/ + file_lock->fl_type = einfo.ei_mode; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, " + "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)), + flock.l_flock.pid, flags, einfo.ei_mode, + flock.l_flock.start, flock.l_flock.end); + + rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh, + flags); + + /* Restore the file lock type if not TEST lock. */ + if (!(flags & LDLM_FL_TEST_LOCK)) + file_lock->fl_type = fl_type; + +#ifdef HAVE_LOCKS_LOCK_FILE_WAIT + if ((rc == 0 || file_lock->fl_type == F_UNLCK) && + !(flags & LDLM_FL_TEST_LOCK)) + rc2 = locks_lock_file_wait(file, file_lock); +#else + if ((file_lock->fl_flags & FL_FLOCK) && + (rc == 0 || file_lock->fl_type == F_UNLCK)) + rc2 = flock_lock_file_wait(file, file_lock); + if ((file_lock->fl_flags & FL_POSIX) && + (rc == 0 || file_lock->fl_type == F_UNLCK) && + !(flags & LDLM_FL_TEST_LOCK)) + rc2 = posix_lock_file_wait(file, file_lock); +#endif /* HAVE_LOCKS_LOCK_FILE_WAIT */ + + if (rc2 && file_lock->fl_type != F_UNLCK) { + einfo.ei_mode = LCK_NL; + md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, + &lockh, flags); + rc = rc2; + } + + ll_finish_md_op_data(op_data); + + if (!rc) + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, + ktime_us_delta(ktime_get(), kstart)); + RETURN(rc); +} + +int ll_get_fid_by_name(struct inode *parent, const char *name, + int namelen, struct lu_fid *fid, + struct inode **inode) +{ + struct md_op_data *op_data = NULL; + struct mdt_body *body; + struct ptlrpc_request *req; + int rc; + ENTRY; + + op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE; + rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc < 0) + RETURN(rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + GOTO(out_req, rc = -EFAULT); + if (fid != NULL) + *fid = body->mbo_fid1; + + if (inode != NULL) + rc = ll_prep_inode(inode, &req->rq_pill, parent->i_sb, NULL); +out_req: + ptlrpc_req_finished(req); + RETURN(rc); +} + +int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum, + const char *name, __u32 flags) +{ + struct dentry *dchild = NULL; + struct inode *child_inode = NULL; + struct md_op_data *op_data; + struct ptlrpc_request *request = NULL; + struct obd_client_handle *och = NULL; + struct qstr qstr; + struct mdt_body *body; + __u64 data_version = 0; + size_t namelen = strlen(name); + int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic); + int rc; + ENTRY; + + CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n", + PFID(ll_inode2fid(parent)), name, + lum->lum_stripe_offset, lum->lum_stripe_count); + + if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) && + lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC)) + lustre_swab_lmv_user_md(lum); + + /* Get child FID first */ + qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen); + qstr.name = name; + qstr.len = namelen; + dchild = d_lookup(file_dentry(file), &qstr); + if (dchild) { + if (dchild->d_inode) + child_inode = igrab(dchild->d_inode); + dput(dchild); + } + + if (!child_inode) { + rc = ll_get_fid_by_name(parent, name, namelen, NULL, + &child_inode); + if (rc) + RETURN(rc); + } + + if (!child_inode) + RETURN(-ENOENT); + 
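+
+	/* From this point on the child inode is pinned (igrab() or
+	 * ll_prep_inode() above); every exit path below must go through
+	 * out_iput to drop the reference. */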
+ if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) & + OBD_CONNECT2_DIR_MIGRATE)) { + if (le32_to_cpu(lum->lum_stripe_count) > 1 || + ll_dir_striped(child_inode)) { + CERROR("%s: MDT doesn't support stripe directory " + "migration!\n", ll_i2sbi(parent)->ll_fsname); + GOTO(out_iput, rc = -EOPNOTSUPP); + } + } + + /* + * lfs migrate command needs to be blocked on the client + * by checking the migrate FID against the FID of the + * filesystem root. + */ + if (is_root_inode(child_inode)) + GOTO(out_iput, rc = -EINVAL); + + op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, + child_inode->i_mode, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + GOTO(out_iput, rc = PTR_ERR(op_data)); + + inode_lock(child_inode); + op_data->op_fid3 = *ll_inode2fid(child_inode); + if (!fid_is_sane(&op_data->op_fid3)) { + CERROR("%s: migrate %s, but FID "DFID" is insane\n", + ll_i2sbi(parent)->ll_fsname, name, + PFID(&op_data->op_fid3)); + GOTO(out_unlock, rc = -EINVAL); + } + + op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA; + op_data->op_data = lum; + op_data->op_data_size = lumlen; + + /* migrate dirent only for subdirs if MDS_MIGRATE_NSONLY set */ + if (S_ISDIR(child_inode->i_mode) && (flags & MDS_MIGRATE_NSONLY) && + lmv_dir_layout_changing(ll_i2info(parent)->lli_lsm_md)) + op_data->op_bias |= MDS_MIGRATE_NSONLY; + +again: + if (S_ISREG(child_inode->i_mode)) { + och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0); + if (IS_ERR(och)) { + rc = PTR_ERR(och); + och = NULL; + GOTO(out_unlock, rc); + } + + rc = ll_data_version(child_inode, &data_version, + LL_DV_WR_FLUSH); + if (rc != 0) + GOTO(out_close, rc); + + op_data->op_open_handle = och->och_open_handle; + op_data->op_data_version = data_version; + op_data->op_lease_handle = och->och_lease_handle; + op_data->op_bias |= MDS_CLOSE_MIGRATE; + + spin_lock(&och->och_mod->mod_open_req->rq_lock); + och->och_mod->mod_open_req->rq_replay = 0; + spin_unlock(&och->och_mod->mod_open_req->rq_lock); + } + + rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, + op_data->op_name, op_data->op_namelen, + op_data->op_name, op_data->op_namelen, &request); + if (rc == 0) { + LASSERT(request != NULL); + ll_update_times(request, parent); + } + + if (rc == 0 || rc == -EAGAIN) { + body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); + + /* If the server does release layout lock, then we cleanup + * the client och here, otherwise release it in out_close: */ + if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) { + obd_mod_put(och->och_mod); + md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp, + och); + och->och_open_handle.cookie = DEAD_HANDLE_MAGIC; + OBD_FREE_PTR(och); + och = NULL; + } + } + + if (request != NULL) { + ptlrpc_req_finished(request); + request = NULL; + } + + /* Try again if the lease has cancelled. */ + if (rc == -EAGAIN && S_ISREG(child_inode->i_mode)) + goto again; + +out_close: + if (och) + ll_lease_close(och, child_inode, NULL); + if (!rc) + clear_nlink(child_inode); +out_unlock: + inode_unlock(child_inode); + ll_finish_md_op_data(op_data); +out_iput: + iput(child_inode); + RETURN(rc); +} + +static int +ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock) +{ + struct ll_file_data *fd = file->private_data; + ENTRY; + + /* + * In order to avoid flood of warning messages, only print one message + * for one file. And the entire message rate on the client is limited + * by CDEBUG_LIMIT too. 
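+	 * (The LL_FILE_FLOCK_WARNING bit lives in the per-open-file
+	 * ll_file_data, so "one message for one file" means one message
+	 * per file handle.)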
+	 */
+	if (!(fd->fd_flags & LL_FILE_FLOCK_WARNING)) {
+		fd->fd_flags |= LL_FILE_FLOCK_WARNING;
+		CDEBUG_LIMIT(D_CONSOLE,
+			     "flock disabled, mount with '-o [local]flock' to enable\r\n");
+	}
+	RETURN(-ENOSYS);
+}
+
+/**
+ * Test if some locks matching bits and l_req_mode are acquired:
+ * - bits can be in different locks
+ * - if found, clear the common lock bits in *bits
+ * - the bits not found are kept in *bits
+ * \param inode [IN] inode to check
+ * \param bits [IN] searched lock bits
+ * \param l_req_mode [IN] searched lock mode
+ * \retval boolean, true iff all bits are found
+ */
+int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
+{
+	struct lustre_handle lockh;
+	union ldlm_policy_data policy;
+	enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
+			      (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
+	struct lu_fid *fid;
+	__u64 flags;
+	int i;
+	ENTRY;
+
+	if (!inode)
+		RETURN(0);
+
+	fid = &ll_i2info(inode)->lli_fid;
+	CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
+	       ldlm_lockname[mode]);
+
+	flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
+	for (i = 0; i < MDS_INODELOCK_NUMBITS && *bits != 0; i++) {
+		policy.l_inodebits.bits = *bits & BIT(i);
+		if (policy.l_inodebits.bits == 0)
+			continue;
+
+		if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
+				  &policy, mode, &lockh)) {
+			struct ldlm_lock *lock;
+
+			lock = ldlm_handle2lock(&lockh);
+			if (lock) {
+				*bits &=
+					~(lock->l_policy_data.l_inodebits.bits);
+				LDLM_LOCK_PUT(lock);
+			} else {
+				*bits &= ~policy.l_inodebits.bits;
+			}
+		}
+	}
+	RETURN(*bits == 0);
+}
+
+enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
+			       struct lustre_handle *lockh, __u64 flags,
+			       enum ldlm_mode mode)
+{
+	union ldlm_policy_data policy = { .l_inodebits = { bits } };
+	struct lu_fid *fid;
+	enum ldlm_mode rc;
+	ENTRY;
+
+	fid = &ll_i2info(inode)->lli_fid;
+	CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
+
+	rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
+			   fid, LDLM_IBITS, &policy, mode, lockh);
+
+	RETURN(rc);
+}
+
+static int ll_inode_revalidate_fini(struct inode *inode, int rc)
+{
+	/* Already unlinked. Just update nlink and return success */
+	if (rc == -ENOENT) {
+		clear_nlink(inode);
+		/* If it is a striped directory with a bad stripe,
+		 * revalidate the dentry again instead of returning
+		 * an error */
+		if (ll_dir_striped(inode))
+			return 0;
+
+		/* This path cannot be hit for regular files unless in
+		 * case of obscure races, so no need to validate
+		 * size. */
+		if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
+			return 0;
+	} else if (rc != 0) {
+		CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ?
D_INFO : D_ERROR, + "%s: revalidate FID "DFID" error: rc = %d\n", + ll_i2sbi(inode)->ll_fsname, + PFID(ll_inode2fid(inode)), rc); + } + + return rc; +} + +static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op) +{ + struct inode *parent; + struct inode *inode = dentry->d_inode; + struct obd_export *exp = ll_i2mdexp(inode); + struct lookup_intent oit = { + .it_op = op, + }; + struct ptlrpc_request *req = NULL; + struct md_op_data *op_data; + const char *name = NULL; + size_t namelen = 0; + int rc = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n", + PFID(ll_inode2fid(inode)), inode, dentry->d_name.name); + + if (exp_connect_flags2(exp) & OBD_CONNECT2_GETATTR_PFID) { + parent = dentry->d_parent->d_inode; + name = dentry->d_name.name; + namelen = dentry->d_name.len; + } else { + parent = inode; + } + + op_data = ll_prep_md_op_data(NULL, parent, inode, name, namelen, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + /* Call getattr by fid */ + if (exp_connect_flags2(exp) & OBD_CONNECT2_GETATTR_PFID) + op_data->op_flags = MF_GETATTR_BY_FID; + rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0); + ll_finish_md_op_data(op_data); + if (rc < 0) { + rc = ll_inode_revalidate_fini(inode, rc); + GOTO(out, rc); + } + + rc = ll_revalidate_it_finish(req, &oit, dentry); + if (rc != 0) { + ll_intent_release(&oit); + GOTO(out, rc); + } + + /* Unlinked? Unhash dentry, so it is not picked up later by + * do_lookup() -> ll_revalidate_it(). We cannot use d_drop + * here to preserve get_cwd functionality on 2.6. + * Bug 10503 */ + if (!dentry->d_inode->i_nlink) + d_lustre_invalidate(dentry); + + ll_lookup_finish_locks(&oit, dentry); +out: + ptlrpc_req_finished(req); + + return rc; +} + +static int ll_merge_md_attr(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_attr attr = { 0 }; + int rc; + + LASSERT(lli->lli_lsm_md != NULL); + + if (!lmv_dir_striped(lli->lli_lsm_md)) + RETURN(0); + + down_read(&lli->lli_lsm_sem); + rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md, + &attr, ll_md_blocking_ast); + up_read(&lli->lli_lsm_sem); + if (rc != 0) + RETURN(rc); + + spin_lock(&inode->i_lock); + set_nlink(inode, attr.cat_nlink); + spin_unlock(&inode->i_lock); + + inode->i_blocks = attr.cat_blocks; + i_size_write(inode, attr.cat_size); + + ll_i2info(inode)->lli_atime = attr.cat_atime; + ll_i2info(inode)->lli_mtime = attr.cat_mtime; + ll_i2info(inode)->lli_ctime = attr.cat_ctime; + + RETURN(0); +} + +int ll_getattr_dentry(struct dentry *de, struct kstat *stat, u32 request_mask, + unsigned int flags, bool foreign) +{ + struct inode *inode = de->d_inode; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_inode_info *lli = ll_i2info(inode); + struct inode *dir = de->d_parent->d_inode; + bool need_glimpse = true; + ktime_t kstart = ktime_get(); + int rc; + + /* The OST object(s) determine the file size, blocks and mtime. */ + if (!(request_mask & STATX_SIZE || request_mask & STATX_BLOCKS || + request_mask & STATX_MTIME)) + need_glimpse = false; + + if (dentry_may_statahead(dir, de)) + ll_start_statahead(dir, de, need_glimpse && + !(flags & AT_STATX_DONT_SYNC)); + + if (flags & AT_STATX_DONT_SYNC) + GOTO(fill_attr, rc = 0); + + rc = ll_inode_revalidate(de, IT_GETATTR); + if (rc < 0) + RETURN(rc); + + /* foreign file/dir are always of zero length, so don't + * need to validate size. 
+	 */
+	if (S_ISREG(inode->i_mode) && !foreign) {
+		bool cached;
+
+		if (!need_glimpse)
+			GOTO(fill_attr, rc);
+
+		rc = pcc_inode_getattr(inode, request_mask, flags, &cached);
+		if (cached && rc < 0)
+			RETURN(rc);
+
+		if (cached)
+			GOTO(fill_attr, rc);
+
+		/*
+		 * If the returned attr is masked with OBD_MD_FLSIZE &
+		 * OBD_MD_FLBLOCKS & OBD_MD_FLMTIME, it means that the file size
+		 * or blocks obtained from MDT is strictly correct, and the file
+		 * is usually not being modified by clients, and the [a|m|c]time
+		 * got from MDT is also strictly correct.
+		 * Under this circumstance, it does not need to send glimpse
+		 * RPCs to OSTs for file attributes such as the size and blocks.
+		 */
+		if (lli->lli_attr_valid & OBD_MD_FLSIZE &&
+		    lli->lli_attr_valid & OBD_MD_FLBLOCKS &&
+		    lli->lli_attr_valid & OBD_MD_FLMTIME) {
+			inode->i_mtime.tv_sec = lli->lli_mtime;
+			if (lli->lli_attr_valid & OBD_MD_FLATIME)
+				inode->i_atime.tv_sec = lli->lli_atime;
+			if (lli->lli_attr_valid & OBD_MD_FLCTIME)
+				inode->i_ctime.tv_sec = lli->lli_ctime;
+			GOTO(fill_attr, rc);
+		}
+
+		/* In case of restore, the MDT has the right size and has
+		 * already sent it back without granting the layout lock,
+		 * inode is up-to-date so glimpse is useless.
+		 * Also to glimpse we need the layout; in case of a running
+		 * restore the MDT holds the layout lock, so the glimpse will
+		 * block up to the end of restore (getattr will block)
+		 */
+		if (!test_bit(LLIF_FILE_RESTORING, &lli->lli_flags)) {
+			rc = ll_glimpse_size(inode);
+			if (rc < 0)
+				RETURN(rc);
+		}
+	} else {
+		/* If the object isn't a regular file then don't validate
+		 * size. */
+		/* foreign dir is not striped dir */
+		if (ll_dir_striped(inode) && !foreign) {
+			rc = ll_merge_md_attr(inode);
+			if (rc < 0)
+				RETURN(rc);
+		}
+
+		if (lli->lli_attr_valid & OBD_MD_FLATIME)
+			inode->i_atime.tv_sec = lli->lli_atime;
+		if (lli->lli_attr_valid & OBD_MD_FLMTIME)
+			inode->i_mtime.tv_sec = lli->lli_mtime;
+		if (lli->lli_attr_valid & OBD_MD_FLCTIME)
+			inode->i_ctime.tv_sec = lli->lli_ctime;
+	}
+
+fill_attr:
+	OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
+
+	if (ll_need_32bit_api(sbi)) {
+		stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
+		stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
+		stat->rdev = ll_compat_encode_dev(inode->i_rdev);
+	} else {
+		stat->ino = inode->i_ino;
+		stat->dev = inode->i_sb->s_dev;
+		stat->rdev = inode->i_rdev;
+	}
+
+	/* foreign symlink to be exposed as a real symlink */
+	if (!foreign)
+		stat->mode = inode->i_mode;
+	else
+		stat->mode = (inode->i_mode & ~S_IFMT) | S_IFLNK;
+
+	stat->uid = inode->i_uid;
+	stat->gid = inode->i_gid;
+	stat->atime = inode->i_atime;
+	stat->mtime = inode->i_mtime;
+	stat->ctime = inode->i_ctime;
+	/* stat->blksize is used to tell about preferred IO size */
+	if (sbi->ll_stat_blksize)
+		stat->blksize = sbi->ll_stat_blksize;
+	else if (S_ISREG(inode->i_mode))
+		stat->blksize = 1 << min(PTLRPC_MAX_BRW_BITS + 1,
+					 LL_MAX_BLKSIZE_BITS);
+	else
+		stat->blksize = 1 << inode->i_sb->s_blocksize_bits;
+
+	stat->nlink = inode->i_nlink;
+	stat->size = i_size_read(inode);
+	stat->blocks = inode->i_blocks;
+
+#if defined(HAVE_USER_NAMESPACE_ARG) || defined(HAVE_INODEOPS_ENHANCED_GETATTR)
+	if (flags & AT_STATX_DONT_SYNC) {
+		if (stat->size == 0 &&
+		    lli->lli_attr_valid & OBD_MD_FLLAZYSIZE)
+			stat->size = lli->lli_lazysize;
+		if (stat->blocks == 0 &&
+		    lli->lli_attr_valid & OBD_MD_FLLAZYBLOCKS)
+			stat->blocks = lli->lli_lazyblocks;
+	}
+
+	if (lli->lli_attr_valid & OBD_MD_FLBTIME) {
+		stat->result_mask |= STATX_BTIME;
+		stat->btime.tv_sec =
lli->lli_btime; + } + + stat->attributes_mask = STATX_ATTR_IMMUTABLE | STATX_ATTR_APPEND; +#ifdef HAVE_LUSTRE_CRYPTO + stat->attributes_mask |= STATX_ATTR_ENCRYPTED; +#endif + stat->attributes |= ll_inode_to_ext_flags(inode->i_flags); + /* if Lustre specific LUSTRE_ENCRYPT_FL flag is set, also set + * ext4 equivalent to please statx + */ + if (stat->attributes & LUSTRE_ENCRYPT_FL) + stat->attributes |= STATX_ATTR_ENCRYPTED; + stat->result_mask &= request_mask; +#endif + + ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, + ktime_us_delta(ktime_get(), kstart)); + + return 0; +} + +#if defined(HAVE_USER_NAMESPACE_ARG) || defined(HAVE_INODEOPS_ENHANCED_GETATTR) +int ll_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags) +{ + return ll_getattr_dentry(path->dentry, stat, request_mask, flags, + false); +} +#else +int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat) +{ + return ll_getattr_dentry(de, stat, STATX_BASIC_STATS, + AT_STATX_SYNC_AS_STAT, false); +} +#endif + +int cl_falloc(struct file *file, struct inode *inode, int mode, loff_t offset, + loff_t len) +{ + struct lu_env *env; + struct cl_io *io; + __u16 refcheck; + int rc; + loff_t size = i_size_read(inode); + + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = vvp_env_thread_io(env); + io->ci_obj = ll_i2info(inode)->lli_clob; + ll_io_set_mirror(io, file); + + io->ci_verify_layout = 1; + io->u.ci_setattr.sa_parent_fid = lu_object_fid(&io->ci_obj->co_lu); + io->u.ci_setattr.sa_falloc_mode = mode; + io->u.ci_setattr.sa_falloc_offset = offset; + io->u.ci_setattr.sa_falloc_end = offset + len; + io->u.ci_setattr.sa_subtype = CL_SETATTR_FALLOCATE; + + CDEBUG(D_INODE, "UID %u GID %u\n", + from_kuid(&init_user_ns, inode->i_uid), + from_kgid(&init_user_ns, inode->i_gid)); + + io->u.ci_setattr.sa_falloc_uid = from_kuid(&init_user_ns, inode->i_uid); + io->u.ci_setattr.sa_falloc_gid = from_kgid(&init_user_ns, inode->i_gid); + + if (io->u.ci_setattr.sa_falloc_end > size) { + loff_t newsize = io->u.ci_setattr.sa_falloc_end; + + /* Check new size against VFS/VM file size limit and rlimit */ + rc = inode_newsize_ok(inode, newsize); + if (rc) + goto out; + if (newsize > ll_file_maxbytes(inode)) { + CDEBUG(D_INODE, "file size too large %llu > %llu\n", + (unsigned long long)newsize, + ll_file_maxbytes(inode)); + rc = -EFBIG; + goto out; + } + } + + do { + rc = cl_io_init(env, io, CIT_SETATTR, io->ci_obj); + if (!rc) + rc = cl_io_loop(env, io); + else + rc = io->ci_result; + cl_io_fini(env, io); + } while (unlikely(io->ci_need_restart)); + +out: + cl_env_put(env, &refcheck); + RETURN(rc); +} + +long ll_fallocate(struct file *filp, int mode, loff_t offset, loff_t len) +{ + struct inode *inode = file_inode(filp); + int rc; + + if (offset < 0 || len <= 0) + RETURN(-EINVAL); + /* + * Encrypted inodes can't handle collapse range or zero range or insert + * range since we would need to re-encrypt blocks with a different IV or + * XTS tweak (which are based on the logical block number). + * Similar to what ext4 does. + */ + if (IS_ENCRYPTED(inode) && + (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE | + FALLOC_FL_ZERO_RANGE))) + RETURN(-EOPNOTSUPP); + + /* + * mode == 0 (which is standard prealloc) and PUNCH is supported + * Rest of mode options are not supported yet. 
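+	 *
+	 * A hedged userspace sketch of the two supported uses (plain
+	 * fallocate(2), no Lustre-specific API; note the VFS requires
+	 * FALLOC_FL_PUNCH_HOLE to be combined with FALLOC_FL_KEEP_SIZE):
+	 *
+	 *	fallocate(fd, 0, 0, len);
+	 *	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+	 *		  offset, len);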
+	 */
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+		RETURN(-EOPNOTSUPP);
+
+	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FALLOCATE, 1);
+
+	rc = cl_falloc(filp, inode, mode, offset, len);
+	/*
+	 * ENOTSUPP (524) is an NFSv3 specific error code erroneously
+	 * used by Lustre in several places. Returning it here would
+	 * confuse applications that explicitly test for EOPNOTSUPP
+	 * (95) and fall back to ftruncate().
+	 */
+	if (rc == -ENOTSUPP)
+		rc = -EOPNOTSUPP;
+
+	RETURN(rc);
+}
+
+static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		     __u64 start, __u64 len)
+{
+	int rc;
+	size_t num_bytes;
+	struct fiemap *fiemap;
+	unsigned int extent_count = fieinfo->fi_extents_max;
+
+	num_bytes = sizeof(*fiemap) + (extent_count *
+				       sizeof(struct fiemap_extent));
+	OBD_ALLOC_LARGE(fiemap, num_bytes);
+
+	if (fiemap == NULL)
+		RETURN(-ENOMEM);
+
+	fiemap->fm_flags = fieinfo->fi_flags;
+	fiemap->fm_extent_count = fieinfo->fi_extents_max;
+	fiemap->fm_start = start;
+	fiemap->fm_length = len;
+	if (extent_count > 0 &&
+	    copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
+			   sizeof(struct fiemap_extent)) != 0)
+		GOTO(out, rc = -EFAULT);
+
+	rc = ll_do_fiemap(inode, fiemap, num_bytes);
+
+	if (IS_ENCRYPTED(inode)) {
+		int i;
+
+		for (i = 0; i < fiemap->fm_mapped_extents; i++)
+			fiemap->fm_extents[i].fe_flags |=
+				FIEMAP_EXTENT_DATA_ENCRYPTED |
+				FIEMAP_EXTENT_ENCODED;
+	}
+
+	fieinfo->fi_flags = fiemap->fm_flags;
+	fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
+	if (extent_count > 0 &&
+	    copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
+			 fiemap->fm_mapped_extents *
+			 sizeof(struct fiemap_extent)) != 0)
+		GOTO(out, rc = -EFAULT);
+out:
+	OBD_FREE_LARGE(fiemap, num_bytes);
+	return rc;
+}
+
+int ll_inode_permission(struct user_namespace *mnt_userns, struct inode *inode,
+			int mask)
+{
+	int rc = 0;
+	struct ll_sb_info *sbi;
+	struct root_squash_info *squash;
+	struct cred *cred = NULL;
+	const struct cred *old_cred = NULL;
+	bool squash_id = false;
+	ktime_t kstart = ktime_get();
+
+	ENTRY;
+
+	if (mask & MAY_NOT_BLOCK)
+		return -ECHILD;
+
+	/*
+	 * As the root inode is NOT validated during lookup, it needs to
+	 * be revalidated here, before the permission check.
+ */ + + if (is_root_inode(inode)) { + rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP); + if (rc) + RETURN(rc); + } + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n", + PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask); + + /* squash fsuid/fsgid if needed */ + sbi = ll_i2sbi(inode); + squash = &sbi->ll_squash; + if (unlikely(squash->rsi_uid != 0 && + uid_eq(current_fsuid(), GLOBAL_ROOT_UID) && + !test_bit(LL_SBI_NOROOTSQUASH, sbi->ll_flags))) { + squash_id = true; + } + if (squash_id) { + CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n", + __kuid_val(current_fsuid()), __kgid_val(current_fsgid()), + squash->rsi_uid, squash->rsi_gid); + + /* update current process's credentials + * and FS capability */ + cred = prepare_creds(); + if (cred == NULL) + RETURN(-ENOMEM); + + cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid); + cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid); + cred->cap_effective = cap_drop_nfsd_set(cred->cap_effective); + cred->cap_effective = cap_drop_fs_set(cred->cap_effective); + + old_cred = override_creds(cred); + } + + rc = generic_permission(mnt_userns, inode, mask); + /* restore current process's credentials and FS capability */ + if (squash_id) { + revert_creds(old_cred); + put_cred(cred); + } + + if (!rc) + ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, + ktime_us_delta(ktime_get(), kstart)); + + RETURN(rc); +} + +/* -o localflock - only provides locally consistent flock locks */ +static const struct file_operations ll_file_operations = { +#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER +# ifdef HAVE_SYNC_READ_WRITE + .read = new_sync_read, + .write = new_sync_write, +# endif + .read_iter = ll_file_read_iter, + .write_iter = ll_file_write_iter, +#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */ + .read = ll_file_read, + .aio_read = ll_file_aio_read, + .write = ll_file_write, + .aio_write = ll_file_aio_write, +#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */ + .unlocked_ioctl = ll_file_ioctl, + .open = ll_file_open, + .release = ll_file_release, + .mmap = ll_file_mmap, + .llseek = ll_file_seek, +#ifndef HAVE_DEFAULT_FILE_SPLICE_READ_EXPORT + .splice_read = generic_file_splice_read, +#else + .splice_read = pcc_file_splice_read, +#endif +#ifdef HAVE_ITER_FILE_SPLICE_WRITE + .splice_write = iter_file_splice_write, +#endif + .fsync = ll_fsync, + .flush = ll_flush, + .fallocate = ll_fallocate, +}; + +static const struct file_operations ll_file_operations_flock = { +#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER +# ifdef HAVE_SYNC_READ_WRITE + .read = new_sync_read, + .write = new_sync_write, +# endif /* HAVE_SYNC_READ_WRITE */ + .read_iter = ll_file_read_iter, + .write_iter = ll_file_write_iter, +#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */ + .read = ll_file_read, + .aio_read = ll_file_aio_read, + .write = ll_file_write, + .aio_write = ll_file_aio_write, +#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */ + .unlocked_ioctl = ll_file_ioctl, + .open = ll_file_open, + .release = ll_file_release, + .mmap = ll_file_mmap, + .llseek = ll_file_seek, +#ifndef HAVE_DEFAULT_FILE_SPLICE_READ_EXPORT + .splice_read = generic_file_splice_read, +#else + .splice_read = pcc_file_splice_read, +#endif +#ifdef HAVE_ITER_FILE_SPLICE_WRITE + .splice_write = iter_file_splice_write, +#endif + .fsync = ll_fsync, + .flush = ll_flush, + .flock = ll_file_flock, + .lock = ll_file_flock, + .fallocate = ll_fallocate, +}; + +/* These are for -o noflock - to return ENOSYS on flock calls */ +static const struct file_operations ll_file_operations_noflock = { 
+#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
+# ifdef HAVE_SYNC_READ_WRITE
+	.read		= new_sync_read,
+	.write		= new_sync_write,
+# endif /* HAVE_SYNC_READ_WRITE */
+	.read_iter	= ll_file_read_iter,
+	.write_iter	= ll_file_write_iter,
+#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
+	.read		= ll_file_read,
+	.aio_read	= ll_file_aio_read,
+	.write		= ll_file_write,
+	.aio_write	= ll_file_aio_write,
+#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
+	.unlocked_ioctl	= ll_file_ioctl,
+	.open		= ll_file_open,
+	.release	= ll_file_release,
+	.mmap		= ll_file_mmap,
+	.llseek		= ll_file_seek,
+#ifndef HAVE_DEFAULT_FILE_SPLICE_READ_EXPORT
+	.splice_read	= generic_file_splice_read,
+#else
+	.splice_read	= pcc_file_splice_read,
+#endif
+#ifdef HAVE_ITER_FILE_SPLICE_WRITE
+	.splice_write	= iter_file_splice_write,
+#endif
+	.fsync		= ll_fsync,
+	.flush		= ll_flush,
+	.flock		= ll_file_noflock,
+	.lock		= ll_file_noflock,
+	.fallocate	= ll_fallocate,
+};
+
+const struct inode_operations ll_file_inode_operations = {
+	.setattr	= ll_setattr,
+	.getattr	= ll_getattr,
+	.permission	= ll_inode_permission,
+#ifdef HAVE_IOP_XATTR
+	.setxattr	= ll_setxattr,
+	.getxattr	= ll_getxattr,
+	.removexattr	= ll_removexattr,
+#endif
+	.listxattr	= ll_listxattr,
+	.fiemap		= ll_fiemap,
+	.get_acl	= ll_get_acl,
+#ifdef HAVE_IOP_SET_ACL
+	.set_acl	= ll_set_acl,
+#endif
+};
+
+const struct file_operations *ll_select_file_operations(struct ll_sb_info *sbi)
+{
+	const struct file_operations *fops = &ll_file_operations_noflock;
+
+	if (test_bit(LL_SBI_FLOCK, sbi->ll_flags))
+		fops = &ll_file_operations_flock;
+	else if (test_bit(LL_SBI_LOCALFLOCK, sbi->ll_flags))
+		fops = &ll_file_operations;
+
+	return fops;
+}
+
+int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct cl_object *obj = lli->lli_clob;
+	struct lu_env *env;
+	int rc;
+	__u16 refcheck;
+	ENTRY;
+
+	if (obj == NULL)
+		RETURN(0);
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		RETURN(PTR_ERR(env));
+
+	rc = cl_conf_set(env, lli->lli_clob, conf);
+	if (rc < 0)
+		GOTO(out, rc);
+
+	if (conf->coc_opc == OBJECT_CONF_SET) {
+		struct ldlm_lock *lock = conf->coc_lock;
+		struct cl_layout cl = {
+			.cl_layout_gen = 0,
+		};
+
+		LASSERT(lock != NULL);
+		LASSERT(ldlm_has_layout(lock));
+
+		/* the lock can only be allowed to match after the layout is
+		 * applied to the inode, otherwise a false layout would be
+		 * seen. Applying the layout should happen before dropping
+		 * the intent lock. */
+		ldlm_lock_allow_match(lock);
+
+		rc = cl_object_layout_get(env, obj, &cl);
+		if (rc < 0)
+			GOTO(out, rc);
+
+		CDEBUG(D_VFSTRACE,
+		       DFID": layout version change: %u -> %u\n",
+		       PFID(&lli->lli_fid), ll_layout_version_get(lli),
+		       cl.cl_layout_gen);
+		ll_layout_version_set(lli, cl.cl_layout_gen);
+	}
+
+out:
+	cl_env_put(env, &refcheck);
+
+	RETURN(rc < 0 ?
rc : 0); +} + +/* Fetch layout from MDT with getxattr request, if it's not ready yet */ +static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock) + +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req; + void *lvbdata; + void *lmm; + int lmmsize; + int rc; + ENTRY; + + CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n", + PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock), + lock->l_lvb_data, lock->l_lvb_len); + + if (lock->l_lvb_data != NULL) + RETURN(0); + + /* if layout lock was granted right away, the layout is returned + * within DLM_LVB of dlm reply; otherwise if the lock was ever + * blocked and then granted via completion ast, we have to fetch + * layout here. Please note that we can't use the LVB buffer in + * completion AST because it doesn't have a large enough buffer */ + rc = ll_get_default_mdsize(sbi, &lmmsize); + if (rc < 0) + RETURN(rc); + + rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR, + XATTR_NAME_LOV, lmmsize, &req); + if (rc < 0) { + if (rc == -ENODATA) + GOTO(out, rc = 0); /* empty layout */ + else + RETURN(rc); + } + + lmmsize = rc; + rc = 0; + if (lmmsize == 0) /* empty layout */ + GOTO(out, rc = 0); + + lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize); + if (lmm == NULL) + GOTO(out, rc = -EFAULT); + + OBD_ALLOC_LARGE(lvbdata, lmmsize); + if (lvbdata == NULL) + GOTO(out, rc = -ENOMEM); + + memcpy(lvbdata, lmm, lmmsize); + lock_res_and_lock(lock); + if (unlikely(lock->l_lvb_data == NULL)) { + lock->l_lvb_type = LVB_T_LAYOUT; + lock->l_lvb_data = lvbdata; + lock->l_lvb_len = lmmsize; + lvbdata = NULL; + } + unlock_res_and_lock(lock); + + if (lvbdata) + OBD_FREE_LARGE(lvbdata, lmmsize); + + EXIT; + +out: + ptlrpc_req_finished(req); + return rc; +} + +/** + * Apply the layout to the inode. Layout lock is held and will be released + * in this function. + */ +static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode, + struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ldlm_lock *lock; + struct cl_object_conf conf; + int rc = 0; + bool lvb_ready; + bool wait_layout = false; + ENTRY; + + LASSERT(lustre_handle_is_used(lockh)); + + lock = ldlm_handle2lock(lockh); + LASSERT(lock != NULL); + + if (!ldlm_has_layout(lock)) + GOTO(out, rc = -EAGAIN); + + LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured", + PFID(&lli->lli_fid), inode); + + /* in case this is a caching lock and reinstate with new inode */ + md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL); + + lock_res_and_lock(lock); + lvb_ready = ldlm_is_lvb_ready(lock); + unlock_res_and_lock(lock); + + /* checking lvb_ready is racy but this is okay. The worst case is + * that multi processes may configure the file on the same time. */ + if (lvb_ready) + GOTO(out, rc = 0); + + rc = ll_layout_fetch(inode, lock); + if (rc < 0) + GOTO(out, rc); + + /* for layout lock, lmm is stored in lock's lvb. + * lvb_data is immutable if the lock is held so it's safe to access it + * without res lock. + * + * set layout to file. 
Unlikely this will fail as old layout was + * surely eliminated */ + memset(&conf, 0, sizeof conf); + conf.coc_opc = OBJECT_CONF_SET; + conf.coc_inode = inode; + conf.coc_lock = lock; + conf.u.coc_layout.lb_buf = lock->l_lvb_data; + conf.u.coc_layout.lb_len = lock->l_lvb_len; + rc = ll_layout_conf(inode, &conf); + + /* refresh layout failed, need to wait */ + wait_layout = rc == -EBUSY; + EXIT; +out: + LDLM_LOCK_PUT(lock); + ldlm_lock_decref(lockh, mode); + + /* wait for IO to complete if it's still being used. */ + if (wait_layout) { + CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n", + sbi->ll_fsname, PFID(&lli->lli_fid), inode); + + memset(&conf, 0, sizeof conf); + conf.coc_opc = OBJECT_CONF_WAIT; + conf.coc_inode = inode; + rc = ll_layout_conf(inode, &conf); + if (rc == 0) + rc = -EAGAIN; + + CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n", + sbi->ll_fsname, PFID(&lli->lli_fid), rc); + } + RETURN(rc); +} + +/** + * Issue layout intent RPC to MDS. + * \param inode [in] file inode + * \param intent [in] layout intent + * + * \retval 0 on success + * \retval < 0 error code + */ +static int ll_layout_intent(struct inode *inode, struct layout_intent *intent) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct md_op_data *op_data; + struct lookup_intent it; + struct ptlrpc_request *req; + int rc; + ENTRY; + + op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, + 0, 0, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_data = intent; + op_data->op_data_size = sizeof(*intent); + + memset(&it, 0, sizeof(it)); + it.it_op = IT_LAYOUT; + if (intent->li_opc == LAYOUT_INTENT_WRITE || + intent->li_opc == LAYOUT_INTENT_TRUNC) + it.it_flags = FMODE_WRITE; + + LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)", + sbi->ll_fsname, PFID(&lli->lli_fid), inode); + + rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req, + &ll_md_blocking_ast, 0); + if (it.it_request != NULL) + ptlrpc_req_finished(it.it_request); + it.it_request = NULL; + + ll_finish_md_op_data(op_data); + + /* set lock data in case this is a new lock */ + if (!rc) + ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL); + + ll_intent_drop_lock(&it); + + RETURN(rc); +} + +/** + * This function checks if there exists a LAYOUT lock on the client side, + * or enqueues it if it doesn't have one in cache. + * + * This function will not hold layout lock so it may be revoked any time after + * this function returns. Any operations depend on layout should be redone + * in that case. + * + * This function should be called before lov_io_init() to get an uptodate + * layout version, the caller should save the version number and after IO + * is finished, this function should be called again to verify that layout + * is not changed during IO time. + */ +int ll_layout_refresh(struct inode *inode, __u32 *gen) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct lustre_handle lockh; + struct layout_intent intent = { + .li_opc = LAYOUT_INTENT_ACCESS, + }; + enum ldlm_mode mode; + int rc; + ENTRY; + + *gen = ll_layout_version_get(lli); + if (!test_bit(LL_SBI_LAYOUT_LOCK, sbi->ll_flags) || + *gen != CL_LAYOUT_GEN_NONE) + RETURN(0); + + /* sanity checks */ + LASSERT(fid_is_sane(ll_inode2fid(inode))); + + /* take layout lock mutex to enqueue layout lock exclusively. 
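+	 * The loop below first tries to match a layout lock already cached
+	 * locally and only falls back to a layout intent RPC to the MDT
+	 * when no local lock matches.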
 */
+	mutex_lock(&lli->lli_layout_mutex);
+
+	while (1) {
+		/* mostly layout lock is caching on the local side, so try to
+		 * match it before grabbing layout lock mutex. */
+		mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
+				       LCK_CR | LCK_CW | LCK_PR |
+				       LCK_PW | LCK_EX);
+		if (mode != 0) { /* hit cached lock */
+			rc = ll_layout_lock_set(&lockh, mode, inode);
+			if (rc == -EAGAIN)
+				continue;
+			break;
+		}
+
+		rc = ll_layout_intent(inode, &intent);
+		if (rc != 0)
+			break;
+	}
+
+	if (rc == 0)
+		*gen = ll_layout_version_get(lli);
+	mutex_unlock(&lli->lli_layout_mutex);
+
+	RETURN(rc);
+}
+
+/**
+ * Issue layout intent RPC indicating where in a file an IO is about to write.
+ *
+ * \param[in] inode	file inode.
+ * \param[in] ext	write range, with the start offset of the file in
+ *			bytes where an IO is about to write, and the
+ *			exclusive end offset in bytes.
+ *
+ * \retval 0	on success
+ * \retval < 0	error code
+ */
+int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
+			   struct lu_extent *ext)
+{
+	struct layout_intent intent = {
+		.li_opc = opc,
+		.li_extent.e_start = ext->e_start,
+		.li_extent.e_end = ext->e_end,
+	};
+	int rc;
+	ENTRY;
+
+	rc = ll_layout_intent(inode, &intent);
+
+	RETURN(rc);
+}
+
+/**
+ * This function sends a restore request to the MDT.
+ */
+int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
+{
+	struct hsm_user_request *hur;
+	int len, rc;
+	ENTRY;
+
+	len = sizeof(struct hsm_user_request) +
+	      sizeof(struct hsm_user_item);
+	OBD_ALLOC(hur, len);
+	if (hur == NULL)
+		RETURN(-ENOMEM);
+
+	hur->hur_request.hr_action = HUA_RESTORE;
+	hur->hur_request.hr_archive_id = 0;
+	hur->hur_request.hr_flags = 0;
+	memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
+	       sizeof(hur->hur_user_item[0].hui_fid));
+	hur->hur_user_item[0].hui_extent.offset = offset;
+	hur->hur_user_item[0].hui_extent.length = length;
+	hur->hur_request.hr_itemcount = 1;
+	rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,
+			   len, hur, NULL);
+	OBD_FREE(hur, len);
+	RETURN(rc);
+}
diff --git a/drivers/staging/lustrefsx/lustre/llite/foreign_symlink.h b/drivers/staging/lustrefsx/lustre/llite/foreign_symlink.h
new file mode 100644
index 0000000000000..b1c10e4c6156e
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/llite/foreign_symlink.h
@@ -0,0 +1,48 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ + +#ifndef LLITE_FOREIGN_SYMLINK_H +#define LLITE_FOREIGN_SYMLINK_H + +/* llite/llite_foreign_symlink.c */ +ssize_t foreign_symlink_enable_show(struct kobject *kobj, + struct attribute *attr, char *buf); +ssize_t foreign_symlink_enable_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count); +ssize_t foreign_symlink_prefix_show(struct kobject *kobj, + struct attribute *attr, char *buf); +ssize_t foreign_symlink_prefix_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count); +ssize_t foreign_symlink_upcall_show(struct kobject *kobj, + struct attribute *attr, char *buf); +ssize_t foreign_symlink_upcall_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count); +ssize_t foreign_symlink_upcall_info_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count); +extern struct inode_operations ll_foreign_file_symlink_inode_operations; +extern struct inode_operations ll_foreign_dir_symlink_inode_operations; + +#endif /* LLITE_FOREIGN_SYMLINK_H */ diff --git a/drivers/staging/lustrefsx/lustre/llite/glimpse.c b/drivers/staging/lustrefsx/lustre/llite/glimpse.c new file mode 100644 index 0000000000000..bd5e6b691cff0 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/glimpse.c @@ -0,0 +1,228 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * glimpse code used by vvp (and other Lustre clients in the future). + * + * Author: Nikita Danilov + * Author: Oleg Drokin + */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "cl_object.h" +#include "llite_internal.h" +#include "vvp_internal.h" + +static const struct cl_lock_descr whole_file = { + .cld_start = 0, + .cld_end = CL_PAGE_EOF, + .cld_mode = CLM_READ +}; + +/* + * Check whether file has possible unwritten pages. + * + * \retval 1 file is mmap-ed or has dirty pages + * 0 otherwise + */ +blkcnt_t dirty_cnt(struct inode *inode) +{ + blkcnt_t cnt = 0; + struct vvp_object *vob = cl_inode2vvp(inode); + void *results[1]; + + if (inode->i_mapping != NULL) + cnt += radix_tree_gang_lookup_tag(&inode->i_mapping->page_tree, + results, 0, 1, + PAGECACHE_TAG_DIRTY); + if (cnt == 0 && atomic_read(&vob->vob_mmap_cnt) > 0) + cnt = 1; + + return (cnt > 0) ? 
1 : 0; +} + +int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io, + struct inode *inode, struct cl_object *clob, int agl) +{ + const struct lu_fid *fid = lu_object_fid(&clob->co_lu); + struct cl_lock *lock = vvp_env_lock(env); + struct cl_lock_descr *descr = &lock->cll_descr; + int result; + + ENTRY; + result = 0; + + CDEBUG(D_DLMTRACE, "Glimpsing inode "DFID"\n", PFID(fid)); + + /* NOTE: this looks like DLM lock request, but it may + * not be one. Due to CEF_GLIMPSE flag (translated + * to LDLM_FL_HAS_INTENT by osc), this is + * glimpse request, that won't revoke any + * conflicting DLM locks held. Instead, + * ll_glimpse_callback() will be called on each + * client holding a DLM lock against this file, + * and resulting size will be returned for each + * stripe. DLM lock on [0, EOF] is acquired only + * if there were no conflicting locks. If there + * were conflicting locks, enqueuing or waiting + * fails with -ENAVAIL, but valid inode + * attributes are returned anyway. */ + *descr = whole_file; + descr->cld_obj = clob; + descr->cld_mode = CLM_READ; + descr->cld_enq_flags = CEF_GLIMPSE | CEF_MUST; + if (agl) + descr->cld_enq_flags |= CEF_SPECULATIVE | CEF_NONBLOCK; + /* + * CEF_MUST protects glimpse lock from conversion into + * a lockless mode. + */ + result = cl_lock_request(env, io, lock); + if (result < 0) + RETURN(result); + + if (!agl) { + ll_merge_attr(env, inode); + if (i_size_read(inode) > 0 && inode->i_blocks == 0) { + /* + * LU-417: Add dirty pages block count + * lest i_blocks reports 0, some "cp" or + * "tar" may think it's a completely + * sparse file and skip it. + */ + inode->i_blocks = dirty_cnt(inode); + } + } + + cl_lock_release(env, lock); + + RETURN(result); +} + +/** + * Get an IO environment for special operations such as glimpse locks and + * manually requested locks (ladvise lockahead) + * + * \param[in] inode inode the operation is being performed on + * \param[out] envout thread specific execution environment + * \param[out] ioout client io description + * \param[out] refcheck reference check + * + * \retval 1 on success + * \retval 0 not a regular file, cannot get environment + * \retval negative negative errno on error + */ +int cl_io_get(struct inode *inode, struct lu_env **envout, + struct cl_io **ioout, u16 *refcheck) +{ + struct lu_env *env; + struct cl_io *io; + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_object *clob = lli->lli_clob; + int result; + + if (S_ISREG(inode->i_mode)) { + env = cl_env_get(refcheck); + if (!IS_ERR(env)) { + io = vvp_env_thread_io(env); + io->ci_obj = clob; + *envout = env; + *ioout = io; + result = 1; + } else { + result = PTR_ERR(env); + } + } else { + result = 0; + } + return result; +} + +int cl_glimpse_size0(struct inode *inode, int agl) +{ + /* + * We don't need ast_flags argument to cl_glimpse_size(), because + * osc_lock_enqueue() takes care of the possible deadlock that said + * argument was introduced to avoid. + */ + /* + * XXX but note that ll_file_seek() passes LDLM_FL_BLOCK_NOWAIT to + * cl_glimpse_size(), which doesn't make sense: glimpse locks are not + * blocking anyway. + */ + struct lu_env *env = NULL; + struct cl_io *io = NULL; + u16 refcheck; + int retried = 0; + int result; + + ENTRY; + + result = cl_io_get(inode, &env, &io, &refcheck); + if (result <= 0) + RETURN(result); + + do { + io->ci_ndelay_tried = retried++; + io->ci_ndelay = io->ci_verify_layout = 1; + result = cl_io_init(env, io, CIT_GLIMPSE, io->ci_obj); + if (result > 0) { + /* + * nothing to do for this io. 
This currently happens + * when stripe sub-object's are not yet created. + */ + result = io->ci_result; + } else if (result == 0) { + result = cl_glimpse_lock(env, io, inode, io->ci_obj, + agl); + /** + * need to limit retries for FLR mirrors if fast read + * is short because of concurrent truncate. + */ + if (!agl && result == -EAGAIN && + !io->ci_tried_all_mirrors) + io->ci_need_restart = 1; + } + + OBD_FAIL_TIMEOUT(OBD_FAIL_GLIMPSE_DELAY, cfs_fail_val ?: 4); + cl_io_fini(env, io); + } while (unlikely(io->ci_need_restart)); + + cl_env_put(env, &refcheck); + RETURN(result); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/lcommon_cl.c b/drivers/staging/lustrefsx/lustre/llite/lcommon_cl.c new file mode 100644 index 0000000000000..dfc7edf29e81a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/lcommon_cl.c @@ -0,0 +1,287 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "llite_internal.h" +#include "vvp_internal.h" + +/** + * An `emergency' environment used by cl_inode_fini() when cl_env_get() + * fails. Access to this environment is serialized by cl_inode_fini_guard + * mutex. + */ +struct lu_env *cl_inode_fini_env; +__u16 cl_inode_fini_refcheck; + +/** + * A mutex serializing calls to slp_inode_fini() under extreme memory + * pressure, when environments cannot be allocated. 
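+ * In that case cl_inode_fini() falls back to the shared cl_inode_fini_env
+ * declared above, which is only safe to use while this mutex is held.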
+ */ +static DEFINE_MUTEX(cl_inode_fini_guard); + +int cl_setattr_ost(struct cl_object *obj, const struct iattr *attr, + enum op_xvalid xvalid, unsigned int attr_flags) +{ + struct lu_env *env; + struct cl_io *io; + int result; + __u16 refcheck; + + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = vvp_env_thread_io(env); + io->ci_obj = obj; + io->ci_verify_layout = 1; + + io->u.ci_setattr.sa_attr.lvb_atime = attr->ia_atime.tv_sec; + io->u.ci_setattr.sa_attr.lvb_mtime = attr->ia_mtime.tv_sec; + io->u.ci_setattr.sa_attr.lvb_ctime = attr->ia_ctime.tv_sec; + io->u.ci_setattr.sa_attr.lvb_size = attr->ia_size; + io->u.ci_setattr.sa_attr_flags = attr_flags; + io->u.ci_setattr.sa_avalid = attr->ia_valid; + io->u.ci_setattr.sa_xvalid = xvalid; + io->u.ci_setattr.sa_parent_fid = lu_object_fid(&obj->co_lu); + if (attr->ia_valid & ATTR_SIZE) + io->u.ci_setattr.sa_subtype = CL_SETATTR_TRUNC; +again: + if (attr->ia_valid & ATTR_FILE) + ll_io_set_mirror(io, attr->ia_file); + + if (cl_io_init(env, io, CIT_SETATTR, io->ci_obj) == 0) { + struct vvp_io *vio = vvp_env_io(env); + + if (attr->ia_valid & ATTR_FILE) + /* + * populate the file descriptor for ftruncate to honor + * group lock - see LU-787 + */ + vio->vui_fd = attr->ia_file->private_data; + + result = cl_io_loop(env, io); + } else { + result = io->ci_result; + } + cl_io_fini(env, io); + if (unlikely(io->ci_need_restart)) + goto again; + + cl_env_put(env, &refcheck); + RETURN(result); +} + +/** + * Initialize or update CLIO structures for regular files when new + * meta-data arrives from the server. + * + * \param inode regular file inode + * \param md new file metadata from MDS + * - allocates cl_object if necessary, + * - updated layout, if object was already here. + */ +int cl_file_inode_init(struct inode *inode, struct lustre_md *md) +{ + struct lu_env *env; + struct ll_inode_info *lli; + struct cl_object *clob; + struct lu_site *site; + struct lu_fid *fid; + struct cl_object_conf conf = { + .coc_inode = inode, + .u = { + .coc_layout = md->layout, + } + }; + int result = 0; + __u16 refcheck; + + LASSERT(md->body->mbo_valid & OBD_MD_FLID); + LASSERT(S_ISREG(inode->i_mode)); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + site = ll_i2sbi(inode)->ll_site; + lli = ll_i2info(inode); + fid = &lli->lli_fid; + LASSERT(fid_is_sane(fid)); + + if (lli->lli_clob == NULL) { + /* clob is slave of inode, empty lli_clob means for new inode, + * there is no clob in cache with the given fid, so it is + * unnecessary to perform lookup-alloc-lookup-insert, just + * alloc and insert directly. + */ + if (!(inode->i_state & I_NEW)) { + result = -EIO; + CERROR("%s: unexpected not-NEW inode "DFID": rc = %d\n", + ll_i2sbi(inode)->ll_fsname, PFID(fid), result); + goto out; + } + + conf.coc_lu.loc_flags = LOC_F_NEW; + clob = cl_object_find(env, lu2cl_dev(site->ls_top_dev), + fid, &conf); + if (!IS_ERR(clob)) { + /* + * No locking is necessary, as new inode is + * locked by I_NEW bit. 
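+			 * Other threads cannot use this inode until
+			 * unlock_new_inode() clears I_NEW.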
+ */ + lli->lli_clob = clob; + lu_object_ref_add(&clob->co_lu, "inode", inode); + } else { + result = PTR_ERR(clob); + } + } else { + result = cl_conf_set(env, lli->lli_clob, &conf); + if (result == -EBUSY) { + /* ignore the error since I/O will handle it later */ + result = 0; + } + } + + if (result != 0) + CERROR("%s: failed to initialize cl_object "DFID": rc = %d\n", + ll_i2sbi(inode)->ll_fsname, PFID(fid), result); + +out: + cl_env_put(env, &refcheck); + + return result; +} + +/** + * Wait for others drop their references of the object at first, then we drop + * the last one, which will lead to the object be destroyed immediately. + * Must be called after cl_object_kill() against this object. + * + * The reason we want to do this is: destroying top object will wait for sub + * objects being destroyed first, so we can't let bottom layer (e.g. from ASTs) + * to initiate top object destroying which may deadlock. See bz22520. + */ +static void cl_object_put_last(struct lu_env *env, struct cl_object *obj) +{ + struct lu_object_header *header = obj->co_lu.lo_header; + + if (unlikely(atomic_read(&header->loh_ref) != 1)) { + struct lu_site *site = obj->co_lu.lo_dev->ld_site; + wait_queue_head_t *wq; + + wq = lu_site_wq_from_fid(site, &header->loh_fid); + + wait_event(*wq, atomic_read(&header->loh_ref) == 1); + } + + cl_object_put(env, obj); +} + +void cl_inode_fini(struct inode *inode) +{ + struct lu_env *env; + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_object *clob = lli->lli_clob; + __u16 refcheck; + int emergency; + + if (clob != NULL) { + env = cl_env_get(&refcheck); + emergency = IS_ERR(env); + if (emergency) { + mutex_lock(&cl_inode_fini_guard); + LASSERT(cl_inode_fini_env != NULL); + env = cl_inode_fini_env; + } + + /* + * cl_object cache is a slave to inode cache (which, in turn + * is a slave to dentry cache), don't keep cl_object in memory + * when its master is evicted. + */ + cl_object_kill(env, clob); + lu_object_ref_del(&clob->co_lu, "inode", inode); + cl_object_put_last(env, clob); + lli->lli_clob = NULL; + if (emergency) + mutex_unlock(&cl_inode_fini_guard); + else + cl_env_put(env, &refcheck); + } +} + +/** + * build inode number from passed @fid. + * + * For 32-bit systems or syscalls limit the inode number to a 32-bit value + * to avoid EOVERFLOW errors. This will inevitably result in inode number + * collisions, but fid_flatten32() tries hard to avoid this if possible. + */ +__u64 cl_fid_build_ino(const struct lu_fid *fid, int api32) +{ + if (BITS_PER_LONG == 32 || api32) + RETURN(fid_flatten32(fid)); + + RETURN(fid_flatten(fid)); +} + +/** + * build inode generation from passed @fid. If our FID overflows the 32-bit + * inode number then return a non-zero generation to distinguish them. + */ +__u32 cl_fid_build_gen(const struct lu_fid *fid) +{ + if (fid_is_igif(fid)) + RETURN(lu_igif_gen(fid)); + + RETURN(fid_flatten(fid) >> 32); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/lcommon_misc.c b/drivers/staging/lustrefsx/lustre/llite/lcommon_misc.c new file mode 100644 index 0000000000000..70290ad705018 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/lcommon_misc.c @@ -0,0 +1,189 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * cl code used by vvp (and other Lustre clients in the future). + * + */ +#define DEBUG_SUBSYSTEM S_LLITE +#include +#include +#include +#include + +#include "llite_internal.h" + +/* + * Initialize the default and maximum LOV EA and cookie sizes. This allows + * us to make MDS RPCs with large enough reply buffers to hold the + * maximum-sized (= maximum striped) EA and cookie without having to + * calculate this (via a call into the LOV + OSCs) each time we make an RPC. + */ +static int cl_init_ea_size(struct obd_export *md_exp, struct obd_export *dt_exp) +{ + u32 val_size; + u32 max_easize; + u32 def_easize; + int rc; + + ENTRY; + + val_size = sizeof(max_easize); + rc = obd_get_info(NULL, dt_exp, sizeof(KEY_MAX_EASIZE), KEY_MAX_EASIZE, + &val_size, &max_easize); + if (rc != 0) + RETURN(rc); + + val_size = sizeof(def_easize); + rc = obd_get_info(NULL, dt_exp, sizeof(KEY_DEFAULT_EASIZE), + KEY_DEFAULT_EASIZE, &val_size, &def_easize); + if (rc != 0) + RETURN(rc); + + /* + * default cookiesize is 0 because from 2.4 server doesn't send + * llog cookies to client. + */ + CDEBUG(D_HA, "updating def/max_easize: %d/%d\n", + def_easize, max_easize); + + rc = md_init_ea_size(md_exp, max_easize, def_easize); + RETURN(rc); +} + +/** + * This function is used as an upcall-callback hooked llite clients + * into obd_notify() listeners chain to handle notifications about + * change of import connect_flags. See lustre_common_fill_super(). 
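+ * On each OSC setup event the import connect flags are folded into
+ * lco_flags and the maximum/default EA sizes are refreshed via
+ * cl_init_ea_size().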
+ */ +int cl_ocd_update(struct obd_device *host, struct obd_device *watched, + enum obd_notify_event ev, void *owner) +{ + struct lustre_client_ocd *lco; + struct client_obd *cli; + u64 flags; + int result; + + ENTRY; + + if (!strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME) && + watched->obd_set_up && !watched->obd_stopping) { + cli = &watched->u.cli; + lco = owner; + flags = cli->cl_import->imp_connect_data.ocd_connect_flags; + CDEBUG(D_SUPER, "Changing connect_flags: %#llx -> %#llx\n", + lco->lco_flags, flags); + mutex_lock(&lco->lco_lock); + lco->lco_flags &= flags; + /* for each osc event update ea size */ + if (lco->lco_dt_exp) + cl_init_ea_size(lco->lco_md_exp, lco->lco_dt_exp); + + mutex_unlock(&lco->lco_lock); + result = 0; + } else { + CERROR("unexpected notification from %s %s" + "(setup:%d,stopping:%d)!\n", + watched->obd_type->typ_name, + watched->obd_name, watched->obd_set_up, + watched->obd_stopping); + result = -EINVAL; + } + RETURN(result); +} + +#define GROUPLOCK_SCOPE "grouplock" + +int cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock, + struct ll_grouplock *lg) +{ + struct lu_env *env; + struct cl_io *io; + struct cl_lock *lock; + struct cl_lock_descr *descr; + u32 enqflags; + u16 refcheck; + int rc; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + io = vvp_env_thread_io(env); + io->ci_obj = obj; + + rc = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (rc != 0) { + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + /* Does not make sense to take GL for released layout */ + if (rc > 0) + rc = -ENOTSUPP; + return rc; + } + + lock = vvp_env_lock(env); + descr = &lock->cll_descr; + descr->cld_obj = obj; + descr->cld_start = 0; + descr->cld_end = CL_PAGE_EOF; + descr->cld_gid = gid; + descr->cld_mode = CLM_GROUP; + + enqflags = CEF_MUST | (nonblock ? CEF_NONBLOCK : 0); + descr->cld_enq_flags = enqflags; + + rc = cl_lock_request(env, io, lock); + if (rc < 0) { + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + return rc; + } + + lg->lg_env = env; + lg->lg_io = io; + lg->lg_lock = lock; + lg->lg_gid = gid; + + return 0; +} + +void cl_put_grouplock(struct ll_grouplock *lg) +{ + struct lu_env *env = lg->lg_env; + struct cl_io *io = lg->lg_io; + struct cl_lock *lock = lg->lg_lock; + + LASSERT(lg->lg_env != NULL); + LASSERT(lg->lg_gid != 0); + + cl_lock_release(env, lock); + cl_io_fini(env, io); + cl_env_put(env, NULL); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_foreign.c b/drivers/staging/lustrefsx/lustre/llite/llite_foreign.c new file mode 100644 index 0000000000000..9e2a7cbd44c08 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/llite_foreign.c @@ -0,0 +1,281 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2020 Intel Corporation. + */ +#define DEBUG_SUBSYSTEM S_LLITE + +#include "llite_internal.h" + +static void ll_manage_foreign_file(struct inode *inode, + struct lov_foreign_md *lfm) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + + if (le32_to_cpu(lfm->lfm_type) == LU_FOREIGN_TYPE_SYMLINK) { + CDEBUG(D_INFO, + "%s: inode %p of fid "DFID": Foreign file of type symlink, faking a symlink\n", + sbi->ll_fsname, inode, PFID(ll_inode2fid(inode))); + /* change inode_operations to add symlink methods, and clear + * IOP_NOFOLLOW to ensure file will be treated as a symlink + * by Kernel (see in * d_flags_for_inode()). + */ + inode->i_op = &ll_foreign_file_symlink_inode_operations; + inode->i_opflags &= ~IOP_NOFOLLOW; + } else { + CDEBUG(D_INFO, + "%s: inode %p of fid "DFID": Foreign file of type %ux, nothing special to do\n", + sbi->ll_fsname, inode, PFID(ll_inode2fid(inode)), + le32_to_cpu(lfm->lfm_type)); + } +} + +static void ll_manage_foreign_dir(struct inode *inode, + struct lmv_foreign_md *lfm) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + + if (lfm->lfm_type == LU_FOREIGN_TYPE_SYMLINK) { + CDEBUG(D_INFO, + "%s: inode %p of fid "DFID": Foreign dir of type symlink, faking a symlink\n", + sbi->ll_fsname, inode, PFID(ll_inode2fid(inode))); + /* change inode_operations to add symlink methods + * IOP_NOFOLLOW should not be set for dirs + */ + inode->i_op = &ll_foreign_dir_symlink_inode_operations; + } else { + CDEBUG(D_INFO, + "%s: inode %p of fid "DFID": Foreign dir of type %ux, nothing special to do\n", + sbi->ll_fsname, inode, PFID(ll_inode2fid(inode)), + le32_to_cpu(lfm->lfm_type)); + } +} + +int ll_manage_foreign(struct inode *inode, struct lustre_md *lmd) +{ + int rc = 0; + + ENTRY; + /* apply any foreign file/dir policy */ + if (S_ISREG((inode)->i_mode)) { + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_object *obj = lli->lli_clob; + + if (lmd->layout.lb_buf != NULL && lmd->layout.lb_len != 0) { + struct lov_foreign_md *lfm = lmd->layout.lb_buf; + + if (lfm->lfm_magic == LOV_MAGIC_FOREIGN) + ll_manage_foreign_file(inode, lfm); + GOTO(out, rc); + } + + if (obj) { + struct lov_foreign_md lfm = { + .lfm_magic = LOV_MAGIC, + }; + struct cl_layout cl = { + .cl_buf.lb_buf = &lfm, + .cl_buf.lb_len = sizeof(lfm), + }; + struct lu_env *env; + u16 refcheck; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + GOTO(out, rc = PTR_ERR(env)); + rc = cl_object_layout_get(env, obj, &cl); + /* error is likely to be -ERANGE because of the small + * buffer we use, only the content is significant here + */ + if (rc < 0 && rc != -ERANGE) { + cl_env_put(env, &refcheck); + GOTO(out, rc); + } + if (lfm.lfm_magic == LOV_MAGIC_FOREIGN) + ll_manage_foreign_file(inode, &lfm); + cl_env_put(env, &refcheck); + } + } else if (S_ISDIR((inode)->i_mode)) { + if (lmd->lfm != NULL && + lmd->lfm->lfm_magic == LMV_MAGIC_FOREIGN) { + ll_manage_foreign_dir(inode, lmd->lfm); + } else { + struct ll_inode_info *lli = ll_i2info(inode); + struct lmv_foreign_md *lfm; + + down_read(&lli->lli_lsm_sem); + lfm = (struct lmv_foreign_md *)(lli->lli_lsm_md); + if (lfm && lfm->lfm_magic == LMV_MAGIC_FOREIGN) + ll_manage_foreign_dir(inode, lfm); + up_read(&lli->lli_lsm_sem); + } + } +out: + RETURN(rc); +} + +/* dentry must be spliced to inode (dentry->d_inode != NULL) !!! 
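+ * (the check below dereferences dentry->d_inode->i_mode to tell a faked
+ * foreign symlink from a regular one).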
*/ +bool ll_foreign_is_openable(struct dentry *dentry, unsigned int flags) +{ + /* check for faked symlink here as they should not be opened (unless + * O_NOFOLLOW!) and thus wants ll_atomic_open() to return 1 from + * finish_no_open() in order to get follow_link() to be called in both + * path_lookupat() and path_openupat(). + * This will not break regular symlink handling as they have + * been treated/filtered upstream. + */ + if (d_is_symlink(dentry) && !S_ISLNK(dentry->d_inode->i_mode) && + !(flags & O_NOFOLLOW)) + return false; + + return true; +} + +static bool should_preserve_foreign_file(struct lov_foreign_md *lfm, + struct ll_inode_info *lli, bool unset) +{ + /* for now, only avoid foreign fake symlink file removal */ + + if (unset) + if (lfm->lfm_type == LU_FOREIGN_TYPE_SYMLINK) { + set_bit(LLIF_FOREIGN_REMOVABLE, &lli->lli_flags); + return true; + } else { + return false; + } + else + return lfm->lfm_type == LU_FOREIGN_TYPE_SYMLINK && + !test_bit(LLIF_FOREIGN_REMOVABLE, &lli->lli_flags); +} + +static bool should_preserve_foreign_dir(struct lmv_foreign_md *lfm, + struct ll_inode_info *lli, bool unset) +{ + /* for now, only avoid foreign fake symlink dir removal */ + + if (unset) + if (lfm->lfm_type == LU_FOREIGN_TYPE_SYMLINK) { + set_bit(LLIF_FOREIGN_REMOVABLE, &lli->lli_flags); + return true; + } else { + return false; + } + else + return lfm->lfm_type == LU_FOREIGN_TYPE_SYMLINK && + !test_bit(LLIF_FOREIGN_REMOVABLE, &lli->lli_flags); +} + +/* XXX + * instead of fetching type from foreign LOV/LMV, we may simply + * check (d_is_symlink(dentry) && !S_ISLNK(dentry->d_inode->i_mode)) + * to identify a fake symlink + */ +bool ll_foreign_is_removable(struct dentry *dentry, bool unset) +{ + struct inode *inode = dentry->d_inode; + struct qstr *name = &dentry->d_name; + bool preserve_foreign = false; + int rc = 0; + + ENTRY; + if (inode == NULL) + return 0; + + /* some foreign types may not be allowed to be unlinked in order to + * keep references with external objects + */ + if (S_ISREG(inode->i_mode)) { + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_object *obj = lli->lli_clob; + + if (obj) { + struct lov_foreign_md lfm = { + .lfm_magic = LOV_MAGIC, + }; + struct cl_layout cl = { + .cl_buf.lb_buf = &lfm, + .cl_buf.lb_len = sizeof(lfm), + }; + struct lu_env *env; + u16 refcheck; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + GOTO(out, rc = PTR_ERR(env)); + rc = cl_object_layout_get(env, obj, &cl); + /* error is likely to be -ERANGE because of the small + * buffer we use, only the content is significant here + */ + if (rc < 0 && rc != -ERANGE) { + cl_env_put(env, &refcheck); + goto out; + } else { + rc = 0; + } + if (lfm.lfm_magic == LOV_MAGIC_FOREIGN) + preserve_foreign = + should_preserve_foreign_file(&lfm, lli, + unset); + cl_env_put(env, &refcheck); + if (preserve_foreign) { + CDEBUG(D_INFO, + "%s unlink of foreign file (%.*s, "DFID")\n", + unset ? "allow" : "prevent", + name->len, name->name, + PFID(ll_inode2fid(inode))); + RETURN(false); + } + } else { + CDEBUG(D_INFO, + "unable to check if file (%.*s, "DFID") is foreign...\n", + name->len, name->name, + PFID(ll_inode2fid(inode))); + /* XXX should we prevent removal ?? 
*/ + } + } else if (S_ISDIR(inode->i_mode)) { + struct ll_inode_info *lli = ll_i2info(inode); + struct lmv_foreign_md *lfm; + + down_read(&lli->lli_lsm_sem); + lfm = (struct lmv_foreign_md *)(lli->lli_lsm_md); + if (!lfm) + CDEBUG(D_INFO, + "unable to check if dir (%.*s, "DFID") is foreign...\n", + name->len, name->name, + PFID(ll_inode2fid(inode))); + else if (lfm->lfm_magic == LMV_MAGIC_FOREIGN) + preserve_foreign = should_preserve_foreign_dir(lfm, lli, + unset); + up_read(&lli->lli_lsm_sem); + if (preserve_foreign) { + CDEBUG(D_INFO, + "%s unlink of foreign dir (%.*s, "DFID")\n", + unset ? "allow" : "prevent", + name->len, name->name, + PFID(ll_inode2fid(inode))); + RETURN(false); + } + } + +out: + RETURN(true); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_foreign_symlink.c b/drivers/staging/lustrefsx/lustre/llite/llite_foreign_symlink.c new file mode 100644 index 0000000000000..b9ee7daf2e3ca --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/llite_foreign_symlink.c @@ -0,0 +1,865 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2020 Intel Corporation. + */ +/* + * Foreign symlink implementation. + * + * Methods in this source file allow to construct a relative path from the + * LOV/LMV foreign content, to complement it with a prefix, and then to + * expose it to the VFS as a symlink destination. + * The default/internal mechanism simply takes the full foreign free string + * as the relative path, and for more complex internal formats an upcall has + * been implemented to provide format's details (presently just in terms of + * constant strings and substrings positions in EA, but this can be enhanced) + * to llite layer. 
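+ * For example (values purely illustrative): with the default parsing, a
+ * prefix of "/mnt/external" and a foreign EA carrying the free-format
+ * string "proj/data/f1" yield a faked symlink destination of the form
+ * "/mnt/external/proj/data/f1".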
+ */
+
+#include
+#include
+#include
+#include
+#include
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include "llite_internal.h"
+
+/* allocate space for "//'\0'" and copy the prefix in;
+ * returns the start position for the suffix in *destname.
+ * Must be called with ll_foreign_symlink_sem locked for read, to
+ * protect against sbi->ll_foreign_symlink_prefix changes.
+ * On output, provides the position where to start the prefix complement.
+ */
+static int foreign_symlink_alloc_and_copy_prefix(struct ll_sb_info *sbi,
+						 struct inode *inode,
+						 char **destname,
+						 size_t suffix_size)
+{
+	size_t prefix_size, full_size;
+
+	ENTRY;
+
+	/* allocate enough for "//'\0'" */
+	prefix_size = sbi->ll_foreign_symlink_prefix_size - 1;
+	full_size = suffix_size + prefix_size + 3;
+	if (full_size > PATH_MAX) {
+		CERROR("%s: inode "DFID": resolved destination path too long\n",
+		       sbi->ll_fsname, PFID(ll_inode2fid(inode)));
+		RETURN(-EINVAL);
+	}
+	OBD_ALLOC(*destname, full_size);
+	if (*destname == NULL)
+		RETURN(-ENOMEM);
+
+	memcpy(*destname + 1, sbi->ll_foreign_symlink_prefix,
+	       prefix_size);
+	(*destname)[0] = '/';
+	(*destname)[prefix_size + 1] = '/';
+
+	RETURN(prefix_size + 2);
+}
+
+/* if no upcall is registered, the default foreign symlink parsing method
+ * is to use the full lfm_value as a relative path complementing
+ * foreign_prefix
+ */
+static int ll_foreign_symlink_default_parse(struct ll_sb_info *sbi,
+					    struct inode *inode,
+					    struct lov_foreign_md *lfm,
+					    char **destname)
+{
+	int suffix_pos;
+
+	down_read(&sbi->ll_foreign_symlink_sem);
+	suffix_pos = foreign_symlink_alloc_and_copy_prefix(sbi, inode,
+							   destname,
+							   lfm->lfm_length);
+	up_read(&sbi->ll_foreign_symlink_sem);
+
+	if (suffix_pos < 0)
+		RETURN(suffix_pos);
+
+	memcpy(*destname + suffix_pos, lfm->lfm_value,
+	       lfm->lfm_length);
+	(*destname)[suffix_pos + lfm->lfm_length] = '\0';
+
+	RETURN(0);
+}
+
+/* if an upcall has been registered, the foreign symlink will be
+ * constructed as per the upcall-provided format.
+ * Presently we only support a series of constant strings and sub-strings
+ * taken from the lfm_value content
+ */
+static int ll_foreign_symlink_upcall_parse(struct ll_sb_info *sbi,
+					   struct inode *inode,
+					   struct lov_foreign_md *lfm,
+					   char **destname)
+{
+	int pos = 0, suffix_pos = -1, items_size = 0;
+	struct ll_foreign_symlink_upcall_item *foreign_symlink_items =
+		sbi->ll_foreign_symlink_upcall_items;
+	int i = 0, rc = 0;
+
+	ENTRY;
+
+	down_read(&sbi->ll_foreign_symlink_sem);
+
+	/* compute the size of the relative destination path;
+	 * this could be done once during upcall items/infos reading
+	 * and stored as a new ll_sb_info field
+	 */
+	for (i = 0; i < sbi->ll_foreign_symlink_upcall_nb_items; i++) {
+		switch (foreign_symlink_items[i].type) {
+		case STRING_TYPE:
+			items_size += foreign_symlink_items[i].size;
+			break;
+		case POSLEN_TYPE:
+			items_size += foreign_symlink_items[i].len;
+			break;
+		case EOB_TYPE:
+			/* should be the last item */
+			break;
+		default:
+			CERROR("%s: unexpected type '%u' found in items\n",
+			       sbi->ll_fsname, foreign_symlink_items[i].type);
+			GOTO(failed, rc = -EINVAL);
+		}
+	}
+
+	suffix_pos = foreign_symlink_alloc_and_copy_prefix(sbi, inode, destname,
+							   items_size);
+	if (suffix_pos < 0)
+		GOTO(failed, rc = suffix_pos);
+
+	/* rescan foreign_symlink_items[] to create faked symlink dest path */
+	i = 0;
+	while (foreign_symlink_items[i].type != EOB_TYPE) {
+		if (foreign_symlink_items[i].type == STRING_TYPE) {
+			memcpy(*destname + suffix_pos + pos,
+			       foreign_symlink_items[i].string,
+			       foreign_symlink_items[i].size);
+			pos +=
foreign_symlink_items[i].size; + } else if (foreign_symlink_items[i].type == POSLEN_TYPE) { + if (lfm->lfm_length < foreign_symlink_items[i].pos + + foreign_symlink_items[i].len) { + CERROR("%s: "DFID" foreign EA too short to find (%u,%u) item\n", + sbi->ll_fsname, + PFID(ll_inode2fid(inode)), + foreign_symlink_items[i].pos, + foreign_symlink_items[i].len); + GOTO(failed, rc = -EINVAL); + } + memcpy(*destname + suffix_pos + pos, + lfm->lfm_value + foreign_symlink_items[i].pos, + foreign_symlink_items[i].len); + pos += foreign_symlink_items[i].len; + } else { + CERROR("%s: unexpected type '%u' found in items\n", + sbi->ll_fsname, foreign_symlink_items[i].type); + GOTO(failed, rc = -EINVAL); + } + i++; + } +failed: + up_read(&sbi->ll_foreign_symlink_sem); + + if (rc != 0 && suffix_pos >= 0) { + OBD_FREE_LARGE(*destname, suffix_pos + items_size); + *destname = NULL; + } + + RETURN(rc); +} + +static int ll_foreign_symlink_parse(struct ll_sb_info *sbi, + struct inode *inode, + struct lov_foreign_md *lfm, + char **destname) +{ + int rc; + + /* if no user-land upcall registered, assuming whole free field + * of foreign LOV is relative path of faked symlink destination, + * to be completed by prefix + */ + if (!test_bit(LL_SBI_FOREIGN_SYMLINK_UPCALL, sbi->ll_flags)) + rc = ll_foreign_symlink_default_parse(sbi, inode, lfm, + destname); + else /* upcall is available */ + rc = ll_foreign_symlink_upcall_parse(sbi, inode, lfm, + destname); + return rc; +} + +/* Don't need lli_size_mutex locked as LOV/LMV are EAs + * and should not be stored in data blocks + */ +static int ll_foreign_readlink_internal(struct inode *inode, char **symname) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct lov_foreign_md *lfm = NULL; + char *destname = NULL; + size_t lfm_size = 0; + int rc; + + ENTRY; + + if (S_ISREG(inode->i_mode)) { + struct cl_object *obj = lli->lli_clob; + struct cl_layout cl = { + .cl_buf.lb_len = 0, /* to get real size */ + }; + struct lu_env *env; + u16 refcheck; + + if (!obj) { + CERROR("%s: inode "DFID": can not get layout, no cl_object\n", + sbi->ll_fsname, PFID(ll_inode2fid(inode))); + GOTO(failed, rc = -EINVAL); + } + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + /* get layout size */ + rc = cl_object_layout_get(env, obj, &cl); + if (rc <= 0) { + CERROR("%s: inode "DFID": error trying to get layout size : %d\n", + sbi->ll_fsname, PFID(ll_inode2fid(inode)), rc); + cl_env_put(env, &refcheck); + RETURN(rc); + } + OBD_ALLOC(lfm, rc); + if (!lfm) { + CERROR("%s: inode "DFID": can not allocate enough mem to get layout\n", + sbi->ll_fsname, PFID(ll_inode2fid(inode))); + cl_env_put(env, &refcheck); + RETURN(-ENOMEM); + } + cl.cl_buf.lb_len = rc; + cl.cl_buf.lb_buf = lfm; + /* get layout */ + rc = cl_object_layout_get(env, obj, &cl); + if (rc <= 0) { + CERROR("%s: inode "DFID": error trying to get layout : %d\n", + sbi->ll_fsname, PFID(ll_inode2fid(inode)), rc); + OBD_FREE(lfm, cl.cl_buf.lb_len); + cl_env_put(env, &refcheck); + RETURN(rc); + } + lfm_size = cl.cl_buf.lb_len; + cl_env_put(env, &refcheck); + } else if (S_ISDIR(inode->i_mode)) { + down_read(&lli->lli_lsm_sem); + + /* should be casted lmv_foreign_md, but it is ok as both foreign LOV + * and LMV formats are identical, and then we also only need + * one set of parsing routines for both foreign files and dirs! 
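+		 * (both layouts start with magic/length/type/flags followed
+		 * by the free-format value, so reading one as the other is
+		 * safe for the fields used here)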
+		 */
+		lfm = (struct lov_foreign_md *)(lli->lli_lsm_md);
+		if (lfm != NULL) {
+			CDEBUG(D_INFO, "%s: inode "DFID": LMV cached found\n",
+			       sbi->ll_fsname, PFID(ll_inode2fid(inode)));
+		} else {
+			CERROR("%s: inode "DFID": cannot get layout, no LMV cached\n",
+			       sbi->ll_fsname, PFID(ll_inode2fid(inode)));
+			GOTO(failed, rc = -EINVAL);
+		}
+	} else {
+		CERROR("%s: inode "DFID": neither a regular file nor a directory\n",
+		       sbi->ll_fsname, PFID(ll_inode2fid(inode)));
+		GOTO(failed, rc = -EINVAL);
+	}
+
+	/* XXX no assert nor double check of magic, length and type ? */
+
+	rc = ll_foreign_symlink_parse(sbi, inode, lfm, &destname);
+
+failed:
+	if (S_ISDIR(inode->i_mode))
+		up_read(&lli->lli_lsm_sem);
+
+	if (S_ISREG(inode->i_mode) && lfm)
+		OBD_FREE(lfm, lfm_size);
+
+	if (!rc) {
+		*symname = destname;
+		CDEBUG(D_INFO,
+		       "%s: inode "DFID": faking symlink to dest '%s'\n",
+		       sbi->ll_fsname, PFID(ll_inode2fid(inode)), destname);
+	}
+
+	RETURN(rc);
+}
+
+#ifdef HAVE_SYMLINK_OPS_USE_NAMEIDATA
+static void ll_foreign_put_link(struct dentry *dentry,
+				struct nameidata *nd, void *cookie)
+#else
+# ifdef HAVE_IOP_GET_LINK
+static void ll_foreign_put_link(void *cookie)
+# else
+static void ll_foreign_put_link(struct inode *unused, void *cookie)
+# endif
+#endif
+{
+	/* Avoid allocating an unnecessarily big buffer: since the ways to
+	 * build the symlink path from the foreign LOV/LMV can be multiple
+	 * and are not constant, its size is not known, so we need to use
+	 * strlen(cookie) + 1 to determine it and to avoid false positives
+	 * being reported by memory leak checkers.
+	 */
+	OBD_FREE_LARGE(cookie, strlen(cookie) + 1);
+}
+
+#ifdef HAVE_SYMLINK_OPS_USE_NAMEIDATA
+static void *ll_foreign_follow_link(struct dentry *dentry,
+				    struct nameidata *nd)
+{
+	struct inode *inode = dentry->d_inode;
+	int rc;
+	char *symname = NULL;
+
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op\n");
+	/*
+	 * Limit the recursive symlink depth to 5 instead of the default
+	 * 8 links when the kernel has a 4k stack, to prevent stack overflow.
+	 * For 8k stacks we need to limit it to 7 for local servers.
+	 */
+	if (THREAD_SIZE < 8192 && current->link_count >= 6)
+		rc = -ELOOP;
+	else if (THREAD_SIZE == 8192 && current->link_count >= 8)
+		rc = -ELOOP;
+	else
+		rc = ll_foreign_readlink_internal(inode, &symname);
+
+	if (rc)
+		symname = ERR_PTR(rc);
+
+	nd_set_link(nd, symname);
+	RETURN(symname);
+}
+
+#elif defined(HAVE_IOP_GET_LINK)
+static const char *ll_foreign_get_link(struct dentry *dentry,
+				       struct inode *inode,
+				       struct delayed_call *done)
+{
+	char *symname = NULL;
+	int rc;
+
+	ENTRY;
+	CDEBUG(D_VFSTRACE, "VFS Op\n");
+	if (!dentry)
+		RETURN(ERR_PTR(-ECHILD));
+	rc = ll_foreign_readlink_internal(inode, &symname);
+
+	/*
+	 * symname must be freed when we are done
+	 *
+	 * XXX we may avoid the need to do so if we use
+	 * lli_symlink_name cache to retain symname and
+	 * let ll_clear_inode free it...
+	 */
+	set_delayed_call(done, ll_foreign_put_link, symname);
+	RETURN(rc ? ERR_PTR(rc) : symname);
+}
+
+# else /* !HAVE_IOP_GET_LINK */
+static const char *ll_foreign_follow_link(struct dentry *dentry,
+					  void **cookie)
+{
+	struct inode *inode = d_inode(dentry);
+	char *symname = NULL;
+	int rc;
+
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op\n");
+	rc = ll_foreign_readlink_internal(inode, &symname);
+	if (rc < 0)
+		return ERR_PTR(rc);
+
+	/* XXX need to also return symname in cookie in order to delay
+	 * its release ??
+ */ + + RETURN(symname); +} + +#endif /* HAVE_SYMLINK_OPS_USE_NAMEIDATA, HAVE_IOP_GET_LINK */ + +/* + * Should only be called for already in-use/cache foreign dir inode + * when foreign fake-symlink behaviour has been enabled afterward + */ +static struct dentry *ll_foreign_dir_lookup(struct inode *parent, + struct dentry *dentry, + unsigned int flags) +{ + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p)\n", + dentry->d_name.len, dentry->d_name.name, + PFID(ll_inode2fid(parent)), parent); + + return ERR_PTR(-ENODATA); +} + +static bool has_same_mount_namespace(struct ll_sb_info *sbi) +{ + bool same; + + same = (sbi->ll_mnt_ns == current->nsproxy->mnt_ns); + if (!same) + LCONSOLE_WARN("%s: client mount %s and '%s.%d' not in same mnt-namespace\n", + sbi->ll_fsname, sbi->ll_kset.kobj.name, + current->comm, current->pid); + + return same; +} + +ssize_t foreign_symlink_enable_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return snprintf(buf, PAGE_SIZE, "%d\n", + test_bit(LL_SBI_FOREIGN_SYMLINK, sbi->ll_flags)); +} + +/* + * XXX + * There should be already in-use/cached inodes of foreign files/dirs who + * will not-be/continue-to-be handled as fake-symlink, depending if + * feature is being enabled/disabled, until being revalidated. + * Also, does it require sbi->ll_lock protection ? + */ +ssize_t foreign_symlink_enable_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned int val; + int rc; + + if (!has_same_mount_namespace(sbi)) + return -EINVAL; + + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + + if (val) + set_bit(LL_SBI_FOREIGN_SYMLINK, sbi->ll_flags); + else + clear_bit(LL_SBI_FOREIGN_SYMLINK, sbi->ll_flags); + + return count; +} + +ssize_t foreign_symlink_prefix_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + ssize_t size; + + down_read(&sbi->ll_foreign_symlink_sem); + size = snprintf(buf, PAGE_SIZE, "%s\n", sbi->ll_foreign_symlink_prefix); + up_read(&sbi->ll_foreign_symlink_sem); + + return size; +} + +ssize_t foreign_symlink_prefix_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + char *new, *old; + size_t new_len, old_len; + + if (!has_same_mount_namespace(sbi)) + return -EINVAL; + + /* XXX strip buffer of any CR/LF,space,... ?? 
 */
+
+	/* check the buffer looks like a valid absolute path */
+	if (*buffer != '/') {
+		CERROR("foreign symlink prefix must be an absolute path\n");
+		return -EINVAL;
+	}
+	new_len = strnlen(buffer, count);
+	if (new_len < count)
+		CDEBUG(D_INFO, "NUL byte found in %zu bytes\n", count);
+	if (new_len > PATH_MAX) {
+		CERROR("%s: foreign symlink prefix length %zu > PATH_MAX\n",
+		       sbi->ll_fsname, new_len);
+		return -EINVAL;
+	}
+	OBD_ALLOC(new, new_len + 1);
+	if (new == NULL) {
+		CERROR("%s: can not allocate space for foreign path prefix\n",
+		       sbi->ll_fsname);
+		return -ENOSPC;
+	}
+
+	down_write(&sbi->ll_foreign_symlink_sem);
+	old_len = sbi->ll_foreign_symlink_prefix_size;
+	old = sbi->ll_foreign_symlink_prefix;
+	memcpy(new, buffer, new_len);
+	*(new + new_len) = '\0';
+
+	sbi->ll_foreign_symlink_prefix = new;
+	sbi->ll_foreign_symlink_prefix_size = new_len + 1;
+	up_write(&sbi->ll_foreign_symlink_sem);
+
+	if (old)
+		OBD_FREE(old, old_len);
+
+	return new_len;
+}
+
+ssize_t foreign_symlink_upcall_show(struct kobject *kobj,
+				    struct attribute *attr, char *buf)
+{
+	ssize_t size;
+	struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+					      ll_kset.kobj);
+
+	down_read(&sbi->ll_foreign_symlink_sem);
+	size = snprintf(buf, PAGE_SIZE, "%s\n", sbi->ll_foreign_symlink_upcall);
+	up_read(&sbi->ll_foreign_symlink_sem);
+
+	return size;
+}
+
+ssize_t foreign_symlink_upcall_store(struct kobject *kobj,
+				     struct attribute *attr,
+				     const char *buffer, size_t count)
+{
+	struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+					      ll_kset.kobj);
+	char *old = NULL, *new = NULL;
+	size_t new_len;
+
+	if (!has_same_mount_namespace(sbi))
+		return -EINVAL;
+
+	/* XXX strip buffer of any CR/LF,space,... ?? */
+
+	/* check the buffer looks like a valid absolute path */
+	if (*buffer != '/' && strcmp(buffer, "none")) {
+		CERROR("foreign symlink upcall must be an absolute path\n");
+		return -EINVAL;
+	}
+	new_len = strnlen(buffer, count);
+	if (new_len < count)
+		CDEBUG(D_INFO, "NUL byte found in %zu bytes\n", count);
+	if (new_len > PATH_MAX) {
+		CERROR("%s: foreign symlink upcall path length %zu > PATH_MAX\n",
+		       sbi->ll_fsname, new_len);
+		return -EINVAL;
+	}
+
+	OBD_ALLOC(new, new_len + 1);
+	if (new == NULL) {
+		CERROR("%s: can not allocate space for foreign symlink upcall path\n",
+		       sbi->ll_fsname);
+		return -ENOSPC;
+	}
+	memcpy(new, buffer, new_len);
+	*(new + new_len) = '\0';
+
+	down_write(&sbi->ll_foreign_symlink_sem);
+	old = sbi->ll_foreign_symlink_upcall;
+
+	sbi->ll_foreign_symlink_upcall = new;
+	/* LL_SBI_FOREIGN_SYMLINK_UPCALL will be set by
+	 * foreign_symlink_upcall_info_store() upon valid info being provided
+	 * by the upcall.
+	 * XXX there is a potential race if there are multiple concurrent
+	 * attempts to set the upcall path and execution occurs in a
+	 * different order: we may end up using the format provided by a
+	 * different upcall than the one set in ll_foreign_symlink_upcall
+	 */
+	clear_bit(LL_SBI_FOREIGN_SYMLINK_UPCALL, sbi->ll_flags);
+	up_write(&sbi->ll_foreign_symlink_sem);
+
+	if (strcmp(new, "none")) {
+		char *argv[] = {
+			[0] = new,
+			/* sbi sysfs object name */
+			[1] = (char *)sbi->ll_kset.kobj.name,
+			[2] = NULL
+		};
+		char *envp[] = {
+			[0] = "HOME=/",
+			[1] = "PATH=/sbin:/usr/sbin",
+			[2] = NULL
+		};
+		int rc;
+
+		rc = call_usermodehelper(new, argv, envp, UMH_WAIT_EXEC);
+		if (rc < 0)
+			CERROR("%s: error invoking foreign symlink upcall %s: rc %d\n",
+			       sbi->ll_fsname, new, rc);
+		else
+			CDEBUG(D_INFO, "%s: invoked upcall %s\n",
+			       sbi->ll_fsname, new);
+	}
+
+	if (old)
+
+/* foreign_symlink_upcall_info_store() stores format items in
+ * foreign_symlink_items[], and foreign_symlink_upcall_parse()
+ * uses them to parse each foreign symlink LOV/LMV EA
+ */
+ssize_t foreign_symlink_upcall_info_store(struct kobject *kobj,
+					  struct attribute *attr,
+					  const char *buffer, size_t count)
+{
+	struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+					      ll_kset.kobj);
+	struct ll_foreign_symlink_upcall_item items[MAX_NB_UPCALL_ITEMS], *item;
+	struct ll_foreign_symlink_upcall_item *new_items, *old_items;
+	size_t remaining = count;
+	int nb_items = 0, old_nb_items, i, rc = 0;
+
+	ENTRY;
+
+	if (!has_same_mount_namespace(sbi))
+		return -EINVAL;
+
+	/* parse the buffer to check validity of the infos and fill the
+	 * symlink format descriptors
+	 */
+
+	if (count % sizeof(__u32) != 0) {
+		CERROR("%s: invalid size '%zu' of infos buffer returned by foreign symlink upcall\n",
+		       sbi->ll_fsname, count);
+		RETURN(-EINVAL);
+	}
+
+	/* evaluate number of items provided */
+	while (remaining > 0) {
+		item = (struct ll_foreign_symlink_upcall_item *)
+				&buffer[count - remaining];
+		switch (item->type) {
+		case STRING_TYPE: {
+			/* a constant string following */
+			if (item->size >= remaining -
+			    offsetof(struct ll_foreign_symlink_upcall_item,
+				     bytestring) - sizeof(item->type)) {
+				/* size of string must not overflow remaining
+				 * bytes minus EOB_TYPE item
+				 */
+				CERROR("%s: constant string too long in infos buffer returned by foreign symlink upcall\n",
+				       sbi->ll_fsname);
+				GOTO(failed, rc = -EINVAL);
+			}
+			OBD_ALLOC(items[nb_items].string, item->size);
+			if (items[nb_items].string == NULL) {
+				CERROR("%s: allocation has failed for constant string of size %zu\n",
+				       sbi->ll_fsname, item->size);
+				GOTO(failed, rc = -ENOMEM);
+			}
+			memcpy(items[nb_items].string,
+			       item->bytestring, item->size);
+			items[nb_items].size = item->size;
+			/* string items must fit on a __u32 boundary */
+			remaining = remaining - STRING_ITEM_SZ(item->size);
+			break;
+		}
+		case POSLEN_TYPE: {
+			/* a tuple (pos,len) following to delimit a sub-string
+			 * in lfm_value
+			 */
+			items[nb_items].pos = item->pos;
+			items[nb_items].len = item->len;
+			remaining -= POSLEN_ITEM_SZ;
+			break;
+		}
+		case EOB_TYPE:
+			if (remaining != sizeof(item->type)) {
+				CERROR("%s: early end of infos buffer returned by foreign symlink upcall\n",
+				       sbi->ll_fsname);
+				GOTO(failed, rc = -EINVAL);
+			}
+			remaining -= sizeof(item->type);
+			break;
+		default:
+			CERROR("%s: wrong type '%u' encountered at pos %zu, with %zu remaining bytes, in infos buffer returned by foreign symlink upcall\n",
+			       sbi->ll_fsname, (__u32)buffer[count - remaining],
+			       count - remaining, remaining);
+			GOTO(failed, rc = -EINVAL);
+		}
+
+		items[nb_items].type = item->type;
+		nb_items++;
+		if (nb_items >= MAX_NB_UPCALL_ITEMS) {
+			CERROR("%s: too many items in infos buffer returned by foreign symlink upcall\n",
+			       sbi->ll_fsname);
+			GOTO(failed, rc = -EINVAL);
+		}
+	}
+	/* a valid format has been provided by the foreign symlink user upcall */
+	OBD_ALLOC_LARGE(new_items, nb_items *
+			sizeof(struct ll_foreign_symlink_upcall_item));
+	if (new_items == NULL) {
+		CERROR("%s: allocation has failed for items array of size %zu\n",
+		       sbi->ll_fsname, nb_items *
+		       sizeof(struct ll_foreign_symlink_upcall_item));
+		GOTO(failed, rc = -ENOMEM);
+	}
+	for (i = 0; i < nb_items; i++)
+		*((struct ll_foreign_symlink_upcall_item *)new_items + i) =
+			items[i];
+
+	down_write(&sbi->ll_foreign_symlink_sem);
+	old_items
= sbi->ll_foreign_symlink_upcall_items; + old_nb_items = sbi->ll_foreign_symlink_upcall_nb_items; + sbi->ll_foreign_symlink_upcall_items = new_items; + sbi->ll_foreign_symlink_upcall_nb_items = nb_items; + set_bit(LL_SBI_FOREIGN_SYMLINK_UPCALL, sbi->ll_flags); + up_write(&sbi->ll_foreign_symlink_sem); + + /* free old_items */ + if (old_items != NULL) { + for (i = 0 ; i < old_nb_items; i++) + if (old_items[i].type == STRING_TYPE) + OBD_FREE(old_items[i].string, + old_items[i].size); + + OBD_FREE_LARGE(old_items, old_nb_items * + sizeof(struct ll_foreign_symlink_upcall_item)); + } + +failed: + /* clean items[] and free any strings */ + if (rc != 0) { + for (i = 0; i < nb_items; i++) { + switch (items[i].type) { + case STRING_TYPE: + OBD_FREE(items[i].string, items[i].size); + items[i].string = NULL; + items[i].size = 0; + break; + case POSLEN_TYPE: + items[i].pos = 0; + items[i].len = 0; + break; + case EOB_TYPE: + break; + default: + CERROR("%s: wrong '%u'type encountered in foreign symlink upcall items\n", + sbi->ll_fsname, items[i].type); + GOTO(failed, rc = -EINVAL); + break; + } + items[i].type = 0; + } + } + + RETURN(rc == 0 ? count : rc); +} + +/* foreign fake-symlink version of ll_getattr() */ +#if defined(HAVE_USER_NAMESPACE_ARG) +int ll_foreign_symlink_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) +{ + return ll_getattr_dentry(path->dentry, stat, request_mask, flags, + true); +} +#elif defined(HAVE_INODEOPS_ENHANCED_GETATTR) +int ll_foreign_symlink_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) +{ + return ll_getattr_dentry(path->dentry, stat, request_mask, flags, + true); +} +#else +int ll_foreign_symlink_getattr(struct vfsmount *mnt, struct dentry *de, + struct kstat *stat) +{ + return ll_getattr_dentry(de, stat, STATX_BASIC_STATS, + AT_STATX_SYNC_AS_STAT, true); +} +#endif + +struct inode_operations ll_foreign_file_symlink_inode_operations = { +#ifdef HAVE_IOP_GENERIC_READLINK + .readlink = generic_readlink, +#endif + .setattr = ll_setattr, +#ifdef HAVE_IOP_GET_LINK + .get_link = ll_foreign_get_link, +#else + .follow_link = ll_foreign_follow_link, + /* .put_link method required since need to release symlink copy buf */ + .put_link = ll_foreign_put_link, +#endif + .getattr = ll_foreign_symlink_getattr, + .permission = ll_inode_permission, +#ifdef HAVE_IOP_XATTR + .setxattr = ll_setxattr, + .getxattr = ll_getxattr, + .removexattr = ll_removexattr, +#endif + .listxattr = ll_listxattr, +}; + +struct inode_operations ll_foreign_dir_symlink_inode_operations = { + .lookup = ll_foreign_dir_lookup, +#ifdef HAVE_IOP_GENERIC_READLINK + .readlink = generic_readlink, +#endif + .setattr = ll_setattr, +#ifdef HAVE_IOP_GET_LINK + .get_link = ll_foreign_get_link, +#else + .follow_link = ll_foreign_follow_link, + .put_link = ll_foreign_put_link, +#endif + .getattr = ll_foreign_symlink_getattr, + .permission = ll_inode_permission, +#ifdef HAVE_IOP_XATTR + .setxattr = ll_setxattr, + .getxattr = ll_getxattr, + .removexattr = ll_removexattr, +#endif + .listxattr = ll_listxattr, +}; diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_internal.h b/drivers/staging/lustrefsx/lustre/llite/llite_internal.h new file mode 100644 index 0000000000000..a52f12abf289b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/llite_internal.h @@ -0,0 +1,1860 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef LLITE_INTERNAL_H +#define LLITE_INTERNAL_H +#include +#include /* for s2sbi */ +#include + +/* for struct cl_lock_descr and struct cl_io */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vvp_internal.h" +#include "pcc.h" +#include "foreign_symlink.h" + +#ifndef FMODE_EXEC +#define FMODE_EXEC 0 +#endif + +#ifndef HAVE_VM_FAULT_RETRY +#define VM_FAULT_RETRY 0 +#endif + +/* Kernel 3.1 kills LOOKUP_CONTINUE, LOOKUP_PARENT is equivalent to it. + * seem kernel commit 49084c3bb2055c401f3493c13edae14d49128ca0 */ +#ifndef LOOKUP_CONTINUE +#define LOOKUP_CONTINUE LOOKUP_PARENT +#endif + +/** Only used on client-side for indicating the tail of dir hash/offset. */ +#define LL_DIR_END_OFF 0x7fffffffffffffffULL +#define LL_DIR_END_OFF_32BIT 0x7fffffffUL + +/* 4UL * 1024 * 1024 */ +#define LL_MAX_BLKSIZE_BITS 22 + +#define LL_IT2STR(it) ((it) ? ldlm_it2str((it)->it_op) : "0") + +#define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET) + +struct ll_dentry_data { + unsigned int lld_sa_generation; + unsigned int lld_invalid:1; + unsigned int lld_nfs_dentry:1; + struct rcu_head lld_rcu_head; + unsigned long lld_neg_cache_timeout; +}; + +#define ll_d2d(de) ((struct ll_dentry_data*)((de)->d_fsdata)) + +#define LLI_INODE_MAGIC 0x111d0de5 +#define LLI_INODE_DEAD 0xdeadd00d + +struct ll_getname_data { +#ifdef HAVE_DIR_CONTEXT + struct dir_context ctx; +#endif + char *lgd_name; /* points to a buffer with NAME_MAX+1 size */ + struct lu_fid lgd_fid; /* target fid we are looking for */ + int lgd_found; /* inode matched? */ +}; + +struct ll_grouplock { + struct lu_env *lg_env; + struct cl_io *lg_io; + struct cl_lock *lg_lock; + unsigned long lg_gid; +}; + +/* See comment on trunc_sem_down_read_nowait */ +struct ll_trunc_sem { + /* when positive, this is a count of readers, when -1, it indicates + * the semaphore is held for write, and 0 is unlocked + */ + atomic_t ll_trunc_readers; + /* this tracks a count of waiting writers */ + atomic_t ll_trunc_waiters; +}; + +struct ll_inode_info { + __u32 lli_inode_magic; + rwlock_t lli_lock; + + volatile unsigned long lli_flags; + struct posix_acl *lli_posix_acl; + + /* identifying fields for both metadata and data stacks. 
*/ + struct lu_fid lli_fid; + /* master inode fid for stripe directory */ + struct lu_fid lli_pfid; + + /* We need all three because every inode may be opened in different + * modes */ + struct obd_client_handle *lli_mds_read_och; + struct obd_client_handle *lli_mds_write_och; + struct obd_client_handle *lli_mds_exec_och; + __u64 lli_open_fd_read_count; + __u64 lli_open_fd_write_count; + __u64 lli_open_fd_exec_count; + + /* Number of times this inode was opened */ + u64 lli_open_fd_count; + /* When last close was performed on this inode */ + ktime_t lli_close_fd_time; + + /* Protects access to och pointers and their usage counters */ + struct mutex lli_och_mutex; + + struct inode lli_vfs_inode; + + /* the most recent timestamps obtained from mds */ + s64 lli_atime; + s64 lli_mtime; + s64 lli_ctime; + s64 lli_btime; + spinlock_t lli_agl_lock; + + /* Try to make the d::member and f::member are aligned. Before using + * these members, make clear whether it is directory or not. */ + union { + /* for directory */ + struct { + /* metadata statahead */ + /* since parent-child threads can share the same @file + * struct, "opendir_key" is the token when dir close for + * case of parent exit before child -- it is me should + * cleanup the dir readahead. */ + void *lli_opendir_key; + struct ll_statahead_info *lli_sai; + /* protect statahead stuff. */ + spinlock_t lli_sa_lock; + /* "opendir_pid" is the token when lookup/revalid + * -- I am the owner of dir statahead. */ + pid_t lli_opendir_pid; + /* directory depth to ROOT */ + unsigned short lli_dir_depth; + /* directory depth to ancestor whose default LMV is + * inherited. + */ + unsigned short lli_inherit_depth; + /* stat will try to access statahead entries or start + * statahead if this flag is set, and this flag will be + * set upon dir open, and cleared when dir is closed, + * statahead hit ratio is too low, or start statahead + * thread failed. */ + unsigned short lli_sa_enabled:1; + /* generation for statahead */ + unsigned int lli_sa_generation; + /* rw lock protects lli_lsm_md */ + struct rw_semaphore lli_lsm_sem; + /* directory stripe information */ + struct lmv_stripe_md *lli_lsm_md; + /* directory default LMV */ + struct lmv_stripe_md *lli_default_lsm_md; + }; + + /* for non-directory */ + struct { + struct mutex lli_size_mutex; + char *lli_symlink_name; + struct ll_trunc_sem lli_trunc_sem; + struct range_lock_tree lli_write_tree; + struct mutex lli_setattr_mutex; + + struct rw_semaphore lli_glimpse_sem; + ktime_t lli_glimpse_time; + struct list_head lli_agl_list; + __u64 lli_agl_index; + + /* for writepage() only to communicate to fsync */ + int lli_async_rc; + + /* protect the file heat fields */ + spinlock_t lli_heat_lock; + __u32 lli_heat_flags; + struct obd_heat_instance lli_heat_instances[OBD_HEAT_COUNT]; + + /* + * Whenever a process try to read/write the file, the + * jobid of the process will be saved here, and it'll + * be packed into the write PRC when flush later. + * + * So the read/write statistics for jobid will not be + * accurate if the file is shared by different jobs. + */ + char lli_jobid[LUSTRE_JOBID_SIZE]; + + struct mutex lli_pcc_lock; + enum lu_pcc_state_flags lli_pcc_state; + /* + * @lli_pcc_generation saves the gobal PCC generation + * when the file was successfully attached into PCC. + * The flags of the PCC dataset are saved in + * @lli_pcc_dsflags. + * The gobal PCC generation will be increased when add + * or delete a PCC backend, or change the configuration + * parameters for PCC. 
+ * If @lli_pcc_generation is same as the gobal PCC + * generation, we can use the saved flags of the PCC + * dataset to determine whether need to try auto attach + * safely. + */ + __u64 lli_pcc_generation; + enum pcc_dataset_flags lli_pcc_dsflags; + struct pcc_inode *lli_pcc_inode; + + struct mutex lli_group_mutex; + __u64 lli_group_users; + unsigned long lli_group_gid; + + __u64 lli_attr_valid; + __u64 lli_lazysize; + __u64 lli_lazyblocks; + }; + }; + + /* XXX: For following frequent used members, although they maybe special + * used for non-directory object, it is some time-wasting to check + * whether the object is directory or not before using them. On the + * other hand, currently, sizeof(f) > sizeof(d), it cannot reduce + * the "ll_inode_info" size even if moving those members into u.f. + * So keep them out side. + * + * In the future, if more members are added only for directory, + * some of the following members can be moved into u.f. + */ + struct cl_object *lli_clob; + + /* mutex to request for layout lock exclusively. */ + struct mutex lli_layout_mutex; + /* Layout version, protected by lli_layout_lock */ + __u32 lli_layout_gen; + spinlock_t lli_layout_lock; + + __u32 lli_projid; /* project id */ + + struct rw_semaphore lli_xattrs_list_rwsem; + struct mutex lli_xattrs_enq_lock; + struct list_head lli_xattrs; /* ll_xattr_entry->xe_list */ + struct list_head lli_lccs; /* list of ll_cl_context */ + seqlock_t lli_page_inv_lock; +}; + +#ifndef HAVE_USER_NAMESPACE_ARG +#define inode_permission(ns, inode, mask) inode_permission(inode, mask) +#define generic_permission(ns, inode, mask) generic_permission(inode, mask) +#define simple_setattr(ns, de, iattr) simple_setattr(de, iattr) +#define ll_inode_permission(ns, inode, mask) ll_inode_permission(inode, mask) +#ifdef HAVE_INODEOPS_ENHANCED_GETATTR +#define ll_getattr(ns, path, stat, mask, fl) ll_getattr(path, stat, mask, fl) +#endif /* HAVE_INODEOPS_ENHANCED_GETATTR */ +#define ll_setattr(ns, de, attr) ll_setattr(de, attr) +#endif + +static inline void ll_trunc_sem_init(struct ll_trunc_sem *sem) +{ + atomic_set(&sem->ll_trunc_readers, 0); + atomic_set(&sem->ll_trunc_waiters, 0); +} + +/* This version of down read ignores waiting writers, meaning if the semaphore + * is already held for read, this down_read will 'join' that reader and also + * take the semaphore. + * + * This lets us avoid an unusual deadlock. + * + * We must take lli_trunc_sem in read mode on entry in to various i/o paths + * in Lustre, in order to exclude truncates. Some of these paths then need to + * take the mmap_lock, while still holding the trunc_sem. The problem is that + * page faults hold the mmap_lock when calling in to Lustre, and then must also + * take the trunc_sem to exclude truncate. + * + * This means the locking order for trunc_sem and mmap_lock is sometimes AB, + * sometimes BA. This is almost OK because in both cases, we take the trunc + * sem for read, so it doesn't block. + * + * However, if a write mode user (truncate, a setattr op) arrives in the + * middle of this, the second reader on the truncate_sem will wait behind that + * writer. + * + * So we have, on our truncate sem, in order (where 'reader' and 'writer' refer + * to the mode in which they take the semaphore): + * reader (holding mmap_lock, needs truncate_sem) + * writer + * reader (holding truncate sem, waiting for mmap_lock) + * + * And so the readers deadlock. 
+ * + * The solution is this modified semaphore, where this down_read ignores + * waiting write operations, and all waiters are woken up at once, so readers + * using down_read_nowait cannot get stuck behind waiting writers, regardless + * of the order they arrived in. + * + * down_read_nowait is only used in the page fault case, where we already hold + * the mmap_lock. This is because otherwise repeated read and write operations + * (which take the truncate sem) could prevent a truncate from ever starting. + * This could still happen with page faults, but without an even more complex + * mechanism, this is unavoidable. + * + * LU-12460 + */ +static inline void trunc_sem_down_read_nowait(struct ll_trunc_sem *sem) +{ + wait_var_event(&sem->ll_trunc_readers, + atomic_inc_unless_negative(&sem->ll_trunc_readers)); +} + +static inline void trunc_sem_down_read(struct ll_trunc_sem *sem) +{ + wait_var_event(&sem->ll_trunc_readers, + atomic_read(&sem->ll_trunc_waiters) == 0 && + atomic_inc_unless_negative(&sem->ll_trunc_readers)); +} + +static inline void trunc_sem_up_read(struct ll_trunc_sem *sem) +{ + if (atomic_dec_return(&sem->ll_trunc_readers) == 0 && + atomic_read(&sem->ll_trunc_waiters)) + wake_up_var(&sem->ll_trunc_readers); +} + +static inline void trunc_sem_down_write(struct ll_trunc_sem *sem) +{ + atomic_inc(&sem->ll_trunc_waiters); + wait_var_event(&sem->ll_trunc_readers, + atomic_cmpxchg(&sem->ll_trunc_readers, 0, -1) == 0); + atomic_dec(&sem->ll_trunc_waiters); +} + +static inline void trunc_sem_up_write(struct ll_trunc_sem *sem) +{ + atomic_set(&sem->ll_trunc_readers, 0); + /* match the smp_mb() in wait_var_event()->prepare_to_wait() */ + smp_mb(); + wake_up_var(&sem->ll_trunc_readers); +} + +#ifdef CONFIG_LUSTRE_FS_POSIX_ACL +static inline void lli_clear_acl(struct ll_inode_info *lli) +{ + if (lli->lli_posix_acl) { + posix_acl_release(lli->lli_posix_acl); + lli->lli_posix_acl = NULL; + } +} + +static inline void lli_replace_acl(struct ll_inode_info *lli, + struct lustre_md *md) +{ + write_lock(&lli->lli_lock); + if (lli->lli_posix_acl) + posix_acl_release(lli->lli_posix_acl); + lli->lli_posix_acl = md->posix_acl; + write_unlock(&lli->lli_lock); +} +#else +static inline void lli_clear_acl(struct ll_inode_info *lli) +{ +} + +static inline void lli_replace_acl(struct ll_inode_info *lli, + struct lustre_md *md) +{ +} +#endif + +static inline __u32 ll_layout_version_get(struct ll_inode_info *lli) +{ + __u32 gen; + + spin_lock(&lli->lli_layout_lock); + gen = lli->lli_layout_gen; + spin_unlock(&lli->lli_layout_lock); + + return gen; +} + +static inline void ll_layout_version_set(struct ll_inode_info *lli, __u32 gen) +{ + spin_lock(&lli->lli_layout_lock); + lli->lli_layout_gen = gen; + spin_unlock(&lli->lli_layout_lock); +} + +enum ll_file_flags { + /* File data is modified. */ + LLIF_DATA_MODIFIED = 0, + /* File is being restored */ + LLIF_FILE_RESTORING = 1, + /* Xattr cache is attached to the file */ + LLIF_XATTR_CACHE = 2, + /* Project inherit */ + LLIF_PROJECT_INHERIT = 3, + /* update atime from MDS even if it's older than local inode atime. 
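 */

The effect of the ll_trunc_sem helpers above is easier to see outside the kernel. Below is a userspace model of the same rules — C11 atomics, with a sched_yield() retry loop standing in for wait_var_event()/wake_up_var() — so it is a sketch of the semantics, not of the kernel implementation: plain readers defer to waiting writers, nowait readers join any existing read hold.

/* trunc_sem_demo.c - userspace sketch of the lli_trunc_sem rules.
 * Build: cc -std=c11 trunc_sem_demo.c
 */
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>

struct trunc_sem {
	atomic_int readers;	/* >0: readers, 0: free, -1: writer */
	atomic_int waiters;	/* waiting writers */
};

/* like trunc_sem_down_read(): defer to waiting writers */
static void demo_down_read(struct trunc_sem *s)
{
	for (;;) {
		int r = atomic_load(&s->readers);

		if (atomic_load(&s->waiters) == 0 && r >= 0 &&
		    atomic_compare_exchange_weak(&s->readers, &r, r + 1))
			return;
		sched_yield();
	}
}

/* like trunc_sem_down_read_nowait(): ignore waiting writers, so a page
 * fault already holding mmap_lock can join an existing read hold */
static void demo_down_read_nowait(struct trunc_sem *s)
{
	for (;;) {
		int r = atomic_load(&s->readers);

		if (r >= 0 &&
		    atomic_compare_exchange_weak(&s->readers, &r, r + 1))
			return;
		sched_yield();
	}
}

static void demo_up_read(struct trunc_sem *s)
{
	atomic_fetch_sub(&s->readers, 1);
}

static void demo_down_write(struct trunc_sem *s)
{
	int zero = 0;

	atomic_fetch_add(&s->waiters, 1);
	while (!atomic_compare_exchange_weak(&s->readers, &zero, -1)) {
		zero = 0;	/* CAS updates 'zero' on failure; reset it */
		sched_yield();
	}
	atomic_fetch_sub(&s->waiters, 1);
}

static void demo_up_write(struct trunc_sem *s)
{
	atomic_store(&s->readers, 0);	/* kernel version also wakes waiters */
}

int main(void)
{
	struct trunc_sem s = { 0 };

	demo_down_read(&s);		/* i/o path takes it for read */
	demo_down_read_nowait(&s);	/* page fault joins regardless */
	demo_up_read(&s);
	demo_up_read(&s);
	demo_down_write(&s);		/* truncate gets exclusive access */
	demo_up_write(&s);
	puts("ok");
	return 0;
}

Resuming the enum ll_file_flags comment interrupted above:

/* update atime from MDS even if it's older than local inode atime.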
*/ + LLIF_UPDATE_ATIME = 4, + /* foreign file/dir can be unlinked unconditionnaly */ + LLIF_FOREIGN_REMOVABLE = 5, + /* Xattr cache is filled */ + LLIF_XATTR_CACHE_FILLED = 7, + +}; + +int ll_xattr_cache_destroy(struct inode *inode); +int ll_xattr_cache_empty(struct inode *inode); + +int ll_xattr_cache_get(struct inode *inode, + const char *name, + char *buffer, + size_t size, + __u64 valid); + +int ll_xattr_cache_insert(struct inode *inode, + const char *name, + char *buffer, + size_t size); + +static inline bool obd_connect_has_secctx(struct obd_connect_data *data) +{ +#ifdef CONFIG_SECURITY + return data->ocd_connect_flags & OBD_CONNECT_FLAGS2 && + data->ocd_connect_flags2 & OBD_CONNECT2_FILE_SECCTX; +#else + return false; +#endif +} + +static inline void obd_connect_set_secctx(struct obd_connect_data *data) +{ +#ifdef CONFIG_SECURITY + data->ocd_connect_flags2 |= OBD_CONNECT2_FILE_SECCTX; +#endif +} + +/* Only smack and selinux is known to use security contexts */ +static inline bool ll_xattr_is_seclabel(const char *name) +{ + return !strcmp(name, XATTR_NAME_SELINUX) || + !strcmp(name, XATTR_NAME_SMACK); +} + +static inline bool ll_xattr_suffix_is_seclabel(const char *suffix) +{ + return !strcmp(suffix, XATTR_SELINUX_SUFFIX) || + !strcmp(suffix, XATTR_SMACK_SUFFIX); +} + +int ll_dentry_init_security(struct dentry *dentry, int mode, struct qstr *name, + const char **secctx_name, __u32 *secctx_name_size, + void **secctx, __u32 *secctx_size, + int *secctx_slot); + +int ll_inode_init_security(struct dentry *dentry, struct inode *inode, + struct inode *dir); + +int ll_inode_notifysecctx(struct inode *inode, + void *secctx, __u32 secctxlen); + +void ll_secctx_name_free(struct ll_sb_info *sbi); + +int ll_secctx_name_store(struct inode *in); + +__u32 ll_secctx_name_get(struct ll_sb_info *sbi, const char **secctx_name); + +int ll_security_secctx_name_filter(struct ll_sb_info *sbi, int xattr_type, + const char *suffix); + +static inline bool obd_connect_has_enc(struct obd_connect_data *data) +{ +#ifdef HAVE_LUSTRE_CRYPTO + return data->ocd_connect_flags & OBD_CONNECT_FLAGS2 && + data->ocd_connect_flags2 & OBD_CONNECT2_ENCRYPT; +#else + return false; +#endif +} + +static inline void obd_connect_set_enc(struct obd_connect_data *data) +{ +#ifdef HAVE_LUSTRE_CRYPTO + data->ocd_connect_flags2 |= OBD_CONNECT2_ENCRYPT; +#endif +} + +static inline bool obd_connect_has_name_enc(struct obd_connect_data *data) +{ +#ifdef HAVE_LUSTRE_CRYPTO + return data->ocd_connect_flags & OBD_CONNECT_FLAGS2 && + data->ocd_connect_flags2 & OBD_CONNECT2_ENCRYPT_NAME; +#else + return false; +#endif +} + +static inline void obd_connect_set_name_enc(struct obd_connect_data *data) +{ +#ifdef HAVE_LUSTRE_CRYPTO + data->ocd_connect_flags2 |= OBD_CONNECT2_ENCRYPT_NAME; +#endif +} + +/* + * Locking to guarantee consistency of non-atomic updates to long long i_size, + * consistency between file size and KMS. + * + * Implemented by ->lli_size_mutex and ->lsm_lock, nested in that order. + */ + +void ll_inode_size_lock(struct inode *inode); +void ll_inode_size_unlock(struct inode *inode); + +static inline struct ll_inode_info *ll_i2info(struct inode *inode) +{ + return container_of(inode, struct ll_inode_info, lli_vfs_inode); +} + +static inline struct pcc_inode *ll_i2pcci(struct inode *inode) +{ + return ll_i2info(inode)->lli_pcc_inode; +} + +/* default to use at least 16M for fast read if possible */ +#define RA_REMAIN_WINDOW_MIN MiB_TO_PAGES(16UL) + +/* default read-ahead on a given client mountpoint. 
*/ +#define SBI_DEFAULT_READ_AHEAD_MAX MiB_TO_PAGES(1024UL) + +/* default read-ahead for a single file descriptor */ +#define SBI_DEFAULT_READ_AHEAD_PER_FILE_MAX MiB_TO_PAGES(256UL) + +/* default read-ahead full files smaller than limit on the second read */ +#define SBI_DEFAULT_READ_AHEAD_WHOLE_MAX MiB_TO_PAGES(2UL) + +/* default range pages */ +#define SBI_DEFAULT_RA_RANGE_PAGES MiB_TO_PAGES(1ULL) + +/* Min range pages */ +#define RA_MIN_MMAP_RANGE_PAGES 16UL + +enum ra_stat { + RA_STAT_HIT = 0, + RA_STAT_MISS, + RA_STAT_DISTANT_READPAGE, + RA_STAT_MISS_IN_WINDOW, + RA_STAT_FAILED_GRAB_PAGE, + RA_STAT_FAILED_MATCH, + RA_STAT_DISCARDED, + RA_STAT_ZERO_LEN, + RA_STAT_ZERO_WINDOW, + RA_STAT_EOF, + RA_STAT_MAX_IN_FLIGHT, + RA_STAT_WRONG_GRAB_PAGE, + RA_STAT_FAILED_REACH_END, + RA_STAT_ASYNC, + RA_STAT_FAILED_FAST_READ, + RA_STAT_MMAP_RANGE_READ, + _NR_RA_STAT, +}; + +struct ll_ra_info { + atomic_t ra_cur_pages; + unsigned long ra_max_pages; + unsigned long ra_max_pages_per_file; + unsigned long ra_range_pages; + unsigned long ra_max_read_ahead_whole_pages; + struct workqueue_struct *ll_readahead_wq; + /* + * Max number of active works could be triggered + * for async readahead. + */ + unsigned int ra_async_max_active; + /* how many async readahead triggered in flight */ + atomic_t ra_async_inflight; + /* Threshold to control when to trigger async readahead */ + unsigned long ra_async_pages_per_file_threshold; +}; + +/* ra_io_arg will be filled in the beginning of ll_readahead with + * ras_lock, then the following ll_read_ahead_pages will read RA + * pages according to this arg, all the items in this structure are + * counted by page index. + */ +struct ra_io_arg { + pgoff_t ria_start_idx; /* start offset of read-ahead*/ + pgoff_t ria_end_idx; /* end offset of read-ahead*/ + unsigned long ria_reserved; /* reserved pages for read-ahead */ + pgoff_t ria_end_idx_min;/* minimum end to cover current read */ + bool ria_eof; /* reach end of file */ + /* If stride read pattern is detected, ria_stoff is the byte offset + * where stride read is started. Note: for normal read-ahead, the + * value here is meaningless, and also it will not be accessed*/ + loff_t ria_stoff; + /* ria_length and ria_bytes are the length and pages length in the + * stride I/O mode. 
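
All of these limits are stored as page counts; MiB_TO_PAGES() is effectively a shift by (20 - PAGE_SHIFT). A quick standalone check of the arithmetic behind the defaults above, not part of the patch (PAGE_SHIFT hard-coded to 12, i.e. 4 KiB pages, for the demo):

/* ra_defaults_demo.c - sanity-check the readahead default arithmetic. */
#include <stdio.h>

#define PAGE_SHIFT 12	/* assumed: 4 KiB pages */
#define MiB_TO_PAGES(mib) ((mib) << (20 - PAGE_SHIFT))

int main(void)
{
	/* 1024 MiB mount-wide default -> 262144 pages */
	printf("ra_max_pages          = %lu\n", MiB_TO_PAGES(1024UL));
	/* 256 MiB per file descriptor -> 65536 pages */
	printf("ra_max_pages_per_file = %lu\n", MiB_TO_PAGES(256UL));
	/* whole-file readahead cutoff: 2 MiB -> 512 pages */
	printf("ra_whole_pages        = %lu\n", MiB_TO_PAGES(2UL));
	return 0;
}
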
And they will also be used to check whether + * it is stride I/O read-ahead in the read-ahead pages*/ + loff_t ria_length; + loff_t ria_bytes; +}; + +/* LL_HIST_MAX=32 causes an overflow */ +#define LL_HIST_MAX 28 +#define LL_HIST_START 12 /* buckets start at 2^12 = 4k */ +#define LL_PROCESS_HIST_MAX 10 +struct per_process_info { + pid_t pid; + struct obd_histogram pp_r_hist; + struct obd_histogram pp_w_hist; +}; + +/* pp_extents[LL_PROCESS_HIST_MAX] will hold the combined process info */ +struct ll_rw_extents_info { + ktime_t pp_init; + struct per_process_info pp_extents[LL_PROCESS_HIST_MAX + 1]; +}; + +#define LL_OFFSET_HIST_MAX 100 +struct ll_rw_process_info { + pid_t rw_pid; + int rw_op; + loff_t rw_range_start; + loff_t rw_range_end; + loff_t rw_last_file_pos; + loff_t rw_offset; + size_t rw_smallest_extent; + size_t rw_largest_extent; + struct ll_file_data *rw_last_file; +}; + +enum stats_track_type { + STATS_TRACK_ALL = 0, /* track all processes */ + STATS_TRACK_PID, /* track process with this pid */ + STATS_TRACK_PPID, /* track processes with this ppid */ + STATS_TRACK_GID, /* track processes with this gid */ + STATS_TRACK_LAST, +}; + +/* flags for sbi->ll_flags */ +enum ll_sbi_flags { + LL_SBI_NOLCK, /* DLM locking disabled directio-only */ + LL_SBI_CHECKSUM, /* checksum each page as it's written */ + LL_SBI_LOCALFLOCK, /* local flocks instead of fs-wide */ + LL_SBI_FLOCK, /* flock enabled */ + LL_SBI_USER_XATTR, /* support user xattr */ + LL_SBI_LRU_RESIZE, /* lru resize support */ + LL_SBI_LAZYSTATFS, /* lazystatfs mount option */ + LL_SBI_32BIT_API, /* generate 32 bit inodes. */ + LL_SBI_USER_FID2PATH, /* fid2path by unprivileged users */ + LL_SBI_VERBOSE, /* verbose mount/umount */ + LL_SBI_ALWAYS_PING, /* ping even if server suppress_pings */ + LL_SBI_TEST_DUMMY_ENCRYPTION, /* test dummy encryption */ + LL_SBI_ENCRYPT, /* client side encryption */ + LL_SBI_FOREIGN_SYMLINK, /* foreign fake-symlink support */ + LL_SBI_FOREIGN_SYMLINK_UPCALL, /* foreign fake-symlink upcall set */ + LL_SBI_ALLOW_VERSION_MISMATCH, /* allow client/server version mismatch */ + LL_SBI_MDLL_BYPASS, /* disable metadata lazy load */ + LL_SBI_NUM_MOUNT_OPT, + + LL_SBI_ACL, /* support ACL */ + LL_SBI_AGL_ENABLED, /* enable agl */ + LL_SBI_64BIT_HASH, /* support 64-bits dir hash/offset */ + LL_SBI_LAYOUT_LOCK, /* layout lock support */ + LL_SBI_XATTR_CACHE, /* support for xattr cache */ + LL_SBI_NOROOTSQUASH, /* do not apply root squash */ + LL_SBI_FAST_READ, /* fast read support */ + LL_SBI_FILE_SECCTX, /* file security context at create */ + LL_SBI_TINY_WRITE, /* tiny write support */ + LL_SBI_FILE_HEAT, /* file heat support */ + LL_SBI_PARALLEL_DIO, /* parallel (async) O_DIRECT RPCs */ + LL_SBI_ENCRYPT_NAME, /* name encryption */ + LL_SBI_MDLL_AUTO_REFRESH, /* enable metadata lazy load */ + LL_SBI_MDLL, /* enable metadata lazy load auto-refresh */ + LL_SBI_NUM_FLAGS +}; + +int ll_sbi_flags_seq_show(struct seq_file *m, void *v); + +/* This is embedded into llite super-blocks to keep track of connect + * flags (capabilities) supported by all imports given mount is + * connected to. */ +struct lustre_client_ocd { + /* This is conjunction of connect_flags across all imports + * (LOVs) this mount is connected to. This field is updated by + * cl_ocd_update() under ->lco_lock. */ + __u64 lco_flags; + struct mutex lco_lock; + struct obd_export *lco_md_exp; + struct obd_export *lco_dt_exp; +}; + +struct ll_sb_info { + /* this protects pglist and ra_info. 
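Note that sbi->ll_flags is no longer a fixed-width integer mask: the enum above indexes a bitmap (DECLARE_BITMAP(ll_flags, LL_SBI_NUM_FLAGS) in struct ll_sb_info just below) accessed with test_bit()/set_bit()/clear_bit(), as in the sysfs handlers earlier. A userspace sketch of that pattern, not part of the patch, with non-atomic stand-ins for the kernel bit helpers and illustrative flag positions only:

/* flag_bitmap_demo.c - sketch of the DECLARE_BITMAP + test_bit pattern. */
#include <limits.h>
#include <stdio.h>

#define BITS_PER_LONG (CHAR_BIT * sizeof(unsigned long))
#define DECLARE_BITMAP(name, bits) \
	unsigned long name[((bits) + BITS_PER_LONG - 1) / BITS_PER_LONG]

enum { DEMO_FLOCK = 3, DEMO_FAST_READ = 21, DEMO_NUM_FLAGS = 40 };

/* non-atomic stand-ins; the kernel's bitops are atomic */
static void demo_set_bit(int nr, unsigned long *map)
{
	map[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

static int demo_test_bit(int nr, const unsigned long *map)
{
	return !!(map[nr / BITS_PER_LONG] & (1UL << (nr % BITS_PER_LONG)));
}

int main(void)
{
	DECLARE_BITMAP(flags, DEMO_NUM_FLAGS) = { 0 };

	demo_set_bit(DEMO_FAST_READ, flags);
	printf("fast_read=%d flock=%d\n",
	       demo_test_bit(DEMO_FAST_READ, flags),
	       demo_test_bit(DEMO_FLOCK, flags));
	return 0;
}
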
It isn't safe to + * grab from interrupt contexts */ + spinlock_t ll_lock; + spinlock_t ll_pp_extent_lock; /* pp_extent entry*/ + spinlock_t ll_process_lock; /* ll_rw_process_info */ + struct obd_uuid ll_sb_uuid; + struct obd_export *ll_md_exp; + struct obd_export *ll_dt_exp; + struct obd_device *ll_md_obd; + struct obd_device *ll_dt_obd; + struct dentry *ll_debugfs_entry; + struct lu_fid ll_root_fid; /* root object fid */ + struct mnt_namespace *ll_mnt_ns; + + DECLARE_BITMAP(ll_flags, LL_SBI_NUM_FLAGS); /* enum ll_sbi_flags */ + unsigned int ll_xattr_cache_enabled:1, + ll_xattr_cache_set:1, /* already set to 0/1 */ + ll_client_common_fill_super_succeeded:1, + ll_checksum_set:1, + ll_inode_cache_enabled:1; + + struct lustre_client_ocd ll_lco; + + struct lprocfs_stats *ll_stats; /* lprocfs stats counter */ + + /* Used to track "unstable" pages on a client, and maintain a + * LRU list of clean pages. An "unstable" page is defined as + * any page which is sent to a server as part of a bulk request, + * but is uncommitted to stable storage. */ + struct cl_client_cache *ll_cache; + + struct lprocfs_stats *ll_ra_stats; + + struct ll_ra_info ll_ra_info; + unsigned int ll_namelen; + const struct file_operations *ll_fop; + + struct lu_site *ll_site; + struct cl_device *ll_cl; + + /* Statistics */ + struct ll_rw_extents_info *ll_rw_extents_info; + int ll_extent_process_count; + unsigned int ll_offset_process_count; + struct ll_rw_process_info *ll_rw_process_info; + struct ll_rw_process_info *ll_rw_offset_info; + ktime_t ll_process_stats_init; + unsigned int ll_rw_offset_entry_count; + int ll_stats_track_id; + enum stats_track_type ll_stats_track_type; + int ll_rw_stats_on; + + /* metadata stat-ahead */ + unsigned int ll_sa_running_max;/* max concurrent + * statahead instances */ + unsigned int ll_sa_max; /* max statahead RPCs */ + atomic_t ll_sa_total; /* statahead thread started + * count */ + atomic_t ll_sa_wrong; /* statahead thread stopped for + * low hit ratio */ + atomic_t ll_sa_running; /* running statahead thread + * count */ + atomic_t ll_agl_total; /* AGL thread started count */ + + dev_t ll_sdev_orig; /* save s_dev before assign for + * clustred nfs */ + /* root squash */ + struct root_squash_info ll_squash; + struct path ll_mnt; + + /* st_blksize returned by stat(2), when non-zero */ + unsigned int ll_stat_blksize; + + /* maximum relative age of cached statfs results */ + unsigned int ll_statfs_max_age; + + /* + * seconds after which negative dentries should be invalidated. + * -1 disables invalidation of negative entries based on timeout + * 0 always triggers serverside validation + */ + int ll_neg_dentry_timeout; + + /* + * MDLL directory restore retry count + * This would determine the number of times the restore would be + * retried before returning error to the client. The retry would + * be based on the released bit of the directory. + * A value of -1 would retry indefinitely. 
+ */ +#define LL_MDLL_DIR_RESTORE_DEF_RETRY_COUNT 1 + atomic_t ll_dir_restore_max_retry_count; + + struct kset ll_kset; /* sysfs object */ + struct completion ll_kobj_unregister; + + /* File heat */ + unsigned int ll_heat_decay_weight; + unsigned int ll_heat_period_second; + + /* Opens of the same inode before we start requesting open lock */ + u32 ll_oc_thrsh_count; + + /* Time in ms between last inode close and next open to be considered + * instant back to back and would trigger an open lock request + */ + u32 ll_oc_thrsh_ms; + + /* Time in ms after last file close that we no longer count prior opens*/ + u32 ll_oc_max_ms; + + /* filesystem fsname */ + char ll_fsname[LUSTRE_MAXFSNAME + 1]; + + /* Persistent Client Cache */ + struct pcc_super ll_pcc_super; + + /* to protect vs updates in all following foreign symlink fields */ + struct rw_semaphore ll_foreign_symlink_sem; + /* foreign symlink path prefix */ + char *ll_foreign_symlink_prefix; + /* full prefix size including leading '\0' */ + size_t ll_foreign_symlink_prefix_size; + /* foreign symlink path upcall */ + char *ll_foreign_symlink_upcall; + /* foreign symlink path upcall infos */ + struct ll_foreign_symlink_upcall_item *ll_foreign_symlink_upcall_items; + /* foreign symlink path upcall nb infos */ + unsigned int ll_foreign_symlink_upcall_nb_items; + + /* cached file security context xattr name. e.g: security.selinux */ + char *ll_secctx_name; + __u32 ll_secctx_name_size; +}; + +#define SBI_DEFAULT_HEAT_DECAY_WEIGHT ((80 * 256 + 50) / 100) +#define SBI_DEFAULT_HEAT_PERIOD_SECOND (60) + +#define SBI_DEFAULT_OPENCACHE_THRESHOLD_COUNT (5) +#define SBI_DEFAULT_OPENCACHE_THRESHOLD_MS (100) /* 0.1 second */ +#define SBI_DEFAULT_OPENCACHE_THRESHOLD_MAX_MS (60000) /* 1 minute */ + +/* + * per file-descriptor read-ahead data. + */ +struct ll_readahead_state { + spinlock_t ras_lock; + /* End byte that read(2) try to read. */ + loff_t ras_last_read_end_bytes; + /* + * number of bytes read after last read-ahead window reset. As window + * is reset on each seek, this is effectively a number of consecutive + * accesses. Maybe ->ras_accessed_in_window is better name. + * + * XXX nikita: window is also reset (by ras_update()) when Lustre + * believes that memory pressure evicts read-ahead pages. In that + * case, it probably doesn't make sense to expand window to + * PTLRPC_MAX_BRW_PAGES on the third access. + */ + loff_t ras_consecutive_bytes; + /* + * number of read requests after the last read-ahead window reset + * As window is reset on each seek, this is effectively the number + * on consecutive read request and is used to trigger read-ahead. + */ + unsigned long ras_consecutive_requests; + /* + * Parameters of current read-ahead window. Handled by + * ras_update(). On the initial access to the file or after a seek, + * window is reset to 0. After 3 consecutive accesses, window is + * expanded to PTLRPC_MAX_BRW_PAGES. Afterwards, window is enlarged by + * PTLRPC_MAX_BRW_PAGES chunks up to ->ra_max_pages. + */ + pgoff_t ras_window_start_idx; + pgoff_t ras_window_pages; + + /* Page index where min range read starts */ + pgoff_t ras_range_min_start_idx; + /* Page index where mmap range read ends */ + pgoff_t ras_range_max_end_idx; + /* number of mmap pages where last time detected */ + pgoff_t ras_last_range_pages; + /* number of mmap range requests */ + pgoff_t ras_range_requests; + + /* + * Optimal RPC size in pages. + * It decides how many pages will be sent for each read-ahead. 
+ */ + unsigned long ras_rpc_pages; + /* + * Where next read-ahead should start at. This lies within read-ahead + * window. Read-ahead window is read in pieces rather than at once + * because: 1. lustre limits total number of pages under read-ahead by + * ->ra_max_pages (see ll_ra_count_get()), 2. client cannot read pages + * not covered by DLM lock. + */ + pgoff_t ras_next_readahead_idx; + /* + * Total number of ll_file_read requests issued, reads originating + * due to mmap are not counted in this total. This value is used to + * trigger full file read-ahead after multiple reads to a small file. + */ + unsigned long ras_requests; + /* + * The following 3 items are used for detecting the stride I/O + * mode. + * In stride I/O mode, + * ...............|-----data-----|****gap*****|--------|******|.... + * offset |-stride_bytes-|-stride_gap-| + * ras_stride_offset = offset; + * ras_stride_length = stride_bytes + stride_gap; + * ras_stride_bytes = stride_bytes; + * Note: all these three items are counted by bytes. + */ + loff_t ras_stride_offset; + loff_t ras_stride_length; + loff_t ras_stride_bytes; + /* + * number of consecutive stride request count, and it is similar as + * ras_consecutive_requests, but used for stride I/O mode. + * Note: only more than 2 consecutive stride request are detected, + * stride read-ahead will be enable + */ + unsigned long ras_consecutive_stride_requests; + /* index of the last page that async readahead starts */ + pgoff_t ras_async_last_readpage_idx; + /* whether we should increase readahead window */ + bool ras_need_increase_window; + /* whether ra miss check should be skipped */ + bool ras_no_miss_check; +}; + +struct ll_readahead_work { + /** File to readahead */ + struct file *lrw_file; + pgoff_t lrw_start_idx; + pgoff_t lrw_end_idx; + pid_t lrw_user_pid; + + /* async worker to handler read */ + struct work_struct lrw_readahead_work; + char lrw_jobid[LUSTRE_JOBID_SIZE]; +}; + +extern struct kmem_cache *ll_file_data_slab; +struct lustre_handle; +struct ll_file_data { + struct ll_readahead_state fd_ras; + struct ll_grouplock fd_grouplock; + __u64 lfd_pos; + __u32 fd_flags; + fmode_t fd_omode; + /* openhandle if lease exists for this file. + * Borrow lli->lli_och_mutex to protect assignment */ + struct obd_client_handle *fd_lease_och; + struct obd_client_handle *fd_och; + struct file *fd_file; + /* Indicate whether need to report failure when close. + * true: failure is known, not report again. + * false: unknown failure, should report. */ + bool fd_write_failed; + bool ll_lock_no_expand; + /* Used by mirrored file to lead IOs to a specific mirror, usually + * for mirror resync. 0 means default. */ + __u32 fd_designated_mirror; + /* The layout version when resync starts. Resync I/O should carry this + * layout version for verification to OST objects */ + __u32 fd_layout_version; + struct pcc_file fd_pcc_file; + /* striped directory may read partially if some stripe inaccessible, + * -errno is saved here, and will return to user in close(). 
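 */

The three ras_stride_* fields above fully describe the pattern in the stride diagram. The sketch below is not the client's detection logic (ras_update() handles that); it only shows the geometry those fields encode — deciding whether a file offset lands in the data segment or the gap of its stride period. Names mirror ll_readahead_state but the code is a standalone assumption-level model:

/* stride_demo.c - model of the stride-pattern bookkeeping. */
#include <stdbool.h>
#include <stdio.h>

struct stride {
	long long stride_offset;	/* byte where the pattern starts */
	long long stride_length;	/* data + gap: one full period */
	long long stride_bytes;		/* data bytes per period */
};

/* an offset is "stride data" when it falls inside the data segment
 * of its period */
static bool offset_in_stride_data(const struct stride *s, long long off)
{
	if (off < s->stride_offset)
		return false;
	return (off - s->stride_offset) % s->stride_length < s->stride_bytes;
}

int main(void)
{
	/* 1 MiB of data every 4 MiB, starting at offset 0 */
	struct stride s = { 0, 4 << 20, 1 << 20 };

	printf("%d %d\n",
	       offset_in_stride_data(&s, 512 << 10),	/* in data: 1 */
	       offset_in_stride_data(&s, 2 << 20));	/* in gap:  0 */
	return 0;
}
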
+ */ + int fd_partial_readdir_rc; +}; + +void llite_tunables_unregister(void); +int llite_tunables_register(void); + +static inline struct inode *ll_info2i(struct ll_inode_info *lli) +{ + return &lli->lli_vfs_inode; +} + +__u32 ll_i2suppgid(struct inode *i); +void ll_i2gids(__u32 *suppgids, struct inode *i1,struct inode *i2); + +static inline int ll_need_32bit_api(struct ll_sb_info *sbi) +{ +#if BITS_PER_LONG == 32 + return 1; +#elif defined(CONFIG_COMPAT) + if (unlikely(test_bit(LL_SBI_32BIT_API, sbi->ll_flags))) + return true; + +# ifdef CONFIG_X86_X32 + /* in_compat_syscall() returns true when called from a kthread + * and CONFIG_X86_X32 is enabled, which is wrong. So check + * whether the caller comes from a syscall (ie. not a kthread) + * before calling in_compat_syscall(). */ + if (current->flags & PF_KTHREAD) + return false; +# endif + + return unlikely(in_compat_syscall()); +#else + return unlikely(test_bit(LL_SBI_32BIT_API, sbi->ll_flags)); +#endif +} + +static inline bool ll_sbi_has_fast_read(struct ll_sb_info *sbi) +{ + return test_bit(LL_SBI_FAST_READ, sbi->ll_flags); +} + +static inline bool ll_sbi_has_tiny_write(struct ll_sb_info *sbi) +{ + return test_bit(LL_SBI_TINY_WRITE, sbi->ll_flags); +} + +static inline bool ll_sbi_has_file_heat(struct ll_sb_info *sbi) +{ + return test_bit(LL_SBI_FILE_HEAT, sbi->ll_flags); +} + +static inline bool ll_sbi_has_foreign_symlink(struct ll_sb_info *sbi) +{ + return test_bit(LL_SBI_FOREIGN_SYMLINK, sbi->ll_flags); +} + +static inline bool ll_sbi_has_parallel_dio(struct ll_sb_info *sbi) +{ + return test_bit(LL_SBI_PARALLEL_DIO, sbi->ll_flags); +} + +void ll_ras_enter(struct file *f, loff_t pos, size_t count); + +/* llite/lcommon_misc.c */ +int cl_ocd_update(struct obd_device *host, struct obd_device *watched, + enum obd_notify_event ev, void *owner); +int cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock, + struct ll_grouplock *lg); +void cl_put_grouplock(struct ll_grouplock *lg); + +/* llite/lproc_llite.c */ +int ll_debugfs_register_super(struct super_block *sb, const char *name); +void ll_debugfs_unregister_super(struct super_block *sb); +void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, long count); +void ll_free_rw_stats_info(struct ll_sb_info *sbi); + +enum { + LPROC_LL_READ_BYTES, + LPROC_LL_WRITE_BYTES, + LPROC_LL_READ, + LPROC_LL_WRITE, + LPROC_LL_IOCTL, + LPROC_LL_OPEN, + LPROC_LL_RELEASE, + LPROC_LL_MMAP, + LPROC_LL_FAULT, + LPROC_LL_MKWRITE, + LPROC_LL_LLSEEK, + LPROC_LL_FSYNC, + LPROC_LL_READDIR, + LPROC_LL_SETATTR, + LPROC_LL_TRUNC, + LPROC_LL_FLOCK, + LPROC_LL_GETATTR, + LPROC_LL_CREATE, + LPROC_LL_LINK, + LPROC_LL_UNLINK, + LPROC_LL_SYMLINK, + LPROC_LL_MKDIR, + LPROC_LL_RMDIR, + LPROC_LL_MKNOD, + LPROC_LL_RENAME, + LPROC_LL_STATFS, + LPROC_LL_SETXATTR, + LPROC_LL_GETXATTR, + LPROC_LL_GETXATTR_HITS, + LPROC_LL_LISTXATTR, + LPROC_LL_REMOVEXATTR, + LPROC_LL_INODE_PERM, + LPROC_LL_FALLOCATE, + LPROC_LL_INODE_OCOUNT, + LPROC_LL_INODE_OPCLTM, + LPROC_LL_FILE_OPCODES +}; + +/* llite/dir.c */ +enum get_default_layout_type { + GET_DEFAULT_LAYOUT_ROOT = 1, +}; + +extern const struct file_operations ll_dir_operations; +extern const struct inode_operations ll_dir_inode_operations; +#ifdef HAVE_DIR_CONTEXT +int ll_dir_read(struct inode *inode, __u64 *pos, struct md_op_data *op_data, + struct dir_context *ctx, int *partial_readdir_rc); +#else +int ll_dir_read(struct inode *inode, __u64 *pos, struct md_op_data *op_data, + void *cookie, filldir_t filldir, int *partial_readdir_rc); +#endif +int ll_get_mdt_idx(struct 
inode *inode); +int ll_get_mdt_idx_by_fid(struct ll_sb_info *sbi, const struct lu_fid *fid); +struct page *ll_get_dir_page(struct inode *dir, struct md_op_data *op_data, + __u64 offset, int *partial_readdir_rc); +void ll_release_page(struct inode *inode, struct page *page, bool remove); +int quotactl_ioctl(struct super_block *sb, struct if_quotactl *qctl); + +/* llite/namei.c */ +extern const struct inode_operations ll_special_inode_operations; + +struct inode *ll_iget(struct super_block *sb, ino_t hash, + struct lustre_md *lic); +int ll_test_inode_by_fid(struct inode *inode, void *opaque); +int ll_md_blocking_ast(struct ldlm_lock *, struct ldlm_lock_desc *, + void *data, int flag); +struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de); +int ll_rmdir_entry(struct inode *dir, char *name, int namelen); +void ll_update_times(struct ptlrpc_request *request, struct inode *inode); + +/* llite/rw.c */ +int ll_writepage(struct page *page, struct writeback_control *wbc); +int ll_writepages(struct address_space *, struct writeback_control *wbc); +int ll_readpage(struct file *file, struct page *page); +#ifdef HAVE_AOPS_READ_FOLIO +int ll_read_folio(struct file *file, struct folio *folio); +#endif +int ll_io_read_page(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, struct file *file); +void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras); +int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io); + +enum lcc_type; +void ll_cl_add(struct inode *inode, const struct lu_env *env, struct cl_io *io, + enum lcc_type type); +void ll_cl_remove(struct inode *inode, const struct lu_env *env); +struct ll_cl_context *ll_cl_find(struct inode *inode); + +extern const struct address_space_operations ll_aops; + +/* llite/file.c */ +extern const struct inode_operations ll_file_inode_operations; +const struct file_operations *ll_select_file_operations(struct ll_sb_info *sbi); +extern int ll_have_md_lock(struct inode *inode, __u64 *bits, + enum ldlm_mode l_req_mode); +extern enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits, + struct lustre_handle *lockh, __u64 flags, + enum ldlm_mode mode); + +int ll_file_open(struct inode *inode, struct file *file); +int ll_file_release(struct inode *inode, struct file *file); +int ll_release_openhandle(struct dentry *, struct lookup_intent *); +int ll_md_real_close(struct inode *inode, fmode_t fmode); +void ll_track_file_opens(struct inode *inode); +extern void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, + struct ll_file_data *file, loff_t pos, + size_t count, int rw); +#if defined(HAVE_USER_NAMESPACE_ARG) || defined(HAVE_INODEOPS_ENHANCED_GETATTR) +int ll_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags); +#else +int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat); +#endif /* HAVE_USER_NAMESPACE_ARG */ +int ll_getattr_dentry(struct dentry *de, struct kstat *stat, u32 request_mask, + unsigned int flags, bool foreign); +#ifdef CONFIG_LUSTRE_FS_POSIX_ACL +struct posix_acl *ll_get_acl( + #ifdef HAVE_ACL_WITH_DENTRY + struct user_namespace *, struct dentry *, int); + #elif defined HAVE_GET_ACL_RCU_ARG + struct inode *inode, int type, bool rcu); + #else + struct inode *inode, int type); + #endif /* HAVE_GET_ACL_RCU_ARG */ + +int ll_set_acl(struct user_namespace *mnt_userns, + #ifdef HAVE_ACL_WITH_DENTRY + struct dentry *dentry, + #else + struct inode *inode, + #endif + struct posix_acl *acl, int 
type); +#else /* !CONFIG_LUSTRE_FS_POSIX_ACL */ +#define ll_get_acl NULL +#define ll_set_acl NULL +#endif /* CONFIG_LUSTRE_FS_POSIX_ACL */ + +static inline int ll_xflags_to_inode_flags(int xflags) +{ + return ((xflags & FS_XFLAG_SYNC) ? S_SYNC : 0) | + ((xflags & FS_XFLAG_NOATIME) ? S_NOATIME : 0) | + ((xflags & FS_XFLAG_APPEND) ? S_APPEND : 0) | + ((xflags & FS_XFLAG_IMMUTABLE) ? S_IMMUTABLE : 0); +} + +static inline int ll_inode_flags_to_xflags(int inode_flags) +{ + return ((inode_flags & S_SYNC) ? FS_XFLAG_SYNC : 0) | + ((inode_flags & S_NOATIME) ? FS_XFLAG_NOATIME : 0) | + ((inode_flags & S_APPEND) ? FS_XFLAG_APPEND : 0) | + ((inode_flags & S_IMMUTABLE) ? FS_XFLAG_IMMUTABLE : 0); +} + +int ll_migrate(struct inode *parent, struct file *file, + struct lmv_user_md *lum, const char *name, __u32 flags); +int ll_get_fid_by_name(struct inode *parent, const char *name, + int namelen, struct lu_fid *fid, struct inode **inode); +int ll_inode_permission(struct user_namespace *mnt_userns, struct inode *inode, + int mask); +int ll_ioctl_check_project(struct inode *inode, __u32 xflags, __u32 projid); +int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd, + unsigned long arg); +int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd, + unsigned long arg); +int ll_ioctl_project(struct file *file, unsigned int cmd, + unsigned long arg); + +int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry, + __u64 flags, struct lov_user_md *lum, + int lum_size); +int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, + struct lov_mds_md **lmm, int *lmm_size, + struct ptlrpc_request **request); +int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, + int set_default); +int ll_dir_get_default_layout(struct inode *inode, void **plmm, int *plmm_size, + struct ptlrpc_request **request, u64 valid, + enum get_default_layout_type type); +int ll_dir_getstripe_default(struct inode *inode, void **lmmp, + int *lmm_size, struct ptlrpc_request **request, + struct ptlrpc_request **root_request, u64 valid); +int ll_dir_getstripe(struct inode *inode, void **plmm, int *plmm_size, + struct ptlrpc_request **request, u64 valid); +int ll_fsync(struct file *file, loff_t start, loff_t end, int data); +int ll_merge_attr(const struct lu_env *env, struct inode *inode); +int ll_fid2path(struct inode *inode, void __user *arg); +int ll_data_version(struct inode *inode, __u64 *data_version, int flags); +int ll_hsm_release(struct inode *inode); +int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss); +void ll_io_set_mirror(struct cl_io *io, const struct file *file); +int ll_hsm_import(struct inode *inode, struct file *file, + struct hsm_user_import *hui); + +/* llite/dcache.c */ + +extern const struct dentry_operations ll_d_ops; +#ifndef HAVE_D_INIT +bool ll_d_setup(struct dentry *de, bool do_put); + +static inline bool lld_is_init(struct dentry *dentry) +{ + return ll_d2d(dentry); +} +#else +#define ll_d_setup(de, do_put) (true) +#define lld_is_init(dentry) (true) +#endif + +void ll_intent_drop_lock(struct lookup_intent *); +void ll_intent_release(struct lookup_intent *); +void ll_prune_aliases(struct inode *inode); +void ll_lookup_finish_locks(struct lookup_intent *it, struct dentry *dentry); +int ll_revalidate_it_finish(struct ptlrpc_request *request, + struct lookup_intent *it, struct dentry *de); + +/* llite/llite_lib.c */ +extern const struct super_operations lustre_super_operations; + +void ll_lli_init(struct ll_inode_info *lli); +int ll_fill_super(struct 
super_block *sb); +void ll_put_super(struct super_block *sb); +void ll_kill_super(struct super_block *sb); +struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock); +void ll_dir_clear_lsm_md(struct inode *inode); +void ll_clear_inode(struct inode *inode); +int volatile_ref_file(const char *volatile_name, int volatile_len, + struct file **ref_file); +int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, + enum op_xvalid xvalid, bool hsm_import); +int ll_setattr(struct user_namespace *mnt_userns, struct dentry *de, + struct iattr *attr); +int ll_statfs(struct dentry *de, struct kstatfs *sfs); +int ll_statfs_internal(struct ll_sb_info *sbi, struct obd_statfs *osfs, + u32 flags); +int ll_update_inode(struct inode *inode, struct lustre_md *md); +void ll_update_inode_flags(struct inode *inode, unsigned int ext_flags); +void ll_update_dir_depth(struct inode *dir, struct inode *inode); +int ll_read_inode2(struct inode *inode, void *opaque); +void ll_truncate_inode_pages_final(struct inode *inode); +void ll_delete_inode(struct inode *inode); +int ll_iocontrol(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg); +int ll_flush_ctx(struct inode *inode); +void ll_umount_begin(struct super_block *sb); +int ll_remount_fs(struct super_block *sb, int *flags, char *data); +int ll_show_options(struct seq_file *seq, struct dentry *dentry); +void ll_dirty_page_discard_warn(struct inode *inode, int ioret); +int ll_prep_inode(struct inode **inode, struct req_capsule *pill, + struct super_block *sb, struct lookup_intent *it); +int ll_obd_statfs(struct inode *inode, void __user *arg); +int ll_get_max_mdsize(struct ll_sb_info *sbi, int *max_mdsize); +int ll_get_default_mdsize(struct ll_sb_info *sbi, int *default_mdsize); +int ll_set_default_mdsize(struct ll_sb_info *sbi, int default_mdsize); + +void ll_unlock_md_op_lsm(struct md_op_data *op_data); +struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, + struct inode *i1, struct inode *i2, + const char *name, size_t namelen, + __u32 mode, enum md_op_code opc, + void *data); +void ll_finish_md_op_data(struct md_op_data *op_data); +int ll_get_obd_name(struct inode *inode, unsigned int cmd, unsigned long arg); +void ll_compute_rootsquash_state(struct ll_sb_info *sbi); +ssize_t ll_copy_user_md(const struct lov_user_md __user *md, + struct lov_user_md **kbuf); +void ll_open_cleanup(struct super_block *sb, struct req_capsule *pill); + +void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req); + +/* Compute expected user md size when passing in a md from user space */ +static inline ssize_t ll_lov_user_md_size(const struct lov_user_md *lum) +{ + switch (lum->lmm_magic) { + case LOV_USER_MAGIC_V1: + return sizeof(struct lov_user_md_v1); + case LOV_USER_MAGIC_V3: + return sizeof(struct lov_user_md_v3); + case LOV_USER_MAGIC_SPECIFIC: + if (lum->lmm_stripe_count > LOV_MAX_STRIPE_COUNT) + return -EINVAL; + + return lov_user_md_size(lum->lmm_stripe_count, + LOV_USER_MAGIC_SPECIFIC); + case LOV_USER_MAGIC_COMP_V1: + return ((struct lov_comp_md_v1 *)lum)->lcm_size; + case LOV_USER_MAGIC_FOREIGN: + return foreign_size(lum); + } + + return -EINVAL; +} + +/* llite/llite_nfs.c */ +extern const struct export_operations lustre_export_operations; +__u32 get_uuid2int(const char *name, int len); +struct inode *search_inode_for_lustre(struct super_block *sb, + const struct lu_fid *fid); +int ll_dir_get_parent_fid(struct inode *dir, struct lu_fid *parent_fid); + +/* llite/symlink.c */ +extern const struct 
inode_operations ll_fast_symlink_inode_operations; + +/** + * IO arguments for various VFS I/O interfaces. + */ +struct vvp_io_args { + /** normal/sendfile/splice */ + union { + struct { + struct kiocb *via_iocb; + struct iov_iter *via_iter; + } normal; + } u; +}; + +enum lcc_type { + LCC_RW = 1, + LCC_MMAP +}; + +struct ll_cl_context { + struct list_head lcc_list; + void *lcc_cookie; + const struct lu_env *lcc_env; + struct cl_io *lcc_io; + struct cl_page *lcc_page; + enum lcc_type lcc_type; + pgoff_t lcc_end_index; +}; + +struct ll_thread_info { + struct vvp_io_args lti_args; + struct ra_io_arg lti_ria; + struct ll_cl_context lti_io_ctx; +}; + +extern struct lu_context_key ll_thread_key; + +static inline struct ll_thread_info *ll_env_info(const struct lu_env *env) +{ + struct ll_thread_info *lti; + + lti = lu_context_key_get(&env->le_ctx, &ll_thread_key); + LASSERT(lti != NULL); + + return lti; +} + +static inline struct vvp_io_args *ll_env_args(const struct lu_env *env) +{ + return &ll_env_info(env)->lti_args; +} + +void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot, + struct vvp_io_args *args); + +/* llite/llite_mmap.c */ + +int ll_file_mmap(struct file * file, struct vm_area_struct * vma); +void policy_from_vma(union ldlm_policy_data *policy, struct vm_area_struct *vma, + unsigned long addr, size_t count); +struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr, + size_t count); + +#define ll_s2sbi(sb) (s2lsi(sb)->lsi_llsbi) + +/* don't need an addref as the sb_info should be holding one */ +static inline struct obd_export *ll_s2dtexp(struct super_block *sb) +{ + return ll_s2sbi(sb)->ll_dt_exp; +} + +/* don't need an addref as the sb_info should be holding one */ +static inline struct obd_export *ll_s2mdexp(struct super_block *sb) +{ + return ll_s2sbi(sb)->ll_md_exp; +} + +static inline struct client_obd *sbi2mdc(struct ll_sb_info *sbi) +{ + struct obd_device *obd = sbi->ll_md_exp->exp_obd; + if (obd == NULL) + LBUG(); + return &obd->u.cli; +} + +// FIXME: replace the name of this with LL_SB to conform to kernel stuff +static inline struct ll_sb_info *ll_i2sbi(struct inode *inode) +{ + return ll_s2sbi(inode->i_sb); +} + +static inline struct obd_export *ll_i2dtexp(struct inode *inode) +{ + return ll_s2dtexp(inode->i_sb); +} + +static inline struct obd_export *ll_i2mdexp(struct inode *inode) +{ + return ll_s2mdexp(inode->i_sb); +} + +static inline struct lu_fid *ll_inode2fid(struct inode *inode) +{ + struct lu_fid *fid; + + LASSERT(inode != NULL); + fid = &ll_i2info(inode)->lli_fid; + + return fid; +} + +static inline bool ll_dir_striped(struct inode *inode) +{ + LASSERT(inode); + return S_ISDIR(inode->i_mode) && + lmv_dir_striped(ll_i2info(inode)->lli_lsm_md); +} + +static inline loff_t ll_file_maxbytes(struct inode *inode) +{ + struct cl_object *obj = ll_i2info(inode)->lli_clob; + + if (obj == NULL) + return MAX_LFS_FILESIZE; + + return min_t(loff_t, cl_object_maxbytes(obj), MAX_LFS_FILESIZE); +} + +/* llite/xattr.c */ +extern const struct xattr_handler *ll_xattr_handlers[]; + +#define XATTR_USER_T 1 +#define XATTR_TRUSTED_T 2 +#define XATTR_SECURITY_T 3 +#define XATTR_ACL_ACCESS_T 4 +#define XATTR_ACL_DEFAULT_T 5 +#define XATTR_LUSTRE_T 6 +#define XATTR_OTHER_T 7 +#define XATTR_ENCRYPTION_T 9 + +ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size); +int ll_xattr_list(struct inode *inode, const char *name, int type, + void *buffer, size_t size, u64 valid); +const struct xattr_handler *get_xattr_type(const char *name); +int 
ll_get_hsm_state(struct inode *inode, u32 *hus_states); + +/** + * Common IO arguments for various VFS I/O interfaces. + */ +int cl_sb_init(struct super_block *sb); +int cl_sb_fini(struct super_block *sb); + +enum ras_update_flags { + LL_RAS_HIT = 0x1, + LL_RAS_MMAP = 0x2 +}; +void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len); +void ll_ra_stats_inc(struct inode *inode, enum ra_stat which); + +/* statahead.c */ + +#define LL_SA_RPC_MIN 2 +#define LL_SA_RPC_DEF 32 +#define LL_SA_RPC_MAX 512 + +/* XXX: If you want to support more concurrent statahead instances, + * please consider decentralizing the RPC lists attached + * to the related import, such as imp_{sending,delayed}_list. + * LU-11079 */ +#define LL_SA_RUNNING_MAX 256 +#define LL_SA_RUNNING_DEF 16 + +#define LL_SA_CACHE_BIT 5 +#define LL_SA_CACHE_SIZE (1 << LL_SA_CACHE_BIT) +#define LL_SA_CACHE_MASK (LL_SA_CACHE_SIZE - 1) + +/* per inode struct, for dir only */ +struct ll_statahead_info { + struct dentry *sai_dentry; + atomic_t sai_refcount; /* hold a refcount when + * accessing this struct */ + unsigned int sai_max; /* max ahead of lookup */ + __u64 sai_sent; /* stat requests sent count */ + __u64 sai_replied; /* stat requests which received + * a reply */ + __u64 sai_index; /* index of statahead entry */ + __u64 sai_index_wait; /* index of the entry the + * caller is waiting for */ + __u64 sai_hit; /* hit count */ + __u64 sai_miss; /* miss count: + * for "ls -al" case, includes + * hidden dentry miss; + * for "ls -l" case, it does not + * include hidden dentry miss. + * "sai_miss_hidden" is used for + * the latter case. + */ + unsigned int sai_consecutive_miss; /* consecutive miss */ + unsigned int sai_miss_hidden;/* "ls -al", but first dentry + * is not a hidden one */ + unsigned int sai_skip_hidden;/* skipped hidden dentry count + */ + unsigned int sai_ls_all:1, /* "ls -al", do stat-ahead for + * hidden entries */ + sai_in_readpage:1;/* statahead is in readdir()*/ + wait_queue_head_t sai_waitq; /* stat-ahead wait queue */ + struct task_struct *sai_task; /* stat-ahead thread */ + struct task_struct *sai_agl_task; /* AGL thread */ + struct list_head sai_interim_entries; /* entries which got an async + * stat reply, but were not + * instantiated */ + struct list_head sai_entries; /* completed entries */ + struct list_head sai_agls; /* AGLs to be sent */ + struct list_head sai_cache[LL_SA_CACHE_SIZE]; + spinlock_t sai_cache_lock[LL_SA_CACHE_SIZE]; + atomic_t sai_cache_count; /* entry count in cache */ +}; + +int ll_revalidate_statahead(struct inode *dir, struct dentry **dentry, + bool unplug); +int ll_start_statahead(struct inode *dir, struct dentry *dentry, bool agl); +void ll_authorize_statahead(struct inode *dir, void *key); +void ll_deauthorize_statahead(struct inode *dir, void *key); + +/* glimpse.c */ +blkcnt_t dirty_cnt(struct inode *inode); + +int cl_glimpse_size0(struct inode *inode, int agl); +int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io, + struct inode *inode, struct cl_object *clob, int agl); + +static inline int cl_glimpse_size(struct inode *inode) +{ + return cl_glimpse_size0(inode, 0); +} + +/* AGL is 'asynchronous glimpse lock', which is a speculative lock taken as + * part of statahead */ +static inline int cl_agl(struct inode *inode) +{ + return cl_glimpse_size0(inode, 1); +} + +int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise); + +int cl_io_get(struct inode *inode, struct lu_env **envout, + struct cl_io **ioout, __u16 *refcheck); + +static inline int ll_glimpse_size(struct
inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + int rc; + + down_read(&lli->lli_glimpse_sem); + rc = cl_glimpse_size(inode); + lli->lli_glimpse_time = ktime_get(); + up_read(&lli->lli_glimpse_sem); + return rc; +} + +/* A dentry may be statahead-ed when statahead is enabled, the current process + * has opened the parent directory, and this dentry hasn't accessed the + * statahead cache before */ +static inline bool +dentry_may_statahead(struct inode *dir, struct dentry *dentry) +{ + struct ll_inode_info *lli; + struct ll_dentry_data *ldd; + + if (ll_i2sbi(dir)->ll_sa_max == 0) + return false; + + lli = ll_i2info(dir); + + /* statahead is not allowed for this dir, there may be three causes: + * 1. dir is not opened. + * 2. statahead hit ratio is too low. + * 3. the statahead thread started by a previous stat failed. */ + if (!lli->lli_sa_enabled) + return false; + + /* not the same process, don't statahead */ + if (lli->lli_opendir_pid != current->pid) + return false; + + /* + * When stating a dentry, the kernel may trigger 'revalidate' or + * 'lookup' multiple times, e.g. for 'getattr' and 'getxattr'. + * For a patchless client, the lookup intent is not accurate, which may + * misguide statahead. For example: + * The 'revalidate' call for 'getattr' and 'getxattr' of a dentry will + * have the same intent -- IT_GETATTR, while one dentry should access + * the statahead cache only once, otherwise the statahead window is + * messed up. The solution is as follows: + * Assign 'lld_sa_generation' from 'lli_sa_generation' when a dentry is + * handled with IT_GETATTR for the first time, so that subsequent + * IT_GETATTR calls bypass the statahead cache by checking + * 'lld_sa_generation == lli->lli_sa_generation'. + */ + ldd = ll_d2d(dentry); + if (ldd != NULL && lli->lli_sa_generation && + ldd->lld_sa_generation == lli->lli_sa_generation) + return false; + + return true; +} + +int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end, + enum cl_fsync_mode mode, int ignore_layout); + +static inline int ll_file_nolock(const struct file *file) +{ + struct ll_file_data *fd = file->private_data; + struct inode *inode = file_inode((struct file *)file); + + LASSERT(fd != NULL); + return ((fd->fd_flags & LL_FILE_IGNORE_LOCK) || + test_bit(LL_SBI_NOLCK, ll_i2sbi(inode)->ll_flags)); +} + +static inline void ll_set_lock_data(struct obd_export *exp, struct inode *inode, + struct lookup_intent *it, __u64 *bits) +{ + if (!it->it_lock_set) { + struct lustre_handle handle; + + /* If this inode is a remote object, it will get two + * separate locks in different namespaces: the master MDT, + * where the name entry is, will grant the LOOKUP lock, + * while the remote MDT, where the object is, will grant the + * UPDATE|PERM lock.
The inode will be attached to both + * LOOKUP and PERM locks, so revoking either lock will + * cause the dcache to be cleared */ + if (it->it_remote_lock_mode) { + handle.cookie = it->it_remote_lock_handle; + CDEBUG(D_DLMTRACE, "setting l_data to inode "DFID + "(%p) for remote lock %#llx\n", + PFID(ll_inode2fid(inode)), inode, + handle.cookie); + md_set_lock_data(exp, &handle, inode, NULL); + } + + handle.cookie = it->it_lock_handle; + + CDEBUG(D_DLMTRACE, "setting l_data to inode "DFID"(%p)" + " for lock %#llx\n", + PFID(ll_inode2fid(inode)), inode, handle.cookie); + + md_set_lock_data(exp, &handle, inode, &it->it_lock_bits); + it->it_lock_set = 1; + } + + if (bits != NULL) + *bits = it->it_lock_bits; +} + +static inline int d_lustre_invalid(const struct dentry *dentry) +{ + return !ll_d2d(dentry) || ll_d2d(dentry)->lld_invalid; +} + +/* + * Mark dentry INVALID. If the dentry refcount is zero (this is normally the + * case for ll_md_blocking_ast), it will be pruned by ll_prune_aliases() and + * ll_prune_negative_children(); otherwise dput() of the last refcount will + * unhash this dentry and kill it. + */ +static inline void d_lustre_invalidate(struct dentry *dentry) +{ + struct ll_sb_info *sbi = ll_s2sbi(dentry->d_sb); + + CDEBUG(D_DENTRY, "invalidate dentry %pd (%p) parent %p inode %p refc %d\n", + dentry, dentry, + dentry->d_parent, dentry->d_inode, ll_d_count(dentry)); + + spin_lock(&dentry->d_lock); + if (lld_is_init(dentry)) { + if (sbi->ll_neg_dentry_timeout != OBD_NEG_CACHE_TIMEOUT_DEFAULT_SECS) + ll_d2d(dentry)->lld_neg_cache_timeout = + jiffies + sbi->ll_neg_dentry_timeout * HZ; + ll_d2d(dentry)->lld_invalid = 1; + } + spin_unlock(&dentry->d_lock); +} + +static inline void d_lustre_revalidate(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + LASSERT(ll_d2d(dentry)); + ll_d2d(dentry)->lld_invalid = 0; + spin_unlock(&dentry->d_lock); +} + +static inline dev_t ll_compat_encode_dev(dev_t dev) +{ + /* The compat_sys_*stat*() syscalls will fail unless the + * device majors and minors are both less than 256. Note that + * the value returned here will be passed through + * old_encode_dev() in cp_compat_stat(). And so we are not + * trying to return a valid compat (u16) device number, just + * one that will pass the old_valid_dev() check.
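+ * For example, a dev_t of MKDEV(259, 300) is squeezed to
+ * MKDEV(259 & 0xff, 300 & 0xff) = MKDEV(3, 44): not the real device
+ * number, but small enough to pass old_valid_dev().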
*/ + + return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff); +} + +int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf); +int ll_layout_refresh(struct inode *inode, __u32 *gen); +int ll_layout_restore(struct inode *inode, loff_t start, __u64 length); +int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc, + struct lu_extent *ext); + +int ll_xattr_init(void); +void ll_xattr_fini(void); + +int ll_page_sync_io(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, enum cl_req_type crt); + +int ll_getparent(struct file *file, struct getparent __user *arg); + +/* lcommon_cl.c */ +int cl_setattr_ost(struct cl_object *obj, const struct iattr *attr, + enum op_xvalid xvalid, unsigned int attr_flags); + +extern struct lu_env *cl_inode_fini_env; +extern __u16 cl_inode_fini_refcheck; + +int cl_file_inode_init(struct inode *inode, struct lustre_md *md); +void cl_inode_fini(struct inode *inode); + +u64 cl_fid_build_ino(const struct lu_fid *fid, int api32); +u32 cl_fid_build_gen(const struct lu_fid *fid); + +static inline struct pcc_super *ll_i2pccs(struct inode *inode) +{ + return &ll_i2sbi(inode)->ll_pcc_super; +} + +static inline struct pcc_super *ll_info2pccs(struct ll_inode_info *lli) +{ + return ll_i2pccs(ll_info2i(lli)); +} + +/* crypto.c */ +/* The digested form is made of a FID (16 bytes) followed by the second-to-last + * ciphertext block (16 bytes), so a total length of 32 bytes. + * That way, llcrypt does not compute a digested form of this digest. + */ +struct ll_digest_filename { + struct lu_fid ldf_fid; + char ldf_excerpt[LL_CRYPTO_BLOCK_SIZE]; +}; + +int ll_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct llcrypt_name *fname, + struct lu_fid *fid); +int ll_fname_disk_to_usr(struct inode *inode, + u32 hash, u32 minor_hash, + struct llcrypt_str *iname, struct llcrypt_str *oname, + struct lu_fid *fid); +int ll_revalidate_d_crypto(struct dentry *dentry, unsigned int flags); +int ll_file_open_encrypt(struct inode *inode, struct file *filp); +static inline char *xattr_for_enc(struct inode *inode) +{ + if (ll_sbi_has_name_encrypt(ll_i2sbi(inode))) + return LL_XATTR_NAME_ENCRYPTION_CONTEXT; + + return LL_XATTR_NAME_ENCRYPTION_CONTEXT_OLD; +} +#ifdef HAVE_LUSTRE_CRYPTO +extern const struct llcrypt_operations lustre_cryptops; +#endif + +/* llite/llite_foreign.c */ +int ll_manage_foreign(struct inode *inode, struct lustre_md *lmd); +bool ll_foreign_is_openable(struct dentry *dentry, unsigned int flags); +bool ll_foreign_is_removable(struct dentry *dentry, bool unset); + +int ll_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf); + +#endif /* LLITE_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_lib.c b/drivers/staging/lustrefsx/lustre/llite/llite_lib.c new file mode 100644 index 0000000000000..0623294d9f4c0 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/llite_lib.c @@ -0,0 +1,3909 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/llite/llite_lib.c + * + * Lustre Light Super operations + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef HAVE_CPUS_READ_LOCK +#include +#endif +#include +#include +#ifdef HAVE_UAPI_LINUX_MOUNT_H +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include "llite_internal.h" + +struct kmem_cache *ll_file_data_slab; + +#ifndef log2 +#define log2(n) ffz(~(n)) +#endif + +/** + * If there is only one number of core visible to Lustre, + * async readahead will be disabled, to avoid massive over + * subscription, we use 1/2 of active cores as default max + * async readahead requests. + */ +static inline unsigned int ll_get_ra_async_max_active(void) +{ + return cfs_cpt_weight(cfs_cpt_tab, CFS_CPT_ANY) >> 1; +} + +static struct ll_sb_info *ll_init_sbi(void) +{ + struct ll_sb_info *sbi = NULL; + unsigned long pages; + unsigned long lru_page_max; + struct sysinfo si; + int rc; + + ENTRY; + + OBD_ALLOC_PTR(sbi); + if (sbi == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + rc = pcc_super_init(&sbi->ll_pcc_super); + if (rc < 0) + GOTO(out_sbi, rc); + + spin_lock_init(&sbi->ll_lock); + mutex_init(&sbi->ll_lco.lco_lock); + spin_lock_init(&sbi->ll_pp_extent_lock); + spin_lock_init(&sbi->ll_process_lock); + sbi->ll_rw_stats_on = 0; + sbi->ll_statfs_max_age = OBD_STATFS_CACHE_SECONDS; + sbi->ll_neg_dentry_timeout = OBD_NEG_CACHE_TIMEOUT_DEFAULT_SECS; + + si_meminfo(&si); + pages = si.totalram - si.totalhigh; + lru_page_max = pages / 2; + + sbi->ll_ra_info.ra_async_max_active = ll_get_ra_async_max_active(); + sbi->ll_ra_info.ll_readahead_wq = + cfs_cpt_bind_workqueue("ll-readahead-wq", cfs_cpt_tab, + 0, CFS_CPT_ANY, + sbi->ll_ra_info.ra_async_max_active); + if (IS_ERR(sbi->ll_ra_info.ll_readahead_wq)) + GOTO(out_pcc, rc = PTR_ERR(sbi->ll_ra_info.ll_readahead_wq)); + + /* initialize ll_cache data */ + sbi->ll_cache = cl_cache_init(lru_page_max); + if (sbi->ll_cache == NULL) + GOTO(out_destroy_ra, rc = -ENOMEM); + + /* initialize foreign symlink prefix path */ + OBD_ALLOC(sbi->ll_foreign_symlink_prefix, sizeof("/mnt/")); + if (sbi->ll_foreign_symlink_prefix == NULL) + GOTO(out_destroy_ra, rc = -ENOMEM); + memcpy(sbi->ll_foreign_symlink_prefix, "/mnt/", sizeof("/mnt/")); + sbi->ll_foreign_symlink_prefix_size = sizeof("/mnt/"); + + /* initialize foreign symlink upcall path, none by default */ + OBD_ALLOC(sbi->ll_foreign_symlink_upcall, sizeof("none")); + if (sbi->ll_foreign_symlink_upcall == NULL) + GOTO(out_destroy_ra, rc = -ENOMEM); + memcpy(sbi->ll_foreign_symlink_upcall, "none", sizeof("none")); + sbi->ll_foreign_symlink_upcall_items = NULL; + sbi->ll_foreign_symlink_upcall_nb_items = 0; + init_rwsem(&sbi->ll_foreign_symlink_sem); + /* foreign symlink support (LL_SBI_FOREIGN_SYMLINK in ll_flags) + * not enabled by default + */ + + sbi->ll_secctx_name 
= NULL; + sbi->ll_secctx_name_size = 0; + + sbi->ll_ra_info.ra_max_pages = + min(pages / 32, SBI_DEFAULT_READ_AHEAD_MAX); + sbi->ll_ra_info.ra_max_pages_per_file = + min(sbi->ll_ra_info.ra_max_pages / 4, + SBI_DEFAULT_READ_AHEAD_PER_FILE_MAX); + sbi->ll_ra_info.ra_async_pages_per_file_threshold = + sbi->ll_ra_info.ra_max_pages_per_file; + sbi->ll_ra_info.ra_range_pages = SBI_DEFAULT_RA_RANGE_PAGES; + sbi->ll_ra_info.ra_max_read_ahead_whole_pages = -1; + atomic_set(&sbi->ll_ra_info.ra_async_inflight, 0); + + set_bit(LL_SBI_VERBOSE, sbi->ll_flags); +#ifdef ENABLE_CHECKSUM + set_bit(LL_SBI_CHECKSUM, sbi->ll_flags); +#endif +#ifdef ENABLE_FLOCK + set_bit(LL_SBI_FLOCK, sbi->ll_flags); +#endif + +#ifdef HAVE_LRU_RESIZE_SUPPORT + set_bit(LL_SBI_LRU_RESIZE, sbi->ll_flags); +#endif + set_bit(LL_SBI_LAZYSTATFS, sbi->ll_flags); + + /* metadata statahead is enabled by default */ + sbi->ll_sa_running_max = LL_SA_RUNNING_DEF; + sbi->ll_sa_max = LL_SA_RPC_DEF; + atomic_set(&sbi->ll_sa_total, 0); + atomic_set(&sbi->ll_sa_wrong, 0); + atomic_set(&sbi->ll_sa_running, 0); + atomic_set(&sbi->ll_agl_total, 0); + set_bit(LL_SBI_AGL_ENABLED, sbi->ll_flags); + set_bit(LL_SBI_FAST_READ, sbi->ll_flags); + set_bit(LL_SBI_TINY_WRITE, sbi->ll_flags); + set_bit(LL_SBI_PARALLEL_DIO, sbi->ll_flags); + ll_sbi_set_encrypt(sbi, true); + ll_sbi_set_name_encrypt(sbi, true); + + /* root squash */ + sbi->ll_squash.rsi_uid = 0; + sbi->ll_squash.rsi_gid = 0; + INIT_LIST_HEAD(&sbi->ll_squash.rsi_nosquash_nids); + spin_lock_init(&sbi->ll_squash.rsi_lock); + + /* Per-filesystem file heat */ + sbi->ll_heat_decay_weight = SBI_DEFAULT_HEAT_DECAY_WEIGHT; + sbi->ll_heat_period_second = SBI_DEFAULT_HEAT_PERIOD_SECOND; + + /* Per-fs open heat level before requesting open lock */ + sbi->ll_oc_thrsh_count = SBI_DEFAULT_OPENCACHE_THRESHOLD_COUNT; + sbi->ll_oc_max_ms = SBI_DEFAULT_OPENCACHE_THRESHOLD_MAX_MS; + sbi->ll_oc_thrsh_ms = SBI_DEFAULT_OPENCACHE_THRESHOLD_MS; + + /* MDLL */ + atomic_set(&sbi->ll_dir_restore_max_retry_count, + LL_MDLL_DIR_RESTORE_DEF_RETRY_COUNT); + + RETURN(sbi); +out_destroy_ra: + if (sbi->ll_foreign_symlink_prefix) + OBD_FREE(sbi->ll_foreign_symlink_prefix, sizeof("/mnt/")); + if (sbi->ll_cache) { + cl_cache_decref(sbi->ll_cache); + sbi->ll_cache = NULL; + } + destroy_workqueue(sbi->ll_ra_info.ll_readahead_wq); +out_pcc: + pcc_super_fini(&sbi->ll_pcc_super); +out_sbi: + OBD_FREE_PTR(sbi); + RETURN(ERR_PTR(rc)); +} + +static void ll_free_sbi(struct super_block *sb) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + ENTRY; + + if (sbi != NULL) { + if (!list_empty(&sbi->ll_squash.rsi_nosquash_nids)) + cfs_free_nidlist(&sbi->ll_squash.rsi_nosquash_nids); + if (sbi->ll_ra_info.ll_readahead_wq) + destroy_workqueue(sbi->ll_ra_info.ll_readahead_wq); + if (sbi->ll_cache != NULL) { + cl_cache_decref(sbi->ll_cache); + sbi->ll_cache = NULL; + } + if (sbi->ll_foreign_symlink_prefix) { + OBD_FREE(sbi->ll_foreign_symlink_prefix, + sbi->ll_foreign_symlink_prefix_size); + sbi->ll_foreign_symlink_prefix = NULL; + } + if (sbi->ll_foreign_symlink_upcall) { + OBD_FREE(sbi->ll_foreign_symlink_upcall, + strlen(sbi->ll_foreign_symlink_upcall) + + 1); + sbi->ll_foreign_symlink_upcall = NULL; + } + if (sbi->ll_foreign_symlink_upcall_items) { + int i; + int nb_items = sbi->ll_foreign_symlink_upcall_nb_items; + struct ll_foreign_symlink_upcall_item *items = + sbi->ll_foreign_symlink_upcall_items; + + for (i = 0 ; i < nb_items; i++) + if (items[i].type == STRING_TYPE) + OBD_FREE(items[i].string, + items[i].size); + + OBD_FREE_LARGE(items, nb_items * 
+ sizeof(struct ll_foreign_symlink_upcall_item)); + sbi->ll_foreign_symlink_upcall_items = NULL; + } + if (sbi->ll_secctx_name) + ll_secctx_name_free(sbi); + + ll_free_rw_stats_info(sbi); + pcc_super_fini(&sbi->ll_pcc_super); + OBD_FREE(sbi, sizeof(*sbi)); + } + EXIT; +} + +static int client_common_fill_super(struct super_block *sb, char *md, char *dt) +{ + struct inode *root = NULL; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct obd_statfs *osfs = NULL; + struct ptlrpc_request *request = NULL; + struct obd_connect_data *data = NULL; + struct obd_uuid *uuid; + struct md_op_data *op_data; + struct lustre_md lmd; + u64 valid; + int size, err, checksum; + bool api32; + void *encctx; + int encctxlen; + + ENTRY; + sbi->ll_md_obd = class_name2obd(md); + if (!sbi->ll_md_obd) { + CERROR("MD %s: not setup or attached\n", md); + RETURN(-EINVAL); + } + + OBD_ALLOC_PTR(data); + if (data == NULL) + RETURN(-ENOMEM); + + OBD_ALLOC_PTR(osfs); + if (osfs == NULL) { + OBD_FREE_PTR(data); + RETURN(-ENOMEM); + } + + /* pass client page size via ocd_grant_blkbits, the server should report + * back its backend blocksize for grant calculation purpose */ + data->ocd_grant_blkbits = PAGE_SHIFT; + + /* indicate MDT features supported by this client */ + data->ocd_connect_flags = OBD_CONNECT_IBITS | OBD_CONNECT_NODEVOH | + OBD_CONNECT_ATTRFID | OBD_CONNECT_GRANT | + OBD_CONNECT_VERSION | OBD_CONNECT_BRW_SIZE | + OBD_CONNECT_SRVLOCK | + OBD_CONNECT_MDS_CAPA | OBD_CONNECT_OSS_CAPA | + OBD_CONNECT_CANCELSET | OBD_CONNECT_FID | + OBD_CONNECT_AT | OBD_CONNECT_LOV_V3 | + OBD_CONNECT_VBR | OBD_CONNECT_FULL20 | + OBD_CONNECT_64BITHASH | + OBD_CONNECT_EINPROGRESS | + OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE | + OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_PINGLESS| + OBD_CONNECT_MAX_EASIZE | + OBD_CONNECT_FLOCK_DEAD | + OBD_CONNECT_DISP_STRIPE | OBD_CONNECT_LFSCK | + OBD_CONNECT_OPEN_BY_FID | + OBD_CONNECT_DIR_STRIPE | + OBD_CONNECT_BULK_MBITS | OBD_CONNECT_CKSUM | + OBD_CONNECT_SUBTREE | + OBD_CONNECT_MULTIMODRPCS | + OBD_CONNECT_GRANT_PARAM | + OBD_CONNECT_GRANT_SHRINK | + OBD_CONNECT_SHORTIO | OBD_CONNECT_FLAGS2; + + data->ocd_connect_flags2 = OBD_CONNECT2_DIR_MIGRATE | + OBD_CONNECT2_SUM_STATFS | + OBD_CONNECT2_OVERSTRIPING | + OBD_CONNECT2_FLR | + OBD_CONNECT2_LOCK_CONVERT | + OBD_CONNECT2_ARCHIVE_ID_ARRAY | + OBD_CONNECT2_INC_XID | + OBD_CONNECT2_LSOM | + OBD_CONNECT2_ASYNC_DISCARD | + OBD_CONNECT2_PCC | + OBD_CONNECT2_CRUSH | OBD_CONNECT2_LSEEK | + OBD_CONNECT2_GETATTR_PFID | + OBD_CONNECT2_DOM_LVB | + OBD_CONNECT2_REP_MBITS | + OBD_CONNECT2_ATOMIC_OPEN_LOCK | + OBD_CONNECT2_MDLL; + + if (test_bit(LL_SBI_MDLL, sbi->ll_flags)) + data->ocd_connect_flags2 |= OBD_CONNECT2_MDLL; + + if (test_bit(LL_SBI_MDLL_BYPASS, sbi->ll_flags)) + data->ocd_connect_flags2 |= OBD_CONNECT2_MDLL_BYPASS; + + if (test_bit(LL_SBI_MDLL_AUTO_REFRESH, sbi->ll_flags)) + data->ocd_connect_flags2 |= OBD_CONNECT2_MDLL_AUTO_REFRESH; + +#ifdef HAVE_LRU_RESIZE_SUPPORT + if (test_bit(LL_SBI_LRU_RESIZE, sbi->ll_flags)) + data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE; +#endif + data->ocd_connect_flags |= OBD_CONNECT_ACL_FLAGS; + + data->ocd_cksum_types = obd_cksum_types_supported_client(); + + if (OBD_FAIL_CHECK(OBD_FAIL_MDC_LIGHTWEIGHT)) + /* flag mdc connection as lightweight, only used for test + * purpose, use with care */ + data->ocd_connect_flags |= OBD_CONNECT_LIGHTWEIGHT; + + data->ocd_ibits_known = MDS_INODELOCK_FULL; + data->ocd_version = LUSTRE_VERSION_CODE; + + if (sb->s_flags & SB_RDONLY) + data->ocd_connect_flags |= OBD_CONNECT_RDONLY; + if 
(test_bit(LL_SBI_USER_XATTR, sbi->ll_flags)) + data->ocd_connect_flags |= OBD_CONNECT_XATTR; + +#ifdef SB_NOSEC + /* Setting this indicates we correctly support S_NOSEC (See kernel + * commit 9e1f1de02c2275d7172e18dc4e7c2065777611bf) + */ + sb->s_flags |= SB_NOSEC; +#endif + sbi->ll_fop = ll_select_file_operations(sbi); + + /* always ping even if server suppress_pings */ + if (test_bit(LL_SBI_ALWAYS_PING, sbi->ll_flags)) + data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS; + + obd_connect_set_secctx(data); + if (ll_sbi_has_encrypt(sbi)) { + obd_connect_set_name_enc(data); + obd_connect_set_enc(data); + } + +#if defined(CONFIG_SECURITY) + data->ocd_connect_flags2 |= OBD_CONNECT2_SELINUX_POLICY; +#endif + + data->ocd_brw_size = MD_MAX_BRW_SIZE; + + err = obd_connect(NULL, &sbi->ll_md_exp, sbi->ll_md_obd, + &sbi->ll_sb_uuid, data, sbi->ll_cache); + if (err == -EBUSY) { + LCONSOLE_ERROR_MSG(0x14f, "An MDT (md %s) is performing " + "recovery, of which this client is not a " + "part. Please wait for recovery to complete," + " abort, or time out.\n", md); + GOTO(out, err); + } else if (err) { + CERROR("cannot connect to %s: rc = %d\n", md, err); + GOTO(out, err); + } + + sbi->ll_md_exp->exp_connect_data = *data; + + err = obd_fid_init(sbi->ll_md_exp->exp_obd, sbi->ll_md_exp, + LUSTRE_SEQ_METADATA); + if (err) { + CERROR("%s: Can't init metadata layer FID infrastructure, " + "rc = %d\n", sbi->ll_md_exp->exp_obd->obd_name, err); + GOTO(out_md, err); + } + + /* For mount, we only need fs info from MDT0; in DNE this also makes + * sure the client can be mounted as long as MDT0 is + * available */ + err = obd_statfs(NULL, sbi->ll_md_exp, osfs, + ktime_get_seconds() - sbi->ll_statfs_max_age, + OBD_STATFS_FOR_MDT0); + if (err) + GOTO(out_md_fid, err); + + /* This needs to be after statfs to ensure connect has finished. + * Note that "data" does NOT contain the valid connect reply. + * If connecting to a 1.8 server there will be no LMV device, so + * we can access the MDC export directly and exp_connect_flags will + * be non-zero, but if accessing an upgraded 2.1 server it will + * have the correct flags filled in. + * XXX: fill in the LMV exp_connect_flags from MDC(s). */ + valid = exp_connect_flags(sbi->ll_md_exp) & CLIENT_CONNECT_MDT_REQD; + if (exp_connect_flags(sbi->ll_md_exp) != 0 && + valid != CLIENT_CONNECT_MDT_REQD) { + char *buf; + + OBD_ALLOC_WAIT(buf, PAGE_SIZE); + obd_connect_flags2str(buf, PAGE_SIZE, + valid ^ CLIENT_CONNECT_MDT_REQD, 0, ","); + LCONSOLE_ERROR_MSG(0x170, "Server %s does not support " + "feature(s) needed for correct operation " + "of this client (%s).
Please upgrade " + "server or downgrade client.\n", + sbi->ll_md_exp->exp_obd->obd_name, buf); + OBD_FREE(buf, PAGE_SIZE); + GOTO(out_md_fid, err = -EPROTO); + } + + size = sizeof(*data); + err = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_CONN_DATA), + KEY_CONN_DATA, &size, data); + if (err) { + CERROR("%s: Get connect data failed: rc = %d\n", + sbi->ll_md_exp->exp_obd->obd_name, err); + GOTO(out_md_fid, err); + } + + LASSERT(osfs->os_bsize); + sb->s_blocksize = osfs->os_bsize; + sb->s_blocksize_bits = log2(osfs->os_bsize); + sb->s_magic = LL_SUPER_MAGIC; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sbi->ll_inode_cache_enabled = 1; + sbi->ll_namelen = osfs->os_namelen; + sbi->ll_mnt.mnt = current->fs->root.mnt; + sbi->ll_mnt_ns = current->nsproxy->mnt_ns; + + if (test_bit(LL_SBI_USER_XATTR, sbi->ll_flags) && + !(data->ocd_connect_flags & OBD_CONNECT_XATTR)) { + LCONSOLE_INFO("Disabling user_xattr feature because " + "it is not supported on the server\n"); + clear_bit(LL_SBI_USER_XATTR, sbi->ll_flags); + } + + if (data->ocd_connect_flags & OBD_CONNECT_ACL) { +#ifdef SB_POSIXACL + sb->s_flags |= SB_POSIXACL; +#endif + set_bit(LL_SBI_ACL, sbi->ll_flags); + } else { + LCONSOLE_INFO("client wants to enable acl, but mdt not!\n"); +#ifdef SB_POSIXACL + sb->s_flags &= ~SB_POSIXACL; +#endif + clear_bit(LL_SBI_ACL, sbi->ll_flags); + } + + if (data->ocd_connect_flags & OBD_CONNECT_64BITHASH) + set_bit(LL_SBI_64BIT_HASH, sbi->ll_flags); + + if (data->ocd_connect_flags & OBD_CONNECT_LAYOUTLOCK) + set_bit(LL_SBI_LAYOUT_LOCK, sbi->ll_flags); + + if (obd_connect_has_secctx(data)) + set_bit(LL_SBI_FILE_SECCTX, sbi->ll_flags); + + if (ll_sbi_has_encrypt(sbi) && !obd_connect_has_enc(data)) { + if (ll_sb_has_test_dummy_encryption(sb)) + LCONSOLE_WARN("%s: server %s does not support encryption feature, encryption deactivated.\n", + sbi->ll_fsname, + sbi->ll_md_exp->exp_obd->obd_name); + ll_sbi_set_encrypt(sbi, false); + } + + if (ll_sbi_has_name_encrypt(sbi) && !obd_connect_has_name_enc(data)) { + struct lustre_sb_info *lsi = s2lsi(sb); + + if (ll_sb_has_test_dummy_encryption(sb)) + LCONSOLE_WARN("%s: server %s does not support name encryption, not using it.\n", + sbi->ll_fsname, + sbi->ll_md_exp->exp_obd->obd_name); + lsi->lsi_flags &= ~LSI_FILENAME_ENC; + lsi->lsi_flags &= ~LSI_FILENAME_ENC_B64_OLD_CLI; + ll_sbi_set_name_encrypt(sbi, false); + } + + if (data->ocd_ibits_known & MDS_INODELOCK_XATTR) { + if (!(data->ocd_connect_flags & OBD_CONNECT_MAX_EASIZE)) { + LCONSOLE_INFO("%s: disabling xattr cache due to " + "unknown maximum xattr size.\n", dt); + } else if (!sbi->ll_xattr_cache_set) { + /* If xattr_cache is already set (no matter 0 or 1) + * during processing llog, it won't be enabled here. 
*/ + set_bit(LL_SBI_XATTR_CACHE, sbi->ll_flags); + sbi->ll_xattr_cache_enabled = 1; + } + } + + sbi->ll_dt_obd = class_name2obd(dt); + if (!sbi->ll_dt_obd) { + CERROR("DT %s: not setup or attached\n", dt); + GOTO(out_md_fid, err = -ENODEV); + } + + /* pass client page size via ocd_grant_blkbits, the server should report + * back its backend blocksize for grant calculation purpose */ + data->ocd_grant_blkbits = PAGE_SHIFT; + + /* indicate OST features supported by this client */ + data->ocd_connect_flags = OBD_CONNECT_GRANT | OBD_CONNECT_VERSION | + OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE | + OBD_CONNECT_CANCELSET | OBD_CONNECT_FID | + OBD_CONNECT_SRVLOCK | + OBD_CONNECT_AT | OBD_CONNECT_OSS_CAPA | + OBD_CONNECT_VBR | OBD_CONNECT_FULL20 | + OBD_CONNECT_64BITHASH | OBD_CONNECT_MAXBYTES | + OBD_CONNECT_EINPROGRESS | + OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE | + OBD_CONNECT_LAYOUTLOCK | + OBD_CONNECT_PINGLESS | OBD_CONNECT_LFSCK | + OBD_CONNECT_BULK_MBITS | OBD_CONNECT_SHORTIO | + OBD_CONNECT_FLAGS2 | OBD_CONNECT_GRANT_SHRINK; + data->ocd_connect_flags2 = OBD_CONNECT2_LOCKAHEAD | + OBD_CONNECT2_INC_XID | OBD_CONNECT2_LSEEK | + OBD_CONNECT2_REP_MBITS; + + if (!OBD_FAIL_CHECK(OBD_FAIL_OSC_CONNECT_GRANT_PARAM)) + data->ocd_connect_flags |= OBD_CONNECT_GRANT_PARAM; + + /* OBD_CONNECT_CKSUM should always be set, even if checksums are + * disabled by default, because it can still be enabled on the + * fly via /sys. As a consequence, we still need to come to an + * agreement on the supported algorithms at connect time + */ + data->ocd_connect_flags |= OBD_CONNECT_CKSUM; + + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CKSUM_ADLER_ONLY)) + data->ocd_cksum_types = OBD_CKSUM_ADLER; + else + data->ocd_cksum_types = obd_cksum_types_supported_client(); + +#ifdef HAVE_LRU_RESIZE_SUPPORT + data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE; +#endif + /* always ping even if server suppress_pings */ + if (test_bit(LL_SBI_ALWAYS_PING, sbi->ll_flags)) + data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS; + + if (ll_sbi_has_encrypt(sbi)) + obd_connect_set_enc(data); + + CDEBUG(D_RPCTRACE, "ocd_connect_flags: %#llx ocd_version: %d " + "ocd_grant: %d\n", data->ocd_connect_flags, + data->ocd_version, data->ocd_grant); + + sbi->ll_dt_obd->obd_upcall.onu_owner = &sbi->ll_lco; + sbi->ll_dt_obd->obd_upcall.onu_upcall = cl_ocd_update; + + data->ocd_brw_size = DT_MAX_BRW_SIZE; + + err = obd_connect(NULL, &sbi->ll_dt_exp, sbi->ll_dt_obd, + &sbi->ll_sb_uuid, data, sbi->ll_cache); + if (err == -EBUSY) { + LCONSOLE_ERROR_MSG(0x150, "An OST (dt %s) is performing " + "recovery, of which this client is not a " + "part. 
Please wait for recovery to " + "complete, abort, or time out.\n", dt); + GOTO(out_md, err); + } else if (err) { + CERROR("%s: Cannot connect to %s: rc = %d\n", + sbi->ll_dt_exp->exp_obd->obd_name, dt, err); + GOTO(out_md, err); + } + + if (ll_sbi_has_encrypt(sbi) && + !obd_connect_has_enc(&sbi->ll_dt_obd->u.lov.lov_ocd)) { + if (ll_sb_has_test_dummy_encryption(sb)) + LCONSOLE_WARN("%s: server %s does not support encryption feature, encryption deactivated.\n", + sbi->ll_fsname, dt); + ll_sbi_set_encrypt(sbi, false); + } else if (ll_sb_has_test_dummy_encryption(sb)) { + LCONSOLE_WARN("Test dummy encryption mode enabled\n"); + } + + sbi->ll_dt_exp->exp_connect_data = *data; + + /* Don't change value if it was specified in the config log */ + if (sbi->ll_ra_info.ra_max_read_ahead_whole_pages == -1) { + sbi->ll_ra_info.ra_max_read_ahead_whole_pages = + max_t(unsigned long, SBI_DEFAULT_READ_AHEAD_WHOLE_MAX, + (data->ocd_brw_size >> PAGE_SHIFT)); + if (sbi->ll_ra_info.ra_max_read_ahead_whole_pages > + sbi->ll_ra_info.ra_max_pages_per_file) + sbi->ll_ra_info.ra_max_read_ahead_whole_pages = + sbi->ll_ra_info.ra_max_pages_per_file; + } + + err = obd_fid_init(sbi->ll_dt_exp->exp_obd, sbi->ll_dt_exp, + LUSTRE_SEQ_METADATA); + if (err) { + CERROR("%s: Can't init data layer FID infrastructure, " + "rc = %d\n", sbi->ll_dt_exp->exp_obd->obd_name, err); + GOTO(out_dt, err); + } + + mutex_lock(&sbi->ll_lco.lco_lock); + sbi->ll_lco.lco_flags = data->ocd_connect_flags; + sbi->ll_lco.lco_md_exp = sbi->ll_md_exp; + sbi->ll_lco.lco_dt_exp = sbi->ll_dt_exp; + mutex_unlock(&sbi->ll_lco.lco_lock); + + fid_zero(&sbi->ll_root_fid); + err = md_get_root(sbi->ll_md_exp, get_mount_fileset(sb), + &sbi->ll_root_fid); + if (err) { + CERROR("cannot mds_connect: rc = %d\n", err); + GOTO(out_lock_cn_cb, err); + } + if (!fid_is_sane(&sbi->ll_root_fid)) { + CERROR("%s: Invalid root fid "DFID" during mount\n", + sbi->ll_md_exp->exp_obd->obd_name, + PFID(&sbi->ll_root_fid)); + GOTO(out_lock_cn_cb, err = -EINVAL); + } + CDEBUG(D_SUPER, "rootfid "DFID"\n", PFID(&sbi->ll_root_fid)); + + sb->s_op = &lustre_super_operations; + sb->s_xattr = ll_xattr_handlers; +#if THREAD_SIZE >= 8192 /*b=17630*/ + sb->s_export_op = &lustre_export_operations; +#endif +#ifdef HAVE_LUSTRE_CRYPTO + llcrypt_set_ops(sb, &lustre_cryptops); +#endif + + /* make root inode + * XXX: move this to after cbd setup? */ + valid = OBD_MD_FLGETATTR | OBD_MD_FLBLOCKS | OBD_MD_FLMODEASIZE | + OBD_MD_ENCCTX; + if (test_bit(LL_SBI_ACL, sbi->ll_flags)) + valid |= OBD_MD_FLACL; + + OBD_ALLOC_PTR(op_data); + if (op_data == NULL) + GOTO(out_lock_cn_cb, err = -ENOMEM); + + op_data->op_fid1 = sbi->ll_root_fid; + op_data->op_mode = 0; + op_data->op_valid = valid; + + err = md_getattr(sbi->ll_md_exp, op_data, &request); + + /* We need enc ctx info, so reset it in op_data to + * prevent it from being freed. 
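+ * The stolen encctx/encctxlen pair is consumed by the ll_set_encflags()
+ * call further down, once the root inode exists.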
+ */ + encctx = op_data->op_file_encctx; + encctxlen = op_data->op_file_encctx_size; + op_data->op_file_encctx = NULL; + op_data->op_file_encctx_size = 0; + OBD_FREE_PTR(op_data); + if (err) { + CERROR("%s: md_getattr failed for root: rc = %d\n", + sbi->ll_md_exp->exp_obd->obd_name, err); + GOTO(out_lock_cn_cb, err); + } + + err = md_get_lustre_md(sbi->ll_md_exp, &request->rq_pill, + sbi->ll_dt_exp, sbi->ll_md_exp, &lmd); + if (err) { + CERROR("failed to understand root inode md: rc = %d\n", err); + ptlrpc_req_finished(request); + GOTO(out_lock_cn_cb, err); + } + + LASSERT(fid_is_sane(&sbi->ll_root_fid)); + api32 = test_bit(LL_SBI_32BIT_API, sbi->ll_flags); + root = ll_iget(sb, cl_fid_build_ino(&sbi->ll_root_fid, api32), &lmd); + md_free_lustre_md(sbi->ll_md_exp, &lmd); + + if (IS_ERR(root)) { + lmd_clear_acl(&lmd); + err = IS_ERR(root) ? PTR_ERR(root) : -EBADF; + root = NULL; + CERROR("%s: bad ll_iget() for root: rc = %d\n", + sbi->ll_fsname, err); + ptlrpc_req_finished(request); + GOTO(out_root, err); + } + + err = ll_secctx_name_store(root); + if (err < 0 && ll_security_xattr_wanted(root)) + CWARN("%s: file security contexts not supported: rc = %d\n", + sbi->ll_fsname, err); + + err = 0; + if (encctxlen) { + CDEBUG(D_SEC, + "server returned encryption ctx for root inode "DFID"\n", + PFID(&sbi->ll_root_fid)); + err = ll_set_encflags(root, encctx, encctxlen, true); + if (err) + CWARN("%s: cannot set enc ctx for "DFID": rc = %d\n", + sbi->ll_fsname, + PFID(&sbi->ll_root_fid), err); + } + ptlrpc_req_finished(request); + + checksum = test_bit(LL_SBI_CHECKSUM, sbi->ll_flags); + if (sbi->ll_checksum_set) { + err = obd_set_info_async(NULL, sbi->ll_dt_exp, + sizeof(KEY_CHECKSUM), KEY_CHECKSUM, + sizeof(checksum), &checksum, NULL); + if (err) { + CERROR("%s: Set checksum failed: rc = %d\n", + sbi->ll_dt_exp->exp_obd->obd_name, err); + GOTO(out_root, err); + } + } + cl_sb_init(sb); + + sb->s_root = d_make_root(root); + if (sb->s_root == NULL) { + err = -ENOMEM; + CERROR("%s: can't make root dentry: rc = %d\n", + sbi->ll_fsname, err); + GOTO(out_root, err); + } + + sbi->ll_sdev_orig = sb->s_dev; + + /* We set sb->s_dev equal on all lustre clients in order to support + * NFS export clustering. NFSD requires that the FSID be the same + * on all clients. */ + /* s_dev is also used in lt_compare() to compare two fs, but that is + * only a node-local comparison.
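+ * Hence s_dev is derived below from the MDT uuid via get_uuid2int(),
+ * so every client of the same filesystem computes the same value.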
*/ + uuid = obd_get_uuid(sbi->ll_md_exp); + if (uuid != NULL) + sb->s_dev = get_uuid2int(uuid->uuid, strlen(uuid->uuid)); + + if (data != NULL) + OBD_FREE_PTR(data); + if (osfs != NULL) + OBD_FREE_PTR(osfs); + + if (sbi->ll_dt_obd) { + err = sysfs_create_link(&sbi->ll_kset.kobj, + &sbi->ll_dt_obd->obd_kset.kobj, + sbi->ll_dt_obd->obd_type->typ_name); + if (err < 0) { + CERROR("%s: could not register %s in llite: rc = %d\n", + dt, sbi->ll_fsname, err); + err = 0; + } + } + + if (sbi->ll_md_obd) { + err = sysfs_create_link(&sbi->ll_kset.kobj, + &sbi->ll_md_obd->obd_kset.kobj, + sbi->ll_md_obd->obd_type->typ_name); + if (err < 0) { + CERROR("%s: could not register %s in llite: rc = %d\n", + md, sbi->ll_fsname, err); + err = 0; + } + } + + RETURN(err); +out_root: + iput(root); +out_lock_cn_cb: + obd_fid_fini(sbi->ll_dt_exp->exp_obd); +out_dt: + obd_disconnect(sbi->ll_dt_exp); + sbi->ll_dt_exp = NULL; + sbi->ll_dt_obd = NULL; +out_md_fid: + obd_fid_fini(sbi->ll_md_exp->exp_obd); +out_md: + obd_disconnect(sbi->ll_md_exp); + sbi->ll_md_exp = NULL; + sbi->ll_md_obd = NULL; +out: + if (data != NULL) + OBD_FREE_PTR(data); + if (osfs != NULL) + OBD_FREE_PTR(osfs); + return err; +} + +int ll_get_max_mdsize(struct ll_sb_info *sbi, int *lmmsize) +{ + int size, rc; + + size = sizeof(*lmmsize); + rc = obd_get_info(NULL, sbi->ll_dt_exp, sizeof(KEY_MAX_EASIZE), + KEY_MAX_EASIZE, &size, lmmsize); + if (rc != 0) { + CERROR("%s: cannot get max LOV EA size: rc = %d\n", + sbi->ll_dt_exp->exp_obd->obd_name, rc); + RETURN(rc); + } + + CDEBUG(D_INFO, "max LOV ea size: %d\n", *lmmsize); + + size = sizeof(int); + rc = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_MAX_EASIZE), + KEY_MAX_EASIZE, &size, lmmsize); + if (rc) + CERROR("Get max mdsize error rc %d\n", rc); + + CDEBUG(D_INFO, "max LMV ea size: %d\n", *lmmsize); + + RETURN(rc); +} + +/** + * Get the value of the default_easize parameter. + * + * \see client_obd::cl_default_mds_easize + * + * \param[in] sbi superblock info for this filesystem + * \param[out] lmmsize pointer to storage location for value + * + * \retval 0 on success + * \retval negative negated errno on failure + */ +int ll_get_default_mdsize(struct ll_sb_info *sbi, int *lmmsize) +{ + int size, rc; + + size = sizeof(int); + rc = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_DEFAULT_EASIZE), + KEY_DEFAULT_EASIZE, &size, lmmsize); + if (rc) + CERROR("Get default mdsize error rc %d\n", rc); + + RETURN(rc); +} + +/** + * Set the default_easize parameter to the given value. 
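+ * Values below sizeof(struct lov_mds_md) or above OBD_MAX_DEFAULT_EA_SIZE
+ * are rejected with -EINVAL.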
+ * + * \see client_obd::cl_default_mds_easize + * + * \param[in] sbi superblock info for this filesystem + * \param[in] lmmsize the size to set + * + * \retval 0 on success + * \retval negative negated errno on failure + */ +int ll_set_default_mdsize(struct ll_sb_info *sbi, int lmmsize) +{ + int rc; + + if (lmmsize < sizeof(struct lov_mds_md) || + lmmsize > OBD_MAX_DEFAULT_EA_SIZE) + return -EINVAL; + + rc = obd_set_info_async(NULL, sbi->ll_md_exp, + sizeof(KEY_DEFAULT_EASIZE), KEY_DEFAULT_EASIZE, + sizeof(int), &lmmsize, NULL); + + RETURN(rc); +} + +static void client_common_put_super(struct super_block *sb) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + ENTRY; + + cl_sb_fini(sb); + + obd_fid_fini(sbi->ll_dt_exp->exp_obd); + obd_disconnect(sbi->ll_dt_exp); + sbi->ll_dt_exp = NULL; + + ll_debugfs_unregister_super(sb); + + obd_fid_fini(sbi->ll_md_exp->exp_obd); + obd_disconnect(sbi->ll_md_exp); + sbi->ll_md_exp = NULL; + + EXIT; +} + +void ll_kill_super(struct super_block *sb) +{ + struct ll_sb_info *sbi; + ENTRY; + + /* sb not initialized? */ + if (!(sb->s_flags & SB_ACTIVE)) + return; + + sbi = ll_s2sbi(sb); + /* we need to restore the s_dev that was changed for clustered NFS + * before put_super, because newer kernels cache s_dev, and changing + * sb->s_dev in put_super does not affect the real device removal */ + if (sbi) { + sb->s_dev = sbi->ll_sdev_orig; + + /* wait for running statahead threads to quit */ + while (atomic_read(&sbi->ll_sa_running) > 0) + schedule_timeout_uninterruptible( + cfs_time_seconds(1) >> 3); + } + + EXIT; +} + +/* Since we use this table for ll_sbi_flags_seq_show, make sure that + * the spelling you want displayed for a token that is listed more + * than once below comes first. For + * example we want "checksum" displayed, not "nochecksum" + * for the sbi_flags.
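+ * ll_sbi_flags_seq_show() below scans this table in order and prints
+ * the first pattern whose token matches a set bit, which is why the
+ * positive spelling must appear before its "no" variant.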
+ */ +static const match_table_t ll_sbi_flags_name = { + {LL_SBI_NOLCK, "nolock"}, + {LL_SBI_CHECKSUM, "checksum"}, + {LL_SBI_CHECKSUM, "nochecksum"}, + {LL_SBI_LOCALFLOCK, "localflock"}, + {LL_SBI_FLOCK, "flock"}, + {LL_SBI_FLOCK, "noflock"}, + {LL_SBI_USER_XATTR, "user_xattr"}, + {LL_SBI_USER_XATTR, "nouser_xattr"}, + {LL_SBI_LRU_RESIZE, "lruresize"}, + {LL_SBI_LRU_RESIZE, "nolruresize"}, + {LL_SBI_LAZYSTATFS, "lazystatfs"}, + {LL_SBI_LAZYSTATFS, "nolazystatfs"}, + {LL_SBI_32BIT_API, "32bitapi"}, + {LL_SBI_USER_FID2PATH, "user_fid2path"}, + {LL_SBI_USER_FID2PATH, "nouser_fid2path"}, + {LL_SBI_VERBOSE, "verbose"}, + {LL_SBI_VERBOSE, "noverbose"}, + {LL_SBI_ALWAYS_PING, "always_ping"}, + {LL_SBI_TEST_DUMMY_ENCRYPTION, "test_dummy_encryption=%s"}, + {LL_SBI_TEST_DUMMY_ENCRYPTION, "test_dummy_encryption"}, + {LL_SBI_ENCRYPT, "encrypt"}, + {LL_SBI_ENCRYPT, "noencrypt"}, + {LL_SBI_FOREIGN_SYMLINK, "foreign_symlink=%s"}, + {LL_SBI_ALLOW_VERSION_MISMATCH, "allow_version_mismatch"}, + {LL_SBI_MDLL_BYPASS, "mdll_bypass"}, + {LL_SBI_NUM_MOUNT_OPT, NULL}, + + {LL_SBI_ACL, "acl"}, + {LL_SBI_AGL_ENABLED, "agl"}, + {LL_SBI_64BIT_HASH, "64bit_hash"}, + {LL_SBI_LAYOUT_LOCK, "layout"}, + {LL_SBI_XATTR_CACHE, "xattr_cache"}, + {LL_SBI_NOROOTSQUASH, "norootsquash"}, + {LL_SBI_FAST_READ, "fast_read"}, + {LL_SBI_FILE_SECCTX, "file_secctx"}, + {LL_SBI_TINY_WRITE, "tiny_write"}, + {LL_SBI_FILE_HEAT, "file_heat"}, + {LL_SBI_PARALLEL_DIO, "parallel_dio"}, + {LL_SBI_ENCRYPT_NAME, "name_encrypt"}, + {LL_SBI_MDLL_AUTO_REFRESH, "mdll_auto_refresh"}, + {LL_SBI_MDLL, "mdll"}, +}; + +int ll_sbi_flags_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + int i; + + for (i = 0; i < LL_SBI_NUM_FLAGS; i++) { + int j; + + if (!test_bit(i, ll_s2sbi(sb)->ll_flags)) + continue; + + for (j = 0; j < ARRAY_SIZE(ll_sbi_flags_name); j++) { + if (ll_sbi_flags_name[j].token == i && + ll_sbi_flags_name[j].pattern) { + seq_printf(m, "%s ", + ll_sbi_flags_name[j].pattern); + break; + } + } + } + seq_puts(m, "\b\n"); + return 0; +} + +/* non-client-specific mount options are parsed in lmd_parse */ +static int ll_options(char *options, struct super_block *sb) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + char *s2, *s1, *opts; + int err = 0; + + ENTRY; + if (!options) + RETURN(0); + + /* Disallow version mismatch by default */ + allow_version_mismatch = 0; + + /* Don't stomp on lmd_opts */ + opts = kstrdup(options, GFP_KERNEL); + if (!opts) + RETURN(-ENOMEM); + s1 = opts; + s2 = opts; + + CDEBUG(D_CONFIG, "Parsing opts %s\n", options); + + while ((s1 = strsep(&opts, ",")) != NULL) { + substring_t args[MAX_OPT_ARGS]; + bool turn_off = false; + int token; + + if (!*s1) + continue; + + CDEBUG(D_SUPER, "next opt=%s\n", s1); + + if (strncmp(s1, "no", 2) == 0) + turn_off = true; + + /* + * Initialize args struct so we know whether arg was + * found; some options take optional arguments. 
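+ * For example, "test_dummy_encryption=v2" fills args[0] with the "v2"
+ * substring, while a bare "test_dummy_encryption" matches the
+ * argument-less pattern and leaves args[0].from NULL.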
+ */ + args[0].to = NULL; + args[0].from = NULL; + token = match_token(s1, ll_sbi_flags_name, args); + if (token == LL_SBI_NUM_MOUNT_OPT) { + if (match_wildcard("context", s1) || + match_wildcard("fscontext", s1) || + match_wildcard("defcontext", s1) || + match_wildcard("rootcontext",s1)) + continue; + + LCONSOLE_ERROR_MSG(0x152, + "Unknown option '%s', won't mount.\n", + s1); + RETURN(-EINVAL); + } + + switch (token) { + case LL_SBI_NOLCK: + case LL_SBI_32BIT_API: + case LL_SBI_64BIT_HASH: + case LL_SBI_ALWAYS_PING: + case LL_SBI_MDLL_AUTO_REFRESH: + case LL_SBI_MDLL: + case LL_SBI_MDLL_BYPASS: + set_bit(token, sbi->ll_flags); + break; + + case LL_SBI_FLOCK: + clear_bit(LL_SBI_LOCALFLOCK, sbi->ll_flags); + if (turn_off) + clear_bit(LL_SBI_FLOCK, sbi->ll_flags); + else + set_bit(token, sbi->ll_flags); + break; + + case LL_SBI_LOCALFLOCK: + clear_bit(LL_SBI_FLOCK, sbi->ll_flags); + set_bit(token, sbi->ll_flags); + break; + + case LL_SBI_CHECKSUM: + sbi->ll_checksum_set = 1; + fallthrough; + case LL_SBI_USER_XATTR: + case LL_SBI_USER_FID2PATH: + case LL_SBI_LRU_RESIZE: + case LL_SBI_LAZYSTATFS: + case LL_SBI_VERBOSE: + if (turn_off) + clear_bit(token, sbi->ll_flags); + else + set_bit(token, sbi->ll_flags); + break; + case LL_SBI_TEST_DUMMY_ENCRYPTION: { +#ifdef HAVE_LUSTRE_CRYPTO +#ifdef HAVE_FSCRYPT_DUMMY_CONTEXT_ENABLED + set_bit(token, sbi->ll_flags); +#else + struct lustre_sb_info *lsi = s2lsi(sb); + + err = llcrypt_set_test_dummy_encryption(sb, &args[0], + &lsi->lsi_dummy_enc_ctx); + if (!err) + break; + + if (err == -EEXIST) + LCONSOLE_WARN( + "Can't change test_dummy_encryption"); + else if (err == -EINVAL) + LCONSOLE_WARN( + "Value of option \"%s\" unrecognized", + options); + else + LCONSOLE_WARN( + "Error processing option \"%s\" [%d]", + options, err); + err = -1; +#endif +#else + LCONSOLE_WARN("Test dummy encryption mount option ignored: encryption not supported\n"); +#endif + break; + } + case LL_SBI_ENCRYPT: +#ifdef HAVE_LUSTRE_CRYPTO + if (turn_off) + clear_bit(token, sbi->ll_flags); + else + set_bit(token, sbi->ll_flags); +#else + LCONSOLE_WARN("noencrypt or encrypt mount option ignored: encryption not supported\n"); +#endif + break; + case LL_SBI_ALLOW_VERSION_MISMATCH: + allow_version_mismatch = 1; + break; + case LL_SBI_FOREIGN_SYMLINK: + /* non-default prefix provided ? 
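+ * e.g. mounting with "-o foreign_symlink=/scratch" replaces the
+ * default "/mnt/" prefix set up in ll_init_sbi().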
*/ + if (args->from) { + size_t old_len; + char *old; + + /* path must be absolute */ + if (args->from[0] != '/') { + LCONSOLE_ERROR_MSG(0x152, + "foreign prefix '%s' must be an absolute path\n", + args->from); + RETURN(-EINVAL); + } + + old_len = sbi->ll_foreign_symlink_prefix_size; + old = sbi->ll_foreign_symlink_prefix; + /* alloc for path length and '\0' */ + sbi->ll_foreign_symlink_prefix = match_strdup(args); + if (!sbi->ll_foreign_symlink_prefix) { + /* restore previous */ + sbi->ll_foreign_symlink_prefix = old; + sbi->ll_foreign_symlink_prefix_size = + old_len; + RETURN(-ENOMEM); + } + sbi->ll_foreign_symlink_prefix_size = + args->to - args->from + 1; + OBD_ALLOC_POST(sbi->ll_foreign_symlink_prefix, + sbi->ll_foreign_symlink_prefix_size, + "kmalloced"); + if (old) + OBD_FREE(old, old_len); + + /* enable foreign symlink support */ + set_bit(token, sbi->ll_flags); + } else { + LCONSOLE_ERROR_MSG(0x152, + "invalid %s option\n", s1); + } + fallthrough; + default: + break; + } + } + kfree(opts); + RETURN(err); +} + +void ll_lli_init(struct ll_inode_info *lli) +{ + lli->lli_inode_magic = LLI_INODE_MAGIC; + lli->lli_flags = 0; + rwlock_init(&lli->lli_lock); + lli->lli_posix_acl = NULL; + /* Do not set lli_fid, it has been initialized already. */ + fid_zero(&lli->lli_pfid); + lli->lli_mds_read_och = NULL; + lli->lli_mds_write_och = NULL; + lli->lli_mds_exec_och = NULL; + lli->lli_open_fd_read_count = 0; + lli->lli_open_fd_write_count = 0; + lli->lli_open_fd_exec_count = 0; + mutex_init(&lli->lli_och_mutex); + spin_lock_init(&lli->lli_agl_lock); + spin_lock_init(&lli->lli_layout_lock); + ll_layout_version_set(lli, CL_LAYOUT_GEN_NONE); + lli->lli_clob = NULL; + + init_rwsem(&lli->lli_xattrs_list_rwsem); + mutex_init(&lli->lli_xattrs_enq_lock); + + LASSERT(lli->lli_vfs_inode.i_mode != 0); + if (S_ISDIR(lli->lli_vfs_inode.i_mode)) { + lli->lli_opendir_key = NULL; + lli->lli_sai = NULL; + spin_lock_init(&lli->lli_sa_lock); + lli->lli_opendir_pid = 0; + lli->lli_sa_enabled = 0; + init_rwsem(&lli->lli_lsm_sem); + } else { + mutex_init(&lli->lli_size_mutex); + mutex_init(&lli->lli_setattr_mutex); + lli->lli_symlink_name = NULL; + ll_trunc_sem_init(&lli->lli_trunc_sem); + range_lock_tree_init(&lli->lli_write_tree); + init_rwsem(&lli->lli_glimpse_sem); + lli->lli_glimpse_time = ktime_set(0, 0); + INIT_LIST_HEAD(&lli->lli_agl_list); + lli->lli_agl_index = 0; + lli->lli_async_rc = 0; + spin_lock_init(&lli->lli_heat_lock); + obd_heat_clear(lli->lli_heat_instances, OBD_HEAT_COUNT); + lli->lli_heat_flags = 0; + mutex_init(&lli->lli_pcc_lock); + lli->lli_pcc_state = PCC_STATE_FL_NONE; + lli->lli_pcc_inode = NULL; + lli->lli_pcc_dsflags = PCC_DATASET_INVALID; + lli->lli_pcc_generation = 0; + mutex_init(&lli->lli_group_mutex); + lli->lli_group_users = 0; + lli->lli_group_gid = 0; + } + mutex_init(&lli->lli_layout_mutex); + memset(lli->lli_jobid, 0, sizeof(lli->lli_jobid)); + /* ll_cl_context initialize */ + INIT_LIST_HEAD(&lli->lli_lccs); + seqlock_init(&lli->lli_page_inv_lock); +} + +#define MAX_STRING_SIZE 128 + +#ifndef HAVE_SUPER_SETUP_BDI_NAME +#ifndef HAVE_BDI_CAP_MAP_COPY +# define BDI_CAP_MAP_COPY 0 +#endif + +static int super_setup_bdi_name(struct super_block *sb, char *fmt, ...) 
+{ + struct lustre_sb_info *lsi = s2lsi(sb); + char buf[MAX_STRING_SIZE]; + va_list args; + int err; + + err = bdi_init(&lsi->lsi_bdi); + if (err) + return err; + + lsi->lsi_flags |= LSI_BDI_INITIALIZED; + lsi->lsi_bdi.capabilities = BDI_CAP_MAP_COPY; + lsi->lsi_bdi.name = "lustre"; + va_start(args, fmt); + vsnprintf(buf, MAX_STRING_SIZE, fmt, args); + va_end(args); + err = bdi_register(&lsi->lsi_bdi, NULL, "%s", buf); + va_end(args); + if (!err) + sb->s_bdi = &lsi->lsi_bdi; + + return err; +} +#endif /* !HAVE_SUPER_SETUP_BDI_NAME */ + +int ll_fill_super(struct super_block *sb) +{ + struct lustre_profile *lprof = NULL; + struct lustre_sb_info *lsi = s2lsi(sb); + struct ll_sb_info *sbi = NULL; + char *dt = NULL, *md = NULL; + char *profilenm = get_profile_name(sb); + struct config_llog_instance *cfg; + /* %p for void* in printf needs 16+2 characters: 0xffffffffffffffff */ + const int instlen = LUSTRE_MAXINSTANCE + 2; + unsigned long cfg_instance = ll_get_cfg_instance(sb); + char name[MAX_STRING_SIZE]; + int md_len = 0; + int dt_len = 0; + uuid_t uuid; + char *ptr; + int len; + int err; + + ENTRY; + /* for ASLR, to map between cfg_instance and hashed ptr */ + CDEBUG(D_VFSTRACE, "VFS Op: cfg_instance %s-%016lx (sb %p)\n", + profilenm, cfg_instance, sb); + + OBD_RACE(OBD_FAIL_LLITE_RACE_MOUNT); + + OBD_ALLOC_PTR(cfg); + if (cfg == NULL) + GOTO(out_free_cfg, err = -ENOMEM); + + /* client additional sb info */ + lsi->lsi_llsbi = sbi = ll_init_sbi(); + if (IS_ERR(sbi)) + GOTO(out_free_cfg, err = PTR_ERR(sbi)); + + err = ll_options(lsi->lsi_lmd->lmd_opts, sb); + if (err) + GOTO(out_free_cfg, err); + + /* LSI_FILENAME_ENC is only used by embedded llcrypt */ +#ifdef CONFIG_LL_ENCRYPTION + if (ll_sb_has_test_dummy_encryption(sb)) + /* enable filename encryption by default for dummy enc mode */ + lsi->lsi_flags |= LSI_FILENAME_ENC; + else + /* filename encryption is disabled by default */ + lsi->lsi_flags &= ~LSI_FILENAME_ENC; + /* Lustre 2.15 uses old-style base64 encoding by default */ + lsi->lsi_flags |= LSI_FILENAME_ENC_B64_OLD_CLI; +#endif + + /* kernel >= 2.6.38 store dentry operations in sb->s_d_op. */ + sb->s_d_op = &ll_d_ops; + + /* UUID handling */ + generate_random_uuid(uuid.b); + snprintf(sbi->ll_sb_uuid.uuid, sizeof(sbi->ll_sb_uuid), "%pU", uuid.b); + + CDEBUG(D_CONFIG, "llite sb uuid: %s\n", sbi->ll_sb_uuid.uuid); + + /* Get fsname */ + len = strlen(profilenm); + ptr = strrchr(profilenm, '-'); + if (ptr && (strcmp(ptr, "-client") == 0)) + len -= 7; + + if (len > LUSTRE_MAXFSNAME) { + if (unlikely(len >= MAX_STRING_SIZE)) + len = MAX_STRING_SIZE - 1; + strncpy(name, profilenm, len); + name[len] = '\0'; + err = -ENAMETOOLONG; + CERROR("%s: fsname longer than %u characters: rc = %d\n", + name, LUSTRE_MAXFSNAME, err); + GOTO(out_free_cfg, err); + } + strncpy(sbi->ll_fsname, profilenm, len); + sbi->ll_fsname[len] = '\0'; + + /* Mount info */ + snprintf(name, sizeof(name), "%.*s-%016lx", len, + profilenm, cfg_instance); + + err = super_setup_bdi_name(sb, "%s", name); + if (err) + GOTO(out_free_cfg, err); + + /* disable kernel readahead */ + sb->s_bdi->ra_pages = 0; +#ifdef HAVE_BDI_IO_PAGES + sb->s_bdi->io_pages = 0; +#endif + + /* Call ll_debugfs_register_super() before lustre_process_log() + * so that "llite.*.*" params can be processed correctly. 
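+ * A failure here is deliberately not fatal: the mount proceeds
+ * without the debugfs entries and err is reset to 0.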
+ */ + err = ll_debugfs_register_super(sb, name); + if (err < 0) { + CERROR("%s: could not register mountpoint in llite: rc = %d\n", + sbi->ll_fsname, err); + err = 0; + } + + /* The cfg_instance is a value unique to this super, in case some + * joker tries to mount the same fs at two mount points. + */ + cfg->cfg_instance = cfg_instance; + cfg->cfg_uuid = lsi->lsi_llsbi->ll_sb_uuid; + cfg->cfg_callback = class_config_llog_handler; + cfg->cfg_sub_clds = CONFIG_SUB_CLIENT; + /* set up client obds */ + err = lustre_process_log(sb, profilenm, cfg); + if (err < 0) + GOTO(out_debugfs, err); + + /* Profile set with LCFG_MOUNTOPT so we can find our mdc and osc obds */ + lprof = class_get_profile(profilenm); + if (lprof == NULL) { + LCONSOLE_ERROR_MSG(0x156, "The client profile '%s' could not be" + " read from the MGS. Does that filesystem " + "exist?\n", profilenm); + GOTO(out_debugfs, err = -EINVAL); + } + CDEBUG(D_CONFIG, "Found profile %s: mdc=%s osc=%s\n", profilenm, + lprof->lp_md, lprof->lp_dt); + + dt_len = strlen(lprof->lp_dt) + instlen + 2; + OBD_ALLOC(dt, dt_len); + if (!dt) + GOTO(out_profile, err = -ENOMEM); + snprintf(dt, dt_len - 1, "%s-%016lx", lprof->lp_dt, cfg_instance); + + md_len = strlen(lprof->lp_md) + instlen + 2; + OBD_ALLOC(md, md_len); + if (!md) + GOTO(out_free_dt, err = -ENOMEM); + snprintf(md, md_len - 1, "%s-%016lx", lprof->lp_md, cfg_instance); + + /* connections, registrations, sb setup */ + err = client_common_fill_super(sb, md, dt); + if (err < 0) + GOTO(out_free_md, err); + + sbi->ll_client_common_fill_super_succeeded = 1; + +out_free_md: + if (md) + OBD_FREE(md, md_len); +out_free_dt: + if (dt) + OBD_FREE(dt, dt_len); +out_profile: + if (lprof) + class_put_profile(lprof); +out_debugfs: + if (err < 0) + ll_debugfs_unregister_super(sb); +out_free_cfg: + if (cfg) + OBD_FREE_PTR(cfg); + + if (err) + ll_put_super(sb); + else if (test_bit(LL_SBI_VERBOSE, sbi->ll_flags)) + LCONSOLE_WARN("Mounted %s\n", profilenm); + RETURN(err); +} /* ll_fill_super */ + +void ll_put_super(struct super_block *sb) +{ + struct config_llog_instance cfg, params_cfg; + struct obd_device *obd; + struct lustre_sb_info *lsi = s2lsi(sb); + struct ll_sb_info *sbi = ll_s2sbi(sb); + char *profilenm = get_profile_name(sb); + unsigned long cfg_instance = ll_get_cfg_instance(sb); + long ccc_count; + int next, force = 1, rc = 0; + ENTRY; + + if (IS_ERR(sbi)) + GOTO(out_no_sbi, 0); + + /* Should replace instance_id with something better for ASLR */ + CDEBUG(D_VFSTRACE, "VFS Op: cfg_instance %s-%016lx (sb %p)\n", + profilenm, cfg_instance, sb); + + cfg.cfg_instance = cfg_instance; + lustre_end_log(sb, profilenm, &cfg); + + params_cfg.cfg_instance = cfg_instance; + lustre_end_log(sb, PARAMS_FILENAME, ¶ms_cfg); + + if (sbi->ll_md_exp) { + obd = class_exp2obd(sbi->ll_md_exp); + if (obd) + force = obd->obd_force; + } + + /* Wait for unstable pages to be committed to stable storage */ + if (force == 0) { + rc = l_wait_event_abortable( + sbi->ll_cache->ccc_unstable_waitq, + atomic_long_read(&sbi->ll_cache->ccc_unstable_nr) == 0); + } + + ccc_count = atomic_long_read(&sbi->ll_cache->ccc_unstable_nr); + if (force == 0 && rc != -ERESTARTSYS) + LASSERTF(ccc_count == 0, "count: %li\n", ccc_count); + + /* We need to set force before the lov_disconnect in + * lustre_common_put_super, since l_d cleans up osc's as well. 
+ */ + if (force) { + next = 0; + while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, + &next)) != NULL) { + obd->obd_force = force; + } + } + + if (sbi->ll_client_common_fill_super_succeeded) { + /* Only if client_common_fill_super succeeded */ + client_common_put_super(sb); + } + + next = 0; + while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next))) + class_manual_cleanup(obd); + + if (test_bit(LL_SBI_VERBOSE, sbi->ll_flags)) + LCONSOLE_WARN("Unmounted %s\n", profilenm ? profilenm : ""); + + if (profilenm) + class_del_profile(profilenm); + +#ifndef HAVE_SUPER_SETUP_BDI_NAME + if (lsi->lsi_flags & LSI_BDI_INITIALIZED) { + bdi_destroy(&lsi->lsi_bdi); + lsi->lsi_flags &= ~LSI_BDI_INITIALIZED; + } +#endif + + llcrypt_free_dummy_context(&lsi->lsi_dummy_enc_ctx); + ll_free_sbi(sb); + lsi->lsi_llsbi = NULL; +out_no_sbi: + lustre_common_put_super(sb); + + cl_env_cache_purge(~0); + + EXIT; +} /* client_put_super */ + +struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock) +{ + struct inode *inode = NULL; + + /* NOTE: we depend on atomic igrab() -bzzz */ + lock_res_and_lock(lock); + if (lock->l_resource->lr_lvb_inode) { + struct ll_inode_info * lli; + lli = ll_i2info(lock->l_resource->lr_lvb_inode); + if (lli->lli_inode_magic == LLI_INODE_MAGIC) { + inode = igrab(lock->l_resource->lr_lvb_inode); + } else { + inode = lock->l_resource->lr_lvb_inode; + LDLM_DEBUG_LIMIT(inode->i_state & I_FREEING ? D_INFO : + D_WARNING, lock, "lr_lvb_inode %p is " + "bogus: magic %08x", + lock->l_resource->lr_lvb_inode, + lli->lli_inode_magic); + inode = NULL; + } + } + unlock_res_and_lock(lock); + return inode; +} + +void ll_dir_clear_lsm_md(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + + LASSERT(S_ISDIR(inode->i_mode)); + + if (lli->lli_lsm_md) { + lmv_free_memmd(lli->lli_lsm_md); + lli->lli_lsm_md = NULL; + } + + if (lli->lli_default_lsm_md) { + lmv_free_memmd(lli->lli_default_lsm_md); + lli->lli_default_lsm_md = NULL; + } +} + +static struct inode *ll_iget_anon_dir(struct super_block *sb, + const struct lu_fid *fid, + struct lustre_md *md) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ll_inode_info *lli; + struct mdt_body *body = md->body; + struct inode *inode; + ino_t ino; + + ENTRY; + + LASSERT(md->lmv); + ino = cl_fid_build_ino(fid, test_bit(LL_SBI_32BIT_API, sbi->ll_flags)); + inode = iget_locked(sb, ino); + if (inode == NULL) { + CERROR("%s: failed get simple inode "DFID": rc = -ENOENT\n", + sbi->ll_fsname, PFID(fid)); + RETURN(ERR_PTR(-ENOENT)); + } + + lli = ll_i2info(inode); + if (inode->i_state & I_NEW) { + inode->i_mode = (inode->i_mode & ~S_IFMT) | + (body->mbo_mode & S_IFMT); + LASSERTF(S_ISDIR(inode->i_mode), "Not slave inode "DFID"\n", + PFID(fid)); + + inode->i_mtime.tv_sec = 0; + inode->i_atime.tv_sec = 0; + inode->i_ctime.tv_sec = 0; + inode->i_rdev = 0; + +#ifdef HAVE_BACKING_DEV_INFO + /* initializing backing dev info. */ + inode->i_mapping->backing_dev_info = + &s2lsi(inode->i_sb)->lsi_bdi; +#endif + inode->i_op = &ll_dir_inode_operations; + inode->i_fop = &ll_dir_operations; + lli->lli_fid = *fid; + ll_lli_init(lli); + + /* master object FID */ + lli->lli_pfid = body->mbo_fid1; + CDEBUG(D_INODE, "lli %p slave "DFID" master "DFID"\n", + lli, PFID(fid), PFID(&lli->lli_pfid)); + unlock_new_inode(inode); + } else { + /* in directory restripe/auto-split, a directory will be + * transformed to a stripe if it's plain, set its pfid here, + * otherwise ll_lock_cancel_bits() can't find the master inode. 
+		 */
+		lli->lli_pfid = body->mbo_fid1;
+	}
+
+	RETURN(inode);
+}
+
+static int ll_init_lsm_md(struct inode *inode, struct lustre_md *md)
+{
+	struct lu_fid *fid;
+	struct lmv_stripe_md *lsm = md->lmv;
+	struct ll_inode_info *lli = ll_i2info(inode);
+	int i;
+
+	LASSERT(lsm != NULL);
+
+	CDEBUG(D_INODE, "%s: "DFID" set dir layout:\n",
+	       ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid));
+	lsm_md_dump(D_INODE, lsm);
+
+	if (!lmv_dir_striped(lsm))
+		goto out;
+
+	/* XXX sigh, this lsm_root initialization should be in
+	 * LMV layer, but it needs ll_iget right now, so we
+	 * put this here right now. */
+	for (i = 0; i < lsm->lsm_md_stripe_count; i++) {
+		fid = &lsm->lsm_md_oinfo[i].lmo_fid;
+		LASSERT(lsm->lsm_md_oinfo[i].lmo_root == NULL);
+
+		if (!fid_is_sane(fid))
+			continue;
+
+		/* Unfortunately ll_iget will call ll_update_inode,
+		 * where the initialization of slave inode is slightly
+		 * different, so it resets lsm_md to NULL to avoid
+		 * initializing lsm for slave inode. */
+		lsm->lsm_md_oinfo[i].lmo_root =
+				ll_iget_anon_dir(inode->i_sb, fid, md);
+		if (IS_ERR(lsm->lsm_md_oinfo[i].lmo_root)) {
+			int rc = PTR_ERR(lsm->lsm_md_oinfo[i].lmo_root);
+
+			lsm->lsm_md_oinfo[i].lmo_root = NULL;
+			while (i-- > 0) {
+				iput(lsm->lsm_md_oinfo[i].lmo_root);
+				lsm->lsm_md_oinfo[i].lmo_root = NULL;
+			}
+			return rc;
+		}
+	}
+out:
+	lli->lli_lsm_md = lsm;
+
+	return 0;
+}
+
+static void ll_update_default_lsm_md(struct inode *inode, struct lustre_md *md)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+
+	ENTRY;
+
+	if (!md->default_lmv) {
+		/* clear default lsm */
+		if (lli->lli_default_lsm_md) {
+			down_write(&lli->lli_lsm_sem);
+			if (lli->lli_default_lsm_md) {
+				lmv_free_memmd(lli->lli_default_lsm_md);
+				lli->lli_default_lsm_md = NULL;
+			}
+			lli->lli_inherit_depth = 0;
+			up_write(&lli->lli_lsm_sem);
+		}
+		RETURN_EXIT;
+	}
+
+	if (lli->lli_default_lsm_md) {
+		/* do nothing if default lsm isn't changed */
+		down_read(&lli->lli_lsm_sem);
+		if (lli->lli_default_lsm_md &&
+		    lsm_md_eq(lli->lli_default_lsm_md, md->default_lmv)) {
+			up_read(&lli->lli_lsm_sem);
+			RETURN_EXIT;
+		}
+		up_read(&lli->lli_lsm_sem);
+	}
+
+	down_write(&lli->lli_lsm_sem);
+	if (lli->lli_default_lsm_md)
+		lmv_free_memmd(lli->lli_default_lsm_md);
+	lli->lli_default_lsm_md = md->default_lmv;
+	lsm_md_dump(D_INODE, md->default_lmv);
+	md->default_lmv = NULL;
+	up_write(&lli->lli_lsm_sem);
+	RETURN_EXIT;
+}
+
+static int ll_update_lsm_md(struct inode *inode, struct lustre_md *md)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct lmv_stripe_md *lsm = md->lmv;
+	struct cl_attr *attr;
+	int rc = 0;
+
+	ENTRY;
+
+	LASSERT(S_ISDIR(inode->i_mode));
+	CDEBUG(D_INODE, "update lsm %p of "DFID"\n", lli->lli_lsm_md,
+	       PFID(ll_inode2fid(inode)));
+
+	/* update default LMV */
+	if (md->default_lmv)
+		ll_update_default_lsm_md(inode, md);
+
+	/* after dir migration/restripe, a stripe may be turned into a
+	 * directory, in this case, zero out its lli_pfid.
+	 */
+	if (unlikely(fid_is_norm(&lli->lli_pfid)))
+		fid_zero(&lli->lli_pfid);
+
+	/*
+	 * no striped information from request, lustre_md from req does not
+	 * include stripeEA, see ll_md_setattr()
+	 */
+	if (!lsm)
+		RETURN(0);
+
+	/*
+	 * Normally the dir layout doesn't change; take only the read lock
+	 * to check that, to avoid blocking other MD operations.
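+	 *
+	 * As a sketch, the locking below goes read -> write -> read:
+	 *
+	 *	down_read();  lsm unchanged?  -> done under read lock
+	 *	up_read(); down_write();      -> replace lli_lsm_md
+	 *	up_write(); down_read();      -> the slow md_merge_attr()
+	 *	                                 runs under read lock only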
+ */ + down_read(&lli->lli_lsm_sem); + + /* some current lookup initialized lsm, and unchanged */ + if (lli->lli_lsm_md && lsm_md_eq(lli->lli_lsm_md, lsm)) + GOTO(unlock, rc = 0); + + /* if dir layout doesn't match, check whether version is increased, + * which means layout is changed, this happens in dir split/merge and + * lfsck. + * + * foreign LMV should not change. + */ + if (lli->lli_lsm_md && lmv_dir_striped(lli->lli_lsm_md) && + lsm->lsm_md_layout_version <= + lli->lli_lsm_md->lsm_md_layout_version) { + CERROR("%s: "DFID" dir layout mismatch:\n", + ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid)); + lsm_md_dump(D_ERROR, lli->lli_lsm_md); + lsm_md_dump(D_ERROR, lsm); + GOTO(unlock, rc = -EINVAL); + } + + up_read(&lli->lli_lsm_sem); + down_write(&lli->lli_lsm_sem); + /* clear existing lsm */ + if (lli->lli_lsm_md) { + lmv_free_memmd(lli->lli_lsm_md); + lli->lli_lsm_md = NULL; + } + + rc = ll_init_lsm_md(inode, md); + up_write(&lli->lli_lsm_sem); + + if (rc) + RETURN(rc); + + /* set md->lmv to NULL, so the following free lustre_md will not free + * this lsm. + */ + md->lmv = NULL; + + /* md_merge_attr() may take long, since lsm is already set, switch to + * read lock. + */ + down_read(&lli->lli_lsm_sem); + + if (!lmv_dir_striped(lli->lli_lsm_md)) + GOTO(unlock, rc = 0); + + OBD_ALLOC_PTR(attr); + if (!attr) + GOTO(unlock, rc = -ENOMEM); + + /* validate the lsm */ + rc = md_merge_attr(ll_i2mdexp(inode), lli->lli_lsm_md, attr, + ll_md_blocking_ast); + if (!rc) { + if (md->body->mbo_valid & OBD_MD_FLNLINK) + md->body->mbo_nlink = attr->cat_nlink; + if (md->body->mbo_valid & OBD_MD_FLSIZE) + md->body->mbo_size = attr->cat_size; + if (md->body->mbo_valid & OBD_MD_FLATIME) + md->body->mbo_atime = attr->cat_atime; + if (md->body->mbo_valid & OBD_MD_FLCTIME) + md->body->mbo_ctime = attr->cat_ctime; + if (md->body->mbo_valid & OBD_MD_FLMTIME) + md->body->mbo_mtime = attr->cat_mtime; + } + + OBD_FREE_PTR(attr); + GOTO(unlock, rc); +unlock: + up_read(&lli->lli_lsm_sem); + + return rc; +} + +void ll_clear_inode(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n", + PFID(ll_inode2fid(inode)), inode); + + if (S_ISDIR(inode->i_mode)) { + /* these should have been cleared in ll_file_release */ + LASSERT(lli->lli_opendir_key == NULL); + LASSERT(lli->lli_sai == NULL); + LASSERT(lli->lli_opendir_pid == 0); + } else { + pcc_inode_free(inode); + } + + md_null_inode(sbi->ll_md_exp, ll_inode2fid(inode)); + + LASSERT(!lli->lli_open_fd_write_count); + LASSERT(!lli->lli_open_fd_read_count); + LASSERT(!lli->lli_open_fd_exec_count); + + if (lli->lli_mds_write_och) + ll_md_real_close(inode, FMODE_WRITE); + if (lli->lli_mds_exec_och) + ll_md_real_close(inode, FMODE_EXEC); + if (lli->lli_mds_read_och) + ll_md_real_close(inode, FMODE_READ); + + if (S_ISLNK(inode->i_mode) && lli->lli_symlink_name) { + OBD_FREE(lli->lli_symlink_name, + strlen(lli->lli_symlink_name) + 1); + lli->lli_symlink_name = NULL; + } + + ll_xattr_cache_destroy(inode); + + forget_all_cached_acls(inode); + lli_clear_acl(lli); + lli->lli_inode_magic = LLI_INODE_DEAD; + + if (S_ISDIR(inode->i_mode)) + ll_dir_clear_lsm_md(inode); + else if (S_ISREG(inode->i_mode) && !is_bad_inode(inode)) + LASSERT(list_empty(&lli->lli_agl_list)); + + /* + * XXX This has to be done before lsm is freed below, because + * cl_object still uses inode lsm. 
+	 */
+	cl_inode_fini(inode);
+
+	llcrypt_put_encryption_info(inode);
+
+	EXIT;
+}
+
+static int ll_md_setattr(struct dentry *dentry, struct md_op_data *op_data)
+{
+	struct lustre_md md;
+	struct inode *inode = dentry->d_inode;
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	struct ptlrpc_request *request = NULL;
+	int rc, ia_valid;
+
+	ENTRY;
+
+	op_data = ll_prep_md_op_data(op_data, inode, NULL, NULL, 0, 0,
+				     LUSTRE_OPC_ANY, NULL);
+	if (IS_ERR(op_data))
+		RETURN(PTR_ERR(op_data));
+
+	/* If this is a chgrp of a regular file, we want to reserve enough
+	 * quota to cover the entire file size.
+	 */
+	if (S_ISREG(inode->i_mode) && op_data->op_attr.ia_valid & ATTR_GID &&
+	    from_kgid(&init_user_ns, op_data->op_attr.ia_gid) !=
+	    from_kgid(&init_user_ns, inode->i_gid)) {
+		op_data->op_xvalid |= OP_XVALID_BLOCKS;
+		op_data->op_attr_blocks = inode->i_blocks;
+	}
+
+	rc = md_setattr(sbi->ll_md_exp, op_data, NULL, 0, &request);
+	if (rc) {
+		ptlrpc_req_finished(request);
+		if (rc == -ENOENT) {
+			clear_nlink(inode);
+			/* Unlinked special device node? Or just a race?
+			 * Pretend we've done everything. */
+			if (!S_ISREG(inode->i_mode) &&
+			    !S_ISDIR(inode->i_mode)) {
+				ia_valid = op_data->op_attr.ia_valid;
+				op_data->op_attr.ia_valid &= ~TIMES_SET_FLAGS;
+				rc = simple_setattr(&init_user_ns, dentry,
+						    &op_data->op_attr);
+				op_data->op_attr.ia_valid = ia_valid;
+			}
+		} else if (rc != -EPERM && rc != -EACCES && rc != -ETXTBSY) {
+			CERROR("md_setattr fails: rc = %d\n", rc);
+		}
+		RETURN(rc);
+	}
+
+	rc = md_get_lustre_md(sbi->ll_md_exp, &request->rq_pill, sbi->ll_dt_exp,
+			      sbi->ll_md_exp, &md);
+	if (rc) {
+		ptlrpc_req_finished(request);
+		RETURN(rc);
+	}
+
+	ia_valid = op_data->op_attr.ia_valid;
+	/* inode size will be set in ll_setattr_ost(); we can't do it now
+	 * since the dirty cache is not cleared yet. */
+	op_data->op_attr.ia_valid &= ~(TIMES_SET_FLAGS | ATTR_SIZE);
+	if (S_ISREG(inode->i_mode))
+		inode_lock(inode);
+	rc = simple_setattr(&init_user_ns, dentry, &op_data->op_attr);
+	if (S_ISREG(inode->i_mode))
+		inode_unlock(inode);
+	op_data->op_attr.ia_valid = ia_valid;
+
+	rc = ll_update_inode(inode, &md);
+	ptlrpc_req_finished(request);
+
+	RETURN(rc);
+}
+
+/**
+ * Zero portion of page that is part of @inode.
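+ * (For example, ll_setattr_raw() uses this when truncating an encrypted
+ * file to a size that is not PAGE_SIZE-aligned, so the partial tail page
+ * gets zeroed on the client.)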
+ * This implies, if necessary: + * - taking cl_lock on range corresponding to concerned page + * - grabbing vm page + * - associating cl_page + * - proceeding to clio read + * - zeroing range in page + * - proceeding to cl_page flush + * - releasing cl_lock + * + * \param[in] inode inode + * \param[in] index page index + * \param[in] offset offset in page to start zero from + * \param[in] len len to zero + * + * \retval 0 on success + * \retval negative errno on failure + */ +int ll_io_zero_page(struct inode *inode, pgoff_t index, pgoff_t offset, + unsigned len) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_object *clob = lli->lli_clob; + __u16 refcheck; + struct lu_env *env = NULL; + struct cl_io *io = NULL; + struct cl_page *clpage = NULL; + struct page *vmpage = NULL; + unsigned from = index << PAGE_SHIFT; + struct cl_lock *lock = NULL; + struct cl_lock_descr *descr = NULL; + struct cl_2queue *queue = NULL; + struct cl_sync_io *anchor = NULL; + bool holdinglock = false; + int rc; + + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = vvp_env_thread_io(env); + io->ci_obj = clob; + rc = cl_io_rw_init(env, io, CIT_WRITE, from, PAGE_SIZE); + if (rc) + GOTO(putenv, rc); + + lock = vvp_env_lock(env); + descr = &lock->cll_descr; + descr->cld_obj = io->ci_obj; + descr->cld_start = cl_index(io->ci_obj, from); + descr->cld_end = cl_index(io->ci_obj, from + PAGE_SIZE - 1); + descr->cld_mode = CLM_WRITE; + descr->cld_enq_flags = CEF_MUST | CEF_NONBLOCK; + + /* request lock for page */ + rc = cl_lock_request(env, io, lock); + /* -ECANCELED indicates a matching lock with a different extent + * was already present, and -EEXIST indicates a matching lock + * on exactly the same extent was already present. + * In both cases it means we are covered. + */ + if (rc == -ECANCELED || rc == -EEXIST) + rc = 0; + else if (rc < 0) + GOTO(iofini, rc); + else + holdinglock = true; + + /* grab page */ + vmpage = grab_cache_page_nowait(inode->i_mapping, index); + if (vmpage == NULL) + GOTO(rellock, rc = -EOPNOTSUPP); + + if (!PageDirty(vmpage)) { + /* associate cl_page */ + clpage = cl_page_find(env, clob, vmpage->index, + vmpage, CPT_CACHEABLE); + if (IS_ERR(clpage)) + GOTO(pagefini, rc = PTR_ERR(clpage)); + + cl_page_assume(env, io, clpage); + } + + if (!PageUptodate(vmpage) && !PageDirty(vmpage) && + !PageWriteback(vmpage)) { + /* read page */ + /* Set PagePrivate2 to detect special case of empty page + * in osc_brw_fini_request(). + * It is also used to tell ll_io_read_page() that we do not + * want the vmpage to be unlocked. + */ + SetPagePrivate2(vmpage); + rc = ll_io_read_page(env, io, clpage, NULL); + if (!PagePrivate2(vmpage)) { + /* PagePrivate2 was cleared in osc_brw_fini_request() + * meaning we read an empty page. In this case, in order + * to avoid allocating unnecessary block in truncated + * file, we must not zero and write as below. Subsequent + * server-side truncate will handle things correctly. + */ + cl_page_unassume(env, io, clpage); + GOTO(clpfini, rc = 0); + } + ClearPagePrivate2(vmpage); + if (rc) + GOTO(clpfini, rc); + } + + /* Thanks to PagePrivate2 flag, ll_io_read_page() did not unlock + * the vmpage, so we are good to proceed and zero range in page. 
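+	 *
+	 * zero_user(page, offset, len) clears bytes [offset, offset + len)
+	 * of the page through a temporary kmap; e.g. offset = 100 and
+	 * len = PAGE_SIZE - 100 wipe everything past byte 99.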
+	 */
+	zero_user(vmpage, offset, len);
+
+	if (holdinglock && clpage) {
+		/* explicitly write newly modified page */
+		queue = &io->ci_queue;
+		cl_2queue_init(queue);
+		anchor = &vvp_env_info(env)->vti_anchor;
+		cl_sync_io_init(anchor, 1);
+		clpage->cp_sync_io = anchor;
+		cl_2queue_add(queue, clpage, true);
+		rc = cl_io_submit_rw(env, io, CRT_WRITE, queue);
+		if (rc)
+			GOTO(queuefini1, rc);
+		rc = cl_sync_io_wait(env, anchor, 0);
+		if (rc)
+			GOTO(queuefini2, rc);
+		cl_page_assume(env, io, clpage);
+
+queuefini2:
+		cl_2queue_discard(env, io, queue);
+queuefini1:
+		cl_2queue_disown(env, io, queue);
+		cl_2queue_fini(env, queue);
+	}
+
+clpfini:
+	if (clpage)
+		cl_page_put(env, clpage);
+pagefini:
+	unlock_page(vmpage);
+	put_page(vmpage);
+rellock:
+	if (holdinglock)
+		cl_lock_release(env, lock);
+iofini:
+	cl_io_fini(env, io);
+putenv:
+	if (env)
+		cl_env_put(env, &refcheck);
+
+	RETURN(rc);
+}
+
+/**
+ * Get reference file from volatile file name.
+ * Volatile file name may look like:
+ * <parent>/LUSTRE_VOLATILE_HDR:<mdt_index>:<random>:fd=<fd>
+ * where <fd> is the opened descriptor of the reference file.
+ *
+ * \param[in] volatile_name volatile file name
+ * \param[in] volatile_len volatile file name length
+ * \param[out] ref_file pointer to struct file of reference file
+ *
+ * \retval 0 on success
+ * \retval negative errno on failure
+ */
+int volatile_ref_file(const char *volatile_name, int volatile_len,
+		      struct file **ref_file)
+{
+	char *p, *q, *fd_str;
+	int fd, rc;
+
+	p = strnstr(volatile_name, ":fd=", volatile_len);
+	if (!p || strlen(p + 4) == 0)
+		return -EINVAL;
+
+	q = strchrnul(p + 4, ':');
+	fd_str = kstrndup(p + 4, q - p - 4, GFP_NOFS);
+	if (!fd_str)
+		return -ENOMEM;
+	rc = kstrtouint(fd_str, 10, &fd);
+	kfree(fd_str);
+	if (rc)
+		return -EINVAL;
+
+	*ref_file = fget(fd);
+	if (!(*ref_file))
+		return -EINVAL;
+	return 0;
+}
+
+/* If this inode has objects allocated to it (lsm != NULL), then the OST
+ * object(s) determine the file size and mtime. Otherwise, the MDS will
+ * keep these values until such a time that objects are allocated for it.
+ * We do the MDS operations first, as it is checking permissions for us.
+ * We don't do the MDS RPC if there is nothing that we want to store there,
+ * otherwise there is no harm in updating mtime/atime on the MDS if we are
+ * going to do an RPC anyway.
+ *
+ * If we are doing a truncate, we will send the mtime and ctime updates
+ * to the OST with the punch RPC, otherwise we do an explicit setattr RPC.
+ * I don't believe it is possible to get e.g. ATTR_MTIME_SET and ATTR_SIZE
+ * at the same time.
+ *
+ * In case of HSM import, we only set attr on MDS.
+ */
+int ll_setattr_raw(struct dentry *dentry, struct iattr *attr,
+		   enum op_xvalid xvalid, bool hsm_import)
+{
+	struct inode *inode = dentry->d_inode;
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct md_op_data *op_data = NULL;
+	ktime_t kstart = ktime_get();
+	int rc = 0;
+
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "%s: setattr inode "DFID"(%p) from %llu to %llu, "
+	       "valid %x, hsm_import %d\n",
+	       ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid),
+	       inode, i_size_read(inode), attr->ia_size, attr->ia_valid,
+	       hsm_import);
+
+	if (attr->ia_valid & ATTR_SIZE) {
+		/* Check new size against VFS/VM file size limit and rlimit */
+		rc = inode_newsize_ok(inode, attr->ia_size);
+		if (rc)
+			RETURN(rc);
+
+		/* The maximum Lustre file size is variable, based on the
+		 * OST maximum object size and number of stripes. This
+		 * needs another check in addition to the VFS check above.
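+		 * As a rough bound, with N stripes of maximum object size S
+		 * the limit is about N * S, so a file with few stripes can
+		 * hit -EFBIG well below the generic limit enforced by
+		 * inode_newsize_ok() above.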
*/ + if (attr->ia_size > ll_file_maxbytes(inode)) { + CDEBUG(D_INODE,"file "DFID" too large %llu > %llu\n", + PFID(&lli->lli_fid), attr->ia_size, + ll_file_maxbytes(inode)); + RETURN(-EFBIG); + } + + attr->ia_valid |= ATTR_MTIME | ATTR_CTIME; + } + + /* POSIX: check before ATTR_*TIME_SET set (from inode_change_ok) */ + if (attr->ia_valid & TIMES_SET_FLAGS) { + if ((!uid_eq(current_fsuid(), inode->i_uid)) && + !capable(CAP_FOWNER)) + RETURN(-EPERM); + } + + /* We mark all of the fields "set" so MDS/OST does not re-set them */ + if (!(xvalid & OP_XVALID_CTIME_SET) && + (attr->ia_valid & ATTR_CTIME)) { + attr->ia_ctime = current_time(inode); + xvalid |= OP_XVALID_CTIME_SET; + } + if (!(attr->ia_valid & ATTR_ATIME_SET) && + (attr->ia_valid & ATTR_ATIME)) { + attr->ia_atime = current_time(inode); + attr->ia_valid |= ATTR_ATIME_SET; + } + if (!(attr->ia_valid & ATTR_MTIME_SET) && + (attr->ia_valid & ATTR_MTIME)) { + attr->ia_mtime = current_time(inode); + attr->ia_valid |= ATTR_MTIME_SET; + } + + if (attr->ia_valid & (ATTR_MTIME | ATTR_CTIME)) + CDEBUG(D_INODE, "setting mtime %lld, ctime %lld, now = %lld\n", + (s64)attr->ia_mtime.tv_sec, (s64)attr->ia_ctime.tv_sec, + ktime_get_real_seconds()); + + if (S_ISREG(inode->i_mode)) + inode_unlock(inode); + + /* We always do an MDS RPC, even if we're only changing the size; + * only the MDS knows whether truncate() should fail with -ETXTBUSY */ + + OBD_ALLOC_PTR(op_data); + if (op_data == NULL) + GOTO(out, rc = -ENOMEM); + + if (!hsm_import && attr->ia_valid & ATTR_SIZE) { + /* If we are changing file size, file content is + * modified, flag it. + */ + xvalid |= OP_XVALID_OWNEROVERRIDE; + op_data->op_bias |= MDS_DATA_MODIFIED; + clear_bit(LLIF_DATA_MODIFIED, &lli->lli_flags); + } + + if (attr->ia_valid & ATTR_FILE) { + struct ll_file_data *fd = attr->ia_file->private_data; + + if (fd->fd_lease_och) + op_data->op_bias |= MDS_TRUNC_KEEP_LEASE; + } + + op_data->op_attr = *attr; + op_data->op_xvalid = xvalid; + + rc = ll_md_setattr(dentry, op_data); + if (rc) + GOTO(out, rc); + + if (!S_ISREG(inode->i_mode) || hsm_import) + GOTO(out, rc = 0); + + if (attr->ia_valid & (ATTR_SIZE | ATTR_ATIME | ATTR_ATIME_SET | + ATTR_MTIME | ATTR_MTIME_SET | ATTR_CTIME) || + xvalid & OP_XVALID_CTIME_SET) { + bool cached = false; + + rc = pcc_inode_setattr(inode, attr, &cached); + if (cached) { + if (rc) { + CERROR("%s: PCC inode "DFID" setattr failed: " + "rc = %d\n", + ll_i2sbi(inode)->ll_fsname, + PFID(&lli->lli_fid), rc); + GOTO(out, rc); + } + } else { + unsigned int flags = 0; + + /* For truncate and utimes sending attributes to OSTs, + * setting mtime/atime to the past will be performed + * under PW [0:EOF] extent lock (new_size:EOF for + * truncate). It may seem excessive to send mtime/atime + * updates to OSTs when not setting times to past, but + * it is necessary due to possible time + * de-synchronization between MDT inode and OST objects + */ + if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) { + xvalid |= OP_XVALID_FLAGS; + flags = LUSTRE_ENCRYPT_FL; + /* Call to ll_io_zero_page is not necessary if + * truncating on PAGE_SIZE boundary, because + * whole pages will be wiped. + * In case of Direct IO, all we need is to set + * new size. 
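+				 *
+				 * (ia_size & ~PAGE_MASK) keeps only the
+				 * offset within the last page, e.g. for
+				 * ia_size = PAGE_SIZE + 100 it yields 100,
+				 * so the test below reads "new size is not
+				 * page aligned".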
+ */ + if (attr->ia_valid & ATTR_SIZE && + attr->ia_size & ~PAGE_MASK && + !(attr->ia_valid & ATTR_FILE && + attr->ia_file->f_flags & O_DIRECT)) { + pgoff_t offset = + attr->ia_size & (PAGE_SIZE - 1); + + rc = ll_io_zero_page(inode, + attr->ia_size >> PAGE_SHIFT, + offset, PAGE_SIZE - offset); + if (rc) + GOTO(out, rc); + } + /* If encrypted volatile file without the key, + * we need to fetch size from reference file, + * and set it on OST objects. This happens when + * migrating or extending an encrypted file + * without the key. + */ + if (filename_is_volatile(dentry->d_name.name, + dentry->d_name.len, + NULL) && + llcrypt_require_key(inode) == -ENOKEY) { + struct file *ref_file; + struct inode *ref_inode; + struct ll_inode_info *ref_lli; + struct cl_object *ref_obj; + struct cl_attr ref_attr = { 0 }; + struct lu_env *env; + __u16 refcheck; + + rc = volatile_ref_file( + dentry->d_name.name, + dentry->d_name.len, + &ref_file); + if (rc) + GOTO(out, rc); + + ref_inode = file_inode(ref_file); + if (!ref_inode) { + fput(ref_file); + GOTO(out, rc = -EINVAL); + } + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + GOTO(out, rc = PTR_ERR(env)); + + ref_lli = ll_i2info(ref_inode); + ref_obj = ref_lli->lli_clob; + cl_object_attr_lock(ref_obj); + rc = cl_object_attr_get(env, ref_obj, + &ref_attr); + cl_object_attr_unlock(ref_obj); + cl_env_put(env, &refcheck); + fput(ref_file); + if (rc) + GOTO(out, rc); + + attr->ia_valid |= ATTR_SIZE; + attr->ia_size = ref_attr.cat_size; + } + } + rc = cl_setattr_ost(lli->lli_clob, attr, xvalid, flags); + } + } + + /* If the file was restored, it needs to set dirty flag. + * + * We've already sent MDS_DATA_MODIFIED flag in + * ll_md_setattr() for truncate. However, the MDT refuses to + * set the HS_DIRTY flag on released files, so we have to set + * it again if the file has been restored. Please check how + * LLIF_DATA_MODIFIED is set in vvp_io_setattr_fini(). + * + * Please notice that if the file is not released, the previous + * MDS_DATA_MODIFIED has taken effect and usually + * LLIF_DATA_MODIFIED is not set(see vvp_io_setattr_fini()). + * This way we can save an RPC for common open + trunc + * operation. */ + if (test_and_clear_bit(LLIF_DATA_MODIFIED, &lli->lli_flags)) { + struct hsm_state_set hss = { + .hss_valid = HSS_SETMASK, + .hss_setmask = HS_DIRTY, + }; + int rc2; + + rc2 = ll_hsm_state_set(inode, &hss); + /* truncate and write can happen at the same time, so that + * the file can be set modified even though the file is not + * restored from released state, and ll_hsm_state_set() is + * not applicable for the file, and rc2 < 0 is normal in this + * case. */ + if (rc2 < 0) + CDEBUG(D_INFO, DFID "HSM set dirty failed: rc2 = %d\n", + PFID(ll_inode2fid(inode)), rc2); + } + + EXIT; +out: + if (op_data != NULL) + ll_finish_md_op_data(op_data); + + if (S_ISREG(inode->i_mode)) { + inode_lock(inode); + if ((attr->ia_valid & ATTR_SIZE) && !hsm_import) + inode_dio_wait(inode); + /* Once we've got the i_mutex, it's safe to set the S_NOSEC + * flag. ll_update_inode (called from ll_md_setattr), clears + * inode flags, so there is a gap where S_NOSEC is not set. + * This can cause a writer to take the i_mutex unnecessarily, + * but this is safe to do and should be rare. */ + inode_has_no_xattr(inode); + } + + if (!rc) + ll_stats_ops_tally(ll_i2sbi(inode), attr->ia_valid & ATTR_SIZE ? 
+ LPROC_LL_TRUNC : LPROC_LL_SETATTR, + ktime_us_delta(ktime_get(), kstart)); + + RETURN(rc); +} + +int ll_setattr(struct user_namespace *mnt_userns, struct dentry *de, + struct iattr *attr) +{ + int mode = de->d_inode->i_mode; + enum op_xvalid xvalid = 0; + int rc; + + rc = llcrypt_prepare_setattr(de, attr); + if (rc) + return rc; + + if ((attr->ia_valid & (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) == + (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) + xvalid |= OP_XVALID_OWNEROVERRIDE; + + if (((attr->ia_valid & (ATTR_MODE|ATTR_FORCE|ATTR_SIZE)) == + (ATTR_SIZE|ATTR_MODE)) && + (((mode & S_ISUID) && !(attr->ia_mode & S_ISUID)) || + (((mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP)) && + !(attr->ia_mode & S_ISGID)))) + attr->ia_valid |= ATTR_FORCE; + + if ((attr->ia_valid & ATTR_MODE) && + (mode & S_ISUID) && + !(attr->ia_mode & S_ISUID) && + !(attr->ia_valid & ATTR_KILL_SUID)) + attr->ia_valid |= ATTR_KILL_SUID; + + if ((attr->ia_valid & ATTR_MODE) && + ((mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP)) && + !(attr->ia_mode & S_ISGID) && + !(attr->ia_valid & ATTR_KILL_SGID)) + attr->ia_valid |= ATTR_KILL_SGID; + + return ll_setattr_raw(de, attr, xvalid, false); +} + +int ll_statfs_internal(struct ll_sb_info *sbi, struct obd_statfs *osfs, + u32 flags) +{ + struct obd_statfs obd_osfs = { 0 }; + time64_t max_age; + int rc; + + ENTRY; + max_age = ktime_get_seconds() - sbi->ll_statfs_max_age; + + if (test_bit(LL_SBI_LAZYSTATFS, sbi->ll_flags)) + flags |= OBD_STATFS_NODELAY; + + rc = obd_statfs(NULL, sbi->ll_md_exp, osfs, max_age, flags); + if (rc) + RETURN(rc); + + osfs->os_type = LL_SUPER_MAGIC; + + CDEBUG(D_SUPER, "MDC blocks %llu/%llu objects %llu/%llu\n", + osfs->os_bavail, osfs->os_blocks, osfs->os_ffree, osfs->os_files); + + if (osfs->os_state & OS_STATFS_SUM) + GOTO(out, rc); + + rc = obd_statfs(NULL, sbi->ll_dt_exp, &obd_osfs, max_age, flags); + if (rc) /* Possibly a filesystem with no OSTs. Report MDT totals. */ + GOTO(out, rc = 0); + + CDEBUG(D_SUPER, "OSC blocks %llu/%llu objects %llu/%llu\n", + obd_osfs.os_bavail, obd_osfs.os_blocks, obd_osfs.os_ffree, + obd_osfs.os_files); + + osfs->os_bsize = obd_osfs.os_bsize; + osfs->os_blocks = obd_osfs.os_blocks; + osfs->os_bfree = obd_osfs.os_bfree; + osfs->os_bavail = obd_osfs.os_bavail; + + /* If we have _some_ OSTs, but don't have as many free objects on the + * OSTs as inodes on the MDTs, reduce the reported number of inodes + * to compensate, so that the "inodes in use" number is correct. + * This should be kept in sync with lod_statfs() behaviour. + */ + if (obd_osfs.os_files && obd_osfs.os_ffree < osfs->os_ffree) { + osfs->os_files = (osfs->os_files - osfs->os_ffree) + + obd_osfs.os_ffree; + osfs->os_ffree = obd_osfs.os_ffree; + } + +out: + RETURN(rc); +} + +static int ll_statfs_project(struct inode *inode, struct kstatfs *sfs) +{ + struct if_quotactl qctl = { + .qc_cmd = LUSTRE_Q_GETQUOTA, + .qc_type = PRJQUOTA, + .qc_valid = QC_GENERAL, + }; + u64 limit, curblock; + int ret; + + qctl.qc_id = ll_i2info(inode)->lli_projid; + ret = quotactl_ioctl(inode->i_sb, &qctl); + if (ret) { + /* ignore errors if project ID does not have + * a quota limit or feature unsupported. + */ + if (ret == -ESRCH || ret == -EOPNOTSUPP) + ret = 0; + return ret; + } + + limit = ((qctl.qc_dqblk.dqb_bsoftlimit ? 
+ qctl.qc_dqblk.dqb_bsoftlimit : + qctl.qc_dqblk.dqb_bhardlimit) * 1024) / sfs->f_bsize; + if (limit && sfs->f_blocks > limit) { + curblock = (qctl.qc_dqblk.dqb_curspace + + sfs->f_bsize - 1) / sfs->f_bsize; + sfs->f_blocks = limit; + sfs->f_bfree = sfs->f_bavail = + (sfs->f_blocks > curblock) ? + (sfs->f_blocks - curblock) : 0; + } + + limit = qctl.qc_dqblk.dqb_isoftlimit ? + qctl.qc_dqblk.dqb_isoftlimit : + qctl.qc_dqblk.dqb_ihardlimit; + if (limit && sfs->f_files > limit) { + sfs->f_files = limit; + sfs->f_ffree = (sfs->f_files > + qctl.qc_dqblk.dqb_curinodes) ? + (sfs->f_files - qctl.qc_dqblk.dqb_curinodes) : 0; + } + + return 0; +} + +int ll_statfs(struct dentry *de, struct kstatfs *sfs) +{ + struct super_block *sb = de->d_sb; + struct obd_statfs osfs; + __u64 fsid = huge_encode_dev(sb->s_dev); + ktime_t kstart = ktime_get(); + int rc; + + CDEBUG(D_VFSTRACE, "VFS Op:sb=%s (%p)\n", sb->s_id, sb); + + /* Some amount of caching on the client is allowed */ + rc = ll_statfs_internal(ll_s2sbi(sb), &osfs, OBD_STATFS_SUM); + if (rc) + return rc; + + statfs_unpack(sfs, &osfs); + + /* We need to downshift for all 32-bit kernels, because we can't + * tell if the kernel is being called via sys_statfs64() or not. + * Stop before overflowing f_bsize - in which case it is better + * to just risk EOVERFLOW if caller is using old sys_statfs(). */ + if (sizeof(long) < 8) { + while (osfs.os_blocks > ~0UL && sfs->f_bsize < 0x40000000) { + sfs->f_bsize <<= 1; + + osfs.os_blocks >>= 1; + osfs.os_bfree >>= 1; + osfs.os_bavail >>= 1; + } + } + + sfs->f_blocks = osfs.os_blocks; + sfs->f_bfree = osfs.os_bfree; + sfs->f_bavail = osfs.os_bavail; + sfs->f_fsid.val[0] = (__u32)fsid; + sfs->f_fsid.val[1] = (__u32)(fsid >> 32); + if (ll_i2info(de->d_inode)->lli_projid) + return ll_statfs_project(de->d_inode, sfs); + + ll_stats_ops_tally(ll_s2sbi(sb), LPROC_LL_STATFS, + ktime_us_delta(ktime_get(), kstart)); + + return 0; +} + +void ll_inode_size_lock(struct inode *inode) +{ + struct ll_inode_info *lli; + + LASSERT(!S_ISDIR(inode->i_mode)); + + lli = ll_i2info(inode); + mutex_lock(&lli->lli_size_mutex); +} + +void ll_inode_size_unlock(struct inode *inode) +{ + struct ll_inode_info *lli; + + lli = ll_i2info(inode); + mutex_unlock(&lli->lli_size_mutex); +} + +void ll_update_inode_flags(struct inode *inode, unsigned int ext_flags) +{ + /* do not clear encryption flag */ + ext_flags |= ll_inode_to_ext_flags(inode->i_flags) & LUSTRE_ENCRYPT_FL; + inode->i_flags = ll_ext_to_inode_flags(ext_flags); + if (ext_flags & LUSTRE_PROJINHERIT_FL) + set_bit(LLIF_PROJECT_INHERIT, &ll_i2info(inode)->lli_flags); + else + clear_bit(LLIF_PROJECT_INHERIT, &ll_i2info(inode)->lli_flags); +} + +int ll_update_inode(struct inode *inode, struct lustre_md *md) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct mdt_body *body = md->body; + struct ll_sb_info *sbi = ll_i2sbi(inode); + bool api32; + int rc = 0; + + if (body->mbo_valid & OBD_MD_FLEASIZE) { + rc = cl_file_inode_init(inode, md); + if (rc) + return rc; + } + + if (S_ISDIR(inode->i_mode)) { + rc = ll_update_lsm_md(inode, md); + if (rc != 0) + return rc; + } + + if (body->mbo_valid & OBD_MD_FLACL) + lli_replace_acl(lli, md); + + api32 = test_bit(LL_SBI_32BIT_API, sbi->ll_flags); + inode->i_ino = cl_fid_build_ino(&body->mbo_fid1, api32); + inode->i_generation = cl_fid_build_gen(&body->mbo_fid1); + + if (body->mbo_valid & OBD_MD_FLATIME) { + if (body->mbo_atime > inode->i_atime.tv_sec) + inode->i_atime.tv_sec = body->mbo_atime; + lli->lli_atime = body->mbo_atime; + } + + if 
(body->mbo_valid & OBD_MD_FLMTIME) { + if (body->mbo_mtime > inode->i_mtime.tv_sec) { + CDEBUG(D_INODE, + "setting ino %lu mtime from %lld to %llu\n", + inode->i_ino, (s64)inode->i_mtime.tv_sec, + body->mbo_mtime); + inode->i_mtime.tv_sec = body->mbo_mtime; + } + lli->lli_mtime = body->mbo_mtime; + } + + if (body->mbo_valid & OBD_MD_FLCTIME) { + if (body->mbo_ctime > inode->i_ctime.tv_sec) + inode->i_ctime.tv_sec = body->mbo_ctime; + lli->lli_ctime = body->mbo_ctime; + } + + if (body->mbo_valid & OBD_MD_FLBTIME) + lli->lli_btime = body->mbo_btime; + + /* Clear i_flags to remove S_NOSEC before permissions are updated */ + if (body->mbo_valid & OBD_MD_FLFLAGS) + ll_update_inode_flags(inode, body->mbo_flags); + if (body->mbo_valid & OBD_MD_FLMODE) + inode->i_mode = (inode->i_mode & S_IFMT) | + (body->mbo_mode & ~S_IFMT); + + if (body->mbo_valid & OBD_MD_FLTYPE) + inode->i_mode = (inode->i_mode & ~S_IFMT) | + (body->mbo_mode & S_IFMT); + + LASSERT(inode->i_mode != 0); + if (body->mbo_valid & OBD_MD_FLUID) + inode->i_uid = make_kuid(&init_user_ns, body->mbo_uid); + if (body->mbo_valid & OBD_MD_FLGID) + inode->i_gid = make_kgid(&init_user_ns, body->mbo_gid); + if (body->mbo_valid & OBD_MD_FLPROJID) + lli->lli_projid = body->mbo_projid; + if (body->mbo_valid & OBD_MD_FLNLINK) { + spin_lock(&inode->i_lock); + set_nlink(inode, body->mbo_nlink); + spin_unlock(&inode->i_lock); + } + if (body->mbo_valid & OBD_MD_FLRDEV) + inode->i_rdev = old_decode_dev(body->mbo_rdev); + + if (body->mbo_valid & OBD_MD_FLID) { + /* FID shouldn't be changed! */ + if (fid_is_sane(&lli->lli_fid)) { + LASSERTF(lu_fid_eq(&lli->lli_fid, &body->mbo_fid1), + "Trying to change FID "DFID + " to the "DFID", inode "DFID"(%p)\n", + PFID(&lli->lli_fid), PFID(&body->mbo_fid1), + PFID(ll_inode2fid(inode)), inode); + } else { + lli->lli_fid = body->mbo_fid1; + } + } + + LASSERT(fid_seq(&lli->lli_fid) != 0); + + /* In case of encrypted file without the key, please do not lose + * clear text size stored into lli_lazysize in ll_merge_attr(), + * we will need it in ll_prepare_close(). 
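+	 *
+	 * That is why OBD_MD_FLLAZYSIZE is OR-ed back into lli_attr_valid
+	 * below even when the reply body does not carry the flag.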
+ */ + if (lli->lli_attr_valid & OBD_MD_FLLAZYSIZE && lli->lli_lazysize && + llcrypt_require_key(inode) == -ENOKEY) + lli->lli_attr_valid = body->mbo_valid | OBD_MD_FLLAZYSIZE; + else + lli->lli_attr_valid = body->mbo_valid; + if (body->mbo_valid & OBD_MD_FLSIZE) { + i_size_write(inode, body->mbo_size); + + CDEBUG(D_VFSTRACE, "inode="DFID", updating i_size %llu\n", + PFID(ll_inode2fid(inode)), + (unsigned long long)body->mbo_size); + + if (body->mbo_valid & OBD_MD_FLBLOCKS) + inode->i_blocks = body->mbo_blocks; + } else { + if (body->mbo_valid & OBD_MD_FLLAZYSIZE) + lli->lli_lazysize = body->mbo_size; + if (body->mbo_valid & OBD_MD_FLLAZYBLOCKS) + lli->lli_lazyblocks = body->mbo_blocks; + } + + if (body->mbo_valid & OBD_MD_TSTATE) { + /* Set LLIF_FILE_RESTORING if restore ongoing and + * clear it when done to ensure to start again + * glimpsing updated attrs + */ + if (body->mbo_t_state & MS_RESTORE) + set_bit(LLIF_FILE_RESTORING, &lli->lli_flags); + else + clear_bit(LLIF_FILE_RESTORING, &lli->lli_flags); + } + + return 0; +} + +/* child default LMV is inherited from parent */ +static inline bool ll_default_lmv_inherited(struct lmv_stripe_md *pdmv, + struct lmv_stripe_md *cdmv) +{ + if (!pdmv || !cdmv) + return false; + + if (pdmv->lsm_md_magic != cdmv->lsm_md_magic || + pdmv->lsm_md_stripe_count != cdmv->lsm_md_stripe_count || + pdmv->lsm_md_master_mdt_index != cdmv->lsm_md_master_mdt_index || + pdmv->lsm_md_hash_type != cdmv->lsm_md_hash_type) + return false; + + if (cdmv->lsm_md_max_inherit != + lmv_inherit_next(pdmv->lsm_md_max_inherit)) + return false; + + if (cdmv->lsm_md_max_inherit_rr != + lmv_inherit_rr_next(pdmv->lsm_md_max_inherit_rr)) + return false; + + return true; +} + +/* update directory depth to ROOT, called after LOOKUP lock is fetched. */ +void ll_update_dir_depth(struct inode *dir, struct inode *inode) +{ + struct ll_inode_info *plli; + struct ll_inode_info *lli; + + if (!S_ISDIR(inode->i_mode)) + return; + + if (inode == dir) + return; + + plli = ll_i2info(dir); + lli = ll_i2info(inode); + lli->lli_dir_depth = plli->lli_dir_depth + 1; + if (plli->lli_default_lsm_md && lli->lli_default_lsm_md) { + down_read(&plli->lli_lsm_sem); + down_read(&lli->lli_lsm_sem); + if (ll_default_lmv_inherited(plli->lli_default_lsm_md, + lli->lli_default_lsm_md)) + lli->lli_inherit_depth = + plli->lli_inherit_depth + 1; + else + lli->lli_inherit_depth = 0; + up_read(&lli->lli_lsm_sem); + up_read(&plli->lli_lsm_sem); + } else { + lli->lli_inherit_depth = 0; + } + + CDEBUG(D_INODE, DFID" depth %hu default LMV depth %hu\n", + PFID(&lli->lli_fid), lli->lli_dir_depth, lli->lli_inherit_depth); +} + +void ll_truncate_inode_pages_final(struct inode *inode) +{ + struct address_space *mapping = &inode->i_data; + unsigned long nrpages; + unsigned long flags; + + truncate_inode_pages_final(mapping); + + /* Workaround for LU-118: Note nrpages may not be totally updated when + * truncate_inode_pages() returns, as there can be a page in the process + * of deletion (inside __delete_from_page_cache()) in the specified + * range. Thus mapping->nrpages can be non-zero when this function + * returns even after truncation of the whole mapping. Only do this if + * npages isn't already zero. 
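+	 *
+	 * Re-reading nrpages under the i_pages lock makes the check stable:
+	 * __delete_from_page_cache() updates nrpages while holding that
+	 * lock, so once it is taken here no deletion is still in flight.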
+ */ + nrpages = mapping->nrpages; + if (nrpages) { + ll_xa_lock_irqsave(&mapping->i_pages, flags); + nrpages = mapping->nrpages; + ll_xa_unlock_irqrestore(&mapping->i_pages, flags); + } /* Workaround end */ + + LASSERTF(nrpages == 0, "%s: inode="DFID"(%p) nrpages=%lu, " + "see https://jira.whamcloud.com/browse/LU-118\n", + ll_i2sbi(inode)->ll_fsname, + PFID(ll_inode2fid(inode)), inode, nrpages); +} + +int ll_read_inode2(struct inode *inode, void *opaque) +{ + struct lustre_md *md = opaque; + struct ll_inode_info *lli = ll_i2info(inode); + int rc; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n", + PFID(&lli->lli_fid), inode); + + /* Core attributes from the MDS first. This is a new inode, and + * the VFS doesn't zero times in the core inode so we have to do + * it ourselves. They will be overwritten by either MDS or OST + * attributes - we just need to make sure they aren't newer. + */ + inode->i_mtime.tv_sec = 0; + inode->i_atime.tv_sec = 0; + inode->i_ctime.tv_sec = 0; + inode->i_rdev = 0; + rc = ll_update_inode(inode, md); + if (rc != 0) + RETURN(rc); + + /* OIDEBUG(inode); */ + +#ifdef HAVE_BACKING_DEV_INFO + /* initializing backing dev info. */ + inode->i_mapping->backing_dev_info = &s2lsi(inode->i_sb)->lsi_bdi; +#endif + if (S_ISREG(inode->i_mode)) { + struct ll_sb_info *sbi = ll_i2sbi(inode); + inode->i_op = &ll_file_inode_operations; + inode->i_fop = sbi->ll_fop; + inode->i_mapping->a_ops = (struct address_space_operations *)&ll_aops; + EXIT; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ll_dir_inode_operations; + inode->i_fop = &ll_dir_operations; + EXIT; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &ll_fast_symlink_inode_operations; + EXIT; + } else { + inode->i_op = &ll_special_inode_operations; + + init_special_inode(inode, inode->i_mode, + inode->i_rdev); + + EXIT; + } + + return 0; +} + +void ll_delete_inode(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + ENTRY; + + if (S_ISREG(inode->i_mode) && lli->lli_clob != NULL) { + /* It is last chance to write out dirty pages, + * otherwise we may lose data while umount. + * + * If i_nlink is 0 then just discard data. This is safe because + * local inode gets i_nlink 0 from server only for the last + * unlink, so that file is not opened somewhere else + */ + cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, inode->i_nlink ? 
+ CL_FSYNC_LOCAL : CL_FSYNC_DISCARD, 1); + } + + ll_truncate_inode_pages_final(inode); + ll_clear_inode(inode); + clear_inode(inode); + + EXIT; +} + +int ll_iocontrol(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + int rc, flags = 0; + ENTRY; + + switch (cmd) { + case FS_IOC_GETFLAGS: { + struct mdt_body *body; + struct md_op_data *op_data; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, + 0, 0, LUSTRE_OPC_ANY, + NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_valid = OBD_MD_FLFLAGS; + rc = md_getattr(sbi->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc) { + CERROR("%s: failure inode "DFID": rc = %d\n", + sbi->ll_md_exp->exp_obd->obd_name, + PFID(ll_inode2fid(inode)), rc); + RETURN(-abs(rc)); + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + + flags = body->mbo_flags; + /* if Lustre specific LUSTRE_ENCRYPT_FL flag is set, also set + * ext4 equivalent to please lsattr and other e2fsprogs tools + */ + if (flags & LUSTRE_ENCRYPT_FL) + flags |= STATX_ATTR_ENCRYPTED; + + ptlrpc_req_finished(req); + + RETURN(put_user(flags, (int __user *)arg)); + } + case FS_IOC_SETFLAGS: { + struct iattr *attr; + struct md_op_data *op_data; + struct cl_object *obj; + struct fsxattr fa = { 0 }; + + if (get_user(flags, (int __user *)arg)) + RETURN(-EFAULT); + + fa.fsx_projid = ll_i2info(inode)->lli_projid; + if (flags & LUSTRE_PROJINHERIT_FL) + fa.fsx_xflags = FS_XFLAG_PROJINHERIT; + + rc = ll_ioctl_check_project(inode, fa.fsx_xflags, + fa.fsx_projid); + if (rc) + RETURN(rc); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_attr_flags = flags; + op_data->op_xvalid |= OP_XVALID_FLAGS; + rc = md_setattr(sbi->ll_md_exp, op_data, NULL, 0, &req); + ll_finish_md_op_data(op_data); + ptlrpc_req_finished(req); + if (rc) + RETURN(rc); + + ll_update_inode_flags(inode, flags); + + obj = ll_i2info(inode)->lli_clob; + if (obj == NULL) + RETURN(0); + + OBD_ALLOC_PTR(attr); + if (attr == NULL) + RETURN(-ENOMEM); + + rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS, flags); + + OBD_FREE_PTR(attr); + RETURN(rc); + } + default: + RETURN(-ENOSYS); + } + + RETURN(0); +} + +int ll_flush_ctx(struct inode *inode) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + + CDEBUG(D_SEC, "flush context for user %d\n", + from_kuid(&init_user_ns, current_uid())); + + obd_set_info_async(NULL, sbi->ll_md_exp, + sizeof(KEY_FLUSH_CTX), KEY_FLUSH_CTX, + 0, NULL, NULL); + obd_set_info_async(NULL, sbi->ll_dt_exp, + sizeof(KEY_FLUSH_CTX), KEY_FLUSH_CTX, + 0, NULL, NULL); + return 0; +} + +/* umount -f client means force down, don't save state */ +void ll_umount_begin(struct super_block *sb) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct obd_device *obd; + struct obd_ioctl_data *ioc_data; + int cnt; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op: superblock %p count %d active %d\n", sb, + sb->s_count, atomic_read(&sb->s_active)); + + obd = class_exp2obd(sbi->ll_md_exp); + if (obd == NULL) { + CERROR("Invalid MDC connection handle %#llx\n", + sbi->ll_md_exp->exp_handle.h_cookie); + EXIT; + return; + } + obd->obd_force = 1; + + obd = class_exp2obd(sbi->ll_dt_exp); + if (obd == NULL) { + CERROR("Invalid LOV connection handle %#llx\n", + sbi->ll_dt_exp->exp_handle.h_cookie); + EXIT; + return; + } + obd->obd_force = 1; + + OBD_ALLOC_PTR(ioc_data); + if (ioc_data) { + 
obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_md_exp,
+			      sizeof *ioc_data, ioc_data, NULL);
+
+		obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_dt_exp,
+			      sizeof *ioc_data, ioc_data, NULL);
+
+		OBD_FREE_PTR(ioc_data);
+	}
+
+	/* Really, we'd like to wait until there are no requests outstanding,
+	 * and then continue. For now, we just periodically check whether the
+	 * VFS has decremented mnt_cnt, and hope to finish within 10 seconds.
+	 */
+	cnt = 10;
+	while (cnt > 0 &&
+	       !may_umount(sbi->ll_mnt.mnt)) {
+		ssleep(1);
+		cnt -= 1;
+	}
+
+	EXIT;
+}
+
+int ll_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+	char *profilenm = get_profile_name(sb);
+	int err;
+	__u32 read_only;
+
+	if ((*flags & MS_RDONLY) != (sb->s_flags & SB_RDONLY)) {
+		read_only = *flags & MS_RDONLY;
+		err = obd_set_info_async(NULL, sbi->ll_md_exp,
+					 sizeof(KEY_READ_ONLY),
+					 KEY_READ_ONLY, sizeof(read_only),
+					 &read_only, NULL);
+		if (err) {
+			LCONSOLE_WARN("Failed to remount %s %s (%d)\n",
+				      profilenm, read_only ?
+				      "read-only" : "read-write", err);
+			return err;
+		}
+
+		if (read_only)
+			sb->s_flags |= SB_RDONLY;
+		else
+			sb->s_flags &= ~SB_RDONLY;
+
+		if (test_bit(LL_SBI_VERBOSE, sbi->ll_flags))
+			LCONSOLE_WARN("Remounted %s %s\n", profilenm,
+				      read_only ? "read-only" : "read-write");
+	}
+	return 0;
+}
+
+/**
+ * Cleanup the open handle that is cached on MDT-side.
+ *
+ * In the open case, the client-side open handling thread may hit an error
+ * after the MDT has granted the open. In that case, the client should
+ * send a close RPC to the MDT as cleanup; otherwise, the open handle is
+ * leaked on the MDT until the client unmounts or is evicted.
+ *
+ * Furthermore, if someone unlinked the file, because the open handle
+ * holds a reference on the file/object, it will block subsequent
+ * threads that want to locate the object via FID.
+ *
+ * \param[in] sb	super block for this file-system
+ * \param[in] pill	capsule of the original open request
+ */
+void ll_open_cleanup(struct super_block *sb, struct req_capsule *pill)
+{
+	struct mdt_body *body;
+	struct md_op_data *op_data;
+	struct ptlrpc_request *close_req = NULL;
+	struct obd_export *exp = ll_s2sbi(sb)->ll_md_exp;
+	ENTRY;
+
+	body = req_capsule_server_get(pill, &RMF_MDT_BODY);
+	OBD_ALLOC_PTR(op_data);
+	if (op_data == NULL) {
+		CWARN("%s: cannot allocate op_data to release open handle for "
+		      DFID"\n", ll_s2sbi(sb)->ll_fsname, PFID(&body->mbo_fid1));
+
+		RETURN_EXIT;
+	}
+
+	op_data->op_fid1 = body->mbo_fid1;
+	op_data->op_open_handle = body->mbo_open_handle;
+	op_data->op_mod_time = ktime_get_real_seconds();
+	md_close(exp, op_data, NULL, &close_req);
+	ptlrpc_req_finished(close_req);
+	ll_finish_md_op_data(op_data);
+
+	EXIT;
+}
+
+/* Set filesystem-wide default LMV for subdir mount if it's enabled on ROOT.
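+ * A subdir mount has a root inode whose FID is not the filesystem ROOT
+ * FID, which is what the !fid_is_root() assertion below relies on.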
*/ +static int ll_fileset_default_lmv_fixup(struct inode *inode, + struct lustre_md *md) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + union lmv_mds_md *lmm = NULL; + int size = 0; + int rc; + + LASSERT(is_root_inode(inode)); + LASSERT(!fid_is_root(&sbi->ll_root_fid)); + LASSERT(!md->default_lmv); + + rc = ll_dir_get_default_layout(inode, (void **)&lmm, &size, &req, + OBD_MD_DEFAULT_MEA, + GET_DEFAULT_LAYOUT_ROOT); + if (rc && rc != -ENODATA) + GOTO(out, rc); + + rc = 0; + if (lmm && size) { + rc = md_unpackmd(sbi->ll_md_exp, &md->default_lmv, lmm, size); + if (rc < 0) + GOTO(out, rc); + + rc = 0; + } + EXIT; +out: + if (req) + ptlrpc_req_finished(req); + return rc; +} + +int ll_prep_inode(struct inode **inode, struct req_capsule *pill, + struct super_block *sb, struct lookup_intent *it) +{ + struct ll_sb_info *sbi = NULL; + struct lustre_md md = { NULL }; + bool default_lmv_deleted = false; + int rc; + + ENTRY; + + LASSERT(*inode || sb); + sbi = sb ? ll_s2sbi(sb) : ll_i2sbi(*inode); + rc = md_get_lustre_md(sbi->ll_md_exp, pill, sbi->ll_dt_exp, + sbi->ll_md_exp, &md); + if (rc != 0) + GOTO(out, rc); + + /* + * clear default_lmv only if intent_getattr reply doesn't contain it. + * but it needs to be done after iget, check this early because + * ll_update_lsm_md() may change md. + */ + if (it && (it->it_op & (IT_LOOKUP | IT_GETATTR)) && + S_ISDIR(md.body->mbo_mode) && !md.default_lmv) { + if (unlikely(*inode && is_root_inode(*inode) && + !fid_is_root(&sbi->ll_root_fid))) { + rc = ll_fileset_default_lmv_fixup(*inode, &md); + if (rc) + GOTO(out, rc); + } + + if (!md.default_lmv) + default_lmv_deleted = true; + } + + if (*inode) { + rc = ll_update_inode(*inode, &md); + if (rc != 0) + GOTO(out, rc); + } else { + bool api32 = test_bit(LL_SBI_32BIT_API, sbi->ll_flags); + struct lu_fid *fid1 = &md.body->mbo_fid1; + + LASSERT(sb != NULL); + + /* + * At this point server returns to client's same fid as client + * generated for creating. So using ->fid1 is okay here. + */ + if (!fid_is_sane(fid1)) { + CERROR("%s: Fid is insane "DFID"\n", + sbi->ll_fsname, PFID(fid1)); + GOTO(out, rc = -EINVAL); + } + + *inode = ll_iget(sb, cl_fid_build_ino(fid1, api32), &md); + if (IS_ERR(*inode)) { + lmd_clear_acl(&md); + rc = IS_ERR(*inode) ? PTR_ERR(*inode) : -ENOMEM; + *inode = NULL; + CERROR("new_inode -fatal: rc %d\n", rc); + GOTO(out, rc); + } + } + + /* Handling piggyback layout lock. + * Layout lock can be piggybacked by getattr and open request. + * The lsm can be applied to inode only if it comes with a layout lock + * otherwise correct layout may be overwritten, for example: + * 1. proc1: mdt returns a lsm but not granting layout + * 2. layout was changed by another client + * 3. proc2: refresh layout and layout lock granted + * 4. 
proc1: to apply a stale layout */ + if (it != NULL && it->it_lock_mode != 0) { + struct lustre_handle lockh; + struct ldlm_lock *lock; + + lockh.cookie = it->it_lock_handle; + lock = ldlm_handle2lock(&lockh); + LASSERT(lock != NULL); + if (ldlm_has_layout(lock)) { + struct cl_object_conf conf; + + memset(&conf, 0, sizeof(conf)); + conf.coc_opc = OBJECT_CONF_SET; + conf.coc_inode = *inode; + conf.coc_lock = lock; + conf.u.coc_layout = md.layout; + (void)ll_layout_conf(*inode, &conf); + } + LDLM_LOCK_PUT(lock); + } + + if (default_lmv_deleted) + ll_update_default_lsm_md(*inode, &md); + + /* we may want to apply some policy for foreign file/dir */ + if (ll_sbi_has_foreign_symlink(sbi)) { + rc = ll_manage_foreign(*inode, &md); + if (rc < 0) + GOTO(out, rc); + } + + GOTO(out, rc = 0); + +out: + /* cleanup will be done if necessary */ + md_free_lustre_md(sbi->ll_md_exp, &md); + + if (rc != 0 && it != NULL && it->it_op & IT_OPEN) { + ll_intent_drop_lock(it); + ll_open_cleanup(sb != NULL ? sb : (*inode)->i_sb, pill); + } + + return rc; +} + +int ll_obd_statfs(struct inode *inode, void __user *arg) +{ + struct ll_sb_info *sbi = NULL; + struct obd_export *exp; + struct obd_ioctl_data *data = NULL; + __u32 type; + int len = 0, rc; + + if (inode) + sbi = ll_i2sbi(inode); + if (!sbi) + GOTO(out_statfs, rc = -EINVAL); + + rc = obd_ioctl_getdata(&data, &len, arg); + if (rc) + GOTO(out_statfs, rc); + + if (!data->ioc_inlbuf1 || !data->ioc_inlbuf2 || + !data->ioc_pbuf1 || !data->ioc_pbuf2) + GOTO(out_statfs, rc = -EINVAL); + + if (data->ioc_inllen1 != sizeof(__u32) || + data->ioc_inllen2 != sizeof(__u32) || + data->ioc_plen1 != sizeof(struct obd_statfs) || + data->ioc_plen2 != sizeof(struct obd_uuid)) + GOTO(out_statfs, rc = -EINVAL); + + memcpy(&type, data->ioc_inlbuf1, sizeof(__u32)); + if (type & LL_STATFS_LMV) + exp = sbi->ll_md_exp; + else if (type & LL_STATFS_LOV) + exp = sbi->ll_dt_exp; + else + GOTO(out_statfs, rc = -ENODEV); + + rc = obd_iocontrol(IOC_OBD_STATFS, exp, len, data, NULL); + if (rc) + GOTO(out_statfs, rc); +out_statfs: + OBD_FREE_LARGE(data, len); + return rc; +} + +/* + * this is normally called in ll_fini_md_op_data(), but sometimes it needs to + * be called early to avoid deadlock. + */ +void ll_unlock_md_op_lsm(struct md_op_data *op_data) +{ + if (op_data->op_mea2_sem) { + up_read_non_owner(op_data->op_mea2_sem); + op_data->op_mea2_sem = NULL; + } + + if (op_data->op_mea1_sem) { + up_read_non_owner(op_data->op_mea1_sem); + op_data->op_mea1_sem = NULL; + } +} + +/* this function prepares md_op_data hint for passing it down to MD stack. */ +struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, + struct inode *i1, struct inode *i2, + const char *name, size_t namelen, + __u32 mode, enum md_op_code opc, + void *data) +{ + struct llcrypt_name fname = { 0 }; + int rc; + + LASSERT(i1 != NULL); + + if (name == NULL) { + /* Do not reuse namelen for something else. 
*/ + if (namelen != 0) + return ERR_PTR(-EINVAL); + } else { + if ((!IS_ENCRYPTED(i1) || + (opc != LUSTRE_OPC_LOOKUP && opc != LUSTRE_OPC_CREATE)) && + namelen > ll_i2sbi(i1)->ll_namelen) + return ERR_PTR(-ENAMETOOLONG); + + /* "/" is not valid name, but it's allowed */ + if (!lu_name_is_valid_2(name, namelen) && + strncmp("/", name, namelen) != 0) + return ERR_PTR(-EINVAL); + } + + if (op_data == NULL) + OBD_ALLOC_PTR(op_data); + + if (op_data == NULL) + return ERR_PTR(-ENOMEM); + + ll_i2gids(op_data->op_suppgids, i1, i2); + /* If the client is using a subdir mount and looks at what it sees as + * /.fscrypt, interpret it as the .fscrypt dir at the root of the fs. + */ + if (unlikely(i1->i_sb && i1->i_sb->s_root && is_root_inode(i1) && + !fid_is_root(ll_inode2fid(i1)) && + name && namelen == strlen(dot_fscrypt_name) && + strncmp(name, dot_fscrypt_name, namelen) == 0)) + lu_root_fid(&op_data->op_fid1); + else + op_data->op_fid1 = *ll_inode2fid(i1); + + if (S_ISDIR(i1->i_mode)) { + down_read_non_owner(&ll_i2info(i1)->lli_lsm_sem); + op_data->op_mea1_sem = &ll_i2info(i1)->lli_lsm_sem; + op_data->op_mea1 = ll_i2info(i1)->lli_lsm_md; + op_data->op_default_mea1 = ll_i2info(i1)->lli_default_lsm_md; + } + + if (i2) { + op_data->op_fid2 = *ll_inode2fid(i2); + if (S_ISDIR(i2->i_mode)) { + if (i2 != i1) { + /* i2 is typically a child of i1, and MUST be + * further from the root to avoid deadlocks. + */ + down_read_non_owner(&ll_i2info(i2)->lli_lsm_sem); + op_data->op_mea2_sem = + &ll_i2info(i2)->lli_lsm_sem; + } + op_data->op_mea2 = ll_i2info(i2)->lli_lsm_md; + } + } else { + fid_zero(&op_data->op_fid2); + } + + if (test_bit(LL_SBI_64BIT_HASH, ll_i2sbi(i1)->ll_flags)) + op_data->op_cli_flags |= CLI_HASH64; + + if (ll_need_32bit_api(ll_i2sbi(i1))) + op_data->op_cli_flags |= CLI_API32; + + if ((i2 && is_root_inode(i2)) || + opc == LUSTRE_OPC_LOOKUP || opc == LUSTRE_OPC_CREATE) { + /* In case of lookup, ll_setup_filename() has already been + * called in ll_lookup_it(), so just take provided name. + * Also take provided name if we are dealing with root inode. 
+ */ + fname.disk_name.name = (unsigned char *)name; + fname.disk_name.len = namelen; + } else if (name && namelen) { + struct qstr dname = QSTR_INIT(name, namelen); + struct inode *dir; + struct lu_fid *pfid = NULL; + struct lu_fid fid; + int lookup; + + if (!S_ISDIR(i1->i_mode) && i2 && S_ISDIR(i2->i_mode)) { + /* special case when called from ll_link() */ + dir = i2; + lookup = 0; + } else { + dir = i1; + lookup = (int)(opc == LUSTRE_OPC_ANY); + } + if (opc == LUSTRE_OPC_ANY && lookup) + pfid = &fid; + rc = ll_setup_filename(dir, &dname, lookup, &fname, pfid); + if (rc) { + ll_finish_md_op_data(op_data); + return ERR_PTR(rc); + } + if (pfid && !fid_is_zero(pfid)) { + if (i2 == NULL) + op_data->op_fid2 = fid; + op_data->op_bias = MDS_FID_OP; + } + if (fname.disk_name.name && + fname.disk_name.name != (unsigned char *)name) { + /* op_data->op_name must be freed after use */ + op_data->op_flags |= MF_OPNAME_KMALLOCED; + } + } + + /* In fact LUSTRE_OPC_LOOKUP, LUSTRE_OPC_OPEN + * are LUSTRE_OPC_ANY + */ + if (opc == LUSTRE_OPC_LOOKUP || opc == LUSTRE_OPC_OPEN) + op_data->op_code = LUSTRE_OPC_ANY; + else + op_data->op_code = opc; + op_data->op_name = fname.disk_name.name; + op_data->op_namelen = fname.disk_name.len; + op_data->op_mode = mode; + op_data->op_mod_time = ktime_get_real_seconds(); + op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); + op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); + op_data->op_cap = current_cap(); + op_data->op_mds = 0; + if ((opc == LUSTRE_OPC_CREATE) && (name != NULL) && + filename_is_volatile(name, namelen, &op_data->op_mds)) { + op_data->op_bias |= MDS_CREATE_VOLATILE; + } + op_data->op_data = data; + + return op_data; +} + +void ll_finish_md_op_data(struct md_op_data *op_data) +{ + ll_unlock_md_op_lsm(op_data); + ll_security_release_secctx(op_data->op_file_secctx, + op_data->op_file_secctx_size, + op_data->op_file_secctx_slot); + if (op_data->op_flags & MF_OPNAME_KMALLOCED) + /* allocated via ll_setup_filename called + * from ll_prep_md_op_data + */ + kfree(op_data->op_name); + llcrypt_free_ctx(op_data->op_file_encctx, op_data->op_file_encctx_size); + OBD_FREE_PTR(op_data); +} + +int ll_show_options(struct seq_file *seq, struct dentry *dentry) +{ + struct ll_sb_info *sbi; + int i; + + LASSERT(seq && dentry); + sbi = ll_s2sbi(dentry->d_sb); + + if (test_bit(LL_SBI_NOLCK, sbi->ll_flags)) + seq_puts(seq, "nolock"); + + for (i = 1; ll_sbi_flags_name[i].token != LL_SBI_NUM_MOUNT_OPT; i++) { + /* match_table in some cases has patterns for both enabled and + * disabled cases. Ignore 'no'xxx versions if bit is set. + */ + if (test_bit(ll_sbi_flags_name[i].token, sbi->ll_flags) && + strncmp(ll_sbi_flags_name[i].pattern, "no", 2)) { + if (ll_sbi_flags_name[i].token == + LL_SBI_FOREIGN_SYMLINK) { + seq_show_option(seq, "foreign_symlink", + sbi->ll_foreign_symlink_prefix); + } else { + seq_printf(seq, ",%s", + ll_sbi_flags_name[i].pattern); + } + + /* You can have either localflock or flock but not + * both. If localflock is set don't print flock or + * noflock. 
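+			 *
+			 * This assumes flock and noflock are the next two
+			 * table entries after localflock; that is what the
+			 * i += 2 below skips over.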
+ */ + if (ll_sbi_flags_name[i].token == LL_SBI_LOCALFLOCK) + i += 2; + } else if (!test_bit(ll_sbi_flags_name[i].token, sbi->ll_flags) && + !strncmp(ll_sbi_flags_name[i].pattern, "no", 2)) { + seq_printf(seq, ",%s", + ll_sbi_flags_name[i].pattern); + } + } + + llcrypt_show_test_dummy_encryption(seq, ',', dentry->d_sb); + + if (test_bit(LL_SBI_MDLL, sbi->ll_flags)) + seq_puts(seq, ",mdll"); + + if (test_bit(LL_SBI_MDLL_BYPASS, sbi->ll_flags)) + seq_puts(seq, ",mdll_bypass"); + + if (test_bit(LL_SBI_MDLL_AUTO_REFRESH, sbi->ll_flags)) + seq_puts(seq, ",mdll_auto_refresh"); + + RETURN(0); +} + +/** + * Get obd name by cmd, and copy out to user space + */ +int ll_get_obd_name(struct inode *inode, unsigned int cmd, unsigned long arg) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct obd_device *obd; + ENTRY; + + if (cmd == OBD_IOC_GETNAME_OLD || cmd == OBD_IOC_GETDTNAME) + obd = class_exp2obd(sbi->ll_dt_exp); + else if (cmd == OBD_IOC_GETMDNAME) + obd = class_exp2obd(sbi->ll_md_exp); + else + RETURN(-EINVAL); + + if (!obd) + RETURN(-ENOENT); + + if (copy_to_user((void __user *)arg, obd->obd_name, + strlen(obd->obd_name) + 1)) + RETURN(-EFAULT); + + RETURN(0); +} + +struct dname_buf { + struct work_struct db_work; + struct dentry *db_dentry; + /* Let's hope the path is not too long, 32 bytes for the work struct + * on my kernel + */ + char buf[PAGE_SIZE - sizeof(struct work_struct) - sizeof(void *)]; +}; + +static void ll_dput_later(struct work_struct *work) +{ + struct dname_buf *db = container_of(work, struct dname_buf, db_work); + + dput(db->db_dentry); + free_page((unsigned long)db); +} + +static char* ll_d_path(struct dentry *dentry, char *buf, int bufsize) +{ + char *path = NULL; + + struct path p; + + p.dentry = dentry; + p.mnt = current->fs->root.mnt; + path_get(&p); + path = d_path(&p, buf, bufsize); + path_put(&p); + return path; +} + +void ll_dirty_page_discard_warn(struct inode *inode, int ioret) +{ + struct dname_buf *db; + char *path = NULL; + struct dentry *dentry = NULL; + + /* this can be called inside spin lock so use GFP_ATOMIC. */ + db = (struct dname_buf *)__get_free_page(GFP_ATOMIC); + if (db != NULL) { + + dentry = d_find_alias(inode); + if (dentry != NULL) + path = ll_d_path(dentry, db->buf, sizeof(db->buf)); + } + + /* The below message is checked in recovery-small.sh test_24b */ + CDEBUG(D_WARNING, + "%s: dirty page discard: %s/fid: "DFID"/%s may get corrupted " + "(rc %d)\n", ll_i2sbi(inode)->ll_fsname, + s2lsi(inode->i_sb)->lsi_lmd->lmd_dev, + PFID(ll_inode2fid(inode)), + (path && !IS_ERR(path)) ? 
path : "", ioret);
+
+	if (dentry != NULL) {
+		/* We cannot dput here since if we happen to be the last holder
+		 * then we can end up waiting for page evictions that
+		 * in turn wait for RPCs that need this instance of ptlrpcd
+		 * (calling brw_interpret->*page_completion*->vmpage_error->
+		 * here), see LU-15340.
+		 */
+		INIT_WORK(&db->db_work, ll_dput_later);
+		db->db_dentry = dentry;
+		schedule_work(&db->db_work);
+	} else {
+		if (db != NULL)
+			free_page((unsigned long)db);
+	}
+}
+
+ssize_t ll_copy_user_md(const struct lov_user_md __user *md,
+			struct lov_user_md **kbuf)
+{
+	struct lov_user_md lum;
+	ssize_t lum_size;
+	ENTRY;
+
+	if (copy_from_user(&lum, md, sizeof(lum)))
+		RETURN(-EFAULT);
+
+	lum_size = ll_lov_user_md_size(&lum);
+	if (lum_size < 0)
+		RETURN(lum_size);
+
+	OBD_ALLOC_LARGE(*kbuf, lum_size);
+	if (*kbuf == NULL)
+		RETURN(-ENOMEM);
+
+	if (copy_from_user(*kbuf, md, lum_size) != 0) {
+		OBD_FREE_LARGE(*kbuf, lum_size);
+		RETURN(-EFAULT);
+	}
+
+	RETURN(lum_size);
+}
+
+/*
+ * Compute llite root squash state after a change of the root squash
+ * configuration or the addition/removal of an LNet NID
+ */
+void ll_compute_rootsquash_state(struct ll_sb_info *sbi)
+{
+	struct root_squash_info *squash = &sbi->ll_squash;
+	int i;
+	bool matched;
+	struct lnet_processid id;
+
+	/* Update norootsquash flag */
+	spin_lock(&squash->rsi_lock);
+	if (list_empty(&squash->rsi_nosquash_nids))
+		clear_bit(LL_SBI_NOROOTSQUASH, sbi->ll_flags);
+	else {
+		/* Do not apply root squash as soon as one of our NIDs is
+		 * in the nosquash_nids list */
+		matched = false;
+		i = 0;
+		while (LNetGetId(i++, &id) != -ENOENT) {
+			if (nid_is_lo0(&id.nid))
+				continue;
+			if (cfs_match_nid(lnet_nid_to_nid4(&id.nid),
+					  &squash->rsi_nosquash_nids)) {
+				matched = true;
+				break;
+			}
+		}
+		if (matched)
+			set_bit(LL_SBI_NOROOTSQUASH, sbi->ll_flags);
+		else
+			clear_bit(LL_SBI_NOROOTSQUASH, sbi->ll_flags);
+	}
+	spin_unlock(&squash->rsi_lock);
+}
+
+/**
+ * Parse linkea content to extract information about a given hardlink
+ *
+ * \param[in] ldata - Initialized linkea data
+ * \param[in] linkno - Link identifier
+ * \param[out] parent_fid - The entry's parent FID
+ * \param[out] ln - Entry name destination buffer
+ *
+ * \retval 0 on success
+ * \retval Appropriate negative error code on failure
+ */
+static int ll_linkea_decode(struct linkea_data *ldata, unsigned int linkno,
+			    struct lu_fid *parent_fid, struct lu_name *ln)
+{
+	unsigned int idx;
+	int rc;
+	ENTRY;
+
+	rc = linkea_init_with_rec(ldata);
+	if (rc < 0)
+		RETURN(rc);
+
+	if (linkno >= ldata->ld_leh->leh_reccount)
+		/* beyond last link */
+		RETURN(-ENODATA);
+
+	linkea_first_entry(ldata);
+	for (idx = 0; ldata->ld_lee != NULL; idx++) {
+		linkea_entry_unpack(ldata->ld_lee, &ldata->ld_reclen, ln,
+				    parent_fid);
+		if (idx == linkno)
+			break;
+
+		linkea_next_entry(ldata);
+	}
+
+	if (idx < linkno)
+		RETURN(-ENODATA);
+
+	RETURN(0);
+}
+
+/**
+ * Get parent FID and name of an identified link. Operation is performed for
+ * a given link number, letting the caller iterate over linkno to list one or
+ * all links of an entry.
+ *
+ * \param[in] file - File descriptor against which to perform the operation
+ * \param[in,out] arg - User-filled structure containing the linkno to operate
+ *		    on and the available size.
It is eventually filled with + * the requested information or left untouched on error + * + * \retval - 0 on success + * \retval - Appropriate negative error code on failure + */ +int ll_getparent(struct file *file, struct getparent __user *arg) +{ + struct inode *inode = file_inode(file); + struct linkea_data *ldata; + struct lu_buf buf = LU_BUF_NULL; + struct lu_name ln; + struct lu_fid parent_fid; + __u32 linkno; + __u32 name_size; + int rc; + + ENTRY; + + if (!capable(CAP_DAC_READ_SEARCH) && + !test_bit(LL_SBI_USER_FID2PATH, ll_i2sbi(inode)->ll_flags)) + RETURN(-EPERM); + + if (get_user(name_size, &arg->gp_name_size)) + RETURN(-EFAULT); + + if (get_user(linkno, &arg->gp_linkno)) + RETURN(-EFAULT); + + if (name_size > PATH_MAX) + RETURN(-EINVAL); + + OBD_ALLOC(ldata, sizeof(*ldata)); + if (ldata == NULL) + RETURN(-ENOMEM); + + rc = linkea_data_new(ldata, &buf); + if (rc < 0) + GOTO(ldata_free, rc); + + rc = ll_xattr_list(inode, XATTR_NAME_LINK, XATTR_TRUSTED_T, buf.lb_buf, + buf.lb_len, OBD_MD_FLXATTR); + if (rc < 0) + GOTO(lb_free, rc); + + rc = ll_linkea_decode(ldata, linkno, &parent_fid, &ln); + if (rc < 0) + GOTO(lb_free, rc); + + if (ln.ln_namelen >= name_size) + GOTO(lb_free, rc = -EOVERFLOW); + + if (copy_to_user(&arg->gp_fid, &parent_fid, sizeof(arg->gp_fid))) + GOTO(lb_free, rc = -EFAULT); + + if (copy_to_user(&arg->gp_name, ln.ln_name, ln.ln_namelen)) + GOTO(lb_free, rc = -EFAULT); + + if (put_user('\0', arg->gp_name + ln.ln_namelen)) + GOTO(lb_free, rc = -EFAULT); + +lb_free: + lu_buf_free(&buf); +ldata_free: + OBD_FREE(ldata, sizeof(*ldata)); + + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c b/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c new file mode 100644 index 0000000000000..16d73ebd71146 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c @@ -0,0 +1,616 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include "llite_internal.h" +#include + +static const struct vm_operations_struct ll_file_vm_ops; + +void policy_from_vma(union ldlm_policy_data *policy, struct vm_area_struct *vma, + unsigned long addr, size_t count) +{ + policy->l_extent.start = ((addr - vma->vm_start) & PAGE_MASK) + + (vma->vm_pgoff << PAGE_SHIFT); + policy->l_extent.end = (policy->l_extent.start + count - 1) | + ~PAGE_MASK; +} + +/* + * Linux commit v6.0-rc3-225-gf39af05949a4 + * mm: add VMA iterator + */ +#ifndef VMA_ITERATOR +#define vma_iterator vm_area_struct * +#define vma_iter_init(vmip, mm, addr) *(vmip) = find_vma(mm, addr) +#define for_each_vma(vmi, vma) \ + for (vma = vmi; vma != NULL; vma = vma->vm_next) +#endif + +struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr, + size_t count) +{ + struct vm_area_struct *vma, *ret = NULL; + struct vma_iterator vmi; + + ENTRY; + + /* mmap_lock must have been held by caller. */ + LASSERT(!mmap_write_trylock(mm)); + + vma_iter_init(&vmi, mm, addr); + for_each_vma(vmi, vma) { + if (vma->vm_start < (addr + count)) + break; + if (vma->vm_ops && vma->vm_ops == &ll_file_vm_ops && + vma->vm_flags & VM_SHARED) { + ret = vma; + break; + } + } + RETURN(ret); +} + +/** + * API independent part for page fault initialization. + * \param env - corespondent lu_env to processing + * \param vma - virtual memory area addressed to page fault + * \param index - page index corespondent to fault. + * \param mkwrite - whether it is mmap write. + * + * \return error codes from cl_io_init. + */ +static struct cl_io * +ll_fault_io_init(struct lu_env *env, struct vm_area_struct *vma, + pgoff_t index, bool mkwrite) +{ + struct file *file = vma->vm_file; + struct inode *inode = file_inode(file); + struct cl_io *io; + struct cl_fault_io *fio; + int rc; + ENTRY; + + if (ll_file_nolock(file)) + RETURN(ERR_PTR(-EOPNOTSUPP)); + +restart: + io = vvp_env_thread_io(env); + io->ci_obj = ll_i2info(inode)->lli_clob; + LASSERT(io->ci_obj != NULL); + + fio = &io->u.ci_fault; + fio->ft_index = index; + fio->ft_executable = vma->vm_flags & VM_EXEC; + + if (mkwrite) { + fio->ft_mkwrite = 1; + fio->ft_writable = 1; + } + + CDEBUG(D_MMAP, + DFID": vma=%p start=%#lx end=%#lx vm_flags=%#lx idx=%lu\n", + PFID(&ll_i2info(inode)->lli_fid), vma, vma->vm_start, + vma->vm_end, vma->vm_flags, fio->ft_index); + + if (vma->vm_flags & VM_SEQ_READ) + io->ci_seq_read = 1; + else if (vma->vm_flags & VM_RAND_READ) + io->ci_rand_read = 1; + + rc = cl_io_init(env, io, CIT_FAULT, io->ci_obj); + if (rc == 0) { + struct vvp_io *vio = vvp_env_io(env); + struct ll_file_data *fd = file->private_data; + + LASSERT(vio->vui_cl.cis_io == io); + + /* mmap lock must be MANDATORY it has to cache + * pages. 
*/ + io->ci_lockreq = CILR_MANDATORY; + vio->vui_fd = fd; + } else { + cl_io_fini(env, io); + if (io->ci_need_restart) + goto restart; + + io = ERR_PTR(rc); + } + + RETURN(io); +} + +/* Sharing code of page_mkwrite method for rhel5 and rhel6 */ +static int ll_page_mkwrite0(struct vm_area_struct *vma, struct page *vmpage, + bool *retry) +{ + struct lu_env *env; + struct cl_io *io; + struct vvp_io *vio; + int result; + __u16 refcheck; + sigset_t old, new; + struct inode *inode = NULL; + struct ll_inode_info *lli; + ENTRY; + + LASSERT(vmpage != NULL); + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = ll_fault_io_init(env, vma, vmpage->index, true); + if (IS_ERR(io)) + GOTO(out, result = PTR_ERR(io)); + + result = io->ci_result; + if (result < 0) + GOTO(out_io, result); + + vio = vvp_env_io(env); + vio->u.fault.ft_vma = vma; + vio->u.fault.ft_vmpage = vmpage; + + siginitsetinv(&new, sigmask(SIGKILL) | sigmask(SIGTERM)); + sigprocmask(SIG_BLOCK, &new, &old); + + inode = vvp_object_inode(io->ci_obj); + lli = ll_i2info(inode); + + result = cl_io_loop(env, io); + + sigprocmask(SIG_SETMASK, &old, NULL); + + if (result == 0) { + lock_page(vmpage); + if (vmpage->mapping == NULL) { + unlock_page(vmpage); + + /* page was truncated and lock was cancelled, return + * ENODATA so that VM_FAULT_NOPAGE will be returned + * to handle_mm_fault(). */ + if (result == 0) + result = -ENODATA; + } else if (!PageDirty(vmpage)) { + /* race, the page has been cleaned by ptlrpcd after + * it was unlocked, it has to be added into dirty + * cache again otherwise this soon-to-dirty page won't + * consume any grants, even worse if this page is being + * transferred because it will break RPC checksum. + */ + unlock_page(vmpage); + + CDEBUG(D_MMAP, "Race on page_mkwrite %p/%lu, page has " + "been written out, retry.\n", + vmpage, vmpage->index); + + *retry = true; + result = -EAGAIN; + } + + if (result == 0) + set_bit(LLIF_DATA_MODIFIED, &lli->lli_flags); + } + EXIT; + +out_io: + cl_io_fini(env, io); +out: + cl_env_put(env, &refcheck); + CDEBUG(D_MMAP, "%s mkwrite with %d\n", current->comm, result); + LASSERT(ergo(result == 0, PageLocked(vmpage))); + + /* if page has been unmapped, presumably due to lock reclaim for + * concurrent usage, add some delay before retrying to prevent + * entering live-lock situation with competitors + */ + if (result == -ENODATA && inode != NULL) { + CDEBUG(D_MMAP, "delaying new page-fault for inode %p to " + "prevent live-lock\n", inode); + msleep(10); + } + + return result; +} + +static inline int to_fault_error(int result) +{ + switch(result) { + case 0: + result = VM_FAULT_LOCKED; + break; + case -ENOMEM: + result = VM_FAULT_OOM; + break; + default: + result = VM_FAULT_SIGBUS; + break; + } + return result; +} + +int ll_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vma->vm_file); + int ret; + unsigned int seq; + + /* this seqlock lets us notice if a page has been deleted on this inode + * during the fault process, allowing us to catch an erroneous SIGBUS + * See LU-16160 + */ + do { + seq = read_seqbegin(&ll_i2info(inode)->lli_page_inv_lock); + ret = __ll_filemap_fault(vma, vmf); + } while (read_seqretry(&ll_i2info(inode)->lli_page_inv_lock, seq) && + (ret & VM_FAULT_SIGBUS)); + + return ret; +} + +/** + * Lustre implementation of a vm_operations_struct::fault() method, called by + * VM to server page fault (both in kernel and user space). 
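+ * When the superblock has fast_read enabled, a fast filemap_fault() pass is
+ * tried first without taking a new DLM lock; the full cl_io fault path below
+ * is only used when that pass misses, errors, or must retry.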
+ * + * \param vma - is virtiual area struct related to page fault + * \param vmf - structure which describe type and address where hit fault + * + * \return allocated and filled _locked_ page for address + * \retval VM_FAULT_ERROR on general error + * \retval NOPAGE_OOM not have memory for allocate new page + */ +static vm_fault_t ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vma->vm_file); + struct lu_env *env; + struct cl_io *io; + struct vvp_io *vio = NULL; + struct page *vmpage; + int result = 0; + int fault_ret = 0; + __u16 refcheck; + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + if (ll_sbi_has_fast_read(ll_i2sbi(inode))) { + /* do fast fault */ + bool allow_retry = vmf->flags & FAULT_FLAG_ALLOW_RETRY; + bool has_retry = vmf->flags & FAULT_FLAG_RETRY_NOWAIT; + + /* To avoid loops, instruct downstream to not drop mmap_sem */ + /** + * only need FAULT_FLAG_ALLOW_RETRY prior to Linux 5.1 + * (6b4c9f4469819), where FAULT_FLAG_RETRY_NOWAIT is enough + * to not drop mmap_sem when failed to lock the page. + */ + vmf->flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; + ll_cl_add(inode, env, NULL, LCC_MMAP); + fault_ret = ll_filemap_fault(vma, vmf); + ll_cl_remove(inode, env); + if (!has_retry) + vmf->flags &= ~FAULT_FLAG_RETRY_NOWAIT; + if (!allow_retry) + vmf->flags &= ~FAULT_FLAG_ALLOW_RETRY; + + /* - If there is no error, then the page was found in cache and + * uptodate; + * - If VM_FAULT_RETRY is set, the page existed but failed to + * lock. We will try slow path to avoid loops. + * - Otherwise, it should try normal fault under DLM lock. */ + if (!(fault_ret & VM_FAULT_RETRY) && + !(fault_ret & VM_FAULT_ERROR)) + GOTO(out, result = 0); + + fault_ret = 0; + } + + io = ll_fault_io_init(env, vma, vmf->pgoff, false); + if (IS_ERR(io)) + GOTO(out, result = PTR_ERR(io)); + + result = io->ci_result; + if (result == 0) { + vio = vvp_env_io(env); + vio->u.fault.ft_vma = vma; + vio->u.fault.ft_vmpage = NULL; + vio->u.fault.ft_vmf = vmf; + vio->u.fault.ft_flags = 0; + vio->u.fault.ft_flags_valid = 0; + + /* May call ll_readpage() */ + ll_cl_add(inode, env, io, LCC_MMAP); + + result = cl_io_loop(env, io); + + ll_cl_remove(inode, env); + + /* ft_flags are only valid if we reached + * the call to filemap_fault */ + if (vio->u.fault.ft_flags_valid) + fault_ret = vio->u.fault.ft_flags; + + vmpage = vio->u.fault.ft_vmpage; + if (result != 0 && vmpage != NULL) { + put_page(vmpage); + vmf->page = NULL; + } + } + cl_io_fini(env, io); + +out: + cl_env_put(env, &refcheck); + if (result != 0 && !(fault_ret & VM_FAULT_RETRY)) + fault_ret |= to_fault_error(result); + + CDEBUG(D_MMAP, "%s fault %d/%d\n", current->comm, fault_ret, result); + RETURN(fault_ret); +} + +#ifdef HAVE_VM_OPS_USE_VM_FAULT_ONLY +static vm_fault_t ll_fault(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; +#else +static vm_fault_t ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ +#endif + int count = 0; + bool printed = false; + bool cached; + vm_fault_t result; + ktime_t kstart = ktime_get(); + sigset_t old, new; + + result = pcc_fault(vma, vmf, &cached); + if (cached) + goto out; + + CDEBUG(D_MMAP|D_IOTRACE, + DFID": vma=%p start=%#lx end=%#lx vm_flags=%#lx idx=%lu\n", + PFID(&ll_i2info(file_inode(vma->vm_file))->lli_fid), + vma, vma->vm_start, vma->vm_end, vma->vm_flags, vmf->pgoff); + + /* Only SIGKILL and SIGTERM is allowed for fault/nopage/mkwrite + * so that it can be killed by admin but not cause 
segfault by + * other signals. + */ + siginitsetinv(&new, sigmask(SIGKILL) | sigmask(SIGTERM)); + sigprocmask(SIG_BLOCK, &new, &old); + + /* make sure offset is not a negative number */ + if (vmf->pgoff > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) + return VM_FAULT_SIGBUS; + +restart: + result = ll_fault0(vma, vmf); + if (vmf->page && + !(result & (VM_FAULT_RETRY | VM_FAULT_ERROR | VM_FAULT_LOCKED))) { + struct page *vmpage = vmf->page; + + /* lock the page, then check if this page has been truncated + * or deleted from Lustre and retry if so + */ + lock_page(vmpage); + if (unlikely(vmpage->mapping == NULL) || + vmpage->private == 0) { /* unlucky */ + unlock_page(vmpage); + put_page(vmpage); + vmf->page = NULL; + + if (!printed && ++count > 16) { + struct inode *inode = file_inode(vma->vm_file); + + CWARN("%s: FID "DFID" under heavy mmap contention by '%s', consider revising IO pattern\n", + ll_i2sbi(inode)->ll_fsname, + PFID(&ll_i2info(inode)->lli_fid), + current->comm); + printed = true; + } + + goto restart; + } + + result |= VM_FAULT_LOCKED; + } + sigprocmask(SIG_SETMASK, &old, NULL); + +out: + if (vmf->page && result == VM_FAULT_LOCKED) { + ll_rw_stats_tally(ll_i2sbi(file_inode(vma->vm_file)), + current->pid, vma->vm_file->private_data, + cl_offset(NULL, vmf->page->index), PAGE_SIZE, + READ); + ll_stats_ops_tally(ll_i2sbi(file_inode(vma->vm_file)), + LPROC_LL_FAULT, + ktime_us_delta(ktime_get(), kstart)); + } + + return result; +} + +#ifdef HAVE_VM_OPS_USE_VM_FAULT_ONLY +static vm_fault_t ll_page_mkwrite(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; +#else +static vm_fault_t ll_page_mkwrite(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ +#endif + int count = 0; + bool printed = false; + bool retry; + bool cached; + ktime_t kstart = ktime_get(); + vm_fault_t result; + + CDEBUG(D_MMAP|D_IOTRACE, + DFID": vma=%p start=%#lx end=%#lx vm_flags=%#lx idx=%lu\n", + PFID(&ll_i2info(file_inode(vma->vm_file))->lli_fid), + vma, vma->vm_start, vma->vm_end, vma->vm_flags, + vmf->page->index); + + result = pcc_page_mkwrite(vma, vmf, &cached); + if (cached) + goto out; + + file_update_time(vma->vm_file); + do { + retry = false; + result = ll_page_mkwrite0(vma, vmf->page, &retry); + + if (!printed && ++count > 16) { + const struct dentry *de = file_dentry(vma->vm_file); + + CWARN("app(%s): the page %lu of file "DFID" is under heavy contention\n", + current->comm, vmf->pgoff, + PFID(ll_inode2fid(de->d_inode))); + printed = true; + } + } while (retry); + + switch (result) { + case 0: + LASSERT(PageLocked(vmf->page)); + result = VM_FAULT_LOCKED; + break; + case -ENODATA: + case -EFAULT: + result = VM_FAULT_NOPAGE; + break; + case -ENOMEM: + result = VM_FAULT_OOM; + break; + case -EAGAIN: + result = VM_FAULT_RETRY; + break; + default: + result = VM_FAULT_SIGBUS; + break; + } + +out: + if (result == VM_FAULT_LOCKED) { + ll_rw_stats_tally(ll_i2sbi(file_inode(vma->vm_file)), + current->pid, vma->vm_file->private_data, + cl_offset(NULL, vmf->page->index), PAGE_SIZE, + WRITE); + ll_stats_ops_tally(ll_i2sbi(file_inode(vma->vm_file)), + LPROC_LL_MKWRITE, + ktime_us_delta(ktime_get(), kstart)); + } + + return result; +} + +/** + * To avoid cancel the locks covering mmapped region for lock cache pressure, + * we track the mapped vma count in vvp_object::vob_mmap_cnt. 
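+ * ll_vm_open()/ll_vm_close() below keep the count balanced as VMAs are
+ * duplicated (e.g. across fork()) and torn down.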
+ */ +static void ll_vm_open(struct vm_area_struct * vma) +{ + struct inode *inode = file_inode(vma->vm_file); + struct vvp_object *vob = cl_inode2vvp(inode); + + ENTRY; + LASSERT(atomic_read(&vob->vob_mmap_cnt) >= 0); + atomic_inc(&vob->vob_mmap_cnt); + pcc_vm_open(vma); + EXIT; +} + +/** + * Dual to ll_vm_open(). + */ +static void ll_vm_close(struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(vma->vm_file); + struct vvp_object *vob = cl_inode2vvp(inode); + + ENTRY; + atomic_dec(&vob->vob_mmap_cnt); + LASSERT(atomic_read(&vob->vob_mmap_cnt) >= 0); + pcc_vm_close(vma); + EXIT; +} + +static const struct vm_operations_struct ll_file_vm_ops = { + .fault = ll_fault, + .page_mkwrite = ll_page_mkwrite, + .open = ll_vm_open, + .close = ll_vm_close, +}; + +int ll_file_mmap(struct file *file, struct vm_area_struct * vma) +{ + struct inode *inode = file_inode(file); + ktime_t kstart = ktime_get(); + bool cached; + int rc; + + ENTRY; + CDEBUG(D_VFSTRACE | D_MMAP, + "VFS_Op: fid="DFID" vma=%p start=%#lx end=%#lx vm_flags=%#lx\n", + PFID(&ll_i2info(inode)->lli_fid), + vma, vma->vm_start, vma->vm_end, vma->vm_flags); + + if (ll_file_nolock(file)) + RETURN(-EOPNOTSUPP); + + rc = pcc_file_mmap(file, vma, &cached); + if (cached && rc != 0) + RETURN(rc); + + rc = generic_file_mmap(file, vma); + if (rc == 0) { + vma->vm_ops = &ll_file_vm_ops; + vma->vm_ops->open(vma); + /* update the inode's size and mtime */ + if (!cached) + rc = ll_glimpse_size(inode); + } + + if (!rc) + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_MMAP, + ktime_us_delta(ktime_get(), kstart)); + + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_nfs.c b/drivers/staging/lustrefsx/lustre/llite/llite_nfs.c new file mode 100644 index 0000000000000..7d16d9d165506 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/llite_nfs.c @@ -0,0 +1,401 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/lustre/llite/llite_nfs.c + * + * NFS export of Lustre Light File System + * + * Author: Yury Umanets + * Author: Huang Hua + */ + +#define DEBUG_SUBSYSTEM S_LLITE +#include "llite_internal.h" +#include + +u32 get_uuid2int(const char *name, int len) +{ + u32 key0 = 0x12a3fe2d, key1 = 0x37abe8f9; + + while (len--) { + u32 key = key1 + (key0 ^ (*name++ * 7152373)); + + if (key & 0x80000000) + key -= 0x7fffffff; + + key1 = key0; + key0 = key; + } + return (key0 << 1); +} + +struct inode *search_inode_for_lustre(struct super_block *sb, + const struct lu_fid *fid) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ptlrpc_request *req = NULL; + struct inode *inode = NULL; + int eadatalen = 0; + unsigned long hash = cl_fid_build_ino(fid, ll_need_32bit_api(sbi)); + struct md_op_data *op_data; + int rc; + + ENTRY; + + CDEBUG(D_INFO, "searching inode for:(%lu,"DFID")\n", hash, PFID(fid)); + + inode = ilookup5(sb, hash, ll_test_inode_by_fid, (void *)fid); + if (inode) + RETURN(inode); + + rc = ll_get_default_mdsize(sbi, &eadatalen); + if (rc) + RETURN(ERR_PTR(rc)); + + /* + * Because inode is NULL, ll_prep_md_op_data can not + * be used here. So we allocate op_data ourselves + */ + OBD_ALLOC_PTR(op_data); + if (!op_data) + return ERR_PTR(-ENOMEM); + + op_data->op_fid1 = *fid; + op_data->op_mode = eadatalen; + op_data->op_valid = OBD_MD_FLEASIZE; + + /* mds_fid2dentry ignores f_type */ + rc = md_getattr(sbi->ll_md_exp, op_data, &req); + OBD_FREE_PTR(op_data); + if (rc) { + /* + * Suppress erroneous/confusing messages when NFS + * is out of sync and requests old data. + */ + CDEBUG(D_INFO, "can't get object attrs, fid "DFID", rc %d\n", + PFID(fid), rc); + RETURN(ERR_PTR(rc)); + } + rc = ll_prep_inode(&inode, &req->rq_pill, sb, NULL); + ptlrpc_req_finished(req); + if (rc) + RETURN(ERR_PTR(rc)); + + RETURN(inode); +} + +static struct dentry * +ll_iget_for_nfs(struct super_block *sb, struct lu_fid *fid, struct lu_fid *parent) +{ + struct inode *inode; + struct dentry *result; + + ENTRY; + + if (!fid_is_sane(fid)) + RETURN(ERR_PTR(-ESTALE)); + + CDEBUG(D_INFO, "Get dentry for fid: "DFID"\n", PFID(fid)); + + inode = search_inode_for_lustre(sb, fid); + if (IS_ERR(inode)) + RETURN(ERR_CAST(inode)); + + if (is_bad_inode(inode)) { + /* we didn't find the right inode.. */ + iput(inode); + RETURN(ERR_PTR(-ESTALE)); + } + + /* N.B. d_obtain_alias() drops inode ref on error */ + result = d_obtain_alias(inode); + if (!IS_ERR(result)) { + struct ll_dentry_data *ldd; + + if (!ll_d_setup(result, true)) + RETURN(ERR_PTR(-ENOMEM)); + ldd = ll_d2d(result); + /* + * Need to signal to the ll_file_open that + * we came from NFS and so opencache needs to be + * enabled for this one + */ + spin_lock(&result->d_lock); + ldd->lld_nfs_dentry = 1; + spin_unlock(&result->d_lock); + } + + RETURN(result); +} + +#ifndef FILEID_INVALID +#define FILEID_INVALID 0xff +#endif +#ifndef FILEID_LUSTRE +#define FILEID_LUSTRE 0x97 +#endif + +/** + * \a connectable - is nfsd will connect himself or this should be done + * at lustre + * + * The return value is file handle type: + * 1 -- contains child file handle; + * 2 -- contains child file handle and parent file handle; + * 255 -- error. 
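+ *
+ * For example, a caller whose buffer is too small gets FILEID_INVALID (255)
+ * back with *plen set to the required length; on success both the child and
+ * the (possibly zeroed) parent FID are stored in the handle and
+ * FILEID_LUSTRE (0x97) is returned.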
+ */ +static int ll_encode_fh(struct inode *inode, u32 *fh, int *plen, + struct inode *parent) +{ + int fileid_len = sizeof(struct lustre_file_handle) / 4; + struct lustre_file_handle *lfh = (void *)fh; + + ENTRY; + + CDEBUG(D_INFO, "%s: encoding for ("DFID") maxlen=%d minlen=%d\n", + ll_i2sbi(inode)->ll_fsname, + PFID(ll_inode2fid(inode)), *plen, fileid_len); + + if (*plen < fileid_len) { + *plen = fileid_len; + RETURN(FILEID_INVALID); + } + + lfh->lfh_child = *ll_inode2fid(inode); + if (parent) + lfh->lfh_parent = *ll_inode2fid(parent); + else + fid_zero(&lfh->lfh_parent); + *plen = fileid_len; + + RETURN(FILEID_LUSTRE); +} + +static inline int +do_nfs_get_name_filldir(struct ll_getname_data *lgd, const char *name, + int namelen, loff_t hash, u64 ino, unsigned int type) +{ + /* + * It is hack to access lde_fid for comparison with lgd_fid. + * So the input 'name' must be part of the 'lu_dirent', and + * so must appear to be a non-const pointer to an empty array. + */ + char (*n)[0] = (void *)name; + /* NOTE: This should be container_of(). However container_of() in + * kernels earlier than v4.13-rc1~37^2~94 cause this to generate a + * warning, which fails when we compile with -Werror. Those earlier + * kernels don't have container_of_safe, calling that instead will use + * the lustre-local version which doesn't generate the warning. + */ + struct lu_dirent *lde = container_of_safe(n, struct lu_dirent, lde_name); + struct lu_fid fid; + + fid_le_to_cpu(&fid, &lde->lde_fid); + if (lu_fid_eq(&fid, &lgd->lgd_fid)) { + memcpy(lgd->lgd_name, name, namelen); + lgd->lgd_name[namelen] = 0; + lgd->lgd_found = 1; + } + return lgd->lgd_found; +} + +#ifdef HAVE_FILLDIR_USE_CTX_RETURN_BOOL +static bool +ll_nfs_get_name_filldir(struct dir_context *ctx, const char *name, int namelen, + loff_t hash, u64 ino, unsigned int type) +{ + struct ll_getname_data *lgd = + container_of(ctx, struct ll_getname_data, ctx); + int err = do_nfs_get_name_filldir(lgd, name, namelen, hash, ino, type); + + return err == 0; +} +#elif defined(HAVE_FILLDIR_USE_CTX) +static int +ll_nfs_get_name_filldir(struct dir_context *ctx, const char *name, int namelen, + loff_t hash, u64 ino, unsigned int type) +{ + struct ll_getname_data *lgd = + container_of(ctx, struct ll_getname_data, ctx); + + return do_nfs_get_name_filldir(lgd, name, namelen, hash, ino, type); +} +#else +static int ll_nfs_get_name_filldir(void *cookie, const char *name, int namelen, + loff_t hash, u64 ino, unsigned int type) +{ + struct ll_getname_data *lgd = cookie; + + return do_nfs_get_name_filldir(lgd, name, namelen, hash, ino, type); +} +#endif /* HAVE_FILLDIR_USE_CTX */ + +static int ll_get_name(struct dentry *dentry, char *name, struct dentry *child) +{ + struct inode *dir = dentry->d_inode; + struct ll_getname_data lgd = { + .lgd_name = name, + .lgd_fid = ll_i2info(child->d_inode)->lli_fid, +#ifdef HAVE_DIR_CONTEXT + .ctx.actor = ll_nfs_get_name_filldir, +#endif + .lgd_found = 0, + }; + struct md_op_data *op_data; + u64 pos = 0; + int rc; + + ENTRY; + + if (!dir || !S_ISDIR(dir->i_mode)) + GOTO(out, rc = -ENOTDIR); + + if (!dir->i_fop) + GOTO(out, rc = -EINVAL); + + op_data = ll_prep_md_op_data(NULL, dir, dir, NULL, 0, 0, + LUSTRE_OPC_ANY, dir); + if (IS_ERR(op_data)) + GOTO(out, rc = PTR_ERR(op_data)); + + inode_lock(dir); +#ifdef HAVE_DIR_CONTEXT + rc = ll_dir_read(dir, &pos, op_data, &lgd.ctx, NULL); +#else + rc = ll_dir_read(dir, &pos, op_data, &lgd, ll_nfs_get_name_filldir, + NULL); +#endif + inode_unlock(dir); + ll_finish_md_op_data(op_data); + if (!rc && 
!lgd.lgd_found) + rc = -ENOENT; + EXIT; +out: + return rc; +} + +static struct dentry *ll_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + struct lustre_file_handle *lfh = (struct lustre_file_handle *)fid; + + if (fh_type != FILEID_LUSTRE) + RETURN(ERR_PTR(-EPROTO)); + + RETURN(ll_iget_for_nfs(sb, &lfh->lfh_child, &lfh->lfh_parent)); +} + +static struct dentry *ll_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + struct lustre_file_handle *lfh = (struct lustre_file_handle *)fid; + + if (fh_type != FILEID_LUSTRE) + RETURN(ERR_PTR(-EPROTO)); + + RETURN(ll_iget_for_nfs(sb, &lfh->lfh_parent, NULL)); +} + +int ll_dir_get_parent_fid(struct inode *dir, struct lu_fid *parent_fid) +{ + struct ptlrpc_request *req = NULL; + struct ll_sb_info *sbi; + struct mdt_body *body; + static const char dotdot[] = ".."; + struct md_op_data *op_data; + int rc; + int lmmsize; + + ENTRY; + + LASSERT(dir && S_ISDIR(dir->i_mode)); + + sbi = ll_s2sbi(dir->i_sb); + + CDEBUG(D_INFO, "%s: getting parent for ("DFID")\n", + sbi->ll_fsname, PFID(ll_inode2fid(dir))); + + rc = ll_get_default_mdsize(sbi, &lmmsize); + if (rc != 0) + RETURN(rc); + + op_data = ll_prep_md_op_data(NULL, dir, NULL, dotdot, + strlen(dotdot), lmmsize, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + rc = md_getattr_name(sbi->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc != 0) { + CERROR("%s: failure inode "DFID" get parent: rc = %d\n", + sbi->ll_fsname, PFID(ll_inode2fid(dir)), rc); + RETURN(rc); + } + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + + /* + * LU-3952: MDT may lost the FID of its parent, we should not crash + * the NFS server, ll_iget_for_nfs() will handle the error. + */ + if (body->mbo_valid & OBD_MD_FLID) { + CDEBUG(D_INFO, "parent for "DFID" is "DFID"\n", + PFID(ll_inode2fid(dir)), PFID(&body->mbo_fid1)); + *parent_fid = body->mbo_fid1; + } + + ptlrpc_req_finished(req); + RETURN(0); +} + +static struct dentry *ll_get_parent(struct dentry *dchild) +{ + struct lu_fid parent_fid = { 0 }; + int rc; + struct dentry *dentry; + + ENTRY; + + rc = ll_dir_get_parent_fid(dchild->d_inode, &parent_fid); + if (rc != 0) + RETURN(ERR_PTR(rc)); + + dentry = ll_iget_for_nfs(dchild->d_inode->i_sb, &parent_fid, NULL); + + RETURN(dentry); +} + +const struct export_operations lustre_export_operations = { + .get_parent = ll_get_parent, + .encode_fh = ll_encode_fh, + .get_name = ll_get_name, + .fh_to_dentry = ll_fh_to_dentry, + .fh_to_parent = ll_fh_to_parent, +}; diff --git a/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c b/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c new file mode 100644 index 0000000000000..af2629f1e9c32 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c @@ -0,0 +1,2585 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include + +#include +#include +#include + +#include "llite_internal.h" +#include "vvp_internal.h" + +static struct kobject *llite_kobj; +static struct dentry *llite_root; + +static void llite_kobj_release(struct kobject *kobj) +{ + if (!IS_ERR_OR_NULL(llite_root)) { + debugfs_remove(llite_root); + llite_root = NULL; + } + + kfree(kobj); +} + +static struct kobj_type llite_kobj_ktype = { + .release = llite_kobj_release, + .sysfs_ops = &lustre_sysfs_ops, +}; + +int llite_tunables_register(void) +{ + int rc; + + llite_kobj = kzalloc(sizeof(*llite_kobj), GFP_KERNEL); + if (!llite_kobj) + return -ENOMEM; + + llite_kobj->kset = lustre_kset; + rc = kobject_init_and_add(llite_kobj, &llite_kobj_ktype, + &lustre_kset->kobj, "%s", "llite"); + if (rc) + goto free_kobj; + + llite_root = debugfs_create_dir("llite", debugfs_lustre_root); + return 0; + +free_kobj: + kobject_put(llite_kobj); + llite_kobj = NULL; + + return rc; +} + +void llite_tunables_unregister(void) +{ + kobject_put(llite_kobj); + llite_kobj = NULL; +} + +/* /lustre/llite mount point registration */ +static const struct file_operations ll_rw_extents_stats_fops; +static const struct file_operations ll_rw_extents_stats_pp_fops; +static const struct file_operations ll_rw_offset_stats_fops; + +/** + * ll_stats_pid_write() - Determine if stats collection should be enabled + * @buf: Buffer containing the data written + * @len: Number of bytes in the buffer + * + * Several proc files begin collecting stats when a value is written, and stop + * collecting when either '0' or 'disable' is written. This function checks the + * written value to see if collection should be enabled or disabled. + * + * Return: If '0' or 'disable' is provided, 0 is returned. If the text + * equivalent of a number is written, that number is returned. Otherwise, + * 1 is returned. Non-zero return values indicate collection should be enabled. 
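+ *
+ * For example, writing "1234" returns 1234 (collection enabled for that
+ * value), while writing "0" or "disable" returns 0 (collection stopped).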
+ */ +static s64 ll_stats_pid_write(const char __user *buf, size_t len) +{ + unsigned long long value = 1; + char kernbuf[16]; + int rc; + + rc = kstrtoull_from_user(buf, len, 0, &value); + if (rc < 0 && len < sizeof(kernbuf)) { + if (copy_from_user(kernbuf, buf, len)) + return -EFAULT; + kernbuf[len] = 0; + + if (kernbuf[len - 1] == '\n') + kernbuf[len - 1] = 0; + + if (strncasecmp(kernbuf, "disable", 7) == 0) + value = 0; + } + + return value; +} + +static ssize_t blocksize_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); + if (rc) + return rc; + + return sprintf(buf, "%u\n", osfs.os_bsize); +} +LUSTRE_RO_ATTR(blocksize); + +static ssize_t stat_blocksize_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return sprintf(buf, "%u\n", sbi->ll_stat_blksize); +} + +static ssize_t stat_blocksize_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + + if (val != 0 && (val < PAGE_SIZE || (val & (val - 1))) != 0) + return -ERANGE; + + sbi->ll_stat_blksize = val; + + return count; +} +LUSTRE_RW_ATTR(stat_blocksize); + +static ssize_t kbytestotal_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + struct obd_statfs osfs; + u32 blk_size; + u64 result; + int rc; + + rc = ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); + if (rc) + return rc; + + blk_size = osfs.os_bsize >> 10; + result = osfs.os_blocks; + + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); +} +LUSTRE_RO_ATTR(kbytestotal); + +static ssize_t kbytesfree_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + struct obd_statfs osfs; + u32 blk_size; + u64 result; + int rc; + + rc = ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); + if (rc) + return rc; + + blk_size = osfs.os_bsize >> 10; + result = osfs.os_bfree; + + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); +} +LUSTRE_RO_ATTR(kbytesfree); + +static ssize_t kbytesavail_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + struct obd_statfs osfs; + u32 blk_size; + u64 result; + int rc; + + rc = ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); + if (rc) + return rc; + + blk_size = osfs.os_bsize >> 10; + result = osfs.os_bavail; + + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); +} +LUSTRE_RO_ATTR(kbytesavail); + +static ssize_t filestotal_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); + if (rc) + return rc; + + return sprintf(buf, "%llu\n", osfs.os_files); +} +LUSTRE_RO_ATTR(filestotal); + +static ssize_t filesfree_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct 
ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); + if (rc) + return rc; + + return sprintf(buf, "%llu\n", osfs.os_ffree); +} +LUSTRE_RO_ATTR(filesfree); + +static ssize_t client_type_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return sprintf(buf, "local client\n"); +} +LUSTRE_RO_ATTR(client_type); + +LUSTRE_RW_ATTR(foreign_symlink_enable); + +LUSTRE_RW_ATTR(foreign_symlink_prefix); + +LUSTRE_RW_ATTR(foreign_symlink_upcall); + +LUSTRE_WO_ATTR(foreign_symlink_upcall_info); + +static ssize_t fstype_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return sprintf(buf, "lustre\n"); +} +LUSTRE_RO_ATTR(fstype); + +static ssize_t uuid_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return sprintf(buf, "%s\n", sbi->ll_sb_uuid.uuid); +} +LUSTRE_RO_ATTR(uuid); + +static int ll_site_stats_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + + /* + * See description of statistical counters in struct cl_site, and + * struct lu_site. + */ + return cl_site_stats_print(lu2cl_site(ll_s2sbi(sb)->ll_site), m); +} + +LDEBUGFS_SEQ_FOPS_RO(ll_site_stats); + +static ssize_t max_read_ahead_mb_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%lu\n", + PAGES_TO_MiB(sbi->ll_ra_info.ra_max_pages)); +} + +static ssize_t max_read_ahead_mb_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + u64 ra_max_mb, pages_number; + int rc; + + rc = sysfs_memparse(buffer, count, &ra_max_mb, "MiB"); + if (rc) + return rc; + + pages_number = round_up(ra_max_mb, 1024 * 1024) >> PAGE_SHIFT; + CDEBUG(D_INFO, "%s: set max_read_ahead_mb=%llu (%llu pages)\n", + sbi->ll_fsname, PAGES_TO_MiB(pages_number), pages_number); + if (pages_number > cfs_totalram_pages() / 2) { + /* 1/2 of RAM */ + CERROR("%s: cannot set max_read_ahead_mb=%llu > totalram/2=%luMB\n", + sbi->ll_fsname, PAGES_TO_MiB(pages_number), + PAGES_TO_MiB(cfs_totalram_pages() / 2)); + return -ERANGE; + } + + spin_lock(&sbi->ll_lock); + sbi->ll_ra_info.ra_max_pages = pages_number; + spin_unlock(&sbi->ll_lock); + + return count; +} +LUSTRE_RW_ATTR(max_read_ahead_mb); + +static ssize_t max_read_ahead_per_file_mb_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%lu\n", + PAGES_TO_MiB(sbi->ll_ra_info.ra_max_pages_per_file)); +} + +static ssize_t max_read_ahead_per_file_mb_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + u64 ra_max_file_mb, pages_number; + int rc; + + rc = sysfs_memparse(buffer, count, &ra_max_file_mb, "MiB"); + if (rc) + return rc; + + pages_number = round_up(ra_max_file_mb, 1024 * 1024) >> PAGE_SHIFT; + if (pages_number > sbi->ll_ra_info.ra_max_pages) { + CERROR("%s: cannot set max_read_ahead_per_file_mb=%llu > max_read_ahead_mb=%lu\n", + sbi->ll_fsname, PAGES_TO_MiB(pages_number), + PAGES_TO_MiB(sbi->ll_ra_info.ra_max_pages)); + return -ERANGE; + } + 
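+	/* e.g. writing "64M" yields 16384 pages on a 4KiB-page kernel; the
+	 * spinlock below only serializes concurrent tunable updates.
+	 */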
+ spin_lock(&sbi->ll_lock); + sbi->ll_ra_info.ra_max_pages_per_file = pages_number; + spin_unlock(&sbi->ll_lock); + + return count; +} +LUSTRE_RW_ATTR(max_read_ahead_per_file_mb); + +static ssize_t max_read_ahead_whole_mb_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%lu\n", + PAGES_TO_MiB(sbi->ll_ra_info.ra_max_read_ahead_whole_pages)); +} + +static ssize_t max_read_ahead_whole_mb_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + u64 ra_max_whole_mb, pages_number; + int rc; + + rc = sysfs_memparse(buffer, count, &ra_max_whole_mb, "MiB"); + if (rc) + return rc; + + pages_number = round_up(ra_max_whole_mb, 1024 * 1024) >> PAGE_SHIFT; + /* Cap this at the current max readahead window size, the readahead + * algorithm does this anyway so it's pointless to set it larger. + */ + if (pages_number > sbi->ll_ra_info.ra_max_pages_per_file) { + CERROR("%s: cannot set max_read_ahead_whole_mb=%llu > max_read_ahead_per_file_mb=%lu\n", + sbi->ll_fsname, PAGES_TO_MiB(pages_number), + PAGES_TO_MiB(sbi->ll_ra_info.ra_max_pages_per_file)); + + return -ERANGE; + } + + spin_lock(&sbi->ll_lock); + sbi->ll_ra_info.ra_max_read_ahead_whole_pages = pages_number; + spin_unlock(&sbi->ll_lock); + + return count; +} +LUSTRE_RW_ATTR(max_read_ahead_whole_mb); + +static int ll_max_cached_mb_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct cl_client_cache *cache = sbi->ll_cache; + struct ll_ra_info *ra = &sbi->ll_ra_info; + long max_cached_mb; + long unused_mb; + + mutex_lock(&cache->ccc_max_cache_mb_lock); + max_cached_mb = PAGES_TO_MiB(cache->ccc_lru_max); + unused_mb = PAGES_TO_MiB(atomic_long_read(&cache->ccc_lru_left)); + mutex_unlock(&cache->ccc_max_cache_mb_lock); + + seq_printf(m, "users: %d\n" + "max_cached_mb: %ld\n" + "used_mb: %ld\n" + "unused_mb: %ld\n" + "reclaim_count: %u\n" + "max_read_ahead_mb: %lu\n" + "used_read_ahead_mb: %d\n", + atomic_read(&cache->ccc_users), + max_cached_mb, + max_cached_mb - unused_mb, + unused_mb, + cache->ccc_lru_shrinkers, + PAGES_TO_MiB(ra->ra_max_pages), + PAGES_TO_MiB(atomic_read(&ra->ra_cur_pages))); + return 0; +} + +static ssize_t ll_max_cached_mb_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct cl_client_cache *cache = sbi->ll_cache; + struct lu_env *env; + long diff = 0; + long nrpages = 0; + __u16 refcheck; + u64 pages_number; + int rc; + char kernbuf[128], *ptr; + + ENTRY; + if (count >= sizeof(kernbuf)) + RETURN(-EINVAL); + + if (copy_from_user(kernbuf, buffer, count)) + RETURN(-EFAULT); + kernbuf[count] = '\0'; + + ptr = lprocfs_find_named_value(kernbuf, "max_cached_mb:", &count); + rc = sysfs_memparse(ptr, count, &pages_number, "MiB"); + if (rc) + RETURN(rc); + + pages_number >>= PAGE_SHIFT; + + if (pages_number < 0 || pages_number > cfs_totalram_pages()) { + CERROR("%s: can't set max cache more than %lu MB\n", + sbi->ll_fsname, + PAGES_TO_MiB(cfs_totalram_pages())); + RETURN(-ERANGE); + } + /* Allow enough cache so clients can make well-formed RPCs */ + pages_number = max_t(long, pages_number, PTLRPC_MAX_BRW_PAGES); + + 
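+	/* Growing the cache only adds LRU slots; shrinking may need to ask
+	 * the OSCs to drop slots (KEY_CACHE_LRU_SHRINK) in the loop below.
+	 */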
mutex_lock(&cache->ccc_max_cache_mb_lock); + diff = pages_number - cache->ccc_lru_max; + + /* easy - add more LRU slots. */ + if (diff >= 0) { + atomic_long_add(diff, &cache->ccc_lru_left); + GOTO(out, rc = 0); + } + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + GOTO(out_unlock, rc = PTR_ERR(env)); + + diff = -diff; + while (diff > 0) { + long tmp; + + /* reduce LRU budget from free slots. */ + do { + long lru_left_old, lru_left_new, lru_left_ret; + + lru_left_old = atomic_long_read(&cache->ccc_lru_left); + if (lru_left_old == 0) + break; + + lru_left_new = lru_left_old > diff ? + lru_left_old - diff : 0; + lru_left_ret = + atomic_long_cmpxchg(&cache->ccc_lru_left, + lru_left_old, + lru_left_new); + if (likely(lru_left_old == lru_left_ret)) { + diff -= lru_left_old - lru_left_new; + nrpages += lru_left_old - lru_left_new; + break; + } + } while (1); + + if (diff <= 0) + break; + + if (sbi->ll_dt_exp == NULL) { /* being initialized */ + rc = -ENODEV; + break; + } + + /* Request extra free slots to avoid them all being used + * by other processes before this can continue shrinking. + */ + tmp = diff + min_t(long, diff, MiB_TO_PAGES(1024)); + /* difficult - have to ask OSCs to drop LRU slots. */ + rc = obd_set_info_async(env, sbi->ll_dt_exp, + sizeof(KEY_CACHE_LRU_SHRINK), + KEY_CACHE_LRU_SHRINK, + sizeof(tmp), &tmp, NULL); + if (rc < 0) + break; + } + cl_env_put(env, &refcheck); + +out: + if (rc >= 0) { + cache->ccc_lru_max = pages_number; + rc = count; + } else { + atomic_long_add(nrpages, &cache->ccc_lru_left); + } +out_unlock: + mutex_unlock(&cache->ccc_max_cache_mb_lock); + return rc; +} +LDEBUGFS_SEQ_FOPS(ll_max_cached_mb); + +static ssize_t checksums_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%u\n", + test_bit(LL_SBI_CHECKSUM, sbi->ll_flags)); +} + +static ssize_t checksums_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; + int tmp; + int rc; + + if (!sbi->ll_dt_exp) + /* Not set up yet */ + return -EAGAIN; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + if (val) + set_bit(LL_SBI_CHECKSUM, sbi->ll_flags); + else + clear_bit(LL_SBI_CHECKSUM, sbi->ll_flags); + tmp = val; + + rc = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CHECKSUM), + KEY_CHECKSUM, sizeof(tmp), &tmp, NULL); + if (rc) + CWARN("Failed to set OSC checksum flags: %d\n", rc); + + return count; +} +LUSTRE_RW_ATTR(checksums); + +LUSTRE_ATTR(checksum_pages, 0644, checksums_show, checksums_store); + +static ssize_t ll_rd_track_id(struct kobject *kobj, char *buf, + enum stats_track_type type) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + if (sbi->ll_stats_track_type == type) + return sprintf(buf, "%d\n", sbi->ll_stats_track_id); + else if (sbi->ll_stats_track_type == STATS_TRACK_ALL) + return sprintf(buf, "0 (all)\n"); + + return sprintf(buf, "untracked\n"); +} + +static ssize_t ll_wr_track_id(struct kobject *kobj, const char *buffer, + size_t count, enum stats_track_type type) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned long pid; + int rc; + + rc = kstrtoul(buffer, 10, &pid); + if (rc) + return rc; + + sbi->ll_stats_track_id = pid; + if (pid == 0) + sbi->ll_stats_track_type = STATS_TRACK_ALL; + else + sbi->ll_stats_track_type = type; + 
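+	/* Reset accumulated stats so counters reflect only the new target. */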
lprocfs_clear_stats(sbi->ll_stats); + return count; +} + +static ssize_t stats_track_pid_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + return ll_rd_track_id(kobj, buf, STATS_TRACK_PID); +} + +static ssize_t stats_track_pid_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + return ll_wr_track_id(kobj, buffer, count, STATS_TRACK_PID); +} +LUSTRE_RW_ATTR(stats_track_pid); + +static ssize_t stats_track_ppid_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + return ll_rd_track_id(kobj, buf, STATS_TRACK_PPID); +} + +static ssize_t stats_track_ppid_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + return ll_wr_track_id(kobj, buffer, count, STATS_TRACK_PPID); +} +LUSTRE_RW_ATTR(stats_track_ppid); + +static ssize_t stats_track_gid_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + return ll_rd_track_id(kobj, buf, STATS_TRACK_GID); +} + +static ssize_t stats_track_gid_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + return ll_wr_track_id(kobj, buffer, count, STATS_TRACK_GID); +} +LUSTRE_RW_ATTR(stats_track_gid); + +static ssize_t statahead_running_max_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%u\n", sbi->ll_sa_running_max); +} + +static ssize_t statahead_running_max_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned long val; + int rc; + + rc = kstrtoul(buffer, 0, &val); + if (rc) + return rc; + + if (val <= LL_SA_RUNNING_MAX) { + sbi->ll_sa_running_max = val; + return count; + } + + CERROR("Bad statahead_running_max value %lu. Valid values " + "are in the range [0, %d]\n", val, LL_SA_RUNNING_MAX); + + return -ERANGE; +} +LUSTRE_RW_ATTR(statahead_running_max); + +static ssize_t statahead_max_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return sprintf(buf, "%u\n", sbi->ll_sa_max); +} + +static ssize_t statahead_max_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned long val; + int rc; + + rc = kstrtoul(buffer, 0, &val); + if (rc) + return rc; + + if (val <= LL_SA_RPC_MAX) + sbi->ll_sa_max = val; + else + CERROR("Bad statahead_max value %lu. 
Valid values are in the range [0, %d]\n", + val, LL_SA_RPC_MAX); + + return count; +} +LUSTRE_RW_ATTR(statahead_max); + +static ssize_t statahead_agl_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%u\n", + test_bit(LL_SBI_AGL_ENABLED, sbi->ll_flags)); +} + +static ssize_t statahead_agl_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + if (val) + set_bit(LL_SBI_AGL_ENABLED, sbi->ll_flags); + else + clear_bit(LL_SBI_AGL_ENABLED, sbi->ll_flags); + + return count; +} +LUSTRE_RW_ATTR(statahead_agl); + +static int ll_statahead_stats_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + seq_printf(m, "statahead total: %u\n" + "statahead wrong: %u\n" + "agl total: %u\n", + atomic_read(&sbi->ll_sa_total), + atomic_read(&sbi->ll_sa_wrong), + atomic_read(&sbi->ll_agl_total)); + return 0; +} + +LDEBUGFS_SEQ_FOPS_RO(ll_statahead_stats); + +static ssize_t lazystatfs_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%u\n", + test_bit(LL_SBI_LAZYSTATFS, sbi->ll_flags)); +} + +static ssize_t lazystatfs_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + if (val) + set_bit(LL_SBI_LAZYSTATFS, sbi->ll_flags); + else + clear_bit(LL_SBI_LAZYSTATFS, sbi->ll_flags); + + return count; +} +LUSTRE_RW_ATTR(lazystatfs); + +static ssize_t statfs_max_age_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%u\n", sbi->ll_statfs_max_age); +} + +static ssize_t statfs_max_age_store(struct kobject *kobj, + struct attribute *attr, const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + if (val > OBD_STATFS_CACHE_MAX_AGE) + return -EINVAL; + + sbi->ll_statfs_max_age = val; + + return count; +} +LUSTRE_RW_ATTR(statfs_max_age); + +static ssize_t neg_dentry_timeout_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return snprintf(buf, PAGE_SIZE, "%d\n", sbi->ll_neg_dentry_timeout); +} + +static ssize_t neg_dentry_timeout_store(struct kobject *kobj, + struct attribute *attr, const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + int val; + int rc; + + rc = kstrtoint(buffer, 10, &val); + if (rc) + return rc; + if (val < OBD_NEG_CACHE_TIMEOUT_DEFAULT_SECS) + return -EINVAL; + + sbi->ll_neg_dentry_timeout = val; + + return count; +} +LUSTRE_RW_ATTR(neg_dentry_timeout); + +static ssize_t max_easize_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct 
ll_sb_info, + ll_kset.kobj); + unsigned int ealen; + int rc; + + rc = ll_get_max_mdsize(sbi, &ealen); + if (rc) + return rc; + + /* Limit xattr size returned to userspace based on kernel maximum */ + return scnprintf(buf, PAGE_SIZE, "%u\n", + ealen > XATTR_SIZE_MAX ? XATTR_SIZE_MAX : ealen); +} +LUSTRE_RO_ATTR(max_easize); + +/** + * Get default_easize. + * + * \see client_obd::cl_default_mds_easize + * + * \param[in] m seq_file handle + * \param[in] v unused for single entry + * + * \retval 0 on success + * \retval negative negated errno on failure + */ +static ssize_t default_easize_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned int ealen; + int rc; + + rc = ll_get_default_mdsize(sbi, &ealen); + if (rc) + return rc; + + /* Limit xattr size returned to userspace based on kernel maximum */ + return scnprintf(buf, PAGE_SIZE, "%u\n", + ealen > XATTR_SIZE_MAX ? XATTR_SIZE_MAX : ealen); +} + +/** + * Set default_easize. + * + * Range checking on the passed value is handled by + * ll_set_default_mdsize(). + * + * \see client_obd::cl_default_mds_easize + * + * \param[in] file proc file + * \param[in] buffer string passed from user space + * \param[in] count \a buffer length + * \param[in] off unused for single entry + * + * \retval positive \a count on success + * \retval negative negated errno on failure + */ +static ssize_t default_easize_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned int val; + int rc; + + if (count == 0) + return 0; + + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + + rc = ll_set_default_mdsize(sbi, val); + if (rc) + return rc; + + return count; +} +LUSTRE_RW_ATTR(default_easize); + +LDEBUGFS_SEQ_FOPS_RO(ll_sbi_flags); + +static ssize_t xattr_cache_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return sprintf(buf, "%u\n", sbi->ll_xattr_cache_enabled); +} + +static ssize_t xattr_cache_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + if (val && !test_bit(LL_SBI_XATTR_CACHE, sbi->ll_flags)) + return -ENOTSUPP; + + sbi->ll_xattr_cache_enabled = val; + sbi->ll_xattr_cache_set = 1; + + return count; +} +LUSTRE_RW_ATTR(xattr_cache); + +static ssize_t tiny_write_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%u\n", + test_bit(LL_SBI_TINY_WRITE, sbi->ll_flags)); +} + +static ssize_t tiny_write_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + spin_lock(&sbi->ll_lock); + if (val) + set_bit(LL_SBI_TINY_WRITE, sbi->ll_flags); + else + clear_bit(LL_SBI_TINY_WRITE, sbi->ll_flags); + spin_unlock(&sbi->ll_lock); + + return count; +} +LUSTRE_RW_ATTR(tiny_write); + +static ssize_t parallel_dio_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + 
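+	/* Reports the LL_SBI_PARALLEL_DIO bit; the matching _store below
+	 * flips it under ll_lock, like tiny_write/fast_read.
+	 */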
struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+					      ll_kset.kobj);
+
+	return snprintf(buf, PAGE_SIZE, "%u\n",
+			test_bit(LL_SBI_PARALLEL_DIO, sbi->ll_flags));
+}
+
+static ssize_t parallel_dio_store(struct kobject *kobj,
+				  struct attribute *attr,
+				  const char *buffer,
+				  size_t count)
+{
+	struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+					      ll_kset.kobj);
+	bool val;
+	int rc;
+
+	rc = kstrtobool(buffer, &val);
+	if (rc)
+		return rc;
+
+	spin_lock(&sbi->ll_lock);
+	if (val)
+		set_bit(LL_SBI_PARALLEL_DIO, sbi->ll_flags);
+	else
+		clear_bit(LL_SBI_PARALLEL_DIO, sbi->ll_flags);
+	spin_unlock(&sbi->ll_lock);
+
+	return count;
+}
+LUSTRE_RW_ATTR(parallel_dio);
+
+static ssize_t max_read_ahead_async_active_show(struct kobject *kobj,
+						struct attribute *attr,
+						char *buf)
+{
+	struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+					      ll_kset.kobj);
+
+	return scnprintf(buf, PAGE_SIZE, "%u\n",
+			 sbi->ll_ra_info.ra_async_max_active);
+}
+
+static ssize_t max_read_ahead_async_active_store(struct kobject *kobj,
+						 struct attribute *attr,
+						 const char *buffer,
+						 size_t count)
+{
+	struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+					      ll_kset.kobj);
+	unsigned int val;
+	int rc;
+
+	rc = kstrtouint(buffer, 10, &val);
+	if (rc)
+		return rc;
+
+	/*
+	 * It doesn't make any sense to let it exceed what the
+	 * workqueue can actually support. This can easily
+	 * oversubscribe the cores, but Lustre internally
+	 * throttles to avoid those impacts.
+	 */
+	if (val > WQ_UNBOUND_MAX_ACTIVE) {
+		CERROR("%s: cannot set max_read_ahead_async_active=%u larger than %u\n",
+		       sbi->ll_fsname, val, WQ_UNBOUND_MAX_ACTIVE);
+		return -ERANGE;
+	}
+
+	spin_lock(&sbi->ll_lock);
+	sbi->ll_ra_info.ra_async_max_active = val;
+	spin_unlock(&sbi->ll_lock);
+
+	return count;
+}
+LUSTRE_RW_ATTR(max_read_ahead_async_active);
+
+static ssize_t read_ahead_async_file_threshold_mb_show(struct kobject *kobj,
+						       struct attribute *attr,
+						       char *buf)
+{
+	struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+					      ll_kset.kobj);
+
+	return scnprintf(buf, PAGE_SIZE, "%lu\n", PAGES_TO_MiB(
+			 sbi->ll_ra_info.ra_async_pages_per_file_threshold));
+}
+
+static ssize_t
+read_ahead_async_file_threshold_mb_store(struct kobject *kobj,
+					 struct attribute *attr,
+					 const char *buffer, size_t count)
+{
+	unsigned long pages_number;
+	unsigned long max_ra_per_file;
+	struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+					      ll_kset.kobj);
+	int rc;
+
+	rc = kstrtoul(buffer, 10, &pages_number);
+	if (rc)
+		return rc;
+
+	pages_number = MiB_TO_PAGES(pages_number);
+	max_ra_per_file = sbi->ll_ra_info.ra_max_pages_per_file;
+	if (pages_number > max_ra_per_file) {
+		CERROR("%s: can't set read_ahead_async_file_threshold_mb=%lu > "
+		       "max_read_ahead_per_file_mb=%lu\n", sbi->ll_fsname,
+		       PAGES_TO_MiB(pages_number),
+		       PAGES_TO_MiB(max_ra_per_file));
+		return -ERANGE;
+	}
+	sbi->ll_ra_info.ra_async_pages_per_file_threshold = pages_number;
+
+	return count;
+}
+LUSTRE_RW_ATTR(read_ahead_async_file_threshold_mb);
+
+static ssize_t read_ahead_range_kb_show(struct kobject *kobj,
+					struct attribute *attr, char *buf)
+{
+	struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+					      ll_kset.kobj);
+
+	return snprintf(buf, PAGE_SIZE, "%lu\n",
+			sbi->ll_ra_info.ra_range_pages << (PAGE_SHIFT - 10));
+}
+
+static ssize_t
+read_ahead_range_kb_store(struct kobject *kobj,
+			  struct attribute *attr,
+			  const char *buffer, size_t count)
+{
+	unsigned long pages_number;
+	unsigned long max_ra_per_file;
+	u64 val;
+	struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+					      ll_kset.kobj);
+	int rc;
+
+	rc = sysfs_memparse(buffer, count, &val, "KiB");
+	if (rc < 0)
+		return rc;
+
+	pages_number = val >> PAGE_SHIFT;
+	/* Disable mmap range read */
+	if (pages_number == 0)
+		goto out;
+
+	max_ra_per_file = sbi->ll_ra_info.ra_max_pages_per_file;
+	if (pages_number > max_ra_per_file ||
+	    pages_number < RA_MIN_MMAP_RANGE_PAGES)
+		return -ERANGE;
+
+out:
+	spin_lock(&sbi->ll_lock);
+	sbi->ll_ra_info.ra_range_pages = pages_number;
+	spin_unlock(&sbi->ll_lock);
+
+	return count;
+}
+LUSTRE_RW_ATTR(read_ahead_range_kb);
+
+static ssize_t fast_read_show(struct kobject *kobj,
+			      struct attribute *attr,
+			      char *buf)
+{
+	struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+					      ll_kset.kobj);
+
+	return scnprintf(buf, PAGE_SIZE, "%u\n",
+			 test_bit(LL_SBI_FAST_READ, sbi->ll_flags));
+}
+
+static ssize_t fast_read_store(struct kobject *kobj,
+			       struct attribute *attr,
+			       const char *buffer,
+			       size_t count)
+{
+	struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+					      ll_kset.kobj);
+	bool val;
+	int rc;
+
+	rc = kstrtobool(buffer, &val);
+	if (rc)
+		return rc;
+
+	spin_lock(&sbi->ll_lock);
+	if (val)
+		set_bit(LL_SBI_FAST_READ, sbi->ll_flags);
+	else
+		clear_bit(LL_SBI_FAST_READ, sbi->ll_flags);
+	spin_unlock(&sbi->ll_lock);
+
+	return count;
+}
+LUSTRE_RW_ATTR(fast_read);
+
+static ssize_t file_heat_show(struct kobject *kobj,
+			      struct attribute *attr,
+			      char *buf)
+{
+	struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+					      ll_kset.kobj);
+
+	return scnprintf(buf, PAGE_SIZE, "%u\n",
+			 test_bit(LL_SBI_FILE_HEAT, sbi->ll_flags));
+}
+
+static ssize_t file_heat_store(struct kobject *kobj,
+			       struct attribute *attr,
+			       const char *buffer,
+			       size_t count)
+{
+	struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+					      ll_kset.kobj);
+	bool val;
+	int rc;
+
+	rc = kstrtobool(buffer, &val);
+	if (rc)
+		return rc;
+
+	spin_lock(&sbi->ll_lock);
+	if (val)
+		set_bit(LL_SBI_FILE_HEAT, sbi->ll_flags);
+	else
+		clear_bit(LL_SBI_FILE_HEAT, sbi->ll_flags);
+	spin_unlock(&sbi->ll_lock);
+
+	return count;
+}
+LUSTRE_RW_ATTR(file_heat);
+
+static ssize_t heat_decay_percentage_show(struct kobject *kobj,
+					  struct attribute *attr,
+					  char *buf)
+{
+	struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+					      ll_kset.kobj);
+
+	return scnprintf(buf, PAGE_SIZE, "%u\n",
+			 (sbi->ll_heat_decay_weight * 100 + 128) / 256);
+}
+
+static ssize_t heat_decay_percentage_store(struct kobject *kobj,
+					   struct attribute *attr,
+					   const char *buffer,
+					   size_t count)
+{
+	struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+					      ll_kset.kobj);
+	unsigned long val;
+	int rc;
+
+	rc = kstrtoul(buffer, 10, &val);
+	if (rc)
+		return rc;
+
+	if (val > 100)
+		return -ERANGE;
+
+	sbi->ll_heat_decay_weight = (val * 256 + 50) / 100;
+
+	return count;
+}
+LUSTRE_RW_ATTR(heat_decay_percentage);
+
+static ssize_t heat_period_second_show(struct kobject *kobj,
+				       struct attribute *attr,
+				       char *buf)
+{
+	struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+					      ll_kset.kobj);
+
+	return scnprintf(buf, PAGE_SIZE, "%u\n", sbi->ll_heat_period_second);
+}
+
+static ssize_t heat_period_second_store(struct kobject *kobj,
+					struct attribute *attr,
+					const char *buffer,
+					size_t count)
+{
+	struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+					      ll_kset.kobj);
+	unsigned long val;
+	int rc;
+
+	rc = kstrtoul(buffer, 10, &val);
+	if (rc)
+		return rc;
+
+	if (val == 0)
+		return -ERANGE;
+
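+	/* The period is the decay interval in seconds: once per
+	 * heat_period_second the accumulated file heat is decayed by
+	 * heat_decay_percentage. A usage sketch (the exact parameter
+	 * path depends on the mount instance name):
+	 *
+	 *   lctl set_param llite.<fsname>-*.heat_period_second=60
+	 */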
sbi->ll_heat_period_second = val; + + return count; +} +LUSTRE_RW_ATTR(heat_period_second); + +static ssize_t opencache_threshold_count_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + if (sbi->ll_oc_thrsh_count) + return snprintf(buf, PAGE_SIZE, "%u\n", + sbi->ll_oc_thrsh_count); + else + return snprintf(buf, PAGE_SIZE, "off\n"); +} + +static ssize_t opencache_threshold_count_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 10, &val); + if (rc) { + bool enable; + /* also accept "off" to disable and "on" to always cache */ + rc = kstrtobool(buffer, &enable); + if (rc) + return rc; + val = enable; + } + sbi->ll_oc_thrsh_count = val; + + return count; +} +LUSTRE_RW_ATTR(opencache_threshold_count); + +static ssize_t opencache_threshold_ms_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return snprintf(buf, PAGE_SIZE, "%u\n", sbi->ll_oc_thrsh_ms); +} + +static ssize_t opencache_threshold_ms_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + + sbi->ll_oc_thrsh_ms = val; + + return count; +} +LUSTRE_RW_ATTR(opencache_threshold_ms); + +static ssize_t opencache_max_ms_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return snprintf(buf, PAGE_SIZE, "%u\n", sbi->ll_oc_max_ms); +} + +static ssize_t opencache_max_ms_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + + sbi->ll_oc_max_ms = val; + + return count; +} +LUSTRE_RW_ATTR(opencache_max_ms); + +static ssize_t inode_cache_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return snprintf(buf, PAGE_SIZE, "%u\n", sbi->ll_inode_cache_enabled); +} + +static ssize_t inode_cache_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + sbi->ll_inode_cache_enabled = val; + + return count; +} +LUSTRE_RW_ATTR(inode_cache); + +static int ll_unstable_stats_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct cl_client_cache *cache = sbi->ll_cache; + long pages; + int mb; + + pages = atomic_long_read(&cache->ccc_unstable_nr); + mb = (pages * PAGE_SIZE) >> 20; + + seq_printf(m, "unstable_check: %8d\n" + "unstable_pages: %12ld\n" + "unstable_mb: %8d\n", + cache->ccc_unstable_check, pages, mb); + return 0; +} + +static ssize_t ll_unstable_stats_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *unused) +{ + struct seq_file *seq 
= file->private_data; + struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)seq->private); + char kernbuf[128]; + bool val; + int rc; + + if (count == 0) + return 0; + if (count >= sizeof(kernbuf)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + kernbuf[count] = 0; + + buffer += lprocfs_find_named_value(kernbuf, "unstable_check:", &count) - + kernbuf; + rc = kstrtobool_from_user(buffer, count, &val); + if (rc < 0) + return rc; + + /* borrow lru lock to set the value */ + spin_lock(&sbi->ll_cache->ccc_lru_lock); + sbi->ll_cache->ccc_unstable_check = val; + spin_unlock(&sbi->ll_cache->ccc_lru_lock); + + return count; +} + +LDEBUGFS_SEQ_FOPS(ll_unstable_stats); + +static int ll_root_squash_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct root_squash_info *squash = &sbi->ll_squash; + + seq_printf(m, "%u:%u\n", squash->rsi_uid, squash->rsi_gid); + return 0; +} + +static ssize_t ll_root_squash_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct root_squash_info *squash = &sbi->ll_squash; + + return lprocfs_wr_root_squash(buffer, count, squash, sbi->ll_fsname); +} + +LDEBUGFS_SEQ_FOPS(ll_root_squash); + +static int ll_nosquash_nids_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct root_squash_info *squash = &sbi->ll_squash; + int len; + + spin_lock(&squash->rsi_lock); + if (!list_empty(&squash->rsi_nosquash_nids)) { + len = cfs_print_nidlist(m->buf + m->count, m->size - m->count, + &squash->rsi_nosquash_nids); + m->count += len; + seq_putc(m, '\n'); + } else { + seq_puts(m, "NONE\n"); + } + spin_unlock(&squash->rsi_lock); + + return 0; +} + +static ssize_t ll_nosquash_nids_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct root_squash_info *squash = &sbi->ll_squash; + int rc; + + rc = lprocfs_wr_nosquash_nids(buffer, count, squash, sbi->ll_fsname); + if (rc < 0) + return rc; + + ll_compute_rootsquash_state(sbi); + + return rc; +} + +LDEBUGFS_SEQ_FOPS(ll_nosquash_nids); + +#ifdef CONFIG_LL_ENCRYPTION +static int ll_filename_enc_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct lustre_sb_info *lsi = s2lsi(sb); + + seq_printf(m, "%u\n", lsi->lsi_flags & LSI_FILENAME_ENC ? 
1 : 0); + return 0; +} + +static ssize_t ll_filename_enc_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct super_block *sb = m->private; + struct lustre_sb_info *lsi = s2lsi(sb); + struct ll_sb_info *sbi = ll_s2sbi(sb); + bool val; + int rc; + + rc = kstrtobool_from_user(buffer, count, &val); + if (rc) + return rc; + + if (val) { + if (!ll_sbi_has_name_encrypt(sbi)) { + /* server does not support name encryption, + * so force it to NULL on client + */ + CDEBUG(D_SEC, "%s: server does not support name encryption\n", + sbi->ll_fsname); + lsi->lsi_flags &= ~LSI_FILENAME_ENC; + return -EOPNOTSUPP; + } + + lsi->lsi_flags |= LSI_FILENAME_ENC; + } else { + lsi->lsi_flags &= ~LSI_FILENAME_ENC; + } + + return count; +} + +LDEBUGFS_SEQ_FOPS(ll_filename_enc); + +static int ll_old_b64_enc_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct lustre_sb_info *lsi = s2lsi(sb); + + seq_printf(m, "%u\n", + lsi->lsi_flags & LSI_FILENAME_ENC_B64_OLD_CLI ? 1 : 0); + return 0; +} + +static ssize_t ll_old_b64_enc_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct super_block *sb = m->private; + struct lustre_sb_info *lsi = s2lsi(sb); + struct ll_sb_info *sbi = ll_s2sbi(sb); + bool val; + int rc; + + rc = kstrtobool_from_user(buffer, count, &val); + if (rc) + return rc; + + if (val) { + if (!ll_sbi_has_name_encrypt(sbi)) { + /* server does not support name encryption, + * so force it to NULL on client + */ + CDEBUG(D_SEC, + "%s: server does not support name encryption\n", + sbi->ll_fsname); + lsi->lsi_flags &= ~LSI_FILENAME_ENC_B64_OLD_CLI; + return -EOPNOTSUPP; + } + + lsi->lsi_flags |= LSI_FILENAME_ENC_B64_OLD_CLI; + } else { + lsi->lsi_flags &= ~LSI_FILENAME_ENC_B64_OLD_CLI; + } + + return count; +} + +LDEBUGFS_SEQ_FOPS(ll_old_b64_enc); +#endif /* CONFIG_LL_ENCRYPTION */ + +static int ll_pcc_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + return pcc_super_dump(&sbi->ll_pcc_super, m); +} + +static ssize_t ll_pcc_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int rc; + char *kernbuf; + + if (count >= LPROCFS_WR_PCC_MAX_CMD) + return -EINVAL; + + if (!(exp_connect_flags2(sbi->ll_md_exp) & OBD_CONNECT2_PCC)) + return -EOPNOTSUPP; + + OBD_ALLOC(kernbuf, count + 1); + if (kernbuf == NULL) + return -ENOMEM; + + if (copy_from_user(kernbuf, buffer, count)) + GOTO(out_free_kernbuff, rc = -EFAULT); + + rc = pcc_cmd_handle(kernbuf, count, &sbi->ll_pcc_super); +out_free_kernbuff: + OBD_FREE(kernbuf, count + 1); + return rc ? 
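+	/* on success report the whole buffer as consumed, otherwise
+	 * propagate the error from pcc_cmd_handle() (or -EFAULT above)
+	 */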
+		    rc : count;
+}
+LDEBUGFS_SEQ_FOPS(ll_pcc);
+
+static int ll_mdll_dir_restore_max_retry_count_seq_show(struct seq_file *m,
+							void *v)
+{
+	struct super_block *sb = m->private;
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+
+	seq_printf(m, "%d\n",
+		   atomic_read(&sbi->ll_dir_restore_max_retry_count));
+
+	return 0;
+}
+
+static ssize_t
+ll_mdll_dir_restore_max_retry_count_seq_write(struct file *file,
+					      const char __user *buffer,
+					      size_t count, loff_t *off)
+{
+	struct seq_file *m = file->private_data;
+	struct super_block *sb = m->private;
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+	int val, rc;
+
+	rc = kstrtoint_from_user(buffer, count, 0, &val);
+	if (rc)
+		return rc;
+
+	/*
+	 * Right now there is no upper limit on the retry count, because we
+	 * don't know what the right maximum would be: it depends on the
+	 * number of files in the directory being restored and on whether
+	 * the MDT keeps restarting. The client calls are interruptible and
+	 * can be used to break out of long retries.
+	 */
+	if (val < -1)
+		return -EINVAL;
+
+	atomic_set(&sbi->ll_dir_restore_max_retry_count, val);
+
+	return count;
+}
+
+LDEBUGFS_SEQ_FOPS(ll_mdll_dir_restore_max_retry_count);
+
+struct ldebugfs_vars lprocfs_llite_obd_vars[] = {
+	{ .name	=	"site",
+	  .fops	=	&ll_site_stats_fops	},
+	{ .name	=	"max_cached_mb",
+	  .fops	=	&ll_max_cached_mb_fops	},
+	{ .name	=	"statahead_stats",
+	  .fops	=	&ll_statahead_stats_fops	},
+	{ .name	=	"unstable_stats",
+	  .fops	=	&ll_unstable_stats_fops	},
+	{ .name	=	"sbi_flags",
+	  .fops	=	&ll_sbi_flags_fops	},
+	{ .name	=	"root_squash",
+	  .fops	=	&ll_root_squash_fops	},
+	{ .name	=	"nosquash_nids",
+	  .fops	=	&ll_nosquash_nids_fops	},
+	{ .name	=	"pcc",
+	  .fops	=	&ll_pcc_fops,	},
+#ifdef CONFIG_LL_ENCRYPTION
+	{ .name	=	"enable_filename_encryption",
+	  .fops	=	&ll_filename_enc_fops,	},
+	{ .name	=	"filename_enc_use_old_base64",
+	  .fops	=	&ll_old_b64_enc_fops,	},
+#endif
+	{ .name	=	"mdll_dir_restore_max_retry_count",
+	  .fops	=	&ll_mdll_dir_restore_max_retry_count_fops	},
+	{ NULL }
+};
+
+#define MAX_STRING_SIZE 128
+
+static struct attribute *llite_attrs[] = {
+	&lustre_attr_blocksize.attr,
+	&lustre_attr_stat_blocksize.attr,
+	&lustre_attr_kbytestotal.attr,
+	&lustre_attr_kbytesfree.attr,
+	&lustre_attr_kbytesavail.attr,
+	&lustre_attr_filestotal.attr,
+	&lustre_attr_filesfree.attr,
+	&lustre_attr_client_type.attr,
+	&lustre_attr_foreign_symlink_enable.attr,
+	&lustre_attr_foreign_symlink_prefix.attr,
+	&lustre_attr_foreign_symlink_upcall.attr,
+	&lustre_attr_foreign_symlink_upcall_info.attr,
+	&lustre_attr_fstype.attr,
+	&lustre_attr_uuid.attr,
+	&lustre_attr_checksums.attr,
+	&lustre_attr_checksum_pages.attr,
+	&lustre_attr_max_read_ahead_mb.attr,
+	&lustre_attr_max_read_ahead_per_file_mb.attr,
+	&lustre_attr_max_read_ahead_whole_mb.attr,
+	&lustre_attr_max_read_ahead_async_active.attr,
+	&lustre_attr_read_ahead_async_file_threshold_mb.attr,
+	&lustre_attr_read_ahead_range_kb.attr,
+	&lustre_attr_stats_track_pid.attr,
+	&lustre_attr_stats_track_ppid.attr,
+	&lustre_attr_stats_track_gid.attr,
+	&lustre_attr_statahead_running_max.attr,
+	&lustre_attr_statahead_max.attr,
+	&lustre_attr_statahead_agl.attr,
+	&lustre_attr_lazystatfs.attr,
+	&lustre_attr_statfs_max_age.attr,
+	&lustre_attr_max_easize.attr,
+	&lustre_attr_default_easize.attr,
+	&lustre_attr_xattr_cache.attr,
+	&lustre_attr_fast_read.attr,
+	&lustre_attr_tiny_write.attr,
+	&lustre_attr_neg_dentry_timeout.attr,
+	&lustre_attr_parallel_dio.attr,
+	&lustre_attr_file_heat.attr,
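+	/* every entry in this array becomes a file of the same name in the
+	 * per-mount llite kset directory registered by kset_register() in
+	 * ll_debugfs_register_super() below
+	 */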
&lustre_attr_heat_decay_percentage.attr, + &lustre_attr_heat_period_second.attr, + &lustre_attr_opencache_threshold_count.attr, + &lustre_attr_opencache_threshold_ms.attr, + &lustre_attr_opencache_max_ms.attr, + &lustre_attr_inode_cache.attr, + NULL, +}; + +KOBJ_ATTRIBUTE_GROUPS(llite); /* creates llite_groups */ + +static void sbi_kobj_release(struct kobject *kobj) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + complete(&sbi->ll_kobj_unregister); +} + +static struct kobj_type sbi_ktype = { + .default_groups = KOBJ_ATTR_GROUPS(llite), + .sysfs_ops = &lustre_sysfs_ops, + .release = sbi_kobj_release, +}; + +static const struct llite_file_opcode { + __u32 opcode; + __u32 type; + const char *opname; +} llite_opcode_table[LPROC_LL_FILE_OPCODES] = { + /* file operation */ + { LPROC_LL_READ_BYTES, LPROCFS_TYPE_BYTES_FULL, "read_bytes" }, + { LPROC_LL_WRITE_BYTES, LPROCFS_TYPE_BYTES_FULL, "write_bytes" }, + { LPROC_LL_READ, LPROCFS_TYPE_LATENCY, "read" }, + { LPROC_LL_WRITE, LPROCFS_TYPE_LATENCY, "write" }, + { LPROC_LL_IOCTL, LPROCFS_TYPE_REQS, "ioctl" }, + { LPROC_LL_OPEN, LPROCFS_TYPE_LATENCY, "open" }, + { LPROC_LL_RELEASE, LPROCFS_TYPE_LATENCY, "close" }, + { LPROC_LL_MMAP, LPROCFS_TYPE_LATENCY, "mmap" }, + { LPROC_LL_FAULT, LPROCFS_TYPE_LATENCY, "page_fault" }, + { LPROC_LL_MKWRITE, LPROCFS_TYPE_LATENCY, "page_mkwrite" }, + { LPROC_LL_LLSEEK, LPROCFS_TYPE_LATENCY, "seek" }, + { LPROC_LL_FSYNC, LPROCFS_TYPE_LATENCY, "fsync" }, + { LPROC_LL_READDIR, LPROCFS_TYPE_LATENCY, "readdir" }, + { LPROC_LL_INODE_OCOUNT,LPROCFS_TYPE_REQS | + LPROCFS_CNTR_AVGMINMAX | + LPROCFS_CNTR_STDDEV, "opencount" }, + { LPROC_LL_INODE_OPCLTM,LPROCFS_TYPE_LATENCY, "openclosetime" }, + /* inode operation */ + { LPROC_LL_SETATTR, LPROCFS_TYPE_LATENCY, "setattr" }, + { LPROC_LL_TRUNC, LPROCFS_TYPE_LATENCY, "truncate" }, + { LPROC_LL_FLOCK, LPROCFS_TYPE_LATENCY, "flock" }, + { LPROC_LL_GETATTR, LPROCFS_TYPE_LATENCY, "getattr" }, + { LPROC_LL_FALLOCATE, LPROCFS_TYPE_LATENCY, "fallocate"}, + /* dir inode operation */ + { LPROC_LL_CREATE, LPROCFS_TYPE_LATENCY, "create" }, + { LPROC_LL_LINK, LPROCFS_TYPE_LATENCY, "link" }, + { LPROC_LL_UNLINK, LPROCFS_TYPE_LATENCY, "unlink" }, + { LPROC_LL_SYMLINK, LPROCFS_TYPE_LATENCY, "symlink" }, + { LPROC_LL_MKDIR, LPROCFS_TYPE_LATENCY, "mkdir" }, + { LPROC_LL_RMDIR, LPROCFS_TYPE_LATENCY, "rmdir" }, + { LPROC_LL_MKNOD, LPROCFS_TYPE_LATENCY, "mknod" }, + { LPROC_LL_RENAME, LPROCFS_TYPE_LATENCY, "rename" }, + /* special inode operation */ + { LPROC_LL_STATFS, LPROCFS_TYPE_LATENCY, "statfs" }, + { LPROC_LL_SETXATTR, LPROCFS_TYPE_LATENCY, "setxattr" }, + { LPROC_LL_GETXATTR, LPROCFS_TYPE_LATENCY, "getxattr" }, + { LPROC_LL_GETXATTR_HITS, LPROCFS_TYPE_REQS, "getxattr_hits" }, + { LPROC_LL_LISTXATTR, LPROCFS_TYPE_LATENCY, "listxattr" }, + { LPROC_LL_REMOVEXATTR, LPROCFS_TYPE_LATENCY, "removexattr" }, + { LPROC_LL_INODE_PERM, LPROCFS_TYPE_LATENCY, "inode_permission" }, +}; + +void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, long count) +{ + if (!sbi->ll_stats) + return; + + if (sbi->ll_stats_track_type == STATS_TRACK_ALL) + lprocfs_counter_add(sbi->ll_stats, op, count); + else if (sbi->ll_stats_track_type == STATS_TRACK_PID && + sbi->ll_stats_track_id == current->pid) + lprocfs_counter_add(sbi->ll_stats, op, count); + else if (sbi->ll_stats_track_type == STATS_TRACK_PPID && + sbi->ll_stats_track_id == current->real_parent->pid) + lprocfs_counter_add(sbi->ll_stats, op, count); + else if (sbi->ll_stats_track_type == STATS_TRACK_GID && + 
sbi->ll_stats_track_id == + from_kgid(&init_user_ns, current_gid())) + lprocfs_counter_add(sbi->ll_stats, op, count); +} +EXPORT_SYMBOL(ll_stats_ops_tally); + +static const char *const ra_stat_string[] = { + [RA_STAT_HIT] = "hits", + [RA_STAT_MISS] = "misses", + [RA_STAT_DISTANT_READPAGE] = "readpage_not_consecutive", + [RA_STAT_MISS_IN_WINDOW] = "miss_inside_window", + [RA_STAT_FAILED_GRAB_PAGE] = "failed_grab_cache_page", + [RA_STAT_FAILED_MATCH] = "failed_lock_match", + [RA_STAT_DISCARDED] = "read_but_discarded", + [RA_STAT_ZERO_LEN] = "zero_length_file", + [RA_STAT_ZERO_WINDOW] = "zero_size_window", + [RA_STAT_EOF] = "readahead_to_eof", + [RA_STAT_MAX_IN_FLIGHT] = "hit_max_readahead_issue", + [RA_STAT_WRONG_GRAB_PAGE] = "wrong_page_from_grab_cache_page", + [RA_STAT_FAILED_REACH_END] = "failed_to_reach_end", + [RA_STAT_ASYNC] = "async_readahead", + [RA_STAT_FAILED_FAST_READ] = "failed_to_fast_read", + [RA_STAT_MMAP_RANGE_READ] = "mmap_range_read", +}; + +int ll_debugfs_register_super(struct super_block *sb, const char *name) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct ll_sb_info *sbi = ll_s2sbi(sb); + int err, id; + + ENTRY; + LASSERT(sbi); + + if (IS_ERR_OR_NULL(llite_root)) + goto out_ll_kset; + + sbi->ll_debugfs_entry = debugfs_create_dir(name, llite_root); + ldebugfs_add_vars(sbi->ll_debugfs_entry, lprocfs_llite_obd_vars, sb); + + debugfs_create_file("dump_page_cache", 0444, sbi->ll_debugfs_entry, sbi, + &vvp_dump_pgcache_file_ops); + + debugfs_create_file("extents_stats", 0644, sbi->ll_debugfs_entry, sbi, + &ll_rw_extents_stats_fops); + + debugfs_create_file("extents_stats_per_process", 0644, + sbi->ll_debugfs_entry, sbi, + &ll_rw_extents_stats_pp_fops); + + debugfs_create_file("offset_stats", 0644, sbi->ll_debugfs_entry, sbi, + &ll_rw_offset_stats_fops); + + /* File operations stats */ + sbi->ll_stats = lprocfs_alloc_stats(LPROC_LL_FILE_OPCODES, + LPROCFS_STATS_FLAG_NONE); + if (sbi->ll_stats == NULL) + GOTO(out_debugfs, err = -ENOMEM); + + /* do counter init */ + for (id = 0; id < LPROC_LL_FILE_OPCODES; id++) { + u32 type = llite_opcode_table[id].type; + void *ptr = "unknown"; + + if (type & LPROCFS_TYPE_REQS) + ptr = "reqs"; + else if (type & LPROCFS_TYPE_BYTES) + ptr = "bytes"; + else if (type & LPROCFS_TYPE_USEC) + ptr = "usec"; + lprocfs_counter_init(sbi->ll_stats, + llite_opcode_table[id].opcode, type, + llite_opcode_table[id].opname, ptr); + } + + debugfs_create_file("stats", 0644, sbi->ll_debugfs_entry, + sbi->ll_stats, &ldebugfs_stats_seq_fops); + + sbi->ll_ra_stats = lprocfs_alloc_stats(ARRAY_SIZE(ra_stat_string), + LPROCFS_STATS_FLAG_NONE); + if (sbi->ll_ra_stats == NULL) + GOTO(out_stats, err = -ENOMEM); + + for (id = 0; id < ARRAY_SIZE(ra_stat_string); id++) + lprocfs_counter_init(sbi->ll_ra_stats, id, 0, + ra_stat_string[id], "pages"); + + debugfs_create_file("read_ahead_stats", 0644, sbi->ll_debugfs_entry, + sbi->ll_ra_stats, &ldebugfs_stats_seq_fops); + +out_ll_kset: + /* Yes we also register sysfs mount kset here as well */ + sbi->ll_kset.kobj.parent = llite_kobj; + sbi->ll_kset.kobj.ktype = &sbi_ktype; + init_completion(&sbi->ll_kobj_unregister); + err = kobject_set_name(&sbi->ll_kset.kobj, "%s", name); + if (err) + GOTO(out_ra_stats, err); + + err = kset_register(&sbi->ll_kset); + if (err) + GOTO(out_ra_stats, err); + + lsi->lsi_kobj = kobject_get(&sbi->ll_kset.kobj); + + RETURN(0); +out_ra_stats: + lprocfs_free_stats(&sbi->ll_ra_stats); +out_stats: + lprocfs_free_stats(&sbi->ll_stats); +out_debugfs: + debugfs_remove_recursive(sbi->ll_debugfs_entry); + 
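+	/* error unwind runs in reverse order of setup; the kset was not
+	 * successfully registered on any path that lands here, so there is
+	 * no sysfs state left to tear down
+	 */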
+ RETURN(err); +} + +void ll_debugfs_unregister_super(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct ll_sb_info *sbi = ll_s2sbi(sb); + + debugfs_remove_recursive(sbi->ll_debugfs_entry); + + if (sbi->ll_dt_obd) + sysfs_remove_link(&sbi->ll_kset.kobj, + sbi->ll_dt_obd->obd_type->typ_name); + + if (sbi->ll_md_obd) + sysfs_remove_link(&sbi->ll_kset.kobj, + sbi->ll_md_obd->obd_type->typ_name); + + kobject_put(lsi->lsi_kobj); + + kset_unregister(&sbi->ll_kset); + wait_for_completion(&sbi->ll_kobj_unregister); + + lprocfs_free_stats(&sbi->ll_ra_stats); + lprocfs_free_stats(&sbi->ll_stats); +} +#undef MAX_STRING_SIZE + +static void ll_display_extents_info(struct ll_rw_extents_info *rw_extents, + struct seq_file *seq, int which) +{ + unsigned long read_tot = 0, write_tot = 0, read_cum, write_cum; + unsigned long start, end, r, w; + char *unitp = "KMGTPEZY"; + int i, units = 10; + struct per_process_info *pp_info; + + pp_info = &rw_extents->pp_extents[which]; + read_cum = 0; + write_cum = 0; + start = 0; + + for (i = 0; i < LL_HIST_MAX; i++) { + read_tot += pp_info->pp_r_hist.oh_buckets[i]; + write_tot += pp_info->pp_w_hist.oh_buckets[i]; + } + + for (i = 0; i < LL_HIST_MAX; i++) { + r = pp_info->pp_r_hist.oh_buckets[i]; + w = pp_info->pp_w_hist.oh_buckets[i]; + read_cum += r; + write_cum += w; + end = 1 << (i + LL_HIST_START - units); + seq_printf(seq, "%4lu%c - %4lu%c%c: %14lu %4u %4u | " + "%14lu %4u %4u\n", start, *unitp, end, *unitp, + (i == LL_HIST_MAX - 1) ? '+' : ' ', + r, pct(r, read_tot), pct(read_cum, read_tot), + w, pct(w, write_tot), pct(write_cum, write_tot)); + start = end; + if (start == (1 << 10)) { + start = 1; + units += 10; + unitp++; + } + if (read_cum == read_tot && write_cum == write_tot) + break; + } +} + +static int ll_rw_extents_stats_pp_seq_show(struct seq_file *seq, void *v) +{ + struct ll_sb_info *sbi = seq->private; + struct ll_rw_extents_info *rw_extents = sbi->ll_rw_extents_info; + int k; + + if (!sbi->ll_rw_stats_on || !rw_extents) { + seq_puts(seq, "disabled\n write anything to this file to activate, then '0' or 'disable' to deactivate\n"); + return 0; + } + + spin_lock(&sbi->ll_pp_extent_lock); + lprocfs_stats_header(seq, ktime_get_real(), rw_extents->pp_init, 25, + ":", true, ""); + seq_printf(seq, "%15s %19s | %20s\n", " ", "read", "write"); + seq_printf(seq, "%13s %14s %4s %4s | %14s %4s %4s\n", + "extents", "calls", "%", "cum%", "calls", "%", "cum%"); + + for (k = 0; k < LL_PROCESS_HIST_MAX; k++) { + if (rw_extents->pp_extents[k].pid != 0) { + seq_printf(seq, "\nPID: %d\n", + rw_extents->pp_extents[k].pid); + ll_display_extents_info(rw_extents, seq, k); + } + } + spin_unlock(&sbi->ll_pp_extent_lock); + return 0; +} + +static int alloc_rw_stats_info(struct ll_sb_info *sbi) +{ + struct ll_rw_extents_info *rw_extents; + struct ll_rw_process_info *offset; + struct ll_rw_process_info *process; + int i, rc = 0; + + OBD_ALLOC(rw_extents, sizeof(*rw_extents)); + if (!rw_extents) + return -ENOMEM; + + for (i = 0; i <= LL_PROCESS_HIST_MAX; i++) { + spin_lock_init(&rw_extents->pp_extents[i].pp_r_hist.oh_lock); + spin_lock_init(&rw_extents->pp_extents[i].pp_w_hist.oh_lock); + } + rw_extents->pp_init = ktime_get_real(); + + spin_lock(&sbi->ll_pp_extent_lock); + if (!sbi->ll_rw_extents_info) + sbi->ll_rw_extents_info = rw_extents; + spin_unlock(&sbi->ll_pp_extent_lock); + /* another writer allocated the struct before we got the lock */ + if (sbi->ll_rw_extents_info != rw_extents) + OBD_FREE(rw_extents, sizeof(*rw_extents)); + + 
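+	/* same allocate-then-publish pattern as above: allocate without the
+	 * lock held, install the pointer under the lock, and free our copy
+	 * if a racing writer already installed one
+	 */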
OBD_ALLOC(process, sizeof(*process) * LL_PROCESS_HIST_MAX); + if (!process) + GOTO(out, rc = -ENOMEM); + OBD_ALLOC(offset, sizeof(*offset) * LL_OFFSET_HIST_MAX); + if (!offset) + GOTO(out_free, rc = -ENOMEM); + + spin_lock(&sbi->ll_process_lock); + if (!sbi->ll_rw_process_info) + sbi->ll_rw_process_info = process; + if (!sbi->ll_rw_offset_info) + sbi->ll_rw_offset_info = offset; + spin_unlock(&sbi->ll_process_lock); + sbi->ll_process_stats_init = ktime_get_real(); + + /* another writer allocated the structs before we got the lock */ + if (sbi->ll_rw_offset_info != offset) + OBD_FREE(offset, sizeof(*offset) * LL_OFFSET_HIST_MAX); + if (sbi->ll_rw_process_info != process) { +out_free: + OBD_FREE(process, sizeof(*process) * LL_PROCESS_HIST_MAX); + } + +out: + return rc; +} + +void ll_free_rw_stats_info(struct ll_sb_info *sbi) +{ + if (sbi->ll_rw_extents_info) { + OBD_FREE(sbi->ll_rw_extents_info, + sizeof(*sbi->ll_rw_extents_info)); + sbi->ll_rw_extents_info = NULL; + } + if (sbi->ll_rw_offset_info) { + OBD_FREE(sbi->ll_rw_offset_info, + sizeof(*sbi->ll_rw_offset_info) * LL_OFFSET_HIST_MAX); + sbi->ll_rw_offset_info = NULL; + } + if (sbi->ll_rw_process_info) { + OBD_FREE(sbi->ll_rw_process_info, + sizeof(*sbi->ll_rw_process_info) * LL_PROCESS_HIST_MAX); + sbi->ll_rw_process_info = NULL; + } +} + +static ssize_t ll_rw_extents_stats_pp_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct ll_sb_info *sbi = seq->private; + struct ll_rw_extents_info *rw_extents; + int i; + __s64 value; + + if (len == 0) + return -EINVAL; + + value = ll_stats_pid_write(buf, len); + + if (value == 0) { + sbi->ll_rw_stats_on = 0; + } else { + if (!sbi->ll_rw_extents_info) { + int rc = alloc_rw_stats_info(sbi); + + if (rc) + return rc; + } + sbi->ll_rw_stats_on = 1; + } + + + spin_lock(&sbi->ll_pp_extent_lock); + rw_extents = sbi->ll_rw_extents_info; + if (rw_extents) { + rw_extents->pp_init = ktime_get_real(); + for (i = 0; i < LL_PROCESS_HIST_MAX; i++) { + rw_extents->pp_extents[i].pid = 0; + lprocfs_oh_clear(&rw_extents->pp_extents[i].pp_r_hist); + lprocfs_oh_clear(&rw_extents->pp_extents[i].pp_w_hist); + } + } + spin_unlock(&sbi->ll_pp_extent_lock); + + return len; +} + +LDEBUGFS_SEQ_FOPS(ll_rw_extents_stats_pp); + +static int ll_rw_extents_stats_seq_show(struct seq_file *seq, void *v) +{ + struct ll_sb_info *sbi = seq->private; + struct ll_rw_extents_info *rw_extents = sbi->ll_rw_extents_info; + + if (!sbi->ll_rw_stats_on || !rw_extents) { + seq_puts(seq, "disabled\n write anything to this file to activate, then '0' or 'disable' to deactivate\n"); + return 0; + } + + spin_lock(&sbi->ll_lock); + lprocfs_stats_header(seq, ktime_get_real(), rw_extents->pp_init, 25, + ":", true, ""); + + seq_printf(seq, "%15s %19s | %20s\n", " ", "read", "write"); + seq_printf(seq, "%13s %14s %4s %4s | %14s %4s %4s\n", + "extents", "calls", "%", "cum%", + "calls", "%", "cum%"); + + ll_display_extents_info(rw_extents, seq, LL_PROCESS_HIST_MAX); + spin_unlock(&sbi->ll_lock); + + return 0; +} + +static ssize_t ll_rw_extents_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct ll_sb_info *sbi = seq->private; + struct ll_rw_extents_info *rw_extents; + int i; + __s64 value; + + if (len == 0) + return -EINVAL; + + value = ll_stats_pid_write(buf, len); + + if (value == 0) { + sbi->ll_rw_stats_on = 0; + } else { + if (!sbi->ll_rw_extents_info) { + int rc = 
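+			/* stats buffers are allocated lazily, on the
+			 * first write that enables collection
+			 */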
alloc_rw_stats_info(sbi); + + if (rc) + return rc; + } + sbi->ll_rw_stats_on = 1; + } + + spin_lock(&sbi->ll_pp_extent_lock); + rw_extents = sbi->ll_rw_extents_info; + if (rw_extents) { + rw_extents->pp_init = ktime_get_real(); + for (i = 0; i <= LL_PROCESS_HIST_MAX; i++) { + rw_extents->pp_extents[i].pid = 0; + lprocfs_oh_clear(&rw_extents->pp_extents[i].pp_r_hist); + lprocfs_oh_clear(&rw_extents->pp_extents[i].pp_w_hist); + } + } + spin_unlock(&sbi->ll_pp_extent_lock); + + return len; +} + +LDEBUGFS_SEQ_FOPS(ll_rw_extents_stats); + +void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, + struct ll_file_data *file, loff_t pos, + size_t count, int rw) +{ + int i, cur = -1; + struct ll_rw_process_info *process; + struct ll_rw_process_info *offset; + int *off_count = &sbi->ll_rw_offset_entry_count; + int *process_count = &sbi->ll_offset_process_count; + struct ll_rw_extents_info *rw_extents; + + if (!sbi->ll_rw_stats_on) + return; + + spin_lock(&sbi->ll_pp_extent_lock); + rw_extents = sbi->ll_rw_extents_info; + if (!rw_extents) { + spin_unlock(&sbi->ll_pp_extent_lock); + return; + } + + /* Extent statistics */ + for (i = 0; i < LL_PROCESS_HIST_MAX; i++) { + if (rw_extents->pp_extents[i].pid == pid) { + cur = i; + break; + } + } + + if (cur == -1) { + /* new process */ + sbi->ll_extent_process_count = + (sbi->ll_extent_process_count + 1) % LL_PROCESS_HIST_MAX; + cur = sbi->ll_extent_process_count; + rw_extents->pp_extents[cur].pid = pid; + lprocfs_oh_clear(&rw_extents->pp_extents[cur].pp_r_hist); + lprocfs_oh_clear(&rw_extents->pp_extents[cur].pp_w_hist); + } + + for (i = 0; (count >= 1 << (LL_HIST_START + i)) && + (i < (LL_HIST_MAX - 1)); i++); + if (rw == 0) { + rw_extents->pp_extents[cur].pp_r_hist.oh_buckets[i]++; + rw_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_r_hist.oh_buckets[i]++; + } else { + rw_extents->pp_extents[cur].pp_w_hist.oh_buckets[i]++; + rw_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_w_hist.oh_buckets[i]++; + } + spin_unlock(&sbi->ll_pp_extent_lock); + + spin_lock(&sbi->ll_process_lock); + process = sbi->ll_rw_process_info; + offset = sbi->ll_rw_offset_info; + if (!process || !offset) + goto out_unlock; + + /* Offset statistics */ + for (i = 0; i < LL_PROCESS_HIST_MAX; i++) { + if (process[i].rw_pid == pid) { + if (process[i].rw_last_file != file) { + process[i].rw_range_start = pos; + process[i].rw_last_file_pos = pos + count; + process[i].rw_smallest_extent = count; + process[i].rw_largest_extent = count; + process[i].rw_offset = 0; + process[i].rw_last_file = file; + goto out_unlock; + } + if (process[i].rw_last_file_pos != pos) { + *off_count = + (*off_count + 1) % LL_OFFSET_HIST_MAX; + offset[*off_count].rw_op = process[i].rw_op; + offset[*off_count].rw_pid = pid; + offset[*off_count].rw_range_start = + process[i].rw_range_start; + offset[*off_count].rw_range_end = + process[i].rw_last_file_pos; + offset[*off_count].rw_smallest_extent = + process[i].rw_smallest_extent; + offset[*off_count].rw_largest_extent = + process[i].rw_largest_extent; + offset[*off_count].rw_offset = + process[i].rw_offset; + process[i].rw_op = rw; + process[i].rw_range_start = pos; + process[i].rw_smallest_extent = count; + process[i].rw_largest_extent = count; + process[i].rw_offset = pos - + process[i].rw_last_file_pos; + } + if (process[i].rw_smallest_extent > count) + process[i].rw_smallest_extent = count; + if (process[i].rw_largest_extent < count) + process[i].rw_largest_extent = count; + process[i].rw_last_file_pos = pos + count; + goto out_unlock; + } + } + *process_count = 
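+	/* pid not found above: advance the cursor and recycle the oldest
+	 * slot, so the table behaves as a ring of LL_PROCESS_HIST_MAX
+	 * entries
+	 */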
(*process_count + 1) % LL_PROCESS_HIST_MAX; + process[*process_count].rw_pid = pid; + process[*process_count].rw_op = rw; + process[*process_count].rw_range_start = pos; + process[*process_count].rw_last_file_pos = pos + count; + process[*process_count].rw_smallest_extent = count; + process[*process_count].rw_largest_extent = count; + process[*process_count].rw_offset = 0; + process[*process_count].rw_last_file = file; + +out_unlock: + spin_unlock(&sbi->ll_process_lock); +} + +static int ll_rw_offset_stats_seq_show(struct seq_file *seq, void *v) +{ + struct ll_sb_info *sbi = seq->private; + struct ll_rw_process_info *offset; + struct ll_rw_process_info *process; + int i; + + if (!sbi->ll_rw_stats_on) { + seq_puts(seq, "disabled\n write anything to this file to activate, then '0' or 'disable' to deactivate\n"); + return 0; + } + + spin_lock(&sbi->ll_process_lock); + lprocfs_stats_header(seq, ktime_get_real(), sbi->ll_process_stats_init, + 25, ":", true, ""); + seq_printf(seq, "%3s %10s %14s %14s %17s %17s %14s\n", + "R/W", "PID", "RANGE START", "RANGE END", + "SMALLEST EXTENT", "LARGEST EXTENT", "OFFSET"); + + /* We stored the discontiguous offsets here; print them first */ + offset = sbi->ll_rw_offset_info; + for (i = 0; offset && i < LL_OFFSET_HIST_MAX; i++) { + if (offset[i].rw_pid != 0) + seq_printf(seq, + "%3c %10d %14llu %14llu %17lu %17lu %14lld\n", + offset[i].rw_op == READ ? 'R' : 'W', + offset[i].rw_pid, + offset[i].rw_range_start, + offset[i].rw_range_end, + (unsigned long)offset[i].rw_smallest_extent, + (unsigned long)offset[i].rw_largest_extent, + offset[i].rw_offset); + } + + /* Then print the current offsets for each process */ + process = sbi->ll_rw_process_info; + for (i = 0; process && i < LL_PROCESS_HIST_MAX; i++) { + if (process[i].rw_pid != 0) + seq_printf(seq, + "%3c %10d %14llu %14llu %17lu %17lu %14lld\n", + process[i].rw_op == READ ? 'R' : 'W', + process[i].rw_pid, + process[i].rw_range_start, + process[i].rw_last_file_pos, + (unsigned long)process[i].rw_smallest_extent, + (unsigned long)process[i].rw_largest_extent, + process[i].rw_offset); + } + spin_unlock(&sbi->ll_process_lock); + + return 0; +} + +static ssize_t ll_rw_offset_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct ll_sb_info *sbi = seq->private; + __s64 value; + + if (len == 0) + return -EINVAL; + + value = ll_stats_pid_write(buf, len); + + if (value == 0) { + sbi->ll_rw_stats_on = 0; + } else { + if (!sbi->ll_rw_process_info || !sbi->ll_rw_offset_info) { + int rc = alloc_rw_stats_info(sbi); + + if (rc) + return rc; + } + sbi->ll_rw_stats_on = 1; + } + + spin_lock(&sbi->ll_process_lock); + sbi->ll_offset_process_count = 0; + sbi->ll_rw_offset_entry_count = 0; + sbi->ll_process_stats_init = ktime_get_real(); + if (sbi->ll_rw_process_info) + memset(sbi->ll_rw_process_info, 0, + sizeof(struct ll_rw_process_info) * LL_PROCESS_HIST_MAX); + if (sbi->ll_rw_offset_info) + memset(sbi->ll_rw_offset_info, 0, + sizeof(struct ll_rw_process_info) * LL_OFFSET_HIST_MAX); + spin_unlock(&sbi->ll_process_lock); + + return len; +} + +LDEBUGFS_SEQ_FOPS(ll_rw_offset_stats); diff --git a/drivers/staging/lustrefsx/lustre/llite/namei.c b/drivers/staging/lustrefsx/lustre/llite/namei.c new file mode 100644 index 0000000000000..5af9ab7477a93 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/namei.c @@ -0,0 +1,2222 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include
+#include
+#include
+#include "llite_internal.h"
+
+#ifndef HAVE_USER_NAMESPACE_ARG
+#define ll_create_nd(ns, dir, de, mode, ex) ll_create_nd(dir, de, mode, ex)
+#define ll_mkdir(ns, dir, dch, mode) ll_mkdir(dir, dch, mode)
+#define ll_mknod(ns, dir, dch, mode, rd) ll_mknod(dir, dch, mode, rd)
+#ifdef HAVE_IOPS_RENAME_WITH_FLAGS
+#define ll_rename(ns, src, sdc, tgt, tdc, fl) ll_rename(src, sdc, tgt, tdc, fl)
+#else
+#define ll_rename(ns, src, sdc, tgt, tdc) ll_rename(src, sdc, tgt, tdc)
+#endif /* HAVE_IOPS_RENAME_WITH_FLAGS */
+#define ll_symlink(nd, dir, dch, old) ll_symlink(dir, dch, old)
+#endif
+
+static int ll_create_it(struct inode *dir, struct dentry *dentry,
+			struct lookup_intent *it,
+			void *secctx, __u32 secctxlen, bool encrypt,
+			void *encctx, __u32 encctxlen, unsigned int open_flags);
+
+/* called from iget5_locked->find_inode() under inode_lock spinlock */
+static int ll_test_inode(struct inode *inode, void *opaque)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct lustre_md *md = opaque;
+
+	if (unlikely(!(md->body->mbo_valid & OBD_MD_FLID))) {
+		CERROR("MDS body missing FID\n");
+		return 0;
+	}
+
+	if (!lu_fid_eq(&lli->lli_fid, &md->body->mbo_fid1))
+		return 0;
+
+	return 1;
+}
+
+static int ll_set_inode(struct inode *inode, void *opaque)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct mdt_body *body = ((struct lustre_md *)opaque)->body;
+
+	if (unlikely(!(body->mbo_valid & OBD_MD_FLID))) {
+		CERROR("MDS body missing FID\n");
+		return -EINVAL;
+	}
+
+	lli->lli_fid = body->mbo_fid1;
+	if (unlikely(!(body->mbo_valid & OBD_MD_FLTYPE))) {
+		CERROR("Cannot initialize inode "DFID" without object type: "
+		       "valid = %#llx\n",
+		       PFID(&lli->lli_fid), body->mbo_valid);
+		return -EINVAL;
+	}
+
+	inode->i_mode = (inode->i_mode & ~S_IFMT) | (body->mbo_mode & S_IFMT);
+	if (unlikely(inode->i_mode == 0)) {
+		CERROR("Invalid inode "DFID" type\n", PFID(&lli->lli_fid));
+		return -EINVAL;
+	}
+
+	ll_lli_init(lli);
+
+	return 0;
+}
+
+
+/**
+ * Get an inode by inode number (@hash), which was already instantiated by
+ * the intent lookup.
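+ *
+ * The @md metadata is handed to iget5_locked(), which uses ll_test_inode()
+ * to match an existing inode by FID and ll_set_inode() to initialize a
+ * freshly allocated one.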
+ */ +struct inode *ll_iget(struct super_block *sb, ino_t hash, + struct lustre_md *md) +{ + struct inode *inode; + int rc = 0; + + ENTRY; + + LASSERT(hash != 0); + inode = iget5_locked(sb, hash, ll_test_inode, ll_set_inode, md); + if (inode == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + if (inode->i_state & I_NEW) { + rc = ll_read_inode2(inode, md); + if (rc == 0 && S_ISREG(inode->i_mode) && + ll_i2info(inode)->lli_clob == NULL) + rc = cl_file_inode_init(inode, md); + + if (rc != 0) { + /* Let's clear directory lsm here, otherwise + * make_bad_inode() will reset the inode mode + * to regular, then ll_clear_inode will not + * be able to clear lsm_md */ + if (S_ISDIR(inode->i_mode)) + ll_dir_clear_lsm_md(inode); + make_bad_inode(inode); + unlock_new_inode(inode); + iput(inode); + inode = ERR_PTR(rc); + } else { + inode_has_no_xattr(inode); + unlock_new_inode(inode); + } + } else if (is_bad_inode(inode)) { + iput(inode); + inode = ERR_PTR(-ESTALE); + } else if (!(inode->i_state & (I_FREEING | I_CLEAR))) { + rc = ll_update_inode(inode, md); + CDEBUG(D_VFSTRACE, "got inode: "DFID"(%p): rc = %d\n", + PFID(&md->body->mbo_fid1), inode, rc); + if (rc != 0) { + if (S_ISDIR(inode->i_mode)) + ll_dir_clear_lsm_md(inode); + iput(inode); + inode = ERR_PTR(rc); + } + } + + RETURN(inode); +} + +/* mark negative sub file dentries invalid and prune unused dentries */ +static void ll_prune_negative_children(struct inode *dir) +{ + struct dentry *dentry; + struct dentry *child; + + ENTRY; + +restart: + spin_lock(&dir->i_lock); + hlist_for_each_entry(dentry, &dir->i_dentry, d_alias) { + spin_lock(&dentry->d_lock); + list_for_each_entry(child, &dentry->d_subdirs, d_child) { + if (child->d_inode) + continue; + + spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); + if (lld_is_init(child)) + ll_d2d(child)->lld_invalid = 1; + if (!ll_d_count(child)) { + dget_dlock(child); + __d_drop(child); + spin_unlock(&child->d_lock); + spin_unlock(&dentry->d_lock); + spin_unlock(&dir->i_lock); + + CDEBUG(D_DENTRY, "prune negative dentry %pd\n", + child); + + dput(child); + goto restart; + } + spin_unlock(&child->d_lock); + } + spin_unlock(&dentry->d_lock); + } + spin_unlock(&dir->i_lock); + + EXIT; +} + +int ll_test_inode_by_fid(struct inode *inode, void *opaque) +{ + return lu_fid_eq(&ll_i2info(inode)->lli_fid, opaque); +} + +static int ll_dom_lock_cancel(struct inode *inode, struct ldlm_lock *lock) +{ + struct lu_env *env; + struct ll_inode_info *lli = ll_i2info(inode); + __u16 refcheck; + int rc; + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_REPLAY_PAUSE, cfs_fail_val); + + /* reach MDC layer to flush data under the DoM ldlm lock */ + rc = cl_object_flush(env, lli->lli_clob, lock); + if (rc == -ENODATA) { + CDEBUG(D_INODE, "inode "DFID" layout has no DoM stripe\n", + PFID(ll_inode2fid(inode))); + /* most likely result of layout change, do nothing */ + rc = 0; + } + + cl_env_put(env, &refcheck); + RETURN(rc); +} + +static void ll_lock_cancel_bits(struct ldlm_lock *lock, __u64 to_cancel) +{ + struct inode *inode = ll_inode_from_resource_lock(lock); + struct ll_inode_info *lli; + __u64 bits = to_cancel; + int rc; + + ENTRY; + + if (!inode) { + /* That means the inode is evicted most likely and may cause + * the skipping of lock cleanups below, so print the message + * about that in log. + */ + if (lock->l_resource->lr_lvb_inode) + LDLM_DEBUG(lock, + "can't take inode for the lock (%sevicted)\n", + lock->l_resource->lr_lvb_inode->i_state & + I_FREEING ? 
"" : "not "); + RETURN_EXIT; + } + + if (!fid_res_name_eq(ll_inode2fid(inode), + &lock->l_resource->lr_name)) { + LDLM_ERROR(lock, "data mismatch with object "DFID"(%p)", + PFID(ll_inode2fid(inode)), inode); + LBUG(); + } + + if (bits & MDS_INODELOCK_XATTR) { + ll_xattr_cache_empty(inode); + bits &= ~MDS_INODELOCK_XATTR; + } + + /* For OPEN locks we differentiate between lock modes + * LCK_CR, LCK_CW, LCK_PR - bug 22891 */ + if (bits & MDS_INODELOCK_OPEN) + ll_have_md_lock(inode, &bits, lock->l_req_mode); + + if (bits & MDS_INODELOCK_OPEN) { + fmode_t fmode; + + switch (lock->l_req_mode) { + case LCK_CW: + fmode = FMODE_WRITE; + break; + case LCK_PR: + fmode = FMODE_EXEC; + break; + case LCK_CR: + fmode = FMODE_READ; + break; + default: + LDLM_ERROR(lock, "bad lock mode for OPEN lock"); + LBUG(); + } + + ll_md_real_close(inode, fmode); + + bits &= ~MDS_INODELOCK_OPEN; + } + + if (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE | + MDS_INODELOCK_LAYOUT | MDS_INODELOCK_PERM | + MDS_INODELOCK_DOM)) + ll_have_md_lock(inode, &bits, LCK_MINMODE); + + if (bits & MDS_INODELOCK_DOM) { + rc = ll_dom_lock_cancel(inode, lock); + if (rc < 0) + CDEBUG(D_INODE, "cannot flush DoM data " + DFID": rc = %d\n", + PFID(ll_inode2fid(inode)), rc); + } + + if (bits & MDS_INODELOCK_LAYOUT) { + struct cl_object_conf conf = { + .coc_opc = OBJECT_CONF_INVALIDATE, + .coc_inode = inode, + }; + + rc = ll_layout_conf(inode, &conf); + if (rc < 0) + CDEBUG(D_INODE, "cannot invalidate layout of " + DFID": rc = %d\n", + PFID(ll_inode2fid(inode)), rc); + } + + lli = ll_i2info(inode); + + if (bits & MDS_INODELOCK_UPDATE) + set_bit(LLIF_UPDATE_ATIME, &lli->lli_flags); + + if ((bits & MDS_INODELOCK_UPDATE) && S_ISDIR(inode->i_mode)) { + CDEBUG(D_INODE, "invalidating inode "DFID" lli = %p, " + "pfid = "DFID"\n", PFID(ll_inode2fid(inode)), + lli, PFID(&lli->lli_pfid)); + truncate_inode_pages(inode->i_mapping, 0); + + if (unlikely(!fid_is_zero(&lli->lli_pfid))) { + struct inode *master_inode = NULL; + unsigned long hash; + + /* This is slave inode, since all of the child dentry + * is connected on the master inode, so we have to + * invalidate the negative children on master inode */ + CDEBUG(D_INODE, "Invalidate s"DFID" m"DFID"\n", + PFID(ll_inode2fid(inode)), PFID(&lli->lli_pfid)); + + hash = cl_fid_build_ino(&lli->lli_pfid, + ll_need_32bit_api(ll_i2sbi(inode))); + + /* Do not lookup the inode with ilookup5, otherwise + * it will cause dead lock, + * 1. Client1 send chmod req to the MDT0, then on MDT0, + * it enqueues master and all of its slaves lock, + * (mdt_attr_set() -> mdt_lock_slaves()), after gets + * master and stripe0 lock, it will send the enqueue + * req (for stripe1) to MDT1, then MDT1 finds the lock + * has been granted to client2. Then MDT1 sends blocking + * ast to client2. + * 2. At the same time, client2 tries to unlink + * the striped dir (rm -rf striped_dir), and during + * lookup, it will hold the master inode of the striped + * directory, whose inode state is NEW, then tries to + * revalidate all of its slaves, (ll_prep_inode()-> + * ll_iget()->ll_read_inode2()-> ll_update_inode().). + * And it will be blocked on the server side because + * of 1. + * 3. Then the client get the blocking_ast req, cancel + * the lock, but being blocked if using ->ilookup5()), + * because master inode state is NEW. 
+			 */
+			master_inode = ilookup5_nowait(inode->i_sb, hash,
+							ll_test_inode_by_fid,
+							(void *)&lli->lli_pfid);
+			if (master_inode) {
+				ll_prune_negative_children(master_inode);
+				iput(master_inode);
+			}
+		} else {
+			ll_prune_negative_children(inode);
+		}
+	}
+
+	/* at umount s_root becomes NULL */
+	if ((bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM)) &&
+	    inode->i_sb->s_root && !is_root_inode(inode))
+		ll_prune_aliases(inode);
+
+	if (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM))
+		forget_all_cached_acls(inode);
+
+	iput(inode);
+	RETURN_EXIT;
+}
+
+/* Check whether the given lock may be downgraded instead of cancelled, and
+ * whether a convert is really needed. */
+int ll_md_need_convert(struct ldlm_lock *lock)
+{
+	struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+	struct inode *inode;
+	__u64 wanted = lock->l_policy_data.l_inodebits.cancel_bits;
+	__u64 bits = lock->l_policy_data.l_inodebits.bits & ~wanted;
+	enum ldlm_mode mode = LCK_MINMODE;
+
+	if (!lock->l_conn_export ||
+	    !exp_connect_lock_convert(lock->l_conn_export))
+		return 0;
+
+	if (!wanted || !bits || ldlm_is_cancel(lock))
+		return 0;
+
+	/* do not convert locks other than DOM for now */
+	if (!((bits | wanted) & MDS_INODELOCK_DOM))
+		return 0;
+
+	/* We may already have the remaining bits in some other lock, so a
+	 * convert would just leave us an extra lock for the same bits.
+	 * Check whether the client has another lock with the same bits and
+	 * the same or lower mode, and don't convert if so.
+	 */
+	switch (lock->l_req_mode) {
+	case LCK_PR:
+		mode = LCK_PR;
+		fallthrough;
+	case LCK_PW:
+		mode |= LCK_CR;
+		break;
+	case LCK_CW:
+		mode = LCK_CW;
+		fallthrough;
+	case LCK_CR:
+		mode |= LCK_CR;
+		break;
+	default:
+		/* do not convert other modes */
+		return 0;
+	}
+
+	/* is the lock too old to be converted? */
+	lock_res_and_lock(lock);
+	if (ktime_after(ktime_get(),
+			ktime_add(lock->l_last_used, ns->ns_dirty_age_limit))) {
+		unlock_res_and_lock(lock);
+		return 0;
+	}
+	unlock_res_and_lock(lock);
+
+	inode = ll_inode_from_resource_lock(lock);
+	ll_have_md_lock(inode, &bits, mode);
+	iput(inode);
+	return !!(bits);
+}
+
+int ll_md_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *ld,
+		       void *data, int flag)
+{
+	struct lustre_handle lockh;
+	int rc;
+
+	ENTRY;
+
+	switch (flag) {
+	case LDLM_CB_BLOCKING:
+	{
+		__u64 cancel_flags = LCF_ASYNC;
+
+		/* if a lock convert is not needed, we still have to
+		 * pass the lock via ldlm_cli_convert() to keep all
+		 * states correct; set cancel_bits to the full lock bits
+		 * to cause a full cancel to happen.
+		 */
+		if (!ll_md_need_convert(lock)) {
+			lock_res_and_lock(lock);
+			lock->l_policy_data.l_inodebits.cancel_bits =
+				lock->l_policy_data.l_inodebits.bits;
+			unlock_res_and_lock(lock);
+		}
+		rc = ldlm_cli_convert(lock, cancel_flags);
+		if (!rc)
+			RETURN(0);
+		/* continue with cancel otherwise */
+		ldlm_lock2handle(lock, &lockh);
+		rc = ldlm_cli_cancel(&lockh, cancel_flags);
+		if (rc < 0) {
+			CDEBUG(D_INODE, "ldlm_cli_cancel: rc = %d\n", rc);
+			RETURN(rc);
+		}
+		break;
+	}
+	case LDLM_CB_CANCELING:
+	{
+		__u64 to_cancel = lock->l_policy_data.l_inodebits.bits;
+
+		/* Nothing to do for non-granted locks */
+		if (!ldlm_is_granted(lock))
+			break;
+
+		/* If 'ld' is supplied then the bits to be cancelled are
+		 * passed implicitly by lock converting, and cancel_bits
+		 * from 'ld' should be used. Otherwise a full cancel is
+		 * being performed and the lock inodebits are used.
+		 *
+		 * Note: we cannot rely on cancel_bits in the lock itself
+		 * at this moment because they can be changed by a
+		 * concurrent thread, so ldlm_cli_inodebits_convert()
+		 * passes the cancel bits implicitly in the 'ld' parameter.
+		 */
+		if (ld) {
+			/* partial bits cancel allowed only during convert */
+			LASSERT(ldlm_is_converting(lock));
+			/* mask the cancel bits by the lock bits so that no
+			 * unused bits are passed to ll_lock_cancel_bits()
+			 */
+			to_cancel &= ld->l_policy_data.l_inodebits.cancel_bits;
+		}
+		ll_lock_cancel_bits(lock, to_cancel);
+		break;
+	}
+	default:
+		LBUG();
+	}
+
+	RETURN(0);
+}
+
+__u32 ll_i2suppgid(struct inode *i)
+{
+	if (in_group_p(i->i_gid))
+		return (__u32)from_kgid(&init_user_ns, i->i_gid);
+	else
+		return (__u32) __kgid_val(INVALID_GID);
+}
+
+/* Pack the required supplementary groups into the supplied groups array.
+ * If we don't need to use the groups from the target inode(s) then we
+ * instead pack one or more groups from the user's supplementary group
+ * array in case it might be useful. Not needed if doing an MDS-side upcall. */
+void ll_i2gids(__u32 *suppgids, struct inode *i1, struct inode *i2)
+{
+	LASSERT(i1 != NULL);
+	LASSERT(suppgids != NULL);
+
+	suppgids[0] = ll_i2suppgid(i1);
+
+	if (i2)
+		suppgids[1] = ll_i2suppgid(i2);
+	else
+		suppgids[1] = -1;
+}
+
+/*
+ * try to reuse three types of dentry:
+ * 1. unhashed alias, this one is unhashed by d_invalidate (but it may be valid
+ *    by concurrent .revalidate).
+ * 2. INVALID alias (common case for no valid ldlm lock held, but this flag may
+ *    be cleared by others calling d_lustre_revalidate).
+ * 3. DISCONNECTED alias.
+ */
+static struct dentry *ll_find_alias(struct inode *inode, struct dentry *dentry)
+{
+	struct dentry *alias, *discon_alias, *invalid_alias;
+
+	if (hlist_empty(&inode->i_dentry))
+		return NULL;
+
+	discon_alias = invalid_alias = NULL;
+
+	spin_lock(&inode->i_lock);
+	hlist_for_each_entry(alias, &inode->i_dentry, d_alias) {
+		LASSERT(alias != dentry);
+
+		spin_lock(&alias->d_lock);
+		if ((alias->d_flags & DCACHE_DISCONNECTED) &&
+		    S_ISDIR(inode->i_mode))
+			/* LASSERT(last_discon == NULL); LU-405, bz 20055 */
+			discon_alias = alias;
+		else if (alias->d_parent == dentry->d_parent &&
+			 alias->d_name.hash == dentry->d_name.hash &&
+			 alias->d_name.len == dentry->d_name.len &&
+			 memcmp(alias->d_name.name, dentry->d_name.name,
+				dentry->d_name.len) == 0)
+			invalid_alias = alias;
+		spin_unlock(&alias->d_lock);
+
+		if (invalid_alias)
+			break;
+	}
+	alias = invalid_alias ?: discon_alias ?: NULL;
+	if (alias) {
+		spin_lock(&alias->d_lock);
+		dget_dlock(alias);
+		spin_unlock(&alias->d_lock);
+	}
+	spin_unlock(&inode->i_lock);
+
+	return alias;
+}
+
+/*
+ * Similar to d_splice_alias(), but lustre treats an invalid alias
+ * similarly to DCACHE_DISCONNECTED, and tries to use it anyway.
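+ *
+ * Returns the reused alias (with a reference taken) if one is found,
+ * the original dentry after d_add() otherwise, or an ERR_PTR on
+ * allocation failure.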
+ */ +struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de) +{ + struct dentry *new; + + if (inode) { + new = ll_find_alias(inode, de); + if (new) { + if (!ll_d_setup(new, true)) + return ERR_PTR(-ENOMEM); + d_move(new, de); + iput(inode); + CDEBUG(D_DENTRY, + "Reuse dentry %p inode %p refc %d flags %#x\n", + new, new->d_inode, ll_d_count(new), new->d_flags); + return new; + } + } + if (!ll_d_setup(de, false)) + return ERR_PTR(-ENOMEM); + d_add(de, inode); + + /* this needs only to be done for foreign symlink dirs as + * DCACHE_SYMLINK_TYPE is already set by d_flags_for_inode() + * kernel routine for files with symlink ops (ie, real symlink) + */ + if (inode && S_ISDIR(inode->i_mode) && + ll_sbi_has_foreign_symlink(ll_i2sbi(inode)) && +#ifdef HAVE_IOP_GET_LINK + inode->i_op->get_link) { +#else + inode->i_op->follow_link) { +#endif + CDEBUG(D_INFO, "%s: inode "DFID": faking foreign dir as a symlink\n", + ll_i2sbi(inode)->ll_fsname, PFID(ll_inode2fid(inode))); + spin_lock(&de->d_lock); + /* like d_flags_for_inode() already does for files */ + de->d_flags = (de->d_flags & ~DCACHE_ENTRY_TYPE) | + DCACHE_SYMLINK_TYPE; + spin_unlock(&de->d_lock); + } + + CDEBUG(D_DENTRY, "Add dentry %p inode %p refc %d flags %#x\n", + de, de->d_inode, ll_d_count(de), de->d_flags); + return de; +} + +static int ll_lookup_it_finish(struct ptlrpc_request *request, + struct lookup_intent *it, + struct inode *parent, struct dentry **de, + void *secctx, __u32 secctxlen, + void *encctx, __u32 encctxlen, + ktime_t kstart, bool encrypt) +{ + struct inode *inode = NULL; + __u64 bits = 0; + int rc; + struct dentry *alias; + ENTRY; + + /* NB 1 request reference will be taken away by ll_intent_lock() + * when I return */ + CDEBUG(D_DENTRY, "it %p it_disposition %x\n", it, + it->it_disposition); + if (!it_disposition(it, DISP_LOOKUP_NEG)) { + struct req_capsule *pill = &request->rq_pill; + struct mdt_body *body = req_capsule_server_get(pill, + &RMF_MDT_BODY); + + rc = ll_prep_inode(&inode, &request->rq_pill, (*de)->d_sb, it); + if (rc) + RETURN(rc); + + /* If encryption context was returned by MDT, put it in + * inode now to save an extra getxattr and avoid deadlock. + */ + if (body->mbo_valid & OBD_MD_ENCCTX) { + encctx = req_capsule_server_get(pill, &RMF_FILE_ENCCTX); + encctxlen = req_capsule_get_size(pill, + &RMF_FILE_ENCCTX, + RCL_SERVER); + + if (encctxlen) { + CDEBUG(D_SEC, + "server returned encryption ctx for "DFID"\n", + PFID(ll_inode2fid(inode))); + rc = ll_xattr_cache_insert(inode, + xattr_for_enc(inode), + encctx, encctxlen); + if (rc) + CWARN("%s: cannot set enc ctx for "DFID": rc = %d\n", + ll_i2sbi(inode)->ll_fsname, + PFID(ll_inode2fid(inode)), rc); + else if (encrypt) { + rc = llcrypt_get_encryption_info(inode); + if (rc) + CDEBUG(D_SEC, + "cannot get enc info for "DFID": rc = %d\n", + PFID(ll_inode2fid(inode)), rc); + } + } + } + + ll_set_lock_data(ll_i2sbi(parent)->ll_md_exp, inode, it, &bits); + /* OPEN can return data if lock has DoM+LAYOUT bits set */ + if (it->it_op & IT_OPEN && + bits & MDS_INODELOCK_DOM && bits & MDS_INODELOCK_LAYOUT) + ll_dom_finish_open(inode, request); + + /* We used to query real size from OSTs here, but actually + * this is not needed. For stat() calls size would be updated + * from subsequent do_revalidate()->ll_inode_revalidate_it() in + * 2.4 and + * vfs_getattr_it->ll_getattr()->ll_inode_revalidate_it() in 2.6 + * Everybody else who needs correct file size would call + * ll_glimpse_size or some equivalent themselves anyway. + * Also see bug 7198. 
+ */
+
+ /* If security context was returned by MDT, put it in
+ * inode now to save an extra getxattr from security hooks,
+ * and avoid deadlock.
+ */
+ if (body->mbo_valid & OBD_MD_SECCTX) {
+ secctx = req_capsule_server_get(pill, &RMF_FILE_SECCTX);
+ secctxlen = req_capsule_get_size(pill,
+ &RMF_FILE_SECCTX,
+ RCL_SERVER);
+
+ if (secctxlen)
+ CDEBUG(D_SEC,
+ "server returned security context for "DFID"\n",
+ PFID(ll_inode2fid(inode)));
+ }
+
+ /* resume normally on error */
+ ll_inode_notifysecctx(inode, secctx, secctxlen);
+ }
+
+ /* Only hash *de if it is unhashed (new dentry).
+ * Atomic_open may pass in hashed dentries for open.
+ */
+ alias = ll_splice_alias(inode, *de);
+ if (IS_ERR(alias))
+ GOTO(out, rc = PTR_ERR(alias));
+
+ *de = alias;
+
+ if (!it_disposition(it, DISP_LOOKUP_NEG)) {
+ /* we have the lookup lock - unhide dentry */
+ if (bits & MDS_INODELOCK_LOOKUP) {
+ d_lustre_revalidate(*de);
+ ll_update_dir_depth(parent, (*de)->d_inode);
+ }
+
+ if (encrypt) {
+ rc = llcrypt_get_encryption_info(inode);
+ if (rc)
+ GOTO(out, rc);
+ }
+ } else if (!it_disposition(it, DISP_OPEN_CREATE)) {
+ /*
+ * If file was created on the server, the dentry is revalidated
+ * in ll_create_it if the lock allows for it.
+ */
+ /* Check that parent has UPDATE lock. */
+ struct lookup_intent parent_it = {
+ .it_op = IT_GETATTR,
+ .it_lock_handle = 0 };
+ struct lu_fid fid = ll_i2info(parent)->lli_fid;
+
+ /* If it is striped directory, get the real stripe parent */
+ if (unlikely(ll_dir_striped(parent))) {
+ rc = md_get_fid_from_lsm(ll_i2mdexp(parent),
+ ll_i2info(parent)->lli_lsm_md,
+ (*de)->d_name.name,
+ (*de)->d_name.len, &fid);
+ if (rc != 0)
+ GOTO(out, rc);
+ }
+
+ if (md_revalidate_lock(ll_i2mdexp(parent), &parent_it, &fid,
+ NULL)) {
+ d_lustre_revalidate(*de);
+ ll_intent_release(&parent_it);
+ }
+ }
+
+ if (it_disposition(it, DISP_OPEN_CREATE)) {
+ ll_stats_ops_tally(ll_i2sbi(parent), LPROC_LL_MKNOD,
+ ktime_us_delta(ktime_get(), kstart));
+ }
+
+ GOTO(out, rc = 0);
+
+out:
+ if (rc != 0 && it->it_op & IT_OPEN) {
+ ll_intent_drop_lock(it);
+ ll_open_cleanup((*de)->d_sb, &request->rq_pill);
+ }
+
+ return rc;
+}
+
+static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry,
+ struct lookup_intent *it,
+ void **secctx, __u32 *secctxlen,
+ int *secctxslot,
+ struct pcc_create_attach *pca,
+ bool encrypt,
+ void **encctx, __u32 *encctxlen)
+{
+ ktime_t kstart = ktime_get();
+ struct lookup_intent lookup_it = { .it_op = IT_LOOKUP };
+ struct dentry *save = dentry, *retval;
+ struct ptlrpc_request *req = NULL;
+ struct md_op_data *op_data = NULL;
+ struct lov_user_md *lum = NULL;
+ struct ll_sb_info *sbi = ll_i2sbi(parent);
+ __u32 opc;
+ int rc;
+ struct llcrypt_name fname;
+ struct lu_fid fid;
+ ENTRY;
+
+ if (dentry->d_name.len > sbi->ll_namelen)
+ RETURN(ERR_PTR(-ENAMETOOLONG));
+
+ CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir="DFID"(%p), intent=%s\n",
+ dentry, PFID(ll_inode2fid(parent)), parent, LL_IT2STR(it));
+
+ if (d_mountpoint(dentry))
+ CERROR("Tell Peter, lookup on mtpt, it %s\n", LL_IT2STR(it));
+
+ if (it == NULL || it->it_op == IT_GETXATTR)
+ it = &lookup_it;
+
+ if (it->it_op == IT_GETATTR && dentry_may_statahead(parent, dentry)) {
+ rc = ll_revalidate_statahead(parent, &dentry, 0);
+ if (rc == 1)
+ RETURN(dentry == save ?
NULL : dentry); + } + + if (it->it_op & IT_OPEN && it->it_flags & FMODE_WRITE && + dentry->d_sb->s_flags & SB_RDONLY) + RETURN(ERR_PTR(-EROFS)); + + if (it->it_op & IT_CREAT) + opc = LUSTRE_OPC_CREATE; + else + opc = LUSTRE_OPC_LOOKUP; + + /* Here we should be calling llcrypt_prepare_lookup(). But it installs a + * custom ->d_revalidate() method, so we lose ll_d_ops. + * To workaround this, call ll_setup_filename() and do the rest + * manually. Also make a copy of llcrypt_d_revalidate() (unfortunately + * not exported function) and call it from ll_revalidate_dentry(), to + * ensure we do not cache stale dentries after a key has been added. + */ + rc = ll_setup_filename(parent, &dentry->d_name, 1, &fname, &fid); + if ((!rc || rc == -ENOENT) && fname.is_ciphertext_name) { + spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_NOKEY_NAME; + spin_unlock(&dentry->d_lock); + } + if (rc == -ENOENT) + RETURN(NULL); + if (rc) + RETURN(ERR_PTR(rc)); + + op_data = ll_prep_md_op_data(NULL, parent, NULL, fname.disk_name.name, + fname.disk_name.len, 0, opc, NULL); + if (IS_ERR(op_data)) { + llcrypt_free_filename(&fname); + RETURN(ERR_CAST(op_data)); + } + if (!fid_is_zero(&fid)) { + op_data->op_fid2 = fid; + op_data->op_bias = MDS_FID_OP; + if (it->it_op & IT_OPEN) + it->it_flags |= MDS_OPEN_BY_FID; + } + + /* enforce umask if acl disabled or MDS doesn't support umask */ + if (!IS_POSIXACL(parent) || !exp_connect_umask(ll_i2mdexp(parent))) + it->it_create_mode &= ~current_umask(); + + if (it->it_op & IT_CREAT && + test_bit(LL_SBI_FILE_SECCTX, sbi->ll_flags)) { + rc = ll_dentry_init_security(dentry, it->it_create_mode, + &dentry->d_name, + &op_data->op_file_secctx_name, + &op_data->op_file_secctx_name_size, + &op_data->op_file_secctx, + &op_data->op_file_secctx_size, + &op_data->op_file_secctx_slot); + if (rc < 0) + GOTO(out, retval = ERR_PTR(rc)); + if (secctx != NULL) + *secctx = op_data->op_file_secctx; + if (secctxlen != NULL) + *secctxlen = op_data->op_file_secctx_size; + if (secctxslot != NULL) + *secctxslot = op_data->op_file_secctx_slot; + } else { + if (secctx != NULL) + *secctx = NULL; + if (secctxlen != NULL) + *secctxlen = 0; + if (secctxslot != NULL) + *secctxslot = 0; + } + if (it->it_op & IT_CREAT && encrypt) { + if (unlikely(filename_is_volatile(dentry->d_name.name, + dentry->d_name.len, NULL))) { + /* get encryption context from reference file */ + int ctx_size = LLCRYPT_ENC_CTX_SIZE; + struct lustre_sb_info *lsi; + struct file *ref_file; + struct inode *ref_inode; + void *ctx; + + rc = volatile_ref_file(dentry->d_name.name, + dentry->d_name.len, + &ref_file); + if (rc) + GOTO(out, retval = ERR_PTR(rc)); + + ref_inode = file_inode(ref_file); + if (!ref_inode) { + fput(ref_file); + GOTO(inherit, rc = -EINVAL); + } + + lsi = s2lsi(ref_inode->i_sb); + +getctx: + OBD_ALLOC(ctx, ctx_size); + if (!ctx) + GOTO(out, retval = ERR_PTR(-ENOMEM)); + +#ifdef CONFIG_LL_ENCRYPTION + rc = lsi->lsi_cop->get_context(ref_inode, + ctx, ctx_size); +#elif defined(HAVE_LUSTRE_CRYPTO) + rc = ref_inode->i_sb->s_cop->get_context(ref_inode, + ctx, ctx_size); +#else + rc = -ENODATA; +#endif + if (rc == -ERANGE) { + OBD_FREE(ctx, ctx_size); + ctx_size *= 2; + goto getctx; + } + fput(ref_file); + if (rc < 0) { + OBD_FREE(ctx, ctx_size); + GOTO(inherit, rc); + } + + op_data->op_file_encctx_size = rc; + if (rc == ctx_size) { + op_data->op_file_encctx = ctx; + } else { + OBD_ALLOC(op_data->op_file_encctx, + op_data->op_file_encctx_size); + if (!op_data->op_file_encctx) { + OBD_FREE(ctx, ctx_size); + GOTO(out, retval 
= ERR_PTR(-ENOMEM)); + } + memcpy(op_data->op_file_encctx, ctx, + op_data->op_file_encctx_size); + OBD_FREE(ctx, ctx_size); + } + } else { +inherit: + rc = llcrypt_inherit_context(parent, NULL, op_data, + false); + if (rc) + GOTO(out, retval = ERR_PTR(rc)); + } + if (encctx != NULL) + *encctx = op_data->op_file_encctx; + if (encctxlen != NULL) + *encctxlen = op_data->op_file_encctx_size; + } else { + if (encctx != NULL) + *encctx = NULL; + if (encctxlen != NULL) + *encctxlen = 0; + } + + /* ask for security context upon intent: + * get name of security xattr to request to server + */ + if (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_OPEN)) + op_data->op_file_secctx_name_size = + ll_secctx_name_get(sbi, &op_data->op_file_secctx_name); + + if (pca && pca->pca_dataset) { + OBD_ALLOC_PTR(lum); + if (lum == NULL) + GOTO(out, retval = ERR_PTR(-ENOMEM)); + + lum->lmm_magic = LOV_USER_MAGIC_V1; + lum->lmm_pattern = LOV_PATTERN_F_RELEASED | LOV_PATTERN_RAID0; + op_data->op_data = lum; + op_data->op_data_size = sizeof(*lum); + op_data->op_archive_id = pca->pca_dataset->pccd_rwid; + it->it_flags |= MDS_OPEN_PCC; + } + + rc = md_intent_lock(ll_i2mdexp(parent), op_data, it, &req, + &ll_md_blocking_ast, 0); + /* If the MDS allows the client to chgrp (CFS_SETGRP_PERM), but the + * client does not know which suppgid should be sent to the MDS, or + * some other(s) changed the target file's GID after this RPC sent + * to the MDS with the suppgid as the original GID, then we should + * try again with right suppgid. */ + if (rc == -EACCES && it->it_op & IT_OPEN && + it_disposition(it, DISP_OPEN_DENY)) { + struct mdt_body *body; + + LASSERT(req != NULL); + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (op_data->op_suppgids[0] == body->mbo_gid || + op_data->op_suppgids[1] == body->mbo_gid || + !in_group_p(make_kgid(&init_user_ns, body->mbo_gid))) + GOTO(out, retval = ERR_PTR(-EACCES)); + + fid_zero(&op_data->op_fid2); + op_data->op_suppgids[1] = body->mbo_gid; + ptlrpc_req_finished(req); + req = NULL; + ll_intent_release(it); + rc = md_intent_lock(ll_i2mdexp(parent), op_data, it, &req, + &ll_md_blocking_ast, 0); + } + + if (rc < 0) + GOTO(out, retval = ERR_PTR(rc)); + + if (pca && pca->pca_dataset) { + rc = pcc_inode_create(parent->i_sb, pca->pca_dataset, + &op_data->op_fid2, + &pca->pca_dentry); + if (rc) + GOTO(out, retval = ERR_PTR(rc)); + } + + /* dir layout may change */ + ll_unlock_md_op_lsm(op_data); + rc = ll_lookup_it_finish(req, it, parent, &dentry, + secctx != NULL ? *secctx : NULL, + secctxlen != NULL ? *secctxlen : 0, + encctx != NULL ? *encctx : NULL, + encctxlen != NULL ? *encctxlen : 0, + kstart, encrypt); + if (rc != 0) { + ll_intent_release(it); + GOTO(out, retval = ERR_PTR(rc)); + } + + if ((it->it_op & IT_OPEN) && dentry->d_inode && + !S_ISREG(dentry->d_inode->i_mode) && + !S_ISDIR(dentry->d_inode->i_mode)) { + ll_release_openhandle(dentry, it); + } + ll_lookup_finish_locks(it, dentry); + + GOTO(out, retval = (dentry == save) ? 
NULL : dentry);
+
+out:
+ if (op_data != NULL && !IS_ERR(op_data)) {
+ if (secctx != NULL && secctxlen != NULL) {
+ /* caller needs sec ctx info, so reset it in op_data to
+ * prevent it from being freed */
+ op_data->op_file_secctx = NULL;
+ op_data->op_file_secctx_size = 0;
+ }
+ if (encctx != NULL && encctxlen != NULL &&
+ it->it_op & IT_CREAT && encrypt) {
+ /* caller needs enc ctx info, so reset it in op_data to
+ * prevent it from being freed
+ */
+ op_data->op_file_encctx = NULL;
+ op_data->op_file_encctx_size = 0;
+ }
+ llcrypt_free_filename(&fname);
+ ll_finish_md_op_data(op_data);
+ }
+
+ if (lum != NULL)
+ OBD_FREE_PTR(lum);
+
+ ptlrpc_req_finished(req);
+ return retval;
+}
+
+static struct dentry *ll_lookup_nd(struct inode *parent, struct dentry *dentry,
+ unsigned int flags)
+{
+ struct lookup_intent *itp, it = { .it_op = IT_GETATTR };
+ struct dentry *de;
+
+ CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir="DFID"(%p), flags=%u\n",
+ dentry, PFID(ll_inode2fid(parent)), parent, flags);
+
+ /*
+ * Optimize away (CREATE && !OPEN). Let .create handle the race,
+ * but only if we have write permission there; otherwise we need
+ * to proceed with lookup. LU-4185
+ */
+ if ((flags & LOOKUP_CREATE) && !(flags & LOOKUP_OPEN) &&
+ (inode_permission(&init_user_ns,
+ parent, MAY_WRITE | MAY_EXEC) == 0))
+ return NULL;
+
+ if (flags & (LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE))
+ itp = NULL;
+ else
+ itp = &it;
+ de = ll_lookup_it(parent, dentry, itp, NULL, NULL, NULL, NULL, false,
+ NULL, NULL);
+
+ if (itp != NULL)
+ ll_intent_release(itp);
+
+ return de;
+}
+
+#ifdef FMODE_CREATED /* added in Linux v4.18-rc1-20-g73a09dd */
+# define ll_is_opened(o, f) ((f)->f_mode & FMODE_OPENED)
+# define ll_finish_open(f, d, o) finish_open((f), (d), NULL)
+# define ll_last_arg
+# define ll_set_created(o, f) \
+do { \
+ (f)->f_mode |= FMODE_CREATED; \
+} while (0)
+
+#else
+# define ll_is_opened(o, f) (*(o))
+# define ll_finish_open(f, d, o) finish_open((f), (d), NULL, (o))
+# define ll_last_arg , int *opened
+# define ll_set_created(o, f) \
+do { \
+ *(o) |= FILE_CREATED; \
+} while (0)
+
+#endif
+
+/*
+ * For cached negative dentry and new dentry, handle lookup/create/open
+ * together.
+ */
+static int ll_atomic_open(struct inode *dir, struct dentry *dentry,
+ struct file *file, unsigned open_flags,
+ umode_t mode ll_last_arg)
+{
+ struct lookup_intent *it;
+ struct dentry *de;
+ long long lookup_flags = LOOKUP_OPEN;
+ void *secctx = NULL;
+ __u32 secctxlen = 0;
+ int secctxslot = 0;
+ void *encctx = NULL;
+ __u32 encctxlen = 0;
+ struct ll_sb_info *sbi = NULL;
+ struct pcc_create_attach pca = { NULL, NULL };
+ bool encrypt = false;
+ int rc = 0;
+ ENTRY;
+
+ CDEBUG(D_VFSTRACE,
+ "VFS Op:name=%pd, dir="DFID"(%p), file %p, open_flags %x, mode %x opened %d\n",
+ dentry, PFID(ll_inode2fid(dir)), dir, file, open_flags, mode,
+ ll_is_opened(opened, file));
+
+ /* Only negative dentries enter here */
+ LASSERT(dentry->d_inode == NULL);
+
+ if (!d_unhashed(dentry)) {
+ /* A valid negative dentry that just passed revalidation,
+ * there's little point to try and open it server-side,
+ * even though there's a minuscule chance it might succeed.
+ * Either way it's a valid race to just return -ENOENT here.
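+ * (The dentry was valid an instant ago, so -ENOENT here is
+ * indistinguishable from this open racing with a remote unlink.)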
+ */
+ if (!(open_flags & O_CREAT))
+ return -ENOENT;
+
+ /* Otherwise we just unhash it to be rehashed afresh via
+ * lookup if necessary
+ */
+ d_drop(dentry);
+ }
+
+ OBD_ALLOC(it, sizeof(*it));
+ if (!it)
+ RETURN(-ENOMEM);
+
+ it->it_op = IT_OPEN;
+ if (open_flags & O_CREAT) {
+ it->it_op |= IT_CREAT;
+ lookup_flags |= LOOKUP_CREATE;
+ sbi = ll_i2sbi(dir);
+ /* Volatile file is used for HSM restore, so do not use PCC */
+ if (!filename_is_volatile(dentry->d_name.name,
+ dentry->d_name.len, NULL)) {
+ struct pcc_matcher item;
+ struct pcc_dataset *dataset;
+
+ item.pm_uid = from_kuid(&init_user_ns, current_uid());
+ item.pm_gid = from_kgid(&init_user_ns, current_gid());
+ item.pm_projid = ll_i2info(dir)->lli_projid;
+ item.pm_name = &dentry->d_name;
+ dataset = pcc_dataset_match_get(&sbi->ll_pcc_super,
+ &item);
+ pca.pca_dataset = dataset;
+ }
+ }
+ it->it_create_mode = (mode & S_IALLUGO) | S_IFREG;
+ it->it_flags = (open_flags & ~O_ACCMODE) | OPEN_FMODE(open_flags);
+ it->it_flags &= ~MDS_OPEN_FL_INTERNAL;
+
+ if (ll_sbi_has_encrypt(ll_i2sbi(dir)) && IS_ENCRYPTED(dir)) {
+ /* in case of create, this is going to be a regular file because
+ * we set S_IFREG bit on it->it_create_mode above
+ */
+ rc = llcrypt_get_encryption_info(dir);
+ if (rc)
+ GOTO(out_release, rc);
+ encrypt = true;
+ if (open_flags & O_CREAT) {
+ /* For migration or mirroring without enc key, we still
+ * need to be able to create a volatile file.
+ */
+ if (!llcrypt_has_encryption_key(dir) &&
+ (!filename_is_volatile(dentry->d_name.name,
+ dentry->d_name.len, NULL) ||
+ (open_flags & O_FILE_ENC) != O_FILE_ENC ||
+ !(open_flags & O_DIRECT)))
+ GOTO(out_release, rc = -ENOKEY);
+ }
+ }
+
+ OBD_FAIL_TIMEOUT(OBD_FAIL_LLITE_CREATE_FILE_PAUSE2, cfs_fail_val);
+
+ /* We can only arrive at this path when we have no inode, so
+ * we only need to request open lock if it was requested
+ * for every open
+ */
+ if (ll_i2sbi(dir)->ll_oc_thrsh_count == 1 &&
+ exp_connect_flags2(ll_i2mdexp(dir)) &
+ OBD_CONNECT2_ATOMIC_OPEN_LOCK)
+ it->it_flags |= MDS_OPEN_LOCK;
+
+ /* Dentry added to dcache tree in ll_lookup_it */
+ de = ll_lookup_it(dir, dentry, it, &secctx, &secctxlen, &secctxslot,
+ &pca, encrypt, &encctx, &encctxlen);
+ if (IS_ERR(de))
+ rc = PTR_ERR(de);
+ else if (de != NULL)
+ dentry = de;
+
+ CFS_FAIL_TIMEOUT(OBD_FAIL_LLITE_CREATE_FILE_PAUSE, cfs_fail_val);
+
+ if (!rc) {
+ if (it_disposition(it, DISP_OPEN_CREATE)) {
+ /* Dentry instantiated in ll_create_it. */
+ rc = ll_create_it(dir, dentry, it, secctx, secctxlen,
+ encrypt, encctx, encctxlen,
+ open_flags);
+ ll_security_release_secctx(secctx, secctxlen,
+ secctxslot);
+ llcrypt_free_ctx(encctx, encctxlen);
+ if (rc) {
+ /* We dget in ll_splice_alias. */
+ if (de != NULL)
+ dput(de);
+ goto out_release;
+ }
+
+ rc = pcc_inode_create_fini(dentry->d_inode, &pca);
+ if (rc) {
+ if (de != NULL)
+ dput(de);
+ GOTO(out_release, rc);
+ }
+
+ ll_set_created(opened, file);
+ } else {
+ /* Open the file with O_CREAT, but the file already
+ * existed on MDT. This may happen in the case that
+ * the LOOKUP ibits lock is revoked and the
+ * corresponding dentry cache is deleted.
+ * i.e. In the current Lustre, the truncate operation
+ * will revoke the LOOKUP ibits lock, and the file
+ * dentry cache will be invalidated. The following open
+ * with O_CREAT flag will call into ->atomic_open, the
+ * file is wrongly treated as newly created and the
+ * client tries to auto cache it.
So after client knows it + * is not a DISP_OPEN_CREATE, it should cleanup the + * already created PCC copy. + */ + pcc_create_attach_cleanup(dir->i_sb, &pca); + + if (open_flags & O_CREAT && encrypt && + dentry->d_inode) { + rc = ll_set_encflags(dentry->d_inode, encctx, + encctxlen, true); + llcrypt_free_ctx(encctx, encctxlen); + if (rc) + GOTO(out_release, rc); + } + } + + /* check also if a foreign file is openable */ + if (dentry->d_inode && it_disposition(it, DISP_OPEN_OPEN) && + ll_foreign_is_openable(dentry, open_flags)) { + /* Open dentry. */ + if (S_ISFIFO(dentry->d_inode->i_mode)) { + /* We cannot call open here as it might + * deadlock. This case is unreachable in + * practice because of OBD_CONNECT_NODEVOH. */ + rc = finish_no_open(file, de); + } else { + file->private_data = it; + rc = ll_finish_open(file, dentry, opened); + /* We dget in ll_splice_alias. finish_open takes + * care of dget for fd open. + */ + if (de != NULL) + dput(de); + } + } else { + rc = finish_no_open(file, de); + } + } else { + pcc_create_attach_cleanup(dir->i_sb, &pca); + } + +out_release: + ll_intent_release(it); + OBD_FREE(it, sizeof(*it)); + + RETURN(rc); +} + +/* We depend on "mode" being set with the proper file type/umask by now */ +static struct inode *ll_create_node(struct inode *dir, struct lookup_intent *it) +{ + struct inode *inode = NULL; + struct ptlrpc_request *request = NULL; + struct ll_sb_info *sbi = ll_i2sbi(dir); + int rc; + ENTRY; + + LASSERT(it && it->it_disposition); + + LASSERT(it_disposition(it, DISP_ENQ_CREATE_REF)); + request = it->it_request; + it_clear_disposition(it, DISP_ENQ_CREATE_REF); + rc = ll_prep_inode(&inode, &request->rq_pill, dir->i_sb, it); + if (rc) + GOTO(out, inode = ERR_PTR(rc)); + + /* Pause to allow for a race with concurrent access by fid */ + OBD_FAIL_TIMEOUT(OBD_FAIL_LLITE_CREATE_NODE_PAUSE, cfs_fail_val); + + /* We asked for a lock on the directory, but were granted a + * lock on the inode. Since we finally have an inode pointer, + * stuff it in the lock. */ + CDEBUG(D_DLMTRACE, "setting l_ast_data to inode "DFID"(%p)\n", + PFID(ll_inode2fid(inode)), inode); + ll_set_lock_data(sbi->ll_md_exp, inode, it, NULL); + EXIT; + out: + ptlrpc_req_finished(request); + return inode; +} + +/* + * By the time this is called, we already have created the directory cache + * entry for the new file, but it is so far negative - it has no inode. + * + * We defer creating the OBD object(s) until open, to keep the intent and + * non-intent code paths similar, and also because we do not have the MDS + * inode number before calling ll_create_node() (which is needed for LOV), + * so we would need to do yet another RPC to the MDS to store the LOV EA + * data on the MDS. If needed, we would pass the PACKED lmm as data and + * lmm_size in datalen (the MDS still has code which will handle that). + * + * If the create succeeds, we fill in the inode information + * with d_instantiate(). 
+ */ +static int ll_create_it(struct inode *dir, struct dentry *dentry, + struct lookup_intent *it, + void *secctx, __u32 secctxlen, bool encrypt, + void *encctx, __u32 encctxlen, unsigned int open_flags) +{ + struct inode *inode; + __u64 bits = 0; + int rc = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir="DFID"(%p), intent=%s\n", + dentry, PFID(ll_inode2fid(dir)), dir, LL_IT2STR(it)); + + rc = it_open_error(DISP_OPEN_CREATE, it); + if (rc) + RETURN(rc); + + inode = ll_create_node(dir, it); + if (IS_ERR(inode)) + RETURN(PTR_ERR(inode)); + + /* must be done before d_instantiate, because it calls + * security_d_instantiate, which means a getxattr if security + * context is not set yet + */ + rc = ll_inode_notifysecctx(inode, secctx, secctxlen); + if (rc) + RETURN(rc); + + d_instantiate(dentry, inode); + + if (encrypt) { + bool preload = true; + + /* For migration or mirroring without enc key, we + * create a volatile file without enc context. + */ + if (!llcrypt_has_encryption_key(dir) && + filename_is_volatile(dentry->d_name.name, + dentry->d_name.len, NULL) && + (open_flags & O_FILE_ENC) == O_FILE_ENC && + open_flags & O_DIRECT) + preload = false; + rc = ll_set_encflags(inode, encctx, encctxlen, preload); + if (rc) + RETURN(rc); + } + + if (!test_bit(LL_SBI_FILE_SECCTX, ll_i2sbi(inode)->ll_flags)) { + rc = ll_inode_init_security(dentry, inode, dir); + if (rc) + RETURN(rc); + } + + ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, inode, it, &bits); + if (bits & MDS_INODELOCK_LOOKUP) { + d_lustre_revalidate(dentry); + ll_update_dir_depth(dir, inode); + } + + RETURN(0); +} + +void ll_update_times(struct ptlrpc_request *request, struct inode *inode) +{ + struct mdt_body *body = req_capsule_server_get(&request->rq_pill, + &RMF_MDT_BODY); + + LASSERT(body); + if (body->mbo_valid & OBD_MD_FLMTIME && + body->mbo_mtime > inode->i_mtime.tv_sec) { + CDEBUG(D_INODE, + "setting fid " DFID " mtime from %lld to %llu\n", + PFID(ll_inode2fid(inode)), + (s64)inode->i_mtime.tv_sec, body->mbo_mtime); + inode->i_mtime.tv_sec = body->mbo_mtime; + } + + if (body->mbo_valid & OBD_MD_FLCTIME && + body->mbo_ctime > inode->i_ctime.tv_sec) + inode->i_ctime.tv_sec = body->mbo_ctime; +} + +/* once default LMV (space balanced) is set on ROOT, it should take effect if + * default LMV is not set on parent directory. + */ +static void ll_qos_mkdir_prep(struct md_op_data *op_data, struct inode *dir) +{ + struct inode *root = dir->i_sb->s_root->d_inode; + struct ll_inode_info *rlli = ll_i2info(root); + struct ll_inode_info *lli = ll_i2info(dir); + struct lmv_stripe_md *lsm; + unsigned short depth; + + op_data->op_dir_depth = lli->lli_inherit_depth ?: lli->lli_dir_depth; + depth = lli->lli_dir_depth; + + /* parent directory is striped */ + if (unlikely(lli->lli_lsm_md)) + return; + + /* default LMV set on parent directory */ + if (unlikely(lli->lli_default_lsm_md)) + return; + + /* parent is ROOT */ + if (unlikely(dir == root)) + return; + + /* default LMV not set on ROOT */ + if (!rlli->lli_default_lsm_md) + return; + + down_read(&rlli->lli_lsm_sem); + lsm = rlli->lli_default_lsm_md; + if (!lsm) + goto unlock; + + /* not space balanced */ + if (lsm->lsm_md_master_mdt_index != LMV_OFFSET_DEFAULT) + goto unlock; + + /** + * Check if the fs default is to be applied. + * depth == 0 means 'not inited' for not root dir. 
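+ * For example, assuming lsm_md_max_inherit == 3: a mkdir under a
+ * parent at depth 2 still inherits the space-balanced default
+ * (3 > 2), while a mkdir under a parent at depth 3 does not.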
+ */ + if (lsm->lsm_md_max_inherit != LMV_INHERIT_NONE && + (lsm->lsm_md_max_inherit == LMV_INHERIT_UNLIMITED || + (depth && lsm->lsm_md_max_inherit > depth))) { + op_data->op_flags |= MF_QOS_MKDIR; + if (lsm->lsm_md_max_inherit_rr != LMV_INHERIT_RR_NONE && + (lsm->lsm_md_max_inherit_rr == LMV_INHERIT_RR_UNLIMITED || + (depth && lsm->lsm_md_max_inherit_rr > depth))) + op_data->op_flags |= MF_RR_MKDIR; + CDEBUG(D_INODE, DFID" requests qos mkdir %#x\n", + PFID(&lli->lli_fid), op_data->op_flags); + } +unlock: + up_read(&rlli->lli_lsm_sem); +} + +static int ll_new_node(struct inode *dir, struct dentry *dchild, + const char *tgt, umode_t mode, __u64 rdev, __u32 opc) +{ + struct qstr *name = &dchild->d_name; + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data = NULL; + struct inode *inode = NULL; + struct ll_sb_info *sbi = ll_i2sbi(dir); + struct llcrypt_str *disk_link = NULL; + bool encrypt = false; + int err; + + ENTRY; + if (unlikely(tgt != NULL)) { + disk_link = (struct llcrypt_str *)rdev; + rdev = 0; + if (!disk_link) + RETURN(-EINVAL); + } + +again: + op_data = ll_prep_md_op_data(NULL, dir, NULL, name->name, + name->len, 0, opc, NULL); + if (IS_ERR(op_data)) + GOTO(err_exit, err = PTR_ERR(op_data)); + + if (S_ISDIR(mode)) + ll_qos_mkdir_prep(op_data, dir); + + if (test_bit(LL_SBI_FILE_SECCTX, sbi->ll_flags)) { + err = ll_dentry_init_security(dchild, mode, &dchild->d_name, + &op_data->op_file_secctx_name, + &op_data->op_file_secctx_name_size, + &op_data->op_file_secctx, + &op_data->op_file_secctx_size, + &op_data->op_file_secctx_slot); + if (err < 0) + GOTO(err_exit, err); + } + + if (ll_sbi_has_encrypt(sbi) && + ((IS_ENCRYPTED(dir) && + (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) || + (unlikely(ll_sb_has_test_dummy_encryption(dir->i_sb)) && + S_ISDIR(mode)))) { + err = llcrypt_get_encryption_info(dir); + if (err) + GOTO(err_exit, err); + if (!llcrypt_has_encryption_key(dir)) + GOTO(err_exit, err = -ENOKEY); + encrypt = true; + } + + if (encrypt) { + err = llcrypt_inherit_context(dir, NULL, op_data, false); + if (err) + GOTO(err_exit, err); + + if (S_ISLNK(mode)) { + /* llcrypt needs inode to encrypt target name, so create + * a fake inode and associate encryption context got + * from llcrypt_inherit_context. + */ + struct inode *fakeinode = + dchild->d_sb->s_op->alloc_inode(dchild->d_sb); + + if (!fakeinode) + GOTO(err_exit, err = -ENOMEM); + fakeinode->i_sb = dchild->d_sb; + fakeinode->i_mode |= S_IFLNK; +#ifdef IOP_XATTR + fakeinode->i_opflags |= IOP_XATTR; +#endif + ll_lli_init(ll_i2info(fakeinode)); + err = ll_set_encflags(fakeinode, + op_data->op_file_encctx, + op_data->op_file_encctx_size, + true); + if (!err) + err = __llcrypt_encrypt_symlink(fakeinode, tgt, + strlen(tgt), + disk_link); + + ll_xattr_cache_destroy(fakeinode); + llcrypt_put_encryption_info(fakeinode); + dchild->d_sb->s_op->destroy_inode(fakeinode); + if (err) + GOTO(err_exit, err); + } + } + + err = md_create(sbi->ll_md_exp, op_data, tgt ? disk_link->name : NULL, + tgt ? disk_link->len : 0, mode, + from_kuid(&init_user_ns, current_fsuid()), + from_kgid(&init_user_ns, current_fsgid()), + current_cap(), rdev, &request); +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 15, 58, 0) + /* + * server < 2.12.58 doesn't pack default LMV in intent_getattr reply, + * fetch default LMV here. 
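+ * The -EREMOTE fallback below fetches the default LMV via
+ * ll_dir_getstripe(), applies it with ll_update_inode(), and then
+ * retries the whole create from the "again" label.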
+ */ + if (unlikely(err == -EREMOTE)) { + struct ll_inode_info *lli = ll_i2info(dir); + struct lmv_user_md *lum; + int lumsize; + int err2; + + ptlrpc_req_finished(request); + request = NULL; + ll_finish_md_op_data(op_data); + op_data = NULL; + + err2 = ll_dir_getstripe(dir, (void **)&lum, &lumsize, &request, + OBD_MD_DEFAULT_MEA); + if (err2 == 0) { + struct lustre_md md = { NULL }; + + md.body = req_capsule_server_get(&request->rq_pill, + &RMF_MDT_BODY); + if (!md.body) + GOTO(err_exit, err = -EPROTO); + + OBD_ALLOC_PTR(md.default_lmv); + if (!md.default_lmv) + GOTO(err_exit, err = -ENOMEM); + + md.default_lmv->lsm_md_magic = lum->lum_magic; + md.default_lmv->lsm_md_stripe_count = + lum->lum_stripe_count; + md.default_lmv->lsm_md_master_mdt_index = + lum->lum_stripe_offset; + md.default_lmv->lsm_md_hash_type = lum->lum_hash_type; + md.default_lmv->lsm_md_max_inherit = + lum->lum_max_inherit; + md.default_lmv->lsm_md_max_inherit_rr = + lum->lum_max_inherit_rr; + + err = ll_update_inode(dir, &md); + md_free_lustre_md(sbi->ll_md_exp, &md); + if (err) + GOTO(err_exit, err); + } else if (err2 == -ENODATA && lli->lli_default_lsm_md) { + /* + * If there are no default stripe EA on the MDT, but the + * client has default stripe, then it probably means + * default stripe EA has just been deleted. + */ + down_write(&lli->lli_lsm_sem); + if (lli->lli_default_lsm_md) + OBD_FREE_PTR(lli->lli_default_lsm_md); + lli->lli_default_lsm_md = NULL; + up_write(&lli->lli_lsm_sem); + } else { + GOTO(err_exit, err); + } + + ptlrpc_req_finished(request); + request = NULL; + goto again; + } +#endif + + if (err < 0) + GOTO(err_exit, err); + + ll_update_times(request, dir); + + CFS_FAIL_TIMEOUT(OBD_FAIL_LLITE_NEWNODE_PAUSE, cfs_fail_val); + + err = ll_prep_inode(&inode, &request->rq_pill, dchild->d_sb, NULL); + if (err) + GOTO(err_exit, err); + + /* must be done before d_instantiate, because it calls + * security_d_instantiate, which means a getxattr if security + * context is not set yet + */ + err = ll_inode_notifysecctx(inode, + op_data->op_file_secctx, + op_data->op_file_secctx_size); + if (err) + GOTO(err_exit, err); + + d_instantiate(dchild, inode); + + if (encrypt) { + err = ll_set_encflags(inode, op_data->op_file_encctx, + op_data->op_file_encctx_size, true); + if (err) + GOTO(err_exit, err); + + if (S_ISLNK(mode)) { + struct ll_inode_info *lli = ll_i2info(inode); + + /* Cache the plaintext symlink target + * for later use by get_link() + */ + OBD_ALLOC(lli->lli_symlink_name, strlen(tgt) + 1); + /* do not return an error if we cannot + * cache the symlink locally + */ + if (lli->lli_symlink_name) + memcpy(lli->lli_symlink_name, + tgt, strlen(tgt) + 1); + } + } + + if (!test_bit(LL_SBI_FILE_SECCTX, sbi->ll_flags)) { + err = ll_inode_init_security(dchild, inode, dir); + if (err) + GOTO(err_exit, err); + } + + EXIT; +err_exit: + if (request != NULL) + ptlrpc_req_finished(request); + + if (!IS_ERR_OR_NULL(op_data)) + ll_finish_md_op_data(op_data); + + RETURN(err); +} + +static int ll_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dchild, umode_t mode, dev_t rdev) +{ + ktime_t kstart = ktime_get(); + int err; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir="DFID"(%p) mode %o dev %x\n", + dchild, PFID(ll_inode2fid(dir)), dir, mode, rdev); + + if (!IS_POSIXACL(dir) || !exp_connect_umask(ll_i2mdexp(dir))) + mode &= ~current_umask(); + + switch (mode & S_IFMT) { + case 0: + mode |= S_IFREG; + fallthrough; + case S_IFREG: + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case 
S_IFSOCK: + err = ll_new_node(dir, dchild, NULL, mode, old_encode_dev(rdev), + LUSTRE_OPC_MKNOD); + break; + case S_IFDIR: + err = -EPERM; + break; + default: + err = -EINVAL; + } + + if (!err) + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_MKNOD, + ktime_us_delta(ktime_get(), kstart)); + + RETURN(err); +} + +/* + * Plain create. Intent create is handled in atomic_open. + */ +static int ll_create_nd(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, + umode_t mode, bool want_excl) +{ + ktime_t kstart = ktime_get(); + int rc; + + CFS_FAIL_TIMEOUT(OBD_FAIL_LLITE_CREATE_FILE_PAUSE, cfs_fail_val); + + CDEBUG(D_VFSTRACE, + "VFS Op:name=%pd, dir="DFID"(%p), flags=%u, excl=%d\n", + dentry, PFID(ll_inode2fid(dir)), dir, mode, want_excl); + + /* Using mknod(2) to create a regular file is designed to not recognize + * volatile file name, so we use ll_mknod() here. */ + rc = ll_mknod(mnt_userns, dir, dentry, mode, 0); + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, unhashed %d\n", + dentry, d_unhashed(dentry)); + + if (!rc) + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_CREATE, + ktime_us_delta(ktime_get(), kstart)); + + return rc; +} + +static int ll_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dchild, const char *oldpath) +{ + ktime_t kstart = ktime_get(); + int len = strlen(oldpath); + struct llcrypt_str disk_link; + int err; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir="DFID"(%p), target=%.*s\n", + dchild, PFID(ll_inode2fid(dir)), dir, 3000, oldpath); + + err = llcrypt_prepare_symlink(dir, oldpath, len, dir->i_sb->s_blocksize, + &disk_link); + if (err) + RETURN(err); + + err = ll_new_node(dir, dchild, oldpath, S_IFLNK | S_IRWXUGO, + (__u64)&disk_link, LUSTRE_OPC_SYMLINK); + + if (disk_link.name != (unsigned char *)oldpath) + kfree(disk_link.name); + + if (!err) + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_SYMLINK, + ktime_us_delta(ktime_get(), kstart)); + + RETURN(err); +} + +static int ll_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry) +{ + struct inode *src = old_dentry->d_inode; + struct qstr *name = &new_dentry->d_name; + struct ll_sb_info *sbi = ll_i2sbi(dir); + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + ktime_t kstart = ktime_get(); + int err; + + ENTRY; + CDEBUG(D_VFSTRACE, + "VFS Op: inode="DFID"(%p), dir="DFID"(%p), target=%pd\n", + PFID(ll_inode2fid(src)), src, + PFID(ll_inode2fid(dir)), dir, new_dentry); + + err = llcrypt_prepare_link(old_dentry, dir, new_dentry); + if (err) + RETURN(err); + + op_data = ll_prep_md_op_data(NULL, src, dir, name->name, name->len, + 0, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + err = md_link(sbi->ll_md_exp, op_data, &request); + ll_finish_md_op_data(op_data); + if (err) + GOTO(out, err); + + ll_update_times(request, dir); + ll_stats_ops_tally(sbi, LPROC_LL_LINK, + ktime_us_delta(ktime_get(), kstart)); + EXIT; +out: + ptlrpc_req_finished(request); + RETURN(err); +} + +static int ll_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dchild, umode_t mode) +{ + ktime_t kstart = ktime_get(); + int err; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir="DFID"(%p)\n", + dchild, PFID(ll_inode2fid(dir)), dir); + + if (!IS_POSIXACL(dir) || !exp_connect_umask(ll_i2mdexp(dir))) + mode &= ~current_umask(); + + mode = (mode & (S_IRWXUGO|S_ISVTX)) | S_IFDIR; + + err = ll_new_node(dir, dchild, NULL, mode, 0, LUSTRE_OPC_MKDIR); + if (err == 0) + ll_stats_ops_tally(ll_i2sbi(dir), 
LPROC_LL_MKDIR, + ktime_us_delta(ktime_get(), kstart)); + + RETURN(err); +} + +static int ll_rmdir(struct inode *dir, struct dentry *dchild) +{ + struct qstr *name = &dchild->d_name; + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + ktime_t kstart = ktime_get(); + int rc; + + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir="DFID"(%p)\n", + dchild, PFID(ll_inode2fid(dir)), dir); + + if (unlikely(d_mountpoint(dchild))) + RETURN(-EBUSY); + + /* some foreign dir may not be allowed to be removed */ + if (!ll_foreign_is_removable(dchild, false)) + RETURN(-EPERM); + + op_data = ll_prep_md_op_data(NULL, dir, NULL, name->name, name->len, + S_IFDIR, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + if (dchild->d_inode != NULL) + op_data->op_fid3 = *ll_inode2fid(dchild->d_inode); + + if (fid_is_zero(&op_data->op_fid2)) + op_data->op_fid2 = op_data->op_fid3; + rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request); + ll_finish_md_op_data(op_data); + if (!rc) { + struct mdt_body *body; + + ll_update_times(request, dir); + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_RMDIR, + ktime_us_delta(ktime_get(), kstart)); + + /* + * The server puts attributes in on the last unlink, use them + * to update the link count so the inode can be freed + * immediately. + */ + body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY); + if (body->mbo_valid & OBD_MD_FLNLINK) { + spin_lock(&dchild->d_inode->i_lock); + set_nlink(dchild->d_inode, body->mbo_nlink); + spin_unlock(&dchild->d_inode->i_lock); + } + } + + ptlrpc_req_finished(request); + + RETURN(rc); +} + +/** + * Remove dir entry + **/ +int ll_rmdir_entry(struct inode *dir, char *name, int namelen) +{ + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + ktime_t kstart = ktime_get(); + int rc; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p)\n", + namelen, name, PFID(ll_inode2fid(dir)), dir); + + op_data = ll_prep_md_op_data(NULL, dir, NULL, name, strlen(name), + S_IFDIR, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + op_data->op_cli_flags |= CLI_RM_ENTRY; + rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request); + ll_finish_md_op_data(op_data); + if (!rc) + ll_update_times(request, dir); + + ptlrpc_req_finished(request); + if (!rc) + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_RMDIR, + ktime_us_delta(ktime_get(), kstart)); + RETURN(rc); +} + +static int ll_unlink(struct inode *dir, struct dentry *dchild) +{ + struct qstr *name = &dchild->d_name; + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + struct mdt_body *body; + ktime_t kstart = ktime_get(); + int rc; + + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir="DFID"(%p)\n", + dchild, PFID(ll_inode2fid(dir)), dir); + + /* + * XXX: unlink bind mountpoint maybe call to here, + * just check it as vfs_unlink does. 
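+ * (The d_mountpoint() check below returns -EBUSY in that case.)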
+ */ + if (unlikely(d_mountpoint(dchild))) + RETURN(-EBUSY); + + /* some foreign file/dir may not be allowed to be unlinked */ + if (!ll_foreign_is_removable(dchild, false)) + RETURN(-EPERM); + + op_data = ll_prep_md_op_data(NULL, dir, NULL, name->name, name->len, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_fid3 = *ll_inode2fid(dchild->d_inode); + /* notify lower layer if inode has dirty pages */ + if (S_ISREG(dchild->d_inode->i_mode) && + ll_i2info(dchild->d_inode)->lli_clob && + dirty_cnt(dchild->d_inode)) + op_data->op_cli_flags |= CLI_DIRTY_DATA; + if (fid_is_zero(&op_data->op_fid2)) + op_data->op_fid2 = op_data->op_fid3; + rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request); + ll_finish_md_op_data(op_data); + if (rc) + GOTO(out, rc); + + /* + * The server puts attributes in on the last unlink, use them to update + * the link count so the inode can be freed immediately. + */ + body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY); + if (body->mbo_valid & OBD_MD_FLNLINK) { + spin_lock(&dchild->d_inode->i_lock); + set_nlink(dchild->d_inode, body->mbo_nlink); + spin_unlock(&dchild->d_inode->i_lock); + } + + ll_update_times(request, dir); + +out: + ptlrpc_req_finished(request); + if (!rc) + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_UNLINK, + ktime_us_delta(ktime_get(), kstart)); + RETURN(rc); +} + +static int ll_rename(struct user_namespace *mnt_userns, + struct inode *src, struct dentry *src_dchild, + struct inode *tgt, struct dentry *tgt_dchild +#if defined(HAVE_USER_NAMESPACE_ARG) || defined(HAVE_IOPS_RENAME_WITH_FLAGS) + , unsigned int flags +#endif + ) +{ + struct ptlrpc_request *request = NULL; + struct ll_sb_info *sbi = ll_i2sbi(src); + struct md_op_data *op_data; + ktime_t kstart = ktime_get(); + umode_t mode = 0; + struct llcrypt_name foldname, fnewname; + int err; + ENTRY; + +#if defined(HAVE_USER_NAMESPACE_ARG) || defined(HAVE_IOPS_RENAME_WITH_FLAGS) + if (flags) + return -EINVAL; +#endif + + CDEBUG(D_VFSTRACE, + "VFS Op:oldname=%pd, src_dir="DFID"(%p), newname=%pd, tgt_dir="DFID"(%p)\n", + src_dchild, PFID(ll_inode2fid(src)), src, + tgt_dchild, PFID(ll_inode2fid(tgt)), tgt); + + if (unlikely(d_mountpoint(src_dchild) || d_mountpoint(tgt_dchild))) + RETURN(-EBUSY); + +#if defined(HAVE_USER_NAMESPACE_ARG) || defined(HAVE_IOPS_RENAME_WITH_FLAGS) + err = llcrypt_prepare_rename(src, src_dchild, tgt, tgt_dchild, flags); +#else + err = llcrypt_prepare_rename(src, src_dchild, tgt, tgt_dchild, 0); +#endif + if (err) + RETURN(err); + /* we prevent an encrypted file from being renamed + * into an unencrypted dir + */ + if (IS_ENCRYPTED(src) && !IS_ENCRYPTED(tgt)) + RETURN(-EXDEV); + + if (src_dchild->d_inode) + mode = src_dchild->d_inode->i_mode; + + if (tgt_dchild->d_inode) + mode = tgt_dchild->d_inode->i_mode; + + op_data = ll_prep_md_op_data(NULL, src, tgt, NULL, 0, mode, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + /* If the client is using a subdir mount and does a rename to what it + * sees as /.fscrypt, interpret it as the .fscrypt dir at fs root. 
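+ * (In that case lu_root_fid() below rewrites op_fid2 to the
+ * filesystem root FID.)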
+ */ + if (unlikely(is_root_inode(tgt) && !fid_is_root(ll_inode2fid(tgt)) && + tgt_dchild->d_name.len == strlen(dot_fscrypt_name) && + strncmp(tgt_dchild->d_name.name, dot_fscrypt_name, + tgt_dchild->d_name.len) == 0)) + lu_root_fid(&op_data->op_fid2); + + if (src_dchild->d_inode) + op_data->op_fid3 = *ll_inode2fid(src_dchild->d_inode); + + if (tgt_dchild->d_inode) + op_data->op_fid4 = *ll_inode2fid(tgt_dchild->d_inode); + + err = ll_setup_filename(src, &src_dchild->d_name, 1, &foldname, NULL); + if (err) + RETURN(err); + err = ll_setup_filename(tgt, &tgt_dchild->d_name, 1, &fnewname, NULL); + if (err) { + llcrypt_free_filename(&foldname); + RETURN(err); + } + err = md_rename(sbi->ll_md_exp, op_data, + foldname.disk_name.name, foldname.disk_name.len, + fnewname.disk_name.name, fnewname.disk_name.len, + &request); + llcrypt_free_filename(&foldname); + llcrypt_free_filename(&fnewname); + ll_finish_md_op_data(op_data); + if (!err) { + ll_update_times(request, src); + ll_update_times(request, tgt); + } + + ptlrpc_req_finished(request); + + if (!err) { + d_move(src_dchild, tgt_dchild); + ll_stats_ops_tally(sbi, LPROC_LL_RENAME, + ktime_us_delta(ktime_get(), kstart)); + } + + RETURN(err); +} + +const struct inode_operations ll_dir_inode_operations = { + .mknod = ll_mknod, + .atomic_open = ll_atomic_open, + .lookup = ll_lookup_nd, + .create = ll_create_nd, + /* We need all these non-raw things for NFSD, to not patch it. */ + .unlink = ll_unlink, + .mkdir = ll_mkdir, + .rmdir = ll_rmdir, + .symlink = ll_symlink, + .link = ll_link, + .rename = ll_rename, + .setattr = ll_setattr, + .getattr = ll_getattr, + .permission = ll_inode_permission, +#ifdef HAVE_IOP_XATTR + .setxattr = ll_setxattr, + .getxattr = ll_getxattr, + .removexattr = ll_removexattr, +#endif + .listxattr = ll_listxattr, + .get_acl = ll_get_acl, +#ifdef HAVE_IOP_SET_ACL + .set_acl = ll_set_acl, +#endif +}; + +const struct inode_operations ll_special_inode_operations = { + .setattr = ll_setattr, + .getattr = ll_getattr, + .permission = ll_inode_permission, +#ifdef HAVE_IOP_XATTR + .setxattr = ll_setxattr, + .getxattr = ll_getxattr, + .removexattr = ll_removexattr, +#endif + .listxattr = ll_listxattr, + .get_acl = ll_get_acl, +#ifdef HAVE_IOP_SET_ACL + .set_acl = ll_set_acl, +#endif +}; diff --git a/drivers/staging/lustrefsx/lustre/llite/pcc.c b/drivers/staging/lustrefsx/lustre/llite/pcc.c new file mode 100644 index 0000000000000..9f176b5ea92fa --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/pcc.c @@ -0,0 +1,2748 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, DDN Storage Corporation. + */ +/* + * Persistent Client Cache + * + * PCC is a new framework which provides a group of local cache on Lustre + * client side. 
It works in two modes: RW-PCC enables a read-write cache on the
+ * local SSDs of a single client; RO-PCC provides a read-only cache on the
+ * local SSDs of multiple clients. Less overhead is visible to the
+ * applications, and network latencies and lock conflicts can be
+ * significantly reduced.
+ *
+ * For RW-PCC, no global namespace will be provided. Each client uses its own
+ * local storage as a cache for itself. The local file system is used to
+ * manage the data on local caches. Cached I/O is directed to the local file
+ * system while normal I/O is directed to OSTs. RW-PCC uses HSM for data
+ * synchronization. It uses the HSM copytool to restore files from local
+ * caches to Lustre OSTs. Each PCC has a copytool instance running with a
+ * unique archive number. Any remote access from another Lustre client would
+ * trigger the data synchronization. If a client with RW-PCC goes offline,
+ * the cached data becomes temporarily inaccessible for other clients. After
+ * the RW-PCC client reboots and the copytool restarts, the data will be
+ * accessible again.
+ *
+ * The following is what happens under different conditions for RW-PCC:
+ *
+ * > When a file is being created on RW-PCC
+ *
+ * A normal HSM released file is created on MDT;
+ * An empty mirror file is created on local cache;
+ * The HSM status of the Lustre file will be set to archived and released;
+ * The archive number will be set to the proper value.
+ *
+ * > When a file is being prefetched to RW-PCC
+ *
+ * A file is copied to the local cache;
+ * The HSM status of the Lustre file will be set to archived and released;
+ * The archive number will be set to the proper value.
+ *
+ * > When a file is being accessed from PCC
+ *
+ * Data will be read directly from the local cache;
+ * Metadata will be read from MDT, except file size;
+ * The file size will be obtained from the local cache.
+ *
+ * > When a PCC-cached file is being accessed on another client
+ *
+ * RW-PCC cached files are automatically restored when a process on another
+ * client tries to read or modify them. The corresponding I/O will block
+ * waiting for the released file to be restored. This is transparent to the
+ * process.
+ *
+ * For RW-PCC, when a file is being created, a rule-based policy is used to
+ * determine whether it will be cached. Rule-based caching of newly created
+ * files can determine which files can use a cache on PCC directly without
+ * any admission control.
+ *
+ * The RW-PCC design can accelerate I/O intensive applications with
+ * one-to-one mappings between files and accessing clients. However, in
+ * several use cases, files will never be updated, but need to be read
+ * simultaneously from many clients. RO-PCC implements read-only caching on
+ * Lustre clients using SSDs. RO-PCC is based on the same framework as
+ * RW-PCC, except that no HSM mechanism is used.
+ *
+ * The main advantages of using this SSD cache on the Lustre clients via PCC
+ * are that:
+ * - The I/O stack becomes much simpler for the cached data, as there is no
+ * interference with I/Os from other clients, which enables easier
+ * performance optimizations;
+ * - The requirements on the HW inside the client nodes are small, any kind of
+ * SSDs or even HDDs can be used as cache devices;
+ * - Caching reduces the pressure on the object storage targets (OSTs), as
+ * small or random I/Os can be regularized to big sequential I/Os and
+ * temporary files do not even need to be flushed to OSTs.
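+ *
+ * As an illustration only (the path, IDs and file pattern here are made
+ * up), a RW-PCC backend with a caching rule can be configured through a
+ * command string of the form parsed by pcc_cmd_parse() below:
+ *
+ *   add /mnt/pcc projid={500}&fname={*.h5},uid={1001} rwid=2
+ *
+ * where ',' separates disjunctions, '&' separates conjunctions and each
+ * expression has the form "field={value ...}".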
+ *
+ * PCC can accelerate applications with certain I/O patterns:
+ * - small-sized random writes (< 1MB) from a single client
+ * - repeated read of data that is larger than RAM
+ * - clients with high network latency
+ *
+ * Author: Li Xi
+ * Author: Qian Yingjin
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include "pcc.h"
+#include <linux/namei.h>
+#include <linux/file.h>
+#include <lustre_compat.h>
+#include "llite_internal.h"
+
+struct kmem_cache *pcc_inode_slab;
+
+int pcc_super_init(struct pcc_super *super)
+{
+ struct cred *cred;
+
+ super->pccs_cred = cred = prepare_creds();
+ if (!cred)
+ return -ENOMEM;
+
+ /* Never override disk quota limits or use reserved space */
+ cap_lower(cred->cap_effective, CAP_SYS_RESOURCE);
+ init_rwsem(&super->pccs_rw_sem);
+ INIT_LIST_HEAD(&super->pccs_datasets);
+ super->pccs_generation = 1;
+
+ return 0;
+}
+
+/* Rule based auto caching */
+static void pcc_id_list_free(struct list_head *id_list)
+{
+ struct pcc_match_id *id, *n;
+
+ list_for_each_entry_safe(id, n, id_list, pmi_linkage) {
+ list_del_init(&id->pmi_linkage);
+ OBD_FREE_PTR(id);
+ }
+}
+
+static void pcc_fname_list_free(struct list_head *fname_list)
+{
+ struct pcc_match_fname *fname, *n;
+
+ list_for_each_entry_safe(fname, n, fname_list, pmf_linkage) {
+ OBD_FREE(fname->pmf_name, strlen(fname->pmf_name) + 1);
+ list_del_init(&fname->pmf_linkage);
+ OBD_FREE_PTR(fname);
+ }
+}
+
+static void pcc_expression_free(struct pcc_expression *expr)
+{
+ LASSERT(expr->pe_field >= PCC_FIELD_UID &&
+ expr->pe_field < PCC_FIELD_MAX);
+ switch (expr->pe_field) {
+ case PCC_FIELD_UID:
+ case PCC_FIELD_GID:
+ case PCC_FIELD_PROJID:
+ pcc_id_list_free(&expr->pe_cond);
+ break;
+ case PCC_FIELD_FNAME:
+ pcc_fname_list_free(&expr->pe_cond);
+ break;
+ default:
+ LBUG();
+ }
+ OBD_FREE_PTR(expr);
+}
+
+static void pcc_conjunction_free(struct pcc_conjunction *conjunction)
+{
+ struct pcc_expression *expression, *n;
+
+ LASSERT(list_empty(&conjunction->pc_linkage));
+ list_for_each_entry_safe(expression, n,
+ &conjunction->pc_expressions,
+ pe_linkage) {
+ list_del_init(&expression->pe_linkage);
+ pcc_expression_free(expression);
+ }
+ OBD_FREE_PTR(conjunction);
+}
+
+static void pcc_rule_conds_free(struct list_head *cond_list)
+{
+ struct pcc_conjunction *conjunction, *n;
+
+ list_for_each_entry_safe(conjunction, n, cond_list, pc_linkage) {
+ list_del_init(&conjunction->pc_linkage);
+ pcc_conjunction_free(conjunction);
+ }
+}
+
+static void pcc_cmd_fini(struct pcc_cmd *cmd)
+{
+ if (cmd->pccc_cmd == PCC_ADD_DATASET) {
+ if (!list_empty(&cmd->u.pccc_add.pccc_conds))
+ pcc_rule_conds_free(&cmd->u.pccc_add.pccc_conds);
+ if (cmd->u.pccc_add.pccc_conds_str)
+ OBD_FREE(cmd->u.pccc_add.pccc_conds_str,
+ strlen(cmd->u.pccc_add.pccc_conds_str) + 1);
+ }
+}
+
+#define PCC_DISJUNCTION_DELIM (',')
+#define PCC_CONJUNCTION_DELIM ('&')
+#define PCC_EXPRESSION_DELIM ('=')
+
+static int
+pcc_fname_list_add(struct cfs_lstr *id, struct list_head *fname_list)
+{
+ struct pcc_match_fname *fname;
+
+ OBD_ALLOC_PTR(fname);
+ if (fname == NULL)
+ return -ENOMEM;
+
+ OBD_ALLOC(fname->pmf_name, id->ls_len + 1);
+ if (fname->pmf_name == NULL) {
+ OBD_FREE_PTR(fname);
+ return -ENOMEM;
+ }
+
+ memcpy(fname->pmf_name, id->ls_str, id->ls_len);
+ list_add_tail(&fname->pmf_linkage, fname_list);
+ return 0;
+}
+
+static int
+pcc_fname_list_parse(char *str, int len, struct list_head *fname_list)
+{
+ struct cfs_lstr src;
+ struct cfs_lstr res;
+ int rc = 0;
+
+ ENTRY;
+
+ src.ls_str = str;
+ src.ls_len = len;
+ INIT_LIST_HEAD(fname_list);
+ while (src.ls_str) {
+ rc = cfs_gettok(&src,
' ', &res); + if (rc == 0) { + rc = -EINVAL; + break; + } + rc = pcc_fname_list_add(&res, fname_list); + if (rc) + break; + } + if (rc) + pcc_fname_list_free(fname_list); + RETURN(rc); +} + +static int +pcc_id_list_parse(char *str, int len, struct list_head *id_list, + enum pcc_field type) +{ + struct cfs_lstr src; + struct cfs_lstr res; + int rc = 0; + + ENTRY; + + if (type != PCC_FIELD_UID && type != PCC_FIELD_GID && + type != PCC_FIELD_PROJID) + RETURN(-EINVAL); + + src.ls_str = str; + src.ls_len = len; + INIT_LIST_HEAD(id_list); + while (src.ls_str) { + struct pcc_match_id *id; + __u32 id_val; + + if (cfs_gettok(&src, ' ', &res) == 0) + GOTO(out, rc = -EINVAL); + + if (!cfs_str2num_check(res.ls_str, res.ls_len, + &id_val, 0, (u32)~0U)) + GOTO(out, rc = -EINVAL); + + OBD_ALLOC_PTR(id); + if (id == NULL) + GOTO(out, rc = -ENOMEM); + + id->pmi_id = id_val; + list_add_tail(&id->pmi_linkage, id_list); + } +out: + if (rc) + pcc_id_list_free(id_list); + RETURN(rc); +} + +static inline bool +pcc_check_field(struct cfs_lstr *field, char *str) +{ + int len = strlen(str); + + return (field->ls_len == len && + strncmp(field->ls_str, str, len) == 0); +} + +static int +pcc_expression_parse(struct cfs_lstr *src, struct list_head *cond_list) +{ + struct pcc_expression *expr; + struct cfs_lstr field; + int rc = 0; + + OBD_ALLOC_PTR(expr); + if (expr == NULL) + return -ENOMEM; + + rc = cfs_gettok(src, PCC_EXPRESSION_DELIM, &field); + if (rc == 0 || src->ls_len <= 2 || src->ls_str[0] != '{' || + src->ls_str[src->ls_len - 1] != '}') + GOTO(out, rc = -EINVAL); + + /* Skip '{' and '}' */ + src->ls_str++; + src->ls_len -= 2; + + if (pcc_check_field(&field, "uid")) { + if (pcc_id_list_parse(src->ls_str, + src->ls_len, + &expr->pe_cond, + PCC_FIELD_UID) < 0) + GOTO(out, rc = -EINVAL); + expr->pe_field = PCC_FIELD_UID; + } else if (pcc_check_field(&field, "gid")) { + if (pcc_id_list_parse(src->ls_str, + src->ls_len, + &expr->pe_cond, + PCC_FIELD_GID) < 0) + GOTO(out, rc = -EINVAL); + expr->pe_field = PCC_FIELD_GID; + } else if (pcc_check_field(&field, "projid")) { + if (pcc_id_list_parse(src->ls_str, + src->ls_len, + &expr->pe_cond, + PCC_FIELD_PROJID) < 0) + GOTO(out, rc = -EINVAL); + expr->pe_field = PCC_FIELD_PROJID; + } else if (pcc_check_field(&field, "fname")) { + if (pcc_fname_list_parse(src->ls_str, + src->ls_len, + &expr->pe_cond) < 0) + GOTO(out, rc = -EINVAL); + expr->pe_field = PCC_FIELD_FNAME; + } else { + GOTO(out, rc = -EINVAL); + } + + list_add_tail(&expr->pe_linkage, cond_list); + return 0; +out: + OBD_FREE_PTR(expr); + return rc; +} + +static int +pcc_conjunction_parse(struct cfs_lstr *src, struct list_head *cond_list) +{ + struct pcc_conjunction *conjunction; + struct cfs_lstr expr; + int rc = 0; + + OBD_ALLOC_PTR(conjunction); + if (conjunction == NULL) + return -ENOMEM; + + INIT_LIST_HEAD(&conjunction->pc_expressions); + list_add_tail(&conjunction->pc_linkage, cond_list); + + while (src->ls_str) { + rc = cfs_gettok(src, PCC_CONJUNCTION_DELIM, &expr); + if (rc == 0) { + rc = -EINVAL; + break; + } + rc = pcc_expression_parse(&expr, + &conjunction->pc_expressions); + if (rc) + break; + } + return rc; +} + +static int pcc_conds_parse(char *str, int len, struct list_head *cond_list) +{ + struct cfs_lstr src; + struct cfs_lstr res; + int rc = 0; + + src.ls_str = str; + src.ls_len = len; + INIT_LIST_HEAD(cond_list); + while (src.ls_str) { + rc = cfs_gettok(&src, PCC_DISJUNCTION_DELIM, &res); + if (rc == 0) { + rc = -EINVAL; + break; + } + rc = pcc_conjunction_parse(&res, cond_list); + if (rc) + 
break; + } + return rc; +} + +static int pcc_id_parse(struct pcc_cmd *cmd, const char *id) +{ + int rc; + + OBD_ALLOC(cmd->u.pccc_add.pccc_conds_str, strlen(id) + 1); + if (cmd->u.pccc_add.pccc_conds_str == NULL) + return -ENOMEM; + + memcpy(cmd->u.pccc_add.pccc_conds_str, id, strlen(id)); + + rc = pcc_conds_parse(cmd->u.pccc_add.pccc_conds_str, + strlen(cmd->u.pccc_add.pccc_conds_str), + &cmd->u.pccc_add.pccc_conds); + if (rc) + pcc_cmd_fini(cmd); + + return rc; +} + +static int +pcc_parse_value_pair(struct pcc_cmd *cmd, char *buffer) +{ + char *key, *val; + unsigned long id; + int rc; + + val = buffer; + key = strsep(&val, "="); + if (val == NULL || strlen(val) == 0) + return -EINVAL; + + /* Key of the value pair */ + if (strcmp(key, "rwid") == 0) { + rc = kstrtoul(val, 10, &id); + if (rc) + return rc; + if (id <= 0) + return -EINVAL; + cmd->u.pccc_add.pccc_rwid = id; + } else if (strcmp(key, "roid") == 0) { + rc = kstrtoul(val, 10, &id); + if (rc) + return rc; + if (id <= 0) + return -EINVAL; + cmd->u.pccc_add.pccc_roid = id; + } else if (strcmp(key, "auto_attach") == 0) { + rc = kstrtoul(val, 10, &id); + if (rc) + return rc; + if (id == 0) + cmd->u.pccc_add.pccc_flags &= ~PCC_DATASET_AUTO_ATTACH; + } else if (strcmp(key, "open_attach") == 0) { + rc = kstrtoul(val, 10, &id); + if (rc) + return rc; + if (id == 0) + cmd->u.pccc_add.pccc_flags &= ~PCC_DATASET_OPEN_ATTACH; + } else if (strcmp(key, "io_attach") == 0) { + rc = kstrtoul(val, 10, &id); + if (rc) + return rc; + if (id == 0) + cmd->u.pccc_add.pccc_flags &= ~PCC_DATASET_IO_ATTACH; + } else if (strcmp(key, "stat_attach") == 0) { + rc = kstrtoul(val, 10, &id); + if (rc) + return rc; + if (id == 0) + cmd->u.pccc_add.pccc_flags &= ~PCC_DATASET_STAT_ATTACH; + } else if (strcmp(key, "rwpcc") == 0) { + rc = kstrtoul(val, 10, &id); + if (rc) + return rc; + if (id > 0) + cmd->u.pccc_add.pccc_flags |= PCC_DATASET_RWPCC; + } else if (strcmp(key, "ropcc") == 0) { + rc = kstrtoul(val, 10, &id); + if (rc) + return rc; + if (id > 0) + cmd->u.pccc_add.pccc_flags |= PCC_DATASET_ROPCC; + } else { + return -EINVAL; + } + + return 0; +} + +static int +pcc_parse_value_pairs(struct pcc_cmd *cmd, char *buffer) +{ + char *val; + char *token; + int rc; + + switch (cmd->pccc_cmd) { + case PCC_ADD_DATASET: + /* Enable auto attach by default */ + cmd->u.pccc_add.pccc_flags |= PCC_DATASET_AUTO_ATTACH; + break; + case PCC_DEL_DATASET: + case PCC_CLEAR_ALL: + break; + default: + return -EINVAL; + } + + val = buffer; + while (val != NULL && strlen(val) != 0) { + token = strsep(&val, " "); + rc = pcc_parse_value_pair(cmd, token); + if (rc) + return rc; + } + + switch (cmd->pccc_cmd) { + case PCC_ADD_DATASET: + if (cmd->u.pccc_add.pccc_flags & PCC_DATASET_RWPCC && + cmd->u.pccc_add.pccc_flags & PCC_DATASET_ROPCC) + return -EINVAL; + /* + * By default, a PCC backend can provide caching service for + * both RW-PCC and RO-PCC. + */ + if ((cmd->u.pccc_add.pccc_flags & PCC_DATASET_PCC_ALL) == 0) + cmd->u.pccc_add.pccc_flags |= PCC_DATASET_PCC_ALL; + + /* For RW-PCC, the value of @rwid must be non zero. 
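+ * (@rwid is the archive number that the dataset's HSM copytool
+ * instance serves; see the RW-PCC description in the file header.)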
*/ + if (cmd->u.pccc_add.pccc_flags & PCC_DATASET_RWPCC && + cmd->u.pccc_add.pccc_rwid == 0) + return -EINVAL; + + break; + case PCC_DEL_DATASET: + case PCC_CLEAR_ALL: + break; + default: + return -EINVAL; + } + return 0; +} + +static void +pcc_dataset_rule_fini(struct pcc_match_rule *rule) +{ + if (!list_empty(&rule->pmr_conds)) + pcc_rule_conds_free(&rule->pmr_conds); + LASSERT(rule->pmr_conds_str != NULL); + OBD_FREE(rule->pmr_conds_str, strlen(rule->pmr_conds_str) + 1); +} + +static int +pcc_dataset_rule_init(struct pcc_match_rule *rule, struct pcc_cmd *cmd) +{ + int rc = 0; + + LASSERT(cmd->u.pccc_add.pccc_conds_str); + OBD_ALLOC(rule->pmr_conds_str, + strlen(cmd->u.pccc_add.pccc_conds_str) + 1); + if (rule->pmr_conds_str == NULL) + return -ENOMEM; + + memcpy(rule->pmr_conds_str, + cmd->u.pccc_add.pccc_conds_str, + strlen(cmd->u.pccc_add.pccc_conds_str)); + + INIT_LIST_HEAD(&rule->pmr_conds); + if (!list_empty(&cmd->u.pccc_add.pccc_conds)) + rc = pcc_conds_parse(rule->pmr_conds_str, + strlen(rule->pmr_conds_str), + &rule->pmr_conds); + + if (rc) + pcc_dataset_rule_fini(rule); + + return rc; +} + +/* Rule Matching */ +static int +pcc_id_list_match(struct list_head *id_list, __u32 id_val) +{ + struct pcc_match_id *id; + + list_for_each_entry(id, id_list, pmi_linkage) { + if (id->pmi_id == id_val) + return 1; + } + return 0; +} + +static bool +cfs_match_wildcard(const char *pattern, const char *content) +{ + if (*pattern == '\0' && *content == '\0') + return true; + + if (*pattern == '*' && *(pattern + 1) != '\0' && *content == '\0') + return false; + + while (*pattern == *content) { + pattern++; + content++; + if (*pattern == '\0' && *content == '\0') + return true; + + if (*pattern == '*' && *(pattern + 1) != '\0' && + *content == '\0') + return false; + } + + if (*pattern == '*') + return (cfs_match_wildcard(pattern + 1, content) || + cfs_match_wildcard(pattern, content + 1)); + + return false; +} + +static int +pcc_fname_list_match(struct list_head *fname_list, const char *name) +{ + struct pcc_match_fname *fname; + + list_for_each_entry(fname, fname_list, pmf_linkage) { + if (cfs_match_wildcard(fname->pmf_name, name)) + return 1; + } + return 0; +} + +static int +pcc_expression_match(struct pcc_expression *expr, struct pcc_matcher *matcher) +{ + switch (expr->pe_field) { + case PCC_FIELD_UID: + return pcc_id_list_match(&expr->pe_cond, matcher->pm_uid); + case PCC_FIELD_GID: + return pcc_id_list_match(&expr->pe_cond, matcher->pm_gid); + case PCC_FIELD_PROJID: + return pcc_id_list_match(&expr->pe_cond, matcher->pm_projid); + case PCC_FIELD_FNAME: + return pcc_fname_list_match(&expr->pe_cond, + matcher->pm_name->name); + default: + return 0; + } +} + +static int +pcc_conjunction_match(struct pcc_conjunction *conjunction, + struct pcc_matcher *matcher) +{ + struct pcc_expression *expr; + int matched; + + list_for_each_entry(expr, &conjunction->pc_expressions, pe_linkage) { + matched = pcc_expression_match(expr, matcher); + if (!matched) + return 0; + } + + return 1; +} + +static int +pcc_cond_match(struct pcc_match_rule *rule, struct pcc_matcher *matcher) +{ + struct pcc_conjunction *conjunction; + int matched; + + list_for_each_entry(conjunction, &rule->pmr_conds, pc_linkage) { + matched = pcc_conjunction_match(conjunction, matcher); + if (matched) + return 1; + } + + return 0; +} + +struct pcc_dataset* +pcc_dataset_match_get(struct pcc_super *super, struct pcc_matcher *matcher) +{ + struct pcc_dataset *dataset; + struct pcc_dataset *selected = NULL; + + down_read(&super->pccs_rw_sem); 
+ list_for_each_entry(dataset, &super->pccs_datasets, pccd_linkage) { + if (!(dataset->pccd_flags & PCC_DATASET_RWPCC)) + continue; + + if (pcc_cond_match(&dataset->pccd_rule, matcher)) { + atomic_inc(&dataset->pccd_refcount); + selected = dataset; + break; + } + } + up_read(&super->pccs_rw_sem); + if (selected) + CDEBUG(D_CACHE, "PCC create, matched %s - %d:%d:%d:%s\n", + dataset->pccd_rule.pmr_conds_str, + matcher->pm_uid, matcher->pm_gid, + matcher->pm_projid, matcher->pm_name->name); + + return selected; +} + +/** + * pcc_dataset_add - Add a Cache policy to control which files need be + * cached and where it will be cached. + * + * @super: superblock of pcc + * @cmd: pcc command + */ +static int +pcc_dataset_add(struct pcc_super *super, struct pcc_cmd *cmd) +{ + char *pathname = cmd->pccc_pathname; + struct pcc_dataset *dataset; + struct pcc_dataset *tmp; + bool found = false; + int rc; + + OBD_ALLOC_PTR(dataset); + if (dataset == NULL) + return -ENOMEM; + + rc = kern_path(pathname, LOOKUP_DIRECTORY, &dataset->pccd_path); + if (unlikely(rc)) { + OBD_FREE_PTR(dataset); + return rc; + } + strncpy(dataset->pccd_pathname, pathname, PATH_MAX); + dataset->pccd_rwid = cmd->u.pccc_add.pccc_rwid; + dataset->pccd_roid = cmd->u.pccc_add.pccc_roid; + dataset->pccd_flags = cmd->u.pccc_add.pccc_flags; + atomic_set(&dataset->pccd_refcount, 1); + + rc = pcc_dataset_rule_init(&dataset->pccd_rule, cmd); + if (rc) { + pcc_dataset_put(dataset); + return rc; + } + + down_write(&super->pccs_rw_sem); + list_for_each_entry(tmp, &super->pccs_datasets, pccd_linkage) { + if (strcmp(tmp->pccd_pathname, pathname) == 0 || + (dataset->pccd_rwid != 0 && + dataset->pccd_rwid == tmp->pccd_rwid) || + (dataset->pccd_roid != 0 && + dataset->pccd_roid == tmp->pccd_roid)) { + found = true; + break; + } + } + if (!found) + list_add(&dataset->pccd_linkage, &super->pccs_datasets); + up_write(&super->pccs_rw_sem); + + if (found) { + pcc_dataset_put(dataset); + rc = -EEXIST; + } + + return rc; +} + +struct pcc_dataset * +pcc_dataset_get(struct pcc_super *super, enum lu_pcc_type type, __u32 id) +{ + struct pcc_dataset *dataset; + struct pcc_dataset *selected = NULL; + + if (id == 0) + return NULL; + + /* + * archive ID (read-write ID) or read-only ID is unique in the list, + * we just return last added one as first priority. 
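+	 * (pcc_dataset_add() inserts new datasets at the head of
+	 * pccs_datasets, so the most recently added matching dataset is
+	 * the one returned here.)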
+ */ + down_read(&super->pccs_rw_sem); + list_for_each_entry(dataset, &super->pccs_datasets, pccd_linkage) { + if (type == LU_PCC_READWRITE && (dataset->pccd_rwid != id || + !(dataset->pccd_flags & PCC_DATASET_RWPCC))) + continue; + atomic_inc(&dataset->pccd_refcount); + selected = dataset; + break; + } + up_read(&super->pccs_rw_sem); + if (selected) + CDEBUG(D_CACHE, "matched id %u, PCC mode %d\n", id, type); + + return selected; +} + +void +pcc_dataset_put(struct pcc_dataset *dataset) +{ + if (atomic_dec_and_test(&dataset->pccd_refcount)) { + pcc_dataset_rule_fini(&dataset->pccd_rule); + path_put(&dataset->pccd_path); + OBD_FREE_PTR(dataset); + } +} + +static int +pcc_dataset_del(struct pcc_super *super, char *pathname) +{ + struct list_head *l, *tmp; + struct pcc_dataset *dataset; + int rc = -ENOENT; + + down_write(&super->pccs_rw_sem); + list_for_each_safe(l, tmp, &super->pccs_datasets) { + dataset = list_entry(l, struct pcc_dataset, pccd_linkage); + if (strcmp(dataset->pccd_pathname, pathname) == 0) { + list_del_init(&dataset->pccd_linkage); + pcc_dataset_put(dataset); + super->pccs_generation++; + rc = 0; + break; + } + } + up_write(&super->pccs_rw_sem); + return rc; +} + +static void +pcc_dataset_dump(struct pcc_dataset *dataset, struct seq_file *m) +{ + seq_printf(m, "%s:\n", dataset->pccd_pathname); + seq_printf(m, " rwid: %u\n", dataset->pccd_rwid); + seq_printf(m, " flags: %x\n", dataset->pccd_flags); + seq_printf(m, " autocache: %s\n", dataset->pccd_rule.pmr_conds_str); +} + +int +pcc_super_dump(struct pcc_super *super, struct seq_file *m) +{ + struct pcc_dataset *dataset; + + down_read(&super->pccs_rw_sem); + list_for_each_entry(dataset, &super->pccs_datasets, pccd_linkage) { + pcc_dataset_dump(dataset, m); + } + up_read(&super->pccs_rw_sem); + return 0; +} + +static void pcc_remove_datasets(struct pcc_super *super) +{ + struct pcc_dataset *dataset, *tmp; + + down_write(&super->pccs_rw_sem); + list_for_each_entry_safe(dataset, tmp, + &super->pccs_datasets, pccd_linkage) { + list_del(&dataset->pccd_linkage); + pcc_dataset_put(dataset); + } + super->pccs_generation++; + up_write(&super->pccs_rw_sem); +} + +void pcc_super_fini(struct pcc_super *super) +{ + pcc_remove_datasets(super); + put_cred(super->pccs_cred); +} + +static bool pathname_is_valid(const char *pathname) +{ + /* Needs to be absolute path */ + if (pathname == NULL || strlen(pathname) == 0 || + strlen(pathname) >= PATH_MAX || pathname[0] != '/') + return false; + return true; +} + +static struct pcc_cmd * +pcc_cmd_parse(char *buffer, unsigned long count) +{ + static struct pcc_cmd *cmd; + char *token; + char *val; + int rc = 0; + + OBD_ALLOC_PTR(cmd); + if (cmd == NULL) + GOTO(out, rc = -ENOMEM); + + /* clear all setting */ + if (strncmp(buffer, "clear", 5) == 0) { + cmd->pccc_cmd = PCC_CLEAR_ALL; + GOTO(out, rc = 0); + } + + val = buffer; + token = strsep(&val, " "); + if (val == NULL || strlen(val) == 0) + GOTO(out_free_cmd, rc = -EINVAL); + + /* Type of the command */ + if (strcmp(token, "add") == 0) + cmd->pccc_cmd = PCC_ADD_DATASET; + else if (strcmp(token, "del") == 0) + cmd->pccc_cmd = PCC_DEL_DATASET; + else + GOTO(out_free_cmd, rc = -EINVAL); + + /* Pathname of the dataset */ + token = strsep(&val, " "); + if ((val == NULL && cmd->pccc_cmd != PCC_DEL_DATASET) || + !pathname_is_valid(token)) + GOTO(out_free_cmd, rc = -EINVAL); + cmd->pccc_pathname = token; + + if (cmd->pccc_cmd == PCC_ADD_DATASET) { + /* List of ID */ + LASSERT(val); + token = val; + val = strrchr(token, '}'); + if (!val) + 
GOTO(out_free_cmd, rc = -EINVAL); + + /* Skip '}' */ + val++; + if (*val == '\0') { + val = NULL; + } else if (*val == ' ') { + *val = '\0'; + val++; + } else { + GOTO(out_free_cmd, rc = -EINVAL); + } + + rc = pcc_id_parse(cmd, token); + if (rc) + GOTO(out_free_cmd, rc); + + rc = pcc_parse_value_pairs(cmd, val); + if (rc) + GOTO(out_cmd_fini, rc = -EINVAL); + } + goto out; +out_cmd_fini: + pcc_cmd_fini(cmd); +out_free_cmd: + OBD_FREE_PTR(cmd); +out: + if (rc) + cmd = ERR_PTR(rc); + return cmd; +} + +int pcc_cmd_handle(char *buffer, unsigned long count, + struct pcc_super *super) +{ + int rc = 0; + struct pcc_cmd *cmd; + + cmd = pcc_cmd_parse(buffer, count); + if (IS_ERR(cmd)) + return PTR_ERR(cmd); + + switch (cmd->pccc_cmd) { + case PCC_ADD_DATASET: + rc = pcc_dataset_add(super, cmd); + break; + case PCC_DEL_DATASET: + rc = pcc_dataset_del(super, cmd->pccc_pathname); + break; + case PCC_CLEAR_ALL: + pcc_remove_datasets(super); + break; + default: + rc = -EINVAL; + break; + } + + pcc_cmd_fini(cmd); + OBD_FREE_PTR(cmd); + return rc; +} + +static inline void pcc_inode_lock(struct inode *inode) +{ + mutex_lock(&ll_i2info(inode)->lli_pcc_lock); +} + +static inline void pcc_inode_unlock(struct inode *inode) +{ + mutex_unlock(&ll_i2info(inode)->lli_pcc_lock); +} + +static void pcc_inode_init(struct pcc_inode *pcci, struct ll_inode_info *lli) +{ + pcci->pcci_lli = lli; + lli->lli_pcc_inode = pcci; + atomic_set(&pcci->pcci_refcount, 0); + pcci->pcci_type = LU_PCC_NONE; + pcci->pcci_layout_gen = CL_LAYOUT_GEN_NONE; + atomic_set(&pcci->pcci_active_ios, 0); + init_waitqueue_head(&pcci->pcci_waitq); +} + +static void pcc_inode_fini(struct pcc_inode *pcci) +{ + struct ll_inode_info *lli = pcci->pcci_lli; + + path_put(&pcci->pcci_path); + pcci->pcci_type = LU_PCC_NONE; + OBD_SLAB_FREE_PTR(pcci, pcc_inode_slab); + lli->lli_pcc_inode = NULL; +} + +static void pcc_inode_get(struct pcc_inode *pcci) +{ + atomic_inc(&pcci->pcci_refcount); +} + +static void pcc_inode_put(struct pcc_inode *pcci) +{ + if (atomic_dec_and_test(&pcci->pcci_refcount)) + pcc_inode_fini(pcci); +} + +void pcc_inode_free(struct inode *inode) +{ + struct pcc_inode *pcci = ll_i2pcci(inode); + + if (pcci) { + WARN_ON(atomic_read(&pcci->pcci_refcount) > 1); + pcc_inode_put(pcci); + } +} + +/* + * TODO: + * As Andreas suggested, we'd better use new layout to + * reduce overhead: + * (fid->f_oid >> 16 & oxFFFF)/FID + */ +#define PCC_DATASET_MAX_PATH (6 * 5 + FID_NOBRACE_LEN + 1) +static int pcc_fid2dataset_path(char *buf, int sz, struct lu_fid *fid) +{ + return scnprintf(buf, sz, "%04x/%04x/%04x/%04x/%04x/%04x/" + DFID_NOBRACE, + (fid)->f_oid & 0xFFFF, + (fid)->f_oid >> 16 & 0xFFFF, + (unsigned int)((fid)->f_seq & 0xFFFF), + (unsigned int)((fid)->f_seq >> 16 & 0xFFFF), + (unsigned int)((fid)->f_seq >> 32 & 0xFFFF), + (unsigned int)((fid)->f_seq >> 48 & 0xFFFF), + PFID(fid)); +} + +static inline const struct cred *pcc_super_cred(struct super_block *sb) +{ + return ll_s2sbi(sb)->ll_pcc_super.pccs_cred; +} + +void pcc_file_init(struct pcc_file *pccf) +{ + pccf->pccf_file = NULL; + pccf->pccf_type = LU_PCC_NONE; +} + +static inline bool pcc_auto_attach_enabled(enum pcc_dataset_flags flags, + enum pcc_io_type iot) +{ + if (iot == PIT_OPEN) + return flags & PCC_DATASET_OPEN_ATTACH; + if (iot == PIT_GETATTR) + return flags & PCC_DATASET_STAT_ATTACH; + else + return flags & PCC_DATASET_AUTO_ATTACH; +} + +static const char pcc_xattr_layout[] = XATTR_USER_PREFIX "PCC.layout"; + +static int pcc_layout_xattr_set(struct pcc_inode *pcci, __u32 gen) +{ + 
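+	/*
+	 * Record the Lustre layout generation in the "user.PCC.layout"
+	 * xattr on the backend copy.  Auto attach later reads this xattr
+	 * back and compares it with the current layout generation to
+	 * decide whether the cached copy is still usable.
+	 */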
struct dentry *pcc_dentry = pcci->pcci_path.dentry; + struct ll_inode_info *lli = pcci->pcci_lli; + int rc; + + ENTRY; + + if (!(lli->lli_pcc_dsflags & PCC_DATASET_AUTO_ATTACH)) + RETURN(0); + + rc = ll_vfs_setxattr(pcc_dentry, pcc_dentry->d_inode, pcc_xattr_layout, + &gen, sizeof(gen), 0); + + RETURN(rc); +} + +static int pcc_get_layout_info(struct inode *inode, struct cl_layout *clt) +{ + struct lu_env *env; + struct ll_inode_info *lli = ll_i2info(inode); + __u16 refcheck; + int rc; + + ENTRY; + + if (!lli->lli_clob) + RETURN(-EINVAL); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + rc = cl_object_layout_get(env, lli->lli_clob, clt); + if (rc < 0) + CDEBUG(D_INODE, "Cannot get layout for "DFID"\n", + PFID(ll_inode2fid(inode))); + + cl_env_put(env, &refcheck); + RETURN(rc < 0 ? rc : 0); +} + +/* Must be called with pcci->pcci_lock held */ +static void pcc_inode_attach_init(struct pcc_dataset *dataset, + struct pcc_inode *pcci, + struct dentry *dentry, + enum lu_pcc_type type) +{ + pcci->pcci_path.mnt = mntget(dataset->pccd_path.mnt); + pcci->pcci_path.dentry = dentry; + LASSERT(atomic_read(&pcci->pcci_refcount) == 0); + atomic_set(&pcci->pcci_refcount, 1); + pcci->pcci_type = type; + pcci->pcci_attr_valid = false; +} + +static inline void pcc_inode_dsflags_set(struct ll_inode_info *lli, + struct pcc_dataset *dataset) +{ + lli->lli_pcc_generation = ll_info2pccs(lli)->pccs_generation; + lli->lli_pcc_dsflags = dataset->pccd_flags; +} + +static void pcc_inode_attach_set(struct pcc_super *super, + struct pcc_dataset *dataset, + struct ll_inode_info *lli, + struct pcc_inode *pcci, + struct dentry *dentry, + enum lu_pcc_type type) +{ + pcc_inode_init(pcci, lli); + pcc_inode_attach_init(dataset, pcci, dentry, type); + down_read(&super->pccs_rw_sem); + pcc_inode_dsflags_set(lli, dataset); + up_read(&super->pccs_rw_sem); +} + +static inline void pcc_layout_gen_set(struct pcc_inode *pcci, + __u32 gen) +{ + pcci->pcci_layout_gen = gen; +} + +static inline bool pcc_inode_has_layout(struct pcc_inode *pcci) +{ + return pcci->pcci_layout_gen != CL_LAYOUT_GEN_NONE; +} + +static struct dentry *pcc_lookup(struct dentry *base, char *pathname) +{ + char *ptr = NULL, *component; + struct dentry *parent; + struct dentry *child = ERR_PTR(-ENOENT); + + ptr = pathname; + + /* move past any initial '/' to the start of the first path component*/ + while (*ptr == '/') + ptr++; + + /* store the start of the first path component */ + component = ptr; + + parent = dget(base); + while (ptr) { + /* find the start of the next component - if we don't find it, + * the current component is the last component + */ + ptr = strchr(ptr, '/'); + /* put a NUL char in place of the '/' before the next compnent + * so we can treat this component as a string; note the full + * path string is NUL terminated to this is not needed for the + * last component + */ + if (ptr) + *ptr = '\0'; + + /* look up the current component */ + inode_lock(parent->d_inode); + child = lookup_one_len(component, parent, strlen(component)); + inode_unlock(parent->d_inode); + + /* repair the path string: put '/' back in place of the NUL */ + if (ptr) + *ptr = '/'; + + dput(parent); + + if (IS_ERR_OR_NULL(child)) + break; + + /* we may find a cached negative dentry */ + if (!d_is_positive(child)) { + dput(child); + child = NULL; + break; + } + + /* descend in to the next level of the path */ + parent = child; + + /* move the pointer past the '/' to the next component */ + if (ptr) + ptr++; + component = ptr; + } + + /* NULL child 
means we didn't find anything */ + if (!child) + child = ERR_PTR(-ENOENT); + + return child; +} + +static int pcc_try_dataset_attach(struct inode *inode, __u32 gen, + enum lu_pcc_type type, + struct pcc_dataset *dataset, + bool *cached) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct pcc_inode *pcci = lli->lli_pcc_inode; + const struct cred *old_cred; + struct dentry *pcc_dentry = NULL; + char pathname[PCC_DATASET_MAX_PATH]; + __u32 pcc_gen; + int rc; + + ENTRY; + + if (type == LU_PCC_READWRITE && + !(dataset->pccd_flags & PCC_DATASET_RWPCC)) + RETURN(0); + + rc = pcc_fid2dataset_path(pathname, PCC_DATASET_MAX_PATH, + &lli->lli_fid); + + old_cred = override_creds(pcc_super_cred(inode->i_sb)); + pcc_dentry = pcc_lookup(dataset->pccd_path.dentry, pathname); + if (IS_ERR(pcc_dentry)) { + rc = PTR_ERR(pcc_dentry); + CDEBUG(D_CACHE, "%s: path lookup error on "DFID":%s: rc = %d\n", + ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid), + pathname, rc); + /* ignore this error */ + GOTO(out, rc = 0); + } + + rc = ll_vfs_getxattr(pcc_dentry, pcc_dentry->d_inode, pcc_xattr_layout, + &pcc_gen, sizeof(pcc_gen)); + if (rc < 0) + /* ignore this error */ + GOTO(out_put_pcc_dentry, rc = 0); + + rc = 0; + /* The file is still valid cached in PCC, attach it immediately. */ + if (pcc_gen == gen) { + CDEBUG(D_CACHE, DFID" L.Gen (%d) consistent, auto attached.\n", + PFID(&lli->lli_fid), gen); + if (!pcci) { + OBD_SLAB_ALLOC_PTR_GFP(pcci, pcc_inode_slab, GFP_NOFS); + if (pcci == NULL) + GOTO(out_put_pcc_dentry, rc = -ENOMEM); + + pcc_inode_init(pcci, lli); + dget(pcc_dentry); + pcc_inode_attach_init(dataset, pcci, pcc_dentry, type); + } else { + /* + * This happened when a file was once attached into + * PCC, and some processes keep this file opened + * (pcci->refcount > 1) and corresponding PCC file + * without any I/O activity, and then this file was + * detached by the manual detach command or the + * revocation of the layout lock (i.e. cached LRU lock + * shrinking). + */ + pcc_inode_get(pcci); + pcci->pcci_type = type; + } + pcc_inode_dsflags_set(lli, dataset); + pcc_layout_gen_set(pcci, gen); + *cached = true; + } +out_put_pcc_dentry: + dput(pcc_dentry); +out: + revert_creds(old_cred); + RETURN(rc); +} + +static int pcc_try_datasets_attach(struct inode *inode, enum pcc_io_type iot, + __u32 gen, enum lu_pcc_type type, + bool *cached) +{ + struct pcc_super *super = &ll_i2sbi(inode)->ll_pcc_super; + struct ll_inode_info *lli = ll_i2info(inode); + struct pcc_dataset *dataset = NULL, *tmp; + int rc = 0; + + ENTRY; + + down_read(&super->pccs_rw_sem); + list_for_each_entry_safe(dataset, tmp, + &super->pccs_datasets, pccd_linkage) { + if (!pcc_auto_attach_enabled(dataset->pccd_flags, iot)) + break; + + rc = pcc_try_dataset_attach(inode, gen, type, dataset, cached); + if (rc < 0 || (!rc && *cached)) + break; + } + + /* + * Update the saved dataset flags for the inode accordingly if failed. + */ + if (!rc && !*cached) { + /* + * Currently auto attach strategy for a PCC backend is + * unchangeable once once it was added into the PCC datasets on + * a client as the support to change auto attach strategy is + * not implemented yet. 
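+	 * Changing a dataset's auto attach behaviour thus requires
+	 * deleting and re-adding it; the deletion bumps pccs_generation,
+	 * which invalidates the dsflags cached on each inode.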
+ */ + /* + * If tried to attach from one PCC backend: + * @lli_pcc_generation > 0: + * 1) The file was once attached into PCC, but now the + * corresponding PCC backend should be removed from the client; + * 2) The layout generation was changed, the data has been + * restored; + * 3) The corresponding PCC copy is not existed on PCC + * @lli_pcc_generation == 0: + * The file is never attached into PCC but in a HSM released + * state, or once attached into PCC but the inode was evicted + * from icache later. + * Set the saved dataset flags with PCC_DATASET_NONE. Then this + * file will skip from the candidates to try auto attach until + * the file is attached into PCC again. + * + * If the file was never attached into PCC, or once attached but + * its inode was evicted from icache (lli_pcc_generation == 0), + * or the corresponding dataset was removed from the client, + * set the saved dataset flags with PCC_DATASET_NONE. + * + * TODO: If the file was once attached into PCC but not try to + * auto attach due to the change of the configuration parameters + * for this dataset (i.e. change from auto attach enabled to + * auto attach disabled for this dataset), update the saved + * dataset flags with the found one. + */ + lli->lli_pcc_dsflags = PCC_DATASET_NONE; + } + up_read(&super->pccs_rw_sem); + + RETURN(rc); +} + +/* + * TODO: For RW-PCC, it is desirable to store HSM info as a layout (LU-10606). + * Thus the client can get archive ID from the layout directly. When try to + * attach the file automatically which is in HSM released state (according to + * LOV_PATTERN_F_RELEASED in the layout), it can determine whether the file is + * valid cached on PCC more precisely according to the @rwid (archive ID) in + * the PCC dataset and the archive ID in HSM attrs. + */ +static int pcc_try_auto_attach(struct inode *inode, bool *cached, + enum pcc_io_type iot) +{ + struct pcc_super *super = &ll_i2sbi(inode)->ll_pcc_super; + struct cl_layout clt = { + .cl_layout_gen = 0, + .cl_is_released = false, + }; + struct ll_inode_info *lli = ll_i2info(inode); + __u32 gen; + int rc; + + ENTRY; + + /* + * Quick check whether there is PCC device. + */ + if (list_empty(&super->pccs_datasets)) + RETURN(0); + + /* + * The file layout lock was cancelled. And this open does not + * obtain valid layout lock from MDT (i.e. the file is being + * HSM restoring). + */ + if (iot == PIT_OPEN) { + if (ll_layout_version_get(lli) == CL_LAYOUT_GEN_NONE) + RETURN(0); + } else { + rc = ll_layout_refresh(inode, &gen); + if (rc) + RETURN(rc); + } + + rc = pcc_get_layout_info(inode, &clt); + if (rc) + RETURN(rc); + + if (iot != PIT_OPEN && gen != clt.cl_layout_gen) { + CDEBUG(D_CACHE, DFID" layout changed from %d to %d.\n", + PFID(ll_inode2fid(inode)), gen, clt.cl_layout_gen); + RETURN(-EINVAL); + } + + if (clt.cl_is_released) + rc = pcc_try_datasets_attach(inode, iot, clt.cl_layout_gen, + LU_PCC_READWRITE, cached); + + RETURN(rc); +} + +static inline bool pcc_may_auto_attach(struct inode *inode, + enum pcc_io_type iot) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct pcc_super *super = ll_i2pccs(inode); + + /* Known the file was not in any PCC backend. */ + if (lli->lli_pcc_dsflags & PCC_DATASET_NONE) + return false; + + /* + * lli_pcc_generation == 0 means that the file was never attached into + * PCC, or may be once attached into PCC but detached as the inode is + * evicted from icache (i.e. 
"echo 3 > /proc/sys/vm/drop_caches" or + * icache shrinking due to the memory pressure), which will cause the + * file detach from PCC when releasing the inode from icache. + * In either case, we still try to attach. + */ + /* lli_pcc_generation == 0, or the PCC setting was changed, + * or there is no PCC setup on the client and the try will return + * immediately in pcc_try_auto_attach(). + */ + if (super->pccs_generation != lli->lli_pcc_generation) + return true; + + /* The cached setting @lli_pcc_dsflags is valid */ + if (iot == PIT_OPEN) + return lli->lli_pcc_dsflags & PCC_DATASET_OPEN_ATTACH; + + if (iot == PIT_GETATTR) + return lli->lli_pcc_dsflags & PCC_DATASET_STAT_ATTACH; + + return lli->lli_pcc_dsflags & PCC_DATASET_IO_ATTACH; +} + +int pcc_file_open(struct inode *inode, struct file *file) +{ + struct pcc_inode *pcci; + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = file->private_data; + struct pcc_file *pccf = &fd->fd_pcc_file; + struct file *pcc_file; + struct path *path; + bool cached = false; + int rc = 0; + + ENTRY; + + if (!S_ISREG(inode->i_mode)) + RETURN(0); + + if (IS_ENCRYPTED(inode)) + RETURN(0); + + pcc_inode_lock(inode); + pcci = ll_i2pcci(inode); + + if (lli->lli_pcc_state & PCC_STATE_FL_ATTACHING) + GOTO(out_unlock, rc = 0); + + if (!pcci || !pcc_inode_has_layout(pcci)) { + if (pcc_may_auto_attach(inode, PIT_OPEN)) + rc = pcc_try_auto_attach(inode, &cached, PIT_OPEN); + + if (rc < 0 || !cached) + GOTO(out_unlock, rc); + + if (!pcci) + pcci = ll_i2pcci(inode); + } + + pcc_inode_get(pcci); + WARN_ON(pccf->pccf_file); + + path = &pcci->pcci_path; + CDEBUG(D_CACHE, "opening pcc file '%pd'\n", path->dentry); + + pcc_file = dentry_open(path, file->f_flags, + pcc_super_cred(inode->i_sb)); + if (IS_ERR_OR_NULL(pcc_file)) { + rc = pcc_file == NULL ? 
-EINVAL : PTR_ERR(pcc_file); + pcc_inode_put(pcci); + } else { + pccf->pccf_file = pcc_file; + pccf->pccf_type = pcci->pcci_type; + } + +out_unlock: + pcc_inode_unlock(inode); + RETURN(rc); +} + +void pcc_file_release(struct inode *inode, struct file *file) +{ + struct pcc_inode *pcci; + struct ll_file_data *fd = file->private_data; + struct pcc_file *pccf; + struct path *path; + + ENTRY; + + if (!S_ISREG(inode->i_mode) || fd == NULL) + RETURN_EXIT; + + pccf = &fd->fd_pcc_file; + pcc_inode_lock(inode); + if (pccf->pccf_file == NULL) + goto out; + + pcci = ll_i2pcci(inode); + LASSERT(pcci); + path = &pcci->pcci_path; + CDEBUG(D_CACHE, "releasing pcc file \"%pd\"\n", path->dentry); + pcc_inode_put(pcci); + fput(pccf->pccf_file); + pccf->pccf_file = NULL; +out: + pcc_inode_unlock(inode); + RETURN_EXIT; +} + +static void pcc_io_init(struct inode *inode, enum pcc_io_type iot, bool *cached) +{ + struct pcc_inode *pcci; + + pcc_inode_lock(inode); + pcci = ll_i2pcci(inode); + if (pcci && pcc_inode_has_layout(pcci)) { + LASSERT(atomic_read(&pcci->pcci_refcount) > 0); + atomic_inc(&pcci->pcci_active_ios); + *cached = true; + } else { + *cached = false; + if (pcc_may_auto_attach(inode, iot)) { + (void) pcc_try_auto_attach(inode, cached, iot); + if (*cached) { + pcci = ll_i2pcci(inode); + LASSERT(atomic_read(&pcci->pcci_refcount) > 0); + atomic_inc(&pcci->pcci_active_ios); + } + } + } + pcc_inode_unlock(inode); +} + +static void pcc_io_fini(struct inode *inode) +{ + struct pcc_inode *pcci = ll_i2pcci(inode); + + LASSERT(pcci && atomic_read(&pcci->pcci_active_ios) > 0); + if (atomic_dec_and_test(&pcci->pcci_active_ios)) + wake_up(&pcci->pcci_waitq); +} + + +static ssize_t +__pcc_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + +#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER + return file->f_op->read_iter(iocb, iter); +#else + struct iovec iov; + struct iov_iter i; + ssize_t bytes = 0; + + iov_for_each(iov, i, *iter) { + ssize_t res; + + res = file->f_op->aio_read(iocb, &iov, 1, iocb->ki_pos); + if (-EIOCBQUEUED == res) + res = wait_on_sync_kiocb(iocb); + if (res <= 0) { + if (bytes == 0) + bytes = res; + break; + } + + bytes += res; + if (res < iov.iov_len) + break; + } + + if (bytes > 0) + iov_iter_advance(iter, bytes); + return bytes; +#endif +} + +ssize_t pcc_file_read_iter(struct kiocb *iocb, + struct iov_iter *iter, bool *cached) +{ + struct file *file = iocb->ki_filp; + struct ll_file_data *fd = file->private_data; + struct pcc_file *pccf = &fd->fd_pcc_file; + struct inode *inode = file_inode(file); + ssize_t result; + + ENTRY; + + if (pccf->pccf_file == NULL) { + *cached = false; + RETURN(0); + } + + pcc_io_init(inode, PIT_READ, cached); + if (!*cached) + RETURN(0); + + iocb->ki_filp = pccf->pccf_file; + /* generic_file_aio_read does not support ext4-dax, + * __pcc_file_read_iter uses ->aio_read hook directly + * to add support for ext4-dax. 
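+	 * With HAVE_FILE_OPERATIONS_READ_WRITE_ITER the request is simply
+	 * forwarded to ->read_iter; otherwise each iovec segment is
+	 * submitted through ->aio_read and the partial results are
+	 * accumulated before the iterator is advanced.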
+ */ + result = __pcc_file_read_iter(iocb, iter); + iocb->ki_filp = file; + + pcc_io_fini(inode); + RETURN(result); +} + +static ssize_t +__pcc_file_write_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + +#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER + return file->f_op->write_iter(iocb, iter); +#else + struct iovec iov; + struct iov_iter i; + ssize_t bytes = 0; + + iov_for_each(iov, i, *iter) { + ssize_t res; + + res = file->f_op->aio_write(iocb, &iov, 1, iocb->ki_pos); + if (-EIOCBQUEUED == res) + res = wait_on_sync_kiocb(iocb); + if (res <= 0) { + if (bytes == 0) + bytes = res; + break; + } + + bytes += res; + if (res < iov.iov_len) + break; + } + + if (bytes > 0) + iov_iter_advance(iter, bytes); + return bytes; +#endif +} + +ssize_t pcc_file_write_iter(struct kiocb *iocb, + struct iov_iter *iter, bool *cached) +{ + struct file *file = iocb->ki_filp; + struct ll_file_data *fd = file->private_data; + struct pcc_file *pccf = &fd->fd_pcc_file; + struct inode *inode = file_inode(file); + ssize_t result; + + ENTRY; + + if (pccf->pccf_file == NULL) { + *cached = false; + RETURN(0); + } + + if (pccf->pccf_type != LU_PCC_READWRITE) { + *cached = false; + RETURN(-EAGAIN); + } + + pcc_io_init(inode, PIT_WRITE, cached); + if (!*cached) + RETURN(0); + + if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_PCC_FAKE_ERROR)) + GOTO(out, result = -ENOSPC); + + iocb->ki_filp = pccf->pccf_file; + + /* Since __pcc_file_write_iter makes write calls via + * the normal vfs interface to the local PCC file system, + * the inode lock is not needed. + */ + result = __pcc_file_write_iter(iocb, iter); + iocb->ki_filp = file; +out: + pcc_io_fini(inode); + RETURN(result); +} + +int pcc_inode_setattr(struct inode *inode, struct iattr *attr, + bool *cached) +{ + int rc; + const struct cred *old_cred; + struct iattr attr2 = *attr; + struct dentry *pcc_dentry; + struct pcc_inode *pcci; + + ENTRY; + + if (!S_ISREG(inode->i_mode)) { + *cached = false; + RETURN(0); + } + + pcc_io_init(inode, PIT_SETATTR, cached); + if (!*cached) + RETURN(0); + + attr2.ia_valid = attr->ia_valid & (ATTR_SIZE | ATTR_ATIME | + ATTR_ATIME_SET | ATTR_MTIME | ATTR_MTIME_SET | + ATTR_CTIME | ATTR_UID | ATTR_GID); + pcci = ll_i2pcci(inode); + pcc_dentry = pcci->pcci_path.dentry; + inode_lock(pcc_dentry->d_inode); + old_cred = override_creds(pcc_super_cred(inode->i_sb)); +#ifdef HAVE_USER_NAMESPACE_ARG + rc = pcc_dentry->d_inode->i_op->setattr(&init_user_ns, pcc_dentry, + &attr2); +#else + rc = pcc_dentry->d_inode->i_op->setattr(pcc_dentry, &attr2); +#endif + revert_creds(old_cred); + inode_unlock(pcc_dentry->d_inode); + + pcc_io_fini(inode); + RETURN(rc); +} + +int pcc_inode_getattr(struct inode *inode, u32 request_mask, + unsigned int flags, bool *cached) +{ + struct ll_inode_info *lli = ll_i2info(inode); + const struct cred *old_cred; + struct kstat stat; + s64 atime; + s64 mtime; + s64 ctime; + int rc; + + ENTRY; + + if (!S_ISREG(inode->i_mode)) { + *cached = false; + RETURN(0); + } + + pcc_io_init(inode, PIT_GETATTR, cached); + if (!*cached) + RETURN(0); + + old_cred = override_creds(pcc_super_cred(inode->i_sb)); + rc = ll_vfs_getattr(&ll_i2pcci(inode)->pcci_path, &stat, request_mask, + flags); + revert_creds(old_cred); + if (rc) + GOTO(out, rc); + + ll_inode_size_lock(inode); + if (test_and_clear_bit(LLIF_UPDATE_ATIME, &lli->lli_flags) || + inode->i_atime.tv_sec < lli->lli_atime) + inode->i_atime.tv_sec = lli->lli_atime; + + inode->i_mtime.tv_sec = lli->lli_mtime; + inode->i_ctime.tv_sec = lli->lli_ctime; + + atime = 
inode->i_atime.tv_sec; + mtime = inode->i_mtime.tv_sec; + ctime = inode->i_ctime.tv_sec; + + if (atime < stat.atime.tv_sec) + atime = stat.atime.tv_sec; + + if (ctime < stat.ctime.tv_sec) + ctime = stat.ctime.tv_sec; + + if (mtime < stat.mtime.tv_sec) + mtime = stat.mtime.tv_sec; + + i_size_write(inode, stat.size); + inode->i_blocks = stat.blocks; + + inode->i_atime.tv_sec = atime; + inode->i_mtime.tv_sec = mtime; + inode->i_ctime.tv_sec = ctime; + + ll_inode_size_unlock(inode); +out: + pcc_io_fini(inode); + RETURN(rc); +} + +#ifdef HAVE_DEFAULT_FILE_SPLICE_READ_EXPORT +ssize_t pcc_file_splice_read(struct file *in_file, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t count, unsigned int flags) +{ + struct inode *inode = file_inode(in_file); + struct ll_file_data *fd = in_file->private_data; + struct file *pcc_file = fd->fd_pcc_file.pccf_file; + bool cached = false; + ssize_t result; + + ENTRY; + + if (!pcc_file) + RETURN(default_file_splice_read(in_file, ppos, pipe, + count, flags)); + + pcc_io_init(inode, PIT_SPLICE_READ, &cached); + if (!cached) + RETURN(default_file_splice_read(in_file, ppos, pipe, + count, flags)); + + result = default_file_splice_read(pcc_file, ppos, pipe, count, flags); + + pcc_io_fini(inode); + RETURN(result); +} +#endif /* HAVE_DEFAULT_FILE_SPLICE_READ_EXPORT */ + +int pcc_fsync(struct file *file, loff_t start, loff_t end, + int datasync, bool *cached) +{ + struct inode *inode = file_inode(file); + struct ll_file_data *fd = file->private_data; + struct file *pcc_file = fd->fd_pcc_file.pccf_file; + int rc; + + ENTRY; + + if (!pcc_file) { + *cached = false; + RETURN(0); + } + + pcc_io_init(inode, PIT_FSYNC, cached); + if (!*cached) + RETURN(0); + + rc = file_inode(pcc_file)->i_fop->fsync(pcc_file, + start, end, datasync); + + pcc_io_fini(inode); + RETURN(rc); +} + +int pcc_file_mmap(struct file *file, struct vm_area_struct *vma, + bool *cached) +{ + struct inode *inode = file_inode(file); + struct ll_file_data *fd = file->private_data; + struct file *pcc_file = fd->fd_pcc_file.pccf_file; + struct pcc_inode *pcci; + int rc = 0; + + ENTRY; + + if (!pcc_file || !file_inode(pcc_file)->i_fop->mmap) { + *cached = false; + RETURN(0); + } + + pcc_inode_lock(inode); + pcci = ll_i2pcci(inode); + if (pcci && pcc_inode_has_layout(pcci)) { + LASSERT(atomic_read(&pcci->pcci_refcount) > 1); + *cached = true; + vma->vm_file = pcc_file; + rc = file_inode(pcc_file)->i_fop->mmap(pcc_file, vma); + vma->vm_file = file; + /* Save the vm ops of backend PCC */ + vma->vm_private_data = (void *)vma->vm_ops; + } else { + *cached = false; + } + pcc_inode_unlock(inode); + + RETURN(rc); +} + +void pcc_vm_open(struct vm_area_struct *vma) +{ + struct pcc_inode *pcci; + struct file *file = vma->vm_file; + struct inode *inode = file_inode(file); + struct ll_file_data *fd = file->private_data; + struct file *pcc_file = fd->fd_pcc_file.pccf_file; + struct vm_operations_struct *pcc_vm_ops = vma->vm_private_data; + + ENTRY; + + if (!pcc_file || !pcc_vm_ops || !pcc_vm_ops->open) + RETURN_EXIT; + + pcc_inode_lock(inode); + pcci = ll_i2pcci(inode); + if (pcci && pcc_inode_has_layout(pcci)) { + vma->vm_file = pcc_file; + pcc_vm_ops->open(vma); + vma->vm_file = file; + } + pcc_inode_unlock(inode); + EXIT; +} + +void pcc_vm_close(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct inode *inode = file_inode(file); + struct ll_file_data *fd = file->private_data; + struct file *pcc_file = fd->fd_pcc_file.pccf_file; + struct vm_operations_struct *pcc_vm_ops = vma->vm_private_data; + 
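+	/*
+	 * vm_private_data holds the backend filesystem's vm_operations,
+	 * saved by pcc_file_mmap().  The pcc_vm_open/close, pcc_fault and
+	 * pcc_page_mkwrite wrappers temporarily swap vma->vm_file to the
+	 * PCC file before delegating to those saved operations.
+	 */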
+ ENTRY; + + if (!pcc_file || !pcc_vm_ops || !pcc_vm_ops->close) + RETURN_EXIT; + + pcc_inode_lock(inode); + /* Layout lock maybe revoked here */ + vma->vm_file = pcc_file; + pcc_vm_ops->close(vma); + vma->vm_file = file; + pcc_inode_unlock(inode); + EXIT; +} + +int pcc_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, + bool *cached) +{ + struct page *page = vmf->page; + struct mm_struct *mm = vma->vm_mm; + struct file *file = vma->vm_file; + struct inode *inode = file_inode(file); + struct ll_file_data *fd = file->private_data; + struct file *pcc_file = fd->fd_pcc_file.pccf_file; + struct vm_operations_struct *pcc_vm_ops = vma->vm_private_data; + int rc; + + ENTRY; + + if (!pcc_file || !pcc_vm_ops) { + *cached = false; + RETURN(0); + } + + if (!pcc_vm_ops->page_mkwrite && + page->mapping == pcc_file->f_mapping) { + CDEBUG(D_MMAP, + "%s: PCC backend fs not support ->page_mkwrite()\n", + ll_i2sbi(inode)->ll_fsname); + pcc_ioctl_detach(inode, PCC_DETACH_OPT_UNCACHE); + mmap_read_unlock(mm); + *cached = true; + RETURN(VM_FAULT_RETRY | VM_FAULT_NOPAGE); + } + /* Pause to allow for a race with concurrent detach */ + OBD_FAIL_TIMEOUT(OBD_FAIL_LLITE_PCC_MKWRITE_PAUSE, cfs_fail_val); + + pcc_io_init(inode, PIT_PAGE_MKWRITE, cached); + if (!*cached) { + /* This happens when the file is detached from PCC after got + * the fault page via ->fault() on the inode of the PCC copy. + * Here it can not simply fall back to normal Lustre I/O path. + * The reason is that the address space of fault page used by + * ->page_mkwrite() is still the one of PCC inode. In the + * normal Lustre ->page_mkwrite() I/O path, it will be wrongly + * handled as the address space of the fault page is not + * consistent with the one of the Lustre inode (though the + * fault page was truncated). + * As the file is detached from PCC, the fault page must + * be released frist, and retry the mmap write (->fault() and + * ->page_mkwrite). + * We use an ugly and tricky method by returning + * VM_FAULT_NOPAGE | VM_FAULT_RETRY to the caller + * __do_page_fault and retry the memory fault handling. + */ + if (page->mapping == pcc_file->f_mapping) { + *cached = true; + mmap_read_unlock(mm); + RETURN(VM_FAULT_RETRY | VM_FAULT_NOPAGE); + } + + RETURN(0); + } + + /* + * This fault injection can also be used to simulate -ENOSPC and + * -EDQUOT failure of underlying PCC backend fs. 
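+	 * When the injection fires, the file is detached from PCC and
+	 * VM_FAULT_RETRY | VM_FAULT_NOPAGE is returned so the fault is
+	 * replayed against the normal Lustre path.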
+ */ + if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_PCC_DETACH_MKWRITE)) { + pcc_io_fini(inode); + pcc_ioctl_detach(inode, PCC_DETACH_OPT_UNCACHE); + mmap_read_unlock(mm); + RETURN(VM_FAULT_RETRY | VM_FAULT_NOPAGE); + } + + vma->vm_file = pcc_file; +#ifdef HAVE_VM_OPS_USE_VM_FAULT_ONLY + rc = pcc_vm_ops->page_mkwrite(vmf); +#else + rc = pcc_vm_ops->page_mkwrite(vma, vmf); +#endif + vma->vm_file = file; + + pcc_io_fini(inode); + RETURN(rc); +} + +int pcc_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + bool *cached) +{ + struct file *file = vma->vm_file; + struct inode *inode = file_inode(file); + struct ll_file_data *fd = file->private_data; + struct file *pcc_file = fd->fd_pcc_file.pccf_file; + struct vm_operations_struct *pcc_vm_ops = vma->vm_private_data; + int rc; + + ENTRY; + + if (!pcc_file || !pcc_vm_ops || !pcc_vm_ops->fault) { + *cached = false; + RETURN(0); + } + + pcc_io_init(inode, PIT_FAULT, cached); + if (!*cached) + RETURN(0); + + vma->vm_file = pcc_file; +#ifdef HAVE_VM_OPS_USE_VM_FAULT_ONLY + rc = pcc_vm_ops->fault(vmf); +#else + rc = pcc_vm_ops->fault(vma, vmf); +#endif + vma->vm_file = file; + + pcc_io_fini(inode); + RETURN(rc); +} + +static void __pcc_layout_invalidate(struct pcc_inode *pcci) +{ + pcci->pcci_type = LU_PCC_NONE; + pcc_layout_gen_set(pcci, CL_LAYOUT_GEN_NONE); + if (atomic_read(&pcci->pcci_active_ios) == 0) + return; + + CDEBUG(D_CACHE, "Waiting for IO completion: %d\n", + atomic_read(&pcci->pcci_active_ios)); + wait_event_idle(pcci->pcci_waitq, + atomic_read(&pcci->pcci_active_ios) == 0); +} + +void pcc_layout_invalidate(struct inode *inode) +{ + struct pcc_inode *pcci; + + ENTRY; + + pcc_inode_lock(inode); + pcci = ll_i2pcci(inode); + if (pcci && pcc_inode_has_layout(pcci)) { + LASSERT(atomic_read(&pcci->pcci_refcount) > 0); + __pcc_layout_invalidate(pcci); + + CDEBUG(D_CACHE, "Invalidate "DFID" layout gen %d\n", + PFID(&ll_i2info(inode)->lli_fid), pcci->pcci_layout_gen); + + pcc_inode_put(pcci); + } + pcc_inode_unlock(inode); + + EXIT; +} + +static int pcc_inode_remove(struct inode *inode, struct dentry *pcc_dentry) +{ + int rc; + + rc = vfs_unlink(&init_user_ns, + pcc_dentry->d_parent->d_inode, pcc_dentry); + if (rc) + CWARN("%s: failed to unlink PCC file %pd, rc = %d\n", + ll_i2sbi(inode)->ll_fsname, pcc_dentry, rc); + + return rc; +} + +/* Create directory under base if directory does not exist */ +static struct dentry * +pcc_mkdir(struct dentry *base, const char *name, umode_t mode) +{ + int rc; + struct dentry *dentry; + struct inode *dir = base->d_inode; + + inode_lock(dir); + dentry = lookup_one_len(name, base, strlen(name)); + if (IS_ERR(dentry)) + goto out; + + if (d_is_positive(dentry)) + goto out; + + rc = vfs_mkdir(&init_user_ns, dir, dentry, mode); + if (rc) { + dput(dentry); + dentry = ERR_PTR(rc); + goto out; + } +out: + inode_unlock(dir); + return dentry; +} + +static struct dentry * +pcc_mkdir_p(struct dentry *root, char *path, umode_t mode) +{ + char *ptr, *entry_name; + struct dentry *parent; + struct dentry *child = ERR_PTR(-EINVAL); + + ptr = path; + while (*ptr == '/') + ptr++; + + entry_name = ptr; + parent = dget(root); + while ((ptr = strchr(ptr, '/')) != NULL) { + *ptr = '\0'; + child = pcc_mkdir(parent, entry_name, mode); + *ptr = '/'; + dput(parent); + if (IS_ERR(child)) + break; + + parent = child; + ptr++; + entry_name = ptr; + } + + return child; +} + +/* Create file under base. 
If file already exist, return failure */ +static struct dentry * +pcc_create(struct dentry *base, const char *name, umode_t mode) +{ + int rc; + struct dentry *dentry; + struct inode *dir = base->d_inode; + + inode_lock(dir); + dentry = lookup_one_len(name, base, strlen(name)); + if (IS_ERR(dentry)) + goto out; + + if (d_is_positive(dentry)) + goto out; + + rc = vfs_create(&init_user_ns, dir, dentry, mode, false); + if (rc) { + dput(dentry); + dentry = ERR_PTR(rc); + goto out; + } +out: + inode_unlock(dir); + return dentry; +} + +static int __pcc_inode_create(struct pcc_dataset *dataset, + struct lu_fid *fid, + struct dentry **dentry) +{ + char *path; + struct dentry *base; + struct dentry *child; + int rc = 0; + + OBD_ALLOC(path, PCC_DATASET_MAX_PATH); + if (path == NULL) + return -ENOMEM; + + pcc_fid2dataset_path(path, PCC_DATASET_MAX_PATH, fid); + + base = pcc_mkdir_p(dataset->pccd_path.dentry, path, 0); + if (IS_ERR(base)) { + rc = PTR_ERR(base); + GOTO(out, rc); + } + + snprintf(path, PCC_DATASET_MAX_PATH, DFID_NOBRACE, PFID(fid)); + child = pcc_create(base, path, 0); + if (IS_ERR(child)) { + rc = PTR_ERR(child); + GOTO(out_base, rc); + } + *dentry = child; + +out_base: + dput(base); +out: + OBD_FREE(path, PCC_DATASET_MAX_PATH); + return rc; +} + +/* + * Reset uid, gid or size for the PCC copy masked by @valid. + * TODO: Set the project ID for PCC copy. + */ +int pcc_inode_reset_iattr(struct dentry *dentry, unsigned int valid, + kuid_t uid, kgid_t gid, loff_t size) +{ + struct inode *inode = dentry->d_inode; + struct iattr attr; + int rc; + + ENTRY; + + attr.ia_valid = valid; + attr.ia_uid = uid; + attr.ia_gid = gid; + attr.ia_size = size; + + inode_lock(inode); + rc = notify_change(&init_user_ns, dentry, &attr, NULL); + inode_unlock(inode); + + RETURN(rc); +} + +int pcc_inode_create(struct super_block *sb, struct pcc_dataset *dataset, + struct lu_fid *fid, struct dentry **pcc_dentry) +{ + const struct cred *old_cred; + int rc; + + old_cred = override_creds(pcc_super_cred(sb)); + rc = __pcc_inode_create(dataset, fid, pcc_dentry); + revert_creds(old_cred); + return rc; +} + +int pcc_inode_create_fini(struct inode *inode, struct pcc_create_attach *pca) +{ + struct dentry *pcc_dentry = pca->pca_dentry; + struct pcc_super *super = ll_i2pccs(inode); + const struct cred *old_cred; + struct pcc_inode *pcci; + int rc; + + ENTRY; + + if (!pca->pca_dataset) + RETURN(0); + + if (!inode) + GOTO(out_dataset_put, rc = 0); + + LASSERT(pcc_dentry); + + old_cred = override_creds(super->pccs_cred); + pcc_inode_lock(inode); + LASSERT(ll_i2pcci(inode) == NULL); + OBD_SLAB_ALLOC_PTR_GFP(pcci, pcc_inode_slab, GFP_NOFS); + if (pcci == NULL) + GOTO(out_put, rc = -ENOMEM); + + rc = pcc_inode_reset_iattr(pcc_dentry, ATTR_UID | ATTR_GID, + old_cred->suid, old_cred->sgid, 0); + if (rc) + GOTO(out_put, rc); + + pcc_inode_attach_set(super, pca->pca_dataset, ll_i2info(inode), + pcci, pcc_dentry, LU_PCC_READWRITE); + + rc = pcc_layout_xattr_set(pcci, 0); + if (rc) { + (void) pcc_inode_remove(inode, pcci->pcci_path.dentry); + pcc_inode_put(pcci); + GOTO(out_unlock, rc); + } + + /* Set the layout generation of newly created file with 0 */ + pcc_layout_gen_set(pcci, 0); + +out_put: + if (rc) { + (void) pcc_inode_remove(inode, pcc_dentry); + dput(pcc_dentry); + + if (pcci) + OBD_SLAB_FREE_PTR(pcci, pcc_inode_slab); + } +out_unlock: + pcc_inode_unlock(inode); + revert_creds(old_cred); +out_dataset_put: + pcc_dataset_put(pca->pca_dataset); + RETURN(rc); +} + +void pcc_create_attach_cleanup(struct super_block *sb, + struct 
pcc_create_attach *pca) +{ + if (!pca->pca_dataset) + return; + + if (pca->pca_dentry) { + const struct cred *old_cred; + int rc; + + old_cred = override_creds(pcc_super_cred(sb)); + rc = vfs_unlink(&init_user_ns, + pca->pca_dentry->d_parent->d_inode, + pca->pca_dentry); + if (rc) + CWARN("%s: failed to unlink PCC file %pd: rc = %d\n", + ll_s2sbi(sb)->ll_fsname, pca->pca_dentry, rc); + /* ignore the unlink failure */ + revert_creds(old_cred); + dput(pca->pca_dentry); + } + + pcc_dataset_put(pca->pca_dataset); +} + +static int pcc_filp_write(struct file *filp, const void *buf, ssize_t count, + loff_t *offset) +{ + while (count > 0) { + ssize_t size; + + size = cfs_kernel_write(filp, buf, count, offset); + if (size < 0) + return size; + count -= size; + buf += size; + } + return 0; +} + +static ssize_t pcc_copy_data(struct file *src, struct file *dst) +{ + ssize_t rc = 0; + ssize_t rc2; + loff_t pos, offset = 0; + size_t buf_len = 1048576; + void *buf; + + ENTRY; + + OBD_ALLOC_LARGE(buf, buf_len); + if (buf == NULL) + RETURN(-ENOMEM); + + while (1) { + if (signal_pending(current)) + GOTO(out_free, rc = -EINTR); + + pos = offset; + rc2 = cfs_kernel_read(src, buf, buf_len, &pos); + if (rc2 < 0) + GOTO(out_free, rc = rc2); + else if (rc2 == 0) + break; + + pos = offset; + rc = pcc_filp_write(dst, buf, rc2, &pos); + if (rc < 0) + GOTO(out_free, rc); + offset += rc2; + } + + rc = offset; +out_free: + OBD_FREE_LARGE(buf, buf_len); + RETURN(rc); +} + +static int pcc_attach_allowed_check(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct pcc_inode *pcci; + int rc = 0; + + ENTRY; + + pcc_inode_lock(inode); + if (lli->lli_pcc_state & PCC_STATE_FL_ATTACHING) + GOTO(out_unlock, rc = -EBUSY); + + pcci = ll_i2pcci(inode); + if (pcci && pcc_inode_has_layout(pcci)) + GOTO(out_unlock, rc = -EEXIST); + + lli->lli_pcc_state |= PCC_STATE_FL_ATTACHING; +out_unlock: + pcc_inode_unlock(inode); + RETURN(rc); +} + +int pcc_readwrite_attach(struct file *file, struct inode *inode, + __u32 archive_id) +{ + struct pcc_dataset *dataset; + struct ll_inode_info *lli = ll_i2info(inode); + struct pcc_super *super = ll_i2pccs(inode); + struct pcc_inode *pcci; + const struct cred *old_cred; + struct dentry *dentry; + struct file *pcc_filp; + struct path path; + ssize_t ret; + int rc; + + ENTRY; + + rc = pcc_attach_allowed_check(inode); + if (rc) + RETURN(rc); + + dataset = pcc_dataset_get(&ll_i2sbi(inode)->ll_pcc_super, + LU_PCC_READWRITE, archive_id); + if (dataset == NULL) + RETURN(-ENOENT); + + old_cred = override_creds(super->pccs_cred); + rc = __pcc_inode_create(dataset, &lli->lli_fid, &dentry); + if (rc) + GOTO(out_dataset_put, rc); + + path.mnt = dataset->pccd_path.mnt; + path.dentry = dentry; + pcc_filp = dentry_open(&path, O_WRONLY | O_LARGEFILE, current_cred()); + if (IS_ERR_OR_NULL(pcc_filp)) { + rc = pcc_filp == NULL ? -EINVAL : PTR_ERR(pcc_filp); + GOTO(out_dentry, rc); + } + + rc = pcc_inode_reset_iattr(dentry, ATTR_UID | ATTR_GID, + old_cred->uid, old_cred->gid, 0); + if (rc) + GOTO(out_fput, rc); + + ret = pcc_copy_data(file, pcc_filp); + if (ret < 0) + GOTO(out_fput, rc = ret); + + /* + * It must to truncate the PCC copy to the same size of the Lustre + * copy after copy data. Otherwise, it may get wrong file size after + * re-attach a file. See LU-13023 for details. 
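+	 * (pcc_copy_data() returns the number of bytes copied, and that
+	 * value is used as the truncate size below.)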
+ */ + rc = pcc_inode_reset_iattr(dentry, ATTR_SIZE, KUIDT_INIT(0), + KGIDT_INIT(0), ret); + if (rc) + GOTO(out_fput, rc); + + /* Pause to allow for a race with concurrent HSM remove */ + OBD_FAIL_TIMEOUT(OBD_FAIL_LLITE_PCC_ATTACH_PAUSE, cfs_fail_val); + + pcc_inode_lock(inode); + pcci = ll_i2pcci(inode); + LASSERT(!pcci); + OBD_SLAB_ALLOC_PTR_GFP(pcci, pcc_inode_slab, GFP_NOFS); + if (pcci == NULL) + GOTO(out_unlock, rc = -ENOMEM); + + pcc_inode_attach_set(super, dataset, lli, pcci, + dentry, LU_PCC_READWRITE); +out_unlock: + pcc_inode_unlock(inode); +out_fput: + fput(pcc_filp); +out_dentry: + if (rc) { + (void) pcc_inode_remove(inode, dentry); + dput(dentry); + } +out_dataset_put: + pcc_dataset_put(dataset); + revert_creds(old_cred); + + RETURN(rc); +} + +int pcc_readwrite_attach_fini(struct file *file, struct inode *inode, + __u32 gen, bool lease_broken, int rc, + bool attached) +{ + struct ll_inode_info *lli = ll_i2info(inode); + const struct cred *old_cred; + struct pcc_inode *pcci; + __u32 gen2; + + ENTRY; + + old_cred = override_creds(pcc_super_cred(inode->i_sb)); + pcc_inode_lock(inode); + pcci = ll_i2pcci(inode); + if (rc || lease_broken) { + if (attached && pcci) + pcc_inode_put(pcci); + + GOTO(out_unlock, rc); + } + + /* PCC inode may be released due to layout lock revocatioin */ + if (!pcci) + GOTO(out_unlock, rc = -ESTALE); + + LASSERT(attached); + rc = pcc_layout_xattr_set(pcci, gen); + if (rc) + GOTO(out_put, rc); + + LASSERT(lli->lli_pcc_state & PCC_STATE_FL_ATTACHING); + rc = ll_layout_refresh(inode, &gen2); + if (!rc) { + if (gen2 == gen) { + pcc_layout_gen_set(pcci, gen); + } else { + CDEBUG(D_CACHE, + DFID" layout changed from %d to %d.\n", + PFID(ll_inode2fid(inode)), gen, gen2); + GOTO(out_put, rc = -ESTALE); + } + } + +out_put: + if (rc) { + (void) pcc_inode_remove(inode, pcci->pcci_path.dentry); + pcc_inode_put(pcci); + } +out_unlock: + lli->lli_pcc_state &= ~PCC_STATE_FL_ATTACHING; + pcc_inode_unlock(inode); + revert_creds(old_cred); + RETURN(rc); +} + +static int pcc_hsm_remove(struct inode *inode) +{ + struct hsm_user_request *hur; + __u32 gen; + int len; + int rc; + + ENTRY; + + rc = ll_layout_restore(inode, 0, OBD_OBJECT_EOF); + if (rc) { + CDEBUG(D_CACHE, DFID" RESTORE failure: %d\n", + PFID(&ll_i2info(inode)->lli_fid), rc); + RETURN(rc); + } + + ll_layout_refresh(inode, &gen); + + len = sizeof(struct hsm_user_request) + + sizeof(struct hsm_user_item); + OBD_ALLOC(hur, len); + if (hur == NULL) + RETURN(-ENOMEM); + + hur->hur_request.hr_action = HUA_REMOVE; + hur->hur_request.hr_archive_id = 0; + hur->hur_request.hr_flags = 0; + memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid, + sizeof(hur->hur_user_item[0].hui_fid)); + hur->hur_user_item[0].hui_extent.offset = 0; + hur->hur_user_item[0].hui_extent.length = OBD_OBJECT_EOF; + hur->hur_request.hr_itemcount = 1; + rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp, + len, hur, NULL); + if (rc) + CDEBUG(D_CACHE, DFID" HSM REMOVE failure: %d\n", + PFID(&ll_i2info(inode)->lli_fid), rc); + + OBD_FREE(hur, len); + RETURN(rc); +} + +int pcc_ioctl_detach(struct inode *inode, __u32 opt) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct pcc_inode *pcci; + bool hsm_remove = false; + int rc = 0; + + ENTRY; + + pcc_inode_lock(inode); + pcci = lli->lli_pcc_inode; + if (!pcci || lli->lli_pcc_state & PCC_STATE_FL_ATTACHING || + !pcc_inode_has_layout(pcci)) + GOTO(out_unlock, rc = 0); + + LASSERT(atomic_read(&pcci->pcci_refcount) > 0); + + if (pcci->pcci_type == LU_PCC_READWRITE) { + if 
(opt == PCC_DETACH_OPT_UNCACHE) { + hsm_remove = true; + /* + * The file will be removed from PCC, set the flags + * with PCC_DATASET_NONE even the later removal of the + * PCC copy fails. + */ + lli->lli_pcc_dsflags = PCC_DATASET_NONE; + } + + __pcc_layout_invalidate(pcci); + pcc_inode_put(pcci); + } + +out_unlock: + pcc_inode_unlock(inode); + if (hsm_remove) { + const struct cred *old_cred; + + old_cred = override_creds(pcc_super_cred(inode->i_sb)); + rc = pcc_hsm_remove(inode); + revert_creds(old_cred); + } + + RETURN(rc); +} + +int pcc_ioctl_state(struct file *file, struct inode *inode, + struct lu_pcc_state *state) +{ + int rc = 0; + int count; + char *buf; + char *path; + int buf_len = sizeof(state->pccs_path); + struct ll_file_data *fd = file->private_data; + struct pcc_file *pccf = &fd->fd_pcc_file; + struct pcc_inode *pcci; + + ENTRY; + + if (buf_len <= 0) + RETURN(-EINVAL); + + OBD_ALLOC(buf, buf_len); + if (buf == NULL) + RETURN(-ENOMEM); + + pcc_inode_lock(inode); + pcci = ll_i2pcci(inode); + if (pcci == NULL) { + state->pccs_type = LU_PCC_NONE; + GOTO(out_unlock, rc = 0); + } + + count = atomic_read(&pcci->pcci_refcount); + if (count == 0) { + state->pccs_type = LU_PCC_NONE; + state->pccs_open_count = 0; + GOTO(out_unlock, rc = 0); + } + + if (pcc_inode_has_layout(pcci)) + count--; + if (pccf->pccf_file != NULL) + count--; + state->pccs_type = pcci->pcci_type; + state->pccs_open_count = count; + state->pccs_flags = ll_i2info(inode)->lli_pcc_state; + path = dentry_path_raw(pcci->pcci_path.dentry, buf, buf_len); + if (IS_ERR(path)) + GOTO(out_unlock, rc = PTR_ERR(path)); + + if (strlcpy(state->pccs_path, path, buf_len) >= buf_len) + GOTO(out_unlock, rc = -ENAMETOOLONG); + +out_unlock: + pcc_inode_unlock(inode); + OBD_FREE(buf, buf_len); + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/pcc.h b/drivers/staging/lustrefsx/lustre/llite/pcc.h new file mode 100644 index 0000000000000..067daefb939c6 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/pcc.h @@ -0,0 +1,268 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, DDN Storage Corporation. 
+ */ +/* + * + * Persistent Client Cache + * + * Author: Li Xi + */ + +#ifndef LLITE_PCC_H +#define LLITE_PCC_H + +#include +#include +#include +#include +#include + +extern struct kmem_cache *pcc_inode_slab; + +#define LPROCFS_WR_PCC_MAX_CMD 4096 + +/* User/Group/Project ID */ +struct pcc_match_id { + __u32 pmi_id; + struct list_head pmi_linkage; +}; + +/* wildcard file name */ +struct pcc_match_fname { + char *pmf_name; + struct list_head pmf_linkage; +}; + +enum pcc_field { + PCC_FIELD_UID, + PCC_FIELD_GID, + PCC_FIELD_PROJID, + PCC_FIELD_FNAME, + PCC_FIELD_MAX +}; + +struct pcc_expression { + enum pcc_field pe_field; + struct list_head pe_cond; + struct list_head pe_linkage; +}; + +struct pcc_conjunction { + /* link to disjunction */ + struct list_head pc_linkage; + /* list of logical conjunction */ + struct list_head pc_expressions; +}; + +/** + * Match rule for auto PCC-cached files. + */ +struct pcc_match_rule { + char *pmr_conds_str; + struct list_head pmr_conds; +}; + +struct pcc_matcher { + __u32 pm_uid; + __u32 pm_gid; + __u32 pm_projid; + struct qstr *pm_name; +}; + +enum pcc_dataset_flags { + PCC_DATASET_INVALID = 0x0, + /* Indicate that known the file is not in PCC. */ + PCC_DATASET_NONE = 0x01, + /* Try auto attach at open, enabled by default */ + PCC_DATASET_OPEN_ATTACH = 0x02, + /* Try auto attach during IO when layout refresh, enabled by default */ + PCC_DATASET_IO_ATTACH = 0x04, + /* Try auto attach at stat */ + PCC_DATASET_STAT_ATTACH = 0x08, + PCC_DATASET_AUTO_ATTACH = PCC_DATASET_OPEN_ATTACH | + PCC_DATASET_IO_ATTACH | + PCC_DATASET_STAT_ATTACH, + /* PCC backend is only used for RW-PCC */ + PCC_DATASET_RWPCC = 0x10, + /* PCC backend is only used for RO-PCC */ + PCC_DATASET_ROPCC = 0x20, + /* PCC backend provides caching services for both RW-PCC and RO-PCC */ + PCC_DATASET_PCC_ALL = PCC_DATASET_RWPCC | PCC_DATASET_ROPCC, +}; + +struct pcc_dataset { + __u32 pccd_rwid; /* Archive ID */ + __u32 pccd_roid; /* Readonly ID */ + struct pcc_match_rule pccd_rule; /* Match rule */ + enum pcc_dataset_flags pccd_flags; /* Flags of PCC backend */ + char pccd_pathname[PATH_MAX]; /* full path */ + struct path pccd_path; /* Root path */ + struct list_head pccd_linkage; /* Linked to pccs_datasets */ + atomic_t pccd_refcount; /* Reference count */ +}; + +struct pcc_super { + /* Protect pccs_datasets */ + struct rw_semaphore pccs_rw_sem; + /* List of datasets */ + struct list_head pccs_datasets; + /* creds of process who forced instantiation of super block */ + const struct cred *pccs_cred; + /* + * Gobal PCC Generation: it will be increased once the configuration + * for PCC is changed, i.e. add or delete a PCC backend, modify the + * parameters for PCC. + */ + __u64 pccs_generation; +}; + +struct pcc_inode { + struct ll_inode_info *pcci_lli; + /* Cache path on local file system */ + struct path pcci_path; + /* + * If reference count is 0, then the cache is not inited, if 1, then + * no one is using it. + */ + atomic_t pcci_refcount; + /* Whether readonly or readwrite PCC */ + enum lu_pcc_type pcci_type; + /* Whether the inode attr is cached locally */ + bool pcci_attr_valid; + /* Layout generation */ + __u32 pcci_layout_gen; + /* + * How many IOs are on going on this cached object. Layout can be + * changed only if there is no active IO. + */ + atomic_t pcci_active_ios; + /* Waitq - wait for PCC I/O completion. 
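+	 * Layout invalidation and detach wait on it until
+	 * pcci_active_ios drops to zero.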
*/ + wait_queue_head_t pcci_waitq; +}; + +struct pcc_file { + /* Opened cache file */ + struct file *pccf_file; + /* Whether readonly or readwrite PCC */ + enum lu_pcc_type pccf_type; +}; + +enum pcc_io_type { + /* read system call */ + PIT_READ = 1, + /* write system call */ + PIT_WRITE, + /* truncate, utime system calls */ + PIT_SETATTR, + /* stat system call */ + PIT_GETATTR, + /* mmap write handling */ + PIT_PAGE_MKWRITE, + /* page fault handling */ + PIT_FAULT, + /* fsync system call handling */ + PIT_FSYNC, +#ifdef HAVE_DEFAULT_FILE_SPLICE_READ_EXPORT + /* splice_read system call */ + PIT_SPLICE_READ, +#endif + /* open system call */ + PIT_OPEN +}; + +enum pcc_cmd_type { + PCC_ADD_DATASET = 0, + PCC_DEL_DATASET, + PCC_CLEAR_ALL, +}; + +struct pcc_cmd { + enum pcc_cmd_type pccc_cmd; + char *pccc_pathname; + union { + struct pcc_cmd_add { + __u32 pccc_rwid; + __u32 pccc_roid; + struct list_head pccc_conds; + char *pccc_conds_str; + enum pcc_dataset_flags pccc_flags; + } pccc_add; + struct pcc_cmd_del { + __u32 pccc_pad; + } pccc_del; + } u; +}; + +struct pcc_create_attach { + struct pcc_dataset *pca_dataset; + struct dentry *pca_dentry; +}; + +int pcc_super_init(struct pcc_super *super); +void pcc_super_fini(struct pcc_super *super); +int pcc_cmd_handle(char *buffer, unsigned long count, + struct pcc_super *super); +int pcc_super_dump(struct pcc_super *super, struct seq_file *m); +int pcc_readwrite_attach(struct file *file, struct inode *inode, + __u32 arch_id); +int pcc_readwrite_attach_fini(struct file *file, struct inode *inode, + __u32 gen, bool lease_broken, int rc, + bool attached); +int pcc_ioctl_detach(struct inode *inode, __u32 opt); +int pcc_ioctl_state(struct file *file, struct inode *inode, + struct lu_pcc_state *state); +void pcc_file_init(struct pcc_file *pccf); +int pcc_file_open(struct inode *inode, struct file *file); +void pcc_file_release(struct inode *inode, struct file *file); +ssize_t pcc_file_read_iter(struct kiocb *iocb, struct iov_iter *iter, + bool *cached); +ssize_t pcc_file_write_iter(struct kiocb *iocb, struct iov_iter *iter, + bool *cached); +int pcc_inode_getattr(struct inode *inode, u32 request_mask, + unsigned int flags, bool *cached); +int pcc_inode_setattr(struct inode *inode, struct iattr *attr, bool *cached); +#ifdef HAVE_DEFAULT_FILE_SPLICE_READ_EXPORT +ssize_t pcc_file_splice_read(struct file *in_file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t count, + unsigned int flags); +#endif +int pcc_fsync(struct file *file, loff_t start, loff_t end, + int datasync, bool *cached); +int pcc_file_mmap(struct file *file, struct vm_area_struct *vma, bool *cached); +void pcc_vm_open(struct vm_area_struct *vma); +void pcc_vm_close(struct vm_area_struct *vma); +int pcc_fault(struct vm_area_struct *mva, struct vm_fault *vmf, bool *cached); +int pcc_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, + bool *cached); +int pcc_inode_create(struct super_block *sb, struct pcc_dataset *dataset, + struct lu_fid *fid, struct dentry **pcc_dentry); +int pcc_inode_create_fini(struct inode *inode, struct pcc_create_attach *pca); +void pcc_create_attach_cleanup(struct super_block *sb, + struct pcc_create_attach *pca); +struct pcc_dataset *pcc_dataset_match_get(struct pcc_super *super, + struct pcc_matcher *matcher); +void pcc_dataset_put(struct pcc_dataset *dataset); +void pcc_inode_free(struct inode *inode); +void pcc_layout_invalidate(struct inode *inode); +#endif /* LLITE_PCC_H */ diff --git a/drivers/staging/lustrefsx/lustre/llite/rw.c 
b/drivers/staging/lustrefsx/lustre/llite/rw.c
new file mode 100644
index 0000000000000..7a48518c6c22c
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/llite/rw.c
@@ -0,0 +1,2046 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lustre/llite/rw.c
+ *
+ * Lustre Lite I/O page cache routines shared by different kernel revs
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+/* current_is_kswapd() */
+#include
+#include
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include
+#include "llite_internal.h"
+#include
+
+static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which);
+
+/**
+ * Get readahead pages from the filesystem readahead pool of the client for a
+ * thread.
+ *
+ * \param sbi superblock for filesystem readahead state ll_ra_info
+ * \param ria per-thread readahead state
+ * \param pages number of pages requested for readahead for the thread.
+ *
+ * WARNING: This algorithm is used to reduce contention on sbi->ll_lock.
+ * It should work well if the ra_max_pages is much greater than the single
+ * file's read-ahead window, and not too many threads are contending for
+ * these readahead pages.
+ *
+ * TODO: There may be a 'global sync problem' if many threads are trying
+ * to get an ra budget that is larger than the remaining readahead pages
+ * and reach here at exactly the same time. They will compute \a ret to
+ * consume the remaining pages, but will fail at atomic_add_return() and
+ * get a zero ra window, although there is still ra space remaining. - Jay */
+
+static unsigned long ll_ra_count_get(struct ll_sb_info *sbi,
+ struct ra_io_arg *ria,
+ unsigned long pages,
+ unsigned long pages_min)
+{
+ struct ll_ra_info *ra = &sbi->ll_ra_info;
+ long ret;
+
+ ENTRY;
+
+ WARN_ON_ONCE(pages_min > pages);
+ /**
+ * Don't try readahead aggressively if we are limited in LRU pages;
+ * otherwise, it could cause a deadlock.
+ */
+ pages = min(sbi->ll_cache->ccc_lru_max >> 2, pages);
+ /**
+ * If this happens, we reserve more pages than needed; this will make
+ * us leak @ra_cur_pages, because ll_ra_count_put() actually frees
+ * @pages.
+ */
+ if (unlikely(pages_min > pages))
+ pages_min = pages;
+
+ /*
+ * If the read-ahead pages left are less than 1M, do not do read-ahead;
+ * otherwise it will form small read RPCs (< 1M), which hurt server
+ * performance a lot.
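+ *
+ * For illustration (the numbers are assumptions, not tunable defaults):
+ * with 4 KiB pages and a 1 MiB max BRW size (PTLRPC_MAX_BRW_PAGES = 256),
+ * a request for 512 pages when only 200 remain in the pool yields
+ * ret = 200, which is below min(256, 512), so readahead is skipped
+ * entirely rather than issuing a sub-1M read RPC.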
+ */
+ ret = min(ra->ra_max_pages - atomic_read(&ra->ra_cur_pages),
+ pages);
+ if (ret < 0 || ret < min_t(long, PTLRPC_MAX_BRW_PAGES, pages))
+ GOTO(out, ret = 0);
+
+ if (atomic_add_return(ret, &ra->ra_cur_pages) > ra->ra_max_pages) {
+ atomic_sub(ret, &ra->ra_cur_pages);
+ ret = 0;
+ }
+
+out:
+ if (ret < pages_min) {
+ /* override ra limit for maximum performance */
+ atomic_add(pages_min - ret, &ra->ra_cur_pages);
+ ret = pages_min;
+ }
+ RETURN(ret);
+}
+
+void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long pages)
+{
+ struct ll_ra_info *ra = &sbi->ll_ra_info;
+ atomic_sub(pages, &ra->ra_cur_pages);
+}
+
+static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which)
+{
+ LASSERTF(which < _NR_RA_STAT, "which: %u\n", which);
+ lprocfs_counter_incr(sbi->ll_ra_stats, which);
+}
+
+static inline bool ll_readahead_enabled(struct ll_sb_info *sbi)
+{
+ return sbi->ll_ra_info.ra_max_pages_per_file > 0 &&
+ sbi->ll_ra_info.ra_max_pages > 0;
+}
+
+void ll_ra_stats_inc(struct inode *inode, enum ra_stat which)
+{
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ ll_ra_stats_inc_sbi(sbi, which);
+}
+
+#define RAS_CDEBUG(ras) \
+ CDEBUG(D_READA, \
+ "lre %llu cr %lu cb %llu wsi %lu wp %lu nra %lu rpc %lu " \
+ "r %lu csr %lu so %llu sb %llu sl %llu lr %lu\n", \
+ ras->ras_last_read_end_bytes, ras->ras_consecutive_requests, \
+ ras->ras_consecutive_bytes, ras->ras_window_start_idx, \
+ ras->ras_window_pages, ras->ras_next_readahead_idx, \
+ ras->ras_rpc_pages, ras->ras_requests, \
+ ras->ras_consecutive_stride_requests, ras->ras_stride_offset, \
+ ras->ras_stride_bytes, ras->ras_stride_length, \
+ ras->ras_async_last_readpage_idx)
+
+static bool pos_in_window(loff_t pos, loff_t point,
+ unsigned long before, unsigned long after)
+{
+ loff_t start = point - before;
+ loff_t end = point + after;
+
+ if (start > point)
+ start = 0;
+ if (end < point)
+ end = ~0;
+
+ return start <= pos && pos <= end;
+}
+
+enum ll_ra_page_hint {
+ MAYNEED = 0, /* this page may be accessed soon */
+ WILLNEED /* this page is guaranteed to be needed */
+};
+
+/**
+ * Initiates read-ahead of a page with given index.
+ *
+ * \retval +ve: page was already uptodate so it will be skipped
+ * from being added;
+ * \retval -ve: page wasn't added to \a queue for error;
+ * \retval 0: page was added into \a queue for read ahead.
+ */
+static int ll_read_ahead_page(const struct lu_env *env, struct cl_io *io,
+ struct cl_page_list *queue, pgoff_t index,
+ enum ll_ra_page_hint hint)
+{
+ struct cl_object *clob = io->ci_obj;
+ struct inode *inode = vvp_object_inode(clob);
+ struct page *vmpage = NULL;
+ struct cl_page *page;
+ struct vvp_page *vpg;
+ enum ra_stat which = _NR_RA_STAT; /* keep gcc happy */
+ int rc = 0;
+ const char *msg = NULL;
+
+ ENTRY;
+
+ switch (hint) {
+ case MAYNEED:
+ vmpage = grab_cache_page_nowait(inode->i_mapping, index);
+ if (vmpage == NULL) {
+ which = RA_STAT_FAILED_GRAB_PAGE;
+ msg = "g_c_p_n failed";
+ GOTO(out, rc = -EBUSY);
+ }
+ break;
+ case WILLNEED:
+ vmpage = find_or_create_page(inode->i_mapping, index,
+ GFP_NOFS);
+ if (vmpage == NULL)
+ GOTO(out, rc = -ENOMEM);
+ break;
+ default:
+ /* should not come here */
+ GOTO(out, rc = -EINVAL);
+ }
+
+ /* Check if vmpage was truncated or reclaimed */
+ if (vmpage->mapping != inode->i_mapping) {
+ which = RA_STAT_WRONG_GRAB_PAGE;
+ msg = "g_c_p_n returned invalid page";
+ GOTO(out, rc = -EBUSY);
+ }
+
+ page = cl_page_find(env, clob, vmpage->index, vmpage, CPT_CACHEABLE);
+ if (IS_ERR(page)) {
+ which = RA_STAT_FAILED_GRAB_PAGE;
+ msg = "cl_page_find failed";
+ GOTO(out, rc = PTR_ERR(page));
+ }
+
+ lu_ref_add(&page->cp_reference, "ra", current);
+ cl_page_assume(env, io, page);
+ vpg = cl2vvp_page(cl_object_page_slice(clob, page));
+ if (!vpg->vpg_defer_uptodate && !PageUptodate(vmpage)) {
+ if (hint == MAYNEED) {
+ vpg->vpg_defer_uptodate = 1;
+ vpg->vpg_ra_used = 0;
+ }
+ cl_page_list_add(queue, page, true);
+ } else {
+ /* skip completed pages */
+ cl_page_unassume(env, io, page);
+ /* This page is already uptodate, returning a positive number
+ * to tell the callers about this */
+ rc = 1;
+ }
+
+ lu_ref_del(&page->cp_reference, "ra", current);
+ cl_page_put(env, page);
+
+out:
+ if (vmpage != NULL) {
+ if (rc != 0)
+ unlock_page(vmpage);
+ put_page(vmpage);
+ }
+ if (msg != NULL && hint == MAYNEED) {
+ ll_ra_stats_inc(inode, which);
+ CDEBUG(D_READA, "%s\n", msg);
+ }
+
+ RETURN(rc);
+}
+
+#define RIA_DEBUG(ria) \
+ CDEBUG(D_READA, "rs %lu re %lu ro %llu rl %llu rb %llu\n", \
+ ria->ria_start_idx, ria->ria_end_idx, ria->ria_stoff, \
+ ria->ria_length, ria->ria_bytes)
+
+static inline int stride_io_mode(struct ll_readahead_state *ras)
+{
+ return ras->ras_consecutive_stride_requests > 1;
+}
+
+/* The function calculates how many bytes will be read in
+ * [off, off + length], in such a stride I/O area,
+ * stride_offset = st_off, stride_length = st_len,
+ * stride_bytes = st_bytes
+ *
+ * |------------------|*****|------------------|*****|------------|*****|....
+ * st_off
+ * |--- st_bytes ---|
+ * |----- st_len -----|
+ *
+ * How many bytes should be read in such a pattern:
+ * |-------------------------------------------------------------|
+ * off
+ * |<------ length ------->|
+ *
+ * = |<----->| + |-------------------------------------| + |---|
+ * start_left st_bytes * i end_left
+ */
+static loff_t stride_byte_count(loff_t st_off, loff_t st_len, loff_t st_bytes,
+ loff_t off, loff_t length)
+{
+ u64 start = off > st_off ? off - st_off : 0;
+ u64 end = off + length > st_off ?
off + length - st_off : 0;
+ u64 start_left;
+ u64 end_left;
+ u64 bytes_count;
+
+ if (st_len == 0 || length == 0 || end == 0)
+ return length;
+
+ start = div64_u64_rem(start, st_len, &start_left);
+ if (start_left < st_bytes)
+ start_left = st_bytes - start_left;
+ else
+ start_left = 0;
+
+ end = div64_u64_rem(end, st_len, &end_left);
+ if (end_left > st_bytes)
+ end_left = st_bytes;
+
+ CDEBUG(D_READA, "start %llu, end %llu start_left %llu end_left %llu\n",
+ start, end, start_left, end_left);
+
+ if (start == end)
+ bytes_count = end_left - (st_bytes - start_left);
+ else
+ bytes_count = start_left +
+ st_bytes * (end - start - 1) + end_left;
+
+ CDEBUG(D_READA,
+ "st_off %llu, st_len %llu st_bytes %llu off %llu length %llu bytescount %llu\n",
+ st_off, st_len, st_bytes, off, length, bytes_count);
+
+ return bytes_count;
+}
+
+static unsigned long ria_page_count(struct ra_io_arg *ria)
+{
+ loff_t length_bytes = ria->ria_end_idx >= ria->ria_start_idx ?
+ (loff_t)(ria->ria_end_idx -
+ ria->ria_start_idx + 1) << PAGE_SHIFT : 0;
+ loff_t bytes_count;
+
+ if (ria->ria_length > ria->ria_bytes && ria->ria_bytes &&
+ (ria->ria_length & ~PAGE_MASK || ria->ria_bytes & ~PAGE_MASK ||
+ ria->ria_stoff & ~PAGE_MASK)) {
+ /* Over-estimate un-aligned page stride read */
+ unsigned long pg_count = ((ria->ria_bytes +
+ PAGE_SIZE - 1) >> PAGE_SHIFT) + 1;
+ pg_count *= length_bytes / ria->ria_length + 1;
+
+ return pg_count;
+ }
+ bytes_count = stride_byte_count(ria->ria_stoff, ria->ria_length,
+ ria->ria_bytes,
+ (loff_t)ria->ria_start_idx << PAGE_SHIFT,
+ length_bytes);
+
+ return (bytes_count + PAGE_SIZE - 1) >> PAGE_SHIFT;
+}
+
+static pgoff_t ras_align(struct ll_readahead_state *ras, pgoff_t index)
+{
+ unsigned opt_size = min(ras->ras_window_pages, ras->ras_rpc_pages);
+
+ if (opt_size == 0)
+ opt_size = 1;
+ return index - (index % opt_size);
+}
+
+/* Check whether the index is in the defined ra-window */
+static bool ras_inside_ra_window(pgoff_t idx, struct ra_io_arg *ria)
+{
+ loff_t pos = (loff_t)idx << PAGE_SHIFT;
+
+ /* If ria_length == ria_bytes, it means non-stride I/O mode;
+ * idx should always be inside the read-ahead window in this case.
+ * For stride I/O mode, just check whether the idx is inside
+ * the ria_bytes.
+ */
+ if (ria->ria_length == 0 || ria->ria_length == ria->ria_bytes)
+ return true;
+
+ if (pos >= ria->ria_stoff) {
+ u64 offset;
+
+ div64_u64_rem(pos - ria->ria_stoff, ria->ria_length, &offset);
+
+ if (offset < ria->ria_bytes ||
+ (ria->ria_length - offset) < PAGE_SIZE)
+ return true;
+ } else if (pos + PAGE_SIZE > ria->ria_stoff) {
+ return true;
+ }
+
+ return false;
+}
+
+static unsigned long
+ll_read_ahead_pages(const struct lu_env *env, struct cl_io *io,
+ struct cl_page_list *queue, struct ll_readahead_state *ras,
+ struct ra_io_arg *ria, pgoff_t *ra_end, pgoff_t skip_index)
+{
+ struct cl_read_ahead ra = { 0 };
+ /* busy page count is per stride */
+ int rc = 0, count = 0, busy_page_count = 0;
+ pgoff_t page_idx;
+
+ LASSERT(ria != NULL);
+ RIA_DEBUG(ria);
+
+ for (page_idx = ria->ria_start_idx;
+ page_idx <= ria->ria_end_idx && ria->ria_reserved > 0;
+ page_idx++) {
+ if (skip_index && page_idx == skip_index)
+ continue;
+ if (ras_inside_ra_window(page_idx, ria)) {
+ if (ra.cra_end_idx == 0 || ra.cra_end_idx < page_idx) {
+ pgoff_t end_idx;
+
+ /*
+ * Do not shrink ria_end_idx in any case until
+ * the minimum end of the current read is
+ * covered.
+ *
+ * Do not extend the read lock across a stripe
+ * if lock contention is detected.
+ */
+ if (ra.cra_contention &&
+ page_idx > ria->ria_end_idx_min) {
+ ria->ria_end_idx = *ra_end;
+ break;
+ }
+
+ cl_read_ahead_release(env, &ra);
+
+ rc = cl_io_read_ahead(env, io, page_idx, &ra);
+ if (rc < 0)
+ break;
+
+ /*
+ * Only shrink ria_end_idx if the matched
+ * LDLM lock doesn't cover more.
+ */
+ if (page_idx > ra.cra_end_idx) {
+ ria->ria_end_idx = ra.cra_end_idx;
+ break;
+ }
+
+ CDEBUG(D_READA, "idx: %lu, ra: %lu, rpc: %lu\n",
+ page_idx, ra.cra_end_idx,
+ ra.cra_rpc_pages);
+ LASSERTF(ra.cra_end_idx >= page_idx,
+ "object: %p, indices %lu / %lu\n",
+ io->ci_obj, ra.cra_end_idx, page_idx);
+ /* update read ahead RPC size.
+ * NB: it's racy but doesn't matter */
+ if (ras->ras_rpc_pages != ra.cra_rpc_pages &&
+ ra.cra_rpc_pages > 0)
+ ras->ras_rpc_pages = ra.cra_rpc_pages;
+ if (!skip_index) {
+ /* trim it to align with optimal RPC size */
+ end_idx = ras_align(ras, ria->ria_end_idx + 1);
+ if (end_idx > 0 && !ria->ria_eof)
+ ria->ria_end_idx = end_idx - 1;
+ }
+ if (ria->ria_end_idx < ria->ria_end_idx_min)
+ ria->ria_end_idx = ria->ria_end_idx_min;
+ }
+ if (page_idx > ria->ria_end_idx)
+ break;
+
+ /* If the page is inside the read-ahead window */
+ rc = ll_read_ahead_page(env, io, queue, page_idx,
+ MAYNEED);
+ if (rc < 0 && rc != -EBUSY)
+ break;
+ if (rc == -EBUSY) {
+ busy_page_count++;
+ CDEBUG(D_READA,
+ "skip busy page: %lu\n", page_idx);
+ /* For page-unaligned readahead, the first and
+ * last pages of each region can be read by
+ * another reader on the same node, and so
+ * may be busy. So only stop for > 2 busy
+ * pages. */
+ if (busy_page_count > 2)
+ break;
+ }
+
+ *ra_end = page_idx;
+ /* Only subtract from reserve & count the page if we
+ * really did readahead on that page. */
+ if (rc == 0) {
+ ria->ria_reserved--;
+ count++;
+ }
+ } else if (stride_io_mode(ras)) {
+ /* If it is not in the read-ahead window but we are in
+ * stride I/O mode, then check whether it should skip
+ * the stride gap.
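+ *
+ * For example (hypothetical values): with ria_stoff = 0,
+ * ria_length = 1 MiB and ria_bytes = 64 KiB on 4 KiB pages,
+ * a page at offset 128 KiB lies in the gap, so page_idx is
+ * advanced to the page at the next 1 MiB boundary, where
+ * the next stride chunk begins.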
+ */ + loff_t pos = (loff_t)page_idx << PAGE_SHIFT; + u64 offset; + + div64_u64_rem(pos - ria->ria_stoff, ria->ria_length, + &offset); + if (offset >= ria->ria_bytes) { + pos += (ria->ria_length - offset); + if ((pos >> PAGE_SHIFT) >= page_idx + 1) + page_idx = (pos >> PAGE_SHIFT) - 1; + busy_page_count = 0; + CDEBUG(D_READA, + "Stride: jump %llu pages to %lu\n", + ria->ria_length - offset, page_idx); + continue; + } + } + } + + cl_read_ahead_release(env, &ra); + + return count; +} + +static void ll_readahead_work_free(struct ll_readahead_work *work) +{ + fput(work->lrw_file); + OBD_FREE_PTR(work); +} + +static void ll_readahead_handle_work(struct work_struct *wq); +static void ll_readahead_work_add(struct inode *inode, + struct ll_readahead_work *work) +{ + INIT_WORK(&work->lrw_readahead_work, ll_readahead_handle_work); + queue_work(ll_i2sbi(inode)->ll_ra_info.ll_readahead_wq, + &work->lrw_readahead_work); +} + +static int ll_readahead_file_kms(const struct lu_env *env, + struct cl_io *io, __u64 *kms) +{ + struct cl_object *clob; + struct inode *inode; + struct cl_attr *attr = vvp_env_thread_attr(env); + int ret; + + clob = io->ci_obj; + inode = vvp_object_inode(clob); + + cl_object_attr_lock(clob); + ret = cl_object_attr_get(env, clob, attr); + cl_object_attr_unlock(clob); + + if (ret != 0) + RETURN(ret); + + *kms = attr->cat_kms; + return 0; +} + +static void ll_readahead_handle_work(struct work_struct *wq) +{ + struct ll_readahead_work *work; + struct lu_env *env; + __u16 refcheck; + struct ra_io_arg *ria; + struct inode *inode; + struct ll_file_data *fd; + struct ll_readahead_state *ras; + struct cl_io *io; + struct cl_2queue *queue; + pgoff_t ra_end_idx = 0; + unsigned long pages, pages_min = 0; + struct file *file; + __u64 kms; + int rc; + pgoff_t eof_index; + struct ll_sb_info *sbi; + + work = container_of(wq, struct ll_readahead_work, + lrw_readahead_work); + fd = work->lrw_file->private_data; + ras = &fd->fd_ras; + file = work->lrw_file; + inode = file_inode(file); + sbi = ll_i2sbi(inode); + + CDEBUG(D_READA|D_IOTRACE, + "%s: async ra from %lu to %lu triggered by user pid %d\n", + file_dentry(file)->d_name.name, work->lrw_start_idx, + work->lrw_end_idx, work->lrw_user_pid); + + env = cl_env_alloc(&refcheck, LCT_NOREF); + if (IS_ERR(env)) + GOTO(out_free_work, rc = PTR_ERR(env)); + + io = vvp_env_thread_io(env); + ll_io_init(io, file, CIT_READ, NULL); + + rc = ll_readahead_file_kms(env, io, &kms); + if (rc != 0) + GOTO(out_put_env, rc); + + if (kms == 0) { + ll_ra_stats_inc(inode, RA_STAT_ZERO_LEN); + GOTO(out_put_env, rc = 0); + } + + ria = &ll_env_info(env)->lti_ria; + memset(ria, 0, sizeof(*ria)); + + ria->ria_start_idx = work->lrw_start_idx; + /* Truncate RA window to end of file */ + eof_index = (pgoff_t)(kms - 1) >> PAGE_SHIFT; + if (eof_index <= work->lrw_end_idx) { + work->lrw_end_idx = eof_index; + ria->ria_eof = true; + } + if (work->lrw_end_idx <= work->lrw_start_idx) + GOTO(out_put_env, rc = 0); + + ria->ria_end_idx = work->lrw_end_idx; + pages = ria->ria_end_idx - ria->ria_start_idx + 1; + ria->ria_reserved = ll_ra_count_get(sbi, ria, + ria_page_count(ria), pages_min); + + CDEBUG(D_READA, + "async reserved pages: %lu/%lu/%lu, ra_cur %d, ra_max %lu\n", + ria->ria_reserved, pages, pages_min, + atomic_read(&ll_i2sbi(inode)->ll_ra_info.ra_cur_pages), + ll_i2sbi(inode)->ll_ra_info.ra_max_pages); + + if (ria->ria_reserved < pages) { + ll_ra_stats_inc(inode, RA_STAT_MAX_IN_FLIGHT); + if (PAGES_TO_MiB(ria->ria_reserved) < 1) { + ll_ra_count_put(ll_i2sbi(inode), 
ria->ria_reserved);
+ GOTO(out_put_env, rc = 0);
+ }
+ }
+
+ rc = cl_io_rw_init(env, io, CIT_READ, ria->ria_start_idx, pages);
+ if (rc)
+ GOTO(out_put_env, rc);
+
+ /* overwrite jobid initialized in vvp_io_init() */
+ if (strncmp(ll_i2info(inode)->lli_jobid, work->lrw_jobid,
+ sizeof(work->lrw_jobid)))
+ memcpy(ll_i2info(inode)->lli_jobid, work->lrw_jobid,
+ sizeof(work->lrw_jobid));
+
+ vvp_env_io(env)->vui_fd = fd;
+ io->ci_state = CIS_LOCKED;
+ io->ci_async_readahead = true;
+ rc = cl_io_start(env, io);
+ if (rc)
+ GOTO(out_io_fini, rc);
+
+ queue = &io->ci_queue;
+ cl_2queue_init(queue);
+
+ rc = ll_read_ahead_pages(env, io, &queue->c2_qin, ras, ria,
+ &ra_end_idx, 0);
+ if (ria->ria_reserved != 0)
+ ll_ra_count_put(ll_i2sbi(inode), ria->ria_reserved);
+ if (queue->c2_qin.pl_nr > 0) {
+ int count = queue->c2_qin.pl_nr;
+
+ rc = cl_io_submit_rw(env, io, CRT_READ, queue);
+ if (rc == 0)
+ task_io_account_read(PAGE_SIZE * count);
+ }
+ if (ria->ria_end_idx == ra_end_idx && ra_end_idx == (kms >> PAGE_SHIFT))
+ ll_ra_stats_inc(inode, RA_STAT_EOF);
+
+ if (ra_end_idx != ria->ria_end_idx)
+ ll_ra_stats_inc(inode, RA_STAT_FAILED_REACH_END);
+
+ /* TODO: discard all pages until page reinit route is implemented */
+ cl_page_list_discard(env, io, &queue->c2_qin);
+
+ /* Unlock unsent read pages in case of error. */
+ cl_page_list_disown(env, io, &queue->c2_qin);
+
+ cl_2queue_fini(env, queue);
+out_io_fini:
+ cl_io_end(env, io);
+ cl_io_fini(env, io);
+out_put_env:
+ cl_env_put(env, &refcheck);
+out_free_work:
+ if (ra_end_idx > 0)
+ ll_ra_stats_inc_sbi(ll_i2sbi(inode), RA_STAT_ASYNC);
+ atomic_dec(&sbi->ll_ra_info.ra_async_inflight);
+ ll_readahead_work_free(work);
+}
+
+static int ll_readahead(const struct lu_env *env, struct cl_io *io,
+ struct cl_page_list *queue,
+ struct ll_readahead_state *ras, bool hit,
+ struct file *file, pgoff_t skip_index,
+ pgoff_t *start_idx)
+{
+ struct vvp_io *vio = vvp_env_io(env);
+ struct ll_thread_info *lti = ll_env_info(env);
+ unsigned long pages, pages_min = 0;
+ pgoff_t ra_end_idx = 0, end_idx = 0;
+ struct inode *inode;
+ struct ra_io_arg *ria = &lti->lti_ria;
+ struct cl_object *clob;
+ int ret = 0;
+ __u64 kms;
+ struct ll_sb_info *sbi;
+ struct ll_ra_info *ra;
+
+ ENTRY;
+
+ clob = io->ci_obj;
+ inode = vvp_object_inode(clob);
+ sbi = ll_i2sbi(inode);
+ ra = &sbi->ll_ra_info;
+
+ /**
+ * In case we have a limited max_cached_mb, readahead
+ * should be stopped if it has run out of all LRU slots.
+ */
+ if (atomic_read(&ra->ra_cur_pages) >= sbi->ll_cache->ccc_lru_max) {
+ ll_ra_stats_inc(inode, RA_STAT_MAX_IN_FLIGHT);
+ RETURN(0);
+ }
+
+ memset(ria, 0, sizeof(*ria));
+ ret = ll_readahead_file_kms(env, io, &kms);
+ if (ret != 0)
+ RETURN(ret);
+
+ if (kms == 0) {
+ ll_ra_stats_inc(inode, RA_STAT_ZERO_LEN);
+ RETURN(0);
+ }
+
+ spin_lock(&ras->ras_lock);
+
+ /**
+ * Note: another thread might roll back ras_next_readahead_idx
+ * if it cannot get the full size of prepared pages; see the
+ * end of this function. For stride read-ahead, it needs to
+ * make sure the offset is no less than ras_stride_offset,
+ * so that stride read-ahead can work correctly.
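+ *
+ * For example (hypothetical values): if ras_stride_offset sits
+ * at 4 MiB but ras_next_readahead_idx was rolled back to page
+ * 512 (2 MiB with 4 KiB pages), the stride branch below starts
+ * readahead at page 1024 instead.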
+ */
+ if (stride_io_mode(ras))
+ *start_idx = max_t(pgoff_t, ras->ras_next_readahead_idx,
+ ras->ras_stride_offset >> PAGE_SHIFT);
+ else
+ *start_idx = ras->ras_next_readahead_idx;
+
+ if (ras->ras_window_pages > 0)
+ end_idx = ras->ras_window_start_idx + ras->ras_window_pages - 1;
+
+ if (skip_index)
+ end_idx = *start_idx + ras->ras_window_pages - 1;
+
+ /* Enlarge the RA window to encompass the full read */
+ if (vio->vui_ra_valid &&
+ end_idx < vio->vui_ra_start_idx + vio->vui_ra_pages - 1)
+ end_idx = vio->vui_ra_start_idx + vio->vui_ra_pages - 1;
+
+ if (end_idx != 0) {
+ pgoff_t eof_index;
+
+ /* Truncate RA window to end of file */
+ eof_index = (pgoff_t)((kms - 1) >> PAGE_SHIFT);
+ if (eof_index <= end_idx) {
+ end_idx = eof_index;
+ ria->ria_eof = true;
+ }
+ }
+ ria->ria_start_idx = *start_idx;
+ ria->ria_end_idx = end_idx;
+ /* If stride I/O mode is detected, get the stride window */
+ if (stride_io_mode(ras)) {
+ ria->ria_stoff = ras->ras_stride_offset;
+ ria->ria_length = ras->ras_stride_length;
+ ria->ria_bytes = ras->ras_stride_bytes;
+ }
+ spin_unlock(&ras->ras_lock);
+
+ if (end_idx == 0) {
+ ll_ra_stats_inc(inode, RA_STAT_ZERO_WINDOW);
+ RETURN(0);
+ }
+ pages = ria_page_count(ria);
+ if (pages == 0) {
+ ll_ra_stats_inc(inode, RA_STAT_ZERO_WINDOW);
+ RETURN(0);
+ }
+
+ RAS_CDEBUG(ras);
+ CDEBUG(D_READA, DFID": ria: %lu/%lu, bead: %lu/%lu, hit: %d\n",
+ PFID(lu_object_fid(&clob->co_lu)),
+ ria->ria_start_idx, ria->ria_end_idx,
+ vio->vui_ra_valid ? vio->vui_ra_start_idx : 0,
+ vio->vui_ra_valid ? vio->vui_ra_pages : 0,
+ hit);
+
+ /* at least extend the readahead window to cover the current read */
+ if (!hit && vio->vui_ra_valid &&
+ vio->vui_ra_start_idx + vio->vui_ra_pages > ria->ria_start_idx) {
+ ria->ria_end_idx_min =
+ vio->vui_ra_start_idx + vio->vui_ra_pages - 1;
+ pages_min = vio->vui_ra_start_idx + vio->vui_ra_pages -
+ ria->ria_start_idx;
+ /**
+ * For performance reasons, exceeding @ra_max_pages is
+ * allowed, but this should be limited by the RPC size in
+ * case a large block-size read is issued. Trim to the
+ * RPC boundary.
+ */
+ pages_min = min(pages_min, ras->ras_rpc_pages -
+ (ria->ria_start_idx % ras->ras_rpc_pages));
+ }
+
+ /* don't over-reserve for mmap range read */
+ if (skip_index)
+ pages_min = 0;
+ if (pages_min > pages)
+ pages = pages_min;
+ ria->ria_reserved = ll_ra_count_get(ll_i2sbi(inode), ria, pages,
+ pages_min);
+ if (ria->ria_reserved < pages)
+ ll_ra_stats_inc(inode, RA_STAT_MAX_IN_FLIGHT);
+
+ CDEBUG(D_READA, "reserved pages: %lu/%lu/%lu, ra_cur %d, ra_max %lu\n",
+ ria->ria_reserved, pages, pages_min,
+ atomic_read(&ll_i2sbi(inode)->ll_ra_info.ra_cur_pages),
+ ll_i2sbi(inode)->ll_ra_info.ra_max_pages);
+
+ ret = ll_read_ahead_pages(env, io, queue, ras, ria, &ra_end_idx,
+ skip_index);
+ if (ria->ria_reserved != 0)
+ ll_ra_count_put(ll_i2sbi(inode), ria->ria_reserved);
+
+ if (ra_end_idx == end_idx && ra_end_idx == (kms >> PAGE_SHIFT))
+ ll_ra_stats_inc(inode, RA_STAT_EOF);
+
+ CDEBUG(D_READA,
+ "ra_end_idx = %lu end_idx = %lu stride end = %lu pages = %d\n",
+ ra_end_idx, end_idx, ria->ria_end_idx, ret);
+
+ if (ra_end_idx != end_idx)
+ ll_ra_stats_inc(inode, RA_STAT_FAILED_REACH_END);
+ if (ra_end_idx > 0) {
+ /* update the ras so that the next read-ahead tries from
+ * where we left off.
*/
+ spin_lock(&ras->ras_lock);
+ ras->ras_next_readahead_idx = ra_end_idx + 1;
+ spin_unlock(&ras->ras_lock);
+ RAS_CDEBUG(ras);
+ }
+
+ RETURN(ret);
+}
+
+static int ll_readpages(const struct lu_env *env, struct cl_io *io,
+ struct cl_page_list *queue,
+ pgoff_t start, pgoff_t end)
+{
+ int ret = 0;
+ __u64 kms;
+ pgoff_t page_idx;
+ int count = 0;
+
+ ENTRY;
+
+ ret = ll_readahead_file_kms(env, io, &kms);
+ if (ret != 0)
+ RETURN(ret);
+
+ if (kms == 0)
+ RETURN(0);
+
+ if (end != 0) {
+ unsigned long end_index;
+
+ end_index = (unsigned long)((kms - 1) >> PAGE_SHIFT);
+ if (end_index <= end)
+ end = end_index;
+ }
+
+ for (page_idx = start; page_idx <= end; page_idx++) {
+ ret = ll_read_ahead_page(env, io, queue, page_idx,
+ WILLNEED);
+ if (ret < 0)
+ break;
+ else if (ret == 0) /* ret 1 is already uptodate */
+ count++;
+ }
+
+ RETURN(count > 0 ? count : ret);
+}
+
+static void ras_set_start(struct ll_readahead_state *ras, pgoff_t index)
+{
+ ras->ras_window_start_idx = ras_align(ras, index);
+}
+
+/* called with the ras_lock held or from places where it doesn't matter */
+static void ras_reset(struct ll_readahead_state *ras, pgoff_t index)
+{
+ ras->ras_consecutive_requests = 0;
+ ras->ras_consecutive_bytes = 0;
+ ras->ras_window_pages = 0;
+ ras_set_start(ras, index);
+ ras->ras_next_readahead_idx = max(ras->ras_window_start_idx, index + 1);
+
+ RAS_CDEBUG(ras);
+}
+
+/* called with the ras_lock held or from places where it doesn't matter */
+static void ras_stride_reset(struct ll_readahead_state *ras)
+{
+ ras->ras_consecutive_stride_requests = 0;
+ ras->ras_stride_length = 0;
+ ras->ras_stride_bytes = 0;
+ RAS_CDEBUG(ras);
+}
+
+void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras)
+{
+ spin_lock_init(&ras->ras_lock);
+ ras->ras_rpc_pages = PTLRPC_MAX_BRW_PAGES;
+ ras_reset(ras, 0);
+ ras->ras_last_read_end_bytes = 0;
+ ras->ras_requests = 0;
+ ras->ras_range_min_start_idx = 0;
+ ras->ras_range_max_end_idx = 0;
+ ras->ras_range_requests = 0;
+ ras->ras_last_range_pages = 0;
+}
+
+/*
+ * Check whether the read request is in the stride window. Return true
+ * if it is, and false otherwise.
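+ *
+ * For example (hypothetical values): with ras_stride_length = 1 MiB and
+ * ras_stride_bytes = 64 KiB, a read that starts 960 KiB past the end of
+ * the previous one (stride_gap == stride_length - stride_bytes) and is
+ * at most 64 KiB long is treated as part of the stride pattern.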
+ */
+static bool read_in_stride_window(struct ll_readahead_state *ras,
+ loff_t pos, loff_t count)
+{
+ loff_t stride_gap;
+
+ if (ras->ras_stride_length == 0 || ras->ras_stride_bytes == 0 ||
+ ras->ras_stride_bytes == ras->ras_stride_length)
+ return false;
+
+ stride_gap = pos - ras->ras_last_read_end_bytes - 1;
+
+ /* If it is contiguous read */
+ if (stride_gap == 0)
+ return ras->ras_consecutive_bytes + count <=
+ ras->ras_stride_bytes;
+
+ /* Otherwise check the stride by itself */
+ return (ras->ras_stride_length - ras->ras_stride_bytes) == stride_gap &&
+ ras->ras_consecutive_bytes == ras->ras_stride_bytes &&
+ count <= ras->ras_stride_bytes;
+}
+
+static void ras_init_stride_detector(struct ll_readahead_state *ras,
+ loff_t pos, loff_t count)
+{
+ loff_t stride_gap = pos - ras->ras_last_read_end_bytes - 1;
+
+ LASSERT(ras->ras_consecutive_stride_requests == 0);
+
+ if (pos <= ras->ras_last_read_end_bytes) {
+ /* Reset stride window for forward read */
+ ras_stride_reset(ras);
+ return;
+ }
+
+ ras->ras_stride_bytes = ras->ras_consecutive_bytes;
+ ras->ras_stride_length = stride_gap + ras->ras_consecutive_bytes;
+ ras->ras_consecutive_stride_requests++;
+ ras->ras_stride_offset = pos;
+
+ RAS_CDEBUG(ras);
+}
+
+static unsigned long
+stride_page_count(struct ll_readahead_state *ras, loff_t len)
+{
+ loff_t bytes_count =
+ stride_byte_count(ras->ras_stride_offset,
+ ras->ras_stride_length, ras->ras_stride_bytes,
+ ras->ras_window_start_idx << PAGE_SHIFT, len);
+
+ return (bytes_count + PAGE_SIZE - 1) >> PAGE_SHIFT;
+}
+
+/* The stride read-ahead window will be increased by inc_bytes according
+ * to the stride I/O pattern */
+static void ras_stride_increase_window(struct ll_readahead_state *ras,
+ struct ll_ra_info *ra, loff_t inc_bytes)
+{
+ loff_t window_bytes, stride_bytes;
+ u64 left_bytes;
+ u64 step;
+ loff_t end;
+
+ /* temporarily store in page units to reduce LASSERT() cost below */
+ end = ras->ras_window_start_idx + ras->ras_window_pages;
+
+ LASSERT(ras->ras_stride_length > 0);
+ LASSERTF(end >= (ras->ras_stride_offset >> PAGE_SHIFT),
+ "window_start_idx %lu, window_pages %lu stride_offset %llu\n",
+ ras->ras_window_start_idx, ras->ras_window_pages,
+ ras->ras_stride_offset);
+
+ end <<= PAGE_SHIFT;
+ if (end <= ras->ras_stride_offset)
+ stride_bytes = 0;
+ else
+ stride_bytes = end - ras->ras_stride_offset;
+
+ div64_u64_rem(stride_bytes, ras->ras_stride_length, &left_bytes);
+ window_bytes = (ras->ras_window_pages << PAGE_SHIFT);
+ if (left_bytes < ras->ras_stride_bytes) {
+ if (ras->ras_stride_bytes - left_bytes >= inc_bytes) {
+ window_bytes += inc_bytes;
+ goto out;
+ } else {
+ window_bytes += (ras->ras_stride_bytes - left_bytes);
+ inc_bytes -= (ras->ras_stride_bytes - left_bytes);
+ }
+ } else {
+ window_bytes += (ras->ras_stride_length - left_bytes);
+ }
+
+ LASSERT(ras->ras_stride_bytes != 0);
+
+ step = div64_u64_rem(inc_bytes, ras->ras_stride_bytes, &left_bytes);
+
+ window_bytes += step * ras->ras_stride_length + left_bytes;
+ LASSERT(window_bytes > 0);
+
+out:
+ if (stride_page_count(ras, window_bytes) <=
+ ra->ra_max_pages_per_file || ras->ras_window_pages == 0)
+ ras->ras_window_pages = (window_bytes >> PAGE_SHIFT);
+
+ LASSERT(ras->ras_window_pages > 0);
+
+ RAS_CDEBUG(ras);
+}
+
+static void ras_increase_window(struct inode *inode,
+ struct ll_readahead_state *ras,
+ struct ll_ra_info *ra)
+{
+ /* The stretch of the ra-window should be aligned with the max
+ * rpc_size, but the current clio architecture does not support
+ * retrieving such information from the lower layer.
FIXME later
+ */
+ if (stride_io_mode(ras)) {
+ ras_stride_increase_window(ras, ra,
+ (loff_t)ras->ras_rpc_pages << PAGE_SHIFT);
+ } else {
+ pgoff_t window_pages;
+
+ window_pages = min(ras->ras_window_pages + ras->ras_rpc_pages,
+ ra->ra_max_pages_per_file);
+ if (window_pages < ras->ras_rpc_pages)
+ ras->ras_window_pages = window_pages;
+ else
+ ras->ras_window_pages = ras_align(ras, window_pages);
+ }
+}
+
+/**
+ * Seeks within 8 pages are considered sequential reads for now.
+ */
+static inline bool is_loose_seq_read(struct ll_readahead_state *ras, loff_t pos)
+{
+ return pos_in_window(pos, ras->ras_last_read_end_bytes,
+ 8UL << PAGE_SHIFT, 8UL << PAGE_SHIFT);
+}
+
+static inline bool is_loose_mmap_read(struct ll_sb_info *sbi,
+ struct ll_readahead_state *ras,
+ unsigned long pos)
+{
+ unsigned long range_pages = sbi->ll_ra_info.ra_range_pages;
+
+ return pos_in_window(pos, ras->ras_last_read_end_bytes,
+ range_pages << PAGE_SHIFT,
+ range_pages << PAGE_SHIFT);
+}
+
+/**
+ * We have observed slow mmap read performance for some
+ * applications. The problem occurs when the access pattern is
+ * neither sequential nor strided, but is still adjacent in a
+ * small range before seeking to a random position.
+ *
+ * So the pattern could be something like this:
+ *
+ * [1M data] [hole] [0.5M data] [hole] [0.7M data] [1M data]
+ *
+ *
+ * Every time an application reads mmap data, it may not only
+ * read a single 4KB page, but also a cluster of nearby pages in
+ * a range (e.g. 1MB) of the first page after a cache miss.
+ *
+ * The readahead engine is modified to track the range size of
+ * a cluster of mmap reads, so that after a seek and/or cache miss,
+ * the range size is used to efficiently prefetch multiple pages
+ * in a single RPC rather than many small RPCs.
+ */
+static void ras_detect_cluster_range(struct ll_readahead_state *ras,
+ struct ll_sb_info *sbi,
+ unsigned long pos, unsigned long count)
+{
+ pgoff_t last_pages, pages;
+ pgoff_t end_idx = (pos + count - 1) >> PAGE_SHIFT;
+
+ last_pages = ras->ras_range_max_end_idx -
+ ras->ras_range_min_start_idx + 1;
+ /* First time come here */
+ if (!ras->ras_range_max_end_idx)
+ goto out;
+
+ /* Random or Stride read */
+ if (!is_loose_mmap_read(sbi, ras, pos))
+ goto out;
+
+ ras->ras_range_requests++;
+ if (ras->ras_range_max_end_idx < end_idx)
+ ras->ras_range_max_end_idx = end_idx;
+
+ if (ras->ras_range_min_start_idx > (pos >> PAGE_SHIFT))
+ ras->ras_range_min_start_idx = pos >> PAGE_SHIFT;
+
+ /* Out of range, consider it as random or stride */
+ pages = ras->ras_range_max_end_idx -
+ ras->ras_range_min_start_idx + 1;
+ if (pages <= sbi->ll_ra_info.ra_range_pages)
+ return;
+out:
+ ras->ras_last_range_pages = last_pages;
+ ras->ras_range_requests = 0;
+ ras->ras_range_min_start_idx = pos >> PAGE_SHIFT;
+ ras->ras_range_max_end_idx = end_idx;
+}
+
+static void ras_detect_read_pattern(struct ll_readahead_state *ras,
+ struct ll_sb_info *sbi,
+ loff_t pos, size_t count, bool mmap)
+{
+ bool stride_detect = false;
+ pgoff_t index = pos >> PAGE_SHIFT;
+
+ /*
+ * Reset the read-ahead window in two cases. First, when the app seeks
+ * or reads to some other part of the file. Second, if we get a
+ * read-ahead miss on a page that we think we've previously issued.
+ * This can be a symptom of there being so many read-ahead pages that
+ * the VM is reclaiming them before we get to them.
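+ *
+ * (A "seek" here means failing the is_loose_seq_read() check above,
+ * i.e. landing more than 8 pages (32 KiB with 4 KiB pages) away from
+ * the end of the previous read.)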
+ */ + if (!is_loose_seq_read(ras, pos)) { + /* Check whether it is in stride I/O mode */ + if (!read_in_stride_window(ras, pos, count)) { + if (ras->ras_consecutive_stride_requests == 0) + ras_init_stride_detector(ras, pos, count); + else + ras_stride_reset(ras); + ras->ras_consecutive_bytes = 0; + ras_reset(ras, index); + } else { + ras->ras_consecutive_bytes = 0; + ras->ras_consecutive_requests = 0; + if (++ras->ras_consecutive_stride_requests > 1) + stride_detect = true; + RAS_CDEBUG(ras); + } + ll_ra_stats_inc_sbi(sbi, RA_STAT_DISTANT_READPAGE); + } else if (stride_io_mode(ras)) { + /* + * If this is contiguous read but in stride I/O mode + * currently, check whether stride step still is valid, + * if invalid, it will reset the stride ra window to + * be zero. + */ + if (!read_in_stride_window(ras, pos, count)) { + ras_stride_reset(ras); + ras->ras_window_pages = 0; + ras->ras_next_readahead_idx = index; + } + } + + ras->ras_consecutive_bytes += count; + if (mmap) { + pgoff_t idx = ras->ras_consecutive_bytes >> PAGE_SHIFT; + unsigned long ra_range_pages = + max_t(unsigned long, RA_MIN_MMAP_RANGE_PAGES, + sbi->ll_ra_info.ra_range_pages); + + if ((idx >= ra_range_pages && + idx % ra_range_pages == 0) || stride_detect) + ras->ras_need_increase_window = true; + } else if ((ras->ras_consecutive_requests > 1 || stride_detect)) { + ras->ras_need_increase_window = true; + } + + ras->ras_last_read_end_bytes = pos + count - 1; +} + +void ll_ras_enter(struct file *f, loff_t pos, size_t count) +{ + struct ll_file_data *fd = f->private_data; + struct ll_readahead_state *ras = &fd->fd_ras; + struct inode *inode = file_inode(f); + unsigned long index = pos >> PAGE_SHIFT; + struct ll_sb_info *sbi = ll_i2sbi(inode); + + spin_lock(&ras->ras_lock); + ras->ras_requests++; + ras->ras_consecutive_requests++; + ras->ras_need_increase_window = false; + ras->ras_no_miss_check = false; + /* + * On the second access to a file smaller than the tunable + * ra_max_read_ahead_whole_pages trigger RA on all pages in the + * file up to ra_max_pages_per_file. This is simply a best effort + * and only occurs once per open file. Normal RA behavior is reverted + * to for subsequent IO. 
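+ *
+ * For example (hypothetical tunables): with
+ * ra_max_read_ahead_whole_pages = 512 (2 MiB with 4 KiB pages), the
+ * second read of a 1 MiB file sets the window to cover the whole file
+ * and disables miss checking for it.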
+ */
+ if (ras->ras_requests >= 2) {
+ __u64 kms_pages;
+ struct ll_ra_info *ra = &sbi->ll_ra_info;
+
+ kms_pages = (i_size_read(inode) + PAGE_SIZE - 1) >>
+ PAGE_SHIFT;
+
+ CDEBUG(D_READA, "kmsp %llu mwp %lu mp %lu\n", kms_pages,
+ ra->ra_max_read_ahead_whole_pages,
+ ra->ra_max_pages_per_file);
+
+ if (kms_pages &&
+ kms_pages <= ra->ra_max_read_ahead_whole_pages) {
+ ras->ras_window_start_idx = 0;
+ ras->ras_next_readahead_idx = index + 1;
+ ras->ras_window_pages = min(ra->ra_max_pages_per_file,
+ ra->ra_max_read_ahead_whole_pages);
+ ras->ras_no_miss_check = true;
+ GOTO(out_unlock, 0);
+ }
+ }
+ ras_detect_read_pattern(ras, sbi, pos, count, false);
+out_unlock:
+ spin_unlock(&ras->ras_lock);
+}
+
+static bool index_in_stride_window(struct ll_readahead_state *ras,
+ pgoff_t index)
+{
+ loff_t pos = (loff_t)index << PAGE_SHIFT;
+
+ if (ras->ras_stride_length == 0 || ras->ras_stride_bytes == 0 ||
+ ras->ras_stride_bytes == ras->ras_stride_length)
+ return false;
+
+ if (pos >= ras->ras_stride_offset) {
+ u64 offset;
+
+ div64_u64_rem(pos - ras->ras_stride_offset,
+ ras->ras_stride_length, &offset);
+ if (offset < ras->ras_stride_bytes ||
+ ras->ras_stride_length - offset < PAGE_SIZE)
+ return true;
+ } else if (ras->ras_stride_offset - pos < PAGE_SIZE) {
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * ll_ras_enter() is used to detect the read pattern according to pos
+ * and count.
+ *
+ * ras_update() is used to detect a cache miss and to reset or increase
+ * the window accordingly.
+ */
+static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
+ struct ll_readahead_state *ras, pgoff_t index,
+ enum ras_update_flags flags, struct cl_io *io)
+{
+ struct ll_ra_info *ra = &sbi->ll_ra_info;
+ bool hit = flags & LL_RAS_HIT;
+
+ ENTRY;
+ spin_lock(&ras->ras_lock);
+
+ if (!hit)
+ CDEBUG(D_READA|D_IOTRACE, DFID " pages at %lu miss.\n",
+ PFID(ll_inode2fid(inode)), index);
+ ll_ra_stats_inc_sbi(sbi, hit ? RA_STAT_HIT : RA_STAT_MISS);
+
+ /*
+ * The readahead window has been expanded to cover the whole file
+ * size; we don't care whether an RA miss happens or not, because
+ * we will read the whole file into the page cache even if some
+ * pages are missed.
+ */
+ if (ras->ras_no_miss_check)
+ GOTO(out_unlock, 0);
+
+ if (io && io->ci_rand_read)
+ GOTO(out_unlock, 0);
+
+ if (io && io->ci_seq_read) {
+ if (!hit) {
+ /* to avoid many small read RPCs here */
+ ras->ras_window_pages = sbi->ll_ra_info.ra_range_pages;
+ ll_ra_stats_inc_sbi(sbi, RA_STAT_MMAP_RANGE_READ);
+ }
+ goto skip;
+ }
+
+ if (flags & LL_RAS_MMAP) {
+ unsigned long ra_pages;
+
+ ras_detect_cluster_range(ras, sbi, index << PAGE_SHIFT,
+ PAGE_SIZE);
+ ras_detect_read_pattern(ras, sbi, (loff_t)index << PAGE_SHIFT,
+ PAGE_SIZE, true);
+
+ /* we did not detect anything but we could prefetch */
+ if (!ras->ras_need_increase_window &&
+ ras->ras_window_pages <= sbi->ll_ra_info.ra_range_pages &&
+ ras->ras_range_requests >= 2) {
+ if (!hit) {
+ ra_pages = max_t(unsigned long,
+ RA_MIN_MMAP_RANGE_PAGES,
+ ras->ras_last_range_pages);
+ if (index < ra_pages / 2)
+ index = 0;
+ else
+ index -= ra_pages / 2;
+ ras->ras_window_pages = ra_pages;
+ ll_ra_stats_inc_sbi(sbi,
+ RA_STAT_MMAP_RANGE_READ);
+ } else {
+ ras->ras_window_pages = 0;
+ }
+ goto skip;
+ }
+ }
+
+ if (!hit && ras->ras_window_pages &&
+ index < ras->ras_next_readahead_idx &&
+ pos_in_window(index, ras->ras_window_start_idx, 0,
+ ras->ras_window_pages)) {
+ ll_ra_stats_inc_sbi(sbi, RA_STAT_MISS_IN_WINDOW);
+ ras->ras_need_increase_window = false;
+
+ if (index_in_stride_window(ras, index) &&
+ stride_io_mode(ras)) {
+ /*
+ * if (index != ras->ras_last_readpage + 1)
+ * ras->ras_consecutive_pages = 0;
+ */
+ ras_reset(ras, index);
+
+ /*
+ * If stride-RA hit a cache miss, the stride
+ * detector will not be reset, to avoid the
+ * overhead of re-detecting the read-ahead mode,
+ * but only on the condition that the stride
+ * window still intersects with the normal
+ * sequential read-ahead window.
+ */
+ if (ras->ras_window_start_idx < ras->ras_stride_offset)
+ ras_stride_reset(ras);
+ RAS_CDEBUG(ras);
+ } else {
+ /*
+ * Reset both the stride window and the normal RA
+ * window.
+ */
+ ras_reset(ras, index);
+ /* ras->ras_consecutive_pages++; */
+ ras->ras_consecutive_bytes = 0;
+ ras_stride_reset(ras);
+ GOTO(out_unlock, 0);
+ }
+ }
+
+skip:
+ ras_set_start(ras, index);
+
+ if (stride_io_mode(ras)) {
+ /* Since stride readahead is sensitive to the read-ahead
+ * offset, we use the original offset here instead of
+ * ras_window_start_idx, which is RPC-aligned.
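+ *
+ * For example (hypothetical values): with ras_rpc_pages = 256,
+ * an index of 300 would be aligned down to 256 by
+ * ras_set_start() above, which may land inside the stride gap;
+ * clamping to the stride offset below keeps the window on the
+ * pattern.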
+ */
+ ras->ras_next_readahead_idx = max(index + 1,
+ ras->ras_next_readahead_idx);
+ ras->ras_window_start_idx =
+ max_t(pgoff_t, ras->ras_window_start_idx,
+ ras->ras_stride_offset >> PAGE_SHIFT);
+ } else {
+ if (ras->ras_next_readahead_idx < ras->ras_window_start_idx)
+ ras->ras_next_readahead_idx = ras->ras_window_start_idx;
+ if (!hit)
+ ras->ras_next_readahead_idx = index + 1;
+ }
+
+ if (ras->ras_need_increase_window) {
+ ras_increase_window(inode, ras, ra);
+ ras->ras_need_increase_window = false;
+ }
+
+ EXIT;
+out_unlock:
+ spin_unlock(&ras->ras_lock);
+}
+
+int ll_writepage(struct page *vmpage, struct writeback_control *wbc)
+{
+ struct inode *inode = vmpage->mapping->host;
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct lu_env *env;
+ struct cl_io *io;
+ struct cl_page *page;
+ struct cl_object *clob;
+ bool redirtied = false;
+ bool unlocked = false;
+ int result;
+ __u16 refcheck;
+ ENTRY;
+
+ LASSERT(PageLocked(vmpage));
+ LASSERT(!PageWriteback(vmpage));
+
+ LASSERT(ll_i2dtexp(inode) != NULL);
+
+ env = cl_env_get(&refcheck);
+ if (IS_ERR(env))
+ GOTO(out, result = PTR_ERR(env));
+
+ clob = ll_i2info(inode)->lli_clob;
+ LASSERT(clob != NULL);
+
+ io = vvp_env_thread_io(env);
+ io->ci_obj = clob;
+ io->ci_ignore_layout = 1;
+ result = cl_io_init(env, io, CIT_MISC, clob);
+ if (result == 0) {
+ page = cl_page_find(env, clob, vmpage->index,
+ vmpage, CPT_CACHEABLE);
+ if (!IS_ERR(page)) {
+ lu_ref_add(&page->cp_reference, "writepage",
+ current);
+ cl_page_assume(env, io, page);
+ result = cl_page_flush(env, io, page);
+ if (result != 0) {
+ /*
+ * Re-dirty page on error so it retries write,
+ * but not in case when IO has actually
+ * occurred and completed with an error.
+ */
+ if (!PageError(vmpage)) {
+ redirty_page_for_writepage(wbc, vmpage);
+ result = 0;
+ redirtied = true;
+ }
+ }
+ cl_page_disown(env, io, page);
+ unlocked = true;
+ lu_ref_del(&page->cp_reference,
+ "writepage", current);
+ cl_page_put(env, page);
+ } else {
+ result = PTR_ERR(page);
+ }
+ }
+ cl_io_fini(env, io);
+
+ if (redirtied && wbc->sync_mode == WB_SYNC_ALL) {
+ loff_t offset = cl_offset(clob, vmpage->index);
+
+ /* Flushing the page failed because the extent is being
+ * written out. Wait for the extent write to finish to avoid
+ * breaking the kernel, which assumes ->writepage should
+ * either mark PageWriteback or clean the page. */
+ result = cl_sync_file_range(inode, offset,
+ offset + PAGE_SIZE - 1,
+ CL_FSYNC_LOCAL, 1);
+ if (result > 0) {
+ /* We may actually have written more than one page;
+ * decrease nr_to_write by the extra pages, because
+ * the caller will count this one.
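+ *
+ * For example, if cl_sync_file_range() reports 4
+ * pages written, nr_to_write is reduced by 3 here
+ * and the caller accounts for the remaining one.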
*/ + wbc->nr_to_write -= result - 1; + result = 0; + } + } + + cl_env_put(env, &refcheck); + GOTO(out, result); + +out: + if (result < 0) { + if (!lli->lli_async_rc) + lli->lli_async_rc = result; + SetPageError(vmpage); + if (!unlocked) + unlock_page(vmpage); + } + return result; +} + +int ll_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + loff_t start; + loff_t end; + enum cl_fsync_mode mode; + int range_whole = 0; + int result; + ENTRY; + + if (wbc->range_cyclic) { + start = (loff_t)mapping->writeback_index << PAGE_SHIFT; + end = OBD_OBJECT_EOF; + } else { + start = wbc->range_start; + end = wbc->range_end; + if (end == LLONG_MAX) { + end = OBD_OBJECT_EOF; + range_whole = start == 0; + } + } + + mode = CL_FSYNC_NONE; + if (wbc->sync_mode == WB_SYNC_ALL) + mode = CL_FSYNC_LOCAL; + + if (ll_i2info(inode)->lli_clob == NULL) + RETURN(0); + + /* for directio, it would call writepages() to evict cached pages + * inside the IO context of write, which will cause deadlock at + * layout_conf since it waits for active IOs to complete. */ + result = cl_sync_file_range(inode, start, end, mode, 1); + if (result > 0) { + wbc->nr_to_write -= result; + result = 0; + } + + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) { + if (end == OBD_OBJECT_EOF) + mapping->writeback_index = 0; + else + mapping->writeback_index = (end >> PAGE_SHIFT) + 1; + } + RETURN(result); +} + +struct ll_cl_context *ll_cl_find(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_cl_context *lcc; + struct ll_cl_context *found = NULL; + + read_lock(&lli->lli_lock); + list_for_each_entry(lcc, &lli->lli_lccs, lcc_list) { + if (lcc->lcc_cookie == current) { + found = lcc; + break; + } + } + read_unlock(&lli->lli_lock); + + return found; +} + +void ll_cl_add(struct inode *inode, const struct lu_env *env, struct cl_io *io, + enum lcc_type type) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_cl_context *lcc = &ll_env_info(env)->lti_io_ctx; + + memset(lcc, 0, sizeof(*lcc)); + INIT_LIST_HEAD(&lcc->lcc_list); + lcc->lcc_cookie = current; + lcc->lcc_env = env; + lcc->lcc_io = io; + lcc->lcc_type = type; + + write_lock(&lli->lli_lock); + list_add(&lcc->lcc_list, &lli->lli_lccs); + write_unlock(&lli->lli_lock); +} + +void ll_cl_remove(struct inode *inode, const struct lu_env *env) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_cl_context *lcc = &ll_env_info(env)->lti_io_ctx; + + write_lock(&lli->lli_lock); + list_del_init(&lcc->lcc_list); + write_unlock(&lli->lli_lock); +} + +int ll_io_read_page(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, struct file *file) +{ + struct inode *inode = vvp_object_inode(page->cp_obj); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_file_data *fd = NULL; + struct ll_readahead_state *ras = NULL; + struct cl_2queue *queue = &io->ci_queue; + struct cl_sync_io *anchor = NULL; + struct vvp_page *vpg; + int rc = 0, rc2 = 0; + bool uptodate; + struct vvp_io *vio = vvp_env_io(env); + bool mmap = !vio->vui_ra_valid; + pgoff_t ra_start_index = 0; + pgoff_t io_start_index; + pgoff_t io_end_index; + bool unlockpage = true; + ENTRY; + + if (file) { + fd = file->private_data; + ras = &fd->fd_ras; + } + + /* PagePrivate2 is set in ll_io_zero_page() to tell us the vmpage + * must not be unlocked after processing. 
+ */
+ if (page->cp_vmpage && PagePrivate2(page->cp_vmpage))
+ unlockpage = false;
+
+ vpg = cl2vvp_page(cl_object_page_slice(page->cp_obj, page));
+ uptodate = vpg->vpg_defer_uptodate;
+
+ if (ll_readahead_enabled(sbi) && !vpg->vpg_ra_updated && ras) {
+ enum ras_update_flags flags = 0;
+
+ if (uptodate)
+ flags |= LL_RAS_HIT;
+ if (mmap)
+ flags |= LL_RAS_MMAP;
+ ras_update(sbi, inode, ras, vvp_index(vpg), flags, io);
+ }
+
+ cl_2queue_init(queue);
+ if (uptodate) {
+ vpg->vpg_ra_used = 1;
+ cl_page_export(env, page, 1);
+ cl_page_disown(env, io, page);
+ } else {
+ anchor = &vvp_env_info(env)->vti_anchor;
+ cl_sync_io_init(anchor, 1);
+ page->cp_sync_io = anchor;
+
+ cl_2queue_add(queue, page, true);
+ }
+
+ /* mmap does not set the ci_rw fields */
+ if (!mmap) {
+ io_start_index = cl_index(io->ci_obj, io->u.ci_rw.crw_pos);
+ io_end_index = cl_index(io->ci_obj, io->u.ci_rw.crw_pos +
+ io->u.ci_rw.crw_count - 1);
+ } else {
+ io_start_index = vvp_index(vpg);
+ io_end_index = vvp_index(vpg);
+ }
+
+ if (ll_readahead_enabled(sbi) && ras && !io->ci_rand_read) {
+ pgoff_t skip_index = 0;
+
+ if (ras->ras_next_readahead_idx < vvp_index(vpg))
+ skip_index = vvp_index(vpg);
+ rc2 = ll_readahead(env, io, &queue->c2_qin, ras,
+ uptodate, file, skip_index,
+ &ra_start_index);
+ CDEBUG(D_READA|D_IOTRACE,
+ DFID " %d pages read ahead at %lu, triggered by user read at %lu\n",
+ PFID(ll_inode2fid(inode)), rc2, ra_start_index,
+ vvp_index(vpg));
+ } else if (vvp_index(vpg) == io_start_index &&
+ io_end_index - io_start_index > 0) {
+ rc2 = ll_readpages(env, io, &queue->c2_qin, io_start_index + 1,
+ io_end_index);
+ CDEBUG(D_READA, DFID " %d pages read at %lu\n",
+ PFID(ll_inode2fid(inode)), rc2, vvp_index(vpg));
+ }
+
+ if (queue->c2_qin.pl_nr > 0) {
+ int count = queue->c2_qin.pl_nr;
+ rc = cl_io_submit_rw(env, io, CRT_READ, queue);
+ if (rc == 0)
+ task_io_account_read(PAGE_SIZE * count);
+ }
+
+ if (anchor != NULL && !cl_page_is_owned(page, io)) { /* have sent */
+ rc = cl_sync_io_wait(env, anchor, 0);
+
+ cl_page_assume(env, io, page);
+ cl_page_list_del(env, &queue->c2_qout, page);
+
+ if (!PageUptodate(cl_page_vmpage(page))) {
+ /* Failed to read a mirror; discard this page so that
+ * a new page can be created with a new mirror.
+ *
+ * TODO: this is not needed after the page reinit
+ * route is implemented */
+ cl_page_discard(env, io, page);
+ }
+ if (unlockpage)
+ cl_page_disown(env, io, page);
+ }
+
+ /* TODO: discard all pages until the page reinit route is implemented */
+ cl_page_list_discard(env, io, &queue->c2_qin);
+
+ /* Unlock unsent read pages in case of error. */
+ cl_page_list_disown(env, io, &queue->c2_qin);
+
+ cl_2queue_fini(env, queue);
+
+ RETURN(rc);
+}
+
+/*
+ * Possible return values:
+ * 0 no async readahead triggered and fast read could not be used.
+ * 1 no async readahead, but fast read could be used.
+ * 2 async readahead triggered and fast read could be used too.
+ * < 0 on error.
+ */
+static int kickoff_async_readahead(struct file *file, unsigned long pages)
+{
+ struct ll_readahead_work *lrw;
+ struct inode *inode = file_inode(file);
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct ll_file_data *fd = file->private_data;
+ struct ll_readahead_state *ras = &fd->fd_ras;
+ struct ll_ra_info *ra = &sbi->ll_ra_info;
+ unsigned long throttle;
+ pgoff_t start_idx = ras_align(ras, ras->ras_next_readahead_idx);
+ pgoff_t end_idx = start_idx + pages - 1;
+
+ /**
+ * In case we have a limited max_cached_mb, readahead
+ * should be stopped if it has run out of all LRU slots.
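+ *
+ * For example (hypothetical tunable): with max_cached_mb = 128,
+ * ccc_lru_max is 32768 pages of 4 KiB, and async readahead is not
+ * kicked off once that many readahead pages are already in flight.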
+ */
+ if (atomic_read(&ra->ra_cur_pages) >= sbi->ll_cache->ccc_lru_max) {
+ ll_ra_stats_inc(inode, RA_STAT_MAX_IN_FLIGHT);
+ return 0;
+ }
+
+ throttle = min(ra->ra_async_pages_per_file_threshold,
+ ra->ra_max_pages_per_file);
+ /*
+ * If this is strided i/o or the window is smaller than the
+ * throttle limit, we do not do async readahead. Otherwise,
+ * we do async readahead, allowing the user thread to do fast i/o.
+ */
+ if (stride_io_mode(ras) || !throttle ||
+ ras->ras_window_pages < throttle ||
+ atomic_read(&ra->ra_async_inflight) > ra->ra_async_max_active)
+ return 0;
+
+ if ((atomic_read(&ra->ra_cur_pages) + pages) > ra->ra_max_pages)
+ return 0;
+
+ if (ras->ras_async_last_readpage_idx == start_idx)
+ return 1;
+
+ /* ll_readahead_work_free() frees it */
+ OBD_ALLOC_PTR(lrw);
+ if (lrw) {
+ atomic_inc(&sbi->ll_ra_info.ra_async_inflight);
+ lrw->lrw_file = get_file(file);
+ lrw->lrw_start_idx = start_idx;
+ lrw->lrw_end_idx = end_idx;
+ lrw->lrw_user_pid = current->pid;
+ spin_lock(&ras->ras_lock);
+ ras->ras_next_readahead_idx = end_idx + 1;
+ ras->ras_async_last_readpage_idx = start_idx;
+ spin_unlock(&ras->ras_lock);
+ memcpy(lrw->lrw_jobid, ll_i2info(inode)->lli_jobid,
+ sizeof(lrw->lrw_jobid));
+ ll_readahead_work_add(inode, lrw);
+ } else {
+ return -ENOMEM;
+ }
+
+ return 2;
+}
+
+/*
+ * Check if we can issue a readahead RPC; if that is
+ * the case, we can't do fast IO because we will need
+ * a cl_io to issue the RPC.
+ */
+static bool ll_use_fast_io(struct file *file,
+ struct ll_readahead_state *ras, pgoff_t index)
+{
+ unsigned long fast_read_pages =
+ max(RA_REMAIN_WINDOW_MIN, ras->ras_rpc_pages);
+ loff_t skip_pages;
+ loff_t stride_bytes = ras->ras_stride_bytes;
+
+ if (stride_io_mode(ras) && stride_bytes) {
+ skip_pages = (ras->ras_stride_length +
+ ras->ras_stride_bytes - 1) / stride_bytes;
+ skip_pages *= fast_read_pages;
+ } else {
+ skip_pages = fast_read_pages;
+ }
+
+ if (ras->ras_window_start_idx + ras->ras_window_pages <
+ ras->ras_next_readahead_idx + skip_pages ||
+ kickoff_async_readahead(file, fast_read_pages) > 0)
+ return true;
+
+ return false;
+}
+
+int ll_readpage(struct file *file, struct page *vmpage)
+{
+ struct inode *inode = file_inode(file);
+ struct cl_object *clob = ll_i2info(inode)->lli_clob;
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ const struct lu_env *env = NULL;
+ struct cl_read_ahead ra = { 0 };
+ struct ll_cl_context *lcc;
+ struct cl_io *io = NULL;
+ struct cl_page *page;
+ int result;
+ ENTRY;
+
+ if (OBD_FAIL_PRECHECK(OBD_FAIL_LLITE_READPAGE_PAUSE)) {
+ unlock_page(vmpage);
+ OBD_FAIL_TIMEOUT(OBD_FAIL_LLITE_READPAGE_PAUSE, cfs_fail_val);
+ lock_page(vmpage);
+ }
+
+ /*
+ * The @vmpage got truncated.
+ * This is a kernel bug introduced in kernel 5.12 by
+ * commit cbd59c48ae2bcadc4a7599c29cf32fd3f9b78251
+ * ("mm/filemap: use head pages in generic_file_buffered_read")
+ *
+ * The page end offset calculation in filemap_get_read_batch() was off
+ * by one. When a read is submitted with end offset 1048575, it
+ * calculates the end page for the read as 256 where it should be 255.
+ * As a result, readpage() is called for the page with index 256,
+ * which is over the stripe boundary and may not be covered by a DLM
+ * extent lock.
+ *
+ * This happens in a corner race case: filemap_get_read_batch() adds
+ * the page with index 256 for read, which is not in the current read
+ * I/O context, and this page is being invalidated and will be removed
+ * from the page cache because the lock protecting it is being revoked.
This
+ * results in this page in the read path not being covered by any DLM
+ * lock.
+ *
+ * The solution is simple. Check whether the page was truncated in
+ * ->readpage(). If so, just return AOP_TRUNCATED_PAGE to the upper
+ * caller. Then the kernel will retry batching pages, and it will not
+ * add the truncated page into batches, as it was removed from the page
+ * cache of the file.
+ */
+ if (vmpage->mapping != inode->i_mapping) {
+ unlock_page(vmpage);
+ RETURN(AOP_TRUNCATED_PAGE);
+ }
+
+ lcc = ll_cl_find(inode);
+ if (lcc != NULL) {
+ env = lcc->lcc_env;
+ io = lcc->lcc_io;
+ }
+
+ if (io == NULL) { /* fast read */
+ struct inode *inode = file_inode(file);
+ struct ll_file_data *fd = file->private_data;
+ struct ll_readahead_state *ras = &fd->fd_ras;
+ struct lu_env *local_env = NULL;
+ struct vvp_page *vpg;
+
+ CDEBUG(D_VFSTRACE, "fast read pgno: %ld\n", vmpage->index);
+
+ result = -ENODATA;
+
+ /* TODO: need to verify the layout version to make sure
+ * the page is not invalid due to layout change. */
+ page = cl_vmpage_page(vmpage, clob);
+ if (page == NULL) {
+ unlock_page(vmpage);
+ ll_ra_stats_inc_sbi(sbi, RA_STAT_FAILED_FAST_READ);
+ RETURN(result);
+ }
+
+ vpg = cl2vvp_page(cl_object_page_slice(page->cp_obj, page));
+ if (vpg->vpg_defer_uptodate) {
+ enum ras_update_flags flags = LL_RAS_HIT;
+
+ if (lcc && lcc->lcc_type == LCC_MMAP)
+ flags |= LL_RAS_MMAP;
+
+ /* For fast read, update the read-ahead state only
+ * if the page is a cache hit, because the non-cached
+ * page case will be handled by slow read later. */
+ ras_update(sbi, inode, ras, vvp_index(vpg), flags, io);
+ /* avoid duplicate ras_update() call */
+ vpg->vpg_ra_updated = 1;
+
+ if (ll_use_fast_io(file, ras, vvp_index(vpg)))
+ result = 0;
+ }
+
+ if (!env) {
+ local_env = cl_env_percpu_get();
+ env = local_env;
+ }
+
+ /* export the page and skip io stack */
+ if (result == 0) {
+ vpg->vpg_ra_used = 1;
+ cl_page_export(env, page, 1);
+ } else {
+ ll_ra_stats_inc_sbi(sbi, RA_STAT_FAILED_FAST_READ);
+ }
+ /* Release the page refcount before unlocking the page to
+ * ensure the object won't be destroyed in the calling path of
+ * cl_page_put(). Please see the comment in ll_releasepage(). */
+ cl_page_put(env, page);
+ unlock_page(vmpage);
+ if (local_env)
+ cl_env_percpu_put(local_env);
+
+ RETURN(result);
+ }
+
+ if (lcc && lcc->lcc_type != LCC_MMAP) {
+ CDEBUG(D_VFSTRACE, "pgno:%ld, beyond read end_index:%ld\n",
+ vmpage->index, lcc->lcc_end_index);
+
+ /*
+ * This handles a kernel bug introduced in kernel 5.12 by
+ * commit cbd59c48ae2bcadc4a7599c29cf32fd3f9b78251
+ * ("mm/filemap: use head pages in generic_file_buffered_read")
+ *
+ * See above in this function for a full description of the
+ * bug. Briefly, the kernel will try to read 1 more page than
+ * was actually requested *if that page is already in cache*.
+ *
+ * Because this page is beyond the boundary of the requested
+ * read, Lustre does not lock it as part of the read. This
+ * means we must check if there is a valid dlmlock on this
+ * page and reference it before we attempt to read in the
+ * page. If there is not a valid dlmlock, then we are racing
+ * with dlmlock cancellation and the page is being removed
+ * from the cache.
+ *
+ * That means we should return AOP_TRUNCATED_PAGE, which will
+ * cause the kernel to retry the read, which should allow the
+ * page to be removed from the cache as the lock is cancelled.
+ *
+ * This should never occur except in kernels with the bug
+ * mentioned above.
+ */ + if (vmpage->index >= lcc->lcc_end_index) { + result = cl_io_read_ahead(env, io, vmpage->index, &ra); + if (result < 0 || vmpage->index > ra.cra_end_idx) { + cl_read_ahead_release(env, &ra); + unlock_page(vmpage); + RETURN(AOP_TRUNCATED_PAGE); + } + } + } + + /** + * Direct read can fall back to buffered read, but DIO is done + * with lockless i/o, and buffered requires LDLM locking, so in + * this case we must restart without lockless. + */ + if (file->f_flags & O_DIRECT && + lcc && lcc->lcc_type == LCC_RW && + !io->ci_dio_lock) { + unlock_page(vmpage); + io->ci_dio_lock = 1; + io->ci_need_restart = 1; + GOTO(out, result = -ENOLCK); + } + + LASSERT(io->ci_state == CIS_IO_GOING); + page = cl_page_find(env, clob, vmpage->index, vmpage, CPT_CACHEABLE); + if (!IS_ERR(page)) { + LASSERT(page->cp_type == CPT_CACHEABLE); + if (likely(!PageUptodate(vmpage))) { + cl_page_assume(env, io, page); + + result = ll_io_read_page(env, io, page, file); + } else { + /* Page from a non-object file. */ + unlock_page(vmpage); + result = 0; + } + cl_page_put(env, page); + } else { + unlock_page(vmpage); + result = PTR_ERR(page); + } + +out: + if (ra.cra_release != NULL) + cl_read_ahead_release(env, &ra); + + /* this delay gives time for the actual read of the page to finish and + * unlock the page in vvp_page_completion_read before we return to our + * caller and the caller tries to use the page, allowing us to test + * races with the page being unlocked after readpage() but before it's + * used by the caller + */ + OBD_FAIL_TIMEOUT(OBD_FAIL_LLITE_READPAGE_PAUSE2, cfs_fail_val); + + RETURN(result); +} + +#ifdef HAVE_AOPS_READ_FOLIO +int ll_read_folio(struct file *file, struct folio *folio) +{ + return ll_readpage(file, folio_page(folio, 0)); +} +#endif diff --git a/drivers/staging/lustrefsx/lustre/llite/rw26.c b/drivers/staging/lustrefsx/lustre/llite/rw26.c new file mode 100644 index 0000000000000..792e88003fb3d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/rw26.c @@ -0,0 +1,1023 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/lustre/llite/rw26.c + * + * Lustre Lite I/O page cache routines for the 2.5/2.6 kernel version + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include "llite_internal.h" +#include + +#ifdef HAVE_INVALIDATE_FOLIO +/** + * Implements Linux VM address_space::invalidate_folio() method. 
This method is
+ * called when the folio is truncated from a file, either as a result of
+ * explicit truncate, or when the inode is removed from memory (as a result of
+ * final iput(), umount, or memory pressure induced icache shrinking).
+ *
+ * [0, offset] bytes of the folio remain valid (this is for a case of non-page
+ * aligned truncate). Lustre leaves partially truncated folios in the cache,
+ * relying on struct inode::i_size to limit further accesses.
+ */
+static void ll_invalidate_folio(struct folio *folio, size_t offset, size_t len)
+{
+	struct inode *inode;
+	struct lu_env *env;
+	struct cl_page *page;
+	struct cl_object *obj;
+
+	LASSERT(!folio_test_writeback(folio));
+	LASSERT(folio_test_locked(folio));
+
+	if (!(offset == 0 && len == folio_size(folio)) &&
+	    !folio_test_large(folio))
+		return;
+
+	/* Drop the pages from the folio */
+	env = cl_env_percpu_get();
+	LASSERT(!IS_ERR(env));
+
+	inode = folio_inode(folio);
+	obj = ll_i2info(inode)->lli_clob;
+	if (obj != NULL) {
+		int n, npgs = folio_nr_pages(folio);
+
+		for (n = 0; n < npgs; n++) {
+			struct page *vmpage = folio_page(folio, n);
+
+			LASSERT(PageLocked(vmpage));
+			LASSERT(!PageWriteback(vmpage));
+
+			page = cl_vmpage_page(vmpage, obj);
+			if (page != NULL) {
+				cl_page_delete(env, page);
+				cl_page_put(env, page);
+			}
+		}
+	} else {
+		LASSERT(!folio_get_private(folio));
+	}
+	cl_env_percpu_put(env);
+}
+#else
+
+/**
+ * Implements Linux VM address_space::invalidatepage() method. This method is
+ * called when the page is truncated from a file, either as a result of
+ * explicit truncate, or when the inode is removed from memory (as a result of
+ * final iput(), umount, or memory pressure induced icache shrinking).
+ *
+ * [0, offset] bytes of the page remain valid (this is for a case of non-page
+ * aligned truncate). Lustre leaves partially truncated pages in the cache,
+ * relying on struct inode::i_size to limit further accesses.
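+ *
+ * For example, with PAGE_SIZE 4096, truncating a file from 6000 to 5000
+ * bytes invalidates the second page only from its offset 904 onwards;
+ * bytes [0, 904) of that page stay valid, the page remains cached, and
+ * i_size clips any access beyond byte 5000.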
+ */
+static void ll_invalidatepage(struct page *vmpage,
+#ifdef HAVE_INVALIDATE_RANGE
+			      unsigned int offset, unsigned int length
+#else
+			      unsigned long offset
+#endif
+			      )
+{
+	struct inode *inode;
+	struct lu_env *env;
+	struct cl_page *page;
+	struct cl_object *obj;
+
+	LASSERT(PageLocked(vmpage));
+	LASSERT(!PageWriteback(vmpage));
+
+	/*
+	 * It is safe to not check anything in invalidatepage/releasepage
+	 * below because they are run with page locked and all our io is
+	 * happening with locked page too
+	 */
+#ifdef HAVE_INVALIDATE_RANGE
+	if (offset == 0 && length == PAGE_SIZE) {
+#else
+	if (offset == 0) {
+#endif
+		/* See the comment in ll_releasepage() */
+		env = cl_env_percpu_get();
+		LASSERT(!IS_ERR(env));
+
+		inode = vmpage->mapping->host;
+		obj = ll_i2info(inode)->lli_clob;
+		if (obj != NULL) {
+			page = cl_vmpage_page(vmpage, obj);
+			if (page != NULL) {
+				cl_page_delete(env, page);
+				cl_page_put(env, page);
+			}
+		} else
+			LASSERT(vmpage->private == 0);
+
+		cl_env_percpu_put(env);
+	}
+
+	if (OBD_FAIL_PRECHECK(OBD_FAIL_LLITE_PAGE_INVALIDATE_PAUSE)) {
+		unlock_page(vmpage);
+		OBD_FAIL_TIMEOUT(OBD_FAIL_LLITE_PAGE_INVALIDATE_PAUSE,
+				 cfs_fail_val);
+		lock_page(vmpage);
+	}
+}
+#endif
+
+static bool do_release_page(struct page *vmpage, gfp_t wait)
+{
+	struct lu_env *env;
+	struct cl_object *obj;
+	struct cl_page *page;
+	struct address_space *mapping;
+	int result = 0;
+
+	ENTRY;
+
+	LASSERT(PageLocked(vmpage));
+	if (PageWriteback(vmpage) || PageDirty(vmpage))
+		RETURN(0);
+
+	mapping = vmpage->mapping;
+	if (mapping == NULL)
+		RETURN(1);
+
+	obj = ll_i2info(mapping->host)->lli_clob;
+	if (obj == NULL)
+		RETURN(1);
+
+	page = cl_vmpage_page(vmpage, obj);
+	if (page == NULL)
+		RETURN(1);
+
+	env = cl_env_percpu_get();
+	LASSERT(!IS_ERR(env));
+
+	if (!cl_page_in_use(page)) {
+		result = 1;
+		cl_page_delete(env, page);
+	}
+
+	/* To use the percpu env array, the call path must not be rescheduled;
+	 * otherwise the percpu array will be corrupted if ll_releasepage() is
+	 * called again on the same CPU.
+	 *
+	 * If this page holds the last refc of cl_object, the following
+	 * call path may cause reschedule:
+	 *   cl_page_put -> cl_page_free -> cl_object_put ->
+	 *     lu_object_put -> lu_object_free -> lov_delete_raid0.
+	 *
+	 * However, the kernel can't get rid of this inode until all pages have
+	 * been cleaned up. Now that we hold the page lock here, it's pretty
+	 * safe that we won't get into the object delete path.
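+	 *
+	 * In other words, users of the percpu env must follow a strictly
+	 * non-sleeping bracket pattern (sketch):
+	 *
+	 *	env = cl_env_percpu_get();
+	 *	... no blocking or rescheduling here ...
+	 *	cl_env_percpu_put(env);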
+ */ + LASSERT(cl_object_refc(obj) > 1); + cl_page_put(env, page); + + cl_env_percpu_put(env); + RETURN(result); +} + +#ifdef HAVE_AOPS_RELEASE_FOLIO +static bool ll_release_folio(struct folio *folio, gfp_t wait) +{ + struct page *vmpage = folio_page(folio, 0); + + /* folio_nr_pages(folio) == 1 is fixed with grab_cache_page* */ + BUG_ON(folio_nr_pages(folio) != 1); + + return do_release_page(vmpage, wait); +} +#else /* !HAVE_AOPS_RELEASE_FOLIO */ +#ifdef HAVE_RELEASEPAGE_WITH_INT +#define RELEASEPAGE_ARG_TYPE int +#else +#define RELEASEPAGE_ARG_TYPE gfp_t +#endif +static int ll_releasepage(struct page *vmpage, RELEASEPAGE_ARG_TYPE gfp_mask) +{ + return do_release_page(vmpage, gfp_mask); +} +#endif /* HAVE_AOPS_RELEASE_FOLIO */ + +static ssize_t ll_get_user_pages(int rw, struct iov_iter *iter, + struct page ***pages, ssize_t *npages, + size_t maxsize) +{ +#if defined(HAVE_DIO_ITER) + size_t start; + size_t result; + + result = iov_iter_get_pages_alloc2(iter, pages, maxsize, &start); + if (result > 0) + *npages = DIV_ROUND_UP(result + start, PAGE_SIZE); + + return result; +#else + unsigned long addr; + size_t page_count; + size_t size; + long result; + + if (!maxsize) + return 0; + + if (!iter->nr_segs) + return 0; + + addr = (unsigned long)iter->iov->iov_base + iter->iov_offset; + if (addr & ~PAGE_MASK) + return -EINVAL; + + size = min_t(size_t, maxsize, iter->iov->iov_len); + page_count = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + OBD_ALLOC_PTR_ARRAY_LARGE(*pages, page_count); + if (*pages == NULL) + return -ENOMEM; + + mmap_read_lock(current->mm); + result = get_user_pages(current, current->mm, addr, page_count, + rw == READ, 0, *pages, NULL); + mmap_read_unlock(current->mm); + + if (unlikely(result != page_count)) { + ll_release_user_pages(*pages, page_count); + *pages = NULL; + + if (result >= 0) + return -EFAULT; + + return result; + } + *npages = page_count; + + return size; +#endif +} + +/* iov_iter_alignment() is introduced in 3.16 similar to HAVE_DIO_ITER */ +#if defined(HAVE_DIO_ITER) +static unsigned long iov_iter_alignment_vfs(const struct iov_iter *i) +{ + return iov_iter_alignment(i); +} +#else /* copied from alignment_iovec() */ +static unsigned long iov_iter_alignment_vfs(const struct iov_iter *i) +{ + const struct iovec *iov = i->iov; + unsigned long res; + size_t size = i->count; + size_t n; + + if (!size) + return 0; + + res = (unsigned long)iov->iov_base + i->iov_offset; + n = iov->iov_len - i->iov_offset; + if (n >= size) + return res | size; + + size -= n; + res |= n; + while (size > (++iov)->iov_len) { + res |= (unsigned long)iov->iov_base | iov->iov_len; + size -= iov->iov_len; + } + res |= (unsigned long)iov->iov_base | size; + + return res; +} +#endif + +/* + * Lustre could relax a bit for alignment, io count is not + * necessary page alignment. 
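+ *
+ * For example, a page-aligned buffer of two pages plus a 512-byte tail
+ * is accepted: ll_iov_iter_alignment() below only requires the
+ * page-multiple prefix to be page aligned; the sub-page tail is exempt
+ * from the length-alignment check.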
+ */ +static unsigned long ll_iov_iter_alignment(struct iov_iter *i) +{ + size_t orig_size = i->count; + size_t count = orig_size & ~PAGE_MASK; + unsigned long res; + + if (!count) + return iov_iter_alignment_vfs(i); + + if (orig_size > PAGE_SIZE) { + iov_iter_truncate(i, orig_size - count); + res = iov_iter_alignment_vfs(i); + iov_iter_reexpand(i, orig_size); + + return res; + } + + res = iov_iter_alignment_vfs(i); + /* start address is page aligned */ + if ((res & ~PAGE_MASK) == orig_size) + return PAGE_SIZE; + + return res; +} + +static int +ll_direct_rw_pages(const struct lu_env *env, struct cl_io *io, size_t size, + int rw, struct inode *inode, struct cl_sub_dio *sdio) +{ + struct ll_dio_pages *pv = &sdio->csd_dio_pages; + struct cl_page *page; + struct cl_2queue *queue = &io->ci_queue; + struct cl_object *obj = io->ci_obj; + struct cl_sync_io *anchor = &sdio->csd_sync; + loff_t offset = pv->ldp_file_offset; + int io_pages = 0; + size_t page_size = cl_page_size(obj); + int i; + ssize_t rc = 0; + + ENTRY; + + cl_2queue_init(queue); + for (i = 0; i < pv->ldp_count; i++) { + LASSERT(!(offset & (PAGE_SIZE - 1))); + page = cl_page_find(env, obj, cl_index(obj, offset), + pv->ldp_pages[i], CPT_TRANSIENT); + if (IS_ERR(page)) { + rc = PTR_ERR(page); + break; + } + LASSERT(page->cp_type == CPT_TRANSIENT); + rc = cl_page_own(env, io, page); + if (rc) { + cl_page_put(env, page); + break; + } + + page->cp_sync_io = anchor; + if (inode && IS_ENCRYPTED(inode)) { + /* In case of Direct IO on encrypted file, we need to + * add a reference to the inode on the cl_page. + * This info is required by llcrypt to proceed + * to encryption/decryption. + * This is safe because we know these pages are private + * to the thread doing the Direct IO. + */ + page->cp_inode = inode; + } + /* We keep the refcount from cl_page_find, so we don't need + * another one here + */ + cl_2queue_add(queue, page, false); + /* + * Set page clip to tell transfer formation engine + * that page has to be sent even if it is beyond KMS. + */ + if (size < page_size) + cl_page_clip(env, page, 0, size); + ++io_pages; + + offset += page_size; + size -= page_size; + } + if (rc == 0 && io_pages > 0) { + int iot = rw == READ ? CRT_READ : CRT_WRITE; + + atomic_add(io_pages, &anchor->csi_sync_nr); + /* + * Avoid out-of-order execution of adding inflight + * modifications count and io submit. + */ + smp_mb(); + rc = cl_io_submit_rw(env, io, iot, queue); + if (rc == 0) { + cl_page_list_splice(&queue->c2_qout, &sdio->csd_pages); + } else { + atomic_add(-queue->c2_qin.pl_nr, + &anchor->csi_sync_nr); + cl_page_list_for_each(page, &queue->c2_qin) + page->cp_sync_io = NULL; + } + /* handle partially submitted reqs */ + if (queue->c2_qin.pl_nr > 0) { + CERROR(DFID " failed to submit %d dio pages: %zd\n", + PFID(lu_object_fid(&obj->co_lu)), + queue->c2_qin.pl_nr, rc); + if (rc == 0) + rc = -EIO; + } + } + + cl_2queue_discard(env, io, queue); + cl_2queue_disown(env, io, queue); + cl_2queue_fini(env, queue); + RETURN(rc); +} + +#ifdef KMALLOC_MAX_SIZE +#define MAX_MALLOC KMALLOC_MAX_SIZE +#else +#define MAX_MALLOC (128 * 1024) +#endif + +/* This is the maximum size of a single O_DIRECT request, based on the + * kmalloc limit. We need to fit all of the brw_page structs, each one + * representing PAGE_SIZE worth of user data, into a single buffer, and + * then truncate this to be a full-sized RPC. For 4kB PAGE_SIZE this is + * up to 22MB for 128kB kmalloc and up to 682MB for 4MB kmalloc. 
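+ *
+ * As a worked example, assuming a 24-byte struct brw_page (the exact
+ * size is architecture dependent): 131072 / 24 = 5461 pages, and
+ * 5461 * 4096 bytes is roughly 21 MiB, which the mask below then
+ * rounds down to a multiple of DT_MAX_BRW_SIZE.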
*/ +#define MAX_DIO_SIZE ((MAX_MALLOC / sizeof(struct brw_page) * PAGE_SIZE) & \ + ~((size_t)DT_MAX_BRW_SIZE - 1)) + +static ssize_t +ll_direct_IO_impl(struct kiocb *iocb, struct iov_iter *iter, int rw) +{ + struct ll_cl_context *lcc; + const struct lu_env *env; + struct cl_io *io; + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct cl_dio_aio *ll_dio_aio; + struct cl_sub_dio *ldp_aio; + size_t count = iov_iter_count(iter); + ssize_t tot_bytes = 0, result = 0; + loff_t file_offset = iocb->ki_pos; + bool sync_submit = false; + struct vvp_io *vio; + ssize_t rc2; + + /* Check EOF by ourselves */ + if (rw == READ && file_offset >= i_size_read(inode)) + return 0; + + /* FIXME: io smaller than PAGE_SIZE is broken on ia64 ??? */ + if (file_offset & ~PAGE_MASK) + RETURN(-EINVAL); + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), size=%zd (max %lu), " + "offset=%lld=%llx, pages %zd (max %lu)\n", + PFID(ll_inode2fid(inode)), inode, count, MAX_DIO_SIZE, + file_offset, file_offset, count >> PAGE_SHIFT, + MAX_DIO_SIZE >> PAGE_SHIFT); + + /* Check that all user buffers are aligned as well */ + if (ll_iov_iter_alignment(iter) & ~PAGE_MASK) + RETURN(-EINVAL); + + lcc = ll_cl_find(inode); + if (lcc == NULL) + RETURN(-EIO); + + env = lcc->lcc_env; + LASSERT(!IS_ERR(env)); + vio = vvp_env_io(env); + io = lcc->lcc_io; + LASSERT(io != NULL); + + ll_dio_aio = io->ci_dio_aio; + LASSERT(ll_dio_aio); + LASSERT(ll_dio_aio->cda_iocb == iocb); + + /* We cannot do parallel submission of sub-I/Os - for AIO or regular + * DIO - unless lockless because it causes us to release the lock + * early. + * + * There are also several circumstances in which we must disable + * parallel DIO, so we check if it is enabled. + * + * The check for "is_sync_kiocb" excludes AIO, which does not need to + * be disabled in these situations. 
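+	 *
+	 * In summary (sketch of the condition below):
+	 *	ci_dio_lock set                 -> submit synchronously
+	 *	sync kiocb && !ci_parallel_dio  -> submit synchronously
+	 *	otherwise (including any AIO)   -> parallel submission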
+ */
+	if (io->ci_dio_lock || (is_sync_kiocb(iocb) && !io->ci_parallel_dio))
+		sync_submit = true;
+
+	while (iov_iter_count(iter)) {
+		struct ll_dio_pages *pvec;
+		struct page **pages;
+
+		count = min_t(size_t, iov_iter_count(iter), MAX_DIO_SIZE);
+		if (rw == READ) {
+			if (file_offset >= i_size_read(inode))
+				break;
+
+			if (file_offset + count > i_size_read(inode))
+				count = i_size_read(inode) - file_offset;
+		}
+
+		/* if we are doing sync_submit, then we free this below,
+		 * otherwise it is freed on the final call to cl_sync_io_note
+		 * (either in this function or from a ptlrpcd daemon)
+		 */
+		ldp_aio = cl_sub_dio_alloc(ll_dio_aio, sync_submit);
+		if (!ldp_aio)
+			GOTO(out, result = -ENOMEM);
+
+		pvec = &ldp_aio->csd_dio_pages;
+
+		result = ll_get_user_pages(rw, iter, &pages,
+					   &pvec->ldp_count, count);
+		if (unlikely(result <= 0)) {
+			cl_sync_io_note(env, &ldp_aio->csd_sync, result);
+			if (sync_submit) {
+				LASSERT(ldp_aio->csd_creator_free);
+				cl_sub_dio_free(ldp_aio);
+			}
+			GOTO(out, result);
+		}
+
+		count = result;
+		pvec->ldp_file_offset = file_offset;
+		pvec->ldp_pages = pages;
+
+		result = ll_direct_rw_pages(env, io, count,
+					    rw, inode, ldp_aio);
+		/* We've submitted pages and can now remove the extra
+		 * reference for that
+		 */
+		cl_sync_io_note(env, &ldp_aio->csd_sync, result);
+
+		if (sync_submit) {
+			rc2 = cl_sync_io_wait(env, &ldp_aio->csd_sync,
+					      0);
+			if (result == 0 && rc2)
+				result = rc2;
+			LASSERT(ldp_aio->csd_creator_free);
+			cl_sub_dio_free(ldp_aio);
+		}
+		if (unlikely(result < 0))
+			GOTO(out, result);
+
+		iov_iter_advance(iter, count);
+		tot_bytes += count;
+		file_offset += count;
+	}
+
+out:
+	ll_dio_aio->cda_bytes += tot_bytes;
+
+	if (rw == WRITE)
+		vio->u.readwrite.vui_written += tot_bytes;
+	else
+		vio->u.readwrite.vui_read += tot_bytes;
+
+	/* AIO is not supported on pipes, so we cannot return EIOCBQUEUED like
+	 * we normally would for both DIO and AIO here
+	 */
+	if (result == 0 && !iov_iter_is_pipe(iter))
+		result = -EIOCBQUEUED;
+
+	return result;
+}
+
+#if defined(HAVE_DIO_ITER)
+static ssize_t ll_direct_IO(
+#ifndef HAVE_IOV_ITER_RW
+	     int rw,
+#endif
+	     struct kiocb *iocb, struct iov_iter *iter
+#ifndef HAVE_DIRECTIO_2ARGS
+	     , loff_t file_offset
+#endif
+	     )
+{
+	int nrw;
+
+#ifndef HAVE_IOV_ITER_RW
+	nrw = rw;
+#else
+	nrw = iov_iter_rw(iter);
+#endif
+
+	return ll_direct_IO_impl(iocb, iter, nrw);
+}
+
+#else /* !defined(HAVE_DIO_ITER) */
+
+static ssize_t
+ll_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
+	     loff_t file_offset, unsigned long nr_segs)
+{
+	struct iov_iter iter;
+
+	iov_iter_init(&iter, iov, nr_segs, iov_length(iov, nr_segs), 0);
+	return ll_direct_IO_impl(iocb, &iter, rw);
+}
+
+#endif /* !defined(HAVE_DIO_ITER) */
+
+/**
+ * Prepare partially written-to page for a write.
+ * @pg is owned when passed in and disowned when it returns non-zero result to
+ * the caller.
+ */
+static int ll_prepare_partial_page(const struct lu_env *env, struct cl_io *io,
+				   struct cl_page *pg, struct file *file)
+{
+	struct cl_attr *attr = vvp_env_thread_attr(env);
+	struct cl_object *obj = io->ci_obj;
+	struct vvp_page *vpg = cl_object_page_slice(obj, pg);
+	loff_t offset = cl_offset(obj, vvp_index(vpg));
+	int result;
+	ENTRY;
+
+	cl_object_attr_lock(obj);
+	result = cl_object_attr_get(env, obj, attr);
+	cl_object_attr_unlock(obj);
+	if (result) {
+		cl_page_disown(env, io, pg);
+		GOTO(out, result);
+	}
+
+	/*
+	 * If we are writing to a new page, no need to read old data.
+ * The extent locking will have updated the KMS, and for our + * purposes here we can treat it like i_size. + */ + if (attr->cat_kms <= offset) { + char *kaddr = kmap_atomic(vpg->vpg_page); + + memset(kaddr, 0, cl_page_size(obj)); + kunmap_atomic(kaddr); + GOTO(out, result = 0); + } + + if (vpg->vpg_defer_uptodate) { + vpg->vpg_ra_used = 1; + GOTO(out, result = 0); + } + + result = ll_io_read_page(env, io, pg, file); + if (result) + GOTO(out, result); + + /* ll_io_read_page() disowns the page */ + result = cl_page_own(env, io, pg); + if (!result) { + if (!PageUptodate(cl_page_vmpage(pg))) { + cl_page_disown(env, io, pg); + result = -EIO; + } + } else if (result == -ENOENT) { + /* page was truncated */ + result = -EAGAIN; + } + EXIT; + +out: + return result; +} + +static int ll_tiny_write_begin(struct page *vmpage, struct address_space *mapping) +{ + /* Page must be present, up to date, dirty, and not in writeback. */ + if (!vmpage || !PageUptodate(vmpage) || !PageDirty(vmpage) || + PageWriteback(vmpage) || vmpage->mapping != mapping) + return -ENODATA; + + return 0; +} + +static int ll_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned int len, +#ifdef HAVE_GRAB_CACHE_PAGE_WRITE_BEGIN_WITH_FLAGS + unsigned int flags, +#endif + struct page **pagep, void **fsdata) +{ + struct ll_cl_context *lcc = NULL; + const struct lu_env *env = NULL; + struct cl_io *io = NULL; + struct cl_page *page = NULL; + struct inode *inode = file_inode(file); + struct cl_object *clob = ll_i2info(mapping->host)->lli_clob; + pgoff_t index = pos >> PAGE_SHIFT; + struct page *vmpage = NULL; + unsigned from = pos & (PAGE_SIZE - 1); + unsigned to = from + len; + int result = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "Writing %lu of %d to %d bytes\n", index, from, len); + + lcc = ll_cl_find(inode); + if (lcc == NULL) { + vmpage = grab_cache_page_nowait(mapping, index); + result = ll_tiny_write_begin(vmpage, mapping); + GOTO(out, result); + } + + env = lcc->lcc_env; + io = lcc->lcc_io; + + if (file->f_flags & O_DIRECT) { + /* direct IO failed because it couldn't clean up cached pages, + * this causes a problem for mirror write because the cached + * page may belong to another mirror, which will result in + * problem submitting the I/O. */ + if (io->ci_designated_mirror > 0) + GOTO(out, result = -EBUSY); + + /** + * Direct write can fall back to buffered read, but DIO is done + * with lockless i/o, and buffered requires LDLM locking, so + * in this case we must restart without lockless. + */ + if (!io->ci_dio_lock) { + io->ci_dio_lock = 1; + io->ci_need_restart = 1; + GOTO(out, result = -ENOLCK); + } + } +again: + /* To avoid deadlock, try to lock page first. */ + vmpage = grab_cache_page_nowait(mapping, index); + + if (unlikely(vmpage == NULL || + PageDirty(vmpage) || PageWriteback(vmpage))) { + struct vvp_io *vio = vvp_env_io(env); + struct cl_page_list *plist = &vio->u.readwrite.vui_queue; + + /* if the page is already in dirty cache, we have to commit + * the pages right now; otherwise, it may cause deadlock + * because it holds page lock of a dirty page and request for + * more grants. It's okay for the dirty page to be the first + * one in commit page list, though. 
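+		 *
+		 * The retry protocol here is roughly (sketch):
+		 *	grab_cache_page_nowait() fails or page busy ->
+		 *	release our page ref, vvp_io_write_commit() the
+		 *	queued pages, then block in
+		 *	grab_cache_page_write_begin(); if the page was
+		 *	truncated meanwhile (mapping changed), goto again.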
*/ + if (vmpage != NULL && plist->pl_nr > 0) { + unlock_page(vmpage); + put_page(vmpage); + vmpage = NULL; + } + + /* commit pages and then wait for page lock */ + result = vvp_io_write_commit(env, io); + if (result < 0) + GOTO(out, result); + + if (vmpage == NULL) { + vmpage = grab_cache_page_write_begin(mapping, index +#ifdef HAVE_GRAB_CACHE_PAGE_WRITE_BEGIN_WITH_FLAGS + , flags +#endif + ); + if (vmpage == NULL) + GOTO(out, result = -ENOMEM); + } + } + + /* page was truncated */ + if (mapping != vmpage->mapping) { + CDEBUG(D_VFSTRACE, "page: %lu was truncated\n", index); + unlock_page(vmpage); + put_page(vmpage); + vmpage = NULL; + goto again; + } + + page = cl_page_find(env, clob, vmpage->index, vmpage, CPT_CACHEABLE); + if (IS_ERR(page)) + GOTO(out, result = PTR_ERR(page)); + + lcc->lcc_page = page; + lu_ref_add(&page->cp_reference, "cl_io", io); + + cl_page_assume(env, io, page); + if (!PageUptodate(vmpage)) { + /* + * We're completely overwriting an existing page, + * so _don't_ set it up to date until commit_write + */ + if (from == 0 && to == PAGE_SIZE) { + CL_PAGE_HEADER(D_PAGE, env, page, "full page write\n"); + POISON_PAGE(vmpage, 0x11); + } else { + /* TODO: can be optimized at OSC layer to check if it + * is a lockless IO. In that case, it's not necessary + * to read the data. */ + result = ll_prepare_partial_page(env, io, page, file); + if (result) { + /* vmpage should have been unlocked */ + put_page(vmpage); + vmpage = NULL; + + if (result == -EAGAIN) + goto again; + GOTO(out, result); + } + } + } + EXIT; +out: + if (result < 0) { + if (vmpage != NULL) { + unlock_page(vmpage); + put_page(vmpage); + } + /* On tiny_write failure, page and io are always null. */ + if (!IS_ERR_OR_NULL(page)) { + lu_ref_del(&page->cp_reference, "cl_io", io); + cl_page_put(env, page); + } + if (io) + io->ci_result = result; + } else { + *pagep = vmpage; + *fsdata = lcc; + } + RETURN(result); +} + +static int ll_tiny_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned int len, unsigned int copied, + struct page *vmpage) +{ + struct cl_page *clpage = (struct cl_page *) vmpage->private; + loff_t kms = pos+copied; + loff_t to = kms & (PAGE_SIZE-1) ? kms & (PAGE_SIZE-1) : PAGE_SIZE; + __u16 refcheck; + struct lu_env *env = cl_env_get(&refcheck); + int rc = 0; + + ENTRY; + + if (IS_ERR(env)) { + rc = PTR_ERR(env); + goto out; + } + + /* This page is dirty in cache, so it should have a cl_page pointer + * set in vmpage->private. + */ + LASSERT(clpage != NULL); + + if (copied == 0) + goto out_env; + + /* Update the underlying size information in the OSC/LOV objects this + * page is part of. + */ + cl_page_touch(env, clpage, to); + +out_env: + cl_env_put(env, &refcheck); + +out: + /* Must return page unlocked. 
*/ + unlock_page(vmpage); + + RETURN(rc); +} + +static int ll_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *vmpage, void *fsdata) +{ + struct ll_cl_context *lcc = fsdata; + const struct lu_env *env; + struct cl_io *io; + struct vvp_io *vio; + struct cl_page *page; + unsigned from = pos & (PAGE_SIZE - 1); + bool unplug = false; + int result = 0; + ENTRY; + + put_page(vmpage); + + CDEBUG(D_VFSTRACE, "pos %llu, len %u, copied %u\n", pos, len, copied); + + if (lcc == NULL) { + result = ll_tiny_write_end(file, mapping, pos, len, copied, + vmpage); + GOTO(out, result); + } + + LASSERT(lcc != NULL); + env = lcc->lcc_env; + page = lcc->lcc_page; + io = lcc->lcc_io; + vio = vvp_env_io(env); + + LASSERT(cl_page_is_owned(page, io)); + if (copied > 0) { + struct cl_page_list *plist = &vio->u.readwrite.vui_queue; + + lcc->lcc_page = NULL; /* page will be queued */ + + /* Add it into write queue */ + cl_page_list_add(plist, page, true); + if (plist->pl_nr == 1) /* first page */ + vio->u.readwrite.vui_from = from; + else + LASSERT(from == 0); + vio->u.readwrite.vui_to = from + copied; + + /* To address the deadlock in balance_dirty_pages() where + * this dirty page may be written back in the same thread. */ + if (PageDirty(vmpage)) + unplug = true; + + /* We may have one full RPC, commit it soon */ + if (plist->pl_nr >= PTLRPC_MAX_BRW_PAGES) + unplug = true; + + CL_PAGE_DEBUG(D_VFSTRACE, env, page, + "queued page: %d.\n", plist->pl_nr); + } else { + cl_page_disown(env, io, page); + + lcc->lcc_page = NULL; + lu_ref_del(&page->cp_reference, "cl_io", io); + cl_page_put(env, page); + + /* page list is not contiguous now, commit it now */ + unplug = true; + } + if (unplug || io->u.ci_wr.wr_sync) + result = vvp_io_write_commit(env, io); + + if (result < 0) + io->ci_result = result; + + +out: + RETURN(result >= 0 ? copied : result); +} + +#ifdef CONFIG_MIGRATION +static int ll_migrate_folio(struct address_space *mapping, + struct folio_migr *newpage, struct folio_migr *page, + enum migrate_mode mode) +{ + /* Always fail page migration until we have a proper implementation */ + return -EIO; +} +#endif + +const struct address_space_operations ll_aops = { +#ifdef HAVE_DIRTY_FOLIO + .dirty_folio = filemap_dirty_folio, +#else + .set_page_dirty = __set_page_dirty_nobuffers, +#endif +#ifdef HAVE_INVALIDATE_FOLIO + .invalidate_folio = ll_invalidate_folio, +#else + .invalidatepage = ll_invalidatepage, +#endif +#ifdef HAVE_AOPS_READ_FOLIO + .read_folio = ll_read_folio, +#else + .readpage = ll_readpage, +#endif +#ifdef HAVE_AOPS_RELEASE_FOLIO + .release_folio = ll_release_folio, +#else + .releasepage = (void *)ll_releasepage, +#endif + .direct_IO = ll_direct_IO, + .writepage = ll_writepage, + .writepages = ll_writepages, + .write_begin = ll_write_begin, + .write_end = ll_write_end, +#ifdef CONFIG_MIGRATION + .migrate_folio = ll_migrate_folio, +#endif +}; diff --git a/drivers/staging/lustrefsx/lustre/llite/statahead.c b/drivers/staging/lustrefsx/lustre/llite/statahead.c new file mode 100644 index 0000000000000..faf2860c8b481 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/statahead.c @@ -0,0 +1,1790 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include "llite_internal.h" + +#define SA_OMITTED_ENTRY_MAX 8ULL + +typedef enum { + /** negative values are for error cases */ + SA_ENTRY_INIT = 0, /** init entry */ + SA_ENTRY_SUCC = 1, /** stat succeed */ + SA_ENTRY_INVA = 2, /** invalid entry */ +} se_state_t; + +/* + * sa_entry is not refcounted: statahead thread allocates it and do async stat, + * and in async stat callback ll_statahead_interpret() will add it into + * sai_interim_entries, later statahead thread will call sa_handle_callback() to + * instantiate entry and move it into sai_entries, and then only scanner process + * can access and free it. + */ +struct sa_entry { + /* link into sai_interim_entries or sai_entries */ + struct list_head se_list; + /* link into sai hash table locally */ + struct list_head se_hash; + /* entry index in the sai */ + __u64 se_index; + /* low layer ldlm lock handle */ + __u64 se_handle; + /* entry status */ + se_state_t se_state; + /* entry size, contains name */ + int se_size; + /* pointer to async getattr enqueue info */ + struct md_enqueue_info *se_minfo; + /* pointer to the async getattr request */ + struct ptlrpc_request *se_req; + /* pointer to the target inode */ + struct inode *se_inode; + /* entry name */ + struct qstr se_qstr; + /* entry fid */ + struct lu_fid se_fid; +}; + +static unsigned int sai_generation; +static DEFINE_SPINLOCK(sai_generation_lock); + +static inline int sa_unhashed(struct sa_entry *entry) +{ + return list_empty(&entry->se_hash); +} + +/* sa_entry is ready to use */ +static inline int sa_ready(struct sa_entry *entry) +{ + /* Make sure sa_entry is updated and ready to use */ + smp_rmb(); + return (entry->se_state != SA_ENTRY_INIT); +} + +/* hash value to put in sai_cache */ +static inline int sa_hash(int val) +{ + return val & LL_SA_CACHE_MASK; +} + +/* hash entry into sai_cache */ +static inline void +sa_rehash(struct ll_statahead_info *sai, struct sa_entry *entry) +{ + int i = sa_hash(entry->se_qstr.hash); + + spin_lock(&sai->sai_cache_lock[i]); + list_add_tail(&entry->se_hash, &sai->sai_cache[i]); + spin_unlock(&sai->sai_cache_lock[i]); +} + +/* unhash entry from sai_cache */ +static inline void +sa_unhash(struct ll_statahead_info *sai, struct sa_entry *entry) +{ + int i = sa_hash(entry->se_qstr.hash); + + spin_lock(&sai->sai_cache_lock[i]); + list_del_init(&entry->se_hash); + spin_unlock(&sai->sai_cache_lock[i]); +} + +static inline int agl_should_run(struct ll_statahead_info *sai, + struct inode *inode) +{ + return inode && S_ISREG(inode->i_mode) && sai->sai_agl_task; +} + +static inline struct ll_inode_info * +agl_first_entry(struct ll_statahead_info *sai) +{ + return 
list_first_entry(&sai->sai_agls, struct ll_inode_info, + lli_agl_list); +} + +/* statahead window is full */ +static inline int sa_sent_full(struct ll_statahead_info *sai) +{ + return atomic_read(&sai->sai_cache_count) >= sai->sai_max; +} + +/* got async stat replies */ +static inline int sa_has_callback(struct ll_statahead_info *sai) +{ + return !list_empty(&sai->sai_interim_entries); +} + +static inline int agl_list_empty(struct ll_statahead_info *sai) +{ + return list_empty(&sai->sai_agls); +} + +/** + * (1) hit ratio less than 80% + * or + * (2) consecutive miss more than 8 + * then means low hit. + */ +static inline int sa_low_hit(struct ll_statahead_info *sai) +{ + return ((sai->sai_hit > 7 && sai->sai_hit < 4 * sai->sai_miss) || + (sai->sai_consecutive_miss > 8)); +} + +/* + * if the given index is behind of statahead window more than + * SA_OMITTED_ENTRY_MAX, then it is old. + */ +static inline int is_omitted_entry(struct ll_statahead_info *sai, __u64 index) +{ + return ((__u64)sai->sai_max + index + SA_OMITTED_ENTRY_MAX < + sai->sai_index); +} + +/* allocate sa_entry and hash it to allow scanner process to find it */ +static struct sa_entry * +sa_alloc(struct dentry *parent, struct ll_statahead_info *sai, __u64 index, + const char *name, int len, const struct lu_fid *fid) +{ + struct ll_inode_info *lli; + struct sa_entry *entry; + int entry_size; + char *dname; + + ENTRY; + + entry_size = sizeof(struct sa_entry) + (len & ~3) + 4; + OBD_ALLOC(entry, entry_size); + if (unlikely(!entry)) + RETURN(ERR_PTR(-ENOMEM)); + + CDEBUG(D_READA, "alloc sa entry %.*s(%p) index %llu\n", + len, name, entry, index); + + entry->se_index = index; + + entry->se_state = SA_ENTRY_INIT; + entry->se_size = entry_size; + dname = (char *)entry + sizeof(struct sa_entry); + memcpy(dname, name, len); + dname[len] = 0; + entry->se_qstr.hash = ll_full_name_hash(parent, name, len); + entry->se_qstr.len = len; + entry->se_qstr.name = dname; + entry->se_fid = *fid; + + lli = ll_i2info(sai->sai_dentry->d_inode); + + spin_lock(&lli->lli_sa_lock); + INIT_LIST_HEAD(&entry->se_list); + sa_rehash(sai, entry); + spin_unlock(&lli->lli_sa_lock); + + atomic_inc(&sai->sai_cache_count); + + RETURN(entry); +} + +/* free sa_entry, which should have been unhashed and not in any list */ +static void sa_free(struct ll_statahead_info *sai, struct sa_entry *entry) +{ + CDEBUG(D_READA, "free sa entry %.*s(%p) index %llu\n", + entry->se_qstr.len, entry->se_qstr.name, entry, + entry->se_index); + + LASSERT(list_empty(&entry->se_list)); + LASSERT(sa_unhashed(entry)); + + OBD_FREE(entry, entry->se_size); + atomic_dec(&sai->sai_cache_count); +} + +/* + * find sa_entry by name, used by directory scanner, lock is not needed because + * only scanner can remove the entry from cache. 
+ */
+static struct sa_entry *
+sa_get(struct ll_statahead_info *sai, const struct qstr *qstr)
+{
+	struct sa_entry *entry;
+	int i = sa_hash(qstr->hash);
+
+	list_for_each_entry(entry, &sai->sai_cache[i], se_hash) {
+		if (entry->se_qstr.hash == qstr->hash &&
+		    entry->se_qstr.len == qstr->len &&
+		    memcmp(entry->se_qstr.name, qstr->name, qstr->len) == 0)
+			return entry;
+	}
+	return NULL;
+}
+
+/* unhash and unlink sa_entry, and then free it */
+static inline void
+sa_kill(struct ll_statahead_info *sai, struct sa_entry *entry)
+{
+	struct ll_inode_info *lli = ll_i2info(sai->sai_dentry->d_inode);
+
+	LASSERT(!sa_unhashed(entry));
+	LASSERT(!list_empty(&entry->se_list));
+	LASSERT(sa_ready(entry));
+
+	sa_unhash(sai, entry);
+
+	spin_lock(&lli->lli_sa_lock);
+	list_del_init(&entry->se_list);
+	spin_unlock(&lli->lli_sa_lock);
+
+	iput(entry->se_inode);
+
+	sa_free(sai, entry);
+}
+
+/* called by scanner after use, sa_entry will be killed */
+static void
+sa_put(struct ll_statahead_info *sai, struct sa_entry *entry)
+{
+	struct sa_entry *tmp, *next;
+
+	if (entry && entry->se_state == SA_ENTRY_SUCC) {
+		struct ll_sb_info *sbi = ll_i2sbi(sai->sai_dentry->d_inode);
+
+		sai->sai_hit++;
+		sai->sai_consecutive_miss = 0;
+		sai->sai_max = min(2 * sai->sai_max, sbi->ll_sa_max);
+	} else {
+		sai->sai_miss++;
+		sai->sai_consecutive_miss++;
+	}
+
+	if (entry)
+		sa_kill(sai, entry);
+
+	/*
+	 * kill old completed entries, only scanner process does this, no need
+	 * to lock
+	 */
+	list_for_each_entry_safe(tmp, next, &sai->sai_entries, se_list) {
+		if (!is_omitted_entry(sai, tmp->se_index))
+			break;
+		sa_kill(sai, tmp);
+	}
+}
+
+/*
+ * update state and add the entry to sai_entries, sorted by index; return true
+ * if the scanner is waiting on this entry.
+ */
+static bool
+__sa_make_ready(struct ll_statahead_info *sai, struct sa_entry *entry, int ret)
+{
+	struct sa_entry *se;
+	struct list_head *pos = &sai->sai_entries;
+	__u64 index = entry->se_index;
+
+	LASSERT(!sa_ready(entry));
+	LASSERT(list_empty(&entry->se_list));
+
+	list_for_each_entry_reverse(se, &sai->sai_entries, se_list) {
+		if (se->se_index < entry->se_index) {
+			pos = &se->se_list;
+			break;
+		}
+	}
+	list_add(&entry->se_list, pos);
+	/*
+	 * LU-9210: ll_statahead_interpret must be able to see this before
+	 * we wake it up
+	 */
+	smp_store_release(&entry->se_state,
+			  ret < 0 ? SA_ENTRY_INVA : SA_ENTRY_SUCC);
+
+	return (index == sai->sai_index_wait);
+}
+
+/* finish async stat RPC arguments */
+static void sa_fini_data(struct md_enqueue_info *minfo)
+{
+	struct md_op_data *op_data = &minfo->mi_data;
+
+	if (op_data->op_flags & MF_OPNAME_KMALLOCED)
+		/* allocated via ll_setup_filename called from sa_prep_data */
+		kfree(op_data->op_name);
+	ll_unlock_md_op_lsm(&minfo->mi_data);
+	iput(minfo->mi_dir);
+	OBD_FREE_PTR(minfo);
+}
+
+static int ll_statahead_interpret(struct ptlrpc_request *req,
+				  struct md_enqueue_info *minfo, int rc);
+
+/*
+ * prepare arguments for async stat RPC.
+ */ +static struct md_enqueue_info * +sa_prep_data(struct inode *dir, struct inode *child, struct sa_entry *entry) +{ + struct md_enqueue_info *minfo; + struct ldlm_enqueue_info *einfo; + struct md_op_data *op_data; + + OBD_ALLOC_PTR(minfo); + if (!minfo) + return ERR_PTR(-ENOMEM); + + op_data = ll_prep_md_op_data(&minfo->mi_data, dir, child, + entry->se_qstr.name, entry->se_qstr.len, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) { + OBD_FREE_PTR(minfo); + return (struct md_enqueue_info *)op_data; + } + + if (!child) + op_data->op_fid2 = entry->se_fid; + + minfo->mi_it.it_op = IT_GETATTR; + minfo->mi_dir = igrab(dir); + minfo->mi_cb = ll_statahead_interpret; + minfo->mi_cbdata = entry; + + einfo = &minfo->mi_einfo; + einfo->ei_type = LDLM_IBITS; + einfo->ei_mode = it_to_lock_mode(&minfo->mi_it); + einfo->ei_cb_bl = ll_md_blocking_ast; + einfo->ei_cb_cp = ldlm_completion_ast; + einfo->ei_cb_gl = NULL; + einfo->ei_cbdata = NULL; + einfo->ei_req_slot = 1; + + return minfo; +} + +/* + * release resources used in async stat RPC, update entry state and wakeup if + * scanner process it waiting on this entry. + */ +static void +sa_make_ready(struct ll_statahead_info *sai, struct sa_entry *entry, int ret) +{ + struct ll_inode_info *lli = ll_i2info(sai->sai_dentry->d_inode); + struct md_enqueue_info *minfo = entry->se_minfo; + struct ptlrpc_request *req = entry->se_req; + bool wakeup; + + /* release resources used in RPC */ + if (minfo) { + entry->se_minfo = NULL; + ll_intent_release(&minfo->mi_it); + sa_fini_data(minfo); + } + + if (req) { + entry->se_req = NULL; + ptlrpc_req_finished(req); + } + + spin_lock(&lli->lli_sa_lock); + wakeup = __sa_make_ready(sai, entry, ret); + spin_unlock(&lli->lli_sa_lock); + + if (wakeup) + wake_up(&sai->sai_waitq); +} + +/* insert inode into the list of sai_agls */ +static void ll_agl_add(struct ll_statahead_info *sai, + struct inode *inode, int index) +{ + struct ll_inode_info *child = ll_i2info(inode); + struct ll_inode_info *parent = ll_i2info(sai->sai_dentry->d_inode); + + spin_lock(&child->lli_agl_lock); + if (child->lli_agl_index == 0) { + child->lli_agl_index = index; + spin_unlock(&child->lli_agl_lock); + + LASSERT(list_empty(&child->lli_agl_list)); + + spin_lock(&parent->lli_agl_lock); + /* Re-check under the lock */ + if (agl_should_run(sai, inode)) { + if (agl_list_empty(sai)) + wake_up_process(sai->sai_agl_task); + igrab(inode); + list_add_tail(&child->lli_agl_list, &sai->sai_agls); + } else + child->lli_agl_index = 0; + spin_unlock(&parent->lli_agl_lock); + } else { + spin_unlock(&child->lli_agl_lock); + } +} + +/* allocate sai */ +static struct ll_statahead_info *ll_sai_alloc(struct dentry *dentry) +{ + struct ll_statahead_info *sai; + struct ll_inode_info *lli = ll_i2info(dentry->d_inode); + int i; + + ENTRY; + + OBD_ALLOC_PTR(sai); + if (!sai) + RETURN(NULL); + + sai->sai_dentry = dget(dentry); + atomic_set(&sai->sai_refcount, 1); + sai->sai_max = LL_SA_RPC_MIN; + sai->sai_index = 1; + init_waitqueue_head(&sai->sai_waitq); + + INIT_LIST_HEAD(&sai->sai_interim_entries); + INIT_LIST_HEAD(&sai->sai_entries); + INIT_LIST_HEAD(&sai->sai_agls); + + for (i = 0; i < LL_SA_CACHE_SIZE; i++) { + INIT_LIST_HEAD(&sai->sai_cache[i]); + spin_lock_init(&sai->sai_cache_lock[i]); + } + atomic_set(&sai->sai_cache_count, 0); + + spin_lock(&sai_generation_lock); + lli->lli_sa_generation = ++sai_generation; + if (unlikely(sai_generation == 0)) + lli->lli_sa_generation = ++sai_generation; + spin_unlock(&sai_generation_lock); + + RETURN(sai); +} + +/* free sai */ 
+static inline void ll_sai_free(struct ll_statahead_info *sai)
+{
+	LASSERT(sai->sai_dentry != NULL);
+	dput(sai->sai_dentry);
+	OBD_FREE_PTR(sai);
+}
+
+/*
+ * take refcount of sai if sai for @dir exists, which means statahead is on for
+ * this directory.
+ */
+static inline struct ll_statahead_info *ll_sai_get(struct inode *dir)
+{
+	struct ll_inode_info *lli = ll_i2info(dir);
+	struct ll_statahead_info *sai = NULL;
+
+	spin_lock(&lli->lli_sa_lock);
+	sai = lli->lli_sai;
+	if (sai)
+		atomic_inc(&sai->sai_refcount);
+	spin_unlock(&lli->lli_sa_lock);
+
+	return sai;
+}
+
+/*
+ * put sai refcount after use, if refcount reaches zero, free sai and sa_entries
+ * attached to it.
+ */
+static void ll_sai_put(struct ll_statahead_info *sai)
+{
+	struct ll_inode_info *lli = ll_i2info(sai->sai_dentry->d_inode);
+
+	if (atomic_dec_and_lock(&sai->sai_refcount, &lli->lli_sa_lock)) {
+		struct sa_entry *entry, *next;
+		struct ll_sb_info *sbi = ll_i2sbi(sai->sai_dentry->d_inode);
+
+		lli->lli_sai = NULL;
+		spin_unlock(&lli->lli_sa_lock);
+
+		LASSERT(!sai->sai_task);
+		LASSERT(!sai->sai_agl_task);
+		LASSERT(sai->sai_sent == sai->sai_replied);
+		LASSERT(!sa_has_callback(sai));
+
+		list_for_each_entry_safe(entry, next, &sai->sai_entries,
+					 se_list)
+			sa_kill(sai, entry);
+
+		LASSERT(atomic_read(&sai->sai_cache_count) == 0);
+		LASSERT(agl_list_empty(sai));
+
+		ll_sai_free(sai);
+		atomic_dec(&sbi->ll_sa_running);
+	}
+}
+
+/* Do NOT forget to drop the inode refcount when an inode goes into sai_agls. */
+static void ll_agl_trigger(struct inode *inode, struct ll_statahead_info *sai)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	u64 index = lli->lli_agl_index;
+	ktime_t expire;
+	int rc;
+
+	ENTRY;
+
+	LASSERT(list_empty(&lli->lli_agl_list));
+
+	/* AGL may fall behind statahead by one entry */
+	if (is_omitted_entry(sai, index + 1)) {
+		lli->lli_agl_index = 0;
+		iput(inode);
+		RETURN_EXIT;
+	}
+
+	/*
+	 * In case of restore, the MDT has the right size and has already
+	 * sent it back without granting the layout lock, inode is up-to-date.
+	 * Then AGL (async glimpse lock) is useless.
+	 * Also to glimpse we need the layout, in case of a running restore
+	 * the MDT holds the layout lock so the glimpse will block up to the
+	 * end of restore (statahead/agl will block)
+	 */
+	if (test_bit(LLIF_FILE_RESTORING, &lli->lli_flags)) {
+		lli->lli_agl_index = 0;
+		iput(inode);
+		RETURN_EXIT;
+	}
+
+	/* Someone is in glimpse (sync or async), do nothing. */
+	rc = down_write_trylock(&lli->lli_glimpse_sem);
+	if (rc == 0) {
+		lli->lli_agl_index = 0;
+		iput(inode);
+		RETURN_EXIT;
+	}
+
+	/*
+	 * Someone triggered glimpse within 1 sec before.
+	 * 1) The former glimpse succeeded with glimpse lock granted by OST, and
+	 *    if the lock is still cached on the client, AGL needs to do
+	 *    nothing. If it is cancelled by another client, AGL may be unable
+	 *    to obtain a new lock, since no glimpse callback is triggered by
+	 *    AGL.
+	 * 2) The former glimpse succeeded, but the OST did not grant a glimpse
+	 *    lock. In such a case, it is quite possible that the OST will not
+	 *    grant a glimpse lock for AGL either.
+	 * 3) The former glimpse failed; compared with the other two cases this
+	 *    is relatively rare. AGL can ignore such a case, and it will not
+	 *    noticeably affect performance.
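+	 *
+	 * i.e. the code below gives up this round if less than one
+	 * second has passed since the last glimpse (sketch):
+	 *	expire = ktime_get() - NSEC_PER_SEC;
+	 *	if (expire < lli_glimpse_time)
+	 *		up_write() and skip this inode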
+ */ + expire = ktime_sub_ns(ktime_get(), NSEC_PER_SEC); + if (ktime_to_ns(lli->lli_glimpse_time) && + ktime_before(expire, lli->lli_glimpse_time)) { + up_write(&lli->lli_glimpse_sem); + lli->lli_agl_index = 0; + iput(inode); + RETURN_EXIT; + } + + CDEBUG(D_READA, + "Handling (init) async glimpse: inode = " DFID", idx = %llu\n", + PFID(&lli->lli_fid), index); + + cl_agl(inode); + lli->lli_agl_index = 0; + lli->lli_glimpse_time = ktime_get(); + up_write(&lli->lli_glimpse_sem); + + CDEBUG(D_READA, + "Handled (init) async glimpse: inode= " DFID", idx = %llu, rc = %d\n", + PFID(&lli->lli_fid), index, rc); + + iput(inode); + + EXIT; +} + +/* + * prepare inode for sa entry, add it into agl list, now sa_entry is ready + * to be used by scanner process. + */ +static void sa_instantiate(struct ll_statahead_info *sai, + struct sa_entry *entry) +{ + struct inode *dir = sai->sai_dentry->d_inode; + struct inode *child; + struct md_enqueue_info *minfo; + struct lookup_intent *it; + struct ptlrpc_request *req; + struct mdt_body *body; + int rc = 0; + + ENTRY; + + LASSERT(entry->se_handle != 0); + + minfo = entry->se_minfo; + it = &minfo->mi_it; + req = entry->se_req; + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (!body) + GOTO(out, rc = -EFAULT); + + child = entry->se_inode; + /* revalidate; unlinked and re-created with the same name */ + if (unlikely(!lu_fid_eq(&minfo->mi_data.op_fid2, &body->mbo_fid1))) { + if (child) { + entry->se_inode = NULL; + iput(child); + } + /* The mdt_body is invalid. Skip this entry */ + GOTO(out, rc = -EAGAIN); + } + + it->it_lock_handle = entry->se_handle; + rc = md_revalidate_lock(ll_i2mdexp(dir), it, ll_inode2fid(dir), NULL); + if (rc != 1) + GOTO(out, rc = -EAGAIN); + + rc = ll_prep_inode(&child, &req->rq_pill, dir->i_sb, it); + if (rc) + GOTO(out, rc); + + /* If encryption context was returned by MDT, put it in + * inode now to save an extra getxattr. + */ + if (body->mbo_valid & OBD_MD_ENCCTX) { + void *encctx = req_capsule_server_get(&req->rq_pill, + &RMF_FILE_ENCCTX); + __u32 encctxlen = req_capsule_get_size(&req->rq_pill, + &RMF_FILE_ENCCTX, + RCL_SERVER); + + if (encctxlen) { + CDEBUG(D_SEC, + "server returned encryption ctx for "DFID"\n", + PFID(ll_inode2fid(child))); + rc = ll_xattr_cache_insert(child, + xattr_for_enc(child), + encctx, encctxlen); + if (rc) + CWARN("%s: cannot set enc ctx for "DFID": rc = %d\n", + ll_i2sbi(child)->ll_fsname, + PFID(ll_inode2fid(child)), rc); + } + } + + CDEBUG(D_READA, "%s: setting %.*s"DFID" l_data to inode %p\n", + ll_i2sbi(dir)->ll_fsname, entry->se_qstr.len, + entry->se_qstr.name, PFID(ll_inode2fid(child)), child); + ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, child, it, NULL); + + entry->se_inode = child; + + if (agl_should_run(sai, child)) + ll_agl_add(sai, child, entry->se_index); + + EXIT; + +out: + /* + * sa_make_ready() will drop ldlm ibits lock refcount by calling + * ll_intent_drop_lock() in spite of failures. Do not worry about + * calling ll_intent_drop_lock() more than once. 
+ */ + sa_make_ready(sai, entry, rc); +} + +/* once there are async stat replies, instantiate sa_entry from replies */ +static void sa_handle_callback(struct ll_statahead_info *sai) +{ + struct ll_inode_info *lli; + + lli = ll_i2info(sai->sai_dentry->d_inode); + + spin_lock(&lli->lli_sa_lock); + while (sa_has_callback(sai)) { + struct sa_entry *entry; + + entry = list_entry(sai->sai_interim_entries.next, + struct sa_entry, se_list); + list_del_init(&entry->se_list); + spin_unlock(&lli->lli_sa_lock); + + sa_instantiate(sai, entry); + spin_lock(&lli->lli_sa_lock); + } + spin_unlock(&lli->lli_sa_lock); +} + +/* + * callback for async stat RPC, because this is called in ptlrpcd context, we + * only put sa_entry in sai_interim_entries, and wake up statahead thread to + * really prepare inode and instantiate sa_entry later. + */ +static int ll_statahead_interpret(struct ptlrpc_request *req, + struct md_enqueue_info *minfo, int rc) +{ + struct lookup_intent *it = &minfo->mi_it; + struct inode *dir = minfo->mi_dir; + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_statahead_info *sai = lli->lli_sai; + struct sa_entry *entry = (struct sa_entry *)minfo->mi_cbdata; + __u64 handle = 0; + + ENTRY; + + if (it_disposition(it, DISP_LOOKUP_NEG)) + rc = -ENOENT; + + /* + * because statahead thread will wait for all inflight RPC to finish, + * sai should be always valid, no need to refcount + */ + LASSERT(sai != NULL); + LASSERT(entry != NULL); + + CDEBUG(D_READA, "sa_entry %.*s rc %d\n", + entry->se_qstr.len, entry->se_qstr.name, rc); + + if (rc != 0) { + ll_intent_release(it); + sa_fini_data(minfo); + } else { + /* + * release ibits lock ASAP to avoid deadlock when statahead + * thread enqueues lock on parent in readdir and another + * process enqueues lock on child with parent lock held, eg. + * unlink. + */ + handle = it->it_lock_handle; + ll_intent_drop_lock(it); + ll_unlock_md_op_lsm(&minfo->mi_data); + } + + spin_lock(&lli->lli_sa_lock); + if (rc != 0) { + if (__sa_make_ready(sai, entry, rc)) + wake_up(&sai->sai_waitq); + } else { + int first = 0; + + entry->se_minfo = minfo; + entry->se_req = ptlrpc_request_addref(req); + /* + * Release the async ibits lock ASAP to avoid deadlock + * when statahead thread tries to enqueue lock on parent + * for readpage and other tries to enqueue lock on child + * with parent's lock held, for example: unlink. 
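+		 *
+		 * Deadlock sketch: statahead would hold the child's ibits
+		 * lock while enqueuing on the parent for readpage, while
+		 * unlink holds the parent's lock and enqueues on the
+		 * child, a classic AB-BA ordering inversion.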
+ */ + entry->se_handle = handle; + if (!sa_has_callback(sai)) + first = 1; + + list_add_tail(&entry->se_list, &sai->sai_interim_entries); + if (first && sai->sai_task) + wake_up_process(sai->sai_task); + } + sai->sai_replied++; + + spin_unlock(&lli->lli_sa_lock); + + RETURN(rc); +} + +/* async stat for file not found in dcache */ +static int sa_lookup(struct inode *dir, struct sa_entry *entry) +{ + struct md_enqueue_info *minfo; + int rc; + + ENTRY; + + minfo = sa_prep_data(dir, NULL, entry); + if (IS_ERR(minfo)) + RETURN(PTR_ERR(minfo)); + + rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo); + if (rc < 0) + sa_fini_data(minfo); + + RETURN(rc); +} + +/** + * async stat for file found in dcache, similar to .revalidate + * + * \retval 1 dentry valid, no RPC sent + * \retval 0 dentry invalid, will send async stat RPC + * \retval negative number upon error + */ +static int sa_revalidate(struct inode *dir, struct sa_entry *entry, + struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct lookup_intent it = { .it_op = IT_GETATTR, + .it_lock_handle = 0 }; + struct md_enqueue_info *minfo; + int rc; + + ENTRY; + + if (unlikely(!inode)) + RETURN(1); + + if (d_mountpoint(dentry)) + RETURN(1); + + minfo = sa_prep_data(dir, inode, entry); + if (IS_ERR(minfo)) + RETURN(PTR_ERR(minfo)); + + entry->se_inode = igrab(inode); + rc = md_revalidate_lock(ll_i2mdexp(dir), &it, ll_inode2fid(inode), + NULL); + if (rc == 1) { + entry->se_handle = it.it_lock_handle; + ll_intent_release(&it); + sa_fini_data(minfo); + RETURN(1); + } + + rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo); + if (rc < 0) { + entry->se_inode = NULL; + iput(inode); + sa_fini_data(minfo); + } + + RETURN(rc); +} + +/* async stat for file with @name */ +static void sa_statahead(struct dentry *parent, const char *name, int len, + const struct lu_fid *fid) +{ + struct inode *dir = parent->d_inode; + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_statahead_info *sai = lli->lli_sai; + struct dentry *dentry = NULL; + struct sa_entry *entry; + int rc; + + ENTRY; + + entry = sa_alloc(parent, sai, sai->sai_index, name, len, fid); + if (IS_ERR(entry)) + RETURN_EXIT; + + dentry = d_lookup(parent, &entry->se_qstr); + if (!dentry) { + rc = sa_lookup(dir, entry); + } else { + rc = sa_revalidate(dir, entry, dentry); + if (rc == 1 && agl_should_run(sai, dentry->d_inode)) + ll_agl_add(sai, dentry->d_inode, entry->se_index); + } + + if (dentry) + dput(dentry); + + if (rc != 0) + sa_make_ready(sai, entry, rc); + else + sai->sai_sent++; + + sai->sai_index++; + + EXIT; +} + +/* async glimpse (agl) thread main function */ +static int ll_agl_thread(void *arg) +{ + struct dentry *parent = (struct dentry *)arg; + struct inode *dir = parent->d_inode; + struct ll_inode_info *plli = ll_i2info(dir); + struct ll_inode_info *clli; + /* + * We already own this reference, so it is safe to take it + * without a lock. 
+ */ + struct ll_statahead_info *sai = plli->lli_sai; + + ENTRY; + + CDEBUG(D_READA, "agl thread started: sai %p, parent %pd\n", + sai, parent); + + while (({set_current_state(TASK_IDLE); + !kthread_should_stop(); })) { + spin_lock(&plli->lli_agl_lock); + clli = list_first_entry_or_null(&sai->sai_agls, + struct ll_inode_info, + lli_agl_list); + if (clli) { + __set_current_state(TASK_RUNNING); + list_del_init(&clli->lli_agl_list); + spin_unlock(&plli->lli_agl_lock); + ll_agl_trigger(&clli->lli_vfs_inode, sai); + cond_resched(); + } else { + spin_unlock(&plli->lli_agl_lock); + schedule(); + } + } + __set_current_state(TASK_RUNNING); + RETURN(0); +} + +static void ll_stop_agl(struct ll_statahead_info *sai) +{ + struct dentry *parent = sai->sai_dentry; + struct ll_inode_info *plli = ll_i2info(parent->d_inode); + struct ll_inode_info *clli; + struct task_struct *agl_task; + + spin_lock(&plli->lli_agl_lock); + agl_task = sai->sai_agl_task; + sai->sai_agl_task = NULL; + spin_unlock(&plli->lli_agl_lock); + if (!agl_task) + return; + + CDEBUG(D_READA, "stop agl thread: sai %p pid %u\n", + sai, (unsigned int)agl_task->pid); + kthread_stop(agl_task); + + spin_lock(&plli->lli_agl_lock); + while ((clli = list_first_entry_or_null(&sai->sai_agls, + struct ll_inode_info, + lli_agl_list)) != NULL) { + list_del_init(&clli->lli_agl_list); + spin_unlock(&plli->lli_agl_lock); + clli->lli_agl_index = 0; + iput(&clli->lli_vfs_inode); + spin_lock(&plli->lli_agl_lock); + } + spin_unlock(&plli->lli_agl_lock); + CDEBUG(D_READA, "agl thread stopped: sai %p, parent %pd\n", + sai, parent); + ll_sai_put(sai); +} + +/* start agl thread */ +static void ll_start_agl(struct dentry *parent, struct ll_statahead_info *sai) +{ + int node = cfs_cpt_spread_node(cfs_cpt_tab, CFS_CPT_ANY); + struct ll_inode_info *plli; + struct task_struct *task; + + ENTRY; + + CDEBUG(D_READA, "start agl thread: sai %p, parent %pd\n", + sai, parent); + + plli = ll_i2info(parent->d_inode); + task = kthread_create_on_node(ll_agl_thread, parent, node, "ll_agl_%d", + plli->lli_opendir_pid); + if (IS_ERR(task)) { + CERROR("can't start ll_agl thread, rc: %ld\n", PTR_ERR(task)); + RETURN_EXIT; + } + sai->sai_agl_task = task; + atomic_inc(&ll_i2sbi(d_inode(parent))->ll_agl_total); + /* Get an extra reference that the thread holds */ + ll_sai_get(d_inode(parent)); + + wake_up_process(task); + + EXIT; +} + +/* statahead thread main function */ +static int ll_statahead_thread(void *arg) +{ + struct dentry *parent = (struct dentry *)arg; + struct inode *dir = parent->d_inode; + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_sb_info *sbi = ll_i2sbi(dir); + struct ll_statahead_info *sai = lli->lli_sai; + int first = 0; + struct md_op_data *op_data; + struct page *page = NULL; + __u64 pos = 0; + int rc = 0; + + ENTRY; + + CDEBUG(D_READA, "statahead thread starting: sai %p, parent %pd\n", + sai, parent); + + OBD_ALLOC_PTR(op_data); + if (!op_data) + GOTO(out, rc = -ENOMEM); + + while (pos != MDS_DIR_END_OFF && sai->sai_task) { + struct lu_dirpage *dp; + struct lu_dirent *ent; + + op_data = ll_prep_md_op_data(op_data, dir, dir, NULL, 0, 0, + LUSTRE_OPC_ANY, dir); + if (IS_ERR(op_data)) { + rc = PTR_ERR(op_data); + break; + } + + sai->sai_in_readpage = 1; + page = ll_get_dir_page(dir, op_data, pos, NULL); + ll_unlock_md_op_lsm(op_data); + sai->sai_in_readpage = 0; + if (IS_ERR(page)) { + rc = PTR_ERR(page); + CDEBUG(D_READA, + "error reading dir "DFID" at %llu /%llu opendir_pid = %u: rc = %d\n", + PFID(ll_inode2fid(dir)), pos, sai->sai_index, + 
lli->lli_opendir_pid, rc); + break; + } + + dp = page_address(page); + for (ent = lu_dirent_start(dp); + ent != NULL && sai->sai_task && + !sa_low_hit(sai); + ent = lu_dirent_next(ent)) { + __u64 hash; + int namelen; + char *name; + struct lu_fid fid; + struct llcrypt_str lltr = LLTR_INIT(NULL, 0); + + hash = le64_to_cpu(ent->lde_hash); + if (unlikely(hash < pos)) + /* + * Skip until we find target hash value. + */ + continue; + + namelen = le16_to_cpu(ent->lde_namelen); + if (unlikely(namelen == 0)) + /* + * Skip dummy record. + */ + continue; + + name = ent->lde_name; + if (name[0] == '.') { + if (namelen == 1) { + /* + * skip "." + */ + continue; + } else if (name[1] == '.' && namelen == 2) { + /* + * skip ".." + */ + continue; + } else if (!sai->sai_ls_all) { + /* + * skip hidden files. + */ + sai->sai_skip_hidden++; + continue; + } + } + + /* + * don't stat-ahead first entry. + */ + if (unlikely(++first == 1)) + continue; + + fid_le_to_cpu(&fid, &ent->lde_fid); + + while (({set_current_state(TASK_IDLE); + sai->sai_task; })) { + if (sa_has_callback(sai)) { + __set_current_state(TASK_RUNNING); + sa_handle_callback(sai); + } + + spin_lock(&lli->lli_agl_lock); + while (sa_sent_full(sai) && + !agl_list_empty(sai)) { + struct ll_inode_info *clli; + + __set_current_state(TASK_RUNNING); + clli = agl_first_entry(sai); + list_del_init(&clli->lli_agl_list); + spin_unlock(&lli->lli_agl_lock); + + ll_agl_trigger(&clli->lli_vfs_inode, + sai); + cond_resched(); + spin_lock(&lli->lli_agl_lock); + } + spin_unlock(&lli->lli_agl_lock); + + if (!sa_sent_full(sai)) + break; + schedule(); + } + __set_current_state(TASK_RUNNING); + + if (IS_ENCRYPTED(dir)) { + struct llcrypt_str de_name = + LLTR_INIT(ent->lde_name, namelen); + struct lu_fid fid; + + rc = llcrypt_fname_alloc_buffer(dir, NAME_MAX, + &lltr); + if (rc < 0) + continue; + + fid_le_to_cpu(&fid, &ent->lde_fid); + if (ll_fname_disk_to_usr(dir, 0, 0, &de_name, + &lltr, &fid)) { + llcrypt_fname_free_buffer(&lltr); + continue; + } + + name = lltr.name; + namelen = lltr.len; + } + + sa_statahead(parent, name, namelen, &fid); + llcrypt_fname_free_buffer(&lltr); + } + + pos = le64_to_cpu(dp->ldp_hash_end); + ll_release_page(dir, page, + le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE); + + if (sa_low_hit(sai)) { + rc = -EFAULT; + atomic_inc(&sbi->ll_sa_wrong); + CDEBUG(D_READA, + "Statahead for dir "DFID" hit ratio too low: hit/miss %llu/%llu, sent/replied %llu/%llu, stoppingstatahead thread: pid %d\n", + PFID(&lli->lli_fid), sai->sai_hit, + sai->sai_miss, sai->sai_sent, + sai->sai_replied, current->pid); + break; + } + } + ll_finish_md_op_data(op_data); + + if (rc < 0) { + spin_lock(&lli->lli_sa_lock); + sai->sai_task = NULL; + lli->lli_sa_enabled = 0; + spin_unlock(&lli->lli_sa_lock); + } + + /* + * statahead is finished, but statahead entries need to be cached, wait + * for file release to stop me. 
+ */ + while (({set_current_state(TASK_IDLE); + sai->sai_task; })) { + if (sa_has_callback(sai)) { + __set_current_state(TASK_RUNNING); + sa_handle_callback(sai); + } else { + schedule(); + } + } + __set_current_state(TASK_RUNNING); + + EXIT; +out: + ll_stop_agl(sai); + + /* + * wait for inflight statahead RPCs to finish, and then we can free sai + * safely because statahead RPC will access sai data + */ + while (sai->sai_sent != sai->sai_replied) + /* in case we're not woken up, timeout wait */ + msleep(125); + + /* release resources held by statahead RPCs */ + sa_handle_callback(sai); + + CDEBUG(D_READA, "%s: statahead thread stopped: sai %p, parent %pd\n", + sbi->ll_fsname, sai, parent); + + spin_lock(&lli->lli_sa_lock); + sai->sai_task = NULL; + spin_unlock(&lli->lli_sa_lock); + wake_up(&sai->sai_waitq); + + ll_sai_put(sai); + + return rc; +} + +/* authorize opened dir handle @key to statahead */ +void ll_authorize_statahead(struct inode *dir, void *key) +{ + struct ll_inode_info *lli = ll_i2info(dir); + + spin_lock(&lli->lli_sa_lock); + if (!lli->lli_opendir_key && !lli->lli_sai) { + /* + * if lli_sai is not NULL, it means previous statahead is not + * finished yet, we'd better not start a new statahead for now. + */ + LASSERT(lli->lli_opendir_pid == 0); + lli->lli_opendir_key = key; + lli->lli_opendir_pid = current->pid; + lli->lli_sa_enabled = 1; + } + spin_unlock(&lli->lli_sa_lock); +} + +/* + * deauthorize opened dir handle @key to statahead, and notify statahead thread + * to quit if it's running. + */ +void ll_deauthorize_statahead(struct inode *dir, void *key) +{ + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_statahead_info *sai; + + LASSERT(lli->lli_opendir_key == key); + LASSERT(lli->lli_opendir_pid != 0); + + CDEBUG(D_READA, "deauthorize statahead for "DFID"\n", + PFID(&lli->lli_fid)); + + spin_lock(&lli->lli_sa_lock); + lli->lli_opendir_key = NULL; + lli->lli_opendir_pid = 0; + lli->lli_sa_enabled = 0; + sai = lli->lli_sai; + if (sai && sai->sai_task) { + /* + * statahead thread may not have quit yet because it needs to + * cache entries, now it's time to tell it to quit. + * + * wake_up_process() provides the necessary barriers + * to pair with set_current_state(). + */ + struct task_struct *task = sai->sai_task; + + sai->sai_task = NULL; + wake_up_process(task); + } + spin_unlock(&lli->lli_sa_lock); +} + +enum { + /** + * not first dirent, or is "." + */ + LS_NOT_FIRST_DE = 0, + /** + * the first non-hidden dirent + */ + LS_FIRST_DE, + /** + * the first hidden dirent, that is "." 
+	 */
+	LS_FIRST_DOT_DE
+};
+
+/* file is first dirent under @dir */
+static int is_first_dirent(struct inode *dir, struct dentry *dentry)
+{
+	struct qstr *target = &dentry->d_name;
+	struct md_op_data *op_data;
+	int dot_de;
+	struct page *page = NULL;
+	int rc = LS_NOT_FIRST_DE;
+	__u64 pos = 0;
+	struct llcrypt_str lltr = LLTR_INIT(NULL, 0);
+
+	ENTRY;
+
+	op_data = ll_prep_md_op_data(NULL, dir, dir, NULL, 0, 0,
+				     LUSTRE_OPC_ANY, dir);
+	if (IS_ERR(op_data))
+		RETURN(PTR_ERR(op_data));
+
+	if (IS_ENCRYPTED(dir)) {
+		int rc2 = llcrypt_fname_alloc_buffer(dir, NAME_MAX, &lltr);
+
+		if (rc2 < 0)
+			RETURN(rc2);
+	}
+
+	/**
+	 * FIXME: choose the start offset of the readdir
+	 */
+
+	page = ll_get_dir_page(dir, op_data, 0, NULL);
+
+	while (1) {
+		struct lu_dirpage *dp;
+		struct lu_dirent *ent;
+
+		if (IS_ERR(page)) {
+			struct ll_inode_info *lli = ll_i2info(dir);
+
+			rc = PTR_ERR(page);
+			CERROR("%s: reading dir "DFID" at %llu opendir_pid = %u : rc = %d\n",
+			       ll_i2sbi(dir)->ll_fsname,
+			       PFID(ll_inode2fid(dir)), pos,
+			       lli->lli_opendir_pid, rc);
+			break;
+		}
+
+		dp = page_address(page);
+		for (ent = lu_dirent_start(dp); ent != NULL;
+		     ent = lu_dirent_next(ent)) {
+			__u64 hash;
+			int namelen;
+			char *name;
+
+			hash = le64_to_cpu(ent->lde_hash);
+			/*
+			 * The ll_get_dir_page() can return any page containing
+			 * the given hash which may not be the start hash.
+			 */
+			if (unlikely(hash < pos))
+				continue;
+
+			namelen = le16_to_cpu(ent->lde_namelen);
+			if (unlikely(namelen == 0))
+				/*
+				 * skip dummy record.
+				 */
+				continue;
+
+			name = ent->lde_name;
+			if (name[0] == '.') {
+				if (namelen == 1)
+					/*
+					 * skip "."
+					 */
+					continue;
+				else if (name[1] == '.' && namelen == 2)
+					/*
+					 * skip ".."
+					 */
+					continue;
+				else
+					dot_de = 1;
+			} else {
+				dot_de = 0;
+			}
+
+			if (dot_de && target->name[0] != '.') {
+				CDEBUG(D_READA, "%.*s skip hidden file %.*s\n",
+				       target->len, target->name,
+				       namelen, name);
+				continue;
+			}
+
+			if (IS_ENCRYPTED(dir)) {
+				struct llcrypt_str de_name =
+					LLTR_INIT(ent->lde_name, namelen);
+				struct lu_fid fid;
+
+				fid_le_to_cpu(&fid, &ent->lde_fid);
+				if (ll_fname_disk_to_usr(dir, 0, 0, &de_name,
+							 &lltr, &fid))
+					continue;
+				name = lltr.name;
+				namelen = lltr.len;
+			}
+
+			if (target->len != namelen ||
+			    memcmp(target->name, name, namelen) != 0)
+				rc = LS_NOT_FIRST_DE;
+			else if (!dot_de)
+				rc = LS_FIRST_DE;
+			else
+				rc = LS_FIRST_DOT_DE;
+
+			ll_release_page(dir, page, false);
+			GOTO(out, rc);
+		}
+		pos = le64_to_cpu(dp->ldp_hash_end);
+		if (pos == MDS_DIR_END_OFF) {
+			/*
+			 * End of directory reached.
+			 */
+			ll_release_page(dir, page, false);
+			GOTO(out, rc);
+		} else {
+			/*
+			 * chain is exhausted.
+			 * Normal case: continue to the next page.
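+			 * ldp_hash_end becomes the start hash for the
+			 * next ll_get_dir_page() call; pages whose hash
+			 * range collided (LDF_COLLIDE) are dropped from
+			 * the cache on release rather than kept.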
+			 */
+			ll_release_page(dir, page, le32_to_cpu(dp->ldp_flags) &
+					LDF_COLLIDE);
+			page = ll_get_dir_page(dir, op_data, pos, NULL);
+		}
+	}
+	EXIT;
+out:
+	llcrypt_fname_free_buffer(&lltr);
+	ll_finish_md_op_data(op_data);
+
+	return rc;
+}
+
+/**
+ * revalidate @dentryp from statahead cache
+ *
+ * \param[in] dir	parent directory
+ * \param[in] sai	sai structure
+ * \param[out] dentryp	pointer to dentry which will be revalidated
+ * \param[in] unplug	unplug statahead window only (normally for negative
+ *			dentry)
+ * \retval		1 on success, dentry is saved in @dentryp
+ * \retval		0 if revalidation failed (no proper lock on client)
+ * \retval		negative number upon error
+ */
+static int revalidate_statahead_dentry(struct inode *dir,
+				       struct ll_statahead_info *sai,
+				       struct dentry **dentryp,
+				       bool unplug)
+{
+	struct sa_entry *entry = NULL;
+	struct ll_inode_info *lli = ll_i2info(dir);
+	int rc = 0;
+
+	ENTRY;
+
+	if ((*dentryp)->d_name.name[0] == '.') {
+		if (sai->sai_ls_all ||
+		    sai->sai_miss_hidden >= sai->sai_skip_hidden) {
+			/*
+			 * Hidden dentry is the first one, or statahead
+			 * thread does not skip so many hidden dentries
+			 * before "sai_ls_all" enabled as below.
+			 */
+		} else {
+			if (!sai->sai_ls_all)
+				/*
+				 * It may be that the hidden dentry is not
+				 * the first one and "sai_ls_all" was not
+				 * set, so "ls -al" missed it. Enable
+				 * "sai_ls_all" for such a case.
+				 */
+				sai->sai_ls_all = 1;
+
+			/*
+			 * Such "getattr" has been skipped before
+			 * "sai_ls_all" enabled as above.
+			 */
+			sai->sai_miss_hidden++;
+			RETURN(-EAGAIN);
+		}
+	}
+
+	if (unplug)
+		GOTO(out, rc = 1);
+
+	entry = sa_get(sai, &(*dentryp)->d_name);
+	if (!entry)
+		GOTO(out, rc = -EAGAIN);
+
+	/* if statahead is busy in readdir, help it do post-work */
+	if (!sa_ready(entry) && sai->sai_in_readpage)
+		sa_handle_callback(sai);
+
+	if (!sa_ready(entry)) {
+		spin_lock(&lli->lli_sa_lock);
+		sai->sai_index_wait = entry->se_index;
+		spin_unlock(&lli->lli_sa_lock);
+		rc = wait_event_idle_timeout(sai->sai_waitq, sa_ready(entry),
+					     cfs_time_seconds(30));
+		if (rc == 0) {
+			/*
+			 * entry may not be ready, so it may be used by inflight
+			 * statahead RPC, don't free it.
+			 */
+			entry = NULL;
+			GOTO(out, rc = -EAGAIN);
+		}
+	}
+
+	/*
+	 * We need to see the value that was set immediately before we
+	 * were woken up.
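+	 *
+	 * The acquire load below pairs with the release store done where
+	 * the entry is marked ready: once se_state reads as SA_ENTRY_SUCC,
+	 * se_inode and se_handle are guaranteed to be visible, so the
+	 * fields used below are safe to use without further locking.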
+ */ + if (smp_load_acquire(&entry->se_state) == SA_ENTRY_SUCC && + entry->se_inode) { + struct inode *inode = entry->se_inode; + struct lookup_intent it = { .it_op = IT_GETATTR, + .it_lock_handle = + entry->se_handle }; + __u64 bits; + + rc = md_revalidate_lock(ll_i2mdexp(dir), &it, + ll_inode2fid(inode), &bits); + if (rc == 1) { + if (!(*dentryp)->d_inode) { + struct dentry *alias; + + alias = ll_splice_alias(inode, *dentryp); + if (IS_ERR(alias)) { + ll_intent_release(&it); + GOTO(out, rc = PTR_ERR(alias)); + } + *dentryp = alias; + /* + * statahead prepared this inode, transfer inode + * refcount from sa_entry to dentry + */ + entry->se_inode = NULL; + } else if ((*dentryp)->d_inode != inode) { + /* revalidate, but inode is recreated */ + CDEBUG(D_READA, + "%s: stale dentry %pd inode " DFID", statahead inode "DFID "\n", + ll_i2sbi(inode)->ll_fsname, *dentryp, + PFID(ll_inode2fid((*dentryp)->d_inode)), + PFID(ll_inode2fid(inode))); + ll_intent_release(&it); + GOTO(out, rc = -ESTALE); + } + + if ((bits & MDS_INODELOCK_LOOKUP) && + d_lustre_invalid(*dentryp)) { + d_lustre_revalidate(*dentryp); + ll_update_dir_depth(dir, (*dentryp)->d_inode); + } + + ll_intent_release(&it); + } + } +out: + /* + * statahead cached sa_entry can be used only once, and will be killed + * right after use, so if lookup/revalidate accessed statahead cache, + * set dentry ldd_sa_generation to parent lli_sa_generation, later if we + * stat this file again, we know we've done statahead before, see + * dentry_may_statahead(). + */ + if (lld_is_init(*dentryp)) + ll_d2d(*dentryp)->lld_sa_generation = lli->lli_sa_generation; + sa_put(sai, entry); + spin_lock(&lli->lli_sa_lock); + if (sai->sai_task) + wake_up_process(sai->sai_task); + spin_unlock(&lli->lli_sa_lock); + + RETURN(rc); +} + +/** + * start statahead thread + * + * \param[in] dir parent directory + * \param[in] dentry dentry that triggers statahead, normally the first + * dirent under @dir + * \param[in] agl indicate whether AGL is needed + * \retval -EAGAIN on success, because when this function is + * called, it's already in lookup call, so client should + * do it itself instead of waiting for statahead thread + * to do it asynchronously. + * \retval negative number upon error + */ +static int start_statahead_thread(struct inode *dir, struct dentry *dentry, + bool agl) +{ + int node = cfs_cpt_spread_node(cfs_cpt_tab, CFS_CPT_ANY); + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_statahead_info *sai = NULL; + struct dentry *parent = dentry->d_parent; + struct task_struct *task; + struct ll_sb_info *sbi = ll_i2sbi(parent->d_inode); + int first = LS_FIRST_DE; + int rc = 0; + + ENTRY; + + /* I am the "lli_opendir_pid" owner, only me can set "lli_sai". */ + first = is_first_dirent(dir, dentry); + if (first == LS_NOT_FIRST_DE) + /* It is not "ls -{a}l" operation, no need statahead for it. */ + GOTO(out, rc = -EFAULT); + + if (unlikely(atomic_inc_return(&sbi->ll_sa_running) > + sbi->ll_sa_running_max)) { + CDEBUG(D_READA, + "Too many concurrent statahead instances, avoid new statahead instance temporarily.\n"); + GOTO(out, rc = -EMFILE); + } + + sai = ll_sai_alloc(parent); + if (!sai) + GOTO(out, rc = -ENOMEM); + + sai->sai_ls_all = (first == LS_FIRST_DOT_DE); + + /* + * if current lli_opendir_key was deauthorized, or dir re-opened by + * another process, don't start statahead, otherwise the newly spawned + * statahead thread won't be notified to quit. 
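+	 *
+	 * This check runs under lli_sa_lock, so it cannot race with
+	 * ll_deauthorize_statahead(): either lli_sai is published while
+	 * the key is still set, or we bail out with -EPERM below.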
+	 */
+	spin_lock(&lli->lli_sa_lock);
+	if (unlikely(lli->lli_sai || !lli->lli_opendir_key ||
+		     lli->lli_opendir_pid != current->pid)) {
+		spin_unlock(&lli->lli_sa_lock);
+		GOTO(out, rc = -EPERM);
+	}
+	lli->lli_sai = sai;
+	spin_unlock(&lli->lli_sa_lock);
+
+	CDEBUG(D_READA, "start statahead thread: [pid %d] [parent %pd]\n",
+	       current->pid, parent);
+
+	task = kthread_create_on_node(ll_statahead_thread, parent, node,
+				      "ll_sa_%u", lli->lli_opendir_pid);
+	if (IS_ERR(task)) {
+		spin_lock(&lli->lli_sa_lock);
+		lli->lli_sai = NULL;
+		spin_unlock(&lli->lli_sa_lock);
+		rc = PTR_ERR(task);
+		CERROR("can't start ll_sa thread, rc: %d\n", rc);
+		GOTO(out, rc);
+	}
+
+	if (test_bit(LL_SBI_AGL_ENABLED, ll_i2sbi(parent->d_inode)->ll_flags) &&
+	    agl)
+		ll_start_agl(parent, sai);
+
+	atomic_inc(&ll_i2sbi(parent->d_inode)->ll_sa_total);
+	sai->sai_task = task;
+
+	wake_up_process(task);
+	/*
+	 * We don't stat-ahead for the first dirent since we are already in
+	 * lookup.
+	 */
+	RETURN(-EAGAIN);
+
+out:
+	/*
+	 * once starting the statahead thread has failed, disable statahead so
+	 * that subsequent stats won't waste time retrying it.
+	 */
+	spin_lock(&lli->lli_sa_lock);
+	if (lli->lli_opendir_pid == current->pid)
+		lli->lli_sa_enabled = 0;
+	spin_unlock(&lli->lli_sa_lock);
+
+	if (sai)
+		ll_sai_free(sai);
+	if (first != LS_NOT_FIRST_DE)
+		atomic_dec(&sbi->ll_sa_running);
+
+	RETURN(rc);
+}
+
+/*
+ * Check whether statahead for @dir was started.
+ */
+static inline bool ll_statahead_started(struct inode *dir, bool agl)
+{
+	struct ll_inode_info *lli = ll_i2info(dir);
+	struct ll_statahead_info *sai;
+
+	spin_lock(&lli->lli_sa_lock);
+	sai = lli->lli_sai;
+	if (sai && (sai->sai_agl_task != NULL) != agl)
+		CDEBUG(D_READA,
+		       "%s: Statahead AGL hint changed from %d to %d\n",
+		       ll_i2sbi(dir)->ll_fsname,
+		       sai->sai_agl_task != NULL, agl);
+	spin_unlock(&lli->lli_sa_lock);
+
+	return !!sai;
+}
+
+/**
+ * statahead entry function. This is called when a client does getattr on a
+ * file; it starts the statahead thread if this is the first dir entry, else
+ * it revalidates the dentry from the statahead cache.
+ *
+ * \param[in]  dir	parent directory
+ * \param[out] dentryp	dentry to getattr
+ * \param[in]  agl	whether to start the AGL thread
+ *
+ * \retval		1 on success
+ * \retval		0 revalidation from statahead cache failed, caller needs
+ *			to getattr from server directly
+ * \retval		negative number on error, caller often ignores this and
+ *			then getattr from server
+ */
+int ll_start_statahead(struct inode *dir, struct dentry *dentry, bool agl)
+{
+	if (!ll_statahead_started(dir, agl))
+		return start_statahead_thread(dir, dentry, agl);
+	return 0;
+}
+
+/**
+ * revalidate dentry from statahead cache.
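+ *
+ * Thin wrapper: looks up the statahead info for @dir and delegates to
+ * revalidate_statahead_dentry(), dropping the sai reference afterwards.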
+ * + * \param[in] dir parent directory + * \param[out] dentryp dentry to getattr + * \param[in] unplug unplug statahead window only (normally for negative + * dentry) + * \retval 1 on success + * \retval 0 revalidation from statahead cache failed, caller needs + * to getattr from server directly + * \retval negative number on error, caller often ignores this and + * then getattr from server + */ +int ll_revalidate_statahead(struct inode *dir, struct dentry **dentryp, + bool unplug) +{ + struct ll_statahead_info *sai; + int rc = 0; + + sai = ll_sai_get(dir); + if (sai) { + rc = revalidate_statahead_dentry(dir, sai, dentryp, unplug); + CDEBUG(D_READA, "revalidate statahead %pd: rc = %d.\n", + *dentryp, rc); + ll_sai_put(sai); + } + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/llite/super25.c b/drivers/staging/lustrefsx/lustre/llite/super25.c new file mode 100644 index 0000000000000..3238621d3ef62 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/super25.c @@ -0,0 +1,340 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#define D_MOUNT (D_SUPER | D_CONFIG/*|D_WARNING */) + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "llite_internal.h" +#include "vvp_internal.h" + +static struct kmem_cache *ll_inode_cachep; + +static struct inode *ll_alloc_inode(struct super_block *sb) +{ + struct ll_inode_info *lli; +#ifdef HAVE_ALLOC_INODE_SB + lli = alloc_inode_sb(sb, ll_inode_cachep, GFP_NOFS); + if (!lli) + return NULL; + OBD_ALLOC_POST(lli, sizeof(*lli), "slab-alloced"); + memset(lli, 0, sizeof(*lli)); +#else + OBD_SLAB_ALLOC_PTR_GFP(lli, ll_inode_cachep, GFP_NOFS); + if (!lli) + return NULL; +#endif + inode_init_once(&lli->lli_vfs_inode); + return &lli->lli_vfs_inode; +} + +static void ll_inode_destroy_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct ll_inode_info *ptr = ll_i2info(inode); + llcrypt_free_inode(inode); + OBD_SLAB_FREE_PTR(ptr, ll_inode_cachep); +} + +static void ll_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, ll_inode_destroy_callback); +} + +static int ll_drop_inode(struct inode *inode) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + int drop; + + if (!sbi->ll_inode_cache_enabled) + return 1; + + drop = generic_drop_inode(inode); + if (!drop) + drop = llcrypt_drop_inode(inode); + + return drop; +} + +/* exported operations */ +const struct super_operations lustre_super_operations = +{ + .alloc_inode = ll_alloc_inode, + .destroy_inode = ll_destroy_inode, + .drop_inode = ll_drop_inode, + .evict_inode = ll_delete_inode, + .put_super = ll_put_super, + .statfs = ll_statfs, + .umount_begin = ll_umount_begin, + .remount_fs = ll_remount_fs, + .show_options = ll_show_options, +}; + +/** + * This is the entry point for the mount call into Lustre. + * This is called when a client is mounted, and this is + * where we start setting things up. + * + * @lmd2data data Mount options (e.g. -o flock,abort_recov) + */ +static int lustre_fill_super(struct super_block *sb, void *lmd2_data, + int silent) +{ + struct lustre_mount_data *lmd; + struct lustre_sb_info *lsi; + int rc; + + ENTRY; + + CDEBUG(D_MOUNT|D_VFSTRACE, "VFS Op: sb %p\n", sb); + + lsi = lustre_init_lsi(sb); + if (!lsi) + RETURN(-ENOMEM); + lmd = lsi->lsi_lmd; + + /* + * Disable lockdep during mount, because mount locking patterns are + * 'special'. + */ + lockdep_off(); + + /* + * LU-639: the OBD cleanup of last mount may not finish yet, wait here. 
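+	 * obd_zombie_barrier() flushes the zombie import/export cleanup
+	 * worker, so a just-finished unmount of the same target cannot
+	 * race with this mount's setup.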
+	 */
+	obd_zombie_barrier();
+
+	/* Figure out the lmd from the mount options */
+	if (lmd_parse(lmd2_data, lmd)) {
+		lustre_put_lsi(sb);
+		GOTO(out, rc = -EINVAL);
+	}
+
+	if (!lmd_is_client(lmd)) {
+#ifdef HAVE_SERVER_SUPPORT
+#if LUSTRE_VERSION_CODE > OBD_OCD_VERSION(2, 15, 51, 0)
+		static bool printed;
+
+		if (!printed) {
+			LCONSOLE_WARN("%s: mounting server target with '-t lustre' deprecated, use '-t lustre_tgt'\n",
+				      lmd->lmd_profile);
+			printed = true;
+		}
+#endif
+		rc = server_fill_super(sb);
+#else
+		rc = -ENODEV;
+		CERROR("%s: This is client-side-only module, cannot handle server mount: rc = %d\n",
+		       lmd->lmd_profile, rc);
+		lustre_put_lsi(sb);
+#endif
+		GOTO(out, rc);
+	}
+
+	CDEBUG(D_MOUNT, "Mounting client %s\n", lmd->lmd_profile);
+	rc = lustre_start_mgc(sb);
+	if (rc) {
+		lustre_common_put_super(sb);
+		GOTO(out, rc);
+	}
+	/* Connect and start */
+	rc = ll_fill_super(sb);
+	/* ll_fill_super() will call lustre_common_put_super() on failure,
+	 * which takes care of the module reference.
+	 *
+	 * If error happens in fill_super() call, @lsi will be killed there.
+	 * This is why we do not put it here.
+	 */
+out:
+	if (rc) {
+		CERROR("llite: Unable to mount %s: rc = %d\n",
+		       s2lsi(sb) ? lmd->lmd_dev : "", rc);
+	} else {
+		CDEBUG(D_SUPER, "%s: Mount complete\n",
+		       lmd->lmd_dev);
+	}
+	lockdep_on();
+	return rc;
+}
+
+/***************** FS registration ******************/
+static struct dentry *lustre_mount(struct file_system_type *fs_type, int flags,
+				   const char *devname, void *data)
+{
+	return mount_nodev(fs_type, flags, data, lustre_fill_super);
+}
+
+static void lustre_kill_super(struct super_block *sb)
+{
+	struct lustre_sb_info *lsi = s2lsi(sb);
+
+	if (lsi && !IS_SERVER(lsi))
+		ll_kill_super(sb);
+
+	kill_anon_super(sb);
+}
+
+/** Register the "lustre" fs type
+ */
+static struct file_system_type lustre_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "lustre",
+	.mount		= lustre_mount,
+	.kill_sb	= lustre_kill_super,
+	.fs_flags	= FS_RENAME_DOES_D_MOVE,
+};
+MODULE_ALIAS_FS("lustre");
+
+static int __init lustre_init(void)
+{
+	struct lnet_processid lnet_id;
+	int i, rc;
+	unsigned long lustre_inode_cache_flags;
+
+	BUILD_BUG_ON(sizeof(LUSTRE_VOLATILE_HDR) !=
+		     LUSTRE_VOLATILE_HDR_LEN + 1);
+
+	/* print an address of _any_ initialized kernel symbol from this
+	 * module, to allow debugging with gdb that doesn't support data
+	 * symbols from modules.
+	 */
+	CDEBUG(D_INFO, "Lustre client module (%p).\n",
+	       &lustre_super_operations);
+
+	lustre_inode_cache_flags = SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
+				   SLAB_MEM_SPREAD;
+#ifdef SLAB_ACCOUNT
+	lustre_inode_cache_flags |= SLAB_ACCOUNT;
+#endif
+
+	ll_inode_cachep = kmem_cache_create("lustre_inode_cache",
+					    sizeof(struct ll_inode_info),
+					    0, lustre_inode_cache_flags, NULL);
+	if (ll_inode_cachep == NULL)
+		GOTO(out_cache, rc = -ENOMEM);
+
+	ll_file_data_slab = kmem_cache_create("ll_file_data",
+					      sizeof(struct ll_file_data), 0,
+					      SLAB_HWCACHE_ALIGN, NULL);
+	if (ll_file_data_slab == NULL)
+		GOTO(out_cache, rc = -ENOMEM);
+
+	pcc_inode_slab = kmem_cache_create("ll_pcc_inode",
+					   sizeof(struct pcc_inode), 0,
+					   SLAB_HWCACHE_ALIGN, NULL);
+	if (pcc_inode_slab == NULL)
+		GOTO(out_cache, rc = -ENOMEM);
+
+	rc = llite_tunables_register();
+	if (rc)
+		GOTO(out_cache, rc);
+
+	/* Nodes with small feet have little entropy. The NID for this
+	 * node gives the most entropy in the low bits.
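+	 * Each configured LNet NID is therefore mixed into the kernel
+	 * entropy pool via add_device_randomness(); LNetGetId() is
+	 * iterated until it reports -ENOENT, i.e. past the last
+	 * interface.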
*/ + for (i = 0;; i++) { + if (LNetGetId(i, &lnet_id) == -ENOENT) + break; + + add_device_randomness(&lnet_id.nid, sizeof(lnet_id.nid)); + } + + rc = vvp_global_init(); + if (rc != 0) + GOTO(out_tunables, rc); + + cl_inode_fini_env = cl_env_alloc(&cl_inode_fini_refcheck, + LCT_REMEMBER | LCT_NOREF); + if (IS_ERR(cl_inode_fini_env)) + GOTO(out_vvp, rc = PTR_ERR(cl_inode_fini_env)); + + cl_inode_fini_env->le_ctx.lc_cookie = 0x4; + + rc = ll_xattr_init(); + if (rc != 0) + GOTO(out_inode_fini_env, rc); + + rc = register_filesystem(&lustre_fs_type); + if (rc) + GOTO(out_xattr, rc); + + RETURN(0); + +out_xattr: + ll_xattr_fini(); +out_inode_fini_env: + cl_env_put(cl_inode_fini_env, &cl_inode_fini_refcheck); +out_vvp: + vvp_global_fini(); +out_tunables: + llite_tunables_unregister(); +out_cache: + kmem_cache_destroy(ll_inode_cachep); + kmem_cache_destroy(ll_file_data_slab); + kmem_cache_destroy(pcc_inode_slab); + return rc; +} + +static void __exit lustre_exit(void) +{ + unregister_filesystem(&lustre_fs_type); + + llite_tunables_unregister(); + + ll_xattr_fini(); + cl_env_put(cl_inode_fini_env, &cl_inode_fini_refcheck); + vvp_global_fini(); + + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + + kmem_cache_destroy(ll_inode_cachep); + kmem_cache_destroy(ll_file_data_slab); + kmem_cache_destroy(pcc_inode_slab); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre Client File System"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(lustre_init); +module_exit(lustre_exit); diff --git a/drivers/staging/lustrefsx/lustre/llite/symlink.c b/drivers/staging/lustrefsx/lustre/llite/symlink.c new file mode 100644 index 0000000000000..1a4bf5f9aa5db --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/symlink.c @@ -0,0 +1,338 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#include +#include +#include +#include +#define DEBUG_SUBSYSTEM S_LLITE + +#include "llite_internal.h" + +/* Must be called with lli_size_mutex locked */ +/* HAVE_IOP_GET_LINK is defined from kernel 4.5, whereas + * IS_ENCRYPTED is brought by kernel 4.14. + * So there is no need to handle encryption case otherwise. 
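+ *
+ * This is also why ll_readlink_internal() only gains the delayed_call
+ * argument (needed to release llcrypt symlink buffers) in the
+ * HAVE_IOP_GET_LINK variant: kernels old enough to lack ->get_link()
+ * cannot have encrypted symlinks to clean up after.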
+ */ +#ifdef HAVE_IOP_GET_LINK +static int ll_readlink_internal(struct inode *inode, + struct ptlrpc_request **request, + char **symname, struct delayed_call *done) +#else +static int ll_readlink_internal(struct inode *inode, + struct ptlrpc_request **request, char **symname) +#endif +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + int rc, symlen = i_size_read(inode) + 1; + struct mdt_body *body; + struct md_op_data *op_data; + + ENTRY; + + *request = NULL; + + if (lli->lli_symlink_name) { + int print_limit = min_t(int, PAGE_SIZE - 128, symlen); + + *symname = lli->lli_symlink_name; + /* + * If the total CDEBUG() size is larger than a page, it + * will print a warning to the console, avoid this by + * printing just the last part of the symlink. + */ + CDEBUG(D_INODE, "using cached symlink %s%.*s, len = %d\n", + print_limit < symlen ? "..." : "", print_limit, + (*symname) + symlen - print_limit, symlen); + RETURN(0); + } + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, symlen, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_valid = OBD_MD_LINKNAME; + rc = md_getattr(sbi->ll_md_exp, op_data, request); + ll_finish_md_op_data(op_data); + if (rc) { + if (rc != -ENOENT) + CERROR("%s: inode "DFID": rc = %d\n", + ll_i2sbi(inode)->ll_fsname, + PFID(ll_inode2fid(inode)), rc); + GOTO(failed, rc); + } + + body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); + if ((body->mbo_valid & OBD_MD_LINKNAME) == 0) { + CERROR("OBD_MD_LINKNAME not set on reply\n"); + GOTO(failed, rc = -EPROTO); + } + + LASSERT(symlen != 0); + if (body->mbo_eadatasize != symlen) { + CERROR("%s: inode "DFID": symlink length %d not expected %d\n", + sbi->ll_fsname, PFID(ll_inode2fid(inode)), + body->mbo_eadatasize - 1, symlen - 1); + GOTO(failed, rc = -EPROTO); + } + + *symname = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_MD); + if (!*symname || + (!IS_ENCRYPTED(inode) && + strnlen(*symname, symlen) != symlen - 1)) { + /* not full/NULL terminated */ + CERROR("%s: inode "DFID": symlink not NULL terminated string of length %d\n", + sbi->ll_fsname, + PFID(ll_inode2fid(inode)), symlen - 1); + GOTO(failed, rc = -EPROTO); + } + +#ifdef HAVE_IOP_GET_LINK + if (IS_ENCRYPTED(inode)) { + const char *target = llcrypt_get_symlink(inode, *symname, + symlen, done); + if (IS_ERR(target)) + RETURN(PTR_ERR(target)); + symlen = strlen(target) + 1; + *symname = (char *)target; + + /* Do not cache symlink targets encoded without the key, + * since those become outdated once the key is added. 
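+		 * Hence the early return below: the target is handed back
+		 * for this call only, skipping the lli_symlink_name
+		 * caching that follows for regular symlinks.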
+ */ + if (!llcrypt_has_encryption_key(inode)) + RETURN(0); + } +#endif + + OBD_ALLOC(lli->lli_symlink_name, symlen); + /* do not return an error if we cannot cache the symlink locally */ + if (lli->lli_symlink_name) { + memcpy(lli->lli_symlink_name, *symname, symlen); + *symname = lli->lli_symlink_name; + } + RETURN(0); + +failed: + RETURN(rc); +} + +#ifdef HAVE_SYMLINK_OPS_USE_NAMEIDATA +static void ll_put_link(struct dentry *dentry, + struct nameidata *nd, void *cookie) +#else +# ifdef HAVE_IOP_GET_LINK +static void ll_put_link(void *cookie) +# else +static void ll_put_link(struct inode *unused, void *cookie) +# endif +#endif +{ + ptlrpc_req_finished(cookie); +} + +#ifdef HAVE_SYMLINK_OPS_USE_NAMEIDATA +static void *ll_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode = dentry->d_inode; + struct ptlrpc_request *request = NULL; + int rc; + char *symname = NULL; + + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op\n"); + /* + * Limit the recursive symlink depth to 5 instead of default + * 8 links when kernel has 4k stack to prevent stack overflow. + * For 8k stacks we need to limit it to 7 for local servers. + */ + if (THREAD_SIZE < 8192 && current->link_count >= 6) { + rc = -ELOOP; + } else if (THREAD_SIZE == 8192 && current->link_count >= 8) { + rc = -ELOOP; + } else { + ll_inode_size_lock(inode); + rc = ll_readlink_internal(inode, &request, &symname); + ll_inode_size_unlock(inode); + } + if (rc) { + ptlrpc_req_finished(request); + request = NULL; + symname = ERR_PTR(rc); + } + + nd_set_link(nd, symname); + /* + * symname may contain a pointer to the request message buffer, + * we delay request releasing until ll_put_link then. + */ + RETURN(request); +} +#else +# ifdef HAVE_IOP_GET_LINK +static const char *ll_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + struct ptlrpc_request *request; + char *symname = NULL; + int rc; + + ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, inode="DFID"(%p)\n", + dentry, PFID(ll_inode2fid(inode)), inode); + if (!dentry) + RETURN(ERR_PTR(-ECHILD)); + ll_inode_size_lock(inode); + rc = ll_readlink_internal(inode, &request, &symname, done); + ll_inode_size_unlock(inode); + if (rc < 0) { + ptlrpc_req_finished(request); + return ERR_PTR(rc); + } + + /* + * symname may contain a pointer to the request message buffer, + * we delay request releasing then. + */ + set_delayed_call(done, ll_put_link, request); + RETURN(symname); +} +# else +static const char *ll_follow_link(struct dentry *dentry, void **cookie) +{ + struct inode *inode = d_inode(dentry); + struct ptlrpc_request *request; + char *symname = NULL; + int rc; + + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op\n"); + ll_inode_size_lock(inode); + rc = ll_readlink_internal(inode, &request, &symname); + ll_inode_size_unlock(inode); + if (rc < 0) { + ptlrpc_req_finished(request); + return ERR_PTR(rc); + } + + /* + * symname may contain a pointer to the request message buffer, + * we delay request releasing until ll_put_link then. + */ + *cookie = request; + RETURN(symname); +} +# endif /* HAVE_IOP_GET_LINK */ +#endif /* HAVE_SYMLINK_OPS_USE_NAMEIDATA */ + +/** + * ll_getattr_link() - link-specific getattr to set the correct st_size + * for encrypted symlinks + * + * Override st_size of encrypted symlinks to be the length of the decrypted + * symlink target (or the no-key encoded symlink target, if the key is + * unavailable) rather than the length of the encrypted symlink target. 
This is
+ * necessary for st_size to match the symlink target that userspace actually
+ * sees.  POSIX requires this, and some userspace programs depend on it.
+ *
+ * For non-encrypted symlinks, this is just a call to ll_getattr().
+ * For encrypted symlinks, this additionally requires reading the symlink
+ * target from disk if needed, setting up the inode's encryption key if
+ * possible, and then decrypting or encoding the symlink target.  This makes
+ * lstat() more heavyweight than is normally the case.  However, decrypted
+ * symlink targets will be cached in ->i_link, so usually the symlink won't
+ * have to be read and decrypted again later if/when it is actually followed,
+ * readlink() is called, or lstat() is called again.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+#if defined(HAVE_USER_NAMESPACE_ARG) || defined(HAVE_INODEOPS_ENHANCED_GETATTR)
+static int ll_getattr_link(
+#if defined(HAVE_USER_NAMESPACE_ARG)
+	struct user_namespace *mnt_userns,
+#endif
+	const struct path *path, struct kstat *stat,
+	u32 request_mask, unsigned int flags)
+{
+	struct dentry *dentry = path->dentry;
+	struct inode *inode = d_inode(dentry);
+	DEFINE_DELAYED_CALL(done);
+	const char *link;
+	int rc;
+
+	rc = ll_getattr(mnt_userns, path, stat, request_mask, flags);
+	if (rc || !IS_ENCRYPTED(inode))
+		return rc;
+
+	/*
+	 * To get the symlink target that userspace will see (whether it's the
+	 * decrypted target or the no-key encoded target), we can just get it
+	 * in the same way the VFS does during path resolution and readlink().
+	 */
+	link = READ_ONCE(inode->i_link);
+	if (!link) {
+		link = inode->i_op->get_link(dentry, inode, &done);
+		if (IS_ERR(link))
+			return PTR_ERR(link);
+	}
+	stat->size = strlen(link);
+	do_delayed_call(&done);
+	return 0;
+}
+#else /* HAVE_INODEOPS_ENHANCED_GETATTR */
+#define ll_getattr_link ll_getattr
+#endif
+
+const struct inode_operations ll_fast_symlink_inode_operations = {
+#ifdef HAVE_IOP_GENERIC_READLINK
+	.readlink	= generic_readlink,
+#endif
+	.setattr	= ll_setattr,
+#ifdef HAVE_IOP_GET_LINK
+	.get_link	= ll_get_link,
+#else
+	.follow_link	= ll_follow_link,
+	.put_link	= ll_put_link,
+#endif
+	.getattr	= ll_getattr_link,
+	.permission	= ll_inode_permission,
+#ifdef HAVE_IOP_XATTR
+	.setxattr	= ll_setxattr,
+	.getxattr	= ll_getxattr,
+	.removexattr	= ll_removexattr,
+#endif
+	.listxattr	= ll_listxattr,
+};
diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_dev.c b/drivers/staging/lustrefsx/lustre/llite/vvp_dev.c
new file mode 100644
index 0000000000000..d5bb6c18e22ed
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/llite/vvp_dev.c
@@ -0,0 +1,623 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * cl_device and cl_device_type implementation for VVP layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include "llite_internal.h" +#include "vvp_internal.h" +#include + +/***************************************************************************** + * + * Vvp device and device type functions. + * + */ + +/* + * vvp_ prefix stands for "Vfs Vm Posix". It corresponds to historical + * "llite_" (var. "ll_") prefix. + */ + +static struct kmem_cache *ll_thread_kmem; +struct kmem_cache *vvp_object_kmem; +static struct kmem_cache *vvp_session_kmem; +static struct kmem_cache *vvp_thread_kmem; + +static struct lu_kmem_descr vvp_caches[] = { + { + .ckd_cache = &ll_thread_kmem, + .ckd_name = "ll_thread_kmem", + .ckd_size = sizeof(struct ll_thread_info), + }, + { + .ckd_cache = &vvp_object_kmem, + .ckd_name = "vvp_object_kmem", + .ckd_size = sizeof(struct vvp_object), + }, + { + .ckd_cache = &vvp_session_kmem, + .ckd_name = "vvp_session_kmem", + .ckd_size = sizeof (struct vvp_session) + }, + { + .ckd_cache = &vvp_thread_kmem, + .ckd_name = "vvp_thread_kmem", + .ckd_size = sizeof(struct vvp_thread_info), + }, + { + .ckd_cache = NULL + } +}; + +static void *ll_thread_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct ll_thread_info *lti; + + OBD_SLAB_ALLOC_PTR_GFP(lti, ll_thread_kmem, GFP_NOFS); + if (lti == NULL) + lti = ERR_PTR(-ENOMEM); + + return lti; +} + +static void ll_thread_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct ll_thread_info *lti = data; + + OBD_SLAB_FREE_PTR(lti, ll_thread_kmem); +} + +struct lu_context_key ll_thread_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = ll_thread_key_init, + .lct_fini = ll_thread_key_fini, +}; + +static void *vvp_session_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct vvp_session *session; + + OBD_SLAB_ALLOC_PTR_GFP(session, vvp_session_kmem, GFP_NOFS); + if (session == NULL) + session = ERR_PTR(-ENOMEM); + return session; +} + +static void vvp_session_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct vvp_session *session = data; + OBD_SLAB_FREE_PTR(session, vvp_session_kmem); +} + +struct lu_context_key vvp_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = vvp_session_key_init, + .lct_fini = vvp_session_key_fini +}; + +static void *vvp_thread_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct vvp_thread_info *vti; + + OBD_SLAB_ALLOC_PTR_GFP(vti, vvp_thread_kmem, GFP_NOFS); + if (vti == NULL) + vti = ERR_PTR(-ENOMEM); + return vti; +} + +static void vvp_thread_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct vvp_thread_info *vti = data; + OBD_SLAB_FREE_PTR(vti, vvp_thread_kmem); +} + +struct lu_context_key vvp_thread_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = vvp_thread_key_init, + .lct_fini = vvp_thread_key_fini, +}; + +/* type constructor/destructor: vvp_type_{init,fini,start,stop}(). 
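+ *
+ * The LU_TYPE_INIT_FINI() macro below generates those four functions,
+ * registering and unregistering the listed lu_context keys
+ * (ll_thread_key, vvp_session_key, vvp_thread_key) for this device
+ * type; vvp_device_type_ops points at the generated names.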
*/ +LU_TYPE_INIT_FINI(vvp, &ll_thread_key, &vvp_session_key, &vvp_thread_key); + +static const struct lu_device_operations vvp_lu_ops = { + .ldo_object_alloc = vvp_object_alloc +}; + +static struct lu_device *vvp_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct vvp_device *vdv = lu2vvp_dev(d); + struct cl_site *site = lu2cl_site(d->ld_site); + struct lu_device *next = cl2lu_dev(vdv->vdv_next); + + if (d->ld_site != NULL) { + cl_site_fini(site); + OBD_FREE_PTR(site); + } + + cl_device_fini(lu2cl_dev(d)); + OBD_FREE_PTR(vdv); + return next; +} + +static struct lu_device *vvp_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct vvp_device *vdv; + struct lu_device *lud; + struct cl_site *site; + int rc; + ENTRY; + + OBD_ALLOC_PTR(vdv); + if (vdv == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + lud = &vdv->vdv_cl.cd_lu_dev; + cl_device_init(&vdv->vdv_cl, t); + vvp2lu_dev(vdv)->ld_ops = &vvp_lu_ops; + + OBD_ALLOC_PTR(site); + if (site != NULL) { + rc = cl_site_init(site, &vdv->vdv_cl); + if (rc == 0) + rc = lu_site_init_finish(&site->cs_lu); + else { + LASSERT(lud->ld_site == NULL); + CERROR("Cannot init lu_site, rc %d.\n", rc); + OBD_FREE_PTR(site); + } + } else + rc = -ENOMEM; + if (rc != 0) { + vvp_device_free(env, lud); + lud = ERR_PTR(rc); + } + RETURN(lud); +} + +static int vvp_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + struct vvp_device *vdv; + int rc; + ENTRY; + + vdv = lu2vvp_dev(d); + vdv->vdv_next = lu2cl_dev(next); + + LASSERT(d->ld_site != NULL && next->ld_type != NULL); + next->ld_site = d->ld_site; + rc = next->ld_type->ldt_ops->ldto_device_init( + env, next, next->ld_type->ldt_name, NULL); + if (rc == 0) { + lu_device_get(next); + lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init); + } + RETURN(rc); +} + +static struct lu_device *vvp_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + return cl2lu_dev(lu2vvp_dev(d)->vdv_next); +} + +static const struct lu_device_type_operations vvp_device_type_ops = { + .ldto_init = vvp_type_init, + .ldto_fini = vvp_type_fini, + + .ldto_start = vvp_type_start, + .ldto_stop = vvp_type_stop, + + .ldto_device_alloc = vvp_device_alloc, + .ldto_device_free = vvp_device_free, + .ldto_device_init = vvp_device_init, + .ldto_device_fini = vvp_device_fini, +}; + +struct lu_device_type vvp_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_VVP_NAME, + .ldt_ops = &vvp_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; + +unsigned int (*vvp_account_page_dirtied)(struct page *page, + struct address_space *mapping); + +/** + * A mutex serializing calls to vvp_inode_fini() under extreme memory + * pressure, when environments cannot be allocated. + */ +int vvp_global_init(void) +{ + int rc; + + rc = lu_kmem_init(vvp_caches); + if (rc != 0) + return rc; + + rc = lu_device_type_init(&vvp_device_type); + if (rc != 0) + goto out_kmem; + +#ifndef HAVE_ACCOUNT_PAGE_DIRTIED_EXPORT +#ifdef HAVE_KALLSYMS_LOOKUP_NAME + /* + * Kernel v5.2-5678-gac1c3e4 no longer exports account_page_dirtied + */ + vvp_account_page_dirtied = (void *) + cfs_kallsyms_lookup_name("account_page_dirtied"); +#endif +#endif + + return 0; + +out_kmem: + lu_kmem_fini(vvp_caches); + + return rc; +} + +void vvp_global_fini(void) +{ + lu_device_type_fini(&vvp_device_type); + lu_kmem_fini(vvp_caches); +} + +/***************************************************************************** + * + * mirror obd-devices into cl devices. 
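+ *
+ * cl_sb_init() builds the cl-stack for a super block on top of the data
+ * device stack reachable through sbi->ll_dt_exp, caching the resulting
+ * cl_device and lu_site in ll_sb_info; cl_sb_fini() tears that down.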
+ *
+ */
+
+int cl_sb_init(struct super_block *sb)
+{
+	struct ll_sb_info *sbi;
+	struct cl_device *cl;
+	struct lu_env *env;
+	int rc = 0;
+	__u16 refcheck;
+
+	sbi = ll_s2sbi(sb);
+	env = cl_env_get(&refcheck);
+	if (!IS_ERR(env)) {
+		cl = cl_type_setup(env, NULL, &vvp_device_type,
+				   sbi->ll_dt_exp->exp_obd->obd_lu_dev);
+		if (!IS_ERR(cl)) {
+			sbi->ll_cl = cl;
+			sbi->ll_site = cl2lu_dev(cl)->ld_site;
+		}
+		cl_env_put(env, &refcheck);
+	} else
+		rc = PTR_ERR(env);
+	RETURN(rc);
+}
+
+int cl_sb_fini(struct super_block *sb)
+{
+	struct ll_sb_info *sbi;
+	struct lu_env *env;
+	struct cl_device *cld;
+	__u16 refcheck;
+	int result;
+
+	ENTRY;
+	sbi = ll_s2sbi(sb);
+	env = cl_env_get(&refcheck);
+	if (!IS_ERR(env)) {
+		cld = sbi->ll_cl;
+
+		if (cld != NULL) {
+			cl_stack_fini(env, cld);
+			sbi->ll_cl = NULL;
+			sbi->ll_site = NULL;
+		}
+		cl_env_put(env, &refcheck);
+		result = 0;
+	} else {
+		CERROR("Cannot cleanup cl-stack due to memory shortage.\n");
+		result = PTR_ERR(env);
+	}
+
+	RETURN(result);
+}
+
+/****************************************************************************
+ *
+ * debugfs/lustre/llite/$MNT/dump_page_cache
+ *
+ ****************************************************************************/
+
+struct vvp_seq_private {
+	struct ll_sb_info	*vsp_sbi;
+	struct lu_env		*vsp_env;
+	u16			vsp_refcheck;
+	struct cl_object	*vsp_clob;
+	struct rhashtable_iter	vsp_iter;
+	u32			vsp_page_index;
+	/*
+	 * prev_pos is the 'pos' of the last object returned
+	 * by ->start or ->next.
+	 */
+	loff_t			vvp_prev_pos;
+};
+
+unsigned int ll_filemap_get_one_page_contig(struct address_space *mapping,
+					    pgoff_t start, struct page **pg)
+{
+#ifdef HAVE_FILEMAP_GET_FOLIOS_CONTIG
+	struct folio_batch fbatch;
+	int nr;
+
+	folio_batch_init(&fbatch);
+	*pg = NULL;
+
+	nr = filemap_get_folios_contig(mapping, &start, start, &fbatch);
+	if (nr == PAGEVEC_SIZE) {
+		--nr;
+		*pg = folio_page(fbatch.folios[nr], 0);
+		return 1;
+	}
+	return 0;
+#else /* !HAVE_FILEMAP_GET_FOLIOS_CONTIG */
+	return find_get_pages_contig(mapping, start, 1, pg);
+#endif
+}
+
+static struct page *vvp_pgcache_current(struct vvp_seq_private *priv)
+{
+	struct lu_device *dev = &priv->vsp_sbi->ll_cl->cd_lu_dev;
+	struct lu_object_header *h;
+	struct page *vmpage = NULL;
+
+	rhashtable_walk_start(&priv->vsp_iter);
+	while ((h = rhashtable_walk_next(&priv->vsp_iter)) != NULL) {
+		struct inode *inode;
+		int nr;
+
+		if (IS_ERR(h)) {
+			if (PTR_ERR(h) == -EAGAIN)
+				continue;
+			break;
+		}
+
+		if (!priv->vsp_clob) {
+			struct lu_object *lu_obj;
+
+			lu_obj = lu_object_get_first(h, dev);
+			if (!lu_obj)
+				continue;
+
+			priv->vsp_clob = lu2cl(lu_obj);
+			lu_object_ref_add_atomic(lu_obj, "dump", current);
+			priv->vsp_page_index = 0;
+		}
+
+		inode = vvp_object_inode(priv->vsp_clob);
+		nr = ll_filemap_get_one_page_contig(inode->i_mapping,
+						    priv->vsp_page_index,
+						    &vmpage);
+		if (nr > 0) {
+			priv->vsp_page_index = vmpage->index;
+			break;
+		}
+		lu_object_ref_del(&priv->vsp_clob->co_lu, "dump", current);
+		cl_object_put(priv->vsp_env, priv->vsp_clob);
+		priv->vsp_clob = NULL;
+		priv->vsp_page_index = 0;
+	}
+	rhashtable_walk_stop(&priv->vsp_iter);
+	return vmpage;
+}
+
+#define seq_page_flag(seq, page, flag, has_flags) do {			\
+	if (test_bit(PG_##flag, &(page)->flags)) {			\
+		seq_printf(seq, "%s"#flag, has_flags ?
"|" : ""); \ + has_flags = 1; \ + } \ +} while(0) + +static void vvp_pgcache_page_show(const struct lu_env *env, + struct seq_file *seq, struct cl_page *page) +{ + struct vvp_page *vpg; + struct page *vmpage; + int has_flags; + + vpg = cl2vvp_page(cl_page_at(page, &vvp_device_type)); + vmpage = vpg->vpg_page; + seq_printf(seq, " %5i | %p %p %s %s %s | %p "DFID"(%p) %lu %u [", + 0 /* gen */, + vpg, page, + "none", + vpg->vpg_defer_uptodate ? "du" : "- ", + PageWriteback(vmpage) ? "wb" : "-", + vmpage, + PFID(ll_inode2fid(vmpage->mapping->host)), + vmpage->mapping->host, vmpage->index, + page_count(vmpage)); + has_flags = 0; + seq_page_flag(seq, vmpage, locked, has_flags); + seq_page_flag(seq, vmpage, error, has_flags); + seq_page_flag(seq, vmpage, referenced, has_flags); + seq_page_flag(seq, vmpage, uptodate, has_flags); + seq_page_flag(seq, vmpage, dirty, has_flags); + seq_page_flag(seq, vmpage, writeback, has_flags); + seq_printf(seq, "%s]\n", has_flags ? "" : "-"); +} + +static int vvp_pgcache_show(struct seq_file *f, void *v) +{ + struct vvp_seq_private *priv = f->private; + struct page *vmpage = v; + struct cl_page *page; + + seq_printf(f, "%8lx@" DFID ": ", vmpage->index, + PFID(lu_object_fid(&priv->vsp_clob->co_lu))); + lock_page(vmpage); + page = cl_vmpage_page(vmpage, priv->vsp_clob); + unlock_page(vmpage); + put_page(vmpage); + + if (page) { + vvp_pgcache_page_show(priv->vsp_env, f, page); + cl_page_put(priv->vsp_env, page); + } else { + seq_puts(f, "missing\n"); + } + + return 0; +} + +static void vvp_pgcache_rewind(struct vvp_seq_private *priv) +{ + if (priv->vvp_prev_pos) { + struct lu_site *s = priv->vsp_sbi->ll_cl->cd_lu_dev.ld_site; + + rhashtable_walk_exit(&priv->vsp_iter); + rhashtable_walk_enter(&s->ls_obj_hash, &priv->vsp_iter); + priv->vvp_prev_pos = 0; + if (priv->vsp_clob) { + lu_object_ref_del(&priv->vsp_clob->co_lu, "dump", + current); + cl_object_put(priv->vsp_env, priv->vsp_clob); + } + priv->vsp_clob = NULL; + } +} + +static struct page *vvp_pgcache_next_page(struct vvp_seq_private *priv) +{ + priv->vsp_page_index += 1; + return vvp_pgcache_current(priv); +} + +static void *vvp_pgcache_start(struct seq_file *f, loff_t *pos) +{ + struct vvp_seq_private *priv = f->private; + + if (*pos == 0) { + vvp_pgcache_rewind(priv); + } else if (*pos == priv->vvp_prev_pos) { + /* Return the current item */; + } else { + WARN_ON(*pos != priv->vvp_prev_pos + 1); + priv->vsp_page_index += 1; + } + + priv->vvp_prev_pos = *pos; + return vvp_pgcache_current(priv); +} + +static void *vvp_pgcache_next(struct seq_file *f, void *v, loff_t *pos) +{ + struct vvp_seq_private *priv = f->private; + + WARN_ON(*pos != priv->vvp_prev_pos); + *pos += 1; + priv->vvp_prev_pos = *pos; + return vvp_pgcache_next_page(priv); +} + +static void vvp_pgcache_stop(struct seq_file *f, void *v) +{ + /* Nothing to do */ +} + +static const struct seq_operations vvp_pgcache_ops = { + .start = vvp_pgcache_start, + .next = vvp_pgcache_next, + .stop = vvp_pgcache_stop, + .show = vvp_pgcache_show +}; + +static int vvp_dump_pgcache_seq_open(struct inode *inode, struct file *filp) +{ + struct vvp_seq_private *priv; + struct lu_site *s; + + priv = __seq_open_private(filp, &vvp_pgcache_ops, sizeof(*priv)); + if (!priv) + return -ENOMEM; + + priv->vsp_sbi = inode->i_private; + priv->vsp_env = cl_env_get(&priv->vsp_refcheck); + priv->vsp_clob = NULL; + if (IS_ERR(priv->vsp_env)) { + int err = PTR_ERR(priv->vsp_env); + + seq_release_private(inode, filp); + return err; + } + + s = priv->vsp_sbi->ll_cl->cd_lu_dev.ld_site; + 
rhashtable_walk_enter(&s->ls_obj_hash, &priv->vsp_iter); + + return 0; +} + +static int vvp_dump_pgcache_seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct vvp_seq_private *priv = seq->private; + + if (priv->vsp_clob) { + lu_object_ref_del(&priv->vsp_clob->co_lu, "dump", current); + cl_object_put(priv->vsp_env, priv->vsp_clob); + } + cl_env_put(priv->vsp_env, &priv->vsp_refcheck); + rhashtable_walk_exit(&priv->vsp_iter); + return seq_release_private(inode, file); +} + +const struct file_operations vvp_dump_pgcache_file_ops = { + .owner = THIS_MODULE, + .open = vvp_dump_pgcache_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = vvp_dump_pgcache_seq_release, +}; diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_internal.h b/drivers/staging/lustrefsx/lustre/llite/vvp_internal.h new file mode 100644 index 0000000000000..1511c320522d8 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_internal.h @@ -0,0 +1,311 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2013, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Internal definitions for VVP layer. + * + * Author: Nikita Danilov + */ + +#ifndef VVP_INTERNAL_H +#define VVP_INTERNAL_H + +#include + +enum obd_notify_event; +struct inode; +struct lustre_md; +struct obd_device; +struct obd_export; +struct page; + +/** + * IO state private to VVP layer. + */ +struct vvp_io { + /** super class */ + struct cl_io_slice vui_cl; + struct cl_io_lock_link vui_link; + /** + * I/O vector information to or from which read/write is going. + */ + struct iov_iter *vui_iter; + /** + * Total size for the left IO. + */ + size_t vui_tot_count; + + union { + struct vvp_fault_io { + struct vm_area_struct *ft_vma; + /** + * locked page returned from vvp_io + */ + struct page *ft_vmpage; + /** + * kernel fault info + */ + struct vm_fault *ft_vmf; + /** + * fault API used bitflags for return code. + */ + unsigned int ft_flags; + /** + * check that flags are from filemap_fault + */ + bool ft_flags_valid; + struct cl_page_list ft_queue; + } fault; + struct { + struct cl_page_list vui_queue; + unsigned long vui_written; + unsigned long vui_read; + int vui_from; + int vui_to; + } readwrite; /* normal io */ + } u; + + /** + * Layout version when this IO is initialized + */ + __u32 vui_layout_gen; + /** + * File descriptor against which IO is done. + */ + struct ll_file_data *vui_fd; + struct kiocb *vui_iocb; + + /* Readahead state. 
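+	 * vui_ra_start_idx and vui_ra_pages describe the window, in
+	 * pages, that the read path asked readahead to cover; they are
+	 * only meaningful while vui_ra_valid is set.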
*/ + pgoff_t vui_ra_start_idx; + pgoff_t vui_ra_pages; + /* Set when vui_ra_{start,count} have been initialized. */ + bool vui_ra_valid; +}; + +extern struct lu_device_type vvp_device_type; + +extern struct lu_context_key vvp_session_key; +extern struct lu_context_key vvp_thread_key; + +extern struct kmem_cache *vvp_object_kmem; + +struct vvp_thread_info { + struct cl_lock vti_lock; + struct cl_lock_descr vti_descr; + struct cl_io vti_io; + struct cl_attr vti_attr; + struct cl_sync_io vti_anchor; +}; + +static inline struct vvp_thread_info *vvp_env_info(const struct lu_env *env) +{ + struct vvp_thread_info *vti; + + vti = lu_context_key_get(&env->le_ctx, &vvp_thread_key); + LASSERT(vti != NULL); + + return vti; +} + +static inline struct cl_lock *vvp_env_lock(const struct lu_env *env) +{ + struct cl_lock *lock = &vvp_env_info(env)->vti_lock; + + memset(lock, 0, sizeof(*lock)); + + return lock; +} + +static inline struct cl_attr *vvp_env_thread_attr(const struct lu_env *env) +{ + struct cl_attr *attr = &vvp_env_info(env)->vti_attr; + + memset(attr, 0, sizeof(*attr)); + + return attr; +} + +static inline struct cl_io *vvp_env_thread_io(const struct lu_env *env) +{ + struct cl_io *io = &vvp_env_info(env)->vti_io; + + memset(io, 0, sizeof(*io)); + + return io; +} + +struct vvp_session { + struct vvp_io vs_ios; +}; + +static inline struct vvp_session *vvp_env_session(const struct lu_env *env) +{ + struct vvp_session *ses; + + ses = lu_context_key_get(env->le_ses, &vvp_session_key); + LASSERT(ses != NULL); + + return ses; +} + +static inline struct vvp_io *vvp_env_io(const struct lu_env *env) +{ + return &vvp_env_session(env)->vs_ios; +} + +/** + * VPP-private object state. + */ +struct vvp_object { + struct cl_object_header vob_header; + struct cl_object vob_cl; + struct inode *vob_inode; + + /** + * Number of outstanding mmaps on this file. + * + * \see ll_vm_open(), ll_vm_close(). + */ + atomic_t vob_mmap_cnt; + + /** + * various flags + * vob_discard_page_warned + * if pages belonging to this object are discarded when a client + * is evicted, some debug info will be printed, this flag will be set + * during processing the first discarded page, then avoid flooding + * debug message for lots of discarded pages. + * + * \see ll_dirty_page_discard_warn. + */ + unsigned int vob_discard_page_warned:1; +}; + +/** + * VVP-private page state. 
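+ *
+ * vpg_defer_uptodate and the vpg_ra_* bits carry readahead
+ * bookkeeping for the page; vpg_page is the VM page this
+ * cl_page_slice is bound to.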
+ */ +struct vvp_page { + struct cl_page_slice vpg_cl; + unsigned vpg_defer_uptodate:1, + vpg_ra_updated:1, + vpg_ra_used:1; + /** VM page */ + struct page *vpg_page; +}; + +static inline struct vvp_page *cl2vvp_page(const struct cl_page_slice *slice) +{ + return container_of(slice, struct vvp_page, vpg_cl); +} + +static inline pgoff_t vvp_index(struct vvp_page *vpg) +{ + return vpg->vpg_page->index; +} + +struct vvp_device { + struct cl_device vdv_cl; + struct cl_device *vdv_next; +}; + +static inline struct lu_device *vvp2lu_dev(struct vvp_device *vdv) +{ + return &vdv->vdv_cl.cd_lu_dev; +} + +static inline struct vvp_device *lu2vvp_dev(const struct lu_device *d) +{ + return container_of_safe(d, struct vvp_device, vdv_cl.cd_lu_dev); +} + +static inline struct vvp_device *cl2vvp_dev(const struct cl_device *d) +{ + return container_of_safe(d, struct vvp_device, vdv_cl); +} + +static inline struct vvp_object *cl2vvp(const struct cl_object *obj) +{ + return container_of_safe(obj, struct vvp_object, vob_cl); +} + +static inline struct vvp_object *lu2vvp(const struct lu_object *obj) +{ + return container_of_safe(obj, struct vvp_object, vob_cl.co_lu); +} + +static inline struct inode *vvp_object_inode(const struct cl_object *obj) +{ + return cl2vvp(obj)->vob_inode; +} + +int vvp_object_invariant(const struct cl_object *obj); +struct vvp_object *cl_inode2vvp(struct inode *inode); + +static inline struct page *cl2vm_page(const struct cl_page_slice *slice) +{ + return cl2vvp_page(slice)->vpg_page; +} + +#ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK +# define CLOBINVRNT(env, clob, expr) \ + do { \ + if (unlikely(!(expr))) { \ + LU_OBJECT_DEBUG(D_ERROR, (env), &(clob)->co_lu, \ + #expr); \ + LINVRNT(0); \ + } \ + } while (0) +#else /* !CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK */ +# define CLOBINVRNT(env, clob, expr) \ + ((void)sizeof(env), (void)sizeof(clob), (void)sizeof !!(expr)) +#endif /* CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK */ + +int lov_read_and_clear_async_rc(struct cl_object *clob); + +int vvp_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); +int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io); +int vvp_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index); +struct lu_object *vvp_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev); + +int vvp_global_init(void); +void vvp_global_fini(void); + +#if !defined(HAVE_ACCOUNT_PAGE_DIRTIED_EXPORT) || \ +defined(HAVE_KALLSYMS_LOOKUP_NAME) +extern unsigned int (*vvp_account_page_dirtied)(struct page *page, + struct address_space *mapping); +#endif + +extern const struct file_operations vvp_dump_pgcache_file_ops; + +#endif /* VVP_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_io.c b/drivers/staging/lustrefsx/lustre/llite/vvp_io.c new file mode 100644 index 0000000000000..37421981c2e10 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_io.c @@ -0,0 +1,1853 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Implementation of cl_io for VVP layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include + +#include "llite_internal.h" +#include "vvp_internal.h" +#include + +static struct vvp_io *cl2vvp_io(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct vvp_io *vio; + + vio = container_of(slice, struct vvp_io, vui_cl); + LASSERT(vio == vvp_env_io(env)); + + return vio; +} + +/** + * For swapping layout. The file's layout may have changed. + * To avoid populating pages to a wrong stripe, we have to verify the + * correctness of layout. It works because swapping layout processes + * have to acquire group lock. + */ +static bool can_populate_pages(const struct lu_env *env, struct cl_io *io, + struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct vvp_io *vio = vvp_env_io(env); + bool rc = true; + + switch (io->ci_type) { + case CIT_READ: + case CIT_WRITE: + /* don't need lock here to check lli_layout_gen as we have held + * extent lock and GROUP lock has to hold to swap layout */ + if (ll_layout_version_get(lli) != vio->vui_layout_gen || + OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_LOST_LAYOUT, 0)) { + io->ci_need_restart = 1; + /* this will cause a short read/write */ + io->ci_continue = 0; + rc = false; + } + case CIT_FAULT: + /* fault is okay because we've already had a page. */ + default: + break; + } + + return rc; +} + +static void vvp_object_size_lock(struct cl_object *obj) +{ + struct inode *inode = vvp_object_inode(obj); + + ll_inode_size_lock(inode); + cl_object_attr_lock(obj); +} + +static void vvp_object_size_unlock(struct cl_object *obj) +{ + struct inode *inode = vvp_object_inode(obj); + + cl_object_attr_unlock(obj); + ll_inode_size_unlock(inode); +} + +/** + * Helper function that if necessary adjusts file size (inode->i_size), when + * position at the offset \a pos is accessed. File size can be arbitrary stale + * on a Lustre client, but client at least knows KMS. If accessed area is + * inside [0, KMS], set file size to KMS, otherwise glimpse file size. + * + * Locking: i_size_lock is used to serialize changes to inode size and to + * protect consistency between inode size and cl_object + * attributes. cl_object_size_lock() protects consistency between cl_attr's of + * top-object and sub-objects. 
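+ *
+ * If the accessed extent reaches past the known minimum size (KMS), a
+ * glimpse lock is taken to fetch the authoritative size from the
+ * servers, and *exceed is set when even the glimpsed size shows the
+ * access to start beyond end-of-file.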
+ */
+static int vvp_prep_size(const struct lu_env *env, struct cl_object *obj,
+			 struct cl_io *io, loff_t start, size_t count,
+			 int *exceed)
+{
+	struct cl_attr *attr = vvp_env_thread_attr(env);
+	struct inode *inode = vvp_object_inode(obj);
+	loff_t pos = start + count - 1;
+	loff_t kms;
+	int result;
+
+	/*
+	 * Consistency guarantees: the following possibilities exist for the
+	 * relation between the region being accessed and the real file size
+	 * at this moment:
+	 *
+	 * (A): the region is completely inside of the file;
+	 *
+	 * (B-x): x bytes of region are inside of the file, the rest is
+	 * outside;
+	 *
+	 * (C): the region is completely outside of the file.
+	 *
+	 * This classification is stable under the DLM lock already acquired
+	 * by the caller, because to change the class another client has to
+	 * take a DLM lock conflicting with our lock. Also, any updates to
+	 * ->i_size by other threads on this client are serialized by
+	 * ll_inode_size_lock(). This guarantees that short reads are handled
+	 * correctly in the face of concurrent writes and truncates.
+	 */
+	vvp_object_size_lock(obj);
+	result = cl_object_attr_get(env, obj, attr);
+	if (result == 0) {
+		kms = attr->cat_kms;
+		if (pos > kms) {
+			/*
+			 * A glimpse is necessary to determine whether we
+			 * return a short read (B) or some zeroes at the end
+			 * of the buffer (C)
+			 */
+			vvp_object_size_unlock(obj);
+			result = cl_glimpse_lock(env, io, inode, obj, 0);
+			if (result == 0 && exceed != NULL) {
+				/* If the target page index exceeds the
+				 * end-of-file page index, return directly.
+				 * Do not expect the kernel to check such a
+				 * case correctly; linux-2.6.18-128.1.1
+				 * missed doing that.
+				 * --bug 17336 */
+				loff_t size = i_size_read(inode);
+				unsigned long cur_index = start >>
+					PAGE_SHIFT;
+
+				if ((size == 0 && cur_index != 0) ||
+				    (((size - 1) >> PAGE_SHIFT) <
+				     cur_index))
+					*exceed = 1;
+			}
+
+			return result;
+		} else {
+			/*
+			 * region is within kms and, hence, within real file
+			 * size (A). We need to increase i_size to cover the
+			 * read region so that generic_file_read() will do its
+			 * job, but that doesn't mean the kms size is
+			 * _correct_, it is only the _minimum_ size. If
+			 * someone does a stat they will get the correct size
+			 * which will always be >= the kms value here.
+			 * b=11081
+			 */
+			if (i_size_read(inode) < kms) {
+				i_size_write(inode, kms);
+				CDEBUG(D_VFSTRACE,
+				       DFID" updating i_size %llu\n",
+				       PFID(lu_object_fid(&obj->co_lu)),
+				       (__u64)i_size_read(inode));
+			}
+		}
+	}
+
+	vvp_object_size_unlock(obj);
+
+	return result;
+}
+
+/*****************************************************************************
+ *
+ * io operations.
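+ *
+ * (Editor's note, derived from the vvp_io_ops table at the bottom of
+ * this file: for CIT_READ/CIT_WRITE the slice methods run roughly as
+ *
+ *	cio_iter_init -> cio_lock -> cio_start -> cio_end -> cio_advance
+ *
+ * per iteration, with cio_fini invoked once the whole io completes;
+ * setattr, fault and lseek wire up analogous subsets.)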
+ * + */ + +static int vvp_io_one_lock_index(const struct lu_env *env, struct cl_io *io, + __u32 enqflags, enum cl_lock_mode mode, + pgoff_t start, pgoff_t end) +{ + struct vvp_io *vio = vvp_env_io(env); + struct cl_lock_descr *descr = &vio->vui_link.cill_descr; + struct cl_object *obj = io->ci_obj; + + CLOBINVRNT(env, obj, vvp_object_invariant(obj)); + ENTRY; + + CDEBUG(D_VFSTRACE, "lock: %d [%lu, %lu]\n", mode, start, end); + + memset(&vio->vui_link, 0, sizeof vio->vui_link); + + if (vio->vui_fd && (vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) { + descr->cld_mode = CLM_GROUP; + descr->cld_gid = vio->vui_fd->fd_grouplock.lg_gid; + enqflags |= CEF_LOCK_MATCH; + } else { + descr->cld_mode = mode; + } + + descr->cld_obj = obj; + descr->cld_start = start; + descr->cld_end = end; + descr->cld_enq_flags = enqflags; + + cl_io_lock_add(env, io, &vio->vui_link); + + RETURN(0); +} + +static int vvp_io_one_lock(const struct lu_env *env, struct cl_io *io, + __u32 enqflags, enum cl_lock_mode mode, + loff_t start, loff_t end) +{ + struct cl_object *obj = io->ci_obj; + + return vvp_io_one_lock_index(env, io, enqflags, mode, + cl_index(obj, start), cl_index(obj, end)); +} + +static int vvp_io_write_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + + cl_page_list_init(&vio->u.readwrite.vui_queue); + vio->u.readwrite.vui_written = 0; + vio->u.readwrite.vui_from = 0; + vio->u.readwrite.vui_to = PAGE_SIZE; + + return 0; +} + +static int vvp_io_read_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + + vio->u.readwrite.vui_read = 0; + + return 0; +} + +static void vvp_io_write_iter_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + + LASSERT(vio->u.readwrite.vui_queue.pl_nr == 0); +} + +static int vvp_io_fault_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + struct inode *inode = vvp_object_inode(ios->cis_obj); + + LASSERT(inode == file_inode(vio->vui_fd->fd_file)); + + return 0; +} + +static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct vvp_io *vio = cl2vvp_io(env, ios); + struct inode *inode = vvp_object_inode(obj); + __u32 gen = 0; + int rc; + ENTRY; + + CLOBINVRNT(env, obj, vvp_object_invariant(obj)); + + CDEBUG(D_VFSTRACE, DFID" ignore/verify layout %d/%d, layout version %d " + "need write layout %d, restore needed %d, invalidate_lock %d\n", + PFID(lu_object_fid(&obj->co_lu)), + io->ci_ignore_layout, io->ci_verify_layout, + vio->vui_layout_gen, io->ci_need_write_intent, + io->ci_restore_needed, io->ci_invalidate_page_cache); + +#ifdef HAVE_INVALIDATE_LOCK + if (io->ci_invalidate_page_cache) { + filemap_invalidate_unlock(inode->i_mapping); + io->ci_invalidate_page_cache = 0; + } +#endif /* HAVE_INVALIDATE_LOCK */ + + if (io->ci_restore_needed) { + /* file was detected release, we need to restore it + * before finishing the io + */ + rc = ll_layout_restore(inode, 0, OBD_OBJECT_EOF); + /* if restore registration failed, no restart, + * we will return -ENODATA */ + /* The layout will change after restore, so we need to + * block on layout lock held by the MDT + * as MDT will not send new layout in lvb (see LU-3124) + * we have to explicitly fetch it, all this will be done + * by ll_layout_refresh(). 
+ * Even if ll_layout_restore() returns zero, it doesn't mean + * that restore has been successful. Therefore it sets + * ci_verify_layout so that it will check layout at the end + * of this function. + */ + if (rc) { + io->ci_restore_needed = 1; + io->ci_need_restart = 0; + io->ci_verify_layout = 0; + io->ci_result = rc; + GOTO(out, rc); + } + + io->ci_restore_needed = 0; + + /* Even if ll_layout_restore() returns zero, it doesn't mean + * that restore has been successful. Therefore it should verify + * if there was layout change and restart I/O correspondingly. + */ + ll_layout_refresh(inode, &gen); + io->ci_need_restart = vio->vui_layout_gen != gen; + if (io->ci_need_restart) { + CDEBUG(D_VFSTRACE, + DFID" layout changed from %d to %d.\n", + PFID(lu_object_fid(&obj->co_lu)), + vio->vui_layout_gen, gen); + /* today successful restore is the only possible + * case */ + /* restore was done, clear restoring state */ + clear_bit(LLIF_FILE_RESTORING, + &ll_i2info(vvp_object_inode(obj))->lli_flags); + } + GOTO(out, 0); + } + + /** + * dynamic layout change needed, send layout intent + * RPC. + */ + if (io->ci_need_write_intent) { + enum layout_intent_opc opc = LAYOUT_INTENT_WRITE; + + io->ci_need_write_intent = 0; + + LASSERT(io->ci_type == CIT_WRITE || cl_io_is_fallocate(io) || + cl_io_is_trunc(io) || cl_io_is_mkwrite(io)); + + CDEBUG(D_VFSTRACE, DFID" write layout, type %u "DEXT"\n", + PFID(lu_object_fid(&obj->co_lu)), io->ci_type, + PEXT(&io->ci_write_intent)); + + if (cl_io_is_trunc(io)) + opc = LAYOUT_INTENT_TRUNC; + + rc = ll_layout_write_intent(inode, opc, &io->ci_write_intent); + io->ci_result = rc; + if (!rc) + io->ci_need_restart = 1; + GOTO(out, rc); + } + + if (!io->ci_need_restart && + !io->ci_ignore_layout && io->ci_verify_layout) { + /* check layout version */ + ll_layout_refresh(inode, &gen); + io->ci_need_restart = vio->vui_layout_gen != gen; + if (io->ci_need_restart) { + CDEBUG(D_VFSTRACE, + DFID" layout changed from %d to %d.\n", + PFID(lu_object_fid(&obj->co_lu)), + vio->vui_layout_gen, gen); + } + GOTO(out, 0); + } +out: + EXIT; +} + +static void vvp_io_fault_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct cl_page *page = io->u.ci_fault.ft_page; + + CLOBINVRNT(env, io->ci_obj, vvp_object_invariant(io->ci_obj)); + + if (page != NULL) { + lu_ref_del(&page->cp_reference, "fault", io); + cl_page_put(env, page); + io->u.ci_fault.ft_page = NULL; + } + vvp_io_fini(env, ios); +} + +static enum cl_lock_mode vvp_mode_from_vma(struct vm_area_struct *vma) +{ + /* + * we only want to hold PW locks if the mmap() can generate + * writes back to the file and that only happens in shared + * writable vmas + */ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) + return CLM_WRITE; + return CLM_READ; +} + +static int vvp_mmap_locks(const struct lu_env *env, + struct vvp_io *vio, struct cl_io *io) +{ + struct vvp_thread_info *vti = vvp_env_info(env); + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct cl_lock_descr *descr = &vti->vti_descr; + union ldlm_policy_data policy; + struct iovec iov; + struct iov_iter i; + unsigned long addr; + ssize_t count; + int result = 0; + ENTRY; + + LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + + /* nfs or loop back device write */ + if (vio->vui_iter == NULL) + RETURN(0); + + /* No MM (e.g. NFS)? No vmas too. 
*/
+	if (mm == NULL)
+		RETURN(0);
+
+	if (!iter_is_iovec(vio->vui_iter) && !iov_iter_is_kvec(vio->vui_iter))
+		RETURN(0);
+
+	for (i = *vio->vui_iter;
+	     iov_iter_count(&i);
+	     iov_iter_advance(&i, iov.iov_len)) {
+		iov = iov_iter_iovec(&i);
+		addr = (unsigned long)iov.iov_base;
+		count = iov.iov_len;
+
+		if (count == 0)
+			continue;
+
+		count += addr & ~PAGE_MASK;
+		addr &= PAGE_MASK;
+
+		mmap_read_lock(mm);
+		while ((vma = our_vma(mm, addr, count)) != NULL) {
+			struct dentry *de = file_dentry(vma->vm_file);
+			struct inode *inode = de->d_inode;
+			int flags = CEF_MUST;
+
+			if (ll_file_nolock(vma->vm_file)) {
+				/*
+				 * mmap is not allowed when file locking
+				 * is disabled (nolock)
+				 */
+				result = -EINVAL;
+				break;
+			}
+
+			/*
+			 * XXX: Required lock mode can be weakened: CIT_WRITE
+			 * io only ever reads user level buffer, and CIT_READ
+			 * only writes on it.
+			 */
+			policy_from_vma(&policy, vma, addr, count);
+			descr->cld_mode = vvp_mode_from_vma(vma);
+			descr->cld_obj = ll_i2info(inode)->lli_clob;
+			descr->cld_start = cl_index(descr->cld_obj,
+						    policy.l_extent.start);
+			descr->cld_end = cl_index(descr->cld_obj,
+						  policy.l_extent.end);
+			descr->cld_enq_flags = flags;
+			result = cl_io_lock_alloc_add(env, io, descr);
+
+			CDEBUG(D_VFSTRACE, "lock: %d: [%lu, %lu]\n",
+			       descr->cld_mode, descr->cld_start,
+			       descr->cld_end);
+
+			if (result < 0)
+				break;
+
+			if (vma->vm_end - addr >= count)
+				break;
+
+			count -= vma->vm_end - addr;
+			addr = vma->vm_end;
+		}
+		mmap_read_unlock(mm);
+		if (result < 0)
+			break;
+	}
+	RETURN(result);
+}
+
+static void vvp_io_advance(const struct lu_env *env,
+			   const struct cl_io_slice *ios,
+			   size_t nob)
+{
+	struct cl_object *obj = ios->cis_io->ci_obj;
+	struct vvp_io *vio = cl2vvp_io(env, ios);
+
+	CLOBINVRNT(env, obj, vvp_object_invariant(obj));
+
+	/*
+	 * Since 3.16 (commit 26978b8b4) the VFS reverts the iov_iter to its
+	 * original position even when the I/O succeeded, so instead of
+	 * relying on the VFS, we advance the iov_iter ourselves.
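+	 *
+	 * (Editor's sketch, assuming nob bytes were just consumed out of a
+	 * total of vui_tot_count: the statements below amount to
+	 *
+	 *	iov_iter_advance(iter, nob);		// skip what was written
+	 *	total -= nob;
+	 *	iov_iter_reexpand(iter, total);		// undo the VFS revert
+	 *
+	 * so the iterator describes exactly the not-yet-transferred tail.)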
+ */ + iov_iter_advance(vio->vui_iter, nob); + CDEBUG(D_VFSTRACE, "advancing %ld bytes\n", nob); + vio->vui_tot_count -= nob; + iov_iter_reexpand(vio->vui_iter, vio->vui_tot_count); +} + +static void vvp_io_update_iov(const struct lu_env *env, + struct vvp_io *vio, struct cl_io *io) +{ + size_t size = io->u.ci_rw.crw_count; + + if (!vio->vui_iter) + return; + + iov_iter_truncate(vio->vui_iter, size); +} + +static int vvp_io_rw_lock(const struct lu_env *env, struct cl_io *io, + enum cl_lock_mode mode, loff_t start, loff_t end) +{ + struct vvp_io *vio = vvp_env_io(env); + int result; + int ast_flags = 0; + + LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + ENTRY; + + vvp_io_update_iov(env, vio, io); + + if (io->u.ci_rw.crw_nonblock) + ast_flags |= CEF_NONBLOCK; + if (io->ci_lock_no_expand) + ast_flags |= CEF_LOCK_NO_EXPAND; + if (vio->vui_fd) { + /* Group lock held means no lockless any more */ + if (vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED) + io->ci_dio_lock = 1; + + if (ll_file_nolock(vio->vui_fd->fd_file) || + (vio->vui_fd->fd_file->f_flags & O_DIRECT && + !io->ci_dio_lock)) + ast_flags |= CEF_NEVER; + } + + result = vvp_mmap_locks(env, vio, io); + if (result == 0) + result = vvp_io_one_lock(env, io, ast_flags, mode, start, end); + + RETURN(result); +} + +static int vvp_io_read_lock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct cl_io_rw_common *rd = &io->u.ci_rd.rd; + int result; + + ENTRY; + result = vvp_io_rw_lock(env, io, CLM_READ, rd->crw_pos, + rd->crw_pos + rd->crw_count - 1); + RETURN(result); +} + +static int vvp_io_fault_lock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct vvp_io *vio = cl2vvp_io(env, ios); + /* + * XXX LDLM_FL_CBPENDING + */ + return vvp_io_one_lock_index(env, + io, 0, + vvp_mode_from_vma(vio->u.fault.ft_vma), + io->u.ci_fault.ft_index, + io->u.ci_fault.ft_index); +} + +static int vvp_io_write_lock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + loff_t start; + loff_t end; + + if (io->u.ci_wr.wr_append) { + start = 0; + end = OBD_OBJECT_EOF; + } else { + start = io->u.ci_wr.wr.crw_pos; + end = start + io->u.ci_wr.wr.crw_count - 1; + } + + RETURN(vvp_io_rw_lock(env, io, CLM_WRITE, start, end)); +} + +static int vvp_io_setattr_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) + +{ + return 0; +} + +/** + * Implementation of cl_io_operations::cio_lock() method for CIT_SETATTR io. + * + * Handles "lockless io" mode when extent locking is done by server. 
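+ *
+ * (Editor's summary of the cases handled below: truncate locks the
+ * range [0, OBD_OBJECT_EOF], adding CEF_DISCARD_DATA when truncating
+ * to size 0, or CEF_MUST for encrypted files; fallocate locks
+ * [sa_falloc_offset, sa_falloc_end - 1]; timestamp-only updates that
+ * do not set mtime/atime behind ctime take no lock at all.)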
+ */ +static int vvp_io_setattr_lock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + __u64 lock_start = 0; + __u64 lock_end = OBD_OBJECT_EOF; + __u32 enqflags = 0; + + if (cl_io_is_trunc(io)) { + struct inode *inode = vvp_object_inode(io->ci_obj); + + /* set enqueue flags to CEF_MUST in case of encrypted file, + * to prevent lockless truncate + */ + if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) + enqflags = CEF_MUST; + else if (io->u.ci_setattr.sa_attr.lvb_size == 0) + enqflags = CEF_DISCARD_DATA; + } else if (cl_io_is_fallocate(io)) { + lock_start = io->u.ci_setattr.sa_falloc_offset; + lock_end = io->u.ci_setattr.sa_falloc_end - 1; + } else { + unsigned int valid = io->u.ci_setattr.sa_avalid; + + if (!(valid & TIMES_SET_FLAGS)) + return 0; + + if ((!(valid & ATTR_MTIME) || + io->u.ci_setattr.sa_attr.lvb_mtime >= + io->u.ci_setattr.sa_attr.lvb_ctime) && + (!(valid & ATTR_ATIME) || + io->u.ci_setattr.sa_attr.lvb_atime >= + io->u.ci_setattr.sa_attr.lvb_ctime)) + return 0; + } + + return vvp_io_one_lock(env, io, enqflags, CLM_WRITE, + lock_start, lock_end); +} + +static int vvp_do_vmtruncate(struct inode *inode, size_t size) +{ + int result; + + /* + * Only ll_inode_size_lock is taken at this level. + */ + ll_inode_size_lock(inode); + result = inode_newsize_ok(inode, size); + if (result < 0) { + ll_inode_size_unlock(inode); + return result; + } + i_size_write(inode, size); + + ll_truncate_pagecache(inode, size); + ll_inode_size_unlock(inode); + return result; +} + +static int vvp_io_setattr_time(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct cl_attr *attr = vvp_env_thread_attr(env); + int result; + unsigned valid = CAT_CTIME; + + cl_object_attr_lock(obj); + attr->cat_ctime = io->u.ci_setattr.sa_attr.lvb_ctime; + if (io->u.ci_setattr.sa_avalid & ATTR_ATIME_SET) { + attr->cat_atime = io->u.ci_setattr.sa_attr.lvb_atime; + valid |= CAT_ATIME; + } + if (io->u.ci_setattr.sa_avalid & ATTR_MTIME_SET) { + attr->cat_mtime = io->u.ci_setattr.sa_attr.lvb_mtime; + valid |= CAT_MTIME; + } + result = cl_object_attr_update(env, obj, attr, valid); + cl_object_attr_unlock(obj); + + return result; +} + +static int vvp_io_setattr_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct inode *inode = vvp_object_inode(io->ci_obj); + struct ll_inode_info *lli = ll_i2info(inode); + int mode = io->u.ci_setattr.sa_falloc_mode; + + if (cl_io_is_trunc(io)) { + trunc_sem_down_write(&lli->lli_trunc_sem); + mutex_lock(&lli->lli_setattr_mutex); + inode_dio_wait(inode); + } else if (cl_io_is_fallocate(io)) { + loff_t size; + + trunc_sem_down_write(&lli->lli_trunc_sem); + mutex_lock(&lli->lli_setattr_mutex); + inode_dio_wait(inode); + + ll_merge_attr(env, inode); + size = i_size_read(inode); + if (io->u.ci_setattr.sa_falloc_end > size && + !(mode & FALLOC_FL_KEEP_SIZE)) { + size = io->u.ci_setattr.sa_falloc_end; + io->u.ci_setattr.sa_avalid |= ATTR_SIZE; + } + io->u.ci_setattr.sa_attr.lvb_size = size; + } else { + mutex_lock(&lli->lli_setattr_mutex); + } + + if (io->u.ci_setattr.sa_avalid & TIMES_SET_FLAGS) + return vvp_io_setattr_time(env, ios); + + return 0; +} + +static void vvp_io_setattr_end(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct inode *inode = vvp_object_inode(io->ci_obj); + struct ll_inode_info *lli = ll_i2info(inode); + + if (cl_io_is_trunc(io)) { + 
/* Truncate in memory pages - they must be clean pages
+		 * because osc has already notified to destroy osc_extents. */
+		vvp_do_vmtruncate(inode, io->u.ci_setattr.sa_attr.lvb_size);
+		mutex_unlock(&lli->lli_setattr_mutex);
+		trunc_sem_up_write(&lli->lli_trunc_sem);
+	} else if (cl_io_is_fallocate(io)) {
+		mutex_unlock(&lli->lli_setattr_mutex);
+		trunc_sem_up_write(&lli->lli_trunc_sem);
+	} else {
+		mutex_unlock(&lli->lli_setattr_mutex);
+	}
+}
+
+static void vvp_io_setattr_fini(const struct lu_env *env,
+				const struct cl_io_slice *ios)
+{
+	bool restore_needed = ios->cis_io->ci_restore_needed;
+	struct inode *inode = vvp_object_inode(ios->cis_obj);
+
+	vvp_io_fini(env, ios);
+
+	if (restore_needed && !ios->cis_io->ci_restore_needed) {
+		/* restore finished, set data modified flag for HSM */
+		set_bit(LLIF_DATA_MODIFIED, &ll_i2info(inode)->lli_flags);
+	}
+}
+
+static int vvp_io_read_start(const struct lu_env *env,
+			     const struct cl_io_slice *ios)
+{
+	struct vvp_io *vio = cl2vvp_io(env, ios);
+	struct cl_io *io = ios->cis_io;
+	struct cl_object *obj = io->ci_obj;
+	struct inode *inode = vvp_object_inode(obj);
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct file *file = vio->vui_fd->fd_file;
+	loff_t pos = io->u.ci_rd.rd.crw_pos;
+	size_t cnt = io->u.ci_rd.rd.crw_count;
+	size_t tot = vio->vui_tot_count;
+	struct ll_cl_context *lcc;
+	unsigned int seq;
+	int exceed = 0;
+	int result;
+	int total_bytes_read = 0;
+	struct iov_iter iter;
+	pgoff_t page_offset;
+
+	ENTRY;
+
+	CLOBINVRNT(env, obj, vvp_object_invariant(obj));
+
+	CDEBUG(D_VFSTRACE, "%s: read [%llu, %llu)\n",
+	       file_dentry(file)->d_name.name,
+	       pos, pos + cnt);
+
+	trunc_sem_down_read(&lli->lli_trunc_sem);
+
+	if (io->ci_async_readahead) {
+		file_accessed(file);
+		RETURN(0);
+	}
+
+	if (!can_populate_pages(env, io, inode))
+		RETURN(0);
+
+	if (!(file->f_flags & O_DIRECT)) {
+		result = cl_io_lru_reserve(env, io, pos, cnt);
+		if (result)
+			RETURN(result);
+	}
+
+	/* Unless this is a read of a sparse file, the lock has already
+	 * been acquired, so vvp_prep_size() is an empty op.
*/ + result = vvp_prep_size(env, obj, io, pos, cnt, &exceed); + if (result != 0) + RETURN(result); + else if (exceed != 0) + GOTO(out, result); + + LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu, + "Read ino %lu, %zu bytes, offset %lld, size %llu\n", + inode->i_ino, cnt, pos, i_size_read(inode)); + + /* initialize read-ahead window once per syscall */ + if (!vio->vui_ra_valid) { + vio->vui_ra_valid = true; + vio->vui_ra_start_idx = cl_index(obj, pos); + vio->vui_ra_pages = 0; + page_offset = pos & ~PAGE_MASK; + if (page_offset) { + vio->vui_ra_pages++; + if (tot > PAGE_SIZE - page_offset) + tot -= (PAGE_SIZE - page_offset); + else + tot = 0; + } + vio->vui_ra_pages += (tot + PAGE_SIZE - 1) >> PAGE_SHIFT; + + CDEBUG(D_READA, "tot %zu, ra_start %lu, ra_count %lu\n", + vio->vui_tot_count, vio->vui_ra_start_idx, + vio->vui_ra_pages); + } + + /* BUG: 5972 */ + file_accessed(file); + LASSERT(vio->vui_iocb->ki_pos == pos); + iter = *vio->vui_iter; + + lcc = ll_cl_find(inode); + lcc->lcc_end_index = DIV_ROUND_UP(pos + iter.count, PAGE_SIZE); + CDEBUG(D_VFSTRACE, "count:%ld iocb pos:%lld\n", iter.count, pos); + + /* this seqlock lets us notice if a page has been deleted on this inode + * during the fault process, allowing us to catch an erroneous short + * read or EIO + * See LU-16160 + */ + do { + seq = read_seqbegin(&ll_i2info(inode)->lli_page_inv_lock); + result = generic_file_read_iter(vio->vui_iocb, &iter); + if (result >= 0) { + io->ci_nob += result; + total_bytes_read += result; + } + /* if we got a short read or -EIO and we raced with page invalidation, + * retry + */ + } while (read_seqretry(&ll_i2info(inode)->lli_page_inv_lock, seq) && + ((result >= 0 && iov_iter_count(&iter) > 0) + || result == -EIO)); + +out: + if (result >= 0) { + if (total_bytes_read < cnt) + io->ci_continue = 0; + result = 0; + } else if (result == -EIOCBQUEUED) { + io->ci_nob += vio->u.readwrite.vui_read; + vio->vui_iocb->ki_pos = pos + vio->u.readwrite.vui_read; + } + + return result; +} + +static int vvp_io_commit_sync(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *plist, int from, int to) +{ + struct cl_2queue *queue = &io->ci_queue; + struct cl_page *page; + unsigned int bytes = 0; + int rc = 0; + ENTRY; + + if (plist->pl_nr == 0) + RETURN(0); + + if (from > 0 || to != PAGE_SIZE) { + page = cl_page_list_first(plist); + if (plist->pl_nr == 1) { + cl_page_clip(env, page, from, to); + } else { + if (from > 0) + cl_page_clip(env, page, from, PAGE_SIZE); + if (to != PAGE_SIZE) { + page = cl_page_list_last(plist); + cl_page_clip(env, page, 0, to); + } + } + } + + cl_2queue_init(queue); + cl_page_list_splice(plist, &queue->c2_qin); + rc = cl_io_submit_sync(env, io, CRT_WRITE, queue, 0); + + /* plist is not sorted any more */ + cl_page_list_splice(&queue->c2_qin, plist); + cl_page_list_splice(&queue->c2_qout, plist); + cl_2queue_fini(env, queue); + + if (rc == 0) { + /* calculate bytes */ + bytes = plist->pl_nr << PAGE_SHIFT; + bytes -= from + PAGE_SIZE - to; + + while (plist->pl_nr > 0) { + page = cl_page_list_first(plist); + cl_page_list_del(env, plist, page); + + cl_page_clip(env, page, 0, PAGE_SIZE); + + SetPageUptodate(cl_page_vmpage(page)); + cl_page_disown(env, io, page); + + /* held in ll_cl_init() */ + lu_ref_del(&page->cp_reference, "cl_io", io); + cl_page_put(env, page); + } + } + + RETURN(bytes > 0 ? 
bytes : rc);
+}
+
+/*
+ * From kernel v4.19-rc5-248-g9b89a0355144 use XArray
+ * Prior kernels use radix_tree for tags
+ */
+static inline void ll_page_tag_dirty(struct page *page,
+				     struct address_space *mapping)
+{
+#ifndef HAVE_RADIX_TREE_TAG_SET
+	__xa_set_mark(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY);
+#else
+	radix_tree_tag_set(&mapping->page_tree, page_index(page),
+			   PAGECACHE_TAG_DIRTY);
+#endif
+}
+
+/*
+ * Kernels 4.2 - 4.5 pass memcg argument to account_page_dirtied()
+ * Kernel v5.2-5678-gac1c3e4 no longer exports account_page_dirtied
+ */
+static inline void ll_account_page_dirtied(struct page *page,
+					   struct address_space *mapping)
+{
+#ifdef HAVE_ACCOUNT_PAGE_DIRTIED_3ARGS
+	struct mem_cgroup *memcg = mem_cgroup_begin_page_stat(page);
+
+	account_page_dirtied(page, mapping, memcg);
+	mem_cgroup_end_page_stat(memcg);
+#elif defined(HAVE_ACCOUNT_PAGE_DIRTIED_EXPORT)
+	account_page_dirtied(page, mapping);
+#else
+	vvp_account_page_dirtied(page, mapping);
+#endif
+	ll_page_tag_dirty(page, mapping);
+}
+
+/* Taken from kernel set_page_dirty, __set_page_dirty_nobuffers
+ * Last change to this area: b93b016313b3ba8003c3b8bb71f569af91f19fc7
+ *
+ * Current with Linus tip of tree (7/13/2019):
+ * v5.2-rc4-224-ge01e060fe0
+ *
+ * Backwards compat for 3.x, 5.x kernels relating to memcg handling
+ * & rename of radix tree to xarray.
+ */
+void vvp_set_pagevec_dirty(struct pagevec *pvec)
+{
+	struct page *page = pvec->pages[0];
+	int count = pagevec_count(pvec);
+	int i;
+#ifdef HAVE_KALLSYMS_LOOKUP_NAME
+	struct address_space *mapping = page->mapping;
+	unsigned long flags;
+	unsigned long skip_pages = 0;
+	int dirtied = 0;
+#endif
+
+	ENTRY;
+
+	BUILD_BUG_ON(PAGEVEC_SIZE > BITS_PER_LONG);
+	LASSERTF(page->mapping,
+		 "mapping must be set. page %p, page->private (cl_page) %p\n",
+		 page, (void *) page->private);
+
+	/*
+	 * kernels without HAVE_KALLSYMS_LOOKUP_NAME also don't have
+	 * account_page_dirtied exported, and if we can't access that symbol,
+	 * we can't do page dirtying in batch (taking the xarray lock only
+	 * once), so we just fall back to a looped call to
+	 * __set_page_dirty_nobuffers
+	 */
+#ifndef HAVE_ACCOUNT_PAGE_DIRTIED_EXPORT
+	if (!vvp_account_page_dirtied) {
+		for (i = 0; i < count; i++)
+			__set_page_dirty_nobuffers(pvec->pages[i]);
+		EXIT;
+	}
+#endif
+
+#ifdef HAVE_KALLSYMS_LOOKUP_NAME
+	for (i = 0; i < count; i++) {
+		page = pvec->pages[i];
+
+		ClearPageReclaim(page);
+
+		vvp_lock_page_memcg(page);
+		if (TestSetPageDirty(page)) {
+			/* page is already dirty .. no extra work needed
+			 * set a flag for the i'th page to be skipped
+			 */
+			vvp_unlock_page_memcg(page);
+			skip_pages |= (1 << i);
+		}
+	}
+
+	ll_xa_lock_irqsave(&mapping->i_pages, flags);
+
+	/* Notes on differences with __set_page_dirty_nobuffers:
+	 * 1. We don't need to call page_mapping because we know this is a
+	 * page cache page.
+	 * 2. We have the pages locked, so there is no need for the careful
+	 * mapping/mapping2 dance.
+	 * 3. No mapping is impossible. (Race w/truncate mentioned in
+	 * dirty_nobuffers should be impossible because we hold the page lock.)
+	 * 4. All mappings are the same because i/o is only to one file.
+	 */
+	for (i = 0; i < count; i++) {
+		page = pvec->pages[i];
+		/* if the i'th page was flagged as already dirty above,
+		 * skip it here */
+		if ((skip_pages >> i) & 1)
+			continue;
+
+		LASSERTF(page->mapping == mapping,
+			 "all pages must have the same mapping. 
page %p, mapping %p, first mapping %p\n", + page, page->mapping, mapping); + WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); + ll_account_page_dirtied(page, mapping); + dirtied++; + vvp_unlock_page_memcg(page); + } + ll_xa_unlock_irqrestore(&mapping->i_pages, flags); + + CDEBUG(D_VFSTRACE, "mapping %p, count %d, dirtied %d\n", mapping, + count, dirtied); + + if (mapping->host && dirtied) { + /* !PageAnon && !swapper_space */ + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + } +#endif + EXIT; +} + +static void write_commit_callback(const struct lu_env *env, struct cl_io *io, + struct pagevec *pvec) +{ + int count = 0; + int i = 0; + + ENTRY; + + count = pagevec_count(pvec); + LASSERT(count > 0); + + for (i = 0; i < count; i++) { + struct page *vmpage = pvec->pages[i]; + SetPageUptodate(vmpage); + } + + vvp_set_pagevec_dirty(pvec); + + for (i = 0; i < count; i++) { + struct page *vmpage = pvec->pages[i]; + struct cl_page *page = (struct cl_page *) vmpage->private; + cl_page_disown(env, io, page); + lu_ref_del(&page->cp_reference, "cl_io", cl_io_top(io)); + cl_page_put(env, page); + } + + EXIT; +} + +/* make sure the page list is contiguous */ +static bool page_list_sanity_check(struct cl_object *obj, + struct cl_page_list *plist) +{ + struct cl_page *page; + pgoff_t index = CL_PAGE_EOF; + + cl_page_list_for_each(page, plist) { + struct vvp_page *vpg = cl_object_page_slice(obj, page); + + if (index == CL_PAGE_EOF) { + index = vvp_index(vpg); + continue; + } + + ++index; + if (index == vvp_index(vpg)) + continue; + + return false; + } + return true; +} + +/* Return how many bytes have queued or written */ +int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io) +{ + struct cl_object *obj = io->ci_obj; + struct inode *inode = vvp_object_inode(obj); + struct vvp_io *vio = vvp_env_io(env); + struct cl_page_list *queue = &vio->u.readwrite.vui_queue; + struct cl_page *page; + int rc = 0; + int bytes = 0; + unsigned int npages = vio->u.readwrite.vui_queue.pl_nr; + ENTRY; + + if (npages == 0) + RETURN(0); + + CDEBUG(D_VFSTRACE, "commit async pages: %d, from %d, to %d\n", + npages, vio->u.readwrite.vui_from, vio->u.readwrite.vui_to); + + LASSERT(page_list_sanity_check(obj, queue)); + + /* submit IO with async write */ + rc = cl_io_commit_async(env, io, queue, + vio->u.readwrite.vui_from, + vio->u.readwrite.vui_to, + write_commit_callback); + npages -= queue->pl_nr; /* already committed pages */ + if (npages > 0) { + /* calculate how many bytes were written */ + bytes = npages << PAGE_SHIFT; + + /* first page */ + bytes -= vio->u.readwrite.vui_from; + if (queue->pl_nr == 0) /* last page */ + bytes -= PAGE_SIZE - vio->u.readwrite.vui_to; + LASSERTF(bytes > 0, "bytes = %d, pages = %d\n", bytes, npages); + + vio->u.readwrite.vui_written += bytes; + + CDEBUG(D_VFSTRACE, "Committed %d pages %d bytes, tot: %ld\n", + npages, bytes, vio->u.readwrite.vui_written); + + /* the first page must have been written. 
*/
+		vio->u.readwrite.vui_from = 0;
+	}
+	LASSERT(page_list_sanity_check(obj, queue));
+	LASSERT(ergo(rc == 0, queue->pl_nr == 0));
+
+	/* out of quota, try sync write */
+	if (rc == -EDQUOT && !cl_io_is_mkwrite(io)) {
+		struct ll_inode_info *lli = ll_i2info(inode);
+
+		rc = vvp_io_commit_sync(env, io, queue,
+					vio->u.readwrite.vui_from,
+					vio->u.readwrite.vui_to);
+		if (rc > 0) {
+			vio->u.readwrite.vui_written += rc;
+			rc = 0;
+		}
+		if (lli->lli_clob != NULL)
+			lov_read_and_clear_async_rc(lli->lli_clob);
+		lli->lli_async_rc = 0;
+	}
+
+	/* update inode size */
+	ll_merge_attr(env, inode);
+
+	/* The pages remaining in the queue failed to commit; discard them
+	 * unless they were dirtied before. */
+	while (queue->pl_nr > 0) {
+		page = cl_page_list_first(queue);
+		cl_page_list_del(env, queue, page);
+
+		if (!PageDirty(cl_page_vmpage(page)))
+			cl_page_discard(env, io, page);
+
+		cl_page_disown(env, io, page);
+
+		/* held in ll_cl_init() */
+		lu_ref_del(&page->cp_reference, "cl_io", io);
+		cl_page_put(env, page);
+	}
+	cl_page_list_fini(env, queue);
+
+	RETURN(rc);
+}
+
+static int vvp_io_write_start(const struct lu_env *env,
+			      const struct cl_io_slice *ios)
+{
+	struct vvp_io *vio = cl2vvp_io(env, ios);
+	struct cl_io *io = ios->cis_io;
+	struct cl_object *obj = io->ci_obj;
+	struct inode *inode = vvp_object_inode(obj);
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct file *file = vio->vui_fd->fd_file;
+	ssize_t result = 0;
+	loff_t pos = io->u.ci_wr.wr.crw_pos;
+	size_t cnt = io->u.ci_wr.wr.crw_count;
+	bool lock_inode = !IS_NOSEC(inode);
+	size_t nob = io->ci_nob;
+	struct iov_iter iter;
+	size_t written = 0;
+
+	ENTRY;
+
+	trunc_sem_down_read(&lli->lli_trunc_sem);
+
+	if (!can_populate_pages(env, io, inode))
+		RETURN(0);
+
+	if (cl_io_is_append(io)) {
+		/*
+		 * PARALLEL IO This has to be changed for parallel IO doing
+		 * out-of-order writes.
+		 */
+		ll_merge_attr(env, inode);
+		pos = io->u.ci_wr.wr.crw_pos = i_size_read(inode);
+		vio->vui_iocb->ki_pos = pos;
+	} else {
+		LASSERTF(vio->vui_iocb->ki_pos == pos,
+			 "ki_pos %lld [%lld, %lld)\n",
+			 vio->vui_iocb->ki_pos,
+			 pos, pos + cnt);
+	}
+
+	CDEBUG(D_VFSTRACE, "%s: write [%llu, %llu)\n",
+	       file_dentry(file)->d_name.name,
+	       pos, pos + cnt);
+
+	/* The maximum Lustre file size is variable, based on the OST maximum
+	 * object size and number of stripes. This needs another check in
+	 * addition to the VFS checks earlier. */
+	if (pos + cnt > ll_file_maxbytes(inode)) {
+		CDEBUG(D_INODE,
+		       "%s: file %s ("DFID") offset %llu > maxbytes %llu\n",
+		       ll_i2sbi(inode)->ll_fsname,
+		       file_dentry(file)->d_name.name,
+		       PFID(ll_inode2fid(inode)), pos + cnt,
+		       ll_file_maxbytes(inode));
+		RETURN(-EFBIG);
+	}
+
+	/* Tests to verify we take the i_mutex correctly */
+	if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_IMUTEX_SEC) && !lock_inode)
+		RETURN(-EINVAL);
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_IMUTEX_NOSEC) && lock_inode)
+		RETURN(-EINVAL);
+
+	if (!(file->f_flags & O_DIRECT)) {
+		result = cl_io_lru_reserve(env, io, pos, cnt);
+		if (result)
+			RETURN(result);
+	}
+
+	if (vio->vui_iter == NULL) {
+		/* from a temp io in ll_cl_init(). */
+		result = 0;
+	} else {
+		/*
+		 * When using the locked AIO function (generic_file_aio_write())
+		 * testing has shown the inode mutex to be a limiting factor
+		 * with multi-threaded single shared file performance. To get
+		 * around this, we now use the lockless version. To maintain
+		 * consistency, proper locking to protect against writes,
+		 * truncates, etc. is handled in the higher layers of Lustre.
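+		 *
+		 * (Editor's sketch of the resulting pattern below:
+		 *
+		 *	if (unlikely(!IS_NOSEC(inode)))
+		 *		inode_lock(inode);  // only to strip SUID/SGID
+		 *	result = __generic_file_write_iter(iocb, &iter);
+		 *	if (unlikely(!IS_NOSEC(inode)))
+		 *		inode_unlock(inode);
+		 *
+		 * i.e. the inode mutex is taken only when security attributes
+		 * may need to be cleared on write.)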
+ */ + lock_inode = !IS_NOSEC(inode); + iter = *vio->vui_iter; + + if (unlikely(lock_inode)) + inode_lock(inode); + result = __generic_file_write_iter(vio->vui_iocb, &iter); + if (unlikely(lock_inode)) + inode_unlock(inode); + + written = result; + if (result > 0) +#ifdef HAVE_GENERIC_WRITE_SYNC_2ARGS + result = generic_write_sync(vio->vui_iocb, result); +#else + { + ssize_t err; + + err = generic_write_sync(vio->vui_iocb->ki_filp, pos, + result); + if (err < 0 && result > 0) + result = err; + } +#endif + } + + if (result > 0) { + result = vvp_io_write_commit(env, io); + /* Simulate short commit */ + if (CFS_FAULT_CHECK(OBD_FAIL_LLITE_SHORT_COMMIT)) { + vio->u.readwrite.vui_written >>= 1; + if (vio->u.readwrite.vui_written > 0) + io->ci_need_restart = 1; + } + if (vio->u.readwrite.vui_written > 0) { + result = vio->u.readwrite.vui_written; + CDEBUG(D_VFSTRACE, "%s: write nob %zd, result: %zd\n", + file_dentry(file)->d_name.name, + io->ci_nob, result); + io->ci_nob += result; + } else { + io->ci_continue = 0; + } + } + if (vio->vui_iocb->ki_pos != (pos + io->ci_nob - nob)) { + CDEBUG(D_VFSTRACE, + "%s: write position mismatch: ki_pos %lld vs. pos %lld, written %zd, commit %zd: rc = %zd\n", + file_dentry(file)->d_name.name, + vio->vui_iocb->ki_pos, pos + io->ci_nob - nob, + written, io->ci_nob - nob, result); + /* + * Rewind ki_pos and vui_iter to where it has + * successfully committed. + */ + vio->vui_iocb->ki_pos = pos + io->ci_nob - nob; + } + if (result > 0 || result == -EIOCBQUEUED) { + set_bit(LLIF_DATA_MODIFIED, &ll_i2info(inode)->lli_flags); + + if (result != -EIOCBQUEUED && result < cnt) + io->ci_continue = 0; + if (result > 0) + result = 0; + /* move forward */ + if (result == -EIOCBQUEUED) { + io->ci_nob += vio->u.readwrite.vui_written; + vio->vui_iocb->ki_pos = pos + + vio->u.readwrite.vui_written; + } + } + + RETURN(result); +} + +static void vvp_io_rw_end(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct inode *inode = vvp_object_inode(ios->cis_obj); + struct ll_inode_info *lli = ll_i2info(inode); + + trunc_sem_up_read(&lli->lli_trunc_sem); +} + +static int vvp_io_kernel_fault(struct vvp_fault_io *cfio) +{ + struct vm_fault *vmf = cfio->ft_vmf; + + cfio->ft_flags = ll_filemap_fault(cfio->ft_vma, vmf); + cfio->ft_flags_valid = 1; + + if (vmf->page) { + LL_CDEBUG_PAGE(D_PAGE, vmf->page, "got addr %p type NOPAGE\n", + get_vmf_address(vmf)); + if (unlikely(!(cfio->ft_flags & VM_FAULT_LOCKED))) { + lock_page(vmf->page); + cfio->ft_flags |= VM_FAULT_LOCKED; + } + + cfio->ft_vmpage = vmf->page; + + return 0; + } + + if (cfio->ft_flags & VM_FAULT_SIGBUS) { + CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", get_vmf_address(vmf)); + return -EFAULT; + } + + if (cfio->ft_flags & VM_FAULT_OOM) { + CDEBUG(D_PAGE, "got addr %p - OOM\n", get_vmf_address(vmf)); + return -ENOMEM; + } + + if (cfio->ft_flags & VM_FAULT_RETRY) + return -EAGAIN; + + CERROR("unknown error in page fault %d\n", cfio->ft_flags); + + return -EINVAL; +} + +static void mkwrite_commit_callback(const struct lu_env *env, struct cl_io *io, + struct pagevec *pvec) +{ + vvp_set_pagevec_dirty(pvec); +} + +static int vvp_io_fault_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct inode *inode = vvp_object_inode(obj); + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_fault_io *fio = &io->u.ci_fault; + struct vvp_fault_io *cfio = &vio->u.fault; + loff_t offset; + int 
result = 0;
+	struct page *vmpage = NULL;
+	struct cl_page *page;
+	loff_t size;
+	pgoff_t last_index;
+	ENTRY;
+
+	trunc_sem_down_read_nowait(&lli->lli_trunc_sem);
+
+	/* offset of the last byte on the page */
+	offset = cl_offset(obj, fio->ft_index + 1) - 1;
+	LASSERT(cl_index(obj, offset) == fio->ft_index);
+	result = vvp_prep_size(env, obj, io, 0, offset + 1, NULL);
+	if (result != 0)
+		RETURN(result);
+
+	/* must return locked page */
+	if (fio->ft_mkwrite) {
+		LASSERT(cfio->ft_vmpage != NULL);
+		lock_page(cfio->ft_vmpage);
+	} else {
+		result = vvp_io_kernel_fault(cfio);
+		if (result != 0)
+			RETURN(result);
+	}
+
+	vmpage = cfio->ft_vmpage;
+	LASSERT(PageLocked(vmpage));
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_FAULT_TRUNC_RACE))
+		generic_error_remove_page(vmpage->mapping, vmpage);
+
+	size = i_size_read(inode);
+	/* Though we already hold a cl_lock upon this page, it can still
+	 * be truncated locally. */
+	if (unlikely((vmpage->mapping != inode->i_mapping) ||
+		     (page_offset(vmpage) > size))) {
+		CDEBUG(D_PAGE, "llite: fault and truncate race happened!\n");
+
+		/* return +1 to stop cl_io_loop() and ll_fault() will catch
+		 * and retry. */
+		GOTO(out, result = +1);
+	}
+
+	last_index = cl_index(obj, size - 1);
+
+	if (fio->ft_mkwrite) {
+		/*
+		 * Capture the size while holding the lli_trunc_sem from
+		 * above; we want to make sure that we complete the mkwrite
+		 * action while holding this lock. We need to make sure that
+		 * we are not past the end of the file.
+		 */
+		if (last_index < fio->ft_index) {
+			CDEBUG(D_PAGE,
+			       "llite: mkwrite and truncate race happened: "
+			       "%p: 0x%lx 0x%lx\n",
+			       vmpage->mapping, fio->ft_index, last_index);
+			/*
+			 * We need to return if we are past the end of the
+			 * file. This will propagate up the call stack to
+			 * ll_page_mkwrite where we will return
+			 * VM_FAULT_NOPAGE. Any non-negative value returned
+			 * here will be silently converted to 0. If the
+			 * vmpage->mapping is null the error code would be
+			 * converted back to ENODATA in ll_page_mkwrite0.
+			 * Thus we return -ENODATA to handle both cases.
+			 */
+			GOTO(out, result = -ENODATA);
+		}
+	}
+
+	page = cl_page_find(env, obj, fio->ft_index, vmpage, CPT_CACHEABLE);
+	if (IS_ERR(page))
+		GOTO(out, result = PTR_ERR(page));
+
+	/* if the page is going to be written, we should add it into the
+	 * cache earlier. */
+	if (fio->ft_mkwrite) {
+		wait_on_page_writeback(vmpage);
+		if (!PageDirty(vmpage)) {
+			struct cl_page_list *plist = &vio->u.fault.ft_queue;
+			struct vvp_page *vpg = cl_object_page_slice(obj, page);
+			int to = PAGE_SIZE;
+
+			/* vvp_page_assume() calls wait_on_page_writeback(). */
+			cl_page_assume(env, io, page);
+
+			cl_page_list_init(plist);
+			cl_page_list_add(plist, page, true);
+
+			/* size fixup */
+			if (last_index == vvp_index(vpg))
+				to = ((size - 1) & ~PAGE_MASK) + 1;
+
+			/* Do not set the Dirty bit here, so that in case IO
+			 * is started before the page is really made dirty,
+			 * we still have a chance to detect it.
*/ + result = cl_io_commit_async(env, io, plist, 0, to, + mkwrite_commit_callback); + /* Have overquota flag, trying sync write to check + * whether indeed out of quota */ + if (result == -EDQUOT) { + cl_page_get(page); + result = vvp_io_commit_sync(env, io, + plist, 0, to); + if (result >= 0) { + io->ci_noquota = 1; + cl_page_own(env, io, page); + cl_page_list_add(plist, page, true); + lu_ref_add(&page->cp_reference, + "cl_io", io); + result = cl_io_commit_async(env, io, + plist, 0, to, + mkwrite_commit_callback); + io->ci_noquota = 0; + } else { + cl_page_put(env, page); + } + } + + LASSERT(cl_page_is_owned(page, io)); + cl_page_list_fini(env, plist); + + vmpage = NULL; + if (result < 0) { + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + + cl_page_put(env, page); + + /* we're in big trouble, what can we do now? */ + if (result == -EDQUOT) + result = -ENOSPC; + GOTO(out, result); + } else { + cl_page_disown(env, io, page); + } + } + } + + /* + * The ft_index is only used in the case of + * a mkwrite action. We need to check + * our assertions are correct, since + * we should have caught this above + */ + LASSERT(!fio->ft_mkwrite || fio->ft_index <= last_index); + if (fio->ft_index == last_index) + /* + * Last page is mapped partially. + */ + fio->ft_nob = size - cl_offset(obj, fio->ft_index); + else + fio->ft_nob = cl_page_size(obj); + + lu_ref_add(&page->cp_reference, "fault", io); + fio->ft_page = page; + EXIT; + +out: + /* return unlocked vmpage to avoid deadlocking */ + if (vmpage != NULL) + unlock_page(vmpage); + + cfio->ft_flags &= ~VM_FAULT_LOCKED; + + return result; +} + +static void vvp_io_fault_end(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct inode *inode = vvp_object_inode(ios->cis_obj); + struct ll_inode_info *lli = ll_i2info(inode); + + CLOBINVRNT(env, ios->cis_io->ci_obj, + vvp_object_invariant(ios->cis_io->ci_obj)); + trunc_sem_up_read(&lli->lli_trunc_sem); +} + +static int vvp_io_fsync_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + /* we should mark TOWRITE bit to each dirty page in radix tree to + * verify pages have been written, but this is difficult because of + * race. */ + return 0; +} + +static int vvp_io_read_ahead(const struct lu_env *env, + const struct cl_io_slice *ios, + pgoff_t start, struct cl_read_ahead *ra) +{ + int result = 0; + ENTRY; + + if (ios->cis_io->ci_type == CIT_READ || + ios->cis_io->ci_type == CIT_FAULT) { + struct vvp_io *vio = cl2vvp_io(env, ios); + + if (unlikely(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) { + ra->cra_end_idx = CL_PAGE_EOF; + result = +1; /* no need to call down */ + } + } + + RETURN(result); +} + +static int vvp_io_lseek_lock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + __u64 lock_start = io->u.ci_lseek.ls_start; + __u64 lock_end = OBD_OBJECT_EOF; + __u32 enqflags = CEF_MUST; /* always take client lock */ + + return vvp_io_one_lock(env, io, enqflags, CLM_READ, + lock_start, lock_end); +} + +static int vvp_io_lseek_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct inode *inode = vvp_object_inode(io->ci_obj); + __u64 start = io->u.ci_lseek.ls_start; + + inode_lock(inode); + inode_dio_wait(inode); + + /* At the moment we have DLM lock so just update inode + * to know the file size. 
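+ *
+ * (Editor's illustration: with i_size == 4096, lseek(fd, 4096,
+ * SEEK_DATA) must fail with ENXIO, which is exactly the
+ * start >= i_size_read(inode) case rejected below.)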
+ */ + ll_merge_attr(env, inode); + if (start >= i_size_read(inode)) { + io->u.ci_lseek.ls_result = -ENXIO; + return -ENXIO; + } + return 0; +} + +static void vvp_io_lseek_end(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct inode *inode = vvp_object_inode(io->ci_obj); + + if (io->u.ci_lseek.ls_result > i_size_read(inode)) + io->u.ci_lseek.ls_result = -ENXIO; + + inode_unlock(inode); +} + +static const struct cl_io_operations vvp_io_ops = { + .op = { + [CIT_READ] = { + .cio_fini = vvp_io_fini, + .cio_iter_init = vvp_io_read_iter_init, + .cio_lock = vvp_io_read_lock, + .cio_start = vvp_io_read_start, + .cio_end = vvp_io_rw_end, + .cio_advance = vvp_io_advance, + }, + [CIT_WRITE] = { + .cio_fini = vvp_io_fini, + .cio_iter_init = vvp_io_write_iter_init, + .cio_iter_fini = vvp_io_write_iter_fini, + .cio_lock = vvp_io_write_lock, + .cio_start = vvp_io_write_start, + .cio_end = vvp_io_rw_end, + .cio_advance = vvp_io_advance, + }, + [CIT_SETATTR] = { + .cio_fini = vvp_io_setattr_fini, + .cio_iter_init = vvp_io_setattr_iter_init, + .cio_lock = vvp_io_setattr_lock, + .cio_start = vvp_io_setattr_start, + .cio_end = vvp_io_setattr_end + }, + [CIT_FAULT] = { + .cio_fini = vvp_io_fault_fini, + .cio_iter_init = vvp_io_fault_iter_init, + .cio_lock = vvp_io_fault_lock, + .cio_start = vvp_io_fault_start, + .cio_end = vvp_io_fault_end, + }, + [CIT_FSYNC] = { + .cio_start = vvp_io_fsync_start, + .cio_fini = vvp_io_fini + }, + [CIT_GLIMPSE] = { + .cio_fini = vvp_io_fini + }, + [CIT_MISC] = { + .cio_fini = vvp_io_fini + }, + [CIT_LADVISE] = { + .cio_fini = vvp_io_fini + }, + [CIT_LSEEK] = { + .cio_fini = vvp_io_fini, + .cio_lock = vvp_io_lseek_lock, + .cio_start = vvp_io_lseek_start, + .cio_end = vvp_io_lseek_end, + }, + }, + .cio_read_ahead = vvp_io_read_ahead +}; + +int vvp_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct vvp_io *vio = vvp_env_io(env); + struct inode *inode = vvp_object_inode(obj); + int result; + + CLOBINVRNT(env, obj, vvp_object_invariant(obj)); + ENTRY; + + CDEBUG(D_VFSTRACE, DFID" ignore/verify layout %d/%d, layout version %d " + "restore needed %d\n", + PFID(lu_object_fid(&obj->co_lu)), + io->ci_ignore_layout, io->ci_verify_layout, + vio->vui_layout_gen, io->ci_restore_needed); + + CL_IO_SLICE_CLEAN(vio, vui_cl); + cl_io_slice_add(io, &vio->vui_cl, obj, &vvp_io_ops); + vio->vui_ra_valid = false; + result = 0; + if (io->ci_type == CIT_READ || io->ci_type == CIT_WRITE) { + size_t count; + struct ll_inode_info *lli = ll_i2info(inode); + + count = io->u.ci_rw.crw_count; + /* "If nbyte is 0, read() will return 0 and have no other + * results." -- Single Unix Spec */ + if (count == 0) + result = 1; + else + vio->vui_tot_count = count; + + /* for read/write, we store the jobid in the inode, and + * it'll be fetched by osc when building RPC. + * + * it's not accurate if the file is shared by different + * jobs. + */ + lustre_get_jobid(lli->lli_jobid, sizeof(lli->lli_jobid)); + } else if (io->ci_type == CIT_SETATTR) { + if (!cl_io_is_trunc(io)) + io->ci_lockreq = CILR_MANDATORY; + } + + /* Enqueue layout lock and get layout version. We need to do this + * even for operations requiring to open file, such as read and write, + * because it might not grant layout lock in IT_OPEN. 
*/
+	if (result == 0 && !io->ci_ignore_layout) {
+		result = ll_layout_refresh(inode, &vio->vui_layout_gen);
+		if (result == -ENOENT)
+			/* If the inode on MDS has been removed, but the
+			 * objects on OSTs haven't been destroyed (async
+			 * unlink), layout fetch will return -ENOENT; we
+			 * ignore this error and continue with the dirty
+			 * flush. LU-3230. */
+			result = 0;
+		if (result < 0)
+			CERROR("%s: refresh file layout " DFID " error %d.\n",
+			       ll_i2sbi(inode)->ll_fsname,
+			       PFID(lu_object_fid(&obj->co_lu)), result);
+	}
+
+#ifdef HAVE_INVALIDATE_LOCK
+	if (io->ci_invalidate_page_cache)
+		filemap_invalidate_lock(inode->i_mapping);
+#endif /* HAVE_INVALIDATE_LOCK */
+
+	io->ci_result = result < 0 ? result : 0;
+	RETURN(result);
+}
diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_object.c b/drivers/staging/lustrefsx/lustre/llite/vvp_object.c
new file mode 100644
index 0000000000000..2413da9498cd3
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/llite/vvp_object.c
@@ -0,0 +1,324 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * cl_object implementation for VVP layer.
+ *
+ * Author: Nikita Danilov
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include
+#include
+
+#include
+
+#include
+#include "llite_internal.h"
+#include "vvp_internal.h"
+
+/*****************************************************************************
+ *
+ * Object operations.
+ *
+ */
+
+int vvp_object_invariant(const struct cl_object *obj)
+{
+	struct inode *inode = vvp_object_inode(obj);
+	struct ll_inode_info *lli = ll_i2info(inode);
+
+	return (S_ISREG(inode->i_mode) || inode->i_mode == 0) &&
+	       lli->lli_clob == obj;
+}
+
+static int vvp_object_print(const struct lu_env *env, void *cookie,
+			    lu_printer_t p, const struct lu_object *o)
+{
+	struct vvp_object *obj = lu2vvp(o);
+	struct inode *inode = obj->vob_inode;
+	struct ll_inode_info *lli;
+
+	(*p)(env, cookie, "(%d) inode: %p ",
+	     atomic_read(&obj->vob_mmap_cnt),
+	     inode);
+	if (inode) {
+		lli = ll_i2info(inode);
+		(*p)(env, cookie, "%lu/%u %o %u %d %p "DFID,
+		     inode->i_ino, inode->i_generation, inode->i_mode,
+		     inode->i_nlink, atomic_read(&inode->i_count),
+		     lli->lli_clob, PFID(&lli->lli_fid));
+	}
+	return 0;
+}
+
+static int vvp_attr_get(const struct lu_env *env, struct cl_object *obj,
+			struct cl_attr *attr)
+{
+	struct inode *inode = vvp_object_inode(obj);
+
+	/*
+	 * lov overwrites most of these fields in
+	 * lov_attr_get()->...lov_merge_lvb_kms(), except when inode
+	 * attributes are newer.
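+	 *
+	 * (Editor's note: this slice therefore only seeds cl_attr from the
+	 * VFS inode, e.g. cat_size from i_size_read(); KMS is deliberately
+	 * left for the layers below to fill in.)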
+ */ + + attr->cat_size = i_size_read(inode); + attr->cat_mtime = inode->i_mtime.tv_sec; + attr->cat_atime = inode->i_atime.tv_sec; + attr->cat_ctime = inode->i_ctime.tv_sec; + attr->cat_blocks = inode->i_blocks; + attr->cat_uid = from_kuid(&init_user_ns, inode->i_uid); + attr->cat_gid = from_kgid(&init_user_ns, inode->i_gid); + attr->cat_projid = ll_i2info(inode)->lli_projid; + /* KMS is not known by this layer */ + return 0; /* layers below have to fill in the rest */ +} + +static int vvp_attr_update(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) +{ + struct inode *inode = vvp_object_inode(obj); + + if (valid & CAT_UID) + inode->i_uid = make_kuid(&init_user_ns, attr->cat_uid); + if (valid & CAT_GID) + inode->i_gid = make_kgid(&init_user_ns, attr->cat_gid); + if (valid & CAT_ATIME) + inode->i_atime.tv_sec = attr->cat_atime; + if (valid & CAT_MTIME) + inode->i_mtime.tv_sec = attr->cat_mtime; + if (valid & CAT_CTIME) + inode->i_ctime.tv_sec = attr->cat_ctime; + if (0 && valid & CAT_SIZE) + i_size_write(inode, attr->cat_size); + if (valid & CAT_PROJID) + ll_i2info(inode)->lli_projid = attr->cat_projid; + /* not currently necessary */ + if (0 && valid & (CAT_UID|CAT_GID|CAT_SIZE|CAT_PROJID)) + mark_inode_dirty(inode); + return 0; +} + +static int vvp_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf) +{ + struct ll_inode_info *lli = ll_i2info(conf->coc_inode); + + if (conf->coc_opc == OBJECT_CONF_INVALIDATE) { + CDEBUG(D_VFSTRACE, DFID ": losing layout lock\n", + PFID(&lli->lli_fid)); + + ll_layout_version_set(lli, CL_LAYOUT_GEN_NONE); + + /* Clean up page mmap for this inode. + * The reason for us to do this is that if the page has + * already been installed into memory space, the process + * can access it without interacting with lustre, so this + * page may be stale due to layout change, and the process + * will never be notified. + * This operation is expensive but mmap processes have to pay + * a price themselves. */ + unmap_mapping_range(conf->coc_inode->i_mapping, + 0, OBD_OBJECT_EOF, 0); + pcc_layout_invalidate(conf->coc_inode); + } + return 0; +} + +static int vvp_prune(const struct lu_env *env, struct cl_object *obj) +{ + struct inode *inode = vvp_object_inode(obj); + int rc; + ENTRY; + + rc = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, CL_FSYNC_LOCAL, 1); + if (rc < 0) { + CDEBUG(D_VFSTRACE, DFID ": writeback failed: %d\n", + PFID(lu_object_fid(&obj->co_lu)), rc); + RETURN(rc); + } + + ll_truncate_inode_pages_final(inode); + mapping_clear_exiting(inode->i_mapping); + + RETURN(0); +} + +static int vvp_object_glimpse(const struct lu_env *env, + const struct cl_object *obj, struct ost_lvb *lvb) +{ + struct inode *inode = vvp_object_inode(obj); + + ENTRY; + lvb->lvb_mtime = inode->i_mtime.tv_sec; + lvb->lvb_atime = inode->i_atime.tv_sec; + lvb->lvb_ctime = inode->i_ctime.tv_sec; + + /* + * LU-417: Add dirty pages block count lest i_blocks reports 0, some + * "cp" or "tar" on remote node may think it's a completely sparse file + * and skip it. 
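+ *
+ * (Editor's illustration, assuming GNU coreutils behaviour: cp's
+ * sparseness heuristic is roughly
+ *
+ *	if (st.st_blocks * 512 < st.st_size)
+ *		copy_as_sparse(...);
+ *
+ * so a file with cached dirty data but lvb_blocks == 0 could be copied
+ * as one big hole.)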
+ */ + if (lvb->lvb_size > 0 && lvb->lvb_blocks == 0) + lvb->lvb_blocks = dirty_cnt(inode); + + RETURN(0); +} + +static void vvp_req_attr_set(const struct lu_env *env, struct cl_object *obj, + struct cl_req_attr *attr) +{ + struct inode *inode; + struct obdo *oa; + u64 valid_flags = OBD_MD_FLTYPE | OBD_MD_FLUID | OBD_MD_FLGID; + + oa = attr->cra_oa; + inode = vvp_object_inode(obj); + + if (attr->cra_type == CRT_WRITE) { + valid_flags |= OBD_MD_FLMTIME | OBD_MD_FLCTIME; + obdo_set_o_projid(oa, ll_i2info(inode)->lli_projid); + } else if (attr->cra_type == CRT_READ) { + valid_flags |= OBD_MD_FLATIME; + } + obdo_from_inode(oa, inode, valid_flags & attr->cra_flags); + obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid); + if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_INVALID_PFID)) + oa->o_parent_oid++; + memcpy(attr->cra_jobid, ll_i2info(inode)->lli_jobid, + sizeof(attr->cra_jobid)); +} + +static const struct cl_object_operations vvp_ops = { + .coo_page_init = vvp_page_init, + .coo_io_init = vvp_io_init, + .coo_attr_get = vvp_attr_get, + .coo_attr_update = vvp_attr_update, + .coo_conf_set = vvp_conf_set, + .coo_prune = vvp_prune, + .coo_glimpse = vvp_object_glimpse, + .coo_req_attr_set = vvp_req_attr_set +}; + +static int vvp_object_init0(const struct lu_env *env, + struct vvp_object *vob, + const struct cl_object_conf *conf) +{ + vob->vob_inode = conf->coc_inode; + cl_object_page_init(&vob->vob_cl, sizeof(struct vvp_page)); + return 0; +} + +static int vvp_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct vvp_device *dev = lu2vvp_dev(obj->lo_dev); + struct vvp_object *vob = lu2vvp(obj); + struct lu_object *below; + struct lu_device *under; + int result; + + under = &dev->vdv_next->cd_lu_dev; + below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, under); + if (below != NULL) { + const struct cl_object_conf *cconf; + + cconf = lu2cl_conf(conf); + lu_object_add(obj, below); + result = vvp_object_init0(env, vob, cconf); + } else + result = -ENOMEM; + + return result; +} + +static void vvp_object_free_rcu(struct rcu_head *head) +{ + struct vvp_object *vob = container_of(head, struct vvp_object, + vob_header.coh_lu.loh_rcu); + + kmem_cache_free(vvp_object_kmem, vob); +} + +static void vvp_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct vvp_object *vob = lu2vvp(obj); + + lu_object_fini(obj); + lu_object_header_fini(obj->lo_header); + OBD_FREE_PRE(vob, sizeof(*vob), "slab-freed"); + call_rcu(&vob->vob_header.coh_lu.loh_rcu, vvp_object_free_rcu); +} + +static const struct lu_object_operations vvp_lu_obj_ops = { + .loo_object_init = vvp_object_init, + .loo_object_free = vvp_object_free, + .loo_object_print = vvp_object_print, +}; + +struct vvp_object *cl_inode2vvp(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_object *obj = lli->lli_clob; + struct lu_object *lu; + + LASSERT(obj != NULL); + lu = lu_object_locate(obj->co_lu.lo_header, &vvp_device_type); + LASSERT(lu != NULL); + + return lu2vvp(lu); +} + +struct lu_object *vvp_object_alloc(const struct lu_env *env, + const struct lu_object_header *unused, + struct lu_device *dev) +{ + struct vvp_object *vob; + struct lu_object *obj; + + OBD_SLAB_ALLOC_PTR_GFP(vob, vvp_object_kmem, GFP_NOFS); + if (vob != NULL) { + struct cl_object_header *hdr; + + obj = &vob->vob_cl.co_lu; + hdr = &vob->vob_header; + cl_object_header_init(hdr); + hdr->coh_page_bufsize = cfs_size_round(sizeof(struct cl_page)); + + lu_object_init(obj, &hdr->coh_lu, dev); + 
lu_object_add_top(&hdr->coh_lu, obj); + + vob->vob_cl.co_ops = &vvp_ops; + obj->lo_ops = &vvp_lu_obj_ops; + } else + obj = NULL; + return obj; +} diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_page.c b/drivers/staging/lustrefsx/lustre/llite/vvp_page.c new file mode 100644 index 0000000000000..5ee33e5c78b3e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_page.c @@ -0,0 +1,485 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Implementation of cl_page for VVP layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include +#include +#include + +#include +#include "llite_internal.h" +#include "vvp_internal.h" + +/***************************************************************************** + * + * Page operations. + * + */ +static void vvp_page_fini(const struct lu_env *env, + struct cl_page_slice *slice, + struct pagevec *pvec) +{ + struct vvp_page *vpg = cl2vvp_page(slice); + struct page *vmpage = vpg->vpg_page; + + /* + * vmpage->private was already cleared when page was moved into + * VPG_FREEING state. 
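+ *
+ * (Editor's note: when the caller supplies a pagevec the vmpage
+ * reference is dropped in batch,
+ *
+ *	if (!pagevec_add(pvec, vmpage))
+ *		pagevec_release(pvec);	// vec full: release all at once
+ *
+ * which amortizes the put_page() cost during bulk page teardown, as
+ * the code below does.)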
+ */ + LASSERT((struct cl_page *)vmpage->private != slice->cpl_page); + LASSERT(vmpage != NULL); + if (pvec) { + if (!pagevec_add(pvec, vmpage)) + pagevec_release(pvec); + } else { + put_page(vmpage); + } +} + +static int vvp_page_own(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io, + int nonblock) +{ + struct vvp_page *vpg = cl2vvp_page(slice); + struct page *vmpage = vpg->vpg_page; + + ENTRY; + + LASSERT(vmpage != NULL); + if (nonblock) { + if (!trylock_page(vmpage)) + return -EAGAIN; + + if (unlikely(PageWriteback(vmpage))) { + unlock_page(vmpage); + return -EAGAIN; + } + + return 0; + } + + lock_page(vmpage); + wait_on_page_writeback(vmpage); + + RETURN(0); +} + +static void vvp_page_assume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct page *vmpage = cl2vm_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + wait_on_page_writeback(vmpage); +} + +static void vvp_page_unassume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct page *vmpage = cl2vm_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); +} + +static void vvp_page_disown(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io) +{ + struct page *vmpage = cl2vm_page(slice); + + ENTRY; + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + + unlock_page(cl2vm_page(slice)); + + EXIT; +} + +static void vvp_page_discard(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct page *vmpage = cl2vm_page(slice); + struct vvp_page *vpg = cl2vvp_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + + if (vpg->vpg_defer_uptodate && !vpg->vpg_ra_used && vmpage->mapping) + ll_ra_stats_inc(vmpage->mapping->host, RA_STAT_DISCARDED); + + generic_error_remove_page(vmpage->mapping, vmpage); +} + +static void vvp_page_export(const struct lu_env *env, + const struct cl_page_slice *slice, + int uptodate) +{ + struct page *vmpage = cl2vm_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + if (uptodate) + SetPageUptodate(vmpage); + else + ClearPageUptodate(vmpage); +} + +static int vvp_page_is_vmlocked(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + return PageLocked(cl2vm_page(slice)) ? -EBUSY : -ENODATA; +} + +static int vvp_page_prep_read(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + ENTRY; + /* Skip the page already marked as PG_uptodate. */ + RETURN(PageUptodate(cl2vm_page(slice)) ? 
-EALREADY : 0); +} + +static int vvp_page_prep_write(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct page *vmpage = cl2vm_page(slice); + struct cl_page *pg = slice->cpl_page; + + LASSERT(PageLocked(vmpage)); + LASSERT(!PageDirty(vmpage)); + + /* ll_writepage path is not a sync write, so need to set page writeback + * flag + */ + if (pg->cp_sync_io == NULL) + set_page_writeback(vmpage); + + return 0; +} + +static void vvp_page_delete(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + struct cl_page *cp = slice->cpl_page; + + if (cp->cp_type == CPT_CACHEABLE) { + struct page *vmpage = cp->cp_vmpage; + struct inode *inode = vmpage->mapping->host; + + LASSERT(PageLocked(vmpage)); + LASSERT((struct cl_page *)vmpage->private == cp); + + /* Drop the reference count held in vvp_page_init */ + atomic_dec(&cp->cp_ref); + + ClearPagePrivate(vmpage); + vmpage->private = 0; + + /* clearpageuptodate prevents the page being read by the + * kernel after it has been deleted from Lustre, which avoids + * potential stale data reads. The seqlock allows us to see + * that a page was potentially deleted and catch the resulting + * SIGBUS - see ll_filemap_fault() (LU-16160) */ + write_seqlock(&ll_i2info(inode)->lli_page_inv_lock); + ClearPageUptodate(vmpage); + write_sequnlock(&ll_i2info(inode)->lli_page_inv_lock); + + /* + * The reference from vmpage to cl_page is removed, + * but the reference back is still here. It is removed + * later in cl_page_free(). + */ + } +} + +/** + * Handles page transfer errors at VM level. + * + * This takes inode as a separate argument, because inode on which error is to + * be set can be different from \a vmpage inode in case of direct-io. + */ +static void vvp_vmpage_error(struct inode *inode, struct page *vmpage, + int ioret) +{ + struct vvp_object *obj = cl_inode2vvp(inode); + + if (ioret == 0) { + ClearPageError(vmpage); + obj->vob_discard_page_warned = 0; + } else { + SetPageError(vmpage); + if (ioret == -ENOSPC) + set_bit(AS_ENOSPC, &inode->i_mapping->flags); + else + set_bit(AS_EIO, &inode->i_mapping->flags); + + if ((ioret == -ESHUTDOWN || ioret == -EINTR || + ioret == -EIO) && obj->vob_discard_page_warned == 0) { + obj->vob_discard_page_warned = 1; + ll_dirty_page_discard_warn(inode, ioret); + } + } +} + +static void vvp_page_completion_read(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + struct vvp_page *vpg = cl2vvp_page(slice); + struct page *vmpage = vpg->vpg_page; + struct cl_page *page = slice->cpl_page; + struct inode *inode = vvp_object_inode(page->cp_obj); + + ENTRY; + LASSERT(PageLocked(vmpage)); + CL_PAGE_HEADER(D_PAGE, env, page, "completing READ with %d\n", ioret); + + if (vpg->vpg_defer_uptodate) + ll_ra_count_put(ll_i2sbi(inode), 1); + + if (ioret == 0) { + if (!vpg->vpg_defer_uptodate) + cl_page_export(env, page, 1); + } else if (vpg->vpg_defer_uptodate) { + vpg->vpg_defer_uptodate = 0; + if (ioret == -EAGAIN) { + /* mirror read failed, it needs to destroy the page + * because subpage would be from wrong osc when trying + * to read from a new mirror + */ + generic_error_remove_page(vmpage->mapping, vmpage); + } + } + + if (page->cp_sync_io == NULL) + unlock_page(vmpage); + + EXIT; +} + +static void vvp_page_completion_write(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + struct vvp_page *vpg = cl2vvp_page(slice); + struct cl_page *pg = slice->cpl_page; + struct page *vmpage = vpg->vpg_page; + + ENTRY; + 
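+	/* Sync writes (cp_sync_io set) complete with the page still
+	 * locked, while async ll_writepage writes complete under the
+	 * writeback bit set in vvp_page_prep_write().
+	 */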
CL_PAGE_HEADER(D_PAGE, env, pg, "completing WRITE with %d\n", ioret);
+
+	if (pg->cp_sync_io != NULL) {
+		LASSERT(PageLocked(vmpage));
+		LASSERT(!PageWriteback(vmpage));
+	} else {
+		LASSERT(PageWriteback(vmpage));
+		/*
+		 * Only mark the page in error when it's an async write,
+		 * because applications won't wait for IO to finish.
+		 */
+		vvp_vmpage_error(vvp_object_inode(pg->cp_obj), vmpage, ioret);
+
+		end_page_writeback(vmpage);
+	}
+	EXIT;
+}
+
+/**
+ * Implements cl_page_operations::cpo_make_ready() method.
+ *
+ * This is called to yank a page from the transfer cache and to send it out as
+ * a part of transfer. This function try-locks the page. If the try-lock
+ * fails, the page is owned by some concurrent IO and should be skipped (this
+ * is bad, but hopefully a rare situation, as it usually results in the
+ * transfer being shorter than possible).
+ *
+ * \retval 0	   success, page can be placed into transfer
+ *
+ * \retval -EAGAIN page is either used by concurrent IO or has been
+ *		   truncated. Skip it.
+ */
+static int vvp_page_make_ready(const struct lu_env *env,
+			       const struct cl_page_slice *slice)
+{
+	struct page *vmpage = cl2vm_page(slice);
+	struct cl_page *pg = slice->cpl_page;
+	int result = 0;
+
+	lock_page(vmpage);
+	if (clear_page_dirty_for_io(vmpage)) {
+		LASSERT(pg->cp_state == CPS_CACHED);
+		/* This actually clears the dirty bit in the radix
+		 * tree.
+		 */
+		set_page_writeback(vmpage);
+		CL_PAGE_HEADER(D_PAGE, env, pg, "readied\n");
+	} else if (pg->cp_state == CPS_PAGEOUT) {
+		/* is it possible for osc_flush_async_page() to already
+		 * make it ready?
+		 */
+		result = -EALREADY;
+	} else {
+		CL_PAGE_DEBUG(D_ERROR, env, pg, "Unexpected page state %d.\n",
+			      pg->cp_state);
+		LBUG();
+	}
+	unlock_page(vmpage);
+	RETURN(result);
+}
+
+static int vvp_page_print(const struct lu_env *env,
+			  const struct cl_page_slice *slice,
+			  void *cookie, lu_printer_t printer)
+{
+	struct vvp_page *vpg = cl2vvp_page(slice);
+	struct page *vmpage = vpg->vpg_page;
+
+	(*printer)(env, cookie,
+		   LUSTRE_VVP_NAME"-page@%p(%d:%d) vm@%p ",
+		   vpg, vpg->vpg_defer_uptodate, vpg->vpg_ra_used, vmpage);
+
+	if (vmpage != NULL) {
+		(*printer)(env, cookie, "%lx %d:%d %lx %lu %slru",
+			   (long)vmpage->flags, page_count(vmpage),
+			   page_mapcount(vmpage), vmpage->private,
+			   page_index(vmpage),
+			   list_empty(&vmpage->lru) ? "not-" : "");
+	}
+
+	(*printer)(env, cookie, "\n");
+
+	return 0;
+}
+
+static int vvp_page_fail(const struct lu_env *env,
+			 const struct cl_page_slice *slice)
+{
+	/*
+	 * Cached read?
+	 */
+	LBUG();
+
+	return 0;
+}
+
+static const struct cl_page_operations vvp_page_ops = {
+	.cpo_own	   = vvp_page_own,
+	.cpo_assume	   = vvp_page_assume,
+	.cpo_unassume	   = vvp_page_unassume,
+	.cpo_disown	   = vvp_page_disown,
+	.cpo_discard	   = vvp_page_discard,
+	.cpo_delete	   = vvp_page_delete,
+	.cpo_export	   = vvp_page_export,
+	.cpo_is_vmlocked   = vvp_page_is_vmlocked,
+	.cpo_fini	   = vvp_page_fini,
+	.cpo_print	   = vvp_page_print,
+	.io = {
+		[CRT_READ] = {
+			.cpo_prep	= vvp_page_prep_read,
+			.cpo_completion	= vvp_page_completion_read,
+			.cpo_make_ready	= vvp_page_fail,
+		},
+		[CRT_WRITE] = {
+			.cpo_prep	= vvp_page_prep_write,
+			.cpo_completion	= vvp_page_completion_write,
+			.cpo_make_ready	= vvp_page_make_ready,
+		},
+	},
+};
+
+static void vvp_transient_page_discard(const struct lu_env *env,
+				       const struct cl_page_slice *slice,
+				       struct cl_io *unused)
+{
+	struct cl_page *page = slice->cpl_page;
+
+	/*
+	 * For transient pages, remove it from the radix tree.
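+	 * cl_page_delete() transitions the page into the CPS_FREEING
+	 * state and detaches it from the object.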
+ */ + cl_page_delete(env, page); +} + +static int vvp_transient_page_is_vmlocked(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + return -EBUSY; +} + +static const struct cl_page_operations vvp_transient_page_ops = { + .cpo_discard = vvp_transient_page_discard, + .cpo_is_vmlocked = vvp_transient_page_is_vmlocked, + .cpo_print = vvp_page_print, +}; + +int vvp_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index) +{ + struct vvp_page *vpg = cl_object_page_slice(obj, page); + struct page *vmpage = page->cp_vmpage; + + CLOBINVRNT(env, obj, vvp_object_invariant(obj)); + + vpg->vpg_page = vmpage; + + if (page->cp_type == CPT_TRANSIENT) { + /* DIO pages are referenced by userspace, we don't need to take + * a reference on them. (contrast with get_page() call above) + */ + cl_page_slice_add(page, &vpg->vpg_cl, obj, + &vvp_transient_page_ops); + } else { + get_page(vmpage); + /* in cache, decref in vvp_page_delete */ + atomic_inc(&page->cp_ref); + SetPagePrivate(vmpage); + vmpage->private = (unsigned long)page; + cl_page_slice_add(page, &vpg->vpg_cl, obj, + &vvp_page_ops); + } + + return 0; +} diff --git a/drivers/staging/lustrefsx/lustre/llite/xattr.c b/drivers/staging/lustrefsx/lustre/llite/xattr.c new file mode 100644 index 0000000000000..0f04ab22f61ec --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/xattr.c @@ -0,0 +1,934 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#include +#include +#include +#include +#ifdef HAVE_LINUX_SELINUX_IS_ENABLED +#include +#endif + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include + +#include "llite_internal.h" + +#ifndef HAVE_XATTR_HANDLER_NAME +static inline const char *xattr_prefix(const struct xattr_handler *handler) +{ + return handler->prefix; +} +#endif + +const struct xattr_handler *get_xattr_type(const char *name) +{ + int i; + + for (i = 0; ll_xattr_handlers[i]; i++) { + const char *prefix = xattr_prefix(ll_xattr_handlers[i]); + size_t prefix_len = strlen(prefix); + + if (!strncmp(prefix, name, prefix_len)) + return ll_xattr_handlers[i]; + } + + return NULL; +} + +static int xattr_type_filter(struct ll_sb_info *sbi, + const struct xattr_handler *handler) +{ + /* No handler means XATTR_OTHER_T */ + if (!handler) + return -EOPNOTSUPP; + + if ((handler->flags == XATTR_ACL_ACCESS_T || + handler->flags == XATTR_ACL_DEFAULT_T) && + !test_bit(LL_SBI_ACL, sbi->ll_flags)) + return -EOPNOTSUPP; + + if (handler->flags == XATTR_USER_T && + !test_bit(LL_SBI_USER_XATTR, sbi->ll_flags)) + return -EOPNOTSUPP; + + if (handler->flags == XATTR_TRUSTED_T && + !capable(CAP_SYS_ADMIN)) + return -EPERM; + + return 0; +} + +#ifndef HAVE_USER_NAMESPACE_ARG +#define ll_xattr_set_common(hd, ns, de, inode, name, value, size, flags) \ + ll_xattr_set_common(hd, de, inode, name, value, size, flags) +#endif + +static int ll_xattr_set_common(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, size_t size, + int flags) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + const char *pv = value; + char *fullname; + ktime_t kstart = ktime_get(); + u64 valid; + int rc; + ENTRY; + + /* When setxattr() is called with a size of 0 the value is + * unconditionally replaced by "". When removexattr() is + * called we get a NULL value and XATTR_REPLACE for flags. */ + if (!value && flags == XATTR_REPLACE) + valid = OBD_MD_FLXATTRRM; + else + valid = OBD_MD_FLXATTR; + + /* FIXME: enable IMA when the conditions are ready */ + if (handler->flags == XATTR_SECURITY_T && + (!strcmp(name, "ima") || !strcmp(name, "evm"))) + RETURN(-EOPNOTSUPP); + + rc = xattr_type_filter(sbi, handler); + if (rc) + RETURN(rc); + + if ((handler->flags == XATTR_ACL_ACCESS_T || + handler->flags == XATTR_ACL_DEFAULT_T) && + !inode_owner_or_capable(mnt_userns, inode)) + RETURN(-EPERM); + + /* b10667: ignore lustre special xattr for now */ + if (!strcmp(name, "hsm") || + ((handler->flags == XATTR_TRUSTED_T && !strcmp(name, "lov")) || + (handler->flags == XATTR_LUSTRE_T && !strcmp(name, "lov")))) + RETURN(0); + + rc = ll_security_secctx_name_filter(sbi, handler->flags, name); + if (rc) + RETURN(rc); + + /* + * In user.* namespace, only regular files and directories can have + * extended attributes. + */ + if (handler->flags == XATTR_USER_T) { + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + RETURN(-EPERM); + } + + /* This check is required for compatibility with 2.14, in which + * encryption context is stored in security.c xattr. + * Setting the encryption context should only be possible by llcrypt + * when defining an encryption policy on a directory. + * When new files/dirs are created in an encrypted dir, the enc + * context is set directly in the create request. 
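+	 * Hence a direct setxattr() of the "security.c" name is rejected
+	 * with -EPERM below.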
+ */ + if (handler->flags == XATTR_SECURITY_T && strcmp(name, "c") == 0) + RETURN(-EPERM); + + fullname = kasprintf(GFP_KERNEL, "%s%s", xattr_prefix(handler), name); + if (!fullname) + RETURN(-ENOMEM); + + rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode), valid, fullname, + pv, size, flags, ll_i2suppgid(inode), &req); + kfree(fullname); + if (rc) { + if (rc == -EOPNOTSUPP && handler->flags == XATTR_USER_T) { + LCONSOLE_INFO("Disabling user_xattr feature because it is not supported on the server\n"); + clear_bit(LL_SBI_USER_XATTR, sbi->ll_flags); + } + RETURN(rc); + } + + ptlrpc_req_finished(req); + + ll_stats_ops_tally(ll_i2sbi(inode), valid == OBD_MD_FLXATTRRM ? + LPROC_LL_REMOVEXATTR : LPROC_LL_SETXATTR, + ktime_us_delta(ktime_get(), kstart)); + + RETURN(0); +} + +int ll_get_hsm_state(struct inode *inode, u32 *hus_states) +{ + struct md_op_data *op_data; + struct hsm_user_state *hus; + int rc; + + OBD_ALLOC_PTR(hus); + if (!hus) + return -ENOMEM; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, hus); + if (!IS_ERR(op_data)) { + rc = obd_iocontrol(LL_IOC_HSM_STATE_GET, ll_i2mdexp(inode), + sizeof(*op_data), op_data, NULL); + if (!rc) + *hus_states = hus->hus_states; + else + CDEBUG(D_VFSTRACE, "obd_iocontrol failed. rc = %d\n", + rc); + + ll_finish_md_op_data(op_data); + } else { + rc = PTR_ERR(op_data); + CDEBUG(D_VFSTRACE, "Could not prepare the opdata. rc = %d\n", + rc); + } + OBD_FREE_PTR(hus); + return rc; +} + +static int ll_adjust_lum(struct inode *inode, struct lov_user_md *lump, size_t size) +{ + struct lov_comp_md_v1 *comp_v1 = (struct lov_comp_md_v1 *)lump; + struct lov_user_md *v1 = lump; + bool need_clear_release = false; + bool release_checked = false; + bool is_composite = false; + u16 entry_count = 1; + int rc = 0; + int i; + + if (!lump) + return 0; + + if (lump->lmm_magic == LOV_USER_MAGIC_COMP_V1) { + if (size < sizeof(*comp_v1)) + return -ERANGE; + + entry_count = comp_v1->lcm_entry_count; + if (size < offsetof(typeof(*comp_v1), lcm_entries[entry_count])) + return -ERANGE; + is_composite = true; + } + + for (i = 0; i < entry_count; i++) { + if (lump->lmm_magic == LOV_USER_MAGIC_COMP_V1) { + void *ptr = comp_v1; + + if (comp_v1->lcm_entries[i].lcme_offset + sizeof(*v1) > + size) + return -ERANGE; + + ptr += comp_v1->lcm_entries[i].lcme_offset; + v1 = (struct lov_user_md *)ptr; + } + + /* + * Attributes that are saved via getxattr will always + * have the stripe_offset as 0. Instead, the MDS + * should be allowed to pick the starting OST index. + * b=17846 + */ + if (!is_composite && v1->lmm_stripe_offset == 0) + v1->lmm_stripe_offset = -1; + + /* Avoid anyone directly setting the RELEASED flag. */ + if (v1->lmm_pattern & LOV_PATTERN_F_RELEASED) { + if (!release_checked) { + u32 state = HS_NONE; + + rc = ll_get_hsm_state(inode, &state); + if (rc) + return rc; + + if (!(state & HS_ARCHIVED)) + need_clear_release = true; + release_checked = true; + } + if (need_clear_release) + v1->lmm_pattern ^= LOV_PATTERN_F_RELEASED; + } + } + + return rc; +} + +static int ll_setstripe_ea(struct dentry *dentry, struct lov_user_md *lump, + size_t size) +{ + struct inode *inode = dentry->d_inode; + int rc = 0; + + /* + * It is possible to set an xattr to a "" value of zero size. + * For this case we are going to treat it as a removal. + */ + if (!size && lump) + lump = NULL; + + if (size && size < sizeof(*lump)) { + /* ll_adjust_lum() or ll_lov_user_md_size() might access + * before size - just give up now. 
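+	 * (sizeof(struct lov_user_md) is the minimum valid payload here)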
+ */ + return -ERANGE; + } + rc = ll_adjust_lum(inode, lump, size); + if (rc) + return rc; + + if (lump && S_ISREG(inode->i_mode)) { + u64 it_flags = FMODE_WRITE; + ssize_t lum_size; + + lum_size = ll_lov_user_md_size(lump); + if (lum_size < 0 || size < lum_size) + return -ERANGE; + + rc = ll_lov_setstripe_ea_info(inode, dentry, it_flags, lump, + lum_size); + /** + * b=10667: ignore -EEXIST. + * Silently eat error on setting trusted.lov/lustre.lov + * attribute for platforms that added the default option + * to copy all attributes in 'cp' command. Both rsync and + * tar --xattrs also will try to set LOVEA for existing + * files. + */ + if (rc == -EEXIST) + rc = 0; + } else if (S_ISDIR(inode->i_mode)) { + if (size != 0 && size < sizeof(struct lov_user_md)) + return -EINVAL; + + rc = ll_dir_setstripe(inode, lump, 0); + } + + return rc; +} + +#ifndef HAVE_USER_NAMESPACE_ARG +#define ll_xattr_set(hd, ns, de, inode, name, value, size, flags) \ + ll_xattr_set(hd, de, inode, name, value, size, flags) +#endif + +static int ll_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, size_t size, + int flags) +{ + ktime_t kstart = ktime_get(); + int op_type = flags == XATTR_REPLACE ? LPROC_LL_REMOVEXATTR : + LPROC_LL_SETXATTR; + int rc; + + LASSERT(inode); + LASSERT(name); + + CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p), xattr %s\n", + PFID(ll_inode2fid(inode)), inode, name); + + /* lustre/trusted.lov.xxx would be passed through xattr API */ + if (!strcmp(name, "lov")) { + rc = ll_setstripe_ea(dentry, (struct lov_user_md *)value, + size); + ll_stats_ops_tally(ll_i2sbi(inode), op_type, + ktime_us_delta(ktime_get(), kstart)); + return rc; + } else if (!strcmp(name, "lma") || !strcmp(name, "link")) { + ll_stats_ops_tally(ll_i2sbi(inode), op_type, + ktime_us_delta(ktime_get(), kstart)); + return 0; + } + + if (strncmp(name, "lov.", 4) == 0 && + (__swab32(((struct lov_user_md *)value)->lmm_magic) & + le32_to_cpu(LOV_MAGIC_MASK)) == le32_to_cpu(LOV_MAGIC_MAGIC)) + lustre_swab_lov_user_md((struct lov_user_md *)value, 0); + + return ll_xattr_set_common(handler, mnt_userns, dentry, inode, name, + value, size, flags); +} + +int ll_xattr_list(struct inode *inode, const char *name, int type, void *buffer, + size_t size, u64 valid) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + void *xdata; + int rc; + ENTRY; + + /* This check is required for compatibility with 2.14, in which + * encryption context is stored in security.c xattr. Accessing the + * encryption context should only be possible by llcrypt. 
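+	 * A direct getxattr() of "security.c" is therefore rejected with
+	 * -EPERM below.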
+ */ + if (type == XATTR_SECURITY_T && strcmp(name, "security.c") == 0) + GOTO(out_xattr, rc = -EPERM); + + if (sbi->ll_xattr_cache_enabled && type != XATTR_ACL_ACCESS_T && + (type != XATTR_SECURITY_T || !ll_xattr_is_seclabel(name)) && + (type != XATTR_TRUSTED_T || strcmp(name, XATTR_NAME_SOM))) { + rc = ll_xattr_cache_get(inode, name, buffer, size, valid); + if (rc == -EAGAIN) + goto getxattr_nocache; + if (rc < 0) + GOTO(out_xattr, rc); + + /* Add "system.posix_acl_access" to the list */ + if (lli->lli_posix_acl && valid & OBD_MD_FLXATTRLS) { + if (size == 0) { + rc += sizeof(XATTR_NAME_ACL_ACCESS); + } else if (size - rc >= sizeof(XATTR_NAME_ACL_ACCESS)) { + memcpy(buffer + rc, XATTR_NAME_ACL_ACCESS, + sizeof(XATTR_NAME_ACL_ACCESS)); + rc += sizeof(XATTR_NAME_ACL_ACCESS); + } else { + GOTO(out_xattr, rc = -ERANGE); + } + } + } else { +getxattr_nocache: + rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), valid, + name, size, &req); + if (rc < 0) + GOTO(out_xattr, rc); + + /* only detect the xattr size */ + if (size == 0) + GOTO(out, rc); + + if (size < rc) + GOTO(out, rc = -ERANGE); + + /* do not need swab xattr data */ + xdata = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, + rc); + if (!xdata) + GOTO(out, rc = -EPROTO); + + memcpy(buffer, xdata, rc); + } + + EXIT; + +out_xattr: + if (rc == -EOPNOTSUPP && type == XATTR_USER_T) { + LCONSOLE_INFO("%s: disabling user_xattr feature because " + "it is not supported on the server: rc = %d\n", + sbi->ll_fsname, rc); + clear_bit(LL_SBI_USER_XATTR, sbi->ll_flags); + } +out: + ptlrpc_req_finished(req); + RETURN(rc); +} + +static int ll_xattr_get_common(const struct xattr_handler *handler, + struct dentry *dentry, + struct inode *inode, + const char *name, void *buffer, size_t size) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + ktime_t kstart = ktime_get(); + char *fullname; + int rc; + + ENTRY; + + rc = xattr_type_filter(sbi, handler); + if (rc) + RETURN(rc); + + rc = ll_security_secctx_name_filter(sbi, handler->flags, name); + if (rc) + RETURN(rc); + +#ifdef CONFIG_LUSTRE_FS_POSIX_ACL + /* posix acl is under protection of LOOKUP lock. when calling to this, + * we just have path resolution to the target inode, so we have great + * chance that cached ACL is uptodate. 
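+	 * The cached ACL is duplicated under lli_lock and converted to
+	 * xattr format with posix_acl_to_xattr().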
+ */ + if (handler->flags == XATTR_ACL_ACCESS_T) { + struct ll_inode_info *lli = ll_i2info(inode); + struct posix_acl *acl; + + read_lock(&lli->lli_lock); + acl = posix_acl_dup(lli->lli_posix_acl); + read_unlock(&lli->lli_lock); + + if (!acl) + RETURN(-ENODATA); + + rc = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); + posix_acl_release(acl); + RETURN(rc); + } + if (handler->flags == XATTR_ACL_DEFAULT_T && !S_ISDIR(inode->i_mode)) + RETURN(-ENODATA); +#endif + + fullname = kasprintf(GFP_KERNEL, "%s%s", xattr_prefix(handler), name); + if (!fullname) + RETURN(-ENOMEM); + + rc = ll_xattr_list(inode, fullname, handler->flags, buffer, size, + OBD_MD_FLXATTR); + kfree(fullname); + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR, + ktime_us_delta(ktime_get(), kstart)); + + RETURN(rc); +} + +static ssize_t ll_getxattr_lov(struct inode *inode, void *buf, size_t buf_size) +{ + ssize_t rc; + + if (S_ISREG(inode->i_mode)) { + struct cl_object *obj = ll_i2info(inode)->lli_clob; + struct cl_layout cl = { + .cl_buf.lb_buf = buf, + .cl_buf.lb_len = buf_size, + }; + struct lu_env *env; + u16 refcheck; + + if (!obj) + RETURN(-ENODATA); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + rc = cl_object_layout_get(env, obj, &cl); + if (rc < 0) + GOTO(out_env, rc); + + if (!cl.cl_size) + GOTO(out_env, rc = -ENODATA); + + rc = cl.cl_size; + + if (!buf_size) + GOTO(out_env, rc); + + LASSERT(buf && rc <= buf_size); + + /* + * Do not return layout gen for getxattr() since + * otherwise it would confuse tar --xattr by + * recognizing layout gen as stripe offset when the + * file is restored. See LU-2809. + */ + if ((((struct lov_mds_md *)buf)->lmm_magic & + __swab32(LOV_MAGIC_MAGIC)) == __swab32(LOV_MAGIC_MAGIC)) + lustre_swab_lov_user_md((struct lov_user_md *)buf, + cl.cl_size); + + switch (((struct lov_mds_md *)buf)->lmm_magic) { + case LOV_MAGIC_V1: + case LOV_MAGIC_V3: + case LOV_MAGIC_SPECIFIC: + ((struct lov_mds_md *)buf)->lmm_layout_gen = 0; + break; + case LOV_MAGIC_COMP_V1: + case LOV_MAGIC_FOREIGN: + goto out_env; + default: + CERROR("Invalid LOV magic %08x\n", + ((struct lov_mds_md *)buf)->lmm_magic); + GOTO(out_env, rc = -EINVAL); + } + +out_env: + cl_env_put(env, &refcheck); + + RETURN(rc); + } else if (S_ISDIR(inode->i_mode)) { + struct ptlrpc_request *req = NULL; + struct ptlrpc_request *root_req = NULL; + struct lov_mds_md *lmm = NULL; + int lmm_size = 0; + + rc = ll_dir_getstripe_default(inode, (void **)&lmm, &lmm_size, + &req, &root_req, 0); + if (rc < 0) + GOTO(out_req, rc); + + if (!buf_size) + GOTO(out_req, rc = lmm_size); + + if (buf_size < lmm_size) + GOTO(out_req, rc = -ERANGE); + + memcpy(buf, lmm, lmm_size); + GOTO(out_req, rc = lmm_size); +out_req: + if (req) + ptlrpc_req_finished(req); + if (root_req) + ptlrpc_req_finished(root_req); + + RETURN(rc); + } else { + RETURN(-ENODATA); + } +} + +static int ll_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + LASSERT(inode); + LASSERT(name); + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), xattr %s\n", + PFID(ll_inode2fid(inode)), inode, name); + + if (!strcmp(name, "lov")) { + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR, 1); + + return ll_getxattr_lov(inode, buffer, size); + } + + return ll_xattr_get_common(handler, dentry, inode, name, buffer, size); +} + +ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct inode *inode = dentry->d_inode; + struct ll_sb_info *sbi = 
ll_i2sbi(inode); + ktime_t kstart = ktime_get(); + char *xattr_name; + ssize_t rc, rc2; + size_t len, rem; + + LASSERT(inode); + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n", + PFID(ll_inode2fid(inode)), inode); + + rc = ll_xattr_list(inode, NULL, XATTR_OTHER_T, buffer, size, + OBD_MD_FLXATTRLS); + if (rc < 0) + RETURN(rc); + + /* + * If we're being called to get the size of the xattr list + * (size == 0) then just assume that a lustre.lov xattr + * exists. + */ + if (!size) + goto out; + + xattr_name = buffer; + rem = rc; + + while (rem > 0) { + const struct xattr_handler *xh = get_xattr_type(xattr_name); + bool hide_xattr = false; + + /* Hide virtual project id xattr from the list when + * parent has the inherit flag and the same project id, + * so project id won't be messed up by copying the xattrs + * when mv to a tree with different project id. + */ + if (xh && xh->flags == XATTR_TRUSTED_T && + strcmp(xattr_name, XATTR_NAME_PROJID) == 0) { + struct inode *dir = d_inode(dentry->d_parent); + + if ((ll_i2info(inode)->lli_projid == + ll_i2info(dir)->lli_projid) && + test_bit(LLIF_PROJECT_INHERIT, + &ll_i2info(dir)->lli_flags)) + hide_xattr = true; + } else if (xh && xh->flags == XATTR_SECURITY_T && + strcmp(xattr_name, "security.c") == 0) { + /* Listing xattrs should not expose encryption + * context. There is no handler defined for + * XATTR_ENCRYPTION_PREFIX, so this test is just + * needed for compatibility with 2.14, in which + * encryption context is stored in security.c xattr. + */ + hide_xattr = true; + } + + len = strnlen(xattr_name, rem - 1) + 1; + rem -= len; + if (!xattr_type_filter(sbi, hide_xattr ? NULL : xh)) { + /* Skip OK xattr type, leave it in buffer. */ + xattr_name += len; + continue; + } + + /* + * Move up remaining xattrs in buffer + * removing the xattr that is not OK. 
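+	 * memmove() is required below because the source and destination
+	 * ranges overlap.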
+ */ + memmove(xattr_name, xattr_name + len, rem); + rc -= len; + } + + rc2 = ll_getxattr_lov(inode, NULL, 0); + if (rc2 == -ENODATA) + RETURN(rc); + + if (rc2 < 0) + RETURN(rc2); + + if (size < rc + sizeof(XATTR_LUSTRE_LOV)) + RETURN(-ERANGE); + + memcpy(buffer + rc, XATTR_LUSTRE_LOV, sizeof(XATTR_LUSTRE_LOV)); + +out: + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LISTXATTR, + ktime_us_delta(ktime_get(), kstart)); + + RETURN(rc + sizeof(XATTR_LUSTRE_LOV)); +} + +#ifdef HAVE_XATTR_HANDLER_SIMPLIFIED +static int ll_xattr_get_common_4_3(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) +{ + return ll_xattr_get_common(handler, dentry, dentry->d_inode, name, + buffer, size); +} + +static int ll_xattr_get_4_3(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) +{ + return ll_xattr_get(handler, dentry, dentry->d_inode, name, buffer, + size); +} + +static int ll_xattr_set_common_4_3(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + return ll_xattr_set_common(handler, dentry, dentry->d_inode, name, + value, size, flags); +} + +static int ll_xattr_set_4_3(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + return ll_xattr_set(handler, dentry, dentry->d_inode, name, value, + size, flags); +} + +#elif !defined(HAVE_USER_NAMESPACE_ARG) && \ +!defined(HAVE_XATTR_HANDLER_INODE_PARAM) +const struct xattr_handler *get_xattr_handler(int handler_flag) +{ + int i = 0; + + while (ll_xattr_handlers[i]) { + if (ll_xattr_handlers[i]->flags == handler_flag) + return ll_xattr_handlers[i]; + i++; + } + return NULL; +} + +static int ll_xattr_get_common_3_11(struct dentry *dentry, const char *name, + void *buffer, size_t size, int handler_flags) +{ + const struct xattr_handler *handler = get_xattr_handler(handler_flags); + + if (!handler) + return -ENXIO; + + return ll_xattr_get_common(handler, dentry, dentry->d_inode, name, + buffer, size); +} + +static int ll_xattr_get_3_11(struct dentry *dentry, const char *name, + void *buffer, size_t size, int handler_flags) +{ + const struct xattr_handler *handler = get_xattr_handler(handler_flags); + + if (!handler) + return -ENXIO; + + return ll_xattr_get(handler, dentry, dentry->d_inode, name, buffer, + size); +} + +static int ll_xattr_set_common_3_11(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, + int handler_flags) +{ + const struct xattr_handler *handler = get_xattr_handler(handler_flags); + + if (!handler) + return -ENXIO; + + return ll_xattr_set_common(handler, NULL, dentry, dentry->d_inode, name, + value, size, flags); +} + +static int ll_xattr_set_3_11(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, + int handler_flags) +{ + const struct xattr_handler *handler = get_xattr_handler(handler_flags); + + if (!handler) + return -ENXIO; + + return ll_xattr_set(handler, NULL, dentry, dentry->d_inode, name, value, + size, flags); +} +#endif + +static const struct xattr_handler ll_user_xattr_handler = { + .prefix = XATTR_USER_PREFIX, + .flags = XATTR_USER_T, +#if defined(HAVE_XATTR_HANDLER_SIMPLIFIED) + .get = ll_xattr_get_common_4_3, + .set = ll_xattr_set_common_4_3, +#elif !defined(HAVE_USER_NAMESPACE_ARG) && \ +!defined(HAVE_XATTR_HANDLER_INODE_PARAM) + .get = ll_xattr_get_common_3_11, + .set = ll_xattr_set_common_3_11, +#else + 
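+	/* Kernels with the inode-aware or user-namespace-aware
+	 * xattr_handler prototypes use the common helpers directly.
+	 */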
.get = ll_xattr_get_common, + .set = ll_xattr_set_common, +#endif +}; + +static const struct xattr_handler ll_trusted_xattr_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .flags = XATTR_TRUSTED_T, +#if defined(HAVE_XATTR_HANDLER_SIMPLIFIED) + .get = ll_xattr_get_4_3, + .set = ll_xattr_set_4_3, +#elif !defined(HAVE_USER_NAMESPACE_ARG) && \ +!defined(HAVE_XATTR_HANDLER_INODE_PARAM) + .get = ll_xattr_get_3_11, + .set = ll_xattr_set_3_11, +#else + .get = ll_xattr_get, + .set = ll_xattr_set, +#endif +}; + +static const struct xattr_handler ll_security_xattr_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .flags = XATTR_SECURITY_T, +#if defined(HAVE_XATTR_HANDLER_SIMPLIFIED) + .get = ll_xattr_get_common_4_3, + .set = ll_xattr_set_common_4_3, +#elif !defined(HAVE_USER_NAMESPACE_ARG) && \ +!defined(HAVE_XATTR_HANDLER_INODE_PARAM) + .get = ll_xattr_get_common_3_11, + .set = ll_xattr_set_common_3_11, +#else + .get = ll_xattr_get_common, + .set = ll_xattr_set_common, +#endif +}; + +static const struct xattr_handler ll_acl_access_xattr_handler = { +#ifdef HAVE_XATTR_HANDLER_NAME + .name = XATTR_NAME_POSIX_ACL_ACCESS, +#else + .prefix = XATTR_NAME_POSIX_ACL_ACCESS, +#endif + .flags = XATTR_ACL_ACCESS_T, +#if defined(HAVE_XATTR_HANDLER_SIMPLIFIED) + .get = ll_xattr_get_common_4_3, + .set = ll_xattr_set_common_4_3, +#elif !defined(HAVE_USER_NAMESPACE_ARG) && \ +!defined(HAVE_XATTR_HANDLER_INODE_PARAM) + .get = ll_xattr_get_common_3_11, + .set = ll_xattr_set_common_3_11, +#else + .get = ll_xattr_get_common, + .set = ll_xattr_set_common, +#endif +}; + +static const struct xattr_handler ll_acl_default_xattr_handler = { +#ifdef HAVE_XATTR_HANDLER_NAME + .name = XATTR_NAME_POSIX_ACL_DEFAULT, +#else + .prefix = XATTR_NAME_POSIX_ACL_DEFAULT, +#endif + .flags = XATTR_ACL_DEFAULT_T, +#if defined(HAVE_XATTR_HANDLER_SIMPLIFIED) + .get = ll_xattr_get_common_4_3, + .set = ll_xattr_set_common_4_3, +#elif !defined(HAVE_USER_NAMESPACE_ARG) && \ +!defined(HAVE_XATTR_HANDLER_INODE_PARAM) + .get = ll_xattr_get_common_3_11, + .set = ll_xattr_set_common_3_11, +#else + .get = ll_xattr_get_common, + .set = ll_xattr_set_common, +#endif +}; + +static const struct xattr_handler ll_lustre_xattr_handler = { + .prefix = XATTR_LUSTRE_PREFIX, + .flags = XATTR_LUSTRE_T, +#if defined(HAVE_XATTR_HANDLER_SIMPLIFIED) + .get = ll_xattr_get_4_3, + .set = ll_xattr_set_4_3, +#elif !defined(HAVE_USER_NAMESPACE_ARG) && \ +!defined(HAVE_XATTR_HANDLER_INODE_PARAM) + .get = ll_xattr_get_3_11, + .set = ll_xattr_set_3_11, +#else + .get = ll_xattr_get, + .set = ll_xattr_set, +#endif +}; + +const struct xattr_handler *ll_xattr_handlers[] = { + &ll_user_xattr_handler, + &ll_trusted_xattr_handler, + &ll_security_xattr_handler, +#ifdef CONFIG_LUSTRE_FS_POSIX_ACL + &ll_acl_access_xattr_handler, + &ll_acl_default_xattr_handler, +#endif + &ll_lustre_xattr_handler, + NULL, +}; diff --git a/drivers/staging/lustrefsx/lustre/llite/xattr_cache.c b/drivers/staging/lustrefsx/lustre/llite/xattr_cache.c new file mode 100644 index 0000000000000..0a751744e4f20 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/xattr_cache.c @@ -0,0 +1,671 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + * + * Copyright (c) 2013, 2017, Intel Corporation. + * + * Author: Andrew Perepechko + * + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include +#include +#include "llite_internal.h" + +/* If we ever have hundreds of extended attributes, we might want to consider + * using a hash or a tree structure instead of list for faster lookups. + */ +struct ll_xattr_entry { + struct list_head xe_list; /* protected with + * lli_xattrs_list_rwsem */ + char *xe_name; /* xattr name, \0-terminated */ + char *xe_value; /* xattr value */ + unsigned xe_namelen; /* strlen(xe_name) + 1 */ + unsigned xe_vallen; /* xattr value length */ +}; + +static struct kmem_cache *xattr_kmem; +static struct lu_kmem_descr xattr_caches[] = { + { + .ckd_cache = &xattr_kmem, + .ckd_name = "xattr_kmem", + .ckd_size = sizeof(struct ll_xattr_entry) + }, + { + .ckd_cache = NULL + } +}; + +int ll_xattr_init(void) +{ + return lu_kmem_init(xattr_caches); +} + +void ll_xattr_fini(void) +{ + lu_kmem_fini(xattr_caches); +} + +/** + * Initializes xattr cache for an inode. + * + * This initializes the xattr list and marks cache presence. + */ +static void ll_xattr_cache_init(struct ll_inode_info *lli) +{ + ENTRY; + + LASSERT(lli != NULL); + + INIT_LIST_HEAD(&lli->lli_xattrs); + set_bit(LLIF_XATTR_CACHE, &lli->lli_flags); +} + +/** + * This looks for a specific extended attribute. + * + * Find in @cache and return @xattr_name attribute in @xattr, + * for the NULL @xattr_name return the first cached @xattr. + * + * \retval 0 success + * \retval -ENODATA if not found + */ +static int ll_xattr_cache_find(struct list_head *cache, + const char *xattr_name, + struct ll_xattr_entry **xattr) +{ + struct ll_xattr_entry *entry; + + ENTRY; + + list_for_each_entry(entry, cache, xe_list) { + /* xattr_name == NULL means look for any entry */ + if (xattr_name == NULL || + strcmp(xattr_name, entry->xe_name) == 0) { + *xattr = entry; + CDEBUG(D_CACHE, "find: [%s]=%.*s\n", + entry->xe_name, entry->xe_vallen, + entry->xe_value); + RETURN(0); + } + } + + RETURN(-ENODATA); +} + +/** + * This adds an xattr. 
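+ * Duplicate entries are rejected, except for the encryption context
+ * xattr, which is silently ignored when already cached.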
+ * + * Add @xattr_name attr with @xattr_val value and @xattr_val_len length, + * + * \retval 0 success + * \retval -ENOMEM if no memory could be allocated for the cached attr + * \retval -EPROTO if duplicate xattr is being added + */ +static int ll_xattr_cache_add(struct list_head *cache, + const char *xattr_name, + const char *xattr_val, + unsigned xattr_val_len) +{ + struct ll_xattr_entry *xattr; + + ENTRY; + + if (ll_xattr_cache_find(cache, xattr_name, &xattr) == 0) { + if (!strcmp(xattr_name, LL_XATTR_NAME_ENCRYPTION_CONTEXT) || + !strcmp(xattr_name, LL_XATTR_NAME_ENCRYPTION_CONTEXT_OLD)) + /* it means enc ctx was already in cache, + * ignore error as it cannot be modified + */ + RETURN(0); + + CDEBUG(D_CACHE, "duplicate xattr: [%s]\n", xattr_name); + RETURN(-EPROTO); + } + + OBD_SLAB_ALLOC_PTR_GFP(xattr, xattr_kmem, GFP_NOFS); + if (xattr == NULL) { + CDEBUG(D_CACHE, "failed to allocate xattr\n"); + RETURN(-ENOMEM); + } + + xattr->xe_namelen = strlen(xattr_name) + 1; + + OBD_ALLOC(xattr->xe_name, xattr->xe_namelen); + if (!xattr->xe_name) { + CDEBUG(D_CACHE, "failed to alloc xattr name %u\n", + xattr->xe_namelen); + goto err_name; + } + OBD_ALLOC(xattr->xe_value, xattr_val_len); + if (!xattr->xe_value) { + CDEBUG(D_CACHE, "failed to alloc xattr value %d\n", + xattr_val_len); + goto err_value; + } + + memcpy(xattr->xe_name, xattr_name, xattr->xe_namelen); + memcpy(xattr->xe_value, xattr_val, xattr_val_len); + xattr->xe_vallen = xattr_val_len; + list_add(&xattr->xe_list, cache); + + CDEBUG(D_CACHE, "set: [%s]=%.*s\n", xattr_name, + xattr_val_len, xattr_val); + + RETURN(0); +err_value: + OBD_FREE(xattr->xe_name, xattr->xe_namelen); +err_name: + OBD_SLAB_FREE_PTR(xattr, xattr_kmem); + + RETURN(-ENOMEM); +} + +/** + * This removes an extended attribute from cache. + * + * Remove @xattr_name attribute from @cache. + * + * \retval 0 success + * \retval -ENODATA if @xattr_name is not cached + */ +static int ll_xattr_cache_del(struct list_head *cache, + const char *xattr_name) +{ + struct ll_xattr_entry *xattr; + + ENTRY; + + CDEBUG(D_CACHE, "del xattr: %s\n", xattr_name); + + if (ll_xattr_cache_find(cache, xattr_name, &xattr) == 0) { + list_del(&xattr->xe_list); + OBD_FREE(xattr->xe_name, xattr->xe_namelen); + OBD_FREE(xattr->xe_value, xattr->xe_vallen); + OBD_SLAB_FREE_PTR(xattr, xattr_kmem); + + RETURN(0); + } + + RETURN(-ENODATA); +} + +/** + * This iterates cached extended attributes. + * + * Walk over cached attributes in @cache and + * fill in @xld_buffer or only calculate buffer + * size if @xld_buffer is NULL. + * + * \retval >= 0 buffer list size + * \retval -ENODATA if the list cannot fit @xld_size buffer + */ +static int ll_xattr_cache_list(struct list_head *cache, + char *xld_buffer, + int xld_size) +{ + struct ll_xattr_entry *xattr, *tmp; + int xld_tail = 0; + + ENTRY; + + list_for_each_entry_safe(xattr, tmp, cache, xe_list) { + CDEBUG(D_CACHE, "list: buffer=%p[%d] name=%s\n", + xld_buffer, xld_tail, xattr->xe_name); + + if (xld_buffer) { + xld_size -= xattr->xe_namelen; + if (xld_size < 0) + break; + memcpy(&xld_buffer[xld_tail], + xattr->xe_name, xattr->xe_namelen); + } + xld_tail += xattr->xe_namelen; + } + + if (xld_size < 0) + RETURN(-ERANGE); + + RETURN(xld_tail); +} + +/** + * Check if the xattr cache is initialized. + * + * \retval 0 @cache is not initialized + * \retval 1 @cache is initialized + */ +static int ll_xattr_cache_valid(struct ll_inode_info *lli) +{ + return test_bit(LLIF_XATTR_CACHE, &lli->lli_flags); +} + +/** + * Check if the xattr cache is filled. 
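+ * "Filled" means the complete xattr list was fetched from the MDS,
+ * not merely that the cache structure is initialized.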
+ * + * \retval 0 @cache is not filled + * \retval 1 @cache is filled + */ +static int ll_xattr_cache_filled(struct ll_inode_info *lli) +{ + return test_bit(LLIF_XATTR_CACHE_FILLED, &lli->lli_flags); +} + +/** + * This finalizes the xattr cache. + * + * Free all xattr memory. @lli is the inode info pointer. + * + * \retval 0 no error occured + */ +static int ll_xattr_cache_destroy_locked(struct ll_inode_info *lli) +{ + ENTRY; + + if (!ll_xattr_cache_valid(lli)) + RETURN(0); + + while (ll_xattr_cache_del(&lli->lli_xattrs, NULL) == 0) + /* empty loop */ ; + + clear_bit(LLIF_XATTR_CACHE_FILLED, &lli->lli_flags); + clear_bit(LLIF_XATTR_CACHE, &lli->lli_flags); + + RETURN(0); +} + +int ll_xattr_cache_destroy(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + int rc; + + ENTRY; + + down_write(&lli->lli_xattrs_list_rwsem); + rc = ll_xattr_cache_destroy_locked(lli); + up_write(&lli->lli_xattrs_list_rwsem); + + RETURN(rc); +} + +/** + * ll_xattr_cache_empty - empty xattr cache for @ino + * + * Similar to ll_xattr_cache_destroy(), but preserves encryption context. + * So only LLIF_XATTR_CACHE_FILLED flag is cleared, but not LLIF_XATTR_CACHE. + */ +int ll_xattr_cache_empty(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_xattr_entry *entry, *n; + + ENTRY; + + down_write(&lli->lli_xattrs_list_rwsem); + if (!ll_xattr_cache_valid(lli) || + !ll_xattr_cache_filled(lli)) + GOTO(out_empty, 0); + + list_for_each_entry_safe(entry, n, &lli->lli_xattrs, xe_list) { + if (strcmp(entry->xe_name, xattr_for_enc(inode)) == 0) + continue; + + CDEBUG(D_CACHE, "delete: %s\n", entry->xe_name); + list_del(&entry->xe_list); + OBD_FREE(entry->xe_name, entry->xe_namelen); + OBD_FREE(entry->xe_value, entry->xe_vallen); + OBD_SLAB_FREE_PTR(entry, xattr_kmem); + } + clear_bit(LLIF_XATTR_CACHE_FILLED, &lli->lli_flags); + +out_empty: + up_write(&lli->lli_xattrs_list_rwsem); + RETURN(0); +} + +/** + * Match or enqueue a PR lock. + * + * Find or request an LDLM lock with xattr data. + * Since LDLM does not provide API for atomic match_or_enqueue, + * the function handles it with a separate enq lock. + * If successful, the function exits with a write lock held + * on lli_xattrs_list_rwsem. + * + * \retval 0 no error occured + * \retval -ENOMEM not enough memory + */ +static int ll_xattr_find_get_lock(struct inode *inode, + struct lookup_intent *oit, + struct ptlrpc_request **req) +{ + enum ldlm_mode mode; + struct lustre_handle lockh = { 0 }; + struct md_op_data *op_data; + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct obd_export *exp = sbi->ll_md_exp; + int rc; + + ENTRY; + + mutex_lock(&lli->lli_xattrs_enq_lock); + /* inode may have been shrunk and recreated, so data is gone, match lock + * only when data exists. */ + if (ll_xattr_cache_filled(lli)) { + /* Try matching first. */ + mode = ll_take_md_lock(inode, MDS_INODELOCK_XATTR, &lockh, 0, + LCK_PR); + if (mode != 0) { + /* fake oit in mdc_revalidate_lock() manner */ + oit->it_lock_handle = lockh.cookie; + oit->it_lock_mode = mode; + goto out; + } + } + + /* Enqueue if the lock isn't cached locally. 
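+	 * lli_xattrs_enq_lock is held here, so only one thread issues
+	 * the enqueue RPC at a time.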
 */
+	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+				     LUSTRE_OPC_ANY, NULL);
+	if (IS_ERR(op_data)) {
+		mutex_unlock(&lli->lli_xattrs_enq_lock);
+		RETURN(PTR_ERR(op_data));
+	}
+
+	op_data->op_valid = OBD_MD_FLXATTR | OBD_MD_FLXATTRLS;
+
+	rc = md_intent_lock(exp, op_data, oit, req, &ll_md_blocking_ast, 0);
+	ll_finish_md_op_data(op_data);
+	*req = oit->it_request;
+
+	if (rc < 0) {
+		CDEBUG(D_CACHE,
+		       "md_intent_lock failed with %d for fid "DFID"\n",
+		       rc, PFID(ll_inode2fid(inode)));
+		mutex_unlock(&lli->lli_xattrs_enq_lock);
+		RETURN(rc);
+	}
+
+out:
+	down_write(&lli->lli_xattrs_list_rwsem);
+	mutex_unlock(&lli->lli_xattrs_enq_lock);
+
+	RETURN(0);
+}
+
+/**
+ * Refill the xattr cache.
+ *
+ * Fetch and cache the full set of xattrs for @inode, under the write lock
+ * on lli_xattrs_list_rwsem obtained from ll_xattr_find_get_lock().
+ * If successful, this write lock is kept.
+ *
+ * \retval 0       no error occurred
+ * \retval -EPROTO network protocol error
+ * \retval -ENOMEM not enough memory for the cache
+ */
+static int ll_xattr_cache_refill(struct inode *inode)
+{
+	struct lookup_intent oit = { .it_op = IT_GETXATTR };
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	struct ptlrpc_request *req = NULL;
+	const char *xdata, *xval, *xtail, *xvtail;
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct mdt_body *body;
+	__u32 *xsizes;
+	int rc = 0, i;
+
+	ENTRY;
+
+	CFS_FAIL_TIMEOUT(OBD_FAIL_LLITE_XATTR_PAUSE, cfs_fail_val ?: 2);
+
+	rc = ll_xattr_find_get_lock(inode, &oit, &req);
+	if (rc)
+		GOTO(err_req, rc);
+
+	/* Do we have the data at this point? */
+	if (ll_xattr_cache_filled(lli)) {
+		ll_stats_ops_tally(sbi, LPROC_LL_GETXATTR_HITS, 1);
+		ll_intent_drop_lock(&oit);
+		GOTO(err_req, rc = 0);
+	}
+
+	/* Matched but no cache? Cancelled on error by a parallel refill.
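+	 * Back off and let the caller retry with -EAGAIN.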
*/ + if (unlikely(req == NULL)) { + CDEBUG(D_CACHE, "cancelled by a parallel getxattr\n"); + ll_intent_drop_lock(&oit); + GOTO(err_unlock, rc = -EAGAIN); + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) { + CERROR("no MDT BODY in the refill xattr reply\n"); + GOTO(err_cancel, rc = -EPROTO); + } + /* do not need swab xattr data */ + xdata = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, + body->mbo_eadatasize); + xval = req_capsule_server_sized_get(&req->rq_pill, &RMF_EAVALS, + body->mbo_aclsize); + xsizes = req_capsule_server_sized_get(&req->rq_pill, &RMF_EAVALS_LENS, + body->mbo_max_mdsize * + sizeof(__u32)); + if (xdata == NULL || xval == NULL || xsizes == NULL) { + CERROR("wrong setxattr reply\n"); + GOTO(err_cancel, rc = -EPROTO); + } + + xtail = xdata + body->mbo_eadatasize; + xvtail = xval + body->mbo_aclsize; + + CDEBUG(D_CACHE, "caching: xdata=%p xtail=%p\n", xdata, xtail); + + if (!ll_xattr_cache_valid(lli)) + ll_xattr_cache_init(lli); + + for (i = 0; i < body->mbo_max_mdsize; i++) { + CDEBUG(D_CACHE, "caching [%s]=%.*s\n", xdata, *xsizes, xval); + /* Perform consistency checks: attr names and vals in pill */ + if (memchr(xdata, 0, xtail - xdata) == NULL) { + CERROR("xattr protocol violation (names are broken)\n"); + rc = -EPROTO; + } else if (xval + *xsizes > xvtail) { + CERROR("xattr protocol violation (vals are broken)\n"); + rc = -EPROTO; + } else if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_XATTR_ENOMEM)) { + rc = -ENOMEM; + } else if (!strcmp(xdata, XATTR_NAME_ACL_ACCESS)) { + /* Filter out ACL ACCESS since it's cached separately */ + CDEBUG(D_CACHE, "not caching %s\n", + XATTR_NAME_ACL_ACCESS); + rc = 0; + } else if (ll_xattr_is_seclabel(xdata)) { + /* Filter out security label, it is cached in slab */ + CDEBUG(D_CACHE, "not caching %s\n", xdata); + rc = 0; + } else if (!strcmp(xdata, XATTR_NAME_SOM)) { + /* Filter out trusted.som, it is not cached on client */ + CDEBUG(D_CACHE, "not caching trusted.som\n"); + rc = 0; + } else { + rc = ll_xattr_cache_add(&lli->lli_xattrs, xdata, xval, + *xsizes); + } + if (rc < 0) { + ll_xattr_cache_destroy_locked(lli); + GOTO(err_cancel, rc); + } + xdata += strlen(xdata) + 1; + xval += *xsizes; + xsizes++; + } + + if (xdata != xtail || xval != xvtail) + CERROR("a hole in xattr data\n"); + else + set_bit(LLIF_XATTR_CACHE_FILLED, &lli->lli_flags); + + ll_set_lock_data(sbi->ll_md_exp, inode, &oit, NULL); + ll_intent_drop_lock(&oit); + + ptlrpc_req_finished(req); + RETURN(0); + +err_cancel: + ldlm_lock_decref_and_cancel((struct lustre_handle *) + &oit.it_lock_handle, + oit.it_lock_mode); +err_unlock: + up_write(&lli->lli_xattrs_list_rwsem); +err_req: + if (rc == -ERANGE) + rc = -EAGAIN; + + ptlrpc_req_finished(req); + RETURN(rc); +} + +/** + * Get an xattr value or list xattrs using the write-through cache. + * + * Get the xattr value (@valid has OBD_MD_FLXATTR set) of @name or + * list xattr names (@valid has OBD_MD_FLXATTRLS set) for @inode. + * The resulting value/list is stored in @buffer if the former + * is not larger than @size. 
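+ * A zero @size only queries the required buffer size in the return
+ * value.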
+ * + * \retval 0 no error occured + * \retval -EPROTO network protocol error + * \retval -ENOMEM not enough memory for the cache + * \retval -ERANGE the buffer is not large enough + * \retval -ENODATA no such attr or the list is empty + */ +int ll_xattr_cache_get(struct inode *inode, + const char *name, + char *buffer, + size_t size, + __u64 valid) +{ + struct ll_inode_info *lli = ll_i2info(inode); + int rc = 0; + + ENTRY; + + LASSERT(!!(valid & OBD_MD_FLXATTR) ^ !!(valid & OBD_MD_FLXATTRLS)); + + down_read(&lli->lli_xattrs_list_rwsem); + /* For performance reasons, we do not want to refill complete xattr + * cache if we are just interested in encryption context. + */ + if ((valid & OBD_MD_FLXATTRLS || + strcmp(name, xattr_for_enc(inode)) != 0) && + !ll_xattr_cache_filled(lli)) { + up_read(&lli->lli_xattrs_list_rwsem); + rc = ll_xattr_cache_refill(inode); + if (rc) + RETURN(rc); + /* Turn the write lock obtained in ll_xattr_cache_refill() + * into a read lock. + */ + downgrade_write(&lli->lli_xattrs_list_rwsem); + } else { + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR_HITS, 1); + } + + if (!ll_xattr_cache_valid(lli)) + GOTO(out, rc = -ENODATA); + + if (valid & OBD_MD_FLXATTR) { + struct ll_xattr_entry *xattr; + + rc = ll_xattr_cache_find(&lli->lli_xattrs, name, &xattr); + if (rc == 0) { + rc = xattr->xe_vallen; + /* zero size means we are only requested size in rc */ + if (size != 0) { + if (size >= xattr->xe_vallen) + memcpy(buffer, xattr->xe_value, + xattr->xe_vallen); + else + rc = -ERANGE; + } + /* Return the project id when the virtual project id xattr + * is explicitly asked. + */ + } else if (strcmp(name, XATTR_NAME_PROJID) == 0) { + /* 10 chars to hold u32 in decimal, plus ending \0 */ + char projid[11]; + + rc = snprintf(projid, sizeof(projid), + "%u", lli->lli_projid); + if (size != 0) { + if (rc <= size) + memcpy(buffer, projid, rc); + else + rc = -ERANGE; + } + } + } else if (valid & OBD_MD_FLXATTRLS) { + rc = ll_xattr_cache_list(&lli->lli_xattrs, + size ? buffer : NULL, size); + } + + GOTO(out, rc); +out: + up_read(&lli->lli_xattrs_list_rwsem); + + RETURN(rc); +} + +/** + * Insert an xattr value into the cache. + * + * Add @name xattr with @buffer value and @size length for @inode. + * Init cache for @inode if necessary. + * + * \retval 0 success + * \retval < 0 from ll_xattr_cache_add(), except -EPROTO is ignored for + * LL_XATTR_NAME_ENCRYPTION_CONTEXT xattr + */ +int ll_xattr_cache_insert(struct inode *inode, + const char *name, + char *buffer, + size_t size) +{ + struct ll_inode_info *lli = ll_i2info(inode); + int rc; + + ENTRY; + + down_write(&lli->lli_xattrs_list_rwsem); + if (!ll_xattr_cache_valid(lli)) + ll_xattr_cache_init(lli); + rc = ll_xattr_cache_add(&lli->lli_xattrs, name, buffer, size); + up_write(&lli->lli_xattrs_list_rwsem); + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/xattr_security.c b/drivers/staging/lustrefsx/lustre/llite/xattr_security.c new file mode 100644 index 0000000000000..df34ab353efb3 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/xattr_security.c @@ -0,0 +1,328 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * GPL HEADER END + */ + +/* + * Copyright (c) 2014 Bull SAS + * + * Copyright (c) 2015, 2016, Intel Corporation. + * Author: Sebastien Buisson sebastien.buisson@bull.net + */ + +/* + * lustre/llite/xattr_security.c + * Handler for storing security labels as extended attributes. + */ + +#include +#include +#ifdef HAVE_LINUX_SELINUX_IS_ENABLED +#include +#endif +#include +#include "llite_internal.h" + +#ifndef XATTR_SELINUX_SUFFIX +# define XATTR_SELINUX_SUFFIX "selinux" +#endif + +#ifndef XATTR_NAME_SELINUX +# define XATTR_NAME_SELINUX XATTR_SECURITY_PREFIX XATTR_SELINUX_SUFFIX +#endif + +#ifdef HAVE_SECURITY_DENTRY_INIT_SECURTY_WITH_CTX +#define HAVE_SECURITY_DENTRY_INIT_WITH_XATTR_NAME_ARG 1 +#endif + +/* + * Check for LL_SBI_FILE_SECCTX before calling. + */ +int ll_dentry_init_security(struct dentry *dentry, int mode, struct qstr *name, + const char **secctx_name, __u32 *secctx_name_size, + void **secctx, __u32 *secctx_size, int *secctx_slot) +{ + struct ll_sb_info *sbi = ll_s2sbi(dentry->d_sb); +#ifdef HAVE_SECURITY_DENTRY_INIT_WITH_XATTR_NAME_ARG + const char *secctx_name_lsm = NULL; +#endif +#ifdef HAVE_SECURITY_DENTRY_INIT_SECURTY_WITH_CTX + struct lsmcontext ctx = {}; +#endif + int rc; + + /* + * Before kernel 5.15-rc1-20-g15bf32398ad4, + * security_inode_init_security() does not return to us the name of the + * extended attribute to store the context under (for example + * "security.selinux"). So we only call it when we think we know what + * the name of the extended attribute will be. This is OK-ish since + * SELinux is the only module that implements + * security_dentry_init_security(). Note that the NFS client code just + * calls it and assumes that if anything is returned then it must come + * from SELinux. + */ + + *secctx_name_size = ll_secctx_name_get(sbi, secctx_name); + /* xattr name length == 0 means no LSM module manage file contexts */ + if (*secctx_name_size == 0) + return 0; + + rc = security_dentry_init_security(dentry, mode, name, +#ifdef HAVE_SECURITY_DENTRY_INIT_WITH_XATTR_NAME_ARG + &secctx_name_lsm, +#endif +#ifdef HAVE_SECURITY_DENTRY_INIT_SECURTY_WITH_CTX + &ctx); +#else + secctx, secctx_size); +#endif + /* ignore error if the hook is not supported by the LSM module */ + if (rc == -EOPNOTSUPP) + return 0; + if (rc < 0) + return rc; + +#ifdef HAVE_SECURITY_DENTRY_INIT_SECURTY_WITH_CTX + *secctx = ctx.context; + *secctx_size = ctx.len; + *secctx_slot = ctx.slot; +#endif + +#ifdef HAVE_SECURITY_DENTRY_INIT_WITH_XATTR_NAME_ARG + if (strncmp(*secctx_name, secctx_name_lsm, *secctx_name_size) != 0) { + CERROR("%s: LSM secctx_name '%s' does not match the one stored by Lustre '%s'\n", + sbi->ll_fsname, secctx_name_lsm, *secctx_name); + return -EOPNOTSUPP; + } +#endif + + return 0; +} + +/** + * A helper function for security_inode_init_security() + * that takes care of setting xattrs + * + * Get security context of @inode from @xattr_array, + * and put it in 'security.xxx' xattr of dentry + * stored in @fs_info. 
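+ * This is the initxattrs callback passed to
+ * security_inode_init_security() by ll_inode_init_security() below.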
+ * + * \retval 0 success + * \retval -ENOMEM if no memory could be allocated for xattr name + * \retval < 0 failure to set xattr + */ +static int +ll_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) +{ + struct dentry *dentry = fs_info; + const struct xattr *xattr; + int err = 0; + + for (xattr = xattr_array; xattr->name; xattr++) { + char *full_name; + + full_name = kasprintf(GFP_KERNEL, "%s%s", + XATTR_SECURITY_PREFIX, xattr->name); + if (!full_name) { + err = -ENOMEM; + break; + } + + err = ll_vfs_setxattr(dentry, inode, full_name, xattr->value, + xattr->value_len, XATTR_CREATE); + kfree(full_name); + if (err < 0) + break; + } + return err; +} + +/** + * Initializes security context + * + * Get security context of @inode in @dir, + * and put it in 'security.xxx' xattr of @dentry. + * + * \retval 0 success, or SELinux is disabled + * \retval -ENOMEM if no memory could be allocated for xattr name + * \retval < 0 failure to get security context or set xattr + */ +int +ll_inode_init_security(struct dentry *dentry, struct inode *inode, + struct inode *dir) +{ + int rc; + + if (!ll_security_xattr_wanted(dir)) + return 0; + + rc = security_inode_init_security(inode, dir, NULL, + &ll_initxattrs, dentry); + if (rc == -EOPNOTSUPP) + return 0; + + return rc; +} + +/** + * Notify security context to the security layer + * + * Notify security context @secctx of inode @inode to the security layer. + * + * \retval 0 success, or SELinux is disabled or not supported by the fs + * \retval < 0 failure to set the security context + */ +int ll_inode_notifysecctx(struct inode *inode, + void *secctx, __u32 secctxlen) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + int rc; + + if (!test_bit(LL_SBI_FILE_SECCTX, sbi->ll_flags) || + !ll_security_xattr_wanted(inode) || + !secctx || !secctxlen) + return 0; + + /* no need to protect selinux_inode_setsecurity() by + * inode_lock. Taking it would lead to a client deadlock + * LU-13617 + */ + rc = security_inode_notifysecctx(inode, secctx, secctxlen); + if (rc) + CWARN("%s: cannot set security context for "DFID": rc = %d\n", + sbi->ll_fsname, PFID(ll_inode2fid(inode)), rc); + + return rc; +} + +/** + * Free the security context xattr name used by policy + */ +void ll_secctx_name_free(struct ll_sb_info *sbi) +{ + OBD_FREE(sbi->ll_secctx_name, sbi->ll_secctx_name_size + 1); + sbi->ll_secctx_name = NULL; + sbi->ll_secctx_name_size = 0; +} + +/** + * Get security context xattr name used by policy and save it. 
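
security_inode_init_security() hands ll_initxattrs() bare suffixes such as
"selinux", so the full on-disk name must be composed with
XATTR_SECURITY_PREFIX before ll_vfs_setxattr() is called. A self-contained
sketch of that composition (a user-space stand-in for the kasprintf() call
above; the helper name is hypothetical):

#include <stdio.h>

#define XATTR_SECURITY_PREFIX "security."

/* Compose "security.<suffix>"; returns 0 on success, -1 if the name
 * would not fit in the output buffer. */
static int make_security_xattr_name(char *out, size_t outlen,
				    const char *suffix)
{
	int n = snprintf(out, outlen, "%s%s", XATTR_SECURITY_PREFIX, suffix);

	return (n < 0 || (size_t)n >= outlen) ? -1 : 0;
}

/* make_security_xattr_name(buf, sizeof(buf), "selinux")
 *   -> buf == "security.selinux" */
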
+ * + * \retval > 0 length of xattr name + * \retval == 0 no LSM module registered supporting security contexts + * \retval <= 0 failure to get xattr name or xattr is not supported + */ +int ll_secctx_name_store(struct inode *in) +{ + struct ll_sb_info *sbi = ll_i2sbi(in); + int rc = 0; + + if (!ll_security_xattr_wanted(in)) + return 0; + + /* get size of xattr name */ + rc = security_inode_listsecurity(in, NULL, 0); + if (rc <= 0) + return rc; + + if (sbi->ll_secctx_name) + ll_secctx_name_free(sbi); + + OBD_ALLOC(sbi->ll_secctx_name, rc + 1); + if (!sbi->ll_secctx_name) + return -ENOMEM; + + /* save the xattr name */ + sbi->ll_secctx_name_size = rc; + rc = security_inode_listsecurity(in, sbi->ll_secctx_name, + sbi->ll_secctx_name_size); + if (rc <= 0) + goto err_free; + + if (rc > sbi->ll_secctx_name_size) { + rc = -ERANGE; + goto err_free; + } + + /* sanity check */ + sbi->ll_secctx_name[rc] = '\0'; + if (rc < sizeof(XATTR_SECURITY_PREFIX)) { + rc = -EINVAL; + goto err_free; + } + if (strncmp(sbi->ll_secctx_name, XATTR_SECURITY_PREFIX, + sizeof(XATTR_SECURITY_PREFIX) - 1) != 0) { + rc = -EOPNOTSUPP; + goto err_free; + } + + return rc; + +err_free: + ll_secctx_name_free(sbi); + return rc; +} + +/** + * Retrieved file security context xattr name stored. + * + * \retval security context xattr name size stored. + * \retval 0 no xattr name stored. + */ +__u32 ll_secctx_name_get(struct ll_sb_info *sbi, const char **secctx_name) +{ + if (!sbi->ll_secctx_name || !sbi->ll_secctx_name_size) + return 0; + + *secctx_name = sbi->ll_secctx_name; + + return sbi->ll_secctx_name_size; +} + +/** + * Filter out xattr file security context if not managed by LSM + * + * This is done to improve performance for application that blindly try to get + * file context (like "ls -l" for security.linux). + * See LU-549 for more information. + * + * \retval 0 xattr not filtered + * \retval -EOPNOTSUPP no enabled LSM security module supports the xattr + */ +int ll_security_secctx_name_filter(struct ll_sb_info *sbi, int xattr_type, + const char *suffix) +{ + const char *cached_suffix = NULL; + + if (xattr_type != XATTR_SECURITY_T || + !ll_xattr_suffix_is_seclabel(suffix)) + return 0; + + /* is the xattr label used by lsm ? */ + if (!ll_secctx_name_get(sbi, &cached_suffix)) + return -EOPNOTSUPP; + + cached_suffix += sizeof(XATTR_SECURITY_PREFIX) - 1; + if (strcmp(suffix, cached_suffix) != 0) + return -EOPNOTSUPP; + + return 0; +} diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_fld.c b/drivers/staging/lustrefsx/lustre/lmv/lmv_fld.c new file mode 100644 index 0000000000000..0b76f7b028835 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lmv/lmv_fld.c @@ -0,0 +1,87 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
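
ll_security_secctx_name_filter() above compares only the suffix of the cached
LSM xattr name, skipping the "security." prefix with sizeof(literal) - 1 (the
compile-time strlen of a string literal). The same check, restated as a
stand-alone function:

#include <string.h>

#define XATTR_SECURITY_PREFIX "security."

/* Given a cached full name such as "security.selinux", skip the prefix
 * and compare only the suffix the caller asked for. */
static int suffix_matches(const char *cached_full_name, const char *suffix)
{
	/* sizeof("security.") - 1 == strlen("security."), at compile time */
	const char *cached_suffix =
		cached_full_name + sizeof(XATTR_SECURITY_PREFIX) - 1;

	return strcmp(suffix, cached_suffix) == 0;
}

/* suffix_matches("security.selinux", "selinux") -> 1
 * suffix_matches("security.selinux", "smack")   -> 0 (-EOPNOTSUPP above) */
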
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_LMV +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include "lmv_internal.h" + +int lmv_fld_lookup(struct lmv_obd *lmv, const struct lu_fid *fid, u32 *mds) +{ + struct obd_device *obd = lmv2obd_dev(lmv); + int rc; + + ENTRY; + + /* + * FIXME: Currently ZFS still use local seq for ROOT unfortunately, and + * this fid_is_local check should be removed once LU-2240 is fixed + */ + if (!fid_is_sane(fid) || !(fid_seq_in_fldb(fid_seq(fid)) || + fid_seq_is_local_file(fid_seq(fid)))) { + rc = -EINVAL; + CERROR("%s: invalid FID "DFID": rc = %d\n", obd->obd_name, + PFID(fid), rc); + RETURN(rc); + } + + rc = fld_client_lookup(&lmv->lmv_fld, fid_seq(fid), mds, + LU_SEQ_RANGE_MDT, NULL); + if (rc) { + CERROR("%s: Error while looking for mds number. Seq %#llx: rc = %d\n", + obd->obd_name, fid_seq(fid), rc); + RETURN(rc); + } + + CDEBUG(D_INODE, "FLD lookup got mds #%x for fid="DFID"\n", + *mds, PFID(fid)); + + if (*mds >= lmv->lmv_mdt_descs.ltd_tgts_size) { + rc = -EINVAL; + CERROR("%s: FLD lookup got invalid mds #%x (max: %x) for fid="DFID": rc = %d\n", + obd->obd_name, *mds, lmv->lmv_mdt_descs.ltd_tgts_size, + PFID(fid), rc); + } + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_intent.c b/drivers/staging/lustrefsx/lustre/lmv/lmv_intent.c new file mode 100644 index 0000000000000..97f1d9f592de0 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lmv/lmv_intent.c @@ -0,0 +1,595 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. 
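
lmv_fld_lookup() delegates the actual mapping to the FLD client, which caches
FID-sequence ranges and the MDT index owning each range, falling back to an
RPC to the FLD server on a miss. A toy model of the cache lookup, assuming
disjoint ranges as in the real cache (all names below are illustrative):

#include <stddef.h>

struct seq_range {
	unsigned long long start, end;  /* half-open range of sequences */
	unsigned int mdt_index;         /* MDT owning this range */
};

static int toy_fld_lookup(const struct seq_range *cache, size_t n,
			  unsigned long long seq, unsigned int *mdt)
{
	size_t i;

	for (i = 0; i < n; i++) {
		if (seq >= cache[i].start && seq < cache[i].end) {
			*mdt = cache[i].mdt_index;
			return 0;
		}
	}
	return -1;  /* miss: the real client would RPC to the FLD server */
}
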
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_LMV +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include "lmv_internal.h" + +static int lmv_intent_remote(struct obd_export *exp, struct lookup_intent *it, + const struct lu_fid *parent_fid, + struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags, + const char *secctx_name, __u32 secctx_name_size) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct ptlrpc_request *req = NULL; + struct lustre_handle plock; + struct md_op_data *op_data; + struct lmv_tgt_desc *tgt; + struct mdt_body *body; + int pmode; + int rc = 0; + ENTRY; + + body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + + LASSERT((body->mbo_valid & OBD_MD_MDS)); + + /* + * We got LOOKUP lock, but we really need attrs. + */ + pmode = it->it_lock_mode; + if (pmode) { + plock.cookie = it->it_lock_handle; + it->it_lock_mode = 0; + it->it_request = NULL; + } + + LASSERT(fid_is_sane(&body->mbo_fid1)); + + tgt = lmv_fid2tgt(lmv, &body->mbo_fid1); + if (IS_ERR(tgt)) + GOTO(out, rc = PTR_ERR(tgt)); + + OBD_ALLOC_PTR(op_data); + if (op_data == NULL) + GOTO(out, rc = -ENOMEM); + + op_data->op_fid1 = body->mbo_fid1; + /* Sent the parent FID to the remote MDT */ + if (parent_fid != NULL) { + /* The parent fid is only for remote open to + * check whether the open is from OBF, + * see mdt_cross_open */ + LASSERT(it->it_op & IT_OPEN); + op_data->op_fid2 = *parent_fid; + } + + op_data->op_bias = MDS_CROSS_REF; + op_data->op_cli_flags = CLI_NO_SLOT; + CDEBUG(D_INODE, "REMOTE_INTENT with fid="DFID" -> mds #%u\n", + PFID(&body->mbo_fid1), tgt->ltd_index); + + /* ask for security context upon intent */ + if (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_OPEN) && + secctx_name_size != 0 && secctx_name != NULL) { + op_data->op_file_secctx_name = secctx_name; + op_data->op_file_secctx_name_size = secctx_name_size; + CDEBUG(D_SEC, "'%.*s' is security xattr to fetch for " + DFID"\n", + secctx_name_size, secctx_name, PFID(&body->mbo_fid1)); + } + + rc = md_intent_lock(tgt->ltd_exp, op_data, it, &req, cb_blocking, + extra_lock_flags); + if (rc) + GOTO(out_free_op_data, rc); + + /* + * LLite needs LOOKUP lock to track dentry revocation in order to + * maintain dcache consistency. Thus drop UPDATE|PERM lock here + * and put LOOKUP in request. 
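
The pmode/plock handling in lmv_intent_remote() above stashes the parent's
LOOKUP lock cookie, clears the intent's lock slot so md_intent_lock() can
install the remote MDT's lock, and finally restores the parent lock. A
stand-alone model of that stash/restore pattern (types and names are
illustrative only):

struct toy_intent {
	unsigned long long lock_handle;   /* cookie of the held lock */
	int lock_mode;                    /* 0 == no lock held */
};

static void intent_remote_sketch(struct toy_intent *it,
				 void (*remote_intent)(struct toy_intent *))
{
	unsigned long long parent_cookie = it->lock_handle;
	int pmode = it->lock_mode;

	it->lock_mode = 0;        /* free the slot for the remote lock */
	remote_intent(it);        /* may fill lock_handle/lock_mode */
	/* the remote lock is remembered separately (it_remote_lock_* in
	 * the real code), then the parent LOOKUP lock is put back */
	if (pmode) {
		it->lock_handle = parent_cookie;
		it->lock_mode = pmode;
	}
}
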
+	 */
+	if (it->it_lock_mode != 0) {
+		it->it_remote_lock_handle =
+			it->it_lock_handle;
+		it->it_remote_lock_mode = it->it_lock_mode;
+	}
+
+	if (pmode) {
+		it->it_lock_handle = plock.cookie;
+		it->it_lock_mode = pmode;
+	}
+
+	EXIT;
+out_free_op_data:
+	OBD_FREE_PTR(op_data);
+out:
+	if (rc && pmode)
+		ldlm_lock_decref(&plock, pmode);
+
+	ptlrpc_req_finished(*reqp);
+	*reqp = req;
+	return rc;
+}
+
+int lmv_revalidate_slaves(struct obd_export *exp,
+			  const struct lmv_stripe_md *lsm,
+			  ldlm_blocking_callback cb_blocking,
+			  int extra_lock_flags)
+{
+	struct obd_device *obd = exp->exp_obd;
+	struct lmv_obd *lmv = &obd->u.lmv;
+	struct ptlrpc_request *req = NULL;
+	struct mdt_body *body;
+	struct md_op_data *op_data;
+	int i;
+	int valid_stripe_count = 0;
+	int rc = 0;
+
+	ENTRY;
+
+	/**
+	 * revalidate slaves has some problems, temporarily return,
+	 * we may not need that
+	 */
+	OBD_ALLOC_PTR(op_data);
+	if (op_data == NULL)
+		RETURN(-ENOMEM);
+
+	/**
+	 * Loop over the stripe information, check validity and update them
+	 * from MDS if needed.
+	 */
+	for (i = 0; i < lsm->lsm_md_stripe_count; i++) {
+		struct lu_fid fid;
+		struct lookup_intent it = { .it_op = IT_GETATTR };
+		struct lustre_handle *lockh = NULL;
+		struct lmv_tgt_desc *tgt = NULL;
+		struct inode *inode;
+
+		fid = lsm->lsm_md_oinfo[i].lmo_fid;
+		inode = lsm->lsm_md_oinfo[i].lmo_root;
+
+		if (!inode)
+			continue;
+
+		/*
+		 * Prepare op_data for revalidating. Note that @fid2 should be
+		 * defined, otherwise it will go to the server and take a new
+		 * lock, which is not needed here.
+		 */
+		memset(op_data, 0, sizeof(*op_data));
+		op_data->op_fid1 = fid;
+		op_data->op_fid2 = fid;
+		/* shard revalidate only needs to fetch attributes and UPDATE
+		 * lock, which is similar to the bottom half of remote object
+		 * getattr, set this flag so that MDT skips checking whether
+		 * it's a remote object.
+ */ + op_data->op_bias = MDS_CROSS_REF; + op_data->op_cli_flags = CLI_NO_SLOT; + + tgt = lmv_tgt(lmv, lsm->lsm_md_oinfo[i].lmo_mds); + if (!tgt) + GOTO(cleanup, rc = -ENODEV); + + CDEBUG(D_INODE, "Revalidate slave "DFID" -> mds #%u\n", + PFID(&fid), tgt->ltd_index); + + if (req != NULL) { + ptlrpc_req_finished(req); + req = NULL; + } + + rc = md_intent_lock(tgt->ltd_exp, op_data, &it, &req, + cb_blocking, extra_lock_flags); + if (rc == -ENOENT || rc == -ESHUTDOWN) { + /* skip stripe that doesn't exist or is inaccessible */ + rc = 0; + continue; + } + + if (rc < 0) + GOTO(cleanup, rc); + + lockh = (struct lustre_handle *)&it.it_lock_handle; + if (rc > 0 && req == NULL) { + /* slave inode is still valid */ + CDEBUG(D_INODE, "slave "DFID" is still valid.\n", + PFID(&fid)); + rc = 0; + } else { + /* refresh slave from server */ + body = req_capsule_server_get(&req->rq_pill, + &RMF_MDT_BODY); + if (body == NULL) { + if (it.it_lock_mode && lockh) { + ldlm_lock_decref(lockh, + it.it_lock_mode); + it.it_lock_mode = 0; + } + GOTO(cleanup, rc = -ENOENT); + } + + i_size_write(inode, body->mbo_size); + inode->i_blocks = body->mbo_blocks; + spin_lock(&inode->i_lock); + set_nlink(inode, body->mbo_nlink); + spin_unlock(&inode->i_lock); + inode->i_atime.tv_sec = body->mbo_atime; + inode->i_ctime.tv_sec = body->mbo_ctime; + inode->i_mtime.tv_sec = body->mbo_mtime; + } + + md_set_lock_data(tgt->ltd_exp, lockh, inode, NULL); + if (it.it_lock_mode != 0 && lockh != NULL) { + ldlm_lock_decref(lockh, it.it_lock_mode); + it.it_lock_mode = 0; + } + + valid_stripe_count++; + } + +cleanup: + if (req != NULL) + ptlrpc_req_finished(req); + + /* if all stripes are invalid, return -ENOENT to notify user */ + if (!rc && !valid_stripe_count) + rc = -ENOENT; + + OBD_FREE_PTR(op_data); + RETURN(rc); +} + +/* + * IT_OPEN is intended to open (and create, possible) an object. Parent (pid) + * may be split dir. + */ +static int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data, + struct lookup_intent *it, + struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + struct mdt_body *body; + __u64 flags = it->it_flags; + int rc; + + ENTRY; + + /* do not allow file creation in foreign dir */ + if ((it->it_op & IT_CREAT) && lmv_dir_foreign(op_data->op_mea1)) + RETURN(-ENODATA); + + if ((it->it_op & IT_CREAT) && !(flags & MDS_OPEN_BY_FID)) { + /* don't allow create under dir with bad hash */ + if (lmv_dir_bad_hash(op_data->op_mea1)) + RETURN(-EBADF); + + if (lmv_dir_layout_changing(op_data->op_mea1)) { + if (flags & O_EXCL) { + /* + * open(O_CREAT | O_EXCL) needs to check + * existing name, which should be done on both + * old and new layout, check old layout on + * client side. + */ + rc = lmv_old_layout_lookup(lmv, op_data); + if (rc != -ENOENT) + RETURN(rc); + + op_data->op_new_layout = true; + } else { + /* + * open(O_CREAT) will be sent to MDT in old + * layout first, to avoid creating new file + * under old layout, clear O_CREAT. + */ + it->it_flags &= ~O_CREAT; + } + } + } + +retry: + if (it->it_flags & MDS_OPEN_BY_FID) { + LASSERT(fid_is_sane(&op_data->op_fid2)); + + /* for striped directory, we can't know parent stripe fid + * without name, but we can set it to child fid, and MDT + * will obtain it from linkea in open in such case. 
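
For opens under a directory whose layout is changing, lmv_intent_open() above
treats O_CREAT|O_EXCL and plain O_CREAT differently: the former must prove the
name absent from both the old and the new layout, while the latter just drops
the create flag so nothing is created under the old layout. A compact
stand-alone restatement of that decision:

#include <stdbool.h>

enum open_path {
	CHECK_OLD_LAYOUT_FIRST,  /* name must not exist in either layout */
	SEND_WITHOUT_O_CREAT,    /* avoid creating under the old layout */
	SEND_AS_IS,
};

/* Decision taken for an open under a migrating (layout-changing) dir. */
static enum open_path migrating_dir_open(bool o_creat, bool o_excl)
{
	if (o_creat && o_excl)
		return CHECK_OLD_LAYOUT_FIRST;
	if (o_creat)
		return SEND_WITHOUT_O_CREAT;
	return SEND_AS_IS;
}
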
*/ + if (lmv_dir_striped(op_data->op_mea1)) + op_data->op_fid1 = op_data->op_fid2; + + tgt = lmv_fid2tgt(lmv, &op_data->op_fid2); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + op_data->op_mds = tgt->ltd_index; + } else { + LASSERT(fid_is_sane(&op_data->op_fid1)); + LASSERT(fid_is_zero(&op_data->op_fid2)); + LASSERT(op_data->op_name != NULL); + + tgt = lmv_locate_tgt(lmv, op_data); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + } + + /* If it is ready to open the file by FID, do not need + * allocate FID at all, otherwise it will confuse MDT */ + if ((it->it_op & IT_CREAT) && !(it->it_flags & MDS_OPEN_BY_FID)) { + /* + * For lookup(IT_CREATE) cases allocate new fid and setup FLD + * for it. + */ + rc = lmv_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); + if (rc != 0) + RETURN(rc); + } + + CDEBUG(D_INODE, "OPEN_INTENT with fid1="DFID", fid2="DFID"," + " name='%s' -> mds #%u\n", PFID(&op_data->op_fid1), + PFID(&op_data->op_fid2), op_data->op_name, tgt->ltd_index); + + rc = md_intent_lock(tgt->ltd_exp, op_data, it, reqp, cb_blocking, + extra_lock_flags); + if (rc != 0) + RETURN(rc); + /* + * Nothing is found, do not access body->fid1 as it is zero and thus + * pointless. + */ + if ((it->it_disposition & DISP_LOOKUP_NEG) && + !(it->it_disposition & DISP_OPEN_CREATE) && + !(it->it_disposition & DISP_OPEN_OPEN)) { + if (!(it->it_flags & MDS_OPEN_BY_FID) && + lmv_dir_retry_check_update(op_data)) { + ptlrpc_req_finished(*reqp); + it->it_request = NULL; + it->it_disposition = 0; + *reqp = NULL; + + it->it_flags = flags; + fid_zero(&op_data->op_fid2); + goto retry; + } + + RETURN(rc); + } + + body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + + /* Not cross-ref case, just get out of here. */ + if (unlikely((body->mbo_valid & OBD_MD_MDS))) { + rc = lmv_intent_remote(exp, it, &op_data->op_fid1, reqp, + cb_blocking, extra_lock_flags, + op_data->op_file_secctx_name, + op_data->op_file_secctx_name_size); + if (rc != 0) + RETURN(rc); + + body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + } + + RETURN(rc); +} + +/* + * Handler for: getattr, lookup and revalidate cases. + */ +static int +lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data, + struct lookup_intent *it, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = NULL; + struct mdt_body *body; + int rc; + ENTRY; + + /* foreign dir is not striped */ + if (lmv_dir_foreign(op_data->op_mea1)) { + /* only allow getattr/lookup for itself */ + if (op_data->op_name != NULL) + RETURN(-ENODATA); + RETURN(0); + } + +retry: + if (op_data->op_flags & MF_GETATTR_BY_FID) { + /* getattr by FID, replace fid1 with stripe FID, + * NB, don't replace if name is "/", because it may be a subtree + * mount, and if it's a striped directory, fid1 will be replaced + * to stripe FID by hash, while fid2 is master object FID, which + * will be treated as a remote object if the two FIDs are + * located on different MDTs, and LOOKUP lock can't be fetched. + */ + LASSERT(op_data->op_name); + if (op_data->op_namelen != 1 || + strncmp(op_data->op_name, "/", 1) != 0) { + tgt = lmv_locate_tgt(lmv, op_data); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + } + + /* name is used to locate stripe target, clear it here + * to avoid packing name in request, so that MDS knows + * it's getattr by FID. 
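
When the first pass comes back as a negative lookup on a migrating directory,
the intent state (request, disposition, flags) is reset and the operation is
re-driven against the other layout. The generic shape of that goto-retry loop,
as a self-contained sketch with errno-style return codes:

#include <errno.h>
#include <stdbool.h>

struct toy_op { bool new_layout; };

/* Re-drive a lookup once against the new layout when the first pass
 * over a migrating directory is negative, mirroring the retry pattern
 * of lmv_intent_open()/lmv_intent_lookup(). */
static int lookup_with_retry(struct toy_op *op,
			     int (*try_once)(struct toy_op *))
{
	int rc;

retry:
	rc = try_once(op);
	if (rc == -ENOENT && !op->new_layout) {
		op->new_layout = true;  /* cf. lmv_dir_retry_check_update() */
		goto retry;
	}
	return rc;
}
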
+ */ + op_data->op_name = NULL; + op_data->op_namelen = 0; + + /* getattr request is sent to MDT where fid2 inode is */ + tgt = lmv_fid2tgt(lmv, &op_data->op_fid2); + } else if (op_data->op_name) { + /* getattr by name */ + tgt = lmv_locate_tgt(lmv, op_data); + if (!fid_is_sane(&op_data->op_fid2)) + fid_zero(&op_data->op_fid2); + } else { + /* old way to getattr by FID, parent FID not packed */ + tgt = lmv_fid2tgt(lmv, &op_data->op_fid1); + } + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + CDEBUG(D_INODE, "LOOKUP_INTENT with fid1="DFID", fid2="DFID + ", name='%s' -> mds #%u\n", + PFID(&op_data->op_fid1), PFID(&op_data->op_fid2), + op_data->op_name ? op_data->op_name : "", + tgt->ltd_index); + + op_data->op_bias &= ~MDS_CROSS_REF; + + rc = md_intent_lock(tgt->ltd_exp, op_data, it, reqp, cb_blocking, + extra_lock_flags); + if (rc < 0) + RETURN(rc); + + if (*reqp == NULL) { + /* If RPC happens, lsm information will be revalidated + * during update_inode process (see ll_update_lsm_md) */ + if (lmv_dir_striped(op_data->op_mea2)) { + rc = lmv_revalidate_slaves(exp, op_data->op_mea2, + cb_blocking, + extra_lock_flags); + if (rc != 0) + RETURN(rc); + } + RETURN(rc); + } else if (it_disposition(it, DISP_LOOKUP_NEG) && + lmv_dir_retry_check_update(op_data)) { + ptlrpc_req_finished(*reqp); + it->it_request = NULL; + it->it_disposition = 0; + *reqp = NULL; + + goto retry; + } + + if (!it_has_reply_body(it)) + RETURN(0); + + /* + * MDS has returned success. Probably name has been resolved in + * remote inode. Let's check this. + */ + body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + + /* Not cross-ref case, just get out of here. */ + if (unlikely((body->mbo_valid & OBD_MD_MDS))) { + rc = lmv_intent_remote(exp, it, NULL, reqp, cb_blocking, + extra_lock_flags, + op_data->op_file_secctx_name, + op_data->op_file_secctx_name_size); + if (rc != 0) + RETURN(rc); + body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + } + + RETURN(rc); +} + +int lmv_intent_lock(struct obd_export *exp, struct md_op_data *op_data, + struct lookup_intent *it, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + int rc; + ENTRY; + + LASSERT(it != NULL); + LASSERT(fid_is_sane(&op_data->op_fid1)); + + CDEBUG(D_INODE, "INTENT LOCK '%s' for "DFID" '%.*s' on "DFID"\n", + LL_IT2STR(it), PFID(&op_data->op_fid2), + (int)op_data->op_namelen, op_data->op_name, + PFID(&op_data->op_fid1)); + + if (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_LAYOUT | IT_GETXATTR)) + rc = lmv_intent_lookup(exp, op_data, it, reqp, cb_blocking, + extra_lock_flags); + else if (it->it_op & IT_OPEN) + rc = lmv_intent_open(exp, op_data, it, reqp, cb_blocking, + extra_lock_flags); + else + LBUG(); + + if (rc < 0) { + struct lustre_handle lock_handle; + + if (it->it_lock_mode != 0) { + lock_handle.cookie = it->it_lock_handle; + ldlm_lock_decref_and_cancel(&lock_handle, + it->it_lock_mode); + } + + it->it_lock_handle = 0; + it->it_lock_mode = 0; + + if (it->it_remote_lock_mode != 0) { + lock_handle.cookie = it->it_remote_lock_handle; + ldlm_lock_decref_and_cancel(&lock_handle, + it->it_remote_lock_mode); + } + + it->it_remote_lock_handle = 0; + it->it_remote_lock_mode = 0; + } + + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h b/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h new file mode 100644 index 0000000000000..a1d4436b6af80 --- /dev/null +++ 
b/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h @@ -0,0 +1,202 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef _LMV_INTERNAL_H_ +#define _LMV_INTERNAL_H_ + +#include +#include + +#define LMV_MAX_TGT_COUNT 128 + +#define LL_IT2STR(it) \ + ((it) ? ldlm_it2str((it)->it_op) : "0") + +int lmv_intent_lock(struct obd_export *exp, struct md_op_data *op_data, + struct lookup_intent *it, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags); + +int lmv_blocking_ast(struct ldlm_lock *, struct ldlm_lock_desc *, + void *, int); +int lmv_fld_lookup(struct lmv_obd *lmv, const struct lu_fid *fid, u32 *mds); +int lmv_fid_alloc(const struct lu_env *env, struct obd_export *exp, + struct lu_fid *fid, struct md_op_data *op_data); + +int lmv_revalidate_slaves(struct obd_export *exp, + const struct lmv_stripe_md *lsm, + ldlm_blocking_callback cb_blocking, + int extra_lock_flags); + +int lmv_getattr_name(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **preq); +void lmv_activate_target(struct lmv_obd *lmv, struct lmv_tgt_desc *tgt, + int activate); + +int lmv_statfs_check_update(struct obd_device *obd, struct lmv_tgt_desc *tgt); + +static inline struct obd_device *lmv2obd_dev(struct lmv_obd *lmv) +{ + return container_of_safe(lmv, struct obd_device, u.lmv); +} + +static inline struct lu_tgt_desc * +lmv_tgt(struct lmv_obd *lmv, __u32 index) +{ + return index < lmv->lmv_mdt_descs.ltd_tgts_size ? 
+ LTD_TGT(&lmv->lmv_mdt_descs, index) : NULL; +} + +static inline bool +lmv_mdt0_inited(struct lmv_obd *lmv) +{ + return lmv->lmv_mdt_descs.ltd_tgts_size > 0 && + test_bit(0, lmv->lmv_mdt_descs.ltd_tgt_bitmap); +} + +#define lmv_foreach_tgt(lmv, tgt) ltd_foreach_tgt(&(lmv)->lmv_mdt_descs, tgt) + +#define lmv_foreach_tgt_safe(lmv, tgt, tmp) \ + ltd_foreach_tgt_safe(&(lmv)->lmv_mdt_descs, tgt, tmp) + +static inline +struct lu_tgt_desc *lmv_first_connected_tgt(struct lmv_obd *lmv) +{ + struct lu_tgt_desc *tgt; + + tgt = ltd_first_tgt(&lmv->lmv_mdt_descs); + while (tgt && !tgt->ltd_exp) + tgt = ltd_next_tgt(&lmv->lmv_mdt_descs, tgt); + + return tgt; +} + +static inline +struct lu_tgt_desc *lmv_next_connected_tgt(struct lmv_obd *lmv, + struct lu_tgt_desc *tgt) +{ + do { + tgt = ltd_next_tgt(&lmv->lmv_mdt_descs, tgt); + } while (tgt && !tgt->ltd_exp); + + return tgt; +} + +#define lmv_foreach_connected_tgt(lmv, tgt) \ + for (tgt = lmv_first_connected_tgt(lmv); tgt; \ + tgt = lmv_next_connected_tgt(lmv, tgt)) + +static inline int +lmv_fid2tgt_index(struct lmv_obd *lmv, const struct lu_fid *fid) +{ + u32 mdt_idx; + int rc; + + if (lmv->lmv_mdt_count < 2) + return 0; + + rc = lmv_fld_lookup(lmv, fid, &mdt_idx); + if (rc < 0) + return rc; + + return mdt_idx; +} + +static inline struct lmv_tgt_desc * +lmv_fid2tgt(struct lmv_obd *lmv, const struct lu_fid *fid) +{ + struct lu_tgt_desc *tgt; + int index; + + index = lmv_fid2tgt_index(lmv, fid); + if (index < 0) + return ERR_PTR(index); + + tgt = lmv_tgt(lmv, index); + + return tgt ? tgt : ERR_PTR(-ENODEV); +} + +static inline int lmv_stripe_md_size(int stripe_count) +{ + struct lmv_stripe_md *lsm; + + return sizeof(*lsm) + stripe_count * sizeof(lsm->lsm_md_oinfo[0]); +} + +/* for file under migrating directory, return the target stripe info */ +static inline const struct lmv_oinfo * +lsm_name_to_stripe_info(const struct lmv_stripe_md *lsm, const char *name, + int namelen, bool new_layout) +{ + int stripe_index; + + LASSERT(lmv_dir_striped(lsm)); + + stripe_index = __lmv_name_to_stripe_index(lsm->lsm_md_hash_type, + lsm->lsm_md_stripe_count, + lsm->lsm_md_migrate_hash, + lsm->lsm_md_migrate_offset, + name, namelen, new_layout); + if (stripe_index < 0) + return ERR_PTR(stripe_index); + + return &lsm->lsm_md_oinfo[stripe_index]; +} + +static inline bool lmv_dir_retry_check_update(struct md_op_data *op_data) +{ + const struct lmv_stripe_md *lsm = op_data->op_mea1; + + if (!lsm) + return false; + + if (lmv_dir_layout_changing(lsm) && !op_data->op_new_layout) { + op_data->op_new_layout = true; + return true; + } + + if (lmv_dir_bad_hash(lsm) && + op_data->op_stripe_index < lsm->lsm_md_stripe_count - 1) { + op_data->op_stripe_index++; + return true; + } + + return false; +} + +struct lmv_tgt_desc *lmv_locate_tgt(struct lmv_obd *lmv, + struct md_op_data *op_data); +int lmv_old_layout_lookup(struct lmv_obd *lmv, struct md_op_data *op_data); + +/* lproc_lmv.c */ +int lmv_tunables_init(struct obd_device *obd); +#endif diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c b/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c new file mode 100644 index 0000000000000..88ed384beb47d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c @@ -0,0 +1,3915 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
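
lsm_name_to_stripe_info() above reduces stripe selection to hashing the file
name modulo the stripe count, with the hash type recorded in the layout
(lsm_md_hash_type) and a second hash consulted during migration. A
self-contained model that substitutes FNV-1a for Lustre's hash functions:

#include <stdint.h>
#include <stddef.h>

static uint32_t fnv1a(const char *name, size_t len)
{
	uint32_t h = 2166136261u;       /* FNV offset basis */

	while (len--)
		h = (h ^ (unsigned char)*name++) * 16777619u;  /* FNV prime */
	return h;
}

/* All clients computing the same hash for the same name is what lets
 * them agree on the owning stripe without asking a server. */
static unsigned int name_to_stripe(const char *name, size_t len,
				   unsigned int stripe_count)
{
	return fnv1a(name, len) % stripe_count;
}
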
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_LMV + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "lmv_internal.h" + +static int lmv_check_connect(struct obd_device *obd); +static inline bool lmv_op_default_rr_mkdir(const struct md_op_data *op_data); + +void lmv_activate_target(struct lmv_obd *lmv, struct lmv_tgt_desc *tgt, + int activate) +{ + if (tgt->ltd_active == activate) + return; + + tgt->ltd_active = activate; + lmv->lmv_mdt_descs.ltd_lmv_desc.ld_active_tgt_count += + (activate ? 1 : -1); + + tgt->ltd_exp->exp_obd->obd_inactive = !activate; +} + +/** + * Error codes: + * + * -EINVAL : UUID can't be found in the LMV's target list + * -ENOTCONN: The UUID is found, but the target connection is bad (!) + * -EBADF : The UUID is found, but the OBD of the wrong type (!) + */ +static int lmv_set_mdc_active(struct lmv_obd *lmv, + const struct obd_uuid *uuid, + int activate) +{ + struct lu_tgt_desc *tgt = NULL; + struct obd_device *obd; + int rc = 0; + + ENTRY; + + CDEBUG(D_INFO, "Searching in lmv %p for uuid %s (activate=%d)\n", + lmv, uuid->uuid, activate); + + spin_lock(&lmv->lmv_lock); + lmv_foreach_connected_tgt(lmv, tgt) { + CDEBUG(D_INFO, "Target idx %d is %s conn %#llx\n", + tgt->ltd_index, tgt->ltd_uuid.uuid, + tgt->ltd_exp->exp_handle.h_cookie); + + if (obd_uuid_equals(uuid, &tgt->ltd_uuid)) + break; + } + + if (!tgt) + GOTO(out_lmv_lock, rc = -EINVAL); + + obd = class_exp2obd(tgt->ltd_exp); + if (obd == NULL) + GOTO(out_lmv_lock, rc = -ENOTCONN); + + CDEBUG(D_INFO, "Found OBD %s=%s device %d (%p) type %s at LMV idx %d\n", + obd->obd_name, obd->obd_uuid.uuid, obd->obd_minor, obd, + obd->obd_type->typ_name, tgt->ltd_index); + LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0); + + if (tgt->ltd_active == activate) { + CDEBUG(D_INFO, "OBD %p already %sactive!\n", obd, + activate ? "" : "in"); + GOTO(out_lmv_lock, rc); + } + + CDEBUG(D_INFO, "Marking OBD %p %sactive\n", obd, + activate ? "" : "in"); + lmv_activate_target(lmv, tgt, activate); + EXIT; + + out_lmv_lock: + spin_unlock(&lmv->lmv_lock); + return rc; +} + +static struct obd_uuid *lmv_get_uuid(struct obd_export *exp) +{ + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + struct lmv_tgt_desc *tgt = lmv_tgt(lmv, 0); + + return tgt ? 
obd_get_uuid(tgt->ltd_exp) : NULL; +} + +static int lmv_notify(struct obd_device *obd, struct obd_device *watched, + enum obd_notify_event ev) +{ + struct obd_connect_data *conn_data; + struct lmv_obd *lmv = &obd->u.lmv; + struct obd_uuid *uuid; + int rc = 0; + ENTRY; + + if (strcmp(watched->obd_type->typ_name, LUSTRE_MDC_NAME)) { + CERROR("unexpected notification of %s %s!\n", + watched->obd_type->typ_name, + watched->obd_name); + RETURN(-EINVAL); + } + + uuid = &watched->u.cli.cl_target_uuid; + if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE) { + /* + * Set MDC as active before notifying the observer, so the + * observer can use the MDC normally. + */ + rc = lmv_set_mdc_active(lmv, uuid, + ev == OBD_NOTIFY_ACTIVE); + if (rc) { + CERROR("%sactivation of %s failed: %d\n", + ev == OBD_NOTIFY_ACTIVE ? "" : "de", + uuid->uuid, rc); + RETURN(rc); + } + } else if (ev == OBD_NOTIFY_OCD) { + conn_data = &watched->u.cli.cl_import->imp_connect_data; + /* + * XXX: Make sure that ocd_connect_flags from all targets are + * the same. Otherwise one of MDTs runs wrong version or + * something like this. --umka + */ + obd->obd_self_export->exp_connect_data = *conn_data; + } + + /* + * Pass the notification up the chain. + */ + if (obd->obd_observer) + rc = obd_notify(obd->obd_observer, watched, ev); + + RETURN(rc); +} + +static int lmv_connect(const struct lu_env *env, + struct obd_export **pexp, struct obd_device *obd, + struct obd_uuid *cluuid, struct obd_connect_data *data, + void *localdata) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct lustre_handle conn = { 0 }; + struct obd_export *exp; + int rc; + ENTRY; + + rc = class_connect(&conn, obd, cluuid); + if (rc) { + CERROR("class_connection() returned %d\n", rc); + RETURN(rc); + } + + exp = class_conn2export(&conn); + + lmv->connected = 0; + lmv->conn_data = *data; + lmv->lmv_cache = localdata; + + lmv->lmv_tgts_kobj = kobject_create_and_add("target_obds", + &obd->obd_kset.kobj); + if (!lmv->lmv_tgts_kobj) { + CERROR("%s: cannot create /sys/fs/lustre/%s/%s/target_obds\n", + obd->obd_name, obd->obd_type->typ_name, obd->obd_name); + } + + rc = lmv_check_connect(obd); + if (rc != 0) + GOTO(out_sysfs, rc); + + *pexp = exp; + + RETURN(rc); + +out_sysfs: + if (lmv->lmv_tgts_kobj) + kobject_put(lmv->lmv_tgts_kobj); + + class_disconnect(exp); + + return rc; +} + +static int lmv_init_ea_size(struct obd_export *exp, __u32 easize, + __u32 def_easize) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int change = 0; + int rc = 0; + + ENTRY; + + if (lmv->max_easize < easize) { + lmv->max_easize = easize; + change = 1; + } + if (lmv->max_def_easize < def_easize) { + lmv->max_def_easize = def_easize; + change = 1; + } + + if (change == 0) + RETURN(0); + + if (lmv->connected == 0) + RETURN(0); + + lmv_foreach_connected_tgt(lmv, tgt) { + if (!tgt->ltd_active) + continue; + + rc = md_init_ea_size(tgt->ltd_exp, easize, def_easize); + if (rc) { + CERROR("%s: obd_init_ea_size() failed on MDT target %d:" + " rc = %d\n", obd->obd_name, tgt->ltd_index, rc); + break; + } + } + RETURN(rc); +} + +#define MAX_STRING_SIZE 128 + +static int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct obd_device *mdc_obd; + struct obd_export *mdc_exp; + struct lu_fld_target target; + int rc; + ENTRY; + + mdc_obd = class_find_client_obd(&tgt->ltd_uuid, LUSTRE_MDC_NAME, + &obd->obd_uuid); + if (!mdc_obd) { + CERROR("target %s not attached\n", 
tgt->ltd_uuid.uuid); + RETURN(-EINVAL); + } + + CDEBUG(D_CONFIG, "connect to %s(%s) - %s, %s\n", + mdc_obd->obd_name, mdc_obd->obd_uuid.uuid, + tgt->ltd_uuid.uuid, obd->obd_uuid.uuid); + + if (!mdc_obd->obd_set_up) { + CERROR("target %s is not set up\n", tgt->ltd_uuid.uuid); + RETURN(-EINVAL); + } + + rc = obd_connect(NULL, &mdc_exp, mdc_obd, &obd->obd_uuid, + &lmv->conn_data, lmv->lmv_cache); + if (rc) { + CERROR("target %s connect error %d\n", tgt->ltd_uuid.uuid, rc); + RETURN(rc); + } + + /* + * Init fid sequence client for this mdc and add new fld target. + */ + rc = obd_fid_init(mdc_obd, mdc_exp, LUSTRE_SEQ_METADATA); + if (rc) + RETURN(rc); + + target.ft_srv = NULL; + target.ft_exp = mdc_exp; + target.ft_idx = tgt->ltd_index; + + fld_client_add_target(&lmv->lmv_fld, &target); + + rc = obd_register_observer(mdc_obd, obd); + if (rc) { + obd_disconnect(mdc_exp); + CERROR("target %s register_observer error %d\n", + tgt->ltd_uuid.uuid, rc); + RETURN(rc); + } + + if (obd->obd_observer) { + /* + * Tell the observer about the new target. + */ + rc = obd_notify(obd->obd_observer, mdc_exp->exp_obd, + OBD_NOTIFY_ACTIVE); + if (rc) { + obd_disconnect(mdc_exp); + RETURN(rc); + } + } + + tgt->ltd_active = 1; + tgt->ltd_exp = mdc_exp; + lmv->lmv_mdt_descs.ltd_lmv_desc.ld_active_tgt_count++; + + md_init_ea_size(tgt->ltd_exp, lmv->max_easize, lmv->max_def_easize); + + rc = lu_qos_add_tgt(&lmv->lmv_qos, tgt); + if (rc) { + obd_disconnect(mdc_exp); + RETURN(rc); + } + + CDEBUG(D_CONFIG, "Connected to %s(%s) successfully (%d)\n", + mdc_obd->obd_name, mdc_obd->obd_uuid.uuid, + atomic_read(&obd->obd_refcount)); + + lmv_statfs_check_update(obd, tgt); + + if (lmv->lmv_tgts_kobj) + /* Even if we failed to create the link, that's fine */ + rc = sysfs_create_link(lmv->lmv_tgts_kobj, + &mdc_obd->obd_kset.kobj, + mdc_obd->obd_name); + RETURN(0); +} + +static void lmv_del_target(struct lmv_obd *lmv, struct lu_tgt_desc *tgt) +{ + LASSERT(tgt); + ltd_del_tgt(&lmv->lmv_mdt_descs, tgt); + OBD_FREE_PTR(tgt); +} + +static int lmv_add_target(struct obd_device *obd, struct obd_uuid *uuidp, + __u32 index, int gen) +{ + struct obd_device *mdc_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + struct lu_tgt_descs *ltd = &lmv->lmv_mdt_descs; + int rc = 0; + + ENTRY; + + CDEBUG(D_CONFIG, "Target uuid: %s. index %d\n", uuidp->uuid, index); + mdc_obd = class_find_client_obd(uuidp, LUSTRE_MDC_NAME, + &obd->obd_uuid); + if (!mdc_obd) { + CERROR("%s: Target %s not attached: rc = %d\n", + obd->obd_name, uuidp->uuid, -EINVAL); + RETURN(-EINVAL); + } + + OBD_ALLOC_PTR(tgt); + if (!tgt) + RETURN(-ENOMEM); + + mutex_init(&tgt->ltd_fid_mutex); + tgt->ltd_index = index; + tgt->ltd_uuid = *uuidp; + tgt->ltd_active = 0; + + mutex_lock(<d->ltd_mutex); + rc = ltd_add_tgt(ltd, tgt); + mutex_unlock(<d->ltd_mutex); + + if (rc) + GOTO(out_tgt, rc); + + if (!lmv->connected) + /* lmv_check_connect() will connect this target. 
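
When a target is added (lmv_add_target(), continued below), the default EA
size is recomputed as sizeof(struct lmv_stripe_md) plus one FID per MDT, since
a striped directory's layout EA carries one entry per stripe. A stand-alone
restatement of that sizing with toy struct definitions:

#include <stdio.h>

struct toy_fid { unsigned long long seq; unsigned int oid, ver; };
struct toy_lmv_md {
	unsigned int magic;
	unsigned int stripe_count;
	struct toy_fid oinfo[];         /* one entry per stripe */
};

static size_t lmv_ea_size(unsigned int mdt_count)
{
	return sizeof(struct toy_lmv_md) + mdt_count * sizeof(struct toy_fid);
}

int main(void)
{
	printf("default dir EA for 4 MDTs: %zu bytes\n", lmv_ea_size(4));
	return 0;
}
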
*/ + RETURN(0); + + rc = lmv_connect_mdc(obd, tgt); + if (!rc) { + int easize = sizeof(struct lmv_stripe_md) + + lmv->lmv_mdt_count * sizeof(struct lu_fid); + + lmv_init_ea_size(obd->obd_self_export, easize, 0); + } + + RETURN(rc); + +out_tgt: + OBD_FREE_PTR(tgt); + return rc; +} + +static int lmv_check_connect(struct obd_device *obd) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int easize; + int rc; + + ENTRY; + + if (lmv->connected) + RETURN(0); + + mutex_lock(&lmv->lmv_mdt_descs.ltd_mutex); + if (lmv->connected) + GOTO(unlock, rc = 0); + + if (!lmv->lmv_mdt_count) { + CERROR("%s: no targets configured: rc = -EINVAL\n", + obd->obd_name); + GOTO(unlock, rc = -EINVAL); + } + + if (!lmv_mdt0_inited(lmv)) { + CERROR("%s: no target configured for index 0: rc = -EINVAL.\n", + obd->obd_name); + GOTO(unlock, rc = -EINVAL); + } + + CDEBUG(D_CONFIG, "Time to connect %s to %s\n", + obd->obd_uuid.uuid, obd->obd_name); + + lmv_foreach_tgt(lmv, tgt) { + rc = lmv_connect_mdc(obd, tgt); + if (rc) + GOTO(out_disc, rc); + } + + lmv->connected = 1; + easize = lmv_mds_md_size(lmv->lmv_mdt_count, LMV_MAGIC); + lmv_init_ea_size(obd->obd_self_export, easize, 0); + EXIT; +unlock: + mutex_unlock(&lmv->lmv_mdt_descs.ltd_mutex); + + return rc; + +out_disc: + lmv_foreach_tgt(lmv, tgt) { + tgt->ltd_active = 0; + if (!tgt->ltd_exp) + continue; + + --lmv->lmv_mdt_descs.ltd_lmv_desc.ld_active_tgt_count; + obd_disconnect(tgt->ltd_exp); + } + + goto unlock; +} + +static int lmv_disconnect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct obd_device *mdc_obd; + int rc; + ENTRY; + + LASSERT(tgt != NULL); + LASSERT(obd != NULL); + + mdc_obd = class_exp2obd(tgt->ltd_exp); + + if (mdc_obd) { + mdc_obd->obd_force = obd->obd_force; + mdc_obd->obd_fail = obd->obd_fail; + mdc_obd->obd_no_recov = obd->obd_no_recov; + + if (lmv->lmv_tgts_kobj) + sysfs_remove_link(lmv->lmv_tgts_kobj, + mdc_obd->obd_name); + } + + rc = obd_fid_fini(tgt->ltd_exp->exp_obd); + if (rc) + CERROR("Can't finalize fids factory\n"); + + CDEBUG(D_INFO, "Disconnected from %s(%s) successfully\n", + tgt->ltd_exp->exp_obd->obd_name, + tgt->ltd_exp->exp_obd->obd_uuid.uuid); + + obd_register_observer(tgt->ltd_exp->exp_obd, NULL); + rc = obd_disconnect(tgt->ltd_exp); + if (rc) { + if (tgt->ltd_active) { + CERROR("Target %s disconnect error %d\n", + tgt->ltd_uuid.uuid, rc); + } + } + + lmv_activate_target(lmv, tgt, 0); + tgt->ltd_exp = NULL; + RETURN(0); +} + +static int lmv_disconnect(struct obd_export *exp) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + ENTRY; + + lmv_foreach_connected_tgt(lmv, tgt) + lmv_disconnect_mdc(obd, tgt); + + if (lmv->lmv_tgts_kobj) + kobject_put(lmv->lmv_tgts_kobj); + + if (!lmv->connected) + class_export_put(exp); + rc = class_disconnect(exp); + lmv->connected = 0; + + RETURN(rc); +} + +static int lmv_fid2path(struct obd_export *exp, int len, void *karg, + void __user *uarg) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct getinfo_fid2path *gf; + struct lmv_tgt_desc *tgt; + struct getinfo_fid2path *remote_gf = NULL; + struct lu_fid root_fid; + int remote_gf_size = 0; + int rc; + + gf = karg; + tgt = lmv_fid2tgt(lmv, &gf->gf_fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + root_fid = *gf->gf_u.gf_root_fid; + LASSERT(fid_is_sane(&root_fid)); + +repeat_fid2path: + rc = obd_iocontrol(OBD_IOC_FID2PATH, tgt->ltd_exp, len, gf, uarg); + if 
(rc != 0 && rc != -EREMOTE) + GOTO(out_fid2path, rc); + + /* If remote_gf != NULL, it means just building the + * path on the remote MDT, copy this path segement to gf */ + if (remote_gf != NULL) { + struct getinfo_fid2path *ori_gf; + char *ptr; + int len; + + ori_gf = (struct getinfo_fid2path *)karg; + if (strlen(ori_gf->gf_u.gf_path) + 1 + + strlen(gf->gf_u.gf_path) + 1 > ori_gf->gf_pathlen) + GOTO(out_fid2path, rc = -EOVERFLOW); + + ptr = ori_gf->gf_u.gf_path; + + len = strlen(gf->gf_u.gf_path); + /* move the current path to the right to release space + * for closer-to-root part */ + memmove(ptr + len + 1, ptr, strlen(ori_gf->gf_u.gf_path)); + memcpy(ptr, gf->gf_u.gf_path, len); + ptr[len] = '/'; + } + + CDEBUG(D_INFO, "%s: get path %s "DFID" rec: %llu ln: %u\n", + tgt->ltd_exp->exp_obd->obd_name, + gf->gf_u.gf_path, PFID(&gf->gf_fid), gf->gf_recno, + gf->gf_linkno); + + if (rc == 0) + GOTO(out_fid2path, rc); + + /* sigh, has to go to another MDT to do path building further */ + if (remote_gf == NULL) { + remote_gf_size = sizeof(*remote_gf) + PATH_MAX; + OBD_ALLOC(remote_gf, remote_gf_size); + if (remote_gf == NULL) + GOTO(out_fid2path, rc = -ENOMEM); + remote_gf->gf_pathlen = PATH_MAX; + } + + if (!fid_is_sane(&gf->gf_fid)) { + CERROR("%s: invalid FID "DFID": rc = %d\n", + tgt->ltd_exp->exp_obd->obd_name, + PFID(&gf->gf_fid), -EINVAL); + GOTO(out_fid2path, rc = -EINVAL); + } + + tgt = lmv_fid2tgt(lmv, &gf->gf_fid); + if (IS_ERR(tgt)) + GOTO(out_fid2path, rc = -EINVAL); + + remote_gf->gf_fid = gf->gf_fid; + remote_gf->gf_recno = -1; + remote_gf->gf_linkno = -1; + memset(remote_gf->gf_u.gf_path, 0, remote_gf->gf_pathlen); + *remote_gf->gf_u.gf_root_fid = root_fid; + gf = remote_gf; + goto repeat_fid2path; + +out_fid2path: + if (remote_gf != NULL) + OBD_FREE(remote_gf, remote_gf_size); + RETURN(rc); +} + +static int lmv_hsm_req_count(struct lmv_obd *lmv, + const struct hsm_user_request *hur, + const struct lmv_tgt_desc *tgt_mds) +{ + struct lmv_tgt_desc *curr_tgt; + __u32 i; + int nr = 0; + + /* count how many requests must be sent to the given target */ + for (i = 0; i < hur->hur_request.hr_itemcount; i++) { + curr_tgt = lmv_fid2tgt(lmv, &hur->hur_user_item[i].hui_fid); + if (IS_ERR(curr_tgt)) + RETURN(PTR_ERR(curr_tgt)); + if (obd_uuid_equals(&curr_tgt->ltd_uuid, &tgt_mds->ltd_uuid)) + nr++; + } + return nr; +} + +static int lmv_hsm_req_build(struct lmv_obd *lmv, + struct hsm_user_request *hur_in, + const struct lmv_tgt_desc *tgt_mds, + struct hsm_user_request *hur_out) +{ + __u32 i, nr_out; + struct lmv_tgt_desc *curr_tgt; + + /* build the hsm_user_request for the given target */ + hur_out->hur_request = hur_in->hur_request; + nr_out = 0; + for (i = 0; i < hur_in->hur_request.hr_itemcount; i++) { + curr_tgt = lmv_fid2tgt(lmv, &hur_in->hur_user_item[i].hui_fid); + if (IS_ERR(curr_tgt)) + RETURN(PTR_ERR(curr_tgt)); + if (obd_uuid_equals(&curr_tgt->ltd_uuid, &tgt_mds->ltd_uuid)) { + hur_out->hur_user_item[nr_out] = + hur_in->hur_user_item[i]; + nr_out++; + } + } + hur_out->hur_request.hr_itemcount = nr_out; + memcpy(hur_data(hur_out), hur_data(hur_in), + hur_in->hur_request.hr_data_len); + + RETURN(0); +} + +static int lmv_hsm_ct_unregister(struct obd_device *obd, unsigned int cmd, + int len, struct lustre_kernelcomm *lk, + void __user *uarg) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct lu_tgt_desc *tgt; + int rc; + + ENTRY; + + /* unregister request (call from llapi_hsm_copytool_fini) */ + lmv_foreach_connected_tgt(lmv, tgt) + /* best effort: try to clean as much as possible + * 
(continue on error) */ + obd_iocontrol(cmd, tgt->ltd_exp, len, lk, uarg); + + /* Whatever the result, remove copytool from kuc groups. + * Unreached coordinators will get EPIPE on next requests + * and will unregister automatically. + */ + rc = libcfs_kkuc_group_rem(&obd->obd_uuid, lk->lk_uid, lk->lk_group); + + RETURN(rc); +} + +static int lmv_hsm_ct_register(struct obd_device *obd, unsigned int cmd, + int len, struct lustre_kernelcomm *lk, + void __user *uarg) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct file *filp; + bool any_set = false; + struct kkuc_ct_data *kcd; + size_t kcd_size; + struct lu_tgt_desc *tgt; + __u32 i; + int err; + int rc = 0; + + ENTRY; + + filp = fget(lk->lk_wfd); + if (!filp) + RETURN(-EBADF); + + if (lk->lk_flags & LK_FLG_DATANR) + kcd_size = offsetof(struct kkuc_ct_data, + kcd_archives[lk->lk_data_count]); + else + kcd_size = sizeof(*kcd); + + OBD_ALLOC(kcd, kcd_size); + if (kcd == NULL) + GOTO(err_fput, rc = -ENOMEM); + + kcd->kcd_nr_archives = lk->lk_data_count; + if (lk->lk_flags & LK_FLG_DATANR) { + kcd->kcd_magic = KKUC_CT_DATA_ARRAY_MAGIC; + if (lk->lk_data_count > 0) + memcpy(kcd->kcd_archives, lk->lk_data, + sizeof(*kcd->kcd_archives) * lk->lk_data_count); + } else { + kcd->kcd_magic = KKUC_CT_DATA_BITMAP_MAGIC; + } + + rc = libcfs_kkuc_group_add(filp, &obd->obd_uuid, lk->lk_uid, + lk->lk_group, kcd, kcd_size); + OBD_FREE(kcd, kcd_size); + if (rc) + GOTO(err_fput, rc); + + /* All or nothing: try to register to all MDS. + * In case of failure, unregister from previous MDS, + * except if it because of inactive target. */ + lmv_foreach_connected_tgt(lmv, tgt) { + err = obd_iocontrol(cmd, tgt->ltd_exp, len, lk, uarg); + if (err) { + if (tgt->ltd_active) { + /* permanent error */ + CERROR("%s: iocontrol MDC %s on MDT" + " idx %d cmd %x: err = %d\n", + lmv2obd_dev(lmv)->obd_name, + tgt->ltd_uuid.uuid, tgt->ltd_index, cmd, + err); + rc = err; + lk->lk_flags |= LK_FLG_STOP; + i = tgt->ltd_index; + /* unregister from previous MDS */ + lmv_foreach_connected_tgt(lmv, tgt) { + if (tgt->ltd_index >= i) + break; + + obd_iocontrol(cmd, tgt->ltd_exp, len, + lk, uarg); + } + GOTO(err_kkuc_rem, rc); + } + /* else: transient error. 
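
lmv_hsm_ct_register() above is all-or-nothing: on the first permanent failure
it walks the targets registered so far and unregisters them again. The
rollback skeleton, stand-alone (callback signatures illustrative):

/* Register with all n targets, or roll back every earlier success. */
static int register_all(int n, int (*reg)(int), void (*unreg)(int))
{
	int i, rc;

	for (i = 0; i < n; i++) {
		rc = reg(i);
		if (rc) {
			while (i-- > 0)  /* undo indices i-1 .. 0 */
				unreg(i);
			return rc;
		}
	}
	return 0;
}
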
+ * kuc will register to the missing MDT + * when it is back */ + } else { + any_set = true; + } + } + + if (!any_set) + /* no registration done: return error */ + GOTO(err_kkuc_rem, rc = -ENOTCONN); + + RETURN(0); + +err_kkuc_rem: + libcfs_kkuc_group_rem(&obd->obd_uuid, lk->lk_uid, lk->lk_group); + +err_fput: + fput(filp); + return rc; +} + +static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp, + int len, void *karg, void __user *uarg) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct lu_tgt_desc *tgt = NULL; + int set = 0; + __u32 count = lmv->lmv_mdt_count; + int rc = 0; + + ENTRY; + + if (count == 0) + RETURN(-ENOTTY); + + switch (cmd) { + case IOC_OBD_STATFS: { + struct obd_ioctl_data *data = karg; + struct obd_device *mdc_obd; + struct obd_statfs stat_buf = {0}; + __u32 index; + + memcpy(&index, data->ioc_inlbuf2, sizeof(__u32)); + + if (index >= lmv->lmv_mdt_descs.ltd_tgts_size) + RETURN(-ENODEV); + + tgt = lmv_tgt(lmv, index); + if (!tgt) + RETURN(-EAGAIN); + + if (!tgt->ltd_active) + RETURN(-ENODATA); + + mdc_obd = class_exp2obd(tgt->ltd_exp); + if (!mdc_obd) + RETURN(-EINVAL); + + /* copy UUID */ + if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(mdc_obd), + min((int) data->ioc_plen2, + (int) sizeof(struct obd_uuid)))) + RETURN(-EFAULT); + + rc = obd_statfs(NULL, tgt->ltd_exp, &stat_buf, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + 0); + if (rc) + RETURN(rc); + if (copy_to_user(data->ioc_pbuf1, &stat_buf, + min((int) data->ioc_plen1, + (int) sizeof(stat_buf)))) + RETURN(-EFAULT); + break; + } + case OBD_IOC_QUOTACTL: { + struct if_quotactl *qctl = karg; + struct obd_quotactl *oqctl; + + if (qctl->qc_valid == QC_MDTIDX) { + tgt = lmv_tgt(lmv, qctl->qc_idx); + } else if (qctl->qc_valid == QC_UUID) { + lmv_foreach_tgt(lmv, tgt) { + if (!obd_uuid_equals(&tgt->ltd_uuid, + &qctl->obd_uuid)) + continue; + + if (!tgt->ltd_exp) + RETURN(-EINVAL); + + break; + } + } else { + RETURN(-EINVAL); + } + + if (!tgt || !tgt->ltd_exp) + RETURN(-EINVAL); + + OBD_ALLOC_PTR(oqctl); + if (!oqctl) + RETURN(-ENOMEM); + + QCTL_COPY(oqctl, qctl); + rc = obd_quotactl(tgt->ltd_exp, oqctl); + if (rc == 0) { + QCTL_COPY(qctl, oqctl); + qctl->qc_valid = QC_MDTIDX; + qctl->obd_uuid = tgt->ltd_uuid; + } + OBD_FREE_PTR(oqctl); + break; + } + case LL_IOC_GET_CONNECT_FLAGS: { + tgt = lmv_tgt(lmv, 0); + rc = -ENODATA; + if (tgt && tgt->ltd_exp) + rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); + break; + } + case LL_IOC_FID2MDTIDX: { + struct lu_fid *fid = karg; + int mdt_index; + + rc = lmv_fld_lookup(lmv, fid, &mdt_index); + if (rc != 0) + RETURN(rc); + + /* Note: this is from llite(see ll_dir_ioctl()), @uarg does not + * point to user space memory for FID2MDTIDX. 
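
The IOC_OBD_STATFS case above clamps each copy_to_user() with min() so that
neither a short user buffer nor the kernel object size is overrun. The same
guard as a stand-alone helper:

#include <string.h>

/* Copy at most dst_len and at most src_len bytes, as the min()-guarded
 * copy_to_user() calls above do; returns the number of bytes copied. */
static size_t bounded_copy(void *dst, size_t dst_len,
			   const void *src, size_t src_len)
{
	size_t n = dst_len < src_len ? dst_len : src_len;

	memcpy(dst, src, n);
	return n;
}
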
*/ + *(__u32 *)uarg = mdt_index; + break; + } + case OBD_IOC_FID2PATH: { + rc = lmv_fid2path(exp, len, karg, uarg); + break; + } + case LL_IOC_HSM_STATE_GET: + case LL_IOC_HSM_STATE_SET: + case LL_IOC_HSM_ACTION: { + struct md_op_data *op_data = karg; + + tgt = lmv_fid2tgt(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + if (tgt->ltd_exp == NULL) + RETURN(-EINVAL); + + rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); + break; + } + case LL_IOC_HSM_PROGRESS: { + const struct hsm_progress_kernel *hpk = karg; + + tgt = lmv_fid2tgt(lmv, &hpk->hpk_fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); + break; + } + case LL_IOC_HSM_REQUEST: { + struct hsm_user_request *hur = karg; + unsigned int reqcount = hur->hur_request.hr_itemcount; + + if (reqcount == 0) + RETURN(0); + + /* if the request is about a single fid + * or if there is a single MDS, no need to split + * the request. */ + if (reqcount == 1 || count == 1) { + tgt = lmv_fid2tgt(lmv, &hur->hur_user_item[0].hui_fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); + } else { + /* split fid list to their respective MDS */ + lmv_foreach_connected_tgt(lmv, tgt) { + int nr, rc1; + size_t reqlen; + struct hsm_user_request *req; + + nr = lmv_hsm_req_count(lmv, hur, tgt); + if (nr < 0) + RETURN(nr); + if (nr == 0) /* nothing for this MDS */ + continue; + + /* build a request with fids for this MDS */ + reqlen = offsetof(typeof(*hur), + hur_user_item[nr]) + + hur->hur_request.hr_data_len; + OBD_ALLOC_LARGE(req, reqlen); + if (req == NULL) + RETURN(-ENOMEM); + rc1 = lmv_hsm_req_build(lmv, hur, tgt, req); + if (rc1 < 0) + GOTO(hsm_req_err, rc1); + rc1 = obd_iocontrol(cmd, tgt->ltd_exp, reqlen, + req, uarg); +hsm_req_err: + if (rc1 != 0 && rc == 0) + rc = rc1; + OBD_FREE_LARGE(req, reqlen); + } + } + break; + } + case LL_IOC_LOV_SWAP_LAYOUTS: { + struct md_op_data *op_data = karg; + struct lmv_tgt_desc *tgt1, *tgt2; + + tgt1 = lmv_fid2tgt(lmv, &op_data->op_fid1); + if (IS_ERR(tgt1)) + RETURN(PTR_ERR(tgt1)); + + tgt2 = lmv_fid2tgt(lmv, &op_data->op_fid2); + if (IS_ERR(tgt2)) + RETURN(PTR_ERR(tgt2)); + + if ((tgt1->ltd_exp == NULL) || (tgt2->ltd_exp == NULL)) + RETURN(-EINVAL); + + /* only files on same MDT can have their layouts swapped */ + if (tgt1->ltd_index != tgt2->ltd_index) + RETURN(-EPERM); + + rc = obd_iocontrol(cmd, tgt1->ltd_exp, len, karg, uarg); + break; + } + case LL_IOC_HSM_CT_START: { + struct lustre_kernelcomm *lk = karg; + if (lk->lk_flags & LK_FLG_STOP) + rc = lmv_hsm_ct_unregister(obd, cmd, len, lk, uarg); + else + rc = lmv_hsm_ct_register(obd, cmd, len, lk, uarg); + break; + } + default: + lmv_foreach_connected_tgt(lmv, tgt) { + struct obd_device *mdc_obd; + int err; + + /* ll_umount_begin() sets force flag but for lmv, not + * mdc. 
Let's pass it through */ + mdc_obd = class_exp2obd(tgt->ltd_exp); + mdc_obd->obd_force = obd->obd_force; + err = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); + if (err) { + if (tgt->ltd_active) { + CERROR("error: iocontrol MDC %s on MDT" + " idx %d cmd %x: err = %d\n", + tgt->ltd_uuid.uuid, + tgt->ltd_index, cmd, err); + if (!rc) + rc = err; + } + } else + set = 1; + } + if (!set && !rc) + rc = -EIO; + } + RETURN(rc); +} + +int lmv_fid_alloc(const struct lu_env *env, struct obd_export *exp, + struct lu_fid *fid, struct md_op_data *op_data) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + ENTRY; + + LASSERT(op_data); + LASSERT(fid); + + tgt = lmv_tgt(lmv, op_data->op_mds); + if (!tgt) + RETURN(-ENODEV); + + if (!tgt->ltd_active || !tgt->ltd_exp) + RETURN(-ENODEV); + + /* + * New seq alloc and FLD setup should be atomic. Otherwise we may find + * on server that seq in new allocated fid is not yet known. + */ + mutex_lock(&tgt->ltd_fid_mutex); + rc = obd_fid_alloc(NULL, tgt->ltd_exp, fid, NULL); + mutex_unlock(&tgt->ltd_fid_mutex); + if (rc > 0) { + LASSERT(fid_is_sane(fid)); + rc = 0; + } + + RETURN(rc); +} + +static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_desc *desc; + struct lnet_processid lnet_id; + int i = 0; + int rc; + + ENTRY; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { + CERROR("LMV setup requires a descriptor\n"); + RETURN(-EINVAL); + } + + desc = (struct lmv_desc *)lustre_cfg_buf(lcfg, 1); + if (sizeof(*desc) > LUSTRE_CFG_BUFLEN(lcfg, 1)) { + CERROR("Lmv descriptor size wrong: %d > %d\n", + (int)sizeof(*desc), LUSTRE_CFG_BUFLEN(lcfg, 1)); + RETURN(-EINVAL); + } + + obd_str2uuid(&lmv->lmv_mdt_descs.ltd_lmv_desc.ld_uuid, + desc->ld_uuid.uuid); + lmv->lmv_mdt_descs.ltd_lmv_desc.ld_tgt_count = 0; + lmv->lmv_mdt_descs.ltd_lmv_desc.ld_active_tgt_count = 0; + lmv->lmv_mdt_descs.ltd_lmv_desc.ld_qos_maxage = + LMV_DESC_QOS_MAXAGE_DEFAULT; + lmv->max_def_easize = 0; + lmv->max_easize = 0; + + spin_lock_init(&lmv->lmv_lock); + + /* + * initialize rr_index to lower 32bit of netid, so that client + * can distribute subdirs evenly from the beginning. 
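
lmv_setup() above seeds lmv_qos_rr_index from the low 32 bits of the first
non-loopback LNet NID, so different clients begin round-robin subdirectory
placement at different MDTs before any QOS statistics exist. A stand-alone
model of that seeding (assumes mdt_count > 0; names illustrative):

#include <stdint.h>

/* Derive a stable per-client round-robin start index from any stable
 * per-client value; the real code uses the client's NID address. */
static uint32_t rr_start_index(uint32_t nid_addr, uint32_t mdt_count)
{
	return nid_addr % mdt_count;
}

/* Two clients with different NIDs start creating subdirs on different
 * MDTs, spreading load from the first mkdir onward. */
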
+ */ + while (LNetGetId(i++, &lnet_id) != -ENOENT) { + if (!nid_is_lo0(&lnet_id.nid)) { + lmv->lmv_qos_rr_index = ntohl(lnet_id.nid.nid_addr[0]); + break; + } + } + + rc = lmv_tunables_init(obd); + if (rc) + CWARN("%s: error adding LMV sysfs/debugfs files: rc = %d\n", + obd->obd_name, rc); + + rc = fld_client_init(&lmv->lmv_fld, obd->obd_name, + LUSTRE_CLI_FLD_HASH_DHT); + if (rc) + CERROR("Can't init FLD, err %d\n", rc); + + rc = lu_tgt_descs_init(&lmv->lmv_mdt_descs, true); + if (rc) + CWARN("%s: error initialize target table: rc = %d\n", + obd->obd_name, rc); + + RETURN(rc); +} + +static int lmv_cleanup(struct obd_device *obd) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct lu_tgt_desc *tgt; + struct lu_tgt_desc *tmp; + + ENTRY; + + fld_client_fini(&lmv->lmv_fld); + lmv_foreach_tgt_safe(lmv, tgt, tmp) + lmv_del_target(lmv, tgt); + lu_tgt_descs_fini(&lmv->lmv_mdt_descs); + + RETURN(0); +} + +static int lmv_process_config(struct obd_device *obd, size_t len, void *buf) +{ + struct lustre_cfg *lcfg = buf; + struct obd_uuid obd_uuid; + int gen; + __u32 index; + int rc; + ENTRY; + + switch (lcfg->lcfg_command) { + case LCFG_ADD_MDC: + /* modify_mdc_tgts add 0:lustre-clilmv 1:lustre-MDT0000_UUID + * 2:0 3:1 4:lustre-MDT0000-mdc_UUID */ + if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid)) + GOTO(out, rc = -EINVAL); + + obd_str2uuid(&obd_uuid, lustre_cfg_buf(lcfg, 1)); + + if (sscanf(lustre_cfg_buf(lcfg, 2), "%u", &index) != 1) + GOTO(out, rc = -EINVAL); + if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", &gen) != 1) + GOTO(out, rc = -EINVAL); + rc = lmv_add_target(obd, &obd_uuid, index, gen); + GOTO(out, rc); + default: + CERROR("Unknown command: %d\n", lcfg->lcfg_command); + GOTO(out, rc = -EINVAL); + } +out: + RETURN(rc); +} + +static int lmv_select_statfs_mdt(struct lmv_obd *lmv, __u32 flags) +{ + int i; + + if (flags & OBD_STATFS_FOR_MDT0) + return 0; + + if (lmv->lmv_statfs_start || lmv->lmv_mdt_count == 1) + return lmv->lmv_statfs_start; + + /* choose initial MDT for this client */ + for (i = 0;; i++) { + struct lnet_processid lnet_id; + if (LNetGetId(i, &lnet_id) == -ENOENT) + break; + + if (!nid_is_lo0(&lnet_id.nid)) { + /* We dont need a full 64-bit modulus, just enough + * to distribute the requests across MDTs evenly. + */ + lmv->lmv_statfs_start = nidhash(&lnet_id.nid) % + lmv->lmv_mdt_count; + break; + } + } + + return lmv->lmv_statfs_start; +} + +static int lmv_statfs(const struct lu_env *env, struct obd_export *exp, + struct obd_statfs *osfs, time64_t max_age, __u32 flags) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct obd_statfs *temp; + struct lu_tgt_desc *tgt; + __u32 i; + __u32 idx; + int rc = 0; + + ENTRY; + + OBD_ALLOC(temp, sizeof(*temp)); + if (temp == NULL) + RETURN(-ENOMEM); + + /* distribute statfs among MDTs */ + idx = lmv_select_statfs_mdt(lmv, flags); + + for (i = 0; i < lmv->lmv_mdt_descs.ltd_tgts_size; i++, idx++) { + idx = idx % lmv->lmv_mdt_descs.ltd_tgts_size; + tgt = lmv_tgt(lmv, idx); + if (!tgt || !tgt->ltd_exp) + continue; + + rc = obd_statfs(env, tgt->ltd_exp, temp, max_age, + flags | OBD_STATFS_NESTED); + if (rc) { + CERROR("%s: can't stat MDS #%d: rc = %d\n", + tgt->ltd_exp->exp_obd->obd_name, i, rc); + GOTO(out_free_temp, rc); + } + + if (temp->os_state & OS_STATFS_SUM || + flags == OBD_STATFS_FOR_MDT0) { + /* reset to the last aggregated values + * and don't sum with non-aggrated data */ + /* If the statfs is from mount, it needs to retrieve + * necessary information from MDT0. i.e. 
+static int lmv_statfs(const struct lu_env *env, struct obd_export *exp,
+		      struct obd_statfs *osfs, time64_t max_age, __u32 flags)
+{
+	struct obd_device *obd = class_exp2obd(exp);
+	struct lmv_obd *lmv = &obd->u.lmv;
+	struct obd_statfs *temp;
+	struct lu_tgt_desc *tgt;
+	__u32 i;
+	__u32 idx;
+	int rc = 0;
+
+	ENTRY;
+
+	OBD_ALLOC(temp, sizeof(*temp));
+	if (temp == NULL)
+		RETURN(-ENOMEM);
+
+	/* distribute statfs among MDTs */
+	idx = lmv_select_statfs_mdt(lmv, flags);
+
+	for (i = 0; i < lmv->lmv_mdt_descs.ltd_tgts_size; i++, idx++) {
+		idx = idx % lmv->lmv_mdt_descs.ltd_tgts_size;
+		tgt = lmv_tgt(lmv, idx);
+		if (!tgt || !tgt->ltd_exp)
+			continue;
+
+		rc = obd_statfs(env, tgt->ltd_exp, temp, max_age,
+				flags | OBD_STATFS_NESTED);
+		if (rc) {
+			CERROR("%s: can't stat MDS #%d: rc = %d\n",
+			       tgt->ltd_exp->exp_obd->obd_name, i, rc);
+			GOTO(out_free_temp, rc);
+		}
+
+		if (temp->os_state & OS_STATFS_SUM ||
+		    flags == OBD_STATFS_FOR_MDT0) {
+			/* reset to the last aggregated values
+			 * and don't sum with non-aggregated data */
+			/* If the statfs is from mount, it needs to retrieve
+			 * necessary information from MDT0, i.e. mount does
+			 * not need the merged osfs from all MDTs. Also
+			 * clients can be mounted as long as MDT0 is in
+			 * service */
+			*osfs = *temp;
+			break;
+		}
+
+		if (i == 0) {
+			*osfs = *temp;
+		} else {
+			osfs->os_bavail += temp->os_bavail;
+			osfs->os_blocks += temp->os_blocks;
+			osfs->os_ffree += temp->os_ffree;
+			osfs->os_files += temp->os_files;
+			osfs->os_granted += temp->os_granted;
+		}
+	}
+
+	EXIT;
+out_free_temp:
+	OBD_FREE(temp, sizeof(*temp));
+	return rc;
+}
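When the server does not already return aggregated numbers (OS_STATFS_SUM) and the caller is not asking for MDT0 alone, the client-wide view is simply the per-MDT answers summed field by field. A self-contained sketch of that summing step, where struct toy_statfs and the sample numbers are invented:

#include <stdio.h>
#include <stdint.h>

/* Subset of the obd_statfs fields that lmv_statfs() aggregates. */
struct toy_statfs {
	uint64_t os_blocks;
	uint64_t os_bavail;
	uint64_t os_files;
	uint64_t os_ffree;
};

int main(void)
{
	/* made-up per-MDT numbers */
	struct toy_statfs mdt[3] = {
		{ 1000, 400, 5000, 2500 },
		{ 2000, 900, 5000, 4000 },
		{ 1500, 100, 5000, 1000 },
	};
	struct toy_statfs sum = mdt[0];

	for (int i = 1; i < 3; i++) {
		sum.os_blocks += mdt[i].os_blocks;
		sum.os_bavail += mdt[i].os_bavail;
		sum.os_files  += mdt[i].os_files;
		sum.os_ffree  += mdt[i].os_ffree;
	}
	printf("blocks=%llu bavail=%llu files=%llu ffree=%llu\n",
	       (unsigned long long)sum.os_blocks,
	       (unsigned long long)sum.os_bavail,
	       (unsigned long long)sum.os_files,
	       (unsigned long long)sum.os_ffree);
	return 0;
}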
+static int lmv_statfs_update(void *cookie, int rc)
+{
+	struct obd_info *oinfo = cookie;
+	struct obd_device *obd = oinfo->oi_obd;
+	struct lmv_obd *lmv = &obd->u.lmv;
+	struct lmv_tgt_desc *tgt = oinfo->oi_tgt;
+	struct obd_statfs *osfs = oinfo->oi_osfs;
+
+	/*
+	 * NB: don't deactivate TGT upon error, because we may not trigger
+	 * async statfs any longer, then there is no chance to activate TGT.
+	 */
+	if (!rc) {
+		spin_lock(&lmv->lmv_lock);
+		tgt->ltd_statfs = *osfs;
+		tgt->ltd_statfs_age = ktime_get_seconds();
+		spin_unlock(&lmv->lmv_lock);
+		set_bit(LQ_DIRTY, &lmv->lmv_qos.lq_flags);
+	}
+
+	return rc;
+}
+
+/* update tgt statfs async if it's ld_qos_maxage old */
+int lmv_statfs_check_update(struct obd_device *obd, struct lmv_tgt_desc *tgt)
+{
+	struct obd_info oinfo = {
+		.oi_obd	= obd,
+		.oi_tgt = tgt,
+		.oi_cb_up = lmv_statfs_update,
+	};
+	int rc;
+
+	if (ktime_get_seconds() - tgt->ltd_statfs_age <
+	    obd->u.lmv.lmv_mdt_descs.ltd_lmv_desc.ld_qos_maxage)
+		return 0;
+
+	rc = obd_statfs_async(tgt->ltd_exp, &oinfo, 0, NULL);
+
+	return rc;
+}
+
+static int lmv_get_root(struct obd_export *exp, const char *fileset,
+			struct lu_fid *fid)
+{
+	struct obd_device *obd = exp->exp_obd;
+	struct lmv_obd *lmv = &obd->u.lmv;
+	struct lu_tgt_desc *tgt = lmv_tgt(lmv, 0);
+	int rc;
+
+	ENTRY;
+
+	if (!tgt)
+		RETURN(-ENODEV);
+
+	rc = md_get_root(tgt->ltd_exp, fileset, fid);
+	RETURN(rc);
+}
+
+static int lmv_getxattr(struct obd_export *exp, const struct lu_fid *fid,
+			u64 obd_md_valid, const char *name, size_t buf_size,
+			struct ptlrpc_request **req)
+{
+	struct obd_device *obd = exp->exp_obd;
+	struct lmv_obd *lmv = &obd->u.lmv;
+	struct lmv_tgt_desc *tgt;
+	int rc;
+
+	ENTRY;
+
+	tgt = lmv_fid2tgt(lmv, fid);
+	if (IS_ERR(tgt))
+		RETURN(PTR_ERR(tgt));
+
+	rc = md_getxattr(tgt->ltd_exp, fid, obd_md_valid, name, buf_size, req);
+
+	RETURN(rc);
+}
+
+static int lmv_setxattr(struct obd_export *exp, const struct lu_fid *fid,
+			u64 obd_md_valid, const char *name,
+			const void *value, size_t value_size,
+			unsigned int xattr_flags, u32 suppgid,
+			struct ptlrpc_request **req)
+{
+	struct obd_device *obd = exp->exp_obd;
+	struct lmv_obd *lmv = &obd->u.lmv;
+	struct lmv_tgt_desc *tgt;
+	int rc;
+
+	ENTRY;
+
+	tgt = lmv_fid2tgt(lmv, fid);
+	if (IS_ERR(tgt))
+		RETURN(PTR_ERR(tgt));
+
+	rc = md_setxattr(tgt->ltd_exp, fid, obd_md_valid, name,
+			 value, value_size, xattr_flags, suppgid, req);
+
+	RETURN(rc);
+}
+
+static int lmv_getattr(struct obd_export *exp, struct md_op_data *op_data,
+		       struct ptlrpc_request **request)
+{
+	struct obd_device *obd = exp->exp_obd;
+	struct lmv_obd *lmv = &obd->u.lmv;
+	struct lmv_tgt_desc *tgt;
+	int rc;
+
+	ENTRY;
+
+	tgt = lmv_fid2tgt(lmv, &op_data->op_fid1);
+	if (IS_ERR(tgt))
+		RETURN(PTR_ERR(tgt));
+
+	if (op_data->op_flags & MF_GET_MDT_IDX) {
+		op_data->op_mds = tgt->ltd_index;
+		RETURN(0);
+	}
+
+	rc = md_getattr(tgt->ltd_exp, op_data, request);
+
+	RETURN(rc);
+}
+
+static int lmv_null_inode(struct obd_export *exp, const struct lu_fid *fid)
+{
+	struct obd_device *obd = exp->exp_obd;
+	struct lmv_obd *lmv = &obd->u.lmv;
+	struct lu_tgt_desc *tgt;
+
+	ENTRY;
+
+	CDEBUG(D_INODE, "CBDATA for "DFID"\n", PFID(fid));
+
+	/*
+	 * With DNE every object can have two locks in different namespaces:
+	 * lookup lock in space of MDT storing direntry and update/open lock in
+	 * space of MDT storing inode.
+	 */
+	lmv_foreach_connected_tgt(lmv, tgt)
+		md_null_inode(tgt->ltd_exp, fid);
+
+	RETURN(0);
+}
+
+static int lmv_close(struct obd_export *exp, struct md_op_data *op_data,
+		     struct md_open_data *mod, struct ptlrpc_request **request)
+{
+	struct obd_device *obd = exp->exp_obd;
+	struct lmv_obd *lmv = &obd->u.lmv;
+	struct lmv_tgt_desc *tgt;
+	int rc;
+
+	ENTRY;
+
+	tgt = lmv_fid2tgt(lmv, &op_data->op_fid1);
+	if (IS_ERR(tgt))
+		RETURN(PTR_ERR(tgt));
+
+	CDEBUG(D_INODE, "CLOSE "DFID"\n", PFID(&op_data->op_fid1));
+	rc = md_close(tgt->ltd_exp, op_data, mod, request);
+	RETURN(rc);
+}
+
+static struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv,
+					      struct md_op_data *op_data)
+{
+	struct lu_tgt_desc *tgt, *cur = NULL;
+	__u64 total_avail = 0;
+	__u64 total_weight = 0;
+	__u64 cur_weight = 0;
+	int total_usable = 0;
+	__u64 rand;
+	int rc;
+
+	ENTRY;
+
+	if (!ltd_qos_is_usable(&lmv->lmv_mdt_descs))
+		RETURN(ERR_PTR(-EAGAIN));
+
+	down_write(&lmv->lmv_qos.lq_rw_sem);
+
+	if (!ltd_qos_is_usable(&lmv->lmv_mdt_descs))
+		GOTO(unlock, tgt = ERR_PTR(-EAGAIN));
+
+	rc = ltd_qos_penalties_calc(&lmv->lmv_mdt_descs);
+	if (rc)
+		GOTO(unlock, tgt = ERR_PTR(rc));
+
+	lmv_foreach_tgt(lmv, tgt) {
+		if (!tgt->ltd_exp || !tgt->ltd_active) {
+			tgt->ltd_qos.ltq_usable = 0;
+			continue;
+		}
+
+		tgt->ltd_qos.ltq_usable = 1;
+		lu_tgt_qos_weight_calc(tgt);
+		if (tgt->ltd_index == op_data->op_mds)
+			cur = tgt;
+		total_avail += tgt->ltd_qos.ltq_avail;
+		total_weight += tgt->ltd_qos.ltq_weight;
+		total_usable++;
+	}
+
+	/* If the current MDT has above-average space and the dir is not
+	 * already using round-robin to spread across more MDTs, stay on the
+	 * parent MDT to avoid creating needless remote MDT directories.
+	 * Remote dirs close to the root balance space more effectively than
+	 * bottom dirs, so prefer to create remote dirs at the top level of
+	 * the directory tree. "16 / (dir_depth + 10)" is the factor to make
+	 * it less likely for top-level directories to stay local unless they
+	 * have more than average free space, while deep dirs prefer local
+	 * until more full:
+	 * depth=0 -> 160%, depth=3 -> 123%, depth=6 -> 100%,
+	 * depth=9 -> 84%, depth=12 -> 73%, depth=15 -> 64%
+	 */
+	if (!lmv_op_default_rr_mkdir(op_data)) {
+		rand = total_avail * 16 /
+			(total_usable * (op_data->op_dir_depth + 10));
+		if (cur && cur->ltd_qos.ltq_avail >= rand) {
+			tgt = cur;
+			GOTO(unlock, tgt);
+		}
+	}
+
+	rand = lu_prandom_u64_max(total_weight);
+
+	lmv_foreach_connected_tgt(lmv, tgt) {
+		if (!tgt->ltd_qos.ltq_usable)
+			continue;
+
+		cur_weight += tgt->ltd_qos.ltq_weight;
+		if (cur_weight < rand)
+			continue;
+
+		ltd_qos_update(&lmv->lmv_mdt_descs, tgt, &total_weight);
+		GOTO(unlock, tgt);
+	}
+
+	/* no proper target found */
+	GOTO(unlock, tgt = ERR_PTR(-EAGAIN));
+unlock:
+	up_write(&lmv->lmv_qos.lq_rw_sem);
+
+	return tgt;
+}
+static struct lu_tgt_desc *lmv_locate_tgt_rr(struct lmv_obd *lmv)
+{
+	struct lu_tgt_desc *tgt;
+	int i;
+	int index;
+
+	ENTRY;
+
+	spin_lock(&lmv->lmv_lock);
+	for (i = 0; i < lmv->lmv_mdt_descs.ltd_tgts_size; i++) {
+		index = (i + lmv->lmv_qos_rr_index) %
+			lmv->lmv_mdt_descs.ltd_tgts_size;
+		tgt = lmv_tgt(lmv, index);
+		if (!tgt || !tgt->ltd_exp || !tgt->ltd_active)
+			continue;
+
+		lmv->lmv_qos_rr_index = (tgt->ltd_index + 1) %
+					lmv->lmv_mdt_descs.ltd_tgts_size;
+		spin_unlock(&lmv->lmv_lock);
+
+		RETURN(tgt);
+	}
+	spin_unlock(&lmv->lmv_lock);
+
+	RETURN(ERR_PTR(-ENODEV));
+}
+
+/* locate an MDT that is less full (avoid the most full MDT) */
+static struct lu_tgt_desc *lmv_locate_tgt_lf(struct lmv_obd *lmv)
+{
+	struct lu_tgt_desc *min = NULL;
+	struct lu_tgt_desc *tgt;
+	__u64 avail = 0;
+	__u64 rand;
+
+	ENTRY;
+
+	if (!ltd_qos_is_usable(&lmv->lmv_mdt_descs))
+		RETURN(ERR_PTR(-EAGAIN));
+
+	down_write(&lmv->lmv_qos.lq_rw_sem);
+
+	if (!ltd_qos_is_usable(&lmv->lmv_mdt_descs))
+		GOTO(unlock, tgt = ERR_PTR(-EAGAIN));
+
+	lmv_foreach_tgt(lmv, tgt) {
+		if (!tgt->ltd_exp || !tgt->ltd_active) {
+			tgt->ltd_qos.ltq_usable = 0;
+			continue;
+		}
+
+		tgt->ltd_qos.ltq_usable = 1;
+		lu_tgt_qos_weight_calc(tgt);
+		avail += tgt->ltd_qos.ltq_avail;
+		if (!min || min->ltd_qos.ltq_avail > tgt->ltd_qos.ltq_avail)
+			min = tgt;
+	}
+
+	/* avoid the most full MDT */
+	if (min)
+		avail -= min->ltd_qos.ltq_avail;
+
+	rand = lu_prandom_u64_max(avail);
+	avail = 0;
+	lmv_foreach_connected_tgt(lmv, tgt) {
+		if (!tgt->ltd_qos.ltq_usable)
+			continue;
+
+		if (tgt == min)
+			continue;
+
+		avail += tgt->ltd_qos.ltq_avail;
+		if (avail < rand)
+			continue;
+
+		GOTO(unlock, tgt);
+	}
+
+	/* no proper target found */
+	GOTO(unlock, tgt = ERR_PTR(-EAGAIN));
+unlock:
+	up_write(&lmv->lmv_qos.lq_rw_sem);
+
+	RETURN(tgt);
+}
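The selection in lmv_locate_tgt_lf() above is a weighted random draw over free space that excludes the single fullest target. A minimal userspace sketch of the same idea; the avail[] numbers are invented and rand() stands in for lu_prandom_u64_max():

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

int main(void)
{
	uint64_t avail[] = { 500, 2000, 1500, 100 };	/* MDT0..MDT3 */
	int n = 4, min = 0;
	uint64_t total = 0;

	/* find the fullest (least free) target and total the rest */
	for (int i = 1; i < n; i++)
		if (avail[i] < avail[min])
			min = i;
	for (int i = 0; i < n; i++)
		if (i != min)
			total += avail[i];

	srand(42);
	uint64_t r = (uint64_t)rand() % total, acc = 0;
	for (int i = 0; i < n; i++) {
		if (i == min)
			continue;	/* never pick the fullest MDT */
		acc += avail[i];
		if (r < acc) {
			printf("picked MDT%d (r=%llu of %llu)\n", i,
			       (unsigned long long)r,
			       (unsigned long long)total);
			break;
		}
	}
	return 0;
}

Targets with more free space own a proportionally larger slice of [0, total), so emptier MDTs are picked more often without ever starving the mid-range ones.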
+/* locate the MDT by file name: for a striped directory, the file name hash
+ * decides in which stripe its dirent is stored.
+ */
+static struct lmv_tgt_desc *
+lmv_locate_tgt_by_name(struct lmv_obd *lmv, struct lmv_stripe_md *lsm,
+		       const char *name, int namelen, struct lu_fid *fid,
+		       __u32 *mds, bool new_layout)
+{
+	struct lmv_tgt_desc *tgt;
+	const struct lmv_oinfo *oinfo;
+
+	if (!lmv_dir_striped(lsm) || !namelen) {
+		tgt = lmv_fid2tgt(lmv, fid);
+		if (IS_ERR(tgt))
+			return tgt;
+
+		*mds = tgt->ltd_index;
+		return tgt;
+	}
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_NAME_HASH)) {
+		if (cfs_fail_val >= lsm->lsm_md_stripe_count)
+			return ERR_PTR(-EBADF);
+		oinfo = &lsm->lsm_md_oinfo[cfs_fail_val];
+	} else {
+		oinfo = lsm_name_to_stripe_info(lsm, name, namelen,
+						new_layout);
+		if (IS_ERR(oinfo))
+			return ERR_CAST(oinfo);
+	}
+
+	/* check stripe FID is sane */
+	if (!fid_is_sane(&oinfo->lmo_fid))
+		return ERR_PTR(-ENODEV);
+
+	*fid = oinfo->lmo_fid;
+	*mds = oinfo->lmo_mds;
+	tgt = lmv_tgt(lmv, oinfo->lmo_mds);
+
+	CDEBUG(D_INODE, "locate MDT %u parent "DFID"\n", *mds, PFID(fid));
+
+	return tgt ? tgt : ERR_PTR(-ENODEV);
+}
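The name-to-stripe mapping that lsm_name_to_stripe_info() performs boils down to hashing the entry name and reducing it modulo the stripe count. A toy illustration of the mechanism; toy_hash() here is plain FNV-1a, in the spirit of LMV's fnv_1a_64 hash type, but it is not byte-for-byte the kernel implementation and the real code also honors the directory's configured hash type and migration layout:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

static uint64_t toy_hash(const char *name, size_t len)
{
	uint64_t h = 14695981039346656037ULL;	/* FNV-1a offset basis */

	for (size_t i = 0; i < len; i++) {
		h ^= (unsigned char)name[i];
		h *= 1099511628211ULL;		/* FNV-1a prime */
	}
	return h;
}

int main(void)
{
	const char *names[] = { "alpha", "beta", "gamma", "delta" };
	const unsigned stripe_count = 4;

	for (int i = 0; i < 4; i++)
		printf("%-6s -> stripe %llu\n", names[i],
		       (unsigned long long)(toy_hash(names[i],
						     strlen(names[i])) %
					    stripe_count));
	return 0;
}

Because the hash depends only on the name and the directory's layout, every client independently computes the same stripe, and therefore the same MDT, for a given entry.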
+/**
+ * Locate the MDT of op_data->op_fid1
+ *
+ * For a striped directory, it will locate the stripe by name hash; if the
+ * hash type is unknown, it will return the stripe specified by
+ * 'op_data->op_stripe_index', which is set outside, and if the dir is
+ * migrating, 'op_data->op_new_layout' indicates whether the old or the new
+ * layout is used for the lookup.
+ *
+ * For a plain directory, it just locates the MDT of op_data->op_fid1.
+ *
+ * \param[in] lmv		LMV device
+ * \param[in/out] op_data	client MD stack parameters, name, namelen etc,
+ *				op_mds and op_fid1 will be updated if op_mea1
+ *				indicates fid1 represents a striped directory.
+ *
+ * retval		pointer to the lmv_tgt_desc on success.
+ *			ERR_PTR(errno) on failure.
+ */
+struct lmv_tgt_desc *
+lmv_locate_tgt(struct lmv_obd *lmv, struct md_op_data *op_data)
+{
+	struct lmv_stripe_md *lsm = op_data->op_mea1;
+	struct lmv_oinfo *oinfo;
+	struct lmv_tgt_desc *tgt;
+
+	if (lmv_dir_foreign(lsm))
+		return ERR_PTR(-ENODATA);
+
+	/* During creation of a VOLATILE file, it should honor the mdt
+	 * index if the file under the striped dir is being restored, see
+	 * ct_restore(). */
+	if (op_data->op_bias & MDS_CREATE_VOLATILE &&
+	    op_data->op_mds != LMV_OFFSET_DEFAULT) {
+		tgt = lmv_tgt(lmv, op_data->op_mds);
+		if (!tgt)
+			return ERR_PTR(-ENODEV);
+
+		if (lmv_dir_striped(lsm)) {
+			int i;
+
+			/* refill the right parent fid */
+			for (i = 0; i < lsm->lsm_md_stripe_count; i++) {
+				oinfo = &lsm->lsm_md_oinfo[i];
+				if (oinfo->lmo_mds == op_data->op_mds) {
+					op_data->op_fid1 = oinfo->lmo_fid;
+					break;
+				}
+			}
+
+			if (i == lsm->lsm_md_stripe_count)
+				op_data->op_fid1 =
+					lsm->lsm_md_oinfo[0].lmo_fid;
+		}
+	} else if (lmv_dir_bad_hash(lsm)) {
+		LASSERT(op_data->op_stripe_index < lsm->lsm_md_stripe_count);
+		oinfo = &lsm->lsm_md_oinfo[op_data->op_stripe_index];
+
+		op_data->op_fid1 = oinfo->lmo_fid;
+		op_data->op_mds = oinfo->lmo_mds;
+		tgt = lmv_tgt(lmv, oinfo->lmo_mds);
+		if (!tgt)
+			return ERR_PTR(-ENODEV);
+	} else {
+		tgt = lmv_locate_tgt_by_name(lmv, op_data->op_mea1,
+				op_data->op_name, op_data->op_namelen,
+				&op_data->op_fid1, &op_data->op_mds,
+				op_data->op_new_layout);
+	}
+
+	return tgt;
+}
+
+/* Locate the MDT of op_data->op_fid2 for link/rename */
+static struct lmv_tgt_desc *
+lmv_locate_tgt2(struct lmv_obd *lmv, struct md_op_data *op_data)
+{
+	struct lmv_tgt_desc *tgt;
+	int rc;
+
+	LASSERT(op_data->op_name);
+	if (lmv_dir_layout_changing(op_data->op_mea2)) {
+		struct lu_fid fid1 = op_data->op_fid1;
+		struct lmv_stripe_md *lsm1 = op_data->op_mea1;
+		struct ptlrpc_request *request = NULL;
+
+		/*
+		 * avoid creating a new file under the old layout of a
+		 * migrating directory, check it here.
+		 */
+		tgt = lmv_locate_tgt_by_name(lmv, op_data->op_mea2,
+				op_data->op_name, op_data->op_namelen,
+				&op_data->op_fid2, &op_data->op_mds, false);
+		if (IS_ERR(tgt))
+			RETURN(tgt);
+
+		op_data->op_fid1 = op_data->op_fid2;
+		op_data->op_mea1 = op_data->op_mea2;
+		rc = md_getattr_name(tgt->ltd_exp, op_data, &request);
+		op_data->op_fid1 = fid1;
+		op_data->op_mea1 = lsm1;
+		if (!rc) {
+			ptlrpc_req_finished(request);
+			RETURN(ERR_PTR(-EEXIST));
+		}
+
+		if (rc != -ENOENT)
+			RETURN(ERR_PTR(rc));
+	}
+
+	return lmv_locate_tgt_by_name(lmv, op_data->op_mea2, op_data->op_name,
+				op_data->op_namelen, &op_data->op_fid2,
+				&op_data->op_mds, true);
+}
+
+int lmv_old_layout_lookup(struct lmv_obd *lmv, struct md_op_data *op_data)
+{
+	struct lu_tgt_desc *tgt;
+	struct ptlrpc_request *request;
+	int rc;
+
+	LASSERT(lmv_dir_layout_changing(op_data->op_mea1));
+	LASSERT(!op_data->op_new_layout);
+
+	tgt = lmv_locate_tgt(lmv, op_data);
+	if (IS_ERR(tgt))
+		return PTR_ERR(tgt);
+
+	rc = md_getattr_name(tgt->ltd_exp, op_data, &request);
+	if (!rc) {
+		ptlrpc_req_finished(request);
+		return -EEXIST;
+	}
+
+	return rc;
+}
+
+/* mkdir by QoS upon 'lfs mkdir -i -1'.
+ *
+ * NB, mkdir by QoS only if the parent is not striped; this is to avoid
+ * remote directories under a striped directory.
+ */
+static inline bool lmv_op_user_qos_mkdir(const struct md_op_data *op_data)
+{
+	const struct lmv_user_md *lum = op_data->op_data;
+
+	if (op_data->op_code != LUSTRE_OPC_MKDIR)
+		return false;
+
+	if (lmv_dir_striped(op_data->op_mea1))
+		return false;
+
+	return (op_data->op_cli_flags & CLI_SET_MEA) && lum &&
+	       le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC &&
+	       le32_to_cpu(lum->lum_stripe_offset) == LMV_OFFSET_DEFAULT;
+}
+
+/* mkdir by QoS if either ROOT or the parent default LMV is space balanced. */
+static inline bool lmv_op_default_qos_mkdir(const struct md_op_data *op_data)
+{
+	const struct lmv_stripe_md *lsm = op_data->op_default_mea1;
+
+	if (op_data->op_code != LUSTRE_OPC_MKDIR)
+		return false;
+
+	if (lmv_dir_striped(op_data->op_mea1))
+		return false;
+
+	return (op_data->op_flags & MF_QOS_MKDIR) ||
+	       (lsm && lsm->lsm_md_master_mdt_index == LMV_OFFSET_DEFAULT);
+}
+
+/* if the parent default LMV is space balanced, and
+ * 1. max_inherit_rr is set, or
+ * 2. the parent is ROOT,
+ * then mkdir round-robin. If the parent doesn't have a default LMV while the
+ * ROOT default LMV requests round-robin mkdir, do the same.
+ * NB, this needs to check that the server is balanced, which is done by the
+ * caller.
+ */
+static inline bool lmv_op_default_rr_mkdir(const struct md_op_data *op_data)
+{
+	const struct lmv_stripe_md *lsm = op_data->op_default_mea1;
+
+	return (op_data->op_flags & MF_RR_MKDIR) ||
+	       (lsm && lsm->lsm_md_max_inherit_rr != LMV_INHERIT_RR_NONE) ||
+	       fid_is_root(&op_data->op_fid1);
+}
+
+/* 'lfs mkdir -i ' */
+static inline bool lmv_op_user_specific_mkdir(const struct md_op_data *op_data)
+{
+	const struct lmv_user_md *lum = op_data->op_data;
+
+	return op_data->op_code == LUSTRE_OPC_MKDIR &&
+	       op_data->op_cli_flags & CLI_SET_MEA && lum &&
+	       (le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC ||
+		le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC) &&
+	       le32_to_cpu(lum->lum_stripe_offset) != LMV_OFFSET_DEFAULT;
+}
+
+/* the parent default LMV master_mdt_index is not -1. */
+static inline bool
+lmv_op_default_specific_mkdir(const struct md_op_data *op_data)
+{
+	return op_data->op_code == LUSTRE_OPC_MKDIR &&
+	       op_data->op_default_mea1 &&
+	       op_data->op_default_mea1->lsm_md_master_mdt_index !=
+		LMV_OFFSET_DEFAULT;
+}
+
+/* locate MDT by space usage */
+static struct lu_tgt_desc *lmv_locate_tgt_by_space(struct lmv_obd *lmv,
+						   struct md_op_data *op_data,
+						   struct lmv_tgt_desc *tgt)
+{
+	struct lmv_tgt_desc *tmp = tgt;
+
+	tgt = lmv_locate_tgt_qos(lmv, op_data);
+	if (tgt == ERR_PTR(-EAGAIN)) {
+		if (ltd_qos_is_balanced(&lmv->lmv_mdt_descs) &&
+		    !lmv_op_default_rr_mkdir(op_data) &&
+		    !lmv_op_user_qos_mkdir(op_data))
+			/* if not necessary, don't create remote directory. */
+			tgt = tmp;
+		else
+			tgt = lmv_locate_tgt_rr(lmv);
+	}
+
+	/*
+	 * only update statfs after QoS mkdir; this means the cached statfs may
+	 * be stale, and the current mkdir may not follow QoS accurately, but
+	 * it's not serious, and it avoids periodic statfs when the client
+	 * doesn't mkdir by QoS.
+	 */
+	if (!IS_ERR(tgt)) {
+		op_data->op_mds = tgt->ltd_index;
+		lmv_statfs_check_update(lmv2obd_dev(lmv), tgt);
+	}
+
+	return tgt;
+}
+
+int lmv_create(struct obd_export *exp, struct md_op_data *op_data,
+	       const void *data, size_t datalen, umode_t mode, uid_t uid,
+	       gid_t gid, kernel_cap_t cap_effective, __u64 rdev,
+	       struct ptlrpc_request **request)
+{
+	struct obd_device *obd = exp->exp_obd;
+	struct lmv_obd *lmv = &obd->u.lmv;
+	struct lmv_tgt_desc *tgt;
+	struct mdt_body *repbody;
+	int rc;
+
+	ENTRY;
+
+	if (!lmv->lmv_mdt_descs.ltd_lmv_desc.ld_active_tgt_count)
+		RETURN(-EIO);
+
+	if (lmv_dir_bad_hash(op_data->op_mea1))
+		RETURN(-EBADF);
+
+	if (lmv_dir_layout_changing(op_data->op_mea1)) {
+		/*
+		 * if the parent is migrating, create() needs to look up the
+		 * existing name in both the old and the new layout; check the
+		 * old layout on the client.
+		 */
+		rc = lmv_old_layout_lookup(lmv, op_data);
+		if (rc != -ENOENT)
+			RETURN(rc);
+
+		op_data->op_new_layout = true;
+	}
+
+	tgt = lmv_locate_tgt(lmv, op_data);
+	if (IS_ERR(tgt))
+		RETURN(PTR_ERR(tgt));
+
+	/* the order to apply policy in mkdir:
+	 * 1. is "lfs mkdir -i N"? mkdir on MDT N.
+	 * 2. is "lfs mkdir -i -1"? mkdir by space usage.
+	 * 3. is starting MDT specified in default LMV? mkdir on MDT N.
+	 * 4. is default LMV space balanced? mkdir by space usage.
+	 */
+	if (lmv_op_user_specific_mkdir(op_data)) {
+		struct lmv_user_md *lum = op_data->op_data;
+
+		op_data->op_mds = le32_to_cpu(lum->lum_stripe_offset);
+		tgt = lmv_tgt(lmv, op_data->op_mds);
+		if (!tgt)
+			RETURN(-ENODEV);
+	} else if (lmv_op_user_qos_mkdir(op_data)) {
+		tgt = lmv_locate_tgt_by_space(lmv, op_data, tgt);
+		if (IS_ERR(tgt))
+			RETURN(PTR_ERR(tgt));
+	} else if (lmv_op_default_specific_mkdir(op_data)) {
+		op_data->op_mds =
+			op_data->op_default_mea1->lsm_md_master_mdt_index;
+		tgt = lmv_tgt(lmv, op_data->op_mds);
+		if (!tgt)
+			RETURN(-ENODEV);
+	} else if (lmv_op_default_qos_mkdir(op_data)) {
+		tgt = lmv_locate_tgt_by_space(lmv, op_data, tgt);
+		if (IS_ERR(tgt))
+			RETURN(PTR_ERR(tgt));
+	}
+
+retry:
+	rc = lmv_fid_alloc(NULL, exp, &op_data->op_fid2, op_data);
+	if (rc)
+		RETURN(rc);
+
+	CDEBUG(D_INODE, "CREATE name '%.*s' "DFID" on "DFID" -> mds #%x\n",
+	       (int)op_data->op_namelen, op_data->op_name,
+	       PFID(&op_data->op_fid2), PFID(&op_data->op_fid1),
+	       op_data->op_mds);
+
+	op_data->op_flags |= MF_MDC_CANCEL_FID1;
+	rc = md_create(tgt->ltd_exp, op_data, data, datalen, mode, uid, gid,
+		       cap_effective, rdev, request);
+	if (rc == 0) {
+		if (*request == NULL)
+			RETURN(rc);
+		CDEBUG(D_INODE, "Created - "DFID"\n",
+		       PFID(&op_data->op_fid2));
+	}
+
+	/* dir restripe needs to send to the MDT where the dir is located */
+	if (rc != -EREMOTE ||
+	    !(exp_connect_flags2(exp) & OBD_CONNECT2_CRUSH))
+		RETURN(rc);
+
+	repbody = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY);
+	if (repbody == NULL)
+		RETURN(-EPROTO);
+
+	/* Not cross-ref case, just get out of here. */
+	if (likely(!(repbody->mbo_valid & OBD_MD_MDS)))
+		RETURN(rc);
+
+	op_data->op_fid2 = repbody->mbo_fid1;
+	ptlrpc_req_finished(*request);
+	*request = NULL;
+
+	tgt = lmv_fid2tgt(lmv, &op_data->op_fid2);
+	if (IS_ERR(tgt))
+		RETURN(PTR_ERR(tgt));
+
+	op_data->op_mds = tgt->ltd_index;
+	goto retry;
+}
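The four-step policy in the comment above can be condensed into a small decision function. Everything in this sketch is invented for illustration (struct toy_mkdir_req, the field names, and the use of -2 to mean "place by space usage"); it only mirrors the precedence order of the cascade, not the kernel's data structures:

#include <stdio.h>
#include <stdbool.h>

struct toy_mkdir_req {
	int  user_mdt;		/* "lfs mkdir -i N"; -1 means "-i -1" */
	bool user_requested;	/* user supplied an LMV at all */
	int  default_start;	/* starting MDT in default LMV; -1 if unset */
	bool default_balanced;	/* default LMV asks for space balancing */
};

static int choose_mdt(const struct toy_mkdir_req *r, int parent_mdt)
{
	if (r->user_requested && r->user_mdt >= 0)
		return r->user_mdt;		/* 1. explicit "-i N" */
	if (r->user_requested)
		return -2;			/* 2. "-i -1": by space usage */
	if (r->default_start >= 0)
		return r->default_start;	/* 3. default LMV start MDT */
	if (r->default_balanced)
		return -2;			/* 4. default LMV: by space */
	return parent_mdt;			/* otherwise stay local */
}

int main(void)
{
	struct toy_mkdir_req r = { .user_mdt = -1, .user_requested = true };
	int mdt = choose_mdt(&r, 0);

	printf(mdt == -2 ? "place by space usage\n" : "place on MDT%d\n", mdt);
	return 0;
}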
+
+static int
+lmv_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo,
+	    const union ldlm_policy_data *policy, struct md_op_data *op_data,
+	    struct lustre_handle *lockh, __u64 extra_lock_flags)
+{
+	struct obd_device *obd = exp->exp_obd;
+	struct lmv_obd *lmv = &obd->u.lmv;
+	struct lmv_tgt_desc *tgt;
+	int rc;
+
+	ENTRY;
+
+	CDEBUG(D_INODE, "ENQUEUE on "DFID"\n", PFID(&op_data->op_fid1));
+
+	tgt = lmv_fid2tgt(lmv, &op_data->op_fid1);
+	if (IS_ERR(tgt))
+		RETURN(PTR_ERR(tgt));
+
+	CDEBUG(D_INODE, "ENQUEUE on "DFID" -> mds #%u\n",
+	       PFID(&op_data->op_fid1), tgt->ltd_index);
+
+	rc = md_enqueue(tgt->ltd_exp, einfo, policy, op_data, lockh,
+			extra_lock_flags);
+
+	RETURN(rc);
+}
+
+int
+lmv_getattr_name(struct obd_export *exp, struct md_op_data *op_data,
+		 struct ptlrpc_request **preq)
+{
+	struct obd_device *obd = exp->exp_obd;
+	struct lmv_obd *lmv = &obd->u.lmv;
+	struct lmv_tgt_desc *tgt;
+	struct mdt_body *body;
+	int rc;
+
+	ENTRY;
+
+retry:
+	if (op_data->op_namelen == 2 &&
+	    op_data->op_name[0] == '.' && op_data->op_name[1] == '.')
+		tgt = lmv_fid2tgt(lmv, &op_data->op_fid1);
+	else
+		tgt = lmv_locate_tgt(lmv, op_data);
+	if (IS_ERR(tgt))
+		RETURN(PTR_ERR(tgt));
+
+	CDEBUG(D_INODE, "GETATTR_NAME for %*s on "DFID" -> mds #%d\n",
+	       (int)op_data->op_namelen, op_data->op_name,
+	       PFID(&op_data->op_fid1), tgt->ltd_index);
+
+	rc = md_getattr_name(tgt->ltd_exp, op_data, preq);
+	if (rc == -ENOENT && lmv_dir_retry_check_update(op_data)) {
+		ptlrpc_req_finished(*preq);
+		*preq = NULL;
+		goto retry;
+	}
+
+	if (rc)
+		RETURN(rc);
+
+	body = req_capsule_server_get(&(*preq)->rq_pill, &RMF_MDT_BODY);
+	LASSERT(body != NULL);
+
+	if (body->mbo_valid & OBD_MD_MDS) {
+		op_data->op_fid1 = body->mbo_fid1;
+		op_data->op_valid |= OBD_MD_FLCROSSREF;
+		op_data->op_namelen = 0;
+		op_data->op_name = NULL;
+
+		ptlrpc_req_finished(*preq);
+		*preq = NULL;
+
+		goto retry;
+	}
+
+	RETURN(rc);
+}
+
+#define md_op_data_fid(op_data, fl)			\
+	(fl == MF_MDC_CANCEL_FID1 ? &op_data->op_fid1 :	\
+	 fl == MF_MDC_CANCEL_FID2 ? &op_data->op_fid2 :	\
+	 fl == MF_MDC_CANCEL_FID3 ? &op_data->op_fid3 :	\
+	 fl == MF_MDC_CANCEL_FID4 ? &op_data->op_fid4 :	\
+	 NULL)
+
+static int lmv_early_cancel(struct obd_export *exp, struct lmv_tgt_desc *tgt,
+			    struct md_op_data *op_data, __u32 op_tgt,
+			    enum ldlm_mode mode, int bits, int flag)
+{
+	struct lu_fid *fid = md_op_data_fid(op_data, flag);
+	struct lmv_obd *lmv = &exp->exp_obd->u.lmv;
+	union ldlm_policy_data policy = { { 0 } };
+	int rc = 0;
+	ENTRY;
+
+	if (!fid_is_sane(fid))
+		RETURN(0);
+
+	if (tgt == NULL) {
+		tgt = lmv_fid2tgt(lmv, fid);
+		if (IS_ERR(tgt))
+			RETURN(PTR_ERR(tgt));
+	}
+
+	if (tgt->ltd_index != op_tgt) {
+		CDEBUG(D_INODE, "EARLY_CANCEL on "DFID"\n", PFID(fid));
+		policy.l_inodebits.bits = bits;
+		rc = md_cancel_unused(tgt->ltd_exp, fid, &policy,
+				      mode, LCF_ASYNC, NULL);
+	} else {
+		CDEBUG(D_INODE,
+		       "EARLY_CANCEL skip operation target %d on "DFID"\n",
+		       op_tgt, PFID(fid));
+		op_data->op_flags |= flag;
+		rc = 0;
+	}
+
+	RETURN(rc);
+}
+
+/*
+ * llite passes the FID of the target inode in op_data->op_fid1 and the FID
+ * of the directory in op_data->op_fid2
+ */
+static int lmv_link(struct obd_export *exp, struct md_op_data *op_data,
+		    struct ptlrpc_request **request)
+{
+	struct obd_device *obd = exp->exp_obd;
+	struct lmv_obd *lmv = &obd->u.lmv;
+	struct lmv_tgt_desc *tgt;
+	int rc;
+	ENTRY;
+
+	LASSERT(op_data->op_namelen != 0);
+
+	CDEBUG(D_INODE, "LINK "DFID":%*s to "DFID"\n",
+	       PFID(&op_data->op_fid2), (int)op_data->op_namelen,
+	       op_data->op_name, PFID(&op_data->op_fid1));
+
+	op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid());
+	op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid());
+	op_data->op_cap = current_cap();
+
+	tgt = lmv_locate_tgt2(lmv, op_data);
+	if (IS_ERR(tgt))
+		RETURN(PTR_ERR(tgt));
+
+	/*
+	 * Cancel UPDATE lock on child (fid1).
+ */ + op_data->op_flags |= MF_MDC_CANCEL_FID2; + rc = lmv_early_cancel(exp, NULL, op_data, tgt->ltd_index, LCK_EX, + MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1); + if (rc != 0) + RETURN(rc); + + rc = md_link(tgt->ltd_exp, op_data, request); + + RETURN(rc); +} + +/* migrate the top directory */ +static inline bool lmv_op_topdir_migrate(const struct md_op_data *op_data) +{ + if (!S_ISDIR(op_data->op_mode)) + return false; + + if (lmv_dir_layout_changing(op_data->op_mea1)) + return false; + + return true; +} + +/* migrate top dir to specific MDTs */ +static inline bool lmv_topdir_specific_migrate(const struct md_op_data *op_data) +{ + const struct lmv_user_md *lum = op_data->op_data; + + if (!lmv_op_topdir_migrate(op_data)) + return false; + + return le32_to_cpu(lum->lum_stripe_offset) != LMV_OFFSET_DEFAULT; +} + +/* migrate top dir in QoS mode if user issued "lfs migrate -m -1..." */ +static inline bool lmv_topdir_qos_migrate(const struct md_op_data *op_data) +{ + const struct lmv_user_md *lum = op_data->op_data; + + if (!lmv_op_topdir_migrate(op_data)) + return false; + + return le32_to_cpu(lum->lum_stripe_offset) == LMV_OFFSET_DEFAULT; +} + +static inline bool lmv_subdir_specific_migrate(const struct md_op_data *op_data) +{ + const struct lmv_user_md *lum = op_data->op_data; + + if (!S_ISDIR(op_data->op_mode)) + return false; + + if (!lmv_dir_layout_changing(op_data->op_mea1)) + return false; + + return le32_to_cpu(lum->lum_stripe_offset) != LMV_OFFSET_DEFAULT; +} + +static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data, + const char *name, size_t namelen, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_stripe_md *lsm = op_data->op_mea1; + struct lmv_tgt_desc *parent_tgt; + struct lmv_tgt_desc *sp_tgt; + struct lmv_tgt_desc *tp_tgt = NULL; + struct lmv_tgt_desc *child_tgt; + struct lmv_tgt_desc *tgt; + struct lu_fid target_fid = { 0 }; + int rc; + + ENTRY; + + LASSERT(op_data->op_cli_flags & CLI_MIGRATE); + + CDEBUG(D_INODE, "MIGRATE "DFID"/%.*s\n", + PFID(&op_data->op_fid1), (int)namelen, name); + + op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); + op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); + op_data->op_cap = current_cap(); + + parent_tgt = lmv_fid2tgt(lmv, &op_data->op_fid1); + if (IS_ERR(parent_tgt)) + RETURN(PTR_ERR(parent_tgt)); + + if (lmv_dir_striped(lsm)) { + const struct lmv_oinfo *oinfo; + + oinfo = lsm_name_to_stripe_info(lsm, name, namelen, false); + if (IS_ERR(oinfo)) + RETURN(PTR_ERR(oinfo)); + + /* save source stripe FID in fid4 temporarily for ELC */ + op_data->op_fid4 = oinfo->lmo_fid; + sp_tgt = lmv_tgt(lmv, oinfo->lmo_mds); + if (!sp_tgt) + RETURN(-ENODEV); + + /* + * if parent is being migrated too, fill op_fid2 with target + * stripe fid, otherwise the target stripe is not created yet. 
+		 */
+		if (lmv_dir_layout_changing(lsm)) {
+			oinfo = lsm_name_to_stripe_info(lsm, name, namelen,
+							true);
+			if (IS_ERR(oinfo))
+				RETURN(PTR_ERR(oinfo));
+
+			op_data->op_fid2 = oinfo->lmo_fid;
+			tp_tgt = lmv_tgt(lmv, oinfo->lmo_mds);
+			if (!tp_tgt)
+				RETURN(-ENODEV);
+
+			/* parent unchanged and update namespace only */
+			if (lu_fid_eq(&op_data->op_fid4, &op_data->op_fid2) &&
+			    op_data->op_bias & MDS_MIGRATE_NSONLY)
+				RETURN(-EALREADY);
+		}
+	} else {
+		sp_tgt = parent_tgt;
+	}
+
+	child_tgt = lmv_fid2tgt(lmv, &op_data->op_fid3);
+	if (IS_ERR(child_tgt))
+		RETURN(PTR_ERR(child_tgt));
+
+	if (lmv_topdir_specific_migrate(op_data)) {
+		struct lmv_user_md *lum = op_data->op_data;
+
+		op_data->op_mds = le32_to_cpu(lum->lum_stripe_offset);
+	} else if (lmv_topdir_qos_migrate(op_data)) {
+		tgt = lmv_locate_tgt_lf(lmv);
+		if (tgt == ERR_PTR(-EAGAIN))
+			tgt = lmv_locate_tgt_rr(lmv);
+		if (IS_ERR(tgt))
+			RETURN(PTR_ERR(tgt));
+
+		op_data->op_mds = tgt->ltd_index;
+	} else if (lmv_subdir_specific_migrate(op_data)) {
+		struct lmv_user_md *lum = op_data->op_data;
+		__u32 i;
+
+		LASSERT(tp_tgt);
+		if (le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC) {
+			/* adjust the MDTs in lum, since the subdir is located
+			 * where its parent stripe is, not on the first
+			 * specified MDT.
+			 */
+			for (i = 0; i < le32_to_cpu(lum->lum_stripe_count);
+			     i++) {
+				if (le32_to_cpu(lum->lum_objects[i].lum_mds) ==
+				    tp_tgt->ltd_index)
+					break;
+			}
+
+			if (i == le32_to_cpu(lum->lum_stripe_count))
+				RETURN(-ENODEV);
+
+			lum->lum_objects[i].lum_mds =
+				lum->lum_objects[0].lum_mds;
+			lum->lum_objects[0].lum_mds =
+				cpu_to_le32(tp_tgt->ltd_index);
+		}
+		/* NB, the above adjusts subdir migration for a command like
+		 * "lfs migrate -m 0,1,2 ...", but for a migration like
+		 * "lfs migrate -m 0 -c 2 ...", the top dir is migrated to MDT0
+		 * and MDT1, however its subdir may be migrated to MDT1 and
+		 * MDT2
+		 */
+
+		lum->lum_stripe_offset = cpu_to_le32(tp_tgt->ltd_index);
+		op_data->op_mds = tp_tgt->ltd_index;
+	} else if (tp_tgt) {
+		op_data->op_mds = tp_tgt->ltd_index;
+	} else {
+		op_data->op_mds = sp_tgt->ltd_index;
+	}
+
+	rc = lmv_fid_alloc(NULL, exp, &target_fid, op_data);
+	if (rc)
+		RETURN(rc);
+
+	/*
+	 * for a directory, send the migrate request to the MDT where the
+	 * object will be migrated to, because we can't create a striped
+	 * directory remotely.
+	 *
+	 * otherwise, send it to the MDT where the source is located, because
+	 * a regular file may have an open lease.
+	 *
+	 * NB. if the MDT doesn't support DIR_MIGRATE, send to the source MDT
+	 * too for backward compatibility.
+	 */
+	if (S_ISDIR(op_data->op_mode) &&
+	    (exp_connect_flags2(exp) & OBD_CONNECT2_DIR_MIGRATE)) {
+		tgt = lmv_fid2tgt(lmv, &target_fid);
+		if (IS_ERR(tgt))
+			RETURN(PTR_ERR(tgt));
+	} else {
+		tgt = child_tgt;
+	}
+
+	/* cancel UPDATE lock of parent master object */
+	rc = lmv_early_cancel(exp, parent_tgt, op_data, tgt->ltd_index, LCK_EX,
+			      MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1);
+	if (rc)
+		RETURN(rc);
+
+	/* cancel UPDATE lock of source parent */
+	if (sp_tgt != parent_tgt) {
+		/*
+		 * the migrate RPC packs the master object FID, because we can
+		 * only pack two FIDs in a reint RPC, but the MDS needs to know
+		 * both the source parent and the target parent, and it will
+		 * obtain them from the master FID and the LMV; the other FID
+		 * in the RPC is kept for the target.
+		 *
+		 * since this FID is not passed to MDC, cancel it anyway.
+ */ + rc = lmv_early_cancel(exp, sp_tgt, op_data, -1, LCK_EX, + MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID4); + if (rc) + RETURN(rc); + + op_data->op_flags &= ~MF_MDC_CANCEL_FID4; + } + op_data->op_fid4 = target_fid; + + /* cancel UPDATE locks of target parent */ + rc = lmv_early_cancel(exp, tp_tgt, op_data, tgt->ltd_index, LCK_EX, + MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID2); + if (rc) + RETURN(rc); + + /* cancel LOOKUP lock of source if source is remote object */ + if (child_tgt != sp_tgt) { + rc = lmv_early_cancel(exp, sp_tgt, op_data, tgt->ltd_index, + LCK_EX, MDS_INODELOCK_LOOKUP, + MF_MDC_CANCEL_FID3); + if (rc) + RETURN(rc); + } + + /* cancel ELC locks of source */ + rc = lmv_early_cancel(exp, child_tgt, op_data, tgt->ltd_index, LCK_EX, + MDS_INODELOCK_ELC, MF_MDC_CANCEL_FID3); + if (rc) + RETURN(rc); + + rc = md_rename(tgt->ltd_exp, op_data, name, namelen, NULL, 0, request); + + RETURN(rc); +} + +static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data, + const char *old, size_t oldlen, + const char *new, size_t newlen, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *sp_tgt; + struct lmv_tgt_desc *tp_tgt = NULL; + struct lmv_tgt_desc *src_tgt = NULL; + struct lmv_tgt_desc *tgt; + struct mdt_body *body; + int rc; + + ENTRY; + + LASSERT(oldlen != 0); + + if (op_data->op_cli_flags & CLI_MIGRATE) { + rc = lmv_migrate(exp, op_data, old, oldlen, request); + RETURN(rc); + } + + op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); + op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); + op_data->op_cap = current_cap(); + + op_data->op_name = new; + op_data->op_namelen = newlen; + + tp_tgt = lmv_locate_tgt2(lmv, op_data); + if (IS_ERR(tp_tgt)) + RETURN(PTR_ERR(tp_tgt)); + + /* Since the target child might be destroyed, and it might become + * orphan, and we can only check orphan on the local MDT right now, so + * we send rename request to the MDT where target child is located. 
If + * target child does not exist, then it will send the request to the + * target parent */ + if (fid_is_sane(&op_data->op_fid4)) { + tgt = lmv_fid2tgt(lmv, &op_data->op_fid4); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + } else { + tgt = tp_tgt; + } + + op_data->op_flags |= MF_MDC_CANCEL_FID4; + + /* cancel UPDATE locks of target parent */ + rc = lmv_early_cancel(exp, tp_tgt, op_data, tgt->ltd_index, LCK_EX, + MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID2); + if (rc != 0) + RETURN(rc); + + if (fid_is_sane(&op_data->op_fid4)) { + /* cancel LOOKUP lock of target on target parent */ + if (tgt != tp_tgt) { + rc = lmv_early_cancel(exp, tp_tgt, op_data, + tgt->ltd_index, LCK_EX, + MDS_INODELOCK_LOOKUP, + MF_MDC_CANCEL_FID4); + if (rc != 0) + RETURN(rc); + } + } + + if (fid_is_sane(&op_data->op_fid3)) { + src_tgt = lmv_fid2tgt(lmv, &op_data->op_fid3); + if (IS_ERR(src_tgt)) + RETURN(PTR_ERR(src_tgt)); + + /* cancel ELC locks of source */ + rc = lmv_early_cancel(exp, src_tgt, op_data, tgt->ltd_index, + LCK_EX, MDS_INODELOCK_ELC, + MF_MDC_CANCEL_FID3); + if (rc != 0) + RETURN(rc); + } + + op_data->op_name = old; + op_data->op_namelen = oldlen; +retry: + sp_tgt = lmv_locate_tgt(lmv, op_data); + if (IS_ERR(sp_tgt)) + RETURN(PTR_ERR(sp_tgt)); + + /* cancel UPDATE locks of source parent */ + rc = lmv_early_cancel(exp, sp_tgt, op_data, tgt->ltd_index, LCK_EX, + MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1); + if (rc != 0) + RETURN(rc); + + if (fid_is_sane(&op_data->op_fid3)) { + /* cancel LOOKUP lock of source on source parent */ + if (src_tgt != sp_tgt) { + rc = lmv_early_cancel(exp, sp_tgt, op_data, + tgt->ltd_index, LCK_EX, + MDS_INODELOCK_LOOKUP, + MF_MDC_CANCEL_FID3); + if (rc != 0) + RETURN(rc); + } + } + +rename: + CDEBUG(D_INODE, "RENAME "DFID"/%.*s to "DFID"/%.*s\n", + PFID(&op_data->op_fid1), (int)oldlen, old, + PFID(&op_data->op_fid2), (int)newlen, new); + + rc = md_rename(tgt->ltd_exp, op_data, old, oldlen, new, newlen, + request); + if (rc == -ENOENT && lmv_dir_retry_check_update(op_data)) { + ptlrpc_req_finished(*request); + *request = NULL; + goto retry; + } + + if (rc && rc != -EXDEV) + RETURN(rc); + + body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + + /* Not cross-ref case, just get out of here. 
*/ + if (likely(!(body->mbo_valid & OBD_MD_MDS))) + RETURN(rc); + + op_data->op_fid4 = body->mbo_fid1; + + ptlrpc_req_finished(*request); + *request = NULL; + + tgt = lmv_fid2tgt(lmv, &op_data->op_fid4); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + if (fid_is_sane(&op_data->op_fid4)) { + /* cancel LOOKUP lock of target on target parent */ + if (tgt != tp_tgt) { + rc = lmv_early_cancel(exp, tp_tgt, op_data, + tgt->ltd_index, LCK_EX, + MDS_INODELOCK_LOOKUP, + MF_MDC_CANCEL_FID4); + if (rc != 0) + RETURN(rc); + } + } + + goto rename; +} + +static int lmv_setattr(struct obd_export *exp, struct md_op_data *op_data, + void *ea, size_t ealen, struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc = 0; + + ENTRY; + + CDEBUG(D_INODE, "SETATTR for "DFID", valid 0x%x/0x%x\n", + PFID(&op_data->op_fid1), op_data->op_attr.ia_valid, + op_data->op_xvalid); + + op_data->op_flags |= MF_MDC_CANCEL_FID1; + tgt = lmv_fid2tgt(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = md_setattr(tgt->ltd_exp, op_data, ea, ealen, request); + + RETURN(rc); +} + +static int lmv_fsync(struct obd_export *exp, const struct lu_fid *fid, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + ENTRY; + + tgt = lmv_fid2tgt(lmv, fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = md_fsync(tgt->ltd_exp, fid, request); + RETURN(rc); +} + +struct stripe_dirent { + struct page *sd_page; + struct lu_dirpage *sd_dp; + struct lu_dirent *sd_ent; + bool sd_eof; +}; + +struct lmv_dir_ctxt { + struct lmv_obd *ldc_lmv; + struct md_op_data *ldc_op_data; + struct md_readdir_info *ldc_mrinfo; + __u64 ldc_hash; + int ldc_count; + struct stripe_dirent ldc_stripes[0]; +}; + +static inline void stripe_dirent_unload(struct stripe_dirent *stripe) +{ + if (stripe->sd_page) { + kunmap(stripe->sd_page); + put_page(stripe->sd_page); + stripe->sd_page = NULL; + stripe->sd_ent = NULL; + } +} + +static inline void put_lmv_dir_ctxt(struct lmv_dir_ctxt *ctxt) +{ + int i; + + for (i = 0; i < ctxt->ldc_count; i++) + stripe_dirent_unload(&ctxt->ldc_stripes[i]); +} + +/* if @ent is dummy, or . .., get next */ +static struct lu_dirent *stripe_dirent_get(struct lmv_dir_ctxt *ctxt, + struct lu_dirent *ent, + int stripe_index) +{ + for (; ent; ent = lu_dirent_next(ent)) { + /* Skip dummy entry */ + if (le16_to_cpu(ent->lde_namelen) == 0) + continue; + + /* skip . and .. 
for other stripes */ + if (stripe_index && + (strncmp(ent->lde_name, ".", + le16_to_cpu(ent->lde_namelen)) == 0 || + strncmp(ent->lde_name, "..", + le16_to_cpu(ent->lde_namelen)) == 0)) + continue; + + if (le64_to_cpu(ent->lde_hash) >= ctxt->ldc_hash) + break; + } + + return ent; +} + +static struct lu_dirent *stripe_dirent_load(struct lmv_dir_ctxt *ctxt, + struct stripe_dirent *stripe, + int stripe_index) +{ + struct md_op_data *op_data = ctxt->ldc_op_data; + struct lmv_oinfo *oinfo; + struct lu_fid fid = op_data->op_fid1; + struct inode *inode = op_data->op_data; + struct lmv_tgt_desc *tgt; + struct lu_dirent *ent = stripe->sd_ent; + __u64 hash = ctxt->ldc_hash; + int rc = 0; + + ENTRY; + + LASSERT(stripe == &ctxt->ldc_stripes[stripe_index]); + LASSERT(!ent); + + do { + if (stripe->sd_page) { + __u64 end = le64_to_cpu(stripe->sd_dp->ldp_hash_end); + + /* @hash should be the last dirent hash */ + LASSERTF(hash <= end, + "ctxt@%p stripe@%p hash %llx end %llx\n", + ctxt, stripe, hash, end); + /* unload last page */ + stripe_dirent_unload(stripe); + /* eof */ + if (end == MDS_DIR_END_OFF) { + stripe->sd_eof = true; + break; + } + hash = end; + } + + oinfo = &op_data->op_mea1->lsm_md_oinfo[stripe_index]; + if (!oinfo->lmo_root) { + rc = -ENOENT; + break; + } + + tgt = lmv_tgt(ctxt->ldc_lmv, oinfo->lmo_mds); + if (!tgt) { + rc = -ENODEV; + break; + } + + /* op_data is shared by stripes, reset after use */ + op_data->op_fid1 = oinfo->lmo_fid; + op_data->op_fid2 = oinfo->lmo_fid; + op_data->op_data = oinfo->lmo_root; + + rc = md_read_page(tgt->ltd_exp, op_data, ctxt->ldc_mrinfo, hash, + &stripe->sd_page); + + op_data->op_fid1 = fid; + op_data->op_fid2 = fid; + op_data->op_data = inode; + + if (rc) + break; + + stripe->sd_dp = page_address(stripe->sd_page); + ent = stripe_dirent_get(ctxt, lu_dirent_start(stripe->sd_dp), + stripe_index); + /* in case a page filled with ., .. and dummy, read next */ + } while (!ent); + + stripe->sd_ent = ent; + if (rc) { + LASSERT(!ent); + /* treat error as eof, so dir can be partially accessed */ + stripe->sd_eof = true; + ctxt->ldc_mrinfo->mr_partial_readdir_rc = rc; + LCONSOLE_WARN("dir "DFID" stripe %d readdir failed: %d, " + "directory is partially accessed!\n", + PFID(&ctxt->ldc_op_data->op_fid1), stripe_index, + rc); + } + + RETURN(ent); +} + +static int lmv_file_resync(struct obd_export *exp, struct md_op_data *data) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + ENTRY; + + rc = lmv_check_connect(obd); + if (rc != 0) + RETURN(rc); + + tgt = lmv_fid2tgt(lmv, &data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + data->op_flags |= MF_MDC_CANCEL_FID1; + rc = md_file_resync(tgt->ltd_exp, data); + RETURN(rc); +} + +/** + * Get dirent with the closest hash for striped directory + * + * This function will search the dir entry, whose hash value is the + * closest(>=) to hash from all of sub-stripes, and it is only being called + * for striped directory. + * + * \param[in] ctxt dir read context + * + * \retval dirent get the entry successfully + * NULL does not get the entry, normally it means + * it reaches the end of the directory, while read + * stripe dirent error is ignored to allow partial + * access. 
+ */
+static struct lu_dirent *lmv_dirent_next(struct lmv_dir_ctxt *ctxt)
+{
+	struct stripe_dirent *stripe;
+	struct lu_dirent *ent = NULL;
+	int i;
+	int min = -1;
+
+	/* TODO: optimize with k-way merge sort */
+	for (i = 0; i < ctxt->ldc_count; i++) {
+		stripe = &ctxt->ldc_stripes[i];
+		if (stripe->sd_eof)
+			continue;
+
+		if (!stripe->sd_ent) {
+			stripe_dirent_load(ctxt, stripe, i);
+			if (!stripe->sd_ent) {
+				LASSERT(stripe->sd_eof);
+				continue;
+			}
+		}
+
+		if (min == -1 ||
+		    le64_to_cpu(ctxt->ldc_stripes[min].sd_ent->lde_hash) >
+		    le64_to_cpu(stripe->sd_ent->lde_hash)) {
+			min = i;
+			if (le64_to_cpu(stripe->sd_ent->lde_hash) ==
+			    ctxt->ldc_hash)
+				break;
+		}
+	}
+
+	if (min != -1) {
+		stripe = &ctxt->ldc_stripes[min];
+		ent = stripe->sd_ent;
+		/* pop found dirent */
+		stripe->sd_ent = stripe_dirent_get(ctxt, lu_dirent_next(ent),
+						   min);
+	}
+
+	return ent;
+}
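lmv_dirent_next() above is a k-way merge by minimum hash with a linear scan over the stripes (hence its TODO about switching to a heap-based merge). A self-contained sketch of the same selection over plain arrays, where UINT64_MAX marks a stripe's end-of-stream and the hash values are invented:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t stripe[3][4] = {
		{ 10, 40, 70, UINT64_MAX },	/* UINT64_MAX = stripe EOF */
		{ 20, 30, 90, UINT64_MAX },
		{ 15, 80, UINT64_MAX, 0 },
	};
	int pos[3] = { 0, 0, 0 };

	for (;;) {
		int min = -1;

		/* linear scan for the stripe with the smallest next hash */
		for (int i = 0; i < 3; i++) {
			uint64_t h = stripe[i][pos[i]];

			if (h != UINT64_MAX &&
			    (min < 0 || h < stripe[min][pos[min]]))
				min = i;
		}
		if (min < 0)
			break;			/* all stripes at EOF */
		printf("hash %llu from stripe %d\n",
		       (unsigned long long)stripe[min][pos[min]], min);
		pos[min]++;			/* pop, like sd_ent advance */
	}
	return 0;
}

It emits 10, 15, 20, 30, 40, 70, 80, 90: a single hash-ordered stream, which is exactly the property the readdir page builder below relies on.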
+/**
+ * Build dir entry page for a striped directory
+ *
+ * This function gets one entry by @offset from a striped directory. It will
+ * read entries from all of the stripes, and choose the one closest to the
+ * required offset(&offset). A few notes
+ * 1. skip . and .. for non-zero stripes, because a directory can have only
+ *    one . and one .. entry.
+ * 2. op_data will be shared by all of the stripes, instead of allocating a
+ *    new one, so it needs to be restored before reuse.
+ *
+ * \param[in] exp	obd export referring to LMV
+ * \param[in] op_data	hold those MD parameters of read_entry
+ * \param[in] mrinfo	ldlm callback being used in enqueue in mdc_read_entry,
+ *			and partial readdir result will be stored in it.
+ * \param[in] offset	starting hash offset
+ * \param[out] ppage	the page holding the entry. Note: because the entry
+ *			will be accessed by the upper layer, we need to hold
+ *			the page until use of the entry is finished, see
+ *			ll_dir_entry_next.
+ *
+ * retval	=0 if get entry successfully
+ *		<0 cannot get entry
+ */
+static int lmv_striped_read_page(struct obd_export *exp,
+				 struct md_op_data *op_data,
+				 struct md_readdir_info *mrinfo, __u64 offset,
+				 struct page **ppage)
+{
+	struct page *page = NULL;
+	struct lu_dirpage *dp;
+	void *start;
+	struct lu_dirent *ent;
+	struct lu_dirent *last_ent;
+	int stripe_count;
+	struct lmv_dir_ctxt *ctxt;
+	struct lu_dirent *next = NULL;
+	__u16 ent_size;
+	size_t left_bytes;
+	int rc = 0;
+	ENTRY;
+
+	/* Allocate a page and read entries from all of the stripes and fill
+	 * the page by hash order */
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		RETURN(-ENOMEM);
+
+	/* Initialize the entry page */
+	dp = kmap(page);
+	memset(dp, 0, sizeof(*dp));
+	dp->ldp_hash_start = cpu_to_le64(offset);
+
+	start = dp + 1;
+	left_bytes = PAGE_SIZE - sizeof(*dp);
+	ent = start;
+	last_ent = ent;
+
+	/* initialize dir read context */
+	stripe_count = op_data->op_mea1->lsm_md_stripe_count;
+	OBD_ALLOC(ctxt, offsetof(typeof(*ctxt), ldc_stripes[stripe_count]));
+	if (!ctxt)
+		GOTO(free_page, rc = -ENOMEM);
+	ctxt->ldc_lmv = &exp->exp_obd->u.lmv;
+	ctxt->ldc_op_data = op_data;
+	ctxt->ldc_mrinfo = mrinfo;
+	ctxt->ldc_hash = offset;
+	ctxt->ldc_count = stripe_count;
+
+	while (1) {
+		next = lmv_dirent_next(ctxt);
+
+		/* end of directory */
+		if (!next) {
+			ctxt->ldc_hash = MDS_DIR_END_OFF;
+			break;
+		}
+		ctxt->ldc_hash = le64_to_cpu(next->lde_hash);
+
+		ent_size = le16_to_cpu(next->lde_reclen);
+
+		/* the last entry lde_reclen is 0, but it might not be the last
+		 * one of this temporary dir page */
+		if (!ent_size)
+			ent_size = lu_dirent_calc_size(
+					le16_to_cpu(next->lde_namelen),
+					le32_to_cpu(next->lde_attrs));
+		/* page full */
+		if (ent_size > left_bytes)
+			break;
+
+		memcpy(ent, next, ent_size);
+
+		/* Replace . with the master FID and .. with the parent FID
+		 * of the master object */
+		if (strncmp(ent->lde_name, ".",
+			    le16_to_cpu(ent->lde_namelen)) == 0 &&
+		    le16_to_cpu(ent->lde_namelen) == 1)
+			fid_cpu_to_le(&ent->lde_fid, &op_data->op_fid1);
+		else if (strncmp(ent->lde_name, "..",
+				 le16_to_cpu(ent->lde_namelen)) == 0 &&
+			 le16_to_cpu(ent->lde_namelen) == 2)
+			fid_cpu_to_le(&ent->lde_fid, &op_data->op_fid3);
+
+		CDEBUG(D_INODE, "entry %.*s hash %#llx\n",
+		       le16_to_cpu(ent->lde_namelen), ent->lde_name,
+		       le64_to_cpu(ent->lde_hash));
+
+		left_bytes -= ent_size;
+		ent->lde_reclen = cpu_to_le16(ent_size);
+		last_ent = ent;
+		ent = (void *)ent + ent_size;
+	}
+
+	last_ent->lde_reclen = 0;
+
+	if (ent == start)
+		dp->ldp_flags |= LDF_EMPTY;
+	else if (ctxt->ldc_hash == le64_to_cpu(last_ent->lde_hash))
+		dp->ldp_flags |= LDF_COLLIDE;
+	dp->ldp_flags = cpu_to_le32(dp->ldp_flags);
+	dp->ldp_hash_end = cpu_to_le64(ctxt->ldc_hash);
+
+	put_lmv_dir_ctxt(ctxt);
+	OBD_FREE(ctxt, offsetof(typeof(*ctxt), ldc_stripes[stripe_count]));
+
+	*ppage = page;
+
+	RETURN(0);
+
+free_page:
+	kunmap(page);
+	__free_page(page);
+
+	return rc;
+}
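The page-filling loop above packs variable-length records where each lde_reclen is the offset to the next record and the last record's reclen is zeroed to terminate the page. A userspace sketch of that packing discipline; struct toy_rec, the 64-byte "page", and the names are all invented:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct toy_rec {
	uint16_t reclen;	/* offset to next record, 0 for the last */
	uint16_t namelen;
	char	 name[];
};

int main(void)
{
	static char page[64] __attribute__((aligned(8)));
	const char *names[] = { "alpha", "beta", "gamma", "delta-very-long" };
	size_t off = 0;
	struct toy_rec *last = NULL;

	for (int i = 0; i < 4; i++) {
		size_t need = sizeof(struct toy_rec) + strlen(names[i]);

		need = (need + 7) & ~(size_t)7;	/* 8-byte align records */
		if (off + need > sizeof(page))
			break;			/* page full */

		struct toy_rec *r = (struct toy_rec *)(page + off);

		r->namelen = strlen(names[i]);
		memcpy(r->name, names[i], r->namelen);
		r->reclen = need;
		last = r;
		off += need;
	}
	if (last)
		last->reclen = 0;	/* terminate, like last_ent above */

	printf("packed %zu bytes\n", off);
	return 0;
}

The zero reclen terminator is why the kernel loop must recompute ent_size with lu_dirent_calc_size() when it copies the last entry of a source page into the merged page.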
+static int lmv_read_page(struct obd_export *exp, struct md_op_data *op_data,
+			 struct md_readdir_info *mrinfo, __u64 offset,
+			 struct page **ppage)
+{
+	struct obd_device *obd = exp->exp_obd;
+	struct lmv_obd *lmv = &obd->u.lmv;
+	struct lmv_tgt_desc *tgt;
+	int rc;
+
+	ENTRY;
+
+	if (unlikely(lmv_dir_foreign(op_data->op_mea1)))
+		RETURN(-ENODATA);
+
+	if (unlikely(lmv_dir_striped(op_data->op_mea1))) {
+		rc = lmv_striped_read_page(exp, op_data, mrinfo, offset,
+					   ppage);
+		RETURN(rc);
+	}
+
+	tgt = lmv_fid2tgt(lmv, &op_data->op_fid1);
+	if (IS_ERR(tgt))
+		RETURN(PTR_ERR(tgt));
+
+	rc = md_read_page(tgt->ltd_exp, op_data, mrinfo, offset, ppage);
+
+	RETURN(rc);
+}
+
+/**
+ * Unlink a file/directory
+ *
+ * Unlink a file or directory under the parent dir. The unlink request
+ * usually will be sent to the MDT where the child is located, but if
+ * the client does not have the child FID then the request will be sent to
+ * the MDT where the parent is located.
+ *
+ * If the parent is a striped directory then it also needs to locate which
+ * stripe the name of the child is in, and replace the parent FID
+ * (@op->op_fid1) with the stripe FID. Note: if the stripe is unknown,
+ * it will walk through all of the sub-stripes until the child is finally
+ * unlinked.
+ *
+ * \param[in] exp	export referring to LMV
+ * \param[in] op_data	different parameters transferred between client
+ *			MD stacks, name, namelen, FIDs etc.
+ *			op_fid1 is the parent FID, op_fid2 is the child
+ *			FID.
+ * \param[out] request	point to the request of unlink.
+ *
+ * retval	0 on success
+ *		negative errno on failure.
+ */ +static int lmv_unlink(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + struct lmv_tgt_desc *parent_tgt; + struct mdt_body *body; + int rc; + + ENTRY; + + op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); + op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); + op_data->op_cap = current_cap(); + +retry: + parent_tgt = lmv_locate_tgt(lmv, op_data); + if (IS_ERR(parent_tgt)) + RETURN(PTR_ERR(parent_tgt)); + + if (likely(!fid_is_zero(&op_data->op_fid2))) { + tgt = lmv_fid2tgt(lmv, &op_data->op_fid2); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + } else { + tgt = parent_tgt; + } + + /* + * If child's fid is given, cancel unused locks for it if it is from + * another export than parent. + * + * LOOKUP lock for child (fid3) should also be cancelled on parent + * tgt_tgt in mdc_unlink(). + */ + op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3; + + if (parent_tgt != tgt) + rc = lmv_early_cancel(exp, parent_tgt, op_data, tgt->ltd_index, + LCK_EX, MDS_INODELOCK_LOOKUP, + MF_MDC_CANCEL_FID3); + + rc = lmv_early_cancel(exp, NULL, op_data, tgt->ltd_index, LCK_EX, + MDS_INODELOCK_ELC, MF_MDC_CANCEL_FID3); + if (rc) + RETURN(rc); + + CDEBUG(D_INODE, "unlink with fid="DFID"/"DFID" -> mds #%u\n", + PFID(&op_data->op_fid1), PFID(&op_data->op_fid2), + tgt->ltd_index); + + rc = md_unlink(tgt->ltd_exp, op_data, request); + if (rc == -ENOENT && lmv_dir_retry_check_update(op_data)) { + ptlrpc_req_finished(*request); + *request = NULL; + goto retry; + } + + if (rc != -EREMOTE) + RETURN(rc); + + body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + + /* Not cross-ref case, just get out of here. */ + if (likely(!(body->mbo_valid & OBD_MD_MDS))) + RETURN(rc); + + /* This is a remote object, try remote MDT. */ + op_data->op_fid2 = body->mbo_fid1; + ptlrpc_req_finished(*request); + *request = NULL; + + tgt = lmv_fid2tgt(lmv, &op_data->op_fid2); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + goto retry; +} + +static int lmv_precleanup(struct obd_device *obd) +{ + ENTRY; + libcfs_kkuc_group_rem(&obd->obd_uuid, 0, KUC_GRP_HSM); + fld_client_debugfs_fini(&obd->u.lmv.lmv_fld); + lprocfs_obd_cleanup(obd); + lprocfs_free_md_stats(obd); + RETURN(0); +} + +/** + * Get by key a value associated with a LMV device. + * + * Dispatch request to lower-layer devices as needed. 
+ * + * \param[in] env execution environment for this thread + * \param[in] exp export for the LMV device + * \param[in] keylen length of key identifier + * \param[in] key identifier of key to get value for + * \param[in] vallen size of \a val + * \param[out] val pointer to storage location for value + * \param[in] lsm optional striping metadata of object + * + * \retval 0 on success + * \retval negative negated errno on failure + */ +static int lmv_get_info(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, __u32 *vallen, void *val) +{ + struct obd_device *obd; + struct lmv_obd *lmv; + struct lu_tgt_desc *tgt; + int rc = 0; + + ENTRY; + + obd = class_exp2obd(exp); + if (obd == NULL) { + CDEBUG(D_IOCTL, "Invalid client cookie %#llx\n", + exp->exp_handle.h_cookie); + RETURN(-EINVAL); + } + + lmv = &obd->u.lmv; + if (keylen >= strlen("remote_flag") && !strcmp(key, "remote_flag")) { + LASSERT(*vallen == sizeof(__u32)); + lmv_foreach_connected_tgt(lmv, tgt) { + if (!obd_get_info(env, tgt->ltd_exp, keylen, key, + vallen, val)) + RETURN(0); + } + RETURN(-EINVAL); + } else if (KEY_IS(KEY_MAX_EASIZE) || + KEY_IS(KEY_DEFAULT_EASIZE) || + KEY_IS(KEY_CONN_DATA)) { + /* + * Forwarding this request to first MDS, it should know LOV + * desc. + */ + tgt = lmv_tgt(lmv, 0); + if (!tgt) + RETURN(-ENODEV); + + rc = obd_get_info(env, tgt->ltd_exp, keylen, key, vallen, val); + if (!rc && KEY_IS(KEY_CONN_DATA)) + exp->exp_connect_data = *(struct obd_connect_data *)val; + RETURN(rc); + } else if (KEY_IS(KEY_TGT_COUNT)) { + *((int *)val) = lmv->lmv_mdt_descs.ltd_lmv_desc.ld_tgt_count; + RETURN(0); + } + + CDEBUG(D_IOCTL, "Invalid key\n"); + RETURN(-EINVAL); +} + +static int lmv_rmfid(struct obd_export *exp, struct fid_array *fa, + int *__rcs, struct ptlrpc_request_set *_set) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request_set *set = _set; + struct lmv_obd *lmv = &obd->u.lmv; + int tgt_count = lmv->lmv_mdt_count; + struct lu_tgt_desc *tgt; + struct fid_array *fat, **fas = NULL; + int i, rc, **rcs = NULL; + + if (!set) { + set = ptlrpc_prep_set(); + if (!set) + RETURN(-ENOMEM); + } + + /* split FIDs by targets */ + OBD_ALLOC_PTR_ARRAY(fas, tgt_count); + if (fas == NULL) + GOTO(out, rc = -ENOMEM); + OBD_ALLOC_PTR_ARRAY(rcs, tgt_count); + if (rcs == NULL) + GOTO(out_fas, rc = -ENOMEM); + + for (i = 0; i < fa->fa_nr; i++) { + unsigned int idx; + + rc = lmv_fld_lookup(lmv, &fa->fa_fids[i], &idx); + if (rc) { + CDEBUG(D_OTHER, "can't lookup "DFID": rc = %d\n", + PFID(&fa->fa_fids[i]), rc); + continue; + } + LASSERT(idx < tgt_count); + if (!fas[idx]) + OBD_ALLOC(fas[idx], offsetof(struct fid_array, + fa_fids[fa->fa_nr])); + if (!fas[idx]) + GOTO(out, rc = -ENOMEM); + if (!rcs[idx]) + OBD_ALLOC_PTR_ARRAY(rcs[idx], fa->fa_nr); + if (!rcs[idx]) + GOTO(out, rc = -ENOMEM); + + fat = fas[idx]; + fat->fa_fids[fat->fa_nr++] = fa->fa_fids[i]; + } + + lmv_foreach_connected_tgt(lmv, tgt) { + fat = fas[tgt->ltd_index]; + if (!fat || fat->fa_nr == 0) + continue; + rc = md_rmfid(tgt->ltd_exp, fat, rcs[tgt->ltd_index], set); + } + + rc = ptlrpc_set_wait(NULL, set); + if (rc == 0) { + int j = 0; + for (i = 0; i < tgt_count; i++) { + fat = fas[i]; + if (!fat || fat->fa_nr == 0) + continue; + /* copy FIDs back */ + memcpy(fa->fa_fids + j, fat->fa_fids, + fat->fa_nr * sizeof(struct lu_fid)); + /* copy rcs back */ + memcpy(__rcs + j, rcs[i], fat->fa_nr * sizeof(**rcs)); + j += fat->fa_nr; + } + } + if (set != _set) + ptlrpc_set_destroy(set); + +out: + for (i = 0; i < tgt_count; i++) { + if 
(fas && fas[i]) + OBD_FREE(fas[i], offsetof(struct fid_array, + fa_fids[fa->fa_nr])); + if (rcs && rcs[i]) + OBD_FREE_PTR_ARRAY(rcs[i], fa->fa_nr); + } + if (rcs) + OBD_FREE_PTR_ARRAY(rcs, tgt_count); +out_fas: + if (fas) + OBD_FREE_PTR_ARRAY(fas, tgt_count); + + RETURN(rc); +} + +/** + * Asynchronously set by key a value associated with a LMV device. + * + * Dispatch request to lower-layer devices as needed. + * + * \param[in] env execution environment for this thread + * \param[in] exp export for the LMV device + * \param[in] keylen length of key identifier + * \param[in] key identifier of key to store value for + * \param[in] vallen size of value to store + * \param[in] val pointer to data to be stored + * \param[in] set optional list of related ptlrpc requests + * + * \retval 0 on success + * \retval negative negated errno on failure + */ +static int lmv_set_info_async(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, __u32 vallen, void *val, + struct ptlrpc_request_set *set) +{ + struct lmv_tgt_desc *tgt; + struct obd_device *obd; + struct lmv_obd *lmv; + int rc = 0; + ENTRY; + + obd = class_exp2obd(exp); + if (obd == NULL) { + CDEBUG(D_IOCTL, "Invalid client cookie %#llx\n", + exp->exp_handle.h_cookie); + RETURN(-EINVAL); + } + lmv = &obd->u.lmv; + + if (KEY_IS(KEY_READ_ONLY) || KEY_IS(KEY_FLUSH_CTX) || + KEY_IS(KEY_DEFAULT_EASIZE)) { + int err = 0; + + lmv_foreach_connected_tgt(lmv, tgt) { + err = obd_set_info_async(env, tgt->ltd_exp, + keylen, key, vallen, val, set); + if (err && rc == 0) + rc = err; + } + + RETURN(rc); + } + + RETURN(-EINVAL); +} + +static int lmv_unpack_md_v1(struct obd_export *exp, struct lmv_stripe_md *lsm, + const struct lmv_mds_md_v1 *lmm1) +{ + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + int stripe_count; + int cplen; + int i; + int rc = 0; + ENTRY; + + lsm->lsm_md_magic = le32_to_cpu(lmm1->lmv_magic); + lsm->lsm_md_stripe_count = le32_to_cpu(lmm1->lmv_stripe_count); + lsm->lsm_md_master_mdt_index = le32_to_cpu(lmm1->lmv_master_mdt_index); + if (OBD_FAIL_CHECK(OBD_FAIL_UNKNOWN_LMV_STRIPE)) + lsm->lsm_md_hash_type = LMV_HASH_TYPE_UNKNOWN; + else + lsm->lsm_md_hash_type = le32_to_cpu(lmm1->lmv_hash_type); + lsm->lsm_md_layout_version = le32_to_cpu(lmm1->lmv_layout_version); + lsm->lsm_md_migrate_offset = le32_to_cpu(lmm1->lmv_migrate_offset); + lsm->lsm_md_migrate_hash = le32_to_cpu(lmm1->lmv_migrate_hash); + cplen = strlcpy(lsm->lsm_md_pool_name, lmm1->lmv_pool_name, + sizeof(lsm->lsm_md_pool_name)); + + if (cplen >= sizeof(lsm->lsm_md_pool_name)) + RETURN(-E2BIG); + + CDEBUG(D_INFO, "unpack lsm count %d/%d, master %d hash_type %#x/%#x " + "layout_version %d\n", lsm->lsm_md_stripe_count, + lsm->lsm_md_migrate_offset, lsm->lsm_md_master_mdt_index, + lsm->lsm_md_hash_type, lsm->lsm_md_migrate_hash, + lsm->lsm_md_layout_version); + + stripe_count = le32_to_cpu(lmm1->lmv_stripe_count); + for (i = 0; i < stripe_count; i++) { + fid_le_to_cpu(&lsm->lsm_md_oinfo[i].lmo_fid, + &lmm1->lmv_stripe_fids[i]); + /* + * set default value -1, so lmv_locate_tgt() knows this stripe + * target is not initialized. 
+ */ + lsm->lsm_md_oinfo[i].lmo_mds = LMV_OFFSET_DEFAULT; + if (!fid_is_sane(&lsm->lsm_md_oinfo[i].lmo_fid)) + continue; + + rc = lmv_fld_lookup(lmv, &lsm->lsm_md_oinfo[i].lmo_fid, + &lsm->lsm_md_oinfo[i].lmo_mds); + if (rc == -ENOENT) + continue; + + if (rc) + RETURN(rc); + + CDEBUG(D_INFO, "unpack fid #%d "DFID"\n", i, + PFID(&lsm->lsm_md_oinfo[i].lmo_fid)); + } + + RETURN(rc); +} + +static inline int lmv_unpack_user_md(struct obd_export *exp, + struct lmv_stripe_md *lsm, + const struct lmv_user_md *lmu) +{ + lsm->lsm_md_magic = le32_to_cpu(lmu->lum_magic); + lsm->lsm_md_stripe_count = le32_to_cpu(lmu->lum_stripe_count); + lsm->lsm_md_master_mdt_index = le32_to_cpu(lmu->lum_stripe_offset); + lsm->lsm_md_hash_type = le32_to_cpu(lmu->lum_hash_type); + lsm->lsm_md_max_inherit = lmu->lum_max_inherit; + lsm->lsm_md_max_inherit_rr = lmu->lum_max_inherit_rr; + lsm->lsm_md_pool_name[LOV_MAXPOOLNAME] = 0; + + return 0; +} + +static int lmv_unpackmd(struct obd_export *exp, struct lmv_stripe_md **lsmp, + const union lmv_mds_md *lmm, size_t lmm_size) +{ + struct lmv_stripe_md *lsm; + int lsm_size; + int rc; + bool allocated = false; + ENTRY; + + LASSERT(lsmp != NULL); + + lsm = *lsmp; + /* Free memmd */ + if (lsm != NULL && lmm == NULL) { + int i; + struct lmv_foreign_md *lfm = (struct lmv_foreign_md *)lsm; + + if (lfm->lfm_magic == LMV_MAGIC_FOREIGN) { + size_t lfm_size; + + lfm_size = lfm->lfm_length + offsetof(typeof(*lfm), + lfm_value[0]); + OBD_FREE_LARGE(lfm, lfm_size); + RETURN(0); + } + + if (lmv_dir_striped(lsm)) { + for (i = 0; i < lsm->lsm_md_stripe_count; i++) + iput(lsm->lsm_md_oinfo[i].lmo_root); + lsm_size = lmv_stripe_md_size(lsm->lsm_md_stripe_count); + } else { + lsm_size = lmv_stripe_md_size(0); + } + OBD_FREE(lsm, lsm_size); + *lsmp = NULL; + RETURN(0); + } + + /* foreign lmv case */ + if (le32_to_cpu(lmm->lmv_magic) == LMV_MAGIC_FOREIGN) { + struct lmv_foreign_md *lfm = (struct lmv_foreign_md *)lsm; + + if (lfm == NULL) { + OBD_ALLOC_LARGE(lfm, lmm_size); + if (lfm == NULL) + RETURN(-ENOMEM); + *lsmp = (struct lmv_stripe_md *)lfm; + } + lfm->lfm_magic = le32_to_cpu(lmm->lmv_foreign_md.lfm_magic); + lfm->lfm_length = le32_to_cpu(lmm->lmv_foreign_md.lfm_length); + lfm->lfm_type = le32_to_cpu(lmm->lmv_foreign_md.lfm_type); + lfm->lfm_flags = le32_to_cpu(lmm->lmv_foreign_md.lfm_flags); + memcpy(&lfm->lfm_value, &lmm->lmv_foreign_md.lfm_value, + lfm->lfm_length); + RETURN(lmm_size); + } + + if (le32_to_cpu(lmm->lmv_magic) == LMV_MAGIC_STRIPE) + RETURN(-EPERM); + + /* Unpack memmd */ + if (le32_to_cpu(lmm->lmv_magic) != LMV_MAGIC_V1 && + le32_to_cpu(lmm->lmv_magic) != LMV_USER_MAGIC) { + CERROR("%s: invalid lmv magic %x: rc = %d\n", + exp->exp_obd->obd_name, le32_to_cpu(lmm->lmv_magic), + -EIO); + RETURN(-EIO); + } + + if (le32_to_cpu(lmm->lmv_magic) == LMV_MAGIC_V1) + lsm_size = lmv_stripe_md_size(lmv_mds_md_stripe_count_get(lmm)); + else + /** + * Unpack default dirstripe(lmv_user_md) to lmv_stripe_md, + * stripecount should be 0 then. 
+ */ + lsm_size = lmv_stripe_md_size(0); + + if (lsm == NULL) { + OBD_ALLOC(lsm, lsm_size); + if (lsm == NULL) + RETURN(-ENOMEM); + allocated = true; + *lsmp = lsm; + } + + switch (le32_to_cpu(lmm->lmv_magic)) { + case LMV_MAGIC_V1: + rc = lmv_unpack_md_v1(exp, lsm, &lmm->lmv_md_v1); + break; + case LMV_USER_MAGIC: + rc = lmv_unpack_user_md(exp, lsm, &lmm->lmv_user_md); + break; + default: + CERROR("%s: unrecognized magic %x\n", exp->exp_obd->obd_name, + le32_to_cpu(lmm->lmv_magic)); + rc = -EINVAL; + break; + } + + if (rc != 0 && allocated) { + OBD_FREE(lsm, lsm_size); + *lsmp = NULL; + lsm_size = rc; + } + RETURN(lsm_size); +} + +void lmv_free_memmd(struct lmv_stripe_md *lsm) +{ + lmv_unpackmd(NULL, &lsm, NULL, 0); +} +EXPORT_SYMBOL(lmv_free_memmd); + +static int lmv_cancel_unused(struct obd_export *exp, const struct lu_fid *fid, + union ldlm_policy_data *policy, + enum ldlm_mode mode, enum ldlm_cancel_flags flags, + void *opaque) +{ + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + struct lu_tgt_desc *tgt; + int err; + int rc = 0; + + ENTRY; + + LASSERT(fid != NULL); + + lmv_foreach_connected_tgt(lmv, tgt) { + if (!tgt->ltd_active) + continue; + + err = md_cancel_unused(tgt->ltd_exp, fid, policy, mode, flags, + opaque); + if (!rc) + rc = err; + } + RETURN(rc); +} + +static int lmv_set_lock_data(struct obd_export *exp, + const struct lustre_handle *lockh, + void *data, __u64 *bits) +{ + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + struct lmv_tgt_desc *tgt = lmv_tgt(lmv, 0); + int rc; + + ENTRY; + + if (tgt == NULL || tgt->ltd_exp == NULL) + RETURN(-EINVAL); + rc = md_set_lock_data(tgt->ltd_exp, lockh, data, bits); + RETURN(rc); +} + +static enum ldlm_mode +lmv_lock_match(struct obd_export *exp, __u64 flags, + const struct lu_fid *fid, enum ldlm_type type, + union ldlm_policy_data *policy, + enum ldlm_mode mode, struct lustre_handle *lockh) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + enum ldlm_mode rc; + struct lu_tgt_desc *tgt; + int i; + int index; + + ENTRY; + + CDEBUG(D_INODE, "Lock match for "DFID"\n", PFID(fid)); + + /* + * With DNE every object can have two locks in different namespaces: + * lookup lock in space of MDT storing direntry and update/open lock in + * space of MDT storing inode. Try the MDT that the FID maps to first, + * since this can be easily found, and only try others if that fails. 
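+	 * The loop below starts at lmv_fid2tgt_index() and visits every
+	 * configured target once, wrapping around modulo ltd_tgts_size.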
+ */ + for (i = 0, index = lmv_fid2tgt_index(lmv, fid); + i < lmv->lmv_mdt_descs.ltd_tgts_size; + i++, index = (index + 1) % lmv->lmv_mdt_descs.ltd_tgts_size) { + if (index < 0) { + CDEBUG(D_HA, "%s: "DFID" is inaccessible: rc = %d\n", + obd->obd_name, PFID(fid), index); + index = 0; + } + + tgt = lmv_tgt(lmv, index); + if (!tgt || !tgt->ltd_exp || !tgt->ltd_active) + continue; + + rc = md_lock_match(tgt->ltd_exp, flags, fid, type, policy, mode, + lockh); + if (rc) + RETURN(rc); + } + + RETURN(0); +} + +static int +lmv_get_lustre_md(struct obd_export *exp, struct req_capsule *pill, + struct obd_export *dt_exp, struct obd_export *md_exp, + struct lustre_md *md) +{ + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + struct lmv_tgt_desc *tgt = lmv_tgt(lmv, 0); + + if (!tgt || !tgt->ltd_exp) + return -EINVAL; + + return md_get_lustre_md(tgt->ltd_exp, pill, dt_exp, md_exp, md); +} + +static int lmv_free_lustre_md(struct obd_export *exp, struct lustre_md *md) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = lmv_tgt(lmv, 0); + + ENTRY; + + if (md->default_lmv) { + lmv_free_memmd(md->default_lmv); + md->default_lmv = NULL; + } + if (md->lmv != NULL) { + lmv_free_memmd(md->lmv); + md->lmv = NULL; + } + if (!tgt || !tgt->ltd_exp) + RETURN(-EINVAL); + RETURN(md_free_lustre_md(tgt->ltd_exp, md)); +} + +static int lmv_set_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och, + struct lookup_intent *it) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + + ENTRY; + + tgt = lmv_fid2tgt(lmv, &och->och_fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + RETURN(md_set_open_replay_data(tgt->ltd_exp, och, it)); +} + +static int lmv_clear_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + + ENTRY; + + tgt = lmv_fid2tgt(lmv, &och->och_fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + RETURN(md_clear_open_replay_data(tgt->ltd_exp, och)); +} + +static int lmv_intent_getattr_async(struct obd_export *exp, + struct md_enqueue_info *minfo) +{ + struct md_op_data *op_data = &minfo->mi_data; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *ptgt; + struct lmv_tgt_desc *ctgt; + int rc; + + ENTRY; + + if (!fid_is_sane(&op_data->op_fid2)) + RETURN(-EINVAL); + + ptgt = lmv_locate_tgt(lmv, op_data); + if (IS_ERR(ptgt)) + RETURN(PTR_ERR(ptgt)); + + ctgt = lmv_fid2tgt(lmv, &op_data->op_fid2); + if (IS_ERR(ctgt)) + RETURN(PTR_ERR(ctgt)); + + /* + * remote object needs two RPCs to lookup and getattr, considering the + * complexity don't support statahead for now. 
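+	 * Returning -EREMOTE lets the caller fall back to a plain
+	 * synchronous getattr instead.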
+ */ + if (ctgt != ptgt) + RETURN(-EREMOTE); + + rc = md_intent_getattr_async(ptgt->ltd_exp, minfo); + + RETURN(rc); +} + +static int lmv_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, + struct lu_fid *fid, __u64 *bits) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + ENTRY; + + tgt = lmv_fid2tgt(lmv, fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = md_revalidate_lock(tgt->ltd_exp, it, fid, bits); + RETURN(rc); +} + +static int lmv_get_fid_from_lsm(struct obd_export *exp, + const struct lmv_stripe_md *lsm, + const char *name, int namelen, + struct lu_fid *fid) +{ + const struct lmv_oinfo *oinfo; + + LASSERT(lmv_dir_striped(lsm)); + + oinfo = lsm_name_to_stripe_info(lsm, name, namelen, false); + if (IS_ERR(oinfo)) + return PTR_ERR(oinfo); + + *fid = oinfo->lmo_fid; + + RETURN(0); +} + +/** + * For lmv, only need to send request to master MDT, and the master MDT will + * process with other slave MDTs. The only exception is Q_GETOQUOTA for which + * we directly fetch data from the slave MDTs. + */ +static int lmv_quotactl(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = lmv_tgt(lmv, 0); + __u64 curspace, curinodes; + int rc = 0; + + ENTRY; + + if (!tgt || !tgt->ltd_exp || !tgt->ltd_active) { + CERROR("master lmv inactive\n"); + RETURN(-EIO); + } + + if (oqctl->qc_cmd != Q_GETOQUOTA) { + rc = obd_quotactl(tgt->ltd_exp, oqctl); + RETURN(rc); + } + + curspace = curinodes = 0; + lmv_foreach_connected_tgt(lmv, tgt) { + int err; + + if (!tgt->ltd_active) + continue; + + err = obd_quotactl(tgt->ltd_exp, oqctl); + if (err) { + CERROR("getquota on mdt %d failed. %d\n", + tgt->ltd_index, err); + if (!rc) + rc = err; + } else { + curspace += oqctl->qc_dqblk.dqb_curspace; + curinodes += oqctl->qc_dqblk.dqb_curinodes; + } + } + oqctl->qc_dqblk.dqb_curspace = curspace; + oqctl->qc_dqblk.dqb_curinodes = curinodes; + + RETURN(rc); +} + +static int lmv_merge_attr(struct obd_export *exp, + const struct lmv_stripe_md *lsm, + struct cl_attr *attr, + ldlm_blocking_callback cb_blocking) +{ + int rc; + int i; + + if (!lmv_dir_striped(lsm)) + return 0; + + rc = lmv_revalidate_slaves(exp, lsm, cb_blocking, 0); + if (rc < 0) + return rc; + + for (i = 0; i < lsm->lsm_md_stripe_count; i++) { + struct inode *inode = lsm->lsm_md_oinfo[i].lmo_root; + + if (!inode) + continue; + + CDEBUG(D_INFO, + "" DFID " size %llu, blocks %llu nlink %u, atime %lld ctime %lld, mtime %lld.\n", + PFID(&lsm->lsm_md_oinfo[i].lmo_fid), + i_size_read(inode), (unsigned long long)inode->i_blocks, + inode->i_nlink, (s64)inode->i_atime.tv_sec, + (s64)inode->i_ctime.tv_sec, (s64)inode->i_mtime.tv_sec); + + /* for slave stripe, it needs to subtract nlink for . and .. 
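+		 * entries: every stripe carries its own "." and "..", but they
+		 * must be counted only once for the directory as a whole.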
*/ + if (i != 0) + attr->cat_nlink += inode->i_nlink - 2; + else + attr->cat_nlink = inode->i_nlink; + + attr->cat_size += i_size_read(inode); + attr->cat_blocks += inode->i_blocks; + + if (attr->cat_atime < inode->i_atime.tv_sec) + attr->cat_atime = inode->i_atime.tv_sec; + + if (attr->cat_ctime < inode->i_ctime.tv_sec) + attr->cat_ctime = inode->i_ctime.tv_sec; + + if (attr->cat_mtime < inode->i_mtime.tv_sec) + attr->cat_mtime = inode->i_mtime.tv_sec; + } + return 0; +} + +static const struct obd_ops lmv_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = lmv_setup, + .o_cleanup = lmv_cleanup, + .o_precleanup = lmv_precleanup, + .o_process_config = lmv_process_config, + .o_connect = lmv_connect, + .o_disconnect = lmv_disconnect, + .o_statfs = lmv_statfs, + .o_get_info = lmv_get_info, + .o_set_info_async = lmv_set_info_async, + .o_notify = lmv_notify, + .o_get_uuid = lmv_get_uuid, + .o_fid_alloc = lmv_fid_alloc, + .o_iocontrol = lmv_iocontrol, + .o_quotactl = lmv_quotactl +}; + +static const struct md_ops lmv_md_ops = { + .m_get_root = lmv_get_root, + .m_null_inode = lmv_null_inode, + .m_close = lmv_close, + .m_create = lmv_create, + .m_enqueue = lmv_enqueue, + .m_getattr = lmv_getattr, + .m_getxattr = lmv_getxattr, + .m_getattr_name = lmv_getattr_name, + .m_intent_lock = lmv_intent_lock, + .m_link = lmv_link, + .m_rename = lmv_rename, + .m_setattr = lmv_setattr, + .m_setxattr = lmv_setxattr, + .m_fsync = lmv_fsync, + .m_file_resync = lmv_file_resync, + .m_read_page = lmv_read_page, + .m_unlink = lmv_unlink, + .m_init_ea_size = lmv_init_ea_size, + .m_cancel_unused = lmv_cancel_unused, + .m_set_lock_data = lmv_set_lock_data, + .m_lock_match = lmv_lock_match, + .m_get_lustre_md = lmv_get_lustre_md, + .m_free_lustre_md = lmv_free_lustre_md, + .m_merge_attr = lmv_merge_attr, + .m_set_open_replay_data = lmv_set_open_replay_data, + .m_clear_open_replay_data = lmv_clear_open_replay_data, + .m_intent_getattr_async = lmv_intent_getattr_async, + .m_revalidate_lock = lmv_revalidate_lock, + .m_get_fid_from_lsm = lmv_get_fid_from_lsm, + .m_unpackmd = lmv_unpackmd, + .m_rmfid = lmv_rmfid, +}; + +static int __init lmv_init(void) +{ + return class_register_type(&lmv_obd_ops, &lmv_md_ops, true, + LUSTRE_LMV_NAME, NULL); +} + +static void __exit lmv_exit(void) +{ + class_unregister_type(LUSTRE_LMV_NAME); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre Logical Metadata Volume"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(lmv_init); +module_exit(lmv_exit); diff --git a/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c b/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c new file mode 100644 index 0000000000000..c2a5db2f9daf1 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c @@ -0,0 +1,322 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include + +#include "lmv_internal.h" + +static ssize_t numobd_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%u\n", obd->u.lmv.lmv_mdt_count); +} +LUSTRE_RO_ATTR(numobd); + +static ssize_t activeobd_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%u\n", + obd->u.lmv.lmv_mdt_descs.ltd_lmv_desc.ld_active_tgt_count); +} +LUSTRE_RO_ATTR(activeobd); + +static ssize_t desc_uuid_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%s\n", + obd->u.lmv.lmv_mdt_descs.ltd_lmv_desc.ld_uuid.uuid); +} +LUSTRE_RO_ATTR(desc_uuid); + +static ssize_t qos_maxage_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%u\n", + obd->u.lmv.lmv_mdt_descs.ltd_lmv_desc.ld_qos_maxage); +} + +static ssize_t qos_maxage_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 0, &val); + if (rc) + return rc; + + obd->u.lmv.lmv_mdt_descs.ltd_lmv_desc.ld_qos_maxage = val; + + return count; +} +LUSTRE_RW_ATTR(qos_maxage); + +static ssize_t qos_prio_free_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%u%%\n", + (obd->u.lmv.lmv_qos.lq_prio_free * 100 + 255) >> 8); +} + +static ssize_t qos_prio_free_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lmv_obd *lmv = &obd->u.lmv; + char buf[6], *tmp; + unsigned int val; + int rc; + + /* "100%\n\0" should be largest string */ + if (count >= sizeof(buf)) + return -ERANGE; + + strncpy(buf, buffer, sizeof(buf)); + buf[sizeof(buf) - 1] = '\0'; + tmp = strchr(buf, '%'); + if (tmp) + *tmp = '\0'; + + rc = kstrtouint(buf, 0, &val); + if (rc) + return rc; + + if (val > 100) + return -EINVAL; + + lmv->lmv_qos.lq_prio_free = (val << 8) / 100; + set_bit(LQ_DIRTY, &lmv->lmv_qos.lq_flags); + set_bit(LQ_RESET, &lmv->lmv_qos.lq_flags); + + return count; +} +LUSTRE_RW_ATTR(qos_prio_free); + +static ssize_t qos_threshold_rr_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%u%%\n", + (obd->u.lmv.lmv_qos.lq_threshold_rr * 100 + + (QOS_THRESHOLD_MAX - 1)) / 
QOS_THRESHOLD_MAX); +} + +static ssize_t qos_threshold_rr_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lmv_obd *lmv = &obd->u.lmv; + char buf[6], *tmp; + unsigned int val; + int rc; + + /* "100%\n\0" should be largest string */ + if (count >= sizeof(buf)) + return -ERANGE; + + strncpy(buf, buffer, sizeof(buf)); + buf[sizeof(buf) - 1] = '\0'; + tmp = strchr(buf, '%'); + if (tmp) + *tmp = '\0'; + + rc = kstrtouint(buf, 0, &val); + if (rc) + return rc; + + if (val > 100) + return -EINVAL; + + lmv->lmv_qos.lq_threshold_rr = (val * QOS_THRESHOLD_MAX) / 100; + set_bit(LQ_DIRTY, &lmv->lmv_qos.lq_flags); + + return count; +} +LUSTRE_RW_ATTR(qos_threshold_rr); + +#ifdef CONFIG_PROC_FS +static void *lmv_tgt_seq_start(struct seq_file *p, loff_t *pos) +{ + struct obd_device *obd = p->private; + struct lmv_obd *lmv = &obd->u.lmv; + struct lu_tgt_desc *tgt; + + while (*pos < lmv->lmv_mdt_descs.ltd_tgts_size) { + tgt = lmv_tgt(lmv, (__u32)*pos); + if (tgt) + return tgt; + + ++*pos; + } + + return NULL; +} + +static void lmv_tgt_seq_stop(struct seq_file *p, void *v) +{ +} + +static void *lmv_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct obd_device *obd = p->private; + struct lmv_obd *lmv = &obd->u.lmv; + struct lu_tgt_desc *tgt; + + ++*pos; + while (*pos < lmv->lmv_mdt_descs.ltd_tgts_size) { + tgt = lmv_tgt(lmv, (__u32)*pos); + if (tgt) + return tgt; + + ++*pos; + } + + return NULL; +} + +static int lmv_tgt_seq_show(struct seq_file *p, void *v) +{ + struct lmv_tgt_desc *tgt = v; + + if (!tgt) + return 0; + + seq_printf(p, "%u: %s %sACTIVE\n", + tgt->ltd_index, tgt->ltd_uuid.uuid, + tgt->ltd_active ? "" : "IN"); + return 0; +} + +static const struct seq_operations lmv_tgt_sops = { + .start = lmv_tgt_seq_start, + .stop = lmv_tgt_seq_stop, + .next = lmv_tgt_seq_next, + .show = lmv_tgt_seq_show, +}; + +static int lmv_target_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc; + + rc = seq_open(file, &lmv_tgt_sops); + if (rc) + return rc; + + seq = file->private_data; + seq->private = pde_data(inode); + return 0; +} + +static const struct proc_ops lmv_proc_target_fops = { + PROC_OWNER(THIS_MODULE) + .proc_open = lmv_target_seq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, +}; +#endif /* CONFIG_PROC_FS */ + +static struct attribute *lmv_attrs[] = { + &lustre_attr_activeobd.attr, + &lustre_attr_desc_uuid.attr, + &lustre_attr_numobd.attr, + &lustre_attr_qos_maxage.attr, + &lustre_attr_qos_prio_free.attr, + &lustre_attr_qos_threshold_rr.attr, + NULL, +}; + +KOBJ_ATTRIBUTE_GROUPS(lmv); /* creates lmv_groups */ + +int lmv_tunables_init(struct obd_device *obd) +{ + int rc; + + obd->obd_ktype.default_groups = KOBJ_ATTR_GROUPS(lmv); + rc = lprocfs_obd_setup(obd, true); + if (rc) + goto out_failed; +#ifdef CONFIG_PROC_FS + rc = lprocfs_alloc_md_stats(obd, 0); + if (rc) { + lprocfs_obd_cleanup(obd); + goto out_failed; + } + + rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd", + 0444, &lmv_proc_target_fops, obd); + if (rc) { + lprocfs_free_md_stats(obd); + lprocfs_obd_cleanup(obd); + CWARN("%s: error adding LMV target_obd file: rc = %d\n", + obd->obd_name, rc); + rc = 0; + } +#endif /* CONFIG_PROC_FS */ +out_failed: + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_cl_internal.h b/drivers/staging/lustrefsx/lustre/lov/lov_cl_internal.h new file mode 100644 
index 0000000000000..1f5669800d62f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_cl_internal.h @@ -0,0 +1,815 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Internal interfaces of LOV layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#ifndef LOV_CL_INTERNAL_H +#define LOV_CL_INTERNAL_H + +#include +#include +#include +#include "lov_internal.h" + +/** \defgroup lov lov + * Logical object volume layer. This layer implements data striping (raid0). + * + * At the lov layer top-entity (object, page, lock, io) is connected to one or + * more sub-entities: top-object, representing a file is connected to a set of + * sub-objects, each representing a stripe, file-level top-lock is connected + * to a set of per-stripe sub-locks, top-page is connected to a (single) + * sub-page, and a top-level IO is connected to a set of (potentially + * concurrent) sub-IO's. + * + * Sub-object, sub-page, and sub-io have well-defined top-object and top-page + * respectively, while a single sub-lock can be part of multiple top-locks. + * + * Reference counting models are different for different types of entities: + * + * - top-object keeps a reference to its sub-objects, and destroys them + * when it is destroyed. + * + * - top-page keeps a reference to its sub-page, and destroys it when it + * is destroyed. + * + * - IO's are not reference counted. + * + * To implement a connection between top and sub entities, lov layer is split + * into two pieces: lov ("upper half"), and lovsub ("bottom half"), both + * implementing full set of cl-interfaces. For example, top-object has vvp and + * lov layers, and it's sub-object has lovsub and osc layers. lovsub layer is + * used to track child-parent relationship. + * + * @{ + */ + +struct lovsub_device; +struct lovsub_object; + +enum lov_device_flags { + LOV_DEV_INITIALIZED = BIT(0), +}; + +/* + * Upper half. + */ + +/* Data-on-MDT array item in lov_device::ld_md_tgts[] */ +struct lovdom_device { + struct cl_device *ldm_mdc; + int ldm_idx; +}; + +struct lov_device { + /* + * XXX Locking of lov-private data is missing. 
+ */ + struct cl_device ld_cl; + struct lov_obd *ld_lov; + /** size of lov_device::ld_target[] array */ + __u32 ld_target_nr; + struct lovsub_device **ld_target; + __u32 ld_flags; + + /* Data-on-MDT devices */ + __u32 ld_md_tgts_nr; + struct lovdom_device *ld_md_tgts; + struct obd_device *ld_lmv; + /* LU site for subdevices */ + struct lu_site ld_site; +}; + +/** + * Layout type. + */ +enum lov_layout_type { + LLT_EMPTY, /** empty file without body (mknod + truncate) */ + LLT_RELEASED, /** file with no objects (data in HSM) */ + LLT_COMP, /** support composite layout */ + LLT_FOREIGN, /** foreign layout */ + LLT_NR +}; + +static inline char *llt2str(enum lov_layout_type llt) +{ + switch (llt) { + case LLT_EMPTY: + return "EMPTY"; + case LLT_RELEASED: + return "RELEASED"; + case LLT_COMP: + return "COMPOSITE"; + case LLT_FOREIGN: + return "FOREIGN"; + case LLT_NR: + LBUG(); + } + LBUG(); + return ""; +} + +/** + * Return lov_layout_entry_type associated with a given composite layout + * entry. + */ +static inline __u32 lov_entry_type(struct lov_stripe_md_entry *lsme) +{ + if ((lov_pattern(lsme->lsme_pattern) & LOV_PATTERN_RAID0) || + (lov_pattern(lsme->lsme_pattern) == LOV_PATTERN_MDT) || + (lov_pattern(lsme->lsme_pattern) == LOV_PATTERN_FOREIGN)) + return lov_pattern(lsme->lsme_pattern & + ~LOV_PATTERN_OVERSTRIPING); + return 0; +} + +struct lov_layout_entry; +struct lov_object; +struct lov_lock_sub; + +struct lov_comp_layout_entry_ops { + int (*lco_init)(const struct lu_env *env, struct lov_device *dev, + struct lov_object *lov, unsigned int index, + const struct cl_object_conf *conf, + struct lov_layout_entry *lle); + void (*lco_fini)(const struct lu_env *env, + struct lov_layout_entry *lle); + int (*lco_getattr)(const struct lu_env *env, struct lov_object *obj, + unsigned int index, struct lov_layout_entry *lle, + struct cl_attr **attr); +}; + +struct lov_layout_raid0 { + unsigned lo_nr; + /** + * When this is true, lov_object::lo_attr contains + * valid up to date attributes for a top-level + * object. This field is reset to 0 when attributes of + * any sub-object change. + */ + bool lo_attr_valid; + /** + * Array of sub-objects. Allocated when top-object is + * created (lov_init_raid0()). + * + * Top-object is a strict master of its sub-objects: + * it is created before them, and outlives its + * children (this later is necessary so that basic + * functions like cl_object_top() always + * work). Top-object keeps a reference on every + * sub-object. + * + * When top-object is destroyed (lov_delete_raid0()) + * it releases its reference to a sub-object and waits + * until the latter is finally destroyed. + */ + struct lovsub_object **lo_sub; + /** + * protect lo_sub + */ + spinlock_t lo_sub_lock; + /** + * Cached object attribute, built from sub-object + * attributes. + */ + struct cl_attr lo_attr; +}; + +struct lov_layout_dom { + /* keep this always at first place so DOM layout entry + * can be addressed also as RAID0 after initialization. 
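+	 * With lo_dom_r0 placed first, lle_dom and lle_raid0 share the
+	 * same address inside the union in struct lov_layout_entry.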
+ */ + struct lov_layout_raid0 lo_dom_r0; + struct lovsub_object *lo_dom; + struct lov_oinfo *lo_loi; +}; + +struct lov_layout_entry { + __u32 lle_type; + unsigned int lle_valid:1; + unsigned int lle_preference; + struct lu_extent *lle_extent; + struct lov_stripe_md_entry *lle_lsme; + struct lov_comp_layout_entry_ops *lle_comp_ops; + union { + struct lov_layout_raid0 lle_raid0; + struct lov_layout_dom lle_dom; + }; +}; + +struct lov_mirror_entry { + unsigned short lre_mirror_id; + unsigned short lre_stale:1, /* set if any components is stale */ + lre_valid:1, /* set if at least one of components + * in this mirror is valid */ + lre_foreign:1; /* set if it is a foreign component */ + int lre_preference; /* overall preference of this mirror */ + + unsigned short lre_start; /* index to lo_entries, start index of + * this mirror */ + unsigned short lre_end; /* end index of this mirror */ +}; + +enum lov_object_flags { + /* Layout is invalid, set when layout lock is lost */ + LO_LAYOUT_INVALID = 0x1, +}; + +/** + * lov-specific file state. + * + * lov object has particular layout type, determining how top-object is built + * on top of sub-objects. Layout type can change dynamically. When this + * happens, lov_object::lo_type_guard semaphore is taken in exclusive mode, + * all state pertaining to the old layout type is destroyed, and new state is + * constructed. All object methods take said semaphore in the shared mode, + * providing serialization against transition between layout types. + * + * To avoid multiple `if' or `switch' statements, selecting behavior for the + * current layout type, object methods perform double-dispatch, invoking + * function corresponding to the current layout type. + */ +struct lov_object { + struct cl_object lo_cl; + /** + * Serializes object operations with transitions between layout types. + * + * This semaphore is taken in shared mode by all object methods, and + * is taken in exclusive mode when object type is changed. + * + * \see lov_object::lo_type + */ + struct rw_semaphore lo_type_guard; + /** + * Type of an object. Protected by lov_object::lo_type_guard. + */ + enum lov_layout_type lo_type; + /** + * Object flags. + */ + unsigned long lo_obj_flags; + /** + * How many IOs are on going on this object. Layout can be changed + * only if there is no active IO. + */ + atomic_t lo_active_ios; + /** + * Waitq - wait for no one else is using lo_lsm + */ + wait_queue_head_t lo_waitq; + /** + * Layout metadata. NULL if empty layout. + */ + struct lov_stripe_md *lo_lsm; + + union lov_layout_state { + struct lov_layout_state_empty { + } empty; + struct lov_layout_state_released { + } released; + struct lov_layout_composite { + /** + * flags of lov_comp_md_v1::lcm_flags. Mainly used + * by FLR. + */ + uint32_t lo_flags; + /** + * For FLR: index of preferred mirror to read. + * Preferred mirror is initialized by the preferred + * bit of lsme. It can be changed when the preferred + * is inaccessible. + * In order to make lov_lsm_entry() return the same + * mirror in the same IO context, it's only possible + * to change the preferred mirror when the + * lo_active_ios reaches zero. + */ + int lo_preferred_mirror; + /** + * For FLR: Number of (valid) mirrors. + */ + unsigned lo_mirror_count; + struct lov_mirror_entry *lo_mirrors; + /** + * Current entry count of lo_entries, include + * invalid entries. 
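+			 * Entries with lle_valid == 0 still occupy
+			 * slots in the array.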
+ */ + unsigned int lo_entry_count; + struct lov_layout_entry *lo_entries; + } composite; + } u; + /** + * Thread that acquired lov_object::lo_type_guard in an exclusive + * mode. + */ + struct task_struct *lo_owner; +}; + +static inline struct lov_layout_raid0 *lov_r0(struct lov_object *lov, int i) +{ + LASSERT(lov->lo_type == LLT_COMP); + LASSERTF(i < lov->u.composite.lo_entry_count, + "entry %d entry_count %d\n", i, + lov->u.composite.lo_entry_count); + + return &lov->u.composite.lo_entries[i].lle_raid0; +} + +static inline struct lov_stripe_md_entry *lov_lse(struct lov_object *lov, int i) +{ + LASSERT(lov->lo_lsm != NULL); + LASSERT(i < lov->lo_lsm->lsm_entry_count); + + return lov->lo_lsm->lsm_entries[i]; +} + +static inline unsigned lov_flr_state(const struct lov_object *lov) +{ + if (lov->lo_type != LLT_COMP) + return LCM_FL_NONE; + + return lov->u.composite.lo_flags & LCM_FL_FLR_MASK; +} + +static inline bool lov_is_flr(const struct lov_object *lov) +{ + return lov_flr_state(lov) != LCM_FL_NONE; +} + +static inline struct lov_layout_entry *lov_entry(struct lov_object *lov, int i) +{ + LASSERT(lov->lo_type == LLT_COMP); + LASSERTF(i < lov->u.composite.lo_entry_count, + "entry %d entry_count %d\n", i, + lov->u.composite.lo_entry_count); + + return &lov->u.composite.lo_entries[i]; +} + +#define lov_for_layout_entry(lov, entry, start, end) \ + if (lov->u.composite.lo_entries && \ + lov->u.composite.lo_entry_count > 0) \ + for (entry = lov_entry(lov, start); \ + entry <= lov_entry(lov, end); entry++) + +#define lov_foreach_layout_entry(lov, entry) \ + lov_for_layout_entry(lov, entry, 0, \ + (lov)->u.composite.lo_entry_count - 1) + +#define lov_foreach_mirror_layout_entry(lov, entry, lre) \ + lov_for_layout_entry(lov, entry, (lre)->lre_start, (lre)->lre_end) + +static inline struct lov_mirror_entry * +lov_mirror_entry(struct lov_object *lov, int i) +{ + LASSERT(i < lov->u.composite.lo_mirror_count); + return &lov->u.composite.lo_mirrors[i]; +} + +#define lov_foreach_mirror_entry(lov, lre) \ + for (lre = lov_mirror_entry(lov, 0); \ + lre <= lov_mirror_entry(lov, \ + lov->u.composite.lo_mirror_count - 1); \ + lre++) + +static inline unsigned +lov_layout_entry_index(struct lov_object *lov, struct lov_layout_entry *entry) +{ + struct lov_layout_entry *first = &lov->u.composite.lo_entries[0]; + unsigned index = (unsigned)(entry - first); + + LASSERT(entry >= first); + LASSERT(index < lov->u.composite.lo_entry_count); + + return index; +} + +/** + * State lov_lock keeps for each sub-lock. + */ +struct lov_lock_sub { + /** sub-lock itself */ + struct cl_lock sub_lock; + /** Set if the sublock has ever been enqueued, meaning it may + * hold resources of underlying layers */ + unsigned int sub_is_enqueued:1, + sub_initialized:1; + int sub_index; +}; + +/** + * lov-specific lock state. + */ +struct lov_lock { + struct cl_lock_slice lls_cl; + /** Number of sub-locks in this lock */ + int lls_nr; + /** sublock array */ + struct lov_lock_sub lls_sub[0]; +}; + +struct lov_page { + struct cl_page_slice lps_cl; +}; + +/* + * Bottom half. + */ + +struct lovsub_device { + struct cl_device acid_cl; + struct cl_device *acid_next; +}; + +struct lovsub_object { + struct cl_object_header lso_header; + struct cl_object lso_cl; + struct lov_object *lso_super; + int lso_index; +}; + +/** + * Describe the environment settings for sublocks. 
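+ * Pairs the lu_env with the cl_io that a sub-lock operates under.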
+ */ +struct lov_sublock_env { + const struct lu_env *lse_env; + struct cl_io *lse_io; +}; + +struct lov_thread_info { + struct cl_object_conf lti_stripe_conf; + struct lu_fid lti_fid; + struct ost_lvb lti_lvb; + struct cl_2queue lti_cl2q; + struct cl_page_list lti_plist; +}; + +/** + * State that lov_io maintains for every sub-io. + */ +struct lov_io_sub { + /** + * Linkage into a list (hanging off lov_io::lis_subios) + */ + struct list_head sub_list; + /** + * Linkage into a list (hanging off lov_io::lis_active) of all + * sub-io's active for the current IO iteration. + */ + struct list_head sub_linkage; + unsigned int sub_subio_index; + /** + * sub-io for a stripe. Ideally sub-io's can be stopped and resumed + * independently, with lov acting as a scheduler to maximize overall + * throughput. + */ + struct cl_io sub_io; + /** + * environment, in which sub-io executes. + */ + struct lu_env *sub_env; + /** + * environment's refcheck. + * + * \see cl_env_get() + */ + __u16 sub_refcheck; +}; + +/** + * IO state private for LOV. + */ +#define LIS_CACHE_ENTRY_NONE -ENOENT +struct lov_io { + /** super-class */ + struct cl_io_slice lis_cl; + + /** + * FLR: index to lo_mirrors. Valid only if lov_is_flr() returns true. + * + * The mirror index of this io. Preserved over cl_io_init() + * if io->ci_ndelay_tried is greater than zero. + */ + int lis_mirror_index; + /** + * FLR: the layout gen when lis_mirror_index was cached. The + * mirror index makes sense only when the layout gen doesn't + * change. + */ + int lis_mirror_layout_gen; + + /** + * fields below this will be initialized in lov_io_init(). + */ + unsigned lis_preserved; + + /** + * Pointer to the object slice. This is a duplicate of + * lov_io::lis_cl::cis_object. + */ + struct lov_object *lis_object; + /** + * Original end-of-io position for this IO, set by the upper layer as + * cl_io::u::ci_rw::pos + cl_io::u::ci_rw::count. lov remembers this, + * changes pos and count to fit IO into a single stripe and uses saved + * value to determine when IO iterations have to stop. + * + * This is used only for CIT_READ and CIT_WRITE io's. + */ + loff_t lis_io_endpos; + + /** + * Record the stripe index before the truncate size, used for setting OST + * object size for truncate. LU-14128. lis_trunc_stripe_index[i] refers to + * lov_object.u.composite.lo_entries[i]. + */ + int *lis_trunc_stripe_index; + + /** + * starting position within a file, for the current io loop iteration + * (stripe), used by ci_io_loop(). + */ + loff_t lis_pos; + /** + * end position with in a file, for the current stripe io. This is + * exclusive (i.e., next offset after last byte affected by io). + */ + loff_t lis_endpos; + int lis_nr_subios; + + /** + * the index of ls_single_subio in ls_subios array + */ + int lis_single_subio_index; + struct lov_io_sub lis_single_subio; + + /** + * List of active sub-io's. Active sub-io's are under the range + * of [lis_pos, lis_endpos). + */ + struct list_head lis_active; + /** + * All sub-io's created in this lov_io. 
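+	 * Sub-ios are linked here through lov_io_sub::sub_list.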
+ */ + struct list_head lis_subios; + /* Cached results from stripe & offset calculations for page init */ + int lis_cached_entry; + int lis_cached_stripe; + loff_t lis_cached_off; + loff_t lis_cached_suboff; + struct lov_io_sub *lis_cached_sub; +}; + +struct lov_session { + struct lov_io ls_io; + struct lov_sublock_env ls_subenv; +}; + +extern struct lu_device_type lov_device_type; +extern struct lu_device_type lovsub_device_type; + +extern struct lu_context_key lov_key; +extern struct lu_context_key lov_session_key; + +extern struct kmem_cache *lov_lock_kmem; +extern struct kmem_cache *lov_object_kmem; +extern struct kmem_cache *lov_thread_kmem; +extern struct kmem_cache *lov_session_kmem; + +extern struct kmem_cache *lovsub_object_kmem; + +int lov_lock_init_composite(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io); +int lov_lock_init_empty (const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io); +int lov_io_init_composite(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); +int lov_io_init_empty (const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); +int lov_io_init_released(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); + +struct lov_io_sub *lov_sub_get(const struct lu_env *env, struct lov_io *lio, + int stripe); + +int lov_page_init_empty (const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index); +int lov_page_init_composite(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index); +int lov_page_init_foreign(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index); +struct lu_object *lov_object_alloc (const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev); + +struct lu_object *lovsub_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev); + +int lov_page_stripe(const struct cl_page *page); +bool lov_page_is_empty(const struct cl_page *page); +int lov_lsm_entry(const struct lov_stripe_md *lsm, __u64 offset); +int lov_io_layout_at(struct lov_io *lio, __u64 offset); + +#define lov_foreach_target(lov, var) \ + for (var = 0; var < lov_targets_nr(lov); ++var) + +static inline struct lu_extent *lov_io_extent(struct lov_io *io, int i) +{ + return &lov_lse(io->lis_object, i)->lsme_extent; +} + +/** + * For layout entries within @ext. + */ +#define lov_foreach_io_layout(ind, lio, ext) \ + for (ind = lov_io_layout_at(lio, (ext)->e_start); \ + ind >= 0 && \ + lu_extent_is_overlapped(lov_io_extent(lio, ind), ext); \ + ind = lov_io_layout_at(lio, lov_io_extent(lio, ind)->e_end)) + +/***************************************************************************** + * + * Type conversions. + * + * Accessors. 
+ * + */ + +static inline struct lov_session *lov_env_session(const struct lu_env *env) +{ + struct lov_session *ses; + + ses = lu_context_key_get(env->le_ses, &lov_session_key); + LASSERT(ses != NULL); + return ses; +} + +static inline struct lov_io *lov_env_io(const struct lu_env *env) +{ + return &lov_env_session(env)->ls_io; +} + +static inline int lov_is_object(const struct lu_object *obj) +{ + return obj->lo_dev->ld_type == &lov_device_type; +} + +static inline int lovsub_is_object(const struct lu_object *obj) +{ + return obj->lo_dev->ld_type == &lovsub_device_type; +} + +static inline struct lu_device *lov2lu_dev(struct lov_device *lov) +{ + return &lov->ld_cl.cd_lu_dev; +} + +static inline struct lov_device *lu2lov_dev(const struct lu_device *d) +{ + LINVRNT(d->ld_type == &lov_device_type); + return container_of(d, struct lov_device, ld_cl.cd_lu_dev); +} + +static inline struct cl_device *lovsub2cl_dev(struct lovsub_device *lovsub) +{ + return &lovsub->acid_cl; +} + +static inline struct lu_device *lovsub2lu_dev(struct lovsub_device *lovsub) +{ + return &lovsub2cl_dev(lovsub)->cd_lu_dev; +} + +static inline struct lovsub_device *lu2lovsub_dev(const struct lu_device *d) +{ + LINVRNT(d->ld_type == &lovsub_device_type); + return container_of(d, struct lovsub_device, acid_cl.cd_lu_dev); +} + +static inline struct lovsub_device *cl2lovsub_dev(const struct cl_device *d) +{ + LINVRNT(d->cd_lu_dev.ld_type == &lovsub_device_type); + return container_of(d, struct lovsub_device, acid_cl); +} + +static inline struct lu_object *lov2lu(struct lov_object *lov) +{ + return &lov->lo_cl.co_lu; +} + +static inline struct cl_object *lov2cl(struct lov_object *lov) +{ + return &lov->lo_cl; +} + +static inline struct lov_object *lu2lov(const struct lu_object *obj) +{ + LINVRNT(lov_is_object(obj)); + return container_of(obj, struct lov_object, lo_cl.co_lu); +} + +static inline struct lov_object *cl2lov(const struct cl_object *obj) +{ + LINVRNT(lov_is_object(&obj->co_lu)); + return container_of(obj, struct lov_object, lo_cl); +} + +static inline struct lu_object *lovsub2lu(struct lovsub_object *los) +{ + return &los->lso_cl.co_lu; +} + +static inline struct cl_object *lovsub2cl(struct lovsub_object *los) +{ + return &los->lso_cl; +} + +static inline struct lovsub_object *cl2lovsub(const struct cl_object *obj) +{ + LINVRNT(lovsub_is_object(&obj->co_lu)); + return container_of(obj, struct lovsub_object, lso_cl); +} + +static inline struct lovsub_object *lu2lovsub(const struct lu_object *obj) +{ + LINVRNT(lovsub_is_object(obj)); + return container_of(obj, struct lovsub_object, lso_cl.co_lu); +} + +static inline struct lov_lock *cl2lov_lock(const struct cl_lock_slice *slice) +{ + LINVRNT(lov_is_object(&slice->cls_obj->co_lu)); + return container_of(slice, struct lov_lock, lls_cl); +} + +static inline struct lov_page *cl2lov_page(const struct cl_page_slice *slice) +{ + LINVRNT(lov_is_object(&slice->cpl_obj->co_lu)); + return container_of(slice, struct lov_page, lps_cl); +} + +static inline struct lov_io *cl2lov_io(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio; + + lio = container_of(ios, struct lov_io, lis_cl); + LASSERT(lio == lov_env_io(env)); + return lio; +} + +static inline int lov_targets_nr(const struct lov_device *lov) +{ + return lov->ld_lov->desc.ld_tgt_count; +} + +static inline struct lov_thread_info *lov_env_info(const struct lu_env *env) +{ + struct lov_thread_info *info; + + info = lu_context_key_get(&env->le_ctx, &lov_key); + LASSERT(info != NULL); + return 
info; +} + +/* lov_pack.c */ +int lov_getstripe(const struct lu_env *env, struct lov_object *obj, + struct lov_stripe_md *lsm, struct lov_user_md __user *lump, + size_t size); + +/** @} lov */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_dev.c b/drivers/staging/lustrefsx/lustre/lov/lov_dev.c new file mode 100644 index 0000000000000..e83ee157fd7ff --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_dev.c @@ -0,0 +1,592 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Implementation of cl_device and cl_device_type for LOV layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +/* class_name2obd() */ +#include + +#include "lov_cl_internal.h" + +struct kmem_cache *lov_lock_kmem; +struct kmem_cache *lov_object_kmem; +struct kmem_cache *lov_thread_kmem; +struct kmem_cache *lov_session_kmem; + +struct kmem_cache *lovsub_object_kmem; + +struct lu_kmem_descr lov_caches[] = { + { + .ckd_cache = &lov_lock_kmem, + .ckd_name = "lov_lock_kmem", + .ckd_size = sizeof(struct lov_lock) + }, + { + .ckd_cache = &lov_object_kmem, + .ckd_name = "lov_object_kmem", + .ckd_size = sizeof(struct lov_object) + }, + { + .ckd_cache = &lov_thread_kmem, + .ckd_name = "lov_thread_kmem", + .ckd_size = sizeof(struct lov_thread_info) + }, + { + .ckd_cache = &lov_session_kmem, + .ckd_name = "lov_session_kmem", + .ckd_size = sizeof(struct lov_session) + }, + { + .ckd_cache = &lovsub_object_kmem, + .ckd_name = "lovsub_object_kmem", + .ckd_size = sizeof(struct lovsub_object) + }, + { + .ckd_cache = NULL + } +}; + +/***************************************************************************** + * + * Lov device and device type functions. 
+ * + */ + +static void *lov_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct lov_thread_info *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, lov_thread_kmem, GFP_NOFS); + if (!info) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void lov_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct lov_thread_info *info = data; + OBD_SLAB_FREE_PTR(info, lov_thread_kmem); +} + +struct lu_context_key lov_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = lov_key_init, + .lct_fini = lov_key_fini +}; + +static void *lov_session_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct lov_session *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, lov_session_kmem, GFP_NOFS); + if (!info) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void lov_session_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct lov_session *info = data; + + OBD_SLAB_FREE_PTR(info, lov_session_kmem); +} + +struct lu_context_key lov_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = lov_session_key_init, + .lct_fini = lov_session_key_fini +}; + +/* type constructor/destructor: lov_type_{init,fini,start,stop}() */ +LU_TYPE_INIT_FINI(lov, &lov_key, &lov_session_key); + + +static int lov_mdc_dev_init(const struct lu_env *env, struct lov_device *ld, + struct lu_device *mdc_dev, __u32 idx, __u32 nr) +{ + struct cl_device *cl; + + ENTRY; + cl = cl_type_setup(env, &ld->ld_site, &lovsub_device_type, + mdc_dev); + if (IS_ERR(cl)) + RETURN(PTR_ERR(cl)); + + ld->ld_md_tgts[nr].ldm_mdc = cl; + ld->ld_md_tgts[nr].ldm_idx = idx; + RETURN(0); +} + +static struct lu_device *lov_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + struct lov_device *ld = lu2lov_dev(d); + int i; + + LASSERT(ld->ld_lov != NULL); + + if (ld->ld_lmv) { + class_decref(ld->ld_lmv, "lov", d); + ld->ld_lmv = NULL; + } + + if (ld->ld_md_tgts) { + for (i = 0; i < ld->ld_md_tgts_nr; i++) { + if (!ld->ld_md_tgts[i].ldm_mdc) + continue; + + cl_stack_fini(env, ld->ld_md_tgts[i].ldm_mdc); + ld->ld_md_tgts[i].ldm_mdc = NULL; + ld->ld_lov->lov_mdc_tgts[i].lmtd_mdc = NULL; + } + } + + if (ld->ld_target) { + lov_foreach_target(ld, i) { + struct lovsub_device *lsd; + + lsd = ld->ld_target[i]; + if (lsd) { + cl_stack_fini(env, lovsub2cl_dev(lsd)); + ld->ld_target[i] = NULL; + } + } + } + RETURN(NULL); +} + +static int lov_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + struct lov_device *ld = lu2lov_dev(d); + int i; + int rc = 0; + + /* check all added already MDC subdevices and initialize them */ + for (i = 0; i < ld->ld_md_tgts_nr; i++) { + struct obd_device *mdc; + __u32 idx; + + mdc = ld->ld_lov->lov_mdc_tgts[i].lmtd_mdc; + idx = ld->ld_lov->lov_mdc_tgts[i].lmtd_index; + + if (!mdc) + continue; + + rc = lov_mdc_dev_init(env, ld, mdc->obd_lu_dev, idx, i); + if (rc) { + CERROR("%s: failed to add MDC %s as target: rc = %d\n", + d->ld_obd->obd_name, + obd_uuid2str(&mdc->obd_uuid), rc); + GOTO(out_err, rc); + } + } + + if (!ld->ld_target) + RETURN(0); + + lov_foreach_target(ld, i) { + struct lovsub_device *lsd; + struct cl_device *cl; + struct lov_tgt_desc *desc; + + desc = ld->ld_lov->lov_tgts[i]; + if (!desc) + continue; + + cl = cl_type_setup(env, &ld->ld_site, &lovsub_device_type, + desc->ltd_obd->obd_lu_dev); + if (IS_ERR(cl)) + GOTO(out_err, rc = PTR_ERR(cl)); + + lsd = cl2lovsub_dev(cl); + ld->ld_target[i] = lsd; + } + ld->ld_flags |= LOV_DEV_INITIALIZED; + RETURN(0); + 
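+	/* cl_type_setup() or lov_mdc_dev_init() failures unwind here */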
+out_err: + lu_device_fini(d); + RETURN(rc); +} + +/* Free the lov specific data created for the back end lu_device. */ +static struct lu_device *lov_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct lov_device *ld = lu2lov_dev(d); + const int nr = ld->ld_target_nr; + + lu_site_fini(&ld->ld_site); + + cl_device_fini(lu2cl_dev(d)); + if (ld->ld_target) { + OBD_FREE_PTR_ARRAY(ld->ld_target, nr); + ld->ld_target = NULL; + } + if (ld->ld_md_tgts) { + OBD_FREE_PTR_ARRAY(ld->ld_md_tgts, LOV_MDC_TGT_MAX); + ld->ld_md_tgts = NULL; + } + /* free array of MDCs */ + if (ld->ld_lov->lov_mdc_tgts) { + OBD_FREE_PTR_ARRAY(ld->ld_lov->lov_mdc_tgts, LOV_MDC_TGT_MAX); + ld->ld_lov->lov_mdc_tgts = NULL; + } + + OBD_FREE_PTR(ld); + return NULL; +} + +static void lov_cl_del_target(const struct lu_env *env, struct lu_device *dev, + __u32 index) +{ + struct lov_device *ld = lu2lov_dev(dev); + + ENTRY; + + if (ld->ld_target[index]) { + cl_stack_fini(env, lovsub2cl_dev(ld->ld_target[index])); + ld->ld_target[index] = NULL; + } + EXIT; +} + +static int lov_expand_targets(const struct lu_env *env, struct lov_device *dev) +{ + int result; + __u32 tgt_size; + __u32 sub_size; + + ENTRY; + result = 0; + tgt_size = dev->ld_lov->lov_tgt_size; + sub_size = dev->ld_target_nr; + if (sub_size < tgt_size) { + struct lovsub_device **newd; + const size_t sz = sizeof(newd[0]); + + OBD_ALLOC_PTR_ARRAY(newd, tgt_size); + if (newd) { + if (sub_size > 0) { + memcpy(newd, dev->ld_target, sub_size * sz); + OBD_FREE(dev->ld_target, sub_size * sz); + } + + dev->ld_target = newd; + dev->ld_target_nr = tgt_size; + } else { + result = -ENOMEM; + } + } + + RETURN(result); +} + +static int lov_cl_add_target(const struct lu_env *env, struct lu_device *dev, + __u32 index) +{ + struct obd_device *obd = dev->ld_obd; + struct lov_device *ld = lu2lov_dev(dev); + struct lov_tgt_desc *tgt; + struct lovsub_device *lsd; + struct cl_device *cl; + int rc; + + ENTRY; + + lov_tgts_getref(obd); + + tgt = obd->u.lov.lov_tgts[index]; + LASSERT(tgt != NULL); + LASSERT(tgt->ltd_obd != NULL); + + if (!tgt->ltd_obd->obd_set_up) { + CERROR("Target %s not set up\n", obd_uuid2str(&tgt->ltd_uuid)); + RETURN(-EINVAL); + } + + rc = lov_expand_targets(env, ld); + if (rc == 0 && ld->ld_flags & LOV_DEV_INITIALIZED) { + cl = cl_type_setup(env, &ld->ld_site, &lovsub_device_type, + tgt->ltd_obd->obd_lu_dev); + if (!IS_ERR(cl)) { + lsd = cl2lovsub_dev(cl); + ld->ld_target[index] = lsd; + } else { + CERROR("add failed (%d), deleting %s\n", rc, + obd_uuid2str(&tgt->ltd_uuid)); + lov_cl_del_target(env, dev, index); + rc = PTR_ERR(cl); + } + } + + lov_tgts_putref(obd); + + RETURN(rc); +} + +/** + * Add new MDC target device in LOV. + * + * This function is part of the configuration log processing. It adds new MDC + * device to the MDC device array indexed by their indexes. 
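+ * The device is remembered in lov_mdc_tgts[]; once the LOV device is
+ * initialized, a lovsub device is stacked on top of it immediately.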
+ * + * \param[in] env execution environment + * \param[in] d LU device of LOV device + * \param[in] mdc MDC device to add + * \param[in] idx MDC device index + * + * \retval 0 if successful + * \retval negative value on error + */ +static int lov_add_mdc_target(const struct lu_env *env, struct lu_device *d, + struct obd_device *mdc, __u32 idx) +{ + struct lov_device *ld = lu2lov_dev(d); + struct obd_device *lov_obd = d->ld_obd; + struct obd_device *lmv_obd; + int next; + int rc = 0; + + ENTRY; + + LASSERT(mdc != NULL); + if (ld->ld_md_tgts_nr == LOV_MDC_TGT_MAX) { + /* + * If the maximum value of LOV_MDC_TGT_MAX will become too + * small then all MD target handling must be rewritten in LOD + * manner, check lod_add_device() and related functionality. + */ + CERROR("%s: cannot serve more than %d MDC devices\n", + lov_obd->obd_name, LOV_MDC_TGT_MAX); + RETURN(-ERANGE); + } + + /* + * grab FLD from lmv, do that here, when first MDC is added + * to be sure LMV is set up and can be found + */ + if (!ld->ld_lmv) { + next = 0; + while ((lmv_obd = class_devices_in_group(&lov_obd->obd_uuid, + &next)) != NULL) { + if ((strncmp(lmv_obd->obd_type->typ_name, + LUSTRE_LMV_NAME, + strlen(LUSTRE_LMV_NAME)) == 0)) + break; + } + if (!lmv_obd) { + CERROR("%s: cannot find LMV OBD by UUID (%s)\n", + lov_obd->obd_name, + obd_uuid2str(&lmv_obd->obd_uuid)); + RETURN(-ENODEV); + } + spin_lock(&lmv_obd->obd_dev_lock); + class_incref(lmv_obd, "lov", ld); + spin_unlock(&lmv_obd->obd_dev_lock); + ld->ld_lmv = lmv_obd; + } + + LASSERT(lov_obd->u.lov.lov_mdc_tgts[ld->ld_md_tgts_nr].lmtd_mdc == + NULL); + + if (ld->ld_flags & LOV_DEV_INITIALIZED) { + rc = lov_mdc_dev_init(env, ld, mdc->obd_lu_dev, idx, + ld->ld_md_tgts_nr); + if (rc) { + CERROR("%s: failed to add MDC %s as target: rc = %d\n", + lov_obd->obd_name, obd_uuid2str(&mdc->obd_uuid), + rc); + RETURN(rc); + } + } + + lov_obd->u.lov.lov_mdc_tgts[ld->ld_md_tgts_nr].lmtd_mdc = mdc; + lov_obd->u.lov.lov_mdc_tgts[ld->ld_md_tgts_nr].lmtd_index = idx; + ld->ld_md_tgts_nr++; + + RETURN(rc); +} + +static int lov_process_config(const struct lu_env *env, + struct lu_device *d, struct lustre_cfg *cfg) +{ + struct obd_device *obd = d->ld_obd; + int cmd; + int rc; + int gen; + u32 index; + + lov_tgts_getref(obd); + + cmd = cfg->lcfg_command; + + rc = lov_process_config_base(d->ld_obd, cfg, &index, &gen); + if (rc < 0) + GOTO(out, rc); + + switch (cmd) { + case LCFG_LOV_ADD_OBD: + case LCFG_LOV_ADD_INA: + rc = lov_cl_add_target(env, d, index); + if (rc != 0) + lov_del_target(d->ld_obd, index, NULL, 0); + break; + case LCFG_LOV_DEL_OBD: + lov_cl_del_target(env, d, index); + break; + case LCFG_ADD_MDC: + { + struct obd_device *mdc; + struct obd_uuid tgt_uuid; + + /* + * modify_mdc_tgts add 0:lustre-clilmv 1:lustre-MDT0000_UUID + * 2:0 3:1 4:lustre-MDT0000-mdc_UUID + */ + if (LUSTRE_CFG_BUFLEN(cfg, 1) > sizeof(tgt_uuid.uuid)) + GOTO(out, rc = -EINVAL); + + obd_str2uuid(&tgt_uuid, lustre_cfg_buf(cfg, 1)); + + rc = kstrtou32(lustre_cfg_buf(cfg, 2), 10, &index); + if (rc) + GOTO(out, rc); + + mdc = class_find_client_obd(&tgt_uuid, LUSTRE_MDC_NAME, + &obd->obd_uuid); + if (!mdc) + GOTO(out, rc = -ENODEV); + rc = lov_add_mdc_target(env, d, mdc, index); + break; + } + } +out: + lov_tgts_putref(obd); + RETURN(rc); +} + +static const struct lu_device_operations lov_lu_ops = { + .ldo_object_alloc = lov_object_alloc, + .ldo_process_config = lov_process_config, +}; + +static struct lu_device *lov_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) 
+{ + struct lu_device *d; + struct lov_device *ld; + struct obd_device *obd; + int rc; + + OBD_ALLOC_PTR(ld); + if (!ld) + RETURN(ERR_PTR(-ENOMEM)); + + cl_device_init(&ld->ld_cl, t); + d = lov2lu_dev(ld); + d->ld_ops = &lov_lu_ops; + + /* setup the LOV OBD */ + obd = class_name2obd(lustre_cfg_string(cfg, 0)); + LASSERT(obd != NULL); + rc = lov_setup(obd, cfg); + if (rc) + GOTO(out, rc); + + /* Alloc MDC devices array */ + /* XXX: need dynamic allocation at some moment */ + OBD_ALLOC_PTR_ARRAY(ld->ld_md_tgts, LOV_MDC_TGT_MAX); + if (!ld->ld_md_tgts) + GOTO(out, rc = -ENOMEM); + + ld->ld_md_tgts_nr = 0; + + ld->ld_lov = &obd->u.lov; + OBD_ALLOC_PTR_ARRAY(ld->ld_lov->lov_mdc_tgts, LOV_MDC_TGT_MAX); + if (!ld->ld_lov->lov_mdc_tgts) + GOTO(out_md_tgts, rc = -ENOMEM); + + rc = lu_site_init(&ld->ld_site, d); + if (rc != 0) + GOTO(out_mdc_tgts, rc); + + rc = lu_site_init_finish(&ld->ld_site); + if (rc != 0) + GOTO(out_site, rc); + + RETURN(d); +out_site: + lu_site_fini(&ld->ld_site); +out_mdc_tgts: + OBD_FREE_PTR_ARRAY(ld->ld_lov->lov_mdc_tgts, LOV_MDC_TGT_MAX); + ld->ld_lov->lov_mdc_tgts = NULL; +out_md_tgts: + OBD_FREE_PTR_ARRAY(ld->ld_md_tgts, LOV_MDC_TGT_MAX); + ld->ld_md_tgts = NULL; +out: + OBD_FREE_PTR(ld); + + return ERR_PTR(rc); +} + +static const struct lu_device_type_operations lov_device_type_ops = { + .ldto_init = lov_type_init, + .ldto_fini = lov_type_fini, + + .ldto_start = lov_type_start, + .ldto_stop = lov_type_stop, + + .ldto_device_alloc = lov_device_alloc, + .ldto_device_free = lov_device_free, + + .ldto_device_init = lov_device_init, + .ldto_device_fini = lov_device_fini +}; + +struct lu_device_type lov_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_LOV_NAME, + .ldt_ops = &lov_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; + +/** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_ea.c b/drivers/staging/lustrefsx/lustre/lov/lov_ea.c new file mode 100644 index 0000000000000..beb0f63df28e9 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_ea.c @@ -0,0 +1,716 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/lov/lov_ea.c + * + * Author: Wang Di + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include +#include +#include + +#include +#include "lov_internal.h" + +static inline void +lu_extent_le_to_cpu(struct lu_extent *dst, const struct lu_extent *src) +{ + dst->e_start = le64_to_cpu(src->e_start); + dst->e_end = le64_to_cpu(src->e_end); +} + +/* + * Find minimum stripe maxbytes value. 
For inactive or
+ * reconnecting targets use LUSTRE_EXT3_STRIPE_MAXBYTES.
+ */
+static loff_t lov_tgt_maxbytes(struct lov_tgt_desc *tgt)
+{
+	struct obd_import *imp;
+	loff_t maxbytes = LUSTRE_EXT3_STRIPE_MAXBYTES;
+
+	if (!tgt->ltd_active)
+		return maxbytes;
+
+	imp = tgt->ltd_obd->u.cli.cl_import;
+	if (!imp)
+		return maxbytes;
+
+	spin_lock(&imp->imp_lock);
+	if ((imp->imp_state == LUSTRE_IMP_FULL ||
+	     imp->imp_state == LUSTRE_IMP_IDLE) &&
+	    (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_MAXBYTES) &&
+	    imp->imp_connect_data.ocd_maxbytes > 0)
+		maxbytes = imp->imp_connect_data.ocd_maxbytes;
+
+	spin_unlock(&imp->imp_lock);
+
+	return maxbytes;
+}
+
+static int lsm_lmm_verify_v1v3(struct lov_mds_md *lmm, size_t lmm_size,
+			       u16 stripe_count)
+{
+	u32 pattern = le32_to_cpu(lmm->lmm_pattern);
+	int rc = 0;
+
+	if (stripe_count > LOV_V1_INSANE_STRIPE_COUNT) {
+		rc = -EINVAL;
+		CERROR("lov: bad stripe count %d: rc = %d\n",
+		       stripe_count, rc);
+		lov_dump_lmm_common(D_WARNING, lmm);
+		goto out;
+	}
+
+	if (lmm_oi_id(&lmm->lmm_oi) == 0) {
+		rc = -EINVAL;
+		CERROR("lov: zero object id: rc = %d\n", rc);
+		lov_dump_lmm_common(D_WARNING, lmm);
+		goto out;
+	}
+
+	if (!lov_pattern_supported(lov_pattern(pattern))) {
+		rc = -EINVAL;
+		CERROR("lov: unrecognized striping pattern: rc = %d\n", rc);
+		lov_dump_lmm_common(D_WARNING, lmm);
+		goto out;
+	}
+
+	if (lmm->lmm_stripe_size == 0 ||
+	    (le32_to_cpu(lmm->lmm_stripe_size) & (LOV_MIN_STRIPE_SIZE - 1)) != 0) {
+		rc = -EINVAL;
+		CERROR("lov: bad stripe size %u: rc = %d\n",
+		       le32_to_cpu(lmm->lmm_stripe_size), rc);
+		lov_dump_lmm_common(D_WARNING, lmm);
+		goto out;
+	}
+
+out:
+	return rc;
+}
+
+static void lsme_free(struct lov_stripe_md_entry *lsme)
+{
+	unsigned int stripe_count;
+	unsigned int i;
+	size_t lsme_size;
+
+	if (lsme->lsme_magic == LOV_MAGIC_FOREIGN) {
+		/*
+		 * TODO: In addition to the HSM foreign layout, support needs
+		 * to be added for other kinds of foreign layout types such
+		 * as DAOS and S3. When such support is added, a non-inline
+		 * @lov_hsm_base will be used to store the layout
+		 * information, and the extra allocated buffer will need to
+		 * be freed here.
+		 */
+		OBD_FREE_LARGE(lsme, sizeof(*lsme));
+		return;
+	}
+
+	stripe_count = lsme->lsme_stripe_count;
+	if (!lsme_inited(lsme) ||
+	    lsme->lsme_pattern & LOV_PATTERN_F_RELEASED)
+		stripe_count = 0;
+	for (i = 0; i < stripe_count; i++)
+		OBD_SLAB_FREE_PTR(lsme->lsme_oinfo[i], lov_oinfo_slab);
+
+	lsme_size = offsetof(typeof(*lsme), lsme_oinfo[stripe_count]);
+	OBD_FREE_LARGE(lsme, lsme_size);
+}
+
+void lsm_free(struct lov_stripe_md *lsm)
+{
+	unsigned int entry_count = lsm->lsm_entry_count;
+	unsigned int i;
+	size_t lsm_size;
+
+	if (lsm->lsm_magic == LOV_MAGIC_FOREIGN) {
+		OBD_FREE_LARGE(lsm_foreign(lsm), lsm->lsm_foreign_size);
+	} else {
+		for (i = 0; i < entry_count; i++)
+			lsme_free(lsm->lsm_entries[i]);
+	}
+
+	lsm_size = lsm->lsm_magic == LOV_MAGIC_FOREIGN ?
+		   offsetof(typeof(*lsm), lsm_entries[1]) :
+		   offsetof(typeof(*lsm), lsm_entries[entry_count]);
+	OBD_FREE(lsm, lsm_size);
+}
+
+/**
+ * Unpack a struct lov_mds_md into a struct lov_stripe_md_entry.
+ *
+ * The caller should set id and extent.
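+ * For example, lsm_unpackmd_v1v3() below fills these in after a
+ * successful unpack (a sketch of the existing caller, shown here only
+ * to illustrate the contract):
+ *
+ *	lsme->lsme_flags = LCME_FL_INIT;
+ *	lsme->lsme_extent.e_start = 0;
+ *	lsme->lsme_extent.e_end = LUSTRE_EOF;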
+ */ +static struct lov_stripe_md_entry * +lsme_unpack(struct lov_obd *lov, struct lov_mds_md *lmm, size_t buf_size, + const char *pool_name, bool inited, struct lov_ost_data_v1 *objects, + loff_t *maxbytes) +{ + struct lov_stripe_md_entry *lsme; + size_t lsme_size; + loff_t min_stripe_maxbytes = 0; + loff_t lov_bytes; + u32 magic; + u32 pattern; + unsigned int stripe_count; + unsigned int i; + int rc; + + magic = le32_to_cpu(lmm->lmm_magic); + if (magic != LOV_MAGIC_V1 && magic != LOV_MAGIC_V3) + RETURN(ERR_PTR(-EINVAL)); + + pattern = le32_to_cpu(lmm->lmm_pattern); + if (pattern & LOV_PATTERN_F_RELEASED || !inited) + stripe_count = 0; + else + stripe_count = le16_to_cpu(lmm->lmm_stripe_count); + + if (buf_size < lov_mds_md_size(stripe_count, magic)) { + CERROR("LOV EA %s too small: %zu, need %u\n", + magic == LOV_MAGIC_V1 ? "V1" : "V3", buf_size, + lov_mds_md_size(stripe_count, magic == LOV_MAGIC_V1 ? + LOV_MAGIC_V1 : LOV_MAGIC_V3)); + lov_dump_lmm_common(D_WARNING, lmm); + return ERR_PTR(-EINVAL); + } + + rc = lsm_lmm_verify_v1v3(lmm, buf_size, stripe_count); + if (rc < 0) + return ERR_PTR(rc); + + lsme_size = offsetof(typeof(*lsme), lsme_oinfo[stripe_count]); + OBD_ALLOC_LARGE(lsme, lsme_size); + if (!lsme) + RETURN(ERR_PTR(-ENOMEM)); + + lsme->lsme_magic = magic; + lsme->lsme_pattern = pattern; + lsme->lsme_flags = 0; + lsme->lsme_stripe_size = le32_to_cpu(lmm->lmm_stripe_size); + /* preserve the possible -1 stripe count for uninstantiated component */ + lsme->lsme_stripe_count = le16_to_cpu(lmm->lmm_stripe_count); + lsme->lsme_layout_gen = le16_to_cpu(lmm->lmm_layout_gen); + + if (pool_name) { + size_t pool_name_len; + + pool_name_len = strlcpy(lsme->lsme_pool_name, pool_name, + sizeof(lsme->lsme_pool_name)); + if (pool_name_len >= sizeof(lsme->lsme_pool_name)) + GOTO(out_lsme, rc = -E2BIG); + } + + /* with Data-on-MDT set maxbytes to stripe size */ + if (lsme_is_dom(lsme)) { + if (maxbytes) { + lov_bytes = lsme->lsme_stripe_size; + goto out_dom1; + } else { + goto out_dom2; + } + } + + for (i = 0; i < stripe_count; i++) { + struct lov_oinfo *loi; + struct lov_tgt_desc *ltd; + + OBD_SLAB_ALLOC_PTR_GFP(loi, lov_oinfo_slab, GFP_NOFS); + if (!loi) + GOTO(out_lsme, rc = -ENOMEM); + + lsme->lsme_oinfo[i] = loi; + + ostid_le_to_cpu(&objects[i].l_ost_oi, &loi->loi_oi); + loi->loi_ost_idx = le32_to_cpu(objects[i].l_ost_idx); + loi->loi_ost_gen = le32_to_cpu(objects[i].l_ost_gen); + if (lov_oinfo_is_dummy(loi)) + continue; + + if (loi->loi_ost_idx >= lov->desc.ld_tgt_count && + !lov2obd(lov)->obd_process_conf) { + CERROR("%s: OST index %d more than OST count %d\n", + (char*)lov->desc.ld_uuid.uuid, + loi->loi_ost_idx, lov->desc.ld_tgt_count); + lov_dump_lmm_v1(D_WARNING, lmm); + GOTO(out_lsme, rc = -EINVAL); + } + + ltd = lov->lov_tgts[loi->loi_ost_idx]; + if (!ltd) { + CERROR("%s: OST index %d missing\n", + (char*)lov->desc.ld_uuid.uuid, loi->loi_ost_idx); + lov_dump_lmm_v1(D_WARNING, lmm); + continue; + } + + lov_bytes = lov_tgt_maxbytes(ltd); + if (min_stripe_maxbytes == 0 || lov_bytes < min_stripe_maxbytes) + min_stripe_maxbytes = lov_bytes; + } + + if (maxbytes) { + if (min_stripe_maxbytes == 0) + min_stripe_maxbytes = LUSTRE_EXT3_STRIPE_MAXBYTES; + + if (stripe_count == 0) + stripe_count = lov->desc.ld_tgt_count; + + if (min_stripe_maxbytes <= LLONG_MAX / stripe_count) + lov_bytes = min_stripe_maxbytes * stripe_count; + else + lov_bytes = MAX_LFS_FILESIZE; +out_dom1: + *maxbytes = min_t(loff_t, lov_bytes, MAX_LFS_FILESIZE); + } +out_dom2: + + return lsme; + +out_lsme: + for (i = 0; i < 
stripe_count; i++) { + struct lov_oinfo *loi = lsme->lsme_oinfo[i]; + + if (loi) + OBD_SLAB_FREE_PTR(lsme->lsme_oinfo[i], lov_oinfo_slab); + } + OBD_FREE_LARGE(lsme, lsme_size); + + return ERR_PTR(rc); +} + +static struct +lov_stripe_md *lsm_unpackmd_v1v3(struct lov_obd *lov, struct lov_mds_md *lmm, + size_t buf_size, const char *pool_name, + struct lov_ost_data_v1 *objects) +{ + struct lov_stripe_md *lsm; + struct lov_stripe_md_entry *lsme; + size_t lsm_size; + loff_t maxbytes; + u32 pattern; + int rc; + + pattern = le32_to_cpu(lmm->lmm_pattern); + + lsme = lsme_unpack(lov, lmm, buf_size, pool_name, true, objects, + &maxbytes); + if (IS_ERR(lsme)) + RETURN(ERR_CAST(lsme)); + + lsme->lsme_flags = LCME_FL_INIT; + lsme->lsme_extent.e_start = 0; + lsme->lsme_extent.e_end = LUSTRE_EOF; + + lsm_size = offsetof(typeof(*lsm), lsm_entries[1]); + OBD_ALLOC(lsm, lsm_size); + if (!lsm) + GOTO(out_lsme, rc = -ENOMEM); + + atomic_set(&lsm->lsm_refc, 1); + spin_lock_init(&lsm->lsm_lock); + lsm->lsm_maxbytes = maxbytes; + lmm_oi_le_to_cpu(&lsm->lsm_oi, &lmm->lmm_oi); + lsm->lsm_magic = le32_to_cpu(lmm->lmm_magic); + lsm->lsm_layout_gen = le16_to_cpu(lmm->lmm_layout_gen); + lsm->lsm_entry_count = 1; + lsm->lsm_is_released = pattern & LOV_PATTERN_F_RELEASED; + lsm->lsm_entries[0] = lsme; + + return lsm; + +out_lsme: + lsme_free(lsme); + + return ERR_PTR(rc); +} + +static struct lov_stripe_md * +lsm_unpackmd_v1(struct lov_obd *lov, void *buf, size_t buf_size) +{ + struct lov_mds_md_v1 *lmm = buf; + + return lsm_unpackmd_v1v3(lov, buf, buf_size, NULL, lmm->lmm_objects); +} + +static const struct lsm_operations lsm_v1_ops = { + .lsm_unpackmd = lsm_unpackmd_v1, +}; + +static struct lov_stripe_md * +lsm_unpackmd_v3(struct lov_obd *lov, void *buf, size_t buf_size) +{ + struct lov_mds_md_v3 *lmm = buf; + + return lsm_unpackmd_v1v3(lov, buf, buf_size, lmm->lmm_pool_name, + lmm->lmm_objects); +} + +static const struct lsm_operations lsm_v3_ops = { + .lsm_unpackmd = lsm_unpackmd_v3, +}; + +static int lsm_verify_comp_md_v1(struct lov_comp_md_v1 *lcm, + size_t lcm_buf_size) +{ + unsigned int entry_count; + unsigned int i; + size_t lcm_size; + + lcm_size = le32_to_cpu(lcm->lcm_size); + if (lcm_buf_size < lcm_size) { + CERROR("bad LCM buffer size %zu, expected %zu\n", + lcm_buf_size, lcm_size); + RETURN(-EINVAL); + } + + entry_count = le16_to_cpu(lcm->lcm_entry_count); + for (i = 0; i < entry_count; i++) { + struct lov_comp_md_entry_v1 *lcme = &lcm->lcm_entries[i]; + size_t blob_offset; + size_t blob_size; + + blob_offset = le32_to_cpu(lcme->lcme_offset); + blob_size = le32_to_cpu(lcme->lcme_size); + + if (lcm_size < blob_offset || lcm_size < blob_size || + lcm_size < blob_offset + blob_size) { + CERROR("LCM entry %u has invalid blob: " + "LCM size = %zu, offset = %zu, size = %zu\n", + le32_to_cpu(lcme->lcme_id), + lcm_size, blob_offset, blob_size); + RETURN(-EINVAL); + } + } + + return 0; +} + +static struct lov_stripe_md_entry * +lsme_unpack_foreign(struct lov_obd *lov, void *buf, size_t buf_size, + bool inited, loff_t *maxbytes) +{ + struct lov_stripe_md_entry *lsme; + struct lov_foreign_md *lfm = buf; + __u32 magic; + + ENTRY; + + magic = le32_to_cpu(lfm->lfm_magic); + if (magic != LOV_MAGIC_FOREIGN) + RETURN(ERR_PTR(-EINVAL)); + + OBD_ALLOC_LARGE(lsme, sizeof(*lsme)); + if (!lsme) + RETURN(ERR_PTR(-ENOMEM)); + + lsme->lsme_magic = magic; + lsme->lsme_pattern = LOV_PATTERN_FOREIGN; + lsme->lsme_flags = 0; + + if (maxbytes) + *maxbytes = MAX_LFS_FILESIZE; + + RETURN(lsme); +} + +static struct lov_stripe_md_entry * 
+lsme_unpack_comp(struct lov_obd *lov, struct lov_mds_md *lmm, + size_t lmm_buf_size, bool inited, loff_t *maxbytes) +{ + unsigned int magic; + + magic = le32_to_cpu(lmm->lmm_magic); + if (magic != LOV_MAGIC_V1 && magic != LOV_MAGIC_V3 && + magic != LOV_MAGIC_FOREIGN) + RETURN(ERR_PTR(-EINVAL)); + + if (magic != LOV_MAGIC_FOREIGN && + le16_to_cpu(lmm->lmm_stripe_count) == 0 && + lov_pattern(le32_to_cpu(lmm->lmm_pattern)) != LOV_PATTERN_MDT) + RETURN(ERR_PTR(-EINVAL)); + + if (magic == LOV_MAGIC_V1) { + return lsme_unpack(lov, lmm, lmm_buf_size, NULL, + inited, lmm->lmm_objects, maxbytes); + } else if (magic == LOV_MAGIC_V3) { + struct lov_mds_md_v3 *lmm3 = (struct lov_mds_md_v3 *)lmm; + + return lsme_unpack(lov, lmm, lmm_buf_size, lmm3->lmm_pool_name, + inited, lmm3->lmm_objects, maxbytes); + } else { /* LOV_MAGIC_FOREIGN */ + return lsme_unpack_foreign(lov, lmm, lmm_buf_size, + inited, maxbytes); + } +} + +static struct lov_stripe_md * +lsm_unpackmd_comp_md_v1(struct lov_obd *lov, void *buf, size_t buf_size) +{ + struct lov_comp_md_v1 *lcm = buf; + struct lov_stripe_md *lsm; + size_t lsm_size; + unsigned int entry_count = 0; + unsigned int i; + loff_t maxbytes; + int rc; + + rc = lsm_verify_comp_md_v1(buf, buf_size); + if (rc < 0) + return ERR_PTR(rc); + + entry_count = le16_to_cpu(lcm->lcm_entry_count); + + lsm_size = offsetof(typeof(*lsm), lsm_entries[entry_count]); + OBD_ALLOC(lsm, lsm_size); + if (!lsm) + return ERR_PTR(-ENOMEM); + + atomic_set(&lsm->lsm_refc, 1); + spin_lock_init(&lsm->lsm_lock); + lsm->lsm_magic = le32_to_cpu(lcm->lcm_magic); + lsm->lsm_layout_gen = le32_to_cpu(lcm->lcm_layout_gen); + lsm->lsm_entry_count = entry_count; + lsm->lsm_mirror_count = le16_to_cpu(lcm->lcm_mirror_count); + lsm->lsm_flags = le16_to_cpu(lcm->lcm_flags); + lsm->lsm_is_released = true; + lsm->lsm_maxbytes = LLONG_MIN; + + for (i = 0; i < entry_count; i++) { + struct lov_comp_md_entry_v1 *lcme = &lcm->lcm_entries[i]; + struct lov_stripe_md_entry *lsme; + size_t blob_offset; + size_t blob_size; + void *blob; + + blob_offset = le32_to_cpu(lcme->lcme_offset); + blob_size = le32_to_cpu(lcme->lcme_size); + blob = (char *)lcm + blob_offset; + + lsme = lsme_unpack_comp(lov, blob, blob_size, + le32_to_cpu(lcme->lcme_flags) & + LCME_FL_INIT, + (i == entry_count - 1) ? &maxbytes : + NULL); + if (IS_ERR(lsme)) + GOTO(out_lsm, rc = PTR_ERR(lsme)); + + if (!(lsme->lsme_pattern & LOV_PATTERN_F_RELEASED)) + lsm->lsm_is_released = false; + + lsm->lsm_entries[i] = lsme; + lsme->lsme_id = le32_to_cpu(lcme->lcme_id); + lsme->lsme_flags = le32_to_cpu(lcme->lcme_flags); + if (lsme->lsme_flags & LCME_FL_NOSYNC) + lsme->lsme_timestamp = + le64_to_cpu(lcme->lcme_timestamp); + lu_extent_le_to_cpu(&lsme->lsme_extent, &lcme->lcme_extent); + + if (i == entry_count - 1) { + lsm->lsm_maxbytes = (loff_t)lsme->lsme_extent.e_start + + maxbytes; + /* + * the last component hasn't been defined, or + * lsm_maxbytes overflowed. 
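+			 * An undefined last component shows up as
+			 * e_end != LUSTRE_EOF; overflow shows up as
+			 * e_start + maxbytes wrapping below e_start.
+			 * Either way, fall back to MAX_LFS_FILESIZE
+			 * (DoM entries keep the stripe-size-based limit
+			 * computed in lsme_unpack()).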
+			 */
+			if (!lsme_is_dom(lsme) &&
+			    (lsme->lsme_extent.e_end != LUSTRE_EOF ||
+			     lsm->lsm_maxbytes <
+			     (loff_t)lsme->lsme_extent.e_start))
+				lsm->lsm_maxbytes = MAX_LFS_FILESIZE;
+		}
+	}
+
+	RETURN(lsm);
+
+out_lsm:
+	for (i = 0; i < entry_count; i++)
+		if (lsm->lsm_entries[i])
+			lsme_free(lsm->lsm_entries[i]);
+
+	OBD_FREE(lsm, lsm_size);
+
+	RETURN(ERR_PTR(rc));
+}
+
+static const struct lsm_operations lsm_comp_md_v1_ops = {
+	.lsm_unpackmd = lsm_unpackmd_comp_md_v1,
+};
+
+static struct
+lov_stripe_md *lsm_unpackmd_foreign(struct lov_obd *lov, void *buf,
+				    size_t buf_size)
+{
+	struct lov_foreign_md *lfm = buf;
+	struct lov_stripe_md *lsm;
+	size_t lsm_size;
+	struct lov_stripe_md_entry *lsme;
+
+	lsm_size = offsetof(typeof(*lsm), lsm_entries[1]);
+	OBD_ALLOC(lsm, lsm_size);
+	if (lsm == NULL)
+		RETURN(ERR_PTR(-ENOMEM));
+
+	atomic_set(&lsm->lsm_refc, 1);
+	spin_lock_init(&lsm->lsm_lock);
+	lsm->lsm_magic = le32_to_cpu(lfm->lfm_magic);
+	lsm->lsm_foreign_size = foreign_size_le(lfm);
+
+	/* alloc for full foreign EA including format fields */
+	OBD_ALLOC_LARGE(lsme, lsm->lsm_foreign_size);
+	if (lsme == NULL) {
+		OBD_FREE(lsm, lsm_size);
+		RETURN(ERR_PTR(-ENOMEM));
+	}
+
+	/* copy full foreign EA including format fields */
+	memcpy(lsme, buf, lsm->lsm_foreign_size);
+
+	lsm_foreign(lsm) = lsme;
+
+	return lsm;
+}
+
+static const struct lsm_operations lsm_foreign_ops = {
+	.lsm_unpackmd = lsm_unpackmd_foreign,
+};
+
+const struct lsm_operations *lsm_op_find(int magic)
+{
+	switch (magic) {
+	case LOV_MAGIC_V1:
+		return &lsm_v1_ops;
+	case LOV_MAGIC_V3:
+		return &lsm_v3_ops;
+	case LOV_MAGIC_COMP_V1:
+		return &lsm_comp_md_v1_ops;
+	case LOV_MAGIC_FOREIGN:
+		return &lsm_foreign_ops;
+	default:
+		CERROR("unrecognized lsm_magic %08x\n", magic);
+		return NULL;
+	}
+}
+
+void dump_lsm(unsigned int level, const struct lov_stripe_md *lsm)
+{
+	int i, j;
+
+	CDEBUG_LIMIT(level,
+		     "lsm %p, objid "DOSTID", maxbytes %#llx, magic 0x%08X, refc: %d, entry: %u, mirror: %u, flags: %u, layout_gen %u\n",
+		     lsm, POSTID(&lsm->lsm_oi), lsm->lsm_maxbytes, lsm->lsm_magic,
+		     atomic_read(&lsm->lsm_refc), lsm->lsm_entry_count,
+		     lsm->lsm_mirror_count, lsm->lsm_flags, lsm->lsm_layout_gen);
+
+	if (lsm->lsm_magic == LOV_MAGIC_FOREIGN) {
+		struct lov_foreign_md *lfm = (void *)lsm_foreign(lsm);
+
+		CDEBUG_LIMIT(level,
+			     "foreign LOV EA, magic %x, length %u, type %x, flags %x, value '%.*s'\n",
+			     lfm->lfm_magic, lfm->lfm_length, lfm->lfm_type,
+			     lfm->lfm_flags, lfm->lfm_length, lfm->lfm_value);
+		return;
+	}
+
+	for (i = 0; i < lsm->lsm_entry_count; i++) {
+		struct lov_stripe_md_entry *lse = lsm->lsm_entries[i];
+
+		CDEBUG(level, DEXT ": id: %u, flags: %x, "
+		       "magic 0x%08X, layout_gen %u, "
+		       "stripe count %u, stripe size %u, "
+		       "pool: ["LOV_POOLNAMEF"]\n",
+		       PEXT(&lse->lsme_extent), lse->lsme_id, lse->lsme_flags,
+		       lse->lsme_magic, lse->lsme_layout_gen,
+		       lse->lsme_stripe_count, lse->lsme_stripe_size,
+		       lse->lsme_pool_name);
+		if (!lsme_inited(lse) ||
+		    lse->lsme_pattern & LOV_PATTERN_F_RELEASED)
+			continue;
+		for (j = 0; j < lse->lsme_stripe_count; j++) {
+			CDEBUG(level, "   oinfo:%p: ostid: "DOSTID
+			       " ost idx: %d gen: %d\n",
+			       lse->lsme_oinfo[j],
+			       POSTID(&lse->lsme_oinfo[j]->loi_oi),
+			       lse->lsme_oinfo[j]->loi_ost_idx,
+			       lse->lsme_oinfo[j]->loi_ost_gen);
+		}
+	}
+}
+
+int lov_lsm_entry(const struct lov_stripe_md *lsm, __u64 offset)
+{
+	int i;
+
+	for (i = 0; i < lsm->lsm_entry_count; i++) {
+		struct lov_stripe_md_entry *lse = lsm->lsm_entries[i];
+
+		if ((offset >= lse->lsme_extent.e_start &&
+		     offset <
lse->lsme_extent.e_end) ||
+		    (offset == OBD_OBJECT_EOF &&
+		     lse->lsme_extent.e_end == OBD_OBJECT_EOF))
+			return i;
+	}
+
+	return -1;
+}
+
+/**
+ * lmm_layout_gen overlaps the stripe_offset field; it must be reset
+ * before the EA is sent back to the MDT so that it passes the striping
+ * checks there.
+ */
+void lov_fix_ea_for_replay(void *lovea)
+{
+	struct lov_user_md *lmm = lovea;
+	struct lov_comp_md_v1 *c1;
+	int i;
+
+	switch (le32_to_cpu(lmm->lmm_magic)) {
+	case LOV_USER_MAGIC_V1:
+	case LOV_USER_MAGIC_V3:
+		lmm->lmm_stripe_offset = LOV_OFFSET_DEFAULT;
+		break;
+
+	case LOV_USER_MAGIC_COMP_V1:
+		c1 = (void *)lmm;
+		for (i = 0; i < le16_to_cpu(c1->lcm_entry_count); i++) {
+			struct lov_comp_md_entry_v1 *ent = &c1->lcm_entries[i];
+
+			if (le32_to_cpu(ent->lcme_flags) & LCME_FL_INIT) {
+				lmm = (void *)((char *)c1 +
+					       le32_to_cpu(ent->lcme_offset));
+				lmm->lmm_stripe_offset = LOV_OFFSET_DEFAULT;
+			}
+		}
+	}
+}
+EXPORT_SYMBOL(lov_fix_ea_for_replay);
diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_internal.h b/drivers/staging/lustrefsx/lustre/lov/lov_internal.h
new file mode 100644
index 0000000000000..9341d2e80b7c4
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/lov/lov_internal.h
@@ -0,0 +1,375 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ */
+
+#ifndef LOV_INTERNAL_H
+#define LOV_INTERNAL_H
+
+#include
+#include
+
+/* If we are unable to get the maximum object size from the OST in
+ * ocd_maxbytes using OBD_CONNECT_MAXBYTES, then we fall back to using
+ * the old maximum object size from ext3. */
+#define LUSTRE_EXT3_STRIPE_MAXBYTES 0x1fffffff000ULL
+
+struct lov_stripe_md_entry {
+	struct lu_extent	lsme_extent;
+	u32			lsme_id;
+	u32			lsme_magic;
+	u32			lsme_flags;
+	u32			lsme_pattern;
+	u64			lsme_timestamp;
+	u32			lsme_stripe_size;
+	u16			lsme_stripe_count;
+	u16			lsme_layout_gen;
+	char			lsme_pool_name[LOV_MAXPOOLNAME + 1];
+	struct lov_oinfo	*lsme_oinfo[];
+};
+
+static inline bool lsme_is_dom(struct lov_stripe_md_entry *lsme)
+{
+	return (lov_pattern(lsme->lsme_pattern) == LOV_PATTERN_MDT);
+}
+
+static inline void copy_lsm_entry(struct lov_stripe_md_entry *dst,
+				  struct lov_stripe_md_entry *src)
+{
+	unsigned i;
+
+	for (i = 0; i < src->lsme_stripe_count; i++)
+		*dst->lsme_oinfo[i] = *src->lsme_oinfo[i];
+	memcpy(dst, src, offsetof(typeof(*src), lsme_oinfo));
+}
+
+struct lov_stripe_md {
+	atomic_t	lsm_refc;
+	spinlock_t	lsm_lock;
+	pid_t		lsm_lock_owner; /* debugging */
+
+	union {
+		/* maximum possible file size, might change as OST status
+		 * changes, e.g.
disconnected, deactivated + */ + loff_t lsm_maxbytes; + /* size of full foreign LOV */ + size_t lsm_foreign_size; + }; + struct ost_id lsm_oi; + u32 lsm_magic; + u32 lsm_layout_gen; + u16 lsm_flags; + bool lsm_is_released; + u16 lsm_mirror_count; + u16 lsm_entry_count; + struct lov_stripe_md_entry *lsm_entries[]; +}; + +#define lsm_foreign(lsm) (lsm->lsm_entries[0]) + +static inline bool lsme_is_foreign(const struct lov_stripe_md_entry *lsme) +{ + return lsme->lsme_magic == LOV_MAGIC_FOREIGN; +} + +static inline bool lsm_entry_is_foreign(const struct lov_stripe_md *lsm, + int index) +{ + return lsme_is_foreign(lsm->lsm_entries[index]); +} + +static inline bool lsme_inited(const struct lov_stripe_md_entry *lsme) +{ + return lsme->lsme_flags & LCME_FL_INIT; +} + +static inline bool lsm_entry_inited(const struct lov_stripe_md *lsm, int index) +{ + return lsme_inited(lsm->lsm_entries[index]); +} + +static inline bool lsm_is_composite(__u32 magic) +{ + return magic == LOV_MAGIC_COMP_V1; +} + +static inline size_t lov_comp_md_size(const struct lov_stripe_md *lsm) +{ + struct lov_stripe_md_entry *lsme; + size_t size; + int entry; + + if (lsm->lsm_magic == LOV_MAGIC_V1 || lsm->lsm_magic == LOV_MAGIC_V3) + return lov_mds_md_size(lsm->lsm_entries[0]->lsme_stripe_count, + lsm->lsm_entries[0]->lsme_magic); + + if (lsm->lsm_magic == LOV_MAGIC_FOREIGN) + return lsm->lsm_foreign_size; + + LASSERT(lsm->lsm_magic == LOV_MAGIC_COMP_V1); + + size = sizeof(struct lov_comp_md_v1) + + sizeof(struct lov_comp_md_entry_v1) * lsm->lsm_entry_count; + for (entry = 0; entry < lsm->lsm_entry_count; entry++) { + u16 stripe_count; + + lsme = lsm->lsm_entries[entry]; + + if (lsme_inited(lsme)) + stripe_count = lsme->lsme_stripe_count; + else + stripe_count = 0; + + size += lov_mds_md_size(stripe_count, + lsme->lsme_magic); + } + + return size; +} + +static inline bool lsm_has_objects(struct lov_stripe_md *lsm) +{ + return lsm != NULL && !lsm->lsm_is_released; +} + +static inline unsigned int lov_comp_index(int entry, int stripe) +{ + LASSERT(entry >= 0 && entry <= SHRT_MAX); + LASSERT(stripe >= 0 && stripe < USHRT_MAX); + + return entry << 16 | stripe; +} + +static inline int lov_comp_stripe(int index) +{ + return index & 0xffff; +} + +static inline int lov_comp_entry(int index) +{ + return index >> 16; +} + +struct lsm_operations { + struct lov_stripe_md *(*lsm_unpackmd)(struct lov_obd *, void *, size_t); +}; + +const struct lsm_operations *lsm_op_find(int magic); +void lsm_free(struct lov_stripe_md *lsm); + +/* lov_do_div64(a, b) returns a % b, and a = a / b. + * The 32-bit code is LOV-specific due to knowing about stripe limits in + * order to reduce the divisor to a 32-bit number. If the divisor is + * already a 32-bit value the compiler handles this directly. 
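+ *
+ * A short usage sketch (the values are illustrative only):
+ *
+ *	u64 off = 3 * 1048576 + 4096;
+ *	u64 rem = lov_do_div64(off, 1048576);
+ *
+ * afterwards rem == 4096 (the offset within the stripe unit) and
+ * off == 3 (the stripe unit number): the remainder is returned and the
+ * quotient is written back into the first argument.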
*/ +#if BITS_PER_LONG == 64 +# define lov_do_div64(n, base) ({ \ + uint64_t __base = (base); \ + uint64_t __rem; \ + __rem = ((uint64_t)(n)) % __base; \ + (n) = ((uint64_t)(n)) / __base; \ + __rem; \ +}) +#elif BITS_PER_LONG == 32 +# define lov_do_div64(n, base) ({ \ + uint64_t __num = (n); \ + uint64_t __rem; \ + if ((sizeof(base) > 4) && (((base) & 0xffffffff00000000ULL) != 0)) { \ + int __remainder; \ + LASSERTF(!((base) & (LOV_MIN_STRIPE_SIZE - 1)), \ + "64 bit lov division %llu / %llu\n", \ + __num, (uint64_t)(base)); \ + __remainder = __num & (LOV_MIN_STRIPE_SIZE - 1); \ + __num >>= LOV_MIN_STRIPE_BITS; \ + __rem = do_div(__num, (base) >> LOV_MIN_STRIPE_BITS); \ + __rem <<= LOV_MIN_STRIPE_BITS; \ + __rem += __remainder; \ + } else { \ + __rem = do_div(__num, base); \ + } \ + (n) = __num; \ + __rem; \ +}) +#endif + +#define pool_tgt_count(p) ((p)->pool_obds.op_count) +#define pool_tgt_array(p) ((p)->pool_obds.op_array) +#define pool_tgt_rw_sem(p) ((p)->pool_obds.op_rw_sem) + +struct pool_desc { + char pool_name[LOV_MAXPOOLNAME + 1]; + struct lu_tgt_pool pool_obds; + atomic_t pool_refcount; + struct rhash_head pool_hash; /* access by poolname */ + struct list_head pool_list; /* serial access */ + struct rcu_head pool_rcu; + struct proc_dir_entry *pool_proc_entry; + struct obd_device *pool_lobd; /* owner */ +}; + +int lov_pool_hash_init(struct rhashtable *tbl); +void lov_pool_hash_destroy(struct rhashtable *tbl); + +struct lov_request { + struct obd_info rq_oi; + struct lov_request_set *rq_rqset; + struct list_head rq_link; + int rq_idx; /* index in lov->tgts array */ +}; + +struct lov_request_set { + struct obd_info *set_oi; + struct obd_device *set_obd; + int set_count; + atomic_t set_completes; + atomic_t set_success; + struct list_head set_list; +}; + +extern struct kmem_cache *lov_oinfo_slab; + +extern struct lu_kmem_descr lov_caches[]; + +#define lov_uuid2str(lv, index) \ + (char *)((lv)->lov_tgts[index]->ltd_uuid.uuid) + +/* lov_merge.c */ +int lov_merge_lvb_kms(struct lov_stripe_md *lsm, int index, + struct ost_lvb *lvb, __u64 *kms_place); + +/* lov_offset.c */ +loff_t stripe_width(struct lov_stripe_md *lsm, unsigned int index); +u64 lov_stripe_size(struct lov_stripe_md *lsm, int index, + u64 ost_size, int stripeno); +int lov_stripe_offset(struct lov_stripe_md *lsm, int index, loff_t lov_off, + int stripeno, loff_t *obd_off); +loff_t lov_size_to_stripe(struct lov_stripe_md *lsm, int index, u64 file_size, + int stripeno); +int lov_stripe_intersects(struct lov_stripe_md *lsm, int index, int stripeno, + struct lu_extent *ext, u64 *obd_start, u64 *obd_end); +int lov_stripe_number(struct lov_stripe_md *lsm, int index, loff_t lov_off); +pgoff_t lov_stripe_pgoff(struct lov_stripe_md *lsm, int index, + pgoff_t stripe_index, int stripe); + +/* lov_request.c */ +int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo, + struct lov_request_set **reqset); +int lov_fini_statfs_set(struct lov_request_set *set); + +/* lov_obd.c */ +void lov_tgts_getref(struct obd_device *obd); +void lov_tgts_putref(struct obd_device *obd); +void lov_stripe_lock(struct lov_stripe_md *md); +void lov_stripe_unlock(struct lov_stripe_md *md); +void lov_fix_desc(struct lov_desc *desc); +void lov_fix_desc_stripe_size(__u64 *val); +void lov_fix_desc_stripe_count(__u32 *val); +void lov_fix_desc_pattern(__u32 *val); +void lov_fix_desc_qos_maxage(__u32 *val); +__u16 lov_get_stripe_count(struct lov_obd *lov, __u32 magic, + __u16 stripe_count); +int lov_connect_obd(struct obd_device *obd, u32 index, int 
activate, + struct obd_connect_data *data); +int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg); +int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg, + u32 *indexp, int *genp); +int lov_del_target(struct obd_device *obd, u32 index, + struct obd_uuid *uuidp, int gen); + +/* lov_pack.c */ +ssize_t lov_lsm_pack(const struct lov_stripe_md *lsm, void *buf, + size_t buf_size); +struct lov_stripe_md *lov_unpackmd(struct lov_obd *lov, void *buf, + size_t buf_size); +int lov_free_memmd(struct lov_stripe_md **lsmp); + +void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm); +void lov_dump_lmm_common(int level, void *lmmp); + +/* lov_ea.c */ +void lsm_free_plain(struct lov_stripe_md *lsm); +void dump_lsm(unsigned int level, const struct lov_stripe_md *lsm); + +/* lproc_lov.c */ +int lov_tunables_init(struct obd_device *obd); + +/* lov_cl.c */ +extern struct lu_device_type lov_device_type; + +#define LOV_MDC_TGT_MAX 256 + +/* high level pool methods */ +int lov_pool_new(struct obd_device *obd, char *poolname); +int lov_pool_del(struct obd_device *obd, char *poolname); +int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname); +int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname); + +static inline struct lov_stripe_md *lsm_addref(struct lov_stripe_md *lsm) +{ + LASSERT(atomic_read(&lsm->lsm_refc) > 0); + atomic_inc(&lsm->lsm_refc); + return lsm; +} + +static inline bool lov_oinfo_is_dummy(const struct lov_oinfo *loi) +{ + if (unlikely(loi->loi_oi.oi.oi_id == 0 && + loi->loi_oi.oi.oi_seq == 0 && + loi->loi_ost_idx == 0 && + loi->loi_ost_gen == 0)) + return true; + + return false; +} + +static inline struct obd_device *lov2obd(const struct lov_obd *lov) +{ + return container_of_safe(lov, struct obd_device, u.lov); +} + +static inline void lov_lsm2layout(struct lov_stripe_md *lsm, + struct lov_stripe_md_entry *lsme, + struct ost_layout *ol) +{ + ol->ol_stripe_size = lsme->lsme_stripe_size; + ol->ol_stripe_count = lsme->lsme_stripe_count; + if (lsm->lsm_magic == LOV_MAGIC_COMP_V1) { + ol->ol_comp_start = lsme->lsme_extent.e_start; + ol->ol_comp_end = lsme->lsme_extent.e_end; + ol->ol_comp_id = lsme->lsme_id; + } else { + ol->ol_comp_start = 0; + ol->ol_comp_end = 0; + ol->ol_comp_id = 0; + } +} + +struct pool_desc *lov_pool_find(struct obd_device *obd, char *poolname); +void lov_pool_putref(struct pool_desc *pool); +#endif diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_io.c b/drivers/staging/lustrefsx/lustre/lov/lov_io.c new file mode 100644 index 0000000000000..ce4fa30b84b6f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_io.c @@ -0,0 +1,1987 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Implementation of cl_io for LOV layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +static inline struct lov_io_sub *lov_sub_alloc(struct lov_io *lio, int index) +{ + struct lov_io_sub *sub; + + if (lio->lis_nr_subios == 0) { + LASSERT(lio->lis_single_subio_index == -1); + sub = &lio->lis_single_subio; + lio->lis_single_subio_index = index; + memset(sub, 0, sizeof(*sub)); + } else { + OBD_ALLOC_PTR(sub); + } + + if (sub) { + INIT_LIST_HEAD(&sub->sub_list); + INIT_LIST_HEAD(&sub->sub_linkage); + sub->sub_subio_index = index; + } + + return sub; +} + +static inline void lov_sub_free(struct lov_io *lio, struct lov_io_sub *sub) +{ + if (sub->sub_subio_index == lio->lis_single_subio_index) { + LASSERT(sub == &lio->lis_single_subio); + lio->lis_single_subio_index = -1; + } else { + OBD_FREE_PTR(sub); + } +} + +static void lov_io_sub_fini(const struct lu_env *env, struct lov_io *lio, + struct lov_io_sub *sub) +{ + ENTRY; + + cl_io_fini(sub->sub_env, &sub->sub_io); + + if (sub->sub_env && !IS_ERR(sub->sub_env)) { + cl_env_put(sub->sub_env, &sub->sub_refcheck); + sub->sub_env = NULL; + } + EXIT; +} + +static inline bool +is_index_within_mirror(struct lov_object *lov, int index, int mirror_index) +{ + struct lov_layout_composite *comp = &lov->u.composite; + struct lov_mirror_entry *lre = &comp->lo_mirrors[mirror_index]; + + return (index >= lre->lre_start && index <= lre->lre_end); +} + +static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio, + struct lov_io_sub *sub) +{ + struct lov_object *lov = lio->lis_object; + struct cl_io *sub_io; + struct cl_object *sub_obj; + struct cl_io *io = lio->lis_cl.cis_io; + int index = lov_comp_entry(sub->sub_subio_index); + int stripe = lov_comp_stripe(sub->sub_subio_index); + int result = 0; + LASSERT(sub->sub_env == NULL); + ENTRY; + + if (unlikely(!lov_r0(lov, index)->lo_sub || + !lov_r0(lov, index)->lo_sub[stripe])) + RETURN(-EIO); + + LASSERTF(ergo(lov_is_flr(lov), + is_index_within_mirror(lov, index, + lio->lis_mirror_index)), + DFID "iot = %d, index = %d, mirror = %d\n", + PFID(lu_object_fid(lov2lu(lov))), io->ci_type, index, + lio->lis_mirror_index); + + /* obtain new environment */ + sub->sub_env = cl_env_get(&sub->sub_refcheck); + if (IS_ERR(sub->sub_env)) { + result = PTR_ERR(sub->sub_env); + RETURN(result); + } + + sub_obj = lovsub2cl(lov_r0(lov, index)->lo_sub[stripe]); + sub_io = &sub->sub_io; + + sub_io->ci_obj = sub_obj; + sub_io->ci_result = 0; + + sub_io->ci_parent = io; + sub_io->ci_lockreq = io->ci_lockreq; + sub_io->ci_type = io->ci_type; + sub_io->ci_no_srvlock = io->ci_no_srvlock; + sub_io->ci_noatime = io->ci_noatime; + sub_io->ci_async_readahead = io->ci_async_readahead; + sub_io->ci_lock_no_expand = io->ci_lock_no_expand; + sub_io->ci_ndelay = io->ci_ndelay; + sub_io->ci_layout_version = io->ci_layout_version; + sub_io->ci_tried_all_mirrors = io->ci_tried_all_mirrors; + + result = cl_io_sub_init(sub->sub_env, sub_io, io->ci_type, sub_obj); + + if (result < 0) + 
lov_io_sub_fini(env, lio, sub); + + RETURN(result); +} + +struct lov_io_sub *lov_sub_get(const struct lu_env *env, + struct lov_io *lio, int index) +{ + struct lov_io_sub *sub; + int rc = 0; + + ENTRY; + + list_for_each_entry(sub, &lio->lis_subios, sub_list) { + if (sub->sub_subio_index == index) { + rc = 1; + break; + } + } + + if (rc == 0) { + sub = lov_sub_alloc(lio, index); + if (!sub) + GOTO(out, rc = -ENOMEM); + + rc = lov_io_sub_init(env, lio, sub); + if (rc < 0) { + lov_sub_free(lio, sub); + GOTO(out, rc); + } + + list_add_tail(&sub->sub_list, &lio->lis_subios); + lio->lis_nr_subios++; + } +out: + if (rc < 0) + sub = ERR_PTR(rc); + else + sub->sub_io.ci_noquota = lio->lis_cl.cis_io->ci_noquota; + RETURN(sub); +} + +/***************************************************************************** + * + * Lov io operations. + * + */ +static int lov_io_subio_init(const struct lu_env *env, struct lov_io *lio, + struct cl_io *io) +{ + ENTRY; + + LASSERT(lio->lis_object != NULL); + + INIT_LIST_HEAD(&lio->lis_subios); + lio->lis_single_subio_index = -1; + lio->lis_nr_subios = 0; + + RETURN(0); +} + +/** + * Decide if it will need write intent RPC + */ +static int lov_io_mirror_write_intent(struct lov_io *lio, + struct lov_object *obj, struct cl_io *io) +{ + struct lov_layout_composite *comp = &obj->u.composite; + struct lu_extent *ext = &io->ci_write_intent; + struct lov_mirror_entry *lre; + struct lov_mirror_entry *primary; + struct lov_layout_entry *lle; + size_t count = 0; + ENTRY; + + *ext = (typeof(*ext)) { lio->lis_pos, lio->lis_endpos }; + io->ci_need_write_intent = 0; + + if (!(io->ci_type == CIT_WRITE || cl_io_is_mkwrite(io) || + cl_io_is_fallocate(io) || cl_io_is_trunc(io))) + RETURN(0); + + /* + * FLR: check if it needs to send a write intent RPC to server. + * Writing to sync_pending file needs write intent RPC to change + * the file state back to write_pending, so that the layout version + * can be increased when the state changes to sync_pending at a later + * time. Otherwise there exists a chance that an evicted client may + * dirty the file data while resync client is working on it. + * Designated I/O is allowed for resync workload. 
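+	 *
+	 * In short (this mirrors the checks just below): an intent is
+	 * required in LCM_FL_RDONLY, and in LCM_FL_SYNC_PENDING unless
+	 * this is designated (resync) I/O; in LCM_FL_WRITE_PENDING it is
+	 * only required if a valid component of some non-primary mirror
+	 * overlaps the extent being written.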
+ */ + if (lov_flr_state(obj) == LCM_FL_RDONLY || + (lov_flr_state(obj) == LCM_FL_SYNC_PENDING && + io->ci_designated_mirror == 0)) { + io->ci_need_write_intent = 1; + RETURN(0); + } + + LASSERT((lov_flr_state(obj) == LCM_FL_WRITE_PENDING)); + LASSERT(comp->lo_preferred_mirror >= 0); + + /* + * need to iterate all components to see if there are + * multiple components covering the writing component + */ + primary = &comp->lo_mirrors[comp->lo_preferred_mirror]; + LASSERT(!primary->lre_stale); + lov_foreach_mirror_layout_entry(obj, lle, primary) { + LASSERT(lle->lle_valid); + if (!lu_extent_is_overlapped(ext, lle->lle_extent)) + continue; + + ext->e_start = min(ext->e_start, lle->lle_extent->e_start); + ext->e_end = max(ext->e_end, lle->lle_extent->e_end); + ++count; + } + if (count == 0) { + CERROR(DFID ": cannot find any valid components covering " + "file extent "DEXT", mirror: %d\n", + PFID(lu_object_fid(lov2lu(obj))), PEXT(ext), + primary->lre_mirror_id); + RETURN(-EIO); + } + + count = 0; + lov_foreach_mirror_entry(obj, lre) { + if (lre == primary) + continue; + + lov_foreach_mirror_layout_entry(obj, lle, lre) { + if (!lle->lle_valid) + continue; + + if (lu_extent_is_overlapped(ext, lle->lle_extent)) { + ++count; + break; + } + } + } + + CDEBUG(D_VFSTRACE, DFID "there are %zd components to be staled to " + "modify file extent "DEXT", iot: %d\n", + PFID(lu_object_fid(lov2lu(obj))), count, PEXT(ext), io->ci_type); + + io->ci_need_write_intent = count > 0; + + RETURN(0); +} + +static int lov_io_mirror_init(struct lov_io *lio, struct lov_object *obj, + struct cl_io *io) +{ + struct lov_layout_composite *comp = &obj->u.composite; + int index; + int i; + int result; + ENTRY; + + if (!lov_is_flr(obj)) { + /* only locks/pages are manipulated for CIT_MISC op, no + * cl_io_loop() will be called, don't check/set mirror info. + */ + if (io->ci_type != CIT_MISC) { + LASSERT(comp->lo_preferred_mirror == 0); + lio->lis_mirror_index = comp->lo_preferred_mirror; + } + io->ci_ndelay = 0; + RETURN(0); + } + + /* transfer the layout version for verification */ + if (io->ci_layout_version == 0) + io->ci_layout_version = obj->lo_lsm->lsm_layout_gen; + + /* find the corresponding mirror for designated mirror IO */ + if (io->ci_designated_mirror > 0) { + struct lov_mirror_entry *entry; + + LASSERT(!io->ci_ndelay); + + CDEBUG(D_LAYOUT, "designated I/O mirror state: %d\n", + lov_flr_state(obj)); + + if ((cl_io_is_trunc(io) || io->ci_type == CIT_WRITE || + cl_io_is_fallocate(io)) && + (io->ci_layout_version != obj->lo_lsm->lsm_layout_gen)) { + /* + * For resync I/O, the ci_layout_version was the layout + * version when resync starts. If it doesn't match the + * current object layout version, it means the layout + * has been changed + */ + RETURN(-ESTALE); + } + + io->ci_layout_version |= LU_LAYOUT_RESYNC; + + index = 0; + lio->lis_mirror_index = -1; + lov_foreach_mirror_entry(obj, entry) { + if (entry->lre_mirror_id == + io->ci_designated_mirror) { + lio->lis_mirror_index = index; + break; + } + + index++; + } + + RETURN(lio->lis_mirror_index < 0 ? 
-EINVAL : 0); + } + + result = lov_io_mirror_write_intent(lio, obj, io); + if (result) + RETURN(result); + + if (io->ci_need_write_intent) { + CDEBUG(D_VFSTRACE, DFID " need write intent for [%llu, %llu)\n", + PFID(lu_object_fid(lov2lu(obj))), + lio->lis_pos, lio->lis_endpos); + + if (cl_io_is_trunc(io)) { + /** + * for truncate, we uses [size, EOF) to judge whether + * a write intent needs to be send, but we need to + * restore the write extent to [0, size], in truncate, + * the byte in the size position is accessed. + */ + io->ci_write_intent.e_start = 0; + io->ci_write_intent.e_end = + io->u.ci_setattr.sa_attr.lvb_size + 1; + } + /* stop cl_io_init() loop */ + RETURN(1); + } + + if (io->ci_ndelay_tried == 0 || /* first time to try */ + /* reset the mirror index if layout has changed */ + lio->lis_mirror_layout_gen != obj->lo_lsm->lsm_layout_gen) { + lio->lis_mirror_layout_gen = obj->lo_lsm->lsm_layout_gen; + index = lio->lis_mirror_index = comp->lo_preferred_mirror; + } else { + index = lio->lis_mirror_index; + LASSERT(index >= 0); + + /* move mirror index to the next one */ + index = (index + 1) % comp->lo_mirror_count; + } + + for (i = 0; i < comp->lo_mirror_count; i++) { + struct lu_extent ext = { .e_start = lio->lis_pos, + .e_end = lio->lis_pos + 1 }; + struct lov_mirror_entry *lre; + struct lov_layout_entry *lle; + bool found = false; + + lre = &comp->lo_mirrors[(index + i) % comp->lo_mirror_count]; + if (!lre->lre_valid) + continue; + + if (lre->lre_foreign) + continue; + + lov_foreach_mirror_layout_entry(obj, lle, lre) { + if (!lle->lle_valid) + continue; + + if (lu_extent_is_overlapped(&ext, lle->lle_extent)) { + found = true; + break; + } + } /* each component of the mirror */ + if (found) { + index = (index + i) % comp->lo_mirror_count; + break; + } + } /* each mirror */ + + if (i == comp->lo_mirror_count) { + CERROR(DFID": failed to find a component covering " + "I/O region at %llu\n", + PFID(lu_object_fid(lov2lu(obj))), lio->lis_pos); + + dump_lsm(D_ERROR, obj->lo_lsm); + + RETURN(-EIO); + } + + CDEBUG(D_VFSTRACE, DFID ": flr state: %d, move mirror from %d to %d, " + "have retried: %d, mirror count: %d\n", + PFID(lu_object_fid(lov2lu(obj))), lov_flr_state(obj), + lio->lis_mirror_index, index, io->ci_ndelay_tried, + comp->lo_mirror_count); + + lio->lis_mirror_index = index; + + /* + * FLR: if all mirrors have been tried once, most likely the network + * of this client has been partitioned. We should relinquish CPU for + * a while before trying again. + */ + if (io->ci_ndelay && io->ci_ndelay_tried > 0 && + (io->ci_ndelay_tried % comp->lo_mirror_count == 0)) { + schedule_timeout_interruptible(cfs_time_seconds(1) / 100); + if (signal_pending(current)) + RETURN(-EINTR); + + /** + * we'd set ci_tried_all_mirrors to turn off fast mirror + * switching for read after we've tried all mirrors several + * rounds. + */ + io->ci_tried_all_mirrors = io->ci_ndelay_tried % + (comp->lo_mirror_count * 4) == 0; + } + ++io->ci_ndelay_tried; + + CDEBUG(D_VFSTRACE, "use %sdelayed RPC state for this IO\n", + io->ci_ndelay ? 
"non-" : ""); + + RETURN(0); +} + +static int lov_io_slice_init(struct lov_io *lio, + struct lov_object *obj, struct cl_io *io) +{ + int index; + int result = 0; + ENTRY; + + io->ci_result = 0; + lio->lis_object = obj; + lio->lis_cached_entry = LIS_CACHE_ENTRY_NONE; + + switch (io->ci_type) { + case CIT_READ: + case CIT_WRITE: + lio->lis_pos = io->u.ci_rw.crw_pos; + lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count; + lio->lis_io_endpos = lio->lis_endpos; + if (cl_io_is_append(io)) { + LASSERT(io->ci_type == CIT_WRITE); + + /* + * If there is LOV EA hole, then we may cannot locate + * the current file-tail exactly. + */ + if (unlikely(obj->lo_lsm->lsm_entries[0]->lsme_pattern & + LOV_PATTERN_F_HOLE)) + GOTO(out, result = -EIO); + + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; + } + break; + + case CIT_SETATTR: + if (cl_io_is_fallocate(io)) { + lio->lis_pos = io->u.ci_setattr.sa_falloc_offset; + lio->lis_endpos = io->u.ci_setattr.sa_falloc_end; + } else if (cl_io_is_trunc(io)) { + lio->lis_pos = io->u.ci_setattr.sa_attr.lvb_size; + lio->lis_endpos = OBD_OBJECT_EOF; + } else { + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; + } + break; + + case CIT_DATA_VERSION: + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; + break; + + case CIT_FAULT: { + pgoff_t index = io->u.ci_fault.ft_index; + + lio->lis_pos = cl_offset(io->ci_obj, index); + lio->lis_endpos = cl_offset(io->ci_obj, index + 1); + break; + } + + case CIT_FSYNC: { + lio->lis_pos = io->u.ci_fsync.fi_start; + lio->lis_endpos = io->u.ci_fsync.fi_end; + break; + } + + case CIT_LADVISE: { + lio->lis_pos = io->u.ci_ladvise.li_start; + lio->lis_endpos = io->u.ci_ladvise.li_end; + break; + } + + case CIT_LSEEK: { + lio->lis_pos = io->u.ci_lseek.ls_start; + lio->lis_endpos = OBD_OBJECT_EOF; + break; + } + + case CIT_GLIMPSE: + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; + + if (lov_flr_state(obj) == LCM_FL_RDONLY && + !OBD_FAIL_CHECK(OBD_FAIL_FLR_GLIMPSE_IMMUTABLE)) + /* SoM is accurate, no need glimpse */ + GOTO(out, result = 1); + break; + + case CIT_MISC: + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; + break; + + default: + LBUG(); + } + + /* + * CIT_MISC + ci_ignore_layout can identify the I/O from the OSC layer, + * it won't care/access lov layout related info. + */ + if (io->ci_ignore_layout && io->ci_type == CIT_MISC) + GOTO(out, result = 0); + + LASSERT(obj->lo_lsm != NULL); + + result = lov_io_mirror_init(lio, obj, io); + if (result) + GOTO(out, result); + + /* check if it needs to instantiate layout */ + if (!(io->ci_type == CIT_WRITE || cl_io_is_mkwrite(io) || + cl_io_is_fallocate(io) || + (cl_io_is_trunc(io) && io->u.ci_setattr.sa_attr.lvb_size > 0))) + GOTO(out, result = 0); + + /* + * for truncate, it only needs to instantiate the components + * before the truncated size. + */ + if (cl_io_is_trunc(io)) { + io->ci_write_intent.e_start = 0; + /* for writes, e_end is endpos, the location of the file + * pointer after the write is completed, so it is not accessed. + * For truncate, 'end' is the size, and *is* acccessed. + * In other words, writes are [start, end), but truncate is + * [start, size], where both are included. So add 1 to the + * size when creating the write intent to account for this. 
+		 */
+		io->ci_write_intent.e_end =
+			io->u.ci_setattr.sa_attr.lvb_size + 1;
+	} else {
+		io->ci_write_intent.e_start = lio->lis_pos;
+		io->ci_write_intent.e_end = lio->lis_endpos;
+	}
+
+	index = 0;
+	lov_foreach_io_layout(index, lio, &io->ci_write_intent) {
+		if (!lsm_entry_inited(obj->lo_lsm, index)) {
+			io->ci_need_write_intent = 1;
+			break;
+		}
+	}
+
+	if (io->ci_need_write_intent && io->ci_designated_mirror > 0) {
+		/*
+		 * REINT_SYNC RPC has already tried to instantiate all of the
+		 * components involved, obviously it didn't succeed. Skip this
+		 * mirror for now. The server won't be able to figure out
+		 * which mirror it should instantiate the components on.
+		 */
+		CERROR(DFID": trying to instantiate components for designated "
+		       "I/O, file state: %d\n",
+		       PFID(lu_object_fid(lov2lu(obj))), lov_flr_state(obj));
+
+		io->ci_need_write_intent = 0;
+		GOTO(out, result = -EIO);
+	}
+
+	if (io->ci_need_write_intent)
+		GOTO(out, result = 1);
+
+	EXIT;
+
+out:
+	return result;
+}
+
+static void lov_io_fini(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+	struct lov_io *lio = cl2lov_io(env, ios);
+	struct lov_object *lov = cl2lov(ios->cis_obj);
+	struct lov_io_sub *sub;
+
+	ENTRY;
+	LASSERT(list_empty(&lio->lis_active));
+
+	while ((sub = list_first_entry_or_null(&lio->lis_subios,
+					       struct lov_io_sub,
+					       sub_list)) != NULL) {
+		list_del_init(&sub->sub_list);
+		lio->lis_nr_subios--;
+
+		lov_io_sub_fini(env, lio, sub);
+		lov_sub_free(lio, sub);
+	}
+	LASSERT(lio->lis_nr_subios == 0);
+
+	LASSERT(atomic_read(&lov->lo_active_ios) > 0);
+	if (atomic_dec_and_test(&lov->lo_active_ios))
+		wake_up(&lov->lo_waitq);
+	EXIT;
+}
+
+static void lov_io_sub_inherit(struct lov_io_sub *sub, struct lov_io *lio,
+			       loff_t start, loff_t end)
+{
+	struct cl_io *io = &sub->sub_io;
+	struct lov_stripe_md *lsm = lio->lis_object->lo_lsm;
+	struct cl_io *parent = lio->lis_cl.cis_io;
+	int index = lov_comp_entry(sub->sub_subio_index);
+	int stripe = lov_comp_stripe(sub->sub_subio_index);
+
+	switch (io->ci_type) {
+	case CIT_SETATTR: {
+		io->u.ci_setattr.sa_attr = parent->u.ci_setattr.sa_attr;
+		io->u.ci_setattr.sa_attr_flags =
+			parent->u.ci_setattr.sa_attr_flags;
+		io->u.ci_setattr.sa_avalid = parent->u.ci_setattr.sa_avalid;
+		io->u.ci_setattr.sa_xvalid = parent->u.ci_setattr.sa_xvalid;
+		io->u.ci_setattr.sa_falloc_mode =
+			parent->u.ci_setattr.sa_falloc_mode;
+		io->u.ci_setattr.sa_stripe_index = stripe;
+		io->u.ci_setattr.sa_parent_fid =
+			parent->u.ci_setattr.sa_parent_fid;
+		/* For SETATTR(fallocate) pass the subtype to lower IO */
+		io->u.ci_setattr.sa_subtype = parent->u.ci_setattr.sa_subtype;
+		if (cl_io_is_fallocate(io)) {
+			io->u.ci_setattr.sa_falloc_offset = start;
+			io->u.ci_setattr.sa_falloc_end = end;
+			io->u.ci_setattr.sa_falloc_uid =
+				parent->u.ci_setattr.sa_falloc_uid;
+			io->u.ci_setattr.sa_falloc_gid =
+				parent->u.ci_setattr.sa_falloc_gid;
+		}
+		if (cl_io_is_trunc(io)) {
+			loff_t new_size = parent->u.ci_setattr.sa_attr.lvb_size;
+
+			new_size = lov_size_to_stripe(lsm, index, new_size,
+						      stripe);
+			io->u.ci_setattr.sa_attr.lvb_size = new_size;
+		}
+		lov_lsm2layout(lsm, lsm->lsm_entries[index],
+			       &io->u.ci_setattr.sa_layout);
+		break;
+	}
+	case CIT_DATA_VERSION: {
+		io->u.ci_data_version.dv_data_version = 0;
+		io->u.ci_data_version.dv_flags =
+			parent->u.ci_data_version.dv_flags;
+		break;
+	}
+	case CIT_FAULT: {
+		struct cl_object *obj = parent->ci_obj;
+		loff_t off = cl_offset(obj, parent->u.ci_fault.ft_index);
+
+		io->u.ci_fault = parent->u.ci_fault;
+		off = lov_size_to_stripe(lsm, index, off,
stripe); + io->u.ci_fault.ft_index = cl_index(obj, off); + break; + } + case CIT_FSYNC: { + io->u.ci_fsync.fi_start = start; + io->u.ci_fsync.fi_end = end; + io->u.ci_fsync.fi_fid = parent->u.ci_fsync.fi_fid; + io->u.ci_fsync.fi_mode = parent->u.ci_fsync.fi_mode; + break; + } + case CIT_READ: + case CIT_WRITE: { + io->u.ci_wr.wr_sync = cl_io_is_sync_write(parent); + io->ci_tried_all_mirrors = parent->ci_tried_all_mirrors; + if (cl_io_is_append(parent)) { + io->u.ci_wr.wr_append = 1; + } else { + io->u.ci_rw.crw_pos = start; + io->u.ci_rw.crw_count = end - start; + } + break; + } + case CIT_LADVISE: { + io->u.ci_ladvise.li_start = start; + io->u.ci_ladvise.li_end = end; + io->u.ci_ladvise.li_fid = parent->u.ci_ladvise.li_fid; + io->u.ci_ladvise.li_advice = parent->u.ci_ladvise.li_advice; + io->u.ci_ladvise.li_flags = parent->u.ci_ladvise.li_flags; + break; + } + case CIT_LSEEK: { + io->u.ci_lseek.ls_start = start; + io->u.ci_lseek.ls_whence = parent->u.ci_lseek.ls_whence; + io->u.ci_lseek.ls_result = parent->u.ci_lseek.ls_result; + break; + } + case CIT_GLIMPSE: + case CIT_MISC: + default: + break; + } +} + +static loff_t lov_offset_mod(loff_t val, int delta) +{ + if (val != OBD_OBJECT_EOF) + val += delta; + return val; +} + +static int lov_io_add_sub(const struct lu_env *env, struct lov_io *lio, + struct lov_io_sub *sub, u64 start, u64 end) +{ + int rc; + + end = lov_offset_mod(end, 1); + lov_io_sub_inherit(sub, lio, start, end); + rc = cl_io_iter_init(sub->sub_env, &sub->sub_io); + if (rc != 0) { + cl_io_iter_fini(sub->sub_env, &sub->sub_io); + return rc; + } + + list_add_tail(&sub->sub_linkage, &lio->lis_active); + + return rc; +} +static int lov_io_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; + bool is_trunc = cl_io_is_trunc(ios->cis_io); + struct lov_io_sub *sub; + struct lu_extent ext; + int index; + int rc = 0; + + ENTRY; + + ext.e_start = lio->lis_pos; + ext.e_end = lio->lis_endpos; + + if (is_trunc) { + OBD_ALLOC_PTR_ARRAY(lio->lis_trunc_stripe_index, + lio->lis_object->u.composite.lo_entry_count); + if (lio->lis_trunc_stripe_index == NULL) + RETURN(-ENOMEM); + } + + lov_foreach_io_layout(index, lio, &ext) { + struct lov_layout_entry *le = lov_entry(lio->lis_object, index); + struct lov_layout_raid0 *r0 = &le->lle_raid0; + u64 start; + u64 end; + int stripe; + bool tested_trunc_stripe = false; + + if (is_trunc) + lio->lis_trunc_stripe_index[index] = -1; + + CDEBUG(D_VFSTRACE, "component[%d] flags %#x\n", + index, lsm->lsm_entries[index]->lsme_flags); + if (!lsm_entry_inited(lsm, index)) { + /* + * Read from uninitialized components should return + * zero filled pages. + */ + continue; + } + + if (lsm_entry_is_foreign(lsm, index)) + continue; + + if (!le->lle_valid && !ios->cis_io->ci_designated_mirror) { + CERROR("I/O to invalid component: %d, mirror: %d\n", + index, lio->lis_mirror_index); + RETURN(-EIO); + } + + for (stripe = 0; stripe < r0->lo_nr; stripe++) { + if (!lov_stripe_intersects(lsm, index, stripe, + &ext, &start, &end)) + continue; + + if (unlikely(!r0->lo_sub[stripe])) { + if (ios->cis_io->ci_type == CIT_READ || + ios->cis_io->ci_type == CIT_WRITE || + ios->cis_io->ci_type == CIT_FAULT) + RETURN(-EIO); + + continue; + } + + if (is_trunc && !tested_trunc_stripe) { + int prev; + u64 tr_start; + + prev = (stripe == 0) ? 
r0->lo_nr - 1 :
+					       stripe - 1;
+				/**
+				 * Only involve the previous stripe if the
+				 * truncate point in this component falls at
+				 * the beginning of this stripe.
+				 */
+				tested_trunc_stripe = true;
+				if (ext.e_start < lsm->lsm_entries[index]->
+							lsme_extent.e_start) {
+					/* need previous stripe involvement */
+					lio->lis_trunc_stripe_index[index] = prev;
+				} else {
+					tr_start = ext.e_start;
+					tr_start = lov_do_div64(tr_start,
+						      stripe_width(lsm, index));
+					/* tr_start %= stripe_swidth */
+					if (tr_start == stripe * lsm->
+							lsm_entries[index]->
+							lsme_stripe_size)
+						lio->lis_trunc_stripe_index[index] = prev;
+				}
+			}
+
+			/* if the last stripe is the trunc stripeno */
+			if (is_trunc && lio->lis_trunc_stripe_index[index] == stripe)
+				lio->lis_trunc_stripe_index[index] = -1;
+
+			sub = lov_sub_get(env, lio,
+					  lov_comp_index(index, stripe));
+			if (IS_ERR(sub))
+				return PTR_ERR(sub);
+
+			rc = lov_io_add_sub(env, lio, sub, start, end);
+			if (rc != 0)
+				break;
+		}
+		if (rc != 0)
+			break;
+
+		if (is_trunc && lio->lis_trunc_stripe_index[index] != -1) {
+			stripe = lio->lis_trunc_stripe_index[index];
+			if (unlikely(!r0->lo_sub[stripe])) {
+				lio->lis_trunc_stripe_index[index] = -1;
+				continue;
+			}
+			sub = lov_sub_get(env, lio,
+					  lov_comp_index(index, stripe));
+			if (IS_ERR(sub))
+				return PTR_ERR(sub);
+
+			/**
+			 * The previous sub could already be in use by another
+			 * part of this truncate; skip it in that case.
+			 * LU-14128 happens when an expanding truncate plus a
+			 * read gets a wrong kms.
+			 */
+			if (!list_empty(&sub->sub_linkage)) {
+				lio->lis_trunc_stripe_index[index] = -1;
+				continue;
+			}
+
+			(void)lov_stripe_intersects(lsm, index, stripe, &ext,
+						    &start, &end);
+			rc = lov_io_add_sub(env, lio, sub, start, end);
+			if (rc != 0)
+				break;
+
+		}
+	}
+	RETURN(rc);
+}
+
+static int lov_io_rw_iter_init(const struct lu_env *env,
+			       const struct cl_io_slice *ios)
+{
+	struct lov_io *lio = cl2lov_io(env, ios);
+	struct cl_io *io = ios->cis_io;
+	struct lov_stripe_md_entry *lse;
+	loff_t start = io->u.ci_rw.crw_pos;
+	loff_t next;
+	int index;
+
+	LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
+	ENTRY;
+
+	if (cl_io_is_append(io))
+		RETURN(lov_io_iter_init(env, ios));
+
+	index = lov_io_layout_at(lio, io->u.ci_rw.crw_pos);
+	if (index < 0) { /* non-existing layout component */
+		if (io->ci_type == CIT_READ) {
+			/*
+			 * TODO: it needs to detect the next component and
+			 * then set the next pos
+			 */
+			io->ci_continue = 0;
+
+			RETURN(lov_io_iter_init(env, ios));
+		}
+
+		RETURN(-ENODATA);
+	}
+
+	if (!lov_entry(lio->lis_object, index)->lle_valid &&
+	    !io->ci_designated_mirror)
+		RETURN(io->ci_type == CIT_READ ?
-EAGAIN : -EIO); + + lse = lov_lse(lio->lis_object, index); + + if (lsme_is_foreign(lse)) + RETURN(-EINVAL); + + next = MAX_LFS_FILESIZE; + if (lse->lsme_stripe_count > 1) { + unsigned long ssize = lse->lsme_stripe_size; + + lov_do_div64(start, ssize); + next = (start + 1) * ssize; + if (next <= start * ssize) + next = MAX_LFS_FILESIZE; + } + + LASSERTF(io->u.ci_rw.crw_pos >= lse->lsme_extent.e_start, + "pos %lld, [%lld, %lld)\n", io->u.ci_rw.crw_pos, + lse->lsme_extent.e_start, lse->lsme_extent.e_end); + next = min_t(__u64, next, lse->lsme_extent.e_end); + next = min_t(loff_t, next, lio->lis_io_endpos); + + io->ci_continue = next < lio->lis_io_endpos; + io->u.ci_rw.crw_count = next - io->u.ci_rw.crw_pos; + lio->lis_pos = io->u.ci_rw.crw_pos; + lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count; + CDEBUG(D_VFSTRACE, + "stripe: %llu chunk: [%llu, %llu) %llu, %zd\n", + (__u64)start, lio->lis_pos, lio->lis_endpos, + (__u64)lio->lis_io_endpos, io->u.ci_rw.crw_count); + + /* + * XXX The following call should be optimized: we know, that + * [lio->lis_pos, lio->lis_endpos) intersects with exactly one stripe. + */ + RETURN(lov_io_iter_init(env, ios)); +} + +static int lov_io_setattr_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct cl_io *io = ios->cis_io; + int index; + ENTRY; + + if (cl_io_is_trunc(io) && lio->lis_pos > 0) { + index = lov_io_layout_at(lio, lio->lis_pos - 1); + /* no entry found for such offset */ + if (index < 0) + RETURN(io->ci_result = -ENODATA); + } + + RETURN(lov_io_iter_init(env, ios)); +} + +static int lov_io_call(const struct lu_env *env, struct lov_io *lio, + int (*iofunc)(const struct lu_env *, struct cl_io *)) +{ + struct cl_io *parent = lio->lis_cl.cis_io; + struct lov_io_sub *sub; + int rc = 0; + + ENTRY; + list_for_each_entry(sub, &lio->lis_active, sub_linkage) { + rc = iofunc(sub->sub_env, &sub->sub_io); + if (rc) + break; + + if (parent->ci_result == 0) + parent->ci_result = sub->sub_io.ci_result; + } + RETURN(rc); +} + +static int lov_io_lock(const struct lu_env *env, const struct cl_io_slice *ios) +{ + ENTRY; + RETURN(lov_io_call(env, cl2lov_io(env, ios), cl_io_lock)); +} + +static int lov_io_start(const struct lu_env *env, const struct cl_io_slice *ios) +{ + ENTRY; + RETURN(lov_io_call(env, cl2lov_io(env, ios), cl_io_start)); +} + +static int lov_io_end_wrapper(const struct lu_env *env, struct cl_io *io) +{ + ENTRY; + /* + * It's possible that lov_io_start() wasn't called against this + * sub-io, either because previous sub-io failed, or upper layer + * completed IO. + */ + if (io->ci_state == CIS_IO_GOING) + cl_io_end(env, io); + else + io->ci_state = CIS_IO_FINISHED; + RETURN(0); +} + +static int lov_io_iter_fini_wrapper(const struct lu_env *env, struct cl_io *io) +{ + cl_io_iter_fini(env, io); + RETURN(0); +} + +static int lov_io_unlock_wrapper(const struct lu_env *env, struct cl_io *io) +{ + cl_io_unlock(env, io); + RETURN(0); +} + +static void lov_io_end(const struct lu_env *env, const struct cl_io_slice *ios) +{ + int rc; + + /* Before ending each i/o, we must set lis_cached_entry to tell the + * next i/o not to use stale cached lis information. 
+ */ + cl2lov_io(env, ios)->lis_cached_entry = LIS_CACHE_ENTRY_NONE; + + rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_end_wrapper); + LASSERT(rc == 0); +} + +static void +lov_io_data_version_end(const struct lu_env *env, const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct cl_io *parent = lio->lis_cl.cis_io; + struct cl_data_version_io *pdv = &parent->u.ci_data_version; + struct lov_io_sub *sub; + + ENTRY; + list_for_each_entry(sub, &lio->lis_active, sub_linkage) { + struct cl_data_version_io *sdv = &sub->sub_io.u.ci_data_version; + + lov_io_end_wrapper(sub->sub_env, &sub->sub_io); + + pdv->dv_data_version += sdv->dv_data_version; + if (pdv->dv_layout_version > sdv->dv_layout_version) + pdv->dv_layout_version = sdv->dv_layout_version; + + if (parent->ci_result == 0) + parent->ci_result = sub->sub_io.ci_result; + } + + EXIT; +} + +static void lov_io_iter_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + int rc; + + ENTRY; + + if (lio->lis_trunc_stripe_index != NULL) + OBD_FREE_PTR_ARRAY(lio->lis_trunc_stripe_index, + lio->lis_object->u.composite.lo_entry_count); + lio->lis_trunc_stripe_index = NULL; + + rc = lov_io_call(env, lio, lov_io_iter_fini_wrapper); + LASSERT(rc == 0); + while (!list_empty(&lio->lis_active)) + list_del_init(lio->lis_active.next); + EXIT; +} + +static void lov_io_unlock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + int rc; + + ENTRY; + rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_unlock_wrapper); + LASSERT(rc == 0); + EXIT; +} + +static int lov_io_read_ahead(const struct lu_env *env, + const struct cl_io_slice *ios, + pgoff_t start, struct cl_read_ahead *ra) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_object *loo = lio->lis_object; + struct cl_object *obj = lov2cl(loo); + struct lov_layout_raid0 *r0; + struct lov_io_sub *sub; + loff_t offset; + loff_t suboff; + pgoff_t ra_end; + unsigned int pps; /* pages per stripe */ + int stripe; + int index; + int rc; + ENTRY; + + offset = cl_offset(obj, start); + index = lov_io_layout_at(lio, offset); + if (index < 0 || !lsm_entry_inited(loo->lo_lsm, index) || + lsm_entry_is_foreign(loo->lo_lsm, index)) + RETURN(-ENODATA); + + /* avoid readahead to expand to stale components */ + if (!lov_entry(loo, index)->lle_valid) + RETURN(-EIO); + + stripe = lov_stripe_number(loo->lo_lsm, index, offset); + + r0 = lov_r0(loo, index); + if (unlikely(!r0->lo_sub[stripe])) + RETURN(-EIO); + + sub = lov_sub_get(env, lio, lov_comp_index(index, stripe)); + if (IS_ERR(sub)) + RETURN(PTR_ERR(sub)); + + lov_stripe_offset(loo->lo_lsm, index, offset, stripe, &suboff); + rc = cl_io_read_ahead(sub->sub_env, &sub->sub_io, + cl_index(lovsub2cl(r0->lo_sub[stripe]), suboff), + ra); + + CDEBUG(D_READA, DFID " cra_end = %lu, stripes = %d, rc = %d\n", + PFID(lu_object_fid(lov2lu(loo))), ra->cra_end_idx, + r0->lo_nr, rc); + if (rc != 0) + RETURN(rc); + + /** + * Adjust the stripe index by layout of comp. ra->cra_end is the + * maximum page index covered by an underlying DLM lock. + * This function converts cra_end from stripe level to file level, and + * make sure it's not beyond stripe and component boundary. 
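+	 * For example (illustrative numbers, not from this patch): with 4
+	 * stripes of 256 pages each, stripe-level page 300 of stripe 1
+	 * lies in chunk 1 of that stripe object, so the file-level index
+	 * is (1 * 4 + 1) * 256 + 44 = 1324.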
+ */ + + /* cra_end is stripe level, convert it into file level */ + ra_end = ra->cra_end_idx; + if (ra_end != CL_PAGE_EOF) + ra->cra_end_idx = lov_stripe_pgoff(loo->lo_lsm, index, + ra_end, stripe); + + /* boundary of current component */ + ra_end = cl_index(obj, (loff_t)lov_io_extent(lio, index)->e_end); + if (ra_end != CL_PAGE_EOF && ra->cra_end_idx >= ra_end) + ra->cra_end_idx = ra_end - 1; + + if (r0->lo_nr == 1) /* single stripe file */ + RETURN(0); + + pps = lov_lse(loo, index)->lsme_stripe_size >> PAGE_SHIFT; + + CDEBUG(D_READA, DFID " max_index = %lu, pps = %u, index = %d, " + "stripe_size = %u, stripe no = %u, start index = %lu\n", + PFID(lu_object_fid(lov2lu(loo))), ra->cra_end_idx, pps, index, + lov_lse(loo, index)->lsme_stripe_size, stripe, start); + + /* never exceed the end of the stripe */ + ra->cra_end_idx = min_t(pgoff_t, ra->cra_end_idx, + start + pps - start % pps - 1); + RETURN(0); +} + +int lov_io_lru_reserve(const struct lu_env *env, + const struct cl_io_slice *ios, loff_t pos, size_t bytes) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; + struct lov_io_sub *sub; + struct lu_extent ext; + int index; + int rc = 0; + + ENTRY; + + ext.e_start = pos; + ext.e_end = pos + bytes; + lov_foreach_io_layout(index, lio, &ext) { + struct lov_layout_entry *le = lov_entry(lio->lis_object, index); + struct lov_layout_raid0 *r0 = &le->lle_raid0; + u64 start; + u64 end; + int stripe; + + if (!lsm_entry_inited(lsm, index)) + continue; + + if (!le->lle_valid && !ios->cis_io->ci_designated_mirror) { + CERROR(DFID": I/O to invalid component: %d, mirror: %d\n", + PFID(lu_object_fid(lov2lu(lio->lis_object))), + index, lio->lis_mirror_index); + RETURN(-EIO); + } + + for (stripe = 0; stripe < r0->lo_nr; stripe++) { + if (!lov_stripe_intersects(lsm, index, stripe, + &ext, &start, &end)) + continue; + + if (unlikely(!r0->lo_sub[stripe])) + RETURN(-EIO); + + sub = lov_sub_get(env, lio, + lov_comp_index(index, stripe)); + if (IS_ERR(sub)) + return PTR_ERR(sub); + + rc = cl_io_lru_reserve(sub->sub_env, &sub->sub_io, start, + end - start + 1); + if (rc != 0) + RETURN(rc); + } + } + + RETURN(0); +} + +/** + * lov implementation of cl_operations::cio_submit() method. It takes a list + * of pages in \a queue, splits it into per-stripe sub-lists, invokes + * cl_io_submit() on underlying devices to submit sub-lists, and then splices + * everything back. + * + * Major complication of this function is a need to handle memory cleansing: + * cl_io_submit() is called to write out pages as a part of VM memory + * reclamation, and hence it may not fail due to memory shortages (system + * dead-locks otherwise). To deal with this, some resources (sub-lists, + * sub-environment, etc.) are allocated per-device on "startup" (i.e., in a + * not-memory cleansing context), and in case of memory shortage, these + * pre-allocated resources are used by lov_io_submit() under + * lov_device::ld_mutex mutex. 
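+ * A practical consequence is that, under memory pressure, submission
+ * for different stripes may serialize on lov_device::ld_mutex; that
+ * merely slows the rare cleansing path rather than deadlocking it.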
+ */ +static int lov_io_submit(const struct lu_env *env, + const struct cl_io_slice *ios, + enum cl_req_type crt, struct cl_2queue *queue) +{ + struct cl_page_list *qin = &queue->c2_qin; + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_io_sub *sub; + struct cl_page_list *plist = &lov_env_info(env)->lti_plist; + struct cl_page *page = cl_page_list_first(qin); + struct cl_page *tmp; + bool dio = false; + int index; + int rc = 0; + ENTRY; + + if (page->cp_type == CPT_TRANSIENT) + dio = true; + + cl_page_list_init(plist); + while (qin->pl_nr > 0) { + struct cl_2queue *cl2q = &lov_env_info(env)->lti_cl2q; + + page = cl_page_list_first(qin); + if (lov_page_is_empty(page)) { + cl_page_list_move(&queue->c2_qout, qin, page); + + /* + * it could only be mirror read to get here therefore + * the pages will be transient. We don't care about + * the return code of cl_page_prep() at all. + */ + (void) cl_page_prep(env, ios->cis_io, page, crt); + cl_page_completion(env, page, crt, 0); + continue; + } + + cl_2queue_init(cl2q); + cl_page_list_move(&cl2q->c2_qin, qin, page); + + index = page->cp_lov_index; + /* DIO is already split by stripe */ + if (!dio) { + cl_page_list_for_each_safe(page, tmp, qin) { + /* this page is not on this stripe */ + if (index != page->cp_lov_index) + continue; + + cl_page_list_move(&cl2q->c2_qin, qin, page); + } + } else { + cl_page_list_splice(qin, &cl2q->c2_qin); + } + + sub = lov_sub_get(env, lio, index); + if (!IS_ERR(sub)) { + rc = cl_io_submit_rw(sub->sub_env, &sub->sub_io, + crt, cl2q); + } else { + rc = PTR_ERR(sub); + } + + cl_page_list_splice(&cl2q->c2_qin, plist); + cl_page_list_splice(&cl2q->c2_qout, &queue->c2_qout); + cl_2queue_fini(env, cl2q); + + if (rc != 0) + break; + } + + cl_page_list_splice(plist, qin); + cl_page_list_fini(env, plist); + + RETURN(rc); +} + +static int lov_io_commit_async(const struct lu_env *env, + const struct cl_io_slice *ios, + struct cl_page_list *queue, int from, int to, + cl_commit_cbt cb) +{ + struct cl_page_list *plist = &lov_env_info(env)->lti_plist; + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_io_sub *sub; + struct cl_page *page; + int rc = 0; + ENTRY; + + if (lio->lis_nr_subios == 1) { + int idx = lio->lis_single_subio_index; + + LASSERT(!lov_page_is_empty(cl_page_list_first(queue))); + + sub = lov_sub_get(env, lio, idx); + LASSERT(!IS_ERR(sub)); + LASSERT(sub == &lio->lis_single_subio); + rc = cl_io_commit_async(sub->sub_env, &sub->sub_io, queue, + from, to, cb); + RETURN(rc); + } + + cl_page_list_init(plist); + while (queue->pl_nr > 0) { + int stripe_to = to; + int index; + + LASSERT(plist->pl_nr == 0); + page = cl_page_list_first(queue); + LASSERT(!lov_page_is_empty(page)); + + cl_page_list_move(plist, queue, page); + + index = page->cp_lov_index; + while (queue->pl_nr > 0) { + page = cl_page_list_first(queue); + if (index != page->cp_lov_index) + break; + + cl_page_list_move(plist, queue, page); + } + + if (queue->pl_nr > 0) /* still has more pages */ + stripe_to = PAGE_SIZE; + + sub = lov_sub_get(env, lio, index); + if (!IS_ERR(sub)) { + rc = cl_io_commit_async(sub->sub_env, &sub->sub_io, + plist, from, stripe_to, cb); + } else { + rc = PTR_ERR(sub); + break; + } + + if (plist->pl_nr > 0) /* short write */ + break; + + from = 0; + + if (lov_comp_entry(index) != + lov_comp_entry(page->cp_lov_index)) + cl_io_extent_release(sub->sub_env, &sub->sub_io); + } + + /* for error case, add the page back into the qin list */ + LASSERT(ergo(rc == 0, plist->pl_nr == 0)); + while (plist->pl_nr > 0) { + /* error 
occurred, add the uncommitted pages back into queue */ + page = cl_page_list_last(plist); + cl_page_list_move_head(queue, plist, page); + } + + RETURN(rc); +} + +static int lov_io_fault_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_fault_io *fio; + struct lov_io *lio; + struct lov_io_sub *sub; + loff_t offset; + int entry; + int stripe; + + ENTRY; + + fio = &ios->cis_io->u.ci_fault; + lio = cl2lov_io(env, ios); + + /** + * LU-14502: ft_page could be an existing cl_page associated with + * the vmpage covering the fault index, and the page may still + * refer to another mirror of an old IO. + */ + if (lov_is_flr(lio->lis_object)) { + offset = cl_offset(ios->cis_obj, fio->ft_index); + entry = lov_io_layout_at(lio, offset); + if (entry < 0) { + CERROR(DFID": page fault index %lu invalid component: " + "%d, mirror: %d\n", + PFID(lu_object_fid(&ios->cis_obj->co_lu)), + fio->ft_index, entry, + lio->lis_mirror_index); + RETURN(-EIO); + } + stripe = lov_stripe_number(lio->lis_object->lo_lsm, + entry, offset); + + if (fio->ft_page->cp_lov_index != + lov_comp_index(entry, stripe)) { + CDEBUG(D_INFO, DFID": page fault at index %lu, " + "at mirror %u comp entry %u stripe %u, " + "been used with comp entry %u stripe %u\n", + PFID(lu_object_fid(&ios->cis_obj->co_lu)), + fio->ft_index, lio->lis_mirror_index, + entry, stripe, + lov_comp_entry(fio->ft_page->cp_lov_index), + lov_comp_stripe(fio->ft_page->cp_lov_index)); + + fio->ft_page->cp_lov_index = + lov_comp_index(entry, stripe); + } + } + + sub = lov_sub_get(env, lio, fio->ft_page->cp_lov_index); + sub->sub_io.u.ci_fault.ft_nob = fio->ft_nob; + + RETURN(lov_io_start(env, ios)); +} + +static int lov_io_setattr_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct cl_io *parent = ios->cis_io; + struct lov_io_sub *sub; + struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; + + ENTRY; + + if (cl_io_is_fallocate(parent)) { + list_for_each_entry(sub, &lio->lis_active, sub_linkage) { + loff_t size = parent->u.ci_setattr.sa_attr.lvb_size; + int index = lov_comp_entry(sub->sub_subio_index); + int stripe = lov_comp_stripe(sub->sub_subio_index); + + size = lov_size_to_stripe(lsm, index, size, stripe); + sub->sub_io.u.ci_setattr.sa_attr.lvb_size = size; + sub->sub_io.u.ci_setattr.sa_avalid = + parent->u.ci_setattr.sa_avalid; + } + } + + RETURN(lov_io_start(env, ios)); +} + +static void lov_io_fsync_end(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_io_sub *sub; + unsigned int *written = &ios->cis_io->u.ci_fsync.fi_nr_written; + ENTRY; + + *written = 0; + list_for_each_entry(sub, &lio->lis_active, sub_linkage) { + struct cl_io *subio = &sub->sub_io; + + lov_io_end_wrapper(sub->sub_env, subio); + + if (subio->ci_result == 0) + *written += subio->u.ci_fsync.fi_nr_written; + } + RETURN_EXIT; +} + +static void lov_io_lseek_end(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct cl_io *io = lio->lis_cl.cis_io; + struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; + struct lov_io_sub *sub; + loff_t offset = -ENXIO; + __u64 hole_off = 0; + bool seek_hole = io->u.ci_lseek.ls_whence == SEEK_HOLE; + + ENTRY; + + list_for_each_entry(sub, &lio->lis_active, sub_linkage) { + struct cl_io *subio = &sub->sub_io; + int index = lov_comp_entry(sub->sub_subio_index); + int stripe = lov_comp_stripe(sub->sub_subio_index); + loff_t sub_off, lov_off; + 
__u64 comp_end = lsm->lsm_entries[index]->lsme_extent.e_end;
+
+		lov_io_end_wrapper(sub->sub_env, subio);
+
+		if (io->ci_result == 0)
+			io->ci_result = sub->sub_io.ci_result;
+
+		if (io->ci_result)
+			continue;
+
+		CDEBUG(D_INFO, DFID": entry %x stripe %u: SEEK_%s from %lld\n",
+		       PFID(lu_object_fid(lov2lu(lio->lis_object))),
+		       index, stripe, seek_hole ? "HOLE" : "DATA",
+		       subio->u.ci_lseek.ls_start);
+
+		/* first subio with positive result is what we need */
+		sub_off = subio->u.ci_lseek.ls_result;
+		/* Expected error, offset is out of stripe file size */
+		if (sub_off == -ENXIO)
+			continue;
+		/* Any other errors are not expected with ci_result == 0 */
+		if (sub_off < 0) {
+			CDEBUG(D_INFO, "unexpected error: rc = %lld\n",
+			       sub_off);
+			io->ci_result = sub_off;
+			continue;
+		}
+		lov_off = lov_stripe_size(lsm, index, sub_off + 1, stripe) - 1;
+		if (lov_off < 0) {
+			/* the only way to get a negative lov_off here is a
+			 * result that is too big. Return -EOVERFLOW then.
+			 */
+			io->ci_result = -EOVERFLOW;
+			CDEBUG(D_INFO, "offset %llu is too big: rc = %d\n",
+			       (u64)lov_off, io->ci_result);
+			continue;
+		}
+		if (lov_off < io->u.ci_lseek.ls_start) {
+			io->ci_result = -EINVAL;
+			CDEBUG(D_INFO, "offset %lld < start %lld: rc = %d\n",
+			       sub_off, io->u.ci_lseek.ls_start, io->ci_result);
+			continue;
+		}
+		/* resulting offset can be out of component range if stripe
+		 * object is full and its file size was returned as virtual
+		 * hole start. Skip this result, the next component will give
+		 * us a correct lseek result, but keep the possible hole
+		 * offset in case there are no more components ahead
+		 */
+		if (lov_off >= comp_end) {
+			/* must be SEEK_HOLE case */
+			if (likely(seek_hole)) {
+				/* save comp end as potential hole offset */
+				hole_off = max_t(__u64, comp_end, hole_off);
+			} else {
+				io->ci_result = -EINVAL;
+				CDEBUG(D_INFO,
+				       "off %lld >= comp_end %llu: rc = %d\n",
+				       lov_off, comp_end, io->ci_result);
+			}
+			continue;
+		}
+
+		CDEBUG(D_INFO, "SEEK_%s: %lld->%lld/%lld: rc = %d\n",
+		       seek_hole ?
"HOLE" : "DATA", + subio->u.ci_lseek.ls_start, sub_off, lov_off, + sub->sub_io.ci_result); + offset = min_t(__u64, offset, lov_off); + } + /* no result but some component returns hole as component end */ + if (seek_hole && offset == -ENXIO && hole_off > 0) + offset = hole_off; + + io->u.ci_lseek.ls_result = offset; + RETURN_EXIT; +} + +static const struct cl_io_operations lov_io_ops = { + .op = { + [CIT_READ] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_rw_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_end + }, + [CIT_WRITE] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_rw_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_end + }, + [CIT_SETATTR] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_setattr_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_setattr_start, + .cio_end = lov_io_end + }, + [CIT_DATA_VERSION] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_data_version_end, + }, + [CIT_FAULT] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_fault_start, + .cio_end = lov_io_end + }, + [CIT_FSYNC] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_fsync_end + }, + [CIT_LADVISE] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_end + }, + [CIT_LSEEK] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_lseek_end + }, + [CIT_GLIMPSE] = { + .cio_fini = lov_io_fini, + }, + [CIT_MISC] = { + .cio_fini = lov_io_fini + } + }, + .cio_read_ahead = lov_io_read_ahead, + .cio_lru_reserve = lov_io_lru_reserve, + .cio_submit = lov_io_submit, + .cio_commit_async = lov_io_commit_async, +}; + +/***************************************************************************** + * + * Empty lov io operations. + * + */ + +static void lov_empty_io_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_object *lov = cl2lov(ios->cis_obj); + ENTRY; + + if (atomic_dec_and_test(&lov->lo_active_ios)) + wake_up(&lov->lo_waitq); + EXIT; +} + +static int lov_empty_io_submit(const struct lu_env *env, + const struct cl_io_slice *ios, + enum cl_req_type crt, struct cl_2queue *queue) +{ + return -EBADF; +} + +static void lov_empty_impossible(const struct lu_env *env, + struct cl_io_slice *ios) +{ + LBUG(); +} + +#define LOV_EMPTY_IMPOSSIBLE ((void *)lov_empty_impossible) + +/** + * An io operation vector for files without stripes. 
+ */ +static const struct cl_io_operations lov_empty_io_ops = { + .op = { + [CIT_READ] = { + .cio_fini = lov_empty_io_fini, +#if 0 + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE +#endif + }, + [CIT_WRITE] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, + [CIT_SETATTR] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, + [CIT_FAULT] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, + [CIT_FSYNC] = { + .cio_fini = lov_empty_io_fini + }, + [CIT_LADVISE] = { + .cio_fini = lov_empty_io_fini + }, + [CIT_GLIMPSE] = { + .cio_fini = lov_empty_io_fini + }, + [CIT_MISC] = { + .cio_fini = lov_empty_io_fini + } + }, + .cio_submit = lov_empty_io_submit, + .cio_commit_async = LOV_EMPTY_IMPOSSIBLE +}; + +int lov_io_init_composite(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct lov_io *lio = lov_env_io(env); + struct lov_object *lov = cl2lov(obj); + int result; + + ENTRY; + + INIT_LIST_HEAD(&lio->lis_active); + result = lov_io_slice_init(lio, lov, io); + if (result) + GOTO(out, result); + + result = lov_io_subio_init(env, lio, io); + if (!result) { + cl_io_slice_add(io, &lio->lis_cl, obj, &lov_io_ops); + atomic_inc(&lov->lo_active_ios); + } + EXIT; +out: + io->ci_result = result < 0 ? result : 0; + return result; +} + +int lov_io_init_empty(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_io *lio = lov_env_io(env); + int result; + ENTRY; + + lio->lis_object = lov; + switch (io->ci_type) { + default: + LBUG(); + case CIT_MISC: + case CIT_GLIMPSE: + case CIT_READ: + result = 0; + break; + case CIT_FSYNC: + case CIT_LADVISE: + case CIT_LSEEK: + case CIT_SETATTR: + case CIT_DATA_VERSION: + result = +1; + break; + case CIT_WRITE: + result = -EBADF; + break; + case CIT_FAULT: + result = -EFAULT; + CERROR("Page fault on a file without stripes: "DFID"\n", + PFID(lu_object_fid(&obj->co_lu))); + break; + } + if (result == 0) { + cl_io_slice_add(io, &lio->lis_cl, obj, &lov_empty_io_ops); + atomic_inc(&lov->lo_active_ios); + } + + io->ci_result = result < 0 ? 
result : 0; + RETURN(result); +} + +int lov_io_init_released(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_io *lio = lov_env_io(env); + int result; + ENTRY; + + LASSERT(lov->lo_lsm != NULL); + lio->lis_object = lov; + + switch (io->ci_type) { + default: + LASSERTF(0, "invalid type %d\n", io->ci_type); + result = -EOPNOTSUPP; + break; + case CIT_GLIMPSE: + case CIT_MISC: + case CIT_FSYNC: + case CIT_LADVISE: + case CIT_DATA_VERSION: + result = 1; + break; + case CIT_SETATTR: + /* + * the truncate to 0 is managed by MDT: + * - in open, for open O_TRUNC + * - in setattr, for truncate + */ + /* + * the truncate is for size > 0 so triggers a restore, + * also trigger a restore for prealloc/punch + */ + if (cl_io_is_trunc(io) || cl_io_is_fallocate(io)) { + io->ci_restore_needed = 1; + result = -ENODATA; + } else + result = 1; + break; + case CIT_READ: + case CIT_WRITE: + case CIT_FAULT: + case CIT_LSEEK: + io->ci_restore_needed = 1; + result = -ENODATA; + break; + } + + if (result == 0) { + cl_io_slice_add(io, &lio->lis_cl, obj, &lov_empty_io_ops); + atomic_inc(&lov->lo_active_ios); + } + + io->ci_result = result < 0 ? result : 0; + RETURN(result); +} + +/** + * Return the index in composite:lo_entries by the file offset + */ +int lov_io_layout_at(struct lov_io *lio, __u64 offset) +{ + struct lov_object *lov = lio->lis_object; + struct lov_layout_composite *comp = &lov->u.composite; + int start_index = 0; + int end_index = comp->lo_entry_count - 1; + int i; + + LASSERT(lov->lo_type == LLT_COMP); + + /* This is actual file offset so nothing can cover eof. */ + if (offset == LUSTRE_EOF) + return -1; + + if (lov_is_flr(lov)) { + struct lov_mirror_entry *lre; + + LASSERT(lio->lis_mirror_index >= 0); + + lre = &comp->lo_mirrors[lio->lis_mirror_index]; + start_index = lre->lre_start; + end_index = lre->lre_end; + } + + for (i = start_index; i <= end_index; i++) { + struct lov_layout_entry *lle = lov_entry(lov, i); + + LASSERT(!lsme_is_foreign(lle->lle_lsme)); + + if ((offset >= lle->lle_extent->e_start && + offset < lle->lle_extent->e_end) || + (offset == OBD_OBJECT_EOF && + lle->lle_extent->e_end == OBD_OBJECT_EOF)) + return i; + } + + return -1; +} + +/** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_lock.c b/drivers/staging/lustrefsx/lustre/lov/lov_lock.c new file mode 100644 index 0000000000000..40777f3921586 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_lock.c @@ -0,0 +1,382 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * Implementation of cl_lock for LOV layer.
+ *
+ * Author: Nikita Danilov
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ * @{
+ */
+
+/*****************************************************************************
+ *
+ * Lov lock operations.
+ *
+ */
+
+static struct lov_sublock_env *lov_sublock_env_get(const struct lu_env *env,
+						   const struct cl_lock *parent,
+						   struct lov_lock_sub *lls)
+{
+	struct lov_sublock_env *subenv;
+	struct lov_io *lio = lov_env_io(env);
+	struct cl_io *io = lio->lis_cl.cis_io;
+	struct lov_io_sub *sub;
+
+	subenv = &lov_env_session(env)->ls_subenv;
+
+	/*
+	 * FIXME: We tend to use the subio's env & io to call the sublock
+	 * lock operations because the osc lock sometimes stores some
+	 * control variables in the thread's IO information (currently only
+	 * lockless information). However, if the lock's host (object) is
+	 * different from the object for the current IO, we have no way to
+	 * get the subenv and subio because they are not initialized at
+	 * all. As a temporary fix, in this case, we still borrow the
+	 * parent's env to call sublock operations.
+	 */
+	if (!io || !cl_object_same(io->ci_obj, parent->cll_descr.cld_obj)) {
+		subenv->lse_env = env;
+		subenv->lse_io = io;
+	} else {
+		sub = lov_sub_get(env, lio, lls->sub_index);
+		if (!IS_ERR(sub)) {
+			subenv->lse_env = sub->sub_env;
+			subenv->lse_io = &sub->sub_io;
+		} else {
+			subenv = (void *)sub;
+		}
+	}
+	return subenv;
+}
+
+static int lov_sublock_init(const struct lu_env *env,
+			    const struct cl_lock *parent,
+			    struct lov_lock_sub *lls)
+{
+	struct lov_sublock_env *subenv;
+	int result;
+
+	ENTRY;
+
+	subenv = lov_sublock_env_get(env, parent, lls);
+	if (!IS_ERR(subenv)) {
+		result = cl_lock_init(subenv->lse_env, &lls->sub_lock,
+				      subenv->lse_io);
+	} else {
+		/* an error occurred */
+		result = PTR_ERR(subenv);
+	}
+	RETURN(result);
+}
+
+/**
+ * Creates sub-locks for a given lov_lock for the first time.
+ *
+ * Goes through all sub-objects of top-object, and creates sub-locks on every
+ * sub-object intersecting with top-lock extent. This is complicated by the
+ * fact that top-lock (that is being created) can be accessed concurrently
+ * through already created sub-locks (possibly shared with other top-locks).
+ */
+static struct lov_lock *lov_lock_sub_init(const struct lu_env *env,
+					  const struct cl_io *io,
+					  const struct cl_object *obj,
+					  struct cl_lock *lock)
+{
+	struct lov_object *lov = cl2lov(obj);
+	struct lov_io *lio = lov_env_io(env);
+	bool is_trunc = cl_io_is_trunc(io);
+	struct lov_lock *lovlck;
+	struct lu_extent ext;
+	loff_t start;
+	loff_t end;
+	int result = 0;
+	int i;
+	int index;
+	int nr;
+
+	ENTRY;
+
+	LASSERT(ergo(is_trunc, lio->lis_trunc_stripe_index != NULL));
+
+	ext.e_start = cl_offset(obj, lock->cll_descr.cld_start);
+	if (lock->cll_descr.cld_end == CL_PAGE_EOF)
+		ext.e_end = OBD_OBJECT_EOF;
+	else
+		ext.e_end = cl_offset(obj, lock->cll_descr.cld_end + 1);
+
+	nr = 0;
+	lov_foreach_io_layout(index, lio, &ext) {
+		struct lov_layout_raid0 *r0 = lov_r0(lov, index);
+
+		for (i = 0; i < r0->lo_nr; i++) {
+			if (likely(r0->lo_sub[i])) { /* spare layout */
+				if (lov_stripe_intersects(lov->lo_lsm, index,
+							  i, &ext, &start,
+							  &end) ||
+				    (is_trunc &&
+				     i == lio->lis_trunc_stripe_index[index]))
+					nr++;
+			}
+		}
+	}
+	/**
+	 * An aggressive lock request (from cl_setattr_ost) which asks for
+	 * an [eof, -1) lock could come across an uninstantiated layout
+	 * extent, hence a 0 nr is possible.
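+	 * In that case an empty lov_lock (lls_nr == 0) is still allocated
+	 * below, and enqueue/cancel over it degenerate to no-ops.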
+ */ + + OBD_ALLOC_LARGE(lovlck, offsetof(struct lov_lock, lls_sub[nr])); + if (!lovlck) + RETURN(ERR_PTR(-ENOMEM)); + + lovlck->lls_nr = nr; + nr = 0; + lov_foreach_io_layout(index, lov_env_io(env), &ext) { + struct lov_layout_raid0 *r0 = lov_r0(lov, index); + + for (i = 0; i < r0->lo_nr; ++i) { + struct lov_lock_sub *lls; + struct cl_lock_descr *descr; + + if (unlikely(!r0->lo_sub[i])) + continue; + + if (lov_stripe_intersects(lov->lo_lsm, index, i, &ext, &start, &end) || + (is_trunc && i == lio->lis_trunc_stripe_index[index])) + goto init_sublock; + + continue; +init_sublock: + LASSERT(nr < lovlck->lls_nr); + lls = &lovlck->lls_sub[nr]; + descr = &lls->sub_lock.cll_descr; + LASSERT(descr->cld_obj == NULL); + descr->cld_obj = lovsub2cl(r0->lo_sub[i]); + descr->cld_start = cl_index(descr->cld_obj, start); + descr->cld_end = cl_index(descr->cld_obj, end); + descr->cld_mode = lock->cll_descr.cld_mode; + descr->cld_gid = lock->cll_descr.cld_gid; + descr->cld_enq_flags = lock->cll_descr.cld_enq_flags; + + lls->sub_index = lov_comp_index(index, i); + + /* initialize sub lock */ + result = lov_sublock_init(env, lock, lls); + if (result < 0) + break; + + lls->sub_initialized = 1; + nr++; + } + if (result < 0) + break; + } + LASSERT(ergo(result == 0, nr == lovlck->lls_nr)); + + if (result != 0) { + for (i = 0; i < nr; ++i) { + if (!lovlck->lls_sub[i].sub_initialized) + break; + + cl_lock_fini(env, &lovlck->lls_sub[i].sub_lock); + } + + OBD_FREE_LARGE(lovlck, + offsetof(struct lov_lock, lls_sub[nr])); + lovlck = ERR_PTR(result); + } + + RETURN(lovlck); +} + +static void lov_lock_fini(const struct lu_env *env, + struct cl_lock_slice *slice) +{ + struct lov_lock *lovlck; + int i; + + ENTRY; + lovlck = cl2lov_lock(slice); + for (i = 0; i < lovlck->lls_nr; ++i) { + LASSERT(!lovlck->lls_sub[i].sub_is_enqueued); + if (lovlck->lls_sub[i].sub_initialized) + cl_lock_fini(env, &lovlck->lls_sub[i].sub_lock); + } + OBD_FREE_LARGE(lovlck, + offsetof(struct lov_lock, lls_sub[lovlck->lls_nr])); + EXIT; +} + +/** + * Implementation of cl_lock_operations::clo_enqueue() for lov layer. This + * function is rather subtle, as it enqueues top-lock (i.e., advances top-lock + * state machine from CLS_QUEUING to CLS_ENQUEUED states) by juggling sub-lock + * state machines in the face of sub-locks sharing (by multiple top-locks), + * and concurrent sub-lock cancellations. 
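+ *
+ * Enqueueing stops at the first sub-lock failure; sub-locks that were
+ * already enqueued stay marked sub_is_enqueued so that
+ * lov_lock_cancel() can unwind exactly those.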
+ */ +static int lov_lock_enqueue(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *io, struct cl_sync_io *anchor) +{ + struct cl_lock *lock = slice->cls_lock; + struct lov_lock *lovlck = cl2lov_lock(slice); + int i; + int rc = 0; + + ENTRY; + + for (i = 0; i < lovlck->lls_nr; ++i) { + struct lov_lock_sub *lls = &lovlck->lls_sub[i]; + struct lov_sublock_env *subenv; + + subenv = lov_sublock_env_get(env, lock, lls); + if (IS_ERR(subenv)) { + rc = PTR_ERR(subenv); + break; + } + + rc = cl_lock_enqueue(subenv->lse_env, subenv->lse_io, + &lls->sub_lock, anchor); + if (rc != 0) + break; + + lls->sub_is_enqueued = 1; + } + RETURN(rc); +} + +static void lov_lock_cancel(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct cl_lock *lock = slice->cls_lock; + struct lov_lock *lovlck = cl2lov_lock(slice); + int i; + + ENTRY; + + for (i = 0; i < lovlck->lls_nr; ++i) { + struct lov_lock_sub *lls = &lovlck->lls_sub[i]; + struct cl_lock *sublock = &lls->sub_lock; + struct lov_sublock_env *subenv; + + if (!lls->sub_is_enqueued) + continue; + + lls->sub_is_enqueued = 0; + subenv = lov_sublock_env_get(env, lock, lls); + if (!IS_ERR(subenv)) { + cl_lock_cancel(subenv->lse_env, sublock); + } else { + CL_LOCK_DEBUG(D_ERROR, env, slice->cls_lock, + "lov_lock_cancel fails with %ld.\n", + PTR_ERR(subenv)); + } + } +} + +static int lov_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_lock_slice *slice) +{ + struct lov_lock *lck = cl2lov_lock(slice); + int i; + + (*p)(env, cookie, "%d\n", lck->lls_nr); + for (i = 0; i < lck->lls_nr; ++i) { + struct lov_lock_sub *sub; + + sub = &lck->lls_sub[i]; + (*p)(env, cookie, " %d %x: ", i, sub->sub_is_enqueued); + cl_lock_print(env, cookie, p, &sub->sub_lock); + } + return 0; +} + +static const struct cl_lock_operations lov_lock_ops = { + .clo_fini = lov_lock_fini, + .clo_enqueue = lov_lock_enqueue, + .clo_cancel = lov_lock_cancel, + .clo_print = lov_lock_print +}; + +int lov_lock_init_composite(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + struct lov_lock *lck; + int result = 0; + + ENTRY; + lck = lov_lock_sub_init(env, io, obj, lock); + if (!IS_ERR(lck)) + cl_lock_slice_add(lock, &lck->lls_cl, obj, &lov_lock_ops); + else + result = PTR_ERR(lck); + RETURN(result); +} + +static void lov_empty_lock_fini(const struct lu_env *env, + struct cl_lock_slice *slice) +{ + struct lov_lock *lck = cl2lov_lock(slice); + + OBD_SLAB_FREE_PTR(lck, lov_lock_kmem); +} + +static int lov_empty_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_lock_slice *slice) +{ + (*p)(env, cookie, "empty\n"); + return 0; +} + +/* XXX: more methods will be added later. 
*/ +static const struct cl_lock_operations lov_empty_lock_ops = { + .clo_fini = lov_empty_lock_fini, + .clo_print = lov_empty_lock_print +}; + +int lov_lock_init_empty(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + struct lov_lock *lck; + int result = -ENOMEM; + + ENTRY; + OBD_SLAB_ALLOC_PTR_GFP(lck, lov_lock_kmem, GFP_NOFS); + if (lck) { + cl_lock_slice_add(lock, &lck->lls_cl, obj, &lov_empty_lock_ops); + result = 0; + } + RETURN(result); +} + +/** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_merge.c b/drivers/staging/lustrefsx/lustre/lov/lov_merge.c new file mode 100644 index 0000000000000..30fb5b42ac656 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_merge.c @@ -0,0 +1,108 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include + +#include +#include "lov_internal.h" + +/** Merge the lock value block(&lvb) attributes and KMS from each of the + * stripes in a file into a single lvb. It is expected that the caller + * initializes the current atime, mtime, ctime to avoid regressing a more + * uptodate time on the local client. 
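+ * The merged known-minimum size (KMS) is returned separately through
+ * \a kms_place, while lvb_size carries the merged file size seen by
+ * the OSTs.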
+ */ +int lov_merge_lvb_kms(struct lov_stripe_md *lsm, int index, + struct ost_lvb *lvb, __u64 *kms_place) +{ + struct lov_stripe_md_entry *lse = lsm->lsm_entries[index]; + u64 size = 0; + u64 kms = 0; + u64 blocks = 0; + s64 current_mtime = lvb->lvb_mtime; + s64 current_atime = lvb->lvb_atime; + s64 current_ctime = lvb->lvb_ctime; + int i; + int rc = 0; + + assert_spin_locked(&lsm->lsm_lock); + LASSERT(lsm->lsm_lock_owner == current->pid); + + CDEBUG(D_INODE, "MDT ID "DOSTID" initial value: s=%llu m=%llu" + " a=%llu c=%llu b=%llu\n", POSTID(&lsm->lsm_oi), + lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime, lvb->lvb_ctime, + lvb->lvb_blocks); + for (i = 0; i < lse->lsme_stripe_count; i++) { + struct lov_oinfo *loi = lse->lsme_oinfo[i]; + u64 lov_size; + u64 tmpsize; + + if (OST_LVB_IS_ERR(loi->loi_lvb.lvb_blocks)) { + rc = OST_LVB_GET_ERR(loi->loi_lvb.lvb_blocks); + continue; + } + + tmpsize = loi->loi_kms; + lov_size = lov_stripe_size(lsm, index, tmpsize, i); + if (lov_size > kms) + kms = lov_size; + + if (loi->loi_lvb.lvb_size > tmpsize) + tmpsize = loi->loi_lvb.lvb_size; + + lov_size = lov_stripe_size(lsm, index, tmpsize, i); + if (lov_size > size) + size = lov_size; + /* merge blocks, mtime, atime */ + blocks += loi->loi_lvb.lvb_blocks; + if (loi->loi_lvb.lvb_mtime > current_mtime) + current_mtime = loi->loi_lvb.lvb_mtime; + if (loi->loi_lvb.lvb_atime > current_atime) + current_atime = loi->loi_lvb.lvb_atime; + if (loi->loi_lvb.lvb_ctime > current_ctime) + current_ctime = loi->loi_lvb.lvb_ctime; + + CDEBUG(D_INODE, "MDT ID "DOSTID" on OST[%u]: s=%llu m=%llu" + " a=%llu c=%llu b=%llu\n", POSTID(&lsm->lsm_oi), + loi->loi_ost_idx, loi->loi_lvb.lvb_size, + loi->loi_lvb.lvb_mtime, loi->loi_lvb.lvb_atime, + loi->loi_lvb.lvb_ctime, loi->loi_lvb.lvb_blocks); + } + + *kms_place = kms; + lvb->lvb_size = size; + lvb->lvb_blocks = blocks; + lvb->lvb_mtime = current_mtime; + lvb->lvb_atime = current_atime; + lvb->lvb_ctime = current_ctime; + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_obd.c b/drivers/staging/lustrefsx/lustre/lov/lov_obd.c new file mode 100644 index 0000000000000..3efba72ddee2b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_obd.c @@ -0,0 +1,1350 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/lov/lov_obd.c + * + * Author: Phil Schwan + * Author: Peter Braam + * Author: Mike Shaver + * Author: Nathan Rutman + */ + +#define DEBUG_SUBSYSTEM S_LOV +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "lov_internal.h" + +/* Keep a refcount of lov->tgt usage to prevent racing with addition/deletion. + Any function that expects lov_tgts to remain stationary must take a ref. */ +void lov_tgts_getref(struct obd_device *obd) +{ + struct lov_obd *lov = &obd->u.lov; + + /* nobody gets through here until lov_putref is done */ + mutex_lock(&lov->lov_lock); + atomic_inc(&lov->lov_refcount); + mutex_unlock(&lov->lov_lock); +} + +static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt); + +void lov_tgts_putref(struct obd_device *obd) +{ + struct lov_obd *lov = &obd->u.lov; + + mutex_lock(&lov->lov_lock); + /* ok to dec to 0 more than once -- ltd_exp's will be null */ + if (atomic_dec_and_test(&lov->lov_refcount) && lov->lov_death_row) { + LIST_HEAD(kill); + struct lov_tgt_desc *tgt, *n; + int i; + + CDEBUG(D_CONFIG, "destroying %d lov targets\n", + lov->lov_death_row); + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + tgt = lov->lov_tgts[i]; + + if (!tgt || !tgt->ltd_reap) + continue; + list_add(&tgt->ltd_kill, &kill); + /* XXX - right now there is a dependency on ld_tgt_count + * being the maximum tgt index for computing the + * mds_max_easize. So we can't shrink it. */ + lu_tgt_pool_remove(&lov->lov_packed, i); + lov->lov_tgts[i] = NULL; + lov->lov_death_row--; + } + mutex_unlock(&lov->lov_lock); + + list_for_each_entry_safe(tgt, n, &kill, ltd_kill) { + list_del(&tgt->ltd_kill); + /* Disconnect */ + __lov_del_obd(obd, tgt); + } + } else { + mutex_unlock(&lov->lov_lock); + } +} + +static int lov_notify(struct obd_device *obd, struct obd_device *watched, + enum obd_notify_event ev); + +int lov_connect_osc(struct obd_device *obd, u32 index, int activate, + struct obd_connect_data *data) +{ + struct lov_obd *lov = &obd->u.lov; + struct obd_uuid *tgt_uuid; + struct obd_device *tgt_obd; + static struct obd_uuid lov_osc_uuid = { "LOV_OSC_UUID" }; + struct obd_import *imp; + int rc; + ENTRY; + + if (lov->lov_tgts[index] == NULL) + RETURN(-EINVAL); + + tgt_uuid = &lov->lov_tgts[index]->ltd_uuid; + tgt_obd = lov->lov_tgts[index]->ltd_obd; + + if (!tgt_obd->obd_set_up) { + CERROR("Target %s not set up\n", obd_uuid2str(tgt_uuid)); + RETURN(-EINVAL); + } + + /* override the sp_me from lov */ + tgt_obd->u.cli.cl_sp_me = lov->lov_sp_me; + + if (data && (data->ocd_connect_flags & OBD_CONNECT_INDEX)) + data->ocd_index = index; + + /* + * Divine LOV knows that OBDs under it are OSCs. + */ + imp = tgt_obd->u.cli.cl_import; + + if (activate) { + tgt_obd->obd_no_recov = 0; + /* FIXME this is probably supposed to be + ptlrpc_set_import_active. Horrible naming. 
*/ + ptlrpc_activate_import(imp, false); + } + + rc = obd_register_observer(tgt_obd, obd); + if (rc) { + CERROR("Target %s register_observer error %d\n", + obd_uuid2str(tgt_uuid), rc); + RETURN(rc); + } + + if (imp->imp_invalid) { + CDEBUG(D_CONFIG, "%s: not connecting - administratively disabled\n", + obd_uuid2str(tgt_uuid)); + RETURN(0); + } + + rc = obd_connect(NULL, &lov->lov_tgts[index]->ltd_exp, tgt_obd, + &lov_osc_uuid, data, lov->lov_cache); + if (rc || !lov->lov_tgts[index]->ltd_exp) { + CERROR("Target %s connect error %d\n", + obd_uuid2str(tgt_uuid), rc); + RETURN(-ENODEV); + } + + lov->lov_tgts[index]->ltd_reap = 0; + + CDEBUG(D_CONFIG, "Connected tgt idx %d %s (%s) %sactive\n", index, + obd_uuid2str(tgt_uuid), tgt_obd->obd_name, activate ? "":"in"); + + if (lov->lov_tgts_kobj) { + /* Even if we failed, that's ok */ + rc = sysfs_create_link(lov->lov_tgts_kobj, + &tgt_obd->obd_kset.kobj, + tgt_obd->obd_name); + if (rc) { + CERROR("%s: can't register LOV target /sys/fs/lustre/%s/%s/target_obds/%s : rc = %d\n", + obd->obd_name, obd->obd_type->typ_name, + obd->obd_name, + lov->lov_tgts[index]->ltd_exp->exp_obd->obd_name, + rc); + } + } + RETURN(0); +} + +static int lov_connect(const struct lu_env *env, + struct obd_export **exp, struct obd_device *obd, + struct obd_uuid *cluuid, struct obd_connect_data *data, + void *localdata) +{ + struct lov_obd *lov = &obd->u.lov; + struct lov_tgt_desc *tgt; + struct lustre_handle conn; + int i, rc; + ENTRY; + + CDEBUG(D_CONFIG, "connect #%d\n", lov->lov_connects); + + rc = class_connect(&conn, obd, cluuid); + if (rc) + RETURN(rc); + + *exp = class_conn2export(&conn); + + /* Why should there ever be more than 1 connect? */ + lov->lov_connects++; + LASSERT(lov->lov_connects == 1); + + memset(&lov->lov_ocd, 0, sizeof(lov->lov_ocd)); + if (data) + lov->lov_ocd = *data; + + lov_tgts_getref(obd); + + if (localdata) { + lov->lov_cache = localdata; + cl_cache_incref(lov->lov_cache); + } + + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + tgt = lov->lov_tgts[i]; + if (!tgt || obd_uuid_empty(&tgt->ltd_uuid)) + continue; + /* Flags will be lowest common denominator */ + rc = lov_connect_osc(obd, i, tgt->ltd_activate, &lov->lov_ocd); + if (rc) { + CERROR("%s: lov connect tgt %d failed: %d\n", + obd->obd_name, i, rc); + continue; + } + /* connect to administrative disabled ost */ + if (!lov->lov_tgts[i]->ltd_exp) + continue; + + rc = lov_notify(obd, lov->lov_tgts[i]->ltd_exp->exp_obd, + OBD_NOTIFY_CONNECT); + if (rc) { + CERROR("%s error sending notify %d\n", + obd->obd_name, rc); + } + } + + lov_tgts_putref(obd); + + RETURN(0); +} + +static int lov_disconnect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) +{ + struct lov_obd *lov = &obd->u.lov; + struct obd_device *osc_obd; + int rc; + ENTRY; + + osc_obd = class_exp2obd(tgt->ltd_exp); + CDEBUG(D_CONFIG, "%s: disconnecting target %s\n", obd->obd_name, + osc_obd ? osc_obd->obd_name : ""); + + if (tgt->ltd_active) { + tgt->ltd_active = 0; + lov->desc.ld_active_tgt_count--; + tgt->ltd_exp->exp_obd->obd_inactive = 1; + } + + if (osc_obd) { + if (lov->lov_tgts_kobj) + sysfs_remove_link(lov->lov_tgts_kobj, + osc_obd->obd_name); + + /* Pass it on to our clients. + * XXX This should be an argument to disconnect, + * XXX not a back-door flag on the OBD. Ah well. 
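+		 * Until then, mirror obd_force/obd_fail/obd_no_recov onto
+		 * the OSC below so its disconnect path observes the same
+		 * shutdown mode.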
+ */ + osc_obd->obd_force = obd->obd_force; + osc_obd->obd_fail = obd->obd_fail; + osc_obd->obd_no_recov = obd->obd_no_recov; + } + + obd_register_observer(osc_obd, NULL); + + rc = obd_disconnect(tgt->ltd_exp); + if (rc) { + CERROR("Target %s disconnect error %d\n", + tgt->ltd_uuid.uuid, rc); + rc = 0; + } + + tgt->ltd_exp = NULL; + RETURN(0); +} + +static int lov_disconnect(struct obd_export *exp) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lov_obd *lov = &obd->u.lov; + u32 index; + int rc; + + ENTRY; + if (!lov->lov_tgts) + goto out; + + /* Only disconnect the underlying layers on the final disconnect. */ + lov->lov_connects--; + if (lov->lov_connects != 0) { + /* why should there be more than 1 connect? */ + CWARN("%s: unexpected disconnect #%d\n", + obd->obd_name, lov->lov_connects); + goto out; + } + + /* hold another ref so lov_del_obd() doesn't spin in putref each time */ + lov_tgts_getref(obd); + + for (index = 0; index < lov->desc.ld_tgt_count; index++) { + if (lov->lov_tgts[index] && lov->lov_tgts[index]->ltd_exp) { + /* Disconnection is the last we know about an OBD */ + lov_del_target(obd, index, NULL, + lov->lov_tgts[index]->ltd_gen); + } + } + lov_tgts_putref(obd); + +out: + rc = class_disconnect(exp); /* bz 9811 */ + RETURN(rc); +} + +/* Error codes: + * + * -EINVAL : UUID can't be found in the LOV's target list + * -ENOTCONN: The UUID is found, but the target connection is bad (!) + * -EBADF : The UUID is found, but the OBD is the wrong type (!) + * any >= 0 : is log target index + */ +static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid, + enum obd_notify_event ev) +{ + struct lov_obd *lov = &obd->u.lov; + struct lov_tgt_desc *tgt; + int index; + bool activate, active; + ENTRY; + + CDEBUG(D_INFO, "Searching in lov %p for uuid %s event(%d)\n", + lov, uuid->uuid, ev); + + lov_tgts_getref(obd); + for (index = 0; index < lov->desc.ld_tgt_count; index++) { + tgt = lov->lov_tgts[index]; + if (tgt && obd_uuid_equals(uuid, &tgt->ltd_uuid)) + break; + } + + if (index == lov->desc.ld_tgt_count) + GOTO(out, index = -EINVAL); + + if (ev == OBD_NOTIFY_DEACTIVATE || ev == OBD_NOTIFY_ACTIVATE) { + activate = (ev == OBD_NOTIFY_ACTIVATE); + + /* + * LU-642, initially inactive OSC could miss the obd_connect, + * we make up for it here. + */ + if (activate && !tgt->ltd_exp) { + int rc; + struct obd_uuid lov_osc_uuid = {"LOV_OSC_UUID"}; + + rc = obd_connect(NULL, &tgt->ltd_exp, tgt->ltd_obd, + &lov_osc_uuid, &lov->lov_ocd, + lov->lov_cache); + if (rc || !tgt->ltd_exp) + GOTO(out, index = rc); + } + + if (lov->lov_tgts[index]->ltd_activate == activate) { + CDEBUG(D_INFO, "OSC %s already %sactivate!\n", + uuid->uuid, activate ? "" : "de"); + } else { + lov->lov_tgts[index]->ltd_activate = activate; + CDEBUG(D_CONFIG, "%sactivate OSC %s\n", + activate ? "" : "de", obd_uuid2str(uuid)); + } + } else if (ev == OBD_NOTIFY_INACTIVE || ev == OBD_NOTIFY_ACTIVE) { + active = (ev == OBD_NOTIFY_ACTIVE); + + if (lov->lov_tgts[index]->ltd_active == active) { + CDEBUG(D_INFO, "OSC %s already %sactive!\n", + uuid->uuid, active ? "" : "in"); + GOTO(out, index); + } + CDEBUG(D_CONFIG, "Marking OSC %s %sactive\n", + obd_uuid2str(uuid), active ? 
"" : "in"); + + lov->lov_tgts[index]->ltd_active = active; + if (active) { + lov->desc.ld_active_tgt_count++; + lov->lov_tgts[index]->ltd_exp->exp_obd->obd_inactive = 0; + } else { + lov->desc.ld_active_tgt_count--; + lov->lov_tgts[index]->ltd_exp->exp_obd->obd_inactive = 1; + } + } else { + CERROR("%s: unknown event %d for uuid %s\n", obd->obd_name, + ev, uuid->uuid); + } + + if (tgt->ltd_exp) + CDEBUG(D_INFO, "%s: lov idx %d conn %llx\n", obd_uuid2str(uuid), + index, tgt->ltd_exp->exp_handle.h_cookie); + + out: + lov_tgts_putref(obd); + RETURN(index); +} + +static int lov_notify(struct obd_device *obd, struct obd_device *watched, + enum obd_notify_event ev) +{ + int rc = 0; + struct lov_obd *lov = &obd->u.lov; + ENTRY; + + down_read(&lov->lov_notify_lock); + if (!lov->lov_connects) + GOTO(out_notify_lock, rc = 0); + + if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE || + ev == OBD_NOTIFY_ACTIVATE || ev == OBD_NOTIFY_DEACTIVATE) { + struct obd_uuid *uuid; + + LASSERT(watched); + + if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) { + CERROR("unexpected notification of %s %s\n", + watched->obd_type->typ_name, watched->obd_name); + GOTO(out_notify_lock, rc = -EINVAL); + } + + uuid = &watched->u.cli.cl_target_uuid; + + /* Set OSC as active before notifying the observer, so the + * observer can use the OSC normally. + */ + rc = lov_set_osc_active(obd, uuid, ev); + if (rc < 0) { + CERROR("%s: event %d failed: rc = %d\n", obd->obd_name, + ev, rc); + GOTO(out_notify_lock, rc); + } + } + + /* Pass the notification up the chain. */ + rc = obd_notify_observer(obd, watched, ev); + +out_notify_lock: + up_read(&lov->lov_notify_lock); + + RETURN(rc); +} + +static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp, + u32 index, int gen, int active) +{ + struct lov_obd *lov = &obd->u.lov; + struct lov_tgt_desc *tgt; + struct obd_device *tgt_obd; + int rc; + + ENTRY; + CDEBUG(D_CONFIG, "uuid:%s idx:%u gen:%d active:%d\n", + uuidp->uuid, index, gen, active); + + if (gen <= 0) { + CERROR("%s: request to add '%s' with invalid generation: %d\n", + obd->obd_name, uuidp->uuid, gen); + RETURN(-EINVAL); + } + + tgt_obd = class_find_client_obd(uuidp, LUSTRE_OSC_NAME, &obd->obd_uuid); + if (tgt_obd == NULL) + RETURN(-EINVAL); + + mutex_lock(&lov->lov_lock); + + if ((index < lov->lov_tgt_size) && (lov->lov_tgts[index] != NULL)) { + tgt = lov->lov_tgts[index]; + rc = -EEXIST; + CERROR("%s: UUID %s already assigned at index %d: rc = %d\n", + obd->obd_name, obd_uuid2str(&tgt->ltd_uuid), index, rc); + mutex_unlock(&lov->lov_lock); + RETURN(rc); + } + + if (index >= lov->lov_tgt_size) { + /* We need to reallocate the lov target array. 
*/ + struct lov_tgt_desc **newtgts, **old = NULL; + __u32 newsize, oldsize = 0; + + newsize = max(lov->lov_tgt_size, 2U); + while (newsize < index + 1) + newsize = newsize << 1; + OBD_ALLOC_PTR_ARRAY(newtgts, newsize); + if (newtgts == NULL) { + mutex_unlock(&lov->lov_lock); + RETURN(-ENOMEM); + } + + if (lov->lov_tgt_size) { + memcpy(newtgts, lov->lov_tgts, sizeof(*newtgts) * + lov->lov_tgt_size); + old = lov->lov_tgts; + oldsize = lov->lov_tgt_size; + } + + lov->lov_tgts = newtgts; + lov->lov_tgt_size = newsize; + smp_rmb(); + if (old) + OBD_FREE_PTR_ARRAY(old, oldsize); + + CDEBUG(D_CONFIG, "tgts: %p size: %d\n", + lov->lov_tgts, lov->lov_tgt_size); + } + + OBD_ALLOC_PTR(tgt); + if (!tgt) { + mutex_unlock(&lov->lov_lock); + RETURN(-ENOMEM); + } + + rc = lu_tgt_pool_add(&lov->lov_packed, index, lov->lov_tgt_size); + if (rc) { + mutex_unlock(&lov->lov_lock); + OBD_FREE_PTR(tgt); + RETURN(rc); + } + + tgt->ltd_uuid = *uuidp; + tgt->ltd_obd = tgt_obd; + /* XXX - add a sanity check on the generation number. */ + tgt->ltd_gen = gen; + tgt->ltd_index = index; + tgt->ltd_activate = active; + lov->lov_tgts[index] = tgt; + if (index >= lov->desc.ld_tgt_count) + lov->desc.ld_tgt_count = index + 1; + + mutex_unlock(&lov->lov_lock); + + CDEBUG(D_CONFIG, "idx=%d ltd_gen=%d ld_tgt_count=%d\n", + index, tgt->ltd_gen, lov->desc.ld_tgt_count); + + if (lov->lov_connects == 0) { + /* lov_connect hasn't been called yet. We'll do the + lov_connect_osc on this target when that fn first runs, + because we don't know the connect flags yet. */ + RETURN(0); + } + + lov_tgts_getref(obd); + + rc = lov_connect_osc(obd, index, active, &lov->lov_ocd); + if (rc) + GOTO(out, rc); + + /* connect to administrative disabled ost */ + if (!tgt->ltd_exp) + GOTO(out, rc = 0); + + rc = lov_notify(obd, tgt->ltd_exp->exp_obd, + active ? 
OBD_NOTIFY_CONNECT : OBD_NOTIFY_INACTIVE); + +out: + if (rc) { + CERROR("%s: add failed, deleting %s: rc = %d\n", + obd->obd_name, obd_uuid2str(&tgt->ltd_uuid), rc); + lov_del_target(obd, index, NULL, 0); + } + lov_tgts_putref(obd); + RETURN(rc); +} + +/* Schedule a target for deletion */ +int lov_del_target(struct obd_device *obd, u32 index, + struct obd_uuid *uuidp, int gen) +{ + struct lov_obd *lov = &obd->u.lov; + int count = lov->desc.ld_tgt_count; + int rc = 0; + ENTRY; + + if (index >= count) { + CERROR("LOV target index %d >= number of LOV OBDs %d.\n", + index, count); + RETURN(-EINVAL); + } + + /* to make sure there's no ongoing lov_notify() now */ + down_write(&lov->lov_notify_lock); + lov_tgts_getref(obd); + + if (!lov->lov_tgts[index]) { + CERROR("LOV target at index %d is not setup.\n", index); + GOTO(out, rc = -EINVAL); + } + + if (uuidp && !obd_uuid_equals(uuidp, &lov->lov_tgts[index]->ltd_uuid)) { + CERROR("LOV target UUID %s at index %d doesn't match %s.\n", + lov_uuid2str(lov, index), index, + obd_uuid2str(uuidp)); + GOTO(out, rc = -EINVAL); + } + + CDEBUG(D_CONFIG, "uuid: %s idx: %d gen: %d exp: %p active: %d\n", + lov_uuid2str(lov, index), index, + lov->lov_tgts[index]->ltd_gen, lov->lov_tgts[index]->ltd_exp, + lov->lov_tgts[index]->ltd_active); + + lov->lov_tgts[index]->ltd_reap = 1; + lov->lov_death_row++; + /* we really delete it from lov_tgts_putref() */ +out: + lov_tgts_putref(obd); + up_write(&lov->lov_notify_lock); + + RETURN(rc); +} + +static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) +{ + struct obd_device *osc_obd; + + LASSERT(tgt); + LASSERT(tgt->ltd_reap); + + osc_obd = class_exp2obd(tgt->ltd_exp); + + CDEBUG(D_CONFIG, "Removing tgt %s : %s\n", + tgt->ltd_uuid.uuid, + osc_obd ? osc_obd->obd_name : ""); + + if (tgt->ltd_exp) + lov_disconnect_obd(obd, tgt); + + OBD_FREE_PTR(tgt); + + /* Manual cleanup - no cleanup logs to clean up the osc's. We must + do it ourselves. And we can't do it from lov_cleanup, + because we just lost our only reference to it. 
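+	   class_manual_cleanup() issues the cleanup/detach config
+	   commands for the OSC directly, standing in for the missing
+	   cleanup log.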
*/ + if (osc_obd) + class_manual_cleanup(osc_obd); +} + +void lov_fix_desc_stripe_size(__u64 *val) +{ + if (*val < LOV_MIN_STRIPE_SIZE) { + if (*val != 0) + LCONSOLE_INFO("Increasing default stripe size to " + "minimum %u\n", + LOV_DESC_STRIPE_SIZE_DEFAULT); + *val = LOV_DESC_STRIPE_SIZE_DEFAULT; + } else if (*val & (LOV_MIN_STRIPE_SIZE - 1)) { + *val &= ~(LOV_MIN_STRIPE_SIZE - 1); + LCONSOLE_WARN("Changing default stripe size to %llu (a " + "multiple of %u)\n", + *val, LOV_MIN_STRIPE_SIZE); + } +} + +void lov_fix_desc_stripe_count(__u32 *val) +{ + if (*val == 0) + *val = 1; +} + +void lov_fix_desc_pattern(__u32 *val) +{ + /* from lov_setstripe */ + if ((*val != 0) && !lov_pattern_supported_normal_comp(*val)) { + LCONSOLE_WARN("lov: Unknown stripe pattern: %#x\n", *val); + *val = 0; + } +} + +void lov_fix_desc_qos_maxage(__u32 *val) +{ + if (*val == 0) + *val = LOV_DESC_QOS_MAXAGE_DEFAULT; +} + +void lov_fix_desc(struct lov_desc *desc) +{ + lov_fix_desc_stripe_size(&desc->ld_default_stripe_size); + lov_fix_desc_stripe_count(&desc->ld_default_stripe_count); + lov_fix_desc_pattern(&desc->ld_pattern); + lov_fix_desc_qos_maxage(&desc->ld_qos_maxage); +} + +int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct lov_desc *desc; + struct lov_obd *lov = &obd->u.lov; + int rc; + ENTRY; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { + CERROR("LOV setup requires a descriptor\n"); + RETURN(-EINVAL); + } + + desc = (struct lov_desc *)lustre_cfg_buf(lcfg, 1); + + if (sizeof(*desc) > LUSTRE_CFG_BUFLEN(lcfg, 1)) { + CERROR("descriptor size wrong: %d > %d\n", + (int)sizeof(*desc), LUSTRE_CFG_BUFLEN(lcfg, 1)); + RETURN(-EINVAL); + } + + if (desc->ld_magic != LOV_DESC_MAGIC) { + if (desc->ld_magic == __swab32(LOV_DESC_MAGIC)) { + CDEBUG(D_OTHER, "%s: Swabbing lov desc %p\n", + obd->obd_name, desc); + lustre_swab_lov_desc(desc); + } else { + CERROR("%s: Bad lov desc magic: %#x\n", + obd->obd_name, desc->ld_magic); + RETURN(-EINVAL); + } + } + + lov_fix_desc(desc); + + desc->ld_active_tgt_count = 0; + lov->desc = *desc; + lov->lov_tgt_size = 0; + + mutex_init(&lov->lov_lock); + atomic_set(&lov->lov_refcount, 0); + lov->lov_sp_me = LUSTRE_SP_CLI; + + init_rwsem(&lov->lov_notify_lock); + + INIT_LIST_HEAD(&lov->lov_pool_list); + lov->lov_pool_count = 0; + rc = lov_pool_hash_init(&lov->lov_pools_hash_body); + if (rc) + GOTO(out, rc); + + rc = lu_tgt_pool_init(&lov->lov_packed, 0); + if (rc) + GOTO(out, rc); + + rc = lov_tunables_init(obd); + if (rc) + GOTO(out, rc); + + lov->lov_tgts_kobj = kobject_create_and_add("target_obds", + &obd->obd_kset.kobj); + +out: + return rc; +} + +static int lov_cleanup(struct obd_device *obd) +{ + struct lov_obd *lov = &obd->u.lov; + struct list_head *pos, *tmp; + struct pool_desc *pool; + ENTRY; + + if (lov->lov_tgts_kobj) { + kobject_put(lov->lov_tgts_kobj); + lov->lov_tgts_kobj = NULL; + } + + list_for_each_safe(pos, tmp, &lov->lov_pool_list) { + pool = list_entry(pos, struct pool_desc, pool_list); + /* free pool structs */ + CDEBUG(D_INFO, "delete pool %p\n", pool); + /* In the function below, .hs_keycmp resolves to + * pool_hashkey_keycmp() */ + /* coverity[overrun-buffer-val] */ + lov_pool_del(obd, pool->pool_name); + } + lov_pool_hash_destroy(&lov->lov_pools_hash_body); + lu_tgt_pool_free(&lov->lov_packed); + + lprocfs_obd_cleanup(obd); + if (lov->lov_tgts) { + int i; + lov_tgts_getref(obd); + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + if (!lov->lov_tgts[i]) + continue; + + /* Inactive targets may never have connected */ + if 
(lov->lov_tgts[i]->ltd_active) + /* We should never get here - these + * should have been removed in the + * disconnect. */ + CERROR("%s: lov tgt %d not cleaned! " + "deathrow=%d, lovrc=%d\n", + obd->obd_name, i, lov->lov_death_row, + atomic_read(&lov->lov_refcount)); + lov_del_target(obd, i, NULL, 0); + } + lov_tgts_putref(obd); + OBD_FREE_PTR_ARRAY(lov->lov_tgts, lov->lov_tgt_size); + lov->lov_tgt_size = 0; + } + + if (lov->lov_cache != NULL) { + cl_cache_decref(lov->lov_cache); + lov->lov_cache = NULL; + } + + RETURN(0); +} + +int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg, + u32 *indexp, int *genp) +{ + struct obd_uuid obd_uuid; + int cmd; + int rc = 0; + + ENTRY; + switch (cmd = lcfg->lcfg_command) { + case LCFG_ADD_MDC: + case LCFG_DEL_MDC: + break; + case LCFG_LOV_ADD_OBD: + case LCFG_LOV_ADD_INA: + case LCFG_LOV_DEL_OBD: { + u32 index; + int gen; + + /* lov_modify_tgts add 0:lov_mdsA 1:ost1_UUID 2:0 3:1 */ + if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid)) + GOTO(out, rc = -EINVAL); + + obd_str2uuid(&obd_uuid, lustre_cfg_buf(lcfg, 1)); + + rc = kstrtou32(lustre_cfg_buf(lcfg, 2), 10, indexp); + if (rc) + GOTO(out, rc); + rc = kstrtoint(lustre_cfg_buf(lcfg, 3), 10, genp); + if (rc) + GOTO(out, rc); + index = *indexp; + gen = *genp; + if (cmd == LCFG_LOV_ADD_OBD) + rc = lov_add_target(obd, &obd_uuid, index, gen, 1); + else if (cmd == LCFG_LOV_ADD_INA) + rc = lov_add_target(obd, &obd_uuid, index, gen, 0); + else + rc = lov_del_target(obd, index, &obd_uuid, gen); + + GOTO(out, rc); + } + case LCFG_PARAM: { + struct lov_desc *desc = &(obd->u.lov.desc); + ssize_t count; + + if (!desc) + GOTO(out, rc = -EINVAL); + + count = class_modify_config(lcfg, PARAM_LOV, + &obd->obd_kset.kobj); + GOTO(out, rc = count < 0 ? count : 0); + } + case LCFG_POOL_NEW: + case LCFG_POOL_ADD: + case LCFG_POOL_DEL: + case LCFG_POOL_REM: + GOTO(out, rc); + + default: { + CERROR("Unknown command: %d\n", lcfg->lcfg_command); + GOTO(out, rc = -EINVAL); + + } + } +out: + RETURN(rc); +} + +static int lov_statfs(const struct lu_env *env, struct obd_export *exp, + struct obd_statfs *osfs, time64_t max_age, __u32 flags) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lov_obd *lov = &obd->u.lov; + struct obd_info oinfo = { + .oi_osfs = osfs, + .oi_flags = flags, + }; + struct ptlrpc_request_set *rqset; + struct lov_request_set *set = NULL; + struct lov_request *req; + int rc = 0; + int rc2; + + ENTRY; + + rqset = ptlrpc_prep_set(); + if (rqset == NULL) + RETURN(-ENOMEM); + + rc = lov_prep_statfs_set(obd, &oinfo, &set); + if (rc < 0) + GOTO(out_rqset, rc); + + list_for_each_entry(req, &set->set_list, rq_link) { + rc = obd_statfs_async(lov->lov_tgts[req->rq_idx]->ltd_exp, + &req->rq_oi, max_age, rqset); + if (rc < 0) + GOTO(out_set, rc); + } + + rc = ptlrpc_set_wait(env, rqset); + +out_set: + if (rc < 0) + atomic_set(&set->set_completes, 0); + + rc2 = lov_fini_statfs_set(set); + if (rc == 0) + rc = rc2; + +out_rqset: + ptlrpc_set_destroy(rqset); + + RETURN(rc); +} + +static int lov_iocontrol(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void __user *uarg) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lov_obd *lov = &obd->u.lov; + int i = 0, rc = 0, count = lov->desc.ld_tgt_count; + + ENTRY; + switch (cmd) { + case IOC_OBD_STATFS: { + struct obd_ioctl_data *data = karg; + struct obd_device *osc_obd; + struct obd_statfs stat_buf = {0}; + struct obd_import *imp; + __u32 index; + __u32 flags; + + memcpy(&index, data->ioc_inlbuf2, sizeof(index)); + if 
(index >= count) + RETURN(-ENODEV); + + if (!lov->lov_tgts[index]) + /* Try again with the next index */ + RETURN(-EAGAIN); + + osc_obd = class_exp2obd(lov->lov_tgts[index]->ltd_exp); + if (!osc_obd) + RETURN(-EINVAL); + + imp = osc_obd->u.cli.cl_import; + if (!lov->lov_tgts[index]->ltd_active && + imp->imp_state != LUSTRE_IMP_IDLE) + RETURN(-ENODATA); + + /* copy UUID */ + if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(osc_obd), + min_t(unsigned long, data->ioc_plen2, + sizeof(struct obd_uuid)))) + RETURN(-EFAULT); + + memcpy(&flags, data->ioc_inlbuf1, sizeof(flags)); + flags = flags & LL_STATFS_NODELAY ? OBD_STATFS_NODELAY : 0; + + /* got statfs data */ + rc = obd_statfs(NULL, lov->lov_tgts[index]->ltd_exp, &stat_buf, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + flags); + if (rc) + RETURN(rc); + if (copy_to_user(data->ioc_pbuf1, &stat_buf, + min_t(unsigned long, data->ioc_plen1, + sizeof(struct obd_statfs)))) + RETURN(-EFAULT); + break; + } + case OBD_IOC_QUOTACTL: { + struct if_quotactl *qctl = karg; + struct lov_tgt_desc *tgt = NULL; + struct obd_quotactl *oqctl; + + if (qctl->qc_valid == QC_OSTIDX) { + if (count <= qctl->qc_idx) + RETURN(-EINVAL); + + tgt = lov->lov_tgts[qctl->qc_idx]; + if (!tgt || !tgt->ltd_exp) + RETURN(-EINVAL); + } else if (qctl->qc_valid == QC_UUID) { + for (i = 0; i < count; i++) { + tgt = lov->lov_tgts[i]; + if (!tgt || + !obd_uuid_equals(&tgt->ltd_uuid, + &qctl->obd_uuid)) + continue; + + if (tgt->ltd_exp == NULL) + RETURN(-EINVAL); + + break; + } + } else { + RETURN(-EINVAL); + } + + if (i >= count) + RETURN(-EAGAIN); + + LASSERT(tgt && tgt->ltd_exp); + OBD_ALLOC_PTR(oqctl); + if (!oqctl) + RETURN(-ENOMEM); + + QCTL_COPY(oqctl, qctl); + rc = obd_quotactl(tgt->ltd_exp, oqctl); + if (rc == 0) { + QCTL_COPY(qctl, oqctl); + qctl->qc_valid = QC_OSTIDX; + qctl->obd_uuid = tgt->ltd_uuid; + } + OBD_FREE_PTR(oqctl); + break; + } + default: { + int set = 0; + + if (count == 0) + RETURN(-ENOTTY); + + for (i = 0; i < count; i++) { + int err; + struct obd_device *osc_obd; + + /* OST was disconnected */ + if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_exp) + continue; + + /* ll_umount_begin() sets force on lov, pass to osc */ + osc_obd = class_exp2obd(lov->lov_tgts[i]->ltd_exp); + if (osc_obd) + osc_obd->obd_force = obd->obd_force; + err = obd_iocontrol(cmd, lov->lov_tgts[i]->ltd_exp, + len, karg, uarg); + if (err) { + if (lov->lov_tgts[i]->ltd_active) { + CDEBUG_LIMIT(err == -ENOTTY ? 
+ D_IOCTL : D_WARNING, + "iocontrol OSC %s on OST idx %d cmd %x: err = %d\n", + lov_uuid2str(lov, i), + i, cmd, err); + if (!rc) + rc = err; + } + } else { + set = 1; + } + } + if (!set && !rc) + rc = -EIO; + } + } + + RETURN(rc); +} + +static int lov_get_info(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, __u32 *vallen, void *val) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lov_obd *lov = &obd->u.lov; + struct lov_desc *ld = &lov->desc; + int rc = 0; + ENTRY; + + if (vallen == NULL || val == NULL) + RETURN(-EFAULT); + + lov_tgts_getref(obd); + + if (KEY_IS(KEY_MAX_EASIZE)) { + *((u32 *)val) = exp->exp_connect_data.ocd_max_easize; + } else if (KEY_IS(KEY_DEFAULT_EASIZE)) { + u32 def_stripe_count = min_t(u32, ld->ld_default_stripe_count, + LOV_MAX_STRIPE_COUNT); + + *((u32 *)val) = lov_mds_md_size(def_stripe_count, LOV_MAGIC_V3); + } else if (KEY_IS(KEY_TGT_COUNT)) { + *((int *)val) = lov->desc.ld_tgt_count; + } else { + rc = -EINVAL; + } + + lov_tgts_putref(obd); + + RETURN(rc); +} + +static int lov_set_info_async(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, + __u32 vallen, void *val, + struct ptlrpc_request_set *set) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lov_obd *lov = &obd->u.lov; + struct lov_tgt_desc *tgt; + bool do_inactive = false, no_set = false; + u32 i; + int rc = 0; + int err; + + ENTRY; + + if (set == NULL) { + no_set = true; + set = ptlrpc_prep_set(); + if (!set) + RETURN(-ENOMEM); + } + + lov_tgts_getref(obd); + + if (KEY_IS(KEY_CHECKSUM)) + do_inactive = true; + + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + tgt = lov->lov_tgts[i]; + + /* OST was disconnected */ + if (tgt == NULL || tgt->ltd_exp == NULL) + continue; + + /* OST is inactive and we don't want inactive OSCs */ + if (!tgt->ltd_active && !do_inactive) + continue; + + err = obd_set_info_async(env, tgt->ltd_exp, keylen, key, + vallen, val, set); + + if (rc == 0) + rc = err; + } + + /* cycle through MDC target for Data-on-MDT */ + for (i = 0; i < LOV_MDC_TGT_MAX; i++) { + struct obd_device *mdc; + + mdc = lov->lov_mdc_tgts[i].lmtd_mdc; + if (mdc == NULL) + continue; + + err = obd_set_info_async(env, mdc->obd_self_export, + keylen, key, vallen, val, set); + if (rc == 0) + rc = err; + } + + lov_tgts_putref(obd); + if (no_set) { + err = ptlrpc_set_wait(env, set); + if (rc == 0) + rc = err; + ptlrpc_set_destroy(set); + } + RETURN(rc); +} + +void lov_stripe_lock(struct lov_stripe_md *md) +__acquires(&md->lsm_lock) +{ + LASSERT(md->lsm_lock_owner != current->pid); + spin_lock(&md->lsm_lock); + LASSERT(md->lsm_lock_owner == 0); + md->lsm_lock_owner = current->pid; +} + +void lov_stripe_unlock(struct lov_stripe_md *md) +__releases(&md->lsm_lock) +{ + LASSERT(md->lsm_lock_owner == current->pid); + md->lsm_lock_owner = 0; + spin_unlock(&md->lsm_lock); +} + +static int lov_quotactl(struct obd_device *obd, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct lov_obd *lov = &obd->u.lov; + struct lov_tgt_desc *tgt; + struct pool_desc *pool = NULL; + __u64 curspace = 0; + __u64 bhardlimit = 0; + int i, rc = 0; + + ENTRY; + if (oqctl->qc_cmd != Q_GETOQUOTA && + oqctl->qc_cmd != LUSTRE_Q_SETQUOTA && + oqctl->qc_cmd != LUSTRE_Q_GETQUOTAPOOL) { + rc = -EFAULT; + CERROR("%s: bad quota opc %x for lov obd: rc = %d\n", + obd->obd_name, oqctl->qc_cmd, rc); + RETURN(rc); + } + + if (oqctl->qc_cmd == LUSTRE_Q_GETQUOTAPOOL) { + pool = lov_pool_find(obd, oqctl->qc_poolname); + if (!pool) + RETURN(-ENOENT); + /* Set Q_GETOQUOTA back 
as targets report their own
+	 * usage and don't care about pools */
+		oqctl->qc_cmd = Q_GETOQUOTA;
+	}
+
+	/* for lov tgt */
+	lov_tgts_getref(obd);
+	for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+		int err;
+
+		tgt = lov->lov_tgts[i];
+
+		if (!tgt)
+			continue;
+
+		if (pool &&
+		    lu_tgt_check_index(tgt->ltd_index, &pool->pool_obds))
+			continue;
+
+		if (!tgt->ltd_active || tgt->ltd_reap) {
+			if (oqctl->qc_cmd == Q_GETOQUOTA &&
+			    lov->lov_tgts[i]->ltd_activate) {
+				rc = -ENETDOWN;
+				CERROR("%s: ost %d is inactive: rc = %d\n",
+				       obd->obd_name, i, rc);
+			} else {
+				CDEBUG(D_HA, "ost %d is inactive\n", i);
+			}
+			continue;
+		}
+
+		err = obd_quotactl(tgt->ltd_exp, oqctl);
+		if (err) {
+			if (tgt->ltd_active && !rc)
+				rc = err;
+			continue;
+		}
+
+		if (oqctl->qc_cmd == Q_GETOQUOTA) {
+			curspace += oqctl->qc_dqblk.dqb_curspace;
+			bhardlimit += oqctl->qc_dqblk.dqb_bhardlimit;
+		}
+	}
+	lov_tgts_putref(obd);
+	if (pool)
+		lov_pool_putref(pool);
+
+	if (oqctl->qc_cmd == Q_GETOQUOTA) {
+		oqctl->qc_dqblk.dqb_curspace = curspace;
+		oqctl->qc_dqblk.dqb_bhardlimit = bhardlimit;
+	}
+	RETURN(rc);
+}
+
+static const struct obd_ops lov_obd_ops = {
+	.o_owner		= THIS_MODULE,
+	.o_setup		= lov_setup,
+	.o_cleanup		= lov_cleanup,
+	.o_connect		= lov_connect,
+	.o_disconnect		= lov_disconnect,
+	.o_statfs		= lov_statfs,
+	.o_iocontrol		= lov_iocontrol,
+	.o_get_info		= lov_get_info,
+	.o_set_info_async	= lov_set_info_async,
+	.o_notify		= lov_notify,
+	.o_pool_new		= lov_pool_new,
+	.o_pool_rem		= lov_pool_remove,
+	.o_pool_add		= lov_pool_add,
+	.o_pool_del		= lov_pool_del,
+	.o_quotactl		= lov_quotactl,
+};
+
+struct kmem_cache *lov_oinfo_slab;
+
+static int __init lov_init(void)
+{
+	int rc;
+	ENTRY;
+
+	/* print an address of _any_ initialized kernel symbol from this
+	 * module, to allow debugging with gdb that doesn't support data
+	 * symbols from modules.*/
+	CDEBUG(D_INFO, "Lustre LOV module (%p).\n", &lov_caches);
+
+	rc = lu_kmem_init(lov_caches);
+	if (rc)
+		return rc;
+
+	lov_oinfo_slab = kmem_cache_create("lov_oinfo",
+					   sizeof(struct lov_oinfo), 0,
+					   SLAB_HWCACHE_ALIGN, NULL);
+	if (lov_oinfo_slab == NULL) {
+		lu_kmem_fini(lov_caches);
+		return -ENOMEM;
+	}
+
+	rc = class_register_type(&lov_obd_ops, NULL, true,
+				 LUSTRE_LOV_NAME, &lov_device_type);
+	if (rc) {
+		kmem_cache_destroy(lov_oinfo_slab);
+		lu_kmem_fini(lov_caches);
+	}
+
+	RETURN(rc);
+}
+
+static void __exit lov_exit(void)
+{
+	class_unregister_type(LUSTRE_LOV_NAME);
+	kmem_cache_destroy(lov_oinfo_slab);
+	lu_kmem_fini(lov_caches);
+}
+
+MODULE_AUTHOR("OpenSFS, Inc. ");
+MODULE_DESCRIPTION("Lustre Logical Object Volume");
+MODULE_VERSION(LUSTRE_VERSION_STRING);
+MODULE_LICENSE("GPL");
+
+module_init(lov_init);
+module_exit(lov_exit);
diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_object.c b/drivers/staging/lustrefsx/lustre/lov/lov_object.c
new file mode 100644
index 0000000000000..5aac191d64c4e
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/lov/lov_object.c
@@ -0,0 +1,2336 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Implementation of cl_object for LOV layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include + +#include "lov_cl_internal.h" + +static inline struct lov_device *lov_object_dev(struct lov_object *obj) +{ + return lu2lov_dev(obj->lo_cl.co_lu.lo_dev); +} + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Layout operations. + * + */ + +struct lov_layout_operations { + int (*llo_init)(const struct lu_env *env, struct lov_device *dev, + struct lov_object *lov, struct lov_stripe_md *lsm, + const struct cl_object_conf *conf, + union lov_layout_state *state); + int (*llo_delete)(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state); + void (*llo_fini)(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state); + int (*llo_print)(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o); + int (*llo_page_init)(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index); + int (*llo_lock_init)(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *io); + int (*llo_io_init)(const struct lu_env *env, + struct cl_object *obj, struct cl_io *io); + int (*llo_getattr)(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr); + int (*llo_flush)(const struct lu_env *env, struct cl_object *obj, + struct ldlm_lock *lock); +}; + +static int lov_layout_wait(const struct lu_env *env, struct lov_object *lov); +static struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov); + +static void lov_lsm_put(struct lov_stripe_md *lsm) +{ + if (lsm != NULL) + lov_free_memmd(&lsm); +} + +/***************************************************************************** + * + * Lov object layout operations. 
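+ * Each layout type (empty, released, composite, foreign) supplies its own
+ * set of llo_* methods; they are gathered in the lov_dispatch[] table
+ * further below.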
+ *
+ */
+
+static struct cl_object *lov_sub_find(const struct lu_env *env,
+				      struct cl_device *dev,
+				      const struct lu_fid *fid,
+				      const struct cl_object_conf *conf)
+{
+	struct lu_object *o;
+
+	ENTRY;
+
+	o = lu_object_find_at(env, cl2lu_dev(dev), fid, &conf->coc_lu);
+	LASSERT(ergo(!IS_ERR(o), o->lo_dev->ld_type == &lovsub_device_type));
+	RETURN(lu2cl(o));
+}
+
+static int lov_page_slice_fixup(struct lov_object *lov,
+				struct cl_object *stripe)
+{
+	struct cl_object_header *hdr = cl_object_header(&lov->lo_cl);
+	struct cl_object *o;
+
+	if (stripe == NULL)
+		return hdr->coh_page_bufsize - lov->lo_cl.co_slice_off -
+		       cfs_size_round(sizeof(struct lov_page));
+
+	cl_object_for_each(o, stripe)
+		o->co_slice_off += hdr->coh_page_bufsize;
+
+	return cl_object_header(stripe)->coh_page_bufsize;
+}
+
+static int lov_init_sub(const struct lu_env *env, struct lov_object *lov,
+			struct cl_object *subobj, struct lov_oinfo *oinfo,
+			int idx)
+{
+	struct cl_object_header *hdr;
+	struct cl_object_header *subhdr;
+	struct cl_object_header *parent;
+	int entry = lov_comp_entry(idx);
+	int stripe = lov_comp_stripe(idx);
+	int result;
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_LOV_INIT)) {
+		/* For sanity:test_206.
+		 * Do not leave the object in cache to avoid accessing
+		 * freed memory. This is because osc_object is referring to
+		 * lov_oinfo of lsm_stripe_data which will be freed due to
+		 * this failure. */
+		cl_object_kill(env, subobj);
+		cl_object_put(env, subobj);
+		return -EIO;
+	}
+
+	hdr = cl_object_header(lov2cl(lov));
+	subhdr = cl_object_header(subobj);
+
+	CDEBUG(D_INODE, DFID"@%p[%d:%d] -> "DFID"@%p: ostid: "DOSTID
+	       " ost idx: %d gen: %d\n",
+	       PFID(lu_object_fid(&subobj->co_lu)), subhdr, entry, stripe,
+	       PFID(lu_object_fid(lov2lu(lov))), hdr, POSTID(&oinfo->loi_oi),
+	       oinfo->loi_ost_idx, oinfo->loi_ost_gen);
+
+	/* reuse ->coh_attr_guard to protect coh_parent change */
+	spin_lock(&subhdr->coh_attr_guard);
+	parent = subhdr->coh_parent;
+	if (parent == NULL) {
+		struct lovsub_object *lso = cl2lovsub(subobj);
+
+		subhdr->coh_parent = hdr;
+		spin_unlock(&subhdr->coh_attr_guard);
+		subhdr->coh_nesting = hdr->coh_nesting + 1;
+		lu_object_ref_add(&subobj->co_lu, "lov-parent", lov);
+		lso->lso_super = lov;
+		lso->lso_index = idx;
+		result = 0;
+	} else {
+		struct lu_object *old_obj;
+		struct lov_object *old_lov;
+		unsigned int mask = D_INODE;
+
+		spin_unlock(&subhdr->coh_attr_guard);
+		old_obj = lu_object_locate(&parent->coh_lu, &lov_device_type);
+		LASSERT(old_obj != NULL);
+		old_lov = cl2lov(lu2cl(old_obj));
+		if (test_bit(LO_LAYOUT_INVALID, &old_lov->lo_obj_flags)) {
+			/* the object's layout has already changed but isn't
+			 * refreshed */
+			lu_object_unhash(env, &subobj->co_lu);
+			result = -EAGAIN;
+		} else {
+			mask = D_ERROR;
+			result = -EIO;
+		}
+
+		LU_OBJECT_DEBUG(mask, env, &subobj->co_lu,
+				"stripe %d is already owned.", idx);
+		LU_OBJECT_DEBUG(mask, env, old_obj, "owned.");
+		LU_OBJECT_HEADER(mask, env, lov2lu(lov), "try to own.\n");
+		cl_object_put(env, subobj);
+	}
+	return result;
+}
+
+static int lov_init_raid0(const struct lu_env *env, struct lov_device *dev,
+			  struct lov_object *lov, unsigned int index,
+			  const struct cl_object_conf *conf,
+			  struct lov_layout_entry *lle)
+{
+	struct lov_layout_raid0 *r0 = &lle->lle_raid0;
+	struct lov_thread_info *lti = lov_env_info(env);
+	struct cl_object_conf *subconf = &lti->lti_stripe_conf;
+	struct lu_fid *ofid = &lti->lti_fid;
+	struct cl_object *stripe;
+	struct lov_stripe_md_entry *lse = lov_lse(lov, index);
+	int result;
+	int psz, sz;
+	int i;
+
+	ENTRY;
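+	/* One lovsub object slot is needed per stripe of this component. */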
+ + spin_lock_init(&r0->lo_sub_lock); + r0->lo_nr = lse->lsme_stripe_count; + + OBD_ALLOC_PTR_ARRAY_LARGE(r0->lo_sub, r0->lo_nr); + if (r0->lo_sub == NULL) + GOTO(out, result = -ENOMEM); + + psz = 0; + result = 0; + memset(subconf, 0, sizeof(*subconf)); + + /* + * Create stripe cl_objects. + */ + for (i = 0; i < r0->lo_nr; ++i) { + struct cl_device *subdev; + struct lov_oinfo *oinfo = lse->lsme_oinfo[i]; + int ost_idx = oinfo->loi_ost_idx; + struct obd_export *exp; + + if (lov_oinfo_is_dummy(oinfo)) + continue; + + result = ostid_to_fid(ofid, &oinfo->loi_oi, oinfo->loi_ost_idx); + if (result != 0) + GOTO(out, result); + + if (dev->ld_target[ost_idx] == NULL) { + CERROR("%s: OST %04x is not initialized\n", + lov2obd(dev->ld_lov)->obd_name, ost_idx); + GOTO(out, result = -EIO); + } + + exp = dev->ld_lov->lov_tgts[ost_idx]->ltd_exp; + if (likely(exp)) { + /* the more fast OSTs the better */ + if (exp->exp_obd->obd_osfs.os_state & OS_STATFS_NONROT) + lle->lle_preference++; + } + + subdev = lovsub2cl_dev(dev->ld_target[ost_idx]); + subconf->u.coc_oinfo = oinfo; + LASSERTF(subdev != NULL, "not init ost %d\n", ost_idx); + /* In the function below, .hs_keycmp resolves to + * lu_obj_hop_keycmp() */ + /* coverity[overrun-buffer-val] */ + stripe = lov_sub_find(env, subdev, ofid, subconf); + if (IS_ERR(stripe)) + GOTO(out, result = PTR_ERR(stripe)); + + result = lov_init_sub(env, lov, stripe, oinfo, + lov_comp_index(index, i)); + if (result == -EAGAIN) { /* try again */ + --i; + result = 0; + continue; + } + + if (result == 0) { + r0->lo_sub[i] = cl2lovsub(stripe); + + sz = lov_page_slice_fixup(lov, stripe); + LASSERT(ergo(psz > 0, psz == sz)); + psz = sz; + } + } + if (result == 0) + result = psz; +out: + RETURN(result); +} + +static void lov_subobject_kill(const struct lu_env *env, struct lov_object *lov, + struct lov_layout_raid0 *r0, + struct lovsub_object *los, int idx) +{ + struct cl_object *sub; + struct lu_site *site; + wait_queue_head_t *wq; + + LASSERT(r0->lo_sub[idx] == los); + + sub = lovsub2cl(los); + site = sub->co_lu.lo_dev->ld_site; + wq = lu_site_wq_from_fid(site, &sub->co_lu.lo_header->loh_fid); + + cl_object_kill(env, sub); + /* release a reference to the sub-object and ... */ + lu_object_ref_del(&sub->co_lu, "lov-parent", lov); + cl_object_put(env, sub); + + /* ... wait until it is actually destroyed---sub-object clears its + * ->lo_sub[] slot in lovsub_object_free() */ + wait_event(*wq, r0->lo_sub[idx] != los); + LASSERT(r0->lo_sub[idx] == NULL); +} + +static void lov_delete_raid0(const struct lu_env *env, struct lov_object *lov, + struct lov_layout_entry *lle) +{ + struct lov_layout_raid0 *r0 = &lle->lle_raid0; + + ENTRY; + + if (r0->lo_sub != NULL) { + int i; + + for (i = 0; i < r0->lo_nr; ++i) { + struct lovsub_object *los = r0->lo_sub[i]; + + if (los != NULL) { + cl_object_prune(env, &los->lso_cl); + /* + * If top-level object is to be evicted from + * the cache, so are its sub-objects. 
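+				 * lov_subobject_kill() below waits until
+				 * each sub-object is actually freed and its
+				 * ->lo_sub[] slot cleared.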
+ */ + lov_subobject_kill(env, lov, r0, los, i); + } + } + } + + EXIT; +} + +static void lov_fini_raid0(const struct lu_env *env, + struct lov_layout_entry *lle) +{ + struct lov_layout_raid0 *r0 = &lle->lle_raid0; + + if (r0->lo_sub != NULL) { + OBD_FREE_PTR_ARRAY_LARGE(r0->lo_sub, r0->lo_nr); + r0->lo_sub = NULL; + } +} + +static int lov_print_raid0(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lov_layout_entry *lle) +{ + const struct lov_layout_raid0 *r0 = &lle->lle_raid0; + int i; + + for (i = 0; i < r0->lo_nr; ++i) { + struct lu_object *sub; + + if (r0->lo_sub[i] != NULL) { + sub = lovsub2lu(r0->lo_sub[i]); + lu_object_print(env, cookie, p, sub); + } else { + (*p)(env, cookie, "sub %d absent\n", i); + } + } + return 0; +} + +static int lov_attr_get_raid0(const struct lu_env *env, struct lov_object *lov, + unsigned int index, struct lov_layout_entry *lle, + struct cl_attr **lov_attr) +{ + struct lov_layout_raid0 *r0 = &lle->lle_raid0; + struct lov_stripe_md *lsm = lov->lo_lsm; + struct ost_lvb *lvb = &lov_env_info(env)->lti_lvb; + struct cl_attr *attr = &r0->lo_attr; + __u64 kms = 0; + int result = 0; + + if (r0->lo_attr_valid) { + *lov_attr = attr; + return 0; + } + + memset(lvb, 0, sizeof(*lvb)); + + /* XXX: timestamps can be negative by sanity:test_39m, + * how can it be? */ + lvb->lvb_atime = LLONG_MIN; + lvb->lvb_ctime = LLONG_MIN; + lvb->lvb_mtime = LLONG_MIN; + + /* + * XXX that should be replaced with a loop over sub-objects, + * doing cl_object_attr_get() on them. But for now, let's + * reuse old lov code. + */ + + /* + * XXX take lsm spin-lock to keep lov_merge_lvb_kms() + * happy. It's not needed, because new code uses + * ->coh_attr_guard spin-lock to protect consistency of + * sub-object attributes. + */ + lov_stripe_lock(lsm); + result = lov_merge_lvb_kms(lsm, index, lvb, &kms); + lov_stripe_unlock(lsm); + if (result == 0) { + cl_lvb2attr(attr, lvb); + attr->cat_kms = kms; + r0->lo_attr_valid = 1; + *lov_attr = attr; + } + + return result; +} + +static struct lov_comp_layout_entry_ops raid0_ops = { + .lco_init = lov_init_raid0, + .lco_fini = lov_fini_raid0, + .lco_getattr = lov_attr_get_raid0, +}; + +static int lov_attr_get_dom(const struct lu_env *env, struct lov_object *lov, + unsigned int index, struct lov_layout_entry *lle, + struct cl_attr **lov_attr) +{ + struct lov_layout_dom *dom = &lle->lle_dom; + struct lov_oinfo *loi = dom->lo_loi; + struct cl_attr *attr = &dom->lo_dom_r0.lo_attr; + + if (dom->lo_dom_r0.lo_attr_valid) { + *lov_attr = attr; + return 0; + } + + if (OST_LVB_IS_ERR(loi->loi_lvb.lvb_blocks)) + return OST_LVB_GET_ERR(loi->loi_lvb.lvb_blocks); + + cl_lvb2attr(attr, &loi->loi_lvb); + + /* DoM component size can be bigger than stripe size after + * client's setattr RPC, so do not count anything beyond + * component end. Alternatively, check that limit on server + * and do not allow size overflow there. */ + if (attr->cat_size > lle->lle_extent->e_end) + attr->cat_size = lle->lle_extent->e_end; + + attr->cat_kms = attr->cat_size; + + dom->lo_dom_r0.lo_attr_valid = 1; + *lov_attr = attr; + + return 0; +} + +/** + * Lookup FLD to get MDS index of the given DOM object FID. 
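+ *
+ * The FID sequence is resolved to an MDT index through the FLD client of
+ * the LMV device, then matched against the MDC array in ld_md_tgts[].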
+ *
+ * \param[in] ld	LOV device
+ * \param[in] fid	FID to lookup
+ * \param[out] nr	index in the MDC array, returned to the caller
+ *
+ * \retval 0 and \a nr filled with the MDC array index if successful
+ * \retval negative value on error
+ */
+static int lov_fld_lookup(struct lov_device *ld, const struct lu_fid *fid,
+			  __u32 *nr)
+{
+	__u32 mds_idx;
+	int i, rc;
+
+	ENTRY;
+
+	rc = fld_client_lookup(&ld->ld_lmv->u.lmv.lmv_fld, fid_seq(fid),
+			       &mds_idx, LU_SEQ_RANGE_MDT, NULL);
+	if (rc) {
+		CERROR("%s: error while looking for mds number. Seq %#llx"
+		       ", err = %d\n", lu_dev_name(cl2lu_dev(&ld->ld_cl)),
+		       fid_seq(fid), rc);
+		RETURN(rc);
+	}
+
+	CDEBUG(D_INODE, "FLD lookup got mds #%x for fid="DFID"\n",
+	       mds_idx, PFID(fid));
+
+	/* find proper MDC device in the array */
+	for (i = 0; i < ld->ld_md_tgts_nr; i++) {
+		if (ld->ld_md_tgts[i].ldm_mdc != NULL &&
+		    ld->ld_md_tgts[i].ldm_idx == mds_idx)
+			break;
+	}
+
+	if (i == ld->ld_md_tgts_nr) {
+		CERROR("%s: cannot find corresponding MDC device for mds #%x "
+		       "for fid="DFID"\n", lu_dev_name(cl2lu_dev(&ld->ld_cl)),
+		       mds_idx, PFID(fid));
+		rc = -EINVAL;
+	} else {
+		*nr = i;
+	}
+	RETURN(rc);
+}
+
+/**
+ * Implementation of lov_comp_layout_entry_ops::lco_init for DOM object.
+ *
+ * Init the DOM object for the first time. It also prepares a RAID0 entry
+ * for it, so that common methods can be shared with ordinary RAID0 layout
+ * entries.
+ *
+ * \param[in] env	execution environment
+ * \param[in] dev	LOV device
+ * \param[in] lov	LOV object
+ * \param[in] index	Composite layout entry index in LSM
+ * \param[in] lle	Composite LOV layout entry
+ */
+static int lov_init_dom(const struct lu_env *env, struct lov_device *dev,
+			struct lov_object *lov, unsigned int index,
+			const struct cl_object_conf *conf,
+			struct lov_layout_entry *lle)
+{
+	struct lov_thread_info *lti = lov_env_info(env);
+	struct lov_stripe_md_entry *lsme = lov_lse(lov, index);
+	struct cl_object *clo;
+	struct lu_object *o = lov2lu(lov);
+	const struct lu_fid *fid = lu_object_fid(o);
+	struct cl_device *mdcdev;
+	struct lov_oinfo *loi = NULL;
+	struct cl_object_conf *sconf = &lti->lti_stripe_conf;
+	int rc;
+	__u32 idx = 0;
+
+	ENTRY;
+
+	/* The DOM entry may not be at entry index zero due to FLR, but its
+	 * extent must start at offset 0 */
+	if (unlikely(lle->lle_extent->e_start != 0)) {
+		CERROR("%s: DOM entry must be the first stripe in a mirror\n",
+		       lov2obd(dev->ld_lov)->obd_name);
+		dump_lsm(D_ERROR, lov->lo_lsm);
+		RETURN(-EINVAL);
+	}
+
+	/* find proper MDS device */
+	rc = lov_fld_lookup(dev, fid, &idx);
+	if (rc)
+		RETURN(rc);
+
+	LASSERTF(dev->ld_md_tgts[idx].ldm_mdc != NULL,
+		 "LOV md target[%u] is NULL\n", idx);
+
+	/* check lsm is DOM, more checks are needed */
+	LASSERT(lsme->lsme_stripe_count == 0);
+
+	/*
+	 * Create lower cl_objects.
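+	 * For DoM this is a single MDC-backed sub-object rather than a
+	 * set of OSC stripes.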
+	 */
+	mdcdev = dev->ld_md_tgts[idx].ldm_mdc;
+
+	LASSERTF(mdcdev != NULL, "non-initialized mdc subdev\n");
+
+	/* DoM object has no oinfo in LSM entry, create it exclusively */
+	OBD_SLAB_ALLOC_PTR_GFP(loi, lov_oinfo_slab, GFP_NOFS);
+	if (loi == NULL)
+		RETURN(-ENOMEM);
+
+	fid_to_ostid(lu_object_fid(lov2lu(lov)), &loi->loi_oi);
+
+	sconf->u.coc_oinfo = loi;
+again:
+	clo = lov_sub_find(env, mdcdev, fid, sconf);
+	if (IS_ERR(clo))
+		GOTO(out, rc = PTR_ERR(clo));
+
+	rc = lov_init_sub(env, lov, clo, loi, lov_comp_index(index, 0));
+	if (rc == -EAGAIN) /* try again */
+		goto again;
+	else if (rc != 0)
+		GOTO(out, rc);
+
+	lle->lle_dom.lo_dom = cl2lovsub(clo);
+	spin_lock_init(&lle->lle_dom.lo_dom_r0.lo_sub_lock);
+	lle->lle_dom.lo_dom_r0.lo_nr = 1;
+	lle->lle_dom.lo_dom_r0.lo_sub = &lle->lle_dom.lo_dom;
+	lle->lle_dom.lo_loi = loi;
+
+	rc = lov_page_slice_fixup(lov, clo);
+	RETURN(rc);
+
+out:
+	if (loi != NULL)
+		OBD_SLAB_FREE_PTR(loi, lov_oinfo_slab);
+	return rc;
+}
+
+/**
+ * Implementation of lov_layout_operations::llo_fini for DOM object.
+ *
+ * Finish the DOM object and free related memory.
+ *
+ * \param[in] env	execution environment
+ * \param[in] lov	LOV object
+ * \param[in] state	LOV layout state
+ */
+static void lov_fini_dom(const struct lu_env *env,
+			 struct lov_layout_entry *lle)
+{
+	if (lle->lle_dom.lo_dom != NULL)
+		lle->lle_dom.lo_dom = NULL;
+	if (lle->lle_dom.lo_loi != NULL)
+		OBD_SLAB_FREE_PTR(lle->lle_dom.lo_loi, lov_oinfo_slab);
+}
+
+static struct lov_comp_layout_entry_ops dom_ops = {
+	.lco_init = lov_init_dom,
+	.lco_fini = lov_fini_dom,
+	.lco_getattr = lov_attr_get_dom,
+};
+
+static int lov_init_composite(const struct lu_env *env, struct lov_device *dev,
+			      struct lov_object *lov, struct lov_stripe_md *lsm,
+			      const struct cl_object_conf *conf,
+			      union lov_layout_state *state)
+{
+	struct lov_layout_composite *comp = &state->composite;
+	struct lov_layout_entry *lle;
+	struct lov_mirror_entry *lre;
+	unsigned int entry_count;
+	unsigned int psz = 0;
+	unsigned int mirror_count;
+	int flr_state = lsm->lsm_flags & LCM_FL_FLR_MASK;
+	int result = 0;
+	unsigned int seq;
+	int i, j, preference;
+	u64 dom_size = 0;
+
+	ENTRY;
+
+	LASSERT(lsm->lsm_entry_count > 0);
+	LASSERT(lov->lo_lsm == NULL);
+	lov->lo_lsm = lsm_addref(lsm);
+	set_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags);
+
+	dump_lsm(D_INODE, lsm);
+
+	entry_count = lsm->lsm_entry_count;
+
+	comp->lo_flags = lsm->lsm_flags;
+	comp->lo_mirror_count = lsm->lsm_mirror_count + 1;
+	comp->lo_entry_count = lsm->lsm_entry_count;
+	comp->lo_preferred_mirror = -1;
+
+	if (equi(flr_state == LCM_FL_NONE, comp->lo_mirror_count > 1))
+		RETURN(-EINVAL);
+
+	OBD_ALLOC_PTR_ARRAY(comp->lo_mirrors, comp->lo_mirror_count);
+	if (comp->lo_mirrors == NULL)
+		RETURN(-ENOMEM);
+
+	OBD_ALLOC_PTR_ARRAY(comp->lo_entries, entry_count);
+	if (comp->lo_entries == NULL)
+		RETURN(-ENOMEM);
+
+	/* Initialize all entry types and extent data first */
+	for (i = 0, j = 0, mirror_count = 1; i < entry_count; i++) {
+		int mirror_id = 0;
+
+		lle = &comp->lo_entries[i];
+
+		lle->lle_lsme = lsm->lsm_entries[i];
+		lle->lle_type = lov_entry_type(lle->lle_lsme);
+		lle->lle_preference = 0;
+		switch (lle->lle_type) {
+		case LOV_PATTERN_RAID0:
+			lle->lle_comp_ops = &raid0_ops;
+			break;
+		case LOV_PATTERN_MDT:
+			/* Allowed to have several DOM stripes in different
+			 * mirrors with the same DoM size.
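+			 * The dom_size consistency check below enforces
+			 * this.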
+			 */
+			if (!dom_size) {
+				dom_size = lle->lle_lsme->lsme_extent.e_end;
+			} else if (dom_size !=
+				   lle->lle_lsme->lsme_extent.e_end) {
+				CERROR("%s: DOM entries with different sizes\n",
+				       lov2obd(dev->ld_lov)->obd_name);
+				dump_lsm(D_ERROR, lsm);
+				RETURN(-EINVAL);
+			}
+			lle->lle_comp_ops = &dom_ops;
+			break;
+		case LOV_PATTERN_FOREIGN:
+			lle->lle_comp_ops = NULL;
+			break;
+		default:
+			CERROR("%s: unknown composite layout entry type %i\n",
+			       lov2obd(dev->ld_lov)->obd_name,
+			       lsm->lsm_entries[i]->lsme_pattern);
+			dump_lsm(D_ERROR, lsm);
+			RETURN(-EIO);
+		}
+
+		lle->lle_extent = &lle->lle_lsme->lsme_extent;
+		lle->lle_valid = !(lle->lle_lsme->lsme_flags & LCME_FL_STALE);
+
+		if (flr_state != LCM_FL_NONE)
+			mirror_id = mirror_id_of(lle->lle_lsme->lsme_id);
+
+		lre = &comp->lo_mirrors[j];
+		if (i > 0) {
+			if (mirror_id == lre->lre_mirror_id) {
+				lre->lre_valid |= lle->lle_valid;
+				lre->lre_stale |= !lle->lle_valid;
+				lre->lre_foreign |=
+					lsme_is_foreign(lle->lle_lsme);
+				lre->lre_end = i;
+				continue;
+			}
+
+			/* new mirror detected, assume that the mirrors
+			 * are sorted in layout */
+			++mirror_count;
+			++j;
+			if (j >= comp->lo_mirror_count)
+				break;
+
+			lre = &comp->lo_mirrors[j];
+		}
+
+		/* entries must be sorted by mirrors */
+		lre->lre_mirror_id = mirror_id;
+		lre->lre_start = lre->lre_end = i;
+		lre->lre_preference = lle->lle_lsme->lsme_flags &
+					LCME_FL_PREF_RD ? 1000 : 0;
+		lre->lre_valid = lle->lle_valid;
+		lre->lre_stale = !lle->lle_valid;
+		lre->lre_foreign = lsme_is_foreign(lle->lle_lsme);
+	}
+
+	/* sanity check for FLR */
+	if (mirror_count != comp->lo_mirror_count) {
+		CDEBUG(D_INODE, DFID
+		       " doesn't have the # of mirrors it claims, %u/%u\n",
+		       PFID(lu_object_fid(lov2lu(lov))), mirror_count,
+		       comp->lo_mirror_count + 1);
+
+		GOTO(out, result = -EINVAL);
+	}
+
+	lov_foreach_layout_entry(lov, lle) {
+		int index = lov_layout_entry_index(lov, lle);
+
+		/**
+		 * If the component has not been init-ed on MDS side, for
+		 * PFL layout, we'd know that the components beyond this one
+		 * will be dynamically init-ed later on file write/trunc ops.
+		 */
+		if (!lsme_inited(lle->lle_lsme))
+			continue;
+
+		if (lsme_is_foreign(lle->lle_lsme))
+			continue;
+
+		result = lle->lle_comp_ops->lco_init(env, dev, lov, index,
+						     conf, lle);
+		if (result < 0)
+			break;
+
+		LASSERT(ergo(psz > 0, psz == result));
+		psz = result;
+	}
+
+	if (psz > 0)
+		cl_object_header(&lov->lo_cl)->coh_page_bufsize += psz;
+
+	/* decide the preferred mirror. It uses the hash value of lov_object
+	 * so that different clients would use different mirrors for read. */
+	mirror_count = 0;
+	preference = -1;
+	seq = hash_long((unsigned long)lov, 8);
+	for (i = 0; i < comp->lo_mirror_count; i++) {
+		unsigned int idx = (i + seq) % comp->lo_mirror_count;
+
+		lre = lov_mirror_entry(lov, idx);
+		if (lre->lre_stale)
+			continue;
+
+		if (lre->lre_foreign)
+			continue;
+
+		mirror_count++; /* valid mirror */
+
+		/* aggregated preference of all involved OSTs */
+		for (j = lre->lre_start; j <= lre->lre_end; j++) {
+			lre->lre_preference +=
+				comp->lo_entries[j].lle_preference;
+		}
+
+		if (lre->lre_preference > preference) {
+			preference = lre->lre_preference;
+			comp->lo_preferred_mirror = idx;
+		}
+	}
+	if (!mirror_count) {
+		CDEBUG(D_INODE, DFID
+		       " doesn't have any valid mirrors\n",
+		       PFID(lu_object_fid(lov2lu(lov))));
+
+		comp->lo_preferred_mirror = 0;
+	}
+
+	LASSERT(comp->lo_preferred_mirror >= 0);
+
+	EXIT;
+out:
+	return result > 0 ?
0 : result; +} + +static int lov_init_empty(const struct lu_env *env, struct lov_device *dev, + struct lov_object *lov, struct lov_stripe_md *lsm, + const struct cl_object_conf *conf, + union lov_layout_state *state) +{ + return 0; +} + +static int lov_init_released(const struct lu_env *env, + struct lov_device *dev, struct lov_object *lov, + struct lov_stripe_md *lsm, + const struct cl_object_conf *conf, + union lov_layout_state *state) +{ + LASSERT(lsm != NULL); + LASSERT(lsm->lsm_is_released); + LASSERT(lov->lo_lsm == NULL); + + lov->lo_lsm = lsm_addref(lsm); + return 0; +} + +static int lov_init_foreign(const struct lu_env *env, + struct lov_device *dev, struct lov_object *lov, + struct lov_stripe_md *lsm, + const struct cl_object_conf *conf, + union lov_layout_state *state) +{ + LASSERT(lsm != NULL); + LASSERT(lov->lo_type == LLT_FOREIGN); + LASSERT(lov->lo_lsm == NULL); + + lov->lo_lsm = lsm_addref(lsm); + return 0; +} + +static int lov_delete_empty(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state) +{ + LASSERT(lov->lo_type == LLT_EMPTY || lov->lo_type == LLT_RELEASED || + lov->lo_type == LLT_FOREIGN); + + lov_layout_wait(env, lov); + return 0; +} + +static int lov_delete_composite(const struct lu_env *env, + struct lov_object *lov, + union lov_layout_state *state) +{ + struct lov_layout_entry *entry; + + ENTRY; + + dump_lsm(D_INODE, lov->lo_lsm); + + lov_layout_wait(env, lov); + lov_foreach_layout_entry(lov, entry) { + if (entry->lle_lsme && lsme_is_foreign(entry->lle_lsme)) + continue; + + lov_delete_raid0(env, lov, entry); + } + + RETURN(0); +} + +static void lov_fini_empty(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state) +{ + LASSERT(lov->lo_type == LLT_EMPTY || lov->lo_type == LLT_RELEASED); +} + +static void lov_fini_composite(const struct lu_env *env, + struct lov_object *lov, + union lov_layout_state *state) +{ + struct lov_layout_composite *comp = &state->composite; + ENTRY; + + if (comp->lo_entries != NULL) { + struct lov_layout_entry *entry; + + lov_foreach_layout_entry(lov, entry) + if (entry->lle_comp_ops) + entry->lle_comp_ops->lco_fini(env, entry); + + OBD_FREE_PTR_ARRAY(comp->lo_entries, comp->lo_entry_count); + comp->lo_entries = NULL; + } + + if (comp->lo_mirrors != NULL) { + OBD_FREE_PTR_ARRAY(comp->lo_mirrors, comp->lo_mirror_count); + comp->lo_mirrors = NULL; + } + + memset(comp, 0, sizeof(*comp)); + + dump_lsm(D_INODE, lov->lo_lsm); + lov_free_memmd(&lov->lo_lsm); + + EXIT; +} + +static void lov_fini_released(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state) +{ + ENTRY; + dump_lsm(D_INODE, lov->lo_lsm); + lov_free_memmd(&lov->lo_lsm); + EXIT; +} + +static int lov_print_empty(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + (*p)(env, cookie, "empty %d\n", + test_bit(LO_LAYOUT_INVALID, &lu2lov(o)->lo_obj_flags)); + return 0; +} + +static int lov_print_composite(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + struct lov_object *lov = lu2lov(o); + struct lov_stripe_md *lsm = lov->lo_lsm; + int i; + + (*p)(env, cookie, "entries: %d, %s, lsm{%p 0x%08X %d %u}:\n", + lsm->lsm_entry_count, + test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags) ? 
"invalid" : + "valid", lsm, lsm->lsm_magic, atomic_read(&lsm->lsm_refc), + lsm->lsm_layout_gen); + + for (i = 0; i < lsm->lsm_entry_count; i++) { + struct lov_stripe_md_entry *lse = lsm->lsm_entries[i]; + struct lov_layout_entry *lle = lov_entry(lov, i); + + (*p)(env, cookie, + DEXT ": { 0x%08X, %u, %#x, %u, %#x, %u, %u }\n", + PEXT(&lse->lsme_extent), lse->lsme_magic, + lse->lsme_id, lse->lsme_pattern, lse->lsme_layout_gen, + lse->lsme_flags, lse->lsme_stripe_count, + lse->lsme_stripe_size); + + if (!lsme_is_foreign(lse)) + lov_print_raid0(env, cookie, p, lle); + } + + return 0; +} + +static int lov_print_released(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + struct lov_object *lov = lu2lov(o); + struct lov_stripe_md *lsm = lov->lo_lsm; + + (*p)(env, cookie, + "released: %s, lsm{%p 0x%08X %d %u}:\n", + test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags) ? "invalid" : + "valid", lsm, lsm->lsm_magic, atomic_read(&lsm->lsm_refc), + lsm->lsm_layout_gen); + return 0; +} + +static int lov_print_foreign(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + struct lov_object *lov = lu2lov(o); + struct lov_stripe_md *lsm = lov->lo_lsm; + + (*p)(env, cookie, + "foreign: %s, lsm{%p 0x%08X %d %u}:\n", + test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags) ? + "invalid" : "valid", lsm, + lsm->lsm_magic, atomic_read(&lsm->lsm_refc), + lsm->lsm_layout_gen); + (*p)(env, cookie, + "raw_ea_content '%.*s'\n", + (int)lsm->lsm_foreign_size, (char *)lsm_foreign(lsm)); + return 0; +} + +/** + * Implements cl_object_operations::coo_attr_get() method for an object + * without stripes (LLT_EMPTY layout type). + * + * The only attributes this layer is authoritative in this case is + * cl_attr::cat_blocks---it's 0. + */ +static int lov_attr_get_empty(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + attr->cat_blocks = 0; + return 0; +} + +static int lov_attr_get_composite(const struct lu_env *env, + struct cl_object *obj, + struct cl_attr *attr) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_layout_entry *entry; + int result = 0; + + ENTRY; + + attr->cat_size = 0; + attr->cat_blocks = 0; + attr->cat_kms = 0; + + lov_foreach_layout_entry(lov, entry) { + struct cl_attr *lov_attr = NULL; + int index = lov_layout_entry_index(lov, entry); + + if (!entry->lle_valid) + continue; + + /* PFL: This component has not been init-ed. 
*/ + if (!lsm_entry_inited(lov->lo_lsm, index)) + continue; + + result = entry->lle_comp_ops->lco_getattr(env, lov, index, + entry, &lov_attr); + if (result < 0) + RETURN(result); + + if (lov_attr == NULL) + continue; + + CDEBUG(D_INODE, "COMP ID #%i: s=%llu m=%llu a=%llu c=%llu " + "b=%llu\n", index - 1, lov_attr->cat_size, + lov_attr->cat_mtime, lov_attr->cat_atime, + lov_attr->cat_ctime, lov_attr->cat_blocks); + + /* merge results */ + attr->cat_blocks += lov_attr->cat_blocks; + if (attr->cat_size < lov_attr->cat_size) + attr->cat_size = lov_attr->cat_size; + if (attr->cat_kms < lov_attr->cat_kms) + attr->cat_kms = lov_attr->cat_kms; + if (attr->cat_atime < lov_attr->cat_atime) + attr->cat_atime = lov_attr->cat_atime; + if (attr->cat_ctime < lov_attr->cat_ctime) + attr->cat_ctime = lov_attr->cat_ctime; + if (attr->cat_mtime < lov_attr->cat_mtime) + attr->cat_mtime = lov_attr->cat_mtime; + } + + RETURN(0); +} + +static int lov_flush_composite(const struct lu_env *env, + struct cl_object *obj, + struct ldlm_lock *lock) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_layout_entry *lle; + int rc = -ENODATA; + + ENTRY; + + lov_foreach_layout_entry(lov, lle) { + if (!lsme_is_dom(lle->lle_lsme)) + continue; + rc = cl_object_flush(env, lovsub2cl(lle->lle_dom.lo_dom), lock); + break; + } + + RETURN(rc); +} + +static int lov_flush_empty(const struct lu_env *env, struct cl_object *obj, + struct ldlm_lock *lock) +{ + return 0; +} + +const static struct lov_layout_operations lov_dispatch[] = { + [LLT_EMPTY] = { + .llo_init = lov_init_empty, + .llo_delete = lov_delete_empty, + .llo_fini = lov_fini_empty, + .llo_print = lov_print_empty, + .llo_page_init = lov_page_init_empty, + .llo_lock_init = lov_lock_init_empty, + .llo_io_init = lov_io_init_empty, + .llo_getattr = lov_attr_get_empty, + .llo_flush = lov_flush_empty, + }, + [LLT_RELEASED] = { + .llo_init = lov_init_released, + .llo_delete = lov_delete_empty, + .llo_fini = lov_fini_released, + .llo_print = lov_print_released, + .llo_page_init = lov_page_init_empty, + .llo_lock_init = lov_lock_init_empty, + .llo_io_init = lov_io_init_released, + .llo_getattr = lov_attr_get_empty, + .llo_flush = lov_flush_empty, + }, + [LLT_COMP] = { + .llo_init = lov_init_composite, + .llo_delete = lov_delete_composite, + .llo_fini = lov_fini_composite, + .llo_print = lov_print_composite, + .llo_page_init = lov_page_init_composite, + .llo_lock_init = lov_lock_init_composite, + .llo_io_init = lov_io_init_composite, + .llo_getattr = lov_attr_get_composite, + .llo_flush = lov_flush_composite, + }, + [LLT_FOREIGN] = { + .llo_init = lov_init_foreign, + .llo_delete = lov_delete_empty, + .llo_fini = lov_fini_released, + .llo_print = lov_print_foreign, + .llo_page_init = lov_page_init_foreign, + .llo_lock_init = lov_lock_init_empty, + .llo_io_init = lov_io_init_empty, + .llo_getattr = lov_attr_get_empty, + .llo_flush = lov_flush_empty, + }, +}; + +/** + * Performs a double-dispatch based on the layout type of an object. + */ +#define LOV_2DISPATCH_NOLOCK(obj, op, ...) 
\ +({ \ + struct lov_object *__obj = (obj); \ + enum lov_layout_type __llt; \ + \ + __llt = __obj->lo_type; \ + LASSERT(__llt < ARRAY_SIZE(lov_dispatch)); \ + lov_dispatch[__llt].op(__VA_ARGS__); \ +}) + +/** + * Return lov_layout_type associated with a given lsm + */ +static enum lov_layout_type lov_type(struct lov_stripe_md *lsm) +{ + if (lsm == NULL) + return LLT_EMPTY; + + if (lsm->lsm_is_released) + return LLT_RELEASED; + + if (lsm->lsm_magic == LOV_MAGIC_V1 || + lsm->lsm_magic == LOV_MAGIC_V3 || + lsm->lsm_magic == LOV_MAGIC_COMP_V1) + return LLT_COMP; + + if (lsm->lsm_magic == LOV_MAGIC_FOREIGN) + return LLT_FOREIGN; + + return LLT_EMPTY; +} + +static inline void lov_conf_freeze(struct lov_object *lov) +{ + CDEBUG(D_INODE, "To take share lov(%p) owner %p/%p\n", + lov, lov->lo_owner, current); + if (lov->lo_owner != current) + down_read(&lov->lo_type_guard); +} + +static inline void lov_conf_thaw(struct lov_object *lov) +{ + CDEBUG(D_INODE, "To release share lov(%p) owner %p/%p\n", + lov, lov->lo_owner, current); + if (lov->lo_owner != current) + up_read(&lov->lo_type_guard); +} + +#define LOV_2DISPATCH_MAYLOCK(obj, op, lock, ...) \ +({ \ + struct lov_object *__obj = (obj); \ + int __lock = !!(lock); \ + typeof(lov_dispatch[0].op(__VA_ARGS__)) __result; \ + \ + if (__lock) \ + lov_conf_freeze(__obj); \ + __result = LOV_2DISPATCH_NOLOCK(obj, op, __VA_ARGS__); \ + if (__lock) \ + lov_conf_thaw(__obj); \ + __result; \ +}) + +/** + * Performs a locked double-dispatch based on the layout type of an object. + */ +#define LOV_2DISPATCH(obj, op, ...) \ + LOV_2DISPATCH_MAYLOCK(obj, op, 1, __VA_ARGS__) + +#define LOV_2DISPATCH_VOID(obj, op, ...) \ +do { \ + struct lov_object *__obj = (obj); \ + enum lov_layout_type __llt; \ + \ + lov_conf_freeze(__obj); \ + __llt = __obj->lo_type; \ + LASSERT(__llt < ARRAY_SIZE(lov_dispatch)); \ + lov_dispatch[__llt].op(__VA_ARGS__); \ + lov_conf_thaw(__obj); \ +} while (0) + +static void lov_conf_lock(struct lov_object *lov) +{ + LASSERT(lov->lo_owner != current); + down_write(&lov->lo_type_guard); + LASSERT(lov->lo_owner == NULL); + lov->lo_owner = current; + CDEBUG(D_INODE, "Took exclusive lov(%p) owner %p\n", + lov, lov->lo_owner); +} + +static void lov_conf_unlock(struct lov_object *lov) +{ + CDEBUG(D_INODE, "To release exclusive lov(%p) owner %p\n", + lov, lov->lo_owner); + lov->lo_owner = NULL; + up_write(&lov->lo_type_guard); +} + +static int lov_layout_wait(const struct lu_env *env, struct lov_object *lov) +{ + ENTRY; + + while (atomic_read(&lov->lo_active_ios) > 0) { + CDEBUG(D_INODE, "file:"DFID" wait for active IO, now: %d.\n", + PFID(lu_object_fid(lov2lu(lov))), + atomic_read(&lov->lo_active_ios)); + + wait_event_idle(lov->lo_waitq, + atomic_read(&lov->lo_active_ios) == 0); + } + RETURN(0); +} + +static int lov_layout_change(const struct lu_env *unused, + struct lov_object *lov, struct lov_stripe_md *lsm, + const struct cl_object_conf *conf) +{ + enum lov_layout_type llt = lov_type(lsm); + union lov_layout_state *state = &lov->u; + const struct lov_layout_operations *old_ops; + const struct lov_layout_operations *new_ops; + struct lov_device *lov_dev = lov_object_dev(lov); + struct lu_env *env; + __u16 refcheck; + int rc; + ENTRY; + + LASSERT(lov->lo_type < ARRAY_SIZE(lov_dispatch)); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + LASSERT(llt < ARRAY_SIZE(lov_dispatch)); + + CDEBUG(D_INODE, DFID" from %s to %s\n", + PFID(lu_object_fid(lov2lu(lov))), + llt2str(lov->lo_type), llt2str(llt)); + + old_ops = 
&lov_dispatch[lov->lo_type]; + new_ops = &lov_dispatch[llt]; + + rc = cl_object_prune(env, &lov->lo_cl); + if (rc != 0) + GOTO(out, rc); + + rc = old_ops->llo_delete(env, lov, &lov->u); + if (rc != 0) + GOTO(out, rc); + + old_ops->llo_fini(env, lov, &lov->u); + + LASSERT(atomic_read(&lov->lo_active_ios) == 0); + + CDEBUG(D_INODE, DFID "Apply new layout lov %p, type %d\n", + PFID(lu_object_fid(lov2lu(lov))), lov, llt); + + /* page bufsize fixup */ + cl_object_header(&lov->lo_cl)->coh_page_bufsize -= + lov_page_slice_fixup(lov, NULL); + + lov->lo_type = llt; + rc = new_ops->llo_init(env, lov_dev, lov, lsm, conf, state); + if (rc != 0) { + struct obd_device *obd = lov2obd(lov_dev->ld_lov); + + CERROR("%s: cannot apply new layout on "DFID" : rc = %d\n", + obd->obd_name, PFID(lu_object_fid(lov2lu(lov))), rc); + new_ops->llo_delete(env, lov, state); + new_ops->llo_fini(env, lov, state); + /* this file becomes an EMPTY file. */ + lov->lo_type = LLT_EMPTY; + GOTO(out, rc); + } + +out: + cl_env_put(env, &refcheck); + RETURN(rc); +} + +/***************************************************************************** + * + * Lov object operations. + * + */ +static int lov_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct lov_object *lov = lu2lov(obj); + struct lov_device *dev = lov_object_dev(lov); + const struct cl_object_conf *cconf = lu2cl_conf(conf); + union lov_layout_state *set = &lov->u; + const struct lov_layout_operations *ops; + struct lov_stripe_md *lsm = NULL; + int rc; + ENTRY; + + init_rwsem(&lov->lo_type_guard); + atomic_set(&lov->lo_active_ios, 0); + init_waitqueue_head(&lov->lo_waitq); + cl_object_page_init(lu2cl(obj), sizeof(struct lov_page)); + + lov->lo_type = LLT_EMPTY; + if (cconf->u.coc_layout.lb_buf != NULL) { + lsm = lov_unpackmd(dev->ld_lov, + cconf->u.coc_layout.lb_buf, + cconf->u.coc_layout.lb_len); + if (IS_ERR(lsm)) + RETURN(PTR_ERR(lsm)); + + dump_lsm(D_INODE, lsm); + } + + /* no locking is necessary, as object is being created */ + lov->lo_type = lov_type(lsm); + ops = &lov_dispatch[lov->lo_type]; + rc = ops->llo_init(env, dev, lov, lsm, cconf, set); + if (rc != 0) + GOTO(out_lsm, rc); + +out_lsm: + lov_lsm_put(lsm); + + RETURN(rc); +} + +static int lov_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf) +{ + struct lov_stripe_md *lsm = NULL; + struct lov_object *lov = cl2lov(obj); + int result = 0; + ENTRY; + + if (conf->coc_opc == OBJECT_CONF_SET && + conf->u.coc_layout.lb_buf != NULL) { + lsm = lov_unpackmd(lov_object_dev(lov)->ld_lov, + conf->u.coc_layout.lb_buf, + conf->u.coc_layout.lb_len); + if (IS_ERR(lsm)) + RETURN(PTR_ERR(lsm)); + dump_lsm(D_INODE, lsm); + } + + if (conf->coc_opc == OBJECT_CONF_INVALIDATE) { + set_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags); + GOTO(out_lsm, result = 0); + } + + lov_conf_lock(lov); + if (conf->coc_opc == OBJECT_CONF_WAIT) { + if (test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags) && + atomic_read(&lov->lo_active_ios) > 0) { + lov_conf_unlock(lov); + result = lov_layout_wait(env, lov); + lov_conf_lock(lov); + } + GOTO(out, result); + } + + LASSERT(conf->coc_opc == OBJECT_CONF_SET); + + if ((lsm == NULL && lov->lo_lsm == NULL) || + ((lsm != NULL && lov->lo_lsm != NULL) && + (lov->lo_lsm->lsm_layout_gen == lsm->lsm_layout_gen) && + (lov->lo_lsm->lsm_flags == lsm->lsm_flags) && + (lov->lo_lsm->lsm_entries[0]->lsme_pattern == + lsm->lsm_entries[0]->lsme_pattern))) { + /* same version of layout */ + clear_bit(LO_LAYOUT_INVALID, 
&lov->lo_obj_flags);
+		GOTO(out, result = 0);
+	}
+
+	/* will change layout - check if there still exists active IO. */
+	if (atomic_read(&lov->lo_active_ios) > 0) {
+		set_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags);
+		GOTO(out, result = -EBUSY);
+	}
+
+	result = lov_layout_change(env, lov, lsm, conf);
+	if (result)
+		set_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags);
+	else
+		clear_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags);
+	EXIT;
+
+out:
+	lov_conf_unlock(lov);
+out_lsm:
+	lov_lsm_put(lsm);
+	CDEBUG(D_INODE, DFID" lo_layout_invalid=%u\n",
+	       PFID(lu_object_fid(lov2lu(lov))),
+	       test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags));
+	RETURN(result);
+}
+
+static void lov_object_delete(const struct lu_env *env, struct lu_object *obj)
+{
+	struct lov_object *lov = lu2lov(obj);
+
+	ENTRY;
+	LOV_2DISPATCH_VOID(lov, llo_delete, env, lov, &lov->u);
+	EXIT;
+}
+
+static void lov_object_free(const struct lu_env *env, struct lu_object *obj)
+{
+	struct lov_object *lov = lu2lov(obj);
+
+	ENTRY;
+	LOV_2DISPATCH_VOID(lov, llo_fini, env, lov, &lov->u);
+	lu_object_fini(obj);
+	OBD_SLAB_FREE_PTR(lov, lov_object_kmem);
+	EXIT;
+}
+
+static int lov_object_print(const struct lu_env *env, void *cookie,
+			    lu_printer_t p, const struct lu_object *o)
+{
+	return LOV_2DISPATCH_NOLOCK(lu2lov(o), llo_print, env, cookie, p, o);
+}
+
+static int lov_page_init(const struct lu_env *env, struct cl_object *obj,
+			 struct cl_page *page, pgoff_t index)
+{
+	return LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_page_init, env, obj, page,
+				    index);
+}
+
+/**
+ * Implements cl_object_operations::clo_io_init() method for lov
+ * layer. Dispatches to the appropriate layout io initialization method.
+ */
+static int lov_io_init(const struct lu_env *env, struct cl_object *obj,
+		       struct cl_io *io)
+{
+	CL_IO_SLICE_CLEAN(lov_env_io(env), lis_preserved);
+
+	CDEBUG(D_INODE, DFID "io %p type %d ignore/verify layout %d/%d\n",
+	       PFID(lu_object_fid(&obj->co_lu)), io, io->ci_type,
+	       io->ci_ignore_layout, io->ci_verify_layout);
+
+	/* IOs of type CIT_MISC with ci_ignore_layout set are usually invoked
+	 * from the OSC layer. They shouldn't take the lov layout conf lock in
+	 * that case, because as long as the OSC object exists, the layout
+	 * can't be reconfigured. */
+	return LOV_2DISPATCH_MAYLOCK(cl2lov(obj), llo_io_init,
+			!(io->ci_ignore_layout && io->ci_type == CIT_MISC),
+			env, obj, io);
+}
+
+/**
+ * An implementation of cl_object_operations::clo_attr_get() method for lov
+ * layer. For raid0 layout this collects and merges attributes of all
+ * sub-objects.
+ */
+static int lov_attr_get(const struct lu_env *env, struct cl_object *obj,
+			struct cl_attr *attr)
+{
+	/* do not take lock, as this function is called under a
+	 * spin-lock. Layout is protected from changing by ongoing IO. */
+	return LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_getattr, env, obj, attr);
+}
+
+static int lov_attr_update(const struct lu_env *env, struct cl_object *obj,
+			   const struct cl_attr *attr, unsigned valid)
+{
+	/*
+	 * No dispatch is required here, as no layout implements this.
+	 */
+	return 0;
+}
+
+static int lov_lock_init(const struct lu_env *env, struct cl_object *obj,
+			 struct cl_lock *lock, const struct cl_io *io)
+{
+	/* No need to lock because we've taken one refcount of layout.  */
+	return LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_lock_init, env, obj, lock,
+				    io);
+}
+
+/**
+ * We calculate on which OST the mapping will end. If the length of mapping
+ * is greater than (stripe_size * stripe_count) then the last_stripe will
+ * be one just before start_stripe. Else we check if the mapping
+ * intersects each OST and find last_stripe.
+ * This function returns the last_stripe and also sets the stripe_count
+ * over which the mapping is spread.
+ *
+ * \param lsm [in]		striping information for the file
+ * \param index [in]		stripe component index
+ * \param ext [in]		logical extent of mapping
+ * \param start_stripe [in]	starting stripe of the mapping
+ * \param stripe_count [out]	the number of stripes across which to map is
+ *				returned
+ *
+ * \retval last_stripe		the last stripe of the mapping
+ */
+static int fiemap_calc_last_stripe(struct lov_stripe_md *lsm, int index,
+				   struct lu_extent *ext,
+				   int start_stripe, int *stripe_count)
+{
+	struct lov_stripe_md_entry *lsme = lsm->lsm_entries[index];
+	int init_stripe;
+	int last_stripe;
+	int i, j;
+
+	init_stripe = lov_stripe_number(lsm, index, ext->e_start);
+
+	if (ext->e_end - ext->e_start >
+	    lsme->lsme_stripe_size * lsme->lsme_stripe_count) {
+		if (init_stripe == start_stripe) {
+			last_stripe = (start_stripe < 1) ?
+				lsme->lsme_stripe_count - 1 : start_stripe - 1;
+			*stripe_count = lsme->lsme_stripe_count;
+		} else if (init_stripe < start_stripe) {
+			last_stripe = (init_stripe < 1) ?
+				lsme->lsme_stripe_count - 1 : init_stripe - 1;
+			*stripe_count = lsme->lsme_stripe_count -
+					(start_stripe - init_stripe);
+		} else {
+			last_stripe = init_stripe - 1;
+			*stripe_count = init_stripe - start_stripe;
+		}
+	} else {
+		for (j = 0, i = start_stripe; j < lsme->lsme_stripe_count;
+		     i = (i + 1) % lsme->lsme_stripe_count, j++) {
+			if (!lov_stripe_intersects(lsm, index, i, ext, NULL,
+						   NULL))
+				break;
+			if ((start_stripe != init_stripe) && (i == init_stripe))
+				break;
+		}
+		*stripe_count = j;
+		last_stripe = (start_stripe + j - 1) % lsme->lsme_stripe_count;
+	}
+
+	return last_stripe;
+}
+
+/**
+ * Set fe_device and copy extents from local buffer into main return buffer.
+ *
+ * \param fiemap [out]		fiemap to hold all extents
+ * \param lcl_fm_ext [in]	array of fiemap extents received from the
+ *				OSC layer
+ * \param ost_index [in]	OST index to be written into the fm_device
+ *				field for each extent
+ * \param ext_count [in]	number of extents to be copied
+ * \param current_extent [in]	where to start copying in the extent array
+ */
+static void fiemap_prepare_and_copy_exts(struct fiemap *fiemap,
+					 struct fiemap_extent *lcl_fm_ext,
+					 int ost_index, unsigned int ext_count,
+					 int current_extent, int abs_stripeno)
+{
+	char *to;
+	unsigned int ext;
+
+	for (ext = 0; ext < ext_count; ext++) {
+		set_fe_device_stripenr(&lcl_fm_ext[ext], ost_index,
+				       abs_stripeno);
+		lcl_fm_ext[ext].fe_flags |= FIEMAP_EXTENT_NET;
+	}
+
+	/* Copy fm_extent's from fm_local to return buffer */
+	to = (char *)fiemap + fiemap_count_to_size(current_extent);
+	memcpy(to, lcl_fm_ext, ext_count * sizeof(struct fiemap_extent));
+}
+
+#define FIEMAP_BUFFER_SIZE	4096
+
+/**
+ * Non-zero fe_logical indicates that this is a continuation FIEMAP
+ * call. The local end offset and the device are sent in the first
+ * fm_extent. The stripe number on which mapping is to be restarted is
+ * passed in and returned through \a start_stripe.
+ *
+ * This function returns fm_end_offset, which is the in-OST offset at which
+ * mapping should be restarted. If fm_end_offset=0 is returned then the
+ * caller will re-calculate the proper offset in the next stripe.
+ * Note that the first extent is passed to lov_get_info via the value field.
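+ * fiemap_for_stripe() later uses the returned offset as obd_start for
+ * the starting stripe.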
+ * + * \param fiemap [in] fiemap request header + * \param lsm [in] striping information for the file + * \param index [in] stripe component index + * \param ext [in] logical extent of mapping + * \param start_stripe [out] starting stripe will be returned in this + */ +static u64 fiemap_calc_fm_end_offset(struct fiemap *fiemap, + struct lov_stripe_md *lsm, + int index, struct lu_extent *ext, + int *start_stripe) +{ + struct lov_stripe_md_entry *lsme = lsm->lsm_entries[index]; + u64 local_end = fiemap->fm_extents[0].fe_logical; + u64 lun_end; + u64 fm_end_offset; + int stripe_no = -1; + + if (fiemap->fm_extent_count == 0 || + fiemap->fm_extents[0].fe_logical == 0) + return 0; + + stripe_no = *start_stripe; + + if (stripe_no == -1) + return -EINVAL; + + /* If we have finished mapping on previous device, shift logical + * offset to start of next device */ + if (lov_stripe_intersects(lsm, index, stripe_no, ext, NULL, &lun_end) && + local_end < lun_end) { + fm_end_offset = local_end; + } else { + /* This is a special value to indicate that caller should + * calculate offset in next stripe. */ + fm_end_offset = 0; + *start_stripe = (stripe_no + 1) % lsme->lsme_stripe_count; + } + + return fm_end_offset; +} + +struct fiemap_state { + struct fiemap *fs_fm; + struct lu_extent fs_ext; /* current entry extent */ + u64 fs_length; + u64 fs_end_offset; /* last iteration offset */ + int fs_cur_extent; /* collected exts so far */ + int fs_cnt_need; /* # of extents buf can hold */ + int fs_start_stripe; + int fs_last_stripe; + bool fs_device_done; /* enough for this OST */ + bool fs_finish_stripe; /* reached fs_last_stripe */ + bool fs_enough; /* enough for this call */ +}; + +static struct cl_object *lov_find_subobj(const struct lu_env *env, + struct lov_object *lov, + struct lov_stripe_md *lsm, + int index) +{ + struct lov_device *dev = lu2lov_dev(lov2lu(lov)->lo_dev); + struct lov_thread_info *lti = lov_env_info(env); + struct lu_fid *ofid = <i->lti_fid; + struct lov_oinfo *oinfo; + struct cl_device *subdev; + int entry = lov_comp_entry(index); + int stripe = lov_comp_stripe(index); + int ost_idx; + int rc; + struct cl_object *result; + + if (lov->lo_type != LLT_COMP) + GOTO(out, result = NULL); + + if (entry >= lsm->lsm_entry_count || + stripe >= lsm->lsm_entries[entry]->lsme_stripe_count) + GOTO(out, result = NULL); + + oinfo = lsm->lsm_entries[entry]->lsme_oinfo[stripe]; + ost_idx = oinfo->loi_ost_idx; + rc = ostid_to_fid(ofid, &oinfo->loi_oi, ost_idx); + if (rc != 0) + GOTO(out, result = NULL); + + subdev = lovsub2cl_dev(dev->ld_target[ost_idx]); + result = lov_sub_find(env, subdev, ofid, NULL); +out: + if (result == NULL) + result = ERR_PTR(-EINVAL); + return result; +} + +static int fiemap_for_stripe(const struct lu_env *env, struct cl_object *obj, + struct lov_stripe_md *lsm, struct fiemap *fiemap, + size_t *buflen, struct ll_fiemap_info_key *fmkey, + int index, int stripe_last, int stripeno, + struct fiemap_state *fs) +{ + struct lov_stripe_md_entry *lsme = lsm->lsm_entries[index]; + struct cl_object *subobj; + struct lov_obd *lov = lu2lov_dev(obj->co_lu.lo_dev)->ld_lov; + struct fiemap_extent *fm_ext = &fs->fs_fm->fm_extents[0]; + u64 req_fm_len; /* max requested extent coverage */ + u64 len_mapped_single_call; + u64 obd_start; + u64 obd_end; + unsigned int ext_count; + /* EOF for object */ + bool ost_eof = false; + /* done with required mapping for this OST? 
*/ + bool ost_done = false; + int ost_index; + int rc = 0; + + fs->fs_device_done = false; + /* Find out range of mapping on this stripe */ + if ((lov_stripe_intersects(lsm, index, stripeno, &fs->fs_ext, + &obd_start, &obd_end)) == 0) + return 0; + + if (lov_oinfo_is_dummy(lsme->lsme_oinfo[stripeno])) + return -EIO; + + /* If this is a continuation FIEMAP call and we are on + * starting stripe then obd_start needs to be set to + * end_offset */ + if (fs->fs_end_offset != 0 && stripeno == fs->fs_start_stripe) + obd_start = fs->fs_end_offset; + + if (lov_size_to_stripe(lsm, index, fs->fs_ext.e_end, stripeno) == + obd_start) + return 0; + + req_fm_len = obd_end - obd_start + 1; + fs->fs_fm->fm_length = 0; + len_mapped_single_call = 0; + + /* find lobsub object */ + subobj = lov_find_subobj(env, cl2lov(obj), lsm, + lov_comp_index(index, stripeno)); + if (IS_ERR(subobj)) + return PTR_ERR(subobj); + /* If the output buffer is very large and the objects have many + * extents we may need to loop on a single OST repeatedly */ + do { + if (fiemap->fm_extent_count > 0) { + /* Don't get too many extents. */ + if (fs->fs_cur_extent + fs->fs_cnt_need > + fiemap->fm_extent_count) + fs->fs_cnt_need = fiemap->fm_extent_count - + fs->fs_cur_extent; + } + + obd_start += len_mapped_single_call; + fs->fs_fm->fm_length = req_fm_len - len_mapped_single_call; + req_fm_len = fs->fs_fm->fm_length; + /** + * If we've collected enough extent map, we'd request 1 more, + * to see whether we coincidentally finished all available + * extent map, so that FIEMAP_EXTENT_LAST would be set. + */ + fs->fs_fm->fm_extent_count = fs->fs_enough ? + 1 : fs->fs_cnt_need; + fs->fs_fm->fm_mapped_extents = 0; + fs->fs_fm->fm_flags = fiemap->fm_flags; + + ost_index = lsme->lsme_oinfo[stripeno]->loi_ost_idx; + + if (ost_index < 0 || ost_index >= lov->desc.ld_tgt_count) + GOTO(obj_put, rc = -EINVAL); + /* If OST is inactive, return extent with UNKNOWN flag. */ + if (!lov->lov_tgts[ost_index]->ltd_active) { + fs->fs_fm->fm_flags |= FIEMAP_EXTENT_LAST; + fs->fs_fm->fm_mapped_extents = 1; + + fm_ext[0].fe_logical = obd_start; + fm_ext[0].fe_length = obd_end - obd_start + 1; + fm_ext[0].fe_flags |= FIEMAP_EXTENT_UNKNOWN; + + goto inactive_tgt; + } + + fs->fs_fm->fm_start = obd_start; + fs->fs_fm->fm_flags &= ~FIEMAP_FLAG_DEVICE_ORDER; + memcpy(&fmkey->lfik_fiemap, fs->fs_fm, sizeof(*fs->fs_fm)); + *buflen = fiemap_count_to_size(fs->fs_fm->fm_extent_count); + + rc = cl_object_fiemap(env, subobj, fmkey, fs->fs_fm, buflen); + if (rc != 0) + GOTO(obj_put, rc); +inactive_tgt: + ext_count = fs->fs_fm->fm_mapped_extents; + if (ext_count == 0) { + ost_done = true; + fs->fs_device_done = true; + /* If last stripe has hold at the end, + * we need to return */ + if (stripeno == fs->fs_last_stripe) { + fiemap->fm_mapped_extents = 0; + fs->fs_finish_stripe = true; + GOTO(obj_put, rc); + } + break; + } else if (fs->fs_enough) { + /* + * We've collected enough extents and there are + * more extents after it. + */ + GOTO(obj_put, rc); + } + + /* If we just need num of extents, got to next device */ + if (fiemap->fm_extent_count == 0) { + fs->fs_cur_extent += ext_count; + break; + } + + /* prepare to copy retrived map extents */ + len_mapped_single_call = fm_ext[ext_count - 1].fe_logical + + fm_ext[ext_count - 1].fe_length - + obd_start; + + /* Have we finished mapping on this device? 
*/ + if (req_fm_len <= len_mapped_single_call) { + ost_done = true; + fs->fs_device_done = true; + } + + /* Clear the EXTENT_LAST flag which can be present on + * the last extent */ + if (fm_ext[ext_count - 1].fe_flags & FIEMAP_EXTENT_LAST) + fm_ext[ext_count - 1].fe_flags &= ~FIEMAP_EXTENT_LAST; + if (lov_stripe_size(lsm, index, + fm_ext[ext_count - 1].fe_logical + + fm_ext[ext_count - 1].fe_length, + stripeno) >= fmkey->lfik_oa.o_size) { + ost_eof = true; + fs->fs_device_done = true; + } + + fiemap_prepare_and_copy_exts(fiemap, fm_ext, ost_index, + ext_count, fs->fs_cur_extent, + stripe_last + stripeno); + fs->fs_cur_extent += ext_count; + + /* Ran out of available extents? */ + if (fs->fs_cur_extent >= fiemap->fm_extent_count) + fs->fs_enough = true; + } while (!ost_done && !ost_eof); + + if (stripeno == fs->fs_last_stripe) + fs->fs_finish_stripe = true; +obj_put: + cl_object_put(env, subobj); + + return rc; +} + +/** + * Break down the FIEMAP request and send appropriate calls to individual OSTs. + * This also handles the restarting of FIEMAP calls in case mapping overflows + * the available number of extents in single call. + * + * \param env [in] lustre environment + * \param obj [in] file object + * \param fmkey [in] fiemap request header and other info + * \param fiemap [out] fiemap buffer holding retrived map extents + * \param buflen [in/out] max buffer length of @fiemap, when iterate + * each OST, it is used to limit max map needed + * \retval 0 success + * \retval < 0 error + */ +static int lov_object_fiemap(const struct lu_env *env, struct cl_object *obj, + struct ll_fiemap_info_key *fmkey, + struct fiemap *fiemap, size_t *buflen) +{ + struct lov_stripe_md_entry *lsme; + struct lov_stripe_md *lsm; + struct fiemap *fm_local = NULL; + loff_t whole_start; + loff_t whole_end; + int entry; + int start_entry = -1; + int end_entry; + int cur_stripe = 0; + int stripe_count; + unsigned int buffer_size = FIEMAP_BUFFER_SIZE; + int rc = 0; + struct fiemap_state fs = { 0 }; + struct lu_extent range; + int cur_ext; + int stripe_last; + int start_stripe = 0; + bool resume = false; + ENTRY; + + lsm = lov_lsm_addref(cl2lov(obj)); + if (lsm == NULL) { + /* no extent: there is no object for mapping */ + fiemap->fm_mapped_extents = 0; + return 0; + } + + if (!(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER)) { + /** + * If the entry count > 1 or stripe_count > 1 and the + * application does not understand DEVICE_ORDER flag, + * it cannot interpret the extents correctly. + */ + if (lsm->lsm_entry_count > 1 || + (lsm->lsm_entry_count == 1 && + lsm->lsm_entries[0]->lsme_stripe_count > 1)) + GOTO(out_lsm, rc = -ENOTSUPP); + } + + /* No support for DOM layout yet. */ + if (lsme_is_dom(lsm->lsm_entries[0])) + GOTO(out_lsm, rc = -ENOTSUPP); + + if (lsm->lsm_is_released) { + if (fiemap->fm_start < fmkey->lfik_oa.o_size) { + /** + * released file, return a minimal FIEMAP if + * request fits in file-size. + */ + fiemap->fm_mapped_extents = 1; + fiemap->fm_extents[0].fe_logical = fiemap->fm_start; + if (fiemap->fm_start + fiemap->fm_length < + fmkey->lfik_oa.o_size) + fiemap->fm_extents[0].fe_length = + fiemap->fm_length; + else + fiemap->fm_extents[0].fe_length = + fmkey->lfik_oa.o_size - + fiemap->fm_start; + fiemap->fm_extents[0].fe_flags |= + FIEMAP_EXTENT_UNKNOWN | FIEMAP_EXTENT_LAST; + } + GOTO(out_lsm, rc = 0); + } + + /* buffer_size is small to hold fm_extent_count of extents. 
*/ + if (fiemap_count_to_size(fiemap->fm_extent_count) < buffer_size) + buffer_size = fiemap_count_to_size(fiemap->fm_extent_count); + + OBD_ALLOC_LARGE(fm_local, buffer_size); + if (fm_local == NULL) + GOTO(out_lsm, rc = -ENOMEM); + + /** + * Requested extent count exceeds the fiemap buffer size, shrink our + * ambition. + */ + if (fiemap_count_to_size(fiemap->fm_extent_count) > *buflen) + fiemap->fm_extent_count = fiemap_size_to_count(*buflen); + + fs.fs_enough = false; + fs.fs_cur_extent = 0; + fs.fs_fm = fm_local; + fs.fs_cnt_need = fiemap_size_to_count(buffer_size); + + whole_start = fiemap->fm_start; + /* whole_start is beyond the end of the file */ + if (whole_start > fmkey->lfik_oa.o_size) + GOTO(out_fm_local, rc = -EINVAL); + whole_end = (fiemap->fm_length == OBD_OBJECT_EOF) ? + fmkey->lfik_oa.o_size + 1 : + whole_start + fiemap->fm_length; + /** + * If fiemap->fm_length != OBD_OBJECT_EOF but whole_end exceeds file + * size + */ + if (whole_end > fmkey->lfik_oa.o_size + 1) + whole_end = fmkey->lfik_oa.o_size + 1; + + /** + * the high 16bits of fe_device remember which stripe the last + * call has been arrived, we'd continue from there in this call. + */ + if (fiemap->fm_extent_count && fiemap->fm_extents[0].fe_logical) + resume = true; + stripe_last = get_fe_stripenr(&fiemap->fm_extents[0]); + /** + * stripe_last records stripe number we've been processed in the last + * call + */ + end_entry = lsm->lsm_entry_count - 1; + cur_stripe = 0; + for (entry = 0; entry <= end_entry; entry++) { + lsme = lsm->lsm_entries[entry]; + if (cur_stripe + lsme->lsme_stripe_count >= stripe_last) { + start_entry = entry; + start_stripe = stripe_last - cur_stripe; + break; + } + + cur_stripe += lsme->lsme_stripe_count; + } + if (start_entry == -1) { + CERROR(DFID": FIEMAP does not init start entry, cur_stripe=%d, " + "stripe_last=%d\n", PFID(lu_object_fid(&obj->co_lu)), + cur_stripe, stripe_last); + GOTO(out_fm_local, rc = -EINVAL); + } + /** + * @start_entry & @start_stripe records the position of fiemap + * resumption @stripe_last keeps recording the absolution position + * we'are processing. @resume indicates we'd honor @start_stripe. 
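+ * (Illustrative example: with entry 0 holding 2 stripes and entry 1
+ * holding 4, a saved stripe_last of 5 resumes at start_entry = 1 and
+ * start_stripe = 5 - 2 = 3.)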
+ */ + + range.e_start = whole_start; + range.e_end = whole_end; + + for (entry = start_entry; entry <= end_entry; entry++) { + /* remeber to update stripe_last accordingly */ + lsme = lsm->lsm_entries[entry]; + + /* FLR could contain component holes between entries */ + if (!lsme_inited(lsme)) { + stripe_last += lsme->lsme_stripe_count; + resume = false; + continue; + } + + if (!lu_extent_is_overlapped(&range, &lsme->lsme_extent)) { + stripe_last += lsme->lsme_stripe_count; + resume = false; + continue; + } + + /* prepare for a component entry iteration */ + if (lsme->lsme_extent.e_start > whole_start) + fs.fs_ext.e_start = lsme->lsme_extent.e_start; + else + fs.fs_ext.e_start = whole_start; + if (lsme->lsme_extent.e_end > whole_end) + fs.fs_ext.e_end = whole_end; + else + fs.fs_ext.e_end = lsme->lsme_extent.e_end; + + /* Calculate start stripe, last stripe and length of mapping */ + if (resume) { + fs.fs_start_stripe = start_stripe; + /* put stripe_last to the first stripe of the comp */ + stripe_last -= start_stripe; + resume = false; + } else { + fs.fs_start_stripe = lov_stripe_number(lsm, entry, + fs.fs_ext.e_start); + } + fs.fs_last_stripe = fiemap_calc_last_stripe(lsm, entry, + &fs.fs_ext, fs.fs_start_stripe, + &stripe_count); + /** + * A new mirror component is under process, reset + * fs.fs_end_offset and then fiemap_for_stripe() starts from + * the overlapping extent, otherwise starts from + * fs.fs_end_offset. + */ + if (entry > start_entry && lsme->lsme_extent.e_start == 0) { + /* new mirror */ + fs.fs_end_offset = 0; + } else { + fs.fs_end_offset = fiemap_calc_fm_end_offset(fiemap, + lsm, entry, &fs.fs_ext, + &fs.fs_start_stripe); + } + + /* Check each stripe */ + for (cur_stripe = fs.fs_start_stripe; stripe_count > 0; + --stripe_count, + cur_stripe = (cur_stripe + 1) % lsme->lsme_stripe_count) { + /* reset fs_finish_stripe */ + fs.fs_finish_stripe = false; + rc = fiemap_for_stripe(env, obj, lsm, fiemap, buflen, + fmkey, entry, stripe_last, + cur_stripe, &fs); + if (rc < 0) + GOTO(out_fm_local, rc); + if (fs.fs_enough) { + stripe_last += cur_stripe; + GOTO(finish, rc); + } + if (fs.fs_finish_stripe) + break; + } /* for each stripe */ + stripe_last += lsme->lsme_stripe_count; + } /* for covering layout component entry */ + +finish: + if (fs.fs_cur_extent > 0) + cur_ext = fs.fs_cur_extent - 1; + else + cur_ext = 0; + + /* done all the processing */ + if (entry > end_entry) + fiemap->fm_extents[cur_ext].fe_flags |= FIEMAP_EXTENT_LAST; + + /* Indicate that we are returning device offsets unless file just has + * single stripe */ + if (lsm->lsm_entry_count > 1 || + (lsm->lsm_entry_count == 1 && + lsm->lsm_entries[0]->lsme_stripe_count > 1)) + fiemap->fm_flags |= FIEMAP_FLAG_DEVICE_ORDER; + + if (fiemap->fm_extent_count == 0) + goto skip_last_device_calc; + +skip_last_device_calc: + fiemap->fm_mapped_extents = fs.fs_cur_extent; +out_fm_local: + OBD_FREE_LARGE(fm_local, buffer_size); + +out_lsm: + lov_lsm_put(lsm); + return rc; +} + +static int lov_object_getstripe(const struct lu_env *env, struct cl_object *obj, + struct lov_user_md __user *lum, size_t size) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_stripe_md *lsm; + int rc = 0; + ENTRY; + + lsm = lov_lsm_addref(lov); + if (lsm == NULL) + RETURN(-ENODATA); + + rc = lov_getstripe(env, cl2lov(obj), lsm, lum, size); + lov_lsm_put(lsm); + RETURN(rc); +} + +static int lov_object_layout_get(const struct lu_env *env, + struct cl_object *obj, + struct cl_layout *cl) +{ + struct lov_object *lov = cl2lov(obj); + struct 
lov_stripe_md *lsm = lov_lsm_addref(lov); + struct lu_buf *buf = &cl->cl_buf; + ssize_t rc; + ENTRY; + + if (lsm == NULL) { + cl->cl_size = 0; + cl->cl_layout_gen = CL_LAYOUT_GEN_EMPTY; + + RETURN(0); + } + + cl->cl_size = lov_comp_md_size(lsm); + cl->cl_layout_gen = lsm->lsm_layout_gen; + cl->cl_is_released = lsm->lsm_is_released; + cl->cl_is_composite = lsm_is_composite(lsm->lsm_magic); + + rc = lov_lsm_pack(lsm, buf->lb_buf, buf->lb_len); + lov_lsm_put(lsm); + + /* return error or number of bytes */ + RETURN(rc); +} + +static loff_t lov_object_maxbytes(struct cl_object *obj) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_stripe_md *lsm = lov_lsm_addref(lov); + loff_t maxbytes; + + if (lsm == NULL) + return LLONG_MAX; + + maxbytes = lsm->lsm_maxbytes; + + lov_lsm_put(lsm); + + return maxbytes; +} + +static int lov_object_flush(const struct lu_env *env, struct cl_object *obj, + struct ldlm_lock *lock) +{ + return LOV_2DISPATCH_MAYLOCK(cl2lov(obj), llo_flush, true, env, obj, + lock); +} + +static const struct cl_object_operations lov_ops = { + .coo_page_init = lov_page_init, + .coo_lock_init = lov_lock_init, + .coo_io_init = lov_io_init, + .coo_attr_get = lov_attr_get, + .coo_attr_update = lov_attr_update, + .coo_conf_set = lov_conf_set, + .coo_getstripe = lov_object_getstripe, + .coo_layout_get = lov_object_layout_get, + .coo_maxbytes = lov_object_maxbytes, + .coo_fiemap = lov_object_fiemap, + .coo_object_flush = lov_object_flush +}; + +static const struct lu_object_operations lov_lu_obj_ops = { + .loo_object_init = lov_object_init, + .loo_object_delete = lov_object_delete, + .loo_object_release = NULL, + .loo_object_free = lov_object_free, + .loo_object_print = lov_object_print, + .loo_object_invariant = NULL, +}; + +struct lu_object *lov_object_alloc(const struct lu_env *env, + const struct lu_object_header *unused, + struct lu_device *dev) +{ + struct lov_object *lov; + struct lu_object *obj; + + ENTRY; + OBD_SLAB_ALLOC_PTR_GFP(lov, lov_object_kmem, GFP_NOFS); + if (lov != NULL) { + obj = lov2lu(lov); + lu_object_init(obj, NULL, dev); + lov->lo_cl.co_ops = &lov_ops; + lov->lo_type = -1; /* invalid, to catch uninitialized type */ + /* + * object io operation vector (cl_object::co_iop) is installed + * later in lov_object_init(), as different vectors are used + * for object with different layouts. 
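+	 * (Illustrative note: the LLT_EMPTY, LLT_RELEASED, LLT_COMP and
+	 * LLT_FOREIGN layout types seen in lov_read_and_clear_async_rc()
+	 * below each use their own dispatch vector.)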
+ */ + obj->lo_ops = &lov_lu_obj_ops; + } else + obj = NULL; + RETURN(obj); +} + +static struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov) +{ + struct lov_stripe_md *lsm = NULL; + + lov_conf_freeze(lov); + if (lov->lo_lsm != NULL) { + lsm = lsm_addref(lov->lo_lsm); + CDEBUG(D_INODE, "lsm %p addref %d/%d by %p.\n", + lsm, atomic_read(&lsm->lsm_refc), + test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags), + current); + } + lov_conf_thaw(lov); + return lsm; +} + +int lov_read_and_clear_async_rc(struct cl_object *clob) +{ + struct lu_object *luobj; + int rc = 0; + ENTRY; + + luobj = lu_object_locate(&cl_object_header(clob)->coh_lu, + &lov_device_type); + if (luobj != NULL) { + struct lov_object *lov = lu2lov(luobj); + + lov_conf_freeze(lov); + switch (lov->lo_type) { + case LLT_COMP: { + struct lov_stripe_md *lsm; + int i; + + lsm = lov->lo_lsm; + LASSERT(lsm != NULL); + for (i = 0; i < lsm->lsm_entry_count; i++) { + struct lov_stripe_md_entry *lse = + lsm->lsm_entries[i]; + int j; + + if (!lsme_inited(lse)) + break; + + for (j = 0; j < lse->lsme_stripe_count; j++) { + struct lov_oinfo *loi = + lse->lsme_oinfo[j]; + + if (lov_oinfo_is_dummy(loi)) + continue; + + if (loi->loi_ar.ar_rc && !rc) + rc = loi->loi_ar.ar_rc; + loi->loi_ar.ar_rc = 0; + } + } + } + fallthrough; + case LLT_RELEASED: + case LLT_EMPTY: + case LLT_FOREIGN: + break; + default: + LBUG(); + } + lov_conf_thaw(lov); + } + RETURN(rc); +} +EXPORT_SYMBOL(lov_read_and_clear_async_rc); + +/** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_offset.c b/drivers/staging/lustrefsx/lustre/lov/lov_offset.c new file mode 100644 index 0000000000000..86d4ae9745e07 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_offset.c @@ -0,0 +1,308 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include + +#include + +#include "lov_internal.h" + +loff_t stripe_width(struct lov_stripe_md *lsm, unsigned int index) +{ + struct lov_stripe_md_entry *entry = lsm->lsm_entries[index]; + + LASSERT(index < lsm->lsm_entry_count); + + if (lsme_is_dom(entry)) + return (loff_t)entry->lsme_stripe_size; + + return (loff_t)entry->lsme_stripe_size * entry->lsme_stripe_count; +} + +/* compute object size given "stripeno" and the ost size */ +u64 lov_stripe_size(struct lov_stripe_md *lsm, int index, u64 ost_size, + int stripeno) +{ + unsigned long ssize = lsm->lsm_entries[index]->lsme_stripe_size; + unsigned long stripe_size; + loff_t swidth; + loff_t lov_size; + + ENTRY; + + if (ost_size == 0) + RETURN(0); + + swidth = stripe_width(lsm, index); + + /* lov_do_div64(a, b) returns a % b, and a = a / b */ + stripe_size = lov_do_div64(ost_size, ssize); + if (stripe_size) + lov_size = ost_size * swidth + stripeno * ssize + stripe_size; + else + lov_size = (ost_size - 1) * swidth + (stripeno + 1) * ssize; + + RETURN(lov_size); +} + +/** + * Compute file level page index by stripe level page offset + */ +pgoff_t lov_stripe_pgoff(struct lov_stripe_md *lsm, int index, + pgoff_t stripe_index, int stripe) +{ + loff_t offset; + + offset = lov_stripe_size(lsm, index, + (stripe_index << PAGE_SHIFT) + 1, + stripe); + return offset >> PAGE_SHIFT; +} + +/* + * we have an offset in file backed by an lov and want to find out where + * that offset lands in our given stripe of the file. for the easy + * case where the offset is within the stripe, we just have to scale the + * offset down to make it relative to the stripe instead of the lov. + * + * the harder case is what to do when the offset doesn't intersect the + * stripe. callers will want start offsets clamped ahead to the start + * of the nearest stripe in the file. end offsets similarly clamped to the + * nearest ending byte of a stripe in the file: + * + * all this function does is move offsets to the nearest region of the + * stripe, and it does its work "mod" the full length of all the stripes. + * consider a file with 3 stripes: + * + * S E + * --------------------------------------------------------------------- + * | 0 | 1 | 2 | 0 | 1 | 2 | + * --------------------------------------------------------------------- + * + * to find stripe 1's offsets for S and E, it divides by the full stripe + * width and does its math in the context of a single set of stripes: + * + * S E + * ----------------------------------- + * | 0 | 1 | 2 | + * ----------------------------------- + * + * it'll notice that E is outside stripe 1 and clamp it to the end of the + * stripe, then multiply it back out by lov_off to give the real offsets in + * the stripe: + * + * S E + * --------------------------------------------------------------------- + * | 1 | 1 | 1 | 1 | 1 | 1 | + * --------------------------------------------------------------------- + * + * it would have done similarly and pulled S forward to the start of a 1 + * stripe if, say, S had landed in a 0 stripe. + * + * this rounding isn't always correct. consider an E lov offset that lands + * on a 0 stripe, the "mod stripe width" math will pull it forward to the + * start of a 1 stripe, when in fact it wanted to be rounded back to the end + * of a previous 1 stripe. 
this logic is handled by callers and this is why:
+ *
+ * this function returns < 0 when the offset was "before" the stripe and
+ * was moved forward to the start of the stripe in question; 0 when it
+ * falls in the stripe and no shifting was done; > 0 when the offset
+ * was outside the stripe and was pulled back to its final byte.
+ */
+int lov_stripe_offset(struct lov_stripe_md *lsm, int index, loff_t lov_off,
+		      int stripeno, loff_t *obdoff)
+{
+	unsigned long ssize = lsm->lsm_entries[index]->lsme_stripe_size;
+	loff_t stripe_off;
+	loff_t this_stripe;
+	loff_t swidth;
+	int ret = 0;
+
+	if (lov_off == OBD_OBJECT_EOF) {
+		*obdoff = OBD_OBJECT_EOF;
+		return 0;
+	}
+
+	swidth = stripe_width(lsm, index);
+
+	/* lov_do_div64(a, b) returns a % b, and a = a / b */
+	stripe_off = lov_do_div64(lov_off, swidth);
+
+	this_stripe = (loff_t)stripeno * ssize;
+	if (stripe_off < this_stripe) {
+		stripe_off = 0;
+		ret = -1;
+	} else {
+		stripe_off -= this_stripe;
+
+		if (stripe_off >= ssize) {
+			stripe_off = ssize;
+			ret = 1;
+		}
+	}
+
+	*obdoff = lov_off * ssize + stripe_off;
+	return ret;
+}
+
+/*
+ * Given a whole-file size and a stripe number, give the file size which
+ * corresponds to the individual object of that stripe.
+ *
+ * This behaves basically in the same way as lov_stripe_offset, except that
+ * file sizes falling before the beginning of a stripe are clamped to the end
+ * of the previous stripe, not the beginning of the next:
+ *
+ *                                               S
+ * ---------------------------------------------------------------------
+ * |    0    |     1     |     2     |    0    |     1     |     2     |
+ * ---------------------------------------------------------------------
+ *
+ * if clamped to stripe 2 becomes:
+ *
+ *                                   S
+ * ---------------------------------------------------------------------
+ * |    0    |     1     |     2     |    0    |     1     |     2     |
+ * ---------------------------------------------------------------------
+ */
+loff_t lov_size_to_stripe(struct lov_stripe_md *lsm, int index, u64 file_size,
+			  int stripeno)
+{
+	unsigned long ssize = lsm->lsm_entries[index]->lsme_stripe_size;
+	loff_t stripe_off;
+	loff_t this_stripe;
+	loff_t swidth;
+
+	if (file_size == OBD_OBJECT_EOF)
+		return OBD_OBJECT_EOF;
+
+	swidth = stripe_width(lsm, index);
+
+	/* lov_do_div64(a, b) returns a % b, and a = a / b */
+	stripe_off = lov_do_div64(file_size, swidth);
+
+	this_stripe = (loff_t)stripeno * ssize;
+	if (stripe_off < this_stripe) {
+		/* Move to end of previous stripe, or zero */
+		if (file_size > 0) {
+			file_size--;
+			stripe_off = ssize;
+		} else {
+			stripe_off = 0;
+		}
+	} else {
+		stripe_off -= this_stripe;
+
+		if (stripe_off >= ssize) {
+			/* Clamp to end of this stripe */
+			stripe_off = ssize;
+		}
+	}
+
+	return (file_size * ssize + stripe_off);
+}
+
+/*
+ * given an extent in an lov and a stripe, calculate the extent of the stripe
+ * that is contained within the lov extent. this returns true if the given
+ * stripe does intersect with the lov extent.
+ *
+ * Closed interval [@obd_start, @obd_end] will be returned if caller needs them.
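+ *
+ * (Illustrative example: with three 1MiB stripes, file extent
+ * [1.5MiB, 2MiB) lies entirely on stripe 1 and maps to the closed
+ * object interval [512KiB, 1MiB - 1].)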
+ */
+int lov_stripe_intersects(struct lov_stripe_md *lsm, int index, int stripeno,
+			  struct lu_extent *ext, u64 *obd_start, u64 *obd_end)
+{
+	struct lov_stripe_md_entry *entry = lsm->lsm_entries[index];
+	u64 start, end;
+	int start_side, end_side;
+	u64 loc_start, loc_end;
+
+	if (!lu_extent_is_overlapped(ext, &entry->lsme_extent))
+		return 0;
+
+	if (!obd_start)
+		obd_start = &loc_start;
+	if (!obd_end)
+		obd_end = &loc_end;
+
+	start = max_t(__u64, ext->e_start, entry->lsme_extent.e_start);
+	end = min_t(__u64, ext->e_end, entry->lsme_extent.e_end);
+	if (end != OBD_OBJECT_EOF)
+		end--;
+
+	start_side = lov_stripe_offset(lsm, index, start, stripeno, obd_start);
+	end_side = lov_stripe_offset(lsm, index, end, stripeno, obd_end);
+
+	CDEBUG(D_INODE, "[%lld->%lld] -> [(%d) %lld->%lld (%d)]\n",
+	       start, end, start_side, *obd_start, *obd_end, end_side);
+
+	/*
+	 * this stripe doesn't intersect the file extent when neither
+	 * the start nor the end intersected the stripe and obd_start and
+	 * obd_end got rounded up to the same value.
+	 */
+	if (start_side != 0 && end_side != 0 && *obd_start == *obd_end)
+		return 0;
+
+	/*
+	 * as mentioned in the lov_stripe_offset commentary, end
+	 * might have been shifted in the wrong direction. This
+	 * happens when an end offset is before the stripe when viewed
+	 * through the "mod stripe size" math. we detect it being shifted
+	 * in the wrong direction and touch it up.
+	 * interestingly, this can't underflow since end must be > start
+	 * if we passed through the previous check.
+	 * (should we assert for that somewhere?)
+	 */
+	if (end_side != 0)
+		(*obd_end)--;
+
+	return 1;
+}
+
+/* compute which stripe number "lov_off" will be written into */
+int lov_stripe_number(struct lov_stripe_md *lsm, int index, loff_t lov_off)
+{
+	unsigned long ssize = lsm->lsm_entries[index]->lsme_stripe_size;
+	loff_t stripe_off;
+	loff_t swidth;
+
+	swidth = stripe_width(lsm, index);
+
+	stripe_off = lov_do_div64(lov_off, swidth);
+
+	/* Puts stripe_off/ssize result into stripe_off */
+	lov_do_div64(stripe_off, ssize);
+
+	return stripe_off;
+}
diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_pack.c b/drivers/staging/lustrefsx/lustre/lov/lov_pack.c
new file mode 100644
index 0000000000000..42f1446f046da
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/lov/lov_pack.c
@@ -0,0 +1,483 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/lov/lov_pack.c + * + * (Un)packing of OST/MDS requests + * + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include +#include +#include +#include +#include + +#include "lov_cl_internal.h" +#include "lov_internal.h" + +void lov_dump_lmm_common(int level, void *lmmp) +{ + struct lov_mds_md *lmm = lmmp; + struct ost_id oi; + + lmm_oi_le_to_cpu(&oi, &lmm->lmm_oi); + CDEBUG_LIMIT(level, "objid "DOSTID", magic 0x%08x, pattern %#x\n", + POSTID(&oi), le32_to_cpu(lmm->lmm_magic), + le32_to_cpu(lmm->lmm_pattern)); + CDEBUG_LIMIT(level, "stripe_size %u, stripe_count %u, layout_gen %u\n", + le32_to_cpu(lmm->lmm_stripe_size), + le16_to_cpu(lmm->lmm_stripe_count), + le16_to_cpu(lmm->lmm_layout_gen)); +} + +static void lov_dump_lmm_objects(int level, struct lov_ost_data *lod, + int stripe_count) +{ + int i; + + if (stripe_count > LOV_V1_INSANE_STRIPE_COUNT) { + CDEBUG_LIMIT(level, + "bad stripe_count %u > max_stripe_count %u\n", + stripe_count, LOV_V1_INSANE_STRIPE_COUNT); + return; + } + + for (i = 0; i < stripe_count; ++i, ++lod) { + struct ost_id oi; + + ostid_le_to_cpu(&lod->l_ost_oi, &oi); + CDEBUG_LIMIT(level, "stripe %u idx %u subobj "DOSTID"\n", i, + le32_to_cpu(lod->l_ost_idx), POSTID(&oi)); + } +} + +void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm) +{ + lov_dump_lmm_common(level, lmm); + lov_dump_lmm_objects(level, lmm->lmm_objects, + le16_to_cpu(lmm->lmm_stripe_count)); +} + +/** + * Pack LOV striping metadata for disk storage format (in little + * endian byte order). + * + * This follows the getxattr() conventions. If \a buf_size is zero + * then return the size needed. If \a buf_size is too small then + * return -ERANGE. Otherwise return the size of the result. 
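+ * Callers therefore typically probe with a zero \a buf_size to learn the
+ * required size, allocate a buffer, and call again (illustrative usage
+ * note).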
+ */ +static ssize_t lov_lsm_pack_v1v3(const struct lov_stripe_md *lsm, void *buf, + size_t buf_size) +{ + struct lov_mds_md_v1 *lmmv1 = buf; + struct lov_mds_md_v3 *lmmv3 = buf; + struct lov_ost_data_v1 *lmm_objects; + size_t lmm_size; + unsigned int i; + + ENTRY; + + lmm_size = lov_mds_md_size(lsm->lsm_entries[0]->lsme_stripe_count, + lsm->lsm_magic); + if (buf_size == 0) + RETURN(lmm_size); + + if (buf_size < lmm_size) + RETURN(-ERANGE); + + /* + * lmmv1 and lmmv3 point to the same struct and have the + * same first fields + */ + lmmv1->lmm_magic = cpu_to_le32(lsm->lsm_magic); + lmm_oi_cpu_to_le(&lmmv1->lmm_oi, &lsm->lsm_oi); + lmmv1->lmm_stripe_size = cpu_to_le32( + lsm->lsm_entries[0]->lsme_stripe_size); + lmmv1->lmm_stripe_count = cpu_to_le16( + lsm->lsm_entries[0]->lsme_stripe_count); + lmmv1->lmm_pattern = cpu_to_le32(lsm->lsm_entries[0]->lsme_pattern); + lmmv1->lmm_layout_gen = cpu_to_le16(lsm->lsm_layout_gen); + + if (lsm->lsm_magic == LOV_MAGIC_V3) { + BUILD_BUG_ON(sizeof(lsm->lsm_entries[0]->lsme_pool_name) != + sizeof(lmmv3->lmm_pool_name)); + strlcpy(lmmv3->lmm_pool_name, + lsm->lsm_entries[0]->lsme_pool_name, + sizeof(lmmv3->lmm_pool_name)); + lmm_objects = lmmv3->lmm_objects; + } else { + lmm_objects = lmmv1->lmm_objects; + } + + if (lsm->lsm_is_released) + RETURN(lmm_size); + + for (i = 0; i < lsm->lsm_entries[0]->lsme_stripe_count; i++) { + struct lov_oinfo *loi = lsm->lsm_entries[0]->lsme_oinfo[i]; + + ostid_cpu_to_le(&loi->loi_oi, &lmm_objects[i].l_ost_oi); + lmm_objects[i].l_ost_gen = cpu_to_le32(loi->loi_ost_gen); + lmm_objects[i].l_ost_idx = cpu_to_le32(loi->loi_ost_idx); + } + + RETURN(lmm_size); +} + +static ssize_t lov_lsm_pack_foreign(const struct lov_stripe_md *lsm, void *buf, + size_t buf_size) +{ + struct lov_foreign_md *lfm = buf; + size_t lfm_size; + + lfm_size = lsm->lsm_foreign_size; + + if (buf_size == 0) + RETURN(lfm_size); + + /* if buffer too small return ERANGE but copy the size the + * caller has requested anyway. 
This may be useful to get + * only the header without the need to alloc the full size + */ + if (buf_size < lfm_size) { + memcpy(lfm, lsm_foreign(lsm), buf_size); + RETURN(-ERANGE); + } + + /* full foreign LOV is already avail in its cache + * no need to translate format fields to little-endian + */ + memcpy(lfm, lsm_foreign(lsm), lsm->lsm_foreign_size); + + RETURN(lfm_size); +} + +ssize_t lov_lsm_pack(const struct lov_stripe_md *lsm, void *buf, + size_t buf_size) +{ + struct lov_comp_md_v1 *lcmv1 = buf; + struct lov_comp_md_entry_v1 *lcme; + struct lov_ost_data_v1 *lmm_objects; + size_t lmm_size; + unsigned int entry; + unsigned int offset; + unsigned int size; + unsigned int i; + + ENTRY; + + if (lsm->lsm_magic == LOV_MAGIC_V1 || lsm->lsm_magic == LOV_MAGIC_V3) + return lov_lsm_pack_v1v3(lsm, buf, buf_size); + + if (lsm->lsm_magic == LOV_MAGIC_FOREIGN) + return lov_lsm_pack_foreign(lsm, buf, buf_size); + + lmm_size = lov_comp_md_size(lsm); + if (buf_size == 0) + RETURN(lmm_size); + + if (buf_size < lmm_size) + RETURN(-ERANGE); + + lcmv1->lcm_magic = cpu_to_le32(lsm->lsm_magic); + lcmv1->lcm_size = cpu_to_le32(lmm_size); + lcmv1->lcm_layout_gen = cpu_to_le32(lsm->lsm_layout_gen); + lcmv1->lcm_flags = cpu_to_le16(lsm->lsm_flags); + lcmv1->lcm_mirror_count = cpu_to_le16(lsm->lsm_mirror_count); + lcmv1->lcm_entry_count = cpu_to_le16(lsm->lsm_entry_count); + + offset = sizeof(*lcmv1) + sizeof(*lcme) * lsm->lsm_entry_count; + + for (entry = 0; entry < lsm->lsm_entry_count; entry++) { + struct lov_stripe_md_entry *lsme; + struct lov_mds_md *lmm; + __u16 stripe_count; + + lsme = lsm->lsm_entries[entry]; + lcme = &lcmv1->lcm_entries[entry]; + + lcme->lcme_id = cpu_to_le32(lsme->lsme_id); + lcme->lcme_flags = cpu_to_le32(lsme->lsme_flags); + if (lsme->lsme_flags & LCME_FL_NOSYNC) + lcme->lcme_timestamp = + cpu_to_le64(lsme->lsme_timestamp); + lcme->lcme_extent.e_start = + cpu_to_le64(lsme->lsme_extent.e_start); + lcme->lcme_extent.e_end = + cpu_to_le64(lsme->lsme_extent.e_end); + lcme->lcme_offset = cpu_to_le32(offset); + + lmm = (struct lov_mds_md *)((char *)lcmv1 + offset); + lmm->lmm_magic = cpu_to_le32(lsme->lsme_magic); + /* lmm->lmm_oi not set */ + lmm->lmm_pattern = cpu_to_le32(lsme->lsme_pattern); + lmm->lmm_stripe_size = cpu_to_le32(lsme->lsme_stripe_size); + lmm->lmm_stripe_count = cpu_to_le16(lsme->lsme_stripe_count); + lmm->lmm_layout_gen = cpu_to_le16(lsme->lsme_layout_gen); + + if (lsme->lsme_magic == LOV_MAGIC_V3) { + struct lov_mds_md_v3 *lmmv3 = + (struct lov_mds_md_v3 *)lmm; + + strlcpy(lmmv3->lmm_pool_name, lsme->lsme_pool_name, + sizeof(lmmv3->lmm_pool_name)); + lmm_objects = lmmv3->lmm_objects; + } else { + lmm_objects = + ((struct lov_mds_md_v1 *)lmm)->lmm_objects; + } + + if (lsme_inited(lsme) && + !(lsme->lsme_pattern & LOV_PATTERN_F_RELEASED)) + stripe_count = lsme->lsme_stripe_count; + else + stripe_count = 0; + + for (i = 0; i < stripe_count; i++) { + struct lov_oinfo *loi = lsme->lsme_oinfo[i]; + + ostid_cpu_to_le(&loi->loi_oi, &lmm_objects[i].l_ost_oi); + lmm_objects[i].l_ost_gen = + cpu_to_le32(loi->loi_ost_gen); + lmm_objects[i].l_ost_idx = + cpu_to_le32(loi->loi_ost_idx); + } + + size = lov_mds_md_size(stripe_count, lsme->lsme_magic); + lcme->lcme_size = cpu_to_le32(size); + offset += size; + } /* for each layout component */ + + RETURN(lmm_size); +} + +/* Find the max stripecount we should use */ +__u16 lov_get_stripe_count(struct lov_obd *lov, __u32 magic, __u16 stripe_count) +{ + __u32 max_stripes = LOV_MAX_STRIPE_COUNT_OLD; + + if (!stripe_count) + stripe_count 
= lov->desc.ld_default_stripe_count; + if (stripe_count > lov->desc.ld_active_tgt_count) + stripe_count = lov->desc.ld_active_tgt_count; + if (!stripe_count) + stripe_count = 1; + + /* + * stripe count is based on whether ldiskfs can handle + * larger EA sizes + */ + if (lov->lov_ocd.ocd_connect_flags & OBD_CONNECT_MAX_EASIZE && + lov->lov_ocd.ocd_max_easize) + max_stripes = lov_mds_md_max_stripe_count( + lov->lov_ocd.ocd_max_easize, magic); + + if (stripe_count > max_stripes) + stripe_count = max_stripes; + + return stripe_count; +} + +int lov_free_memmd(struct lov_stripe_md **lsmp) +{ + struct lov_stripe_md *lsm = *lsmp; + int refc; + + *lsmp = NULL; + refc = atomic_dec_return(&lsm->lsm_refc); + LASSERT(refc >= 0); + if (refc == 0) + lsm_free(lsm); + + return refc; +} + +/* + * Unpack LOV object metadata from disk storage. It is packed in LE byte + * order and is opaque to the networking layer. + */ +struct lov_stripe_md *lov_unpackmd(struct lov_obd *lov, void *buf, + size_t buf_size) +{ + const struct lsm_operations *op; + struct lov_stripe_md *lsm; + u32 magic; + + ENTRY; + + if (buf_size < sizeof(magic)) + RETURN(ERR_PTR(-EINVAL)); + + magic = le32_to_cpu(*(u32 *)buf); + op = lsm_op_find(magic); + if (!op) + RETURN(ERR_PTR(-EINVAL)); + + lsm = op->lsm_unpackmd(lov, buf, buf_size); + + RETURN(lsm); +} + +/* + * Retrieve object striping information. + * + * @lump is a pointer to an in-core struct with lmm_ost_count indicating + * the maximum number of OST indices which will fit in the user buffer. + * lmm_magic must be LOV_USER_MAGIC. + * + * If @size > 0, User specified limited buffer size, usually the buffer is from + * ll_lov_setstripe(), and the buffer can only hold basic layout template info. + */ +int lov_getstripe(const struct lu_env *env, struct lov_object *obj, + struct lov_stripe_md *lsm, struct lov_user_md __user *lump, + size_t size) +{ + /* we use lov_user_md_v3 because it is larger than lov_user_md_v1 */ + struct lov_mds_md *lmmk, *lmm; + struct lov_foreign_md *lfm; + struct lov_user_md_v1 lum; + size_t lmmk_size, lum_size = 0; + ssize_t lmm_size; + int rc = 0; + + ENTRY; + + if (lsm->lsm_magic != LOV_MAGIC_V1 && lsm->lsm_magic != LOV_MAGIC_V3 && + lsm->lsm_magic != LOV_MAGIC_COMP_V1 && + lsm->lsm_magic != LOV_MAGIC_FOREIGN) { + CERROR("bad LSM MAGIC: 0x%08X != 0x%08X nor 0x%08X\n", + lsm->lsm_magic, LOV_MAGIC_V1, LOV_MAGIC_V3); + GOTO(out, rc = -EIO); + } + + lmmk_size = lov_comp_md_size(lsm); + + OBD_ALLOC_LARGE(lmmk, lmmk_size); + if (!lmmk) + GOTO(out, rc = -ENOMEM); + + lmm_size = lov_lsm_pack(lsm, lmmk, lmmk_size); + if (lmm_size < 0) + GOTO(out_free, rc = lmm_size); + + if (cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) { + if (lmmk->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) || + lmmk->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) { + lustre_swab_lov_mds_md(lmmk); + lustre_swab_lov_user_md_objects( + (struct lov_user_ost_data *)lmmk->lmm_objects, + lmmk->lmm_stripe_count); + } else if (lmmk->lmm_magic == cpu_to_le32(LOV_MAGIC_COMP_V1)) { + lustre_swab_lov_comp_md_v1( + (struct lov_comp_md_v1 *)lmmk); + } else if (lmmk->lmm_magic == cpu_to_le32(LOV_MAGIC_FOREIGN)) { + lfm = (struct lov_foreign_md *)lmmk; + __swab32s(&lfm->lfm_magic); + __swab32s(&lfm->lfm_length); + __swab32s(&lfm->lfm_type); + __swab32s(&lfm->lfm_flags); + } + } + + /* + * Legacy appication passes limited buffer, we need to figure out + * the user buffer size by the passed in lmm_stripe_count. 
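+	 * (Illustrative: for LOV_USER_MAGIC_V1, lov_user_md_size() amounts
+	 * to sizeof(struct lov_user_md_v1) plus room for lmm_stripe_count
+	 * objects, which bounds how much layout we may copy back.)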
+ */ + if (lsm->lsm_magic != LOV_MAGIC_FOREIGN) + if (copy_from_user(&lum, lump, sizeof(struct lov_user_md_v1))) + GOTO(out_free, rc = -EFAULT); + + if (lum.lmm_magic == LOV_USER_MAGIC_V1 || + lum.lmm_magic == LOV_USER_MAGIC_V3) + lum_size = lov_user_md_size(lum.lmm_stripe_count, + lum.lmm_magic); + + if (lum_size != 0) { + struct lov_mds_md *comp_md = lmmk; + + /* + * Legacy app (ADIO for instance) treats the layout as V1/V3 + * blindly, we'd return a reasonable V1/V3 for them. + */ + if (lmmk->lmm_magic == LOV_MAGIC_COMP_V1) { + struct lov_comp_md_v1 *comp_v1; + struct cl_object *cl_obj; + struct cl_attr attr; + int i; + + attr.cat_size = 0; + cl_obj = cl_object_top(&obj->lo_cl); + cl_object_attr_lock(cl_obj); + cl_object_attr_get(env, cl_obj, &attr); + cl_object_attr_unlock(cl_obj); + + /* + * return the last instantiated component if file size + * is non-zero, otherwise, return the last component. + */ + comp_v1 = (struct lov_comp_md_v1 *)lmmk; + i = attr.cat_size == 0 ? comp_v1->lcm_entry_count : 0; + for (; i < comp_v1->lcm_entry_count; i++) { + if (!(comp_v1->lcm_entries[i].lcme_flags & + LCME_FL_INIT)) + break; + } + if (i > 0) + i--; + comp_md = (struct lov_mds_md *)((char *)comp_v1 + + comp_v1->lcm_entries[i].lcme_offset); + lum_size = comp_v1->lcm_entries[i].lcme_size; + } + + lmm = comp_md; + lmm_size = min(lum_size, lmmk_size); + } else { + lmm = lmmk; + lmm_size = lmmk_size; + } + + /** + * User specified limited buffer size, usually the buffer is + * from ll_lov_setstripe(), and the buffer can only hold basic + * layout template info. + */ + if (size == 0 || size > lmm_size) + size = lmm_size; + if (copy_to_user(lump, lmm, size)) + GOTO(out_free, rc = -EFAULT); + +out_free: + OBD_FREE_LARGE(lmmk, lmmk_size); +out: + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_page.c b/drivers/staging/lustrefsx/lustre/lov/lov_page.c new file mode 100644 index 0000000000000..887b304e81d6e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_page.c @@ -0,0 +1,197 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Implementation of cl_page for LOV layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lov page operations. 
+ * + */ + +static int lov_comp_page_print(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t printer) +{ + struct lov_page *lp = cl2lov_page(slice); + + return (*printer)(env, cookie, + LUSTRE_LOV_NAME"-page@%p\n", lp); +} + +static const struct cl_page_operations lov_comp_page_ops = { + .cpo_print = lov_comp_page_print +}; + +int lov_page_init_composite(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index) +{ + struct lov_object *loo = cl2lov(obj); + struct lov_io *lio = lov_env_io(env); + struct cl_object *subobj; + struct cl_object *o; + struct lov_io_sub *sub; + struct lov_page *lpg = cl_object_page_slice(obj, page); + struct lov_layout_raid0 *r0; + loff_t offset; + loff_t suboff; + bool stripe_cached = false; + int entry; + int stripe; + int rc; + + ENTRY; + + /* Direct i/o (CPT_TRANSIENT) is split strictly to stripes, so we can + * cache the stripe information. Buffered i/o is differently + * organized, and stripe calculation isn't a significant cost for + * buffered i/o, so we only cache this for direct i/o. + */ + stripe_cached = lio->lis_cached_entry != LIS_CACHE_ENTRY_NONE && + page->cp_type == CPT_TRANSIENT; + + offset = cl_offset(obj, index); + + if (stripe_cached) { + entry = lio->lis_cached_entry; + stripe = lio->lis_cached_stripe; + /* Offset can never go backwards in an i/o, so this is valid */ + suboff = lio->lis_cached_suboff + offset - lio->lis_cached_off; + } else { + entry = lov_io_layout_at(lio, offset); + + stripe = lov_stripe_number(loo->lo_lsm, entry, offset); + rc = lov_stripe_offset(loo->lo_lsm, entry, offset, stripe, + &suboff); + LASSERT(rc == 0); + lio->lis_cached_entry = entry; + lio->lis_cached_stripe = stripe; + lio->lis_cached_off = offset; + lio->lis_cached_suboff = suboff; + } + + if (entry < 0 || !lsm_entry_inited(loo->lo_lsm, entry)) { + /* non-existing layout component */ + lov_page_init_empty(env, obj, page, index); + RETURN(0); + } + + CDEBUG(D_PAGE, "offset %llu, entry %d, stripe %d, suboff %llu\n", + offset, entry, stripe, suboff); + + page->cp_lov_index = lov_comp_index(entry, stripe); + cl_page_slice_add(page, &lpg->lps_cl, obj, &lov_comp_page_ops); + + if (!stripe_cached) { + sub = lov_sub_get(env, lio, page->cp_lov_index); + if (IS_ERR(sub)) + RETURN(PTR_ERR(sub)); + } else { + sub = lio->lis_cached_sub; + } + + lio->lis_cached_sub = sub; + + r0 = lov_r0(loo, entry); + LASSERT(stripe < r0->lo_nr); + + subobj = lovsub2cl(r0->lo_sub[stripe]); + cl_object_for_each(o, subobj) { + if (o->co_ops->coo_page_init) { + rc = o->co_ops->coo_page_init(sub->sub_env, o, page, + cl_index(subobj, suboff)); + if (rc != 0) + break; + } + } + + RETURN(rc); +} + +static int lov_empty_page_print(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t printer) +{ + struct lov_page *lp = cl2lov_page(slice); + + return (*printer)(env, cookie, LUSTRE_LOV_NAME"-page@%p, empty.\n", lp); +} + +static const struct cl_page_operations lov_empty_page_ops = { + .cpo_print = lov_empty_page_print +}; + +int lov_page_init_empty(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index) +{ + struct lov_page *lpg = cl_object_page_slice(obj, page); + void *addr; + + ENTRY; + + page->cp_lov_index = ~0; + cl_page_slice_add(page, &lpg->lps_cl, obj, &lov_empty_page_ops); + addr = kmap(page->cp_vmpage); + memset(addr, 0, cl_page_size(obj)); + kunmap(page->cp_vmpage); + cl_page_export(env, page, 1); + RETURN(0); +} + +int lov_page_init_foreign(const 
struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index) +{ + CDEBUG(D_PAGE, DFID" has no data\n", PFID(lu_object_fid(&obj->co_lu))); + RETURN(-ENODATA); +} + +bool lov_page_is_empty(const struct cl_page *page) +{ + const struct cl_page_slice *slice = cl_page_at(page, &lov_device_type); + + LASSERT(slice != NULL); + return slice->cpl_ops == &lov_empty_page_ops; +} + + +/** @} lov */ + diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_pool.c b/drivers/staging/lustrefsx/lustre/lov/lov_pool.c new file mode 100644 index 0000000000000..afccd0523c2c9 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_pool.c @@ -0,0 +1,484 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/lov/lov_pool.c + * + * OST pool methods + * + * Author: Jacques-Charles LAFOUCRIERE + * Author: Alex Lyashkov + * Author: Nathaniel Rutman + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include +#include +#include + +#include +#include "lov_internal.h" + +#define pool_tgt(_p, _i) \ + _p->pool_lobd->u.lov.lov_tgts[_p->pool_obds.op_array[_i]] + +static u32 pool_hashfh(const void *data, u32 len, u32 seed) +{ + const char *pool_name = data; + + return hashlen_hash(cfs_hashlen_string((void *)(unsigned long)seed, + pool_name)); +} + +static int pool_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) +{ + const struct pool_desc *pool = obj; + const char *pool_name = arg->key; + + return strcmp(pool_name, pool->pool_name); +} + +static const struct rhashtable_params pools_hash_params = { + .key_len = 1, /* actually variable */ + .key_offset = offsetof(struct pool_desc, pool_name), + .head_offset = offsetof(struct pool_desc, pool_hash), + .hashfn = pool_hashfh, + .obj_cmpfn = pool_cmpfn, + .automatic_shrinking = true, +}; + +static void lov_pool_getref(struct pool_desc *pool) +{ + CDEBUG(D_INFO, "pool %p\n", pool); + atomic_inc(&pool->pool_refcount); +} + +void lov_pool_putref(struct pool_desc *pool) +{ + CDEBUG(D_INFO, "pool %p\n", pool); + if (atomic_dec_and_test(&pool->pool_refcount)) { + LASSERT(list_empty(&pool->pool_list)); + LASSERT(pool->pool_proc_entry == NULL); + lu_tgt_pool_free(&(pool->pool_obds)); + kfree_rcu(pool, pool_rcu); + EXIT; + } +} + +#ifdef CONFIG_PROC_FS +/* + * pool /proc seq_file methods + */ +/* + * iterator is used to go through the target pool entries + * index is the current entry index in the lp_array[] array + * index >= pos returned to the seq_file interface + * pos is from 0 to (pool->pool_obds.op_count - 1) + */ +#define POOL_IT_MAGIC 0xB001CEA0 
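+/*
+ * Illustrative note: reading a pool's proc file walks the iterator below
+ * and prints one member OST UUID per line (see pool_proc_show()).
+ */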
+struct pool_iterator { + int magic; + struct pool_desc *pool; + int idx; /* from 0 to pool_tgt_size - 1 */ +}; + +static void *pool_proc_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct pool_iterator *iter = (struct pool_iterator *)s->private; + int prev_idx; + + LASSERTF(iter->magic == POOL_IT_MAGIC, "%08X\n", iter->magic); + + (*pos)++; + /* test if end of file */ + if (*pos > pool_tgt_count(iter->pool)) + return NULL; + + /* iterate to find a non empty entry */ + prev_idx = iter->idx; + iter->idx++; + if (iter->idx >= pool_tgt_count(iter->pool)) { + iter->idx = prev_idx; /* we stay on the last entry */ + return NULL; + } + /* return != NULL to continue */ + return iter; +} + +static void *pool_proc_start(struct seq_file *s, loff_t *pos) +{ + struct pool_desc *pool = (struct pool_desc *)s->private; + struct pool_iterator *iter; + + lov_pool_getref(pool); + if ((pool_tgt_count(pool) == 0) || + (*pos >= pool_tgt_count(pool))) { + /* iter is not created, so stop() has no way to + * find pool to dec ref */ + lov_pool_putref(pool); + return NULL; + } + + OBD_ALLOC_PTR(iter); + if (!iter) + return ERR_PTR(-ENOMEM); + iter->magic = POOL_IT_MAGIC; + iter->pool = pool; + iter->idx = 0; + + /* we use seq_file private field to memorized iterator so + * we can free it at stop() */ + /* /!\ do not forget to restore it to pool before freeing it */ + s->private = iter; + down_read(&pool_tgt_rw_sem(pool)); + if (*pos > 0) { + loff_t i; + void *ptr; + + i = 0; + do { + ptr = pool_proc_next(s, &iter, &i); + } while ((i < *pos) && (ptr != NULL)); + return ptr; + } + return iter; +} + +static void pool_proc_stop(struct seq_file *s, void *v) +{ + struct pool_iterator *iter = (struct pool_iterator *)s->private; + + /* in some cases stop() method is called 2 times, without + * calling start() method (see seq_read() from fs/seq_file.c) + * we have to free only if s->private is an iterator */ + if ((iter) && (iter->magic == POOL_IT_MAGIC)) { + up_read(&pool_tgt_rw_sem(iter->pool)); + /* we restore s->private so next call to pool_proc_start() + * will work */ + s->private = iter->pool; + lov_pool_putref(iter->pool); + OBD_FREE_PTR(iter); + } +} + +static int pool_proc_show(struct seq_file *s, void *v) +{ + struct pool_iterator *iter = (struct pool_iterator *)v; + struct lov_tgt_desc *tgt; + + LASSERTF(iter->magic == POOL_IT_MAGIC, "%08X\n", iter->magic); + LASSERT(iter->pool != NULL); + LASSERT(iter->idx <= pool_tgt_count(iter->pool)); + + tgt = pool_tgt(iter->pool, iter->idx); + if (tgt) + seq_printf(s, "%s\n", obd_uuid2str(&(tgt->ltd_uuid))); + + return 0; +} + +static const struct seq_operations pool_proc_ops = { + .start = pool_proc_start, + .next = pool_proc_next, + .stop = pool_proc_stop, + .show = pool_proc_show, +}; + +static int pool_proc_open(struct inode *inode, struct file *file) +{ + int rc; + + rc = seq_open(file, &pool_proc_ops); + if (!rc) { + struct seq_file *s = file->private_data; + s->private = pde_data(inode); + } + return rc; +} + +const static struct proc_ops pool_proc_operations = { + .proc_open = pool_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, +}; +#endif /* CONFIG_PROC_FS */ + +static void pools_hash_exit(void *vpool, void *data) +{ + struct pool_desc *pool = vpool; + + lov_pool_putref(pool); +} + +int lov_pool_hash_init(struct rhashtable *tbl) +{ + return rhashtable_init(tbl, &pools_hash_params); +} + +void lov_pool_hash_destroy(struct rhashtable *tbl) +{ + rhashtable_free_and_destroy(tbl, pools_hash_exit, NULL); +} + +int 
lov_pool_new(struct obd_device *obd, char *poolname) +{ + struct lov_obd *lov; + struct pool_desc *new_pool; + int rc; + ENTRY; + + lov = &(obd->u.lov); + + if (strlen(poolname) > LOV_MAXPOOLNAME) + RETURN(-ENAMETOOLONG); + + /* OBD_ALLOC doesn't work with direct use of kfree_rcu */ + new_pool = kmalloc(sizeof(*new_pool), GFP_KERNEL); + if (new_pool == NULL) + RETURN(-ENOMEM); + + strlcpy(new_pool->pool_name, poolname, sizeof(new_pool->pool_name)); + new_pool->pool_lobd = obd; + /* ref count init to 1 because when created a pool is always used + * up to deletion + */ + atomic_set(&new_pool->pool_refcount, 1); + rc = lu_tgt_pool_init(&new_pool->pool_obds, 0); + if (rc) + GOTO(out_free_pool, rc); + +#ifdef CONFIG_PROC_FS + /* get ref for /proc file */ + lov_pool_getref(new_pool); + new_pool->pool_proc_entry = lprocfs_add_simple(lov->lov_pool_proc_entry, + poolname, new_pool, + &pool_proc_operations); + if (IS_ERR(new_pool->pool_proc_entry)) { + CWARN("Cannot add proc pool entry "LOV_POOLNAMEF"\n", poolname); + new_pool->pool_proc_entry = NULL; + lov_pool_putref(new_pool); + } + CDEBUG(D_INFO, "pool %p - proc %p\n", + new_pool, new_pool->pool_proc_entry); +#endif + + spin_lock(&obd->obd_dev_lock); + list_add_tail(&new_pool->pool_list, &lov->lov_pool_list); + lov->lov_pool_count++; + spin_unlock(&obd->obd_dev_lock); + + /* Add to hash table only when it is fully ready. */ + rc = rhashtable_lookup_insert_fast(&lov->lov_pools_hash_body, + &new_pool->pool_hash, + pools_hash_params); + if (rc) { + if (rc != -EEXIST) + /* + * Hide -E2BIG and -EBUSY which + * are not helpful. + */ + rc = -ENOMEM; + GOTO(out_err, rc); + } + + CDEBUG(D_CONFIG, LOV_POOLNAMEF" is pool #%d\n", + poolname, lov->lov_pool_count); + + RETURN(0); + +out_err: + spin_lock(&obd->obd_dev_lock); + list_del_init(&new_pool->pool_list); + lov->lov_pool_count--; + spin_unlock(&obd->obd_dev_lock); + lprocfs_remove(&new_pool->pool_proc_entry); + lu_tgt_pool_free(&new_pool->pool_obds); +out_free_pool: + OBD_FREE_PTR(new_pool); + + return rc; +} + +struct pool_desc *lov_pool_find(struct obd_device *obd, char *poolname) +{ + struct pool_desc *pool; + struct lov_obd *lov = &obd->u.lov; + + rcu_read_lock(); + pool = rhashtable_lookup(&lov->lov_pools_hash_body, + poolname, + pools_hash_params); + if (pool && !atomic_inc_not_zero(&pool->pool_refcount)) + pool = NULL; + rcu_read_unlock(); + + return pool; +} + +int lov_pool_del(struct obd_device *obd, char *poolname) +{ + struct lov_obd *lov; + struct pool_desc *pool; + ENTRY; + + lov = &(obd->u.lov); + + /* lookup and kill hash reference */ + rcu_read_lock(); + pool = rhashtable_lookup(&lov->lov_pools_hash_body, poolname, + pools_hash_params); + if (pool && rhashtable_remove_fast(&lov->lov_pools_hash_body, + &pool->pool_hash, + pools_hash_params) != 0) + pool = NULL; + rcu_read_unlock(); + if (!pool) + RETURN(-ENOENT); + + if (pool->pool_proc_entry != NULL) { + CDEBUG(D_INFO, "proc entry %p\n", pool->pool_proc_entry); + lprocfs_remove(&pool->pool_proc_entry); + lov_pool_putref(pool); + } + + spin_lock(&obd->obd_dev_lock); + list_del_init(&pool->pool_list); + lov->lov_pool_count--; + spin_unlock(&obd->obd_dev_lock); + + /* release last reference */ + lov_pool_putref(pool); + + RETURN(0); +} + + +int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname) +{ + struct obd_uuid ost_uuid; + struct lov_obd *lov; + struct pool_desc *pool; + unsigned int lov_idx; + int rc; + ENTRY; + + lov = &(obd->u.lov); + + rcu_read_lock(); + pool = rhashtable_lookup(&lov->lov_pools_hash_body, poolname, 
+ pools_hash_params); + if (pool && !atomic_inc_not_zero(&pool->pool_refcount)) + pool = NULL; + rcu_read_unlock(); + if (!pool) + RETURN(-ENOENT); + + obd_str2uuid(&ost_uuid, ostname); + + + /* search ost in lov array */ + lov_tgts_getref(obd); + for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) { + if (!lov->lov_tgts[lov_idx]) + continue; + if (obd_uuid_equals(&ost_uuid, + &(lov->lov_tgts[lov_idx]->ltd_uuid))) + break; + } + /* test if ost found in lov */ + if (lov_idx == lov->desc.ld_tgt_count) + GOTO(out, rc = -EINVAL); + + rc = lu_tgt_pool_add(&pool->pool_obds, lov_idx, lov->lov_tgt_size); + if (rc) + GOTO(out, rc); + + CDEBUG(D_CONFIG, "Added %s to "LOV_POOLNAMEF" as member %d\n", + ostname, poolname, pool_tgt_count(pool)); + + EXIT; +out: + lov_tgts_putref(obd); + lov_pool_putref(pool); + + return rc; +} + +int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname) +{ + struct obd_uuid ost_uuid; + struct lov_obd *lov; + struct pool_desc *pool; + unsigned int lov_idx; + int rc = 0; + ENTRY; + + lov = &(obd->u.lov); + + /* lookup and kill hash reference */ + rcu_read_lock(); + pool = rhashtable_lookup(&lov->lov_pools_hash_body, poolname, + pools_hash_params); + if (pool && !atomic_inc_not_zero(&pool->pool_refcount)) + pool = NULL; + rcu_read_unlock(); + if (!pool) + RETURN(-ENOENT); + + obd_str2uuid(&ost_uuid, ostname); + + lov_tgts_getref(obd); + /* search ost in lov array, to get index */ + for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) { + if (!lov->lov_tgts[lov_idx]) + continue; + + if (obd_uuid_equals(&ost_uuid, + &(lov->lov_tgts[lov_idx]->ltd_uuid))) + break; + } + + /* test if ost found in lov */ + if (lov_idx == lov->desc.ld_tgt_count) + GOTO(out, rc = -EINVAL); + + lu_tgt_pool_remove(&pool->pool_obds, lov_idx); + + CDEBUG(D_CONFIG, "%s removed from "LOV_POOLNAMEF"\n", ostname, + poolname); + + EXIT; +out: + lov_tgts_putref(obd); + lov_pool_putref(pool); + + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_request.c b/drivers/staging/lustrefsx/lustre/lov/lov_request.c new file mode 100644 index 0000000000000..4994011a7895b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_request.c @@ -0,0 +1,392 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. 
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include <linux/delay.h>
+#include <libcfs/libcfs.h>
+
+#include <obd_class.h>
+#include "lov_internal.h"
+
+static void lov_init_set(struct lov_request_set *set)
+{
+	set->set_count = 0;
+	atomic_set(&set->set_completes, 0);
+	atomic_set(&set->set_success, 0);
+	INIT_LIST_HEAD(&set->set_list);
+}
+
+static void lov_finish_set(struct lov_request_set *set)
+{
+	struct list_head *pos, *n;
+	struct lov_request *req;
+
+	ENTRY;
+
+	LASSERT(set != NULL);
+	list_for_each_safe(pos, n, &set->set_list) {
+		req = list_entry(pos, struct lov_request, rq_link);
+		list_del_init(&req->rq_link);
+
+		if (req->rq_oi.oi_osfs)
+			OBD_FREE_PTR(req->rq_oi.oi_osfs);
+
+		OBD_FREE_PTR(req);
+	}
+
+	OBD_FREE_PTR(set);
+	EXIT;
+}
+
+static void
+lov_update_set(struct lov_request_set *set, struct lov_request *req, int rc)
+{
+	atomic_inc(&set->set_completes);
+	if (rc == 0)
+		atomic_inc(&set->set_success);
+}
+
+static void
+lov_set_add_req(struct lov_request *req, struct lov_request_set *set)
+{
+	list_add_tail(&req->rq_link, &set->set_list);
+	set->set_count++;
+	req->rq_rqset = set;
+}
+
+static int lov_check_set(struct lov_obd *lov, int idx)
+{
+	int rc = 0;
+
+	mutex_lock(&lov->lov_lock);
+
+	if (!lov->lov_tgts[idx] || lov->lov_tgts[idx]->ltd_active ||
+	    (lov->lov_tgts[idx]->ltd_exp &&
+	     class_exp2cliimp(lov->lov_tgts[idx]->ltd_exp)->imp_connect_tried))
+		rc = 1;
+
+	mutex_unlock(&lov->lov_lock);
+	return rc;
+}
+
+/*
+ * Check if the OSC connection exists and is active.
+ * If the OSC has not yet had a chance to connect to the OST the first time,
+ * wait once for it to connect instead of returning an error.
+ */
+static int lov_check_and_wait_active(struct lov_obd *lov, int ost_idx)
+{
+	struct lov_tgt_desc *tgt;
+	struct obd_import *imp = NULL;
+	int rc = 0;
+	int cnt;
+
+	mutex_lock(&lov->lov_lock);
+
+	tgt = lov->lov_tgts[ost_idx];
+
+	if (unlikely(!tgt))
+		GOTO(out, rc = 0);
+
+	if (likely(tgt->ltd_active))
+		GOTO(out, rc = 1);
+
+	if (tgt->ltd_exp)
+		imp = class_exp2cliimp(tgt->ltd_exp);
+	if (imp && imp->imp_connect_tried)
+		GOTO(out, rc = 0);
+	if (imp && imp->imp_state == LUSTRE_IMP_IDLE)
+		GOTO(out, rc = 0);
+
+	mutex_unlock(&lov->lov_lock);
+
+	cnt = obd_timeout;
+	while (cnt > 0 &&
+	       !lov_check_set(lov, ost_idx)) {
+		ssleep(1);
+		cnt -= 1;
+	}
+	if (tgt->ltd_active)
+		return 1;
+
+	return 0;
+
+out:
+	mutex_unlock(&lov->lov_lock);
+	return rc;
+}
+
+#define LOV_U64_MAX ((__u64)~0ULL)
+#define LOV_SUM_MAX(tot, add)					\
+	do {							\
+		if ((tot) + (add) < (tot))			\
+			(tot) = LOV_U64_MAX;			\
+		else						\
+			(tot) += (add);				\
+	} while (0)
+
+static int
+lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs, int success)
+{
+	ENTRY;
+
+	if (success) {
+		__u32 expected_stripes = lov_get_stripe_count(&obd->u.lov,
+							      LOV_MAGIC, 0);
+		if (osfs->os_files != LOV_U64_MAX)
+			lov_do_div64(osfs->os_files, expected_stripes);
+		if (osfs->os_ffree != LOV_U64_MAX)
+			lov_do_div64(osfs->os_ffree, expected_stripes);
+
+		spin_lock(&obd->obd_osfs_lock);
+		memcpy(&obd->obd_osfs, osfs, sizeof(*osfs));
+		obd->obd_osfs_age = ktime_get_seconds();
+		spin_unlock(&obd->obd_osfs_lock);
+		RETURN(0);
+	}
+
+	RETURN(-EIO);
+}
+
+int lov_fini_statfs_set(struct lov_request_set *set)
+{
+	int rc = 0;
+	ENTRY;
+
+	if (!set)
+		RETURN(0);
+
+	if (atomic_read(&set->set_completes)) {
+		rc = lov_fini_statfs(set->set_obd, set->set_oi->oi_osfs,
+				     atomic_read(&set->set_success));
+	}
+
+	lov_finish_set(set);
+
+	RETURN(rc);
+}
+
+static void
+lov_update_statfs(struct obd_statfs *osfs, struct obd_statfs *lov_sfs,
+		  int success)
+{
+	int shift = 0, quit = 0;
+	__u64 tmp;
+
+	if (success == 0) {
+		memcpy(osfs, lov_sfs, sizeof(*lov_sfs));
+	} else {
+		if (osfs->os_bsize != lov_sfs->os_bsize) {
+			/* assume all block sizes are always powers of 2 */
+			/* get the bits difference */
+			tmp = osfs->os_bsize | lov_sfs->os_bsize;
+			for (shift = 0; shift <= 64; ++shift) {
+				if (tmp & 1) {
+					if (quit)
+						break;
+					quit = 1;
+					shift = 0;
+				}
+				tmp >>= 1;
+			}
+		}
+
+		if (osfs->os_bsize < lov_sfs->os_bsize) {
+			osfs->os_bsize = lov_sfs->os_bsize;
+
+			osfs->os_bfree >>= shift;
+			osfs->os_bavail >>= shift;
+			osfs->os_blocks >>= shift;
+		} else if (shift != 0) {
+			lov_sfs->os_bfree >>= shift;
+			lov_sfs->os_bavail >>= shift;
+			lov_sfs->os_blocks >>= shift;
+		}
+#ifdef MIN_DF
+		/*
+		 * Sandia requested that df (and so, statfs) only
+		 * returned minimal available space on
+		 * a single OST, so people would be able to
+		 * write this much data guaranteed.
+		 */
+		if (osfs->os_bavail > lov_sfs->os_bavail) {
+			/*
+			 * Presumably if new bavail is smaller,
+			 * new bfree is bigger as well
+			 */
+			osfs->os_bfree = lov_sfs->os_bfree;
+			osfs->os_bavail = lov_sfs->os_bavail;
+		}
+#else
+		osfs->os_bfree += lov_sfs->os_bfree;
+		osfs->os_bavail += lov_sfs->os_bavail;
+#endif
+		osfs->os_blocks += lov_sfs->os_blocks;
+		/*
+		 * XXX not sure about this one - depends on policy.
+		 * - could be minimum if we always stripe on all OBDs
+		 *   (but that would be wrong for any other policy,
+		 *   if one of the OBDs has no more objects left)
+		 * - could be sum if we stripe whole objects
+		 * - could be average, just to give a nice number
+		 *
+		 * To give a "reasonable" (if not wholly accurate)
+		 * number, we divide the total number of free objects
+		 * by expected stripe count (watch out for overflow).
+		 */
+		LOV_SUM_MAX(osfs->os_files, lov_sfs->os_files);
+		LOV_SUM_MAX(osfs->os_ffree, lov_sfs->os_ffree);
+	}
+}
+
+/*
+ * The callback for osc_statfs_async that finalizes the request info when a
+ * response is received.
+ */
+static int cb_statfs_update(void *cookie, int rc)
+{
+	struct obd_info *oinfo = cookie;
+	struct lov_request *lovreq;
+	struct lov_request_set *set;
+	struct obd_statfs *osfs, *lov_sfs;
+	struct lov_obd *lov;
+	struct lov_tgt_desc *tgt;
+	struct obd_device *lovobd, *tgtobd;
+	int success;
+
+	ENTRY;
+
+	lovreq = container_of(oinfo, struct lov_request, rq_oi);
+	set = lovreq->rq_rqset;
+	lovobd = set->set_obd;
+	lov = &lovobd->u.lov;
+	osfs = set->set_oi->oi_osfs;
+	lov_sfs = oinfo->oi_osfs;
+	success = atomic_read(&set->set_success);
+	/*
+	 * XXX: the same is done in lov_update_common_set, however
+	 * lovset->set_exp is not initialized.
+	 */
+	lov_update_set(set, lovreq, rc);
+	if (rc)
+		GOTO(out, rc);
+
+	lov_tgts_getref(lovobd);
+	tgt = lov->lov_tgts[lovreq->rq_idx];
+	if (!tgt || !tgt->ltd_active)
+		GOTO(out_update, rc);
+
+	tgtobd = class_exp2obd(tgt->ltd_exp);
+	spin_lock(&tgtobd->obd_osfs_lock);
+	memcpy(&tgtobd->obd_osfs, lov_sfs, sizeof(*lov_sfs));
+	if ((oinfo->oi_flags & OBD_STATFS_FROM_CACHE) == 0)
+		tgtobd->obd_osfs_age = ktime_get_seconds();
+	spin_unlock(&tgtobd->obd_osfs_lock);
+
+out_update:
+	lov_update_statfs(osfs, lov_sfs, success);
+	lov_tgts_putref(lovobd);
+out:
+	RETURN(0);
+}
+
+int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo,
+			struct lov_request_set **reqset)
+{
+	struct lov_request_set *set;
+	struct lov_obd *lov = &obd->u.lov;
+	int rc = 0, i;
+
+	ENTRY;
+
+	OBD_ALLOC(set, sizeof(*set));
+	if (!set)
+		RETURN(-ENOMEM);
+	lov_init_set(set);
+
+	set->set_obd = obd;
+	set->set_oi = oinfo;
+
+	/* We only get block data from the OBD */
+	for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+		struct lov_tgt_desc *ltd = lov->lov_tgts[i];
+		struct lov_request *req;
+
+		if (!ltd) {
+			CDEBUG(D_HA, "lov idx %d inactive\n", i);
+			continue;
+		}
+
+		/*
+		 * skip targets that have been explicitly disabled by the
+		 * administrator
+		 */
+		if (!ltd->ltd_exp) {
+			CDEBUG(D_HA, "lov idx %d administratively disabled\n",
+			       i);
+			continue;
+		}
+
+		if (oinfo->oi_flags & OBD_STATFS_NODELAY &&
+		    class_exp2cliimp(ltd->ltd_exp)->imp_state !=
+		    LUSTRE_IMP_IDLE && !ltd->ltd_active) {
+			CDEBUG(D_HA, "lov idx %d inactive\n", i);
+			continue;
+		}
+
+		if (!ltd->ltd_active)
+			lov_check_and_wait_active(lov, i);
+
+		OBD_ALLOC(req, sizeof(*req));
+		if (!req)
+			GOTO(out_set, rc = -ENOMEM);
+
+		OBD_ALLOC(req->rq_oi.oi_osfs, sizeof(*req->rq_oi.oi_osfs));
+		if (!req->rq_oi.oi_osfs) {
+			OBD_FREE(req, sizeof(*req));
+			GOTO(out_set, rc = -ENOMEM);
+		}
+
+		req->rq_idx = i;
+		req->rq_oi.oi_cb_up = cb_statfs_update;
+		req->rq_oi.oi_flags = oinfo->oi_flags;
+
+		lov_set_add_req(req, set);
+	}
+	if (!set->set_count)
+		GOTO(out_set, rc = -EIO);
+	*reqset = set;
+	RETURN(rc);
+out_set:
+	lov_fini_statfs_set(set);
+	RETURN(rc);
+}
diff --git a/drivers/staging/lustrefsx/lustre/lov/lovsub_dev.c b/drivers/staging/lustrefsx/lustre/lov/lovsub_dev.c
new file mode 100644
index 0000000000000..4f2640bc7c530
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/lov/lovsub_dev.c
@@ -0,0 +1,145 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2013, 2015, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * Implementation of cl_device and cl_device_type for LOVSUB layer.
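+ *
+ * A lovsub device sits directly below the lov device in the client cl
+ * stack: each stripe of a striped file is backed by a lovsub object that
+ * links the top-level lov object to its osc sub-object.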
+ * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lov-sub device and device type functions. + * + */ + +static int lovsub_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + struct lovsub_device *lsd = lu2lovsub_dev(d); + struct lu_device_type *ldt; + int rc; + + ENTRY; + next->ld_site = d->ld_site; + ldt = next->ld_type; + LASSERT(ldt != NULL); + rc = ldt->ldt_ops->ldto_device_init(env, next, ldt->ldt_name, NULL); + if (rc) { + next->ld_site = NULL; + RETURN(rc); + } + + lu_device_get(next); + lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init); + lsd->acid_next = lu2cl_dev(next); + RETURN(rc); +} + +static struct lu_device *lovsub_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + struct lu_device *next; + struct lovsub_device *lsd; + + ENTRY; + lsd = lu2lovsub_dev(d); + next = cl2lu_dev(lsd->acid_next); + lsd->acid_next = NULL; + RETURN(next); +} + +static struct lu_device *lovsub_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct lovsub_device *lsd = lu2lovsub_dev(d); + struct lu_device *next = cl2lu_dev(lsd->acid_next); + + lu_site_print(env, d->ld_site, &d->ld_ref, D_ERROR, lu_cdebug_printer); + cl_device_fini(lu2cl_dev(d)); + OBD_FREE_PTR(lsd); + return next; +} + +static const struct lu_device_operations lovsub_lu_ops = { + .ldo_object_alloc = lovsub_object_alloc, + .ldo_process_config = NULL, + .ldo_recovery_complete = NULL +}; + +static struct lu_device *lovsub_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct lu_device *d; + struct lovsub_device *lsd; + + OBD_ALLOC_PTR(lsd); + if (lsd) { + int result; + + result = cl_device_init(&lsd->acid_cl, t); + if (result == 0) { + d = lovsub2lu_dev(lsd); + d->ld_ops = &lovsub_lu_ops; + } else + d = ERR_PTR(result); + } else + d = ERR_PTR(-ENOMEM); + return d; +} + +static const struct lu_device_type_operations lovsub_device_type_ops = { + .ldto_device_alloc = lovsub_device_alloc, + .ldto_device_free = lovsub_device_free, + + .ldto_device_init = lovsub_device_init, + .ldto_device_fini = lovsub_device_fini +}; + +#define LUSTRE_LOVSUB_NAME "lovsub" + +struct lu_device_type lovsub_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_LOVSUB_NAME, + .ldt_ops = &lovsub_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; + + +/** @} lov */ + diff --git a/drivers/staging/lustrefsx/lustre/lov/lovsub_object.c b/drivers/staging/lustrefsx/lustre/lov/lovsub_object.c new file mode 100644 index 0000000000000..cd239733270ef --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lovsub_object.c @@ -0,0 +1,202 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Implementation of cl_object for LOVSUB layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lovsub object operations. + * + */ + +static int lovsub_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct lovsub_device *dev = lu2lovsub_dev(obj->lo_dev); + struct lu_object *below; + struct lu_device *under; + int result; + + ENTRY; + under = &dev->acid_next->cd_lu_dev; + below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, under); + if (below) { + lu_object_add(obj, below); + cl_object_page_init(lu2cl(obj), 0); + result = 0; + } else + result = -ENOMEM; + RETURN(result); + +} + +static void lovsub_object_free_rcu(struct rcu_head *head) +{ + struct lovsub_object *los = container_of(head, struct lovsub_object, + lso_header.coh_lu.loh_rcu); + + kmem_cache_free(lovsub_object_kmem, los); +} + +static void lovsub_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct lovsub_object *los = lu2lovsub(obj); + struct lov_object *lov = los->lso_super; + + ENTRY; + + /* + * We can't assume lov was assigned here, because of the shadow + * object handling in lu_object_find. + */ + if (lov) { + int index = lov_comp_entry(los->lso_index); + int stripe = lov_comp_stripe(los->lso_index); + struct lov_layout_raid0 *r0 = lov_r0(lov, index); + + LASSERT(lov->lo_type == LLT_COMP); + LASSERT(r0->lo_sub[stripe] == los); + spin_lock(&r0->lo_sub_lock); + r0->lo_sub[stripe] = NULL; + spin_unlock(&r0->lo_sub_lock); + } + + lu_object_fini(obj); + lu_object_header_fini(&los->lso_header.coh_lu); + OBD_FREE_PRE(los, sizeof(*los), "slab-freed"); + call_rcu(&los->lso_header.coh_lu.loh_rcu, lovsub_object_free_rcu); + EXIT; +} + +static int lovsub_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *obj) +{ + struct lovsub_object *los = lu2lovsub(obj); + + return (*p)(env, cookie, "[%d]", los->lso_index); +} + +static int lovsub_attr_update(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) +{ + struct lovsub_object *los = cl2lovsub(obj); + struct lov_object *lov = cl2lovsub(obj)->lso_super; + + ENTRY; + lov_r0(lov, lov_comp_entry(los->lso_index))->lo_attr_valid = 0; + RETURN(0); +} + +static int lovsub_object_glimpse(const struct lu_env *env, + const struct cl_object *obj, + struct ost_lvb *lvb) +{ + struct lovsub_object *los = cl2lovsub(obj); + + ENTRY; + RETURN(cl_object_glimpse(env, &los->lso_super->lo_cl, lvb)); +} + +/** + * Implementation of struct cl_object_operations::coo_req_attr_set() for lovsub + * layer. Lov and lovsub are responsible only for struct obdo::o_stripe_idx + * field, which is filled there. 
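+ * Generic attributes are obtained by delegating to the parent lov object
+ * via cl_req_attr_set(); only the stripe index and layout information are
+ * added at this level.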
+ */ +static void lovsub_req_attr_set(const struct lu_env *env, struct cl_object *obj, + struct cl_req_attr *attr) +{ + struct lovsub_object *subobj = cl2lovsub(obj); + struct lov_stripe_md *lsm = subobj->lso_super->lo_lsm; + + ENTRY; + cl_req_attr_set(env, &subobj->lso_super->lo_cl, attr); + + /* + * There is no OBD_MD_* flag for obdo::o_stripe_idx, so set it + * unconditionally. It never changes anyway. + */ + attr->cra_oa->o_stripe_idx = lov_comp_stripe(subobj->lso_index); + lov_lsm2layout(lsm, lsm->lsm_entries[lov_comp_entry(subobj->lso_index)], + &attr->cra_oa->o_layout); + attr->cra_oa->o_valid |= OBD_MD_FLOSTLAYOUT; + EXIT; +} + +static const struct cl_object_operations lovsub_ops = { + .coo_attr_update = lovsub_attr_update, + .coo_glimpse = lovsub_object_glimpse, + .coo_req_attr_set = lovsub_req_attr_set +}; + +static const struct lu_object_operations lovsub_lu_obj_ops = { + .loo_object_init = lovsub_object_init, + .loo_object_delete = NULL, + .loo_object_release = NULL, + .loo_object_free = lovsub_object_free, + .loo_object_print = lovsub_object_print, + .loo_object_invariant = NULL +}; + +struct lu_object *lovsub_object_alloc(const struct lu_env *env, + const struct lu_object_header *unused, + struct lu_device *dev) +{ + struct lovsub_object *los; + struct lu_object *obj; + + ENTRY; + OBD_SLAB_ALLOC_PTR_GFP(los, lovsub_object_kmem, GFP_NOFS); + if (los) { + struct cl_object_header *hdr; + + obj = lovsub2lu(los); + hdr = &los->lso_header; + cl_object_header_init(hdr); + lu_object_init(obj, &hdr->coh_lu, dev); + lu_object_add_top(&hdr->coh_lu, obj); + los->lso_cl.co_ops = &lovsub_ops; + obj->lo_ops = &lovsub_lu_obj_ops; + } else + obj = NULL; + RETURN(obj); +} + +/** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c b/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c new file mode 100644 index 0000000000000..ac7358100a3e4 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c @@ -0,0 +1,310 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. 
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ */
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/version.h>
+#include <asm/statfs.h>
+#include <lprocfs_status.h>
+#include <obd_class.h>
+#include <linux/seq_file.h>
+#include "lov_internal.h"
+
+static ssize_t stripesize_show(struct kobject *kobj, struct attribute *attr,
+			       char *buf)
+{
+	struct obd_device *obd = container_of(kobj, struct obd_device,
+					      obd_kset.kobj);
+	struct lov_desc *desc = &obd->u.lov.desc;
+
+	return scnprintf(buf, PAGE_SIZE, "%llu\n", desc->ld_default_stripe_size);
+}
+
+static ssize_t stripesize_store(struct kobject *kobj, struct attribute *attr,
+				const char *buf, size_t count)
+{
+	struct obd_device *obd = container_of(kobj, struct obd_device,
+					      obd_kset.kobj);
+	struct lov_desc *desc = &obd->u.lov.desc;
+	u64 val;
+	int rc;
+
+	rc = sysfs_memparse(buf, count, &val, "B");
+	if (rc < 0)
+		return rc;
+
+	lov_fix_desc_stripe_size(&val);
+	desc->ld_default_stripe_size = val;
+
+	return count;
+}
+LUSTRE_RW_ATTR(stripesize);
+
+static ssize_t stripeoffset_show(struct kobject *kobj, struct attribute *attr,
+				 char *buf)
+{
+	struct obd_device *obd = container_of(kobj, struct obd_device,
+					      obd_kset.kobj);
+	struct lov_desc *desc = &obd->u.lov.desc;
+
+	return sprintf(buf, "%lld\n", desc->ld_default_stripe_offset);
+}
+
+static ssize_t stripeoffset_store(struct kobject *kobj, struct attribute *attr,
+				  const char *buf, size_t count)
+{
+	struct obd_device *obd = container_of(kobj, struct obd_device,
+					      obd_kset.kobj);
+	struct lov_desc *desc = &obd->u.lov.desc;
+	long val;
+	int rc;
+
+	rc = kstrtol(buf, 0, &val);
+	if (rc)
+		return rc;
+	if (val < -1 || val > LOV_MAX_STRIPE_COUNT)
+		return -ERANGE;
+
+	desc->ld_default_stripe_offset = val;
+
+	return count;
+}
+LUSTRE_RW_ATTR(stripeoffset);
+
+static ssize_t stripetype_show(struct kobject *kobj, struct attribute *attr,
+			       char *buf)
+{
+	struct obd_device *obd = container_of(kobj, struct obd_device,
+					      obd_kset.kobj);
+	struct lov_desc *desc = &obd->u.lov.desc;
+
+	return sprintf(buf, "%u\n", desc->ld_pattern);
+}
+
+static ssize_t stripetype_store(struct kobject *kobj, struct attribute *attr,
+				const char *buffer, size_t count)
+{
+	struct obd_device *obd = container_of(kobj, struct obd_device,
+					      obd_kset.kobj);
+	struct lov_desc *desc = &obd->u.lov.desc;
+	u32 pattern;
+	int rc;
+
+	rc = kstrtouint(buffer, 0, &pattern);
+	if (rc)
+		return rc;
+
+	lov_fix_desc_pattern(&pattern);
+	desc->ld_pattern = pattern;
+
+	return count;
+}
+LUSTRE_RW_ATTR(stripetype);
+
+static ssize_t stripecount_show(struct kobject *kobj, struct attribute *attr,
+				char *buf)
+{
+	struct obd_device *obd = container_of(kobj, struct obd_device,
+					      obd_kset.kobj);
+	struct lov_desc *desc = &obd->u.lov.desc;
+
+	return sprintf(buf, "%d\n",
+		       (__s16)(desc->ld_default_stripe_count + 1) - 1);
+}
+
+static ssize_t stripecount_store(struct kobject *kobj, struct attribute *attr,
+				 const char *buffer, size_t count)
+{
+	struct obd_device *obd = container_of(kobj, struct obd_device,
+					      obd_kset.kobj);
+	struct lov_desc *desc = &obd->u.lov.desc;
+	int stripe_count;
+	int rc;
+
+	rc = kstrtoint(buffer, 0, &stripe_count);
+	if (rc)
+		return rc;
+
+	if (stripe_count < -1)
+		return -ERANGE;
+
+	lov_fix_desc_stripe_count(&stripe_count);
+	desc->ld_default_stripe_count = stripe_count;
+
+	return count;
+}
+LUSTRE_RW_ATTR(stripecount);
+
+static ssize_t numobd_show(struct kobject *kobj, struct attribute *attr,
+			   char *buf)
+{
+	struct obd_device *obd = container_of(kobj, struct obd_device,
+					      obd_kset.kobj);
+	struct lov_desc *desc = &obd->u.lov.desc;
+
+	return sprintf(buf, "%u\n", desc->ld_tgt_count);
+}
+LUSTRE_RO_ATTR(numobd);
+
+static ssize_t activeobd_show(struct kobject *kobj, struct attribute *attr,
+			      char *buf)
+{
+	struct obd_device *obd = container_of(kobj, struct obd_device,
+					      obd_kset.kobj);
+	struct lov_desc *desc = &obd->u.lov.desc;
+
+	return sprintf(buf, "%u\n", desc->ld_active_tgt_count);
+}
+LUSTRE_RO_ATTR(activeobd);
+
+static ssize_t desc_uuid_show(struct kobject *kobj, struct attribute *attr,
+			      char *buf)
+{
+	struct obd_device *obd = container_of(kobj, struct obd_device,
+					      obd_kset.kobj);
+	struct lov_desc *desc = &obd->u.lov.desc;
+
+	return sprintf(buf, "%s\n", desc->ld_uuid.uuid);
+}
+LUSTRE_RO_ATTR(desc_uuid);
+
+#ifdef CONFIG_PROC_FS
+static void *lov_tgt_seq_start(struct seq_file *p, loff_t *pos)
+{
+	struct obd_device *obd = p->private;
+	struct lov_obd *lov = &obd->u.lov;
+
+	while (*pos < lov->desc.ld_tgt_count) {
+		if (lov->lov_tgts[*pos])
+			return lov->lov_tgts[*pos];
+		++*pos;
+	}
+	return NULL;
+}
+
+static void lov_tgt_seq_stop(struct seq_file *p, void *v)
+{
+}
+
+static void *lov_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos)
+{
+	struct obd_device *obd = p->private;
+	struct lov_obd *lov = &obd->u.lov;
+
+	while (++*pos < lov->desc.ld_tgt_count) {
+		if (lov->lov_tgts[*pos])
+			return lov->lov_tgts[*pos];
+	}
+	return NULL;
+}
+
+static int lov_tgt_seq_show(struct seq_file *p, void *v)
+{
+	struct lov_tgt_desc *tgt = v;
+
+	seq_printf(p, "%d: %s %sACTIVE\n", tgt->ltd_index,
+		   obd_uuid2str(&tgt->ltd_uuid),
+		   tgt->ltd_active ? "" : "IN");
+	return 0;
+}
+
+static const struct seq_operations lov_tgt_sops = {
+	.start = lov_tgt_seq_start,
+	.stop = lov_tgt_seq_stop,
+	.next = lov_tgt_seq_next,
+	.show = lov_tgt_seq_show,
+};
+
+static int lov_target_seq_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	int rc;
+
+	rc = seq_open(file, &lov_tgt_sops);
+	if (rc)
+		return rc;
+
+	seq = file->private_data;
+	seq->private = pde_data(inode);
+	return 0;
+}
+
+static const struct proc_ops lov_proc_target_fops = {
+	PROC_OWNER(THIS_MODULE)
+	.proc_open = lov_target_seq_open,
+	.proc_read = seq_read,
+	.proc_lseek = seq_lseek,
+	.proc_release = lprocfs_seq_release,
+};
+#endif /* CONFIG_PROC_FS */
+
+static struct attribute *lov_attrs[] = {
+	&lustre_attr_activeobd.attr,
+	&lustre_attr_numobd.attr,
+	&lustre_attr_desc_uuid.attr,
+	&lustre_attr_stripesize.attr,
+	&lustre_attr_stripeoffset.attr,
+	&lustre_attr_stripetype.attr,
+	&lustre_attr_stripecount.attr,
+	NULL,
+};
+
+KOBJ_ATTRIBUTE_GROUPS(lov); /* creates lov_groups */
+
+int lov_tunables_init(struct obd_device *obd)
+{
+	struct lov_obd *lov = &obd->u.lov;
+	int rc;
+
+	obd->obd_ktype.default_groups = KOBJ_ATTR_GROUPS(lov);
+	rc = lprocfs_obd_setup(obd, false);
+	if (rc)
+		GOTO(out, rc);
+
+#ifdef CONFIG_PROC_FS
+	rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd", 0444,
+				&lov_proc_target_fops, obd);
+	if (rc)
+		CWARN("%s: Error adding the target_obd file : rc %d\n",
+		      obd->obd_name, rc);
+
+	lov->lov_pool_proc_entry = lprocfs_register("pools",
+						    obd->obd_proc_entry,
+						    NULL, NULL);
+	if (IS_ERR(lov->lov_pool_proc_entry)) {
+		rc = PTR_ERR(lov->lov_pool_proc_entry);
+		CERROR("%s: error setting up debugfs for pools : rc %d\n",
+		       obd->obd_name, rc);
+		lov->lov_pool_proc_entry = NULL;
+	}
+#endif /* CONFIG_PROC_FS */
+out:
+	return rc;
+}
diff --git a/drivers/staging/lustrefsx/lustre/mdc/lproc_mdc.c b/drivers/staging/lustrefsx/lustre/mdc/lproc_mdc.c
new file mode 100644
index 0000000000000..91d9aade97d96
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/mdc/lproc_mdc.c
@@ -0,0 +1,795 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ */
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/version.h>
+#include <linux/vfs.h>
+#include <obd_class.h>
+#include <obd_cksum.h>
+#include <lprocfs_status.h>
+#include <lustre_osc.h>
+#include "mdc_internal.h"
+
+static ssize_t active_show(struct kobject *kobj, struct attribute *attr,
+			   char *buf)
+{
+	struct obd_device *obd = container_of(kobj, struct obd_device,
+					      obd_kset.kobj);
+	struct obd_import *imp;
+	ssize_t len;
+
+	with_imp_locked(obd, imp, len)
+		len = sprintf(buf, "%d\n", !imp->imp_deactive);
+	return len;
+}
+
+static ssize_t active_store(struct kobject *kobj, struct attribute *attr,
+			    const char *buffer, size_t count)
+{
+	struct obd_device *obd = container_of(kobj, struct obd_device,
+					      obd_kset.kobj);
+	struct obd_import *imp, *imp0;
+	bool val;
+	int rc;
+
+	rc = kstrtobool(buffer, &val);
+	if (rc)
+		return rc;
+
+	with_imp_locked(obd, imp0, rc)
+		imp = class_import_get(imp0);
+	if (rc)
+		return rc;
+	/* opposite senses */
+	if (imp->imp_deactive == val)
+		rc = ptlrpc_set_import_active(imp, val);
+	else
+		CDEBUG(D_CONFIG, "activate %u: ignoring repeat request\n",
+		       val);
+	class_import_put(imp);
+	return rc ?: count;
+}
+LUSTRE_RW_ATTR(active);
+
+static ssize_t max_rpcs_in_flight_show(struct kobject *kobj,
+				       struct attribute *attr,
+				       char *buf)
+{
+	struct obd_device *obd = container_of(kobj, struct obd_device,
+					      obd_kset.kobj);
+	ssize_t len;
+	u32 max;
+
+	max = obd_get_max_rpcs_in_flight(&obd->u.cli);
+	len = sprintf(buf, "%u\n", max);
+
+	return len;
+}
+
+static ssize_t max_rpcs_in_flight_store(struct kobject *kobj,
+					struct attribute *attr,
+					const char *buffer,
+					size_t count)
+{
+	struct obd_device *obd = container_of(kobj, struct obd_device,
+					      obd_kset.kobj);
+	unsigned int val;
+	int rc;
+
+	rc = kstrtouint(buffer, 10, &val);
+	if (rc)
+		return rc;
+
+	rc = obd_set_max_rpcs_in_flight(&obd->u.cli, val);
+	if (rc)
+		count = rc;
+
+	return count;
+}
+LUSTRE_RW_ATTR(max_rpcs_in_flight);
+
+static ssize_t max_mod_rpcs_in_flight_show(struct kobject *kobj,
+					   struct attribute *attr,
+					   char *buf)
+{
+	struct obd_device *obd = container_of(kobj, struct obd_device,
+					      obd_kset.kobj);
+	u16 max;
+
+	max = obd_get_max_mod_rpcs_in_flight(&obd->u.cli);
+	return sprintf(buf, "%hu\n", max);
+}
+
+static ssize_t max_mod_rpcs_in_flight_store(struct kobject *kobj,
+					    struct attribute *attr,
+					    const char *buffer,
+					    size_t count)
+{
+	struct obd_device *obd = container_of(kobj, struct obd_device,
+					      obd_kset.kobj);
+	u16 val;
+	int rc;
+
+	rc =
kstrtou16(buffer, 10, &val); + if (rc) + return rc; + + rc = obd_set_max_mod_rpcs_in_flight(&obd->u.cli, val); + if (rc) + count = rc; + + return count; +} +LUSTRE_RW_ATTR(max_mod_rpcs_in_flight); + +static int mdc_max_dirty_mb_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + struct client_obd *cli = &obd->u.cli; + + seq_printf(m, "%lu\n", PAGES_TO_MiB(cli->cl_dirty_max_pages)); + return 0; +} + +static ssize_t mdc_max_dirty_mb_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *sfl = file->private_data; + struct obd_device *obd = sfl->private; + struct client_obd *cli = &obd->u.cli; + char kernbuf[22] = ""; + u64 pages_number; + int rc; + + if (count >= sizeof(kernbuf)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + kernbuf[count] = 0; + + rc = sysfs_memparse(kernbuf, count, &pages_number, "MiB"); + if (rc < 0) + return rc; + + /* MB -> pages */ + pages_number = round_up(pages_number, 1024 * 1024) >> PAGE_SHIFT; + if (pages_number <= 0 || + pages_number >= MiB_TO_PAGES(OSC_MAX_DIRTY_MB_MAX) || + pages_number > cfs_totalram_pages() / 4) /* 1/4 of RAM */ + return -ERANGE; + + spin_lock(&cli->cl_loi_list_lock); + cli->cl_dirty_max_pages = pages_number; + osc_wake_cache_waiters(cli); + spin_unlock(&cli->cl_loi_list_lock); + + return count; +} +LPROC_SEQ_FOPS(mdc_max_dirty_mb); + +DECLARE_CKSUM_NAME; + +static int mdc_checksum_type_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + int i; + + if (obd == NULL) + return 0; + + for (i = 0; i < ARRAY_SIZE(cksum_name); i++) { + if ((BIT(i) & obd->u.cli.cl_supp_cksum_types) == 0) + continue; + if (obd->u.cli.cl_cksum_type == BIT(i)) + seq_printf(m, "[%s] ", cksum_name[i]); + else + seq_printf(m, "%s ", cksum_name[i]); + } + seq_puts(m, "\n"); + + return 0; +} + +static ssize_t mdc_checksum_type_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + char kernbuf[10]; + int rc = -EINVAL; + int i; + + if (obd == NULL) + return 0; + + if (count > sizeof(kernbuf) - 1) + return -EINVAL; + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + + if (count > 0 && kernbuf[count - 1] == '\n') + kernbuf[count - 1] = '\0'; + else + kernbuf[count] = '\0'; + + for (i = 0; i < ARRAY_SIZE(cksum_name); i++) { + if (strcasecmp(kernbuf, cksum_name[i]) == 0) { + obd->u.cli.cl_preferred_cksum_type = BIT(i); + if (obd->u.cli.cl_supp_cksum_types & BIT(i)) { + obd->u.cli.cl_cksum_type = BIT(i); + rc = count; + } else { + rc = -ENOTSUPP; + } + break; + } + } + + return rc; +} +LPROC_SEQ_FOPS(mdc_checksum_type); + +static ssize_t checksums_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%d\n", !!obd->u.cli.cl_checksum); +} + +static ssize_t checksums_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + obd->u.cli.cl_checksum = val; + + return count; +} +LUSTRE_RW_ATTR(checksums); + +static ssize_t checksum_dump_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + 
obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%d\n", !!obd->u.cli.cl_checksum_dump); +} + +static ssize_t checksum_dump_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + obd->u.cli.cl_checksum_dump = val; + + return count; +} +LUSTRE_RW_ATTR(checksum_dump); + +LUSTRE_ATTR(mds_conn_uuid, 0444, conn_uuid_show, NULL); +LUSTRE_RO_ATTR(conn_uuid); + +LUSTRE_RW_ATTR(ping); + +static int mdc_cached_mb_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + struct client_obd *cli = &obd->u.cli; + int shift = 20 - PAGE_SHIFT; + + seq_printf(m, "used_mb: %ld\n" + "busy_cnt: %ld\n" + "reclaim: %llu\n", + (atomic_long_read(&cli->cl_lru_in_list) + + atomic_long_read(&cli->cl_lru_busy)) >> shift, + atomic_long_read(&cli->cl_lru_busy), + cli->cl_lru_reclaim); + + return 0; +} + +/* shrink the number of caching pages to a specific number */ +static ssize_t +mdc_cached_mb_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *sfl = file->private_data; + struct obd_device *obd = sfl->private; + struct client_obd *cli = &obd->u.cli; + u64 pages_number; + const char *tmp; + long rc; + char kernbuf[128]; + + if (count >= sizeof(kernbuf)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + kernbuf[count] = 0; + + tmp = lprocfs_find_named_value(kernbuf, "used_mb:", &count); + rc = sysfs_memparse(tmp, count, &pages_number, "MiB"); + if (rc < 0) + return rc; + + pages_number >>= PAGE_SHIFT; + + rc = atomic_long_read(&cli->cl_lru_in_list) - pages_number; + if (rc > 0) { + struct lu_env *env; + __u16 refcheck; + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + (void)osc_lru_shrink(env, cli, rc, true); + cl_env_put(env, &refcheck); + } + } + + return count; +} +LPROC_SEQ_FOPS(mdc_cached_mb); + +static int mdc_unstable_stats_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + struct client_obd *cli = &obd->u.cli; + long pages; + int mb; + + pages = atomic_long_read(&cli->cl_unstable_count); + mb = (pages * PAGE_SIZE) >> 20; + + seq_printf(m, "unstable_pages: %20ld\n" + "unstable_mb: %10d\n", pages, mb); + return 0; +} +LPROC_SEQ_FOPS_RO(mdc_unstable_stats); + +static ssize_t mdc_rpc_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct obd_device *obd = seq->private; + struct client_obd *cli = &obd->u.cli; + + lprocfs_oh_clear(&cli->cl_mod_rpcs_hist); + + lprocfs_oh_clear(&cli->cl_read_rpc_hist); + lprocfs_oh_clear(&cli->cl_write_rpc_hist); + lprocfs_oh_clear(&cli->cl_read_page_hist); + lprocfs_oh_clear(&cli->cl_write_page_hist); + lprocfs_oh_clear(&cli->cl_read_offset_hist); + lprocfs_oh_clear(&cli->cl_write_offset_hist); + cli->cl_mod_rpcs_init = ktime_get_real(); + + return len; +} + +static int mdc_rpc_stats_seq_show(struct seq_file *seq, void *v) +{ + struct obd_device *obd = seq->private; + struct client_obd *cli = &obd->u.cli; + unsigned long read_tot = 0, write_tot = 0, read_cum, write_cum; + int i; + + obd_mod_rpc_stats_seq_show(cli, seq); + + spin_lock(&cli->cl_loi_list_lock); + + seq_printf(seq, "\nread RPCs in flight: %d\n", + cli->cl_r_in_flight); + seq_printf(seq, "write RPCs in flight: %d\n", + cli->cl_w_in_flight); + seq_printf(seq, "pending write 
pages: %d\n", + atomic_read(&cli->cl_pending_w_pages)); + seq_printf(seq, "pending read pages: %d\n", + atomic_read(&cli->cl_pending_r_pages)); + + seq_printf(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_printf(seq, "pages per rpc rpcs %% cum %% |"); + seq_printf(seq, " rpcs %% cum %%\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_page_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_page_hist); + + read_cum = 0; + write_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_page_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_page_hist.oh_buckets[i]; + + read_cum += r; + write_cum += w; + seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", + 1 << i, r, pct(r, read_tot), + pct(read_cum, read_tot), w, + pct(w, write_tot), + pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + + seq_printf(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_printf(seq, "rpcs in flight rpcs %% cum %% |"); + seq_printf(seq, " rpcs %% cum %%\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_rpc_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_rpc_hist); + + read_cum = 0; + write_cum = 0; + for (i = 1; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_rpc_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_rpc_hist.oh_buckets[i]; + + read_cum += r; + write_cum += w; + seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", + i, r, pct(r, read_tot), pct(read_cum, read_tot), w, + pct(w, write_tot), pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + + seq_printf(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_printf(seq, "offset rpcs %% cum %% |"); + seq_printf(seq, " rpcs %% cum %%\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_offset_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_offset_hist); + + read_cum = 0; + write_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_offset_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_offset_hist.oh_buckets[i]; + + read_cum += r; + write_cum += w; + seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", + (i == 0) ? 
0 : 1 << (i - 1), + r, pct(r, read_tot), pct(read_cum, read_tot), + w, pct(w, write_tot), pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + spin_unlock(&cli->cl_loi_list_lock); + + return 0; +} +LPROC_SEQ_FOPS(mdc_rpc_stats); + +static int mdc_stats_seq_show(struct seq_file *seq, void *v) +{ + struct obd_device *obd = seq->private; + struct osc_stats *stats = &obd2osc_dev(obd)->osc_stats; + + lprocfs_stats_header(seq, ktime_get_real(), stats->os_init, 25, ":", + true, ""); + seq_printf(seq, "lockless_write_bytes\t\t%llu\n", + stats->os_lockless_writes); + seq_printf(seq, "lockless_read_bytes\t\t%llu\n", + stats->os_lockless_reads); + return 0; +} + +static ssize_t mdc_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct obd_device *obd = seq->private; + struct osc_stats *stats = &obd2osc_dev(obd)->osc_stats; + + memset(stats, 0, sizeof(*stats)); + stats->os_init = ktime_get_real(); + + return len; +} +LPROC_SEQ_FOPS(mdc_stats); + +static int mdc_dom_min_repsize_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + + seq_printf(m, "%u\n", obd->u.cli.cl_dom_min_inline_repsize); + + return 0; +} + +static ssize_t mdc_dom_min_repsize_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + unsigned int val; + int rc; + + rc = kstrtouint_from_user(buffer, count, 0, &val); + if (rc) + return rc; + + if (val > MDC_DOM_MAX_INLINE_REPSIZE) + return -ERANGE; + + obd->u.cli.cl_dom_min_inline_repsize = val; + return count; +} +LPROC_SEQ_FOPS(mdc_dom_min_repsize); + +static int mdc_lsom_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + + seq_printf(m, "%s\n", dev->u.cli.cl_lsom_update ? 
"On" : "Off"); + + return 0; +} + +static ssize_t mdc_lsom_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev; + bool val; + int rc; + + dev = ((struct seq_file *)file->private_data)->private; + rc = kstrtobool_from_user(buffer, count, &val); + if (rc) + return rc; + + dev->u.cli.cl_lsom_update = val; + return count; +} +LPROC_SEQ_FOPS(mdc_lsom); + + +LPROC_SEQ_FOPS_RO_TYPE(mdc, connect_flags); +LPROC_SEQ_FOPS_RO_TYPE(mdc, server_uuid); +LPROC_SEQ_FOPS_RO_TYPE(mdc, timeouts); +LPROC_SEQ_FOPS_RO_TYPE(mdc, state); +LPROC_SEQ_FOPS_RW_TYPE(mdc, obd_max_pages_per_rpc); +LPROC_SEQ_FOPS_RW_TYPE(mdc, import); +LPROC_SEQ_FOPS_RW_TYPE(mdc, pinger_recov); + +struct lprocfs_vars lprocfs_mdc_obd_vars[] = { + { .name = "connect_flags", + .fops = &mdc_connect_flags_fops }, + { .name = "mds_server_uuid", + .fops = &mdc_server_uuid_fops }, + { .name = "max_pages_per_rpc", + .fops = &mdc_obd_max_pages_per_rpc_fops }, + { .name = "max_dirty_mb", + .fops = &mdc_max_dirty_mb_fops }, + { .name = "mdc_cached_mb", + .fops = &mdc_cached_mb_fops }, + { .name = "checksum_type", + .fops = &mdc_checksum_type_fops }, + { .name = "timeouts", + .fops = &mdc_timeouts_fops }, + { .name = "import", + .fops = &mdc_import_fops }, + { .name = "state", + .fops = &mdc_state_fops }, + { .name = "pinger_recov", + .fops = &mdc_pinger_recov_fops }, + { .name = "rpc_stats", + .fops = &mdc_rpc_stats_fops }, + { .name = "unstable_stats", + .fops = &mdc_unstable_stats_fops }, + { .name = "mdc_stats", + .fops = &mdc_stats_fops }, + { .name = "mdc_dom_min_repsize", + .fops = &mdc_dom_min_repsize_fops }, + { .name = "mdc_lsom", + .fops = &mdc_lsom_fops }, + { NULL } +}; + +static ssize_t cur_lost_grant_bytes_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &obd->u.cli; + + return scnprintf(buf, PAGE_SIZE, "%lu\n", cli->cl_lost_grant); +} +LUSTRE_RO_ATTR(cur_lost_grant_bytes); + +static ssize_t cur_dirty_grant_bytes_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &obd->u.cli; + + return scnprintf(buf, PAGE_SIZE, "%lu\n", cli->cl_dirty_grant); +} +LUSTRE_RO_ATTR(cur_dirty_grant_bytes); + +static ssize_t grant_shrink_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_import *imp; + ssize_t len; + + with_imp_locked(obd, imp, len) + len = scnprintf(buf, PAGE_SIZE, "%d\n", + !imp->imp_grant_shrink_disabled && + OCD_HAS_FLAG(&imp->imp_connect_data, + GRANT_SHRINK)); + + return len; +} + +static ssize_t grant_shrink_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_import *imp; + bool val; + int rc; + + if (obd == NULL) + return 0; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + with_imp_locked(obd, imp, rc) { + spin_lock(&imp->imp_lock); + imp->imp_grant_shrink_disabled = !val; + spin_unlock(&imp->imp_lock); + } + + return rc ?: count; +} +LUSTRE_RW_ATTR(grant_shrink); + +static ssize_t grant_shrink_interval_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return 
sprintf(buf, "%lld\n", obd->u.cli.cl_grant_shrink_interval);
+}
+
+static ssize_t grant_shrink_interval_store(struct kobject *kobj,
+					   struct attribute *attr,
+					   const char *buffer,
+					   size_t count)
+{
+	struct obd_device *obd = container_of(kobj, struct obd_device,
+					      obd_kset.kobj);
+	unsigned int val;
+	int rc;
+
+	rc = kstrtouint(buffer, 0, &val);
+	if (rc)
+		return rc;
+
+	if (val == 0)
+		return -ERANGE;
+
+	obd->u.cli.cl_grant_shrink_interval = val;
+	osc_update_next_shrink(&obd->u.cli);
+	osc_schedule_grant_work();
+
+	return count;
+}
+LUSTRE_RW_ATTR(grant_shrink_interval);
+
+static struct attribute *mdc_attrs[] = {
+	&lustre_attr_active.attr,
+	&lustre_attr_checksums.attr,
+	&lustre_attr_checksum_dump.attr,
+	&lustre_attr_max_rpcs_in_flight.attr,
+	&lustre_attr_max_mod_rpcs_in_flight.attr,
+	&lustre_attr_mds_conn_uuid.attr,
+	&lustre_attr_conn_uuid.attr,
+	&lustre_attr_ping.attr,
+	&lustre_attr_grant_shrink.attr,
+	&lustre_attr_grant_shrink_interval.attr,
+	&lustre_attr_cur_lost_grant_bytes.attr,
+	&lustre_attr_cur_dirty_grant_bytes.attr,
+	NULL,
+};
+
+KOBJ_ATTRIBUTE_GROUPS(mdc); /* creates mdc_groups */
+
+int mdc_tunables_init(struct obd_device *obd)
+{
+	int rc;
+
+	obd->obd_ktype.default_groups = KOBJ_ATTR_GROUPS(mdc);
+	obd->obd_vars = lprocfs_mdc_obd_vars;
+
+	rc = lprocfs_obd_setup(obd, false);
+	if (rc)
+		goto out_failed;
+#ifdef CONFIG_PROC_FS
+	rc = lprocfs_alloc_md_stats(obd, 0);
+	if (rc) {
+		lprocfs_obd_cleanup(obd);
+		goto out_failed;
+	}
+#endif
+	rc = sptlrpc_lprocfs_cliobd_attach(obd);
+	if (rc) {
+#ifdef CONFIG_PROC_FS
+		lprocfs_free_md_stats(obd);
+#endif
+		lprocfs_obd_cleanup(obd);
+		goto out_failed;
+	}
+	ptlrpc_lprocfs_register_obd(obd);
+
+out_failed:
+	return rc;
+}
diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_acl.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_acl.c
new file mode 100644
index 0000000000000..81263900fc1fc
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_acl.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+#include <linux/posix_acl_xattr.h>
+
+#include "mdc_internal.h"
+
+int mdc_unpack_acl(struct req_capsule *pill, struct lustre_md *md)
+{
+	struct mdt_body *body = md->body;
+	struct posix_acl *acl;
+	void *buf;
+	int rc;
+
+	/* For ACLs it is possible that FLACL is set but aclsize is zero;
+	 * only when aclsize != 0 is there an actual ACL segment in the
+	 * reply buffer.
+	 */
+	if (!body->mbo_aclsize) {
+		md->posix_acl = NULL;
+		return 0;
+	}
+
+	buf = req_capsule_server_sized_get(pill, &RMF_ACL, body->mbo_aclsize);
+	if (!buf)
+		return -EPROTO;
+
+	acl = posix_acl_from_xattr(&init_user_ns, buf, body->mbo_aclsize);
+	if (IS_ERR_OR_NULL(acl)) {
+		rc = acl ? PTR_ERR(acl) : 0;
+		CERROR("convert xattr to acl: %d\n", rc);
+		return rc;
+	}
+
+	rc = posix_acl_valid(&init_user_ns, acl);
+	if (rc) {
+		CERROR("validate acl: %d\n", rc);
+		posix_acl_release(acl);
+		return rc;
+	}
+
+	md->posix_acl = acl;
+	return 0;
+}
diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_changelog.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_changelog.c
new file mode 100644
index 0000000000000..843c4de8a43b7
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_changelog.c
@@ -0,0 +1,881 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2017, Commissariat a l'Energie Atomique et aux Energies
+ * Alternatives.
+ *
+ * Copyright (c) 2017, Intel Corporation.
+ *
+ * Author: Henri Doreau
+ */
+
+#define DEBUG_SUBSYSTEM S_MDC
+
+#include <linux/init.h>
+#include <linux/kthread.h>
+#include <linux/device.h>
+#include <linux/cdev.h>
+#include <linux/file.h>
+#include <linux/idr.h>
+
+#include <lustre_log.h>
+#include <uapi/linux/lustre/lustre_ioctl.h>
+
+#include "mdc_internal.h"
+
+
+/*
+ * -- Changelog delivery through character device --
+ */
+
+/**
+ * Mutex to protect chlg_registered_devices below
+ */
+static DEFINE_MUTEX(chlg_registered_dev_lock);
+
+/**
+ * Global linked list of all registered devices (one per MDT).
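+ * A single entry is shared by all mount points of the same MDT; the
+ * referencing OBDs are tracked in the entry's ced_obds list.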
+ */ +static LIST_HEAD(chlg_registered_devices); + + +struct chlg_registered_dev { + /* Device name of the form "changelog-{MDTNAME}" */ + char ced_name[32]; + /* changelog char device */ + struct cdev ced_cdev; + struct device ced_device; + /* OBDs referencing this device (multiple mount point) */ + struct list_head ced_obds; + /* Reference counter for proper deregistration */ + struct kref ced_refs; + /* Link within the global chlg_registered_devices */ + struct list_head ced_link; +}; + +struct chlg_reader_state { + /* Shortcut to the corresponding OBD device */ + struct obd_device *crs_obd; + /* the corresponding chlg_registered_dev */ + struct chlg_registered_dev *crs_ced; + /* Producer thread (if any) */ + struct task_struct *crs_prod_task; + /* An error occurred that prevents from reading further */ + int crs_err; + /* EOF, no more records available */ + bool crs_eof; + /* Desired start position */ + __u64 crs_start_offset; + /* Wait queue for the catalog processing thread */ + wait_queue_head_t crs_waitq_prod; + /* Wait queue for the record copy threads */ + wait_queue_head_t crs_waitq_cons; + /* Mutex protecting crs_rec_count and crs_rec_queue */ + struct mutex crs_lock; + /* Number of item in the list */ + __u64 crs_rec_count; + /* List of prefetched enqueued_record::enq_linkage_items */ + struct list_head crs_rec_queue; + unsigned int crs_last_catidx; + unsigned int crs_last_idx; + bool crs_poll; +}; + +struct chlg_rec_entry { + /* Link within the chlg_reader_state::crs_rec_queue list */ + struct list_head enq_linkage; + /* Data (enq_record) field length */ + __u64 enq_length; + /* Copy of a changelog record (see struct llog_changelog_rec) */ + struct changelog_rec enq_record[]; +}; + +enum { + /* Number of records to prefetch locally. */ + CDEV_CHLG_MAX_PREFETCH = 1024, +}; + +DEFINE_IDR(mdc_changelog_minor_idr); +static DEFINE_SPINLOCK(chlg_minor_lock); + +static int chlg_minor_alloc(int *pminor) +{ + void *minor_allocated = (void *)-1; + int minor; + + idr_preload(GFP_KERNEL); + spin_lock(&chlg_minor_lock); + minor = idr_alloc(&mdc_changelog_minor_idr, minor_allocated, 0, + MDC_CHANGELOG_DEV_COUNT, GFP_NOWAIT); + spin_unlock(&chlg_minor_lock); + idr_preload_end(); + + if (minor < 0) + return minor; + + *pminor = minor; + return 0; +} + +static void chlg_minor_free(int minor) +{ + spin_lock(&chlg_minor_lock); + idr_remove(&mdc_changelog_minor_idr, minor); + spin_unlock(&chlg_minor_lock); +} + +static void chlg_device_release(struct device *dev) +{ + struct chlg_registered_dev *entry = dev_get_drvdata(dev); + + chlg_minor_free(MINOR(entry->ced_cdev.dev)); + OBD_FREE_PTR(entry); +} + +/** + * Deregister a changelog character device whose refcount has reached zero. 
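+ *
+ * Called as the kref release callback, via
+ * kref_put(&entry->ced_refs, chlg_dev_clear).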
+ */ +static void chlg_dev_clear(struct kref *kref) +{ + struct chlg_registered_dev *entry; + + ENTRY; + entry = container_of(kref, struct chlg_registered_dev, + ced_refs); + + list_del(&entry->ced_link); + cdev_device_del(&entry->ced_cdev, &entry->ced_device); + put_device(&entry->ced_device); + EXIT; +} + +static inline struct obd_device* chlg_obd_get(struct chlg_registered_dev *dev) +{ + struct obd_device *obd; + + mutex_lock(&chlg_registered_dev_lock); + if (list_empty(&dev->ced_obds)) + return NULL; + + obd = list_first_entry(&dev->ced_obds, struct obd_device, + u.cli.cl_chg_dev_linkage); + class_incref(obd, "changelog", dev); + mutex_unlock(&chlg_registered_dev_lock); + return obd; +} + +static inline void chlg_obd_put(struct chlg_registered_dev *dev, + struct obd_device *obd) +{ + class_decref(obd, "changelog", dev); +} + +/** + * ChangeLog catalog processing callback invoked on each record. + * If the current record is eligible to userland delivery, push + * it into the crs_rec_queue where the consumer code will fetch it. + * + * @param[in] env (unused) + * @param[in] llh Client-side handle used to identify the llog + * @param[in] hdr Header of the current llog record + * @param[in,out] data chlg_reader_state passed from caller + * + * @return 0 or LLOG_PROC_* control code on success, negated error on failure. + */ +static int chlg_read_cat_process_cb(const struct lu_env *env, + struct llog_handle *llh, + struct llog_rec_hdr *hdr, void *data) +{ + struct llog_changelog_rec *rec; + struct chlg_reader_state *crs = data; + struct chlg_rec_entry *enq; + size_t len; + int rc; + ENTRY; + + LASSERT(crs != NULL); + LASSERT(hdr != NULL); + + rec = container_of(hdr, struct llog_changelog_rec, cr_hdr); + + crs->crs_last_catidx = llh->lgh_hdr->llh_cat_idx; + crs->crs_last_idx = hdr->lrh_index; + + if (rec->cr_hdr.lrh_type != CHANGELOG_REC) { + rc = -EINVAL; + CERROR("%s: not a changelog rec %x/%d in llog : rc = %d\n", + crs->crs_obd->obd_name, rec->cr_hdr.lrh_type, + rec->cr.cr_type, rc); + RETURN(rc); + } + + /* Skip undesired records */ + if (rec->cr.cr_index < crs->crs_start_offset) + RETURN(0); + + CDEBUG(D_HSM, "%llu %02d%-5s %llu 0x%x t="DFID" p="DFID" %.*s\n", + rec->cr.cr_index, rec->cr.cr_type, + changelog_type2str(rec->cr.cr_type), rec->cr.cr_time, + rec->cr.cr_flags & CLF_FLAGMASK, + PFID(&rec->cr.cr_tfid), PFID(&rec->cr.cr_pfid), + rec->cr.cr_namelen, changelog_rec_name(&rec->cr)); + + wait_event_interruptible(crs->crs_waitq_prod, + crs->crs_rec_count < CDEV_CHLG_MAX_PREFETCH || + kthread_should_stop()); + + if (kthread_should_stop()) + RETURN(LLOG_PROC_BREAK); + + len = changelog_rec_size(&rec->cr) + rec->cr.cr_namelen; + OBD_ALLOC(enq, sizeof(*enq) + len); + if (enq == NULL) + RETURN(-ENOMEM); + + INIT_LIST_HEAD(&enq->enq_linkage); + enq->enq_length = len; + memcpy(enq->enq_record, &rec->cr, len); + + mutex_lock(&crs->crs_lock); + list_add_tail(&enq->enq_linkage, &crs->crs_rec_queue); + crs->crs_rec_count++; + mutex_unlock(&crs->crs_lock); + + wake_up(&crs->crs_waitq_cons); + + RETURN(0); +} + +/** + * Remove record from the list it is attached to and free it. + */ +static void enq_record_delete(struct chlg_rec_entry *rec) +{ + list_del(&rec->enq_linkage); + OBD_FREE(rec, sizeof(*rec) + rec->enq_length); +} + +/** + * Record prefetch thread entry point. Opens the changelog catalog and starts + * reading records. + * + * @param[in,out] args chlg_reader_state passed from caller. + * @return 0 on success, negated error code on failure. 
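+ *
+ * The thread is started lazily from chlg_start_thread() on the first read
+ * and prefetches at most CDEV_CHLG_MAX_PREFETCH records before blocking in
+ * chlg_read_cat_process_cb() until the consumer drains the queue.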
+ */ +static int chlg_load(void *args) +{ + struct chlg_reader_state *crs = args; + struct chlg_registered_dev *ced = crs->crs_ced; + struct obd_device *obd = NULL; + struct llog_ctxt *ctx = NULL; + struct llog_handle *llh = NULL; + int rc; + ENTRY; + + crs->crs_last_catidx = 0; + crs->crs_last_idx = 0; + +again: + obd = chlg_obd_get(ced); + if (obd == NULL) + RETURN(-ENODEV); + + crs->crs_obd = obd; + + ctx = llog_get_context(obd, LLOG_CHANGELOG_REPL_CTXT); + if (ctx == NULL) + GOTO(err_out, rc = -ENOENT); + + rc = llog_open(NULL, ctx, &llh, NULL, CHANGELOG_CATALOG, + LLOG_OPEN_EXISTS); + if (rc) { + CERROR("%s: fail to open changelog catalog: rc = %d\n", + obd->obd_name, rc); + GOTO(err_out, rc); + } + + + rc = llog_init_handle(NULL, llh, + LLOG_F_IS_CAT | + LLOG_F_EXT_JOBID | + LLOG_F_EXT_EXTRA_FLAGS | + LLOG_F_EXT_X_UIDGID | + LLOG_F_EXT_X_NID | + LLOG_F_EXT_X_OMODE | + LLOG_F_EXT_X_XATTR, + NULL); + if (rc) { + CERROR("%s: fail to init llog handle: rc = %d\n", + obd->obd_name, rc); + GOTO(err_out, rc); + } + + rc = llog_cat_process(NULL, llh, chlg_read_cat_process_cb, crs, + crs->crs_last_catidx, crs->crs_last_idx); + if (rc < 0) { + CERROR("%s: fail to process llog: rc = %d\n", obd->obd_name, rc); + GOTO(err_out, rc); + } + if (!kthread_should_stop() && crs->crs_poll) { + llog_cat_close(NULL, llh); + llog_ctxt_put(ctx); + class_decref(obd, "changelog", crs); + schedule_timeout_interruptible(cfs_time_seconds(1)); + goto again; + } + + crs->crs_eof = true; + +err_out: + if (rc < 0) + crs->crs_err = rc; + + wake_up(&crs->crs_waitq_cons); + + if (llh != NULL) + llog_cat_close(NULL, llh); + + if (ctx != NULL) + llog_ctxt_put(ctx); + + crs->crs_obd = NULL; + chlg_obd_put(ced, obd); + wait_event_interruptible(crs->crs_waitq_prod, kthread_should_stop()); + + RETURN(rc); +} + +static int chlg_start_thread(struct file *file) +{ + struct chlg_reader_state *crs = file->private_data; + struct task_struct *task; + int rc = 0; + + if (likely(crs->crs_prod_task)) + return 0; + if (unlikely(file->f_mode & FMODE_READ) == 0) + return 0; + + mutex_lock(&crs->crs_lock); + if (crs->crs_prod_task == NULL) { + task = kthread_run(chlg_load, crs, "chlg_load_thread"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("%s: cannot start changelog thread: rc = %d\n", + crs->crs_ced->ced_name, rc); + GOTO(out, rc); + } + crs->crs_prod_task = task; + } +out: + mutex_unlock(&crs->crs_lock); + return rc; +} + +/** + * Read handler, dequeues records from the chlg_reader_state if any. + * No partial records are copied to userland so this function can return less + * data than required (short read). + * + * @param[in] file File pointer to the character device. + * @param[out] buff Userland buffer where to copy the records. + * @param[in] count Userland buffer size. + * @param[out] ppos File position, updated with the index number of the next + * record to read. + * @return number of copied bytes on success, negated error code on failure. 
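+ *
+ * Since only whole records are returned, a consumer can walk the buffer
+ * record by record. An illustrative sketch, assuming the changelog_rec
+ * helpers from the Lustre UAPI headers (handle_record() is a placeholder):
+ *
+ *	char buf[65536];
+ *	ssize_t n = read(fd, buf, sizeof(buf));
+ *	struct changelog_rec *rec = (struct changelog_rec *)buf;
+ *
+ *	while ((char *)rec < buf + n) {
+ *		handle_record(rec);
+ *		rec = (struct changelog_rec *)((char *)rec +
+ *		      changelog_rec_size(rec) + rec->cr_namelen);
+ *	}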
+ */ +static ssize_t chlg_read(struct file *file, char __user *buff, size_t count, + loff_t *ppos) +{ + struct chlg_reader_state *crs = file->private_data; + struct chlg_rec_entry *rec; + struct chlg_rec_entry *tmp; + size_t written_total = 0; + ssize_t rc; + LIST_HEAD(consumed); + ENTRY; + + if (file->f_flags & O_NONBLOCK && crs->crs_rec_count == 0) { + if (crs->crs_err < 0) + RETURN(crs->crs_err); + else if (crs->crs_eof) + RETURN(0); + else + RETURN(-EAGAIN); + } + + rc = chlg_start_thread(file); + if (rc) + RETURN(rc); + + rc = wait_event_interruptible(crs->crs_waitq_cons, + crs->crs_rec_count > 0 || crs->crs_eof || crs->crs_err); + + mutex_lock(&crs->crs_lock); + list_for_each_entry_safe(rec, tmp, &crs->crs_rec_queue, enq_linkage) { + if (written_total + rec->enq_length > count) + break; + + if (copy_to_user(buff, rec->enq_record, rec->enq_length)) { + rc = -EFAULT; + break; + } + + buff += rec->enq_length; + written_total += rec->enq_length; + + crs->crs_rec_count--; + list_move_tail(&rec->enq_linkage, &consumed); + + crs->crs_start_offset = rec->enq_record->cr_index + 1; + } + mutex_unlock(&crs->crs_lock); + + if (written_total > 0) { + rc = written_total; + wake_up(&crs->crs_waitq_prod); + } else if (rc == 0) { + rc = crs->crs_err; + } + + list_for_each_entry_safe(rec, tmp, &consumed, enq_linkage) + enq_record_delete(rec); + + *ppos = crs->crs_start_offset; + + RETURN(rc); +} + +/** + * Jump to a given record index. Helper for chlg_llseek(). + * + * @param[in,out] crs Internal reader state. + * @param[in] offset Desired offset (index record). + * @return 0 on success, negated error code on failure. + */ +static int chlg_set_start_offset(struct chlg_reader_state *crs, __u64 offset) +{ + struct chlg_rec_entry *rec; + struct chlg_rec_entry *tmp; + + mutex_lock(&crs->crs_lock); + if (offset < crs->crs_start_offset) { + mutex_unlock(&crs->crs_lock); + return -ERANGE; + } + + crs->crs_start_offset = offset; + list_for_each_entry_safe(rec, tmp, &crs->crs_rec_queue, enq_linkage) { + struct changelog_rec *cr = rec->enq_record; + + if (cr->cr_index >= crs->crs_start_offset) + break; + + crs->crs_rec_count--; + enq_record_delete(rec); + } + + mutex_unlock(&crs->crs_lock); + wake_up(&crs->crs_waitq_prod); + return 0; +} + +/** + * Move read pointer to a certain record index, encoded as an offset. + * + * @param[in,out] file File pointer to the changelog character device + * @param[in] off Offset to skip, actually a record index, not byte count + * @param[in] whence Relative/Absolute interpretation of the offset + * @return the resulting position on success or negated error code on failure. + */ +static loff_t chlg_llseek(struct file *file, loff_t off, int whence) +{ + struct chlg_reader_state *crs = file->private_data; + loff_t pos; + int rc; + + switch (whence) { + case SEEK_SET: + pos = off; + break; + case SEEK_CUR: + pos = file->f_pos + off; + break; + case SEEK_END: + default: + return -EINVAL; + } + + /* We cannot go backward */ + if (pos < file->f_pos) + return -EINVAL; + + rc = chlg_set_start_offset(crs, pos); + if (rc != 0) + return rc; + + file->f_pos = pos; + return pos; +} + +/** + * Clear record range for a given changelog reader. + * + * @param[in] crs Current internal state. + * @param[in] reader Changelog reader ID (cl1, cl2...) + * @param[in] record Record index up which to clear + * @return 0 on success, negated error code on failure. 
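+ *
+ * Reached from chlg_write() when userland writes a control command of the
+ * form "clear:cl<id>:<recno>", for instance (device name illustrative):
+ *
+ *	echo "clear:cl1:1000" > /dev/changelog-lustre-MDT0000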
+ */ +static int chlg_clear(struct chlg_reader_state *crs, __u32 reader, __u64 record) +{ + struct obd_device *obd = NULL; + struct changelog_setinfo cs = { + .cs_recno = record, + .cs_id = reader + }; + int rc; + + obd = chlg_obd_get(crs->crs_ced); + if (obd == NULL) + return -ENODEV; + + rc = obd_set_info_async(NULL, obd->obd_self_export, + strlen(KEY_CHANGELOG_CLEAR), + KEY_CHANGELOG_CLEAR, sizeof(cs), &cs, NULL); + + chlg_obd_put(crs->crs_ced, obd); + return rc; +} + +/** Maximum changelog control command size */ +#define CHLG_CONTROL_CMD_MAX 64 + +/** + * Handle writes() into the changelog character device. Write() can be used + * to request special control operations. + * + * @param[in] file File pointer to the changelog character device + * @param[in] buff User supplied data (written data) + * @param[in] count Number of written bytes + * @param[in] off (unused) + * @return number of written bytes on success, negated error code on failure. + */ +static ssize_t chlg_write(struct file *file, const char __user *buff, + size_t count, loff_t *off) +{ + struct chlg_reader_state *crs = file->private_data; + char *kbuf; + __u64 record; + __u32 reader; + int rc = 0; + ENTRY; + + if (count > CHLG_CONTROL_CMD_MAX) + RETURN(-EINVAL); + + OBD_ALLOC(kbuf, CHLG_CONTROL_CMD_MAX); + if (kbuf == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(kbuf, buff, count)) + GOTO(out_kbuf, rc = -EFAULT); + + kbuf[CHLG_CONTROL_CMD_MAX - 1] = '\0'; + + if (sscanf(kbuf, "clear:cl%u:%llu", &reader, &record) == 2) + rc = chlg_clear(crs, reader, record); + else + rc = -EINVAL; + + EXIT; +out_kbuf: + OBD_FREE(kbuf, CHLG_CONTROL_CMD_MAX); + return rc < 0 ? rc : count; +} + +/** + * Open handler, initialize internal CRS state and spawn prefetch thread if + * needed. + * @param[in] inode Inode struct for the open character device. + * @param[in] file Corresponding file pointer. + * @return 0 on success, negated error code on failure. + */ +static int chlg_open(struct inode *inode, struct file *file) +{ + struct chlg_reader_state *crs; + struct chlg_registered_dev *dev; + ENTRY; + + dev = container_of(inode->i_cdev, struct chlg_registered_dev, ced_cdev); + + OBD_ALLOC_PTR(crs); + if (!crs) + RETURN(-ENOMEM); + + kref_get(&dev->ced_refs); + crs->crs_ced = dev; + crs->crs_err = false; + crs->crs_eof = false; + + mutex_init(&crs->crs_lock); + INIT_LIST_HEAD(&crs->crs_rec_queue); + init_waitqueue_head(&crs->crs_waitq_prod); + init_waitqueue_head(&crs->crs_waitq_cons); + crs->crs_prod_task = NULL; + + file->private_data = crs; + RETURN(0); +} + +/** + * Close handler, release resources. + * + * @param[in] inode Inode struct for the open character device. + * @param[in] file Corresponding file pointer. + * @return 0 on success, negated error code on failure. + */ +static int chlg_release(struct inode *inode, struct file *file) +{ + struct chlg_reader_state *crs = file->private_data; + struct chlg_rec_entry *rec; + struct chlg_rec_entry *tmp; + int rc = 0; + + if (crs->crs_prod_task) + rc = kthread_stop(crs->crs_prod_task); + + list_for_each_entry_safe(rec, tmp, &crs->crs_rec_queue, enq_linkage) + enq_record_delete(rec); + + kref_put(&crs->crs_ced->ced_refs, chlg_dev_clear); + OBD_FREE_PTR(crs); + + return rc; +} + +/** + * Poll handler, indicates whether the device is readable (new records) and + * writable (always). + * + * @param[in] file Device file pointer. + * @param[in] wait (opaque) + * @return combination of the poll status flags. 
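+ *
+ * An illustrative consumer loop (OBD_IOC_CHLG_POLL can be set beforehand
+ * via chlg_ioctl() so the producer keeps scanning instead of reporting EOF):
+ *
+ *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
+ *
+ *	while (poll(&pfd, 1, -1) > 0) {
+ *		if (pfd.revents & (POLLERR | POLLHUP))
+ *			break;
+ *		if (pfd.revents & POLLIN)
+ *			read(fd, buf, sizeof(buf));
+ *	}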
+ */ +static unsigned int chlg_poll(struct file *file, poll_table *wait) +{ + struct chlg_reader_state *crs = file->private_data; + unsigned int mask = 0; + int rc; + + rc = chlg_start_thread(file); + if (rc) + RETURN(rc); + + mutex_lock(&crs->crs_lock); + poll_wait(file, &crs->crs_waitq_cons, wait); + if (crs->crs_rec_count > 0) + mask |= POLLIN | POLLRDNORM; + if (crs->crs_err) + mask |= POLLERR; + if (crs->crs_eof) + mask |= POLLHUP; + mutex_unlock(&crs->crs_lock); + return mask; +} + +static long chlg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int rc; + + struct chlg_reader_state *crs = file->private_data; + switch (cmd) { + case OBD_IOC_CHLG_POLL: + crs->crs_poll = !!arg; + rc = 0; + break; + default: + rc = -EINVAL; + break; + } + return rc; +} + +static const struct file_operations chlg_fops = { + .owner = THIS_MODULE, + .llseek = chlg_llseek, + .read = chlg_read, + .write = chlg_write, + .open = chlg_open, + .release = chlg_release, + .poll = chlg_poll, + .unlocked_ioctl = chlg_ioctl, +}; + +/** + * This uses obd_name of the form: "testfs-MDT0000-mdc-ffff88006501600" + * and returns a name of the form: "changelog-testfs-MDT0000". + */ +static void get_target_name(char *name, size_t name_len, struct obd_device *obd) +{ + int i; + + snprintf(name, name_len, "%s", obd->obd_name); + + /* Find the 2nd '-' from the end and truncate on it */ + for (i = 0; i < 2; i++) { + char *p = strrchr(name, '-'); + + if (p == NULL) + return; + *p = '\0'; + } +} + +/** + * Find a changelog character device by name. + * All devices registered during MDC setup are listed in a global list with + * their names attached. + */ +static struct chlg_registered_dev * +chlg_registered_dev_find_by_name(const char *name) +{ + struct chlg_registered_dev *dit; + + LASSERT(mutex_is_locked(&chlg_registered_dev_lock)); + list_for_each_entry(dit, &chlg_registered_devices, ced_link) + if (strcmp(name, dit->ced_name) == 0) + return dit; + return NULL; +} + +/** + * Find chlg_registered_dev structure for a given OBD device. + * This is bad O(n^2) but for each filesystem: + * - N is # of MDTs times # of mount points + * - this only runs at shutdown + */ +static struct chlg_registered_dev * +chlg_registered_dev_find_by_obd(const struct obd_device *obd) +{ + struct chlg_registered_dev *dit; + struct obd_device *oit; + + LASSERT(mutex_is_locked(&chlg_registered_dev_lock)); + list_for_each_entry(dit, &chlg_registered_devices, ced_link) + list_for_each_entry(oit, &dit->ced_obds, + u.cli.cl_chg_dev_linkage) + if (oit == obd) + return dit; + return NULL; +} + +/** + * Changelog character device initialization. + * Register a misc character device with a dynamic minor number, under a name + * of the form: 'changelog-fsname-MDTxxxx'. Reference this OBD device with it. + * + * @param[in] obd This MDC obd_device. + * @return 0 on success, negated error code on failure. 
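+ *
+ * If a device already exists for this target (i.e. the filesystem is
+ * mounted more than once), only a reference is taken and the OBD is linked
+ * to the existing entry instead of registering a second character device.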
+ */ +int mdc_changelog_cdev_init(struct obd_device *obd) +{ + struct chlg_registered_dev *exist; + struct chlg_registered_dev *entry; + int minor, rc; + ENTRY; + + OBD_ALLOC_PTR(entry); + if (entry == NULL) + RETURN(-ENOMEM); + + get_target_name(entry->ced_name, sizeof(entry->ced_name), obd); + + kref_init(&entry->ced_refs); + INIT_LIST_HEAD(&entry->ced_obds); + INIT_LIST_HEAD(&entry->ced_link); + + mutex_lock(&chlg_registered_dev_lock); + exist = chlg_registered_dev_find_by_name(entry->ced_name); + if (exist != NULL) { + kref_get(&exist->ced_refs); + list_add_tail(&obd->u.cli.cl_chg_dev_linkage, &exist->ced_obds); + GOTO(out_unlock, rc = 0); + } + + list_add_tail(&obd->u.cli.cl_chg_dev_linkage, &entry->ced_obds); + list_add_tail(&entry->ced_link, &chlg_registered_devices); + + rc = chlg_minor_alloc(&minor); + if (rc) + GOTO(out_unlock, rc); + + device_initialize(&entry->ced_device); + entry->ced_device.devt = MKDEV(MAJOR(mdc_changelog_dev), minor); + entry->ced_device.class = mdc_changelog_class; + entry->ced_device.release = chlg_device_release; + dev_set_drvdata(&entry->ced_device, entry); + rc = dev_set_name(&entry->ced_device, "%s-%s", MDC_CHANGELOG_DEV_NAME, + entry->ced_name); + if (rc) + GOTO(out_minor, rc); + + /* Register new character device */ + cdev_init(&entry->ced_cdev, &chlg_fops); + entry->ced_cdev.owner = THIS_MODULE; + rc = cdev_device_add(&entry->ced_cdev, &entry->ced_device); + if (rc) + GOTO(out_device_name, rc); + + entry = NULL; /* prevent it from being freed below */ + GOTO(out_unlock, rc = 0); + +out_device_name: + kfree_const(entry->ced_device.kobj.name); + +out_minor: + chlg_minor_free(minor); + + list_del_init(&obd->u.cli.cl_chg_dev_linkage); + list_del(&entry->ced_link); + +out_unlock: + mutex_unlock(&chlg_registered_dev_lock); + if (entry) + OBD_FREE_PTR(entry); + RETURN(rc); +} + +/** + * Release OBD, decrease reference count of the corresponding changelog device. + */ +void mdc_changelog_cdev_finish(struct obd_device *obd) +{ + struct chlg_registered_dev *dev; + + ENTRY; + mutex_lock(&chlg_registered_dev_lock); + dev = chlg_registered_dev_find_by_obd(obd); + list_del_init(&obd->u.cli.cl_chg_dev_linkage); + kref_put(&dev->ced_refs, chlg_dev_clear); + mutex_unlock(&chlg_registered_dev_lock); + EXIT; +} diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_dev.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_dev.c new file mode 100644 index 0000000000000..45c0f3d20fcd1 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_dev.c @@ -0,0 +1,1627 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Implementation of cl_device, cl_req for MDC layer. 
+ * + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_MDC + +#include +#include +#include +#include + +#include "mdc_internal.h" + +static void mdc_lock_build_policy(const struct lu_env *env, + const struct cl_lock *lock, + union ldlm_policy_data *policy) +{ + memset(policy, 0, sizeof *policy); + policy->l_inodebits.bits = MDS_INODELOCK_DOM; + if (lock) { + policy->l_inodebits.li_gid = lock->cll_descr.cld_gid; + } +} + +int mdc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data) +{ + return osc_ldlm_glimpse_ast(dlmlock, data); +} + +static void mdc_lock_build_einfo(const struct lu_env *env, + const struct cl_lock *lock, + struct osc_object *osc, + struct ldlm_enqueue_info *einfo) +{ + einfo->ei_type = LDLM_IBITS; + einfo->ei_mode = osc_cl_lock2ldlm(lock->cll_descr.cld_mode); + einfo->ei_cb_bl = mdc_ldlm_blocking_ast; + einfo->ei_cb_cp = ldlm_completion_ast; + einfo->ei_cb_gl = mdc_ldlm_glimpse_ast; + einfo->ei_cbdata = osc; /* value to be put into ->l_ast_data */ + einfo->ei_req_slot = 1; +} + +static void mdc_lock_lvb_update(const struct lu_env *env, + struct osc_object *osc, + struct ldlm_lock *dlmlock, + struct ost_lvb *lvb); + +static int mdc_set_dom_lock_data(struct ldlm_lock *lock, void *data) +{ + int set = 0; + + LASSERT(lock != NULL); + LASSERT(lock->l_glimpse_ast == mdc_ldlm_glimpse_ast); + + lock_res_and_lock(lock); + + if (lock->l_ast_data == NULL) + lock->l_ast_data = data; + if (lock->l_ast_data == data) + set = 1; + + unlock_res_and_lock(lock); + + return set; +} + +int mdc_dom_lock_match(const struct lu_env *env, struct obd_export *exp, + struct ldlm_res_id *res_id, enum ldlm_type type, + union ldlm_policy_data *policy, enum ldlm_mode mode, + __u64 *flags, struct osc_object *obj, + struct lustre_handle *lockh, + enum ldlm_match_flags match_flags) +{ + struct obd_device *obd = exp->exp_obd; + __u64 lflags = *flags; + enum ldlm_mode rc; + + ENTRY; + + rc = ldlm_lock_match_with_skip(obd->obd_namespace, lflags, 0, + res_id, type, policy, mode, lockh, match_flags); + if (rc == 0 || lflags & LDLM_FL_TEST_LOCK) + RETURN(rc); + + if (obj != NULL) { + struct ldlm_lock *lock = ldlm_handle2lock(lockh); + + LASSERT(lock != NULL); + if (mdc_set_dom_lock_data(lock, obj)) { + lock_res_and_lock(lock); + if (!ldlm_is_lvb_cached(lock)) { + LASSERT(lock->l_ast_data == obj); + mdc_lock_lvb_update(env, obj, lock, NULL); + ldlm_set_lvb_cached(lock); + } + unlock_res_and_lock(lock); + } else { + ldlm_lock_decref(lockh, rc); + rc = 0; + } + LDLM_LOCK_PUT(lock); + } + RETURN(rc); +} + +/** + * Finds an existing lock covering a page with given index. + * Copy of osc_obj_dlmlock_at_pgoff() but for DoM IBITS lock. 
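+ * Note that a DoM lock covers the whole object, so the page index does not
+ * enter the match itself; the lookup is done on the object's resource with
+ * the MDS_INODELOCK_DOM inodebits policy.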
+ */ +struct ldlm_lock *mdc_dlmlock_at_pgoff(const struct lu_env *env, + struct osc_object *obj, pgoff_t index, + enum osc_dap_flags dap_flags) +{ + struct osc_thread_info *info = osc_env_info(env); + struct ldlm_res_id *resname = &info->oti_resname; + union ldlm_policy_data *policy = &info->oti_policy; + struct lustre_handle lockh; + struct ldlm_lock *lock = NULL; + enum ldlm_mode mode; + __u64 flags; + enum ldlm_match_flags match_flags = 0; + + ENTRY; + + fid_build_reg_res_name(lu_object_fid(osc2lu(obj)), resname); + mdc_lock_build_policy(env, NULL, policy); + policy->l_inodebits.li_gid = LDLM_GID_ANY; + + flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING; + if (dap_flags & OSC_DAP_FL_TEST_LOCK) + flags |= LDLM_FL_TEST_LOCK; + + if (dap_flags & OSC_DAP_FL_AST) + match_flags |= LDLM_MATCH_AST; + + if (dap_flags & OSC_DAP_FL_CANCELING) + match_flags |= LDLM_MATCH_UNREF; + +again: + /* Next, search for already existing extent locks that will cover us */ + /* If we're trying to read, we also search for an existing PW lock. The + * VFS and page cache already protect us locally, so lots of readers/ + * writers can share a single PW lock. */ + mode = mdc_dom_lock_match(env, osc_export(obj), resname, LDLM_IBITS, + policy, LCK_PR | LCK_PW | LCK_GROUP, &flags, + obj, &lockh, match_flags); + if (mode != 0) { + lock = ldlm_handle2lock(&lockh); + /* RACE: the lock is cancelled so let's try again */ + if (unlikely(lock == NULL)) + goto again; + } + + RETURN(lock); +} + +/** + * Check if page @page is covered by an extra lock or discard it. + */ +static bool mdc_check_and_discard_cb(const struct lu_env *env, struct cl_io *io, + void **pvec, int count, void *cbdata) +{ + struct osc_thread_info *info = osc_env_info(env); + struct osc_object *osc = cbdata; + pgoff_t index; + int i; + + for (i = 0; i < count; i++) { + struct osc_page *ops = pvec[i]; + + index = osc_index(ops); + if (index >= info->oti_fn_index) { + struct ldlm_lock *tmp; + struct cl_page *page = ops->ops_cl.cpl_page; + + /* refresh non-overlapped index */ + tmp = mdc_dlmlock_at_pgoff(env, osc, index, + OSC_DAP_FL_TEST_LOCK | OSC_DAP_FL_AST); + if (tmp != NULL) { + info->oti_fn_index = CL_PAGE_EOF; + LDLM_LOCK_PUT(tmp); + } else if (cl_page_own(env, io, page) == 0) { + /* discard the page */ + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + } else { + LASSERT(page->cp_state == CPS_FREEING); + } + } + + info->oti_next_index = index + 1; + } + return true; +} + +/** + * Discard pages protected by the given lock. This function traverses radix + * tree to find all covering pages and discard them. If a page is being covered + * by other locks, it should remain in cache. + * + * If error happens on any step, the process continues anyway (the reasoning + * behind this being that lock cancellation cannot be delayed indefinitely). + */ +static int mdc_lock_discard_pages(const struct lu_env *env, + struct osc_object *osc, + pgoff_t start, pgoff_t end, + bool discard) +{ + struct osc_thread_info *info = osc_env_info(env); + struct cl_io *io = &info->oti_io; + osc_page_gang_cbt cb; + int result; + + ENTRY; + + io->ci_obj = cl_object_top(osc2cl(osc)); + io->ci_ignore_layout = 1; + result = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (result != 0) + GOTO(out, result); + + cb = discard ? 
osc_discard_cb : mdc_check_and_discard_cb; + info->oti_fn_index = info->oti_next_index = start; + + osc_page_gang_lookup(env, io, osc, info->oti_next_index, + end, cb, (void *)osc); +out: + cl_io_fini(env, io); + RETURN(result); +} + +static int mdc_lock_flush(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end, enum cl_lock_mode mode, + bool discard) +{ + int result = 0; + int rc; + + ENTRY; + + if (mode == CLM_WRITE) { + result = osc_cache_writeback_range(env, obj, start, end, 1, + discard); + CDEBUG(D_CACHE, "object %p: [%lu -> %lu] %d pages were %s.\n", + obj, start, end, result, + discard ? "discarded" : "written back"); + if (result > 0) + result = 0; + } + + /* Avoid lock matching with CLM_WRITE, there can be no other locks */ + rc = mdc_lock_discard_pages(env, obj, start, end, + mode == CLM_WRITE || discard); + if (result == 0 && rc < 0) + result = rc; + + RETURN(result); +} + +void mdc_lock_lockless_cancel(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *ols = cl2osc_lock(slice); + struct osc_object *osc = cl2osc(slice->cls_obj); + struct cl_lock_descr *descr = &slice->cls_lock->cll_descr; + int rc; + + LASSERT(ols->ols_dlmlock == NULL); + rc = mdc_lock_flush(env, osc, descr->cld_start, descr->cld_end, + descr->cld_mode, 0); + if (rc != 0) + CERROR("Pages for lockless lock %p were not purged(%d)\n", + ols, rc); + + osc_lock_wake_waiters(env, osc, ols); +} + +/** + * Helper for osc_dlm_blocking_ast() handling discrepancies between cl_lock + * and ldlm_lock caches. + */ +static int mdc_dlm_canceling(const struct lu_env *env, + struct ldlm_lock *dlmlock) +{ + struct cl_object *obj = NULL; + int result = 0; + bool discard; + enum cl_lock_mode mode = CLM_READ; + + ENTRY; + + lock_res_and_lock(dlmlock); + if (!ldlm_is_granted(dlmlock)) { + dlmlock->l_ast_data = NULL; + unlock_res_and_lock(dlmlock); + RETURN(0); + } + + discard = ldlm_is_discard_data(dlmlock); + if (dlmlock->l_granted_mode & (LCK_PW | LCK_GROUP)) + mode = CLM_WRITE; + + if (dlmlock->l_ast_data != NULL) { + obj = osc2cl(dlmlock->l_ast_data); + cl_object_get(obj); + } + unlock_res_and_lock(dlmlock); + + /* if l_ast_data is NULL, the dlmlock was enqueued by AGL or + * the object has been destroyed. */ + if (obj != NULL) { + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + + /* Destroy pages covered by the extent of the DLM lock */ + result = mdc_lock_flush(env, cl2osc(obj), cl_index(obj, 0), + CL_PAGE_EOF, mode, discard); + /* Losing a lock, set KMS to 0. + * NB: assumed that DOM lock covers whole data on MDT. + */ + /* losing a lock, update kms */ + lock_res_and_lock(dlmlock); + dlmlock->l_ast_data = NULL; + cl_object_attr_lock(obj); + attr->cat_kms = 0; + cl_object_attr_update(env, obj, attr, CAT_KMS); + cl_object_attr_unlock(obj); + unlock_res_and_lock(dlmlock); + cl_object_put(env, obj); + } + RETURN(result); +} + +int mdc_ldlm_blocking_ast(struct ldlm_lock *dlmlock, + struct ldlm_lock_desc *new, void *data, int reason) +{ + int rc = 0; + + ENTRY; + + switch (reason) { + case LDLM_CB_BLOCKING: { + struct lustre_handle lockh; + + ldlm_lock2handle(dlmlock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); + if (rc == -ENODATA) + rc = 0; + break; + } + case LDLM_CB_CANCELING: { + struct lu_env *env; + __u16 refcheck; + + /* + * This can be called in the context of outer IO, e.g., + * + * osc_enqueue_base()->... + * ->ldlm_prep_elc_req()->... + * ->ldlm_cancel_callback()->... 
+ * ->osc_ldlm_blocking_ast() + * + * new environment has to be created to not corrupt outer + * context. + */ + env = cl_env_get(&refcheck); + if (IS_ERR(env)) { + rc = PTR_ERR(env); + break; + } + + rc = mdc_dlm_canceling(env, dlmlock); + cl_env_put(env, &refcheck); + break; + } + default: + LBUG(); + } + RETURN(rc); +} + +/** + * Updates object attributes from a lock value block (lvb) received together + * with the DLM lock reply from the server. + * This can be optimized to not update attributes when lock is a result of a + * local match. + * + * Called under lock and resource spin-locks. + */ +void mdc_lock_lvb_update(const struct lu_env *env, struct osc_object *osc, + struct ldlm_lock *dlmlock, struct ost_lvb *lvb) +{ + struct cl_object *obj = osc2cl(osc); + struct lov_oinfo *oinfo = osc->oo_oinfo; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + unsigned valid = CAT_BLOCKS | CAT_ATIME | CAT_CTIME | CAT_MTIME | + CAT_SIZE; + unsigned int setkms = 0; + + ENTRY; + + if (lvb == NULL) { + LASSERT(dlmlock != NULL); + lvb = &dlmlock->l_ost_lvb; + } + cl_lvb2attr(attr, lvb); + + cl_object_attr_lock(obj); + if (dlmlock != NULL) { + __u64 size; + + check_res_locked(dlmlock->l_resource); + size = lvb->lvb_size; + + if (size >= oinfo->loi_kms) { + valid |= CAT_KMS; + attr->cat_kms = size; + setkms = 1; + } + ldlm_lock_allow_match_locked(dlmlock); + } + + /* The size should not be less than the kms */ + if (attr->cat_size < oinfo->loi_kms) + attr->cat_size = oinfo->loi_kms; + + LDLM_DEBUG(dlmlock, "acquired size %llu, setting rss=%llu;%s " + "kms=%llu, end=%llu", lvb->lvb_size, attr->cat_size, + setkms ? "" : " leaving", + setkms ? attr->cat_kms : oinfo->loi_kms, + dlmlock ? dlmlock->l_policy_data.l_extent.end : -1ull); + + cl_object_attr_update(env, obj, attr, valid); + cl_object_attr_unlock(obj); + EXIT; +} + +static void mdc_lock_granted(const struct lu_env *env, struct osc_lock *oscl, + struct lustre_handle *lockh) +{ + struct osc_object *osc = cl2osc(oscl->ols_cl.cls_obj); + struct ldlm_lock *dlmlock; + + ENTRY; + + dlmlock = ldlm_handle2lock_long(lockh, 0); + LASSERT(dlmlock != NULL); + + /* lock reference taken by ldlm_handle2lock_long() is + * owned by osc_lock and released in osc_lock_detach() + */ + lu_ref_add_atomic(&dlmlock->l_reference, "osc_lock", oscl); + oscl->ols_has_ref = 1; + + LASSERT(oscl->ols_dlmlock == NULL); + oscl->ols_dlmlock = dlmlock; + + /* This may be a matched lock for glimpse request, do not hold + * lock reference in that case. */ + if (!oscl->ols_glimpse) { + /* hold a refc for non glimpse lock which will + * be released in osc_lock_cancel() */ + lustre_handle_copy(&oscl->ols_handle, lockh); + ldlm_lock_addref(lockh, oscl->ols_einfo.ei_mode); + oscl->ols_hold = 1; + } + + /* Lock must have been granted. */ + lock_res_and_lock(dlmlock); + if (ldlm_is_granted(dlmlock)) { + struct cl_lock_descr *descr = &oscl->ols_cl.cls_lock->cll_descr; + + /* extend the lock extent, otherwise it will have problem when + * we decide whether to grant a lockless lock. 
*/ + descr->cld_mode = osc_ldlm2cl_lock(dlmlock->l_granted_mode); + descr->cld_start = cl_index(descr->cld_obj, 0); + descr->cld_end = CL_PAGE_EOF; + + /* no lvb update for matched lock */ + if (!ldlm_is_lvb_cached(dlmlock)) { + LASSERT(oscl->ols_flags & LDLM_FL_LVB_READY); + LASSERT(osc == dlmlock->l_ast_data); + mdc_lock_lvb_update(env, osc, dlmlock, NULL); + ldlm_set_lvb_cached(dlmlock); + } + } + unlock_res_and_lock(dlmlock); + + LASSERT(oscl->ols_state != OLS_GRANTED); + oscl->ols_state = OLS_GRANTED; + EXIT; +} + +/** + * Lock upcall function that is executed either when a reply to ENQUEUE rpc is + * received from a server, or after mdc_enqueue_send() matched a local DLM + * lock. + */ +static int mdc_lock_upcall(void *cookie, struct lustre_handle *lockh, + int errcode) +{ + struct osc_lock *oscl = cookie; + struct cl_lock_slice *slice = &oscl->ols_cl; + struct lu_env *env; + int rc; + + ENTRY; + + env = cl_env_percpu_get(); + /* should never happen, similar to osc_ldlm_blocking_ast(). */ + LASSERT(!IS_ERR(env)); + + rc = ldlm_error2errno(errcode); + if (oscl->ols_state == OLS_ENQUEUED) { + oscl->ols_state = OLS_UPCALL_RECEIVED; + } else if (oscl->ols_state == OLS_CANCELLED) { + rc = -EIO; + } else { + CERROR("Impossible state: %d\n", oscl->ols_state); + LBUG(); + } + + CDEBUG(D_INODE, "rc %d, err %d\n", rc, errcode); + if (rc == 0) + mdc_lock_granted(env, oscl, lockh); + + /* Error handling, some errors are tolerable. */ + if (oscl->ols_glimpse && rc == -ENAVAIL) { + LASSERT(oscl->ols_flags & LDLM_FL_LVB_READY); + mdc_lock_lvb_update(env, cl2osc(slice->cls_obj), + NULL, &oscl->ols_lvb); + /* Hide the error. */ + rc = 0; + } + + if (oscl->ols_owner != NULL) + cl_sync_io_note(env, oscl->ols_owner, rc); + cl_env_percpu_put(env); + + RETURN(rc); +} + +/* This is needed only for old servers (before 2.14) support */ +int mdc_fill_lvb(struct req_capsule *pill, struct ost_lvb *lvb) +{ + struct mdt_body *body; + + /* get LVB data from mdt_body otherwise */ + body = req_capsule_server_get(pill, &RMF_MDT_BODY); + if (!body) + RETURN(-EPROTO); + + if (!(body->mbo_valid & OBD_MD_DOM_SIZE)) + RETURN(-EPROTO); + + mdc_body2lvb(body, lvb); + RETURN(0); +} + +int mdc_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req, + osc_enqueue_upcall_f upcall, void *cookie, + struct lustre_handle *lockh, enum ldlm_mode mode, + __u64 *flags, int errcode) +{ + struct osc_lock *ols = cookie; + bool glimpse = *flags & LDLM_FL_HAS_INTENT; + int rc = 0; + + ENTRY; + + /* needed only for glimpse from an old server (< 2.14) */ + if (glimpse && !exp_connect_dom_lvb(exp)) + rc = mdc_fill_lvb(&req->rq_pill, &ols->ols_lvb); + + if (glimpse && errcode == ELDLM_LOCK_ABORTED) { + struct ldlm_reply *rep; + + rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + if (likely(rep)) { + rep->lock_policy_res2 = + ptlrpc_status_ntoh(rep->lock_policy_res2); + if (rep->lock_policy_res2) + errcode = rep->lock_policy_res2; + } else { + rc = -EPROTO; + } + *flags |= LDLM_FL_LVB_READY; + } else if (errcode == ELDLM_OK) { + struct ldlm_lock *lock; + + /* Callers have references, should be valid always */ + lock = ldlm_handle2lock(lockh); + + /* At this point ols_lvb must be filled with correct LVB either + * by mdc_fill_lvb() above or by ldlm_cli_enqueue_fini(). + * DoM uses l_ost_lvb to store LVB data, so copy it here from + * just updated ols_lvb. 
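+	 * The copy is performed under the resource lock so that concurrent
+	 * users of the cached l_ost_lvb never observe a partial update.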
+ */
+		lock_res_and_lock(lock);
+		memcpy(&lock->l_ost_lvb, &ols->ols_lvb,
+		       sizeof(lock->l_ost_lvb));
+		unlock_res_and_lock(lock);
+		LDLM_LOCK_PUT(lock);
+		*flags |= LDLM_FL_LVB_READY;
+	}
+
+	/* Call the update callback. */
+	rc = (*upcall)(cookie, lockh, rc < 0 ? rc : errcode);
+
+	/* release the reference taken in ldlm_cli_enqueue() */
+	if (errcode == ELDLM_LOCK_MATCHED)
+		errcode = ELDLM_OK;
+	if (errcode == ELDLM_OK && lustre_handle_is_used(lockh))
+		ldlm_lock_decref(lockh, mode);
+
+	RETURN(rc);
+}
+
+int mdc_enqueue_interpret(const struct lu_env *env, struct ptlrpc_request *req,
+			  void *args, int rc)
+{
+	struct osc_enqueue_args *aa = args;
+	struct ldlm_lock *lock;
+	struct lustre_handle *lockh = &aa->oa_lockh;
+	enum ldlm_mode mode = aa->oa_mode;
+	struct ldlm_enqueue_info einfo = {
+		.ei_type = aa->oa_type,
+		.ei_mode = mode,
+	};
+
+	ENTRY;
+
+	LASSERT(!aa->oa_speculative);
+
+	/* ldlm_cli_enqueue is holding a reference on the lock, so it must
+	 * be valid. */
+	lock = ldlm_handle2lock(lockh);
+	LASSERTF(lock != NULL,
+		 "lockh %#llx, req %p, aa %p - client evicted?\n",
+		 lockh->cookie, req, aa);
+
+	/* Take an additional reference so that a blocking AST that
+	 * ldlm_cli_enqueue_fini() might post for a failed lock is guaranteed
+	 * to arrive after an upcall has been executed by
+	 * mdc_enqueue_fini().
+	 */
+	ldlm_lock_addref(lockh, mode);
+
+	/* Let cl_lock_state_wait fail with -ERESTARTSYS to unuse sublocks. */
+	OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_ENQUEUE_HANG, 2);
+
+	/* Let the CP AST grant the lock first. */
+	OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 1);
+
+	/* Complete the lock-obtaining procedure. */
+	rc = ldlm_cli_enqueue_fini(aa->oa_exp, req, &einfo, 1, aa->oa_flags,
+				   aa->oa_lvb, aa->oa_lvb ?
+				   sizeof(*aa->oa_lvb) : 0, lockh, rc, true);
+	/* Complete the mdc-level processing. */
+	rc = mdc_enqueue_fini(aa->oa_exp, req, aa->oa_upcall, aa->oa_cookie,
+			      lockh, mode, aa->oa_flags, rc);
+
+	OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10);
+
+	ldlm_lock_decref(lockh, mode);
+	LDLM_LOCK_PUT(lock);
+	RETURN(rc);
+}
+
+/* When enqueuing asynchronously, locks are not ordered, so we can obtain a
+ * lock from the 2nd OSC before a lock from the 1st one. This does not deadlock
+ * with other synchronous requests; however, keeping some locks while trying to
+ * obtain others may take a considerable amount of time in case of an OST
+ * failure, and a client that does not release locks needed by other sync
+ * requests is excluded from the cluster -- such scenarios make life
+ * difficult, so release locks just after they are obtained.
*/ +int mdc_enqueue_send(const struct lu_env *env, struct obd_export *exp, + struct ldlm_res_id *res_id, __u64 *flags, + union ldlm_policy_data *policy, struct ost_lvb *lvb, + osc_enqueue_upcall_f upcall, void *cookie, + struct ldlm_enqueue_info *einfo, int async) +{ + struct obd_device *obd = exp->exp_obd; + struct lustre_handle lockh = { 0 }; + struct ptlrpc_request *req = NULL; + struct ldlm_intent *lit; + enum ldlm_mode mode; + bool glimpse = *flags & LDLM_FL_HAS_INTENT; + __u64 match_flags = *flags; + LIST_HEAD(cancels); + int rc, count; + int lvb_size; + bool compat_glimpse = glimpse && !exp_connect_dom_lvb(exp); + + ENTRY; + + mode = einfo->ei_mode; + if (einfo->ei_mode == LCK_PR) + mode |= LCK_PW; + + match_flags |= LDLM_FL_LVB_READY; + if (glimpse) + match_flags |= LDLM_FL_BLOCK_GRANTED; + mode = ldlm_lock_match(obd->obd_namespace, match_flags, res_id, + einfo->ei_type, policy, mode, &lockh); + if (mode) { + struct ldlm_lock *matched; + + if (*flags & LDLM_FL_TEST_LOCK) + RETURN(ELDLM_OK); + + matched = ldlm_handle2lock(&lockh); + + if (OBD_FAIL_CHECK(OBD_FAIL_MDC_GLIMPSE_DDOS)) + ldlm_set_kms_ignore(matched); + + if (mdc_set_dom_lock_data(matched, einfo->ei_cbdata)) { + *flags |= LDLM_FL_LVB_READY; + + /* We already have a lock, and it's referenced. */ + (*upcall)(cookie, &lockh, ELDLM_LOCK_MATCHED); + + ldlm_lock_decref(&lockh, mode); + LDLM_LOCK_PUT(matched); + RETURN(ELDLM_OK); + } + ldlm_lock_decref(&lockh, mode); + LDLM_LOCK_PUT(matched); + } + + if (*flags & (LDLM_FL_TEST_LOCK | LDLM_FL_MATCH_LOCK)) + RETURN(-ENOLCK); + + /* Glimpse is intent on old server */ + req = ptlrpc_request_alloc(class_exp2cliimp(exp), compat_glimpse ? + &RQF_LDLM_INTENT : &RQF_LDLM_ENQUEUE); + if (req == NULL) + RETURN(-ENOMEM); + + /* For WRITE lock cancel other locks on resource early if any */ + if (einfo->ei_mode & LCK_PW) + count = mdc_resource_get_unused_res(exp, res_id, &cancels, + einfo->ei_mode, + MDS_INODELOCK_DOM); + else + count = 0; + + rc = ldlm_prep_enqueue_req(exp, req, &cancels, count); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + + if (compat_glimpse) { + /* pack the glimpse intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = IT_GLIMPSE; + } + + /* users of mdc_enqueue() can pass this flag for ldlm_lock_match() */ + *flags &= ~LDLM_FL_BLOCK_GRANTED; + + if (compat_glimpse) { + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, 0); + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, 0); + lvb_size = 0; + } else { + lvb_size = sizeof(*lvb); + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, + lvb_size); + } + ptlrpc_request_set_replen(req); + + rc = ldlm_cli_enqueue(exp, &req, einfo, res_id, policy, flags, lvb, + lvb_size, LVB_T_OST, &lockh, async); + if (async) { + if (!rc) { + struct osc_enqueue_args *aa; + + aa = ptlrpc_req_async_args(aa, req); + aa->oa_exp = exp; + aa->oa_mode = einfo->ei_mode; + aa->oa_type = einfo->ei_type; + lustre_handle_copy(&aa->oa_lockh, &lockh); + aa->oa_upcall = upcall; + aa->oa_cookie = cookie; + aa->oa_speculative = false; + aa->oa_flags = flags; + aa->oa_lvb = compat_glimpse ? NULL : lvb; + + req->rq_interpret_reply = mdc_enqueue_interpret; + ptlrpcd_add_req(req); + } else { + ptlrpc_req_finished(req); + } + RETURN(rc); + } + + rc = mdc_enqueue_fini(exp, req, upcall, cookie, &lockh, einfo->ei_mode, + flags, rc); + ptlrpc_req_finished(req); + RETURN(rc); +} + +/** + * Implementation of cl_lock_operations::clo_enqueue() method for osc + * layer. 
This initiates ldlm enqueue: + * + * - cancels conflicting locks early (osc_lock_enqueue_wait()); + * + * - calls osc_enqueue_base() to do actual enqueue. + * + * osc_enqueue_base() is supplied with an upcall function that is executed + * when lock is received either after a local cached ldlm lock is matched, or + * when a reply from the server is received. + * + * This function does not wait for the network communication to complete. + */ +static int mdc_lock_enqueue(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *unused, struct cl_sync_io *anchor) +{ + struct osc_thread_info *info = osc_env_info(env); + struct osc_io *oio = osc_env_io(env); + struct osc_object *osc = cl2osc(slice->cls_obj); + struct osc_lock *oscl = cl2osc_lock(slice); + struct cl_lock *lock = slice->cls_lock; + struct ldlm_res_id *resname = &info->oti_resname; + union ldlm_policy_data *policy = &info->oti_policy; + osc_enqueue_upcall_f upcall = mdc_lock_upcall; + void *cookie = (void *)oscl; + bool async = false; + int result; + + ENTRY; + + LASSERTF(ergo(oscl->ols_glimpse, lock->cll_descr.cld_mode <= CLM_READ), + "lock = %p, ols = %p\n", lock, oscl); + + if (oscl->ols_state == OLS_GRANTED) + RETURN(0); + + /* Lockahead is not supported on MDT yet */ + if (oscl->ols_flags & LDLM_FL_NO_EXPANSION) { + result = -EOPNOTSUPP; + RETURN(result); + } + + if (oscl->ols_flags & LDLM_FL_TEST_LOCK) + GOTO(enqueue_base, 0); + + if (oscl->ols_glimpse) { + LASSERT(equi(oscl->ols_speculative, anchor == NULL)); + async = true; + GOTO(enqueue_base, 0); + } + + result = osc_lock_enqueue_wait(env, osc, oscl); + if (result < 0) + GOTO(out, result); + + /* we can grant lockless lock right after all conflicting locks + * are canceled. */ + if (osc_lock_is_lockless(oscl)) { + oscl->ols_state = OLS_GRANTED; + oio->oi_lockless = 1; + RETURN(0); + } + +enqueue_base: + oscl->ols_state = OLS_ENQUEUED; + if (anchor != NULL) { + atomic_inc(&anchor->csi_sync_nr); + oscl->ols_owner = anchor; + } + + /** + * DLM lock's ast data must be osc_object; + * DLM's enqueue callback set to osc_lock_upcall() with cookie as + * osc_lock. 
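+	 * (in this MDC path the upcall is mdc_lock_upcall(), the DoM
+	 * counterpart of osc_lock_upcall()).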
+ */ + fid_build_reg_res_name(lu_object_fid(osc2lu(osc)), resname); + mdc_lock_build_policy(env, lock, policy); + LASSERT(!oscl->ols_speculative); + result = mdc_enqueue_send(env, osc_export(osc), resname, + &oscl->ols_flags, policy, &oscl->ols_lvb, + upcall, cookie, &oscl->ols_einfo, async); + if (result == 0) { + if (osc_lock_is_lockless(oscl)) { + oio->oi_lockless = 1; + } else if (!async) { + LASSERT(oscl->ols_state == OLS_GRANTED); + LASSERT(oscl->ols_hold); + LASSERT(oscl->ols_dlmlock != NULL); + } + } +out: + if (result < 0) { + oscl->ols_state = OLS_CANCELLED; + osc_lock_wake_waiters(env, osc, oscl); + + if (anchor != NULL) + cl_sync_io_note(env, anchor, result); + } + RETURN(result); +} + +static const struct cl_lock_operations mdc_lock_lockless_ops = { + .clo_fini = osc_lock_fini, + .clo_enqueue = mdc_lock_enqueue, + .clo_cancel = mdc_lock_lockless_cancel, + .clo_print = osc_lock_print +}; + +static const struct cl_lock_operations mdc_lock_ops = { + .clo_fini = osc_lock_fini, + .clo_enqueue = mdc_lock_enqueue, + .clo_cancel = osc_lock_cancel, + .clo_print = osc_lock_print, +}; + +int mdc_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + struct osc_lock *ols; + __u32 enqflags = lock->cll_descr.cld_enq_flags; + __u64 flags = osc_enq2ldlm_flags(enqflags); + + ENTRY; + + /* Ignore AGL for Data-on-MDT, stat returns size data */ + if ((enqflags & CEF_SPECULATIVE) != 0) + RETURN(0); + + OBD_SLAB_ALLOC_PTR_GFP(ols, osc_lock_kmem, GFP_NOFS); + if (unlikely(ols == NULL)) + RETURN(-ENOMEM); + + ols->ols_state = OLS_NEW; + spin_lock_init(&ols->ols_lock); + INIT_LIST_HEAD(&ols->ols_waiting_list); + INIT_LIST_HEAD(&ols->ols_wait_entry); + INIT_LIST_HEAD(&ols->ols_nextlock_oscobj); + ols->ols_lockless_ops = &mdc_lock_lockless_ops; + + ols->ols_flags = flags; + ols->ols_speculative = !!(enqflags & CEF_SPECULATIVE); + if (lock->cll_descr.cld_mode == CLM_GROUP) + ols->ols_flags |= LDLM_FL_ATOMIC_CB; + + if (ols->ols_flags & LDLM_FL_HAS_INTENT) { + ols->ols_flags |= LDLM_FL_BLOCK_GRANTED; + ols->ols_glimpse = 1; + } + mdc_lock_build_einfo(env, lock, cl2osc(obj), &ols->ols_einfo); + + cl_lock_slice_add(lock, &ols->ols_cl, obj, &mdc_lock_ops); + + if (!(enqflags & CEF_MUST)) + osc_lock_to_lockless(env, ols, (enqflags & CEF_NEVER)); + + if (io->ci_type == CIT_WRITE || cl_io_is_mkwrite(io)) + osc_lock_set_writer(env, io, obj, ols); + + LDLM_DEBUG_NOLOCK("lock %p, mdc lock %p, flags %llx\n", + lock, ols, ols->ols_flags); + RETURN(0); +} + +/** + * IO operations. + * + * An implementation of cl_io_operations specific methods for MDC layer. 
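+ * Most of the methods are reused from the OSC layer unchanged; only
+ * setattr, fsync, read-ahead and data_version handling need the
+ * MDC-specific variants defined below (see mdc_io_ops).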
+ * + */ +static int mdc_async_upcall(void *a, int rc) +{ + struct osc_async_cbargs *args = a; + + args->opc_rc = rc; + complete(&args->opc_sync); + return 0; +} + +static int mdc_get_lock_handle(const struct lu_env *env, struct osc_object *osc, + pgoff_t index, struct lustre_handle *lh) +{ + struct ldlm_lock *lock; + + /* find DOM lock protecting object */ + lock = mdc_dlmlock_at_pgoff(env, osc, index, + OSC_DAP_FL_TEST_LOCK | + OSC_DAP_FL_CANCELING); + if (lock == NULL) { + struct ldlm_resource *res; + struct ldlm_res_id *resname; + + resname = &osc_env_info(env)->oti_resname; + fid_build_reg_res_name(lu_object_fid(osc2lu(osc)), resname); + res = ldlm_resource_get(osc_export(osc)->exp_obd->obd_namespace, + NULL, resname, LDLM_IBITS, 0); + if (IS_ERR(res)) + CERROR("No lock resource for "DFID"\n", + PFID(lu_object_fid(osc2lu(osc)))); + else + ldlm_resource_dump(D_ERROR, res); + libcfs_debug_dumpstack(NULL); + return -ENOENT; + } else { + *lh = lock->l_remote_handle; + LDLM_LOCK_PUT(lock); + } + return 0; +} + +static int mdc_io_setattr_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct osc_io *oio = cl2osc_io(env, slice); + struct cl_object *obj = slice->cis_obj; + struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + struct obdo *oa = &oio->oi_oa; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + __u64 size = io->u.ci_setattr.sa_attr.lvb_size; + unsigned int ia_avalid = io->u.ci_setattr.sa_avalid; + enum op_xvalid ia_xvalid = io->u.ci_setattr.sa_xvalid; + int rc; + + /* silently ignore non-truncate setattr for Data-on-MDT object */ + if (cl_io_is_trunc(io)) { + /* truncate cache dirty pages first */ + rc = osc_cache_truncate_start(env, cl2osc(obj), size, + &oio->oi_trunc); + if (rc < 0) + return rc; + } else if (cl_io_is_fallocate(io) && + io->u.ci_setattr.sa_falloc_mode & FALLOC_FL_PUNCH_HOLE) { + rc = osc_punch_start(env, io, obj); + if (rc < 0) + return rc; + } + + if (oio->oi_lockless == 0) { + cl_object_attr_lock(obj); + rc = cl_object_attr_get(env, obj, attr); + if (rc == 0) { + struct ost_lvb *lvb = &io->u.ci_setattr.sa_attr; + unsigned int cl_valid = 0; + + if (ia_avalid & ATTR_SIZE) { + attr->cat_size = size; + attr->cat_kms = size; + cl_valid = (CAT_SIZE | CAT_KMS); + } + if (ia_avalid & ATTR_MTIME_SET) { + attr->cat_mtime = lvb->lvb_mtime; + cl_valid |= CAT_MTIME; + } + if (ia_avalid & ATTR_ATIME_SET) { + attr->cat_atime = lvb->lvb_atime; + cl_valid |= CAT_ATIME; + } + if (ia_xvalid & OP_XVALID_CTIME_SET) { + attr->cat_ctime = lvb->lvb_ctime; + cl_valid |= CAT_CTIME; + } + rc = cl_object_attr_update(env, obj, attr, cl_valid); + } + cl_object_attr_unlock(obj); + if (rc < 0) + return rc; + } + + if (!(ia_avalid & ATTR_SIZE) && !cl_io_is_fallocate(io)) + return 0; + + memset(oa, 0, sizeof(*oa)); + oa->o_oi = loi->loi_oi; + oa->o_mtime = attr->cat_mtime; + oa->o_atime = attr->cat_atime; + oa->o_ctime = attr->cat_ctime; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLATIME | + OBD_MD_FLCTIME | OBD_MD_FLMTIME | OBD_MD_FLSIZE | + OBD_MD_FLBLOCKS; + + if (oio->oi_lockless) { + oa->o_flags = OBD_FL_SRVLOCK; + oa->o_valid |= OBD_MD_FLFLAGS; + } else { + rc = mdc_get_lock_handle(env, cl2osc(obj), CL_PAGE_EOF, + &oa->o_handle); + if (!rc) + oa->o_valid |= OBD_MD_FLHANDLE; + } + + init_completion(&cbargs->opc_sync); + if (cl_io_is_fallocate(io)) { + int falloc_mode = io->u.ci_setattr.sa_falloc_mode; + + oa->o_size = io->u.ci_setattr.sa_falloc_offset; + oa->o_blocks 
= io->u.ci_setattr.sa_falloc_end;
+		rc = osc_fallocate_base(osc_export(cl2osc(obj)), oa,
+					mdc_async_upcall, cbargs, falloc_mode);
+	} else {
+		oa->o_size = size;
+		oa->o_blocks = OBD_OBJECT_EOF;
+		rc = osc_punch_send(osc_export(cl2osc(obj)), oa,
+				    mdc_async_upcall, cbargs);
+	}
+	cbargs->opc_rpc_sent = rc == 0;
+	return rc;
+}
+
+static int mdc_io_read_ahead(const struct lu_env *env,
+			     const struct cl_io_slice *ios,
+			     pgoff_t start, struct cl_read_ahead *ra)
+{
+	struct osc_object *osc = cl2osc(ios->cis_obj);
+	struct osc_io *oio = cl2osc_io(env, ios);
+	struct ldlm_lock *dlmlock;
+
+	ENTRY;
+
+	dlmlock = mdc_dlmlock_at_pgoff(env, osc, start, 0);
+	if (dlmlock == NULL)
+		RETURN(-ENODATA);
+
+	oio->oi_is_readahead = 1;
+	if (dlmlock->l_req_mode != LCK_PR) {
+		struct lustre_handle lockh;
+
+		ldlm_lock2handle(dlmlock, &lockh);
+		ldlm_lock_addref(&lockh, LCK_PR);
+		ldlm_lock_decref(&lockh, dlmlock->l_req_mode);
+	}
+
+	ra->cra_rpc_pages = osc_cli(osc)->cl_max_pages_per_rpc;
+	ra->cra_end_idx = CL_PAGE_EOF;
+	ra->cra_release = osc_read_ahead_release;
+	ra->cra_dlmlock = dlmlock;
+	ra->cra_oio = oio;
+
+	RETURN(0);
+}
+
+int mdc_io_fsync_start(const struct lu_env *env,
+		       const struct cl_io_slice *slice)
+{
+	struct cl_io *io = slice->cis_io;
+	struct cl_fsync_io *fio = &io->u.ci_fsync;
+	struct cl_object *obj = slice->cis_obj;
+	struct osc_object *osc = cl2osc(obj);
+	int result = 0;
+
+	ENTRY;
+
+	/* An MDC lock always covers the whole object, so sync the whole
+	 * possible range regardless of the supplied start/end values.
+	 */
+	result = osc_cache_writeback_range(env, osc, 0, CL_PAGE_EOF, 0,
+					   fio->fi_mode == CL_FSYNC_DISCARD);
+	if (result > 0) {
+		fio->fi_nr_written += result;
+		result = 0;
+	}
+	if (fio->fi_mode == CL_FSYNC_ALL) {
+		int rc;
+
+		rc = osc_cache_wait_range(env, osc, 0, CL_PAGE_EOF);
+		if (result == 0)
+			result = rc;
+		/* Use the OSC sync code because it is asynchronous. It is to
+		 * be added into MDC proper to avoid using OST_SYNC at both
+		 * MDC and MDT.
+ */ + rc = osc_fsync_ost(env, osc, fio); + if (result == 0) + result = rc; + } + + RETURN(result); +} + +struct mdc_data_version_args { + struct osc_io *dva_oio; +}; + +static int +mdc_data_version_interpret(const struct lu_env *env, struct ptlrpc_request *req, + void *args, int rc) +{ + struct mdc_data_version_args *dva = args; + struct osc_io *oio = dva->dva_oio; + const struct mdt_body *body; + + ENTRY; + if (rc < 0) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + /* Prepare OBDO from mdt_body for CLIO */ + oio->oi_oa.o_valid = body->mbo_valid; + oio->oi_oa.o_flags = body->mbo_flags; + oio->oi_oa.o_data_version = body->mbo_version; + oio->oi_oa.o_layout_version = body->mbo_layout_gen; + EXIT; +out: + oio->oi_cbarg.opc_rc = rc; + complete(&oio->oi_cbarg.opc_sync); + return 0; +} + +static int mdc_io_data_version_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_data_version_io *dv = &slice->cis_io->u.ci_data_version; + struct osc_io *oio = cl2osc_io(env, slice); + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + struct osc_object *obj = cl2osc(slice->cis_obj); + struct obd_export *exp = osc_export(obj); + struct ptlrpc_request *req; + struct mdt_body *body; + struct mdc_data_version_args *dva; + int rc; + + ENTRY; + + memset(&oio->oi_oa, 0, sizeof(oio->oi_oa)); + oio->oi_oa.o_oi.oi_fid = *lu_object_fid(osc2lu(obj)); + oio->oi_oa.o_valid = OBD_MD_FLID; + + init_completion(&cbargs->opc_sync); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_GETATTR); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + + body = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY); + body->mbo_fid1 = *lu_object_fid(osc2lu(obj)); + body->mbo_valid = OBD_MD_FLID; + /* Indicate that data version is needed */ + body->mbo_valid |= OBD_MD_FLDATAVERSION; + body->mbo_flags = 0; + + if (dv->dv_flags & (LL_DV_RD_FLUSH | LL_DV_WR_FLUSH)) { + body->mbo_valid |= OBD_MD_FLFLAGS; + body->mbo_flags |= OBD_FL_SRVLOCK; + if (dv->dv_flags & LL_DV_WR_FLUSH) + body->mbo_flags |= OBD_FL_FLUSH; + } + + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, 0); + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, 0); + req_capsule_set_size(&req->rq_pill, &RMF_FILE_ENCCTX, RCL_SERVER, 0); + ptlrpc_request_set_replen(req); + + req->rq_interpret_reply = mdc_data_version_interpret; + dva = ptlrpc_req_async_args(dva, req); + dva->dva_oio = oio; + + ptlrpcd_add_req(req); + + RETURN(0); +} + +static void mdc_io_data_version_end(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_data_version_io *dv = &slice->cis_io->u.ci_data_version; + struct osc_io *oio = cl2osc_io(env, slice); + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + + ENTRY; + wait_for_completion(&cbargs->opc_sync); + + if (cbargs->opc_rc != 0) { + slice->cis_io->ci_result = cbargs->opc_rc; + } else { + slice->cis_io->ci_result = 0; + if (!(oio->oi_oa.o_valid & + (OBD_MD_LAYOUT_VERSION | OBD_MD_FLDATAVERSION))) + slice->cis_io->ci_result = -ENOTSUPP; + + if (oio->oi_oa.o_valid & OBD_MD_LAYOUT_VERSION) + dv->dv_layout_version = oio->oi_oa.o_layout_version; + if (oio->oi_oa.o_valid & OBD_MD_FLDATAVERSION) + dv->dv_data_version = oio->oi_oa.o_data_version; + } + + EXIT; +} + +static const struct cl_io_operations mdc_io_ops = { + .op = { + [CIT_READ] = { + .cio_iter_init = osc_io_iter_init, 
+ .cio_iter_fini = osc_io_rw_iter_fini, + .cio_start = osc_io_read_start, + }, + [CIT_WRITE] = { + .cio_iter_init = osc_io_iter_init, + .cio_iter_fini = osc_io_rw_iter_fini, + .cio_start = osc_io_write_start, + .cio_end = osc_io_end, + }, + [CIT_SETATTR] = { + .cio_iter_init = osc_io_iter_init, + .cio_iter_fini = osc_io_iter_fini, + .cio_start = mdc_io_setattr_start, + .cio_end = osc_io_setattr_end, + }, + [CIT_DATA_VERSION] = { + .cio_start = mdc_io_data_version_start, + .cio_end = mdc_io_data_version_end, + }, + [CIT_FAULT] = { + .cio_iter_init = osc_io_iter_init, + .cio_iter_fini = osc_io_iter_fini, + .cio_start = osc_io_fault_start, + .cio_end = osc_io_end, + }, + [CIT_FSYNC] = { + .cio_start = mdc_io_fsync_start, + .cio_end = osc_io_fsync_end, + }, + [CIT_LSEEK] = { + .cio_start = osc_io_lseek_start, + .cio_end = osc_io_lseek_end, + }, + }, + .cio_read_ahead = mdc_io_read_ahead, + .cio_lru_reserve = osc_io_lru_reserve, + .cio_submit = osc_io_submit, + .cio_commit_async = osc_io_commit_async, + .cio_extent_release = osc_io_extent_release, +}; + +int mdc_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct osc_io *oio = osc_env_io(env); + + CL_IO_SLICE_CLEAN(oio, oi_cl); + cl_io_slice_add(io, &oio->oi_cl, obj, &mdc_io_ops); + return 0; +} + +static void mdc_build_res_name(struct osc_object *osc, + struct ldlm_res_id *resname) +{ + fid_build_reg_res_name(lu_object_fid(osc2lu(osc)), resname); +} + +/** + * Implementation of struct cl_req_operations::cro_attr_set() for MDC + * layer. MDC is responsible for struct obdo::o_id and struct obdo::o_seq + * fields. + */ +static void mdc_req_attr_set(const struct lu_env *env, struct cl_object *obj, + struct cl_req_attr *attr) +{ + u64 flags = attr->cra_flags; + + /* Copy object FID to cl_attr */ + attr->cra_oa->o_oi.oi_fid = *lu_object_fid(&obj->co_lu); + + if (flags & OBD_MD_FLGROUP) + attr->cra_oa->o_valid |= OBD_MD_FLGROUP; + + if (flags & OBD_MD_FLID) + attr->cra_oa->o_valid |= OBD_MD_FLID; + + if (flags & OBD_MD_FLHANDLE) { + struct osc_page *opg; + + opg = osc_cl_page_osc(attr->cra_page, cl2osc(obj)); + if (!opg->ops_srvlock) { + int rc; + + rc = mdc_get_lock_handle(env, cl2osc(obj), + osc_index(opg), + &attr->cra_oa->o_handle); + if (rc) { + CL_PAGE_DEBUG(D_ERROR, env, attr->cra_page, + "uncovered page!\n"); + LBUG(); + } else { + attr->cra_oa->o_valid |= OBD_MD_FLHANDLE; + } + } + } +} + +static int mdc_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; + + if (OST_LVB_IS_ERR(oinfo->loi_lvb.lvb_blocks)) + return OST_LVB_GET_ERR(oinfo->loi_lvb.lvb_blocks); + + return osc_attr_get(env, obj, attr); +} + +static int mdc_object_ast_clear(struct ldlm_lock *lock, void *data) +{ + struct osc_object *osc = (struct osc_object *)data; + struct ost_lvb *lvb = &lock->l_ost_lvb; + struct lov_oinfo *oinfo; + ENTRY; + + if (lock->l_ast_data == data) { + lock->l_ast_data = NULL; + + LASSERT(osc != NULL); + LASSERT(osc->oo_oinfo != NULL); + LASSERT(lvb != NULL); + + /* Updates lvb in lock by the cached oinfo */ + oinfo = osc->oo_oinfo; + + LDLM_DEBUG(lock, "update lock size %llu blocks %llu [cma]time: " + "%llu %llu %llu by oinfo size %llu blocks %llu " + "[cma]time %llu %llu %llu", lvb->lvb_size, + lvb->lvb_blocks, lvb->lvb_ctime, lvb->lvb_mtime, + lvb->lvb_atime, oinfo->loi_lvb.lvb_size, + oinfo->loi_lvb.lvb_blocks, oinfo->loi_lvb.lvb_ctime, + oinfo->loi_lvb.lvb_mtime, oinfo->loi_lvb.lvb_atime); + LASSERT(oinfo->loi_lvb.lvb_size >= 
oinfo->loi_kms); + + cl_object_attr_lock(&osc->oo_cl); + memcpy(lvb, &oinfo->loi_lvb, sizeof(oinfo->loi_lvb)); + cl_object_attr_unlock(&osc->oo_cl); + ldlm_clear_lvb_cached(lock); + } + RETURN(LDLM_ITER_CONTINUE); +} + +int mdc_object_prune(const struct lu_env *env, struct cl_object *obj) +{ + struct osc_object *osc = cl2osc(obj); + struct ldlm_res_id *resname = &osc_env_info(env)->oti_resname; + + /* DLM locks don't hold a reference of osc_object so we have to + * clear it before the object is being destroyed. */ + osc_build_res_name(osc, resname); + ldlm_resource_iterate(osc_export(osc)->exp_obd->obd_namespace, resname, + mdc_object_ast_clear, osc); + return 0; +} + +static int mdc_object_flush(const struct lu_env *env, struct cl_object *obj, + struct ldlm_lock *lock) +{ + /* if lock cancel is initiated from llite then it is combined + * lock with DOM bit and it may have no l_ast_data initialized yet, + * so init it here with given osc_object. + */ + mdc_set_dom_lock_data(lock, cl2osc(obj)); + RETURN(mdc_dlm_canceling(env, lock)); +} + +static const struct cl_object_operations mdc_ops = { + .coo_page_init = osc_page_init, + .coo_lock_init = mdc_lock_init, + .coo_io_init = mdc_io_init, + .coo_attr_get = mdc_attr_get, + .coo_attr_update = osc_attr_update, + .coo_glimpse = osc_object_glimpse, + .coo_req_attr_set = mdc_req_attr_set, + .coo_prune = mdc_object_prune, + .coo_object_flush = mdc_object_flush +}; + +static const struct osc_object_operations mdc_object_ops = { + .oto_build_res_name = mdc_build_res_name, + .oto_dlmlock_at_pgoff = mdc_dlmlock_at_pgoff, +}; + +static int mdc_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct osc_object *osc = lu2osc(obj); + + if (osc->oo_initialized) + return 0; + + osc->oo_initialized = true; + + return osc_object_init(env, obj, conf); +} + +static void mdc_object_free(const struct lu_env *env, struct lu_object *obj) +{ + osc_object_free(env, obj); +} + +static const struct lu_object_operations mdc_lu_obj_ops = { + .loo_object_init = mdc_object_init, + .loo_object_delete = NULL, + .loo_object_release = NULL, + .loo_object_free = mdc_object_free, + .loo_object_print = osc_object_print, + .loo_object_invariant = NULL +}; + +struct lu_object *mdc_object_alloc(const struct lu_env *env, + const struct lu_object_header *unused, + struct lu_device *dev) +{ + struct osc_object *osc; + struct lu_object *obj; + + OBD_SLAB_ALLOC_PTR_GFP(osc, osc_object_kmem, GFP_NOFS); + if (osc != NULL) { + obj = osc2lu(osc); + lu_object_init(obj, NULL, dev); + osc->oo_cl.co_ops = &mdc_ops; + obj->lo_ops = &mdc_lu_obj_ops; + osc->oo_obj_ops = &mdc_object_ops; + osc->oo_initialized = false; + } else { + obj = NULL; + } + return obj; +} + +static int mdc_process_config(const struct lu_env *env, struct lu_device *d, + struct lustre_cfg *cfg) +{ + size_t count = class_modify_config(cfg, PARAM_MDC, + &d->ld_obd->obd_kset.kobj); + return count > 0 ? 
0 : count; +} + +const struct lu_device_operations mdc_lu_ops = { + .ldo_object_alloc = mdc_object_alloc, + .ldo_process_config = mdc_process_config, + .ldo_recovery_complete = NULL, +}; + +static struct lu_device *mdc_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct lu_device *d; + struct osc_device *oc; + struct obd_device *obd; + int rc; + + OBD_ALLOC_PTR(oc); + if (oc == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + cl_device_init(&oc->osc_cl, t); + d = osc2lu_dev(oc); + d->ld_ops = &mdc_lu_ops; + + /* Setup MDC OBD */ + obd = class_name2obd(lustre_cfg_string(cfg, 0)); + if (obd == NULL) + RETURN(ERR_PTR(-ENODEV)); + + rc = mdc_setup(obd, cfg); + if (rc < 0) { + osc_device_free(env, d); + RETURN(ERR_PTR(rc)); + } + oc->osc_exp = obd->obd_self_export; + oc->osc_stats.os_init = ktime_get_real(); + RETURN(d); +} + +static const struct lu_device_type_operations mdc_device_type_ops = { + .ldto_device_alloc = mdc_device_alloc, + .ldto_device_free = osc_device_free, + .ldto_device_init = osc_device_init, + .ldto_device_fini = osc_device_fini +}; + +struct lu_device_type mdc_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_MDC_NAME, + .ldt_ops = &mdc_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; + +/** @} osc */ diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_internal.h b/drivers/staging/lustrefsx/lustre/mdc/mdc_internal.h new file mode 100644 index 0000000000000..20a81bf4d294a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_internal.h @@ -0,0 +1,198 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ */
+
+#ifndef _MDC_INTERNAL_H
+#define _MDC_INTERNAL_H
+
+#include <lustre_mdc.h>
+
+int mdc_tunables_init(struct obd_device *obd);
+
+void mdc_pack_body(struct req_capsule *pill, const struct lu_fid *fid,
+ u64 valid, size_t ea_size, u32 suppgid, u32 flags);
+void mdc_swap_layouts_pack(struct req_capsule *pill,
+ struct md_op_data *op_data);
+void mdc_readdir_pack(struct req_capsule *pill, __u64 pgoff, size_t size,
+ const struct lu_fid *fid);
+void mdc_getattr_pack(struct req_capsule *pill, __u64 valid, __u32 flags,
+ struct md_op_data *data, size_t ea_size);
+void mdc_setattr_pack(struct req_capsule *pill, struct md_op_data *op_data,
+ void *ea, size_t ealen);
+void mdc_create_pack(struct req_capsule *pill, struct md_op_data *op_data,
+ const void *data, size_t datalen, umode_t mode,
+ uid_t uid, gid_t gid, kernel_cap_t capability, u64 rdev);
+void mdc_open_pack(struct req_capsule *pill, struct md_op_data *op_data,
+ umode_t mode, __u64 rdev, __u64 flags,
+ const void *data, size_t datalen);
+void mdc_file_secctx_pack(struct req_capsule *pill,
+ const char *secctx_name,
+ const void *secctx, size_t secctx_size);
+void mdc_file_encctx_pack(struct req_capsule *pill,
+ const void *encctx, size_t encctx_size);
+void mdc_file_sepol_pack(struct req_capsule *pill);
+
+void mdc_unlink_pack(struct req_capsule *pill, struct md_op_data *op_data);
+void mdc_link_pack(struct req_capsule *pill, struct md_op_data *op_data);
+void mdc_rename_pack(struct req_capsule *pill, struct md_op_data *op_data,
+ const char *old, size_t oldlen,
+ const char *new, size_t newlen);
+void mdc_migrate_pack(struct req_capsule *pill, struct md_op_data *op_data,
+ const char *name, size_t namelen);
+void mdc_close_pack(struct req_capsule *pill, struct md_op_data *op_data);
+
+/* mdc/mdc_locks.c */
+int mdc_set_lock_data(struct obd_export *exp,
+ const struct lustre_handle *lockh,
+ void *data, __u64 *bits);
+
+int mdc_null_inode(struct obd_export *exp, const struct lu_fid *fid);
+
+int mdc_intent_lock(struct obd_export *exp,
+ struct md_op_data *op_data,
+ struct lookup_intent *it,
+ struct ptlrpc_request **reqp,
+ ldlm_blocking_callback cb_blocking,
+ __u64 extra_lock_flags);
+
+int mdc_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo,
+ const union ldlm_policy_data *policy,
+ struct md_op_data *op_data,
+ struct lustre_handle *lockh, __u64 extra_lock_flags);
+int mdc_resource_get_unused_res(struct obd_export *exp,
+ struct ldlm_res_id *res_id,
+ struct list_head *cancels,
+ enum ldlm_mode mode, __u64 bits);
+int mdc_resource_get_unused(struct obd_export *exp, const struct lu_fid *fid,
+ struct list_head *cancels, enum ldlm_mode mode,
+ __u64 bits);
+/* mdc/mdc_request.c */
+int mdc_fid_alloc(const struct lu_env *env, struct obd_export *exp,
+ struct lu_fid *fid, struct md_op_data *op_data);
+int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg);
+
+struct obd_client_handle;
+
+int mdc_set_open_replay_data(struct obd_export *exp,
+ struct obd_client_handle *och,
+ struct lookup_intent *it);
+
+void mdc_commit_open(struct ptlrpc_request *req);
+void mdc_replay_open(struct ptlrpc_request *req);
+
+int mdc_create(struct obd_export *exp, struct md_op_data *op_data,
+ const void *data, size_t datalen,
+ umode_t mode, uid_t uid, gid_t gid,
+ kernel_cap_t capability, __u64 rdev,
+ struct ptlrpc_request **request);
+int mdc_link(struct obd_export *exp, struct md_op_data *op_data,
+ struct ptlrpc_request **request);
+int mdc_rename(struct obd_export *exp, 
struct md_op_data *op_data, + const char *old, size_t oldlen, const char *new, size_t newlen, + struct ptlrpc_request **request); +int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data, + void *ea, size_t ealen, struct ptlrpc_request **request); +int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request); +int mdc_file_resync(struct obd_export *exp, struct md_op_data *data); +int mdc_cancel_unused(struct obd_export *exp, const struct lu_fid *fid, + union ldlm_policy_data *policy, enum ldlm_mode mode, + enum ldlm_cancel_flags flags, void *opaque); + +int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, + struct lu_fid *fid, __u64 *bits); + +int mdc_intent_getattr_async(struct obd_export *exp, + struct md_enqueue_info *minfo); + +enum ldlm_mode mdc_lock_match(struct obd_export *exp, __u64 flags, + const struct lu_fid *fid, enum ldlm_type type, + union ldlm_policy_data *policy, + enum ldlm_mode mode, struct lustre_handle *lockh); + + +#define MDC_CHANGELOG_DEV_COUNT LMV_MAX_STRIPE_COUNT +#define MDC_CHANGELOG_DEV_NAME "changelog" +extern struct class *mdc_changelog_class; +extern dev_t mdc_changelog_dev; +extern struct idr mdc_changelog_minor_idr; + +int mdc_changelog_cdev_init(struct obd_device *obd); + +void mdc_changelog_cdev_finish(struct obd_device *obd); + +static inline int mdc_prep_elc_req(struct obd_export *exp, + struct ptlrpc_request *req, int opc, + struct list_head *cancels, int count) +{ + return ldlm_prep_elc_req(exp, req, LUSTRE_MDS_VERSION, opc, 0, cancels, + count); +} + +#ifdef CONFIG_LUSTRE_FS_POSIX_ACL +int mdc_unpack_acl(struct req_capsule *pill, struct lustre_md *md); +#else +static inline +int mdc_unpack_acl(struct req_capsule *pill, struct lustre_md *md) +{ + return 0; +} +#endif + +static inline void mdc_body2lvb(struct mdt_body *body, struct ost_lvb *lvb) +{ + LASSERT(body->mbo_valid & OBD_MD_DOM_SIZE); + lvb->lvb_mtime = body->mbo_mtime; + lvb->lvb_atime = body->mbo_atime; + lvb->lvb_ctime = body->mbo_ctime; + lvb->lvb_blocks = body->mbo_dom_blocks; + lvb->lvb_size = body->mbo_dom_size; +} + +static inline unsigned long hash_x_index(__u64 hash, int hash64) +{ + if (BITS_PER_LONG == 32 && hash64) + hash >>= 32; + /* save hash 0 with hash 1 */ + return ~0UL - (hash + !hash); +} + +/* mdc_dev.c */ +extern struct lu_device_type mdc_device_type; +int mdc_ldlm_blocking_ast(struct ldlm_lock *dlmlock, + struct ldlm_lock_desc *new, void *data, int flag); +int mdc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data); +int mdc_fill_lvb(struct req_capsule *pill, struct ost_lvb *lvb); + +/* the minimum inline repsize should be PAGE_SIZE at least */ +#define MDC_DOM_DEF_INLINE_REPSIZE max(8192UL, PAGE_SIZE) +#define MDC_DOM_MAX_INLINE_REPSIZE XATTR_SIZE_MAX + +#endif diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_lib.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_lib.c new file mode 100644 index 0000000000000..5d571d3c76e4c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_lib.c @@ -0,0 +1,674 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ */
+
+#define DEBUG_SUBSYSTEM S_MDC
+#include <linux/user_namespace.h>
+#include <linux/uidgid.h>
+
+#include <lustre_net.h>
+#include <obd_class.h>
+#include <obd.h>
+#include <cl_object.h>
+#include "mdc_internal.h"
+
+static void set_mrc_cr_flags(struct mdt_rec_create *mrc, __u64 flags)
+{
+ mrc->cr_flags_l = (__u32)(flags & 0xFFFFFFFFUll);
+ mrc->cr_flags_h = (__u32)(flags >> 32);
+}
+
+static void __mdc_pack_body(struct mdt_body *b, __u32 suppgid)
+{
+ LASSERT(b);
+
+ b->mbo_suppgid = suppgid;
+ b->mbo_uid = from_kuid(&init_user_ns, current_uid());
+ b->mbo_gid = from_kgid(&init_user_ns, current_gid());
+ b->mbo_fsuid = from_kuid(&init_user_ns, current_fsuid());
+ b->mbo_fsgid = from_kgid(&init_user_ns, current_fsgid());
+ b->mbo_capability = current_cap().cap[0];
+}
+
+void mdc_swap_layouts_pack(struct req_capsule *pill,
+ struct md_op_data *op_data)
+{
+ struct mdt_body *b = req_capsule_client_get(pill, &RMF_MDT_BODY);
+
+ __mdc_pack_body(b, op_data->op_suppgids[0]);
+ b->mbo_fid1 = op_data->op_fid1;
+ b->mbo_fid2 = op_data->op_fid2;
+ b->mbo_valid |= OBD_MD_FLID;
+}
+
+void mdc_pack_body(struct req_capsule *pill, const struct lu_fid *fid,
+ u64 valid, size_t ea_size, u32 suppgid, u32 flags)
+{
+ struct mdt_body *b = req_capsule_client_get(pill, &RMF_MDT_BODY);
+ LASSERT(b);
+ b->mbo_valid = valid;
+ b->mbo_eadatasize = ea_size;
+ b->mbo_flags = flags;
+ __mdc_pack_body(b, suppgid);
+ if (fid) {
+ b->mbo_fid1 = *fid;
+ b->mbo_valid |= OBD_MD_FLID;
+ }
+}
+
+/**
+ * Pack a name (path component) into a request
+ *
+ * \param[in] pill request pill
+ * \param[in] field request field (usually RMF_NAME)
+ * \param[in] name path component
+ * \param[in] name_len length of path component
+
+ * \a field must be present in \a pill and of size \a name_len + 1.
+ *
+ * \a name must be '\0' terminated, of length \a name_len, and must
+ * represent a single path component (it may not contain '/'). 
+ */ +static void mdc_pack_name(struct req_capsule *pill, + const struct req_msg_field *field, + const char *name, size_t name_len) +{ + char *buf; + size_t buf_size; + size_t cpy_len; + + buf = req_capsule_client_get(pill, field); + buf_size = req_capsule_get_size(pill, field, RCL_CLIENT); + + LASSERT(buf != NULL && buf_size == name_len + 1); + + if (!name) { + buf[name_len] = '\0'; + return; + } + cpy_len = strlcpy(buf, name, buf_size); + + LASSERT(lu_name_is_valid_2(buf, cpy_len)); + if (cpy_len != name_len) + CDEBUG(D_DENTRY, "%s len %zd != %zd, concurrent rename?\n", + buf, name_len, cpy_len); +} + +void mdc_file_secctx_pack(struct req_capsule *pill, const char *secctx_name, + const void *secctx, size_t secctx_size) +{ + void *buf; + size_t buf_size; + + if (secctx_name == NULL) + return; + + buf = req_capsule_client_get(pill, &RMF_FILE_SECCTX_NAME); + buf_size = req_capsule_get_size(pill, &RMF_FILE_SECCTX_NAME, + RCL_CLIENT); + + LASSERT(buf_size == strlen(secctx_name) + 1); + memcpy(buf, secctx_name, buf_size); + + buf = req_capsule_client_get(pill, &RMF_FILE_SECCTX); + buf_size = req_capsule_get_size(pill, &RMF_FILE_SECCTX, + RCL_CLIENT); + + LASSERT(buf_size == secctx_size); + memcpy(buf, secctx, buf_size); +} + +void mdc_file_encctx_pack(struct req_capsule *pill, + const void *encctx, size_t encctx_size) +{ + void *buf; + size_t buf_size; + + if (encctx == NULL) + return; + + buf = req_capsule_client_get(pill, &RMF_FILE_ENCCTX); + buf_size = req_capsule_get_size(pill, &RMF_FILE_ENCCTX, + RCL_CLIENT); + + LASSERT(buf_size == encctx_size); + memcpy(buf, encctx, buf_size); +} + +void mdc_file_sepol_pack(struct req_capsule *pill) +{ + void *buf; + size_t buf_size; + struct ptlrpc_request *req = pill->rc_req; + + if (strlen(req->rq_sepol) == 0) + return; + + buf = req_capsule_client_get(pill, &RMF_SELINUX_POL); + buf_size = req_capsule_get_size(pill, &RMF_SELINUX_POL, + RCL_CLIENT); + + LASSERT(buf_size == strlen(req->rq_sepol) + 1); + snprintf(buf, strlen(req->rq_sepol) + 1, "%s", req->rq_sepol); +} + +void mdc_readdir_pack(struct req_capsule *pill, __u64 pgoff, size_t size, + const struct lu_fid *fid) +{ + struct mdt_body *b = req_capsule_client_get(pill, &RMF_MDT_BODY); + + b->mbo_fid1 = *fid; + b->mbo_valid |= OBD_MD_FLID; + b->mbo_size = pgoff; /* !! */ + b->mbo_nlink = size; /* !! */ + __mdc_pack_body(b, -1); + b->mbo_mode = LUDA_FID | LUDA_TYPE; +} + +/* packing of MDS records */ +void mdc_create_pack(struct req_capsule *pill, struct md_op_data *op_data, + const void *data, size_t datalen, umode_t mode, + uid_t uid, gid_t gid, kernel_cap_t cap_effective, u64 rdev) +{ + struct mdt_rec_create *rec; + char *tmp; + __u64 flags; + + BUILD_BUG_ON(sizeof(struct mdt_rec_reint) != + sizeof(struct mdt_rec_create)); + rec = req_capsule_client_get(pill, &RMF_REC_REINT); + + rec->cr_opcode = REINT_CREATE; + rec->cr_fsuid = uid; + rec->cr_fsgid = gid; + rec->cr_cap = cap_effective.cap[0]; + rec->cr_fid1 = op_data->op_fid1; + rec->cr_fid2 = op_data->op_fid2; + rec->cr_mode = mode; + rec->cr_rdev = rdev; + rec->cr_time = op_data->op_mod_time; + rec->cr_suppgid1 = op_data->op_suppgids[0]; + rec->cr_suppgid2 = op_data->op_suppgids[1]; + flags = 0; + if (op_data->op_bias & MDS_CREATE_VOLATILE) + flags |= MDS_OPEN_VOLATILE; + if (op_data->op_bias & MDS_SETSTRIPE_CREATE) + /* borrow MDS_OPEN_CREATE flag to indicate current setstripe + * create only, and don't restripe if object exists. 
+ */ + flags |= MDS_OPEN_CREAT; + set_mrc_cr_flags(rec, flags); + rec->cr_bias = op_data->op_bias; + rec->cr_umask = current_umask(); + + mdc_pack_name(pill, &RMF_NAME, op_data->op_name, op_data->op_namelen); + if (data) { + tmp = req_capsule_client_get(pill, &RMF_EADATA); + memcpy(tmp, data, datalen); + } + + mdc_file_secctx_pack(pill, op_data->op_file_secctx_name, + op_data->op_file_secctx, + op_data->op_file_secctx_size); + + mdc_file_encctx_pack(pill, op_data->op_file_encctx, + op_data->op_file_encctx_size); + + /* pack SELinux policy info if any */ + mdc_file_sepol_pack(pill); +} + +static inline __u64 mds_pack_open_flags(__u64 flags) +{ + __u64 cr_flags = (flags & MDS_OPEN_FL_INTERNAL); + + if (flags & FMODE_READ) + cr_flags |= MDS_FMODE_READ; + if (flags & FMODE_WRITE) + cr_flags |= MDS_FMODE_WRITE; + if (flags & O_CREAT) + cr_flags |= MDS_OPEN_CREAT; + if (flags & O_EXCL) + cr_flags |= MDS_OPEN_EXCL; + if (flags & O_TRUNC) + cr_flags |= MDS_OPEN_TRUNC; + if (flags & O_APPEND) + cr_flags |= MDS_OPEN_APPEND; + if (flags & O_SYNC) + cr_flags |= MDS_OPEN_SYNC; + if (flags & O_DIRECTORY) + cr_flags |= MDS_OPEN_DIRECTORY; +#ifdef FMODE_EXEC + if (flags & FMODE_EXEC) + cr_flags |= MDS_FMODE_EXEC; +#endif + if (cl_is_lov_delay_create(flags)) + cr_flags |= MDS_OPEN_DELAY_CREATE; + + if (flags & O_NONBLOCK) + cr_flags |= MDS_OPEN_NORESTORE; + + return cr_flags; +} + +/* packing of MDS records */ +void mdc_open_pack(struct req_capsule *pill, struct md_op_data *op_data, + umode_t mode, __u64 rdev, __u64 flags, const void *lmm, + size_t lmmlen) +{ + struct mdt_rec_create *rec; + char *tmp; + __u64 cr_flags; + + BUILD_BUG_ON(sizeof(struct mdt_rec_reint) != + sizeof(struct mdt_rec_create)); + rec = req_capsule_client_get(pill, &RMF_REC_REINT); + + /* XXX do something about time, uid, gid */ + rec->cr_opcode = REINT_OPEN; + rec->cr_fsuid = from_kuid(&init_user_ns, current_fsuid()); + rec->cr_fsgid = from_kgid(&init_user_ns, current_fsgid()); + rec->cr_cap = current_cap().cap[0]; + rec->cr_mode = mode; + cr_flags = mds_pack_open_flags(flags); + rec->cr_rdev = rdev; + rec->cr_umask = current_umask(); + if (op_data != NULL) { + rec->cr_fid1 = op_data->op_fid1; + rec->cr_fid2 = op_data->op_fid2; + rec->cr_time = op_data->op_mod_time; + rec->cr_suppgid1 = op_data->op_suppgids[0]; + rec->cr_suppgid2 = op_data->op_suppgids[1]; + rec->cr_bias = op_data->op_bias; + rec->cr_open_handle_old = op_data->op_open_handle; + + if (op_data->op_name) { + mdc_pack_name(pill, &RMF_NAME, op_data->op_name, + op_data->op_namelen); + + if (op_data->op_bias & MDS_CREATE_VOLATILE) + cr_flags |= MDS_OPEN_VOLATILE; + } + + mdc_file_secctx_pack(pill, op_data->op_file_secctx_name, + op_data->op_file_secctx, + op_data->op_file_secctx_size); + + mdc_file_encctx_pack(pill, op_data->op_file_encctx, + op_data->op_file_encctx_size); + + /* pack SELinux policy info if any */ + mdc_file_sepol_pack(pill); + } + + if (lmm) { + cr_flags |= MDS_OPEN_HAS_EA; + tmp = req_capsule_client_get(pill, &RMF_EADATA); + memcpy(tmp, lmm, lmmlen); + if (cr_flags & MDS_OPEN_PCC) { + LASSERT(op_data != NULL); + rec->cr_archive_id = op_data->op_archive_id; + } + } + cr_flags |= MDS_OPEN_DEFAULT_LMV; + set_mrc_cr_flags(rec, cr_flags); +} + +static inline enum mds_attr_flags mdc_attr_pack(unsigned int ia_valid, + enum op_xvalid ia_xvalid) +{ + enum mds_attr_flags sa_valid = 0; + + if (ia_valid & ATTR_MODE) + sa_valid |= MDS_ATTR_MODE; + if (ia_valid & ATTR_UID) + sa_valid |= MDS_ATTR_UID; + if (ia_valid & ATTR_GID) + sa_valid |= MDS_ATTR_GID; + if (ia_valid 
& ATTR_SIZE) + sa_valid |= MDS_ATTR_SIZE; + if (ia_valid & ATTR_ATIME) + sa_valid |= MDS_ATTR_ATIME; + if (ia_valid & ATTR_MTIME) + sa_valid |= MDS_ATTR_MTIME; + if (ia_valid & ATTR_CTIME) + sa_valid |= MDS_ATTR_CTIME; + if (ia_valid & ATTR_ATIME_SET) + sa_valid |= MDS_ATTR_ATIME_SET; + if (ia_valid & ATTR_MTIME_SET) + sa_valid |= MDS_ATTR_MTIME_SET; + if (ia_valid & ATTR_FORCE) + sa_valid |= MDS_ATTR_FORCE; + if (ia_xvalid & OP_XVALID_FLAGS) + sa_valid |= MDS_ATTR_ATTR_FLAG; + if (ia_valid & ATTR_KILL_SUID) + sa_valid |= MDS_ATTR_KILL_SUID; + if (ia_valid & ATTR_KILL_SGID) + sa_valid |= MDS_ATTR_KILL_SGID; + if (ia_xvalid & OP_XVALID_CTIME_SET) + sa_valid |= MDS_ATTR_CTIME_SET; + if (ia_valid & ATTR_OPEN) + sa_valid |= MDS_ATTR_FROM_OPEN; + if (ia_xvalid & OP_XVALID_BLOCKS) + sa_valid |= MDS_ATTR_BLOCKS; + if (ia_xvalid & OP_XVALID_OWNEROVERRIDE) + /* NFSD hack (see bug 5781) */ + sa_valid |= MDS_OPEN_OWNEROVERRIDE; + if (ia_xvalid & OP_XVALID_PROJID) + sa_valid |= MDS_ATTR_PROJID; + if (ia_xvalid & OP_XVALID_LAZYSIZE) + sa_valid |= MDS_ATTR_LSIZE; + if (ia_xvalid & OP_XVALID_LAZYBLOCKS) + sa_valid |= MDS_ATTR_LBLOCKS; + + return sa_valid; +} + +static void mdc_setattr_pack_rec(struct mdt_rec_setattr *rec, + struct md_op_data *op_data) +{ + rec->sa_opcode = REINT_SETATTR; + rec->sa_fsuid = from_kuid(&init_user_ns, current_fsuid()); + rec->sa_fsgid = from_kgid(&init_user_ns, current_fsgid()); + rec->sa_cap = current_cap().cap[0]; + rec->sa_suppgid = -1; + + rec->sa_fid = op_data->op_fid1; + rec->sa_valid = mdc_attr_pack(op_data->op_attr.ia_valid, + op_data->op_xvalid); + rec->sa_mode = op_data->op_attr.ia_mode; + rec->sa_uid = from_kuid(&init_user_ns, op_data->op_attr.ia_uid); + rec->sa_gid = from_kgid(&init_user_ns, op_data->op_attr.ia_gid); + rec->sa_projid = op_data->op_projid; + rec->sa_size = op_data->op_attr.ia_size; + rec->sa_blocks = op_data->op_attr_blocks; + rec->sa_atime = op_data->op_attr.ia_atime.tv_sec; + rec->sa_mtime = op_data->op_attr.ia_mtime.tv_sec; + rec->sa_ctime = op_data->op_attr.ia_ctime.tv_sec; + rec->sa_attr_flags = op_data->op_attr_flags; + if ((op_data->op_attr.ia_valid & ATTR_GID) && + in_group_p(op_data->op_attr.ia_gid)) + rec->sa_suppgid = + from_kgid(&init_user_ns, op_data->op_attr.ia_gid); + else + rec->sa_suppgid = op_data->op_suppgids[0]; + + rec->sa_bias = op_data->op_bias; +} + +static void mdc_ioepoch_pack(struct mdt_ioepoch *epoch, + struct md_op_data *op_data) +{ + epoch->mio_open_handle = op_data->op_open_handle; + epoch->mio_unused1 = 0; + epoch->mio_unused2 = 0; + epoch->mio_padding = 0; +} + +void mdc_setattr_pack(struct req_capsule *pill, struct md_op_data *op_data, + void *ea, size_t ealen) +{ + struct mdt_rec_setattr *rec; + struct lov_user_md *lum = NULL; + + BUILD_BUG_ON(sizeof(struct mdt_rec_reint) != + sizeof(struct mdt_rec_setattr)); + rec = req_capsule_client_get(pill, &RMF_REC_REINT); + mdc_setattr_pack_rec(rec, op_data); + + if (ealen == 0) + return; + + lum = req_capsule_client_get(pill, &RMF_EADATA); + if (ea == NULL) { /* Remove LOV EA */ + lum->lmm_magic = cpu_to_le32(LOV_USER_MAGIC_V1); + lum->lmm_stripe_size = 0; + lum->lmm_stripe_count = 0; + lum->lmm_stripe_offset = + (typeof(lum->lmm_stripe_offset))LOV_OFFSET_DEFAULT; + } else { + memcpy(lum, ea, ealen); + } +} + +void mdc_unlink_pack(struct req_capsule *pill, struct md_op_data *op_data) +{ + struct mdt_rec_unlink *rec; + + BUILD_BUG_ON(sizeof(struct mdt_rec_reint) != + sizeof(struct mdt_rec_unlink)); + rec = req_capsule_client_get(pill, &RMF_REC_REINT); + LASSERT(rec != 
NULL); + + rec->ul_opcode = op_data->op_cli_flags & CLI_RM_ENTRY ? + REINT_RMENTRY : REINT_UNLINK; + rec->ul_fsuid = op_data->op_fsuid; + rec->ul_fsgid = op_data->op_fsgid; + rec->ul_cap = op_data->op_cap.cap[0]; + rec->ul_mode = op_data->op_mode; + rec->ul_suppgid1 = op_data->op_suppgids[0]; + rec->ul_suppgid2 = -1; + rec->ul_fid1 = op_data->op_fid1; + rec->ul_fid2 = op_data->op_fid2; + rec->ul_time = op_data->op_mod_time; + rec->ul_bias = op_data->op_bias; + + mdc_pack_name(pill, &RMF_NAME, op_data->op_name, op_data->op_namelen); + + /* pack SELinux policy info if any */ + mdc_file_sepol_pack(pill); +} + +void mdc_link_pack(struct req_capsule *pill, struct md_op_data *op_data) +{ + struct mdt_rec_link *rec; + + BUILD_BUG_ON(sizeof(struct mdt_rec_reint) != + sizeof(struct mdt_rec_link)); + rec = req_capsule_client_get(pill, &RMF_REC_REINT); + LASSERT(rec != NULL); + + rec->lk_opcode = REINT_LINK; + rec->lk_fsuid = op_data->op_fsuid; /* current->fsuid; */ + rec->lk_fsgid = op_data->op_fsgid; /* current->fsgid; */ + rec->lk_cap = op_data->op_cap.cap[0]; /* current->cap_effective; */ + rec->lk_suppgid1 = op_data->op_suppgids[0]; + rec->lk_suppgid2 = op_data->op_suppgids[1]; + rec->lk_fid1 = op_data->op_fid1; + rec->lk_fid2 = op_data->op_fid2; + rec->lk_time = op_data->op_mod_time; + rec->lk_bias = op_data->op_bias; + + mdc_pack_name(pill, &RMF_NAME, op_data->op_name, op_data->op_namelen); + + /* pack SELinux policy info if any */ + mdc_file_sepol_pack(pill); +} + +static void mdc_close_intent_pack(struct req_capsule *pill, + struct md_op_data *op_data) +{ + struct close_data *data; + struct ldlm_lock *lock; + enum mds_op_bias bias = op_data->op_bias; + + if (!(bias & (MDS_CLOSE_INTENT | MDS_CLOSE_MIGRATE))) + return; + + data = req_capsule_client_get(pill, &RMF_CLOSE_DATA); + LASSERT(data != NULL); + + lock = ldlm_handle2lock(&op_data->op_lease_handle); + if (lock != NULL) { + data->cd_handle = lock->l_remote_handle; + LDLM_LOCK_PUT(lock); + } + ldlm_cli_cancel(&op_data->op_lease_handle, LCF_LOCAL); + + data->cd_data_version = op_data->op_data_version; + data->cd_fid = op_data->op_fid2; + + if (bias & MDS_CLOSE_LAYOUT_SPLIT) { + data->cd_mirror_id = op_data->op_mirror_id; + } else if (bias & MDS_CLOSE_RESYNC_DONE) { + struct close_data_resync_done *sync = &data->cd_resync; + + BUILD_BUG_ON(sizeof(data->cd_resync) > + sizeof(data->cd_reserved)); + sync->resync_count = op_data->op_data_size / sizeof(__u32); + if (sync->resync_count <= INLINE_RESYNC_ARRAY_SIZE) { + memcpy(sync->resync_ids_inline, op_data->op_data, + op_data->op_data_size); + } else { + size_t count = sync->resync_count; + + memcpy(req_capsule_client_get(pill, &RMF_U32), + op_data->op_data, count * sizeof(__u32)); + } + } else if (bias & MDS_PCC_ATTACH) { + data->cd_archive_id = op_data->op_archive_id; + } +} + +void mdc_rename_pack(struct req_capsule *pill, struct md_op_data *op_data, + const char *old, size_t oldlen, + const char *new, size_t newlen) +{ + struct mdt_rec_rename *rec; + + BUILD_BUG_ON(sizeof(struct mdt_rec_reint) != + sizeof(struct mdt_rec_rename)); + rec = req_capsule_client_get(pill, &RMF_REC_REINT); + + /* XXX do something about time, uid, gid */ + rec->rn_opcode = REINT_RENAME; + rec->rn_fsuid = op_data->op_fsuid; + rec->rn_fsgid = op_data->op_fsgid; + rec->rn_cap = op_data->op_cap.cap[0]; + rec->rn_suppgid1 = op_data->op_suppgids[0]; + rec->rn_suppgid2 = op_data->op_suppgids[1]; + rec->rn_fid1 = op_data->op_fid1; + rec->rn_fid2 = op_data->op_fid2; + rec->rn_time = op_data->op_mod_time; + rec->rn_mode = 
op_data->op_mode; + rec->rn_bias = op_data->op_bias; + + mdc_pack_name(pill, &RMF_NAME, old, oldlen); + + if (new != NULL) + mdc_pack_name(pill, &RMF_SYMTGT, new, newlen); + + /* pack SELinux policy info if any */ + mdc_file_sepol_pack(pill); +} + +void mdc_migrate_pack(struct req_capsule *pill, struct md_op_data *op_data, + const char *name, size_t namelen) +{ + struct mdt_rec_rename *rec; + char *ea; + + BUILD_BUG_ON(sizeof(struct mdt_rec_reint) != + sizeof(struct mdt_rec_rename)); + rec = req_capsule_client_get(pill, &RMF_REC_REINT); + + rec->rn_opcode = REINT_MIGRATE; + rec->rn_fsuid = op_data->op_fsuid; + rec->rn_fsgid = op_data->op_fsgid; + rec->rn_cap = op_data->op_cap.cap[0]; + rec->rn_suppgid1 = op_data->op_suppgids[0]; + rec->rn_suppgid2 = op_data->op_suppgids[1]; + rec->rn_fid1 = op_data->op_fid1; + rec->rn_fid2 = op_data->op_fid4; + rec->rn_time = op_data->op_mod_time; + rec->rn_mode = op_data->op_mode; + rec->rn_bias = op_data->op_bias; + + mdc_pack_name(pill, &RMF_NAME, name, namelen); + + if (op_data->op_bias & MDS_CLOSE_MIGRATE) { + struct mdt_ioepoch *epoch; + + mdc_close_intent_pack(pill, op_data); + epoch = req_capsule_client_get(pill, &RMF_MDT_EPOCH); + mdc_ioepoch_pack(epoch, op_data); + } + + ea = req_capsule_client_get(pill, &RMF_EADATA); + memcpy(ea, op_data->op_data, op_data->op_data_size); +} + +void mdc_getattr_pack(struct req_capsule *pill, __u64 valid, __u32 flags, + struct md_op_data *op_data, size_t ea_size) +{ + struct mdt_body *b = req_capsule_client_get(pill, &RMF_MDT_BODY); + + b->mbo_valid = valid; + if (op_data->op_bias & MDS_CROSS_REF) + b->mbo_valid |= OBD_MD_FLCROSSREF; + if (op_data->op_bias & MDS_FID_OP) + b->mbo_valid |= OBD_MD_NAMEHASH; + b->mbo_eadatasize = ea_size; + b->mbo_flags = flags; + __mdc_pack_body(b, op_data->op_suppgids[0]); + + b->mbo_fid1 = op_data->op_fid1; + b->mbo_fid2 = op_data->op_fid2; + b->mbo_valid |= OBD_MD_FLID; + + if (op_data->op_name != NULL) + mdc_pack_name(pill, &RMF_NAME, op_data->op_name, + op_data->op_namelen); +} + +void mdc_close_pack(struct req_capsule *pill, struct md_op_data *op_data) +{ + struct mdt_ioepoch *epoch; + struct mdt_rec_setattr *rec; + + epoch = req_capsule_client_get(pill, &RMF_MDT_EPOCH); + rec = req_capsule_client_get(pill, &RMF_REC_REINT); + + mdc_setattr_pack_rec(rec, op_data); + /* + * The client will zero out local timestamps when losing the IBITS lock + * so any new RPC timestamps will update the client inode's timestamps. + * There was a defect on the server side which allowed the atime to be + * overwritten by a zeroed-out atime packed into the close RPC. + * + * Proactively clear the MDS_ATTR_ATIME flag in the RPC in this case + * to avoid zeroing the atime on old unpatched servers. See LU-8041. + */ + if (rec->sa_atime == 0) + rec->sa_valid &= ~MDS_ATTR_ATIME; + + mdc_ioepoch_pack(epoch, op_data); + mdc_close_intent_pack(pill, op_data); +} diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_locks.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_locks.c new file mode 100644 index 0000000000000..41692b39eb909 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_locks.c @@ -0,0 +1,1466 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ */
+
+#define DEBUG_SUBSYSTEM S_MDC
+
+#include <linux/module.h>
+
+#include <lustre_intent.h>
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_dlm.h>
+#include <lustre_fid.h>
+#include <lustre_mdc.h>
+#include <lustre_net.h>
+#include <lustre_req_layout.h>
+#include <lustre_swab.h>
+#include <lustre_acl.h>
+
+#include "mdc_internal.h"
+
+struct mdc_getattr_args {
+ struct obd_export *ga_exp;
+ struct md_enqueue_info *ga_minfo;
+};
+
+int it_open_error(int phase, struct lookup_intent *it)
+{
+ if (it_disposition(it, DISP_OPEN_LEASE)) {
+ if (phase >= DISP_OPEN_LEASE)
+ return it->it_status;
+ else
+ return 0;
+ }
+ if (it_disposition(it, DISP_OPEN_OPEN)) {
+ if (phase >= DISP_OPEN_OPEN)
+ return it->it_status;
+ else
+ return 0;
+ }
+
+ if (it_disposition(it, DISP_OPEN_CREATE)) {
+ if (phase >= DISP_OPEN_CREATE)
+ return it->it_status;
+ else
+ return 0;
+ }
+
+ if (it_disposition(it, DISP_LOOKUP_EXECD)) {
+ if (phase >= DISP_LOOKUP_EXECD)
+ return it->it_status;
+ else
+ return 0;
+ }
+
+ if (it_disposition(it, DISP_IT_EXECD)) {
+ if (phase >= DISP_IT_EXECD)
+ return it->it_status;
+ else
+ return 0;
+ }
+
+ CERROR("it disp: %X, status: %d\n", it->it_disposition, it->it_status);
+ LBUG();
+
+ return 0;
+}
+EXPORT_SYMBOL(it_open_error);
+
+/* this must be called on a lockh that is known to have a referenced lock */
+int mdc_set_lock_data(struct obd_export *exp, const struct lustre_handle *lockh,
+ void *data, __u64 *bits)
+{
+ struct ldlm_lock *lock;
+ struct inode *new_inode = data;
+
+ ENTRY;
+ if (bits)
+ *bits = 0;
+
+ if (!lustre_handle_is_used(lockh))
+ RETURN(0);
+
+ lock = ldlm_handle2lock(lockh);
+
+ LASSERT(lock != NULL);
+ lock_res_and_lock(lock);
+ if (lock->l_resource->lr_lvb_inode &&
+ lock->l_resource->lr_lvb_inode != data) {
+ struct inode *old_inode = lock->l_resource->lr_lvb_inode;
+
+ LASSERTF(old_inode->i_state & I_FREEING,
+ "Found existing inode %p/%lu/%u state %lu in lock: setting data to %p/%lu/%u\n",
+ old_inode, old_inode->i_ino, old_inode->i_generation,
+ old_inode->i_state,
+ new_inode, new_inode->i_ino, new_inode->i_generation);
+ }
+ lock->l_resource->lr_lvb_inode = new_inode;
+ if (bits)
+ *bits = lock->l_policy_data.l_inodebits.bits;
+
+ unlock_res_and_lock(lock);
+ LDLM_LOCK_PUT(lock);
+
+ RETURN(0);
+}
+
+enum ldlm_mode mdc_lock_match(struct obd_export *exp, __u64 flags,
+ const struct lu_fid *fid, enum ldlm_type type,
+ union ldlm_policy_data *policy,
+ enum ldlm_mode mode, struct lustre_handle *lockh)
+{
+ struct ldlm_res_id res_id;
+ enum ldlm_mode rc;
+
+ ENTRY;
+ fid_build_reg_res_name(fid, &res_id);
+ /* LU-4405: Clear bits not supported by server */
+ policy->l_inodebits.bits &= exp_connect_ibits(exp);
+ rc = ldlm_lock_match(class_exp2obd(exp)->obd_namespace, flags,
+ &res_id, type, policy, mode, lockh);
+ RETURN(rc);
+}
+
+int mdc_cancel_unused(struct obd_export *exp, const struct lu_fid *fid,
+ union ldlm_policy_data *policy, enum ldlm_mode 
mode, + enum ldlm_cancel_flags flags, void *opaque) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ldlm_res_id res_id; + int rc; + + ENTRY; + fid_build_reg_res_name(fid, &res_id); + rc = ldlm_cli_cancel_unused_resource(obd->obd_namespace, &res_id, + policy, mode, flags, opaque); + RETURN(rc); +} + +int mdc_null_inode(struct obd_export *exp, + const struct lu_fid *fid) +{ + struct ldlm_res_id res_id; + struct ldlm_resource *res; + struct ldlm_namespace *ns = class_exp2obd(exp)->obd_namespace; + + ENTRY; + LASSERTF(ns != NULL, "no namespace passed\n"); + + fid_build_reg_res_name(fid, &res_id); + + res = ldlm_resource_get(ns, NULL, &res_id, 0, 0); + if (IS_ERR(res)) + RETURN(0); + + lock_res(res); + res->lr_lvb_inode = NULL; + unlock_res(res); + + ldlm_resource_putref(res); + RETURN(0); +} + +static inline void mdc_clear_replay_flag(struct ptlrpc_request *req, int rc) +{ + /* Don't hold error requests for replay. */ + if (req->rq_replay) { + spin_lock(&req->rq_lock); + req->rq_replay = 0; + spin_unlock(&req->rq_lock); + } + if (rc && req->rq_transno != 0) { + DEBUG_REQ(D_ERROR, req, "transno returned on error: rc = %d", + rc); + LBUG(); + } +} + +/** + * Save a large LOV EA into the request buffer so that it is available + * for replay. We don't do this in the initial request because the + * original request doesn't need this buffer (at most it sends just the + * lov_mds_md) and it is a waste of RAM/bandwidth to send the empty + * buffer and may also be difficult to allocate and save a very large + * request buffer for each open. (b=5707) + * + * OOM here may cause recovery failure if lmm is needed (only for the + * original open if the MDS crashed just when this client also OOM'd) + * but this is incredibly unlikely, and questionable whether the client + * could do MDS recovery under OOM anyways... + */ +static int mdc_save_lovea(struct ptlrpc_request *req, void *data, u32 size) +{ + struct req_capsule *pill = &req->rq_pill; + void *lovea; + int rc = 0; + + if (req_capsule_get_size(pill, &RMF_EADATA, RCL_CLIENT) < size) { + rc = sptlrpc_cli_enlarge_reqbuf(req, &RMF_EADATA, size); + if (rc) { + CERROR("%s: Can't enlarge ea size to %d: rc = %d\n", + req->rq_export->exp_obd->obd_name, + size, rc); + return rc; + } + } else { + req_capsule_shrink(pill, &RMF_EADATA, size, RCL_CLIENT); + } + + req_capsule_set_size(pill, &RMF_EADATA, RCL_CLIENT, size); + lovea = req_capsule_client_get(pill, &RMF_EADATA); + if (lovea) { + memcpy(lovea, data, size); + lov_fix_ea_for_replay(lovea); + } + + return rc; +} + +static struct ptlrpc_request * +mdc_intent_open_pack(struct obd_export *exp, struct lookup_intent *it, + struct md_op_data *op_data, __u32 acl_bufsize) +{ + struct ptlrpc_request *req; + struct obd_device *obd = class_exp2obd(exp); + struct ldlm_intent *lit; + const void *lmm = op_data->op_data; + __u32 lmmsize = op_data->op_data_size; + __u32 mdt_md_capsule_size; + LIST_HEAD(cancels); + int count = 0; + enum ldlm_mode mode; + int repsize, repsize_estimate; + int rc; + + ENTRY; + + mdt_md_capsule_size = obd->u.cli.cl_default_mds_easize; + + it->it_create_mode = (it->it_create_mode & ~S_IFMT) | S_IFREG; + + /* XXX: openlock is not cancelled for cross-refs. */ + /* If inode is known, cancel conflicting OPEN locks. 
*/ + if (fid_is_sane(&op_data->op_fid2)) { + if (it->it_flags & MDS_OPEN_LEASE) { /* try to get lease */ + if (it->it_flags & MDS_FMODE_WRITE) + mode = LCK_EX; + else + mode = LCK_PR; + } else { + if (it->it_flags & (MDS_FMODE_WRITE | MDS_OPEN_TRUNC)) + mode = LCK_CW; +#ifdef FMODE_EXEC + else if (it->it_flags & FMODE_EXEC) + mode = LCK_PR; +#endif + else + mode = LCK_CR; + } + count = mdc_resource_get_unused(exp, &op_data->op_fid2, + &cancels, mode, + MDS_INODELOCK_OPEN); + } + + /* If CREATE, cancel parent's UPDATE lock. */ + if (it->it_op & IT_CREAT) + mode = LCK_EX; + else + mode = LCK_CR; + count += mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, mode, + MDS_INODELOCK_UPDATE); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_INTENT_OPEN); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(ERR_PTR(-ENOMEM)); + } + + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + if (cl_is_lov_delay_create(it->it_flags)) { + /* open(O_LOV_DELAY_CREATE) won't pack lmm */ + LASSERT(lmmsize == 0); + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, 0); + } else { + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, + max(lmmsize, obd->u.cli.cl_default_mds_easize)); + } + + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX_NAME, + RCL_CLIENT, op_data->op_file_secctx_name != NULL ? + op_data->op_file_secctx_name_size : 0); + + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, RCL_CLIENT, + op_data->op_file_secctx_size); + + req_capsule_set_size(&req->rq_pill, &RMF_FILE_ENCCTX, RCL_CLIENT, + op_data->op_file_encctx_size); + + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(req); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(req->rq_sepol) ? 
+ strlen(req->rq_sepol) + 1 : 0); + + rc = ldlm_prep_enqueue_req(exp, req, &cancels, count); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + + spin_lock(&req->rq_lock); + req->rq_replay = req->rq_import->imp_replayable; + spin_unlock(&req->rq_lock); + + /* pack the intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = (__u64)it->it_op; + + /* pack the intended request */ + mdc_open_pack(&req->rq_pill, op_data, it->it_create_mode, 0, + it->it_flags, lmm, lmmsize); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + mdt_md_capsule_size); + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, acl_bufsize); + + if (!(it->it_op & IT_CREAT) && it->it_op & IT_OPEN && + req_capsule_has_field(&req->rq_pill, &RMF_FILE_SECCTX_NAME, + RCL_CLIENT) && + op_data->op_file_secctx_name_size > 0 && + op_data->op_file_secctx_name != NULL) { + char *secctx_name; + + secctx_name = req_capsule_client_get(&req->rq_pill, + &RMF_FILE_SECCTX_NAME); + memcpy(secctx_name, op_data->op_file_secctx_name, + op_data->op_file_secctx_name_size); + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, + RCL_SERVER, + obd->u.cli.cl_max_mds_easize); + + CDEBUG(D_SEC, "packed '%.*s' as security xattr name\n", + op_data->op_file_secctx_name_size, + op_data->op_file_secctx_name); + + } else { + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, + RCL_SERVER, 0); + } + + if (exp_connect_encrypt(exp) && !(it->it_op & IT_CREAT) && + it->it_op & IT_OPEN) + req_capsule_set_size(&req->rq_pill, &RMF_FILE_ENCCTX, + RCL_SERVER, + obd->u.cli.cl_max_mds_easize); + else + req_capsule_set_size(&req->rq_pill, &RMF_FILE_ENCCTX, + RCL_SERVER, 0); + + /** + * Inline buffer for possible data from Data-on-MDT files. + */ + req_capsule_set_size(&req->rq_pill, &RMF_NIOBUF_INLINE, RCL_SERVER, + sizeof(struct niobuf_remote)); + req_capsule_set_size(&req->rq_pill, &RMF_DEFAULT_MDT_MD, RCL_SERVER, + sizeof(struct lmv_user_md)); + ptlrpc_request_set_replen(req); + + /* Get real repbuf allocated size as rounded up power of 2 */ + repsize = size_roundup_power2(req->rq_replen + + lustre_msg_early_size); + /* Estimate free space for DoM files in repbuf */ + repsize_estimate = repsize - (req->rq_replen - + mdt_md_capsule_size + + sizeof(struct lov_comp_md_v1) + + sizeof(struct lov_comp_md_entry_v1) + + lov_mds_md_size(0, LOV_MAGIC_V3)); + + if (repsize_estimate < obd->u.cli.cl_dom_min_inline_repsize) { + repsize = obd->u.cli.cl_dom_min_inline_repsize - + repsize_estimate + sizeof(struct niobuf_remote); + req_capsule_set_size(&req->rq_pill, &RMF_NIOBUF_INLINE, + RCL_SERVER, + sizeof(struct niobuf_remote) + repsize); + ptlrpc_request_set_replen(req); + CDEBUG(D_INFO, "Increase repbuf by %d bytes, total: %d\n", + repsize, req->rq_replen); + repsize = size_roundup_power2(req->rq_replen + + lustre_msg_early_size); + } + /* The only way to report real allocated repbuf size to the server + * is the lm_repsize but it must be set prior buffer allocation itself + * due to security reasons - it is part of buffer used in signature + * calculation (see LU-11414). Therefore the saved size is predicted + * value as rq_replen rounded to the next higher power of 2. + * Such estimation is safe. Though the final allocated buffer might + * be even larger, it is not possible to know that at this point. 
+ */ + req->rq_reqmsg->lm_repsize = repsize; + RETURN(req); +} + +#define GA_DEFAULT_EA_NAME_LEN 20 +#define GA_DEFAULT_EA_VAL_LEN 250 +#define GA_DEFAULT_EA_NUM 10 + +static struct ptlrpc_request * +mdc_intent_getxattr_pack(struct obd_export *exp, struct lookup_intent *it, + struct md_op_data *op_data) +{ + struct ptlrpc_request *req; + struct ldlm_intent *lit; + int rc, count = 0; + LIST_HEAD(cancels); + u32 ea_vals_buf_size = GA_DEFAULT_EA_VAL_LEN * GA_DEFAULT_EA_NUM; + + ENTRY; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_INTENT_GETXATTR); + if (req == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(req); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(req->rq_sepol) ? + strlen(req->rq_sepol) + 1 : 0); + + rc = ldlm_prep_enqueue_req(exp, req, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + + /* pack the intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = IT_GETXATTR; + /* Message below is checked in sanity-selinux test_20d + * and sanity-sec test_49 + */ + CDEBUG(D_INFO, "%s: get xattrs for "DFID"\n", + exp->exp_obd->obd_name, PFID(&op_data->op_fid1)); + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0) + /* If the supplied buffer is too small then the server will return + * -ERANGE and llite will fallback to using non cached xattr + * operations. On servers before 2.10.1 a (non-cached) listxattr RPC + * for an orphan or dead file causes an oops. So let's try to avoid + * sending too small a buffer to too old a server. This is effectively + * undoing the memory conservation of LU-9417 when it would be *more* + * likely to crash the server. See LU-9856. 
+ */ + if (exp->exp_connect_data.ocd_version < OBD_OCD_VERSION(2, 10, 1, 0)) + ea_vals_buf_size = max_t(u32, ea_vals_buf_size, + exp->exp_connect_data.ocd_max_easize); +#endif + + /* pack the intended request */ + mdc_pack_body(&req->rq_pill, &op_data->op_fid1, op_data->op_valid, + ea_vals_buf_size, -1, 0); + + /* get SELinux policy info if any */ + mdc_file_sepol_pack(&req->rq_pill); + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, + GA_DEFAULT_EA_NAME_LEN * GA_DEFAULT_EA_NUM); + + req_capsule_set_size(&req->rq_pill, &RMF_EAVALS, RCL_SERVER, + ea_vals_buf_size); + + req_capsule_set_size(&req->rq_pill, &RMF_EAVALS_LENS, RCL_SERVER, + sizeof(u32) * GA_DEFAULT_EA_NUM); + + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, 0); + + ptlrpc_request_set_replen(req); + + RETURN(req); +} + +static struct ptlrpc_request * +mdc_intent_getattr_pack(struct obd_export *exp, struct lookup_intent *it, + struct md_op_data *op_data, __u32 acl_bufsize) +{ + struct ptlrpc_request *req; + struct obd_device *obd = class_exp2obd(exp); + u64 valid = OBD_MD_FLGETATTR | OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE | + OBD_MD_FLDIREA | OBD_MD_MEA | OBD_MD_FLACL | + OBD_MD_DEFAULT_MEA; + struct ldlm_intent *lit; + __u32 easize; + bool have_secctx = false; + int rc; + + ENTRY; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_INTENT_GETATTR); + if (req == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + /* send name of security xattr to get upon intent */ + if (it->it_op & (IT_LOOKUP | IT_GETATTR) && + req_capsule_has_field(&req->rq_pill, &RMF_FILE_SECCTX_NAME, + RCL_CLIENT) && + op_data->op_file_secctx_name_size > 0 && + op_data->op_file_secctx_name != NULL) { + have_secctx = true; + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX_NAME, + RCL_CLIENT, + op_data->op_file_secctx_name_size); + } + + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + + /* pack the intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = (__u64)it->it_op; + + if (obd->u.cli.cl_default_mds_easize > 0) + easize = obd->u.cli.cl_default_mds_easize; + else + easize = obd->u.cli.cl_max_mds_easize; + + /* pack the intended request */ + mdc_getattr_pack(&req->rq_pill, valid, it->it_flags, op_data, easize); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, easize); + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, acl_bufsize); + req_capsule_set_size(&req->rq_pill, &RMF_DEFAULT_MDT_MD, RCL_SERVER, + sizeof(struct lmv_user_md)); + + if (have_secctx) { + char *secctx_name; + + secctx_name = req_capsule_client_get(&req->rq_pill, + &RMF_FILE_SECCTX_NAME); + memcpy(secctx_name, op_data->op_file_secctx_name, + op_data->op_file_secctx_name_size); + + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, + RCL_SERVER, easize); + + CDEBUG(D_SEC, "packed '%.*s' as security xattr name\n", + op_data->op_file_secctx_name_size, + op_data->op_file_secctx_name); + } else { + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, + RCL_SERVER, 0); + } + + if (exp_connect_encrypt(exp) && it->it_op & (IT_LOOKUP | IT_GETATTR)) + req_capsule_set_size(&req->rq_pill, &RMF_FILE_ENCCTX, + RCL_SERVER, easize); + else + req_capsule_set_size(&req->rq_pill, &RMF_FILE_ENCCTX, + RCL_SERVER, 0); + + ptlrpc_request_set_replen(req); + RETURN(req); +} + +static struct ptlrpc_request *mdc_intent_layout_pack(struct obd_export *exp, + struct lookup_intent 
*it, + struct md_op_data *op_data) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req; + struct ldlm_intent *lit; + struct layout_intent *layout; + LIST_HEAD(cancels); + int count = 0, rc; + + ENTRY; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_INTENT_LAYOUT); + if (req == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + if (fid_is_sane(&op_data->op_fid2) && (it->it_op & IT_LAYOUT) && + (it->it_flags & FMODE_WRITE)) { + count = mdc_resource_get_unused(exp, &op_data->op_fid2, + &cancels, LCK_EX, + MDS_INODELOCK_LAYOUT); + } + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, 0); + rc = ldlm_prep_enqueue_req(exp, req, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + + /* pack the intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = (__u64)it->it_op; + + /* pack the layout intent request */ + layout = req_capsule_client_get(&req->rq_pill, &RMF_LAYOUT_INTENT); + LASSERT(op_data->op_data != NULL); + LASSERT(op_data->op_data_size == sizeof(*layout)); + memcpy(layout, op_data->op_data, sizeof(*layout)); + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, + obd->u.cli.cl_default_mds_easize); + ptlrpc_request_set_replen(req); + RETURN(req); +} + +static struct ptlrpc_request *mdc_enqueue_pack(struct obd_export *exp, + int lvb_len) +{ + struct ptlrpc_request *req; + int rc; + + ENTRY; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LDLM_ENQUEUE); + if (req == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, lvb_len); + ptlrpc_request_set_replen(req); + RETURN(req); +} + +static int mdc_finish_enqueue(struct obd_export *exp, + struct ptlrpc_request *req, + struct ldlm_enqueue_info *einfo, + struct lookup_intent *it, + struct lustre_handle *lockh, int rc) +{ + struct req_capsule *pill = &req->rq_pill; + struct ldlm_request *lockreq; + struct ldlm_reply *lockrep; + struct ldlm_lock *lock; + struct mdt_body *body = NULL; + void *lvb_data = NULL; + __u32 lvb_len = 0; + + ENTRY; + LASSERT(rc >= 0); + /* Similarly, if we're going to replay this request, we don't want to + * actually get a lock, just perform the intent. + */ + if (req->rq_transno || req->rq_replay) { + lockreq = req_capsule_client_get(pill, &RMF_DLM_REQ); + lockreq->lock_flags |= ldlm_flags_to_wire(LDLM_FL_INTENT_ONLY); + } + + if (rc == ELDLM_LOCK_ABORTED) { + einfo->ei_mode = 0; + memset(lockh, 0, sizeof(*lockh)); + rc = 0; + } else { /* rc = 0 */ + lock = ldlm_handle2lock(lockh); + LASSERT(lock != NULL); + + /* If server returned a different lock mode, fix up variables */ + if (lock->l_req_mode != einfo->ei_mode) { + ldlm_lock_addref(lockh, lock->l_req_mode); + ldlm_lock_decref(lockh, einfo->ei_mode); + einfo->ei_mode = lock->l_req_mode; + } + LDLM_LOCK_PUT(lock); + } + + lockrep = req_capsule_server_get(pill, &RMF_DLM_REP); + LASSERT(lockrep != NULL); /* checked by ldlm_cli_enqueue() */ + + it->it_disposition = (int)lockrep->lock_policy_res1; + it->it_status = (int)lockrep->lock_policy_res2; + it->it_lock_mode = einfo->ei_mode; + it->it_lock_handle = lockh->cookie; + it->it_request = req; + + /* Technically speaking rq_transno must already be zero if + * it_status is in error, so the check is a bit redundant. 
+ */ + if ((!req->rq_transno || it->it_status < 0) && req->rq_replay) + mdc_clear_replay_flag(req, it->it_status); + + /* If we're doing an IT_OPEN which did not result in an actual + * successful open, then we need to remove the bit which saves + * this request for unconditional replay. + * + * It's important that we do this first! Otherwise we might exit the + * function without doing so, and try to replay a failed create. + * (b=3440) + */ + if (it->it_op & IT_OPEN && req->rq_replay && + (!it_disposition(it, DISP_OPEN_OPEN) || it->it_status != 0)) + mdc_clear_replay_flag(req, it->it_status); + + DEBUG_REQ(D_RPCTRACE, req, "op=%x disposition=%x, status=%d", + it->it_op, it->it_disposition, it->it_status); + + /* We know what to expect, so we do any byte flipping required here */ + if (it_has_reply_body(it)) { + body = req_capsule_server_get(pill, &RMF_MDT_BODY); + if (body == NULL) { + rc = -EPROTO; + CERROR("%s: cannot swab mdt_body: rc = %d\n", + exp->exp_obd->obd_name, rc); + RETURN(rc); + } + + if (it_disposition(it, DISP_OPEN_OPEN) && + !it_open_error(DISP_OPEN_OPEN, it)) { + /* + * If this is a successful OPEN request, we need to set + * replay handler and data early, so that if replay + * happens immediately after swabbing below, new reply + * is swabbed by that handler correctly. + */ + mdc_set_open_replay_data(NULL, NULL, it); + } + + if (it_disposition(it, DISP_OPEN_CREATE) && + !it_open_error(DISP_OPEN_CREATE, it)) { + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_CREATE); + } + + if (body->mbo_valid & (OBD_MD_FLDIREA | OBD_MD_FLEASIZE)) { + void *eadata; + + mdc_update_max_ea_from_body(exp, body); + + /* + * The eadata is opaque; just check that it is there. + * Eventually, obd_unpackmd() will check the contents. + */ + eadata = req_capsule_server_sized_get(pill, &RMF_MDT_MD, + body->mbo_eadatasize); + if (eadata == NULL) + RETURN(-EPROTO); + + /* save LVB data and length if for layout lock */ + lvb_data = eadata; + lvb_len = body->mbo_eadatasize; + + /* + * We save the reply LOV EA in case we have to replay a + * create for recovery. If we didn't allocate a large + * enough request buffer above we need to reallocate it + * here to hold the actual LOV EA. + * + * To not save LOV EA if request is not going to replay + * (for example error one). + */ + if ((it->it_op & IT_OPEN) && req->rq_replay) { + rc = mdc_save_lovea(req, eadata, + body->mbo_eadatasize); + if (rc) { + body->mbo_valid &= ~OBD_MD_FLEASIZE; + body->mbo_eadatasize = 0; + rc = 0; + } + } + } + } else if (it->it_op & IT_LAYOUT) { + /* maybe the lock was granted right away and layout + * is packed into RMF_DLM_LVB of req + */ + lvb_len = req_capsule_get_size(pill, &RMF_DLM_LVB, RCL_SERVER); + CDEBUG(D_INFO, "%s: layout return lvb %d transno %lld\n", + class_exp2obd(exp)->obd_name, lvb_len, req->rq_transno); + if (lvb_len > 0) { + lvb_data = req_capsule_server_sized_get(pill, + &RMF_DLM_LVB, lvb_len); + if (lvb_data == NULL) + RETURN(-EPROTO); + + /** + * save replied layout data to the request buffer for + * recovery consideration (lest MDS reinitialize + * another set of OST objects). + */ + if (req->rq_transno) + (void)mdc_save_lovea(req, lvb_data, lvb_len); + } + } + + /* fill in stripe data for layout lock. + * LU-6581: trust layout data only if layout lock is granted. The MDT + * has stopped sending layout unless the layout lock is granted. The + * client still does this checking in case it's talking with an old + * server. 
- Jinshan + */ + lock = ldlm_handle2lock(lockh); + if (lock == NULL) + RETURN(rc); + + if (ldlm_has_layout(lock) && lvb_data != NULL && + !(lockrep->lock_flags & LDLM_FL_BLOCKED_MASK)) { + void *lmm; + + LDLM_DEBUG(lock, "layout lock returned by: %s, lvb_len: %d", + ldlm_it2str(it->it_op), lvb_len); + + OBD_ALLOC_LARGE(lmm, lvb_len); + if (lmm == NULL) + GOTO(out_lock, rc = -ENOMEM); + + memcpy(lmm, lvb_data, lvb_len); + + /* install lvb_data */ + lock_res_and_lock(lock); + if (lock->l_lvb_data == NULL) { + lock->l_lvb_type = LVB_T_LAYOUT; + lock->l_lvb_data = lmm; + lock->l_lvb_len = lvb_len; + lmm = NULL; + } + unlock_res_and_lock(lock); + if (lmm != NULL) + OBD_FREE_LARGE(lmm, lvb_len); + } + + if (ldlm_has_dom(lock)) { + LASSERT(lock->l_glimpse_ast == mdc_ldlm_glimpse_ast); + + body = req_capsule_server_get(pill, &RMF_MDT_BODY); + if (!(body->mbo_valid & OBD_MD_DOM_SIZE)) { + LDLM_ERROR(lock, "%s: DoM lock without size.", + exp->exp_obd->obd_name); + GOTO(out_lock, rc = -EPROTO); + } + + LDLM_DEBUG(lock, "DoM lock is returned by: %s, size: %llu", + ldlm_it2str(it->it_op), body->mbo_dom_size); + + lock_res_and_lock(lock); + mdc_body2lvb(body, &lock->l_ost_lvb); + ldlm_lock_allow_match_locked(lock); + unlock_res_and_lock(lock); + } +out_lock: + LDLM_LOCK_PUT(lock); + + RETURN(rc); +} + +static inline bool mdc_skip_mod_rpc_slot(const struct lookup_intent *it) +{ + if (it != NULL && + (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP || + it->it_op == IT_READDIR || it->it_op == IT_GETXATTR || + (it->it_op == IT_LAYOUT && !(it->it_flags & MDS_FMODE_WRITE)))) + return true; + return false; +} + +/* We always reserve enough space in the reply packet for a stripe MD, because + * we don't know in advance the file type. + */ +static int mdc_enqueue_base(struct obd_export *exp, + struct ldlm_enqueue_info *einfo, + const union ldlm_policy_data *policy, + struct lookup_intent *it, + struct md_op_data *op_data, + struct lustre_handle *lockh, + __u64 extra_lock_flags) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req; + __u64 flags, saved_flags = extra_lock_flags; + struct ldlm_res_id res_id; + static const union ldlm_policy_data lookup_policy = { + .l_inodebits = { MDS_INODELOCK_LOOKUP } }; + static const union ldlm_policy_data update_policy = { + .l_inodebits = { MDS_INODELOCK_UPDATE } }; + static const union ldlm_policy_data layout_policy = { + .l_inodebits = { MDS_INODELOCK_LAYOUT } }; + static const union ldlm_policy_data getxattr_policy = { + .l_inodebits = { MDS_INODELOCK_XATTR } }; + int generation, resends = 0; + struct ldlm_reply *lockrep; + struct obd_import *imp = class_exp2cliimp(exp); + __u32 acl_bufsize; + enum lvb_type lvb_type = 0; + int rc; + + ENTRY; + LASSERTF(!it || einfo->ei_type == LDLM_IBITS, "lock type %d\n", + einfo->ei_type); + fid_build_reg_res_name(&op_data->op_fid1, &res_id); + + if (it != NULL) { + LASSERT(policy == NULL); + + saved_flags |= LDLM_FL_HAS_INTENT; + if (it->it_op & (IT_GETATTR | IT_READDIR)) + policy = &update_policy; + else if (it->it_op & IT_LAYOUT) + policy = &layout_policy; + else if (it->it_op & IT_GETXATTR) + policy = &getxattr_policy; + else + policy = &lookup_policy; + } + + generation = obd->u.cli.cl_import->imp_generation; + if (!it || (it->it_op & (IT_OPEN | IT_CREAT))) + acl_bufsize = min_t(__u32, imp->imp_connect_data.ocd_max_easize, + XATTR_SIZE_MAX); + else + acl_bufsize = LUSTRE_POSIX_ACL_MAX_SIZE_OLD; + +resend: + flags = saved_flags; + if (it == NULL) { + /* The only way right now is FLOCK. 
*/ + LASSERTF(einfo->ei_type == LDLM_FLOCK, "lock type %d\n", + einfo->ei_type); + res_id.name[3] = LDLM_FLOCK; + req = ldlm_enqueue_pack(exp, 0); + } else if (it->it_op & IT_OPEN) { + req = mdc_intent_open_pack(exp, it, op_data, acl_bufsize); + } else if (it->it_op & (IT_GETATTR | IT_LOOKUP)) { + req = mdc_intent_getattr_pack(exp, it, op_data, acl_bufsize); + } else if (it->it_op & IT_READDIR) { + req = mdc_enqueue_pack(exp, 0); + } else if (it->it_op & IT_LAYOUT) { + if (!imp_connect_lvb_type(imp)) + RETURN(-EOPNOTSUPP); + req = mdc_intent_layout_pack(exp, it, op_data); + lvb_type = LVB_T_LAYOUT; + } else if (it->it_op & IT_GETXATTR) { + req = mdc_intent_getxattr_pack(exp, it, op_data); + } else { + LBUG(); + RETURN(-EINVAL); + } + + if (IS_ERR(req)) + RETURN(PTR_ERR(req)); + + if (resends) { + req->rq_generation_set = 1; + req->rq_import_generation = generation; + req->rq_sent = ktime_get_real_seconds() + resends; + } + + einfo->ei_req_slot = !(op_data->op_cli_flags & CLI_NO_SLOT); + einfo->ei_mod_slot = !mdc_skip_mod_rpc_slot(it); + + /* With Data-on-MDT the glimpse callback is needed too. + * It is set here in advance but not in mdc_finish_enqueue() + * to avoid possible races. It is safe to have glimpse handler + * for non-DOM locks and costs nothing. + */ + if (einfo->ei_cb_gl == NULL) + einfo->ei_cb_gl = mdc_ldlm_glimpse_ast; + + rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, policy, &flags, NULL, + 0, lvb_type, lockh, 0); + + if (!it) { + /* For flock requests we immediately return without further + * delay and let the caller deal with the rest, since the rest + * of this function's metadata processing makes no sense for + * flock requests anyway. But if there is a problem during comms + * with the server (-ETIMEDOUT) or any signal/kill attempt + * (-EINTR), we cannot rely on the caller; this mainly matters + * for F_UNLCKs (explicit, or generated automatically by the + * kernel to clean up current flocks upon exit) that can't be + * discarded. + */ + ptlrpc_req_finished(req); + if (((rc == -EINTR) || (rc == -ETIMEDOUT)) && + (einfo->ei_type == LDLM_FLOCK) && + (einfo->ei_mode == LCK_NL)) + goto resend; + RETURN(rc); + } + + if (rc < 0) { + CDEBUG(D_INFO, + "%s: ldlm_cli_enqueue "DFID":"DFID"=%s failed: rc = %d\n", + obd->obd_name, PFID(&op_data->op_fid1), + PFID(&op_data->op_fid2), op_data->op_name ?: "", rc); + + mdc_clear_replay_flag(req, rc); + ptlrpc_req_finished(req); + RETURN(rc); + } + + lockrep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + LASSERT(lockrep != NULL); + + lockrep->lock_policy_res2 = + ptlrpc_status_ntoh(lockrep->lock_policy_res2); + + /* Retry infinitely when the server returns -EINPROGRESS for the + * intent operation; when the server returns -EINPROGRESS for + * acquiring the intent lock, we'll retry in after_reply().
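+ * A resend that crosses an import generation change means the client + * was evicted in the meantime; that case gives up with -EIO below.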
+ */ + if (it && (int)lockrep->lock_policy_res2 == -EINPROGRESS) { + mdc_clear_replay_flag(req, rc); + ptlrpc_req_finished(req); + if (generation == obd->u.cli.cl_import->imp_generation) { + if (signal_pending(current)) + RETURN(-EINTR); + + resends++; + CDEBUG(D_HA, "%s: resend:%d op:%d "DFID"/"DFID"\n", + obd->obd_name, resends, it->it_op, + PFID(&op_data->op_fid1), + PFID(&op_data->op_fid2)); + goto resend; + } else { + CDEBUG(D_HA, "resend cross eviction\n"); + RETURN(-EIO); + } + } + + if ((int)lockrep->lock_policy_res2 == -ERANGE && + it->it_op & (IT_OPEN | IT_GETATTR | IT_LOOKUP) && + acl_bufsize == LUSTRE_POSIX_ACL_MAX_SIZE_OLD) { + mdc_clear_replay_flag(req, -ERANGE); + ptlrpc_req_finished(req); + acl_bufsize = min_t(__u32, imp->imp_connect_data.ocd_max_easize, + XATTR_SIZE_MAX); + goto resend; + } + + rc = mdc_finish_enqueue(exp, req, einfo, it, lockh, rc); + if (rc < 0) { + if (lustre_handle_is_used(lockh)) { + ldlm_lock_decref(lockh, einfo->ei_mode); + memset(lockh, 0, sizeof(*lockh)); + } + ptlrpc_req_finished(req); + + it->it_lock_handle = 0; + it->it_lock_mode = 0; + it->it_request = NULL; + } + + RETURN(rc); +} + +int mdc_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, + const union ldlm_policy_data *policy, + struct md_op_data *op_data, + struct lustre_handle *lockh, __u64 extra_lock_flags) +{ + return mdc_enqueue_base(exp, einfo, policy, NULL, + op_data, lockh, extra_lock_flags); +} + +static int mdc_finish_intent_lock(struct obd_export *exp, + struct ptlrpc_request *request, + struct md_op_data *op_data, + struct lookup_intent *it, + struct lustre_handle *lockh) +{ + struct lustre_handle old_lock; + struct ldlm_lock *lock; + int rc = 0; + + ENTRY; + LASSERT(request != NULL); + LASSERT(request != LP_POISON); + LASSERT(request->rq_repmsg != LP_POISON); + + if (it->it_op & IT_READDIR) + RETURN(0); + + if (it->it_op & (IT_GETXATTR | IT_LAYOUT)) { + if (it->it_status != 0) + GOTO(out, rc = it->it_status); + } else { + if (!it_disposition(it, DISP_IT_EXECD)) { + /* The server failed before it even started executing + * the intent, i.e. because it couldn't unpack the + * request. + */ + LASSERT(it->it_status != 0); + GOTO(out, rc = it->it_status); + } + rc = it_open_error(DISP_IT_EXECD, it); + if (rc) + GOTO(out, rc); + + rc = it_open_error(DISP_LOOKUP_EXECD, it); + if (rc) + GOTO(out, rc); + + /* keep requests around for the multiple phases of the call + * this shows the DISP_XX must guarantee we make it into the + * call + */ + if (!it_disposition(it, DISP_ENQ_CREATE_REF) && + it_disposition(it, DISP_OPEN_CREATE) && + !it_open_error(DISP_OPEN_CREATE, it)) { + it_set_disposition(it, DISP_ENQ_CREATE_REF); + /* balanced in ll_create_node */ + ptlrpc_request_addref(request); + } + if (!it_disposition(it, DISP_ENQ_OPEN_REF) && + it_disposition(it, DISP_OPEN_OPEN) && + !it_open_error(DISP_OPEN_OPEN, it)) { + it_set_disposition(it, DISP_ENQ_OPEN_REF); + /* balanced in ll_file_open */ + ptlrpc_request_addref(request); + /* eviction in middle of open RPC processing b=11546 */ + OBD_FAIL_TIMEOUT(OBD_FAIL_MDC_ENQUEUE_PAUSE, + obd_timeout); + } + + if (it->it_op & IT_CREAT) { + /* XXX this belongs in ll_create_it */ + } else if (it->it_op == IT_OPEN) { + LASSERT(!it_disposition(it, DISP_OPEN_CREATE)); + } else { + LASSERT(it->it_op & (IT_GETATTR | IT_LOOKUP)); + } + } + + /* If we already have a matching lock, then cancel the new + * one. 
We have to set the data here instead of in + * mdc_enqueue, because we need to use the child's inode as + * the l_ast_data to match, and that's not available until + * intent_finish has performed the iget(). + */ + lock = ldlm_handle2lock(lockh); + if (lock) { + union ldlm_policy_data policy = lock->l_policy_data; + + LDLM_DEBUG(lock, "matching against this"); + + if (it_has_reply_body(it)) { + struct mdt_body *body; + + body = req_capsule_server_get(&request->rq_pill, + &RMF_MDT_BODY); + /* mdc_enqueue checked */ + LASSERT(body != NULL); + LASSERTF(fid_res_name_eq(&body->mbo_fid1, + &lock->l_resource->lr_name), + "Lock res_id: "DLDLMRES", fid: "DFID"\n", + PLDLMRES(lock->l_resource), + PFID(&body->mbo_fid1)); + } + LDLM_LOCK_PUT(lock); + + memcpy(&old_lock, lockh, sizeof(*lockh)); + if (ldlm_lock_match(NULL, LDLM_FL_BLOCK_GRANTED, NULL, + LDLM_IBITS, &policy, LCK_NL, &old_lock)) { + ldlm_lock_decref_and_cancel(lockh, it->it_lock_mode); + memcpy(lockh, &old_lock, sizeof(old_lock)); + it->it_lock_handle = lockh->cookie; + } + } + + EXIT; +out: + CDEBUG(D_DENTRY, + "D_IT dentry=%.*s intent=%s status=%d disp=%x: rc = %d\n", + (int)op_data->op_namelen, op_data->op_name, + ldlm_it2str(it->it_op), it->it_status, it->it_disposition, rc); + + return rc; +} + +int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, + struct lu_fid *fid, __u64 *bits) +{ + /* We could just return 1 immediately, but as we should only be called + * in revalidate_it if we already have a lock, let's verify that. + */ + struct ldlm_res_id res_id; + struct lustre_handle lockh; + union ldlm_policy_data policy; + enum ldlm_mode mode; + + ENTRY; + if (it->it_lock_handle) { + lockh.cookie = it->it_lock_handle; + mode = ldlm_revalidate_lock_handle(&lockh, bits); + } else { + fid_build_reg_res_name(fid, &res_id); + switch (it->it_op) { + case IT_GETATTR: + /* File attributes are held under multiple bits: + * nlink is under lookup lock, size and times are + * under UPDATE lock and recently we've also got + * a separate permissions lock for owner/group/acl that + * were protected by lookup lock before. + * Getattr must provide all of that information, + * so we need to ensure we have all of those locks. + * Unfortunately, if the bits are split across multiple + * locks, there's no easy way to match all of them here, + * so an extra RPC would be performed to fetch all + * of those bits at once for now. + */ + /* For new MDTs(> 2.4), UPDATE|PERM should be enough, + * but for old MDTs (< 2.4), permission is covered + * by LOOKUP lock, so it needs to match all bits here. + */ + policy.l_inodebits.bits = MDS_INODELOCK_UPDATE | + MDS_INODELOCK_PERM; + break; + case IT_READDIR: + policy.l_inodebits.bits = MDS_INODELOCK_UPDATE; + break; + case IT_LAYOUT: + policy.l_inodebits.bits = MDS_INODELOCK_LAYOUT; + break; + default: + policy.l_inodebits.bits = MDS_INODELOCK_LOOKUP; + break; + } + + mode = mdc_lock_match(exp, LDLM_FL_BLOCK_GRANTED, fid, + LDLM_IBITS, &policy, + LCK_CR | LCK_CW | LCK_PR | LCK_PW, + &lockh); + } + + if (mode) { + it->it_lock_handle = lockh.cookie; + it->it_lock_mode = mode; + } else { + it->it_lock_handle = 0; + it->it_lock_mode = 0; + } + + RETURN(!!mode); +} + +/* + * This long block is all about fixing up the lock and request state + * so that it is correct as of the moment _before_ the operation was + * applied; that way, the VFS will think that everything is normal and + * call Lustre's regular VFS methods. 
+ * + * If we're performing a creation, that means that unless the creation + * failed with EEXIST, we should fake up a negative dentry. + * + * For everything else, we want to lookup to succeed. + * + * One additional note: if CREATE or OPEN succeeded, we add an extra + * reference to the request because we need to keep it around until + * ll_create/ll_open gets called. + * + * The server will return to us, in it_disposition, an indication of + * exactly what it_status refers to. + * + * If DISP_OPEN_OPEN is set, then it_status refers to the open() call, + * otherwise if DISP_OPEN_CREATE is set, then it status is the + * creation failure mode. In either case, one of DISP_LOOKUP_NEG or + * DISP_LOOKUP_POS will be set, indicating whether the child lookup + * was successful. + * + * Else, if DISP_LOOKUP_EXECD then it_status is the rc of the + * child lookup. + */ +int mdc_intent_lock(struct obd_export *exp, struct md_op_data *op_data, + struct lookup_intent *it, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, __u64 extra_lock_flags) +{ + struct ldlm_enqueue_info einfo = { + .ei_type = LDLM_IBITS, + .ei_mode = it_to_lock_mode(it), + .ei_cb_bl = cb_blocking, + .ei_cb_cp = ldlm_completion_ast, + .ei_cb_gl = mdc_ldlm_glimpse_ast, + }; + struct lustre_handle lockh; + int rc = 0; + + ENTRY; + LASSERT(it); + CDEBUG(D_DLMTRACE, "(name: %.*s,"DFID") in obj "DFID + ", intent: %s flags %#llo\n", (int)op_data->op_namelen, + op_data->op_name, PFID(&op_data->op_fid2), + PFID(&op_data->op_fid1), ldlm_it2str(it->it_op), + it->it_flags); + + lockh.cookie = 0; + /* MDS_FID_OP is not a revalidate case */ + if (fid_is_sane(&op_data->op_fid2) && + (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_READDIR)) && + !(op_data->op_bias & MDS_FID_OP)) { + /* We could just return 1 immediately, but since we should only + * be called in revalidate_it if we already have a lock, let's + * verify that. + */ + it->it_lock_handle = 0; + rc = mdc_revalidate_lock(exp, it, &op_data->op_fid2, NULL); + /* Only return failure if it was not GETATTR by cfid + * (from inode_revalidate()). + */ + if (rc || op_data->op_namelen != 0) + RETURN(rc); + } + + /* For case if upper layer did not alloc fid, do it now. 
*/ + if (!fid_is_sane(&op_data->op_fid2) && it->it_op & IT_CREAT) { + rc = mdc_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); + if (rc < 0) { + CERROR("%s: cannot allocate new FID: rc=%d\n", + exp->exp_obd->obd_name, rc); + RETURN(rc); + } + } + + rc = mdc_enqueue_base(exp, &einfo, NULL, it, op_data, &lockh, + extra_lock_flags); + if (rc < 0) + RETURN(rc); + + *reqp = it->it_request; + rc = mdc_finish_intent_lock(exp, *reqp, op_data, it, &lockh); + RETURN(rc); +} + +static int mdc_intent_getattr_async_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *args, int rc) +{ + struct mdc_getattr_args *ga = args; + struct obd_export *exp = ga->ga_exp; + struct md_enqueue_info *minfo = ga->ga_minfo; + struct ldlm_enqueue_info *einfo = &minfo->mi_einfo; + struct lookup_intent *it = &minfo->mi_it; + struct lustre_handle *lockh = &minfo->mi_lockh; + struct ldlm_reply *lockrep; + __u64 flags = LDLM_FL_HAS_INTENT; + + ENTRY; + if (OBD_FAIL_CHECK(OBD_FAIL_MDC_GETATTR_ENQUEUE)) + rc = -ETIMEDOUT; + + rc = ldlm_cli_enqueue_fini(exp, req, einfo, 1, &flags, NULL, 0, + lockh, rc, true); + if (rc < 0) { + CERROR("%s: ldlm_cli_enqueue_fini() failed: rc = %d\n", + exp->exp_obd->obd_name, rc); + mdc_clear_replay_flag(req, rc); + GOTO(out, rc); + } + + lockrep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + LASSERT(lockrep != NULL); + + lockrep->lock_policy_res2 = + ptlrpc_status_ntoh(lockrep->lock_policy_res2); + + rc = mdc_finish_enqueue(exp, req, einfo, it, lockh, rc); + if (rc) + GOTO(out, rc); + + rc = mdc_finish_intent_lock(exp, req, &minfo->mi_data, it, lockh); + EXIT; + +out: + minfo->mi_cb(req, minfo, rc); + return 0; +} + +int mdc_intent_getattr_async(struct obd_export *exp, + struct md_enqueue_info *minfo) +{ + struct md_op_data *op_data = &minfo->mi_data; + struct lookup_intent *it = &minfo->mi_it; + struct ptlrpc_request *req; + struct mdc_getattr_args *ga; + struct ldlm_res_id res_id; + union ldlm_policy_data policy = { + .l_inodebits = { MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE } + }; + __u64 flags = LDLM_FL_HAS_INTENT; + int rc = 0; + + ENTRY; + CDEBUG(D_DLMTRACE, + "name: %.*s in inode "DFID", intent: %s flags %#llo\n", + (int)op_data->op_namelen, op_data->op_name, + PFID(&op_data->op_fid1), ldlm_it2str(it->it_op), it->it_flags); + + fid_build_reg_res_name(&op_data->op_fid1, &res_id); + /* If the MDT return -ERANGE because of large ACL, then the sponsor + * of the async getattr RPC will handle that by itself. + */ + req = mdc_intent_getattr_pack(exp, it, op_data, + LUSTRE_POSIX_ACL_MAX_SIZE_OLD); + if (IS_ERR(req)) + RETURN(PTR_ERR(req)); + + /* With Data-on-MDT the glimpse callback is needed too. + * It is set here in advance but not in mdc_finish_enqueue() + * to avoid possible races. It is safe to have glimpse handler + * for non-DOM locks and costs nothing. 
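+ * (This mirrors the identical glimpse-callback setup in + * mdc_enqueue_base() above.)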
+ */ + if (minfo->mi_einfo.ei_cb_gl == NULL) + minfo->mi_einfo.ei_cb_gl = mdc_ldlm_glimpse_ast; + + rc = ldlm_cli_enqueue(exp, &req, &minfo->mi_einfo, &res_id, &policy, + &flags, NULL, 0, LVB_T_NONE, &minfo->mi_lockh, 1); + if (rc < 0) { + ptlrpc_req_finished(req); + RETURN(rc); + } + + ga = ptlrpc_req_async_args(ga, req); + ga->ga_exp = exp; + ga->ga_minfo = minfo; + + req->rq_interpret_reply = mdc_intent_getattr_async_interpret; + ptlrpcd_add_req(req); + + RETURN(0); +} diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_reint.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_reint.c new file mode 100644 index 0000000000000..f75d559981d5a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_reint.c @@ -0,0 +1,536 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_MDC + +#include +#include + +#include +#include "mdc_internal.h" +#include + +/* mdc_setattr does its own semaphore handling */ +static int mdc_reint(struct ptlrpc_request *request, int level) +{ + int rc; + + request->rq_send_state = level; + + ptlrpc_get_mod_rpc_slot(request); + rc = ptlrpc_queue_wait(request); + ptlrpc_put_mod_rpc_slot(request); + if (rc) + CDEBUG(D_INFO, "error in handling %d\n", rc); + else if (!req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY)) { + rc = -EPROTO; + } + return rc; +} + +/* Find and cancel locally locks matched by inode @bits & @mode in the resource + * found by @fid. Found locks are added into @cancel list. Returns the amount of + * locks added to @cancels list. */ +int mdc_resource_get_unused_res(struct obd_export *exp, + struct ldlm_res_id *res_id, + struct list_head *cancels, + enum ldlm_mode mode, __u64 bits) +{ + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; + union ldlm_policy_data policy = { { 0 } }; + struct ldlm_resource *res; + int count; + + ENTRY; + + /* Return, i.e. cancel nothing, only if ELC is supported (flag in + * export) but disabled through procfs (flag in NS). + * + * This distinguishes from a case when ELC is not supported originally, + * when we still want to cancel locks in advance and just cancel them + * locally, without sending any RPC. */ + if (exp_connect_cancelset(exp) && !ns_connect_cancelset(ns)) + RETURN(0); + + res = ldlm_resource_get(ns, NULL, res_id, 0, 0); + if (IS_ERR(res)) + RETURN(0); + LDLM_RESOURCE_ADDREF(res); + /* Initialize ibits lock policy. 
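+ * Only unused local locks matching these bits are gathered onto the + * @cancels list by ldlm_cancel_resource_local() below.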
*/ + policy.l_inodebits.bits = bits; + count = ldlm_cancel_resource_local(res, cancels, &policy, mode, 0, 0, + NULL); + LDLM_RESOURCE_DELREF(res); + ldlm_resource_putref(res); + RETURN(count); +} + +int mdc_resource_get_unused(struct obd_export *exp, const struct lu_fid *fid, + struct list_head *cancels, enum ldlm_mode mode, + __u64 bits) +{ + struct ldlm_res_id res_id; + + fid_build_reg_res_name(fid, &res_id); + return mdc_resource_get_unused_res(exp, &res_id, cancels, mode, bits); +} + +int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data, + void *ea, size_t ealen, struct ptlrpc_request **request) +{ + LIST_HEAD(cancels); + struct ptlrpc_request *req; + int count = 0, rc; + __u64 bits; + ENTRY; + + LASSERT(op_data != NULL); + + bits = MDS_INODELOCK_UPDATE; + if (op_data->op_attr.ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) + bits |= MDS_INODELOCK_LOOKUP; + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1))) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, bits); + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_REINT_SETATTR); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_EPOCH, RCL_CLIENT, 0); + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, ealen); + req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_CLIENT, 0); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + if (op_data->op_attr.ia_valid & (ATTR_MTIME | ATTR_CTIME)) + CDEBUG(D_INODE, "setting mtime %lld, ctime %lld\n", + (s64)op_data->op_attr.ia_mtime.tv_sec, + (s64)op_data->op_attr.ia_ctime.tv_sec); + mdc_setattr_pack(&req->rq_pill, op_data, ea, ealen); + + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, 0); + + ptlrpc_request_set_replen(req); + + rc = mdc_reint(req, LUSTRE_IMP_FULL); + if (rc == -ERESTARTSYS) + rc = 0; + + *request = req; + + RETURN(rc); +} + +int mdc_create(struct obd_export *exp, struct md_op_data *op_data, + const void *data, size_t datalen, + umode_t mode, uid_t uid, gid_t gid, + kernel_cap_t cap_effective, __u64 rdev, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + int level, rc; + int count, resends = 0; + struct obd_import *import = exp->exp_obd->u.cli.cl_import; + int generation = import->imp_generation; + LIST_HEAD(cancels); + ENTRY; + + /* For case if upper layer did not alloc fid, do it now. */ + if (!fid_is_sane(&op_data->op_fid2)) { + /* + * mdc_fid_alloc() may return errno 1 in case of switch to new + * sequence, handle this. + */ + rc = mdc_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); + if (rc < 0) + RETURN(rc); + } + +rebuild: + count = 0; + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1))) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_REINT_CREATE_ACL); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, + data && datalen ? datalen : 0); + + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX_NAME, + RCL_CLIENT, op_data->op_file_secctx_name != NULL ? 
+ strlen(op_data->op_file_secctx_name) + 1 : 0); + + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, RCL_CLIENT, + op_data->op_file_secctx_size); + + req_capsule_set_size(&req->rq_pill, &RMF_FILE_ENCCTX, RCL_CLIENT, + op_data->op_file_encctx_size); + + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(req); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(req->rq_sepol) ? + strlen(req->rq_sepol) + 1 : 0); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + /* + * mdc_create_pack() fills msg->bufs[1] with name and msg->bufs[2] with + * tgt, for symlinks or lov MD data. + */ + mdc_create_pack(&req->rq_pill, op_data, data, datalen, mode, uid, + gid, cap_effective, rdev); + + ptlrpc_request_set_replen(req); + + /* ask ptlrpc not to resend on EINPROGRESS since we have our own retry + * logic here */ + req->rq_no_retry_einprogress = 1; + + if (resends) { + req->rq_generation_set = 1; + req->rq_import_generation = generation; + req->rq_sent = ktime_get_real_seconds() + resends; + } + level = LUSTRE_IMP_FULL; + resend: + rc = mdc_reint(req, level); + + /* Resend if we were told to. */ + if (rc == -ERESTARTSYS) { + level = LUSTRE_IMP_RECOVER; + goto resend; + } else if (rc == -EINPROGRESS) { + /* Retry create infinitely until succeed or get other + * error code or interrupted. */ + ptlrpc_req_finished(req); + if (generation == import->imp_generation) { + if (signal_pending(current)) + RETURN(-EINTR); + + resends++; + CDEBUG(D_HA, "%s: resend:%d create on "DFID"/"DFID"\n", + exp->exp_obd->obd_name, resends, + PFID(&op_data->op_fid1), + PFID(&op_data->op_fid2)); + goto rebuild; + } else { + CDEBUG(D_HA, "resend cross eviction\n"); + RETURN(-EIO); + } + } + + *request = req; + RETURN(rc); +} + +int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + LIST_HEAD(cancels); + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req = *request; + int count = 0, rc; + ENTRY; + + LASSERT(req == NULL); + + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1))) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + if ((op_data->op_flags & MF_MDC_CANCEL_FID3) && + (fid_is_sane(&op_data->op_fid3))) + /* cancel DOM lock only if it has no data to flush */ + count += mdc_resource_get_unused(exp, &op_data->op_fid3, + &cancels, LCK_EX, + op_data->op_cli_flags & + CLI_DIRTY_DATA ? + MDS_INODELOCK_ELC : + MDS_INODELOCK_FULL); + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_REINT_UNLINK); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(req); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(req->rq_sepol) ? 
+ strlen(req->rq_sepol) + 1 : 0); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_unlink_pack(&req->rq_pill, op_data); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + obd->u.cli.cl_default_mds_easize); + ptlrpc_request_set_replen(req); + + *request = req; + + rc = mdc_reint(req, LUSTRE_IMP_FULL); + if (rc == -ERESTARTSYS) + rc = 0; + RETURN(rc); +} + +int mdc_link(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + LIST_HEAD(cancels); + struct ptlrpc_request *req; + int count = 0, rc; + ENTRY; + + if ((op_data->op_flags & MF_MDC_CANCEL_FID2) && + (fid_is_sane(&op_data->op_fid2))) + count = mdc_resource_get_unused(exp, &op_data->op_fid2, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1))) + count += mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_REINT_LINK); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(req); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(req->rq_sepol) ? + strlen(req->rq_sepol) + 1 : 0); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_link_pack(&req->rq_pill, op_data); + ptlrpc_request_set_replen(req); + + rc = mdc_reint(req, LUSTRE_IMP_FULL); + *request = req; + if (rc == -ERESTARTSYS) + rc = 0; + + RETURN(rc); +} + +int mdc_rename(struct obd_export *exp, struct md_op_data *op_data, + const char *old, size_t oldlen, const char *new, size_t newlen, + struct ptlrpc_request **request) +{ + LIST_HEAD(cancels); + struct obd_device *obd = exp->exp_obd; + struct ptlrpc_request *req; + int count = 0, rc; + + ENTRY; + + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1))) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + if ((op_data->op_flags & MF_MDC_CANCEL_FID2) && + (fid_is_sane(&op_data->op_fid2))) + count += mdc_resource_get_unused(exp, &op_data->op_fid2, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + if ((op_data->op_flags & MF_MDC_CANCEL_FID3) && + (fid_is_sane(&op_data->op_fid3))) + count += mdc_resource_get_unused(exp, &op_data->op_fid3, + &cancels, LCK_EX, + MDS_INODELOCK_LOOKUP); + if ((op_data->op_flags & MF_MDC_CANCEL_FID4) && + (fid_is_sane(&op_data->op_fid4))) + count += mdc_resource_get_unused(exp, &op_data->op_fid4, + &cancels, LCK_EX, + MDS_INODELOCK_ELC); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + op_data->op_cli_flags & CLI_MIGRATE ? 
+ &RQF_MDS_REINT_MIGRATE : &RQF_MDS_REINT_RENAME); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, oldlen + 1); + req_capsule_set_size(&req->rq_pill, &RMF_SYMTGT, RCL_CLIENT, newlen+1); + if (op_data->op_cli_flags & CLI_MIGRATE) + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, + op_data->op_data_size); + + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(req); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(req->rq_sepol) ? + strlen(req->rq_sepol) + 1 : 0); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + if (exp_connect_cancelset(exp) && req) + ldlm_cli_cancel_list(&cancels, count, req, 0); + + if (op_data->op_cli_flags & CLI_MIGRATE) + mdc_migrate_pack(&req->rq_pill, op_data, old, oldlen); + else + mdc_rename_pack(&req->rq_pill, op_data, old, oldlen, + new, newlen); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + obd->u.cli.cl_default_mds_easize); + ptlrpc_request_set_replen(req); + + rc = mdc_reint(req, LUSTRE_IMP_FULL); + *request = req; + if (rc == -ERESTARTSYS) + rc = 0; + + RETURN(rc); +} + +int mdc_file_resync(struct obd_export *exp, struct md_op_data *op_data) +{ + LIST_HEAD(cancels); + struct ptlrpc_request *req; + struct ldlm_lock *lock; + struct mdt_rec_resync *rec; + int count = 0, rc; + ENTRY; + + if (op_data->op_flags & MF_MDC_CANCEL_FID1 && + fid_is_sane(&op_data->op_fid1)) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_LAYOUT); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_REINT_RESYNC); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + BUILD_BUG_ON(sizeof(*rec) != sizeof(struct mdt_rec_reint)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + rec->rs_opcode = REINT_RESYNC; + rec->rs_fsuid = op_data->op_fsuid; + rec->rs_fsgid = op_data->op_fsgid; + rec->rs_cap = op_data->op_cap.cap[0]; + rec->rs_fid = op_data->op_fid1; + rec->rs_bias = op_data->op_bias; + rec->rs_mirror_id = op_data->op_mirror_id; + + lock = ldlm_handle2lock(&op_data->op_lease_handle); + if (lock != NULL) { + rec->rs_lease_handle = lock->l_remote_handle; + LDLM_LOCK_PUT(lock); + } + + ptlrpc_request_set_replen(req); + + rc = mdc_reint(req, LUSTRE_IMP_FULL); + if (rc == -ERESTARTSYS) + rc = 0; + + ptlrpc_req_finished(req); + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c new file mode 100644 index 0000000000000..93db70c6dc229 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c @@ -0,0 +1,3077 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_MDC + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mdc_internal.h" + +#define REQUEST_MINOR 244 + +static int mdc_cleanup(struct obd_device *obd); + +static inline int mdc_queue_wait(struct ptlrpc_request *req) +{ + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + int rc; + + /* obd_get_request_slot() ensures that this client has no more + * than cl_max_rpcs_in_flight RPCs simultaneously in flight + * against an MDT. */ + rc = obd_get_request_slot(cli); + if (rc != 0) + return rc; + + rc = ptlrpc_queue_wait(req); + obd_put_request_slot(cli); + + return rc; +} + +/* + * Send MDS_GET_ROOT RPC to fetch root FID. + * + * If \a fileset is not NULL it should contain a subdirectory off + * the ROOT/ directory to be mounted on the client. Return the FID + * of the subdirectory to the client to mount onto its mountpoint. + * + * \param[in] exp MDC export + * \param[in] fileset fileset name, which could be NULL + * \param[out] rootfid root FID of this mountpoint + * + * \retval 0 on success, negative errno on failure + */ +static int mdc_get_root(struct obd_export *exp, const char *fileset, + struct lu_fid *rootfid) +{ + struct ptlrpc_request *req; + struct mdt_body *body; + int rc; + + ENTRY; + + if (fileset && !(exp_connect_flags(exp) & OBD_CONNECT_SUBTREE)) + RETURN(-ENOTSUPP); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_GET_ROOT); + if (req == NULL) + RETURN(-ENOMEM); + + if (fileset != NULL) + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + strlen(fileset) + 1); + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GET_ROOT); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + mdc_pack_body(&req->rq_pill, NULL, 0, 0, -1, 0); + if (fileset != NULL) { + char *name = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + + memcpy(name, fileset, strlen(fileset)); + } + lustre_msg_add_flags(req->rq_reqmsg, LUSTRE_IMP_FULL); + req->rq_send_state = LUSTRE_IMP_FULL; + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + *rootfid = body->mbo_fid1; + CDEBUG(D_NET, "root fid="DFID", last_committed=%llu\n", + PFID(rootfid), lustre_msg_get_last_committed(req->rq_repmsg)); + EXIT; +out: + ptlrpc_req_finished(req); + + return rc; +} + +/* + * This function is now known to always claim that it will receive 4 buffers + * from the server. Even when acl_size and md_size are zero, the RPC header + * will contain 4 fields, and the RPC itself will contain zero-size fields.
This is + * because mdt_getattr*() _always_ returns 4 fields, but if the ACL is not + * needed and thus has zero size, the server shrinks that field to zero size. + * The same applies to md_size. And this is the cause of problems when the + * client expects a smaller number of fields. This issue will be fixed later + * when the client becomes aware of RPC layouts. --umka + */ +static int mdc_getattr_common(struct obd_export *exp, + struct ptlrpc_request *req, + struct md_op_data *op_data) +{ + struct req_capsule *pill = &req->rq_pill; + struct mdt_body *body; + void *eadata; + int rc; + ENTRY; + + /* Request message already built. */ + rc = ptlrpc_queue_wait(req); + if (rc != 0) + RETURN(rc); + + /* sanity check for the reply */ + body = req_capsule_server_get(pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + + CDEBUG(D_NET, "mode: %o\n", body->mbo_mode); + + mdc_update_max_ea_from_body(exp, body); + if (body->mbo_eadatasize != 0) { + eadata = req_capsule_server_sized_get(pill, &RMF_MDT_MD, + body->mbo_eadatasize); + if (eadata == NULL) + RETURN(-EPROTO); + } + + /* If an encryption context was returned by the MDT, put it in op_data + * so that the caller can set it on the inode and save an extra + * getxattr. + */ + if (op_data && op_data->op_valid & OBD_MD_ENCCTX && + body->mbo_valid & OBD_MD_ENCCTX) { + op_data->op_file_encctx = + req_capsule_server_get(pill, &RMF_FILE_ENCCTX); + op_data->op_file_encctx_size = + req_capsule_get_size(pill, &RMF_FILE_ENCCTX, + RCL_SERVER); + } + + RETURN(0); +} + +static void mdc_reset_acl_req(struct ptlrpc_request *req) +{ + spin_lock(&req->rq_early_free_lock); + sptlrpc_cli_free_repbuf(req); + req->rq_repbuf = NULL; + req->rq_repbuf_len = 0; + req->rq_repdata = NULL; + req->rq_reqdata_len = 0; + spin_unlock(&req->rq_early_free_lock); +} + +static int mdc_getattr(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + struct obd_device *obd = class_exp2obd(exp); + struct obd_import *imp = class_exp2cliimp(exp); + __u32 acl_bufsize = LUSTRE_POSIX_ACL_MAX_SIZE_OLD; + int rc; + ENTRY; + + /* Single MDS without an LMV case */ + if (op_data->op_flags & MF_GET_MDT_IDX) { + op_data->op_mds = 0; + RETURN(0); + } + + *request = NULL; + req = ptlrpc_request_alloc(imp, &RQF_MDS_GETATTR); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + /* LU-15245: avoid deadlock with modifying RPCs on MDS_REQUEST_PORTAL */ + req->rq_request_portal = MDS_READPAGE_PORTAL; + +again: + mdc_pack_body(&req->rq_pill, &op_data->op_fid1, op_data->op_valid, + op_data->op_mode, -1, 0); + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, acl_bufsize); + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + op_data->op_mode); + if (exp_connect_encrypt(exp) && op_data->op_valid & OBD_MD_ENCCTX) + req_capsule_set_size(&req->rq_pill, &RMF_FILE_ENCCTX, + RCL_SERVER, + obd->u.cli.cl_max_mds_easize); + else + req_capsule_set_size(&req->rq_pill, &RMF_FILE_ENCCTX, + RCL_SERVER, 0); + ptlrpc_request_set_replen(req); + + rc = mdc_getattr_common(exp, req, op_data); + if (rc) { + if (rc == -ERANGE) { + acl_bufsize = min_t(__u32, + imp->imp_connect_data.ocd_max_easize, + XATTR_SIZE_MAX); + mdc_reset_acl_req(req); + goto again; + } + + ptlrpc_req_finished(req); + } else { + *request = req; + } + + RETURN(rc); +} + +static int mdc_getattr_name(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct
ptlrpc_request *req; + struct obd_import *imp = class_exp2cliimp(exp); + __u32 acl_bufsize = LUSTRE_POSIX_ACL_MAX_SIZE_OLD; + int rc; + ENTRY; + + *request = NULL; + req = ptlrpc_request_alloc(imp, &RQF_MDS_GETATTR_NAME); + if (req == NULL) + RETURN(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR_NAME); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + if (op_data->op_name) { + char *name = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + LASSERT(strnlen(op_data->op_name, op_data->op_namelen) == + op_data->op_namelen); + memcpy(name, op_data->op_name, op_data->op_namelen); + } + +again: + mdc_pack_body(&req->rq_pill, &op_data->op_fid1, op_data->op_valid, + op_data->op_mode, op_data->op_suppgids[0], 0); + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + op_data->op_mode); + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, acl_bufsize); + req_capsule_set_size(&req->rq_pill, &RMF_FILE_ENCCTX, RCL_SERVER, 0); + ptlrpc_request_set_replen(req); + if (op_data->op_bias & MDS_FID_OP) { + struct mdt_body *b = req_capsule_client_get(&req->rq_pill, + &RMF_MDT_BODY); + + if (b) { + b->mbo_valid |= OBD_MD_NAMEHASH; + b->mbo_fid2 = op_data->op_fid2; + } + } + + rc = mdc_getattr_common(exp, req, NULL); + if (rc) { + if (rc == -ERANGE) { + acl_bufsize = min_t(__u32, + imp->imp_connect_data.ocd_max_easize, + XATTR_SIZE_MAX); + mdc_reset_acl_req(req); + goto again; + } + + ptlrpc_req_finished(req); + } else { + *request = req; + } + + RETURN(rc); +} + +static int mdc_xattr_common(struct obd_export *exp,const struct req_format *fmt, + const struct lu_fid *fid, int opcode, u64 valid, + const char *xattr_name, const char *input, + int input_size, int output_size, int flags, + __u32 suppgid, struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + int xattr_namelen = 0; + char *tmp; + int rc; + ENTRY; + + *request = NULL; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), fmt); + if (req == NULL) + RETURN(-ENOMEM); + + if (xattr_name) { + xattr_namelen = strlen(xattr_name) + 1; + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + xattr_namelen); + } + if (input_size) + LASSERT(input); + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, + input_size); + + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(req); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(req->rq_sepol) ? 
+ strlen(req->rq_sepol) + 1 : 0); + + /* Flush local XATTR locks to get rid of a possible cancel RPC */ + if (opcode == MDS_REINT && fid_is_sane(fid) && + exp->exp_connect_data.ocd_ibits_known & MDS_INODELOCK_XATTR) { + LIST_HEAD(cancels); + int count; + + /* Without that packing would fail */ + if (input_size == 0) + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, + RCL_CLIENT, 0); + + count = mdc_resource_get_unused(exp, fid, + &cancels, LCK_EX, + MDS_INODELOCK_XATTR); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + } else { + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, opcode); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + } + + if (opcode == MDS_REINT) { + struct mdt_rec_setxattr *rec; + + BUILD_BUG_ON(sizeof(struct mdt_rec_setxattr) != + sizeof(struct mdt_rec_reint)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + rec->sx_opcode = REINT_SETXATTR; + rec->sx_fsuid = from_kuid(&init_user_ns, current_fsuid()); + rec->sx_fsgid = from_kgid(&init_user_ns, current_fsgid()); + rec->sx_cap = current_cap().cap[0]; + rec->sx_suppgid1 = suppgid; + rec->sx_suppgid2 = -1; + rec->sx_fid = *fid; + rec->sx_valid = valid | OBD_MD_FLCTIME; + rec->sx_time = ktime_get_real_seconds(); + rec->sx_size = output_size; + rec->sx_flags = flags; + } else { + mdc_pack_body(&req->rq_pill, fid, valid, output_size, + suppgid, flags); + /* Avoid deadlock with modifying RPCs on MDS_REQUEST_PORTAL. + * See LU-15245. + */ + req->rq_request_portal = MDS_READPAGE_PORTAL; + } + + if (xattr_name) { + tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + memcpy(tmp, xattr_name, xattr_namelen); + } + if (input_size) { + tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA); + memcpy(tmp, input, input_size); + } + + mdc_file_sepol_pack(&req->rq_pill); + + if (req_capsule_has_field(&req->rq_pill, &RMF_EADATA, RCL_SERVER)) + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, + RCL_SERVER, output_size); + ptlrpc_request_set_replen(req); + + /* make rpc */ + if (opcode == MDS_REINT) + ptlrpc_get_mod_rpc_slot(req); + + rc = ptlrpc_queue_wait(req); + + if (opcode == MDS_REINT) + ptlrpc_put_mod_rpc_slot(req); + + if (rc) + ptlrpc_req_finished(req); + else + *request = req; + RETURN(rc); +} + +static int mdc_setxattr(struct obd_export *exp, const struct lu_fid *fid, + u64 obd_md_valid, const char *name, + const void *value, size_t value_size, + unsigned int xattr_flags, u32 suppgid, + struct ptlrpc_request **req) +{ + LASSERT(obd_md_valid == OBD_MD_FLXATTR || + obd_md_valid == OBD_MD_FLXATTRRM); + + return mdc_xattr_common(exp, &RQF_MDS_REINT_SETXATTR, + fid, MDS_REINT, obd_md_valid, name, + value, value_size, 0, xattr_flags, suppgid, + req); +} + +static int mdc_getxattr(struct obd_export *exp, const struct lu_fid *fid, + u64 obd_md_valid, const char *name, size_t buf_size, + struct ptlrpc_request **req) +{ + struct mdt_body *body; + int rc; + + LASSERT(obd_md_valid == OBD_MD_FLXATTR || + obd_md_valid == OBD_MD_FLXATTRLS); + + /* Message below is checked in sanity-selinux test_20d + * and sanity-sec test_49 + */ + CDEBUG(D_INFO, "%s: get xattr '%s' for "DFID"\n", + exp->exp_obd->obd_name, name, PFID(fid)); + rc = mdc_xattr_common(exp, &RQF_MDS_GETXATTR, fid, MDS_GETXATTR, + obd_md_valid, name, NULL, 0, buf_size, 0, -1, + req); + if (rc < 0) + GOTO(out, rc); + + body = req_capsule_server_get(&(*req)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + /* only detect the xattr size */ + if 
(buf_size == 0) { + /* LU-11109: Older MDTs do not distinguish + * between nonexistent xattrs and zero length + * values in this case. Newer MDTs will return + * -ENODATA or set OBD_MD_FLXATTR. */ + GOTO(out, rc = body->mbo_eadatasize); + } + + if (body->mbo_eadatasize == 0) { + /* LU-11109: Newer MDTs set OBD_MD_FLXATTR on + * success so that we can distinguish between + * zero length value and nonexistent xattr. + * + * If OBD_MD_FLXATTR is not set then we keep + * the old behavior and return -ENODATA for + * getxattr() when mbo_eadatasize is 0. But + * -ENODATA only makes sense for getxattr() + * and not for listxattr(). */ + if (body->mbo_valid & OBD_MD_FLXATTR) + GOTO(out, rc = 0); + else if (obd_md_valid == OBD_MD_FLXATTR) + GOTO(out, rc = -ENODATA); + else + GOTO(out, rc = 0); + } + + GOTO(out, rc = body->mbo_eadatasize); +out: + if (rc < 0) { + ptlrpc_req_finished(*req); + *req = NULL; + } + + return rc; +} + +static int mdc_get_lustre_md(struct obd_export *exp, struct req_capsule *pill, + struct obd_export *dt_exp, + struct obd_export *md_exp, + struct lustre_md *md) +{ + int rc; + ENTRY; + + LASSERT(md); + memset(md, 0, sizeof(*md)); + + md->body = req_capsule_server_get(pill, &RMF_MDT_BODY); + LASSERT(md->body != NULL); + + if (md->body->mbo_valid & OBD_MD_FLEASIZE) { + if (!S_ISREG(md->body->mbo_mode)) { + CDEBUG(D_INFO, "OBD_MD_FLEASIZE set, should be a " + "regular file, but is not\n"); + GOTO(out, rc = -EPROTO); + } + + if (md->body->mbo_eadatasize == 0) { + CDEBUG(D_INFO, "OBD_MD_FLEASIZE set, " + "but eadatasize 0\n"); + GOTO(out, rc = -EPROTO); + } + + md->layout.lb_len = md->body->mbo_eadatasize; + md->layout.lb_buf = req_capsule_server_sized_get(pill, + &RMF_MDT_MD, + md->layout.lb_len); + if (md->layout.lb_buf == NULL) + GOTO(out, rc = -EPROTO); + } else if (md->body->mbo_valid & OBD_MD_FLDIREA) { + const union lmv_mds_md *lmv; + size_t lmv_size; + + if (!S_ISDIR(md->body->mbo_mode)) { + CDEBUG(D_INFO, "OBD_MD_FLDIREA set, should be a " + "directory, but is not\n"); + GOTO(out, rc = -EPROTO); + } + + if (md_exp->exp_obd->obd_type->typ_lu == &mdc_device_type) { + CERROR("%s: no LMV, upgrading from old version?\n", + md_exp->exp_obd->obd_name); + + GOTO(out_acl, rc = 0); + } + + if (md->body->mbo_valid & OBD_MD_MEA) { + lmv_size = md->body->mbo_eadatasize; + if (lmv_size == 0) { + CDEBUG(D_INFO, "OBD_MD_FLDIREA is set, " + "but eadatasize 0\n"); + RETURN(-EPROTO); + } + + lmv = req_capsule_server_sized_get(pill, &RMF_MDT_MD, + lmv_size); + if (lmv == NULL) + GOTO(out, rc = -EPROTO); + + rc = md_unpackmd(md_exp, &md->lmv, lmv, lmv_size); + if (rc < 0) + GOTO(out, rc); + + if (rc < (int)sizeof(*md->lmv)) { + struct lmv_foreign_md *lfm = md->lfm; + + /* short (< sizeof(struct lmv_stripe_md)) + * foreign LMV case + */ + if (lfm->lfm_magic != LMV_MAGIC_FOREIGN) { + CDEBUG(D_INFO, + "lmv size too small: %d < %d\n", + rc, (int)sizeof(*md->lmv)); + GOTO(out, rc = -EPROTO); + } + } + } + + /* since 2.12.58 intent_getattr fetches default LMV */ + if (md->body->mbo_valid & OBD_MD_DEFAULT_MEA) { + lmv_size = sizeof(struct lmv_user_md); + lmv = req_capsule_server_sized_get(pill, + &RMF_DEFAULT_MDT_MD, + lmv_size); + if (!lmv) + GOTO(out, rc = -EPROTO); + + rc = md_unpackmd(md_exp, &md->default_lmv, lmv, + lmv_size); + if (rc < 0) + GOTO(out, rc); + + if (rc < (int)sizeof(*md->default_lmv)) { + CDEBUG(D_INFO, + "default lmv size too small: %d < %d\n", + rc, (int)sizeof(*md->default_lmv)); + GOTO(out, rc = -EPROTO); + } + } + } + rc = 0; + +out_acl: + if (md->body->mbo_valid & 
OBD_MD_FLACL) { + /* for ACL, it's possible that FLACL is set but aclsize is zero. + * only when aclsize != 0 there's an actual segment for ACL + * in reply buffer. + */ + rc = mdc_unpack_acl(pill, md); + if (rc) + GOTO(out, rc); + } + + EXIT; +out: + if (rc) + lmd_clear_acl(md); + + return rc; +} + +static int mdc_free_lustre_md(struct obd_export *exp, struct lustre_md *md) +{ + ENTRY; + RETURN(0); +} + +void mdc_replay_open(struct ptlrpc_request *req) +{ + struct md_open_data *mod = req->rq_cb_data; + struct ptlrpc_request *close_req; + struct obd_client_handle *och; + struct lustre_handle old_open_handle = { }; + struct mdt_body *body; + struct ldlm_reply *rep; + ENTRY; + + if (mod == NULL) { + DEBUG_REQ(D_ERROR, req, + "cannot properly replay without open data"); + EXIT; + return; + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); + + rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + if (rep != NULL && rep->lock_policy_res2 != 0) + DEBUG_REQ(D_ERROR, req, "Open request replay failed with %ld ", + (long int)rep->lock_policy_res2); + + spin_lock(&req->rq_lock); + och = mod->mod_och; + if (och && och->och_open_handle.cookie) + req->rq_early_free_repbuf = 1; + else + req->rq_early_free_repbuf = 0; + spin_unlock(&req->rq_lock); + + if (req->rq_early_free_repbuf) { + struct lustre_handle *file_open_handle; + + LASSERT(och->och_magic == OBD_CLIENT_HANDLE_MAGIC); + + file_open_handle = &och->och_open_handle; + CDEBUG(D_HA, "updating handle from %#llx to %#llx\n", + file_open_handle->cookie, body->mbo_open_handle.cookie); + old_open_handle = *file_open_handle; + *file_open_handle = body->mbo_open_handle; + } + + close_req = mod->mod_close_req; + if (close_req) { + __u32 opc = lustre_msg_get_opc(close_req->rq_reqmsg); + struct mdt_ioepoch *epoch; + + LASSERT(opc == MDS_CLOSE); + epoch = req_capsule_client_get(&close_req->rq_pill, + &RMF_MDT_EPOCH); + LASSERT(epoch); + + if (req->rq_early_free_repbuf) + LASSERT(old_open_handle.cookie == + epoch->mio_open_handle.cookie); + + DEBUG_REQ(D_HA, close_req, "updating close body with new fh"); + epoch->mio_open_handle = body->mbo_open_handle; + } + EXIT; +} + +void mdc_commit_open(struct ptlrpc_request *req) +{ + struct md_open_data *mod = req->rq_cb_data; + if (mod == NULL) + return; + + /** + * No need to touch md_open_data::mod_och, it holds a reference on + * \var mod and will zero references to each other, \var mod will be + * freed after that when md_open_data::mod_och will put the reference. + */ + + /** + * Do not let open request to disappear as it still may be needed + * for close rpc to happen (it may happen on evict only, otherwise + * ptlrpc_request::rq_replay does not let mdc_commit_open() to be + * called), just mark this rpc as committed to distinguish these 2 + * cases, see mdc_close() for details. The open request reference will + * be put along with freeing \var mod. 
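+ * (mdc_close() relies on the rq_committed flag set here: it turns + * -ESTALE into success when the matching open request was already + * committed.)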
+ */ + ptlrpc_request_addref(req); + spin_lock(&req->rq_lock); + req->rq_committed = 1; + spin_unlock(&req->rq_lock); + req->rq_cb_data = NULL; + obd_mod_put(mod); +} + +int mdc_set_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och, + struct lookup_intent *it) +{ + struct md_open_data *mod; + struct mdt_rec_create *rec; + struct mdt_body *body; + struct ptlrpc_request *open_req = it->it_request; + struct obd_import *imp = open_req->rq_import; + ENTRY; + + if (!open_req->rq_replay) + RETURN(0); + + rec = req_capsule_client_get(&open_req->rq_pill, &RMF_REC_REINT); + body = req_capsule_server_get(&open_req->rq_pill, &RMF_MDT_BODY); + LASSERT(rec != NULL); + /* Incoming message in my byte order (it's been swabbed). */ + /* Outgoing messages always in my byte order. */ + LASSERT(body != NULL); + + /* Only if the import is replayable do we set the open replay data */ + if (och && imp->imp_replayable) { + mod = obd_mod_alloc(); + if (mod == NULL) { + DEBUG_REQ(D_ERROR, open_req, + "cannot allocate md_open_data"); + RETURN(0); + } + + /** + * Take a reference on \var mod, to be freed on mdc_close(). + * It protects \var mod from being freed on eviction (commit + * callback is called despite rq_replay flag). + * Another reference for \var och. + */ + obd_mod_get(mod); + obd_mod_get(mod); + + spin_lock(&open_req->rq_lock); + och->och_mod = mod; + mod->mod_och = och; + mod->mod_is_create = it_disposition(it, DISP_OPEN_CREATE) || + it_disposition(it, DISP_OPEN_STRIPE); + mod->mod_open_req = open_req; + open_req->rq_cb_data = mod; + open_req->rq_commit_cb = mdc_commit_open; + open_req->rq_early_free_repbuf = 1; + spin_unlock(&open_req->rq_lock); + } + + rec->cr_fid2 = body->mbo_fid1; + rec->cr_open_handle_old = body->mbo_open_handle; + open_req->rq_replay_cb = mdc_replay_open; + if (!fid_is_sane(&body->mbo_fid1)) { + DEBUG_REQ(D_ERROR, open_req, + "saving replay request with insane FID " DFID, + PFID(&body->mbo_fid1)); + LBUG(); + } + + DEBUG_REQ(D_RPCTRACE, open_req, "Set up open replay data"); + RETURN(0); +} + +static void mdc_free_open(struct md_open_data *mod) +{ + int committed = 0; + + if (mod->mod_is_create == 0 && + imp_connect_disp_stripe(mod->mod_open_req->rq_import)) + committed = 1; + + /** + * No reason to assert here if the open request has + * rq_replay == 1. It means that mdc_close failed, and the + * close request wasn't sent. It is not fatal to the client. + * The worst case is an eviction if the client holds an open lock + **/ + + DEBUG_REQ(D_RPCTRACE, mod->mod_open_req, + "free open request, rq_replay=%d", + mod->mod_open_req->rq_replay); + + ptlrpc_request_committed(mod->mod_open_req, committed); + if (mod->mod_close_req) + ptlrpc_request_committed(mod->mod_close_req, committed); +} + +static int mdc_clear_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och) +{ + struct md_open_data *mod = och->och_mod; + ENTRY; + + /** + * It is possible not to have \var mod in case of an eviction between + * lookup and ll_file_open().
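+ * In that case there is nothing to clear, so 0 is returned below.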
+ **/ + if (mod == NULL) + RETURN(0); + + LASSERT(mod != LP_POISON); + LASSERT(mod->mod_open_req != NULL); + + spin_lock(&mod->mod_open_req->rq_lock); + if (mod->mod_och) + mod->mod_och->och_open_handle.cookie = 0; + mod->mod_open_req->rq_early_free_repbuf = 0; + spin_unlock(&mod->mod_open_req->rq_lock); + mdc_free_open(mod); + + mod->mod_och = NULL; + och->och_mod = NULL; + obd_mod_put(mod); + + RETURN(0); +} + +static int mdc_close(struct obd_export *exp, struct md_op_data *op_data, + struct md_open_data *mod, struct ptlrpc_request **request) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req; + struct req_format *req_fmt; + size_t u32_count = 0; + int rc; + int saved_rc = 0; + ENTRY; + + CDEBUG(D_INODE, "%s: "DFID" file closed with intent: %x\n", + exp->exp_obd->obd_name, PFID(&op_data->op_fid1), + op_data->op_bias); + + if (op_data->op_bias & MDS_CLOSE_INTENT) { + req_fmt = &RQF_MDS_CLOSE_INTENT; + if (op_data->op_bias & MDS_HSM_RELEASE) { + /* allocate a FID for volatile file */ + rc = mdc_fid_alloc(NULL, exp, &op_data->op_fid2, + op_data); + if (rc < 0) { + CERROR("%s: "DFID" allocating FID: rc = %d\n", + obd->obd_name, PFID(&op_data->op_fid1), + rc); + /* save the errcode and proceed to close */ + saved_rc = rc; + } + } + if (op_data->op_bias & MDS_CLOSE_RESYNC_DONE) { + size_t count = op_data->op_data_size / sizeof(__u32); + + if (count > INLINE_RESYNC_ARRAY_SIZE) + u32_count = count; + } + } else { + req_fmt = &RQF_MDS_CLOSE; + } + + *request = NULL; + if (OBD_FAIL_CHECK(OBD_FAIL_MDC_CLOSE)) + req = NULL; + else + req = ptlrpc_request_alloc(class_exp2cliimp(exp), req_fmt); + + /* Ensure that this close's handle is fixed up during replay. */ + if (likely(mod != NULL)) { + LASSERTF(mod->mod_open_req != NULL && + mod->mod_open_req->rq_type != LI_POISON, + "POISONED open %p!\n", mod->mod_open_req); + + mod->mod_close_req = req; + + DEBUG_REQ(D_RPCTRACE, mod->mod_open_req, "matched open"); + /* We no longer want to preserve this open for replay even + * though the open was committed. 
b=3632, b=3633 */ + spin_lock(&mod->mod_open_req->rq_lock); + mod->mod_open_req->rq_replay = 0; + spin_unlock(&mod->mod_open_req->rq_lock); + } else { + CDEBUG(D_HA, "couldn't find open req; expecting close error\n"); + } + if (req == NULL) { + /** + * TODO: repeat close after errors + */ + CWARN("%s: close of FID "DFID" failed, file reference will be " + "dropped when this client unmounts or is evicted\n", + obd->obd_name, PFID(&op_data->op_fid1)); + GOTO(out, rc = -ENOMEM); + } + + if (u32_count > 0) + req_capsule_set_size(&req->rq_pill, &RMF_U32, RCL_CLIENT, + u32_count * sizeof(__u32)); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_CLOSE); + if (rc) { + ptlrpc_request_free(req); + req = NULL; + GOTO(out, rc); + } + + /* To avoid a livelock (bug 7034), we need to send CLOSE RPCs to a + * portal whose threads are not taking any DLM locks and are therefore + * always progressing */ + req->rq_request_portal = MDS_READPAGE_PORTAL; + ptlrpc_at_set_req_timeout(req); + + if (!obd->u.cli.cl_lsom_update || + !(exp_connect_flags2(exp) & OBD_CONNECT2_LSOM)) + op_data->op_xvalid &= ~(OP_XVALID_LAZYSIZE | + OP_XVALID_LAZYBLOCKS); + + mdc_close_pack(&req->rq_pill, op_data); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + obd->u.cli.cl_default_mds_easize); + + ptlrpc_request_set_replen(req); + + ptlrpc_get_mod_rpc_slot(req); + rc = ptlrpc_queue_wait(req); + ptlrpc_put_mod_rpc_slot(req); + + if (req->rq_repmsg == NULL) { + CDEBUG(D_RPCTRACE, "request %p failed to send: rc = %d\n", req, + req->rq_status); + if (rc == 0) + rc = req->rq_status ?: -EIO; + } else if (rc == 0 || rc == -EAGAIN) { + struct mdt_body *body; + + rc = lustre_msg_get_status(req->rq_repmsg); + if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) { + DEBUG_REQ(D_ERROR, req, + "type = PTL_RPC_MSG_ERR: rc = %d", rc); + if (rc > 0) + rc = -rc; + } + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + rc = -EPROTO; + } else if (rc == -ESTALE) { + /** + * it can be allowed error after 3633 if open was committed and + * server failed before close was sent. Let's check if mod + * exists and return no error in that case + */ + if (mod) { + DEBUG_REQ(D_HA, req, "Reset ESTALE = %d", rc); + LASSERT(mod->mod_open_req != NULL); + if (mod->mod_open_req->rq_committed) + rc = 0; + } + } + +out: + if (mod) { + if (rc != 0) + mod->mod_close_req = NULL; + /* Since now, mod is accessed through open_req only, + * thus close req does not keep a reference on mod anymore. */ + obd_mod_put(mod); + } + *request = req; + + RETURN(rc < 0 ? 
rc : saved_rc); +} + +static int mdc_getpage(struct obd_export *exp, const struct lu_fid *fid, + u64 offset, struct page **pages, int npages, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + struct ptlrpc_bulk_desc *desc; + int i; + int resends = 0; + int rc; + ENTRY; + + *request = NULL; + +restart_bulk: + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_READPAGE); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_READPAGE); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + req->rq_request_portal = MDS_READPAGE_PORTAL; + ptlrpc_at_set_req_timeout(req); + + desc = ptlrpc_prep_bulk_imp(req, npages, 1, + PTLRPC_BULK_PUT_SINK, + MDS_BULK_PORTAL, + &ptlrpc_bulk_kiov_pin_ops); + if (desc == NULL) { + ptlrpc_req_finished(req); + RETURN(-ENOMEM); + } + + /* NB req now owns desc and will free it when it gets freed */ + for (i = 0; i < npages; i++) + desc->bd_frag_ops->add_kiov_frag(desc, pages[i], 0, + PAGE_SIZE); + + mdc_readdir_pack(&req->rq_pill, offset, PAGE_SIZE * npages, fid); + + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + if (rc) { + ptlrpc_req_finished(req); + if (rc != -ETIMEDOUT) + RETURN(rc); + + resends++; + if (!client_should_resend(resends, &exp->exp_obd->u.cli)) { + CERROR("%s: too many resend retries: rc = %d\n", + exp->exp_obd->obd_name, -EIO); + RETURN(-EIO); + } + + /* If a signal interrupts then the timeout returned will + * not be zero. In that case return -EINTR + */ + if (msleep_interruptible(resends * 1000)) + RETURN(-EINTR); + + goto restart_bulk; + } + + rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, + req->rq_bulk->bd_nob_transferred); + if (rc < 0) { + ptlrpc_req_finished(req); + RETURN(rc); + } + + if (req->rq_bulk->bd_nob_transferred & ~LU_PAGE_MASK) { + CERROR("%s: unexpected bytes transferred: %d (%ld expected)\n", + exp->exp_obd->obd_name, req->rq_bulk->bd_nob_transferred, + PAGE_SIZE * npages); + ptlrpc_req_finished(req); + RETURN(-EPROTO); + } + + *request = req; + RETURN(0); +} + +static void mdc_release_page(struct page *page, int remove) +{ + if (remove) { + lock_page(page); + if (likely(page->mapping != NULL)) + cfs_delete_from_page_cache(page); + unlock_page(page); + } + put_page(page); +} + +static struct page *mdc_page_locate(struct address_space *mapping, __u64 *hash, + __u64 *start, __u64 *end, int hash64) +{ + /* + * Complement of hash is used as an index so that + * radix_tree_gang_lookup() can be used to find a page with starting + * hash _smaller_ than one we are looking for. + */ + unsigned long offset = hash_x_index(*hash, hash64); + struct page *page; + unsigned long flags; + int found; + + ll_xa_lock_irqsave(&mapping->i_pages, flags); + found = radix_tree_gang_lookup(&mapping->page_tree, + (void **)&page, offset, 1); + if (found > 0 && !ll_xa_is_value(page)) { + struct lu_dirpage *dp; + + get_page(page); + ll_xa_unlock_irqrestore(&mapping->i_pages, flags); + /* + * In contrast to find_lock_page() we are sure that directory + * page cannot be truncated (while DLM lock is held) and, + * hence, can avoid restart. + * + * In fact, page cannot be locked here at all, because + * mdc_read_page_remote does synchronous io. 
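+ * The wait_on_page_locked() below is therefore expected to
+ * return almost immediately.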
+ */ + wait_on_page_locked(page); + if (PageUptodate(page)) { + dp = kmap(page); + if (BITS_PER_LONG == 32 && hash64) { + *start = le64_to_cpu(dp->ldp_hash_start) >> 32; + *end = le64_to_cpu(dp->ldp_hash_end) >> 32; + *hash = *hash >> 32; + } else { + *start = le64_to_cpu(dp->ldp_hash_start); + *end = le64_to_cpu(dp->ldp_hash_end); + } + if (unlikely(*start == 1 && *hash == 0)) + *hash = *start; + else + LASSERTF(*start <= *hash, "start = %#llx" + ",end = %#llx,hash = %#llx\n", + *start, *end, *hash); + CDEBUG(D_VFSTRACE, "offset %lx [%#llx %#llx]," + " hash %#llx\n", offset, *start, *end, *hash); + if (*hash > *end) { + kunmap(page); + mdc_release_page(page, 0); + page = NULL; + } else if (*end != *start && *hash == *end) { + /* + * upon hash collision, remove this page, + * otherwise put page reference, and + * mdc_read_page_remote() will issue RPC to + * fetch the page we want. + */ + kunmap(page); + mdc_release_page(page, + le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE); + page = NULL; + } + } else { + put_page(page); + page = ERR_PTR(-EIO); + } + } else { + ll_xa_unlock_irqrestore(&mapping->i_pages, flags); + page = NULL; + } + return page; +} + +/* + * Adjust a set of pages, each page containing an array of lu_dirpages, + * so that each page can be used as a single logical lu_dirpage. + * + * A lu_dirpage is laid out as follows, where s = ldp_hash_start, + * e = ldp_hash_end, f = ldp_flags, p = padding, and each "ent" is a + * struct lu_dirent. It has size up to LU_PAGE_SIZE. The ldp_hash_end + * value is used as a cookie to request the next lu_dirpage in a + * directory listing that spans multiple pages (two in this example): + * ________ + * | | + * .|--------v------- -----. + * |s|e|f|p|ent|ent| ... |ent| + * '--|-------------- -----' Each PAGE contains a single + * '------. lu_dirpage. + * .---------v------- -----. + * |s|e|f|p|ent| 0 | ... | 0 | + * '----------------- -----' + * + * However, on hosts where the native VM page size (PAGE_SIZE) is + * larger than LU_PAGE_SIZE, a single host page may contain multiple + * lu_dirpages. After reading the lu_dirpages from the MDS, the + * ldp_hash_end of the first lu_dirpage refers to the one immediately + * after it in the same PAGE (arrows simplified for brevity, but + * in general e0==s1, e1==s2, etc.): + * + * .-------------------- -----. + * |s0|e0|f0|p|ent|ent| ... |ent| + * |---v---------------- -----| + * |s1|e1|f1|p|ent|ent| ... |ent| + * |---v---------------- -----| Here, each PAGE contains + * ... multiple lu_dirpages. + * |---v---------------- -----| + * |s'|e'|f'|p|ent|ent| ... |ent| + * '---|---------------- -----' + * v + * .----------------------------. + * | next PAGE | + * + * This structure is transformed into a single logical lu_dirpage as follows: + * + * - Replace e0 with e' so the request for the next lu_dirpage gets the page + * labeled 'next PAGE'. + * + * - Copy the LDF_COLLIDE flag from f' to f0 to correctly reflect whether + * a hash collision with the next page exists. + * + * - Adjust the lde_reclen of the ending entry of each lu_dirpage to span + * to the first entry of the next lu_dirpage. 
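+ *
+ * After these steps the PAGE can be treated as a single logical
+ * lu_dirpage, which is what the readdir code consumes.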
+ */
+#if PAGE_SIZE > LU_PAGE_SIZE
+static void mdc_adjust_dirpages(struct page **pages, int cfs_pgs, int lu_pgs)
+{
+ int i;
+
+ for (i = 0; i < cfs_pgs; i++) {
+ struct lu_dirpage *dp = kmap(pages[i]);
+ struct lu_dirpage *first = dp;
+ struct lu_dirent *end_dirent = NULL;
+ struct lu_dirent *ent;
+ __u64 hash_end = dp->ldp_hash_end;
+ __u32 flags = dp->ldp_flags;
+
+ while (--lu_pgs > 0) {
+ ent = lu_dirent_start(dp);
+ for (end_dirent = ent; ent != NULL;
+ end_dirent = ent, ent = lu_dirent_next(ent));
+
+ /* Advance dp to next lu_dirpage. */
+ dp = (struct lu_dirpage *)((char *)dp + LU_PAGE_SIZE);
+
+ /* Check if we've reached the end of the PAGE. */
+ if (!((unsigned long)dp & ~PAGE_MASK))
+ break;
+
+ /* Save the hash and flags of this lu_dirpage. */
+ hash_end = dp->ldp_hash_end;
+ flags = dp->ldp_flags;
+
+ /* Check if lu_dirpage contains no entries. */
+ if (end_dirent == NULL)
+ break;
+
+ /* Extend the end entry's lde_reclen from 0 so that it
+ * spans up to the first entry of the next lu_dirpage. */
+ LASSERT(le16_to_cpu(end_dirent->lde_reclen) == 0);
+ end_dirent->lde_reclen =
+ cpu_to_le16((char *)(dp->ldp_entries) -
+ (char *)end_dirent);
+ }
+
+ first->ldp_hash_end = hash_end;
+ first->ldp_flags &= ~cpu_to_le32(LDF_COLLIDE);
+ first->ldp_flags |= flags & cpu_to_le32(LDF_COLLIDE);
+
+ kunmap(pages[i]);
+ }
+ LASSERTF(lu_pgs == 0, "left = %d\n", lu_pgs);
+}
+#else
+#define mdc_adjust_dirpages(pages, cfs_pgs, lu_pgs) do {} while (0)
+#endif /* PAGE_SIZE > LU_PAGE_SIZE */
+
+/* parameters for readdir page */
+struct readpage_param {
+ struct md_op_data *rp_mod;
+ __u64 rp_off;
+ int rp_hash64;
+ struct obd_export *rp_exp;
+};
+
+/**
+ * Read pages from server.
+ *
+ * Pages in the MDS_READPAGE reply are packed in LU_PAGE_SIZE units, and each
+ * one carries a lu_dirpage header describing its start/end hash and whether
+ * the page is empty (contains no dir entry) or hash-collides with the next
+ * page. After the client receives the reply, several such pages are
+ * integrated into one PAGE_SIZE dir page (if PAGE_SIZE is greater than
+ * LU_PAGE_SIZE), and the lu_dirpage header of the integrated page is
+ * adjusted accordingly.
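+ * See mdc_adjust_dirpages() above for that adjustment.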
+ **/
+static int ll_mdc_read_page_remote(void *data, struct page *page0)
+{
+ struct readpage_param *rp = data;
+ struct page **page_pool;
+ struct page *page;
+ struct lu_dirpage *dp;
+ struct md_op_data *op_data = rp->rp_mod;
+ struct ptlrpc_request *req;
+ int max_pages;
+ struct inode *inode;
+ struct lu_fid *fid;
+ int rd_pgs = 0; /* number of pages actually read */
+ int npages;
+ int i;
+ int rc;
+ ENTRY;
+
+ max_pages = rp->rp_exp->exp_obd->u.cli.cl_max_pages_per_rpc;
+ inode = op_data->op_data;
+ fid = &op_data->op_fid1;
+ LASSERT(inode != NULL);
+
+ OBD_ALLOC_PTR_ARRAY_LARGE(page_pool, max_pages);
+ if (page_pool != NULL) {
+ page_pool[0] = page0;
+ } else {
+ page_pool = &page0;
+ max_pages = 1;
+ }
+
+ for (npages = 1; npages < max_pages; npages++) {
+ page = page_cache_alloc(inode->i_mapping);
+ if (page == NULL)
+ break;
+ page_pool[npages] = page;
+ }
+
+ rc = mdc_getpage(rp->rp_exp, fid, rp->rp_off, page_pool, npages, &req);
+ if (rc < 0) {
+ /* page0 is special: it was added to the page cache early */
+ cfs_delete_from_page_cache(page0);
+ } else {
+ int lu_pgs;
+
+ rd_pgs = (req->rq_bulk->bd_nob_transferred + PAGE_SIZE - 1) >>
+ PAGE_SHIFT;
+ lu_pgs = req->rq_bulk->bd_nob_transferred >> LU_PAGE_SHIFT;
+ LASSERT(!(req->rq_bulk->bd_nob_transferred & ~LU_PAGE_MASK));
+
+ CDEBUG(D_INODE, "read %d(%d) pages\n", rd_pgs, lu_pgs);
+
+ mdc_adjust_dirpages(page_pool, rd_pgs, lu_pgs);
+
+ SetPageUptodate(page0);
+ }
+ unlock_page(page0);
+
+ ptlrpc_req_finished(req);
+ CDEBUG(D_CACHE, "read %d/%d pages\n", rd_pgs, npages);
+ for (i = 1; i < npages; i++) {
+ unsigned long offset;
+ __u64 hash;
+ int ret;
+
+ page = page_pool[i];
+
+ if (rc < 0 || i >= rd_pgs) {
+ put_page(page);
+ continue;
+ }
+
+ SetPageUptodate(page);
+
+ dp = kmap(page);
+ hash = le64_to_cpu(dp->ldp_hash_start);
+ kunmap(page);
+
+ offset = hash_x_index(hash, rp->rp_hash64);
+
+ prefetchw(&page->flags);
+ ret = add_to_page_cache_lru(page, inode->i_mapping, offset,
+ GFP_KERNEL);
+ if (ret == 0)
+ unlock_page(page);
+ else
+ CDEBUG(D_VFSTRACE, "page %lu add to page cache failed:"
+ " rc = %d\n", offset, ret);
+ put_page(page);
+ }
+
+ if (page_pool != &page0)
+ OBD_FREE_PTR_ARRAY_LARGE(page_pool, max_pages);
+
+ RETURN(rc);
+}
+
+#ifdef HAVE_READ_CACHE_PAGE_WANTS_FILE
+static inline int mdc_read_folio_remote(struct file *file, struct folio *folio)
+{
+ return ll_mdc_read_page_remote(file->private_data,
+ folio_page(folio, 0));
+}
+#else
+#define mdc_read_folio_remote ll_mdc_read_page_remote
+#endif
+
+/**
+ * Read a dir page from the cache first; if it is not found there, read it
+ * from the server and add it to the cache.
+ *
+ * \param[in] exp MDC export
+ * \param[in] op_data client MD stack parameters, transferring parameters
+ * between different layers on client MD stack.
+ * \param[in] mrinfo callback required for ldlm lock enqueue during
+ * read page
+ * \param[in] hash_offset the hash offset of the page to be read
+ * \param[in] ppage the page to be read
+ *
+ * \retval 0 the page was read successfully
+ * \retval negative errno on failure
+ */
+static int mdc_read_page(struct obd_export *exp, struct md_op_data *op_data,
+ struct md_readdir_info *mrinfo, __u64 hash_offset,
+ struct page **ppage)
+{
+ struct lookup_intent it = { .it_op = IT_READDIR };
+ struct page *page;
+ struct inode *dir = op_data->op_data;
+ struct address_space *mapping;
+ struct lu_dirpage *dp;
+ __u64 start = 0;
+ __u64 end = 0;
+ struct lustre_handle lockh;
+ struct ptlrpc_request *enq_req = NULL;
+ struct readpage_param rp_param;
+ int rc;
+
+ ENTRY;
+
+ *ppage = NULL;
+
+ LASSERT(dir != NULL);
+ mapping = dir->i_mapping;
+
+ rc = mdc_intent_lock(exp, op_data, &it, &enq_req,
+ mrinfo->mr_blocking_ast, 0);
+ if (enq_req != NULL)
+ ptlrpc_req_finished(enq_req);
+
+ if (rc < 0) {
+ CERROR("%s: "DFID" lock enqueue fails: rc = %d\n",
+ exp->exp_obd->obd_name, PFID(&op_data->op_fid1), rc);
+ RETURN(rc);
+ }
+
+ rc = 0;
+ lockh.cookie = it.it_lock_handle;
+ mdc_set_lock_data(exp, &lockh, dir, NULL);
+
+ rp_param.rp_off = hash_offset;
+ rp_param.rp_hash64 = op_data->op_cli_flags & CLI_HASH64;
+ page = mdc_page_locate(mapping, &rp_param.rp_off, &start, &end,
+ rp_param.rp_hash64);
+ if (IS_ERR(page)) {
+ CERROR("%s: dir page locate: "DFID" at %llu: rc %ld\n",
+ exp->exp_obd->obd_name, PFID(&op_data->op_fid1),
+ rp_param.rp_off, PTR_ERR(page));
+ GOTO(out_unlock, rc = PTR_ERR(page));
+ } else if (page != NULL) {
+ /*
+ * XXX nikita: not entirely correct handling of a corner case:
+ * suppose hash chain of entries with hash value HASH crosses
+ * border between pages P0 and P1. First both P0 and P1 are
+ * cached, seekdir() is called for some entry from the P0 part
+ * of the chain. Later P0 goes out of cache. telldir(HASH)
+ * happens and finds P1, as it starts with matching hash
+ * value. Remaining entries from P0 part of the chain are
+ * skipped. (Is that really a bug?)
+ *
+ * Possible solutions: 0. don't cache P1 in such a case, handle
+ * it as an "overflow" page. 1. invalidate all pages at
+ * once. 2. use HASH|1 as an index for P1.
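+ *
+ * None of these is implemented yet; the cached page is simply
+ * handed to the hash_collision path below.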
+ */
+ GOTO(hash_collision, page);
+ }
+
+ rp_param.rp_exp = exp;
+ rp_param.rp_mod = op_data;
+ page = ll_read_cache_page(mapping,
+ hash_x_index(rp_param.rp_off,
+ rp_param.rp_hash64),
+ mdc_read_folio_remote, &rp_param);
+ if (IS_ERR(page)) {
+ CDEBUG(D_INFO, "%s: read cache page: "DFID" at %llu: %ld\n",
+ exp->exp_obd->obd_name, PFID(&op_data->op_fid1),
+ rp_param.rp_off, PTR_ERR(page));
+ GOTO(out_unlock, rc = PTR_ERR(page));
+ }
+
+ wait_on_page_locked(page);
+ (void)kmap(page);
+ if (!PageUptodate(page)) {
+ CERROR("%s: page not updated: "DFID" at %llu: rc %d\n",
+ exp->exp_obd->obd_name, PFID(&op_data->op_fid1),
+ rp_param.rp_off, -EIO);
+ goto fail;
+ }
+ if (!PageChecked(page))
+ SetPageChecked(page);
+ if (PageError(page)) {
+ CERROR("%s: page error: "DFID" at %llu: rc %d\n",
+ exp->exp_obd->obd_name, PFID(&op_data->op_fid1),
+ rp_param.rp_off, -EIO);
+ goto fail;
+ }
+
+hash_collision:
+ dp = page_address(page);
+ if (BITS_PER_LONG == 32 && rp_param.rp_hash64) {
+ start = le64_to_cpu(dp->ldp_hash_start) >> 32;
+ end = le64_to_cpu(dp->ldp_hash_end) >> 32;
+ rp_param.rp_off = hash_offset >> 32;
+ } else {
+ start = le64_to_cpu(dp->ldp_hash_start);
+ end = le64_to_cpu(dp->ldp_hash_end);
+ rp_param.rp_off = hash_offset;
+ }
+ if (end == start) {
+ LASSERT(start == rp_param.rp_off);
+ CWARN("Page-wide hash collision: %#lx\n", (unsigned long)end);
+#if BITS_PER_LONG == 32
+ CWARN("Real page-wide hash collision at [%llu %llu] with "
+ "hash %llu\n", le64_to_cpu(dp->ldp_hash_start),
+ le64_to_cpu(dp->ldp_hash_end), hash_offset);
+#endif
+
+ /*
+ * Fetch whole overflow chain...
+ *
+ * XXX not yet.
+ */
+ goto fail;
+ }
+ *ppage = page;
+out_unlock:
+ ldlm_lock_decref(&lockh, it.it_lock_mode);
+ return rc;
+fail:
+ kunmap(page);
+ mdc_release_page(page, 1);
+ rc = -EIO;
+ goto out_unlock;
+}
+
+static int mdc_statfs_interpret(const struct lu_env *env,
+ struct ptlrpc_request *req, void *args, int rc)
+{
+ struct obd_info *oinfo = args;
+ struct obd_statfs *osfs;
+
+ if (!rc) {
+ osfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS);
+ if (!osfs)
+ return -EPROTO;
+
+ oinfo->oi_osfs = osfs;
+
+ CDEBUG(D_CACHE, "blocks=%llu free=%llu avail=%llu "
+ "objects=%llu free=%llu state=%x\n",
+ osfs->os_blocks, osfs->os_bfree, osfs->os_bavail,
+ osfs->os_files, osfs->os_ffree, osfs->os_state);
+ }
+
+ oinfo->oi_cb_up(oinfo, rc);
+
+ return rc;
+}
+
+static int mdc_statfs_async(struct obd_export *exp,
+ struct obd_info *oinfo, time64_t max_age,
+ struct ptlrpc_request_set *unused)
+{
+ struct ptlrpc_request *req;
+ struct obd_info *aa;
+
+ req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), &RQF_MDS_STATFS,
+ LUSTRE_MDS_VERSION, MDS_STATFS);
+ if (req == NULL)
+ return -ENOMEM;
+
+ ptlrpc_request_set_replen(req);
+ req->rq_interpret_reply = mdc_statfs_interpret;
+
+ aa = ptlrpc_req_async_args(aa, req);
+ *aa = *oinfo;
+
+ ptlrpcd_add_req(req);
+
+ return 0;
+}
+
+static int mdc_statfs(const struct lu_env *env,
+ struct obd_export *exp, struct obd_statfs *osfs,
+ time64_t max_age, __u32 flags)
+{
+ struct obd_device *obd = class_exp2obd(exp);
+ struct req_format *fmt;
+ struct ptlrpc_request *req;
+ struct obd_statfs *msfs;
+ struct obd_import *imp, *imp0;
+ int rc;
+ ENTRY;
+
+ /*
+ * Since the request might also come from lprocfs, we need to sync
+ * this with client_disconnect_export() (bug 15684).
+ */
+ with_imp_locked(obd, imp0, rc)
+ imp = class_import_get(imp0);
+ if (rc)
+ RETURN(rc);
+
+ fmt = &RQF_MDS_STATFS;
+ if ((exp_connect_flags2(exp) & OBD_CONNECT2_SUM_STATFS) &&
+ (flags &
OBD_STATFS_SUM))
+ fmt = &RQF_MDS_STATFS_NEW;
+ req = ptlrpc_request_alloc_pack(imp, fmt, LUSTRE_MDS_VERSION,
+ MDS_STATFS);
+ if (req == NULL)
+ GOTO(output, rc = -ENOMEM);
+ req->rq_allow_intr = 1;
+
+ if ((flags & OBD_STATFS_SUM) &&
+ (exp_connect_flags2(exp) & OBD_CONNECT2_SUM_STATFS)) {
+ /* request aggregated states */
+ struct mdt_body *body;
+
+ body = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY);
+ if (body == NULL)
+ GOTO(out, rc = -EPROTO);
+ body->mbo_valid = OBD_MD_FLAGSTATFS;
+ }
+
+ ptlrpc_request_set_replen(req);
+
+ if (flags & OBD_STATFS_NODELAY) {
+ /* procfs requests should not wait for resends,
+ * to avoid a deadlock */
+ req->rq_no_resend = 1;
+ req->rq_no_delay = 1;
+ }
+
+ rc = ptlrpc_queue_wait(req);
+ if (rc) {
+ /* check connection error first */
+ if (imp->imp_connect_error)
+ rc = imp->imp_connect_error;
+ GOTO(out, rc);
+ }
+
+ msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS);
+ if (msfs == NULL)
+ GOTO(out, rc = -EPROTO);
+
+ *osfs = *msfs;
+ EXIT;
+out:
+ ptlrpc_req_finished(req);
+output:
+ class_import_put(imp);
+ return rc;
+}
+
+static int mdc_ioc_fid2path(struct obd_export *exp, struct getinfo_fid2path *gf)
+{
+ __u32 keylen, vallen;
+ void *key;
+ int rc;
+
+ if (gf->gf_pathlen > PATH_MAX)
+ RETURN(-ENAMETOOLONG);
+ if (gf->gf_pathlen < 2)
+ RETURN(-EOVERFLOW);
+
+ /* Key is KEY_FID2PATH + getinfo_fid2path description */
+ keylen = cfs_size_round(sizeof(KEY_FID2PATH) + sizeof(*gf) +
+ sizeof(struct lu_fid));
+ OBD_ALLOC(key, keylen);
+ if (key == NULL)
+ RETURN(-ENOMEM);
+ memcpy(key, KEY_FID2PATH, sizeof(KEY_FID2PATH));
+ memcpy(key + cfs_size_round(sizeof(KEY_FID2PATH)), gf, sizeof(*gf));
+ memcpy(key + cfs_size_round(sizeof(KEY_FID2PATH)) + sizeof(*gf),
+ gf->gf_u.gf_root_fid, sizeof(struct lu_fid));
+ CDEBUG(D_IOCTL, "path get "DFID" from %llu #%d\n",
+ PFID(&gf->gf_fid), gf->gf_recno, gf->gf_linkno);
+
+ if (!fid_is_sane(&gf->gf_fid))
+ GOTO(out, rc = -EINVAL);
+
+ /* Val is struct getinfo_fid2path result plus path */
+ vallen = sizeof(*gf) + gf->gf_pathlen;
+
+ rc = obd_get_info(NULL, exp, keylen, key, &vallen, gf);
+ if (rc != 0 && rc != -EREMOTE)
+ GOTO(out, rc);
+
+ if (vallen <= sizeof(*gf))
+ GOTO(out, rc = -EPROTO);
+ if (vallen > sizeof(*gf) + gf->gf_pathlen)
+ GOTO(out, rc = -EOVERFLOW);
+
+ CDEBUG(D_IOCTL, "path got "DFID" from %llu #%d: %s\n",
+ PFID(&gf->gf_fid), gf->gf_recno, gf->gf_linkno,
+ gf->gf_pathlen < 512 ?
gf->gf_u.gf_path : + /* only log the last 512 characters of the path */ + gf->gf_u.gf_path + gf->gf_pathlen - 512); + +out: + OBD_FREE(key, keylen); + return rc; +} + +static int mdc_ioc_hsm_progress(struct obd_export *exp, + struct hsm_progress_kernel *hpk) +{ + struct obd_import *imp = class_exp2cliimp(exp); + struct hsm_progress_kernel *req_hpk; + struct ptlrpc_request *req; + int rc; + ENTRY; + + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_PROGRESS, + LUSTRE_MDS_VERSION, MDS_HSM_PROGRESS); + if (req == NULL) + GOTO(out, rc = -ENOMEM); + + mdc_pack_body(&req->rq_pill, NULL, 0, 0, -1, 0); + + /* Copy hsm_progress struct */ + req_hpk = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_PROGRESS); + if (req_hpk == NULL) + GOTO(out, rc = -EPROTO); + + *req_hpk = *hpk; + req_hpk->hpk_errval = lustre_errno_hton(hpk->hpk_errval); + + ptlrpc_request_set_replen(req); + + ptlrpc_get_mod_rpc_slot(req); + rc = ptlrpc_queue_wait(req); + ptlrpc_put_mod_rpc_slot(req); + + GOTO(out, rc); +out: + ptlrpc_req_finished(req); + return rc; +} +/** + * Send hsm_ct_register to MDS + * + * \param[in] imp import + * \param[in] archive_count if in bitmap format, it is the bitmap, + * else it is the count of archive_ids + * \param[in] archives if in bitmap format, it is NULL, + * else it is archive_id lists + */ +static int mdc_ioc_hsm_ct_register(struct obd_import *imp, __u32 archive_count, + __u32 *archives) +{ + struct ptlrpc_request *req; + __u32 *archive_array; + size_t archives_size; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(imp, &RQF_MDS_HSM_CT_REGISTER); + if (req == NULL) + RETURN(-ENOMEM); + + if (archives != NULL) + archives_size = sizeof(*archive_array) * archive_count; + else + archives_size = sizeof(archive_count); + + req_capsule_set_size(&req->rq_pill, &RMF_MDS_HSM_ARCHIVE, + RCL_CLIENT, archives_size); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_CT_REGISTER); + if (rc) { + ptlrpc_request_free(req); + RETURN(-ENOMEM); + } + + mdc_pack_body(&req->rq_pill, NULL, 0, 0, -1, 0); + + archive_array = req_capsule_client_get(&req->rq_pill, + &RMF_MDS_HSM_ARCHIVE); + if (archive_array == NULL) + GOTO(out, rc = -EPROTO); + + if (archives != NULL) + memcpy(archive_array, archives, archives_size); + else + *archive_array = archive_count; + + ptlrpc_request_set_replen(req); + req->rq_no_resend = 1; + + rc = mdc_queue_wait(req); + GOTO(out, rc); +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_current_action(struct obd_export *exp, + struct md_op_data *op_data) +{ + struct hsm_current_action *hca = op_data->op_data; + struct hsm_current_action *req_hca; + struct ptlrpc_request *req; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_HSM_ACTION); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_ACTION); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_pack_body(&req->rq_pill, &op_data->op_fid1, 0, 0, + op_data->op_suppgids[0], 0); + + ptlrpc_request_set_replen(req); + + rc = mdc_queue_wait(req); + if (rc) + GOTO(out, rc); + + req_hca = req_capsule_server_get(&req->rq_pill, + &RMF_MDS_HSM_CURRENT_ACTION); + if (req_hca == NULL) + GOTO(out, rc = -EPROTO); + + *hca = *req_hca; + + EXIT; +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_ct_unregister(struct obd_import *imp) +{ + struct ptlrpc_request *req; + int rc; + ENTRY; + + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_CT_UNREGISTER, + LUSTRE_MDS_VERSION, + 
MDS_HSM_CT_UNREGISTER); + if (req == NULL) + GOTO(out, rc = -ENOMEM); + + mdc_pack_body(&req->rq_pill, NULL, 0, 0, -1, 0); + + ptlrpc_request_set_replen(req); + + rc = mdc_queue_wait(req); + GOTO(out, rc); +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_state_get(struct obd_export *exp, + struct md_op_data *op_data) +{ + struct hsm_user_state *hus = op_data->op_data; + struct hsm_user_state *req_hus; + struct ptlrpc_request *req; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_HSM_STATE_GET); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_STATE_GET); + if (rc != 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_pack_body(&req->rq_pill, &op_data->op_fid1, 0, 0, + op_data->op_suppgids[0], 0); + + ptlrpc_request_set_replen(req); + + rc = mdc_queue_wait(req); + if (rc) + GOTO(out, rc); + + req_hus = req_capsule_server_get(&req->rq_pill, &RMF_HSM_USER_STATE); + if (req_hus == NULL) + GOTO(out, rc = -EPROTO); + + *hus = *req_hus; + + EXIT; +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_state_set(struct obd_export *exp, + struct md_op_data *op_data) +{ + struct hsm_state_set *hss = op_data->op_data; + struct hsm_state_set *req_hss; + struct ptlrpc_request *req; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_HSM_STATE_SET); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_STATE_SET); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_pack_body(&req->rq_pill, &op_data->op_fid1, 0, 0, + op_data->op_suppgids[0], 0); + + /* Copy states */ + req_hss = req_capsule_client_get(&req->rq_pill, &RMF_HSM_STATE_SET); + if (req_hss == NULL) + GOTO(out, rc = -EPROTO); + *req_hss = *hss; + + ptlrpc_request_set_replen(req); + + ptlrpc_get_mod_rpc_slot(req); + rc = ptlrpc_queue_wait(req); + ptlrpc_put_mod_rpc_slot(req); + + GOTO(out, rc); +out: + ptlrpc_req_finished(req); + return rc; +} + +/* For RESTORE and RELEASE the mdt will take EX lock on the file layout. + * So we can use early cancel on client side locks for that resource. 
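+ * Cancelling them up front also saves the MDT from having to send
+ * a blocking AST for each of those locks.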
+ */ +static inline int mdc_hsm_request_lock_to_cancel(struct obd_export *exp, + struct hsm_user_request *hur, + struct list_head *cancels) +{ + struct hsm_user_item *hui = &hur->hur_user_item[0]; + struct hsm_request *req_hr = &hur->hur_request; + int count = 0; + int i; + + if (req_hr->hr_action != HUA_RESTORE && + req_hr->hr_action != HUA_RELEASE) + return 0; + + for (i = 0; i < req_hr->hr_itemcount; i++, hui++) { + if (!fid_is_sane(&hui->hui_fid)) + continue; + count += mdc_resource_get_unused(exp, &hui->hui_fid, cancels, + LCK_EX, MDS_INODELOCK_LAYOUT); + } + + return count; +} + +static int mdc_ioc_hsm_request(struct obd_export *exp, + struct hsm_user_request *hur) +{ + struct obd_import *imp = class_exp2cliimp(exp); + struct ptlrpc_request *req; + struct hsm_request *req_hr; + struct hsm_user_item *req_hui; + char *req_opaque; + LIST_HEAD(cancels); + int count; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(imp, &RQF_MDS_HSM_REQUEST); + if (req == NULL) + RETURN(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_MDS_HSM_USER_ITEM, RCL_CLIENT, + hur->hur_request.hr_itemcount + * sizeof(struct hsm_user_item)); + req_capsule_set_size(&req->rq_pill, &RMF_GENERIC_DATA, RCL_CLIENT, + hur->hur_request.hr_data_len); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_REQUEST); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + /* Cancel existing locks */ + count = mdc_hsm_request_lock_to_cancel(exp, hur, &cancels); + ldlm_cli_cancel_list(&cancels, count, NULL, 0); + mdc_pack_body(&req->rq_pill, NULL, 0, 0, -1, 0); + + /* Copy hsm_request struct */ + req_hr = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_REQUEST); + if (req_hr == NULL) + GOTO(out, rc = -EPROTO); + *req_hr = hur->hur_request; + + /* Copy hsm_user_item structs */ + req_hui = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_USER_ITEM); + if (req_hui == NULL) + GOTO(out, rc = -EPROTO); + memcpy(req_hui, hur->hur_user_item, + hur->hur_request.hr_itemcount * sizeof(struct hsm_user_item)); + + /* Copy opaque field */ + req_opaque = req_capsule_client_get(&req->rq_pill, &RMF_GENERIC_DATA); + if (req_opaque == NULL) + GOTO(out, rc = -EPROTO); + memcpy(req_opaque, hur_data(hur), hur->hur_request.hr_data_len); + + ptlrpc_request_set_replen(req); + + ptlrpc_get_mod_rpc_slot(req); + rc = ptlrpc_queue_wait(req); + ptlrpc_put_mod_rpc_slot(req); + + GOTO(out, rc); + +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_ct_start(struct obd_export *exp, + struct lustre_kernelcomm *lk); + +static int mdc_quotactl(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct ptlrpc_request *req; + struct obd_quotactl *oqc; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_QUOTACTL); + if (req == NULL) + RETURN(-ENOMEM); + + + if (LUSTRE_Q_CMD_IS_POOL(oqctl->qc_cmd)) + req_capsule_set_size(&req->rq_pill, + &RMF_OBD_QUOTACTL, + RCL_CLIENT, + sizeof(*oqc) + LOV_MAXPOOLNAME + 1); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, + MDS_QUOTACTL); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + oqc = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL); + QCTL_COPY(oqc, oqctl); + + ptlrpc_request_set_replen(req); + ptlrpc_at_set_req_timeout(req); + + rc = ptlrpc_queue_wait(req); + if (rc) { + CERROR("%s: ptlrpc_queue_wait failed: rc = %d\n", + exp->exp_obd->obd_name, rc); + GOTO(out, rc); + } + + if (req->rq_repmsg && + (oqc = req_capsule_server_get(&req->rq_pill, &RMF_OBD_QUOTACTL))) { + QCTL_COPY(oqctl, oqc); 
+ } else if (!rc) { + rc = -EPROTO; + CERROR("%s: cannot unpack obd_quotactl: rc = %d\n", + exp->exp_obd->obd_name, rc); + } +out: + ptlrpc_req_finished(req); + + RETURN(rc); +} + +static int mdc_ioc_swap_layouts(struct obd_export *exp, + struct md_op_data *op_data) +{ + LIST_HEAD(cancels); + struct ptlrpc_request *req; + int rc, count; + struct mdc_swap_layouts *msl, *payload; + ENTRY; + + msl = op_data->op_data; + + /* When the MDT will get the MDS_SWAP_LAYOUTS RPC the + * first thing it will do is to cancel the 2 layout + * locks held by this client. + * So the client must cancel its layout locks on the 2 fids + * with the request RPC to avoid extra RPC round trips. + */ + count = mdc_resource_get_unused(exp, &op_data->op_fid1, &cancels, + LCK_EX, MDS_INODELOCK_LAYOUT | + MDS_INODELOCK_XATTR); + count += mdc_resource_get_unused(exp, &op_data->op_fid2, &cancels, + LCK_EX, MDS_INODELOCK_LAYOUT | + MDS_INODELOCK_XATTR); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_SWAP_LAYOUTS); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + + rc = mdc_prep_elc_req(exp, req, MDS_SWAP_LAYOUTS, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_swap_layouts_pack(&req->rq_pill, op_data); + + payload = req_capsule_client_get(&req->rq_pill, &RMF_SWAP_LAYOUTS); + LASSERT(payload); + + *payload = *msl; + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + EXIT; + +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_iocontrol(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void __user *uarg) +{ + struct obd_device *obd = exp->exp_obd; + struct obd_ioctl_data *data = karg; + struct obd_import *imp = obd->u.cli.cl_import; + int rc; + ENTRY; + + if (!try_module_get(THIS_MODULE)) { + CERROR("%s: cannot get module '%s'\n", obd->obd_name, + module_name(THIS_MODULE)); + return -EINVAL; + } + switch (cmd) { + case OBD_IOC_FID2PATH: + rc = mdc_ioc_fid2path(exp, karg); + GOTO(out, rc); + case LL_IOC_HSM_CT_START: + rc = mdc_ioc_hsm_ct_start(exp, karg); + /* ignore if it was already registered on this MDS. */ + if (rc == -EEXIST) + rc = 0; + GOTO(out, rc); + case LL_IOC_HSM_PROGRESS: + rc = mdc_ioc_hsm_progress(exp, karg); + GOTO(out, rc); + case LL_IOC_HSM_STATE_GET: + rc = mdc_ioc_hsm_state_get(exp, karg); + GOTO(out, rc); + case LL_IOC_HSM_STATE_SET: + rc = mdc_ioc_hsm_state_set(exp, karg); + GOTO(out, rc); + case LL_IOC_HSM_ACTION: + rc = mdc_ioc_hsm_current_action(exp, karg); + GOTO(out, rc); + case LL_IOC_HSM_REQUEST: + rc = mdc_ioc_hsm_request(exp, karg); + GOTO(out, rc); + case OBD_IOC_CLIENT_RECOVER: + rc = ptlrpc_recover_import(imp, data->ioc_inlbuf1, 0); + if (rc < 0) + GOTO(out, rc); + GOTO(out, rc = 0); + case IOC_OSC_SET_ACTIVE: + rc = ptlrpc_set_import_active(imp, data->ioc_offset); + GOTO(out, rc); + /* + * Normally IOC_OBD_STATFS, OBD_IOC_QUOTACTL iocontrol are handled by + * LMV instead of MDC. But when the cluster is upgraded from 1.8, + * there'd be no LMV layer thus we might be called here. Eventually + * this code should be removed. + * bz20731, LU-592. 
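+ *
+ * Note that the IOC_OBD_STATFS handler below only accepts target
+ * index 0, since an MDC has a single target.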
+ */ + case IOC_OBD_STATFS: { + struct obd_statfs stat_buf = {0}; + + if (*((__u32 *) data->ioc_inlbuf2) != 0) + GOTO(out, rc = -ENODEV); + + /* copy UUID */ + if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(obd), + min((int)data->ioc_plen2, + (int)sizeof(struct obd_uuid)))) + GOTO(out, rc = -EFAULT); + + rc = mdc_statfs(NULL, obd->obd_self_export, &stat_buf, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + 0); + if (rc != 0) + GOTO(out, rc); + + if (copy_to_user(data->ioc_pbuf1, &stat_buf, + min((int) data->ioc_plen1, + (int) sizeof(stat_buf)))) + GOTO(out, rc = -EFAULT); + + GOTO(out, rc = 0); + } + case OBD_IOC_QUOTACTL: { + struct if_quotactl *qctl = karg; + struct obd_quotactl *oqctl; + + OBD_ALLOC_PTR(oqctl); + if (oqctl == NULL) + GOTO(out, rc = -ENOMEM); + + QCTL_COPY(oqctl, qctl); + rc = obd_quotactl(exp, oqctl); + if (rc == 0) { + QCTL_COPY(qctl, oqctl); + qctl->qc_valid = QC_MDTIDX; + qctl->obd_uuid = obd->u.cli.cl_target_uuid; + } + + OBD_FREE_PTR(oqctl); + GOTO(out, rc); + } + case LL_IOC_GET_CONNECT_FLAGS: + if (copy_to_user(uarg, exp_connect_flags_ptr(exp), + sizeof(*exp_connect_flags_ptr(exp)))) + GOTO(out, rc = -EFAULT); + + GOTO(out, rc = 0); + case LL_IOC_LOV_SWAP_LAYOUTS: + rc = mdc_ioc_swap_layouts(exp, karg); + GOTO(out, rc); + default: + CERROR("unrecognised ioctl: cmd = %#x\n", cmd); + GOTO(out, rc = -ENOTTY); + } +out: + module_put(THIS_MODULE); + + return rc; +} + +static int mdc_get_info_rpc(struct obd_export *exp, + u32 keylen, void *key, + u32 vallen, void *val) +{ + struct obd_import *imp = class_exp2cliimp(exp); + struct ptlrpc_request *req; + char *tmp; + int rc = -EINVAL; + ENTRY; + + req = ptlrpc_request_alloc(imp, &RQF_MDS_GET_INFO); + if (req == NULL) + RETURN(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_KEY, + RCL_CLIENT, keylen); + req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_VALLEN, + RCL_CLIENT, sizeof(vallen)); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GET_INFO); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_GETINFO_KEY); + memcpy(tmp, key, keylen); + tmp = req_capsule_client_get(&req->rq_pill, &RMF_GETINFO_VALLEN); + memcpy(tmp, &vallen, sizeof(vallen)); + + req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_VAL, + RCL_SERVER, vallen); + ptlrpc_request_set_replen(req); + + /* if server failed to resolve FID, and OI scrub not able to fix it, it + * will return -EINPROGRESS, ptlrpc_queue_wait() will keep retrying, + * set request interruptible to avoid deadlock. 
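+ * Setting rq_allow_intr lets a signal break out of that resend
+ * loop.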
+ */ + if (KEY_IS(KEY_FID2PATH)) + req->rq_allow_intr = 1; + + rc = ptlrpc_queue_wait(req); + /* -EREMOTE means the get_info result is partial, and it needs to + * continue on another MDT, see fid2path part in lmv_iocontrol */ + if (rc == 0 || rc == -EREMOTE) { + tmp = req_capsule_server_get(&req->rq_pill, &RMF_GETINFO_VAL); + memcpy(val, tmp, vallen); + if (req_capsule_rep_need_swab(&req->rq_pill)) { + if (KEY_IS(KEY_FID2PATH)) + lustre_swab_fid2path(val); + } + } + ptlrpc_req_finished(req); + + RETURN(rc); +} + +static void lustre_swab_hai(struct hsm_action_item *h) +{ + __swab32s(&h->hai_len); + __swab32s(&h->hai_action); + lustre_swab_lu_fid(&h->hai_fid); + lustre_swab_lu_fid(&h->hai_dfid); + __swab64s(&h->hai_cookie); + __swab64s(&h->hai_extent.offset); + __swab64s(&h->hai_extent.length); + __swab64s(&h->hai_gid); +} + +static void lustre_swab_hal(struct hsm_action_list *h) +{ + struct hsm_action_item *hai; + __u32 i; + + __swab32s(&h->hal_version); + __swab32s(&h->hal_count); + __swab32s(&h->hal_archive_id); + __swab64s(&h->hal_flags); + hai = hai_first(h); + for (i = 0; i < h->hal_count; i++, hai = hai_next(hai)) + lustre_swab_hai(hai); +} + +static void lustre_swab_kuch(struct kuc_hdr *l) +{ + __swab16s(&l->kuc_magic); + /* __u8 l->kuc_transport */ + __swab16s(&l->kuc_msgtype); + __swab16s(&l->kuc_msglen); +} + +static int mdc_ioc_hsm_ct_start(struct obd_export *exp, + struct lustre_kernelcomm *lk) +{ + struct obd_import *imp = class_exp2cliimp(exp); + int rc = 0; + + if (lk->lk_group != KUC_GRP_HSM) { + CERROR("Bad copytool group %d\n", lk->lk_group); + return -EINVAL; + } + + CDEBUG(D_HSM, "CT start r%d w%d u%d g%d f%#x\n", lk->lk_rfd, lk->lk_wfd, + lk->lk_uid, lk->lk_group, lk->lk_flags); + + if (lk->lk_flags & LK_FLG_STOP) { + /* Unregister with the coordinator */ + rc = mdc_ioc_hsm_ct_unregister(imp); + } else { + __u32 *archives = NULL; + + if ((lk->lk_flags & LK_FLG_DATANR) && lk->lk_data_count > 0) + archives = lk->lk_data; + + rc = mdc_ioc_hsm_ct_register(imp, lk->lk_data_count, archives); + } + + return rc; +} + +/** + * Send a message to any listening copytools + * @param val KUC message (kuc_hdr + hsm_action_list) + * @param len total length of message + */ +static int mdc_hsm_copytool_send(const struct obd_uuid *uuid, + size_t len, void *val) +{ + struct kuc_hdr *lh = (struct kuc_hdr *)val; + struct hsm_action_list *hal = (struct hsm_action_list *)(lh + 1); + int rc; + ENTRY; + + if (len < sizeof(*lh) + sizeof(*hal)) { + CERROR("Short HSM message %zu < %zu\n", len, + sizeof(*lh) + sizeof(*hal)); + RETURN(-EPROTO); + } + if (lh->kuc_magic == __swab16(KUC_MAGIC)) { + lustre_swab_kuch(lh); + lustre_swab_hal(hal); + } else if (lh->kuc_magic != KUC_MAGIC) { + CERROR("Bad magic %x!=%x\n", lh->kuc_magic, KUC_MAGIC); + RETURN(-EPROTO); + } + + CDEBUG(D_HSM, " Received message mg=%x t=%d m=%d l=%d actions=%d " + "on %s\n", + lh->kuc_magic, lh->kuc_transport, lh->kuc_msgtype, + lh->kuc_msglen, hal->hal_count, hal->hal_fsname); + + /* Broadcast to HSM listeners */ + rc = libcfs_kkuc_group_put(uuid, KUC_GRP_HSM, lh); + + RETURN(rc); +} + +/** + * callback function passed to kuc for re-registering each HSM copytool + * running on MDC, after MDT shutdown/recovery. 
+ * @param data copytool registration data + * @param cb_arg callback argument (obd_import) + */ +static int mdc_hsm_ct_reregister(void *data, void *cb_arg) +{ + struct obd_import *imp = (struct obd_import *)cb_arg; + struct kkuc_ct_data *kcd = data; + __u32 *archives = NULL; + int rc; + + if (kcd == NULL || + (kcd->kcd_magic != KKUC_CT_DATA_ARRAY_MAGIC && + kcd->kcd_magic != KKUC_CT_DATA_BITMAP_MAGIC)) + return -EPROTO; + + if (kcd->kcd_magic == KKUC_CT_DATA_BITMAP_MAGIC) { + CDEBUG(D_HA, "%s: recover copytool registration to MDT " + "(archive=%#x)\n", imp->imp_obd->obd_name, + kcd->kcd_nr_archives); + } else { + CDEBUG(D_HA, "%s: recover copytool registration to MDT " + "(archive nr = %u)\n", + imp->imp_obd->obd_name, kcd->kcd_nr_archives); + if (kcd->kcd_nr_archives != 0) + archives = kcd->kcd_archives; + } + + rc = mdc_ioc_hsm_ct_register(imp, kcd->kcd_nr_archives, archives); + /* ignore error if the copytool is already registered */ + return (rc == -EEXIST) ? 0 : rc; +} + +/** + * Re-establish all kuc contexts with MDT + * after MDT shutdown/recovery. + */ +static int mdc_kuc_reregister(struct obd_import *imp) +{ + /* re-register HSM agents */ + return libcfs_kkuc_group_foreach(&imp->imp_obd->obd_uuid, KUC_GRP_HSM, + mdc_hsm_ct_reregister, imp); +} + +static int mdc_set_info_async(const struct lu_env *env, + struct obd_export *exp, + u32 keylen, void *key, + u32 vallen, void *val, + struct ptlrpc_request_set *set) +{ + struct obd_import *imp = class_exp2cliimp(exp); + int rc; + ENTRY; + + if (KEY_IS(KEY_READ_ONLY)) { + if (vallen != sizeof(int)) + RETURN(-EINVAL); + + spin_lock(&imp->imp_lock); + if (*((int *)val)) { + imp->imp_connect_flags_orig |= OBD_CONNECT_RDONLY; + imp->imp_connect_data.ocd_connect_flags |= + OBD_CONNECT_RDONLY; + } else { + imp->imp_connect_flags_orig &= ~OBD_CONNECT_RDONLY; + imp->imp_connect_data.ocd_connect_flags &= + ~OBD_CONNECT_RDONLY; + } + spin_unlock(&imp->imp_lock); + + rc = do_set_info_async(imp, MDS_SET_INFO, LUSTRE_MDS_VERSION, + keylen, key, vallen, val, set); + RETURN(rc); + } + if (KEY_IS(KEY_CHANGELOG_CLEAR)) { + rc = do_set_info_async(imp, MDS_SET_INFO, LUSTRE_MDS_VERSION, + keylen, key, vallen, val, set); + RETURN(rc); + } + if (KEY_IS(KEY_HSM_COPYTOOL_SEND)) { + rc = mdc_hsm_copytool_send(&imp->imp_obd->obd_uuid, vallen, + val); + RETURN(rc); + } + + if (KEY_IS(KEY_DEFAULT_EASIZE)) { + __u32 *default_easize = val; + + exp->exp_obd->u.cli.cl_default_mds_easize = *default_easize; + RETURN(0); + } + + rc = osc_set_info_async(env, exp, keylen, key, vallen, val, set); + RETURN(rc); +} + +static int mdc_get_info(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, __u32 *vallen, void *val) +{ + int rc = -EINVAL; + + if (KEY_IS(KEY_MAX_EASIZE)) { + __u32 mdsize, *max_easize; + + if (*vallen != sizeof(int)) + RETURN(-EINVAL); + mdsize = *(__u32 *)val; + if (mdsize > exp->exp_obd->u.cli.cl_max_mds_easize) + exp->exp_obd->u.cli.cl_max_mds_easize = mdsize; + max_easize = val; + *max_easize = exp->exp_obd->u.cli.cl_max_mds_easize; + RETURN(0); + } else if (KEY_IS(KEY_DEFAULT_EASIZE)) { + __u32 *default_easize; + + if (*vallen != sizeof(int)) + RETURN(-EINVAL); + default_easize = val; + *default_easize = exp->exp_obd->u.cli.cl_default_mds_easize; + RETURN(0); + } else if (KEY_IS(KEY_CONN_DATA)) { + struct obd_import *imp = class_exp2cliimp(exp); + struct obd_connect_data *data = val; + + if (*vallen != sizeof(*data)) + RETURN(-EINVAL); + + *data = imp->imp_connect_data; + RETURN(0); + } else if (KEY_IS(KEY_TGT_COUNT)) { + *((__u32 
*)val) = 1; + RETURN(0); + } + + rc = mdc_get_info_rpc(exp, keylen, key, *vallen, val); + + RETURN(rc); +} + +static int mdc_fsync(struct obd_export *exp, const struct lu_fid *fid, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + int rc; + ENTRY; + + *request = NULL; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_SYNC); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_SYNC); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_pack_body(&req->rq_pill, fid, 0, 0, -1, 0); + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + ptlrpc_req_finished(req); + else + *request = req; + RETURN(rc); +} + +struct mdc_rmfid_args { + int *mra_rcs; + int mra_nr; +}; + +int mdc_rmfid_interpret(const struct lu_env *env, struct ptlrpc_request *req, + void *args, int rc) +{ + struct mdc_rmfid_args *aa; + int *rcs, size; + ENTRY; + + if (!rc) { + aa = ptlrpc_req_async_args(aa, req); + + size = req_capsule_get_size(&req->rq_pill, &RMF_RCS, + RCL_SERVER); + LASSERT(size == sizeof(int) * aa->mra_nr); + rcs = req_capsule_server_get(&req->rq_pill, &RMF_RCS); + LASSERT(rcs); + LASSERT(aa->mra_rcs); + LASSERT(aa->mra_nr); + memcpy(aa->mra_rcs, rcs, size); + } + + RETURN(rc); +} + +static int mdc_rmfid(struct obd_export *exp, struct fid_array *fa, + int *rcs, struct ptlrpc_request_set *set) +{ + struct ptlrpc_request *req; + struct mdc_rmfid_args *aa; + struct mdt_body *b; + struct lu_fid *tmp; + int rc, flen; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_RMFID); + if (req == NULL) + RETURN(-ENOMEM); + + flen = fa->fa_nr * sizeof(struct lu_fid); + req_capsule_set_size(&req->rq_pill, &RMF_FID_ARRAY, + RCL_CLIENT, flen); + req_capsule_set_size(&req->rq_pill, &RMF_FID_ARRAY, + RCL_SERVER, flen); + req_capsule_set_size(&req->rq_pill, &RMF_RCS, + RCL_SERVER, fa->fa_nr * sizeof(__u32)); + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_RMFID); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + tmp = req_capsule_client_get(&req->rq_pill, &RMF_FID_ARRAY); + memcpy(tmp, fa->fa_fids, flen); + + mdc_pack_body(&req->rq_pill, NULL, 0, 0, -1, 0); + b = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY); + b->mbo_ctime = ktime_get_real_seconds(); + + ptlrpc_request_set_replen(req); + + LASSERT(rcs); + aa = ptlrpc_req_async_args(aa, req); + aa->mra_rcs = rcs; + aa->mra_nr = fa->fa_nr; + req->rq_interpret_reply = mdc_rmfid_interpret; + + ptlrpc_set_add_req(set, req); + ptlrpc_check_set(NULL, set); + + RETURN(rc); +} + +static int mdc_import_event(struct obd_device *obd, struct obd_import *imp, + enum obd_import_event event) +{ + struct client_obd *cli = &obd->u.cli; + int rc = 0; + + LASSERT(imp->imp_obd == obd); + + switch (event) { + case IMP_EVENT_DISCON: + spin_lock(&cli->cl_loi_list_lock); + cli->cl_avail_grant = 0; + cli->cl_lost_grant = 0; + spin_unlock(&cli->cl_loi_list_lock); + break; + case IMP_EVENT_INACTIVE: + /* + * Flush current sequence to make client obtain new one + * from server in case of disconnect/reconnect. + */ + down_read(&cli->cl_seq_rwsem); + if (cli->cl_seq) + seq_client_flush(cli->cl_seq); + up_read(&cli->cl_seq_rwsem); + + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE); + break; + case IMP_EVENT_INVALIDATE: { + struct ldlm_namespace *ns = obd->obd_namespace; + struct lu_env *env; + __u16 refcheck; + + ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + /* Reset grants. 
All pages go to failing rpcs due to
+ * the invalid import.
+ */
+ osc_io_unplug(env, cli, NULL);
+
+ cfs_hash_for_each_nolock(ns->ns_rs_hash,
+ osc_ldlm_resource_invalidate,
+ env, 0);
+ cl_env_put(env, &refcheck);
+ ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
+ } else {
+ rc = PTR_ERR(env);
+ }
+ break;
+ }
+ case IMP_EVENT_ACTIVE:
+ rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE);
+ /* redo the kuc registration after reconnecting */
+ if (rc == 0)
+ rc = mdc_kuc_reregister(imp);
+ break;
+ case IMP_EVENT_OCD: {
+ struct obd_connect_data *ocd = &imp->imp_connect_data;
+
+ if (OCD_HAS_FLAG(ocd, GRANT))
+ osc_init_grant(cli, ocd);
+
+ md_init_ea_size(obd->obd_self_export, ocd->ocd_max_easize, 0);
+ rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD);
+ break;
+ }
+ case IMP_EVENT_DEACTIVATE:
+ case IMP_EVENT_ACTIVATE:
+ break;
+ default:
+ CERROR("Unknown import event %x\n", event);
+ LBUG();
+ }
+ RETURN(rc);
+}
+
+int mdc_fid_alloc(const struct lu_env *env, struct obd_export *exp,
+ struct lu_fid *fid, struct md_op_data *op_data)
+{
+ struct client_obd *cli = &exp->exp_obd->u.cli;
+ int rc = -EIO;
+
+ ENTRY;
+
+ down_read(&cli->cl_seq_rwsem);
+ if (cli->cl_seq)
+ rc = seq_client_alloc_fid(env, cli->cl_seq, fid);
+ up_read(&cli->cl_seq_rwsem);
+
+ RETURN(rc);
+}
+
+static struct obd_uuid *mdc_get_uuid(struct obd_export *exp)
+{
+ struct client_obd *cli = &exp->exp_obd->u.cli;
+ return &cli->cl_target_uuid;
+}
+
+/**
+ * Determine whether a lock can be canceled instead of being replayed
+ * during recovery. Returns non-zero if the lock can be canceled, or
+ * zero if it must be replayed.
+ */
+static int mdc_cancel_weight(struct ldlm_lock *lock)
+{
+ if (lock->l_resource->lr_type != LDLM_IBITS)
+ RETURN(0);
+
+ /* FIXME: if we ever get into a situation where there are too many
+ * opened files with open locks on a single node, then we really
+ * should replay these open locks to re-get them */
+ if (lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_OPEN)
+ RETURN(0);
+
+ /* Special case for DoM locks, cancel only unused and granted locks */
+ if (ldlm_has_dom(lock) &&
+ (lock->l_granted_mode != lock->l_req_mode ||
+ osc_ldlm_weigh_ast(lock) != 0))
+ RETURN(0);
+
+ RETURN(1);
+}
+
+static int mdc_resource_inode_free(struct ldlm_resource *res)
+{
+ if (res->lr_lvb_inode)
+ res->lr_lvb_inode = NULL;
+
+ return 0;
+}
+
+static struct ldlm_valblock_ops inode_lvbo = {
+ .lvbo_free = mdc_resource_inode_free
+};
+
+static int mdc_llog_init(struct obd_device *obd)
+{
+ struct obd_llog_group *olg = &obd->obd_olg;
+ struct llog_ctxt *ctxt;
+ int rc;
+
+ ENTRY;
+
+ rc = llog_setup(NULL, obd, olg, LLOG_CHANGELOG_REPL_CTXT, obd,
+ &llog_client_ops);
+ if (rc < 0)
+ RETURN(rc);
+
+ ctxt = llog_group_get_ctxt(olg, LLOG_CHANGELOG_REPL_CTXT);
+ llog_initiator_connect(ctxt);
+ llog_ctxt_put(ctxt);
+
+ RETURN(0);
+}
+
+static void mdc_llog_finish(struct obd_device *obd)
+{
+ struct llog_ctxt *ctxt;
+
+ ENTRY;
+
+ ctxt = llog_get_context(obd, LLOG_CHANGELOG_REPL_CTXT);
+ if (ctxt != NULL)
+ llog_cleanup(NULL, ctxt);
+
+ EXIT;
+}
+
+int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg)
+{
+ int rc;
+
+ ENTRY;
+
+ rc = osc_setup_common(obd, cfg);
+ if (rc < 0)
+ RETURN(rc);
+
+ rc = mdc_tunables_init(obd);
+ if (rc)
+ GOTO(err_osc_cleanup, rc);
+
+ obd->u.cli.cl_dom_min_inline_repsize = MDC_DOM_DEF_INLINE_REPSIZE;
+ obd->u.cli.cl_lsom_update = true;
+
+ ns_register_cancel(obd->obd_namespace, mdc_cancel_weight);
+
+ obd->obd_namespace->ns_lvbo = &inode_lvbo;
+
+ rc = mdc_llog_init(obd);
+ if
(rc) { + CERROR("%s: failed to setup llogging subsystems: rc = %d\n", + obd->obd_name, rc); + GOTO(err_llog_cleanup, rc); + } + + rc = mdc_changelog_cdev_init(obd); + if (rc) { + CERROR("%s: failed to setup changelog char device: rc = %d\n", + obd->obd_name, rc); + GOTO(err_changelog_cleanup, rc); + } + + RETURN(rc); + +err_changelog_cleanup: + mdc_llog_finish(obd); +err_llog_cleanup: + lprocfs_free_md_stats(obd); + ptlrpc_lprocfs_unregister_obd(obd); +err_osc_cleanup: + osc_cleanup_common(obd); + return rc; +} + +/* Initialize the default and maximum LOV EA sizes. This allows + * us to make MDS RPCs with large enough reply buffers to hold a default + * sized EA without having to calculate this (via a call into the + * LOV + OSCs) each time we make an RPC. The maximum size is also tracked + * but not used to avoid wastefully vmalloc()'ing large reply buffers when + * a large number of stripes is possible. If a larger reply buffer is + * required it will be reallocated in the ptlrpc layer due to overflow. + */ +static int mdc_init_ea_size(struct obd_export *exp, __u32 easize, + __u32 def_easize) +{ + struct obd_device *obd = exp->exp_obd; + struct client_obd *cli = &obd->u.cli; + ENTRY; + + if (cli->cl_max_mds_easize < easize) + cli->cl_max_mds_easize = easize; + + if (cli->cl_default_mds_easize < def_easize) + cli->cl_default_mds_easize = def_easize; + + RETURN(0); +} + +static int mdc_precleanup(struct obd_device *obd) +{ + ENTRY; + + osc_precleanup_common(obd); + mdc_changelog_cdev_finish(obd); + + obd_cleanup_client_import(obd); + ptlrpc_lprocfs_unregister_obd(obd); + lprocfs_free_md_stats(obd); + mdc_llog_finish(obd); + RETURN(0); +} + +static int mdc_cleanup(struct obd_device *obd) +{ + return osc_cleanup_common(obd); +} + +static const struct obd_ops mdc_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = mdc_setup, + .o_precleanup = mdc_precleanup, + .o_cleanup = mdc_cleanup, + .o_add_conn = client_import_add_conn, + .o_del_conn = client_import_del_conn, + .o_connect = client_connect_import, + .o_reconnect = osc_reconnect, + .o_disconnect = osc_disconnect, + .o_iocontrol = mdc_iocontrol, + .o_set_info_async = mdc_set_info_async, + .o_statfs = mdc_statfs, + .o_statfs_async = mdc_statfs_async, + .o_fid_init = client_fid_init, + .o_fid_fini = client_fid_fini, + .o_fid_alloc = mdc_fid_alloc, + .o_import_event = mdc_import_event, + .o_get_info = mdc_get_info, + .o_get_uuid = mdc_get_uuid, + .o_quotactl = mdc_quotactl, +}; + +static const struct md_ops mdc_md_ops = { + .m_get_root = mdc_get_root, + .m_null_inode = mdc_null_inode, + .m_close = mdc_close, + .m_create = mdc_create, + .m_enqueue = mdc_enqueue, + .m_getattr = mdc_getattr, + .m_getattr_name = mdc_getattr_name, + .m_intent_lock = mdc_intent_lock, + .m_link = mdc_link, + .m_rename = mdc_rename, + .m_setattr = mdc_setattr, + .m_setxattr = mdc_setxattr, + .m_getxattr = mdc_getxattr, + .m_fsync = mdc_fsync, + .m_file_resync = mdc_file_resync, + .m_read_page = mdc_read_page, + .m_unlink = mdc_unlink, + .m_cancel_unused = mdc_cancel_unused, + .m_init_ea_size = mdc_init_ea_size, + .m_set_lock_data = mdc_set_lock_data, + .m_lock_match = mdc_lock_match, + .m_get_lustre_md = mdc_get_lustre_md, + .m_free_lustre_md = mdc_free_lustre_md, + .m_set_open_replay_data = mdc_set_open_replay_data, + .m_clear_open_replay_data = mdc_clear_open_replay_data, + .m_intent_getattr_async = mdc_intent_getattr_async, + .m_revalidate_lock = mdc_revalidate_lock, + .m_rmfid = mdc_rmfid, +}; + +dev_t mdc_changelog_dev; +struct class *mdc_changelog_class; +static 
int __init mdc_init(void) +{ + int rc = 0; + rc = alloc_chrdev_region(&mdc_changelog_dev, 0, + MDC_CHANGELOG_DEV_COUNT, + MDC_CHANGELOG_DEV_NAME); + if (rc) + return rc; + + mdc_changelog_class = class_create(THIS_MODULE, MDC_CHANGELOG_DEV_NAME); + if (IS_ERR(mdc_changelog_class)) { + rc = PTR_ERR(mdc_changelog_class); + goto out_dev; + } + + rc = class_register_type(&mdc_obd_ops, &mdc_md_ops, true, + LUSTRE_MDC_NAME, &mdc_device_type); + if (rc) + goto out_class; + + return 0; + +out_class: + class_destroy(mdc_changelog_class); +out_dev: + unregister_chrdev_region(mdc_changelog_dev, MDC_CHANGELOG_DEV_COUNT); + return rc; +} + +static void __exit mdc_exit(void) +{ + class_unregister_type(LUSTRE_MDC_NAME); + class_destroy(mdc_changelog_class); + unregister_chrdev_region(mdc_changelog_dev, MDC_CHANGELOG_DEV_COUNT); + idr_destroy(&mdc_changelog_minor_idr); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre Metadata Client"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(mdc_init); +module_exit(mdc_exit); diff --git a/drivers/staging/lustrefsx/lustre/mgc/lproc_mgc.c b/drivers/staging/lustrefsx/lustre/mgc/lproc_mgc.c new file mode 100644 index 0000000000000..051e31559c647 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mgc/lproc_mgc.c @@ -0,0 +1,132 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include "mgc_internal.h" + +#ifdef CONFIG_PROC_FS + +LDEBUGFS_SEQ_FOPS_RO_TYPE(mgc, connect_flags); + +LDEBUGFS_SEQ_FOPS_RO_TYPE(mgc, server_uuid); + +LDEBUGFS_SEQ_FOPS_RO_TYPE(mgc, import); + +LDEBUGFS_SEQ_FOPS_RO_TYPE(mgc, state); + +static int mgc_ir_state_seq_show(struct seq_file *m, void *v) +{ + return lprocfs_mgc_rd_ir_state(m, m->private); +} + +LDEBUGFS_SEQ_FOPS_RO(mgc_ir_state); + +struct ldebugfs_vars ldebugfs_mgc_obd_vars[] = { + { .name = "connect_flags", + .fops = &mgc_connect_flags_fops }, + { .name = "mgs_server_uuid", + .fops = &mgc_server_uuid_fops }, + { .name = "import", + .fops = &mgc_import_fops }, + { .name = "state", + .fops = &mgc_state_fops }, + { .name = "ir_state", + .fops = &mgc_ir_state_fops }, + { NULL } +}; +#endif /* CONFIG_PROC_FS */ + +LUSTRE_ATTR(mgs_conn_uuid, 0444, conn_uuid_show, NULL); +LUSTRE_RO_ATTR(conn_uuid); + +LUSTRE_RW_ATTR(ping); + +ssize_t dynamic_nids_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + ssize_t count; + + ENTRY; + count = snprintf(buf, PAGE_SIZE, "%u\n", obd->obd_dynamic_nids); + + RETURN(count); +} + +ssize_t dynamic_nids_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + bool val; + int rc; + + ENTRY; + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + spin_lock(&obd->obd_dev_lock); + obd->obd_dynamic_nids = val; + spin_unlock(&obd->obd_dev_lock); + + RETURN(count); +} + +LUSTRE_RW_ATTR(dynamic_nids); + +static struct attribute *mgc_attrs[] = { + &lustre_attr_mgs_conn_uuid.attr, + &lustre_attr_conn_uuid.attr, + &lustre_attr_ping.attr, + &lustre_attr_dynamic_nids.attr, + NULL, +}; + +KOBJ_ATTRIBUTE_GROUPS(mgc); + +int mgc_tunables_init(struct obd_device *obd) +{ + int rc; + + obd->obd_ktype.default_groups = KOBJ_ATTR_GROUPS(mgc); + obd->obd_debugfs_vars = ldebugfs_mgc_obd_vars; + rc = lprocfs_obd_setup(obd, true); + if (rc) + return rc; + + return sptlrpc_lprocfs_cliobd_attach(obd); +} diff --git a/drivers/staging/lustrefsx/lustre/mgc/mgc_internal.h b/drivers/staging/lustrefsx/lustre/mgc/mgc_internal.h new file mode 100644 index 0000000000000..2289972d1a82c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mgc/mgc_internal.h @@ -0,0 +1,74 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef _MGC_INTERNAL_H +#define _MGC_INTERNAL_H + +#include +#include +#include +#include +#include + +int mgc_tunables_init(struct obd_device *obd); +int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data); + +int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld); + +/* this timeout represents how many seconds MGC should wait before + * requeue config and recover lock to the MGS. We need to randomize this + * in order to not flood the MGS. + */ +#define MGC_TIMEOUT_MIN_SECONDS 5 + +extern unsigned int mgc_requeue_timeout_min; + +static inline bool cld_is_sptlrpc(struct config_llog_data *cld) +{ + return cld->cld_type == MGS_CFG_T_SPTLRPC; +} + +static inline bool cld_is_recover(struct config_llog_data *cld) +{ + return cld->cld_type == MGS_CFG_T_RECOVER; +} + +static inline bool cld_is_nodemap(struct config_llog_data *cld) +{ + return cld->cld_type == MGS_CFG_T_NODEMAP; +} + +static inline bool cld_is_barrier(struct config_llog_data *cld) +{ + return cld->cld_type == MGS_CFG_T_BARRIER; +} + +#endif /* _MGC_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/mgc/mgc_request.c b/drivers/staging/lustrefsx/lustre/mgc/mgc_request.c new file mode 100644 index 0000000000000..39df17e03959e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mgc/mgc_request.c @@ -0,0 +1,2333 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lustre/mgc/mgc_request.c
+ *
+ * Author: Nathan Rutman
+ */
+
+#define DEBUG_SUBSYSTEM S_MGC
+#define D_MGC D_CONFIG /*|D_WARNING*/
+
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "mgc_internal.h"
+
+static int mgc_name2resid(char *name, int len, struct ldlm_res_id *res_id,
+			  enum mgs_cfg_type type)
+{
+	__u64 resname = 0;
+
+	if (len > sizeof(resname)) {
+		CERROR("name too long: %s\n", name);
+		return -EINVAL;
+	}
+	if (len <= 0) {
+		CERROR("missing name: %s\n", name);
+		return -EINVAL;
+	}
+	memcpy(&resname, name, len);
+
+	/* Always use the same endianness for the resid */
+	memset(res_id, 0, sizeof(*res_id));
+	res_id->name[0] = cpu_to_le64(resname);
+	/* XXX: unfortunately, sptlrpc and config llog share one lock */
+	switch (type) {
+	case MGS_CFG_T_CONFIG:
+	case MGS_CFG_T_SPTLRPC:
+		resname = 0;
+		break;
+	case MGS_CFG_T_RECOVER:
+	case MGS_CFG_T_PARAMS:
+	case MGS_CFG_T_NODEMAP:
+	case MGS_CFG_T_BARRIER:
+		resname = type;
+		break;
+	default:
+		LBUG();
+	}
+	res_id->name[1] = cpu_to_le64(resname);
+	CDEBUG(D_MGC, "log %s to resid %#llx/%#llx (%.8s)\n", name,
+	       res_id->name[0], res_id->name[1], (char *)&res_id->name[0]);
+	return 0;
+}
+
+int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id,
+		     enum mgs_cfg_type type)
+{
+	/* fsname is at most 8 chars long, may contain "-".
+	 * e.g. "lustre", "SUN-000" */
+	return mgc_name2resid(fsname, strlen(fsname), res_id, type);
+}
+EXPORT_SYMBOL(mgc_fsname2resid);
+
+int mgc_logname2resid(char *logname, struct ldlm_res_id *res_id,
+		      enum mgs_cfg_type type)
+{
+	char *name_end;
+	int len;
+
+	/* logname consists of "fsname-nodetype".
+	 * e.g. "lustre-MDT0001", "SUN-000-client"
+	 * there is an exception: llog "params" */
+	name_end = strrchr(logname, '-');
+	if (!name_end)
+		len = strlen(logname);
+	else
+		len = name_end - logname;
+	return mgc_name2resid(logname, len, res_id, type);
+}
+EXPORT_SYMBOL(mgc_logname2resid);
+
+/********************** config llog list **********************/
+static LIST_HEAD(config_llog_list);
+static DEFINE_SPINLOCK(config_list_lock);	/* protects config_llog_list */
+
+/* Take a reference to a config log */
+static int config_log_get(struct config_llog_data *cld)
+{
+	ENTRY;
+	atomic_inc(&cld->cld_refcount);
+	CDEBUG(D_INFO, "log %s (%p) refs %d\n", cld->cld_logname, cld,
+	       atomic_read(&cld->cld_refcount));
+	RETURN(0);
+}
+
+/* Drop a reference to a config log. 
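+ * The final put uses atomic_dec_and_lock(), so the entry is unlinked
+ * under config_list_lock and a cld with zero refcount is never left
+ * visible on the list.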
When no longer referenced, + we can free the config log data */ +static void config_log_put(struct config_llog_data *cld) +{ + ENTRY; + + if (unlikely(!cld)) + RETURN_EXIT; + + CDEBUG(D_INFO, "log %s(%p) refs %d\n", cld->cld_logname, cld, + atomic_read(&cld->cld_refcount)); + LASSERT(atomic_read(&cld->cld_refcount) > 0); + + /* spinlock to make sure no item with 0 refcount in the list */ + if (atomic_dec_and_lock(&cld->cld_refcount, &config_list_lock)) { + list_del(&cld->cld_list_chain); + spin_unlock(&config_list_lock); + + CDEBUG(D_MGC, "dropping config log %s\n", cld->cld_logname); + + config_log_put(cld->cld_barrier); + config_log_put(cld->cld_recover); + config_log_put(cld->cld_params); + config_log_put(cld->cld_nodemap); + config_log_put(cld->cld_sptlrpc); + if (cld_is_sptlrpc(cld)) { + cld->cld_stopping = 1; + sptlrpc_conf_log_stop(cld->cld_logname); + } + + class_export_put(cld->cld_mgcexp); + OBD_FREE(cld, sizeof(*cld) + strlen(cld->cld_logname) + 1); + } + + EXIT; +} + +/* Find a config log by name */ +static +struct config_llog_data *config_log_find(char *logname, + struct config_llog_instance *cfg) +{ + struct config_llog_data *cld; + struct config_llog_data *found = NULL; + unsigned long cfg_instance; + + ENTRY; + LASSERT(logname != NULL); + + cfg_instance = cfg ? cfg->cfg_instance : 0; + spin_lock(&config_list_lock); + list_for_each_entry(cld, &config_llog_list, cld_list_chain) { + /* check if cfg_instance is the one we want */ + if (cfg_instance != cld->cld_cfg.cfg_instance) + continue; + + /* instance may be NULL, should check name */ + if (strcmp(logname, cld->cld_logname) == 0) { + found = cld; + config_log_get(found); + break; + } + } + spin_unlock(&config_list_lock); + RETURN(found); +} + +static +struct config_llog_data *do_config_log_add(struct obd_device *obd, + char *logname, + enum mgs_cfg_type type, + struct config_llog_instance *cfg, + struct super_block *sb) +{ + struct config_llog_data *cld; + int rc; + + ENTRY; + + CDEBUG(D_MGC, "do adding config log %s-%016lx\n", logname, + cfg ? 
cfg->cfg_instance : 0); + + OBD_ALLOC(cld, sizeof(*cld) + strlen(logname) + 1); + if (!cld) + RETURN(ERR_PTR(-ENOMEM)); + + rc = mgc_logname2resid(logname, &cld->cld_resid, type); + if (rc) { + OBD_FREE(cld, sizeof(*cld) + strlen(cld->cld_logname) + 1); + RETURN(ERR_PTR(rc)); + } + + strcpy(cld->cld_logname, logname); + if (cfg) + cld->cld_cfg = *cfg; + else + cld->cld_cfg.cfg_callback = class_config_llog_handler; + mutex_init(&cld->cld_lock); + cld->cld_cfg.cfg_last_idx = 0; + cld->cld_cfg.cfg_flags = 0; + cld->cld_cfg.cfg_sb = sb; + cld->cld_type = type; + atomic_set(&cld->cld_refcount, 1); + + /* Keep the mgc around until we are done */ + cld->cld_mgcexp = class_export_get(obd->obd_self_export); + + if (cld_is_sptlrpc(cld)) + sptlrpc_conf_log_start(logname); + + spin_lock(&config_list_lock); + list_add(&cld->cld_list_chain, &config_llog_list); + spin_unlock(&config_list_lock); + + if (cld_is_sptlrpc(cld) || cld_is_nodemap(cld) || cld_is_barrier(cld)) { + rc = mgc_process_log(obd, cld); + if (rc && rc != -ENOENT) + CERROR("%s: failed processing log, type %d: rc = %d\n", + obd->obd_name, type, rc); + } + + RETURN(cld); +} + +static struct config_llog_data *config_recover_log_add(struct obd_device *obd, + char *fsname, + struct config_llog_instance *cfg, + struct super_block *sb) +{ + struct config_llog_instance lcfg = *cfg; + struct lustre_sb_info *lsi = s2lsi(sb); + struct config_llog_data *cld; + char logname[32]; + + if (IS_OST(lsi)) + return NULL; + + /* for osp-on-ost, see lustre_start_osp() */ + if (IS_MDT(lsi) && lcfg.cfg_instance) + return NULL; + + /* We have to use different llog for clients and MDTs for DNE, + * where only clients are notified if one of DNE server restarts. + */ + LASSERT(strlen(fsname) < sizeof(logname) / 2); + strncpy(logname, fsname, sizeof(logname)); + if (IS_SERVER(lsi)) { /* mdt */ + LASSERT(lcfg.cfg_instance == 0); + lcfg.cfg_instance = ll_get_cfg_instance(sb); + strncat(logname, "-mdtir", sizeof(logname)); + } else { + LASSERT(lcfg.cfg_instance != 0); + strncat(logname, "-cliir", sizeof(logname)); + } + + cld = do_config_log_add(obd, logname, MGS_CFG_T_RECOVER, &lcfg, sb); + return cld; +} + +static struct config_llog_data * +config_log_find_or_add(struct obd_device *obd, char *logname, + struct super_block *sb, enum mgs_cfg_type type, + struct config_llog_instance *cfg) +{ + struct config_llog_instance lcfg = *cfg; + struct config_llog_data *cld; + + /* Note class_config_llog_handler() depends on getting "obd" back */ + lcfg.cfg_instance = sb ? ll_get_cfg_instance(sb) : (unsigned long)obd; + + cld = config_log_find(logname, &lcfg); + if (unlikely(cld != NULL)) + return cld; + + return do_config_log_add(obd, logname, type, &lcfg, sb); +} + +/** Add this log to the list of active logs watched by an MGC. + * Active means we're watching for updates. + * We have one active log per "mount" - client instance or servername. + * Each instance may be at a different point in the log. 
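+ *
+ * For example (names illustrative), a client mount of filesystem
+ * "lustre" typically watches "lustre-client" together with the
+ * dependent logs created below: "lustre-sptlrpc", PARAMS_FILENAME,
+ * the nodemap log and, under imperative recovery, "lustre-cliir".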
+ */ +static struct config_llog_data * +config_log_add(struct obd_device *obd, char *logname, + struct config_llog_instance *cfg, struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct config_llog_data *cld = NULL; + struct config_llog_data *sptlrpc_cld = NULL; + struct config_llog_data *params_cld = NULL; + struct config_llog_data *nodemap_cld = NULL; + struct config_llog_data *barrier_cld = NULL; + char seclogname[32]; + char *ptr; + int rc; + bool locked = false; + ENTRY; + + CDEBUG(D_MGC, "add config log %s-%016lx\n", logname, + cfg->cfg_instance); + + /* + * for each regular log, the depended sptlrpc log name is + * -sptlrpc. multiple regular logs may share one sptlrpc log. + */ + ptr = strrchr(logname, '-'); + if (ptr == NULL || ptr - logname > 8) { + CERROR("logname %s is too long\n", logname); + RETURN(ERR_PTR(-EINVAL)); + } + + memcpy(seclogname, logname, ptr - logname); + strcpy(seclogname + (ptr - logname), "-sptlrpc"); + + if (cfg->cfg_sub_clds & CONFIG_SUB_SPTLRPC) { + sptlrpc_cld = config_log_find_or_add(obd, seclogname, NULL, + MGS_CFG_T_SPTLRPC, cfg); + if (IS_ERR(sptlrpc_cld)) { + CERROR("%s: can't create sptlrpc log %s: rc = %ld\n", + obd->obd_name, seclogname, PTR_ERR(sptlrpc_cld)); + RETURN(sptlrpc_cld); + } + } + + if (!IS_MGS(lsi) && cfg->cfg_sub_clds & CONFIG_SUB_NODEMAP) { + nodemap_cld = config_log_find_or_add(obd, LUSTRE_NODEMAP_NAME, + NULL, MGS_CFG_T_NODEMAP, + cfg); + if (IS_ERR(nodemap_cld)) { + rc = PTR_ERR(nodemap_cld); + CERROR("%s: cannot create nodemap log: rc = %d\n", + obd->obd_name, rc); + GOTO(out_sptlrpc, rc); + } + } + + if (cfg->cfg_sub_clds & CONFIG_SUB_PARAMS) { + params_cld = config_log_find_or_add(obd, PARAMS_FILENAME, sb, + MGS_CFG_T_PARAMS, cfg); + if (IS_ERR(params_cld)) { + rc = PTR_ERR(params_cld); + CERROR("%s: can't create params log: rc = %d\n", + obd->obd_name, rc); + GOTO(out_nodemap, rc); + } + } + + if (IS_MDT(s2lsi(sb)) && cfg->cfg_sub_clds & CONFIG_SUB_BARRIER) { + snprintf(seclogname + (ptr - logname), sizeof(seclogname) - 1, + "-%s", BARRIER_FILENAME); + barrier_cld = config_log_find_or_add(obd, seclogname, sb, + MGS_CFG_T_BARRIER, cfg); + if (IS_ERR(barrier_cld)) { + rc = PTR_ERR(barrier_cld); + CERROR("%s: can't create barrier log: rc = %d\n", + obd->obd_name, rc); + GOTO(out_params, rc); + } + } + + cld = do_config_log_add(obd, logname, MGS_CFG_T_CONFIG, cfg, sb); + if (IS_ERR(cld)) { + rc = PTR_ERR(cld); + CERROR("%s: can't create log: rc = %d\n", + obd->obd_name, rc); + GOTO(out_barrier, rc = PTR_ERR(cld)); + } + + LASSERT(lsi->lsi_lmd); + if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR) && + cfg->cfg_sub_clds & CONFIG_SUB_RECOVER) { + struct config_llog_data *recover_cld; + + ptr = strrchr(seclogname, '-'); + if (ptr != NULL) { + *ptr = 0; + } else { + CERROR("%s: sptlrpc log name not correct, %s: " + "rc = %d\n", obd->obd_name, seclogname, -EINVAL); + GOTO(out_cld, rc = -EINVAL); + } + + recover_cld = config_recover_log_add(obd, seclogname, cfg, sb); + if (IS_ERR(recover_cld)) { + rc = PTR_ERR(recover_cld); + CERROR("%s: can't create recover log: rc = %d\n", + obd->obd_name, rc); + GOTO(out_cld, rc); + } + + mutex_lock(&cld->cld_lock); + locked = true; + cld->cld_recover = recover_cld; + } + + if (!locked) + mutex_lock(&cld->cld_lock); + cld->cld_params = params_cld; + cld->cld_barrier = barrier_cld; + cld->cld_nodemap = nodemap_cld; + cld->cld_sptlrpc = sptlrpc_cld; + mutex_unlock(&cld->cld_lock); + + RETURN(cld); + +out_cld: + config_log_put(cld); +out_barrier: + config_log_put(barrier_cld); 
+out_params: + config_log_put(params_cld); +out_nodemap: + config_log_put(nodemap_cld); +out_sptlrpc: + config_log_put(sptlrpc_cld); + + return ERR_PTR(rc); +} + +DEFINE_MUTEX(llog_process_lock); + +static inline void config_mark_cld_stop_nolock(struct config_llog_data *cld) +{ + ENTRY; + + spin_lock(&config_list_lock); + cld->cld_stopping = 1; + spin_unlock(&config_list_lock); + + CDEBUG(D_INFO, "lockh %#llx\n", cld->cld_lockh.cookie); + if (!ldlm_lock_addref_try(&cld->cld_lockh, LCK_CR)) + ldlm_lock_decref_and_cancel(&cld->cld_lockh, LCK_CR); +} + +static inline void config_mark_cld_stop(struct config_llog_data *cld) +{ + if (cld) { + mutex_lock(&cld->cld_lock); + config_mark_cld_stop_nolock(cld); + mutex_unlock(&cld->cld_lock); + } +} + +/** Stop watching for updates on this log. + */ +static int config_log_end(char *logname, struct config_llog_instance *cfg) +{ + struct config_llog_data *cld; + struct config_llog_data *cld_sptlrpc = NULL; + struct config_llog_data *cld_params = NULL; + struct config_llog_data *cld_recover = NULL; + struct config_llog_data *cld_nodemap = NULL; + struct config_llog_data *cld_barrier = NULL; + int rc = 0; + + ENTRY; + + cld = config_log_find(logname, cfg); + if (cld == NULL) + RETURN(-ENOENT); + + mutex_lock(&cld->cld_lock); + /* + * if cld_stopping is set, it means we didn't start the log thus + * not owning the start ref. this can happen after previous umount: + * the cld still hanging there waiting for lock cancel, and we + * remount again but failed in the middle and call log_end without + * calling start_log. + */ + if (unlikely(cld->cld_stopping)) { + mutex_unlock(&cld->cld_lock); + /* drop the ref from the find */ + config_log_put(cld); + RETURN(rc); + } + + cld_recover = cld->cld_recover; + cld->cld_recover = NULL; + cld_params = cld->cld_params; + cld->cld_params = NULL; + cld_nodemap = cld->cld_nodemap; + cld->cld_nodemap = NULL; + cld_barrier = cld->cld_barrier; + cld->cld_barrier = NULL; + cld_sptlrpc = cld->cld_sptlrpc; + cld->cld_sptlrpc = NULL; + + config_mark_cld_stop_nolock(cld); + mutex_unlock(&cld->cld_lock); + + config_mark_cld_stop(cld_recover); + config_log_put(cld_recover); + config_mark_cld_stop(cld_params); + config_log_put(cld_params); + config_mark_cld_stop(cld_barrier); + config_log_put(cld_barrier); + /* don't explicitly set cld_stopping on sptlrpc lock here, as other + * targets may be active, it will be done in config_log_put if necessary + */ + config_log_put(cld_sptlrpc); + /* don't set cld_stopping on nm lock as other targets may be active */ + config_log_put(cld_nodemap); + + /* drop the ref from the find */ + config_log_put(cld); + /* drop the start ref */ + config_log_put(cld); + + CDEBUG(D_MGC, "end config log %s (%d)\n", logname ? logname : "client", + rc); + RETURN(rc); +} + +int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_import *imp; + struct obd_connect_data *ocd; + struct config_llog_data *cld; + int rc = 0; + + ENTRY; + LASSERT(obd); + with_imp_locked(obd, imp, rc) { + ocd = &imp->imp_connect_data; + + seq_printf(m, "imperative_recovery: %s\n", + OCD_HAS_FLAG(ocd, IMP_RECOV) ? 
+ "ENABLED" : "DISABLED"); + } + if (rc) + RETURN(rc); + + seq_printf(m, "client_state:\n"); + + spin_lock(&config_list_lock); + list_for_each_entry(cld, &config_llog_list, cld_list_chain) { + if (cld->cld_recover == NULL) + continue; + seq_printf(m, " - { client: %s, nidtbl_version: %u }\n", + cld->cld_logname, + cld->cld_recover->cld_cfg.cfg_last_idx); + } + spin_unlock(&config_list_lock); + + RETURN(0); +} + +/* reenqueue any lost locks */ +#define RQ_RUNNING 0x1 +#define RQ_NOW 0x2 +#define RQ_LATER 0x4 +#define RQ_STOP 0x8 +#define RQ_PRECLEANUP 0x10 +static int rq_state = 0; +static wait_queue_head_t rq_waitq; +static DECLARE_COMPLETION(rq_exit); +static DECLARE_COMPLETION(rq_start); + +static void do_requeue(struct config_llog_data *cld) +{ + int rc = 0; + ENTRY; + + LASSERT(atomic_read(&cld->cld_refcount) > 0); + + /* + * Do not run mgc_process_log on a disconnected export or an + * export which is being disconnected. Take the client + * semaphore to make the check non-racy. + */ + down_read_nested(&cld->cld_mgcexp->exp_obd->u.cli.cl_sem, + OBD_CLI_SEM_MGC); + if (cld->cld_mgcexp->exp_obd->u.cli.cl_conn_count != 0) { + CDEBUG(D_MGC, "updating log %s\n", cld->cld_logname); + rc = mgc_process_log(cld->cld_mgcexp->exp_obd, cld); + if (rc && rc != -ENOENT) + CERROR("failed processing log: %d\n", rc); + } else { + CDEBUG(D_MGC, "disconnecting, won't update log %s\n", + cld->cld_logname); + } + up_read(&cld->cld_mgcexp->exp_obd->u.cli.cl_sem); + + EXIT; +} + +static int mgc_requeue_thread(void *data) +{ + int rc = 0; + bool first = true; + ENTRY; + + CDEBUG(D_MGC, "Starting requeue thread\n"); + + /* Keep trying failed locks periodically */ + spin_lock(&config_list_lock); + rq_state |= RQ_RUNNING; + while (!(rq_state & RQ_STOP)) { + struct config_llog_data *cld, *cld_prev; + int to; + + /* Any new or requeued lostlocks will change the state */ + rq_state &= ~(RQ_NOW | RQ_LATER); + spin_unlock(&config_list_lock); + + if (first) { + first = false; + complete(&rq_start); + } + + /* Always wait a few seconds to allow the server who + * caused the lock revocation to finish its setup, plus some + * random so everyone doesn't try to reconnect at once. + */ + to = mgc_requeue_timeout_min == 0 ? 1 : mgc_requeue_timeout_min; + to = cfs_time_seconds(mgc_requeue_timeout_min) + + get_random_u32_below(cfs_time_seconds(to)); + wait_event_idle_timeout(rq_waitq, + rq_state & (RQ_STOP | RQ_PRECLEANUP), to); + + /* + * iterate & processing through the list. for each cld, process + * its depending sptlrpc cld firstly (if any) and then itself. + * + * it's guaranteed any item in the list must have + * reference > 0; and if cld_lostlock is set, at + * least one reference is taken by the previous enqueue. + */ + cld_prev = NULL; + + spin_lock(&config_list_lock); + rq_state &= ~RQ_PRECLEANUP; + list_for_each_entry(cld, &config_llog_list, + cld_list_chain) { + if (!cld->cld_lostlock || cld->cld_stopping) + continue; + + /* hold reference to avoid being freed during + * subsequent processing. 
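+			 * The entry stays pinned while config_list_lock is
+			 * dropped for do_requeue(), so iteration can safely
+			 * resume from it; the previous entry is released
+			 * only once we have moved past it.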
*/ + config_log_get(cld); + cld->cld_lostlock = 0; + spin_unlock(&config_list_lock); + + config_log_put(cld_prev); + cld_prev = cld; + + if (likely(!(rq_state & RQ_STOP))) { + do_requeue(cld); + spin_lock(&config_list_lock); + } else { + spin_lock(&config_list_lock); + break; + } + } + spin_unlock(&config_list_lock); + config_log_put(cld_prev); + + /* Wait a bit to see if anyone else needs a requeue */ + wait_event_idle(rq_waitq, rq_state & (RQ_NOW | RQ_STOP)); + spin_lock(&config_list_lock); + } + + /* spinlock and while guarantee RQ_NOW and RQ_LATER are not set */ + rq_state &= ~RQ_RUNNING; + spin_unlock(&config_list_lock); + + complete(&rq_exit); + + CDEBUG(D_MGC, "Ending requeue thread\n"); + RETURN(rc); +} + +/* Add a cld to the list to requeue. Start the requeue thread if needed. + We are responsible for dropping the config log reference from here on out. */ +static void mgc_requeue_add(struct config_llog_data *cld) +{ + bool wakeup = false; + ENTRY; + + CDEBUG(D_INFO, "log %s: requeue (r=%d sp=%d st=%x)\n", + cld->cld_logname, atomic_read(&cld->cld_refcount), + cld->cld_stopping, rq_state); + LASSERT(atomic_read(&cld->cld_refcount) > 0); + + /* lets cancel an existent lock to mark cld as "lostlock" */ + CDEBUG(D_INFO, "lockh %#llx\n", cld->cld_lockh.cookie); + if (!ldlm_lock_addref_try(&cld->cld_lockh, LCK_CR)) + ldlm_lock_decref_and_cancel(&cld->cld_lockh, LCK_CR); + + mutex_lock(&cld->cld_lock); + spin_lock(&config_list_lock); + if (!(rq_state & RQ_STOP) && !cld->cld_stopping) { + cld->cld_lostlock = 1; + rq_state |= RQ_NOW; + wakeup = true; + } + spin_unlock(&config_list_lock); + mutex_unlock(&cld->cld_lock); + if (wakeup) + wake_up(&rq_waitq); + + EXIT; +} + +/********************** class fns **********************/ +static int mgc_local_llog_init(const struct lu_env *env, + struct obd_device *obd, + struct obd_device *disk) +{ + struct llog_ctxt *ctxt; + int rc; + + ENTRY; + + rc = llog_setup(env, obd, &obd->obd_olg, LLOG_CONFIG_ORIG_CTXT, disk, + &llog_osd_ops); + if (rc) + RETURN(rc); + + ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT); + LASSERT(ctxt); + ctxt->loc_dir = obd->u.cli.cl_mgc_configs_dir; + llog_ctxt_put(ctxt); + + RETURN(0); +} + +static int mgc_local_llog_fini(const struct lu_env *env, + struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT); + llog_cleanup(env, ctxt); + + RETURN(0); +} + +static int mgc_fs_setup(const struct lu_env *env, struct obd_device *obd, + struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct client_obd *cli = &obd->u.cli; + struct lu_fid rfid, fid; + struct dt_object *root, *dto; + int rc = 0; + + ENTRY; + + LASSERT(lsi); + LASSERT(lsi->lsi_dt_dev); + + /* The mgc fs exclusion mutex. Only one fs can be setup at a time. 
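+	 * The single client_obd has only one cl_mgc_los and one
+	 * cl_mgc_configs_dir slot, so concurrent setups would clobber
+	 * each other.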
*/ + mutex_lock(&cli->cl_mgc_mutex); + + /* Setup the configs dir */ + fid.f_seq = FID_SEQ_LOCAL_NAME; + fid.f_oid = 1; + fid.f_ver = 0; + rc = local_oid_storage_init(env, lsi->lsi_dt_dev, &fid, + &cli->cl_mgc_los); + if (rc) + GOTO(out_mutex, rc); + + rc = dt_root_get(env, lsi->lsi_dt_dev, &rfid); + if (rc) + GOTO(out_los, rc); + + root = dt_locate_at(env, lsi->lsi_dt_dev, &rfid, + &cli->cl_mgc_los->los_dev->dd_lu_dev, NULL); + if (unlikely(IS_ERR(root))) + GOTO(out_los, rc = PTR_ERR(root)); + + dto = local_file_find_or_create(env, cli->cl_mgc_los, root, + MOUNT_CONFIGS_DIR, + S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO); + dt_object_put_nocache(env, root); + if (IS_ERR(dto)) + GOTO(out_los, rc = PTR_ERR(dto)); + + cli->cl_mgc_configs_dir = dto; + + LASSERT(lsi->lsi_osd_exp->exp_obd->obd_lvfs_ctxt.dt); + rc = mgc_local_llog_init(env, obd, lsi->lsi_osd_exp->exp_obd); + if (rc) + GOTO(out_llog, rc); + + /* We take an obd ref to insure that we can't get to mgc_cleanup + * without calling mgc_fs_cleanup first. */ + class_incref(obd, "mgc_fs", obd); + + /* We keep the cl_mgc_sem until mgc_fs_cleanup */ + EXIT; +out_llog: + if (rc) { + dt_object_put(env, cli->cl_mgc_configs_dir); + cli->cl_mgc_configs_dir = NULL; + } +out_los: + if (rc < 0) { + local_oid_storage_fini(env, cli->cl_mgc_los); +out_mutex: + cli->cl_mgc_los = NULL; + mutex_unlock(&cli->cl_mgc_mutex); + } + return rc; +} + +static int mgc_fs_cleanup(const struct lu_env *env, struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + ENTRY; + + LASSERT(cli->cl_mgc_los != NULL); + + mgc_local_llog_fini(env, obd); + + dt_object_put_nocache(env, cli->cl_mgc_configs_dir); + cli->cl_mgc_configs_dir = NULL; + + local_oid_storage_fini(env, cli->cl_mgc_los); + cli->cl_mgc_los = NULL; + + class_decref(obd, "mgc_fs", obd); + mutex_unlock(&cli->cl_mgc_mutex); + + RETURN(0); +} + +static int mgc_llog_init(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + int rc; + + ENTRY; + + /* setup only remote ctxt, the local disk context is switched per each + * filesystem during mgc_fs_setup() */ + rc = llog_setup(env, obd, &obd->obd_olg, LLOG_CONFIG_REPL_CTXT, obd, + &llog_client_ops); + if (rc) + RETURN(rc); + + ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT); + LASSERT(ctxt); + + llog_initiator_connect(ctxt); + llog_ctxt_put(ctxt); + + RETURN(0); +} + +static int mgc_llog_fini(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT); + if (ctxt) + llog_cleanup(env, ctxt); + + RETURN(0); +} + + +static atomic_t mgc_count = ATOMIC_INIT(0); +static int mgc_precleanup(struct obd_device *obd) +{ + int rc = 0; + int temp; + ENTRY; + + if (atomic_dec_and_test(&mgc_count)) { + LASSERT(rq_state & RQ_RUNNING); + /* stop requeue thread */ + temp = RQ_STOP; + } else { + /* wakeup requeue thread to clean our cld */ + temp = RQ_NOW | RQ_PRECLEANUP; + } + + spin_lock(&config_list_lock); + rq_state |= temp; + spin_unlock(&config_list_lock); + wake_up(&rq_waitq); + + if (temp & RQ_STOP) + wait_for_completion(&rq_exit); + obd_cleanup_client_import(obd); + + rc = mgc_llog_fini(NULL, obd); + if (rc != 0) + CERROR("failed to cleanup llogging subsystems\n"); + + RETURN(rc); +} + +static int mgc_cleanup(struct obd_device *obd) +{ + int rc; + ENTRY; + + /* COMPAT_146 - old config logs may have added profiles we don't + know about */ + if (atomic_read(&obd->obd_type->typ_refcnt) <= 1) + /* Only for the last mgc */ + class_del_profiles(); + + 
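+	/* tear down in roughly the reverse order of mgc_setup() */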
lprocfs_obd_cleanup(obd); + ptlrpcd_decref(); + + rc = client_obd_cleanup(obd); + RETURN(rc); +} + +static int mgc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct task_struct *task; + int rc; + ENTRY; + + rc = ptlrpcd_addref(); + if (rc < 0) + RETURN(rc); + + rc = client_obd_setup(obd, lcfg); + if (rc) + GOTO(err_decref, rc); + + rc = mgc_llog_init(NULL, obd); + if (rc) { + CERROR("failed to setup llogging subsystems\n"); + GOTO(err_cleanup, rc); + } + + rc = mgc_tunables_init(obd); + if (rc) + GOTO(err_sysfs, rc); + + if (atomic_inc_return(&mgc_count) == 1) { + rq_state = 0; + init_waitqueue_head(&rq_waitq); + + /* start requeue thread */ + task = kthread_run(mgc_requeue_thread, NULL, "ll_cfg_requeue"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("%s: cannot start requeue thread: rc = %d; " + "no more log updates\n", + obd->obd_name, rc); + GOTO(err_sysfs, rc); + } + /* rc is the task_struct pointer of mgc_requeue_thread. */ + rc = 0; + wait_for_completion(&rq_start); + } + + RETURN(rc); + +err_sysfs: + lprocfs_obd_cleanup(obd); +err_cleanup: + client_obd_cleanup(obd); +err_decref: + ptlrpcd_decref(); + RETURN(rc); +} + +/* based on ll_mdc_blocking_ast */ +static int mgc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag) +{ + struct lustre_handle lockh; + struct config_llog_data *cld = (struct config_llog_data *)data; + int rc = 0; + ENTRY; + + switch (flag) { + case LDLM_CB_BLOCKING: + /* mgs wants the lock, give it up... */ + LDLM_DEBUG(lock, "MGC blocking CB"); + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); + break; + case LDLM_CB_CANCELING: + /* We've given up the lock, prepare ourselves to update. */ + LDLM_DEBUG(lock, "MGC cancel CB"); + + CDEBUG(D_MGC, "Lock res "DLDLMRES" (%.8s)\n", + PLDLMRES(lock->l_resource), + (char *)&lock->l_resource->lr_name.name[0]); + + if (!cld) { + CDEBUG(D_INFO, "missing data, won't requeue\n"); + break; + } + + /* held at mgc_process_log(). */ + LASSERT(atomic_read(&cld->cld_refcount) > 0); + + lock->l_ast_data = NULL; + cld->cld_lockh.cookie = 0; + /* Are we done with this log? */ + if (cld->cld_stopping) { + CDEBUG(D_MGC, "log %s: stopping, won't requeue\n", + cld->cld_logname); + config_log_put(cld); + break; + } + /* Make sure not to re-enqueue when the mgc is stopping + (we get called from client_disconnect_export) */ + if (lock->l_conn_export == NULL || + lock->l_conn_export->exp_obd->u.cli.cl_conn_count == 0) { + CDEBUG(D_MGC, "log %.8s: disconnecting, won't requeue\n", + cld->cld_logname); + config_log_put(cld); + break; + } + + /* Re-enqueue now */ + mgc_requeue_add(cld); + config_log_put(cld); + break; + default: + LBUG(); + } + + RETURN(rc); +} + +/* Not sure where this should go... */ +/* This is the timeout value for MGS_CONNECT request plus a ping interval, such + * that we can have a chance to try the secondary MGS if any. */ +#define MGC_ENQUEUE_LIMIT (INITIAL_CONNECT_TIMEOUT + (AT_OFF ? 
0 : at_min) \ + + PING_INTERVAL) +#define MGC_TARGET_REG_LIMIT 10 +#define MGC_TARGET_REG_LIMIT_MAX RECONNECT_DELAY_MAX +#define MGC_SEND_PARAM_LIMIT 10 + +/* Take a config lock so we can get cancel notifications */ +static int mgc_enqueue(struct obd_export *exp, enum ldlm_type type, + union ldlm_policy_data *policy, enum ldlm_mode mode, + __u64 *flags, ldlm_glimpse_callback glimpse_callback, + void *data, __u32 lvb_len, void *lvb_swabber, + struct lustre_handle *lockh) +{ + struct config_llog_data *cld = (struct config_llog_data *)data; + struct ldlm_enqueue_info einfo = { + .ei_type = type, + .ei_mode = mode, + .ei_cb_bl = mgc_blocking_ast, + .ei_cb_cp = ldlm_completion_ast, + .ei_cb_gl = glimpse_callback, + }; + struct ptlrpc_request *req; + int short_limit = cld_is_sptlrpc(cld); + int rc; + ENTRY; + + if (!exp) + RETURN(-EBADR); + + CDEBUG(D_MGC, "Enqueue for %s (res %#llx)\n", cld->cld_logname, + cld->cld_resid.name[0]); + + /* We need a callback for every lockholder, so don't try to + ldlm_lock_match (see rev 1.1.2.11.2.47) */ + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_LDLM_ENQUEUE, LUSTRE_DLM_VERSION, + LDLM_ENQUEUE); + if (req == NULL) + RETURN(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, 0); + ptlrpc_request_set_replen(req); + + /* check if this is server or client */ + if (cld->cld_cfg.cfg_sb) { + struct lustre_sb_info *lsi = s2lsi(cld->cld_cfg.cfg_sb); + if (lsi && IS_SERVER(lsi)) + short_limit = 1; + } + /* Limit how long we will wait for the enqueue to complete */ + req->rq_delay_limit = short_limit ? 5 : MGC_ENQUEUE_LIMIT; + rc = ldlm_cli_enqueue(exp, &req, &einfo, &cld->cld_resid, NULL, flags, + NULL, 0, LVB_T_NONE, lockh, 0); + /* A failed enqueue should still call the mgc_blocking_ast, + where it will be requeued if needed ("grant failed"). */ + ptlrpc_req_finished(req); + RETURN(rc); +} + +static int mgc_cancel(struct obd_export *exp, enum ldlm_mode mode, + struct lustre_handle *lockh) +{ + ENTRY; + + ldlm_lock_decref(lockh, mode); + + RETURN(0); +} + +static void mgc_notify_active(struct obd_device *unused) +{ + /* wakeup mgc_requeue_thread to requeue mgc lock */ + spin_lock(&config_list_lock); + rq_state |= RQ_NOW; + spin_unlock(&config_list_lock); + wake_up(&rq_waitq); + + /* TODO: Help the MGS rebuild nidtbl. -jay */ +} + +/* Send target_reg message to MGS */ +static int mgc_target_register(struct obd_export *exp, + struct mgs_target_info *mti) +{ + struct ptlrpc_request *req; + struct mgs_target_info *req_mti, *rep_mti; + int rc; + ENTRY; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_MGS_TARGET_REG, LUSTRE_MGS_VERSION, + MGS_TARGET_REG); + if (req == NULL) + RETURN(-ENOMEM); + + req_mti = req_capsule_client_get(&req->rq_pill, &RMF_MGS_TARGET_INFO); + if (!req_mti) { + ptlrpc_req_finished(req); + RETURN(-ENOMEM); + } + + memcpy(req_mti, mti, sizeof(*req_mti)); + ptlrpc_request_set_replen(req); + CDEBUG(D_MGC, "register %s\n", mti->mti_svname); + /* Limit how long we will wait for the enqueue to complete */ + req->rq_delay_limit = MGC_TARGET_REG_LIMIT; + + /* if the target needs to regenerate the config log in MGS, it's better + * to use some longer limit to let MGC have time to change connection to + * another MGS (or try again with the same MGS) for the target (server) + * will fail and exit if the request expired due to delay limit. 
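+	 * MGC_TARGET_REG_LIMIT_MAX is RECONNECT_DELAY_MAX rather than the
+	 * 10s MGC_TARGET_REG_LIMIT used for ordinary registrations.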
*/ + if (mti->mti_flags & (LDD_F_UPDATE | LDD_F_NEED_INDEX)) + req->rq_delay_limit = MGC_TARGET_REG_LIMIT_MAX; + + rc = ptlrpc_queue_wait(req); + if (ptlrpc_client_replied(req)) { + rep_mti = req_capsule_server_get(&req->rq_pill, + &RMF_MGS_TARGET_INFO); + if (rep_mti) + memcpy(mti, rep_mti, sizeof(*rep_mti)); + } + if (!rc) { + CDEBUG(D_MGC, "register %s got index = %d\n", + mti->mti_svname, mti->mti_stripe_index); + } + ptlrpc_req_finished(req); + + RETURN(rc); +} + +static int mgc_set_info_async(const struct lu_env *env, struct obd_export *exp, + u32 keylen, void *key, + u32 vallen, void *val, + struct ptlrpc_request_set *set) +{ + int rc = -EINVAL; + ENTRY; + + /* Turn off initial_recov after we try all backup servers once */ + if (KEY_IS(KEY_INIT_RECOV_BACKUP)) { + struct obd_import *imp = class_exp2cliimp(exp); + int value; + if (vallen != sizeof(int)) + RETURN(-EINVAL); + value = *(int *)val; + CDEBUG(D_MGC, "InitRecov %s %d/d%d:i%d:r%d:or%d:%s\n", + imp->imp_obd->obd_name, value, + imp->imp_deactive, imp->imp_invalid, + imp->imp_replayable, imp->imp_obd->obd_replayable, + ptlrpc_import_state_name(imp->imp_state)); + /* Resurrect the import immediately if + * 1. we previously got disconnected, + * 2. value > 1 (at the same node with MGS) + * */ + if (imp->imp_state == LUSTRE_IMP_DISCON || value > 1) + ptlrpc_reconnect_import(imp); + + RETURN(0); + } + + /* FIXME move this to mgc_process_config */ + if (KEY_IS(KEY_REGISTER_TARGET)) { + struct mgs_target_info *mti; + if (vallen != sizeof(struct mgs_target_info)) + RETURN(-EINVAL); + mti = (struct mgs_target_info *)val; + CDEBUG(D_MGC, "register_target %s %#x\n", + mti->mti_svname, mti->mti_flags); + rc = mgc_target_register(exp, mti); + RETURN(rc); + } + if (KEY_IS(KEY_SET_FS)) { + struct super_block *sb = (struct super_block *)val; + + if (vallen != sizeof(struct super_block)) + RETURN(-EINVAL); + + rc = mgc_fs_setup(env, exp->exp_obd, sb); + RETURN(rc); + } + if (KEY_IS(KEY_CLEAR_FS)) { + if (vallen != 0) + RETURN(-EINVAL); + rc = mgc_fs_cleanup(env, exp->exp_obd); + RETURN(rc); + } + if (KEY_IS(KEY_MGSSEC)) { + struct client_obd *cli = &exp->exp_obd->u.cli; + struct sptlrpc_flavor flvr; + + /* + * empty string means using current flavor, if which haven't + * been set yet, set it as null. + * + * if flavor has been set previously, check the asking flavor + * must match the existing one. 
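+		 * For instance, a target asking for "null" while another
+		 * target on the same node already set "plain" fails with
+		 * -EPERM.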
+ */ + if (vallen == 0) { + if (cli->cl_flvr_mgc.sf_rpc != SPTLRPC_FLVR_INVALID) + RETURN(0); + val = "null"; + vallen = 4; + } + + rc = sptlrpc_parse_flavor(val, &flvr); + if (rc) { + CERROR("invalid sptlrpc flavor %s to MGS\n", + (char *) val); + RETURN(rc); + } + + /* + * caller already hold a mutex + */ + if (cli->cl_flvr_mgc.sf_rpc == SPTLRPC_FLVR_INVALID) { + cli->cl_flvr_mgc = flvr; + } else if (memcmp(&cli->cl_flvr_mgc, &flvr, + sizeof(flvr)) != 0) { + char str[20]; + + sptlrpc_flavor2name(&cli->cl_flvr_mgc, + str, sizeof(str)); + LCONSOLE_ERROR("asking sptlrpc flavor %s to MGS but " + "currently %s is in use\n", + (char *) val, str); + rc = -EPERM; + } + RETURN(rc); + } + + RETURN(rc); +} + +static int mgc_get_info(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, __u32 *vallen, void *val) +{ + int rc = -EINVAL; + + if (KEY_IS(KEY_CONN_DATA)) { + struct obd_import *imp = class_exp2cliimp(exp); + struct obd_connect_data *data = val; + + if (*vallen == sizeof(*data)) { + *data = imp->imp_connect_data; + rc = 0; + } + } + + return rc; +} + +static int mgc_import_event(struct obd_device *obd, + struct obd_import *imp, + enum obd_import_event event) +{ + int rc = 0; + + LASSERT(imp->imp_obd == obd); + CDEBUG(D_MGC, "import event %#x\n", event); + + switch (event) { + case IMP_EVENT_DISCON: + /* MGC imports should not wait for recovery */ + if (OCD_HAS_FLAG(&imp->imp_connect_data, IMP_RECOV)) + ptlrpc_pinger_ir_down(); + break; + case IMP_EVENT_INACTIVE: + break; + case IMP_EVENT_INVALIDATE: { + struct ldlm_namespace *ns = obd->obd_namespace; + ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); + break; + } + case IMP_EVENT_ACTIVE: + CDEBUG(D_INFO, "%s: Reactivating import\n", obd->obd_name); + /* Clearing obd_no_recov allows us to continue pinging */ + obd->obd_no_recov = 0; + mgc_notify_active(obd); + if (OCD_HAS_FLAG(&imp->imp_connect_data, IMP_RECOV)) + ptlrpc_pinger_ir_up(); + break; + case IMP_EVENT_OCD: + break; + case IMP_EVENT_DEACTIVATE: + case IMP_EVENT_ACTIVATE: + break; + default: + CERROR("Unknown import event %#x\n", event); + LBUG(); + } + RETURN(rc); +} + +enum { + CONFIG_READ_NRPAGES_INIT = 1 << (20 - PAGE_SHIFT), + CONFIG_READ_NRPAGES = 4 +}; + +static int mgc_apply_recover_logs(struct obd_device *mgc, + struct config_llog_data *cld, + __u64 max_version, + void *data, int datalen, bool mne_swab) +{ + struct config_llog_instance *cfg = &cld->cld_cfg; + struct lustre_sb_info *lsi = s2lsi(cfg->cfg_sb); + struct mgs_nidtbl_entry *entry; + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs bufs; + u64 prev_version = 0; + char inst[MTI_NAME_MAXLEN + 1]; + char *buf; + int bufsz; + int pos = 0; + int rc = 0; + int off = 0; + unsigned long dynamic_nids; + + ENTRY; + LASSERT(cfg->cfg_instance != 0); + LASSERT(ll_get_cfg_instance(cfg->cfg_sb) == cfg->cfg_instance); + + /* get dynamic nids setting */ + dynamic_nids = mgc->obd_dynamic_nids; + + if (!IS_SERVER(lsi)) { + pos = snprintf(inst, sizeof(inst), "%016lx", cfg->cfg_instance); + if (pos >= PAGE_SIZE) + return -E2BIG; +#ifdef HAVE_SERVER_SUPPORT + } else { + LASSERT(IS_MDT(lsi)); + rc = server_name2svname(lsi->lsi_svname, inst, NULL, + sizeof(inst)); + if (rc) + RETURN(-EINVAL); +#endif /* HAVE_SERVER_SUPPORT */ + } + + OBD_ALLOC(buf, PAGE_SIZE); + if (!buf) + return -ENOMEM; + bufsz = PAGE_SIZE; + pos = 0; + + while (datalen > 0) { + int entry_len = sizeof(*entry); + int is_ost; + struct obd_device *obd; + struct obd_import *imp; + char *obdname; + char *cname; + char *params; + char *uuid; + + rc = 
-EINVAL; + if (datalen < sizeof(*entry)) + break; + + entry = (typeof(entry))(data + off); + + /* sanity check */ + if (entry->mne_nid_type != 0) /* only support type 0 for ipv4 */ + break; + if (entry->mne_nid_count == 0) /* at least one nid entry */ + break; + if (entry->mne_nid_size != sizeof(lnet_nid_t)) + break; + + entry_len += entry->mne_nid_count * entry->mne_nid_size; + if (datalen < entry_len) /* must have entry_len at least */ + break; + + /* Keep this swab for normal mixed endian handling. LU-1644 */ + if (mne_swab) + lustre_swab_mgs_nidtbl_entry(entry); + if (entry->mne_length > PAGE_SIZE) { + CERROR("MNE too large (%u)\n", entry->mne_length); + break; + } + + if (entry->mne_length < entry_len) + break; + + off += entry->mne_length; + datalen -= entry->mne_length; + if (datalen < 0) + break; + + if (entry->mne_version > max_version) { + CERROR("entry index(%lld) is over max_index(%lld)\n", + entry->mne_version, max_version); + break; + } + + if (prev_version >= entry->mne_version) { + CERROR("index unsorted, prev %lld, now %lld\n", + prev_version, entry->mne_version); + break; + } + prev_version = entry->mne_version; + + /* + * Write a string with format "nid::instance" to + * lustre//--/import. + */ + + is_ost = entry->mne_type == LDD_F_SV_TYPE_OST; + memset(buf, 0, bufsz); + obdname = buf; + pos = 0; + + /* lustre-OST0001-osc- */ + strcpy(obdname, cld->cld_logname); + cname = strrchr(obdname, '-'); + if (cname == NULL) { + CERROR("mgc %s: invalid logname %s\n", + mgc->obd_name, obdname); + break; + } + + pos = cname - obdname; + obdname[pos] = 0; + pos += sprintf(obdname + pos, "-%s%04x", + is_ost ? "OST" : "MDT", entry->mne_index); + + cname = is_ost ? "osc" : "mdc", + pos += snprintf(obdname + pos, bufsz, "-%s-%s", cname, inst); + lustre_cfg_bufs_reset(&bufs, obdname); + + /* find the obd by obdname */ + obd = class_name2obd(obdname); + if (obd == NULL) { + CDEBUG(D_INFO, "mgc %s: cannot find obdname %s\n", + mgc->obd_name, obdname); + rc = 0; + /* this is a safe race, when the ost is starting up...*/ + continue; + } + + /* osc.import = "connection=::" */ + ++pos; + params = buf + pos; + pos += sprintf(params, "%s.import=%s", cname, "connection="); + uuid = buf + pos; + + with_imp_locked(obd, imp, rc) { + /* iterate all nids to find one */ + /* find uuid by nid */ + /* create import entries if they don't exist */ + rc = client_import_add_nids_to_conn( + imp, entry->u.nids, entry->mne_nid_count, + (struct obd_uuid *)uuid); + + if (rc == -ENOENT && dynamic_nids) { + /* create a new connection for this import */ + char *primary_nid = + libcfs_nid2str(entry->u.nids[0]); + int prim_nid_len = strlen(primary_nid) + 1; + struct obd_uuid server_uuid; + + if (prim_nid_len > UUID_MAX) + goto fail; + strncpy(server_uuid.uuid, primary_nid, + prim_nid_len); + + CDEBUG(D_INFO, "Adding a connection for %s\n", + primary_nid); + + rc = client_import_dyn_add_conn( + imp, &server_uuid, entry->u.nids[0], 1); + if (rc < 0) { + CERROR("%s: Failed to add new connection with NID '%s' to import: rc = %d\n", + obd->obd_name, primary_nid, rc); + goto fail; + } + rc = client_import_add_nids_to_conn( + imp, entry->u.nids, + entry->mne_nid_count, + (struct obd_uuid *)uuid); + if (rc < 0) { + CERROR("%s: failed to lookup UUID: rc = %d\n", + obd->obd_name, rc); + goto fail; + } + } +fail:; + } + if (rc == -ENODEV) { + /* client does not connect to the OST yet */ + rc = 0; + continue; + } + + if (rc < 0 && rc != -ENOSPC) { + CERROR("mgc: cannot find UUID by nid '%s': rc = %d\n", + 
libcfs_nid2str(entry->u.nids[0]), rc); + break; + } + + CDEBUG(D_INFO, "Found UUID '%s' by NID '%s'\n", + uuid, libcfs_nid2str(entry->u.nids[0])); + + pos += strlen(uuid); + pos += sprintf(buf + pos, "::%u", entry->mne_instance); + LASSERT(pos < bufsz); + + lustre_cfg_bufs_set_string(&bufs, 1, params); + + OBD_ALLOC(lcfg, lustre_cfg_len(bufs.lcfg_bufcount, + bufs.lcfg_buflen)); + if (!lcfg) { + rc = -ENOMEM; + break; + } + lustre_cfg_init(lcfg, LCFG_PARAM, &bufs); + + CDEBUG(D_INFO, "ir apply logs %lld/%lld for %s -> %s\n", + prev_version, max_version, obdname, params); + + rc = class_process_config(lcfg); + OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount, + lcfg->lcfg_buflens)); + if (rc) + CDEBUG(D_INFO, "process config for %s error %d\n", + obdname, rc); + + /* continue, even one with error */ + } + + OBD_FREE(buf, PAGE_SIZE); + + RETURN(rc); +} + +/** + * This function is called if this client was notified for target restarting + * by the MGS. A CONFIG_READ RPC is going to send to fetch recovery or + * nodemap logs. + */ +static int mgc_process_recover_nodemap_log(struct obd_device *obd, + struct config_llog_data *cld) +{ + struct ptlrpc_connection *mgc_conn; + struct ptlrpc_request *req = NULL; + struct config_llog_instance *cfg = &cld->cld_cfg; + struct mgs_config_body *body; + struct mgs_config_res *res; + struct nodemap_config *new_config = NULL; + struct lu_nodemap *recent_nodemap = NULL; + struct ptlrpc_bulk_desc *desc; + struct page **pages = NULL; + __u64 config_read_offset = 0; + __u8 nodemap_cur_pass = 0; + int nrpages = 0; + bool eof = true; + bool mne_swab = false; + int i; + int ealen; + int rc; + ENTRY; + + mgc_conn = class_exp2cliimp(cld->cld_mgcexp)->imp_connection; + + /* don't need to get local config */ + if (cld_is_nodemap(cld) && + LNetIsPeerLocal(lnet_nid_to_nid4(&mgc_conn->c_peer.nid))) + GOTO(out, rc = 0); + + /* allocate buffer for bulk transfer. + * if this is the first time for this mgs to read logs, + * CONFIG_READ_NRPAGES_INIT will be used since it will read all logs + * once; otherwise, it only reads increment of logs, this should be + * small and CONFIG_READ_NRPAGES will be used. 
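+	 * With 4KiB pages that is 1 << (20 - 12) = 256 pages (1MiB) for the
+	 * initial read and 4 pages (16KiB) for each incremental update.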
+ */ + nrpages = CONFIG_READ_NRPAGES; + if (cfg->cfg_last_idx == 0 || cld_is_nodemap(cld)) + nrpages = CONFIG_READ_NRPAGES_INIT; + + OBD_ALLOC_PTR_ARRAY_LARGE(pages, nrpages); + if (pages == NULL) + GOTO(out, rc = -ENOMEM); + + for (i = 0; i < nrpages; i++) { + pages[i] = alloc_page(GFP_KERNEL); + if (pages[i] == NULL) + GOTO(out, rc = -ENOMEM); + } + +again: +#ifdef HAVE_SERVER_SUPPORT + if (cld_is_nodemap(cld) && config_read_offset == 0) { + new_config = nodemap_config_alloc(); + if (IS_ERR(new_config)) { + rc = PTR_ERR(new_config); + new_config = NULL; + GOTO(out, rc); + } + } +#endif + LASSERT(cld_is_recover(cld) || cld_is_nodemap(cld)); + LASSERT(mutex_is_locked(&cld->cld_lock)); + req = ptlrpc_request_alloc(class_exp2cliimp(cld->cld_mgcexp), + &RQF_MGS_CONFIG_READ); + if (req == NULL) + GOTO(out, rc = -ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_MGS_VERSION, MGS_CONFIG_READ); + if (rc) + GOTO(out, rc); + + /* pack request */ + body = req_capsule_client_get(&req->rq_pill, &RMF_MGS_CONFIG_BODY); + LASSERT(body != NULL); + LASSERT(sizeof(body->mcb_name) > strlen(cld->cld_logname)); + if (strlcpy(body->mcb_name, cld->cld_logname, sizeof(body->mcb_name)) + >= sizeof(body->mcb_name)) + GOTO(out, rc = -E2BIG); + if (cld_is_nodemap(cld)) + body->mcb_offset = config_read_offset; + else + body->mcb_offset = cfg->cfg_last_idx + 1; + body->mcb_type = cld->cld_type; + body->mcb_bits = PAGE_SHIFT; + body->mcb_units = nrpages; + body->mcb_nm_cur_pass = nodemap_cur_pass; + + /* allocate bulk transfer descriptor */ + desc = ptlrpc_prep_bulk_imp(req, nrpages, 1, + PTLRPC_BULK_PUT_SINK, + MGS_BULK_PORTAL, + &ptlrpc_bulk_kiov_pin_ops); + if (desc == NULL) + GOTO(out, rc = -ENOMEM); + + for (i = 0; i < nrpages; i++) + desc->bd_frag_ops->add_kiov_frag(desc, pages[i], 0, + PAGE_SIZE); + + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + res = req_capsule_server_get(&req->rq_pill, &RMF_MGS_CONFIG_RES); + if (!res) + GOTO(out, rc = -EPROTO); + + if (cld_is_nodemap(cld)) { + config_read_offset = res->mcr_offset; + eof = config_read_offset == II_END_OFF; + nodemap_cur_pass = res->mcr_nm_cur_pass; + } else { + if (res->mcr_size < res->mcr_offset) + GOTO(out, rc = -EINVAL); + + /* always update the index even though it might have errors with + * handling the recover logs + */ + cfg->cfg_last_idx = res->mcr_offset; + eof = res->mcr_offset == res->mcr_size; + + CDEBUG(D_INFO, "Latest version %lld, more %d.\n", + res->mcr_offset, eof == false); + } + + ealen = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, 0); + if (ealen < 0) + GOTO(out, rc = ealen); + + if (ealen > nrpages << PAGE_SHIFT) + GOTO(out, rc = -EINVAL); + + if (ealen == 0) { /* no logs transferred */ +#ifdef HAVE_SERVER_SUPPORT + /* config changed since first read RPC */ + if (cld_is_nodemap(cld) && config_read_offset == 0) { + CDEBUG(D_INFO, "nodemap config changed in transit, retrying\n"); + GOTO(out, rc = -EAGAIN); + } +#endif + if (!eof) + rc = -EINVAL; + GOTO(out, rc); + } + + mne_swab = req_capsule_rep_need_swab(&req->rq_pill); + + /* When a nodemap config is received, we build a new nodemap config, + * with new nodemap structs. We keep track of the most recently added + * nodemap since the config is read ordered by nodemap_id, and so it + * is likely that the next record will be related. 
Because access to + * the nodemaps is single threaded until the nodemap_config is active, + * we don't need to reference count with recent_nodemap, though + * recent_nodemap should be set to NULL when the nodemap_config + * is either destroyed or set active. + */ + for (i = 0; i < nrpages && ealen > 0; i++) { + int rc2; + union lu_page *ptr; + + ptr = kmap(pages[i]); + if (cld_is_nodemap(cld)) + rc2 = nodemap_process_idx_pages(new_config, ptr, + &recent_nodemap); + else + rc2 = mgc_apply_recover_logs(obd, cld, res->mcr_offset, + ptr, + min_t(int, ealen, + PAGE_SIZE), + mne_swab); + kunmap(pages[i]); + if (rc2 < 0) { + CWARN("%s: error processing %s log %s: rc = %d\n", + obd->obd_name, + cld_is_nodemap(cld) ? "nodemap" : "recovery", + cld->cld_logname, + rc2); + GOTO(out, rc = rc2); + } + + ealen -= PAGE_SIZE; + } + +out: + if (req) { + ptlrpc_req_finished(req); + req = NULL; + } + + if (rc == 0 && !eof) + goto again; + +#ifdef HAVE_SERVER_SUPPORT + if (new_config != NULL) { + /* recent_nodemap cannot be used after set_active/dealloc */ + if (rc == 0) + nodemap_config_set_active_mgc(new_config); + else + nodemap_config_dealloc(new_config); + } +#endif + + if (pages) { + for (i = 0; i < nrpages; i++) { + if (pages[i] == NULL) + break; + __free_page(pages[i]); + } + OBD_FREE_PTR_ARRAY_LARGE(pages, nrpages); + } + return rc; +} + +static int mgc_barrier_glimpse_ast(struct ldlm_lock *lock, void *data) +{ + struct config_llog_data *cld = lock->l_ast_data; + int rc; + ENTRY; + + if (cld->cld_stopping) + RETURN(-ENODEV); + + rc = barrier_handler(s2lsi(cld->cld_cfg.cfg_sb)->lsi_dt_dev, + (struct ptlrpc_request *)data); + + RETURN(rc); +} + +/* Copy a remote log locally */ +static int mgc_llog_local_copy(const struct lu_env *env, + struct obd_device *obd, + struct llog_ctxt *rctxt, + struct llog_ctxt *lctxt, char *logname) +{ + char *temp_log; + int rc; + + ENTRY; + + /* + * - copy it to backup using llog_backup() + * - copy remote llog to logname using llog_backup() + * - if failed then move bakup to logname again + */ + + OBD_ALLOC(temp_log, strlen(logname) + 2); + if (!temp_log) + RETURN(-ENOMEM); + sprintf(temp_log, "%sT", logname); + + /* make a copy of local llog at first */ + rc = llog_backup(env, obd, lctxt, lctxt, logname, temp_log); + if (rc < 0 && rc != -ENOENT) + GOTO(out, rc); + /* copy remote llog to the local copy */ + rc = llog_backup(env, obd, rctxt, lctxt, logname, logname); + if (rc == -ENOENT) { + /* no remote llog, delete local one too */ + llog_erase(env, lctxt, NULL, logname); + } else if (rc < 0) { + /* error during backup, get local one back from the copy */ + llog_backup(env, obd, lctxt, lctxt, temp_log, logname); +out: + CERROR("%s: failed to copy remote log %s: rc = %d\n", + obd->obd_name, logname, rc); + } + llog_erase(env, lctxt, NULL, temp_log); + OBD_FREE(temp_log, strlen(logname) + 2); + return rc; +} + +/* local_only means it cannot get remote llogs */ +static int mgc_process_cfg_log(struct obd_device *mgc, + struct config_llog_data *cld, int local_only) +{ + struct llog_ctxt *ctxt, *lctxt = NULL; + struct client_obd *cli = &mgc->u.cli; + struct lustre_sb_info *lsi = NULL; + int rc = 0; + struct lu_env *env; + + ENTRY; + + LASSERT(cld); + LASSERT(mutex_is_locked(&cld->cld_lock)); + + if (cld->cld_cfg.cfg_sb) + lsi = s2lsi(cld->cld_cfg.cfg_sb); + + OBD_ALLOC_PTR(env); + if (env == NULL) + RETURN(-ENOMEM); + + rc = lu_env_init(env, LCT_MG_THREAD); + if (rc) + GOTO(out_free, rc); + + ctxt = llog_get_context(mgc, LLOG_CONFIG_REPL_CTXT); + LASSERT(ctxt); + + lctxt = 
llog_get_context(mgc, LLOG_CONFIG_ORIG_CTXT); + + /* Copy the setup log locally if we can. Don't mess around if we're + * running an MGS though (logs are already local). */ + if (lctxt && lsi && IS_SERVER(lsi) && !IS_MGS(lsi) && + cli->cl_mgc_configs_dir != NULL && + lu2dt_dev(cli->cl_mgc_configs_dir->do_lu.lo_dev) == + lsi->lsi_dt_dev) { + if (!local_only && !lsi->lsi_dt_dev->dd_rdonly) { + /* Only try to copy log if we have the lock. */ + CDEBUG(D_INFO, "%s: copy local log %s\n", + mgc->obd_name, cld->cld_logname); + + rc = mgc_llog_local_copy(env, mgc, ctxt, lctxt, + cld->cld_logname); + if (!rc) + lsi->lsi_flags &= ~LDD_F_NO_LOCAL_LOGS; + } + if (local_only || rc) { + if (unlikely(lsi->lsi_flags & LDD_F_NO_LOCAL_LOGS) + || rc) { + CWARN("%s: local log %s are not valid and/or remote logs are not accessbile rc = %d\n", + mgc->obd_name, cld->cld_logname, rc); + GOTO(out_pop, rc = -EIO); + } + + if (strcmp(cld->cld_logname, PARAMS_FILENAME) != 0 && + llog_is_empty(env, lctxt, cld->cld_logname)) { + LCONSOLE_ERROR_MSG(0x13a, "Failed to get MGS log %s and no local copy.\n", + cld->cld_logname); + GOTO(out_pop, rc = -ENOENT); + } + CDEBUG(D_MGC, "%s: Failed to get MGS log %s, using local copy for now, will try to update later.\n", + mgc->obd_name, cld->cld_logname); + rc = 0; + } + /* Now, whether we copied or not, start using the local llog. + * If we failed to copy, we'll start using whatever the old + * log has. */ + llog_ctxt_put(ctxt); + ctxt = lctxt; + lctxt = NULL; + } else { + if (local_only) /* no local log at client side */ + GOTO(out_pop, rc = -EIO); + } + + rc = -EAGAIN; + if (lsi && IS_SERVER(lsi) && !IS_MGS(lsi) && + lsi->lsi_dt_dev->dd_rdonly) { + struct llog_ctxt *rctxt; + + /* Under readonly mode, we may have no local copy or local + * copy is incomplete, so try to use remote llog firstly. */ + rctxt = llog_get_context(mgc, LLOG_CONFIG_REPL_CTXT); + LASSERT(rctxt); + + rc = class_config_parse_llog(env, rctxt, cld->cld_logname, + &cld->cld_cfg); + llog_ctxt_put(rctxt); + } + + if (rc && rc != -ENOENT) + rc = class_config_parse_llog(env, ctxt, cld->cld_logname, + &cld->cld_cfg); + + /* + * update settings on existing OBDs. + * the logname must be -sptlrpc + */ + if (rc == 0 && cld_is_sptlrpc(cld)) + class_notify_sptlrpc_conf(cld->cld_logname, + strlen(cld->cld_logname) - + strlen("-sptlrpc")); + EXIT; + +out_pop: + __llog_ctxt_put(env, ctxt); + if (lctxt) + __llog_ctxt_put(env, lctxt); + + lu_env_fini(env); +out_free: + OBD_FREE_PTR(env); + return rc; +} + +static bool mgc_import_in_recovery(struct obd_import *imp) +{ + bool in_recovery = true; + + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_FULL || + imp->imp_state == LUSTRE_IMP_CLOSED) + in_recovery = false; + spin_unlock(&imp->imp_lock); + + return in_recovery; +} + +/** + * Get a configuration log from the MGS and process it. + * + * This function is called for both clients and servers to process the + * configuration log from the MGS. The MGC enqueues a DLM lock on the + * log from the MGS, and if the lock gets revoked the MGC will be notified + * by the lock cancellation callback that the config log has changed, + * and will enqueue another MGS lock on it, and then continue processing + * the new additions to the end of the log. + * + * Since the MGC import is not replayable, if the import is being evicted + * (rcl == -ESHUTDOWN, \see ptlrpc_import_delay_req()), retry to process + * the log until recovery is finished or the import is closed. 
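+ *
+ * The config lock itself is taken in CR mode, kept out of the LRU
+ * (LDLM_FL_NO_LRU), and dropped again as soon as the log has been
+ * processed, so that the MGS can revoke it to signal the next update.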
+ * + * Make a local copy of the log before parsing it if appropriate (non-MGS + * server) so that the server can start even when the MGS is down. + * + * There shouldn't be multiple processes running process_log at once -- + * sounds like badness. It actually might be fine, as long as they're not + * trying to update from the same log simultaneously, in which case we + * should use a per-log semaphore instead of cld_lock. + * + * \param[in] mgc MGC device by which to fetch the configuration log + * \param[in] cld log processing state (stored in lock callback data) + * + * \retval 0 on success + * \retval negative errno on failure + */ +int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld) +{ + struct lustre_handle lockh = { 0 }; + __u64 flags = LDLM_FL_NO_LRU; + int rc = 0, rcl; + bool retry = false; + ENTRY; + + LASSERT(cld != NULL); + + /* I don't want multiple processes running process_log at once -- + sounds like badness. It actually might be fine, as long as + we're not trying to update from the same log + simultaneously (in which case we should use a per-log sem.) */ +restart: + mutex_lock(&cld->cld_lock); + if (cld->cld_stopping) { + mutex_unlock(&cld->cld_lock); + RETURN(0); + } + + OBD_FAIL_TIMEOUT(OBD_FAIL_MGC_PAUSE_PROCESS_LOG, 20); + + CDEBUG(D_MGC, "Process log %s-%016lx from %d\n", cld->cld_logname, + cld->cld_cfg.cfg_instance, cld->cld_cfg.cfg_last_idx + 1); + + /* Get the cfg lock on the llog */ + rcl = mgc_enqueue(mgc->u.cli.cl_mgc_mgsexp, LDLM_PLAIN, NULL, + LCK_CR, &flags, + cld_is_barrier(cld) ? mgc_barrier_glimpse_ast : NULL, + cld, 0, NULL, &lockh); + if (rcl == 0) { + /* Get the cld, it will be released in mgc_blocking_ast. */ + config_log_get(cld); + rc = ldlm_lock_set_data(&lockh, (void *)cld); + LASSERT(!lustre_handle_is_used(&cld->cld_lockh)); + LASSERT(rc == 0); + cld->cld_lockh = lockh; + } else { + CDEBUG(D_MGC, "Can't get cfg lock: %d\n", rcl); + cld->cld_lockh.cookie = 0; + + if (rcl == -ESHUTDOWN && + atomic_read(&mgc->u.cli.cl_mgc_refcount) > 0 && !retry) { + struct obd_import *imp; + long timeout = cfs_time_seconds(obd_timeout); + + mutex_unlock(&cld->cld_lock); + imp = class_exp2cliimp(mgc->u.cli.cl_mgc_mgsexp); + + /* Let's force the pinger, and wait the import to be + * connected, note: since mgc import is non-replayable, + * and even the import state is disconnected, it does + * not mean the "recovery" is stopped, so we will keep + * waitting until timeout or the import state is + * FULL or closed */ + ptlrpc_pinger_force(imp); + + wait_event_idle_timeout(imp->imp_recovery_waitq, + !mgc_import_in_recovery(imp), + timeout); + + if (imp->imp_state == LUSTRE_IMP_FULL) { + retry = true; + goto restart; + } else { + mutex_lock(&cld->cld_lock); + /* unlock/lock mutex, so check stopping again */ + if (cld->cld_stopping) { + mutex_unlock(&cld->cld_lock); + RETURN(0); + } + spin_lock(&config_list_lock); + cld->cld_lostlock = 1; + spin_unlock(&config_list_lock); + } + } else { + /* mark cld_lostlock so that it will requeue + * after MGC becomes available. */ + spin_lock(&config_list_lock); + cld->cld_lostlock = 1; + spin_unlock(&config_list_lock); + } + } + + if (cld_is_recover(cld) || cld_is_nodemap(cld)) { + if (!rcl) + rc = mgc_process_recover_nodemap_log(mgc, cld); + else if (cld_is_nodemap(cld)) + rc = rcl; + + } else if (!cld_is_barrier(cld)) { + rc = mgc_process_cfg_log(mgc, cld, rcl != 0); + } + + CDEBUG(D_MGC, "%s: configuration from log '%s' %sed (%d).\n", + mgc->obd_name, cld->cld_logname, rc ? 
"fail" : "succeed", rc); + + /* Now drop the lock so MGS can revoke it */ + if (!rcl) { + rcl = mgc_cancel(mgc->u.cli.cl_mgc_mgsexp, LCK_CR, &lockh); + if (rcl) + CERROR("Can't drop cfg lock: %d\n", rcl); + } + mutex_unlock(&cld->cld_lock); + + /* requeue nodemap lock immediately if transfer was interrupted */ + if ((cld_is_nodemap(cld) && rc == -EAGAIN) || + (cld_is_recover(cld) && rc)) { + if (cld_is_recover(cld)) + CWARN("%s: IR log %s failed, not fatal: rc = %d\n", + mgc->obd_name, cld->cld_logname, rc); + mgc_requeue_add(cld); + rc = 0; + } + + RETURN(rc); +} + + +/** Called from lustre_process_log. + * LCFG_LOG_START gets the config log from the MGS, processes it to start + * any services, and adds it to the list logs to watch (follow). + */ +static int mgc_process_config(struct obd_device *obd, size_t len, void *buf) +{ + struct lustre_cfg *lcfg = buf; + struct config_llog_instance *cfg = NULL; + char *logname; + int rc = 0; + ENTRY; + + switch(lcfg->lcfg_command) { + case LCFG_LOV_ADD_OBD: { + /* Overloading this cfg command: register a new target */ + struct mgs_target_info *mti; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) != + sizeof(struct mgs_target_info)) + GOTO(out, rc = -EINVAL); + + mti = (struct mgs_target_info *)lustre_cfg_buf(lcfg, 1); + CDEBUG(D_MGC, "add_target %s %#x\n", + mti->mti_svname, mti->mti_flags); + rc = mgc_target_register(obd->u.cli.cl_mgc_mgsexp, mti); + break; + } + case LCFG_LOV_DEL_OBD: + /* Unregister has no meaning at the moment. */ + CERROR("lov_del_obd unimplemented\n"); + rc = -ENOSYS; + break; + case LCFG_SPTLRPC_CONF: { + rc = sptlrpc_process_config(lcfg); + break; + } + case LCFG_LOG_START: { + struct config_llog_data *cld; + struct super_block *sb; + + logname = lustre_cfg_string(lcfg, 1); + cfg = (struct config_llog_instance *)lustre_cfg_buf(lcfg, 2); + sb = *(struct super_block **)lustre_cfg_buf(lcfg, 3); + + CDEBUG(D_MGC, "parse_log %s from %d\n", logname, + cfg->cfg_last_idx); + + /* We're only called through here on the initial mount */ + cld = config_log_add(obd, logname, cfg, sb); + if (IS_ERR(cld)) { + rc = PTR_ERR(cld); + break; + } + + rc = mgc_process_log(obd, cld); + if (rc == 0 && cld->cld_recover != NULL) { + if (OCD_HAS_FLAG(&obd->u.cli.cl_import-> + imp_connect_data, IMP_RECOV)) { + rc = mgc_process_log(obd, cld->cld_recover); + } else { + struct config_llog_data *cir; + + mutex_lock(&cld->cld_lock); + cir = cld->cld_recover; + cld->cld_recover = NULL; + mutex_unlock(&cld->cld_lock); + config_log_put(cir); + } + + if (rc) + CERROR("Cannot process recover llog %d\n", rc); + } + + if (rc == 0 && cld->cld_params != NULL) { + rc = mgc_process_log(obd, cld->cld_params); + if (rc == -ENOENT) { + CDEBUG(D_MGC, "There is no params " + "config file yet\n"); + rc = 0; + } + /* params log is optional */ + if (rc) + CERROR("%s: can't process params llog: rc = %d\n", + obd->obd_name, rc); + } + + break; + } + case LCFG_LOG_END: { + logname = lustre_cfg_string(lcfg, 1); + + if (lcfg->lcfg_bufcount >= 2) + cfg = (struct config_llog_instance *)lustre_cfg_buf( + lcfg, 2); + rc = config_log_end(logname, cfg); + break; + } + default: { + CERROR("Unknown command: %d\n", lcfg->lcfg_command); + GOTO(out, rc = -EINVAL); + + } + } +out: + RETURN(rc); +} + +static const struct obd_ops mgc_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = mgc_setup, + .o_precleanup = mgc_precleanup, + .o_cleanup = mgc_cleanup, + .o_add_conn = client_import_add_conn, + .o_del_conn = client_import_del_conn, + .o_connect = client_connect_import, + .o_disconnect = 
client_disconnect_export, + .o_set_info_async = mgc_set_info_async, + .o_get_info = mgc_get_info, + .o_import_event = mgc_import_event, + .o_process_config = mgc_process_config, +}; + +static int mgc_param_requeue_timeout_min_set(const char *val, + cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned int num; + + rc = kstrtouint(val, 0, &num); + if (rc < 0) + return rc; + if (num > 120) + return -EINVAL; + + mgc_requeue_timeout_min = num; + + return 0; +} + +static const struct kernel_param_ops param_ops_requeue_timeout_min = { + .set = mgc_param_requeue_timeout_min_set, + .get = param_get_uint, +}; + +#define param_check_requeue_timeout_min(name, p) \ + __param_check(name, p, unsigned int) + +unsigned int mgc_requeue_timeout_min = MGC_TIMEOUT_MIN_SECONDS; +#ifdef HAVE_KERNEL_PARAM_OPS +module_param(mgc_requeue_timeout_min, requeue_timeout_min, 0644); +#else +module_param_call(mgc_requeue_timeout_min, mgc_param_requeue_timeout_min_set, + param_get_uint, ¶m_ops_requeue_timeout_min, 0644); +#endif +MODULE_PARM_DESC(mgc_requeue_timeout_min, "Minimal requeue time to refresh logs"); + +static int __init mgc_init(void) +{ + return class_register_type(&mgc_obd_ops, NULL, false, + LUSTRE_MGC_NAME, NULL); +} + +static void __exit mgc_exit(void) +{ + class_unregister_type(LUSTRE_MGC_NAME); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre Management Client"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(mgc_init); +module_exit(mgc_exit); diff --git a/drivers/staging/lustrefsx/lustre/nodist b/drivers/staging/lustrefsx/lustre/nodist new file mode 100644 index 0000000000000..24f55bb96b97d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/nodist @@ -0,0 +1,9 @@ +obd-*/obd-* +CVS +*~ +make.rules +config.* +*.o +*.orig +*.backup +.depfiles diff --git a/drivers/staging/lustrefsx/lustre/obdclass/cl_internal.h b/drivers/staging/lustrefsx/lustre/obdclass/cl_internal.h new file mode 100644 index 0000000000000..1f98113a1df3c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/cl_internal.h @@ -0,0 +1,57 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Internal cl interfaces. + * + * Author: Nikita Danilov + */ +#ifndef _CL_INTERNAL_H +#define _CL_INTERNAL_H + +/** + * Thread local state internal for generic cl-code. + */ +struct cl_thread_info { + /** + * Used for submitting a sync I/O. 
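+	 *
+	 * Keeping the anchor in thread-local state lets a short
+	 * synchronous transfer avoid a separate allocation; \see
+	 * cl_io_submit_sync() and cl_lock_request(), which both use
+	 * cl_env_info(env)->clt_anchor in roughly this way (a minimal
+	 * sketch, not verbatim code):
+	 *
+	 *	anchor = &cl_env_info(env)->clt_anchor;
+	 *	cl_sync_io_init(anchor, nr_pending);
+	 *	... submit the transfers ...
+	 *	rc = cl_sync_io_wait(env, anchor, timeout);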
+ */ + struct cl_sync_io clt_anchor; +}; + +extern struct kmem_cache *cl_dio_aio_kmem; +extern struct kmem_cache *cl_sub_dio_kmem; +extern struct kmem_cache *cl_page_kmem_array[16]; +extern unsigned short cl_page_kmem_size_array[16]; + +struct cl_thread_info *cl_env_info(const struct lu_env *env); +void cl_page_disown0(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg); + +#endif /* _CL_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/cl_io.c b/drivers/staging/lustrefsx/lustre/obdclass/cl_io.c new file mode 100644 index 0000000000000..295622f59875a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/cl_io.c @@ -0,0 +1,1439 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Client IO. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include +#include "cl_internal.h" + +/***************************************************************************** + * + * cl_io interface. + * + */ + +static inline int cl_io_type_is_valid(enum cl_io_type type) +{ + return CIT_READ <= type && type < CIT_OP_NR; +} + +static inline int cl_io_is_loopable(const struct cl_io *io) +{ + return cl_io_type_is_valid(io->ci_type) && io->ci_type != CIT_MISC; +} + +/** + * cl_io invariant that holds at all times when exported cl_io_*() functions + * are entered and left. + */ +static int cl_io_invariant(const struct cl_io *io) +{ + struct cl_io *up; + + up = io->ci_parent; + return + /* + * io can own pages only when it is ongoing. Sub-io might + * still be in CIS_LOCKED state when top-io is in + * CIS_IO_GOING. + */ + ergo(io->ci_owned_nr > 0, io->ci_state == CIS_IO_GOING || + (io->ci_state == CIS_LOCKED && up != NULL)); +} + +/** + * Finalize \a io, by calling cl_io_operations::cio_fini() bottom-to-top. + */ +void cl_io_fini(const struct lu_env *env, struct cl_io *io) +{ + struct cl_io_slice *slice; + + LINVRNT(cl_io_type_is_valid(io->ci_type)); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + while (!list_empty(&io->ci_layers)) { + slice = container_of(io->ci_layers.prev, struct cl_io_slice, + cis_linkage); + list_del_init(&slice->cis_linkage); + if (slice->cis_iop->op[io->ci_type].cio_fini != NULL) + slice->cis_iop->op[io->ci_type].cio_fini(env, slice); + /* + * Invalidate slice to catch use after free. This assumes that + * slices are allocated within session and can be touched + * after ->cio_fini() returns. 
+ */ + slice->cis_io = NULL; + } + io->ci_state = CIS_FINI; + + /* sanity check for layout change */ + switch(io->ci_type) { + case CIT_READ: + case CIT_WRITE: + case CIT_DATA_VERSION: + case CIT_FAULT: + break; + case CIT_FSYNC: + LASSERT(!io->ci_need_restart); + break; + case CIT_SETATTR: + case CIT_MISC: + /* Check ignore layout change conf */ + LASSERT(ergo(io->ci_ignore_layout || !io->ci_verify_layout, + !io->ci_need_restart)); + case CIT_GLIMPSE: + break; + case CIT_LADVISE: + case CIT_LSEEK: + break; + default: + LBUG(); + } + EXIT; +} +EXPORT_SYMBOL(cl_io_fini); + +static int cl_io_init0(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj) +{ + struct cl_object *scan; + int result; + + LINVRNT(io->ci_state == CIS_ZERO || io->ci_state == CIS_FINI); + LINVRNT(cl_io_type_is_valid(iot)); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + io->ci_type = iot; + INIT_LIST_HEAD(&io->ci_lockset.cls_todo); + INIT_LIST_HEAD(&io->ci_lockset.cls_done); + INIT_LIST_HEAD(&io->ci_layers); + + result = 0; + cl_object_for_each(scan, obj) { + if (scan->co_ops->coo_io_init != NULL) { + result = scan->co_ops->coo_io_init(env, scan, io); + if (result != 0) + break; + } + } + if (result == 0) + io->ci_state = CIS_INIT; + RETURN(result); +} + +/** + * Initialize sub-io, by calling cl_io_operations::cio_init() top-to-bottom. + * + * \pre obj != cl_object_top(obj) + */ +int cl_io_sub_init(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj) +{ + LASSERT(obj != cl_object_top(obj)); + + return cl_io_init0(env, io, iot, obj); +} +EXPORT_SYMBOL(cl_io_sub_init); + +/** + * Initialize \a io, by calling cl_io_operations::cio_init() top-to-bottom. + * + * Caller has to call cl_io_fini() after a call to cl_io_init(), no matter + * what the latter returned. + * + * \pre obj == cl_object_top(obj) + * \pre cl_io_type_is_valid(iot) + * \post cl_io_type_is_valid(io->ci_type) && io->ci_type == iot + */ +int cl_io_init(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj) +{ + LASSERT(obj == cl_object_top(obj)); + + /* clear I/O restart from previous instance */ + io->ci_need_restart = 0; + + return cl_io_init0(env, io, iot, obj); +} +EXPORT_SYMBOL(cl_io_init); + +/** + * Initialize read or write io. 
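+ *
+ * A typical caller sequence looks like (editorial sketch; real callers
+ * in llite perform more setup between these steps, and allocation of
+ * the cl_io itself is omitted):
+ *
+ *	io->ci_obj = clob;
+ *	rc = cl_io_rw_init(env, io, CIT_WRITE, pos, count);
+ *	if (rc == 0)
+ *		rc = cl_io_loop(env, io);
+ *	cl_io_fini(env, io);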
+ *
+ * \pre iot == CIT_READ || iot == CIT_WRITE
+ */
+int cl_io_rw_init(const struct lu_env *env, struct cl_io *io,
+		  enum cl_io_type iot, loff_t pos, size_t count)
+{
+	LINVRNT(iot == CIT_READ || iot == CIT_WRITE);
+	LINVRNT(io->ci_obj != NULL);
+	ENTRY;
+
+	LU_OBJECT_HEADER(D_VFSTRACE, env, &io->ci_obj->co_lu,
+			 "io range: %u [%llu, %llu) %u %u\n",
+			 iot, (__u64)pos, (__u64)pos + count,
+			 io->u.ci_rw.crw_nonblock, io->u.ci_wr.wr_append);
+	io->u.ci_rw.crw_pos = pos;
+	io->u.ci_rw.crw_count = count;
+	RETURN(cl_io_init(env, io, iot, io->ci_obj));
+}
+EXPORT_SYMBOL(cl_io_rw_init);
+
+#ifdef HAVE_LIST_CMP_FUNC_T
+static int cl_lock_descr_cmp(void *priv,
+			     const struct list_head *a,
+			     const struct list_head *b)
+#else /* !HAVE_LIST_CMP_FUNC_T */
+static int cl_lock_descr_cmp(void *priv,
+			     struct list_head *a, struct list_head *b)
+#endif /* HAVE_LIST_CMP_FUNC_T */
+{
+	const struct cl_io_lock_link *l0 = list_entry(a, struct cl_io_lock_link,
+						      cill_linkage);
+	const struct cl_io_lock_link *l1 = list_entry(b, struct cl_io_lock_link,
+						      cill_linkage);
+	const struct cl_lock_descr *d0 = &l0->cill_descr;
+	const struct cl_lock_descr *d1 = &l1->cill_descr;
+
+	return lu_fid_cmp(lu_object_fid(&d0->cld_obj->co_lu),
+			  lu_object_fid(&d1->cld_obj->co_lu));
+}
+
+static void cl_lock_descr_merge(struct cl_lock_descr *d0,
+				const struct cl_lock_descr *d1)
+{
+	d0->cld_start = min(d0->cld_start, d1->cld_start);
+	d0->cld_end = max(d0->cld_end, d1->cld_end);
+
+	if (d1->cld_mode == CLM_WRITE && d0->cld_mode != CLM_WRITE)
+		d0->cld_mode = CLM_WRITE;
+
+	if (d1->cld_mode == CLM_GROUP && d0->cld_mode != CLM_GROUP)
+		d0->cld_mode = CLM_GROUP;
+}
+
+static int cl_lockset_merge(const struct cl_lockset *set,
+			    const struct cl_lock_descr *need)
+{
+	struct cl_io_lock_link *scan;
+
+	ENTRY;
+	list_for_each_entry(scan, &set->cls_todo, cill_linkage) {
+		if (!cl_object_same(scan->cill_descr.cld_obj, need->cld_obj))
+			continue;
+
+		/* Merge locks for the same object because the ldlm lock
+		 * server may expand the lock extent; otherwise there is a
+		 * deadlock case if two conflicting locks are queued for the
+		 * same object and the lock server expands one lock to
+		 * overlap the other. The side effect is that it can generate
+		 * a multi-stripe lock that may cause cascading problems */
+		cl_lock_descr_merge(&scan->cill_descr, need);
+		CDEBUG(D_VFSTRACE, "lock: %d: [%lu, %lu]\n",
+		       scan->cill_descr.cld_mode, scan->cill_descr.cld_start,
+		       scan->cill_descr.cld_end);
+		RETURN(+1);
+	}
+	RETURN(0);
+}
+
+static int cl_lockset_lock(const struct lu_env *env, struct cl_io *io,
+			   struct cl_lockset *set)
+{
+	struct cl_io_lock_link *link;
+	struct cl_io_lock_link *temp;
+	int result;
+
+	ENTRY;
+	result = 0;
+	list_for_each_entry_safe(link, temp, &set->cls_todo, cill_linkage) {
+		result = cl_lock_request(env, io, &link->cill_lock);
+		if (result < 0)
+			break;
+
+		list_move(&link->cill_linkage, &set->cls_done);
+	}
+	RETURN(result);
+}
+
+/**
+ * Takes locks necessary for the current iteration of io.
+ *
+ * Calls cl_io_operations::cio_lock() top-to-bottom to collect locks required
+ * by layers for the current iteration. Then sort locks (to avoid dead-locks),
+ * and acquire them.
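+ *
+ * Note that cl_lock_descr_cmp() above orders the links by object fid
+ * only, so any two threads acquire locks on a given set of objects in
+ * the same global order, which is what prevents the deadlock.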
+ */
+int cl_io_lock(const struct lu_env *env, struct cl_io *io)
+{
+	const struct cl_io_slice *scan;
+	int result = 0;
+
+	LINVRNT(cl_io_is_loopable(io));
+	LINVRNT(io->ci_state == CIS_IT_STARTED);
+	LINVRNT(cl_io_invariant(io));
+
+	ENTRY;
+	list_for_each_entry(scan, &io->ci_layers, cis_linkage) {
+		if (scan->cis_iop->op[io->ci_type].cio_lock == NULL)
+			continue;
+		result = scan->cis_iop->op[io->ci_type].cio_lock(env, scan);
+		if (result != 0)
+			break;
+	}
+	if (result == 0) {
+		/*
+		 * Sort locks in lexicographical order of their (fid,
+		 * start-offset) pairs to avoid deadlocks.
+		 */
+		list_sort(NULL, &io->ci_lockset.cls_todo, cl_lock_descr_cmp);
+		result = cl_lockset_lock(env, io, &io->ci_lockset);
+	}
+	if (result != 0)
+		cl_io_unlock(env, io);
+	else
+		io->ci_state = CIS_LOCKED;
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_lock);
+
+/**
+ * Releases locks taken by io.
+ */
+void cl_io_unlock(const struct lu_env *env, struct cl_io *io)
+{
+	struct cl_lockset *set;
+	struct cl_io_lock_link *link;
+	struct cl_io_lock_link *temp;
+	const struct cl_io_slice *scan;
+
+	LASSERT(cl_io_is_loopable(io));
+	LASSERT(CIS_IT_STARTED <= io->ci_state && io->ci_state < CIS_UNLOCKED);
+	LINVRNT(cl_io_invariant(io));
+
+	ENTRY;
+	set = &io->ci_lockset;
+
+	list_for_each_entry_safe(link, temp, &set->cls_todo, cill_linkage) {
+		list_del_init(&link->cill_linkage);
+		if (link->cill_fini != NULL)
+			link->cill_fini(env, link);
+	}
+
+	list_for_each_entry_safe(link, temp, &set->cls_done, cill_linkage) {
+		list_del_init(&link->cill_linkage);
+		cl_lock_release(env, &link->cill_lock);
+		if (link->cill_fini != NULL)
+			link->cill_fini(env, link);
+	}
+
+	list_for_each_entry_reverse(scan, &io->ci_layers, cis_linkage) {
+		if (scan->cis_iop->op[io->ci_type].cio_unlock != NULL)
+			scan->cis_iop->op[io->ci_type].cio_unlock(env, scan);
+	}
+	io->ci_state = CIS_UNLOCKED;
+	EXIT;
+}
+EXPORT_SYMBOL(cl_io_unlock);
+
+/**
+ * Prepares next iteration of io.
+ *
+ * Calls cl_io_operations::cio_iter_init() top-to-bottom. This exists to give
+ * layers a chance to modify io parameters, e.g., so that lov can restrict io
+ * to a single stripe.
+ */
+int cl_io_iter_init(const struct lu_env *env, struct cl_io *io)
+{
+	const struct cl_io_slice *scan;
+	int result;
+
+	LINVRNT(cl_io_is_loopable(io));
+	LINVRNT(io->ci_state == CIS_INIT || io->ci_state == CIS_IT_ENDED);
+	LINVRNT(cl_io_invariant(io));
+
+	ENTRY;
+	result = 0;
+	list_for_each_entry(scan, &io->ci_layers, cis_linkage) {
+		if (scan->cis_iop->op[io->ci_type].cio_iter_init == NULL)
+			continue;
+		result = scan->cis_iop->op[io->ci_type].cio_iter_init(env,
+								      scan);
+		if (result != 0)
+			break;
+	}
+	if (result == 0)
+		io->ci_state = CIS_IT_STARTED;
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_iter_init);
+
+/**
+ * Finalizes io iteration.
+ *
+ * Calls cl_io_operations::cio_iter_fini() bottom-to-top.
+ */
+void cl_io_iter_fini(const struct lu_env *env, struct cl_io *io)
+{
+	const struct cl_io_slice *scan;
+
+	LINVRNT(cl_io_is_loopable(io));
+	LINVRNT(io->ci_state <= CIS_IT_STARTED ||
+		io->ci_state > CIS_IO_FINISHED);
+	LINVRNT(cl_io_invariant(io));
+
+	ENTRY;
+	list_for_each_entry_reverse(scan, &io->ci_layers, cis_linkage) {
+		if (scan->cis_iop->op[io->ci_type].cio_iter_fini != NULL)
+			scan->cis_iop->op[io->ci_type].cio_iter_fini(env, scan);
+	}
+	io->ci_state = CIS_IT_ENDED;
+	EXIT;
+}
+EXPORT_SYMBOL(cl_io_iter_fini);
+
+/**
+ * Records that read or write io progressed \a nob bytes forward.
+ */ +void cl_io_rw_advance(const struct lu_env *env, struct cl_io *io, size_t nob) +{ + const struct cl_io_slice *scan; + + ENTRY; + + LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE || + nob == 0); + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(cl_io_invariant(io)); + + io->u.ci_rw.crw_pos += nob; + io->u.ci_rw.crw_count -= nob; + + /* layers have to be notified. */ + list_for_each_entry_reverse(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->op[io->ci_type].cio_advance != NULL) + scan->cis_iop->op[io->ci_type].cio_advance(env, scan, + nob); + } + EXIT; +} + +/** + * Adds a lock to a lockset. + */ +int cl_io_lock_add(const struct lu_env *env, struct cl_io *io, + struct cl_io_lock_link *link) +{ + int result; + + ENTRY; + if (cl_lockset_merge(&io->ci_lockset, &link->cill_descr)) + result = +1; + else { + list_add(&link->cill_linkage, &io->ci_lockset.cls_todo); + result = 0; + } + RETURN(result); +} +EXPORT_SYMBOL(cl_io_lock_add); + +static void cl_free_io_lock_link(const struct lu_env *env, + struct cl_io_lock_link *link) +{ + OBD_FREE_PTR(link); +} + +/** + * Allocates new lock link, and uses it to add a lock to a lockset. + */ +int cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io, + struct cl_lock_descr *descr) +{ + struct cl_io_lock_link *link; + int result; + + ENTRY; + OBD_ALLOC_PTR(link); + if (link != NULL) { + link->cill_descr = *descr; + link->cill_fini = cl_free_io_lock_link; + result = cl_io_lock_add(env, io, link); + if (result) /* lock match */ + link->cill_fini(env, link); + } else + result = -ENOMEM; + + RETURN(result); +} +EXPORT_SYMBOL(cl_io_lock_alloc_add); + +/** + * Starts io by calling cl_io_operations::cio_start() top-to-bottom. + */ +int cl_io_start(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_LOCKED); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + io->ci_state = CIS_IO_GOING; + list_for_each_entry(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->op[io->ci_type].cio_start == NULL) + continue; + result = scan->cis_iop->op[io->ci_type].cio_start(env, scan); + if (result != 0) + break; + } + if (result >= 0) + result = 0; + RETURN(result); +} +EXPORT_SYMBOL(cl_io_start); + +/** + * Wait until current io iteration is finished by calling + * cl_io_operations::cio_end() bottom-to-top. + */ +void cl_io_end(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_IO_GOING); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + list_for_each_entry_reverse(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->op[io->ci_type].cio_end != NULL) + scan->cis_iop->op[io->ci_type].cio_end(env, scan); + /* TODO: error handling. 
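+		 * cio_end() returns no status; errors from the
+		 * transfer itself reach the caller through
+		 * io->ci_result, which cl_io_loop() checks after
+		 * the iteration completes.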
+		 */
+	}
+	io->ci_state = CIS_IO_FINISHED;
+	EXIT;
+}
+EXPORT_SYMBOL(cl_io_end);
+
+/**
+ * Called by read io, to decide the readahead extent
+ *
+ * \see cl_io_operations::cio_read_ahead()
+ */
+int cl_io_read_ahead(const struct lu_env *env, struct cl_io *io,
+		     pgoff_t start, struct cl_read_ahead *ra)
+{
+	const struct cl_io_slice *scan;
+	int result = 0;
+
+	LINVRNT(io->ci_type == CIT_READ ||
+		io->ci_type == CIT_FAULT ||
+		io->ci_type == CIT_WRITE);
+	LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED);
+	LINVRNT(cl_io_invariant(io));
+	ENTRY;
+
+	list_for_each_entry(scan, &io->ci_layers, cis_linkage) {
+		if (scan->cis_iop->cio_read_ahead == NULL)
+			continue;
+
+		result = scan->cis_iop->cio_read_ahead(env, scan, start, ra);
+		if (result != 0)
+			break;
+	}
+	RETURN(result > 0 ? 0 : result);
+}
+EXPORT_SYMBOL(cl_io_read_ahead);
+
+/**
+ * Called before io start, to reserve enough LRU slots to avoid
+ * deadlock.
+ *
+ * \see cl_io_operations::cio_lru_reserve()
+ */
+int cl_io_lru_reserve(const struct lu_env *env, struct cl_io *io,
+		      loff_t pos, size_t bytes)
+{
+	const struct cl_io_slice *scan;
+	int result = 0;
+
+	LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
+	LINVRNT(cl_io_invariant(io));
+	ENTRY;
+
+	list_for_each_entry(scan, &io->ci_layers, cis_linkage) {
+		if (scan->cis_iop->cio_lru_reserve) {
+			result = scan->cis_iop->cio_lru_reserve(env, scan,
+								pos, bytes);
+			if (result)
+				break;
+		}
+	}
+
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_lru_reserve);
+
+/**
+ * Commit a list of contiguous pages into writeback cache.
+ *
+ * \returns 0 if all pages committed, or errcode if error occurred.
+ * \see cl_io_operations::cio_commit_async()
+ */
+int cl_io_commit_async(const struct lu_env *env, struct cl_io *io,
+		       struct cl_page_list *queue, int from, int to,
+		       cl_commit_cbt cb)
+{
+	const struct cl_io_slice *scan;
+	int result = 0;
+	ENTRY;
+
+	list_for_each_entry(scan, &io->ci_layers, cis_linkage) {
+		if (scan->cis_iop->cio_commit_async == NULL)
+			continue;
+		result = scan->cis_iop->cio_commit_async(env, scan, queue,
+							 from, to, cb);
+		if (result != 0)
+			break;
+	}
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_commit_async);
+
+void cl_io_extent_release(const struct lu_env *env, struct cl_io *io)
+{
+	const struct cl_io_slice *scan;
+	ENTRY;
+
+	list_for_each_entry(scan, &io->ci_layers, cis_linkage) {
+		if (scan->cis_iop->cio_extent_release == NULL)
+			continue;
+		scan->cis_iop->cio_extent_release(env, scan);
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(cl_io_extent_release);
+
+/**
+ * Submits a list of pages for immediate io.
+ *
+ * After the function returns, the submitted pages are moved to the
+ * queue->c2_qout queue; queue->c2_qin contains both the pages that did
+ * not need to be submitted and the pages that failed to submit.
+ *
+ * \returns 0 if at least one page was submitted, error code otherwise.
+ * \see cl_io_operations::cio_submit()
+ */
+int cl_io_submit_rw(const struct lu_env *env, struct cl_io *io,
+		    enum cl_req_type crt, struct cl_2queue *queue)
+{
+	const struct cl_io_slice *scan;
+	int result = 0;
+	ENTRY;
+
+	list_for_each_entry(scan, &io->ci_layers, cis_linkage) {
+		if (scan->cis_iop->cio_submit == NULL)
+			continue;
+		result = scan->cis_iop->cio_submit(env, scan, crt, queue);
+		if (result != 0)
+			break;
+	}
+	/*
+	 * If ->cio_submit() failed, no pages were sent.
+	 */
+	LASSERT(ergo(result != 0, list_empty(&queue->c2_qout.pl_pages)));
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_submit_rw);
+
+/**
+ * Submit a sync_io and wait for the IO to be finished, or for an error
+ * to occur. If \a timeout is zero, wait for the IO unconditionally.
+ */
+int cl_io_submit_sync(const struct lu_env *env, struct cl_io *io,
+		      enum cl_req_type iot, struct cl_2queue *queue,
+		      long timeout)
+{
+	struct cl_sync_io *anchor = &cl_env_info(env)->clt_anchor;
+	struct cl_page *pg;
+	int rc;
+	ENTRY;
+
+	cl_page_list_for_each(pg, &queue->c2_qin) {
+		LASSERT(pg->cp_sync_io == NULL);
+		pg->cp_sync_io = anchor;
+	}
+
+	cl_sync_io_init(anchor, queue->c2_qin.pl_nr);
+	rc = cl_io_submit_rw(env, io, iot, queue);
+	if (rc == 0) {
+		/*
+		 * If some pages weren't sent for any reason (e.g.,
+		 * read found up-to-date pages in the cache, or write found
+		 * clean pages), count them as completed to avoid infinite
+		 * wait.
+		 */
+		cl_page_list_for_each(pg, &queue->c2_qin) {
+			pg->cp_sync_io = NULL;
+			cl_sync_io_note(env, anchor, 1);
+		}
+
+		/* wait for the IO to be finished. */
+		rc = cl_sync_io_wait(env, anchor, timeout);
+		cl_page_list_assume(env, io, &queue->c2_qout);
+	} else {
+		LASSERT(list_empty(&queue->c2_qout.pl_pages));
+		cl_page_list_for_each(pg, &queue->c2_qin)
+			pg->cp_sync_io = NULL;
+	}
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cl_io_submit_sync);
+
+/**
+ * Main io loop.
+ *
+ * Pumps io through iterations calling
+ *
+ *    - cl_io_iter_init()
+ *
+ *    - cl_io_lock()
+ *
+ *    - cl_io_start()
+ *
+ *    - cl_io_end()
+ *
+ *    - cl_io_unlock()
+ *
+ *    - cl_io_iter_fini()
+ *
+ * repeatedly until there is no more io to do.
+ */
+int cl_io_loop(const struct lu_env *env, struct cl_io *io)
+{
+	int result = 0;
+	int rc = 0;
+
+	LINVRNT(cl_io_is_loopable(io));
+	ENTRY;
+
+	do {
+		size_t nob;
+
+		io->ci_continue = 0;
+		result = cl_io_iter_init(env, io);
+		if (result == 0) {
+			nob = io->ci_nob;
+			result = cl_io_lock(env, io);
+			if (result == 0) {
+				/*
+				 * Notify layers that locks have been taken,
+				 * and do actual i/o.
+				 *
+				 *   - llite: kms, short read;
+				 *   - llite: generic_file_read();
+				 */
+				result = cl_io_start(env, io);
+				/*
+				 * Send any remaining pending
+				 * io, etc.
+				 *
+				 *   - llite: ll_rw_stats_tally.
+				 */
+				cl_io_end(env, io);
+				cl_io_unlock(env, io);
+				cl_io_rw_advance(env, io, io->ci_nob - nob);
+			}
+		}
+		cl_io_iter_fini(env, io);
+		if (result)
+			rc = result;
+	} while ((result == 0 || result == -EIOCBQUEUED) &&
+		 io->ci_continue);
+
+	if (rc && !result)
+		result = rc;
+
+	if (result == -EAGAIN && io->ci_ndelay) {
+		io->ci_need_restart = 1;
+		result = 0;
+	}
+
+	if (result == 0)
+		result = io->ci_result;
+	RETURN(result < 0 ? result : 0);
+}
+EXPORT_SYMBOL(cl_io_loop);
+
+/**
+ * Adds io slice to the cl_io.
+ *
+ * This is called by cl_object_operations::coo_io_init() methods to add a
+ * per-layer state to the io. New state is added at the end of
+ * cl_io::ci_layers list, that is, it is at the bottom of the stack.
+ *
+ * \see cl_lock_slice_add(), cl_req_slice_add(), cl_page_slice_add()
+ */
+void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice,
+		     struct cl_object *obj,
+		     const struct cl_io_operations *ops)
+{
+	struct list_head *linkage = &slice->cis_linkage;
+
+	LASSERT((linkage->prev == NULL && linkage->next == NULL) ||
+		list_empty(linkage));
+	ENTRY;
+
+	list_add_tail(linkage, &io->ci_layers);
+	slice->cis_io = io;
+	slice->cis_obj = obj;
+	slice->cis_iop = ops;
+	EXIT;
+}
+EXPORT_SYMBOL(cl_io_slice_add);
+
+
+/**
+ * Initializes page list.
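+ *
+ * A cl_page_list is a counted list of owned pages. The common pattern
+ * for a small synchronous transfer, using only the helpers from this
+ * file, looks like (a minimal sketch; error handling omitted):
+ *
+ *	struct cl_2queue queue;
+ *
+ *	cl_2queue_init(&queue);
+ *	cl_2queue_add(&queue, page, true);
+ *	rc = cl_io_submit_sync(env, io, CRT_READ, &queue, 0);
+ *	cl_2queue_discard(env, io, &queue);
+ *	cl_2queue_disown(env, io, &queue);
+ *	cl_2queue_fini(env, &queue);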
+ */ +void cl_page_list_init(struct cl_page_list *plist) +{ + ENTRY; + plist->pl_nr = 0; + INIT_LIST_HEAD(&plist->pl_pages); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_init); + +/** + * Adds a page to a page list. + */ +void cl_page_list_add(struct cl_page_list *plist, struct cl_page *page, + bool get_ref) +{ + ENTRY; + /* it would be better to check that page is owned by "current" io, but + * it is not passed here. */ + LASSERT(page->cp_owner != NULL); + + LASSERT(list_empty(&page->cp_batch)); + list_add_tail(&page->cp_batch, &plist->pl_pages); + ++plist->pl_nr; + lu_ref_add_at(&page->cp_reference, &page->cp_queue_ref, "queue", plist); + if (get_ref) + cl_page_get(page); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_add); + +/** + * Removes a page from a page list. + */ +void cl_page_list_del(const struct lu_env *env, + struct cl_page_list *plist, struct cl_page *page) +{ + LASSERT(plist->pl_nr > 0); + LASSERT(cl_page_is_vmlocked(env, page)); + + ENTRY; + list_del_init(&page->cp_batch); + --plist->pl_nr; + lu_ref_del_at(&page->cp_reference, &page->cp_queue_ref, "queue", plist); + cl_page_put(env, page); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_del); + +/** + * Moves a page from one page list to another. + */ +void cl_page_list_move(struct cl_page_list *dst, struct cl_page_list *src, + struct cl_page *page) +{ + LASSERT(src->pl_nr > 0); + + ENTRY; + list_move_tail(&page->cp_batch, &dst->pl_pages); + --src->pl_nr; + ++dst->pl_nr; + lu_ref_set_at(&page->cp_reference, &page->cp_queue_ref, "queue", + src, dst); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_move); + +/** + * Moves a page from one page list to the head of another list. + */ +void cl_page_list_move_head(struct cl_page_list *dst, struct cl_page_list *src, + struct cl_page *page) +{ + LASSERT(src->pl_nr > 0); + + ENTRY; + list_move(&page->cp_batch, &dst->pl_pages); + --src->pl_nr; + ++dst->pl_nr; + lu_ref_set_at(&page->cp_reference, &page->cp_queue_ref, "queue", + src, dst); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_move_head); + +/** + * splice the cl_page_list, just as list head does + */ +void cl_page_list_splice(struct cl_page_list *src, struct cl_page_list *dst) +{ +#ifdef CONFIG_LUSTRE_DEBUG_LU_REF + struct cl_page *page; + struct cl_page *tmp; + + ENTRY; + cl_page_list_for_each_safe(page, tmp, src) + lu_ref_set_at(&page->cp_reference, &page->cp_queue_ref, + "queue", src, dst); +#else + ENTRY; +#endif + dst->pl_nr += src->pl_nr; + src->pl_nr = 0; + list_splice_tail_init(&src->pl_pages, &dst->pl_pages); + + EXIT; +} +EXPORT_SYMBOL(cl_page_list_splice); + +/** + * Disowns pages in a queue. + */ +void cl_page_list_disown(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist) +{ + struct cl_page *page; + struct cl_page *temp; + + + ENTRY; + cl_page_list_for_each_safe(page, temp, plist) { + LASSERT(plist->pl_nr > 0); + + list_del_init(&page->cp_batch); + --plist->pl_nr; + /* + * cl_page_disown0 rather than usual cl_page_disown() is used, + * because pages are possibly in CPS_FREEING state already due + * to the call to cl_page_list_discard(). + */ + /* + * XXX cl_page_disown0() will fail if page is not locked. + */ + cl_page_disown0(env, io, page); + lu_ref_del_at(&page->cp_reference, &page->cp_queue_ref, "queue", + plist); + cl_page_put(env, page); + } + EXIT; +} +EXPORT_SYMBOL(cl_page_list_disown); + +/** + * Releases pages from queue. 
+ */ +void cl_page_list_fini(const struct lu_env *env, struct cl_page_list *plist) +{ + struct cl_page *page; + struct cl_page *temp; + + + ENTRY; + cl_page_list_for_each_safe(page, temp, plist) + cl_page_list_del(env, plist, page); + LASSERT(plist->pl_nr == 0); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_fini); + +/** + * Assumes all pages in a queue. + */ +void cl_page_list_assume(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist) +{ + struct cl_page *page; + + + cl_page_list_for_each(page, plist) + cl_page_assume(env, io, page); +} + +/** + * Discards all pages in a queue. + */ +void cl_page_list_discard(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *plist) +{ + struct cl_page *page; + + ENTRY; + cl_page_list_for_each(page, plist) + cl_page_discard(env, io, page); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_discard); + +/** + * Initialize dual page queue. + */ +void cl_2queue_init(struct cl_2queue *queue) +{ + ENTRY; + cl_page_list_init(&queue->c2_qin); + cl_page_list_init(&queue->c2_qout); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_init); + +/** + * Add a page to the incoming page list of 2-queue. + */ +void cl_2queue_add(struct cl_2queue *queue, struct cl_page *page, bool get_ref) +{ + cl_page_list_add(&queue->c2_qin, page, get_ref); +} +EXPORT_SYMBOL(cl_2queue_add); + +/** + * Disown pages in both lists of a 2-queue. + */ +void cl_2queue_disown(const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue) +{ + ENTRY; + cl_page_list_disown(env, io, &queue->c2_qin); + cl_page_list_disown(env, io, &queue->c2_qout); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_disown); + +/** + * Discard (truncate) pages in both lists of a 2-queue. + */ +void cl_2queue_discard(const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue) +{ + ENTRY; + cl_page_list_discard(env, io, &queue->c2_qin); + cl_page_list_discard(env, io, &queue->c2_qout); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_discard); + +/** + * Assume to own the pages in cl_2queue + */ +void cl_2queue_assume(const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue) +{ + cl_page_list_assume(env, io, &queue->c2_qin); + cl_page_list_assume(env, io, &queue->c2_qout); +} + +/** + * Finalize both page lists of a 2-queue. + */ +void cl_2queue_fini(const struct lu_env *env, struct cl_2queue *queue) +{ + ENTRY; + cl_page_list_fini(env, &queue->c2_qout); + cl_page_list_fini(env, &queue->c2_qin); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_fini); + +/** + * Initialize a 2-queue to contain \a page in its incoming page list. + */ +void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page) +{ + ENTRY; + cl_2queue_init(queue); + cl_2queue_add(queue, page, true); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_init_page); + +/** + * Returns top-level io. + * + * \see cl_object_top() + */ +struct cl_io *cl_io_top(struct cl_io *io) +{ + ENTRY; + while (io->ci_parent != NULL) + io = io->ci_parent; + RETURN(io); +} +EXPORT_SYMBOL(cl_io_top); + +/** + * Prints human readable representation of \a io to the \a f. + */ +void cl_io_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_io *io) +{ +} + +/** + * Fills in attributes that are passed to server together with transfer. Only + * attributes from \a flags may be touched. This can be called multiple times + * for the same request. 
+ */
+void cl_req_attr_set(const struct lu_env *env, struct cl_object *obj,
+		     struct cl_req_attr *attr)
+{
+	struct cl_object *scan;
+	ENTRY;
+
+	cl_object_for_each(scan, obj) {
+		if (scan->co_ops->coo_req_attr_set != NULL)
+			scan->co_ops->coo_req_attr_set(env, scan, attr);
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(cl_req_attr_set);
+
+/**
+ * Initialize synchronous io wait \a anchor for \a nr pages with optional
+ * \a end handler.
+ * \param anchor owned by caller, initialized here.
+ * \param nr number of pages initially pending in sync.
+ * \param end optional callback on sync_io completion, can be used to
+ *  trigger erasure coding, integrity, dedupe, or similar operation.
+ * \a end is called with a spinlock on anchor->csi_waitq.lock
+ */
+
+void cl_sync_io_init_notify(struct cl_sync_io *anchor, int nr,
+			    void *dio_aio, cl_sync_io_end_t *end)
+{
+	ENTRY;
+	memset(anchor, 0, sizeof(*anchor));
+	init_waitqueue_head(&anchor->csi_waitq);
+	atomic_set(&anchor->csi_sync_nr, nr);
+	anchor->csi_sync_rc = 0;
+	anchor->csi_end_io = end;
+	anchor->csi_dio_aio = dio_aio;
+	EXIT;
+}
+EXPORT_SYMBOL(cl_sync_io_init_notify);
+
+/**
+ * Wait until all IO completes. Transfer completion routine has to call
+ * cl_sync_io_note() for every entity.
+ */
+int cl_sync_io_wait(const struct lu_env *env, struct cl_sync_io *anchor,
+		    long timeout)
+{
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(timeout >= 0);
+
+	if (timeout > 0 &&
+	    wait_event_idle_timeout(anchor->csi_waitq,
+				    atomic_read(&anchor->csi_sync_nr) == 0,
+				    cfs_time_seconds(timeout)) == 0) {
+		rc = -ETIMEDOUT;
+		CERROR("IO failed: %d, still waiting for %d remaining entries\n",
+		       rc, atomic_read(&anchor->csi_sync_nr));
+	}
+
+	wait_event_idle(anchor->csi_waitq,
+			atomic_read(&anchor->csi_sync_nr) == 0);
+	if (!rc)
+		rc = anchor->csi_sync_rc;
+
+	/* We take the lock to ensure that cl_sync_io_note() has finished */
+	spin_lock(&anchor->csi_waitq.lock);
+	LASSERT(atomic_read(&anchor->csi_sync_nr) == 0);
+	spin_unlock(&anchor->csi_waitq.lock);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cl_sync_io_wait);
+
+static inline void dio_aio_complete(struct kiocb *iocb, ssize_t res)
+{
+#ifdef HAVE_AIO_COMPLETE
+	aio_complete(iocb, res, 0);
+#else
+	if (iocb->ki_complete)
+# ifdef HAVE_KIOCB_COMPLETE_2ARGS
+		iocb->ki_complete(iocb, res);
+# else
+		iocb->ki_complete(iocb, res, 0);
+# endif
+#endif
+}
+
+static void cl_dio_aio_end(const struct lu_env *env, struct cl_sync_io *anchor)
+{
+	struct cl_dio_aio *aio = container_of(anchor, typeof(*aio), cda_sync);
+	ssize_t ret = anchor->csi_sync_rc;
+
+	ENTRY;
+
+	if (!aio->cda_no_aio_complete)
+		dio_aio_complete(aio->cda_iocb, ret ?: aio->cda_bytes);
+
+	EXIT;
+}
+
+static void cl_sub_dio_end(const struct lu_env *env, struct cl_sync_io *anchor)
+{
+	struct cl_sub_dio *sdio = container_of(anchor, typeof(*sdio), csd_sync);
+	ssize_t ret = anchor->csi_sync_rc;
+
+	ENTRY;
+
+	/* release pages */
+	while (sdio->csd_pages.pl_nr > 0) {
+		struct cl_page *page = cl_page_list_first(&sdio->csd_pages);
+
+		cl_page_delete(env, page);
+		cl_page_list_del(env, &sdio->csd_pages, page);
+	}
+
+	ll_release_user_pages(sdio->csd_dio_pages.ldp_pages,
+			      sdio->csd_dio_pages.ldp_count);
+	cl_sync_io_note(env, &sdio->csd_ll_aio->cda_sync, ret);
+
+	EXIT;
+}
+
+struct cl_dio_aio *cl_dio_aio_alloc(struct kiocb *iocb, struct cl_object *obj,
+				    bool is_aio)
+{
+	struct cl_dio_aio *aio;
+
+	OBD_SLAB_ALLOC_PTR_GFP(aio, cl_dio_aio_kmem, GFP_NOFS);
+	if (aio != NULL) {
+		/*
+		 * Hold one ref so that it won't be released until
+		 * every page is added.
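+		 *
+		 * The extra reference taken here is dropped by the
+		 * submitter once all pages are queued, e.g. via
+		 * cl_sync_io_note(env, &aio->cda_sync, 0), as
+		 * cl_sync_io_wait_recycle() below does for the
+		 * synchronous case.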
+		 */
+		cl_sync_io_init_notify(&aio->cda_sync, 1, aio, cl_dio_aio_end);
+		aio->cda_iocb = iocb;
+		aio->cda_no_aio_complete = !is_aio;
+		/* if this is true AIO, the memory is freed by the last call
+		 * to cl_sync_io_note (when all the I/O is complete), because
+		 * no one is waiting (in the kernel) for this to complete
+		 *
+		 * in other cases, the last user is cl_sync_io_wait, and in
+		 * that case, the creator frees the struct after that call
+		 */
+		aio->cda_creator_free = !is_aio;
+
+		cl_object_get(obj);
+		aio->cda_obj = obj;
+	}
+	return aio;
+}
+EXPORT_SYMBOL(cl_dio_aio_alloc);
+
+struct cl_sub_dio *cl_sub_dio_alloc(struct cl_dio_aio *ll_aio, bool sync)
+{
+	struct cl_sub_dio *sdio;
+
+	OBD_SLAB_ALLOC_PTR_GFP(sdio, cl_sub_dio_kmem, GFP_NOFS);
+	if (sdio != NULL) {
+		/*
+		 * Hold one ref so that it won't be released until
+		 * every page is added.
+		 */
+		cl_sync_io_init_notify(&sdio->csd_sync, 1, sdio,
+				       cl_sub_dio_end);
+		cl_page_list_init(&sdio->csd_pages);
+
+		sdio->csd_ll_aio = ll_aio;
+		atomic_add(1, &ll_aio->cda_sync.csi_sync_nr);
+		sdio->csd_creator_free = sync;
+	}
+	return sdio;
+}
+EXPORT_SYMBOL(cl_sub_dio_alloc);
+
+void cl_dio_aio_free(const struct lu_env *env, struct cl_dio_aio *aio)
+{
+	if (aio) {
+		cl_object_put(env, aio->cda_obj);
+		OBD_SLAB_FREE_PTR(aio, cl_dio_aio_kmem);
+	}
+}
+EXPORT_SYMBOL(cl_dio_aio_free);
+
+void cl_sub_dio_free(struct cl_sub_dio *sdio)
+{
+	if (sdio)
+		OBD_SLAB_FREE_PTR(sdio, cl_sub_dio_kmem);
+}
+EXPORT_SYMBOL(cl_sub_dio_free);
+/*
+ * ll_release_user_pages - tear down page struct array
+ * @pages: array of page struct pointers underlying target buffer
+ */
+void ll_release_user_pages(struct page **pages, int npages)
+{
+	int i;
+
+	if (npages == 0) {
+		LASSERT(!pages);
+		return;
+	}
+
+	for (i = 0; i < npages; i++) {
+		if (!pages[i])
+			break;
+		put_page(pages[i]);
+	}
+
+#if defined(HAVE_DIO_ITER)
+	kvfree(pages);
+#else
+	OBD_FREE_PTR_ARRAY_LARGE(pages, npages);
+#endif
+}
+EXPORT_SYMBOL(ll_release_user_pages);
+
+/**
+ * Indicate that transfer of a single page completed.
+ */
+void cl_sync_io_note(const struct lu_env *env, struct cl_sync_io *anchor,
+		     int ioret)
+{
+	ENTRY;
+
+	if (anchor->csi_sync_rc == 0 && ioret < 0)
+		anchor->csi_sync_rc = ioret;
+	/*
+	 * Synchronous IO done without releasing page lock (e.g., as a part of
+	 * ->{prepare,commit}_write(). Completion is used to signal the end of
+	 * IO.
+	 */
+	LASSERT(atomic_read(&anchor->csi_sync_nr) > 0);
+	if (atomic_dec_and_lock(&anchor->csi_sync_nr,
+				&anchor->csi_waitq.lock)) {
+		struct cl_sub_dio *sub_dio_aio = NULL;
+		struct cl_dio_aio *dio_aio = NULL;
+		void *csi_dio_aio = NULL;
+		bool creator_free = true;
+
+		cl_sync_io_end_t *end_io = anchor->csi_end_io;
+
+		/*
+		 * Holding the lock across both the decrement and
+		 * the wakeup ensures cl_sync_io_wait() doesn't complete
+		 * before the wakeup completes and the contents of
+		 * anchor become unsafe to access, as the owner is free
+		 * to immediately reclaim anchor when cl_sync_io_wait()
+		 * completes.
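+		 *
+		 * This pairs with the lock/unlock of csi_waitq.lock at
+		 * the end of cl_sync_io_wait() above: without it the
+		 * waiter could observe csi_sync_nr == 0, return, and
+		 * free the anchor while this thread still touches it.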
+		 */
+		wake_up_locked(&anchor->csi_waitq);
+		if (end_io)
+			end_io(env, anchor);
+
+		csi_dio_aio = anchor->csi_dio_aio;
+		sub_dio_aio = csi_dio_aio;
+		dio_aio = csi_dio_aio;
+
+		if (csi_dio_aio && end_io == cl_dio_aio_end)
+			creator_free = dio_aio->cda_creator_free;
+		else if (csi_dio_aio && end_io == cl_sub_dio_end)
+			creator_free = sub_dio_aio->csd_creator_free;
+
+		spin_unlock(&anchor->csi_waitq.lock);
+
+		if (csi_dio_aio) {
+			if (end_io == cl_dio_aio_end) {
+				if (!creator_free)
+					cl_dio_aio_free(env, dio_aio);
+			} else if (end_io == cl_sub_dio_end) {
+				if (!creator_free)
+					cl_sub_dio_free(sub_dio_aio);
+			}
+		}
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(cl_sync_io_note);
+
+int cl_sync_io_wait_recycle(const struct lu_env *env, struct cl_sync_io *anchor,
+			    long timeout, int ioret)
+{
+	int rc = 0;
+
+	/*
+	 * @anchor was initialized with a count of 1 to prevent end_io
+	 * from being called before we add all pages for IO, so drop
+	 * one extra reference to make sure the count can reach zero.
+	 */
+	cl_sync_io_note(env, anchor, ioret);
+	/* Wait for completion of normal dio.
+	 * This replaces the EIOCBQUEUED return from the DIO/AIO
+	 * path, and this is where AIO and DIO implementations
+	 * split.
+	 */
+	rc = cl_sync_io_wait(env, anchor, timeout);
+	/**
+	 * Take one extra reference again, so that if @anchor is
+	 * reused it starts from a count of 1.
+	 */
+	atomic_add(1, &anchor->csi_sync_nr);
+
+	return rc;
+}
+EXPORT_SYMBOL(cl_sync_io_wait_recycle);
diff --git a/drivers/staging/lustrefsx/lustre/obdclass/cl_lock.c b/drivers/staging/lustrefsx/lustre/obdclass/cl_lock.c
new file mode 100644
index 0000000000000..6dd0663161649
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/obdclass/cl_lock.c
@@ -0,0 +1,289 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2014, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * Client Extent Lock.
+ *
+ * Author: Nikita Danilov
+ * Author: Jinshan Xiong
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include
+#include
+#include
+#include
+#include
+#include "cl_internal.h"
+
+static void cl_lock_trace0(int level, const struct lu_env *env,
+			   const char *prefix, const struct cl_lock *lock,
+			   const char *func, const int line)
+{
+	struct cl_object_header *h = cl_object_header(lock->cll_descr.cld_obj);
+	CDEBUG(level, "%s: %p (%p/%d) at %s():%d\n",
+	       prefix, lock, env, h->coh_nesting, func, line);
+}
+#define cl_lock_trace(level, env, prefix, lock)				\
+	cl_lock_trace0(level, env, prefix, lock, __FUNCTION__, __LINE__)
+
+/**
+ * Adds lock slice to the compound lock.
+ * + * This is called by cl_object_operations::coo_lock_init() methods to add a + * per-layer state to the lock. New state is added at the end of + * cl_lock::cll_layers list, that is, it is at the bottom of the stack. + * + * \see cl_req_slice_add(), cl_page_slice_add(), cl_io_slice_add() + */ +void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice, + struct cl_object *obj, + const struct cl_lock_operations *ops) +{ + ENTRY; + slice->cls_lock = lock; + list_add_tail(&slice->cls_linkage, &lock->cll_layers); + slice->cls_obj = obj; + slice->cls_ops = ops; + EXIT; +} +EXPORT_SYMBOL(cl_lock_slice_add); + +void cl_lock_fini(const struct lu_env *env, struct cl_lock *lock) +{ + ENTRY; + + cl_lock_trace(D_DLMTRACE, env, "destroy lock", lock); + + while (!list_empty(&lock->cll_layers)) { + struct cl_lock_slice *slice; + + slice = list_entry(lock->cll_layers.next, + struct cl_lock_slice, cls_linkage); + list_del_init(lock->cll_layers.next); + slice->cls_ops->clo_fini(env, slice); + } + POISON(lock, 0x5a, sizeof(*lock)); + EXIT; +} +EXPORT_SYMBOL(cl_lock_fini); + +int cl_lock_init(const struct lu_env *env, struct cl_lock *lock, + const struct cl_io *io) +{ + struct cl_object *obj = lock->cll_descr.cld_obj; + struct cl_object *scan; + int result = 0; + ENTRY; + + /* Make sure cl_lock::cll_descr is initialized. */ + LASSERT(obj != NULL); + + INIT_LIST_HEAD(&lock->cll_layers); + cl_object_for_each(scan, obj) { + if (scan->co_ops->coo_lock_init != NULL) + result = scan->co_ops->coo_lock_init(env, scan, lock, + io); + + if (result != 0) { + cl_lock_fini(env, lock); + break; + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_lock_init); + +/** + * Returns a slice with a lock, corresponding to the given layer in the + * device stack. + * + * \see cl_page_at() + */ +const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock, + const struct lu_device_type *dtype) +{ + const struct cl_lock_slice *slice; + + ENTRY; + + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_obj->co_lu.lo_dev->ld_type == dtype) + RETURN(slice); + } + RETURN(NULL); +} +EXPORT_SYMBOL(cl_lock_at); + +void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock) +{ + const struct cl_lock_slice *slice; + ENTRY; + + cl_lock_trace(D_DLMTRACE, env, "cancel lock", lock); + list_for_each_entry_reverse(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_cancel != NULL) + slice->cls_ops->clo_cancel(env, slice); + } + + EXIT; +} +EXPORT_SYMBOL(cl_lock_cancel); + +/** + * Enqueue a lock. + * \param anchor: if we need to wait for resources before getting the lock, + * use @anchor for the purpose. + * \retval 0 enqueue successfully + * \retval <0 error code + */ +int cl_lock_enqueue(const struct lu_env *env, struct cl_io *io, + struct cl_lock *lock, struct cl_sync_io *anchor) +{ + const struct cl_lock_slice *slice; + int rc = 0; + + ENTRY; + + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_enqueue == NULL) + continue; + + rc = slice->cls_ops->clo_enqueue(env, slice, io, anchor); + if (rc != 0) + break; + } + RETURN(rc); +} +EXPORT_SYMBOL(cl_lock_enqueue); + +/** + * Main high-level entry point of cl_lock interface that finds existing or + * enqueues new lock matching given description. 
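+ *
+ * A caller fills in the lock description and then does (a minimal
+ * sketch; the mode and extent shown are illustrative):
+ *
+ *	lock->cll_descr = (struct cl_lock_descr) {
+ *		.cld_obj   = obj,
+ *		.cld_start = start,
+ *		.cld_end   = end,
+ *		.cld_mode  = CLM_READ,
+ *	};
+ *	rc = cl_lock_request(env, io, lock);
+ *	if (rc == 0)
+ *		cl_lock_release(env, lock);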
+ */ +int cl_lock_request(const struct lu_env *env, struct cl_io *io, + struct cl_lock *lock) +{ + struct cl_sync_io *anchor = NULL; + __u32 enq_flags = lock->cll_descr.cld_enq_flags; + int rc; + ENTRY; + + rc = cl_lock_init(env, lock, io); + if (rc < 0) + RETURN(rc); + + if ((enq_flags & CEF_GLIMPSE) && !(enq_flags & CEF_SPECULATIVE)) { + anchor = &cl_env_info(env)->clt_anchor; + cl_sync_io_init(anchor, 1); + } + + rc = cl_lock_enqueue(env, io, lock, anchor); + + if (anchor != NULL) { + int rc2; + + /* drop the reference count held at initialization time */ + cl_sync_io_note(env, anchor, 0); + rc2 = cl_sync_io_wait(env, anchor, 0); + if (rc2 < 0 && rc == 0) + rc = rc2; + } + + if (rc < 0) + cl_lock_release(env, lock); + RETURN(rc); +} +EXPORT_SYMBOL(cl_lock_request); + +/** + * Releases a hold and a reference on a lock, obtained by cl_lock_hold(). + */ +void cl_lock_release(const struct lu_env *env, struct cl_lock *lock) +{ + ENTRY; + + cl_lock_trace(D_DLMTRACE, env, "release lock", lock); + cl_lock_cancel(env, lock); + cl_lock_fini(env, lock); + EXIT; +} +EXPORT_SYMBOL(cl_lock_release); + +const char *cl_lock_mode_name(const enum cl_lock_mode mode) +{ + static const char * const names[] = { + [CLM_READ] = "R", + [CLM_WRITE] = "W", + [CLM_GROUP] = "G" + }; + BUILD_BUG_ON(CLM_MAX != ARRAY_SIZE(names)); + return names[mode]; +} +EXPORT_SYMBOL(cl_lock_mode_name); + +/** + * Prints human readable representation of a lock description. + */ +void cl_lock_descr_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct cl_lock_descr *descr) +{ + const struct lu_fid *fid; + + fid = lu_object_fid(&descr->cld_obj->co_lu); + (*printer)(env, cookie, DDESCR"@"DFID, PDESCR(descr), PFID(fid)); +} +EXPORT_SYMBOL(cl_lock_descr_print); + +/** + * Prints human readable representation of \a lock to the \a f. + */ +void cl_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_lock *lock) +{ + const struct cl_lock_slice *slice; + + (*printer)(env, cookie, "lock@%p", lock); + cl_lock_descr_print(env, cookie, printer, &lock->cll_descr); + (*printer)(env, cookie, " {\n"); + + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + (*printer)(env, cookie, " %s@%p: ", + slice->cls_obj->co_lu.lo_dev->ld_type->ldt_name, + slice); + if (slice->cls_ops->clo_print != NULL) + slice->cls_ops->clo_print(env, cookie, printer, slice); + (*printer)(env, cookie, "\n"); + } + (*printer)(env, cookie, "} lock@%p\n", lock); +} +EXPORT_SYMBOL(cl_lock_print); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/cl_object.c b/drivers/staging/lustrefsx/lustre/obdclass/cl_object.c new file mode 100644 index 0000000000000..8c29b5a164950 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/cl_object.c @@ -0,0 +1,1118 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Client Lustre Object. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +/* + * Locking. + * + * i_mutex + * PG_locked + * ->coh_attr_guard + * ->ls_guard + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include /* for cfs_hash stuff */ +#include +#include +#include "cl_internal.h" + +static struct kmem_cache *cl_env_kmem; +struct kmem_cache *cl_dio_aio_kmem; +struct kmem_cache *cl_sub_dio_kmem; +struct kmem_cache *cl_page_kmem_array[16]; +unsigned short cl_page_kmem_size_array[16]; + +/** Lock class of cl_object_header::coh_attr_guard */ +static struct lock_class_key cl_attr_guard_class; + +/** + * Initialize cl_object_header. + */ +int cl_object_header_init(struct cl_object_header *h) +{ + int result; + + ENTRY; + result = lu_object_header_init(&h->coh_lu); + if (result == 0) { + spin_lock_init(&h->coh_attr_guard); + lockdep_set_class(&h->coh_attr_guard, &cl_attr_guard_class); + h->coh_page_bufsize = 0; + } + RETURN(result); +} +EXPORT_SYMBOL(cl_object_header_init); + +/** + * Finalize cl_object_header. + */ +void cl_object_header_fini(struct cl_object_header *h) +{ + lu_object_header_fini(&h->coh_lu); +} + +/** + * Returns a cl_object with a given \a fid. + * + * Returns either cached or newly created object. Additional reference on the + * returned object is acquired. + * + * \see lu_object_find(), cl_page_find(), cl_lock_find() + */ +struct cl_object *cl_object_find(const struct lu_env *env, + struct cl_device *cd, const struct lu_fid *fid, + const struct cl_object_conf *c) +{ + might_sleep(); + return lu2cl(lu_object_find_slice(env, cl2lu_dev(cd), fid, &c->coc_lu)); +} +EXPORT_SYMBOL(cl_object_find); + +/** + * Releases a reference on \a o. + * + * When last reference is released object is returned to the cache, unless + * lu_object_header_flags::LU_OBJECT_HEARD_BANSHEE bit is set in its header. + * + * \see cl_page_put(), cl_lock_put(). + */ +void cl_object_put(const struct lu_env *env, struct cl_object *o) +{ + lu_object_put(env, &o->co_lu); +} +EXPORT_SYMBOL(cl_object_put); + +/** + * Acquire an additional reference to the object \a o. + * + * This can only be used to acquire _additional_ reference, i.e., caller + * already has to possess at least one reference to \a o before calling this. + * + * \see cl_page_get(), cl_lock_get(). + */ +void cl_object_get(struct cl_object *o) +{ + lu_object_get(&o->co_lu); +} +EXPORT_SYMBOL(cl_object_get); + +/** + * Returns the top-object for a given \a o. + * + * \see cl_io_top() + */ +struct cl_object *cl_object_top(struct cl_object *o) +{ + struct cl_object_header *hdr = cl_object_header(o); + struct cl_object *top; + + while (hdr->coh_parent != NULL) + hdr = hdr->coh_parent; + + top = lu2cl(lu_object_top(&hdr->coh_lu)); + CDEBUG(D_TRACE, "%p -> %p\n", o, top); + return top; +} +EXPORT_SYMBOL(cl_object_top); + +/** + * Returns pointer to the lock protecting data-attributes for the given object + * \a o. + * + * Data-attributes are protected by the cl_object_header::coh_attr_guard + * spin-lock in the top-object. 
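+ *
+ * The usual access pattern under that lock is (a minimal sketch using
+ * only the helpers defined below):
+ *
+ *	cl_object_attr_lock(obj);
+ *	rc = cl_object_attr_get(env, obj, attr);
+ *	cl_object_attr_unlock(obj);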
+ * + * \see cl_attr, cl_object_attr_lock(), cl_object_operations::coo_attr_get(). + */ +static spinlock_t *cl_object_attr_guard(struct cl_object *o) +{ + return &cl_object_header(cl_object_top(o))->coh_attr_guard; +} + +/** + * Locks data-attributes. + * + * Prevents data-attributes from changing, until lock is released by + * cl_object_attr_unlock(). This has to be called before calls to + * cl_object_attr_get(), cl_object_attr_update(). + */ +void cl_object_attr_lock(struct cl_object *o) +__acquires(cl_object_attr_guard(o)) +{ + spin_lock(cl_object_attr_guard(o)); +} +EXPORT_SYMBOL(cl_object_attr_lock); + +/** + * Releases data-attributes lock, acquired by cl_object_attr_lock(). + */ +void cl_object_attr_unlock(struct cl_object *o) +__releases(cl_object_attr_guard(o)) +{ + spin_unlock(cl_object_attr_guard(o)); +} +EXPORT_SYMBOL(cl_object_attr_unlock); + +/** + * Returns data-attributes of an object \a obj. + * + * Every layer is asked (by calling cl_object_operations::coo_attr_get()) + * top-to-bottom to fill in parts of \a attr that this layer is responsible + * for. + */ +int cl_object_attr_get(const struct lu_env *env, struct cl_object *top, + struct cl_attr *attr) +{ + struct cl_object *obj; + int result = 0; + + assert_spin_locked(cl_object_attr_guard(top)); + ENTRY; + + cl_object_for_each(obj, top) { + if (obj->co_ops->coo_attr_get != NULL) { + result = obj->co_ops->coo_attr_get(env, obj, attr); + if (result != 0) { + if (result > 0) + result = 0; + break; + } + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_object_attr_get); + +/** + * Updates data-attributes of an object \a obj. + * + * Only attributes, mentioned in a validness bit-mask \a v are + * updated. Calls cl_object_operations::coo_upd_attr() on every layer, bottom + * to top. + */ +int cl_object_attr_update(const struct lu_env *env, struct cl_object *top, + const struct cl_attr *attr, unsigned v) +{ + struct cl_object *obj; + int result = 0; + + assert_spin_locked(cl_object_attr_guard(top)); + ENTRY; + + cl_object_for_each_reverse(obj, top) { + if (obj->co_ops->coo_attr_update != NULL) { + result = obj->co_ops->coo_attr_update(env, obj, attr, + v); + if (result != 0) { + if (result > 0) + result = 0; + break; + } + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_object_attr_update); + +/** + * Notifies layers (bottom-to-top) that glimpse AST was received. + * + * Layers have to fill \a lvb fields with information that will be shipped + * back to glimpse issuer. + * + * \see cl_lock_operations::clo_glimpse() + */ +int cl_object_glimpse(const struct lu_env *env, struct cl_object *top, + struct ost_lvb *lvb) +{ + struct cl_object *obj; + int result = 0; + + ENTRY; + cl_object_for_each_reverse(obj, top) { + if (obj->co_ops->coo_glimpse != NULL) { + result = obj->co_ops->coo_glimpse(env, obj, lvb); + if (result != 0) + break; + } + } + LU_OBJECT_HEADER(D_DLMTRACE, env, lu_object_top(top->co_lu.lo_header), + "size: %llu mtime: %llu atime: %llu " + "ctime: %llu blocks: %llu\n", + lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime, + lvb->lvb_ctime, lvb->lvb_blocks); + RETURN(result); +} +EXPORT_SYMBOL(cl_object_glimpse); + +/** + * Updates a configuration of an object \a obj. 
+ */
+int cl_conf_set(const struct lu_env *env, struct cl_object *top,
+                const struct cl_object_conf *conf)
+{
+        struct cl_object *obj;
+        int result = 0;
+
+        ENTRY;
+        cl_object_for_each(obj, top) {
+                if (obj->co_ops->coo_conf_set != NULL) {
+                        result = obj->co_ops->coo_conf_set(env, obj, conf);
+                        if (result)
+                                break;
+                }
+        }
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_conf_set);
+
+/**
+ * Prunes caches of pages and locks for this object.
+ */
+int cl_object_prune(const struct lu_env *env, struct cl_object *top)
+{
+        struct cl_object *obj;
+        int result = 0;
+        ENTRY;
+
+        cl_object_for_each(obj, top) {
+                if (obj->co_ops->coo_prune != NULL) {
+                        result = obj->co_ops->coo_prune(env, obj);
+                        if (result)
+                                break;
+                }
+        }
+
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_object_prune);
+
+/**
+ * Get stripe information of this object.
+ */
+int cl_object_getstripe(const struct lu_env *env, struct cl_object *top,
+                        struct lov_user_md __user *uarg, size_t size)
+{
+        struct cl_object *obj;
+        int result = 0;
+        ENTRY;
+
+        cl_object_for_each(obj, top) {
+                if (obj->co_ops->coo_getstripe) {
+                        result = obj->co_ops->coo_getstripe(env, obj, uarg,
+                                                            size);
+                        if (result)
+                                break;
+                }
+        }
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_object_getstripe);
+
+/**
+ * Get fiemap extents from file object.
+ *
+ * \param env    [in]  lustre environment
+ * \param top    [in]  file object
+ * \param key    [in]  fiemap request argument
+ * \param fiemap [out] fiemap extents mapping retrieved
+ * \param buflen [in]  max buffer length of @fiemap
+ *
+ * \retval 0    success
+ * \retval < 0  error
+ */
+int cl_object_fiemap(const struct lu_env *env, struct cl_object *top,
+                     struct ll_fiemap_info_key *key,
+                     struct fiemap *fiemap, size_t *buflen)
+{
+        struct cl_object *obj;
+        int result = 0;
+        ENTRY;
+
+        cl_object_for_each(obj, top) {
+                if (obj->co_ops->coo_fiemap) {
+                        result = obj->co_ops->coo_fiemap(env, obj, key, fiemap,
+                                                         buflen);
+                        if (result)
+                                break;
+                }
+        }
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_object_fiemap);
+
+int cl_object_layout_get(const struct lu_env *env, struct cl_object *top,
+                         struct cl_layout *cl)
+{
+        struct cl_object *obj;
+        ENTRY;
+
+        cl_object_for_each(obj, top) {
+                if (obj->co_ops->coo_layout_get)
+                        return obj->co_ops->coo_layout_get(env, obj, cl);
+        }
+
+        RETURN(-EOPNOTSUPP);
+}
+EXPORT_SYMBOL(cl_object_layout_get);
+
+loff_t cl_object_maxbytes(struct cl_object *top)
+{
+        struct cl_object *obj;
+        loff_t maxbytes = LLONG_MAX;
+        ENTRY;
+
+        cl_object_for_each(obj, top) {
+                if (obj->co_ops->coo_maxbytes)
+                        maxbytes = min_t(loff_t, obj->co_ops->coo_maxbytes(obj),
+                                         maxbytes);
+        }
+
+        RETURN(maxbytes);
+}
+EXPORT_SYMBOL(cl_object_maxbytes);
+
+int cl_object_flush(const struct lu_env *env, struct cl_object *top,
+                    struct ldlm_lock *lock)
+{
+        struct cl_object *obj;
+        int rc = 0;
+        ENTRY;
+
+        cl_object_for_each(obj, top) {
+                if (obj->co_ops->coo_object_flush) {
+                        rc = obj->co_ops->coo_object_flush(env, obj, lock);
+                        if (rc)
+                                break;
+                }
+        }
+        RETURN(rc);
+}
+EXPORT_SYMBOL(cl_object_flush);
+
+/**
+ * Helper function removing all object locks, and marking object for
+ * deletion. All object pages must have been deleted at this point.
+ *
+ * This is called by cl_inode_fini() and lov_object_delete() to destroy top-
+ * and sub- objects respectively.
+ */
+void cl_object_kill(const struct lu_env *env, struct cl_object *obj)
+{
+        struct cl_object_header *hdr = cl_object_header(obj);
+
+        set_bit(LU_OBJECT_HEARD_BANSHEE, &hdr->coh_lu.loh_flags);
+}
+EXPORT_SYMBOL(cl_object_kill);
+
+void cache_stats_init(struct cache_stats *cs, const char *name)
+{
+        int i;
+
+        cs->cs_name = name;
+        for (i = 0; i < CS_NR; i++)
+                atomic_set(&cs->cs_stats[i], 0);
+}
+
+static int cache_stats_print(const struct cache_stats *cs,
+                             struct seq_file *m, int h)
+{
+        int i;
+
+        /*
+         *   lookup    hit  total cached create
+         * env: ...... ...... ...... ...... ......
+         */
+        if (h) {
+                const char *names[CS_NR] = CS_NAMES;
+
+                seq_printf(m, "%6s", " ");
+                for (i = 0; i < CS_NR; i++)
+                        seq_printf(m, "%8s", names[i]);
+                seq_printf(m, "\n");
+        }
+
+        seq_printf(m, "%5.5s:", cs->cs_name);
+        for (i = 0; i < CS_NR; i++)
+                seq_printf(m, "%8u", atomic_read(&cs->cs_stats[i]));
+        return 0;
+}
+
+static void cl_env_percpu_refill(void);
+
+/**
+ * Initialize client site.
+ *
+ * Perform common initialization (lu_site_init()), and initialize statistical
+ * counters. Also perform global initializations on the first call.
+ */
+int cl_site_init(struct cl_site *s, struct cl_device *d)
+{
+        size_t i;
+        int result;
+
+        result = lu_site_init(&s->cs_lu, &d->cd_lu_dev);
+        if (result == 0) {
+                cache_stats_init(&s->cs_pages, "pages");
+                for (i = 0; i < ARRAY_SIZE(s->cs_pages_state); ++i)
+                        atomic_set(&s->cs_pages_state[i], 0);
+                cl_env_percpu_refill();
+        }
+        return result;
+}
+EXPORT_SYMBOL(cl_site_init);
+
+/**
+ * Finalize client site. Dual to cl_site_init().
+ */
+void cl_site_fini(struct cl_site *s)
+{
+        lu_site_fini(&s->cs_lu);
+}
+EXPORT_SYMBOL(cl_site_fini);
+
+static struct cache_stats cl_env_stats = {
+        .cs_name  = "envs",
+        .cs_stats = { ATOMIC_INIT(0), }
+};
+
+/**
+ * Outputs client site statistical counters into a buffer. Suitable for
+ * ll_rd_*()-style functions.
+ */
+int cl_site_stats_print(const struct cl_site *site, struct seq_file *m)
+{
+        static const char *const pstate[] = {
+                [CPS_CACHED]  = "c",
+                [CPS_OWNED]   = "o",
+                [CPS_PAGEOUT] = "w",
+                [CPS_PAGEIN]  = "r",
+                [CPS_FREEING] = "f"
+        };
+        size_t i;
+
+/*
+       lookup    hit  total   busy create
+pages: ...... ...... ...... ...... ...... [...... ...... ...... ......]
+locks: ...... ...... ...... ...... ...... [...... ...... ...... ...... ......]
+  env: ...... ...... ...... ...... ......
+ */
+        lu_site_stats_seq_print(&site->cs_lu, m);
+        cache_stats_print(&site->cs_pages, m, 1);
+        seq_printf(m, " [");
+        for (i = 0; i < ARRAY_SIZE(site->cs_pages_state); ++i)
+                seq_printf(m, "%s: %u ", pstate[i],
+                           atomic_read(&site->cs_pages_state[i]));
+        seq_printf(m, "]\n");
+        cache_stats_print(&cl_env_stats, m, 0);
+        seq_printf(m, "\n");
+        return 0;
+}
+EXPORT_SYMBOL(cl_site_stats_print);
+
+/*****************************************************************************
+ *
+ * lu_env handling on client.
+ *
+ */
+
+/**
+ * The most efficient way is to store cl_env pointer in task specific
+ * structures. On Linux, it isn't easy to use task_struct->journal_info
+ * because Lustre code may call into other fs during memory reclaim, which
+ * has certain assumptions about journal_info. There are not currently any
+ * fields in task_struct that can be used for this purpose.
+ * \note As long as we use task_struct to store cl_env, we assume that once
+ * called into Lustre, we'll never call into the other part of the kernel
+ * which will use those fields in task_struct without explicitly exiting
+ * Lustre.
+ * + * Since there's no space in task_struct is available, hash will be used. + * bz20044, bz22683. + */ + +static unsigned cl_envs_cached_max = 32; /* XXX: prototype: arbitrary limit + * for now. */ +static struct cl_env_cache { + rwlock_t cec_guard; + unsigned cec_count; + struct list_head cec_envs; +} *cl_envs = NULL; + +struct cl_env { + void *ce_magic; + struct lu_env ce_lu; + struct lu_context ce_ses; + + /* + * Linkage into global list of all client environments. Used for + * garbage collection. + */ + struct list_head ce_linkage; + /* + * + */ + int ce_ref; + /* + * Debugging field: address of the caller who made original + * allocation. + */ + void *ce_debug; +}; + +static void cl_env_inc(enum cache_stats_item item) +{ +#ifdef CONFIG_DEBUG_PAGESTATE_TRACKING + atomic_inc(&cl_env_stats.cs_stats[item]); +#endif +} + +static void cl_env_dec(enum cache_stats_item item) +{ +#ifdef CONFIG_DEBUG_PAGESTATE_TRACKING + LASSERT(atomic_read(&cl_env_stats.cs_stats[item]) > 0); + atomic_dec(&cl_env_stats.cs_stats[item]); +#endif +} + +static void cl_env_init0(struct cl_env *cle, void *debug) +{ + LASSERT(cle->ce_ref == 0); + LASSERT(cle->ce_magic == &cl_env_init0); + LASSERT(cle->ce_debug == NULL); + + cle->ce_ref = 1; + cle->ce_debug = debug; + cl_env_inc(CS_busy); +} + +static struct lu_env *cl_env_new(__u32 ctx_tags, __u32 ses_tags, void *debug) +{ + struct lu_env *env; + struct cl_env *cle; + + OBD_SLAB_ALLOC_PTR_GFP(cle, cl_env_kmem, GFP_NOFS); + if (cle != NULL) { + int rc; + + INIT_LIST_HEAD(&cle->ce_linkage); + cle->ce_magic = &cl_env_init0; + env = &cle->ce_lu; + rc = lu_env_init(env, LCT_CL_THREAD|ctx_tags); + if (rc == 0) { + rc = lu_context_init(&cle->ce_ses, + LCT_SESSION | ses_tags); + if (rc == 0) { + lu_context_enter(&cle->ce_ses); + env->le_ses = &cle->ce_ses; + cl_env_init0(cle, debug); + } else + lu_env_fini(env); + } + if (rc != 0) { + OBD_SLAB_FREE_PTR(cle, cl_env_kmem); + env = ERR_PTR(rc); + } else { + cl_env_inc(CS_create); + cl_env_inc(CS_total); + } + } else + env = ERR_PTR(-ENOMEM); + return env; +} + +static void cl_env_fini(struct cl_env *cle) +{ + cl_env_dec(CS_total); + lu_context_fini(&cle->ce_lu.le_ctx); + lu_context_fini(&cle->ce_ses); + OBD_SLAB_FREE_PTR(cle, cl_env_kmem); +} + +static struct lu_env *cl_env_obtain(void *debug) +{ + struct cl_env *cle; + struct lu_env *env; + int cpu = get_cpu(); + + ENTRY; + + read_lock(&cl_envs[cpu].cec_guard); + LASSERT(equi(cl_envs[cpu].cec_count == 0, + list_empty(&cl_envs[cpu].cec_envs))); + if (cl_envs[cpu].cec_count > 0) { + int rc; + + cle = container_of(cl_envs[cpu].cec_envs.next, struct cl_env, + ce_linkage); + list_del_init(&cle->ce_linkage); + cl_envs[cpu].cec_count--; + read_unlock(&cl_envs[cpu].cec_guard); + put_cpu(); + + env = &cle->ce_lu; + rc = lu_env_refill(env); + if (rc == 0) { + cl_env_init0(cle, debug); + lu_context_enter(&env->le_ctx); + lu_context_enter(&cle->ce_ses); + } else { + cl_env_fini(cle); + env = ERR_PTR(rc); + } + } else { + read_unlock(&cl_envs[cpu].cec_guard); + put_cpu(); + env = cl_env_new(lu_context_tags_default, + lu_session_tags_default, debug); + } + RETURN(env); +} + +static inline struct cl_env *cl_env_container(struct lu_env *env) +{ + return container_of(env, struct cl_env, ce_lu); +} + +/** + * Returns lu_env: if there already is an environment associated with the + * current thread, it is returned, otherwise, new environment is allocated. + * + * Allocations are amortized through the global cache of environments. 
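+ *
+ * A minimal usage sketch (local names are illustrative, error handling
+ * shortened):
+ * \code
+ *      __u16 refcheck;
+ *      struct lu_env *env;
+ *
+ *      env = cl_env_get(&refcheck);
+ *      if (IS_ERR(env))
+ *              return PTR_ERR(env);
+ *      ... use env ...
+ *      cl_env_put(env, &refcheck);
+ * \endcode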
+ * + * \param refcheck pointer to a counter used to detect environment leaks. In + * the usual case cl_env_get() and cl_env_put() are called in the same lexical + * scope and pointer to the same integer is passed as \a refcheck. This is + * used to detect missed cl_env_put(). + * + * \see cl_env_put() + */ +struct lu_env *cl_env_get(__u16 *refcheck) +{ + struct lu_env *env; + + env = cl_env_obtain(__builtin_return_address(0)); + if (!IS_ERR(env)) { + struct cl_env *cle; + + cle = cl_env_container(env); + *refcheck = cle->ce_ref; + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); + } + return env; +} +EXPORT_SYMBOL(cl_env_get); + +/** + * Forces an allocation of a fresh environment with given tags. + * + * \see cl_env_get() + */ +struct lu_env *cl_env_alloc(__u16 *refcheck, __u32 tags) +{ + struct lu_env *env; + + env = cl_env_new(tags, tags, __builtin_return_address(0)); + if (!IS_ERR(env)) { + struct cl_env *cle; + + cle = cl_env_container(env); + *refcheck = cle->ce_ref; + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); + } + return env; +} +EXPORT_SYMBOL(cl_env_alloc); + +static void cl_env_exit(struct cl_env *cle) +{ + lu_context_exit(&cle->ce_lu.le_ctx); + lu_context_exit(&cle->ce_ses); +} + +/** + * Finalizes and frees a given number of cached environments. This is done to + * (1) free some memory (not currently hooked into VM), or (2) release + * references to modules. + */ +unsigned cl_env_cache_purge(unsigned nr) +{ + struct cl_env *cle; + unsigned i; + + ENTRY; + for_each_possible_cpu(i) { + write_lock(&cl_envs[i].cec_guard); + for (; !list_empty(&cl_envs[i].cec_envs) && nr > 0; --nr) { + cle = container_of(cl_envs[i].cec_envs.next, + struct cl_env, ce_linkage); + list_del_init(&cle->ce_linkage); + LASSERT(cl_envs[i].cec_count > 0); + cl_envs[i].cec_count--; + write_unlock(&cl_envs[i].cec_guard); + + cl_env_fini(cle); + write_lock(&cl_envs[i].cec_guard); + } + LASSERT(equi(cl_envs[i].cec_count == 0, + list_empty(&cl_envs[i].cec_envs))); + write_unlock(&cl_envs[i].cec_guard); + } + RETURN(nr); +} +EXPORT_SYMBOL(cl_env_cache_purge); + +/** + * Release an environment. + * + * Decrement \a env reference counter. When counter drops to 0, nothing in + * this thread is using environment and it is returned to the allocation + * cache, or freed straight away, if cache is large enough. + */ +void cl_env_put(struct lu_env *env, __u16 *refcheck) +{ + struct cl_env *cle; + + cle = cl_env_container(env); + + LASSERT(cle->ce_ref > 0); + LASSERT(ergo(refcheck != NULL, cle->ce_ref == *refcheck)); + + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); + if (--cle->ce_ref == 0) { + int cpu = get_cpu(); + + cl_env_dec(CS_busy); + cle->ce_debug = NULL; + cl_env_exit(cle); + /* + * Don't bother to take a lock here. + * + * Return environment to the cache only when it was allocated + * with the standard tags. + */ + if (cl_envs[cpu].cec_count < cl_envs_cached_max && + (env->le_ctx.lc_tags & ~LCT_HAS_EXIT) == lu_context_tags_default && + (env->le_ses->lc_tags & ~LCT_HAS_EXIT) == lu_session_tags_default) { + read_lock(&cl_envs[cpu].cec_guard); + list_add(&cle->ce_linkage, &cl_envs[cpu].cec_envs); + cl_envs[cpu].cec_count++; + read_unlock(&cl_envs[cpu].cec_guard); + } else + cl_env_fini(cle); + put_cpu(); + } +} +EXPORT_SYMBOL(cl_env_put); + +/** + * Converts struct cl_attr to struct ost_lvb. 
+ * + * \see cl_lvb2attr + */ +void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr) +{ + lvb->lvb_size = attr->cat_size; + lvb->lvb_mtime = attr->cat_mtime; + lvb->lvb_atime = attr->cat_atime; + lvb->lvb_ctime = attr->cat_ctime; + lvb->lvb_blocks = attr->cat_blocks; +} + +/** + * Converts struct ost_lvb to struct cl_attr. + * + * \see cl_attr2lvb + */ +void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb) +{ + attr->cat_size = lvb->lvb_size; + attr->cat_mtime = lvb->lvb_mtime; + attr->cat_atime = lvb->lvb_atime; + attr->cat_ctime = lvb->lvb_ctime; + attr->cat_blocks = lvb->lvb_blocks; +} +EXPORT_SYMBOL(cl_lvb2attr); + +static struct cl_env cl_env_percpu[NR_CPUS]; +static DEFINE_MUTEX(cl_env_percpu_mutex); + +static int cl_env_percpu_init(void) +{ + struct cl_env *cle; + int tags = LCT_REMEMBER | LCT_NOREF; + int i, j; + int rc = 0; + + for_each_possible_cpu(i) { + struct lu_env *env; + + rwlock_init(&cl_envs[i].cec_guard); + INIT_LIST_HEAD(&cl_envs[i].cec_envs); + cl_envs[i].cec_count = 0; + + cle = &cl_env_percpu[i]; + env = &cle->ce_lu; + + INIT_LIST_HEAD(&cle->ce_linkage); + cle->ce_magic = &cl_env_init0; + rc = lu_env_init(env, LCT_CL_THREAD | tags); + if (rc == 0) { + rc = lu_context_init(&cle->ce_ses, LCT_SESSION | tags); + if (rc == 0) { + lu_context_enter(&cle->ce_ses); + env->le_ses = &cle->ce_ses; + } else { + lu_env_fini(env); + } + } + if (rc != 0) + break; + } + if (rc != 0) { + /* Indices 0 to i (excluding i) were correctly initialized, + * thus we must uninitialize up to i, the rest are undefined. */ + for (j = 0; j < i; j++) { + cle = &cl_env_percpu[j]; + lu_context_exit(&cle->ce_ses); + lu_context_fini(&cle->ce_ses); + lu_env_fini(&cle->ce_lu); + } + } + + return rc; +} + +static void cl_env_percpu_fini(void) +{ + int i; + + for_each_possible_cpu(i) { + struct cl_env *cle = &cl_env_percpu[i]; + + lu_context_exit(&cle->ce_ses); + lu_context_fini(&cle->ce_ses); + lu_env_fini(&cle->ce_lu); + } +} + +static void cl_env_percpu_refill(void) +{ + int i; + + mutex_lock(&cl_env_percpu_mutex); + for_each_possible_cpu(i) + lu_env_refill(&cl_env_percpu[i].ce_lu); + mutex_unlock(&cl_env_percpu_mutex); +} + +void cl_env_percpu_put(struct lu_env *env) +{ + struct cl_env *cle; + int cpu; + + cpu = smp_processor_id(); + cle = cl_env_container(env); + LASSERT(cle == &cl_env_percpu[cpu]); + + cle->ce_ref--; + LASSERT(cle->ce_ref == 0); + + cl_env_dec(CS_busy); + cle->ce_debug = NULL; + + put_cpu(); +} +EXPORT_SYMBOL(cl_env_percpu_put); + +struct lu_env *cl_env_percpu_get(void) +{ + struct cl_env *cle; + + cle = &cl_env_percpu[get_cpu()]; + cl_env_init0(cle, __builtin_return_address(0)); + + return &cle->ce_lu; +} +EXPORT_SYMBOL(cl_env_percpu_get); + +/***************************************************************************** + * + * Temporary prototype thing: mirror obd-devices into cl devices. 
+ * + */ + +struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site, + struct lu_device_type *ldt, + struct lu_device *next) +{ + const char *typename; + struct lu_device *d; + + LASSERT(ldt != NULL); + + typename = ldt->ldt_name; + d = ldt->ldt_ops->ldto_device_alloc(env, ldt, NULL); + if (!IS_ERR(d)) { + int rc; + + if (site != NULL) + d->ld_site = site; + rc = ldt->ldt_ops->ldto_device_init(env, d, typename, next); + if (rc == 0) { + lu_device_get(d); + lu_ref_add(&d->ld_reference, + "lu-stack", &lu_site_init); + } else { + ldt->ldt_ops->ldto_device_free(env, d); + CERROR("can't init device '%s', %d\n", typename, rc); + d = ERR_PTR(rc); + } + } else + CERROR("Cannot allocate device: '%s'\n", typename); + return lu2cl_dev(d); +} +EXPORT_SYMBOL(cl_type_setup); + +/** + * Finalize device stack by calling lu_stack_fini(). + */ +void cl_stack_fini(const struct lu_env *env, struct cl_device *cl) +{ + lu_stack_fini(env, cl2lu_dev(cl)); +} +EXPORT_SYMBOL(cl_stack_fini); + +static struct lu_context_key cl_key; + +struct cl_thread_info *cl_env_info(const struct lu_env *env) +{ + return lu_context_key_get(&env->le_ctx, &cl_key); +} + +/* defines cl_key_{init,fini}() */ +LU_KEY_INIT_FINI(cl, struct cl_thread_info); + +static struct lu_context_key cl_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = cl_key_init, + .lct_fini = cl_key_fini, +}; + +static struct lu_kmem_descr cl_object_caches[] = { + { + .ckd_cache = &cl_env_kmem, + .ckd_name = "cl_env_kmem", + .ckd_size = sizeof(struct cl_env) + }, + { + .ckd_cache = &cl_dio_aio_kmem, + .ckd_name = "cl_dio_aio_kmem", + .ckd_size = sizeof(struct cl_dio_aio) + }, + { + .ckd_cache = &cl_sub_dio_kmem, + .ckd_name = "cl_sub_dio_kmem", + .ckd_size = sizeof(struct cl_sub_dio) + }, + { + .ckd_cache = NULL + } +}; + +/** + * Global initialization of cl-data. Create kmem caches, register + * lu_context_key's, etc. + * + * \see cl_global_fini() + */ +int cl_global_init(void) +{ + int result; + + OBD_ALLOC_PTR_ARRAY(cl_envs, num_possible_cpus()); + if (cl_envs == NULL) + GOTO(out, result = -ENOMEM); + + result = lu_kmem_init(cl_object_caches); + if (result) + GOTO(out_envs, result); + + LU_CONTEXT_KEY_INIT(&cl_key); + result = lu_context_key_register(&cl_key); + if (result) + GOTO(out_kmem, result); + + result = cl_env_percpu_init(); + if (result) /* no cl_env_percpu_fini on error */ + GOTO(out_keys, result); + + return 0; + +out_keys: + lu_context_key_degister(&cl_key); +out_kmem: + lu_kmem_fini(cl_object_caches); +out_envs: + OBD_FREE_PTR_ARRAY(cl_envs, num_possible_cpus()); +out: + return result; +} + +/** + * Finalization of global cl-data. Dual to cl_global_init(). + */ +void cl_global_fini(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(cl_page_kmem_array); i++) { + if (cl_page_kmem_array[i]) { + kmem_cache_destroy(cl_page_kmem_array[i]); + cl_page_kmem_array[i] = NULL; + } + } + cl_env_percpu_fini(); + lu_context_key_degister(&cl_key); + lu_kmem_fini(cl_object_caches); + OBD_FREE_PTR_ARRAY(cl_envs, num_possible_cpus()); +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/cl_page.c b/drivers/staging/lustrefsx/lustre/obdclass/cl_page.c new file mode 100644 index 0000000000000..b573e8da3a1a3 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/cl_page.c @@ -0,0 +1,1291 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * Client Lustre Page.
+ *
+ * Author: Nikita Danilov
+ * Author: Jinshan Xiong
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/list.h>
+#include <libcfs/libcfs.h>
+#include <obd_class.h>
+#include <obd_support.h>
+
+#include <cl_object.h>
+#include "cl_internal.h"
+
+static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg);
+static DEFINE_MUTEX(cl_page_kmem_mutex);
+
+#ifdef LIBCFS_DEBUG
+# define PASSERT(env, page, expr)                                         \
+        do {                                                              \
+                if (unlikely(!(expr))) {                                  \
+                        CL_PAGE_DEBUG(D_ERROR, (env), (page), #expr "\n");\
+                        LASSERT(0);                                       \
+                }                                                         \
+        } while (0)
+#else /* !LIBCFS_DEBUG */
+# define PASSERT(env, page, exp) \
+        ((void)sizeof(env), (void)sizeof(page), (void)sizeof !!(exp))
+#endif /* !LIBCFS_DEBUG */
+
+#ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK
+# define PINVRNT(env, page, expr)                                         \
+        do {                                                              \
+                if (unlikely(!(expr))) {                                  \
+                        CL_PAGE_DEBUG(D_ERROR, (env), (page), #expr "\n");\
+                        LINVRNT(0);                                       \
+                }                                                         \
+        } while (0)
+#else /* !CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK */
+# define PINVRNT(env, page, exp) \
+        ((void)sizeof(env), (void)sizeof(page), (void)sizeof !!(exp))
+#endif /* !CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK */
+
+/* Disable page statistic by default due to huge performance penalty. */
+static void cs_page_inc(const struct cl_object *obj,
+                        enum cache_stats_item item)
+{
+#ifdef CONFIG_DEBUG_PAGESTATE_TRACKING
+        atomic_inc(&cl_object_site(obj)->cs_pages.cs_stats[item]);
+#endif
+}
+
+static void cs_page_dec(const struct cl_object *obj,
+                        enum cache_stats_item item)
+{
+#ifdef CONFIG_DEBUG_PAGESTATE_TRACKING
+        atomic_dec(&cl_object_site(obj)->cs_pages.cs_stats[item]);
+#endif
+}
+
+static void cs_pagestate_inc(const struct cl_object *obj,
+                             enum cl_page_state state)
+{
+#ifdef CONFIG_DEBUG_PAGESTATE_TRACKING
+        atomic_inc(&cl_object_site(obj)->cs_pages_state[state]);
+#endif
+}
+
+static void cs_pagestate_dec(const struct cl_object *obj,
+                             enum cl_page_state state)
+{
+#ifdef CONFIG_DEBUG_PAGESTATE_TRACKING
+        atomic_dec(&cl_object_site(obj)->cs_pages_state[state]);
+#endif
+}
+
+/**
+ * Internal version of cl_page_get().
+ *
+ * This function can be used to obtain an initial reference to a previously
+ * unreferenced cached object. It can be called only if concurrent page
+ * reclamation is somehow prevented, e.g., by keeping a lock on a VM page,
+ * associated with \a page.
+ *
+ * Use with care! Not exported.
+ */ +static void cl_page_get_trust(struct cl_page *page) +{ + LASSERT(atomic_read(&page->cp_ref) > 0); + atomic_inc(&page->cp_ref); +} + +static struct cl_page_slice * +cl_page_slice_get(const struct cl_page *cl_page, int index) +{ + if (index < 0 || index >= cl_page->cp_layer_count) + return NULL; + + /* To get the cp_layer_offset values fit under 256 bytes, we + * use the offset beyond the end of struct cl_page. + */ + return (struct cl_page_slice *)((char *)cl_page + sizeof(*cl_page) + + cl_page->cp_layer_offset[index]); +} + +#define cl_page_slice_for_each(cl_page, slice, i) \ + for (i = 0, slice = cl_page_slice_get(cl_page, 0); \ + i < (cl_page)->cp_layer_count; \ + slice = cl_page_slice_get(cl_page, ++i)) + +#define cl_page_slice_for_each_reverse(cl_page, slice, i) \ + for (i = (cl_page)->cp_layer_count - 1, \ + slice = cl_page_slice_get(cl_page, i); i >= 0; \ + slice = cl_page_slice_get(cl_page, --i)) + +/** + * Returns a slice within a cl_page, corresponding to the given layer in the + * device stack. + * + * \see cl_lock_at() + */ +static const struct cl_page_slice * +cl_page_at_trusted(const struct cl_page *cl_page, + const struct lu_device_type *dtype) +{ + const struct cl_page_slice *slice; + int i; + + ENTRY; + + cl_page_slice_for_each(cl_page, slice, i) { + if (slice->cpl_obj->co_lu.lo_dev->ld_type == dtype) + RETURN(slice); + } + + RETURN(NULL); +} + +static void __cl_page_free(struct cl_page *cl_page, unsigned short bufsize) +{ + int index = cl_page->cp_kmem_index; + + if (index >= 0) { + LASSERT(index < ARRAY_SIZE(cl_page_kmem_array)); + LASSERT(cl_page_kmem_size_array[index] == bufsize); + OBD_SLAB_FREE(cl_page, cl_page_kmem_array[index], bufsize); + } else { + OBD_FREE(cl_page, bufsize); + } +} + +static void cl_page_free(const struct lu_env *env, struct cl_page *cl_page, + struct pagevec *pvec) +{ + struct cl_object *obj = cl_page->cp_obj; + unsigned short bufsize = cl_object_header(obj)->coh_page_bufsize; + struct cl_page_slice *slice; + int i; + + ENTRY; + PASSERT(env, cl_page, list_empty(&cl_page->cp_batch)); + PASSERT(env, cl_page, cl_page->cp_owner == NULL); + PASSERT(env, cl_page, cl_page->cp_state == CPS_FREEING); + + cl_page_slice_for_each(cl_page, slice, i) { + if (unlikely(slice->cpl_ops->cpo_fini != NULL)) + slice->cpl_ops->cpo_fini(env, slice, pvec); + } + cl_page->cp_layer_count = 0; + cs_page_dec(obj, CS_total); + cs_pagestate_dec(obj, cl_page->cp_state); + lu_object_ref_del_at(&obj->co_lu, &cl_page->cp_obj_ref, + "cl_page", cl_page); + if (cl_page->cp_type != CPT_TRANSIENT) + cl_object_put(env, obj); + lu_ref_fini(&cl_page->cp_reference); + __cl_page_free(cl_page, bufsize); + EXIT; +} + +static struct cl_page *__cl_page_alloc(struct cl_object *o) +{ + int i = 0; + struct cl_page *cl_page = NULL; + unsigned short bufsize = cl_object_header(o)->coh_page_bufsize; + + if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_PAGE_ALLOC)) + return NULL; + +check: + /* the number of entries in cl_page_kmem_array is expected to + * only be 2-3 entries, so the lookup overhead should be low. 
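+         *
+         * The lockless lookup pairs the smp_load_acquire() on
+         * cl_page_kmem_size_array[] below with the smp_store_release()
+         * that publishes a new size only after the matching kmem cache
+         * in cl_page_kmem_array[] has been created, so a reader that
+         * observes a non-zero size is guaranteed to also observe the
+         * corresponding cache pointer.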
+ */ + for ( ; i < ARRAY_SIZE(cl_page_kmem_array); i++) { + if (smp_load_acquire(&cl_page_kmem_size_array[i]) + == bufsize) { + OBD_SLAB_ALLOC_GFP(cl_page, cl_page_kmem_array[i], + bufsize, GFP_NOFS); + if (cl_page) + cl_page->cp_kmem_index = i; + return cl_page; + } + if (cl_page_kmem_size_array[i] == 0) + break; + } + + if (i < ARRAY_SIZE(cl_page_kmem_array)) { + char cache_name[32]; + + mutex_lock(&cl_page_kmem_mutex); + if (cl_page_kmem_size_array[i]) { + mutex_unlock(&cl_page_kmem_mutex); + goto check; + } + snprintf(cache_name, sizeof(cache_name), + "cl_page_kmem-%u", bufsize); + cl_page_kmem_array[i] = + kmem_cache_create(cache_name, bufsize, + 0, 0, NULL); + if (cl_page_kmem_array[i] == NULL) { + mutex_unlock(&cl_page_kmem_mutex); + return NULL; + } + smp_store_release(&cl_page_kmem_size_array[i], + bufsize); + mutex_unlock(&cl_page_kmem_mutex); + goto check; + } else { + OBD_ALLOC_GFP(cl_page, bufsize, GFP_NOFS); + if (cl_page) + cl_page->cp_kmem_index = -1; + } + + return cl_page; +} + +struct cl_page *cl_page_alloc(const struct lu_env *env, struct cl_object *o, + pgoff_t ind, struct page *vmpage, + enum cl_page_type type) +{ + struct cl_page *cl_page; + struct cl_object *head; + + ENTRY; + + cl_page = __cl_page_alloc(o); + if (cl_page != NULL) { + int result = 0; + + /* + * Please fix cl_page:cp_state/type declaration if + * these assertions fail in the future. + */ + BUILD_BUG_ON((1 << CP_STATE_BITS) < CPS_NR); /* cp_state */ + BUILD_BUG_ON((1 << CP_TYPE_BITS) < CPT_NR); /* cp_type */ + atomic_set(&cl_page->cp_ref, 1); + cl_page->cp_obj = o; + if (type != CPT_TRANSIENT) + cl_object_get(o); + lu_object_ref_add_at(&o->co_lu, &cl_page->cp_obj_ref, + "cl_page", cl_page); + cl_page->cp_vmpage = vmpage; + cl_page->cp_state = CPS_CACHED; + cl_page->cp_type = type; + if (type == CPT_TRANSIENT) + /* ref to correct inode will be added + * in ll_direct_rw_pages + */ + cl_page->cp_inode = NULL; + else + cl_page->cp_inode = page2inode(vmpage); + INIT_LIST_HEAD(&cl_page->cp_batch); + lu_ref_init(&cl_page->cp_reference); + head = o; + cl_page->cp_page_index = ind; + cl_object_for_each(o, head) { + if (o->co_ops->coo_page_init != NULL) { + result = o->co_ops->coo_page_init(env, o, + cl_page, ind); + if (result != 0) { + cl_page_delete0(env, cl_page); + cl_page_free(env, cl_page, NULL); + cl_page = ERR_PTR(result); + break; + } + } + } + if (result == 0) { + cs_page_inc(o, CS_total); + cs_page_inc(o, CS_create); + cs_pagestate_dec(o, CPS_CACHED); + } + } else { + cl_page = ERR_PTR(-ENOMEM); + } + RETURN(cl_page); +} + +/** + * Returns a cl_page with index \a idx at the object \a o, and associated with + * the VM page \a vmpage. + * + * This is the main entry point into the cl_page caching interface. First, a + * cache (implemented as a per-object radix tree) is consulted. If page is + * found there, it is returned immediately. Otherwise new page is allocated + * and returned. In any case, additional reference to page is acquired. + * + * \see cl_object_find(), cl_lock_find() + */ +struct cl_page *cl_page_find(const struct lu_env *env, + struct cl_object *o, + pgoff_t idx, struct page *vmpage, + enum cl_page_type type) +{ + struct cl_page *page = NULL; + struct cl_object_header *hdr; + + LASSERT(type == CPT_CACHEABLE || type == CPT_TRANSIENT); + might_sleep(); + + ENTRY; + + hdr = cl_object_header(o); + cs_page_inc(o, CS_lookup); + + CDEBUG(D_PAGE, "%lu@"DFID" %p %lx %d\n", + idx, PFID(&hdr->coh_lu.loh_fid), vmpage, vmpage->private, type); + /* fast path. 
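+         * Look the cl_page up through the VM page's ->private pointer
+         * before falling back to allocation; see cl_vmpage_page() below
+         * for why the vmpage lock alone makes this safe.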
*/ + if (type == CPT_CACHEABLE) { + /* vmpage lock is used to protect the child/parent + * relationship */ + LASSERT(PageLocked(vmpage)); + /* + * cl_vmpage_page() can be called here without any locks as + * + * - "vmpage" is locked (which prevents ->private from + * concurrent updates), and + * + * - "o" cannot be destroyed while current thread holds a + * reference on it. + */ + page = cl_vmpage_page(vmpage, o); + if (page != NULL) { + cs_page_inc(o, CS_hit); + RETURN(page); + } + } + + /* allocate and initialize cl_page */ + page = cl_page_alloc(env, o, idx, vmpage, type); + RETURN(page); +} +EXPORT_SYMBOL(cl_page_find); + +static inline int cl_page_invariant(const struct cl_page *pg) +{ + return cl_page_in_use_noref(pg); +} + +static void cl_page_state_set0(const struct lu_env *env, + struct cl_page *cl_page, + enum cl_page_state state) +{ + enum cl_page_state old; + + /* + * Matrix of allowed state transitions [old][new], for sanity + * checking. + */ + static const int allowed_transitions[CPS_NR][CPS_NR] = { + [CPS_CACHED] = { + [CPS_CACHED] = 0, + [CPS_OWNED] = 1, /* io finds existing cached page */ + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 1, /* write-out from the cache */ + [CPS_FREEING] = 1, /* eviction on the memory pressure */ + }, + [CPS_OWNED] = { + [CPS_CACHED] = 1, /* release to the cache */ + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 1, /* start read immediately */ + [CPS_PAGEOUT] = 1, /* start write immediately */ + [CPS_FREEING] = 1, /* lock invalidation or truncate */ + }, + [CPS_PAGEIN] = { + [CPS_CACHED] = 1, /* io completion */ + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 0, + [CPS_FREEING] = 0, + }, + [CPS_PAGEOUT] = { + [CPS_CACHED] = 1, /* io completion */ + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 0, + [CPS_FREEING] = 0, + }, + [CPS_FREEING] = { + [CPS_CACHED] = 0, + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 0, + [CPS_FREEING] = 0, + } + }; + + ENTRY; + old = cl_page->cp_state; + PASSERT(env, cl_page, allowed_transitions[old][state]); + CL_PAGE_HEADER(D_TRACE, env, cl_page, "%d -> %d\n", old, state); + PASSERT(env, cl_page, cl_page->cp_state == old); + PASSERT(env, cl_page, equi(state == CPS_OWNED, + cl_page->cp_owner != NULL)); + + cs_pagestate_dec(cl_page->cp_obj, cl_page->cp_state); + cs_pagestate_inc(cl_page->cp_obj, state); + cl_page->cp_state = state; + EXIT; +} + +static void cl_page_state_set(const struct lu_env *env, + struct cl_page *page, enum cl_page_state state) +{ + cl_page_state_set0(env, page, state); +} + +/** + * Acquires an additional reference to a page. + * + * This can be called only by caller already possessing a reference to \a + * page. + * + * \see cl_object_get(), cl_lock_get(). + */ +void cl_page_get(struct cl_page *page) +{ + ENTRY; + cl_page_get_trust(page); + EXIT; +} +EXPORT_SYMBOL(cl_page_get); + +/** + * Releases a reference to a page, use the pagevec to release the pages + * in batch if provided. + * + * Users need to do a final pagevec_release() to release any trailing pages. + */ +void cl_pagevec_put(const struct lu_env *env, struct cl_page *page, + struct pagevec *pvec) +{ + ENTRY; + CL_PAGE_HEADER(D_TRACE, env, page, "%d\n", + atomic_read(&page->cp_ref)); + + if (atomic_dec_and_test(&page->cp_ref)) { + LASSERT(page->cp_state == CPS_FREEING); + + LASSERT(atomic_read(&page->cp_ref) == 0); + PASSERT(env, page, page->cp_owner == NULL); + PASSERT(env, page, list_empty(&page->cp_batch)); + /* + * Page is no longer reachable by other threads. Tear + * it down. 
+ */ + cl_page_free(env, page, pvec); + } + + EXIT; +} +EXPORT_SYMBOL(cl_pagevec_put); + +/** + * Releases a reference to a page, wrapper to cl_pagevec_put + * + * When last reference is released, page is returned to the cache, unless it + * is in cl_page_state::CPS_FREEING state, in which case it is immediately + * destroyed. + * + * \see cl_object_put(), cl_lock_put(). + */ +void cl_page_put(const struct lu_env *env, struct cl_page *page) +{ + cl_pagevec_put(env, page, NULL); +} +EXPORT_SYMBOL(cl_page_put); + +/** + * Returns a cl_page associated with a VM page, and given cl_object. + */ +struct cl_page *cl_vmpage_page(struct page *vmpage, struct cl_object *obj) +{ + struct cl_page *page; + + ENTRY; + LASSERT(PageLocked(vmpage)); + + /* + * NOTE: absence of races and liveness of data are guaranteed by page + * lock on a "vmpage". That works because object destruction has + * bottom-to-top pass. + */ + + page = (struct cl_page *)vmpage->private; + if (page != NULL) { + cl_page_get_trust(page); + LASSERT(page->cp_type == CPT_CACHEABLE); + } + RETURN(page); +} +EXPORT_SYMBOL(cl_vmpage_page); + +const struct cl_page_slice *cl_page_at(const struct cl_page *page, + const struct lu_device_type *dtype) +{ + return cl_page_at_trusted(page, dtype); +} +EXPORT_SYMBOL(cl_page_at); + +static void cl_page_owner_clear(struct cl_page *page) +{ + ENTRY; + if (page->cp_owner != NULL) { + LASSERT(page->cp_owner->ci_owned_nr > 0); + page->cp_owner->ci_owned_nr--; + page->cp_owner = NULL; + } + EXIT; +} + +static void cl_page_owner_set(struct cl_page *page) +{ + ENTRY; + LASSERT(page->cp_owner != NULL); + page->cp_owner->ci_owned_nr++; + EXIT; +} + +void cl_page_disown0(const struct lu_env *env, + struct cl_io *io, struct cl_page *cl_page) +{ + const struct cl_page_slice *slice; + enum cl_page_state state; + int i; + + ENTRY; + state = cl_page->cp_state; + PINVRNT(env, cl_page, state == CPS_OWNED || + state == CPS_FREEING); + PINVRNT(env, cl_page, cl_page_invariant(cl_page) || + state == CPS_FREEING); + cl_page_owner_clear(cl_page); + + if (state == CPS_OWNED) + cl_page_state_set(env, cl_page, CPS_CACHED); + /* + * Completion call-backs are executed in the bottom-up order, so that + * uppermost layer (llite), responsible for VFS/VM interaction runs + * last and can release locks safely. + */ + cl_page_slice_for_each_reverse(cl_page, slice, i) { + if (slice->cpl_ops->cpo_disown != NULL) + (*slice->cpl_ops->cpo_disown)(env, slice, io); + } + + EXIT; +} + +/** + * returns true, iff page is owned by the given io. + */ +int cl_page_is_owned(const struct cl_page *pg, const struct cl_io *io) +{ + struct cl_io *top = cl_io_top((struct cl_io *)io); + LINVRNT(cl_object_same(pg->cp_obj, io->ci_obj)); + ENTRY; + RETURN(pg->cp_state == CPS_OWNED && pg->cp_owner == top); +} +EXPORT_SYMBOL(cl_page_is_owned); + +/** + * Try to own a page by IO. + * + * Waits until page is in cl_page_state::CPS_CACHED state, and then switch it + * into cl_page_state::CPS_OWNED state. + * + * \pre !cl_page_is_owned(cl_page, io) + * \post result == 0 iff cl_page_is_owned(cl_page, io) + * + * \retval 0 success + * + * \retval -ve failure, e.g., cl_page was destroyed (and landed in + * cl_page_state::CPS_FREEING instead of cl_page_state::CPS_CACHED). + * or, page was owned by another thread, or in IO. 
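+ *
+ * A minimal caller sketch (hypothetical, error handling shortened):
+ * \code
+ *      if (cl_page_own(env, io, pg) == 0) {
+ *              ... pg is now in CPS_OWNED and can be used by io ...
+ *              cl_page_disown(env, io, pg);
+ *      }
+ * \endcode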
+ * + * \see cl_page_disown() + * \see cl_page_operations::cpo_own() + * \see cl_page_own_try() + * \see cl_page_own + */ +static int cl_page_own0(const struct lu_env *env, struct cl_io *io, + struct cl_page *cl_page, int nonblock) +{ + const struct cl_page_slice *slice; + int result = 0; + int i; + + ENTRY; + PINVRNT(env, cl_page, !cl_page_is_owned(cl_page, io)); + io = cl_io_top(io); + + if (cl_page->cp_state == CPS_FREEING) { + result = -ENOENT; + goto out; + } + + cl_page_slice_for_each(cl_page, slice, i) { + if (slice->cpl_ops->cpo_own) + result = (*slice->cpl_ops->cpo_own)(env, slice, + io, nonblock); + if (result != 0) + break; + } + if (result > 0) + result = 0; + + if (result == 0) { + PASSERT(env, cl_page, cl_page->cp_owner == NULL); + cl_page->cp_owner = cl_io_top(io); + cl_page_owner_set(cl_page); + if (cl_page->cp_state != CPS_FREEING) { + cl_page_state_set(env, cl_page, CPS_OWNED); + } else { + cl_page_disown0(env, io, cl_page); + result = -ENOENT; + } + } + +out: + PINVRNT(env, cl_page, ergo(result == 0, + cl_page_invariant(cl_page))); + RETURN(result); +} + +/** + * Own a page, might be blocked. + * + * \see cl_page_own0() + */ +int cl_page_own(const struct lu_env *env, struct cl_io *io, struct cl_page *pg) +{ + return cl_page_own0(env, io, pg, 0); +} +EXPORT_SYMBOL(cl_page_own); + +/** + * Nonblock version of cl_page_own(). + * + * \see cl_page_own0() + */ +int cl_page_own_try(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg) +{ + return cl_page_own0(env, io, pg, 1); +} +EXPORT_SYMBOL(cl_page_own_try); + + +/** + * Assume page ownership. + * + * Called when page is already locked by the hosting VM. + * + * \pre !cl_page_is_owned(cl_page, io) + * \post cl_page_is_owned(cl_page, io) + * + * \see cl_page_operations::cpo_assume() + */ +void cl_page_assume(const struct lu_env *env, + struct cl_io *io, struct cl_page *cl_page) +{ + const struct cl_page_slice *slice; + int i; + + ENTRY; + + PINVRNT(env, cl_page, + cl_object_same(cl_page->cp_obj, io->ci_obj)); + io = cl_io_top(io); + + cl_page_slice_for_each(cl_page, slice, i) { + if (slice->cpl_ops->cpo_assume != NULL) + (*slice->cpl_ops->cpo_assume)(env, slice, io); + } + + PASSERT(env, cl_page, cl_page->cp_owner == NULL); + cl_page->cp_owner = cl_io_top(io); + cl_page_owner_set(cl_page); + cl_page_state_set(env, cl_page, CPS_OWNED); + EXIT; +} +EXPORT_SYMBOL(cl_page_assume); + +/** + * Releases page ownership without unlocking the page. + * + * Moves cl_page into cl_page_state::CPS_CACHED without releasing a lock + * on the underlying VM page (as VM is supposed to do this itself). + * + * \pre cl_page_is_owned(cl_page, io) + * \post !cl_page_is_owned(cl_page, io) + * + * \see cl_page_assume() + */ +void cl_page_unassume(const struct lu_env *env, + struct cl_io *io, struct cl_page *cl_page) +{ + const struct cl_page_slice *slice; + int i; + + ENTRY; + PINVRNT(env, cl_page, cl_page_is_owned(cl_page, io)); + PINVRNT(env, cl_page, cl_page_invariant(cl_page)); + + io = cl_io_top(io); + cl_page_owner_clear(cl_page); + cl_page_state_set(env, cl_page, CPS_CACHED); + + cl_page_slice_for_each_reverse(cl_page, slice, i) { + if (slice->cpl_ops->cpo_unassume != NULL) + (*slice->cpl_ops->cpo_unassume)(env, slice, io); + } + + EXIT; +} +EXPORT_SYMBOL(cl_page_unassume); + +/** + * Releases page ownership. + * + * Moves page into cl_page_state::CPS_CACHED. 
+ *
+ * \pre cl_page_is_owned(pg, io)
+ * \post !cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_own()
+ * \see cl_page_operations::cpo_disown()
+ */
+void cl_page_disown(const struct lu_env *env,
+                    struct cl_io *io, struct cl_page *pg)
+{
+        PINVRNT(env, pg, cl_page_is_owned(pg, io) ||
+                pg->cp_state == CPS_FREEING);
+
+        ENTRY;
+        io = cl_io_top(io);
+        cl_page_disown0(env, io, pg);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_page_disown);
+
+/**
+ * Called when cl_page is to be removed from the object, e.g.,
+ * as a result of truncate.
+ *
+ * Calls cl_page_operations::cpo_discard() top-to-bottom.
+ *
+ * \pre cl_page_is_owned(cl_page, io)
+ *
+ * \see cl_page_operations::cpo_discard()
+ */
+void cl_page_discard(const struct lu_env *env,
+                     struct cl_io *io, struct cl_page *cl_page)
+{
+        const struct cl_page_slice *slice;
+        int i;
+
+        PINVRNT(env, cl_page, cl_page_is_owned(cl_page, io));
+        PINVRNT(env, cl_page, cl_page_invariant(cl_page));
+
+        cl_page_slice_for_each(cl_page, slice, i) {
+                if (slice->cpl_ops->cpo_discard != NULL)
+                        (*slice->cpl_ops->cpo_discard)(env, slice, io);
+        }
+}
+EXPORT_SYMBOL(cl_page_discard);
+
+/**
+ * Version of cl_page_delete() that can be called for not fully constructed
+ * cl_pages, e.g. in an error handling cl_page_find()->cl_page_delete0()
+ * path. Doesn't check cl_page invariant.
+ */
+static void cl_page_delete0(const struct lu_env *env,
+                            struct cl_page *cl_page)
+{
+        const struct cl_page_slice *slice;
+        int i;
+
+        ENTRY;
+
+        PASSERT(env, cl_page, cl_page->cp_state != CPS_FREEING);
+
+        /*
+         * Sever all ways to obtain new pointers to @pg.
+         */
+        cl_page_owner_clear(cl_page);
+        cl_page_state_set0(env, cl_page, CPS_FREEING);
+
+        cl_page_slice_for_each_reverse(cl_page, slice, i) {
+                if (slice->cpl_ops->cpo_delete != NULL)
+                        (*slice->cpl_ops->cpo_delete)(env, slice);
+        }
+
+        EXIT;
+}
+
+/**
+ * Called when a decision is made to throw page out of memory.
+ *
+ * Notifies all layers about page destruction by calling
+ * cl_page_operations::cpo_delete() method top-to-bottom.
+ *
+ * Moves page into cl_page_state::CPS_FREEING state (this is the only place
+ * where transition to this state happens).
+ *
+ * Eliminates all venues through which new references to the page can be
+ * obtained:
+ *
+ *     - removes page from the radix trees,
+ *
+ *     - breaks linkage from VM page to cl_page.
+ *
+ * Once page reaches cl_page_state::CPS_FREEING, all remaining references will
+ * drain after some time, at which point page will be recycled.
+ *
+ * \pre VM page is locked
+ * \post pg->cp_state == CPS_FREEING
+ *
+ * \see cl_page_operations::cpo_delete()
+ */
+void cl_page_delete(const struct lu_env *env, struct cl_page *pg)
+{
+        PINVRNT(env, pg, cl_page_invariant(pg));
+        ENTRY;
+        cl_page_delete0(env, pg);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_page_delete);
+
+/**
+ * Marks page up-to-date.
+ *
+ * Call cl_page_operations::cpo_export() through all layers top-to-bottom. The
+ * layer responsible for VM interaction has to mark/clear page as up-to-date
+ * by the \a uptodate argument.
+ *
+ * \see cl_page_operations::cpo_export()
+ */
+void cl_page_export(const struct lu_env *env, struct cl_page *cl_page,
+                    int uptodate)
+{
+        const struct cl_page_slice *slice;
+        int i;
+
+        PINVRNT(env, cl_page, cl_page_invariant(cl_page));
+
+        cl_page_slice_for_each(cl_page, slice, i) {
+                if (slice->cpl_ops->cpo_export != NULL)
+                        (*slice->cpl_ops->cpo_export)(env, slice, uptodate);
+        }
+}
+EXPORT_SYMBOL(cl_page_export);
+
+/**
+ * Returns true, if \a page is VM locked in a suitable sense by the calling
+ * thread.
+ */
+int cl_page_is_vmlocked(const struct lu_env *env,
+                        const struct cl_page *cl_page)
+{
+        const struct cl_page_slice *slice;
+        int result;
+
+        ENTRY;
+        slice = cl_page_slice_get(cl_page, 0);
+        PASSERT(env, cl_page, slice->cpl_ops->cpo_is_vmlocked != NULL);
+        /*
+         * Call ->cpo_is_vmlocked() directly instead of going through
+         * CL_PAGE_INVOKE(), because cl_page_is_vmlocked() is used by
+         * cl_page_invariant().
+         */
+        result = slice->cpl_ops->cpo_is_vmlocked(env, slice);
+        PASSERT(env, cl_page, result == -EBUSY || result == -ENODATA);
+
+        RETURN(result == -EBUSY);
+}
+EXPORT_SYMBOL(cl_page_is_vmlocked);
+
+void cl_page_touch(const struct lu_env *env,
+                   const struct cl_page *cl_page, size_t to)
+{
+        const struct cl_page_slice *slice;
+        int i;
+
+        ENTRY;
+
+        cl_page_slice_for_each(cl_page, slice, i) {
+                if (slice->cpl_ops->cpo_page_touch != NULL)
+                        (*slice->cpl_ops->cpo_page_touch)(env, slice, to);
+        }
+
+        EXIT;
+}
+EXPORT_SYMBOL(cl_page_touch);
+
+static enum cl_page_state cl_req_type_state(enum cl_req_type crt)
+{
+        ENTRY;
+        RETURN(crt == CRT_WRITE ? CPS_PAGEOUT : CPS_PAGEIN);
+}
+
+static void cl_page_io_start(const struct lu_env *env,
+                             struct cl_page *pg, enum cl_req_type crt)
+{
+        /*
+         * Page is queued for IO, change its state.
+         */
+        ENTRY;
+        cl_page_owner_clear(pg);
+        cl_page_state_set(env, pg, cl_req_type_state(crt));
+        EXIT;
+}
+
+/**
+ * Prepares page for immediate transfer. cl_page_operations::cpo_prep() is
+ * called top-to-bottom. Every layer either agrees to submit this page (by
+ * returning 0), or requests to omit this page (by returning -EALREADY). Layer
+ * handling interactions with the VM also has to inform VM that page is under
+ * transfer now.
+ */
+int cl_page_prep(const struct lu_env *env, struct cl_io *io,
+                 struct cl_page *cl_page, enum cl_req_type crt)
+{
+        const struct cl_page_slice *slice;
+        int result = 0;
+        int i;
+
+        PINVRNT(env, cl_page, cl_page_is_owned(cl_page, io));
+        PINVRNT(env, cl_page, cl_page_invariant(cl_page));
+        PINVRNT(env, cl_page, crt < CRT_NR);
+
+        /*
+         * this has to be called bottom-to-top, so that llite can set up
+         * PG_writeback without risking other layers deciding to skip this
+         * page.
+         */
+        if (crt >= CRT_NR)
+                return -EINVAL;
+
+        if (cl_page->cp_type != CPT_TRANSIENT) {
+                cl_page_slice_for_each(cl_page, slice, i) {
+                        if (slice->cpl_ops->io[crt].cpo_prep != NULL)
+                                result =
+                                   (*slice->cpl_ops->io[crt].cpo_prep)(env,
+                                                                       slice,
+                                                                       io);
+                        if (result != 0)
+                                break;
+                }
+        }
+
+        if (result >= 0) {
+                result = 0;
+                cl_page_io_start(env, cl_page, crt);
+        }
+
+        CL_PAGE_HEADER(D_TRACE, env, cl_page, "%d %d\n", crt, result);
+        return result;
+}
+EXPORT_SYMBOL(cl_page_prep);
+
+/**
+ * Notify layers about transfer completion.
+ *
+ * Invoked by transfer sub-system (which is a part of osc) to notify layers
+ * that a transfer, of which this page is a part, has completed.
+ *
+ * Completion call-backs are executed in the bottom-up order, so that
+ * uppermost layer (llite), responsible for the VFS/VM interaction runs last
+ * and can release locks safely.
+ * + * \pre cl_page->cp_state == CPS_PAGEIN || cl_page->cp_state == CPS_PAGEOUT + * \post cl_page->cl_page_state == CPS_CACHED + * + * \see cl_page_operations::cpo_completion() + */ +void cl_page_completion(const struct lu_env *env, + struct cl_page *cl_page, enum cl_req_type crt, + int ioret) +{ + const struct cl_page_slice *slice; + struct cl_sync_io *anchor = cl_page->cp_sync_io; + int i; + + ENTRY; + PASSERT(env, cl_page, crt < CRT_NR); + PASSERT(env, cl_page, cl_page->cp_state == cl_req_type_state(crt)); + + CL_PAGE_HEADER(D_TRACE, env, cl_page, "%d %d\n", crt, ioret); + cl_page_state_set(env, cl_page, CPS_CACHED); + if (crt >= CRT_NR) + return; + + cl_page_slice_for_each_reverse(cl_page, slice, i) { + if (slice->cpl_ops->io[crt].cpo_completion != NULL) + (*slice->cpl_ops->io[crt].cpo_completion)(env, slice, + ioret); + } + + if (anchor != NULL) { + LASSERT(cl_page->cp_sync_io == anchor); + cl_page->cp_sync_io = NULL; + cl_sync_io_note(env, anchor, ioret); + } + EXIT; +} +EXPORT_SYMBOL(cl_page_completion); + +/** + * Notify layers that transfer formation engine decided to yank this page from + * the cache and to make it a part of a transfer. + * + * \pre cl_page->cp_state == CPS_CACHED + * \post cl_page->cp_state == CPS_PAGEIN || cl_page->cp_state == CPS_PAGEOUT + * + * \see cl_page_operations::cpo_make_ready() + */ +int cl_page_make_ready(const struct lu_env *env, struct cl_page *cl_page, + enum cl_req_type crt) +{ + const struct cl_page_slice *slice; + int result = 0; + int i; + + ENTRY; + PINVRNT(env, cl_page, crt < CRT_NR); + if (crt >= CRT_NR) + RETURN(-EINVAL); + + cl_page_slice_for_each(cl_page, slice, i) { + if (slice->cpl_ops->io[crt].cpo_make_ready != NULL) + result = (*slice->cpl_ops->io[crt].cpo_make_ready)(env, slice); + if (result != 0) + break; + } + + if (result >= 0) { + result = 0; + PASSERT(env, cl_page, cl_page->cp_state == CPS_CACHED); + cl_page_io_start(env, cl_page, crt); + } + CL_PAGE_HEADER(D_TRACE, env, cl_page, "%d %d\n", crt, result); + + RETURN(result); +} +EXPORT_SYMBOL(cl_page_make_ready); + +/** + * Called if a page is being written back by kernel's intention. + * + * \pre cl_page_is_owned(cl_page, io) + * \post ergo(result == 0, cl_page->cp_state == CPS_PAGEOUT) + * + * \see cl_page_operations::cpo_flush() + */ +int cl_page_flush(const struct lu_env *env, struct cl_io *io, + struct cl_page *cl_page) +{ + const struct cl_page_slice *slice; + int result = 0; + int i; + + ENTRY; + PINVRNT(env, cl_page, cl_page_is_owned(cl_page, io)); + PINVRNT(env, cl_page, cl_page_invariant(cl_page)); + + cl_page_slice_for_each(cl_page, slice, i) { + if (slice->cpl_ops->cpo_flush != NULL) + result = (*slice->cpl_ops->cpo_flush)(env, slice, io); + if (result != 0) + break; + } + if (result > 0) + result = 0; + + CL_PAGE_HEADER(D_TRACE, env, cl_page, "%d\n", result); + RETURN(result); +} +EXPORT_SYMBOL(cl_page_flush); + +/** + * Tells transfer engine that only part of a page is to be transmitted. + * + * \see cl_page_operations::cpo_clip() + */ +void cl_page_clip(const struct lu_env *env, struct cl_page *cl_page, + int from, int to) +{ + const struct cl_page_slice *slice; + int i; + + PINVRNT(env, cl_page, cl_page_invariant(cl_page)); + + CL_PAGE_HEADER(D_TRACE, env, cl_page, "%d %d\n", from, to); + cl_page_slice_for_each(cl_page, slice, i) { + if (slice->cpl_ops->cpo_clip != NULL) + (*slice->cpl_ops->cpo_clip)(env, slice, from, to); + } +} +EXPORT_SYMBOL(cl_page_clip); + +/** + * Prints human readable representation of \a pg to the \a f. 
+ */
+void cl_page_header_print(const struct lu_env *env, void *cookie,
+                          lu_printer_t printer, const struct cl_page *pg)
+{
+        (*printer)(env, cookie,
+                   "page@%p[%d %p %d %d %p]\n",
+                   pg, atomic_read(&pg->cp_ref), pg->cp_obj,
+                   pg->cp_state, pg->cp_type,
+                   pg->cp_owner);
+}
+EXPORT_SYMBOL(cl_page_header_print);
+
+/**
+ * Prints human readable representation of \a cl_page to the \a f.
+ */
+void cl_page_print(const struct lu_env *env, void *cookie,
+                   lu_printer_t printer, const struct cl_page *cl_page)
+{
+        const struct cl_page_slice *slice;
+        int result = 0;
+        int i;
+
+        cl_page_header_print(env, cookie, printer, cl_page);
+        cl_page_slice_for_each(cl_page, slice, i) {
+                if (slice->cpl_ops->cpo_print != NULL)
+                        result = (*slice->cpl_ops->cpo_print)(env, slice,
+                                                              cookie, printer);
+                if (result != 0)
+                        break;
+        }
+        (*printer)(env, cookie, "end page@%p\n", cl_page);
+}
+EXPORT_SYMBOL(cl_page_print);
+
+/**
+ * Converts a page index into a byte offset within object \a obj.
+ */
+loff_t cl_offset(const struct cl_object *obj, pgoff_t idx)
+{
+        return (loff_t)idx << PAGE_SHIFT;
+}
+EXPORT_SYMBOL(cl_offset);
+
+/**
+ * Converts a byte offset within object \a obj into a page index.
+ */
+pgoff_t cl_index(const struct cl_object *obj, loff_t offset)
+{
+        return offset >> PAGE_SHIFT;
+}
+EXPORT_SYMBOL(cl_index);
+
+size_t cl_page_size(const struct cl_object *obj)
+{
+        return 1UL << PAGE_SHIFT;
+}
+EXPORT_SYMBOL(cl_page_size);
+
+/**
+ * Adds page slice to the compound page.
+ *
+ * This is called by cl_object_operations::coo_page_init() methods to add a
+ * per-layer state to the page. New state is added at the end of
+ * cl_page::cp_layers list, that is, it is at the bottom of the stack.
+ *
+ * \see cl_lock_slice_add(), cl_req_slice_add(), cl_io_slice_add()
+ */
+void cl_page_slice_add(struct cl_page *cl_page, struct cl_page_slice *slice,
+                       struct cl_object *obj,
+                       const struct cl_page_operations *ops)
+{
+        unsigned int offset = (char *)slice -
+                              ((char *)cl_page + sizeof(*cl_page));
+
+        ENTRY;
+        LASSERT(cl_page->cp_layer_count < CP_MAX_LAYER);
+        LASSERT(offset < (1 << sizeof(cl_page->cp_layer_offset[0]) * 8));
+        cl_page->cp_layer_offset[cl_page->cp_layer_count++] = offset;
+        slice->cpl_obj = obj;
+        slice->cpl_ops = ops;
+        slice->cpl_page = cl_page;
+
+        EXIT;
+}
+EXPORT_SYMBOL(cl_page_slice_add);
+
+/**
+ * Allocate and initialize cl_cache, called by ll_init_sbi().
+ */
+struct cl_client_cache *cl_cache_init(unsigned long lru_page_max)
+{
+        struct cl_client_cache *cache = NULL;
+
+        ENTRY;
+        OBD_ALLOC(cache, sizeof(*cache));
+        if (cache == NULL)
+                RETURN(NULL);
+
+        /* Initialize cache data */
+        atomic_set(&cache->ccc_users, 1);
+        cache->ccc_lru_max = lru_page_max;
+        atomic_long_set(&cache->ccc_lru_left, lru_page_max);
+        spin_lock_init(&cache->ccc_lru_lock);
+        INIT_LIST_HEAD(&cache->ccc_lru);
+
+        /* turn unstable check off by default as it impacts performance */
+        cache->ccc_unstable_check = 0;
+        atomic_long_set(&cache->ccc_unstable_nr, 0);
+        init_waitqueue_head(&cache->ccc_unstable_waitq);
+        mutex_init(&cache->ccc_max_cache_mb_lock);
+
+        RETURN(cache);
+}
+EXPORT_SYMBOL(cl_cache_init);
+
+/**
+ * Increase cl_cache refcount
+ */
+void cl_cache_incref(struct cl_client_cache *cache)
+{
+        atomic_inc(&cache->ccc_users);
+}
+EXPORT_SYMBOL(cl_cache_incref);
+
+/**
+ * Decrease cl_cache refcount and free the cache if refcount=0.
+ * Since llite, lov and osc all hold cl_cache refcount,
+ * the free will not cause race.
(LU-6173) + */ +void cl_cache_decref(struct cl_client_cache *cache) +{ + if (atomic_dec_and_test(&cache->ccc_users)) + OBD_FREE(cache, sizeof(*cache)); +} +EXPORT_SYMBOL(cl_cache_decref); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/class_obd.c b/drivers/staging/lustrefsx/lustre/obdclass/class_obd.c new file mode 100644 index 0000000000000..f0c611827aebb --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/class_obd.c @@ -0,0 +1,974 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#ifdef HAVE_SERVER_SUPPORT +# include +# include +#endif /* HAVE_SERVER_SUPPORT */ +#include +#include "llog_internal.h" + +#ifdef CONFIG_PROC_FS +static __u64 obd_max_alloc; +#else +__u64 obd_max_alloc; +#endif + +static DEFINE_SPINLOCK(obd_updatemax_lock); + +/* The following are visible and mutable through /proc/sys/lustre/. 
*/ +unsigned int obd_debug_peer_on_timeout; +EXPORT_SYMBOL(obd_debug_peer_on_timeout); +unsigned int obd_dump_on_timeout; +EXPORT_SYMBOL(obd_dump_on_timeout); +unsigned int obd_dump_on_eviction; +EXPORT_SYMBOL(obd_dump_on_eviction); +unsigned int obd_lbug_on_eviction; +EXPORT_SYMBOL(obd_lbug_on_eviction); +unsigned long obd_max_dirty_pages; +EXPORT_SYMBOL(obd_max_dirty_pages); +atomic_long_t obd_dirty_pages; +EXPORT_SYMBOL(obd_dirty_pages); +unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT; /* seconds */ +EXPORT_SYMBOL(obd_timeout); +unsigned int ldlm_timeout = LDLM_TIMEOUT_DEFAULT; /* seconds */ +EXPORT_SYMBOL(ldlm_timeout); +unsigned int obd_timeout_set; +EXPORT_SYMBOL(obd_timeout_set); +unsigned int ldlm_timeout_set; +EXPORT_SYMBOL(ldlm_timeout_set); +/* bulk transfer timeout, give up after 100s by default */ +unsigned int bulk_timeout = 100; /* seconds */ +EXPORT_SYMBOL(bulk_timeout); +/* Adaptive timeout defs here instead of ptlrpc module for /proc/sys/ access */ +unsigned int at_min = 0; +EXPORT_SYMBOL(at_min); +unsigned int at_max = 600; +EXPORT_SYMBOL(at_max); +unsigned int at_history = 600; +EXPORT_SYMBOL(at_history); +int at_early_margin = 5; +EXPORT_SYMBOL(at_early_margin); +int at_extra = 30; +EXPORT_SYMBOL(at_extra); + +#ifdef CONFIG_PROC_FS +struct lprocfs_stats *obd_memory = NULL; +EXPORT_SYMBOL(obd_memory); +#endif + +static int obdclass_oom_handler(struct notifier_block *self, + unsigned long notused, void *nfreed) +{ +#ifdef CONFIG_PROC_FS + /* in bytes */ + pr_info("obd_memory max: %llu, obd_memory current: %llu\n", + obd_memory_max(), obd_memory_sum()); +#endif /* CONFIG_PROC_FS */ + + return NOTIFY_OK; +} + +static struct notifier_block obdclass_oom = { + .notifier_call = obdclass_oom_handler +}; + +static int class_resolve_dev_name(__u32 len, const char *name) +{ + int rc; + int dev; + + ENTRY; + if (!len || !name) { + CERROR("No name passed,!\n"); + GOTO(out, rc = -EINVAL); + } + if (name[len - 1] != 0) { + CERROR("Name not nul terminated!\n"); + GOTO(out, rc = -EINVAL); + } + + CDEBUG(D_IOCTL, "device name %s\n", name); + dev = class_name2dev(name); + if (dev == -1) { + CDEBUG(D_IOCTL, "No device for name %s!\n", name); + GOTO(out, rc = -EINVAL); + } + + CDEBUG(D_IOCTL, "device name %s, dev %d\n", name, dev); + rc = dev; + +out: + RETURN(rc); +} + +#define OBD_MAX_IOCTL_BUFFER 8192 + +static int obd_ioctl_is_invalid(struct obd_ioctl_data *data) +{ + const int maxlen = 1 << 30; + if (data->ioc_len > maxlen) { + CERROR("OBD ioctl: ioc_len larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inllen1 > maxlen) { + CERROR("OBD ioctl: ioc_inllen1 larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inllen2 > maxlen) { + CERROR("OBD ioctl: ioc_inllen2 larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inllen3 > maxlen) { + CERROR("OBD ioctl: ioc_inllen3 larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inllen4 > maxlen) { + CERROR("OBD ioctl: ioc_inllen4 larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inlbuf1 && data->ioc_inllen1 == 0) { + CERROR("OBD ioctl: inlbuf1 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_inlbuf2 && data->ioc_inllen2 == 0) { + CERROR("OBD ioctl: inlbuf2 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_inlbuf3 && data->ioc_inllen3 == 0) { + CERROR("OBD ioctl: inlbuf3 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_inlbuf4 && data->ioc_inllen4 == 0) { + CERROR("OBD ioctl: inlbuf4 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_pbuf1 && data->ioc_plen1 == 0) { + CERROR("OBD 
ioctl: pbuf1 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_pbuf2 && data->ioc_plen2 == 0) { + CERROR("OBD ioctl: pbuf2 pointer but 0 length\n"); + return 1; + } + + if (!data->ioc_pbuf1 && data->ioc_plen1 != 0) { + CERROR("OBD ioctl: plen1 set but NULL pointer\n"); + return 1; + } + + if (!data->ioc_pbuf2 && data->ioc_plen2 != 0) { + CERROR("OBD ioctl: plen2 set but NULL pointer\n"); + return 1; + } + + if (obd_ioctl_packlen(data) > data->ioc_len) { + CERROR("OBD ioctl: packlen exceeds ioc_len (%d > %d)\n", + obd_ioctl_packlen(data), data->ioc_len); + return 1; + } + + return 0; +} + +/* buffer MUST be at least the size of obd_ioctl_hdr */ +int obd_ioctl_getdata(struct obd_ioctl_data **datap, int *len, void __user *arg) +{ + struct obd_ioctl_hdr hdr; + struct obd_ioctl_data *data; + int offset = 0; + + ENTRY; + if (copy_from_user(&hdr, arg, sizeof(hdr))) + RETURN(-EFAULT); + + if (hdr.ioc_version != OBD_IOCTL_VERSION) { + CERROR("Version mismatch kernel (%x) vs application (%x)\n", + OBD_IOCTL_VERSION, hdr.ioc_version); + RETURN(-EINVAL); + } + + if (hdr.ioc_len > OBD_MAX_IOCTL_BUFFER) { + CERROR("User buffer len %d exceeds %d max buffer\n", + hdr.ioc_len, OBD_MAX_IOCTL_BUFFER); + RETURN(-EINVAL); + } + + if (hdr.ioc_len < sizeof(struct obd_ioctl_data)) { + CERROR("User buffer too small for ioctl (%d)\n", hdr.ioc_len); + RETURN(-EINVAL); + } + + /* When there are lots of processes calling vmalloc on multi-core + * system, the high lock contention will hurt performance badly, + * obdfilter-survey is an example, which relies on ioctl. So we'd + * better avoid vmalloc on ioctl path. LU-66 + */ + OBD_ALLOC_LARGE(data, hdr.ioc_len); + if (!data) { + CERROR("Cannot allocate control buffer of len %d\n", + hdr.ioc_len); + RETURN(-EINVAL); + } + *len = hdr.ioc_len; + + if (copy_from_user(data, arg, hdr.ioc_len)) { + OBD_FREE_LARGE(data, hdr.ioc_len); + RETURN(-EFAULT); + } + + if (obd_ioctl_is_invalid(data)) { + CERROR("ioctl not correctly formatted\n"); + OBD_FREE_LARGE(data, hdr.ioc_len); + RETURN(-EINVAL); + } + + if (data->ioc_inllen1) { + data->ioc_inlbuf1 = &data->ioc_bulk[0]; + offset += cfs_size_round(data->ioc_inllen1); + } + + if (data->ioc_inllen2) { + data->ioc_inlbuf2 = &data->ioc_bulk[0] + offset; + offset += cfs_size_round(data->ioc_inllen2); + } + + if (data->ioc_inllen3) { + data->ioc_inlbuf3 = &data->ioc_bulk[0] + offset; + offset += cfs_size_round(data->ioc_inllen3); + } + + if (data->ioc_inllen4) + data->ioc_inlbuf4 = &data->ioc_bulk[0] + offset; + + *datap = data; + + RETURN(0); +} +EXPORT_SYMBOL(obd_ioctl_getdata); + +int class_handle_ioctl(unsigned int cmd, unsigned long arg) +{ + struct obd_ioctl_data *data; + struct obd_device *obd = NULL; + int err = 0, len = 0; + + ENTRY; + CDEBUG(D_IOCTL, "cmd = %x\n", cmd); + if (obd_ioctl_getdata(&data, &len, (void __user *)arg)) { + CERROR("OBD ioctl: data error\n"); + RETURN(-EINVAL); + } + + switch (cmd) { + case OBD_IOC_PROCESS_CFG: { + struct lustre_cfg *lcfg; + + if (!data->ioc_plen1 || !data->ioc_pbuf1) { + CERROR("No config buffer passed!\n"); + GOTO(out, err = -EINVAL); + } + OBD_ALLOC(lcfg, data->ioc_plen1); + if (lcfg == NULL) + GOTO(out, err = -ENOMEM); + err = copy_from_user(lcfg, data->ioc_pbuf1, + data->ioc_plen1); + if (!err) + err = lustre_cfg_sanity_check(lcfg, data->ioc_plen1); + if (!err) + err = class_process_config(lcfg); + + OBD_FREE(lcfg, data->ioc_plen1); + GOTO(out, err); + } + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0) + case OBD_GET_VERSION: { + static bool warned; + + if 
(!data->ioc_inlbuf1) { + CERROR("No buffer passed in ioctl\n"); + GOTO(out, err = -EINVAL); + } + + if (strlen(LUSTRE_VERSION_STRING) + 1 > data->ioc_inllen1) { + CERROR("ioctl buffer too small to hold version\n"); + GOTO(out, err = -EINVAL); + } + + if (!warned) { + warned = true; + CWARN("%s: ioctl(OBD_GET_VERSION) is deprecated, " + "use llapi_get_version_string() and/or relink\n", + current->comm); + } + memcpy(data->ioc_bulk, LUSTRE_VERSION_STRING, + strlen(LUSTRE_VERSION_STRING) + 1); + + if (copy_to_user((void __user *)arg, data, len)) + err = -EFAULT; + GOTO(out, err); + } +#endif + case OBD_IOC_NAME2DEV: { + /* Resolve a device name. This does not change the + * currently selected device. + */ + int dev; + + dev = class_resolve_dev_name(data->ioc_inllen1, + data->ioc_inlbuf1); + data->ioc_dev = dev; + if (dev < 0) + GOTO(out, err = -EINVAL); + + if (copy_to_user((void __user *)arg, data, sizeof(*data))) + err = -EFAULT; + GOTO(out, err); + } + + case OBD_IOC_UUID2DEV: { + /* Resolve a device uuid. This does not change the + * currently selected device. + */ + int dev; + struct obd_uuid uuid; + + if (!data->ioc_inllen1 || !data->ioc_inlbuf1) { + CERROR("No UUID passed!\n"); + GOTO(out, err = -EINVAL); + } + if (data->ioc_inlbuf1[data->ioc_inllen1 - 1] != 0) { + CERROR("UUID not NUL terminated!\n"); + GOTO(out, err = -EINVAL); + } + + CDEBUG(D_IOCTL, "device name %s\n", data->ioc_inlbuf1); + obd_str2uuid(&uuid, data->ioc_inlbuf1); + dev = class_uuid2dev(&uuid); + data->ioc_dev = dev; + if (dev == -1) { + CDEBUG(D_IOCTL, "No device for UUID %s!\n", + data->ioc_inlbuf1); + GOTO(out, err = -EINVAL); + } + + CDEBUG(D_IOCTL, "device name %s, dev %d\n", data->ioc_inlbuf1, + dev); + if (copy_to_user((void __user *)arg, data, sizeof(*data))) + err = -EFAULT; + GOTO(out, err); + } + + case OBD_IOC_GETDEVICE: { + int index = data->ioc_count; + char *status, *str; + + if (!data->ioc_inlbuf1) { + CERROR("No buffer passed in ioctl\n"); + GOTO(out, err = -EINVAL); + } + if (data->ioc_inllen1 < 128) { + CERROR("ioctl buffer too small to hold version\n"); + GOTO(out, err = -EINVAL); + } + + obd = class_num2obd(index); + if (!obd) + GOTO(out, err = -ENOENT); + + if (obd->obd_stopping) + status = "ST"; + else if (obd->obd_inactive) + status = "IN"; + else if (obd->obd_set_up) + status = "UP"; + else if (obd->obd_attached) + status = "AT"; + else + status = "--"; + + str = (char *)data->ioc_bulk; + snprintf(str, len - sizeof(*data), "%3d %s %s %s %s %d", + (int)index, status, obd->obd_type->typ_name, + obd->obd_name, obd->obd_uuid.uuid, + atomic_read(&obd->obd_refcount)); + + if (copy_to_user((void __user *)arg, data, len)) + err = -EFAULT; + + GOTO(out, err); + } + + } + + if (data->ioc_dev == OBD_DEV_BY_DEVNAME) { + if (data->ioc_inllen4 <= 0 || data->ioc_inlbuf4 == NULL) + GOTO(out, err = -EINVAL); + if (strnlen(data->ioc_inlbuf4, MAX_OBD_NAME) >= MAX_OBD_NAME) + GOTO(out, err = -EINVAL); + obd = class_name2obd(data->ioc_inlbuf4); + } else if (data->ioc_dev < class_devno_max()) { + obd = class_num2obd(data->ioc_dev); + } else { + CERROR("OBD ioctl: No device\n"); + GOTO(out, err = -EINVAL); + } + + if (obd == NULL) { + CERROR("OBD ioctl : No Device %d\n", data->ioc_dev); + GOTO(out, err = -EINVAL); + } + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + + if (!obd->obd_set_up || obd->obd_stopping) { + CERROR("OBD ioctl: device not setup %d \n", data->ioc_dev); + GOTO(out, err = -EINVAL); + } + + err = obd_iocontrol(cmd, obd->obd_self_export, len, data, NULL); + if (err) + GOTO(out, err); + + if 
(copy_to_user((void __user *)arg, data, len)) + err = -EFAULT; +out: + OBD_FREE_LARGE(data, len); + RETURN(err); +} /* class_handle_ioctl */ + +/* to control /dev/obd */ +static long obd_class_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + int err = 0; + + ENTRY; + /* Allow non-root access for some limited ioctls */ + if (!capable(CAP_SYS_ADMIN)) + RETURN(err = -EACCES); + + if ((cmd & 0xffffff00) == ((int)'T') << 8) /* ignore all tty ioctls */ + RETURN(err = -ENOTTY); + + err = class_handle_ioctl(cmd, (unsigned long)arg); + + RETURN(err); +} + +/* declare character device */ +static const struct file_operations obd_psdev_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = obd_class_ioctl, /* unlocked_ioctl */ +}; + +/* modules setup */ +static struct miscdevice obd_psdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = OBD_DEV_NAME, + .fops = &obd_psdev_fops, +}; + +#define test_string_to_size_err(value, expect, def_unit, __rc) \ +({ \ + u64 __size; \ + int __ret; \ + \ + BUILD_BUG_ON(sizeof(value) >= 23); \ + __ret = sysfs_memparse(value, sizeof(value) - 1, &__size, def_unit); \ + if (__ret != __rc) \ + CERROR("string_helper: parsing '%s' expect rc %d != got %d\n", \ + value, __rc, __ret); \ + else if (!__ret && (u64)expect != __size) \ + CERROR("string_helper: parsing '%s' expect %llu != got %llu\n",\ + value, (u64)expect, __size); \ + __ret; \ +}) +#define test_string_to_size_one(value, expect, def_unit) \ + test_string_to_size_err(value, expect, def_unit, 0) + +static int __init obd_init_checks(void) +{ + __u64 u64val, div64val; + char buf[64]; + int len, ret = 0; + + CDEBUG(D_INFO, "OBD_OBJECT_EOF = %#llx\n", (__u64)OBD_OBJECT_EOF); + + u64val = OBD_OBJECT_EOF; + CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = %#llx\n", u64val); + if (u64val != OBD_OBJECT_EOF) { + CERROR("__u64 %#llx(%d) != 0xffffffffffffffff\n", + u64val, (int)sizeof(u64val)); + ret = -EINVAL; + } + len = snprintf(buf, sizeof(buf), "%#llx", u64val); + if (len != 18) { + CERROR("u64 hex wrong length, strlen(%s)=%d != 18\n", buf, len); + ret = -EINVAL; + } + + div64val = OBD_OBJECT_EOF; + CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = %#llx\n", u64val); + if (u64val != OBD_OBJECT_EOF) { + CERROR("__u64 %#llx(%d) != 0xffffffffffffffff\n", + u64val, (int)sizeof(u64val)); + ret = -EOVERFLOW; + } + if (u64val >> 8 != OBD_OBJECT_EOF >> 8) { + CERROR("__u64 %#llx(%d) != 0xffffffffffffffff\n", + u64val, (int)sizeof(u64val)); + ret = -EOVERFLOW; + } + if (do_div(div64val, 256) != (u64val & 255)) { + CERROR("do_div(%#llx,256) != %llu\n", u64val, u64val & 255); + ret = -EOVERFLOW; + } + if (u64val >> 8 != div64val) { + CERROR("do_div(%#llx,256) %llu != %llu\n", + u64val, div64val, u64val >> 8); + ret = -EOVERFLOW; + } + len = snprintf(buf, sizeof(buf), "%#llx", u64val); + if (len != 18) { + CERROR("u64 hex wrong length! strlen(%s)=%d != 18\n", buf, len); + ret = -EINVAL; + } + len = snprintf(buf, sizeof(buf), "%llu", u64val); + if (len != 20) { + CERROR("u64 wrong length! strlen(%s)=%d != 20\n", buf, len); + ret = -EINVAL; + } + len = snprintf(buf, sizeof(buf), "%lld", u64val); + if (len != 2) { + CERROR("s64 wrong length! 
strlen(%s)=%d != 2\n", buf, len); + ret = -EINVAL; + } + if ((u64val & ~PAGE_MASK) >= PAGE_SIZE) { + CERROR("mask failed: u64val %llu >= %llu\n", u64val, + (__u64)PAGE_SIZE); + ret = -EINVAL; + } + if (ret) + RETURN(ret); + + /* invalid string */ + if (!test_string_to_size_err("256B34", 256, "B", -EINVAL)) { + CERROR("string_helpers: format should be number then units\n"); + ret = -EINVAL; + } + if (!test_string_to_size_err("132OpQ", 132, "B", -EINVAL)) { + CERROR("string_helpers: invalid units should be rejected\n"); + ret = -EINVAL; + } + if (!test_string_to_size_err("1.82B", 1, "B", -EINVAL)) { + CERROR("string_helpers: 'B' with '.' should be invalid\n"); + ret = -EINVAL; + } + if (test_string_to_size_one("343\n", 343, "B")) { + CERROR("string_helpers: should ignore newline\n"); + ret = -EINVAL; + } + if (ret) + RETURN(ret); + + /* memparse unit handling */ + ret = 0; + ret += test_string_to_size_one("0B", 0, "B"); + ret += test_string_to_size_one("512B", 512, "B"); + ret += test_string_to_size_one("1.067kB", 1067, "B"); + ret += test_string_to_size_one("1.042KiB", 1067, "B"); + ret += test_string_to_size_one("8", 8388608, "M"); + ret += test_string_to_size_one("65536", 65536, "B"); + ret += test_string_to_size_one("128", 131072, "K"); + ret += test_string_to_size_one("1M", 1048576, "B"); + ret += test_string_to_size_one("0.5T", 549755813888ULL, "T"); + ret += test_string_to_size_one("256.5G", 275414777856ULL, "G"); + if (ret) + RETURN(ret); + + /* string helper values */ + ret += test_string_to_size_one("16", 16777216, "MiB"); + ret += test_string_to_size_one("8.39MB", 8390000, "MiB"); + ret += test_string_to_size_one("8.00MiB", 8388608, "MiB"); + ret += test_string_to_size_one("256GB", 256000000000ULL, "GiB"); + ret += test_string_to_size_one("238.731GiB", 256335459385ULL, "GiB"); + if (ret) + RETURN(ret); + + /* huge values */ + ret += test_string_to_size_one("0.4TB", 400000000000ULL, "TiB"); + ret += test_string_to_size_one("12.5TiB", 13743895347200ULL, "TiB"); + ret += test_string_to_size_one("2PB", 2000000000000000ULL, "PiB"); + ret += test_string_to_size_one("16PiB", 18014398509481984ULL, "PiB"); + if (ret) + RETURN(ret); + + /* huge values should overflow */ + if (!test_string_to_size_err("1000EiB", 0, "EiB", -EOVERFLOW)) { + CERROR("string_helpers: failed to detect binary overflow\n"); + ret = -EINVAL; + } + if (!test_string_to_size_err("1000EB", 0, "EiB", -EOVERFLOW)) { + CERROR("string_helpers: failed to detect decimal overflow\n"); + ret = -EINVAL; + } + + return ret; +} + +static int __init obdclass_init(void) +{ + int err; + + LCONSOLE_INFO("Lustre: Build Version: "LUSTRE_VERSION_STRING"\n"); + + register_oom_notifier(&obdclass_oom); + + libcfs_kkuc_init(); + + err = obd_init_checks(); + if (err) + return err; + +#ifdef CONFIG_PROC_FS + obd_memory = lprocfs_alloc_stats(OBD_STATS_NUM, + LPROCFS_STATS_FLAG_NONE | + LPROCFS_STATS_FLAG_IRQ_SAFE); + if (obd_memory == NULL) { + CERROR("kmalloc of 'obd_memory' failed\n"); + return -ENOMEM; + } + + lprocfs_counter_init(obd_memory, OBD_MEMORY_STAT, + LPROCFS_CNTR_AVGMINMAX, + "memused", "bytes"); +#endif + err = obd_zombie_impexp_init(); + if (err) + goto cleanup_obd_memory; + + err = class_handle_init(); + if (err) + goto cleanup_zombie_impexp; + + err = misc_register(&obd_psdev); + if (err) { + CERROR("cannot register OBD miscdevice: err = %d\n", err); + goto cleanup_class_handle; + } + + /* Default the dirty page cache cap to 1/2 of system memory. 
+ * For clients with less memory, a larger fraction is needed + * for other purposes (mostly for BGL). */ + if (cfs_totalram_pages() <= 512 << (20 - PAGE_SHIFT)) + obd_max_dirty_pages = cfs_totalram_pages() / 4; + else + obd_max_dirty_pages = cfs_totalram_pages() / 2; + + err = obd_init_caches(); + if (err) + goto cleanup_deregister; + + err = class_procfs_init(); + if (err) + goto cleanup_caches; + + err = lu_global_init(); + if (err) + goto cleanup_class_procfs; + + err = cl_global_init(); + if (err != 0) + goto cleanup_lu_global; + + err = llog_info_init(); + if (err) + goto cleanup_cl_global; + +#ifdef HAVE_SERVER_SUPPORT + err = dt_global_init(); + if (err != 0) + goto cleanup_llog_info; + + err = lu_ucred_global_init(); + if (err != 0) + goto cleanup_dt_global; + + err = lustre_tgt_register_fs(); + if (err && err != -EBUSY) { + /* Don't fail if server code also registers "lustre_tgt" */ + CERROR("obdclass: register fstype 'lustre_tgt' failed: rc = %d\n", + err); + goto cleanup_lu_ucred_global; + } +#endif /* HAVE_SERVER_SUPPORT */ + + /* simulate a late OOM situation now to require all + * alloc'ed/initialized resources to be freed + */ + if (OBD_FAIL_CHECK(OBD_FAIL_OBDCLASS_MODULE_LOAD)) { + /* force error to ensure module will be unloaded/cleaned */ + err = -ENOMEM; + goto cleanup_all; + } + return 0; + +cleanup_all: +#ifdef HAVE_SERVER_SUPPORT + /* fake error but filesystem has been registered */ + lustre_tgt_unregister_fs(); + +cleanup_lu_ucred_global: + lu_ucred_global_fini(); + +cleanup_dt_global: + dt_global_fini(); + +cleanup_llog_info: +#endif /* HAVE_SERVER_SUPPORT */ + llog_info_fini(); + +cleanup_cl_global: + cl_global_fini(); + +cleanup_lu_global: + lu_global_fini(); + +cleanup_class_procfs: + class_procfs_clean(); + +cleanup_caches: + obd_cleanup_caches(); + +cleanup_deregister: + misc_deregister(&obd_psdev); + +cleanup_class_handle: + class_handle_cleanup(); + +cleanup_zombie_impexp: + obd_zombie_impexp_stop(); + +cleanup_obd_memory: +#ifdef CONFIG_PROC_FS + lprocfs_free_stats(&obd_memory); +#endif + + unregister_oom_notifier(&obdclass_oom); + return err; +} + +void obd_update_maxusage(void) +{ + __u64 max; + + max = obd_memory_sum(); + + spin_lock(&obd_updatemax_lock); + if (max > obd_max_alloc) + obd_max_alloc = max; + spin_unlock(&obd_updatemax_lock); +} +EXPORT_SYMBOL(obd_update_maxusage); + +#ifdef CONFIG_PROC_FS +__u64 obd_memory_max(void) +{ + __u64 ret; + + obd_update_maxusage(); + spin_lock(&obd_updatemax_lock); + ret = obd_max_alloc; + spin_unlock(&obd_updatemax_lock); + + return ret; +} +#endif /* CONFIG_PROC_FS */ + +static void __exit obdclass_exit(void) +{ +#ifdef CONFIG_PROC_FS + __u64 memory_leaked; + __u64 memory_max; +#endif /* CONFIG_PROC_FS */ + ENTRY; + + misc_deregister(&obd_psdev); +#ifdef HAVE_SERVER_SUPPORT + lustre_tgt_unregister_fs(); + lu_ucred_global_fini(); + dt_global_fini(); +#endif /* HAVE_SERVER_SUPPORT */ + llog_info_fini(); + cl_global_fini(); + lu_global_fini(); + + obd_cleanup_caches(); + + class_procfs_clean(); + + class_handle_cleanup(); + class_del_uuid(NULL); /* Delete all UUIDs. */ + obd_zombie_impexp_stop(); + +#ifdef CONFIG_PROC_FS + memory_leaked = obd_memory_sum(); + memory_max = obd_memory_max(); + + lprocfs_free_stats(&obd_memory); + /* the below message is checked in test-framework.sh check_mem_leak() */ + CDEBUG((memory_leaked) ? 
D_ERROR : D_INFO, + "obd_memory max: %llu, leaked: %llu\n", + memory_max, memory_leaked); +#endif /* CONFIG_PROC_FS */ + + unregister_oom_notifier(&obdclass_oom); + + EXIT; +} + +void obd_heat_clear(struct obd_heat_instance *instance, int count) +{ + ENTRY; + + memset(instance, 0, sizeof(*instance) * count); + RETURN_EXIT; +} +EXPORT_SYMBOL(obd_heat_clear); + +/* + * The file heat is calculated for every time interval period I. The access + * frequency during each period is counted. The file heat is only recalculated + * at the end of a time period. And a percentage of the former file heat is + * lost when recalculated. The recursion formula to calculate the heat of the + * file f is as follow: + * + * Hi+1(f) = (1-P)*Hi(f)+ P*Ci + * + * Where Hi is the heat value in the period between time points i*I and + * (i+1)*I; Ci is the access count in the period; the symbol P refers to the + * weight of Ci. The larger the value the value of P is, the more influence Ci + * has on the file heat. + */ +void obd_heat_decay(struct obd_heat_instance *instance, __u64 time_second, + unsigned int weight, unsigned int period_second) +{ + u64 second; + + ENTRY; + + if (instance->ohi_time_second > time_second) { + obd_heat_clear(instance, 1); + RETURN_EXIT; + } + + if (instance->ohi_time_second == 0) + RETURN_EXIT; + + for (second = instance->ohi_time_second + period_second; + second < time_second; + second += period_second) { + instance->ohi_heat = instance->ohi_heat * + (256 - weight) / 256 + + instance->ohi_count * weight / 256; + instance->ohi_count = 0; + instance->ohi_time_second = second; + } + RETURN_EXIT; +} +EXPORT_SYMBOL(obd_heat_decay); + +__u64 obd_heat_get(struct obd_heat_instance *instance, unsigned int time_second, + unsigned int weight, unsigned int period_second) +{ + ENTRY; + + obd_heat_decay(instance, time_second, weight, period_second); + + if (instance->ohi_count == 0) + RETURN(instance->ohi_heat); + + RETURN(instance->ohi_heat * (256 - weight) / 256 + + instance->ohi_count * weight / 256); +} +EXPORT_SYMBOL(obd_heat_get); + +void obd_heat_add(struct obd_heat_instance *instance, + unsigned int time_second, __u64 count, + unsigned int weight, unsigned int period_second) +{ + ENTRY; + + obd_heat_decay(instance, time_second, weight, period_second); + if (instance->ohi_time_second == 0) { + instance->ohi_time_second = time_second; + instance->ohi_heat = 0; + instance->ohi_count = count; + } else { + instance->ohi_count += count; + } + RETURN_EXIT; +} +EXPORT_SYMBOL(obd_heat_add); + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre Class Driver"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(obdclass_init); +module_exit(obdclass_exit); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/dt_object.c b/drivers/staging/lustrefsx/lustre/obdclass/dt_object.c new file mode 100644 index 0000000000000..ee17b36c9b337 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/dt_object.c @@ -0,0 +1,1292 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lustre/obdclass/dt_object.c
+ *
+ * Dt Object.
+ * Generic functions from dt_object.h
+ *
+ * Author: Nikita Danilov
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/list.h>
+#include <obd_class.h>
+#include <dt_object.h>
+/* fid_be_to_cpu() */
+#include <lustre_fid.h>
+#include <lustre_nodemap.h>
+#include <lustre_quota.h>
+#include <lustre_lfsck.h>
+
+/* context key constructor/destructor: dt_global_key_init, dt_global_key_fini */
+LU_KEY_INIT(dt_global, struct dt_thread_info);
+LU_KEY_FINI(dt_global, struct dt_thread_info);
+
+struct lu_context_key dt_key = {
+	.lct_tags = LCT_MD_THREAD | LCT_DT_THREAD | LCT_MG_THREAD | LCT_LOCAL,
+	.lct_init = dt_global_key_init,
+	.lct_fini = dt_global_key_fini
+};
+
+/*
+ * no lock is necessary to protect the list, because call-backs
+ * are added during system startup. Please refer to "struct dt_device".
+ */
+void dt_txn_callback_add(struct dt_device *dev, struct dt_txn_callback *cb)
+{
+	list_add(&cb->dtc_linkage, &dev->dd_txn_callbacks);
+}
+EXPORT_SYMBOL(dt_txn_callback_add);
+
+void dt_txn_callback_del(struct dt_device *dev, struct dt_txn_callback *cb)
+{
+	list_del_init(&cb->dtc_linkage);
+}
+EXPORT_SYMBOL(dt_txn_callback_del);
+
+int dt_txn_hook_start(const struct lu_env *env,
+		      struct dt_device *dev, struct thandle *th)
+{
+	int rc = 0;
+	struct dt_txn_callback *cb;
+
+	if (th->th_local)
+		return 0;
+
+	list_for_each_entry(cb, &dev->dd_txn_callbacks, dtc_linkage) {
+		struct thandle *dtc_th = th;
+
+		if (cb->dtc_txn_start == NULL ||
+		    !(cb->dtc_tag & env->le_ctx.lc_tags))
+			continue;
+
+		/*
+		 * Usually dt_txn_hook_start is called from bottom device,
+		 * and if the thandle has th_top, then we need use top
+		 * thandle for the callback in the top thandle layer
+		 */
+		if (th->th_top != NULL)
+			dtc_th = th->th_top;
+
+		rc = cb->dtc_txn_start(env, dtc_th, cb->dtc_cookie);
+		if (rc < 0)
+			break;
+	}
+	return rc;
+}
+EXPORT_SYMBOL(dt_txn_hook_start);
+
+int dt_txn_hook_stop(const struct lu_env *env, struct thandle *th)
+{
+	struct dt_device *dev = th->th_dev;
+	struct dt_txn_callback *cb;
+	int rc = 0;
+
+	if (th->th_local)
+		return 0;
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_DT_TXN_STOP))
+		return -EIO;
+
+	list_for_each_entry(cb, &dev->dd_txn_callbacks, dtc_linkage) {
+		struct thandle *dtc_th = th;
+
+		if (cb->dtc_txn_stop == NULL ||
+		    !(cb->dtc_tag & env->le_ctx.lc_tags))
+			continue;
+
+		/*
+		 * Usually dt_txn_hook_stop is called from bottom device,
+		 * and if the thandle has th_top, then we need use top
+		 * thandle for the callback in the top thandle layer
+		 */
+		if (th->th_top != NULL)
+			dtc_th = th->th_top;
+
+		rc = cb->dtc_txn_stop(env, dtc_th, cb->dtc_cookie);
+		if (rc < 0)
+			break;
+	}
+	return rc;
+}
+EXPORT_SYMBOL(dt_txn_hook_stop);
+
+int dt_device_init(struct dt_device *dev, struct lu_device_type *t)
+{
+	INIT_LIST_HEAD(&dev->dd_txn_callbacks);
+	return lu_device_init(&dev->dd_lu_dev, t);
+}
+EXPORT_SYMBOL(dt_device_init);
+
+void dt_device_fini(struct dt_device *dev)
+{
+	lu_device_fini(&dev->dd_lu_dev);
+}
+EXPORT_SYMBOL(dt_device_fini);
+
+int dt_object_init(struct
dt_object *obj, + struct lu_object_header *h, struct lu_device *d) + +{ + return lu_object_init(&obj->do_lu, h, d); +} +EXPORT_SYMBOL(dt_object_init); + +void dt_object_fini(struct dt_object *obj) +{ + lu_object_fini(&obj->do_lu); +} +EXPORT_SYMBOL(dt_object_fini); + +int dt_try_as_dir(const struct lu_env *env, struct dt_object *obj) +{ + if (obj->do_index_ops == NULL) + obj->do_ops->do_index_try(env, obj, &dt_directory_features); + return obj->do_index_ops != NULL; +} +EXPORT_SYMBOL(dt_try_as_dir); + +enum dt_format_type dt_mode_to_dft(__u32 mode) +{ + enum dt_format_type result; + + switch (mode & S_IFMT) { + case S_IFDIR: + result = DFT_DIR; + break; + case S_IFREG: + result = DFT_REGULAR; + break; + case S_IFLNK: + result = DFT_SYM; + break; + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + result = DFT_NODE; + break; + default: + LASSERTF(0, "invalid mode %o\n", mode); + result = 0; /* Just for satisfying compiler. */ + break; + } + return result; +} +EXPORT_SYMBOL(dt_mode_to_dft); + +/** + * lookup fid for object named \a name in directory \a dir. + */ + +int dt_lookup_dir(const struct lu_env *env, struct dt_object *dir, + const char *name, struct lu_fid *fid) +{ + if (dt_try_as_dir(env, dir)) + return dt_lookup(env, dir, (struct dt_rec *)fid, + (const struct dt_key *)name); + return -ENOTDIR; +} +EXPORT_SYMBOL(dt_lookup_dir); + +/* + * this differs from dt_locate by top_dev as parameter + * but not one from lu_site + */ +struct dt_object *dt_locate_at(const struct lu_env *env, + struct dt_device *dev, + const struct lu_fid *fid, + struct lu_device *top_dev, + const struct lu_object_conf *conf) +{ + struct lu_object *lo; + struct lu_object *n; + + lo = lu_object_find_at(env, top_dev, fid, conf); + if (IS_ERR(lo)) + return ERR_CAST(lo); + + LASSERT(lo != NULL); + + list_for_each_entry(n, &lo->lo_header->loh_layers, lo_linkage) { + if (n->lo_dev == &dev->dd_lu_dev) + return container_of(n, struct dt_object, do_lu); + } + + lu_object_put(env, lo); + return ERR_PTR(-ENOENT); +} +EXPORT_SYMBOL(dt_locate_at); + +/** + * find an object named \a entry in given \a dfh->dfh_o directory. + */ +static int dt_find_entry(const struct lu_env *env, const char *entry, + void *data) +{ + struct dt_find_hint *dfh = data; + struct dt_device *dt = dfh->dfh_dt; + struct lu_fid *fid = dfh->dfh_fid; + struct dt_object *obj = dfh->dfh_o; + int rc; + + rc = dt_lookup_dir(env, obj, entry, fid); + dt_object_put(env, obj); + if (rc == 0) { + obj = dt_locate(env, dt, fid); + if (IS_ERR(obj)) + rc = PTR_ERR(obj); + } + dfh->dfh_o = obj; + + return rc; +} + +/** + * Abstract function which parses path name. This function feeds + * path component to \a entry_func. 
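+ * For example, parsing the path "a/b/c" invokes \a entry_func once for
+ * each of "a", "b" and "c" in turn; empty components produced by
+ * duplicate or trailing slashes are skipped.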
+ */ +int dt_path_parser(const struct lu_env *env, + char *path, dt_entry_func_t entry_func, + void *data) +{ + char *e; + int rc = 0; + + while (1) { + e = strsep(&path, "/"); + if (e == NULL) + break; + + if (e[0] == 0) { + if (!path || path[0] == '\0') + break; + continue; + } + rc = entry_func(env, e, data); + if (rc) + break; + } + + return rc; +} + +struct dt_object * +dt_store_resolve(const struct lu_env *env, struct dt_device *dt, + const char *path, struct lu_fid *fid) +{ + struct dt_thread_info *info = dt_info(env); + struct dt_find_hint *dfh = &info->dti_dfh; + struct dt_object *obj; + int result; + + + dfh->dfh_dt = dt; + dfh->dfh_fid = fid; + + strlcpy(info->dti_buf, path, sizeof(info->dti_buf)); + + result = dt->dd_ops->dt_root_get(env, dt, fid); + if (result == 0) { + obj = dt_locate(env, dt, fid); + if (!IS_ERR(obj)) { + dfh->dfh_o = obj; + result = dt_path_parser(env, info->dti_buf, + dt_find_entry, dfh); + if (result != 0) + obj = ERR_PTR(result); + else + obj = dfh->dfh_o; + } + } else { + obj = ERR_PTR(result); + } + return obj; +} + +static struct dt_object *dt_reg_open(const struct lu_env *env, + struct dt_device *dt, + struct dt_object *p, + const char *name, + struct lu_fid *fid) +{ + struct dt_object *o; + int result; + + result = dt_lookup_dir(env, p, name, fid); + if (result == 0) + o = dt_locate(env, dt, fid); + else + o = ERR_PTR(result); + + return o; +} + +/** + * Open dt object named \a filename from \a dirname directory. + * \param dt dt device + * \param fid on success, object fid is stored in *fid + */ +struct dt_object *dt_store_open(const struct lu_env *env, struct dt_device *dt, + const char *dirname, const char *filename, + struct lu_fid *fid) +{ + struct dt_object *file; + struct dt_object *dir; + + dir = dt_store_resolve(env, dt, dirname, fid); + if (!IS_ERR(dir)) { + file = dt_reg_open(env, dt, dir, filename, fid); + dt_object_put(env, dir); + } else { + file = dir; + } + + return file; +} + +struct dt_object *dt_find_or_create(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object_format *dof, + struct lu_attr *at) +{ + struct dt_object *dto; + struct thandle *th; + int rc; + + ENTRY; + + dto = dt_locate(env, dt, fid); + if (IS_ERR(dto)) + RETURN(dto); + + LASSERT(dto != NULL); + if (dt_object_exists(dto)) + RETURN(dto); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = dt_declare_create(env, dto, at, NULL, dof, th); + if (rc) + GOTO(trans_stop, rc); + + rc = dt_trans_start_local(env, dt, th); + if (rc) + GOTO(trans_stop, rc); + + dt_write_lock(env, dto, 0); + if (dt_object_exists(dto)) + GOTO(unlock, rc = 0); + + CDEBUG(D_OTHER, "create new object "DFID"\n", PFID(fid)); + + rc = dt_create(env, dto, at, NULL, dof, th); + if (rc) + GOTO(unlock, rc); + LASSERT(dt_object_exists(dto)); +unlock: + dt_write_unlock(env, dto); +trans_stop: + dt_trans_stop(env, dt, th); +out: + if (rc) { + dt_object_put(env, dto); + dto = ERR_PTR(rc); + } + + RETURN(dto); +} +EXPORT_SYMBOL(dt_find_or_create); + +/* dt class init function. */ +int dt_global_init(void) +{ + int result; + + LU_CONTEXT_KEY_INIT(&dt_key); + result = lu_context_key_register(&dt_key); + return result; +} + +void dt_global_fini(void) +{ + lu_context_key_degister(&dt_key); +} + +/** + * Generic read helper. May return an error for partial reads. 
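+ * Callers that need the whole buffer filled should use dt_record_read()
+ * instead, which turns a short read into -EFAULT.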
+ * + * \param env lustre environment + * \param dt object to be read + * \param buf lu_buf to be filled, with buffer pointer and length + * \param pos position to start reading, updated as data is read + * + * \retval real size of data read + * \retval -ve errno on failure + */ +int dt_read(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos) +{ + LASSERTF(dt != NULL, "dt is NULL when we want to read record\n"); + return dt->do_body_ops->dbo_read(env, dt, buf, pos); +} +EXPORT_SYMBOL(dt_read); + +/** + * Read structures of fixed size from storage. Unlike dt_read(), using + * dt_record_read() will return an error for partial reads. + * + * \param env lustre environment + * \param dt object to be read + * \param buf lu_buf to be filled, with buffer pointer and length + * \param pos position to start reading, updated as data is read + * + * \retval 0 on successfully reading full buffer + * \retval -EFAULT on short read + * \retval -ve errno on failure + */ +int dt_record_read(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos) +{ + ssize_t size; + + LASSERTF(dt != NULL, "dt is NULL when we want to read record\n"); + + size = dt->do_body_ops->dbo_read(env, dt, buf, pos); + if (size < 0) + return size; + return (size == (ssize_t)buf->lb_len) ? 0 : -EFAULT; +} +EXPORT_SYMBOL(dt_record_read); + +int dt_record_write(const struct lu_env *env, struct dt_object *dt, + const struct lu_buf *buf, loff_t *pos, struct thandle *th) +{ + ssize_t size; + + LASSERTF(dt != NULL, "dt is NULL when we want to write record\n"); + LASSERT(th != NULL); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_write); + + size = dt->do_body_ops->dbo_write(env, dt, buf, pos, th); + if (size < 0) + return size; + return (size == (ssize_t)buf->lb_len) ? 
0 : -EFAULT; +} +EXPORT_SYMBOL(dt_record_write); + +int dt_declare_version_set(const struct lu_env *env, struct dt_object *o, + struct thandle *th) +{ + struct lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; + + LASSERT(o); + vbuf.lb_buf = NULL; + vbuf.lb_len = sizeof(dt_obj_version_t); + return dt_declare_xattr_set(env, o, &vbuf, xname, 0, th); + +} +EXPORT_SYMBOL(dt_declare_version_set); + +void dt_version_set(const struct lu_env *env, struct dt_object *o, + dt_obj_version_t version, struct thandle *th) +{ + struct lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; + int rc; + + LASSERT(o); + vbuf.lb_buf = &version; + vbuf.lb_len = sizeof(version); + + rc = dt_xattr_set(env, o, &vbuf, xname, 0, th); + if (rc < 0) + CDEBUG(D_INODE, "Can't set version, rc %d\n", rc); +} +EXPORT_SYMBOL(dt_version_set); + +dt_obj_version_t dt_version_get(const struct lu_env *env, struct dt_object *o) +{ + struct lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; + dt_obj_version_t version; + int rc; + + LASSERT(o); + vbuf.lb_buf = &version; + vbuf.lb_len = sizeof(version); + rc = dt_xattr_get(env, o, &vbuf, xname); + if (rc != sizeof(version)) { + CDEBUG(D_INODE, "Can't get version, rc %d\n", rc); + version = 0; + } + return version; +} +EXPORT_SYMBOL(dt_version_get); + +/* list of all supported index types */ + +/* directories */ +const struct dt_index_features dt_directory_features; +EXPORT_SYMBOL(dt_directory_features); + +/* scrub iterator */ +const struct dt_index_features dt_otable_features; +EXPORT_SYMBOL(dt_otable_features); + +/* lfsck layout orphan */ +const struct dt_index_features dt_lfsck_layout_orphan_features = { + .dif_flags = 0, + .dif_keysize_min = sizeof(struct lu_fid), + .dif_keysize_max = sizeof(struct lu_fid), + .dif_recsize_min = sizeof(struct lu_orphan_rec_v3), + .dif_recsize_max = sizeof(struct lu_orphan_rec_v3), + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_lfsck_layout_orphan_features); + +/* lfsck layout dangling */ +const struct dt_index_features dt_lfsck_layout_dangling_features = { + .dif_flags = DT_IND_UPDATE, + .dif_keysize_min = sizeof(struct lfsck_layout_dangling_key), + .dif_keysize_max = sizeof(struct lfsck_layout_dangling_key), + .dif_recsize_min = sizeof(struct lu_fid), + .dif_recsize_max = sizeof(struct lu_fid), + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_lfsck_layout_dangling_features); + +/* lfsck namespace */ +const struct dt_index_features dt_lfsck_namespace_features = { + .dif_flags = DT_IND_UPDATE, + .dif_keysize_min = sizeof(struct lu_fid), + .dif_keysize_max = sizeof(struct lu_fid), + .dif_recsize_min = sizeof(__u8), + .dif_recsize_max = sizeof(__u8), + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_lfsck_namespace_features); + +/* accounting indexes */ +const struct dt_index_features dt_acct_features = { + .dif_flags = DT_IND_UPDATE, + .dif_keysize_min = sizeof(__u64), /* 64-bit uid/gid */ + .dif_keysize_max = sizeof(__u64), /* 64-bit uid/gid */ + .dif_recsize_min = sizeof(struct lquota_acct_rec), /* 16 bytes */ + .dif_recsize_max = sizeof(struct lquota_acct_rec), /* 16 bytes */ + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_acct_features); + +/* global quota files */ +const struct dt_index_features dt_quota_glb_features = { + .dif_flags = DT_IND_UPDATE, + /* a different key would have to be used for per-directory quota */ + .dif_keysize_min = sizeof(__u64), /* 64-bit uid/gid */ + .dif_keysize_max = sizeof(__u64), /* 64-bit uid/gid */ + .dif_recsize_min = sizeof(struct lquota_glb_rec), /* 32 bytes */ + .dif_recsize_max = sizeof(struct lquota_glb_rec), /* 32 bytes */ + .dif_ptrsize = 4 
+}; +EXPORT_SYMBOL(dt_quota_glb_features); + +/* slave quota files */ +const struct dt_index_features dt_quota_slv_features = { + .dif_flags = DT_IND_UPDATE, + /* a different key would have to be used for per-directory quota */ + .dif_keysize_min = sizeof(__u64), /* 64-bit uid/gid */ + .dif_keysize_max = sizeof(__u64), /* 64-bit uid/gid */ + .dif_recsize_min = sizeof(struct lquota_slv_rec), /* 8 bytes */ + .dif_recsize_max = sizeof(struct lquota_slv_rec), /* 8 bytes */ + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_quota_slv_features); + +/* nodemap files, nodemap_rec size asserted in nodemap_storage.c */ +const struct dt_index_features dt_nodemap_features = { + .dif_flags = DT_IND_UPDATE, + .dif_keysize_min = sizeof(__u64), /* 64-bit nodemap/record id */ + .dif_keysize_max = sizeof(__u64), /* 64-bit nodemap/record id */ + .dif_recsize_min = sizeof(union nodemap_rec), /* 32 bytes */ + .dif_recsize_max = sizeof(union nodemap_rec), /* 32 bytes */ + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_nodemap_features); + +/* + * helper function returning what dt_index_features structure should be used + * based on the FID sequence. This is used by OBD_IDX_READ RPC + */ +static inline const struct dt_index_features *dt_index_feat_select(__u64 seq, + __u32 mode) +{ + if (seq == FID_SEQ_QUOTA_GLB) { + /* global quota index */ + if (!S_ISREG(mode)) + /* global quota index should be a regular file */ + return ERR_PTR(-ENOENT); + return &dt_quota_glb_features; + } else if (seq == FID_SEQ_QUOTA) { + /* quota slave index */ + if (!S_ISREG(mode)) + /* slave index should be a regular file */ + return ERR_PTR(-ENOENT); + return &dt_quota_slv_features; + } else if (seq == FID_SEQ_LAYOUT_RBTREE){ + return &dt_lfsck_layout_orphan_features; + } else if (seq >= FID_SEQ_NORMAL) { + /* object is part of the namespace, verify that it is a + * directory */ + if (!S_ISDIR(mode)) + /* sorry, we can only deal with directory */ + return ERR_PTR(-ENOTDIR); + return &dt_directory_features; + } + + return ERR_PTR(-EOPNOTSUPP); +} + +/* + * Fill a lu_idxpage with key/record pairs read for transfer via OBD_IDX_READ + * RPC + * + * \param env - is the environment passed by the caller + * \param lp - is a pointer to the lu_page to fill + * \param nob - is the maximum number of bytes that should be copied + * \param iops - is the index operation vector associated with the index object + * \param it - is a pointer to the current iterator + * \param attr - is the index attribute to pass to iops->rec() + * \param arg - is a pointer to the idx_info structure + */ +static int dt_index_page_build(const struct lu_env *env, union lu_page *lp, + size_t nob, const struct dt_it_ops *iops, + struct dt_it *it, __u32 attr, void *arg) +{ + struct idx_info *ii = (struct idx_info *)arg; + struct lu_idxpage *lip = &lp->lp_idx; + char *entry; + __u64 hash; + __u16 hashsize = 0; + __u16 keysize = 0; + __u16 recsize; + int rc; + + ENTRY; + + if (nob < LIP_HDR_SIZE) + return -EINVAL; + + /* initialize the header of the new container */ + memset(lip, 0, LIP_HDR_SIZE); + lip->lip_magic = LIP_MAGIC; + nob -= LIP_HDR_SIZE; + + /* client wants to the 64-bit hash value associated with each record */ + if (!(ii->ii_flags & II_FL_NOHASH)) + hashsize = sizeof(hash); + + entry = lip->lip_entries; + do { + /* fetch 64-bit hash value */ + hash = iops->store(env, it); + ii->ii_hash_end = hash; + + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_IDX_READ_BREAK)) { + if (lip->lip_nr != 0) + GOTO(out, rc = 0); + } + + if (!(ii->ii_flags & II_FL_NOKEY)) { + keysize = iops->key_size(env, it); + if 
(!(ii->ii_flags & II_FL_VARKEY) && + keysize != ii->ii_keysize) { + CERROR("keysize mismatch %hu != %hu.\n", + keysize, ii->ii_keysize); + GOTO(out, rc = -EINVAL); + } + } + + /* and finally the record */ + if (ii->ii_flags & II_FL_VARREC) + recsize = iops->rec_size(env, it, attr); + else + recsize = ii->ii_recsize; + + if (nob < hashsize + keysize + recsize) { + if (lip->lip_nr == 0) + GOTO(out, rc = -E2BIG); + GOTO(out, rc = 0); + } + + rc = iops->rec(env, it, + (struct dt_rec *)(entry + hashsize + keysize), + attr); + if (!rc) { + if (hashsize) + memcpy(entry, &hash, hashsize); + if (keysize) { + struct dt_key *key; + + key = iops->key(env, it); + memcpy(entry + hashsize, key, keysize); + } + /* hash/key/record successfully copied! */ + lip->lip_nr++; + if (unlikely(lip->lip_nr == 1 && ii->ii_count == 0)) + ii->ii_hash_start = hash; + entry += hashsize + keysize + recsize; + nob -= hashsize + keysize + recsize; + } else if (rc != -ESTALE) { + GOTO(out, rc); + } + + /* move on to the next record */ + do { + rc = iops->next(env, it); + } while (rc == -ESTALE); + } while (rc == 0); + + GOTO(out, rc); +out: + if (rc >= 0 && lip->lip_nr > 0) + /* one more container */ + ii->ii_count++; + if (rc > 0) + /* no more entries */ + ii->ii_hash_end = II_END_OFF; + return rc; +} + + +/* + * Walk index and fill lu_page containers with key/record pairs + * + * \param env - is the environment passed by the caller + * \param obj - is the index object to parse + * \param rdpg - is the lu_rdpg descriptor associated with the transfer + * \param filler - is the callback function responsible for filling a lu_page + * with key/record pairs in the format wanted by the caller. + * If NULL, uses dt_index_page_build + * \param arg - is an opaq argument passed to the filler function + * + * \retval sum (in bytes) of all filled lu_pages + * \retval -ve errno on failure + */ +int dt_index_walk(const struct lu_env *env, struct dt_object *obj, + const struct lu_rdpg *rdpg, dt_index_page_build_t filler, + void *arg) +{ + struct dt_it *it; + const struct dt_it_ops *iops; + size_t pageidx, nob, nlupgs = 0; + int rc; + ENTRY; + + LASSERT(rdpg->rp_pages != NULL); + LASSERT(obj->do_index_ops != NULL); + + if (filler == NULL) + filler = dt_index_page_build; + + nob = rdpg->rp_count; + if (nob == 0) + RETURN(-EFAULT); + + /* Iterate through index and fill containers from @rdpg */ + iops = &obj->do_index_ops->dio_it; + LASSERT(iops != NULL); + it = iops->init(env, obj, rdpg->rp_attrs); + if (IS_ERR(it)) + RETURN(PTR_ERR(it)); + + rc = iops->load(env, it, rdpg->rp_hash); + if (rc == 0) { + /* + * Iterator didn't find record with exactly the key requested. + * + * It is currently either + * + * - positioned above record with key less than + * requested---skip it. + * - or not positioned at all (is in IAM_IT_SKEWED + * state)---position it on the next item. + */ + rc = iops->next(env, it); + } else if (rc > 0) { + rc = 0; + } else { + if (rc == -ENODATA) + rc = 0; + GOTO(out, rc); + } + + /* + * Fill containers one after the other. There might be multiple + * containers per physical page. + * + * At this point and across for-loop: + * rc == 0 -> ok, proceed. + * rc > 0 -> end of index. + * rc < 0 -> error. 
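+	 *
+	 * Each physical page in rdpg->rp_pages is kmap()'d once and packed
+	 * with up to LU_PAGE_COUNT lu_page containers before moving on to
+	 * the next page.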
+ */ + for (pageidx = 0; rc == 0 && nob > 0; pageidx++) { + union lu_page *lp; + int i; + + LASSERT(pageidx < rdpg->rp_npages); + lp = kmap(rdpg->rp_pages[pageidx]); + + /* fill lu pages */ + for (i = 0; i < LU_PAGE_COUNT; i++, lp++, nob -= LU_PAGE_SIZE) { + rc = filler(env, lp, min_t(size_t, nob, LU_PAGE_SIZE), + iops, it, rdpg->rp_attrs, arg); + if (rc < 0) + break; + /* one more lu_page */ + nlupgs++; + if (rc > 0) + /* end of index */ + break; + } + kunmap(rdpg->rp_pages[i]); + } + +out: + iops->put(env, it); + iops->fini(env, it); + + if (rc >= 0) + rc = min_t(size_t, nlupgs * LU_PAGE_SIZE, rdpg->rp_count); + + RETURN(rc); +} +EXPORT_SYMBOL(dt_index_walk); + +/** + * Walk key/record pairs of an index and copy them into 4KB containers to be + * transferred over the network. This is the common handler for OBD_IDX_READ + * RPC processing. + * + * \param env - is the environment passed by the caller + * \param dev - is the dt_device storing the index + * \param ii - is the idx_info structure packed by the client in the + * OBD_IDX_READ request + * \param rdpg - is the lu_rdpg descriptor + * + * \retval on success, return sum (in bytes) of all filled containers + * \retval appropriate error otherwise. + */ +int dt_index_read(const struct lu_env *env, struct dt_device *dev, + struct idx_info *ii, const struct lu_rdpg *rdpg) +{ + const struct dt_index_features *feat; + struct dt_object *obj; + int rc; + ENTRY; + + /* + * rp_count shouldn't be null and should be a multiple of the container + * size + */ + if (rdpg->rp_count == 0 || (rdpg->rp_count & (LU_PAGE_SIZE - 1)) != 0) + RETURN(-EFAULT); + + if (!fid_is_quota(&ii->ii_fid) && !fid_is_layout_rbtree(&ii->ii_fid) && + !fid_is_norm(&ii->ii_fid)) + RETURN(-EOPNOTSUPP); + + /* lookup index object subject to the transfer */ + obj = dt_locate(env, dev, &ii->ii_fid); + if (IS_ERR(obj)) + RETURN(PTR_ERR(obj)); + if (dt_object_exists(obj) == 0) + GOTO(out, rc = -ENOENT); + + /* fetch index features associated with index object */ + feat = dt_index_feat_select(fid_seq(&ii->ii_fid), + lu_object_attr(&obj->do_lu)); + if (IS_ERR(feat)) + GOTO(out, rc = PTR_ERR(feat)); + + /* load index feature if not done already */ + if (obj->do_index_ops == NULL) { + rc = obj->do_ops->do_index_try(env, obj, feat); + if (rc) + GOTO(out, rc); + } + + /* fill ii_flags with supported index features */ + ii->ii_flags &= (II_FL_NOHASH | II_FL_NOKEY | II_FL_VARKEY | + II_FL_VARREC); + + if (!(feat->dif_flags & DT_IND_VARKEY)) + ii->ii_keysize = feat->dif_keysize_max; + + if (!(feat->dif_flags & DT_IND_VARREC)) + ii->ii_recsize = feat->dif_recsize_max; + + if (feat->dif_flags & DT_IND_NONUNQ) + /* key isn't necessarily unique */ + ii->ii_flags |= II_FL_NONUNQ; + + if (!fid_is_layout_rbtree(&ii->ii_fid)) { + dt_read_lock(env, obj, 0); + /* fetch object version before walking the index */ + ii->ii_version = dt_version_get(env, obj); + } + + /* walk the index and fill lu_idxpages with key/record pairs */ + rc = dt_index_walk(env, obj, rdpg, dt_index_page_build, ii); + if (!fid_is_layout_rbtree(&ii->ii_fid)) + dt_read_unlock(env, obj); + + if (rc == 0) { + /* index is empty */ + LASSERT(ii->ii_count == 0); + ii->ii_hash_end = II_END_OFF; + } + + GOTO(out, rc); +out: + dt_object_put(env, obj); + return rc; +} +EXPORT_SYMBOL(dt_index_read); + +#ifdef CONFIG_PROC_FS +int lprocfs_dt_blksize_seq_show(struct seq_file *m, void *v) +{ + struct dt_device *dt = m->private; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) + seq_printf(m, "%u\n", (unsigned) 
osfs.os_bsize); + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_blksize_seq_show); + +int lprocfs_dt_kbytestotal_seq_show(struct seq_file *m, void *v) +{ + struct dt_device *dt = m->private; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_blocks; + + while (blk_size >>= 1) + result <<= 1; + + seq_printf(m, "%llu\n", result); + } + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_kbytestotal_seq_show); + +int lprocfs_dt_kbytesfree_seq_show(struct seq_file *m, void *v) +{ + struct dt_device *dt = m->private; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bfree; + + while (blk_size >>= 1) + result <<= 1; + + seq_printf(m, "%llu\n", result); + } + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_kbytesfree_seq_show); + +int lprocfs_dt_kbytesavail_seq_show(struct seq_file *m, void *v) +{ + struct dt_device *dt = m->private; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bavail; + + while (blk_size >>= 1) + result <<= 1; + + seq_printf(m, "%llu\n", result); + } + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_kbytesavail_seq_show); + +int lprocfs_dt_filestotal_seq_show(struct seq_file *m, void *v) +{ + struct dt_device *dt = m->private; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) + seq_printf(m, "%llu\n", osfs.os_files); + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_filestotal_seq_show); + +int lprocfs_dt_filesfree_seq_show(struct seq_file *m, void *v) +{ + struct dt_device *dt = m->private; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) + seq_printf(m, "%llu\n", osfs.os_ffree); + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_filesfree_seq_show); + +#endif /* CONFIG_PROC_FS */ + +static ssize_t uuid_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct lu_device *lu = dt2lu_dev(dt); + + if (!lu->ld_obd) + return -ENODEV; + + return sprintf(buf, "%s\n", lu->ld_obd->obd_uuid.uuid); +} +LUSTRE_RO_ATTR(uuid); + +static ssize_t blocksize_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct obd_statfs osfs; + int rc; + + rc = dt_statfs(NULL, dt, &osfs); + if (rc) + return rc; + + return sprintf(buf, "%u\n", (unsigned) osfs.os_bsize); +} +LUSTRE_RO_ATTR(blocksize); + +static ssize_t kbytestotal_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct obd_statfs osfs; + u32 blk_size; + u64 result; + int rc; + + rc = dt_statfs(NULL, dt, &osfs); + if (rc) + return rc; + + blk_size = osfs.os_bsize >> 10; + result = osfs.os_blocks; + + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); +} +LUSTRE_RO_ATTR(kbytestotal); + +static ssize_t kbytesfree_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct obd_statfs osfs; + u32 blk_size; + u64 result; + int rc; + + rc = dt_statfs(NULL, dt, &osfs); + if (rc) + return rc; + + blk_size = osfs.os_bsize >> 10; + result = osfs.os_bfree; + + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); +} 
+LUSTRE_RO_ATTR(kbytesfree); + +static ssize_t kbytesavail_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct obd_statfs osfs; + u32 blk_size; + u64 result; + int rc; + + rc = dt_statfs(NULL, dt, &osfs); + if (rc) + return rc; + + blk_size = osfs.os_bsize >> 10; + result = osfs.os_bavail; + + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); +} +LUSTRE_RO_ATTR(kbytesavail); + +static ssize_t filestotal_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct obd_statfs osfs; + int rc; + + rc = dt_statfs(NULL, dt, &osfs); + if (rc) + return rc; + + return sprintf(buf, "%llu\n", osfs.os_files); +} +LUSTRE_RO_ATTR(filestotal); + +static ssize_t filesfree_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct obd_statfs osfs; + int rc; + + rc = dt_statfs(NULL, dt, &osfs); + if (rc) + return rc; + + return sprintf(buf, "%llu\n", osfs.os_ffree); +} +LUSTRE_RO_ATTR(filesfree); + +static const struct attribute *dt_def_attrs[] = { + &lustre_attr_uuid.attr, + &lustre_attr_blocksize.attr, + &lustre_attr_kbytestotal.attr, + &lustre_attr_kbytesfree.attr, + &lustre_attr_kbytesavail.attr, + &lustre_attr_filestotal.attr, + &lustre_attr_filesfree.attr, + NULL, +}; + +static void dt_sysfs_release(struct kobject *kobj) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + + debugfs_remove_recursive(dt->dd_debugfs_entry); + dt->dd_debugfs_entry = NULL; + + complete(&dt->dd_kobj_unregister); +} + +int dt_tunables_fini(struct dt_device *dt) +{ + if (!dt) + return -EINVAL; + + if (dt->dd_def_attrs) + sysfs_remove_files(&dt->dd_kobj, dt->dd_def_attrs); + + kobject_put(&dt->dd_kobj); + wait_for_completion(&dt->dd_kobj_unregister); + + return 0; +} +EXPORT_SYMBOL(dt_tunables_fini); + +int dt_tunables_init(struct dt_device *dt, struct obd_type *type, + const char *name, struct ldebugfs_vars *list) +{ + int rc; + + dt->dd_ktype.sysfs_ops = &lustre_sysfs_ops; + dt->dd_ktype.release = dt_sysfs_release; + + init_completion(&dt->dd_kobj_unregister); + rc = kobject_init_and_add(&dt->dd_kobj, &dt->dd_ktype, &type->typ_kobj, + "%s", name); + if (rc) + return rc; + + dt->dd_def_attrs = dt_def_attrs; + + rc = sysfs_create_files(&dt->dd_kobj, dt->dd_def_attrs); + if (rc) { + kobject_put(&dt->dd_kobj); + return rc; + } + + /* + * No need to register debugfs if no enteries. This allows us to + * choose between using dt_device or obd_device for debugfs. + */ + if (!list) + return rc; + + dt->dd_debugfs_entry = debugfs_create_dir(name, + type->typ_debugfs_entry); + ldebugfs_add_vars(dt->dd_debugfs_entry, list, dt); + + return rc; +} +EXPORT_SYMBOL(dt_tunables_init); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/genops.c b/drivers/staging/lustrefsx/lustre/obdclass/genops.c new file mode 100644 index 0000000000000..d8a689024659d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/genops.c @@ -0,0 +1,2348 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lustre/obdclass/genops.c
+ *
+ * These are the only exported functions, they provide some generic
+ * infrastructure for managing object devices
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/pid_namespace.h>
+#include <linux/workqueue.h>
+#include <lustre_compat.h>
+#include <obd_class.h>
+#include <lustre_log.h>
+#include <lprocfs_status.h>
+#include <lustre_disk.h>
+#include <lustre_kernelcomm.h>
+
+DEFINE_RWLOCK(obd_dev_lock);
+static struct obd_device *obd_devs[MAX_OBD_DEVICES];
+
+static struct kmem_cache *obd_device_cachep;
+static struct kobj_type class_ktype;
+static struct workqueue_struct *zombie_wq;
+
+static void obd_zombie_export_add(struct obd_export *exp);
+static void obd_zombie_import_add(struct obd_import *imp);
+static void print_export_data(struct obd_export *exp,
+			      const char *status, int locks, int debug_level);
+
+static LIST_HEAD(obd_stale_exports);
+static DEFINE_SPINLOCK(obd_stale_export_lock);
+static atomic_t obd_stale_export_num = ATOMIC_INIT(0);
+
+/*
+ * support functions: we could use inter-module communication, but this
+ * is more portable to other OS's
+ */
+static struct obd_device *obd_device_alloc(void)
+{
+	struct obd_device *obd;
+
+	OBD_SLAB_ALLOC_PTR_GFP(obd, obd_device_cachep, GFP_NOFS);
+	if (obd != NULL) {
+		obd->obd_magic = OBD_DEVICE_MAGIC;
+	}
+	return obd;
+}
+
+static void obd_device_free(struct obd_device *obd)
+{
+	LASSERT(obd != NULL);
+	LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "obd %p obd_magic %08x != %08x\n",
+		 obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+	if (obd->obd_namespace != NULL) {
+		CERROR("obd %p: namespace %p was not properly cleaned up (obd_force=%d)!\n",
+		       obd, obd->obd_namespace, obd->obd_force);
+		LBUG();
+	}
+	lu_ref_fini(&obd->obd_reference);
+	OBD_SLAB_FREE_PTR(obd, obd_device_cachep);
+}
+
+struct obd_type *class_search_type(const char *name)
+{
+	struct kobject *kobj = kset_find_obj(lustre_kset, name);
+
+	if (kobj && kobj->ktype == &class_ktype)
+		return container_of(kobj, struct obd_type, typ_kobj);
+
+	kobject_put(kobj);
+	return NULL;
+}
+EXPORT_SYMBOL(class_search_type);
+
+struct obd_type *class_get_type(const char *name)
+{
+	struct obd_type *type;
+
+	type = class_search_type(name);
+#ifdef HAVE_MODULE_LOADING_SUPPORT
+	if (!type) {
+		const char *modname = name;
+
+#ifdef HAVE_SERVER_SUPPORT
+		if (strcmp(modname, "obdfilter") == 0)
+			modname = "ofd";
+
+		if (strcmp(modname, LUSTRE_LWP_NAME) == 0)
+			modname = LUSTRE_OSP_NAME;
+
+		if (!strncmp(modname, LUSTRE_MDS_NAME, strlen(LUSTRE_MDS_NAME)))
+			modname = LUSTRE_MDT_NAME;
+#endif /* HAVE_SERVER_SUPPORT */
+
+		if (!request_module("%s", modname)) {
+			CDEBUG(D_INFO, "Loaded module '%s'\n", modname);
+			type = class_search_type(name);
+		} else {
+			LCONSOLE_ERROR_MSG(0x158, "Can't load module '%s'\n",
+					   modname);
+		}
+	}
+#endif
+	if (type) {
+		if (try_module_get(type->typ_dt_ops->o_owner)) {
+			atomic_inc(&type->typ_refcnt);
+			/*
class_search_type() returned a counted reference, + * but we don't need that count any more as + * we have one through typ_refcnt. + */ + kobject_put(&type->typ_kobj); + } else { + kobject_put(&type->typ_kobj); + type = NULL; + } + } + return type; +} + +void class_put_type(struct obd_type *type) +{ + LASSERT(type); + module_put(type->typ_dt_ops->o_owner); + atomic_dec(&type->typ_refcnt); +} + +static void class_sysfs_release(struct kobject *kobj) +{ + struct obd_type *type = container_of(kobj, struct obd_type, typ_kobj); + + debugfs_remove_recursive(type->typ_debugfs_entry); + type->typ_debugfs_entry = NULL; + + if (type->typ_lu) + lu_device_type_fini(type->typ_lu); + +#ifdef CONFIG_PROC_FS + if (type->typ_name && type->typ_procroot) + remove_proc_subtree(type->typ_name, proc_lustre_root); +#endif + OBD_FREE(type, sizeof(*type)); +} + +static struct kobj_type class_ktype = { + .sysfs_ops = &lustre_sysfs_ops, + .release = class_sysfs_release, +}; + +#ifdef HAVE_SERVER_SUPPORT +struct obd_type *class_add_symlinks(const char *name, bool enable_proc) +{ + struct dentry *symlink; + struct obd_type *type; + int rc; + + type = class_search_type(name); + if (type) { + kobject_put(&type->typ_kobj); + return ERR_PTR(-EEXIST); + } + + OBD_ALLOC(type, sizeof(*type)); + if (!type) + return ERR_PTR(-ENOMEM); + + type->typ_kobj.kset = lustre_kset; + rc = kobject_init_and_add(&type->typ_kobj, &class_ktype, + &lustre_kset->kobj, "%s", name); + if (rc) + return ERR_PTR(rc); + + symlink = debugfs_create_dir(name, debugfs_lustre_root); + type->typ_debugfs_entry = symlink; + type->typ_sym_filter = true; + + if (enable_proc) { + type->typ_procroot = lprocfs_register(name, proc_lustre_root, + NULL, NULL); + if (IS_ERR(type->typ_procroot)) { + CERROR("%s: can't create compat proc entry: %d\n", + name, (int)PTR_ERR(type->typ_procroot)); + type->typ_procroot = NULL; + } + } + + return type; +} +EXPORT_SYMBOL(class_add_symlinks); +#endif /* HAVE_SERVER_SUPPORT */ + +#define CLASS_MAX_NAME 1024 + +int class_register_type(const struct obd_ops *dt_ops, + const struct md_ops *md_ops, + bool enable_proc, + const char *name, struct lu_device_type *ldt) +{ + struct obd_type *type; + int rc; + + ENTRY; + /* sanity check */ + LASSERT(strnlen(name, CLASS_MAX_NAME) < CLASS_MAX_NAME); + + type = class_search_type(name); + if (type) { +#ifdef HAVE_SERVER_SUPPORT + if (type->typ_sym_filter) + goto dir_exist; +#endif /* HAVE_SERVER_SUPPORT */ + kobject_put(&type->typ_kobj); + CDEBUG(D_IOCTL, "Type %s already registered\n", name); + RETURN(-EEXIST); + } + + OBD_ALLOC(type, sizeof(*type)); + if (type == NULL) + RETURN(-ENOMEM); + + type->typ_lu = ldt ? 
OBD_LU_TYPE_SETUP : NULL; + type->typ_kobj.kset = lustre_kset; + kobject_init(&type->typ_kobj, &class_ktype); +#ifdef HAVE_SERVER_SUPPORT +dir_exist: +#endif /* HAVE_SERVER_SUPPORT */ + + type->typ_dt_ops = dt_ops; + type->typ_md_ops = md_ops; + +#ifdef HAVE_SERVER_SUPPORT + if (type->typ_sym_filter) { + type->typ_sym_filter = false; + kobject_put(&type->typ_kobj); + goto setup_ldt; + } +#endif +#ifdef CONFIG_PROC_FS + if (enable_proc && !type->typ_procroot) { + type->typ_procroot = lprocfs_register(name, + proc_lustre_root, + NULL, type); + if (IS_ERR(type->typ_procroot)) { + rc = PTR_ERR(type->typ_procroot); + type->typ_procroot = NULL; + GOTO(failed, rc); + } + } +#endif + type->typ_debugfs_entry = debugfs_create_dir(name, debugfs_lustre_root); + + rc = kobject_add(&type->typ_kobj, &lustre_kset->kobj, "%s", name); + if (rc) + GOTO(failed, rc); +#ifdef HAVE_SERVER_SUPPORT +setup_ldt: +#endif + if (ldt) { + rc = lu_device_type_init(ldt); + smp_store_release(&type->typ_lu, rc ? NULL : ldt); + wake_up_var(&type->typ_lu); + if (rc) + GOTO(failed, rc); + } + + RETURN(0); + +failed: + kobject_put(&type->typ_kobj); + + RETURN(rc); +} +EXPORT_SYMBOL(class_register_type); + +int class_unregister_type(const char *name) +{ + struct obd_type *type = class_search_type(name); + int rc = 0; + ENTRY; + + if (!type) { + CERROR("unknown obd type\n"); + RETURN(-EINVAL); + } + + if (atomic_read(&type->typ_refcnt)) { + CERROR("type %s has refcount (%d)\n", name, + atomic_read(&type->typ_refcnt)); + /* This is a bad situation, let's make the best of it */ + /* Remove ops, but leave the name for debugging */ + type->typ_dt_ops = NULL; + type->typ_md_ops = NULL; + GOTO(out_put, rc = -EBUSY); + } + + /* Put the final ref */ + kobject_put(&type->typ_kobj); +out_put: + /* Put the ref returned by class_search_type() */ + kobject_put(&type->typ_kobj); + + RETURN(rc); +} /* class_unregister_type */ +EXPORT_SYMBOL(class_unregister_type); + +/** + * Create a new obd device. + * + * Allocate the new obd_device and initialize it. + * + * \param[in] type_name obd device type string. + * \param[in] name obd device name. 
+ * \param[in] uuid obd device UUID + * + * \retval newdev pointer to created obd_device + * \retval ERR_PTR(errno) on error + */ +struct obd_device *class_newdev(const char *type_name, const char *name, + const char *uuid) +{ + struct obd_device *newdev; + struct obd_type *type = NULL; + ENTRY; + + if (strlen(name) >= MAX_OBD_NAME) { + CERROR("name/uuid must be < %u bytes long\n", MAX_OBD_NAME); + RETURN(ERR_PTR(-EINVAL)); + } + + type = class_get_type(type_name); + if (type == NULL){ + CERROR("OBD: unknown type: %s\n", type_name); + RETURN(ERR_PTR(-ENODEV)); + } + + newdev = obd_device_alloc(); + if (newdev == NULL) { + class_put_type(type); + RETURN(ERR_PTR(-ENOMEM)); + } + LASSERT(newdev->obd_magic == OBD_DEVICE_MAGIC); + strncpy(newdev->obd_name, name, sizeof(newdev->obd_name) - 1); + newdev->obd_type = type; + newdev->obd_minor = -1; + + rwlock_init(&newdev->obd_pool_lock); + newdev->obd_pool_limit = 0; + newdev->obd_pool_slv = 0; + + INIT_LIST_HEAD(&newdev->obd_exports); + newdev->obd_num_exports = 0; + newdev->obd_grant_check_threshold = 100; + INIT_LIST_HEAD(&newdev->obd_unlinked_exports); + INIT_LIST_HEAD(&newdev->obd_delayed_exports); + INIT_LIST_HEAD(&newdev->obd_exports_timed); + INIT_LIST_HEAD(&newdev->obd_nid_stats); + spin_lock_init(&newdev->obd_nid_lock); + spin_lock_init(&newdev->obd_dev_lock); + mutex_init(&newdev->obd_dev_mutex); + spin_lock_init(&newdev->obd_osfs_lock); + /* newdev->obd_osfs_age must be set to a value in the distant + * past to guarantee a fresh statfs is fetched on mount. */ + newdev->obd_osfs_age = ktime_get_seconds() - 1000; + + /* XXX belongs in setup not attach */ + init_rwsem(&newdev->obd_observer_link_sem); + /* recovery data */ + spin_lock_init(&newdev->obd_recovery_task_lock); + init_waitqueue_head(&newdev->obd_next_transno_waitq); + init_waitqueue_head(&newdev->obd_evict_inprogress_waitq); + INIT_LIST_HEAD(&newdev->obd_req_replay_queue); + INIT_LIST_HEAD(&newdev->obd_lock_replay_queue); + INIT_LIST_HEAD(&newdev->obd_final_req_queue); + INIT_LIST_HEAD(&newdev->obd_evict_list); + INIT_LIST_HEAD(&newdev->obd_lwp_list); + + llog_group_init(&newdev->obd_olg); + /* Detach drops this */ + atomic_set(&newdev->obd_refcount, 1); + lu_ref_init(&newdev->obd_reference); + lu_ref_add(&newdev->obd_reference, "newdev", newdev); + + newdev->obd_conn_inprogress = 0; + + strncpy(newdev->obd_uuid.uuid, uuid, UUID_MAX); + + CDEBUG(D_IOCTL, "Allocate new device %s (%p)\n", + newdev->obd_name, newdev); + + return newdev; +} + +/** + * Free obd device. 
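+ * Runs obd_cleanup() if the device was being stopped, then frees the
+ * structure and drops the reference it holds on its obd_type.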
+ * + * \param[in] obd obd_device to be freed + * + * \retval none + */ +void class_free_dev(struct obd_device *obd) +{ + struct obd_type *obd_type = obd->obd_type; + + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "%p obd_magic %08x " + "!= %08x\n", obd, obd->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(obd->obd_minor == -1 || obd_devs[obd->obd_minor] == obd, + "obd %p != obd_devs[%d] %p\n", + obd, obd->obd_minor, obd_devs[obd->obd_minor]); + LASSERTF(atomic_read(&obd->obd_refcount) == 0, + "obd_refcount should be 0, not %d\n", + atomic_read(&obd->obd_refcount)); + LASSERT(obd_type != NULL); + + CDEBUG(D_INFO, "Release obd device %s obd_type name = %s\n", + obd->obd_name, obd->obd_type->typ_name); + + CDEBUG(D_CONFIG, "finishing cleanup of obd %s (%s)\n", + obd->obd_name, obd->obd_uuid.uuid); + if (obd->obd_stopping) { + int err; + + /* If we're not stopping, we were never set up */ + err = obd_cleanup(obd); + if (err) + CERROR("Cleanup %s returned %d\n", + obd->obd_name, err); + } + + obd_device_free(obd); + + class_put_type(obd_type); +} + +/** + * Unregister obd device. + * + * Free slot in obd_dev[] used by \a obd. + * + * \param[in] new_obd obd_device to be unregistered + * + * \retval none + */ +void class_unregister_device(struct obd_device *obd) +{ + write_lock(&obd_dev_lock); + if (obd->obd_minor >= 0) { + LASSERT(obd_devs[obd->obd_minor] == obd); + obd_devs[obd->obd_minor] = NULL; + obd->obd_minor = -1; + } + write_unlock(&obd_dev_lock); +} + +/** + * Register obd device. + * + * Find free slot in obd_devs[], fills it with \a new_obd. + * + * \param[in] new_obd obd_device to be registered + * + * \retval 0 success + * \retval -EEXIST device with this name is registered + * \retval -EOVERFLOW obd_devs[] is full + */ +int class_register_device(struct obd_device *new_obd) +{ + int ret = 0; + int i; + int new_obd_minor = 0; + bool minor_assign = false; + bool retried = false; + +again: + write_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd != NULL && + (strcmp(new_obd->obd_name, obd->obd_name) == 0)) { + + if (!retried) { + write_unlock(&obd_dev_lock); + + /* the obd_device could be waited to be + * destroyed by the "obd_zombie_impexp_thread". 
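+			 * Flush the zombie queue once and rescan
+			 * before failing with -EEXIST.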
+ */ + obd_zombie_barrier(); + retried = true; + goto again; + } + + CERROR("%s: already exists, won't add\n", + obd->obd_name); + /* in case we found a free slot before duplicate */ + minor_assign = false; + ret = -EEXIST; + break; + } + if (!minor_assign && obd == NULL) { + new_obd_minor = i; + minor_assign = true; + } + } + + if (minor_assign) { + new_obd->obd_minor = new_obd_minor; + LASSERTF(obd_devs[new_obd_minor] == NULL, "obd_devs[%d] " + "%p\n", new_obd_minor, obd_devs[new_obd_minor]); + obd_devs[new_obd_minor] = new_obd; + } else { + if (ret == 0) { + ret = -EOVERFLOW; + CERROR("%s: all %u/%u devices used, increase " + "MAX_OBD_DEVICES: rc = %d\n", new_obd->obd_name, + i, class_devno_max(), ret); + } + } + write_unlock(&obd_dev_lock); + + RETURN(ret); +} + +static int class_name2dev_nolock(const char *name) +{ + int i; + + if (!name) + return -1; + + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd && strcmp(name, obd->obd_name) == 0) { + /* Make sure we finished attaching before we give + out any references */ + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + if (obd->obd_attached) { + return i; + } + break; + } + } + + return -1; +} + +int class_name2dev(const char *name) +{ + int i; + + if (!name) + return -1; + + read_lock(&obd_dev_lock); + i = class_name2dev_nolock(name); + read_unlock(&obd_dev_lock); + + return i; +} +EXPORT_SYMBOL(class_name2dev); + +struct obd_device *class_name2obd(const char *name) +{ + int dev = class_name2dev(name); + + if (dev < 0 || dev > class_devno_max()) + return NULL; + return class_num2obd(dev); +} +EXPORT_SYMBOL(class_name2obd); + +int class_uuid2dev_nolock(struct obd_uuid *uuid) +{ + int i; + + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd && obd_uuid_equals(uuid, &obd->obd_uuid)) { + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + return i; + } + } + + return -1; +} + +int class_uuid2dev(struct obd_uuid *uuid) +{ + int i; + + read_lock(&obd_dev_lock); + i = class_uuid2dev_nolock(uuid); + read_unlock(&obd_dev_lock); + + return i; +} +EXPORT_SYMBOL(class_uuid2dev); + +struct obd_device *class_uuid2obd(struct obd_uuid *uuid) +{ + int dev = class_uuid2dev(uuid); + if (dev < 0) + return NULL; + return class_num2obd(dev); +} +EXPORT_SYMBOL(class_uuid2obd); + +/** + * Get obd device from ::obd_devs[] + * + * \param num [in] array index + * + * \retval NULL if ::obd_devs[\a num] does not contains an obd device + * otherwise return the obd device there. + */ +struct obd_device *class_num2obd(int num) +{ + struct obd_device *obd = NULL; + + if (num < class_devno_max()) { + obd = obd_devs[num]; + if (obd == NULL) + return NULL; + + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, + "%p obd_magic %08x != %08x\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(obd->obd_minor == num, + "%p obd_minor %0d != %0d\n", + obd, obd->obd_minor, num); + } + + return obd; +} +EXPORT_SYMBOL(class_num2obd); + +/** + * Find obd in obd_dev[] by name or uuid. + * + * Increment obd's refcount if found. 
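+ * The caller must drop that reference with class_decref() when done.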
+ * + * \param[in] str obd name or uuid + * + * \retval NULL if not found + * \retval target pointer to found obd_device + */ +struct obd_device *class_dev_by_str(const char *str) +{ + struct obd_device *target = NULL; + struct obd_uuid tgtuuid; + int rc; + + obd_str2uuid(&tgtuuid, str); + + read_lock(&obd_dev_lock); + rc = class_uuid2dev_nolock(&tgtuuid); + if (rc < 0) + rc = class_name2dev_nolock(str); + + if (rc >= 0) + target = class_num2obd(rc); + + if (target != NULL) + class_incref(target, "find", current); + read_unlock(&obd_dev_lock); + + RETURN(target); +} +EXPORT_SYMBOL(class_dev_by_str); + +/** + * Get obd devices count. Device in any + * state are counted + * \retval obd device count + */ +int get_devices_count(void) +{ + int index, max_index = class_devno_max(), dev_count = 0; + + read_lock(&obd_dev_lock); + for (index = 0; index <= max_index; index++) { + struct obd_device *obd = class_num2obd(index); + if (obd != NULL) + dev_count++; + } + read_unlock(&obd_dev_lock); + + return dev_count; +} +EXPORT_SYMBOL(get_devices_count); + +void class_obd_list(void) +{ + char *status; + int i; + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd == NULL) + continue; + if (obd->obd_stopping) + status = "ST"; + else if (obd->obd_set_up) + status = "UP"; + else if (obd->obd_attached) + status = "AT"; + else + status = "--"; + LCONSOLE(D_CONFIG, "%3d %s %s %s %s %d\n", + i, status, obd->obd_type->typ_name, + obd->obd_name, obd->obd_uuid.uuid, + atomic_read(&obd->obd_refcount)); + } + read_unlock(&obd_dev_lock); +} + +/* Search for a client OBD connected to tgt_uuid. If grp_uuid is + * specified, then only the client with that uuid is returned, + * otherwise any client connected to the tgt is returned. + */ +struct obd_device *class_find_client_obd(struct obd_uuid *tgt_uuid, + const char *type_name, + struct obd_uuid *grp_uuid) +{ + int i; + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd == NULL) + continue; + if ((strncmp(obd->obd_type->typ_name, type_name, + strlen(type_name)) == 0)) { + if (obd_uuid_equals(tgt_uuid, + &obd->u.cli.cl_target_uuid) && + ((grp_uuid)? obd_uuid_equals(grp_uuid, + &obd->obd_uuid) : 1)) { + read_unlock(&obd_dev_lock); + return obd; + } + } + } + read_unlock(&obd_dev_lock); + + return NULL; +} +EXPORT_SYMBOL(class_find_client_obd); + +/* Iterate the obd_device list looking devices have grp_uuid. Start + * searching at *next, and if a device is found, the next index to look + * at is saved in *next. If next is NULL, then the first matching device + * will always be returned. + */ +struct obd_device *class_devices_in_group(struct obd_uuid *grp_uuid, int *next) +{ + int i; + + if (next == NULL) + i = 0; + else if (*next >= 0 && *next < class_devno_max()) + i = *next; + else + return NULL; + + read_lock(&obd_dev_lock); + for (; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd == NULL) + continue; + if (obd_uuid_equals(grp_uuid, &obd->obd_uuid)) { + if (next != NULL) + *next = i+1; + read_unlock(&obd_dev_lock); + return obd; + } + } + read_unlock(&obd_dev_lock); + + return NULL; +} +EXPORT_SYMBOL(class_devices_in_group); + +/** + * to notify sptlrpc log for \a fsname has changed, let every relevant OBD + * adjust sptlrpc settings accordingly. 
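+ * Each matching device is sent the new KEY_SPTLRPC_CONF value through
+ * obd_set_info_async() while a temporary reference is held on it.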
+ */ +int class_notify_sptlrpc_conf(const char *fsname, int namelen) +{ + struct obd_device *obd; + const char *type; + int i, rc = 0, rc2; + + LASSERT(namelen > 0); + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + obd = class_num2obd(i); + + if (obd == NULL || obd->obd_set_up == 0 || obd->obd_stopping) + continue; + + /* only notify mdc, osc, osp, lwp, mdt, ost + * because only these have a -sptlrpc llog */ + type = obd->obd_type->typ_name; + if (strcmp(type, LUSTRE_MDC_NAME) != 0 && + strcmp(type, LUSTRE_OSC_NAME) != 0 && + strcmp(type, LUSTRE_OSP_NAME) != 0 && + strcmp(type, LUSTRE_LWP_NAME) != 0 && + strcmp(type, LUSTRE_MDT_NAME) != 0 && + strcmp(type, LUSTRE_OST_NAME) != 0) + continue; + + if (strncmp(obd->obd_name, fsname, namelen)) + continue; + + class_incref(obd, __FUNCTION__, obd); + read_unlock(&obd_dev_lock); + rc2 = obd_set_info_async(NULL, obd->obd_self_export, + sizeof(KEY_SPTLRPC_CONF), + KEY_SPTLRPC_CONF, 0, NULL, NULL); + rc = rc ? rc : rc2; + class_decref(obd, __FUNCTION__, obd); + read_lock(&obd_dev_lock); + } + read_unlock(&obd_dev_lock); + return rc; +} +EXPORT_SYMBOL(class_notify_sptlrpc_conf); + +void obd_cleanup_caches(void) +{ + ENTRY; + if (obd_device_cachep) { + kmem_cache_destroy(obd_device_cachep); + obd_device_cachep = NULL; + } + + EXIT; +} + +int obd_init_caches(void) +{ + int rc; + ENTRY; + + LASSERT(obd_device_cachep == NULL); + obd_device_cachep = kmem_cache_create_usercopy("ll_obd_dev_cache", + sizeof(struct obd_device), + 0, 0, 0, sizeof(struct obd_device), NULL); + if (!obd_device_cachep) + GOTO(out, rc = -ENOMEM); + + RETURN(0); +out: + obd_cleanup_caches(); + RETURN(rc); +} + +static const char export_handle_owner[] = "export"; + +/* map connection to client */ +struct obd_export *class_conn2export(struct lustre_handle *conn) +{ + struct obd_export *export; + ENTRY; + + if (!conn) { + CDEBUG(D_CACHE, "looking for null handle\n"); + RETURN(NULL); + } + + if (conn->cookie == -1) { /* this means assign a new connection */ + CDEBUG(D_CACHE, "want a new connection\n"); + RETURN(NULL); + } + + CDEBUG(D_INFO, "looking for export cookie %#llx\n", conn->cookie); + export = class_handle2object(conn->cookie, export_handle_owner); + RETURN(export); +} +EXPORT_SYMBOL(class_conn2export); + +struct obd_device *class_exp2obd(struct obd_export *exp) +{ + if (exp) + return exp->exp_obd; + return NULL; +} +EXPORT_SYMBOL(class_exp2obd); + +struct obd_import *class_exp2cliimp(struct obd_export *exp) +{ + struct obd_device *obd = exp->exp_obd; + if (obd == NULL) + return NULL; + return obd->u.cli.cl_import; +} +EXPORT_SYMBOL(class_exp2cliimp); + +/* Export management functions */ +static void class_export_destroy(struct obd_export *exp) +{ + struct obd_device *obd = exp->exp_obd; + ENTRY; + + LASSERT(refcount_read(&exp->exp_handle.h_ref) == 0); + LASSERT(obd != NULL); + + CDEBUG(D_IOCTL, "destroying export %p/%s for %s\n", exp, + exp->exp_client_uuid.uuid, obd->obd_name); + + /* "Local" exports (lctl, LOV->{mdc,osc}) have no connection. 
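+	 * Their exp_connection is NULL, which ptlrpc_connection_put()
+	 * handles gracefully.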
+	 */
+	ptlrpc_connection_put(exp->exp_connection);
+
+	LASSERT(list_empty(&exp->exp_outstanding_replies));
+	LASSERT(list_empty(&exp->exp_uncommitted_replies));
+	LASSERT(list_empty(&exp->exp_req_replay_queue));
+	LASSERT(list_empty(&exp->exp_hp_rpcs));
+	obd_destroy_export(exp);
+	/* self export doesn't hold a reference to an obd, although it
+	 * exists until the obd itself is freed */
+	if (exp != obd->obd_self_export)
+		class_decref(obd, "export", exp);
+
+	OBD_FREE_PRE(exp, sizeof(*exp), "rcu");
+	kfree_rcu(exp, exp_handle.h_rcu);
+	EXIT;
+}
+
+struct obd_export *class_export_get(struct obd_export *exp)
+{
+	refcount_inc(&exp->exp_handle.h_ref);
+	CDEBUG(D_INFO, "GET export %p refcount=%d\n", exp,
+	       refcount_read(&exp->exp_handle.h_ref));
+	return exp;
+}
+EXPORT_SYMBOL(class_export_get);
+
+void class_export_put(struct obd_export *exp)
+{
+	LASSERT(exp != NULL);
+	LASSERT(refcount_read(&exp->exp_handle.h_ref) > 0);
+	LASSERT(refcount_read(&exp->exp_handle.h_ref) < LI_POISON);
+	CDEBUG(D_INFO, "PUTting export %p : new refcount %d\n", exp,
+	       refcount_read(&exp->exp_handle.h_ref) - 1);
+
+	if (refcount_dec_and_test(&exp->exp_handle.h_ref)) {
+		struct obd_device *obd = exp->exp_obd;
+
+		CDEBUG(D_IOCTL, "final put %p/%s\n",
+		       exp, exp->exp_client_uuid.uuid);
+
+		/* release the nid stat reference */
+		lprocfs_exp_cleanup(exp);
+
+		if (exp == obd->obd_self_export) {
+			/* self export should be destroyed without
+			 * the zombie thread as it doesn't hold a
+			 * reference to the obd and doesn't hold any
+			 * resources */
+			class_export_destroy(exp);
+			/* self export is destroyed, no class
+			 * references exist and it is safe to free
+			 * the obd */
+			class_free_dev(obd);
+		} else {
+			LASSERT(!list_empty(&exp->exp_obd_chain));
+			obd_zombie_export_add(exp);
+		}
+	}
+}
+EXPORT_SYMBOL(class_export_put);
+
+static void obd_zombie_exp_cull(struct work_struct *ws)
+{
+	struct obd_export *export;
+
+	export = container_of(ws, struct obd_export, exp_zombie_work);
+	class_export_destroy(export);
+}
+
+/* Creates a new export, adds it to the hash table, and returns a
+ * pointer to it. The refcount is 2: one for the hash reference, and
+ * one for the pointer returned by this function.
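+ * On failure the export is freed and an ERR_PTR() is returned instead.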
*/ +struct obd_export *__class_new_export(struct obd_device *obd, + struct obd_uuid *cluuid, bool is_self) +{ + struct obd_export *export; + int rc = 0; + ENTRY; + + OBD_ALLOC_PTR(export); + if (!export) + return ERR_PTR(-ENOMEM); + + export->exp_conn_cnt = 0; + export->exp_lock_hash = NULL; + export->exp_flock_hash = NULL; + /* 2 = class_handle_hash + last */ + refcount_set(&export->exp_handle.h_ref, 2); + atomic_set(&export->exp_rpc_count, 0); + atomic_set(&export->exp_cb_count, 0); + atomic_set(&export->exp_locks_count, 0); +#if LUSTRE_TRACKS_LOCK_EXP_REFS + INIT_LIST_HEAD(&export->exp_locks_list); + spin_lock_init(&export->exp_locks_list_guard); +#endif + atomic_set(&export->exp_replay_count, 0); + export->exp_obd = obd; + INIT_LIST_HEAD(&export->exp_outstanding_replies); + spin_lock_init(&export->exp_uncommitted_replies_lock); + INIT_LIST_HEAD(&export->exp_uncommitted_replies); + INIT_LIST_HEAD(&export->exp_req_replay_queue); + INIT_HLIST_NODE(&export->exp_handle.h_link); + INIT_LIST_HEAD(&export->exp_hp_rpcs); + INIT_LIST_HEAD(&export->exp_reg_rpcs); + class_handle_hash(&export->exp_handle, export_handle_owner); + export->exp_last_request_time = ktime_get_real_seconds(); + spin_lock_init(&export->exp_lock); + spin_lock_init(&export->exp_rpc_lock); + INIT_HLIST_NODE(&export->exp_gen_hash); + spin_lock_init(&export->exp_bl_list_lock); + INIT_LIST_HEAD(&export->exp_bl_list); + INIT_LIST_HEAD(&export->exp_stale_list); + INIT_WORK(&export->exp_zombie_work, obd_zombie_exp_cull); + + export->exp_sp_peer = LUSTRE_SP_ANY; + export->exp_flvr.sf_rpc = SPTLRPC_FLVR_INVALID; + export->exp_client_uuid = *cluuid; + obd_init_export(export); + + at_init(&export->exp_bl_lock_at, obd_timeout, 0); + + spin_lock(&obd->obd_dev_lock); + if (!obd_uuid_equals(cluuid, &obd->obd_uuid)) { + /* shouldn't happen, but might race */ + if (obd->obd_stopping) + GOTO(exit_unlock, rc = -ENODEV); + + rc = obd_uuid_add(obd, export); + if (rc != 0) { + LCONSOLE_WARN("%s: denying duplicate export for %s: rc = %d\n", + obd->obd_name, cluuid->uuid, rc); + GOTO(exit_unlock, rc = -EALREADY); + } + } + + if (!is_self) { + class_incref(obd, "export", export); + list_add_tail(&export->exp_obd_chain_timed, + &obd->obd_exports_timed); + list_add(&export->exp_obd_chain, &obd->obd_exports); + obd->obd_num_exports++; + } else { + INIT_LIST_HEAD(&export->exp_obd_chain_timed); + INIT_LIST_HEAD(&export->exp_obd_chain); + } + spin_unlock(&obd->obd_dev_lock); + RETURN(export); + +exit_unlock: + spin_unlock(&obd->obd_dev_lock); + class_handle_unhash(&export->exp_handle); + obd_destroy_export(export); + OBD_FREE_PTR(export); + return ERR_PTR(rc); +} + +struct obd_export *class_new_export(struct obd_device *obd, + struct obd_uuid *uuid) +{ + return __class_new_export(obd, uuid, false); +} +EXPORT_SYMBOL(class_new_export); + +struct obd_export *class_new_export_self(struct obd_device *obd, + struct obd_uuid *uuid) +{ + return __class_new_export(obd, uuid, true); +} + +void class_unlink_export(struct obd_export *exp) +{ + class_handle_unhash(&exp->exp_handle); + + if (exp->exp_obd->obd_self_export == exp) { + class_export_put(exp); + return; + } + + spin_lock(&exp->exp_obd->obd_dev_lock); + /* delete an uuid-export hashitem from hashtables */ + if (exp != exp->exp_obd->obd_self_export) + obd_uuid_del(exp->exp_obd, exp); + +#ifdef HAVE_SERVER_SUPPORT + if (!hlist_unhashed(&exp->exp_gen_hash)) { + struct tg_export_data *ted = &exp->exp_target_data; + struct cfs_hash *hash; + + /* Because obd_gen_hash will not be released until + * 
class_cleanup(), so hash should never be NULL here */ + hash = cfs_hash_getref(exp->exp_obd->obd_gen_hash); + LASSERT(hash != NULL); + cfs_hash_del(hash, &ted->ted_lcd->lcd_generation, + &exp->exp_gen_hash); + cfs_hash_putref(hash); + } +#endif /* HAVE_SERVER_SUPPORT */ + + list_move(&exp->exp_obd_chain, &exp->exp_obd->obd_unlinked_exports); + list_del_init(&exp->exp_obd_chain_timed); + exp->exp_obd->obd_num_exports--; + spin_unlock(&exp->exp_obd->obd_dev_lock); + atomic_inc(&obd_stale_export_num); + + /* A reference is kept by obd_stale_exports list */ + obd_stale_export_put(exp); +} +EXPORT_SYMBOL(class_unlink_export); + +/* Import management functions */ +static void obd_zombie_import_free(struct obd_import *imp) +{ + struct obd_import_conn *imp_conn; + + ENTRY; + CDEBUG(D_IOCTL, "destroying import %p for %s\n", imp, + imp->imp_obd->obd_name); + + LASSERT(refcount_read(&imp->imp_refcount) == 0); + + ptlrpc_connection_put(imp->imp_connection); + + while ((imp_conn = list_first_entry_or_null(&imp->imp_conn_list, + struct obd_import_conn, + oic_item)) != NULL) { + list_del_init(&imp_conn->oic_item); + ptlrpc_connection_put(imp_conn->oic_conn); + OBD_FREE(imp_conn, sizeof(*imp_conn)); + } + + LASSERT(imp->imp_sec == NULL); + LASSERTF(atomic_read(&imp->imp_reqs) == 0, "%s: imp_reqs = %d\n", + imp->imp_obd->obd_name, atomic_read(&imp->imp_reqs)); + class_decref(imp->imp_obd, "import", imp); + OBD_FREE_PTR(imp); + EXIT; +} + +struct obd_import *class_import_get(struct obd_import *import) +{ + refcount_inc(&import->imp_refcount); + CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", import, + refcount_read(&import->imp_refcount), + import->imp_obd->obd_name); + return import; +} +EXPORT_SYMBOL(class_import_get); + +void class_import_put(struct obd_import *imp) +{ + ENTRY; + + LASSERT(refcount_read(&imp->imp_refcount) > 0); + + CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", imp, + refcount_read(&imp->imp_refcount) - 1, + imp->imp_obd->obd_name); + + if (refcount_dec_and_test(&imp->imp_refcount)) { + CDEBUG(D_INFO, "final put import %p\n", imp); + obd_zombie_import_add(imp); + } + + EXIT; +} +EXPORT_SYMBOL(class_import_put); + +static void init_imp_at(struct imp_at *at) { + int i; + at_init(&at->iat_net_latency, 0, 0); + for (i = 0; i < IMP_AT_MAX_PORTALS; i++) { + /* max service estimates are tracked on the server side, so + don't use the AT history here, just use the last reported + val. 
(But keep hist for proc histogram, worst_ever) */ + at_init(&at->iat_service_estimate[i], INITIAL_CONNECT_TIMEOUT, + AT_FLG_NOHIST); + } +} + +static void obd_zombie_imp_cull(struct work_struct *ws) +{ + struct obd_import *import; + + import = container_of(ws, struct obd_import, imp_zombie_work); + obd_zombie_import_free(import); +} + +struct obd_import *class_new_import(struct obd_device *obd) +{ + struct obd_import *imp; + struct pid_namespace *curr_pid_ns = ll_task_pid_ns(current); + + OBD_ALLOC(imp, sizeof(*imp)); + if (imp == NULL) + return NULL; + + INIT_LIST_HEAD(&imp->imp_pinger_chain); + INIT_LIST_HEAD(&imp->imp_replay_list); + INIT_LIST_HEAD(&imp->imp_sending_list); + INIT_LIST_HEAD(&imp->imp_delayed_list); + INIT_LIST_HEAD(&imp->imp_committed_list); + INIT_LIST_HEAD(&imp->imp_unreplied_list); + imp->imp_known_replied_xid = 0; + imp->imp_replay_cursor = &imp->imp_committed_list; + spin_lock_init(&imp->imp_lock); + imp->imp_last_success_conn = 0; + imp->imp_state = LUSTRE_IMP_NEW; + imp->imp_obd = class_incref(obd, "import", imp); + rwlock_init(&imp->imp_sec_lock); + init_waitqueue_head(&imp->imp_recovery_waitq); + INIT_WORK(&imp->imp_zombie_work, obd_zombie_imp_cull); + + if (curr_pid_ns && curr_pid_ns->child_reaper) + imp->imp_sec_refpid = curr_pid_ns->child_reaper->pid; + else + imp->imp_sec_refpid = 1; + + refcount_set(&imp->imp_refcount, 2); + atomic_set(&imp->imp_unregistering, 0); + atomic_set(&imp->imp_reqs, 0); + atomic_set(&imp->imp_inflight, 0); + atomic_set(&imp->imp_replay_inflight, 0); + init_waitqueue_head(&imp->imp_replay_waitq); + atomic_set(&imp->imp_inval_count, 0); + atomic_set(&imp->imp_waiting, 0); + INIT_LIST_HEAD(&imp->imp_conn_list); + init_imp_at(&imp->imp_at); + + /* the default magic is V2, will be used in connect RPC, and + * then adjusted according to the flags in request/reply. 
*/ + imp->imp_msg_magic = LUSTRE_MSG_MAGIC_V2; + + return imp; +} +EXPORT_SYMBOL(class_new_import); + +void class_destroy_import(struct obd_import *import) +{ + LASSERT(import != NULL); + LASSERT(import != LP_POISON); + + spin_lock(&import->imp_lock); + import->imp_generation++; + spin_unlock(&import->imp_lock); + class_import_put(import); +} +EXPORT_SYMBOL(class_destroy_import); + +#if LUSTRE_TRACKS_LOCK_EXP_REFS + +void __class_export_add_lock_ref(struct obd_export *exp, struct ldlm_lock *lock) +{ + spin_lock(&exp->exp_locks_list_guard); + + LASSERT(lock->l_exp_refs_nr >= 0); + + if (lock->l_exp_refs_target != NULL && + lock->l_exp_refs_target != exp) { + LCONSOLE_WARN("setting export %p for lock %p which already has export %p\n", + exp, lock, lock->l_exp_refs_target); + } + if ((lock->l_exp_refs_nr ++) == 0) { + list_add(&lock->l_exp_refs_link, &exp->exp_locks_list); + lock->l_exp_refs_target = exp; + } + CDEBUG(D_INFO, "lock = %p, export = %p, refs = %u\n", + lock, exp, lock->l_exp_refs_nr); + spin_unlock(&exp->exp_locks_list_guard); +} +EXPORT_SYMBOL(__class_export_add_lock_ref); + +void __class_export_del_lock_ref(struct obd_export *exp, struct ldlm_lock *lock) +{ + spin_lock(&exp->exp_locks_list_guard); + LASSERT(lock->l_exp_refs_nr > 0); + if (lock->l_exp_refs_target != exp) { + LCONSOLE_WARN("lock %p, " + "mismatching export pointers: %p, %p\n", + lock, lock->l_exp_refs_target, exp); + } + if (-- lock->l_exp_refs_nr == 0) { + list_del_init(&lock->l_exp_refs_link); + lock->l_exp_refs_target = NULL; + } + CDEBUG(D_INFO, "lock = %p, export = %p, refs = %u\n", + lock, exp, lock->l_exp_refs_nr); + spin_unlock(&exp->exp_locks_list_guard); +} +EXPORT_SYMBOL(__class_export_del_lock_ref); +#endif + +/* A connection defines an export context in which preallocation can + be managed. This releases the export pointer reference, and returns + the export handle, so the export refcount is 1 when this function + returns. 
*/ +int class_connect(struct lustre_handle *conn, struct obd_device *obd, + struct obd_uuid *cluuid) +{ + struct obd_export *export; + LASSERT(conn != NULL); + LASSERT(obd != NULL); + LASSERT(cluuid != NULL); + ENTRY; + + export = class_new_export(obd, cluuid); + if (IS_ERR(export)) + RETURN(PTR_ERR(export)); + + conn->cookie = export->exp_handle.h_cookie; + class_export_put(export); + + CDEBUG(D_IOCTL, "connect: client %s, cookie %#llx\n", + cluuid->uuid, conn->cookie); + RETURN(0); +} +EXPORT_SYMBOL(class_connect); + +/* if export is involved in recovery then clean up related things */ +static void class_export_recovery_cleanup(struct obd_export *exp) +{ + struct obd_device *obd = exp->exp_obd; + + spin_lock(&obd->obd_recovery_task_lock); + if (obd->obd_recovering) { + if (exp->exp_in_recovery) { + spin_lock(&exp->exp_lock); + exp->exp_in_recovery = 0; + spin_unlock(&exp->exp_lock); + LASSERT_ATOMIC_POS(&obd->obd_connected_clients); + atomic_dec(&obd->obd_connected_clients); + } + + /* if called during recovery then should update + * obd_stale_clients counter, + * lightweight exports are not counted */ + if ((exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) == 0) + exp->exp_obd->obd_stale_clients++; + } + spin_unlock(&obd->obd_recovery_task_lock); + + spin_lock(&exp->exp_lock); + /** Cleanup req replay fields */ + if (exp->exp_req_replay_needed) { + exp->exp_req_replay_needed = 0; + + LASSERT(atomic_read(&obd->obd_req_replay_clients)); + atomic_dec(&obd->obd_req_replay_clients); + } + + /** Cleanup lock replay data */ + if (exp->exp_lock_replay_needed) { + exp->exp_lock_replay_needed = 0; + + LASSERT(atomic_read(&obd->obd_lock_replay_clients)); + atomic_dec(&obd->obd_lock_replay_clients); + } + spin_unlock(&exp->exp_lock); +} + +/* This function removes 1-3 references from the export: + * 1 - for export pointer passed + * and if disconnect really need + * 2 - removing from hash + * 3 - in client_unlink_export + * The export pointer passed to this function can destroyed */ +int class_disconnect(struct obd_export *export) +{ + int already_disconnected; + ENTRY; + + if (export == NULL) { + CWARN("attempting to free NULL export %p\n", export); + RETURN(-EINVAL); + } + + spin_lock(&export->exp_lock); + already_disconnected = export->exp_disconnected; + export->exp_disconnected = 1; +#ifdef HAVE_SERVER_SUPPORT + /* We hold references of export for uuid hash + * and nid_hash and export link at least. So + * it is safe to call rh*table_remove_fast in + * there. + */ + obd_nid_del(export->exp_obd, export); +#endif /* HAVE_SERVER_SUPPORT */ + spin_unlock(&export->exp_lock); + + /* class_cleanup(), abort_recovery(), and class_fail_export() + * all end up in here, and if any of them race we shouldn't + * call extra class_export_puts(). 
*/ + if (already_disconnected) + GOTO(no_disconn, already_disconnected); + + CDEBUG(D_IOCTL, "disconnect: cookie %#llx\n", + export->exp_handle.h_cookie); + + class_export_recovery_cleanup(export); + class_unlink_export(export); +no_disconn: + class_export_put(export); + RETURN(0); +} +EXPORT_SYMBOL(class_disconnect); + +/* Return non-zero for a fully connected export */ +int class_connected_export(struct obd_export *exp) +{ + int connected = 0; + + if (exp) { + spin_lock(&exp->exp_lock); + connected = (exp->exp_conn_cnt > 0) && !exp->exp_failed; + spin_unlock(&exp->exp_lock); + } + return connected; +} +EXPORT_SYMBOL(class_connected_export); + +static void class_disconnect_export_list(struct list_head *list, + enum obd_option flags) +{ + int rc; + struct obd_export *exp; + ENTRY; + + /* It's possible that an export may disconnect itself, but + * nothing else will be added to this list. + */ + while ((exp = list_first_entry_or_null(list, struct obd_export, + exp_obd_chain)) != NULL) { + /* need for safe call CDEBUG after obd_disconnect */ + class_export_get(exp); + + spin_lock(&exp->exp_lock); + exp->exp_flags = flags; + spin_unlock(&exp->exp_lock); + + if (obd_uuid_equals(&exp->exp_client_uuid, + &exp->exp_obd->obd_uuid)) { + CDEBUG(D_HA, + "exp %p export uuid == obd uuid, don't discon\n", + exp); + /* Need to delete this now so we don't end up pointing + * to work_list later when this export is cleaned up. */ + list_del_init(&exp->exp_obd_chain); + class_export_put(exp); + continue; + } + + class_export_get(exp); + CDEBUG(D_HA, "%s: disconnecting export at %s (%p), " + "last request at %lld\n", + exp->exp_obd->obd_name, obd_export_nid2str(exp), + exp, exp->exp_last_request_time); + /* release one export reference anyway */ + rc = obd_disconnect(exp); + + CDEBUG(D_HA, "disconnected export at %s (%p): rc %d\n", + obd_export_nid2str(exp), exp, rc); + class_export_put(exp); + } + EXIT; +} + +void class_disconnect_exports(struct obd_device *obd) +{ + LIST_HEAD(work_list); + ENTRY; + + /* Move all of the exports from obd_exports to a work list, en masse. */ + spin_lock(&obd->obd_dev_lock); + list_splice_init(&obd->obd_exports, &work_list); + list_splice_init(&obd->obd_delayed_exports, &work_list); + spin_unlock(&obd->obd_dev_lock); + + if (!list_empty(&work_list)) { + CDEBUG(D_HA, "OBD device %d (%p) has exports, " + "disconnecting them\n", obd->obd_minor, obd); + class_disconnect_export_list(&work_list, + exp_flags_from_obd(obd)); + } else + CDEBUG(D_HA, "OBD device %d (%p) has no exports\n", + obd->obd_minor, obd); + EXIT; +} +EXPORT_SYMBOL(class_disconnect_exports); + +/* Remove exports that have not completed recovery. + */ +void class_disconnect_stale_exports(struct obd_device *obd, + int (*test_export)(struct obd_export *)) +{ + LIST_HEAD(work_list); + struct obd_export *exp, *n; + int evicted = 0; + ENTRY; + + spin_lock(&obd->obd_dev_lock); + list_for_each_entry_safe(exp, n, &obd->obd_exports, + exp_obd_chain) { + /* don't count self-export as client */ + if (obd_uuid_equals(&exp->exp_client_uuid, + &exp->exp_obd->obd_uuid)) + continue; + + /* don't evict clients which have no slot in last_rcvd + * (e.g. 
lightweight connection) */ + if (exp->exp_target_data.ted_lr_idx == -1) + continue; + + spin_lock(&exp->exp_lock); + if (exp->exp_failed || test_export(exp)) { + spin_unlock(&exp->exp_lock); + continue; + } + exp->exp_failed = 1; + spin_unlock(&exp->exp_lock); + + list_move(&exp->exp_obd_chain, &work_list); + evicted++; + CDEBUG(D_HA, "%s: disconnect stale client %s@%s\n", + obd->obd_name, exp->exp_client_uuid.uuid, + obd_export_nid2str(exp)); + print_export_data(exp, "EVICTING", 0, D_HA); + } + spin_unlock(&obd->obd_dev_lock); + + if (evicted) + LCONSOLE_WARN("%s: disconnecting %d stale clients\n", + obd->obd_name, evicted); + + class_disconnect_export_list(&work_list, exp_flags_from_obd(obd) | + OBD_OPT_ABORT_RECOV); + EXIT; +} +EXPORT_SYMBOL(class_disconnect_stale_exports); + +void class_fail_export(struct obd_export *exp) +{ + int rc, already_failed; + + spin_lock(&exp->exp_lock); + already_failed = exp->exp_failed; + exp->exp_failed = 1; + spin_unlock(&exp->exp_lock); + + if (already_failed) { + CDEBUG(D_HA, "disconnecting dead export %p/%s; skipping\n", + exp, exp->exp_client_uuid.uuid); + return; + } + + CDEBUG(D_HA, "disconnecting export %p/%s\n", + exp, exp->exp_client_uuid.uuid); + + if (obd_dump_on_timeout) + libcfs_debug_dumplog(); + + /* need for safe call CDEBUG after obd_disconnect */ + class_export_get(exp); + + /* Most callers into obd_disconnect are removing their own reference + * (request, for example) in addition to the one from the hash table. + * We don't have such a reference here, so make one. */ + class_export_get(exp); + rc = obd_disconnect(exp); + if (rc) + CERROR("disconnecting export %p failed: %d\n", exp, rc); + else + CDEBUG(D_HA, "disconnected export %p/%s\n", + exp, exp->exp_client_uuid.uuid); + class_export_put(exp); +} +EXPORT_SYMBOL(class_fail_export); + +#ifdef HAVE_SERVER_SUPPORT + +static int take_first(struct obd_export *exp, void *data) +{ + struct obd_export **expp = data; + + if (*expp) + /* already have one */ + return 0; + if (exp->exp_failed) + /* Don't want this one */ + return 0; + if (!refcount_inc_not_zero(&exp->exp_handle.h_ref)) + /* Cannot get a ref on this one */ + return 0; + *expp = exp; + return 1; +} + +int obd_export_evict_by_nid(struct obd_device *obd, const char *nid) +{ + struct lnet_nid nid_key; + struct obd_export *doomed_exp; + int exports_evicted = 0; + + libcfs_strnid(&nid_key, nid); + + spin_lock(&obd->obd_dev_lock); + /* umount has run already, so evict thread should leave + * its task to umount thread now */ + if (obd->obd_stopping) { + spin_unlock(&obd->obd_dev_lock); + return exports_evicted; + } + spin_unlock(&obd->obd_dev_lock); + + doomed_exp = NULL; + while (obd_nid_export_for_each(obd, &nid_key, + take_first, &doomed_exp) > 0) { + + LASSERTF(doomed_exp != obd->obd_self_export, + "self-export is hashed by NID?\n"); + + LCONSOLE_WARN("%s: evicting %s (at %s) by administrative request\n", + obd->obd_name, + obd_uuid2str(&doomed_exp->exp_client_uuid), + obd_export_nid2str(doomed_exp)); + + class_fail_export(doomed_exp); + class_export_put(doomed_exp); + exports_evicted++; + doomed_exp = NULL; + } + + if (!exports_evicted) + CDEBUG(D_HA, + "%s: can't disconnect NID '%s': no exports found\n", + obd->obd_name, nid); + return exports_evicted; +} +EXPORT_SYMBOL(obd_export_evict_by_nid); + +int obd_export_evict_by_uuid(struct obd_device *obd, const char *uuid) +{ + struct obd_export *doomed_exp = NULL; + struct obd_uuid doomed_uuid; + int exports_evicted = 0; + + spin_lock(&obd->obd_dev_lock); + if (obd->obd_stopping) { + 
		spin_unlock(&obd->obd_dev_lock);
+		return exports_evicted;
+	}
+	spin_unlock(&obd->obd_dev_lock);
+
+	obd_str2uuid(&doomed_uuid, uuid);
+	if (obd_uuid_equals(&doomed_uuid, &obd->obd_uuid)) {
+		CERROR("%s: can't evict myself\n", obd->obd_name);
+		return exports_evicted;
+	}
+
+	doomed_exp = obd_uuid_lookup(obd, &doomed_uuid);
+	if (doomed_exp == NULL) {
+		CERROR("%s: can't disconnect %s: no exports found\n",
+		       obd->obd_name, uuid);
+	} else {
+		CWARN("%s: evicting %s at administrative request\n",
+		      obd->obd_name, doomed_exp->exp_client_uuid.uuid);
+		class_fail_export(doomed_exp);
+		class_export_put(doomed_exp);
+		obd_uuid_del(obd, doomed_exp);
+		exports_evicted++;
+	}
+
+	return exports_evicted;
+}
+#endif /* HAVE_SERVER_SUPPORT */
+
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+void (*class_export_dump_hook)(struct obd_export*) = NULL;
+EXPORT_SYMBOL(class_export_dump_hook);
+#endif
+
+static void print_export_data(struct obd_export *exp, const char *status,
+			      int locks, int debug_level)
+{
+	struct ptlrpc_reply_state *rs;
+	struct ptlrpc_reply_state *first_reply = NULL;
+	int nreplies = 0;
+
+	spin_lock(&exp->exp_lock);
+	list_for_each_entry(rs, &exp->exp_outstanding_replies,
+			    rs_exp_list) {
+		if (nreplies == 0)
+			first_reply = rs;
+		nreplies++;
+	}
+	spin_unlock(&exp->exp_lock);
+
+	CDEBUG(debug_level, "%s: %s %p %s %s %d (%d %d %d) %d %d %d %d: "
+	       "%p %s %llu stale:%d\n",
+	       exp->exp_obd->obd_name, status, exp, exp->exp_client_uuid.uuid,
+	       obd_export_nid2str(exp),
+	       refcount_read(&exp->exp_handle.h_ref),
+	       atomic_read(&exp->exp_rpc_count),
+	       atomic_read(&exp->exp_cb_count),
+	       atomic_read(&exp->exp_locks_count),
+	       exp->exp_disconnected, exp->exp_delayed, exp->exp_failed,
+	       nreplies, first_reply, nreplies > 3 ? "..." : "",
+	       exp->exp_last_committed, !list_empty(&exp->exp_stale_list));
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+	if (locks && class_export_dump_hook != NULL)
+		class_export_dump_hook(exp);
+#endif
+}
+
+void dump_exports(struct obd_device *obd, int locks, int debug_level)
+{
+	struct obd_export *exp;
+
+	spin_lock(&obd->obd_dev_lock);
+	list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain)
+		print_export_data(exp, "ACTIVE", locks, debug_level);
+	list_for_each_entry(exp, &obd->obd_unlinked_exports, exp_obd_chain)
+		print_export_data(exp, "UNLINKED", locks, debug_level);
+	list_for_each_entry(exp, &obd->obd_delayed_exports, exp_obd_chain)
+		print_export_data(exp, "DELAYED", locks, debug_level);
+	spin_unlock(&obd->obd_dev_lock);
+}
+
+void obd_exports_barrier(struct obd_device *obd)
+{
+	int waited = 2;
+	LASSERT(list_empty(&obd->obd_exports));
+	spin_lock(&obd->obd_dev_lock);
+	while (!list_empty(&obd->obd_unlinked_exports)) {
+		spin_unlock(&obd->obd_dev_lock);
+		schedule_timeout_uninterruptible(cfs_time_seconds(waited));
+		if (waited > 5 && is_power_of_2(waited)) {
+			LCONSOLE_WARN("%s is waiting for obd_unlinked_exports "
+				      "more than %d seconds. "
+				      "The obd refcount = %d. Is it stuck?\n",
+				      obd->obd_name, waited,
+				      atomic_read(&obd->obd_refcount));
+			dump_exports(obd, 1, D_CONSOLE | D_WARNING);
+		}
+		waited *= 2;
+		spin_lock(&obd->obd_dev_lock);
+	}
+	spin_unlock(&obd->obd_dev_lock);
+}
+EXPORT_SYMBOL(obd_exports_barrier);
+
+/**
+ * Add export to the obd_zombie thread and notify it.
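+ * The export is unlinked from its obd list and queued on the zombie
+ * workqueue for destruction.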
+ */
+static void obd_zombie_export_add(struct obd_export *exp)
+{
+	atomic_dec(&obd_stale_export_num);
+	spin_lock(&exp->exp_obd->obd_dev_lock);
+	LASSERT(!list_empty(&exp->exp_obd_chain));
+	list_del_init(&exp->exp_obd_chain);
+	spin_unlock(&exp->exp_obd->obd_dev_lock);
+
+	queue_work(zombie_wq, &exp->exp_zombie_work);
+}
+
+/**
+ * Add import to the obd_zombie thread and notify it.
+ */
+static void obd_zombie_import_add(struct obd_import *imp)
+{
+	LASSERT(imp->imp_sec == NULL);
+
+	queue_work(zombie_wq, &imp->imp_zombie_work);
+}
+
+/**
+ * Wait until the obd_zombie import/export queues become empty.
+ */
+void obd_zombie_barrier(void)
+{
+	flush_workqueue(zombie_wq);
+}
+EXPORT_SYMBOL(obd_zombie_barrier);
+
+struct obd_export *obd_stale_export_get(void)
+{
+	struct obd_export *exp = NULL;
+	ENTRY;
+
+	spin_lock(&obd_stale_export_lock);
+	if (!list_empty(&obd_stale_exports)) {
+		exp = list_first_entry(&obd_stale_exports,
+				       struct obd_export, exp_stale_list);
+		list_del_init(&exp->exp_stale_list);
+	}
+	spin_unlock(&obd_stale_export_lock);
+
+	if (exp) {
+		CDEBUG(D_DLMTRACE, "Get export %p: total %d\n", exp,
+		       atomic_read(&obd_stale_export_num));
+	}
+	RETURN(exp);
+}
+EXPORT_SYMBOL(obd_stale_export_get);
+
+void obd_stale_export_put(struct obd_export *exp)
+{
+	ENTRY;
+
+	LASSERT(list_empty(&exp->exp_stale_list));
+	if (exp->exp_lock_hash &&
+	    atomic_read(&exp->exp_lock_hash->hs_count)) {
+		CDEBUG(D_DLMTRACE, "Put export %p: total %d\n", exp,
+		       atomic_read(&obd_stale_export_num));
+
+		spin_lock_bh(&exp->exp_bl_list_lock);
+		spin_lock(&obd_stale_export_lock);
+		/* Add to the tail if there are no blocked locks,
+		 * to the head otherwise. */
+		if (list_empty(&exp->exp_bl_list))
+			list_add_tail(&exp->exp_stale_list,
+				      &obd_stale_exports);
+		else
+			list_add(&exp->exp_stale_list,
+				 &obd_stale_exports);
+
+		spin_unlock(&obd_stale_export_lock);
+		spin_unlock_bh(&exp->exp_bl_list_lock);
+	} else {
+		class_export_put(exp);
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(obd_stale_export_put);
+
+/**
+ * Adjust the position of the export in the stale list,
+ * i.e. move it to the head of the list if needed.
+ **/
+void obd_stale_export_adjust(struct obd_export *exp)
+{
+	LASSERT(exp != NULL);
+	spin_lock_bh(&exp->exp_bl_list_lock);
+	spin_lock(&obd_stale_export_lock);
+
+	if (!list_empty(&exp->exp_stale_list) &&
+	    !list_empty(&exp->exp_bl_list))
+		list_move(&exp->exp_stale_list, &obd_stale_exports);
+
+	spin_unlock(&obd_stale_export_lock);
+	spin_unlock_bh(&exp->exp_bl_list_lock);
+}
+EXPORT_SYMBOL(obd_stale_export_adjust);
+
+/**
+ * Start the zombie import/export destruction workqueue.
+ */
+int obd_zombie_impexp_init(void)
+{
+	zombie_wq = cfs_cpt_bind_workqueue("obd_zombid", cfs_cpt_tab,
+					   0, CFS_CPT_ANY,
+					   cfs_cpt_number(cfs_cpt_tab));
+
+	return IS_ERR(zombie_wq) ? PTR_ERR(zombie_wq) : 0;
+}
+
+/**
+ * Stop the zombie import/export destruction workqueue.
+ */
+void obd_zombie_impexp_stop(void)
+{
+	destroy_workqueue(zombie_wq);
+	LASSERT(list_empty(&obd_stale_exports));
+}
+
+/***** Kernel-userspace comm helpers *******/
+
+/* Get length of entire message, including header */
+int kuc_len(int payload_len)
+{
+	return sizeof(struct kuc_hdr) + payload_len;
+}
+EXPORT_SYMBOL(kuc_len);
+
+/* Get a pointer to the kuc header, given a pointer to the payload
+ * @param p Pointer to payload area
+ * @returns Pointer to kuc header
+ */
+struct kuc_hdr *kuc_ptr(void *p)
+{
+	struct kuc_hdr *lh = ((struct kuc_hdr *)p) - 1;
+
+	LASSERT(lh->kuc_magic == KUC_MAGIC);
+	return lh;
+}
+EXPORT_SYMBOL(kuc_ptr);
+
+/* Alloc space for a message, and fill in the header
+ * @return Pointer to payload area
+ */
+void *kuc_alloc(int payload_len, int transport, int type)
+{
+	struct kuc_hdr *lh;
+	int len = kuc_len(payload_len);
+
+	OBD_ALLOC(lh, len);
+	if (lh == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	lh->kuc_magic = KUC_MAGIC;
+	lh->kuc_transport = transport;
+	lh->kuc_msgtype = type;
+	lh->kuc_msglen = len;
+
+	return (void *)(lh + 1);
+}
+EXPORT_SYMBOL(kuc_alloc);
+
+/* Takes a pointer to the payload area */
+void kuc_free(void *p, int payload_len)
+{
+	struct kuc_hdr *lh = kuc_ptr(p);
+
+	OBD_FREE(lh, kuc_len(payload_len));
+}
+EXPORT_SYMBOL(kuc_free);
+
+struct obd_request_slot_waiter {
+	struct list_head	orsw_entry;
+	wait_queue_head_t	orsw_waitq;
+	bool			orsw_signaled;
+};
+
+static bool obd_request_slot_avail(struct client_obd *cli,
+				   struct obd_request_slot_waiter *orsw)
+{
+	bool avail;
+
+	spin_lock(&cli->cl_loi_list_lock);
+	avail = !!list_empty(&orsw->orsw_entry);
+	spin_unlock(&cli->cl_loi_list_lock);
+
+	return avail;
+}
+
+/*
+ * For network flow control, the RPC sponsor needs to acquire a credit
+ * before sending the RPC. The credit count for a connection is defined
+ * by "cl_max_rpcs_in_flight". If all the credits are occupied, then
+ * subsequent RPC sponsors need to wait until others have released their
+ * credits, or until the administrator increases "cl_max_rpcs_in_flight".
+ */
+int obd_get_request_slot(struct client_obd *cli)
+{
+	struct obd_request_slot_waiter orsw;
+	int rc;
+
+	spin_lock(&cli->cl_loi_list_lock);
+	if (cli->cl_rpcs_in_flight < cli->cl_max_rpcs_in_flight) {
+		cli->cl_rpcs_in_flight++;
+		spin_unlock(&cli->cl_loi_list_lock);
+		return 0;
+	}
+
+	init_waitqueue_head(&orsw.orsw_waitq);
+	list_add_tail(&orsw.orsw_entry, &cli->cl_flight_waiters);
+	orsw.orsw_signaled = false;
+	spin_unlock(&cli->cl_loi_list_lock);
+
+	rc = l_wait_event_abortable(orsw.orsw_waitq,
+				    obd_request_slot_avail(cli, &orsw) ||
+				    orsw.orsw_signaled);
+
+	/* Here we must take the lock so that the on-stack 'orsw' cannot
+	 * be freed while someone else (such as obd_put_request_slot()) is
+	 * still using it. */
+	spin_lock(&cli->cl_loi_list_lock);
+	if (rc != 0) {
+		if (!orsw.orsw_signaled) {
+			if (list_empty(&orsw.orsw_entry))
+				cli->cl_rpcs_in_flight--;
+			else
+				list_del(&orsw.orsw_entry);
+		}
+		rc = -EINTR;
+	}
+
+	if (orsw.orsw_signaled) {
+		LASSERT(list_empty(&orsw.orsw_entry));
+
+		rc = -EINTR;
+	}
+	spin_unlock(&cli->cl_loi_list_lock);
+
+	return rc;
+}
+EXPORT_SYMBOL(obd_get_request_slot);
+
+void obd_put_request_slot(struct client_obd *cli)
+{
+	struct obd_request_slot_waiter *orsw;
+
+	spin_lock(&cli->cl_loi_list_lock);
+	cli->cl_rpcs_in_flight--;
+
+	/* If there is a free slot, wake up the first waiter.
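+	 * Waiters are queued and woken in FIFO order, so the oldest
+	 * waiter gets the slot first.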
*/ + if (!list_empty(&cli->cl_flight_waiters) && + likely(cli->cl_rpcs_in_flight < cli->cl_max_rpcs_in_flight)) { + orsw = list_first_entry(&cli->cl_flight_waiters, + struct obd_request_slot_waiter, + orsw_entry); + list_del_init(&orsw->orsw_entry); + cli->cl_rpcs_in_flight++; + wake_up(&orsw->orsw_waitq); + } + spin_unlock(&cli->cl_loi_list_lock); +} +EXPORT_SYMBOL(obd_put_request_slot); + +__u32 obd_get_max_rpcs_in_flight(struct client_obd *cli) +{ + return cli->cl_max_rpcs_in_flight; +} +EXPORT_SYMBOL(obd_get_max_rpcs_in_flight); + +int obd_set_max_rpcs_in_flight(struct client_obd *cli, __u32 max) +{ + struct obd_request_slot_waiter *orsw; + __u32 old; + int diff; + int i; + int rc; + + if (max > OBD_MAX_RIF_MAX || max < 1) + return -ERANGE; + + CDEBUG(D_INFO, "%s: max = %hu max_mod = %u rif = %u\n", + cli->cl_import->imp_obd->obd_name, max, + cli->cl_max_mod_rpcs_in_flight, cli->cl_max_rpcs_in_flight); + + if (strcmp(cli->cl_import->imp_obd->obd_type->typ_name, + LUSTRE_MDC_NAME) == 0) { + /* adjust max_mod_rpcs_in_flight to ensure it is always + * strictly lower that max_rpcs_in_flight */ + if (max < 2) { + CERROR("%s: cannot set mdc.*.max_rpcs_in_flight=1\n", + cli->cl_import->imp_obd->obd_name); + return -ERANGE; + } + if (max <= cli->cl_max_mod_rpcs_in_flight) { + rc = obd_set_max_mod_rpcs_in_flight(cli, max - 1); + if (rc != 0) + return rc; + } + } + + spin_lock(&cli->cl_loi_list_lock); + old = cli->cl_max_rpcs_in_flight; + cli->cl_max_rpcs_in_flight = max; + client_adjust_max_dirty(cli); + + diff = max - old; + + /* We increase the max_rpcs_in_flight, then wakeup some waiters. */ + for (i = 0; i < diff; i++) { + orsw = list_first_entry_or_null(&cli->cl_loi_read_list, + struct obd_request_slot_waiter, + orsw_entry); + if (!orsw) + break; + + list_del_init(&orsw->orsw_entry); + cli->cl_rpcs_in_flight++; + wake_up(&orsw->orsw_waitq); + } + spin_unlock(&cli->cl_loi_list_lock); + + return 0; +} +EXPORT_SYMBOL(obd_set_max_rpcs_in_flight); + +__u16 obd_get_max_mod_rpcs_in_flight(struct client_obd *cli) +{ + return cli->cl_max_mod_rpcs_in_flight; +} +EXPORT_SYMBOL(obd_get_max_mod_rpcs_in_flight); + +int obd_set_max_mod_rpcs_in_flight(struct client_obd *cli, __u16 max) +{ + struct obd_connect_data *ocd; + __u16 maxmodrpcs; + __u16 prev; + + if (max > OBD_MAX_RIF_MAX || max < 1) + return -ERANGE; + + ocd = &cli->cl_import->imp_connect_data; + CDEBUG(D_INFO, "%s: max = %hu flags = %llx, max_mod = %u rif = %u\n", + cli->cl_import->imp_obd->obd_name, max, ocd->ocd_connect_flags, + ocd->ocd_maxmodrpcs, cli->cl_max_rpcs_in_flight); + + if (max == OBD_MAX_RIF_MAX) + max = OBD_MAX_RIF_MAX - 1; + + /* Cannot exceed or equal max_rpcs_in_flight. If we are asked to + * increase this value, also bump up max_rpcs_in_flight to match. + */ + if (max >= cli->cl_max_rpcs_in_flight) { + CDEBUG(D_INFO, + "%s: increasing max_rpcs_in_flight=%hu to allow larger max_mod_rpcs_in_flight=%u\n", + cli->cl_import->imp_obd->obd_name, max + 1, max); + obd_set_max_rpcs_in_flight(cli, max + 1); + } + + /* cannot exceed max modify RPCs in flight supported by the server, + * but verify ocd_connect_flags is at least initialized first. If + * not, allow it and fix value later in ptlrpc_connect_set_flags(). 
+ */ + if (!ocd->ocd_connect_flags) { + maxmodrpcs = cli->cl_max_rpcs_in_flight - 1; + } else if (ocd->ocd_connect_flags & OBD_CONNECT_MULTIMODRPCS) { + maxmodrpcs = ocd->ocd_maxmodrpcs; + if (maxmodrpcs == 0) { /* connection not finished yet */ + maxmodrpcs = cli->cl_max_rpcs_in_flight - 1; + CDEBUG(D_INFO, + "%s: partial connect, assume maxmodrpcs=%hu\n", + cli->cl_import->imp_obd->obd_name, maxmodrpcs); + } + } else { + maxmodrpcs = 1; + } + if (max > maxmodrpcs) { + CERROR("%s: can't set max_mod_rpcs_in_flight=%hu higher than ocd_maxmodrpcs=%hu returned by the server at connection\n", + cli->cl_import->imp_obd->obd_name, + max, maxmodrpcs); + return -ERANGE; + } + + spin_lock(&cli->cl_mod_rpcs_lock); + + prev = cli->cl_max_mod_rpcs_in_flight; + cli->cl_max_mod_rpcs_in_flight = max; + + /* wakeup waiters if limit has been increased */ + if (cli->cl_max_mod_rpcs_in_flight > prev) + wake_up(&cli->cl_mod_rpcs_waitq); + + spin_unlock(&cli->cl_mod_rpcs_lock); + + return 0; +} +EXPORT_SYMBOL(obd_set_max_mod_rpcs_in_flight); + +int obd_mod_rpc_stats_seq_show(struct client_obd *cli, + struct seq_file *seq) +{ + unsigned long mod_tot = 0, mod_cum; + int i; + + spin_lock(&cli->cl_mod_rpcs_lock); + lprocfs_stats_header(seq, ktime_get_real(), cli->cl_mod_rpcs_init, 25, + ":", true, ""); + seq_printf(seq, "modify_RPCs_in_flight: %hu\n", + cli->cl_mod_rpcs_in_flight); + + seq_printf(seq, "\n\t\t\tmodify\n"); + seq_printf(seq, "rpcs in flight rpcs %% cum %%\n"); + + mod_tot = lprocfs_oh_sum(&cli->cl_mod_rpcs_hist); + + mod_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long mod = cli->cl_mod_rpcs_hist.oh_buckets[i]; + + mod_cum += mod; + seq_printf(seq, "%d:\t\t%10lu %3u %3u\n", + i, mod, pct(mod, mod_tot), + pct(mod_cum, mod_tot)); + if (mod_cum == mod_tot) + break; + } + + spin_unlock(&cli->cl_mod_rpcs_lock); + + return 0; +} +EXPORT_SYMBOL(obd_mod_rpc_stats_seq_show); + +/* The number of modify RPCs sent in parallel is limited + * because the server has a finite number of slots per client to + * store request result and ensure reply reconstruction when needed. + * On the client, this limit is stored in cl_max_mod_rpcs_in_flight + * that takes into account server limit and cl_max_rpcs_in_flight + * value. + * On the MDC client, to avoid a potential deadlock (see Bugzilla 3462), + * one close request is allowed above the maximum. + */ +static inline bool obd_mod_rpc_slot_avail_locked(struct client_obd *cli, + bool close_req) +{ + bool avail; + + /* A slot is available if + * - number of modify RPCs in flight is less than the max + * - it's a close RPC and no other close request is in flight + */ + avail = cli->cl_mod_rpcs_in_flight < cli->cl_max_mod_rpcs_in_flight || + (close_req && cli->cl_close_rpcs_in_flight == 0); + + return avail; +} + +static inline bool obd_mod_rpc_slot_avail(struct client_obd *cli, + bool close_req) +{ + bool avail; + + spin_lock(&cli->cl_mod_rpcs_lock); + avail = obd_mod_rpc_slot_avail_locked(cli, close_req); + spin_unlock(&cli->cl_mod_rpcs_lock); + return avail; +} + + +/* Get a modify RPC slot from the obd client @cli according + * to the kind of operation @opc that is going to be sent + * and the intent @it of the operation if it applies. + * If the maximum number of modify RPCs in flight is reached + * the thread is put to sleep. + * Returns the tag to be set in the request message. Tag 0 + * is reserved for non-modifying requests. 
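+ * Valid tags start at 1; bit (tag - 1) is tracked in cl_mod_tag_bitmap.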
+ */ +__u16 obd_get_mod_rpc_slot(struct client_obd *cli, __u32 opc) +{ + bool close_req = false; + __u16 i, max; + + if (opc == MDS_CLOSE) + close_req = true; + + do { + spin_lock(&cli->cl_mod_rpcs_lock); + max = cli->cl_max_mod_rpcs_in_flight; + if (obd_mod_rpc_slot_avail_locked(cli, close_req)) { + /* there is a slot available */ + cli->cl_mod_rpcs_in_flight++; + if (close_req) + cli->cl_close_rpcs_in_flight++; + lprocfs_oh_tally(&cli->cl_mod_rpcs_hist, + cli->cl_mod_rpcs_in_flight); + /* find a free tag */ + i = find_first_zero_bit(cli->cl_mod_tag_bitmap, + max + 1); + LASSERT(i < OBD_MAX_RIF_MAX); + LASSERT(!test_and_set_bit(i, cli->cl_mod_tag_bitmap)); + spin_unlock(&cli->cl_mod_rpcs_lock); + /* tag 0 is reserved for non-modify RPCs */ + + CDEBUG(D_RPCTRACE, + "%s: modify RPC slot %u is allocated opc %u, max %hu\n", + cli->cl_import->imp_obd->obd_name, + i + 1, opc, max); + + return i + 1; + } + spin_unlock(&cli->cl_mod_rpcs_lock); + + CDEBUG(D_RPCTRACE, "%s: sleeping for a modify RPC slot " + "opc %u, max %hu\n", + cli->cl_import->imp_obd->obd_name, opc, max); + + wait_event_idle_exclusive(cli->cl_mod_rpcs_waitq, + obd_mod_rpc_slot_avail(cli, + close_req)); + } while (true); +} +EXPORT_SYMBOL(obd_get_mod_rpc_slot); + +/* Put a modify RPC slot from the obd client @cli according + * to the kind of operation @opc that has been sent. + */ +void obd_put_mod_rpc_slot(struct client_obd *cli, __u32 opc, __u16 tag) +{ + bool close_req = false; + + if (tag == 0) + return; + + if (opc == MDS_CLOSE) + close_req = true; + + spin_lock(&cli->cl_mod_rpcs_lock); + cli->cl_mod_rpcs_in_flight--; + if (close_req) + cli->cl_close_rpcs_in_flight--; + /* release the tag in the bitmap */ + LASSERT(tag - 1 < OBD_MAX_RIF_MAX); + LASSERT(test_and_clear_bit(tag - 1, cli->cl_mod_tag_bitmap) != 0); + spin_unlock(&cli->cl_mod_rpcs_lock); + /* LU-14741 - to prevent close RPCs stuck behind normal ones */ + if (close_req) + wake_up_all(&cli->cl_mod_rpcs_waitq); + else + wake_up(&cli->cl_mod_rpcs_waitq); +} +EXPORT_SYMBOL(obd_put_mod_rpc_slot); + diff --git a/drivers/staging/lustrefsx/lustre/obdclass/idmap.c b/drivers/staging/lustrefsx/lustre/obdclass/idmap.c new file mode 100644 index 0000000000000..b89a6d2e86a61 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/idmap.c @@ -0,0 +1,161 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/idmap.c + * + * Lustre user identity mapping. 
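+ * Helpers for searching, sorting and testing supplementary group lists.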
+ * + * Author: Fan Yong + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include +#include + +#include +#include +#include +#include + +/* + * groups_search() is copied from linux kernel! + * A simple bsearch. + */ +static int lustre_groups_search(struct group_info *group_info, + gid_t grp) +{ + int left, right; + + if (!group_info) + return 0; + + left = 0; + right = group_info->ngroups; + while (left < right) { + int mid = (left + right) / 2; + int cmp = grp - + from_kgid(&init_user_ns, CFS_GROUP_AT(group_info, mid)); + + if (cmp > 0) + left = mid + 1; + else if (cmp < 0) + right = mid; + else + return 1; + } + return 0; +} + +void lustre_groups_from_list(struct group_info *ginfo, gid_t *glist) +{ +#ifdef HAVE_GROUP_INFO_GID + memcpy(ginfo->gid, glist, ginfo->ngroups * sizeof(__u32)); +#else + int i; + int count = ginfo->ngroups; + + /* fill group_info from gid array */ + for (i = 0; i < ginfo->nblocks && count > 0; i++) { + int cp_count = min(CFS_NGROUPS_PER_BLOCK, count); + int off = i * CFS_NGROUPS_PER_BLOCK; + int len = cp_count * sizeof(*glist); + + memcpy(ginfo->blocks[i], glist + off, len); + count -= cp_count; + } +#endif +} +EXPORT_SYMBOL(lustre_groups_from_list); + +/* groups_sort() is copied from linux kernel! */ +/* a simple shell-metzner sort */ +void lustre_groups_sort(struct group_info *group_info) +{ + int base, max, stride; + int gidsetsize = group_info->ngroups; + + for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1) + ; /* nothing */ + stride /= 3; + + while (stride) { + max = gidsetsize - stride; + for (base = 0; base < max; base++) { + int left = base; + int right = left + stride; + gid_t tmp = from_kgid(&init_user_ns, + CFS_GROUP_AT(group_info, right)); + + while (left >= 0 && + tmp < from_kgid(&init_user_ns, + CFS_GROUP_AT(group_info, left))) { + CFS_GROUP_AT(group_info, right) = + CFS_GROUP_AT(group_info, left); + right = left; + left -= stride; + } + CFS_GROUP_AT(group_info, right) = + make_kgid(&init_user_ns, tmp); + } + stride /= 3; + } +} +EXPORT_SYMBOL(lustre_groups_sort); + +int lustre_in_group_p(struct lu_ucred *mu, gid_t grp) +{ + int rc = 1; + + if (grp != mu->uc_fsgid) { + struct group_info *group_info = NULL; + + if (mu->uc_ginfo || !mu->uc_identity || + mu->uc_valid == UCRED_OLD) + if (grp == mu->uc_suppgids[0] || + grp == mu->uc_suppgids[1]) + return 1; + + if (mu->uc_ginfo) + group_info = mu->uc_ginfo; + else if (mu->uc_identity) + group_info = mu->uc_identity->mi_ginfo; + + if (!group_info) + return 0; + + atomic_inc(&group_info->usage); + rc = lustre_groups_search(group_info, grp); + if (atomic_dec_and_test(&group_info->usage)) + groups_free(group_info); + } + return rc; +} +EXPORT_SYMBOL(lustre_in_group_p); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/integrity.c b/drivers/staging/lustrefsx/lustre/obdclass/integrity.c new file mode 100644 index 0000000000000..1ccec8a93985d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/integrity.c @@ -0,0 +1,277 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
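As an aside, a self-contained sketch of the helper pipeline above: build a group_info from a raw gid list, sort it so the bsearch works, then test membership the way lustre_in_group_p() does. groups_alloc()/groups_free() are the stock kernel allocators; the gid values are made up, and lustre_groups_search() is static, so this would only compile inside idmap.c.

static int demo_check_membership(void)
{
        gid_t glist[] = { 1000, 4, 27 };        /* arbitrary sample gids */
        struct group_info *ginfo;
        int found;

        ginfo = groups_alloc(ARRAY_SIZE(glist));
        if (!ginfo)
                return -ENOMEM;

        lustre_groups_from_list(ginfo, glist);
        /* lustre_groups_search() is a bsearch: input must be sorted */
        lustre_groups_sort(ginfo);
        found = lustre_groups_search(ginfo, 27);        /* 1 == member */

        groups_free(ginfo);
        return found ? 0 : -ENOENT;
}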
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2018, DataDirect Networks Storage. + * Author: Li Xi. + * + * General data integrity functions + */ +#include +#include +#include +#include +#include + +#if IS_ENABLED(CONFIG_CRC_T10DIF) +__be16 obd_dif_crc_fn(void *data, unsigned int len) +{ + return cpu_to_be16(crc_t10dif(data, len)); +} +EXPORT_SYMBOL(obd_dif_crc_fn); + +__be16 obd_dif_ip_fn(void *data, unsigned int len) +{ + return (__force __be16)ip_compute_csum(data, len); +} +EXPORT_SYMBOL(obd_dif_ip_fn); + +int obd_page_dif_generate_buffer(const char *obd_name, struct page *page, + __u32 offset, __u32 length, + __be16 *guard_start, int guard_number, + int *used_number, int sector_size, + obd_dif_csum_fn *fn) +{ + unsigned int i = offset; + unsigned int end = offset + length; + char *data_buf; + __be16 *guard_buf = guard_start; + unsigned int data_size; + int used = 0; + + data_buf = kmap(page) + offset; + while (i < end) { + if (used >= guard_number) { + CERROR("%s: unexpected used guard number of DIF %u/%u, " + "data length %u, sector size %u: rc = %d\n", + obd_name, used, guard_number, length, + sector_size, -E2BIG); + return -E2BIG; + } + data_size = min(round_up(i + 1, sector_size), end) - i; + *guard_buf = fn(data_buf, data_size); + guard_buf++; + data_buf += data_size; + i += data_size; + used++; + } + kunmap(page); + *used_number = used; + + return 0; +} +EXPORT_SYMBOL(obd_page_dif_generate_buffer); + +static int __obd_t10_performance_test(const char *obd_name, + enum cksum_types cksum_type, + struct page *data_page, + int repeat_number) +{ + unsigned char cfs_alg = cksum_obd2cfs(OBD_CKSUM_T10_TOP); + struct ahash_request *req; + obd_dif_csum_fn *fn = NULL; + unsigned int bufsize; + unsigned char *buffer; + struct page *__page; + __be16 *guard_start; + int guard_number; + int used_number = 0; + int sector_size = 0; + __u32 cksum; + int rc = 0; + int rc2; + int used; + int i; + + obd_t10_cksum2dif(cksum_type, &fn, §or_size); + if (!fn) + return -EINVAL; + + __page = alloc_page(GFP_KERNEL); + if (__page == NULL) + return -ENOMEM; + + req = cfs_crypto_hash_init(cfs_alg, NULL, 0); + if (IS_ERR(req)) { + rc = PTR_ERR(req); + CERROR("%s: unable to initialize checksum hash %s: rc = %d\n", + obd_name, cfs_crypto_hash_name(cfs_alg), rc); + GOTO(out, rc); + } + + buffer = kmap(__page); + guard_start = (__be16 *)buffer; + guard_number = PAGE_SIZE / sizeof(*guard_start); + for (i = 0; i < repeat_number; i++) { + /* + * The left guard number should be able to hold checksums of a + * whole page + */ + rc = obd_page_dif_generate_buffer(obd_name, data_page, 0, + PAGE_SIZE, + guard_start + used_number, + guard_number - used_number, + &used, sector_size, fn); + if (rc) + break; + + used_number += used; + if (used_number == guard_number) { + cfs_crypto_hash_update_page(req, __page, 0, + used_number * sizeof(*guard_start)); + used_number = 0; + } + } + kunmap(__page); + if (rc) + GOTO(out_final, rc); + + if (used_number != 0) + cfs_crypto_hash_update_page(req, __page, 0, + used_number * sizeof(*guard_start)); + + bufsize = sizeof(cksum); +out_final: + rc2 = cfs_crypto_hash_final(req, (unsigned char *)&cksum, &bufsize); + rc = rc ? 
rc : rc2; +out: + __free_page(__page); + + return rc; +} + +/** + * Array of T10PI checksum algorithm speed in MByte per second + */ +static int obd_t10_cksum_speeds[OBD_T10_CKSUM_MAX]; + +static enum obd_t10_cksum_type +obd_t10_cksum2type(enum cksum_types cksum_type) +{ + switch (cksum_type) { + case OBD_CKSUM_T10IP512: + return OBD_T10_CKSUM_IP512; + case OBD_CKSUM_T10IP4K: + return OBD_T10_CKSUM_IP4K; + case OBD_CKSUM_T10CRC512: + return OBD_T10_CKSUM_CRC512; + case OBD_CKSUM_T10CRC4K: + return OBD_T10_CKSUM_CRC4K; + default: + return OBD_T10_CKSUM_UNKNOWN; + } +} + +static const char *obd_t10_cksum_name(enum obd_t10_cksum_type index) +{ + DECLARE_CKSUM_NAME; + + /* Need to skip "crc32", "adler", "crc32c", "reserved" */ + return cksum_name[3 + index]; +} + +/** + * Compute the speed of specified T10PI checksum type + * + * Run a speed test on the given T10PI checksum on buffer using a 1MB buffer + * size. This is a reasonable buffer size for Lustre RPCs, even if the actual + * RPC size is larger or smaller. + * + * The speed is stored internally in the obd_t10_cksum_speeds[] array, and + * is available through the obd_t10_cksum_speed() function. + * + * This function needs to stay the same as cfs_crypto_performance_test() so + * that the speeds are comparable. And this function should reflect the real + * cost of the checksum calculation. + * + * \param[in] obd_name name of the OBD device + * \param[in] cksum_type checksum type (OBD_CKSUM_T10*) + */ +static void obd_t10_performance_test(const char *obd_name, + enum cksum_types cksum_type) +{ + enum obd_t10_cksum_type index = obd_t10_cksum2type(cksum_type); + const int buf_len = max(PAGE_SIZE, 1048576UL); + unsigned long bcount; + unsigned long start; + unsigned long end; + struct page *page; + int rc = 0; + void *buf; + + page = alloc_page(GFP_KERNEL); + if (page == NULL) { + rc = -ENOMEM; + goto out; + } + + buf = kmap(page); + memset(buf, 0xAD, PAGE_SIZE); + kunmap(page); + + for (start = jiffies, end = start + cfs_time_seconds(1) / 4, + bcount = 0; time_before(jiffies, end) && rc == 0; bcount++) { + rc = __obd_t10_performance_test(obd_name, cksum_type, page, + buf_len >> PAGE_SHIFT); + if (rc) + break; + } + end = jiffies; + __free_page(page); +out: + if (rc) { + obd_t10_cksum_speeds[index] = rc; + CDEBUG(D_INFO, "%s: T10 checksum algorithm %s test error: " + "rc = %d\n", obd_name, obd_t10_cksum_name(index), rc); + } else { + unsigned long tmp; + + tmp = ((bcount * buf_len / jiffies_to_msecs(end - start)) * + 1000) / (1024 * 1024); + obd_t10_cksum_speeds[index] = (int)tmp; + CDEBUG(D_CONFIG, "%s: T10 checksum algorithm %s speed = %d " + "MB/s\n", obd_name, obd_t10_cksum_name(index), + obd_t10_cksum_speeds[index]); + } +} +#endif /* CONFIG_CRC_T10DIF */ + +int obd_t10_cksum_speed(const char *obd_name, + enum cksum_types cksum_type) +{ +#if IS_ENABLED(CONFIG_CRC_T10DIF) + enum obd_t10_cksum_type index = obd_t10_cksum2type(cksum_type); + + if (unlikely(obd_t10_cksum_speeds[index] == 0)) { + static DEFINE_MUTEX(obd_t10_cksum_speed_mutex); + + mutex_lock(&obd_t10_cksum_speed_mutex); + if (obd_t10_cksum_speeds[index] == 0) + obd_t10_performance_test(obd_name, cksum_type); + mutex_unlock(&obd_t10_cksum_speed_mutex); + } + + return obd_t10_cksum_speeds[index]; +#else /* !CONFIG_CRC_T10DIF */ + return 0; +#endif /* !CONFIG_CRC_T10DIF */ +} +EXPORT_SYMBOL(obd_t10_cksum_speed); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/interval_tree.c b/drivers/staging/lustrefsx/lustre/obdclass/interval_tree.c new file mode 100644 index 
0000000000000..6007d37f61b5d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/interval_tree.c @@ -0,0 +1,772 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ldlm/interval_tree.c + * + * Interval tree library used by ldlm extent lock code + * + * Author: Huang Wei + * Author: Jay Xiong + */ + +#include +#include + +enum { + INTERVAL_RED = 0, + INTERVAL_BLACK = 1 +}; + +static inline int node_is_left_child(struct interval_node *node) +{ + LASSERT(node->in_parent != NULL); + return node == node->in_parent->in_left; +} + +static inline int node_is_right_child(struct interval_node *node) +{ + LASSERT(node->in_parent != NULL); + return node == node->in_parent->in_right; +} + +static inline int node_is_red(struct interval_node *node) +{ + return node->in_color == INTERVAL_RED; +} + +static inline int node_is_black(struct interval_node *node) +{ + return node->in_color == INTERVAL_BLACK; +} + +static inline int extent_compare(struct interval_node_extent *e1, + struct interval_node_extent *e2) +{ + int rc; + + if (e1->start == e2->start) { + if (e1->end < e2->end) + rc = -1; + else if (e1->end > e2->end) + rc = 1; + else + rc = 0; + } else { + if (e1->start < e2->start) + rc = -1; + else + rc = 1; + } + return rc; +} + +static inline int extent_equal(struct interval_node_extent *e1, + struct interval_node_extent *e2) +{ + return (e1->start == e2->start) && (e1->end == e2->end); +} + +static inline int extent_overlapped(struct interval_node_extent *e1, + struct interval_node_extent *e2) +{ + return (e1->start <= e2->end) && (e2->start <= e1->end); +} + +static inline int node_compare(struct interval_node *n1, + struct interval_node *n2) +{ + return extent_compare(&n1->in_extent, &n2->in_extent); +} + +int node_equal(struct interval_node *n1, struct interval_node *n2) +{ + return extent_equal(&n1->in_extent, &n2->in_extent); +} + +#define interval_for_each(node, root) \ +for (node = interval_first(root); node != NULL; \ + node = interval_next(node)) + +#define interval_for_each_reverse(node, root) \ +for (node = interval_last(root); node != NULL; \ + node = interval_prev(node)) + +static struct interval_node *interval_first(struct interval_node *node) +{ + ENTRY; + + if (!node) + RETURN(NULL); + while (node->in_left) + node = node->in_left; + RETURN(node); +} + +static struct interval_node *interval_last(struct interval_node *node) +{ + ENTRY; + + if (!node) + RETURN(NULL); + while (node->in_right) + node = node->in_right; + RETURN(node); +} + +static struct interval_node 
*interval_next(struct interval_node *node) +{ + ENTRY; + + if (!node) + RETURN(NULL); + if (node->in_right) + RETURN(interval_first(node->in_right)); + while (node->in_parent && node_is_right_child(node)) + node = node->in_parent; + RETURN(node->in_parent); +} + +static struct interval_node *interval_prev(struct interval_node *node) +{ + ENTRY; + + if (!node) + RETURN(NULL); + + if (node->in_left) + RETURN(interval_last(node->in_left)); + + while (node->in_parent && node_is_left_child(node)) + node = node->in_parent; + + RETURN(node->in_parent); +} + +enum interval_iter interval_iterate(struct interval_node *root, + interval_callback_t func, + void *data) +{ + struct interval_node *node; + enum interval_iter rc = INTERVAL_ITER_CONT; + + ENTRY; + + interval_for_each(node, root) { + rc = func(node, data); + if (rc == INTERVAL_ITER_STOP) + break; + } + + RETURN(rc); +} +EXPORT_SYMBOL(interval_iterate); + +enum interval_iter interval_iterate_reverse(struct interval_node *root, + interval_callback_t func, + void *data) +{ + struct interval_node *node; + enum interval_iter rc = INTERVAL_ITER_CONT; + + ENTRY; + + interval_for_each_reverse(node, root) { + rc = func(node, data); + if (rc == INTERVAL_ITER_STOP) + break; + } + + RETURN(rc); +} +EXPORT_SYMBOL(interval_iterate_reverse); + +/* try to find a node with same interval in the tree, + * if found, return the pointer to the node, otherwise return NULL + */ +struct interval_node *interval_find(struct interval_node *root, + struct interval_node_extent *ex) +{ + struct interval_node *walk = root; + int rc; + + ENTRY; + + while (walk) { + rc = extent_compare(ex, &walk->in_extent); + if (rc == 0) + break; + else if (rc < 0) + walk = walk->in_left; + else + walk = walk->in_right; + } + + RETURN(walk); +} +EXPORT_SYMBOL(interval_find); + +static void __rotate_change_maxhigh(struct interval_node *node, + struct interval_node *rotate) +{ + __u64 left_max, right_max; + + rotate->in_max_high = node->in_max_high; + left_max = node->in_left ? node->in_left->in_max_high : 0; + right_max = node->in_right ? node->in_right->in_max_high : 0; + node->in_max_high = max3(interval_high(node), + left_max, right_max); +} + +/* The left rotation "pivots" around the link from node to node->right, and + * - node will be linked to node->right's left child, and + * - node->right's left child will be linked to node's right child. + */ +static void __rotate_left(struct interval_node *node, + struct interval_node **root) +{ + struct interval_node *right = node->in_right; + struct interval_node *parent = node->in_parent; + + node->in_right = right->in_left; + if (node->in_right) + right->in_left->in_parent = node; + + right->in_left = node; + right->in_parent = parent; + if (parent) { + if (node_is_left_child(node)) + parent->in_left = right; + else + parent->in_right = right; + } else { + *root = right; + } + node->in_parent = right; + + /* update max_high for node and right */ + __rotate_change_maxhigh(node, right); +} + +/* The right rotation "pivots" around the link from node to node->left, and + * - node will be linked to node->left's right child, and + * - node->left's right child will be linked to node's left child. 
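A small illustration of the traversal API defined above, assuming nothing beyond it: interval_iterate() visits nodes in ascending extent order and the callback steers the walk via INTERVAL_ITER_CONT/INTERVAL_ITER_STOP. The callback and argument names are invented for the example.

struct count_arg {
        int nr;
};

static enum interval_iter count_cb(struct interval_node *node, void *data)
{
        struct count_arg *arg = data;

        arg->nr++;
        /* returning INTERVAL_ITER_STOP here would abort the walk early */
        return INTERVAL_ITER_CONT;
}

static int demo_count_nodes(struct interval_node *root)
{
        struct count_arg arg = { .nr = 0 };

        interval_iterate(root, count_cb, &arg); /* in-order traversal */
        return arg.nr;
}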
+ */ +static void __rotate_right(struct interval_node *node, + struct interval_node **root) +{ + struct interval_node *left = node->in_left; + struct interval_node *parent = node->in_parent; + + node->in_left = left->in_right; + if (node->in_left) + left->in_right->in_parent = node; + left->in_right = node; + + left->in_parent = parent; + if (parent) { + if (node_is_right_child(node)) + parent->in_right = left; + else + parent->in_left = left; + } else { + *root = left; + } + node->in_parent = left; + + /* update max_high for node and left */ + __rotate_change_maxhigh(node, left); +} + +#define interval_swap(a, b) do { \ + struct interval_node *c = a; a = b; b = c; \ +} while (0) + +/* + * Operations INSERT and DELETE, when run on a tree with n keys, + * take O(logN) time.Because they modify the tree, the result + * may violate the red-black properties.To restore these properties, + * we must change the colors of some of the nodes in the tree + * and also change the pointer structure. + */ +static void interval_insert_color(struct interval_node *node, + struct interval_node **root) +{ + struct interval_node *parent, *gparent; + + ENTRY; + + while ((parent = node->in_parent) && node_is_red(parent)) { + gparent = parent->in_parent; + /* Parent is RED, so gparent must not be NULL */ + if (node_is_left_child(parent)) { + struct interval_node *uncle; + + uncle = gparent->in_right; + if (uncle && node_is_red(uncle)) { + uncle->in_color = INTERVAL_BLACK; + parent->in_color = INTERVAL_BLACK; + gparent->in_color = INTERVAL_RED; + node = gparent; + continue; + } + + if (parent->in_right == node) { + __rotate_left(parent, root); + interval_swap(node, parent); + } + + parent->in_color = INTERVAL_BLACK; + gparent->in_color = INTERVAL_RED; + __rotate_right(gparent, root); + } else { + struct interval_node *uncle; + + uncle = gparent->in_left; + if (uncle && node_is_red(uncle)) { + uncle->in_color = INTERVAL_BLACK; + parent->in_color = INTERVAL_BLACK; + gparent->in_color = INTERVAL_RED; + node = gparent; + continue; + } + + if (node_is_left_child(node)) { + __rotate_right(parent, root); + interval_swap(node, parent); + } + + parent->in_color = INTERVAL_BLACK; + gparent->in_color = INTERVAL_RED; + __rotate_left(gparent, root); + } + } + + (*root)->in_color = INTERVAL_BLACK; + EXIT; +} + +struct interval_node *interval_insert(struct interval_node *node, + struct interval_node **root) +{ + struct interval_node **p, *parent = NULL; + + ENTRY; + + LASSERT(!interval_is_intree(node)); + p = root; + while (*p) { + parent = *p; + if (node_equal(parent, node)) + RETURN(parent); + + /* max_high field must be updated after each iteration */ + if (parent->in_max_high < interval_high(node)) + parent->in_max_high = interval_high(node); + + if (node_compare(node, parent) < 0) + p = &parent->in_left; + else + p = &parent->in_right; + } + + /* link node into the tree */ + node->in_parent = parent; + node->in_color = INTERVAL_RED; + node->in_left = node->in_right = NULL; + *p = node; + + interval_insert_color(node, root); + node->in_intree = 1; + + RETURN(NULL); +} +EXPORT_SYMBOL(interval_insert); + +static inline int node_is_black_or_0(struct interval_node *node) +{ + return !node || node_is_black(node); +} + +static void interval_erase_color(struct interval_node *node, + struct interval_node *parent, + struct interval_node **root) +{ + struct interval_node *tmp; + + ENTRY; + + while (node_is_black_or_0(node) && node != *root) { + if (parent->in_left == node) { + tmp = parent->in_right; + if (node_is_red(tmp)) { + 
tmp->in_color = INTERVAL_BLACK; + parent->in_color = INTERVAL_RED; + __rotate_left(parent, root); + tmp = parent->in_right; + } + if (node_is_black_or_0(tmp->in_left) && + node_is_black_or_0(tmp->in_right)) { + tmp->in_color = INTERVAL_RED; + node = parent; + parent = node->in_parent; + } else { + if (node_is_black_or_0(tmp->in_right)) { + struct interval_node *o_left; + + if ((o_left = tmp->in_left)) + o_left->in_color = + INTERVAL_BLACK; + tmp->in_color = INTERVAL_RED; + __rotate_right(tmp, root); + tmp = parent->in_right; + } + tmp->in_color = parent->in_color; + parent->in_color = INTERVAL_BLACK; + if (tmp->in_right) + tmp->in_right->in_color = + INTERVAL_BLACK; + __rotate_left(parent, root); + node = *root; + break; + } + } else { + tmp = parent->in_left; + if (node_is_red(tmp)) { + tmp->in_color = INTERVAL_BLACK; + parent->in_color = INTERVAL_RED; + __rotate_right(parent, root); + tmp = parent->in_left; + } + if (node_is_black_or_0(tmp->in_left) && + node_is_black_or_0(tmp->in_right)) { + tmp->in_color = INTERVAL_RED; + node = parent; + parent = node->in_parent; + } else { + if (node_is_black_or_0(tmp->in_left)) { + struct interval_node *o_right; + + if ((o_right = tmp->in_right)) + o_right->in_color = + INTERVAL_BLACK; + tmp->in_color = INTERVAL_RED; + __rotate_left(tmp, root); + tmp = parent->in_left; + } + tmp->in_color = parent->in_color; + parent->in_color = INTERVAL_BLACK; + if (tmp->in_left) + tmp->in_left->in_color = INTERVAL_BLACK; + __rotate_right(parent, root); + node = *root; + break; + } + } + } + if (node) + node->in_color = INTERVAL_BLACK; + EXIT; +} + +/* + * if the @max_high value of @node is changed, this function traverse a path + * from node up to the root to update max_high for the whole tree. + */ +static void update_maxhigh(struct interval_node *node, + __u64 old_maxhigh) +{ + __u64 left_max, right_max; + + ENTRY; + + while (node) { + left_max = node->in_left ? node->in_left->in_max_high : 0; + right_max = node->in_right ? node->in_right->in_max_high : 0; + node->in_max_high = max3(interval_high(node), + left_max, right_max); + + if (node->in_max_high >= old_maxhigh) + break; + node = node->in_parent; + } + EXIT; +} + +void interval_erase(struct interval_node *node, + struct interval_node **root) +{ + struct interval_node *child, *parent; + int color; + + ENTRY; + + LASSERT(interval_is_intree(node)); + node->in_intree = 0; + if (!node->in_left) { + child = node->in_right; + } else if (!node->in_right) { + child = node->in_left; + } else { /* Both left and right child are not NULL */ + struct interval_node *old = node; + + node = interval_next(node); + child = node->in_right; + parent = node->in_parent; + color = node->in_color; + + if (child) + child->in_parent = parent; + if (parent == old) + parent->in_right = child; + else + parent->in_left = child; + + node->in_color = old->in_color; + node->in_right = old->in_right; + node->in_left = old->in_left; + node->in_parent = old->in_parent; + + if (old->in_parent) { + if (node_is_left_child(old)) + old->in_parent->in_left = node; + else + old->in_parent->in_right = node; + } else { + *root = node; + } + + old->in_left->in_parent = node; + if (old->in_right) + old->in_right->in_parent = node; + update_maxhigh(child ? 
: parent, node->in_max_high); + update_maxhigh(node, old->in_max_high); + if (parent == old) + parent = node; + goto color; + } + parent = node->in_parent; + color = node->in_color; + + if (child) + child->in_parent = parent; + if (parent) { + if (node_is_left_child(node)) + parent->in_left = child; + else + parent->in_right = child; + } else { + *root = child; + } + + update_maxhigh(child ? : parent, node->in_max_high); + +color: + if (color == INTERVAL_BLACK) + interval_erase_color(child, parent, root); + EXIT; +} +EXPORT_SYMBOL(interval_erase); + +static inline int interval_may_overlap(struct interval_node *node, + struct interval_node_extent *ext) +{ + return (ext->start <= node->in_max_high && + ext->end >= interval_low(node)); +} + +/* + * This function finds all intervals that overlap interval ext, + * and calls func to handle resulted intervals one by one. + * in lustre, this function will find all conflicting locks in + * the granted queue and add these locks to the ast work list. + * + * { + * if (node == NULL) + * return 0; + * if (ext->end < interval_low(node)) { + * interval_search(node->in_left, ext, func, data); + * } else if (interval_may_overlap(node, ext)) { + * if (extent_overlapped(ext, &node->in_extent)) + * func(node, data); + * interval_search(node->in_left, ext, func, data); + * interval_search(node->in_right, ext, func, data); + * } + * return 0; + * } + * + */ +enum interval_iter interval_search(struct interval_node *node, + struct interval_node_extent *ext, + interval_callback_t func, + void *data) +{ + struct interval_node *parent; + enum interval_iter rc = INTERVAL_ITER_CONT; + + ENTRY; + + LASSERT(ext != NULL); + LASSERT(func != NULL); + + while (node) { + if (ext->end < interval_low(node)) { + if (node->in_left) { + node = node->in_left; + continue; + } + } else if (interval_may_overlap(node, ext)) { + if (extent_overlapped(ext, &node->in_extent)) { + rc = func(node, data); + if (rc == INTERVAL_ITER_STOP) + break; + } + + if (node->in_left) { + node = node->in_left; + continue; + } + if (node->in_right) { + node = node->in_right; + continue; + } + } + + parent = node->in_parent; + while (parent) { + if (node_is_left_child(node) && + parent->in_right) { + /* If we ever got the left, it means that the + * parent met ext->endin_right; + break; + } + node = parent; + parent = parent->in_parent; + } + if (parent == NULL || !interval_may_overlap(parent, ext)) + break; + } + + RETURN(rc); +} +EXPORT_SYMBOL(interval_search); + +static enum interval_iter interval_overlap_cb(struct interval_node *n, + void *args) +{ + *(int *)args = 1; + return INTERVAL_ITER_STOP; +} + +int interval_is_overlapped(struct interval_node *root, + struct interval_node_extent *ext) +{ + int has = 0; + (void)interval_search(root, ext, interval_overlap_cb, &has); + return has; +} +EXPORT_SYMBOL(interval_is_overlapped); + +/* Don't expand to low. Expanding downwards is expensive, and meaningless to + * some extents, because programs seldom do IO backward. 
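To make the intended usage of interval_insert()/interval_search() concrete, a hypothetical caller in the spirit of the ldlm conflict scan described in the comment above; it assumes new_node->in_extent was already initialized by the caller.

static enum interval_iter conflict_cb(struct interval_node *node, void *data)
{
        (*(int *)data)++;               /* count each overlapping extent */
        return INTERVAL_ITER_CONT;
}

static int demo_insert_and_scan(struct interval_node **root,
                                struct interval_node *new_node,
                                struct interval_node_extent *ext)
{
        int conflicts = 0;

        /* returns the existing node if an equal [start, end] is in-tree */
        if (interval_insert(new_node, root) != NULL)
                return -EEXIST;

        /* invoke conflict_cb for every granted extent overlapping @ext */
        interval_search(*root, ext, conflict_cb, &conflicts);
        return conflicts;
}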
+ * + * The recursive algorithm of expanding low: + * expand_low { + * struct interval_node *tmp; + * static __u64 res = 0; + * + * if (root == NULL) + * return res; + * if (root->in_max_high < low) { + * res = max(root->in_max_high + 1, res); + * return res; + * } else if (low < interval_low(root)) { + * interval_expand_low(root->in_left, low); + * return res; + * } + * + * if (interval_high(root) < low) + * res = max(interval_high(root) + 1, res); + * interval_expand_low(root->in_left, low); + * interval_expand_low(root->in_right, low); + * + * return res; + * } + * + * It's much easy to eliminate the recursion, see interval_search for + * an example. -jay + */ +static inline __u64 interval_expand_low(struct interval_node *root, __u64 low) +{ + /* we only concern the empty tree right now. */ + if (root == NULL) + return 0; + return low; +} + +static inline __u64 interval_expand_high(struct interval_node *node, __u64 high) +{ + __u64 result = ~0; + + while (node != NULL) { + if (node->in_max_high < high) + break; + + if (interval_low(node) > high) { + result = interval_low(node) - 1; + node = node->in_left; + } else { + node = node->in_right; + } + } + + return result; +} + +/* expanding the extent based on @ext. */ +void interval_expand(struct interval_node *root, + struct interval_node_extent *ext, + struct interval_node_extent *limiter) +{ + /* The assertion of interval_is_overlapped is expensive because we may + * travel many nodes to find the overlapped node. + */ + LASSERT(interval_is_overlapped(root, ext) == 0); + if (!limiter || limiter->start < ext->start) + ext->start = interval_expand_low(root, ext->start); + if (!limiter || limiter->end > ext->end) + ext->end = interval_expand_high(root, ext->end); + LASSERT(interval_is_overlapped(root, ext) == 0); +} +EXPORT_SYMBOL(interval_expand); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/jobid.c b/drivers/staging/lustrefsx/lustre/obdclass/jobid.c new file mode 100644 index 0000000000000..207a88bcae3c7 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/jobid.c @@ -0,0 +1,932 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + * + * Copyright 2017 Cray Inc, all rights reserved. + * Author: Ben Evans. 
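A worked example of interval_expand() above, under the assumption of a tree holding the single extent [200, 299]:

/* For ext = {120, 150} and no limiter:
 *   - interval_expand_low() leaves ext->start at 120, since the tree is
 *     not empty and downward expansion is deliberately skipped;
 *   - interval_expand_high() finds [200, 299] above and returns 199.
 * The call below therefore grows ext to {120, 199}, the widest upward
 * extension that still overlaps nothing.
 */
struct interval_node_extent ext = { .start = 120, .end = 150 };

interval_expand(root, &ext, NULL);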
+ * + * Store PID->JobID mappings + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include +#include +#include + +#include +#include +#include +#include + +static struct cfs_hash *jobid_hash; +static struct cfs_hash_ops jobid_hash_ops; +spinlock_t jobid_hash_lock; + +#define RESCAN_INTERVAL 30 +#define DELETE_INTERVAL 300 + +char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = JOBSTATS_DISABLE; +char obd_jobid_name[LUSTRE_JOBID_SIZE] = "%e.%u"; + +/** + * Structure to store a single PID->JobID mapping + */ +struct jobid_pid_map { + struct hlist_node jp_hash; + time64_t jp_time; + spinlock_t jp_lock; /* protects jp_jobid */ + char jp_jobid[LUSTRE_JOBID_SIZE]; + unsigned int jp_joblen; + atomic_t jp_refcount; + pid_t jp_pid; +}; + +/* + * Jobid can be set for a session (see setsid(2)) by writing to + * a sysfs file from any process in that session. + * The jobids are stored in a hash table indexed by the relevant + * struct pid. We periodically look for entries where the pid has + * no PIDTYPE_SID tasks any more, and prune them. This happens within + * 5 seconds of a jobid being added, and every 5 minutes when jobids exist, + * but none are added. + */ +#define JOBID_EXPEDITED_CLEAN (5) +#define JOBID_BACKGROUND_CLEAN (5 * 60) + +struct session_jobid { + struct pid *sj_session; + struct rhash_head sj_linkage; + struct rcu_head sj_rcu; + char sj_jobid[1]; +}; + +static const struct rhashtable_params jobid_params = { + .key_len = sizeof(struct pid *), + .key_offset = offsetof(struct session_jobid, sj_session), + .head_offset = offsetof(struct session_jobid, sj_linkage), +}; + +static struct rhashtable session_jobids; + +/* + * jobid_current must be called with rcu_read_lock held. + * if it returns non-NULL, the string can only be used + * until rcu_read_unlock is called. + */ +char *jobid_current(void) +{ + struct pid *sid = task_session(current); + struct session_jobid *sj; + + sj = rhashtable_lookup_fast(&session_jobids, &sid, jobid_params); + if (sj) + return sj->sj_jobid; + return NULL; +} + +static void jobid_prune_expedite(void); +/* + * jobid_set_current will try to add a new entry + * to the table. 
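A minimal sketch of the locking contract stated above jobid_current(): the returned string is only stable inside the RCU read section, so it has to be copied out before unlocking. The function and buffer names are illustrative.

static int demo_read_session_jobid(char *buf, size_t buflen)
{
        char *jid;
        int rc = -ENOENT;

        rcu_read_lock();
        jid = jobid_current();  /* valid only until rcu_read_unlock() */
        if (jid) {
                strlcpy(buf, jid, buflen);
                rc = 0;
        }
        rcu_read_unlock();

        return rc;
}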
If one exists with the same key, the + * jobid will be replaced + */ +int jobid_set_current(char *jobid) +{ + struct pid *sid; + struct session_jobid *sj, *origsj; + int ret; + int len = strlen(jobid); + + sj = kmalloc(sizeof(*sj) + len, GFP_KERNEL); + if (!sj) + return -ENOMEM; + rcu_read_lock(); + sid = task_session(current); + sj->sj_session = get_pid(sid); + strncpy(sj->sj_jobid, jobid, len+1); + origsj = rhashtable_lookup_get_insert_fast(&session_jobids, + &sj->sj_linkage, + jobid_params); + if (origsj == NULL) { + /* successful insert */ + rcu_read_unlock(); + jobid_prune_expedite(); + return 0; + } + + if (IS_ERR(origsj)) { + put_pid(sj->sj_session); + kfree(sj); + rcu_read_unlock(); + return PTR_ERR(origsj); + } + ret = rhashtable_replace_fast(&session_jobids, + &origsj->sj_linkage, + &sj->sj_linkage, + jobid_params); + if (ret) { + put_pid(sj->sj_session); + kfree(sj); + rcu_read_unlock(); + return ret; + } + put_pid(origsj->sj_session); + rcu_read_unlock(); + kfree_rcu(origsj, sj_rcu); + jobid_prune_expedite(); + + return 0; +} + +static void jobid_free(void *vsj, void *arg) +{ + struct session_jobid *sj = vsj; + + put_pid(sj->sj_session); + kfree(sj); +} + +static void jobid_prune(struct work_struct *work); +static DECLARE_DELAYED_WORK(jobid_prune_work, jobid_prune); +static int jobid_prune_expedited; +static void jobid_prune(struct work_struct *work) +{ + int remaining = 0; + struct rhashtable_iter iter; + struct session_jobid *sj; + + jobid_prune_expedited = 0; + rhashtable_walk_enter(&session_jobids, &iter); + rhashtable_walk_start(&iter); + while ((sj = rhashtable_walk_next(&iter)) != NULL) { + if (IS_ERR(sj)) { + if (PTR_ERR(sj) == -EAGAIN) + continue; + break; + } + if (!hlist_empty(&sj->sj_session->tasks[PIDTYPE_SID])) { + remaining++; + continue; + } + if (rhashtable_remove_fast(&session_jobids, + &sj->sj_linkage, + jobid_params) == 0) { + put_pid(sj->sj_session); + kfree_rcu(sj, sj_rcu); + } + } + rhashtable_walk_stop(&iter); + rhashtable_walk_exit(&iter); + if (remaining) + schedule_delayed_work(&jobid_prune_work, + cfs_time_seconds(JOBID_BACKGROUND_CLEAN)); +} + +static void jobid_prune_expedite(void) +{ + if (!jobid_prune_expedited) { + jobid_prune_expedited = 1; + mod_delayed_work(system_wq, &jobid_prune_work, + cfs_time_seconds(JOBID_EXPEDITED_CLEAN)); + } +} + +static int cfs_access_process_vm(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long addr, + void *buf, int len, int write) +{ + /* Just copied from kernel for the kernels which doesn't + * have access_process_vm() exported + */ + struct vm_area_struct *vma; + struct page *page; + void *old_buf = buf; + + /* Avoid deadlocks on mmap_sem if called from sys_mmap_pgoff(), + * which is already holding mmap_sem for writes. If some other + * thread gets the write lock in the meantime, this thread will + * block, but at least it won't deadlock on itself. LU-1735 + */ + if (!mmap_read_trylock(mm)) + return -EDEADLK; + + /* ignore errors, just check how much was successfully transferred */ + while (len) { + int bytes, rc, offset; + void *maddr; + +#if defined(HAVE_GET_USER_PAGES_GUP_FLAGS) + rc = get_user_pages(addr, 1, write ? 
FOLL_WRITE : 0, &page, + &vma); +#elif defined(HAVE_GET_USER_PAGES_6ARG) + rc = get_user_pages(addr, 1, write, 1, &page, &vma); +#else + rc = get_user_pages(tsk, mm, addr, 1, write, 1, &page, &vma); +#endif + if (rc <= 0) + break; + + bytes = len; + offset = addr & (PAGE_SIZE-1); + if (bytes > PAGE_SIZE-offset) + bytes = PAGE_SIZE-offset; + + maddr = kmap(page); + if (write) { + copy_to_user_page(vma, page, addr, + maddr + offset, buf, bytes); + set_page_dirty_lock(page); + } else { + copy_from_user_page(vma, page, addr, + buf, maddr + offset, bytes); + } + kunmap(page); + put_page(page); + len -= bytes; + buf += bytes; + addr += bytes; + } + mmap_read_unlock(mm); + + return buf - old_buf; +} + +/* Read the environment variable of current process specified by @key. */ +static int cfs_get_environ(const char *key, char *value, int *val_len) +{ + struct mm_struct *mm; + char *buffer; + int buf_len = PAGE_SIZE; + int key_len = strlen(key); + unsigned long addr; + int rc; + bool skip = false; + + ENTRY; + buffer = kmalloc(buf_len, GFP_USER); + if (!buffer) + RETURN(-ENOMEM); + + mm = get_task_mm(current); + if (!mm) { + kfree(buffer); + RETURN(-EINVAL); + } + + addr = mm->env_start; + while (addr < mm->env_end) { + int this_len, retval, scan_len; + char *env_start, *env_end; + + memset(buffer, 0, buf_len); + + this_len = min_t(int, mm->env_end - addr, buf_len); + retval = cfs_access_process_vm(current, mm, addr, buffer, + this_len, 0); + if (retval < 0) + GOTO(out, rc = retval); + else if (retval != this_len) + break; + + addr += retval; + + /* Parse the buffer to find out the specified key/value pair. + * The "key=value" entries are separated by '\0'. + */ + env_start = buffer; + scan_len = this_len; + while (scan_len) { + char *entry; + int entry_len; + + env_end = memscan(env_start, '\0', scan_len); + LASSERT(env_end >= env_start && + env_end <= env_start + scan_len); + + /* The last entry of this buffer cross the buffer + * boundary, reread it in next cycle. + */ + if (unlikely(env_end - env_start == scan_len)) { + /* Just skip the entry larger than page size, + * it can't be jobID env variable. + */ + if (unlikely(scan_len == this_len)) + skip = true; + else + addr -= scan_len; + break; + } else if (unlikely(skip)) { + skip = false; + goto skip; + } + entry = env_start; + entry_len = env_end - env_start; + CDEBUG(D_INFO, "key: %s, entry: %s\n", key, entry); + + /* Key length + length of '=' */ + if (entry_len > key_len + 1 && + entry[key_len] == '=' && + !memcmp(entry, key, key_len)) { + entry += key_len + 1; + entry_len -= key_len + 1; + + /* The 'value' buffer passed in is too small. + * Copy what fits, but return -EOVERFLOW. + */ + if (entry_len >= *val_len) { + memcpy(value, entry, *val_len); + value[*val_len - 1] = 0; + GOTO(out, rc = -EOVERFLOW); + } + + memcpy(value, entry, entry_len); + *val_len = entry_len; + GOTO(out, rc = 0); + } +skip: + scan_len -= (env_end - env_start + 1); + env_start = env_end + 1; + } + } + GOTO(out, rc = -ENOENT); + +out: + mmput(mm); + kfree((void *)buffer); + return rc; +} + +/* + * Get jobid of current process by reading the environment variable + * stored in between the "env_start" & "env_end" of task struct. + * + * If some job scheduler doesn't store jobid in the "env_start/end", + * then an upcall could be issued here to get the jobid by utilizing + * the userspace tools/API. Then, the jobid must be cached. 
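For illustration, how the environment scan above would be driven; cfs_get_environ() is static to this file, and "SLURM_JOB_ID" is merely one plausible jobid_var value, not something this patch mandates.

static int demo_fetch_env_jobid(char *jobid, int buflen)
{
        int len = buflen;
        int rc;

        /* walks current->mm between env_start and env_end */
        rc = cfs_get_environ("SLURM_JOB_ID", jobid, &len);
        if (rc == -EOVERFLOW)
                rc = 0;         /* value truncated to fit, NUL terminated */

        return rc ? rc : len;   /* >= 0: length of the value found */
}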
+ */ +int jobid_get_from_environ(char *jobid_var, char *jobid, int *jobid_len) +{ + int rc; + + rc = cfs_get_environ(jobid_var, jobid, jobid_len); + if (!rc) + goto out; + + if (rc == -EOVERFLOW) { + /* For the PBS_JOBID and LOADL_STEP_ID keys (which are + * variable length strings instead of just numbers), it + * might make sense to keep the unique parts for JobID, + * instead of just returning an error. That means a + * larger temp buffer for cfs_get_environ(), then + * truncating the string at some separator to fit into + * the specified jobid_len. Fix later if needed. */ + static ktime_t printed; + + if (unlikely(ktime_to_ns(printed) == 0 || + ktime_after(ktime_get(), + ktime_add_ns(printed, + 3600ULL * 24 * NSEC_PER_SEC)))) { + LCONSOLE_WARN("jobid: '%s' value too large (%d)\n", + obd_jobid_var, *jobid_len); + printed = ktime_get(); + } + + rc = 0; + } else { + CDEBUG_LIMIT((rc == -ENOENT || rc == -EINVAL || + rc == -EDEADLK) ? D_INFO : D_ERROR, + "jobid: get '%s' failed: rc = %d\n", + obd_jobid_var, rc); + } + +out: + return rc; +} + +/* + * jobid_should_free_item + * + * Each item is checked to see if it should be released + * Removed from hash table by caller + * Actually freed in jobid_put_locked + * + * Returns 1 if item is to be freed, 0 if it is to be kept + */ + +static int jobid_should_free_item(void *obj, void *data) +{ + char *jobid = data; + struct jobid_pid_map *pidmap = obj; + int rc = 0; + + if (obj == NULL) + return 0; + + if (jobid == NULL) { + WARN_ON_ONCE(atomic_read(&pidmap->jp_refcount) != 1); + return 1; + } + + spin_lock(&pidmap->jp_lock); + /* prevent newly inserted items from deleting */ + if (jobid[0] == '\0' && atomic_read(&pidmap->jp_refcount) == 1) + rc = 1; + else if (ktime_get_real_seconds() - pidmap->jp_time > DELETE_INTERVAL) + rc = 1; + else if (strcmp(pidmap->jp_jobid, jobid) == 0) + rc = 1; + spin_unlock(&pidmap->jp_lock); + + return rc; +} + +/* + * jobid_name_is_valid + * + * Checks if the jobid is a Lustre process + * + * Returns true if jobid is valid + * Returns false if jobid looks like it's a Lustre process + */ +static bool jobid_name_is_valid(char *jobid) +{ + const char *const lustre_reserved[] = { "ll_ping", "ptlrpc", + "ldlm", "ll_sa", NULL }; + int i; + + if (jobid[0] == '\0') + return false; + + for (i = 0; lustre_reserved[i] != NULL; i++) { + if (strncmp(jobid, lustre_reserved[i], + strlen(lustre_reserved[i])) == 0) + return false; + } + return true; +} + +/* + * jobid_get_from_cache() + * + * Returns contents of jobid_var from process environment for current PID, + * or from the per-session jobid table. + * Values fetch from process environment will be cached for some time to avoid + * the overhead of scanning the environment. 
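A few concrete cases of the reserved-prefix filter above (process names assumed for illustration):

/* jobid_name_is_valid("ll_sa_1234") -> false: "ll_sa" statahead prefix
 * jobid_name_is_valid("ptlrpcd_00") -> false: "ptlrpc" daemon prefix
 * jobid_name_is_valid("bash")       -> true:  ordinary user process
 * jobid_name_is_valid("")           -> false: empty names are rejected
 */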
+ * + * Return: -ENOMEM if allocating a new pidmap fails + * -ENOENT if no entry could be found + * +ve string length for success (something was returned in jobid) + */ +static int jobid_get_from_cache(char *jobid, size_t joblen) +{ + static time64_t last_expire; + bool expire_cache = false; + pid_t pid = current->pid; + struct jobid_pid_map *pidmap = NULL; + time64_t now = ktime_get_real_seconds(); + int rc = 0; + ENTRY; + + if (strcmp(obd_jobid_var, JOBSTATS_SESSION) == 0) { + char *jid; + + rcu_read_lock(); + jid = jobid_current(); + if (jid) { + strlcpy(jobid, jid, joblen); + joblen = strlen(jobid); + } else { + rc = -ENOENT; + } + rcu_read_unlock(); + GOTO(out, rc); + } + + LASSERT(jobid_hash != NULL); + + /* scan hash periodically to remove old PID entries from cache */ + spin_lock(&jobid_hash_lock); + if (unlikely(last_expire + DELETE_INTERVAL <= now)) { + expire_cache = true; + last_expire = now; + } + spin_unlock(&jobid_hash_lock); + + if (expire_cache) + cfs_hash_cond_del(jobid_hash, jobid_should_free_item, + "intentionally_bad_jobid"); + + /* first try to find PID in the hash and use that value */ + pidmap = cfs_hash_lookup(jobid_hash, &pid); + if (pidmap == NULL) { + struct jobid_pid_map *pidmap2; + + OBD_ALLOC_PTR(pidmap); + if (pidmap == NULL) + GOTO(out, rc = -ENOMEM); + + pidmap->jp_pid = pid; + pidmap->jp_time = 0; + pidmap->jp_jobid[0] = '\0'; + spin_lock_init(&pidmap->jp_lock); + INIT_HLIST_NODE(&pidmap->jp_hash); + /* + * @pidmap might be reclaimed just after it is added into + * hash list, init @jp_refcount as 1 to make sure memory + * could be not freed during access. + */ + atomic_set(&pidmap->jp_refcount, 1); + + /* + * Add the newly created map to the hash, on key collision we + * lost a racing addition and must destroy our newly allocated + * map. The object which exists in the hash will be returned. + */ + pidmap2 = cfs_hash_findadd_unique(jobid_hash, &pid, + &pidmap->jp_hash); + if (unlikely(pidmap != pidmap2)) { + CDEBUG(D_INFO, "jobid: duplicate found for PID=%u\n", + pid); + OBD_FREE_PTR(pidmap); + pidmap = pidmap2; + } + } + + /* + * If pidmap is old (this is always true for new entries) refresh it. + * If obd_jobid_var is not found, cache empty entry and try again + * later, to avoid repeat lookups for PID if obd_jobid_var missing. + */ + spin_lock(&pidmap->jp_lock); + if (pidmap->jp_time + RESCAN_INTERVAL <= now) { + char env_jobid[LUSTRE_JOBID_SIZE] = ""; + int env_len = sizeof(env_jobid); + + pidmap->jp_time = now; + + spin_unlock(&pidmap->jp_lock); + rc = jobid_get_from_environ(obd_jobid_var, env_jobid, &env_len); + + CDEBUG(D_INFO, "jobid: PID mapping established: %d->%s\n", + pidmap->jp_pid, env_jobid); + spin_lock(&pidmap->jp_lock); + if (!rc) { + pidmap->jp_joblen = env_len; + strlcpy(pidmap->jp_jobid, env_jobid, + sizeof(pidmap->jp_jobid)); + rc = 0; + } else if (rc == -ENOENT) { + /* It might have been deleted, clear out old entry */ + pidmap->jp_joblen = 0; + pidmap->jp_jobid[0] = '\0'; + } + } + + /* + * Regardless of how pidmap was found, if it contains a valid entry + * use that for now. If there was a technical error (e.g. -ENOMEM) + * use the old cached value until it can be looked up again properly. + * If a cached missing entry was found, return -ENOENT. + */ + if (pidmap->jp_joblen) { + strlcpy(jobid, pidmap->jp_jobid, joblen); + joblen = pidmap->jp_joblen; + rc = 0; + } else if (!rc) { + rc = -ENOENT; + } + spin_unlock(&pidmap->jp_lock); + + cfs_hash_put(jobid_hash, &pidmap->jp_hash); + + EXIT; +out: + return rc < 0 ? 
rc : joblen; +} + +/* + * jobid_interpret_string() + * + * Interpret the jobfmt string to expand specified fields, like coredumps do: + * %e = executable + * %g = gid + * %h = hostname + * %H = short hostname + * %j = jobid from environment + * %p = pid + * %u = uid + * + * Unknown escape strings are dropped. Other characters are copied through, + * excluding whitespace (to avoid making jobid parsing difficult). + * + * Return: -EOVERFLOW if the expanded string does not fit within @joblen + * 0 for success + */ +static int jobid_interpret_string(const char *jobfmt, char *jobid, + ssize_t joblen) +{ + char c; + + while ((c = *jobfmt++) && joblen > 1) { + char f, *p; + int l; + + if (isspace(c)) /* Don't allow embedded spaces */ + continue; + + if (c != '%') { + *jobid = c; + joblen--; + jobid++; + *jobid = '\0'; + continue; + } + + switch ((f = *jobfmt++)) { + case 'e': /* executable name */ + l = snprintf(jobid, joblen, "%s", current->comm); + break; + case 'g': /* group ID */ + l = snprintf(jobid, joblen, "%u", + from_kgid(&init_user_ns, current_fsgid())); + break; + case 'h': /* hostname */ + l = snprintf(jobid, joblen, "%s", + init_utsname()->nodename); + break; + case 'H': /* short hostname. Cut at first dot */ + l = snprintf(jobid, joblen, "%s", + init_utsname()->nodename); + p = strnchr(jobid, joblen, '.'); + if (p) { + *p = '\0'; + l = p - jobid; + } + break; + case 'j': /* jobid stored in process environment */ + l = jobid_get_from_cache(jobid, joblen); + if (l < 0) + l = 0; + break; + case 'p': /* process ID */ + l = snprintf(jobid, joblen, "%u", current->pid); + break; + case 'u': /* user ID */ + l = snprintf(jobid, joblen, "%u", + from_kuid(&init_user_ns, current_fsuid())); + break; + case '\0': /* '%' at end of format string */ + l = 0; + goto out; + default: /* drop unknown %x format strings */ + l = 0; + break; + } + jobid += l; + joblen -= l; + } + /* + * This points at the end of the buffer, so long as jobid is always + * incremented the same amount as joblen is decremented. + */ +out: + jobid[joblen - 1] = '\0'; + + return joblen < 0 ? 
-EOVERFLOW : 0; +} + +/* + * Hash initialization, copied from server-side job stats bucket sizes + */ +#define HASH_JOBID_BKT_BITS 5 +#define HASH_JOBID_CUR_BITS 7 +#define HASH_JOBID_MAX_BITS 12 + +int jobid_cache_init(void) +{ + int rc = 0; + ENTRY; + + if (jobid_hash) + return 0; + + spin_lock_init(&jobid_hash_lock); + jobid_hash = cfs_hash_create("JOBID_HASH", HASH_JOBID_CUR_BITS, + HASH_JOBID_MAX_BITS, HASH_JOBID_BKT_BITS, + 0, CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA, + &jobid_hash_ops, CFS_HASH_DEFAULT); + if (!jobid_hash) { + rc = -ENOMEM; + } else { + rc = rhashtable_init(&session_jobids, &jobid_params); + if (rc) { + cfs_hash_putref(jobid_hash); + jobid_hash = NULL; + } + } + + RETURN(rc); +} +EXPORT_SYMBOL(jobid_cache_init); + +void jobid_cache_fini(void) +{ + struct cfs_hash *tmp_hash; + ENTRY; + + spin_lock(&jobid_hash_lock); + tmp_hash = jobid_hash; + jobid_hash = NULL; + spin_unlock(&jobid_hash_lock); + + cancel_delayed_work_sync(&jobid_prune_work); + + if (tmp_hash != NULL) { + cfs_hash_cond_del(tmp_hash, jobid_should_free_item, NULL); + cfs_hash_putref(tmp_hash); + + rhashtable_free_and_destroy(&session_jobids, jobid_free, NULL); + } + + + EXIT; +} +EXPORT_SYMBOL(jobid_cache_fini); + +/* + * Hash operations for pid<->jobid + */ +static unsigned jobid_hashfn(struct cfs_hash *hs, const void *key, + unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(pid_t), mask); +} + +static void *jobid_key(struct hlist_node *hnode) +{ + struct jobid_pid_map *pidmap; + + pidmap = hlist_entry(hnode, struct jobid_pid_map, jp_hash); + return &pidmap->jp_pid; +} + +static int jobid_keycmp(const void *key, struct hlist_node *hnode) +{ + const pid_t *pid_key1; + const pid_t *pid_key2; + + LASSERT(key != NULL); + pid_key1 = (pid_t *)key; + pid_key2 = (pid_t *)jobid_key(hnode); + + return *pid_key1 == *pid_key2; +} + +static void *jobid_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct jobid_pid_map, jp_hash); +} + +static void jobid_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct jobid_pid_map *pidmap; + + pidmap = hlist_entry(hnode, struct jobid_pid_map, jp_hash); + + atomic_inc(&pidmap->jp_refcount); +} + +static void jobid_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct jobid_pid_map *pidmap; + + if (hnode == NULL) + return; + + pidmap = hlist_entry(hnode, struct jobid_pid_map, jp_hash); + LASSERT(atomic_read(&pidmap->jp_refcount) > 0); + if (atomic_dec_and_test(&pidmap->jp_refcount)) { + CDEBUG(D_INFO, "Freeing: %d->%s\n", + pidmap->jp_pid, pidmap->jp_jobid); + + OBD_FREE_PTR(pidmap); + } +} + +static struct cfs_hash_ops jobid_hash_ops = { + .hs_hash = jobid_hashfn, + .hs_keycmp = jobid_keycmp, + .hs_key = jobid_key, + .hs_object = jobid_object, + .hs_get = jobid_get, + .hs_put = jobid_put_locked, + .hs_put_locked = jobid_put_locked, +}; + +/** + * Generate the job identifier string for this process for tracking purposes. + * + * Fill in @jobid string based on the value of obd_jobid_var: + * JOBSTATS_DISABLE: none + * JOBSTATS_NODELOCAL: content of obd_jobid_name (jobid_interpret_string()) + * JOBSTATS_PROCNAME_UID: process name/UID + * JOBSTATS_SESSION per-session value set by + * /sys/fs/lustre/jobid_this_session + * anything else: look up obd_jobid_var in the processes environment + * + * Return -ve error number, 0 on success. 
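A worked expansion of jobid_interpret_string() (static above), assuming a process with comm "dd", fsuid 1000 and pid 4321:

char jobid[LUSTRE_JOBID_SIZE];

/* "%e.%u"  -> "dd.1000"   (the default obd_jobid_name)
 * "%e-%p"  -> "dd-4321"
 * "%e %u"  -> "dd1000"    (whitespace in the format is dropped)
 * "%e.%q"  -> "dd."       (unknown %q specifier is silently skipped)
 */
int rc = jobid_interpret_string("%e.%u", jobid, sizeof(jobid));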
+ */ +int lustre_get_jobid(char *jobid, size_t joblen) +{ + int rc = 0; + ENTRY; + + if (unlikely(joblen < 2)) { + if (joblen == 1) + jobid[0] = '\0'; + RETURN(-EINVAL); + } + + if (strcmp(obd_jobid_var, JOBSTATS_DISABLE) == 0) { + /* Jobstats isn't enabled */ + memset(jobid, 0, joblen); + } else if (strcmp(obd_jobid_var, JOBSTATS_NODELOCAL) == 0) { + /* Whole node dedicated to single job */ + rc = jobid_interpret_string(obd_jobid_name, jobid, joblen); + } else if (strcmp(obd_jobid_var, JOBSTATS_PROCNAME_UID) == 0) { + rc = jobid_interpret_string("%e.%u", jobid, joblen); + } else if (strcmp(obd_jobid_var, JOBSTATS_SESSION) == 0 || + jobid_name_is_valid(current->comm)) { + /* + * per-process jobid wanted, either from environment or from + * per-session setting. + * If obd_jobid_name contains "%j" or if getting the per-process + * jobid directly fails, fall back to using obd_jobid_name. + */ + rc = -EAGAIN; + if (!strnstr(obd_jobid_name, "%j", joblen)) + rc = jobid_get_from_cache(jobid, joblen); + + /* fall back to jobid_name if jobid_var not available */ + if (rc < 0) { + int rc2 = jobid_interpret_string(obd_jobid_name, + jobid, joblen); + if (!rc2) + rc = 0; + } + } + + RETURN(rc); +} +EXPORT_SYMBOL(lustre_get_jobid); + +/* + * lustre_jobid_clear + * + * Search cache for JobID given by @find_jobid. + * If any entries in the hash table match the value, they are removed + */ +void lustre_jobid_clear(const char *find_jobid) +{ + char jobid[LUSTRE_JOBID_SIZE]; + char *end; + + if (jobid_hash == NULL) + return; + + strlcpy(jobid, find_jobid, sizeof(jobid)); + /* trim \n off the end of the incoming jobid */ + end = strchr(jobid, '\n'); + if (end && *end == '\n') + *end = '\0'; + + CDEBUG(D_INFO, "Clearing Jobid: %s\n", jobid); + cfs_hash_cond_del(jobid_hash, jobid_should_free_item, jobid); + + CDEBUG(D_INFO, "%d items remain in jobID table\n", + atomic_read(&jobid_hash->hs_count)); +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/kernelcomm.c b/drivers/staging/lustrefsx/lustre/obdclass/kernelcomm.c new file mode 100644 index 0000000000000..7afb9484a8a69 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/kernelcomm.c @@ -0,0 +1,262 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2015, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Author: Nathan Rutman + * + * Kernel <-> userspace communication routines. + * Using pipes for all arches. 
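A sketch of the typical consumer of lustre_get_jobid() when filling an outgoing RPC body; the wrapper name is invented, and struct ptlrpc_body is assumed to carry its usual pb_jobid field.

static void demo_pack_jobid(struct ptlrpc_body *b)
{
        char jobid[LUSTRE_JOBID_SIZE] = "";

        /* fills jobid according to obd_jobid_var; all-zero if disabled */
        if (lustre_get_jobid(jobid, sizeof(jobid)) == 0)
                strlcpy(b->pb_jobid, jobid, sizeof(b->pb_jobid));
}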
+ */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include + +#include +#include + +/** + * libcfs_kkuc_msg_put - send an message from kernel to userspace + * @param fp to send the message to + * @param payload Payload data. First field of payload is always + * struct kuc_hdr + */ +int libcfs_kkuc_msg_put(struct file *filp, void *payload) +{ + struct kuc_hdr *kuch = (struct kuc_hdr *)payload; + ssize_t count = kuch->kuc_msglen; + loff_t offset = 0; + int rc = 0; + + if (IS_ERR_OR_NULL(filp)) + return -EBADF; + + if (kuch->kuc_magic != KUC_MAGIC) { + CERROR("KernelComm: bad magic %x\n", kuch->kuc_magic); + return -ENOSYS; + } + + while (count > 0) { + rc = cfs_kernel_write(filp, payload, count, &offset); + if (rc < 0) + break; + count -= rc; + payload += rc; + rc = 0; + } + + if (rc < 0) + CWARN("message send failed (%d)\n", rc); + else + CDEBUG(D_HSM, "Sent message rc=%d, fp=%p\n", rc, filp); + + return rc; +} +EXPORT_SYMBOL(libcfs_kkuc_msg_put); + +/* Broadcast groups are global across all mounted filesystems; + * i.e. registering for a group on 1 fs will get messages for that + * group from any fs */ +/** A single group registration has a uid and a file pointer */ +struct kkuc_reg { + struct list_head kr_chain; + struct obd_uuid kr_uuid; + int kr_uid; + struct file *kr_fp; + char kr_data[0]; +}; + +static struct list_head kkuc_groups[KUC_GRP_MAX + 1]; +/* Protect message sending against remove and adds */ +static DECLARE_RWSEM(kg_sem); + +static inline bool libcfs_kkuc_group_is_valid(int group) +{ + return 0 <= group && group < ARRAY_SIZE(kkuc_groups); +} + +void libcfs_kkuc_init(void) +{ + int group; + + for (group = 0; group < ARRAY_SIZE(kkuc_groups); group++) + INIT_LIST_HEAD(&kkuc_groups[group]); +} + +/** Add a receiver to a broadcast group + * @param filp pipe to write into + * @param uid identifier for this receiver + * @param group group number + * @param data user data + */ +int libcfs_kkuc_group_add(struct file *filp, const struct obd_uuid *uuid, + int uid, int group, void *data, size_t data_len) +{ + struct kkuc_reg *reg; + + if (!libcfs_kkuc_group_is_valid(group)) { + CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group); + return -EINVAL; + } + + /* fput in group_rem */ + if (filp == NULL) + return -EBADF; + + /* freed in group_rem */ + reg = kzalloc(sizeof(*reg) + data_len, 0); + if (reg == NULL) + return -ENOMEM; + + reg->kr_uuid = *uuid; + reg->kr_fp = filp; + reg->kr_uid = uid; + memcpy(reg->kr_data, data, data_len); + + down_write(&kg_sem); + list_add(®->kr_chain, &kkuc_groups[group]); + up_write(&kg_sem); + + CDEBUG(D_HSM, "Added uid=%d fp=%p to group %d\n", uid, filp, group); + + return 0; +} +EXPORT_SYMBOL(libcfs_kkuc_group_add); + +int libcfs_kkuc_group_rem(const struct obd_uuid *uuid, int uid, int group) +{ + struct kkuc_reg *reg, *next; + ENTRY; + + if (!libcfs_kkuc_group_is_valid(group)) { + CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group); + return -EINVAL; + } + + if (uid == 0) { + /* Broadcast a shutdown message */ + struct kuc_hdr lh; + + lh.kuc_magic = KUC_MAGIC; + lh.kuc_transport = KUC_TRANSPORT_GENERIC; + lh.kuc_msgtype = KUC_MSG_SHUTDOWN; + lh.kuc_msglen = sizeof(lh); + libcfs_kkuc_group_put(uuid, group, &lh); + } + + down_write(&kg_sem); + list_for_each_entry_safe(reg, next, &kkuc_groups[group], kr_chain) { + if (obd_uuid_equals(uuid, ®->kr_uuid) && + (uid == 0 || uid == reg->kr_uid)) { + list_del(®->kr_chain); + CDEBUG(D_HSM, "Removed uid=%d fp=%p from group %d\n", + reg->kr_uid, reg->kr_fp, group); + if (reg->kr_fp != NULL) + fput(reg->kr_fp); + 
kfree(reg); + } + } + up_write(&kg_sem); + + RETURN(0); +} +EXPORT_SYMBOL(libcfs_kkuc_group_rem); + +int libcfs_kkuc_group_put(const struct obd_uuid *uuid, int group, void *payload) +{ + struct kkuc_reg *reg; + int rc = 0; + int one_success = 0; + ENTRY; + + if (!libcfs_kkuc_group_is_valid(group)) { + CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group); + return -EINVAL; + } + + down_write(&kg_sem); + + if (unlikely(list_empty(&kkuc_groups[group])) || + unlikely(OBD_FAIL_CHECK(OBD_FAIL_MDS_HSM_CT_REGISTER_NET))) { + /* no agent have fully registered, CDT will retry */ + up_write(&kg_sem); + RETURN(-EAGAIN); + } + + list_for_each_entry(reg, &kkuc_groups[group], kr_chain) { + if (obd_uuid_equals(uuid, ®->kr_uuid) && + reg->kr_fp != NULL) { + rc = libcfs_kkuc_msg_put(reg->kr_fp, payload); + if (rc == 0) + one_success = 1; + else if (rc == -EPIPE) { + fput(reg->kr_fp); + reg->kr_fp = NULL; + } + } + } + up_write(&kg_sem); + + /* don't return an error if the message has been delivered + * at least to one agent */ + if (one_success) + rc = 0; + + RETURN(rc); +} +EXPORT_SYMBOL(libcfs_kkuc_group_put); + +/** + * Calls a callback function for each link of the given kuc group. + * @param group the group to call the function on. + * @param cb_func the function to be called. + * @param cb_arg extra argument to be passed to the callback function. + */ +int libcfs_kkuc_group_foreach(const struct obd_uuid *uuid, int group, + libcfs_kkuc_cb_t cb_func, void *cb_arg) +{ + struct kkuc_reg *reg; + int rc = 0; + ENTRY; + + if (!libcfs_kkuc_group_is_valid(group)) { + CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group); + RETURN(-EINVAL); + } + + down_read(&kg_sem); + list_for_each_entry(reg, &kkuc_groups[group], kr_chain) { + if (obd_uuid_equals(uuid, ®->kr_uuid) && reg->kr_fp != NULL) + rc = cb_func(reg->kr_data, cb_arg); + } + up_read(&kg_sem); + + RETURN(rc); +} +EXPORT_SYMBOL(libcfs_kkuc_group_foreach); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/linkea.c b/drivers/staging/lustrefsx/lustre/obdclass/linkea.c new file mode 100644 index 0000000000000..2ea560fdc125d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/linkea.c @@ -0,0 +1,330 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, 2017, Intel Corporation. + * Use is subject to license terms. 
+ * + * Author: Di Wang + */ + +#include +#include + +int linkea_data_new(struct linkea_data *ldata, struct lu_buf *buf) +{ + ldata->ld_buf = lu_buf_check_and_alloc(buf, PAGE_SIZE); + if (ldata->ld_buf->lb_buf == NULL) + return -ENOMEM; + ldata->ld_leh = ldata->ld_buf->lb_buf; + ldata->ld_leh->leh_magic = LINK_EA_MAGIC; + ldata->ld_leh->leh_reccount = 0; + ldata->ld_leh->leh_len = sizeof(struct link_ea_header); + ldata->ld_leh->leh_overflow_time = 0; + ldata->ld_leh->leh_padding = 0; + return 0; +} +EXPORT_SYMBOL(linkea_data_new); + +int linkea_init(struct linkea_data *ldata) +{ + struct link_ea_header *leh; + + LASSERT(ldata->ld_buf != NULL); + leh = ldata->ld_buf->lb_buf; + if (leh->leh_magic == __swab32(LINK_EA_MAGIC)) { + leh->leh_magic = LINK_EA_MAGIC; + leh->leh_reccount = __swab32(leh->leh_reccount); + leh->leh_len = __swab64(leh->leh_len); + leh->leh_overflow_time = __swab32(leh->leh_overflow_time); + leh->leh_padding = __swab32(leh->leh_padding); + /* individual entries are swabbed by linkea_entry_unpack() */ + } + + if (leh->leh_magic != LINK_EA_MAGIC) + return -EINVAL; + + if (leh->leh_reccount == 0 && leh->leh_overflow_time == 0) + return -ENODATA; + + ldata->ld_leh = leh; + return 0; +} +EXPORT_SYMBOL(linkea_init); + +int linkea_init_with_rec(struct linkea_data *ldata) +{ + int rc; + + rc = linkea_init(ldata); + if (!rc && ldata->ld_leh->leh_reccount == 0) + rc = -ENODATA; + + return rc; +} +EXPORT_SYMBOL(linkea_init_with_rec); + +/** + * Pack a link_ea_entry. + * All elements are stored as chars to avoid alignment issues. + * Numbers are always big-endian + * \retval record length + */ +int linkea_entry_pack(struct link_ea_entry *lee, const struct lu_name *lname, + const struct lu_fid *pfid) +{ + struct lu_fid tmpfid; + int reclen; + + tmpfid = *pfid; + if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_MUL_REF)) + tmpfid.f_oid--; + if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LINKEA_CRASH)) + tmpfid.f_ver = ~0; + fid_cpu_to_be(&tmpfid, &tmpfid); + memcpy(&lee->lee_parent_fid, &tmpfid, sizeof(tmpfid)); + memcpy(lee->lee_name, lname->ln_name, lname->ln_namelen); + reclen = sizeof(struct link_ea_entry) + lname->ln_namelen; + + lee->lee_reclen[0] = (reclen >> 8) & 0xff; + lee->lee_reclen[1] = reclen & 0xff; + return reclen; +} +EXPORT_SYMBOL(linkea_entry_pack); + +void linkea_entry_unpack(const struct link_ea_entry *lee, int *reclen, + struct lu_name *lname, struct lu_fid *pfid) +{ + LASSERT(lee != NULL); + + *reclen = (lee->lee_reclen[0] << 8) | lee->lee_reclen[1]; + memcpy(pfid, &lee->lee_parent_fid, sizeof(*pfid)); + fid_be_to_cpu(pfid, pfid); + if (lname != NULL) { + lname->ln_name = lee->lee_name; + lname->ln_namelen = *reclen - sizeof(struct link_ea_entry); + } +} +EXPORT_SYMBOL(linkea_entry_unpack); + +bool linkea_will_overflow(struct linkea_data *ldata, + const struct lu_name *lname) +{ + struct link_ea_header *leh = ldata->ld_leh; + int reclen = lname->ln_namelen + sizeof(struct link_ea_entry); + + if (unlikely(leh->leh_len + reclen > MAX_LINKEA_SIZE)) + return true; + return false; +} +EXPORT_SYMBOL(linkea_will_overflow); + +/** + * Add a record to the end of link ea buf + **/ +int linkea_add_buf(struct linkea_data *ldata, const struct lu_name *lname, + const struct lu_fid *pfid, bool err_on_overflow) +{ + struct link_ea_header *leh = ldata->ld_leh; + int reclen; + + LASSERT(leh != NULL); + + if (lname == NULL || pfid == NULL) + return -EINVAL; + + reclen = lname->ln_namelen + sizeof(struct link_ea_entry); + if (unlikely(leh->leh_len + reclen > MAX_LINKEA_SIZE)) { + /* Use 32-bits to save the 
overflow time. Although this truncates the 64-bit
+		 * value returned by ktime_get_real_seconds() to 32 bits, such
+		 * a timestamp still covers about 140 years, which is enough.
+		 */
+		leh->leh_overflow_time = ktime_get_real_seconds();
+		if (unlikely(leh->leh_overflow_time == 0))
+			leh->leh_overflow_time++;
+
+		CDEBUG(D_INODE, "Not enough space to hold linkea entry '"
+		       DFID": %.*s' at %u\n", PFID(pfid), lname->ln_namelen,
+		       lname->ln_name, leh->leh_overflow_time);
+		return err_on_overflow ? -EOVERFLOW : 0;
+	}
+
+	if (leh->leh_len + reclen > ldata->ld_buf->lb_len) {
+		if (lu_buf_check_and_grow(ldata->ld_buf,
+					  leh->leh_len + reclen) < 0)
+			return -ENOMEM;
+
+		leh = ldata->ld_leh = ldata->ld_buf->lb_buf;
+	}
+
+	ldata->ld_lee = ldata->ld_buf->lb_buf + leh->leh_len;
+	ldata->ld_reclen = linkea_entry_pack(ldata->ld_lee, lname, pfid);
+	leh->leh_len += ldata->ld_reclen;
+	leh->leh_reccount++;
+	if (err_on_overflow)
+		CDEBUG(D_INODE,
+		       "New link_ea name '"DFID":<%d bytes>' is added\n",
+		       PFID(pfid), lname->ln_namelen);
+	else
+		CDEBUG(D_INODE, "New link_ea name '"DFID":%.*s' is added\n",
+		       PFID(pfid), lname->ln_namelen, lname->ln_name);
+	return 0;
+}
+EXPORT_SYMBOL(linkea_add_buf);
+
+/** Delete the current record from the link ea buf */
+void linkea_del_buf(struct linkea_data *ldata, const struct lu_name *lname,
+		    bool is_encrypted)
+{
+	LASSERT(ldata->ld_leh != NULL && ldata->ld_lee != NULL);
+	LASSERT(ldata->ld_leh->leh_reccount > 0);
+
+	ldata->ld_leh->leh_reccount--;
+	ldata->ld_leh->leh_len -= ldata->ld_reclen;
+	memmove(ldata->ld_lee, (char *)ldata->ld_lee + ldata->ld_reclen,
+		(char *)ldata->ld_leh + ldata->ld_leh->leh_len -
+		(char *)ldata->ld_lee);
+	if (is_encrypted)
+		CDEBUG(D_INODE,
+		       "Old link_ea name '<%d bytes>' is removed\n",
+		       lname->ln_namelen);
+	else
+		CDEBUG(D_INODE, "Old link_ea name '%.*s' is removed\n",
+		       lname->ln_namelen, lname->ln_name);
+
+	if ((char *)ldata->ld_lee >= ((char *)ldata->ld_leh +
+				      ldata->ld_leh->leh_len))
+		ldata->ld_lee = NULL;
+}
+EXPORT_SYMBOL(linkea_del_buf);
+
+int linkea_links_new(struct linkea_data *ldata, struct lu_buf *buf,
+		     const struct lu_name *cname, const struct lu_fid *pfid)
+{
+	int rc;
+
+	rc = linkea_data_new(ldata, buf);
+	if (!rc)
+		rc = linkea_add_buf(ldata, cname, pfid, false);
+
+	return rc;
+}
+EXPORT_SYMBOL(linkea_links_new);
+
+/**
+ * Mark the linkEA as overflowed with the current timestamp,
+ * and remove the last linkEA entry.
+ *
+ * Return the new linkEA size.
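+ *
+ * A hypothetical caller sketch (the xattr update step lives outside
+ * this file and is only assumed here):
+ *
+ *	size = linkea_overflow_shrink(&ldata);
+ *	if (size > 0)
+ *		(write back the first "size" bytes of
+ *		 ldata.ld_buf->lb_buf into the link xattr)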
+ */ +int linkea_overflow_shrink(struct linkea_data *ldata) +{ + struct link_ea_header *leh; + struct lu_name tname; + struct lu_fid tfid; + int count; + + leh = ldata->ld_leh = ldata->ld_buf->lb_buf; + if (leh->leh_magic == __swab32(LINK_EA_MAGIC)) { + leh->leh_magic = LINK_EA_MAGIC; + leh->leh_reccount = __swab32(leh->leh_reccount); + leh->leh_overflow_time = __swab32(leh->leh_overflow_time); + leh->leh_padding = __swab32(leh->leh_padding); + } + + LASSERT(leh->leh_reccount > 0); + + leh->leh_len = sizeof(struct link_ea_header); + leh->leh_reccount--; + if (unlikely(leh->leh_reccount == 0)) + return 0; + + leh->leh_overflow_time = ktime_get_real_seconds(); + if (unlikely(leh->leh_overflow_time == 0)) + leh->leh_overflow_time++; + ldata->ld_reclen = 0; + ldata->ld_lee = (struct link_ea_entry *)(leh + 1); + for (count = 0; count < leh->leh_reccount; count++) { + linkea_entry_unpack(ldata->ld_lee, &ldata->ld_reclen, + &tname, &tfid); + leh->leh_len += ldata->ld_reclen; + ldata->ld_lee = (struct link_ea_entry *)((char *)ldata->ld_lee + + ldata->ld_reclen); + } + + linkea_entry_unpack(ldata->ld_lee, &ldata->ld_reclen, &tname, &tfid); + CDEBUG(D_INODE, "No enough space to hold the last linkea entry '" + DFID": %.*s', shrink it, left %d linkea entries, size %llu\n", + PFID(&tfid), tname.ln_namelen, tname.ln_name, + leh->leh_reccount, leh->leh_len); + + return leh->leh_len; +} +EXPORT_SYMBOL(linkea_overflow_shrink); + +/** + * Check if such a link exists in linkEA. + * + * \param ldata link data the search to be done on + * \param lname name in the parent's directory entry pointing to this object + * \param pfid parent fid the link to be found for + * + * \retval 0 success + * \retval -ENOENT link does not exist + * \retval -ve on error + */ +int linkea_links_find(struct linkea_data *ldata, const struct lu_name *lname, + const struct lu_fid *pfid) +{ + struct lu_name tmpname; + struct lu_fid tmpfid; + int count; + + LASSERT(ldata->ld_leh != NULL); + + /* link #0, if leh_reccount == 0 we skip the loop and return -ENOENT */ + if (likely(ldata->ld_leh->leh_reccount > 0)) + ldata->ld_lee = (struct link_ea_entry *)(ldata->ld_leh + 1); + + for (count = 0; count < ldata->ld_leh->leh_reccount; count++) { + linkea_entry_unpack(ldata->ld_lee, &ldata->ld_reclen, + &tmpname, &tmpfid); + if (tmpname.ln_namelen == lname->ln_namelen && + lu_fid_eq(&tmpfid, pfid) && + (strncmp(tmpname.ln_name, lname->ln_name, + tmpname.ln_namelen) == 0)) + break; + ldata->ld_lee = (struct link_ea_entry *)((char *)ldata->ld_lee + + ldata->ld_reclen); + } + + if (count == ldata->ld_leh->leh_reccount) { + CDEBUG(D_INODE, "Old link_ea name '%.*s' not found\n", + lname->ln_namelen, lname->ln_name); + ldata->ld_lee = NULL; + ldata->ld_reclen = 0; + return -ENOENT; + } + return 0; +} +EXPORT_SYMBOL(linkea_links_find); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog.c b/drivers/staging/lustrefsx/lustre/obdclass/llog.c new file mode 100644 index 0000000000000..2c45c9673ae84 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog.c @@ -0,0 +1,1539 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/llog.c + * + * OST<->MDS recovery logging infrastructure. + * Invariants in implementation: + * - we do not share logs among different OST<->MDS connections, so that + * if an OST or MDS fails it need only look at log(s) relevant to itself + * + * Author: Andreas Dilger + * Author: Alex Zhuravlev + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_LOG + +#include +#include +#include +#include +#include +#include +#include +#include "llog_internal.h" + +/* + * Allocate a new log or catalog handle + * Used inside llog_open(). + */ +static struct llog_handle *llog_alloc_handle(void) +{ + struct llog_handle *loghandle; + + OBD_ALLOC_PTR(loghandle); + if (loghandle == NULL) + return NULL; + + init_rwsem(&loghandle->lgh_lock); + mutex_init(&loghandle->lgh_hdr_mutex); + init_rwsem(&loghandle->lgh_last_sem); + INIT_LIST_HEAD(&loghandle->u.phd.phd_entry); + refcount_set(&loghandle->lgh_refcount, 1); + + return loghandle; +} + +/* + * Free llog handle and header data if exists. 
Used in llog_close() only + */ +static void llog_free_handle(struct llog_handle *loghandle) +{ + LASSERT(loghandle != NULL); + + /* failed llog_init_handle */ + if (loghandle->lgh_hdr == NULL) + goto out; + + if (loghandle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) + LASSERT(list_empty(&loghandle->u.phd.phd_entry)); + else if (loghandle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) + LASSERT(list_empty(&loghandle->u.chd.chd_head)); + OBD_FREE_LARGE(loghandle->lgh_hdr, loghandle->lgh_hdr_size); +out: + OBD_FREE_PTR(loghandle); +} + +struct llog_handle *llog_handle_get(struct llog_handle *loghandle) +{ + if (refcount_inc_not_zero(&loghandle->lgh_refcount)) + return loghandle; + return NULL; +} + +int llog_handle_put(const struct lu_env *env, struct llog_handle *loghandle) +{ + int rc = 0; + + if (refcount_dec_and_test(&loghandle->lgh_refcount)) { + const struct llog_operations *lop; + + rc = llog_handle2ops(loghandle, &lop); + if (!rc) { + if (lop->lop_close) + rc = lop->lop_close(env, loghandle); + else + rc = -EOPNOTSUPP; + } + llog_free_handle(loghandle); + } + return rc; +} + +static int llog_declare_destroy(const struct lu_env *env, + struct llog_handle *handle, + struct thandle *th) +{ + const struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(handle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_declare_destroy == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_declare_destroy(env, handle, th); + + RETURN(rc); +} + +int llog_trans_destroy(const struct lu_env *env, struct llog_handle *handle, + struct thandle *th) +{ + const struct llog_operations *lop; + int rc; + ENTRY; + + rc = llog_handle2ops(handle, &lop); + if (rc < 0) + RETURN(rc); + if (lop->lop_destroy == NULL) + RETURN(-EOPNOTSUPP); + + LASSERT(handle->lgh_obj != NULL); + if (!llog_exist(handle)) + RETURN(0); + + rc = lop->lop_destroy(env, handle, th); + + RETURN(rc); +} + +int llog_destroy(const struct lu_env *env, struct llog_handle *handle) +{ + const struct llog_operations *lop; + struct dt_device *dt; + struct thandle *th; + int rc; + + ENTRY; + + rc = llog_handle2ops(handle, &lop); + if (rc < 0) + RETURN(rc); + if (lop->lop_destroy == NULL) + RETURN(-EOPNOTSUPP); + + if (handle->lgh_obj == NULL) { + /* if lgh_obj == NULL, then it is from client side destroy */ + rc = lop->lop_destroy(env, handle, NULL); + RETURN(rc); + } + + if (!llog_exist(handle)) + RETURN(0); + + dt = lu2dt_dev(handle->lgh_obj->do_lu.lo_dev); + + if (unlikely(unlikely(dt->dd_rdonly))) + RETURN(-EROFS); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = llog_declare_destroy(env, handle, th); + if (rc != 0) + GOTO(out_trans, rc); + + rc = dt_trans_start_local(env, dt, th); + if (rc < 0) + GOTO(out_trans, rc); + + rc = lop->lop_destroy(env, handle, th); + +out_trans: + dt_trans_stop(env, dt, th); + + RETURN(rc); +} +EXPORT_SYMBOL(llog_destroy); + +/* returns negative on error; 0 if success; 1 if success & log destroyed */ +int llog_cancel_arr_rec(const struct lu_env *env, struct llog_handle *loghandle, + int num, int *index) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_device *dt; + struct llog_log_hdr *llh; + struct thandle *th; + __u32 tmp_lgc_index; + int rc, i = 0; + int rc1; + bool subtract_count = false; + + ENTRY; + + LASSERT(loghandle != NULL); + LASSERT(loghandle->lgh_ctxt != NULL); + LASSERT(loghandle->lgh_obj != NULL); + + llh = loghandle->lgh_hdr; + + CDEBUG(D_RPCTRACE, "Canceling %d records, first %d in log "DFID"\n", + num, index[0], PFID(&loghandle->lgh_id.lgl_oi.oi_fid)); + + 
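+	/*
+	 * The whole cancel is done under a single local transaction: the
+	 * header rewrite (bitmap and llh_count update) is declared first,
+	 * and, because clearing records may leave the plain llog empty,
+	 * its destruction (LLOG_F_ZAP_WHEN_EMPTY) is declared in the same
+	 * transaction as well.
+	 */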
dt = lu2dt_dev(loghandle->lgh_obj->do_lu.lo_dev);
+
+	if (unlikely(dt->dd_rdonly))
+		RETURN(0);
+
+	th = dt_trans_create(env, dt);
+	if (IS_ERR(th))
+		RETURN(PTR_ERR(th));
+
+	rc = llog_declare_write_rec(env, loghandle, &llh->llh_hdr, 0, th);
+	if (rc < 0)
+		GOTO(out_trans, rc);
+
+	if ((llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY)) {
+		rc = llog_declare_destroy(env, loghandle, th);
+		if (rc < 0)
+			GOTO(out_trans, rc);
+	}
+
+	th->th_wait_submit = 1;
+	rc = dt_trans_start_local(env, dt, th);
+	if (rc < 0)
+		GOTO(out_trans, rc);
+
+	down_write(&loghandle->lgh_lock);
+	/* clear bitmap */
+	mutex_lock(&loghandle->lgh_hdr_mutex);
+	for (i = 0; i < num; ++i) {
+		if (index[i] == 0) {
+			CERROR("Can't cancel index 0 which is header\n");
+			GOTO(out_unlock, rc = -EINVAL);
+		}
+		if (!__test_and_clear_bit_le(index[i], LLOG_HDR_BITMAP(llh))) {
+			CDEBUG(D_RPCTRACE, "Catalog index %u already clear?\n",
+			       index[i]);
+			GOTO(out_unlock, rc = -ENOENT);
+		}
+	}
+	loghandle->lgh_hdr->llh_count -= num;
+	subtract_count = true;
+
+	/* Since llog_process_thread() uses lgi_cookie, it is better to save
+	 * it here and restore it after use
+	 */
+	tmp_lgc_index = lgi->lgi_cookie.lgc_index;
+	/* Pass this index to llog_osd_write_rec(), which will use the index
+	 * to only update the necessary bitmap. */
+	lgi->lgi_cookie.lgc_index = index[0];
+	/* update header */
+	rc = llog_write_rec(env, loghandle, &llh->llh_hdr, (num != 1 ? NULL :
+			    &lgi->lgi_cookie), LLOG_HEADER_IDX, th);
+	lgi->lgi_cookie.lgc_index = tmp_lgc_index;
+
+	if (rc != 0)
+		GOTO(out_unlock, rc);
+
+	if ((llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) &&
+	    (llh->llh_count == 1) &&
+	    ((loghandle->lgh_last_idx == LLOG_HDR_BITMAP_SIZE(llh) - 1) ||
+	     (loghandle->u.phd.phd_cat_handle != NULL &&
+	      loghandle->u.phd.phd_cat_handle->u.chd.chd_current_log !=
+		loghandle))) {
+		/* never try to destroy it again */
+		llh->llh_flags &= ~LLOG_F_ZAP_WHEN_EMPTY;
+		rc = llog_trans_destroy(env, loghandle, th);
+		if (rc < 0) {
+			/* Sigh, we cannot destroy the final plain llog, but
+			 * the bitmap has been cleared, so the records cannot
+			 * be accessed anymore; let's return 0 for now, and
+			 * the orphan will be handled by LFSCK.
*/ + CERROR("%s: can't destroy empty llog "DFID": rc = %d\n", + loghandle2name(loghandle), + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), rc); + GOTO(out_unlock, rc = 0); + } + rc = LLOG_DEL_PLAIN; + } + +out_unlock: + if (rc < 0) { + /* restore bitmap while holding a mutex */ + if (subtract_count) { + loghandle->lgh_hdr->llh_count += num; + subtract_count = false; + } + for (i = i - 1; i >= 0; i--) + set_bit_le(index[i], LLOG_HDR_BITMAP(llh)); + } + mutex_unlock(&loghandle->lgh_hdr_mutex); + up_write(&loghandle->lgh_lock); +out_trans: + rc1 = dt_trans_stop(env, dt, th); + if (rc == 0) + rc = rc1; + if (rc1 < 0) { + mutex_lock(&loghandle->lgh_hdr_mutex); + if (subtract_count) + loghandle->lgh_hdr->llh_count += num; + for (i = i - 1; i >= 0; i--) + set_bit_le(index[i], LLOG_HDR_BITMAP(llh)); + mutex_unlock(&loghandle->lgh_hdr_mutex); + } + RETURN(rc); +} + +int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle, + int index) +{ + return llog_cancel_arr_rec(env, loghandle, 1, &index); +} + +int llog_read_header(const struct lu_env *env, struct llog_handle *handle, + const struct obd_uuid *uuid) +{ + const struct llog_operations *lop; + int rc; + ENTRY; + + rc = llog_handle2ops(handle, &lop); + if (rc) + RETURN(rc); + + if (lop->lop_read_header == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_read_header(env, handle); + if (rc == LLOG_EEMPTY) { + struct llog_log_hdr *llh = handle->lgh_hdr; + + /* lrh_len should be initialized in llog_init_handle */ + handle->lgh_last_idx = 0; /* header is record with index 0 */ + llh->llh_count = 1; /* for the header record */ + llh->llh_hdr.lrh_type = LLOG_HDR_MAGIC; + LASSERT(handle->lgh_ctxt->loc_chunk_size >= + LLOG_MIN_CHUNK_SIZE); + llh->llh_hdr.lrh_len = handle->lgh_ctxt->loc_chunk_size; + llh->llh_hdr.lrh_index = 0; + llh->llh_timestamp = ktime_get_real_seconds(); + if (uuid) + memcpy(&llh->llh_tgtuuid, uuid, + sizeof(llh->llh_tgtuuid)); + llh->llh_bitmap_offset = offsetof(typeof(*llh), llh_bitmap); + /* Since update llog header might also call this function, + * let's reset the bitmap to 0 here */ + memset(LLOG_HDR_BITMAP(llh), 0, llh->llh_hdr.lrh_len - + llh->llh_bitmap_offset - + sizeof(llh->llh_tail)); + set_bit_le(0, LLOG_HDR_BITMAP(llh)); + LLOG_HDR_TAIL(llh)->lrt_len = llh->llh_hdr.lrh_len; + LLOG_HDR_TAIL(llh)->lrt_index = llh->llh_hdr.lrh_index; + rc = 0; + } + RETURN(rc); +} +EXPORT_SYMBOL(llog_read_header); + +int llog_init_handle(const struct lu_env *env, struct llog_handle *handle, + int flags, struct obd_uuid *uuid) +{ + struct llog_log_hdr *llh; + enum llog_flag fmt = flags & LLOG_F_EXT_MASK; + int rc; + int chunk_size = handle->lgh_ctxt->loc_chunk_size; + ENTRY; + + LASSERT(handle->lgh_hdr == NULL); + + LASSERT(chunk_size >= LLOG_MIN_CHUNK_SIZE); + OBD_ALLOC_LARGE(llh, chunk_size); + if (llh == NULL) + RETURN(-ENOMEM); + + handle->lgh_hdr = llh; + handle->lgh_hdr_size = chunk_size; + /* first assign flags to use llog_client_ops */ + llh->llh_flags = flags; + rc = llog_read_header(env, handle, uuid); + if (rc == 0) { + if (unlikely((llh->llh_flags & LLOG_F_IS_PLAIN && + flags & LLOG_F_IS_CAT) || + (llh->llh_flags & LLOG_F_IS_CAT && + flags & LLOG_F_IS_PLAIN))) { + CERROR("%s: llog type is %s but initializing %s\n", + loghandle2name(handle), + llh->llh_flags & LLOG_F_IS_CAT ? + "catalog" : "plain", + flags & LLOG_F_IS_CAT ? 
"catalog" : "plain"); + GOTO(out, rc = -EINVAL); + } else if (llh->llh_flags & + (LLOG_F_IS_PLAIN | LLOG_F_IS_CAT)) { + /* + * it is possible to open llog without specifying llog + * type so it is taken from llh_flags + */ + flags = llh->llh_flags; + } else { + /* for some reason the llh_flags has no type set */ + CERROR("llog type is not specified!\n"); + GOTO(out, rc = -EINVAL); + } + if (unlikely(uuid && + !obd_uuid_equals(uuid, &llh->llh_tgtuuid))) { + CERROR("%s: llog uuid mismatch: %s/%s\n", + loghandle2name(handle), + (char *)uuid->uuid, + (char *)llh->llh_tgtuuid.uuid); + GOTO(out, rc = -EEXIST); + } + } + if (flags & LLOG_F_IS_CAT) { + LASSERT(list_empty(&handle->u.chd.chd_head)); + INIT_LIST_HEAD(&handle->u.chd.chd_head); + llh->llh_size = sizeof(struct llog_logid_rec); + llh->llh_flags |= LLOG_F_IS_FIXSIZE; + } else if (!(flags & LLOG_F_IS_PLAIN)) { + CERROR("%s: unknown flags: %#x (expected %#x or %#x)\n", + loghandle2name(handle), flags, LLOG_F_IS_CAT, + LLOG_F_IS_PLAIN); + rc = -EINVAL; + } + llh->llh_flags |= fmt; +out: + if (rc) { + OBD_FREE_LARGE(llh, chunk_size); + handle->lgh_hdr = NULL; + } + RETURN(rc); +} +EXPORT_SYMBOL(llog_init_handle); + +int llog_verify_record(const struct llog_handle *llh, struct llog_rec_hdr *rec) +{ + int chunk_size = llh->lgh_hdr->llh_hdr.lrh_len; + + if (rec->lrh_len == 0 || rec->lrh_len > chunk_size) { + CERROR("%s: record is too large: %d > %d\n", + loghandle2name(llh), rec->lrh_len, chunk_size); + return -EINVAL; + } + if (rec->lrh_index >= LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr)) { + CERROR("%s: index is too high: %d\n", + loghandle2name(llh), rec->lrh_index); + return -EINVAL; + } + if ((rec->lrh_type & LLOG_OP_MASK) != LLOG_OP_MAGIC) { + CERROR("%s: magic %x is bad\n", + loghandle2name(llh), rec->lrh_type); + return -EINVAL; + } + + return 0; +} +EXPORT_SYMBOL(llog_verify_record); + +static inline bool llog_is_index_skipable(int idx, struct llog_log_hdr *llh, + struct llog_process_cat_data *cd) +{ + if (cd && (cd->lpcd_read_mode & LLOG_READ_MODE_RAW)) + return false; + + return !test_bit_le(idx, LLOG_HDR_BITMAP(llh)); +} + +static int llog_process_thread(void *arg) +{ + struct llog_process_info *lpi = arg; + struct llog_handle *loghandle = lpi->lpi_loghandle; + struct llog_log_hdr *llh = loghandle->lgh_hdr; + struct llog_process_cat_data *cd = lpi->lpi_catdata; + struct llog_thread_info *lti; + char *buf; + size_t chunk_size; + __u64 cur_offset; + int rc = 0, index = 1, last_index; + int saved_index = 0; + int last_called_index = 0; + bool repeated = false; + bool refresh_idx = false; + + ENTRY; + + if (llh == NULL) + RETURN(-EINVAL); + + lti = lpi->lpi_env == NULL ? 
NULL : llog_info(lpi->lpi_env); + + cur_offset = chunk_size = llh->llh_hdr.lrh_len; + /* expect chunk_size to be power of two */ + LASSERT(is_power_of_2(chunk_size)); + + OBD_ALLOC_LARGE(buf, chunk_size); + if (buf == NULL) { + lpi->lpi_rc = -ENOMEM; + RETURN(0); + } + + if (cd != NULL) { + last_called_index = cd->lpcd_first_idx; + index = cd->lpcd_first_idx + 1; + } + if (cd && cd->lpcd_last_idx) + last_index = cd->lpcd_last_idx; + else if (cd && (cd->lpcd_read_mode & LLOG_READ_MODE_RAW)) + last_index = loghandle->lgh_last_idx; + else + last_index = LLOG_HDR_BITMAP_SIZE(llh) - 1; + + while (rc == 0) { + struct llog_rec_hdr *rec; + off_t chunk_offset = 0; + unsigned int buf_offset = 0; + int lh_last_idx; + int synced_idx = 0; + + /* skip records not set in bitmap */ + while (index <= last_index && + llog_is_index_skipable(index, llh, cd)) + ++index; + + /* There are no indices prior the last_index */ + if (index > last_index) + break; + + CDEBUG(D_OTHER, "index: %d last_index %d\n", index, + last_index); + +repeat: + /* get the buf with our target record; avoid old garbage */ + memset(buf, 0, chunk_size); + /* the record index for outdated chunk data */ + /* it is safe to process buffer until saved lgh_last_idx */ + lh_last_idx = LLOG_HDR_TAIL(llh)->lrt_index; + rc = llog_next_block(lpi->lpi_env, loghandle, &saved_index, + index, &cur_offset, buf, chunk_size); + if (repeated && rc) + CDEBUG(D_OTHER, "cur_offset %llu, chunk_offset %llu," + " buf_offset %u, rc = %d\n", cur_offset, + (__u64)chunk_offset, buf_offset, rc); + if (rc == -ESTALE) + GOTO(out, rc = 0); + /* we`ve tried to reread the chunk, but there is no + * new records */ + if (rc == -EIO && repeated && (chunk_offset + buf_offset) == + cur_offset) + GOTO(out, rc = 0); + if (rc != 0) + GOTO(out, rc); + + /* NB: after llog_next_block() call the cur_offset is the + * offset of the next block after read one. + * The absolute offset of the current chunk is calculated + * from cur_offset value and stored in chunk_offset variable. + */ + if ((cur_offset & (chunk_size - 1)) != 0) + chunk_offset = cur_offset & ~(chunk_size - 1); + else + chunk_offset = cur_offset - chunk_size; + + /* NB: when rec->lrh_len is accessed it is already swabbed + * since it is used at the "end" of the loop and the rec + * swabbing is done at the beginning of the loop. */ + for (rec = (struct llog_rec_hdr *)(buf + buf_offset); + (char *)rec < buf + chunk_size; + rec = llog_rec_hdr_next(rec)) { + + CDEBUG(D_OTHER, "processing rec 0x%p type %#x\n", + rec, rec->lrh_type); + + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + + CDEBUG(D_OTHER, "after swabbing, type=%#x idx=%d\n", + rec->lrh_type, rec->lrh_index); + + if (index == (synced_idx + 1) && + synced_idx == LLOG_HDR_TAIL(llh)->lrt_index) + GOTO(out, rc = 0); + + if (OBD_FAIL_PRECHECK(OBD_FAIL_LLOG_PROCESS_TIMEOUT) && + cfs_fail_val == (unsigned int) + (loghandle->lgh_id.lgl_oi.oi.oi_id & + 0xFFFFFFFF)) { + OBD_RACE(OBD_FAIL_LLOG_PROCESS_TIMEOUT); + } + + /* the bitmap could be changed during processing + * records from the chunk. For wrapped catalog + * it means we can read deleted record and try to + * process it. Check this case and reread the chunk. + * It is safe to process to lh_last_idx, including + * lh_last_idx if it was synced. We can not do <= + * comparison, cause for wrapped catalog lgh_last_idx + * could be less than index. So we detect last index + * for processing as index == lh_last_idx+1. 
But when + * catalog is wrapped and full lgh_last_idx=llh_cat_idx, + * the first processing index is llh_cat_idx+1.The + * exception is !(lgh_last_idx == llh_cat_idx && + * index == llh_cat_idx + 1), and after simplification + * it turns to + * lh_last_idx != LLOG_HDR_TAIL(llh)->lrt_index + * This exception is working for catalog only. + */ + + if ((index == lh_last_idx && synced_idx != index) || + (index == (lh_last_idx + 1) && + lh_last_idx != LLOG_HDR_TAIL(llh)->lrt_index) || + (rec->lrh_index == 0 && !repeated)) { + + /* save offset inside buffer for the re-read */ + buf_offset = (char *)rec - (char *)buf; + cur_offset = chunk_offset; + repeated = true; + /* We need to be sure lgh_last_idx + * record was saved to disk + */ + down_read(&loghandle->lgh_last_sem); + synced_idx = LLOG_HDR_TAIL(llh)->lrt_index; + up_read(&loghandle->lgh_last_sem); + CDEBUG(D_OTHER, "synced_idx: %d\n", synced_idx); + goto repeat; + + } + + repeated = false; + + rc = llog_verify_record(loghandle, rec); + if (rc) { + CERROR("%s: invalid record in llog "DFID + " record for index %d/%d: rc = %d\n", + loghandle2name(loghandle), + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), + rec->lrh_index, index, rc); + /* + * the block seem to be corrupted, let's try + * with the next one. reset rc to go to the + * next chunk. + */ + refresh_idx = true; + index = 0; + GOTO(repeat, rc = 0); + } + + if (rec->lrh_index < index) { + CDEBUG(D_OTHER, "skipping lrh_index %d\n", + rec->lrh_index); + continue; + } + + if (rec->lrh_index > index) { + /* the record itself looks good, but we met a + * gap which can be result of old bugs, just + * keep going */ + CERROR("%s: "DFID" index %u, expected %u\n", + loghandle2name(loghandle), + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), + rec->lrh_index, index); + index = rec->lrh_index; + } + + CDEBUG(D_OTHER, + "lrh_index: %d lrh_len: %d (%d remains)\n", + rec->lrh_index, rec->lrh_len, + (int)(buf + chunk_size - (char *)rec)); + + /* lgh_cur_offset is used only at llog_test_3 */ + loghandle->lgh_cur_offset = (char *)rec - (char *)buf + + chunk_offset; + + /* if needed, process the callback on this record */ + if (!llog_is_index_skipable(index, llh, cd)) { + struct llog_cookie *lgc; + __u64 tmp_off; + int tmp_idx; + + CDEBUG((llh->llh_flags & LLOG_F_IS_CAT ? + D_HA : D_OTHER), + "index: %d, lh_last_idx: %d " + "synced_idx: %d lgh_last_idx: %d\n", + index, lh_last_idx, synced_idx, + loghandle->lgh_last_idx); + + if (lti != NULL) { + lgc = <i->lgi_cookie; + /* store lu_env for recursive calls */ + tmp_off = lgc->lgc_offset; + tmp_idx = lgc->lgc_index; + + lgc->lgc_offset = (char *)rec - + (char *)buf + chunk_offset; + lgc->lgc_index = rec->lrh_index; + } + /* using lu_env for passing record offset to + * llog_write through various callbacks */ + rc = lpi->lpi_cb(lpi->lpi_env, loghandle, rec, + lpi->lpi_cbdata); + last_called_index = index; + + if (lti != NULL) { + lgc->lgc_offset = tmp_off; + lgc->lgc_index = tmp_idx; + } + + if (rc == LLOG_PROC_BREAK) { + GOTO(out, rc); + } else if (rc == LLOG_DEL_RECORD) { + rc = llog_cancel_rec(lpi->lpi_env, + loghandle, + rec->lrh_index); + /* Allow parallel cancelling, ENOENT + * means record was canceled at another + * processing thread or callback + */ + if (rc == -ENOENT) + rc = 0; + } + if (rc) + GOTO(out, rc); + /* some stupid callbacks directly cancel records + * and delete llog. Check it and stop + * processing. 
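+				 * (Either the plain llog was destroyed under
+				 * us, leaving lgh_hdr == NULL, or the callback
+				 * cancelled all records so that only the
+				 * header record, llh_count == 1, remains.)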
*/ + if (loghandle->lgh_hdr == NULL || + loghandle->lgh_hdr->llh_count == 1) + GOTO(out, rc = 0); + } + /* exit if the last index is reached */ + if (index >= last_index) + GOTO(out, rc = 0); + ++index; + } + } + +out: + CDEBUG(D_HA, "stop processing %s "DOSTID":%x index %d count %d\n", + ((llh->llh_flags & LLOG_F_IS_CAT) ? "catalog" : "plain"), + POSTID(&loghandle->lgh_id.lgl_oi), loghandle->lgh_id.lgl_ogen, + index, llh->llh_count); + + if (cd != NULL) + cd->lpcd_last_idx = last_called_index; + + if (unlikely(rc == -EIO && loghandle->lgh_obj != NULL)) { + if (dt_object_remote(loghandle->lgh_obj)) { + /* If it is remote object, then -EIO might means + * disconnection or eviction, let's return -EAGAIN, + * so for update recovery log processing, it will + * retry until the umount or abort recovery, see + * lod_sub_recovery_thread() */ + CERROR("%s retry remote llog process\n", + loghandle2name(loghandle)); + rc = -EAGAIN; + } else { + /* something bad happened to the processing of a local + * llog file, probably I/O error or the log got + * corrupted to be able to finally release the log we + * discard any remaining bits in the header */ + CERROR("%s: Local llog found corrupted #"DOSTID":%x" + " %s index %d count %d\n", + loghandle2name(loghandle), + POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, + ((llh->llh_flags & LLOG_F_IS_CAT) ? "catalog" : + "plain"), index, llh->llh_count); + + while (index <= last_index) { + if (test_bit_le(index, + LLOG_HDR_BITMAP(llh)) != 0) + llog_cancel_rec(lpi->lpi_env, loghandle, + index); + index++; + } + rc = 0; + } + } + + OBD_FREE_LARGE(buf, chunk_size); + lpi->lpi_rc = rc; + return 0; +} + +static int llog_process_thread_daemonize(void *arg) +{ + struct llog_process_info *lpi = arg; + struct lu_env env; + int rc; + struct nsproxy *new_ns, *curr_ns = current->nsproxy; + + task_lock(lpi->lpi_reftask); + new_ns = lpi->lpi_reftask->nsproxy; + if (curr_ns != new_ns) { + get_nsproxy(new_ns); + + current->nsproxy = new_ns; + /* XXX: we should call put_nsproxy() instead of + * atomic_dec(&ns->count) directly. But put_nsproxy() cannot be + * used outside of the kernel itself, because it calls + * free_nsproxy() which is not exported by the kernel + * (defined in kernel/nsproxy.c) */ + if (curr_ns) + atomic_dec(&curr_ns->count); + } + task_unlock(lpi->lpi_reftask); + + unshare_fs_struct(); + /* client env has no keys, tags is just 0 */ + rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD); + if (rc) + goto out; + lpi->lpi_env = &env; + + rc = llog_process_thread(arg); + + lu_env_fini(&env); +out: + complete(&lpi->lpi_completion); + return rc; +} + +int llog_process_or_fork(const struct lu_env *env, + struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata, bool fork) +{ + struct llog_process_info *lpi; + struct llog_process_data *d = data; + struct llog_process_cat_data *cd = catdata; + __u32 flags = loghandle->lgh_hdr->llh_flags; + int rc; + + ENTRY; + + OBD_ALLOC_PTR(lpi); + if (lpi == NULL) { + CERROR("cannot alloc pointer\n"); + RETURN(-ENOMEM); + } + lpi->lpi_loghandle = loghandle; + lpi->lpi_cb = cb; + lpi->lpi_cbdata = data; + lpi->lpi_catdata = catdata; + + CDEBUG(D_OTHER, "Processing "DFID" flags 0x%03x startcat %d startidx %d first_idx %d last_idx %d read_mode %d\n", + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), flags, + (flags & LLOG_F_IS_CAT) && d ? d->lpd_startcat : -1, + (flags & LLOG_F_IS_CAT) && d ? d->lpd_startidx : -1, + cd ? cd->lpcd_first_idx : -1, cd ? cd->lpcd_last_idx : -1, + cd ? 
cd->lpcd_read_mode : -1); + if (fork) { + struct task_struct *task; + + /* The new thread can't use parent env, + * init the new one in llog_process_thread_daemonize. */ + lpi->lpi_env = NULL; + init_completion(&lpi->lpi_completion); + /* take reference to current, so that + * llog_process_thread_daemonize() can use it to switch to + * namespace associated with current */ + lpi->lpi_reftask = current; + task = kthread_run(llog_process_thread_daemonize, lpi, + "llog_process_thread"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("%s: cannot start thread: rc = %d\n", + loghandle2name(loghandle), rc); + GOTO(out_lpi, rc); + } + wait_for_completion(&lpi->lpi_completion); + } else { + lpi->lpi_env = env; + llog_process_thread(lpi); + } + rc = lpi->lpi_rc; + +out_lpi: + OBD_FREE_PTR(lpi); + RETURN(rc); +} +EXPORT_SYMBOL(llog_process_or_fork); + +int llog_process(const struct lu_env *env, struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata) +{ + int rc; + rc = llog_process_or_fork(env, loghandle, cb, data, catdata, true); + return rc == LLOG_DEL_PLAIN ? 0 : rc; +} +EXPORT_SYMBOL(llog_process); + +static inline const struct cred *llog_raise_resource(void) +{ + struct cred *cred = NULL; + + if (cap_raised(current_cap(), CAP_SYS_RESOURCE)) + return cred; + + cred = prepare_creds(); + if (!cred) + return cred; + + cap_raise(cred->cap_effective, CAP_SYS_RESOURCE); + return override_creds(cred); +} + +static inline void llog_restore_resource(const struct cred *old_cred) +{ + if (old_cred) + revert_creds(old_cred); +} + +int llog_reverse_process(const struct lu_env *env, + struct llog_handle *loghandle, llog_cb_t cb, + void *data, void *catdata) +{ + struct llog_log_hdr *llh = loghandle->lgh_hdr; + struct llog_process_cat_data *cd = catdata; + void *buf; + int rc = 0, first_index = 1, index, idx; + __u32 chunk_size = llh->llh_hdr.lrh_len; + ENTRY; + + OBD_ALLOC_LARGE(buf, chunk_size); + if (buf == NULL) + RETURN(-ENOMEM); + + if (cd != NULL) + first_index = cd->lpcd_first_idx + 1; + if (cd != NULL && cd->lpcd_last_idx) + index = cd->lpcd_last_idx; + else + index = LLOG_HDR_BITMAP_SIZE(llh) - 1; + + while (rc == 0) { + struct llog_rec_hdr *rec; + struct llog_rec_tail *tail; + + /* skip records not set in bitmap */ + while (index >= first_index && + llog_is_index_skipable(index, llh, cd)) + --index; + + LASSERT(index >= first_index - 1); + if (index == first_index - 1) + break; + + /* get the buf with our target record; avoid old garbage */ + memset(buf, 0, chunk_size); + rc = llog_prev_block(env, loghandle, index, buf, chunk_size); + if (rc) + GOTO(out, rc); + + rec = buf; + idx = rec->lrh_index; + CDEBUG(D_RPCTRACE, "index %u : idx %u\n", index, idx); + while (idx < index) { + rec = (void *)rec + rec->lrh_len; + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + idx ++; + } + LASSERT(idx == index); + tail = (void *)rec + rec->lrh_len - sizeof(*tail); + + /* process records in buffer, starting where we found one */ + while ((void *)tail > buf) { + if (tail->lrt_index == 0) + GOTO(out, rc = 0); /* no more records */ + + /* if needed, process the callback on this record */ + if (!llog_is_index_skipable(index, llh, cd)) { + rec = (void *)tail - tail->lrt_len + + sizeof(*tail); + + rc = cb(env, loghandle, rec, data); + if (rc == LLOG_PROC_BREAK) { + GOTO(out, rc); + } else if (rc == LLOG_DEL_RECORD) { + rc = llog_cancel_rec(env, loghandle, + tail->lrt_index); + } + if (rc) + GOTO(out, rc); + } + + /* previous record, still in buffer? 
*/ + --index; + if (index < first_index) + GOTO(out, rc = 0); + tail = (void *)tail - tail->lrt_len; + } + } + +out: + if (buf != NULL) + OBD_FREE_LARGE(buf, chunk_size); + RETURN(rc); +} +EXPORT_SYMBOL(llog_reverse_process); + +/** + * new llog API + * + * API functions: + * llog_open - open llog, may not exist + * llog_exist - check if llog exists + * llog_close - close opened llog, pair for open, frees llog_handle + * llog_declare_create - declare llog creation + * llog_create - create new llog on disk, need transaction handle + * llog_declare_write_rec - declaration of llog write + * llog_write_rec - write llog record on disk, need transaction handle + * llog_declare_add - declare llog catalog record addition + * llog_add - add llog record in catalog, need transaction handle + */ +int llog_exist(struct llog_handle *loghandle) +{ + const struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_exist == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_exist(loghandle); + RETURN(rc); +} +EXPORT_SYMBOL(llog_exist); + +int llog_declare_create(const struct lu_env *env, + struct llog_handle *loghandle, struct thandle *th) +{ + const struct cred *old_cred; + const struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_declare_create == NULL) + RETURN(-EOPNOTSUPP); + + old_cred = llog_raise_resource(); + rc = lop->lop_declare_create(env, loghandle, th); + llog_restore_resource(old_cred); + RETURN(rc); +} + +int llog_create(const struct lu_env *env, struct llog_handle *handle, + struct thandle *th) +{ + const struct cred *old_cred; + const struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(handle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_create == NULL) + RETURN(-EOPNOTSUPP); + + old_cred = llog_raise_resource(); + rc = lop->lop_create(env, handle, th); + llog_restore_resource(old_cred); + RETURN(rc); +} + +int llog_declare_write_rec(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, int idx, + struct thandle *th) +{ + const struct cred *old_cred; + const struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(handle, &lop); + if (rc) + RETURN(rc); + LASSERT(lop); + if (lop->lop_declare_write_rec == NULL) + RETURN(-EOPNOTSUPP); + + old_cred = llog_raise_resource(); + rc = lop->lop_declare_write_rec(env, handle, rec, idx, th); + llog_restore_resource(old_cred); + RETURN(rc); +} + +int llog_write_rec(const struct lu_env *env, struct llog_handle *handle, + struct llog_rec_hdr *rec, struct llog_cookie *logcookies, + int idx, struct thandle *th) +{ + const struct cred *old_cred; + const struct llog_operations *lop; + int rc, buflen; + + ENTRY; + + /* API sanity checks */ + if (handle == NULL) { + CERROR("loghandle is missed\n"); + RETURN(-EPROTO); + } else if (handle->lgh_obj == NULL) { + CERROR("loghandle %p with NULL object\n", + handle); + RETURN(-EPROTO); + } else if (th == NULL) { + CERROR("%s: missed transaction handle\n", + loghandle2name(handle)); + RETURN(-EPROTO); + } else if (handle->lgh_hdr == NULL) { + CERROR("%s: loghandle %p with no header\n", + loghandle2name(handle), handle); + RETURN(-EPROTO); + } + + rc = llog_handle2ops(handle, &lop); + if (rc) + RETURN(rc); + + if (lop->lop_write_rec == NULL) + RETURN(-EOPNOTSUPP); + + buflen = rec->lrh_len; + LASSERT(cfs_size_round(buflen) == buflen); + + old_cred = llog_raise_resource(); + rc = lop->lop_write_rec(env, 
handle, rec, logcookies, idx, th);
+	llog_restore_resource(old_cred);
+	RETURN(rc);
+}
+
+int llog_add(const struct lu_env *env, struct llog_handle *lgh,
+	     struct llog_rec_hdr *rec, struct llog_cookie *logcookies,
+	     struct thandle *th)
+{
+	const struct cred *old_cred;
+	int rc;
+
+	ENTRY;
+
+	if (lgh->lgh_logops->lop_add == NULL)
+		RETURN(-EOPNOTSUPP);
+
+	old_cred = llog_raise_resource();
+	rc = lgh->lgh_logops->lop_add(env, lgh, rec, logcookies, th);
+	llog_restore_resource(old_cred);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_add);
+
+int llog_declare_add(const struct lu_env *env, struct llog_handle *lgh,
+		     struct llog_rec_hdr *rec, struct thandle *th)
+{
+	const struct cred *old_cred;
+	int rc;
+
+	ENTRY;
+
+	if (lgh->lgh_logops->lop_declare_add == NULL)
+		RETURN(-EOPNOTSUPP);
+
+	old_cred = llog_raise_resource();
+	rc = lgh->lgh_logops->lop_declare_add(env, lgh, rec, th);
+	llog_restore_resource(old_cred);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_declare_add);
+
+/**
+ * Helper function to open a llog, creating it if it doesn't exist.
+ * It hides all transaction handling from the caller.
+ */
+int llog_open_create(const struct lu_env *env, struct llog_ctxt *ctxt,
+		     struct llog_handle **res, struct llog_logid *logid,
+		     char *name)
+{
+	struct dt_device *d;
+	struct thandle *th;
+	int rc;
+
+	ENTRY;
+
+	rc = llog_open(env, ctxt, res, logid, name, LLOG_OPEN_NEW);
+	if (rc)
+		RETURN(rc);
+
+	if (llog_exist(*res))
+		RETURN(0);
+
+	LASSERT((*res)->lgh_obj != NULL);
+
+	d = lu2dt_dev((*res)->lgh_obj->do_lu.lo_dev);
+
+	if (unlikely(d->dd_rdonly))
+		RETURN(-EROFS);
+
+	th = dt_trans_create(env, d);
+	if (IS_ERR(th))
+		GOTO(out, rc = PTR_ERR(th));
+
+	/* Create the update llog object synchronously, which happens
+	 * during initialization (see lod_sub_prep_llog()), to make sure
+	 * the update llog object is created before cross-MDT updates
+	 * are written into it */
+	if (ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID)
+		th->th_sync = 1;
+
+	th->th_wait_submit = 1;
+	rc = llog_declare_create(env, *res, th);
+	if (rc == 0) {
+		rc = dt_trans_start_local(env, d, th);
+		if (rc == 0)
+			rc = llog_create(env, *res, th);
+	}
+	dt_trans_stop(env, d, th);
+out:
+	if (rc)
+		llog_close(env, *res);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_open_create);
+
+/**
+ * Helper function to delete an existing llog.
+ */
+int llog_erase(const struct lu_env *env, struct llog_ctxt *ctxt,
+	       struct llog_logid *logid, char *name)
+{
+	struct llog_handle *handle;
+	int rc = 0, rc2;
+
+	ENTRY;
+
+	/* nothing to erase */
+	if (name == NULL && logid == NULL)
+		RETURN(0);
+
+	rc = llog_open(env, ctxt, &handle, logid, name, LLOG_OPEN_EXISTS);
+	if (rc < 0)
+		RETURN(rc);
+
+	rc = llog_init_handle(env, handle, LLOG_F_IS_PLAIN, NULL);
+	if (rc == 0)
+		rc = llog_destroy(env, handle);
+
+	rc2 = llog_close(env, handle);
+	if (rc == 0)
+		rc = rc2;
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_erase);
+
+/*
+ * Helper function to write a record into a llog.
+ * It hides all transaction handling from the caller.
+ * Valid only with a local llog.
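+ *
+ * Typical use, sketched here for illustration (the record, including
+ * lrh_len and lrh_type, is assumed to be fully set up by the caller):
+ *
+ *	rc = llog_write(env, loghandle, rec, LLOG_NEXT_IDX);
+ *
+ * LLOG_NEXT_IDX appends the record at the next free slot; passing an
+ * existing index instead rewrites that record in place.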
+ */ +int llog_write(const struct lu_env *env, struct llog_handle *loghandle, + struct llog_rec_hdr *rec, int idx) +{ + struct dt_device *dt; + struct thandle *th; + bool need_cookie; + int rc; + + ENTRY; + + LASSERT(loghandle); + LASSERT(loghandle->lgh_ctxt); + LASSERT(loghandle->lgh_obj != NULL); + + dt = lu2dt_dev(loghandle->lgh_obj->do_lu.lo_dev); + + if (unlikely(unlikely(dt->dd_rdonly))) + RETURN(-EROFS); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = llog_declare_write_rec(env, loghandle, rec, idx, th); + if (rc) + GOTO(out_trans, rc); + + th->th_wait_submit = 1; + rc = dt_trans_start_local(env, dt, th); + if (rc) + GOTO(out_trans, rc); + + need_cookie = !(idx == LLOG_HEADER_IDX || idx == LLOG_NEXT_IDX); + + down_write(&loghandle->lgh_lock); + if (need_cookie) { + struct llog_thread_info *lti = llog_info(env); + + /* cookie comes from llog_process_thread */ + rc = llog_write_rec(env, loghandle, rec, <i->lgi_cookie, + rec->lrh_index, th); + /* upper layer didn`t pass cookie so change rc */ + rc = (rc == 1 ? 0 : rc); + } else { + rc = llog_write_rec(env, loghandle, rec, NULL, idx, th); + } + + up_write(&loghandle->lgh_lock); +out_trans: + dt_trans_stop(env, dt, th); + RETURN(rc); +} +EXPORT_SYMBOL(llog_write); + +int llog_open(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_handle **lgh, struct llog_logid *logid, + char *name, enum llog_open_param open_param) +{ + const struct cred *old_cred; + int rc; + + ENTRY; + + LASSERT(ctxt); + LASSERT(ctxt->loc_logops); + + if (ctxt->loc_logops->lop_open == NULL) { + *lgh = NULL; + RETURN(-EOPNOTSUPP); + } + + *lgh = llog_alloc_handle(); + if (*lgh == NULL) + RETURN(-ENOMEM); + (*lgh)->lgh_ctxt = ctxt; + (*lgh)->lgh_logops = ctxt->loc_logops; + + old_cred = llog_raise_resource(); + rc = ctxt->loc_logops->lop_open(env, *lgh, logid, name, open_param); + llog_restore_resource(old_cred); + if (rc) { + llog_free_handle(*lgh); + *lgh = NULL; + } + RETURN(rc); +} +EXPORT_SYMBOL(llog_open); + +int llog_close(const struct lu_env *env, struct llog_handle *loghandle) +{ + return llog_handle_put(env, loghandle); +} +EXPORT_SYMBOL(llog_close); + +/** + * Helper function to get the llog size in records. It is used by MGS + * mostly to check that config llog exists and contains data. 
+ * + * \param[in] env execution environment + * \param[in] ctxt llog context + * \param[in] name llog name + * + * \retval true if there are records in llog besides a header + * \retval false on error or llog without records + */ +int llog_is_empty(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name) +{ + struct llog_handle *llh; + int rc = 0; + + rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc < 0) { + if (likely(rc == -ENOENT)) + rc = 0; + GOTO(out, rc); + } + + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL); + if (rc) + GOTO(out_close, rc); + rc = llog_get_size(llh); + +out_close: + llog_close(env, llh); +out: + /* The header is record 1, the llog is still considered as empty + * if there is only header */ + return (rc <= 1); +} +EXPORT_SYMBOL(llog_is_empty); + +/* this callback run in raw read mode (canceled record are processed) */ +int llog_copy_handler(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_handle *copy_llh = data; + int idx = rec->lrh_index; + int rc; + + ENTRY; + + /* Append all records */ + rc = llog_write(env, copy_llh, rec, LLOG_NEXT_IDX); + + /* Cancel the record if it is canceled on the source */ + if (!rc && !test_bit_le(idx, LLOG_HDR_BITMAP(llh->lgh_hdr))) + rc = llog_cancel_rec(env, copy_llh, copy_llh->lgh_last_idx); + + RETURN(rc); +} + +/* backup plain llog */ +int llog_backup(const struct lu_env *env, struct obd_device *obd, + struct llog_ctxt *ctxt, struct llog_ctxt *bctxt, + char *name, char *backup) +{ + struct llog_handle *llh, *bllh; + struct llog_process_cat_data cd = {0}; + int rc; + + ENTRY; + + /* open original log */ + rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc < 0) { + /* the -ENOENT case is also reported to the caller + * but silently so it should handle that if needed. 
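+	 *
+	 * A tolerant caller, sketched:
+	 *
+	 *	rc = llog_backup(env, obd, ctxt, bctxt, name, backup);
+	 *	if (rc == -ENOENT)
+	 *		rc = 0;	(nothing to back up)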
+ */ + if (rc != -ENOENT) + CERROR("%s: failed to open log %s: rc = %d\n", + obd->obd_name, name, rc); + RETURN(rc); + } + + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL); + if (rc) + GOTO(out_close, rc); + + /* Make sure there's no old backup log */ + rc = llog_erase(env, bctxt, NULL, backup); + if (rc < 0 && rc != -ENOENT) + GOTO(out_close, rc); + + /* open backup log */ + rc = llog_open_create(env, bctxt, &bllh, NULL, backup); + if (rc) { + CERROR("%s: failed to open backup logfile %s: rc = %d\n", + obd->obd_name, backup, rc); + GOTO(out_close, rc); + } + + /* check that backup llog is not the same object as original one */ + if (llh->lgh_obj == bllh->lgh_obj) { + CERROR("%s: backup llog %s to itself (%s), objects %p/%p\n", + obd->obd_name, name, backup, llh->lgh_obj, + bllh->lgh_obj); + GOTO(out_backup, rc = -EEXIST); + } + + rc = llog_init_handle(env, bllh, LLOG_F_IS_PLAIN, NULL); + if (rc) + GOTO(out_backup, rc); + + /* Read canceled records to have an exact copy */ + cd.lpcd_read_mode = LLOG_READ_MODE_RAW; + /* Copy log record by record */ + rc = llog_process_or_fork(env, llh, llog_copy_handler, (void *)bllh, + &cd, false); + if (rc) + CERROR("%s: failed to backup log %s: rc = %d\n", + obd->obd_name, name, rc); +out_backup: + llog_close(env, bllh); +out_close: + llog_close(env, llh); + RETURN(rc); +} +EXPORT_SYMBOL(llog_backup); + +/* Get size of llog */ +__u64 llog_size(const struct lu_env *env, struct llog_handle *llh) +{ + int rc; + struct lu_attr la; + + rc = llh->lgh_obj->do_ops->do_attr_get(env, llh->lgh_obj, &la); + if (rc) { + CERROR("%s: attr_get failed for "DFID": rc = %d\n", + loghandle2name(llh), PFID(&llh->lgh_id.lgl_oi.oi_fid), + rc); + return 0; + } + + return la.la_size; +} +EXPORT_SYMBOL(llog_size); + diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_cat.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_cat.c new file mode 100644 index 0000000000000..ba44ad3003559 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_cat.c @@ -0,0 +1,1198 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/llog_cat.c + * + * OST<->MDS recovery logging infrastructure. 
+ * + * Invariants in implementation: + * - we do not share logs among different OST<->MDS connections, so that + * if an OST or MDS fails it need only look at log(s) relevant to itself + * + * Author: Andreas Dilger + * Author: Alexey Zhuravlev + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_LOG + + +#include + +#include "llog_internal.h" + + +/** + * lockdep markers for nested struct llog_handle::lgh_lock locking. + */ +enum { + LLOGH_CAT, + LLOGH_LOG, +}; + +/* Create a new log handle and add it to the open list. + * This log handle will be closed when all of the records in it are removed. + * + * Assumes caller has already pushed us into the kernel context and is locking. + */ +static int llog_cat_new_log(const struct lu_env *env, + struct llog_handle *cathandle, + struct llog_handle *loghandle, + struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + struct llog_logid_rec *rec = &lgi->lgi_logid; + struct thandle *handle = NULL; + struct dt_device *dt = NULL; + struct llog_log_hdr *llh = cathandle->lgh_hdr; + int rc, index; + + ENTRY; + + index = (cathandle->lgh_last_idx + 1) % + (OBD_FAIL_PRECHECK(OBD_FAIL_CAT_RECORDS) ? (cfs_fail_val + 1) : + LLOG_HDR_BITMAP_SIZE(llh)); + + /* check that new llog index will not overlap with the first one. + * - llh_cat_idx is the index just before the first/oldest still in-use + * index in catalog + * - lgh_last_idx is the last/newest used index in catalog + * + * When catalog is not wrapped yet then lgh_last_idx is always larger + * than llh_cat_idx. After the wrap around lgh_last_idx re-starts + * from 0 and llh_cat_idx becomes the upper limit for it + * + * Check if catalog has already wrapped around or not by comparing + * last_idx and cat_idx */ + if ((index == llh->llh_cat_idx + 1 && llh->llh_count > 1) || + (index == 0 && llh->llh_cat_idx == 0)) { + if (cathandle->lgh_name == NULL) { + CWARN("%s: there are no more free slots in catalog " + DFID":%x\n", + loghandle2name(loghandle), + PFID(&cathandle->lgh_id.lgl_oi.oi_fid), + cathandle->lgh_id.lgl_ogen); + } else { + CWARN("%s: there are no more free slots in " + "catalog %s\n", loghandle2name(loghandle), + cathandle->lgh_name); + } + RETURN(-ENOSPC); + } + + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_LLOG_CREATE_FAILED)) + RETURN(-ENOSPC); + + if (loghandle->lgh_hdr != NULL) { + /* If llog object is remote and creation is failed, lgh_hdr + * might be left over here, free it first */ + LASSERT(!llog_exist(loghandle)); + OBD_FREE_LARGE(loghandle->lgh_hdr, loghandle->lgh_hdr_size); + loghandle->lgh_hdr = NULL; + } + + if (th == NULL) { + dt = lu2dt_dev(cathandle->lgh_obj->do_lu.lo_dev); + + handle = dt_trans_create(env, dt); + if (IS_ERR(handle)) + RETURN(PTR_ERR(handle)); + + /* Create update llog object synchronously, which + * happens during inialization process see + * lod_sub_prep_llog(), to make sure the update + * llog object is created before corss-MDT writing + * updates into the llog object */ + if (cathandle->lgh_ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) + handle->th_sync = 1; + + handle->th_wait_submit = 1; + + rc = llog_declare_create(env, loghandle, handle); + if (rc != 0) + GOTO(out, rc); + + rec->lid_hdr.lrh_len = sizeof(*rec); + rec->lid_hdr.lrh_type = LLOG_LOGID_MAGIC; + rec->lid_id = loghandle->lgh_id; + rc = llog_declare_write_rec(env, cathandle, &rec->lid_hdr, -1, + handle); + if (rc != 0) + GOTO(out, rc); + + rc = dt_trans_start_local(env, dt, handle); + if (rc != 0) + GOTO(out, rc); + + th = handle; + } + + rc = llog_create(env, loghandle, th); + /* 
+	 * if llog is already created, no need to initialize it */
+	if (rc == -EEXIST) {
+		GOTO(out, rc = 0);
+	} else if (rc != 0) {
+		CERROR("%s: can't create new plain llog in catalog: rc = %d\n",
+		       loghandle2name(loghandle), rc);
+		GOTO(out, rc);
+	}
+
+	rc = llog_init_handle(env, loghandle,
+			      LLOG_F_IS_PLAIN | LLOG_F_ZAP_WHEN_EMPTY,
+			      &cathandle->lgh_hdr->llh_tgtuuid);
+	if (rc < 0)
+		GOTO(out, rc);
+
+	/* build the record for this log in the catalog */
+	rec->lid_hdr.lrh_len = sizeof(*rec);
+	rec->lid_hdr.lrh_type = LLOG_LOGID_MAGIC;
+	rec->lid_id = loghandle->lgh_id;
+
+	/* append the new record into the catalog. The new index will be
+	 * assigned to the record and updated in the rec header */
+	rc = llog_write_rec(env, cathandle, &rec->lid_hdr,
+			    &loghandle->u.phd.phd_cookie, LLOG_NEXT_IDX, th);
+	if (rc < 0)
+		GOTO(out_destroy, rc);
+
+	CDEBUG(D_OTHER, "new plain log "DFID".%u of catalog "DFID"\n",
+	       PFID(&loghandle->lgh_id.lgl_oi.oi_fid), rec->lid_hdr.lrh_index,
+	       PFID(&cathandle->lgh_id.lgl_oi.oi_fid));
+
+	loghandle->lgh_hdr->llh_cat_idx = rec->lid_hdr.lrh_index;
+
+	/* limit the max size of a plain llog so that space can be
+	 * released sooner, especially on small filesystems */
+	/* 2MB for the cases when free space hasn't been learned yet */
+	loghandle->lgh_max_size = 2 << 20;
+	dt = lu2dt_dev(cathandle->lgh_obj->do_lu.lo_dev);
+	rc = dt_statfs(env, dt, &lgi->lgi_statfs);
+	if (rc == 0 && lgi->lgi_statfs.os_bfree > 0) {
+		__u64 freespace = (lgi->lgi_statfs.os_bfree *
+				   lgi->lgi_statfs.os_bsize) >> 6;
+		if (freespace < loghandle->lgh_max_size)
+			loghandle->lgh_max_size = freespace;
+		/* shouldn't be > 128MB in any case?
+		 * it's 256K records of 512 bytes each */
+		if (freespace > (128 << 20))
+			loghandle->lgh_max_size = 128 << 20;
+	}
+	if (unlikely(OBD_FAIL_PRECHECK(OBD_FAIL_PLAIN_RECORDS) ||
+		     OBD_FAIL_PRECHECK(OBD_FAIL_CATALOG_FULL_CHECK))) {
+		/* limit the number of plain records for test */
+		loghandle->lgh_max_size = loghandle->lgh_hdr_size +
+					  cfs_fail_val * 64;
+	}
+
+	rc = 0;
+
+out:
+	if (handle != NULL) {
+		handle->th_result = rc >= 0 ? 0 : rc;
+		dt_trans_stop(env, dt, handle);
+	}
+	RETURN(rc);
+
+out_destroy:
+	/* to signal llog_cat_close() that it shouldn't try to destroy the
+	 * llog, we want to destroy it in this transaction, otherwise the
+	 * object becomes an orphan */
+	loghandle->lgh_hdr->llh_flags &= ~LLOG_F_ZAP_WHEN_EMPTY;
+	/* this is to mimic a full log, so another llog_cat_current_log()
+	 * can skip it and ask for another one */
+	loghandle->lgh_last_idx = LLOG_HDR_BITMAP_SIZE(loghandle->lgh_hdr) + 1;
+	llog_trans_destroy(env, loghandle, th);
+	if (handle != NULL)
+		dt_trans_stop(env, dt, handle);
+	RETURN(rc);
+}
+
+static int llog_cat_refresh(const struct lu_env *env,
+			    struct llog_handle *cathandle)
+{
+	struct llog_handle *loghandle;
+	int rc;
+
+	down_write(&cathandle->lgh_lock);
+	list_for_each_entry(loghandle, &cathandle->u.chd.chd_head,
+			    u.phd.phd_entry) {
+		if (!llog_exist(loghandle))
+			continue;
+
+		down_write(&loghandle->lgh_lock);
+		rc = llog_read_header(env, loghandle, NULL);
+		up_write(&loghandle->lgh_lock);
+		if (rc)
+			goto unlock;
+	}
+
+	rc = llog_read_header(env, cathandle, NULL);
+unlock:
+	up_write(&cathandle->lgh_lock);
+
+	return rc;
+}
+
+/*
+ * prepare the current/next log for the catalog.
+ *
+ * if \a *ploghandle is NULL, open it and declare create; NB, if \a
+ * *ploghandle is remote, create it synchronously here, see comments
+ * below.
+ *
+ * \a cathandle->lgh_lock is down_read-ed; it gets down_write-ed if \a
+ * *ploghandle has to be opened.
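+ *
+ * Lock ordering (lockdep classes): \a cathandle->lgh_lock (LLOGH_CAT) is
+ * always taken before a plain log's lgh_lock (LLOGH_LOG).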
+ */ +static int llog_cat_prep_log(const struct lu_env *env, + struct llog_handle *cathandle, + struct llog_handle **ploghandle, + struct thandle *th) +{ + int rc; + int sem_upgraded; + +start: + rc = 0; + sem_upgraded = 0; + if (IS_ERR_OR_NULL(*ploghandle)) { + up_read(&cathandle->lgh_lock); + down_write(&cathandle->lgh_lock); + sem_upgraded = 1; + if (IS_ERR_OR_NULL(*ploghandle)) { + struct llog_handle *loghandle; + + rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, + NULL, NULL, LLOG_OPEN_NEW); + if (!rc) { + *ploghandle = loghandle; + list_add_tail(&loghandle->u.phd.phd_entry, + &cathandle->u.chd.chd_head); + } + } + if (rc) + GOTO(out, rc); + } + + rc = llog_exist(*ploghandle); + if (rc < 0) + GOTO(out, rc); + if (rc) + GOTO(out, rc = 0); + + if (dt_object_remote(cathandle->lgh_obj)) { + down_write_nested(&(*ploghandle)->lgh_lock, LLOGH_LOG); + if (!llog_exist(*ploghandle)) { + /* For remote operation, if we put the llog object + * creation in the current transaction, then the + * llog object will not be created on the remote + * target until the transaction stop, if other + * operations start before the transaction stop, + * and use the same llog object, will be dependent + * on the success of this transaction. So let's + * create the llog object synchronously here to + * remove the dependency. */ + rc = llog_cat_new_log(env, cathandle, *ploghandle, + NULL); + if (rc == -ESTALE) { + up_write(&(*ploghandle)->lgh_lock); + if (sem_upgraded) + up_write(&cathandle->lgh_lock); + else + up_read(&cathandle->lgh_lock); + + rc = llog_cat_refresh(env, cathandle); + down_read_nested(&cathandle->lgh_lock, + LLOGH_CAT); + if (rc) + return rc; + /* *ploghandle might become NULL, restart */ + goto start; + } + } + up_write(&(*ploghandle)->lgh_lock); + } else { + struct llog_thread_info *lgi = llog_info(env); + struct llog_logid_rec *lirec = &lgi->lgi_logid; + + rc = llog_declare_create(env, *ploghandle, th); + if (rc) + GOTO(out, rc); + + lirec->lid_hdr.lrh_len = sizeof(*lirec); + rc = llog_declare_write_rec(env, cathandle, &lirec->lid_hdr, -1, + th); + } + +out: + if (sem_upgraded) { + up_write(&cathandle->lgh_lock); + down_read_nested(&cathandle->lgh_lock, LLOGH_CAT); + if (rc == 0) + goto start; + } + return rc; +} + +/* Open an existent log handle and add it to the open list. + * This log handle will be closed when all of the records in it are removed. + * + * Assumes caller has already pushed us into the kernel context and is locking. + * We return a lock on the handle to ensure nobody yanks it from us. 
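+ *
+ * A typical caller pairs this with llog_handle_put(); an illustrative
+ * sketch:
+ *
+ *	rc = llog_cat_id2handle(env, cathandle, &loghandle, logid);
+ *	if (rc == 0) {
+ *		... use loghandle ...
+ *		llog_handle_put(env, loghandle);
+ *	}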
+ * + * This takes extra reference on llog_handle via llog_handle_get() and require + * this reference to be put by caller using llog_handle_put() + */ +int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_handle **res, struct llog_logid *logid) +{ + struct llog_handle *loghandle; + enum llog_flag fmt; + int rc = 0; + + ENTRY; + + if (cathandle == NULL) + RETURN(-EBADF); + + fmt = cathandle->lgh_hdr->llh_flags & LLOG_F_EXT_MASK; + down_write(&cathandle->lgh_lock); + list_for_each_entry(loghandle, &cathandle->u.chd.chd_head, + u.phd.phd_entry) { + struct llog_logid *cgl = &loghandle->lgh_id; + + if (ostid_id(&cgl->lgl_oi) == ostid_id(&logid->lgl_oi) && + ostid_seq(&cgl->lgl_oi) == ostid_seq(&logid->lgl_oi)) { + if (cgl->lgl_ogen != logid->lgl_ogen) { + CWARN("%s: log "DFID" generation %x != %x\n", + loghandle2name(loghandle), + PFID(&logid->lgl_oi.oi_fid), + cgl->lgl_ogen, logid->lgl_ogen); + continue; + } + *res = llog_handle_get(loghandle); + if (!*res) { + CERROR("%s: log "DFID" refcount is zero!\n", + loghandle2name(loghandle), + PFID(&logid->lgl_oi.oi_fid)); + continue; + } + loghandle->u.phd.phd_cat_handle = cathandle; + up_write(&cathandle->lgh_lock); + RETURN(rc); + } + } + up_write(&cathandle->lgh_lock); + + rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, logid, NULL, + LLOG_OPEN_EXISTS); + if (rc < 0) { + CERROR("%s: error opening log id "DFID":%x: rc = %d\n", + loghandle2name(cathandle), PFID(&logid->lgl_oi.oi_fid), + logid->lgl_ogen, rc); + RETURN(rc); + } + + rc = llog_init_handle(env, loghandle, LLOG_F_IS_PLAIN | + LLOG_F_ZAP_WHEN_EMPTY | fmt, NULL); + if (rc < 0) { + llog_close(env, loghandle); + *res = NULL; + RETURN(rc); + } + + *res = llog_handle_get(loghandle); + LASSERT(*res); + down_write(&cathandle->lgh_lock); + list_add_tail(&loghandle->u.phd.phd_entry, &cathandle->u.chd.chd_head); + up_write(&cathandle->lgh_lock); + + loghandle->u.phd.phd_cat_handle = cathandle; + loghandle->u.phd.phd_cookie.lgc_lgl = cathandle->lgh_id; + loghandle->u.phd.phd_cookie.lgc_index = + loghandle->lgh_hdr->llh_cat_idx; + RETURN(0); +} + +int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle) +{ + struct llog_handle *loghandle, *n; + int rc; + + ENTRY; + + list_for_each_entry_safe(loghandle, n, &cathandle->u.chd.chd_head, + u.phd.phd_entry) { + struct llog_log_hdr *llh = loghandle->lgh_hdr; + int index; + + /* unlink open-not-created llogs */ + list_del_init(&loghandle->u.phd.phd_entry); + llh = loghandle->lgh_hdr; + if (loghandle->lgh_obj != NULL && llh != NULL && + (llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) && + (llh->llh_count == 1)) { + rc = llog_destroy(env, loghandle); + if (rc) + CERROR("%s: failure destroying log during " + "cleanup: rc = %d\n", + loghandle2name(loghandle), rc); + + index = loghandle->u.phd.phd_cookie.lgc_index; + llog_cat_cleanup(env, cathandle, NULL, index); + } + llog_close(env, loghandle); + } + /* if handle was stored in ctxt, remove it too */ + if (cathandle->lgh_ctxt->loc_handle == cathandle) + cathandle->lgh_ctxt->loc_handle = NULL; + rc = llog_close(env, cathandle); + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_close); + +/** Return the currently active log handle. If the current log handle doesn't + * have enough space left for the current record, start a new one. + * + * If reclen is 0, we only want to know what the currently active log is, + * otherwise we get a lock on this log so nobody can steal our space. + * + * Assumes caller has already pushed us into the kernel context and is locking. 
+ * + * NOTE: loghandle is write-locked upon successful return + */ +static struct llog_handle *llog_cat_current_log(struct llog_handle *cathandle, + struct thandle *th) +{ + struct llog_handle *loghandle = NULL; + ENTRY; + + + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_LLOG_CREATE_FAILED2)) { + down_write_nested(&cathandle->lgh_lock, LLOGH_CAT); + GOTO(next, loghandle); + } + + down_read_nested(&cathandle->lgh_lock, LLOGH_CAT); + loghandle = cathandle->u.chd.chd_current_log; + if (loghandle) { + struct llog_log_hdr *llh; + + down_write_nested(&loghandle->lgh_lock, LLOGH_LOG); + llh = loghandle->lgh_hdr; + if (llh == NULL || !llog_is_full(loghandle)) { + up_read(&cathandle->lgh_lock); + RETURN(loghandle); + } else { + up_write(&loghandle->lgh_lock); + } + } + up_read(&cathandle->lgh_lock); + + /* time to use next log */ + + /* first, we have to make sure the state hasn't changed */ + down_write_nested(&cathandle->lgh_lock, LLOGH_CAT); + loghandle = cathandle->u.chd.chd_current_log; + if (loghandle) { + struct llog_log_hdr *llh; + + down_write_nested(&loghandle->lgh_lock, LLOGH_LOG); + llh = loghandle->lgh_hdr; + if (llh == NULL || !llog_is_full(loghandle)) + GOTO(out_unlock, loghandle); + else + up_write(&loghandle->lgh_lock); + } + +next: + /* Sigh, the chd_next_log and chd_current_log is initialized + * in declare phase, and we do not serialize the catlog + * accessing, so it might be possible the llog creation + * thread (see llog_cat_declare_add_rec()) did not create + * llog successfully, then the following thread might + * meet this situation. */ + if (IS_ERR_OR_NULL(cathandle->u.chd.chd_next_log)) { + CERROR("%s: next log does not exist!\n", + loghandle2name(cathandle)); + loghandle = ERR_PTR(-EIO); + if (cathandle->u.chd.chd_next_log == NULL) { + /* Store the error in chd_next_log, so + * the following process can get correct + * failure value */ + cathandle->u.chd.chd_next_log = loghandle; + } + GOTO(out_unlock, loghandle); + } + + CDEBUG(D_INODE, "use next log\n"); + + loghandle = cathandle->u.chd.chd_next_log; + cathandle->u.chd.chd_current_log = loghandle; + cathandle->u.chd.chd_next_log = NULL; + down_write_nested(&loghandle->lgh_lock, LLOGH_LOG); + +out_unlock: + up_write(&cathandle->lgh_lock); + LASSERT(loghandle); + RETURN(loghandle); +} + +/* Add a single record to the recovery log(s) using a catalog + * Returns as llog_write_record + * + * Assumes caller has already pushed us into the kernel context. + */ +int llog_cat_add_rec(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie, + struct thandle *th) +{ + struct llog_handle *loghandle; + int rc, retried = 0; + ENTRY; + + LASSERT(rec->lrh_len <= cathandle->lgh_ctxt->loc_chunk_size); + +retry: + loghandle = llog_cat_current_log(cathandle, th); + if (IS_ERR(loghandle)) + RETURN(PTR_ERR(loghandle)); + + /* loghandle is already locked by llog_cat_current_log() for us */ + if (!llog_exist(loghandle)) { + rc = llog_cat_new_log(env, cathandle, loghandle, th); + if (rc < 0) { + up_write(&loghandle->lgh_lock); + /* nobody should be trying to use this llog */ + down_write(&cathandle->lgh_lock); + if (cathandle->u.chd.chd_current_log == loghandle) + cathandle->u.chd.chd_current_log = NULL; + up_write(&cathandle->lgh_lock); + RETURN(rc); + } + } + /* now let's try to add the record */ + rc = llog_write_rec(env, loghandle, rec, reccookie, LLOG_NEXT_IDX, th); + if (rc < 0) { + CDEBUG_LIMIT(rc == -ENOSPC ? 
D_HA : D_ERROR,
+			     "llog_write_rec %d: lh=%p\n", rc, loghandle);
+		/* -ENOSPC is returned when no empty records are left
+		 * and when the storage is out of space. There is no
+		 * point in trying again in the second case. Many
+		 * callers (like the llog tests) expect -ENOSPC, so we
+		 * preserve this error code, but look for the actual
+		 * cause here */
+		if (rc == -ENOSPC && llog_is_full(loghandle))
+			rc = -ENOBUFS;
+	}
+	up_write(&loghandle->lgh_lock);
+
+	if (rc == -ENOBUFS) {
+		if (retried++ == 0)
+			GOTO(retry, rc);
+		CERROR("%s: error on 2nd llog: rc = %d\n",
+		       loghandle2name(cathandle), rc);
+	}
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_add_rec);
+
+int llog_cat_declare_add_rec(const struct lu_env *env,
+			     struct llog_handle *cathandle,
+			     struct llog_rec_hdr *rec, struct thandle *th)
+{
+	int rc;
+
+	ENTRY;
+
+start:
+	down_read_nested(&cathandle->lgh_lock, LLOGH_CAT);
+	rc = llog_cat_prep_log(env, cathandle,
+			       &cathandle->u.chd.chd_current_log, th);
+	if (rc)
+		GOTO(unlock, rc);
+
+	rc = llog_cat_prep_log(env, cathandle, &cathandle->u.chd.chd_next_log,
+			       th);
+	if (rc)
+		GOTO(unlock, rc);
+
+	rc = llog_declare_write_rec(env, cathandle->u.chd.chd_current_log,
+				    rec, -1, th);
+	if (rc == -ESTALE && dt_object_remote(cathandle->lgh_obj)) {
+		up_read(&cathandle->lgh_lock);
+		rc = llog_cat_refresh(env, cathandle);
+		if (rc)
+			RETURN(rc);
+		goto start;
+	}
+
+#if 0
+	/*
+	 * XXX: we hope for declarations made for existing llog; this might be
+	 * not correct with some backends where declarations are expected
+	 * against a specific object, like ZFS with full debugging enabled.
+	 */
+	rc = llog_declare_write_rec(env, cathandle->u.chd.chd_next_log, rec, -1,
+				    th);
+#endif
+unlock:
+	up_read(&cathandle->lgh_lock);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_declare_add_rec);
+
+int llog_cat_add(const struct lu_env *env, struct llog_handle *cathandle,
+		 struct llog_rec_hdr *rec, struct llog_cookie *reccookie)
+{
+	struct llog_ctxt *ctxt;
+	struct dt_device *dt;
+	struct thandle *th = NULL;
+	int rc;
+
+	ctxt = cathandle->lgh_ctxt;
+	LASSERT(ctxt);
+	LASSERT(ctxt->loc_exp);
+
+	LASSERT(cathandle->lgh_obj != NULL);
+	dt = lu2dt_dev(cathandle->lgh_obj->do_lu.lo_dev);
+
+	th = dt_trans_create(env, dt);
+	if (IS_ERR(th))
+		RETURN(PTR_ERR(th));
+
+	rc = llog_cat_declare_add_rec(env, cathandle, rec, th);
+	if (rc)
+		GOTO(out_trans, rc);
+
+	rc = dt_trans_start_local(env, dt, th);
+	if (rc)
+		GOTO(out_trans, rc);
+	rc = llog_cat_add_rec(env, cathandle, rec, reccookie, th);
+out_trans:
+	dt_trans_stop(env, dt, th);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_add);
+
+int llog_cat_cancel_arr_rec(const struct lu_env *env,
+			    struct llog_handle *cathandle,
+			    struct llog_logid *lgl, int count, int *index)
+{
+	struct llog_handle *loghandle;
+	int rc;
+
+	ENTRY;
+	rc = llog_cat_id2handle(env, cathandle, &loghandle, lgl);
+	if (rc) {
+		CDEBUG(D_HA, "%s: cannot find llog for handle "DFID":%x"
+		       ": rc = %d\n", loghandle2name(cathandle),
+		       PFID(&lgl->lgl_oi.oi_fid), lgl->lgl_ogen, rc);
+		RETURN(rc);
+	}
+
+	if ((cathandle->lgh_ctxt->loc_flags &
+	     LLOG_CTXT_FLAG_NORMAL_FID) && !llog_exist(loghandle)) {
+		/* For the update log, some of the loghandles of cathandle
+		 * might not exist because remote llog creation might have
+		 * failed, so let's skip the record cancellation for these
+		 * non-existent llogs.
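+		 * (The llog object was never created on disk, so there
+		 * are no records to cancel.)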
+ */ + rc = -ENOENT; + CDEBUG(D_HA, "%s: llog "DFID":%x does not exist" + ": rc = %d\n", loghandle2name(cathandle), + PFID(&lgl->lgl_oi.oi_fid), lgl->lgl_ogen, rc); + + llog_handle_put(env, loghandle); + RETURN(rc); + } + + rc = llog_cancel_arr_rec(env, loghandle, count, index); + if (rc == LLOG_DEL_PLAIN) { /* log has been destroyed */ + int cat_index; + + cat_index = loghandle->u.phd.phd_cookie.lgc_index; + rc = llog_cat_cleanup(env, cathandle, loghandle, cat_index); + if (rc) + CERROR("%s: fail to cancel catalog record: rc = %d\n", + loghandle2name(cathandle), rc); + rc = 0; + + } + llog_handle_put(env, loghandle); + + if (rc) + CERROR("%s: fail to cancel %d llog-records: rc = %d\n", + loghandle2name(cathandle), count, rc); + + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_cancel_arr_rec); + +/* For each cookie in the cookie array, we clear the log in-use bit and either: + * - the log is empty, so mark it free in the catalog header and delete it + * - the log is not empty, just write out the log header + * + * The cookies may be in different log files, so we need to get new logs + * each time. + * + * Assumes caller has already pushed us into the kernel context. + */ +int llog_cat_cancel_records(const struct lu_env *env, + struct llog_handle *cathandle, int count, + struct llog_cookie *cookies) +{ + int i, rc = 0, failed = 0; + + ENTRY; + + for (i = 0; i < count; i++, cookies++) { + int lrc; + + lrc = llog_cat_cancel_arr_rec(env, cathandle, &cookies->lgc_lgl, + 1, &cookies->lgc_index); + if (lrc) { + failed++; + if (!rc) + rc = lrc; + } + } + if (failed) + CERROR("%s: fail to cancel %d of %d llog-records: rc = %d\n", + loghandle2name(cathandle), failed, count, rc); + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_cancel_records); + +static int llog_cat_process_common(const struct lu_env *env, + struct llog_handle *cat_llh, + struct llog_rec_hdr *rec, + struct llog_handle **llhp) +{ + struct llog_logid_rec *lir = container_of(rec, typeof(*lir), lid_hdr); + struct llog_log_hdr *hdr; + int rc; + + ENTRY; + if (rec->lrh_type != le32_to_cpu(LLOG_LOGID_MAGIC)) { + rc = -EINVAL; + CWARN("%s: invalid record in catalog "DFID":%x: rc = %d\n", + loghandle2name(cat_llh), + PFID(&cat_llh->lgh_id.lgl_oi.oi_fid), + cat_llh->lgh_id.lgl_ogen, rc); + RETURN(rc); + } + CDEBUG(D_HA, "processing log "DFID":%x at index %u of catalog "DFID"\n", + PFID(&lir->lid_id.lgl_oi.oi_fid), lir->lid_id.lgl_ogen, + le32_to_cpu(rec->lrh_index), + PFID(&cat_llh->lgh_id.lgl_oi.oi_fid)); + + rc = llog_cat_id2handle(env, cat_llh, llhp, &lir->lid_id); + if (rc) { + /* After a server crash, a stub of index record in catlog could + * be kept, because plain log destroy + catlog index record + * deletion are not atomic. So we end up with an index but no + * actual record. Destroy the index and move on. 
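+		 * (Returning LLOG_DEL_RECORD makes the caller clear the
+		 * dangling catalog entry via llog_cat_cleanup().)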
+		 */
+		if (rc == -ENOENT || rc == -ESTALE)
+			rc = LLOG_DEL_RECORD;
+		else if (rc)
+			CWARN("%s: can't find llog handle "DFID":%x: rc = %d\n",
+			      loghandle2name(cat_llh),
+			      PFID(&lir->lid_id.lgl_oi.oi_fid),
+			      lir->lid_id.lgl_ogen, rc);
+
+		RETURN(rc);
+	}
+
+	/* clean old empty llogs, do not consider the current llog in use */
+	/* ignore remote (lgh_obj == NULL) llogs */
+	hdr = (*llhp)->lgh_hdr;
+	if ((hdr->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) &&
+	    hdr->llh_count == 1 && cat_llh->lgh_obj != NULL &&
+	    *llhp != cat_llh->u.chd.chd_current_log &&
+	    *llhp != cat_llh->u.chd.chd_next_log) {
+		rc = llog_destroy(env, *llhp);
+		if (rc)
+			CWARN("%s: can't destroy empty log "DFID": rc = %d\n",
+			      loghandle2name((*llhp)),
+			      PFID(&lir->lid_id.lgl_oi.oi_fid), rc);
+		rc = LLOG_DEL_PLAIN;
+	}
+
+	RETURN(rc);
+}
+
+static int llog_cat_process_cb(const struct lu_env *env,
+			       struct llog_handle *cat_llh,
+			       struct llog_rec_hdr *rec, void *data)
+{
+	struct llog_process_data *d = data;
+	struct llog_handle *llh = NULL;
+	int rc;
+
+	ENTRY;
+	rc = llog_cat_process_common(env, cat_llh, rec, &llh);
+	if (rc)
+		GOTO(out, rc);
+
+	if (rec->lrh_index < d->lpd_startcat) {
+		/* Skip processing of the logs until startcat */
+		rc = 0;
+	} else if (d->lpd_startidx > 0) {
+		struct llog_process_cat_data cd;
+
+		cd.lpcd_read_mode = LLOG_READ_MODE_NORMAL;
+		cd.lpcd_first_idx = d->lpd_startidx;
+		cd.lpcd_last_idx = 0;
+		rc = llog_process_or_fork(env, llh, d->lpd_cb, d->lpd_data,
+					  &cd, false);
+		/* Continue processing the next log from idx 0 */
+		d->lpd_startidx = 0;
+	} else {
+		rc = llog_process_or_fork(env, llh, d->lpd_cb, d->lpd_data,
+					  NULL, false);
+	}
+	if (rc == -ENOENT && (cat_llh->lgh_hdr->llh_flags & LLOG_F_RM_ON_ERR)) {
+		/*
+		 * the plain llog is reported corrupted, so better to just
+		 * remove it if the caller is fine with that.
+		 */
+		CERROR("%s: remove corrupted/missing llog "DFID"\n",
+		       loghandle2name(cat_llh),
+		       PFID(&llh->lgh_id.lgl_oi.oi_fid));
+		rc = LLOG_DEL_PLAIN;
+	}
+
+out:
+	/* The empty plain log was destroyed while processing */
+	if (rc == LLOG_DEL_PLAIN || rc == LLOG_DEL_RECORD)
+		/* clear the wrong catalog entry */
+		rc = llog_cat_cleanup(env, cat_llh, llh, rec->lrh_index);
+
+	if (llh)
+		llog_handle_put(env, llh);
+
+	RETURN(rc);
+}
+
+int llog_cat_process_or_fork(const struct lu_env *env,
+			     struct llog_handle *cat_llh, llog_cb_t cat_cb,
+			     llog_cb_t cb, void *data, int startcat,
+			     int startidx, bool fork)
+{
+	struct llog_process_data d;
+	struct llog_log_hdr *llh = cat_llh->lgh_hdr;
+	int rc;
+
+	ENTRY;
+
+	LASSERT(llh->llh_flags & LLOG_F_IS_CAT);
+	d.lpd_data = data;
+	d.lpd_cb = cb;
+	d.lpd_startcat = (startcat == LLOG_CAT_FIRST ? 0 : startcat);
+	d.lpd_startidx = startidx;
+
+	if (llh->llh_cat_idx >= cat_llh->lgh_last_idx &&
+	    llh->llh_count > 1) {
+		struct llog_process_cat_data cd = {
+			.lpcd_read_mode = LLOG_READ_MODE_NORMAL
+		};
+
+		CWARN("%s: catlog "DFID" crosses index zero\n",
+		      loghandle2name(cat_llh),
+		      PFID(&cat_llh->lgh_id.lgl_oi.oi_fid));
+		/* startcat = 0 is the default value for general processing */
+		if ((startcat != LLOG_CAT_FIRST &&
+		     startcat >= llh->llh_cat_idx) || !startcat) {
+			/* processing the catalog part at the end */
+			cd.lpcd_first_idx = (startcat ? startcat :
+					     llh->llh_cat_idx);
+			if (OBD_FAIL_PRECHECK(OBD_FAIL_CAT_RECORDS))
+				cd.lpcd_last_idx = cfs_fail_val;
+			else
+				cd.lpcd_last_idx = 0;
+			rc = llog_process_or_fork(env, cat_llh, cat_cb,
+						  &d, &cd, fork);
+			/* Reset the startcat because it has already reached
+			 * the catalog bottom.
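+			 * The second pass below then restarts from the
+			 * beginning of the catalog.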
+			 */
+			startcat = 0;
+			d.lpd_startcat = 0;
+			if (rc != 0)
+				RETURN(rc);
+		}
+		/* processing the catalog part at the beginning */
+		cd.lpcd_first_idx = (startcat == LLOG_CAT_FIRST) ? 0 : startcat;
+		/* Note, the processing will stop at the lgh_last_idx value,
+		 * and it could be increased during processing. So records
+		 * between the current lgh_last_idx and the future lgh_last_idx
+		 * would be left unprocessed.
+		 */
+		cd.lpcd_last_idx = cat_llh->lgh_last_idx;
+		rc = llog_process_or_fork(env, cat_llh, cat_cb,
+					  &d, &cd, fork);
+	} else {
+		rc = llog_process_or_fork(env, cat_llh, cat_cb,
+					  &d, NULL, fork);
+	}
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_process_or_fork);
+
+int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh,
+		     llog_cb_t cb, void *data, int startcat, int startidx)
+{
+	return llog_cat_process_or_fork(env, cat_llh, llog_cat_process_cb,
+					cb, data, startcat, startidx, false);
+}
+EXPORT_SYMBOL(llog_cat_process);
+
+static int llog_cat_size_cb(const struct lu_env *env,
+			    struct llog_handle *cat_llh,
+			    struct llog_rec_hdr *rec, void *data)
+{
+	struct llog_process_data *d = data;
+	struct llog_handle *llh = NULL;
+	__u64 *cum_size = d->lpd_data;
+	__u64 size;
+	int rc;
+
+	ENTRY;
+	rc = llog_cat_process_common(env, cat_llh, rec, &llh);
+
+	if (rc == LLOG_DEL_PLAIN) {
+		/* empty log was deleted, don't count it */
+		rc = llog_cat_cleanup(env, cat_llh, llh,
+				      llh->u.phd.phd_cookie.lgc_index);
+	} else if (rc == LLOG_DEL_RECORD) {
+		/* clear the wrong catalog entry */
+		rc = llog_cat_cleanup(env, cat_llh, NULL, rec->lrh_index);
+	} else {
+		size = llog_size(env, llh);
+		*cum_size += size;
+
+		CDEBUG(D_INFO, "Add llog entry "DFID" size=%llu, tot=%llu\n",
+		       PFID(&llh->lgh_id.lgl_oi.oi_fid), size, *cum_size);
+	}
+
+	if (llh != NULL)
+		llog_handle_put(env, llh);
+
+	RETURN(0);
+}
+
+__u64 llog_cat_size(const struct lu_env *env, struct llog_handle *cat_llh)
+{
+	__u64 size = llog_size(env, cat_llh);
+
+	llog_cat_process_or_fork(env, cat_llh, llog_cat_size_cb,
+				 NULL, &size, 0, 0, false);
+
+	return size;
+}
+EXPORT_SYMBOL(llog_cat_size);
+
+/* currently returns the number of "free" entries in the catalog,
+ * i.e. the entries available for new plain llog file creation,
+ * even if the catalog has wrapped
+ */
+__u32 llog_cat_free_space(struct llog_handle *cat_llh)
+{
+	/* simulate an almost full catalog */
+	if (OBD_FAIL_CHECK(OBD_FAIL_CAT_FREE_RECORDS))
+		return cfs_fail_val;
+
+	if (cat_llh->lgh_hdr->llh_count == 1)
+		return LLOG_HDR_BITMAP_SIZE(cat_llh->lgh_hdr) - 1;
+
+	if (cat_llh->lgh_last_idx > cat_llh->lgh_hdr->llh_cat_idx)
+		return LLOG_HDR_BITMAP_SIZE(cat_llh->lgh_hdr) - 1 +
+		       cat_llh->lgh_hdr->llh_cat_idx - cat_llh->lgh_last_idx;
+
+	/* catalog is presently wrapped */
+	return cat_llh->lgh_hdr->llh_cat_idx - cat_llh->lgh_last_idx;
+}
+EXPORT_SYMBOL(llog_cat_free_space);
+
+static int llog_cat_reverse_process_cb(const struct lu_env *env,
+				       struct llog_handle *cat_llh,
+				       struct llog_rec_hdr *rec, void *data)
+{
+	struct llog_process_data *d = data;
+	struct llog_handle *llh;
+	int rc;
+
+	ENTRY;
+	rc = llog_cat_process_common(env, cat_llh, rec, &llh);
+
+	/* The empty plain log was destroyed while processing */
+	if (rc == LLOG_DEL_PLAIN) {
+		rc = llog_cat_cleanup(env, cat_llh, llh,
+				      llh->u.phd.phd_cookie.lgc_index);
+	} else if (rc == LLOG_DEL_RECORD) {
+		/* clear the wrong catalog entry */
+		rc = llog_cat_cleanup(env, cat_llh, NULL, rec->lrh_index);
+	}
+	if (rc)
+		RETURN(rc);
+
+	rc = llog_reverse_process(env, llh, d->lpd_cb, d->lpd_data, NULL);
+
+	/* The empty plain log was
destroyed while processing */ + if (rc == LLOG_DEL_PLAIN) + rc = llog_cat_cleanup(env, cat_llh, llh, + llh->u.phd.phd_cookie.lgc_index); + + llog_handle_put(env, llh); + RETURN(rc); +} + +int llog_cat_reverse_process(const struct lu_env *env, + struct llog_handle *cat_llh, + llog_cb_t cb, void *data) +{ + struct llog_process_data d; + struct llog_process_cat_data cd; + struct llog_log_hdr *llh = cat_llh->lgh_hdr; + int rc; + ENTRY; + + LASSERT(llh->llh_flags & LLOG_F_IS_CAT); + cd.lpcd_read_mode = LLOG_READ_MODE_NORMAL; + d.lpd_data = data; + d.lpd_cb = cb; + + if (llh->llh_cat_idx >= cat_llh->lgh_last_idx && + llh->llh_count > 1) { + CWARN("%s: catalog "DFID" crosses index zero\n", + loghandle2name(cat_llh), + PFID(&cat_llh->lgh_id.lgl_oi.oi_fid)); + + cd.lpcd_first_idx = 0; + cd.lpcd_last_idx = cat_llh->lgh_last_idx; + rc = llog_reverse_process(env, cat_llh, + llog_cat_reverse_process_cb, + &d, &cd); + if (rc != 0) + RETURN(rc); + + cd.lpcd_first_idx = le32_to_cpu(llh->llh_cat_idx); + cd.lpcd_last_idx = 0; + rc = llog_reverse_process(env, cat_llh, + llog_cat_reverse_process_cb, + &d, &cd); + } else { + rc = llog_reverse_process(env, cat_llh, + llog_cat_reverse_process_cb, + &d, NULL); + } + + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_reverse_process); + +static int llog_cat_set_first_idx(struct llog_handle *cathandle, int idx) +{ + struct llog_log_hdr *llh = cathandle->lgh_hdr; + int bitmap_size; + + ENTRY; + + bitmap_size = LLOG_HDR_BITMAP_SIZE(llh); + /* + * The llh_cat_idx equals to the first used index minus 1 + * so if we canceled the first index then llh_cat_idx + * must be renewed. + */ + if (llh->llh_cat_idx == (idx - 1)) { + llh->llh_cat_idx = idx; + + while (idx != cathandle->lgh_last_idx) { + idx = (idx + 1) % bitmap_size; + if (!test_bit_le(idx, LLOG_HDR_BITMAP(llh))) { + /* update llh_cat_idx for each unset bit, + * expecting the next one is set */ + llh->llh_cat_idx = idx; + } else if (idx == 0) { + /* skip header bit */ + llh->llh_cat_idx = 0; + continue; + } else { + /* the first index is found */ + break; + } + } + + CDEBUG(D_HA, "catlog "DFID" first idx %u, last_idx %u\n", + PFID(&cathandle->lgh_id.lgl_oi.oi_fid), + llh->llh_cat_idx, cathandle->lgh_last_idx); + } + + RETURN(0); +} + +/* Cleanup deleted plain llog traces from catalog */ +int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_handle *loghandle, int index) +{ + int rc; + struct lu_fid fid = {.f_seq = 0, .f_oid = 0, .f_ver = 0}; + + LASSERT(index); + if (loghandle != NULL) { + /* remove destroyed llog from catalog list and + * chd_current_log variable */ + fid = loghandle->lgh_id.lgl_oi.oi_fid; + down_write(&cathandle->lgh_lock); + if (cathandle->u.chd.chd_current_log == loghandle) + cathandle->u.chd.chd_current_log = NULL; + list_del_init(&loghandle->u.phd.phd_entry); + up_write(&cathandle->lgh_lock); + LASSERT(index == loghandle->u.phd.phd_cookie.lgc_index || + loghandle->u.phd.phd_cookie.lgc_index == 0); + /* llog was opened and keep in a list, close it now */ + llog_close(env, loghandle); + } + + /* do not attempt to cleanup on-disk llog if on client side */ + if (cathandle->lgh_obj == NULL) + return 0; + + /* remove plain llog entry from catalog by index */ + llog_cat_set_first_idx(cathandle, index); + rc = llog_cancel_rec(env, cathandle, index); + if (rc == 0) + CDEBUG(D_HA, + "cancel plain log "DFID" at index %u of catalog "DFID"\n", + PFID(&fid), index, + PFID(&cathandle->lgh_id.lgl_oi.oi_fid)); + return rc; +} diff --git 
a/drivers/staging/lustrefsx/lustre/obdclass/llog_internal.h b/drivers/staging/lustrefsx/lustre/obdclass/llog_internal.h new file mode 100644 index 0000000000000..096f595e75102 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_internal.h @@ -0,0 +1,102 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef __LLOG_INTERNAL_H__ +#define __LLOG_INTERNAL_H__ + +#include + +struct llog_process_info { + struct llog_handle *lpi_loghandle; + llog_cb_t lpi_cb; + void *lpi_cbdata; + void *lpi_catdata; + int lpi_rc; + struct completion lpi_completion; + const struct lu_env *lpi_env; + struct task_struct *lpi_reftask; +}; + +struct llog_thread_info { + struct lu_attr lgi_attr; + struct lu_fid lgi_fid; + struct dt_object_format lgi_dof; + struct lu_buf lgi_buf; + loff_t lgi_off; + struct llog_logid_rec lgi_logid; + struct dt_insert_rec lgi_dt_rec; + struct lu_seq_range lgi_range; + struct llog_cookie lgi_cookie; + struct obd_statfs lgi_statfs; + char lgi_name[32]; +}; + +extern struct lu_context_key llog_thread_key; + +static inline struct llog_thread_info *llog_info(const struct lu_env *env) +{ + struct llog_thread_info *lgi; + + lgi = lu_context_key_get(&env->le_ctx, &llog_thread_key); + LASSERT(lgi); + return lgi; +} + +int llog_info_init(void); +void llog_info_fini(void); + +struct llog_handle *llog_handle_get(struct llog_handle *loghandle); +int llog_handle_put(const struct lu_env *env, struct llog_handle *loghandle); +int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_handle **res, struct llog_logid *logid); +void llog_get_marker_cfg_flags(struct llog_rec_hdr *rec, + unsigned int *cfg_flags); +int class_config_dump_handler(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data); +int class_config_yaml_output(struct llog_rec_hdr *rec, char *buf, int size, + unsigned int *cfg_flags, bool raw); +int llog_process_or_fork(const struct lu_env *env, + struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata, bool fork); +int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_handle *loghandle, int index); + +static inline struct llog_rec_hdr *llog_rec_hdr_next(struct llog_rec_hdr *rec) +{ + return (struct llog_rec_hdr *)((char *)rec + rec->lrh_len); +} +int llog_verify_record(const struct llog_handle *llh, struct llog_rec_hdr *rec); +static inline char *loghandle2name(const struct llog_handle *lgh) +{ + return 
lgh->lgh_ctxt->loc_obd->obd_name; +} +#endif diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_ioctl.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_ioctl.c new file mode 100644 index 0000000000000..62a5b88e2e86b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_ioctl.c @@ -0,0 +1,551 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_LOG + +#include +#include +#include +#include "llog_internal.h" + +static int str2logid(struct llog_logid *logid, char *str, int len) +{ + unsigned long long id, seq; + char *start, *end; + u32 ogen; + int rc; + + ENTRY; + start = str; + if (start[0] == '[') { + struct lu_fid *fid = &logid->lgl_oi.oi_fid; + struct lu_fid sfid; + int num; + + fid_zero(fid); + logid->lgl_ogen = 0; + num = sscanf(start + 1, SFID, RFID(fid)); + CDEBUG(D_INFO, DFID":%x\n", PFID(fid), logid->lgl_ogen); + logid_to_fid(logid, &sfid); + RETURN(num == 3 && fid_is_sane(&sfid) ? 0 : -EINVAL); + } + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 1, 53, 0) + /* + * logids used to be input in the form "#id#seq:ogen" before they + * were changed over to accept the FID [seq:oid:ver] format. + * This is accepted for compatibility reasons, though I doubt + * anyone is actually using this for anything. 
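+	 * (All three separators are '#', i.e. the parsed form is
+	 * "#id#seq#ogen", with the generation parsed as hex.)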
+ */ + if (start[0] != '#') + RETURN(-EINVAL); + + start++; + if (start - str >= len - 1) + RETURN(-EINVAL); + end = strchr(start, '#'); + if (end == NULL || end == start) + RETURN(-EINVAL); + + *end = '\0'; + rc = kstrtoull(start, 0, &id); + if (rc) + RETURN(rc); + + start = ++end; + if (start - str >= len - 1) + RETURN(-EINVAL); + + end = strchr(start, '#'); + if (!end || end == start) + RETURN(-EINVAL); + + *end = '\0'; + rc = kstrtoull(start, 0, &seq); + if (rc) + RETURN(rc); + + ostid_set_seq(&logid->lgl_oi, seq); + if (ostid_set_id(&logid->lgl_oi, id)) + RETURN(-EINVAL); + + start = ++end; + if (start - str >= len - 1) + RETURN(-EINVAL); + + rc = kstrtouint(start, 16, &ogen); + if (rc) + RETURN(-EINVAL); + logid->lgl_ogen = ogen; + + RETURN(0); +#else + RETURN(-EINVAL); +#endif +} + +static int llog_check_cb(const struct lu_env *env, struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + struct obd_ioctl_data *ioc_data = data; + static int l, remains; + static long from, to; + static char *out; + int cur_index; + int rc = 0; + + ENTRY; + if (ioc_data && ioc_data->ioc_inllen1 > 0) { + l = 0; + remains = ioc_data->ioc_inllen4 + + round_up(ioc_data->ioc_inllen1, 8) + + round_up(ioc_data->ioc_inllen2, 8) + + round_up(ioc_data->ioc_inllen3, 8); + + rc = kstrtol(ioc_data->ioc_inlbuf2, 0, &from); + if (rc) + RETURN(rc); + + rc = kstrtol(ioc_data->ioc_inlbuf3, 0, &to); + if (rc) + RETURN(rc); + + ioc_data->ioc_inllen1 = 0; + out = ioc_data->ioc_bulk; + } + + cur_index = rec->lrh_index; + if (cur_index < from) + RETURN(0); + if (to > 0 && cur_index > to) + RETURN(-LLOG_EEMPTY); + + if (handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) { + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct llog_handle *loghandle; + + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + l = snprintf(out, remains, + "[index]: %05d [type]: %02x [len]: %04d failed\n", + cur_index, rec->lrh_type, + rec->lrh_len); + } + if (handle->lgh_ctxt == NULL) + RETURN(-EOPNOTSUPP); + rc = llog_cat_id2handle(env, handle, &loghandle, &lir->lid_id); + if (rc) { + CDEBUG(D_IOCTL, "cannot find log "DFID":%x\n", + PFID(&lir->lid_id.lgl_oi.oi_fid), + lir->lid_id.lgl_ogen); + RETURN(rc); + } + rc = llog_process(env, loghandle, llog_check_cb, NULL, NULL); + llog_handle_put(env, loghandle); + } else { + bool ok; + + switch (rec->lrh_type) { + case OST_SZ_REC: + case MDS_UNLINK_REC: + case MDS_UNLINK64_REC: + case MDS_SETATTR64_REC: + case OBD_CFG_REC: + case LLOG_GEN_REC: + case LLOG_HDR_MAGIC: + ok = true; + break; + default: + ok = false; + } + + l = snprintf(out, remains, "[index]: %05d [type]: " + "%02x [len]: %04d %s\n", + cur_index, rec->lrh_type, rec->lrh_len, + ok ? 
"ok" : "failed"); + out += l; + remains -= l; + if (remains <= 0) { + CERROR("%s: no space to print log records\n", + handle->lgh_ctxt->loc_obd->obd_name); + RETURN(-LLOG_EEMPTY); + } + } + RETURN(rc); +} + +struct llog_print_data { + struct obd_ioctl_data *lprd_data; + unsigned int lprd_cfg_flags; + bool lprd_raw; +}; + +#define MARKER_DIFF 10 +static int llog_print_cb(const struct lu_env *env, struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_print_data *lprd = data; + struct obd_ioctl_data *ioc_data = lprd->lprd_data; + static int l, remains; + static long from, to; + static char *out; + int cur_index; + int rc; + + ENTRY; + if (ioc_data && ioc_data->ioc_inllen1 > 0) { + l = 0; + remains = ioc_data->ioc_inllen4 + + round_up(ioc_data->ioc_inllen1, 8) + + round_up(ioc_data->ioc_inllen2, 8) + + round_up(ioc_data->ioc_inllen3, 8); + + rc = kstrtol(ioc_data->ioc_inlbuf2, 0, &from); + if (rc) + RETURN(rc); + + rc = kstrtol(ioc_data->ioc_inlbuf3, 0, &to); + if (rc) + RETURN(rc); + + out = ioc_data->ioc_bulk; + ioc_data->ioc_inllen1 = 0; + } + + cur_index = rec->lrh_index; + if (from > MARKER_DIFF && cur_index >= from - MARKER_DIFF && + cur_index < from) { + /* LU-15706: try to remember the marker cfg_flag that the "from" + * is using, in case that the "from" record doesn't know its + * "SKIP" or not flag. + */ + llog_get_marker_cfg_flags(rec, &lprd->lprd_cfg_flags); + } + if (cur_index < from) + RETURN(0); + if (to > 0 && cur_index > to) + RETURN(-LLOG_EEMPTY); + + if (handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) { + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + RETURN(-EINVAL); + } + + l = snprintf(out, remains, + "[index]: %05d [logid]: "DFID":%x\n", + cur_index, PFID(&lir->lid_id.lgl_oi.oi_fid), + lir->lid_id.lgl_ogen); + } else if (rec->lrh_type == OBD_CFG_REC) { + int rc; + + rc = class_config_yaml_output(rec, out, remains, + &lprd->lprd_cfg_flags, + lprd->lprd_raw); + if (rc < 0) + RETURN(rc); + l = rc; + } else { + l = snprintf(out, remains, + "[index]: %05d [type]: %02x [len]: %04d\n", + cur_index, rec->lrh_type, rec->lrh_len); + } + out += l; + remains -= l; + if (remains <= 0) { + CERROR("not enough space for print log records\n"); + RETURN(-LLOG_EEMPTY); + } + + RETURN(0); +} +static int llog_remove_log(const struct lu_env *env, struct llog_handle *cat, + struct llog_logid *logid) +{ + struct llog_handle *log; + int rc; + + ENTRY; + + rc = llog_cat_id2handle(env, cat, &log, logid); + if (rc) { + CDEBUG(D_IOCTL, "cannot find log "DFID":%x\n", + PFID(&logid->lgl_oi.oi_fid), logid->lgl_ogen); + RETURN(-ENOENT); + } + + rc = llog_destroy(env, log); + if (rc) { + CDEBUG(D_IOCTL, "cannot destroy log "DFID":%x\n", + PFID(&logid->lgl_oi.oi_fid), logid->lgl_ogen); + GOTO(out, rc); + } + llog_cat_cleanup(env, cat, log, log->u.phd.phd_cookie.lgc_index); +out: + llog_handle_put(env, log); + RETURN(rc); + +} + +static int llog_delete_cb(const struct lu_env *env, struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + int rc; + + ENTRY; + if (rec->lrh_type != LLOG_LOGID_MAGIC) + RETURN(-EINVAL); + rc = llog_remove_log(env, handle, &lir->lid_id); + + RETURN(rc); +} + + +int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd, + struct obd_ioctl_data *data) +{ + struct llog_logid logid; + int rc = 0; + struct llog_handle *handle = NULL; + char *logname, start; + + ENTRY; + 
+ logname = data->ioc_inlbuf1; + start = logname[0]; + if (start == '#' || start == '[') { + rc = str2logid(&logid, logname, data->ioc_inllen1); + if (rc) + RETURN(rc); + rc = llog_open(env, ctxt, &handle, &logid, NULL, + LLOG_OPEN_EXISTS); + if (rc) + RETURN(rc); + } else if (start == '$' || isalpha(start) || isdigit(start)) { + if (start == '$') + logname++; + + rc = llog_open(env, ctxt, &handle, NULL, logname, + LLOG_OPEN_EXISTS); + if (rc) + RETURN(rc); + } else { + rc = -EINVAL; + CDEBUG(D_INFO, "%s: invalid log name '%s': rc = %d\n", + ctxt->loc_obd->obd_name, logname, rc); + RETURN(rc); + } + + rc = llog_init_handle(env, handle, 0, NULL); + if (rc) + GOTO(out_close, rc = -ENOENT); + + switch (cmd) { + case OBD_IOC_LLOG_INFO: { + int l; + int remains = data->ioc_inllen2 + + cfs_size_round(data->ioc_inllen1); + char *out = data->ioc_bulk; + + l = snprintf(out, remains, + "logid: "DFID":%x\n" + "flags: %x (%s)\n" + "records_count: %d\n" + "last_index: %d\n", + PFID(&handle->lgh_id.lgl_oi.oi_fid), + handle->lgh_id.lgl_ogen, + handle->lgh_hdr->llh_flags, + handle->lgh_hdr->llh_flags & + LLOG_F_IS_CAT ? "cat" : "plain", + handle->lgh_hdr->llh_count, + handle->lgh_last_idx); + out += l; + remains -= l; + if (remains <= 0) { + CERROR("%s: not enough space for log header info\n", + ctxt->loc_obd->obd_name); + rc = -ENOSPC; + } + break; + } + case OBD_IOC_LLOG_CHECK: + LASSERT(data->ioc_inllen1 > 0); + rc = llog_process(env, handle, llog_check_cb, data, NULL); + if (rc == -LLOG_EEMPTY) + rc = 0; + else if (rc) + GOTO(out_close, rc); + break; + case OBD_IOC_LLOG_PRINT: { + struct llog_print_data lprd = { + .lprd_data = data, + .lprd_raw = data->ioc_u32_1, + }; + + LASSERT(data->ioc_inllen1 > 0); + rc = llog_process(env, handle, llog_print_cb, &lprd, NULL); + if (rc == -LLOG_EEMPTY) + rc = 0; + else if (rc) + GOTO(out_close, rc); + break; + } + case OBD_IOC_LLOG_CANCEL: { + struct llog_cookie cookie; + struct llog_logid plain; + u32 lgc_index; + + rc = kstrtouint(data->ioc_inlbuf3, 0, &lgc_index); + if (rc) + GOTO(out_close, rc); + cookie.lgc_index = lgc_index; + + if (handle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) { + rc = llog_cancel_rec(env, handle, cookie.lgc_index); + GOTO(out_close, rc); + } else if (!(handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)) { + GOTO(out_close, rc = -EINVAL); + } + + if (data->ioc_inlbuf2 == NULL) /* catalog but no logid */ + GOTO(out_close, rc = -ENOTTY); + + rc = str2logid(&plain, data->ioc_inlbuf2, data->ioc_inllen2); + if (rc) + GOTO(out_close, rc); + cookie.lgc_lgl = plain; + rc = llog_cat_cancel_records(env, handle, 1, &cookie); + if (rc) + GOTO(out_close, rc); + break; + } + case OBD_IOC_LLOG_REMOVE: { + struct llog_logid plain; + + if (handle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) { + rc = llog_destroy(env, handle); + GOTO(out_close, rc); + } else if (!(handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)) { + GOTO(out_close, rc = -EINVAL); + } + + if (data->ioc_inlbuf2) { + /* remove indicate log from the catalog */ + rc = str2logid(&plain, data->ioc_inlbuf2, + data->ioc_inllen2); + if (rc) + GOTO(out_close, rc); + rc = llog_remove_log(env, handle, &plain); + } else { + /* remove all the log of the catalog */ + rc = llog_process(env, handle, llog_delete_cb, NULL, + NULL); + if (rc) + GOTO(out_close, rc); + } + break; + } + default: + CERROR("%s: Unknown ioctl cmd %#x\n", + ctxt->loc_obd->obd_name, cmd); + GOTO(out_close, rc = -ENOTTY); + } + +out_close: + if (handle->lgh_hdr && + handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) + llog_cat_close(env, handle); + else + 
llog_close(env, handle); + RETURN(rc); +} +EXPORT_SYMBOL(llog_ioctl); + +int llog_catalog_list(const struct lu_env *env, struct dt_device *d, + int count, struct obd_ioctl_data *data, + const struct lu_fid *fid) +{ + int size, i; + struct llog_catid *idarray; + struct llog_logid *id; + char *out; + int l, remains, rc = 0; + + ENTRY; + + if (count == 0) { /* get total number of logs */ + rc = llog_osd_get_cat_list(env, d, 0, 0, NULL, fid); + if (rc < 0) + RETURN(rc); + count = rc; + } + + size = sizeof(*idarray) * count; + + OBD_ALLOC_LARGE(idarray, size); + if (!idarray) + RETURN(-ENOMEM); + + rc = llog_osd_get_cat_list(env, d, 0, count, idarray, fid); + if (rc) + GOTO(out, rc); + + out = data->ioc_bulk; + remains = data->ioc_inllen1; + /* OBD_FAIL: fetch the catalog records from the specified one */ + if (OBD_FAIL_CHECK(OBD_FAIL_CATLIST)) + data->ioc_count = cfs_fail_val - 1; + for (i = data->ioc_count; i < count; i++) { + id = &idarray[i].lci_logid; + l = snprintf(out, remains, "catalog_log: "DFID":%x\n", + PFID(&id->lgl_oi.oi_fid), id->lgl_ogen); + out += l; + remains -= l; + if (remains <= 0) { + if (remains < 0) { + /* the print is not complete */ + remains += l; + data->ioc_bulk[out - data->ioc_bulk - l] = '\0'; + data->ioc_count = i; + } else { + data->ioc_count = i++; + } + goto out; + } + } + data->ioc_count = 0; +out: + OBD_FREE_LARGE(idarray, size); + RETURN(rc); +} +EXPORT_SYMBOL(llog_catalog_list); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_obd.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_obd.c new file mode 100644 index 0000000000000..0d05e64047835 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_obd.c @@ -0,0 +1,248 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2015, Intel Corporation. 
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+
+#include
+#include
+#include "llog_internal.h"
+
+/* helper functions for calling the llog obd methods */
+static struct llog_ctxt *llog_new_ctxt(struct obd_device *obd)
+{
+	struct llog_ctxt *ctxt;
+
+	OBD_ALLOC_PTR(ctxt);
+	if (!ctxt)
+		return NULL;
+
+	ctxt->loc_obd = obd;
+	atomic_set(&ctxt->loc_refcount, 1);
+
+	return ctxt;
+}
+
+static void llog_ctxt_destroy(struct llog_ctxt *ctxt)
+{
+	if (ctxt->loc_exp) {
+		class_export_put(ctxt->loc_exp);
+		ctxt->loc_exp = NULL;
+	}
+	if (ctxt->loc_imp) {
+		class_import_put(ctxt->loc_imp);
+		ctxt->loc_imp = NULL;
+	}
+	OBD_FREE_PTR(ctxt);
+}
+
+int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt)
+{
+	struct obd_llog_group *olg = ctxt->loc_olg;
+	struct obd_device *obd;
+	int rc = 0;
+
+	spin_lock(&olg->olg_lock);
+	if (!atomic_dec_and_test(&ctxt->loc_refcount)) {
+		spin_unlock(&olg->olg_lock);
+		return rc;
+	}
+	olg->olg_ctxts[ctxt->loc_idx] = NULL;
+	spin_unlock(&olg->olg_lock);
+
+	obd = ctxt->loc_obd;
+	spin_lock(&obd->obd_dev_lock);
+	/* sync with the llog ctxt user thread */
+	spin_unlock(&obd->obd_dev_lock);
+
+	/*
+	 * obd->obd_starting is needed to handle the cleanup
+	 * in an error case while the obd is starting up.
+	 */
+	LASSERTF(obd->obd_starting == 1 ||
+		 obd->obd_stopping == 1 || obd->obd_set_up == 0,
+		 "wrong obd state: %d/%d/%d\n", !!obd->obd_starting,
+		 !!obd->obd_stopping, !!obd->obd_set_up);
+
+	/* cleanup the llog ctxt here */
+	if (ctxt->loc_logops->lop_cleanup)
+		rc = ctxt->loc_logops->lop_cleanup(env, ctxt);
+
+	llog_ctxt_destroy(ctxt);
+	wake_up(&olg->olg_waitq);
+	return rc;
+}
+EXPORT_SYMBOL(__llog_ctxt_put);
+
+int llog_cleanup(const struct lu_env *env, struct llog_ctxt *ctxt)
+{
+	struct obd_llog_group *olg;
+	int rc, idx;
+
+	ENTRY;
+
+	LASSERT(ctxt != NULL);
+	LASSERT(ctxt != LP_POISON);
+
+	olg = ctxt->loc_olg;
+	LASSERT(olg != NULL);
+	LASSERT(olg != LP_POISON);
+
+	idx = ctxt->loc_idx;
+
+	/*
+	 * Balance the ctxt get when calling llog_cleanup()
+	 */
+	LASSERT(atomic_read(&ctxt->loc_refcount) < LI_POISON);
+	LASSERT(atomic_read(&ctxt->loc_refcount) > 1);
+	llog_ctxt_put(ctxt);
+
+	/*
+	 * Try to free the ctxt.
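+	 * The last reference put runs lop_cleanup() and frees the ctxt
+	 * (see __llog_ctxt_put() above).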
+ */ + rc = __llog_ctxt_put(env, ctxt); + if (rc) + CERROR("Error %d while cleaning up ctxt %p\n", + rc, ctxt); + + l_wait_event_abortable(olg->olg_waitq, + llog_group_ctxt_null(olg, idx)); + + RETURN(rc); +} +EXPORT_SYMBOL(llog_cleanup); + +int llog_setup(const struct lu_env *env, struct obd_device *obd, + struct obd_llog_group *olg, int index, + struct obd_device *disk_obd, const struct llog_operations *op) +{ + struct llog_ctxt *ctxt; + int rc = 0; + + ENTRY; + + if (index < 0 || index >= LLOG_MAX_CTXTS) + RETURN(-EINVAL); + + LASSERT(olg != NULL); + + ctxt = llog_new_ctxt(obd); + if (!ctxt) + RETURN(-ENOMEM); + + ctxt->loc_obd = obd; + ctxt->loc_olg = olg; + ctxt->loc_idx = index; + ctxt->loc_logops = op; + mutex_init(&ctxt->loc_mutex); + if (disk_obd != NULL) + ctxt->loc_exp = class_export_get(disk_obd->obd_self_export); + else + ctxt->loc_exp = class_export_get(obd->obd_self_export); + + ctxt->loc_flags = LLOG_CTXT_FLAG_UNINITIALIZED; + ctxt->loc_chunk_size = LLOG_MIN_CHUNK_SIZE; + + rc = llog_group_set_ctxt(olg, ctxt, index); + if (rc) { + llog_ctxt_destroy(ctxt); + if (rc == -EEXIST) { + ctxt = llog_group_get_ctxt(olg, index); + if (ctxt) { + CDEBUG(D_CONFIG, "%s: ctxt %d already set up\n", + obd->obd_name, index); + LASSERT(ctxt->loc_olg == olg); + LASSERT(ctxt->loc_obd == obd); + if (disk_obd != NULL) + LASSERT(ctxt->loc_exp == + disk_obd->obd_self_export); + else + LASSERT(ctxt->loc_exp == + obd->obd_self_export); + LASSERT(ctxt->loc_logops == op); + llog_ctxt_put(ctxt); + } + rc = 0; + } + RETURN(rc); + } + + if (op->lop_setup) { + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LLOG_SETUP)) + rc = -EOPNOTSUPP; + else + rc = op->lop_setup(env, obd, olg, index, disk_obd); + } + + if (rc) { + CERROR("%s: ctxt %d lop_setup=%p failed: rc = %d\n", + obd->obd_name, index, op->lop_setup, rc); + llog_group_clear_ctxt(olg, index); + llog_ctxt_destroy(ctxt); + } else { + CDEBUG(D_CONFIG, "obd %s ctxt %d is initialized\n", + obd->obd_name, index); + ctxt->loc_flags &= ~LLOG_CTXT_FLAG_UNINITIALIZED; + } + + RETURN(rc); +} +EXPORT_SYMBOL(llog_setup); + +int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp, int flags) +{ + int rc = 0; + + ENTRY; + if (ctxt && ctxt->loc_logops->lop_sync) + rc = ctxt->loc_logops->lop_sync(ctxt, exp, flags); + + RETURN(rc); +} +EXPORT_SYMBOL(llog_sync); + +/* context key constructor/destructor: llog_key_init, llog_key_fini */ +LU_KEY_INIT_FINI(llog, struct llog_thread_info); +/* context key: llog_thread_key */ +LU_CONTEXT_KEY_DEFINE(llog, LCT_MD_THREAD | LCT_MG_THREAD | LCT_LOCAL); +LU_KEY_INIT_GENERIC(llog); + +int llog_info_init(void) +{ + llog_key_init_generic(&llog_thread_key, NULL); + lu_context_key_register(&llog_thread_key); + return 0; +} + +void llog_info_fini(void) +{ + lu_context_key_degister(&llog_thread_key); +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_osd.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_osd.c new file mode 100644 index 0000000000000..f58bb59982783 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_osd.c @@ -0,0 +1,2242 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ */
+/*
+ * lustre/obdclass/llog_osd.c
+ *
+ * Low level llog routines on top of OSD API
+ *
+ * This file provides a set of methods for llog operations on top of
+ * dt_device. It contains all supported llog_operations interfaces and
+ * supplemental functions.
+ *
+ * Author: Alexey Zhuravlev
+ * Author: Mikhail Pershin
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include "llog_internal.h"
+#include "local_storage.h"
+
+/**
+ * Implementation of the llog_operations::lop_declare_create
+ *
+ * This function is a wrapper over the local_storage API function
+ * local_object_declare_create().
+ *
+ * \param[in] env	execution environment
+ * \param[in] los	local_storage for the bottom storage device
+ * \param[in] o		dt_object to create
+ * \param[in] th	current transaction handle
+ *
+ * \retval		0 on successful declaration of the new object
+ * \retval		negative error if the declaration failed
+ */
+static int llog_osd_declare_new_object(const struct lu_env *env,
+				       struct local_oid_storage *los,
+				       struct dt_object *o,
+				       struct thandle *th)
+{
+	struct llog_thread_info *lgi = llog_info(env);
+
+	lgi->lgi_attr.la_valid = LA_MODE;
+	lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
+	lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG);
+
+	return local_object_declare_create(env, los, o, &lgi->lgi_attr,
+					   &lgi->lgi_dof, th);
+}
+
+/**
+ * Implementation of the llog_operations::lop_create
+ *
+ * This function is a wrapper over the local_storage API function
+ * local_object_create().
+ *
+ * \param[in] env	execution environment
+ * \param[in] los	local_storage for the bottom storage device
+ * \param[in] o		dt_object to create
+ * \param[in] th	current transaction handle
+ *
+ * \retval		0 on successful creation of the new object
+ * \retval		negative error if the creation failed
+ */
+static int llog_osd_create_new_object(const struct lu_env *env,
+				      struct local_oid_storage *los,
+				      struct dt_object *o,
+				      struct thandle *th)
+{
+	struct llog_thread_info *lgi = llog_info(env);
+
+	lgi->lgi_attr.la_valid = LA_MODE;
+	lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
+	lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG);
+
+	return local_object_create(env, los, o, &lgi->lgi_attr,
+				   &lgi->lgi_dof, th);
+}
+
+/**
+ * Implementation of the llog_operations::lop_exist
+ *
+ * This function checks that the llog exists on storage.
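+ * (An object that was destroyed through this handle is reported as
+ * non-existent even before the destruction commits.)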
+ * + * \param[in] handle llog handle of the current llog + * + * \retval true if llog object exists and is not just destroyed + * \retval false if llog doesn't exist or just destroyed + */ +static int llog_osd_exist(struct llog_handle *handle) +{ + LASSERT(handle->lgh_obj); + return dt_object_exists(handle->lgh_obj) && !handle->lgh_destroyed; +} + +static void *rec_tail(struct llog_rec_hdr *rec) +{ + return (void *)((char *)rec + rec->lrh_len - + sizeof(struct llog_rec_tail)); +} + +/** + * Write a padding record to the llog + * + * This function writes a padding record to the end of llog. That may + * be needed if llog contains records of variable size, e.g. config logs + * or changelogs. + * The padding record just aligns llog to the llog chunk_size boundary if + * the current record doesn't fit in the remaining space. + * + * It allocates full length to avoid two separate writes for header and tail. + * Such 2-steps scheme needs extra protection and complex error handling. + * + * \param[in] env execution environment + * \param[in] o dt_object to create + * \param[in,out] off pointer to the padding start offset + * \param[in] len padding length + * \param[in] index index of the padding record in a llog + * \param[in] th current transaction handle + * + * \retval 0 on successful padding write + * \retval negative error if write failed + */ +static int llog_osd_pad(const struct lu_env *env, struct dt_object *o, + loff_t *off, int len, int index, struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + struct llog_rec_hdr *rec; + struct llog_rec_tail *tail; + int rc; + + ENTRY; + + LASSERT(th); + LASSERT(off); + LASSERT(len >= LLOG_MIN_REC_SIZE && (len & 0x7) == 0); + + OBD_ALLOC(rec, len); + if (rec == NULL) + RETURN(-ENOMEM); + + rec->lrh_len = len; + rec->lrh_index = index; + rec->lrh_type = LLOG_PAD_MAGIC; + + tail = rec_tail(rec); + tail->lrt_len = len; + tail->lrt_index = index; + + lgi->lgi_buf.lb_buf = rec; + lgi->lgi_buf.lb_len = len; + rc = dt_record_write(env, o, &lgi->lgi_buf, off, th); + if (rc) + CERROR("%s: error writing padding record: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, rc); + + OBD_FREE(rec, len); + RETURN(rc); +} + +/** + * Implementation of the llog_operations::lop_read_header + * + * This function reads the current llog header from the bottom storage + * device. 
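+ * It also validates the header magic, the chunk size and the tail
+ * consistency before the header (and its bitmap) is trusted.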
+ * + * \param[in] env execution environment + * \param[in] handle llog handle of the current llog + * + * \retval 0 on successful header read + * \retval negative error if read failed + */ +static int llog_osd_read_header(const struct lu_env *env, + struct llog_handle *handle) +{ + struct llog_rec_hdr *llh_hdr; + struct dt_object *o; + struct llog_thread_info *lgi; + enum llog_flag flags; + int rc; + + ENTRY; + + o = handle->lgh_obj; + LASSERT(o); + + lgi = llog_info(env); + + dt_read_lock(env, o, 0); + + rc = dt_attr_get(env, o, &lgi->lgi_attr); + if (rc) + GOTO(unlock, rc); + + LASSERT(lgi->lgi_attr.la_valid & LA_SIZE); + + if (lgi->lgi_attr.la_size == 0) { + CDEBUG(D_HA, "not reading header from 0-byte log\n"); + GOTO(unlock, rc = LLOG_EEMPTY); + } + + flags = handle->lgh_hdr->llh_flags; + + lgi->lgi_off = 0; + lgi->lgi_buf.lb_buf = handle->lgh_hdr; + lgi->lgi_buf.lb_len = handle->lgh_hdr_size; + rc = dt_read(env, o, &lgi->lgi_buf, &lgi->lgi_off); + llh_hdr = &handle->lgh_hdr->llh_hdr; + if (rc < sizeof(*llh_hdr) || rc < llh_hdr->lrh_len) { + CERROR("%s: error reading "DFID" log header size %d: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(lu_object_fid(&o->do_lu)), rc < 0 ? 0 : rc, + -EFAULT); + + if (rc >= 0) + rc = -EFAULT; + + GOTO(unlock, rc); + } + + if (LLOG_REC_HDR_NEEDS_SWABBING(llh_hdr)) + lustre_swab_llog_hdr(handle->lgh_hdr); + + if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) { + CERROR("%s: bad log %s "DFID" header magic: %#x " + "(expected %#x)\n", o->do_lu.lo_dev->ld_obd->obd_name, + handle->lgh_name ? handle->lgh_name : "", + PFID(lu_object_fid(&o->do_lu)), + llh_hdr->lrh_type, LLOG_HDR_MAGIC); + GOTO(unlock, rc = -EIO); + } else if (llh_hdr->lrh_len < LLOG_MIN_CHUNK_SIZE || + llh_hdr->lrh_len > handle->lgh_hdr_size) { + CERROR("%s: incorrectly sized log %s "DFID" header: " + "%#x (expected at least %#x)\n" + "you may need to re-run lconf --write_conf.\n", + o->do_lu.lo_dev->ld_obd->obd_name, + handle->lgh_name ? handle->lgh_name : "", + PFID(lu_object_fid(&o->do_lu)), + llh_hdr->lrh_len, LLOG_MIN_CHUNK_SIZE); + GOTO(unlock, rc = -EIO); + } else if (LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_index > + LLOG_HDR_BITMAP_SIZE(handle->lgh_hdr) || + LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_len != + llh_hdr->lrh_len) { + CERROR("%s: incorrectly sized log %s "DFID" tailer: " + "%#x : rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + handle->lgh_name ? handle->lgh_name : "", + PFID(lu_object_fid(&o->do_lu)), + LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_len, -EIO); + GOTO(unlock, rc = -EIO); + } + + handle->lgh_hdr->llh_flags |= (flags & LLOG_F_EXT_MASK); + handle->lgh_last_idx = LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_index; + rc = 0; + +unlock: + dt_read_unlock(env, o); + RETURN(rc); +} + +/** + * Implementation of the llog_operations::lop_declare_write + * + * This function declares the new record write. + * + * \param[in] env execution environment + * \param[in] loghandle llog handle of the current llog + * \param[in] rec llog record header. This is a real header of the full + * llog record to write. This is the beginning of buffer + * to write, the length of buffer is stored in + * \a rec::lrh_len + * \param[in] idx index of the llog record. 
If \a idx == -1 then this is + * append case, otherwise \a idx is the index of record + * to modify + * \param[in] th current transaction handle + * + * \retval 0 on successful declaration + * \retval negative error if declaration failed + */ +static int llog_osd_declare_write_rec(const struct lu_env *env, + struct llog_handle *loghandle, + struct llog_rec_hdr *rec, + int idx, struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + __u32 chunk_size; + struct dt_object *o; + int rc; + + ENTRY; + + LASSERT(env); + LASSERT(th); + LASSERT(loghandle); + LASSERT(rec); + LASSERT(rec->lrh_len <= loghandle->lgh_ctxt->loc_chunk_size); + + o = loghandle->lgh_obj; + LASSERT(o); + + chunk_size = loghandle->lgh_ctxt->loc_chunk_size; + lgi->lgi_buf.lb_len = chunk_size; + lgi->lgi_buf.lb_buf = NULL; + /* each time we update header */ + rc = dt_declare_record_write(env, o, &lgi->lgi_buf, 0, + th); + if (rc || idx == 0) /* if error or just header */ + RETURN(rc); + + /** + * the pad record can be inserted so take into account double + * record size + */ + lgi->lgi_buf.lb_len = chunk_size * 2; + lgi->lgi_buf.lb_buf = NULL; + /* XXX: implement declared window or multi-chunks approach */ + rc = dt_declare_record_write(env, o, &lgi->lgi_buf, -1, th); + + RETURN(rc); +} + +/** + * Implementation of the llog_operations::lop_write + * + * This function writes the new record in the llog or modify the existed one. + * + * \param[in] env execution environment + * \param[in] loghandle llog handle of the current llog + * \param[in] rec llog record header. This is a real header of + * the full llog record to write. This is + * the beginning of buffer to write, the length + * of buffer is stored in \a rec::lrh_len + * \param[in,out] reccookie pointer to the cookie to return back if needed. + * It is used for further cancel of this llog + * record. + * \param[in] idx index of the llog record. If \a idx == -1 then + * this is append case, otherwise \a idx is + * the index of record to modify + * \param[in] th current transaction handle + * + * \retval 0 on successful write && \a reccookie == NULL + * 1 on successful write && \a reccookie != NULL + * \retval negative error if write failed + */ +static int llog_osd_write_rec(const struct lu_env *env, + struct llog_handle *loghandle, + struct llog_rec_hdr *rec, + struct llog_cookie *reccookie, + int idx, struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + struct llog_log_hdr *llh; + int reclen = rec->lrh_len; + int index, rc; + struct llog_rec_tail *lrt; + struct dt_object *o; + __u32 chunk_size; + size_t left; + __u32 orig_last_idx; + bool pad = false; + ENTRY; + + llh = loghandle->lgh_hdr; + o = loghandle->lgh_obj; + + chunk_size = llh->llh_hdr.lrh_len; + CDEBUG(D_OTHER, "new record %x to "DFID"\n", + rec->lrh_type, PFID(lu_object_fid(&o->do_lu))); + + if (!llog_osd_exist(loghandle)) + RETURN(-ENOENT); + + /* record length should not bigger than */ + if (reclen > loghandle->lgh_hdr->llh_hdr.lrh_len) + RETURN(-E2BIG); + + /* sanity check for fixed-records llog */ + if (idx != LLOG_HEADER_IDX && (llh->llh_flags & LLOG_F_IS_FIXSIZE)) { + LASSERT(llh->llh_size != 0); + LASSERT(llh->llh_size == reclen); + } + + /* return error if osp object is stale */ + if (idx != LLOG_HEADER_IDX && dt_object_stale(o)) + RETURN(-ESTALE); + rc = dt_attr_get(env, o, &lgi->lgi_attr); + if (rc) + RETURN(rc); + + /** + * The modification case. + * If idx set then the record with that index must be modified. 
+ * There are three cases possible: + * 1) the common case is the llog header update (idx == 0) + * 2) the llog record modification during llog process. + * This is indicated by the \a loghandle::lgh_cur_idx > 0. + * In that case the \a loghandle::lgh_cur_offset + * 3) otherwise this is assumed that llog consist of records of + * fixed size, i.e. catalog. The llog header must has llh_size + * field equal to record size. The record offset is calculated + * just by /a idx value + * + * During modification we don't need extra header update because + * the bitmap and record count are not changed. The record header + * and tail remains the same too. + */ + if (idx != LLOG_NEXT_IDX) { + /* llog can be empty only when first record is being written */ + LASSERT(ergo(idx > 0, lgi->lgi_attr.la_size > 0)); + + if (!test_bit_le(idx, LLOG_HDR_BITMAP(llh))) { + CERROR("%s: modify unset record %u\n", + o->do_lu.lo_dev->ld_obd->obd_name, idx); + RETURN(-ENOENT); + } + + if (idx != rec->lrh_index) { + CERROR("%s: modify index mismatch %d %u\n", + o->do_lu.lo_dev->ld_obd->obd_name, idx, + rec->lrh_index); + RETURN(-EFAULT); + } + + if (idx == LLOG_HEADER_IDX) { + /* llog header update */ + __u32 *bitmap = LLOG_HDR_BITMAP(llh); + + lgi->lgi_off = 0; + + /* If it does not indicate the bitmap index + * (reccookie == NULL), then it means update + * the whole update header. Otherwise only + * update header and bits needs to be updated, + * and in DNE cases, it will signaficantly + * shrink the RPC size. + * see distribute_txn_cancel_records()*/ + if (reccookie == NULL) { + lgi->lgi_buf.lb_len = reclen; + lgi->lgi_buf.lb_buf = rec; + rc = dt_record_write(env, o, &lgi->lgi_buf, + &lgi->lgi_off, th); + RETURN(rc); + } + + /* update the header */ + lgi->lgi_buf.lb_len = llh->llh_bitmap_offset; + lgi->lgi_buf.lb_buf = llh; + rc = dt_record_write(env, o, &lgi->lgi_buf, + &lgi->lgi_off, th); + if (rc != 0) + RETURN(rc); + + /* update the bitmap */ + index = reccookie->lgc_index; + lgi->lgi_off = llh->llh_bitmap_offset + + (index / (sizeof(*bitmap) * 8)) * + sizeof(*bitmap); + lgi->lgi_buf.lb_len = sizeof(*bitmap); + lgi->lgi_buf.lb_buf = + &bitmap[index/(sizeof(*bitmap)*8)]; + rc = dt_record_write(env, o, &lgi->lgi_buf, + &lgi->lgi_off, th); + + RETURN(rc); + } else if (llh->llh_flags & LLOG_F_IS_FIXSIZE) { + lgi->lgi_off = llh->llh_hdr.lrh_len + + (idx - 1) * reclen; + } else if (reccookie != NULL && reccookie->lgc_index > 0) { + /** + * The lgc_offset can be used only if index is + * the same. + */ + if (idx != reccookie->lgc_index) { + CERROR("%s: modify index mismatch %d %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, idx, + reccookie->lgc_index); + RETURN(-EFAULT); + } + + lgi->lgi_off = reccookie->lgc_offset; + CDEBUG(D_OTHER, "modify record "DFID": idx:%u, " + "len:%u offset %llu\n", + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), idx, + rec->lrh_len, (long long)lgi->lgi_off); + } else { + /* This can be result of lgh_cur_idx is not set during + * llog processing or llh_size is not set to proper + * record size for fixed records llog. Therefore it is + * impossible to get record offset. 
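+			 * In that case -EFAULT is returned to the caller
+			 * rather than guessing an offset.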
*/ + CERROR("%s: can't get record offset, idx:%d, " + "len:%u.\n", o->do_lu.lo_dev->ld_obd->obd_name, + idx, rec->lrh_len); + RETURN(-EFAULT); + } + + /* update only data, header and tail remain the same */ + lgi->lgi_off += sizeof(struct llog_rec_hdr); + lgi->lgi_buf.lb_len = REC_DATA_LEN(rec); + lgi->lgi_buf.lb_buf = REC_DATA(rec); + rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th); + if (rc == 0 && reccookie) { + reccookie->lgc_lgl = loghandle->lgh_id; + reccookie->lgc_index = idx; + rc = 1; + } + RETURN(rc); + } + + /** + * The append case. + * The most common case of using llog. The new index is assigned to + * the new record, new bit is set in llog bitmap and llog count is + * incremented. + * + * Make sure that records don't cross a chunk boundary, so we can + * process them page-at-a-time if needed. If it will cross a chunk + * boundary, write in a fake (but referenced) entry to pad the chunk. + */ + + + /* simulate ENOSPC when new plain llog is being added to the + * catalog */ + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_LLOG_CREATE_FAILED2) && + llh->llh_flags & LLOG_F_IS_CAT) + RETURN(-ENOSPC); + + LASSERT(lgi->lgi_attr.la_valid & LA_SIZE); + orig_last_idx = loghandle->lgh_last_idx; + lgi->lgi_off = lgi->lgi_attr.la_size; + + if (loghandle->lgh_max_size > 0 && + lgi->lgi_off >= loghandle->lgh_max_size) { + CDEBUG(D_OTHER, "llog is getting too large (%u > %u) at %u " + DFID"\n", (unsigned)lgi->lgi_off, + loghandle->lgh_max_size, (int)loghandle->lgh_last_idx, + PFID(&loghandle->lgh_id.lgl_oi.oi_fid)); + /* this is to signal that this llog is full */ + loghandle->lgh_last_idx = LLOG_HDR_BITMAP_SIZE(llh) - 1; + RETURN(-ENOSPC); + } + + left = chunk_size - (lgi->lgi_off & (chunk_size - 1)); + /* NOTE: padding is a record, but no bit is set */ + if (left != 0 && left != reclen && + left < (reclen + LLOG_MIN_REC_SIZE)) { + index = loghandle->lgh_last_idx + 1; + rc = llog_osd_pad(env, o, &lgi->lgi_off, left, index, th); + if (rc) + RETURN(rc); + + loghandle->lgh_last_idx++; /* for pad rec */ + pad = true; + } + /* if it's the last idx in log file, then return -ENOSPC + * or wrap around if a catalog */ + if (llog_is_full(loghandle) || + unlikely(llh->llh_flags & LLOG_F_IS_CAT && + OBD_FAIL_PRECHECK(OBD_FAIL_CAT_RECORDS) && + loghandle->lgh_last_idx >= cfs_fail_val)) { + if (llh->llh_flags & LLOG_F_IS_CAT) + loghandle->lgh_last_idx = 0; + else + RETURN(-ENOSPC); + } + + down_write(&loghandle->lgh_last_sem); + /* increment the last_idx along with llh_tail index, they should + * be equal for a llog lifetime */ + if (OBD_FAIL_CHECK(OBD_FAIL_LLOG_ADD_GAP) && --cfs_fail_val == 0) + loghandle->lgh_last_idx++; + loghandle->lgh_last_idx++; + index = loghandle->lgh_last_idx; + LLOG_HDR_TAIL(llh)->lrt_index = index; + /** + * NB: the caller should make sure only 1 process access + * the lgh_last_idx, e.g. append should be exclusive. + * Otherwise it might hit the assert. 
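+	 * Concurrent appenders are expected to serialize on
+	 * lgh_last_sem, taken by down_write() above.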
+ */ + LASSERT(index < LLOG_HDR_BITMAP_SIZE(llh)); + rec->lrh_index = index; + lrt = rec_tail(rec); + lrt->lrt_len = rec->lrh_len; + lrt->lrt_index = rec->lrh_index; + + /* the lgh_hdr_mutex protects llog header data from concurrent + * update/cancel, the llh_count and llh_bitmap are protected */ + mutex_lock(&loghandle->lgh_hdr_mutex); + if (__test_and_set_bit_le(index, LLOG_HDR_BITMAP(llh))) { + CERROR("%s: index %u already set in llog bitmap "DFID"\n", + o->do_lu.lo_dev->ld_obd->obd_name, index, + PFID(lu_object_fid(&o->do_lu))); + mutex_unlock(&loghandle->lgh_hdr_mutex); + LBUG(); /* should never happen */ + } + llh->llh_count++; + + if (!(llh->llh_flags & LLOG_F_IS_FIXSIZE)) { + /* Update the minimum size of the llog record */ + if (llh->llh_size == 0) + llh->llh_size = reclen; + else if (reclen < llh->llh_size) + llh->llh_size = reclen; + } + + /* + * readers (e.g. llog_osd_read_header()) must not find + * llog updated partially (bitmap/counter claims record, + * but a record hasn't been added yet) as this results + * in EIO. + */ + dt_write_lock(env, o, 0); + + if (lgi->lgi_attr.la_size == 0) { + lgi->lgi_off = 0; + lgi->lgi_buf.lb_len = llh->llh_hdr.lrh_len; + lgi->lgi_buf.lb_buf = &llh->llh_hdr; + rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th); + if (rc != 0) + GOTO(out_unlock, rc); + } else { + __u32 *bitmap = LLOG_HDR_BITMAP(llh); + + /* Note: If this is not initialization (size == 0), then do not + * write the whole header (8k bytes), only update header/tail + * and bits needs to be updated. Because this update might be + * part of cross-MDT operation, which needs to write these + * updates into the update log(32KB limit) and also pack inside + * the RPC (1MB limit), if we write 8K for each operation, which + * will cost a lot space, and keep us adding more updates to one + * update log.*/ + lgi->lgi_off = 0; + lgi->lgi_buf.lb_len = llh->llh_bitmap_offset; + lgi->lgi_buf.lb_buf = &llh->llh_hdr; + rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th); + if (rc != 0) + GOTO(out_unlock, rc); + + lgi->lgi_off = llh->llh_bitmap_offset + + (index / (sizeof(*bitmap) * 8)) * sizeof(*bitmap); + lgi->lgi_buf.lb_len = sizeof(*bitmap); + lgi->lgi_buf.lb_buf = &bitmap[index/(sizeof(*bitmap)*8)]; + rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th); + if (rc != 0) + GOTO(out_unlock, rc); + + lgi->lgi_off = (unsigned long)LLOG_HDR_TAIL(llh) - + (unsigned long)llh; + lgi->lgi_buf.lb_len = sizeof(llh->llh_tail); + lgi->lgi_buf.lb_buf = LLOG_HDR_TAIL(llh); + rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th); + if (rc != 0) + GOTO(out_unlock, rc); + } + if (OBD_FAIL_PRECHECK(OBD_FAIL_LLOG_PAUSE_AFTER_PAD) && pad) { + /* a window for concurrent llog reader, see LU-12577 */ + OBD_FAIL_TIMEOUT(OBD_FAIL_LLOG_PAUSE_AFTER_PAD, + cfs_fail_val ?: 1); + } + +out_unlock: + /* unlock here for remote object */ + mutex_unlock(&loghandle->lgh_hdr_mutex); + if (rc) { + dt_write_unlock(env, o); + GOTO(out, rc); + } + + if (OBD_FAIL_PRECHECK(OBD_FAIL_LLOG_PROCESS_TIMEOUT) && + cfs_fail_val == (unsigned int)(loghandle->lgh_id.lgl_oi.oi.oi_id & + 0xFFFFFFFF)) { + OBD_RACE(OBD_FAIL_LLOG_PROCESS_TIMEOUT); + msleep(1 * MSEC_PER_SEC); + } + /* computed index can be used to determine offset for fixed-size + * records. 
This also allows handling the catalog wrap-around case */
+	if (llh->llh_flags & LLOG_F_IS_FIXSIZE) {
+		lgi->lgi_off = llh->llh_hdr.lrh_len + (index - 1) * reclen;
+	} else {
+		rc = dt_attr_get(env, o, &lgi->lgi_attr);
+		if (rc) {
+			dt_write_unlock(env, o);
+			GOTO(out, rc);
+		}
+
+		LASSERT(lgi->lgi_attr.la_valid & LA_SIZE);
+		lgi->lgi_off = max_t(__u64, lgi->lgi_attr.la_size,
+				     lgi->lgi_off);
+	}
+
+	lgi->lgi_buf.lb_len = reclen;
+	lgi->lgi_buf.lb_buf = rec;
+	rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th);
+
+	dt_write_unlock(env, o);
+	if (rc < 0)
+		GOTO(out, rc);
+
+	up_write(&loghandle->lgh_last_sem);
+
+	CDEBUG(D_HA, "added record "DFID".%u, %u off%llu\n",
+	       PFID(lu_object_fid(&o->do_lu)), index, rec->lrh_len,
+	       lgi->lgi_off);
+	if (reccookie != NULL) {
+		reccookie->lgc_lgl = loghandle->lgh_id;
+		reccookie->lgc_index = index;
+		if ((rec->lrh_type == MDS_UNLINK_REC) ||
+		    (rec->lrh_type == MDS_SETATTR64_REC))
+			reccookie->lgc_subsys = LLOG_MDS_OST_ORIG_CTXT;
+		else if (rec->lrh_type == OST_SZ_REC)
+			reccookie->lgc_subsys = LLOG_SIZE_ORIG_CTXT;
+		else
+			reccookie->lgc_subsys = -1;
+		rc = 1;
+	}
+	RETURN(rc);
+out:
+	/* cleanup llog for error case */
+	mutex_lock(&loghandle->lgh_hdr_mutex);
+	clear_bit_le(index, LLOG_HDR_BITMAP(llh));
+	llh->llh_count--;
+	mutex_unlock(&loghandle->lgh_hdr_mutex);
+
+	/* restore llog last_idx */
+	if (dt_object_remote(o)) {
+		loghandle->lgh_last_idx = orig_last_idx;
+	} else if (--loghandle->lgh_last_idx == 0 &&
+		   (llh->llh_flags & LLOG_F_IS_CAT) && llh->llh_cat_idx != 0) {
+		/* the catalog has just wrapped around */
+		loghandle->lgh_last_idx = LLOG_HDR_BITMAP_SIZE(llh) - 1;
+	}
+
+	LLOG_HDR_TAIL(llh)->lrt_index = loghandle->lgh_last_idx;
+	up_write(&loghandle->lgh_last_sem);
+
+	RETURN(rc);
+}
+
+/**
+ * We can skip reading at least as many log blocks as the number of
+ * minimum sized log records we are skipping. If it turns out
+ * that we are not far enough along the log (because the
+ * actual records are larger than minimum size) we just skip
+ * some more records.
+ *
+ * Note: llog_process_thread() uses the bitmap offset as the index to
+ * locate a record; that offset also counts pad records, whose record
+ * size is very small, and pad records are not considered when the
+ * minimum record size is recorded (otherwise min_rec_size might be too
+ * small), so in some rare cases it might skip too many records for
+ * @goal, see llog_osd_next_block().
+ *
+ * When force_mini_rec is true, it means we have to use LLOG_MIN_REC_SIZE
+ * as the min record size to skip over, usually because the previous try
+ * skipped too many records, see llog_osd_next_block() and
+ * llog_osd_prev_block().
+ */
+static inline void llog_skip_over(struct llog_handle *lgh, __u64 *off,
+				  int curr, int goal, __u32 chunk_size,
+				  bool force_mini_rec)
+{
+	struct llog_log_hdr *llh = lgh->lgh_hdr;
+
+	/* The goal should not be bigger than the record count */
+	if (goal > lgh->lgh_last_idx)
+		goal = lgh->lgh_last_idx;
+
+	if (goal > curr) {
+		if (llh->llh_flags & LLOG_F_IS_FIXSIZE) {
+			*off = chunk_size + (goal - 1) * llh->llh_size;
+		} else {
+			__u64 min_rec_size = LLOG_MIN_REC_SIZE;
+
+			if (llh->llh_size > 0 && !force_mini_rec)
+				min_rec_size = llh->llh_size;
+
+			*off = *off + (goal - curr - 1) * min_rec_size;
+		}
+	}
+	/* always align with the lower chunk boundary */
+	*off &= ~(chunk_size - 1);
+}
+
+/**
+ * Remove optional fields that the client doesn't expect.
+ * This is typically in order to ensure compatibility with older clients.
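+ * Each optional changelog field (jobid, extra flags, xattr, open mode,
+ * NID, uid/gid) is kept only when the matching LLOG_F_EXT_* flag is set
+ * in the llog header of \a loghandle.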
+ * It is assumed that since we exclusively remove fields, the block will be
+ * big enough to handle the remapped records. It is also assumed that records
+ * of a block have the same format (i.e.: the same features enabled).
+ *
+ * \param[in,out] hdr	Header of the block of records to remap.
+ * \param[in,out] last_hdr	Last header, don't read past this point.
+ * \param[in] loghandle	llog handle; its llh_flags determine which
+ *			fields are kept.
+ */
+static void changelog_block_trim_ext(struct llog_rec_hdr *hdr,
+				     struct llog_rec_hdr *last_hdr,
+				     struct llog_handle *loghandle)
+{
+	enum changelog_rec_flags flags = CLF_SUPPORTED;
+	enum changelog_rec_extra_flags extra_flags = CLFE_SUPPORTED;
+
+	if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_X_XATTR))
+		extra_flags &= ~CLFE_XATTR;
+	if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_X_OMODE))
+		extra_flags &= ~CLFE_OPEN;
+	if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_X_NID))
+		extra_flags &= ~CLFE_NID;
+	if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_X_UIDGID))
+		extra_flags &= ~CLFE_UIDGID;
+	if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_EXTRA_FLAGS))
+		flags &= ~CLF_EXTRA_FLAGS;
+	if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_JOBID))
+		flags &= ~CLF_JOBID;
+
+	if (flags == CLF_SUPPORTED && extra_flags == CLFE_SUPPORTED)
+		return;
+
+	if (hdr->lrh_type != CHANGELOG_REC)
+		return;
+
+	do {
+		struct changelog_rec *rec = (struct changelog_rec *)(hdr + 1);
+		enum changelog_rec_extra_flags xflag = CLFE_INVALID;
+
+		if (flags & CLF_EXTRA_FLAGS &&
+		    rec->cr_flags & CLF_EXTRA_FLAGS) {
+			xflag = changelog_rec_extra_flags(rec)->cr_extra_flags &
+				extra_flags;
+		}
+
+		if (unlikely(hdr->lrh_len == 0)) {
+			/* This is a corruption case: we cannot know the next
+			 * rec, so jump to the last one directly to avoid a
+			 * dead loop. */
+			LCONSOLE(D_WARNING, "Hit invalid llog record: "
+				 "idx %u, type %u, id %u\n",
+				 hdr->lrh_index, hdr->lrh_type, hdr->lrh_id);
+			hdr = llog_rec_hdr_next(last_hdr);
+			if (unlikely(hdr == last_hdr))
+				LCONSOLE(D_WARNING, "The last record crashed: "
+					 "idx %u, type %u, id %u\n",
+					 hdr->lrh_index, hdr->lrh_type,
+					 hdr->lrh_id);
+			break;
+		}
+
+		changelog_remap_rec(rec, hdr->lrh_len - sizeof(struct llog_rec_hdr),
+				    rec->cr_flags & flags, xflag);
+		hdr = llog_rec_hdr_next(hdr);
+		/* Yield the CPU to avoid a soft lockup if there are too many
+		 * records to be handled. */
+		cond_resched();
+	} while ((char *)hdr <= (char *)last_hdr);
+}
+
+/**
+ * Implementation of the llog_operations::lop_next_block
+ *
+ * This function finds the next llog block to return, which contains the
+ * record with the required index. It is the main part of llog processing.
+ *
+ * \param[in] env	execution environment
+ * \param[in] loghandle	llog handle of the current llog
+ * \param[in,out] cur_idx	index preceding cur_offset
+ * \param[in] next_idx	target index to find
+ * \param[in,out] cur_offset	furthest point read in the file
+ * \param[in] buf	pointer to data buffer to fill
+ * \param[in] len	required len to read, it is
+ *			usually llog chunk_size.
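+ *
+ * On a suspected bad read the function retries once from the last
+ * known good position, skipping with the minimum record size
+ * (force_mini_rec), before giving up.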
+ * + * \retval 0 on successful buffer read + * \retval negative value on error + */ +static int llog_osd_next_block(const struct lu_env *env, + struct llog_handle *loghandle, int *cur_idx, + int next_idx, __u64 *cur_offset, void *buf, + int len) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_object *o; + struct dt_device *dt; + int rc; + __u32 chunk_size; + int last_idx = *cur_idx; + __u64 last_offset = *cur_offset; + bool force_mini_rec = false; + + ENTRY; + + LASSERT(env); + LASSERT(lgi); + + chunk_size = loghandle->lgh_hdr->llh_hdr.lrh_len; + if (len == 0 || len & (chunk_size - 1)) + RETURN(-EINVAL); + + LASSERT(loghandle); + LASSERT(loghandle->lgh_ctxt); + + if (OBD_FAIL_PRECHECK(OBD_FAIL_MDS_CHANGELOG_DEL) && + cfs_fail_val == ((unsigned long)loghandle & 0xFFFFFFFF)) { + OBD_RACE(OBD_FAIL_MDS_CHANGELOG_DEL); + msleep(MSEC_PER_SEC >> 2); + } + + o = loghandle->lgh_obj; + LASSERT(o); + dt_read_lock(env, o, 0); + if (!llog_osd_exist(loghandle)) + GOTO(out, rc = -ESTALE); //object was destroyed + + dt = lu2dt_dev(o->do_lu.lo_dev); + LASSERT(dt); + + rc = dt_attr_get(env, o, &lgi->lgi_attr); + if (rc) + GOTO(out, rc); + + CDEBUG(D_OTHER, + "looking for log index %u (cur idx %u off %llu), size %llu\n", + next_idx, *cur_idx, + *cur_offset, lgi->lgi_attr.la_size); + + while (*cur_offset < lgi->lgi_attr.la_size) { + struct llog_rec_hdr *rec, *last_rec; + struct llog_rec_tail *tail; + + llog_skip_over(loghandle, cur_offset, *cur_idx, + next_idx, chunk_size, force_mini_rec); + + /* read up to next llog chunk_size block */ + lgi->lgi_buf.lb_len = chunk_size - + (*cur_offset & (chunk_size - 1)); + lgi->lgi_buf.lb_buf = buf; + + rc = dt_read(env, o, &lgi->lgi_buf, cur_offset); + if (rc < 0) { + if (rc == -EBADR && !force_mini_rec) + goto retry; + + CERROR("%s: can't read llog block from log "DFID + " offset %llu: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(lu_object_fid(&o->do_lu)), *cur_offset, + rc); + GOTO(out, rc); + } + + if (rc < len) { + /* signal the end of the valid buffer to + * llog_process */ + memset(buf + rc, 0, len - rc); + } + + if (rc == 0) { /* end of file, nothing to do */ + if (!force_mini_rec) + goto retry; + GOTO(out, rc); + } + + if (rc < sizeof(*tail)) { + if (!force_mini_rec) + goto retry; + + CERROR("%s: invalid llog block at log id "DFID":%x " + "offset %llu\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), + loghandle->lgh_id.lgl_ogen, *cur_offset); + GOTO(out, rc = -EINVAL); + } + + rec = buf; + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + tail = (struct llog_rec_tail *)((char *)buf + rc - + sizeof(struct llog_rec_tail)); + + if (llog_verify_record(loghandle, rec)) { + /* + * the block seems corrupted. 
make a pad record so the + * caller can skip the block and try with the next one + */ + rec->lrh_len = rc; + rec->lrh_index = next_idx; + rec->lrh_type = LLOG_PAD_MAGIC; + + tail = rec_tail(rec); + tail->lrt_len = rc; + tail->lrt_index = next_idx; + + GOTO(out, rc = 0); + } + + /* get the last record in block */ + last_rec = (struct llog_rec_hdr *)((char *)buf + rc - + tail->lrt_len); + + if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec)) + lustre_swab_llog_rec(last_rec); + + if (last_rec->lrh_index != tail->lrt_index) { + CERROR("%s: invalid llog tail at log id "DFID":%x offset %llu last_rec idx %u tail idx %u lrt len %u read_size %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), + loghandle->lgh_id.lgl_ogen, *cur_offset, + last_rec->lrh_index, tail->lrt_index, + tail->lrt_len, rc); + GOTO(out, rc = -EINVAL); + } + + *cur_idx = tail->lrt_index; + + /* this shouldn't happen */ + if (tail->lrt_index == 0) { + CERROR("%s: invalid llog tail at log id "DFID":%x " + "offset %llu bytes %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), + loghandle->lgh_id.lgl_ogen, *cur_offset, rc); + GOTO(out, rc = -EINVAL); + } + if (tail->lrt_index < next_idx) { + last_idx = *cur_idx; + last_offset = *cur_offset; + continue; + } + + /* sanity check that the start of the new buffer is no farther + * than the record that we wanted. This shouldn't happen. */ + if (next_idx && rec->lrh_index > next_idx) { + if (!force_mini_rec && next_idx > last_idx) + goto retry; + + CERROR("%s: missed desired record? %u > %u\n", + o->do_lu.lo_dev->ld_obd->obd_name, + rec->lrh_index, next_idx); + GOTO(out, rc = -ENOENT); + } + + /* Trim unsupported extensions for compat w/ older clients */ + changelog_block_trim_ext(rec, last_rec, loghandle); + + GOTO(out, rc = 0); + +retry: + /* Note: because there are some pad records in the + * llog, so llog_skip_over() might skip too much + * records, let's try skip again with minimum record */ + force_mini_rec = true; + *cur_offset = last_offset; + *cur_idx = last_idx; + } + GOTO(out, rc = -EIO); +out: + dt_read_unlock(env, o); + return rc; +} + +/** + * Implementation of the llog_operations::lop_prev_block + * + * This function finds the llog block to return which contains + * record with required index but in reverse order - from end of llog + * to the beginning. + * It is main part of reverse llog processing. + * + * \param[in] env execution environment + * \param[in] loghandle llog handle of the current llog + * \param[in] prev_idx target index to find + * \param[in] buf pointer to data buffer to fill + * \param[in] len required len to read, it is llog_chunk_size usually. 
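+ *
+ * Unlike lop_next_block there is no retry logic here: llog_skip_over()
+ * is always called with force_mini_rec set, i.e. skipping is done in
+ * LLOG_MIN_REC_SIZE steps.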
+ * + * \retval 0 on successful buffer read + * \retval negative value on error + */ +static int llog_osd_prev_block(const struct lu_env *env, + struct llog_handle *loghandle, + int prev_idx, void *buf, int len) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_object *o; + struct dt_device *dt; + loff_t cur_offset; + __u32 chunk_size; + int rc; + + ENTRY; + + chunk_size = loghandle->lgh_hdr->llh_hdr.lrh_len; + if (len == 0 || len & (chunk_size - 1)) + RETURN(-EINVAL); + + CDEBUG(D_OTHER, "looking for log index %u\n", prev_idx); + + LASSERT(loghandle); + LASSERT(loghandle->lgh_ctxt); + + o = loghandle->lgh_obj; + LASSERT(o); + dt_read_lock(env, o, 0); + if (!llog_osd_exist(loghandle)) + GOTO(out, rc = -ESTALE); + + dt = lu2dt_dev(o->do_lu.lo_dev); + LASSERT(dt); + + /* Let's only use mini record size for previous block read + * for now XXX */ + cur_offset = chunk_size; + llog_skip_over(loghandle, &cur_offset, 0, prev_idx, + chunk_size, true); + + rc = dt_attr_get(env, o, &lgi->lgi_attr); + if (rc) + GOTO(out, rc); + + while (cur_offset < lgi->lgi_attr.la_size) { + struct llog_rec_hdr *rec, *last_rec; + struct llog_rec_tail *tail; + + lgi->lgi_buf.lb_len = len; + lgi->lgi_buf.lb_buf = buf; + rc = dt_read(env, o, &lgi->lgi_buf, &cur_offset); + if (rc < 0) { + CERROR("%s: can't read llog block from log "DFID + " offset %llu: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(lu_object_fid(&o->do_lu)), cur_offset, rc); + GOTO(out, rc); + } + + if (rc == 0) /* end of file, nothing to do */ + GOTO(out, rc); + + if (rc < sizeof(*tail)) { + CERROR("%s: invalid llog block at log id "DFID":%x " + "offset %llu\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), + loghandle->lgh_id.lgl_ogen, cur_offset); + GOTO(out, rc = -EINVAL); + } + + rec = buf; + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + + tail = (struct llog_rec_tail *)((char *)buf + rc - + sizeof(struct llog_rec_tail)); + /* get the last record in block */ + last_rec = (struct llog_rec_hdr *)((char *)buf + rc - + le32_to_cpu(tail->lrt_len)); + + if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec)) + lustre_swab_llog_rec(last_rec); + LASSERT(last_rec->lrh_index == tail->lrt_index); + + /* this shouldn't happen */ + if (tail->lrt_index == 0) { + CERROR("%s: invalid llog tail at log id "DFID":%x " + "offset %llu\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), + loghandle->lgh_id.lgl_ogen, cur_offset); + GOTO(out, rc = -EINVAL); + } + if (tail->lrt_index < prev_idx) + continue; + + /* sanity check that the start of the new buffer is no farther + * than the record that we wanted. This shouldn't happen. */ + if (rec->lrh_index > prev_idx) { + CERROR("%s: missed desired record? %u > %u\n", + o->do_lu.lo_dev->ld_obd->obd_name, + rec->lrh_index, prev_idx); + GOTO(out, rc = -ENOENT); + } + + /* Trim unsupported extensions for compat w/ older clients */ + changelog_block_trim_ext(rec, last_rec, loghandle); + + GOTO(out, rc = 0); + } + GOTO(out, rc = -EIO); +out: + dt_read_unlock(env, o); + return rc; +} + +/** + * This is helper function to get llog directory object. It is used by named + * llog operations to find/insert/delete llog entry from llog directory. 
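+ * If the context caches the directory in ctxt->loc_dir, that object is
+ * reused with an extra reference; otherwise the root directory of the
+ * underlying dt_device is located and returned.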
+ * + * \param[in] env execution environment + * \param[in] ctxt llog context + * + * \retval dt_object of llog directory + * \retval ERR_PTR of negative value on error + */ +static struct dt_object *llog_osd_dir_get(const struct lu_env *env, + struct llog_ctxt *ctxt) +{ + struct dt_device *dt; + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dir; + int rc; + + dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt; + if (ctxt->loc_dir == NULL) { + rc = dt_root_get(env, dt, &dti->dti_fid); + if (rc) + return ERR_PTR(rc); + dir = dt_locate(env, dt, &dti->dti_fid); + + if (!IS_ERR(dir) && !dt_try_as_dir(env, dir)) { + dt_object_put(env, dir); + return ERR_PTR(-ENOTDIR); + } + } else { + lu_object_get(&ctxt->loc_dir->do_lu); + dir = ctxt->loc_dir; + } + + return dir; +} + +/** + * Implementation of the llog_operations::lop_open + * + * This function opens the llog by its logid or by name, it may open also + * non existent llog and assing then new id to it. + * The llog_open/llog_close pair works similar to lu_object_find/put, + * the object may not exist prior open. The result of open is just dt_object + * in the llog header. + * + * \param[in] env execution environment + * \param[in] handle llog handle of the current llog + * \param[in] logid logid of llog to open (nameless llog) + * \param[in] name name of llog to open (named llog) + * \param[in] open_param + * LLOG_OPEN_NEW - new llog, may not exist + * LLOG_OPEN_EXIST - old llog, must exist + * + * \retval 0 on successful open, llog_handle::lgh_obj + * contains the dt_object of the llog. + * \retval negative value on error + */ +static int llog_osd_open(const struct lu_env *env, struct llog_handle *handle, + struct llog_logid *logid, char *name, + enum llog_open_param open_param) +{ + struct llog_thread_info *lgi = llog_info(env); + struct llog_ctxt *ctxt = handle->lgh_ctxt; + struct dt_object *o; + struct dt_device *dt; + struct ls_device *ls; + struct local_oid_storage *los = NULL; + int rc = 0; + bool new_id = false; + + ENTRY; + + LASSERT(env); + LASSERT(ctxt); + LASSERT(ctxt->loc_exp); + LASSERT(ctxt->loc_exp->exp_obd); + dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt; + LASSERT(dt); + if (ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) { + struct lu_object_conf conf = { 0 }; + if (logid != NULL) { + logid_to_fid(logid, &lgi->lgi_fid); + } else { + /* If logid == NULL, then it means the caller needs + * to allocate new FID (llog_cat_declare_add_rec()). */ + rc = dt_fid_alloc(env, dt, &lgi->lgi_fid, NULL, NULL); + if (rc < 0) + RETURN(rc); + rc = 0; + conf.loc_flags = LOC_F_NEW; + } + + o = dt_locate_at(env, dt, &lgi->lgi_fid, + dt->dd_lu_dev.ld_site->ls_top_dev, &conf); + if (IS_ERR(o)) + RETURN(PTR_ERR(o)); + + goto after_open; + } + + ls = ls_device_get(dt); + if (IS_ERR(ls)) + RETURN(PTR_ERR(ls)); + + mutex_lock(&ls->ls_los_mutex); + los = dt_los_find(ls, name != NULL ? 
FID_SEQ_LLOG_NAME : FID_SEQ_LLOG); + mutex_unlock(&ls->ls_los_mutex); + LASSERT(los); + ls_device_put(env, ls); + + LASSERT(handle); + + if (logid != NULL) { + logid_to_fid(logid, &lgi->lgi_fid); + } else if (name) { + struct dt_object *llog_dir; + + llog_dir = llog_osd_dir_get(env, ctxt); + if (IS_ERR(llog_dir)) + GOTO(out, rc = PTR_ERR(llog_dir)); + dt_read_lock(env, llog_dir, 0); + rc = dt_lookup_dir(env, llog_dir, name, &lgi->lgi_fid); + dt_read_unlock(env, llog_dir); + dt_object_put(env, llog_dir); + if (rc == -ENOENT && open_param == LLOG_OPEN_NEW) { + /* generate fid for new llog */ + rc = local_object_fid_generate(env, los, + &lgi->lgi_fid); + new_id = true; + } + if (rc < 0) + GOTO(out, rc); + OBD_ALLOC(handle->lgh_name, strlen(name) + 1); + if (handle->lgh_name) + strcpy(handle->lgh_name, name); + else + GOTO(out, rc = -ENOMEM); + } else { + LASSERTF(open_param & LLOG_OPEN_NEW, "%#x\n", open_param); + /* generate fid for new llog */ +generate: + rc = local_object_fid_generate(env, los, &lgi->lgi_fid); + if (rc < 0) + GOTO(out, rc); + new_id = true; + } + if (OBD_FAIL_PRECHECK(OBD_FAIL_MDS_LLOG_UMOUNT_RACE) && + cfs_fail_val == 1) { + cfs_fail_val = 2; + OBD_RACE(OBD_FAIL_MDS_LLOG_UMOUNT_RACE); + msleep(MSEC_PER_SEC); + } + o = ls_locate(env, ls, &lgi->lgi_fid, NULL); + if (IS_ERR(o)) + GOTO(out_name, rc = PTR_ERR(o)); + + if (dt_object_exists(o) && new_id) { + /* llog exists with just generated ID, e.g. some old llog file + * still is in use or is orphan, drop a warn and skip it. */ + CDEBUG(D_INFO, "%s: llog exists with the same FID: "DFID + ", skipping\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(lu_object_fid(&o->do_lu))); + dt_object_put(env, o); + /* just skip this llog ID, we shouldn't delete it because we + * don't know exactly what is its purpose and state. */ + goto generate; + } + +after_open: + /* No new llog is expected but doesn't exist */ + if (open_param != LLOG_OPEN_NEW && !dt_object_exists(o)) { + CDEBUG(D_INFO, "%s: llog FID: "DFID" obj %p doesn`t exist\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(lu_object_fid(&o->do_lu)), o); + GOTO(out_put, rc = -ENOENT); + } + fid_to_logid(&lgi->lgi_fid, &handle->lgh_id); + handle->lgh_obj = o; + handle->private_data = los; + LASSERT(handle->lgh_ctxt); + + RETURN(rc); + +out_put: + dt_object_put(env, o); +out_name: + if (handle->lgh_name != NULL) + OBD_FREE(handle->lgh_name, strlen(name) + 1); +out: + if (los != NULL) + dt_los_put(los); + RETURN(rc); +} + +/** + * Get dir for regular fid log object + * + * Get directory for regular fid log object, and these regular fid log + * object will be inserted under this directory, to satisfy the FS + * consistency check, e2fsck etc. + * + * \param [in] env execution environment + * \param [in] dto llog object + * + * \retval pointer to the directory if it is found. + * \retval ERR_PTR(negative errno) if it fails. 
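+ *
+ * The MDT index of the directory is resolved through the server-side
+ * FLD by looking up the sequence of \a dto, then the directory FID is
+ * built via lu_update_log_dir_fid().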
+ */ +struct dt_object *llog_osd_get_regular_fid_dir(const struct lu_env *env, + struct dt_object *dto) +{ + struct llog_thread_info *lgi = llog_info(env); + struct seq_server_site *ss = dto->do_lu.lo_dev->ld_site->ld_seq_site; + struct lu_seq_range *range = &lgi->lgi_range; + struct lu_fid *dir_fid = &lgi->lgi_fid; + struct dt_object *dir; + int rc; + ENTRY; + + fld_range_set_any(range); + LASSERT(ss != NULL); + rc = ss->ss_server_fld->lsf_seq_lookup(env, ss->ss_server_fld, + fid_seq(lu_object_fid(&dto->do_lu)), range); + if (rc < 0) + RETURN(ERR_PTR(rc)); + + lu_update_log_dir_fid(dir_fid, range->lsr_index); + dir = dt_locate(env, lu2dt_dev(dto->do_lu.lo_dev), dir_fid); + if (IS_ERR(dir)) + RETURN(dir); + + if (!dt_try_as_dir(env, dir)) { + dt_object_put(env, dir); + RETURN(ERR_PTR(-ENOTDIR)); + } + + RETURN(dir); +} + +/** + * Add llog object with regular FID to name entry + * + * Add llog object with regular FID to name space, and each llog + * object on each MDT will be /update_log_dir/[seq:oid:ver], + * so to satisfy the namespace consistency check, e2fsck etc. + * + * \param [in] env execution environment + * \param [in] dto llog object + * \param [in] th thandle + * \param [in] declare if it is declare or execution + * + * \retval 0 if insertion succeeds. + * \retval negative errno if insertion fails. + */ +static int +llog_osd_regular_fid_add_name_entry(const struct lu_env *env, + struct dt_object *dto, + struct thandle *th, bool declare) +{ + struct llog_thread_info *lgi = llog_info(env); + const struct lu_fid *fid = lu_object_fid(&dto->do_lu); + struct dt_insert_rec *rec = &lgi->lgi_dt_rec; + struct dt_object *dir; + char *name = lgi->lgi_name; + int rc; + ENTRY; + + if (!fid_is_norm(fid)) + RETURN(0); + + dir = llog_osd_get_regular_fid_dir(env, dto); + if (IS_ERR(dir)) + RETURN(PTR_ERR(dir)); + + rec->rec_fid = fid; + rec->rec_type = S_IFREG; + snprintf(name, sizeof(lgi->lgi_name), DFID, PFID(fid)); + dt_write_lock(env, dir, 0); + if (declare) { + rc = dt_declare_insert(env, dir, (struct dt_rec *)rec, + (struct dt_key *)name, th); + } else { + rc = dt_insert(env, dir, (struct dt_rec *)rec, + (struct dt_key *)name, th); + } + dt_write_unlock(env, dir); + + dt_object_put(env, dir); + RETURN(rc); +} + + +/** + * Implementation of the llog_operations::lop_declare_create + * + * This function declares the llog create. It declares also name insert + * into llog directory in case of named llog. 
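+ * For llogs with a regular FID (LLOG_CTXT_FLAG_NORMAL_FID) the create
+ * is declared directly on the dt_object together with a name entry
+ * under the update log directory, bypassing local storage.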
+ * + * \param[in] env execution environment + * \param[in] res llog handle of the current llog + * \param[in] th current transaction handle + * + * \retval 0 on successful create declaration + * \retval negative value on error + */ +static int llog_osd_declare_create(const struct lu_env *env, + struct llog_handle *res, struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_insert_rec *rec = &lgi->lgi_dt_rec; + struct local_oid_storage *los; + struct dt_object *o; + int rc; + + ENTRY; + + LASSERT(res->lgh_obj); + LASSERT(th); + + /* object can be created by another thread */ + o = res->lgh_obj; + if (dt_object_exists(o)) + RETURN(0); + + if (res->lgh_ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) { + struct llog_thread_info *lgi = llog_info(env); + + lgi->lgi_attr.la_valid = LA_MODE | LA_SIZE; + lgi->lgi_attr.la_size = 0; + lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG); + + rc = dt_declare_create(env, o, &lgi->lgi_attr, NULL, + &lgi->lgi_dof, th); + if (rc < 0) + RETURN(rc); + + + rc = llog_osd_regular_fid_add_name_entry(env, o, th, true); + + RETURN(rc); + } + los = res->private_data; + LASSERT(los); + + rc = llog_osd_declare_new_object(env, los, o, th); + if (rc) + RETURN(rc); + + /* do not declare header initialization here as it's declared + * in llog_osd_declare_write_rec() which is always called */ + + if (res->lgh_name) { + struct dt_object *llog_dir; + + llog_dir = llog_osd_dir_get(env, res->lgh_ctxt); + if (IS_ERR(llog_dir)) + RETURN(PTR_ERR(llog_dir)); + logid_to_fid(&res->lgh_id, &lgi->lgi_fid); + rec->rec_fid = &lgi->lgi_fid; + rec->rec_type = S_IFREG; + rc = dt_declare_insert(env, llog_dir, + (struct dt_rec *)rec, + (struct dt_key *)res->lgh_name, th); + dt_object_put(env, llog_dir); + if (rc) + CERROR("%s: can't declare named llog %s: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + res->lgh_name, rc); + } + RETURN(rc); +} + +/** + * Implementation of the llog_operations::lop_create + * + * This function creates the llog according with llog_handle::lgh_obj + * and llog_handle::lgh_name. 
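+ * The object may be created concurrently by another thread, therefore
+ * the create runs under dt_write_lock() and a loser returns -EEXIST.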
+ * + * \param[in] env execution environment + * \param[in] res llog handle of the current llog + * \param[in] th current transaction handle + * + * \retval 0 on successful create + * \retval negative value on error + */ +static int llog_osd_create(const struct lu_env *env, struct llog_handle *res, + struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_insert_rec *rec = &lgi->lgi_dt_rec; + struct local_oid_storage *los; + struct dt_object *o; + int rc = 0; + + ENTRY; + + LASSERT(env); + o = res->lgh_obj; + LASSERT(o); + + /* llog can be already created */ + if (dt_object_exists(o)) + RETURN(-EEXIST); + + if (res->lgh_ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) { + struct llog_thread_info *lgi = llog_info(env); + + lgi->lgi_attr.la_valid = LA_MODE | LA_SIZE | LA_TYPE; + lgi->lgi_attr.la_size = 0; + lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG); + + dt_write_lock(env, o, 0); + rc = dt_create(env, o, &lgi->lgi_attr, NULL, + &lgi->lgi_dof, th); + dt_write_unlock(env, o); + if (rc < 0) + RETURN(rc); + + rc = llog_osd_regular_fid_add_name_entry(env, o, th, false); + + RETURN(rc); + } + + los = res->private_data; + LASSERT(los); + + dt_write_lock(env, o, 0); + if (!dt_object_exists(o)) + rc = llog_osd_create_new_object(env, los, o, th); + else + rc = -EEXIST; + + dt_write_unlock(env, o); + if (rc) + RETURN(rc); + + if (res->lgh_name) { + struct dt_object *llog_dir; + + llog_dir = llog_osd_dir_get(env, res->lgh_ctxt); + if (IS_ERR(llog_dir)) + RETURN(PTR_ERR(llog_dir)); + + logid_to_fid(&res->lgh_id, &lgi->lgi_fid); + rec->rec_fid = &lgi->lgi_fid; + rec->rec_type = S_IFREG; + dt_read_lock(env, llog_dir, 0); + rc = dt_insert(env, llog_dir, (struct dt_rec *)rec, + (struct dt_key *)res->lgh_name, th); + dt_read_unlock(env, llog_dir); + dt_object_put(env, llog_dir); + if (rc) + CERROR("%s: can't create named llog %s: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + res->lgh_name, rc); + } + RETURN(rc); +} + +/** + * Implementation of the llog_operations::lop_close + * + * This function closes the llog. It just put llog object and referenced + * local storage. + * + * \param[in] env execution environment + * \param[in] handle llog handle of the current llog + * + * \retval 0 on successful llog close + * \retval negative value on error + */ +static int llog_osd_close(const struct lu_env *env, struct llog_handle *handle) +{ + struct local_oid_storage *los; + int rc = 0; + + ENTRY; + + LASSERT(handle->lgh_obj); + + if (handle->lgh_ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) { + /* Remove the object from the cache, otherwise it may + * hold LOD being released during cleanup process */ + dt_object_put_nocache(env, handle->lgh_obj); + LASSERT(handle->private_data == NULL); + RETURN(rc); + } else { + dt_object_put(env, handle->lgh_obj); + } + los = handle->private_data; + LASSERT(los); + dt_los_put(los); + + if (handle->lgh_name) + OBD_FREE(handle->lgh_name, strlen(handle->lgh_name) + 1); + + RETURN(rc); +} + +/** + * delete llog object name entry + * + * Delete llog object (with regular FID) from name space (under + * update_log_dir). + * + * \param [in] env execution environment + * \param [in] dto llog object + * \param [in] th thandle + * \param [in] declare if it is declare or execution + * + * \retval 0 if deletion succeeds. + * \retval negative errno if deletion fails. 
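+ *
+ * Llogs whose FID is not a regular (norm) one have no such name entry;
+ * for them this is a no-op.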
+ */ +static int +llog_osd_regular_fid_del_name_entry(const struct lu_env *env, + struct dt_object *dto, + struct thandle *th, bool declare) +{ + struct llog_thread_info *lgi = llog_info(env); + const struct lu_fid *fid = lu_object_fid(&dto->do_lu); + struct dt_object *dir; + char *name = lgi->lgi_name; + int rc; + ENTRY; + + if (!fid_is_norm(fid)) + RETURN(0); + + dir = llog_osd_get_regular_fid_dir(env, dto); + if (IS_ERR(dir)) + RETURN(PTR_ERR(dir)); + + snprintf(name, sizeof(lgi->lgi_name), DFID, PFID(fid)); + dt_write_lock(env, dir, 0); + if (declare) { + rc = dt_declare_delete(env, dir, (struct dt_key *)name, + th); + } else { + rc = dt_delete(env, dir, (struct dt_key *)name, th); + } + dt_write_unlock(env, dir); + + dt_object_put(env, dir); + RETURN(rc); +} + +/** + * Implementation of the llog_operations::lop_declare_destroy + * + * This function declare destroys the llog and deletes also entry in the + * llog directory in case of named llog. Llog should be opened prior that. + * + * \param[in] env execution environment + * \param[in] loghandle llog handle of the current llog + * + * \retval 0 on successful destroy + * \retval negative value on error + */ +static int llog_osd_declare_destroy(const struct lu_env *env, + struct llog_handle *loghandle, + struct thandle *th) +{ + struct llog_ctxt *ctxt; + struct dt_object *o, *llog_dir = NULL; + int rc; + + ENTRY; + + ctxt = loghandle->lgh_ctxt; + LASSERT(ctxt); + + o = loghandle->lgh_obj; + LASSERT(o); + + if (loghandle->lgh_name) { + llog_dir = llog_osd_dir_get(env, ctxt); + if (IS_ERR(llog_dir)) + RETURN(PTR_ERR(llog_dir)); + + rc = dt_declare_delete(env, llog_dir, + (struct dt_key *)loghandle->lgh_name, + th); + if (rc < 0) + GOTO(out_put, rc); + } + + rc = dt_declare_ref_del(env, o, th); + if (rc < 0) + GOTO(out_put, rc); + + rc = dt_declare_destroy(env, o, th); + if (rc < 0) + GOTO(out_put, rc); + + if (loghandle->lgh_ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) { + rc = llog_osd_regular_fid_del_name_entry(env, o, th, true); + if (rc < 0) + GOTO(out_put, rc); + } + +out_put: + if (!(IS_ERR_OR_NULL(llog_dir))) + dt_object_put(env, llog_dir); + + RETURN(rc); +} + + +/** + * Implementation of the llog_operations::lop_destroy + * + * This function destroys the llog and deletes also entry in the + * llog directory in case of named llog. Llog should be opened prior that. + * Destroy method is not part of external transaction and does everything + * inside. 
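+ * On success lgh_destroyed is set in the handle so that a following
+ * llog_osd_exist() reports the log as gone.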
+ * + * \param[in] env execution environment + * \param[in] loghandle llog handle of the current llog + * + * \retval 0 on successful destroy + * \retval negative value on error + */ +static int llog_osd_destroy(const struct lu_env *env, + struct llog_handle *loghandle, struct thandle *th) +{ + struct llog_ctxt *ctxt; + struct dt_object *o, *llog_dir = NULL; + int rc; + + ENTRY; + + ctxt = loghandle->lgh_ctxt; + LASSERT(ctxt != NULL); + + o = loghandle->lgh_obj; + LASSERT(o != NULL); + + dt_write_lock(env, o, 0); + if (!llog_osd_exist(loghandle)) + GOTO(out_unlock, rc = 0); + + if (loghandle->lgh_name) { + llog_dir = llog_osd_dir_get(env, ctxt); + if (IS_ERR(llog_dir)) + GOTO(out_unlock, rc = PTR_ERR(llog_dir)); + + dt_read_lock(env, llog_dir, 0); + rc = dt_delete(env, llog_dir, + (struct dt_key *)loghandle->lgh_name, + th); + dt_read_unlock(env, llog_dir); + if (rc) { + CERROR("%s: can't remove llog %s: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + loghandle->lgh_name, rc); + GOTO(out_unlock, rc); + } + } + + dt_ref_del(env, o, th); + rc = dt_destroy(env, o, th); + if (rc < 0) + GOTO(out_unlock, rc); + + loghandle->lgh_destroyed = true; + if (loghandle->lgh_ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) { + rc = llog_osd_regular_fid_del_name_entry(env, o, th, false); + if (rc < 0) + GOTO(out_unlock, rc); + } + +out_unlock: + dt_write_unlock(env, o); + if (!(IS_ERR_OR_NULL(llog_dir))) + dt_object_put(env, llog_dir); + RETURN(rc); +} + +/** + * Implementation of the llog_operations::lop_setup + * + * This function setup the llog on local storage. + * + * \param[in] env execution environment + * \param[in] obd obd device the llog belongs to + * \param[in] olg the llog group, it is always zero group now. + * \param[in] ctxt_idx the llog index, it defines the purpose of this llog. + * Every new llog type have to use own index. + * \param[in] disk_obd the storage obd, where llog is stored. + * + * \retval 0 on successful llog setup + * \retval negative value on error + */ +static int llog_osd_setup(const struct lu_env *env, struct obd_device *obd, + struct obd_llog_group *olg, int ctxt_idx, + struct obd_device *disk_obd) +{ + struct llog_thread_info *lgi = llog_info(env); + struct llog_ctxt *ctxt; + int rc = 0; + ENTRY; + + LASSERT(obd); + LASSERT(olg->olg_ctxts[ctxt_idx]); + + ctxt = llog_ctxt_get(olg->olg_ctxts[ctxt_idx]); + LASSERT(ctxt); + + if (disk_obd == NULL) + GOTO(out, rc = 0); + + /* initialize data allowing to generate new fids, + * literally we need a sequece */ + lgi->lgi_fid.f_seq = FID_SEQ_LLOG; + lgi->lgi_fid.f_oid = 1; + lgi->lgi_fid.f_ver = 0; + rc = local_oid_storage_init(env, disk_obd->obd_lvfs_ctxt.dt, + &lgi->lgi_fid, + &ctxt->loc_los_nameless); + if (rc != 0) + GOTO(out, rc); + + lgi->lgi_fid.f_seq = FID_SEQ_LLOG_NAME; + lgi->lgi_fid.f_oid = 1; + lgi->lgi_fid.f_ver = 0; + rc = local_oid_storage_init(env, disk_obd->obd_lvfs_ctxt.dt, + &lgi->lgi_fid, + &ctxt->loc_los_named); + if (rc != 0) { + local_oid_storage_fini(env, ctxt->loc_los_nameless); + ctxt->loc_los_nameless = NULL; + } + + GOTO(out, rc); + +out: + llog_ctxt_put(ctxt); + return rc; +} + +/** + * Implementation of the llog_operations::lop_cleanup + * + * This function cleanups the llog on local storage. 
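+ * Both local OID storages referenced by the context, the nameless and
+ * the named one, are released here.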
+ * + * \param[in] env execution environment + * \param[in] ctxt the llog context + * + * \retval 0 + */ +static int llog_osd_cleanup(const struct lu_env *env, struct llog_ctxt *ctxt) +{ + if (ctxt->loc_los_nameless != NULL) { + local_oid_storage_fini(env, ctxt->loc_los_nameless); + ctxt->loc_los_nameless = NULL; + } + + if (ctxt->loc_los_named != NULL) { + local_oid_storage_fini(env, ctxt->loc_los_named); + ctxt->loc_los_named = NULL; + } + + return 0; +} + +const struct llog_operations llog_osd_ops = { + .lop_next_block = llog_osd_next_block, + .lop_prev_block = llog_osd_prev_block, + .lop_read_header = llog_osd_read_header, + .lop_declare_destroy = llog_osd_declare_destroy, + .lop_destroy = llog_osd_destroy, + .lop_setup = llog_osd_setup, + .lop_cleanup = llog_osd_cleanup, + .lop_open = llog_osd_open, + .lop_exist = llog_osd_exist, + .lop_declare_create = llog_osd_declare_create, + .lop_create = llog_osd_create, + .lop_declare_write_rec = llog_osd_declare_write_rec, + .lop_write_rec = llog_osd_write_rec, + .lop_close = llog_osd_close, +}; +EXPORT_SYMBOL(llog_osd_ops); + +const struct llog_operations llog_common_cat_ops = { + .lop_next_block = llog_osd_next_block, + .lop_prev_block = llog_osd_prev_block, + .lop_read_header = llog_osd_read_header, + .lop_declare_destroy = llog_osd_declare_destroy, + .lop_destroy = llog_osd_destroy, + .lop_setup = llog_osd_setup, + .lop_cleanup = llog_osd_cleanup, + .lop_open = llog_osd_open, + .lop_exist = llog_osd_exist, + .lop_declare_create = llog_osd_declare_create, + .lop_create = llog_osd_create, + .lop_declare_write_rec = llog_osd_declare_write_rec, + .lop_write_rec = llog_osd_write_rec, + .lop_close = llog_osd_close, + .lop_add = llog_cat_add_rec, + .lop_declare_add = llog_cat_declare_add_rec, +}; +EXPORT_SYMBOL(llog_common_cat_ops); + +/** + * Read the special file which contains the list of llog catalogs IDs + * + * This function reads the CATALOGS file which contains the array of llog + * catalogs IDs. The main purpose of this file is to store OSP llogs indexed + * by OST/MDT number. + * + * \param[in] env execution environment + * \param[in] d corresponding storage device + * \param[in] idx position to start from, usually OST/MDT index + * \param[in] count how many catalog IDs to read + * \param[out] idarray the buffer for the data. If it is NULL then + * function returns just number of catalog IDs + * in the file. 
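+ *			That number is computed from the file size as
+ *			la_size / sizeof(struct llog_catid).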
+ * \param[in] fid LLOG_CATALOGS_OID for CATALOG object + * + * \retval 0 on successful read of catalog IDs + * \retval negative value on error + * \retval positive value which is number of records in + * the file if \a idarray is NULL + */ +int llog_osd_get_cat_list(const struct lu_env *env, struct dt_device *d, + int idx, int count, struct llog_catid *idarray, + const struct lu_fid *fid) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_object *o = NULL; + struct thandle *th; + int rc, size; + + ENTRY; + + LASSERT(d); + + size = sizeof(*idarray) * count; + lgi->lgi_off = idx * sizeof(*idarray); + + lgi->lgi_fid = *fid; + o = dt_locate(env, d, &lgi->lgi_fid); + if (IS_ERR(o)) + RETURN(PTR_ERR(o)); + + if (!dt_object_exists(o)) { + th = dt_trans_create(env, d); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + lgi->lgi_attr.la_valid = LA_MODE | LA_TYPE; + lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG); + + th->th_wait_submit = 1; + /* Make the llog object creation synchronization, so + * it will be reliable to the reference, especially + * for remote reference */ + th->th_sync = 1; + + rc = dt_declare_create(env, o, &lgi->lgi_attr, NULL, + &lgi->lgi_dof, th); + if (rc) + GOTO(out_trans, rc); + + rc = dt_trans_start_local(env, d, th); + if (rc) + GOTO(out_trans, rc); + + dt_write_lock(env, o, 0); + if (!dt_object_exists(o)) + rc = dt_create(env, o, &lgi->lgi_attr, NULL, + &lgi->lgi_dof, th); + dt_write_unlock(env, o); +out_trans: + dt_trans_stop(env, d, th); + if (rc) + GOTO(out, rc); + } + + rc = dt_attr_get(env, o, &lgi->lgi_attr); + if (rc) + GOTO(out, rc); + + if (!S_ISREG(lgi->lgi_attr.la_mode)) { + CERROR("%s: CATALOGS is not a regular file!: mode = %o\n", + o->do_lu.lo_dev->ld_obd->obd_name, + lgi->lgi_attr.la_mode); + GOTO(out, rc = -ENOENT); + } + + CDEBUG(D_CONFIG, "cat list: disk size=%d, read=%d\n", + (int)lgi->lgi_attr.la_size, size); + + /* return just number of llogs */ + if (idarray == NULL) { + rc = lgi->lgi_attr.la_size / sizeof(*idarray); + GOTO(out, rc); + } + + /* read for new ost index or for empty file */ + memset(idarray, 0, size); + if (lgi->lgi_attr.la_size <= lgi->lgi_off) + GOTO(out, rc = 0); + if (lgi->lgi_attr.la_size < lgi->lgi_off + size) + size = lgi->lgi_attr.la_size - lgi->lgi_off; + + lgi->lgi_buf.lb_buf = idarray; + lgi->lgi_buf.lb_len = size; + rc = dt_record_read(env, o, &lgi->lgi_buf, &lgi->lgi_off); + /* -EFAULT means the llog is a sparse file. This is not an error + * after arbitrary OST index is supported. */ + if (rc < 0 && rc != -EFAULT) { + CERROR("%s: error reading CATALOGS: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, rc); + GOTO(out, rc); + } + + EXIT; +out: + dt_object_put(env, o); + RETURN(rc); +} +EXPORT_SYMBOL(llog_osd_get_cat_list); + +/** + * Write the special file which contains the list of llog catalogs IDs + * + * This function writes the CATALOG file which contains the array of llog + * catalogs IDs. It is used mostly to store OSP llogs indexed by OST/MDT + * number. + * + * \param[in] env execution environment + * \param[in] d corresponding storage device + * \param[in] idx position to start from, usually OST/MDT index + * \param[in] count how many catalog IDs to write + * \param[out] idarray the buffer with the data to write. 
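+ *			count entries starting at position \a idx are
+ *			written at file offset idx * sizeof(*idarray).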
+ * \param[in] fid LLOG_CATALOGS_OID for CATALOG object + * + * \retval 0 on successful write of catalog IDs + * \retval negative value on error + */ +int llog_osd_put_cat_list(const struct lu_env *env, struct dt_device *d, + int idx, int count, struct llog_catid *idarray, + const struct lu_fid *fid) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_object *o = NULL; + struct thandle *th; + int rc, size; + + if (count == 0) + RETURN(0); + + LASSERT(d); + + size = sizeof(*idarray) * count; + lgi->lgi_off = idx * sizeof(*idarray); + lgi->lgi_fid = *fid; + + o = dt_locate(env, d, &lgi->lgi_fid); + if (IS_ERR(o)) + RETURN(PTR_ERR(o)); + + if (!dt_object_exists(o)) + GOTO(out, rc = -ENOENT); + + rc = dt_attr_get(env, o, &lgi->lgi_attr); + if (rc) + GOTO(out, rc); + + if (!S_ISREG(lgi->lgi_attr.la_mode)) { + CERROR("%s: CATALOGS is not a regular file!: mode = %o\n", + o->do_lu.lo_dev->ld_obd->obd_name, + lgi->lgi_attr.la_mode); + GOTO(out, rc = -ENOENT); + } + + th = dt_trans_create(env, d); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + lgi->lgi_buf.lb_len = size; + lgi->lgi_buf.lb_buf = idarray; + rc = dt_declare_record_write(env, o, &lgi->lgi_buf, lgi->lgi_off, th); + if (rc) + GOTO(out_trans, rc); + + /* For update log, this happens during initialization, + * see lod_sub_prep_llog(), and we need make sure catlog + * file ID is written to catlist file(committed) before + * cross-MDT operation write update records to catlog FILE, + * otherwise, during failover these update records might + * missing */ + if (fid_is_update_log(fid)) + th->th_sync = 1; + + rc = dt_trans_start_local(env, d, th); + if (rc) + GOTO(out_trans, rc); + + th->th_wait_submit = 1; + + rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th); + if (rc) + CDEBUG(D_INODE, "can't write CATALOGS at index %d: rc = %d\n", + idx, rc); +out_trans: + dt_trans_stop(env, d, th); +out: + dt_object_put(env, o); + RETURN(rc); +} +EXPORT_SYMBOL(llog_osd_put_cat_list); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_swab.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_swab.c new file mode 100644 index 0000000000000..67c0ce62713ce --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_swab.c @@ -0,0 +1,488 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/llog_swab.c + * + * Swabbing of llog datatypes (from disk or over the wire). 
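+ * The record swab helpers are called only when a record's magic
+ * indicates a foreign byte order, see LLOG_REC_HDR_NEEDS_SWABBING()
+ * in the callers.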
+ *
+ * Author: jacob berkman
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+#include <lustre_log.h>
+#include <lustre_update.h>
+
+static void print_llogd_body(struct llogd_body *d)
+{
+	CDEBUG(D_OTHER, "llogd body: %p\n", d);
+	CDEBUG(D_OTHER, "\tlgd_logid.lgl_oi.oi_fid: "DFID"\n",
+	       PFID(&d->lgd_logid.lgl_oi.oi_fid));
+	CDEBUG(D_OTHER, "\tlgd_logid.lgl_ogen: %#x\n", d->lgd_logid.lgl_ogen);
+	CDEBUG(D_OTHER, "\tlgd_ctxt_idx: %#x\n", d->lgd_ctxt_idx);
+	CDEBUG(D_OTHER, "\tlgd_llh_flags: %#x\n", d->lgd_llh_flags);
+	CDEBUG(D_OTHER, "\tlgd_index: %#x\n", d->lgd_index);
+	CDEBUG(D_OTHER, "\tlgd_saved_index: %#x\n", d->lgd_saved_index);
+	CDEBUG(D_OTHER, "\tlgd_len: %#x\n", d->lgd_len);
+	CDEBUG(D_OTHER, "\tlgd_cur_offset: %#llx\n", d->lgd_cur_offset);
+}
+
+void lustre_swab_lu_fid(struct lu_fid *fid)
+{
+	__swab64s(&fid->f_seq);
+	__swab32s(&fid->f_oid);
+	__swab32s(&fid->f_ver);
+}
+EXPORT_SYMBOL(lustre_swab_lu_fid);
+
+void lustre_swab_ost_id(struct ost_id *oid)
+{
+	if (fid_seq_is_mdt0(oid->oi.oi_seq) ||
+	    fid_seq_is_default(oid->oi.oi_seq)) {
+		__swab64s(&oid->oi.oi_id);
+		__swab64s(&oid->oi.oi_seq);
+	} else {
+		lustre_swab_lu_fid(&oid->oi_fid);
+	}
+}
+EXPORT_SYMBOL(lustre_swab_ost_id);
+
+void lustre_swab_llog_id(struct llog_logid *log_id)
+{
+	__swab64s(&log_id->lgl_oi.oi.oi_id);
+	__swab64s(&log_id->lgl_oi.oi.oi_seq);
+	__swab32s(&log_id->lgl_ogen);
+}
+
+void lustre_swab_llogd_body (struct llogd_body *d)
+{
+	ENTRY;
+	print_llogd_body(d);
+	lustre_swab_llog_id(&d->lgd_logid);
+	__swab32s(&d->lgd_ctxt_idx);
+	__swab32s(&d->lgd_llh_flags);
+	__swab32s(&d->lgd_index);
+	__swab32s(&d->lgd_saved_index);
+	__swab32s(&d->lgd_len);
+	__swab64s(&d->lgd_cur_offset);
+	print_llogd_body(d);
+	EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_llogd_body);
+
+void lustre_swab_llogd_conn_body (struct llogd_conn_body *d)
+{
+	__swab64s(&d->lgdc_gen.mnt_cnt);
+	__swab64s(&d->lgdc_gen.conn_cnt);
+	lustre_swab_llog_id(&d->lgdc_logid);
+	__swab32s(&d->lgdc_ctxt_idx);
+}
+EXPORT_SYMBOL(lustre_swab_llogd_conn_body);
+
+void lustre_swab_ll_fid(struct ll_fid *fid)
+{
+	__swab64s(&fid->id);
+	__swab32s(&fid->generation);
+	__swab32s(&fid->f_type);
+}
+
+void lustre_swab_lu_seq_range(struct lu_seq_range *range)
+{
+	__swab64s(&range->lsr_start);
+	__swab64s(&range->lsr_end);
+	__swab32s(&range->lsr_index);
+	__swab32s(&range->lsr_flags);
+}
+EXPORT_SYMBOL(lustre_swab_lu_seq_range);
+
+void lustre_swab_update_ops(struct update_ops *uops, unsigned int op_count)
+{
+	unsigned int i;
+	unsigned int j;
+
+	for (i = 0; i < op_count; i++) {
+		lustre_swab_lu_fid(&uops->uops_op[i].uop_fid);
+		__swab16s(&uops->uops_op[i].uop_type);
+		__swab16s(&uops->uops_op[i].uop_param_count);
+		for (j = 0; j < uops->uops_op[i].uop_param_count; j++)
+			__swab16s(&uops->uops_op[i].uop_params_off[j]);
+	}
+}
+EXPORT_SYMBOL(lustre_swab_update_ops);
+
+void lustre_swab_llog_rec(struct llog_rec_hdr *rec)
+{
+	struct llog_rec_tail *tail = NULL;
+
+	__swab32s(&rec->lrh_len);
+	__swab32s(&rec->lrh_index);
+	__swab32s(&rec->lrh_type);
+	__swab32s(&rec->lrh_id);
+
+	switch (rec->lrh_type) {
+	case OST_SZ_REC:
+	{
+		struct llog_size_change_rec *lsc =
+			(struct llog_size_change_rec *)rec;
+
+		lustre_swab_ll_fid(&lsc->lsc_fid);
+		__swab32s(&lsc->lsc_ioepoch);
+		tail = &lsc->lsc_tail;
+		break;
+	}
+	case MDS_UNLINK_REC:
+	{
+		struct llog_unlink_rec *lur = (struct llog_unlink_rec *)rec;
+
+		__swab64s(&lur->lur_oid);
+		__swab32s(&lur->lur_oseq);
+		__swab32s(&lur->lur_count);
+		tail = &lur->lur_tail;
+		break;
+	}
+	case MDS_UNLINK64_REC:
+	{
+		struct llog_unlink64_rec *lur =
+			(struct 
llog_unlink64_rec *)rec; + + lustre_swab_lu_fid(&lur->lur_fid); + __swab32s(&lur->lur_count); + tail = &lur->lur_tail; + break; + } + case CHANGELOG_REC: + { + struct llog_changelog_rec *cr = + (struct llog_changelog_rec *)rec; + + __swab16s(&cr->cr.cr_namelen); + __swab16s(&cr->cr.cr_flags); + __swab32s(&cr->cr.cr_type); + __swab64s(&cr->cr.cr_index); + __swab64s(&cr->cr.cr_prev); + __swab64s(&cr->cr.cr_time); + lustre_swab_lu_fid(&cr->cr.cr_tfid); + lustre_swab_lu_fid(&cr->cr.cr_pfid); + if (cr->cr.cr_flags & CLF_RENAME) { + struct changelog_ext_rename *rnm = + changelog_rec_rename(&cr->cr); + + lustre_swab_lu_fid(&rnm->cr_sfid); + lustre_swab_lu_fid(&rnm->cr_spfid); + } + /* + * Because the tail follows a variable-length structure we need + * to compute its location at runtime + */ + tail = (struct llog_rec_tail *)((char *)rec + + rec->lrh_len - sizeof(*tail)); + break; + } + + case CHANGELOG_USER_REC: + case CHANGELOG_USER_REC2: + { + struct llog_changelog_user_rec2 *cur = + (struct llog_changelog_user_rec2 *)rec; + + __swab32s(&cur->cur_id); + __swab64s(&cur->cur_endrec); + __swab32s(&cur->cur_time); + if (cur->cur_hdr.lrh_type == CHANGELOG_USER_REC2) { + __swab32s(&cur->cur_mask); + BUILD_BUG_ON(offsetof(typeof(*cur), cur_padding1) == 0); + BUILD_BUG_ON(offsetof(typeof(*cur), cur_padding2) == 0); + BUILD_BUG_ON(offsetof(typeof(*cur), cur_padding3) == 0); + } + tail = (struct llog_rec_tail *)((char *)rec + + rec->lrh_len - sizeof(*tail)); + break; + } + + case HSM_AGENT_REC: { + struct llog_agent_req_rec *arr = + (struct llog_agent_req_rec *)rec; + + __swab32s(&arr->arr_hai.hai_len); + __swab32s(&arr->arr_hai.hai_action); + lustre_swab_lu_fid(&arr->arr_hai.hai_fid); + lustre_swab_lu_fid(&arr->arr_hai.hai_dfid); + __swab64s(&arr->arr_hai.hai_cookie); + __swab64s(&arr->arr_hai.hai_extent.offset); + __swab64s(&arr->arr_hai.hai_extent.length); + __swab64s(&arr->arr_hai.hai_gid); + /* + * no swabing for opaque data + * hai_data[0]; + */ + break; + } + + case MDS_SETATTR64_REC: + { + struct llog_setattr64_rec *lsr = + (struct llog_setattr64_rec *)rec; + + lustre_swab_ost_id(&lsr->lsr_oi); + __swab32s(&lsr->lsr_uid); + __swab32s(&lsr->lsr_uid_h); + __swab32s(&lsr->lsr_gid); + __swab32s(&lsr->lsr_gid_h); + __swab64s(&lsr->lsr_valid); + + if (rec->lrh_len > sizeof(struct llog_setattr64_rec)) { + struct llog_setattr64_rec_v2 *lsr2 = + (struct llog_setattr64_rec_v2 *)rec; + + __swab32s(&lsr2->lsr_projid); + __swab32s(&lsr2->lsr_layout_version); + tail = &lsr2->lsr_tail; + } else { + tail = &lsr->lsr_tail; + } + break; + } + case OBD_CFG_REC: + /* these are swabbed as they are consumed */ + break; + case LLOG_HDR_MAGIC: + { + struct llog_log_hdr *llh = (struct llog_log_hdr *)rec; + + __swab64s(&llh->llh_timestamp); + __swab32s(&llh->llh_count); + __swab32s(&llh->llh_bitmap_offset); + __swab32s(&llh->llh_flags); + __swab32s(&llh->llh_size); + __swab32s(&llh->llh_cat_idx); + tail = LLOG_HDR_TAIL(llh); + break; + } + case LLOG_LOGID_MAGIC: + { + struct llog_logid_rec *lid = (struct llog_logid_rec *)rec; + + lustre_swab_llog_id(&lid->lid_id); + tail = &lid->lid_tail; + break; + } + case LLOG_GEN_REC: + { + struct llog_gen_rec *lgr = (struct llog_gen_rec *)rec; + + __swab64s(&lgr->lgr_gen.mnt_cnt); + __swab64s(&lgr->lgr_gen.conn_cnt); + tail = &lgr->lgr_tail; + break; + } + case LLOG_PAD_MAGIC: + break; + case UPDATE_REC: + { + struct llog_update_record *lur = + (struct llog_update_record *)rec; + struct update_records *record = &lur->lur_update_rec; + + __swab32s(&record->ur_flags); + 
__swab64s(&record->ur_batchid); + __swab64s(&record->ur_master_transno); + __swab32s(&record->ur_param_count); + __swab32s(&record->ur_update_count); + lustre_swab_update_ops(&record->ur_ops, + record->ur_update_count); + + /* Compute tail location. */ + tail = (struct llog_rec_tail *)((char *)record + + update_records_size(record)); + break; + } + default: + CERROR("Unknown llog rec type %#x swabbing rec %p\n", + rec->lrh_type, rec); + } + + if (tail) { + __swab32s(&tail->lrt_len); + __swab32s(&tail->lrt_index); + } +} + +static void print_llog_hdr(struct llog_log_hdr *h) +{ + CDEBUG(D_OTHER, "llog header: %p\n", h); + CDEBUG(D_OTHER, "\tllh_hdr.lrh_index: %#x\n", h->llh_hdr.lrh_index); + CDEBUG(D_OTHER, "\tllh_hdr.lrh_len: %#x\n", h->llh_hdr.lrh_len); + CDEBUG(D_OTHER, "\tllh_hdr.lrh_type: %#x\n", h->llh_hdr.lrh_type); + CDEBUG(D_OTHER, "\tllh_timestamp: %#llx\n", h->llh_timestamp); + CDEBUG(D_OTHER, "\tllh_count: %#x\n", h->llh_count); + CDEBUG(D_OTHER, "\tllh_bitmap_offset: %#x\n", h->llh_bitmap_offset); + CDEBUG(D_OTHER, "\tllh_flags: %#x\n", h->llh_flags); + CDEBUG(D_OTHER, "\tllh_size: %#x\n", h->llh_size); + CDEBUG(D_OTHER, "\tllh_cat_idx: %#x\n", h->llh_cat_idx); + CDEBUG(D_OTHER, "\tllh_tail.lrt_index: %#x\n", + LLOG_HDR_TAIL(h)->lrt_index); + CDEBUG(D_OTHER, "\tllh_tail.lrt_len: %#x\n", + LLOG_HDR_TAIL(h)->lrt_len); +} + +void lustre_swab_llog_hdr (struct llog_log_hdr *h) +{ + ENTRY; + print_llog_hdr(h); + + lustre_swab_llog_rec(&h->llh_hdr); + + print_llog_hdr(h); + EXIT; +} +EXPORT_SYMBOL(lustre_swab_llog_hdr); + +void print_lustre_cfg(struct lustre_cfg *lcfg) +{ + int i; + + ENTRY; + + if (!(libcfs_debug & D_OTHER)) /* don't loop on nothing */ + return; + + CDEBUG(D_OTHER, "lustre_cfg: %p\n", lcfg); + CDEBUG(D_OTHER, "\tlcfg->lcfg_version: %#x\n", lcfg->lcfg_version); + + CDEBUG(D_OTHER, "\tlcfg->lcfg_command: %#x\n", lcfg->lcfg_command); + CDEBUG(D_OTHER, "\tlcfg->lcfg_num: %#x\n", lcfg->lcfg_num); + CDEBUG(D_OTHER, "\tlcfg->lcfg_flags: %#x\n", lcfg->lcfg_flags); + CDEBUG(D_OTHER, "\tlcfg->lcfg_nid: %s\n", + libcfs_nid2str(lcfg->lcfg_nid)); + + CDEBUG(D_OTHER, "\tlcfg->lcfg_bufcount: %d\n", lcfg->lcfg_bufcount); + if (lcfg->lcfg_bufcount < LUSTRE_CFG_MAX_BUFCOUNT) + for (i = 0; i < lcfg->lcfg_bufcount; i++) { + CDEBUG(D_OTHER, "\tlcfg->lcfg_buflens[%d]: %d %s\n", + i, lcfg->lcfg_buflens[i], + lustre_cfg_string(lcfg, i)); + } + + EXIT; +} +EXPORT_SYMBOL(print_lustre_cfg); + +void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg) +{ + int i; + + ENTRY; + + __swab32s(&lcfg->lcfg_version); + + if (lcfg->lcfg_version != LUSTRE_CFG_VERSION) { + CERROR("not swabbing lustre_cfg version %#x (expecting %#x)\n", + lcfg->lcfg_version, LUSTRE_CFG_VERSION); + EXIT; + return; + } + + __swab32s(&lcfg->lcfg_command); + __swab32s(&lcfg->lcfg_num); + __swab32s(&lcfg->lcfg_flags); + __swab64s(&lcfg->lcfg_nid); + __swab32s(&lcfg->lcfg_bufcount); + for (i = 0; i < lcfg->lcfg_bufcount && i < LUSTRE_CFG_MAX_BUFCOUNT; i++) + __swab32s(&lcfg->lcfg_buflens[i]); + + print_lustre_cfg(lcfg); + EXIT; +} + +/* used only for compatibility with old on-disk cfg_marker data */ +struct cfg_marker32 { + __u32 cm_step; + __u32 cm_flags; + __u32 cm_vers; + __u32 padding; + __u32 cm_createtime; + __u32 cm_canceltime; + char cm_tgtname[MTI_NAME_MAXLEN]; + char cm_comment[MTI_NAME_MAXLEN]; +}; + +#define MTI_NAMELEN32 (MTI_NAME_MAXLEN - \ + (sizeof(struct cfg_marker) - sizeof(struct cfg_marker32))) + +void lustre_swab_cfg_marker(struct cfg_marker *marker, int swab, int size) +{ + struct cfg_marker32 *cm32 = (struct 
cfg_marker32 *)marker;
+
+	ENTRY;
+
+	if (swab) {
+		__swab32s(&marker->cm_step);
+		__swab32s(&marker->cm_flags);
+		__swab32s(&marker->cm_vers);
+	}
+	if (size == sizeof(*cm32)) {
+		__u32 createtime, canceltime;
+		/*
+		 * There was a problem with the original declaration of
+		 * cfg_marker on 32-bit systems because it used a 32-bit
+		 * time as a wire protocol structure, and didn't verify
+		 * this in wirecheck. We now have to convert the offsets of
+		 * the later fields in order to work on 32- and 64-bit
+		 * systems.
+		 *
+		 * Fortunately, the cm_comment field has no functional use
+		 * so can be sacrificed when converting the timestamp size.
+		 *
+		 * Overwrite fields from the end first, so they are not
+		 * clobbered, and use memmove() instead of memcpy() because
+		 * the source and target buffers overlap. bug 16771
+		 */
+		createtime = cm32->cm_createtime;
+		canceltime = cm32->cm_canceltime;
+		memmove(marker->cm_comment, cm32->cm_comment, MTI_NAMELEN32);
+		marker->cm_comment[MTI_NAMELEN32 - 1] = '\0';
+		memmove(marker->cm_tgtname, cm32->cm_tgtname,
+			sizeof(marker->cm_tgtname));
+		if (swab) {
+			__swab32s(&createtime);
+			__swab32s(&canceltime);
+		}
+		marker->cm_createtime = createtime;
+		marker->cm_canceltime = canceltime;
+		CDEBUG(D_CONFIG,
+		       "Find old cfg_marker(Srv32b,Clt64b) for target %s, converting\n",
+		       marker->cm_tgtname);
+	} else if (swab) {
+		__swab64s(&marker->cm_createtime);
+		__swab64s(&marker->cm_canceltime);
+	}
+
+	EXIT;
+}
diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_test.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_test.c
new file mode 100644
index 0000000000000..13d8531fa6d9b
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_test.c
@@ -0,0 +1,2288 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lustre/obdclass/llog_test.c
+ *
+ * Author: Phil Schwan
+ * Author: Mikhail Pershin
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
+
+#include <obd_class.h>
+#include <lustre_fid.h>
+#include <llog_swab.h>
+
+/* This is slightly more than the number of records that can fit into a
+ * single llog file, because the llog_log_header takes up some of the
+ * space in the first block that cannot be used for the bitmap.
+ 
*/ +static int llog_test_recnum = (LLOG_MIN_CHUNK_SIZE * 8); +static int llog_test_rand; +static struct obd_uuid uuid = { .uuid = "test_uuid" }; +static struct llog_logid cat_logid; + +struct llog_mini_rec { + struct llog_rec_hdr lmr_hdr; + struct llog_rec_tail lmr_tail; +} __attribute__((packed)); + +static int verify_handle(char *test, struct llog_handle *llh, int num_recs) +{ + int i; + int last_idx = 0; + int active_recs = 0; + + for (i = 0; i < LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr); i++) { + if (test_bit_le(i, LLOG_HDR_BITMAP(llh->lgh_hdr))) { + last_idx = i; + active_recs++; + } + } + + /* check the llog is sane at first, llh_count and lgh_last_idx*/ + if (llh->lgh_hdr->llh_count != active_recs) { + CERROR("%s: handle->count is %d, but there are %d recs found\n", + test, llh->lgh_hdr->llh_count, active_recs); + RETURN(-ERANGE); + } + + if (llh->lgh_last_idx != LLOG_HDR_TAIL(llh->lgh_hdr)->lrt_index || + (!(llh->lgh_hdr->llh_flags & LLOG_F_IS_CAT) && + llh->lgh_last_idx < last_idx)) { + CERROR("%s: lgh_last_idx is %d (%d in the header), last found %d\n", + test, llh->lgh_last_idx, + LLOG_HDR_TAIL(llh->lgh_hdr)->lrt_index, last_idx); + RETURN(-ERANGE); + } + + /* finally checks against expected value from the caller */ + if (active_recs != num_recs) { + CERROR("%s: expected %d active recs after write, found %d\n", + test, num_recs, active_recs); + RETURN(-ERANGE); + } + + RETURN(0); +} + +/* Test named-log create/open, close */ +static int llog_test_1(const struct lu_env *env, + struct obd_device *obd, char *name) +{ + struct llog_handle *llh; + struct llog_ctxt *ctxt; + int rc; + int rc2; + + ENTRY; + + CWARN("1a: create a log with name: %s\n", name); + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + rc = llog_open_create(env, ctxt, &llh, NULL, name); + if (rc) { + CERROR("1a: llog_create with name %s failed: %d\n", name, rc); + GOTO(out, rc); + } + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, &uuid); + if (rc) { + CERROR("1a: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + + rc = verify_handle("1", llh, 1); + + CWARN("1b: close newly-created log\n"); +out_close: + rc2 = llog_close(env, llh); + if (rc2) { + CERROR("1b: close log %s failed: %d\n", name, rc2); + if (rc == 0) + rc = rc2; + } +out: + llog_ctxt_put(ctxt); + RETURN(rc); +} + +static int test_2_cancel_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + return LLOG_DEL_RECORD; +} + +/* Test named-log reopen; returns opened log on success */ +static int llog_test_2(const struct lu_env *env, struct obd_device *obd, + char *name, struct llog_handle **llh) +{ + struct llog_ctxt *ctxt; + struct llog_handle *lgh; + struct llog_logid logid; + int rc; + struct llog_mini_rec lmr; + + ENTRY; + + CWARN("2a: re-open a log with name: %s\n", name); + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + rc = llog_open(env, ctxt, llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("2a: re-open log with name %s failed: %d\n", name, rc); + GOTO(out_put, rc); + } + + rc = llog_init_handle(env, *llh, LLOG_F_IS_PLAIN, &uuid); + if (rc) { + CERROR("2a: can't init llog handle: %d\n", rc); + GOTO(out_close_llh, rc); + } + + rc = verify_handle("2", *llh, 1); + if (rc) + GOTO(out_close_llh, rc); + + CWARN("2b: create a log without specified NAME & LOGID\n"); + rc = llog_open_create(env, ctxt, &lgh, NULL, NULL); + if (rc) { + CERROR("2b: create log failed\n"); + GOTO(out_close_llh, rc); + } + rc = llog_init_handle(env, lgh, LLOG_F_IS_PLAIN, 
&uuid); + if (rc) { + CERROR("2b: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + + logid = lgh->lgh_id; + + lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; + lmr.lmr_hdr.lrh_type = LLOG_OP_MAGIC; + + /* Check llog header values are correct after record add/cancel */ + CWARN("2b: write 1 llog records, check llh_count\n"); + rc = llog_write(env, lgh, &lmr.lmr_hdr, LLOG_NEXT_IDX); + if (rc < 0) + GOTO(out_close, rc); + + /* in-memory values after record addition */ + rc = verify_handle("2b", lgh, 2); + if (rc < 0) + GOTO(out_close, rc); + + /* re-open llog to read on-disk values */ + llog_close(env, lgh); + + CWARN("2c: re-open the log by LOGID and verify llh_count\n"); + rc = llog_open(env, ctxt, &lgh, &logid, NULL, LLOG_OPEN_EXISTS); + if (rc < 0) { + CERROR("2c: re-open log by LOGID failed\n"); + GOTO(out_close_llh, rc); + } + + rc = llog_init_handle(env, lgh, LLOG_F_IS_PLAIN, &uuid); + if (rc < 0) { + CERROR("2c: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + + /* check values just read from disk */ + rc = verify_handle("2c", lgh, 2); + if (rc < 0) + GOTO(out_close, rc); + + rc = llog_process(env, lgh, test_2_cancel_cb, NULL, NULL); + if (rc < 0) + GOTO(out_close, rc); + + /* in-memory values */ + rc = verify_handle("2c", lgh, 1); + if (rc < 0) + GOTO(out_close, rc); + + /* re-open llog to get on-disk values */ + llog_close(env, lgh); + + rc = llog_open(env, ctxt, &lgh, &logid, NULL, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("2c: re-open log by LOGID failed\n"); + GOTO(out_close_llh, rc); + } + + rc = llog_init_handle(env, lgh, LLOG_F_IS_PLAIN, &uuid); + if (rc) { + CERROR("2c: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + + /* on-disk values after llog re-open */ + rc = verify_handle("2c", lgh, 1); + if (rc < 0) + GOTO(out_close, rc); + + CWARN("2d: destroy this log\n"); + rc = llog_destroy(env, lgh); + if (rc) + CERROR("2d: destroy log failed\n"); +out_close: + llog_close(env, lgh); +out_close_llh: + if (rc) + llog_close(env, *llh); +out_put: + llog_ctxt_put(ctxt); + + RETURN(rc); +} + +static int test_3_rec_num; +static off_t test_3_rec_off; +static int test_3_paddings; +static int test_3_start_idx; + +/* + * Test 3 callback. + * - check lgh_cur_offset correctness + * - check record index consistency + * - modify each record in-place + * - add new record during *last_idx processing + */ +static int test3_check_n_add_cb(const struct lu_env *env, + struct llog_handle *lgh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_gen_rec *lgr = (struct llog_gen_rec *)rec; + int *last_rec = data; + unsigned cur_idx = test_3_start_idx + test_3_rec_num; + int rc; + + if (lgh->lgh_hdr->llh_flags & LLOG_F_IS_FIXSIZE) { + LASSERT(lgh->lgh_hdr->llh_size > 0); + if (lgh->lgh_cur_offset != lgh->lgh_hdr->llh_hdr.lrh_len + + (cur_idx - 1) * lgh->lgh_hdr->llh_size) + CERROR("Wrong record offset in cur_off: %llu, should be %u\n", + lgh->lgh_cur_offset, + lgh->lgh_hdr->llh_hdr.lrh_len + + (cur_idx - 1) * lgh->lgh_hdr->llh_size); + } else { + size_t chunk_size = lgh->lgh_hdr->llh_hdr.lrh_len; + + /* + * For variable size records the start offset is unknown, trust + * the first value and check others are consistent with it. 
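+		 * A padding record can sit at the end of a chunk, in which
+		 * case the next offset lands exactly on a chunk boundary;
+		 * the check below allows for that case.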
+ */ + if (test_3_rec_off == 0) + test_3_rec_off = lgh->lgh_cur_offset; + + if (lgh->lgh_cur_offset != test_3_rec_off) { + __u64 tmp = lgh->lgh_cur_offset; + + /* there can be padding record */ + if ((do_div(tmp, chunk_size) == 0) && + (lgh->lgh_cur_offset - test_3_rec_off < + rec->lrh_len + LLOG_MIN_REC_SIZE)) { + test_3_rec_off = lgh->lgh_cur_offset; + test_3_paddings++; + } else { + CERROR("Wrong record offset in cur_off: %llu" + ", should be %lld (rec len %u)\n", + lgh->lgh_cur_offset, + (long long)test_3_rec_off, + rec->lrh_len); + } + } + test_3_rec_off += rec->lrh_len; + } + + cur_idx += test_3_paddings; + if (cur_idx != rec->lrh_index) + CERROR("Record with wrong index was read: %u, expected %u\n", + rec->lrh_index, cur_idx); + + /* modify all records in place */ + lgr->lgr_gen.conn_cnt = rec->lrh_index; + rc = llog_write(env, lgh, rec, rec->lrh_index); + if (rc < 0) + CERROR("cb_test_3: cannot modify record while processing\n"); + + /* + * Add new record to the llog at *last_rec position one by one to + * check that last block is re-read during processing + */ + if (cur_idx == *last_rec || cur_idx == (*last_rec + 1)) { + rc = llog_write(env, lgh, rec, LLOG_NEXT_IDX); + if (rc < 0) + CERROR("cb_test_3: cannot add new record while " + "processing\n"); + } + test_3_rec_num++; + + return rc; +} + +/* Check in-place modifications were done for all records*/ +static int test3_check_cb(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_gen_rec *lgr = (struct llog_gen_rec *)rec; + + if (lgr->lgr_gen.conn_cnt != rec->lrh_index) { + CERROR("cb_test_3: record %u is not modified\n", + rec->lrh_index); + return -EINVAL; + } + test_3_rec_num++; + return 0; +} + +static int llog_test3_process(const struct lu_env *env, + struct llog_handle *lgh, + llog_cb_t cb, int start) +{ + struct llog_process_cat_data cd; + int last_idx; /* new record will be injected here */ + int rc = 0; + + CWARN("test3: processing records from index %d to the end\n", + start); + cd.lpcd_read_mode = LLOG_READ_MODE_NORMAL; + cd.lpcd_first_idx = start - 1; + cd.lpcd_last_idx = 0; + test_3_rec_num = test_3_paddings = 0; + last_idx = lgh->lgh_last_idx; + rc = llog_process(env, lgh, cb, &last_idx, &cd); + if (rc < 0) + return rc; + CWARN("test3: total %u records processed with %u paddings\n", + test_3_rec_num, test_3_paddings); + return test_3_rec_num; +} + +/* Test plain llog functionality */ +static int llog_test_3(const struct lu_env *env, struct obd_device *obd, + struct llog_handle *llh) +{ + char buf[128]; + struct llog_rec_hdr *hdr = (void *)buf; + int rc, i; + int num_recs = 1; /* 1 for the header */ + int expected; + + ENTRY; + + hdr->lrh_len = sizeof(struct llog_gen_rec); + hdr->lrh_type = LLOG_GEN_REC; + llh->lgh_hdr->llh_size = sizeof(struct llog_gen_rec); + llh->lgh_hdr->llh_flags |= LLOG_F_IS_FIXSIZE; + + /* + * Fill the llog with 64-bytes records, use 1023 records, + * so last chunk will be partially full. Don't change this + * value until record size is changed. 
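+	 * (Assuming the default 8192-byte LLOG_MIN_CHUNK_SIZE, a chunk
+	 * holds 128 such 64-byte records and 1023 = 7 * 128 + 127, so
+	 * the last chunk ends up one record short.)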
+ */
+	CWARN("3a: write 1023 fixed-size llog records\n");
+	for (i = 0; i < 1023; i++) {
+		rc = llog_write(env, llh, hdr, LLOG_NEXT_IDX);
+		if (rc < 0) {
+			CERROR("3a: write 1023 records failed at #%d: %d\n",
+			       i + 1, rc);
+			RETURN(rc);
+		}
+		num_recs++;
+	}
+
+	rc = verify_handle("3a", llh, num_recs);
+	if (rc)
+		RETURN(rc);
+
+	/*
+	 * Test fixed-size records processing:
+	 * - search the needed index
+	 * - go through all records from that index
+	 * - check all indices are growing monotonically and exist
+	 * - modify each record
+	 *
+	 * NB: test3_check_n_add adds two new records while processing
+	 * after last record. There were 1023 records created so the last chunk
+	 * misses exactly one record. Therefore one of the new records will be
+	 * the last in the current chunk and the second causes a new chunk to
+	 * be created.
+	 */
+	test_3_rec_off = 0;
+	test_3_start_idx = 501;
+	expected = 525;
+	rc = llog_test3_process(env, llh, test3_check_n_add_cb,
+				test_3_start_idx);
+	if (rc < 0)
+		RETURN(rc);
+
+	/* extra record is created during llog_process() */
+	if (rc != expected) {
+		CERROR("3a: process total %d records but expect %d\n",
+		       rc, expected);
+		RETURN(-ERANGE);
+	}
+
+	num_recs += 2;
+
+	/* test modification in place */
+	rc = llog_test3_process(env, llh, test3_check_cb, test_3_start_idx);
+	if (rc < 0)
+		RETURN(rc);
+
+	if (rc != expected) {
+		CERROR("3a: process total %d records but expect %d\n",
+		       rc, expected);
+		RETURN(-ERANGE);
+	}
+
+	CWARN("3b: write 566 variable size llog records\n");
+
+	/*
+	 * Drop llh_size to 0 to mark llog as variable-size and write
+	 * header to make this change permanent.
+	 */
+	llh->lgh_hdr->llh_flags &= ~LLOG_F_IS_FIXSIZE;
+	llog_write(env, llh, &llh->lgh_hdr->llh_hdr, LLOG_HEADER_IDX);
+
+	hdr->lrh_type = OBD_CFG_REC;
+
+	/*
+	 * there are 1025 64-byte records in the llog already,
+	 * the last chunk contains a single record, i.e. 64 bytes.
+	 * Each pair of variable size records is 200 bytes, so
+	 * we will have the following distribution per chunks:
+	 * block 1: 64 + 80(80/120) + 80 + 48(pad) = 81 iterations
+	 * block 2: 80(120/80) + 120 + 72(pad) = 81 iterations
+	 * block 3: 80(80/120) + 80 + 112(pad) = 81 iterations
+	 * -- the same as block 2 again and so on.
+	 * block 7: 80(80/120) = 80 iterations and 192 bytes remain
+	 * Total 6 * 81 + 80 = 566 iterations.
+	 * Callback will add another 120 bytes in the end of the last chunk
+	 * and another 120 bytes will cause padding (72 bytes) plus 120
+	 * bytes in the new block.
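+	 * The pad records are counted by test3_check_n_add_cb() via
+	 * test_3_paddings, keeping its record-index checks aligned.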
+ */ + for (i = 0; i < 566; i++) { + if ((i % 2) == 0) + hdr->lrh_len = 80; + else + hdr->lrh_len = 120; + + rc = llog_write(env, llh, hdr, LLOG_NEXT_IDX); + if (rc < 0) { + CERROR("3b: write 566 records failed at #%d: %d\n", + i + 1, rc); + RETURN(rc); + } + num_recs++; + } + + rc = verify_handle("3b", llh, num_recs); + if (rc) + RETURN(rc); + + test_3_start_idx = 1026; + expected = 568; + rc = llog_test3_process(env, llh, test3_check_n_add_cb, + test_3_start_idx); + if (rc < 0) + RETURN(rc); + + if (rc != expected) { + CERROR("3b: process total %d records but expect %d\n", + rc, expected); + RETURN(-ERANGE); + } + + num_recs += 2; + + /* test modification in place */ + rc = llog_test3_process(env, llh, test3_check_cb, test_3_start_idx); + if (rc < 0) + RETURN(rc); + + if (rc != expected) { + CERROR("3b: process total %d records but expect %d\n", + rc, expected); + RETURN(-ERANGE); + } + + CWARN("3c: write records with variable size until BITMAP_SIZE, " + "return -ENOSPC\n"); + while (num_recs < LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr)) { + if ((num_recs % 2) == 0) + hdr->lrh_len = 80; + else + hdr->lrh_len = 128; + + rc = llog_write(env, llh, hdr, LLOG_NEXT_IDX); + if (rc == -ENOSPC) { + break; + } else if (rc < 0) { + CERROR("3c: write recs failed at #%d: %d\n", + num_recs, rc); + RETURN(rc); + } + num_recs++; + } + + if (rc != -ENOSPC) { + CWARN("3c: write record more than BITMAP size!\n"); + RETURN(-EINVAL); + } + CWARN("3c: wrote %d more records before end of llog is reached\n", + num_recs); + + rc = verify_handle("3c", llh, num_recs); + + RETURN(rc); +} + +/* Test catalogue additions */ +static int llog_test_4(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_handle *cath, *llh; + char name[10]; + int rc, rc2, i, buflen; + struct llog_mini_rec lmr; + struct llog_cookie cookie; + struct llog_ctxt *ctxt; + int num_recs = 0; + char *buf; + struct llog_rec_hdr *rec; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; + lmr.lmr_hdr.lrh_type = LLOG_OP_MAGIC; + + sprintf(name, "%x", llog_test_rand + 1); + CWARN("4a: create a catalog log with name: %s\n", name); + rc = llog_open_create(env, ctxt, &cath, NULL, name); + if (rc) { + CERROR("4a: llog_create with name %s failed: %d\n", name, rc); + GOTO(ctxt_release, rc); + } + rc = llog_init_handle(env, cath, LLOG_F_IS_CAT, &uuid); + if (rc) { + CERROR("4a: can't init llog handle: %d\n", rc); + GOTO(out, rc); + } + + num_recs++; + cat_logid = cath->lgh_id; + + CWARN("4b: write 1 record into the catalog\n"); + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, &cookie); + if (rc != 1) { + CERROR("4b: write 1 catalog record failed at: %d\n", rc); + GOTO(out, rc); + } + num_recs++; + rc = verify_handle("4b", cath, 2); + if (rc) + GOTO(out, rc); + + rc = verify_handle("4b", cath->u.chd.chd_current_log, num_recs); + if (rc) + GOTO(out, rc); + + /* estimate the max number of record for the plain llog + * cause it depends on disk size + */ + llh = cath->u.chd.chd_current_log; + if (llh->lgh_max_size != 0) { + llog_test_recnum = (llh->lgh_max_size - + sizeof(struct llog_log_hdr)) / LLOG_MIN_REC_SIZE; + } + + if (llog_test_recnum >= LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr)) + llog_test_recnum = LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr) - 1; + + CWARN("4c: cancel 1 log record\n"); + rc = llog_cat_cancel_records(env, cath, 1, &cookie); + if (rc) { + CERROR("4c: cancel 1 catalog based record failed: %d\n", rc); + GOTO(out, rc); + } + num_recs--; + + rc = 
verify_handle("4c", cath->u.chd.chd_current_log, num_recs); + if (rc) + GOTO(out, rc); + + CWARN("4d: write %d more log records\n", llog_test_recnum); + for (i = 0; i < llog_test_recnum; i++) { + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); + if (rc) { + CERROR("4d: write %d records failed at #%d: %d\n", + llog_test_recnum, i + 1, rc); + GOTO(out, rc); + } + num_recs++; + } + + /* make sure new plain llog appears */ + rc = verify_handle("4d", cath, 3); + if (rc) + GOTO(out, rc); + + CWARN("4e: add 5 large records, one record per block\n"); + buflen = LLOG_MIN_CHUNK_SIZE; + OBD_ALLOC(buf, buflen); + if (buf == NULL) + GOTO(out, rc = -ENOMEM); + for (i = 0; i < 5; i++) { + rec = (void *)buf; + rec->lrh_len = buflen; + rec->lrh_type = OBD_CFG_REC; + rc = llog_cat_add(env, cath, rec, NULL); + if (rc) { + CERROR("4e: write 5 records failed at #%d: %d\n", + i + 1, rc); + GOTO(out_free, rc); + } + num_recs++; + } +out_free: + OBD_FREE(buf, buflen); +out: + CWARN("4f: put newly-created catalog\n"); + rc2 = llog_cat_close(env, cath); + if (rc2) { + CERROR("4: close log %s failed: %d\n", name, rc2); + if (rc == 0) + rc = rc2; + } +ctxt_release: + llog_ctxt_put(ctxt); + RETURN(rc); +} + +static int cat_counter; + +static int cat_print_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct lu_fid fid = {0}; + + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + RETURN(-EINVAL); + } + + logid_to_fid(&lir->lid_id, &fid); + + CWARN("seeing record at index %d - "DFID" in log "DFID"\n", + rec->lrh_index, PFID(&fid), + PFID(lu_object_fid(&llh->lgh_obj->do_lu))); + + cat_counter++; + + RETURN(0); +} + +static int plain_counter; + +static int plain_print_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct lu_fid fid = {0}; + + if (!(llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN)) { + CERROR("log is not plain\n"); + RETURN(-EINVAL); + } + + logid_to_fid(&llh->lgh_id, &fid); + + CDEBUG(D_INFO, "seeing record at index %d in log "DFID"\n", + rec->lrh_index, PFID(&fid)); + + plain_counter++; + + RETURN(0); +} + +static int cancel_count; + +static int llog_cancel_rec_cb(const struct lu_env *env, + struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_cookie cookie; + + if (!(llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN)) { + CERROR("log is not plain\n"); + RETURN(-EINVAL); + } + + cookie.lgc_lgl = llh->lgh_id; + cookie.lgc_index = rec->lrh_index; + + llog_cat_cancel_records(env, llh->u.phd.phd_cat_handle, 1, &cookie); + cancel_count++; + if (cancel_count == llog_test_recnum) + RETURN(-LLOG_EEMPTY); + RETURN(0); +} + +/* Test log and catalogue processing */ +static int llog_test_5(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_handle *llh = NULL; + char name[10]; + int rc, rc2; + struct llog_mini_rec lmr; + struct llog_ctxt *ctxt; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; + lmr.lmr_hdr.lrh_type = LLOG_OP_MAGIC; + + CWARN("5a: re-open catalog by id\n"); + rc = llog_open(env, ctxt, &llh, &cat_logid, NULL, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("5a: llog_create with logid failed: %d\n", rc); + GOTO(out_put, rc); + } + + rc = llog_init_handle(env, llh, LLOG_F_IS_CAT, &uuid); + if (rc) { + CERROR("5a: can't init llog handle: %d\n", rc); + GOTO(out, rc); + } + + CWARN("5b: 
print the catalog entries.. we expect 2\n"); + cat_counter = 0; + rc = llog_process(env, llh, cat_print_cb, "test 5", NULL); + if (rc) { + CERROR("5b: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 2) { + CERROR("5b: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + CWARN("5c: Cancel %d records, see one log zapped\n", llog_test_recnum); + cancel_count = 0; + rc = llog_cat_process(env, llh, llog_cancel_rec_cb, "foobar", 0, 0); + if (rc != -LLOG_EEMPTY) { + CERROR("5c: process with llog_cancel_rec_cb failed: %d\n", rc); + GOTO(out, rc); + } + + CWARN("5c: print the catalog entries.. we expect 1\n"); + cat_counter = 0; + rc = llog_process(env, llh, cat_print_cb, "test 5", NULL); + if (rc) { + CERROR("5c: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 1) { + CERROR("5c: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + CWARN("5d: add 1 record to the log with many canceled empty pages\n"); + rc = llog_cat_add(env, llh, &lmr.lmr_hdr, NULL); + if (rc) { + CERROR("5d: add record to the log with many canceled empty " + "pages failed\n"); + GOTO(out, rc); + } + + CWARN("5e: print plain log entries.. expect 6\n"); + plain_counter = 0; + rc = llog_cat_process(env, llh, plain_print_cb, "foobar", 0, 0); + if (rc) { + CERROR("5e: process with plain_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (plain_counter != 6) { + CERROR("5e: found %d records\n", plain_counter); + GOTO(out, rc = -EINVAL); + } + + CWARN("5f: print plain log entries reversely.. expect 6\n"); + plain_counter = 0; + rc = llog_cat_reverse_process(env, llh, plain_print_cb, "foobar"); + if (rc) { + CERROR("5f: reversely process with plain_print_cb failed: " + "%d\n", rc); + GOTO(out, rc); + } + if (plain_counter != 6) { + CERROR("5f: found %d records\n", plain_counter); + GOTO(out, rc = -EINVAL); + } + +out: + CWARN("5g: close re-opened catalog\n"); + rc2 = llog_cat_close(env, llh); + if (rc2) { + CERROR("5g: close log %s failed: %d\n", name, rc2); + if (rc == 0) + rc = rc2; + } +out_put: + llog_ctxt_put(ctxt); + + RETURN(rc); +} + +/* Test client api; open log by name and process */ +static int llog_test_6(const struct lu_env *env, struct obd_device *obd, + char *name) +{ + struct obd_device *mgc_obd; + struct llog_ctxt *ctxt; + struct obd_uuid *mgs_uuid; + struct obd_export *exp; + struct obd_uuid uuid = { "LLOG_TEST6_UUID" }; + struct llog_handle *llh = NULL; + struct llog_ctxt *nctxt; + int rc, rc2; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + mgs_uuid = &ctxt->loc_exp->exp_obd->obd_uuid; + + CWARN("6a: re-open log %s using client API\n", name); + mgc_obd = class_find_client_obd(mgs_uuid, LUSTRE_MGC_NAME, NULL); + if (mgc_obd == NULL) { + CERROR("6a: no MGC devices connected to %s found.\n", + mgs_uuid->uuid); + GOTO(ctxt_release, rc = -ENOENT); + } + + rc = obd_connect(NULL, &exp, mgc_obd, &uuid, + NULL /* obd_connect_data */, NULL); + if (rc != -EALREADY) { + CERROR("6a: connect on connected MGC (%s) failed to return" + " -EALREADY\n", mgc_obd->obd_name); + if (rc == 0) + obd_disconnect(exp); + GOTO(ctxt_release, rc = -EINVAL); + } + + nctxt = llog_get_context(mgc_obd, LLOG_CONFIG_REPL_CTXT); + rc = llog_open(env, nctxt, &llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("6a: llog_open failed %d\n", rc); + GOTO(nctxt_put, rc); + } + + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL); + if (rc) { + CERROR("6a: llog_init_handle failed %d\n", rc); + 
GOTO(parse_out, rc); + } + + plain_counter = 1; /* llog header is first record */ + CWARN("6b: process log %s using client API\n", name); + rc = llog_process(env, llh, plain_print_cb, NULL, NULL); + if (rc) + CERROR("6b: llog_process failed %d\n", rc); + CWARN("6b: processed %d records\n", plain_counter); + + rc = verify_handle("6b", llh, plain_counter); + if (rc) + GOTO(parse_out, rc); + + plain_counter = 1; /* llog header is first record */ + CWARN("6c: process log %s reversely using client API\n", name); + rc = llog_reverse_process(env, llh, plain_print_cb, NULL, NULL); + if (rc) + CERROR("6c: llog_reverse_process failed %d\n", rc); + CWARN("6c: processed %d records\n", plain_counter); + + rc = verify_handle("6c", llh, plain_counter); + if (rc) + GOTO(parse_out, rc); + +parse_out: + rc2 = llog_close(env, llh); + if (rc2) { + CERROR("6: llog_close failed: rc = %d\n", rc2); + if (rc == 0) + rc = rc2; + } +nctxt_put: + llog_ctxt_put(nctxt); +ctxt_release: + llog_ctxt_put(ctxt); + RETURN(rc); +} + +static union { + struct llog_rec_hdr lrh; /* common header */ + struct llog_logid_rec llr; /* LLOG_LOGID_MAGIC */ + struct llog_unlink64_rec lur; /* MDS_UNLINK64_REC */ + struct llog_setattr64_rec lsr64; /* MDS_SETATTR64_REC */ + struct llog_setattr64_rec_v2 lsr64_v2; /* MDS_SETATTR64_REC */ + struct llog_size_change_rec lscr; /* OST_SZ_REC */ + struct llog_changelog_rec lcr; /* CHANGELOG_REC */ + struct llog_changelog_user_rec2 lcur; /* CHANGELOG_USER_REC2 */ + struct llog_gen_rec lgr; /* LLOG_GEN_REC */ +} llog_records; + +static int test_7_print_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct lu_fid fid = {0}; + + logid_to_fid(&llh->lgh_id, &fid); + + CDEBUG(D_OTHER, "record type %#x at index %d in log "DFID"\n", + rec->lrh_type, rec->lrh_index, PFID(&fid)); + + plain_counter++; + return 0; +} + +static int test_7_cancel_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + plain_counter++; + /* test LLOG_DEL_RECORD is working */ + return LLOG_DEL_RECORD; +} + +static int llog_test_7_sub(const struct lu_env *env, struct llog_ctxt *ctxt) +{ + struct llog_handle *llh; + int rc = 0, i, process_count; + int num_recs = 0; + + ENTRY; + + rc = llog_open_create(env, ctxt, &llh, NULL, NULL); + if (rc) { + CERROR("7_sub: create log failed\n"); + RETURN(rc); + } + + rc = llog_init_handle(env, llh, + LLOG_F_IS_PLAIN | LLOG_F_ZAP_WHEN_EMPTY, + &uuid); + if (rc) { + CERROR("7_sub: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + for (i = 0; i < LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr); i++) { + rc = llog_write(env, llh, &llog_records.lrh, LLOG_NEXT_IDX); + if (rc == -ENOSPC) { + break; + } else if (rc < 0) { + CERROR("7_sub: write recs failed at #%d: %d\n", + i + 1, rc); + GOTO(out_close, rc); + } + num_recs++; + } + if (rc != -ENOSPC) { + CWARN("7_sub: write record more than BITMAP size!\n"); + GOTO(out_close, rc = -EINVAL); + } + + rc = verify_handle("7_sub", llh, num_recs + 1); + if (rc) { + CERROR("7_sub: verify handle failed: %d\n", rc); + GOTO(out_close, rc); + } + if (num_recs < LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr) - 1) + CWARN("7_sub: records are not aligned, written %d from %u\n", + num_recs, LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr) - 1); + + plain_counter = 0; + rc = llog_process(env, llh, test_7_print_cb, "test 7", NULL); + if (rc) { + CERROR("7_sub: llog process failed: %d\n", rc); + GOTO(out_close, rc); + } + process_count = plain_counter; + if (process_count != num_recs) { + CERROR("7_sub: 
processed %d records from %d total\n",
+		       process_count, num_recs);
+		GOTO(out_close, rc = -EINVAL);
+	}
+
+	plain_counter = 0;
+	rc = llog_reverse_process(env, llh, test_7_cancel_cb, "test 7", NULL);
+	if (rc && rc != LLOG_DEL_PLAIN) {
+		CERROR("7_sub: reverse llog process failed: %d\n", rc);
+		GOTO(out_close, rc);
+	}
+	if (process_count != plain_counter) {
+		CERROR("7_sub: Reverse/direct processing found different number of records: %d/%d\n",
+		       plain_counter, process_count);
+		GOTO(out_close, rc = -EINVAL);
+	}
+	if (llog_exist(llh)) {
+		CERROR("7_sub: llog exists but should be zapped\n");
+		GOTO(out_close, rc = -EEXIST);
+	}
+
+	rc = verify_handle("7_sub", llh, 1);
+out_close:
+	if (rc)
+		llog_destroy(env, llh);
+	llog_close(env, llh);
+	RETURN(rc);
+}
+
+/* Test all llog records writing and processing */
+static int llog_test_7(const struct lu_env *env, struct obd_device *obd)
+{
+	struct llog_ctxt *ctxt;
+	int rc;
+
+	ENTRY;
+
+	ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+
+	CWARN("7a: test llog_logid_rec\n");
+	llog_records.llr.lid_hdr.lrh_len = sizeof(llog_records.llr);
+	llog_records.llr.lid_tail.lrt_len = sizeof(llog_records.llr);
+	llog_records.llr.lid_hdr.lrh_type = LLOG_LOGID_MAGIC;
+
+	rc = llog_test_7_sub(env, ctxt);
+	if (rc) {
+		CERROR("7a: llog_logid_rec test failed\n");
+		GOTO(out, rc);
+	}
+
+	CWARN("7b: test llog_unlink64_rec\n");
+	llog_records.lur.lur_hdr.lrh_len = sizeof(llog_records.lur);
+	llog_records.lur.lur_tail.lrt_len = sizeof(llog_records.lur);
+	llog_records.lur.lur_hdr.lrh_type = MDS_UNLINK64_REC;
+
+	rc = llog_test_7_sub(env, ctxt);
+	if (rc) {
+		CERROR("7b: llog_unlink64_rec test failed\n");
+		GOTO(out, rc);
+	}
+
+	CWARN("7c: test llog_setattr64_rec\n");
+	llog_records.lsr64.lsr_hdr.lrh_len = sizeof(llog_records.lsr64);
+	llog_records.lsr64.lsr_tail.lrt_len = sizeof(llog_records.lsr64);
+	llog_records.lsr64.lsr_hdr.lrh_type = MDS_SETATTR64_REC;
+
+	rc = llog_test_7_sub(env, ctxt);
+	if (rc) {
+		CERROR("7c: llog_setattr64_rec test failed\n");
+		GOTO(out, rc);
+	}
+
+	CWARN("7d: test llog_size_change_rec\n");
+	llog_records.lscr.lsc_hdr.lrh_len = sizeof(llog_records.lscr);
+	llog_records.lscr.lsc_tail.lrt_len = sizeof(llog_records.lscr);
+	llog_records.lscr.lsc_hdr.lrh_type = OST_SZ_REC;
+
+	rc = llog_test_7_sub(env, ctxt);
+	if (rc) {
+		CERROR("7d: llog_size_change_rec test failed\n");
+		GOTO(out, rc);
+	}
+
+	CWARN("7e: test llog_changelog_rec\n");
+	/* Direct access to cr_do_not_use: peculiar case for this test */
+	llog_records.lcr.cr_hdr.lrh_len = sizeof(llog_records.lcr);
+	llog_records.lcr.cr_do_not_use.lrt_len = sizeof(llog_records.lcr);
+	llog_records.lcr.cr_hdr.lrh_type = CHANGELOG_REC;
+
+	rc = llog_test_7_sub(env, ctxt);
+	if (rc) {
+		CERROR("7e: llog_changelog_rec test failed\n");
+		GOTO(out, rc);
+	}
+
+	CWARN("7f: test llog_changelog_user_rec2\n");
+	llog_records.lcur.cur_hdr.lrh_len = sizeof(llog_records.lcur);
+	llog_records.lcur.cur_tail.lrt_len = sizeof(llog_records.lcur);
+	llog_records.lcur.cur_hdr.lrh_type = CHANGELOG_USER_REC2;
+
+	rc = llog_test_7_sub(env, ctxt);
+	if (rc) {
+		CERROR("7f: llog_changelog_user_rec2 test failed\n");
+		GOTO(out, rc);
+	}
+
+	CWARN("7g: test llog_gen_rec\n");
+	llog_records.lgr.lgr_hdr.lrh_len = sizeof(llog_records.lgr);
+	llog_records.lgr.lgr_tail.lrt_len = sizeof(llog_records.lgr);
+	llog_records.lgr.lgr_hdr.lrh_type = LLOG_GEN_REC;
+
+	rc = llog_test_7_sub(env, ctxt);
+	if (rc) {
+		CERROR("7g: llog_gen_rec test failed\n");
+		GOTO(out, rc);
+	}
+
+	CWARN("7h: test 
llog_setattr64_rec_v2\n");
+	llog_records.lsr64.lsr_hdr.lrh_len = sizeof(llog_records.lsr64_v2);
+	llog_records.lsr64.lsr_tail.lrt_len = sizeof(llog_records.lsr64_v2);
+	llog_records.lsr64.lsr_hdr.lrh_type = MDS_SETATTR64_REC;
+
+	rc = llog_test_7_sub(env, ctxt);
+	if (rc) {
+		CERROR("7h: llog_setattr64_rec_v2 test failed\n");
+		GOTO(out, rc);
+	}
+out:
+	llog_ctxt_put(ctxt);
+	RETURN(rc);
+}
+
+static int test_8_cb(const struct lu_env *env, struct llog_handle *llh,
+		     struct llog_rec_hdr *rec, void *data)
+{
+	plain_counter++;
+	return 0;
+}
+
+static int llog_test_8(const struct lu_env *env, struct obd_device *obd)
+{
+	struct llog_handle *llh = NULL;
+	char name[10];
+	int rc, rc2, i;
+	int orig_counter;
+	struct llog_mini_rec lmr;
+	struct llog_ctxt *ctxt;
+	struct dt_object *obj = NULL;
+
+	ENTRY;
+
+	ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+	LASSERT(ctxt);
+
+	lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE;
+	lmr.lmr_hdr.lrh_type = LLOG_OP_MAGIC;
+
+	CWARN("8a: fill the first plain llog\n");
+	rc = llog_open(env, ctxt, &llh, &cat_logid, NULL, LLOG_OPEN_EXISTS);
+	if (rc) {
+		CERROR("8a: llog_create with logid failed: %d\n", rc);
+		GOTO(out_put, rc);
+	}
+
+	rc = llog_init_handle(env, llh, LLOG_F_IS_CAT, &uuid);
+	if (rc) {
+		CERROR("8a: can't init llog handle: %d\n", rc);
+		GOTO(out, rc);
+	}
+
+	plain_counter = 0;
+	rc = llog_cat_process(env, llh, test_8_cb, "foobar", 0, 0);
+	if (rc != 0) {
+		CERROR("8a: process with test_8_cb failed: %d\n", rc);
+		GOTO(out, rc);
+	}
+	orig_counter = plain_counter;
+
+	for (i = 0; i < 100; i++) {
+		rc = llog_cat_add(env, llh, &lmr.lmr_hdr, NULL);
+		if (rc) {
+			CERROR("8a: add record failed\n");
+			GOTO(out, rc);
+		}
+	}
+
+	/* grab the current plain llog, we'll corrupt it later */
+	obj = llh->u.chd.chd_current_log->lgh_obj;
+	LASSERT(obj);
+	lu_object_get(&obj->do_lu);
+	CWARN("8a: pin llog "DFID"\n", PFID(lu_object_fid(&obj->do_lu)));
+
+	rc2 = llog_cat_close(env, llh);
+	if (rc2) {
+		CERROR("8a: close log %s failed: %d\n", name, rc2);
+		if (rc == 0)
+			rc = rc2;
+		GOTO(out_put, rc);
+	}
+
+	CWARN("8b: fill the second plain llog\n");
+	rc = llog_open(env, ctxt, &llh, &cat_logid, NULL, LLOG_OPEN_EXISTS);
+	if (rc) {
+		CERROR("8b: llog_create with logid failed: %d\n", rc);
+		GOTO(out_put, rc);
+	}
+
+	rc = llog_init_handle(env, llh, LLOG_F_IS_CAT, &uuid);
+	if (rc) {
+		CERROR("8b: can't init llog handle: %d\n", rc);
+		GOTO(out, rc);
+	}
+
+	for (i = 0; i < 100; i++) {
+		rc = llog_cat_add(env, llh, &lmr.lmr_hdr, NULL);
+		if (rc) {
+			CERROR("8b: add record failed\n");
+			GOTO(out, rc);
+		}
+	}
+	CWARN("8b: second llog "DFID"\n",
+	      PFID(lu_object_fid(&llh->u.chd.chd_current_log->lgh_obj->do_lu)));
+
+	rc2 = llog_cat_close(env, llh);
+	if (rc2) {
+		CERROR("8b: close log %s failed: %d\n", name, rc2);
+		if (rc == 0)
+			rc = rc2;
+		GOTO(out_put, rc);
+	}
+
+	/* Here was 8c: drop two records from the first plain llog.
+	 * llog_truncate was a bad idea because it creates a wrong state:
+	 * lgh_last_idx is wrong and two records belong to a zeroed buffer.
+	 */
+
+	CWARN("8d: count survived records\n");
+	rc = llog_open(env, ctxt, &llh, &cat_logid, NULL, LLOG_OPEN_EXISTS);
+	if (rc) {
+		CERROR("8d: llog_create with logid failed: %d\n", rc);
+		GOTO(out_put, rc);
+	}
+
+	rc = llog_init_handle(env, llh, LLOG_F_IS_CAT, &uuid);
+	if (rc) {
+		CERROR("8d: can't init llog handle: %d\n", rc);
+		GOTO(out, rc);
+	}
+
+	plain_counter = 0;
+	rc = llog_cat_process(env, llh, test_8_cb, "foobar", 0, 0);
+	if (rc != 0) {
+		CERROR("8d: process with test_8_cb 
failed: %d\n", rc); + GOTO(out, rc); + } + + if (orig_counter + 200 != plain_counter) { + CERROR("found %d records (expected %d)\n", plain_counter, + orig_counter + 200); + rc = -EIO; + } + +out: + CWARN("8d: close re-opened catalog\n"); + rc2 = llog_cat_close(env, llh); + if (rc2) { + CERROR("8d: close log %s failed: %d\n", name, rc2); + if (rc == 0) + rc = rc2; + } +out_put: + llog_ctxt_put(ctxt); + + if (obj != NULL) + dt_object_put(env, obj); + + RETURN(rc); +} + +static int llog_test_9_sub(const struct lu_env *env, struct llog_ctxt *ctxt) +{ + struct llog_handle *llh; + struct lu_fid fid; + int rc = 0; + + ENTRY; + + rc = llog_open_create(env, ctxt, &llh, NULL, NULL); + if (rc != 0) { + CERROR("9_sub: create log failed\n"); + RETURN(rc); + } + + rc = llog_init_handle(env, llh, + LLOG_F_IS_PLAIN | LLOG_F_ZAP_WHEN_EMPTY, + &uuid); + if (rc != 0) { + CERROR("9_sub: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + + logid_to_fid(&llh->lgh_id, &fid); + fid_to_logid(&fid, &llog_records.llr.lid_id); + rc = llog_write(env, llh, &llog_records.lrh, LLOG_NEXT_IDX); + if (rc < 0) { + CERROR("9_sub: write recs failed at #1: %d\n", rc); + GOTO(out_close, rc); + } + /* The below message is checked in sanity.sh test_60a (run-llog.sh) */ + CWARN("9_sub: record type %x in log "DFID_NOBRACE"\n", + llog_records.lrh.lrh_type, PFID(&fid)); +out_close: + llog_close(env, llh); + RETURN(rc); +} + +/* Prepare different types of llog records for llog_reader test*/ +static int llog_test_9(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + int rc; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + + CWARN("9a: test llog_logid_rec\n"); + llog_records.llr.lid_hdr.lrh_len = sizeof(llog_records.llr); + llog_records.llr.lid_tail.lrt_len = sizeof(llog_records.llr); + llog_records.llr.lid_hdr.lrh_type = LLOG_LOGID_MAGIC; + + rc = llog_test_9_sub(env, ctxt); + if (rc != 0) { + CERROR("9a: llog_logid_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("9b: test llog_obd_cfg_rec\n"); + llog_records.lscr.lsc_hdr.lrh_len = sizeof(llog_records.lscr); + llog_records.lscr.lsc_tail.lrt_len = sizeof(llog_records.lscr); + llog_records.lscr.lsc_hdr.lrh_type = OBD_CFG_REC; + + rc = llog_test_9_sub(env, ctxt); + if (rc != 0) { + CERROR("9b: llog_obd_cfg_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("9c: test llog_changelog_rec\n"); + /* Direct access to cr_do_not_use: peculiar case for this test */ + llog_records.lcr.cr_hdr.lrh_len = sizeof(llog_records.lcr); + llog_records.lcr.cr_do_not_use.lrt_len = sizeof(llog_records.lcr); + llog_records.lcr.cr_hdr.lrh_type = CHANGELOG_REC; + + rc = llog_test_9_sub(env, ctxt); + if (rc != 0) { + CERROR("9c: llog_changelog_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("9d: test llog_changelog_user_rec2\n"); + llog_records.lcur.cur_hdr.lrh_len = sizeof(llog_records.lcur); + llog_records.lcur.cur_tail.lrt_len = sizeof(llog_records.lcur); + llog_records.lcur.cur_hdr.lrh_type = CHANGELOG_USER_REC; + + rc = llog_test_9_sub(env, ctxt); + if (rc != 0) { + CERROR("9d: llog_changelog_user_rec test failed\n"); + GOTO(out, rc); + } + +out: + llog_ctxt_put(ctxt); + RETURN(rc); +} + +struct llog_process_info { + struct llog_handle *lpi_loghandle; + llog_cb_t lpi_cb; + void *lpi_cbdata; + void *lpi_catdata; + int lpi_rc; + struct completion lpi_completion; + const struct lu_env *lpi_env; + struct task_struct *lpi_reftask; +}; + + +static int llog_test_process_thread(void *arg) +{ + struct llog_process_info *lpi = arg; + int rc; + + rc = 
llog_cat_process_or_fork(NULL, lpi->lpi_loghandle, lpi->lpi_cb, + NULL, lpi->lpi_cbdata, 1, 0, true); + + complete(&lpi->lpi_completion); + + lpi->lpi_rc = rc; + if (rc) + CWARN("10h: Error during catalog processing %d\n", rc); + return rc; +} + +static int cat_check_old_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct lu_fid fid = {0}; + struct lu_fid *prev_fid = data; + + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + RETURN(-EINVAL); + } + + logid_to_fid(&lir->lid_id, &fid); + + CWARN("seeing record at index %d - "DFID" in log "DFID"\n", + rec->lrh_index, PFID(&fid), + PFID(lu_object_fid(&llh->lgh_obj->do_lu))); + + if (prev_fid->f_oid > fid.f_oid) { + CWARN("processing old record, fail\n"); + prev_fid->f_oid = 0xbad; + RETURN(-LLOG_EEMPTY); + } + + if (prev_fid->f_oid == 0) { + cfs_fail_loc = OBD_FAIL_ONCE | OBD_FAIL_LLOG_PROCESS_TIMEOUT; + cfs_fail_val = (unsigned int) (llh->lgh_id.lgl_oi.oi.oi_id & + 0xFFFFFFFF); + msleep(1 * MSEC_PER_SEC); + } + *prev_fid = fid; + + RETURN(0); +} + +/* test catalog wrap around */ +static int llog_test_10(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_handle *cath; + char name[10]; + int rc, rc2, i, enospc, eok; + struct llog_mini_rec lmr; + struct llog_ctxt *ctxt; + struct lu_attr la; + __u64 cat_max_size; + struct dt_device *dt; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; + lmr.lmr_hdr.lrh_type = LLOG_OP_MAGIC; + + snprintf(name, sizeof(name), "%x", llog_test_rand + 2); + CWARN("10a: create a catalog log with name: %s\n", name); + rc = llog_open_create(env, ctxt, &cath, NULL, name); + if (rc) { + CERROR("10a: llog_create with name %s failed: %d\n", name, rc); + GOTO(ctxt_release, rc); + } + rc = llog_init_handle(env, cath, LLOG_F_IS_CAT, &uuid); + if (rc) { + CERROR("10a: can't init llog handle: %d\n", rc); + GOTO(out, rc); + } + + cat_logid = cath->lgh_id; + dt = lu2dt_dev(cath->lgh_obj->do_lu.lo_dev); + + /* + * sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs + */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10c: sync failed: %d\n", rc); + GOTO(out, rc); + } + + /* force catalog wrap for 5th plain LLOG */ + cfs_fail_loc = CFS_FAIL_SKIP|OBD_FAIL_CAT_RECORDS; + cfs_fail_val = 4; + + CWARN("10b: write %d log records\n", llog_test_recnum); + for (i = 0; i < llog_test_recnum; i++) { + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); + if (rc) { + CERROR("10b: write %d records failed at #%d: %d\n", + llog_test_recnum, i + 1, rc); + GOTO(out, rc); + } + } + + /* make sure 2 new plain llog appears in catalog (+1 with hdr) */ + rc = verify_handle("10b", cath, 3); + if (rc) + GOTO(out, rc); + + /* + * sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs + */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10b: sync failed: %d\n", rc); + GOTO(out, rc); + } + + CWARN("10c: write %d more log records\n", 2 * llog_test_recnum); + for (i = 0; i < 2 * llog_test_recnum; i++) { + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); + if (rc) { + CERROR("10c: write %d records failed at #%d: %d\n", + 2*llog_test_recnum, i + 1, rc); + GOTO(out, rc); 
+ } + } + + /* make sure 2 new plain llog appears in catalog (+1 with hdr) */ + rc = verify_handle("10c", cath, 5); + if (rc) + GOTO(out, rc); + + /* + * sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs + */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10c: sync failed: %d\n", rc); + GOTO(out, rc); + } + + /* + * fill last allocated plain LLOG and reach -ENOSPC condition + * because no slot available in Catalog + */ + enospc = 0; + eok = 0; + CWARN("10c: write %d more log records\n", llog_test_recnum); + for (i = 0; i < llog_test_recnum; i++) { + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); + if (rc && rc != -ENOSPC) { + CERROR("10c: write %d records failed at #%d: %d\n", + llog_test_recnum, i + 1, rc); + GOTO(out, rc); + } + /* + * after last added plain LLOG has filled up, all new + * records add should fail with -ENOSPC + */ + if (rc == -ENOSPC) { + enospc++; + } else { + enospc = 0; + eok++; + } + } + + if ((enospc == 0) && (enospc+eok != llog_test_recnum)) { + CERROR("10c: all last records adds should have failed with" + " -ENOSPC\n"); + GOTO(out, rc = -EINVAL); + } + + CWARN("10c: wrote %d records then %d failed with ENOSPC\n", eok, + enospc); + + /* make sure no new record in Catalog */ + rc = verify_handle("10c", cath, 5); + if (rc) + GOTO(out, rc); + + /* Catalog should have reached its max size for test */ + rc = dt_attr_get(env, cath->lgh_obj, &la); + if (rc) { + CERROR("10c: failed to get catalog attrs: %d\n", rc); + GOTO(out, rc); + } + cat_max_size = la.la_size; + + /* + * cancel all 1st plain llog records to empty it, this will also cause + * its catalog entry to be freed for next forced wrap in 10e + */ + CWARN("10d: Cancel %d records, see one log zapped\n", llog_test_recnum); + cancel_count = 0; + rc = llog_cat_process(env, cath, llog_cancel_rec_cb, "foobar", 0, 0); + if (rc != -LLOG_EEMPTY) { + CERROR("10d: process with llog_cancel_rec_cb failed: %d\n", rc); + /* + * need to indicate error if for any reason llog_test_recnum is + * not reached + */ + if (rc == 0) + rc = -ERANGE; + GOTO(out, rc); + } + + CWARN("10d: print the catalog entries.. 
we expect 3\n"); + cat_counter = 0; + rc = llog_process(env, cath, cat_print_cb, "test 10", NULL); + if (rc) { + CERROR("10d: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 3) { + CERROR("10d: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + /* verify one down in catalog (+1 with hdr) */ + rc = verify_handle("10d", cath, 4); + if (rc) + GOTO(out, rc); + + /* + * sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs + */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10d: sync failed: %d\n", rc); + GOTO(out, rc); + } + + enospc = 0; + eok = 0; + CWARN("10e: write %d more log records\n", llog_test_recnum); + for (i = 0; i < llog_test_recnum; i++) { + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); + if (rc && rc != -ENOSPC) { + CERROR("10e: write %d records failed at #%d: %d\n", + llog_test_recnum, i + 1, rc); + GOTO(out, rc); + } + /* + * after last added plain LLOG has filled up, all new + * records add should fail with -ENOSPC + */ + if (rc == -ENOSPC) { + enospc++; + } else { + enospc = 0; + eok++; + } + } + + if ((enospc == 0) && (enospc+eok != llog_test_recnum)) { + CERROR("10e: all last records adds should have failed with" + " -ENOSPC\n"); + GOTO(out, rc = -EINVAL); + } + + CWARN("10e: wrote %d records then %d failed with ENOSPC\n", eok, + enospc); + + CWARN("10e: print the catalog entries.. we expect 4\n"); + cat_counter = 0; + rc = llog_cat_process_or_fork(env, cath, cat_print_cb, NULL, "test 10", + 0, 0, false); + if (rc) { + CERROR("10e: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 4) { + CERROR("10e: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + /* make sure 1 new plain llog appears in catalog (+1 with hdr) */ + rc = verify_handle("10e", cath, 5); + if (rc) + GOTO(out, rc); + + /* verify catalog has wrap around */ + if (cath->lgh_last_idx > cath->lgh_hdr->llh_cat_idx) { + CERROR("10e: catalog failed to wrap around\n"); + GOTO(out, rc = -EINVAL); + } + + rc = dt_attr_get(env, cath->lgh_obj, &la); + if (rc) { + CERROR("10e: failed to get catalog attrs: %d\n", rc); + GOTO(out, rc); + } + + if (la.la_size != cat_max_size) { + CERROR("10e: catalog size has changed after it has wrap around," + " current size = %llu, expected size = %llu\n", + la.la_size, cat_max_size); + GOTO(out, rc = -EINVAL); + } + CWARN("10e: catalog successfully wrap around, last_idx %d, first %d\n", + cath->lgh_last_idx, cath->lgh_hdr->llh_cat_idx); + + /* + * sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs + */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10e: sync failed: %d\n", rc); + GOTO(out, rc); + } + + /* + * cancel more records to free one more slot in Catalog + * see if it is re-allocated when adding more records + */ + CWARN("10f: Cancel %d records, see one log zapped\n", llog_test_recnum); + cancel_count = 0; + rc = llog_cat_process(env, cath, llog_cancel_rec_cb, "foobar", 0, 0); + if (rc != -LLOG_EEMPTY) { + CERROR("10f: process with llog_cancel_rec_cb failed: %d\n", rc); + /* + * need to indicate error if for any reason llog_test_recnum is + * not reached + */ + if (rc == 0) + rc = -ERANGE; + GOTO(out, rc); + } + + CWARN("10f: print the catalog entries.. 
we expect 3\n"); + cat_counter = 0; + rc = llog_cat_process_or_fork(env, cath, cat_print_cb, NULL, "test 10", + 0, 0, false); + if (rc) { + CERROR("10f: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 3) { + CERROR("10f: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + /* verify one down in catalog (+1 with hdr) */ + rc = verify_handle("10f", cath, 4); + if (rc) + GOTO(out, rc); + + /* + * sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs + */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10f: sync failed: %d\n", rc); + GOTO(out, rc); + } + + enospc = 0; + eok = 0; + CWARN("10f: write %d more log records\n", llog_test_recnum); + for (i = 0; i < llog_test_recnum; i++) { + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); + if (rc && rc != -ENOSPC) { + CERROR("10f: write %d records failed at #%d: %d\n", + llog_test_recnum, i + 1, rc); + GOTO(out, rc); + } + /* + * after last added plain LLOG has filled up, all new + * records add should fail with -ENOSPC + */ + if (rc == -ENOSPC) { + enospc++; + } else { + enospc = 0; + eok++; + } + } + + if ((enospc == 0) && (enospc+eok != llog_test_recnum)) { + CERROR("10f: all last records adds should have failed with" + " -ENOSPC\n"); + GOTO(out, rc = -EINVAL); + } + + CWARN("10f: wrote %d records then %d failed with ENOSPC\n", eok, + enospc); + + /* make sure 1 new plain llog appears in catalog (+1 with hdr) */ + rc = verify_handle("10f", cath, 5); + if (rc) + GOTO(out, rc); + + /* verify lgh_last_idx = llh_cat_idx = 2 now */ + if (cath->lgh_last_idx != cath->lgh_hdr->llh_cat_idx || + cath->lgh_last_idx != 2) { + CERROR("10f: lgh_last_idx = %d vs 2, llh_cat_idx = %d vs 2\n", + cath->lgh_last_idx, cath->lgh_hdr->llh_cat_idx); + GOTO(out, rc = -EINVAL); + } + + rc = dt_attr_get(env, cath->lgh_obj, &la); + if (rc) { + CERROR("10f: failed to get catalog attrs: %d\n", rc); + GOTO(out, rc); + } + + if (la.la_size != cat_max_size) { + CERROR("10f: catalog size has changed after it has wrap around," + " current size = %llu, expected size = %llu\n", + la.la_size, cat_max_size); + GOTO(out, rc = -EINVAL); + } + + /* + * sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs + */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10f: sync failed: %d\n", rc); + GOTO(out, rc); + } + + /* will llh_cat_idx also successfully wrap ? */ + + /* + * cancel all records in the plain LLOGs referenced by 2 last indexes in + * Catalog + */ + + /* cancel more records to free one more slot in Catalog */ + CWARN("10g: Cancel %d records, see one log zapped\n", llog_test_recnum); + cancel_count = 0; + rc = llog_cat_process(env, cath, llog_cancel_rec_cb, "foobar", 0, 0); + if (rc != -LLOG_EEMPTY) { + CERROR("10g: process with llog_cancel_rec_cb failed: %d\n", rc); + /* need to indicate error if for any reason llog_test_recnum is + * not reached */ + if (rc == 0) + rc = -ERANGE; + GOTO(out, rc); + } + + CWARN("10g: print the catalog entries.. 
we expect 3\n"); + cat_counter = 0; + rc = llog_cat_process_or_fork(env, cath, cat_print_cb, NULL, "test 10", + 0, 0, false); + if (rc) { + CERROR("10g: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 3) { + CERROR("10g: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + /* verify one down in catalog (+1 with hdr) */ + rc = verify_handle("10g", cath, 4); + if (rc) + GOTO(out, rc); + + /* + * sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs + */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10g: sync failed: %d\n", rc); + GOTO(out, rc); + } + + /* cancel more records to free one more slot in Catalog */ + CWARN("10g: Cancel %d records, see one log zapped\n", llog_test_recnum); + cancel_count = 0; + rc = llog_cat_process(env, cath, llog_cancel_rec_cb, "foobar", 0, 0); + if (rc != -LLOG_EEMPTY) { + CERROR("10g: process with llog_cancel_rec_cb failed: %d\n", rc); + /* + * need to indicate error if for any reason llog_test_recnum is + * not reached + */ + if (rc == 0) + rc = -ERANGE; + GOTO(out, rc); + } + + CWARN("10g: print the catalog entries.. we expect 2\n"); + cat_counter = 0; + rc = llog_cat_process_or_fork(env, cath, cat_print_cb, NULL, "test 10", + 0, 0, false); + if (rc) { + CERROR("10g: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 2) { + CERROR("10g: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + /* verify one down in catalog (+1 with hdr) */ + rc = verify_handle("10g", cath, 3); + if (rc) + GOTO(out, rc); + + /* verify lgh_last_idx = 2 and llh_cat_idx = 0 now */ + if (cath->lgh_hdr->llh_cat_idx != 0 || + cath->lgh_last_idx != 2) { + CERROR("10g: lgh_last_idx = %d vs 2, llh_cat_idx = %d vs 0\n", + cath->lgh_last_idx, cath->lgh_hdr->llh_cat_idx); + GOTO(out, rc = -EINVAL); + } + + /* + * sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs + */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10g: sync failed: %d\n", rc); + GOTO(out, rc); + } + + /* cancel more records to free one more slot in Catalog */ + CWARN("10g: Cancel %d records, see one log zapped\n", llog_test_recnum); + cancel_count = 0; + rc = llog_cat_process(env, cath, llog_cancel_rec_cb, "foobar", 0, 0); + if (rc != -LLOG_EEMPTY) { + CERROR("10g: process with llog_cancel_rec_cb failed: %d\n", rc); + /* + * need to indicate error if for any reason llog_test_recnum is + * not reached + */ + if (rc == 0) + rc = -ERANGE; + GOTO(out, rc); + } + + CWARN("10g: print the catalog entries.. 
we expect 1\n"); + cat_counter = 0; + rc = llog_cat_process_or_fork(env, cath, cat_print_cb, NULL, "test 10", + 0, 0, false); + if (rc) { + CERROR("10g: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 1) { + CERROR("10g: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + /* verify one down in catalog (+1 with hdr) */ + rc = verify_handle("10g", cath, 2); + if (rc) + GOTO(out, rc); + + /* verify lgh_last_idx = 2 and llh_cat_idx = 1 now */ + if (cath->lgh_hdr->llh_cat_idx != 1 || + cath->lgh_last_idx != 2) { + CERROR("10g: lgh_last_idx = %d vs 2, llh_cat_idx = %d vs 1\n", + cath->lgh_last_idx, cath->lgh_hdr->llh_cat_idx); + GOTO(out, rc = -EINVAL); + } + + CWARN("10g: llh_cat_idx has also successfully wrapped!\n"); + + /* + * catalog has only one valid entry other slots has outdated + * records. Trying to race the llog_thread_process with llog_add + * llog_thread_process read buffer and loop record on it. + * llog_add adds a record and mark a record in bitmap. + * llog_thread_process process record with old data. + */ + { + struct llog_process_info lpi; + struct lu_fid test_fid = {0}; + + lpi.lpi_loghandle = cath; + lpi.lpi_cb = cat_check_old_cb; + lpi.lpi_catdata = NULL; + lpi.lpi_cbdata = &test_fid; + init_completion(&lpi.lpi_completion); + + kthread_run(llog_test_process_thread, &lpi, "llog_test_process_thread"); + + msleep(1 * MSEC_PER_SEC / 2); + enospc = 0; + eok = 0; + CWARN("10h: write %d more log records\n", llog_test_recnum); + for (i = 0; i < llog_test_recnum; i++) { + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); + if (rc && rc != -ENOSPC) { + CERROR("10h: write %d records failed at #%d: %d\n", + llog_test_recnum, i + 1, rc); + GOTO(out, rc); + } + /* + * after last added plain LLOG has filled up, all new + * records add should fail with -ENOSPC + */ + if (rc == -ENOSPC) { + enospc++; + } else { + enospc = 0; + eok++; + } + } + + if ((enospc == 0) && (enospc+eok != llog_test_recnum)) { + CERROR("10h: all last records adds should have failed with" + " -ENOSPC\n"); + GOTO(out, rc = -EINVAL); + } + + CWARN("10h: wrote %d records then %d failed with ENOSPC\n", eok, + enospc); + + wait_for_completion(&lpi.lpi_completion); + + if (lpi.lpi_rc != 0) { + CERROR("10h: race happened, old record was processed\n"); + GOTO(out, rc = -EINVAL); + } + } +out: + cfs_fail_loc = 0; + cfs_fail_val = 0; + + CWARN("10: put newly-created catalog\n"); + rc2 = llog_cat_close(env, cath); + if (rc2) { + CERROR("10: close log %s failed: %d\n", name, rc2); + if (rc == 0) + rc = rc2; + } +ctxt_release: + llog_ctxt_put(ctxt); + RETURN(rc); +} + +/* + * ------------------------------------------------------------------------- + * Tests above, boring obd functions below + * ------------------------------------------------------------------------- + */ +static int llog_run_tests(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_handle *llh = NULL; + struct llog_ctxt *ctxt; + int rc, err; + char name[10]; + + ENTRY; + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + sprintf(name, "%x", llog_test_rand); + + rc = llog_test_1(env, obd, name); + if (rc) + GOTO(cleanup_ctxt, rc); + + rc = llog_test_2(env, obd, name, &llh); + if (rc) + GOTO(cleanup_ctxt, rc); + + rc = llog_test_3(env, obd, llh); + if (rc) + GOTO(cleanup, rc); + + rc = llog_test_4(env, obd); + if (rc) + GOTO(cleanup, rc); + + rc = llog_test_5(env, obd); + if (rc) + GOTO(cleanup, rc); + + rc = llog_test_6(env, obd, name); + if (rc) + GOTO(cleanup, 
rc); + + rc = llog_test_7(env, obd); + if (rc) + GOTO(cleanup, rc); + + rc = llog_test_8(env, obd); + if (rc) + GOTO(cleanup, rc); + + rc = llog_test_9(env, obd); + if (rc != 0) + GOTO(cleanup, rc); + + rc = llog_test_10(env, obd); + if (rc) + GOTO(cleanup, rc); + +cleanup: + err = llog_destroy(env, llh); + if (err) + CERROR("cleanup: llog_destroy failed: %d\n", err); + llog_close(env, llh); + if (rc == 0) + rc = err; +cleanup_ctxt: + llog_ctxt_put(ctxt); + return rc; +} + +static int llog_test_cleanup(struct obd_device *obd) +{ + struct obd_device *tgt; + struct lu_env env; + int rc; + + ENTRY; + + rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD); + if (rc) + RETURN(rc); + + tgt = obd->obd_lvfs_ctxt.dt->dd_lu_dev.ld_obd; + rc = llog_cleanup(&env, llog_get_context(tgt, LLOG_TEST_ORIG_CTXT)); + if (rc) + CERROR("failed to llog_test_llog_finish: %d\n", rc); + lu_env_fini(&env); + RETURN(rc); +} + +static int llog_test_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct obd_device *tgt; + struct llog_ctxt *ctxt; + struct dt_object *o; + struct lu_env env; + struct lu_context test_session; + int rc; + + ENTRY; + + if (lcfg->lcfg_bufcount < 2) { + CERROR("requires a TARGET OBD name\n"); + RETURN(-EINVAL); + } + + if (lcfg->lcfg_buflens[1] < 1) { + CERROR("requires a TARGET OBD name\n"); + RETURN(-EINVAL); + } + + /* disk obd */ + tgt = class_name2obd(lustre_cfg_string(lcfg, 1)); + if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) { + CERROR("target device not attached or not set up (%s)\n", + lustre_cfg_string(lcfg, 1)); + RETURN(-EINVAL); + } + + rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD); + if (rc) + RETURN(rc); + + rc = lu_context_init(&test_session, LCT_SERVER_SESSION); + if (rc) + GOTO(cleanup_env, rc); + test_session.lc_thread = (struct ptlrpc_thread *)current; + lu_context_enter(&test_session); + env.le_ses = &test_session; + + CWARN("Setup llog-test device over %s device\n", + lustre_cfg_string(lcfg, 1)); + + OBD_SET_CTXT_MAGIC(&obd->obd_lvfs_ctxt); + obd->obd_lvfs_ctxt.dt = lu2dt_dev(tgt->obd_lu_dev); + + rc = llog_setup(&env, tgt, &tgt->obd_olg, LLOG_TEST_ORIG_CTXT, tgt, + &llog_osd_ops); + if (rc) + GOTO(cleanup_session, rc); + + /* use MGS llog dir for tests */ + ctxt = llog_get_context(tgt, LLOG_CONFIG_ORIG_CTXT); + LASSERT(ctxt); + o = ctxt->loc_dir; + llog_ctxt_put(ctxt); + + ctxt = llog_get_context(tgt, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + ctxt->loc_dir = o; + llog_ctxt_put(ctxt); + + llog_test_rand = get_random_u32(); + + rc = llog_run_tests(&env, tgt); + if (rc) + llog_test_cleanup(obd); +cleanup_session: + lu_context_exit(&test_session); + lu_context_fini(&test_session); +cleanup_env: + lu_env_fini(&env); + RETURN(rc); +} + +static const struct obd_ops llog_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = llog_test_setup, + .o_cleanup = llog_test_cleanup, +}; + +static int __init llog_test_init(void) +{ + return class_register_type(&llog_obd_ops, NULL, false, + "llog_test", NULL); +} + +static void __exit llog_test_exit(void) +{ + class_unregister_type("llog_test"); +} + +MODULE_AUTHOR("OpenSFS, Inc. 
"); +MODULE_DESCRIPTION("Lustre Log test module"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(llog_test_init); +module_exit(llog_test_exit); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/local_storage.c b/drivers/staging/lustrefsx/lustre/obdclass/local_storage.c new file mode 100644 index 0000000000000..2e2a0c4f5deff --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/local_storage.c @@ -0,0 +1,987 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * lustre/obdclass/local_storage.c + * + * Local storage for file/objects with fid generation. Works on top of OSD. + * + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include "local_storage.h" + +/* all initialized local storages on this node are linked on this */ +static LIST_HEAD(ls_list_head); +static DEFINE_MUTEX(ls_list_mutex); + +static int ls_object_init(const struct lu_env *env, struct lu_object *o, + const struct lu_object_conf *unused) +{ + struct ls_device *ls; + struct lu_object *below; + struct lu_device *under; + + ENTRY; + + ls = container_of(o->lo_dev, struct ls_device, ls_top_dev.dd_lu_dev); + under = &ls->ls_osd->dd_lu_dev; + below = under->ld_ops->ldo_object_alloc(env, o->lo_header, under); + if (below == NULL) + RETURN(-ENOMEM); + + lu_object_add(o, below); + + RETURN(0); +} + +static void ls_object_free(const struct lu_env *env, struct lu_object *o) +{ + struct ls_object *obj = lu2ls_obj(o); + struct lu_object_header *h = o->lo_header; + + dt_object_fini(&obj->ls_obj); + lu_object_header_fini(h); + OBD_FREE_PRE(obj, sizeof(*obj), "kfreed"); + kfree_rcu(obj, ls_header.loh_rcu); +} + +static const struct lu_object_operations ls_lu_obj_ops = { + .loo_object_init = ls_object_init, + .loo_object_free = ls_object_free, +}; + +static struct lu_object *ls_object_alloc(const struct lu_env *env, + const struct lu_object_header *_h, + struct lu_device *d) +{ + struct lu_object_header *h; + struct ls_object *o; + struct lu_object *l; + + LASSERT(_h == NULL); + + OBD_ALLOC_PTR(o); + if (o != NULL) { + l = &o->ls_obj.do_lu; + h = &o->ls_header; + + lu_object_header_init(h); + dt_object_init(&o->ls_obj, h, d); + lu_object_add_top(h, l); + + l->lo_ops = &ls_lu_obj_ops; + + return l; + } else { + return NULL; + } +} + +static const struct lu_device_operations ls_lu_dev_ops = { + .ldo_object_alloc = ls_object_alloc +}; + +static struct ls_device *__ls_find_dev(struct dt_device *dev) +{ + struct ls_device *ls, *ret = NULL; + + list_for_each_entry(ls, &ls_list_head, ls_linkage) { + if (ls->ls_osd == dev) { + atomic_inc(&ls->ls_refcount); + ret = ls; + break; + } + } + return ret; +} + +struct 
ls_device *ls_find_dev(struct dt_device *dev) +{ + struct ls_device *ls; + + mutex_lock(&ls_list_mutex); + ls = __ls_find_dev(dev); + mutex_unlock(&ls_list_mutex); + + return ls; +} + +static const struct lu_device_type_operations ls_device_type_ops = { + .ldto_start = NULL, + .ldto_stop = NULL, +}; + +static struct lu_device_type ls_lu_type = { + .ldt_name = "local_storage", + .ldt_ops = &ls_device_type_ops, +}; + +struct ls_device *ls_device_get(struct dt_device *dev) +{ + struct ls_device *ls; + + ENTRY; + + mutex_lock(&ls_list_mutex); + ls = __ls_find_dev(dev); + if (ls) + GOTO(out_ls, ls); + + /* not found, then create */ + OBD_ALLOC_PTR(ls); + if (ls == NULL) + GOTO(out_ls, ls = ERR_PTR(-ENOMEM)); + + atomic_set(&ls->ls_refcount, 1); + INIT_LIST_HEAD(&ls->ls_los_list); + mutex_init(&ls->ls_los_mutex); + + ls->ls_osd = dev; + + LASSERT(dev->dd_lu_dev.ld_site); + lu_device_init(&ls->ls_top_dev.dd_lu_dev, &ls_lu_type); + ls->ls_top_dev.dd_lu_dev.ld_ops = &ls_lu_dev_ops; + ls->ls_top_dev.dd_lu_dev.ld_site = dev->dd_lu_dev.ld_site; + + /* finally add ls to the list */ + list_add(&ls->ls_linkage, &ls_list_head); +out_ls: + mutex_unlock(&ls_list_mutex); + RETURN(ls); +} + +void ls_device_put(const struct lu_env *env, struct ls_device *ls) +{ + LASSERT(env); + if (!atomic_dec_and_test(&ls->ls_refcount)) + return; + + mutex_lock(&ls_list_mutex); + if (atomic_read(&ls->ls_refcount) == 0) { + LASSERT(list_empty(&ls->ls_los_list)); + list_del(&ls->ls_linkage); + lu_site_purge(env, ls->ls_top_dev.dd_lu_dev.ld_site, ~0); + lu_device_fini(&ls->ls_top_dev.dd_lu_dev); + OBD_FREE_PTR(ls); + } + mutex_unlock(&ls_list_mutex); +} + +/** + * local file fid generation + */ +int local_object_fid_generate(const struct lu_env *env, + struct local_oid_storage *los, + struct lu_fid *fid) +{ + LASSERT(los->los_dev); + LASSERT(los->los_obj); + + /* take next OID */ + + /* to make it unique after reboot we store + * the latest generated fid atomically with + * object creation see local_object_create() */ + + mutex_lock(&los->los_id_lock); + fid->f_seq = los->los_seq; + fid->f_oid = ++los->los_last_oid; + fid->f_ver = 0; + mutex_unlock(&los->los_id_lock); + + return 0; +} + +int local_object_declare_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, struct lu_attr *attr, + struct dt_object_format *dof, + struct thandle *th) +{ + struct dt_thread_info *dti = dt_info(env); + int rc; + + ENTRY; + + /* update fid generation file */ + if (los != NULL) { + LASSERT(dt_object_exists(los->los_obj)); + dti->dti_lb.lb_buf = NULL; + dti->dti_lb.lb_len = sizeof(struct los_ondisk); + rc = dt_declare_record_write(env, los->los_obj, + &dti->dti_lb, 0, th); + if (rc) + RETURN(rc); + } + + rc = dt_declare_create(env, o, attr, NULL, dof, th); + if (rc) + RETURN(rc); + + dti->dti_lb.lb_buf = NULL; + dti->dti_lb.lb_len = sizeof(dti->dti_lma); + rc = dt_declare_xattr_set(env, o, &dti->dti_lb, XATTR_NAME_LMA, 0, th); + + RETURN(rc); +} + +int local_object_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, struct lu_attr *attr, + struct dt_object_format *dof, struct thandle *th) +{ + struct dt_thread_info *dti = dt_info(env); + u64 lastid; + int rc; + + ENTRY; + + rc = dt_create(env, o, attr, NULL, dof, th); + if (rc) + RETURN(rc); + + if (los == NULL) + RETURN(rc); + + LASSERT(los->los_obj); + LASSERT(dt_object_exists(los->los_obj)); + + /* many threads can be updated this, serialize + * them here to avoid the race where one thread + * takes the value first, but 
writes it last */ + mutex_lock(&los->los_id_lock); + + /* update local oid number on disk so that + * we know the last one used after reboot */ + lastid = cpu_to_le64(los->los_last_oid); + + dti->dti_off = 0; + dti->dti_lb.lb_buf = &lastid; + dti->dti_lb.lb_len = sizeof(lastid); + rc = dt_record_write(env, los->los_obj, &dti->dti_lb, &dti->dti_off, + th); + mutex_unlock(&los->los_id_lock); + + RETURN(rc); +} + +/* + * Create local named object (file, directory or index) in parent directory. + */ +static struct dt_object *__local_file_create(const struct lu_env *env, + const struct lu_fid *fid, + struct local_oid_storage *los, + struct ls_device *ls, + struct dt_object *parent, + const char *name, + struct lu_attr *attr, + struct dt_object_format *dof) +{ + struct dt_thread_info *dti = dt_info(env); + struct lu_object_conf *conf = &dti->dti_conf; + struct dt_insert_rec *rec = &dti->dti_dt_rec; + struct dt_object *dto; + struct thandle *th; + int rc; + + /* We know that the target object does not exist, to be created, + * then give some hints - LOC_F_NEW to help low layer to handle + * that efficiently and properly. */ + memset(conf, 0, sizeof(*conf)); + conf->loc_flags = LOC_F_NEW; + dto = ls_locate(env, ls, fid, conf); + if (unlikely(IS_ERR(dto))) + RETURN(dto); + + LASSERT(dto != NULL); + if (dt_object_exists(dto)) + GOTO(out, rc = -EEXIST); + + th = dt_trans_create(env, ls->ls_osd); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = local_object_declare_create(env, los, dto, attr, dof, th); + if (rc) + GOTO(trans_stop, rc); + + if (dti->dti_dof.dof_type == DFT_DIR) { + rc = dt_declare_ref_add(env, dto, th); + if (rc < 0) + GOTO(trans_stop, rc); + + rc = dt_declare_ref_add(env, parent, th); + if (rc < 0) + GOTO(trans_stop, rc); + } + + rec->rec_fid = fid; + rec->rec_type = attr->la_mode & S_IFMT; + rc = dt_declare_insert(env, parent, (const struct dt_rec *)rec, + (const struct dt_key *)name, th); + if (rc) + GOTO(trans_stop, rc); + + if (dti->dti_dof.dof_type == DFT_DIR) { + if (!dt_try_as_dir(env, dto)) + GOTO(trans_stop, rc = -ENOTDIR); + + rec->rec_type = S_IFDIR; + rec->rec_fid = fid; + rc = dt_declare_insert(env, dto, (const struct dt_rec *)rec, + (const struct dt_key *)".", th); + if (rc != 0) + GOTO(trans_stop, rc); + + rec->rec_fid = lu_object_fid(&parent->do_lu); + rc = dt_declare_insert(env, dto, (const struct dt_rec *)rec, + (const struct dt_key *)"..", th); + if (rc != 0) + GOTO(trans_stop, rc); + + rc = dt_declare_ref_add(env, dto, th); + if (rc != 0) + GOTO(trans_stop, rc); + } + + rc = dt_trans_start_local(env, ls->ls_osd, th); + if (rc) + GOTO(trans_stop, rc); + + dt_write_lock(env, dto, DT_SRC_CHILD); + if (dt_object_exists(dto)) + GOTO(unlock, rc = 0); + + CDEBUG(D_OTHER, "create new object "DFID"\n", + PFID(lu_object_fid(&dto->do_lu))); + rc = local_object_create(env, los, dto, attr, dof, th); + if (rc) + GOTO(unlock, rc); + LASSERT(dt_object_exists(dto)); + + if (dti->dti_dof.dof_type == DFT_DIR) { + + rec->rec_type = S_IFDIR; + rec->rec_fid = fid; + /* Add "." and ".." 
for newly created dir */ + rc = dt_insert(env, dto, (const struct dt_rec *)rec, + (const struct dt_key *)".", th); + if (rc != 0) + GOTO(destroy, rc); + + dt_ref_add(env, dto, th); + rec->rec_fid = lu_object_fid(&parent->do_lu); + rc = dt_insert(env, dto, (const struct dt_rec *)rec, + (const struct dt_key *)"..", th); + if (rc != 0) + GOTO(destroy, rc); + } + + rec->rec_fid = fid; + rec->rec_type = dto->do_lu.lo_header->loh_attr; + dt_write_lock(env, parent, DT_SRC_PARENT); + rc = dt_insert(env, parent, (const struct dt_rec *)rec, + (const struct dt_key *)name, th); + if (dti->dti_dof.dof_type == DFT_DIR) + dt_ref_add(env, parent, th); + dt_write_unlock(env, parent); + if (rc) + GOTO(destroy, rc); +destroy: + if (rc) + dt_destroy(env, dto, th); +unlock: + dt_write_unlock(env, dto); +trans_stop: + dt_trans_stop(env, ls->ls_osd, th); +out: + if (rc) { + dt_object_put_nocache(env, dto); + dto = ERR_PTR(rc); + } + RETURN(dto); +} + +struct dt_object *local_file_find(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + int rc; + + LASSERT(parent); + + rc = dt_lookup_dir(env, parent, name, &dti->dti_fid); + if (!rc) + dto = ls_locate(env, dt2ls_dev(los->los_dev), + &dti->dti_fid, NULL); + else + dto = ERR_PTR(rc); + + return dto; +} +EXPORT_SYMBOL(local_file_find); + +/* + * Look up and create (if it does not exist) a local named file or directory in + * parent directory. + */ +struct dt_object *local_file_find_or_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name, __u32 mode) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + int rc; + + dto = local_file_find(env, los, parent, name); + if (!IS_ERR(dto) || PTR_ERR(dto) != -ENOENT) + return dto; + + rc = local_object_fid_generate(env, los, &dti->dti_fid); + if (rc) + return ERR_PTR(rc); + + /* create the object */ + dti->dti_attr.la_valid = LA_MODE; + dti->dti_attr.la_mode = mode; + dti->dti_dof.dof_type = dt_mode_to_dft(mode & S_IFMT); + dto = __local_file_create(env, &dti->dti_fid, los, + dt2ls_dev(los->los_dev), parent, name, + &dti->dti_attr, &dti->dti_dof); + return dto; +} +EXPORT_SYMBOL(local_file_find_or_create); + +struct dt_object *local_file_find_or_create_with_fid(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object *parent, + const char *name, + __u32 mode) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + int rc; + + LASSERT(parent); + + rc = dt_lookup_dir(env, parent, name, &dti->dti_fid); + if (rc == 0) { + dto = dt_locate(env, dt, &dti->dti_fid); + } else if (rc != -ENOENT) { + dto = ERR_PTR(rc); + } else { + struct ls_device *ls; + + ls = ls_device_get(dt); + if (IS_ERR(ls)) { + dto = ERR_CAST(ls); + } else { + /* create the object */ + dti->dti_attr.la_valid = LA_MODE; + dti->dti_attr.la_mode = mode; + dti->dti_dof.dof_type = dt_mode_to_dft(mode & S_IFMT); + dto = __local_file_create(env, fid, NULL, ls, parent, + name, &dti->dti_attr, + &dti->dti_dof); + /* ls_device_put() will finalize the ls device, we + * have to open the object in other device stack */ + if (!IS_ERR(dto)) { + dti->dti_fid = dto->do_lu.lo_header->loh_fid; + dt_object_put_nocache(env, dto); + dto = dt_locate(env, dt, &dti->dti_fid); + } + ls_device_put(env, ls); + } + } + return dto; +} +EXPORT_SYMBOL(local_file_find_or_create_with_fid); + +/* + * Look up and create (if it does 
not exist) a local named index file in parent + * directory. + */ +struct dt_object *local_index_find_or_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name, __u32 mode, + const struct dt_index_features *ft) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + int rc; + + LASSERT(parent); + + rc = dt_lookup_dir(env, parent, name, &dti->dti_fid); + if (rc == 0) { + /* name is found, get the object */ + dto = ls_locate(env, dt2ls_dev(los->los_dev), + &dti->dti_fid, NULL); + } else if (rc != -ENOENT) { + dto = ERR_PTR(rc); + } else { + rc = local_object_fid_generate(env, los, &dti->dti_fid); + if (rc < 0) { + dto = ERR_PTR(rc); + } else { + /* create the object */ + dti->dti_attr.la_valid = LA_MODE; + dti->dti_attr.la_mode = mode; + dti->dti_dof.dof_type = DFT_INDEX; + dti->dti_dof.u.dof_idx.di_feat = ft; + dto = __local_file_create(env, &dti->dti_fid, los, + dt2ls_dev(los->los_dev), + parent, name, &dti->dti_attr, + &dti->dti_dof); + } + } + return dto; + +} +EXPORT_SYMBOL(local_index_find_or_create); + +struct dt_object * +local_index_find_or_create_with_fid(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object *parent, + const char *name, __u32 mode, + const struct dt_index_features *ft) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + int rc; + + LASSERT(parent); + + rc = dt_lookup_dir(env, parent, name, &dti->dti_fid); + if (rc == 0) { + /* name is found, get the object */ + if (!lu_fid_eq(fid, &dti->dti_fid)) + dto = ERR_PTR(-EINVAL); + else + dto = dt_locate(env, dt, fid); + } else if (rc != -ENOENT) { + dto = ERR_PTR(rc); + } else { + struct ls_device *ls; + + ls = ls_device_get(dt); + if (IS_ERR(ls)) { + dto = ERR_CAST(ls); + } else { + /* create the object */ + dti->dti_attr.la_valid = LA_MODE; + dti->dti_attr.la_mode = mode; + dti->dti_dof.dof_type = DFT_INDEX; + dti->dti_dof.u.dof_idx.di_feat = ft; + dto = __local_file_create(env, fid, NULL, ls, parent, + name, &dti->dti_attr, + &dti->dti_dof); + /* ls_device_put() will finalize the ls device, we + * have to open the object in other device stack */ + if (!IS_ERR(dto)) { + dti->dti_fid = dto->do_lu.lo_header->loh_fid; + dt_object_put_nocache(env, dto); + dto = dt_locate(env, dt, &dti->dti_fid); + } + ls_device_put(env, ls); + } + } + return dto; +} +EXPORT_SYMBOL(local_index_find_or_create_with_fid); + +static int local_object_declare_unlink(const struct lu_env *env, + struct dt_device *dt, + struct dt_object *p, + struct dt_object *c, const char *name, + struct thandle *th) +{ + int rc; + + rc = dt_declare_delete(env, p, (const struct dt_key *)name, th); + if (rc < 0) + return rc; + + if (S_ISDIR(p->do_lu.lo_header->loh_attr)) { + rc = dt_declare_ref_del(env, p, th); + if (rc < 0) + return rc; + } + + rc = dt_declare_ref_del(env, c, th); + if (rc < 0) + return rc; + + return dt_declare_destroy(env, c, th); +} + +int local_object_unlink(const struct lu_env *env, struct dt_device *dt, + struct dt_object *parent, const char *name) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + struct thandle *th; + int rc; + + ENTRY; + + rc = dt_lookup_dir(env, parent, name, &dti->dti_fid); + if (rc == -ENOENT) + RETURN(0); + else if (rc < 0) + RETURN(rc); + + dto = dt_locate(env, dt, &dti->dti_fid); + if (unlikely(IS_ERR(dto))) + RETURN(PTR_ERR(dto)); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = 
local_object_declare_unlink(env, dt, parent, dto, name, th); + if (rc < 0) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dt, th); + if (rc < 0) + GOTO(stop, rc); + + if (S_ISDIR(dto->do_lu.lo_header->loh_attr)) { + dt_write_lock(env, parent, 0); + rc = dt_ref_del(env, parent, th); + dt_write_unlock(env, parent); + if (rc) + GOTO(stop, rc); + } + + dt_write_lock(env, dto, 0); + rc = dt_delete(env, parent, (struct dt_key *)name, th); + if (rc < 0) + GOTO(unlock, rc); + + rc = dt_ref_del(env, dto, th); + if (rc < 0) { + struct dt_insert_rec *rec = &dti->dti_dt_rec; + + rec->rec_fid = &dti->dti_fid; + rec->rec_type = dto->do_lu.lo_header->loh_attr; + rc = dt_insert(env, parent, (const struct dt_rec *)rec, + (const struct dt_key *)name, th); + GOTO(unlock, rc); + } + + rc = dt_destroy(env, dto, th); +unlock: + dt_write_unlock(env, dto); +stop: + dt_trans_stop(env, dt, th); +out: + dt_object_put_nocache(env, dto); + return rc; +} +EXPORT_SYMBOL(local_object_unlink); + +struct local_oid_storage *dt_los_find(struct ls_device *ls, __u64 seq) +{ + struct local_oid_storage *los, *ret = NULL; + + list_for_each_entry(los, &ls->ls_los_list, los_list) { + if (los->los_seq == seq) { + atomic_inc(&los->los_refcount); + ret = los; + break; + } + } + return ret; +} + +void dt_los_put(struct local_oid_storage *los) +{ + if (atomic_dec_and_test(&los->los_refcount)) + /* should never happen, only local_oid_storage_fini should + * drop refcount to zero */ + LBUG(); +} + +/* after Lustre 2.3 release there may be old file to store last generated FID + * If such file exists then we have to read its content + */ +static int lastid_compat_check(const struct lu_env *env, struct dt_device *dev, + __u64 lastid_seq, __u32 *first_oid, + struct ls_device *ls) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *root = NULL; + struct los_ondisk losd; + struct dt_object *o = NULL; + int rc = 0; + + rc = dt_root_get(env, dev, &dti->dti_fid); + if (rc) + return rc; + + root = ls_locate(env, ls, &dti->dti_fid, NULL); + if (IS_ERR(root)) + return PTR_ERR(root); + + /* find old last_id file */ + snprintf(dti->dti_buf, sizeof(dti->dti_buf), "seq-%#llx-lastid", + lastid_seq); + rc = dt_lookup_dir(env, root, dti->dti_buf, &dti->dti_fid); + dt_object_put_nocache(env, root); + if (rc == -ENOENT) { + /* old llog lastid accessed by FID only */ + if (lastid_seq != FID_SEQ_LLOG) + return 0; + dti->dti_fid.f_seq = FID_SEQ_LLOG; + dti->dti_fid.f_oid = 1; + dti->dti_fid.f_ver = 0; + o = ls_locate(env, ls, &dti->dti_fid, NULL); + if (IS_ERR(o)) + return PTR_ERR(o); + + if (!dt_object_exists(o)) { + dt_object_put_nocache(env, o); + return 0; + } + CDEBUG(D_INFO, "Found old llog lastid file\n"); + } else if (rc < 0) { + return rc; + } else { + CDEBUG(D_INFO, "Found old lastid file for sequence %#llx\n", + lastid_seq); + o = ls_locate(env, ls, &dti->dti_fid, NULL); + if (IS_ERR(o)) + return PTR_ERR(o); + } + /* let's read seq-NNNNNN-lastid file value */ + LASSERT(dt_object_exists(o)); + dti->dti_off = 0; + dti->dti_lb.lb_buf = &losd; + dti->dti_lb.lb_len = sizeof(losd); + dt_read_lock(env, o, 0); + rc = dt_record_read(env, o, &dti->dti_lb, &dti->dti_off); + dt_read_unlock(env, o); + if (rc == 0 && le32_to_cpu(losd.lso_magic) != LOS_MAGIC) { + CERROR("%s: wrong content of seq-%#llx-lastid file, magic %x\n", + o->do_lu.lo_dev->ld_obd->obd_name, lastid_seq, + le32_to_cpu(losd.lso_magic)); + rc = -EINVAL; + } else if (rc < 0) { + CERROR("%s: failed to read seq-%#llx-lastid: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, 
lastid_seq, rc);
+	}
+	dt_object_put_nocache(env, o);
+	if (rc == 0)
+		*first_oid = le32_to_cpu(losd.lso_next_oid);
+	return rc;
+}
+
+/**
+ * Initialize local OID storage for the required sequence.
+ * That may be needed for services that use local files and require
+ * dynamic OID allocation for them.
+ *
+ * For each sequence we have an object with the 'first_fid' identifier
+ * containing the counter for OIDs of locally created files with that
+ * sequence.
+ *
+ * It is currently used by the llog subsystem and by the MGS for NID tables.
+ *
+ * The function gets first_fid to create the counter object.
+ * All dynamic fids will be generated with the same sequence and incremented
+ * OIDs.
+ *
+ * The returned local_oid_storage is the in-memory representation of the
+ * OID storage.
+ */
+int local_oid_storage_init(const struct lu_env *env, struct dt_device *dev,
+			   const struct lu_fid *first_fid,
+			   struct local_oid_storage **los)
+{
+	struct dt_thread_info *dti = dt_info(env);
+	struct ls_device *ls;
+	u64 lastid;
+	struct dt_object *o = NULL;
+	struct thandle *th;
+	__u32 first_oid = fid_oid(first_fid);
+	int rc = 0;
+
+	ENTRY;
+
+	ls = ls_device_get(dev);
+	if (IS_ERR(ls))
+		RETURN(PTR_ERR(ls));
+
+	mutex_lock(&ls->ls_los_mutex);
+	*los = dt_los_find(ls, fid_seq(first_fid));
+	if (*los != NULL)
+		GOTO(out, rc = 0);
+
+	/* not found, then create */
+	OBD_ALLOC_PTR(*los);
+	if (*los == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	atomic_set(&(*los)->los_refcount, 1);
+	mutex_init(&(*los)->los_id_lock);
+	(*los)->los_dev = &ls->ls_top_dev;
+	atomic_inc(&ls->ls_refcount);
+	list_add(&(*los)->los_list, &ls->ls_los_list);
+
+	/* Use {seq, 0, 0} to create the LAST_ID file for every
+	 * sequence. OIDs start at LUSTRE_FID_INIT_OID.
+	 */
+	dti->dti_fid.f_seq = fid_seq(first_fid);
+	dti->dti_fid.f_oid = LUSTRE_FID_LASTID_OID;
+	dti->dti_fid.f_ver = 0;
+	o = ls_locate(env, ls, &dti->dti_fid, NULL);
+	if (IS_ERR(o))
+		GOTO(out_los, rc = PTR_ERR(o));
+
+	if (!dt_object_exists(o)) {
+		rc = lastid_compat_check(env, dev, fid_seq(first_fid),
+					 &first_oid, ls);
+		if (rc < 0)
+			GOTO(out_los, rc);
+
+		th = dt_trans_create(env, dev);
+		if (IS_ERR(th))
+			GOTO(out_los, rc = PTR_ERR(th));
+
+		dti->dti_attr.la_valid = LA_MODE | LA_TYPE;
+		dti->dti_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
+		dti->dti_dof.dof_type = dt_mode_to_dft(S_IFREG);
+
+		rc = dt_declare_create(env, o, &dti->dti_attr, NULL,
+				       &dti->dti_dof, th);
+		if (rc)
+			GOTO(out_trans, rc);
+
+		lastid = cpu_to_le64(first_oid);
+
+		dti->dti_off = 0;
+		dti->dti_lb.lb_buf = &lastid;
+		dti->dti_lb.lb_len = sizeof(lastid);
+		rc = dt_declare_record_write(env, o, &dti->dti_lb, dti->dti_off,
+					     th);
+		if (rc)
+			GOTO(out_trans, rc);
+
+		rc = dt_trans_start_local(env, dev, th);
+		if (rc)
+			GOTO(out_trans, rc);
+
+		dt_write_lock(env, o, 0);
+		if (dt_object_exists(o))
+			GOTO(out_lock, rc = 0);
+
+		rc = dt_create(env, o, &dti->dti_attr, NULL, &dti->dti_dof,
+			       th);
+		if (rc)
+			GOTO(out_lock, rc);
+
+		rc = dt_record_write(env, o, &dti->dti_lb, &dti->dti_off, th);
+		if (rc)
+			GOTO(out_lock, rc);
+out_lock:
+		dt_write_unlock(env, o);
+out_trans:
+		dt_trans_stop(env, dev, th);
+	} else {
+		dti->dti_off = 0;
+		dti->dti_lb.lb_buf = &lastid;
+		dti->dti_lb.lb_len = sizeof(lastid);
+		dt_read_lock(env, o, 0);
+		rc = dt_record_read(env, o, &dti->dti_lb, &dti->dti_off);
+		dt_read_unlock(env, o);
+		if (rc == 0 && le64_to_cpu(lastid) > OBIF_MAX_OID) {
+			CERROR("%s: bad oid %llu is read from LAST_ID\n",
+			       o->do_lu.lo_dev->ld_obd->obd_name,
+			       le64_to_cpu(lastid));
+			rc = -EINVAL;
+		}
+	}
+out_los:
+	if (rc != 0) {
+		list_del(&(*los)->los_list);
+		atomic_dec(&ls->ls_refcount);
+		OBD_FREE_PTR(*los);
+		*los = NULL;
+		if (o != NULL && !IS_ERR(o))
+			dt_object_put_nocache(env, o);
+	} else {
+		(*los)->los_seq = fid_seq(first_fid);
+		(*los)->los_last_oid = le64_to_cpu(lastid);
+		(*los)->los_obj = o;
+		/* The read value should not be less than the initial one,
+		 * but that is possible after upgrade from an older fs.
+		 * In this case just switch to the first_oid in memory and
+		 * it will be updated on disk with the first object generated */
+		if ((*los)->los_last_oid < first_oid)
+			(*los)->los_last_oid = first_oid;
+	}
+out:
+	mutex_unlock(&ls->ls_los_mutex);
+	ls_device_put(env, ls);
+	return rc;
+}
+EXPORT_SYMBOL(local_oid_storage_init);
+
+void local_oid_storage_fini(const struct lu_env *env,
+			    struct local_oid_storage *los)
+{
+	struct ls_device *ls;
+
+	LASSERT(env);
+	LASSERT(los->los_dev);
+	ls = dt2ls_dev(los->los_dev);
+
+	/* Take the mutex before decreasing the reference to avoid race
+	 * conditions as described in LU-4721. */
+	mutex_lock(&ls->ls_los_mutex);
+	if (!atomic_dec_and_test(&los->los_refcount)) {
+		mutex_unlock(&ls->ls_los_mutex);
+		return;
+	}
+
+	if (los->los_obj)
+		dt_object_put_nocache(env, los->los_obj);
+	list_del(&los->los_list);
+	OBD_FREE_PTR(los);
+	mutex_unlock(&ls->ls_los_mutex);
+	ls_device_put(env, ls);
+}
+EXPORT_SYMBOL(local_oid_storage_fini);
diff --git a/drivers/staging/lustrefsx/lustre/obdclass/local_storage.h b/drivers/staging/lustrefsx/lustre/obdclass/local_storage.h
new file mode 100644
index 0000000000000..63af1946e6095
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/obdclass/local_storage.h
@@ -0,0 +1,94 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License version 2 for more details. A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012, 2014, Intel Corporation.
+ */
+/*
+ * lustre/obdclass/local_storage.h
+ *
+ * Local storage for file/objects with fid generation. Works on top of OSD.
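+ * It declares the ls_device/ls_object wrappers and the local OID storage
+ * helpers implemented in local_storage.c.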
+ *
+ * Author: Mikhail Pershin
+ */
+#ifndef __LOCAL_STORAGE_H
+#define __LOCAL_STORAGE_H
+
+#include
+#include
+#include
+#include
+
+struct ls_device {
+	struct dt_device ls_top_dev;
+	/* all initialized ls_devices on this node linked by this */
+	struct list_head ls_linkage;
+	/* how many handles reference this local storage */
+	atomic_t ls_refcount;
+	/* underlying OSD device */
+	struct dt_device *ls_osd;
+	/* list of all local OID storages */
+	struct list_head ls_los_list;
+	struct mutex ls_los_mutex;
+};
+
+static inline struct ls_device *dt2ls_dev(struct dt_device *d)
+{
+	return container_of_safe(d, struct ls_device, ls_top_dev);
+}
+
+struct ls_object {
+	struct lu_object_header ls_header;
+	struct dt_object ls_obj;
+};
+
+static inline struct ls_object *lu2ls_obj(struct lu_object *o)
+{
+	return container_of_safe(o, struct ls_object, ls_obj.do_lu);
+}
+
+static inline struct dt_object *ls_locate(const struct lu_env *env,
+					  struct ls_device *ls,
+					  const struct lu_fid *fid,
+					  const struct lu_object_conf *conf)
+{
+	return dt_locate_at(env, ls->ls_osd, fid,
+			    &ls->ls_top_dev.dd_lu_dev, conf);
+}
+
+struct ls_device *ls_device_get(struct dt_device *dev);
+void ls_device_put(const struct lu_env *env, struct ls_device *ls);
+struct local_oid_storage *dt_los_find(struct ls_device *ls, __u64 seq);
+void dt_los_put(struct local_oid_storage *los);
+
+/* Lustre 2.3 on-disk structure describing local object OIDs storage;
+ * the structure to be used with any sequence managed by the
+ * local object library.
+ * Obsoleted since 2.4 but is kept for compatibility reasons,
+ * see lastid_compat_check() in obdclass/local_storage.c */
+struct los_ondisk {
+	__u32 lso_magic;
+	__u32 lso_next_oid;
+};
+
+#define LOS_MAGIC 0xdecafbee
+
+#endif
diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_counters.c b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_counters.c
new file mode 100644
index 0000000000000..521e59c16e88b
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_counters.c
@@ -0,0 +1,136 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ *
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2013, Intel Corporation.
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/lprocfs_counters.c + * + * Lustre lprocfs counter routines + * + * Author: Andreas Dilger + */ +#include +#include +#include + +#ifdef CONFIG_PROC_FS +void lprocfs_counter_add(struct lprocfs_stats *stats, int idx, long amount) +{ + struct lprocfs_counter *percpu_cntr; + struct lprocfs_counter_header *header; + int smp_id; + unsigned long flags = 0; + + if (stats == NULL) + return; + + LASSERTF(0 <= idx && idx < stats->ls_num, + "idx %d, ls_num %hu\n", idx, stats->ls_num); + + /* With per-client stats, statistics are allocated only for + * single CPU area, so the smp_id should be 0 always. */ + smp_id = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID, &flags); + if (smp_id < 0) + return; + + header = &stats->ls_cnt_header[idx]; + percpu_cntr = lprocfs_stats_counter_get(stats, smp_id, idx); + percpu_cntr->lc_count++; + + if (header->lc_config & LPROCFS_CNTR_AVGMINMAX) { + /* + * lprocfs_counter_add() can be called in interrupt context, + * as memory allocation could trigger memory shrinker call + * ldlm_pool_shrink(), which calls lprocfs_counter_add(). + * LU-1727. + * + * Only obd_memory uses LPROCFS_STATS_FLAG_IRQ_SAFE + * flag, because it needs accurate counting lest memory leak + * check reports error. + */ + if (in_interrupt() && + (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + percpu_cntr->lc_sum_irq += amount; + else + percpu_cntr->lc_sum += amount; + + if (header->lc_config & LPROCFS_CNTR_STDDEV) + percpu_cntr->lc_sumsquare += (__s64)amount * amount; + if (amount < percpu_cntr->lc_min) + percpu_cntr->lc_min = amount; + if (amount > percpu_cntr->lc_max) + percpu_cntr->lc_max = amount; + } + lprocfs_stats_unlock(stats, LPROCFS_GET_SMP_ID, &flags); +} +EXPORT_SYMBOL(lprocfs_counter_add); + +void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx, long amount) +{ + struct lprocfs_counter *percpu_cntr; + struct lprocfs_counter_header *header; + int smp_id; + unsigned long flags = 0; + + if (stats == NULL) + return; + + LASSERTF(0 <= idx && idx < stats->ls_num, + "idx %d, ls_num %hu\n", idx, stats->ls_num); + + /* With per-client stats, statistics are allocated only for + * single CPU area, so the smp_id should be 0 always. */ + smp_id = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID, &flags); + if (smp_id < 0) + return; + + header = &stats->ls_cnt_header[idx]; + percpu_cntr = lprocfs_stats_counter_get(stats, smp_id, idx); + if (header->lc_config & LPROCFS_CNTR_AVGMINMAX) { + /* + * Sometimes we use RCU callbacks to free memory which calls + * lprocfs_counter_sub(), and RCU callbacks may execute in + * softirq context - right now that's the only case we're in + * softirq context here, use separate counter for that. + * bz20650. + * + * Only obd_memory uses LPROCFS_STATS_FLAG_IRQ_SAFE + * flag, because it needs accurate counting lest memory leak + * check reports error. 
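+		 * lprocfs_counter_add() above makes the same in_interrupt()
+		 * check when incrementing, so an add and its matching sub
+		 * always land on the same counter field.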
+ */ + if (in_interrupt() && + (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + percpu_cntr->lc_sum_irq -= amount; + else + percpu_cntr->lc_sum -= amount; + } + lprocfs_stats_unlock(stats, LPROCFS_GET_SMP_ID, &flags); +} +EXPORT_SYMBOL(lprocfs_counter_sub); +#endif /* CONFIG_PROC_FS */ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c new file mode 100644 index 0000000000000..4202f44459316 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c @@ -0,0 +1,691 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2016, Intel Corporation. + * Use is subject to license terms. + * + * Author: Niu Yawei + */ +/* + * lustre/obdclass/lprocfs_jobstats.c + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include + +#ifdef CONFIG_PROC_FS + +/* + * JobID formats & JobID environment variable names for supported + * job schedulers: + * + * SLURM: + * JobID format: 32 bit integer. + * JobID env var: SLURM_JOB_ID. + * SGE: + * JobID format: Decimal integer range to 99999. + * JobID env var: JOB_ID. + * LSF: + * JobID format: 6 digit integer by default (up to 999999), can be + * increased to 10 digit (up to 2147483646). + * JobID env var: LSB_JOBID. + * Loadleveler: + * JobID format: String of machine_name.cluster_id.process_id, for + * example: fr2n02.32.0 + * JobID env var: LOADL_STEP_ID. + * PBS: + * JobID format: String of sequence_number[.server_name][@server]. + * JobID env var: PBS_JOBID. + * Maui/MOAB: + * JobID format: Same as PBS. + * JobID env var: Same as PBS. 
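+ * Sites without one of these schedulers can typically use a synthetic
+ * jobid instead (for example the "procname_uid" setting of Lustre's
+ * jobid_var tunable, which combines process name and UID).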
+ */ + +struct job_stat { + struct hlist_node js_hash; /* hash struct for this jobid */ + struct list_head js_list; /* on ojs_list, with ojs_lock */ + atomic_t js_refcount; /* num users of this struct */ + char js_jobid[LUSTRE_JOBID_SIZE]; /* job name + NUL*/ + ktime_t js_time_init; /* time of initial stat*/ + ktime_t js_time_latest; /* time of most recent stat*/ + struct lprocfs_stats *js_stats; /* per-job statistics */ + struct obd_job_stats *js_jobstats; /* for accessing ojs_lock */ +}; + +static unsigned +job_stat_hash(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_djb2_hash(key, strlen(key), mask); +} + +static void *job_stat_key(struct hlist_node *hnode) +{ + struct job_stat *job; + job = hlist_entry(hnode, struct job_stat, js_hash); + return job->js_jobid; +} + +static int job_stat_keycmp(const void *key, struct hlist_node *hnode) +{ + struct job_stat *job; + job = hlist_entry(hnode, struct job_stat, js_hash); + return (strlen(job->js_jobid) == strlen(key)) && + !strncmp(job->js_jobid, key, strlen(key)); +} + +static void *job_stat_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct job_stat, js_hash); +} + +static void job_stat_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct job_stat *job; + job = hlist_entry(hnode, struct job_stat, js_hash); + atomic_inc(&job->js_refcount); +} + +static void job_free(struct job_stat *job) +{ + LASSERT(atomic_read(&job->js_refcount) == 0); + LASSERT(job->js_jobstats != NULL); + + write_lock(&job->js_jobstats->ojs_lock); + list_del_init(&job->js_list); + write_unlock(&job->js_jobstats->ojs_lock); + + lprocfs_free_stats(&job->js_stats); + OBD_FREE_PTR(job); +} + +static void job_putref(struct job_stat *job) +{ + LASSERT(atomic_read(&job->js_refcount) > 0); + if (atomic_dec_and_test(&job->js_refcount)) + job_free(job); +} + +static void job_stat_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct job_stat *job; + job = hlist_entry(hnode, struct job_stat, js_hash); + job_putref(job); +} + +static void job_stat_exit(struct cfs_hash *hs, struct hlist_node *hnode) +{ + CERROR("should not have any items\n"); +} + +static struct cfs_hash_ops job_stats_hash_ops = { + .hs_hash = job_stat_hash, + .hs_key = job_stat_key, + .hs_keycmp = job_stat_keycmp, + .hs_object = job_stat_object, + .hs_get = job_stat_get, + .hs_put_locked = job_stat_put_locked, + .hs_exit = job_stat_exit, +}; + +/** + * Jobstats expiry iterator to clean up old jobids + * + * Called for each job_stat structure on this device, it should delete stats + * older than the specified \a oldest_time in seconds. If \a oldest_time is + * in the future then this will delete all statistics (e.g. during shutdown). + * + * \param[in] hs hash of all jobids on this device + * \param[in] bd hash bucket containing this jobid + * \param[in] hnode hash structure for this jobid + * \param[in] data pointer to stats expiry time in seconds + */ +static int job_cleanup_iter_callback(struct cfs_hash *hs, + struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) +{ + ktime_t oldest_time = *((ktime_t *)data); + struct job_stat *job; + + job = hlist_entry(hnode, struct job_stat, js_hash); + if (ktime_before(job->js_time_latest, oldest_time)) + cfs_hash_bd_del_locked(hs, bd, hnode); + + return 0; +} + +/** + * Clean up jobstats that were updated more than \a before seconds ago. + * + * Since this function may be called frequently, do not scan all of the + * jobstats on each call, only twice per cleanup interval. 
That means stats
+ * may be on average around cleanup_interval / 4 older than the cleanup
+ * interval, but that is not considered harmful.
+ *
+ * The value stored in ojs_cleanup_interval is how often to perform a cleanup
+ * scan, and 1/2 of the maximum age of the individual statistics. This is
+ * done rather than dividing the interval by two each time, because it is
+ * much easier to do the division when the value is initially set (in seconds)
+ * rather than after it has been converted to ktime_t, and maybe a bit faster.
+ *
+ * If \a clear is true then this will force a cleanup of all jobstats
+ * (e.g. at shutdown).
+ *
+ * If there is already another thread doing jobstats cleanup, don't try to
+ * do this again in the current thread unless this is a force cleanup.
+ *
+ * \param[in] stats structure tracking all job stats for this device
+ * \param[in] clear clear all job stats if true
+ */
+static void lprocfs_job_cleanup(struct obd_job_stats *stats, bool clear)
+{
+	ktime_t cleanup_interval = stats->ojs_cleanup_interval;
+	ktime_t now = ktime_get_real();
+	ktime_t oldest;
+
+	if (likely(!clear)) {
+		/* ojs_cleanup_interval of zero means never clean up stats */
+		if (ktime_to_ns(cleanup_interval) == 0)
+			return;
+
+		if (ktime_before(now, ktime_add(stats->ojs_cleanup_last,
+						cleanup_interval)))
+			return;
+
+		if (stats->ojs_cleaning)
+			return;
+	}
+
+	write_lock(&stats->ojs_lock);
+	if (!clear && stats->ojs_cleaning) {
+		write_unlock(&stats->ojs_lock);
+		return;
+	}
+
+	stats->ojs_cleaning = true;
+	write_unlock(&stats->ojs_lock);
+
+	/* Can't hold ojs_lock over hash iteration, since it is grabbed by
+	 * job_cleanup_iter_callback()
+	 *   ->cfs_hash_bd_del_locked()
+	 *     ->job_putref()
+	 *       ->job_free()
+	 *
+	 * Holding ojs_lock isn't necessary for safety of the hash iteration,
+	 * since locking of the hash is handled internally, but there isn't
+	 * any benefit to having multiple threads doing cleanup at one time.
+	 *
+	 * Subtract twice the cleanup_interval, since it is 1/2 the maximum age.
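+	 * For example, with ojs_cleanup_interval set to 600 seconds, entries
+	 * that have not been updated within the last 1200 seconds are expired
+	 * by the scan below.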
+ */ + oldest = ktime_sub(now, ktime_add(cleanup_interval, cleanup_interval)); + cfs_hash_for_each_safe(stats->ojs_hash, job_cleanup_iter_callback, + &oldest); + + write_lock(&stats->ojs_lock); + stats->ojs_cleaning = false; + stats->ojs_cleanup_last = ktime_get_real(); + write_unlock(&stats->ojs_lock); +} + +static struct job_stat *job_alloc(char *jobid, struct obd_job_stats *jobs) +{ + struct job_stat *job; + + OBD_ALLOC_PTR(job); + if (job == NULL) + return NULL; + + job->js_stats = lprocfs_alloc_stats(jobs->ojs_cntr_num, 0); + if (job->js_stats == NULL) { + OBD_FREE_PTR(job); + return NULL; + } + + jobs->ojs_cntr_init_fn(job->js_stats, 0); + + memcpy(job->js_jobid, jobid, sizeof(job->js_jobid)); + job->js_time_latest = job->js_stats->ls_init; + job->js_jobstats = jobs; + INIT_HLIST_NODE(&job->js_hash); + INIT_LIST_HEAD(&job->js_list); + atomic_set(&job->js_refcount, 1); + + return job; +} + +int lprocfs_job_stats_log(struct obd_device *obd, char *jobid, + int event, long amount) +{ + struct obd_job_stats *stats = &obd->u.obt.obt_jobstats; + struct job_stat *job, *job2; + ENTRY; + + LASSERT(stats != NULL); + LASSERT(stats->ojs_hash != NULL); + + if (event >= stats->ojs_cntr_num) + RETURN(-EINVAL); + + if (jobid == NULL || strlen(jobid) == 0) + RETURN(0); + + /* unterminated jobid should be handled in lustre_msg_get_jobid() */ + if (strlen(jobid) >= LUSTRE_JOBID_SIZE) { + CERROR("%s: invalid jobid size %lu, expect %d\n", obd->obd_name, + (unsigned long)strlen(jobid) + 1, LUSTRE_JOBID_SIZE); + RETURN(-EINVAL); + } + + job = cfs_hash_lookup(stats->ojs_hash, jobid); + if (job) + goto found; + + lprocfs_job_cleanup(stats, false); + + job = job_alloc(jobid, stats); + if (job == NULL) + RETURN(-ENOMEM); + + job2 = cfs_hash_findadd_unique(stats->ojs_hash, job->js_jobid, + &job->js_hash); + if (job2 != job) { + job_putref(job); + job = job2; + /* We cannot LASSERT(!list_empty(&job->js_list)) here, + * since we just lost the race for inserting "job" into the + * ojs_list, and some other thread is doing it _right_now_. + * Instead, be content the other thread is doing this, since + * "job2" was initialized in job_alloc() already. 
LU-2163 */ + } else { + LASSERT(list_empty(&job->js_list)); + write_lock(&stats->ojs_lock); + list_add_tail(&job->js_list, &stats->ojs_list); + write_unlock(&stats->ojs_lock); + } + +found: + LASSERT(stats == job->js_jobstats); + job->js_time_latest = ktime_get_real(); + lprocfs_counter_add(job->js_stats, event, amount); + + job_putref(job); + + RETURN(0); +} +EXPORT_SYMBOL(lprocfs_job_stats_log); + +void lprocfs_job_stats_fini(struct obd_device *obd) +{ + struct obd_job_stats *stats = &obd->u.obt.obt_jobstats; + + if (stats->ojs_hash == NULL) + return; + + lprocfs_job_cleanup(stats, true); + cfs_hash_putref(stats->ojs_hash); + stats->ojs_hash = NULL; + LASSERT(list_empty(&stats->ojs_list)); +} +EXPORT_SYMBOL(lprocfs_job_stats_fini); + +static void *lprocfs_jobstats_seq_start(struct seq_file *p, loff_t *pos) +{ + struct obd_job_stats *stats = p->private; + loff_t off = *pos; + struct job_stat *job; + + read_lock(&stats->ojs_lock); + if (off == 0) + return SEQ_START_TOKEN; + off--; + list_for_each_entry(job, &stats->ojs_list, js_list) { + if (!off--) + return job; + } + return NULL; +} + +static void lprocfs_jobstats_seq_stop(struct seq_file *p, void *v) +{ + struct obd_job_stats *stats = p->private; + + read_unlock(&stats->ojs_lock); +} + +static void *lprocfs_jobstats_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct obd_job_stats *stats = p->private; + struct job_stat *job; + struct list_head *next; + + ++*pos; + if (v == SEQ_START_TOKEN) { + next = stats->ojs_list.next; + } else { + job = (struct job_stat *)v; + next = job->js_list.next; + } + + return next == &stats->ojs_list ? NULL : + list_entry(next, struct job_stat, js_list); +} + +/* + * Example of output on MDT: + * + * job_stats: + * - job_id: dd.4854 + * snapshot_time: 1322494486.123456789 + * start_time: 1322494476.012345678 + * elapsed_time: 10.111111111 + * open: { samples: 1, unit: reqs } + * close: { samples: 1, unit: reqs } + * mknod: { samples: 0, unit: reqs } + * link: { samples: 0, unit: reqs } + * unlink: { samples: 0, unit: reqs } + * mkdir: { samples: 0, unit: reqs } + * rmdir: { samples: 0, unit: reqs } + * rename: { samples: 0, unit: reqs } + * getattr: { samples: 1, unit: reqs } + * setattr: { samples: 0, unit: reqs } + * getxattr: { samples: 0, unit: reqs } + * setxattr: { samples: 0, unit: reqs } + * statfs: { samples: 0, unit: reqs } + * sync: { samples: 0, unit: reqs } + * + * Example of output on OST: + * + * job_stats: + * - job_id dd.4854 + * snapshot_time: 1322494602.123456789 + * start_time: 1322494592.987654321 + * elapsed_time: 9.135802468 + * read: { samples: 0, unit: bytes, min: 0, max: 0, sum: 0 } + * write: { samples: 1, unit: bytes, min: 4096, max: 4096, sum: 4096 } + * setattr: { samples: 0, unit: reqs } + * punch: { samples: 0, unit: reqs } + * sync: { samples: 0, unit: reqs } + */ + +static const char spaces[] = " "; + +static int inline width(const char *str, int len) +{ + return len - min((int)strlen(str), 15); +} + +static int lprocfs_jobstats_seq_show(struct seq_file *p, void *v) +{ + struct job_stat *job = v; + struct lprocfs_stats *s; + struct lprocfs_counter ret; + struct lprocfs_counter_header *cntr_header; + char escaped[LUSTRE_JOBID_SIZE * 4] = ""; + char *quote = "", *c, *end; + int i, joblen = 0; + + if (v == SEQ_START_TOKEN) { + seq_printf(p, "job_stats:\n"); + return 0; + } + + /* Quote and escape jobid characters to escape hex codes "\xHH" if + * it contains any non-standard characters (space, newline, etc), + * so it will be confined to single line and not break 
parsing. + */ + for (c = job->js_jobid, end = job->js_jobid + sizeof(job->js_jobid); + c < end && *c != '\0'; + c++, joblen++) { + if (!isalnum(*c) && + *c != '.' && *c != '@' && *c != '-' && *c != '_') { + quote = "\""; + snprintf(escaped + joblen, sizeof(escaped), "\\x%02X", + (unsigned char)*c); + joblen += 3; + } else { + escaped[joblen] = *c; + } + } + + seq_printf(p, "- %-16s %s%*s%s\n", + "job_id:", quote, joblen, escaped, quote); + lprocfs_stats_header(p, job->js_time_latest, job->js_stats->ls_init, + 16, ":", true, " "); + + s = job->js_stats; + for (i = 0; i < s->ls_num; i++) { + cntr_header = &s->ls_cnt_header[i]; + lprocfs_stats_collect(s, i, &ret); + + seq_printf(p, " %s:%.*s { samples: %11llu", + cntr_header->lc_name, + width(cntr_header->lc_name, 15), spaces, + ret.lc_count); + if (cntr_header->lc_units[0] != '\0') + seq_printf(p, ", unit: %5s", cntr_header->lc_units); + + if (cntr_header->lc_config & LPROCFS_CNTR_AVGMINMAX) { + seq_printf(p, ", min: %8llu, max: %8llu, sum: %16llu", + ret.lc_count ? ret.lc_min : 0, + ret.lc_count ? ret.lc_max : 0, + ret.lc_count ? ret.lc_sum : 0); + } + if (cntr_header->lc_config & LPROCFS_CNTR_STDDEV) { + seq_printf(p, ", sumsq: %18llu", + ret.lc_count ? ret.lc_sumsquare : 0); + } + + seq_printf(p, " }\n"); + + } + + return 0; +} + +static const struct seq_operations lprocfs_jobstats_seq_sops = { + .start = lprocfs_jobstats_seq_start, + .stop = lprocfs_jobstats_seq_stop, + .next = lprocfs_jobstats_seq_next, + .show = lprocfs_jobstats_seq_show, +}; + +static int lprocfs_jobstats_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc; + + rc = seq_open(file, &lprocfs_jobstats_seq_sops); + if (rc) + return rc; + seq = file->private_data; + seq->private = pde_data(inode); + return 0; +} + +static ssize_t lprocfs_jobstats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct obd_job_stats *stats = seq->private; + char jobid[LUSTRE_JOBID_SIZE]; + struct job_stat *job; + + if (len == 0 || len >= LUSTRE_JOBID_SIZE) + return -EINVAL; + + if (stats->ojs_hash == NULL) + return -ENODEV; + + if (copy_from_user(jobid, buf, len)) + return -EFAULT; + jobid[len] = 0; + + /* Trim '\n' if any */ + if (jobid[len - 1] == '\n') + jobid[len - 1] = 0; + + if (strcmp(jobid, "clear") == 0) { + lprocfs_job_cleanup(stats, true); + + return len; + } + + if (strlen(jobid) == 0) + return -EINVAL; + + job = cfs_hash_lookup(stats->ojs_hash, jobid); + if (!job) + return -EINVAL; + + cfs_hash_del_key(stats->ojs_hash, jobid); + + job_putref(job); + return len; +} + +/** + * Clean up the seq file state when the /proc file is closed. + * + * This also expires old job stats from the cache after they have been + * printed in case the system is idle and not generating new jobstats. 
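+ * Only entries that have been idle for longer than the configured
+ * cleanup interval are dropped; the cutoff passed to
+ * job_cleanup_iter_callback() is "now" minus twice the stored
+ * half-interval (see lprocfs_job_cleanup()).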
+ * + * \param[in] inode struct inode for seq file being closed + * \param[in] file struct file for seq file being closed + * + * \retval 0 on success + * \retval negative errno on failure + */ +static int lprocfs_jobstats_seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct obd_job_stats *stats = seq->private; + + lprocfs_job_cleanup(stats, false); + + return lprocfs_seq_release(inode, file); +} + +static const struct proc_ops lprocfs_jobstats_seq_fops = { + PROC_OWNER(THIS_MODULE) + .proc_open = lprocfs_jobstats_seq_open, + .proc_read = seq_read, + .proc_write = lprocfs_jobstats_seq_write, + .proc_lseek = seq_lseek, + .proc_release = lprocfs_jobstats_seq_release, +}; + +int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num, + cntr_init_callback init_fn) +{ + struct proc_dir_entry *entry; + struct obd_job_stats *stats; + ENTRY; + + LASSERT(obd->obd_proc_entry != NULL); + LASSERT(obd->obd_type->typ_name); + + if (cntr_num <= 0) + RETURN(-EINVAL); + + if (init_fn == NULL) + RETURN(-EINVAL); + + /* Currently needs to be a target due to the use of obt_jobstats. */ + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDT_NAME) != 0 && + strcmp(obd->obd_type->typ_name, LUSTRE_OST_NAME) != 0) { + CERROR("%s: invalid device type %s for job stats: rc = %d\n", + obd->obd_name, obd->obd_type->typ_name, -EINVAL); + RETURN(-EINVAL); + } + stats = &obd->u.obt.obt_jobstats; + + LASSERT(stats->ojs_hash == NULL); + stats->ojs_hash = cfs_hash_create("JOB_STATS", + HASH_JOB_STATS_CUR_BITS, + HASH_JOB_STATS_MAX_BITS, + HASH_JOB_STATS_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &job_stats_hash_ops, + CFS_HASH_DEFAULT); + if (stats->ojs_hash == NULL) + RETURN(-ENOMEM); + + INIT_LIST_HEAD(&stats->ojs_list); + rwlock_init(&stats->ojs_lock); + stats->ojs_cntr_num = cntr_num; + stats->ojs_cntr_init_fn = init_fn; + /* Store 1/2 the actual interval, since we use that the most, and + * it is easier to work with. 
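+	 * job_cleanup_interval_show() below doubles the stored value and
+	 * job_cleanup_interval_store() halves the user input, so userspace
+	 * always reads and writes the full interval in seconds.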
+ */ + stats->ojs_cleanup_interval = ktime_set(600 / 2, 0); /* default 10 min*/ + stats->ojs_cleanup_last = ktime_get_real(); + + entry = lprocfs_add_simple(obd->obd_proc_entry, "job_stats", stats, + &lprocfs_jobstats_seq_fops); + if (IS_ERR(entry)) { + lprocfs_job_stats_fini(obd); + RETURN(-ENOMEM); + } + RETURN(0); +} +EXPORT_SYMBOL(lprocfs_job_stats_init); +#endif /* CONFIG_PROC_FS*/ + +ssize_t job_cleanup_interval_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_job_stats *stats; + struct timespec64 ts; + + stats = &obd->u.obt.obt_jobstats; + ts = ktime_to_timespec64(stats->ojs_cleanup_interval); + + return scnprintf(buf, PAGE_SIZE, "%lld\n", (long long)ts.tv_sec * 2); +} +EXPORT_SYMBOL(job_cleanup_interval_show); + +ssize_t job_cleanup_interval_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_job_stats *stats; + unsigned int val; + int rc; + + stats = &obd->u.obt.obt_jobstats; + + rc = kstrtouint(buffer, 0, &val); + if (rc) + return rc; + + stats->ojs_cleanup_interval = ktime_set(val / 2, 0); + lprocfs_job_cleanup(stats, false); + + return count; +} +EXPORT_SYMBOL(job_cleanup_interval_store); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c new file mode 100644 index 0000000000000..bbb8e1c569215 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c @@ -0,0 +1,2331 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lustre/obdclass/lprocfs_status.c
+ *
+ * Author: Hariharan Thantry
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <obd_class.h>
+#include <lprocfs_status.h>
+
+#ifdef CONFIG_PROC_FS
+
+static int lprocfs_no_percpu_stats = 0;
+module_param(lprocfs_no_percpu_stats, int, 0644);
+MODULE_PARM_DESC(lprocfs_no_percpu_stats, "Do not alloc percpu data for lprocfs stats");
+
+#define MAX_STRING_SIZE 128
+
+int lprocfs_single_release(struct inode *inode, struct file *file)
+{
+	return single_release(inode, file);
+}
+EXPORT_SYMBOL(lprocfs_single_release);
+
+int lprocfs_seq_release(struct inode *inode, struct file *file)
+{
+	return seq_release(inode, file);
+}
+EXPORT_SYMBOL(lprocfs_seq_release);
+
+static umode_t default_mode(const struct proc_ops *ops)
+{
+	umode_t mode = 0;
+
+	if (ops->proc_read)
+		mode = 0444;
+	if (ops->proc_write)
+		mode |= 0200;
+
+	return mode;
+}
+
+struct proc_dir_entry *
+lprocfs_add_simple(struct proc_dir_entry *root, char *name,
+		   void *data, const struct proc_ops *fops)
+{
+	struct proc_dir_entry *proc;
+	umode_t mode;
+
+	if (!root || !name || !fops)
+		return ERR_PTR(-EINVAL);
+
+	mode = default_mode(fops);
+	proc = proc_create_data(name, mode, root, fops, data);
+	if (!proc) {
+		CERROR("LprocFS: No memory to create /proc entry %s\n",
+		       name);
+		return ERR_PTR(-ENOMEM);
+	}
+	return proc;
+}
+EXPORT_SYMBOL(lprocfs_add_simple);
+
+struct proc_dir_entry *lprocfs_add_symlink(const char *name,
+					   struct proc_dir_entry *parent,
+					   const char *format, ...)
+{
+	struct proc_dir_entry *entry;
+	char *dest;
+	va_list ap;
+
+	if (!parent || !format)
+		return NULL;
+
+	OBD_ALLOC_WAIT(dest, MAX_STRING_SIZE + 1);
+	if (!dest)
+		return NULL;
+
+	va_start(ap, format);
+	vsnprintf(dest, MAX_STRING_SIZE, format, ap);
+	va_end(ap);
+
+	entry = proc_symlink(name, parent, dest);
+	if (!entry)
+		CERROR("LprocFS: Could not create symbolic link from "
+		       "%s to %s\n", name, dest);
+
+	OBD_FREE(dest, MAX_STRING_SIZE + 1);
+	return entry;
+}
+EXPORT_SYMBOL(lprocfs_add_symlink);
+
+static const struct file_operations ldebugfs_empty_ops = { };
+
+void ldebugfs_add_vars(struct dentry *parent, struct ldebugfs_vars *list,
+		       void *data)
+{
+	if (IS_ERR_OR_NULL(parent) || IS_ERR_OR_NULL(list))
+		return;
+
+	while (list->name) {
+		umode_t mode = 0;
+
+		if (list->proc_mode != 0000) {
+			mode = list->proc_mode;
+		} else if (list->fops) {
+			if (list->fops->read)
+				mode = 0444;
+			if (list->fops->write)
+				mode |= 0200;
+		}
+		debugfs_create_file(list->name, mode, parent,
+				    list->data ? : data,
+				    list->fops ? : &ldebugfs_empty_ops);
+		list++;
+	}
+}
+EXPORT_SYMBOL_GPL(ldebugfs_add_vars);
+
+static const struct proc_ops lprocfs_empty_ops = { };
+
+/**
+ * Add /proc entries.
+ *
+ * \param root [in]  The parent proc entry on which new entry will be added.
+ * \param list [in]  Array of proc entries to be added.
+ * \param data [in]  The argument to be passed when entries read/write routines
+ *                   are called through /proc file.
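+ *
+ * Entries may leave proc_mode unset; the mode is then derived from the
+ * entry's fops via default_mode() (0444 if readable, 0200 if writable).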
+ * + * \retval 0 on success + * < 0 on error + */ +int +lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *list, + void *data) +{ + if (!root || !list) + return -EINVAL; + + while (list->name) { + struct proc_dir_entry *proc; + umode_t mode = 0; + + if (list->proc_mode) + mode = list->proc_mode; + else if (list->fops) + mode = default_mode(list->fops); + proc = proc_create_data(list->name, mode, root, + list->fops ?: &lprocfs_empty_ops, + list->data ?: data); + if (!proc) + return -ENOMEM; + list++; + } + return 0; +} +EXPORT_SYMBOL(lprocfs_add_vars); + +void lprocfs_remove(struct proc_dir_entry **rooth) +{ + proc_remove(*rooth); + *rooth = NULL; +} +EXPORT_SYMBOL(lprocfs_remove); + +void lprocfs_remove_proc_entry(const char *name, struct proc_dir_entry *parent) +{ + LASSERT(parent != NULL); + remove_proc_entry(name, parent); +} +EXPORT_SYMBOL(lprocfs_remove_proc_entry); + +struct proc_dir_entry * +lprocfs_register(const char *name, struct proc_dir_entry *parent, + struct lprocfs_vars *list, void *data) +{ + struct proc_dir_entry *newchild; + + newchild = proc_mkdir(name, parent); + if (!newchild) + return ERR_PTR(-ENOMEM); + + if (list) { + int rc = lprocfs_add_vars(newchild, list, data); + if (rc) { + lprocfs_remove(&newchild); + return ERR_PTR(rc); + } + } + return newchild; +} +EXPORT_SYMBOL(lprocfs_register); + +/* Generic callbacks */ +int lprocfs_uuid_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + + LASSERT(obd != NULL); + seq_printf(m, "%s\n", obd->obd_uuid.uuid); + return 0; +} +EXPORT_SYMBOL(lprocfs_uuid_seq_show); + +static ssize_t uuid_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return sprintf(buf, "%s\n", obd->obd_uuid.uuid); +} +LUSTRE_RO_ATTR(uuid); + +static ssize_t blocksize_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + OBD_STATFS_NODELAY); + if (!rc) + return sprintf(buf, "%u\n", osfs.os_bsize); + + return rc; +} +LUSTRE_RO_ATTR(blocksize); + +static ssize_t kbytestotal_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + OBD_STATFS_NODELAY); + if (!rc) { + u32 blk_size = osfs.os_bsize >> 10; + u64 result = osfs.os_blocks; + + result *= rounddown_pow_of_two(blk_size ?: 1); + return sprintf(buf, "%llu\n", result); + } + + return rc; +} +LUSTRE_RO_ATTR(kbytestotal); + +static ssize_t kbytesfree_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + OBD_STATFS_NODELAY); + if (!rc) { + u32 blk_size = osfs.os_bsize >> 10; + u64 result = osfs.os_bfree; + + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); + } + + return rc; +} +LUSTRE_RO_ATTR(kbytesfree); + +static ssize_t kbytesavail_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = 
container_of(kobj, struct obd_device,
+					      obd_kset.kobj);
+	struct obd_statfs osfs;
+	int rc;
+
+	rc = obd_statfs(NULL, obd->obd_self_export, &osfs,
+			ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS,
+			OBD_STATFS_NODELAY);
+	if (!rc) {
+		u32 blk_size = osfs.os_bsize >> 10;
+		u64 result = osfs.os_bavail;
+
+		while (blk_size >>= 1)
+			result <<= 1;
+
+		return sprintf(buf, "%llu\n", result);
+	}
+
+	return rc;
+}
+LUSTRE_RO_ATTR(kbytesavail);
+
+static ssize_t filestotal_show(struct kobject *kobj, struct attribute *attr,
+			       char *buf)
+{
+	struct obd_device *obd = container_of(kobj, struct obd_device,
+					      obd_kset.kobj);
+	struct obd_statfs osfs;
+	int rc;
+
+	rc = obd_statfs(NULL, obd->obd_self_export, &osfs,
+			ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS,
+			OBD_STATFS_NODELAY);
+	if (!rc)
+		return sprintf(buf, "%llu\n", osfs.os_files);
+
+	return rc;
+}
+LUSTRE_RO_ATTR(filestotal);
+
+static ssize_t filesfree_show(struct kobject *kobj, struct attribute *attr,
+			      char *buf)
+{
+	struct obd_device *obd = container_of(kobj, struct obd_device,
+					      obd_kset.kobj);
+	struct obd_statfs osfs;
+	int rc;
+
+	rc = obd_statfs(NULL, obd->obd_self_export, &osfs,
+			ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS,
+			OBD_STATFS_NODELAY);
+	if (!rc)
+		return sprintf(buf, "%llu\n", osfs.os_ffree);
+
+	return rc;
+}
+LUSTRE_RO_ATTR(filesfree);
+
+ssize_t conn_uuid_show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+	struct obd_device *obd = container_of(kobj, struct obd_device,
+					      obd_kset.kobj);
+	struct obd_import *imp;
+	struct ptlrpc_connection *conn;
+	ssize_t count;
+
+	with_imp_locked(obd, imp, count) {
+		conn = imp->imp_connection;
+		if (conn)
+			count = sprintf(buf, "%s\n", conn->c_remote_uuid.uuid);
+		else
+			count = sprintf(buf, "%s\n", "<none>");
+	}
+
+	return count;
+}
+EXPORT_SYMBOL(conn_uuid_show);
+
+int lprocfs_server_uuid_seq_show(struct seq_file *m, void *data)
+{
+	struct obd_device *obd = data;
+	struct obd_import *imp;
+	const char *imp_state_name = NULL;
+	int rc = 0;
+
+	LASSERT(obd != NULL);
+	with_imp_locked(obd, imp, rc) {
+		imp_state_name = ptlrpc_import_state_name(imp->imp_state);
+		seq_printf(m, "%s\t%s%s\n", obd2cli_tgt(obd), imp_state_name,
+			   imp->imp_deactive ? "\tDEACTIVATED" : "");
+	}
+
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_server_uuid_seq_show);
+
+/** add up per-cpu counters */
+
+/**
+ * Lock statistics structure for access, possibly only on this CPU.
+ *
+ * The statistics struct may be allocated with per-CPU structures for
+ * efficient concurrent update (usually only on server-wide stats), or
+ * as a single global struct (e.g. for per-client or per-job statistics),
+ * so the required locking depends on the type of structure allocated.
+ *
+ * For per-CPU statistics, pin the thread to the current cpuid so that it
+ * will only access the statistics for that CPU.  If the stats structure
+ * for the current CPU has not been allocated (or previously freed),
+ * allocate it now.  The per-CPU statistics do not need locking since
+ * the thread is pinned to the CPU during update.
+ *
+ * For global statistics, lock the stats structure to prevent concurrent update.
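+ *
+ * Callers must pair each call with lprocfs_stats_unlock() invoked with
+ * the same opc and flags arguments.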
+ * + * \param[in] stats statistics structure to lock + * \param[in] opc type of operation: + * LPROCFS_GET_SMP_ID: "lock" and return current CPU index + * for incrementing statistics for that CPU + * LPROCFS_GET_NUM_CPU: "lock" and return number of used + * CPU indices to iterate over all indices + * \param[out] flags CPU interrupt saved state for IRQ-safe locking + * + * \retval cpuid of current thread or number of allocated structs + * \retval negative on error (only for opc LPROCFS_GET_SMP_ID + per-CPU stats) + */ +int lprocfs_stats_lock(struct lprocfs_stats *stats, + enum lprocfs_stats_lock_ops opc, + unsigned long *flags) +{ + if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) { + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) + spin_lock_irqsave(&stats->ls_lock, *flags); + else + spin_lock(&stats->ls_lock); + return opc == LPROCFS_GET_NUM_CPU ? 1 : 0; + } + + switch (opc) { + case LPROCFS_GET_SMP_ID: { + unsigned int cpuid = get_cpu(); + + if (unlikely(!stats->ls_percpu[cpuid])) { + int rc = lprocfs_stats_alloc_one(stats, cpuid); + + if (rc < 0) { + put_cpu(); + return rc; + } + } + return cpuid; + } + case LPROCFS_GET_NUM_CPU: + return stats->ls_biggest_alloc_num; + default: + LBUG(); + } +} + +/** + * Unlock statistics structure after access. + * + * Unlock the lock acquired via lprocfs_stats_lock() for global statistics, + * or unpin this thread from the current cpuid for per-CPU statistics. + * + * This function must be called using the same arguments as used when calling + * lprocfs_stats_lock() so that the correct operation can be performed. + * + * \param[in] stats statistics structure to unlock + * \param[in] opc type of operation (current cpuid or number of structs) + * \param[in] flags CPU interrupt saved state for IRQ-safe locking + */ +void lprocfs_stats_unlock(struct lprocfs_stats *stats, + enum lprocfs_stats_lock_ops opc, + unsigned long *flags) +{ + if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) { + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) + spin_unlock_irqrestore(&stats->ls_lock, *flags); + else + spin_unlock(&stats->ls_lock); + } else if (opc == LPROCFS_GET_SMP_ID) { + put_cpu(); + } +} + +/** add up per-cpu counters */ +void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, + struct lprocfs_counter *cnt) +{ + unsigned int num_entry; + struct lprocfs_counter *percpu_cntr; + int i; + unsigned long flags = 0; + + memset(cnt, 0, sizeof(*cnt)); + + if (!stats) { + /* set count to 1 to avoid divide-by-zero errs in callers */ + cnt->lc_count = 1; + return; + } + + cnt->lc_min = LC_MIN_INIT; + + num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); + + for (i = 0; i < num_entry; i++) { + if (!stats->ls_percpu[i]) + continue; + percpu_cntr = lprocfs_stats_counter_get(stats, i, idx); + + cnt->lc_count += percpu_cntr->lc_count; + cnt->lc_sum += percpu_cntr->lc_sum; + if (percpu_cntr->lc_min < cnt->lc_min) + cnt->lc_min = percpu_cntr->lc_min; + if (percpu_cntr->lc_max > cnt->lc_max) + cnt->lc_max = percpu_cntr->lc_max; + cnt->lc_sumsquare += percpu_cntr->lc_sumsquare; + } + + lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); +} + +static void obd_import_flags2str(struct obd_import *imp, struct seq_file *m) +{ + bool first = true; + + if (imp->imp_obd->obd_no_recov) { + seq_printf(m, "no_recov"); + first = false; + } + + flag2str(imp, invalid); + flag2str(imp, deactive); + flag2str(imp, replayable); + flag2str(imp, delayed_recovery); + flag2str(imp, vbr_failed); + flag2str(imp, pingable); + flag2str(imp, resend_replay); + 
flag2str(imp, no_pinger_recover); + flag2str(imp, connect_tried); +} + +static const char *const obd_connect_names[] = { + /* flags names */ + "read_only", + "lov_index", + "connect_from_mds", + "write_grant", + "server_lock", + "version", + "request_portal", + "acl", + "xattr", + "create_on_write", + "truncate_lock", + "initial_transno", + "inode_bit_locks", + "barrier", + "getattr_by_fid", + "no_oh_for_devices", + "remote_client", + "remote_client_by_force", + "max_byte_per_rpc", + "64bit_qdata", + "mds_capability", + "oss_capability", + "early_lock_cancel", + "som", + "adaptive_timeouts", + "lru_resize", + "mds_mds_connection", + "real_conn", + "change_qunit_size", + "alt_checksum_algorithm", + "fid_is_enabled", + "version_recovery", + "pools", + "grant_shrink", + "skip_orphan", + "large_ea", + "full20", + "layout_lock", + "64bithash", + "object_max_bytes", + "imp_recov", + "jobstats", + "umask", + "einprogress", + "grant_param", + "flock_owner", + "lvb_type", + "nanoseconds_times", + "lightweight_conn", + "short_io", + "pingless", + "flock_deadlock", + "disp_stripe", + "open_by_fid", + "lfsck", + "unknown", + "unlink_close", + "multi_mod_rpcs", + "dir_stripe", + "subtree", + "lockahead", + "bulk_mbits", + "compact_obdo", + "second_flags", + /* flags2 names */ + "file_secctx", /* 0x01 */ + "lockaheadv2", /* 0x02 */ + "dir_migrate", /* 0x04 */ + "sum_statfs", /* 0x08 */ + "overstriping", /* 0x10 */ + "flr", /* 0x20 */ + "wbc", /* 0x40 */ + "lock_convert", /* 0x80 */ + "archive_id_array", /* 0x100 */ + "increasing_xid", /* 0x200 */ + "selinux_policy", /* 0x400 */ + "lsom", /* 0x800 */ + "pcc", /* 0x1000 */ + "crush", /* 0x2000 */ + "async_discard", /* 0x4000 */ + "client_encryption", /* 0x8000 */ + "fidmap", /* 0x10000 */ + "getattr_pfid", /* 0x20000 */ + "lseek", /* 0x40000 */ + "dom_lvb", /* 0x80000 */ + "reply_mbits", /* 0x100000 */ + "mode_convert", /* 0x200000 */ + "batch_rpc", /* 0x400000 */ + "pcc_ro", /* 0x800000 */ + "mne_nid_type", /* 0x1000000 */ + "lock_contend", /* 0x2000000 */ + "atomic_open_lock", /* 0x4000000 */ + "name_encryption", /* 0x8000000 */ + "", "", "", "", "", "", "", "", + "", "", "", "", "", "", "", "", + "", "", "", "", "", "", "", "", + "", "", "", "", "", "", "", + "mdll_bypass", /* 0x800000000000000 */ + "mdll", /* 0x1000000000000000 */ + "mdll_auto_refresh", /* 0x2000000000000000 */ + "", "", + NULL +}; + +void obd_connect_seq_flags2str(struct seq_file *m, __u64 flags, __u64 flags2, + const char *sep) +{ + bool first = true; + __u64 mask; + int i; + + for (i = 0, mask = 1; i < 64; i++, mask <<= 1) { + if (flags & mask) { + seq_printf(m, "%s%s", + first ? "" : sep, obd_connect_names[i]); + first = false; + } + } + + if (flags & ~(mask - 1)) { + seq_printf(m, "%sunknown_%#llx", + first ? "" : sep, flags & ~(mask - 1)); + first = false; + } + + if (!(flags & OBD_CONNECT_FLAGS2) || flags2 == 0) + return; + + for (i = 64, mask = 1; obd_connect_names[i] != NULL; i++, mask <<= 1) { + if (flags2 & mask) { + seq_printf(m, "%s%s", + first ? "" : sep, obd_connect_names[i]); + first = false; + } + } + + if (flags2 & ~(mask - 1)) { + seq_printf(m, "%sunknown2_%#llx", + first ? "" : sep, flags2 & ~(mask - 1)); + first = false; + } +} +EXPORT_SYMBOL(obd_connect_seq_flags2str); + +int obd_connect_flags2str(char *page, int count, __u64 flags, __u64 flags2, + const char *sep) +{ + __u64 mask; + int i, ret = 0; + + for (i = 0, mask = 1; i < 64; i++, mask <<= 1) { + if (flags & mask) + ret += snprintf(page + ret, count - ret, "%s%s", + ret ? 
sep : "", obd_connect_names[i]); + } + + if (flags & ~(mask - 1)) + ret += snprintf(page + ret, count - ret, + "%sunknown_%#llx", + ret ? sep : "", flags & ~(mask - 1)); + + if (!(flags & OBD_CONNECT_FLAGS2) || flags2 == 0) + return ret; + + for (i = 64, mask = 1; obd_connect_names[i] != NULL; i++, mask <<= 1) { + if (flags2 & mask) + ret += snprintf(page + ret, count - ret, "%s%s", + ret ? sep : "", obd_connect_names[i]); + } + + if (flags2 & ~(mask - 1)) + ret += snprintf(page + ret, count - ret, + "%sunknown2_%#llx", + ret ? sep : "", flags2 & ~(mask - 1)); + + return ret; +} +EXPORT_SYMBOL(obd_connect_flags2str); + +void +obd_connect_data_seqprint(struct seq_file *m, struct obd_connect_data *ocd) +{ + __u64 flags; + + LASSERT(ocd != NULL); + flags = ocd->ocd_connect_flags; + + seq_printf(m, " connect_data:\n" + " flags: %#llx\n" + " instance: %u\n", + ocd->ocd_connect_flags, + ocd->ocd_instance); + if (flags & OBD_CONNECT_VERSION) + seq_printf(m, " target_version: %u.%u.%u.%u\n", + OBD_OCD_VERSION_MAJOR(ocd->ocd_version), + OBD_OCD_VERSION_MINOR(ocd->ocd_version), + OBD_OCD_VERSION_PATCH(ocd->ocd_version), + OBD_OCD_VERSION_FIX(ocd->ocd_version)); + if (flags & OBD_CONNECT_MDS) + seq_printf(m, " mdt_index: %d\n", ocd->ocd_group); + if (flags & OBD_CONNECT_GRANT) + seq_printf(m, " initial_grant: %d\n", ocd->ocd_grant); + if (flags & OBD_CONNECT_INDEX) + seq_printf(m, " target_index: %u\n", ocd->ocd_index); + if (flags & OBD_CONNECT_BRW_SIZE) + seq_printf(m, " max_brw_size: %d\n", ocd->ocd_brw_size); + if (flags & OBD_CONNECT_IBITS) + seq_printf(m, " ibits_known: %#llx\n", + ocd->ocd_ibits_known); + if (flags & OBD_CONNECT_GRANT_PARAM) + seq_printf(m, " grant_block_size: %d\n" + " grant_inode_size: %d\n" + " grant_max_extent_size: %d\n" + " grant_extent_tax: %d\n", + 1 << ocd->ocd_grant_blkbits, + 1 << ocd->ocd_grant_inobits, + ocd->ocd_grant_max_blks << ocd->ocd_grant_blkbits, + ocd->ocd_grant_tax_kb << 10); + if (flags & OBD_CONNECT_TRANSNO) + seq_printf(m, " first_transno: %#llx\n", + ocd->ocd_transno); + if (flags & OBD_CONNECT_CKSUM) + seq_printf(m, " cksum_types: %#x\n", + ocd->ocd_cksum_types); + if (flags & OBD_CONNECT_MAX_EASIZE) + seq_printf(m, " max_easize: %d\n", ocd->ocd_max_easize); + if (flags & OBD_CONNECT_MAXBYTES) + seq_printf(m, " max_object_bytes: %llu\n", + ocd->ocd_maxbytes); + if (flags & OBD_CONNECT_MULTIMODRPCS) + seq_printf(m, " max_mod_rpcs: %hu\n", + ocd->ocd_maxmodrpcs); +} + +static void lprocfs_import_seq_show_locked(struct seq_file *m, + struct obd_device *obd, + struct obd_import *imp) +{ + char nidstr[LNET_NIDSTR_SIZE]; + struct lprocfs_counter ret; + struct lprocfs_counter_header *header; + struct obd_import_conn *conn; + struct obd_connect_data *ocd; + int j; + int k; + int rw = 0; + + ocd = &imp->imp_connect_data; + + seq_printf(m, "import:\n" + " name: %s\n" + " target: %s\n" + " state: %s\n" + " connect_flags: [ ", + obd->obd_name, + obd2cli_tgt(obd), + ptlrpc_import_state_name(imp->imp_state)); + obd_connect_seq_flags2str(m, imp->imp_connect_data.ocd_connect_flags, + imp->imp_connect_data.ocd_connect_flags2, + ", "); + seq_printf(m, " ]\n"); + obd_connect_data_seqprint(m, ocd); + seq_printf(m, " import_flags: [ "); + obd_import_flags2str(imp, m); + + seq_printf(m, " ]\n" + " connection:\n" + " failover_nids: [ "); + spin_lock(&imp->imp_lock); + j = 0; + list_for_each_entry(conn, &imp->imp_conn_list, oic_item) { + libcfs_nidstr_r(&conn->oic_conn->c_peer.nid, + nidstr, sizeof(nidstr)); + seq_printf(m, "%s%s", j ? 
", " : "", nidstr); + j++; + } + if (imp->imp_connection) + libcfs_nidstr_r(&imp->imp_connection->c_peer.nid, + nidstr, sizeof(nidstr)); + else + strncpy(nidstr, "", sizeof(nidstr)); + seq_printf(m, " ]\n" + " current_connection: %s\n" + " connection_attempts: %u\n" + " generation: %u\n" + " in-progress_invalidations: %u\n" + " idle: %lld sec\n", + nidstr, + imp->imp_conn_cnt, + imp->imp_generation, + atomic_read(&imp->imp_inval_count), + ktime_get_real_seconds() - imp->imp_last_reply_time); + spin_unlock(&imp->imp_lock); + + if (!obd->obd_svc_stats) + return; + + header = &obd->obd_svc_stats->ls_cnt_header[PTLRPC_REQWAIT_CNTR]; + lprocfs_stats_collect(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR, &ret); + if (ret.lc_count != 0) + ret.lc_sum = div64_s64(ret.lc_sum, ret.lc_count); + else + ret.lc_sum = 0; + seq_printf(m, " rpcs:\n" + " inflight: %u\n" + " unregistering: %u\n" + " timeouts: %u\n" + " avg_waittime: %llu %s\n", + atomic_read(&imp->imp_inflight), + atomic_read(&imp->imp_unregistering), + atomic_read(&imp->imp_timeouts), + ret.lc_sum, header->lc_units); + + k = 0; + for(j = 0; j < IMP_AT_MAX_PORTALS; j++) { + if (imp->imp_at.iat_portal[j] == 0) + break; + k = max_t(unsigned int, k, + at_get(&imp->imp_at.iat_service_estimate[j])); + } + seq_printf(m, " service_estimates:\n" + " services: %u sec\n" + " network: %d sec\n", + k, + at_get(&imp->imp_at.iat_net_latency)); + + seq_printf(m, " transactions:\n" + " last_replay: %llu\n" + " peer_committed: %llu\n" + " last_checked: %llu\n", + imp->imp_last_replay_transno, + imp->imp_peer_committed_transno, + imp->imp_last_transno_checked); + + /* avg data rates */ + for (rw = 0; rw <= 1; rw++) { + lprocfs_stats_collect(obd->obd_svc_stats, + PTLRPC_LAST_CNTR + BRW_READ_BYTES + rw, + &ret); + if (ret.lc_sum > 0 && ret.lc_count > 0) { + ret.lc_sum = div64_s64(ret.lc_sum, ret.lc_count); + seq_printf(m, " %s_data_averages:\n" + " bytes_per_rpc: %llu\n", + rw ? 
"write" : "read", + ret.lc_sum); + } + k = (int)ret.lc_sum; + j = opcode_offset(OST_READ + rw) + EXTRA_MAX_OPCODES; + header = &obd->obd_svc_stats->ls_cnt_header[j]; + lprocfs_stats_collect(obd->obd_svc_stats, j, &ret); + if (ret.lc_sum > 0 && ret.lc_count != 0) { + ret.lc_sum = div64_s64(ret.lc_sum, ret.lc_count); + seq_printf(m, " %s_per_rpc: %llu\n", + header->lc_units, ret.lc_sum); + j = (int)ret.lc_sum; + if (j > 0) + seq_printf(m, " MB_per_sec: %u.%.02u\n", + k / j, (100 * k / j) % 100); + } + } +} + +int lprocfs_import_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct obd_import *imp; + int rv; + + LASSERT(obd != NULL); + with_imp_locked(obd, imp, rv) + lprocfs_import_seq_show_locked(m, obd, imp); + return rv; +} +EXPORT_SYMBOL(lprocfs_import_seq_show); + +int lprocfs_state_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct obd_import *imp; + int j, k; + int rc; + + LASSERT(obd != NULL); + with_imp_locked(obd, imp, rc) { + seq_printf(m, "current_state: %s\n", + ptlrpc_import_state_name(imp->imp_state)); + seq_printf(m, "state_history:\n"); + k = imp->imp_state_hist_idx; + for (j = 0; j < IMP_STATE_HIST_LEN; j++) { + struct import_state_hist *ish = + &imp->imp_state_hist[(k + j) % IMP_STATE_HIST_LEN]; + if (ish->ish_state == 0) + continue; + seq_printf(m, " - [ %lld, %s ]\n", (s64)ish->ish_time, + ptlrpc_import_state_name(ish->ish_state)); + } + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_state_seq_show); + +int lprocfs_at_hist_helper(struct seq_file *m, struct adaptive_timeout *at) +{ + int i; + for (i = 0; i < AT_BINS; i++) + seq_printf(m, "%3u ", at->at_hist[i]); + seq_printf(m, "\n"); + return 0; +} +EXPORT_SYMBOL(lprocfs_at_hist_helper); + +/* See also ptlrpc_lprocfs_timeouts_show_seq */ +static void lprocfs_timeouts_seq_show_locked(struct seq_file *m, + struct obd_device *obd, + struct obd_import *imp) +{ + timeout_t cur_timeout, worst_timeout; + time64_t now, worst_timestamp; + int i; + + LASSERT(obd != NULL); + + now = ktime_get_real_seconds(); + + /* Some network health info for kicks */ + seq_printf(m, "%-10s : %lld, %llds ago\n", + "last reply", (s64)imp->imp_last_reply_time, + (s64)(now - imp->imp_last_reply_time)); + + cur_timeout = at_get(&imp->imp_at.iat_net_latency); + worst_timeout = imp->imp_at.iat_net_latency.at_worst_timeout_ever; + worst_timestamp = imp->imp_at.iat_net_latency.at_worst_timestamp; + seq_printf(m, "%-10s : cur %3u worst %3u (at %lld, %llds ago) ", + "network", cur_timeout, worst_timeout, worst_timestamp, + now - worst_timestamp); + lprocfs_at_hist_helper(m, &imp->imp_at.iat_net_latency); + + for(i = 0; i < IMP_AT_MAX_PORTALS; i++) { + struct adaptive_timeout *service_est; + + if (imp->imp_at.iat_portal[i] == 0) + break; + + service_est = &imp->imp_at.iat_service_estimate[i]; + cur_timeout = at_get(service_est); + worst_timeout = service_est->at_worst_timeout_ever; + worst_timestamp = service_est->at_worst_timestamp; + seq_printf(m, "portal %-2d : cur %3u worst %3u (at %lld, %llds ago) ", + imp->imp_at.iat_portal[i], cur_timeout, + worst_timeout, worst_timestamp, + now - worst_timestamp); + lprocfs_at_hist_helper(m, service_est); + } +} + +int lprocfs_timeouts_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct obd_import *imp; + int rc; + + with_imp_locked(obd, imp, rc) + lprocfs_timeouts_seq_show_locked(m, obd, imp); + return rc; +} +EXPORT_SYMBOL(lprocfs_timeouts_seq_show); + +int 
lprocfs_connect_flags_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + __u64 flags; + __u64 flags2; + struct obd_import *imp; + int rc; + + with_imp_locked(obd, imp, rc) { + flags = imp->imp_connect_data.ocd_connect_flags; + flags2 = imp->imp_connect_data.ocd_connect_flags2; + seq_printf(m, "flags=%#llx\n", flags); + seq_printf(m, "flags2=%#llx\n", flags2); + obd_connect_seq_flags2str(m, flags, flags2, "\n"); + seq_printf(m, "\n"); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_connect_flags_seq_show); + +static const struct attribute *obd_def_uuid_attrs[] = { + &lustre_attr_uuid.attr, + NULL, +}; + +static const struct attribute *obd_def_attrs[] = { + &lustre_attr_blocksize.attr, + &lustre_attr_kbytestotal.attr, + &lustre_attr_kbytesfree.attr, + &lustre_attr_kbytesavail.attr, + &lustre_attr_filestotal.attr, + &lustre_attr_filesfree.attr, + &lustre_attr_uuid.attr, + NULL, +}; + +static void obd_sysfs_release(struct kobject *kobj) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + complete(&obd->obd_kobj_unregister); +} + +int lprocfs_obd_setup(struct obd_device *obd, bool uuid_only) +{ + struct ldebugfs_vars *debugfs_vars = NULL; + int rc; + + if (!obd || obd->obd_magic != OBD_DEVICE_MAGIC) + return -ENODEV; + + rc = kobject_set_name(&obd->obd_kset.kobj, "%s", obd->obd_name); + if (rc) + return rc; + + obd->obd_ktype.sysfs_ops = &lustre_sysfs_ops; + obd->obd_ktype.release = obd_sysfs_release; + + obd->obd_kset.kobj.parent = &obd->obd_type->typ_kobj; + obd->obd_kset.kobj.ktype = &obd->obd_ktype; + init_completion(&obd->obd_kobj_unregister); + rc = kset_register(&obd->obd_kset); + if (rc) + return rc; + + if (uuid_only) + obd->obd_attrs = obd_def_uuid_attrs; + else + obd->obd_attrs = obd_def_attrs; + + rc = sysfs_create_files(&obd->obd_kset.kobj, obd->obd_attrs); + if (rc) { + kset_unregister(&obd->obd_kset); + return rc; + } + + if (!obd->obd_type->typ_procroot) + debugfs_vars = obd->obd_debugfs_vars; + obd->obd_debugfs_entry = debugfs_create_dir( + obd->obd_name, obd->obd_type->typ_debugfs_entry); + ldebugfs_add_vars(obd->obd_debugfs_entry, debugfs_vars, obd); + + if (obd->obd_proc_entry || !obd->obd_type->typ_procroot) + GOTO(already_registered, rc); + + obd->obd_proc_entry = lprocfs_register(obd->obd_name, + obd->obd_type->typ_procroot, + obd->obd_vars, obd); + if (IS_ERR(obd->obd_proc_entry)) { + rc = PTR_ERR(obd->obd_proc_entry); + CERROR("error %d setting up lprocfs for %s\n",rc,obd->obd_name); + obd->obd_proc_entry = NULL; + + debugfs_remove_recursive(obd->obd_debugfs_entry); + obd->obd_debugfs_entry = NULL; + + sysfs_remove_files(&obd->obd_kset.kobj, obd->obd_attrs); + obd->obd_attrs = NULL; + kset_unregister(&obd->obd_kset); + return rc; + } +already_registered: + return rc; +} +EXPORT_SYMBOL(lprocfs_obd_setup); + +int lprocfs_obd_cleanup(struct obd_device *obd) +{ + if (!obd) + return -EINVAL; + + if (obd->obd_proc_exports_entry) { + /* Should be no exports left */ + lprocfs_remove(&obd->obd_proc_exports_entry); + obd->obd_proc_exports_entry = NULL; + } + + if (obd->obd_proc_entry) { + lprocfs_remove(&obd->obd_proc_entry); + obd->obd_proc_entry = NULL; + } + + debugfs_remove_recursive(obd->obd_debugfs_entry); + obd->obd_debugfs_entry = NULL; + + /* obd device never allocated a kset */ + if (!obd->obd_kset.kobj.state_initialized) + return 0; + + if (obd->obd_attrs) { + sysfs_remove_files(&obd->obd_kset.kobj, obd->obd_attrs); + obd->obd_attrs = NULL; + } + + kset_unregister(&obd->obd_kset); + 
wait_for_completion(&obd->obd_kobj_unregister); + return 0; +} +EXPORT_SYMBOL(lprocfs_obd_cleanup); + +int lprocfs_stats_alloc_one(struct lprocfs_stats *stats, unsigned int cpuid) +{ + struct lprocfs_counter *cntr; + unsigned int percpusize; + int rc = -ENOMEM; + unsigned long flags = 0; + int i; + + LASSERT(stats->ls_percpu[cpuid] == NULL); + LASSERT((stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) == 0); + + percpusize = lprocfs_stats_counter_size(stats); + LIBCFS_ALLOC_ATOMIC(stats->ls_percpu[cpuid], percpusize); + if (stats->ls_percpu[cpuid]) { + rc = 0; + if (unlikely(stats->ls_biggest_alloc_num <= cpuid)) { + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) + spin_lock_irqsave(&stats->ls_lock, flags); + else + spin_lock(&stats->ls_lock); + if (stats->ls_biggest_alloc_num <= cpuid) + stats->ls_biggest_alloc_num = cpuid + 1; + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) { + spin_unlock_irqrestore(&stats->ls_lock, flags); + } else { + spin_unlock(&stats->ls_lock); + } + } + /* initialize the ls_percpu[cpuid] non-zero counter */ + for (i = 0; i < stats->ls_num; ++i) { + cntr = lprocfs_stats_counter_get(stats, cpuid, i); + cntr->lc_min = LC_MIN_INIT; + } + } + return rc; +} + +struct lprocfs_stats *lprocfs_alloc_stats(unsigned int num, + enum lprocfs_stats_flags flags) +{ + struct lprocfs_stats *stats; + unsigned int num_entry; + unsigned int percpusize = 0; + int i; + + if (num == 0) + return NULL; + + if (lprocfs_no_percpu_stats != 0) + flags |= LPROCFS_STATS_FLAG_NOPERCPU; + + if (flags & LPROCFS_STATS_FLAG_NOPERCPU) + num_entry = 1; + else + num_entry = num_possible_cpus(); + + /* alloc percpu pointers for all possible cpu slots */ + LIBCFS_ALLOC(stats, offsetof(typeof(*stats), ls_percpu[num_entry])); + if (!stats) + return NULL; + + stats->ls_num = num; + stats->ls_flags = flags; + stats->ls_init = ktime_get_real(); + spin_lock_init(&stats->ls_lock); + + /* alloc num of counter headers */ + CFS_ALLOC_PTR_ARRAY(stats->ls_cnt_header, stats->ls_num); + if (!stats->ls_cnt_header) + goto fail; + + if ((flags & LPROCFS_STATS_FLAG_NOPERCPU) != 0) { + /* contains only one set counters */ + percpusize = lprocfs_stats_counter_size(stats); + LIBCFS_ALLOC_ATOMIC(stats->ls_percpu[0], percpusize); + if (!stats->ls_percpu[0]) + goto fail; + stats->ls_biggest_alloc_num = 1; + } else if ((flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) { + /* alloc all percpu data, currently only obd_memory use this */ + for (i = 0; i < num_entry; ++i) + if (lprocfs_stats_alloc_one(stats, i) < 0) + goto fail; + } + + return stats; + +fail: + lprocfs_free_stats(&stats); + return NULL; +} +EXPORT_SYMBOL(lprocfs_alloc_stats); + +void lprocfs_free_stats(struct lprocfs_stats **statsh) +{ + struct lprocfs_stats *stats = *statsh; + unsigned int num_entry; + unsigned int percpusize; + unsigned int i; + + if (!stats || stats->ls_num == 0) + return; + *statsh = NULL; + + if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) + num_entry = 1; + else + num_entry = num_possible_cpus(); + + percpusize = lprocfs_stats_counter_size(stats); + for (i = 0; i < num_entry; i++) + if (stats->ls_percpu[i]) + LIBCFS_FREE(stats->ls_percpu[i], percpusize); + if (stats->ls_cnt_header) + CFS_FREE_PTR_ARRAY(stats->ls_cnt_header, stats->ls_num); + LIBCFS_FREE(stats, offsetof(typeof(*stats), ls_percpu[num_entry])); +} +EXPORT_SYMBOL(lprocfs_free_stats); + +u64 lprocfs_stats_collector(struct lprocfs_stats *stats, int idx, + enum lprocfs_fields_flags field) +{ + unsigned long flags = 0; + unsigned int num_cpu; + unsigned int i; + u64 ret = 0; + + 
LASSERT(stats); + + num_cpu = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); + for (i = 0; i < num_cpu; i++) { + struct lprocfs_counter *cntr; + + if (!stats->ls_percpu[i]) + continue; + + cntr = lprocfs_stats_counter_get(stats, i, idx); + ret += lprocfs_read_helper(cntr, &stats->ls_cnt_header[idx], + stats->ls_flags, field); + } + lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); + return ret; +} +EXPORT_SYMBOL(lprocfs_stats_collector); + +void lprocfs_clear_stats(struct lprocfs_stats *stats) +{ + struct lprocfs_counter *percpu_cntr; + int i; + int j; + unsigned int num_entry; + unsigned long flags = 0; + + num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); + + for (i = 0; i < num_entry; i++) { + if (!stats->ls_percpu[i]) + continue; + for (j = 0; j < stats->ls_num; j++) { + percpu_cntr = lprocfs_stats_counter_get(stats, i, j); + percpu_cntr->lc_count = 0; + percpu_cntr->lc_min = LC_MIN_INIT; + percpu_cntr->lc_max = 0; + percpu_cntr->lc_sumsquare = 0; + percpu_cntr->lc_sum = 0; + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) + percpu_cntr->lc_sum_irq = 0; + } + } + stats->ls_init = ktime_get_real(); + + lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); +} +EXPORT_SYMBOL(lprocfs_clear_stats); + +static ssize_t lprocfs_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct lprocfs_stats *stats = seq->private; + + lprocfs_clear_stats(stats); + + return len; +} + +static void *lprocfs_stats_seq_start(struct seq_file *p, loff_t *pos) +{ + struct lprocfs_stats *stats = p->private; + + return (*pos < stats->ls_num) ? pos : NULL; +} + +static void lprocfs_stats_seq_stop(struct seq_file *p, void *v) +{ +} + +static void *lprocfs_stats_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + (*pos)++; + + return lprocfs_stats_seq_start(p, pos); +} + +/** + * print header of stats including snapshot_time, start_time and elapsed_time. + * + * \param seq the file to print content to + * \param now end time to calculate elapsed_time + * \param ts_init start time to calculate elapsed_time + * \param width the width of key to align them well + * \param colon "" or ":" + * \param show_units show units or not + * \param prefix prefix (indent) before printing each line of header + * to align them with other content + */ +void lprocfs_stats_header(struct seq_file *seq, ktime_t now, ktime_t ts_init, + int width, const char *colon, bool show_units, + const char *prefix) +{ + const char *units = show_units ? " secs.nsecs" : ""; + struct timespec64 ts; + const char *field; + + field = (colon && colon[0]) ? "snapshot_time:" : "snapshot_time"; + ts = ktime_to_timespec64(now); + seq_printf(seq, "%s%-*s %llu.%09lu%s\n", prefix, width, field, + (s64)ts.tv_sec, ts.tv_nsec, units); + + field = (colon && colon[0]) ? "start_time:" : "start_time"; + ts = ktime_to_timespec64(ts_init); + seq_printf(seq, "%s%-*s %llu.%09lu%s\n", prefix, width, field, + (s64)ts.tv_sec, ts.tv_nsec, units); + + field = (colon && colon[0]) ? 
"elapsed_time:" : "elapsed_time"; + ts = ktime_to_timespec64(ktime_sub(now, ts_init)); + seq_printf(seq, "%s%-*s %llu.%09lu%s\n", prefix, width, field, + (s64)ts.tv_sec, ts.tv_nsec, units); +} +EXPORT_SYMBOL(lprocfs_stats_header); + +/* seq file export of one lprocfs counter */ +static int lprocfs_stats_seq_show(struct seq_file *p, void *v) +{ + struct lprocfs_stats *stats = p->private; + struct lprocfs_counter_header *hdr; + struct lprocfs_counter ctr; + int idx = *(loff_t *)v; + + if (idx == 0) + lprocfs_stats_header(p, ktime_get_real(), stats->ls_init, 25, + "", true, ""); + + hdr = &stats->ls_cnt_header[idx]; + lprocfs_stats_collect(stats, idx, &ctr); + + if (ctr.lc_count == 0) + return 0; + + seq_printf(p, "%-25s %lld samples [%s]", hdr->lc_name, + ctr.lc_count, hdr->lc_units); + + if ((hdr->lc_config & LPROCFS_CNTR_AVGMINMAX) && ctr.lc_count > 0) { + seq_printf(p, " %lld %lld %lld", + ctr.lc_min, ctr.lc_max, ctr.lc_sum); + if (hdr->lc_config & LPROCFS_CNTR_STDDEV) + seq_printf(p, " %llu", ctr.lc_sumsquare); + } + seq_putc(p, '\n'); + return 0; +} + +static const struct seq_operations lprocfs_stats_seq_sops = { + .start = lprocfs_stats_seq_start, + .stop = lprocfs_stats_seq_stop, + .next = lprocfs_stats_seq_next, + .show = lprocfs_stats_seq_show, +}; + +static int lprocfs_stats_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc; + + rc = seq_open(file, &lprocfs_stats_seq_sops); + if (rc) + return rc; + seq = file->private_data; + seq->private = inode->i_private ? inode->i_private : pde_data(inode); + return 0; +} + +const struct file_operations ldebugfs_stats_seq_fops = { + .owner = THIS_MODULE, + .open = lprocfs_stats_seq_open, + .read = seq_read, + .write = lprocfs_stats_seq_write, + .llseek = seq_lseek, + .release = lprocfs_seq_release, +}; +EXPORT_SYMBOL(ldebugfs_stats_seq_fops); + +static const struct proc_ops lprocfs_stats_seq_fops = { + PROC_OWNER(THIS_MODULE) + .proc_open = lprocfs_stats_seq_open, + .proc_read = seq_read, + .proc_write = lprocfs_stats_seq_write, + .proc_lseek = seq_lseek, + .proc_release = lprocfs_seq_release, +}; + +int lprocfs_register_stats(struct proc_dir_entry *root, const char *name, + struct lprocfs_stats *stats) +{ + struct proc_dir_entry *entry; + LASSERT(root != NULL); + + entry = proc_create_data(name, 0644, root, + &lprocfs_stats_seq_fops, stats); + if (!entry) + return -ENOMEM; + return 0; +} +EXPORT_SYMBOL(lprocfs_register_stats); + +void lprocfs_counter_init(struct lprocfs_stats *stats, int index, + unsigned conf, const char *name, const char *units) +{ + struct lprocfs_counter_header *header; + struct lprocfs_counter *percpu_cntr; + unsigned long flags = 0; + unsigned int i; + unsigned int num_cpu; + + LASSERT(stats != NULL); + + header = &stats->ls_cnt_header[index]; + LASSERTF(header != NULL, "Failed to allocate stats header:[%d]%s/%s\n", + index, name, units); + + header->lc_config = conf; + header->lc_name = name; + header->lc_units = units; + + num_cpu = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); + for (i = 0; i < num_cpu; ++i) { + if (!stats->ls_percpu[i]) + continue; + percpu_cntr = lprocfs_stats_counter_get(stats, i, index); + percpu_cntr->lc_count = 0; + percpu_cntr->lc_min = LC_MIN_INIT; + percpu_cntr->lc_max = 0; + percpu_cntr->lc_sumsquare = 0; + percpu_cntr->lc_sum = 0; + if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + percpu_cntr->lc_sum_irq = 0; + } + lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); +} +EXPORT_SYMBOL(lprocfs_counter_init); + +static const char * 
const mps_stats[] = { + [LPROC_MD_CLOSE] = "close", + [LPROC_MD_CREATE] = "create", + [LPROC_MD_ENQUEUE] = "enqueue", + [LPROC_MD_GETATTR] = "getattr", + [LPROC_MD_INTENT_LOCK] = "intent_lock", + [LPROC_MD_LINK] = "link", + [LPROC_MD_RENAME] = "rename", + [LPROC_MD_SETATTR] = "setattr", + [LPROC_MD_FSYNC] = "fsync", + [LPROC_MD_READ_PAGE] = "read_page", + [LPROC_MD_UNLINK] = "unlink", + [LPROC_MD_SETXATTR] = "setxattr", + [LPROC_MD_GETXATTR] = "getxattr", + [LPROC_MD_INTENT_GETATTR_ASYNC] = "intent_getattr_async", + [LPROC_MD_REVALIDATE_LOCK] = "revalidate_lock", +}; + +int lprocfs_alloc_md_stats(struct obd_device *obd, + unsigned int num_private_stats) +{ + struct lprocfs_stats *stats; + unsigned int num_stats; + int rc, i; + + /* + * TODO Ensure that this function is only used where + * appropriate by adding an assertion to the effect that + * obd->obd_type->typ_md_ops is not NULL. We can't do this now + * because mdt_procfs_init() uses this function to allocate + * the stats backing /proc/fs/lustre/mdt/.../md_stats but the + * mdt layer does not use the md_ops interface. This is + * confusing and a waste of memory. See LU-2484. + */ + LASSERT(obd->obd_proc_entry != NULL); + LASSERT(obd->obd_md_stats == NULL); + + num_stats = ARRAY_SIZE(mps_stats) + num_private_stats; + stats = lprocfs_alloc_stats(num_stats, 0); + if (!stats) + return -ENOMEM; + + for (i = 0; i < ARRAY_SIZE(mps_stats); i++) { + lprocfs_counter_init(stats, i, 0, mps_stats[i], "reqs"); + if (!stats->ls_cnt_header[i].lc_name) { + CERROR("Missing md_stat initializer md_op operation at offset %d. Aborting.\n", + i); + LBUG(); + } + } + + rc = lprocfs_register_stats(obd->obd_proc_entry, "md_stats", stats); + if (rc < 0) { + lprocfs_free_stats(&stats); + } else { + obd->obd_md_stats = stats; + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_alloc_md_stats); + +void lprocfs_free_md_stats(struct obd_device *obd) +{ + struct lprocfs_stats *stats = obd->obd_md_stats; + + if (stats) { + obd->obd_md_stats = NULL; + lprocfs_free_stats(&stats); + } +} +EXPORT_SYMBOL(lprocfs_free_md_stats); + +void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats) +{ + lprocfs_counter_init(ldlm_stats, + LDLM_ENQUEUE - LDLM_FIRST_OPC, + 0, "ldlm_enqueue", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CONVERT - LDLM_FIRST_OPC, + 0, "ldlm_convert", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CANCEL - LDLM_FIRST_OPC, + 0, "ldlm_cancel", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_BL_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_bl_callback", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CP_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_cp_callback", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_GL_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_gl_callback", "reqs"); +} +EXPORT_SYMBOL(lprocfs_init_ldlm_stats); + +__s64 lprocfs_read_helper(struct lprocfs_counter *lc, + struct lprocfs_counter_header *header, + enum lprocfs_stats_flags flags, + enum lprocfs_fields_flags field) +{ + __s64 ret = 0; + + if (!lc || !header) + RETURN(0); + + switch (field) { + case LPROCFS_FIELDS_FLAGS_CONFIG: + ret = header->lc_config; + break; + case LPROCFS_FIELDS_FLAGS_SUM: + ret = lc->lc_sum; + if ((flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + ret += lc->lc_sum_irq; + break; + case LPROCFS_FIELDS_FLAGS_MIN: + ret = lc->lc_min; + break; + case LPROCFS_FIELDS_FLAGS_MAX: + ret = lc->lc_max; + break; + case LPROCFS_FIELDS_FLAGS_AVG: + ret = (lc->lc_max - lc->lc_min) / 2; + break; + case LPROCFS_FIELDS_FLAGS_SUMSQUARE: + ret = lc->lc_sumsquare; + break; + case 
LPROCFS_FIELDS_FLAGS_COUNT:
+		ret = lc->lc_count;
+		break;
+	default:
+		break;
+	}
+	RETURN(ret);
+}
+EXPORT_SYMBOL(lprocfs_read_helper);
+
+/**
+ * string_to_size - convert ASCII string representing a numerical
+ *		    value with optional units to 64-bit binary value
+ *
+ * @size:	The numerical value extracted from @buffer
+ * @buffer:	passed in string to parse
+ * @count:	length of the @buffer
+ *
+ * This function returns a 64-bit binary value if @buffer contains a valid
+ * numerical string. The string is parsed to 3 significant figures after
+ * the decimal point. The string may contain an optional unit suffix at
+ * the end, which can be base 2 or base 10 in value. If no units are given
+ * the string is assumed to be just a numerical value.
+ *
+ * Returns:	@count if the string is successfully parsed,
+ *		-errno on invalid input strings. Error values:
+ *
+ *  - ``-EINVAL``: @buffer is not a proper numerical string
+ *  - ``-EOVERFLOW``: result does not fit into 64 bits.
+ *  - ``-E2BIG``: @buffer is too large (not a valid number)
+ */
+int string_to_size(u64 *size, const char *buffer, size_t count)
+{
+	/* string_get_size() can support values above exabytes (ZiB, YiB)
+	 * due to breaking the return value into a size and bulk size to
+	 * avoid 64 bit overflow. We don't break the size up into block
+	 * size units so we don't support ZiB or YiB.
+	 */
+	static const char *const units_10[] = {
+		"kB", "MB", "GB", "TB", "PB", "EB",
+	};
+	static const char *const units_2[] = {
+		"K", "M", "G", "T", "P", "E",
+	};
+	static const char *const *const units_str[] = {
+		[STRING_UNITS_2] = units_2,
+		[STRING_UNITS_10] = units_10,
+	};
+	static const unsigned int coeff[] = {
+		[STRING_UNITS_10] = 1000,
+		[STRING_UNITS_2] = 1024,
+	};
+	enum string_size_units unit = STRING_UNITS_2;
+	u64 whole, blk_size = 1;
+	char kernbuf[22], *end;
+	size_t len = count;
+	int rc;
+	int i;
+
+	if (count >= sizeof(kernbuf)) {
+		CERROR("count %zd > buffer %zd\n", count, sizeof(kernbuf));
+		return -E2BIG;
+	}
+
+	*size = 0;
+	/* The "iB" suffix is optionally allowed for indicating base-2 numbers.
+	 * If suffix is only "B" and not "iB" then we treat it as base-10.
+	 */
+	end = strstr(buffer, "B");
+	if (end && *(end - 1) != 'i')
+		unit = STRING_UNITS_10;
+
+	i = unit == STRING_UNITS_2 ? ARRAY_SIZE(units_2) - 1 :
+				     ARRAY_SIZE(units_10) - 1;
+	do {
+		end = strnstr(buffer, units_str[unit][i], count);
+		if (end) {
+			for (; i >= 0; i--)
+				blk_size *= coeff[unit];
+			len = end - buffer;
+			break;
+		}
+	} while (i--);
+
+	/* as 'B' is a substring of all units, we need to handle it
+	 * separately.
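+	 * A bare "B" (optionally followed by a newline) therefore means
+	 * plain bytes and leaves blk_size at 1.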
+	 */
+	if (!end) {
+		/* 'B' is only acceptable letter at this point */
+		end = strnchr(buffer, count, 'B');
+		if (end) {
+			len = end - buffer;
+
+			if (count - len > 2 ||
+			    (count - len == 2 && strcmp(end, "B\n") != 0)) {
+				CDEBUG(D_INFO, "unknown suffix '%s'\n", buffer);
+				return -EINVAL;
+			}
+		}
+		/* kstrtoull will error out if it has non digits */
+		goto numbers_only;
+	}
+
+	end = strnchr(buffer, count, '.');
+	if (end) {
+		/* need to limit 3 decimal places */
+		char rem[4] = "000";
+		u64 frac = 0;
+		size_t off;
+
+		len = end - buffer;
+		end++;
+
+		/* limit to 3 decimal points */
+		off = min_t(size_t, 3, strspn(end, "0123456789"));
+		/* need to limit frac_d to a u32 */
+		memcpy(rem, end, off);
+		rc = kstrtoull(rem, 10, &frac);
+		if (rc)
+			return rc;
+
+		if (fls64(frac) + fls64(blk_size) - 1 > 64)
+			return -EOVERFLOW;
+
+		frac *= blk_size;
+		do_div(frac, 1000);
+		*size += frac;
+	}
+numbers_only:
+	snprintf(kernbuf, sizeof(kernbuf), "%.*s", (int)len, buffer);
+	rc = kstrtoull(kernbuf, 10, &whole);
+	if (rc)
+		return rc;
+
+	if (whole != 0 && fls64(whole) + fls64(blk_size) - 1 > 64)
+		return -EOVERFLOW;
+
+	*size += whole * blk_size;
+
+	return count;
+}
+EXPORT_SYMBOL(string_to_size);
+
+/**
+ * sysfs_memparse - parse an ASCII string to 64-bit binary value,
+ *		    with optional units
+ *
+ * @buffer:	kernel pointer to input string
+ * @count:	number of bytes in the input @buffer
+ * @val:	(output) binary value returned to caller
+ * @defunit:	default unit suffix to use if none is provided
+ *
+ * Parses a string into a number. The number stored at @buffer is
+ * potentially suffixed with K, M, G, T, P, E. Besides these, other
+ * valid suffix units are shown in the string_to_size() function.
+ * If the string lacks a suffix then the defunit is used. The defunit
+ * should be given as a binary unit (e.g. MiB) as that is the standard
+ * for tunables in Lustre. If a plain unit letter is given (e.g. 'G'),
+ * it is assumed to be in binary units.
+ *
+ * Returns:	0 on success or -errno on failure.
+ */
+int sysfs_memparse(const char *buffer, size_t count, u64 *val,
+		   const char *defunit)
+{
+	const char *param = buffer;
+	char tmp_buf[23];
+	int rc;
+
+	count = strlen(buffer);
+	while (count > 0 && isspace(buffer[count - 1]))
+		count--;
+
+	if (!count)
+		RETURN(-EINVAL);
+
+	/* If there isn't already a unit on this value, append @defunit.
+	 * Units of 'B' don't affect the value, so don't bother adding.
+	 */
+	if (!isalpha(buffer[count - 1]) && defunit[0] != 'B') {
+		if (count + 3 >= sizeof(tmp_buf)) {
+			CERROR("count %zd > size %zd\n", count,
+			       sizeof(tmp_buf));
+			RETURN(-E2BIG);
+		}
+
+		scnprintf(tmp_buf, sizeof(tmp_buf), "%.*s%s", (int)count,
+			  buffer, defunit);
+		param = tmp_buf;
+		count = strlen(param);
+	}
+
+	rc = string_to_size(val, param, count);
+
+	return rc < 0 ? rc : 0;
+}
+EXPORT_SYMBOL(sysfs_memparse);
+
+char *lprocfs_strnstr(const char *s1, const char *s2, size_t len)
+{
+	size_t l2;
+
+	l2 = strlen(s2);
+	if (!l2)
+		return (char *)s1;
+	while (len >= l2) {
+		len--;
+		if (!memcmp(s1, s2, l2))
+			return (char *)s1;
+		s1++;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(lprocfs_strnstr);
+
+/**
+ * Find the string \a name in the input \a buffer, and return a pointer to the
+ * value immediately following \a name, reducing \a count appropriately.
+ * If \a name is not found the original \a buffer is returned.
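+ *
+ * For example, with buffer "iostats: 42 69" and name "iostats:", the
+ * returned pointer addresses the "42" token and \a count is set to 2.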
+ */ +char *lprocfs_find_named_value(const char *buffer, const char *name, + size_t *count) +{ + char *val; + size_t buflen = *count; + + /* there is no strnstr() in rhel5 and ubuntu kernels */ + val = lprocfs_strnstr(buffer, name, buflen); + if (!val) + return (char *)buffer; + + val += strlen(name); /* skip prefix */ + while (val < buffer + buflen && isspace(*val)) /* skip separator */ + val++; + + *count = 0; + while (val < buffer + buflen && isalnum(*val)) { + ++*count; + ++val; + } + + return val - *count; +} +EXPORT_SYMBOL(lprocfs_find_named_value); + +int lprocfs_seq_create(struct proc_dir_entry *parent, + const char *name, + mode_t mode, + const struct proc_ops *seq_fops, + void *data) +{ + struct proc_dir_entry *entry; + ENTRY; + + /* Disallow secretly (un)writable entries. */ + LASSERT(!seq_fops->proc_write == !(mode & 0222)); + + entry = proc_create_data(name, mode, parent, seq_fops, data); + + if (!entry) + RETURN(-ENOMEM); + + RETURN(0); +} +EXPORT_SYMBOL(lprocfs_seq_create); + +int lprocfs_obd_seq_create(struct obd_device *obd, + const char *name, + mode_t mode, + const struct proc_ops *seq_fops, + void *data) +{ + return lprocfs_seq_create(obd->obd_proc_entry, name, + mode, seq_fops, data); +} +EXPORT_SYMBOL(lprocfs_obd_seq_create); + +void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value) +{ + if (value >= OBD_HIST_MAX) + value = OBD_HIST_MAX - 1; + + spin_lock(&oh->oh_lock); + oh->oh_buckets[value]++; + spin_unlock(&oh->oh_lock); +} +EXPORT_SYMBOL(lprocfs_oh_tally); + +void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value) +{ + unsigned int val = 0; + + if (likely(value != 0)) + val = min(fls(value - 1), OBD_HIST_MAX); + + lprocfs_oh_tally(oh, val); +} +EXPORT_SYMBOL(lprocfs_oh_tally_log2); + +unsigned long lprocfs_oh_sum(struct obd_histogram *oh) +{ + unsigned long ret = 0; + int i; + + for (i = 0; i < OBD_HIST_MAX; i++) + ret += oh->oh_buckets[i]; + return ret; +} +EXPORT_SYMBOL(lprocfs_oh_sum); + +void lprocfs_oh_clear(struct obd_histogram *oh) +{ + spin_lock(&oh->oh_lock); + memset(oh->oh_buckets, 0, sizeof(oh->oh_buckets)); + spin_unlock(&oh->oh_lock); +} +EXPORT_SYMBOL(lprocfs_oh_clear); + +void lprocfs_oh_tally_pcpu(struct obd_hist_pcpu *oh, + unsigned int value) +{ + if (value >= OBD_HIST_MAX) + value = OBD_HIST_MAX - 1; + + percpu_counter_inc(&oh->oh_pc_buckets[value]); +} +EXPORT_SYMBOL(lprocfs_oh_tally_pcpu); + +void lprocfs_oh_tally_log2_pcpu(struct obd_hist_pcpu *oh, + unsigned int value) +{ + unsigned int val = 0; + + if (likely(value != 0)) + val = min(fls(value - 1), OBD_HIST_MAX); + + lprocfs_oh_tally_pcpu(oh, val); +} +EXPORT_SYMBOL(lprocfs_oh_tally_log2_pcpu); + +unsigned long lprocfs_oh_counter_pcpu(struct obd_hist_pcpu *oh, + unsigned int value) +{ + return percpu_counter_sum(&oh->oh_pc_buckets[value]); +} +EXPORT_SYMBOL(lprocfs_oh_counter_pcpu); + +unsigned long lprocfs_oh_sum_pcpu(struct obd_hist_pcpu *oh) +{ + unsigned long ret = 0; + int i; + + for (i = 0; i < OBD_HIST_MAX; i++) + ret += percpu_counter_sum(&oh->oh_pc_buckets[i]); + + return ret; +} +EXPORT_SYMBOL(lprocfs_oh_sum_pcpu); + +int lprocfs_oh_alloc_pcpu(struct obd_hist_pcpu *oh) +{ + int i, rc; + + if (oh->oh_initialized) + return 0; + + for (i = 0; i < OBD_HIST_MAX; i++) { + rc = percpu_counter_init(&oh->oh_pc_buckets[i], 0, GFP_KERNEL); + if (rc) + goto out; + } + + oh->oh_initialized = true; + + return 0; + +out: + for (i--; i >= 0; i--) + percpu_counter_destroy(&oh->oh_pc_buckets[i]); + + return rc; +} +EXPORT_SYMBOL(lprocfs_oh_alloc_pcpu); + +void 
lprocfs_oh_clear_pcpu(struct obd_hist_pcpu *oh) +{ + int i; + + for (i = 0; i < OBD_HIST_MAX; i++) + percpu_counter_set(&oh->oh_pc_buckets[i], 0); +} +EXPORT_SYMBOL(lprocfs_oh_clear_pcpu); + +void lprocfs_oh_release_pcpu(struct obd_hist_pcpu *oh) +{ + int i; + + if (!oh->oh_initialized) + return; + + for (i = 0; i < OBD_HIST_MAX; i++) + percpu_counter_destroy(&oh->oh_pc_buckets[i]); + + oh->oh_initialized = false; +} +EXPORT_SYMBOL(lprocfs_oh_release_pcpu); + +ssize_t lustre_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct lustre_attr *a = container_of(attr, struct lustre_attr, attr); + + return a->show ? a->show(kobj, attr, buf) : 0; +} +EXPORT_SYMBOL_GPL(lustre_attr_show); + +ssize_t lustre_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct lustre_attr *a = container_of(attr, struct lustre_attr, attr); + + return a->store ? a->store(kobj, attr, buf, len) : len; +} +EXPORT_SYMBOL_GPL(lustre_attr_store); + +const struct sysfs_ops lustre_sysfs_ops = { + .show = lustre_attr_show, + .store = lustre_attr_store, +}; +EXPORT_SYMBOL_GPL(lustre_sysfs_ops); + +int lprocfs_obd_max_pages_per_rpc_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct client_obd *cli = &obd->u.cli; + + spin_lock(&cli->cl_loi_list_lock); + seq_printf(m, "%d\n", cli->cl_max_pages_per_rpc); + spin_unlock(&cli->cl_loi_list_lock); + return 0; +} +EXPORT_SYMBOL(lprocfs_obd_max_pages_per_rpc_seq_show); + +ssize_t lprocfs_obd_max_pages_per_rpc_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + struct client_obd *cli = &obd->u.cli; + struct obd_import *imp; + struct obd_connect_data *ocd; + int chunk_mask, rc; + char kernbuf[22]; + u64 val; + + if (count > sizeof(kernbuf) - 1) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + + kernbuf[count] = '\0'; + + rc = sysfs_memparse(kernbuf, count, &val, "B"); + if (rc) + return rc; + + /* if the max_pages is specified in bytes, convert to pages */ + if (val >= ONE_MB_BRW_SIZE) + val >>= PAGE_SHIFT; + + with_imp_locked(obd, imp, rc) { + ocd = &imp->imp_connect_data; + chunk_mask = ~((1 << (cli->cl_chunkbits - PAGE_SHIFT)) - 1); + /* max_pages_per_rpc must be chunk aligned */ + val = (val + ~chunk_mask) & chunk_mask; + if (val == 0 || (ocd->ocd_brw_size != 0 && + val > ocd->ocd_brw_size >> PAGE_SHIFT)) { + rc = -ERANGE; + } else { + spin_lock(&cli->cl_loi_list_lock); + cli->cl_max_pages_per_rpc = val; + client_adjust_max_dirty(cli); + spin_unlock(&cli->cl_loi_list_lock); + } + } + + return rc ?: count; +} +EXPORT_SYMBOL(lprocfs_obd_max_pages_per_rpc_seq_write); + +ssize_t short_io_bytes_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &obd->u.cli; + int rc; + + spin_lock(&cli->cl_loi_list_lock); + rc = sprintf(buf, "%d\n", cli->cl_max_short_io_bytes); + spin_unlock(&cli->cl_loi_list_lock); + return rc; +} +EXPORT_SYMBOL(short_io_bytes_show); + +/* Used to catch people who think they're specifying pages. 
*/ +#define MIN_SHORT_IO_BYTES 64U + +ssize_t short_io_bytes_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &obd->u.cli; + u64 val; + int rc; + + if (strcmp(buffer, "-1") == 0) { + val = OBD_DEF_SHORT_IO_BYTES; + } else { + rc = sysfs_memparse(buffer, count, &val, "B"); + if (rc) + GOTO(out, rc); + } + + if (val && (val < MIN_SHORT_IO_BYTES || val > LNET_MTU)) + GOTO(out, rc = -ERANGE); + + rc = count; + + spin_lock(&cli->cl_loi_list_lock); + cli->cl_max_short_io_bytes = min_t(u64, val, OST_MAX_SHORT_IO_BYTES); + spin_unlock(&cli->cl_loi_list_lock); + +out: + return rc; +} +EXPORT_SYMBOL(short_io_bytes_store); + +int lprocfs_wr_root_squash(const char __user *buffer, unsigned long count, + struct root_squash_info *squash, char *name) +{ + int rc; + char kernbuf[64], *tmp, *errmsg; + unsigned long uid, gid; + ENTRY; + + if (count >= sizeof(kernbuf)) { + errmsg = "string too long"; + GOTO(failed_noprint, rc = -EINVAL); + } + if (copy_from_user(kernbuf, buffer, count)) { + errmsg = "bad address"; + GOTO(failed_noprint, rc = -EFAULT); + } + kernbuf[count] = '\0'; + + /* look for uid gid separator */ + tmp = strchr(kernbuf, ':'); + if (!tmp) { + errmsg = "needs uid:gid format"; + GOTO(failed, rc = -EINVAL); + } + *tmp = '\0'; + tmp++; + + /* parse uid */ + if (kstrtoul(kernbuf, 0, &uid) != 0) { + errmsg = "bad uid"; + GOTO(failed, rc = -EINVAL); + } + + /* parse gid */ + if (kstrtoul(tmp, 0, &gid) != 0) { + errmsg = "bad gid"; + GOTO(failed, rc = -EINVAL); + } + + squash->rsi_uid = uid; + squash->rsi_gid = gid; + + LCONSOLE_INFO("%s: root_squash is set to %u:%u\n", + name, squash->rsi_uid, squash->rsi_gid); + RETURN(count); + +failed: + if (tmp) { + tmp--; + *tmp = ':'; + } + CWARN("%s: failed to set root_squash to \"%s\", %s, rc = %d\n", + name, kernbuf, errmsg, rc); + RETURN(rc); +failed_noprint: + CWARN("%s: failed to set root_squash due to %s, rc = %d\n", + name, errmsg, rc); + RETURN(rc); +} +EXPORT_SYMBOL(lprocfs_wr_root_squash); + + +int lprocfs_wr_nosquash_nids(const char __user *buffer, unsigned long count, + struct root_squash_info *squash, char *name) +{ + int rc; + char *kernbuf = NULL; + char *errmsg; + LIST_HEAD(tmp); + int len = count; + ENTRY; + + if (count > 4096) { + errmsg = "string too long"; + GOTO(failed, rc = -EINVAL); + } + + OBD_ALLOC(kernbuf, count + 1); + if (!kernbuf) { + errmsg = "no memory"; + GOTO(failed, rc = -ENOMEM); + } + if (copy_from_user(kernbuf, buffer, count)) { + errmsg = "bad address"; + GOTO(failed, rc = -EFAULT); + } + kernbuf[count] = '\0'; + + if (count > 0 && kernbuf[count - 1] == '\n') + len = count - 1; + + if ((len == 4 && strncmp(kernbuf, "NONE", len) == 0) || + (len == 5 && strncmp(kernbuf, "clear", len) == 0)) { + /* empty string is special case */ + spin_lock(&squash->rsi_lock); + if (!list_empty(&squash->rsi_nosquash_nids)) + cfs_free_nidlist(&squash->rsi_nosquash_nids); + spin_unlock(&squash->rsi_lock); + LCONSOLE_INFO("%s: nosquash_nids is cleared\n", name); + OBD_FREE(kernbuf, count + 1); + RETURN(count); + } + + if (cfs_parse_nidlist(kernbuf, count, &tmp) <= 0) { + errmsg = "can't parse"; + GOTO(failed, rc = -EINVAL); + } + LCONSOLE_INFO("%s: nosquash_nids set to %s\n", + name, kernbuf); + OBD_FREE(kernbuf, count + 1); + kernbuf = NULL; + + spin_lock(&squash->rsi_lock); + if (!list_empty(&squash->rsi_nosquash_nids)) + cfs_free_nidlist(&squash->rsi_nosquash_nids); + list_splice(&tmp, 
&squash->rsi_nosquash_nids); + spin_unlock(&squash->rsi_lock); + + RETURN(count); + +failed: + if (kernbuf) { + CWARN("%s: failed to set nosquash_nids to \"%s\", %s rc = %d\n", + name, kernbuf, errmsg, rc); + OBD_FREE(kernbuf, count + 1); + } else { + CWARN("%s: failed to set nosquash_nids due to %s rc = %d\n", + name, errmsg, rc); + } + RETURN(rc); +} +EXPORT_SYMBOL(lprocfs_wr_nosquash_nids); + +#endif /* CONFIG_PROC_FS*/ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status_server.c b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status_server.c new file mode 100644 index 0000000000000..a09ae67d89e33 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status_server.c @@ -0,0 +1,1121 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/lprocfs_status_server.c + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include + +#include +#include +#include + +#define MAX_STRING_SIZE 128 + +struct dentry *ldebugfs_add_symlink(const char *name, const char *target, + const char *format, ...) 
+{ + struct dentry *entry = NULL; + struct dentry *parent; + struct qstr dname; + va_list ap; + char *dest; + + if (!target || !format) + return NULL; + + dname.name = target; + dname.len = strlen(dname.name); + dname.hash = ll_full_name_hash(debugfs_lustre_root, + dname.name, dname.len); + parent = d_lookup(debugfs_lustre_root, &dname); + if (!parent) + return NULL; + + OBD_ALLOC_WAIT(dest, MAX_STRING_SIZE + 1); + if (!dest) + goto no_entry; + + va_start(ap, format); + vsnprintf(dest, MAX_STRING_SIZE, format, ap); + va_end(ap); + + entry = debugfs_create_symlink(name, parent, dest); + + OBD_FREE(dest, MAX_STRING_SIZE + 1); +no_entry: + dput(parent); + return entry; +} +EXPORT_SYMBOL(ldebugfs_add_symlink); + +#ifdef CONFIG_PROC_FS + +int lprocfs_evict_client_open(struct inode *inode, struct file *f) +{ + struct obd_device *obd = pde_data(file_inode(f)); + + atomic_inc(&obd->obd_evict_inprogress); + return 0; +} + +int lprocfs_evict_client_release(struct inode *inode, struct file *f) +{ + struct obd_device *obd = pde_data(file_inode(f)); + + atomic_dec(&obd->obd_evict_inprogress); + wake_up(&obd->obd_evict_inprogress_waitq); + + return 0; +} + +#define BUFLEN (UUID_MAX + 5) + +ssize_t +lprocfs_evict_client_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + char *tmpbuf, *kbuf; + + OBD_ALLOC(kbuf, BUFLEN); + if (kbuf == NULL) + return -ENOMEM; + + /* + * OBD_ALLOC() will zero kbuf, but we only copy BUFLEN - 1 + * bytes into kbuf, to ensure that the string is NUL-terminated. + * UUID_MAX should include a trailing NUL already. + */ + if (copy_from_user(kbuf, buffer, + min_t(unsigned long, BUFLEN - 1, count))) { + count = -EFAULT; + goto out; + } + tmpbuf = skip_spaces(kbuf); + tmpbuf = strsep(&tmpbuf, " \t\n\f\v\r"); + class_incref(obd, __func__, current); + + if (strncmp(tmpbuf, "nid:", 4) == 0) + obd_export_evict_by_nid(obd, tmpbuf + 4); + else if (strncmp(tmpbuf, "uuid:", 5) == 0) + obd_export_evict_by_uuid(obd, tmpbuf + 5); + else + obd_export_evict_by_uuid(obd, tmpbuf); + + class_decref(obd, __func__, current); + +out: + OBD_FREE(kbuf, BUFLEN); + return count; +} +EXPORT_SYMBOL(lprocfs_evict_client_seq_write); + +#undef BUFLEN + +ssize_t num_exports_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%u\n", obd->obd_num_exports); +} +EXPORT_SYMBOL(num_exports_show); + +ssize_t grant_check_threshold_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%d\n", + obd->obd_grant_check_threshold); +} +EXPORT_SYMBOL(grant_check_threshold_show); + +ssize_t grant_check_threshold_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + int val; + int rc; + + rc = kstrtoint(buffer, 10, &val); + if (rc) + return rc; + + if (val < 0) + return -EINVAL; + obd->obd_grant_check_threshold = val; + return count; +} +EXPORT_SYMBOL(grant_check_threshold_store); + +static int obd_export_flags2str(struct obd_export *exp, struct seq_file *m) +{ + bool first = true; + + flag2str(exp, failed); + flag2str(exp, in_recovery); + flag2str(exp, disconnected); + flag2str(exp, connecting); + flag2str(exp, no_recovery); 
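+	/*
+	 * flag2str() is assumed to print the flag name whenever the
+	 * matching exp_<flag> bit is set, consulting 'first' above to
+	 * decide whether a ", " separator is needed, so the caller's
+	 * output reads like "export_flags: [ in_recovery, connecting ]".
+	 */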
+ + return 0; +} + +static int +lprocfs_exp_print_export_seq(struct obd_export *exp, void *cb_data) +{ + struct seq_file *m = cb_data; + struct obd_device *obd; + struct obd_connect_data *ocd; + + LASSERT(exp != NULL); + if (exp->exp_nid_stats == NULL) + goto out; + obd = exp->exp_obd; + ocd = &exp->exp_connect_data; + + seq_printf(m, "%s:\n" + " name: %s\n" + " client: %s\n" + " connect_flags: [ ", + obd_uuid2str(&exp->exp_client_uuid), + obd->obd_name, + obd_export_nid2str(exp)); + obd_connect_seq_flags2str(m, ocd->ocd_connect_flags, + ocd->ocd_connect_flags2, ", "); + seq_printf(m, " ]\n"); + obd_connect_data_seqprint(m, ocd); + seq_printf(m, " export_flags: [ "); + obd_export_flags2str(exp, m); + seq_printf(m, " ]\n"); + + if (obd->obd_type && + strcmp(obd->obd_type->typ_name, "obdfilter") == 0) { + struct filter_export_data *fed = &exp->exp_filter_data; + + seq_printf(m, " grant:\n"); + seq_printf(m, " granted: %ld\n", + fed->fed_ted.ted_grant); + seq_printf(m, " dirty: %ld\n", + fed->fed_ted.ted_dirty); + seq_printf(m, " pending: %ld\n", + fed->fed_ted.ted_pending); + } + +out: + return 0; +} + +/** + * RPC connections are composed of an import and an export. Using the + * lctl utility we can extract important information about the state. + * The lprocfs_exp_export_seq_show routine displays the state information + * for the export. + * + * \param[in] m seq file + * \param[in] data unused + * + * \retval 0 on success + * + * The format of the export state information is like: + * a793e354-49c0-aa11-8c4f-a4f2b1a1a92b: + * name: MGS + * client: 10.211.55.10@tcp + * connect_flags: [ version, barrier, adaptive_timeouts, ... ] + * connect_data: + * flags: 0x2000011005002020 + * instance: 0 + * target_version: 2.10.51.0 + * export_flags: [ ... ] + * + */ +static int lprocfs_exp_export_seq_show(struct seq_file *m, void *data) +{ + struct nid_stat *stats = m->private; + + return obd_nid_export_for_each(stats->nid_obd, &stats->nid, + lprocfs_exp_print_export_seq, m); +} +LPROC_SEQ_FOPS_RO(lprocfs_exp_export); + +static void lprocfs_free_client_stats(struct nid_stat *client_stat) +{ + CDEBUG(D_CONFIG, "stat %p - data %p/%p\n", client_stat, + client_stat->nid_proc, client_stat->nid_stats); + + LASSERTF(atomic_read(&client_stat->nid_exp_ref_count) == 0, + "nid %s:count %d\n", libcfs_nidstr(&client_stat->nid), + atomic_read(&client_stat->nid_exp_ref_count)); + + if (client_stat->nid_proc) + lprocfs_remove(&client_stat->nid_proc); + + if (client_stat->nid_stats) + lprocfs_free_stats(&client_stat->nid_stats); + + if (client_stat->nid_ldlm_stats) + lprocfs_free_stats(&client_stat->nid_ldlm_stats); + + OBD_FREE_PTR(client_stat); +} + +void lprocfs_free_per_client_stats(struct obd_device *obd) +{ + struct cfs_hash *hash = obd->obd_nid_stats_hash; + struct nid_stat *stat; + ENTRY; + + /* we need extra list - because hash_exit called to early */ + /* not need locking because all clients is died */ + while (!list_empty(&obd->obd_nid_stats)) { + stat = list_entry(obd->obd_nid_stats.next, + struct nid_stat, nid_list); + list_del_init(&stat->nid_list); + cfs_hash_del(hash, &stat->nid, &stat->nid_hash); + lprocfs_free_client_stats(stat); + } + EXIT; +} +EXPORT_SYMBOL(lprocfs_free_per_client_stats); + +static int +lprocfs_exp_print_nodemap_seq(struct obd_export *exp, void *cb_data) +{ + struct lu_nodemap *nodemap = exp->exp_target_data.ted_nodemap; + struct seq_file *m = cb_data; + + if (nodemap) + seq_printf(m, "%s\n", nodemap->nm_name); + return 0; +} + +static int +lprocfs_exp_nodemap_seq_show(struct 
seq_file *m, void *data) +{ + struct nid_stat *stats = m->private; + + return obd_nid_export_for_each(stats->nid_obd, &stats->nid, + lprocfs_exp_print_nodemap_seq, m); +} +LPROC_SEQ_FOPS_RO(lprocfs_exp_nodemap); + +static int +lprocfs_exp_print_uuid_seq(struct obd_export *exp, void *cb_data) +{ + struct seq_file *m = cb_data; + + if (exp->exp_nid_stats) + seq_printf(m, "%s\n", obd_uuid2str(&exp->exp_client_uuid)); + return 0; +} + +static int lprocfs_exp_uuid_seq_show(struct seq_file *m, void *data) +{ + struct nid_stat *stats = m->private; + + return obd_nid_export_for_each(stats->nid_obd, &stats->nid, + lprocfs_exp_print_uuid_seq, m); +} +LPROC_SEQ_FOPS_RO(lprocfs_exp_uuid); + +#define HASH_NAME_LEN 16 + +static void ldebugfs_rhash_seq_show(const char *name, struct rhashtable *ht, + struct seq_file *m) +{ + unsigned int max_size = ht->p.max_size ? ht->p.max_size : UINT_MAX; + struct bucket_table *tbl; + int dist[8] = { 0, }; + int maxdep = 0; + int i; + + rcu_read_lock(); + tbl = rht_dereference(ht->tbl, ht); + for (i = 0; i < tbl->size; i++) { + struct rhash_head *pos; + int count = 0; + + rht_for_each(pos, tbl, i) + count++; + + if (count) + maxdep = max(maxdep, count); + + dist[min(fls(count), 7)]++; + } + + seq_printf(m, "%-*s %5d %5d %10u %d.%03d 0.300 0.750 0x%03x %7d %7d %7d ", + HASH_NAME_LEN, name, tbl->size, ht->p.min_size, max_size, + atomic_read(&ht->nelems) / tbl->size, + atomic_read(&ht->nelems) * 1000 / tbl->size, + ht->p.automatic_shrinking, 0, + atomic_read(&ht->nelems), maxdep); + rcu_read_unlock(); + + for (i = 0; i < 8; i++) + seq_printf(m, "%d%c", dist[i], (i == 7) ? '\n' : '/'); +} + +static int +lprocfs_exp_print_hash_seq(struct obd_export *exp, void *cb_data) + +{ + struct obd_device *obd = exp->exp_obd; + struct seq_file *m = cb_data; + + if (exp->exp_lock_hash != NULL) { + seq_printf(m, "%-*s cur min max theta t-min t-max flags rehash count distribution\n", + HASH_NAME_LEN, "name"); + ldebugfs_rhash_seq_show("NID_HASH", &obd->obd_nid_hash.ht, m); + } + return 0; +} + +static int lprocfs_exp_hash_seq_show(struct seq_file *m, void *data) +{ + struct nid_stat *stats = m->private; + + return obd_nid_export_for_each(stats->nid_obd, &stats->nid, + lprocfs_exp_print_hash_seq, m); +} +LPROC_SEQ_FOPS_RO(lprocfs_exp_hash); + +int lprocfs_exp_print_replydata_seq(struct obd_export *exp, void *cb_data) + +{ + struct seq_file *m = cb_data; + struct tg_export_data *ted = &exp->exp_target_data; + + seq_printf(m, "reply_cnt: %d\n" + "reply_max: %d\n" + "reply_released_by_xid: %d\n" + "reply_released_by_tag: %d\n\n", + ted->ted_reply_cnt, + ted->ted_reply_max, + ted->ted_release_xid, + ted->ted_release_tag); + return 0; +} + +int lprocfs_exp_replydata_seq_show(struct seq_file *m, void *data) +{ + struct nid_stat *stats = m->private; + + return obd_nid_export_for_each(stats->nid_obd, &stats->nid, + lprocfs_exp_print_replydata_seq, m); +} +LPROC_SEQ_FOPS_RO(lprocfs_exp_replydata); + +int lprocfs_exp_print_fmd_count_seq(struct obd_export *exp, void *cb_data) +{ + struct seq_file *m = cb_data; + struct tg_export_data *ted = &exp->exp_target_data; + + seq_printf(m, "%d\n", ted->ted_fmd_count); + + return 0; +} + +int lprocfs_exp_fmd_count_seq_show(struct seq_file *m, void *data) +{ + struct nid_stat *stats = m->private; + + return obd_nid_export_for_each(stats->nid_obd, &stats->nid, + lprocfs_exp_print_fmd_count_seq, m); +} +LPROC_SEQ_FOPS_RO(lprocfs_exp_fmd_count); + +int lprocfs_nid_stats_clear_seq_show(struct seq_file *m, void *data) +{ + seq_puts(m, "Write into this file to clear 
all nid stats and stale nid entries\n"); + return 0; +} +EXPORT_SYMBOL(lprocfs_nid_stats_clear_seq_show); + +static int lprocfs_nid_stats_clear_write_cb(void *obj, void *data) +{ + struct nid_stat *stat = obj; + ENTRY; + + CDEBUG(D_INFO, "refcnt %d\n", atomic_read(&stat->nid_exp_ref_count)); + if (atomic_read(&stat->nid_exp_ref_count) == 1) { + /* object has only hash references. */ + spin_lock(&stat->nid_obd->obd_nid_lock); + list_move(&stat->nid_list, data); + spin_unlock(&stat->nid_obd->obd_nid_lock); + RETURN(1); + } + /* we has reference to object - only clear data*/ + if (stat->nid_stats) + lprocfs_clear_stats(stat->nid_stats); + + RETURN(0); +} + +ssize_t +lprocfs_nid_stats_clear_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + struct nid_stat *client_stat; + LIST_HEAD(free_list); + + cfs_hash_cond_del(obd->obd_nid_stats_hash, + lprocfs_nid_stats_clear_write_cb, &free_list); + + while (!list_empty(&free_list)) { + client_stat = list_entry(free_list.next, struct nid_stat, + nid_list); + list_del_init(&client_stat->nid_list); + lprocfs_free_client_stats(client_stat); + } + return count; +} +EXPORT_SYMBOL(lprocfs_nid_stats_clear_seq_write); + +int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid) +{ + struct nid_stat *new_stat, *old_stat; + struct obd_device *obd = NULL; + struct proc_dir_entry *entry; + char nidstr[LNET_NIDSTR_SIZE]; + int rc = 0; + ENTRY; + + if (!exp || !exp->exp_obd || !exp->exp_obd->obd_proc_exports_entry || + !exp->exp_obd->obd_nid_stats_hash) + RETURN(-EINVAL); + + /* not test against zero because eric say: + * You may only test nid against another nid, or LNET_NID_ANY. + * Anything else is nonsense.*/ + if (nid == NULL || *nid == LNET_NID_ANY) + RETURN(-EALREADY); + + libcfs_nid2str_r(*nid, nidstr, sizeof(nidstr)); + + spin_lock(&exp->exp_lock); + if (exp->exp_nid_stats != NULL) { + spin_unlock(&exp->exp_lock); + RETURN(-EALREADY); + } + spin_unlock(&exp->exp_lock); + + obd = exp->exp_obd; + + CDEBUG(D_CONFIG, "using hash %p\n", obd->obd_nid_stats_hash); + + OBD_ALLOC_PTR(new_stat); + if (new_stat == NULL) + RETURN(-ENOMEM); + + lnet_nid4_to_nid(*nid, &new_stat->nid); + new_stat->nid_obd = exp->exp_obd; + /* we need set default refcount to 1 to balance obd_disconnect */ + atomic_set(&new_stat->nid_exp_ref_count, 1); + + old_stat = cfs_hash_findadd_unique(obd->obd_nid_stats_hash, + &new_stat->nid, + &new_stat->nid_hash); + CDEBUG(D_INFO, "Found stats %p for nid %s - ref %d\n", + old_stat, nidstr, atomic_read(&old_stat->nid_exp_ref_count)); + + /* Return -EALREADY here so that we know that the /proc + * entry already has been created */ + if (old_stat != new_stat) { + spin_lock(&exp->exp_lock); + if (exp->exp_nid_stats) { + LASSERT(exp->exp_nid_stats == old_stat); + nidstat_putref(exp->exp_nid_stats); + } + exp->exp_nid_stats = old_stat; + spin_unlock(&exp->exp_lock); + GOTO(destroy_new, rc = -EALREADY); + } + /* not found - create */ + new_stat->nid_proc = lprocfs_register(nidstr, + obd->obd_proc_exports_entry, + NULL, NULL); + + if (IS_ERR(new_stat->nid_proc)) { + rc = PTR_ERR(new_stat->nid_proc); + new_stat->nid_proc = NULL; + CERROR("%s: cannot create proc entry for export %s: rc = %d\n", + obd->obd_name, nidstr, rc); + GOTO(destroy_new_ns, rc); + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "nodemap", new_stat, + &lprocfs_exp_nodemap_fops); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CWARN("%s: error adding the 
nodemap file: rc = %d\n", + obd->obd_name, rc); + GOTO(destroy_new_ns, rc); + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "uuid", new_stat, + &lprocfs_exp_uuid_fops); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CWARN("%s: error adding the NID stats file: rc = %d\n", + obd->obd_name, rc); + GOTO(destroy_new_ns, rc); + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "hash", new_stat, + &lprocfs_exp_hash_fops); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CWARN("%s: error adding the hash file: rc = %d\n", + obd->obd_name, rc); + GOTO(destroy_new_ns, rc); + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "export", + new_stat, &lprocfs_exp_export_fops); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CWARN("%s: error adding the export file: rc = %d\n", + obd->obd_name, rc); + GOTO(destroy_new_ns, rc); + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "reply_data", new_stat, + &lprocfs_exp_replydata_fops); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CWARN("%s: error adding the reply_data file: rc = %d\n", + obd->obd_name, rc); + GOTO(destroy_new_ns, rc); + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "fmd_count", new_stat, + &lprocfs_exp_fmd_count_fops); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CWARN("%s: error adding the fmd_count file: rc = %d\n", + obd->obd_name, rc); + GOTO(destroy_new_ns, rc); + } + + spin_lock(&exp->exp_lock); + exp->exp_nid_stats = new_stat; + spin_unlock(&exp->exp_lock); + + /* protect competitive add to list, not need locking on destroy */ + spin_lock(&obd->obd_nid_lock); + list_add(&new_stat->nid_list, &obd->obd_nid_stats); + spin_unlock(&obd->obd_nid_lock); + + RETURN(0); + +destroy_new_ns: + if (new_stat->nid_proc != NULL) + lprocfs_remove(&new_stat->nid_proc); + cfs_hash_del(obd->obd_nid_stats_hash, &new_stat->nid, + &new_stat->nid_hash); + +destroy_new: + nidstat_putref(new_stat); + OBD_FREE_PTR(new_stat); + RETURN(rc); +} +EXPORT_SYMBOL(lprocfs_exp_setup); + +int lprocfs_exp_cleanup(struct obd_export *exp) +{ + struct nid_stat *stat = exp->exp_nid_stats; + + if (!stat || !exp->exp_obd) + RETURN(0); + + nidstat_putref(exp->exp_nid_stats); + exp->exp_nid_stats = NULL; + + return 0; +} + +int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned int num_stats) +{ + struct lprocfs_stats *stats; + int rc; + + LASSERT(obd->obd_stats == NULL); + LASSERT(obd->obd_proc_entry != NULL); + + stats = lprocfs_alloc_stats(num_stats, 0); + if (stats == NULL) + return -ENOMEM; + + rc = lprocfs_register_stats(obd->obd_proc_entry, "stats", stats); + if (rc < 0) + lprocfs_free_stats(&stats); + else + obd->obd_stats = stats; + + return rc; +} +EXPORT_SYMBOL(lprocfs_alloc_obd_stats); + +void lprocfs_free_obd_stats(struct obd_device *obd) +{ + if (obd->obd_stats) + lprocfs_free_stats(&obd->obd_stats); +} +EXPORT_SYMBOL(lprocfs_free_obd_stats); + +static void display_brw_stats(struct seq_file *seq, const char *name, + const char *units, struct obd_hist_pcpu *read, + struct obd_hist_pcpu *write, bool scale) +{ + unsigned long read_tot, write_tot, r, w, read_cum = 0, write_cum = 0; + unsigned int i; + + seq_printf(seq, "\n%26s read | write\n", " "); + seq_printf(seq, "%-22s %-5s %% cum %% | %-11s %% cum %%\n", + name, units, units); + + read_tot = lprocfs_oh_sum_pcpu(read); + write_tot = lprocfs_oh_sum_pcpu(write); + + if (!read_tot && !write_tot) + return; + + for (i = 0; i < OBD_HIST_MAX; i++) { + r = lprocfs_oh_counter_pcpu(read, i); + w = lprocfs_oh_counter_pcpu(write, i); + read_cum += r; + write_cum += w; + if (read_cum == 0 && 
write_cum == 0) + continue; + + if (!scale) + seq_printf(seq, "%u", i); + else if (i < 10) + seq_printf(seq, "%lu", BIT(i)); + else if (i < 20) + seq_printf(seq, "%luK", BIT(i - 10)); + else + seq_printf(seq, "%luM", BIT(i - 20)); + + seq_printf(seq, ":\t\t%10lu %3u %3u | %4lu %3u %3u\n", + r, pct(r, read_tot), pct(read_cum, read_tot), + w, pct(w, write_tot), pct(write_cum, write_tot)); + + if (read_cum == read_tot && write_cum == write_tot) + break; + } +} + +static const struct brw_stats_props brw_props[] = { + { .bsp_name = "pages per bulk r/w", + .bsp_units = "rpcs", + .bsp_scale = true }, + { .bsp_name = "discontiguous pages", + .bsp_units = "rpcs", + .bsp_scale = false }, + { .bsp_name = "discontiguous blocks", + .bsp_units = "rpcs", + .bsp_scale = false }, + { .bsp_name = "disk fragmented I/Os", + .bsp_units = "ios", + .bsp_scale = false }, + { .bsp_name = "disk I/Os in flight", + .bsp_units = "ios", + .bsp_scale = false }, + { .bsp_name = "I/O time (1/1000s)", + .bsp_units = "ios", + .bsp_scale = true }, + { .bsp_name = "disk I/O size", + .bsp_units = "ios", + .bsp_scale = true }, +}; + +static int brw_stats_seq_show(struct seq_file *seq, void *v) +{ + struct brw_stats *brw_stats = seq->private; + int i; + + /* this sampling races with updates */ + lprocfs_stats_header(seq, ktime_get_real(), brw_stats->bs_init, 25, + ":", true, ""); + + for (i = 0; i < ARRAY_SIZE(brw_stats->bs_props); i++) { + if (!brw_stats->bs_props[i].bsp_name) + continue; + + display_brw_stats(seq, brw_stats->bs_props[i].bsp_name, + brw_stats->bs_props[i].bsp_units, + &brw_stats->bs_hist[i * 2], + &brw_stats->bs_hist[i * 2 + 1], + brw_stats->bs_props[i].bsp_scale); + } + + return 0; +} + +static ssize_t brw_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct brw_stats *brw_stats = seq->private; + int i; + + for (i = 0; i < BRW_RW_STATS_NUM; i++) + lprocfs_oh_clear_pcpu(&brw_stats->bs_hist[i]); + brw_stats->bs_init = ktime_get_real(); + + return len; +} + +LDEBUGFS_SEQ_FOPS(brw_stats); + +int lprocfs_init_brw_stats(struct brw_stats *brw_stats) +{ + int i, result; + + for (i = 0; i < BRW_RW_STATS_NUM; i++) { + result = lprocfs_oh_alloc_pcpu(&brw_stats->bs_hist[i]); + if (result) + break; + } + + return result; +} +EXPORT_SYMBOL(lprocfs_init_brw_stats); + +void lprocfs_fini_brw_stats(struct brw_stats *brw_stats) +{ + int i; + + for (i = 0; i < BRW_RW_STATS_NUM; i++) + lprocfs_oh_release_pcpu(&brw_stats->bs_hist[i]); +} +EXPORT_SYMBOL(lprocfs_fini_brw_stats); + +void ldebugfs_register_osd_stats(struct dentry *parent, + struct brw_stats *brw_stats, + struct lprocfs_stats *stats) +{ + int i; + + LASSERT(brw_stats); + brw_stats->bs_init = ktime_get_real(); + for (i = 0; i < BRW_RW_STATS_NUM; i++) { + struct brw_stats_props *props = brw_stats->bs_props; + + if (i % 2) { + props[i / 2].bsp_name = brw_props[i / 2].bsp_name; + props[i / 2].bsp_units = brw_props[i / 2].bsp_units; + props[i / 2].bsp_scale = brw_props[i / 2].bsp_scale; + } + } + + if (!parent) + return; + + debugfs_create_file("brw_stats", 0644, parent, brw_stats, + &brw_stats_fops); + + if (stats) + debugfs_create_file("stats", 0644, parent, stats, + &ldebugfs_stats_seq_fops); +} +EXPORT_SYMBOL(ldebugfs_register_osd_stats); + +int lprocfs_hash_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = m->private; + + if (obd == NULL) + return 0; + + /* header for rhashtable state */ + seq_printf(m, "%-*s cur min max theta t-min t-max flags rehash count 
maxdep distribution\n", + HASH_NAME_LEN, "name"); + ldebugfs_rhash_seq_show("UUID_HASH", &obd->obd_uuid_hash, m); + ldebugfs_rhash_seq_show("NID_HASH", &obd->obd_nid_hash.ht, m); + + cfs_hash_debug_header(m); + cfs_hash_debug_str(obd->obd_nid_stats_hash, m); + return 0; +} +EXPORT_SYMBOL(lprocfs_hash_seq_show); + +int lprocfs_recovery_status_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = m->private; + struct target_distribute_txn_data *tdtd; + + LASSERT(obd != NULL); + + seq_printf(m, "status: "); + if (atomic_read(&obd->obd_max_recoverable_clients) == 0) { + seq_printf(m, "INACTIVE\n"); + goto out; + } + + /* There is gap between client data read from storage and setting + * obd_recovering so check obd_recovery_end as well to make sure + * recovery is really finished + */ + if (obd->obd_recovery_end > 0 && !obd->obd_recovering) { + seq_printf(m, "COMPLETE\n"); + seq_printf(m, "recovery_start: %lld\n", + (s64)ktime_get_real_seconds() - + (ktime_get_seconds() - obd->obd_recovery_start)); + seq_printf(m, "recovery_duration: %lld\n", + obd->obd_recovery_end ? + obd->obd_recovery_end - obd->obd_recovery_start : + ktime_get_seconds() - obd->obd_recovery_start); + /* Number of clients that have completed recovery */ + seq_printf(m, "completed_clients: %d/%d\n", + atomic_read(&obd->obd_max_recoverable_clients) - + obd->obd_stale_clients, + atomic_read(&obd->obd_max_recoverable_clients)); + seq_printf(m, "replayed_requests: %d\n", + obd->obd_replayed_requests); + seq_printf(m, "last_transno: %lld\n", + obd->obd_next_recovery_transno - 1); + seq_printf(m, "VBR: %s\n", obd->obd_version_recov ? + "ENABLED" : "DISABLED"); + seq_printf(m, "IR: %s\n", obd->obd_no_ir ? + "DISABLED" : "ENABLED"); + goto out; + } + + tdtd = obd->u.obt.obt_lut->lut_tdtd; + if (tdtd && tdtd->tdtd_show_update_logs_retrievers) { + char *buf; + int size = 0; + int count = 0; + + buf = tdtd->tdtd_show_update_logs_retrievers( + tdtd->tdtd_show_retrievers_cbdata, + &size, &count); + if (count > 0) { + seq_printf(m, "WAITING\n"); + seq_printf(m, "non-ready MDTs: %s\n", + buf ? buf : "unknown (not enough RAM)"); + seq_printf(m, "recovery_start: %lld\n", + (s64)ktime_get_real_seconds() - + (ktime_get_seconds() - + obd->obd_recovery_start)); + seq_printf(m, "time_waited: %lld\n", + (s64)(ktime_get_seconds() - + obd->obd_recovery_start)); + } + + if (buf != NULL) + OBD_FREE(buf, size); + + if (likely(count > 0)) + goto out; + } + + /* recovery won't start until the clients connect */ + if (obd->obd_recovery_start == 0) { + seq_printf(m, "WAITING_FOR_CLIENTS\n"); + goto out; + } + + seq_printf(m, "RECOVERING\n"); + seq_printf(m, "recovery_start: %lld\n", (s64)ktime_get_real_seconds() - + (ktime_get_seconds() - obd->obd_recovery_start)); + seq_printf(m, "time_remaining: %lld\n", + ktime_get_seconds() >= + obd->obd_recovery_start + + obd->obd_recovery_timeout ? 
0 : + (s64)(obd->obd_recovery_start + + obd->obd_recovery_timeout - + ktime_get_seconds())); + seq_printf(m, "connected_clients: %d/%d\n", + atomic_read(&obd->obd_connected_clients), + atomic_read(&obd->obd_max_recoverable_clients)); + /* Number of clients that have completed recovery */ + seq_printf(m, "req_replay_clients: %d\n", + atomic_read(&obd->obd_req_replay_clients)); + seq_printf(m, "lock_repay_clients: %d\n", + atomic_read(&obd->obd_lock_replay_clients)); + seq_printf(m, "completed_clients: %d\n", + atomic_read(&obd->obd_connected_clients) - + atomic_read(&obd->obd_lock_replay_clients)); + seq_printf(m, "evicted_clients: %d\n", obd->obd_stale_clients); + seq_printf(m, "replayed_requests: %d\n", obd->obd_replayed_requests); + seq_printf(m, "queued_requests: %d\n", + obd->obd_requests_queued_for_recovery); + seq_printf(m, "next_transno: %lld\n", + obd->obd_next_recovery_transno); +out: + return 0; +} +EXPORT_SYMBOL(lprocfs_recovery_status_seq_show); + +ssize_t ir_factor_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%d\n", obd->obd_recovery_ir_factor); +} +EXPORT_SYMBOL(ir_factor_show); + +ssize_t ir_factor_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + int val; + int rc; + + rc = kstrtoint(buffer, 10, &val); + if (rc) + return rc; + + if (val < OBD_IR_FACTOR_MIN || val > OBD_IR_FACTOR_MAX) + return -EINVAL; + + obd->obd_recovery_ir_factor = val; + return count; +} +EXPORT_SYMBOL(ir_factor_store); + +int lprocfs_checksum_dump_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = m->private; + + LASSERT(obd != NULL); + seq_printf(m, "%d\n", obd->obd_checksum_dump); + return 0; +} +EXPORT_SYMBOL(lprocfs_checksum_dump_seq_show); + +ssize_t +lprocfs_checksum_dump_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + bool val; + int rc; + + LASSERT(obd != NULL); + rc = kstrtobool_from_user(buffer, count, &val); + if (rc) + return rc; + + obd->obd_checksum_dump = val; + return count; +} +EXPORT_SYMBOL(lprocfs_checksum_dump_seq_write); + +ssize_t recovery_time_soft_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%d\n", obd->obd_recovery_timeout); +} +EXPORT_SYMBOL(recovery_time_soft_show); + +ssize_t recovery_time_soft_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 0, &val); + if (rc) + return rc; + + obd->obd_recovery_timeout = val; + return count; +} +EXPORT_SYMBOL(recovery_time_soft_store); + +ssize_t recovery_time_hard_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%d\n", obd->obd_recovery_time_hard); +} +EXPORT_SYMBOL(recovery_time_hard_show); + +ssize_t recovery_time_hard_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, 
+ obd_kset.kobj); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 0, &val); + if (rc) + return rc; + + obd->obd_recovery_time_hard = val; + return count; +} +EXPORT_SYMBOL(recovery_time_hard_store); + +ssize_t instance_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_device_target *target = &obd->u.obt; + + LASSERT(target->obt_magic == OBT_MAGIC); + return scnprintf(buf, PAGE_SIZE, "%u\n", obd->u.obt.obt_instance); +} +EXPORT_SYMBOL(instance_show); + +#endif /* CONFIG_PROC_FS*/ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lu_object.c b/drivers/staging/lustrefsx/lustre/obdclass/lu_object.c new file mode 100644 index 0000000000000..c581211098acf --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lu_object.c @@ -0,0 +1,2597 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/lu_object.c + * + * Lustre Object. + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct lu_site_bkt_data { + /** + * LRU list, updated on each access to object. Protected by + * lsb_waitq.lock. + * + * "Cold" end of LRU is lu_site::ls_lru.next. Accessed object are + * moved to the lu_site::ls_lru.prev + */ + struct list_head lsb_lru; + /** + * Wait-queue signaled when an object in this site is ultimately + * destroyed (lu_object_free()) or initialized (lu_object_start()). + * It is used by lu_object_find() to wait before re-trying when + * object in the process of destruction is found in the hash table; + * or wait object to be initialized by the allocator. + * + * \see htable_lookup(). 
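+	 *
+	 * The waiter side is a sketch of what lu_object_find_at()
+	 * actually does:
+	 *
+	 *	wait_event_idle(bkt->lsb_waitq,
+	 *			lu_object_is_inited(o->lo_header) ||
+	 *			lu_object_is_dying(o->lo_header));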
+ */ + wait_queue_head_t lsb_waitq; +}; + +enum { + LU_CACHE_PERCENT_MAX = 50, + LU_CACHE_PERCENT_DEFAULT = 20 +}; + +#define LU_CACHE_NR_MAX_ADJUST 512 +#define LU_CACHE_NR_UNLIMITED -1 +#define LU_CACHE_NR_DEFAULT LU_CACHE_NR_UNLIMITED +/** This is set to roughly (20 * OSS_NTHRS_MAX) to prevent thrashing */ +#define LU_CACHE_NR_ZFS_LIMIT 10240 + +#define LU_CACHE_NR_MIN 4096 +#define LU_CACHE_NR_MAX 0x80000000UL + +/** + * Max 256 buckets, we don't want too many buckets because: + * - consume too much memory (currently max 16K) + * - avoid unbalanced LRU list + * With few cpus there is little gain from extra buckets, so + * we treat this as a maximum in lu_site_init(). + */ +#define LU_SITE_BKT_BITS 8 + +static unsigned int lu_cache_percent = LU_CACHE_PERCENT_DEFAULT; +module_param(lu_cache_percent, int, 0644); +MODULE_PARM_DESC(lu_cache_percent, "Percentage of memory to be used as lu_object cache"); + +static long lu_cache_nr = LU_CACHE_NR_DEFAULT; +module_param(lu_cache_nr, long, 0644); +MODULE_PARM_DESC(lu_cache_nr, "Maximum number of objects in lu_object cache"); + +static void lu_object_free(const struct lu_env *env, struct lu_object *o); +static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx); + +static u32 lu_fid_hash(const void *data, u32 len, u32 seed) +{ + const struct lu_fid *fid = data; + + seed = cfs_hash_32(seed ^ fid->f_oid, 32); + seed ^= cfs_hash_64(fid->f_seq, 32); + return seed; +} + +static const struct rhashtable_params obj_hash_params = { + .key_len = sizeof(struct lu_fid), + .key_offset = offsetof(struct lu_object_header, loh_fid), + .head_offset = offsetof(struct lu_object_header, loh_hash), + .hashfn = lu_fid_hash, + .automatic_shrinking = true, +}; + +static inline int lu_bkt_hash(struct lu_site *s, const struct lu_fid *fid) +{ + return lu_fid_hash(fid, sizeof(*fid), s->ls_bkt_seed) & + (s->ls_bkt_cnt - 1); +} + +wait_queue_head_t * +lu_site_wq_from_fid(struct lu_site *site, struct lu_fid *fid) +{ + struct lu_site_bkt_data *bkt; + + bkt = &site->ls_bkts[lu_bkt_hash(site, fid)]; + return &bkt->lsb_waitq; +} +EXPORT_SYMBOL(lu_site_wq_from_fid); + +/** + * Decrease reference counter on object. If last reference is freed, return + * object to the cache, unless lu_object_is_dying(o) holds. In the latter + * case, free object immediately. + */ +void lu_object_put(const struct lu_env *env, struct lu_object *o) +{ + struct lu_site_bkt_data *bkt; + struct lu_object_header *top = o->lo_header; + struct lu_site *site = o->lo_dev->ld_site; + struct lu_object *orig = o; + const struct lu_fid *fid = lu_object_fid(o); + + /* + * till we have full fids-on-OST implemented anonymous objects + * are possible in OSP. such an object isn't listed in the site + * so we should not remove it from the site. + */ + if (fid_is_zero(fid)) { + LASSERT(list_empty(&top->loh_lru)); + if (!atomic_dec_and_test(&top->loh_ref)) + return; + list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) { + if (o->lo_ops->loo_object_release != NULL) + o->lo_ops->loo_object_release(env, o); + } + lu_object_free(env, orig); + return; + } + + bkt = &site->ls_bkts[lu_bkt_hash(site, &top->loh_fid)]; + if (atomic_add_unless(&top->loh_ref, -1, 1)) { +still_active: + /* + * At this point the object reference is dropped and lock is + * not taken, so lu_object should not be touched because it + * can be freed by concurrent thread. + * + * Somebody may be waiting for this, currently only used for + * cl_object, see cl_object_put_last(). 
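+		 *
+		 * Waking the queue is nevertheless safe: bkt points into
+		 * site->ls_bkts, which is owned by the lu_site and
+		 * outlives any individual object.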
+ */ + wake_up(&bkt->lsb_waitq); + + return; + } + + spin_lock(&bkt->lsb_waitq.lock); + if (!atomic_dec_and_test(&top->loh_ref)) { + spin_unlock(&bkt->lsb_waitq.lock); + goto still_active; + } + + /* + * Refcount is zero, and cannot be incremented without taking the bkt + * lock, so object is stable. + */ + + /* + * When last reference is released, iterate over object layers, and + * notify them that object is no longer busy. + */ + list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) { + if (o->lo_ops->loo_object_release != NULL) + o->lo_ops->loo_object_release(env, o); + } + + /* + * Don't use local 'is_dying' here because if was taken without lock but + * here we need the latest actual value of it so check lu_object + * directly here. + */ + if (!lu_object_is_dying(top) && + (lu_object_exists(orig) || lu_object_is_cl(orig))) { + LASSERT(list_empty(&top->loh_lru)); + list_add_tail(&top->loh_lru, &bkt->lsb_lru); + spin_unlock(&bkt->lsb_waitq.lock); + percpu_counter_inc(&site->ls_lru_len_counter); + CDEBUG(D_INODE, "Add %p/%p to site lru. bkt: %p\n", + orig, top, bkt); + return; + } + + /* + * If object is dying (will not be cached) then remove it from hash + * table (it is already not on the LRU). + * + * This is done with bucket lock held. As the only way to acquire first + * reference to previously unreferenced object is through hash-table + * lookup (lu_object_find()) which takes the lock for first reference, + * no race with concurrent object lookup is possible and we can safely + * destroy object below. + */ + if (!test_and_set_bit(LU_OBJECT_UNHASHED, &top->loh_flags)) + rhashtable_remove_fast(&site->ls_obj_hash, &top->loh_hash, + obj_hash_params); + + spin_unlock(&bkt->lsb_waitq.lock); + /* Object was already removed from hash above, can kill it. */ + lu_object_free(env, orig); +} +EXPORT_SYMBOL(lu_object_put); + +/** + * Put object and don't keep in cache. This is temporary solution for + * multi-site objects when its layering is not constant. + */ +void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o) +{ + set_bit(LU_OBJECT_HEARD_BANSHEE, &o->lo_header->loh_flags); + return lu_object_put(env, o); +} +EXPORT_SYMBOL(lu_object_put_nocache); + +/** + * Kill the object and take it out of LRU cache. + * Currently used by client code for layout change. + */ +void lu_object_unhash(const struct lu_env *env, struct lu_object *o) +{ + struct lu_object_header *top; + + top = o->lo_header; + set_bit(LU_OBJECT_HEARD_BANSHEE, &top->loh_flags); + if (!test_and_set_bit(LU_OBJECT_UNHASHED, &top->loh_flags)) { + struct lu_site *site = o->lo_dev->ld_site; + struct rhashtable *obj_hash = &site->ls_obj_hash; + struct lu_site_bkt_data *bkt; + + bkt = &site->ls_bkts[lu_bkt_hash(site, &top->loh_fid)]; + spin_lock(&bkt->lsb_waitq.lock); + if (!list_empty(&top->loh_lru)) { + list_del_init(&top->loh_lru); + percpu_counter_dec(&site->ls_lru_len_counter); + } + spin_unlock(&bkt->lsb_waitq.lock); + + rhashtable_remove_fast(obj_hash, &top->loh_hash, + obj_hash_params); + } +} +EXPORT_SYMBOL(lu_object_unhash); + +/** + * Allocate new object. + * + * This follows object creation protocol, described in the comment within + * struct lu_device_operations definition. + */ +static struct lu_object *lu_object_alloc(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f) +{ + struct lu_object *top; + + /* + * Create top-level object slice. This will also create + * lu_object_header. 
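+	 *
+	 * Only the top slice is allocated here; the lower layers are
+	 * stitched on afterwards by the repeated ->loo_object_init()
+	 * calls in lu_object_start().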
+ */ + top = dev->ld_ops->ldo_object_alloc(env, NULL, dev); + if (top == NULL) + return ERR_PTR(-ENOMEM); + if (IS_ERR(top)) + return top; + /* + * This is the only place where object fid is assigned. It's constant + * after this point. + */ + top->lo_header->loh_fid = *f; + + return top; +} + +/** + * Initialize object. + * + * This is called after object hash insertion to avoid returning an object with + * stale attributes. + */ +static int lu_object_start(const struct lu_env *env, struct lu_device *dev, + struct lu_object *top, + const struct lu_object_conf *conf) +{ + struct lu_object *scan; + struct list_head *layers; + unsigned int init_mask = 0; + unsigned int init_flag; + int clean; + int result; + + layers = &top->lo_header->loh_layers; + + do { + /* + * Call ->loo_object_init() repeatedly, until no more new + * object slices are created. + */ + clean = 1; + init_flag = 1; + list_for_each_entry(scan, layers, lo_linkage) { + if (init_mask & init_flag) + goto next; + clean = 0; + scan->lo_header = top->lo_header; + result = scan->lo_ops->loo_object_init(env, scan, conf); + if (result) + return result; + + init_mask |= init_flag; +next: + init_flag <<= 1; + } + } while (!clean); + + list_for_each_entry_reverse(scan, layers, lo_linkage) { + if (scan->lo_ops->loo_object_start != NULL) { + result = scan->lo_ops->loo_object_start(env, scan); + if (result) + return result; + } + } + + lprocfs_counter_incr(dev->ld_site->ls_stats, LU_SS_CREATED); + + set_bit(LU_OBJECT_INITED, &top->lo_header->loh_flags); + + return 0; +} + +/** + * Free an object. + */ +static void lu_object_free(const struct lu_env *env, struct lu_object *o) +{ + wait_queue_head_t *wq; + struct lu_site *site; + struct lu_object *scan; + struct list_head *layers; + LIST_HEAD(splice); + + site = o->lo_dev->ld_site; + layers = &o->lo_header->loh_layers; + wq = lu_site_wq_from_fid(site, &o->lo_header->loh_fid); + /* + * First call ->loo_object_delete() method to release all resources. + */ + list_for_each_entry_reverse(scan, layers, lo_linkage) { + if (scan->lo_ops->loo_object_delete != NULL) + scan->lo_ops->loo_object_delete(env, scan); + } + + /* + * Then, splice object layers into stand-alone list, and call + * ->loo_object_free() on all layers to free memory. Splice is + * necessary, because lu_object_header is freed together with the + * top-level slice. + */ + list_splice_init(layers, &splice); + while (!list_empty(&splice)) { + /* + * Free layers in bottom-to-top order, so that object header + * lives as long as possible and ->loo_object_free() methods + * can look at its contents. + */ + o = container_of(splice.prev, struct lu_object, lo_linkage); + list_del_init(&o->lo_linkage); + LASSERT(o->lo_ops->loo_object_free != NULL); + o->lo_ops->loo_object_free(env, o); + } + + if (waitqueue_active(wq)) + wake_up(wq); +} + +/** + * Free \a nr objects from the cold end of the site LRU list. + * if canblock is 0, then don't block awaiting for another + * instance of lu_site_purge() to complete + */ +int lu_site_purge_objects(const struct lu_env *env, struct lu_site *s, + int nr, int canblock) +{ + struct lu_object_header *h; + struct lu_object_header *temp; + struct lu_site_bkt_data *bkt; + LIST_HEAD(dispose); + int did_sth; + unsigned int start = 0; + int count; + int bnr; + unsigned int i; + + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_NO_LRU)) + RETURN(0); + + /* + * Under LRU list lock, scan LRU list and move unreferenced objects to + * the dispose list, removing them from LRU and hash table. 
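+	 *
+	 * When a partial purge is requested (nr != ~0) the victim budget
+	 * is spread over the buckets: bnr = nr / ls_bkt_cnt + 1, so e.g.
+	 * nr = 100 over 8 buckets frees at most 13 objects per bucket in
+	 * a single pass.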
+ */ + if (nr != ~0) + start = s->ls_purge_start; + bnr = (nr == ~0) ? -1 : nr / s->ls_bkt_cnt + 1; +again: + /* + * It doesn't make any sense to make purge threads parallel, that can + * only bring troubles to us. See LU-5331. + */ + if (canblock != 0) + mutex_lock(&s->ls_purge_mutex); + else if (mutex_trylock(&s->ls_purge_mutex) == 0) + goto out; + + did_sth = 0; + for (i = start; i < s->ls_bkt_cnt ; i++) { + count = bnr; + bkt = &s->ls_bkts[i]; + spin_lock(&bkt->lsb_waitq.lock); + + list_for_each_entry_safe(h, temp, &bkt->lsb_lru, loh_lru) { + LASSERT(atomic_read(&h->loh_ref) == 0); + + LINVRNT(lu_bkt_hash(s, &h->loh_fid) == i); + + set_bit(LU_OBJECT_UNHASHED, &h->loh_flags); + rhashtable_remove_fast(&s->ls_obj_hash, &h->loh_hash, + obj_hash_params); + list_move(&h->loh_lru, &dispose); + percpu_counter_dec(&s->ls_lru_len_counter); + if (did_sth == 0) + did_sth = 1; + + if (nr != ~0 && --nr == 0) + break; + + if (count > 0 && --count == 0) + break; + + } + spin_unlock(&bkt->lsb_waitq.lock); + cond_resched(); + /* + * Free everything on the dispose list. This is safe against + * races due to the reasons described in lu_object_put(). + */ + while ((h = list_first_entry_or_null(&dispose, + struct lu_object_header, + loh_lru)) != NULL) { + list_del_init(&h->loh_lru); + lu_object_free(env, lu_object_top(h)); + lprocfs_counter_incr(s->ls_stats, LU_SS_LRU_PURGED); + } + + if (nr == 0) + break; + } + mutex_unlock(&s->ls_purge_mutex); + + if (nr != 0 && did_sth && start != 0) { + start = 0; /* restart from the first bucket */ + goto again; + } + /* race on s->ls_purge_start, but nobody cares */ + s->ls_purge_start = i & (s->ls_bkt_cnt - 1); +out: + return nr; +} +EXPORT_SYMBOL(lu_site_purge_objects); + +/* + * Object printing. + * + * Code below has to jump through certain loops to output object description + * into libcfs_debug_msg-based log. The problem is that lu_object_print() + * composes object description from strings that are parts of _lines_ of + * output (i.e., strings that are not terminated by newline). This doesn't fit + * very well into libcfs_debug_msg() interface that assumes that each message + * supplied to it is a self-contained output line. + * + * To work around this, strings are collected in a temporary buffer + * (implemented as a value of lu_cdebug_key key), until terminating newline + * character is detected. + * + */ + +enum { + /** + * Maximal line size. + * + * XXX overflow is not handled correctly. + */ + LU_CDEBUG_LINE = 512 +}; + +struct lu_cdebug_data { + /** + * Temporary buffer. + */ + char lck_area[LU_CDEBUG_LINE]; +}; + +/* context key constructor/destructor: lu_global_key_init, lu_global_key_fini */ +LU_KEY_INIT_FINI(lu_global, struct lu_cdebug_data); + +/** + * Key, holding temporary buffer. This key is registered very early by + * lu_global_init(). + */ +static struct lu_context_key lu_global_key = { + .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD | + LCT_MG_THREAD | LCT_CL_THREAD | LCT_LOCAL, + .lct_init = lu_global_key_init, + .lct_fini = lu_global_key_fini +}; + +/** + * Printer function emitting messages through libcfs_debug_msg(). + */ +int lu_cdebug_printer(const struct lu_env *env, + void *cookie, const char *format, ...) 
+{ + struct libcfs_debug_msg_data *msgdata = cookie; + struct lu_cdebug_data *key; + int used; + int complete; + va_list args; + + va_start(args, format); + + key = lu_context_key_get(&env->le_ctx, &lu_global_key); + LASSERT(key != NULL); + + used = strlen(key->lck_area); + complete = format[strlen(format) - 1] == '\n'; + /* + * Append new chunk to the buffer. + */ + vsnprintf(key->lck_area + used, + ARRAY_SIZE(key->lck_area) - used, format, args); + if (complete) { + if (cfs_cdebug_show(msgdata->msg_mask, msgdata->msg_subsys)) + libcfs_debug_msg(msgdata, "%s\n", key->lck_area); + key->lck_area[0] = 0; + } + va_end(args); + return 0; +} +EXPORT_SYMBOL(lu_cdebug_printer); + +/** + * Print object header. + */ +void lu_object_header_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct lu_object_header *hdr) +{ + (*printer)(env, cookie, "header@%p[%#lx, %d, "DFID"%s%s%s]", + hdr, hdr->loh_flags, atomic_read(&hdr->loh_ref), + PFID(&hdr->loh_fid), + test_bit(LU_OBJECT_UNHASHED, + &hdr->loh_flags) ? "" : " hash", + list_empty(&hdr->loh_lru) ? "" : " lru", + hdr->loh_attr & LOHA_EXISTS ? " exist" : ""); +} +EXPORT_SYMBOL(lu_object_header_print); + +/** + * Print human readable representation of the \a o to the \a printer. + */ +void lu_object_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct lu_object *o) +{ + static const char ruler[] = "........................................"; + struct lu_object_header *top; + int depth = 4; + + top = o->lo_header; + lu_object_header_print(env, cookie, printer, top); + (*printer)(env, cookie, "{\n"); + + list_for_each_entry(o, &top->loh_layers, lo_linkage) { + /* + * print `.' \a depth times followed by type name and address + */ + (*printer)(env, cookie, "%*.*s%s@%p", depth, depth, ruler, + o->lo_dev->ld_type->ldt_name, o); + + if (o->lo_ops->loo_object_print != NULL) + (*o->lo_ops->loo_object_print)(env, cookie, printer, o); + + (*printer)(env, cookie, "\n"); + } + + (*printer)(env, cookie, "} header@%p\n", top); +} +EXPORT_SYMBOL(lu_object_print); + +/** + * Check object consistency. + */ +int lu_object_invariant(const struct lu_object *o) +{ + struct lu_object_header *top; + + top = o->lo_header; + list_for_each_entry(o, &top->loh_layers, lo_linkage) { + if (o->lo_ops->loo_object_invariant != NULL && + !o->lo_ops->loo_object_invariant(o)) + return 0; + } + return 1; +} + +/* + * Limit the lu_object cache to a maximum of lu_cache_nr objects. Because the + * calculation for the number of objects to reclaim is not covered by a lock the + * maximum number of objects is capped by LU_CACHE_MAX_ADJUST. This ensures + * that many concurrent threads will not accidentally purge the entire cache. 
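+ *
+ * For example, with lu_cache_nr = 4096 and 5000 objects in the hash, a
+ * caller purges min(5000 - 4096, LU_CACHE_NR_MAX_ADJUST) = 512 objects
+ * in one call rather than the whole overshoot at once.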
+ */ +static void lu_object_limit(const struct lu_env *env, + struct lu_device *dev) +{ + u64 size, nr; + + if (lu_cache_nr == LU_CACHE_NR_UNLIMITED) + return; + + size = atomic_read(&dev->ld_site->ls_obj_hash.nelems); + nr = (u64)lu_cache_nr; + if (size <= nr) + return; + + lu_site_purge_objects(env, dev->ld_site, + min_t(u64, size - nr, LU_CACHE_NR_MAX_ADJUST), + 0); +} + +static struct lu_object *htable_lookup(const struct lu_env *env, + struct lu_device *dev, + struct lu_site_bkt_data *bkt, + const struct lu_fid *f, + struct lu_object_header *new) +{ + struct lu_site *s = dev->ld_site; + struct lu_object_header *h; + +try_again: + rcu_read_lock(); + if (new) + h = rhashtable_lookup_get_insert_fast(&s->ls_obj_hash, + &new->loh_hash, + obj_hash_params); + else + h = rhashtable_lookup(&s->ls_obj_hash, f, obj_hash_params); + + if (IS_ERR_OR_NULL(h)) { + /* Not found */ + if (!new) + lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_MISS); + rcu_read_unlock(); + if (PTR_ERR(h) == -ENOMEM) { + msleep(20); + goto try_again; + } + lu_object_limit(env, dev); + if (PTR_ERR(h) == -E2BIG) + goto try_again; + + return ERR_PTR(-ENOENT); + } + + if (atomic_inc_not_zero(&h->loh_ref)) { + rcu_read_unlock(); + return lu_object_top(h); + } + + spin_lock(&bkt->lsb_waitq.lock); + if (lu_object_is_dying(h) || + test_bit(LU_OBJECT_UNHASHED, &h->loh_flags)) { + spin_unlock(&bkt->lsb_waitq.lock); + rcu_read_unlock(); + if (new) { + /* + * Old object might have already been removed, or will + * be soon. We need to insert our new object, so + * remove the old one just in case it is still there. + */ + rhashtable_remove_fast(&s->ls_obj_hash, &h->loh_hash, + obj_hash_params); + goto try_again; + } + lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_MISS); + return ERR_PTR(-ENOENT); + } + /* Now protected by spinlock */ + rcu_read_unlock(); + + if (!list_empty(&h->loh_lru)) { + list_del_init(&h->loh_lru); + percpu_counter_dec(&s->ls_lru_len_counter); + } + atomic_inc(&h->loh_ref); + spin_unlock(&bkt->lsb_waitq.lock); + lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_HIT); + return lu_object_top(h); +} + +/** + * Search cache for an object with the fid \a f. If such object is found, + * return it. Otherwise, create new object, insert it into cache and return + * it. In any case, additional reference is acquired on the returned object. + */ +struct lu_object *lu_object_find(const struct lu_env *env, + struct lu_device *dev, const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + return lu_object_find_at(env, dev->ld_site->ls_top_dev, f, conf); +} +EXPORT_SYMBOL(lu_object_find); + +/* + * Get a 'first' reference to an object that was found while looking through the + * hash table. + */ +struct lu_object *lu_object_get_first(struct lu_object_header *h, + struct lu_device *dev) +{ + struct lu_site *s = dev->ld_site; + struct lu_object *ret; + + if (IS_ERR_OR_NULL(h) || lu_object_is_dying(h)) + return NULL; + + ret = lu_object_locate(h, dev->ld_type); + if (!ret) + return ret; + + if (!atomic_inc_not_zero(&h->loh_ref)) { + struct lu_site_bkt_data *bkt; + + bkt = &s->ls_bkts[lu_bkt_hash(s, &h->loh_fid)]; + spin_lock(&bkt->lsb_waitq.lock); + if (!lu_object_is_dying(h) && + !test_bit(LU_OBJECT_UNHASHED, &h->loh_flags)) + atomic_inc(&h->loh_ref); + else + ret = NULL; + spin_unlock(&bkt->lsb_waitq.lock); + } + return ret; +} +EXPORT_SYMBOL(lu_object_get_first); + +/** + * Core logic of lu_object_find*() functions. 
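+ *
+ * A typical caller (sketched with hypothetical env/dev/fid variables)
+ * keeps the returned reference until it is done with the object:
+ *
+ *	o = lu_object_find_at(env, dev, &fid, NULL);
+ *	if (IS_ERR(o))
+ *		return PTR_ERR(o);
+ *	...
+ *	lu_object_put(env, o);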
+ *
+ * Much like lu_object_find(), but top level device of object is specifically
+ * \a dev rather than top level device of the site. This interface allows
+ * objects of different "stacking" to be created within the same site.
+ */
+struct lu_object *lu_object_find_at(const struct lu_env *env,
+ struct lu_device *dev,
+ const struct lu_fid *f,
+ const struct lu_object_conf *conf)
+{
+ struct lu_object *o;
+ struct lu_object *shadow;
+ struct lu_site *s;
+ struct lu_site_bkt_data *bkt;
+ struct rhashtable *hs;
+ int rc;
+
+ ENTRY;
+
+ /* FID is from disk or network, zero FID is meaningless, return error
+ * early to avoid assertion in lu_object_put. If a zero FID is wanted,
+ * it should be allocated via lu_object_anon().
+ */
+ if (fid_is_zero(f))
+ RETURN(ERR_PTR(-EINVAL));
+
+ /*
+ * This uses standard index maintenance protocol:
+ *
+ * - search index under lock, and return object if found;
+ * - otherwise, unlock index, allocate new object;
+ * - lock index and search again;
+ * - if nothing is found (usual case), insert newly created
+ * object into index;
+ * - otherwise (race: other thread inserted object), free
+ * object just allocated.
+ * - unlock index;
+ * - return object.
+ *
+ * In the "LOC_F_NEW" case we know the object is newly created.
+ * It is unnecessary to perform the lookup-alloc-lookup-insert dance;
+ * instead, just allocate and insert directly.
+ *
+ */
+ s = dev->ld_site;
+ hs = &s->ls_obj_hash;
+
+ if (unlikely(OBD_FAIL_PRECHECK(OBD_FAIL_OBD_ZERO_NLINK_RACE)))
+ lu_site_purge(env, s, -1);
+
+ bkt = &s->ls_bkts[lu_bkt_hash(s, f)];
+ if (!(conf && conf->loc_flags & LOC_F_NEW)) {
+ o = htable_lookup(env, dev, bkt, f, NULL);
+
+ if (!IS_ERR(o)) {
+ if (likely(lu_object_is_inited(o->lo_header)))
+ RETURN(o);
+
+ wait_event_idle(bkt->lsb_waitq,
+ lu_object_is_inited(o->lo_header) ||
+ lu_object_is_dying(o->lo_header));
+
+ if (lu_object_is_dying(o->lo_header)) {
+ lu_object_put(env, o);
+
+ RETURN(ERR_PTR(-ENOENT));
+ }
+
+ RETURN(o);
+ }
+
+ if (PTR_ERR(o) != -ENOENT)
+ RETURN(o);
+ }
+
+ /*
+ * Allocate a new object. NB: the object is left uninitialized here;
+ * it is only set up (by lu_object_start()) after hash insertion, so
+ * that an object with stale attributes is never returned to a racing
+ * lookup.
+ */
+ o = lu_object_alloc(env, dev, f);
+ if (IS_ERR(o))
+ RETURN(o);
+
+ LASSERT(lu_fid_eq(lu_object_fid(o), f));
+
+ CFS_RACE_WAIT(OBD_FAIL_OBD_ZERO_NLINK_RACE);
+
+ if (conf && conf->loc_flags & LOC_F_NEW) {
+ int status = rhashtable_insert_fast(hs, &o->lo_header->loh_hash,
+ obj_hash_params);
+ if (status)
+ /* Strange error - go the slow way */
+ shadow = htable_lookup(env, dev, bkt, f, o->lo_header);
+ else
+ shadow = ERR_PTR(-ENOENT);
+ } else {
+ shadow = htable_lookup(env, dev, bkt, f, o->lo_header);
+ }
+ if (likely(PTR_ERR(shadow) == -ENOENT)) {
+ /*
+ * The new object has been successfully inserted.
+ *
+ * This may result in rather complicated operations, including
+ * fld queries, inode loading, etc.
+ */ + rc = lu_object_start(env, dev, o, conf); + if (rc) { + lu_object_put_nocache(env, o); + RETURN(ERR_PTR(rc)); + } + + wake_up(&bkt->lsb_waitq); + + lu_object_limit(env, dev); + + RETURN(o); + } + + lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_RACE); + lu_object_free(env, o); + + if (!(conf && conf->loc_flags & LOC_F_NEW) && + !IS_ERR(shadow) && + !lu_object_is_inited(shadow->lo_header)) { + wait_event_idle(bkt->lsb_waitq, + lu_object_is_inited(shadow->lo_header) || + lu_object_is_dying(shadow->lo_header)); + + if (lu_object_is_dying(shadow->lo_header)) { + lu_object_put(env, shadow); + + RETURN(ERR_PTR(-ENOENT)); + } + } + + RETURN(shadow); +} +EXPORT_SYMBOL(lu_object_find_at); + +/** + * Find object with given fid, and return its slice belonging to given device. + */ +struct lu_object *lu_object_find_slice(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + struct lu_object *top; + struct lu_object *obj; + + top = lu_object_find(env, dev, f, conf); + if (IS_ERR(top)) + return top; + + obj = lu_object_locate(top->lo_header, dev->ld_type); + if (unlikely(obj == NULL)) { + lu_object_put(env, top); + obj = ERR_PTR(-ENOENT); + } + + return obj; +} +EXPORT_SYMBOL(lu_object_find_slice); + +int lu_device_type_init(struct lu_device_type *ldt) +{ + int result = 0; + + atomic_set(&ldt->ldt_device_nr, 0); + if (ldt->ldt_ops->ldto_init) + result = ldt->ldt_ops->ldto_init(ldt); + + return result; +} +EXPORT_SYMBOL(lu_device_type_init); + +void lu_device_type_fini(struct lu_device_type *ldt) +{ + if (ldt->ldt_ops->ldto_fini) + ldt->ldt_ops->ldto_fini(ldt); +} +EXPORT_SYMBOL(lu_device_type_fini); + +/** + * Global list of all sites on this node + */ +static LIST_HEAD(lu_sites); +static DECLARE_RWSEM(lu_sites_guard); + +/** + * Global environment used by site shrinker. + */ +static struct lu_env lu_shrink_env; + +struct lu_site_print_arg { + struct lu_env *lsp_env; + void *lsp_cookie; + lu_printer_t lsp_printer; +}; + +static void +lu_site_obj_print(struct lu_object_header *h, struct lu_site_print_arg *arg) +{ + if (!list_empty(&h->loh_layers)) { + const struct lu_object *o; + + o = lu_object_top(h); + lu_object_print(arg->lsp_env, arg->lsp_cookie, + arg->lsp_printer, o); + } else { + lu_object_header_print(arg->lsp_env, arg->lsp_cookie, + arg->lsp_printer, h); + } +} + +/** + * Print all objects in \a s. + */ +void lu_site_print(const struct lu_env *env, struct lu_site *s, atomic_t *ref, + int msg_flag, lu_printer_t printer) +{ + struct lu_site_print_arg arg = { + .lsp_env = (struct lu_env *)env, + .lsp_printer = printer, + }; + struct rhashtable_iter iter; + struct lu_object_header *h; + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, msg_flag, NULL); + + if (!s || !atomic_read(ref)) + return; + + arg.lsp_cookie = (void *)&msgdata; + + rhashtable_walk_enter(&s->ls_obj_hash, &iter); + rhashtable_walk_start(&iter); + while ((h = rhashtable_walk_next(&iter)) != NULL) { + if (IS_ERR(h)) + continue; + lu_site_obj_print(h, &arg); + } + rhashtable_walk_stop(&iter); + rhashtable_walk_exit(&iter); +} +EXPORT_SYMBOL(lu_site_print); + +/** + * Return desired hash table order. + */ +static void lu_htable_limits(struct lu_device *top) +{ + unsigned long cache_size; + + /* + * For ZFS based OSDs the cache should be disabled by default. This + * allows the ZFS ARC maximum flexibility in determining what buffers + * to cache. If Lustre has objects or buffer which it wants to ensure + * always stay cached it must maintain a hold on them. 
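+ *
+ * Editorial worked example for the non-ZFS path below (not part of the
+ * original patch; assumes 4 KiB pages and lu_cache_percent at its
+ * assumed default of 20, matching the 20%-of-memory target described
+ * below): a 64 GiB node has ~16.8M pages, giving
+ * cache_size = 16.8M / 100 * 20 * (4096 / 1024) ~= 13.4M objects,
+ * which is then clamped to [LU_CACHE_NR_MIN, LU_CACHE_NR_MAX].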
+ */ + if (strcmp(top->ld_type->ldt_name, LUSTRE_OSD_ZFS_NAME) == 0) { + lu_cache_nr = LU_CACHE_NR_ZFS_LIMIT; + return; + } + + /* + * Calculate hash table size, assuming that we want reasonable + * performance when 20% of total memory is occupied by cache of + * lu_objects. + * + * Size of lu_object is (arbitrary) taken as 1K (together with inode). + */ + cache_size = cfs_totalram_pages(); + +#if BITS_PER_LONG == 32 + /* limit hashtable size for lowmem systems to low RAM */ + if (cache_size > 1 << (30 - PAGE_SHIFT)) + cache_size = 1 << (30 - PAGE_SHIFT) * 3 / 4; +#endif + + /* clear off unreasonable cache setting. */ + if (lu_cache_percent == 0 || lu_cache_percent > LU_CACHE_PERCENT_MAX) { + CWARN("obdclass: invalid lu_cache_percent: %u, it must be in the range of (0, %u]. Will use default value: %u.\n", + lu_cache_percent, LU_CACHE_PERCENT_MAX, + LU_CACHE_PERCENT_DEFAULT); + + lu_cache_percent = LU_CACHE_PERCENT_DEFAULT; + } + cache_size = cache_size / 100 * lu_cache_percent * + (PAGE_SIZE / 1024); + + lu_cache_nr = clamp_t(typeof(cache_size), cache_size, + LU_CACHE_NR_MIN, LU_CACHE_NR_MAX); +} + +void lu_dev_add_linkage(struct lu_site *s, struct lu_device *d) +{ + spin_lock(&s->ls_ld_lock); + if (list_empty(&d->ld_linkage)) + list_add(&d->ld_linkage, &s->ls_ld_linkage); + spin_unlock(&s->ls_ld_lock); +} +EXPORT_SYMBOL(lu_dev_add_linkage); + +void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d) +{ + spin_lock(&s->ls_ld_lock); + list_del_init(&d->ld_linkage); + spin_unlock(&s->ls_ld_lock); +} +EXPORT_SYMBOL(lu_dev_del_linkage); + +/** + * Initialize site \a s, with \a d as the top level device. + */ +int lu_site_init(struct lu_site *s, struct lu_device *top) +{ + struct lu_site_bkt_data *bkt; + unsigned int i; + int rc; + ENTRY; + + memset(s, 0, sizeof *s); + mutex_init(&s->ls_purge_mutex); + lu_htable_limits(top); + +#ifdef HAVE_PERCPU_COUNTER_INIT_GFP_FLAG + rc = percpu_counter_init(&s->ls_lru_len_counter, 0, GFP_NOFS); +#else + rc = percpu_counter_init(&s->ls_lru_len_counter, 0); +#endif + if (rc) + return -ENOMEM; + + if (rhashtable_init(&s->ls_obj_hash, &obj_hash_params) != 0) { + CERROR("failed to create lu_site hash\n"); + return -ENOMEM; + } + + s->ls_bkt_seed = get_random_u32(); + s->ls_bkt_cnt = max_t(long, 1 << LU_SITE_BKT_BITS, + 2 * num_possible_cpus()); + s->ls_bkt_cnt = roundup_pow_of_two(s->ls_bkt_cnt); + OBD_ALLOC_PTR_ARRAY_LARGE(s->ls_bkts, s->ls_bkt_cnt); + if (!s->ls_bkts) { + rhashtable_destroy(&s->ls_obj_hash); + s->ls_bkts = NULL; + return -ENOMEM; + } + + for (i = 0; i < s->ls_bkt_cnt; i++) { + bkt = &s->ls_bkts[i]; + INIT_LIST_HEAD(&bkt->lsb_lru); + init_waitqueue_head(&bkt->lsb_waitq); + } + + s->ls_stats = lprocfs_alloc_stats(LU_SS_LAST_STAT, 0); + if (s->ls_stats == NULL) { + OBD_FREE_PTR_ARRAY_LARGE(s->ls_bkts, s->ls_bkt_cnt); + s->ls_bkts = NULL; + rhashtable_destroy(&s->ls_obj_hash); + return -ENOMEM; + } + + lprocfs_counter_init(s->ls_stats, LU_SS_CREATED, + 0, "created", "created"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_HIT, + 0, "cache_hit", "cache_hit"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_MISS, + 0, "cache_miss", "cache_miss"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_RACE, + 0, "cache_race", "cache_race"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_DEATH_RACE, + 0, "cache_death_race", "cache_death_race"); + lprocfs_counter_init(s->ls_stats, LU_SS_LRU_PURGED, + 0, "lru_purged", "lru_purged"); + + INIT_LIST_HEAD(&s->ls_linkage); + s->ls_top_dev = top; + top->ld_site = s; + lu_device_get(top); + 
lu_ref_add(&top->ld_reference, "site-top", s); + + INIT_LIST_HEAD(&s->ls_ld_linkage); + spin_lock_init(&s->ls_ld_lock); + + lu_dev_add_linkage(s, top); + + RETURN(0); +} +EXPORT_SYMBOL(lu_site_init); + +/** + * Finalize \a s and release its resources. + */ +void lu_site_fini(struct lu_site *s) +{ + down_write(&lu_sites_guard); + list_del_init(&s->ls_linkage); + up_write(&lu_sites_guard); + + percpu_counter_destroy(&s->ls_lru_len_counter); + + if (s->ls_bkts) { + rhashtable_destroy(&s->ls_obj_hash); + OBD_FREE_PTR_ARRAY_LARGE(s->ls_bkts, s->ls_bkt_cnt); + s->ls_bkts = NULL; + } + + if (s->ls_top_dev != NULL) { + s->ls_top_dev->ld_site = NULL; + lu_ref_del(&s->ls_top_dev->ld_reference, "site-top", s); + lu_device_put(s->ls_top_dev); + s->ls_top_dev = NULL; + } + + if (s->ls_stats != NULL) + lprocfs_free_stats(&s->ls_stats); +} +EXPORT_SYMBOL(lu_site_fini); + +/** + * Called when initialization of stack for this site is completed. + */ +int lu_site_init_finish(struct lu_site *s) +{ + int result; + down_write(&lu_sites_guard); + result = lu_context_refill(&lu_shrink_env.le_ctx); + if (result == 0) + list_add(&s->ls_linkage, &lu_sites); + up_write(&lu_sites_guard); + return result; +} +EXPORT_SYMBOL(lu_site_init_finish); + +/** + * Acquire additional reference on device \a d + */ +void lu_device_get(struct lu_device *d) +{ + atomic_inc(&d->ld_ref); +} +EXPORT_SYMBOL(lu_device_get); + +/** + * Release reference on device \a d. + */ +void lu_device_put(struct lu_device *d) +{ + LASSERT(atomic_read(&d->ld_ref) > 0); + atomic_dec(&d->ld_ref); +} +EXPORT_SYMBOL(lu_device_put); + +enum { /* Maximal number of tld slots. */ + LU_CONTEXT_KEY_NR = 40 +}; +static struct lu_context_key *lu_keys[LU_CONTEXT_KEY_NR] = { NULL, }; +static DECLARE_RWSEM(lu_key_initing); + +/** + * Initialize device \a d of type \a t. + */ +int lu_device_init(struct lu_device *d, struct lu_device_type *t) +{ + if (atomic_add_unless(&t->ldt_device_nr, 1, 0) == 0) { + down_write(&lu_key_initing); + if (t->ldt_ops->ldto_start && + atomic_read(&t->ldt_device_nr) == 0) + t->ldt_ops->ldto_start(t); + atomic_inc(&t->ldt_device_nr); + up_write(&lu_key_initing); + } + + memset(d, 0, sizeof *d); + d->ld_type = t; + lu_ref_init(&d->ld_reference); + INIT_LIST_HEAD(&d->ld_linkage); + + return 0; +} +EXPORT_SYMBOL(lu_device_init); + +/** + * Finalize device \a d. + */ +void lu_device_fini(struct lu_device *d) +{ + struct lu_device_type *t = d->ld_type; + + if (d->ld_obd != NULL) { + d->ld_obd->obd_lu_dev = NULL; + d->ld_obd = NULL; + } + + lu_ref_fini(&d->ld_reference); + LASSERTF(atomic_read(&d->ld_ref) == 0, + "Refcount is %u\n", atomic_read(&d->ld_ref)); + LASSERT(atomic_read(&t->ldt_device_nr) > 0); + + if (atomic_dec_and_test(&t->ldt_device_nr) && + t->ldt_ops->ldto_stop != NULL) + t->ldt_ops->ldto_stop(t); +} +EXPORT_SYMBOL(lu_device_fini); + +/** + * Initialize object \a o that is part of compound object \a h and was created + * by device \a d. + */ +int lu_object_init(struct lu_object *o, struct lu_object_header *h, + struct lu_device *d) +{ + memset(o, 0, sizeof(*o)); + o->lo_header = h; + o->lo_dev = d; + lu_device_get(d); + lu_ref_add_at(&d->ld_reference, &o->lo_dev_ref, "lu_object", o); + INIT_LIST_HEAD(&o->lo_linkage); + + return 0; +} +EXPORT_SYMBOL(lu_object_init); + +/** + * Finalize object and release its resources. 
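+ *
+ * Dual to lu_object_init(). An editorial sketch of a typical slice
+ * teardown (not part of the original patch; the "foo" names and slab
+ * cache are placeholders):
+ *
+ *   lu_object_fini(&foo->foo_obj);
+ *   OBD_SLAB_FREE(foo, foo_object_kmem, sizeof(*foo));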
+ */ +void lu_object_fini(struct lu_object *o) +{ + struct lu_device *dev = o->lo_dev; + + LASSERT(list_empty(&o->lo_linkage)); + + if (dev != NULL) { + lu_ref_del_at(&dev->ld_reference, &o->lo_dev_ref, + "lu_object", o); + lu_device_put(dev); + o->lo_dev = NULL; + } +} +EXPORT_SYMBOL(lu_object_fini); + +/** + * Add object \a o as first layer of compound object \a h + * + * This is typically called by the ->ldo_object_alloc() method of top-level + * device. + */ +void lu_object_add_top(struct lu_object_header *h, struct lu_object *o) +{ + list_move(&o->lo_linkage, &h->loh_layers); +} +EXPORT_SYMBOL(lu_object_add_top); + +/** + * Add object \a o as a layer of compound object, going after \a before. + * + * This is typically called by the ->ldo_object_alloc() method of \a + * before->lo_dev. + */ +void lu_object_add(struct lu_object *before, struct lu_object *o) +{ + list_move(&o->lo_linkage, &before->lo_linkage); +} +EXPORT_SYMBOL(lu_object_add); + +/** + * Initialize compound object. + */ +int lu_object_header_init(struct lu_object_header *h) +{ + memset(h, 0, sizeof *h); + atomic_set(&h->loh_ref, 1); + INIT_LIST_HEAD(&h->loh_lru); + INIT_LIST_HEAD(&h->loh_layers); + lu_ref_init(&h->loh_reference); + return 0; +} +EXPORT_SYMBOL(lu_object_header_init); + +/** + * Finalize compound object. + */ +void lu_object_header_fini(struct lu_object_header *h) +{ + LASSERT(list_empty(&h->loh_layers)); + LASSERT(list_empty(&h->loh_lru)); + lu_ref_fini(&h->loh_reference); +} +EXPORT_SYMBOL(lu_object_header_fini); + +/** + * Free lu_object_header with proper RCU handling + */ +void lu_object_header_free(struct lu_object_header *h) +{ + lu_object_header_fini(h); + OBD_FREE_PRE(h, sizeof(*h), "kfreed"); + kfree_rcu(h, loh_rcu); +} +EXPORT_SYMBOL(lu_object_header_free); + +/** + * Given a compound object, find its slice, corresponding to the device type + * \a dtype. + */ +struct lu_object *lu_object_locate(struct lu_object_header *h, + const struct lu_device_type *dtype) +{ + struct lu_object *o; + + list_for_each_entry(o, &h->loh_layers, lo_linkage) { + if (o->lo_dev->ld_type == dtype) + return o; + } + return NULL; +} +EXPORT_SYMBOL(lu_object_locate); + +/** + * Finalize and free devices in the device stack. + * + * Finalize device stack by purging object cache, and calling + * lu_device_type_operations::ldto_device_fini() and + * lu_device_type_operations::ldto_device_free() on all devices in the stack. + */ +void lu_stack_fini(const struct lu_env *env, struct lu_device *top) +{ + struct lu_site *site = top->ld_site; + struct lu_device *scan; + struct lu_device *next; + + lu_site_purge(env, site, ~0); + for (scan = top; scan != NULL; scan = next) { + next = scan->ld_type->ldt_ops->ldto_device_fini(env, scan); + lu_ref_del(&scan->ld_reference, "lu-stack", &lu_site_init); + lu_device_put(scan); + } + + /* purge again. */ + lu_site_purge(env, site, ~0); + + for (scan = top; scan != NULL; scan = next) { + const struct lu_device_type *ldt = scan->ld_type; + + next = ldt->ldt_ops->ldto_device_free(env, scan); + } +} + +/** + * Global counter incremented whenever key is registered, unregistered, + * revived or quiesced. This is used to void unnecessary calls to + * lu_context_refill(). No locking is provided, as initialization and shutdown + * are supposed to be externally serialized. + */ +static atomic_t key_set_version = ATOMIC_INIT(0); + +/** + * Register new key. 
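+ *
+ * Editorial sketch of defining and registering a key (not part of the
+ * original patch; "foo" names are placeholders):
+ *
+ *   static struct lu_context_key foo_thread_key = {
+ *           .lct_tags = LCT_CL_THREAD,
+ *           .lct_init = foo_key_init,
+ *           .lct_fini = foo_key_fini,
+ *   };
+ *
+ *   LU_CONTEXT_KEY_INIT(&foo_thread_key); /* sets ->lct_owner */
+ *   rc = lu_context_key_register(&foo_thread_key);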
+ */ +int lu_context_key_register(struct lu_context_key *key) +{ + int result; + unsigned int i; + + LASSERT(key->lct_init != NULL); + LASSERT(key->lct_fini != NULL); + LASSERT(key->lct_tags != 0); + LASSERT(key->lct_owner != NULL); + + result = -ENFILE; + atomic_set(&key->lct_used, 1); + lu_ref_init(&key->lct_reference); + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { + if (lu_keys[i]) + continue; + key->lct_index = i; + + if (strncmp("osd_", module_name(key->lct_owner), 4) == 0) + CFS_RACE_WAIT(OBD_FAIL_OBD_SETUP); + + if (cmpxchg(&lu_keys[i], NULL, key) != NULL) + continue; + + result = 0; + atomic_inc(&key_set_version); + break; + } + if (result) { + lu_ref_fini(&key->lct_reference); + atomic_set(&key->lct_used, 0); + } + return result; +} +EXPORT_SYMBOL(lu_context_key_register); + +static void key_fini(struct lu_context *ctx, int index) +{ + if (ctx->lc_value != NULL && ctx->lc_value[index] != NULL) { + struct lu_context_key *key; + + key = lu_keys[index]; + LASSERT(key != NULL); + LASSERT(key->lct_fini != NULL); + LASSERT(atomic_read(&key->lct_used) > 0); + + key->lct_fini(ctx, key, ctx->lc_value[index]); + lu_ref_del(&key->lct_reference, "ctx", ctx); + if (atomic_dec_and_test(&key->lct_used)) + wake_up_var(&key->lct_used); + + LASSERT(key->lct_owner != NULL); + if ((ctx->lc_tags & LCT_NOREF) == 0) { + LINVRNT(module_refcount(key->lct_owner) > 0); + module_put(key->lct_owner); + } + ctx->lc_value[index] = NULL; + } +} + +/** + * Deregister key. + */ +void lu_context_key_degister(struct lu_context_key *key) +{ + LASSERT(atomic_read(&key->lct_used) >= 1); + LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys)); + + lu_context_key_quiesce(NULL, key); + + key_fini(&lu_shrink_env.le_ctx, key->lct_index); + + /** + * Wait until all transient contexts referencing this key have + * run lu_context_key::lct_fini() method. + */ + atomic_dec(&key->lct_used); + wait_var_event(&key->lct_used, atomic_read(&key->lct_used) == 0); + + if (!WARN_ON(lu_keys[key->lct_index] == NULL)) + lu_ref_fini(&key->lct_reference); + + smp_store_release(&lu_keys[key->lct_index], NULL); +} +EXPORT_SYMBOL(lu_context_key_degister); + +/** + * Register a number of keys. This has to be called after all keys have been + * initialized by a call to LU_CONTEXT_KEY_INIT(). + */ +int lu_context_key_register_many(struct lu_context_key *k, ...) +{ + struct lu_context_key *key = k; + va_list args; + int result; + + va_start(args, k); + do { + result = lu_context_key_register(key); + if (result) + break; + key = va_arg(args, struct lu_context_key *); + } while (key != NULL); + va_end(args); + + if (result != 0) { + va_start(args, k); + while (k != key) { + lu_context_key_degister(k); + k = va_arg(args, struct lu_context_key *); + } + va_end(args); + } + + return result; +} +EXPORT_SYMBOL(lu_context_key_register_many); + +/** + * De-register a number of keys. This is a dual to + * lu_context_key_register_many(). + */ +void lu_context_key_degister_many(struct lu_context_key *k, ...) +{ + va_list args; + + va_start(args, k); + do { + lu_context_key_degister(k); + k = va_arg(args, struct lu_context_key*); + } while (k != NULL); + va_end(args); +} +EXPORT_SYMBOL(lu_context_key_degister_many); + +/** + * Revive a number of keys. + */ +void lu_context_key_revive_many(struct lu_context_key *k, ...) 
+{ + va_list args; + + va_start(args, k); + do { + lu_context_key_revive(k); + k = va_arg(args, struct lu_context_key*); + } while (k != NULL); + va_end(args); +} +EXPORT_SYMBOL(lu_context_key_revive_many); + +/** + * Quiescent a number of keys. + */ +void lu_context_key_quiesce_many(struct lu_device_type *t, + struct lu_context_key *k, ...) +{ + va_list args; + + va_start(args, k); + do { + lu_context_key_quiesce(t, k); + k = va_arg(args, struct lu_context_key*); + } while (k != NULL); + va_end(args); +} +EXPORT_SYMBOL(lu_context_key_quiesce_many); + +/** + * Return value associated with key \a key in context \a ctx. + */ +void *lu_context_key_get(const struct lu_context *ctx, + const struct lu_context_key *key) +{ + LINVRNT(ctx->lc_state == LCS_ENTERED); + LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys)); + LASSERT(lu_keys[key->lct_index] == key); + return ctx->lc_value[key->lct_index]; +} +EXPORT_SYMBOL(lu_context_key_get); + +/** + * List of remembered contexts. XXX document me. + */ +static LIST_HEAD(lu_context_remembered); +static DEFINE_SPINLOCK(lu_context_remembered_guard); + +/** + * Destroy \a key in all remembered contexts. This is used to destroy key + * values in "shared" contexts (like service threads), when a module owning + * the key is about to be unloaded. + */ +void lu_context_key_quiesce(struct lu_device_type *t, + struct lu_context_key *key) +{ + struct lu_context *ctx; + + if (key->lct_tags & LCT_QUIESCENT) + return; + /* + * The write-lock on lu_key_initing will ensure that any + * keys_fill() which didn't see LCT_QUIESCENT will have + * finished before we call key_fini(). + */ + down_write(&lu_key_initing); + if (!(key->lct_tags & LCT_QUIESCENT)) { + if (t == NULL || atomic_read(&t->ldt_device_nr) == 0) + key->lct_tags |= LCT_QUIESCENT; + up_write(&lu_key_initing); + + spin_lock(&lu_context_remembered_guard); + list_for_each_entry(ctx, &lu_context_remembered, lc_remember) { + spin_until_cond(READ_ONCE(ctx->lc_state) != LCS_LEAVING); + key_fini(ctx, key->lct_index); + } + spin_unlock(&lu_context_remembered_guard); + + return; + } + up_write(&lu_key_initing); +} + +void lu_context_key_revive(struct lu_context_key *key) +{ + key->lct_tags &= ~LCT_QUIESCENT; + atomic_inc(&key_set_version); +} + +static void keys_fini(struct lu_context *ctx) +{ + unsigned int i; + + if (ctx->lc_value == NULL) + return; + + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) + key_fini(ctx, i); + + OBD_FREE_PTR_ARRAY(ctx->lc_value, ARRAY_SIZE(lu_keys)); + ctx->lc_value = NULL; +} + +static int keys_fill(struct lu_context *ctx) +{ + unsigned int i; + int rc = 0; + + /* + * A serialisation with lu_context_key_quiesce() is needed, to + * ensure we see LCT_QUIESCENT and don't allocate a new value + * after it freed one. The rwsem provides this. As down_read() + * does optimistic spinning while the writer is active, this is + * unlikely to ever sleep. + */ + down_read(&lu_key_initing); + ctx->lc_version = atomic_read(&key_set_version); + + LINVRNT(ctx->lc_value); + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { + struct lu_context_key *key; + + key = lu_keys[i]; + if (!ctx->lc_value[i] && key && + (key->lct_tags & ctx->lc_tags) && + /* + * Don't create values for a LCT_QUIESCENT key, as this + * will pin module owning a key. 
+ */ + !(key->lct_tags & LCT_QUIESCENT)) { + void *value; + + LINVRNT(key->lct_init != NULL); + LINVRNT(key->lct_index == i); + + LASSERT(key->lct_owner != NULL); + if (!(ctx->lc_tags & LCT_NOREF) && + try_module_get(key->lct_owner) == 0) { + /* module is unloading, skip this key */ + continue; + } + + value = key->lct_init(ctx, key); + if (unlikely(IS_ERR(value))) { + rc = PTR_ERR(value); + break; + } + + lu_ref_add_atomic(&key->lct_reference, "ctx", ctx); + atomic_inc(&key->lct_used); + /* + * This is the only place in the code, where an + * element of ctx->lc_value[] array is set to non-NULL + * value. + */ + ctx->lc_value[i] = value; + if (key->lct_exit != NULL) + ctx->lc_tags |= LCT_HAS_EXIT; + } + } + + up_read(&lu_key_initing); + return rc; +} + +static int keys_init(struct lu_context *ctx) +{ + OBD_ALLOC_PTR_ARRAY(ctx->lc_value, ARRAY_SIZE(lu_keys)); + if (likely(ctx->lc_value != NULL)) + return keys_fill(ctx); + + return -ENOMEM; +} + +/** + * Initialize context data-structure. Create values for all keys. + */ +int lu_context_init(struct lu_context *ctx, __u32 tags) +{ + int rc; + + memset(ctx, 0, sizeof *ctx); + ctx->lc_state = LCS_INITIALIZED; + ctx->lc_tags = tags; + if (tags & LCT_REMEMBER) { + spin_lock(&lu_context_remembered_guard); + list_add(&ctx->lc_remember, &lu_context_remembered); + spin_unlock(&lu_context_remembered_guard); + } else { + INIT_LIST_HEAD(&ctx->lc_remember); + } + + rc = keys_init(ctx); + if (rc != 0) + lu_context_fini(ctx); + + return rc; +} +EXPORT_SYMBOL(lu_context_init); + +/** + * Finalize context data-structure. Destroy key values. + */ +void lu_context_fini(struct lu_context *ctx) +{ + LINVRNT(ctx->lc_state == LCS_INITIALIZED || ctx->lc_state == LCS_LEFT); + ctx->lc_state = LCS_FINALIZED; + + if ((ctx->lc_tags & LCT_REMEMBER) == 0) { + LASSERT(list_empty(&ctx->lc_remember)); + } else { + /* could race with key degister */ + spin_lock(&lu_context_remembered_guard); + list_del_init(&ctx->lc_remember); + spin_unlock(&lu_context_remembered_guard); + } + keys_fini(ctx); +} +EXPORT_SYMBOL(lu_context_fini); + +/** + * Called before entering context. + */ +void lu_context_enter(struct lu_context *ctx) +{ + LINVRNT(ctx->lc_state == LCS_INITIALIZED || ctx->lc_state == LCS_LEFT); + ctx->lc_state = LCS_ENTERED; +} +EXPORT_SYMBOL(lu_context_enter); + +/** + * Called after exiting from \a ctx + */ +void lu_context_exit(struct lu_context *ctx) +{ + unsigned int i; + + LINVRNT(ctx->lc_state == LCS_ENTERED); + /* + * Disable preempt to ensure we get a warning if + * any lct_exit ever tries to sleep. That would hurt + * lu_context_key_quiesce() which spins waiting for us. + * This also ensure we aren't preempted while the state + * is LCS_LEAVING, as that too would cause problems for + * lu_context_key_quiesce(). + */ + preempt_disable(); + /* + * Ensure lu_context_key_quiesce() sees LCS_LEAVING + * or we see LCT_QUIESCENT + */ + smp_store_mb(ctx->lc_state, LCS_LEAVING); + if (ctx->lc_tags & LCT_HAS_EXIT && ctx->lc_value) { + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { + struct lu_context_key *key; + + key = lu_keys[i]; + if (ctx->lc_value[i] && + !(key->lct_tags & LCT_QUIESCENT) && + key->lct_exit) + key->lct_exit(ctx, key, ctx->lc_value[i]); + } + } + + smp_store_release(&ctx->lc_state, LCS_LEFT); + preempt_enable(); +} +EXPORT_SYMBOL(lu_context_exit); + +/** + * Allocate for context all missing keys that were registered after context + * creation. key_set_version is only changed in rare cases when modules + * are loaded and removed. 
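+ *
+ * The common case is therefore a single atomic read and compare.
+ * Editorial usage sketch (not part of the original patch;
+ * foo_thread_key is the placeholder key from the sketch above):
+ *
+ *   rc = lu_env_refill(env);
+ *   if (rc == 0)
+ *           info = lu_context_key_get(&env->le_ctx, &foo_thread_key);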
+ */
+int lu_context_refill(struct lu_context *ctx)
+{
+ if (likely(ctx->lc_version == atomic_read(&key_set_version)))
+ return 0;
+
+ return keys_fill(ctx);
+}
+
+/**
+ * lu_ctx_tags/lu_ses_tags are updated when new types of obd are added.
+ * Currently this is only used on the client side, specifically by the
+ * echo device client; for other stacks (like ptlrpc threads) the
+ * context tags are predefined when the lu_device type is registered,
+ * during the module probe phase.
+ */
+u32 lu_context_tags_default = LCT_CL_THREAD;
+u32 lu_session_tags_default = LCT_SESSION;
+
+void lu_context_tags_update(__u32 tags)
+{
+ spin_lock(&lu_context_remembered_guard);
+ lu_context_tags_default |= tags;
+ atomic_inc(&key_set_version);
+ spin_unlock(&lu_context_remembered_guard);
+}
+EXPORT_SYMBOL(lu_context_tags_update);
+
+void lu_context_tags_clear(__u32 tags)
+{
+ spin_lock(&lu_context_remembered_guard);
+ lu_context_tags_default &= ~tags;
+ atomic_inc(&key_set_version);
+ spin_unlock(&lu_context_remembered_guard);
+}
+EXPORT_SYMBOL(lu_context_tags_clear);
+
+void lu_session_tags_update(__u32 tags)
+{
+ spin_lock(&lu_context_remembered_guard);
+ lu_session_tags_default |= tags;
+ atomic_inc(&key_set_version);
+ spin_unlock(&lu_context_remembered_guard);
+}
+EXPORT_SYMBOL(lu_session_tags_update);
+
+void lu_session_tags_clear(__u32 tags)
+{
+ spin_lock(&lu_context_remembered_guard);
+ lu_session_tags_default &= ~tags;
+ atomic_inc(&key_set_version);
+ spin_unlock(&lu_context_remembered_guard);
+}
+EXPORT_SYMBOL(lu_session_tags_clear);
+
+int lu_env_init(struct lu_env *env, __u32 tags)
+{
+ int result;
+
+ env->le_ses = NULL;
+ result = lu_context_init(&env->le_ctx, tags);
+ if (likely(result == 0))
+ lu_context_enter(&env->le_ctx);
+ return result;
+}
+EXPORT_SYMBOL(lu_env_init);
+
+void lu_env_fini(struct lu_env *env)
+{
+ lu_context_exit(&env->le_ctx);
+ lu_context_fini(&env->le_ctx);
+ env->le_ses = NULL;
+}
+EXPORT_SYMBOL(lu_env_fini);
+
+int lu_env_refill(struct lu_env *env)
+{
+ int result;
+
+ result = lu_context_refill(&env->le_ctx);
+ if (result == 0 && env->le_ses != NULL)
+ result = lu_context_refill(env->le_ses);
+ return result;
+}
+EXPORT_SYMBOL(lu_env_refill);
+
+/**
+ * Currently this API is only used by the echo client. Because the
+ * echo client and the normal Lustre client share the same cl_env
+ * cache, the echo client needs to refresh the env context after it
+ * gets one from the cache, especially when a normal client and an
+ * echo client co-exist on the same client node.
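+ *
+ * Editorial usage sketch (not part of the original patch; the tag
+ * values are illustrative):
+ *
+ *   rc = lu_env_refill_by_tags(env, LCT_CL_THREAD, LCT_SESSION);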
+ */ +int lu_env_refill_by_tags(struct lu_env *env, __u32 ctags, + __u32 stags) +{ + int result; + + if ((env->le_ctx.lc_tags & ctags) != ctags) { + env->le_ctx.lc_version = 0; + env->le_ctx.lc_tags |= ctags; + } + + if (env->le_ses && (env->le_ses->lc_tags & stags) != stags) { + env->le_ses->lc_version = 0; + env->le_ses->lc_tags |= stags; + } + + result = lu_env_refill(env); + + return result; +} +EXPORT_SYMBOL(lu_env_refill_by_tags); + + +struct lu_env_item { + struct task_struct *lei_task; /* rhashtable key */ + struct rhash_head lei_linkage; + struct lu_env *lei_env; + struct rcu_head lei_rcu_head; +}; + +static const struct rhashtable_params lu_env_rhash_params = { + .key_len = sizeof(struct task_struct *), + .key_offset = offsetof(struct lu_env_item, lei_task), + .head_offset = offsetof(struct lu_env_item, lei_linkage), + }; + +struct rhashtable lu_env_rhash; + +struct lu_env_percpu { + struct task_struct *lep_task; + struct lu_env *lep_env ____cacheline_aligned_in_smp; +}; + +static struct lu_env_percpu lu_env_percpu[NR_CPUS]; + +int lu_env_add_task(struct lu_env *env, struct task_struct *task) +{ + struct lu_env_item *lei, *old; + + LASSERT(env); + + OBD_ALLOC_PTR(lei); + if (!lei) + return -ENOMEM; + + lei->lei_task = task; + lei->lei_env = env; + + old = rhashtable_lookup_get_insert_fast(&lu_env_rhash, + &lei->lei_linkage, + lu_env_rhash_params); + LASSERT(!old); + + return 0; +} +EXPORT_SYMBOL(lu_env_add_task); + +int lu_env_add(struct lu_env *env) +{ + return lu_env_add_task(env, current); +} +EXPORT_SYMBOL(lu_env_add); + +static void lu_env_item_free(struct rcu_head *head) +{ + struct lu_env_item *lei; + + lei = container_of(head, struct lu_env_item, lei_rcu_head); + OBD_FREE_PTR(lei); +} + +void lu_env_remove(struct lu_env *env) +{ + struct lu_env_item *lei; + const void *task = current; + int i; + + for_each_possible_cpu(i) { + if (lu_env_percpu[i].lep_env == env) { + LASSERT(lu_env_percpu[i].lep_task == task); + lu_env_percpu[i].lep_task = NULL; + lu_env_percpu[i].lep_env = NULL; + } + } + + /* The rcu_lock is not taking in this case since the key + * used is the actual task_struct. This implies that each + * object is only removed by the owning thread, so there + * can never be a race on a particular object. 
+ */ + lei = rhashtable_lookup_fast(&lu_env_rhash, &task, + lu_env_rhash_params); + if (lei && rhashtable_remove_fast(&lu_env_rhash, &lei->lei_linkage, + lu_env_rhash_params) == 0) + call_rcu(&lei->lei_rcu_head, lu_env_item_free); +} +EXPORT_SYMBOL(lu_env_remove); + +struct lu_env *lu_env_find(void) +{ + struct lu_env *env = NULL; + struct lu_env_item *lei; + const void *task = current; + int i = get_cpu(); + + if (lu_env_percpu[i].lep_task == current) { + env = lu_env_percpu[i].lep_env; + put_cpu(); + LASSERT(env); + return env; + } + + lei = rhashtable_lookup_fast(&lu_env_rhash, &task, + lu_env_rhash_params); + if (lei) { + env = lei->lei_env; + lu_env_percpu[i].lep_task = current; + lu_env_percpu[i].lep_env = env; + } + put_cpu(); + + return env; +} +EXPORT_SYMBOL(lu_env_find); + +typedef struct lu_site_stats{ + unsigned lss_populated; + unsigned lss_max_search; + unsigned lss_total; + unsigned lss_busy; +} lu_site_stats_t; + +static void lu_site_stats_get(const struct lu_site *s, + lu_site_stats_t *stats) +{ + int cnt = atomic_read(&s->ls_obj_hash.nelems); + /* + * percpu_counter_sum_positive() won't accept a const pointer + * as it does modify the struct by taking a spinlock + */ + struct lu_site *s2 = (struct lu_site *)s; + + stats->lss_busy += cnt - + percpu_counter_sum_positive(&s2->ls_lru_len_counter); + + stats->lss_total += cnt; + stats->lss_max_search = 0; + stats->lss_populated = 0; +} + + +/* + * lu_cache_shrink_count() returns an approximate number of cached objects + * that can be freed by shrink_slab(). A counter, which tracks the + * number of items in the site's lru, is maintained in a percpu_counter + * for each site. The percpu values are incremented and decremented as + * objects are added or removed from the lru. The percpu values are summed + * and saved whenever a percpu value exceeds a threshold. Thus the saved, + * summed value at any given time may not accurately reflect the current + * lru length. But this value is sufficiently accurate for the needs of + * a shrinker. + * + * Using a per cpu counter is a compromise solution to concurrent access: + * lu_object_put() can update the counter without locking the site and + * lu_cache_shrink_count can sum the counters without locking each + * ls_obj_hash bucket. + */ +static unsigned long lu_cache_shrink_count(struct shrinker *sk, + struct shrink_control *sc) +{ + struct lu_site *s; + struct lu_site *tmp; + unsigned long cached = 0; + + if (!(sc->gfp_mask & __GFP_FS)) + return 0; + + down_read(&lu_sites_guard); + list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) + cached += percpu_counter_read_positive(&s->ls_lru_len_counter); + up_read(&lu_sites_guard); + + cached = (cached / 100) * sysctl_vfs_cache_pressure; + CDEBUG(D_INODE, "%ld objects cached, cache pressure %d\n", + cached, sysctl_vfs_cache_pressure); + + return cached; +} + +static unsigned long lu_cache_shrink_scan(struct shrinker *sk, + struct shrink_control *sc) +{ + struct lu_site *s; + struct lu_site *tmp; + unsigned long remain = sc->nr_to_scan; + LIST_HEAD(splice); + + if (!(sc->gfp_mask & __GFP_FS)) + /* We must not take the lu_sites_guard lock when + * __GFP_FS is *not* set because of the deadlock + * possibility detailed above. Additionally, + * since we cannot determine the number of + * objects in the cache without taking this + * lock, we're in a particularly tough spot. As + * a result, we'll just lie and say our cache is + * empty. This _should_ be ok, as we can't + * reclaim objects when __GFP_FS is *not* set + * anyways. 
+ */ + return SHRINK_STOP; + + down_write(&lu_sites_guard); + list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) { + remain = lu_site_purge(&lu_shrink_env, s, remain); + /* + * Move just shrunk site to the tail of site list to + * assure shrinking fairness. + */ + list_move_tail(&s->ls_linkage, &splice); + } + list_splice(&splice, lu_sites.prev); + up_write(&lu_sites_guard); + + return sc->nr_to_scan - remain; +} + +#ifdef HAVE_SHRINKER_COUNT +static struct shrinker lu_site_shrinker = { + .count_objects = lu_cache_shrink_count, + .scan_objects = lu_cache_shrink_scan, + .seeks = DEFAULT_SEEKS, +}; + +#else +/* + * There exists a potential lock inversion deadlock scenario when using + * Lustre on top of ZFS. This occurs between one of ZFS's + * buf_hash_table.ht_lock's, and Lustre's lu_sites_guard lock. Essentially, + * thread A will take the lu_sites_guard lock and sleep on the ht_lock, + * while thread B will take the ht_lock and sleep on the lu_sites_guard + * lock. Obviously neither thread will wake and drop their respective hold + * on their lock. + * + * To prevent this from happening we must ensure the lu_sites_guard lock is + * not taken while down this code path. ZFS reliably does not set the + * __GFP_FS bit in its code paths, so this can be used to determine if it + * is safe to take the lu_sites_guard lock. + * + * Ideally we should accurately return the remaining number of cached + * objects without taking the lu_sites_guard lock, but this is not + * possible in the current implementation. + */ +static int lu_cache_shrink(struct shrinker *shrinker, + struct shrink_control *sc) +{ + int cached = 0; + + CDEBUG(D_INODE, "Shrink %lu objects\n", sc->nr_to_scan); + + if (sc->nr_to_scan != 0) + lu_cache_shrink_scan(shrinker, sc); + + cached = lu_cache_shrink_count(shrinker, sc); + return cached; +} + +static struct shrinker lu_site_shrinker = { + .shrink = lu_cache_shrink, + .seeks = DEFAULT_SEEKS, +}; + +#endif /* HAVE_SHRINKER_COUNT */ + + +/* + * Debugging stuff. + */ + +/** + * Environment to be used in debugger, contains all tags. + */ +static struct lu_env lu_debugging_env; + +/** + * Debugging printer function using printk(). + */ +int lu_printk_printer(const struct lu_env *env, + void *unused, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vprintk(format, args); + va_end(args); + return 0; +} + +int lu_debugging_setup(void) +{ + return lu_env_init(&lu_debugging_env, ~0); +} + +void lu_context_keys_dump(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { + struct lu_context_key *key; + + key = lu_keys[i]; + if (key != NULL) { + CERROR("LU context keys [%d]: %p %x (%p,%p,%p) %d %d \"%s\"@%p\n", + i, key, key->lct_tags, + key->lct_init, key->lct_fini, key->lct_exit, + key->lct_index, atomic_read(&key->lct_used), + key->lct_owner ? key->lct_owner->name : "", + key->lct_owner); + lu_ref_print(&key->lct_reference); + } + } +} + +/** + * Initialization of global lu_* data. + */ +int lu_global_init(void) +{ + int result; + + CDEBUG(D_INFO, "Lustre LU module (%p).\n", &lu_keys); + + result = lu_ref_global_init(); + if (result != 0) + return result; + + LU_CONTEXT_KEY_INIT(&lu_global_key); + result = lu_context_key_register(&lu_global_key); + if (result) + goto out_lu_ref; + + /* + * At this level, we don't know what tags are needed, so allocate them + * conservatively. This should not be too bad, because this + * environment is global. 
+ */ + down_write(&lu_sites_guard); + result = lu_env_init(&lu_shrink_env, LCT_SHRINKER); + up_write(&lu_sites_guard); + if (result) { + lu_context_key_degister(&lu_global_key); + goto out_lu_ref; + } + + /* + * seeks estimation: 3 seeks to read a record from oi, one to read + * inode, one for ea. Unfortunately setting this high value results in + * lu_object/inode cache consuming all the memory. + */ + result = register_shrinker(&lu_site_shrinker); + if (result) + goto out_env; + + result = rhashtable_init(&lu_env_rhash, &lu_env_rhash_params); + + if (result) + goto out_shrinker; + + return result; + +out_shrinker: + unregister_shrinker(&lu_site_shrinker); +out_env: + /* ordering here is explained in lu_global_fini() */ + lu_context_key_degister(&lu_global_key); + down_write(&lu_sites_guard); + lu_env_fini(&lu_shrink_env); + up_write(&lu_sites_guard); +out_lu_ref: + lu_ref_global_fini(); + return result; +} + +/** + * Dual to lu_global_init(). + */ +void lu_global_fini(void) +{ + unregister_shrinker(&lu_site_shrinker); + + lu_context_key_degister(&lu_global_key); + + /* + * Tear shrinker environment down _after_ de-registering + * lu_global_key, because the latter has a value in the former. + */ + down_write(&lu_sites_guard); + lu_env_fini(&lu_shrink_env); + up_write(&lu_sites_guard); + + rhashtable_destroy(&lu_env_rhash); + + lu_ref_global_fini(); +} + +static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx) +{ +#ifdef CONFIG_PROC_FS + struct lprocfs_counter ret; + + lprocfs_stats_collect(stats, idx, &ret); + return (__u32)ret.lc_count; +#else + return 0; +#endif +} + +/** + * Output site statistical counters into a buffer. Suitable for + * lprocfs_rd_*()-style functions. + */ +int lu_site_stats_seq_print(const struct lu_site *s, struct seq_file *m) +{ + const struct bucket_table *tbl; + lu_site_stats_t stats; + unsigned int chains; + + memset(&stats, 0, sizeof(stats)); + lu_site_stats_get(s, &stats); + + rcu_read_lock(); + tbl = rht_dereference_rcu(s->ls_obj_hash.tbl, + &((struct lu_site *)s)->ls_obj_hash); + chains = tbl->size; + rcu_read_unlock(); + seq_printf(m, "%d/%d %d/%u %d %d %d %d %d %d %d\n", + stats.lss_busy, + stats.lss_total, + stats.lss_populated, + chains, + stats.lss_max_search, + ls_stats_read(s->ls_stats, LU_SS_CREATED), + ls_stats_read(s->ls_stats, LU_SS_CACHE_HIT), + ls_stats_read(s->ls_stats, LU_SS_CACHE_MISS), + ls_stats_read(s->ls_stats, LU_SS_CACHE_RACE), + ls_stats_read(s->ls_stats, LU_SS_CACHE_DEATH_RACE), + ls_stats_read(s->ls_stats, LU_SS_LRU_PURGED)); + return 0; +} +EXPORT_SYMBOL(lu_site_stats_seq_print); + +/** + * Helper function to initialize a number of kmem slab caches at once. + */ +int lu_kmem_init(struct lu_kmem_descr *caches) +{ + int result; + struct lu_kmem_descr *iter = caches; + + for (result = 0; iter->ckd_cache != NULL; ++iter) { + *iter->ckd_cache = kmem_cache_create(iter->ckd_name, + iter->ckd_size, + 0, 0, NULL); + if (*iter->ckd_cache == NULL) { + result = -ENOMEM; + /* free all previously allocated caches */ + lu_kmem_fini(caches); + break; + } + } + return result; +} +EXPORT_SYMBOL(lu_kmem_init); + +/** + * Helper function to finalize a number of kmem slab cached at once. Dual to + * lu_kmem_init(). 
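+ *
+ * Both helpers walk a NULL-terminated descriptor array. Editorial
+ * sketch of the usual pattern, mirroring lu_ref_caches[] later in this
+ * patch (not part of the original patch; "foo" names are placeholders):
+ *
+ *   static struct lu_kmem_descr foo_caches[] = {
+ *           {
+ *                   .ckd_cache = &foo_kmem,
+ *                   .ckd_name = "foo_kmem",
+ *                   .ckd_size = sizeof(struct foo)
+ *           },
+ *           {
+ *                   .ckd_cache = NULL
+ *           }
+ *   };
+ *
+ *   rc = lu_kmem_init(foo_caches); /* undone by lu_kmem_fini() */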
+ */
+void lu_kmem_fini(struct lu_kmem_descr *caches)
+{
+ for (; caches->ckd_cache != NULL; ++caches) {
+ if (*caches->ckd_cache != NULL) {
+ kmem_cache_destroy(*caches->ckd_cache);
+ *caches->ckd_cache = NULL;
+ }
+ }
+}
+EXPORT_SYMBOL(lu_kmem_fini);
+
+/**
+ * Temporary solution to be able to assign fid in ->do_create()
+ * until we have fully functional OST fids.
+ */
+void lu_object_assign_fid(const struct lu_env *env, struct lu_object *o,
+ const struct lu_fid *fid)
+{
+ struct lu_site *s = o->lo_dev->ld_site;
+ struct lu_fid *old = &o->lo_header->loh_fid;
+ int rc;
+
+ LASSERT(fid_is_zero(old));
+ *old = *fid;
+try_again:
+ rc = rhashtable_lookup_insert_fast(&s->ls_obj_hash,
+ &o->lo_header->loh_hash,
+ obj_hash_params);
+ /* supposed to be unique */
+ LASSERT(rc != -EEXIST);
+ /* handle hash table resizing */
+ if (rc == -ENOMEM || rc == -EBUSY) {
+ msleep(20);
+ goto try_again;
+ }
+ /* trim the hash if it is growing too big */
+ lu_object_limit(env, o->lo_dev);
+ if (rc == -E2BIG)
+ goto try_again;
+
+ LASSERTF(rc == 0, "failed hashtable insertion: rc = %d\n", rc);
+}
+EXPORT_SYMBOL(lu_object_assign_fid);
+
+/**
+ * Allocates an object with a zero (not yet assigned) fid.
+ * XXX: temporary solution to be able to assign fid in ->do_create()
+ * until we have fully functional OST fids.
+ */
+struct lu_object *lu_object_anon(const struct lu_env *env,
+ struct lu_device *dev,
+ const struct lu_object_conf *conf)
+{
+ struct lu_fid fid;
+ struct lu_object *o;
+ int rc;
+
+ fid_zero(&fid);
+ o = lu_object_alloc(env, dev, &fid);
+ if (!IS_ERR(o)) {
+ rc = lu_object_start(env, dev, o, conf);
+ if (rc) {
+ lu_object_free(env, o);
+ return ERR_PTR(rc);
+ }
+ }
+
+ return o;
+}
+EXPORT_SYMBOL(lu_object_anon);
+
+struct lu_buf LU_BUF_NULL = {
+ .lb_buf = NULL,
+ .lb_len = 0
+};
+EXPORT_SYMBOL(LU_BUF_NULL);
+
+void lu_buf_free(struct lu_buf *buf)
+{
+ LASSERT(buf);
+ if (buf->lb_buf) {
+ LASSERT(buf->lb_len > 0);
+ OBD_FREE_LARGE(buf->lb_buf, buf->lb_len);
+ buf->lb_buf = NULL;
+ buf->lb_len = 0;
+ }
+}
+EXPORT_SYMBOL(lu_buf_free);
+
+void lu_buf_alloc(struct lu_buf *buf, size_t size)
+{
+ LASSERT(buf);
+ LASSERT(buf->lb_buf == NULL);
+ LASSERT(buf->lb_len == 0);
+ OBD_ALLOC_LARGE(buf->lb_buf, size);
+ if (likely(buf->lb_buf))
+ buf->lb_len = size;
+}
+EXPORT_SYMBOL(lu_buf_alloc);
+
+void lu_buf_realloc(struct lu_buf *buf, size_t size)
+{
+ lu_buf_free(buf);
+ lu_buf_alloc(buf, size);
+}
+EXPORT_SYMBOL(lu_buf_realloc);
+
+struct lu_buf *lu_buf_check_and_alloc(struct lu_buf *buf, size_t len)
+{
+ if (buf->lb_buf == NULL && buf->lb_len == 0)
+ lu_buf_alloc(buf, len);
+
+ if ((len > buf->lb_len) && (buf->lb_buf != NULL))
+ lu_buf_realloc(buf, len);
+
+ return buf;
+}
+EXPORT_SYMBOL(lu_buf_check_and_alloc);
+
+/**
+ * Increase the size of the \a buf.
+ * preserves old data in buffer + * old buffer remains unchanged on error + * \retval 0 or -ENOMEM + */ +int lu_buf_check_and_grow(struct lu_buf *buf, size_t len) +{ + char *ptr; + + if (len <= buf->lb_len) + return 0; + + OBD_ALLOC_LARGE(ptr, len); + if (ptr == NULL) + return -ENOMEM; + + /* Free the old buf */ + if (buf->lb_buf != NULL) { + memcpy(ptr, buf->lb_buf, buf->lb_len); + OBD_FREE_LARGE(buf->lb_buf, buf->lb_len); + } + + buf->lb_buf = ptr; + buf->lb_len = len; + return 0; +} +EXPORT_SYMBOL(lu_buf_check_and_grow); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lu_ref.c b/drivers/staging/lustrefsx/lustre/obdclass/lu_ref.c new file mode 100644 index 0000000000000..bcc59fb3fc6c7 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lu_ref.c @@ -0,0 +1,437 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/lu_ref.c + * + * Lustre reference. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include + +#ifdef CONFIG_LUSTRE_DEBUG_LU_REF +/** + * Asserts a condition for a given lu_ref. Must be called with + * lu_ref::lf_guard held. + */ +#define REFASSERT(ref, expr) do { \ + struct lu_ref *__tmp = (ref); \ + \ + if (unlikely(!(expr))) { \ + lu_ref_print(__tmp); \ + spin_unlock(&__tmp->lf_guard); \ + lu_ref_print_all(); \ + LASSERT(0); \ + spin_lock(&__tmp->lf_guard); \ + } \ +} while (0) + +static struct kmem_cache *lu_ref_link_kmem; + +static struct lu_kmem_descr lu_ref_caches[] = { + { + .ckd_cache = &lu_ref_link_kmem, + .ckd_name = "lu_ref_link_kmem", + .ckd_size = sizeof(struct lu_ref_link) + }, + { + .ckd_cache = NULL + } +}; + +/** + * Global list of active (initialized, but not finalized) lu_ref's. + * + * Protected by lu_ref_refs_guard. 
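+ *
+ * Editorial note: the lu_ref_marker entry defined below is also
+ * threaded onto this list; the debugfs seq_file iterator moves it
+ * along the list so that a walk can resume where it left off.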
+ */ +static LIST_HEAD(lu_ref_refs); +static DEFINE_SPINLOCK(lu_ref_refs_guard); +static struct lu_ref lu_ref_marker = { + .lf_guard = __SPIN_LOCK_UNLOCKED(lu_ref_marker.lf_guard), + .lf_list = LIST_HEAD_INIT(lu_ref_marker.lf_list), + .lf_linkage = LIST_HEAD_INIT(lu_ref_marker.lf_linkage) +}; + +void lu_ref_print(const struct lu_ref *ref) +{ + struct lu_ref_link *link; + + CERROR("lu_ref: %p %d %d %s:%d\n", + ref, ref->lf_refs, ref->lf_failed, ref->lf_func, ref->lf_line); + list_for_each_entry(link, &ref->lf_list, ll_linkage) { + CERROR(" link: %s %p\n", link->ll_scope, link->ll_source); + } +} + +static int lu_ref_is_marker(const struct lu_ref *ref) +{ + return ref == &lu_ref_marker; +} + +void lu_ref_print_all(void) +{ + struct lu_ref *ref; + + spin_lock(&lu_ref_refs_guard); + list_for_each_entry(ref, &lu_ref_refs, lf_linkage) { + if (lu_ref_is_marker(ref)) + continue; + + spin_lock(&ref->lf_guard); + lu_ref_print(ref); + spin_unlock(&ref->lf_guard); + } + spin_unlock(&lu_ref_refs_guard); +} + +void lu_ref_init_loc(struct lu_ref *ref, const char *func, const int line) +{ + ref->lf_refs = 0; + ref->lf_func = func; + ref->lf_line = line; + spin_lock_init(&ref->lf_guard); + INIT_LIST_HEAD(&ref->lf_list); + spin_lock(&lu_ref_refs_guard); + list_add(&ref->lf_linkage, &lu_ref_refs); + spin_unlock(&lu_ref_refs_guard); +} +EXPORT_SYMBOL(lu_ref_init_loc); + +void lu_ref_fini(struct lu_ref *ref) +{ + spin_lock(&ref->lf_guard); + REFASSERT(ref, list_empty(&ref->lf_list)); + REFASSERT(ref, ref->lf_refs == 0); + spin_unlock(&ref->lf_guard); + spin_lock(&lu_ref_refs_guard); + list_del_init(&ref->lf_linkage); + spin_unlock(&lu_ref_refs_guard); +} +EXPORT_SYMBOL(lu_ref_fini); + +static struct lu_ref_link *lu_ref_add_context(struct lu_ref *ref, + int flags, + const char *scope, + const void *source) +{ + struct lu_ref_link *link; + + link = NULL; + if (lu_ref_link_kmem) { + OBD_SLAB_ALLOC_PTR_GFP(link, lu_ref_link_kmem, flags); + if (link) { + link->ll_ref = ref; + link->ll_scope = scope; + link->ll_source = source; + spin_lock(&ref->lf_guard); + list_add_tail(&link->ll_linkage, &ref->lf_list); + ref->lf_refs++; + spin_unlock(&ref->lf_guard); + } + } + + if (!link) { + spin_lock(&ref->lf_guard); + ref->lf_failed++; + spin_unlock(&ref->lf_guard); + link = ERR_PTR(-ENOMEM); + } + + return link; +} + +void lu_ref_add(struct lu_ref *ref, const char *scope, const void *source) +{ + might_sleep(); + lu_ref_add_context(ref, GFP_NOFS, scope, source); +} +EXPORT_SYMBOL(lu_ref_add); + +void lu_ref_add_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source) +{ + link->ll_ref = ref; + link->ll_scope = scope; + link->ll_source = source; + spin_lock(&ref->lf_guard); + list_add_tail(&link->ll_linkage, &ref->lf_list); + ref->lf_refs++; + spin_unlock(&ref->lf_guard); +} +EXPORT_SYMBOL(lu_ref_add_at); + +/** + * Version of lu_ref_add() to be used in non-blockable contexts. + */ +void lu_ref_add_atomic(struct lu_ref *ref, const char *scope, + const void *source) +{ + lu_ref_add_context(ref, GFP_ATOMIC, scope, source); +} +EXPORT_SYMBOL(lu_ref_add_atomic); + +static inline int lu_ref_link_eq(const struct lu_ref_link *link, + const char *scope, + const void *source) +{ + return link->ll_source == source && !strcmp(link->ll_scope, scope); +} + +/** + * Maximal chain length seen so far. + */ +static unsigned int lu_ref_chain_max_length = 127; + +/** + * Searches for a lu_ref_link with given [scope, source] within given lu_ref. 
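+ *
+ * The (scope, source) pair must match what was passed to lu_ref_add();
+ * editorial sketch of the usual pairing (not part of the original
+ * patch; the scope string and source pointer are placeholders):
+ *
+ *   lu_ref_add(&obj->ref, "handle", file);
+ *   ...
+ *   lu_ref_del(&obj->ref, "handle", file); /* looks up via lu_ref_find() */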
+ */ +static struct lu_ref_link *lu_ref_find(struct lu_ref *ref, const char *scope, + const void *source) +{ + struct lu_ref_link *link; + unsigned int iterations; + + iterations = 0; + list_for_each_entry(link, &ref->lf_list, ll_linkage) { + ++iterations; + if (lu_ref_link_eq(link, scope, source)) { + if (iterations > lu_ref_chain_max_length) { + CWARN("Long lu_ref chain %d \"%s\":%p\n", + iterations, scope, source); + lu_ref_chain_max_length = iterations * 3 / 2; + } + return link; + } + } + return NULL; +} + +void lu_ref_del(struct lu_ref *ref, const char *scope, const void *source) +{ + struct lu_ref_link *link; + + spin_lock(&ref->lf_guard); + link = lu_ref_find(ref, scope, source); + if (link) { + list_del(&link->ll_linkage); + ref->lf_refs--; + spin_unlock(&ref->lf_guard); + OBD_SLAB_FREE(link, lu_ref_link_kmem, sizeof(*link)); + } else { + REFASSERT(ref, ref->lf_failed > 0); + ref->lf_failed--; + spin_unlock(&ref->lf_guard); + } +} +EXPORT_SYMBOL(lu_ref_del); + +void lu_ref_set_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, + const void *source0, const void *source1) +{ + spin_lock(&ref->lf_guard); + REFASSERT(ref, !IS_ERR_OR_NULL(link)); + REFASSERT(ref, link->ll_ref == ref); + REFASSERT(ref, lu_ref_link_eq(link, scope, source0)); + link->ll_source = source1; + spin_unlock(&ref->lf_guard); +} +EXPORT_SYMBOL(lu_ref_set_at); + +void lu_ref_del_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source) +{ + spin_lock(&ref->lf_guard); + REFASSERT(ref, !IS_ERR_OR_NULL(link)); + REFASSERT(ref, link->ll_ref == ref); + REFASSERT(ref, lu_ref_link_eq(link, scope, source)); + list_del(&link->ll_linkage); + ref->lf_refs--; + spin_unlock(&ref->lf_guard); +} +EXPORT_SYMBOL(lu_ref_del_at); + +static void *lu_ref_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct lu_ref *ref = seq->private; + + spin_lock(&lu_ref_refs_guard); + if (list_empty(&ref->lf_linkage)) + ref = NULL; + spin_unlock(&lu_ref_refs_guard); + + return ref; +} + +static void *lu_ref_seq_next(struct seq_file *seq, void *p, loff_t *pos) +{ + struct lu_ref *ref = p; + struct lu_ref *next; + + LASSERT(seq->private == p); + LASSERT(!list_empty(&ref->lf_linkage)); + + (*pos)++; + spin_lock(&lu_ref_refs_guard); + next = list_entry(ref->lf_linkage.next, struct lu_ref, lf_linkage); + if (&next->lf_linkage == &lu_ref_refs) + p = NULL; + else + list_move(&ref->lf_linkage, &next->lf_linkage); + spin_unlock(&lu_ref_refs_guard); + + return p; +} + +static void lu_ref_seq_stop(struct seq_file *seq, void *p) +{ + /* Nothing to do */ +} + + +static int lu_ref_seq_show(struct seq_file *seq, void *p) +{ + struct lu_ref *ref = p; + struct lu_ref *next; + + spin_lock(&lu_ref_refs_guard); + next = list_entry(ref->lf_linkage.next, struct lu_ref, lf_linkage); + if ((&next->lf_linkage == &lu_ref_refs) || lu_ref_is_marker(next)) { + spin_unlock(&lu_ref_refs_guard); + return 0; + } + + /* print the entry */ + spin_lock(&next->lf_guard); + seq_printf(seq, "lu_ref: %p %d %d %s:%d\n", + next, next->lf_refs, next->lf_failed, + next->lf_func, next->lf_line); + if (next->lf_refs > 64) { + seq_puts(seq, " too many references, skip\n"); + } else { + struct lu_ref_link *link; + int i = 0; + + list_for_each_entry(link, &next->lf_list, ll_linkage) + seq_printf(seq, " #%d link: %s %p\n", + i++, link->ll_scope, link->ll_source); + } + spin_unlock(&next->lf_guard); + spin_unlock(&lu_ref_refs_guard); + + return 0; +} + +static const struct seq_operations lu_ref_seq_ops = { + .start = lu_ref_seq_start, + .stop = 
lu_ref_seq_stop, + .next = lu_ref_seq_next, + .show = lu_ref_seq_show +}; + +static int lu_ref_seq_open(struct inode *inode, struct file *file) +{ + struct lu_ref *marker = &lu_ref_marker; + int result = 0; + + result = seq_open(file, &lu_ref_seq_ops); + if (result == 0) { + spin_lock(&lu_ref_refs_guard); + if (!list_empty(&marker->lf_linkage)) + result = -EAGAIN; + else + list_add(&marker->lf_linkage, &lu_ref_refs); + spin_unlock(&lu_ref_refs_guard); + + if (result == 0) { + struct seq_file *f = file->private_data; + + f->private = marker; + } else { + seq_release(inode, file); + } + } + + return result; +} + +static int lu_ref_seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + struct lu_ref *ref = m->private; + + spin_lock(&lu_ref_refs_guard); + list_del_init(&ref->lf_linkage); + spin_unlock(&lu_ref_refs_guard); + + return seq_release(inode, file); +} + +static const struct file_operations lu_ref_dump_fops = { + .owner = THIS_MODULE, + .open = lu_ref_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = lu_ref_seq_release +}; + +int lu_ref_global_init(void) +{ + int result; + + CDEBUG(D_CONSOLE, + "lu_ref tracking is enabled. Performance isn't.\n"); + + result = lu_kmem_init(lu_ref_caches); + if (result) + return result; + + debugfs_create_file("lu_refs", 0444, debugfs_lustre_root, + NULL, &lu_ref_dump_fops); + + return result; +} + +void lu_ref_global_fini(void) +{ + /* debugfs file gets cleaned up by debugfs_remove_recursive on + * debugfs_lustre_root + */ + lu_kmem_fini(lu_ref_caches); +} + +#endif /* CONFIG_LUSTRE_DEBUG_LU_REF */ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lu_tgt_descs.c b/drivers/staging/lustrefsx/lustre/obdclass/lu_tgt_descs.c new file mode 100644 index 0000000000000..f070169218b62 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lu_tgt_descs.c @@ -0,0 +1,687 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/lu_tgt_descs.c + * + * Lustre target descriptions + * These are the only exported functions, they provide some generic + * infrastructure for target description management used by LOD/LMV + * + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include /* hash_long() */ +#include +#include +#include +#include +#include +#include + +/** + * lu_prandom_u64_max - returns a pseudo-random u64 number in interval + * [0, ep_ro) + * + * \param[in] ep_ro right open interval endpoint + * + * \retval a pseudo-random 64-bit number that is in interval [0, ep_ro). 
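+ *
+ * Editorial usage sketch (not part of the original patch; "count" is a
+ * placeholder):
+ *
+ *   idx = lu_prandom_u64_max(count); /* 0 <= idx < count, or 0 if count == 0 */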
+ */ +u64 lu_prandom_u64_max(u64 ep_ro) +{ + u64 rand = 0; + + if (ep_ro) { +#ifdef HAVE_GET_RANDOM_U32_AND_U64 + rand = get_random_u64() % ep_ro; +#elif BITS_PER_LONG == 32 + /* + * If ep_ro > 32-bit, first generate the high + * 32 bits of the random number, then add in the low + * 32 bits (truncated to the upper limit, if needed) + */ + if (ep_ro > 0xffffffffULL) + rand = (u64)get_random_u32_below((u32)(ep_ro >> 32)) << 32; + + if (rand == (ep_ro & 0xffffffff00000000ULL)) + rand |= get_random_u32_below((u32)ep_ro); + else + rand |= get_random_u32(); +#else + rand = ((u64)get_random_u32() << 32 | get_random_u32()) % ep_ro; +#endif + } + + return rand; +} +EXPORT_SYMBOL(lu_prandom_u64_max); + +/** + * Add a new target to Quality of Service (QoS) target table. + * + * Add a new MDT/OST target to the structure representing an OSS. Resort the + * list of known MDSs/OSSs by the number of MDTs/OSTs attached to each MDS/OSS. + * The MDS/OSS list is protected internally and no external locking is required. + * + * \param[in] qos lu_qos data + * \param[in] tgt target description + * + * \retval 0 on success + * \retval -ENOMEM on error + */ +int lu_qos_add_tgt(struct lu_qos *qos, struct lu_tgt_desc *tgt) +{ + struct lu_svr_qos *svr = NULL; + struct lu_svr_qos *tempsvr; + struct obd_export *exp = tgt->ltd_exp; + int found = 0; + __u32 id = 0; + int rc = 0; + + ENTRY; + + down_write(&qos->lq_rw_sem); + /* + * a bit hacky approach to learn NID of corresponding connection + * but there is no official API to access information like this + * with OSD API. + */ + list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) { + if (obd_uuid_equals(&svr->lsq_uuid, + &exp->exp_connection->c_remote_uuid)) { + found++; + break; + } + if (svr->lsq_id > id) + id = svr->lsq_id; + } + + if (!found) { + OBD_ALLOC_PTR(svr); + if (!svr) + GOTO(out, rc = -ENOMEM); + memcpy(&svr->lsq_uuid, &exp->exp_connection->c_remote_uuid, + sizeof(svr->lsq_uuid)); + ++id; + svr->lsq_id = id; + } else { + /* Assume we have to move this one */ + list_del(&svr->lsq_svr_list); + } + + svr->lsq_tgt_count++; + tgt->ltd_qos.ltq_svr = svr; + + CDEBUG(D_OTHER, "add tgt %s to server %s (%d targets)\n", + obd_uuid2str(&tgt->ltd_uuid), obd_uuid2str(&svr->lsq_uuid), + svr->lsq_tgt_count); + + /* + * Add sorted by # of tgts. Find the first entry that we're + * bigger than... + */ + list_for_each_entry(tempsvr, &qos->lq_svr_list, lsq_svr_list) { + if (svr->lsq_tgt_count > tempsvr->lsq_tgt_count) + break; + } + /* + * ...and add before it. If we're the first or smallest, tempsvr + * points to the list head, and we add to the end. + */ + list_add_tail(&svr->lsq_svr_list, &tempsvr->lsq_svr_list); + + set_bit(LQ_DIRTY, &qos->lq_flags); +#ifdef HAVE_SERVER_SUPPORT + set_bit(LQ_DIRTY, &qos->lq_rr.lqr_flags); +#endif +out: + up_write(&qos->lq_rw_sem); + RETURN(rc); +} +EXPORT_SYMBOL(lu_qos_add_tgt); + +/** + * Remove MDT/OST target from QoS table. + * + * Removes given MDT/OST target from QoS table and releases related + * MDS/OSS structure if no target remain on the MDS/OSS. 
+ *
+ * \param[in] qos	lu_qos data
+ * \param[in] ltd	target description
+ *
+ * \retval 0		on success
+ * \retval -ENOENT	if no server was found
+ */
+static int lu_qos_del_tgt(struct lu_qos *qos, struct lu_tgt_desc *ltd)
+{
+	struct lu_svr_qos *svr;
+	int rc = 0;
+
+	ENTRY;
+
+	down_write(&qos->lq_rw_sem);
+	svr = ltd->ltd_qos.ltq_svr;
+	if (!svr)
+		GOTO(out, rc = -ENOENT);
+
+	svr->lsq_tgt_count--;
+	if (svr->lsq_tgt_count == 0) {
+		CDEBUG(D_OTHER, "removing server %s\n",
+		       obd_uuid2str(&svr->lsq_uuid));
+		list_del(&svr->lsq_svr_list);
+		ltd->ltd_qos.ltq_svr = NULL;
+		OBD_FREE_PTR(svr);
+	}
+
+	set_bit(LQ_DIRTY, &qos->lq_flags);
+#ifdef HAVE_SERVER_SUPPORT
+	set_bit(LQ_DIRTY, &qos->lq_rr.lqr_flags);
+#endif
+out:
+	up_write(&qos->lq_rw_sem);
+	RETURN(rc);
+}
+
+static inline __u64 tgt_statfs_bavail(struct lu_tgt_desc *tgt)
+{
+	struct obd_statfs *statfs = &tgt->ltd_statfs;
+
+	return statfs->os_bavail * statfs->os_bsize;
+}
+
+static inline __u64 tgt_statfs_iavail(struct lu_tgt_desc *tgt)
+{
+	return tgt->ltd_statfs.os_ffree;
+}
+
+/**
+ * Calculate weight for a given tgt.
+ *
+ * The final tgt weight is bavail >> 16 * iavail >> 8 minus the tgt and server
+ * penalties. See ltd_qos_penalties_calc() for how penalties are calculated.
+ *
+ * \param[in] tgt	target descriptor
+ */
+void lu_tgt_qos_weight_calc(struct lu_tgt_desc *tgt)
+{
+	struct lu_tgt_qos *ltq = &tgt->ltd_qos;
+	__u64 penalty;
+
+	ltq->ltq_avail = (tgt_statfs_bavail(tgt) >> 16) *
+			 (tgt_statfs_iavail(tgt) >> 8);
+	penalty = ltq->ltq_penalty + ltq->ltq_svr->lsq_penalty;
+	if (ltq->ltq_avail < penalty)
+		ltq->ltq_weight = 0;
+	else
+		ltq->ltq_weight = ltq->ltq_avail - penalty;
+}
+EXPORT_SYMBOL(lu_tgt_qos_weight_calc);
+
+/**
+ * Allocate and initialize target table.
+ *
+ * A helper function to initialize the target table and allocate
+ * a bitmap of the available targets.
+ *
+ * \param[in] ltd	target's table to initialize
+ * \param[in] is_mdt	target table for MDTs
+ *
+ * \retval 0		on success
+ * \retval negative	negated errno on error
+ **/
+int lu_tgt_descs_init(struct lu_tgt_descs *ltd, bool is_mdt)
+{
+	mutex_init(&ltd->ltd_mutex);
+	init_rwsem(&ltd->ltd_rw_sem);
+
+	/*
+	 * the tgt array and bitmap are allocated/grown dynamically as tgts are
+	 * added to the LOD/LMV, see lu_tgt_descs_add()
+	 */
+	ltd->ltd_tgt_bitmap = bitmap_zalloc(BITS_PER_LONG, GFP_NOFS);
+	if (!ltd->ltd_tgt_bitmap)
+		return -ENOMEM;
+
+	ltd->ltd_tgts_size = BITS_PER_LONG;
+	ltd->ltd_death_row = 0;
+	ltd->ltd_refcount = 0;
+
+	/* Set up allocation policy (QoS and RR) */
+	INIT_LIST_HEAD(&ltd->ltd_qos.lq_svr_list);
+	init_rwsem(&ltd->ltd_qos.lq_rw_sem);
+	set_bit(LQ_DIRTY, &ltd->ltd_qos.lq_flags);
+	set_bit(LQ_RESET, &ltd->ltd_qos.lq_flags);
+	ltd->ltd_is_mdt = is_mdt;
+	/* MDT imbalance threshold is low to balance across MDTs
+	 * relatively quickly, because each directory may result
+	 * in a large number of files/subdirs created therein.
+	 */
+	if (is_mdt) {
+		ltd->ltd_lmv_desc.ld_pattern = LMV_HASH_TYPE_DEFAULT;
+		ltd->ltd_qos.lq_prio_free = LMV_QOS_DEF_PRIO_FREE * 256 / 100;
+		ltd->ltd_qos.lq_threshold_rr =
+			LMV_QOS_DEF_THRESHOLD_RR_PCT *
+			QOS_THRESHOLD_MAX / 100;
+	} else {
+		ltd->ltd_qos.lq_prio_free = LOV_QOS_DEF_PRIO_FREE * 256 / 100;
+		ltd->ltd_qos.lq_threshold_rr =
+			LOV_QOS_DEF_THRESHOLD_RR_PCT *
+			QOS_THRESHOLD_MAX / 100;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(lu_tgt_descs_init);
+
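+/*
+ * Typical lifecycle (an illustrative sketch, not a caller in this patch;
+ * "descs" and "tgt" here are hypothetical LOD/LMV-side variables):
+ *
+ *	rc = lu_tgt_descs_init(&descs, false);
+ *	rc = ltd_add_tgt(&descs, tgt);
+ *	...
+ *	ltd_del_tgt(&descs, tgt);
+ *	lu_tgt_descs_fini(&descs);
+ *
+ * ltd_add_tgt() expects ltd_rw_sem to be held exclusively by the caller,
+ * as its description below notes.
+ */
+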
+/**
+ * Free bitmap and target table pages.
+ *
+ * \param[in] ltd	target table
+ */
+void lu_tgt_descs_fini(struct lu_tgt_descs *ltd)
+{
+	int i;
+
+	bitmap_free(ltd->ltd_tgt_bitmap);
+	for (i = 0; i < ARRAY_SIZE(ltd->ltd_tgt_idx); i++) {
+		if (ltd->ltd_tgt_idx[i])
+			OBD_FREE_PTR(ltd->ltd_tgt_idx[i]);
+	}
+	ltd->ltd_tgts_size = 0;
+}
+EXPORT_SYMBOL(lu_tgt_descs_fini);
+
+/**
+ * Expand size of target table.
+ *
+ * When the target table is full, we have to extend the table. To do so,
+ * we allocate new memory with some reserve, move data from the old table
+ * to the new one and release memory consumed by the old table.
+ *
+ * \param[in] ltd	target table
+ * \param[in] newsize	new size of the table
+ *
+ * \retval 0		on success
+ * \retval -ENOMEM	if reallocation failed
+ */
+static int lu_tgt_descs_resize(struct lu_tgt_descs *ltd, __u32 newsize)
+{
+	unsigned long *new_bitmap, *old_bitmap = NULL;
+
+	/* someone else has already resized the array */
+	if (newsize <= ltd->ltd_tgts_size)
+		return 0;
+
+	new_bitmap = bitmap_zalloc(newsize, GFP_NOFS);
+	if (!new_bitmap)
+		return -ENOMEM;
+
+	if (ltd->ltd_tgts_size > 0) {
+		/* the bitmap already exists, copy data from old one */
+		bitmap_copy(new_bitmap, ltd->ltd_tgt_bitmap,
+			    ltd->ltd_tgts_size);
+		old_bitmap = ltd->ltd_tgt_bitmap;
+	}
+
+	ltd->ltd_tgts_size = newsize;
+	ltd->ltd_tgt_bitmap = new_bitmap;
+
+	bitmap_free(old_bitmap);
+
+	CDEBUG(D_CONFIG, "tgt size: %d\n", ltd->ltd_tgts_size);
+
+	return 0;
+}
+
+/**
+ * Add new target to target table.
+ *
+ * Extend target table if it's full, update target table and bitmap.
+ * Notice we need to take ltd_rw_sem exclusively before entry to ensure
+ * atomic switch.
+ *
+ * \param[in] ltd	target table
+ * \param[in] tgt	new target desc
+ *
+ * \retval 0		on success
+ * \retval -ENOMEM	if reallocation failed
+ * \retval -EEXIST	if target existed
+ */
+int ltd_add_tgt(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt)
+{
+	__u32 index = tgt->ltd_index;
+	int rc;
+
+	ENTRY;
+
+	if (index >= ltd->ltd_tgts_size) {
+		__u32 newsize = 1;
+
+		if (index > TGT_PTRS * TGT_PTRS_PER_BLOCK)
+			RETURN(-ENFILE);
+
+		while (newsize < index + 1)
+			newsize = newsize << 1;
+
+		rc = lu_tgt_descs_resize(ltd, newsize);
+		if (rc)
+			RETURN(rc);
+	} else if (test_bit(index, ltd->ltd_tgt_bitmap)) {
+		RETURN(-EEXIST);
+	}
+
+	if (ltd->ltd_tgt_idx[index / TGT_PTRS_PER_BLOCK] == NULL) {
+		OBD_ALLOC_PTR(ltd->ltd_tgt_idx[index / TGT_PTRS_PER_BLOCK]);
+		if (ltd->ltd_tgt_idx[index / TGT_PTRS_PER_BLOCK] == NULL)
+			RETURN(-ENOMEM);
+	}
+
+	LTD_TGT(ltd, tgt->ltd_index) = tgt;
+	set_bit(tgt->ltd_index, ltd->ltd_tgt_bitmap);
+
+	ltd->ltd_lov_desc.ld_tgt_count++;
+	if (tgt->ltd_active)
+		ltd->ltd_lov_desc.ld_active_tgt_count++;
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(ltd_add_tgt);
+
+/**
+ * Delete target from target table
+ */
+void ltd_del_tgt(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt)
+{
+	lu_qos_del_tgt(&ltd->ltd_qos, tgt);
+	LTD_TGT(ltd, tgt->ltd_index) = NULL;
+	clear_bit(tgt->ltd_index, ltd->ltd_tgt_bitmap);
+	ltd->ltd_lov_desc.ld_tgt_count--;
+	if (tgt->ltd_active)
+		ltd->ltd_lov_desc.ld_active_tgt_count--;
+}
+EXPORT_SYMBOL(ltd_del_tgt);
+
+/**
+ * Calculate penalties per-tgt and per-server
+ *
+ * Re-calculate penalties when the configuration changes, active targets
+ * change and after statfs refresh (all these are reflected by lq_dirty flag).
+ * On every tgt and server: decay the penalty by half for every 8x the update
+ * interval that the device has been idle.
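+ * (In the code below: age = (now - ltq_used/lsq_used) >> 3, and once age
+ * exceeds ld_qos_maxage the penalty is shifted right by age / ld_qos_maxage.)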
+ * That gives lots of time for the statfs information to be updated (which
+ * the penalty is only a proxy for), and avoids penalizing server/tgt under
+ * light load.
+ * See lu_tgt_qos_weight_calc() for how penalties are factored into the
+ * weight.
+ *
+ * \param[in] ltd	lu_tgt_descs
+ *
+ * \retval 0		on success
+ * \retval -EAGAIN	if the number of tgts isn't enough or all tgt spaces
+ *			are almost the same
+ */
+int ltd_qos_penalties_calc(struct lu_tgt_descs *ltd)
+{
+	struct lu_qos *qos = &ltd->ltd_qos;
+	struct lov_desc *desc = &ltd->ltd_lov_desc;
+	struct lu_tgt_desc *tgt;
+	struct lu_svr_qos *svr;
+	__u64 ba_max, ba_min, ba;
+	__u64 ia_max, ia_min, ia = 1;
+	__u32 num_active;
+	int prio_wide;
+	time64_t now, age;
+	int rc;
+
+	ENTRY;
+
+	if (!test_bit(LQ_DIRTY, &qos->lq_flags))
+		GOTO(out, rc = 0);
+
+	num_active = desc->ld_active_tgt_count - 1;
+	if (num_active < 1)
+		GOTO(out, rc = -EAGAIN);
+
+	/* find bavail on each server */
+	list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) {
+		svr->lsq_bavail = 0;
+		/* if inode is not counted, set to 1 to ignore */
+		svr->lsq_iavail = ltd->ltd_is_mdt ? 0 : 1;
+	}
+	qos->lq_active_svr_count = 0;
+
+	/*
+	 * How badly user wants to select targets "widely" (not recently chosen
+	 * and not on recent MDS's). As opposed to "freely" (free space avail.)
+	 * 0-256
+	 */
+	prio_wide = 256 - qos->lq_prio_free;
+
+	ba_min = (__u64)(-1);
+	ba_max = 0;
+	ia_min = (__u64)(-1);
+	ia_max = 0;
+	now = ktime_get_real_seconds();
+
+	/* Calculate server penalty per object */
+	ltd_foreach_tgt(ltd, tgt) {
+		if (!tgt->ltd_active)
+			continue;
+
+		/* when inode is counted, bavail >> 16 to avoid overflow */
+		ba = tgt_statfs_bavail(tgt);
+		if (ltd->ltd_is_mdt)
+			ba >>= 16;
+		else
+			ba >>= 8;
+		if (!ba)
+			continue;
+
+		ba_min = min(ba, ba_min);
+		ba_max = max(ba, ba_max);
+
+		/* Count the number of usable servers */
+		if (tgt->ltd_qos.ltq_svr->lsq_bavail == 0)
+			qos->lq_active_svr_count++;
+		tgt->ltd_qos.ltq_svr->lsq_bavail += ba;
+
+		if (ltd->ltd_is_mdt) {
+			/* iavail >> 8 to avoid overflow */
+			ia = tgt_statfs_iavail(tgt) >> 8;
+			if (!ia)
+				continue;
+
+			ia_min = min(ia, ia_min);
+			ia_max = max(ia, ia_max);
+
+			tgt->ltd_qos.ltq_svr->lsq_iavail += ia;
+		}
+
+		/*
+		 * per-tgt penalty is
+		 * prio * bavail * iavail / (num_tgt - 1) / 2
+		 */
+		tgt->ltd_qos.ltq_penalty_per_obj = prio_wide * ba * ia >> 8;
+		do_div(tgt->ltd_qos.ltq_penalty_per_obj, num_active);
+		tgt->ltd_qos.ltq_penalty_per_obj >>= 1;
+
+		age = (now - tgt->ltd_qos.ltq_used) >> 3;
+		if (test_bit(LQ_RESET, &qos->lq_flags) ||
+		    age > 32 * desc->ld_qos_maxage)
+			tgt->ltd_qos.ltq_penalty = 0;
+		else if (age > desc->ld_qos_maxage)
+			/* Decay tgt penalty. */
+			tgt->ltd_qos.ltq_penalty >>= age / desc->ld_qos_maxage;
+	}
+
+	num_active = qos->lq_active_svr_count - 1;
+	if (num_active < 1) {
+		/*
+		 * If there's only 1 server, we can't penalize it, so instead
+		 * we have to double the tgt penalty
+		 */
+		num_active = 1;
+		ltd_foreach_tgt(ltd, tgt) {
+			if (!tgt->ltd_active)
+				continue;
+
+			tgt->ltd_qos.ltq_penalty_per_obj <<= 1;
+		}
+	}
+
+	/*
+	 * Per-server penalty is
+	 * prio * bavail * iavail / server_tgts / (num_svr - 1) / 2
+	 */
+	list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) {
+		ba = svr->lsq_bavail;
+		ia = svr->lsq_iavail;
+		svr->lsq_penalty_per_obj = prio_wide * ba * ia >> 8;
+		do_div(svr->lsq_penalty_per_obj,
+		       svr->lsq_tgt_count * num_active);
+		svr->lsq_penalty_per_obj >>= 1;
+
+		age = (now - svr->lsq_used) >> 3;
+		if (test_bit(LQ_RESET, &qos->lq_flags) ||
+		    age > 32 * desc->ld_qos_maxage)
+			svr->lsq_penalty = 0;
+		else if (age > desc->ld_qos_maxage)
+			/* Decay server penalty. */
+			svr->lsq_penalty >>= age / desc->ld_qos_maxage;
+	}
+
+	clear_bit(LQ_DIRTY, &qos->lq_flags);
+	clear_bit(LQ_RESET, &qos->lq_flags);
+
+	/*
+	 * If each tgt has almost same free space, do rr allocation for better
+	 * creation performance
+	 */
+	clear_bit(LQ_SAME_SPACE, &qos->lq_flags);
+	if (((ba_max * (QOS_THRESHOLD_MAX - qos->lq_threshold_rr)) /
+	     QOS_THRESHOLD_MAX) < ba_min &&
+	    ((ia_max * (QOS_THRESHOLD_MAX - qos->lq_threshold_rr)) /
+	     QOS_THRESHOLD_MAX) < ia_min) {
+		set_bit(LQ_SAME_SPACE, &qos->lq_flags);
+		/* Reset weights for the next time we enter qos mode */
+		set_bit(LQ_RESET, &qos->lq_flags);
+	}
+	rc = 0;
+
+out:
+	if (!rc && test_bit(LQ_SAME_SPACE, &qos->lq_flags))
+		RETURN(-EAGAIN);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ltd_qos_penalties_calc);
+
+/**
+ * Re-calculate penalties and weights of all tgts.
+ *
+ * The function is called when some target was used for a new object. In
+ * this case we should re-calculate all the weights to keep new allocations
+ * balanced well.
+ *
+ * \param[in] ltd	lu_tgt_descs
+ * \param[in] tgt	recently used tgt
+ * \param[out] total_wt	new total weight for the pool
+ *
+ * \retval 0
+ */
+int ltd_qos_update(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt,
+		   __u64 *total_wt)
+{
+	struct lu_qos *qos = &ltd->ltd_qos;
+	struct lu_tgt_qos *ltq;
+	struct lu_svr_qos *svr;
+
+	ENTRY;
+
+	ltq = &tgt->ltd_qos;
+	LASSERT(ltq);
+
+	/* Don't allocate on this device anymore, until the next alloc_qos */
+	ltq->ltq_usable = 0;
+
+	svr = ltq->ltq_svr;
+
+	/*
+	 * Decay old penalty by half (we're adding max penalty, and don't
+	 * want it to run away.)
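+	 *
+	 * As a rough model (a sketch that ignores the periodic decay done in
+	 * ltd_qos_penalties_calc()): if a constant penalty P is added per
+	 * allocation, the sequence p' = p / 2 + P converges to its fixed
+	 * point p = 2P, so the running penalty stays bounded near twice the
+	 * per-allocation charge.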
+ */ + ltq->ltq_penalty >>= 1; + svr->lsq_penalty >>= 1; + + /* mark the server and tgt as recently used */ + ltq->ltq_used = svr->lsq_used = ktime_get_real_seconds(); + + /* Set max penalties for this tgt and server */ + ltq->ltq_penalty += ltq->ltq_penalty_per_obj * + ltd->ltd_lov_desc.ld_active_tgt_count; + svr->lsq_penalty += svr->lsq_penalty_per_obj * + qos->lq_active_svr_count; + + /* Decrease all MDS penalties */ + list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) { + if (svr->lsq_penalty < svr->lsq_penalty_per_obj) + svr->lsq_penalty = 0; + else + svr->lsq_penalty -= svr->lsq_penalty_per_obj; + } + + *total_wt = 0; + /* Decrease all tgt penalties */ + ltd_foreach_tgt(ltd, tgt) { + if (!tgt->ltd_active) + continue; + + ltq = &tgt->ltd_qos; + if (ltq->ltq_penalty < ltq->ltq_penalty_per_obj) + ltq->ltq_penalty = 0; + else + ltq->ltq_penalty -= ltq->ltq_penalty_per_obj; + + lu_tgt_qos_weight_calc(tgt); + + /* Recalc the total weight of usable osts */ + if (ltq->ltq_usable) + *total_wt += ltq->ltq_weight; + + CDEBUG(D_OTHER, "recalc tgt %d usable=%d bavail=%llu ffree=%llu tgtppo=%llu tgtp=%llu svrppo=%llu svrp=%llu wt=%llu\n", + tgt->ltd_index, ltq->ltq_usable, + tgt_statfs_bavail(tgt) >> 16, + tgt_statfs_iavail(tgt) >> 8, + ltq->ltq_penalty_per_obj >> 10, + ltq->ltq_penalty >> 10, + ltq->ltq_svr->lsq_penalty_per_obj >> 10, + ltq->ltq_svr->lsq_penalty >> 10, + ltq->ltq_weight >> 10); + } + + RETURN(0); +} +EXPORT_SYMBOL(ltd_qos_update); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lu_tgt_pool.c b/drivers/staging/lustrefsx/lustre/obdclass/lu_tgt_pool.c new file mode 100644 index 0000000000000..4bf0d168b7380 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lu_tgt_pool.c @@ -0,0 +1,244 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ +/* + * lustre/target/tgt_pool.c + * + * This file handles creation, lookup, and removal of pools themselves, as + * well as adding and removing targets to pools. + * + * Author: Jacques-Charles LAFOUCRIERE + * Author: Alex Lyashkov + * Author: Nathaniel Rutman + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include + +/** + * Initialize the pool data structures at startup. + * + * Allocate and initialize the pool data structures with the specified + * array size. If pool count is not specified (\a count == 0), then + * POOL_INIT_COUNT will be used. Allocating a non-zero initial array + * size avoids the need to reallocate as new pools are added. 
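+ * A caller may pass \a count == 0 to get that default, e.g. this
+ * illustrative call: rc = lu_tgt_pool_init(&pool, 0);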
+ * + * \param[in] op pool structure + * \param[in] count initial size of the target op_array[] array + * + * \retval 0 indicates successful pool initialization + * \retval negative error number on failure + */ +#define POOL_INIT_COUNT 2 +int lu_tgt_pool_init(struct lu_tgt_pool *op, unsigned int count) +{ + ENTRY; + + if (count == 0) + count = POOL_INIT_COUNT; + op->op_array = NULL; + op->op_count = 0; + init_rwsem(&op->op_rw_sem); + op->op_size = count * sizeof(op->op_array[0]); + OBD_ALLOC(op->op_array, op->op_size); + if (op->op_array == NULL) { + op->op_size = 0; + RETURN(-ENOMEM); + } + EXIT; + return 0; +} +EXPORT_SYMBOL(lu_tgt_pool_init); + +/** + * Increase the op_array size to hold more targets in this pool. + * + * The size is increased to at least \a min_count, but may be larger + * for an existing pool since ->op_array[] is growing exponentially. + * Caller must hold write op_rwlock. + * + * \param[in] op pool structure + * \param[in] min_count minimum number of entries to handle + * + * \retval 0 on success + * \retval negative error number on failure. + */ +int lu_tgt_pool_extend(struct lu_tgt_pool *op, unsigned int min_count) +{ + __u32 *new; + __u32 new_size; + + LASSERT(min_count != 0); + + if (op->op_count * sizeof(op->op_array[0]) < op->op_size) + return 0; + + new_size = max_t(__u32, min_count * sizeof(op->op_array[0]), + 2 * op->op_size); + OBD_ALLOC(new, new_size); + if (new == NULL) + return -ENOMEM; + + /* copy old array to new one */ + memcpy(new, op->op_array, op->op_size); + OBD_FREE(op->op_array, op->op_size); + op->op_array = new; + op->op_size = new_size; + + return 0; +} +EXPORT_SYMBOL(lu_tgt_pool_extend); + +/** + * Add a new target to an existing pool. + * + * Add a new target device to the pool previously created and returned by + * lod_pool_new(). Each target can only be in each pool at most one time. + * + * \param[in] op target pool to add new entry + * \param[in] idx pool index number to add to the \a op array + * \param[in] min_count minimum number of entries to expect in the pool + * + * \retval 0 if target could be added to the pool + * \retval negative error if target \a idx was not added + */ +int lu_tgt_pool_add(struct lu_tgt_pool *op, __u32 idx, unsigned int min_count) +{ + unsigned int i; + int rc = 0; + ENTRY; + + down_write(&op->op_rw_sem); + + /* search ost in pool array */ + for (i = 0; i < op->op_count; i++) { + if (op->op_array[i] == idx) + GOTO(out, rc = -EEXIST); + } + + rc = lu_tgt_pool_extend(op, min_count); + if (rc) + GOTO(out, rc); + + /* ost not found we add it */ + op->op_array[op->op_count] = idx; + op->op_count++; + EXIT; +out: + up_write(&op->op_rw_sem); + return rc; +} +EXPORT_SYMBOL(lu_tgt_pool_add); + +/** + * Remove an existing pool from the system. + * + * The specified pool must have previously been allocated by + * lod_pool_new() and not have any target members in the pool. + * If the removed target is not the last, compact the array + * to remove empty spaces. 
+ * + * \param[in] op pointer to the original data structure + * \param[in] idx target index to be removed + * + * \retval 0 on success + * \retval negative error number on failure + */ +int lu_tgt_pool_remove(struct lu_tgt_pool *op, __u32 idx) +{ + unsigned int i; + ENTRY; + + down_write(&op->op_rw_sem); + + for (i = 0; i < op->op_count; i++) { + if (op->op_array[i] == idx) { + memmove(&op->op_array[i], &op->op_array[i + 1], + (op->op_count - i - 1) * + sizeof(op->op_array[0])); + op->op_count--; + up_write(&op->op_rw_sem); + EXIT; + return 0; + } + } + + up_write(&op->op_rw_sem); + RETURN(-EINVAL); +} +EXPORT_SYMBOL(lu_tgt_pool_remove); + +int lu_tgt_check_index(int idx, struct lu_tgt_pool *osts) +{ + int i, rc = -ENOENT; + ENTRY; + + down_read(&osts->op_rw_sem); + for (i = 0; i < osts->op_count; i++) { + if (osts->op_array[i] == idx) + GOTO(out, rc = 0); + } + EXIT; +out: + up_read(&osts->op_rw_sem); + return rc; +} +EXPORT_SYMBOL(lu_tgt_check_index); + +/** + * Free the pool after it was emptied and removed from /proc. + * + * Note that all of the child/target entries referenced by this pool + * must have been removed by lod_ost_pool_remove() before it can be + * deleted from memory. + * + * \param[in] op pool to be freed. + */ +void lu_tgt_pool_free(struct lu_tgt_pool *op) +{ + ENTRY; + + if (op->op_size == 0) + RETURN_EXIT; + + down_write(&op->op_rw_sem); + + OBD_FREE(op->op_array, op->op_size); + op->op_array = NULL; + op->op_count = 0; + op->op_size = 0; + + up_write(&op->op_rw_sem); + EXIT; +} +EXPORT_SYMBOL(lu_tgt_pool_free); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lu_ucred.c b/drivers/staging/lustrefsx/lustre/obdclass/lu_ucred.c new file mode 100644 index 0000000000000..216181e32f701 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lu_ucred.c @@ -0,0 +1,102 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/lu_ucred.c + * + * Lustre user credentials context infrastructure. + * + * Author: Nikita Danilov + * Author: Fan Yong + * Author: Vitaly Fertman + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include + +/* context key constructor/destructor: lu_ucred_key_init, lu_ucred_key_fini */ +LU_KEY_INIT_FINI(lu_ucred, struct lu_ucred); + +static struct lu_context_key lu_ucred_key = { + .lct_tags = LCT_SERVER_SESSION, + .lct_init = lu_ucred_key_init, + .lct_fini = lu_ucred_key_fini +}; + +/** + * Get ucred key if session exists and ucred key is allocated on it. + * Return NULL otherwise. 
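+ *
+ * Callers are expected to handle the NULL case themselves; a typical
+ * pattern (sketch only) is:
+ *
+ *	struct lu_ucred *uc = lu_ucred(env);
+ *
+ *	if (uc == NULL)
+ *		return -EINVAL;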
+ */ +struct lu_ucred *lu_ucred(const struct lu_env *env) +{ + if (!env->le_ses) + return NULL; + return lu_context_key_get(env->le_ses, &lu_ucred_key); +} +EXPORT_SYMBOL(lu_ucred); + +/** + * Get ucred key and check if it is properly initialized. + * Return NULL otherwise. + */ +struct lu_ucred *lu_ucred_check(const struct lu_env *env) +{ + struct lu_ucred *uc = lu_ucred(env); + if (uc && uc->uc_valid != UCRED_OLD && uc->uc_valid != UCRED_NEW) + return NULL; + return uc; +} +EXPORT_SYMBOL(lu_ucred_check); + +/** + * Get ucred key, which must exist and must be properly initialized. + * Assert otherwise. + */ +struct lu_ucred *lu_ucred_assert(const struct lu_env *env) +{ + struct lu_ucred *uc = lu_ucred_check(env); + LASSERT(uc != NULL); + return uc; +} +EXPORT_SYMBOL(lu_ucred_assert); + +int lu_ucred_global_init(void) +{ + LU_CONTEXT_KEY_INIT(&lu_ucred_key); + return lu_context_key_register(&lu_ucred_key); +} + +void lu_ucred_global_fini(void) +{ + lu_context_key_degister(&lu_ucred_key); +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lustre_handles.c b/drivers/staging/lustrefsx/lustre/obdclass/lustre_handles.c new file mode 100644 index 0000000000000..9ac9cf13c0200 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lustre_handles.c @@ -0,0 +1,219 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/lustre_handles.c + * + * Author: Phil Schwan + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include + +#include +#include +#include + + +static __u64 handle_base; +#define HANDLE_INCR 7 +static DEFINE_SPINLOCK(handle_base_lock); + +static struct handle_bucket { + spinlock_t lock; + struct hlist_head head; +} *handle_hash; + +#define HANDLE_HASH_SIZE (1 << 16) +#define HANDLE_HASH_MASK (HANDLE_HASH_SIZE - 1) + +/* + * Generate a unique 64bit cookie (hash) for a handle and insert it into + * global (per-node) hash-table. + */ +void class_handle_hash(struct portals_handle *h, const char *owner) +{ + struct handle_bucket *bucket; + + ENTRY; + + LASSERT(h != NULL); + LASSERT(hlist_unhashed(&h->h_link)); + + /* + * This is fast, but simplistic cookie generation algorithm, it will + * need a re-do at some point in the future for security. + */ + spin_lock(&handle_base_lock); + handle_base += HANDLE_INCR; + + if (unlikely(handle_base == 0)) { + /* + * Cookie of zero is "dangerous", because in many places it's + * assumed that 0 means "unassigned" handle, not bound to any + * object. 
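+	 * With HANDLE_INCR == 7 and a randomized starting handle_base, a
+	 * 64-bit counter takes on the order of 2^64 / 7 (roughly 2.6 * 10^18)
+	 * insertions to wrap, so this warning is not expected in practice.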
+ */ + CWARN("The universe has been exhausted: cookie wrap-around.\n"); + handle_base += HANDLE_INCR; + } + h->h_cookie = handle_base; + spin_unlock(&handle_base_lock); + + h->h_owner = owner; + + bucket = &handle_hash[h->h_cookie & HANDLE_HASH_MASK]; + spin_lock(&bucket->lock); + hlist_add_head_rcu(&h->h_link, &bucket->head); + spin_unlock(&bucket->lock); + + CDEBUG(D_INFO, "added object %p with handle %#llx to hash\n", + h, h->h_cookie); + EXIT; +} +EXPORT_SYMBOL(class_handle_hash); + +static void class_handle_unhash_nolock(struct portals_handle *h) +{ + if (hlist_unhashed(&h->h_link)) { + CERROR("removing an already-removed handle (%#llx)\n", + h->h_cookie); + return; + } + + CDEBUG(D_INFO, "removing object %p with handle %#llx from hash\n", + h, h->h_cookie); + + hlist_del_init_rcu(&h->h_link); +} + +void class_handle_unhash(struct portals_handle *h) +{ + struct handle_bucket *bucket; + bucket = handle_hash + (h->h_cookie & HANDLE_HASH_MASK); + + spin_lock(&bucket->lock); + class_handle_unhash_nolock(h); + spin_unlock(&bucket->lock); +} +EXPORT_SYMBOL(class_handle_unhash); + +void *class_handle2object(u64 cookie, const char *owner) +{ + struct handle_bucket *bucket; + struct portals_handle *h; + void *retval = NULL; + + ENTRY; + + LASSERT(handle_hash != NULL); + + /* + * Be careful when you want to change this code. See the + * rcu_read_lock() definition on top this file. - jxiong + */ + bucket = handle_hash + (cookie & HANDLE_HASH_MASK); + + rcu_read_lock(); + hlist_for_each_entry_rcu(h, &bucket->head, h_link) { + if (h->h_cookie != cookie || h->h_owner != owner) + continue; + + if (refcount_inc_not_zero(&h->h_ref)) { + CDEBUG(D_INFO, "GET %s %p refcount=%d\n", + h->h_owner, h, + refcount_read(&h->h_ref)); + retval = h; + } + break; + } + rcu_read_unlock(); + + RETURN(retval); +} +EXPORT_SYMBOL(class_handle2object); + +int class_handle_init(void) +{ + struct handle_bucket *bucket; + + LASSERT(handle_hash == NULL); + + OBD_ALLOC_PTR_ARRAY_LARGE(handle_hash, HANDLE_HASH_SIZE); + if (handle_hash == NULL) + return -ENOMEM; + + for (bucket = handle_hash + HANDLE_HASH_SIZE - 1; bucket >= handle_hash; + bucket--) { + INIT_HLIST_HEAD(&bucket->head); + spin_lock_init(&bucket->lock); + } + + get_random_bytes(&handle_base, sizeof(handle_base)); + LASSERT(handle_base != 0ULL); + + return 0; +} + +static int cleanup_all_handles(void) +{ + int rc; + int i; + + for (rc = i = 0; i < HANDLE_HASH_SIZE; i++) { + struct portals_handle *h; + + spin_lock(&handle_hash[i].lock); + hlist_for_each_entry_rcu(h, &handle_hash[i].head, h_link) { + CERROR("force clean handle %#llx addr %p owner %p\n", + h->h_cookie, h, h->h_owner); + + class_handle_unhash_nolock(h); + rc++; + } + spin_unlock(&handle_hash[i].lock); + } + + return rc; +} + +void class_handle_cleanup(void) +{ + int count; + + LASSERT(handle_hash != NULL); + + count = cleanup_all_handles(); + + OBD_FREE_PTR_ARRAY_LARGE(handle_hash, HANDLE_HASH_SIZE); + handle_hash = NULL; + + if (count != 0) + CERROR("handle_count at cleanup: %d\n", count); +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lustre_peer.c b/drivers/staging/lustrefsx/lustre/obdclass/lustre_peer.c new file mode 100644 index 0000000000000..16b50f9377a20 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lustre_peer.c @@ -0,0 +1,247 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include +#include +#include +#include +#include +#include + +struct uuid_nid_data { + struct list_head un_list; + struct obd_uuid un_uuid; + int un_nid_count; + lnet_nid_t un_nids[MTI_NIDS_MAX]; +}; + +/* FIXME: This should probably become more elegant than a global linked list */ +static LIST_HEAD(g_uuid_list); +static DEFINE_SPINLOCK(g_uuid_lock); + +int lustre_uuid_to_peer(const char *uuid, lnet_nid_t *peer_nid, int index) +{ + struct uuid_nid_data *data; + struct obd_uuid tmp; + int rc = -ENOENT; + + obd_str2uuid(&tmp, uuid); + spin_lock(&g_uuid_lock); + list_for_each_entry(data, &g_uuid_list, un_list) { + if (obd_uuid_equals(&data->un_uuid, &tmp)) { + if (index >= data->un_nid_count) + break; + + rc = 0; + *peer_nid = data->un_nids[index]; + break; + } + } + spin_unlock(&g_uuid_lock); + return rc; +} +EXPORT_SYMBOL(lustre_uuid_to_peer); + +/* Add a nid to a niduuid. Multiple nids can be added to a single uuid; + LNET will choose the best one. 
*/ +int class_add_uuid(const char *uuid, __u64 nid) +{ + struct uuid_nid_data *data, *entry; + int found = 0; + int rc; + + LASSERT(nid != 0); /* valid newconfig NID is never zero */ + + if (strlen(uuid) > UUID_MAX - 1) + return -EOVERFLOW; + + OBD_ALLOC_PTR(data); + if (data == NULL) + return -ENOMEM; + + obd_str2uuid(&data->un_uuid, uuid); + data->un_nids[0] = nid; + data->un_nid_count = 1; + + spin_lock(&g_uuid_lock); + list_for_each_entry(entry, &g_uuid_list, un_list) { + if (obd_uuid_equals(&entry->un_uuid, &data->un_uuid)) { + int i; + + found = 1; + for (i = 0; i < entry->un_nid_count; i++) + if (nid == entry->un_nids[i]) + break; + + if (i == entry->un_nid_count) { + LASSERT(entry->un_nid_count < MTI_NIDS_MAX); + entry->un_nids[entry->un_nid_count++] = nid; + } + break; + } + } + if (!found) + list_add(&data->un_list, &g_uuid_list); + spin_unlock(&g_uuid_lock); + + if (found) { + CDEBUG(D_INFO, "found uuid %s %s cnt=%d\n", uuid, + libcfs_nid2str(nid), entry->un_nid_count); + rc = LNetAddPeer(entry->un_nids, entry->un_nid_count); + CDEBUG(D_INFO, "Add peer %s rc = %d\n", + libcfs_nid2str(data->un_nids[0]), rc); + OBD_FREE(data, sizeof(*data)); + } else { + CDEBUG(D_INFO, "add uuid %s %s\n", uuid, libcfs_nid2str(nid)); + rc = LNetAddPeer(data->un_nids, data->un_nid_count); + CDEBUG(D_INFO, "Add peer %s rc = %d\n", + libcfs_nid2str(data->un_nids[0]), rc); + } + + return 0; +} +EXPORT_SYMBOL(class_add_uuid); + +/* Delete the nids for one uuid if specified, otherwise delete all */ +int class_del_uuid(const char *uuid) +{ + struct uuid_nid_data *data; + LIST_HEAD(deathrow); + + spin_lock(&g_uuid_lock); + if (uuid != NULL) { + struct obd_uuid tmp; + + obd_str2uuid(&tmp, uuid); + list_for_each_entry(data, &g_uuid_list, un_list) { + if (obd_uuid_equals(&data->un_uuid, &tmp)) { + list_move(&data->un_list, &deathrow); + break; + } + } + } else + list_splice_init(&g_uuid_list, &deathrow); + spin_unlock(&g_uuid_lock); + + if (uuid != NULL && list_empty(&deathrow)) { + CDEBUG(D_INFO, "Try to delete a non-existent uuid %s\n", uuid); + return -EINVAL; + } + + while ((data = list_first_entry_or_null(&deathrow, struct uuid_nid_data, + un_list)) != NULL) { + list_del(&data->un_list); + + CDEBUG(D_INFO, "del uuid %s %s/%d\n", + obd_uuid2str(&data->un_uuid), + libcfs_nid2str(data->un_nids[0]), + data->un_nid_count); + + OBD_FREE(data, sizeof(*data)); + } + return 0; +} + +int class_add_nids_to_uuid(struct obd_uuid *uuid, lnet_nid_t *nids, + int nid_count) +{ + struct uuid_nid_data *entry; + int i, rc; + bool matched = false; + + ENTRY; + + if (nid_count >= MTI_NIDS_MAX) { + CDEBUG(D_NET, "too many NIDs (%d) for UUID '%s'\n", + nid_count, obd_uuid2str(uuid)); + return -ENOSPC; + } + + spin_lock(&g_uuid_lock); + list_for_each_entry(entry, &g_uuid_list, un_list) { + CDEBUG(D_NET, "Comparing %s with %s\n", + obd_uuid2str(uuid), obd_uuid2str(&entry->un_uuid)); + + if (!obd_uuid_equals(&entry->un_uuid, uuid)) + continue; + + matched = true; + CDEBUG(D_NET, "Updating UUID '%s'\n", obd_uuid2str(uuid)); + for (i = 0; i < nid_count; i++) + entry->un_nids[i] = nids[i]; + entry->un_nid_count = nid_count; + break; + } + spin_unlock(&g_uuid_lock); + if (matched) { + rc = LNetAddPeer(entry->un_nids, entry->un_nid_count); + CDEBUG(D_INFO, "Add peer %s rc = %d\n", + libcfs_nid2str(entry->un_nids[0]), rc); + } + + RETURN(0); +} +EXPORT_SYMBOL(class_add_nids_to_uuid); + +/* check if @nid exists in nid list of @uuid */ +int class_check_uuid(struct obd_uuid *uuid, __u64 nid) +{ + struct uuid_nid_data *entry; + int found = 
0; + + ENTRY; + + CDEBUG(D_INFO, "check if uuid %s has %s.\n", + obd_uuid2str(uuid), libcfs_nid2str(nid)); + + spin_lock(&g_uuid_lock); + list_for_each_entry(entry, &g_uuid_list, un_list) { + int i; + + if (!obd_uuid_equals(&entry->un_uuid, uuid)) + continue; + + /* found the uuid, check if it has @nid */ + for (i = 0; i < entry->un_nid_count; i++) { + if (entry->un_nids[i] == nid) { + found = 1; + break; + } + } + break; + } + spin_unlock(&g_uuid_lock); + RETURN(found); +} +EXPORT_SYMBOL(class_check_uuid); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/md_attrs.c b/drivers/staging/lustrefsx/lustre/obdclass/md_attrs.c new file mode 100644 index 0000000000000..b0a68a0fb9981 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/md_attrs.c @@ -0,0 +1,198 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2017, Intel Corporation. + * Use is subject to license terms. + * + * Author: Johann Lombardi + */ +#include +#include +#include +#include + +/** + * Initialize new \a lma. Only fid is stored. + * + * \param lma - is the new LMA structure to be initialized + * \param fid - is the FID of the object this LMA belongs to + * \param incompat - features that MDS must understand to access object + */ +void lustre_lma_init(struct lustre_mdt_attrs *lma, const struct lu_fid *fid, + __u32 compat, __u32 incompat) +{ + lma->lma_compat = compat; + lma->lma_incompat = incompat; + lma->lma_self_fid = *fid; + + /* If a field is added in struct lustre_mdt_attrs, zero it explicitly + * and change the test below. */ + BUILD_BUG_ON(sizeof(*lma) != + (offsetof(struct lustre_mdt_attrs, lma_self_fid) + + sizeof(lma->lma_self_fid))); +} +EXPORT_SYMBOL(lustre_lma_init); + +/** + * Swab, if needed, LMA structure which is stored on-disk in little-endian order. + * + * \param lma - is a pointer to the LMA structure to be swabbed. + */ +void lustre_lma_swab(struct lustre_mdt_attrs *lma) +{ +#ifdef __BIG_ENDIAN + __swab32s(&lma->lma_compat); + __swab32s(&lma->lma_incompat); + lustre_swab_lu_fid(&lma->lma_self_fid); +#endif +} +EXPORT_SYMBOL(lustre_lma_swab); + +void lustre_loa_init(struct lustre_ost_attrs *loa, const struct lu_fid *fid, + __u32 compat, __u32 incompat) +{ + BUILD_BUG_ON(sizeof(*loa) != LMA_OLD_SIZE); + + memset_startat(loa, 0, loa_parent_fid); + lustre_lma_init(&loa->loa_lma, fid, compat, incompat); +} +EXPORT_SYMBOL(lustre_loa_init); + +/** + * Swab, if needed, LOA (for OST-object only) structure with LMA EA and PFID EA + * combined together are stored on-disk in little-endian order. + * + * \param[in] loa - the pointer to the LOA structure to be swabbed. + * \param[in] to_cpu - to indicate swab for CPU order or not. 
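+ *
+ * On big-endian machines the LMA part is swabbed first; \a to_cpu then
+ * decides whether lma_compat is re-read after that swab (disk-to-CPU) or
+ * taken from the copy saved before it (CPU-to-disk), so the
+ * LMAC_STRIPE_INFO/LMAC_COMP_INFO tests always see a CPU-order value.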
+ */ +void lustre_loa_swab(struct lustre_ost_attrs *loa, bool to_cpu) +{ + struct lustre_mdt_attrs *lma = &loa->loa_lma; +#ifdef __BIG_ENDIAN + __u32 compat = lma->lma_compat; +#endif + + lustre_lma_swab(lma); +#ifdef __BIG_ENDIAN + if (to_cpu) + compat = lma->lma_compat; + + if (compat & LMAC_STRIPE_INFO) { + lustre_swab_lu_fid(&loa->loa_parent_fid); + __swab32s(&loa->loa_stripe_size); + } + if (compat & LMAC_COMP_INFO) { + __swab32s(&loa->loa_comp_id); + __swab64s(&loa->loa_comp_start); + __swab64s(&loa->loa_comp_end); + } +#endif +} +EXPORT_SYMBOL(lustre_loa_swab); + +/** + * Swab, if needed, SOM structure which is stored on-disk in little-endian + * order. + * + * \param attrs - is a pointer to the SOM structure to be swabbed. + */ +void lustre_som_swab(struct lustre_som_attrs *attrs) +{ +#ifdef __BIG_ENDIAN + __swab16s(&attrs->lsa_valid); + __swab64s(&attrs->lsa_size); + __swab64s(&attrs->lsa_blocks); +#endif +} +EXPORT_SYMBOL(lustre_som_swab); + +/** + * Swab, if needed, HSM structure which is stored on-disk in little-endian + * order. + * + * \param attrs - is a pointer to the HSM structure to be swabbed. + */ +void lustre_hsm_swab(struct hsm_attrs *attrs) +{ +#ifdef __BIG_ENDIAN + __swab32s(&attrs->hsm_compat); + __swab32s(&attrs->hsm_flags); + __swab64s(&attrs->hsm_arch_id); + __swab64s(&attrs->hsm_arch_ver); +#endif +} + +/* + * Swab and extract HSM attributes from on-disk xattr. + * + * \param buf - is a buffer containing the on-disk HSM extended attribute. + * \param rc - is the HSM xattr stored in \a buf + * \param mh - is the md_hsm structure where to extract HSM attributes. + */ +int lustre_buf2hsm(void *buf, int rc, struct md_hsm *mh) +{ + struct hsm_attrs *attrs = (struct hsm_attrs *)buf; + ENTRY; + + if (rc == 0 || rc == -ENODATA) + /* no HSM attributes */ + RETURN(-ENODATA); + + if (rc < 0) + /* error hit while fetching xattr */ + RETURN(rc); + + /* unpack HSM attributes */ + lustre_hsm_swab(attrs); + + /* fill md_hsm structure */ + mh->mh_compat = attrs->hsm_compat; + mh->mh_flags = attrs->hsm_flags; + mh->mh_arch_id = attrs->hsm_arch_id; + mh->mh_arch_ver = attrs->hsm_arch_ver; + + RETURN(0); +} +EXPORT_SYMBOL(lustre_buf2hsm); + +/* + * Pack HSM attributes. + * + * \param buf - is the output buffer where to pack the on-disk HSM xattr. + * \param mh - is the md_hsm structure to pack. + */ +void lustre_hsm2buf(void *buf, const struct md_hsm *mh) +{ + struct hsm_attrs *attrs = (struct hsm_attrs *)buf; + ENTRY; + + /* copy HSM attributes */ + attrs->hsm_compat = mh->mh_compat; + attrs->hsm_flags = mh->mh_flags; + attrs->hsm_arch_id = mh->mh_arch_id; + attrs->hsm_arch_ver = mh->mh_arch_ver; + + /* pack xattr */ + lustre_hsm_swab(attrs); +} +EXPORT_SYMBOL(lustre_hsm2buf); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_cksum.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_cksum.c new file mode 100644 index 0000000000000..16e6f12f8a05c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_cksum.c @@ -0,0 +1,149 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2018, DataDirect Networks Storage. + * Author: Li Xi. + * + * Checksum functions + */ +#include +#include + +/* Server uses algos that perform at 50% or better of the Adler */ +enum cksum_types obd_cksum_types_supported_server(const char *obd_name) +{ + enum cksum_types ret = OBD_CKSUM_ADLER; + int base_speed; + + CDEBUG(D_INFO, "%s: checksum speed: crc %d, crc32c %d, adler %d, " + "t10ip512 %d, t10ip4k %d, t10crc512 %d, t10crc4k %d\n", + obd_name, + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)), + obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP512), + obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP4K), + obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC512), + obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC4K)); + + base_speed = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)) / 2; + + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)) >= + base_speed) + ret |= OBD_CKSUM_CRC32C; + + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) >= + base_speed) + ret |= OBD_CKSUM_CRC32; + + if (obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP512) >= base_speed) + ret |= OBD_CKSUM_T10IP512; + + if (obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP4K) >= base_speed) + ret |= OBD_CKSUM_T10IP4K; + + if (obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC512) >= base_speed) + ret |= OBD_CKSUM_T10CRC512; + + if (obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC4K) >= base_speed) + ret |= OBD_CKSUM_T10CRC4K; + + return ret; +} +EXPORT_SYMBOL(obd_cksum_types_supported_server); + +/* The OBD_FL_CKSUM_* flags is packed into 5 bits of o_flags, since there can + * only be a single checksum type per RPC. + * + * The OBD_CKSUM_* type bits passed in ocd_cksum_types are a 32-bit bitmask + * since they need to represent the full range of checksum algorithms that + * both the client and server can understand. + * + * In case of an unsupported types/flags we fall back to ADLER + * because that is supported by all clients since 1.8 + * + * In case multiple algorithms are supported the best one is used. 
*/ +u32 obd_cksum_type_pack(const char *obd_name, enum cksum_types cksum_type) +{ + unsigned int performance = 0, tmp; + u32 flag = OBD_FL_CKSUM_ADLER; + + if (cksum_type & OBD_CKSUM_CRC32) { + tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_CRC32; + } + } + if (cksum_type & OBD_CKSUM_CRC32C) { + tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_CRC32C; + } + } + if (cksum_type & OBD_CKSUM_ADLER) { + tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_ADLER; + } + } + + if (cksum_type & OBD_CKSUM_T10IP512) { + tmp = obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP512); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_T10IP512; + } + } + + if (cksum_type & OBD_CKSUM_T10IP4K) { + tmp = obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP4K); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_T10IP4K; + } + } + + if (cksum_type & OBD_CKSUM_T10CRC512) { + tmp = obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC512); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_T10CRC512; + } + } + + if (cksum_type & OBD_CKSUM_T10CRC4K) { + tmp = obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC4K); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_T10CRC4K; + } + } + + if (unlikely(cksum_type && !(cksum_type & OBD_CKSUM_ALL))) + CWARN("%s: unknown cksum type %x\n", obd_name, cksum_type); + + return flag; +} +EXPORT_SYMBOL(obd_cksum_type_pack); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c new file mode 100644 index 0000000000000..09a524323ea1a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c @@ -0,0 +1,2479 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/obd_config.c + * + * Config API + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "llog_internal.h" + +#ifdef HAVE_SERVER_SUPPORT +static struct cfs_hash_ops nid_stat_hash_ops; +static struct cfs_hash_ops gen_hash_ops; +#endif /* HAVE_SERVER_SUPPORT */ + +/* + * uuid<->export lustre hash operations + */ +/* + * NOTE: It is impossible to find an export that is in failed + * state with this function + */ +static int +uuid_keycmp(struct rhashtable_compare_arg *arg, const void *obj) +{ + const struct obd_uuid *uuid = arg->key; + const struct obd_export *exp = obj; + + if (obd_uuid_equals(uuid, &exp->exp_client_uuid) && + !exp->exp_failed) + return 0; + return -ESRCH; +} + +static void +obd_export_exit(void *vexport, void *data) +{ + struct obd_export *exp = vexport; + + class_export_put(exp); +} + +static const struct rhashtable_params uuid_hash_params = { + .key_len = sizeof(struct obd_uuid), + .key_offset = offsetof(struct obd_export, exp_client_uuid), + .head_offset = offsetof(struct obd_export, exp_uuid_hash), + .obj_cmpfn = uuid_keycmp, + .max_size = MAX_OBD_DEVICES, + .automatic_shrinking = true, +}; + +int obd_uuid_add(struct obd_device *obd, struct obd_export *export) +{ + int rc; + + class_export_get(export); + rcu_read_lock(); + rc = rhashtable_lookup_insert_fast(&obd->obd_uuid_hash, + &export->exp_uuid_hash, + uuid_hash_params); + if (rc) { + class_export_put(export); + if (rc != -EEXIST) { + /* map obscure error codes to -ENOMEM */ + rc = -ENOMEM; + } else { + rc = -EALREADY; + } + } + rcu_read_unlock(); + + return rc; +} +EXPORT_SYMBOL(obd_uuid_add); + +void obd_uuid_del(struct obd_device *obd, struct obd_export *export) +{ + int rc; + + rcu_read_lock(); + rc = rhashtable_remove_fast(&obd->obd_uuid_hash, + &export->exp_uuid_hash, + uuid_hash_params); + if (!rc) + class_export_put(export); + rcu_read_unlock(); +} +EXPORT_SYMBOL(obd_uuid_del); + +#ifdef HAVE_SERVER_SUPPORT +/* obd_uuid_lookup() is used only server side by target_handle_connect(), + * mdt_hsm_agent_send(), and obd_export_evict_by_uuid(). 
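+ *
+ * A successful lookup takes a reference on the export's handle via
+ * refcount_inc_not_zero(), so callers drop it with class_export_put()
+ * when done. A sketch ("exp" and "uuid" are hypothetical caller-side
+ * variables):
+ *
+ *	exp = obd_uuid_lookup(obd, &uuid);
+ *	if (exp != NULL) {
+ *		...
+ *		class_export_put(exp);
+ *	}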
+ */ +struct obd_export *obd_uuid_lookup(struct obd_device *obd, + struct obd_uuid *uuid) +{ + struct obd_export *export = NULL; + + rcu_read_lock(); + export = rhashtable_lookup_fast(&obd->obd_uuid_hash, uuid, + uuid_hash_params); + if (export && !refcount_inc_not_zero(&export->exp_handle.h_ref)) + export = NULL; + rcu_read_unlock(); + + return export; +} +EXPORT_SYMBOL(obd_uuid_lookup); + +/* + * nid<->export hash operations + */ +static u32 nid_keyhash(const void *data, u32 key_len, u32 seed) +{ + const struct obd_export *exp = data; + void *key; + + if (!exp->exp_connection) + return 0; + + key = &exp->exp_connection->c_peer.nid; + return jhash2(key, key_len / sizeof(u32), seed); +} + +/* + * NOTE: It is impossible to find an export that is in failed + * state with this function + */ +static int +nid_keycmp(struct rhashtable_compare_arg *arg, const void *obj) +{ + const struct lnet_nid *nid = arg->key; + const struct obd_export *exp = obj; + + if (nid_same(&exp->exp_connection->c_peer.nid, nid)) + return 0; + + return -ESRCH; +} + +static void +nid_export_exit(void *vexport, void *data) +{ + struct obd_export *exp = vexport; + + class_export_put(exp); +} + +static const struct rhashtable_params nid_hash_params = { + .key_len = sizeof(struct lnet_nid), + .head_offset = offsetof(struct obd_export, exp_nid_hash), + .obj_hashfn = nid_keyhash, + .obj_cmpfn = nid_keycmp, + .automatic_shrinking = true, +}; + +int obd_nid_add(struct obd_device *obd, struct obd_export *exp) +{ + int rc; + + if (exp == exp->exp_obd->obd_self_export || exp->exp_hashed) + return 0; + + class_export_get(exp); + rc = rhltable_insert_key(&obd->obd_nid_hash, + &exp->exp_connection->c_peer.nid, + &exp->exp_nid_hash, + nid_hash_params); + if (rc) { + class_export_put(exp); + /* map obscure error codes to -ENOMEM */ + rc = -ENOMEM; + } else { + exp->exp_hashed = 1; + } + return rc; +} +EXPORT_SYMBOL(obd_nid_add); + +void obd_nid_del(struct obd_device *obd, struct obd_export *exp) +{ + int rc; + + if (exp == exp->exp_obd->obd_self_export || !exp->exp_hashed) + return; + + rc = rhltable_remove(&obd->obd_nid_hash, &exp->exp_nid_hash, + nid_hash_params); + if (rc == 0) { + class_export_put(exp); + exp->exp_hashed = 0; + } +} +EXPORT_SYMBOL(obd_nid_del); + +int obd_nid_export_for_each(struct obd_device *obd, struct lnet_nid *nid, + int cb(struct obd_export *exp, void *data), + void *data) +{ + struct rhlist_head *exports, *tmp; + struct obd_export *exp; + int ret = 0; + + rcu_read_lock(); + exports = rhltable_lookup(&obd->obd_nid_hash, nid, nid_hash_params); + if (!exports) { + ret = -ENODEV; + goto out_unlock; + } + + rhl_for_each_entry_rcu(exp, tmp, exports, exp_nid_hash) { + if (!exp->exp_failed && cb(exp, data)) + ret++; + } + +out_unlock: + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL(obd_nid_export_for_each); +#endif /* HAVE_SERVER_SUPPORT */ + +/*********** string parsing utils *********/ + +/* returns 0 if we find this key in the buffer, else 1 */ +int class_find_param(char *buf, char *key, char **valp) +{ + char *ptr; + + if (!buf) + return 1; + + ptr = strstr(buf, key); + if (!ptr) + return 1; + + if (valp) + *valp = ptr + strlen(key); + + return 0; +} +EXPORT_SYMBOL(class_find_param); + +/** + * Check whether the proc parameter \a param is an old parameter or not from + * the array \a ptr which contains the mapping from old parameters to new ones. + * If it's an old one, then return the pointer to the cfg_interop_param struc- + * ture which contains both the old and new parameters. 
+ * + * \param param proc parameter + * \param ptr an array which contains the mapping from + * old parameters to new ones + * + * \retval valid-pointer pointer to the cfg_interop_param structure + * which contains the old and new parameters + * \retval NULL \a param or \a ptr is NULL, + * or \a param is not an old parameter + */ +struct cfg_interop_param *class_find_old_param(const char *param, + struct cfg_interop_param *ptr) +{ + char *value = NULL; + int name_len = 0; + + if (!param || !ptr) + RETURN(NULL); + + value = strchr(param, '='); + if (value) + name_len = value - param; + else + name_len = strlen(param); + + while (ptr->old_param) { + if (strncmp(param, ptr->old_param, name_len) == 0 && + name_len == strlen(ptr->old_param)) + RETURN(ptr); + ptr++; + } + + RETURN(NULL); +} +EXPORT_SYMBOL(class_find_old_param); + +/** + * Finds a parameter in \a params and copies it to \a copy. + * + * Leading spaces are skipped. Next space or end of string is the + * parameter terminator with the exception that spaces inside single or double + * quotes get included into a parameter. The parameter is copied into \a copy + * which has to be allocated big enough by a caller, quotes are stripped in + * the copy and the copy is terminated by 0. + * + * On return \a params is set to next parameter or to NULL if last + * parameter is returned. + * + * \retval 0 if parameter is returned in \a copy + * \retval 1 otherwise + * \retval -EINVAL if unbalanced quota is found + */ +int class_get_next_param(char **params, char *copy) +{ + char *q1, *q2, *str; + int len; + + str = *params; + while (*str == ' ') + str++; + + if (*str == '\0') { + *params = NULL; + return 1; + } + + while (1) { + q1 = strpbrk(str, " '\""); + if (!q1) { + len = strlen(str); + memcpy(copy, str, len); + copy[len] = '\0'; + *params = NULL; + return 0; + } + len = q1 - str; + if (*q1 == ' ') { + memcpy(copy, str, len); + copy[len] = '\0'; + *params = str + len; + return 0; + } + + memcpy(copy, str, len); + copy += len; + + /* search for the matching closing quote */ + str = q1 + 1; + q2 = strchr(str, *q1); + if (!q2) { + CERROR("Unbalanced quota in parameters: \"%s\"\n", + *params); + return -EINVAL; + } + len = q2 - str; + memcpy(copy, str, len); + copy += len; + str = q2 + 1; + } + return 1; +} +EXPORT_SYMBOL(class_get_next_param); + +/* + * returns 0 if this is the first key in the buffer, else 1. + * valp points to first char after key. 
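+ *
+ * For example (illustrative values): with buf = "failover.node=192.168.0.1@tcp"
+ * and key = "failover.node=", it returns 0 and *valp points at
+ * "192.168.0.1@tcp".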
+ */ +int class_match_param(char *buf, const char *key, char **valp) +{ + if (!buf) + return 1; + + if (memcmp(buf, key, strlen(key)) != 0) + return 1; + + if (valp) + *valp = buf + strlen(key); + + return 0; +} +EXPORT_SYMBOL(class_match_param); + +static int parse_nid(char *buf, void *value, int quiet) +{ + lnet_nid_t *nid = (lnet_nid_t *)value; + + *nid = libcfs_str2nid(buf); + if (*nid != LNET_NID_ANY) + return 0; + + if (!quiet) + LCONSOLE_ERROR_MSG(0x159, "Can't parse NID '%s'\n", buf); + return -EINVAL; +} + +static int parse_net(char *buf, void *value) +{ + __u32 *net = (__u32 *)value; + + *net = libcfs_str2net(buf); + CDEBUG(D_INFO, "Net %s\n", libcfs_net2str(*net)); + return 0; +} + +enum { + CLASS_PARSE_NID = 1, + CLASS_PARSE_NET, +}; + +/* + * 0 is good NID, + * 1 not found + * < 0 error + * endh is set to next separator + */ +static int class_parse_value(char *buf, int opc, void *value, char **endh, + int quiet) +{ + char *endp; + char tmp; + int rc = 0; + + if (!buf) + return 1; + while (*buf == ',' || *buf == ':') + buf++; + if (*buf == ' ' || *buf == '/' || *buf == '\0') + return 1; + + /* NID separators or end of NIDs */ + endp = strpbrk(buf, ",: /"); + if (!endp) + endp = buf + strlen(buf); + + tmp = *endp; + *endp = '\0'; + switch (opc) { + default: + LBUG(); + case CLASS_PARSE_NID: + rc = parse_nid(buf, value, quiet); + break; + case CLASS_PARSE_NET: + rc = parse_net(buf, value); + break; + } + *endp = tmp; + if (rc != 0) + return rc; + if (endh) + *endh = endp; + return 0; +} + +int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh) +{ + return class_parse_value(buf, CLASS_PARSE_NID, (void *)nid, endh, 0); +} +EXPORT_SYMBOL(class_parse_nid); + +int class_parse_nid_quiet(char *buf, lnet_nid_t *nid, char **endh) +{ + return class_parse_value(buf, CLASS_PARSE_NID, (void *)nid, endh, 1); +} +EXPORT_SYMBOL(class_parse_nid_quiet); + +int class_parse_net(char *buf, __u32 *net, char **endh) +{ + return class_parse_value(buf, CLASS_PARSE_NET, (void *)net, endh, 0); +} + +/* + * 1 param contains key and match + * 0 param contains key and not match + * -1 param does not contain key + */ +int class_match_nid(char *buf, char *key, lnet_nid_t nid) +{ + lnet_nid_t tmp; + int rc = -1; + + while (class_find_param(buf, key, &buf) == 0) { + /* + * please restrict to the NIDs pertaining to + * the specified NIDs + */ + while (class_parse_nid(buf, &tmp, &buf) == 0) { + if (tmp == nid) + return 1; + } + rc = 0; + } + return rc; +} + +int class_match_net(char *buf, char *key, __u32 net) +{ + __u32 tmp; + int rc = -1; + + while (class_find_param(buf, key, &buf) == 0) { + /* + * please restrict to the NIDs pertaining to + * the specified networks + */ + while (class_parse_net(buf, &tmp, &buf) == 0) { + if (tmp == net) + return 1; + } + rc = 0; + } + return rc; +} + +char *lustre_cfg_string(struct lustre_cfg *lcfg, u32 index) +{ + char *s; + + if (!lcfg->lcfg_buflens[index]) + return NULL; + + s = lustre_cfg_buf(lcfg, index); + if (!s) + return NULL; + + /* + * make sure it's NULL terminated, even if this kills a char + * of data. Try to use the padding first though. 
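+ *
+ * e.g. an unterminated buffer of lcfg_buflens 5 gets its NUL written
+ * into the 8-byte-alignment padding (last is clamped to the buflen),
+ * so nothing is lost; only an unterminated buffer that exactly fills
+ * its aligned size has its final character overwritten, which the
+ * CWARN below reports.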
+ */ + if (s[lcfg->lcfg_buflens[index] - 1] != '\0') { + size_t last = ALIGN(lcfg->lcfg_buflens[index], 8) - 1; + char lost; + + /* Use the smaller value */ + if (last > lcfg->lcfg_buflens[index]) + last = lcfg->lcfg_buflens[index]; + + lost = s[last]; + s[last] = '\0'; + if (lost != '\0') { + CWARN("Truncated buf %d to '%s' (lost '%c'...)\n", + index, s, lost); + } + } + return s; +} +EXPORT_SYMBOL(lustre_cfg_string); + +/********************** class fns **********************/ + +/** + * Create a new OBD device and set the type, name and uuid. If successful, + * the new device can be accessed by either name or uuid. + */ +int class_attach(struct lustre_cfg *lcfg) +{ + struct obd_export *exp; + struct obd_device *obd = NULL; + char *typename, *name, *uuid; + int rc, len; + + ENTRY; + + if (!LUSTRE_CFG_BUFLEN(lcfg, 1)) { + CERROR("No type passed!\n"); + RETURN(-EINVAL); + } + typename = lustre_cfg_string(lcfg, 1); + + if (!LUSTRE_CFG_BUFLEN(lcfg, 0)) { + CERROR("No name passed!\n"); + RETURN(-EINVAL); + } + name = lustre_cfg_string(lcfg, 0); + if (!LUSTRE_CFG_BUFLEN(lcfg, 2)) { + CERROR("No UUID passed!\n"); + RETURN(-EINVAL); + } + + uuid = lustre_cfg_string(lcfg, 2); + len = strlen(uuid); + if (len >= sizeof(obd->obd_uuid)) { + CERROR("%s: uuid must be < %d bytes long\n", + name, (int)sizeof(obd->obd_uuid)); + RETURN(-EINVAL); + } + + obd = class_newdev(typename, name, uuid); + if (IS_ERR(obd)) { /* Already exists or out of obds */ + rc = PTR_ERR(obd); + CERROR("Cannot create device %s of type %s : %d\n", + name, typename, rc); + RETURN(rc); + } + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, + "obd %p obd_magic %08X != %08X\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(strncmp(obd->obd_name, name, strlen(name)) == 0, + "%p obd_name %s != %s\n", obd, obd->obd_name, name); + + exp = class_new_export_self(obd, &obd->obd_uuid); + if (IS_ERR(exp)) { + rc = PTR_ERR(exp); + class_free_dev(obd); + RETURN(rc); + } + + obd->obd_self_export = exp; + list_del_init(&exp->exp_obd_chain_timed); + class_export_put(exp); + + rc = class_register_device(obd); + if (rc != 0) { + class_decref(obd, "newdev", obd); + RETURN(rc); + } + + obd->obd_attached = 1; + CDEBUG(D_IOCTL, "OBD: dev %d attached type %s with refcount %d\n", + obd->obd_minor, typename, atomic_read(&obd->obd_refcount)); + + RETURN(0); +} +EXPORT_SYMBOL(class_attach); + +/** + * Create hashes, self-export, and call type-specific setup. + * Setup is effectively the "start this obd" call. + */ +int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + int err = 0; + + ENTRY; + + LASSERT(obd != NULL); + LASSERTF(obd == class_num2obd(obd->obd_minor), + "obd %p != obd_devs[%d] %p\n", + obd, obd->obd_minor, class_num2obd(obd->obd_minor)); + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, + "obd %p obd_magic %08x != %08x\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + + /* have we attached a type to this device? */ + if (!obd->obd_attached) { + CERROR("Device %d not attached\n", obd->obd_minor); + RETURN(-ENODEV); + } + + if (obd->obd_set_up) { + CERROR("Device %d already setup (type %s)\n", + obd->obd_minor, obd->obd_type->typ_name); + RETURN(-EEXIST); + } + + /* is someone else setting us up right now? (attach inits spinlock) */ + spin_lock(&obd->obd_dev_lock); + if (obd->obd_starting) { + spin_unlock(&obd->obd_dev_lock); + CERROR("Device %d setup in progress (type %s)\n", + obd->obd_minor, obd->obd_type->typ_name); + RETURN(-EEXIST); + } + /* + * just leave this on forever. 
I can't use obd_set_up here because + * other fns check that status, and we're not actually set up yet. + */ + obd->obd_starting = 1; + obd->obd_nid_stats_hash = NULL; + obd->obd_gen_hash = NULL; + spin_unlock(&obd->obd_dev_lock); + + /* create an uuid-export lustre hash */ + err = rhashtable_init(&obd->obd_uuid_hash, &uuid_hash_params); + if (err) + GOTO(err_starting, err); + +#ifdef HAVE_SERVER_SUPPORT + /* create a nid-export lustre hash */ + err = rhltable_init(&obd->obd_nid_hash, &nid_hash_params); + if (err) + GOTO(err_uuid_hash, err = -ENOMEM); + + /* create a nid-stats lustre hash */ + obd->obd_nid_stats_hash = cfs_hash_create("NID_STATS", + HASH_NID_STATS_CUR_BITS, + HASH_NID_STATS_MAX_BITS, + HASH_NID_STATS_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &nid_stat_hash_ops, + CFS_HASH_DEFAULT); + if (!obd->obd_nid_stats_hash) + GOTO(err_nid_hash, err = -ENOMEM); + + /* create a client_generation-export lustre hash */ + obd->obd_gen_hash = cfs_hash_create("UUID_HASH", + HASH_GEN_CUR_BITS, + HASH_GEN_MAX_BITS, + HASH_GEN_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &gen_hash_ops, CFS_HASH_DEFAULT); + if (!obd->obd_gen_hash) + GOTO(err_nid_stats_hash, err = -ENOMEM); +#endif /* HAVE_SERVER_SUPPORT */ + + err = obd_setup(obd, lcfg); + if (err) +#ifdef HAVE_SERVER_SUPPORT + GOTO(err_gen_hash, err); +#else + GOTO(err_uuid_hash, err); +#endif /* ! HAVE_SERVER_SUPPORT */ + + obd->obd_set_up = 1; + + spin_lock(&obd->obd_dev_lock); + /* cleanup drops this */ + class_incref(obd, "setup", obd); + spin_unlock(&obd->obd_dev_lock); + + CDEBUG(D_IOCTL, "finished setup of obd %s (uuid %s)\n", + obd->obd_name, obd->obd_uuid.uuid); + + RETURN(0); + +#ifdef HAVE_SERVER_SUPPORT +err_gen_hash: + if (obd->obd_gen_hash) { + cfs_hash_putref(obd->obd_gen_hash); + obd->obd_gen_hash = NULL; + } +err_nid_stats_hash: + if (obd->obd_nid_stats_hash) { + cfs_hash_putref(obd->obd_nid_stats_hash); + obd->obd_nid_stats_hash = NULL; + } +err_nid_hash: + rhltable_destroy(&obd->obd_nid_hash); +#endif /* HAVE_SERVER_SUPPORT */ +err_uuid_hash: + rhashtable_destroy(&obd->obd_uuid_hash); +err_starting: + obd->obd_starting = 0; + CERROR("setup %s failed (%d)\n", obd->obd_name, err); + return err; +} +EXPORT_SYMBOL(class_setup); + +/** + * We have finished using this OBD and are ready to destroy it. + * There can be no more references to this obd. + */ +int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + ENTRY; + + if (obd->obd_set_up) { + CERROR("OBD device %d still set up\n", obd->obd_minor); + RETURN(-EBUSY); + } + + spin_lock(&obd->obd_dev_lock); + if (!obd->obd_attached) { + spin_unlock(&obd->obd_dev_lock); + CERROR("OBD device %d not attached\n", obd->obd_minor); + RETURN(-ENODEV); + } + obd->obd_attached = 0; + spin_unlock(&obd->obd_dev_lock); + + /* cleanup in progress. we don't like to find this device after now */ + class_unregister_device(obd); + + CDEBUG(D_IOCTL, "detach on obd %s (uuid %s)\n", + obd->obd_name, obd->obd_uuid.uuid); + + class_decref(obd, "newdev", obd); + + RETURN(0); +} +EXPORT_SYMBOL(class_detach); + +/** + * Start shutting down the OBD. There may be in-progess ops when + * this is called. We tell them to start shutting down with a call + * to class_disconnect_exports(). 
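+ *
+ * The optional lcfg buffer 1 carries one-letter flags parsed below
+ * (sketch of the cases handled):
+ *
+ *   "F" - force cleanup, sets obd_force
+ *   "A" - failover mode, sets obd_fail/obd_no_recov and syncs via
+ *         OBD_IOC_SYNC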
+ */ +int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + int err = 0; + char *flag; + ENTRY; + + OBD_RACE(OBD_FAIL_LDLM_RECOV_CLIENTS); + + if (!obd->obd_set_up) { + CERROR("Device %d not setup\n", obd->obd_minor); + RETURN(-ENODEV); + } + + spin_lock(&obd->obd_dev_lock); + if (obd->obd_stopping) { + spin_unlock(&obd->obd_dev_lock); + CERROR("OBD %d already stopping\n", obd->obd_minor); + RETURN(-ENODEV); + } + /* Leave this on forever */ + obd->obd_stopping = 1; + spin_unlock(&obd->obd_dev_lock); + + /* wait for already-arrived-connections to finish. */ + while (obd->obd_conn_inprogress > 0) + yield(); + smp_rmb(); + + if (lcfg->lcfg_bufcount >= 2 && LUSTRE_CFG_BUFLEN(lcfg, 1) > 0) { + for (flag = lustre_cfg_string(lcfg, 1); *flag != 0; flag++) + switch (*flag) { + case 'F': + obd->obd_force = 1; + break; + case 'A': + LCONSOLE_WARN("Failing over %s\n", + obd->obd_name); + spin_lock(&obd->obd_dev_lock); + obd->obd_fail = 1; +#ifdef HAVE_SERVER_SUPPORT + obd->obd_no_transno = 1; +#endif + obd->obd_no_recov = 1; + spin_unlock(&obd->obd_dev_lock); + if (OBP(obd, iocontrol)) { + obd_iocontrol(OBD_IOC_SYNC, + obd->obd_self_export, + 0, NULL, NULL); + } + break; + default: + CERROR("Unrecognised flag '%c'\n", *flag); + } + } + + LASSERT(obd->obd_self_export); + + CDEBUG(D_IOCTL, "%s: forcing exports to disconnect: %d/%d\n", + obd->obd_name, obd->obd_num_exports, + atomic_read(&obd->obd_refcount) - 2); + dump_exports(obd, 0, D_HA); + class_disconnect_exports(obd); + + /* Precleanup, we must make sure all exports get destroyed. */ + err = obd_precleanup(obd); + if (err) + CERROR("Precleanup %s returned %d\n", + obd->obd_name, err); + + /* destroy an uuid-export hash body */ + rhashtable_free_and_destroy(&obd->obd_uuid_hash, obd_export_exit, + NULL); +#ifdef HAVE_SERVER_SUPPORT + /* destroy a nid-export hash body */ + rhltable_free_and_destroy(&obd->obd_nid_hash, nid_export_exit, NULL); + + /* destroy a nid-stats hash body */ + if (obd->obd_nid_stats_hash) { + cfs_hash_putref(obd->obd_nid_stats_hash); + obd->obd_nid_stats_hash = NULL; + } + + /* destroy a client_generation-export hash body */ + if (obd->obd_gen_hash) { + cfs_hash_putref(obd->obd_gen_hash); + obd->obd_gen_hash = NULL; + } +#endif /* HAVE_SERVER_SUPPORT */ + class_decref(obd, "setup", obd); + obd->obd_set_up = 0; + + RETURN(0); +} + +struct obd_device *class_incref(struct obd_device *obd, + const char *scope, + const void *source) +{ + lu_ref_add_atomic(&obd->obd_reference, scope, source); + atomic_inc(&obd->obd_refcount); + CDEBUG(D_INFO, "incref %s (%p) now %d - %s\n", obd->obd_name, obd, + atomic_read(&obd->obd_refcount), scope); + + return obd; +} +EXPORT_SYMBOL(class_incref); + +void class_decref(struct obd_device *obd, const char *scope, const void *source) +{ + int last; + + CDEBUG(D_INFO, "Decref %s (%p) now %d - %s\n", obd->obd_name, obd, + atomic_read(&obd->obd_refcount), scope); + + LASSERT(obd->obd_num_exports >= 0); + last = atomic_dec_and_test(&obd->obd_refcount); + lu_ref_del(&obd->obd_reference, scope, source); + + if (last) { + struct obd_export *exp; + + LASSERT(!obd->obd_attached); + /* + * All exports have been destroyed; there should + * be no more in-progress ops by this point. + */ + exp = obd->obd_self_export; + + if (exp) { + exp->exp_flags |= exp_flags_from_obd(obd); + class_unlink_export(exp); + } + } +} +EXPORT_SYMBOL(class_decref); + +/** + * Add a failover NID location. + * Client OBD types contact server OBD types using this NID list. 
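+ *
+ * e.g. a config log record of the form (names and UUID illustrative):
+ *
+ *   add_conn 0:lustre-OST0000-osc 1:192.168.0.2@tcp_UUID
+ *
+ * ends up here with the target UUID in buffer 1; lcfg_num is passed
+ * through to obd_add_conn() unchanged.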
+ */ +int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct obd_import *imp; + struct obd_uuid uuid; + int rc; + + ENTRY; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 || + LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(struct obd_uuid)) { + CERROR("invalid conn_uuid\n"); + RETURN(-EINVAL); + } + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSP_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_LWP_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_MGC_NAME)) { + CERROR("can't add connection on non-client dev\n"); + RETURN(-EINVAL); + } + + imp = obd->u.cli.cl_import; + if (!imp) { + CERROR("try to add conn on immature client dev\n"); + RETURN(-EINVAL); + } + + obd_str2uuid(&uuid, lustre_cfg_string(lcfg, 1)); + rc = obd_add_conn(imp, &uuid, lcfg->lcfg_num); + + RETURN(rc); +} + +/** Remove a failover NID location. */ +static int class_del_conn(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct obd_import *imp; + struct obd_uuid uuid; + int rc; + + ENTRY; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 || + LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(struct obd_uuid)) { + CERROR("invalid conn_uuid\n"); + RETURN(-EINVAL); + } + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME)) { + CERROR("can't del connection on non-client dev\n"); + RETURN(-EINVAL); + } + + imp = obd->u.cli.cl_import; + if (!imp) { + CERROR("try to del conn on immature client dev\n"); + RETURN(-EINVAL); + } + + obd_str2uuid(&uuid, lustre_cfg_string(lcfg, 1)); + rc = obd_del_conn(imp, &uuid); + + RETURN(rc); +} + +static LIST_HEAD(lustre_profile_list); +static DEFINE_SPINLOCK(lustre_profile_list_lock); + +static struct lustre_profile *class_get_profile_nolock(const char *prof) +{ + struct lustre_profile *lprof; + + ENTRY; + list_for_each_entry(lprof, &lustre_profile_list, lp_list) { + if (strcmp(lprof->lp_profile, prof) == 0) { + lprof->lp_refs++; + RETURN(lprof); + } + } + RETURN(NULL); +} + +struct lustre_profile *class_get_profile(const char *prof) +{ + struct lustre_profile *lprof; + + ENTRY; + spin_lock(&lustre_profile_list_lock); + lprof = class_get_profile_nolock(prof); + spin_unlock(&lustre_profile_list_lock); + RETURN(lprof); +} +EXPORT_SYMBOL(class_get_profile); + +/** + * Create a named "profile". + * This defines the MDC and OSC names to use for a client. + * This also is used to define the LOV to be used by a MDT. 
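+ *
+ * e.g. for a client mount of filesystem "lustre" (names illustrative,
+ * following the clilov/clilmv convention used elsewhere in this file):
+ *
+ *   lp_profile = "lustre-client"
+ *   lp_dt      = "lustre-clilov"   (data/OSC-side device)
+ *   lp_md      = "lustre-clilmv"   (metadata/MDC-side device, optional)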
+ */ +static int class_add_profile(int proflen, char *prof, int osclen, char *osc, + int mdclen, char *mdc) +{ + struct lustre_profile *lprof; + int err = 0; + + ENTRY; + + CDEBUG(D_CONFIG, "Add profile %s\n", prof); + + OBD_ALLOC(lprof, sizeof(*lprof)); + if (!lprof) + RETURN(-ENOMEM); + INIT_LIST_HEAD(&lprof->lp_list); + + LASSERT(proflen == (strlen(prof) + 1)); + OBD_ALLOC(lprof->lp_profile, proflen); + if (!lprof->lp_profile) + GOTO(out, err = -ENOMEM); + memcpy(lprof->lp_profile, prof, proflen); + + LASSERT(osclen == (strlen(osc) + 1)); + OBD_ALLOC(lprof->lp_dt, osclen); + if (!lprof->lp_dt) + GOTO(out, err = -ENOMEM); + memcpy(lprof->lp_dt, osc, osclen); + + if (mdclen > 0) { + LASSERT(mdclen == (strlen(mdc) + 1)); + OBD_ALLOC(lprof->lp_md, mdclen); + if (!lprof->lp_md) + GOTO(out, err = -ENOMEM); + memcpy(lprof->lp_md, mdc, mdclen); + } + + spin_lock(&lustre_profile_list_lock); + lprof->lp_refs = 1; + lprof->lp_list_deleted = false; + + list_add(&lprof->lp_list, &lustre_profile_list); + spin_unlock(&lustre_profile_list_lock); + RETURN(err); + +out: + if (lprof->lp_md) + OBD_FREE(lprof->lp_md, mdclen); + if (lprof->lp_dt) + OBD_FREE(lprof->lp_dt, osclen); + if (lprof->lp_profile) + OBD_FREE(lprof->lp_profile, proflen); + OBD_FREE(lprof, sizeof(*lprof)); + RETURN(err); +} + +void class_del_profile(const char *prof) +{ + struct lustre_profile *lprof; + + ENTRY; + + CDEBUG(D_CONFIG, "Del profile %s\n", prof); + + spin_lock(&lustre_profile_list_lock); + lprof = class_get_profile_nolock(prof); + if (lprof) { + /* because get profile increments the ref counter */ + lprof->lp_refs--; + list_del(&lprof->lp_list); + lprof->lp_list_deleted = true; + spin_unlock(&lustre_profile_list_lock); + + class_put_profile(lprof); + } else { + spin_unlock(&lustre_profile_list_lock); + } + EXIT; +} +EXPORT_SYMBOL(class_del_profile); + +void class_put_profile(struct lustre_profile *lprof) +{ + spin_lock(&lustre_profile_list_lock); + if ((--lprof->lp_refs) > 0) { + LASSERT(lprof->lp_refs > 0); + spin_unlock(&lustre_profile_list_lock); + return; + } + spin_unlock(&lustre_profile_list_lock); + + /* confirm not a negative number */ + LASSERT(lprof->lp_refs == 0); + + /* + * At least one class_del_profile/profiles must be called + * on the target profile or lustre_profile_list will corrupt + */ + LASSERT(lprof->lp_list_deleted); + OBD_FREE(lprof->lp_profile, strlen(lprof->lp_profile) + 1); + OBD_FREE(lprof->lp_dt, strlen(lprof->lp_dt) + 1); + if (lprof->lp_md) + OBD_FREE(lprof->lp_md, strlen(lprof->lp_md) + 1); + OBD_FREE(lprof, sizeof(*lprof)); +} +EXPORT_SYMBOL(class_put_profile); + +/* COMPAT_146 */ +void class_del_profiles(void) +{ + struct lustre_profile *lprof, *n; + ENTRY; + + spin_lock(&lustre_profile_list_lock); + list_for_each_entry_safe(lprof, n, &lustre_profile_list, lp_list) { + list_del(&lprof->lp_list); + lprof->lp_list_deleted = true; + spin_unlock(&lustre_profile_list_lock); + + class_put_profile(lprof); + + spin_lock(&lustre_profile_list_lock); + } + spin_unlock(&lustre_profile_list_lock); + EXIT; +} +EXPORT_SYMBOL(class_del_profiles); + +/* + * We can't call lquota_process_config directly because + * it lives in a module that must be loaded after this one. + */ +#ifdef HAVE_SERVER_SUPPORT +static int (*quota_process_config)(struct lustre_cfg *lcfg) = NULL; +#endif /* HAVE_SERVER_SUPPORT */ + +/** + * Rename the proc parameter in \a cfg with a new name \a new_name. 
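+ *
+ * e.g. renaming buffer 1 "mdt.identity_upcall=NONE" with new_name
+ * "identity_upcall" produces a config whose buffer 1 reads
+ * "identity_upcall=NONE" (parameter names illustrative): the old name
+ * before '=' is dropped and the value is carried over.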
+ * + * \param cfg config structure which contains the proc parameter + * \param new_name new name of the proc parameter + * + * \retval valid-pointer pointer to the newly-allocated config structure + * which contains the renamed proc parameter + * \retval ERR_PTR(-EINVAL) if \a cfg or \a new_name is NULL, or \a cfg does + * not contain a proc parameter + * \retval ERR_PTR(-ENOMEM) if memory allocation failure occurs + */ +struct lustre_cfg *lustre_cfg_rename(struct lustre_cfg *cfg, + const char *new_name) +{ + struct lustre_cfg_bufs *bufs = NULL; + struct lustre_cfg *new_cfg = NULL; + char *param = NULL; + char *new_param = NULL; + char *value = NULL; + int name_len = 0; + int new_len = 0; + + ENTRY; + + if (!cfg || !new_name) + GOTO(out_nocfg, new_cfg = ERR_PTR(-EINVAL)); + + param = lustre_cfg_string(cfg, 1); + if (!param) + GOTO(out_nocfg, new_cfg = ERR_PTR(-EINVAL)); + + value = strchr(param, '='); + if (value) + name_len = value - param; + else + name_len = strlen(param); + + new_len = LUSTRE_CFG_BUFLEN(cfg, 1) + strlen(new_name) - name_len; + + OBD_ALLOC(new_param, new_len); + if (!new_param) + GOTO(out_nocfg, new_cfg = ERR_PTR(-ENOMEM)); + + strlcpy(new_param, new_name, new_len); + if (value) + strcat(new_param, value); + + OBD_ALLOC_PTR(bufs); + if (!bufs) + GOTO(out_free_param, new_cfg = ERR_PTR(-ENOMEM)); + + lustre_cfg_bufs_reset(bufs, NULL); + lustre_cfg_bufs_init(bufs, cfg); + lustre_cfg_bufs_set_string(bufs, 1, new_param); + + OBD_ALLOC(new_cfg, lustre_cfg_len(bufs->lcfg_bufcount, + bufs->lcfg_buflen)); + if (!new_cfg) + GOTO(out_free_buf, new_cfg = ERR_PTR(-ENOMEM)); + + lustre_cfg_init(new_cfg, cfg->lcfg_command, bufs); + + new_cfg->lcfg_num = cfg->lcfg_num; + new_cfg->lcfg_flags = cfg->lcfg_flags; + new_cfg->lcfg_nid = cfg->lcfg_nid; + new_cfg->lcfg_nal = cfg->lcfg_nal; +out_free_buf: + OBD_FREE_PTR(bufs); +out_free_param: + OBD_FREE(new_param, new_len); +out_nocfg: + RETURN(new_cfg); +} +EXPORT_SYMBOL(lustre_cfg_rename); + +static ssize_t process_param2_config(struct lustre_cfg *lcfg) +{ + char *param = lustre_cfg_string(lcfg, 1); + char *upcall = lustre_cfg_string(lcfg, 2); + struct kobject *kobj = NULL; + const char *subsys = param; + char *newparam = NULL; + char *argv[] = { + [0] = "/usr/sbin/lctl", + [1] = "set_param", + [2] = param, + [3] = NULL + }; + ktime_t start; + ktime_t end; + size_t len; + int rc; + + ENTRY; + print_lustre_cfg(lcfg); + + len = strcspn(param, ".="); + if (!len) + RETURN(-EINVAL); + + /* If we find '=' then its the top level sysfs directory */ + if (param[len] == '=') + RETURN(class_set_global(param)); + + subsys = kstrndup(param, len, GFP_KERNEL); + if (!subsys) + RETURN(-ENOMEM); + + kobj = kset_find_obj(lustre_kset, subsys); + kfree(subsys); + if (kobj) { + char *value = param; + char *envp[4]; + int i; + + param = strsep(&value, "="); + envp[0] = kasprintf(GFP_KERNEL, "PARAM=%s", param); + envp[1] = kasprintf(GFP_KERNEL, "SETTING=%s", value); + envp[2] = kasprintf(GFP_KERNEL, "TIME=%lld", + ktime_get_real_seconds()); + envp[3] = NULL; + + rc = kobject_uevent_env(kobj, KOBJ_CHANGE, envp); + for (i = 0; i < ARRAY_SIZE(envp); i++) + kfree(envp[i]); + + kobject_put(kobj); + + RETURN(rc); + } + + /* Add upcall processing here. Now only lctl is supported */ + if (strcmp(upcall, LCTL_UPCALL) != 0) { + CERROR("Unsupported upcall %s\n", upcall); + RETURN(-EINVAL); + } + + /* root_squash and nosquash_nids settings must be applied to + * global subsystem (*.) so that it is taken into account by + * both client and server sides. 
So do the equivalent of a + * 's / mdt. / *. /'. + */ + if ((strstr(param, PARAM_NOSQUASHNIDS) || + strstr(param, PARAM_ROOTSQUASH)) && + (param[0] != '*' || param[1] != '.')) { + newparam = kmalloc(strlen(param) + 1, GFP_NOFS); + if (!newparam) + RETURN(-ENOMEM); + + snprintf(newparam, strlen(param) + 1, "*%s", param + len); + argv[2] = (char *)newparam; + } + + start = ktime_get(); + rc = call_usermodehelper(argv[0], argv, NULL, UMH_WAIT_PROC); + end = ktime_get(); + + if (rc < 0) { + CERROR("lctl: error invoking upcall %s %s %s: rc = %d; " + "time %ldus\n", argv[0], argv[1], argv[2], rc, + (long)ktime_us_delta(end, start)); + } else { + CDEBUG(D_HA, "lctl: invoked upcall %s %s %s, time %ldus\n", + argv[0], argv[1], argv[2], + (long)ktime_us_delta(end, start)); + rc = 0; + } + + kfree(newparam); + RETURN(rc); +} + +#ifdef HAVE_SERVER_SUPPORT +void lustre_register_quota_process_config(int (*qpc)(struct lustre_cfg *lcfg)) +{ + quota_process_config = qpc; +} +EXPORT_SYMBOL(lustre_register_quota_process_config); +#endif /* HAVE_SERVER_SUPPORT */ + +/** + * Process configuration commands given in lustre_cfg form. + * These may come from direct calls (e.g. class_manual_cleanup) + * or processing the config llog, or ioctl from lctl. + */ +int class_process_config(struct lustre_cfg *lcfg) +{ + struct obd_device *obd; + int err; + + LASSERT(lcfg && !IS_ERR(lcfg)); + CDEBUG(D_IOCTL, "processing cmd: %x\n", lcfg->lcfg_command); + + /* Commands that don't need a device */ + switch (lcfg->lcfg_command) { + case LCFG_ATTACH: { + err = class_attach(lcfg); + GOTO(out, err); + } + case LCFG_ADD_UUID: { + CDEBUG(D_IOCTL, + "adding mapping from uuid %s to nid %#llx (%s)\n", + lustre_cfg_string(lcfg, 1), lcfg->lcfg_nid, + libcfs_nid2str(lcfg->lcfg_nid)); + + err = class_add_uuid(lustre_cfg_string(lcfg, 1), + lcfg->lcfg_nid); + GOTO(out, err); + } + case LCFG_DEL_UUID: { + CDEBUG(D_IOCTL, "removing mappings for uuid %s\n", + (lcfg->lcfg_bufcount < 2 || LUSTRE_CFG_BUFLEN(lcfg, 1) == + 0) ? "" : lustre_cfg_string(lcfg, 1)); + + err = class_del_uuid(lustre_cfg_string(lcfg, 1)); + GOTO(out, err); + } + case LCFG_MOUNTOPT: { + CDEBUG(D_IOCTL, "mountopt: profile %s osc %s mdc %s\n", + lustre_cfg_string(lcfg, 1), + lustre_cfg_string(lcfg, 2), + lustre_cfg_string(lcfg, 3)); + /* + * set these mount options somewhere, so ll_fill_super + * can find them. 
+ */ + err = class_add_profile(LUSTRE_CFG_BUFLEN(lcfg, 1), + lustre_cfg_string(lcfg, 1), + LUSTRE_CFG_BUFLEN(lcfg, 2), + lustre_cfg_string(lcfg, 2), + LUSTRE_CFG_BUFLEN(lcfg, 3), + lustre_cfg_string(lcfg, 3)); + GOTO(out, err); + } + case LCFG_DEL_MOUNTOPT: { + CDEBUG(D_IOCTL, "mountopt: profile %s\n", + lustre_cfg_string(lcfg, 1)); + class_del_profile(lustre_cfg_string(lcfg, 1)); + GOTO(out, err = 0); + } + case LCFG_SET_TIMEOUT: { + CDEBUG(D_IOCTL, "changing lustre timeout from %d to %d\n", + obd_timeout, lcfg->lcfg_num); + obd_timeout = max(lcfg->lcfg_num, 1U); + obd_timeout_set = 1; + GOTO(out, err = 0); + } + case LCFG_SET_LDLM_TIMEOUT: { + CDEBUG(D_IOCTL, "changing lustre ldlm_timeout from %d to %d\n", + ldlm_timeout, lcfg->lcfg_num); + ldlm_timeout = max(lcfg->lcfg_num, 1U); + if (ldlm_timeout >= obd_timeout) + ldlm_timeout = max(obd_timeout / 3, 1U); + ldlm_timeout_set = 1; + GOTO(out, err = 0); + } + case LCFG_SET_UPCALL: { + LCONSOLE_ERROR_MSG(0x15a, "recovery upcall is deprecated\n"); + /* COMPAT_146 Don't fail on old configs */ + GOTO(out, err = 0); + } + case LCFG_MARKER: { + struct cfg_marker *marker; + + marker = lustre_cfg_buf(lcfg, 1); + CDEBUG(D_IOCTL, "marker %d (%#x) %.16s %s\n", marker->cm_step, + marker->cm_flags, marker->cm_tgtname, + marker->cm_comment); + GOTO(out, err = 0); + } + case LCFG_PARAM: { + char *tmp; + + /* llite has no OBD */ + if (class_match_param(lustre_cfg_string(lcfg, 1), + PARAM_LLITE, NULL) == 0) { + struct lustre_sb_info *lsi; + unsigned long addr; + ssize_t count; + + /* + * The instance name contains the sb: + * lustre-client-aacfe000 + */ + tmp = strrchr(lustre_cfg_string(lcfg, 0), '-'); + if (!tmp || !*(++tmp)) + GOTO(out, err = -EINVAL); + + if (sscanf(tmp, "%lx", &addr) != 1) + GOTO(out, err = -EINVAL); + + lsi = s2lsi((struct super_block *)addr); + /* This better be a real Lustre superblock! */ + LASSERT(lsi->lsi_lmd->lmd_magic == LMD_MAGIC); + + count = class_modify_config(lcfg, PARAM_LLITE, + lsi->lsi_kobj); + err = count < 0 ? count : 0; + GOTO(out, err); + } else if ((class_match_param(lustre_cfg_string(lcfg, 1), + PARAM_SYS, &tmp) == 0)) { + /* Global param settings */ + err = class_set_global(tmp); + /* + * Client or server should not fail to mount if + * it hits an unknown configuration parameter. 
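+ * A mountopt record carries a (profile, osc, mdc) name triple, e.g.
+ * 1:lustre-client 2:lustre-clilov 3:lustre-clilmv (illustrative), and
+ * is stored via class_add_profile() just below.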
+ */ + if (err < 0) + CWARN("Ignoring unknown param %s\n", tmp); + + GOTO(out, err = 0); +#ifdef HAVE_SERVER_SUPPORT + } else if ((class_match_param(lustre_cfg_string(lcfg, 1), + PARAM_QUOTA, &tmp) == 0) && + quota_process_config) { + err = (*quota_process_config)(lcfg); + GOTO(out, err); +#endif /* HAVE_SERVER_SUPPORT */ + } + + break; + } + case LCFG_SET_PARAM: { + err = process_param2_config(lcfg); + GOTO(out, err = 0); + } + } + /* Commands that require a device */ + obd = class_name2obd(lustre_cfg_string(lcfg, 0)); + if (!obd) { + if (!LUSTRE_CFG_BUFLEN(lcfg, 0)) + CERROR("this lcfg command requires a device name\n"); + else + CERROR("no device for: %s\n", + lustre_cfg_string(lcfg, 0)); + + GOTO(out, err = -EINVAL); + } + switch(lcfg->lcfg_command) { + case LCFG_SETUP: { + err = class_setup(obd, lcfg); + GOTO(out, err); + } + case LCFG_DETACH: { + err = class_detach(obd, lcfg); + GOTO(out, err = 0); + } + case LCFG_CLEANUP: { + err = class_cleanup(obd, lcfg); + GOTO(out, err = 0); + } + case LCFG_ADD_CONN: { + err = class_add_conn(obd, lcfg); + GOTO(out, err = 0); + } + case LCFG_DEL_CONN: { + err = class_del_conn(obd, lcfg); + GOTO(out, err = 0); + } + case LCFG_POOL_NEW: { + err = obd_pool_new(obd, lustre_cfg_string(lcfg, 2)); + GOTO(out, err = 0); + } + case LCFG_POOL_ADD: { + err = obd_pool_add(obd, lustre_cfg_string(lcfg, 2), + lustre_cfg_string(lcfg, 3)); + GOTO(out, err = 0); + } + case LCFG_POOL_REM: { + err = obd_pool_rem(obd, lustre_cfg_string(lcfg, 2), + lustre_cfg_string(lcfg, 3)); + GOTO(out, err = 0); + } + case LCFG_POOL_DEL: { + err = obd_pool_del(obd, lustre_cfg_string(lcfg, 2)); + GOTO(out, err = 0); + } + /* + * Process config log ADD_MDC record twice to add MDC also to LOV + * for Data-on-MDT: + * + * add 0:lustre-clilmv 1:lustre-MDT0000_UUID 2:0 3:1 + * 4:lustre-MDT0000-mdc_UUID + */ + case LCFG_ADD_MDC: { + struct obd_device *lov_obd; + char *clilmv; + + err = obd_process_config(obd, sizeof(*lcfg), lcfg); + if (err) + GOTO(out, err); + + /* make sure this is client LMV log entry */ + clilmv = strstr(lustre_cfg_string(lcfg, 0), "clilmv"); + if (!clilmv) + GOTO(out, err); + + /* + * replace 'lmv' with 'lov' name to address LOV device and + * process llog record to add MDC there. + */ + clilmv[4] = 'o'; + lov_obd = class_name2obd(lustre_cfg_string(lcfg, 0)); + if (lov_obd) { + err = obd_process_config(lov_obd, sizeof(*lcfg), lcfg); + } else { + err = -ENOENT; + CERROR("%s: Cannot find LOV by %s name, rc = %d\n", + obd->obd_name, lustre_cfg_string(lcfg, 0), err); + } + /* restore 'lmv' name */ + clilmv[4] = 'm'; + GOTO(out, err); + } + default: { + err = obd_process_config(obd, sizeof(*lcfg), lcfg); + GOTO(out, err); + } + } + EXIT; +out: + if ((err < 0) && !(lcfg->lcfg_command & LCFG_REQUIRED)) { + CWARN("Ignoring error %d on optional command %#x\n", err, + lcfg->lcfg_command); + err = 0; + } + return err; +} +EXPORT_SYMBOL(class_process_config); + +ssize_t class_modify_config(struct lustre_cfg *lcfg, const char *prefix, + struct kobject *kobj) +{ + const struct kobj_type *typ; + ssize_t count = 0; + int i; + + if (lcfg->lcfg_command != LCFG_PARAM) { + CERROR("Unknown command: %d\n", lcfg->lcfg_command); + return -EINVAL; + } + + typ = get_ktype(kobj); + if (!typ || !typ->default_groups) + return -ENODEV; + + print_lustre_cfg(lcfg); + + /* + * e.g. 
tunefs.lustre --param mdt.group_upcall=foo /r/tmp/lustre-mdt + * or lctl conf_param lustre-MDT0000.mdt.group_upcall=bar + * or lctl conf_param lustre-OST0000.osc.max_dirty_mb=36 + */ + for (i = 1; i < lcfg->lcfg_bufcount; i++) { + struct attribute *attr = NULL; + size_t keylen; + char *value; + char *key; + + key = lustre_cfg_buf(lcfg, i); + /* Strip off prefix */ + if (class_match_param(key, prefix, &key)) + /* + * If the prefix doesn't match, return error so we + * can pass it down the stack + */ + return -EINVAL; + + value = strchr(key, '='); + if (!value || *(value + 1) == 0) { + CERROR("%s: can't parse param '%s' (missing '=')\n", + lustre_cfg_string(lcfg, 0), + lustre_cfg_string(lcfg, i)); + /* continue parsing other params */ + continue; + } + keylen = value - key; + value++; + + attr = get_attr_starts_with(typ, key, keylen); + if (!attr) { + char *envp[4], *param, *path; + + path = kobject_get_path(kobj, GFP_KERNEL); + if (!path) + return -EINVAL; + + /* convert sysfs path to uevent format */ + param = path; + while ((param = strchr(param, '/')) != NULL) + *param = '.'; + + param = strstr(path, "fs.lustre.") + 10; + + envp[0] = kasprintf(GFP_KERNEL, "PARAM=%s.%.*s", + param, (int) keylen, key); + envp[1] = kasprintf(GFP_KERNEL, "SETTING=%s", value); + envp[2] = kasprintf(GFP_KERNEL, "TIME=%lld", + ktime_get_real_seconds()); + envp[3] = NULL; + + if (kobject_uevent_env(kobj, KOBJ_CHANGE, envp)) { + CERROR("%s: failed to send uevent %s\n", + kobject_name(kobj), key); + } + + for (i = 0; i < ARRAY_SIZE(envp); i++) + kfree(envp[i]); + kfree(path); + } else { + count += lustre_attr_store(kobj, attr, value, + strlen(value)); + } + } + return count; +} +EXPORT_SYMBOL(class_modify_config); + +/* + * Supplemental functions for config logs, it allocates lustre_cfg + * buffers plus initialized llog record header at the beginning. + */ +struct llog_cfg_rec *lustre_cfg_rec_new(int cmd, struct lustre_cfg_bufs *bufs) +{ + struct llog_cfg_rec *lcr; + int reclen; + + ENTRY; + + reclen = lustre_cfg_len(bufs->lcfg_bufcount, bufs->lcfg_buflen); + reclen = llog_data_len(reclen) + sizeof(struct llog_rec_hdr) + + sizeof(struct llog_rec_tail); + + OBD_ALLOC(lcr, reclen); + if (!lcr) + RETURN(NULL); + + lustre_cfg_init(&lcr->lcr_cfg, cmd, bufs); + + lcr->lcr_hdr.lrh_len = reclen; + lcr->lcr_hdr.lrh_type = OBD_CFG_REC; + + RETURN(lcr); +} +EXPORT_SYMBOL(lustre_cfg_rec_new); + +void lustre_cfg_rec_free(struct llog_cfg_rec *lcr) +{ + ENTRY; + OBD_FREE(lcr, lcr->lcr_hdr.lrh_len); + EXIT; +} +EXPORT_SYMBOL(lustre_cfg_rec_free); + +/** + * Parse a configuration llog, doing various manipulations on them + * for various reasons, (modifications for compatibility, skip obsolete + * records, change uuids, etc), then class_process_config() resulting + * net records. 
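+ *
+ * Typically installed as the llog_process() callback, either directly
+ * or as the default used by class_config_parse_llog() below, with a
+ * struct config_llog_instance passed through the data argument.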
+ */ +int class_config_llog_handler(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + struct config_llog_instance *cfg = data; + int cfg_len = rec->lrh_len; + char *cfg_buf = (char *) (rec + 1); + int rc = 0; + ENTRY; + + /* class_config_dump_handler(handle, rec, data); */ + + switch (rec->lrh_type) { + case OBD_CFG_REC: { + struct lustre_cfg *lcfg, *lcfg_new; + struct lustre_cfg_bufs bufs; + char *inst_name = NULL; + int inst_len = 0; + int swab = 0; + + lcfg = (struct lustre_cfg *)cfg_buf; + if (lcfg->lcfg_version == __swab32(LUSTRE_CFG_VERSION)) { + lustre_swab_lustre_cfg(lcfg); + swab = 1; + } + + rc = lustre_cfg_sanity_check(cfg_buf, cfg_len); + if (rc) + GOTO(out, rc); + + /* Figure out config state info */ + if (lcfg->lcfg_command == LCFG_MARKER) { + struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1); + lustre_swab_cfg_marker(marker, swab, + LUSTRE_CFG_BUFLEN(lcfg, 1)); + CDEBUG(D_CONFIG, "Marker, inst_flg=%#x mark_flg=%#x\n", + cfg->cfg_flags, marker->cm_flags); + if (marker->cm_flags & CM_START) { + /* all previous flags off */ + cfg->cfg_flags = CFG_F_MARKER; + server_name2index(marker->cm_tgtname, + &cfg->cfg_lwp_idx, NULL); + if (marker->cm_flags & CM_SKIP) { + cfg->cfg_flags |= CFG_F_SKIP; + CDEBUG(D_CONFIG, "SKIP #%d\n", + marker->cm_step); + } else if ((marker->cm_flags & CM_EXCLUDE) || + (cfg->cfg_sb && + lustre_check_exclusion(cfg->cfg_sb, + marker->cm_tgtname))) { + cfg->cfg_flags |= CFG_F_EXCLUDE; + CDEBUG(D_CONFIG, "EXCLUDE %d\n", + marker->cm_step); + } + } else if (marker->cm_flags & CM_END) { + cfg->cfg_flags = 0; + } + } + /* + * A config command without a start marker before it is + * illegal + */ + if (!(cfg->cfg_flags & CFG_F_MARKER) && + (lcfg->lcfg_command != LCFG_MARKER)) { + CWARN("Skip config outside markers, (inst: %016lx, uuid: %s, flags: %#x)\n", + cfg->cfg_instance, + cfg->cfg_uuid.uuid, cfg->cfg_flags); + cfg->cfg_flags |= CFG_F_SKIP; + } + if (cfg->cfg_flags & CFG_F_SKIP) { + CDEBUG(D_CONFIG, "skipping %#x\n", + cfg->cfg_flags); + rc = 0; + /* No processing! */ + break; + } + + /* + * For interoperability between 1.8 and 2.0, + * rename "mds" OBD device type to "mdt". 
+ */ + { + char *typename = lustre_cfg_string(lcfg, 1); + char *index = lustre_cfg_string(lcfg, 2); + + if ((lcfg->lcfg_command == LCFG_ATTACH && typename && + strcmp(typename, "mds") == 0)) { + CWARN("For 1.8 interoperability, rename obd " + "type from mds to mdt\n"); + typename[2] = 't'; + } + if ((lcfg->lcfg_command == LCFG_SETUP && index && + strcmp(index, "type") == 0)) { + CDEBUG(D_INFO, "For 1.8 interoperability, " + "set this index to '0'\n"); + index[0] = '0'; + index[1] = 0; + } + } + +#ifdef HAVE_SERVER_SUPPORT + /* newer MDS replaces LOV/OSC with LOD/OSP */ + if ((lcfg->lcfg_command == LCFG_ATTACH || + lcfg->lcfg_command == LCFG_SET_PARAM || + lcfg->lcfg_command == LCFG_PARAM) && + cfg->cfg_sb && IS_MDT(s2lsi(cfg->cfg_sb))) { + char *typename = lustre_cfg_string(lcfg, 1); + + if (typename && + strcmp(typename, LUSTRE_LOV_NAME) == 0) { + CDEBUG(D_CONFIG, + "For 2.x interoperability, rename obd " + "type from lov to lod (%s)\n", + s2lsi(cfg->cfg_sb)->lsi_svname); + strcpy(typename, LUSTRE_LOD_NAME); + } + if (typename && + strcmp(typename, LUSTRE_OSC_NAME) == 0) { + CDEBUG(D_CONFIG, + "For 2.x interoperability, rename obd " + "type from osc to osp (%s)\n", + s2lsi(cfg->cfg_sb)->lsi_svname); + strcpy(typename, LUSTRE_OSP_NAME); + } + } +#endif /* HAVE_SERVER_SUPPORT */ + + if (cfg->cfg_flags & CFG_F_EXCLUDE) { + CDEBUG(D_CONFIG, "cmd: %x marked EXCLUDED\n", + lcfg->lcfg_command); + if (lcfg->lcfg_command == LCFG_LOV_ADD_OBD) + /* Add inactive instead */ + lcfg->lcfg_command = LCFG_LOV_ADD_INA; + } + + lustre_cfg_bufs_reset(&bufs, NULL); + lustre_cfg_bufs_init(&bufs, lcfg); + + if (cfg->cfg_instance && + lcfg->lcfg_command != LCFG_SPTLRPC_CONF && + LUSTRE_CFG_BUFLEN(lcfg, 0) > 0) { + inst_len = LUSTRE_CFG_BUFLEN(lcfg, 0) + + LUSTRE_MAXINSTANCE + 4; + OBD_ALLOC(inst_name, inst_len); + if (!inst_name) + GOTO(out, rc = -ENOMEM); + snprintf(inst_name, inst_len, "%s-%016lx", + lustre_cfg_string(lcfg, 0), + cfg->cfg_instance); + lustre_cfg_bufs_set_string(&bufs, 0, inst_name); + CDEBUG(D_CONFIG, "cmd %x, instance name: %s\n", + lcfg->lcfg_command, inst_name); + } + + /* override llog UUID for clients, to insure they are unique */ + if (cfg->cfg_instance && lcfg->lcfg_command == LCFG_ATTACH) + lustre_cfg_bufs_set_string(&bufs, 2, + cfg->cfg_uuid.uuid); + /* + * sptlrpc config record, we expect 2 data segments: + * [0]: fs_name/target_name, + * [1]: rule string + * moving them to index [1] and [2], and insert MGC's + * obdname at index [0]. + */ + if (cfg->cfg_instance && + lcfg->lcfg_command == LCFG_SPTLRPC_CONF) { + /* After ASLR changes cfg_instance this needs fixing */ + /* "obd" is set in config_log_find_or_add() */ + struct obd_device *obd = (void *)cfg->cfg_instance; + + lustre_cfg_bufs_set(&bufs, 2, bufs.lcfg_buf[1], + bufs.lcfg_buflen[1]); + lustre_cfg_bufs_set(&bufs, 1, bufs.lcfg_buf[0], + bufs.lcfg_buflen[0]); + lustre_cfg_bufs_set_string(&bufs, 0, + obd->obd_name); + } + + /* + * Add net info to setup command + * if given on command line. 
+ * So config log will be: + * [0]: client name + * [1]: client UUID + * [2]: server UUID + * [3]: inactive-on-startup + * [4]: restrictive net + */ + if (cfg && cfg->cfg_sb && s2lsi(cfg->cfg_sb) && + !IS_SERVER(s2lsi(cfg->cfg_sb))) { + struct lustre_sb_info *lsi = s2lsi(cfg->cfg_sb); + char *nidnet = lsi->lsi_lmd->lmd_nidnet; + + if (lcfg->lcfg_command == LCFG_SETUP && + lcfg->lcfg_bufcount != 2 && nidnet) { + CDEBUG(D_CONFIG, "Adding net %s info to setup " + "command for client %s\n", nidnet, + lustre_cfg_string(lcfg, 0)); + lustre_cfg_bufs_set_string(&bufs, 4, nidnet); + } + } + + OBD_ALLOC(lcfg_new, lustre_cfg_len(bufs.lcfg_bufcount, + bufs.lcfg_buflen)); + if (!lcfg_new) + GOTO(out, rc = -ENOMEM); + + lustre_cfg_init(lcfg_new, lcfg->lcfg_command, &bufs); + lcfg_new->lcfg_num = lcfg->lcfg_num; + lcfg_new->lcfg_flags = lcfg->lcfg_flags; + + /* + * XXX Hack to try to remain binary compatible with + * pre-newconfig logs + */ + if (lcfg->lcfg_nal != 0 && /* pre-newconfig log? */ + (lcfg->lcfg_nid >> 32) == 0) { + __u32 addr = (__u32)(lcfg->lcfg_nid & 0xffffffff); + + lcfg_new->lcfg_nid = + LNET_MKNID(LNET_MKNET(lcfg->lcfg_nal, 0), addr); + CWARN("Converted pre-newconfig NAL %d NID %x to %s\n", + lcfg->lcfg_nal, addr, + libcfs_nid2str(lcfg_new->lcfg_nid)); + } else { + lcfg_new->lcfg_nid = lcfg->lcfg_nid; + } + + lcfg_new->lcfg_nal = 0; /* illegal value for obsolete field */ + + rc = class_process_config(lcfg_new); + OBD_FREE(lcfg_new, lustre_cfg_len(lcfg_new->lcfg_bufcount, + lcfg_new->lcfg_buflens)); + if (inst_name) + OBD_FREE(inst_name, inst_len); + break; + } + default: + CERROR("Unknown llog record type %#x encountered\n", + rec->lrh_type); + break; + } +out: + if (rc) { + CERROR("%s: cfg command failed: rc = %d\n", + handle->lgh_ctxt->loc_obd->obd_name, rc); + class_config_dump_handler(NULL, handle, rec, data); + } + RETURN(rc); +} +EXPORT_SYMBOL(class_config_llog_handler); + +int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name, struct config_llog_instance *cfg) +{ + struct llog_process_cat_data cd = { + .lpcd_first_idx = 0, + .lpcd_read_mode = LLOG_READ_MODE_NORMAL, + }; + struct llog_handle *llh; + llog_cb_t callback; + int rc; + ENTRY; + + CDEBUG(D_INFO, "looking up llog %s\n", name); + rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc) + RETURN(rc); + + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL); + if (rc) + GOTO(parse_out, rc); + + /* continue processing from where we last stopped to end-of-log */ + if (cfg) { + cd.lpcd_first_idx = cfg->cfg_last_idx; + callback = cfg->cfg_callback; + LASSERT(callback != NULL); + } else { + callback = class_config_llog_handler; + } + + cd.lpcd_last_idx = 0; + + rc = llog_process(env, llh, callback, cfg, &cd); + + CDEBUG(D_CONFIG, "Processed log %s gen %d-%d (rc=%d)\n", name, + cd.lpcd_first_idx + 1, cd.lpcd_last_idx, rc); + if (cfg) + cfg->cfg_last_idx = cd.lpcd_last_idx; + +parse_out: + llog_close(env, llh); + RETURN(rc); +} +EXPORT_SYMBOL(class_config_parse_llog); + +/** + * Get marker cfg_flag + */ +void llog_get_marker_cfg_flags(struct llog_rec_hdr *rec, + unsigned int *cfg_flags) +{ + struct lustre_cfg *lcfg = (struct lustre_cfg *)(rec + 1); + struct cfg_marker *marker; + + if (lcfg->lcfg_command == LCFG_MARKER) { + marker = lustre_cfg_buf(lcfg, 1); + if (marker->cm_flags & CM_START) { + *cfg_flags = CFG_F_MARKER; + if (marker->cm_flags & CM_SKIP) + *cfg_flags = CFG_F_SKIP; + } else if (marker->cm_flags & CM_END) { + *cfg_flags = 0; + } + CDEBUG(D_INFO, "index=%d, 
cm_flags=%#08x cfg_flags=%#08x\n", + rec->lrh_index, marker->cm_flags, *cfg_flags); + } +} + +/** + * Parse config record and output dump in supplied buffer. + * + * This is separated from class_config_dump_handler() to use + * for ioctl needs as well + * + * Sample Output: + * - { index: 4, event: attach, device: lustrewt-clilov, type: lov, + * UUID: lustrewt-clilov_UUID } + */ +int class_config_yaml_output(struct llog_rec_hdr *rec, char *buf, int size, + unsigned int *cfg_flags, bool raw) +{ + struct lustre_cfg *lcfg = (struct lustre_cfg *)(rec + 1); + char *ptr = buf; + char *end = buf + size; + int rc = 0, i; + struct lcfg_type_data *ldata; + int swab = 0; + + LASSERT(rec->lrh_type == OBD_CFG_REC); + + if (lcfg->lcfg_version == __swab32(LUSTRE_CFG_VERSION)) { + lustre_swab_lustre_cfg(lcfg); + swab = 1; + } + + rc = lustre_cfg_sanity_check(lcfg, rec->lrh_len); + if (rc < 0) + return rc; + + ldata = lcfg_cmd2data(lcfg->lcfg_command); + if (!ldata) + return -ENOTTY; + + llog_get_marker_cfg_flags(rec, cfg_flags); + if ((lcfg->lcfg_command == LCFG_MARKER) && likely(!raw)) + return 0; + /* entries outside marker are skipped */ + if (!(*cfg_flags & CFG_F_MARKER) && !raw) + return 0; + /* inside skipped marker */ + if ((*cfg_flags & CFG_F_SKIP) && !raw) + return 0; + + /* form YAML entity */ + ptr += snprintf(ptr, end - ptr, "- { index: %u, event: %s", + rec->lrh_index, ldata->ltd_name); + if (end - ptr <= 0) + goto out_overflow; + + if (lcfg->lcfg_flags) { + ptr += snprintf(ptr, end - ptr, ", flags: %#08x", + lcfg->lcfg_flags); + if (end - ptr <= 0) + goto out_overflow; + } + if (lcfg->lcfg_num) { + ptr += snprintf(ptr, end - ptr, ", num: %#08x", + lcfg->lcfg_num); + if (end - ptr <= 0) + goto out_overflow; + } + if (lcfg->lcfg_nid) { + char nidstr[LNET_NIDSTR_SIZE]; + + libcfs_nid2str_r(lcfg->lcfg_nid, nidstr, sizeof(nidstr)); + ptr += snprintf(ptr, end - ptr, ", nid: %s(%#llx)", + nidstr, lcfg->lcfg_nid); + if (end - ptr <= 0) + goto out_overflow; + } + + if (LUSTRE_CFG_BUFLEN(lcfg, 0) > 0) { + ptr += snprintf(ptr, end - ptr, ", device: %s", + lustre_cfg_string(lcfg, 0)); + if (end - ptr <= 0) + goto out_overflow; + } + + if (lcfg->lcfg_command == LCFG_SET_PARAM) { + /* + * set_param -P parameters have param=val here, separate + * them through pointer magic and print them out in + * native yamlese + */ + char *cfg_str = lustre_cfg_string(lcfg, 1); + char *tmp = strchr(cfg_str, '='); + size_t len; + + if (!tmp) + goto out_done; + + ptr += snprintf(ptr, end - ptr, ", %s: ", ldata->ltd_bufs[0]); + len = tmp - cfg_str + 1; + snprintf(ptr, len, "%s", cfg_str); + ptr += len - 1; + + ptr += snprintf(ptr, end - ptr, ", %s: ", ldata->ltd_bufs[1]); + ptr += snprintf(ptr, end - ptr, "%s", tmp + 1); + + goto out_done; + } + + if (lcfg->lcfg_command == LCFG_MARKER) { + struct cfg_marker *marker; + + marker = lustre_cfg_buf(lcfg, 1); + ptr += snprintf(ptr, end - ptr, ", flags: %#04x", + marker->cm_flags); + ptr += snprintf(ptr, end - ptr, ", version: %d.%d.%d.%d", + OBD_OCD_VERSION_MAJOR(marker->cm_vers), + OBD_OCD_VERSION_MINOR(marker->cm_vers), + OBD_OCD_VERSION_PATCH(marker->cm_vers), + OBD_OCD_VERSION_FIX(marker->cm_vers)); + ptr += snprintf(ptr, end - ptr, ", createtime: %lld", + marker->cm_createtime); + ptr += snprintf(ptr, end - ptr, ", canceltime: %lld", + marker->cm_canceltime); + + goto out_done; + } + + for (i = 1; i < lcfg->lcfg_bufcount; i++) { + if (LUSTRE_CFG_BUFLEN(lcfg, i) > 0) { + ptr += snprintf(ptr, end - ptr, ", %s: %s", + ldata->ltd_bufs[i - 1], + lustre_cfg_string(lcfg, i)); + if 
(end - ptr <= 0) + goto out_overflow; + } + } + +out_done: + ptr += snprintf(ptr, end - ptr, " }\n"); +out_overflow: + /* Return consumed bytes. If the buffer overflowed, zero last byte */ + rc = ptr - buf; + if (rc > size) { + rc = -EOVERFLOW; + *(end - 1) = '\0'; + } + + return rc; +} + +/** + * parse config record and output dump in supplied buffer. + * This is separated from class_config_dump_handler() to use + * for ioctl needs as well + */ +static int class_config_parse_rec(struct llog_rec_hdr *rec, char *buf, int size) +{ + struct lustre_cfg *lcfg = (struct lustre_cfg *)(rec + 1); + char *ptr = buf; + char *end = buf + size; + int rc = 0; + + ENTRY; + + LASSERT(rec->lrh_type == OBD_CFG_REC); + rc = lustre_cfg_sanity_check(lcfg, rec->lrh_len); + if (rc < 0) + RETURN(rc); + + ptr += snprintf(ptr, end-ptr, "cmd=%05x ", lcfg->lcfg_command); + if (lcfg->lcfg_flags) + ptr += snprintf(ptr, end-ptr, "flags=%#08x ", + lcfg->lcfg_flags); + + if (lcfg->lcfg_num) + ptr += snprintf(ptr, end-ptr, "num=%#08x ", lcfg->lcfg_num); + + if (lcfg->lcfg_nid) { + char nidstr[LNET_NIDSTR_SIZE]; + + libcfs_nid2str_r(lcfg->lcfg_nid, nidstr, sizeof(nidstr)); + ptr += snprintf(ptr, end-ptr, "nid=%s(%#llx) ", + nidstr, lcfg->lcfg_nid); + } + + if (lcfg->lcfg_command == LCFG_MARKER) { + struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1); + + ptr += snprintf(ptr, end-ptr, "marker=%d(%#x)%s '%s'", + marker->cm_step, marker->cm_flags, + marker->cm_tgtname, marker->cm_comment); + } else { + int i; + + for (i = 0; i < lcfg->lcfg_bufcount; i++) { + ptr += snprintf(ptr, end-ptr, "%d:%s ", i, + lustre_cfg_string(lcfg, i)); + } + } + ptr += snprintf(ptr, end - ptr, "\n"); + /* return consumed bytes */ + rc = ptr - buf; + RETURN(rc); +} + +int class_config_dump_handler(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + char *outstr; + int rc = 0; + + ENTRY; + + OBD_ALLOC(outstr, 256); + if (!outstr) + RETURN(-ENOMEM); + + if (rec->lrh_type == OBD_CFG_REC) { + class_config_parse_rec(rec, outstr, 256); + LCONSOLE(D_WARNING, " %s\n", outstr); + } else { + LCONSOLE(D_WARNING, "unhandled lrh_type: %#x\n", rec->lrh_type); + rc = -EINVAL; + } + + OBD_FREE(outstr, 256); + RETURN(rc); +} + +/** + * Call class_cleanup and class_detach. + * "Manual" only in the sense that we're faking lcfg commands. 
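+ *
+ * Sketch of the faked sequence issued below:
+ *
+ *   lcfg buffers: 0:obd_name 1:"F" and/or "A" flags
+ *   lcfg_command = LCFG_CLEANUP;  class_process_config(lcfg);
+ *   lcfg_command = LCFG_DETACH;   class_process_config(lcfg);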
+ */ +int class_manual_cleanup(struct obd_device *obd) +{ + char flags[3] = ""; + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs bufs; + int rc; + + ENTRY; + + if (!obd) { + CERROR("empty cleanup\n"); + RETURN(-EALREADY); + } + + if (obd->obd_force) + strlcat(flags, "F", sizeof(flags)); + if (obd->obd_fail) + strlcat(flags, "A", sizeof(flags)); + + CDEBUG(D_CONFIG, "Manual cleanup of %s (flags='%s')\n", + obd->obd_name, flags); + + lustre_cfg_bufs_reset(&bufs, obd->obd_name); + lustre_cfg_bufs_set_string(&bufs, 1, flags); + OBD_ALLOC(lcfg, lustre_cfg_len(bufs.lcfg_bufcount, bufs.lcfg_buflen)); + if (!lcfg) + RETURN(-ENOMEM); + lustre_cfg_init(lcfg, LCFG_CLEANUP, &bufs); + + rc = class_process_config(lcfg); + if (rc) { + CERROR("cleanup failed %d: %s\n", rc, obd->obd_name); + GOTO(out, rc); + } + + /* the lcfg is almost the same for both ops */ + lcfg->lcfg_command = LCFG_DETACH; + rc = class_process_config(lcfg); + if (rc) + CERROR("detach failed %d: %s\n", rc, obd->obd_name); +out: + OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens)); + RETURN(rc); +} +EXPORT_SYMBOL(class_manual_cleanup); + +#ifdef HAVE_SERVER_SUPPORT +/* + * nid<->nidstats hash operations + */ +static unsigned +nidstats_hash(struct cfs_hash *hs, const void *key, unsigned int mask) +{ + return cfs_hash_djb2_hash(key, sizeof(struct lnet_nid), mask); +} + +static void * +nidstats_key(struct hlist_node *hnode) +{ + struct nid_stat *ns; + + ns = hlist_entry(hnode, struct nid_stat, nid_hash); + + return &ns->nid; +} + +static int +nidstats_keycmp(const void *key, struct hlist_node *hnode) +{ + return nid_same((struct lnet_nid *)nidstats_key(hnode), + (struct lnet_nid *)key); +} + +static void * +nidstats_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct nid_stat, nid_hash); +} + +static void +nidstats_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nid_stat *ns; + + ns = hlist_entry(hnode, struct nid_stat, nid_hash); + nidstat_getref(ns); +} + +static void +nidstats_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nid_stat *ns; + + ns = hlist_entry(hnode, struct nid_stat, nid_hash); + nidstat_putref(ns); +} + +static struct cfs_hash_ops nid_stat_hash_ops = { + .hs_hash = nidstats_hash, + .hs_key = nidstats_key, + .hs_keycmp = nidstats_keycmp, + .hs_object = nidstats_object, + .hs_get = nidstats_get, + .hs_put_locked = nidstats_put_locked, +}; + +/* + * client_generation<->export hash operations + */ + +static unsigned +gen_hash(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(__u32), mask); +} + +static void * +gen_key(struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_gen_hash); + + RETURN(&exp->exp_target_data.ted_lcd->lcd_generation); +} + +/* + * NOTE: It is impossible to find an export that is in failed + * state with this function + */ +static int +gen_kepcmp(const void *key, struct hlist_node *hnode) +{ + struct obd_export *exp; + + LASSERT(key); + exp = hlist_entry(hnode, struct obd_export, exp_gen_hash); + + RETURN(exp->exp_target_data.ted_lcd->lcd_generation == *(__u32 *)key && + !exp->exp_failed); +} + +static void * +gen_export_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct obd_export, exp_gen_hash); +} + +static void +gen_export_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_gen_hash); + class_export_get(exp); +} + +static 
void +gen_export_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_gen_hash); + class_export_put(exp); +} + +static struct cfs_hash_ops gen_hash_ops = { + .hs_hash = gen_hash, + .hs_key = gen_key, + .hs_keycmp = gen_kepcmp, + .hs_object = gen_export_object, + .hs_get = gen_export_get, + .hs_put_locked = gen_export_put_locked, +}; + +#endif /* HAVE_SERVER_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_mount.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount.c new file mode 100644 index 0000000000000..f9e46c67c3fe4 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount.c @@ -0,0 +1,1689 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/obd_mount.c + * + * Client mount routines + * + * Author: Nathan Rutman + */ + + +#define DEBUG_SUBSYSTEM S_CLASS +#define D_MOUNT (D_SUPER|D_CONFIG/*|D_WARNING */) +#define PRINT_CMD CDEBUG + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/**************** config llog ********************/ + +/** + * Get a config log from the MGS and process it. + * This func is called for both clients and servers. + * Continue to process new statements appended to the logs + * (whenever the config lock is revoked) until lustre_end_log + * is called. + * + * @param sb The superblock is used by the MGC to write to the local copy of + * the config log + * @param logname The name of the llog to replicate from the MGS + * @param cfg Since the same MGC may be used to follow multiple config logs + * (e.g. ost1, ost2, client), the config_llog_instance keeps the state for + * this log, and is added to the mgc's list of logs to follow. 
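+ *
+ * e.g. a client mount would follow a log such as "lustre-client",
+ * while a server target might follow "lustre-OST0000" (log names
+ * illustrative).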
+ */
+int lustre_process_log(struct super_block *sb, char *logname,
+ struct config_llog_instance *cfg)
+{
+ struct lustre_cfg *lcfg;
+ struct lustre_cfg_bufs *bufs;
+ struct lustre_sb_info *lsi = s2lsi(sb);
+ struct obd_device *mgc = lsi->lsi_mgc;
+ int rc;
+
+ ENTRY;
+
+ LASSERT(mgc);
+ LASSERT(cfg);
+
+ OBD_ALLOC_PTR(bufs);
+ if (bufs == NULL)
+ RETURN(-ENOMEM);
+
+ /* mgc_process_config */
+ lustre_cfg_bufs_reset(bufs, mgc->obd_name);
+ lustre_cfg_bufs_set_string(bufs, 1, logname);
+ lustre_cfg_bufs_set(bufs, 2, cfg, sizeof(*cfg));
+ lustre_cfg_bufs_set(bufs, 3, &sb, sizeof(sb));
+ OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount, bufs->lcfg_buflen));
+ if (!lcfg)
+ GOTO(out, rc = -ENOMEM);
+ lustre_cfg_init(lcfg, LCFG_LOG_START, bufs);
+
+ rc = obd_process_config(mgc, sizeof(*lcfg), lcfg);
+ OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens));
+out:
+ OBD_FREE_PTR(bufs);
+
+ if (rc == -EINVAL)
+ LCONSOLE_ERROR_MSG(0x15b,
+ "%s: Configuration from log %s failed from MGS: rc = %d. Check that the client and MGS are running compatible versions.\n",
+ mgc->obd_name, logname, rc);
+ else if (rc != 0)
+ LCONSOLE_ERROR_MSG(0x15c,
+ "%s: Configuration from log %s failed from MGS: rc = %d. Communication error between node & MGS, a bad configuration, or other errors. See syslog for more info\n",
+ mgc->obd_name, logname, rc);
+
+ /* class_obd_list(); */
+ RETURN(rc);
+}
+EXPORT_SYMBOL(lustre_process_log);
+
+/* Stop watching this config log for updates */
+int lustre_end_log(struct super_block *sb, char *logname,
+ struct config_llog_instance *cfg)
+{
+ struct lustre_cfg *lcfg;
+ struct lustre_cfg_bufs bufs;
+ struct lustre_sb_info *lsi = s2lsi(sb);
+ struct obd_device *mgc = lsi->lsi_mgc;
+ int rc;
+
+ ENTRY;
+
+ if (!mgc)
+ RETURN(-ENOENT);
+
+ /* mgc_process_config */
+ lustre_cfg_bufs_reset(&bufs, mgc->obd_name);
+ lustre_cfg_bufs_set_string(&bufs, 1, logname);
+ if (cfg)
+ lustre_cfg_bufs_set(&bufs, 2, cfg, sizeof(*cfg));
+ OBD_ALLOC(lcfg, lustre_cfg_len(bufs.lcfg_bufcount, bufs.lcfg_buflen));
+ if (!lcfg)
+ RETURN(-ENOMEM);
+ lustre_cfg_init(lcfg, LCFG_LOG_END, &bufs);
+ rc = obd_process_config(mgc, sizeof(*lcfg), lcfg);
+ OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens));
+ RETURN(rc);
+}
+EXPORT_SYMBOL(lustre_end_log);
+
+/**************** OBD start *******************/
+
+/**
+ * lustre_cfg_bufs are a holdover from 1.4; we can still set these up from
+ * lctl (and do for echo cli/srv).
+ */
+static int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd,
+ char *s1, char *s2, char *s3, char *s4)
+{
+ struct lustre_cfg_bufs bufs;
+ struct lustre_cfg *lcfg = NULL;
+ int rc;
+
+ CDEBUG(D_TRACE, "lcfg %s %#x %s %s %s %s\n", cfgname,
+ cmd, s1, s2, s3, s4);
+
+ lustre_cfg_bufs_reset(&bufs, cfgname);
+ if (s1)
+ lustre_cfg_bufs_set_string(&bufs, 1, s1);
+ if (s2)
+ lustre_cfg_bufs_set_string(&bufs, 2, s2);
+ if (s3)
+ lustre_cfg_bufs_set_string(&bufs, 3, s3);
+ if (s4)
+ lustre_cfg_bufs_set_string(&bufs, 4, s4);
+
+ OBD_ALLOC(lcfg, lustre_cfg_len(bufs.lcfg_bufcount, bufs.lcfg_buflen));
+ if (!lcfg)
+ return -ENOMEM;
+ lustre_cfg_init(lcfg, cmd, &bufs);
+ lcfg->lcfg_nid = nid;
+ rc = class_process_config(lcfg);
+ OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens));
+ return rc;
+}
+
+/**
+ * Call class_attach and class_setup. These methods in turn call
+ * OBD type-specific methods.
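+ *
+ * e.g. lustre_start_mgc() below starts its MGC with:
+ *
+ *   lustre_start_simple(mgcname, LUSTRE_MGC_NAME, (char *)uuid->uuid,
+ *                       LUSTRE_MGS_OBDNAME, niduuid, NULL, NULL);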
+ */ +int lustre_start_simple(char *obdname, char *type, char *uuid, + char *s1, char *s2, char *s3, char *s4) +{ + int rc; + + CDEBUG(D_MOUNT, "Starting OBD %s (typ=%s)\n", obdname, type); + + rc = do_lcfg(obdname, 0, LCFG_ATTACH, type, uuid, NULL, NULL); + if (rc) { + CERROR("%s attach error %d\n", obdname, rc); + return rc; + } + rc = do_lcfg(obdname, 0, LCFG_SETUP, s1, s2, s3, s4); + if (rc) { + CERROR("%s setup error %d\n", obdname, rc); + do_lcfg(obdname, 0, LCFG_DETACH, NULL, NULL, NULL, NULL); + } + return rc; +} + +static DEFINE_MUTEX(mgc_start_lock); + +/** + * Set up a MGC OBD to process startup logs + * + * \param sb [in] super block of the MGC OBD + * + * \retval 0 success, otherwise error code + */ +int lustre_start_mgc(struct super_block *sb) +{ + struct obd_connect_data *data = NULL; + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *obd; + struct obd_export *exp; + struct obd_uuid *uuid = NULL; + uuid_t uuidc; + lnet_nid_t nid; + char nidstr[LNET_NIDSTR_SIZE]; + char *mgcname = NULL, *niduuid = NULL, *mgssec = NULL; + char *ptr; + int rc = 0, i = 0, j; + size_t len; + + ENTRY; + + LASSERT(lsi->lsi_lmd); + + /* Find the first non-lo MGS NID for our MGC name */ + if (IS_SERVER(lsi)) { + /* mount -o mgsnode=nid */ + ptr = lsi->lsi_lmd->lmd_mgs; + if (lsi->lsi_lmd->lmd_mgs && + (class_parse_nid(lsi->lsi_lmd->lmd_mgs, &nid, &ptr) == 0)) { + i++; + } else if (IS_MGS(lsi)) { + struct lnet_processid id; + + while ((rc = LNetGetId(i++, &id)) != -ENOENT) { + if (nid_is_lo0(&id.nid)) + continue; + nid = lnet_nid_to_nid4(&id.nid); + i++; + break; + } + } + } else { /* client */ + /* Use NIDs from mount line: uml1,1@elan:uml2,2@elan:/lustre */ + ptr = lsi->lsi_lmd->lmd_dev; + if (class_parse_nid(ptr, &nid, &ptr) == 0) + i++; + } + if (i == 0) { + CERROR("No valid MGS NIDs found.\n"); + RETURN(-EINVAL); + } + + mutex_lock(&mgc_start_lock); + + libcfs_nid2str_r(nid, nidstr, sizeof(nidstr)); + len = strlen(LUSTRE_MGC_OBDNAME) + strlen(nidstr) + 1; + OBD_ALLOC(mgcname, len); + OBD_ALLOC(niduuid, len + 2); + if (mgcname == NULL || niduuid == NULL) + GOTO(out_free, rc = -ENOMEM); + snprintf(mgcname, len, "%s%s", LUSTRE_MGC_OBDNAME, nidstr); + + mgssec = lsi->lsi_lmd->lmd_mgssec ? lsi->lsi_lmd->lmd_mgssec : ""; + + OBD_ALLOC_PTR(data); + if (data == NULL) + GOTO(out_free, rc = -ENOMEM); + + obd = class_name2obd(mgcname); + if (obd && !obd->obd_stopping) { + int recov_bk; + + rc = obd_set_info_async(NULL, obd->obd_self_export, + strlen(KEY_MGSSEC), KEY_MGSSEC, + strlen(mgssec), mgssec, NULL); + if (rc) + GOTO(out_free, rc); + + /* Re-using an existing MGC */ + atomic_inc(&obd->u.cli.cl_mgc_refcount); + + /* IR compatibility check, only for clients */ + if (lmd_is_client(lsi->lsi_lmd)) { + int has_ir; + int vallen = sizeof(*data); + __u32 *flags = &lsi->lsi_lmd->lmd_flags; + + rc = obd_get_info(NULL, obd->obd_self_export, + strlen(KEY_CONN_DATA), KEY_CONN_DATA, + &vallen, data); + LASSERT(rc == 0); + has_ir = OCD_HAS_FLAG(data, IMP_RECOV); + if (has_ir ^ !(*flags & LMD_FLG_NOIR)) { + /* LMD_FLG_NOIR is for test purpose only */ + LCONSOLE_WARN( + "Mounting client with IR setting not compatible with current MGC. Using MGC setting that is IR %s", + has_ir ? "enabled" : "disabled"); + if (has_ir) + *flags &= ~LMD_FLG_NOIR; + else + *flags |= LMD_FLG_NOIR; + } + } + + recov_bk = 0; + /* + * If we are restarting the MGS, don't try to keep the MGC's + * old connection, or registration will fail. 
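+		 * (Value sketch, from the code below: recov_bk ends up as 1
+		 * for a re-used MGC, or 2 when this node is restarting the
+		 * MGS itself.)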
+ */ + if (IS_MGS(lsi)) { + CDEBUG(D_MOUNT, "New MGS with live MGC\n"); + recov_bk = 1; + } + + /* + * Try all connections, but only once (again). + * We don't want to block another target from starting + * (using its local copy of the log), but we do want to connect + * if at all possible. + */ + recov_bk++; + CDEBUG(D_MOUNT, "%s:Set MGC reconnect %d\n", mgcname, recov_bk); + rc = obd_set_info_async(NULL, obd->obd_self_export, + sizeof(KEY_INIT_RECOV_BACKUP), + KEY_INIT_RECOV_BACKUP, + sizeof(recov_bk), &recov_bk, NULL); + GOTO(out, rc = 0); + } + + CDEBUG(D_MOUNT, "Start MGC '%s'\n", mgcname); + + /* Add the primary NIDs for the MGS */ + i = 0; + snprintf(niduuid, len + 2, "%s_%x", mgcname, i); + if (IS_SERVER(lsi)) { + ptr = lsi->lsi_lmd->lmd_mgs; + CDEBUG(D_MOUNT, "mgs NIDs %s.\n", ptr); + if (IS_MGS(lsi)) { + /* Use local NIDs (including LO) */ + struct lnet_processid id; + + while ((rc = LNetGetId(i++, &id)) != -ENOENT) { + rc = do_lcfg(mgcname, lnet_nid_to_nid4(&id.nid), + LCFG_ADD_UUID, + niduuid, NULL, NULL, NULL); + } + } else { + /* Use mgsnode= nids */ + /* mount -o mgsnode=nid */ + if (lsi->lsi_lmd->lmd_mgs) { + ptr = lsi->lsi_lmd->lmd_mgs; + } else if (class_find_param(ptr, PARAM_MGSNODE, + &ptr) != 0) { + CERROR("No MGS NIDs given.\n"); + GOTO(out_free, rc = -EINVAL); + } + /* + * Add primary MGS NID(s). + * Multiple NIDs on one MGS node are separated + * by commas. + */ + while (class_parse_nid(ptr, &nid, &ptr) == 0) { + rc = do_lcfg(mgcname, nid, LCFG_ADD_UUID, + niduuid, NULL, NULL, NULL); + if (rc == 0) + ++i; + /* Stop at the first failover NID */ + if (*ptr == ':') + break; + } + } + } else { /* client */ + /* Use NIDs from mount line: uml1,1@elan:uml2,2@elan:/lustre */ + ptr = lsi->lsi_lmd->lmd_dev; + while (class_parse_nid(ptr, &nid, &ptr) == 0) { + rc = do_lcfg(mgcname, nid, LCFG_ADD_UUID, + niduuid, NULL, NULL, NULL); + if (rc == 0) + ++i; + /* Stop at the first failover NID */ + if (*ptr == ':') + break; + } + } + if (i == 0) { + CERROR("No valid MGS NIDs found.\n"); + GOTO(out_free, rc = -EINVAL); + } + lsi->lsi_lmd->lmd_mgs_failnodes = 1; + + /* Random uuid for MGC allows easier reconnects */ + OBD_ALLOC_PTR(uuid); + if (uuid == NULL) + GOTO(out_free, rc = -ENOMEM); + + generate_random_uuid(uuidc.b); + snprintf(uuid->uuid, sizeof(*uuid), "%pU", uuidc.b); + + /* Start the MGC */ + rc = lustre_start_simple(mgcname, LUSTRE_MGC_NAME, + (char *)uuid->uuid, LUSTRE_MGS_OBDNAME, + niduuid, NULL, NULL); + if (rc) + GOTO(out_free, rc); + + /* Add any failover MGS NIDs */ + i = 1; + while (ptr && ((*ptr == ':' || + class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0))) { + /* New failover node */ + sprintf(niduuid, "%s_%x", mgcname, i); + j = 0; + while (class_parse_nid_quiet(ptr, &nid, &ptr) == 0) { + rc = do_lcfg(mgcname, nid, LCFG_ADD_UUID, + niduuid, NULL, NULL, NULL); + if (rc == 0) + ++j; + if (*ptr == ':') + break; + } + if (j > 0) { + rc = do_lcfg(mgcname, 0, LCFG_ADD_CONN, + niduuid, NULL, NULL, NULL); + if (rc == 0) + ++i; + } else { + /* at ":/fsname" */ + break; + } + } + lsi->lsi_lmd->lmd_mgs_failnodes = i; + + obd = class_name2obd(mgcname); + if (!obd) { + CERROR("Can't find mgcobd %s\n", mgcname); + GOTO(out_free, rc = -ENOTCONN); + } + + rc = obd_set_info_async(NULL, obd->obd_self_export, + strlen(KEY_MGSSEC), KEY_MGSSEC, + strlen(mgssec), mgssec, NULL); + if (rc) + GOTO(out_free, rc); + + /* + * Keep a refcount of servers/clients who started with "mount", + * so we know when we can get rid of the mgc. 
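+	 * (Lifecycle sketch, from this file: the first mount sets the count
+	 * to 1 below, each later mount that re-uses the MGC does the
+	 * atomic_inc() above, and lustre_stop_mgc() tears the MGC down only
+	 * when atomic_dec_and_test() drops it to zero.)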
+ */ + atomic_set(&obd->u.cli.cl_mgc_refcount, 1); + + /* We connect to the MGS at setup, and don't disconnect until cleanup */ + data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_AT | + OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV | + OBD_CONNECT_LVB_TYPE | + OBD_CONNECT_BULK_MBITS | OBD_CONNECT_BARRIER | + OBD_CONNECT_FLAGS2; + data->ocd_connect_flags2 = OBD_CONNECT2_REP_MBITS; + + if (lmd_is_client(lsi->lsi_lmd) && + lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR) + data->ocd_connect_flags &= ~OBD_CONNECT_IMP_RECOV; + data->ocd_version = LUSTRE_VERSION_CODE; + rc = obd_connect(NULL, &exp, obd, uuid, data, NULL); + if (rc) { + CERROR("connect failed %d\n", rc); + GOTO(out, rc); + } + + obd->u.cli.cl_mgc_mgsexp = exp; + +out: + /* + * Keep the MGC info in the sb. Note that many lsi's can point + * to the same mgc. + */ + lsi->lsi_mgc = obd; +out_free: + mutex_unlock(&mgc_start_lock); + + if (uuid) + OBD_FREE_PTR(uuid); + if (data) + OBD_FREE_PTR(data); + if (mgcname) + OBD_FREE(mgcname, len); + if (niduuid) + OBD_FREE(niduuid, len + 2); + RETURN(rc); +} +EXPORT_SYMBOL(lustre_start_mgc); + +static int lustre_stop_mgc(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *obd; + char niduuid[MAX_OBD_NAME + 6], *ptr = NULL; + int i, rc = 0; + + ENTRY; + + if (!lsi) + RETURN(-ENOENT); + obd = lsi->lsi_mgc; + if (!obd) + RETURN(-ENOENT); + lsi->lsi_mgc = NULL; + + mutex_lock(&mgc_start_lock); + LASSERT(atomic_read(&obd->u.cli.cl_mgc_refcount) > 0); + if (!atomic_dec_and_test(&obd->u.cli.cl_mgc_refcount)) { + /* + * This is not fatal, every client that stops + * will call in here. + */ + CDEBUG(D_MOUNT, "MGC still has %d references.\n", + atomic_read(&obd->u.cli.cl_mgc_refcount)); + GOTO(out, rc = -EBUSY); + } + + /* + * The MGC has no recoverable data in any case. 
+	 * force shutdown is set in umount_begin
+	 */
+	obd->obd_no_recov = 1;
+
+	if (obd->u.cli.cl_mgc_mgsexp) {
+		/*
+		 * An error is not fatal; if we are unable to send the
+		 * disconnect, the MGS ping evictor cleans up the export.
+		 */
+		rc = obd_disconnect(obd->u.cli.cl_mgc_mgsexp);
+		if (rc)
+			CDEBUG(D_MOUNT, "disconnect failed %d\n", rc);
+	}
+
+	/*
+	 * Cache the obdname for cleaning the nid uuids, which are
+	 * obdname_XX, before calling class_manual_cleanup
+	 */
+	strcpy(niduuid, obd->obd_name);
+	ptr = niduuid + strlen(niduuid);
+
+	rc = class_manual_cleanup(obd);
+	if (rc)
+		GOTO(out, rc);
+
+	for (i = 0; i < lsi->lsi_lmd->lmd_mgs_failnodes; i++) {
+		sprintf(ptr, "_%x", i);
+		rc = do_lcfg(LUSTRE_MGC_OBDNAME, 0, LCFG_DEL_UUID,
+			     niduuid, NULL, NULL, NULL);
+		if (rc)
+			CERROR("del MGC UUID %s failed: rc = %d\n",
+			       niduuid, rc);
+	}
+out:
+	/* class_import_put will get rid of the additional connections */
+	mutex_unlock(&mgc_start_lock);
+	RETURN(rc);
+}
+
+/***************** lustre superblock **************/
+
+struct lustre_sb_info *lustre_init_lsi(struct super_block *sb)
+{
+	struct lustre_sb_info *lsi;
+
+	ENTRY;
+
+	OBD_ALLOC_PTR(lsi);
+	if (!lsi)
+		RETURN(NULL);
+	OBD_ALLOC_PTR(lsi->lsi_lmd);
+	if (!lsi->lsi_lmd) {
+		OBD_FREE_PTR(lsi);
+		RETURN(NULL);
+	}
+
+	s2lsi_nocast(sb) = lsi;
+	/* we take 1 extra ref for our setup */
+	atomic_set(&lsi->lsi_mounts, 1);
+
+	/* Default umount style */
+	lsi->lsi_flags = LSI_UMOUNT_FAILOVER;
+	INIT_LIST_HEAD(&lsi->lsi_lwp_list);
+	mutex_init(&lsi->lsi_lwp_mutex);
+
+	RETURN(lsi);
+}
+EXPORT_SYMBOL(lustre_init_lsi);
+
+static int lustre_free_lsi(struct super_block *sb)
+{
+	struct lustre_sb_info *lsi = s2lsi(sb);
+
+	ENTRY;
+
+	LASSERT(lsi != NULL);
+	CDEBUG(D_MOUNT, "Freeing lsi %p\n", lsi);
+
+	/* someone didn't call server_put_mount. */
+	LASSERT(atomic_read(&lsi->lsi_mounts) == 0);
+
+	llcrypt_sb_free(sb);
+	if (lsi->lsi_lmd != NULL) {
+		if (lsi->lsi_lmd->lmd_dev != NULL)
+			OBD_FREE(lsi->lsi_lmd->lmd_dev,
+				 strlen(lsi->lsi_lmd->lmd_dev) + 1);
+		if (lsi->lsi_lmd->lmd_profile != NULL)
+			OBD_FREE(lsi->lsi_lmd->lmd_profile,
+				 strlen(lsi->lsi_lmd->lmd_profile) + 1);
+		if (lsi->lsi_lmd->lmd_fileset != NULL)
+			OBD_FREE(lsi->lsi_lmd->lmd_fileset,
+				 strlen(lsi->lsi_lmd->lmd_fileset) + 1);
+		if (lsi->lsi_lmd->lmd_mgssec != NULL)
+			OBD_FREE(lsi->lsi_lmd->lmd_mgssec,
+				 strlen(lsi->lsi_lmd->lmd_mgssec) + 1);
+		if (lsi->lsi_lmd->lmd_opts != NULL)
+			OBD_FREE(lsi->lsi_lmd->lmd_opts,
+				 strlen(lsi->lsi_lmd->lmd_opts) + 1);
+		if (lsi->lsi_lmd->lmd_exclude_count)
+			OBD_FREE(lsi->lsi_lmd->lmd_exclude,
+				 sizeof(lsi->lsi_lmd->lmd_exclude[0]) *
+				 lsi->lsi_lmd->lmd_exclude_count);
+		if (lsi->lsi_lmd->lmd_mgs != NULL)
+			OBD_FREE(lsi->lsi_lmd->lmd_mgs,
+				 strlen(lsi->lsi_lmd->lmd_mgs) + 1);
+		if (lsi->lsi_lmd->lmd_osd_type != NULL)
+			OBD_FREE(lsi->lsi_lmd->lmd_osd_type,
+				 strlen(lsi->lsi_lmd->lmd_osd_type) + 1);
+		if (lsi->lsi_lmd->lmd_params != NULL)
+			OBD_FREE(lsi->lsi_lmd->lmd_params, 4096);
+		if (lsi->lsi_lmd->lmd_nidnet != NULL)
+			OBD_FREE(lsi->lsi_lmd->lmd_nidnet,
+				 strlen(lsi->lsi_lmd->lmd_nidnet) + 1);
+
+		OBD_FREE_PTR(lsi->lsi_lmd);
+	}
+
+	LASSERT(lsi->lsi_llsbi == NULL);
+	OBD_FREE_PTR(lsi);
+	s2lsi_nocast(sb) = NULL;
+
+	RETURN(0);
+}
+
+/*
+ * The lsi has one reference for every server that is using the disk -
+ * e.g. MDT, MGS, and potentially MGC
+ */
+int lustre_put_lsi(struct super_block *sb)
+{
+	struct lustre_sb_info *lsi = s2lsi(sb);
+
+	ENTRY;
+
+	LASSERT(lsi != NULL);
+
+	CDEBUG(D_MOUNT, "put %p %d\n", sb, atomic_read(&lsi->lsi_mounts));
+	if (atomic_dec_and_test(&lsi->lsi_mounts)) {
+		if (IS_SERVER(lsi) && lsi->lsi_osd_exp) {
+			lu_device_put(&lsi->lsi_dt_dev->dd_lu_dev);
+			lsi->lsi_osd_exp->exp_obd->obd_lvfs_ctxt.dt = NULL;
+			lsi->lsi_dt_dev = NULL;
+			obd_disconnect(lsi->lsi_osd_exp);
+			/* wait till OSD is gone */
+			obd_zombie_barrier();
+		}
+		lustre_free_lsi(sb);
+		RETURN(1);
+	}
+	RETURN(0);
+}
+EXPORT_SYMBOL(lustre_put_lsi);
+
+/*
+ * The goal of this function is to extract the file system name
+ * from the OBD name. This can come in two flavors. One is
+ * fsname-MDTXXXX or fsname-XXXXXXX where X is a hexadecimal
+ * number. In both cases we should return fsname. If it is
+ * not a valid OBD name it is assumed to be the file system
+ * name itself.
+ */
+void obdname2fsname(const char *tgt, char *fsname, size_t buflen)
+{
+	const char *ptr;
+	const char *tmp;
+	size_t len = 0;
+
+	/*
+	 * First we have to see if the @tgt has '-' at all. It is
+	 * valid for the user to request something like
+	 * lctl set_param -P llite.lustre*.xattr_cache=0
+	 */
+	ptr = strrchr(tgt, '-');
+	if (!ptr) {
+		/* No '-' means it could end in '*' */
+		ptr = strchr(tgt, '*');
+		if (!ptr) {
+			/* No '*' either. Assume tgt = fsname */
+			len = strlen(tgt);
+			goto valid_obd_name;
+		}
+		len = ptr - tgt;
+		goto valid_obd_name;
+	}
+
+	/* tgt format fsname-MDT0000-* */
+	if ((!strncmp(ptr, "-MDT", 4) ||
+	     !strncmp(ptr, "-OST", 4)) &&
+	    (isxdigit(ptr[4]) && isxdigit(ptr[5]) &&
+	     isxdigit(ptr[6]) && isxdigit(ptr[7]))) {
+		len = ptr - tgt;
+		goto valid_obd_name;
+	}
+
+	/*
+	 * tgt format fsname-cli'dev'-'uuid', except for the llite case,
+	 * which is named fsname-'uuid'. Examples:
+	 *
+	 * lustre-clilov-ffff88104db5b800
+	 * lustre-ffff88104db5b800 (for llite device)
+	 *
+	 * The length of the OBD uuid can vary on different platforms.
+	 * This tests whether any invalid characters are in the string.
+	 * Allow wildcards with the '*' character.
+	 */
+	ptr++;
+	if (!strspn(ptr, "0123456789abcdefABCDEF*")) {
+		len = 0;
+		goto no_fsname;
+	}
+
+	/*
+	 * Now that we have validated the device name, let's extract the
+	 * file system name. Most of the names in this class will
+	 * have '-cli' in their name, which needs to be dropped. If
+	 * it doesn't have '-cli' then it's an llite device, for which
+	 * ptr already points to the start of the uuid string.
+	 */
+	tmp = strstr(tgt, "-cli");
+	if (tmp)
+		ptr = tmp;
+	else
+		ptr--;
+	len = ptr - tgt;
+valid_obd_name:
+	len = min_t(size_t, len, LUSTRE_MAXFSNAME);
+	snprintf(fsname, buflen, "%.*s", (int)len, tgt);
+no_fsname:
+	fsname[len] = '\0';
+}
+EXPORT_SYMBOL(obdname2fsname);
+
+/**
+ * *** SERVER NAME ***
+ * A server name is FSNAME, then SEPARATOR, then TYPE, then INDEX,
+ * e.g. "lustre-OST003F":
+ *
+ * FSNAME is between 1 and 8 characters (inclusive).
+ * Excluded characters are '/' and ':'
+ * SEPARATOR is either ':' or '-'
+ * TYPE: "OST", "MDT", etc.
+ * INDEX: Hex representation of the index
+ */
+
+/**
+ * Get the fsname ("lustre") from the server name ("lustre-OST003F").
+ * @param [in] svname server name including type and index
+ * @param [out] fsname Buffer to copy the filesystem name prefix into.
+ *  Must have at least 'strlen(fsname) + 1' chars.
+ * @param [out] endptr if endptr isn't NULL it is set to the end of the fsname
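+ *
+ * Example (illustrative): for svname "lustre-OST003F" this stores "lustre"
+ * in @fsname and leaves *endptr pointing at the "-OST003F" suffix.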
+ * rc < 0 on error
+ */
+int server_name2fsname(const char *svname, char *fsname, const char **endptr)
+{
+	const char *dash;
+
+	dash = svname + strnlen(svname, LUSTRE_MAXFSNAME);
+	for (; dash > svname && *dash != '-' && *dash != ':'; dash--)
+		;
+	if (dash == svname)
+		return -EINVAL;
+
+	if (fsname != NULL) {
+		strncpy(fsname, svname, dash - svname);
+		fsname[dash - svname] = '\0';
+	}
+
+	if (endptr != NULL)
+		*endptr = dash;
+
+	return 0;
+}
+EXPORT_SYMBOL(server_name2fsname);
+
+/**
+ * Get the service name (svname) from a string
+ * rc < 0 on error
+ * if endptr isn't NULL it is set to the end of the fsname
+ */
+int server_name2svname(const char *label, char *svname, const char **endptr,
+		       size_t svsize)
+{
+	int rc;
+	const char *dash;
+
+	/* We use server_name2fsname() just for parsing */
+	rc = server_name2fsname(label, NULL, &dash);
+	if (rc != 0)
+		return rc;
+
+	if (endptr != NULL)
+		*endptr = dash;
+
+	if (strlcpy(svname, dash + 1, svsize) >= svsize)
+		return -E2BIG;
+
+	return 0;
+}
+EXPORT_SYMBOL(server_name2svname);
+
+/**
+ * Check whether the server name is an OST.
+ **/
+int server_name_is_ost(const char *svname)
+{
+	const char *dash;
+	int rc;
+
+	/* We use server_name2fsname() just for parsing */
+	rc = server_name2fsname(svname, NULL, &dash);
+	if (rc != 0)
+		return rc;
+
+	dash++;
+
+	if (strncmp(dash, "OST", 3) == 0)
+		return 1;
+	return 0;
+}
+EXPORT_SYMBOL(server_name_is_ost);
+
+/**
+ * Get the index from the target name MDTXXXX/OSTXXXX
+ * rc = server type, or rc < 0 on error
+ **/
+int target_name2index(const char *tgtname, __u32 *idx, const char **endptr)
+{
+	const char *dash = tgtname;
+	unsigned long index;
+	int rc;
+
+	if (strncmp(dash, "MDT", 3) == 0)
+		rc = LDD_F_SV_TYPE_MDT;
+	else if (strncmp(dash, "OST", 3) == 0)
+		rc = LDD_F_SV_TYPE_OST;
+	else
+		return -EINVAL;
+
+	dash += 3;
+
+	if (strncmp(dash, "all", 3) == 0) {
+		if (endptr != NULL)
+			*endptr = dash + 3;
+		return rc | LDD_F_SV_ALL;
+	}
+
+	index = simple_strtoul(dash, (char **)endptr, 16);
+	if (idx != NULL)
+		*idx = index;
+
+	if (index > 0xffff)
+		return -ERANGE;
+
+	return rc;
+}
+EXPORT_SYMBOL(target_name2index);
+
+/*
+ * Get the index from the OBD name.
+ * rc = server type, or
+ * rc < 0 on error
+ * if endptr isn't NULL it is set to the end of the name
+ */
+int server_name2index(const char *svname, __u32 *idx, const char **endptr)
+{
+	const char *dash;
+	int rc;
+
+	/* We use server_name2fsname() just for parsing */
+	rc = server_name2fsname(svname, NULL, &dash);
+	if (rc != 0)
+		return rc;
+
+	dash++;
+	rc = target_name2index(dash, idx, endptr);
+	if (rc < 0)
+		return rc;
+
+	/* Account for -mdc after index that is possible when specifying mdt */
+	if (endptr != NULL && strncmp(LUSTRE_MDC_NAME, *endptr + 1,
+				      sizeof(LUSTRE_MDC_NAME)-1) == 0)
+		*endptr += sizeof(LUSTRE_MDC_NAME);
+
+	return rc;
+}
+EXPORT_SYMBOL(server_name2index);
+
+/*************** mount common between server and client ***************/
+
+/* Common umount */
+int lustre_common_put_super(struct super_block *sb)
+{
+	int rc;
+
+	ENTRY;
+
+	CDEBUG(D_MOUNT, "dropping sb %p\n", sb);
+
+	/* Drop a ref to the MGC */
+	rc = lustre_stop_mgc(sb);
+	if (rc && (rc != -ENOENT)) {
+		if (rc != -EBUSY) {
+			CERROR("Can't stop MGC: %d\n", rc);
+			RETURN(rc);
+		}
+		/*
+		 * BUSY just means that there's some other OBD that
+		 * needs the mgc. Let it clean up.
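+		 * (We still drop our own lsi reference below and return
+		 * the -EBUSY to the caller unchanged.)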
+ */ + CDEBUG(D_MOUNT, "MGC still in use\n"); + } + /* Drop a ref to the mounted disk */ + lustre_put_lsi(sb); + + RETURN(rc); +} +EXPORT_SYMBOL(lustre_common_put_super); + +static void lmd_print(struct lustre_mount_data *lmd) +{ + int i; + + PRINT_CMD(D_MOUNT, " mount data:\n"); + if (lmd_is_client(lmd)) + PRINT_CMD(D_MOUNT, "profile: %s\n", lmd->lmd_profile); + PRINT_CMD(D_MOUNT, "device: %s\n", lmd->lmd_dev); + PRINT_CMD(D_MOUNT, "flags: %x\n", lmd->lmd_flags); + + if (lmd->lmd_opts) + PRINT_CMD(D_MOUNT, "options: %s\n", lmd->lmd_opts); + + if (lmd->lmd_recovery_time_soft) + PRINT_CMD(D_MOUNT, "recovery time soft: %d\n", + lmd->lmd_recovery_time_soft); + + if (lmd->lmd_recovery_time_hard) + PRINT_CMD(D_MOUNT, "recovery time hard: %d\n", + lmd->lmd_recovery_time_hard); + + for (i = 0; i < lmd->lmd_exclude_count; i++) { + PRINT_CMD(D_MOUNT, "exclude %d: OST%04x\n", i, + lmd->lmd_exclude[i]); + } +} + +/* Is this server on the exclusion list */ +int lustre_check_exclusion(struct super_block *sb, char *svname) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct lustre_mount_data *lmd = lsi->lsi_lmd; + __u32 index; + int i, rc; + + ENTRY; + + rc = server_name2index(svname, &index, NULL); + if (rc != LDD_F_SV_TYPE_OST) + /* Only exclude OSTs */ + RETURN(0); + + CDEBUG(D_MOUNT, "Check exclusion %s (%d) in %d of %s\n", svname, + index, lmd->lmd_exclude_count, lmd->lmd_dev); + + for (i = 0; i < lmd->lmd_exclude_count; i++) { + if (index == lmd->lmd_exclude[i]) { + CWARN("Excluding %s (on exclusion list)\n", svname); + RETURN(1); + } + } + RETURN(0); +} + +/* mount -v -o exclude=lustre-OST0001:lustre-OST0002 -t lustre ... */ +static int lmd_make_exclusion(struct lustre_mount_data *lmd, const char *ptr) +{ + const char *s1 = ptr, *s2; + __u32 *exclude_list; + __u32 index = 0; + int rc = 0, devmax; + + ENTRY; + + /* + * The shortest an ost name can be is 8 chars: -OST0000. + * We don't actually know the fsname at this time, so in fact + * a user could specify any fsname. 
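+	 * (Worked example, hypothetical input: "lustre-OST0001:lustre-OST0002"
+	 * is 29 characters, so devmax = 29/8 + 1 = 4 slots, enough for the
+	 * two indices actually parsed out below.)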
+ */ + devmax = strlen(ptr) / 8 + 1; + + /* temp storage until we figure out how many we have */ + OBD_ALLOC_PTR_ARRAY(exclude_list, devmax); + if (!exclude_list) + RETURN(-ENOMEM); + + /* we enter this fn pointing at the '=' */ + while (*s1 && *s1 != ' ' && *s1 != ',') { + s1++; + rc = server_name2index(s1, &index, &s2); + if (rc < 0) { + CERROR("Can't parse server name '%s': rc = %d\n", + s1, rc); + break; + } + if (rc == LDD_F_SV_TYPE_OST) + exclude_list[lmd->lmd_exclude_count++] = index; + else + CDEBUG(D_MOUNT, "ignoring exclude %.*s: type = %#x\n", + (uint)(s2-s1), s1, rc); + s1 = s2; + /* + * now we are pointing at ':' (next exclude) + * or ',' (end of excludes) + */ + if (lmd->lmd_exclude_count >= devmax) + break; + } + if (rc >= 0) /* non-err */ + rc = 0; + + if (lmd->lmd_exclude_count) { + /* permanent, freed in lustre_free_lsi */ + OBD_ALLOC_PTR_ARRAY(lmd->lmd_exclude, + lmd->lmd_exclude_count); + if (lmd->lmd_exclude) { + memcpy(lmd->lmd_exclude, exclude_list, + sizeof(index) * lmd->lmd_exclude_count); + } else { + rc = -ENOMEM; + lmd->lmd_exclude_count = 0; + } + } + OBD_FREE_PTR_ARRAY(exclude_list, devmax); + RETURN(rc); +} + +static int lmd_parse_mgssec(struct lustre_mount_data *lmd, char *ptr) +{ + char *tail; + int length; + + if (lmd->lmd_mgssec != NULL) { + OBD_FREE(lmd->lmd_mgssec, strlen(lmd->lmd_mgssec) + 1); + lmd->lmd_mgssec = NULL; + } + + tail = strchr(ptr, ','); + if (tail == NULL) + length = strlen(ptr); + else + length = tail - ptr; + + OBD_ALLOC(lmd->lmd_mgssec, length + 1); + if (lmd->lmd_mgssec == NULL) + return -ENOMEM; + + memcpy(lmd->lmd_mgssec, ptr, length); + lmd->lmd_mgssec[length] = '\0'; + return 0; +} + +static int lmd_parse_network(struct lustre_mount_data *lmd, char *ptr) +{ + char *tail; + int length; + + if (lmd->lmd_nidnet != NULL) { + OBD_FREE(lmd->lmd_nidnet, strlen(lmd->lmd_nidnet) + 1); + lmd->lmd_nidnet = NULL; + } + + tail = strchr(ptr, ','); + if (tail == NULL) + length = strlen(ptr); + else + length = tail - ptr; + + OBD_ALLOC(lmd->lmd_nidnet, length + 1); + if (lmd->lmd_nidnet == NULL) + return -ENOMEM; + + memcpy(lmd->lmd_nidnet, ptr, length); + lmd->lmd_nidnet[length] = '\0'; + return 0; +} + +static int lmd_parse_string(char **handle, char *ptr) +{ + char *tail; + int length; + + if ((handle == NULL) || (ptr == NULL)) + return -EINVAL; + + if (*handle != NULL) { + OBD_FREE(*handle, strlen(*handle) + 1); + *handle = NULL; + } + + tail = strchr(ptr, ','); + if (tail == NULL) + length = strlen(ptr); + else + length = tail - ptr; + + OBD_ALLOC(*handle, length + 1); + if (*handle == NULL) + return -ENOMEM; + + memcpy(*handle, ptr, length); + (*handle)[length] = '\0'; + + return 0; +} + +/* Collect multiple values for mgsnid specifiers */ +static int lmd_parse_mgs(struct lustre_mount_data *lmd, char **ptr) +{ + lnet_nid_t nid; + char *tail = *ptr; + char *mgsnid; + int length; + int oldlen = 0; + + /* Find end of NID-list */ + while (class_parse_nid_quiet(tail, &nid, &tail) == 0) + ; /* do nothing */ + + length = tail - *ptr; + if (length == 0) { + LCONSOLE_ERROR_MSG(0x159, "Can't parse NID '%s'\n", *ptr); + return -EINVAL; + } + + if (lmd->lmd_mgs != NULL) + oldlen = strlen(lmd->lmd_mgs) + 1; + + OBD_ALLOC(mgsnid, oldlen + length + 1); + if (mgsnid == NULL) + return -ENOMEM; + + if (lmd->lmd_mgs != NULL) { + /* Multiple mgsnid= are taken to mean failover locations */ + memcpy(mgsnid, lmd->lmd_mgs, oldlen); + mgsnid[oldlen - 1] = ':'; + OBD_FREE(lmd->lmd_mgs, oldlen); + } + memcpy(mgsnid + oldlen, *ptr, length); + mgsnid[oldlen + length] 
= '\0'; + lmd->lmd_mgs = mgsnid; + *ptr = tail; + + return 0; +} + +/** + * Find the first delimiter (comma or colon) from the specified \a buf and + * make \a *endh point to the string starting with the delimiter. The commas + * in expression list [...] will be skipped. + * + * @buf a delimiter-separated string + * @endh a pointer to a pointer that will point to the string + * starting with the delimiter + * + * RETURNS true if delimiter is found, false if delimiter is not found + */ +static bool lmd_find_delimiter(char *buf, char **endh) +{ + char *c = buf; + size_t pos; + bool found; + + if (!buf) + return false; +try_again: + if (*c == ',' || *c == ':') + return true; + + pos = strcspn(c, "[:,]"); + if (!pos) + return false; + + /* Not a valid mount string */ + if (*c == ']') { + CWARN("invalid mount string format\n"); + return false; + } + + c += pos; + if (*c == '[') { + c = strchr(c, ']'); + + /* invalid mount string */ + if (!c) { + CWARN("invalid mount string format\n"); + return false; + } + c++; + goto try_again; + } + + found = *c != '\0'; + if (found && endh) + *endh = c; + + return found; +} + +/** + * Find the first valid string delimited by comma or colon from the specified + * \a buf and parse it to see whether it's a valid NID list. If yes, \a *endh + * will point to the next string starting with the delimiter. + * + * \param[in] buf a delimiter-separated string + * \param[in] endh a pointer to a pointer that will point to the string + * starting with the delimiter + * + * \retval 0 if the string is a valid NID list + * \retval 1 if the string is not a valid NID list + */ +static int lmd_parse_nidlist(char *buf, char **endh) +{ + LIST_HEAD(nidlist); + char *endp = buf; + char tmp; + int rc = 0; + + if (buf == NULL) + return 1; + while (*buf == ',' || *buf == ':') + buf++; + if (*buf == ' ' || *buf == '/' || *buf == '\0') + return 1; + + if (!lmd_find_delimiter(buf, &endp)) + endp = buf + strlen(buf); + + tmp = *endp; + *endp = '\0'; + + if (cfs_parse_nidlist(buf, strlen(buf), &nidlist) <= 0) + rc = 1; + cfs_free_nidlist(&nidlist); + + *endp = tmp; + if (rc != 0) + return rc; + if (endh != NULL) + *endh = endp; + return 0; +} + +/** + * Parse mount line options + * e.g. mount -v -t lustre -o abort_recov uml1:uml2:/lustre-client /mnt/lustre + * dev is passed as device=uml1:/lustre by mount.lustre_tgt + */ +int lmd_parse(char *options, struct lustre_mount_data *lmd) +{ + char *s1, *s2, *devname = NULL; + struct lustre_mount_data *raw = (struct lustre_mount_data *)options; + int rc = 0; + + ENTRY; + + LASSERT(lmd); + if (!options) { + LCONSOLE_ERROR_MSG(0x162, + "Missing mount data: check /sbin/mount.lustre_tgt is installed.\n"); + RETURN(-EINVAL); + } + + /* Options should be a string - try to detect old lmd data */ + if ((raw->lmd_magic & 0xffffff00) == (LMD_MAGIC & 0xffffff00)) { + LCONSOLE_ERROR_MSG(0x163, + "Using an old version of /sbin/mount.lustre. Please install version %s\n", + LUSTRE_VERSION_STRING); + RETURN(-EINVAL); + } + lmd->lmd_magic = LMD_MAGIC; + + OBD_ALLOC(lmd->lmd_params, LMD_PARAMS_MAXLEN); + if (lmd->lmd_params == NULL) + RETURN(-ENOMEM); + lmd->lmd_params[0] = '\0'; + + /* Set default flags here */ + + s1 = options; + while (*s1) { + int clear = 0; + int time_min = OBD_RECOVERY_TIME_MIN; + char *s3; + + /* Skip whitespace and extra commas */ + while (*s1 == ' ' || *s1 == ',') + s1++; + s3 = s1; + + /* + * Client options are parsed in ll_options: eg. flock, + * user_xattr, acl + */ + + /* + * Parse non-ldiskfs options here. 
Rather than modifying
+		 * ldiskfs, we just zero these out here
+		 */
+		if (strncmp(s1, "abort_recov", 11) == 0) {
+			lmd->lmd_flags |= LMD_FLG_ABORT_RECOV;
+			clear++;
+		} else if (strncmp(s1, "abort_recov_mdt", 15) == 0) {
+			lmd->lmd_flags |= LMD_FLG_ABORT_RECOV_MDT;
+			clear++;
+		} else if (strncmp(s1, "recovery_time_soft=", 19) == 0) {
+			lmd->lmd_recovery_time_soft =
+				max_t(int, simple_strtoul(s1 + 19, NULL, 10),
+				      time_min);
+			clear++;
+		} else if (strncmp(s1, "recovery_time_hard=", 19) == 0) {
+			lmd->lmd_recovery_time_hard =
+				max_t(int, simple_strtoul(s1 + 19, NULL, 10),
+				      time_min);
+			clear++;
+		} else if (strncmp(s1, "no_precreate", 12) == 0) {
+			lmd->lmd_flags |= LMD_FLG_NO_PRECREATE;
+			clear++;
+		} else if (strncmp(s1, "noir", 4) == 0) {
+			lmd->lmd_flags |= LMD_FLG_NOIR; /* test purpose only. */
+			clear++;
+		} else if (strncmp(s1, "nosvc", 5) == 0) {
+			lmd->lmd_flags |= LMD_FLG_NOSVC;
+			clear++;
+		} else if (strncmp(s1, "nomgs", 5) == 0) {
+			lmd->lmd_flags |= LMD_FLG_NOMGS;
+			clear++;
+		} else if (strncmp(s1, "noscrub", 7) == 0) {
+			lmd->lmd_flags |= LMD_FLG_NOSCRUB;
+			clear++;
+		} else if (strncmp(s1, "skip_lfsck", 10) == 0) {
+			lmd->lmd_flags |= LMD_FLG_SKIP_LFSCK;
+			clear++;
+		} else if (strncmp(s1, "rdonly_dev", 10) == 0) {
+			lmd->lmd_flags |= LMD_FLG_DEV_RDONLY;
+			clear++;
+		} else if (strncmp(s1, PARAM_MGSNODE,
+				   sizeof(PARAM_MGSNODE) - 1) == 0) {
+			s2 = s1 + sizeof(PARAM_MGSNODE) - 1;
+			/*
+			 * Assume the next mount opt is the first
+			 * invalid NID we get to.
+			 */
+			rc = lmd_parse_mgs(lmd, &s2);
+			if (rc)
+				goto invalid;
+			s3 = s2;
+			clear++;
+		} else if (strncmp(s1, "writeconf", 9) == 0) {
+			lmd->lmd_flags |= LMD_FLG_WRITECONF;
+			clear++;
+		} else if (strncmp(s1, "nolocallogs", 11) == 0) {
+			lmd->lmd_flags |= LMD_FLG_NO_LOCAL_LOGS;
+			clear++;
+		} else if (strncmp(s1, "update", 6) == 0) {
+			lmd->lmd_flags |= LMD_FLG_UPDATE;
+			clear++;
+		} else if (strncmp(s1, "virgin", 6) == 0) {
+			lmd->lmd_flags |= LMD_FLG_VIRGIN;
+			clear++;
+		} else if (strncmp(s1, "noprimnode", 10) == 0) {
+			lmd->lmd_flags |= LMD_FLG_NO_PRIMNODE;
+			clear++;
+		} else if (strncmp(s1, "mgssec=", 7) == 0) {
+			rc = lmd_parse_mgssec(lmd, s1 + 7);
+			if (rc)
+				goto invalid;
+			clear++;
+			/* ost exclusion list */
+		} else if (strncmp(s1, "exclude=", 8) == 0) {
+			rc = lmd_make_exclusion(lmd, s1 + 7);
+			if (rc)
+				goto invalid;
+			clear++;
+		} else if (strncmp(s1, "mgs", 3) == 0) {
+			/* We are an MGS */
+			lmd->lmd_flags |= LMD_FLG_MGS;
+			clear++;
+		} else if (strncmp(s1, "svname=", 7) == 0) {
+			rc = lmd_parse_string(&lmd->lmd_profile, s1 + 7);
+			if (rc)
+				goto invalid;
+			clear++;
+		} else if (strncmp(s1, "param=", 6) == 0) {
+			size_t length, params_length;
+			char *tail = s1;
+
+			if (lmd_find_delimiter(s1 + 6, &tail)) {
+				char *param_str = tail + 1;
+				int supplementary = 1;
+
+				while (lmd_parse_nidlist(param_str,
+							 &param_str) == 0) {
+					supplementary = 0;
+				}
+				length = param_str - s1 - supplementary;
+			} else {
+				length = strlen(s1);
+			}
+			length -= 6;
+			params_length = strlen(lmd->lmd_params);
+			if (params_length + length + 1 >= LMD_PARAMS_MAXLEN)
+				RETURN(-E2BIG);
+			strncat(lmd->lmd_params, s1 + 6, length);
+			lmd->lmd_params[params_length + length] = '\0';
+			strlcat(lmd->lmd_params, " ", LMD_PARAMS_MAXLEN);
+			s3 = s1 + 6 + length;
+			clear++;
+		} else if (strncmp(s1, "localrecov", 10) == 0) {
+			lmd->lmd_flags |= LMD_FLG_LOCAL_RECOV;
+			clear++;
+		} else if (strncmp(s1, "osd=", 4) == 0) {
+			rc = lmd_parse_string(&lmd->lmd_osd_type, s1 + 4);
+			if (rc)
+				goto invalid;
+			clear++;
+		}
+		/*
+		 * Linux 2.4 doesn't pass the device,
so we stuck it at + * the end of the options. + */ + else if (strncmp(s1, "device=", 7) == 0) { + devname = s1 + 7; + /* + * terminate options right before device. device + * must be the last one. + */ + *s1 = '\0'; + break; + } else if (strncmp(s1, "network=", 8) == 0) { + rc = lmd_parse_network(lmd, s1 + 8); + if (rc) + goto invalid; + + /* check if LNet dynamic peer discovery is activated */ + if (LNetGetPeerDiscoveryStatus()) { + CERROR("LNet Dynamic Peer Discovery is enabled " + "on this node. 'network' mount option " + "cannot be taken into account.\n"); + goto invalid; + } + + clear++; + } + + /* Find next opt */ + s2 = strchr(s3, ','); + if (s2 == NULL) { + if (clear) + *s1 = '\0'; + break; + } + s2++; + if (clear) + memmove(s1, s2, strlen(s2) + 1); + else + s1 = s2; + } + + if (!devname) { + LCONSOLE_ERROR_MSG(0x164, + "Can't find device name (need mount option 'device=...')\n"); + goto invalid; + } + + s1 = strstr(devname, ":/"); + if (s1) { + ++s1; + lmd->lmd_flags |= LMD_FLG_CLIENT; + /* Remove leading /s from fsname */ + while (*++s1 == '/') + ; + s2 = s1; + while (*s2 != '/' && *s2 != '\0') + s2++; + /* Freed in lustre_free_lsi */ + OBD_ALLOC(lmd->lmd_profile, s2 - s1 + 8); + if (!lmd->lmd_profile) + RETURN(-ENOMEM); + + strncat(lmd->lmd_profile, s1, s2 - s1); + strncat(lmd->lmd_profile, "-client", 7); + + s1 = s2; + s2 = s1 + strlen(s1) - 1; + /* Remove padding /s from fileset */ + while (*s2 == '/') + s2--; + if (s2 > s1) { + OBD_ALLOC(lmd->lmd_fileset, s2 - s1 + 2); + if (lmd->lmd_fileset == NULL) { + OBD_FREE(lmd->lmd_profile, + strlen(lmd->lmd_profile) + 1); + RETURN(-ENOMEM); + } + strncat(lmd->lmd_fileset, s1, s2 - s1 + 1); + } + } else { + /* server mount */ + if (lmd->lmd_nidnet != NULL) { + /* 'network=' mount option forbidden for server */ + OBD_FREE(lmd->lmd_nidnet, strlen(lmd->lmd_nidnet) + 1); + lmd->lmd_nidnet = NULL; + rc = -EINVAL; + CERROR( + "%s: option 'network=' not allowed for Lustre servers: rc = %d\n", + devname, rc); + RETURN(rc); + } + } + + /* Freed in lustre_free_lsi */ + OBD_ALLOC(lmd->lmd_dev, strlen(devname) + 1); + if (!lmd->lmd_dev) + RETURN(-ENOMEM); + strncpy(lmd->lmd_dev, devname, strlen(devname)+1); + + /* Save mount options */ + s1 = options + strlen(options) - 1; + while (s1 >= options && (*s1 == ',' || *s1 == ' ')) + *s1-- = 0; + while (*options && (*options == ',' || *options == ' ')) + options++; + if (*options != 0) { + /* Freed in lustre_free_lsi */ + OBD_ALLOC(lmd->lmd_opts, strlen(options) + 1); + if (!lmd->lmd_opts) + RETURN(-ENOMEM); + strncpy(lmd->lmd_opts, options, strlen(options)+1); + } + + lmd_print(lmd); + lmd->lmd_magic = LMD_MAGIC; + + RETURN(rc); + +invalid: + CERROR("Bad mount options %s\n", options); + RETURN(-EINVAL); +} +EXPORT_SYMBOL(lmd_parse); + +#ifdef HAVE_SERVER_SUPPORT +/** + * This is the entry point for the mount call into Lustre. + * This is called when a server target is mounted, + * and this is where we start setting things up. + * @param data Mount options (e.g. -o flock,abort_recov) + */ +static int lustre_tgt_fill_super(struct super_block *sb, void *lmd2_data, + int silent) +{ + struct lustre_mount_data *lmd; + struct lustre_sb_info *lsi; + int rc; + + ENTRY; + + CDEBUG(D_MOUNT|D_VFSTRACE, "VFS Op: sb %p\n", sb); + + lsi = lustre_init_lsi(sb); + if (!lsi) + RETURN(-ENOMEM); + lmd = lsi->lsi_lmd; + + /* + * Disable lockdep during mount, because mount locking patterns are + * 'special'. + */ + lockdep_off(); + + /* + * LU-639: the OBD cleanup of last mount may not finish yet, wait here. 
+	 */
+	obd_zombie_barrier();
+
+	/* Figure out the lmd from the mount options */
+	if (lmd_parse(lmd2_data, lmd)) {
+		lustre_put_lsi(sb);
+		GOTO(out, rc = -EINVAL);
+	}
+
+	if (lmd_is_client(lmd)) {
+		rc = -ENODEV;
+		CERROR("%s: attempting to mount a client with '-t lustre_tgt', which is only for server-side mounts: rc = %d\n",
+		       lmd->lmd_dev, rc);
+		lustre_put_lsi(sb);
+		GOTO(out, rc);
+	}
+
+	CDEBUG(D_MOUNT, "Mounting server from %s\n", lmd->lmd_dev);
+	rc = server_fill_super(sb);
+	/*
+	 * server_fill_super calls lustre_start_mgc after the mount
+	 * because we need the MGS NIDs which are stored on disk.
+	 * Plus, we may need to start the MGS first.
+	 *
+	 * server_fill_super will call server_put_super on failure
+	 *
+	 * If an error happens in the fill_super() call, @lsi will be killed
+	 * there. This is why we do not put it here.
+	 */
+out:
+	if (rc) {
+		CERROR("Unable to mount %s (%d)\n",
+		       s2lsi(sb) ? lmd->lmd_dev : "", rc);
+	} else {
+		CDEBUG(D_SUPER, "Mount %s complete\n",
+		       lmd->lmd_dev);
+	}
+	lockdep_on();
+	return rc;
+}
+
+/***************** FS registration ******************/
+static struct dentry *lustre_tgt_mount(struct file_system_type *fs_type,
+				       int flags, const char *devname,
+				       void *data)
+{
+	return mount_nodev(fs_type, flags, data, lustre_tgt_fill_super);
+}
+
+/* Register the "lustre_tgt" fs type.
+ *
+ * Right now this isn't any different than the normal "lustre" filesystem
+ * type, but it is added so that there is some compatibility to allow
+ * changing documentation and scripts to start using the "lustre_tgt" type
+ * at mount time. That will simplify test interop, and help in case of
+ * upgrades that change to the new type and then need to roll back for
+ * some reason.
+ *
+ * The long-term goal is to disentangle the client and server mount code.
+ */
+static struct file_system_type lustre_tgt_fstype = {
+	.owner		= THIS_MODULE,
+	.name		= "lustre_tgt",
+	.mount		= lustre_tgt_mount,
+	.kill_sb	= kill_anon_super,
+	.fs_flags	= FS_REQUIRES_DEV | FS_RENAME_DOES_D_MOVE,
+};
+MODULE_ALIAS_FS("lustre_tgt");
+
+int lustre_tgt_register_fs(void)
+{
+	return register_filesystem(&lustre_tgt_fstype);
+}
+
+void lustre_tgt_unregister_fs(void)
+{
+	unregister_filesystem(&lustre_tgt_fstype);
+}
+
+#endif /* HAVE_SERVER_SUPPORT */
diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c
new file mode 100644
index 0000000000000..a175ebe7f1af1
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c
@@ -0,0 +1,2112 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ * + * Copyright (c) 2013, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/obd_mount_server.c + * + * Server mount routines + * + * Author: Nathan Rutman + */ + + +#define DEBUG_SUBSYSTEM S_CLASS +#define D_MOUNT (D_SUPER | D_CONFIG /* | D_WARNING */) +#define PRINT_CMD CDEBUG +#define PRINT_MASK (D_SUPER | D_CONFIG) + +#include +#ifdef HAVE_LINUX_SELINUX_IS_ENABLED +#include +#endif +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +/*********** mount lookup *********/ + +static DEFINE_MUTEX(lustre_mount_info_lock); +static LIST_HEAD(server_mount_info_list); + +static struct lustre_mount_info *server_find_mount(const char *name) +{ + struct list_head *tmp; + struct lustre_mount_info *lmi; + ENTRY; + + list_for_each(tmp, &server_mount_info_list) { + lmi = list_entry(tmp, struct lustre_mount_info, + lmi_list_chain); + if (strcmp(name, lmi->lmi_name) == 0) + RETURN(lmi); + } + RETURN(NULL); +} + +/* we must register an obd for a mount before we call the setup routine. + *_setup will call lustre_get_mount to get the mnt struct + by obd_name, since we can't pass the pointer to setup. */ +static int server_register_mount(const char *name, struct super_block *sb) +{ + struct lustre_mount_info *lmi; + char *name_cp; + ENTRY; + + LASSERT(sb); + + OBD_ALLOC(lmi, sizeof(*lmi)); + if (!lmi) + RETURN(-ENOMEM); + OBD_ALLOC(name_cp, strlen(name) + 1); + if (!name_cp) { + OBD_FREE(lmi, sizeof(*lmi)); + RETURN(-ENOMEM); + } + strcpy(name_cp, name); + + mutex_lock(&lustre_mount_info_lock); + + if (server_find_mount(name)) { + mutex_unlock(&lustre_mount_info_lock); + OBD_FREE(lmi, sizeof(*lmi)); + OBD_FREE(name_cp, strlen(name) + 1); + CERROR("Already registered %s\n", name); + RETURN(-EEXIST); + } + lmi->lmi_name = name_cp; + lmi->lmi_sb = sb; + list_add(&lmi->lmi_list_chain, &server_mount_info_list); + + mutex_unlock(&lustre_mount_info_lock); + + CDEBUG(D_MOUNT, "register mount %p from %s\n", sb, name); + + RETURN(0); +} + +/* when an obd no longer needs a mount */ +static int server_deregister_mount(const char *name) +{ + struct lustre_mount_info *lmi; + ENTRY; + + mutex_lock(&lustre_mount_info_lock); + lmi = server_find_mount(name); + if (!lmi) { + mutex_unlock(&lustre_mount_info_lock); + CERROR("%s not registered\n", name); + RETURN(-ENOENT); + } + + CDEBUG(D_MOUNT, "deregister mount %p from %s\n", lmi->lmi_sb, name); + + OBD_FREE(lmi->lmi_name, strlen(lmi->lmi_name) + 1); + list_del(&lmi->lmi_list_chain); + OBD_FREE(lmi, sizeof(*lmi)); + mutex_unlock(&lustre_mount_info_lock); + + OBD_RACE(OBD_FAIL_MDS_LLOG_UMOUNT_RACE); + RETURN(0); +} + +/* obd's look up a registered mount using their obdname. This is just + for initial obd setup to find the mount struct. It should not be + called every time you want to mntget. 
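+
+   (Usage sketch, from this file: an obd calls server_get_mount(name) once
+   during setup, which bumps lsi_mounts, and balances it from its cleanup
+   method with server_put_mount(name, true).)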
*/
+struct lustre_mount_info *server_get_mount(const char *name)
+{
+	struct lustre_mount_info *lmi;
+	struct lustre_sb_info *lsi;
+	ENTRY;
+
+	mutex_lock(&lustre_mount_info_lock);
+	lmi = server_find_mount(name);
+	mutex_unlock(&lustre_mount_info_lock);
+	if (!lmi) {
+		CERROR("Can't find mount for %s\n", name);
+		RETURN(NULL);
+	}
+	lsi = s2lsi(lmi->lmi_sb);
+
+	atomic_inc(&lsi->lsi_mounts);
+
+	CDEBUG(D_MOUNT, "get mount %p from %s, refs=%d\n", lmi->lmi_sb,
+	       name, atomic_read(&lsi->lsi_mounts));
+
+	RETURN(lmi);
+}
+EXPORT_SYMBOL(server_get_mount);
+
+/**
+ * server_put_mount: to be called from obd_cleanup methods
+ * @name: obd name
+ * @dereg_mnt: 0 or 1 depending on whether the mount is to be deregistered or
+ * not
+ *
+ * The caller decides whether server_deregister_mount() needs to be called or
+ * not. Calling of server_deregister_mount() does not depend on refcounting on
+ * lsi because we could have, say, the mgs and mds on the same node, and we
+ * unmount the mds; then the ref on the lsi would still be non-zero, but we
+ * would still want to deregister the mds mount.
+ */
+int server_put_mount(const char *name, bool dereg_mnt)
+{
+	struct lustre_mount_info *lmi;
+	struct lustre_sb_info *lsi;
+	ENTRY;
+
+	mutex_lock(&lustre_mount_info_lock);
+	lmi = server_find_mount(name);
+	mutex_unlock(&lustre_mount_info_lock);
+	if (!lmi) {
+		CERROR("Can't find mount for %s\n", name);
+		RETURN(-ENOENT);
+	}
+	lsi = s2lsi(lmi->lmi_sb);
+
+	CDEBUG(D_MOUNT, "put mount %p from %s, refs=%d\n",
+	       lmi->lmi_sb, name, atomic_read(&lsi->lsi_mounts));
+
+	if (lustre_put_lsi(lmi->lmi_sb))
+		CDEBUG(D_MOUNT, "Last put of mount %p from %s\n",
+		       lmi->lmi_sb, name);
+
+	if (dereg_mnt)
+		/* this obd should never need the mount again */
+		server_deregister_mount(name);
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(server_put_mount);
+
+/* Set up an MGS to serve startup logs */
+static int server_start_mgs(struct super_block *sb)
+{
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	struct lustre_mount_info *lmi;
+	int rc = 0;
+	ENTRY;
+
+	/* It is impossible to have more than 1 MGS per node, since
+	   MGC wouldn't know which to connect to */
+	lmi = server_find_mount(LUSTRE_MGS_OBDNAME);
+	if (lmi) {
+		lsi = s2lsi(lmi->lmi_sb);
+		LCONSOLE_ERROR_MSG(0x15d,
+				   "The MGS service was already started from server\n");
+		RETURN(-EALREADY);
+	}
+
+	CDEBUG(D_CONFIG, "Start MGS service %s\n", LUSTRE_MGS_OBDNAME);
+
+	rc = server_register_mount(LUSTRE_MGS_OBDNAME, sb);
+
+	if (!rc) {
+		rc = lustre_start_simple(LUSTRE_MGS_OBDNAME, LUSTRE_MGS_NAME,
+					 LUSTRE_MGS_OBDNAME, NULL, NULL,
+					 lsi->lsi_osd_obdname, NULL);
+		/* server_deregister_mount() is not called before this point
+		 * because lsi and other data can't be freed cleanly when the
+		 * mgs calls server_put_mount() in the error handling case
+		 * (see b=17758); that problem is caused by a bug in mgs_init0,
+		 * which forgets to call server_put_mount in the error case. */
+
+		if (rc)
+			server_deregister_mount(LUSTRE_MGS_OBDNAME);
+	}
+
+	if (rc)
+		LCONSOLE_ERROR_MSG(0x15e, "Failed to start MGS '%s' (%d). "
+				   "Is the 'mgs' module loaded?\n",
+				   LUSTRE_MGS_OBDNAME, rc);
+	RETURN(rc);
+}
+
+static int server_stop_mgs(struct super_block *sb)
+{
+	struct obd_device *obd;
+	int rc;
+	struct lustre_mount_info *lmi;
+	ENTRY;
+
+	/* Do not stop MGS if this device is not the running MGT */
+	lmi = server_find_mount(LUSTRE_MGS_OBDNAME);
+	if (lmi != NULL && lmi->lmi_sb != sb)
+		RETURN(0);
+
+	CDEBUG(D_MOUNT, "Stop MGS service %s\n", LUSTRE_MGS_OBDNAME);
+
+	/* There better be only one MGS */
+	obd = class_name2obd(LUSTRE_MGS_OBDNAME);
+	if (!obd) {
+		CDEBUG(D_CONFIG, "mgs %s not running\n", LUSTRE_MGS_OBDNAME);
+		RETURN(-EALREADY);
+	}
+
+	/* The MGS should always stop when we say so */
+	obd->obd_force = 1;
+	rc = class_manual_cleanup(obd);
+	RETURN(rc);
+}
+
+/* Since there's only one mgc per node, we have to change its fs to get
+   access to the right disk. */
+static int server_mgc_set_fs(const struct lu_env *env,
+			     struct obd_device *mgc, struct super_block *sb)
+{
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	int rc;
+	ENTRY;
+
+	CDEBUG(D_MOUNT, "Set mgc disk for %s\n", lsi->lsi_lmd->lmd_dev);
+
+	/* cl_mgc_sem in mgc ensures we sleep if the mgc_fs is busy */
+	rc = obd_set_info_async(env, mgc->obd_self_export,
+				sizeof(KEY_SET_FS), KEY_SET_FS,
+				sizeof(*sb), sb, NULL);
+	if (rc != 0)
+		CERROR("can't set_fs %d\n", rc);
+
+	RETURN(rc);
+}
+
+static int server_mgc_clear_fs(const struct lu_env *env,
+			       struct obd_device *mgc)
+{
+	int rc;
+	ENTRY;
+
+	CDEBUG(D_MOUNT, "Unassign mgc disk\n");
+
+	rc = obd_set_info_async(env, mgc->obd_self_export,
+				sizeof(KEY_CLEAR_FS), KEY_CLEAR_FS,
+				0, NULL, NULL);
+	RETURN(rc);
+}
+
+static inline bool is_mdc_device(const char *devname)
+{
+	char *ptr;
+
+	ptr = strrchr(devname, '-');
+	return ptr != NULL && strcmp(ptr, "-mdc") == 0;
+}
+
+static inline bool tgt_is_mdt(const char *tgtname, __u32 *idx)
+{
+	int type;
+
+	type = server_name2index(tgtname, idx, NULL);
+
+	return type == LDD_F_SV_TYPE_MDT;
+}
+
+/**
+ * Convert an OST/MDT name (fsname-{MDT,OST}xxxx) to the lwp name for the
+ * given @idx yyyy (fsname-MDTyyyy-lwp-{MDT,OST}xxxx)
+ **/
+int tgt_name2lwp_name(const char *tgt_name, char *lwp_name, int len, __u32 idx)
+{
+	char *fsname;
+	const char *tgt;
+	int rc;
+	ENTRY;
+
+	OBD_ALLOC(fsname, MTI_NAME_MAXLEN);
+	if (fsname == NULL)
+		RETURN(-ENOMEM);
+
+	rc = server_name2fsname(tgt_name, fsname, &tgt);
+	if (rc != 0) {
+		CERROR("%s: failed to get fsname from tgt_name: rc = %d\n",
+		       tgt_name, rc);
+		GOTO(cleanup, rc);
+	}
+
+	if (*tgt != '-' && *tgt != ':') {
+		CERROR("%s: invalid target name!\n", tgt_name);
+		GOTO(cleanup, rc = -EINVAL);
+	}
+
+	tgt++;
+	if (strncmp(tgt, "OST", 3) != 0 && strncmp(tgt, "MDT", 3) != 0) {
+		CERROR("%s is not an OST or MDT target!\n", tgt_name);
+		GOTO(cleanup, rc = -EINVAL);
+	}
+	snprintf(lwp_name, len, "%s-MDT%04x-%s-%s",
+		 fsname, idx, LUSTRE_LWP_NAME, tgt);
+
+	GOTO(cleanup, rc = 0);
+
+cleanup:
+	if (fsname != NULL)
+		OBD_FREE(fsname, MTI_NAME_MAXLEN);
+
+	return rc;
+}
+EXPORT_SYMBOL(tgt_name2lwp_name);
+
+static LIST_HEAD(lwp_register_list);
+static DEFINE_SPINLOCK(lwp_register_list_lock);
+
+static void lustre_put_lwp_item(struct lwp_register_item *lri)
+{
+	if (atomic_dec_and_test(&lri->lri_ref)) {
+		LASSERT(list_empty(&lri->lri_list));
+
+		if (*lri->lri_exp != NULL)
+			class_export_put(*lri->lri_exp);
+		OBD_FREE_PTR(lri);
+	}
+}
+
+int lustre_register_lwp_item(const char *lwpname, struct obd_export **exp,
+			     register_lwp_cb cb_func, void *cb_data)
+{
+	struct obd_device *lwp;
+	struct lwp_register_item *lri;
+	bool cb = false;
+ ENTRY; + + LASSERTF(strlen(lwpname) < MTI_NAME_MAXLEN, "lwpname is too long %s\n", + lwpname); + LASSERT(exp != NULL && *exp == NULL); + + OBD_ALLOC_PTR(lri); + if (lri == NULL) + RETURN(-ENOMEM); + + lwp = class_name2obd(lwpname); + if (lwp != NULL && lwp->obd_set_up == 1) { + struct obd_uuid *uuid; + + OBD_ALLOC_PTR(uuid); + if (uuid == NULL) { + OBD_FREE_PTR(lri); + RETURN(-ENOMEM); + } + memcpy(uuid->uuid, lwpname, strlen(lwpname)); + *exp = obd_uuid_lookup(lwp, uuid); + OBD_FREE_PTR(uuid); + } + + memcpy(lri->lri_name, lwpname, strlen(lwpname)); + lri->lri_exp = exp; + lri->lri_cb_func = cb_func; + lri->lri_cb_data = cb_data; + INIT_LIST_HEAD(&lri->lri_list); + /* + * Initialize the lri_ref at 2, one will be released before + * current function returned via lustre_put_lwp_item(), the + * other will be released in lustre_deregister_lwp_item(). + */ + atomic_set(&lri->lri_ref, 2); + + spin_lock(&lwp_register_list_lock); + list_add(&lri->lri_list, &lwp_register_list); + if (*exp != NULL) + cb = true; + spin_unlock(&lwp_register_list_lock); + + if (cb && cb_func != NULL) + cb_func(cb_data); + lustre_put_lwp_item(lri); + + RETURN(0); +} +EXPORT_SYMBOL(lustre_register_lwp_item); + +void lustre_deregister_lwp_item(struct obd_export **exp) +{ + struct lwp_register_item *lri; + bool removed = false; + int repeat = 0; + + spin_lock(&lwp_register_list_lock); + list_for_each_entry(lri, &lwp_register_list, lri_list) { + if (exp == lri->lri_exp) { + list_del_init(&lri->lri_list); + removed = true; + break; + } + } + spin_unlock(&lwp_register_list_lock); + + if (!removed) + return; + + /* See lustre_notify_lwp_list(), in some extreme race conditions, + * the notify callback could be still on the fly, we need to wait + * for the callback done before moving on to free the data used + * by callback. 
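+	 * (Sketch of the wait below: the loop polls lri_ref once per second
+	 * via schedule_timeout_interruptible(cfs_time_seconds(1)) until only
+	 * our own reference is left, then drops it.)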
*/ + while (atomic_read(&lri->lri_ref) > 1) { + CDEBUG(D_MOUNT, "lri reference count %u, repeat: %d\n", + atomic_read(&lri->lri_ref), repeat); + repeat++; + schedule_timeout_interruptible(cfs_time_seconds(1)); + } + lustre_put_lwp_item(lri); +} +EXPORT_SYMBOL(lustre_deregister_lwp_item); + +struct obd_export *lustre_find_lwp_by_index(const char *dev, __u32 idx) +{ + struct lustre_mount_info *lmi; + struct lustre_sb_info *lsi; + struct obd_device *lwp; + struct obd_export *exp = NULL; + char fsname[16]; + char lwp_name[24]; + int rc; + + lmi = server_get_mount(dev); + if (lmi == NULL) + return NULL; + + lsi = s2lsi(lmi->lmi_sb); + rc = server_name2fsname(lsi->lsi_svname, fsname, NULL); + if (rc != 0) { + CERROR("%s: failed to get fsname: rc = %d\n", + lsi->lsi_svname, rc); + goto err_lmi; + } + + snprintf(lwp_name, sizeof(lwp_name), "%s-MDT%04x", fsname, idx); + mutex_lock(&lsi->lsi_lwp_mutex); + list_for_each_entry(lwp, &lsi->lsi_lwp_list, obd_lwp_list) { + char *ptr = strstr(lwp->obd_name, lwp_name); + + if (ptr != NULL && lwp->obd_lwp_export != NULL) { + exp = class_export_get(lwp->obd_lwp_export); + break; + } + } + mutex_unlock(&lsi->lsi_lwp_mutex); + +err_lmi: + server_put_mount(dev, false); + + return exp; +} +EXPORT_SYMBOL(lustre_find_lwp_by_index); + +void lustre_notify_lwp_list(struct obd_export *exp) +{ + struct lwp_register_item *lri; + LASSERT(exp != NULL); + +again: + spin_lock(&lwp_register_list_lock); + list_for_each_entry(lri, &lwp_register_list, lri_list) { + if (strcmp(exp->exp_obd->obd_name, lri->lri_name)) + continue; + if (*lri->lri_exp != NULL) + continue; + *lri->lri_exp = class_export_get(exp); + if (lri->lri_cb_func == NULL) + continue; + atomic_inc(&lri->lri_ref); + spin_unlock(&lwp_register_list_lock); + + lri->lri_cb_func(lri->lri_cb_data); + lustre_put_lwp_item(lri); + + /* Others may have changed the list after we unlock, we have + * to rescan the list from the beginning. Usually, the list + * 'lwp_register_list' is very short, and there is 'guard' + * lri::lri_exp that will prevent the callback to be done + * repeatedly. So rescanning the list has no problem. 
*/ + goto again; + } + spin_unlock(&lwp_register_list_lock); +} +EXPORT_SYMBOL(lustre_notify_lwp_list); + +static int lustre_lwp_connect(struct obd_device *lwp, bool is_mdt) +{ + struct lu_env env; + struct lu_context session_ctx; + struct obd_export *exp; + struct obd_uuid *uuid = NULL; + struct obd_connect_data *data = NULL; + int rc; + ENTRY; + + /* log has been fully processed, let clients connect */ + rc = lu_env_init(&env, lwp->obd_lu_dev->ld_type->ldt_ctx_tags); + if (rc != 0) + RETURN(rc); + + lu_context_init(&session_ctx, LCT_SERVER_SESSION); + session_ctx.lc_thread = NULL; + lu_context_enter(&session_ctx); + env.le_ses = &session_ctx; + + OBD_ALLOC_PTR(data); + if (data == NULL) + GOTO(out, rc = -ENOMEM); + + data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_INDEX; + data->ocd_version = LUSTRE_VERSION_CODE; + data->ocd_connect_flags |= OBD_CONNECT_FID | OBD_CONNECT_AT | + OBD_CONNECT_LRU_RESIZE | OBD_CONNECT_FULL20 | + OBD_CONNECT_LVB_TYPE | OBD_CONNECT_LIGHTWEIGHT | + OBD_CONNECT_LFSCK | OBD_CONNECT_BULK_MBITS; + + if (is_mdt) + data->ocd_connect_flags |= OBD_CONNECT_MDS_MDS; + + OBD_ALLOC_PTR(uuid); + if (uuid == NULL) + GOTO(out, rc = -ENOMEM); + + if (strlen(lwp->obd_name) > sizeof(uuid->uuid)) { + CERROR("%s: Too long lwp name %s, max_size is %d\n", + lwp->obd_name, lwp->obd_name, (int)sizeof(uuid->uuid)); + GOTO(out, rc = -EINVAL); + } + + /* Use lwp name as the uuid, so we find the export by lwp name later */ + memcpy(uuid->uuid, lwp->obd_name, strlen(lwp->obd_name)); + rc = obd_connect(&env, &exp, lwp, uuid, data, NULL); + if (rc != 0) { + CERROR("%s: connect failed: rc = %d\n", lwp->obd_name, rc); + } else { + if (unlikely(lwp->obd_lwp_export != NULL)) + class_export_put(lwp->obd_lwp_export); + lwp->obd_lwp_export = class_export_get(exp); + } + + GOTO(out, rc); + +out: + if (data != NULL) + OBD_FREE_PTR(data); + if (uuid != NULL) + OBD_FREE_PTR(uuid); + + lu_env_fini(&env); + lu_context_exit(&session_ctx); + lu_context_fini(&session_ctx); + + return rc; +} + +/** + * lwp is used by slaves (Non-MDT0 targets) to manage the connection to MDT0, + * or from the OSTx to MDTy. 
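+ *
+ * (Naming sketch, per tgt_name2lwp_name() above: for target
+ * "lustre-OST0001" talking to MDT0000, the lwp device is named
+ * "lustre-MDT0000-lwp-OST0001".)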
+ **/
+static int lustre_lwp_setup(struct lustre_cfg *lcfg, struct lustre_sb_info *lsi,
+			    __u32 idx)
+{
+	struct obd_device *obd;
+	char *lwpname = NULL;
+	char *lwpuuid = NULL;
+	int rc;
+	ENTRY;
+
+	rc = class_add_uuid(lustre_cfg_string(lcfg, 1),
+			    lcfg->lcfg_nid);
+	if (rc != 0) {
+		CERROR("%s: Can't add uuid: rc = %d\n", lsi->lsi_svname, rc);
+		RETURN(rc);
+	}
+
+	OBD_ALLOC(lwpname, MTI_NAME_MAXLEN);
+	if (lwpname == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	rc = tgt_name2lwp_name(lsi->lsi_svname, lwpname, MTI_NAME_MAXLEN, idx);
+	if (rc != 0) {
+		CERROR("%s: failed to generate lwp name: rc = %d\n",
+		       lsi->lsi_svname, rc);
+		GOTO(out, rc);
+	}
+
+	OBD_ALLOC(lwpuuid, MTI_NAME_MAXLEN);
+	if (lwpuuid == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	sprintf(lwpuuid, "%s_UUID", lwpname);
+	rc = lustre_start_simple(lwpname, LUSTRE_LWP_NAME,
+				 lwpuuid, lustre_cfg_string(lcfg, 1),
+				 NULL, NULL, NULL);
+	if (rc) {
+		CERROR("%s: setup failed: rc = %d\n", lwpname, rc);
+		GOTO(out, rc);
+	}
+
+	obd = class_name2obd(lwpname);
+	LASSERT(obd != NULL);
+
+	rc = lustre_lwp_connect(obd, strstr(lsi->lsi_svname, "-MDT") != NULL);
+	if (rc == 0) {
+		obd->u.cli.cl_max_mds_easize = MAX_MD_SIZE;
+		mutex_lock(&lsi->lsi_lwp_mutex);
+		list_add_tail(&obd->obd_lwp_list, &lsi->lsi_lwp_list);
+		mutex_unlock(&lsi->lsi_lwp_mutex);
+	} else {
+		CERROR("%s: connect failed: rc = %d\n", lwpname, rc);
+	}
+
+	GOTO(out, rc);
+
+out:
+	if (lwpname != NULL)
+		OBD_FREE(lwpname, MTI_NAME_MAXLEN);
+	if (lwpuuid != NULL)
+		OBD_FREE(lwpuuid, MTI_NAME_MAXLEN);
+
+	return rc;
+}
+
+/* the caller is responsible for memory free */
+static struct obd_device *lustre_find_lwp(struct lustre_sb_info *lsi,
+					  char **lwpname, __u32 idx)
+{
+	struct obd_device *lwp;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(lwpname != NULL);
+	LASSERT(IS_OST(lsi) || IS_MDT(lsi));
+
+	OBD_ALLOC(*lwpname, MTI_NAME_MAXLEN);
+	if (*lwpname == NULL)
+		RETURN(ERR_PTR(-ENOMEM));
+
+	rc = tgt_name2lwp_name(lsi->lsi_svname, *lwpname, MTI_NAME_MAXLEN, idx);
+	if (rc != 0) {
+		CERROR("%s: failed to generate lwp name: rc = %d\n",
+		       lsi->lsi_svname, rc);
+		GOTO(out, rc = -EINVAL);
+	}
+
+	lwp = class_name2obd(*lwpname);
+
+out:
+	if (rc != 0) {
+		if (*lwpname != NULL) {
+			OBD_FREE(*lwpname, MTI_NAME_MAXLEN);
+			*lwpname = NULL;
+		}
+		lwp = ERR_PTR(rc);
+	}
+
+	RETURN(lwp != NULL ? lwp : ERR_PTR(-ENOENT));
+}
+
+static int lustre_lwp_add_conn(struct lustre_cfg *cfg,
+			       struct lustre_sb_info *lsi, __u32 idx)
+{
+	struct lustre_cfg_bufs *bufs = NULL;
+	struct lustre_cfg *lcfg = NULL;
+	char *lwpname = NULL;
+	struct obd_device *lwp;
+	int rc;
+	ENTRY;
+
+	lwp = lustre_find_lwp(lsi, &lwpname, idx);
+	if (IS_ERR(lwp)) {
+		CERROR("%s: can't find lwp device.\n", lsi->lsi_svname);
+		GOTO(out, rc = PTR_ERR(lwp));
+	}
+	LASSERT(lwpname != NULL);
+
+	OBD_ALLOC_PTR(bufs);
+	if (bufs == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	lustre_cfg_bufs_reset(bufs, lwpname);
+	lustre_cfg_bufs_set_string(bufs, 1,
+				   lustre_cfg_string(cfg, 1));
+
+	OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount, bufs->lcfg_buflen));
+	if (!lcfg)
+		GOTO(out_cfg, rc = -ENOMEM);
+	lustre_cfg_init(lcfg, LCFG_ADD_CONN, bufs);
+
+	rc = class_add_conn(lwp, lcfg);
+	if (rc)
+		CERROR("%s: can't add conn: rc = %d\n", lwpname, rc);
+
+	if (lcfg)
+		OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount,
+					      lcfg->lcfg_buflens));
+out_cfg:
+	if (bufs != NULL)
+		OBD_FREE_PTR(bufs);
+out:
+	if (lwpname != NULL)
+		OBD_FREE(lwpname, MTI_NAME_MAXLEN);
+	RETURN(rc);
+}
+
+/**
+ * Retrieve MDT NIDs from the client log, then start the lwp device.
+ * There are only two scenarios that include the MDT nid.
+ * 1.
+ * marker   5 (flags=0x01, v2.1.54.0) lustre-MDTyyyy  'add mdc' xxx-
+ * add_uuid  nid=192.168.122.162@tcp(0x20000c0a87aa2)  0:  1:192.168.122.162@tcp
+ * attach    0:lustre-MDTyyyy-mdc  1:mdc  2:lustre-clilmv_UUID
+ * setup     0:lustre-MDTyyyy-mdc  1:lustre-MDTyyyy_UUID  2:192.168.122.162@tcp
+ * add_uuid  nid=192.168.172.1@tcp(0x20000c0a8ac01)  0:  1:192.168.172.1@tcp
+ * add_conn  0:lustre-MDTyyyy-mdc  1:192.168.172.1@tcp
+ * modify_mdc_tgts add 0:lustre-clilmv  1:lustre-MDTyyyy_UUID xxxx
+ * marker   5 (flags=0x02, v2.1.54.0) lustre-MDTyyyy  'add mdc' xxxx-
+ * 2.
+ * marker   7 (flags=0x01, v2.1.54.0) lustre-MDTyyyy  'add failnid' xxxx-
+ * add_uuid  nid=192.168.122.2@tcp(0x20000c0a87a02)  0:  1:192.168.122.2@tcp
+ * add_conn  0:lustre-MDTyyyy-mdc  1:192.168.122.2@tcp
+ * marker   7 (flags=0x02, v2.1.54.0) lustre-MDTyyyy  'add failnid' xxxx-
+ **/
+static int client_lwp_config_process(const struct lu_env *env,
+				     struct llog_handle *handle,
+				     struct llog_rec_hdr *rec, void *data)
+{
+	struct config_llog_instance *cfg = data;
+	int cfg_len = rec->lrh_len;
+	char *cfg_buf = (char *) (rec + 1);
+	struct lustre_cfg *lcfg = NULL;
+	struct lustre_sb_info *lsi;
+	int rc = 0, swab = 0;
+	ENTRY;
+
+	if (rec->lrh_type != OBD_CFG_REC) {
+		CERROR("Unknown llog record type %#x encountered\n",
+		       rec->lrh_type);
+		RETURN(-EINVAL);
+	}
+
+	if (cfg->cfg_sb == NULL)
+		GOTO(out, rc = -EINVAL);
+	lsi = s2lsi(cfg->cfg_sb);
+
+	lcfg = (struct lustre_cfg *)cfg_buf;
+	if (lcfg->lcfg_version == __swab32(LUSTRE_CFG_VERSION)) {
+		lustre_swab_lustre_cfg(lcfg);
+		swab = 1;
+	}
+
+	rc = lustre_cfg_sanity_check(cfg_buf, cfg_len);
+	if (rc)
+		GOTO(out, rc);
+
+	switch (lcfg->lcfg_command) {
+	case LCFG_MARKER: {
+		struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1);
+
+		lustre_swab_cfg_marker(marker, swab,
+				       LUSTRE_CFG_BUFLEN(lcfg, 1));
+		if (marker->cm_flags & CM_SKIP ||
+		    marker->cm_flags & CM_EXCLUDE)
+			GOTO(out, rc = 0);
+
+		if (!tgt_is_mdt(marker->cm_tgtname, &cfg->cfg_lwp_idx))
+			GOTO(out, rc = 0);
+
+		if (IS_MDT(lsi) && cfg->cfg_lwp_idx != 0)
+			GOTO(out, rc = 0);
+
+		if (!strncmp(marker->cm_comment, "add mdc", 7) ||
+		    !strncmp(marker->cm_comment, "add failnid", 11)) {
+			if (marker->cm_flags & CM_START) {
+				cfg->cfg_flags = CFG_F_MARKER;
+				/* This hack differentiates whether the
+				 * ADD_UUID comes from an "add mdc" record
+				 * or from an "add failnid" record. */
+				if (!strncmp(marker->cm_comment,
+					     "add failnid", 11))
+					cfg->cfg_flags |= CFG_F_SKIP;
+			} else if (marker->cm_flags & CM_END) {
+				cfg->cfg_flags = 0;
+			}
+		}
+		break;
+	}
+	case LCFG_ADD_UUID: {
+		if (cfg->cfg_flags == CFG_F_MARKER) {
+			rc = lustre_lwp_setup(lcfg, lsi, cfg->cfg_lwp_idx);
+			/* XXX: process only the first nid, as
+			 * we don't need another instance of lwp */
+			cfg->cfg_flags |= CFG_F_SKIP;
+		} else if (cfg->cfg_flags == (CFG_F_MARKER | CFG_F_SKIP)) {
+			rc = class_add_uuid(lustre_cfg_string(lcfg, 1),
+					    lcfg->lcfg_nid);
+			if (rc)
+				CERROR("%s: failed to add uuid: rc = %d\n",
+				       lsi->lsi_svname, rc);
+		}
+		break;
+	}
+	case LCFG_ADD_CONN: {
+		char *devname = lustre_cfg_string(lcfg, 0);
+		char *ptr;
+		__u32 idx = 0;
+
+		if (!is_mdc_device(devname))
+			break;
+
+		if (!(cfg->cfg_flags & CFG_F_MARKER)) {
+			CDEBUG(D_CONFIG, "Skipping add_conn for %s, rec %d\n",
+			       devname, rec->lrh_index);
+			break;
+		}
+
+		/* An add_conn record must be preceded by an add_uuid record;
+		 * this guarantees that the LWP device has been created.
+		 */
+		if (!(cfg->cfg_flags & CFG_F_SKIP)) {
+			CWARN("Error in config log for %s, rec %d: add_conn is not preceded by add_uuid\n",
+			      devname, rec->lrh_index);
+			break;
+		}
+		ptr = strrchr(devname, '-');
+		if (ptr == NULL)
+			break;
+
+		*ptr = 0;
+		if (!tgt_is_mdt(devname, &idx)) {
+			*ptr = '-';
+			break;
+		}
+		*ptr = '-';
+
+		if (IS_MDT(lsi) && idx != 0)
+			break;
+
+		rc = lustre_lwp_add_conn(lcfg, lsi, idx);
+		break;
+	}
+	default:
+		break;
+	}
+out:
+	RETURN(rc);
+}
+
+static int lustre_disconnect_lwp(struct super_block *sb)
+{
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	struct obd_device *lwp;
+	char *logname = NULL;
+	struct lustre_cfg_bufs *bufs = NULL;
+	struct config_llog_instance *cfg = NULL;
+	int rc = 0;
+	int rc1 = 0;
+	ENTRY;
+
+	if (likely(lsi->lsi_lwp_started)) {
+		OBD_ALLOC(logname, MTI_NAME_MAXLEN);
+		if (logname == NULL)
+			RETURN(-ENOMEM);
+
+		rc = server_name2fsname(lsi->lsi_svname, logname, NULL);
+		if (rc != 0) {
+			CERROR("%s: failed to get fsname from svname: "
+			       "rc = %d\n", lsi->lsi_svname, rc);
+			GOTO(out, rc = -EINVAL);
+		}
+
+		strcat(logname, "-client");
+		OBD_ALLOC_PTR(cfg);
+		if (cfg == NULL)
+			GOTO(out, rc = -ENOMEM);
+
+		/* end the log first */
+		cfg->cfg_instance = ll_get_cfg_instance(sb);
+		rc = lustre_end_log(sb, logname, cfg);
+		if (rc != 0 && rc != -ENOENT)
+			GOTO(out, rc);
+
+		lsi->lsi_lwp_started = 0;
+	}
+
+	OBD_ALLOC_PTR(bufs);
+	if (bufs == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	mutex_lock(&lsi->lsi_lwp_mutex);
+	list_for_each_entry(lwp, &lsi->lsi_lwp_list, obd_lwp_list) {
+		struct lustre_cfg *lcfg;
+
+		if (likely(lwp->obd_lwp_export != NULL)) {
+			class_export_put(lwp->obd_lwp_export);
+			lwp->obd_lwp_export = NULL;
+		}
+
+		lustre_cfg_bufs_reset(bufs, lwp->obd_name);
+		lustre_cfg_bufs_set_string(bufs, 1, NULL);
+		OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount,
+					       bufs->lcfg_buflen));
+		if (!lcfg) {
+			rc = -ENOMEM;
+			break;
+		}
+		lustre_cfg_init(lcfg, LCFG_CLEANUP, bufs);
+
+		/* Disconnect the import first. NULL is passed for the '@env',
+		 * since it will not be used. */
+		rc = lwp->obd_lu_dev->ld_ops->ldo_process_config(NULL,
+						lwp->obd_lu_dev, lcfg);
+		OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount,
+					      lcfg->lcfg_buflens));
+		if (rc != 0 && rc != -ETIMEDOUT) {
+			CERROR("%s: failed to disconnect LWP: rc = %d\n",
+			       lwp->obd_name, rc);
+			rc1 = rc;
+		}
+	}
+	mutex_unlock(&lsi->lsi_lwp_mutex);
+
+	GOTO(out, rc);
+
+out:
+	if (bufs != NULL)
+		OBD_FREE_PTR(bufs);
+	if (cfg != NULL)
+		OBD_FREE_PTR(cfg);
+	if (logname != NULL)
+		OBD_FREE(logname, MTI_NAME_MAXLEN);
+
+	return rc1 != 0 ? rc1 : rc;
+}
+
+/**
+ * Stop the lwp for an OST/MDT target.
+ **/
+static int lustre_stop_lwp(struct super_block *sb)
+{
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	struct obd_device *lwp;
+	int rc = 0;
+	int rc1 = 0;
+	ENTRY;
+
+	mutex_lock(&lsi->lsi_lwp_mutex);
+	while (!list_empty(&lsi->lsi_lwp_list)) {
+		lwp = list_entry(lsi->lsi_lwp_list.next, struct obd_device,
+				 obd_lwp_list);
+		list_del_init(&lwp->obd_lwp_list);
+		lwp->obd_force = 1;
+		mutex_unlock(&lsi->lsi_lwp_mutex);
+
+		rc = class_manual_cleanup(lwp);
+		if (rc != 0) {
+			CERROR("%s: failed to stop LWP: rc = %d\n",
+			       lwp->obd_name, rc);
+			rc1 = rc;
+		}
+		mutex_lock(&lsi->lsi_lwp_mutex);
+	}
+	mutex_unlock(&lsi->lsi_lwp_mutex);
+
+	RETURN(rc1 != 0 ? rc1 : rc);
+}
+
+/**
+ * Start the LWP devices (fsname-MDTyyyy-lwp-{MDT,OST}xxxx) for an MDT or OST
+ * target.
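+ *
+ * A minimal usage sketch (this mirrors the call made from
+ * server_start_targets() below; further error handling elided):
+ *
+ *	rc = lustre_start_lwp(sb);
+ *	if (rc != 0)
+ *		CERROR("%s: failed to start LWP: %d\n",
+ *		       s2lsi(sb)->lsi_svname, rc);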
+ **/
+static int lustre_start_lwp(struct super_block *sb)
+{
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	struct config_llog_instance *cfg = NULL;
+	char *logname;
+	int rc;
+	ENTRY;
+
+	if (unlikely(lsi->lsi_lwp_started))
+		RETURN(0);
+
+	OBD_ALLOC(logname, MTI_NAME_MAXLEN);
+	if (logname == NULL)
+		RETURN(-ENOMEM);
+
+	rc = server_name2fsname(lsi->lsi_svname, logname, NULL);
+	if (rc != 0) {
+		CERROR("%s: failed to get fsname from svname: rc = %d\n",
+		       lsi->lsi_svname, rc);
+		GOTO(out, rc = -EINVAL);
+	}
+
+	strcat(logname, "-client");
+	OBD_ALLOC_PTR(cfg);
+	if (cfg == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	cfg->cfg_callback = client_lwp_config_process;
+	cfg->cfg_instance = ll_get_cfg_instance(sb);
+	rc = lustre_process_log(sb, logname, cfg);
+	/* need to remove the config llog from the mgc */
+	lsi->lsi_lwp_started = 1;
+
+	GOTO(out, rc);
+
+out:
+	OBD_FREE(logname, MTI_NAME_MAXLEN);
+	if (cfg != NULL)
+		OBD_FREE_PTR(cfg);
+
+	return rc;
+}
+
+static DEFINE_MUTEX(server_start_lock);
+
+/* Stop MDS/OSS if nobody is using them */
+static int server_stop_servers(int lsiflags)
+{
+	struct obd_device *obd = NULL;
+	struct obd_type *type = NULL;
+	int rc = 0;
+	bool type_last;
+	ENTRY;
+
+	mutex_lock(&server_start_lock);
+
+	/* Either an MDT or an OST or neither */
+	/* if this was an MDT, and there are no more MDTs, clean up the MDS */
+	if (lsiflags & LDD_F_SV_TYPE_MDT) {
+		obd = class_name2obd(LUSTRE_MDS_OBDNAME);
+		type = class_search_type(LUSTRE_MDT_NAME);
+	} else if (lsiflags & LDD_F_SV_TYPE_OST) {
+		/* if this was an OST, and there are no more OSTs, clean up the OSS */
+		obd = class_name2obd(LUSTRE_OSS_OBDNAME);
+		type = class_search_type(LUSTRE_OST_NAME);
+	}
+
+	/* server_stop_servers() is the counterpart of server_start_targets();
+	 * here we put the type reference that was taken in
+	 * server_start_targets(). If type is NULL, the logic around the type
+	 * reference is wrong somewhere. */
+	LASSERTF(type, "Server flags %d, obd %s\n", lsiflags,
+		 obd ? obd->obd_name : "NULL");
+
+	type_last = (atomic_read(&type->typ_refcnt) == 1);
+
+	class_put_type(type);
+	if (obd != NULL && type_last) {
+		obd->obd_force = 1;
+		/* obd_fail doesn't mean much on a server obd */
+		rc = class_manual_cleanup(obd);
+	}
+
+	/* put the reference taken by class_search_type */
+	kobject_put(&type->typ_kobj);
+
+	mutex_unlock(&server_start_lock);
+
+	RETURN(rc);
+}
+
+int server_mti_print(const char *title, struct mgs_target_info *mti)
+{
+	PRINT_CMD(PRINT_MASK, "mti %s\n", title);
+	PRINT_CMD(PRINT_MASK, "server: %s\n", mti->mti_svname);
+	PRINT_CMD(PRINT_MASK, "fs: %s\n", mti->mti_fsname);
+	PRINT_CMD(PRINT_MASK, "uuid: %s\n", mti->mti_uuid);
+	PRINT_CMD(PRINT_MASK, "ver: %d flags: %#x\n",
+		  mti->mti_config_ver, mti->mti_flags);
+	return 0;
+}
+
+/* Generate data for registration */
+static int server_lsi2mti(struct lustre_sb_info *lsi,
+			  struct mgs_target_info *mti)
+{
+	struct lnet_processid id;
+	int rc, i = 0;
+	int cplen = 0;
+	ENTRY;
+
+	if (!IS_SERVER(lsi))
+		RETURN(-EINVAL);
+
+	if (strlcpy(mti->mti_svname, lsi->lsi_svname, sizeof(mti->mti_svname))
+	    >= sizeof(mti->mti_svname))
+		RETURN(-E2BIG);
+
+	mti->mti_nid_count = 0;
+	while (LNetGetId(i++, &id) != -ENOENT) {
+		if (nid_is_lo0(&id.nid))
+			continue;
+
+		/* if the server uses the --servicenode param, only allow the
+		 * specified nids to be registered */
+		if ((lsi->lsi_lmd->lmd_flags & LMD_FLG_NO_PRIMNODE) != 0 &&
+		    class_match_nid(lsi->lsi_lmd->lmd_params,
+				    PARAM_FAILNODE,
+				    lnet_nid_to_nid4(&id.nid)) < 1)
+			continue;
+
+		/* match the specified network */
+		if (!class_match_net(lsi->lsi_lmd->lmd_params,
+				     PARAM_NETWORK, LNET_NID_NET(&id.nid)))
+			continue;
+
+		mti->mti_nids[mti->mti_nid_count] = lnet_nid_to_nid4(&id.nid);
+		mti->mti_nid_count++;
+		if (mti->mti_nid_count >= MTI_NIDS_MAX) {
+			CWARN("Only using first %d nids for %s\n",
+			      mti->mti_nid_count, mti->mti_svname);
+			break;
+		}
+	}
+
+	if (mti->mti_nid_count == 0) {
+		CERROR("Failed to get NID for server %s, please check whether "
+		       "the target was specified with improper --servicenode "
+		       "or --network options.\n", mti->mti_svname);
+		RETURN(-EINVAL);
+	}
+
+	mti->mti_lustre_ver = LUSTRE_VERSION_CODE;
+	mti->mti_config_ver = 0;
+
+	rc = server_name2fsname(lsi->lsi_svname, mti->mti_fsname, NULL);
+	if (rc != 0)
+		return rc;
+
+	rc = server_name2index(lsi->lsi_svname, &mti->mti_stripe_index, NULL);
+	if (rc < 0)
+		return rc;
+	/* Orion requires the index to be set */
+	LASSERT(!(rc & LDD_F_NEED_INDEX));
+	/* keep only the LDD flags */
+	mti->mti_flags = lsi->lsi_flags & LDD_F_MASK;
+	if (mti->mti_flags & (LDD_F_WRITECONF | LDD_F_VIRGIN))
+		mti->mti_flags |= LDD_F_UPDATE;
+	cplen = strlcpy(mti->mti_params, lsi->lsi_lmd->lmd_params,
+			sizeof(mti->mti_params));
+	if (cplen >= sizeof(mti->mti_params))
+		return -E2BIG;
+	return 0;
+}
+
+/* Register an old or new target with the MGS. If needed, the MGS will
+ * construct the startup logs and assign an index. */
+static int server_register_target(struct lustre_sb_info *lsi)
+{
+	struct obd_device *mgc = lsi->lsi_mgc;
+	struct mgs_target_info *mti = NULL;
+	bool must_succeed;
+	int rc;
+	int tried = 0;
+	ENTRY;
+
+	LASSERT(mgc);
+
+	if (!IS_SERVER(lsi))
+		RETURN(-EINVAL);
+
+	OBD_ALLOC_PTR(mti);
+	if (!mti)
+		RETURN(-ENOMEM);
+
+	rc = server_lsi2mti(lsi, mti);
+	if (rc)
+		GOTO(out, rc);
+
+	CDEBUG(D_MOUNT, "Registration %s, fs=%s, %s, index=%04x, flags=%#x\n",
+	       mti->mti_svname, mti->mti_fsname,
+	       libcfs_nid2str(mti->mti_nids[0]), mti->mti_stripe_index,
+	       mti->mti_flags);
+
+	/* We cannot ignore a registration failure if the MGS logs must be
+	 * updated.
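+	 * That is the case on a first registration (LDD_F_VIRGIN), when a new
+	 * index is needed (LDD_F_NEED_INDEX), or when the configuration logs
+	 * are being rewritten (LDD_F_UPDATE, LDD_F_WRITECONF); this is exactly
+	 * what the must_succeed computation below encodes.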
*/ + must_succeed = !!(lsi->lsi_flags & + (LDD_F_NEED_INDEX | LDD_F_UPDATE | LDD_F_WRITECONF | + LDD_F_VIRGIN)); + mti->mti_flags |= LDD_F_OPC_REG; + +again: + /* Register the target */ + /* FIXME use mgc_process_config instead */ + rc = obd_set_info_async(NULL, mgc->u.cli.cl_mgc_mgsexp, + sizeof(KEY_REGISTER_TARGET), + KEY_REGISTER_TARGET, + sizeof(*mti), mti, NULL); + if (rc) { + if (mti->mti_flags & LDD_F_ERROR) { + LCONSOLE_ERROR_MSG(0x160, + "%s: the MGS refuses to allow this server " + "to start: rc = %d. Please see messages on " + "the MGS.\n", lsi->lsi_svname, rc); + } else if (must_succeed) { + if ((rc == -ESHUTDOWN || rc == -EIO) && ++tried < 5) { + /* The connection with MGS is not established. + * Try again after 2 seconds. Interruptable. */ + schedule_timeout_interruptible( + cfs_time_seconds(2)); + if (!signal_pending(current)) + goto again; + } + + LCONSOLE_ERROR_MSG(0x15f, + "%s: cannot register this server with the MGS: " + "rc = %d. Is the MGS running?\n", + lsi->lsi_svname, rc); + } else { + CDEBUG(D_HA, "%s: error registering with the MGS: " + "rc = %d (not fatal)\n", lsi->lsi_svname, rc); + /* reset the error code for non-fatal error. */ + rc = 0; + } + GOTO(out, rc); + } + +out: + if (mti) + OBD_FREE_PTR(mti); + RETURN(rc); +} + +/** + * Notify the MGS that this target is ready. + * Used by IR - if the MGS receives this message, it will notify clients. + */ +static int server_notify_target(struct super_block *sb, struct obd_device *obd) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *mgc = lsi->lsi_mgc; + struct mgs_target_info *mti = NULL; + int rc; + ENTRY; + + LASSERT(mgc); + + if (!(IS_SERVER(lsi))) + RETURN(-EINVAL); + + OBD_ALLOC_PTR(mti); + if (!mti) + RETURN(-ENOMEM); + rc = server_lsi2mti(lsi, mti); + if (rc) + GOTO(out, rc); + + mti->mti_instance = obd->u.obt.obt_instance; + mti->mti_flags |= LDD_F_OPC_READY; + + /* FIXME use mgc_process_config instead */ + rc = obd_set_info_async(NULL, mgc->u.cli.cl_mgc_mgsexp, + sizeof(KEY_REGISTER_TARGET), + KEY_REGISTER_TARGET, + sizeof(*mti), mti, NULL); + + /* Imperative recovery: if the mgs informs us to use IR? */ + if (!rc && !(mti->mti_flags & LDD_F_ERROR) && + (mti->mti_flags & LDD_F_IR_CAPABLE)) + lsi->lsi_flags |= LDD_F_IR_CAPABLE; + +out: + if (mti) + OBD_FREE_PTR(mti); + RETURN(rc); + +} + +/** Start server targets: MDTs and OSTs + */ +static int server_start_targets(struct super_block *sb) +{ + struct obd_device *obd; + struct lustre_sb_info *lsi = s2lsi(sb); + struct config_llog_instance cfg; + struct lu_env mgc_env; + struct lu_device *dev; + char *name_service, *obd_name_service = NULL; + struct obd_type *type = NULL; + int rc; + ENTRY; + + CDEBUG(D_MOUNT, "starting target %s\n", lsi->lsi_svname); + + LASSERTF(IS_MDT(lsi) || IS_OST(lsi), "designed for MDT or OST only\n"); + + if (IS_MDT(lsi)) { + obd_name_service = LUSTRE_MDS_OBDNAME; + name_service = LUSTRE_MDS_NAME; + } else { + obd_name_service = LUSTRE_OSS_OBDNAME; + name_service = LUSTRE_OSS_NAME; + } + + /* make sure MDS/OSS is started */ + mutex_lock(&server_start_lock); + obd = class_name2obd(obd_name_service); + if (!obd) { + rc = lustre_start_simple(obd_name_service, name_service, + (IS_MDT(lsi) ? + LUSTRE_MDS_OBDNAME"_uuid" : + LUSTRE_OSS_OBDNAME"_uuid"), + NULL, NULL, NULL, NULL); + if (rc) { + mutex_unlock(&server_start_lock); + CERROR("failed to start %s: %d\n", + obd_name_service, rc); + RETURN(rc); + } + } + /* hold a type reference and put it at server_stop_servers */ + type = class_get_type(IS_MDT(lsi) ? 
+			       LUSTRE_MDT_NAME : LUSTRE_OST_NAME);
+	if (!type) {
+		mutex_unlock(&server_start_lock);
+		GOTO(out_stop_service, rc = -ENODEV);
+	}
+	lsi->lsi_server_started = 1;
+	mutex_unlock(&server_start_lock);
+	if (OBD_FAIL_PRECHECK(OBD_FAIL_OBD_STOP_MDS_RACE) &&
+	    IS_MDT(lsi)) {
+		OBD_RACE(OBD_FAIL_OBD_STOP_MDS_RACE);
+		msleep(2 * MSEC_PER_SEC);
+	}
+
+	rc = lu_env_init(&mgc_env, LCT_MG_THREAD);
+	if (rc != 0)
+		GOTO(out_stop_service, rc);
+
+	/* Set the mgc fs to our server disk. This allows the MGC to
+	 * read and write configs locally, in case it can't talk to the MGS. */
+	rc = server_mgc_set_fs(&mgc_env, lsi->lsi_mgc, sb);
+	if (rc)
+		GOTO(out_env, rc);
+
+	/* Register with MGS */
+	rc = server_register_target(lsi);
+	if (rc)
+		GOTO(out_mgc, rc);
+
+	/* Let the target look up the mount using the target's name
+	   (we can't pass the sb or mnt through class_process_config.) */
+	rc = server_register_mount(lsi->lsi_svname, sb);
+	if (rc)
+		GOTO(out_mgc, rc);
+
+	/* Start targets using the llog named for the target */
+	memset(&cfg, 0, sizeof(cfg));
+	cfg.cfg_callback = class_config_llog_handler;
+	cfg.cfg_sub_clds = CONFIG_SUB_SERVER;
+	rc = lustre_process_log(sb, lsi->lsi_svname, &cfg);
+	if (rc) {
+		CERROR("failed to start server %s: %d\n",
+		       lsi->lsi_svname, rc);
+		/* Do NOT call server_deregister_mount() here. That would make
+		 * it impossible to find the mount later at cleanup time and
+		 * would leave @lsi and other state leaked. -umka */
+		GOTO(out_mgc, rc);
+	}
+
+	obd = class_name2obd(lsi->lsi_svname);
+	if (!obd) {
+		CERROR("no server named %s was started\n", lsi->lsi_svname);
+		GOTO(out_mgc, rc = -ENXIO);
+	}
+
+	if (IS_OST(lsi) || IS_MDT(lsi)) {
+		rc = lustre_start_lwp(sb);
+		if (rc) {
+			CERROR("%s: failed to start LWP: %d\n",
+			       lsi->lsi_svname, rc);
+			GOTO(out_mgc, rc);
+		}
+	}
+
+	server_notify_target(sb, obd);
+
+	/* calculate the recovery timeout; do it after lustre_process_log */
+	server_calc_timeout(lsi, obd);
+
+	/* log has been fully processed, let clients connect */
+	dev = obd->obd_lu_dev;
+	if (dev && dev->ld_ops->ldo_prepare) {
+		struct lu_env env;
+
+		rc = lu_env_init(&env, dev->ld_type->ldt_ctx_tags);
+		if (rc == 0) {
+			struct lu_context session_ctx;
+
+			lu_context_init(&session_ctx, LCT_SERVER_SESSION);
+			session_ctx.lc_thread = NULL;
+			lu_context_enter(&session_ctx);
+			env.le_ses = &session_ctx;
+
+			rc = dev->ld_ops->ldo_prepare(&env, NULL, dev);
+
+			lu_env_fini(&env);
+			lu_context_exit(&session_ctx);
+			lu_context_fini(&session_ctx);
+		}
+	}
+
+	/* abort recovery only on the complete stack:
+	 * many devices can be involved */
+	if ((lsi->lsi_lmd->lmd_flags &
+	     (LMD_FLG_ABORT_RECOV | LMD_FLG_ABORT_RECOV_MDT)) &&
+	    (OBP(obd, iocontrol))) {
+		struct obd_ioctl_data karg = {
+			.ioc_type = lsi->lsi_lmd->lmd_flags,
+		};
+
+		obd_iocontrol(OBD_IOC_ABORT_RECOVERY, obd->obd_self_export, 0,
+			      &karg, NULL);
+	}
+
+out_mgc:
+	/* Release the mgc fs for others to use */
+	server_mgc_clear_fs(&mgc_env, lsi->lsi_mgc);
+out_env:
+	lu_env_fini(&mgc_env);
+out_stop_service:
+	/* on error, the caller cleans up via
+	 * server_put_super() -> server_stop_servers()
+	 */
+
+	RETURN(rc);
+}
+
+static int lsi_prepare(struct lustre_sb_info *lsi)
+{
+	const char *osd_type;
+	const char *fstype;
+	__u32 index;
+	int rc;
+	ENTRY;
+
+	LASSERT(lsi);
+	LASSERT(lsi->lsi_lmd);
+
+	/* The server name is given as a mount line option */
+	if (lsi->lsi_lmd->lmd_profile == NULL) {
+		LCONSOLE_ERROR("Can't determine server name\n");
+		RETURN(-EINVAL);
+	}
+
+	/* Determine the osd type */
+	if (lsi->lsi_lmd->lmd_osd_type == NULL) {
osd_type = LUSTRE_OSD_LDISKFS_NAME; + fstype = "ldiskfs"; + } else { + osd_type = lsi->lsi_lmd->lmd_osd_type; + fstype = lsi->lsi_lmd->lmd_osd_type; + } + + if (strlen(lsi->lsi_lmd->lmd_profile) >= sizeof(lsi->lsi_svname) || + strlen(osd_type) >= sizeof(lsi->lsi_osd_type) || + strlen(fstype) >= sizeof(lsi->lsi_fstype)) + RETURN(-ENAMETOOLONG); + + strlcpy(lsi->lsi_svname, lsi->lsi_lmd->lmd_profile, + sizeof(lsi->lsi_svname)); + strlcpy(lsi->lsi_osd_type, osd_type, sizeof(lsi->lsi_osd_type)); + /* XXX: a temp. solution for components using ldiskfs + * to be removed in one of the subsequent patches */ + strlcpy(lsi->lsi_fstype, fstype, sizeof(lsi->lsi_fstype)); + + /* Determine server type */ + rc = server_name2index(lsi->lsi_svname, &index, NULL); + if (rc < 0) { + if (lsi->lsi_lmd->lmd_flags & LMD_FLG_MGS) { + /* Assume we're a bare MGS */ + rc = 0; + lsi->lsi_lmd->lmd_flags |= LMD_FLG_NOSVC; + } else { + LCONSOLE_ERROR("Can't determine server type of '%s'\n", + lsi->lsi_svname); + RETURN(rc); + } + } + lsi->lsi_flags |= rc; + + /* Add mount line flags that used to be in ldd: + * writeconf, mgs, anything else? + */ + lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_WRITECONF) ? + LDD_F_WRITECONF : 0; + lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_NO_LOCAL_LOGS) ? + LDD_F_NO_LOCAL_LOGS : 0; + lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_VIRGIN) ? + LDD_F_VIRGIN : 0; + lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_UPDATE) ? + LDD_F_UPDATE : 0; + lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_MGS) ? + LDD_F_SV_TYPE_MGS : 0; + lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_NO_PRIMNODE) ? + LDD_F_NO_PRIMNODE : 0; + + RETURN(0); +} + +/*************** server mount ******************/ + +/** Start the shutdown of servers at umount. + */ +static void server_put_super(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *obd; + char *tmpname, *extraname = NULL; + int tmpname_sz; + int lsiflags = lsi->lsi_flags; + bool stop_servers = lsi->lsi_server_started; + ENTRY; + + LASSERT(IS_SERVER(lsi)); + + tmpname_sz = strlen(lsi->lsi_svname) + 1; + OBD_ALLOC(tmpname, tmpname_sz); + memcpy(tmpname, lsi->lsi_svname, tmpname_sz); + CDEBUG(D_MOUNT, "server put_super %s\n", tmpname); + if (IS_MDT(lsi) && (lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC)) + snprintf(tmpname, tmpname_sz, "MGS"); + + /* disconnect the lwp first to drain off the inflight request */ + if (IS_OST(lsi) || IS_MDT(lsi)) { + int rc; + + rc = lustre_disconnect_lwp(sb); + if (rc != 0 && rc != -ETIMEDOUT && + rc != -ENOTCONN && rc != -ESHUTDOWN) + CWARN("%s: failed to disconnect lwp: rc= %d\n", + tmpname, rc); + } + + /* Stop the target */ + if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) && + (IS_MDT(lsi) || IS_OST(lsi))) { + struct lustre_profile *lprof = NULL; + + /* tell the mgc to drop the config log */ + lustre_end_log(sb, lsi->lsi_svname, NULL); + + /* COMPAT_146 - profile may get deleted in mgc_cleanup. + If there are any setup/cleanup errors, save the lov + name for safety cleanup later. 
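+		 * The name is duplicated into 'extraname' below and, if that
+		 * obd still exists once the target has been stopped, it is
+		 * force-cleaned at the end of this function.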
*/ + lprof = class_get_profile(lsi->lsi_svname); + if (lprof != NULL) { + if (lprof->lp_dt != NULL) { + OBD_ALLOC(extraname, strlen(lprof->lp_dt) + 1); + strncpy(extraname, lprof->lp_dt, + strlen(lprof->lp_dt) + 1); + } + class_put_profile(lprof); + } + + obd = class_name2obd(lsi->lsi_svname); + if (obd) { + CDEBUG(D_MOUNT, "stopping %s\n", obd->obd_name); + if (lsiflags & LSI_UMOUNT_FAILOVER) + obd->obd_fail = 1; + /* We can't seem to give an error return code + * to .put_super, so we better make sure we clean up! */ + obd->obd_force = 1; + class_manual_cleanup(obd); + if (OBD_FAIL_PRECHECK(OBD_FAIL_OBD_STOP_MDS_RACE)) { + int idx; + server_name2index(lsi->lsi_svname, &idx, NULL); + /* sleeping for MDT0001 */ + if (idx == 1) + OBD_RACE(OBD_FAIL_OBD_STOP_MDS_RACE); + } + } else { + CERROR("no obd %s\n", lsi->lsi_svname); + server_deregister_mount(lsi->lsi_svname); + } + } + + /* If they wanted the mgs to stop separately from the mdt, they + should have put it on a different device. */ + if (IS_MGS(lsi)) { + /* if MDS start with --nomgs, don't stop MGS then */ + if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOMGS)) + server_stop_mgs(sb); + } + + if (IS_OST(lsi) || IS_MDT(lsi)) { + if (lustre_stop_lwp(sb) < 0) + CERROR("%s: failed to stop lwp!\n", tmpname); + } + + /* Clean the mgc and sb */ + lustre_common_put_super(sb); + + /* wait till all in-progress cleanups are done + * specifically we're interested in ofd cleanup + * as it pins OSS */ + obd_zombie_barrier(); + + /* Stop the servers (MDS, OSS) if no longer needed. We must wait + until the target is really gone so that our type refcount check + is right. */ + if (stop_servers) + server_stop_servers(lsiflags); + + /* In case of startup or cleanup err, stop related obds */ + if (extraname) { + obd = class_name2obd(extraname); + if (obd) { + CWARN("Cleaning orphaned obd %s\n", extraname); + obd->obd_force = 1; + class_manual_cleanup(obd); + } + OBD_FREE(extraname, strlen(extraname) + 1); + } + + LCONSOLE_WARN("server umount %s complete\n", tmpname); + OBD_FREE(tmpname, tmpname_sz); + EXIT; +} + +/** Called only for 'umount -f' + */ +static void server_umount_begin(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + ENTRY; + + CDEBUG(D_MOUNT, "umount -f\n"); + /* umount = failover + umount -f = force + no third way to do non-force, non-failover */ + lsi->lsi_flags &= ~LSI_UMOUNT_FAILOVER; + EXIT; +} + +static int server_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_statfs statfs; + int rc; + ENTRY; + + if (lsi->lsi_dt_dev) { + rc = dt_statfs(NULL, lsi->lsi_dt_dev, &statfs); + if (rc == 0) { + statfs_unpack(buf, &statfs); + buf->f_type = sb->s_magic; + RETURN(0); + } + } + + /* just return 0 */ + buf->f_type = sb->s_magic; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = 1; + buf->f_bfree = 0; + buf->f_bavail = 0; + buf->f_files = 1; + buf->f_ffree = 0; + buf->f_namelen = NAME_MAX; + RETURN(0); +} + +int server_show_options(struct seq_file *seq, struct dentry *dentry) +{ + struct lustre_sb_info *lsi; + struct lustre_mount_data *lmd; + + LASSERT(seq != NULL && dentry != NULL); + lsi = s2lsi(dentry->d_sb); + lmd = lsi->lsi_lmd; + seq_printf(seq, ",svname=%s", lmd->lmd_profile); + + if (lmd->lmd_flags & LMD_FLG_ABORT_RECOV) + seq_puts(seq, ",abort_recov"); + + if (lmd->lmd_flags & LMD_FLG_NOIR) + seq_puts(seq, ",noir"); + + if (lmd->lmd_flags & LMD_FLG_NOSVC) + seq_puts(seq, ",nosvc"); + + if (lmd->lmd_flags & LMD_FLG_NOMGS) + 
seq_puts(seq, ",nomgs"); + + if (lmd->lmd_flags & LMD_FLG_NOSCRUB) + seq_puts(seq, ",noscrub"); + if (lmd->lmd_flags & LMD_FLG_SKIP_LFSCK) + seq_puts(seq, ",skip_lfsck"); + + if (lmd->lmd_flags & LMD_FLG_DEV_RDONLY) + seq_puts(seq, ",rdonly_dev"); + + if (lmd->lmd_flags & LMD_FLG_MGS) + seq_puts(seq, ",mgs"); + + if (lmd->lmd_mgs != NULL) + seq_printf(seq, ",mgsnode=%s", lmd->lmd_mgs); + + if (lmd->lmd_osd_type != NULL) + seq_printf(seq, ",osd=%s", lmd->lmd_osd_type); + + if (lmd->lmd_opts != NULL) { + seq_putc(seq, ','); + seq_puts(seq, lmd->lmd_opts); + } + + RETURN(0); +} + +/** The operations we support directly on the superblock: + * mount, umount, and df. + */ +static const struct super_operations server_ops = { + .put_super = server_put_super, + .umount_begin = server_umount_begin, /* umount -f */ + .statfs = server_statfs, + .show_options = server_show_options, +}; + +/* + * Xattr support for Lustre servers + */ +#ifdef HAVE_IOP_XATTR +static ssize_t lustre_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size) +{ + if (!selinux_is_enabled()) + return -EOPNOTSUPP; + return -ENODATA; +} + +static int lustre_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + return -EOPNOTSUPP; +} +#endif + +static ssize_t lustre_listxattr(struct dentry *d_entry, char *name, + size_t size) +{ + return -EOPNOTSUPP; +} + +static bool is_cmd_supported(unsigned int command) +{ + switch (command) { + case FITRIM: + return true; + default: + return false; + } + + return false; +} + +static long server_ioctl(struct file *filp, unsigned int command, + unsigned long arg) +{ + struct file active_filp; + struct inode *inode = file_inode(filp); + struct lustre_sb_info *lsi = s2lsi(inode->i_sb); + struct super_block *dd_sb = dt_mnt_sb_get(lsi->lsi_dt_dev); + struct inode *active_inode; + int err = -EOPNOTSUPP; + + if (IS_ERR(dd_sb) || !is_cmd_supported(command)) + return err; + + active_inode = igrab(dd_sb->s_root->d_inode); + if (!active_inode) + return -EACCES; + + active_filp.f_inode = active_inode; + if (active_inode->i_fop && active_inode->i_fop->unlocked_ioctl) + err = active_inode->i_fop->unlocked_ioctl(&active_filp, + command, arg); + iput(active_inode); + return err; +} + +static const struct inode_operations server_inode_operations = { +#ifdef HAVE_IOP_XATTR + .setxattr = lustre_setxattr, + .getxattr = lustre_getxattr, +#endif + .listxattr = lustre_listxattr, +}; + +static const struct file_operations server_file_operations = { + .unlocked_ioctl = server_ioctl, +}; + +#define log2(n) ffz(~(n)) +#define LUSTRE_SUPER_MAGIC 0x0BD00BD1 + +static int server_fill_super_common(struct super_block *sb) +{ + struct inode *root = NULL; + ENTRY; + + CDEBUG(D_MOUNT, "Server sb, dev=%d\n", (int)sb->s_dev); + + sb->s_blocksize = 4096; + sb->s_blocksize_bits = log2(sb->s_blocksize); + sb->s_magic = LUSTRE_SUPER_MAGIC; + sb->s_maxbytes = 0; /* we don't allow file IO on server mountpoints */ + sb->s_flags |= SB_RDONLY; + sb->s_op = &server_ops; + + root = new_inode(sb); + if (!root) { + CERROR("Can't make root inode\n"); + RETURN(-EIO); + } + + /* returns -EIO for every operation */ + /* make_bad_inode(root); -- badness - can't umount */ + /* apparently we need to be a directory for the mount to finish */ + root->i_mode = S_IFDIR; + root->i_op = &server_inode_operations; + root->i_fop = &server_file_operations; + sb->s_root = d_make_root(root); + if (!sb->s_root) { + CERROR("%s: can't make root dentry\n", sb->s_id); + RETURN(-EIO); + } + + 
RETURN(0); +} + +static int osd_start(struct lustre_sb_info *lsi, unsigned long mflags) +{ + struct lustre_mount_data *lmd = lsi->lsi_lmd; + struct obd_device *obd; + struct dt_device_param p; + char flagstr[20 + 1 + 10 + 1]; + int rc; + ENTRY; + + CDEBUG(D_MOUNT, + "Attempting to start %s, type=%s, lsifl=%x, mountfl=%lx\n", + lsi->lsi_svname, lsi->lsi_osd_type, lsi->lsi_flags, mflags); + + sprintf(lsi->lsi_osd_obdname, "%s-osd", lsi->lsi_svname); + strcpy(lsi->lsi_osd_uuid, lsi->lsi_osd_obdname); + strcat(lsi->lsi_osd_uuid, "_UUID"); + snprintf(flagstr, sizeof(flagstr), "%lu:%u", mflags, lmd->lmd_flags); + + obd = class_name2obd(lsi->lsi_osd_obdname); + if (obd == NULL) { + rc = lustre_start_simple(lsi->lsi_osd_obdname, + lsi->lsi_osd_type, + lsi->lsi_osd_uuid, lmd->lmd_dev, + flagstr, lsi->lsi_lmd->lmd_opts, + lsi->lsi_svname); + if (rc) + GOTO(out, rc); + obd = class_name2obd(lsi->lsi_osd_obdname); + LASSERT(obd); + } else { + CDEBUG(D_MOUNT, "%s already started\n", lsi->lsi_osd_obdname); + /* but continue setup to allow special case of MDT and internal + * MGT being started separately. */ + if (!((IS_MGS(lsi) && (lsi->lsi_lmd->lmd_flags & + LMD_FLG_NOMGS)) || + (IS_MDT(lsi) && (lsi->lsi_lmd->lmd_flags & + LMD_FLG_NOSVC)))) + RETURN(-EALREADY); + } + + rc = obd_connect(NULL, &lsi->lsi_osd_exp, + obd, &obd->obd_uuid, NULL, NULL); + + if (rc) { + obd->obd_force = 1; + class_manual_cleanup(obd); + lsi->lsi_dt_dev = NULL; + RETURN(rc); + } + + LASSERT(obd->obd_lu_dev); + lu_device_get(obd->obd_lu_dev); + lsi->lsi_dt_dev = lu2dt_dev(obd->obd_lu_dev); + LASSERT(lsi->lsi_dt_dev); + + /* set disk context for llog usage */ + OBD_SET_CTXT_MAGIC(&obd->obd_lvfs_ctxt); + obd->obd_lvfs_ctxt.dt = lsi->lsi_dt_dev; + + dt_conf_get(NULL, lsi->lsi_dt_dev, &p); +out: + RETURN(rc); +} + +/** Fill in the superblock info for a Lustre server. + * Mount the device with the correct options. + * Read the on-disk config file. + * Start the services. + */ +int server_fill_super(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + int rc; + ENTRY; + + /* to simulate target mount race */ + OBD_RACE(OBD_FAIL_TGT_MOUNT_RACE); + + rc = lsi_prepare(lsi); + if (rc) { + lustre_put_lsi(sb); + RETURN(rc); + } + + /* Start low level OSD */ + rc = osd_start(lsi, sb->s_flags); + if (rc) { + CERROR("Unable to start osd on %s: %d\n", + lsi->lsi_lmd->lmd_dev, rc); + lustre_put_lsi(sb); + RETURN(rc); + } + + CDEBUG(D_MOUNT, "Found service %s on device %s\n", + lsi->lsi_svname, lsi->lsi_lmd->lmd_dev); + + if (class_name2obd(lsi->lsi_svname)) { + LCONSOLE_ERROR_MSG(0x161, "The target named %s is already " + "running. Double-mount may have compromised" + " the disk journal.\n", + lsi->lsi_svname); + lustre_put_lsi(sb); + RETURN(-EALREADY); + } + + /* Start MGS before MGC */ + if (IS_MGS(lsi) && !(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOMGS)) { + rc = server_start_mgs(sb); + if (rc) + GOTO(out_mnt, rc); + } + + /* Start MGC before servers */ + rc = lustre_start_mgc(sb); + if (rc) + GOTO(out_mnt, rc); + + /* Set up all obd devices for service */ + if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) && + (IS_OST(lsi) || IS_MDT(lsi))) { + rc = server_start_targets(sb); + if (rc < 0) { + CERROR("Unable to start targets: %d\n", rc); + GOTO(out_mnt, rc); + } + /* FIXME overmount client here, or can we just start a + * client log and client_fill_super on this sb? We + * need to make sure server_put_super gets called too + * - ll_put_super calls lustre_common_put_super; check + * there for LSI_SERVER flag, call s_p_s if so. 
+ * + * Probably should start client from new thread so we + * can return. Client will not finish until all + * servers are connected. Note - MGS-only server does + * NOT get a client, since there is no lustre fs + * associated - the MGS is for all lustre fs's */ + } + + rc = server_fill_super_common(sb); + if (rc) + GOTO(out_mnt, rc); + + RETURN(0); +out_mnt: + /* We jump here in case of failure while starting targets or MGS. + * In this case we can't just put @mnt and have to do real cleanup + * with stoping targets, etc. */ + server_put_super(sb); + return rc; +} +EXPORT_SYMBOL(server_fill_super); + +/* + * Calculate timeout value for a target. + */ +void server_calc_timeout(struct lustre_sb_info *lsi, struct obd_device *obd) +{ + struct lustre_mount_data *lmd; + int soft = 0; + int hard = 0; + int factor = 0; + bool has_ir = !!(lsi->lsi_flags & LDD_F_IR_CAPABLE); + int min = OBD_RECOVERY_TIME_MIN; + + LASSERT(IS_SERVER(lsi)); + + lmd = lsi->lsi_lmd; + if (lmd) { + soft = lmd->lmd_recovery_time_soft; + hard = lmd->lmd_recovery_time_hard; + has_ir = has_ir && !(lmd->lmd_flags & LMD_FLG_NOIR); + obd->obd_no_ir = !has_ir; + } + + if (soft == 0) + soft = OBD_RECOVERY_TIME_SOFT; + if (hard == 0) + hard = OBD_RECOVERY_TIME_HARD; + + /* target may have ir_factor configured. */ + factor = OBD_IR_FACTOR_DEFAULT; + if (obd->obd_recovery_ir_factor) + factor = obd->obd_recovery_ir_factor; + + if (has_ir) { + int new_soft = soft; + + /* adjust timeout value by imperative recovery */ + new_soft = (soft * factor) / OBD_IR_FACTOR_MAX; + /* make sure the timeout is not too short */ + new_soft = max(min, new_soft); + + LCONSOLE_INFO("%s: Imperative Recovery enabled, recovery " + "window shrunk from %d-%d down to %d-%d\n", + obd->obd_name, soft, hard, new_soft, hard); + + soft = new_soft; + } else { + LCONSOLE_INFO("%s: Imperative Recovery not enabled, recovery " + "window %d-%d\n", obd->obd_name, soft, hard); + } + + /* we're done */ + obd->obd_recovery_timeout = max_t(time64_t, obd->obd_recovery_timeout, + soft); + obd->obd_recovery_time_hard = hard; + obd->obd_recovery_ir_factor = factor; +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_sysfs.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_sysfs.c new file mode 100644 index 0000000000000..5d9430a0930e9 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_sysfs.c @@ -0,0 +1,687 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/obd_sysfs.c + * + * Object Devices Class Driver + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +struct static_lustre_uintvalue_attr { + struct { + struct attribute attr; + ssize_t (*show)(struct kobject *kobj, struct attribute *attr, + char *buf); + ssize_t (*store)(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len); + } u; + int *value; +}; + +static ssize_t static_uintvalue_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct static_lustre_uintvalue_attr *lattr = (void *)attr; + + return sprintf(buf, "%d\n", *lattr->value); +} + +static ssize_t static_uintvalue_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct static_lustre_uintvalue_attr *lattr = (void *)attr; + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + + *lattr->value = val; + + return count; +} + +#define LUSTRE_STATIC_UINT_ATTR(name, value) \ +static struct static_lustre_uintvalue_attr lustre_sattr_##name = \ + { __ATTR(name, 0644, static_uintvalue_show, \ + static_uintvalue_store), value } + +LUSTRE_STATIC_UINT_ATTR(timeout, &obd_timeout); +LUSTRE_STATIC_UINT_ATTR(debug_peer_on_timeout, &obd_debug_peer_on_timeout); +LUSTRE_STATIC_UINT_ATTR(dump_on_timeout, &obd_dump_on_timeout); +LUSTRE_STATIC_UINT_ATTR(dump_on_eviction, &obd_dump_on_eviction); +LUSTRE_STATIC_UINT_ATTR(at_min, &at_min); +LUSTRE_STATIC_UINT_ATTR(at_max, &at_max); +LUSTRE_STATIC_UINT_ATTR(at_extra, &at_extra); +LUSTRE_STATIC_UINT_ATTR(at_early_margin, &at_early_margin); +LUSTRE_STATIC_UINT_ATTR(at_history, &at_history); +LUSTRE_STATIC_UINT_ATTR(lbug_on_eviction, &obd_lbug_on_eviction); + +#ifdef HAVE_SERVER_SUPPORT +LUSTRE_STATIC_UINT_ATTR(ldlm_timeout, &ldlm_timeout); +LUSTRE_STATIC_UINT_ATTR(bulk_timeout, &bulk_timeout); +#endif + +static ssize_t memused_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%llu\n", obd_memory_sum()); +} +LUSTRE_RO_ATTR(memused); + +static ssize_t memused_max_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%llu\n", obd_memory_max()); +} +LUSTRE_RO_ATTR(memused_max); + +static ssize_t max_dirty_mb_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%lu\n", + obd_max_dirty_pages / (1 << (20 - PAGE_SHIFT))); +} + +static ssize_t max_dirty_mb_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + unsigned long val; + int rc; + + rc = kstrtoul(buffer, 10, &val); + if (rc) + return rc; + + val *= 1 << (20 - PAGE_SHIFT); /* convert to pages */ + + if (val > ((cfs_totalram_pages() / 10) * 9)) { + /* Somebody wants to assign too much memory to dirty pages */ + return -EINVAL; + } + + if (val < 4 << (20 - PAGE_SHIFT)) { + /* Less than 4 Mb for dirty cache is also bad */ + return -EINVAL; + } + + obd_max_dirty_pages = val; + + return count; +} +LUSTRE_RW_ATTR(max_dirty_mb); + +#ifdef HAVE_SERVER_SUPPORT +static ssize_t 
no_transno_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd; + unsigned int idx; + int rc; + + rc = kstrtouint(buffer, 10, &idx); + if (rc) + return rc; + + obd = class_num2obd(idx); + if (!obd || !obd->obd_attached) { + if (obd) + CERROR("%s: not attached\n", obd->obd_name); + return -ENODEV; + } + + spin_lock(&obd->obd_dev_lock); + obd->obd_no_transno = 1; + spin_unlock(&obd->obd_dev_lock); + return count; +} +LUSTRE_WO_ATTR(no_transno); +#endif /* HAVE_SERVER_SUPPORT */ + +static ssize_t version_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%s\n", LUSTRE_VERSION_STRING); +} + +static ssize_t pinger_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ +#ifdef CONFIG_LUSTRE_FS_PINGER + const char *state = "on"; +#else + const char *state = "off"; +#endif + return sprintf(buf, "%s\n", state); +} + +/** + * Check all obd devices health + * + * \param kobj + * \param buf [in] + * + * \retval number of characters printed if healthy + */ +static ssize_t +health_check_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + bool healthy = true; + size_t len = 0; + int i; + + if (libcfs_catastrophe) + return sprintf(buf, "LBUG\n"); + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd; + + obd = class_num2obd(i); + if (obd == NULL || !obd->obd_attached || !obd->obd_set_up) + continue; + + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + if (obd->obd_stopping) + continue; + + class_incref(obd, __func__, current); + read_unlock(&obd_dev_lock); + + if (obd_health_check(NULL, obd)) + healthy = false; + + class_decref(obd, __func__, current); + read_lock(&obd_dev_lock); + + if (!healthy) + break; + } + read_unlock(&obd_dev_lock); + + if (healthy) + len = sprintf(buf, "healthy\n"); + else + len = sprintf(buf, "NOT HEALTHY\n"); + + return len; +} + +static ssize_t jobid_var_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + int rc = 0; + + if (strlen(obd_jobid_var)) + rc = scnprintf(buf, PAGE_SIZE, "%s\n", obd_jobid_var); + return rc; +} + +static ssize_t jobid_var_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + if (!count || count > JOBSTATS_JOBID_VAR_MAX_LEN) + return -EINVAL; + + memset(obd_jobid_var, 0, JOBSTATS_JOBID_VAR_MAX_LEN + 1); + + memcpy(obd_jobid_var, buffer, count); + + /* Trim the trailing '\n' if any */ + if (obd_jobid_var[count - 1] == '\n') + obd_jobid_var[count - 1] = 0; + + return count; +} + +static ssize_t jobid_name_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + int rc = 0; + + if (strlen(obd_jobid_name)) + rc = scnprintf(buf, PAGE_SIZE, "%s\n", obd_jobid_name); + return rc; +} + +static ssize_t jobid_name_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + if (!count || count > LUSTRE_JOBID_SIZE) + return -EINVAL; + + if (strcmp(obd_jobid_var, JOBSTATS_NODELOCAL) != 0 && + !strchr(buffer, '%')) { + lustre_jobid_clear(buffer); + return count; + } + + /* clear previous value */ + memset(obd_jobid_name, 0, LUSTRE_JOBID_SIZE); + + memcpy(obd_jobid_name, buffer, count); + + /* Trim the trailing '\n' if any */ + if (obd_jobid_name[count - 1] == '\n') { + /* Don't echo just a newline */ + if (count == 1) + return -EINVAL; + obd_jobid_name[count - 1] = 0; + } + + return count; +} + +static ssize_t jobid_this_session_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + 
char *jid; + int ret = -ENOENT; + + rcu_read_lock(); + jid = jobid_current(); + if (jid) + ret = scnprintf(buf, PAGE_SIZE, "%s\n", jid); + rcu_read_unlock(); + return ret; +} + +static ssize_t jobid_this_session_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + char *jobid; + int len; + int ret; + + if (!count || count > LUSTRE_JOBID_SIZE) + return -EINVAL; + + jobid = kstrndup(buffer, count, GFP_KERNEL); + if (!jobid) + return -ENOMEM; + len = strcspn(jobid, "\n "); + jobid[len] = '\0'; + ret = jobid_set_current(jobid); + kfree(jobid); + + return ret ?: count; +} + +/* Root for /sys/kernel/debug/lustre */ +struct dentry *debugfs_lustre_root; +EXPORT_SYMBOL_GPL(debugfs_lustre_root); + +#ifdef CONFIG_PROC_FS +/* Root for /proc/fs/lustre */ +struct proc_dir_entry *proc_lustre_root; +EXPORT_SYMBOL(proc_lustre_root); +#else +#define lprocfs_base NULL +#endif /* CONFIG_PROC_FS */ + +LUSTRE_RO_ATTR(version); +LUSTRE_RO_ATTR(pinger); +LUSTRE_RO_ATTR(health_check); +LUSTRE_RW_ATTR(jobid_var); +LUSTRE_RW_ATTR(jobid_name); +LUSTRE_RW_ATTR(jobid_this_session); + +static struct attribute *lustre_attrs[] = { + &lustre_attr_version.attr, + &lustre_attr_pinger.attr, + &lustre_attr_health_check.attr, + &lustre_attr_jobid_name.attr, + &lustre_attr_jobid_var.attr, + &lustre_attr_jobid_this_session.attr, + &lustre_sattr_timeout.u.attr, + &lustre_attr_max_dirty_mb.attr, + &lustre_sattr_debug_peer_on_timeout.u.attr, + &lustre_sattr_dump_on_timeout.u.attr, + &lustre_sattr_dump_on_eviction.u.attr, + &lustre_sattr_at_min.u.attr, + &lustre_sattr_at_max.u.attr, + &lustre_sattr_at_extra.u.attr, + &lustre_sattr_at_early_margin.u.attr, + &lustre_sattr_at_history.u.attr, + &lustre_attr_memused_max.attr, + &lustre_attr_memused.attr, +#ifdef HAVE_SERVER_SUPPORT + &lustre_sattr_ldlm_timeout.u.attr, + &lustre_sattr_bulk_timeout.u.attr, + &lustre_attr_no_transno.attr, +#endif + &lustre_sattr_lbug_on_eviction.u.attr, + NULL, +}; + +static void *obd_device_list_seq_start(struct seq_file *p, loff_t *pos) +{ + if (*pos >= class_devno_max()) + return NULL; + + return pos; +} + +static void obd_device_list_seq_stop(struct seq_file *p, void *v) +{ +} + +static void *obd_device_list_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + ++*pos; + if (*pos >= class_devno_max()) + return NULL; + + return pos; +} + +static int obd_device_list_seq_show(struct seq_file *p, void *v) +{ + loff_t index = *(loff_t *)v; + struct obd_device *obd = class_num2obd((int)index); + char *status; + + if (obd == NULL) + return 0; + + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + if (obd->obd_stopping) + status = "ST"; + else if (obd->obd_inactive) + status = "IN"; + else if (obd->obd_set_up) + status = "UP"; + else if (obd->obd_attached) + status = "AT"; + else + status = "--"; + + seq_printf(p, "%3d %s %s %s %s %d\n", + (int)index, status, obd->obd_type->typ_name, + obd->obd_name, obd->obd_uuid.uuid, + atomic_read(&obd->obd_refcount)); + return 0; +} + +static const struct seq_operations obd_device_list_sops = { + .start = obd_device_list_seq_start, + .stop = obd_device_list_seq_stop, + .next = obd_device_list_seq_next, + .show = obd_device_list_seq_show, +}; + +static int obd_device_list_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = seq_open(file, &obd_device_list_sops); + + if (rc) + return rc; + + seq = file->private_data; + seq->private = inode->i_private; + return 0; +} + +static const struct file_operations obd_device_list_fops = { + .owner = THIS_MODULE, + 
.open = obd_device_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* checksum_speed */ +static void *checksum_speed_start(struct seq_file *p, loff_t *pos) +{ + return pos; +} + +static void checksum_speed_stop(struct seq_file *p, void *v) +{ +} + +static void *checksum_speed_next(struct seq_file *p, void *v, loff_t *pos) +{ + ++(*pos); + if (*pos >= CFS_HASH_ALG_SPEED_MAX - 1) + return NULL; + + return pos; +} + +static int checksum_speed_show(struct seq_file *p, void *v) +{ + loff_t index = *(loff_t *)v; + + if (!index || index > CFS_HASH_ALG_SPEED_MAX - 1) + return 0; + + seq_printf(p, "%s: %d\n", cfs_crypto_hash_name(index), + cfs_crypto_hash_speeds[index]); + + return 0; +} + +static const struct seq_operations checksum_speed_sops = { + .start = checksum_speed_start, + .stop = checksum_speed_stop, + .next = checksum_speed_next, + .show = checksum_speed_show, +}; + +static int checksum_speed_open(struct inode *inode, struct file *file) +{ + int rc = seq_open(file, &checksum_speed_sops); + + if (rc) + return rc; + + return 0; +} + +static const struct file_operations checksum_speed_fops = { + .owner = THIS_MODULE, + .open = checksum_speed_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int +health_check_seq_show(struct seq_file *m, void *unused) +{ + int i; + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd; + + obd = class_num2obd(i); + if (obd == NULL || !obd->obd_attached || !obd->obd_set_up) + continue; + + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + if (obd->obd_stopping) + continue; + + class_incref(obd, __func__, current); + read_unlock(&obd_dev_lock); + + if (obd_health_check(NULL, obd)) { + seq_printf(m, "device %s reported unhealthy\n", + obd->obd_name); + } + class_decref(obd, __func__, current); + read_lock(&obd_dev_lock); + } + read_unlock(&obd_dev_lock); + + return 0; +} + +LDEBUGFS_SEQ_FOPS_RO(health_check); + +struct kset *lustre_kset; +EXPORT_SYMBOL_GPL(lustre_kset); + +static struct attribute_group lustre_attr_group = { + .attrs = lustre_attrs, +}; + +ssize_t class_set_global(const char *param) +{ + const char *value = strchr(param, '=') + 1; + size_t off = value - param - 1; + ssize_t count = -ENOENT; + int i; + + for (i = 0; lustre_attrs[i]; i++) { + if (!strncmp(lustre_attrs[i]->name, param, off)) { + count = lustre_attr_store(&lustre_kset->kobj, + lustre_attrs[i], value, + strlen(value)); + break; + } + } + return count; +} + +int class_procfs_init(void) +{ + struct proc_dir_entry *entry; + struct dentry *file; + int rc = -ENOMEM; + + ENTRY; + + lustre_kset = kset_create_and_add("lustre", NULL, fs_kobj); + if (!lustre_kset) + goto out; + + /* Create the files associated with this kobject */ + rc = sysfs_create_group(&lustre_kset->kobj, &lustre_attr_group); + if (rc) { + kset_unregister(lustre_kset); + goto out; + } + + rc = jobid_cache_init(); + if (rc) { + kset_unregister(lustre_kset); + goto out; + } + + debugfs_lustre_root = debugfs_create_dir("lustre", NULL); + + file = debugfs_create_file("devices", 0444, debugfs_lustre_root, NULL, + &obd_device_list_fops); + + file = debugfs_create_file("health_check", 0444, debugfs_lustre_root, + NULL, &health_check_fops); + + file = debugfs_create_file("checksum_speed", 0444, debugfs_lustre_root, + NULL, &checksum_speed_fops); + + entry = lprocfs_register("fs/lustre", NULL, NULL, NULL); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CERROR("cannot create '/proc/fs/lustre': rc = %d\n", rc); + 
debugfs_remove_recursive(debugfs_lustre_root); + kset_unregister(lustre_kset); + goto out; + } + + proc_lustre_root = entry; +out: + RETURN(rc); +} + +int class_procfs_clean(void) +{ + ENTRY; + + debugfs_remove_recursive(debugfs_lustre_root); + + debugfs_lustre_root = NULL; + jobid_cache_fini(); + + if (proc_lustre_root) + lprocfs_remove(&proc_lustre_root); + + sysfs_remove_group(&lustre_kset->kobj, &lustre_attr_group); + + kset_unregister(lustre_kset); + + RETURN(0); +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obdo.c b/drivers/staging/lustrefsx/lustre/obdclass/obdo.c new file mode 100644 index 0000000000000..d17d19741b6ba --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/obdo.c @@ -0,0 +1,225 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/obdo.c + * + * Object Devices Class Driver + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include + +#include +#include + +void obdo_set_parent_fid(struct obdo *dst, const struct lu_fid *parent) +{ + dst->o_parent_oid = fid_oid(parent); + dst->o_parent_seq = fid_seq(parent); + dst->o_parent_ver = fid_ver(parent); + dst->o_valid |= OBD_MD_FLPARENT | OBD_MD_FLFID; +} +EXPORT_SYMBOL(obdo_set_parent_fid); + +void obdo_set_o_projid(struct obdo *dst, u32 projid) +{ + dst->o_projid = projid; + dst->o_valid |= OBD_MD_FLPROJID; +} +EXPORT_SYMBOL(obdo_set_o_projid); + +/* + * WARNING: the file systems must take care not to tinker with + * attributes they don't manage (such as blocks). 
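+ *
+ * A short usage sketch (assuming a 'struct obdo oa' and an inode as in the
+ * callers): copy only the timestamps and the size, leaving everything else
+ * untouched:
+ *
+ *	obdo_from_inode(&oa, inode,
+ *			OBD_MD_FLATIME | OBD_MD_FLMTIME |
+ *			OBD_MD_FLCTIME | OBD_MD_FLSIZE);
+ *
+ * Only the attributes named in 'valid' are copied, and o_valid is updated
+ * to reflect what was actually filled in.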
+ */ +void obdo_from_inode(struct obdo *dst, struct inode *src, u64 valid) +{ + u64 newvalid = 0; + + if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME)) + CDEBUG(D_INODE, "valid %#llx, new time %lld/%lld\n", + valid, (s64) src->i_mtime.tv_sec, + (s64) src->i_ctime.tv_sec); + + if (valid & OBD_MD_FLATIME) { + dst->o_atime = src->i_atime.tv_sec; + newvalid |= OBD_MD_FLATIME; + } + if (valid & OBD_MD_FLMTIME) { + dst->o_mtime = src->i_mtime.tv_sec; + newvalid |= OBD_MD_FLMTIME; + } + if (valid & OBD_MD_FLCTIME) { + dst->o_ctime = src->i_ctime.tv_sec; + newvalid |= OBD_MD_FLCTIME; + } + if (valid & OBD_MD_FLSIZE) { + dst->o_size = i_size_read(src); + newvalid |= OBD_MD_FLSIZE; + } + if (valid & OBD_MD_FLBLOCKS) { /* allocation of space (x512 bytes) */ + dst->o_blocks = src->i_blocks; + newvalid |= OBD_MD_FLBLOCKS; + } + if (valid & OBD_MD_FLBLKSZ) { /* optimal block size */ + dst->o_blksize = 1U << src->i_blkbits; + newvalid |= OBD_MD_FLBLKSZ; + } + if (valid & OBD_MD_FLTYPE) { + dst->o_mode = (dst->o_mode & S_IALLUGO) | + (src->i_mode & S_IFMT); + newvalid |= OBD_MD_FLTYPE; + } + if (valid & OBD_MD_FLMODE) { + dst->o_mode = (dst->o_mode & S_IFMT) | + (src->i_mode & S_IALLUGO); + newvalid |= OBD_MD_FLMODE; + } + if (valid & OBD_MD_FLUID) { + dst->o_uid = from_kuid(&init_user_ns, src->i_uid); + newvalid |= OBD_MD_FLUID; + } + if (valid & OBD_MD_FLGID) { + dst->o_gid = from_kgid(&init_user_ns, src->i_gid); + newvalid |= OBD_MD_FLGID; + } + if (valid & OBD_MD_FLFLAGS) { + dst->o_flags = src->i_flags; + newvalid |= OBD_MD_FLFLAGS; + } + dst->o_valid |= newvalid; +} +EXPORT_SYMBOL(obdo_from_inode); + +void obdo_cpy_md(struct obdo *dst, const struct obdo *src, u64 valid) +{ + CDEBUG(D_INODE, "src obdo "DOSTID" valid %#llx, dst obdo "DOSTID"\n", + POSTID(&src->o_oi), src->o_valid, POSTID(&dst->o_oi)); + if (valid & OBD_MD_FLATIME) + dst->o_atime = src->o_atime; + if (valid & OBD_MD_FLMTIME) + dst->o_mtime = src->o_mtime; + if (valid & OBD_MD_FLCTIME) + dst->o_ctime = src->o_ctime; + if (valid & OBD_MD_FLSIZE) + dst->o_size = src->o_size; + if (valid & OBD_MD_FLBLOCKS) /* allocation of space */ + dst->o_blocks = src->o_blocks; + if (valid & OBD_MD_FLBLKSZ) + dst->o_blksize = src->o_blksize; + if (valid & OBD_MD_FLTYPE) + dst->o_mode = (dst->o_mode & ~S_IFMT) | (src->o_mode & S_IFMT); + if (valid & OBD_MD_FLMODE) + dst->o_mode = (dst->o_mode & S_IFMT) | (src->o_mode & ~S_IFMT); + if (valid & OBD_MD_FLUID) + dst->o_uid = src->o_uid; + if (valid & OBD_MD_FLGID) + dst->o_gid = src->o_gid; + if (valid & OBD_MD_FLFLAGS) + dst->o_flags = src->o_flags; + if (valid & OBD_MD_FLFID) { + dst->o_parent_seq = src->o_parent_seq; + dst->o_parent_ver = src->o_parent_ver; + } + if (valid & OBD_MD_FLPARENT) + dst->o_parent_oid = src->o_parent_oid; + if (valid & OBD_MD_FLHANDLE) + dst->o_handle = src->o_handle; + + dst->o_valid |= valid; +} +EXPORT_SYMBOL(obdo_cpy_md); + +void obdo_to_ioobj(const struct obdo *oa, struct obd_ioobj *ioobj) +{ + ioobj->ioo_oid = oa->o_oi; + if (unlikely(!(oa->o_valid & OBD_MD_FLGROUP))) + ostid_set_seq_mdt0(&ioobj->ioo_oid); + + /* + * Since 2.4 this does not contain o_mode in the low 16 bits. 
+ * Instead, it holds (bd_md_max_brw - 1) for multi-bulk BRW RPCs + */ + ioobj->ioo_max_brw = 0; +} +EXPORT_SYMBOL(obdo_to_ioobj); + +/* + * Create an obdo to send over the wire + */ +void lustre_set_wire_obdo(const struct obd_connect_data *ocd, + struct obdo *wobdo, + const struct obdo *lobdo) +{ + *wobdo = *lobdo; + if (ocd == NULL) + return; + + if (!(wobdo->o_valid & OBD_MD_FLUID)) + wobdo->o_uid = from_kuid(&init_user_ns, current_uid()); + if (!(wobdo->o_valid & OBD_MD_FLGID)) + wobdo->o_gid = from_kgid(&init_user_ns, current_gid()); + + if (unlikely(!(ocd->ocd_connect_flags & OBD_CONNECT_FID)) && + fid_seq_is_echo(ostid_seq(&lobdo->o_oi))) { + /* + * Currently OBD_FL_OSTID will only be used when 2.4 echo + * client communicate with pre-2.4 server + */ + wobdo->o_oi.oi.oi_id = fid_oid(&lobdo->o_oi.oi_fid); + wobdo->o_oi.oi.oi_seq = fid_seq(&lobdo->o_oi.oi_fid); + } +} +EXPORT_SYMBOL(lustre_set_wire_obdo); + +/* + * Create a local obdo from a wire based odbo + */ +void lustre_get_wire_obdo(const struct obd_connect_data *ocd, + struct obdo *lobdo, + const struct obdo *wobdo) +{ + *lobdo = *wobdo; + if (ocd == NULL) + return; + + if (unlikely(!(ocd->ocd_connect_flags & OBD_CONNECT_FID)) && + fid_seq_is_echo(wobdo->o_oi.oi.oi_seq)) { + /* see above */ + lobdo->o_oi.oi_fid.f_seq = wobdo->o_oi.oi.oi_seq; + lobdo->o_oi.oi_fid.f_oid = wobdo->o_oi.oi.oi_id; + lobdo->o_oi.oi_fid.f_ver = 0; + } +} +EXPORT_SYMBOL(lustre_get_wire_obdo); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obdo_server.c b/drivers/staging/lustrefsx/lustre/obdclass/obdo_server.c new file mode 100644 index 0000000000000..0e546c4815467 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/obdo_server.c @@ -0,0 +1,163 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/linux/linux-obdo.c + * + * Object Devices Class Driver + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include /* for PAGE_SIZE */ +#include + +/*FIXME: Just copy from obdo_from_inode*/ +void obdo_from_la(struct obdo *dst, const struct lu_attr *la, u64 valid) +{ + u64 newvalid = 0; + + if (valid & LA_ATIME) { + dst->o_atime = la->la_atime; + newvalid |= OBD_MD_FLATIME; + } + if (valid & LA_MTIME) { + dst->o_mtime = la->la_mtime; + newvalid |= OBD_MD_FLMTIME; + } + if (valid & LA_CTIME) { + dst->o_ctime = la->la_ctime; + newvalid |= OBD_MD_FLCTIME; + } + if (valid & LA_SIZE) { + dst->o_size = la->la_size; + newvalid |= OBD_MD_FLSIZE; + } + if (valid & LA_BLOCKS) { /* allocation of space (x512 bytes) */ + dst->o_blocks = la->la_blocks; + newvalid |= OBD_MD_FLBLOCKS; + } + if (valid & LA_TYPE) { + dst->o_mode = (dst->o_mode & S_IALLUGO) | + (la->la_mode & S_IFMT); + newvalid |= OBD_MD_FLTYPE; + } + if (valid & LA_MODE) { + dst->o_mode = (dst->o_mode & S_IFMT) | + (la->la_mode & S_IALLUGO); + newvalid |= OBD_MD_FLMODE; + } + if (valid & LA_UID) { + dst->o_uid = la->la_uid; + newvalid |= OBD_MD_FLUID; + } + if (valid & LA_GID) { + dst->o_gid = la->la_gid; + newvalid |= OBD_MD_FLGID; + } + if (valid & LA_PROJID) { + dst->o_projid = la->la_projid; + newvalid |= OBD_MD_FLPROJID; + } + if (valid & LA_FLAGS) { + dst->o_flags = la->la_flags; + newvalid |= OBD_MD_FLFLAGS; + } + if (valid & LA_NLINK) { + dst->o_nlink = la->la_nlink; + newvalid |= OBD_MD_FLNLINK; + } + dst->o_valid |= newvalid; +} +EXPORT_SYMBOL(obdo_from_la); + +/*FIXME: Just copy from obdo_from_inode*/ +void la_from_obdo(struct lu_attr *dst, const struct obdo *obdo, u64 valid) +{ + u64 newvalid = 0; + + valid &= obdo->o_valid; + + if (valid & OBD_MD_FLATIME) { + dst->la_atime = obdo->o_atime; + newvalid |= LA_ATIME; + } + if (valid & OBD_MD_FLMTIME) { + dst->la_mtime = obdo->o_mtime; + newvalid |= LA_MTIME; + } + if (valid & OBD_MD_FLCTIME) { + dst->la_ctime = obdo->o_ctime; + newvalid |= LA_CTIME; + } + if (valid & OBD_MD_FLSIZE) { + dst->la_size = obdo->o_size; + newvalid |= LA_SIZE; + } + if (valid & OBD_MD_FLBLOCKS) { + dst->la_blocks = obdo->o_blocks; + newvalid |= LA_BLOCKS; + } + if (valid & OBD_MD_FLTYPE) { + dst->la_mode = (dst->la_mode & S_IALLUGO) | + (obdo->o_mode & S_IFMT); + newvalid |= LA_TYPE; + } + if (valid & OBD_MD_FLMODE) { + dst->la_mode = (dst->la_mode & S_IFMT) | + (obdo->o_mode & S_IALLUGO); + newvalid |= LA_MODE; + } + if (valid & OBD_MD_FLUID) { + dst->la_uid = obdo->o_uid; + newvalid |= LA_UID; + } + if (valid & OBD_MD_FLGID) { + dst->la_gid = obdo->o_gid; + newvalid |= LA_GID; + } + if (valid & OBD_MD_FLPROJID) { + dst->la_projid = obdo->o_projid; + newvalid |= LA_PROJID; + } + if (valid & OBD_MD_FLFLAGS) { + dst->la_flags = obdo->o_flags; + newvalid |= LA_FLAGS; + } + if (valid & OBD_MD_FLNLINK) { + dst->la_nlink = obdo->o_nlink; + newvalid |= LA_NLINK; + } + dst->la_valid = newvalid; +} +EXPORT_SYMBOL(la_from_obdo); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/range_lock.c b/drivers/staging/lustrefsx/lustre/obdclass/range_lock.c new file mode 100644 index 0000000000000..ae3ed9ab0c975 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/range_lock.c @@ -0,0 +1,179 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Range lock is used to allow multiple threads writing a single shared
+ * file given each thread is writing to a non-overlapping portion of the
+ * file.
+ *
+ * Refer to the possible upstream kernel version of range lock by
+ * Jan Kara : https://lkml.org/lkml/2013/1/31/480
+ *
+ * This file could later be replaced by the upstream kernel version.
+ */
+/*
+ * Author: Prakash Surya
+ * Author: Bobi Jam
+ */
+#ifdef HAVE_SCHED_HEADERS
+#include
+#endif
+#include
+#include
+#include
+
+#define START(node) ((node)->rl_start)
+#define LAST(node) ((node)->rl_end)
+
+INTERVAL_TREE_DEFINE(struct range_lock, rl_rb, __u64, rl_subtree_last,
+		     START, LAST, static, range_lock)
+
+/**
+ * Initialize a range lock tree
+ *
+ * \param tree	[in]	an empty range lock tree
+ *
+ * Pre:  Caller should have allocated the range lock tree.
+ * Post: The range lock tree is ready to function.
+ */
+void range_lock_tree_init(struct range_lock_tree *tree)
+{
+	tree->rlt_root = INTERVAL_TREE_ROOT;
+	tree->rlt_sequence = 0;
+	spin_lock_init(&tree->rlt_lock);
+}
+EXPORT_SYMBOL(range_lock_tree_init);
+
+/**
+ * Initialize a range lock node
+ *
+ * \param lock	[in]	an empty range lock node
+ * \param start	[in]	start of the covering region
+ * \param end	[in]	end of the covering region
+ *
+ * Pre:  Caller should have allocated the range lock node.
+ * Post: The range lock node is meant to cover [start, end] region
+ */
+void range_lock_init(struct range_lock *lock, __u64 start, __u64 end)
+{
+	start >>= PAGE_SHIFT;
+	if (end != LUSTRE_EOF)
+		end >>= PAGE_SHIFT;
+	lock->rl_start = start;
+	lock->rl_end = end;
+
+	lock->rl_task = NULL;
+	lock->rl_blocking_ranges = 0;
+	lock->rl_sequence = 0;
+}
+EXPORT_SYMBOL(range_lock_init);
+
+/**
+ * Unlock a range lock, wake up locks blocked by this lock.
+ *
+ * \param tree	[in]	range lock tree
+ * \param lock	[in]	range lock to be deleted
+ *
+ * If this lock has been granted, release it; if not, just delete it from
+ * the tree or the same region lock list. Wake up those locks only blocked
+ * by this lock.
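+ *
+ * For reference, a minimal usage sketch of this API (illustrative
+ * only; both structures are owned by the caller):
+ *
+ *	struct range_lock_tree tree;
+ *	struct range_lock lock;
+ *
+ *	range_lock_tree_init(&tree);
+ *	range_lock_init(&lock, 0, LUSTRE_EOF);	- cover the whole file
+ *	if (range_lock(&tree, &lock) == 0) {
+ *		... write the covered byte range ...
+ *		range_unlock(&tree, &lock);
+ *	}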
+ */
+void range_unlock(struct range_lock_tree *tree, struct range_lock *lock)
+{
+	struct range_lock *overlap;
+	ENTRY;
+
+	spin_lock(&tree->rlt_lock);
+
+	range_lock_remove(lock, &tree->rlt_root);
+
+	for (overlap = range_lock_iter_first(&tree->rlt_root,
+					     lock->rl_start,
+					     lock->rl_end);
+	     overlap;
+	     overlap = range_lock_iter_next(overlap,
+					    lock->rl_start,
+					    lock->rl_end))
+		if (overlap->rl_sequence > lock->rl_sequence) {
+			--overlap->rl_blocking_ranges;
+			if (overlap->rl_blocking_ranges == 0)
+				wake_up_process(overlap->rl_task);
+		}
+
+	spin_unlock(&tree->rlt_lock);
+
+	EXIT;
+}
+EXPORT_SYMBOL(range_unlock);
+
+/**
+ * Lock a region
+ *
+ * \param tree	[in]	range lock tree
+ * \param lock	[in]	range lock node containing the region span
+ *
+ * \retval 0	got the range lock
+ * \retval <0	error code if the range lock could not be acquired
+ *
+ * If there are overlapping range locks, the new lock will wait and
+ * retry; if it later finds that it is not the one chosen to wake up,
+ * it waits again.
+ */
+int range_lock(struct range_lock_tree *tree, struct range_lock *lock)
+{
+	struct range_lock *overlap;
+	int rc = 0;
+	ENTRY;
+
+	spin_lock(&tree->rlt_lock);
+	/*
+	 * We need to check for all conflicting intervals
+	 * already in the tree.
+	 */
+	for (overlap = range_lock_iter_first(&tree->rlt_root,
+					     lock->rl_start,
+					     lock->rl_end);
+	     overlap;
+	     overlap = range_lock_iter_next(overlap,
+					    lock->rl_start,
+					    lock->rl_end))
+		lock->rl_blocking_ranges += 1;
+
+	range_lock_insert(lock, &tree->rlt_root);
+	lock->rl_sequence = ++tree->rlt_sequence;
+
+	while (lock->rl_blocking_ranges > 0) {
+		lock->rl_task = current;
+		__set_current_state(TASK_INTERRUPTIBLE);
+		spin_unlock(&tree->rlt_lock);
+		schedule();
+
+		if (signal_pending(current)) {
+			range_unlock(tree, lock);
+			GOTO(out, rc = -ERESTARTSYS);
+		}
+		spin_lock(&tree->rlt_lock);
+	}
+	spin_unlock(&tree->rlt_lock);
+out:
+	RETURN(rc);
+}
+EXPORT_SYMBOL(range_lock);
diff --git a/drivers/staging/lustrefsx/lustre/obdclass/scrub.c b/drivers/staging/lustrefsx/lustre/obdclass/scrub.c
new file mode 100644
index 0000000000000..89c3f752dad22
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/obdclass/scrub.c
@@ -0,0 +1,1356 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2017, Intel Corporation.
+ */
+/*
+ * lustre/obdclass/scrub.c
+ *
+ * The OI scrub is used for checking and (re)building Object Index files
+ * that are usually backend specific. Here are some general scrub related
+ * functions that can be shared by different backends for OI scrub.
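+ *
+ * A typical backend drives this API roughly as follows (a sketch only,
+ * error handling omitted):
+ *
+ *	scrub_file_load() or scrub_file_init()	- read or create state
+ *	scrub_start(threadfn, ...)		- spawn the scrub thread
+ *	  threadfn: scrub_thread_prep(), iterate objects, calling
+ *	  scrub_checkpoint() periodically, then scrub_thread_post()
+ *	scrub_stop()				- on unmount or abort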
+ * + * Author: Fan Yong + */ + +#define DEBUG_SUBSYSTEM S_LFSCK + +#include +#include +#include +#include + +static inline struct dt_device *scrub_obj2dev(struct dt_object *obj) +{ + return container_of_safe(obj->do_lu.lo_dev, struct dt_device, + dd_lu_dev); +} + +static void scrub_file_to_cpu(struct scrub_file *des, struct scrub_file *src) +{ + uuid_copy(&des->sf_uuid, &src->sf_uuid); + des->sf_flags = le64_to_cpu(src->sf_flags); + des->sf_magic = le32_to_cpu(src->sf_magic); + des->sf_status = le16_to_cpu(src->sf_status); + des->sf_param = le16_to_cpu(src->sf_param); + des->sf_time_last_complete = + le64_to_cpu(src->sf_time_last_complete); + des->sf_time_latest_start = + le64_to_cpu(src->sf_time_latest_start); + des->sf_time_last_checkpoint = + le64_to_cpu(src->sf_time_last_checkpoint); + des->sf_pos_latest_start = + le64_to_cpu(src->sf_pos_latest_start); + des->sf_pos_last_checkpoint = + le64_to_cpu(src->sf_pos_last_checkpoint); + des->sf_pos_first_inconsistent = + le64_to_cpu(src->sf_pos_first_inconsistent); + des->sf_items_checked = + le64_to_cpu(src->sf_items_checked); + des->sf_items_updated = + le64_to_cpu(src->sf_items_updated); + des->sf_items_failed = + le64_to_cpu(src->sf_items_failed); + des->sf_items_updated_prior = + le64_to_cpu(src->sf_items_updated_prior); + des->sf_run_time = le32_to_cpu(src->sf_run_time); + des->sf_success_count = le32_to_cpu(src->sf_success_count); + des->sf_oi_count = le16_to_cpu(src->sf_oi_count); + des->sf_internal_flags = le16_to_cpu(src->sf_internal_flags); + memcpy(des->sf_oi_bitmap, src->sf_oi_bitmap, SCRUB_OI_BITMAP_SIZE); +} + +static void scrub_file_to_le(struct scrub_file *des, struct scrub_file *src) +{ + uuid_copy(&des->sf_uuid, &src->sf_uuid); + des->sf_flags = cpu_to_le64(src->sf_flags); + des->sf_magic = cpu_to_le32(src->sf_magic); + des->sf_status = cpu_to_le16(src->sf_status); + des->sf_param = cpu_to_le16(src->sf_param); + des->sf_time_last_complete = + cpu_to_le64(src->sf_time_last_complete); + des->sf_time_latest_start = + cpu_to_le64(src->sf_time_latest_start); + des->sf_time_last_checkpoint = + cpu_to_le64(src->sf_time_last_checkpoint); + des->sf_pos_latest_start = + cpu_to_le64(src->sf_pos_latest_start); + des->sf_pos_last_checkpoint = + cpu_to_le64(src->sf_pos_last_checkpoint); + des->sf_pos_first_inconsistent = + cpu_to_le64(src->sf_pos_first_inconsistent); + des->sf_items_checked = + cpu_to_le64(src->sf_items_checked); + des->sf_items_updated = + cpu_to_le64(src->sf_items_updated); + des->sf_items_failed = + cpu_to_le64(src->sf_items_failed); + des->sf_items_updated_prior = + cpu_to_le64(src->sf_items_updated_prior); + des->sf_run_time = cpu_to_le32(src->sf_run_time); + des->sf_success_count = cpu_to_le32(src->sf_success_count); + des->sf_oi_count = cpu_to_le16(src->sf_oi_count); + des->sf_internal_flags = cpu_to_le16(src->sf_internal_flags); + memcpy(des->sf_oi_bitmap, src->sf_oi_bitmap, SCRUB_OI_BITMAP_SIZE); +} + +void scrub_file_init(struct lustre_scrub *scrub, uuid_t uuid) +{ + struct scrub_file *sf = &scrub->os_file; + + memset(sf, 0, sizeof(*sf)); + uuid_copy(&sf->sf_uuid, &uuid); + sf->sf_magic = SCRUB_MAGIC_V2; + sf->sf_status = SS_INIT; +} +EXPORT_SYMBOL(scrub_file_init); + +void scrub_file_reset(struct lustre_scrub *scrub, uuid_t uuid, u64 flags) +{ + struct scrub_file *sf = &scrub->os_file; + + CDEBUG(D_LFSCK, "%s: reset OI scrub file, old flags = " + "%#llx, add flags = %#llx\n", + scrub->os_name, sf->sf_flags, flags); + + uuid_copy(&sf->sf_uuid, &uuid); + sf->sf_magic = SCRUB_MAGIC_V2; + sf->sf_status = SS_INIT; 
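+	/*
+	 * Keep the inconsistency flags accumulated so far (the new ones
+	 * are OR-ed in below, SF_AUTO excepted) and clear all progress
+	 * counters so the next scan starts from a clean checkpoint.
+	 */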
+ sf->sf_flags |= flags; + sf->sf_flags &= ~SF_AUTO; + sf->sf_run_time = 0; + sf->sf_time_latest_start = 0; + sf->sf_time_last_checkpoint = 0; + sf->sf_pos_latest_start = 0; + sf->sf_pos_last_checkpoint = 0; + sf->sf_pos_first_inconsistent = 0; + sf->sf_items_checked = 0; + sf->sf_items_updated = 0; + sf->sf_items_failed = 0; + sf->sf_items_noscrub = 0; + sf->sf_items_igif = 0; + if (!scrub->os_in_join) + sf->sf_items_updated_prior = 0; +} +EXPORT_SYMBOL(scrub_file_reset); + +int scrub_file_load(const struct lu_env *env, struct lustre_scrub *scrub) +{ + struct scrub_file *sf = &scrub->os_file; + struct lu_buf buf = { + .lb_buf = &scrub->os_file_disk, + .lb_len = sizeof(scrub->os_file_disk) + }; + loff_t pos = 0; + int rc; + + rc = dt_read(env, scrub->os_obj, &buf, &pos); + /* failure */ + if (rc < 0) { + CERROR("%s: fail to load scrub file: rc = %d\n", + scrub->os_name, rc); + return rc; + } + + /* empty */ + if (!rc) + return -ENOENT; + + /* corrupted */ + if (rc < buf.lb_len) { + CDEBUG(D_LFSCK, "%s: fail to load scrub file, " + "expected = %d: rc = %d\n", + scrub->os_name, (int)buf.lb_len, rc); + return -EFAULT; + } + + scrub_file_to_cpu(sf, &scrub->os_file_disk); + if (sf->sf_magic == SCRUB_MAGIC_V1) { + CWARN("%s: reset scrub OI count for format change (LU-16655)\n", + scrub->os_name); + sf->sf_oi_count = 0; + } else if (sf->sf_magic != SCRUB_MAGIC_V2) { + CDEBUG(D_LFSCK, "%s: invalid scrub magic %#x, should be %#x\n", + scrub->os_name, sf->sf_magic, SCRUB_MAGIC_V2); + return -EFAULT; + } + + return 0; +} +EXPORT_SYMBOL(scrub_file_load); + +int scrub_file_store(const struct lu_env *env, struct lustre_scrub *scrub) +{ + struct scrub_file *sf = &scrub->os_file_disk; + struct dt_object *obj = scrub->os_obj; + struct dt_device *dev = scrub_obj2dev(obj); + struct lu_buf buf = { + .lb_buf = sf, + .lb_len = sizeof(*sf) + }; + struct thandle *th; + loff_t pos = 0; + int rc; + ENTRY; + + /* Skip store under rdonly mode. 
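+	 * Nothing is written back to a read-only device; the in-memory
+	 * state prepared by the callers is left untouched.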
*/ + if (dev->dd_rdonly) + RETURN(0); + + scrub_file_to_le(sf, &scrub->os_file); + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + GOTO(log, rc = PTR_ERR(th)); + + rc = dt_declare_record_write(env, obj, &buf, pos, th); + if (rc) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(stop, rc); + + rc = dt_record_write(env, obj, &buf, &pos, th); + + GOTO(stop, rc); + +stop: + dt_trans_stop(env, dev, th); + +log: + if (rc) + CERROR("%s: store scrub file: rc = %d\n", + scrub->os_name, rc); + else + CDEBUG(D_LFSCK, "%s: store scrub file: rc = %d\n", + scrub->os_name, rc); + + scrub->os_time_last_checkpoint = ktime_get_seconds(); + scrub->os_time_next_checkpoint = scrub->os_time_last_checkpoint + + SCRUB_CHECKPOINT_INTERVAL; + return rc; +} +EXPORT_SYMBOL(scrub_file_store); + +bool scrub_needs_check(struct lustre_scrub *scrub, const struct lu_fid *fid, + u64 index) +{ + bool check = true; + + if (!fid_is_norm(fid) && !fid_is_igif(fid)) + check = false; + else if (scrub->os_running && scrub->os_pos_current > index) + check = false; + else if (scrub->os_auto_scrub_interval == AS_NEVER) + check = false; + else if (ktime_get_real_seconds() < + scrub->os_file.sf_time_last_complete + + scrub->os_auto_scrub_interval) + check = false; + + return check; +} +EXPORT_SYMBOL(scrub_needs_check); + +int scrub_checkpoint(const struct lu_env *env, struct lustre_scrub *scrub) +{ + struct scrub_file *sf = &scrub->os_file; + time64_t now = ktime_get_seconds(); + int rc; + + if (likely(now < scrub->os_time_next_checkpoint || + scrub->os_new_checked == 0)) + return 0; + + CDEBUG(D_LFSCK, "%s: OI scrub checkpoint at pos %llu\n", + scrub->os_name, scrub->os_pos_current); + + down_write(&scrub->os_rwsem); + sf->sf_items_checked += scrub->os_new_checked; + scrub->os_new_checked = 0; + sf->sf_pos_last_checkpoint = scrub->os_pos_current; + sf->sf_time_last_checkpoint = ktime_get_real_seconds(); + sf->sf_run_time += now - scrub->os_time_last_checkpoint; + rc = scrub_file_store(env, scrub); + up_write(&scrub->os_rwsem); + + return rc; +} +EXPORT_SYMBOL(scrub_checkpoint); + +int scrub_thread_prep(const struct lu_env *env, struct lustre_scrub *scrub, + uuid_t uuid, u64 start) +{ + struct scrub_file *sf = &scrub->os_file; + u32 flags = scrub->os_start_flags; + bool drop_dryrun = false; + int rc; + + ENTRY; + CDEBUG(D_LFSCK, "%s: OI scrub prep, flags = 0x%x\n", + scrub->os_name, flags); + + down_write(&scrub->os_rwsem); + if (flags & SS_SET_FAILOUT) + sf->sf_param |= SP_FAILOUT; + else if (flags & SS_CLEAR_FAILOUT) + sf->sf_param &= ~SP_FAILOUT; + + if (flags & SS_SET_DRYRUN) { + sf->sf_param |= SP_DRYRUN; + } else if (flags & SS_CLEAR_DRYRUN && sf->sf_param & SP_DRYRUN) { + sf->sf_param &= ~SP_DRYRUN; + drop_dryrun = true; + } + + if (flags & SS_RESET) + scrub_file_reset(scrub, uuid, 0); + + spin_lock(&scrub->os_lock); + scrub->os_partial_scan = 0; + if (flags & SS_AUTO_FULL) { + scrub->os_full_speed = 1; + sf->sf_flags |= SF_AUTO; + } else if (flags & SS_AUTO_PARTIAL) { + scrub->os_full_speed = 0; + scrub->os_partial_scan = 1; + sf->sf_flags |= SF_AUTO; + } else if (sf->sf_flags & (SF_RECREATED | SF_INCONSISTENT | + SF_UPGRADE)) { + scrub->os_full_speed = 1; + } else { + scrub->os_full_speed = 0; + } + + scrub->os_in_prior = 0; + scrub->os_waiting = 0; + scrub->os_paused = 0; + scrub->os_in_join = 0; + scrub->os_full_scrub = 0; + spin_unlock(&scrub->os_lock); + scrub->os_new_checked = 0; + if (drop_dryrun && sf->sf_pos_first_inconsistent != 0) + sf->sf_pos_latest_start = 
sf->sf_pos_first_inconsistent; + else if (sf->sf_pos_last_checkpoint != 0) + sf->sf_pos_latest_start = sf->sf_pos_last_checkpoint + 1; + else + sf->sf_pos_latest_start = start; + + scrub->os_pos_current = sf->sf_pos_latest_start; + sf->sf_status = SS_SCANNING; + sf->sf_time_latest_start = ktime_get_real_seconds(); + sf->sf_time_last_checkpoint = sf->sf_time_latest_start; + sf->sf_pos_last_checkpoint = sf->sf_pos_latest_start - 1; + rc = scrub_file_store(env, scrub); + if (rc == 0) { + spin_lock(&scrub->os_lock); + scrub->os_running = 1; + spin_unlock(&scrub->os_lock); + wake_up_var(scrub); + } + up_write(&scrub->os_rwsem); + + RETURN(rc); +} +EXPORT_SYMBOL(scrub_thread_prep); + +int scrub_thread_post(const struct lu_env *env, struct lustre_scrub *scrub, + int result) +{ + struct scrub_file *sf = &scrub->os_file; + int rc; + ENTRY; + + CDEBUG(D_LFSCK, "%s: OI scrub post with result = %d\n", + scrub->os_name, result); + + down_write(&scrub->os_rwsem); + spin_lock(&scrub->os_lock); + scrub->os_running = 0; + spin_unlock(&scrub->os_lock); + if (scrub->os_new_checked > 0) { + sf->sf_items_checked += scrub->os_new_checked; + scrub->os_new_checked = 0; + sf->sf_pos_last_checkpoint = scrub->os_pos_current; + } + sf->sf_time_last_checkpoint = ktime_get_real_seconds(); + if (result > 0) { + sf->sf_status = SS_COMPLETED; + if (!(sf->sf_param & SP_DRYRUN)) { + memset(sf->sf_oi_bitmap, 0, SCRUB_OI_BITMAP_SIZE); + sf->sf_flags &= ~(SF_RECREATED | SF_INCONSISTENT | + SF_UPGRADE | SF_AUTO); + } + sf->sf_time_last_complete = sf->sf_time_last_checkpoint; + sf->sf_success_count++; + } else if (result == 0) { + if (scrub->os_paused) + sf->sf_status = SS_PAUSED; + else + sf->sf_status = SS_STOPPED; + } else { + sf->sf_status = SS_FAILED; + } + sf->sf_run_time += ktime_get_seconds() - + scrub->os_time_last_checkpoint; + + rc = scrub_file_store(env, scrub); + up_write(&scrub->os_rwsem); + + RETURN(rc < 0 ? 
rc : result); +} +EXPORT_SYMBOL(scrub_thread_post); + +int scrub_start(int (*threadfn)(void *data), struct lustre_scrub *scrub, + void *data, __u32 flags) +{ + struct task_struct *task; + int rc; + ENTRY; + + if (scrub->os_task) + RETURN(-EALREADY); + + if (scrub->os_file.sf_status == SS_COMPLETED) { + if (!(flags & SS_SET_FAILOUT)) + flags |= SS_CLEAR_FAILOUT; + + if (!(flags & SS_SET_DRYRUN)) + flags |= SS_CLEAR_DRYRUN; + + flags |= SS_RESET; + } + + task = kthread_create(threadfn, data, "OI_scrub"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("%s: cannot start iteration thread: rc = %d\n", + scrub->os_name, rc); + RETURN(rc); + } + spin_lock(&scrub->os_lock); + if (scrub->os_task) { + /* Lost a race */ + spin_unlock(&scrub->os_lock); + kthread_stop(task); + RETURN(-EALREADY); + } + scrub->os_start_flags = flags; + scrub->os_task = task; + wake_up_process(task); + spin_unlock(&scrub->os_lock); + wait_var_event(scrub, scrub->os_running || !scrub->os_task); + + RETURN(0); +} +EXPORT_SYMBOL(scrub_start); + +void scrub_stop(struct lustre_scrub *scrub) +{ + struct task_struct *task; + + spin_lock(&scrub->os_lock); + scrub->os_running = 0; + spin_unlock(&scrub->os_lock); + task = xchg(&scrub->os_task, NULL); + if (task) + kthread_stop(task); +} +EXPORT_SYMBOL(scrub_stop); + +const char *const scrub_status_names[] = { + "init", + "scanning", + "completed", + "failed", + "stopped", + "paused", + "crashed", + NULL +}; + +const char *const scrub_flags_names[] = { + "recreated", + "inconsistent", + "auto", + "upgrade", + NULL +}; + +const char *const scrub_param_names[] = { + "failout", + "dryrun", + NULL +}; + +static void scrub_bits_dump(struct seq_file *m, int bits, + const char *const names[], + const char *prefix) +{ + int flag; + int i; + + seq_printf(m, "%s:%c", prefix, bits != 0 ? ' ' : '\n'); + + for (i = 0, flag = 1; bits != 0; i++, flag = BIT(i)) { + if (flag & bits) { + bits &= ~flag; + seq_printf(m, "%s%c", names[i], + bits != 0 ? 
',' : '\n'); + } + } +} + +static void scrub_time_dump(struct seq_file *m, time64_t time, + const char *prefix) +{ + if (time != 0) + seq_printf(m, "%s: %llu seconds\n", prefix, + ktime_get_real_seconds() - time); + else + seq_printf(m, "%s: N/A\n", prefix); +} + +static void scrub_pos_dump(struct seq_file *m, __u64 pos, const char *prefix) +{ + if (pos != 0) + seq_printf(m, "%s: %llu\n", prefix, pos); + else + seq_printf(m, "%s: N/A\n", prefix); +} + +void scrub_dump(struct seq_file *m, struct lustre_scrub *scrub) +{ + struct scrub_file *sf = &scrub->os_file; + u64 checked; + s64 speed; + + down_read(&scrub->os_rwsem); + seq_printf(m, "name: OI_scrub\n" + "magic: 0x%x\n" + "oi_files: %d\n" + "status: %s\n", + sf->sf_magic, (int)sf->sf_oi_count, + scrub_status_names[sf->sf_status]); + + scrub_bits_dump(m, sf->sf_flags, scrub_flags_names, "flags"); + + scrub_bits_dump(m, sf->sf_param, scrub_param_names, "param"); + + scrub_time_dump(m, sf->sf_time_last_complete, + "time_since_last_completed"); + + scrub_time_dump(m, sf->sf_time_latest_start, + "time_since_latest_start"); + + scrub_time_dump(m, sf->sf_time_last_checkpoint, + "time_since_last_checkpoint"); + + scrub_pos_dump(m, sf->sf_pos_latest_start, + "latest_start_position"); + + scrub_pos_dump(m, sf->sf_pos_last_checkpoint, + "last_checkpoint_position"); + + scrub_pos_dump(m, sf->sf_pos_first_inconsistent, + "first_failure_position"); + + checked = sf->sf_items_checked + scrub->os_new_checked; + seq_printf(m, "checked: %llu\n" + "%s: %llu\n" + "failed: %llu\n" + "prior_%s: %llu\n" + "noscrub: %llu\n" + "igif: %llu\n" + "success_count: %u\n", + checked, + sf->sf_param & SP_DRYRUN ? "inconsistent" : "updated", + sf->sf_items_updated, sf->sf_items_failed, + sf->sf_param & SP_DRYRUN ? "inconsistent" : "updated", + sf->sf_items_updated_prior, sf->sf_items_noscrub, + sf->sf_items_igif, sf->sf_success_count); + + speed = checked; + if (scrub->os_running) { + s64 new_checked = scrub->os_new_checked; + time64_t duration; + time64_t rtime; + + /* Since the time resolution is in seconds for new system + * or small devices it ismore likely that duration will be + * zero which will lead to inaccurate results. + */ + duration = ktime_get_seconds() - + scrub->os_time_last_checkpoint; + if (duration != 0) + new_checked = div_s64(new_checked, duration); + + rtime = sf->sf_run_time + duration; + if (rtime != 0) + speed = div_s64(speed, rtime); + + seq_printf(m, "run_time: %lld seconds\n" + "average_speed: %lld objects/sec\n" + "real_time_speed: %lld objects/sec\n" + "current_position: %llu\n" + "scrub_in_prior: %s\n" + "scrub_full_speed: %s\n" + "partial_scan: %s\n", + rtime, speed, new_checked, + scrub->os_pos_current, + scrub->os_in_prior ? "yes" : "no", + scrub->os_full_speed ? "yes" : "no", + scrub->os_partial_scan ? 
"yes" : "no"); + } else { + if (sf->sf_run_time != 0) + speed = div_s64(speed, sf->sf_run_time); + seq_printf(m, "run_time: %d seconds\n" + "average_speed: %lld objects/sec\n" + "real_time_speed: N/A\n" + "current_position: N/A\n", + sf->sf_run_time, speed); + } + + up_read(&scrub->os_rwsem); +} +EXPORT_SYMBOL(scrub_dump); + +int lustre_liru_new(struct list_head *head, const struct lu_fid *pfid, + const struct lu_fid *cfid, __u64 child, + const char *name, int namelen) +{ + struct lustre_index_restore_unit *liru; + int len = sizeof(*liru) + namelen + 1; + + OBD_ALLOC(liru, len); + if (!liru) + return -ENOMEM; + + INIT_LIST_HEAD(&liru->liru_link); + liru->liru_pfid = *pfid; + liru->liru_cfid = *cfid; + liru->liru_clid = child; + liru->liru_len = len; + memcpy(liru->liru_name, name, namelen); + liru->liru_name[namelen] = 0; + list_add_tail(&liru->liru_link, head); + + return 0; +} +EXPORT_SYMBOL(lustre_liru_new); + +int lustre_index_register(struct dt_device *dev, const char *devname, + struct list_head *head, spinlock_t *lock, int *guard, + const struct lu_fid *fid, + __u32 keysize, __u32 recsize) +{ + struct lustre_index_backup_unit *libu, *pos; + int rc = 0; + ENTRY; + + if (dev->dd_rdonly || *guard) + RETURN(1); + + OBD_ALLOC_PTR(libu); + if (!libu) + RETURN(-ENOMEM); + + INIT_LIST_HEAD(&libu->libu_link); + libu->libu_keysize = keysize; + libu->libu_recsize = recsize; + libu->libu_fid = *fid; + + spin_lock(lock); + if (unlikely(*guard)) { + spin_unlock(lock); + OBD_FREE_PTR(libu); + + RETURN(1); + } + + list_for_each_entry_reverse(pos, head, libu_link) { + rc = lu_fid_cmp(&pos->libu_fid, fid); + if (rc < 0) { + list_add(&libu->libu_link, &pos->libu_link); + spin_unlock(lock); + + RETURN(0); + } + + if (!rc) { + /* Registered already. But the former registered one + * has different keysize/recsize. It may because that + * the former values are from disk and corrupted, then + * replace it with new values. */ + if (unlikely(keysize != pos->libu_keysize || + recsize != pos->libu_recsize)) { + CWARN("%s: the index "DFID" has registered " + "with %u/%u, may be invalid, replace " + "with %u/%u\n", + devname, PFID(fid), pos->libu_keysize, + pos->libu_recsize, keysize, recsize); + + pos->libu_keysize = keysize; + pos->libu_recsize = recsize; + } else { + rc = 1; + } + + spin_unlock(lock); + OBD_FREE_PTR(libu); + + RETURN(rc); + } + } + + list_add(&libu->libu_link, head); + spin_unlock(lock); + + RETURN(0); +} +EXPORT_SYMBOL(lustre_index_register); + +static void lustre_index_degister(struct list_head *head, spinlock_t *lock, + const struct lu_fid *fid) +{ + struct lustre_index_backup_unit *libu; + int rc = -ENOENT; + + spin_lock(lock); + list_for_each_entry_reverse(libu, head, libu_link) { + rc = lu_fid_cmp(&libu->libu_fid, fid); + /* NOT registered. 
*/ + if (rc < 0) + break; + + if (!rc) { + list_del(&libu->libu_link); + break; + } + } + spin_unlock(lock); + + if (!rc) + OBD_FREE_PTR(libu); +} + +static void +lustre_index_backup_make_header(struct lustre_index_backup_header *header, + __u32 keysize, __u32 recsize, + const struct lu_fid *fid, __u32 count) +{ + memset(header, 0, sizeof(*header)); + header->libh_magic = cpu_to_le32(INDEX_BACKUP_MAGIC_V1); + header->libh_count = cpu_to_le32(count); + header->libh_keysize = cpu_to_le32(keysize); + header->libh_recsize = cpu_to_le32(recsize); + fid_cpu_to_le(&header->libh_owner, fid); +} + +static int lustre_index_backup_body(const struct lu_env *env, + struct dt_object *obj, loff_t *pos, + void *buf, int bufsize) +{ + struct dt_device *dev = lu2dt_dev(obj->do_lu.lo_dev); + struct thandle *th; + struct lu_buf lbuf = { + .lb_buf = buf, + .lb_len = bufsize + }; + int rc; + ENTRY; + + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = dt_declare_record_write(env, obj, &lbuf, *pos, th); + if (rc) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(stop, rc); + + rc = dt_record_write(env, obj, &lbuf, pos, th); + + GOTO(stop, rc); + +stop: + dt_trans_stop(env, dev, th); + return rc; +} + +static int lustre_index_backup_header(const struct lu_env *env, + struct dt_object *obj, + const struct lu_fid *tgt_fid, + __u32 keysize, __u32 recsize, + void *buf, int bufsize, int count) +{ + struct dt_device *dev = lu2dt_dev(obj->do_lu.lo_dev); + struct lustre_index_backup_header *header = buf; + struct lu_attr *la = buf; + struct thandle *th; + struct lu_buf lbuf = { + .lb_buf = header, + .lb_len = sizeof(*header) + }; + loff_t size = sizeof(*header) + (keysize + recsize) * count; + loff_t pos = 0; + int rc; + bool punch = false; + ENTRY; + + LASSERT(sizeof(*la) <= bufsize); + LASSERT(sizeof(*header) <= bufsize); + + rc = dt_attr_get(env, obj, la); + if (rc) + RETURN(rc); + + if (la->la_size > size) + punch = true; + + lustre_index_backup_make_header(header, keysize, recsize, + tgt_fid, count); + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = dt_declare_record_write(env, obj, &lbuf, pos, th); + if (rc) + GOTO(stop, rc); + + if (punch) { + rc = dt_declare_punch(env, obj, size, OBD_OBJECT_EOF, th); + if (rc) + GOTO(stop, rc); + } + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(stop, rc); + + rc = dt_record_write(env, obj, &lbuf, &pos, th); + if (!rc && punch) + rc = dt_punch(env, obj, size, OBD_OBJECT_EOF, th); + + GOTO(stop, rc); + +stop: + dt_trans_stop(env, dev, th); + return rc; +} + +static int lustre_index_update_lma(const struct lu_env *env, + struct dt_object *obj, + void *buf, int bufsize) +{ + struct dt_device *dev = lu2dt_dev(obj->do_lu.lo_dev); + struct lustre_mdt_attrs *lma = buf; + struct lu_buf lbuf = { + .lb_buf = lma, + .lb_len = sizeof(struct lustre_ost_attrs) + }; + struct thandle *th; + int fl = LU_XATTR_REPLACE; + int rc; + ENTRY; + + LASSERT(bufsize >= lbuf.lb_len); + + rc = dt_xattr_get(env, obj, &lbuf, XATTR_NAME_LMA); + if (unlikely(rc == -ENODATA)) { + fl = LU_XATTR_CREATE; + lustre_lma_init(lma, lu_object_fid(&obj->do_lu), + LMAC_IDX_BACKUP, 0); + rc = sizeof(*lma); + } else if (rc < sizeof(*lma)) { + RETURN(rc < 0 ? 
rc : -EFAULT); + } else { + lustre_lma_swab(lma); + if (lma->lma_compat & LMAC_IDX_BACKUP) + RETURN(0); + + lma->lma_compat |= LMAC_IDX_BACKUP; + } + + lustre_lma_swab(lma); + lbuf.lb_len = rc; + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + RETURN(rc); + + rc = dt_declare_xattr_set(env, obj, &lbuf, XATTR_NAME_LMA, fl, th); + if (rc) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(stop, rc); + + rc = dt_xattr_set(env, obj, &lbuf, XATTR_NAME_LMA, fl, th); + + GOTO(stop, rc); + +stop: + dt_trans_stop(env, dev, th); + return rc; +} + +static int lustre_index_backup_one(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + struct lustre_index_backup_unit *libu, + char *buf, int bufsize) +{ + struct dt_device *dev = scrub_obj2dev(parent); + struct dt_object *tgt_obj = NULL; + struct dt_object *bak_obj = NULL; + const struct dt_it_ops *iops; + struct dt_it *di; + loff_t pos = sizeof(struct lustre_index_backup_header); + int count = 0; + int size = 0; + int rc; + ENTRY; + + tgt_obj = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, + &libu->libu_fid, NULL)); + if (IS_ERR_OR_NULL(tgt_obj)) + GOTO(out, rc = tgt_obj ? PTR_ERR(tgt_obj) : -ENOENT); + + if (!dt_object_exists(tgt_obj)) + GOTO(out, rc = 0); + + if (!tgt_obj->do_index_ops) { + struct dt_index_features feat; + + feat.dif_flags = DT_IND_UPDATE; + feat.dif_keysize_min = libu->libu_keysize; + feat.dif_keysize_max = libu->libu_keysize; + feat.dif_recsize_min = libu->libu_recsize; + feat.dif_recsize_max = libu->libu_recsize; + feat.dif_ptrsize = 4; + rc = tgt_obj->do_ops->do_index_try(env, tgt_obj, &feat); + if (rc) + GOTO(out, rc); + } + + lustre_fid2lbx(buf, &libu->libu_fid, bufsize); + bak_obj = local_file_find_or_create(env, los, parent, buf, + S_IFREG | S_IRUGO | S_IWUSR); + if (IS_ERR_OR_NULL(bak_obj)) + GOTO(out, rc = bak_obj ? 
PTR_ERR(bak_obj) : -ENOENT); + + iops = &tgt_obj->do_index_ops->dio_it; + di = iops->init(env, tgt_obj, 0); + if (IS_ERR(di)) + GOTO(out, rc = PTR_ERR(di)); + + rc = iops->load(env, di, 0); + if (!rc) + rc = iops->next(env, di); + else if (rc > 0) + rc = 0; + + while (!rc) { + void *key; + void *rec; + + key = iops->key(env, di); + memcpy(&buf[size], key, libu->libu_keysize); + size += libu->libu_keysize; + rec = &buf[size]; + rc = iops->rec(env, di, rec, 0); + if (rc) + GOTO(fini, rc); + + size += libu->libu_recsize; + count++; + if (size + libu->libu_keysize + libu->libu_recsize > bufsize) { + rc = lustre_index_backup_body(env, bak_obj, &pos, + buf, size); + if (rc) + GOTO(fini, rc); + + size = 0; + } + + rc = iops->next(env, di); + } + + if (rc >= 0 && size > 0) + rc = lustre_index_backup_body(env, bak_obj, &pos, buf, size); + + if (rc < 0) + GOTO(fini, rc); + + rc = lustre_index_backup_header(env, bak_obj, &libu->libu_fid, + libu->libu_keysize, libu->libu_recsize, + buf, bufsize, count); + if (!rc) + rc = lustre_index_update_lma(env, tgt_obj, buf, bufsize); + + if (!rc && OBD_FAIL_CHECK(OBD_FAIL_OSD_INDEX_CRASH)) { + LASSERT(bufsize >= 512); + + pos = 0; + memset(buf, 0, 512); + lustre_index_backup_body(env, tgt_obj, &pos, buf, 512); + } + + GOTO(fini, rc); + +fini: + iops->fini(env, di); +out: + if (!IS_ERR_OR_NULL(tgt_obj)) + dt_object_put_nocache(env, tgt_obj); + if (!IS_ERR_OR_NULL(bak_obj)) + dt_object_put_nocache(env, bak_obj); + return rc; +} + +void lustre_index_backup(const struct lu_env *env, struct dt_device *dev, + const char *devname, struct list_head *head, + spinlock_t *lock, int *guard, bool backup) +{ + struct lustre_index_backup_unit *libu; + struct local_oid_storage *los = NULL; + struct dt_object *parent = NULL; + char *buf = NULL; + struct lu_fid fid; + int rc; + ENTRY; + + if (dev->dd_rdonly || *guard) + RETURN_EXIT; + + spin_lock(lock); + *guard = 1; + spin_unlock(lock); + + if (list_empty(head)) + RETURN_EXIT; + + /* Handle kinds of failures during mount process. */ + if (!dev->dd_lu_dev.ld_site || !dev->dd_lu_dev.ld_site->ls_top_dev) + backup = false; + + if (backup) { + OBD_ALLOC_LARGE(buf, INDEX_BACKUP_BUFSIZE); + if (!buf) { + backup = false; + goto scan; + } + + lu_local_obj_fid(&fid, INDEX_BACKUP_OID); + parent = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, + &fid, NULL)); + if (IS_ERR_OR_NULL(parent)) { + CERROR("%s: failed to locate backup dir: rc = %ld\n", + devname, parent ? 
PTR_ERR(parent) : -ENOENT); + backup = false; + goto scan; + } + + lu_local_name_obj_fid(&fid, 1); + rc = local_oid_storage_init(env, dev, &fid, &los); + if (rc) { + CERROR("%s: failed to init local storage: rc = %d\n", + devname, rc); + backup = false; + } + } + +scan: + spin_lock(lock); + while (!list_empty(head)) { + libu = list_entry(head->next, + struct lustre_index_backup_unit, libu_link); + list_del_init(&libu->libu_link); + spin_unlock(lock); + + if (backup) { + rc = lustre_index_backup_one(env, los, parent, libu, + buf, INDEX_BACKUP_BUFSIZE); + CDEBUG(D_WARNING, "%s: backup index "DFID": rc = %d\n", + devname, PFID(&libu->libu_fid), rc); + } + + OBD_FREE_PTR(libu); + spin_lock(lock); + } + spin_unlock(lock); + + if (los) + local_oid_storage_fini(env, los); + if (parent) + dt_object_put_nocache(env, parent); + if (buf) + OBD_FREE_LARGE(buf, INDEX_BACKUP_BUFSIZE); + + EXIT; +} +EXPORT_SYMBOL(lustre_index_backup); + +int lustre_index_restore(const struct lu_env *env, struct dt_device *dev, + const struct lu_fid *parent_fid, + const struct lu_fid *tgt_fid, + const struct lu_fid *bak_fid, const char *name, + struct list_head *head, spinlock_t *lock, + char *buf, int bufsize) +{ + struct dt_object *parent_obj = NULL; + struct dt_object *tgt_obj = NULL; + struct dt_object *bak_obj = NULL; + struct lustre_index_backup_header *header; + struct dt_index_features *feat; + struct dt_object_format *dof; + struct lu_attr *la; + struct thandle *th; + struct lu_object_conf conf; + struct dt_insert_rec ent; + struct lu_buf lbuf; + struct lu_fid tfid; + loff_t pos = 0; + __u32 keysize; + __u32 recsize; + __u32 pairsize; + int count; + int rc; + bool registered = false; + ENTRY; + + LASSERT(bufsize >= sizeof(*la) + sizeof(*dof) + + sizeof(*feat) + sizeof(*header)); + + memset(buf, 0, bufsize); + la = (struct lu_attr *)buf; + dof = (void *)la + sizeof(*la); + feat = (void *)dof + sizeof(*dof); + header = (void *)feat + sizeof(*feat); + lbuf.lb_buf = header; + lbuf.lb_len = sizeof(*header); + + tgt_obj = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, + tgt_fid, NULL)); + if (IS_ERR_OR_NULL(tgt_obj)) + GOTO(out, rc = tgt_obj ? PTR_ERR(tgt_obj) : -ENOENT); + + bak_obj = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, + bak_fid, NULL)); + if (IS_ERR_OR_NULL(bak_obj)) + GOTO(out, rc = bak_obj ? PTR_ERR(bak_obj) : -ENOENT); + + if (!dt_object_exists(bak_obj)) + GOTO(out, rc = -ENOENT); + + parent_obj = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, + parent_fid, NULL)); + if (IS_ERR_OR_NULL(parent_obj)) + GOTO(out, rc = parent_obj ? PTR_ERR(parent_obj) : -ENOENT); + + LASSERT(dt_object_exists(parent_obj)); + + if (unlikely(!dt_try_as_dir(env, parent_obj))) + GOTO(out, rc = -ENOTDIR); + + rc = dt_attr_get(env, tgt_obj, la); + if (rc) + GOTO(out, rc); + + rc = dt_record_read(env, bak_obj, &lbuf, &pos); + if (rc) + GOTO(out, rc); + + if (le32_to_cpu(header->libh_magic) != INDEX_BACKUP_MAGIC_V1) + GOTO(out, rc = -EINVAL); + + fid_le_to_cpu(&tfid, &header->libh_owner); + if (unlikely(!lu_fid_eq(tgt_fid, &tfid))) + GOTO(out, rc = -EINVAL); + + keysize = le32_to_cpu(header->libh_keysize); + recsize = le32_to_cpu(header->libh_recsize); + pairsize = keysize + recsize; + + memset(feat, 0, sizeof(*feat)); + feat->dif_flags = DT_IND_UPDATE; + feat->dif_keysize_min = feat->dif_keysize_max = keysize; + feat->dif_recsize_min = feat->dif_recsize_max = recsize; + feat->dif_ptrsize = 4; + + /* T1: remove old name entry and destroy old index. 
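+	 * The restore is split into separate transactions: T1 drops the
+	 * stale index, T2 recreates it empty, then each batch of
+	 * backed-up records is replayed in its own transaction (Tn).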
*/ + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = dt_declare_delete(env, parent_obj, + (const struct dt_key *)name, th); + if (rc) + GOTO(stop, rc); + + rc = dt_declare_ref_del(env, tgt_obj, th); + if (rc) + GOTO(stop, rc); + + rc = dt_declare_destroy(env, tgt_obj, th); + if (rc) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(stop, rc); + + rc = dt_delete(env, parent_obj, (const struct dt_key *)name, th); + if (rc) + GOTO(stop, rc); + + dt_write_lock(env, tgt_obj, 0); + rc = dt_ref_del(env, tgt_obj, th); + if (rc == 0) { + if (S_ISDIR(tgt_obj->do_lu.lo_header->loh_attr)) + dt_ref_del(env, tgt_obj, th); + rc = dt_destroy(env, tgt_obj, th); + } + dt_write_unlock(env, tgt_obj); + dt_trans_stop(env, dev, th); + if (rc) + GOTO(out, rc); + + la->la_valid = LA_MODE | LA_UID | LA_GID; + conf.loc_flags = LOC_F_NEW; + dof->u.dof_idx.di_feat = feat; + dof->dof_type = DFT_INDEX; + ent.rec_type = S_IFREG; + ent.rec_fid = tgt_fid; + + /* Drop cache before re-create it. */ + dt_object_put_nocache(env, tgt_obj); + tgt_obj = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, + tgt_fid, &conf)); + if (IS_ERR_OR_NULL(tgt_obj)) + GOTO(out, rc = tgt_obj ? PTR_ERR(tgt_obj) : -ENOENT); + + LASSERT(!dt_object_exists(tgt_obj)); + + /* T2: create new index and insert new name entry. */ + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = dt_declare_create(env, tgt_obj, la, NULL, dof, th); + if (rc) + GOTO(stop, rc); + + rc = dt_declare_insert(env, parent_obj, (const struct dt_rec *)&ent, + (const struct dt_key *)name, th); + if (rc) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(stop, rc); + + dt_write_lock(env, tgt_obj, 0); + rc = dt_create(env, tgt_obj, la, NULL, dof, th); + dt_write_unlock(env, tgt_obj); + if (rc) + GOTO(stop, rc); + + rc = dt_insert(env, parent_obj, (const struct dt_rec *)&ent, + (const struct dt_key *)name, th); + dt_trans_stop(env, dev, th); + /* Some index name may has been inserted by OSD + * automatically when create the index object. */ + if (unlikely(rc == -EEXIST)) + rc = 0; + if (rc) + GOTO(out, rc); + + /* The new index will register via index_try. */ + rc = tgt_obj->do_ops->do_index_try(env, tgt_obj, feat); + if (rc) + GOTO(out, rc); + + registered = true; + count = le32_to_cpu(header->libh_count); + while (!rc && count > 0) { + int size = pairsize * count; + int items = count; + int i; + + if (size > bufsize) { + items = bufsize / pairsize; + size = pairsize * items; + } + + lbuf.lb_buf = buf; + lbuf.lb_len = size; + rc = dt_record_read(env, bak_obj, &lbuf, &pos); + for (i = 0; i < items && !rc; i++) { + void *key = &buf[i * pairsize]; + void *rec = &buf[i * pairsize + keysize]; + + /* Tn: restore the records. */ + th = dt_trans_create(env, dev); + if (!th) + GOTO(out, rc = -ENOMEM); + + rc = dt_declare_insert(env, tgt_obj, rec, key, th); + if (rc) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(stop, rc); + + rc = dt_insert(env, tgt_obj, rec, key, th); + if (unlikely(rc == -EEXIST)) + rc = 0; + + dt_trans_stop(env, dev, th); + } + + count -= items; + } + + GOTO(out, rc); + +stop: + dt_trans_stop(env, dev, th); + if (rc && registered) + /* Degister the index to avoid overwriting the backup. 
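+		 * otherwise a later lustre_index_backup() pass could
+		 * dump the half-restored index over the good copy.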
*/ + lustre_index_degister(head, lock, tgt_fid); + +out: + if (!IS_ERR_OR_NULL(tgt_obj)) + dt_object_put_nocache(env, tgt_obj); + if (!IS_ERR_OR_NULL(bak_obj)) + dt_object_put_nocache(env, bak_obj); + if (!IS_ERR_OR_NULL(parent_obj)) + dt_object_put_nocache(env, parent_obj); + return rc; +} +EXPORT_SYMBOL(lustre_index_restore); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/statfs_pack.c b/drivers/staging/lustrefsx/lustre/obdclass/statfs_pack.c new file mode 100644 index 0000000000000..3f0c50e2c32cb --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/statfs_pack.c @@ -0,0 +1,72 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/statfs_pack.c + * + * (Un)packing of OST/MDS requests + * + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include + +void statfs_pack(struct obd_statfs *osfs, struct kstatfs *sfs) +{ + memset(osfs, 0, sizeof(*osfs)); + osfs->os_type = sfs->f_type; + osfs->os_blocks = sfs->f_blocks; + osfs->os_bfree = sfs->f_bfree; + osfs->os_bavail = sfs->f_bavail; + osfs->os_files = sfs->f_files; + osfs->os_ffree = sfs->f_ffree; + osfs->os_bsize = sfs->f_bsize; + osfs->os_namelen = sfs->f_namelen; +} +EXPORT_SYMBOL(statfs_pack); + +void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs) +{ + memset(sfs, 0, sizeof(*sfs)); + sfs->f_type = osfs->os_type; + sfs->f_blocks = osfs->os_blocks; + sfs->f_bfree = osfs->os_bfree; + sfs->f_bavail = osfs->os_bavail; + sfs->f_files = osfs->os_files; + sfs->f_ffree = osfs->os_ffree; + sfs->f_bsize = osfs->os_bsize; + sfs->f_namelen = osfs->os_namelen; +} +EXPORT_SYMBOL(statfs_unpack); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/upcall_cache.c b/drivers/staging/lustrefsx/lustre/obdclass/upcall_cache.c new file mode 100644 index 0000000000000..94a150b266a17 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/upcall_cache.c @@ -0,0 +1,454 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/upcall_cache.c + * + * Supplementary groups cache. + */ +#define DEBUG_SUBSYSTEM S_SEC + +#include +#include +#include + +static struct upcall_cache_entry *alloc_entry(struct upcall_cache *cache, + __u64 key, void *args) +{ + struct upcall_cache_entry *entry; + + LIBCFS_ALLOC(entry, sizeof(*entry)); + if (!entry) + return NULL; + + UC_CACHE_SET_NEW(entry); + INIT_LIST_HEAD(&entry->ue_hash); + entry->ue_key = key; + atomic_set(&entry->ue_refcount, 0); + init_waitqueue_head(&entry->ue_waitq); + if (cache->uc_ops->init_entry) + cache->uc_ops->init_entry(entry, args); + return entry; +} + +/* protected by cache lock */ +static void free_entry(struct upcall_cache *cache, + struct upcall_cache_entry *entry) +{ + if (cache->uc_ops->free_entry) + cache->uc_ops->free_entry(cache, entry); + + list_del(&entry->ue_hash); + CDEBUG(D_OTHER, "destroy cache entry %p for key %llu\n", + entry, entry->ue_key); + LIBCFS_FREE(entry, sizeof(*entry)); +} + +static inline int upcall_compare(struct upcall_cache *cache, + struct upcall_cache_entry *entry, + __u64 key, void *args) +{ + if (entry->ue_key != key) + return -1; + + if (cache->uc_ops->upcall_compare) + return cache->uc_ops->upcall_compare(cache, entry, key, args); + + return 0; +} + +static inline int downcall_compare(struct upcall_cache *cache, + struct upcall_cache_entry *entry, + __u64 key, void *args) +{ + if (entry->ue_key != key) + return -1; + + if (cache->uc_ops->downcall_compare) + return cache->uc_ops->downcall_compare(cache, entry, key, args); + + return 0; +} + +static inline void get_entry(struct upcall_cache_entry *entry) +{ + atomic_inc(&entry->ue_refcount); +} + +static inline void put_entry(struct upcall_cache *cache, + struct upcall_cache_entry *entry) +{ + if (atomic_dec_and_test(&entry->ue_refcount) && + (UC_CACHE_IS_INVALID(entry) || UC_CACHE_IS_EXPIRED(entry))) { + free_entry(cache, entry); + } +} + +static int check_unlink_entry(struct upcall_cache *cache, + struct upcall_cache_entry *entry) +{ + time64_t now = ktime_get_seconds(); + + if (UC_CACHE_IS_VALID(entry) && now < entry->ue_expire) + return 0; + + if (UC_CACHE_IS_ACQUIRING(entry)) { + if (entry->ue_acquire_expire == 0 || + now < entry->ue_acquire_expire) + return 0; + + UC_CACHE_SET_EXPIRED(entry); + wake_up(&entry->ue_waitq); + } else if (!UC_CACHE_IS_INVALID(entry)) { + UC_CACHE_SET_EXPIRED(entry); + } + + list_del_init(&entry->ue_hash); + if (!atomic_read(&entry->ue_refcount)) + free_entry(cache, entry); + return 1; +} + +static inline int refresh_entry(struct upcall_cache *cache, + struct upcall_cache_entry *entry) +{ + LASSERT(cache->uc_ops->do_upcall); + return cache->uc_ops->do_upcall(cache, entry); +} + +struct upcall_cache_entry *upcall_cache_get_entry(struct upcall_cache *cache, + __u64 key, void *args) +{ + struct upcall_cache_entry *entry = NULL, *new = NULL, *next; + bool failedacquiring = false; + struct list_head *head; + wait_queue_entry_t wait; + int rc, found; + 
ENTRY; + + LASSERT(cache); + + head = &cache->uc_hashtable[UC_CACHE_HASH_INDEX(key)]; +find_again: + found = 0; + spin_lock(&cache->uc_lock); + list_for_each_entry_safe(entry, next, head, ue_hash) { + /* check invalid & expired items */ + if (check_unlink_entry(cache, entry)) + continue; + if (upcall_compare(cache, entry, key, args) == 0) { + found = 1; + break; + } + } + + if (!found) { + if (!new) { + spin_unlock(&cache->uc_lock); + new = alloc_entry(cache, key, args); + if (!new) { + CERROR("fail to alloc entry\n"); + RETURN(ERR_PTR(-ENOMEM)); + } + goto find_again; + } else { + list_add(&new->ue_hash, head); + entry = new; + } + } else { + if (new) { + free_entry(cache, new); + new = NULL; + } + list_move(&entry->ue_hash, head); + } + get_entry(entry); + + /* acquire for new one */ + if (UC_CACHE_IS_NEW(entry)) { + UC_CACHE_SET_ACQUIRING(entry); + UC_CACHE_CLEAR_NEW(entry); + spin_unlock(&cache->uc_lock); + rc = refresh_entry(cache, entry); + spin_lock(&cache->uc_lock); + entry->ue_acquire_expire = ktime_get_seconds() + + cache->uc_acquire_expire; + if (rc < 0) { + UC_CACHE_CLEAR_ACQUIRING(entry); + UC_CACHE_SET_INVALID(entry); + wake_up(&entry->ue_waitq); + if (unlikely(rc == -EREMCHG)) { + put_entry(cache, entry); + GOTO(out, entry = ERR_PTR(rc)); + } + } + } + /* someone (and only one) is doing upcall upon this item, + * wait it to complete */ + if (UC_CACHE_IS_ACQUIRING(entry)) { + long expiry = (entry == new) ? + cfs_time_seconds(cache->uc_acquire_expire) : + MAX_SCHEDULE_TIMEOUT; + long left; + + init_wait(&wait); + add_wait_queue(&entry->ue_waitq, &wait); + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock(&cache->uc_lock); + + left = schedule_timeout(expiry); + + spin_lock(&cache->uc_lock); + remove_wait_queue(&entry->ue_waitq, &wait); + if (UC_CACHE_IS_ACQUIRING(entry)) { + /* we're interrupted or upcall failed in the middle */ + rc = left > 0 ? -EINTR : -ETIMEDOUT; + CERROR("acquire for key %llu: error %d\n", + entry->ue_key, rc); + put_entry(cache, entry); + if (!failedacquiring) { + spin_unlock(&cache->uc_lock); + failedacquiring = true; + new = NULL; + goto find_again; + } + GOTO(out, entry = ERR_PTR(rc)); + } + } + + /* invalid means error, don't need to try again */ + if (UC_CACHE_IS_INVALID(entry)) { + put_entry(cache, entry); + GOTO(out, entry = ERR_PTR(-EIDRM)); + } + + /* check expired + * We can't refresh the existing one because some + * memory might be shared by multiple processes. + */ + if (check_unlink_entry(cache, entry)) { + /* if expired, try again. but if this entry is + * created by me but too quickly turn to expired + * without any error, should at least give a + * chance to use it once. 
+ */ + if (entry != new) { + put_entry(cache, entry); + spin_unlock(&cache->uc_lock); + new = NULL; + goto find_again; + } + } + + /* Now we know it's good */ +out: + spin_unlock(&cache->uc_lock); + RETURN(entry); +} +EXPORT_SYMBOL(upcall_cache_get_entry); + +void upcall_cache_put_entry(struct upcall_cache *cache, + struct upcall_cache_entry *entry) +{ + ENTRY; + + if (!entry) { + EXIT; + return; + } + + LASSERT(atomic_read(&entry->ue_refcount) > 0); + spin_lock(&cache->uc_lock); + put_entry(cache, entry); + spin_unlock(&cache->uc_lock); + EXIT; +} +EXPORT_SYMBOL(upcall_cache_put_entry); + +int upcall_cache_downcall(struct upcall_cache *cache, __u32 err, __u64 key, + void *args) +{ + struct upcall_cache_entry *entry = NULL; + struct list_head *head; + int found = 0, rc = 0; + ENTRY; + + LASSERT(cache); + + head = &cache->uc_hashtable[UC_CACHE_HASH_INDEX(key)]; + + spin_lock(&cache->uc_lock); + list_for_each_entry(entry, head, ue_hash) { + if (downcall_compare(cache, entry, key, args) == 0) { + found = 1; + get_entry(entry); + break; + } + } + + if (!found) { + CDEBUG(D_OTHER, "%s: upcall for key %llu not expected\n", + cache->uc_name, key); + /* haven't found, it's possible */ + spin_unlock(&cache->uc_lock); + RETURN(-EINVAL); + } + + if (err) { + CDEBUG(D_OTHER, "%s: upcall for key %llu returned %d\n", + cache->uc_name, entry->ue_key, err); + GOTO(out, rc = -EINVAL); + } + + if (!UC_CACHE_IS_ACQUIRING(entry)) { + CDEBUG(D_RPCTRACE, "%s: found uptodate entry %p (key %llu)" + "\n", cache->uc_name, entry, entry->ue_key); + GOTO(out, rc = 0); + } + + if (UC_CACHE_IS_INVALID(entry) || UC_CACHE_IS_EXPIRED(entry)) { + CERROR("%s: found a stale entry %p (key %llu) in ioctl\n", + cache->uc_name, entry, entry->ue_key); + GOTO(out, rc = -EINVAL); + } + + spin_unlock(&cache->uc_lock); + if (cache->uc_ops->parse_downcall) + rc = cache->uc_ops->parse_downcall(cache, entry, args); + spin_lock(&cache->uc_lock); + if (rc) + GOTO(out, rc); + + entry->ue_expire = ktime_get_seconds() + cache->uc_entry_expire; + UC_CACHE_SET_VALID(entry); + CDEBUG(D_OTHER, "%s: created upcall cache entry %p for key %llu\n", + cache->uc_name, entry, entry->ue_key); +out: + if (rc) { + UC_CACHE_SET_INVALID(entry); + list_del_init(&entry->ue_hash); + } + UC_CACHE_CLEAR_ACQUIRING(entry); + spin_unlock(&cache->uc_lock); + wake_up(&entry->ue_waitq); + put_entry(cache, entry); + + RETURN(rc); +} +EXPORT_SYMBOL(upcall_cache_downcall); + +void upcall_cache_flush(struct upcall_cache *cache, int force) +{ + struct upcall_cache_entry *entry, *next; + int i; + ENTRY; + + spin_lock(&cache->uc_lock); + for (i = 0; i < UC_CACHE_HASH_SIZE; i++) { + list_for_each_entry_safe(entry, next, + &cache->uc_hashtable[i], ue_hash) { + if (!force && atomic_read(&entry->ue_refcount)) { + UC_CACHE_SET_EXPIRED(entry); + continue; + } + LASSERT(!atomic_read(&entry->ue_refcount)); + free_entry(cache, entry); + } + } + spin_unlock(&cache->uc_lock); + EXIT; +} +EXPORT_SYMBOL(upcall_cache_flush); + +void upcall_cache_flush_one(struct upcall_cache *cache, __u64 key, void *args) +{ + struct list_head *head; + struct upcall_cache_entry *entry; + int found = 0; + ENTRY; + + head = &cache->uc_hashtable[UC_CACHE_HASH_INDEX(key)]; + + spin_lock(&cache->uc_lock); + list_for_each_entry(entry, head, ue_hash) { + if (upcall_compare(cache, entry, key, args) == 0) { + found = 1; + break; + } + } + + if (found) { + CWARN("%s: flush entry %p: key %llu, ref %d, fl %x, " + "cur %lld, ex %lld/%lld\n", + cache->uc_name, entry, entry->ue_key, + atomic_read(&entry->ue_refcount), 
entry->ue_flags, + ktime_get_real_seconds(), entry->ue_acquire_expire, + entry->ue_expire); + UC_CACHE_SET_EXPIRED(entry); + if (!atomic_read(&entry->ue_refcount)) + free_entry(cache, entry); + } + spin_unlock(&cache->uc_lock); +} +EXPORT_SYMBOL(upcall_cache_flush_one); + +struct upcall_cache *upcall_cache_init(const char *name, const char *upcall, + struct upcall_cache_ops *ops) +{ + struct upcall_cache *cache; + int i; + ENTRY; + + LIBCFS_ALLOC(cache, sizeof(*cache)); + if (!cache) + RETURN(ERR_PTR(-ENOMEM)); + + spin_lock_init(&cache->uc_lock); + init_rwsem(&cache->uc_upcall_rwsem); + for (i = 0; i < UC_CACHE_HASH_SIZE; i++) + INIT_LIST_HEAD(&cache->uc_hashtable[i]); + strlcpy(cache->uc_name, name, sizeof(cache->uc_name)); + /* upcall pathname proc tunable */ + strlcpy(cache->uc_upcall, upcall, sizeof(cache->uc_upcall)); + cache->uc_entry_expire = 20 * 60; + cache->uc_acquire_expire = 30; + cache->uc_ops = ops; + + RETURN(cache); +} +EXPORT_SYMBOL(upcall_cache_init); + +void upcall_cache_cleanup(struct upcall_cache *cache) +{ + if (!cache) + return; + upcall_cache_flush_all(cache); + LIBCFS_FREE(cache, sizeof(*cache)); +} +EXPORT_SYMBOL(upcall_cache_cleanup); diff --git a/drivers/staging/lustrefsx/lustre/obdecho/debug.c b/drivers/staging/lustrefsx/lustre/obdecho/debug.c new file mode 100644 index 0000000000000..3b9465e63636d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdecho/debug.c @@ -0,0 +1,99 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/debug.c + * + * Helper routines for dumping data structs for debugging. 
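+ *
+ * block_debug_setup() stamps the first and last 16 bytes of a buffer
+ * with a little-endian (offset, id) pair; block_debug_check() verifies
+ * both stamps, making echo I/O round trips self-checking.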
+ */ + +#define DEBUG_SUBSYSTEM D_OTHER + + +#include +#include "echo_internal.h" +#include + +#define LPDS sizeof(__u64) +int block_debug_setup(void *addr, int len, __u64 off, __u64 id) +{ + LASSERT(addr); + + off = cpu_to_le64 (off); + id = cpu_to_le64 (id); + memcpy(addr, (char *)&off, LPDS); + memcpy(addr + LPDS, (char *)&id, LPDS); + + addr += len - LPDS - LPDS; + memcpy(addr, (char *)&off, LPDS); + memcpy(addr + LPDS, (char *)&id, LPDS); + + return 0; +} +EXPORT_SYMBOL(block_debug_setup); + +int block_debug_check(char *who, void *addr, int end, __u64 off, __u64 id) +{ + __u64 ne_off; + int err = 0; + + LASSERT(addr); + + ne_off = le64_to_cpu(off); + id = le64_to_cpu(id); + if (memcmp(addr, (char *)&ne_off, LPDS)) { + CDEBUG(D_ERROR, + "%s: id %#llx offset %llu off: %#llx != %#llx\n", + who, id, off, *(__u64 *)addr, ne_off); + err = -EINVAL; + } + if (memcmp(addr + LPDS, (char *)&id, LPDS)) { + CDEBUG(D_ERROR, "%s: id %#llx offset %llu id: %#llx != %#llx\n", + who, id, off, *(__u64 *)(addr + LPDS), id); + err = -EINVAL; + } + + addr += end - LPDS - LPDS; + if (memcmp(addr, (char *)&ne_off, LPDS)) { + CDEBUG(D_ERROR, + "%s: id %#llx offset %llu end off: %#llx != %#llx\n", + who, id, off, *(__u64 *)addr, ne_off); + err = -EINVAL; + } + if (memcmp(addr + LPDS, (char *)&id, LPDS)) { + CDEBUG(D_ERROR, + "%s: id %#llx offset %llu end id: %#llx != %#llx\n", + who, id, off, *(__u64 *)(addr + LPDS), id); + err = -EINVAL; + } + + return err; +} +EXPORT_SYMBOL(block_debug_check); +#undef LPDS diff --git a/drivers/staging/lustrefsx/lustre/obdecho/echo.c b/drivers/staging/lustrefsx/lustre/obdecho/echo.c new file mode 100644 index 0000000000000..b65d01a7fae2e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdecho/echo.c @@ -0,0 +1,980 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdecho/echo.c + * + * Author: Peter Braam + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_ECHO + +#include +#include +#include +#include + +#include "echo_internal.h" + +/* + * The echo objid needs to be below 2^32, because regular FID numbers are + * limited to 2^32 objects in f_oid for the FID_SEQ_ECHO range. 
b=23335 + */ +#define ECHO_INIT_OID 0x10000000ULL +#define ECHO_HANDLE_MAGIC 0xabcd0123fedc9876ULL + +#define ECHO_PERSISTENT_PAGES (ECHO_PERSISTENT_SIZE >> PAGE_SHIFT) +static struct page *echo_persistent_pages[ECHO_PERSISTENT_PAGES]; + +enum { + LPROC_ECHO_READ_BYTES = 1, + LPROC_ECHO_WRITE_BYTES = 2, + LPROC_ECHO_LAST = LPROC_ECHO_WRITE_BYTES + 1 +}; + +struct echo_srv_device { + struct lu_device esd_dev; + struct lu_target esd_lut; +}; + +static inline struct echo_srv_device *echo_srv_dev(struct lu_device *d) +{ + return container_of_safe(d, struct echo_srv_device, esd_dev); +} + +static inline struct obd_device *echo_srv_obd(struct echo_srv_device *esd) +{ + return esd->esd_dev.ld_obd; +} + +static int echo_connect(const struct lu_env *env, + struct obd_export **exp, struct obd_device *obd, + struct obd_uuid *cluuid, struct obd_connect_data *data, + void *localdata) +{ + struct lustre_handle conn = { 0 }; + int rc; + + data->ocd_connect_flags &= ECHO_CONNECT_SUPPORTED; + + if (data->ocd_connect_flags & OBD_CONNECT_FLAGS2) + data->ocd_connect_flags2 &= ECHO_CONNECT_SUPPORTED2; + + rc = class_connect(&conn, obd, cluuid); + if (rc) { + CERROR("can't connect %d\n", rc); + return rc; + } + *exp = class_conn2export(&conn); + + return 0; +} + +static int echo_disconnect(struct obd_export *exp) +{ + LASSERT(exp != NULL); + + return server_disconnect_export(exp); +} + +static int echo_init_export(struct obd_export *exp) +{ + return ldlm_init_export(exp); +} + +static int echo_destroy_export(struct obd_export *exp) +{ + ENTRY; + + target_destroy_export(exp); + ldlm_destroy_export(exp); + + RETURN(0); +} + +static u64 echo_next_id(struct obd_device *obd) +{ + u64 id; + + spin_lock(&obd->u.echo.eo_lock); + id = ++obd->u.echo.eo_lastino; + spin_unlock(&obd->u.echo.eo_lock); + + return id; +} + +static void +echo_page_debug_setup(struct page *page, int rw, u64 id, + __u64 offset, int len) +{ + int page_offset = offset & ~PAGE_MASK; + char *addr = ((char *)kmap(page)) + page_offset; + + if (len % OBD_ECHO_BLOCK_SIZE != 0) + CERROR("Unexpected block size %d\n", len); + + while (len > 0) { + if (rw & OBD_BRW_READ) + block_debug_setup(addr, OBD_ECHO_BLOCK_SIZE, + offset, id); + else + block_debug_setup(addr, OBD_ECHO_BLOCK_SIZE, + 0xecc0ecc0ecc0ecc0ULL, + 0xecc0ecc0ecc0ecc0ULL); + + addr += OBD_ECHO_BLOCK_SIZE; + offset += OBD_ECHO_BLOCK_SIZE; + len -= OBD_ECHO_BLOCK_SIZE; + } + + kunmap(page); +} + +static int +echo_page_debug_check(struct page *page, u64 id, + __u64 offset, int len) +{ + int page_offset = offset & ~PAGE_MASK; + char *addr = ((char *)kmap(page)) + page_offset; + int rc = 0; + int rc2; + + if (len % OBD_ECHO_BLOCK_SIZE != 0) + CERROR("Unexpected block size %d\n", len); + + while (len > 0) { + rc2 = block_debug_check("echo", addr, OBD_ECHO_BLOCK_SIZE, + offset, id); + + if (rc2 != 0 && rc == 0) + rc = rc2; + + addr += OBD_ECHO_BLOCK_SIZE; + offset += OBD_ECHO_BLOCK_SIZE; + len -= OBD_ECHO_BLOCK_SIZE; + } + + kunmap(page); + + return rc; +} + +static int echo_map_nb_to_lb(struct obdo *oa, struct obd_ioobj *obj, + struct niobuf_remote *nb, int *pages, + struct niobuf_local *lb, int cmd, int *left) +{ + gfp_t gfp_mask = (ostid_id(&obj->ioo_oid) & 1) ? 
+ GFP_HIGHUSER : GFP_KERNEL; + int ispersistent = ostid_id(&obj->ioo_oid) == ECHO_PERSISTENT_OBJID; + int debug_setup = (!ispersistent && + (oa->o_valid & OBD_MD_FLFLAGS) != 0 && + (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0); + struct niobuf_local *res = lb; + u64 offset = nb->rnb_offset; + int len = nb->rnb_len; + + while (len > 0) { + int plen = PAGE_SIZE - (offset & (PAGE_SIZE - 1)); + + if (len < plen) + plen = len; + + /* check for local buf overflow */ + if (*left == 0) + return -EINVAL; + + res->lnb_file_offset = offset; + res->lnb_len = plen; + LASSERT((res->lnb_file_offset & ~PAGE_MASK) + + res->lnb_len <= PAGE_SIZE); + + if (ispersistent && + ((res->lnb_file_offset >> PAGE_SHIFT) < + ECHO_PERSISTENT_PAGES)) { + res->lnb_page = + echo_persistent_pages[res->lnb_file_offset >> + PAGE_SHIFT]; + /* Take extra ref so __free_pages() can be called OK */ + get_page(res->lnb_page); + } else { + res->lnb_page = alloc_page(gfp_mask); + if (!res->lnb_page) { + CERROR("can't get page for id " DOSTID"\n", + POSTID(&obj->ioo_oid)); + return -ENOMEM; + } + /* set mapping so page is not considered encrypted */ + res->lnb_page->mapping = ECHO_MAPPING_UNENCRYPTED; + } + + CDEBUG(D_PAGE, "$$$$ get page %p @ %llu for %d\n", + res->lnb_page, res->lnb_file_offset, res->lnb_len); + + if (cmd & OBD_BRW_READ) + res->lnb_rc = res->lnb_len; + + if (debug_setup) + echo_page_debug_setup(res->lnb_page, cmd, + ostid_id(&obj->ioo_oid), + res->lnb_file_offset, + res->lnb_len); + + offset += plen; + len -= plen; + res++; + + (*left)--; + (*pages)++; + } + + return 0; +} + +static int echo_finalize_lb(struct obdo *oa, struct obd_ioobj *obj, + struct niobuf_remote *rb, int *pgs, + struct niobuf_local *lb, int verify) +{ + struct niobuf_local *res = lb; + u64 start = rb->rnb_offset >> PAGE_SHIFT; + u64 end = (rb->rnb_offset + rb->rnb_len + PAGE_SIZE - 1) >> + PAGE_SHIFT; + int count = (int)(end - start); + int rc = 0; + int i; + + for (i = 0; i < count; i++, (*pgs) ++, res++) { + struct page *page = res->lnb_page; + void *addr; + + if (!page) { + CERROR("null page objid %llu:%p, buf %d/%d\n", + ostid_id(&obj->ioo_oid), page, i, + obj->ioo_bufcnt); + return -EFAULT; + } + + addr = kmap(page); + + CDEBUG(D_PAGE, "$$$$ use page %p, addr %p@%llu\n", + res->lnb_page, addr, res->lnb_file_offset); + + if (verify) { + int vrc = echo_page_debug_check(page, + ostid_id(&obj->ioo_oid), + res->lnb_file_offset, + res->lnb_len); + /* check all the pages always */ + if (vrc != 0 && rc == 0) + rc = vrc; + } + + kunmap(page); + /* NB see comment above regarding persistent pages */ + __free_page(page); + } + + return rc; +} + +static int echo_preprw(const struct lu_env *env, int cmd, + struct obd_export *export, struct obdo *oa, + int objcount, struct obd_ioobj *obj, + struct niobuf_remote *nb, int *pages, + struct niobuf_local *res) +{ + struct obd_device *obd; + int tot_bytes = 0; + int rc = 0; + int i, left; + + ENTRY; + + obd = export->exp_obd; + if (!obd) + RETURN(-EINVAL); + + /* Temp fix to stop falling foul of osc_announce_cached() */ + oa->o_valid &= ~(OBD_MD_FLBLOCKS | OBD_MD_FLGRANT); + + memset(res, 0, sizeof(*res) * *pages); + + CDEBUG(D_PAGE, "%s %d obdos with %d IOs\n", + cmd == OBD_BRW_READ ? 
"reading" : "writing", objcount, *pages); + + left = *pages; + *pages = 0; + + for (i = 0; i < objcount; i++, obj++) { + int j; + + for (j = 0 ; j < obj->ioo_bufcnt ; j++, nb++) { + rc = echo_map_nb_to_lb(oa, obj, nb, pages, + res + *pages, cmd, &left); + if (rc) + GOTO(preprw_cleanup, rc); + + tot_bytes += nb->rnb_len; + } + } + + atomic_add(*pages, &obd->u.echo.eo_prep); + + if (cmd & OBD_BRW_READ) + lprocfs_counter_add(obd->obd_stats, LPROC_ECHO_READ_BYTES, + tot_bytes); + else + lprocfs_counter_add(obd->obd_stats, LPROC_ECHO_WRITE_BYTES, + tot_bytes); + + CDEBUG(D_PAGE, "%d pages allocated after prep\n", + atomic_read(&obd->u.echo.eo_prep)); + + RETURN(0); + +preprw_cleanup: + /* + * It is possible that we would rather handle errors by allow + * any already-set-up pages to complete, rather than tearing them + * all down again. I believe that this is what the in-kernel + * prep/commit operations do. + */ + CERROR("cleaning up %u pages (%d obdos)\n", *pages, objcount); + for (i = 0; i < *pages; i++) { + kunmap(res[i].lnb_page); + /* + * NB if this is a persistent page, __free_page() will just + * lose the extra ref gained above + */ + __free_page(res[i].lnb_page); + res[i].lnb_page = NULL; + atomic_dec(&obd->u.echo.eo_prep); + } + + return rc; +} + +static int echo_commitrw(const struct lu_env *env, int cmd, + struct obd_export *export, struct obdo *oa, + int objcount, struct obd_ioobj *obj, + struct niobuf_remote *rb, int niocount, + struct niobuf_local *res, int rc, int nob, + ktime_t kstart) +{ + struct obd_device *obd; + int pgs = 0; + int i; + + ENTRY; + + obd = export->exp_obd; + if (!obd) + RETURN(-EINVAL); + + if (rc) + GOTO(commitrw_cleanup, rc); + + if ((cmd & OBD_BRW_RWMASK) == OBD_BRW_READ) { + CDEBUG(D_PAGE, "reading %d obdos with %d IOs\n", + objcount, niocount); + } else { + CDEBUG(D_PAGE, "writing %d obdos with %d IOs\n", + objcount, niocount); + } + + if (niocount && !res) { + CERROR("NULL res niobuf with niocount %d\n", niocount); + RETURN(-EINVAL); + } + + for (i = 0; i < objcount; i++, obj++) { + int verify = (rc == 0 && + ostid_id(&obj->ioo_oid) != ECHO_PERSISTENT_OBJID && + (oa->o_valid & OBD_MD_FLFLAGS) != 0 && + (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0); + int j; + + for (j = 0 ; j < obj->ioo_bufcnt ; j++, rb++) { + int vrc = echo_finalize_lb(oa, obj, rb, &pgs, &res[pgs], + verify); + if (vrc == 0) + continue; + + if (vrc == -EFAULT) + GOTO(commitrw_cleanup, rc = vrc); + + if (rc == 0) + rc = vrc; + } + } + + atomic_sub(pgs, &obd->u.echo.eo_prep); + + CDEBUG(D_PAGE, "%d pages remain after commit\n", + atomic_read(&obd->u.echo.eo_prep)); + RETURN(rc); + +commitrw_cleanup: + atomic_sub(pgs, &obd->u.echo.eo_prep); + + CERROR("cleaning up %d pages (%d obdos)\n", + niocount - pgs - 1, objcount); + + while (pgs < niocount) { + struct page *page = res[pgs++].lnb_page; + + if (!page) + continue; + + /* NB see comment above regarding persistent pages */ + __free_page(page); + atomic_dec(&obd->u.echo.eo_prep); + } + return rc; +} + +LPROC_SEQ_FOPS_RO_TYPE(echo, uuid); +static struct lprocfs_vars lprocfs_echo_obd_vars[] = { + { .name = "uuid", + .fops = &echo_uuid_fops }, + { NULL } +}; + +const struct obd_ops echo_obd_ops = { + .o_owner = THIS_MODULE, + .o_connect = echo_connect, + .o_disconnect = echo_disconnect, + .o_init_export = echo_init_export, + .o_destroy_export = echo_destroy_export, + .o_preprw = echo_preprw, + .o_commitrw = echo_commitrw, +}; + +/** + * Echo Server request handler for OST_CREATE RPC. + * + * This is part of request processing. 
It simulates object
+ * creation on an OST.
+ *
+ * \param[in] tsi	target session environment for this request
+ *
+ * \retval		0 if successful
+ * \retval		negative value on error
+ */
+static int esd_create_hdl(struct tgt_session_info *tsi)
+{
+	const struct obdo *oa = &tsi->tsi_ost_body->oa;
+	struct obd_device *obd = tsi->tsi_exp->exp_obd;
+	struct ost_body *repbody;
+	struct obdo *rep_oa;
+
+	ENTRY;
+
+	repbody = req_capsule_server_get(tsi->tsi_pill, &RMF_OST_BODY);
+	if (!repbody)
+		RETURN(-ENOMEM);
+
+	if (!(oa->o_mode & S_IFMT)) {
+		CERROR("%s: no type is set in obdo!\n",
+		       tsi->tsi_exp->exp_obd->obd_name);
+		RETURN(-ENOENT);
+	}
+
+	if (!(oa->o_valid & OBD_MD_FLTYPE)) {
+		CERROR("%s: invalid o_valid in obdo: %#llx\n",
+		       tsi->tsi_exp->exp_obd->obd_name, oa->o_valid);
+		RETURN(-EINVAL);
+	}
+
+	rep_oa = &repbody->oa;
+
+	if (!fid_seq_is_echo(ostid_seq(&oa->o_oi))) {
+		CERROR("%s: invalid seq %#llx\n",
+		       tsi->tsi_exp->exp_obd->obd_name, ostid_seq(&oa->o_oi));
+		return -EINVAL;
+	}
+
+	ostid_set_seq_echo(&rep_oa->o_oi);
+	ostid_set_id(&rep_oa->o_oi, echo_next_id(obd));
+
+	CDEBUG(D_INFO, "%s: Create object "DOSTID"\n",
+	       tsi->tsi_exp->exp_obd->obd_name, POSTID(&rep_oa->o_oi));
+
+	rep_oa->o_valid |= OBD_MD_FLID | OBD_MD_FLGROUP;
+
+	RETURN(0);
+}
+
+/**
+ * Echo Server request handler for OST_DESTROY RPC.
+ *
+ * This is the Echo Server part of request handling. It simulates object
+ * destruction on an OST.
+ *
+ * \param[in] tsi	target session environment for this request
+ *
+ * \retval		0 if successful
+ * \retval		negative value on error
+ */
+static int esd_destroy_hdl(struct tgt_session_info *tsi)
+{
+	const struct obdo *oa = &tsi->tsi_ost_body->oa;
+	struct obd_device *obd = tsi->tsi_exp->exp_obd;
+	struct ost_body *repbody;
+	u64 oid;
+
+	ENTRY;
+
+	oid = ostid_id(&oa->o_oi);
+	LASSERT(oid != 0);
+
+	if (!(oa->o_valid & OBD_MD_FLID)) {
+		CERROR("%s: obdo missing FLID valid flag: %#llx\n",
+		       tsi->tsi_exp->exp_obd->obd_name, oa->o_valid);
+		RETURN(-EINVAL);
+	}
+
+	repbody = req_capsule_server_get(tsi->tsi_pill, &RMF_OST_BODY);
+
+	if (ostid_id(&oa->o_oi) > obd->u.echo.eo_lastino ||
+	    ostid_id(&oa->o_oi) < ECHO_INIT_OID) {
+		CERROR("%s: bad objid to destroy: "DOSTID"\n",
+		       tsi->tsi_exp->exp_obd->obd_name, POSTID(&oa->o_oi));
+		RETURN(-EINVAL);
+	}
+
+	CDEBUG(D_INFO, "%s: Destroy object "DOSTID"\n",
+	       tsi->tsi_exp->exp_obd->obd_name, POSTID(&oa->o_oi));
+
+	repbody->oa.o_oi = oa->o_oi;
+	RETURN(0);
+}
+
+/**
+ * Echo Server request handler for OST_GETATTR RPC.
+ *
+ * This is the Echo Server part of request handling. It returns the
+ * object's attributes to the client. All objects have the same attributes in
+ * Echo Server.
+ * + * \param[in] tsi target session environment for this request + * + * \retval 0 if successful + * \retval negative value on error + */ +static int esd_getattr_hdl(struct tgt_session_info *tsi) +{ + const struct obdo *oa = &tsi->tsi_ost_body->oa; + struct obd_device *obd = tsi->tsi_exp->exp_obd; + struct ost_body *repbody; + + ENTRY; + + if (!(oa->o_valid & OBD_MD_FLID)) { + CERROR("%s: obdo missing FLID valid flag: %#llx\n", + tsi->tsi_exp->exp_obd->obd_name, oa->o_valid); + RETURN(-EINVAL); + } + + repbody = req_capsule_server_get(tsi->tsi_pill, &RMF_OST_BODY); + if (!repbody) + RETURN(-ENOMEM); + + repbody->oa.o_oi = oa->o_oi; + repbody->oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; + + obdo_cpy_md(&repbody->oa, &obd->u.echo.eo_oa, oa->o_valid); + + repbody->oa.o_valid |= OBD_MD_FLFLAGS; + repbody->oa.o_flags = OBD_FL_FLUSH; + + RETURN(0); +} + +/** + * Echo Server request handler for OST_SETATTR RPC. + * + * This is Echo Server part of request handling. It sets common + * attributes from request to the Echo Server objects. + * + * \param[in] tsi target session environment for this request + * + * \retval 0 if successful + * \retval negative value on error + */ +static int esd_setattr_hdl(struct tgt_session_info *tsi) +{ + struct ost_body *body = tsi->tsi_ost_body; + struct obd_device *obd = tsi->tsi_exp->exp_obd; + struct ost_body *repbody; + + ENTRY; + + if (!(body->oa.o_valid & OBD_MD_FLID)) { + CERROR("%s: obdo missing FLID valid flag: %#llx\n", + tsi->tsi_exp->exp_obd->obd_name, + body->oa.o_valid); + RETURN(-EINVAL); + } + + repbody = req_capsule_server_get(tsi->tsi_pill, &RMF_OST_BODY); + if (!repbody) + RETURN(-ENOMEM); + + repbody->oa.o_oi = body->oa.o_oi; + repbody->oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; + + obd->u.echo.eo_oa = body->oa; + + RETURN(0); +} + +#define OBD_FAIL_OST_READ_NET OBD_FAIL_OST_BRW_NET +#define OBD_FAIL_OST_WRITE_NET OBD_FAIL_OST_BRW_NET +#define OST_BRW_READ OST_READ +#define OST_BRW_WRITE OST_WRITE + +/** + * Table of Echo Server specific request handlers + * + * This table contains all opcodes accepted by Echo Server and + * specifies handlers for them. The tgt_request_handler() + * uses such table from each target to process incoming + * requests. 
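+ *
+ * Each entry pairs an opcode with its handler and a set of flags
+ * (HAS_BODY, HAS_REPLY, IS_MUTABLE) telling the generic target code
+ * what to unpack and prepare before the handler runs. For example,
+ * the entry
+ *
+ *	TGT_OST_HDL(HAS_BODY | HAS_REPLY, OST_GETATTR, esd_getattr_hdl)
+ *
+ * in the table below registers esd_getattr_hdl() for OST_GETATTR
+ * requests that carry a request body and expect a reply.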
+ */ +static struct tgt_handler esd_tgt_handlers[] = { +TGT_RPC_HANDLER(OST_FIRST_OPC, 0, OST_CONNECT, tgt_connect, + &RQF_CONNECT, LUSTRE_OBD_VERSION), +TGT_RPC_HANDLER(OST_FIRST_OPC, 0, OST_DISCONNECT, tgt_disconnect, + &RQF_OST_DISCONNECT, LUSTRE_OBD_VERSION), +TGT_OST_HDL(HAS_BODY | HAS_REPLY, OST_GETATTR, esd_getattr_hdl), +TGT_OST_HDL(HAS_BODY | HAS_REPLY | IS_MUTABLE, OST_SETATTR, + esd_setattr_hdl), +TGT_OST_HDL(HAS_REPLY | IS_MUTABLE, OST_CREATE, esd_create_hdl), +TGT_OST_HDL(HAS_REPLY | IS_MUTABLE, OST_DESTROY, esd_destroy_hdl), +TGT_OST_HDL(HAS_BODY | HAS_REPLY, OST_BRW_READ, tgt_brw_read), +TGT_OST_HDL(HAS_BODY | IS_MUTABLE, OST_BRW_WRITE, tgt_brw_write), +}; + +static struct tgt_opc_slice esd_common_slice[] = { + { + .tos_opc_start = OST_FIRST_OPC, + .tos_opc_end = OST_LAST_OPC, + .tos_hs = esd_tgt_handlers + }, + { + .tos_opc_start = OBD_FIRST_OPC, + .tos_opc_end = OBD_LAST_OPC, + .tos_hs = tgt_obd_handlers + }, + { + .tos_opc_start = LDLM_FIRST_OPC, + .tos_opc_end = LDLM_LAST_OPC, + .tos_hs = tgt_dlm_handlers + }, + { + .tos_opc_start = SEC_FIRST_OPC, + .tos_opc_end = SEC_LAST_OPC, + .tos_hs = tgt_sec_ctx_handlers + }, + { + .tos_hs = NULL + } +}; + +/** + * lu_device_operations matrix for ECHO SRV device is NULL, + * this device is just serving incoming requests immediately + * without building a stack of lu_devices. + */ +static const struct lu_device_operations echo_srv_lu_ops = { 0 }; + +/** + * Initialize Echo Server device with parameters in the config log \a cfg. + * + * This is the main starting point of Echo Server initialization. It fills all + * parameters with their initial values and starts Echo Server. + * + * \param[in] env execution environment + * \param[in] m Echo Server device + * \param[in] ldt LU device type of Echo Server + * \param[in] cfg configuration log + * + * \retval 0 if successful + * \retval negative value on error + */ +static int echo_srv_init0(const struct lu_env *env, + struct echo_srv_device *esd, + struct lu_device_type *ldt, struct lustre_cfg *cfg) +{ + const char *dev = lustre_cfg_string(cfg, 0); + struct obd_device *obd; + char ns_name[48]; + int rc; + + ENTRY; + + obd = class_name2obd(dev); + if (!obd) { + CERROR("Cannot find obd with name %s\n", dev); + RETURN(-ENODEV); + } + + spin_lock_init(&obd->u.echo.eo_lock); + obd->u.echo.eo_lastino = ECHO_INIT_OID; + + esd->esd_dev.ld_ops = &echo_srv_lu_ops; + esd->esd_dev.ld_obd = obd; + /* set this lu_device to obd, because error handling need it */ + obd->obd_lu_dev = &esd->esd_dev; + + /* No connection accepted until configurations will finish */ + spin_lock(&obd->obd_dev_lock); + obd->obd_no_conn = 1; + spin_unlock(&obd->obd_dev_lock); + + /* non-replayable target */ + obd->obd_replayable = 0; + + snprintf(ns_name, sizeof(ns_name), "echotgt-%s", obd->obd_uuid.uuid); + obd->obd_namespace = ldlm_namespace_new(obd, ns_name, + LDLM_NAMESPACE_SERVER, + LDLM_NAMESPACE_MODEST, + LDLM_NS_TYPE_OST); + if (IS_ERR(obd->obd_namespace)) { + rc = PTR_ERR(obd->obd_namespace); + CERROR("%s: unable to create server namespace: rc = %d\n", + obd->obd_name, rc); + obd->obd_namespace = NULL; + RETURN(rc); + } + + obd->obd_vars = lprocfs_echo_obd_vars; + if (!lprocfs_obd_setup(obd, true) && + lprocfs_alloc_obd_stats(obd, LPROC_ECHO_LAST) == 0) { + lprocfs_counter_init(obd->obd_stats, LPROC_ECHO_READ_BYTES, + LPROCFS_CNTR_AVGMINMAX, + "read_bytes", "bytes"); + lprocfs_counter_init(obd->obd_stats, LPROC_ECHO_WRITE_BYTES, + LPROCFS_CNTR_AVGMINMAX, + "write_bytes", "bytes"); + } + + 
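+	/*
+	 * Set up the client used for LDLM callback RPCs to the peers,
+	 * then start the generic target layer, which dispatches incoming
+	 * requests through the esd_common_slice handler table.
+	 */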
ptlrpc_init_client(LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL, + "echo_ldlm_cb_client", &obd->obd_ldlm_client); + + rc = tgt_init(env, &esd->esd_lut, obd, NULL, esd_common_slice, + OBD_FAIL_OST_ALL_REQUEST_NET, + OBD_FAIL_OST_ALL_REPLY_NET); + if (rc) + GOTO(err_out, rc); + + spin_lock(&obd->obd_dev_lock); + obd->obd_no_conn = 0; + spin_unlock(&obd->obd_dev_lock); + + RETURN(0); + +err_out: + ldlm_namespace_free(obd->obd_namespace, NULL, obd->obd_force); + obd->obd_namespace = NULL; + + lprocfs_obd_cleanup(obd); + lprocfs_free_obd_stats(obd); + RETURN(rc); +} + +/** + * Stop the Echo Server device. + * + * This function stops the Echo Server device and all its subsystems. + * This is the end of Echo Server lifecycle. + * + * \param[in] env execution environment + * \param[in] esd ESD device + */ +static void echo_srv_fini(const struct lu_env *env, + struct echo_srv_device *esd) +{ + struct obd_device *obd = echo_srv_obd(esd); + struct lu_device *d = &esd->esd_dev; + int leaked; + + ENTRY; + + class_disconnect_exports(obd); + if (obd->obd_namespace) + ldlm_namespace_free_prior(obd->obd_namespace, NULL, + obd->obd_force); + + obd_exports_barrier(obd); + obd_zombie_barrier(); + + tgt_fini(env, &esd->esd_lut); + + if (obd->obd_namespace) { + ldlm_namespace_free_post(obd->obd_namespace); + obd->obd_namespace = NULL; + } + + lprocfs_obd_cleanup(obd); + lprocfs_free_obd_stats(obd); + + leaked = atomic_read(&obd->u.echo.eo_prep); + if (leaked != 0) + CERROR("%d prep/commitrw pages leaked\n", leaked); + + LASSERT(atomic_read(&d->ld_ref) == 0); + EXIT; +} + +/** + * Implementation of lu_device_type_operations::ldto_device_fini. + * + * Finalize device. Dual to echo_srv_device_init(). It is called from + * obd_precleanup() and stops the current device. + * + * \param[in] env execution environment + * \param[in] d LU device of ESD + * + * \retval NULL + */ +static struct lu_device *echo_srv_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + ENTRY; + echo_srv_fini(env, echo_srv_dev(d)); + RETURN(NULL); +} + +/** + * Implementation of lu_device_type_operations::ldto_device_free. + * + * Free Echo Server device. Dual to echo_srv_device_alloc(). + * + * \param[in] env execution environment + * \param[in] d LU device of ESD + * + * \retval NULL + */ +static struct lu_device *echo_srv_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct echo_srv_device *esd = echo_srv_dev(d); + + lu_device_fini(&esd->esd_dev); + OBD_FREE_PTR(esd); + RETURN(NULL); +} + +/** + * Implementation of lu_device_type_operations::ldto_device_alloc. + * + * This function allocates the new Echo Server device. It is called from + * obd_setup() if OBD device had lu_device_type defined. 
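+ * Allocation and first-stage initialization are combined here: if
+ * echo_srv_init0() fails, the half-constructed device is freed again
+ * and an error pointer is returned instead.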
+ * + * \param[in] env execution environment + * \param[in] t lu_device_type of ESD device + * \param[in] cfg configuration log + * + * \retval pointer to the lu_device of just allocated OFD + * \retval ERR_PTR of return value on error + */ +static struct lu_device *echo_srv_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct echo_srv_device *esd; + struct lu_device *l; + int rc; + + OBD_ALLOC_PTR(esd); + if (!esd) + return ERR_PTR(-ENOMEM); + + l = &esd->esd_dev; + lu_device_init(l, t); + rc = echo_srv_init0(env, esd, t, cfg); + if (rc != 0) { + echo_srv_device_free(env, l); + l = ERR_PTR(rc); + } + + return l; +} + +static const struct lu_device_type_operations echo_srv_type_ops = { + .ldto_device_alloc = echo_srv_device_alloc, + .ldto_device_free = echo_srv_device_free, + .ldto_device_fini = echo_srv_device_fini +}; + +struct lu_device_type echo_srv_type = { + .ldt_tags = LU_DEVICE_DT, + .ldt_name = LUSTRE_ECHO_NAME, + .ldt_ops = &echo_srv_type_ops, + .ldt_ctx_tags = LCT_DT_THREAD, +}; + +void echo_persistent_pages_fini(void) +{ + int i; + + for (i = 0; i < ECHO_PERSISTENT_PAGES; i++) + if (echo_persistent_pages[i]) { + __free_page(echo_persistent_pages[i]); + echo_persistent_pages[i] = NULL; + } +} + +int echo_persistent_pages_init(void) +{ + struct page *pg; + int i; + + for (i = 0; i < ECHO_PERSISTENT_PAGES; i++) { + gfp_t gfp_mask = (i < ECHO_PERSISTENT_PAGES / 2) ? + GFP_KERNEL : GFP_HIGHUSER; + + pg = alloc_page(gfp_mask); + if (!pg) { + echo_persistent_pages_fini(); + return -ENOMEM; + } + + memset(kmap(pg), 0, PAGE_SIZE); + kunmap(pg); + /* set mapping so page is not considered encrypted */ + pg->mapping = ECHO_MAPPING_UNENCRYPTED; + + echo_persistent_pages[i] = pg; + } + + return 0; +} diff --git a/drivers/staging/lustrefsx/lustre/obdecho/echo_client.c b/drivers/staging/lustrefsx/lustre/obdecho/echo_client.c new file mode 100644 index 0000000000000..3c6bc10a8046d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdecho/echo_client.c @@ -0,0 +1,3171 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_ECHO + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_SERVER_SUPPORT +# include + +#define ETI_NAME_LEN 20 + +#endif /* HAVE_SERVER_SUPPORT */ + +#include "echo_internal.h" + +/** \defgroup echo_client Echo Client + * @{ + */ + +/* echo thread key have a CL_THREAD flag, which set cl_env function directly */ +#define ECHO_MD_CTX_TAG (LCT_REMEMBER | LCT_MD_THREAD) +#define ECHO_DT_CTX_TAG (LCT_REMEMBER | LCT_DT_THREAD) +#define ECHO_SES_TAG (LCT_REMEMBER | LCT_SESSION | LCT_SERVER_SESSION) + +struct echo_device { + struct cl_device ed_cl; + struct echo_client_obd *ed_ec; + + struct cl_site ed_site_myself; + struct lu_site *ed_site; + struct lu_device *ed_next; + int ed_next_ismd; + struct lu_client_seq *ed_cl_seq; +#ifdef HAVE_SERVER_SUPPORT + struct local_oid_storage *ed_los; + struct lu_fid ed_root_fid; +#endif /* HAVE_SERVER_SUPPORT */ +}; + +struct echo_object { + struct cl_object eo_cl; + struct cl_object_header eo_hdr; + struct echo_device *eo_dev; + struct list_head eo_obj_chain; + struct lov_oinfo *eo_oinfo; + atomic_t eo_npages; + int eo_deleted; +}; + +struct echo_object_conf { + struct cl_object_conf eoc_cl; + struct lov_oinfo **eoc_oinfo; +}; + +struct echo_page { + struct cl_page_slice ep_cl; + unsigned long ep_lock; +}; + +struct echo_lock { + struct cl_lock_slice el_cl; + struct list_head el_chain; + struct echo_object *el_object; + __u64 el_cookie; + atomic_t el_refcount; +}; + +#ifdef HAVE_SERVER_SUPPORT +static const char echo_md_root_dir_name[] = "ROOT_ECHO"; + +/** + * In order to use the values of members in struct mdd_device, + * we define an alias structure here. 
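+ * This only works as long as the members below mirror the layout of
+ * struct mdd_device exactly, so any change to that structure must be
+ * reflected here as well.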
+ */ +struct echo_md_device { + struct md_device emd_md_dev; + struct obd_export *emd_child_exp; + struct dt_device *emd_child; + struct dt_device *emd_bottom; + struct lu_fid emd_root_fid; + struct lu_fid emd_local_root_fid; +}; +#endif /* HAVE_SERVER_SUPPORT */ + +static int echo_client_setup(const struct lu_env *env, + struct obd_device *obd, + struct lustre_cfg *lcfg); +static int echo_client_cleanup(struct obd_device *obd); + +/** \defgroup echo_helpers Helper functions + * @{ + */ +static inline struct echo_device *cl2echo_dev(const struct cl_device *dev) +{ + return container_of_safe(dev, struct echo_device, ed_cl); +} + +static inline struct cl_device *echo_dev2cl(struct echo_device *d) +{ + return &d->ed_cl; +} + +static inline struct echo_device *obd2echo_dev(const struct obd_device *obd) +{ + return cl2echo_dev(lu2cl_dev(obd->obd_lu_dev)); +} + +static inline struct cl_object *echo_obj2cl(struct echo_object *eco) +{ + return &eco->eo_cl; +} + +static inline struct echo_object *cl2echo_obj(const struct cl_object *o) +{ + return container_of(o, struct echo_object, eo_cl); +} + +static inline struct echo_page *cl2echo_page(const struct cl_page_slice *s) +{ + return container_of(s, struct echo_page, ep_cl); +} + +static inline struct echo_lock *cl2echo_lock(const struct cl_lock_slice *s) +{ + return container_of(s, struct echo_lock, el_cl); +} + +static inline struct cl_lock *echo_lock2cl(const struct echo_lock *ecl) +{ + return ecl->el_cl.cls_lock; +} + +static struct lu_context_key echo_thread_key; + +static inline struct echo_thread_info *echo_env_info(const struct lu_env *env) +{ + struct echo_thread_info *info; + + info = lu_context_key_get(&env->le_ctx, &echo_thread_key); + LASSERT(info != NULL); + return info; +} + +static inline +struct echo_object_conf *cl2echo_conf(const struct cl_object_conf *c) +{ + return container_of(c, struct echo_object_conf, eoc_cl); +} + +#ifdef HAVE_SERVER_SUPPORT +static inline struct echo_md_device *lu2emd_dev(struct lu_device *d) +{ + return container_of_safe(d, struct echo_md_device, + emd_md_dev.md_lu_dev); +} + +static inline struct lu_device *emd2lu_dev(struct echo_md_device *d) +{ + return &d->emd_md_dev.md_lu_dev; +} + +static inline struct seq_server_site *echo_md_seq_site(struct echo_md_device *d) +{ + return emd2lu_dev(d)->ld_site->ld_seq_site; +} + +static inline struct obd_device *emd2obd_dev(struct echo_md_device *d) +{ + return d->emd_md_dev.md_lu_dev.ld_obd; +} +#endif /* HAVE_SERVER_SUPPORT */ + +/** @} echo_helpers */ + +static int cl_echo_object_put(struct echo_object *eco); +static int cl_echo_object_brw(struct echo_object *eco, int rw, u64 offset, + struct page **pages, int npages, int async); + +struct echo_thread_info { + struct echo_object_conf eti_conf; + struct lustre_md eti_md; + struct cl_2queue eti_queue; + struct cl_io eti_io; + struct cl_lock eti_lock; + struct lu_fid eti_fid; + struct lu_fid eti_fid2; +#ifdef HAVE_SERVER_SUPPORT + struct md_op_spec eti_spec; + struct lov_mds_md_v3 eti_lmm; + struct lov_user_md_v3 eti_lum; + struct md_attr eti_ma; + struct lu_name eti_lname; + /* per-thread values, can be re-used */ + void *eti_big_lmm; /* may be vmalloc'd */ + int eti_big_lmmsize; + char eti_name[ETI_NAME_LEN]; + struct lu_buf eti_buf; + /* If we want to test large ACL, then need to enlarge the buffer. 
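+	 * It is currently sized for LUSTRE_POSIX_ACL_MAX_SIZE_OLD, the
+	 * older, smaller ACL size limit.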
*/ + char eti_xattr_buf[LUSTRE_POSIX_ACL_MAX_SIZE_OLD]; +#endif +}; + +/* No session used right now */ +struct echo_session_info { + unsigned long dummy; +}; + +static struct kmem_cache *echo_lock_kmem; +static struct kmem_cache *echo_object_kmem; +static struct kmem_cache *echo_thread_kmem; +static struct kmem_cache *echo_session_kmem; +/* static struct kmem_cache *echo_req_kmem; */ + +static struct lu_kmem_descr echo_caches[] = { + { + .ckd_cache = &echo_lock_kmem, + .ckd_name = "echo_lock_kmem", + .ckd_size = sizeof(struct echo_lock) + }, + { + .ckd_cache = &echo_object_kmem, + .ckd_name = "echo_object_kmem", + .ckd_size = sizeof(struct echo_object) + }, + { + .ckd_cache = &echo_thread_kmem, + .ckd_name = "echo_thread_kmem", + .ckd_size = sizeof(struct echo_thread_info) + }, + { + .ckd_cache = &echo_session_kmem, + .ckd_name = "echo_session_kmem", + .ckd_size = sizeof(struct echo_session_info) + }, + { + .ckd_cache = NULL + } +}; + +/** \defgroup echo_page Page operations + * + * Echo page operations. + * + * @{ + */ +static int echo_page_own(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io, int nonblock) +{ + struct echo_page *ep = cl2echo_page(slice); + + if (!nonblock) { + if (test_and_set_bit(0, &ep->ep_lock)) + return -EAGAIN; + } else { + while (test_and_set_bit(0, &ep->ep_lock)) + wait_on_bit(&ep->ep_lock, 0, TASK_UNINTERRUPTIBLE); + } + return 0; +} + +static void echo_page_disown(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io) +{ + struct echo_page *ep = cl2echo_page(slice); + + LASSERT(test_bit(0, &ep->ep_lock)); + clear_and_wake_up_bit(0, &ep->ep_lock); +} + +static void echo_page_discard(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + cl_page_delete(env, slice->cpl_page); +} + +static int echo_page_is_vmlocked(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + if (test_bit(0, &cl2echo_page(slice)->ep_lock)) + return -EBUSY; + return -ENODATA; +} + +static void echo_page_completion(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + LASSERT(slice->cpl_page->cp_sync_io != NULL); +} + +static void echo_page_fini(const struct lu_env *env, + struct cl_page_slice *slice, + struct pagevec *pvec) +{ + struct echo_object *eco = cl2echo_obj(slice->cpl_obj); + + ENTRY; + atomic_dec(&eco->eo_npages); + put_page(slice->cpl_page->cp_vmpage); + EXIT; +} + +static int echo_page_prep(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + return 0; +} + +static int echo_page_print(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t printer) +{ + struct echo_page *ep = cl2echo_page(slice); + + (*printer)(env, cookie, LUSTRE_ECHO_CLIENT_NAME"-page@%p %d vm@%p\n", + ep, test_bit(0, &ep->ep_lock), + slice->cpl_page->cp_vmpage); + return 0; +} + +static const struct cl_page_operations echo_page_ops = { + .cpo_own = echo_page_own, + .cpo_disown = echo_page_disown, + .cpo_discard = echo_page_discard, + .cpo_fini = echo_page_fini, + .cpo_print = echo_page_print, + .cpo_is_vmlocked = echo_page_is_vmlocked, + .io = { + [CRT_READ] = { + .cpo_prep = echo_page_prep, + .cpo_completion = echo_page_completion, + }, + [CRT_WRITE] = { + .cpo_prep = echo_page_prep, + .cpo_completion = echo_page_completion, + } + } +}; + +/** @} echo_page */ + +/** \defgroup echo_lock Locking + * + * echo lock operations + * + * @{ + */ +static void echo_lock_fini(const struct lu_env *env, + struct 
cl_lock_slice *slice) +{ + struct echo_lock *ecl = cl2echo_lock(slice); + + LASSERT(list_empty(&ecl->el_chain)); + OBD_SLAB_FREE_PTR(ecl, echo_lock_kmem); +} + +static const struct cl_lock_operations echo_lock_ops = { + .clo_fini = echo_lock_fini, +}; + +/** @} echo_lock */ + +/** \defgroup echo_cl_ops cl_object operations + * + * operations for cl_object + * + * @{ + */ +static int echo_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index) +{ + struct echo_page *ep = cl_object_page_slice(obj, page); + struct echo_object *eco = cl2echo_obj(obj); + + ENTRY; + get_page(page->cp_vmpage); + /* + * ep_lock is similar to the lock_page() lock, and + * cannot usefully be monitored by lockdep. + * So just use a bit in an "unsigned long" and use the + * wait_on_bit() interface to wait for the bit to be clear. + */ + ep->ep_lock = 0; + cl_page_slice_add(page, &ep->ep_cl, obj, &echo_page_ops); + atomic_inc(&eco->eo_npages); + RETURN(0); +} + +static int echo_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + return 0; +} + +static int echo_lock_init(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *unused) +{ + struct echo_lock *el; + + ENTRY; + OBD_SLAB_ALLOC_PTR_GFP(el, echo_lock_kmem, GFP_NOFS); + if (el) { + cl_lock_slice_add(lock, &el->el_cl, obj, &echo_lock_ops); + el->el_object = cl2echo_obj(obj); + INIT_LIST_HEAD(&el->el_chain); + atomic_set(&el->el_refcount, 0); + } + RETURN(el ? 0 : -ENOMEM); +} + +static int echo_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf) +{ + return 0; +} + +static const struct cl_object_operations echo_cl_obj_ops = { + .coo_page_init = echo_page_init, + .coo_lock_init = echo_lock_init, + .coo_io_init = echo_io_init, + .coo_conf_set = echo_conf_set +}; +/** @} echo_cl_ops */ + +/** \defgroup echo_lu_ops lu_object operations + * + * operations for echo lu object. + * + * @{ + */ +static int echo_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct echo_device *ed = cl2echo_dev(lu2cl_dev(obj->lo_dev)); + struct echo_client_obd *ec = ed->ed_ec; + struct echo_object *eco = cl2echo_obj(lu2cl(obj)); + + ENTRY; + if (ed->ed_next) { + struct lu_object *below; + struct lu_device *under; + + under = ed->ed_next; + below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, + under); + if (!below) + RETURN(-ENOMEM); + lu_object_add(obj, below); + } + + if (!ed->ed_next_ismd) { + const struct cl_object_conf *cconf = lu2cl_conf(conf); + struct echo_object_conf *econf = cl2echo_conf(cconf); + + LASSERT(econf->eoc_oinfo != NULL); + + /* + * Transfer the oinfo pointer to eco that it won't be + * freed. 
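+		 * Ownership moves to the echo object: the source pointer
+		 * is cleared below so the caller's cleanup path cannot
+		 * free it a second time.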
+ */
+		eco->eo_oinfo = *econf->eoc_oinfo;
+		*econf->eoc_oinfo = NULL;
+	} else {
+		eco->eo_oinfo = NULL;
+	}
+
+	eco->eo_dev = ed;
+	atomic_set(&eco->eo_npages, 0);
+	cl_object_page_init(lu2cl(obj), sizeof(struct echo_page));
+
+	spin_lock(&ec->ec_lock);
+	list_add_tail(&eco->eo_obj_chain, &ec->ec_objects);
+	spin_unlock(&ec->ec_lock);
+
+	RETURN(0);
+}
+
+static void echo_object_delete(const struct lu_env *env, struct lu_object *obj)
+{
+	struct echo_object *eco = cl2echo_obj(lu2cl(obj));
+	struct echo_client_obd *ec;
+
+	ENTRY;
+
+	/* object delete is called unconditionally - layer init or not */
+	if (eco->eo_dev == NULL)
+		return;
+
+	ec = eco->eo_dev->ed_ec;
+
+	LASSERT(atomic_read(&eco->eo_npages) == 0);
+
+	spin_lock(&ec->ec_lock);
+	list_del_init(&eco->eo_obj_chain);
+	spin_unlock(&ec->ec_lock);
+
+	if (eco->eo_oinfo)
+		OBD_FREE_PTR(eco->eo_oinfo);
+}
+
+static void echo_object_free_rcu(struct rcu_head *head)
+{
+	struct echo_object *eco = container_of(head, struct echo_object,
+					       eo_hdr.coh_lu.loh_rcu);
+
+	kmem_cache_free(echo_object_kmem, eco);
+}
+
+static void echo_object_free(const struct lu_env *env, struct lu_object *obj)
+{
+	struct echo_object *eco = cl2echo_obj(lu2cl(obj));
+
+	ENTRY;
+
+	lu_object_fini(obj);
+	lu_object_header_fini(obj->lo_header);
+
+	OBD_FREE_PRE(eco, sizeof(*eco), "slab-freed");
+	call_rcu(&eco->eo_hdr.coh_lu.loh_rcu, echo_object_free_rcu);
+	EXIT;
+}
+
+static int echo_object_print(const struct lu_env *env, void *cookie,
+			     lu_printer_t p, const struct lu_object *o)
+{
+	struct echo_object *obj = cl2echo_obj(lu2cl(o));
+
+	return (*p)(env, cookie, "echoclient-object@%p", obj);
+}
+
+static const struct lu_object_operations echo_lu_obj_ops = {
+	.loo_object_init	= echo_object_init,
+	.loo_object_delete	= echo_object_delete,
+	.loo_object_release	= NULL,
+	.loo_object_free	= echo_object_free,
+	.loo_object_print	= echo_object_print,
+	.loo_object_invariant	= NULL
+};
+/** @} echo_lu_ops */
+
+/** \defgroup echo_lu_dev_ops lu_device operations
+ *
+ * Operations for echo lu device.
+ *
+ * @{
+ */
+static struct lu_object *echo_object_alloc(const struct lu_env *env,
+					   const struct lu_object_header *hdr,
+					   struct lu_device *dev)
+{
+	struct echo_object *eco;
+	struct lu_object *obj = NULL;
+
+	ENTRY;
+	/* we're the top dev. */
+	LASSERT(hdr == NULL);
+	OBD_SLAB_ALLOC_PTR_GFP(eco, echo_object_kmem, GFP_NOFS);
+	if (eco) {
+		struct cl_object_header *hdr = &eco->eo_hdr;
+
+		obj = &echo_obj2cl(eco)->co_lu;
+		cl_object_header_init(hdr);
+		hdr->coh_page_bufsize = cfs_size_round(sizeof(struct cl_page));
+
+		lu_object_init(obj, &hdr->coh_lu, dev);
+		lu_object_add_top(&hdr->coh_lu, obj);
+
+		eco->eo_cl.co_ops = &echo_cl_obj_ops;
+		obj->lo_ops = &echo_lu_obj_ops;
+	}
+	RETURN(obj);
+}
+
+static const struct lu_device_operations echo_device_lu_ops = {
+	.ldo_object_alloc = echo_object_alloc,
+};
+
+/** @} echo_lu_dev_ops */
+
+/** \defgroup echo_init Setup and teardown
+ *
+ * Init and fini functions for echo client.
+ * + * @{ + */ +static int echo_site_init(const struct lu_env *env, struct echo_device *ed) +{ + struct cl_site *site = &ed->ed_site_myself; + int rc; + + /* initialize site */ + rc = cl_site_init(site, &ed->ed_cl); + if (rc) { + CERROR("Cannot initialize site for echo client(%d)\n", rc); + return rc; + } + + rc = lu_site_init_finish(&site->cs_lu); + if (rc) { + cl_site_fini(site); + return rc; + } + + ed->ed_site = &site->cs_lu; + return 0; +} + +static void echo_site_fini(const struct lu_env *env, struct echo_device *ed) +{ + if (ed->ed_site) { + if (!ed->ed_next_ismd) + lu_site_fini(ed->ed_site); + ed->ed_site = NULL; + } +} + +static void *echo_thread_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct echo_thread_info *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, echo_thread_kmem, GFP_NOFS); + if (!info) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void echo_thread_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct echo_thread_info *info = data; + + OBD_SLAB_FREE_PTR(info, echo_thread_kmem); +} + +static struct lu_context_key echo_thread_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = echo_thread_key_init, + .lct_fini = echo_thread_key_fini, +}; + +static void *echo_session_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct echo_session_info *session; + + OBD_SLAB_ALLOC_PTR_GFP(session, echo_session_kmem, GFP_NOFS); + if (!session) + session = ERR_PTR(-ENOMEM); + return session; +} + +static void echo_session_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct echo_session_info *session = data; + + OBD_SLAB_FREE_PTR(session, echo_session_kmem); +} + +static struct lu_context_key echo_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = echo_session_key_init, + .lct_fini = echo_session_key_fini, +}; + +LU_TYPE_INIT_FINI(echo, &echo_thread_key, &echo_session_key); + +#ifdef HAVE_SERVER_SUPPORT +# define ECHO_SEQ_WIDTH 0xffffffff +static int echo_fid_init(struct echo_device *ed, char *obd_name, + struct seq_server_site *ss) +{ + char *prefix; + int rc; + + ENTRY; + OBD_ALLOC_PTR(ed->ed_cl_seq); + if (!ed->ed_cl_seq) + RETURN(-ENOMEM); + + OBD_ALLOC(prefix, MAX_OBD_NAME + 5); + if (!prefix) + GOTO(out_free_seq, rc = -ENOMEM); + + snprintf(prefix, MAX_OBD_NAME + 5, "srv-%s", obd_name); + + /* Init client side sequence-manager */ + seq_client_init(ed->ed_cl_seq, NULL, + LUSTRE_SEQ_METADATA, + prefix, ss->ss_server_seq); + ed->ed_cl_seq->lcs_width = ECHO_SEQ_WIDTH; + OBD_FREE(prefix, MAX_OBD_NAME + 5); + + RETURN(0); + +out_free_seq: + OBD_FREE_PTR(ed->ed_cl_seq); + ed->ed_cl_seq = NULL; + RETURN(rc); +} + +static int echo_fid_fini(struct obd_device *obd) +{ + struct echo_device *ed = obd2echo_dev(obd); + + ENTRY; + if (ed->ed_cl_seq) { + seq_client_fini(ed->ed_cl_seq); + OBD_FREE_PTR(ed->ed_cl_seq); + ed->ed_cl_seq = NULL; + } + + RETURN(0); +} + +static void echo_ed_los_fini(const struct lu_env *env, struct echo_device *ed) +{ + ENTRY; + if (ed != NULL && ed->ed_next_ismd && ed->ed_los != NULL) { + local_oid_storage_fini(env, ed->ed_los); + ed->ed_los = NULL; + } +} + +static int +echo_md_local_file_create(const struct lu_env *env, struct echo_md_device *emd, + struct local_oid_storage *los, + const struct lu_fid *pfid, const char *name, + __u32 mode, struct lu_fid *fid) +{ + struct dt_object *parent = NULL; + struct dt_object *dto = NULL; + int rc = 0; + + ENTRY; + LASSERT(!fid_is_zero(pfid)); + parent = dt_locate(env, emd->emd_bottom, 
pfid); + if (unlikely(IS_ERR(parent))) + RETURN(PTR_ERR(parent)); + + /* create local file with @fid */ + dto = local_file_find_or_create_with_fid(env, emd->emd_bottom, fid, + parent, name, mode); + if (IS_ERR(dto)) + GOTO(out_put, rc = PTR_ERR(dto)); + + *fid = *lu_object_fid(&dto->do_lu); + /* + * since stack is not fully set up the local_storage uses own stack + * and we should drop its object from cache + */ + dt_object_put_nocache(env, dto); + + EXIT; +out_put: + dt_object_put(env, parent); + RETURN(rc); +} + +static int +echo_md_root_get(const struct lu_env *env, struct echo_md_device *emd, + struct echo_device *ed) +{ + struct lu_fid fid; + int rc = 0; + + ENTRY; + /* Setup local dirs */ + fid.f_seq = FID_SEQ_LOCAL_NAME; + fid.f_oid = 1; + fid.f_ver = 0; + rc = local_oid_storage_init(env, emd->emd_bottom, &fid, &ed->ed_los); + if (rc != 0) + RETURN(rc); + + lu_echo_root_fid(&fid); + if (echo_md_seq_site(emd)->ss_node_id == 0) { + rc = echo_md_local_file_create(env, emd, ed->ed_los, + &emd->emd_local_root_fid, + echo_md_root_dir_name, S_IFDIR | + S_IRUGO | S_IWUSR | S_IXUGO, + &fid); + if (rc != 0) { + CERROR("%s: create md echo root fid failed: rc = %d\n", + emd2obd_dev(emd)->obd_name, rc); + GOTO(out_los, rc); + } + } + ed->ed_root_fid = fid; + + RETURN(0); +out_los: + echo_ed_los_fini(env, ed); + + RETURN(rc); +} +#endif /* HAVE_SERVER_SUPPORT */ + +static struct lu_device *echo_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct lu_device *next; + struct echo_device *ed; + struct cl_device *cd; + struct obd_device *obd = NULL; /* to keep compiler happy */ + struct obd_device *tgt; + const char *tgt_type_name; + int rc; + int cleanup = 0; + + ENTRY; + OBD_ALLOC_PTR(ed); + if (!ed) + GOTO(out, rc = -ENOMEM); + + cleanup = 1; + cd = &ed->ed_cl; + rc = cl_device_init(cd, t); + if (rc) + GOTO(out, rc); + + cd->cd_lu_dev.ld_ops = &echo_device_lu_ops; + + cleanup = 2; + obd = class_name2obd(lustre_cfg_string(cfg, 0)); + LASSERT(obd != NULL); + LASSERT(env != NULL); + + tgt = class_name2obd(lustre_cfg_string(cfg, 1)); + if (!tgt) { + CERROR("Can not find tgt device %s\n", + lustre_cfg_string(cfg, 1)); + GOTO(out, rc = -ENODEV); + } + + next = tgt->obd_lu_dev; + + if (strcmp(tgt->obd_type->typ_name, LUSTRE_MDT_NAME) == 0) { + ed->ed_next_ismd = 1; + } else if (strcmp(tgt->obd_type->typ_name, LUSTRE_OST_NAME) == 0 || + strcmp(tgt->obd_type->typ_name, LUSTRE_OSC_NAME) == 0) { + ed->ed_next_ismd = 0; + rc = echo_site_init(env, ed); + if (rc) + GOTO(out, rc); + } else { + GOTO(out, rc = -EINVAL); + } + + cleanup = 3; + + rc = echo_client_setup(env, obd, cfg); + if (rc) + GOTO(out, rc); + + ed->ed_ec = &obd->u.echo_client; + cleanup = 4; + + if (ed->ed_next_ismd) { +#ifdef HAVE_SERVER_SUPPORT + /* Suppose to connect to some Metadata layer */ + struct lu_site *ls = NULL; + struct lu_device *ld = NULL; + struct md_device *md = NULL; + struct echo_md_device *emd = NULL; + int found = 0; + + if (!next) { + CERROR("%s is not lu device type!\n", + lustre_cfg_string(cfg, 1)); + GOTO(out, rc = -EINVAL); + } + + tgt_type_name = lustre_cfg_string(cfg, 2); + if (!tgt_type_name) { + CERROR("%s no type name for echo %s setup\n", + lustre_cfg_string(cfg, 1), + tgt->obd_type->typ_name); + GOTO(out, rc = -EINVAL); + } + + ls = next->ld_site; + + spin_lock(&ls->ls_ld_lock); + list_for_each_entry(ld, &ls->ls_ld_linkage, ld_linkage) { + if (strcmp(ld->ld_type->ldt_name, tgt_type_name) == 0) { + found = 1; + break; + } + } + spin_unlock(&ls->ls_ld_lock); + + if 
(found == 0) { + CERROR("%s is not lu device type!\n", + lustre_cfg_string(cfg, 1)); + GOTO(out, rc = -EINVAL); + } + + next = ld; + /* For MD echo client, it will use the site in MDS stack */ + ed->ed_site = ls; + ed->ed_cl.cd_lu_dev.ld_site = ls; + rc = echo_fid_init(ed, obd->obd_name, lu_site2seq(ls)); + if (rc) { + CERROR("echo fid init error %d\n", rc); + GOTO(out, rc); + } + + md = lu2md_dev(next); + emd = lu2emd_dev(&md->md_lu_dev); + rc = echo_md_root_get(env, emd, ed); + if (rc != 0) { + CERROR("%s: get root error: rc = %d\n", + emd2obd_dev(emd)->obd_name, rc); + GOTO(out, rc); + } +#else /* !HAVE_SERVER_SUPPORT */ + CERROR( + "Local operations are NOT supported on client side. Only remote operations are supported. Metadata client must be run on server side.\n"); + GOTO(out, rc = -EOPNOTSUPP); +#endif /* HAVE_SERVER_SUPPORT */ + } else { + /* + * if echo client is to be stacked upon ost device, the next is + * NULL since ost is not a clio device so far + */ + if (next != NULL && !lu_device_is_cl(next)) + next = NULL; + + tgt_type_name = tgt->obd_type->typ_name; + if (next) { + LASSERT(next != NULL); + if (next->ld_site) + GOTO(out, rc = -EBUSY); + + next->ld_site = ed->ed_site; + rc = next->ld_type->ldt_ops->ldto_device_init(env, next, + next->ld_type->ldt_name, + NULL); + if (rc) + GOTO(out, rc); + } else { + LASSERT(strcmp(tgt_type_name, LUSTRE_OST_NAME) == 0); + } + } + + ed->ed_next = next; + RETURN(&cd->cd_lu_dev); +out: + switch (cleanup) { + case 4: { + int rc2; + + rc2 = echo_client_cleanup(obd); + if (rc2) + CERROR("Cleanup obd device %s error(%d)\n", + obd->obd_name, rc2); + } + fallthrough; + + case 3: + echo_site_fini(env, ed); + fallthrough; + case 2: + cl_device_fini(&ed->ed_cl); + fallthrough; + case 1: + OBD_FREE_PTR(ed); + fallthrough; + case 0: + default: + break; + } + return ERR_PTR(rc); +} + +static int echo_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + LBUG(); + return 0; +} + +static struct lu_device *echo_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + struct echo_device *ed = cl2echo_dev(lu2cl_dev(d)); + struct lu_device *next = ed->ed_next; + + while (next && !ed->ed_next_ismd) + next = next->ld_type->ldt_ops->ldto_device_fini(env, next); + return NULL; +} + +static void echo_lock_release(const struct lu_env *env, + struct echo_lock *ecl, + int still_used) +{ + struct cl_lock *clk = echo_lock2cl(ecl); + + cl_lock_release(env, clk); +} + +static struct lu_device *echo_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct echo_device *ed = cl2echo_dev(lu2cl_dev(d)); + struct echo_client_obd *ec = ed->ed_ec; + struct echo_object *eco; + struct lu_device *next = ed->ed_next; + + CDEBUG(D_INFO, "echo device:%p is going to be freed, next = %p\n", + ed, next); + + lu_site_purge(env, ed->ed_site, -1); + + /* + * check if there are objects still alive. + * It shouldn't have any object because lu_site_purge would cleanup + * all of cached objects. Anyway, probably the echo device is being + * parallelly accessed. + */ + spin_lock(&ec->ec_lock); + list_for_each_entry(eco, &ec->ec_objects, eo_obj_chain) + eco->eo_deleted = 1; + spin_unlock(&ec->ec_lock); + + /* purge again */ + lu_site_purge(env, ed->ed_site, -1); + + CDEBUG(D_INFO, + "Waiting for the reference of echo object to be dropped\n"); + + /* Wait for the last reference to be dropped. 
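+	 * The loop below simply re-purges the site once per second until
+	 * the object list drains; there is no stronger synchronization
+	 * here, so a leaked reference keeps it spinning and complaining.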
*/ + spin_lock(&ec->ec_lock); + while (!list_empty(&ec->ec_objects)) { + spin_unlock(&ec->ec_lock); + CERROR( + "echo_client still has objects at cleanup time, wait for 1 second\n"); + schedule_timeout_uninterruptible(cfs_time_seconds(1)); + lu_site_purge(env, ed->ed_site, -1); + spin_lock(&ec->ec_lock); + } + spin_unlock(&ec->ec_lock); + + LASSERT(list_empty(&ec->ec_locks)); + + CDEBUG(D_INFO, "No object exists, exiting...\n"); + + echo_client_cleanup(d->ld_obd); +#ifdef HAVE_SERVER_SUPPORT + echo_fid_fini(d->ld_obd); + echo_ed_los_fini(env, ed); +#endif + while (next && !ed->ed_next_ismd) + next = next->ld_type->ldt_ops->ldto_device_free(env, next); + + LASSERT(ed->ed_site == d->ld_site); + echo_site_fini(env, ed); + cl_device_fini(&ed->ed_cl); + OBD_FREE_PTR(ed); + + cl_env_cache_purge(~0); + + return NULL; +} + +static const struct lu_device_type_operations echo_device_type_ops = { + .ldto_init = echo_type_init, + .ldto_fini = echo_type_fini, + + .ldto_start = echo_type_start, + .ldto_stop = echo_type_stop, + + .ldto_device_alloc = echo_device_alloc, + .ldto_device_free = echo_device_free, + .ldto_device_init = echo_device_init, + .ldto_device_fini = echo_device_fini +}; + +static struct lu_device_type echo_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_ECHO_CLIENT_NAME, + .ldt_ops = &echo_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD | LCT_MD_THREAD | LCT_DT_THREAD, +}; +/** @} echo_init */ + +/** \defgroup echo_exports Exported operations + * + * exporting functions to echo client + * + * @{ + */ + +/* Interfaces to echo client obd device */ +static struct echo_object * +cl_echo_object_find(struct echo_device *d, const struct ost_id *oi) +{ + struct lu_env *env; + struct echo_thread_info *info; + struct echo_object_conf *conf; + struct echo_object *eco; + struct cl_object *obj; + struct lov_oinfo *oinfo = NULL; + struct lu_fid *fid; + __u16 refcheck; + int rc; + + ENTRY; + LASSERTF(ostid_id(oi) != 0, DOSTID"\n", POSTID(oi)); + LASSERTF(ostid_seq(oi) == FID_SEQ_ECHO, DOSTID"\n", POSTID(oi)); + + /* Never return an object if the obd is to be freed. */ + if (echo_dev2cl(d)->cd_lu_dev.ld_obd->obd_stopping) + RETURN(ERR_PTR(-ENODEV)); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN((void *)env); + + info = echo_env_info(env); + conf = &info->eti_conf; + if (d->ed_next) { + OBD_ALLOC_PTR(oinfo); + if (!oinfo) + GOTO(out, eco = ERR_PTR(-ENOMEM)); + + oinfo->loi_oi = *oi; + conf->eoc_cl.u.coc_oinfo = oinfo; + } + + /* + * If echo_object_init() is successful then ownership of oinfo + * is transferred to the object. + */ + conf->eoc_oinfo = &oinfo; + + fid = &info->eti_fid; + rc = ostid_to_fid(fid, oi, 0); + if (rc != 0) + GOTO(out, eco = ERR_PTR(rc)); + + /* + * In the function below, .hs_keycmp resolves to + * lu_obj_hop_keycmp() + */ + /* coverity[overrun-buffer-val] */ + obj = cl_object_find(env, echo_dev2cl(d), fid, &conf->eoc_cl); + if (IS_ERR(obj)) + GOTO(out, eco = (void *)obj); + + eco = cl2echo_obj(obj); + if (eco->eo_deleted) { + cl_object_put(env, obj); + eco = ERR_PTR(-EAGAIN); + } + +out: + if (oinfo) + OBD_FREE_PTR(oinfo); + + cl_env_put(env, &refcheck); + RETURN(eco); +} + +static int cl_echo_object_put(struct echo_object *eco) +{ + struct lu_env *env; + struct cl_object *obj = echo_obj2cl(eco); + __u16 refcheck; + + ENTRY; + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + /* an external function to kill an object? 
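+	 * Setting LU_OBJECT_HEARD_BANSHEE below marks the header so the
+	 * object is freed when its last reference is dropped instead of
+	 * being cached in the site.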
*/ + if (eco->eo_deleted) { + struct lu_object_header *loh = obj->co_lu.lo_header; + + LASSERT(&eco->eo_hdr == luh2coh(loh)); + set_bit(LU_OBJECT_HEARD_BANSHEE, &loh->loh_flags); + } + + cl_object_put(env, obj); + cl_env_put(env, &refcheck); + RETURN(0); +} + +static int cl_echo_enqueue0(struct lu_env *env, struct echo_object *eco, + u64 start, u64 end, int mode, + __u64 *cookie, __u32 enqflags) +{ + struct cl_io *io; + struct cl_lock *lck; + struct cl_object *obj; + struct cl_lock_descr *descr; + struct echo_thread_info *info; + int rc = -ENOMEM; + + ENTRY; + info = echo_env_info(env); + io = &info->eti_io; + lck = &info->eti_lock; + obj = echo_obj2cl(eco); + + memset(lck, 0, sizeof(*lck)); + descr = &lck->cll_descr; + descr->cld_obj = obj; + descr->cld_start = cl_index(obj, start); + descr->cld_end = cl_index(obj, end); + descr->cld_mode = mode == LCK_PW ? CLM_WRITE : CLM_READ; + descr->cld_enq_flags = enqflags; + io->ci_obj = obj; + + rc = cl_lock_request(env, io, lck); + if (rc == 0) { + struct echo_client_obd *ec = eco->eo_dev->ed_ec; + struct echo_lock *el; + + el = cl2echo_lock(cl_lock_at(lck, &echo_device_type)); + spin_lock(&ec->ec_lock); + if (list_empty(&el->el_chain)) { + list_add(&el->el_chain, &ec->ec_locks); + el->el_cookie = ++ec->ec_unique; + } + atomic_inc(&el->el_refcount); + *cookie = el->el_cookie; + spin_unlock(&ec->ec_lock); + } + RETURN(rc); +} + +static int cl_echo_cancel0(struct lu_env *env, struct echo_device *ed, + __u64 cookie) +{ + struct echo_client_obd *ec = ed->ed_ec; + struct echo_lock *ecl = NULL; + struct list_head *el; + int found = 0, still_used = 0; + + ENTRY; + LASSERT(ec != NULL); + spin_lock(&ec->ec_lock); + list_for_each(el, &ec->ec_locks) { + ecl = list_entry(el, struct echo_lock, el_chain); + CDEBUG(D_INFO, "ecl: %p, cookie: %#llx\n", ecl, ecl->el_cookie); + found = (ecl->el_cookie == cookie); + if (found) { + if (atomic_dec_and_test(&ecl->el_refcount)) + list_del_init(&ecl->el_chain); + else + still_used = 1; + break; + } + } + spin_unlock(&ec->ec_lock); + + if (!found) + RETURN(-ENOENT); + + echo_lock_release(env, ecl, still_used); + RETURN(0); +} + +static void echo_commit_callback(const struct lu_env *env, struct cl_io *io, + struct pagevec *pvec) +{ + struct echo_thread_info *info; + struct cl_2queue *queue; + int i = 0; + + info = echo_env_info(env); + LASSERT(io == &info->eti_io); + + queue = &info->eti_queue; + + for (i = 0; i < pagevec_count(pvec); i++) { + struct page *vmpage = pvec->pages[i]; + struct cl_page *page = (struct cl_page *)vmpage->private; + + cl_page_list_add(&queue->c2_qout, page, true); + } +} + +static int cl_echo_object_brw(struct echo_object *eco, int rw, u64 offset, + struct page **pages, int npages, int async) +{ + struct lu_env *env; + struct echo_thread_info *info; + struct cl_object *obj = echo_obj2cl(eco); + struct echo_device *ed = eco->eo_dev; + struct cl_2queue *queue; + struct cl_io *io; + struct cl_page *clp; + struct lustre_handle lh = { 0 }; + int page_size = cl_page_size(obj); + int rc; + int i; + __u16 refcheck; + + ENTRY; + LASSERT((offset & ~PAGE_MASK) == 0); + LASSERT(ed->ed_next != NULL); + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + info = echo_env_info(env); + io = &info->eti_io; + queue = &info->eti_queue; + + cl_2queue_init(queue); + + io->ci_ignore_layout = 1; + rc = cl_io_init(env, io, CIT_MISC, obj); + if (rc < 0) + GOTO(out, rc); + LASSERT(rc == 0); + + rc = cl_echo_enqueue0(env, eco, offset, + offset + npages * PAGE_SIZE - 1, + rw == READ ? 
LCK_PR : LCK_PW, &lh.cookie, + CEF_NEVER); + if (rc < 0) + GOTO(error_lock, rc); + + for (i = 0; i < npages; i++) { + LASSERT(pages[i]); + clp = cl_page_find(env, obj, cl_index(obj, offset), + pages[i], CPT_TRANSIENT); + if (IS_ERR(clp)) { + rc = PTR_ERR(clp); + break; + } + LASSERT(clp->cp_type == CPT_TRANSIENT); + + rc = cl_page_own(env, io, clp); + if (rc) { + LASSERT(clp->cp_state == CPS_FREEING); + cl_page_put(env, clp); + break; + } + + cl_2queue_add(queue, clp, true); + + /* + * drop the reference count for cl_page_find, so that the page + * will be freed in cl_2queue_fini. + */ + cl_page_put(env, clp); + cl_page_clip(env, clp, 0, page_size); + + offset += page_size; + } + + if (rc == 0) { + enum cl_req_type typ = rw == READ ? CRT_READ : CRT_WRITE; + + async = async && (typ == CRT_WRITE); + if (async) + rc = cl_io_commit_async(env, io, &queue->c2_qin, + 0, PAGE_SIZE, + echo_commit_callback); + else + rc = cl_io_submit_sync(env, io, typ, queue, 0); + CDEBUG(D_INFO, "echo_client %s write returns %d\n", + async ? "async" : "sync", rc); + } + + cl_echo_cancel0(env, ed, lh.cookie); + EXIT; +error_lock: + cl_2queue_discard(env, io, queue); + cl_2queue_disown(env, io, queue); + cl_2queue_fini(env, queue); + cl_io_fini(env, io); +out: + cl_env_put(env, &refcheck); + return rc; +} +/** @} echo_exports */ + +static u64 last_object_id; + +#ifdef HAVE_SERVER_SUPPORT +static inline void echo_md_build_name(struct lu_name *lname, char *name, + __u64 id) +{ + snprintf(name, ETI_NAME_LEN, "%llu", id); + lname->ln_name = name; + lname->ln_namelen = strlen(name); +} + +/* similar to mdt_attr_get_complex */ +static int echo_big_lmm_get(const struct lu_env *env, struct md_object *o, + struct md_attr *ma) +{ + struct echo_thread_info *info = echo_env_info(env); + int rc; + + ENTRY; + + LASSERT(ma->ma_lmm_size > 0); + + LASSERT(ma->ma_need & (MA_LOV | MA_LMV)); + if (ma->ma_need & MA_LOV) + rc = mo_xattr_get(env, o, &LU_BUF_NULL, XATTR_NAME_LOV); + else + rc = mo_xattr_get(env, o, &LU_BUF_NULL, XATTR_NAME_LMV); + + if (rc < 0) + RETURN(rc); + + /* big_lmm may need to be grown */ + if (info->eti_big_lmmsize < rc) { + int size = size_roundup_power2(rc); + + if (info->eti_big_lmmsize > 0) { + /* free old buffer */ + LASSERT(info->eti_big_lmm); + OBD_FREE_LARGE(info->eti_big_lmm, + info->eti_big_lmmsize); + info->eti_big_lmm = NULL; + info->eti_big_lmmsize = 0; + } + + OBD_ALLOC_LARGE(info->eti_big_lmm, size); + if (!info->eti_big_lmm) + RETURN(-ENOMEM); + info->eti_big_lmmsize = size; + } + LASSERT(info->eti_big_lmmsize >= rc); + + info->eti_buf.lb_buf = info->eti_big_lmm; + info->eti_buf.lb_len = info->eti_big_lmmsize; + if (ma->ma_need & MA_LOV) + rc = mo_xattr_get(env, o, &info->eti_buf, XATTR_NAME_LOV); + else + rc = mo_xattr_get(env, o, &info->eti_buf, XATTR_NAME_LMV); + if (rc < 0) + RETURN(rc); + + if (ma->ma_need & MA_LOV) + ma->ma_valid |= MA_LOV; + else + ma->ma_valid |= MA_LMV; + + ma->ma_lmm = info->eti_big_lmm; + ma->ma_lmm_size = rc; + + RETURN(0); +} + +static int echo_attr_get_complex(const struct lu_env *env, + struct md_object *next, + struct md_attr *ma) +{ + struct echo_thread_info *info = echo_env_info(env); + struct lu_buf *buf = &info->eti_buf; + umode_t mode = lu_object_attr(&next->mo_lu); + int rc = 0, rc2; + + ENTRY; + + ma->ma_valid = 0; + + if (ma->ma_need & MA_INODE) { + rc = mo_attr_get(env, next, ma); + if (rc) + GOTO(out, rc); + ma->ma_valid |= MA_INODE; + } + + if ((ma->ma_need & MA_LOV) && (S_ISREG(mode) || S_ISDIR(mode))) { + LASSERT(ma->ma_lmm_size > 0); + buf->lb_buf = 
ma->ma_lmm; + buf->lb_len = ma->ma_lmm_size; + rc2 = mo_xattr_get(env, next, buf, XATTR_NAME_LOV); + if (rc2 > 0) { + ma->ma_lmm_size = rc2; + ma->ma_valid |= MA_LOV; + } else if (rc2 == -ENODATA) { + /* no LOV EA */ + ma->ma_lmm_size = 0; + } else if (rc2 == -ERANGE) { + rc2 = echo_big_lmm_get(env, next, ma); + if (rc2 < 0) + GOTO(out, rc = rc2); + } else { + GOTO(out, rc = rc2); + } + } + + if ((ma->ma_need & MA_LMV) && S_ISDIR(mode)) { + LASSERT(ma->ma_lmm_size > 0); + buf->lb_buf = ma->ma_lmm; + buf->lb_len = ma->ma_lmm_size; + rc2 = mo_xattr_get(env, next, buf, XATTR_NAME_LMV); + if (rc2 > 0) { + ma->ma_lmm_size = rc2; + ma->ma_valid |= MA_LMV; + } else if (rc2 == -ENODATA) { + /* no LMV EA */ + ma->ma_lmm_size = 0; + } else if (rc2 == -ERANGE) { + rc2 = echo_big_lmm_get(env, next, ma); + if (rc2 < 0) + GOTO(out, rc = rc2); + } else { + GOTO(out, rc = rc2); + } + } + +#ifdef CONFIG_LUSTRE_FS_POSIX_ACL + if ((ma->ma_need & MA_ACL_DEF) && S_ISDIR(mode)) { + buf->lb_buf = ma->ma_acl; + buf->lb_len = ma->ma_acl_size; + rc2 = mo_xattr_get(env, next, buf, XATTR_NAME_ACL_DEFAULT); + if (rc2 > 0) { + ma->ma_acl_size = rc2; + ma->ma_valid |= MA_ACL_DEF; + } else if (rc2 == -ENODATA) { + /* no ACLs */ + ma->ma_acl_size = 0; + } else { + GOTO(out, rc = rc2); + } + } +#endif +out: + CDEBUG(D_INODE, "after getattr rc = %d, ma_valid = %#llx ma_lmm=%p\n", + rc, ma->ma_valid, ma->ma_lmm); + RETURN(rc); +} + +static int +echo_md_create_internal(const struct lu_env *env, struct echo_device *ed, + struct md_object *parent, struct lu_fid *fid, + struct lu_name *lname, struct md_op_spec *spec, + struct md_attr *ma) +{ + struct lu_object *ec_child, *child; + struct lu_device *ld = ed->ed_next; + struct echo_thread_info *info = echo_env_info(env); + struct lu_fid *fid2 = &info->eti_fid2; + struct lu_object_conf conf = { .loc_flags = LOC_F_NEW }; + int rc; + + ENTRY; + + rc = mdo_lookup(env, parent, lname, fid2, spec); + if (rc == 0) + return -EEXIST; + else if (rc != -ENOENT) + return rc; + + ec_child = lu_object_find_at(env, &ed->ed_cl.cd_lu_dev, + fid, &conf); + if (IS_ERR(ec_child)) { + CERROR("Can not find the child "DFID": rc = %ld\n", PFID(fid), + PTR_ERR(ec_child)); + RETURN(PTR_ERR(ec_child)); + } + + child = lu_object_locate(ec_child->lo_header, ld->ld_type); + if (!child) { + CERROR("Can not locate the child "DFID"\n", PFID(fid)); + GOTO(out_put, rc = -EINVAL); + } + + CDEBUG(D_RPCTRACE, "Start creating object "DFID" %s %p\n", + PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent); + + /* + * Do not perform lookup sanity check. We know that name does not exist. 
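+	 * (The mdo_lookup() above already returned -ENOENT for this name,
+	 * so clearing sp_cr_lookup below merely skips a redundant lookup
+	 * inside mdo_create().)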
+ */ + spec->sp_cr_lookup = 0; + rc = mdo_create(env, parent, lname, lu2md(child), spec, ma); + if (rc) { + CERROR("Can not create child "DFID": rc = %d\n", PFID(fid), rc); + GOTO(out_put, rc); + } + CDEBUG(D_RPCTRACE, "End creating object "DFID" %s %p rc = %d\n", + PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent, rc); + EXIT; +out_put: + lu_object_put(env, ec_child); + return rc; +} + +static int echo_set_lmm_size(const struct lu_env *env, struct lu_device *ld, + struct md_attr *ma) +{ + struct echo_thread_info *info = echo_env_info(env); + + if (strcmp(ld->ld_type->ldt_name, LUSTRE_MDD_NAME)) { + ma->ma_lmm = (void *)&info->eti_lmm; + ma->ma_lmm_size = sizeof(info->eti_lmm); + } else { + LASSERT(info->eti_big_lmmsize); + ma->ma_lmm = info->eti_big_lmm; + ma->ma_lmm_size = info->eti_big_lmmsize; + } + + return 0; +} + +static int +echo_md_dir_stripe_choose(const struct lu_env *env, struct echo_device *ed, + struct lu_object *obj, const char *name, + unsigned int namelen, __u64 id, + struct lu_object **new_parent) +{ + struct echo_thread_info *info = echo_env_info(env); + struct md_attr *ma = &info->eti_ma; + struct lmv_mds_md_v1 *lmv; + struct lu_device *ld = ed->ed_next; + unsigned int idx; + struct lu_name tmp_ln_name; + struct lu_fid stripe_fid; + struct lu_object *stripe_obj; + int rc; + + LASSERT(obj != NULL); + LASSERT(S_ISDIR(obj->lo_header->loh_attr)); + + memset(ma, 0, sizeof(*ma)); + echo_set_lmm_size(env, ld, ma); + ma->ma_need = MA_LMV; + rc = echo_attr_get_complex(env, lu2md(obj), ma); + if (rc) { + CERROR("Can not getattr child "DFID": rc = %d\n", + PFID(lu_object_fid(obj)), rc); + return rc; + } + + if (!(ma->ma_valid & MA_LMV)) { + *new_parent = obj; + return 0; + } + + lmv = (struct lmv_mds_md_v1 *)ma->ma_lmm; + if (!lmv_is_sane(lmv)) { + rc = -EINVAL; + CERROR("Invalid mds md magic %x "DFID": rc = %d\n", + le32_to_cpu(lmv->lmv_magic), PFID(lu_object_fid(obj)), + rc); + return rc; + } + + if (name) { + tmp_ln_name.ln_name = name; + tmp_ln_name.ln_namelen = namelen; + } else { + LASSERT(id != -1); + echo_md_build_name(&tmp_ln_name, info->eti_name, id); + } + + idx = lmv_name_to_stripe_index(lmv, tmp_ln_name.ln_name, + tmp_ln_name.ln_namelen); + + LASSERT(idx < le32_to_cpu(lmv->lmv_stripe_count)); + fid_le_to_cpu(&stripe_fid, &lmv->lmv_stripe_fids[idx]); + + stripe_obj = lu_object_find_at(env, &ed->ed_cl.cd_lu_dev, &stripe_fid, + NULL); + if (IS_ERR(stripe_obj)) { + rc = PTR_ERR(stripe_obj); + CERROR("Can not find the parent "DFID": rc = %d\n", + PFID(&stripe_fid), rc); + return rc; + } + + *new_parent = lu_object_locate(stripe_obj->lo_header, ld->ld_type); + if (!*new_parent) { + lu_object_put(env, stripe_obj); + RETURN(-ENXIO); + } + + return rc; +} + +static int echo_create_md_object(const struct lu_env *env, + struct echo_device *ed, + struct lu_object *ec_parent, + struct lu_fid *fid, + char *name, int namelen, + __u64 id, __u32 mode, int count, + int stripe_count, int stripe_offset) +{ + struct lu_object *parent; + struct lu_object *new_parent; + struct echo_thread_info *info = echo_env_info(env); + struct lu_name *lname = &info->eti_lname; + struct md_op_spec *spec = &info->eti_spec; + struct md_attr *ma = &info->eti_ma; + struct lu_device *ld = ed->ed_next; + int rc = 0; + int i; + + ENTRY; + + if (!ec_parent) + return -1; + parent = lu_object_locate(ec_parent->lo_header, ld->ld_type); + if (!parent) + RETURN(-ENXIO); + + rc = echo_md_dir_stripe_choose(env, ed, parent, name, namelen, + id, &new_parent); + if (rc != 0) + RETURN(rc); + + LASSERT(new_parent 
!= NULL); + memset(ma, 0, sizeof(*ma)); + memset(spec, 0, sizeof(*spec)); + echo_set_lmm_size(env, ld, ma); + if (stripe_count != 0) { + spec->sp_cr_flags |= MDS_FMODE_WRITE; + if (stripe_count != -1) { + if (S_ISDIR(mode)) { + struct lmv_user_md *lmu; + + lmu = (struct lmv_user_md *)&info->eti_lum; + lmu->lum_magic = LMV_USER_MAGIC; + lmu->lum_stripe_offset = stripe_offset; + lmu->lum_stripe_count = stripe_count; + lmu->lum_hash_type = LMV_HASH_TYPE_FNV_1A_64; + spec->u.sp_ea.eadata = lmu; + spec->u.sp_ea.eadatalen = sizeof(*lmu); + } else { + struct lov_user_md_v3 *lum = &info->eti_lum; + + lum->lmm_magic = LOV_USER_MAGIC_V3; + lum->lmm_stripe_count = stripe_count; + lum->lmm_stripe_offset = stripe_offset; + lum->lmm_pattern = LOV_PATTERN_NONE; + spec->u.sp_ea.eadata = lum; + spec->u.sp_ea.eadatalen = sizeof(*lum); + } + spec->sp_cr_flags |= MDS_OPEN_HAS_EA; + } + } + + ma->ma_attr.la_mode = mode; + ma->ma_attr.la_valid = LA_CTIME | LA_MODE; + ma->ma_attr.la_ctime = ktime_get_real_seconds(); + + if (name) { + lname->ln_name = name; + lname->ln_namelen = namelen; + /* If name is specified, only create one object by name */ + rc = echo_md_create_internal(env, ed, lu2md(new_parent), fid, + lname, spec, ma); + GOTO(out_put, rc); + } + + /* Create multiple object sequenced by id */ + for (i = 0; i < count; i++) { + char *tmp_name = info->eti_name; + + echo_md_build_name(lname, tmp_name, id); + + rc = echo_md_create_internal(env, ed, lu2md(new_parent), + fid, lname, spec, ma); + if (rc) { + CERROR("Can not create child %s: rc = %d\n", tmp_name, + rc); + break; + } + id++; + fid->f_oid++; + } + +out_put: + if (new_parent != parent) + lu_object_put(env, new_parent); + + RETURN(rc); +} + +static struct lu_object *echo_md_lookup(const struct lu_env *env, + struct echo_device *ed, + struct md_object *parent, + struct lu_name *lname) +{ + struct echo_thread_info *info = echo_env_info(env); + struct lu_fid *fid = &info->eti_fid; + struct lu_object *child; + int rc; + + ENTRY; + CDEBUG(D_INFO, "lookup %s in parent "DFID" %p\n", lname->ln_name, + PFID(fid), parent); + + rc = mdo_lookup(env, parent, lname, fid, NULL); + if (rc) { + CERROR("lookup %s: rc = %d\n", lname->ln_name, rc); + RETURN(ERR_PTR(rc)); + } + + /* + * In the function below, .hs_keycmp resolves to + * lu_obj_hop_keycmp() + */ + /* coverity[overrun-buffer-val] */ + child = lu_object_find_at(env, &ed->ed_cl.cd_lu_dev, fid, NULL); + + RETURN(child); +} + +static int echo_setattr_object(const struct lu_env *env, + struct echo_device *ed, + struct lu_object *ec_parent, + __u64 id, int count) +{ + struct lu_object *parent; + struct lu_object *new_parent; + struct echo_thread_info *info = echo_env_info(env); + struct lu_name *lname = &info->eti_lname; + char *name = info->eti_name; + struct lu_device *ld = ed->ed_next; + struct lu_buf *buf = &info->eti_buf; + int rc = 0; + int i; + + ENTRY; + + if (!ec_parent) + return -1; + parent = lu_object_locate(ec_parent->lo_header, ld->ld_type); + if (!parent) + RETURN(-ENXIO); + + rc = echo_md_dir_stripe_choose(env, ed, parent, NULL, 0, id, + &new_parent); + if (rc != 0) + RETURN(rc); + + for (i = 0; i < count; i++) { + struct lu_object *ec_child, *child; + + echo_md_build_name(lname, name, id); + + ec_child = echo_md_lookup(env, ed, lu2md(new_parent), lname); + if (IS_ERR(ec_child)) { + rc = PTR_ERR(ec_child); + CERROR("Can't find child %s: rc = %d\n", + lname->ln_name, rc); + break; + } + + child = lu_object_locate(ec_child->lo_header, ld->ld_type); + if (!child) { + CERROR("Can not locate the child 
%s\n", lname->ln_name); + lu_object_put(env, ec_child); + rc = -EINVAL; + break; + } + + CDEBUG(D_RPCTRACE, "Start setattr object "DFID"\n", + PFID(lu_object_fid(child))); + + buf->lb_buf = info->eti_xattr_buf; + buf->lb_len = sizeof(info->eti_xattr_buf); + + sprintf(name, "%s.test1", XATTR_USER_PREFIX); + rc = mo_xattr_set(env, lu2md(child), buf, name, + LU_XATTR_CREATE); + if (rc < 0) { + CERROR("Can not setattr child "DFID": rc = %d\n", + PFID(lu_object_fid(child)), rc); + lu_object_put(env, ec_child); + break; + } + CDEBUG(D_RPCTRACE, "End setattr object "DFID"\n", + PFID(lu_object_fid(child))); + id++; + lu_object_put(env, ec_child); + } + + if (new_parent != parent) + lu_object_put(env, new_parent); + + RETURN(rc); +} + +static int echo_getattr_object(const struct lu_env *env, + struct echo_device *ed, + struct lu_object *ec_parent, + __u64 id, int count) +{ + struct lu_object *parent; + struct lu_object *new_parent; + struct echo_thread_info *info = echo_env_info(env); + struct lu_name *lname = &info->eti_lname; + char *name = info->eti_name; + struct md_attr *ma = &info->eti_ma; + struct lu_device *ld = ed->ed_next; + int rc = 0; + int i; + + ENTRY; + + if (!ec_parent) + return -1; + parent = lu_object_locate(ec_parent->lo_header, ld->ld_type); + if (!parent) + RETURN(-ENXIO); + + rc = echo_md_dir_stripe_choose(env, ed, parent, NULL, 0, id, + &new_parent); + if (rc != 0) + RETURN(rc); + + memset(ma, 0, sizeof(*ma)); + ma->ma_need |= MA_INODE | MA_LOV | MA_PFID | MA_HSM | MA_ACL_DEF; + ma->ma_acl = info->eti_xattr_buf; + ma->ma_acl_size = sizeof(info->eti_xattr_buf); + + for (i = 0; i < count; i++) { + struct lu_object *ec_child, *child; + + ma->ma_valid = 0; + echo_md_build_name(lname, name, id); + echo_set_lmm_size(env, ld, ma); + + ec_child = echo_md_lookup(env, ed, lu2md(new_parent), lname); + if (IS_ERR(ec_child)) { + CERROR("Can't find child %s: rc = %ld\n", + lname->ln_name, PTR_ERR(ec_child)); + RETURN(PTR_ERR(ec_child)); + } + + child = lu_object_locate(ec_child->lo_header, ld->ld_type); + if (!child) { + CERROR("Can not locate the child %s\n", lname->ln_name); + lu_object_put(env, ec_child); + RETURN(-EINVAL); + } + + CDEBUG(D_RPCTRACE, "Start getattr object "DFID"\n", + PFID(lu_object_fid(child))); + rc = echo_attr_get_complex(env, lu2md(child), ma); + if (rc) { + CERROR("Can not getattr child "DFID": rc = %d\n", + PFID(lu_object_fid(child)), rc); + lu_object_put(env, ec_child); + break; + } + CDEBUG(D_RPCTRACE, "End getattr object "DFID"\n", + PFID(lu_object_fid(child))); + id++; + lu_object_put(env, ec_child); + } + + if (new_parent != parent) + lu_object_put(env, new_parent); + + RETURN(rc); +} + +static int echo_lookup_object(const struct lu_env *env, + struct echo_device *ed, + struct lu_object *ec_parent, + __u64 id, int count) +{ + struct lu_object *parent; + struct lu_object *new_parent; + struct echo_thread_info *info = echo_env_info(env); + struct lu_name *lname = &info->eti_lname; + char *name = info->eti_name; + struct lu_fid *fid = &info->eti_fid; + struct lu_device *ld = ed->ed_next; + int rc = 0; + int i; + + if (!ec_parent) + return -1; + parent = lu_object_locate(ec_parent->lo_header, ld->ld_type); + if (!parent) + return -ENXIO; + + rc = echo_md_dir_stripe_choose(env, ed, parent, NULL, 0, id, + &new_parent); + if (rc != 0) + RETURN(rc); + + /*prepare the requests*/ + for (i = 0; i < count; i++) { + echo_md_build_name(lname, name, id); + + CDEBUG(D_RPCTRACE, "Start lookup object "DFID" %s %p\n", + PFID(lu_object_fid(new_parent)), lname->ln_name, + 
new_parent); + + rc = mdo_lookup(env, lu2md(new_parent), lname, fid, NULL); + if (rc) { + CERROR("Can not lookup child %s: rc = %d\n", name, rc); + break; + } + + CDEBUG(D_RPCTRACE, "End lookup object "DFID" %s %p\n", + PFID(lu_object_fid(new_parent)), lname->ln_name, + new_parent); + + id++; + } + + if (new_parent != parent) + lu_object_put(env, new_parent); + + return rc; +} + +static int echo_md_destroy_internal(const struct lu_env *env, + struct echo_device *ed, + struct md_object *parent, + struct lu_name *lname, + struct md_attr *ma) +{ + struct lu_device *ld = ed->ed_next; + struct lu_object *ec_child; + struct lu_object *child; + int rc; + + ENTRY; + + ec_child = echo_md_lookup(env, ed, parent, lname); + if (IS_ERR(ec_child)) { + CERROR("Can't find child %s: rc = %ld\n", lname->ln_name, + PTR_ERR(ec_child)); + RETURN(PTR_ERR(ec_child)); + } + + child = lu_object_locate(ec_child->lo_header, ld->ld_type); + if (!child) { + CERROR("Can not locate the child %s\n", lname->ln_name); + GOTO(out_put, rc = -EINVAL); + } + + if (lu_object_remote(child)) { + CERROR("Can not destroy remote object %s: rc = %d\n", + lname->ln_name, -EPERM); + GOTO(out_put, rc = -EPERM); + } + CDEBUG(D_RPCTRACE, "Start destroy object "DFID" %s %p\n", + PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent); + + rc = mdo_unlink(env, parent, lu2md(child), lname, ma, 0); + if (rc) { + CERROR("Can not unlink child %s: rc = %d\n", + lname->ln_name, rc); + GOTO(out_put, rc); + } + CDEBUG(D_RPCTRACE, "End destroy object "DFID" %s %p\n", + PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent); +out_put: + lu_object_put(env, ec_child); + return rc; +} + +static int echo_destroy_object(const struct lu_env *env, + struct echo_device *ed, + struct lu_object *ec_parent, + char *name, int namelen, + __u64 id, __u32 mode, + int count) +{ + struct echo_thread_info *info = echo_env_info(env); + struct lu_name *lname = &info->eti_lname; + struct md_attr *ma = &info->eti_ma; + struct lu_device *ld = ed->ed_next; + struct lu_object *parent; + struct lu_object *new_parent; + int rc = 0; + int i; + + ENTRY; + parent = lu_object_locate(ec_parent->lo_header, ld->ld_type); + if (!parent) + RETURN(-EINVAL); + + rc = echo_md_dir_stripe_choose(env, ed, parent, name, namelen, + id, &new_parent); + if (rc != 0) + RETURN(rc); + + memset(ma, 0, sizeof(*ma)); + ma->ma_attr.la_mode = mode; + ma->ma_attr.la_valid = LA_CTIME; + ma->ma_attr.la_ctime = ktime_get_real_seconds(); + ma->ma_need = MA_INODE; + ma->ma_valid = 0; + + if (name) { + lname->ln_name = name; + lname->ln_namelen = namelen; + rc = echo_md_destroy_internal(env, ed, lu2md(new_parent), lname, + ma); + GOTO(out_put, rc); + } + + /*prepare the requests*/ + for (i = 0; i < count; i++) { + char *tmp_name = info->eti_name; + + ma->ma_valid = 0; + echo_md_build_name(lname, tmp_name, id); + + rc = echo_md_destroy_internal(env, ed, lu2md(new_parent), lname, + ma); + if (rc) { + CERROR("Can not unlink child %s: rc = %d\n", name, rc); + break; + } + id++; + } + +out_put: + if (new_parent != parent) + lu_object_put(env, new_parent); + + RETURN(rc); +} + +static struct lu_object *echo_resolve_path(const struct lu_env *env, + struct echo_device *ed, char *path, + int path_len) +{ + struct lu_device *ld = ed->ed_next; + struct echo_thread_info *info = echo_env_info(env); + struct lu_fid *fid = &info->eti_fid; + struct lu_name *lname = &info->eti_lname; + struct lu_object *parent = NULL; + struct lu_object *child = NULL; + int rc = 0; + + ENTRY; + *fid = ed->ed_root_fid; + + /* + * In 
the function below, .hs_keycmp resolves to + * lu_obj_hop_keycmp() + */ + /* coverity[overrun-buffer-val] */ + parent = lu_object_find_at(env, &ed->ed_cl.cd_lu_dev, fid, NULL); + if (IS_ERR(parent)) { + CERROR("Can not find the parent "DFID": rc = %ld\n", + PFID(fid), PTR_ERR(parent)); + RETURN(parent); + } + + while (1) { + struct lu_object *ld_parent; + char *e; + + e = strsep(&path, "/"); + if (!e) + break; + + if (e[0] == 0) { + if (!path || path[0] == '\0') + break; + continue; + } + + lname->ln_name = e; + lname->ln_namelen = strlen(e); + + ld_parent = lu_object_locate(parent->lo_header, ld->ld_type); + if (!ld_parent) { + lu_object_put(env, parent); + rc = -EINVAL; + break; + } + + child = echo_md_lookup(env, ed, lu2md(ld_parent), lname); + lu_object_put(env, parent); + if (IS_ERR(child)) { + rc = (int)PTR_ERR(child); + CERROR("lookup %s under parent "DFID": rc = %d\n", + lname->ln_name, PFID(lu_object_fid(ld_parent)), + rc); + break; + } + parent = child; + } + if (rc) + RETURN(ERR_PTR(rc)); + + RETURN(parent); +} + +static void echo_ucred_init(struct lu_env *env) +{ + struct lu_ucred *ucred = lu_ucred(env); + kernel_cap_t kcap = current_cap(); + + ucred->uc_valid = UCRED_INVALID; + + ucred->uc_suppgids[0] = -1; + ucred->uc_suppgids[1] = -1; + + ucred->uc_uid = ucred->uc_o_uid = + from_kuid(&init_user_ns, current_uid()); + ucred->uc_gid = ucred->uc_o_gid = + from_kgid(&init_user_ns, current_gid()); + ucred->uc_fsuid = ucred->uc_o_fsuid = + from_kuid(&init_user_ns, current_fsuid()); + ucred->uc_fsgid = ucred->uc_o_fsgid = + from_kgid(&init_user_ns, current_fsgid()); + ucred->uc_cap = current_cap(); + + /* remove fs privilege for non-root user. */ + if (ucred->uc_fsuid) { + kcap = cap_drop_nfsd_set(kcap); + kcap = cap_drop_fs_set(kcap); + } + ucred->uc_cap = kcap; + ucred->uc_valid = UCRED_NEW; +} + +static void echo_ucred_fini(struct lu_env *env) +{ + struct lu_ucred *ucred = lu_ucred(env); + + ucred->uc_valid = UCRED_INIT; +} + +static int echo_md_handler(struct echo_device *ed, int command, + char *path, int path_len, __u64 id, int count, + struct obd_ioctl_data *data) +{ + struct echo_thread_info *info; + struct lu_device *ld = ed->ed_next; + struct lu_env *env; + __u16 refcheck; + struct lu_object *parent; + char *name = NULL; + int namelen = data->ioc_plen2; + int rc = 0; + + ENTRY; + if (!ld) { + CERROR("MD echo client is not being initialized properly\n"); + RETURN(-EINVAL); + } + + if (strcmp(ld->ld_type->ldt_name, LUSTRE_MDD_NAME)) { + CERROR("Only support MDD layer right now!\n"); + RETURN(-EINVAL); + } + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + rc = lu_env_refill_by_tags(env, ECHO_MD_CTX_TAG, ECHO_SES_TAG); + if (rc != 0) + GOTO(out_env, rc); + + /* init big_lmm buffer */ + info = echo_env_info(env); + LASSERT(info->eti_big_lmm == NULL); + OBD_ALLOC_LARGE(info->eti_big_lmm, MIN_MD_SIZE); + if (!info->eti_big_lmm) + GOTO(out_env, rc = -ENOMEM); + info->eti_big_lmmsize = MIN_MD_SIZE; + + parent = echo_resolve_path(env, ed, path, path_len); + if (IS_ERR(parent)) { + CERROR("Can not resolve the path %s: rc = %ld\n", path, + PTR_ERR(parent)); + GOTO(out_free, rc = PTR_ERR(parent)); + } + + if (namelen > 0) { + OBD_ALLOC(name, namelen + 1); + if (!name) + GOTO(out_put, rc = -ENOMEM); + if (copy_from_user(name, data->ioc_pbuf2, namelen)) + GOTO(out_name, rc = -EFAULT); + } + + echo_ucred_init(env); + + switch (command) { + case ECHO_MD_CREATE: + case ECHO_MD_MKDIR: { + struct echo_thread_info *info = echo_env_info(env); + __u32 mode = 
data->ioc_obdo2.o_mode; + struct lu_fid *fid = &info->eti_fid; + int stripe_count = (int)data->ioc_obdo2.o_misc; + int stripe_index = (int)data->ioc_obdo2.o_stripe_idx; + + rc = ostid_to_fid(fid, &data->ioc_obdo1.o_oi, 0); + if (rc != 0) + break; + + /* + * In the function below, .hs_keycmp resolves to + * lu_obj_hop_keycmp() + */ + /* coverity[overrun-buffer-val] */ + rc = echo_create_md_object(env, ed, parent, fid, name, namelen, + id, mode, count, stripe_count, + stripe_index); + break; + } + case ECHO_MD_DESTROY: + case ECHO_MD_RMDIR: { + __u32 mode = data->ioc_obdo2.o_mode; + + rc = echo_destroy_object(env, ed, parent, name, namelen, + id, mode, count); + break; + } + case ECHO_MD_LOOKUP: + rc = echo_lookup_object(env, ed, parent, id, count); + break; + case ECHO_MD_GETATTR: + rc = echo_getattr_object(env, ed, parent, id, count); + break; + case ECHO_MD_SETATTR: + rc = echo_setattr_object(env, ed, parent, id, count); + break; + default: + CERROR("unknown command %d\n", command); + rc = -EINVAL; + break; + } + echo_ucred_fini(env); + +out_name: + if (name) + OBD_FREE(name, namelen + 1); +out_put: + lu_object_put(env, parent); +out_free: + LASSERT(info->eti_big_lmm); + OBD_FREE_LARGE(info->eti_big_lmm, info->eti_big_lmmsize); + info->eti_big_lmm = NULL; + info->eti_big_lmmsize = 0; +out_env: + cl_env_put(env, &refcheck); + return rc; +} +#endif /* HAVE_SERVER_SUPPORT */ + +static int echo_create_object(const struct lu_env *env, struct echo_device *ed, + struct obdo *oa) +{ + struct echo_object *eco; + struct echo_client_obd *ec = ed->ed_ec; + int created = 0; + int rc; + + ENTRY; + if (!(oa->o_valid & OBD_MD_FLID) || + !(oa->o_valid & OBD_MD_FLGROUP) || + !fid_seq_is_echo(ostid_seq(&oa->o_oi))) { + CERROR("invalid oid "DOSTID"\n", POSTID(&oa->o_oi)); + RETURN(-EINVAL); + } + + if (ostid_id(&oa->o_oi) == 0) { + rc = ostid_set_id(&oa->o_oi, ++last_object_id); + if (rc) + GOTO(failed, rc); + } + + rc = obd_create(env, ec->ec_exp, oa); + if (rc != 0) { + CERROR("Cannot create objects: rc = %d\n", rc); + GOTO(failed, rc); + } + + created = 1; + + oa->o_valid |= OBD_MD_FLID; + + eco = cl_echo_object_find(ed, &oa->o_oi); + if (IS_ERR(eco)) + GOTO(failed, rc = PTR_ERR(eco)); + cl_echo_object_put(eco); + + CDEBUG(D_INFO, "oa oid "DOSTID"\n", POSTID(&oa->o_oi)); + EXIT; + +failed: + if (created && rc != 0) + obd_destroy(env, ec->ec_exp, oa); + + if (rc != 0) + CERROR("create object failed with: rc = %d\n", rc); + + return rc; +} + +static int echo_get_object(struct echo_object **ecop, struct echo_device *ed, + struct obdo *oa) +{ + struct echo_object *eco; + int rc; + + ENTRY; + if (!(oa->o_valid & OBD_MD_FLID) || + !(oa->o_valid & OBD_MD_FLGROUP) || + ostid_id(&oa->o_oi) == 0) { + CERROR("invalid oid "DOSTID"\n", POSTID(&oa->o_oi)); + RETURN(-EINVAL); + } + + rc = 0; + eco = cl_echo_object_find(ed, &oa->o_oi); + if (!IS_ERR(eco)) + *ecop = eco; + else + rc = PTR_ERR(eco); + + RETURN(rc); +} + +static void echo_put_object(struct echo_object *eco) +{ + int rc; + + rc = cl_echo_object_put(eco); + if (rc) + CERROR("%s: echo client drop an object failed: rc = %d\n", + eco->eo_dev->ed_ec->ec_exp->exp_obd->obd_name, rc); +} + +static void echo_client_page_debug_setup(struct page *page, int rw, u64 id, + u64 offset, u64 count) +{ + char *addr; + u64 stripe_off; + u64 stripe_id; + int delta; + + /* no partial pages on the client */ + LASSERT(count == PAGE_SIZE); + + addr = kmap(page); + + for (delta = 0; delta < PAGE_SIZE; delta += OBD_ECHO_BLOCK_SIZE) { + if (rw == OBD_BRW_WRITE) { + stripe_off = offset 
+ delta; + stripe_id = id; + } else { + stripe_off = 0xdeadbeef00c0ffeeULL; + stripe_id = 0xdeadbeef00c0ffeeULL; + } + block_debug_setup(addr + delta, OBD_ECHO_BLOCK_SIZE, + stripe_off, stripe_id); + } + + kunmap(page); +} + +static int +echo_client_page_debug_check(struct page *page, u64 id, u64 offset, u64 count) +{ + u64 stripe_off; + u64 stripe_id; + char *addr; + int delta; + int rc; + int rc2; + + /* no partial pages on the client */ + LASSERT(count == PAGE_SIZE); + + addr = kmap(page); + + for (rc = delta = 0; delta < PAGE_SIZE; delta += OBD_ECHO_BLOCK_SIZE) { + stripe_off = offset + delta; + stripe_id = id; + + rc2 = block_debug_check("test_brw", + addr + delta, OBD_ECHO_BLOCK_SIZE, + stripe_off, stripe_id); + if (rc2 != 0) { + CERROR("Error in echo object %#llx\n", id); + rc = rc2; + } + } + + kunmap(page); + return rc; +} + +static int echo_client_kbrw(struct echo_device *ed, int rw, struct obdo *oa, + struct echo_object *eco, u64 offset, + u64 count, int async) +{ + size_t npages; + struct brw_page *pga; + struct brw_page *pgp; + struct page **pages; + u64 off; + size_t i; + int rc; + int verify; + gfp_t gfp_mask; + u32 brw_flags = 0; + + ENTRY; + verify = (ostid_id(&oa->o_oi) != ECHO_PERSISTENT_OBJID && + (oa->o_valid & OBD_MD_FLFLAGS) != 0 && + (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0); + + gfp_mask = ((ostid_id(&oa->o_oi) & 2) == 0) ? GFP_KERNEL : GFP_HIGHUSER; + + LASSERT(rw == OBD_BRW_WRITE || rw == OBD_BRW_READ); + + if ((count & (~PAGE_MASK)) != 0) + RETURN(-EINVAL); + + /* XXX think again with misaligned I/O */ + npages = count >> PAGE_SHIFT; + + if (rw == OBD_BRW_WRITE) + brw_flags = OBD_BRW_ASYNC; + + OBD_ALLOC_PTR_ARRAY_LARGE(pga, npages); + if (!pga) + RETURN(-ENOMEM); + + OBD_ALLOC_PTR_ARRAY_LARGE(pages, npages); + if (!pages) { + OBD_FREE_PTR_ARRAY_LARGE(pga, npages); + RETURN(-ENOMEM); + } + + for (i = 0, pgp = pga, off = offset; + i < npages; + i++, pgp++, off += PAGE_SIZE) { + + LASSERT(pgp->pg == NULL); /* for cleanup */ + + rc = -ENOMEM; + pgp->pg = alloc_page(gfp_mask); + if (!pgp->pg) + goto out; + + /* set mapping so page is not considered encrypted */ + pgp->pg->mapping = ECHO_MAPPING_UNENCRYPTED; + pages[i] = pgp->pg; + pgp->count = PAGE_SIZE; + pgp->off = off; + pgp->flag = brw_flags; + + if (verify) + echo_client_page_debug_setup(pgp->pg, rw, + ostid_id(&oa->o_oi), off, + pgp->count); + } + + /* brw mode can only be used at client */ + LASSERT(ed->ed_next != NULL); + rc = cl_echo_object_brw(eco, rw, offset, pages, npages, async); + + out: + if (rc != 0 || rw != OBD_BRW_READ) + verify = 0; + + for (i = 0, pgp = pga; i < npages; i++, pgp++) { + if (!pgp->pg) + continue; + + if (verify) { + int vrc; + + vrc = echo_client_page_debug_check(pgp->pg, + ostid_id(&oa->o_oi), + pgp->off, + pgp->count); + if (vrc != 0 && rc == 0) + rc = vrc; + } + __free_page(pgp->pg); + } + OBD_FREE_PTR_ARRAY_LARGE(pga, npages); + OBD_FREE_PTR_ARRAY_LARGE(pages, npages); + RETURN(rc); +} + +static int echo_client_prep_commit(const struct lu_env *env, + struct obd_export *exp, int rw, + struct obdo *oa, struct echo_object *eco, + u64 offset, u64 count, + u64 batch, int async) +{ + struct obd_ioobj ioo; + struct niobuf_local *lnb; + struct niobuf_remote rnb; + u64 off; + u64 npages, tot_pages, apc; + int i, ret = 0, brw_flags = 0; + + ENTRY; + if (count <= 0 || (count & ~PAGE_MASK) != 0) + RETURN(-EINVAL); + + apc = npages = batch >> PAGE_SHIFT; + tot_pages = count >> PAGE_SHIFT; + + OBD_ALLOC_PTR_ARRAY_LARGE(lnb, apc); + if (!lnb) + RETURN(-ENOMEM); + + if (rw == OBD_BRW_WRITE && 
async) + brw_flags |= OBD_BRW_ASYNC; + + obdo_to_ioobj(oa, &ioo); + + off = offset; + + for (; tot_pages > 0; tot_pages -= npages) { + int lpages; + + if (tot_pages < npages) + npages = tot_pages; + + rnb.rnb_offset = off; + rnb.rnb_len = npages * PAGE_SIZE; + rnb.rnb_flags = brw_flags; + ioo.ioo_bufcnt = 1; + off += npages * PAGE_SIZE; + + lpages = npages; + ret = obd_preprw(env, rw, exp, oa, 1, &ioo, &rnb, &lpages, lnb); + if (ret != 0) + GOTO(out, ret); + + for (i = 0; i < lpages; i++) { + struct page *page = lnb[i].lnb_page; + + /* read past eof? */ + if (!page && lnb[i].lnb_rc == 0) + continue; + + if (async) + lnb[i].lnb_flags |= OBD_BRW_ASYNC; + + if (ostid_id(&oa->o_oi) == ECHO_PERSISTENT_OBJID || + (oa->o_valid & OBD_MD_FLFLAGS) == 0 || + (oa->o_flags & OBD_FL_DEBUG_CHECK) == 0) + continue; + + if (rw == OBD_BRW_WRITE) + echo_client_page_debug_setup(page, rw, + ostid_id(&oa->o_oi), + lnb[i].lnb_file_offset, + lnb[i].lnb_len); + else + echo_client_page_debug_check(page, + ostid_id(&oa->o_oi), + lnb[i].lnb_file_offset, + lnb[i].lnb_len); + } + + ret = obd_commitrw(env, rw, exp, oa, 1, &ioo, &rnb, npages, lnb, + ret, rnb.rnb_len, ktime_set(0, 0)); + if (ret != 0) + break; + + /* Reuse env context. */ + lu_context_exit((struct lu_context *)&env->le_ctx); + lu_context_enter((struct lu_context *)&env->le_ctx); + } + +out: + OBD_FREE_PTR_ARRAY_LARGE(lnb, apc); + + RETURN(ret); +} + +static int echo_client_brw_ioctl(const struct lu_env *env, int rw, + struct obd_export *exp, + struct obd_ioctl_data *data) +{ + struct obd_device *obd = class_exp2obd(exp); + struct echo_device *ed = obd2echo_dev(obd); + struct echo_client_obd *ec = ed->ed_ec; + struct obdo *oa = &data->ioc_obdo1; + struct echo_object *eco; + int rc; + int async = 0; + long test_mode; + + ENTRY; + LASSERT(oa->o_valid & OBD_MD_FLGROUP); + + rc = echo_get_object(&eco, ed, oa); + if (rc) + RETURN(rc); + + oa->o_valid &= ~OBD_MD_FLHANDLE; + + /* OFD/obdfilter works only via prep/commit */ + test_mode = (long)data->ioc_pbuf1; + if (!ed->ed_next && test_mode != 3) { + test_mode = 3; + data->ioc_plen1 = data->ioc_count; + } + + if (test_mode == 3) + async = 1; + + /* Truncate batch size to maximum */ + if (data->ioc_plen1 > PTLRPC_MAX_BRW_SIZE) + data->ioc_plen1 = PTLRPC_MAX_BRW_SIZE; + + switch (test_mode) { + case 1: + fallthrough; + case 2: + rc = echo_client_kbrw(ed, rw, oa, eco, data->ioc_offset, + data->ioc_count, async); + break; + case 3: + rc = echo_client_prep_commit(env, ec->ec_exp, rw, oa, eco, + data->ioc_offset, data->ioc_count, + data->ioc_plen1, async); + break; + default: + rc = -EINVAL; + } + + echo_put_object(eco); + + RETURN(rc); +} + +static int +echo_client_iocontrol(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void __user *uarg) +{ +#ifdef HAVE_SERVER_SUPPORT + struct tgt_session_info *tsi; +#endif + struct obd_device *obd = exp->exp_obd; + struct echo_device *ed = obd2echo_dev(obd); + struct echo_client_obd *ec = ed->ed_ec; + struct echo_object *eco; + struct obd_ioctl_data *data = karg; + struct lu_env *env; + unsigned long env_tags = 0; + __u16 refcheck; + struct obdo *oa; + struct lu_fid fid; + int rw = OBD_BRW_READ; + int rc = 0; + + ENTRY; + oa = &data->ioc_obdo1; + if (!(oa->o_valid & OBD_MD_FLGROUP)) { + oa->o_valid |= OBD_MD_FLGROUP; + ostid_set_seq_echo(&oa->o_oi); + } + + /* This FID is unpacked just for validation at this point */ + rc = ostid_to_fid(&fid, &oa->o_oi, 0); + if (rc < 0) + RETURN(rc); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + 
lu_env_add(env); + +#ifdef HAVE_SERVER_SUPPORT + if (cmd == OBD_IOC_ECHO_MD || cmd == OBD_IOC_ECHO_ALLOC_SEQ) + env_tags = ECHO_MD_CTX_TAG; + else +#endif + env_tags = ECHO_DT_CTX_TAG; + + rc = lu_env_refill_by_tags(env, env_tags, ECHO_SES_TAG); + if (rc != 0) + GOTO(out, rc); + +#ifdef HAVE_SERVER_SUPPORT + tsi = tgt_ses_info(env); + /* treat as local operation */ + tsi->tsi_exp = NULL; + tsi->tsi_jobid = NULL; +#endif + + switch (cmd) { + case OBD_IOC_CREATE: /* may create echo object */ + if (!capable(CAP_SYS_ADMIN)) + GOTO(out, rc = -EPERM); + + rc = echo_create_object(env, ed, oa); + GOTO(out, rc); + +#ifdef HAVE_SERVER_SUPPORT + case OBD_IOC_ECHO_MD: { + int count; + int cmd; + char *dir = NULL; + int dirlen; + __u64 id; + + if (!capable(CAP_SYS_ADMIN)) + GOTO(out, rc = -EPERM); + + count = data->ioc_count; + cmd = data->ioc_command; + + id = data->ioc_obdo2.o_oi.oi.oi_id; + dirlen = data->ioc_plen1; + OBD_ALLOC(dir, dirlen + 1); + if (!dir) + GOTO(out, rc = -ENOMEM); + + if (copy_from_user(dir, data->ioc_pbuf1, dirlen)) { + OBD_FREE(dir, data->ioc_plen1 + 1); + GOTO(out, rc = -EFAULT); + } + + rc = echo_md_handler(ed, cmd, dir, dirlen, id, count, data); + OBD_FREE(dir, dirlen + 1); + GOTO(out, rc); + } + case OBD_IOC_ECHO_ALLOC_SEQ: { + __u64 seq; + int max_count; + + if (!capable(CAP_SYS_ADMIN)) + GOTO(out, rc = -EPERM); + + rc = seq_client_get_seq(env, ed->ed_cl_seq, &seq); + if (rc < 0) { + CERROR("%s: Can not alloc seq: rc = %d\n", + obd->obd_name, rc); + GOTO(out, rc); + } + + if (copy_to_user(data->ioc_pbuf1, &seq, data->ioc_plen1)) + return -EFAULT; + + max_count = LUSTRE_METADATA_SEQ_MAX_WIDTH; + if (copy_to_user(data->ioc_pbuf2, &max_count, + data->ioc_plen2)) + return -EFAULT; + GOTO(out, rc); + } +#endif /* HAVE_SERVER_SUPPORT */ + case OBD_IOC_DESTROY: + if (!capable(CAP_SYS_ADMIN)) + GOTO(out, rc = -EPERM); + + rc = echo_get_object(&eco, ed, oa); + if (rc == 0) { + rc = obd_destroy(env, ec->ec_exp, oa); + if (rc == 0) + eco->eo_deleted = 1; + echo_put_object(eco); + } + GOTO(out, rc); + + case OBD_IOC_GETATTR: + rc = echo_get_object(&eco, ed, oa); + if (rc == 0) { + rc = obd_getattr(env, ec->ec_exp, oa); + echo_put_object(eco); + } + GOTO(out, rc); + + case OBD_IOC_SETATTR: + if (!capable(CAP_SYS_ADMIN)) + GOTO(out, rc = -EPERM); + + rc = echo_get_object(&eco, ed, oa); + if (rc == 0) { + rc = obd_setattr(env, ec->ec_exp, oa); + echo_put_object(eco); + } + GOTO(out, rc); + + case OBD_IOC_BRW_WRITE: + if (!capable(CAP_SYS_ADMIN)) + GOTO(out, rc = -EPERM); + + rw = OBD_BRW_WRITE; + fallthrough; + case OBD_IOC_BRW_READ: + rc = echo_client_brw_ioctl(env, rw, exp, data); + GOTO(out, rc); + + default: + CERROR("echo_ioctl(): unrecognised ioctl %#x\n", cmd); + GOTO(out, rc = -ENOTTY); + } + + EXIT; +out: + lu_env_remove(env); + cl_env_put(env, &refcheck); + + return rc; +} + +static int echo_client_setup(const struct lu_env *env, + struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct echo_client_obd *ec = &obd->u.echo_client; + struct obd_device *tgt; + struct obd_uuid echo_uuid = { "ECHO_UUID" }; + struct obd_connect_data *ocd = NULL; + int rc; + + ENTRY; + if (lcfg->lcfg_bufcount < 2 || LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { + CERROR("requires a TARGET OBD name\n"); + RETURN(-EINVAL); + } + + tgt = class_name2obd(lustre_cfg_string(lcfg, 1)); + if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) { + CERROR("device not attached or not set up (%s)\n", + lustre_cfg_string(lcfg, 1)); + RETURN(-EINVAL); + } + + spin_lock_init(&ec->ec_lock); + INIT_LIST_HEAD(&ec->ec_objects); + 
INIT_LIST_HEAD(&ec->ec_locks); + ec->ec_unique = 0; + + lu_context_tags_update(ECHO_DT_CTX_TAG); + lu_session_tags_update(ECHO_SES_TAG); + + if (!strcmp(tgt->obd_type->typ_name, LUSTRE_MDT_NAME)) { +#ifdef HAVE_SERVER_SUPPORT + lu_context_tags_update(ECHO_MD_CTX_TAG); +#else + CERROR( + "Local operations are NOT supported on client side. Only remote operations are supported. Metadata client must be run on server side.\n"); +#endif + RETURN(0); + } + + OBD_ALLOC(ocd, sizeof(*ocd)); + if (!ocd) { + CERROR("Can't alloc ocd connecting to %s\n", + lustre_cfg_string(lcfg, 1)); + return -ENOMEM; + } + + ocd->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_REQPORTAL | + OBD_CONNECT_BRW_SIZE | + OBD_CONNECT_GRANT | OBD_CONNECT_FULL20 | + OBD_CONNECT_64BITHASH | OBD_CONNECT_LVB_TYPE | + OBD_CONNECT_FID | OBD_CONNECT_FLAGS2; + ocd->ocd_connect_flags2 = OBD_CONNECT2_REP_MBITS; + + ocd->ocd_brw_size = DT_MAX_BRW_SIZE; + ocd->ocd_version = LUSTRE_VERSION_CODE; + ocd->ocd_group = FID_SEQ_ECHO; + + rc = obd_connect(env, &ec->ec_exp, tgt, &echo_uuid, ocd, NULL); + if (rc == 0) { + /* Turn off pinger because it connects to tgt obd directly. */ + spin_lock(&tgt->obd_dev_lock); + list_del_init(&ec->ec_exp->exp_obd_chain_timed); + spin_unlock(&tgt->obd_dev_lock); + } + + OBD_FREE(ocd, sizeof(*ocd)); + + if (rc != 0) { + CERROR("fail to connect to device %s\n", + lustre_cfg_string(lcfg, 1)); + return rc; + } + + RETURN(rc); +} + +static int echo_client_cleanup(struct obd_device *obd) +{ + struct echo_device *ed = obd2echo_dev(obd); + struct echo_client_obd *ec = &obd->u.echo_client; + int rc; + + ENTRY; + /*Do nothing for Metadata echo client*/ + if (!ed) + RETURN(0); + + lu_session_tags_clear(ECHO_SES_TAG & ~LCT_SESSION); + lu_context_tags_clear(ECHO_DT_CTX_TAG); + if (ed->ed_next_ismd) { +#ifdef HAVE_SERVER_SUPPORT + lu_context_tags_clear(ECHO_MD_CTX_TAG); +#else + CERROR( + "This is client-side only module, does not support metadata echo client.\n"); +#endif + RETURN(0); + } + + if (!list_empty(&obd->obd_exports)) { + CERROR("still has clients!\n"); + RETURN(-EBUSY); + } + + LASSERT(refcount_read(&ec->ec_exp->exp_handle.h_ref) > 0); + rc = obd_disconnect(ec->ec_exp); + if (rc != 0) + CERROR("fail to disconnect device: %d\n", rc); + + RETURN(rc); +} + +static int echo_client_connect(const struct lu_env *env, + struct obd_export **exp, + struct obd_device *src, struct obd_uuid *cluuid, + struct obd_connect_data *data, void *localdata) +{ + int rc; + struct lustre_handle conn = { 0 }; + + ENTRY; + rc = class_connect(&conn, src, cluuid); + if (rc == 0) + *exp = class_conn2export(&conn); + + RETURN(rc); +} + +static int echo_client_disconnect(struct obd_export *exp) +{ + int rc; + + ENTRY; + if (!exp) + GOTO(out, rc = -EINVAL); + + rc = class_disconnect(exp); + GOTO(out, rc); +out: + return rc; +} + +static const struct obd_ops echo_client_obd_ops = { + .o_owner = THIS_MODULE, + .o_iocontrol = echo_client_iocontrol, + .o_connect = echo_client_connect, + .o_disconnect = echo_client_disconnect +}; + +static int __init obdecho_init(void) +{ + int rc; + + ENTRY; + LCONSOLE_INFO("Echo OBD driver; http://www.lustre.org/\n"); + + LASSERT(PAGE_SIZE % OBD_ECHO_BLOCK_SIZE == 0); + +# ifdef HAVE_SERVER_SUPPORT + rc = echo_persistent_pages_init(); + if (rc != 0) + goto failed_0; + + rc = class_register_type(&echo_obd_ops, NULL, true, + LUSTRE_ECHO_NAME, &echo_srv_type); + if (rc != 0) + goto failed_1; +# endif + + rc = lu_kmem_init(echo_caches); + if (rc == 0) { + rc = class_register_type(&echo_client_obd_ops, NULL, 
false, + LUSTRE_ECHO_CLIENT_NAME, + &echo_device_type); + if (rc) + lu_kmem_fini(echo_caches); + } + +# ifdef HAVE_SERVER_SUPPORT + if (rc == 0) + RETURN(0); + + class_unregister_type(LUSTRE_ECHO_NAME); +failed_1: + echo_persistent_pages_fini(); +failed_0: +# endif + RETURN(rc); +} + +static void __exit obdecho_exit(void) +{ + class_unregister_type(LUSTRE_ECHO_CLIENT_NAME); + lu_kmem_fini(echo_caches); + +#ifdef HAVE_SERVER_SUPPORT + class_unregister_type(LUSTRE_ECHO_NAME); + echo_persistent_pages_fini(); +#endif +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre Echo Client test driver"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(obdecho_init); +module_exit(obdecho_exit); + +/** @} echo_client */ diff --git a/drivers/staging/lustrefsx/lustre/obdecho/echo_internal.h b/drivers/staging/lustrefsx/lustre/obdecho/echo_internal.h new file mode 100644 index 0000000000000..158fc9745707c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdecho/echo_internal.h @@ -0,0 +1,59 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014 Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdecho/echo_internal.h + */ + +#ifndef _ECHO_INTERNAL_H +#define _ECHO_INTERNAL_H + +/* The persistent object (i.e. actually stores stuff!) */ +#define ECHO_PERSISTENT_OBJID 1ULL +#define ECHO_PERSISTENT_SIZE ((__u64)(1<<20)) + +/* block size to use for data verification */ +#define OBD_ECHO_BLOCK_SIZE (4<<10) + +#ifdef HAVE_SERVER_SUPPORT +extern const struct obd_ops echo_obd_ops; +extern struct lu_device_type echo_srv_type; +int echo_persistent_pages_init(void); +void echo_persistent_pages_fini(void); +#endif /* HAVE_SERVER_SUPPORT */ + +/* mapping value to tell page is not encrypted */ +#define ECHO_MAPPING_UNENCRYPTED ((void *)1) + +/* debug.c */ +int block_debug_setup(void *addr, int len, u64 off, u64 id); +int block_debug_check(char *who, void *addr, int len, u64 off, u64 id); + +#endif diff --git a/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c b/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c new file mode 100644 index 0000000000000..78a8c14e17298 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c @@ -0,0 +1,916 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include + +#include "osc_internal.h" + +static ssize_t active_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_import *imp; + int rc; + + with_imp_locked(obd, imp, rc) + rc = sprintf(buf, "%d\n", !imp->imp_deactive); + + return rc; +} + +static ssize_t active_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_import *imp, *imp0; + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + with_imp_locked(obd, imp0, rc) + imp = class_import_get(imp0); + if (rc) + return rc; + /* opposite senses */ + if (imp->imp_deactive == val) + rc = ptlrpc_set_import_active(imp, val); + else + CDEBUG(D_CONFIG, "activate %u: ignoring repeat request\n", + (unsigned int)val); + class_import_put(imp); + + return rc ?: count; +} +LUSTRE_RW_ATTR(active); + +static ssize_t max_rpcs_in_flight_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &obd->u.cli; + + return scnprintf(buf, PAGE_SIZE, "%u\n", cli->cl_max_rpcs_in_flight); +} + +static ssize_t max_rpcs_in_flight_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &obd->u.cli; + int adding, added, req_count; + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 0, &val); + if (rc) + return rc; + + if (val == 0 || val > OSC_MAX_RIF_MAX) + return -ERANGE; + + adding = (int)val - cli->cl_max_rpcs_in_flight; + req_count = atomic_read(&osc_pool_req_count); + if (adding > 0 && req_count < osc_reqpool_maxreqcount) { + /* + * There might be some race which will cause over-limit + * allocation, but it is fine. 
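+		 * (osc_pool_req_count is sampled with atomic_read() and not
+		 * re-checked under a lock, so two concurrent writers may both
+		 * pass this test; the pool then only grows slightly past
+		 * osc_reqpool_maxreqcount, which is harmless.)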
+ */ + if (req_count + adding > osc_reqpool_maxreqcount) + adding = osc_reqpool_maxreqcount - req_count; + + added = osc_rq_pool->prp_populate(osc_rq_pool, adding); + atomic_add(added, &osc_pool_req_count); + } + + spin_lock(&cli->cl_loi_list_lock); + cli->cl_max_rpcs_in_flight = val; + client_adjust_max_dirty(cli); + spin_unlock(&cli->cl_loi_list_lock); + + return count; +} +LUSTRE_RW_ATTR(max_rpcs_in_flight); + +static ssize_t max_dirty_mb_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &obd->u.cli; + + return scnprintf(buf, PAGE_SIZE, "%lu\n", + PAGES_TO_MiB(cli->cl_dirty_max_pages)); +} + +static ssize_t max_dirty_mb_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &obd->u.cli; + unsigned long pages_number, max_dirty_mb; + int rc; + + rc = kstrtoul(buffer, 10, &max_dirty_mb); + if (rc) + return rc; + + pages_number = MiB_TO_PAGES(max_dirty_mb); + + if (pages_number >= MiB_TO_PAGES(OSC_MAX_DIRTY_MB_MAX) || + pages_number > cfs_totalram_pages() / 4) /* 1/4 of RAM */ + return -ERANGE; + + spin_lock(&cli->cl_loi_list_lock); + cli->cl_dirty_max_pages = pages_number; + osc_wake_cache_waiters(cli); + spin_unlock(&cli->cl_loi_list_lock); + + return count; +} +LUSTRE_RW_ATTR(max_dirty_mb); + +LUSTRE_ATTR(ost_conn_uuid, 0444, conn_uuid_show, NULL); +LUSTRE_RO_ATTR(conn_uuid); + +LUSTRE_RW_ATTR(ping); + +static int osc_cached_mb_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + struct client_obd *cli = &obd->u.cli; + int shift = 20 - PAGE_SHIFT; + + seq_printf(m, "used_mb: %ld\n" + "busy_cnt: %ld\n" + "reclaim: %llu\n", + (atomic_long_read(&cli->cl_lru_in_list) + + atomic_long_read(&cli->cl_lru_busy)) >> shift, + atomic_long_read(&cli->cl_lru_busy), + cli->cl_lru_reclaim); + + return 0; +} + +/* shrink the number of caching pages to a specific number */ +static ssize_t osc_cached_mb_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + struct client_obd *cli = &obd->u.cli; + u64 pages_number; + const char *tmp; + long rc; + char kernbuf[128]; + + if (count >= sizeof(kernbuf)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + kernbuf[count] = 0; + + tmp = lprocfs_find_named_value(kernbuf, "used_mb:", &count); + rc = sysfs_memparse(tmp, count, &pages_number, "MiB"); + if (rc < 0) + return rc; + + pages_number >>= PAGE_SHIFT; + + rc = atomic_long_read(&cli->cl_lru_in_list) - pages_number; + if (rc > 0) { + struct lu_env *env; + __u16 refcheck; + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + (void)osc_lru_shrink(env, cli, rc, true); + cl_env_put(env, &refcheck); + } + } + + return count; +} + +LPROC_SEQ_FOPS(osc_cached_mb); + +static ssize_t cur_dirty_bytes_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &obd->u.cli; + + return scnprintf(buf, PAGE_SIZE, "%lu\n", + cli->cl_dirty_pages << PAGE_SHIFT); +} +LUSTRE_RO_ATTR(cur_dirty_bytes); + +static int osc_cur_grant_bytes_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + struct client_obd *cli = &obd->u.cli; + + 
seq_printf(m, "%lu\n", cli->cl_avail_grant); + return 0; +} + +static ssize_t osc_cur_grant_bytes_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + struct client_obd *cli = &obd->u.cli; + struct obd_import *imp; + char kernbuf[22] = ""; + u64 val; + int rc; + + if (obd == NULL) + return 0; + + if (count >= sizeof(kernbuf)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + kernbuf[count] = 0; + + rc = sysfs_memparse(kernbuf, count, &val, "MiB"); + if (rc < 0) + return rc; + + /* this is only for shrinking grant */ + if (val >= cli->cl_avail_grant) + return 0; + + with_imp_locked(obd, imp, rc) + if (imp->imp_state == LUSTRE_IMP_FULL) + rc = osc_shrink_grant_to_target(cli, val); + + return rc ? rc : count; +} +LPROC_SEQ_FOPS(osc_cur_grant_bytes); + +static ssize_t cur_lost_grant_bytes_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &obd->u.cli; + + return scnprintf(buf, PAGE_SIZE, "%lu\n", cli->cl_lost_grant); +} +LUSTRE_RO_ATTR(cur_lost_grant_bytes); + +static ssize_t cur_dirty_grant_bytes_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &obd->u.cli; + + return scnprintf(buf, PAGE_SIZE, "%lu\n", cli->cl_dirty_grant); +} +LUSTRE_RO_ATTR(cur_dirty_grant_bytes); + +static ssize_t grant_shrink_interval_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return sprintf(buf, "%lld\n", obd->u.cli.cl_grant_shrink_interval); +} + +static ssize_t grant_shrink_interval_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 0, &val); + if (rc) + return rc; + + if (val == 0) + return -ERANGE; + + obd->u.cli.cl_grant_shrink_interval = val; + osc_update_next_shrink(&obd->u.cli); + osc_schedule_grant_work(); + + return count; +} +LUSTRE_RW_ATTR(grant_shrink_interval); + +static ssize_t checksums_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%d\n", !!obd->u.cli.cl_checksum); +} + +static ssize_t checksums_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + obd->u.cli.cl_checksum = val; + + return count; +} +LUSTRE_RW_ATTR(checksums); + +DECLARE_CKSUM_NAME; + +static int osc_checksum_type_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + int i; + + if (obd == NULL) + return 0; + + for (i = 0; i < ARRAY_SIZE(cksum_name); i++) { + if ((BIT(i) & obd->u.cli.cl_supp_cksum_types) == 0) + continue; + if (obd->u.cli.cl_cksum_type == BIT(i)) + seq_printf(m, "[%s] ", cksum_name[i]); + else + seq_printf(m, "%s ", cksum_name[i]); + } + seq_puts(m, "\n"); + + return 0; +} + +static ssize_t osc_checksum_type_seq_write(struct file *file, + const char 
__user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + char kernbuf[10]; + int rc = -EINVAL; + int i; + + if (obd == NULL) + return 0; + + if (count > sizeof(kernbuf) - 1) + return -EINVAL; + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + + if (count > 0 && kernbuf[count - 1] == '\n') + kernbuf[count - 1] = '\0'; + else + kernbuf[count] = '\0'; + + for (i = 0; i < ARRAY_SIZE(cksum_name); i++) { + if (strcasecmp(kernbuf, cksum_name[i]) == 0) { + obd->u.cli.cl_preferred_cksum_type = BIT(i); + if (obd->u.cli.cl_supp_cksum_types & BIT(i)) { + obd->u.cli.cl_cksum_type = BIT(i); + rc = count; + } else { + rc = -ENOTSUPP; + } + break; + } + } + return rc; +} +LPROC_SEQ_FOPS(osc_checksum_type); + +static ssize_t resend_count_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return sprintf(buf, "%u\n", atomic_read(&obd->u.cli.cl_resends)); +} + +static ssize_t resend_count_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + + atomic_set(&obd->u.cli.cl_resends, val); + + return count; +} +LUSTRE_RW_ATTR(resend_count); + +static ssize_t checksum_dump_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%d\n", !!obd->u.cli.cl_checksum_dump); +} + +static ssize_t checksum_dump_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + obd->u.cli.cl_checksum_dump = val; + + return count; +} +LUSTRE_RW_ATTR(checksum_dump); + +static ssize_t destroys_in_flight_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return sprintf(buf, "%u\n", + atomic_read(&obd->u.cli.cl_destroy_in_flight)); +} +LUSTRE_RO_ATTR(destroys_in_flight); + +LPROC_SEQ_FOPS_RW_TYPE(osc, obd_max_pages_per_rpc); + +LUSTRE_RW_ATTR(short_io_bytes); + +#ifdef CONFIG_PROC_FS +static int osc_unstable_stats_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + struct client_obd *cli = &obd->u.cli; + long pages; + int mb; + + pages = atomic_long_read(&cli->cl_unstable_count); + mb = (pages * PAGE_SIZE) >> 20; + + seq_printf(m, "unstable_pages: %20ld\n" + "unstable_mb: %10d\n", + pages, mb); + return 0; +} +LPROC_SEQ_FOPS_RO(osc_unstable_stats); + +static ssize_t idle_timeout_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_import *imp; + int ret; + + with_imp_locked(obd, imp, ret) + ret = sprintf(buf, "%u\n", imp->imp_idle_timeout); + + return ret; +} + +static ssize_t idle_timeout_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_import *imp; + struct ptlrpc_request *req; + unsigned int idle_debug = 0; + unsigned int val; 
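+	/* numeric idle timeout in seconds, unless buffer is the keyword
+	 * "debug" or "nodebug" (which only tunes imp_idle_debug) */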
+ int rc; + + if (strncmp(buffer, "debug", 5) == 0) { + idle_debug = D_CONSOLE; + } else if (strncmp(buffer, "nodebug", 6) == 0) { + idle_debug = D_HA; + } else { + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + + if (val > CONNECTION_SWITCH_MAX) + return -ERANGE; + } + + with_imp_locked(obd, imp, rc) { + if (idle_debug) { + imp->imp_idle_debug = idle_debug; + } else { + if (!val) { + /* initiate the connection if it's in IDLE state */ + req = ptlrpc_request_alloc(imp, + &RQF_OST_STATFS); + if (req != NULL) + ptlrpc_req_finished(req); + } + imp->imp_idle_timeout = val; + } + } + + return count; +} +LUSTRE_RW_ATTR(idle_timeout); + +static ssize_t idle_connect_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_import *imp; + struct ptlrpc_request *req; + int rc; + + with_imp_locked(obd, imp, rc) { + /* to initiate the connection if it's in IDLE state */ + req = ptlrpc_request_alloc(imp, &RQF_OST_STATFS); + if (req) + ptlrpc_req_finished(req); + ptlrpc_pinger_force(imp); + } + + return rc ?: count; +} +LUSTRE_WO_ATTR(idle_connect); + +static ssize_t grant_shrink_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_import *imp; + ssize_t len; + + with_imp_locked(obd, imp, len) + len = scnprintf(buf, PAGE_SIZE, "%d\n", + !imp->imp_grant_shrink_disabled && + OCD_HAS_FLAG(&imp->imp_connect_data, + GRANT_SHRINK)); + + return len; +} + +static ssize_t grant_shrink_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_import *imp; + bool val; + int rc; + + if (obd == NULL) + return 0; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + with_imp_locked(obd, imp, rc) { + spin_lock(&imp->imp_lock); + imp->imp_grant_shrink_disabled = !val; + spin_unlock(&imp->imp_lock); + } + + return rc ?: count; +} +LUSTRE_RW_ATTR(grant_shrink); + +LPROC_SEQ_FOPS_RO_TYPE(osc, connect_flags); +LPROC_SEQ_FOPS_RO_TYPE(osc, server_uuid); +LPROC_SEQ_FOPS_RO_TYPE(osc, timeouts); +LPROC_SEQ_FOPS_RO_TYPE(osc, state); + +LPROC_SEQ_FOPS_RW_TYPE(osc, import); +LPROC_SEQ_FOPS_RW_TYPE(osc, pinger_recov); + +struct lprocfs_vars lprocfs_osc_obd_vars[] = { + { .name = "connect_flags", + .fops = &osc_connect_flags_fops }, + { .name = "ost_server_uuid", + .fops = &osc_server_uuid_fops }, + { .name = "max_pages_per_rpc", + .fops = &osc_obd_max_pages_per_rpc_fops }, + { .name = "osc_cached_mb", + .fops = &osc_cached_mb_fops }, + { .name = "cur_grant_bytes", + .fops = &osc_cur_grant_bytes_fops }, + { .name = "checksum_type", + .fops = &osc_checksum_type_fops }, + { .name = "timeouts", + .fops = &osc_timeouts_fops }, + { .name = "import", + .fops = &osc_import_fops }, + { .name = "state", + .fops = &osc_state_fops }, + { .name = "pinger_recov", + .fops = &osc_pinger_recov_fops }, + { .name = "unstable_stats", + .fops = &osc_unstable_stats_fops }, + { NULL } +}; + +static int osc_rpc_stats_seq_show(struct seq_file *seq, void *v) +{ + struct obd_device *obd = seq->private; + struct client_obd *cli = &obd->u.cli; + unsigned long read_tot = 0, write_tot = 0, read_cum, write_cum; + int i; + + spin_lock(&cli->cl_loi_list_lock); + + lprocfs_stats_header(seq, ktime_get_real(), cli->cl_stats_init, 25, + ":", true, ""); + seq_printf(seq, "read RPCs in 
flight: %d\n", + cli->cl_r_in_flight); + seq_printf(seq, "write RPCs in flight: %d\n", + cli->cl_w_in_flight); + seq_printf(seq, "pending write pages: %d\n", + atomic_read(&cli->cl_pending_w_pages)); + seq_printf(seq, "pending read pages: %d\n", + atomic_read(&cli->cl_pending_r_pages)); + + seq_printf(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_printf(seq, "pages per rpc rpcs %% cum %% |"); + seq_printf(seq, " rpcs %% cum %%\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_page_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_page_hist); + + read_cum = 0; + write_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_page_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_page_hist.oh_buckets[i]; + + read_cum += r; + write_cum += w; + seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", + 1 << i, r, pct(r, read_tot), + pct(read_cum, read_tot), w, + pct(w, write_tot), + pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + + seq_printf(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_printf(seq, "rpcs in flight rpcs %% cum %% |"); + seq_printf(seq, " rpcs %% cum %%\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_rpc_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_rpc_hist); + + read_cum = 0; + write_cum = 0; + for (i = 1; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_rpc_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_rpc_hist.oh_buckets[i]; + read_cum += r; + write_cum += w; + seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", + i, r, pct(r, read_tot), + pct(read_cum, read_tot), w, + pct(w, write_tot), + pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + + seq_printf(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_printf(seq, "offset rpcs %% cum %% |"); + seq_printf(seq, " rpcs %% cum %%\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_offset_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_offset_hist); + + read_cum = 0; + write_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_offset_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_offset_hist.oh_buckets[i]; + read_cum += r; + write_cum += w; + seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", + (i == 0) ? 
0 : 1 << (i - 1), + r, pct(r, read_tot), pct(read_cum, read_tot), + w, pct(w, write_tot), pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + + spin_unlock(&cli->cl_loi_list_lock); + + return 0; +} + +static ssize_t osc_rpc_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct obd_device *obd = seq->private; + struct client_obd *cli = &obd->u.cli; + + lprocfs_oh_clear(&cli->cl_read_rpc_hist); + lprocfs_oh_clear(&cli->cl_write_rpc_hist); + lprocfs_oh_clear(&cli->cl_read_page_hist); + lprocfs_oh_clear(&cli->cl_write_page_hist); + lprocfs_oh_clear(&cli->cl_read_offset_hist); + lprocfs_oh_clear(&cli->cl_write_offset_hist); + cli->cl_stats_init = ktime_get_real(); + + return len; +} +LPROC_SEQ_FOPS(osc_rpc_stats); + +static int osc_stats_seq_show(struct seq_file *seq, void *v) +{ + struct obd_device *obd = seq->private; + struct osc_stats *stats = &obd2osc_dev(obd)->osc_stats; + + lprocfs_stats_header(seq, ktime_get_real(), stats->os_init, 25, ":", + true, ""); + seq_printf(seq, "lockless_write_bytes\t\t%llu\n", + stats->os_lockless_writes); + seq_printf(seq, "lockless_read_bytes\t\t%llu\n", + stats->os_lockless_reads); + return 0; +} + +static ssize_t osc_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct obd_device *obd = seq->private; + struct osc_stats *stats = &obd2osc_dev(obd)->osc_stats; + + memset(stats, 0, sizeof(*stats)); + stats->os_init = ktime_get_real(); + + return len; +} + +LPROC_SEQ_FOPS(osc_stats); + +int lprocfs_osc_attach_seqstat(struct obd_device *obd) +{ + int rc; + + rc = lprocfs_seq_create(obd->obd_proc_entry, "osc_stats", 0644, + &osc_stats_fops, obd); + if (rc == 0) + rc = lprocfs_obd_seq_create(obd, "rpc_stats", 0644, + &osc_rpc_stats_fops, obd); + + return rc; +} +#endif /* CONFIG_PROC_FS */ + +static struct attribute *osc_attrs[] = { + &lustre_attr_active.attr, + &lustre_attr_checksums.attr, + &lustre_attr_checksum_dump.attr, + &lustre_attr_cur_dirty_bytes.attr, + &lustre_attr_cur_lost_grant_bytes.attr, + &lustre_attr_cur_dirty_grant_bytes.attr, + &lustre_attr_destroys_in_flight.attr, + &lustre_attr_grant_shrink_interval.attr, + &lustre_attr_max_dirty_mb.attr, + &lustre_attr_max_rpcs_in_flight.attr, + &lustre_attr_short_io_bytes.attr, + &lustre_attr_resend_count.attr, + &lustre_attr_ost_conn_uuid.attr, + &lustre_attr_conn_uuid.attr, + &lustre_attr_ping.attr, + &lustre_attr_idle_timeout.attr, + &lustre_attr_idle_connect.attr, + &lustre_attr_grant_shrink.attr, + NULL, +}; + +KOBJ_ATTRIBUTE_GROUPS(osc); /* creates osc_groups */ + +int osc_tunables_init(struct obd_device *obd) +{ + int rc; + + obd->obd_vars = lprocfs_osc_obd_vars; + obd->obd_ktype.default_groups = KOBJ_ATTR_GROUPS(osc); + rc = lprocfs_obd_setup(obd, false); + if (rc) + return rc; +#ifdef CONFIG_PROC_FS + /* If the basic OSC proc tree construction succeeded then + * lets do the rest. 
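+	 * The order matters: lprocfs_obd_setup() has already created the
+	 * obd proc directory the seq_file stats attach to, and any later
+	 * failure unwinds through the shared obd_cleanup label.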
+	 */
+	rc = lprocfs_osc_attach_seqstat(obd);
+	if (rc)
+		goto obd_cleanup;
+
+#endif /* CONFIG_PROC_FS */
+	rc = sptlrpc_lprocfs_cliobd_attach(obd);
+	if (rc)
+		goto obd_cleanup;
+
+	ptlrpc_lprocfs_register_obd(obd);
+obd_cleanup:
+	if (rc)
+		lprocfs_obd_cleanup(obd);
+	return rc;
+}
diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_cache.c b/drivers/staging/lustrefsx/lustre/osc/osc_cache.c
new file mode 100644
index 0000000000000..8b7737ede01ef
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/osc/osc_cache.c
@@ -0,0 +1,3303 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2017, Intel Corporation.
+ *
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * osc cache management.
+ *
+ * Author: Jinshan Xiong
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+#include <libcfs/libcfs.h>
+#include <lustre_osc.h>
+
+#include "osc_internal.h"
+
+static int extent_debug; /* set it to be true for more debug */
+
+static void osc_update_pending(struct osc_object *obj, int cmd, int delta);
+static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext,
+			   enum osc_extent_state state);
+static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli,
+			      struct osc_async_page *oap, int sent, int rc);
+static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap,
+			  int cmd);
+static int osc_refresh_count(const struct lu_env *env,
+			     struct osc_async_page *oap, int cmd);
+static int osc_io_unplug_async(const struct lu_env *env,
+			       struct client_obd *cli, struct osc_object *osc);
+static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages,
+			   unsigned int lost_grant, unsigned int dirty_grant);
+
+static void osc_extent_tree_dump0(int mask, struct osc_object *obj,
+				  const char *func, int line);
+#define osc_extent_tree_dump(mask, obj) \
+	osc_extent_tree_dump0(mask, obj, __func__, __LINE__)
+
+static void osc_unreserve_grant(struct client_obd *cli, unsigned int reserved,
+				unsigned int unused);
+
+/** \addtogroup osc
+ * @{
+ */
+
+/* ------------------ osc extent ------------------ */
+static inline char *ext_flags(struct osc_extent *ext, char *flags)
+{
+	char *buf = flags;
+	*buf++ = ext->oe_rw ?
'r' : 'w'; + if (!RB_EMPTY_NODE(&ext->oe_node)) + *buf++ = 'i'; + if (ext->oe_sync) + *buf++ = 'S'; + if (ext->oe_srvlock) + *buf++ = 's'; + if (ext->oe_hp) + *buf++ = 'h'; + if (ext->oe_urgent) + *buf++ = 'u'; + if (ext->oe_memalloc) + *buf++ = 'm'; + if (ext->oe_trunc_pending) + *buf++ = 't'; + if (ext->oe_fsync_wait) + *buf++ = 'Y'; + *buf = 0; + return flags; +} + +#define EXTSTR "[%lu -> %lu/%lu]" +#define EXTPARA(ext) (ext)->oe_start, (ext)->oe_end, (ext)->oe_max_end +static const char *const oes_strings[] = { + "inv", "active", "cache", "locking", "lockdone", "rpc", "trunc", NULL }; + +#define OSC_EXTENT_DUMP_WITH_LOC(file, func, line, mask, extent, fmt, ...) do {\ + static struct cfs_debug_limit_state cdls; \ + struct osc_extent *__ext = (extent); \ + char __buf[16]; \ + \ + __CDEBUG_WITH_LOC(file, func, line, mask, &cdls, \ + "extent %p@{" EXTSTR ", " \ + "[%d|%d|%c|%s|%s|%p], [%d|%d|%c|%c|%p|%u|%p]} " fmt, \ + /* ----- extent part 0 ----- */ \ + __ext, EXTPARA(__ext), \ + /* ----- part 1 ----- */ \ + kref_read(&__ext->oe_refc), \ + atomic_read(&__ext->oe_users), \ + list_empty_marker(&__ext->oe_link), \ + oes_strings[__ext->oe_state], ext_flags(__ext, __buf), \ + __ext->oe_obj, \ + /* ----- part 2 ----- */ \ + __ext->oe_grants, __ext->oe_nr_pages, \ + list_empty_marker(&__ext->oe_pages), \ + waitqueue_active(&__ext->oe_waitq) ? '+' : '-', \ + __ext->oe_dlmlock, __ext->oe_mppr, __ext->oe_owner, \ + /* ----- part 4 ----- */ \ + ## __VA_ARGS__); \ + if (mask == D_ERROR && __ext->oe_dlmlock != NULL) \ + LDLM_ERROR(__ext->oe_dlmlock, "extent: %p", __ext); \ + else \ + LDLM_DEBUG(__ext->oe_dlmlock, "extent: %p", __ext); \ +} while (0) + +#define OSC_EXTENT_DUMP(mask, ext, fmt, ...) \ + OSC_EXTENT_DUMP_WITH_LOC(__FILE__, __func__, __LINE__, \ + mask, ext, fmt, ## __VA_ARGS__) + +#undef EASSERTF +#define EASSERTF(expr, ext, fmt, args...) do { \ + if (!(expr)) { \ + OSC_EXTENT_DUMP(D_ERROR, (ext), fmt, ##args); \ + osc_extent_tree_dump(D_ERROR, (ext)->oe_obj); \ + LASSERT(expr); \ + } \ +} while (0) + +#undef EASSERT +#define EASSERT(expr, ext) EASSERTF(expr, ext, "\n") + +static inline struct osc_extent *rb_extent(struct rb_node *n) +{ + return rb_entry_safe(n, struct osc_extent, oe_node); +} + +static inline struct osc_extent *next_extent(struct osc_extent *ext) +{ + if (ext == NULL) + return NULL; + + LASSERT(!RB_EMPTY_NODE(&ext->oe_node)); + return rb_extent(rb_next(&ext->oe_node)); +} + +static inline struct osc_extent *prev_extent(struct osc_extent *ext) +{ + if (ext == NULL) + return NULL; + + LASSERT(!RB_EMPTY_NODE(&ext->oe_node)); + return rb_extent(rb_prev(&ext->oe_node)); +} + +static inline struct osc_extent *first_extent(struct osc_object *obj) +{ + return rb_extent(rb_first(&obj->oo_root)); +} + +/* object must be locked by caller. 
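+ * The odd-looking rc values below are distinct small integers rather
+ * than errnos, so a failing invariant can be pinpointed from the
+ * "sanity check failed" line this function dumps on error.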
*/ +static int osc_extent_sanity_check0(struct osc_extent *ext, + const char *func, const int line) +{ + struct osc_object *obj = ext->oe_obj; + struct osc_async_page *oap; + size_t page_count; + int rc = 0; + + assert_osc_object_is_locked(obj); + + if (ext->oe_state >= OES_STATE_MAX) + GOTO(out, rc = 10); + + if (kref_read(&ext->oe_refc) <= 0) + GOTO(out, rc = 20); + + if (kref_read(&ext->oe_refc) < atomic_read(&ext->oe_users)) + GOTO(out, rc = 30); + + switch (ext->oe_state) { + case OES_INV: + if (ext->oe_nr_pages > 0 || !list_empty(&ext->oe_pages)) + GOTO(out, rc = 35); + GOTO(out, rc = 0); + break; + case OES_ACTIVE: + if (atomic_read(&ext->oe_users) == 0) + GOTO(out, rc = 40); + if (ext->oe_hp) + GOTO(out, rc = 50); + if (ext->oe_fsync_wait && !ext->oe_urgent) + GOTO(out, rc = 55); + break; + case OES_CACHE: + if (ext->oe_grants == 0) + GOTO(out, rc = 60); + if (ext->oe_fsync_wait && !ext->oe_urgent && !ext->oe_hp) + GOTO(out, rc = 65); + fallthrough; + default: + if (atomic_read(&ext->oe_users) > 0) + GOTO(out, rc = 70); + } + + if (ext->oe_max_end < ext->oe_end || ext->oe_end < ext->oe_start) + GOTO(out, rc = 80); + + if (ext->oe_sync && ext->oe_grants > 0) + GOTO(out, rc = 90); + + if (ext->oe_dlmlock != NULL && + ext->oe_dlmlock->l_resource->lr_type == LDLM_EXTENT && + !ldlm_is_failed(ext->oe_dlmlock)) { + struct ldlm_extent *extent; + + extent = &ext->oe_dlmlock->l_policy_data.l_extent; + if (!(extent->start <= cl_offset(osc2cl(obj), ext->oe_start) && + extent->end >= cl_offset(osc2cl(obj), ext->oe_max_end))) + GOTO(out, rc = 100); + + if (!(ext->oe_dlmlock->l_granted_mode & (LCK_PW | LCK_GROUP))) + GOTO(out, rc = 102); + } + + if (ext->oe_nr_pages > ext->oe_mppr) + GOTO(out, rc = 105); + + /* Do not verify page list if extent is in RPC. This is because an + * in-RPC extent is supposed to be exclusively accessible w/o lock. */ + if (ext->oe_state > OES_CACHE) + GOTO(out, rc = 0); + + if (!extent_debug) + GOTO(out, rc = 0); + + page_count = 0; + list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { + pgoff_t index = osc_index(oap2osc(oap)); + ++page_count; + if (index > ext->oe_end || index < ext->oe_start) + GOTO(out, rc = 110); + } + if (page_count != ext->oe_nr_pages) + GOTO(out, rc = 120); + +out: + if (rc != 0) + OSC_EXTENT_DUMP_WITH_LOC(__FILE__, func, line, D_ERROR, ext, + "sanity check %p failed: rc = %d\n", + ext, rc); + return rc; +} + +#define sanity_check_nolock(ext) \ + osc_extent_sanity_check0(ext, __func__, __LINE__) + +#define sanity_check(ext) ({ \ + int __res; \ + osc_object_lock((ext)->oe_obj); \ + __res = sanity_check_nolock(ext); \ + osc_object_unlock((ext)->oe_obj); \ + __res; \ +}) + +static inline bool +overlapped(const struct osc_extent *ex1, const struct osc_extent *ex2) +{ + return !(ex1->oe_end < ex2->oe_start || ex2->oe_end < ex1->oe_start); +} + +/** + * sanity check - to make sure there is no overlapped extent in the tree. 
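+ * This is an O(n) walk over every extent of the object, so it is a
+ * no-op unless the static extent_debug flag is turned on.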
+ */ +static int osc_extent_is_overlapped(struct osc_object *obj, + struct osc_extent *ext) +{ + struct osc_extent *tmp; + + assert_osc_object_is_locked(obj); + + if (!extent_debug) + return 0; + + for (tmp = first_extent(obj); tmp != NULL; tmp = next_extent(tmp)) { + if (tmp == ext) + continue; + if (overlapped(tmp, ext)) + return 1; + } + return 0; +} + +static void osc_extent_state_set(struct osc_extent *ext, int state) +{ + assert_osc_object_is_locked(ext->oe_obj); + LASSERT(state >= OES_INV && state < OES_STATE_MAX); + + /* Never try to sanity check a state changing extent :-) */ + /* LASSERT(sanity_check_nolock(ext) == 0); */ + + /* TODO: validate the state machine */ + smp_store_release(&ext->oe_state, state); + wake_up(&ext->oe_waitq); +} + +static struct osc_extent *osc_extent_alloc(struct osc_object *obj) +{ + struct osc_extent *ext; + + OBD_SLAB_ALLOC_PTR_GFP(ext, osc_extent_kmem, GFP_NOFS); + if (ext == NULL) + return NULL; + + RB_CLEAR_NODE(&ext->oe_node); + ext->oe_obj = obj; + cl_object_get(osc2cl(obj)); + kref_init(&ext->oe_refc); + atomic_set(&ext->oe_users, 0); + INIT_LIST_HEAD(&ext->oe_link); + ext->oe_state = OES_INV; + INIT_LIST_HEAD(&ext->oe_pages); + init_waitqueue_head(&ext->oe_waitq); + ext->oe_dlmlock = NULL; + + return ext; +} + +static void osc_extent_free(struct kref *kref) +{ + struct osc_extent *ext = container_of(kref, struct osc_extent, + oe_refc); + + LASSERT(list_empty(&ext->oe_link)); + LASSERT(atomic_read(&ext->oe_users) == 0); + LASSERT(ext->oe_state == OES_INV); + LASSERT(RB_EMPTY_NODE(&ext->oe_node)); + + if (ext->oe_dlmlock) { + lu_ref_del(&ext->oe_dlmlock->l_reference, + "osc_extent", ext); + LDLM_LOCK_PUT(ext->oe_dlmlock); + ext->oe_dlmlock = NULL; + } +#if 0 + /* If/When cl_object_put drops the need for 'env', + * this code can be enabled, and matching code in + * osc_extent_put removed. + */ + cl_object_put(osc2cl(ext->oe_obj)); + + OBD_SLAB_FREE_PTR(ext, osc_extent_kmem); +#endif +} + +static struct osc_extent *osc_extent_get(struct osc_extent *ext) +{ + LASSERT(kref_read(&ext->oe_refc) >= 0); + kref_get(&ext->oe_refc); + return ext; +} + +static void osc_extent_put(const struct lu_env *env, struct osc_extent *ext) +{ + LASSERT(kref_read(&ext->oe_refc) > 0); + if (kref_put(&ext->oe_refc, osc_extent_free)) { + /* This should be in osc_extent_free(), but + * while we need to pass 'env' it cannot be. + */ + cl_object_put(env, osc2cl(ext->oe_obj)); + + OBD_SLAB_FREE_PTR(ext, osc_extent_kmem); + } +} + +/** + * osc_extent_put_trust() is a special version of osc_extent_put() when + * it's known that the caller is not the last user. This is to address the + * problem of lacking of lu_env ;-). + */ +static void osc_extent_put_trust(struct osc_extent *ext) +{ + LASSERT(kref_read(&ext->oe_refc) > 1); + assert_osc_object_is_locked(ext->oe_obj); + osc_extent_put(NULL, ext); +} + +/** + * Return the extent which includes pgoff @index, or return the greatest + * previous extent in the tree. + */ +static struct osc_extent *osc_extent_search(struct osc_object *obj, + pgoff_t index) +{ + struct rb_node *n = obj->oo_root.rb_node; + struct osc_extent *tmp, *p = NULL; + + assert_osc_object_is_locked(obj); + while (n != NULL) { + tmp = rb_extent(n); + if (index < tmp->oe_start) { + n = n->rb_left; + } else if (index > tmp->oe_end) { + p = rb_extent(n); + n = n->rb_right; + } else { + return tmp; + } + } + return p; +} + +/* + * Return the extent covering @index, otherwise return NULL. + * caller must have held object lock. 
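+ * Unlike osc_extent_search() above, which may fall back to the closest
+ * preceding extent, this only returns an exact cover: e.g. with extents
+ * [0, 3] and [8, 15] in the tree, search(5) yields [0, 3] while
+ * lookup(5) yields NULL.  The returned extent carries an extra
+ * reference from osc_extent_get().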
+ */ +static struct osc_extent *osc_extent_lookup(struct osc_object *obj, + pgoff_t index) +{ + struct osc_extent *ext; + + ext = osc_extent_search(obj, index); + if (ext != NULL && ext->oe_start <= index && index <= ext->oe_end) + return osc_extent_get(ext); + return NULL; +} + +/* caller must have held object lock. */ +static void osc_extent_insert(struct osc_object *obj, struct osc_extent *ext) +{ + struct rb_node **n = &obj->oo_root.rb_node; + struct rb_node *parent = NULL; + struct osc_extent *tmp; + + LASSERT(RB_EMPTY_NODE(&ext->oe_node)); + LASSERT(ext->oe_obj == obj); + assert_osc_object_is_locked(obj); + while (*n != NULL) { + tmp = rb_extent(*n); + parent = *n; + + if (ext->oe_end < tmp->oe_start) + n = &(*n)->rb_left; + else if (ext->oe_start > tmp->oe_end) + n = &(*n)->rb_right; + else + EASSERTF(0, tmp, EXTSTR"\n", EXTPARA(ext)); + } + rb_link_node(&ext->oe_node, parent, n); + rb_insert_color(&ext->oe_node, &obj->oo_root); + osc_extent_get(ext); +} + +/* caller must have held object lock. */ +static void osc_extent_erase(struct osc_extent *ext) +{ + struct osc_object *obj = ext->oe_obj; + assert_osc_object_is_locked(obj); + if (!RB_EMPTY_NODE(&ext->oe_node)) { + rb_erase(&ext->oe_node, &obj->oo_root); + RB_CLEAR_NODE(&ext->oe_node); + /* rbtree held a refcount */ + osc_extent_put_trust(ext); + } +} + +static struct osc_extent *osc_extent_hold(struct osc_extent *ext) +{ + struct osc_object *obj = ext->oe_obj; + + assert_osc_object_is_locked(obj); + LASSERT(ext->oe_state == OES_ACTIVE || ext->oe_state == OES_CACHE); + if (ext->oe_state == OES_CACHE) { + osc_extent_state_set(ext, OES_ACTIVE); + osc_update_pending(obj, OBD_BRW_WRITE, -ext->oe_nr_pages); + } + atomic_inc(&ext->oe_users); + list_del_init(&ext->oe_link); + return osc_extent_get(ext); +} + +static void __osc_extent_remove(struct osc_extent *ext) +{ + assert_osc_object_is_locked(ext->oe_obj); + LASSERT(list_empty(&ext->oe_pages)); + osc_extent_erase(ext); + list_del_init(&ext->oe_link); + osc_extent_state_set(ext, OES_INV); + OSC_EXTENT_DUMP(D_CACHE, ext, "destroyed.\n"); +} + +static void osc_extent_remove(struct osc_extent *ext) +{ + struct osc_object *obj = ext->oe_obj; + + osc_object_lock(obj); + __osc_extent_remove(ext); + osc_object_unlock(obj); +} + +/** + * This function is used to merge extents to get better performance. It checks + * if @cur and @victim are contiguous at block level. + */ +static int osc_extent_merge(const struct lu_env *env, struct osc_extent *cur, + struct osc_extent *victim) +{ + struct osc_object *obj = cur->oe_obj; + struct client_obd *cli = osc_cli(obj); + pgoff_t chunk_start; + pgoff_t chunk_end; + int ppc_bits; + + LASSERT(cur->oe_state == OES_CACHE); + assert_osc_object_is_locked(obj); + if (victim == NULL) + return -EINVAL; + + if (victim->oe_state != OES_INV && + (victim->oe_state != OES_CACHE || victim->oe_fsync_wait)) + return -EBUSY; + + if (cur->oe_max_end != victim->oe_max_end) + return -ERANGE; + + /* + * In the rare case max_pages_per_rpc (mppr) is changed, don't + * merge extents until after old ones have been sent, or the + * "extents are aligned to RPCs" checks are unhappy. 
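+	 *
+	 * Contiguity here is judged at chunk rather than page granularity:
+	 * two extents may merge even with a page-level hole between them,
+	 * as long as their chunk ranges are directly adjacent.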
+ */ + if (cur->oe_mppr != victim->oe_mppr) + return -ERANGE; + + LASSERT(cur->oe_dlmlock == victim->oe_dlmlock); + ppc_bits = osc_cli(obj)->cl_chunkbits - PAGE_SHIFT; + chunk_start = cur->oe_start >> ppc_bits; + chunk_end = cur->oe_end >> ppc_bits; + if (chunk_start != (victim->oe_end >> ppc_bits) + 1 && + chunk_end + 1 != victim->oe_start >> ppc_bits) + return -ERANGE; + + /* overall extent size should not exceed the max supported limit + * reported by the server */ + if (cur->oe_end - cur->oe_start + 1 + + victim->oe_end - victim->oe_start + 1 > cli->cl_max_extent_pages) + return -ERANGE; + + OSC_EXTENT_DUMP(D_CACHE, victim, "will be merged by %p.\n", cur); + + cur->oe_start = min(cur->oe_start, victim->oe_start); + cur->oe_end = max(cur->oe_end, victim->oe_end); + /* per-extent tax should be accounted only once for the whole extent */ + cur->oe_grants += victim->oe_grants - cli->cl_grant_extent_tax; + cur->oe_nr_pages += victim->oe_nr_pages; + /* only the following bits are needed to merge */ + cur->oe_urgent |= victim->oe_urgent; + cur->oe_memalloc |= victim->oe_memalloc; + list_splice_init(&victim->oe_pages, &cur->oe_pages); + victim->oe_nr_pages = 0; + + osc_extent_get(victim); + __osc_extent_remove(victim); + osc_extent_put(env, victim); + + OSC_EXTENT_DUMP(D_CACHE, cur, "after merging %p.\n", victim); + return 0; +} + +/** + * Drop user count of osc_extent, and unplug IO asynchronously. + */ +void osc_extent_release(const struct lu_env *env, struct osc_extent *ext) +{ + struct osc_object *obj = ext->oe_obj; + struct client_obd *cli = osc_cli(obj); + ENTRY; + + LASSERT(atomic_read(&ext->oe_users) > 0); + LASSERT(sanity_check(ext) == 0); + LASSERT(ext->oe_grants > 0); + + if (atomic_dec_and_lock(&ext->oe_users, &obj->oo_lock)) { + LASSERT(ext->oe_state == OES_ACTIVE); + if (ext->oe_trunc_pending) { + /* a truncate process is waiting for this extent. + * This may happen due to a race, check + * osc_cache_truncate_start(). */ + osc_extent_state_set(ext, OES_TRUNC); + ext->oe_trunc_pending = 0; + osc_object_unlock(obj); + } else { + int grant = 0; + + osc_extent_state_set(ext, OES_CACHE); + osc_update_pending(obj, OBD_BRW_WRITE, + ext->oe_nr_pages); + + /* try to merge the previous and next extent. */ + if (osc_extent_merge(env, ext, prev_extent(ext)) == 0) + grant += cli->cl_grant_extent_tax; + if (osc_extent_merge(env, ext, next_extent(ext)) == 0) + grant += cli->cl_grant_extent_tax; + + if (ext->oe_hp) + list_move_tail(&ext->oe_link, + &obj->oo_hp_exts); + else if (ext->oe_urgent) + list_move_tail(&ext->oe_link, + &obj->oo_urgent_exts); + else if (ext->oe_nr_pages == ext->oe_mppr) { + list_move_tail(&ext->oe_link, + &obj->oo_full_exts); + } + osc_object_unlock(obj); + if (grant > 0) + osc_unreserve_grant(cli, 0, grant); + } + + osc_io_unplug_async(env, cli, obj); + } + osc_extent_put(env, ext); + + RETURN_EXIT; +} + +/** + * Find or create an extent which includes @index, core function to manage + * extent tree. 
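+ * All sizing is chunk based (1 << cl_chunkbits bytes).  For example,
+ * with 64KiB chunks and 4KiB pages, ppc_bits is 4 and a request for
+ * page index 21 starts from the one-chunk extent [16, 31] (lock
+ * bounds permitting), which is then merged or grown toward the
+ * RPC-aligned oe_max_end.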
+ */ +static struct osc_extent *osc_extent_find(const struct lu_env *env, + struct osc_object *obj, pgoff_t index, + unsigned int *grants) +{ + struct client_obd *cli = osc_cli(obj); + struct osc_lock *olck; + struct cl_lock_descr *descr; + struct osc_extent *cur; + struct osc_extent *ext; + struct osc_extent *conflict = NULL; + struct osc_extent *found = NULL; + pgoff_t chunk; + pgoff_t max_end; + unsigned int max_pages; /* max_pages_per_rpc */ + unsigned int chunksize; + int ppc_bits; /* pages per chunk bits */ + pgoff_t chunk_mask; + int rc; + ENTRY; + + cur = osc_extent_alloc(obj); + if (cur == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + olck = osc_env_io(env)->oi_write_osclock; + LASSERTF(olck != NULL, "page %lu is not covered by lock\n", index); + LASSERT(olck->ols_state == OLS_GRANTED); + + descr = &olck->ols_cl.cls_lock->cll_descr; + LASSERT(descr->cld_mode >= CLM_WRITE); + + LASSERTF(cli->cl_chunkbits >= PAGE_SHIFT, + "chunkbits: %u\n", cli->cl_chunkbits); + ppc_bits = cli->cl_chunkbits - PAGE_SHIFT; + chunk_mask = ~((1 << ppc_bits) - 1); + chunksize = 1 << cli->cl_chunkbits; + chunk = index >> ppc_bits; + + /* align end to RPC edge. */ + max_pages = cli->cl_max_pages_per_rpc; + if ((max_pages & ~chunk_mask) != 0) { + CERROR("max_pages: %#x chunkbits: %u chunk_mask: %#lx\n", + max_pages, cli->cl_chunkbits, chunk_mask); + RETURN(ERR_PTR(-EINVAL)); + } + max_end = index - (index % max_pages) + max_pages - 1; + max_end = min_t(pgoff_t, max_end, descr->cld_end); + + /* initialize new extent by parameters so far */ + cur->oe_max_end = max_end; + cur->oe_start = index & chunk_mask; + cur->oe_end = ((index + ~chunk_mask + 1) & chunk_mask) - 1; + if (cur->oe_start < descr->cld_start) + cur->oe_start = descr->cld_start; + if (cur->oe_end > max_end) + cur->oe_end = max_end; + cur->oe_grants = chunksize + cli->cl_grant_extent_tax; + cur->oe_mppr = max_pages; + if (olck->ols_dlmlock != NULL) { + LASSERT(olck->ols_hold); + cur->oe_dlmlock = LDLM_LOCK_GET(olck->ols_dlmlock); + lu_ref_add(&olck->ols_dlmlock->l_reference, "osc_extent", cur); + } + + /* grants has been allocated by caller */ + LASSERTF(*grants >= chunksize + cli->cl_grant_extent_tax, + "%u/%u/%u.\n", *grants, chunksize, cli->cl_grant_extent_tax); + LASSERTF((max_end - cur->oe_start) < max_pages, EXTSTR"\n", + EXTPARA(cur)); + +restart: + osc_object_lock(obj); + ext = osc_extent_search(obj, cur->oe_start); + if (!ext) + ext = first_extent(obj); + for (; ext; ext = next_extent(ext)) { + pgoff_t ext_chk_start = ext->oe_start >> ppc_bits; + pgoff_t ext_chk_end = ext->oe_end >> ppc_bits; + + LASSERT(sanity_check_nolock(ext) == 0); + if (chunk > ext_chk_end + 1 || chunk < ext_chk_start) + break; + + /* if covering by different locks, no chance to match */ + if (olck->ols_dlmlock != ext->oe_dlmlock) { + EASSERTF(!overlapped(ext, cur), ext, + EXTSTR"\n", EXTPARA(cur)); + + continue; + } + + /* discontiguous chunks? */ + if (chunk + 1 < ext_chk_start) + continue; + + /* ok, from now on, ext and cur have these attrs: + * 1. covered by the same lock + * 2. contiguous at chunk level or overlapping. */ + + if (overlapped(ext, cur)) { + /* cur is the minimum unit, so overlapping means + * full contain. */ + EASSERTF((ext->oe_start <= cur->oe_start && + ext->oe_end >= cur->oe_end), + ext, EXTSTR"\n", EXTPARA(cur)); + + if (ext->oe_state > OES_CACHE || ext->oe_fsync_wait) { + /* for simplicity, we wait for this extent to + * finish before going forward. 
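+				 * The actual wait happens further down,
+				 * after osc_object_unlock(), and the whole
+				 * lookup then restarts from scratch.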
*/ + conflict = osc_extent_get(ext); + break; + } + + found = osc_extent_hold(ext); + break; + } + + /* non-overlapped extent */ + if (ext->oe_state != OES_CACHE || ext->oe_fsync_wait) + /* we can't do anything for a non OES_CACHE extent, or + * if there is someone waiting for this extent to be + * flushed, try next one. */ + continue; + + if (osc_extent_merge(env, ext, cur) == 0) { + LASSERT(*grants >= chunksize); + *grants -= chunksize; + + /* + * Try to merge with the next one too because we + * might have just filled in a gap. + */ + if (osc_extent_merge(env, ext, next_extent(ext)) == 0) + /* we can save extent tax from next extent */ + *grants += cli->cl_grant_extent_tax; + + found = osc_extent_hold(ext); + break; + } + } + + osc_extent_tree_dump(D_CACHE, obj); + if (found != NULL) { + LASSERT(conflict == NULL); + if (!IS_ERR(found)) { + LASSERT(found->oe_dlmlock == cur->oe_dlmlock); + OSC_EXTENT_DUMP(D_CACHE, found, + "found caching ext for %lu.\n", index); + } + } else if (conflict == NULL) { + /* create a new extent */ + EASSERT(osc_extent_is_overlapped(obj, cur) == 0, cur); + LASSERT(*grants >= cur->oe_grants); + *grants -= cur->oe_grants; + + cur->oe_state = OES_CACHE; + found = osc_extent_hold(cur); + osc_extent_insert(obj, cur); + OSC_EXTENT_DUMP(D_CACHE, cur, "add into tree %lu/%lu.\n", + index, descr->cld_end); + } + osc_object_unlock(obj); + + if (conflict != NULL) { + LASSERT(found == NULL); + + /* waiting for IO to finish. Please notice that it's impossible + * to be an OES_TRUNC extent. */ + rc = osc_extent_wait(env, conflict, OES_INV); + osc_extent_put(env, conflict); + conflict = NULL; + if (rc < 0) + GOTO(out, found = ERR_PTR(rc)); + + goto restart; + } + EXIT; + +out: + osc_extent_put(env, cur); + return found; +} + +/** + * Called when IO is finished to an extent. + */ +int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext, + int sent, int rc) +{ + struct client_obd *cli = osc_cli(ext->oe_obj); + struct osc_async_page *oap; + struct osc_async_page *tmp; + int nr_pages = ext->oe_nr_pages; + int lost_grant = 0; + int blocksize = cli->cl_import->imp_obd->obd_osfs.os_bsize ? : 4096; + loff_t last_off = 0; + int last_count = -1; + ENTRY; + + OSC_EXTENT_DUMP(D_CACHE, ext, "extent finished.\n"); + + ext->oe_rc = rc ?: ext->oe_nr_pages; + EASSERT(ergo(rc == 0, ext->oe_state == OES_RPC), ext); + + osc_lru_add_batch(cli, &ext->oe_pages); + list_for_each_entry_safe(oap, tmp, &ext->oe_pages, + oap_pending_item) { + list_del_init(&oap->oap_rpc_item); + list_del_init(&oap->oap_pending_item); + if (last_off <= oap->oap_obj_off) { + last_off = oap->oap_obj_off; + last_count = oap->oap_count; + } + + --ext->oe_nr_pages; + osc_ap_completion(env, cli, oap, sent, rc); + } + EASSERT(ext->oe_nr_pages == 0, ext); + + if (!sent) { + lost_grant = ext->oe_grants; + } else if (cli->cl_ocd_grant_param == 0 && + blocksize < PAGE_SIZE && + last_count != PAGE_SIZE) { + /* For short writes without OBD_CONNECT_GRANT support, we + * shouldn't count parts of pages that span a whole chunk on + * the OST side, or our accounting goes wrong. Should match + * the code in tgt_grant_check. 
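+	 * E.g. with 4KiB pages and a 512-byte OST blocksize, a final
+	 * fragment of 200 bytes at page offset 100 is charged as one
+	 * full 512-byte block, so PAGE_SIZE - 512 = 3584 bytes of that
+	 * page's reserved grant are reported back as lost.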
+ */ + int offset = last_off & ~PAGE_MASK; + int count = last_count + (offset & (blocksize - 1)); + int end = (offset + last_count) & (blocksize - 1); + if (end) + count += blocksize - end; + + lost_grant = PAGE_SIZE - count; + } + if (ext->oe_grants > 0) + osc_free_grant(cli, nr_pages, lost_grant, ext->oe_grants); + + osc_extent_remove(ext); + /* put the refcount for RPC */ + osc_extent_put(env, ext); + RETURN(0); +} + +/** + * Wait for the extent's state to become @state. + */ +static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext, + enum osc_extent_state state) +{ + struct osc_object *obj = ext->oe_obj; + int rc = 0; + ENTRY; + + osc_object_lock(obj); + LASSERT(sanity_check_nolock(ext) == 0); + /* `Kick' this extent only if the caller is waiting for it to be + * written out. */ + if (state == OES_INV && !ext->oe_urgent && !ext->oe_hp) { + if (ext->oe_state == OES_ACTIVE) { + ext->oe_urgent = 1; + } else if (ext->oe_state == OES_CACHE) { + ext->oe_urgent = 1; + osc_extent_hold(ext); + rc = 1; + } + } + osc_object_unlock(obj); + if (rc == 1) + osc_extent_release(env, ext); + + /* wait for the extent until its state becomes @state */ + rc = wait_event_idle_timeout(ext->oe_waitq, + smp_load_acquire(&ext->oe_state) == state, + cfs_time_seconds(600)); + if (rc == 0) { + OSC_EXTENT_DUMP(D_ERROR, ext, + "%s: wait ext to %u timedout, recovery in progress?\n", + cli_name(osc_cli(obj)), state); + + wait_event_idle(ext->oe_waitq, + smp_load_acquire(&ext->oe_state) == state); + } + if (ext->oe_rc < 0) + rc = ext->oe_rc; + else + rc = 0; + RETURN(rc); +} + +/** + * Discard pages with index greater than @size. If @ext is overlapped with + * @size, then partial truncate happens. + */ +static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index, + bool partial) +{ + struct lu_env *env; + struct cl_io *io; + struct osc_object *obj = ext->oe_obj; + struct client_obd *cli = osc_cli(obj); + struct osc_async_page *oap; + struct osc_async_page *tmp; + struct pagevec *pvec; + int pages_in_chunk = 0; + int ppc_bits = cli->cl_chunkbits - + PAGE_SHIFT; + __u64 trunc_chunk = trunc_index >> ppc_bits; + int grants = 0; + int nr_pages = 0; + int rc = 0; + __u16 refcheck; + ENTRY; + + LASSERT(sanity_check(ext) == 0); + LASSERT(ext->oe_state == OES_TRUNC); + LASSERT(!ext->oe_urgent); + + /* Request new lu_env. + * We can't use that env from osc_cache_truncate_start() because + * it's from lov_io_sub and not fully initialized. */ + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = osc_env_thread_io(env); + io->ci_obj = cl_object_top(osc2cl(obj)); + io->ci_ignore_layout = 1; + pvec = &osc_env_info(env)->oti_pagevec; + ll_pagevec_init(pvec, 0); + rc = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (rc < 0) + GOTO(out, rc); + + /* discard all pages with index greater than trunc_index */ + list_for_each_entry_safe(oap, tmp, &ext->oe_pages, + oap_pending_item) { + pgoff_t index = osc_index(oap2osc(oap)); + struct cl_page *page = oap2cl_page(oap); + + LASSERT(list_empty(&oap->oap_rpc_item)); + + /* only discard the pages with their index greater than + * trunc_index, and ... */ + if (index < trunc_index || + (index == trunc_index && partial)) { + /* accounting how many pages remaining in the chunk + * so that we can calculate grants correctly. 
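+			 * Grant can only be returned for chunks that
+			 * end up holding no pages; a partially
+			 * populated boundary chunk keeps its full grant.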
+			 */
+			if (index >> ppc_bits == trunc_chunk)
+				++pages_in_chunk;
+			continue;
+		}
+
+		list_del_init(&oap->oap_pending_item);
+
+		cl_page_get(page);
+		lu_ref_add(&page->cp_reference, "truncate", current);
+
+		if (cl_page_own(env, io, page) == 0) {
+			cl_page_discard(env, io, page);
+			cl_page_disown(env, io, page);
+		} else {
+			LASSERT(page->cp_state == CPS_FREEING);
+			LASSERT(0);
+		}
+
+		lu_ref_del(&page->cp_reference, "truncate", current);
+		cl_pagevec_put(env, page, pvec);
+
+		--ext->oe_nr_pages;
+		++nr_pages;
+	}
+	pagevec_release(pvec);
+
+	EASSERTF(ergo(ext->oe_start >= trunc_index + !!partial,
+		      ext->oe_nr_pages == 0),
+		 ext, "trunc_index %lu, partial %d\n", trunc_index, partial);
+
+	osc_object_lock(obj);
+	if (ext->oe_nr_pages == 0) {
+		LASSERT(pages_in_chunk == 0);
+		grants = ext->oe_grants;
+		ext->oe_grants = 0;
+	} else { /* calculate how many grants we can free */
+		int chunks = (ext->oe_end >> ppc_bits) - trunc_chunk;
+		pgoff_t last_index;
+
+
+		/* if there are no pages in this chunk, we can also free grants
+		 * for the last chunk */
+		if (pages_in_chunk == 0) {
+			/* if this is the 1st chunk and no pages in this chunk,
+			 * ext->oe_nr_pages must be zero, so we should be in
+			 * the other if-clause. */
+			LASSERT(trunc_chunk > 0);
+			--trunc_chunk;
+			++chunks;
+		}
+
+		/* this is what we can free from this extent */
+		grants = chunks << cli->cl_chunkbits;
+		ext->oe_grants -= grants;
+		last_index = ((trunc_chunk + 1) << ppc_bits) - 1;
+		ext->oe_end = min(last_index, ext->oe_max_end);
+		LASSERT(ext->oe_end >= ext->oe_start);
+		LASSERT(ext->oe_grants > 0);
+	}
+	osc_object_unlock(obj);
+
+	if (grants > 0 || nr_pages > 0)
+		osc_free_grant(cli, nr_pages, grants, grants);
+
+out:
+	cl_io_fini(env, io);
+	cl_env_put(env, &refcheck);
+	RETURN(rc);
+}
+
+/**
+ * This function is used to make the extent prepared for transfer.
+ * A race with flushing pages - ll_writepage() has to be handled cautiously.
+ */
+static int osc_extent_make_ready(const struct lu_env *env,
+				 struct osc_extent *ext)
+{
+	struct osc_async_page *oap;
+	struct osc_async_page *last = NULL;
+	struct osc_object *obj = ext->oe_obj;
+	unsigned int page_count = 0;
+	int rc;
+	ENTRY;
+
+	/* we're going to grab page lock, so object lock must not be taken. */
+	LASSERT(sanity_check(ext) == 0);
+	/* in locking state, any process should not touch this extent. */
+	EASSERT(ext->oe_state == OES_LOCKING, ext);
+	EASSERT(ext->oe_owner != NULL, ext);
+
+	OSC_EXTENT_DUMP(D_CACHE, ext, "make ready\n");
+
+	list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
+		++page_count;
+		if (last == NULL || last->oap_obj_off < oap->oap_obj_off)
+			last = oap;
+
+		/* checking ASYNC_READY is race safe */
+		if ((oap->oap_async_flags & ASYNC_READY) != 0)
+			continue;
+
+		rc = osc_make_ready(env, oap, OBD_BRW_WRITE);
+		switch (rc) {
+		case 0:
+			spin_lock(&oap->oap_lock);
+			oap->oap_async_flags |= ASYNC_READY;
+			spin_unlock(&oap->oap_lock);
+			break;
+		case -EALREADY:
+			LASSERT((oap->oap_async_flags & ASYNC_READY) != 0);
+			break;
+		default:
+			LASSERTF(0, "unknown return code: %d\n", rc);
+		}
+	}
+
+	LASSERT(page_count == ext->oe_nr_pages);
+	LASSERT(last != NULL);
+	/* the last page is the only one we need to refresh its count by
+	 * the size of file.
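+	 * (i.e. against the known minimum size, KMS, via
+	 * osc_refresh_count()); all other pages are interior and
+	 * therefore full, so the loop below just stamps them with
+	 * PAGE_SIZE - oap_page_off.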
+	 */
+	if (!(last->oap_async_flags & ASYNC_COUNT_STABLE)) {
+		int last_oap_count = osc_refresh_count(env, last, OBD_BRW_WRITE);
+		LASSERTF(last_oap_count > 0,
+			 "last_oap_count %d\n", last_oap_count);
+		LASSERT(last->oap_page_off + last_oap_count <= PAGE_SIZE);
+		last->oap_count = last_oap_count;
+		spin_lock(&last->oap_lock);
+		last->oap_async_flags |= ASYNC_COUNT_STABLE;
+		spin_unlock(&last->oap_lock);
+	}
+
+	/* for the rest of the pages, we don't need to call
+	 * osc_refresh_count() because it's known they are not the last page */
+	list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
+		if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE)) {
+			oap->oap_count = PAGE_SIZE - oap->oap_page_off;
+			spin_lock(&oap->oap_lock);
+			oap->oap_async_flags |= ASYNC_COUNT_STABLE;
+			spin_unlock(&oap->oap_lock);
+		}
+	}
+
+	osc_object_lock(obj);
+	osc_extent_state_set(ext, OES_RPC);
+	osc_object_unlock(obj);
+	/* get a refcount for RPC. */
+	osc_extent_get(ext);
+
+	RETURN(0);
+}
+
+/**
+ * Quick and simple version of osc_extent_find(). This function is frequently
+ * called to expand the extent for the same IO. To expand the extent, the
+ * page index must be in the same chunk as ext->oe_end or in the next chunk.
+ */
+static int osc_extent_expand(struct osc_extent *ext, pgoff_t index,
+			     unsigned int *grants)
+{
+	struct osc_object *obj = ext->oe_obj;
+	struct client_obd *cli = osc_cli(obj);
+	struct osc_extent *next;
+	int ppc_bits = cli->cl_chunkbits - PAGE_SHIFT;
+	pgoff_t chunk = index >> ppc_bits;
+	pgoff_t end_chunk;
+	pgoff_t end_index;
+	unsigned int chunksize = 1 << cli->cl_chunkbits;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(ext->oe_max_end >= index && ext->oe_start <= index);
+	osc_object_lock(obj);
+	LASSERT(sanity_check_nolock(ext) == 0);
+	end_chunk = ext->oe_end >> ppc_bits;
+	if (chunk > end_chunk + 1)
+		GOTO(out, rc = -ERANGE);
+
+	if (end_chunk >= chunk)
+		GOTO(out, rc = 0);
+
+	LASSERT(end_chunk + 1 == chunk);
+
+	/* try to expand this extent to cover @index */
+	end_index = min(ext->oe_max_end, ((chunk + 1) << ppc_bits) - 1);
+
+	/* don't go over the maximum extent size reported by server */
+	if (end_index - ext->oe_start + 1 > cli->cl_max_extent_pages)
+		GOTO(out, rc = -ERANGE);
+
+	next = next_extent(ext);
+	if (next != NULL && next->oe_start <= end_index)
+		/* complex mode - overlapped with the next extent,
+		 * this case will be handled by osc_extent_find() */
+		GOTO(out, rc = -EAGAIN);
+
+	ext->oe_end = end_index;
+	ext->oe_grants += chunksize;
+	LASSERT(*grants >= chunksize);
+	*grants -= chunksize;
+	EASSERTF(osc_extent_is_overlapped(obj, ext) == 0, ext,
+		 "overlapped after expanding for %lu.\n", index);
+	EXIT;
+
+out:
+	osc_object_unlock(obj);
+	RETURN(rc);
+}
+
+static void osc_extent_tree_dump0(int mask, struct osc_object *obj,
+				  const char *func, int line)
+{
+	struct osc_extent *ext;
+	int cnt;
+
+	if (!cfs_cdebug_show(mask, DEBUG_SUBSYSTEM))
+		return;
+
+	CDEBUG(mask, "Dump object %p extents at %s:%d, mppr: %u.\n",
+	       obj, func, line, osc_cli(obj)->cl_max_pages_per_rpc);
+
+	/* osc_object_lock(obj); */
+	cnt = 1;
+	for (ext = first_extent(obj); ext != NULL; ext = next_extent(ext))
+		OSC_EXTENT_DUMP(mask, ext, "in tree %d.\n", cnt++);
+
+	cnt = 1;
+	list_for_each_entry(ext, &obj->oo_hp_exts, oe_link)
+		OSC_EXTENT_DUMP(mask, ext, "hp %d.\n", cnt++);
+
+	cnt = 1;
+	list_for_each_entry(ext, &obj->oo_urgent_exts, oe_link)
+		OSC_EXTENT_DUMP(mask, ext, "urgent %d.\n", cnt++);
+
+	cnt = 1;
+	list_for_each_entry(ext, &obj->oo_reading_exts, oe_link)
+		OSC_EXTENT_DUMP(mask, ext, "reading %d.\n", cnt++);
+ /* osc_object_unlock(obj); */ +} + +/* ------------------ osc extent end ------------------ */ + +static inline int osc_is_ready(struct osc_object *osc) +{ + return !list_empty(&osc->oo_ready_item) || + !list_empty(&osc->oo_hp_ready_item); +} + +#define OSC_IO_DEBUG(OSC, STR, args...) \ + CDEBUG(D_CACHE, "obj %p ready %d|%c|%c wr %d|%c|%c rd %d|%c " STR, \ + (OSC), osc_is_ready(OSC), \ + list_empty_marker(&(OSC)->oo_hp_ready_item), \ + list_empty_marker(&(OSC)->oo_ready_item), \ + atomic_read(&(OSC)->oo_nr_writes), \ + list_empty_marker(&(OSC)->oo_hp_exts), \ + list_empty_marker(&(OSC)->oo_urgent_exts), \ + atomic_read(&(OSC)->oo_nr_reads), \ + list_empty_marker(&(OSC)->oo_reading_exts), \ + ##args) + +static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap, + int cmd) +{ + struct osc_page *opg = oap2osc_page(oap); + struct cl_page *page = oap2cl_page(oap); + int result; + + LASSERT(cmd == OBD_BRW_WRITE); /* no cached reads */ + + ENTRY; + result = cl_page_make_ready(env, page, CRT_WRITE); + if (result == 0) + opg->ops_submit_time = ktime_get(); + RETURN(result); +} + +static int osc_refresh_count(const struct lu_env *env, + struct osc_async_page *oap, int cmd) +{ + struct osc_page *opg = oap2osc_page(oap); + pgoff_t index = osc_index(oap2osc(oap)); + struct cl_object *obj; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + int result; + loff_t kms; + + /* readpage queues with _COUNT_STABLE, shouldn't get here. */ + LASSERT(!(cmd & OBD_BRW_READ)); + LASSERT(opg != NULL); + obj = opg->ops_cl.cpl_obj; + + cl_object_attr_lock(obj); + result = cl_object_attr_get(env, obj, attr); + cl_object_attr_unlock(obj); + if (result < 0) + return result; + kms = attr->cat_kms; + if (cl_offset(obj, index) >= kms) + /* catch race with truncate */ + return 0; + else if (cl_offset(obj, index + 1) > kms) + /* catch sub-page write at end of file */ + return kms & ~PAGE_MASK; + else + return PAGE_SIZE; +} + +static int osc_completion(const struct lu_env *env, struct osc_async_page *oap, + int cmd, int rc) +{ + struct osc_page *opg = oap2osc_page(oap); + struct cl_page *page = oap2cl_page(oap); + enum cl_req_type crt; + int srvlock; + + ENTRY; + + cmd &= ~OBD_BRW_NOQUOTA; + LASSERTF(equi(page->cp_state == CPS_PAGEIN, cmd == OBD_BRW_READ), + "cp_state:%u, cmd:%d\n", page->cp_state, cmd); + LASSERTF(equi(page->cp_state == CPS_PAGEOUT, cmd == OBD_BRW_WRITE), + "cp_state:%u, cmd:%d\n", page->cp_state, cmd); + LASSERT(opg->ops_transfer_pinned); + + crt = cmd == OBD_BRW_READ ? CRT_READ : CRT_WRITE; + /* Clear opg->ops_transfer_pinned before VM lock is released. */ + opg->ops_transfer_pinned = 0; + + opg->ops_submit_time = ktime_set(0, 0); + srvlock = oap->oap_brw_flags & OBD_BRW_SRVLOCK; + + /* statistic */ + if (rc == 0 && srvlock) { + struct lu_device *ld = opg->ops_cl.cpl_obj->co_lu.lo_dev; + struct osc_stats *stats = &lu2osc_dev(ld)->osc_stats; + size_t bytes = oap->oap_count; + + if (crt == CRT_READ) + stats->os_lockless_reads += bytes; + else + stats->os_lockless_writes += bytes; + } + + /* + * This has to be the last operation with the page, as locks are + * released in cl_page_completion() and nothing except for the + * reference counter protects page from concurrent reclaim. + */ + lu_ref_del(&page->cp_reference, "transfer", page); + + cl_page_completion(env, page, crt, rc); + cl_page_put(env, page); + + RETURN(0); +} + +#define OSC_DUMP_GRANT(mask, cli, fmt, args...) 
do { \ + struct client_obd *__tmp = (cli); \ + CDEBUG(mask, "%s: grant { dirty: %ld/%ld dirty_pages: %ld/%lu " \ + "dropped: %ld avail: %ld, dirty_grant: %ld, " \ + "reserved: %ld, flight: %d } lru {in list: %ld, " \ + "left: %ld, waiters: %d }" fmt "\n", \ + cli_name(__tmp), \ + __tmp->cl_dirty_pages, __tmp->cl_dirty_max_pages, \ + atomic_long_read(&obd_dirty_pages), obd_max_dirty_pages, \ + __tmp->cl_lost_grant, __tmp->cl_avail_grant, \ + __tmp->cl_dirty_grant, \ + __tmp->cl_reserved_grant, __tmp->cl_w_in_flight, \ + atomic_long_read(&__tmp->cl_lru_in_list), \ + atomic_long_read(&__tmp->cl_lru_busy), \ + atomic_read(&__tmp->cl_lru_shrinkers), ##args); \ +} while (0) + +/* caller must hold loi_list_lock */ +static void osc_consume_write_grant(struct client_obd *cli, + struct brw_page *pga) +{ + assert_spin_locked(&cli->cl_loi_list_lock); + LASSERT(!(pga->flag & OBD_BRW_FROM_GRANT)); + cli->cl_dirty_pages++; + pga->flag |= OBD_BRW_FROM_GRANT; + CDEBUG(D_CACHE, "using %lu grant credits for brw %p page %p\n", + PAGE_SIZE, pga, pga->pg); +} + +/* the companion to osc_consume_write_grant, called when a brw has completed. + * must be called with the loi lock held. */ +static void osc_release_write_grant(struct client_obd *cli, + struct brw_page *pga) +{ + ENTRY; + + assert_spin_locked(&cli->cl_loi_list_lock); + if (!(pga->flag & OBD_BRW_FROM_GRANT)) { + EXIT; + return; + } + + pga->flag &= ~OBD_BRW_FROM_GRANT; + atomic_long_dec(&obd_dirty_pages); + cli->cl_dirty_pages--; + EXIT; +} + +/** + * To avoid sleeping with object lock held, it's good for us allocate enough + * grants before entering into critical section. + * + * client_obd_list_lock held by caller + */ +static int osc_reserve_grant(struct client_obd *cli, unsigned int bytes) +{ + int rc = -EDQUOT; + + if (cli->cl_avail_grant >= bytes) { + cli->cl_avail_grant -= bytes; + cli->cl_reserved_grant += bytes; + rc = 0; + } + return rc; +} + +static void __osc_unreserve_grant(struct client_obd *cli, + unsigned int reserved, unsigned int unused) +{ + /* it's quite normal for us to get more grant than reserved. + * Thinking about a case that two extents merged by adding a new + * chunk, we can save one extent tax. If extent tax is greater than + * one chunk, we can save more grant by adding a new chunk */ + cli->cl_reserved_grant -= reserved; + if (unused > reserved) { + cli->cl_avail_grant += reserved; + cli->cl_lost_grant += unused - reserved; + cli->cl_dirty_grant -= unused - reserved; + } else { + cli->cl_avail_grant += unused; + cli->cl_dirty_grant += reserved - unused; + } +} + +static void osc_unreserve_grant_nolock(struct client_obd *cli, + unsigned int reserved, + unsigned int unused) +{ + __osc_unreserve_grant(cli, reserved, unused); + if (unused > 0) + osc_wake_cache_waiters(cli); +} + +static void osc_unreserve_grant(struct client_obd *cli, + unsigned int reserved, unsigned int unused) +{ + spin_lock(&cli->cl_loi_list_lock); + osc_unreserve_grant_nolock(cli, reserved, unused); + spin_unlock(&cli->cl_loi_list_lock); +} + +/** + * Free grant after IO is finished or canceled. + * + * @lost_grant is used to remember how many grants we have allocated but not + * used, we should return these grants to OST. There're two cases where grants + * can be lost: + * 1. truncate; + * 2. Without OBD_CONNECT_GRANT support and blocksize at OST is less than + * PAGE_SIZE and a partial page was written. In this case OST may use less + * chunks to serve this partial write. OSTs don't actually know the page + * size on the client side. 
so clients have to calculate lost grant by the + * blocksize on the OST. See tgt_grant_check() for details. + */ +static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages, + unsigned int lost_grant, unsigned int dirty_grant) +{ + unsigned long grant; + + grant = (1 << cli->cl_chunkbits) + cli->cl_grant_extent_tax; + + spin_lock(&cli->cl_loi_list_lock); + atomic_long_sub(nr_pages, &obd_dirty_pages); + cli->cl_dirty_pages -= nr_pages; + cli->cl_lost_grant += lost_grant; + cli->cl_dirty_grant -= dirty_grant; + if (cli->cl_avail_grant < grant && cli->cl_lost_grant >= grant) { + /* borrow some grant from truncate to avoid the case that + * truncate uses up all avail grant */ + cli->cl_lost_grant -= grant; + cli->cl_avail_grant += grant; + } + osc_wake_cache_waiters(cli); + spin_unlock(&cli->cl_loi_list_lock); + CDEBUG(D_CACHE, "lost %u grant: %lu avail: %lu dirty: %lu/%lu\n", + lost_grant, cli->cl_lost_grant, + cli->cl_avail_grant, cli->cl_dirty_pages << PAGE_SHIFT, + cli->cl_dirty_grant); +} + +/** + * The companion to osc_enter_cache(), called when @oap is no longer part of + * the dirty accounting due to error. + */ +static void osc_exit_cache(struct client_obd *cli, struct osc_async_page *oap) +{ + spin_lock(&cli->cl_loi_list_lock); + osc_release_write_grant(cli, &oap->oap_brw_page); + spin_unlock(&cli->cl_loi_list_lock); +} + +/** + * Non-blocking version of osc_enter_cache() that consumes grant only when it + * is available. + */ +static int osc_enter_cache_try(struct client_obd *cli, + struct osc_async_page *oap, + int bytes) +{ + int rc; + + OSC_DUMP_GRANT(D_CACHE, cli, "need:%d\n", bytes); + + rc = osc_reserve_grant(cli, bytes); + if (rc < 0) + return 0; + + if (cli->cl_dirty_pages < cli->cl_dirty_max_pages) { + if (atomic_long_add_return(1, &obd_dirty_pages) <= + obd_max_dirty_pages) { + osc_consume_write_grant(cli, &oap->oap_brw_page); + rc = 1; + goto out; + } else + atomic_long_dec(&obd_dirty_pages); + } + __osc_unreserve_grant(cli, bytes, bytes); + +out: + return rc; +} + +/* Following two inlines exist to pass code fragments + * to wait_event_idle_exclusive_timeout_cmd(). Passing + * code fragments as macro args can look confusing, so + * we provide inlines to encapsulate them. + */ +static inline void cli_unlock_and_unplug(const struct lu_env *env, + struct client_obd *cli, + struct osc_async_page *oap) +{ + spin_unlock(&cli->cl_loi_list_lock); + osc_io_unplug_async(env, cli, NULL); + CDEBUG(D_CACHE, + "%s: sleeping for cache space for %p\n", + cli_name(cli), oap); +} + +static inline void cli_lock_after_unplug(struct client_obd *cli) +{ + spin_lock(&cli->cl_loi_list_lock); +} +/** + * The main entry to reserve dirty page accounting. Usually the grant reserved + * in this function will be freed in bulk in osc_free_grant() unless it fails + * to add osc cache, in that case, it will be freed in osc_exit_cache(). + * + * The process will be put into sleep if it's already run out of grant. + */ +static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, + struct osc_async_page *oap, int bytes) +{ + struct osc_object *osc = oap->oap_obj; + struct lov_oinfo *loi = osc->oo_oinfo; + int rc = -EDQUOT; + int remain; + bool entered = false; + /* We cannot wait for a long time here since we are holding ldlm lock + * across the actual IO. If no requests complete fast (e.g. due to + * overloaded OST that takes a long time to process everything, we'd + * get evicted if we wait for a normal obd_timeout or some such. 
+ * So we try to wait half the time it would take the client to be + * evicted by server which is half obd_timeout when AT is off + * or at least ldlm_enqueue_min with AT on. + * See LU-13131 */ + unsigned long timeout = cfs_time_seconds(AT_OFF ? obd_timeout / 2 : + ldlm_enqueue_min / 2); + + ENTRY; + + OSC_DUMP_GRANT(D_CACHE, cli, "need:%d\n", bytes); + + spin_lock(&cli->cl_loi_list_lock); + + /* force the caller to try sync io. this can jump the list + * of queued writes and create a discontiguous rpc stream */ + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_NO_GRANT) || + cli->cl_dirty_max_pages == 0 || + cli->cl_ar.ar_force_sync || loi->loi_ar.ar_force_sync) { + OSC_DUMP_GRANT(D_CACHE, cli, "forced sync i/o\n"); + GOTO(out, rc = -EDQUOT); + } + + /* + * We can wait here for two reasons: too many dirty pages in cache, or + * run out of grants. In both cases we should write dirty pages out. + * Adding a cache waiter will trigger urgent write-out no matter what + * RPC size will be. + * The exiting condition (other than success) is no avail grants + * and no dirty pages caching, that really means there is no space + * on the OST. + */ + remain = wait_event_idle_exclusive_timeout_cmd( + cli->cl_cache_waiters, + (entered = osc_enter_cache_try(cli, oap, bytes)) || + (cli->cl_dirty_pages == 0 && cli->cl_w_in_flight == 0), + timeout, + cli_unlock_and_unplug(env, cli, oap), + cli_lock_after_unplug(cli)); + + if (entered) { + if (remain == timeout) + OSC_DUMP_GRANT(D_CACHE, cli, "granted from cache\n"); + else + OSC_DUMP_GRANT(D_CACHE, cli, + "finally got grant space\n"); + wake_up(&cli->cl_cache_waiters); + rc = 0; + } else if (remain == 0) { + OSC_DUMP_GRANT(D_CACHE, cli, + "timeout, fall back to sync i/o\n"); + osc_extent_tree_dump(D_CACHE, osc); + /* fall back to synchronous I/O */ + } else { + OSC_DUMP_GRANT(D_CACHE, cli, + "no grant space, fall back to sync i/o\n"); + wake_up_all(&cli->cl_cache_waiters); + } + EXIT; +out: + spin_unlock(&cli->cl_loi_list_lock); + RETURN(rc); +} + +static int osc_max_rpc_in_flight(struct client_obd *cli, struct osc_object *osc) +{ + int hprpc = !!list_empty(&osc->oo_hp_exts); + return rpcs_in_flight(cli) >= cli->cl_max_rpcs_in_flight + hprpc; +} + +/* This maintains the lists of pending pages to read/write for a given object + * (lop). This is used by osc_check_rpcs->osc_next_obj() and osc_list_maint() + * to quickly find objects that are ready to send an RPC. */ +static int osc_makes_rpc(struct client_obd *cli, struct osc_object *osc, + int cmd) +{ + int invalid_import = 0; + ENTRY; + + /* if we have an invalid import we want to drain the queued pages + * by forcing them through rpcs that immediately fail and complete + * the pages. recovery relies on this to empty the queued pages + * before canceling the locks and evicting down the llite pages */ + if ((cli->cl_import == NULL || cli->cl_import->imp_invalid)) + invalid_import = 1; + + if (cmd & OBD_BRW_WRITE) { + if (atomic_read(&osc->oo_nr_writes) == 0) + RETURN(0); + if (invalid_import) { + CDEBUG(D_CACHE, "invalid import forcing RPC\n"); + RETURN(1); + } + if (!list_empty(&osc->oo_hp_exts)) { + CDEBUG(D_CACHE, "high prio request forcing RPC\n"); + RETURN(1); + } + if (!list_empty(&osc->oo_urgent_exts)) { + CDEBUG(D_CACHE, "urgent request forcing RPC\n"); + RETURN(1); + } + /* trigger a write rpc stream as long as there are dirtiers + * waiting for space. as they're waiting, they're not going to + * create more pages to coalesce with what's waiting.. 
+ */
+		if (waitqueue_active(&cli->cl_cache_waiters)) {
+			CDEBUG(D_CACHE, "cache waiters forcing RPC\n");
+			RETURN(1);
+		}
+		if (!list_empty(&osc->oo_full_exts)) {
+			CDEBUG(D_CACHE, "full extent ready, make an RPC\n");
+			RETURN(1);
+		}
+	} else {
+		if (atomic_read(&osc->oo_nr_reads) == 0)
+			RETURN(0);
+		if (invalid_import) {
+			CDEBUG(D_CACHE, "invalid import forcing RPC\n");
+			RETURN(1);
+		}
+		/* all reads are urgent. */
+		if (!list_empty(&osc->oo_reading_exts))
+			RETURN(1);
+	}
+
+	RETURN(0);
+}
+
+static void osc_update_pending(struct osc_object *obj, int cmd, int delta)
+{
+	struct client_obd *cli = osc_cli(obj);
+	if (cmd & OBD_BRW_WRITE) {
+		atomic_add(delta, &obj->oo_nr_writes);
+		atomic_add(delta, &cli->cl_pending_w_pages);
+		LASSERT(atomic_read(&obj->oo_nr_writes) >= 0);
+	} else {
+		atomic_add(delta, &obj->oo_nr_reads);
+		atomic_add(delta, &cli->cl_pending_r_pages);
+		LASSERT(atomic_read(&obj->oo_nr_reads) >= 0);
+	}
+	OSC_IO_DEBUG(obj, "update pending cmd %d delta %d.\n", cmd, delta);
+}
+
+static int osc_makes_hprpc(struct osc_object *obj)
+{
+	return !list_empty(&obj->oo_hp_exts);
+}
+
+static void on_list(struct list_head *item, struct list_head *list,
+		    int should_be_on)
+{
+	if (list_empty(item) && should_be_on)
+		list_add_tail(item, list);
+	else if (!list_empty(item) && !should_be_on)
+		list_del_init(item);
+}
+
+/* maintain the osc's cli list membership invariants so that osc_send_oap_rpc
+ * can find pages to build into rpcs quickly */
+static int __osc_list_maint(struct client_obd *cli, struct osc_object *osc)
+{
+	if (osc_makes_hprpc(osc)) {
+		/* HP rpc */
+		on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list, 0);
+		on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 1);
+	} else {
+		on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 0);
+		on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list,
+			osc_makes_rpc(cli, osc, OBD_BRW_WRITE) ||
+			osc_makes_rpc(cli, osc, OBD_BRW_READ));
+	}
+
+	on_list(&osc->oo_write_item, &cli->cl_loi_write_list,
+		atomic_read(&osc->oo_nr_writes) > 0);
+
+	on_list(&osc->oo_read_item, &cli->cl_loi_read_list,
+		atomic_read(&osc->oo_nr_reads) > 0);
+
+	return osc_is_ready(osc);
+}
+
+static int osc_list_maint(struct client_obd *cli, struct osc_object *osc)
+{
+	int is_ready;
+
+	spin_lock(&cli->cl_loi_list_lock);
+	is_ready = __osc_list_maint(cli, osc);
+	spin_unlock(&cli->cl_loi_list_lock);
+
+	return is_ready;
+}
+
+/* this is trying to propagate async writeback errors back up to the
+ * application. As an async write fails we record the error code for later if
+ * the app does an fsync. As long as errors persist we force future rpcs to be
+ * sync so that the app can get a sync error and break the cycle of queueing
+ * pages for which writeback will fail.
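+ *
+ * A hypothetical walk-through (illustrative xid value, not from a real
+ * trace): an async write with xid 42 completes with -ENOSPC.
+ * osc_process_ar() latches ar_rc = -ENOSPC, sets ar_force_sync and
+ * samples ar_min_xid. Subsequent writes on the object are forced
+ * synchronous, so the application actually sees the error, until a
+ * write with xid >= ar_min_xid completes cleanly and ar_force_sync is
+ * cleared again.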
*/ +static void osc_process_ar(struct osc_async_rc *ar, __u64 xid, + int rc) +{ + if (rc) { + if (!ar->ar_rc) + ar->ar_rc = rc; + + ar->ar_force_sync = 1; + ar->ar_min_xid = ptlrpc_sample_next_xid(); + return; + + } + + if (ar->ar_force_sync && (xid >= ar->ar_min_xid)) + ar->ar_force_sync = 0; +} + +/* this must be called holding the loi list lock to give coverage to exit_cache, + * async_flag maintenance, and oap_request */ +static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli, + struct osc_async_page *oap, int sent, int rc) +{ + struct osc_object *osc = oap->oap_obj; + struct lov_oinfo *loi = osc->oo_oinfo; + __u64 xid = 0; + + ENTRY; + if (oap->oap_request != NULL) { + xid = ptlrpc_req_xid(oap->oap_request); + ptlrpc_req_finished(oap->oap_request); + oap->oap_request = NULL; + } + + /* As the transfer for this page is being done, clear the flags */ + spin_lock(&oap->oap_lock); + oap->oap_async_flags = 0; + spin_unlock(&oap->oap_lock); + + if (oap->oap_cmd & OBD_BRW_WRITE && xid > 0) { + spin_lock(&cli->cl_loi_list_lock); + osc_process_ar(&cli->cl_ar, xid, rc); + osc_process_ar(&loi->loi_ar, xid, rc); + spin_unlock(&cli->cl_loi_list_lock); + } + + rc = osc_completion(env, oap, oap->oap_cmd, rc); + if (rc) + CERROR("completion on oap %p obj %p returns %d.\n", + oap, osc, rc); + + EXIT; +} + +struct extent_rpc_data { + struct list_head *erd_rpc_list; + unsigned int erd_page_count; + unsigned int erd_max_pages; + unsigned int erd_max_chunks; + unsigned int erd_max_extents; +}; + +static inline unsigned osc_extent_chunks(const struct osc_extent *ext) +{ + struct client_obd *cli = osc_cli(ext->oe_obj); + unsigned ppc_bits = cli->cl_chunkbits - PAGE_SHIFT; + + return (ext->oe_end >> ppc_bits) - (ext->oe_start >> ppc_bits) + 1; +} + +static inline bool +can_merge(const struct osc_extent *ext, const struct osc_extent *in_rpc) +{ + if (ext->oe_no_merge || in_rpc->oe_no_merge) + return false; + + if (ext->oe_srvlock != in_rpc->oe_srvlock) + return false; + + if (ext->oe_ndelay != in_rpc->oe_ndelay) + return false; + + if (!ext->oe_grants != !in_rpc->oe_grants) + return false; + + if (ext->oe_dio != in_rpc->oe_dio) + return false; + + /* It's possible to have overlap on DIO */ + if (in_rpc->oe_dio && overlapped(ext, in_rpc)) + return false; + + if (ext->oe_is_rdma_only != in_rpc->oe_is_rdma_only) + return false; + + return true; +} + +/** + * Try to add extent to one RPC. 
We need to think about the following things: + * - # of pages must not be over max_pages_per_rpc + * - extent must be compatible with previous ones + */ +static int try_to_add_extent_for_io(struct client_obd *cli, + struct osc_extent *ext, + struct extent_rpc_data *data) +{ + struct osc_extent *tmp; + unsigned int chunk_count; + ENTRY; + + EASSERT((ext->oe_state == OES_CACHE || ext->oe_state == OES_LOCK_DONE), + ext); + OSC_EXTENT_DUMP(D_CACHE, ext, "trying to add this extent\n"); + + if (data->erd_max_extents == 0) + RETURN(0); + + chunk_count = osc_extent_chunks(ext); + EASSERTF(data->erd_page_count != 0 || + chunk_count <= data->erd_max_chunks, ext, + "The first extent to be fit in a RPC contains %u chunks, " + "which is over the limit %u.\n", chunk_count, + data->erd_max_chunks); + if (chunk_count > data->erd_max_chunks) + RETURN(0); + + data->erd_max_pages = max(ext->oe_mppr, data->erd_max_pages); + EASSERTF(data->erd_page_count != 0 || + ext->oe_nr_pages <= data->erd_max_pages, ext, + "The first extent to be fit in a RPC contains %u pages, " + "which is over the limit %u.\n", ext->oe_nr_pages, + data->erd_max_pages); + if (data->erd_page_count + ext->oe_nr_pages > data->erd_max_pages) + RETURN(0); + + list_for_each_entry(tmp, data->erd_rpc_list, oe_link) { + EASSERT(tmp->oe_owner == current, tmp); + + if (!can_merge(ext, tmp)) + RETURN(0); + } + + data->erd_max_extents--; + data->erd_max_chunks -= chunk_count; + data->erd_page_count += ext->oe_nr_pages; + list_move_tail(&ext->oe_link, data->erd_rpc_list); + ext->oe_owner = current; + RETURN(1); +} + +/** + * In order to prevent multiple ptlrpcd from breaking contiguous extents, + * get_write_extent() takes all appropriate extents in atomic. + * + * The following policy is used to collect extents for IO: + * 1. Add as many HP extents as possible; + * 2. Add the first urgent extent in urgent extent list and take it out of + * urgent list; + * 3. Add subsequent extents of this urgent extent; + * 4. If urgent list is not empty, goto 2; + * 5. Traverse the extent tree from the 1st extent; + * 6. Above steps exit if there is no space in this RPC. + */ +static unsigned int get_write_extents(struct osc_object *obj, + struct list_head *rpclist) +{ + struct client_obd *cli = osc_cli(obj); + struct osc_extent *ext; + struct extent_rpc_data data = { + .erd_rpc_list = rpclist, + .erd_page_count = 0, + .erd_max_pages = cli->cl_max_pages_per_rpc, + .erd_max_chunks = osc_max_write_chunks(cli), + .erd_max_extents = 256, + }; + + assert_osc_object_is_locked(obj); + while ((ext = list_first_entry_or_null(&obj->oo_hp_exts, + struct osc_extent, + oe_link)) != NULL) { + if (!try_to_add_extent_for_io(cli, ext, &data)) + return data.erd_page_count; + EASSERT(ext->oe_nr_pages <= data.erd_max_pages, ext); + } + if (data.erd_page_count == data.erd_max_pages) + return data.erd_page_count; + + while ((ext = list_first_entry_or_null(&obj->oo_urgent_exts, + struct osc_extent, + oe_link)) != NULL) { + if (!try_to_add_extent_for_io(cli, ext, &data)) + return data.erd_page_count; + } + if (data.erd_page_count == data.erd_max_pages) + return data.erd_page_count; + + /* One key difference between full extents and other extents: full + * extents can usually only be added if the rpclist was empty, so if we + * can't add one, we continue on to trying to add normal extents. This + * is so we don't miss adding extra extents to an RPC containing high + * priority or urgent extents. 
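+	 *
+	 * (Illustrative walk-through, made-up numbers: with erd_max_extents
+	 * = 256 and an empty HP list, the urgent loop above may drain a
+	 * whole chain of urgent extents first; full extents are then tried,
+	 * and when the first one cannot merge or fit we break out and fall
+	 * through to the extent-tree scan below instead of returning early.)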
+ */
+	while ((ext = list_first_entry_or_null(&obj->oo_full_exts,
+					       struct osc_extent,
+					       oe_link)) != NULL) {
+		if (!try_to_add_extent_for_io(cli, ext, &data))
+			break;
+	}
+	if (data.erd_page_count == data.erd_max_pages)
+		return data.erd_page_count;
+
+	for (ext = first_extent(obj);
+	     ext;
+	     ext = next_extent(ext)) {
+		if ((ext->oe_state != OES_CACHE) ||
+		    /* this extent may be already in current rpclist */
+		    (!list_empty(&ext->oe_link) && ext->oe_owner))
+			continue;
+
+		if (!try_to_add_extent_for_io(cli, ext, &data))
+			return data.erd_page_count;
+	}
+	return data.erd_page_count;
+}
+
+static int
+osc_send_write_rpc(const struct lu_env *env, struct client_obd *cli,
+		   struct osc_object *osc)
+__must_hold(osc)
+{
+	LIST_HEAD(rpclist);
+	struct osc_extent *ext;
+	struct osc_extent *tmp;
+	struct osc_extent *first = NULL;
+	unsigned int page_count = 0;
+	int srvlock = 0;
+	int rc = 0;
+	ENTRY;
+
+	assert_osc_object_is_locked(osc);
+
+	page_count = get_write_extents(osc, &rpclist);
+	LASSERT(equi(page_count == 0, list_empty(&rpclist)));
+
+	if (list_empty(&rpclist))
+		RETURN(0);
+
+	osc_update_pending(osc, OBD_BRW_WRITE, -page_count);
+
+	list_for_each_entry(ext, &rpclist, oe_link) {
+		LASSERT(ext->oe_state == OES_CACHE ||
+			ext->oe_state == OES_LOCK_DONE);
+		if (ext->oe_state == OES_CACHE)
+			osc_extent_state_set(ext, OES_LOCKING);
+		else
+			osc_extent_state_set(ext, OES_RPC);
+	}
+
+	/* we're going to grab page lock, so release object lock because
+	 * lock order is page lock -> object lock. */
+	osc_object_unlock(osc);
+
+	list_for_each_entry_safe(ext, tmp, &rpclist, oe_link) {
+		if (ext->oe_state == OES_LOCKING) {
+			rc = osc_extent_make_ready(env, ext);
+			if (unlikely(rc < 0)) {
+				list_del_init(&ext->oe_link);
+				osc_extent_finish(env, ext, 0, rc);
+				continue;
+			}
+		}
+		if (first == NULL) {
+			first = ext;
+			srvlock = ext->oe_srvlock;
+		} else {
+			LASSERT(srvlock == ext->oe_srvlock);
+		}
+	}
+
+	if (!list_empty(&rpclist)) {
+		LASSERT(page_count > 0);
+		rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_WRITE);
+		LASSERT(list_empty(&rpclist));
+	}
+
+	osc_object_lock(osc);
+	RETURN(rc);
+}
+
+/**
+ * prepare pages for ASYNC io and put pages in send queue.
+ *
+ * \param cmd OBD_BRW_* macros
+ * \param lop pending pages
+ *
+ * \return zero if no page added to send queue.
+ * \return 1 if pages successfully added to send queue.
+ * \return negative on errors.
+ */ +static int +osc_send_read_rpc(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc) +__must_hold(osc) +{ + struct osc_extent *ext; + struct osc_extent *next; + LIST_HEAD(rpclist); + struct extent_rpc_data data = { + .erd_rpc_list = &rpclist, + .erd_page_count = 0, + .erd_max_pages = cli->cl_max_pages_per_rpc, + .erd_max_chunks = UINT_MAX, + .erd_max_extents = UINT_MAX, + }; + int rc = 0; + ENTRY; + + assert_osc_object_is_locked(osc); + list_for_each_entry_safe(ext, next, &osc->oo_reading_exts, oe_link) { + EASSERT(ext->oe_state == OES_LOCK_DONE, ext); + if (!try_to_add_extent_for_io(cli, ext, &data)) + break; + osc_extent_state_set(ext, OES_RPC); + EASSERT(ext->oe_nr_pages <= data.erd_max_pages, ext); + } + LASSERT(data.erd_page_count <= data.erd_max_pages); + + osc_update_pending(osc, OBD_BRW_READ, -data.erd_page_count); + + if (!list_empty(&rpclist)) { + osc_object_unlock(osc); + + rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_READ); + LASSERT(list_empty(&rpclist)); + + osc_object_lock(osc); + } + RETURN(rc); +} + +#define list_to_obj(list, item) ({ \ + struct list_head *__tmp = (list)->next; \ + list_del_init(__tmp); \ + list_entry(__tmp, struct osc_object, oo_##item); \ +}) + +/* This is called by osc_check_rpcs() to find which objects have pages that + * we could be sending. These lists are maintained by osc_makes_rpc(). */ +static struct osc_object *osc_next_obj(struct client_obd *cli) +{ + ENTRY; + + /* First return objects that have blocked locks so that they + * will be flushed quickly and other clients can get the lock, + * then objects which have pages ready to be stuffed into RPCs */ + if (!list_empty(&cli->cl_loi_hp_ready_list)) + RETURN(list_to_obj(&cli->cl_loi_hp_ready_list, hp_ready_item)); + if (!list_empty(&cli->cl_loi_ready_list)) + RETURN(list_to_obj(&cli->cl_loi_ready_list, ready_item)); + + /* then if we have cache waiters, return all objects with queued + * writes. 
This is especially important when many small files
+ * have filled up the cache and not been fired into rpcs because
+ * they don't pass the nr_pending/object threshold
+ */
+	if (waitqueue_active(&cli->cl_cache_waiters) &&
+	    !list_empty(&cli->cl_loi_write_list))
+		RETURN(list_to_obj(&cli->cl_loi_write_list, write_item));
+
+	/* then return all queued objects when we have an invalid import
+	 * so that they get flushed */
+	if (cli->cl_import == NULL || cli->cl_import->imp_invalid) {
+		if (!list_empty(&cli->cl_loi_write_list))
+			RETURN(list_to_obj(&cli->cl_loi_write_list,
+					   write_item));
+		if (!list_empty(&cli->cl_loi_read_list))
+			RETURN(list_to_obj(&cli->cl_loi_read_list,
+					   read_item));
+	}
+	RETURN(NULL);
+}
+
+/* called with the loi list lock held */
+static void osc_check_rpcs(const struct lu_env *env, struct client_obd *cli)
+__must_hold(&cli->cl_loi_list_lock)
+{
+	struct osc_object *osc;
+	int rc = 0;
+	ENTRY;
+
+	while ((osc = osc_next_obj(cli)) != NULL) {
+		struct cl_object *obj = osc2cl(osc);
+		struct lu_ref_link link;
+
+		OSC_IO_DEBUG(osc, "%lu in flight\n", rpcs_in_flight(cli));
+
+		/* even if we have reached our max in flight RPCs, we still
+		 * allow all high-priority RPCs through to prevent their
+		 * starvation and to avoid the server evicting us for not
+		 * writing out pages in a timely manner (LU-13131) */
+		if (osc_max_rpc_in_flight(cli, osc) &&
+		    list_empty(&osc->oo_hp_exts)) {
+			__osc_list_maint(cli, osc);
+			break;
+		}
+
+		cl_object_get(obj);
+		spin_unlock(&cli->cl_loi_list_lock);
+		lu_object_ref_add_at(&obj->co_lu, &link, "check", current);
+
+		/* attempt some read/write balancing by alternating between
+		 * reads and writes in an object. The makes_rpc checks here
+		 * would be redundant if we were getting read/write work items
+		 * instead of objects. We don't want send_oap_rpc to drain a
+		 * partial read pending queue when we're given this object to
+		 * do io on writes while there are cache waiters */
+		osc_object_lock(osc);
+		if (osc_makes_rpc(cli, osc, OBD_BRW_WRITE)) {
+			rc = osc_send_write_rpc(env, cli, osc);
+			if (rc < 0) {
+				CERROR("Write request failed with %d\n", rc);
+
+				/* osc_send_write_rpc failed, mostly because of
+				 * memory pressure.
+				 *
+				 * We can't break out here, because if:
+				 * - a page was submitted by osc_io_submit, so
+				 *   page locked;
+				 * - no request in flight
+				 * - no subsequent request
+				 * the system will be in live-lock state,
+				 * because there is no chance to call
+				 * osc_io_unplug() and osc_check_rpcs() any
+				 * more. pdflush can't help in this case,
+				 * because it might be blocked at grabbing
+				 * the page lock as we mentioned.
+				 *
+				 * Anyway, continue to drain pages.
*/ + /* break; */ + } + } + if (osc_makes_rpc(cli, osc, OBD_BRW_READ)) { + rc = osc_send_read_rpc(env, cli, osc); + if (rc < 0) + CERROR("Read request failed with %d\n", rc); + } + osc_object_unlock(osc); + + osc_list_maint(cli, osc); + lu_object_ref_del_at(&obj->co_lu, &link, "check", current); + cl_object_put(env, obj); + + spin_lock(&cli->cl_loi_list_lock); + } +} + +int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc, int async) +{ + int rc = 0; + + if (osc != NULL && osc_list_maint(cli, osc) == 0) + return 0; + + if (!async) { + spin_lock(&cli->cl_loi_list_lock); + osc_check_rpcs(env, cli); + spin_unlock(&cli->cl_loi_list_lock); + } else { + CDEBUG(D_CACHE, "Queue writeback work for client %p.\n", cli); + LASSERT(cli->cl_writeback_work != NULL); + rc = ptlrpcd_queue_work(cli->cl_writeback_work); + } + return rc; +} +EXPORT_SYMBOL(osc_io_unplug0); + +int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, + struct cl_page *page, loff_t offset) +{ + struct obd_export *exp = osc_export(osc); + struct osc_async_page *oap = &ops->ops_oap; + struct page *vmpage = page->cp_vmpage; + ENTRY; + + if (!page) + return cfs_size_round(sizeof(*oap)); + + oap->oap_magic = OAP_MAGIC; + oap->oap_cli = &exp->exp_obd->u.cli; + oap->oap_obj = osc; + + oap->oap_page = vmpage; + oap->oap_obj_off = offset; + LASSERT(!(offset & ~PAGE_MASK)); + + /* Count of transient (direct i/o) pages is always stable by the time + * they're submitted. Setting this here lets us avoid calling + * cl_page_clip later to set this. + */ + if (page->cp_type == CPT_TRANSIENT) + oap->oap_async_flags |= ASYNC_COUNT_STABLE|ASYNC_URGENT| + ASYNC_READY; + + INIT_LIST_HEAD(&oap->oap_pending_item); + INIT_LIST_HEAD(&oap->oap_rpc_item); + + spin_lock_init(&oap->oap_lock); + CDEBUG(D_INFO, "oap %p vmpage %p obj off %llu\n", + oap, vmpage, oap->oap_obj_off); + RETURN(0); +} +EXPORT_SYMBOL(osc_prep_async_page); + +int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops, cl_commit_cbt cb) +{ + struct osc_io *oio = osc_env_io(env); + struct osc_extent *ext = NULL; + struct osc_async_page *oap = &ops->ops_oap; + struct client_obd *cli = oap->oap_cli; + struct osc_object *osc = oap->oap_obj; + struct pagevec *pvec = &osc_env_info(env)->oti_pagevec; + pgoff_t index; + unsigned int tmp; + unsigned int grants = 0; + u32 brw_flags = OBD_BRW_ASYNC; + int cmd = OBD_BRW_WRITE; + int need_release = 0; + int rc = 0; + ENTRY; + + if (oap->oap_magic != OAP_MAGIC) + RETURN(-EINVAL); + + if (cli->cl_import == NULL || cli->cl_import->imp_invalid) + RETURN(-EIO); + + if (!list_empty(&oap->oap_pending_item) || + !list_empty(&oap->oap_rpc_item)) + RETURN(-EBUSY); + + /* Set the OBD_BRW_SRVLOCK before the page is queued. */ + brw_flags |= ops->ops_srvlock ? 
OBD_BRW_SRVLOCK : 0;
+	if (io->ci_noquota) {
+		brw_flags |= OBD_BRW_NOQUOTA;
+		cmd |= OBD_BRW_NOQUOTA;
+	}
+
+	if (oio->oi_cap_sys_resource) {
+		brw_flags |= OBD_BRW_SYS_RESOURCE;
+		cmd |= OBD_BRW_SYS_RESOURCE;
+	}
+
+	/* check if the file's owner/group is over quota */
+	/* do not check for root without root squash, because in this case
+	 * we should bypass quota
+	 */
+	if ((!oio->oi_cap_sys_resource ||
+	     cli->cl_root_squash) &&
+	    !io->ci_noquota) {
+		struct cl_object *obj;
+		struct cl_attr *attr;
+		unsigned int qid[LL_MAXQUOTAS];
+
+		obj = cl_object_top(&osc->oo_cl);
+		attr = &osc_env_info(env)->oti_attr;
+
+		cl_object_attr_lock(obj);
+		rc = cl_object_attr_get(env, obj, attr);
+		cl_object_attr_unlock(obj);
+
+		qid[USRQUOTA] = attr->cat_uid;
+		qid[GRPQUOTA] = attr->cat_gid;
+		qid[PRJQUOTA] = attr->cat_projid;
+		if (rc == 0 && osc_quota_chkdq(cli, qid) == -EDQUOT)
+			rc = -EDQUOT;
+		if (rc)
+			RETURN(rc);
+	}
+
+	oap->oap_cmd = cmd;
+	oap->oap_page_off = ops->ops_from;
+	oap->oap_count = ops->ops_to - ops->ops_from + 1;
+	/* No need to hold a lock here,
+	 * since this page is not in any list yet. */
+	oap->oap_async_flags = 0;
+	oap->oap_brw_flags = brw_flags;
+
+	OSC_IO_DEBUG(osc, "oap %p page %p added for cmd %d\n",
+		     oap, oap->oap_page, oap->oap_cmd & OBD_BRW_RWMASK);
+
+	index = osc_index(oap2osc(oap));
+
+	/* Add this page into extent by the following steps:
+	 * 1. if there exists an active extent for this IO, mostly this page
+	 *    can be added to the active extent and sometimes we need to
+	 *    expand extent to accommodate this page;
+	 * 2. otherwise, a new extent will be allocated. */
+
+	ext = oio->oi_active;
+	if (ext != NULL && ext->oe_start <= index && ext->oe_max_end >= index) {
+		/* one chunk plus extent overhead must be enough to write this
+		 * page */
+		grants = (1 << cli->cl_chunkbits) + cli->cl_grant_extent_tax;
+		if (ext->oe_end >= index)
+			grants = 0;
+
+		/* it doesn't need any grant to dirty this page */
+		spin_lock(&cli->cl_loi_list_lock);
+		rc = osc_enter_cache_try(cli, oap, grants);
+		if (rc == 0) { /* try failed */
+			grants = 0;
+			need_release = 1;
+		} else if (ext->oe_end < index) {
+			tmp = grants;
+			/* try to expand this extent */
+			rc = osc_extent_expand(ext, index, &tmp);
+			if (rc < 0) {
+				need_release = 1;
+				/* don't free reserved grant */
+			} else {
+				OSC_EXTENT_DUMP(D_CACHE, ext,
+						"expanded for %lu.\n", index);
+				osc_unreserve_grant_nolock(cli, grants, tmp);
+				grants = 0;
+			}
+		}
+		spin_unlock(&cli->cl_loi_list_lock);
+		rc = 0;
+	} else if (ext != NULL) {
+		/* index is located outside of active extent */
+		need_release = 1;
+	}
+	if (need_release) {
+		osc_extent_release(env, ext);
+		oio->oi_active = NULL;
+		ext = NULL;
+	}
+
+	if (ext == NULL) {
+		tmp = (1 << cli->cl_chunkbits) + cli->cl_grant_extent_tax;
+
+		/* try to find new extent to cover this page */
+		LASSERT(oio->oi_active == NULL);
+		/* we may have allocated grant for this page if we failed
+		 * to expand the previous active extent. */
+		LASSERT(ergo(grants > 0, grants >= tmp));
+
+		rc = 0;
+
+		/* We must not hold a page lock while we do osc_enter_cache()
+		 * or osc_extent_find(), so we must mark dirty & unlock
+		 * any pages in the write commit pagevec.
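+		 *
+		 * (Sketch with illustrative sizes: with 4 KiB pages and
+		 * 64 KiB chunks, i.e. cl_chunkbits = 16, a brand-new extent
+		 * needs tmp = 64 KiB + cl_grant_extent_tax of grant before
+		 * the page can be cached, while dirtying a page that the
+		 * active extent already covers needs no grant at all.)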
*/ + if (pagevec_count(pvec)) { + cb(env, io, pvec); + pagevec_reinit(pvec); + } + + if (grants == 0) { + rc = osc_enter_cache(env, cli, oap, tmp); + if (rc == 0) + grants = tmp; + } + + tmp = grants; + if (rc == 0) { + ext = osc_extent_find(env, osc, index, &tmp); + if (IS_ERR(ext)) { + LASSERT(tmp == grants); + osc_exit_cache(cli, oap); + rc = PTR_ERR(ext); + ext = NULL; + } else { + oio->oi_active = ext; + } + } + if (grants > 0) + osc_unreserve_grant(cli, grants, tmp); + } + + LASSERT(ergo(rc == 0, ext != NULL)); + if (ext != NULL) { + EASSERTF(ext->oe_end >= index && ext->oe_start <= index, + ext, "index = %lu.\n", index); + LASSERT((oap->oap_brw_flags & OBD_BRW_FROM_GRANT) != 0); + + osc_object_lock(osc); + if (ext->oe_nr_pages == 0) + ext->oe_srvlock = ops->ops_srvlock; + else + LASSERT(ext->oe_srvlock == ops->ops_srvlock); + ++ext->oe_nr_pages; + list_add_tail(&oap->oap_pending_item, &ext->oe_pages); + osc_object_unlock(osc); + + if (!ext->oe_layout_version) + ext->oe_layout_version = io->ci_layout_version; + } + + RETURN(rc); +} + +int osc_teardown_async_page(const struct lu_env *env, + struct osc_object *obj, struct osc_page *ops) +{ + struct osc_async_page *oap = &ops->ops_oap; + int rc = 0; + ENTRY; + + LASSERT(oap->oap_magic == OAP_MAGIC); + + CDEBUG(D_INFO, "teardown oap %p page %p at index %lu.\n", + oap, ops, osc_index(oap2osc(oap))); + + if (!list_empty(&oap->oap_rpc_item)) { + CDEBUG(D_CACHE, "oap %p is not in cache.\n", oap); + rc = -EBUSY; + } else if (!list_empty(&oap->oap_pending_item)) { + struct osc_extent *ext = NULL; + + osc_object_lock(obj); + ext = osc_extent_lookup(obj, osc_index(oap2osc(oap))); + osc_object_unlock(obj); + /* only truncated pages are allowed to be taken out. + * See osc_extent_truncate() and osc_cache_truncate_start() + * for details. */ + if (ext != NULL && ext->oe_state != OES_TRUNC) { + OSC_EXTENT_DUMP(D_ERROR, ext, "trunc at %lu.\n", + osc_index(oap2osc(oap))); + rc = -EBUSY; + } + if (ext != NULL) + osc_extent_put(env, ext); + } + RETURN(rc); +} + +/** + * This is called when a page is picked up by kernel to write out. + * + * We should find out the corresponding extent and add the whole extent + * into urgent list. The extent may be being truncated or used, handle it + * carefully. + */ +int osc_flush_async_page(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops) +{ + struct osc_extent *ext = NULL; + struct osc_object *obj = cl2osc(ops->ops_cl.cpl_obj); + struct cl_page *cp = ops->ops_cl.cpl_page; + pgoff_t index = osc_index(ops); + struct osc_async_page *oap = &ops->ops_oap; + bool unplug = false; + int rc = 0; + ENTRY; + + osc_object_lock(obj); + ext = osc_extent_lookup(obj, index); + if (ext == NULL) { + osc_extent_tree_dump(D_ERROR, obj); + LASSERTF(0, "page index %lu is NOT covered.\n", index); + } + + switch (ext->oe_state) { + case OES_RPC: + case OES_LOCK_DONE: + CL_PAGE_DEBUG(D_ERROR, env, cp, "flush an in-rpc page?\n"); + LASSERT(0); + break; + case OES_LOCKING: + /* If we know this extent is being written out, we should abort + * so that the writer can make this page ready. Otherwise, there + * exists a deadlock problem because other process can wait for + * page writeback bit holding page lock; and meanwhile in + * vvp_page_make_ready(), we need to grab page lock before + * really sending the RPC. */ + case OES_TRUNC: + /* race with truncate, page will be redirtied */ + case OES_ACTIVE: + /* The extent is active so we need to abort and let the caller + * re-dirty the page. 
If we continued on here, and we were the + * one making the extent active, we could deadlock waiting for + * the page writeback to clear but it won't because the extent + * is active and won't be written out. */ + GOTO(out, rc = -EAGAIN); + default: + break; + } + + rc = cl_page_prep(env, io, cp, CRT_WRITE); + if (rc) + GOTO(out, rc); + + spin_lock(&oap->oap_lock); + oap->oap_async_flags |= ASYNC_READY|ASYNC_URGENT; + spin_unlock(&oap->oap_lock); + + if (current->flags & PF_MEMALLOC) + ext->oe_memalloc = 1; + + ext->oe_urgent = 1; + if (ext->oe_state == OES_CACHE) { + OSC_EXTENT_DUMP(D_CACHE, ext, + "flush page %p make it urgent.\n", oap); + if (list_empty(&ext->oe_link)) + list_add_tail(&ext->oe_link, &obj->oo_urgent_exts); + unplug = true; + } + rc = 0; + EXIT; + +out: + osc_object_unlock(obj); + osc_extent_put(env, ext); + if (unplug) + osc_io_unplug_async(env, osc_cli(obj), obj); + return rc; +} + +int osc_queue_sync_pages(const struct lu_env *env, struct cl_io *io, + struct osc_object *obj, struct list_head *list, + int brw_flags) +{ + struct client_obd *cli = osc_cli(obj); + struct osc_extent *ext; + struct osc_async_page *oap; + int page_count = 0; + int mppr = cli->cl_max_pages_per_rpc; + bool can_merge = true; + pgoff_t start = CL_PAGE_EOF; + pgoff_t end = 0; + ENTRY; + + list_for_each_entry(oap, list, oap_pending_item) { + struct osc_page *opg = oap2osc_page(oap); + pgoff_t index = osc_index(opg); + + if (index > end) + end = index; + if (index < start) + start = index; + ++page_count; + mppr <<= (page_count > mppr); + + if (unlikely(opg->ops_from > 0 || + opg->ops_to < PAGE_SIZE - 1)) + can_merge = false; + } + + ext = osc_extent_alloc(obj); + if (ext == NULL) { + struct osc_async_page *tmp; + + list_for_each_entry_safe(oap, tmp, list, oap_pending_item) { + list_del_init(&oap->oap_pending_item); + osc_ap_completion(env, cli, oap, 0, -ENOMEM); + } + RETURN(-ENOMEM); + } + + ext->oe_rw = !!(brw_flags & OBD_BRW_READ); + ext->oe_sync = 1; + ext->oe_no_merge = !can_merge; + ext->oe_urgent = 1; + ext->oe_start = start; + ext->oe_end = ext->oe_max_end = end; + ext->oe_obj = obj; + ext->oe_srvlock = !!(brw_flags & OBD_BRW_SRVLOCK); + ext->oe_ndelay = !!(brw_flags & OBD_BRW_NDELAY); + ext->oe_dio = !!(brw_flags & OBD_BRW_NOCACHE); + if (ext->oe_dio && !ext->oe_rw) { /* direct io write */ + int grants; + int ppc; + + ppc = 1 << (cli->cl_chunkbits - PAGE_SHIFT); + grants = cli->cl_grant_extent_tax; + grants += (1 << cli->cl_chunkbits) * + ((page_count + ppc - 1) / ppc); + + CDEBUG(D_CACHE, "requesting %d bytes grant\n", grants); + spin_lock(&cli->cl_loi_list_lock); + if (osc_reserve_grant(cli, grants) == 0) { + list_for_each_entry(oap, list, oap_pending_item) { + osc_consume_write_grant(cli, + &oap->oap_brw_page); + } + atomic_long_add(page_count, &obd_dirty_pages); + osc_unreserve_grant_nolock(cli, grants, 0); + ext->oe_grants = grants; + } else { + /* We cannot report ENOSPC correctly if we do parallel + * DIO (async RPC submission), so turn off parallel dio + * if there is not sufficient grant available. This + * makes individual RPCs synchronous. 
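+			 *
+			 * (Worked example, illustrative sizes only: 4 KiB
+			 * pages and 64 KiB chunks give ppc = 16, so a
+			 * 100-page DIO write above asked for
+			 * cl_grant_extent_tax + 7 chunks = tax + 448 KiB of
+			 * grant; when that reservation fails we land here
+			 * and each RPC completes before the next is sent.)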
+ */
+			io->ci_parallel_dio = false;
+			CDEBUG(D_CACHE,
+			       "not enough grant available, switching to sync for this i/o\n");
+		}
+		spin_unlock(&cli->cl_loi_list_lock);
+		osc_update_next_shrink(cli);
+	}
+
+	ext->oe_is_rdma_only = !!(brw_flags & OBD_BRW_RDMA_ONLY);
+	ext->oe_nr_pages = page_count;
+	ext->oe_mppr = mppr;
+	list_splice_init(list, &ext->oe_pages);
+	ext->oe_layout_version = io->ci_layout_version;
+
+	osc_object_lock(obj);
+	/* Reuse the initial refcount for RPC, don't drop it */
+	osc_extent_state_set(ext, OES_LOCK_DONE);
+	if (!ext->oe_rw) { /* write */
+		if (!ext->oe_srvlock && !ext->oe_dio) {
+			/* The most likely case here is from lack of grants
+			 * so we are either out of quota or out of space.
+			 * Since this means we are holding locks across
+			 * potentially multi-striped IO, we must send
+			 * everything out instantly to avoid prolonged
+			 * waits resulting in lock eviction (likely since
+			 * the extended wait in osc_enter_cache() did not
+			 * yield any additional grant due to a timeout).
+			 * LU-13131 */
+			ext->oe_hp = 1;
+			list_add_tail(&ext->oe_link, &obj->oo_hp_exts);
+		} else {
+			list_add_tail(&ext->oe_link, &obj->oo_urgent_exts);
+		}
+		osc_update_pending(obj, OBD_BRW_WRITE, page_count);
+	} else {
+		list_add_tail(&ext->oe_link, &obj->oo_reading_exts);
+		osc_update_pending(obj, OBD_BRW_READ, page_count);
+	}
+	osc_object_unlock(obj);
+
+	osc_io_unplug_async(env, cli, obj);
+	RETURN(0);
+}
+
+/**
+ * Called by osc_io_setattr_start() to freeze and destroy covering extents.
+ */
+int osc_cache_truncate_start(const struct lu_env *env, struct osc_object *obj,
+			     __u64 size, struct osc_extent **extp)
+{
+	struct client_obd *cli = osc_cli(obj);
+	struct osc_extent *ext;
+	struct osc_extent *waiting = NULL;
+	pgoff_t index;
+	LIST_HEAD(list);
+	int result = 0;
+	bool partial;
+	ENTRY;
+
+	/* pages with index greater or equal to index will be truncated. */
+	index = cl_index(osc2cl(obj), size);
+	partial = size > cl_offset(osc2cl(obj), index);
+
+again:
+	osc_object_lock(obj);
+	ext = osc_extent_search(obj, index);
+	if (ext == NULL)
+		ext = first_extent(obj);
+	else if (ext->oe_end < index)
+		ext = next_extent(ext);
+	while (ext != NULL) {
+		EASSERT(ext->oe_state != OES_TRUNC, ext);
+
+		if (ext->oe_state > OES_CACHE || ext->oe_urgent) {
+			/* if ext is in urgent state, it means there must exist
+			 * a page already having been flushed by write_page().
+			 * We have to wait for this extent because we can't
+			 * truncate that page. */
+			OSC_EXTENT_DUMP(D_CACHE, ext,
+					"waiting for busy extent\n");
+			waiting = osc_extent_get(ext);
+			break;
+		}
+
+		OSC_EXTENT_DUMP(D_CACHE, ext, "try to trunc:%llu.\n", size);
+
+		osc_extent_get(ext);
+		if (ext->oe_state == OES_ACTIVE) {
+			/* though we grab the inode mutex for the write path,
+			 * we release it before releasing the extent (in
+			 * osc_io_end()), so there is a race window in which
+			 * an extent is still in OES_ACTIVE when truncate
+			 * starts.
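+			 *
+			 * (Illustration: the writer dropped the inode mutex
+			 * with the extent still OES_ACTIVE; we set
+			 * oe_trunc_pending below, and the second loop waits
+			 * in osc_extent_wait() until osc_io_end() moves the
+			 * extent to OES_TRUNC before truncating it.)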
*/ + LASSERT(!ext->oe_trunc_pending); + ext->oe_trunc_pending = 1; + } else { + EASSERT(ext->oe_state == OES_CACHE, ext); + osc_extent_state_set(ext, OES_TRUNC); + osc_update_pending(obj, OBD_BRW_WRITE, + -ext->oe_nr_pages); + } + /* This extent could be on the full extents list, that's OK */ + EASSERT(!ext->oe_hp && !ext->oe_urgent, ext); + if (!list_empty(&ext->oe_link)) + list_move_tail(&ext->oe_link, &list); + else + list_add_tail(&ext->oe_link, &list); + + ext = next_extent(ext); + } + osc_object_unlock(obj); + + osc_list_maint(cli, obj); + + while ((ext = list_first_entry_or_null(&list, + struct osc_extent, + oe_link)) != NULL) { + int rc; + + list_del_init(&ext->oe_link); + + /* extent may be in OES_ACTIVE state because inode mutex + * is released before osc_io_end() in file write case */ + if (ext->oe_state != OES_TRUNC) + osc_extent_wait(env, ext, OES_TRUNC); + + rc = osc_extent_truncate(ext, index, partial); + if (rc < 0) { + if (result == 0) + result = rc; + + OSC_EXTENT_DUMP(D_ERROR, ext, + "truncate error %d\n", rc); + } else if (ext->oe_nr_pages == 0) { + osc_extent_remove(ext); + } else { + /* this must be an overlapped extent which means only + * part of pages in this extent have been truncated. + */ + EASSERTF(ext->oe_start <= index, ext, + "trunc index = %lu/%d.\n", index, partial); + /* fix index to skip this partially truncated extent */ + index = ext->oe_end + 1; + partial = false; + + /* we need to hold this extent in OES_TRUNC state so + * that no writeback will happen. This is to avoid + * BUG 17397. + * Only partial truncate can reach here, if @size is + * not zero, the caller should provide a valid @extp. */ + LASSERT(*extp == NULL); + *extp = osc_extent_get(ext); + OSC_EXTENT_DUMP(D_CACHE, ext, + "trunc at %llu\n", size); + } + osc_extent_put(env, ext); + } + if (waiting != NULL) { + int rc; + + /* ignore the result of osc_extent_wait the write initiator + * should take care of it. */ + rc = osc_extent_wait(env, waiting, OES_INV); + if (rc < 0) + OSC_EXTENT_DUMP(D_CACHE, waiting, "error: %d.\n", rc); + + osc_extent_put(env, waiting); + waiting = NULL; + goto again; + } + RETURN(result); +} +EXPORT_SYMBOL(osc_cache_truncate_start); + +/** + * Called after osc_io_setattr_end to add oio->oi_trunc back to cache. + */ +void osc_cache_truncate_end(const struct lu_env *env, struct osc_extent *ext) +{ + if (ext != NULL) { + struct osc_object *obj = ext->oe_obj; + bool unplug = false; + + EASSERT(ext->oe_nr_pages > 0, ext); + EASSERT(ext->oe_state == OES_TRUNC, ext); + EASSERT(!ext->oe_urgent, ext); + + OSC_EXTENT_DUMP(D_CACHE, ext, "trunc -> cache.\n"); + osc_object_lock(obj); + osc_extent_state_set(ext, OES_CACHE); + if (ext->oe_fsync_wait && !ext->oe_urgent) { + ext->oe_urgent = 1; + list_move_tail(&ext->oe_link, &obj->oo_urgent_exts); + unplug = true; + } + osc_update_pending(obj, OBD_BRW_WRITE, ext->oe_nr_pages); + osc_object_unlock(obj); + osc_extent_put(env, ext); + + if (unplug) + osc_io_unplug_async(env, osc_cli(obj), obj); + } +} + +/** + * Wait for extents in a specific range to be written out. + * The caller must have called osc_cache_writeback_range() to issue IO + * otherwise it will take a long time for this function to finish. + * + * Caller must hold inode_mutex , or cancel exclusive dlm lock so that + * nobody else can dirty this range of file while we're waiting for + * extents to be written. 
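+ *
+ * A typical caller sequence is therefore (sketch; this mirrors what
+ * osc_cache_writeback_range() itself does when hp or discard is set):
+ *
+ *	rc = osc_cache_writeback_range(env, obj, start, end, 0, 0);
+ *	if (rc >= 0)
+ *		rc = osc_cache_wait_range(env, obj, start, end);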
+ */ +int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end) +{ + struct osc_extent *ext; + pgoff_t index = start; + int result = 0; + ENTRY; + +again: + osc_object_lock(obj); + ext = osc_extent_search(obj, index); + if (ext == NULL) + ext = first_extent(obj); + else if (ext->oe_end < index) + ext = next_extent(ext); + while (ext != NULL) { + int rc; + + if (ext->oe_start > end) + break; + + if (!ext->oe_fsync_wait) { + ext = next_extent(ext); + continue; + } + + EASSERT(ergo(ext->oe_state == OES_CACHE, + ext->oe_hp || ext->oe_urgent), ext); + EASSERT(ergo(ext->oe_state == OES_ACTIVE, + !ext->oe_hp && ext->oe_urgent), ext); + + index = ext->oe_end + 1; + osc_extent_get(ext); + osc_object_unlock(obj); + + rc = osc_extent_wait(env, ext, OES_INV); + if (result == 0) + result = rc; + osc_extent_put(env, ext); + goto again; + } + osc_object_unlock(obj); + + OSC_IO_DEBUG(obj, "sync file range.\n"); + RETURN(result); +} +EXPORT_SYMBOL(osc_cache_wait_range); + +/** + * Called to write out a range of osc object. + * + * @hp : should be set this is caused by lock cancel; + * @discard: is set if dirty pages should be dropped - file will be deleted or + * truncated, this implies there is no partially discarding extents. + * + * Return how many pages will be issued, or error code if error occurred. + */ +int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end, int hp, int discard) +{ + struct osc_extent *ext; + LIST_HEAD(discard_list); + bool unplug = false; + int result = 0; + ENTRY; + + osc_object_lock(obj); + ext = osc_extent_search(obj, start); + if (ext == NULL) + ext = first_extent(obj); + else if (ext->oe_end < start) + ext = next_extent(ext); + while (ext != NULL) { + if (ext->oe_start > end) + break; + + ext->oe_fsync_wait = 1; + switch (ext->oe_state) { + case OES_CACHE: + result += ext->oe_nr_pages; + if (!discard) { + struct list_head *list = NULL; + if (hp) { + EASSERT(!ext->oe_hp, ext); + ext->oe_hp = 1; + list = &obj->oo_hp_exts; + } else if (!ext->oe_urgent && !ext->oe_hp) { + ext->oe_urgent = 1; + list = &obj->oo_urgent_exts; + } + if (list != NULL) + list_move_tail(&ext->oe_link, list); + unplug = true; + } else { + struct client_obd *cli = osc_cli(obj); + int pcc_bits = cli->cl_chunkbits - PAGE_SHIFT; + pgoff_t align_by = (1 << pcc_bits); + pgoff_t a_start = round_down(start, align_by); + pgoff_t a_end = round_up(end, align_by); + + /* overflow case */ + if (end && !a_end) + a_end = CL_PAGE_EOF; + /* the only discarder is lock cancelling, so + * [start, end], aligned by chunk size, must + * contain this extent */ + LASSERTF(ext->oe_start >= a_start && + ext->oe_end <= a_end, + "ext [%lu, %lu] reg [%lu, %lu] " + "orig [%lu %lu] align %lu bits " + "%d\n", ext->oe_start, ext->oe_end, + a_start, a_end, start, end, + align_by, pcc_bits); + osc_extent_state_set(ext, OES_LOCKING); + ext->oe_owner = current; + list_move_tail(&ext->oe_link, + &discard_list); + osc_update_pending(obj, OBD_BRW_WRITE, + -ext->oe_nr_pages); + } + break; + case OES_ACTIVE: + /* It's pretty bad to wait for ACTIVE extents, because + * we don't know how long we will wait for it to be + * flushed since it may be blocked at awaiting more + * grants. We do this for the correctness of fsync. */ + LASSERT(hp == 0 && discard == 0); + ext->oe_urgent = 1; + break; + case OES_TRUNC: + /* this extent is being truncated, can't do anything + * for it now. 
it will be set to urgent after truncate + * is finished in osc_cache_truncate_end(). */ + default: + break; + } + ext = next_extent(ext); + } + osc_object_unlock(obj); + + LASSERT(ergo(!discard, list_empty(&discard_list))); + if (!list_empty(&discard_list)) { + struct osc_extent *tmp; + int rc; + + osc_list_maint(osc_cli(obj), obj); + list_for_each_entry_safe(ext, tmp, &discard_list, oe_link) { + list_del_init(&ext->oe_link); + EASSERT(ext->oe_state == OES_LOCKING, ext); + + /* Discard caching pages. We don't actually write this + * extent out but we complete it as if we did. */ + rc = osc_extent_make_ready(env, ext); + if (unlikely(rc < 0)) { + OSC_EXTENT_DUMP(D_ERROR, ext, + "make_ready returned %d\n", rc); + if (result >= 0) + result = rc; + } + + /* finish the extent as if the pages were sent */ + osc_extent_finish(env, ext, 0, 0); + } + } + + if (unplug) + osc_io_unplug(env, osc_cli(obj), obj); + + if (hp || discard) { + int rc; + rc = osc_cache_wait_range(env, obj, start, end); + if (result >= 0 && rc < 0) + result = rc; + } + + OSC_IO_DEBUG(obj, "pageout [%lu, %lu], %d.\n", start, end, result); + RETURN(result); +} +EXPORT_SYMBOL(osc_cache_writeback_range); + +/** + * Returns a list of pages by a given [start, end] of \a obj. + * + * Gang tree lookup (radix_tree_gang_lookup()) optimization is absolutely + * crucial in the face of [offset, EOF] locks. + * + * Return at least one page in @queue unless there is no covered page. + */ +bool osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, + struct osc_object *osc, pgoff_t start, pgoff_t end, + osc_page_gang_cbt cb, void *cbdata) +{ + struct osc_page *ops; + struct pagevec *pagevec; + void **pvec; + pgoff_t idx; + unsigned int nr; + unsigned int i; + unsigned int j; + bool res = true; + bool tree_lock = true; + ENTRY; + + idx = start; + pvec = osc_env_info(env)->oti_pvec; + pagevec = &osc_env_info(env)->oti_pagevec; + ll_pagevec_init(pagevec, 0); + spin_lock(&osc->oo_tree_lock); + while ((nr = radix_tree_gang_lookup(&osc->oo_tree, pvec, + idx, OTI_PVEC_SIZE)) > 0) { + struct cl_page *page; + bool end_of_region = false; + + for (i = 0, j = 0; i < nr; ++i) { + ops = pvec[i]; + pvec[i] = NULL; + + idx = osc_index(ops); + if (idx > end) { + end_of_region = true; + break; + } + + page = ops->ops_cl.cpl_page; + LASSERT(page->cp_type == CPT_CACHEABLE); + if (page->cp_state == CPS_FREEING) + continue; + + cl_page_get(page); + lu_ref_add_atomic(&page->cp_reference, + "gang_lookup", current); + pvec[j++] = ops; + } + ++idx; + + /* + * Here a delicate locking dance is performed. Current thread + * holds a reference to a page, but has to own it before it + * can be placed into queue. Owning implies waiting, so + * radix-tree lock is to be released. After a wait one has to + * check that pages weren't truncated (cl_page_own() returns + * error in the latter case). 
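+		 *
+		 * In short: (1) hold oo_tree_lock and gang-lookup a batch;
+		 * (2) take a reference on every page found; (3) drop the
+		 * tree lock; (4) let @cb own and process the batch;
+		 * (5) release the references; (6) re-take the lock and
+		 * continue from the next index.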
+ */ + spin_unlock(&osc->oo_tree_lock); + tree_lock = false; + + res = (*cb)(env, io, pvec, j, cbdata); + + for (i = 0; i < j; ++i) { + ops = pvec[i]; + page = ops->ops_cl.cpl_page; + lu_ref_del(&page->cp_reference, "gang_lookup", current); + cl_pagevec_put(env, page, pagevec); + } + pagevec_release(pagevec); + + if (nr < OTI_PVEC_SIZE || end_of_region) + break; + + if (!res) + break; + + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_SLOW_PAGE_EVICT, + cfs_fail_val ?: 20); + + if (io->ci_type == CIT_MISC && + io->u.ci_misc.lm_next_rpc_time && + ktime_get_seconds() > io->u.ci_misc.lm_next_rpc_time) { + osc_send_empty_rpc(osc, idx << PAGE_SHIFT); + io->u.ci_misc.lm_next_rpc_time = ktime_get_seconds() + + 5 * obd_timeout / 16; + } + + if (need_resched()) + cond_resched(); + + spin_lock(&osc->oo_tree_lock); + tree_lock = true; + } + if (tree_lock) + spin_unlock(&osc->oo_tree_lock); + RETURN(res); +} +EXPORT_SYMBOL(osc_page_gang_lookup); + +/** + * Check if page @page is covered by an extra lock or discard it. + */ +static bool check_and_discard_cb(const struct lu_env *env, struct cl_io *io, + void **pvec, int count, void *cbdata) +{ + struct osc_thread_info *info = osc_env_info(env); + struct osc_object *osc = cbdata; + int i; + + for (i = 0; i < count; i++) { + struct osc_page *ops = pvec[i]; + struct cl_page *page = ops->ops_cl.cpl_page; + pgoff_t index = osc_index(ops); + bool discard = false; + + /* negative lock caching */ + if (index < info->oti_ng_index) { + discard = true; + } else if (index >= info->oti_fn_index) { + struct ldlm_lock *tmp; + /* refresh non-overlapped index */ + tmp = osc_dlmlock_at_pgoff(env, osc, index, + OSC_DAP_FL_TEST_LOCK | + OSC_DAP_FL_AST | + OSC_DAP_FL_RIGHT); + if (tmp != NULL) { + __u64 end = + tmp->l_policy_data.l_extent.end; + __u64 start = + tmp->l_policy_data.l_extent.start; + + /* no lock covering this page */ + if (index < cl_index(osc2cl(osc), start)) { + /* no lock at @index, + * first lock at @start + */ + info->oti_ng_index = + cl_index(osc2cl(osc), start); + discard = true; + } else { + /* Cache the first-non-overlapped + * index so as to skip all pages + * within [index, oti_fn_index). + * This is safe because if tmp lock + * is canceled, it will discard these + * pages. + */ + info->oti_fn_index = + cl_index(osc2cl(osc), end + 1); + if (end == OBD_OBJECT_EOF) + info->oti_fn_index = + CL_PAGE_EOF; + } + LDLM_LOCK_PUT(tmp); + } else { + info->oti_ng_index = CL_PAGE_EOF; + discard = true; + } + } + + if (discard) { + if (cl_page_own(env, io, page) == 0) { + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + } else { + LASSERT(page->cp_state == CPS_FREEING); + } + } + + info->oti_next_index = index + 1; + } + return true; +} + +bool osc_discard_cb(const struct lu_env *env, struct cl_io *io, + void **pvec, int count, void *cbdata) +{ + struct osc_thread_info *info = osc_env_info(env); + int i; + + for (i = 0; i < count; i++) { + struct osc_page *ops = pvec[i]; + struct cl_page *page = ops->ops_cl.cpl_page; + + /* page is top page. */ + info->oti_next_index = osc_index(ops) + 1; + if (cl_page_own(env, io, page) == 0) { + if (!ergo(page->cp_type == CPT_CACHEABLE, + !PageDirty(cl_page_vmpage(page)))) + CL_PAGE_DEBUG(D_ERROR, env, page, + "discard dirty page?\n"); + + /* discard the page */ + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + } else { + LASSERT(page->cp_state == CPS_FREEING); + } + } + + return true; +} +EXPORT_SYMBOL(osc_discard_cb); + +/** + * Discard pages protected by the given lock. 
This function traverses radix + * tree to find all covering pages and discard them. If a page is being covered + * by other locks, it should remain in cache. + * + * If error happens on any step, the process continues anyway (the reasoning + * behind this being that lock cancellation cannot be delayed indefinitely). + */ +int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc, + pgoff_t start, pgoff_t end, bool discard) +{ + struct osc_thread_info *info = osc_env_info(env); + struct cl_io *io = osc_env_thread_io(env); + osc_page_gang_cbt cb; + int result; + + ENTRY; + + io->ci_obj = cl_object_top(osc2cl(osc)); + io->ci_ignore_layout = 1; + io->ci_invalidate_page_cache = 1; + io->u.ci_misc.lm_next_rpc_time = ktime_get_seconds() + + 5 * obd_timeout / 16; + result = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (result != 0) + GOTO(out, result); + + cb = discard ? osc_discard_cb : check_and_discard_cb; + info->oti_fn_index = info->oti_next_index = start; + info->oti_ng_index = 0; + + osc_page_gang_lookup(env, io, osc, + info->oti_next_index, end, cb, osc); +out: + cl_io_fini(env, io); + RETURN(result); +} + + +/** @} osc */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_dev.c b/drivers/staging/lustrefsx/lustre/osc/osc_dev.c new file mode 100644 index 0000000000000..a2d3bcaab069a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/osc_dev.c @@ -0,0 +1,252 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Implementation of cl_device, for OSC layer. 
+ * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_OSC + +/* class_name2obd() */ +#include +#include +#include + +#include "osc_internal.h" + +/** \addtogroup osc + * @{ + */ + +struct kmem_cache *osc_lock_kmem; +EXPORT_SYMBOL(osc_lock_kmem); +struct kmem_cache *osc_object_kmem; +EXPORT_SYMBOL(osc_object_kmem); + +struct kmem_cache *osc_thread_kmem; +struct kmem_cache *osc_session_kmem; +struct kmem_cache *osc_extent_kmem; +struct kmem_cache *osc_quota_kmem; +struct kmem_cache *osc_obdo_kmem; + +struct lu_kmem_descr osc_caches[] = { + { + .ckd_cache = &osc_lock_kmem, + .ckd_name = "osc_lock_kmem", + .ckd_size = sizeof (struct osc_lock) + }, + { + .ckd_cache = &osc_object_kmem, + .ckd_name = "osc_object_kmem", + .ckd_size = sizeof (struct osc_object) + }, + { + .ckd_cache = &osc_thread_kmem, + .ckd_name = "osc_thread_kmem", + .ckd_size = sizeof (struct osc_thread_info) + }, + { + .ckd_cache = &osc_session_kmem, + .ckd_name = "osc_session_kmem", + .ckd_size = sizeof (struct osc_session) + }, + { + .ckd_cache = &osc_extent_kmem, + .ckd_name = "osc_extent_kmem", + .ckd_size = sizeof (struct osc_extent) + }, + { + .ckd_cache = &osc_quota_kmem, + .ckd_name = "osc_quota_kmem", + .ckd_size = sizeof(struct osc_quota_info) + }, + { + .ckd_cache = &osc_obdo_kmem, + .ckd_name = "osc_obdo_kmem", + .ckd_size = sizeof(struct obdo) + }, + { + .ckd_cache = NULL + } +}; + +/***************************************************************************** + * + * Osc device and device type functions. + * + */ + +static void *osc_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct osc_thread_info *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, osc_thread_kmem, GFP_NOFS); + if (info == NULL) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void osc_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct osc_thread_info *info = data; + + lu_buf_free(&info->oti_ladvise_buf); + OBD_SLAB_FREE_PTR(info, osc_thread_kmem); +} + +struct lu_context_key osc_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = osc_key_init, + .lct_fini = osc_key_fini +}; +EXPORT_SYMBOL(osc_key); + +static void *osc_session_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct osc_session *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, osc_session_kmem, GFP_NOFS); + if (info == NULL) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void osc_session_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct osc_session *info = data; + OBD_SLAB_FREE_PTR(info, osc_session_kmem); +} + +struct lu_context_key osc_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = osc_session_init, + .lct_fini = osc_session_fini +}; +EXPORT_SYMBOL(osc_session_key); + +/* type constructor/destructor: osc_type_{init,fini,start,stop}(). */ +LU_TYPE_INIT_FINI(osc, &osc_key, &osc_session_key); + +static int osc_process_config(const struct lu_env *env, struct lu_device *d, + struct lustre_cfg *cfg) +{ + ssize_t count = class_modify_config(cfg, PARAM_OSC, + &d->ld_obd->obd_kset.kobj); + return count > 0 ? 
0 : count; +} + +static const struct lu_device_operations osc_lu_ops = { + .ldo_object_alloc = osc_object_alloc, + .ldo_process_config = osc_process_config, + .ldo_recovery_complete = NULL +}; + +int osc_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + RETURN(0); +} +EXPORT_SYMBOL(osc_device_init); + +struct lu_device *osc_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + return NULL; +} +EXPORT_SYMBOL(osc_device_fini); + +struct lu_device *osc_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct osc_device *oc = lu2osc_dev(d); + + cl_device_fini(lu2cl_dev(d)); + OBD_FREE_PTR(oc); + return NULL; +} +EXPORT_SYMBOL(osc_device_free); + +static struct lu_device *osc_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct lu_device *d; + struct osc_device *osc; + struct obd_device *obd; + int rc; + + OBD_ALLOC_PTR(osc); + if (osc == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + cl_device_init(&osc->osc_cl, t); + d = osc2lu_dev(osc); + d->ld_ops = &osc_lu_ops; + + /* Setup OSC OBD */ + obd = class_name2obd(lustre_cfg_string(cfg, 0)); + LASSERT(obd != NULL); + rc = osc_setup(obd, cfg); + if (rc) { + osc_device_free(env, d); + RETURN(ERR_PTR(rc)); + } + osc->osc_exp = obd->obd_self_export; + osc->osc_stats.os_init = ktime_get_real(); + RETURN(d); +} + +static const struct lu_device_type_operations osc_device_type_ops = { + .ldto_init = osc_type_init, + .ldto_fini = osc_type_fini, + + .ldto_start = osc_type_start, + .ldto_stop = osc_type_stop, + + .ldto_device_alloc = osc_device_alloc, + .ldto_device_free = osc_device_free, + + .ldto_device_init = osc_device_init, + .ldto_device_fini = osc_device_fini +}; + +struct lu_device_type osc_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_OSC_NAME, + .ldt_ops = &osc_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; + +/** @} osc */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_internal.h b/drivers/staging/lustrefsx/lustre/osc/osc_internal.h new file mode 100644 index 0000000000000..52a7ab503d419 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/osc_internal.h @@ -0,0 +1,222 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#ifndef OSC_INTERNAL_H +#define OSC_INTERNAL_H + +#define OAP_MAGIC 8675309 + +#include +#include + +extern atomic_t osc_pool_req_count; +extern unsigned int osc_reqpool_maxreqcount; +extern struct ptlrpc_request_pool *osc_rq_pool; + +int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes); +void osc_schedule_grant_work(void); +void osc_update_next_shrink(struct client_obd *cli); +int lru_queue_work(const struct lu_env *env, void *data); +int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext, + int sent, int rc); +void osc_extent_release(const struct lu_env *env, struct osc_extent *ext); +int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc, + pgoff_t start, pgoff_t end, bool discard); + +void osc_lock_lvb_update(const struct lu_env *env, + struct osc_object *osc, + struct ldlm_lock *dlmlock, + struct ost_lvb *lvb); + +int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, + __u64 *flags, union ldlm_policy_data *policy, + struct ost_lvb *lvb, osc_enqueue_upcall_f upcall, + void *cookie, struct ldlm_enqueue_info *einfo, + struct ptlrpc_request_set *rqset, int async, + bool speculative); + +int osc_match_base(const struct lu_env *env, struct obd_export *exp, + struct ldlm_res_id *res_id, enum ldlm_type type, + union ldlm_policy_data *policy, enum ldlm_mode mode, + __u64 *flags, struct osc_object *obj, + struct lustre_handle *lockh, enum ldlm_match_flags match_flags); + +int osc_setattr_async(struct obd_export *exp, struct obdo *oa, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset); +int osc_fallocate_base(struct obd_export *exp, struct obdo *oa, + obd_enqueue_update_f upcall, void *cookie, + int mode); +int osc_sync_base(struct osc_object *obj, struct obdo *oa, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset); +int osc_ladvise_base(struct obd_export *exp, struct obdo *oa, + struct ladvise_hdr *ladvise_hdr, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset); +int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *cfg); +int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, + struct list_head *ext_list, int cmd); +void osc_send_empty_rpc(struct osc_object *osc, pgoff_t start); +unsigned long osc_lru_reserve(struct client_obd *cli, unsigned long npages); +void osc_lru_unreserve(struct client_obd *cli, unsigned long npages); + +extern struct lu_kmem_descr osc_caches[]; + +unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock); + +int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg); + +int osc_tunables_init(struct obd_device *obd); + +extern struct lu_device_type osc_device_type; + +static inline struct cl_io *osc_env_thread_io(const struct lu_env *env) +{ + struct cl_io *io = &osc_env_info(env)->oti_io; + + memset(io, 0, sizeof(*io)); + return io; +} + +static inline int osc_is_object(const struct lu_object *obj) +{ + return obj->lo_dev->ld_type == &osc_device_type; +} + +static inline struct osc_lock *osc_lock_at(const struct cl_lock *lock) +{ + return cl2osc_lock(cl_lock_at(lock, &osc_device_type)); +} + +int osc_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io); +int osc_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); +struct lu_object *osc_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + 
struct lu_device *dev);
+
+static inline int osc_recoverable_error(int rc)
+{
+	return (rc == -EIO || rc == -EROFS || rc == -ENOMEM ||
+		rc == -EAGAIN || rc == -EINPROGRESS);
+}
+
+static inline unsigned long rpcs_in_flight(struct client_obd *cli)
+{
+	return cli->cl_r_in_flight + cli->cl_w_in_flight;
+}
+
+static inline char *cli_name(struct client_obd *cli)
+{
+	return cli->cl_import->imp_obd->obd_name;
+}
+
+static inline char list_empty_marker(struct list_head *list)
+{
+	return list_empty(list) ? '-' : '+';
+}
+
+struct osc_async_args {
+	struct obd_info *aa_oi;
+};
+
+int osc_quota_setup(struct obd_device *obd);
+int osc_quota_cleanup(struct obd_device *obd);
+int osc_quota_setdq(struct client_obd *cli, __u64 xid, const unsigned int qid[],
+		    u64 valid, u32 flags);
+int osc_quota_chkdq(struct client_obd *cli, const unsigned int qid[]);
+int osc_quotactl(struct obd_device *unused, struct obd_export *exp,
+		 struct obd_quotactl *oqctl);
+void osc_inc_unstable_pages(struct ptlrpc_request *req);
+void osc_dec_unstable_pages(struct ptlrpc_request *req);
+bool osc_over_unstable_soft_limit(struct client_obd *cli);
+void osc_page_touch_at(const struct lu_env *env, struct cl_object *obj,
+		       pgoff_t idx, size_t to);
+
+struct ldlm_lock *osc_obj_dlmlock_at_pgoff(const struct lu_env *env,
+					   struct osc_object *obj,
+					   pgoff_t index,
+					   enum osc_dap_flags flags);
+
+int osc_object_invalidate(const struct lu_env *env, struct osc_object *osc);
+
+/** osc shrink list to link all osc client obd */
+extern struct list_head osc_shrink_list;
+/** spin lock to protect osc_shrink_list */
+extern spinlock_t osc_shrink_lock;
+extern unsigned long osc_cache_shrink_count(struct shrinker *sk,
+					    struct shrink_control *sc);
+extern unsigned long osc_cache_shrink_scan(struct shrinker *sk,
+					   struct shrink_control *sc);
+static inline unsigned int osc_max_write_chunks(const struct client_obd *cli)
+{
+	/*
+	 * LU-8135:
+	 *
+	 * The maximum size of a single transaction is about 64MB in ZFS.
+	 * #define DMU_MAX_ACCESS (64 * 1024 * 1024)
+	 *
+	 * Since ZFS is a copy-on-write file system, a single dirty page in
+	 * a chunk will result in the rewrite of the whole chunk, therefore
+	 * an RPC shouldn't be allowed to contain too many chunks, otherwise
+	 * it will make the transaction size much bigger than 64MB, especially
+	 * with big block size for ZFS.
+	 *
+	 * This piece of code is to make sure that OSC won't send write RPCs
+	 * with too many chunks. The maximum chunk size that an RPC can cover
+	 * is set to PTLRPC_MAX_BRW_SIZE, which is defined to 16MB. Ideally
+	 * the OST should tell the client what the biggest transaction size
+	 * is, but it's good enough for now.
+	 *
+	 * This limitation doesn't apply to ldiskfs, which allows as many
+	 * chunks in one RPC as we want. However, there is no benefit in
+	 * having too many discontiguous pages in one RPC.
+	 *
+	 * An osc_extent won't cover more than an RPC size, so the chunks in
+	 * an osc_extent won't be bigger than PTLRPC_MAX_BRW_SIZE >> chunkbits.
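+	 *
+	 * Illustrative arithmetic (assumed values, not prescribed by this
+	 * code): with cl_chunkbits = 20, i.e. 1MB chunks to match a 1MB
+	 * ZFS recordsize, the result is 16MB >> 20 = 16, so a single
+	 * write RPC is capped at 16 distinct chunks.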
+ */ + return PTLRPC_MAX_BRW_SIZE >> cli->cl_chunkbits; +} + +static inline void osc_set_io_portal(struct ptlrpc_request *req) +{ + struct obd_import *imp = req->rq_import; + + /* Distinguish OSC from MDC here to use OST or MDS portal */ + if (OCD_HAS_FLAG(&imp->imp_connect_data, IBITS)) + req->rq_request_portal = MDS_IO_PORTAL; + else + req->rq_request_portal = OST_IO_PORTAL; +} + +#endif /* OSC_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_io.c b/drivers/staging/lustrefsx/lustre/osc/osc_io.c new file mode 100644 index 0000000000000..e4bd2738a6cb3 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/osc_io.c @@ -0,0 +1,1321 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Implementation of cl_io for OSC layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_OSC + +#include +#include +#include +#include + +#include "osc_internal.h" +#include + +/** \addtogroup osc + * @{ + */ + +/***************************************************************************** + * + * io operations. 
+ * + */ + +static void osc_io_fini(const struct lu_env *env, const struct cl_io_slice *io) +{ +} + +void osc_read_ahead_release(const struct lu_env *env, struct cl_read_ahead *ra) +{ + struct ldlm_lock *dlmlock = ra->cra_dlmlock; + struct osc_io *oio = ra->cra_oio; + struct lustre_handle lockh; + + oio->oi_is_readahead = 0; + ldlm_lock2handle(dlmlock, &lockh); + ldlm_lock_decref(&lockh, LCK_PR); + LDLM_LOCK_PUT(dlmlock); +} +EXPORT_SYMBOL(osc_read_ahead_release); + +static int osc_io_read_ahead(const struct lu_env *env, + const struct cl_io_slice *ios, + pgoff_t start, struct cl_read_ahead *ra) +{ + struct osc_object *osc = cl2osc(ios->cis_obj); + struct osc_io *oio = cl2osc_io(env, ios); + struct ldlm_lock *dlmlock; + int result = -ENODATA; + + ENTRY; + + oio->oi_is_readahead = true; + dlmlock = osc_dlmlock_at_pgoff(env, osc, start, 0); + if (dlmlock != NULL) { + LASSERT(dlmlock->l_ast_data == osc); + if (dlmlock->l_req_mode != LCK_PR) { + struct lustre_handle lockh; + ldlm_lock2handle(dlmlock, &lockh); + ldlm_lock_addref(&lockh, LCK_PR); + ldlm_lock_decref(&lockh, dlmlock->l_req_mode); + } + + ra->cra_rpc_pages = osc_cli(osc)->cl_max_pages_per_rpc; + ra->cra_end_idx = cl_index(osc2cl(osc), + dlmlock->l_policy_data.l_extent.end); + ra->cra_release = osc_read_ahead_release; + ra->cra_dlmlock = dlmlock; + ra->cra_oio = oio; + if (ra->cra_end_idx != CL_PAGE_EOF) + ra->cra_contention = true; + result = 0; + } + + RETURN(result); +} + +/** + * An implementation of cl_io_operations::cio_io_submit() method for osc + * layer. Iterates over pages in the in-queue, prepares each for io by calling + * cl_page_prep() and then either submits them through osc_io_submit_page() + * or, if page is already submitted, changes osc flags through + * osc_set_async_flags(). + */ +int osc_io_submit(const struct lu_env *env, const struct cl_io_slice *ios, + enum cl_req_type crt, struct cl_2queue *queue) +{ + struct cl_page *page; + struct cl_page *tmp; + struct client_obd *cli = NULL; + struct osc_object *osc = NULL; /* to keep gcc happy */ + struct osc_page *opg; + struct cl_io *io; + LIST_HEAD(list); + + struct cl_page_list *qin = &queue->c2_qin; + struct cl_page_list *qout = &queue->c2_qout; + unsigned int queued = 0; + int result = 0; + int brw_flags; + unsigned int max_pages; + unsigned int ppc_bits; /* pages per chunk bits */ + unsigned int ppc; + ktime_t submit_time = ktime_get(); + bool sync_queue = false; + + LASSERT(qin->pl_nr > 0); + + CDEBUG(D_CACHE|D_READA, "%d %d\n", qin->pl_nr, crt); + + osc = cl2osc(ios->cis_obj); + cli = osc_cli(osc); + max_pages = cli->cl_max_pages_per_rpc; + ppc_bits = cli->cl_chunkbits - PAGE_SHIFT; + ppc = 1 << ppc_bits; + + brw_flags = osc_io_srvlock(cl2osc_io(env, ios)) ? OBD_BRW_SRVLOCK : 0; + brw_flags |= crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ; + if (crt == CRT_READ && ios->cis_io->ci_ndelay) + brw_flags |= OBD_BRW_NDELAY; + + page = cl_page_list_first(qin); + if (page->cp_type == CPT_TRANSIENT) + brw_flags |= OBD_BRW_NOCACHE; + if (lnet_is_rdma_only_page(page->cp_vmpage)) + brw_flags |= OBD_BRW_RDMA_ONLY; + + /* + * NOTE: here @page is a top-level page. This is done to avoid + * creation of sub-page-list. + */ + cl_page_list_for_each_safe(page, tmp, qin) { + struct osc_async_page *oap; + + /* Top level IO. 
*/
+		io = page->cp_owner;
+		LASSERT(io != NULL);
+
+		opg = osc_cl_page_osc(page, osc);
+		oap = &opg->ops_oap;
+		LASSERT(osc == oap->oap_obj);
+
+		if (!list_empty(&oap->oap_pending_item) ||
+		    !list_empty(&oap->oap_rpc_item)) {
+			CDEBUG(D_CACHE, "Busy oap %p page %p for submit.\n",
+			       oap, opg);
+			result = -EBUSY;
+			break;
+		}
+
+		result = cl_page_prep(env, io, page, crt);
+		if (result != 0) {
+			LASSERT(result < 0);
+			if (result != -EALREADY)
+				break;
+			/*
+			 * Handle -EALREADY error: for the read case, the page
+			 * is already in UPTODATE state; for write, the page
+			 * is not dirty.
+			 */
+			result = 0;
+			continue;
+		}
+
+		if (page->cp_type != CPT_TRANSIENT) {
+			spin_lock(&oap->oap_lock);
+			oap->oap_async_flags = ASYNC_URGENT|ASYNC_READY;
+			oap->oap_async_flags |= ASYNC_COUNT_STABLE;
+			spin_unlock(&oap->oap_lock);
+		}
+
+		osc_page_submit(env, opg, crt, brw_flags, submit_time);
+		list_add_tail(&oap->oap_pending_item, &list);
+
+		if (page->cp_sync_io != NULL)
+			cl_page_list_move(qout, qin, page);
+		else /* async IO */
+			cl_page_list_del(env, qin, page);
+
+		queued++;
+		if (queued == max_pages) {
+			sync_queue = true;
+		} else if (crt == CRT_WRITE) {
+			unsigned int chunks;
+			unsigned int next_chunks;
+
+			chunks = (queued + ppc - 1) >> ppc_bits;
+			/* chunk count if another page is added */
+			next_chunks = (queued + ppc) >> ppc_bits;
+
+			/* the next page would exceed the write chunk limit */
+			if (chunks == osc_max_write_chunks(cli) &&
+			    next_chunks > chunks)
+				sync_queue = true;
+		}
+
+		if (sync_queue) {
+			result = osc_queue_sync_pages(env, io, osc, &list,
+						      brw_flags);
+			if (result < 0)
+				break;
+			queued = 0;
+			sync_queue = false;
+		}
+	}
+
+	if (queued > 0)
+		result = osc_queue_sync_pages(env, io, osc, &list, brw_flags);
+
+	/* Update c/mtime for sync write. LU-7310 */
+	if (crt == CRT_WRITE && qout->pl_nr > 0 && result == 0) {
+		struct cl_object *obj = ios->cis_obj;
+		struct cl_attr *attr = &osc_env_info(env)->oti_attr;
+
+		cl_object_attr_lock(obj);
+		attr->cat_mtime = attr->cat_ctime = ktime_get_real_seconds();
+		cl_object_attr_update(env, obj, attr, CAT_MTIME | CAT_CTIME);
+		cl_object_attr_unlock(obj);
+	}
+
+	CDEBUG(D_INFO, "%d/%d %d\n", qin->pl_nr, qout->pl_nr, result);
+	return qout->pl_nr > 0 ? 0 : result;
+}
+EXPORT_SYMBOL(osc_io_submit);
+
+/**
+ * This is called to update the attributes when modifying a specific page,
+ * both when making new pages and when doing updates to existing cached pages.
+ *
+ * Expand stripe KMS if necessary.
+ */
+void osc_page_touch_at(const struct lu_env *env, struct cl_object *obj,
+		       pgoff_t idx, size_t to)
+{
+	struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo;
+	struct cl_attr *attr = &osc_env_info(env)->oti_attr;
+	int valid;
+	__u64 kms;
+
+	ENTRY;
+
+	/* offset within stripe */
+	kms = cl_offset(obj, idx) + to;
+
+	cl_object_attr_lock(obj);
+	CDEBUG(D_INODE, "stripe KMS %sincreasing %llu->%llu %llu\n",
+	       kms > loi->loi_kms ? "" : "not ", loi->loi_kms, kms,
+	       loi->loi_lvb.lvb_size);
+
+	attr->cat_mtime = attr->cat_ctime = ktime_get_real_seconds();
+	valid = CAT_MTIME | CAT_CTIME;
+	if (kms > loi->loi_kms) {
+		attr->cat_kms = kms;
+		valid |= CAT_KMS;
+	}
+	if (kms > loi->loi_lvb.lvb_size) {
+		attr->cat_size = kms;
+		valid |= CAT_SIZE;
+	}
+	cl_object_attr_update(env, obj, attr, valid);
+	cl_object_attr_unlock(obj);
+
+	EXIT;
+}
+
+int osc_io_commit_async(const struct lu_env *env,
+			const struct cl_io_slice *ios,
+			struct cl_page_list *qin, int from, int to,
+			cl_commit_cbt cb)
+{
+	struct cl_io *io = ios->cis_io;
+	struct osc_io *oio = cl2osc_io(env, ios);
+	struct osc_object *osc = cl2osc(ios->cis_obj);
+	struct cl_page *page;
+	struct cl_page *last_page;
+	struct osc_page *opg;
+	struct pagevec *pvec = &osc_env_info(env)->oti_pagevec;
+	int result = 0;
+	ENTRY;
+
+	LASSERT(qin->pl_nr > 0);
+
+	/* Handle partial page cases */
+	last_page = cl_page_list_last(qin);
+	if (oio->oi_lockless) {
+		page = cl_page_list_first(qin);
+		if (page == last_page) {
+			cl_page_clip(env, page, from, to);
+		} else {
+			if (from != 0)
+				cl_page_clip(env, page, from, PAGE_SIZE);
+			if (to != PAGE_SIZE)
+				cl_page_clip(env, last_page, 0, to);
+		}
+	}
+
+	ll_pagevec_init(pvec, 0);
+
+	while (qin->pl_nr > 0) {
+		struct osc_async_page *oap;
+
+		page = cl_page_list_first(qin);
+		opg = osc_cl_page_osc(page, osc);
+		oap = &opg->ops_oap;
+
+		LASSERTF(osc == oap->oap_obj,
+			 "obj mismatch: %p / %p\n", osc, oap->oap_obj);
+
+		if (!list_empty(&oap->oap_rpc_item)) {
+			CDEBUG(D_CACHE, "Busy oap %p page %p for submit.\n",
+			       oap, opg);
+			result = -EBUSY;
+			break;
+		}
+
+		/* The page may already be in the dirty cache. */
+		if (list_empty(&oap->oap_pending_item)) {
+			result = osc_page_cache_add(env, opg, io, cb);
+			if (result != 0)
+				break;
+		}
+
+		osc_page_touch_at(env, osc2cl(osc), osc_index(opg),
+				  page == last_page ? to : PAGE_SIZE);
+
+		cl_page_list_del(env, qin, page);
+
+		/* if there are no more slots, do the callback & reinit */
+		if (pagevec_add(pvec, page->cp_vmpage) == 0) {
+			(*cb)(env, io, pvec);
+			pagevec_reinit(pvec);
+		}
+	}
+	/* The shrink interval is in seconds, so we can update it once per
+	 * write, rather than once per page.
+	 */
+	osc_update_next_shrink(osc_cli(osc));
+
+	/* Clean up any partially full pagevecs */
+	if (pagevec_count(pvec) != 0)
+		(*cb)(env, io, pvec);
+
+	/* Can't access these pages any more. A page can be in transfer and
+	 * complete at any time. */
+
+	/* for sync write, the kernel will wait for this page to be flushed
+	 * before osc_io_end() is called, so release it earlier.
+	 * for mkwrite(), it's known there are no further pages.
*/ + if (cl_io_is_sync_write(io) && oio->oi_active != NULL) { + osc_extent_release(env, oio->oi_active); + oio->oi_active = NULL; + } + + CDEBUG(D_INFO, "%d %d\n", qin->pl_nr, result); + RETURN(result); +} +EXPORT_SYMBOL(osc_io_commit_async); + +void osc_io_extent_release(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct osc_io *oio = cl2osc_io(env, ios); + + if (oio->oi_active != NULL) { + osc_extent_release(env, oio->oi_active); + oio->oi_active = NULL; + } +} +EXPORT_SYMBOL(osc_io_extent_release); + +static bool osc_import_not_healthy(struct obd_import *imp) +{ + return imp->imp_invalid || imp->imp_deactive || + !(imp->imp_state == LUSTRE_IMP_FULL || + imp->imp_state == LUSTRE_IMP_IDLE); +} + +int osc_io_iter_init(const struct lu_env *env, const struct cl_io_slice *ios) +{ + struct osc_object *osc = cl2osc(ios->cis_obj); + struct obd_import *imp = osc_cli(osc)->cl_import; + struct osc_io *oio = osc_env_io(env); + int rc = -EIO; + + ENTRY; + + spin_lock(&imp->imp_lock); + /** + * check whether this OSC device is available for non-delay read, + * fast switching mirror if we haven't tried all mirrors. + */ + if (ios->cis_io->ci_type == CIT_READ && ios->cis_io->ci_ndelay && + !ios->cis_io->ci_tried_all_mirrors && osc_import_not_healthy(imp)) { + rc = -EAGAIN; + } else if (likely(!imp->imp_invalid)) { + atomic_inc(&osc->oo_nr_ios); + oio->oi_is_active = 1; + rc = 0; + } + spin_unlock(&imp->imp_lock); + + if (capable(CAP_SYS_RESOURCE)) + oio->oi_cap_sys_resource = 1; + + RETURN(rc); +} +EXPORT_SYMBOL(osc_io_iter_init); + +void osc_io_iter_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct osc_io *oio = osc_env_io(env); + + if (oio->oi_is_active) { + struct osc_object *osc = cl2osc(ios->cis_obj); + + oio->oi_is_active = 0; + LASSERT(atomic_read(&osc->oo_nr_ios) > 0); + if (atomic_dec_and_test(&osc->oo_nr_ios)) + wake_up(&osc->oo_io_waitq); + } +} +EXPORT_SYMBOL(osc_io_iter_fini); + +void osc_io_rw_iter_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct osc_io *oio = osc_env_io(env); + struct osc_object *osc = cl2osc(ios->cis_obj); + + if (oio->oi_lru_reserved > 0) { + osc_lru_unreserve(osc_cli(osc), oio->oi_lru_reserved); + oio->oi_lru_reserved = 0; + } + oio->oi_write_osclock = NULL; + + osc_io_iter_fini(env, ios); +} +EXPORT_SYMBOL(osc_io_rw_iter_fini); + +int osc_io_fault_start(const struct lu_env *env, const struct cl_io_slice *ios) +{ + struct cl_io *io; + struct cl_fault_io *fio; + ENTRY; + + io = ios->cis_io; + fio = &io->u.ci_fault; + CDEBUG(D_INFO, "%lu %d %zu\n", + fio->ft_index, fio->ft_writable, fio->ft_nob); + /* + * If mapping is writeable, adjust kms to cover this page, + * but do not extend kms beyond actual file size. + * See bug 10919. + */ + if (fio->ft_writable) + osc_page_touch_at(env, ios->cis_obj, + fio->ft_index, fio->ft_nob); + RETURN(0); +} +EXPORT_SYMBOL(osc_io_fault_start); + + +static int osc_async_upcall(void *a, int rc) +{ + struct osc_async_cbargs *args = a; + + args->opc_rc = rc; + complete(&args->opc_sync); + return 0; +} + +/** + * Checks that there are no pages being written in the extent being truncated. 
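+ *
+ * (Going by the callback below, this check is diagnostic only:
+ * trunc_check_cb() reports offending pages via CL_PAGE_DEBUG()/CDEBUG()
+ * and always returns true, so the truncate itself proceeds regardless.)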
+ */
+static bool trunc_check_cb(const struct lu_env *env, struct cl_io *io,
+			   void **pvec, int count, void *cbdata)
+{
+	int i;
+
+	for (i = 0; i < count; i++) {
+		struct osc_page *ops = pvec[i];
+		struct cl_page *page = ops->ops_cl.cpl_page;
+		struct osc_async_page *oap;
+		__u64 start = *(__u64 *)cbdata;
+
+		oap = &ops->ops_oap;
+		if (oap->oap_cmd & OBD_BRW_WRITE &&
+		    !list_empty(&oap->oap_pending_item))
+			CL_PAGE_DEBUG(D_ERROR, env, page, "exists %llu/%s.\n",
+				      start, current->comm);
+
+		if (PageLocked(page->cp_vmpage))
+			CDEBUG(D_CACHE, "page %p index %lu locked for %d.\n",
+			       ops, osc_index(ops),
+			       oap->oap_cmd & OBD_BRW_RWMASK);
+	}
+	return true;
+}
+
+static void osc_trunc_check(const struct lu_env *env, struct cl_io *io,
+			    struct osc_io *oio, __u64 size)
+{
+	struct cl_object *clob;
+	int partial;
+	pgoff_t start;
+
+	clob = oio->oi_cl.cis_obj;
+	start = cl_index(clob, size);
+	partial = cl_offset(clob, start) < size;
+
+	/*
+	 * Complain if there are pages in the truncated region.
+	 */
+	osc_page_gang_lookup(env, io, cl2osc(clob),
+			     start + partial, CL_PAGE_EOF,
+			     trunc_check_cb, (void *)&size);
+}
+
+/**
+ * Flush affected pages prior to punch.
+ * We shouldn't discard them locally first because that could mean data loss
+ * if the server doesn't support fallocate punch; the data also needs to be
+ * flushed first to prevent re-ordering with the punch.
+ */
+int osc_punch_start(const struct lu_env *env, struct cl_io *io,
+		    struct cl_object *obj)
+{
+	struct osc_object *osc = cl2osc(obj);
+	pgoff_t pg_start = cl_index(obj, io->u.ci_setattr.sa_falloc_offset);
+	pgoff_t pg_end = cl_index(obj, io->u.ci_setattr.sa_falloc_end - 1);
+	int rc;
+
+	ENTRY;
+	rc = osc_cache_writeback_range(env, osc, pg_start, pg_end, 1, 0);
+	if (rc < 0)
+		RETURN(rc);
+
+	osc_page_gang_lookup(env, io, osc, pg_start, pg_end, osc_discard_cb,
+			     osc);
+	RETURN(0);
+}
+EXPORT_SYMBOL(osc_punch_start);
+
+static int osc_io_setattr_start(const struct lu_env *env,
+				const struct cl_io_slice *slice)
+{
+	struct cl_io *io = slice->cis_io;
+	struct osc_io *oio = cl2osc_io(env, slice);
+	struct cl_object *obj = slice->cis_obj;
+	struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo;
+	struct cl_attr *attr = &osc_env_info(env)->oti_attr;
+	struct obdo *oa = &oio->oi_oa;
+	struct osc_async_cbargs *cbargs = &oio->oi_cbarg;
+	unsigned int ia_avalid = io->u.ci_setattr.sa_avalid;
+	enum op_xvalid ia_xvalid = io->u.ci_setattr.sa_xvalid;
+	int result = 0;
+	__u64 size = io->u.ci_setattr.sa_attr.lvb_size;
+	bool io_is_falloc = cl_io_is_fallocate(io);
+
+	ENTRY;
+	/* truncate cache dirty pages first */
+	if (cl_io_is_trunc(io))
+		result = osc_cache_truncate_start(env, cl2osc(obj), size,
+						  &oio->oi_trunc);
+	/* flush local pages prior to punching them on the server */
+	if (io_is_falloc &&
+	    io->u.ci_setattr.sa_falloc_mode & FALLOC_FL_PUNCH_HOLE)
+		result = osc_punch_start(env, io, obj);
+
+	if (result == 0 && oio->oi_lockless == 0) {
+		cl_object_attr_lock(obj);
+		result = cl_object_attr_get(env, obj, attr);
+		if (result == 0) {
+			struct ost_lvb *lvb = &io->u.ci_setattr.sa_attr;
+			unsigned int cl_valid = 0;
+
+			if (ia_avalid & ATTR_SIZE) {
+				attr->cat_size = size;
+				attr->cat_kms = size;
+				cl_valid = (CAT_SIZE | CAT_KMS);
+			}
+			if (ia_avalid & ATTR_MTIME_SET) {
+				attr->cat_mtime = lvb->lvb_mtime;
+				cl_valid |= CAT_MTIME;
+			}
+			if (ia_avalid & ATTR_ATIME_SET) {
+				attr->cat_atime = lvb->lvb_atime;
+				cl_valid |= CAT_ATIME;
+			}
+			if (ia_xvalid & OP_XVALID_CTIME_SET) {
+				attr->cat_ctime = lvb->lvb_ctime;
+				cl_valid |= CAT_CTIME;
+			}
+			result =
cl_object_attr_update(env, obj, attr, + cl_valid); + } + cl_object_attr_unlock(obj); + } + memset(oa, 0, sizeof(*oa)); + if (result == 0) { + oa->o_oi = loi->loi_oi; + obdo_set_parent_fid(oa, io->u.ci_setattr.sa_parent_fid); + oa->o_stripe_idx = io->u.ci_setattr.sa_stripe_index; + oa->o_layout = io->u.ci_setattr.sa_layout; + oa->o_valid |= OBD_MD_FLID | OBD_MD_FLGROUP | + OBD_MD_FLOSTLAYOUT; + if (ia_avalid & ATTR_CTIME) { + oa->o_valid |= OBD_MD_FLCTIME; + oa->o_ctime = attr->cat_ctime; + } + if (ia_avalid & ATTR_ATIME) { + oa->o_valid |= OBD_MD_FLATIME; + oa->o_atime = attr->cat_atime; + } + if (ia_avalid & ATTR_MTIME) { + oa->o_valid |= OBD_MD_FLMTIME; + oa->o_mtime = attr->cat_mtime; + } + + if (ia_avalid & ATTR_SIZE || io_is_falloc) { + if (oio->oi_lockless) { + oa->o_flags = OBD_FL_SRVLOCK; + oa->o_valid |= OBD_MD_FLFLAGS; + } + + if (io->ci_layout_version > 0) { + /* verify layout version */ + oa->o_valid |= OBD_MD_LAYOUT_VERSION; + oa->o_layout_version = io->ci_layout_version; + } + } else { + LASSERT(oio->oi_lockless == 0); + } + + if (ia_xvalid & OP_XVALID_FLAGS) { + oa->o_flags = io->u.ci_setattr.sa_attr_flags; + oa->o_valid |= OBD_MD_FLFLAGS; + } + + init_completion(&cbargs->opc_sync); + + if (io_is_falloc) { + int falloc_mode = io->u.ci_setattr.sa_falloc_mode; + + oa->o_size = io->u.ci_setattr.sa_falloc_offset; + oa->o_blocks = io->u.ci_setattr.sa_falloc_end; + oa->o_uid = io->u.ci_setattr.sa_falloc_uid; + oa->o_gid = io->u.ci_setattr.sa_falloc_gid; + oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS | + OBD_MD_FLUID | OBD_MD_FLGID; + + CDEBUG(D_INODE, "size %llu blocks %llu uid %u gid %u\n", + oa->o_size, oa->o_blocks, oa->o_uid, oa->o_gid); + result = osc_fallocate_base(osc_export(cl2osc(obj)), + oa, osc_async_upcall, + cbargs, falloc_mode); + } else if (ia_avalid & ATTR_SIZE) { + oa->o_size = size; + oa->o_blocks = OBD_OBJECT_EOF; + oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS; + result = osc_punch_send(osc_export(cl2osc(obj)), + oa, osc_async_upcall, cbargs); + } else { + result = osc_setattr_async(osc_export(cl2osc(obj)), + oa, osc_async_upcall, + cbargs, PTLRPCD_SET); + } + cbargs->opc_rpc_sent = result == 0; + } + + RETURN(result); +} + +void osc_io_setattr_end(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct osc_io *oio = cl2osc_io(env, slice); + struct cl_object *obj = slice->cis_obj; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + struct obdo *oa = &oio->oi_oa; + unsigned int cl_valid = 0; + int result = 0; + + if (cbargs->opc_rpc_sent) { + wait_for_completion(&cbargs->opc_sync); + result = io->ci_result = cbargs->opc_rc; + } + + if (cl_io_is_trunc(io)) { + __u64 size = io->u.ci_setattr.sa_attr.lvb_size; + + if (result == 0) { + cl_object_attr_lock(obj); + if (oa->o_valid & OBD_MD_FLBLOCKS) { + attr->cat_blocks = oa->o_blocks; + cl_valid |= CAT_BLOCKS; + } + + cl_object_attr_update(env, obj, attr, cl_valid); + cl_object_attr_unlock(obj); + } + osc_trunc_check(env, io, oio, size); + osc_cache_truncate_end(env, oio->oi_trunc); + oio->oi_trunc = NULL; + } + + if (cl_io_is_fallocate(io)) { + if (result == 0) { + cl_object_attr_lock(obj); + /* update blocks */ + if (oa->o_valid & OBD_MD_FLBLOCKS) { + attr->cat_blocks = oa->o_blocks; + cl_valid |= CAT_BLOCKS; + } + + cl_object_attr_update(env, obj, attr, cl_valid); + cl_object_attr_unlock(obj); + } + } +} +EXPORT_SYMBOL(osc_io_setattr_end); + +struct osc_data_version_args { + struct osc_io *dva_oio; +}; + 
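+/*
+ * A minimal sketch of the completion-based pattern that the setattr,
+ * fallocate, punch and sync paths in this file share: the start hook
+ * fires an async RPC with osc_async_upcall() as its callback, and the
+ * matching end hook reaps the status, e.g.:
+ *
+ *	init_completion(&cbargs->opc_sync);
+ *	rc = osc_setattr_async(osc_export(cl2osc(obj)), oa,
+ *			       osc_async_upcall, cbargs, PTLRPCD_SET);
+ *	cbargs->opc_rpc_sent = rc == 0;
+ *	...
+ *	if (cbargs->opc_rpc_sent) {
+ *		wait_for_completion(&cbargs->opc_sync);
+ *		rc = cbargs->opc_rc;
+ *	}
+ */
+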
+static int +osc_data_version_interpret(const struct lu_env *env, struct ptlrpc_request *req, + void *args, int rc) +{ + struct osc_data_version_args *dva = args; + struct osc_io *oio = dva->dva_oio; + const struct ost_body *body; + + ENTRY; + if (rc < 0) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, &oio->oi_oa, + &body->oa); + EXIT; +out: + oio->oi_cbarg.opc_rc = rc; + complete(&oio->oi_cbarg.opc_sync); + + return 0; +} + +static int osc_io_data_version_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_data_version_io *dv = &slice->cis_io->u.ci_data_version; + struct osc_io *oio = cl2osc_io(env, slice); + struct obdo *oa = &oio->oi_oa; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + struct osc_object *obj = cl2osc(slice->cis_obj); + struct lov_oinfo *loi = obj->oo_oinfo; + struct obd_export *exp = osc_export(obj); + struct ptlrpc_request *req; + struct ost_body *body; + struct osc_data_version_args *dva; + int rc; + + ENTRY; + memset(oa, 0, sizeof(*oa)); + oa->o_oi = loi->loi_oi; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; + + if (dv->dv_flags & (LL_DV_RD_FLUSH | LL_DV_WR_FLUSH)) { + oa->o_valid |= OBD_MD_FLFLAGS; + oa->o_flags |= OBD_FL_SRVLOCK; + if (dv->dv_flags & LL_DV_WR_FLUSH) + oa->o_flags |= OBD_FL_FLUSH; + } + + init_completion(&cbargs->opc_sync); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_GETATTR); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GETATTR); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); + + ptlrpc_request_set_replen(req); + req->rq_interpret_reply = osc_data_version_interpret; + dva = ptlrpc_req_async_args(dva, req); + dva->dva_oio = oio; + + ptlrpcd_add_req(req); + + RETURN(0); +} + +static void osc_io_data_version_end(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_data_version_io *dv = &slice->cis_io->u.ci_data_version; + struct osc_io *oio = cl2osc_io(env, slice); + struct cl_object *obj = slice->cis_obj; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + struct obdo *oa = &oio->oi_oa; + unsigned int cl_valid = 0; + + ENTRY; + wait_for_completion(&cbargs->opc_sync); + + if (cbargs->opc_rc != 0) { + slice->cis_io->ci_result = cbargs->opc_rc; + } else { + slice->cis_io->ci_result = 0; + if (!(oa->o_valid & + (OBD_MD_LAYOUT_VERSION | OBD_MD_FLDATAVERSION))) + slice->cis_io->ci_result = -ENOTSUPP; + + if (oa->o_valid & OBD_MD_LAYOUT_VERSION) + dv->dv_layout_version = oa->o_layout_version; + if (oa->o_valid & OBD_MD_FLDATAVERSION) + dv->dv_data_version = oa->o_data_version; + + if (dv->dv_flags & LL_DV_SZ_UPDATE) { + if (oa->o_valid & OBD_MD_FLSIZE) { + attr->cat_size = oa->o_size; + cl_valid |= CAT_SIZE; + } + + if (oa->o_valid & OBD_MD_FLBLOCKS) { + attr->cat_blocks = oa->o_blocks; + cl_valid |= CAT_BLOCKS; + } + + cl_object_attr_lock(obj); + cl_object_attr_update(env, obj, attr, cl_valid); + cl_object_attr_unlock(obj); + } + } + + EXIT; +} + +int osc_io_read_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_object *obj = slice->cis_obj; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + int rc = 0; + ENTRY; + + if (!slice->cis_io->ci_noatime) 
{
+		cl_object_attr_lock(obj);
+		attr->cat_atime = ktime_get_real_seconds();
+		rc = cl_object_attr_update(env, obj, attr, CAT_ATIME);
+		cl_object_attr_unlock(obj);
+	}
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(osc_io_read_start);
+
+int osc_io_write_start(const struct lu_env *env,
+		       const struct cl_io_slice *slice)
+{
+	struct cl_object *obj = slice->cis_obj;
+	struct cl_attr *attr = &osc_env_info(env)->oti_attr;
+	int rc = 0;
+	ENTRY;
+
+	OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DELAY_SETTIME, 1);
+	cl_object_attr_lock(obj);
+	attr->cat_mtime = attr->cat_ctime = ktime_get_real_seconds();
+	rc = cl_object_attr_update(env, obj, attr, CAT_MTIME | CAT_CTIME);
+	cl_object_attr_unlock(obj);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(osc_io_write_start);
+
+int osc_fsync_ost(const struct lu_env *env, struct osc_object *obj,
+		  struct cl_fsync_io *fio)
+{
+	struct osc_io *oio = osc_env_io(env);
+	struct obdo *oa = &oio->oi_oa;
+	struct lov_oinfo *loi = obj->oo_oinfo;
+	struct osc_async_cbargs *cbargs = &oio->oi_cbarg;
+	int rc = 0;
+	ENTRY;
+
+	memset(oa, 0, sizeof(*oa));
+	oa->o_oi = loi->loi_oi;
+	oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
+
+	/* reload size and blocks for start and end of sync range */
+	oa->o_size = fio->fi_start;
+	oa->o_blocks = fio->fi_end;
+	oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
+
+	obdo_set_parent_fid(oa, fio->fi_fid);
+
+	init_completion(&cbargs->opc_sync);
+
+	rc = osc_sync_base(obj, oa, osc_async_upcall, cbargs, PTLRPCD_SET);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(osc_fsync_ost);
+
+int osc_io_fsync_start(const struct lu_env *env,
+		       const struct cl_io_slice *slice)
+{
+	struct cl_io *io = slice->cis_io;
+	struct cl_fsync_io *fio = &io->u.ci_fsync;
+	struct cl_object *obj = slice->cis_obj;
+	struct osc_object *osc = cl2osc(obj);
+	pgoff_t start = cl_index(obj, fio->fi_start);
+	pgoff_t end = cl_index(obj, fio->fi_end);
+	int result = 0;
+	ENTRY;
+
+	if (fio->fi_end == OBD_OBJECT_EOF)
+		end = CL_PAGE_EOF;
+
+	result = osc_cache_writeback_range(env, osc, start, end, 0,
+					   fio->fi_mode == CL_FSYNC_DISCARD);
+	if (result > 0) {
+		fio->fi_nr_written += result;
+		result = 0;
+	}
+	if (fio->fi_mode == CL_FSYNC_ALL) {
+		int rc;
+
+		/* we have to wait for writeback to finish before we can
+		 * send the OST_SYNC RPC. This is bad because it causes
+		 * extents to be written osc by osc. However, we usually
+		 * start writeback before CL_FSYNC_ALL so this won't have
+		 * any real problem.
*/ + rc = osc_cache_wait_range(env, osc, start, end); + if (result == 0) + result = rc; + rc = osc_fsync_ost(env, osc, fio); + if (result == 0) + result = rc; + } + + RETURN(result); +} + +void osc_io_fsync_end(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_fsync_io *fio = &slice->cis_io->u.ci_fsync; + struct cl_object *obj = slice->cis_obj; + pgoff_t start = cl_index(obj, fio->fi_start); + pgoff_t end = cl_index(obj, fio->fi_end); + int result = 0; + + if (fio->fi_mode == CL_FSYNC_LOCAL) { + result = osc_cache_wait_range(env, cl2osc(obj), start, end); + } else if (fio->fi_mode == CL_FSYNC_ALL) { + struct osc_io *oio = cl2osc_io(env, slice); + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + + wait_for_completion(&cbargs->opc_sync); + if (result == 0) + result = cbargs->opc_rc; + } + slice->cis_io->ci_result = result; +} +EXPORT_SYMBOL(osc_io_fsync_end); + +static int osc_io_ladvise_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + int result = 0; + struct cl_io *io = slice->cis_io; + struct osc_io *oio = cl2osc_io(env, slice); + struct cl_object *obj = slice->cis_obj; + struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; + struct cl_ladvise_io *lio = &io->u.ci_ladvise; + struct obdo *oa = &oio->oi_oa; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + struct lu_ladvise *ladvise; + struct ladvise_hdr *ladvise_hdr; + int buf_size; + int num_advise = 1; + ENTRY; + + /* TODO: add multiple ladvise support in CLIO */ + buf_size = offsetof(typeof(*ladvise_hdr), lah_advise[num_advise]); + if (osc_env_info(env)->oti_ladvise_buf.lb_len < buf_size) + lu_buf_realloc(&osc_env_info(env)->oti_ladvise_buf, buf_size); + + ladvise_hdr = osc_env_info(env)->oti_ladvise_buf.lb_buf; + if (ladvise_hdr == NULL) + RETURN(-ENOMEM); + + memset(ladvise_hdr, 0, buf_size); + ladvise_hdr->lah_magic = LADVISE_MAGIC; + ladvise_hdr->lah_count = num_advise; + ladvise_hdr->lah_flags = lio->li_flags; + + memset(oa, 0, sizeof(*oa)); + oa->o_oi = loi->loi_oi; + oa->o_valid = OBD_MD_FLID; + obdo_set_parent_fid(oa, lio->li_fid); + + ladvise = ladvise_hdr->lah_advise; + ladvise->lla_start = lio->li_start; + ladvise->lla_end = lio->li_end; + ladvise->lla_advice = lio->li_advice; + + if (lio->li_flags & LF_ASYNC) { + result = osc_ladvise_base(osc_export(cl2osc(obj)), oa, + ladvise_hdr, NULL, NULL, NULL); + } else { + init_completion(&cbargs->opc_sync); + result = osc_ladvise_base(osc_export(cl2osc(obj)), oa, + ladvise_hdr, osc_async_upcall, + cbargs, PTLRPCD_SET); + cbargs->opc_rpc_sent = result == 0; + } + RETURN(result); +} + +static void osc_io_ladvise_end(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct osc_io *oio = cl2osc_io(env, slice); + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + int result = 0; + struct cl_ladvise_io *lio = &io->u.ci_ladvise; + + if ((!(lio->li_flags & LF_ASYNC)) && cbargs->opc_rpc_sent) { + wait_for_completion(&cbargs->opc_sync); + result = cbargs->opc_rc; + } + slice->cis_io->ci_result = result; +} + +void osc_io_end(const struct lu_env *env, const struct cl_io_slice *slice) +{ + struct osc_io *oio = cl2osc_io(env, slice); + + if (oio->oi_active) { + osc_extent_release(env, oio->oi_active); + oio->oi_active = NULL; + } +} +EXPORT_SYMBOL(osc_io_end); + +struct osc_lseek_args { + struct osc_io *lsa_oio; +}; + +static int osc_lseek_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *arg, int rc) +{ + struct ost_body *reply; + struct osc_lseek_args *lsa = arg; + 
struct osc_io *oio = lsa->lsa_oio; + struct cl_io *io = oio->oi_cl.cis_io; + struct cl_lseek_io *lsio = &io->u.ci_lseek; + + ENTRY; + + if (rc != 0) + GOTO(out, rc); + + reply = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (reply == NULL) + GOTO(out, rc = -EPROTO); + + lsio->ls_result = reply->oa.o_size; +out: + osc_async_upcall(&oio->oi_cbarg, rc); + RETURN(rc); +} + +int osc_io_lseek_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct osc_io *oio = cl2osc_io(env, slice); + struct cl_object *obj = slice->cis_obj; + struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; + struct cl_lseek_io *lsio = &io->u.ci_lseek; + struct obdo *oa = &oio->oi_oa; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + struct obd_export *exp = osc_export(cl2osc(obj)); + struct ptlrpc_request *req; + struct ost_body *body; + struct osc_lseek_args *lsa; + int rc = 0; + + ENTRY; + + /* No negative values at this point */ + LASSERT(lsio->ls_start >= 0); + LASSERT(lsio->ls_whence == SEEK_HOLE || lsio->ls_whence == SEEK_DATA); + + /* with IO lock taken we have object size in LVB and can check + * boundaries prior sending LSEEK RPC + */ + if (lsio->ls_start >= loi->loi_lvb.lvb_size) { + /* consider area beyond end of object as hole */ + if (lsio->ls_whence == SEEK_HOLE) + lsio->ls_result = lsio->ls_start; + else + lsio->ls_result = -ENXIO; + RETURN(0); + } + + /* if LSEEK RPC is not supported by server, consider whole stripe + * object is data with hole after end of object + */ + if (!exp_connect_lseek(exp)) { + if (lsio->ls_whence == SEEK_HOLE) + lsio->ls_result = loi->loi_lvb.lvb_size; + else + lsio->ls_result = lsio->ls_start; + RETURN(0); + } + + memset(oa, 0, sizeof(*oa)); + oa->o_oi = loi->loi_oi; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; + oa->o_size = lsio->ls_start; + oa->o_mode = lsio->ls_whence; + if (oio->oi_lockless) { + oa->o_flags = OBD_FL_SRVLOCK; + oa->o_valid |= OBD_MD_FLFLAGS; + } + + init_completion(&cbargs->opc_sync); + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SEEK); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SEEK); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); + ptlrpc_request_set_replen(req); + req->rq_interpret_reply = osc_lseek_interpret; + lsa = ptlrpc_req_async_args(lsa, req); + lsa->lsa_oio = oio; + + ptlrpcd_add_req(req); + cbargs->opc_rpc_sent = 1; + + RETURN(0); +} +EXPORT_SYMBOL(osc_io_lseek_start); + +void osc_io_lseek_end(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct osc_io *oio = cl2osc_io(env, slice); + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + int rc = 0; + + if (cbargs->opc_rpc_sent) { + wait_for_completion(&cbargs->opc_sync); + rc = cbargs->opc_rc; + } + slice->cis_io->ci_result = rc; +} +EXPORT_SYMBOL(osc_io_lseek_end); + +int osc_io_lru_reserve(const struct lu_env *env, + const struct cl_io_slice *ios, + loff_t pos, size_t bytes) +{ + struct osc_object *osc = cl2osc(ios->cis_obj); + struct osc_io *oio = osc_env_io(env); + unsigned long npages = 0; + size_t page_offset; + + ENTRY; + + page_offset = pos & ~PAGE_MASK; + if (page_offset) { + ++npages; + if (bytes > PAGE_SIZE - page_offset) + bytes -= (PAGE_SIZE - page_offset); + else + bytes = 0; + } + npages += (bytes + PAGE_SIZE - 1) >> PAGE_SHIFT; + oio->oi_lru_reserved = 
osc_lru_reserve(osc_cli(osc), npages); + + RETURN(0); +} +EXPORT_SYMBOL(osc_io_lru_reserve); + +static const struct cl_io_operations osc_io_ops = { + .op = { + [CIT_READ] = { + .cio_iter_init = osc_io_iter_init, + .cio_iter_fini = osc_io_rw_iter_fini, + .cio_start = osc_io_read_start, + .cio_fini = osc_io_fini + }, + [CIT_WRITE] = { + .cio_iter_init = osc_io_iter_init, + .cio_iter_fini = osc_io_rw_iter_fini, + .cio_start = osc_io_write_start, + .cio_end = osc_io_end, + .cio_fini = osc_io_fini + }, + [CIT_SETATTR] = { + .cio_iter_init = osc_io_iter_init, + .cio_iter_fini = osc_io_iter_fini, + .cio_start = osc_io_setattr_start, + .cio_end = osc_io_setattr_end + }, + [CIT_DATA_VERSION] = { + .cio_start = osc_io_data_version_start, + .cio_end = osc_io_data_version_end, + }, + [CIT_FAULT] = { + .cio_iter_init = osc_io_iter_init, + .cio_iter_fini = osc_io_iter_fini, + .cio_start = osc_io_fault_start, + .cio_end = osc_io_end, + .cio_fini = osc_io_fini + }, + [CIT_FSYNC] = { + .cio_start = osc_io_fsync_start, + .cio_end = osc_io_fsync_end, + .cio_fini = osc_io_fini + }, + [CIT_LADVISE] = { + .cio_start = osc_io_ladvise_start, + .cio_end = osc_io_ladvise_end, + .cio_fini = osc_io_fini + }, + [CIT_LSEEK] = { + .cio_start = osc_io_lseek_start, + .cio_end = osc_io_lseek_end, + .cio_fini = osc_io_fini + }, + [CIT_MISC] = { + .cio_fini = osc_io_fini + } + }, + .cio_read_ahead = osc_io_read_ahead, + .cio_lru_reserve = osc_io_lru_reserve, + .cio_submit = osc_io_submit, + .cio_commit_async = osc_io_commit_async, + .cio_extent_release = osc_io_extent_release +}; + +/***************************************************************************** + * + * Transfer operations. + * + */ + +int osc_io_init(const struct lu_env *env, + struct cl_object *obj, struct cl_io *io) +{ + struct osc_io *oio = osc_env_io(env); + + CL_IO_SLICE_CLEAN(oio, oi_cl); + cl_io_slice_add(io, &oio->oi_cl, obj, &osc_io_ops); + return 0; +} + +/** @} osc */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_lock.c b/drivers/staging/lustrefsx/lustre/osc/osc_lock.c new file mode 100644 index 0000000000000..dbf8cde90317f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/osc_lock.c @@ -0,0 +1,1300 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Implementation of cl_lock for OSC layer. 
+ * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_OSC + +/* fid_build_reg_res_name() */ +#include +#include + +#include "osc_internal.h" + +/** \addtogroup osc + * @{ + */ + +/** + * Returns a weak pointer to the ldlm lock identified by a handle. Returned + * pointer cannot be dereferenced, as lock is not protected from concurrent + * reclaim. This function is a helper for osc_lock_invariant(). + */ +static struct ldlm_lock *osc_handle_ptr(struct lustre_handle *handle) +{ + struct ldlm_lock *lock; + + lock = ldlm_handle2lock(handle); + if (lock != NULL) + LDLM_LOCK_PUT(lock); + return lock; +} + +/** + * Invariant that has to be true all of the time. + */ +static int osc_lock_invariant(struct osc_lock *ols) +{ + struct ldlm_lock *lock = osc_handle_ptr(&ols->ols_handle); + struct ldlm_lock *olock = ols->ols_dlmlock; + int handle_used = lustre_handle_is_used(&ols->ols_handle); + + if (ergo(osc_lock_is_lockless(ols), + ols->ols_locklessable && ols->ols_dlmlock == NULL)) + return 1; + + /* + * If all the following "ergo"s are true, return 1, otherwise 0 + */ + if (! ergo(olock != NULL, handle_used)) + return 0; + + if (! ergo(olock != NULL, + olock->l_handle.h_cookie == ols->ols_handle.cookie)) + return 0; + + if (! ergo(handle_used, + ergo(lock != NULL && olock != NULL, lock == olock) && + ergo(lock == NULL, olock == NULL))) + return 0; + /* + * Check that ->ols_handle and ->ols_dlmlock are consistent, but + * take into account that they are set at the different time. + */ + if (! ergo(ols->ols_state == OLS_CANCELLED, + olock == NULL && !handle_used)) + return 0; + /* + * DLM lock is destroyed only after we have seen cancellation + * ast. + */ + if (! ergo(olock != NULL && ols->ols_state < OLS_CANCELLED, + !ldlm_is_destroyed(olock))) + return 0; + + if (! ergo(ols->ols_state == OLS_GRANTED, + olock != NULL && + ldlm_is_granted(olock) && + ols->ols_hold)) + return 0; + return 1; +} + +/***************************************************************************** + * + * Lock operations. + * + */ + +void osc_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice) +{ + struct osc_lock *ols = cl2osc_lock(slice); + + LINVRNT(osc_lock_invariant(ols)); + LASSERT(ols->ols_dlmlock == NULL); + + OBD_SLAB_FREE_PTR(ols, osc_lock_kmem); +} +EXPORT_SYMBOL(osc_lock_fini); + +static void osc_lock_build_policy(const struct lu_env *env, + const struct cl_lock *lock, + union ldlm_policy_data *policy) +{ + const struct cl_lock_descr *d = &lock->cll_descr; + + osc_index2policy(policy, d->cld_obj, d->cld_start, d->cld_end); + policy->l_extent.gid = d->cld_gid; +} + +/** + * Updates object attributes from a lock value block (lvb) received together + * with the DLM lock reply from the server. Copy of osc_update_enqueue() + * logic. + * + * Called under lock and resource spin-locks. 
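+ *
+ * Worked example (illustrative numbers): for a granted lock on the
+ * extent [0, 2MB-1] whose LVB reports lvb_size = 10MB, the KMS can be
+ * extended to at most l_extent.end + 1 = 2MB, because the client may
+ * only trust sizes inside the range its lock covers.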
+ */ +void osc_lock_lvb_update(const struct lu_env *env, + struct osc_object *osc, + struct ldlm_lock *dlmlock, + struct ost_lvb *lvb) +{ + struct cl_object *obj = osc2cl(osc); + struct lov_oinfo *oinfo = osc->oo_oinfo; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + unsigned valid, setkms = 0; + + ENTRY; + + valid = CAT_BLOCKS | CAT_ATIME | CAT_CTIME | CAT_MTIME | CAT_SIZE; + if (lvb == NULL) { + LASSERT(dlmlock != NULL); + lvb = dlmlock->l_lvb_data; + } + cl_lvb2attr(attr, lvb); + + cl_object_attr_lock(obj); + if (dlmlock != NULL) { + __u64 size; + + check_res_locked(dlmlock->l_resource); + + LASSERT(lvb == dlmlock->l_lvb_data); + size = lvb->lvb_size; + + /* Extend KMS up to the end of this lock and no further + * A lock on [x,y] means a KMS of up to y + 1 bytes! */ + if (size > dlmlock->l_policy_data.l_extent.end) + size = dlmlock->l_policy_data.l_extent.end + 1; + if (size >= oinfo->loi_kms) { + valid |= CAT_KMS; + attr->cat_kms = size; + setkms = 1; + } + ldlm_lock_allow_match_locked(dlmlock); + } + + /* The size should not be less than the kms */ + if (attr->cat_size < oinfo->loi_kms) + attr->cat_size = oinfo->loi_kms; + + LDLM_DEBUG(dlmlock, "acquired size %llu, setting rss=%llu;%s " + "kms=%llu, end=%llu", lvb->lvb_size, attr->cat_size, + setkms ? "" : " leaving", + setkms ? attr->cat_kms : oinfo->loi_kms, + dlmlock ? dlmlock->l_policy_data.l_extent.end : -1ull); + + cl_object_attr_update(env, obj, attr, valid); + cl_object_attr_unlock(obj); + + EXIT; +} + +static void osc_lock_granted(const struct lu_env *env, struct osc_lock *oscl, + struct lustre_handle *lockh) +{ + struct osc_object *osc = cl2osc(oscl->ols_cl.cls_obj); + struct ldlm_lock *dlmlock; + + dlmlock = ldlm_handle2lock_long(lockh, 0); + LASSERT(dlmlock != NULL); + + /* lock reference taken by ldlm_handle2lock_long() is + * owned by osc_lock and released in osc_lock_detach() + */ + lu_ref_add_atomic(&dlmlock->l_reference, "osc_lock", oscl); + oscl->ols_has_ref = 1; + + LASSERT(oscl->ols_dlmlock == NULL); + oscl->ols_dlmlock = dlmlock; + + /* This may be a matched lock for glimpse request, do not hold + * lock reference in that case. */ + if (!oscl->ols_glimpse) { + /* hold a refc for non glimpse lock which will + * be released in osc_lock_cancel() */ + lustre_handle_copy(&oscl->ols_handle, lockh); + ldlm_lock_addref(lockh, oscl->ols_einfo.ei_mode); + oscl->ols_hold = 1; + } + + /* Lock must have been granted. */ + lock_res_and_lock(dlmlock); + if (ldlm_is_granted(dlmlock)) { + struct ldlm_extent *ext = &dlmlock->l_policy_data.l_extent; + struct cl_lock_descr *descr = &oscl->ols_cl.cls_lock->cll_descr; + + /* extend the lock extent, otherwise it will have problem when + * we decide whether to grant a lockless lock. */ + descr->cld_mode = osc_ldlm2cl_lock(dlmlock->l_granted_mode); + descr->cld_start = cl_index(descr->cld_obj, ext->start); + descr->cld_end = cl_index(descr->cld_obj, ext->end); + descr->cld_gid = ext->gid; + + /* no lvb update for matched lock */ + if (!ldlm_is_lvb_cached(dlmlock)) { + LASSERT(oscl->ols_flags & LDLM_FL_LVB_READY); + LASSERT(osc == dlmlock->l_ast_data); + osc_lock_lvb_update(env, osc, dlmlock, NULL); + ldlm_set_lvb_cached(dlmlock); + } + LINVRNT(osc_lock_invariant(oscl)); + } + unlock_res_and_lock(dlmlock); + + LASSERT(oscl->ols_state != OLS_GRANTED); + oscl->ols_state = OLS_GRANTED; +} + +/** + * Lock upcall function that is executed either when a reply to ENQUEUE rpc is + * received from a server, or after osc_enqueue_base() matched a local DLM + * lock. 
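+ *
+ * State transitions handled below: OLS_ENQUEUED advances to
+ * OLS_UPCALL_RECEIVED (and, on success, to OLS_GRANTED via
+ * osc_lock_granted()); an upcall against an OLS_CANCELLED lock turns
+ * into -EIO; any other state is a bug.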
+ */ +static int osc_lock_upcall(void *cookie, struct lustre_handle *lockh, + int errcode) +{ + struct osc_lock *oscl = cookie; + struct cl_lock_slice *slice = &oscl->ols_cl; + struct lu_env *env; + int rc; + + ENTRY; + + env = cl_env_percpu_get(); + /* should never happen, similar to osc_ldlm_blocking_ast(). */ + LASSERT(!IS_ERR(env)); + + rc = ldlm_error2errno(errcode); + if (oscl->ols_state == OLS_ENQUEUED) { + oscl->ols_state = OLS_UPCALL_RECEIVED; + } else if (oscl->ols_state == OLS_CANCELLED) { + rc = -EIO; + } else { + CERROR("Impossible state: %d\n", oscl->ols_state); + LBUG(); + } + + if (rc == 0) + osc_lock_granted(env, oscl, lockh); + + /* Error handling, some errors are tolerable. */ + if (oscl->ols_glimpse && rc == -ENAVAIL) { + LASSERT(oscl->ols_flags & LDLM_FL_LVB_READY); + osc_lock_lvb_update(env, cl2osc(slice->cls_obj), + NULL, &oscl->ols_lvb); + /* Hide the error. */ + rc = 0; + } else if (rc < 0 && oscl->ols_flags & LDLM_FL_NDELAY) { + rc = -EAGAIN; + } + + if (oscl->ols_owner != NULL) + cl_sync_io_note(env, oscl->ols_owner, rc); + cl_env_percpu_put(env); + + RETURN(rc); +} + +static int osc_lock_upcall_speculative(void *cookie, + struct lustre_handle *lockh, + int errcode) +{ + struct osc_object *osc = cookie; + struct ldlm_lock *dlmlock; + struct lu_env *env; + __u16 refcheck; + ENTRY; + + env = cl_env_get(&refcheck); + LASSERT(!IS_ERR(env)); + + if (errcode == ELDLM_LOCK_MATCHED) + GOTO(out, errcode = ELDLM_OK); + + if (errcode != ELDLM_OK) + GOTO(out, errcode); + + dlmlock = ldlm_handle2lock(lockh); + LASSERT(dlmlock != NULL); + + lock_res_and_lock(dlmlock); + LASSERT(ldlm_is_granted(dlmlock)); + + /* there is no osc_lock associated with speculative locks + * thus no need to set LDLM_FL_LVB_CACHED */ + osc_lock_lvb_update(env, osc, dlmlock, NULL); + + unlock_res_and_lock(dlmlock); + LDLM_LOCK_PUT(dlmlock); + +out: + cl_object_put(env, osc2cl(osc)); + cl_env_put(env, &refcheck); + RETURN(ldlm_error2errno(errcode)); +} + +static int osc_lock_flush(struct osc_object *obj, pgoff_t start, pgoff_t end, + enum cl_lock_mode mode, bool discard) +{ + struct lu_env *env; + __u16 refcheck; + int rc = 0; + int rc2 = 0; + + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + if (mode == CLM_WRITE) { + rc = osc_cache_writeback_range(env, obj, start, end, 1, + discard); + CDEBUG(D_CACHE, "object %p: [%lu -> %lu] %d pages were %s.\n", + obj, start, end, rc, + discard ? "discarded" : "written back"); + if (rc > 0) + rc = 0; + } + + /* + * Do not try to match other locks with CLM_WRITE since we already + * know there're none + */ + rc2 = osc_lock_discard_pages(env, obj, start, end, + mode == CLM_WRITE || discard); + if (rc == 0 && rc2 < 0) + rc = rc2; + + cl_env_put(env, &refcheck); + RETURN(rc); +} + +/** + * Helper for osc_dlm_blocking_ast() handling discrepancies between cl_lock + * and ldlm_lock caches. 
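+ *
+ * For a lock with attached pages this flushes or discards everything the
+ * lock extent covers, clears l_ast_data, and recomputes the KMS with
+ * ldlm_extent_shift_kms(), since the cancelled lock may have been the
+ * one backing the current value.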
+ */ +static int osc_dlm_blocking_ast0(const struct lu_env *env, + struct ldlm_lock *dlmlock, + void *data, int flag) +{ + struct cl_object *obj = NULL; + int result = 0; + bool discard; + enum cl_lock_mode mode = CLM_READ; + ENTRY; + + LASSERT(flag == LDLM_CB_CANCELING); + + lock_res_and_lock(dlmlock); + if (!ldlm_is_granted(dlmlock)) { + dlmlock->l_ast_data = NULL; + unlock_res_and_lock(dlmlock); + RETURN(0); + } + + discard = ldlm_is_discard_data(dlmlock); + if (dlmlock->l_granted_mode & (LCK_PW | LCK_GROUP)) + mode = CLM_WRITE; + + if (dlmlock->l_ast_data != NULL) { + obj = osc2cl(dlmlock->l_ast_data); + cl_object_get(obj); + } + + unlock_res_and_lock(dlmlock); + + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DELAY_CANCEL, 5); + + /* if l_ast_data is NULL, the dlmlock was enqueued by AGL or + * the object has been destroyed. */ + if (obj != NULL) { + struct ldlm_extent *extent = &dlmlock->l_policy_data.l_extent; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + __u64 old_kms; + + /* Destroy pages covered by the extent of the DLM lock */ + result = osc_lock_flush(cl2osc(obj), + cl_index(obj, extent->start), + cl_index(obj, extent->end), + mode, discard); + + /* losing a lock, update kms */ + lock_res_and_lock(dlmlock); + /* clearing l_ast_data after flushing data, + * to let glimpse ast find the lock and the object */ + dlmlock->l_ast_data = NULL; + cl_object_attr_lock(obj); + /* Must get the value under the lock to avoid race. */ + old_kms = cl2osc(obj)->oo_oinfo->loi_kms; + /* Update the kms. Need to loop all granted locks. + * Not a problem for the client */ + attr->cat_kms = ldlm_extent_shift_kms(dlmlock, old_kms); + + cl_object_attr_update(env, obj, attr, CAT_KMS); + cl_object_attr_unlock(obj); + unlock_res_and_lock(dlmlock); + + cl_object_put(env, obj); + } + RETURN(result); +} + +/** + * Blocking ast invoked by ldlm when dlm lock is either blocking progress of + * some other lock, or is canceled. This function is installed as a + * ldlm_lock::l_blocking_ast() for client extent locks. + * + * Control flow is tricky, because ldlm uses the same call-back + * (ldlm_lock::l_blocking_ast()) for both blocking and cancellation ast's. + * + * \param dlmlock lock for which ast occurred. + * + * \param new description of a conflicting lock in case of blocking ast. + * + * \param data value of dlmlock->l_ast_data + * + * \param flag LDLM_CB_BLOCKING or LDLM_CB_CANCELING. Used to distinguish + * cancellation and blocking ast's. + * + * Possible use cases: + * + * - ldlm calls dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) to cancel + * lock due to lock lru pressure, or explicit user request to purge + * locks. + * + * - ldlm calls dlmlock->l_blocking_ast(..., LDLM_CB_BLOCKING) to notify + * us that dlmlock conflicts with another lock that some client is + * enqueuing. Lock is canceled. + * + * - cl_lock_cancel() is called. osc_lock_cancel() calls + * ldlm_cli_cancel() that calls + * + * dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) + * + * recursively entering osc_ldlm_blocking_ast(). 
+ *
+ * - client cancels lock voluntarily (e.g., as a part of early cancellation):
+ *
+ *   cl_lock_cancel()->
+ *     osc_lock_cancel()->
+ *       ldlm_cli_cancel()->
+ *         dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING)
+ *
+ */
+static int osc_ldlm_blocking_ast(struct ldlm_lock *dlmlock,
+				 struct ldlm_lock_desc *new, void *data,
+				 int flag)
+{
+	int result = 0;
+	ENTRY;
+
+	switch (flag) {
+	case LDLM_CB_BLOCKING: {
+		struct lustre_handle lockh;
+
+		ldlm_lock2handle(dlmlock, &lockh);
+		result = ldlm_cli_cancel(&lockh, LCF_ASYNC);
+		if (result == -ENODATA)
+			result = 0;
+		break;
+	}
+	case LDLM_CB_CANCELING: {
+		struct lu_env *env;
+		__u16 refcheck;
+
+		/*
+		 * This can be called in the context of outer IO, e.g.,
+		 *
+		 * osc_enqueue_base()->...
+		 *   ->ldlm_prep_elc_req()->...
+		 *     ->ldlm_cancel_callback()->...
+		 *       ->osc_ldlm_blocking_ast()
+		 *
+		 * A new environment has to be created to not corrupt the
+		 * outer context.
+		 */
+		env = cl_env_get(&refcheck);
+		if (IS_ERR(env)) {
+			result = PTR_ERR(env);
+			break;
+		}
+
+		result = osc_dlm_blocking_ast0(env, dlmlock, data, flag);
+		cl_env_put(env, &refcheck);
+		break;
+	}
+	default:
+		LBUG();
+	}
+	RETURN(result);
+}
+
+int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data)
+{
+	struct ptlrpc_request *req = data;
+	struct lu_env *env;
+	struct ost_lvb *lvb;
+	struct req_capsule *cap;
+	struct cl_object *obj = NULL;
+	struct ldlm_resource *res = dlmlock->l_resource;
+	struct ldlm_match_data matchdata = { 0 };
+	union ldlm_policy_data policy;
+	enum ldlm_mode mode = LCK_PW | LCK_GROUP | LCK_PR;
+	int result;
+	__u16 refcheck;
+
+	ENTRY;
+
+	LASSERT(lustre_msg_get_opc(req->rq_reqmsg) == LDLM_GL_CALLBACK);
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		GOTO(out, result = PTR_ERR(env));
+
+	policy.l_extent.start = 0;
+	policy.l_extent.end = LUSTRE_EOF;
+
+	matchdata.lmd_mode = &mode;
+	matchdata.lmd_policy = &policy;
+	matchdata.lmd_flags = LDLM_FL_TEST_LOCK | LDLM_FL_CBPENDING;
+	matchdata.lmd_match = LDLM_MATCH_UNREF | LDLM_MATCH_AST_ANY;
+
+	LDLM_LOCK_GET(dlmlock);
+
+	/* If any dlmlock has l_ast_data set, we must find it or we risk
+	 * missing a size update done under a different lock.
+	 */
+	while (dlmlock) {
+		lock_res_and_lock(dlmlock);
+		if (dlmlock->l_ast_data) {
+			obj = osc2cl(dlmlock->l_ast_data);
+			cl_object_get(obj);
+		}
+		unlock_res_and_lock(dlmlock);
+		LDLM_LOCK_RELEASE(dlmlock);
+
+		dlmlock = NULL;
+
+		if (obj == NULL && res->lr_type == LDLM_EXTENT) {
+			if (OBD_FAIL_CHECK(OBD_FAIL_OSC_NO_SIZE_DATA))
+				break;
+
+			lock_res(res);
+			dlmlock = search_itree(res, &matchdata);
+			unlock_res(res);
+		}
+	}
+
+	if (obj != NULL) {
+		/* Do not grab the mutex of cl_lock for glimpse.
+		 * See LU-1274 for details.
+		 * BTW, it's okay for cl_lock to be cancelled during
+		 * this period because the server can handle this race.
+		 * See ldlm_server_glimpse_ast() for details.
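+		 * The reply packs the object's current ost_lvb (size and
+		 * time attributes) obtained via cl_object_glimpse() below,
+		 * which is what the glimpsing client is after.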
* cl_lock_mutex_get(env, lock); */
+		cap = &req->rq_pill;
+		req_capsule_extend(cap, &RQF_LDLM_GL_CALLBACK);
+		req_capsule_set_size(cap, &RMF_DLM_LVB, RCL_SERVER,
+				     sizeof *lvb);
+		result = req_capsule_server_pack(cap);
+		if (result == 0) {
+			lvb = req_capsule_server_get(cap, &RMF_DLM_LVB);
+			result = cl_object_glimpse(env, obj, lvb);
+		}
+		if (!exp_connect_lvb_type(req->rq_export))
+			req_capsule_shrink(&req->rq_pill, &RMF_DLM_LVB,
+					   sizeof(struct ost_lvb_v1),
+					   RCL_SERVER);
+		cl_object_put(env, obj);
+	} else {
+		/*
+		 * These errors are normal races, so we don't want to
+		 * fill the console with messages by calling
+		 * ptlrpc_error()
+		 */
+		lustre_pack_reply(req, 1, NULL, NULL);
+		result = -ELDLM_NO_LOCK_DATA;
+	}
+	cl_env_put(env, &refcheck);
+	EXIT;
+
+out:
+	req->rq_status = result;
+	RETURN(result);
+}
+EXPORT_SYMBOL(osc_ldlm_glimpse_ast);
+
+static bool weigh_cb(const struct lu_env *env, struct cl_io *io,
+		     void **pvec, int count, void *cbdata)
+{
+	int i;
+
+	for (i = 0; i < count; i++) {
+		struct osc_page *ops = pvec[i];
+		struct cl_page *page = ops->ops_cl.cpl_page;
+
+		if (cl_page_is_vmlocked(env, page) ||
+		    PageDirty(page->cp_vmpage) ||
+		    PageWriteback(page->cp_vmpage))
+			return false;
+
+		*(pgoff_t *)cbdata = osc_index(ops) + 1;
+	}
+	return true;
+}
+
+static unsigned long osc_lock_weight(const struct lu_env *env,
+				     struct osc_object *oscobj,
+				     loff_t start, loff_t end)
+{
+	struct cl_io *io = osc_env_thread_io(env);
+	struct cl_object *obj = cl_object_top(&oscobj->oo_cl);
+	pgoff_t page_index;
+	int result;
+
+	ENTRY;
+
+	io->ci_obj = obj;
+	io->ci_ignore_layout = 1;
+	result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
+	if (result != 0)
+		RETURN(1);
+
+	page_index = cl_index(obj, start);
+
+	if (!osc_page_gang_lookup(env, io, oscobj,
+				  page_index, cl_index(obj, end),
+				  weigh_cb, (void *)&page_index))
+		result = 1;
+	cl_io_fini(env, io);
+
+	return result;
+}
+
+/**
+ * Get the weight of a dlm lock for early cancellation.
+ */
+unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock)
+{
+	struct lu_env *env;
+	struct osc_object *obj;
+	struct osc_lock *oscl;
+	unsigned long weight;
+	bool found = false;
+	__u16 refcheck;
+
+	ENTRY;
+
+	might_sleep();
+	/*
+	 * osc_ldlm_weigh_ast has a complex context since it might be called
+	 * because of lock canceling, or from user input. We have to make
+	 * a new environment for it. Probably it is implementation-safe to
+	 * use the upper context because cl_lock_put() doesn't modify
+	 * environment variables. But just in case ..
+	 */
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		/* Mostly because of lack of memory; do not eliminate
+		 * this lock */
+		RETURN(1);
+
+	LASSERT(dlmlock->l_resource->lr_type == LDLM_EXTENT ||
+		dlmlock->l_resource->lr_type == LDLM_IBITS);
+
+	lock_res_and_lock(dlmlock);
+	obj = dlmlock->l_ast_data;
+	if (obj)
+		cl_object_get(osc2cl(obj));
+	unlock_res_and_lock(dlmlock);
+
+	if (obj == NULL)
+		GOTO(out, weight = 0);
+
+	spin_lock(&obj->oo_ol_spin);
+	list_for_each_entry(oscl, &obj->oo_ol_list, ols_nextlock_oscobj) {
+		if (oscl->ols_dlmlock == dlmlock) {
+			found = true;
+			break;
+		}
+	}
+	spin_unlock(&obj->oo_ol_spin);
+	if (found) {
+		/*
+		 * If the lock is being used by an IO, definitely do not
+		 * cancel it.
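+		 * A non-zero weight tells the caller that cancelling this
+		 * lock is costly; the weight of 0 further below means the
+		 * lock covers no cached pages and is cheap to cancel.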
+ */
+ GOTO(out, weight = 1);
+ }
+
+ if (dlmlock->l_resource->lr_type == LDLM_EXTENT)
+ weight = osc_lock_weight(env, obj,
+ dlmlock->l_policy_data.l_extent.start,
+ dlmlock->l_policy_data.l_extent.end);
+ else if (ldlm_has_dom(dlmlock))
+ weight = osc_lock_weight(env, obj, 0, OBD_OBJECT_EOF);
+ /* The DOM bit can be cancelled at any time; in that case, we know
+ * there are no pages, so just return weight of 0
+ */
+ else
+ weight = 0;
+
+ EXIT;
+
+out:
+ if (obj)
+ cl_object_put(env, osc2cl(obj));
+
+ cl_env_put(env, &refcheck);
+ return weight;
+}
+EXPORT_SYMBOL(osc_ldlm_weigh_ast);
+
+static void osc_lock_build_einfo(const struct lu_env *env,
+ const struct cl_lock *lock,
+ struct osc_object *osc,
+ struct ldlm_enqueue_info *einfo)
+{
+ einfo->ei_type = LDLM_EXTENT;
+ einfo->ei_mode = osc_cl_lock2ldlm(lock->cll_descr.cld_mode);
+ einfo->ei_cb_bl = osc_ldlm_blocking_ast;
+ einfo->ei_cb_cp = ldlm_completion_ast;
+ einfo->ei_cb_gl = osc_ldlm_glimpse_ast;
+ einfo->ei_cbdata = osc; /* value to be put into ->l_ast_data */
+}
+
+/**
+ * Determine if the lock should be converted into a lockless lock.
+ *
+ * Steps to check:
+ * - if the lock has an explicit requirement for a non-lockless lock;
+ * - check the io lock request type ci_lockreq;
+ * - send the enqueue rpc to ost to make the final decision;
+ * - special treatment for truncate lockless locks
+ *
+ * Additional policy can be implemented here, e.g., never do lockless-io
+ * for large extents.
+ */
+void osc_lock_to_lockless(const struct lu_env *env,
+ struct osc_lock *ols, int force)
+{
+ struct cl_lock_slice *slice = &ols->ols_cl;
+ struct osc_io *oio = osc_env_io(env);
+ struct cl_io *io = oio->oi_cl.cis_io;
+ struct cl_object *obj = slice->cls_obj;
+ struct osc_object *oob = cl2osc(obj);
+ struct obd_connect_data *ocd;
+
+ LASSERT(ols->ols_state == OLS_NEW ||
+ ols->ols_state == OLS_UPCALL_RECEIVED);
+
+ if (force) {
+ ols->ols_locklessable = 1;
+ slice->cls_ops = ols->ols_lockless_ops;
+ } else {
+ LASSERT(io->ci_lockreq == CILR_MANDATORY ||
+ io->ci_lockreq == CILR_MAYBE ||
+ io->ci_lockreq == CILR_NEVER);
+
+ ocd = &class_exp2cliimp(osc_export(oob))->imp_connect_data;
+ ols->ols_locklessable = (io->ci_type != CIT_SETATTR) &&
+ (io->ci_lockreq == CILR_MAYBE) &&
+ (ocd->ocd_connect_flags &
+ OBD_CONNECT_SRVLOCK);
+ if (io->ci_lockreq == CILR_NEVER) {
+ ols->ols_locklessable = 1;
+ slice->cls_ops = ols->ols_lockless_ops;
+ }
+ }
+ LASSERT(ergo(ols->ols_glimpse, !osc_lock_is_lockless(ols)));
+}
+EXPORT_SYMBOL(osc_lock_to_lockless);
+
+static bool osc_lock_compatible(const struct osc_lock *qing,
+ const struct osc_lock *qed)
+{
+ struct cl_lock_descr *qed_descr = &qed->ols_cl.cls_lock->cll_descr;
+ struct cl_lock_descr *qing_descr = &qing->ols_cl.cls_lock->cll_descr;
+
+ if (qed->ols_glimpse || qed->ols_speculative)
+ return true;
+
+ if (qing_descr->cld_mode == CLM_READ && qed_descr->cld_mode == CLM_READ)
+ return true;
+
+ if (qed->ols_state < OLS_GRANTED)
+ return true;
+
+ if (qed_descr->cld_mode >= qing_descr->cld_mode &&
+ qed_descr->cld_start <= qing_descr->cld_start &&
+ qed_descr->cld_end >= qing_descr->cld_end)
+ return true;
+
+ return false;
+}
+
+void osc_lock_wake_waiters(const struct lu_env *env, struct osc_object *osc,
+ struct osc_lock *oscl)
+{
+ struct osc_lock *scan;
+
+ spin_lock(&osc->oo_ol_spin);
+ list_del_init(&oscl->ols_nextlock_oscobj);
+ spin_unlock(&osc->oo_ol_spin);
+
+ spin_lock(&oscl->ols_lock);
+ while ((scan = list_first_entry_or_null(&oscl->ols_waiting_list,
+ struct osc_lock,
+ ols_wait_entry)) != NULL)
{ + list_del_init(&scan->ols_wait_entry); + + cl_sync_io_note(env, scan->ols_owner, 0); + } + spin_unlock(&oscl->ols_lock); +} +EXPORT_SYMBOL(osc_lock_wake_waiters); + +int osc_lock_enqueue_wait(const struct lu_env *env, struct osc_object *obj, + struct osc_lock *oscl) +{ + struct osc_lock *tmp_oscl; + struct cl_lock_descr *need = &oscl->ols_cl.cls_lock->cll_descr; + struct cl_sync_io *waiter = &osc_env_info(env)->oti_anchor; + int rc = 0; + + ENTRY; + + spin_lock(&obj->oo_ol_spin); + list_add_tail(&oscl->ols_nextlock_oscobj, &obj->oo_ol_list); + +restart: + list_for_each_entry(tmp_oscl, &obj->oo_ol_list, + ols_nextlock_oscobj) { + struct cl_lock_descr *descr; + + if (tmp_oscl == oscl) + break; + + descr = &tmp_oscl->ols_cl.cls_lock->cll_descr; + if (descr->cld_start > need->cld_end || + descr->cld_end < need->cld_start) + continue; + + /* We're not supposed to give up group lock */ + if (descr->cld_mode == CLM_GROUP) + break; + + if (!osc_lock_is_lockless(oscl) && + osc_lock_compatible(oscl, tmp_oscl)) + continue; + + /* wait for conflicting lock to be canceled */ + cl_sync_io_init(waiter, 1); + oscl->ols_owner = waiter; + + spin_lock(&tmp_oscl->ols_lock); + /* add oscl into tmp's ols_waiting list */ + list_add_tail(&oscl->ols_wait_entry, + &tmp_oscl->ols_waiting_list); + spin_unlock(&tmp_oscl->ols_lock); + + spin_unlock(&obj->oo_ol_spin); + rc = cl_sync_io_wait(env, waiter, 0); + spin_lock(&obj->oo_ol_spin); + + if (rc < 0) + break; + + oscl->ols_owner = NULL; + goto restart; + } + spin_unlock(&obj->oo_ol_spin); + + RETURN(rc); +} +EXPORT_SYMBOL(osc_lock_enqueue_wait); + +/** + * Implementation of cl_lock_operations::clo_enqueue() method for osc + * layer. This initiates ldlm enqueue: + * + * - cancels conflicting locks early (osc_lock_enqueue_wait()); + * + * - calls osc_enqueue_base() to do actual enqueue. + * + * osc_enqueue_base() is supplied with an upcall function that is executed + * when lock is received either after a local cached ldlm lock is matched, or + * when a reply from the server is received. + * + * This function does not wait for the network communication to complete. 
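+ *
+ * A minimal caller-side sketch (illustrative only, assuming the
+ * cl_sync_io API used elsewhere in this file; this is not a call path
+ * added by this patch):
+ *
+ * struct cl_sync_io *anchor = &osc_env_info(env)->oti_anchor;
+ *
+ * cl_sync_io_init(anchor, 1);
+ * rc = slice->cls_ops->clo_enqueue(env, slice, io, anchor);
+ * if (rc == 0)
+ * rc = cl_sync_io_wait(env, anchor, 0);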
+ */ +static int osc_lock_enqueue(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *unused, struct cl_sync_io *anchor) +{ + struct osc_thread_info *info = osc_env_info(env); + struct osc_io *oio = osc_env_io(env); + struct osc_object *osc = cl2osc(slice->cls_obj); + struct osc_lock *oscl = cl2osc_lock(slice); + struct obd_export *exp = osc_export(osc); + struct cl_lock *lock = slice->cls_lock; + struct ldlm_res_id *resname = &info->oti_resname; + union ldlm_policy_data *policy = &info->oti_policy; + osc_enqueue_upcall_f upcall = osc_lock_upcall; + void *cookie = oscl; + bool async = false; + int result; + + ENTRY; + + LASSERTF(ergo(oscl->ols_glimpse, lock->cll_descr.cld_mode <= CLM_READ), + "lock = %p, ols = %p\n", lock, oscl); + + if (oscl->ols_state == OLS_GRANTED) + RETURN(0); + + if ((oscl->ols_flags & LDLM_FL_NO_EXPANSION) && + !exp_connect_lockahead(exp)) { + result = -EOPNOTSUPP; + CERROR("%s: server does not support lockahead/locknoexpand: rc = %d\n", + exp->exp_obd->obd_name, result); + RETURN(result); + } + + if (oscl->ols_flags & LDLM_FL_TEST_LOCK) + GOTO(enqueue_base, 0); + + /* For glimpse and/or speculative locks, do not wait for reply from + * server on LDLM request */ + if (oscl->ols_glimpse || oscl->ols_speculative) { + /* Speculative and glimpse locks do not have an anchor */ + LASSERT(equi(oscl->ols_speculative, anchor == NULL)); + async = true; + GOTO(enqueue_base, 0); + } + + result = osc_lock_enqueue_wait(env, osc, oscl); + if (result < 0) + GOTO(out, result); + + /* we can grant lockless lock right after all conflicting locks + * are canceled. */ + if (osc_lock_is_lockless(oscl)) { + oscl->ols_state = OLS_GRANTED; + oio->oi_lockless = 1; + RETURN(0); + } + +enqueue_base: + oscl->ols_state = OLS_ENQUEUED; + if (anchor != NULL) { + atomic_inc(&anchor->csi_sync_nr); + oscl->ols_owner = anchor; + } + + /** + * DLM lock's ast data must be osc_object; + * if glimpse or speculative lock, async of osc_enqueue_base() + * must be true + * + * For non-speculative locks: + * DLM's enqueue callback set to osc_lock_upcall() with cookie as + * osc_lock. + * For speculative locks: + * osc_lock_upcall_speculative & cookie is the osc object, since + * there is no osc_lock + */ + ostid_build_res_name(&osc->oo_oinfo->loi_oi, resname); + osc_lock_build_policy(env, lock, policy); + if (oscl->ols_speculative) { + oscl->ols_einfo.ei_cbdata = NULL; + /* hold a reference for callback */ + cl_object_get(osc2cl(osc)); + upcall = osc_lock_upcall_speculative; + cookie = osc; + } + result = osc_enqueue_base(exp, resname, &oscl->ols_flags, + policy, &oscl->ols_lvb, + upcall, cookie, + &oscl->ols_einfo, PTLRPCD_SET, async, + oscl->ols_speculative); + if (result == 0) { + if (osc_lock_is_lockless(oscl)) { + oio->oi_lockless = 1; + } else if (!async) { + if (OBD_FAIL_PRECHECK(OBD_FAIL_PTLRPC_IDLE_RACE)) { + OBD_RACE(OBD_FAIL_PTLRPC_IDLE_RACE); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1) / 2); + } + LASSERT(oscl->ols_state == OLS_GRANTED); + LASSERT(oscl->ols_hold); + LASSERT(oscl->ols_dlmlock != NULL); + } + } else if (oscl->ols_speculative) { + cl_object_put(env, osc2cl(osc)); + if (oscl->ols_glimpse) { + /* hide error for AGL request */ + result = 0; + } + } + +out: + if (result < 0) { + oscl->ols_state = OLS_CANCELLED; + osc_lock_wake_waiters(env, osc, oscl); + + if (anchor != NULL) + cl_sync_io_note(env, anchor, result); + } + RETURN(result); +} + +/** + * Breaks a link between osc_lock and dlm_lock. 
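+ *
+ * (Hedged summary of the teardown order in the body below: the
+ * ldlm_lock_addref() hold recorded in ols_handle is dropped first,
+ * then the lu_ref and LDLM references taken in osc_lock_upcall() are
+ * released.)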
+ */
+static void osc_lock_detach(const struct lu_env *env, struct osc_lock *olck)
+{
+ struct ldlm_lock *dlmlock;
+
+ ENTRY;
+
+ dlmlock = olck->ols_dlmlock;
+ if (dlmlock == NULL)
+ RETURN_EXIT;
+
+ if (olck->ols_hold) {
+ olck->ols_hold = 0;
+ ldlm_lock_decref(&olck->ols_handle, olck->ols_einfo.ei_mode);
+ olck->ols_handle.cookie = 0ULL;
+ }
+
+ olck->ols_dlmlock = NULL;
+
+ /* release a reference taken in osc_lock_upcall(). */
+ LASSERT(olck->ols_has_ref);
+ lu_ref_del(&dlmlock->l_reference, "osc_lock", olck);
+ LDLM_LOCK_RELEASE(dlmlock);
+ olck->ols_has_ref = 0;
+
+ EXIT;
+}
+
+/**
+ * Implements cl_lock_operations::clo_cancel() method for osc layer. This is
+ * called (as part of cl_lock_cancel()) when a lock is canceled either
+ * voluntarily (LRU pressure, early cancellation, umount, etc.) or due to a
+ * conflict with some other lock somewhere in the cluster. This function does
+ * the following:
+ *
+ * - invalidates all pages protected by this lock (after sending dirty
+ * ones to the server, as necessary);
+ *
+ * - decrefs the underlying ldlm lock;
+ *
+ * - cancels the ldlm lock (ldlm_cli_cancel()).
+ */
+void osc_lock_cancel(const struct lu_env *env,
+ const struct cl_lock_slice *slice)
+{
+ struct osc_object *obj = cl2osc(slice->cls_obj);
+ struct osc_lock *oscl = cl2osc_lock(slice);
+
+ ENTRY;
+
+ LINVRNT(osc_lock_invariant(oscl));
+
+ osc_lock_detach(env, oscl);
+ oscl->ols_state = OLS_CANCELLED;
+ oscl->ols_flags &= ~LDLM_FL_LVB_READY;
+
+ osc_lock_wake_waiters(env, obj, oscl);
+ EXIT;
+}
+EXPORT_SYMBOL(osc_lock_cancel);
+
+int osc_lock_print(const struct lu_env *env, void *cookie,
+ lu_printer_t p, const struct cl_lock_slice *slice)
+{
+ struct osc_lock *lock = cl2osc_lock(slice);
+
+ (*p)(env, cookie, "%p %#llx %#llx %d %p ",
+ lock->ols_dlmlock, lock->ols_flags, lock->ols_handle.cookie,
+ lock->ols_state, lock->ols_owner);
+ osc_lvb_print(env, cookie, p, &lock->ols_lvb);
+ return 0;
+}
+EXPORT_SYMBOL(osc_lock_print);
+
+static const struct cl_lock_operations osc_lock_ops = {
+ .clo_fini = osc_lock_fini,
+ .clo_enqueue = osc_lock_enqueue,
+ .clo_cancel = osc_lock_cancel,
+ .clo_print = osc_lock_print,
+};
+
+static void osc_lock_lockless_cancel(const struct lu_env *env,
+ const struct cl_lock_slice *slice)
+{
+ struct osc_lock *ols = cl2osc_lock(slice);
+ struct osc_object *osc = cl2osc(slice->cls_obj);
+
+ LASSERT(ols->ols_dlmlock == NULL);
+ osc_lock_wake_waiters(env, osc, ols);
+}
+
+static const struct cl_lock_operations osc_lock_lockless_ops = {
+ .clo_fini = osc_lock_fini,
+ .clo_enqueue = osc_lock_enqueue,
+ .clo_cancel = osc_lock_lockless_cancel,
+ .clo_print = osc_lock_print
+};
+
+void osc_lock_set_writer(const struct lu_env *env, const struct cl_io *io,
+ struct cl_object *obj, struct osc_lock *oscl)
+{
+ struct cl_lock_descr *descr = &oscl->ols_cl.cls_lock->cll_descr;
+ pgoff_t io_start;
+ pgoff_t io_end;
+
+ if (!cl_object_same(io->ci_obj, obj))
+ return;
+
+ if (likely(io->ci_type == CIT_WRITE)) {
+ io_start = cl_index(obj, io->u.ci_rw.crw_pos);
+ io_end = cl_index(obj, io->u.ci_rw.crw_pos +
+ io->u.ci_rw.crw_count - 1);
+ } else {
+ LASSERT(cl_io_is_mkwrite(io));
+ io_start = io_end = io->u.ci_fault.ft_index;
+ }
+
+ if (descr->cld_mode >= CLM_WRITE &&
+ (cl_io_is_append(io) ||
+ (descr->cld_start <= io_start && descr->cld_end >= io_end))) {
+ struct osc_io *oio = osc_env_io(env);
+
+ /* There must be only one lock to match the write region */
+ LASSERT(oio->oi_write_osclock == NULL);
+ oio->oi_write_osclock = oscl;
+ }
+}
+EXPORT_SYMBOL(osc_lock_set_writer);
+
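+/*
+ * Illustrative example for osc_lock_set_writer() above (hedged: the
+ * numbers assume 4KiB pages and are not taken from this patch): a
+ * CIT_WRITE io with crw_pos = 4096 and crw_count = 8192 maps to page
+ * indices io_start = 1 and io_end = 2, so a CLM_WRITE (or stronger)
+ * lock with cld_start <= 1 and cld_end >= 2 is recorded in
+ * oio->oi_write_osclock.
+ */
+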
+int osc_lock_init(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *io) +{ + struct osc_lock *oscl; + __u32 enqflags = lock->cll_descr.cld_enq_flags; + + OBD_SLAB_ALLOC_PTR_GFP(oscl, osc_lock_kmem, GFP_NOFS); + if (oscl == NULL) + return -ENOMEM; + + oscl->ols_state = OLS_NEW; + spin_lock_init(&oscl->ols_lock); + INIT_LIST_HEAD(&oscl->ols_waiting_list); + INIT_LIST_HEAD(&oscl->ols_wait_entry); + INIT_LIST_HEAD(&oscl->ols_nextlock_oscobj); + oscl->ols_lockless_ops = &osc_lock_lockless_ops; + + /* Speculative lock requests must be either no_expand or glimpse + * request (CEF_GLIMPSE). non-glimpse no_expand speculative extent + * locks will break ofd_intent_cb. (see comment there)*/ + LASSERT(ergo((enqflags & CEF_SPECULATIVE) != 0, + (enqflags & (CEF_LOCK_NO_EXPAND | CEF_GLIMPSE)) != 0)); + + oscl->ols_flags = osc_enq2ldlm_flags(enqflags); + oscl->ols_speculative = !!(enqflags & CEF_SPECULATIVE); + if (lock->cll_descr.cld_mode == CLM_GROUP) + oscl->ols_flags |= LDLM_FL_ATOMIC_CB; + + if (oscl->ols_flags & LDLM_FL_HAS_INTENT) { + oscl->ols_flags |= LDLM_FL_BLOCK_GRANTED; + oscl->ols_glimpse = 1; + } + if (io->ci_ndelay && cl_object_same(io->ci_obj, obj)) + oscl->ols_flags |= LDLM_FL_NDELAY; + osc_lock_build_einfo(env, lock, cl2osc(obj), &oscl->ols_einfo); + + cl_lock_slice_add(lock, &oscl->ols_cl, obj, &osc_lock_ops); + + if (!(enqflags & CEF_MUST)) + /* try to convert this lock to a lockless lock */ + osc_lock_to_lockless(env, oscl, (enqflags & CEF_NEVER)); + + if (io->ci_type == CIT_WRITE || cl_io_is_mkwrite(io)) + osc_lock_set_writer(env, io, obj, oscl); + + LDLM_DEBUG_NOLOCK("lock %p, osc lock %p, flags %#llx", + lock, oscl, oscl->ols_flags); + + return 0; +} + +/** + * Finds an existing lock covering given index and optionally different from a + * given \a except lock. 
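+ *
+ * Hedged usage sketch (it mirrors the page-to-lock lookup done by
+ * osc_req_attr_set() in osc_object.c below; the flags are the
+ * OSC_DAP_FL_* values handled by this function):
+ *
+ * lock = osc_obj_dlmlock_at_pgoff(env, obj, index,
+ * OSC_DAP_FL_TEST_LOCK | OSC_DAP_FL_CANCELING);
+ * if (lock != NULL)
+ * LDLM_LOCK_PUT(lock);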
+ */ +struct ldlm_lock *osc_obj_dlmlock_at_pgoff(const struct lu_env *env, + struct osc_object *obj, + pgoff_t index, + enum osc_dap_flags dap_flags) +{ + struct osc_thread_info *info = osc_env_info(env); + struct ldlm_res_id *resname = &info->oti_resname; + union ldlm_policy_data *policy = &info->oti_policy; + struct lustre_handle lockh; + struct ldlm_lock *lock = NULL; + enum ldlm_mode mode; + __u64 flags; + enum ldlm_match_flags match_flags = 0; + + ENTRY; + + ostid_build_res_name(&obj->oo_oinfo->loi_oi, resname); + osc_index2policy(policy, osc2cl(obj), index, index); + policy->l_extent.gid = LDLM_GID_ANY; + + flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING; + if (dap_flags & OSC_DAP_FL_TEST_LOCK) + flags |= LDLM_FL_TEST_LOCK; + + if (dap_flags & OSC_DAP_FL_AST) + match_flags |= LDLM_MATCH_AST; + + if (dap_flags & OSC_DAP_FL_CANCELING) + match_flags |= LDLM_MATCH_UNREF; + + if (dap_flags & OSC_DAP_FL_RIGHT) + match_flags |= LDLM_MATCH_RIGHT; + + /* + * It is fine to match any group lock since there could be only one + * with a uniq gid and it conflicts with all other lock modes too + */ +again: + mode = osc_match_base(env, osc_export(obj), resname, LDLM_EXTENT, + policy, LCK_PR | LCK_PW | LCK_GROUP, &flags, + obj, &lockh, match_flags); + if (mode != 0) { + lock = ldlm_handle2lock(&lockh); + /* RACE: the lock is cancelled so let's try again */ + if (unlikely(lock == NULL)) + goto again; + } + + RETURN(lock); +} +/** @} osc */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_object.c b/drivers/staging/lustrefsx/lustre/osc/osc_object.c new file mode 100644 index 0000000000000..aa116260c3475 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/osc_object.c @@ -0,0 +1,497 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Implementation of cl_object for OSC layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_OSC +#include +#include + +#include "osc_internal.h" + +/** \addtogroup osc + * @{ + */ + +/***************************************************************************** + * + * Object operations. 
+ * + */ +static void osc_obj_build_res_name(struct osc_object *osc, + struct ldlm_res_id *resname) +{ + ostid_build_res_name(&osc->oo_oinfo->loi_oi, resname); +} + +static const struct osc_object_operations osc_object_ops = { + .oto_build_res_name = osc_obj_build_res_name, + .oto_dlmlock_at_pgoff = osc_obj_dlmlock_at_pgoff, +}; + +int osc_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct osc_object *osc = lu2osc(obj); + const struct cl_object_conf *cconf = lu2cl_conf(conf); + + osc->oo_oinfo = cconf->u.coc_oinfo; +#ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK + mutex_init(&osc->oo_debug_mutex); +#endif + INIT_LIST_HEAD(&osc->oo_ready_item); + INIT_LIST_HEAD(&osc->oo_hp_ready_item); + INIT_LIST_HEAD(&osc->oo_write_item); + INIT_LIST_HEAD(&osc->oo_read_item); + + osc->oo_root.rb_node = NULL; + INIT_LIST_HEAD(&osc->oo_hp_exts); + INIT_LIST_HEAD(&osc->oo_urgent_exts); + INIT_LIST_HEAD(&osc->oo_full_exts); + INIT_LIST_HEAD(&osc->oo_reading_exts); + atomic_set(&osc->oo_nr_reads, 0); + atomic_set(&osc->oo_nr_writes, 0); + spin_lock_init(&osc->oo_lock); + spin_lock_init(&osc->oo_tree_lock); + spin_lock_init(&osc->oo_ol_spin); + INIT_LIST_HEAD(&osc->oo_ol_list); + + atomic_set(&osc->oo_nr_ios, 0); + init_waitqueue_head(&osc->oo_io_waitq); + + LASSERT(osc->oo_obj_ops != NULL); + + cl_object_page_init(lu2cl(obj), sizeof(struct osc_page)); + + return 0; +} +EXPORT_SYMBOL(osc_object_init); + +void osc_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct osc_object *osc = lu2osc(obj); + + LASSERT(list_empty(&osc->oo_ready_item)); + LASSERT(list_empty(&osc->oo_hp_ready_item)); + LASSERT(list_empty(&osc->oo_write_item)); + LASSERT(list_empty(&osc->oo_read_item)); + + LASSERT(osc->oo_root.rb_node == NULL); + LASSERT(list_empty(&osc->oo_hp_exts)); + LASSERT(list_empty(&osc->oo_urgent_exts)); + LASSERT(list_empty(&osc->oo_full_exts)); + LASSERT(list_empty(&osc->oo_reading_exts)); + LASSERT(atomic_read(&osc->oo_nr_reads) == 0); + LASSERT(atomic_read(&osc->oo_nr_writes) == 0); + LASSERT(list_empty(&osc->oo_ol_list)); + LASSERT(atomic_read(&osc->oo_nr_ios) == 0); + + lu_object_fini(obj); + /* osc doen't contain an lu_object_header, so we don't need call_rcu */ + OBD_SLAB_FREE_PTR(osc, osc_object_kmem); +} +EXPORT_SYMBOL(osc_object_free); + +int osc_lvb_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct ost_lvb *lvb) +{ + return (*p)(env, cookie, "size: %llu mtime: %llu atime: %llu " + "ctime: %llu blocks: %llu", + lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime, + lvb->lvb_ctime, lvb->lvb_blocks); +} +EXPORT_SYMBOL(osc_lvb_print); + +int osc_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *obj) +{ + struct osc_object *osc = lu2osc(obj); + struct lov_oinfo *oinfo = osc->oo_oinfo; + struct osc_async_rc *ar = &oinfo->loi_ar; + + (*p)(env, cookie, "id: "DOSTID" " + "idx: %d gen: %d kms_valid: %u kms %llu " + "rc: %d force_sync: %d min_xid: %llu ", + POSTID(&oinfo->loi_oi), oinfo->loi_ost_idx, + oinfo->loi_ost_gen, oinfo->loi_kms_valid, oinfo->loi_kms, + ar->ar_rc, ar->ar_force_sync, ar->ar_min_xid); + osc_lvb_print(env, cookie, p, &oinfo->loi_lvb); + return 0; +} +EXPORT_SYMBOL(osc_object_print); + + +int osc_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; + + cl_lvb2attr(attr, &oinfo->loi_lvb); + attr->cat_kms = oinfo->loi_kms_valid ? 
oinfo->loi_kms : 0;
+ return 0;
+}
+EXPORT_SYMBOL(osc_attr_get);
+
+int osc_attr_update(const struct lu_env *env, struct cl_object *obj,
+ const struct cl_attr *attr, unsigned valid)
+{
+ struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo;
+ struct ost_lvb *lvb = &oinfo->loi_lvb;
+
+ if (valid & CAT_SIZE)
+ lvb->lvb_size = attr->cat_size;
+ if (valid & CAT_MTIME)
+ lvb->lvb_mtime = attr->cat_mtime;
+ if (valid & CAT_ATIME)
+ lvb->lvb_atime = attr->cat_atime;
+ if (valid & CAT_CTIME)
+ lvb->lvb_ctime = attr->cat_ctime;
+ if (valid & CAT_BLOCKS)
+ lvb->lvb_blocks = attr->cat_blocks;
+ if (valid & CAT_KMS) {
+ CDEBUG(D_CACHE, "set kms from %llu to %llu\n",
+ oinfo->loi_kms, (__u64)attr->cat_kms);
+ loi_kms_set(oinfo, attr->cat_kms);
+ }
+ return 0;
+}
+EXPORT_SYMBOL(osc_attr_update);
+
+int osc_object_glimpse(const struct lu_env *env, const struct cl_object *obj,
+ struct ost_lvb *lvb)
+{
+ struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo;
+
+ lvb->lvb_size = oinfo->loi_kms;
+ lvb->lvb_blocks = oinfo->loi_lvb.lvb_blocks;
+ return 0;
+}
+EXPORT_SYMBOL(osc_object_glimpse);
+
+static int osc_object_ast_clear(struct ldlm_lock *lock, void *data)
+{
+ struct osc_object *osc = (struct osc_object *)data;
+ struct ost_lvb *lvb = lock->l_lvb_data;
+ struct lov_oinfo *oinfo;
+ ENTRY;
+
+ if (lock->l_ast_data == data) {
+ lock->l_ast_data = NULL;
+
+ LASSERT(osc != NULL);
+ LASSERT(osc->oo_oinfo != NULL);
+ LASSERT(lvb != NULL);
+
+ /* Updates lvb in lock by the cached oinfo */
+ oinfo = osc->oo_oinfo;
+
+ LDLM_DEBUG(lock, "update lock size %llu blocks %llu [cma]time: "
+ "%llu %llu %llu by oinfo size %llu blocks %llu "
+ "[cma]time %llu %llu %llu", lvb->lvb_size,
+ lvb->lvb_blocks, lvb->lvb_ctime, lvb->lvb_mtime,
+ lvb->lvb_atime, oinfo->loi_lvb.lvb_size,
+ oinfo->loi_lvb.lvb_blocks, oinfo->loi_lvb.lvb_ctime,
+ oinfo->loi_lvb.lvb_mtime, oinfo->loi_lvb.lvb_atime);
+ LASSERTF(oinfo->loi_lvb.lvb_size >= oinfo->loi_kms,
+ "lvb_size %#llx, loi_kms %#llx\n",
+ oinfo->loi_lvb.lvb_size, oinfo->loi_kms);
+
+ cl_object_attr_lock(&osc->oo_cl);
+ memcpy(lvb, &oinfo->loi_lvb, sizeof(oinfo->loi_lvb));
+ cl_object_attr_unlock(&osc->oo_cl);
+ ldlm_clear_lvb_cached(lock);
+ }
+ RETURN(LDLM_ITER_CONTINUE);
+}
+
+int osc_object_prune(const struct lu_env *env, struct cl_object *obj)
+{
+ struct osc_object *osc = cl2osc(obj);
+ struct ldlm_res_id *resname = &osc_env_info(env)->oti_resname;
+
+ /* DLM locks don't hold a reference of osc_object so we have to
+ * clear it before the object is being destroyed.
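+ * (The clearing itself is done by osc_object_ast_clear() above, which
+ * also copies the cached lvb back into each lock before detaching it.)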
*/ + osc_build_res_name(osc, resname); + ldlm_resource_iterate(osc_export(osc)->exp_obd->obd_namespace, resname, + osc_object_ast_clear, osc); + return 0; +} +EXPORT_SYMBOL(osc_object_prune); + +static int osc_object_fiemap(const struct lu_env *env, struct cl_object *obj, + struct ll_fiemap_info_key *fmkey, + struct fiemap *fiemap, size_t *buflen) +{ + struct obd_export *exp = osc_export(cl2osc(obj)); + struct ldlm_res_id resid; + union ldlm_policy_data policy; + struct lustre_handle lockh; + enum ldlm_mode mode = LCK_MINMODE; + struct ptlrpc_request *req; + struct fiemap *reply; + char *tmp; + int rc; + ENTRY; + + fmkey->lfik_oa.o_oi = cl2osc(obj)->oo_oinfo->loi_oi; + if (!(fmkey->lfik_fiemap.fm_flags & FIEMAP_FLAG_SYNC)) + goto skip_locking; + + policy.l_extent.start = fmkey->lfik_fiemap.fm_start & PAGE_MASK; + + if (OBD_OBJECT_EOF - fmkey->lfik_fiemap.fm_length <= + fmkey->lfik_fiemap.fm_start + PAGE_SIZE - 1) + policy.l_extent.end = OBD_OBJECT_EOF; + else + policy.l_extent.end = (fmkey->lfik_fiemap.fm_start + + fmkey->lfik_fiemap.fm_length + + PAGE_SIZE - 1) & PAGE_MASK; + + ostid_build_res_name(&fmkey->lfik_oa.o_oi, &resid); + mode = ldlm_lock_match(exp->exp_obd->obd_namespace, + LDLM_FL_BLOCK_GRANTED | LDLM_FL_LVB_READY, + &resid, LDLM_EXTENT, &policy, + LCK_PR | LCK_PW, &lockh); + if (mode) { /* lock is cached on client */ + if (mode != LCK_PR) { + ldlm_lock_addref(&lockh, LCK_PR); + ldlm_lock_decref(&lockh, LCK_PW); + } + } else { /* no cached lock, needs acquire lock on server side */ + fmkey->lfik_oa.o_valid |= OBD_MD_FLFLAGS; + fmkey->lfik_oa.o_flags |= OBD_FL_SRVLOCK; + } + +skip_locking: + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_OST_GET_INFO_FIEMAP); + if (req == NULL) + GOTO(drop_lock, rc = -ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_KEY, RCL_CLIENT, + sizeof(*fmkey)); + req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_VAL, RCL_CLIENT, + *buflen); + req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_VAL, RCL_SERVER, + *buflen); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GET_INFO); + if (rc != 0) { + ptlrpc_request_free(req); + GOTO(drop_lock, rc); + } + tmp = req_capsule_client_get(&req->rq_pill, &RMF_FIEMAP_KEY); + memcpy(tmp, fmkey, sizeof(*fmkey)); + tmp = req_capsule_client_get(&req->rq_pill, &RMF_FIEMAP_VAL); + memcpy(tmp, fiemap, *buflen); + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc != 0) + GOTO(fini_req, rc); + + reply = req_capsule_server_get(&req->rq_pill, &RMF_FIEMAP_VAL); + if (reply == NULL) + GOTO(fini_req, rc = -EPROTO); + + memcpy(fiemap, reply, *buflen); +fini_req: + ptlrpc_req_finished(req); +drop_lock: + if (mode) + ldlm_lock_decref(&lockh, LCK_PR); + RETURN(rc); +} + +#define MAX_OSC_DLMLOCK_LOOKUP 3 +/** + * Implementation of struct cl_object_operations::coo_req_attr_set() for osc + * layer. osc is responsible for struct obdo::o_id and struct obdo::o_seq + * fields. + */ +static void osc_req_attr_set(const struct lu_env *env, struct cl_object *obj, + struct cl_req_attr *attr) +{ + struct lov_oinfo *oinfo; + struct obdo *oa; + struct ost_lvb *lvb; + u64 flags = attr->cra_flags; + + oinfo = cl2osc(obj)->oo_oinfo; + lvb = &oinfo->loi_lvb; + oa = attr->cra_oa; + + if ((flags & OBD_MD_FLMTIME) != 0) { + oa->o_mtime = lvb->lvb_mtime; + oa->o_valid |= OBD_MD_FLMTIME; + } + /* XXX: + * I don't understand this part, what for OSC resets atime just + * set by VVP layer to 0 so that OST gets 0 instead of actual + * atime, bzzz. please inspect this place with extra care. 
+ */ + if ((flags & OBD_MD_FLATIME) && lvb->lvb_atime > oa->o_atime) { + oa->o_atime = lvb->lvb_atime; + oa->o_valid |= OBD_MD_FLATIME; + } + if ((flags & OBD_MD_FLCTIME) != 0) { + oa->o_ctime = lvb->lvb_ctime; + oa->o_valid |= OBD_MD_FLCTIME; + } + if (flags & OBD_MD_FLGROUP) { + ostid_set_seq(&oa->o_oi, ostid_seq(&oinfo->loi_oi)); + oa->o_valid |= OBD_MD_FLGROUP; + } + if (flags & OBD_MD_FLID) { + int rc; + + rc = ostid_set_id(&oa->o_oi, ostid_id(&oinfo->loi_oi)); + if (rc) { + CERROR("Bad %llu to set " DOSTID " : rc %d\n", + (unsigned long long)ostid_id(&oinfo->loi_oi), + POSTID(&oa->o_oi), rc); + } + oa->o_valid |= OBD_MD_FLID; + } + if (flags & OBD_MD_FLHANDLE) { + struct ldlm_lock *lock; + struct osc_page *opg; + int retry_cnt = 0; + + opg = osc_cl_page_osc(attr->cra_page, cl2osc(obj)); +lookup: + lock = osc_dlmlock_at_pgoff(env, cl2osc(obj), osc_index(opg), + OSC_DAP_FL_TEST_LOCK | OSC_DAP_FL_CANCELING); + if (lock == NULL && !opg->ops_srvlock) { + struct ldlm_resource *res; + struct ldlm_res_id *resname; + + if (retry_cnt < MAX_OSC_DLMLOCK_LOOKUP) { + /* the code is racing, delay to be sure to be + * out of it and try again, value based on + * debugging timing. */ + CERROR("Uncovered page by a LDLM lock, " + "retrying %d\n", ++retry_cnt); + smp_mb(); + mdelay(50); + goto lookup; + } + + CL_PAGE_DEBUG(D_ERROR, env, attr->cra_page, + "uncovered page!\n"); + + resname = &osc_env_info(env)->oti_resname; + ostid_build_res_name(&oinfo->loi_oi, resname); + res = ldlm_resource_get( + osc_export(cl2osc(obj))->exp_obd->obd_namespace, + NULL, resname, LDLM_EXTENT, 0); + if (IS_ERR(res)) + CERROR("No lock resource\n"); + else + ldlm_resource_dump(D_ERROR, res); + + libcfs_debug_dumpstack(NULL); + LBUG(); + } + + /* check for lockless io. */ + if (lock != NULL) { + oa->o_handle = lock->l_remote_handle; + oa->o_valid |= OBD_MD_FLHANDLE; + LDLM_LOCK_PUT(lock); + } + } +} + +static const struct cl_object_operations osc_ops = { + .coo_page_init = osc_page_init, + .coo_lock_init = osc_lock_init, + .coo_io_init = osc_io_init, + .coo_attr_get = osc_attr_get, + .coo_attr_update = osc_attr_update, + .coo_glimpse = osc_object_glimpse, + .coo_prune = osc_object_prune, + .coo_fiemap = osc_object_fiemap, + .coo_req_attr_set = osc_req_attr_set +}; + +static const struct lu_object_operations osc_lu_obj_ops = { + .loo_object_init = osc_object_init, + .loo_object_release = NULL, + .loo_object_free = osc_object_free, + .loo_object_print = osc_object_print, + .loo_object_invariant = NULL +}; + +struct lu_object *osc_object_alloc(const struct lu_env *env, + const struct lu_object_header *unused, + struct lu_device *dev) +{ + struct osc_object *osc; + struct lu_object *obj; + + OBD_SLAB_ALLOC_PTR_GFP(osc, osc_object_kmem, GFP_NOFS); + if (osc != NULL) { + obj = osc2lu(osc); + lu_object_init(obj, NULL, dev); + osc->oo_cl.co_ops = &osc_ops; + obj->lo_ops = &osc_lu_obj_ops; + osc->oo_obj_ops = &osc_object_ops; + } else + obj = NULL; + return obj; +} + +int osc_object_invalidate(const struct lu_env *env, struct osc_object *osc) +{ + ENTRY; + + CDEBUG(D_INODE, "Invalidate osc object: %p, # of active IOs: %d\n", + osc, atomic_read(&osc->oo_nr_ios)); + + wait_event_idle(osc->oo_io_waitq, atomic_read(&osc->oo_nr_ios) == 0); + + /* Discard all dirty pages of this object. */ + osc_cache_truncate_start(env, osc, 0, NULL); + + /* Discard all caching pages */ + osc_lock_discard_pages(env, osc, 0, CL_PAGE_EOF, true); + + /* Clear ast data of dlm lock. 
Do this after discarding all pages */ + cl_object_prune(env, osc2cl(osc)); + + RETURN(0); +} +EXPORT_SYMBOL(osc_object_invalidate); +/** @} osc */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_page.c b/drivers/staging/lustrefsx/lustre/osc/osc_page.c new file mode 100644 index 0000000000000..fa5d86e0f2ea4 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/osc_page.c @@ -0,0 +1,1158 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Implementation of cl_page for OSC layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_OSC +#include + +#include "osc_internal.h" + +static void osc_lru_del(struct client_obd *cli, struct osc_page *opg); +static void osc_lru_use(struct client_obd *cli, struct osc_page *opg); +static int osc_lru_alloc(const struct lu_env *env, struct client_obd *cli, + struct osc_page *opg); + +/** \addtogroup osc + * @{ + */ + +/* + * Page operations. + */ +static void osc_page_transfer_get(struct osc_page *opg, const char *label) +{ + struct cl_page *page = opg->ops_cl.cpl_page; + + LASSERT(!opg->ops_transfer_pinned); + cl_page_get(page); + lu_ref_add_atomic(&page->cp_reference, label, page); + opg->ops_transfer_pinned = 1; +} + +static void osc_page_transfer_put(const struct lu_env *env, + struct osc_page *opg) +{ + struct cl_page *page = opg->ops_cl.cpl_page; + + if (opg->ops_transfer_pinned) { + opg->ops_transfer_pinned = 0; + lu_ref_del(&page->cp_reference, "transfer", page); + cl_page_put(env, page); + } +} + +/** + * This is called once for every page when it is submitted for a transfer + * either opportunistic (osc_page_cache_add()), or immediate + * (osc_page_submit()). 
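+ *
+ * Hedged call-flow summary (both call sites appear in this file):
+ *
+ * osc_page_cache_add(): transfer_get() -> osc_queue_async_io() ->
+ * transfer_add(CRT_WRITE) on success;
+ * osc_page_submit(): transfer_get() -> transfer_add(crt) for a
+ * synchronous BRW transfer.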
+ */ +static void osc_page_transfer_add(const struct lu_env *env, + struct osc_page *opg, enum cl_req_type crt) +{ + struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj); + + osc_lru_use(osc_cli(obj), opg); +} + +int osc_page_cache_add(const struct lu_env *env, struct osc_page *opg, + struct cl_io *io, cl_commit_cbt cb) +{ + int result; + ENTRY; + + osc_page_transfer_get(opg, "transfer\0cache"); + result = osc_queue_async_io(env, io, opg, cb); + if (result != 0) + osc_page_transfer_put(env, opg); + else + osc_page_transfer_add(env, opg, CRT_WRITE); + + RETURN(result); +} + +void osc_index2policy(union ldlm_policy_data *policy, + const struct cl_object *obj, pgoff_t start, pgoff_t end) +{ + memset(policy, 0, sizeof *policy); + policy->l_extent.start = cl_offset(obj, start); + policy->l_extent.end = cl_offset(obj, end + 1) - 1; +} + +static inline s64 osc_submit_duration(struct osc_page *opg) +{ + if (ktime_to_ns(opg->ops_submit_time) == 0) + return 0; + + return ktime_ms_delta(ktime_get(), opg->ops_submit_time); +} + +static int osc_page_print(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t printer) +{ + struct osc_page *opg = cl2osc_page(slice); + struct osc_async_page *oap = &opg->ops_oap; + struct osc_object *obj = cl2osc(slice->cpl_obj); + struct client_obd *cli = &osc_export(obj)->exp_obd->u.cli; + + return (*printer)(env, cookie, LUSTRE_OSC_NAME"-page@%p %lu: " + "1< %#x %d %c %c > " + "2< %lld %u %u %#x %#x | %p %p %p > " + "3< %d %lld %d > " + "4< %d %d %d %lu %c | %c %c %c %c > " + "5< %c %c %c %c | %d %c | %d %c %c>\n", + opg, osc_index(opg), + /* 1 */ + oap->oap_magic, oap->oap_cmd, + list_empty_marker(&oap->oap_pending_item), + list_empty_marker(&oap->oap_rpc_item), + /* 2 */ + oap->oap_obj_off, oap->oap_page_off, oap->oap_count, + oap->oap_async_flags, oap->oap_brw_flags, + oap->oap_request, oap->oap_cli, obj, + /* 3 */ + opg->ops_transfer_pinned, + osc_submit_duration(opg), opg->ops_srvlock, + /* 4 */ + cli->cl_r_in_flight, cli->cl_w_in_flight, + cli->cl_max_rpcs_in_flight, + cli->cl_avail_grant, + waitqueue_active(&cli->cl_cache_waiters) ? 
'+' : '-', + list_empty_marker(&cli->cl_loi_ready_list), + list_empty_marker(&cli->cl_loi_hp_ready_list), + list_empty_marker(&cli->cl_loi_write_list), + list_empty_marker(&cli->cl_loi_read_list), + /* 5 */ + list_empty_marker(&obj->oo_ready_item), + list_empty_marker(&obj->oo_hp_ready_item), + list_empty_marker(&obj->oo_write_item), + list_empty_marker(&obj->oo_read_item), + atomic_read(&obj->oo_nr_reads), + list_empty_marker(&obj->oo_reading_exts), + atomic_read(&obj->oo_nr_writes), + list_empty_marker(&obj->oo_hp_exts), + list_empty_marker(&obj->oo_urgent_exts)); +} + +static void osc_page_delete(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + struct osc_page *opg = cl2osc_page(slice); + struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj); + int rc; + + ENTRY; + CDEBUG(D_TRACE, "%p\n", opg); + osc_page_transfer_put(env, opg); + rc = osc_teardown_async_page(env, obj, opg); + if (rc) { + CL_PAGE_DEBUG(D_ERROR, env, slice->cpl_page, + "Trying to teardown failed: %d\n", rc); + LASSERT(0); + } + + osc_lru_del(osc_cli(obj), opg); + + if (slice->cpl_page->cp_type == CPT_CACHEABLE) { + void *value = NULL; + + spin_lock(&obj->oo_tree_lock); + if (opg->ops_intree) { + value = radix_tree_delete(&obj->oo_tree, + osc_index(opg)); + if (value != NULL) { + --obj->oo_npages; + opg->ops_intree = 0; + } + } + spin_unlock(&obj->oo_tree_lock); + + LASSERT(ergo(value != NULL, value == opg)); + } + + EXIT; +} + +static void osc_page_clip(const struct lu_env *env, + const struct cl_page_slice *slice, + int from, int to) +{ + struct osc_page *opg = cl2osc_page(slice); + struct osc_async_page *oap = &opg->ops_oap; + + opg->ops_from = from; + /* argument @to is exclusive, but @ops_to is inclusive */ + opg->ops_to = to - 1; + /* This isn't really necessary for transient pages, but we also don't + * call clip on transient pages often, so it's OK. 
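+ *
+ * Example (illustrative, not from this patch): osc_page_clip(env,
+ * slice, 0, 512) leaves ops_from = 0 and ops_to = 511, i.e. exactly
+ * the first 512 bytes of the page take part in the transfer.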
+ */ + spin_lock(&oap->oap_lock); + oap->oap_async_flags |= ASYNC_COUNT_STABLE; + spin_unlock(&oap->oap_lock); +} + +static int osc_page_flush(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io) +{ + struct osc_page *opg = cl2osc_page(slice); + int rc = 0; + ENTRY; + rc = osc_flush_async_page(env, io, opg); + RETURN(rc); +} + +static void osc_page_touch(const struct lu_env *env, + const struct cl_page_slice *slice, size_t to) +{ + struct osc_page *opg = cl2osc_page(slice); + struct cl_object *obj = opg->ops_cl.cpl_obj; + + osc_page_touch_at(env, obj, osc_index(opg), to); +} + +static const struct cl_page_operations osc_page_ops = { + .cpo_print = osc_page_print, + .cpo_delete = osc_page_delete, + .cpo_clip = osc_page_clip, + .cpo_flush = osc_page_flush, + .cpo_page_touch = osc_page_touch, +}; + +int osc_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *cl_page, pgoff_t index) +{ + struct osc_object *osc = cl2osc(obj); + struct osc_page *opg = cl_object_page_slice(obj, cl_page); + struct osc_io *oio = osc_env_io(env); + int result; + + opg->ops_from = 0; + opg->ops_to = PAGE_SIZE - 1; + + INIT_LIST_HEAD(&opg->ops_lru); + + result = osc_prep_async_page(osc, opg, cl_page, cl_offset(obj, index)); + if (result != 0) + return result; + + opg->ops_srvlock = osc_io_srvlock(oio); + cl_page_slice_add(cl_page, &opg->ops_cl, obj, &osc_page_ops); + + /* reserve an LRU space for this page */ + if (cl_page->cp_type == CPT_CACHEABLE) { + result = osc_lru_alloc(env, osc_cli(osc), opg); + if (result == 0) { + result = radix_tree_preload(GFP_NOFS); + if (result == 0) { + spin_lock(&osc->oo_tree_lock); + result = radix_tree_insert(&osc->oo_tree, + index, opg); + if (result == 0) { + ++osc->oo_npages; + opg->ops_intree = 1; + } + spin_unlock(&osc->oo_tree_lock); + + radix_tree_preload_end(); + } + } + } + + return result; +} +EXPORT_SYMBOL(osc_page_init); + +/** + * Helper function called by osc_io_submit() for every page in an immediate + * transfer (i.e., transferred synchronously). + */ +void osc_page_submit(const struct lu_env *env, struct osc_page *opg, + enum cl_req_type crt, int brw_flags, ktime_t submit_time) +{ + struct osc_io *oio = osc_env_io(env); + struct osc_async_page *oap = &opg->ops_oap; + + LASSERTF(oap->oap_magic == OAP_MAGIC, "Bad oap magic: oap %p, " + "magic 0x%x\n", oap, oap->oap_magic); + LASSERT(oap->oap_async_flags & ASYNC_READY); + LASSERT(oap->oap_async_flags & ASYNC_COUNT_STABLE); + + oap->oap_cmd = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ; + oap->oap_page_off = opg->ops_from; + oap->oap_count = opg->ops_to - opg->ops_from + 1; + oap->oap_brw_flags = OBD_BRW_SYNC | brw_flags; + + if (oio->oi_cap_sys_resource) { + oap->oap_brw_flags |= OBD_BRW_SYS_RESOURCE; + oap->oap_cmd |= OBD_BRW_SYS_RESOURCE; + } + + opg->ops_submit_time = submit_time; + osc_page_transfer_get(opg, "transfer\0imm"); + osc_page_transfer_add(env, opg, crt); +} + +/* --------------- LRU page management ------------------ */ + +/* OSC is a natural place to manage LRU pages as applications are specialized + * to write OSC by OSC. Ideally, if one OSC is used more frequently it should + * occupy more LRU slots. On the other hand, we should avoid using up all LRU + * slots (client_obd::cl_lru_left) otherwise process has to be put into sleep + * for free LRU slots - this will be very bad so the algorithm requires each + * OSC to free slots voluntarily to maintain a reasonable number of free slots + * at any time. 
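+ *
+ * A rough invariant implied by the accounting below (hedged, it is not
+ * asserted anywhere in this patch): the shared counter behind
+ * cli->cl_lru_left plus the per-OSC sums of cl_lru_in_list and
+ * cl_lru_busy should add up to cl_cache->ccc_lru_max.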
+ */
+
+static DECLARE_WAIT_QUEUE_HEAD(osc_lru_waitq);
+
+/**
+ * LRU pages are freed in batch mode. OSC should at least free this
+ * number of pages to avoid running out of LRU slots.
+ */
+static inline int lru_shrink_min(struct client_obd *cli)
+{
+ return cli->cl_max_pages_per_rpc * 2;
+}
+
+/**
+ * Free this number at most, otherwise it will take too long to finish.
+ */
+static inline int lru_shrink_max(struct client_obd *cli)
+{
+ return cli->cl_max_pages_per_rpc * cli->cl_max_rpcs_in_flight;
+}
+
+/**
+ * Check if we can free LRU slots from this OSC. If there exist LRU waiters,
+ * we should free slots aggressively. In this way, slots are freed in a steady
+ * step to maintain fairness among OSCs.
+ *
+ * Return how many LRU pages should be freed.
+ */
+static int osc_cache_too_much(struct client_obd *cli)
+{
+ struct cl_client_cache *cache = cli->cl_cache;
+ long pages = atomic_long_read(&cli->cl_lru_in_list);
+ unsigned long budget;
+
+ LASSERT(cache != NULL);
+ budget = cache->ccc_lru_max / (atomic_read(&cache->ccc_users) - 2);
+
+ /* if it's going to run out of LRU slots, we should free some, but not
+ * too much to maintain fairness among OSCs. */
+ if (atomic_long_read(cli->cl_lru_left) < cache->ccc_lru_max >> 2) {
+ if (pages >= budget)
+ return lru_shrink_max(cli);
+ else if (pages >= budget / 2)
+ return lru_shrink_min(cli);
+ } else {
+ time64_t duration = ktime_get_real_seconds();
+ long timediff;
+
+ /* knock out pages by duration of no IO activity */
+ duration -= cli->cl_lru_last_used;
+ /*
+ * The difference shouldn't be more than 70 years
+ * so we can safely cast to a long. Round to
+ * approximately 1 minute.
+ */
+ timediff = (long)(duration >> 6);
+ if (timediff > 0 && pages >= budget / timediff)
+ return lru_shrink_min(cli);
+ }
+ return 0;
+}
+
+int lru_queue_work(const struct lu_env *env, void *data)
+{
+ struct client_obd *cli = data;
+ int count;
+
+ CDEBUG(D_CACHE, "%s: run LRU work for client obd\n", cli_name(cli));
+ count = osc_cache_too_much(cli);
+ if (count > 0) {
+ int rc = osc_lru_shrink(env, cli, count, false);
+
+ CDEBUG(D_CACHE, "%s: shrank %d/%d pages from client obd\n",
+ cli_name(cli), rc, count);
+ if (rc >= count) {
+ CDEBUG(D_CACHE, "%s: queue again\n", cli_name(cli));
+ ptlrpcd_queue_work(cli->cl_lru_work);
+ }
+ }
+
+ RETURN(0);
+}
+
+void osc_lru_add_batch(struct client_obd *cli, struct list_head *plist)
+{
+ LIST_HEAD(lru);
+ struct osc_async_page *oap;
+ long npages = 0;
+
+ list_for_each_entry(oap, plist, oap_pending_item) {
+ struct osc_page *opg = oap2osc_page(oap);
+
+ if (!opg->ops_in_lru)
+ continue;
+
+ ++npages;
+ LASSERT(list_empty(&opg->ops_lru));
+ list_add(&opg->ops_lru, &lru);
+ }
+
+ if (npages > 0) {
+ spin_lock(&cli->cl_lru_list_lock);
+ list_splice_tail(&lru, &cli->cl_lru_list);
+ atomic_long_sub(npages, &cli->cl_lru_busy);
+ atomic_long_add(npages, &cli->cl_lru_in_list);
+ cli->cl_lru_last_used = ktime_get_real_seconds();
+ spin_unlock(&cli->cl_lru_list_lock);
+
+ if (waitqueue_active(&osc_lru_waitq))
+ (void)ptlrpcd_queue_work(cli->cl_lru_work);
+ }
+}
+
+static void __osc_lru_del(struct client_obd *cli, struct osc_page *opg)
+{
+ LASSERT(atomic_long_read(&cli->cl_lru_in_list) > 0);
+ list_del_init(&opg->ops_lru);
+ atomic_long_dec(&cli->cl_lru_in_list);
+}
+
+/**
+ * Page is being destroyed. The page may not be in the LRU list if the
+ * transfer has never finished (an error occurred).
+ */
+static void osc_lru_del(struct client_obd *cli, struct osc_page *opg)
+{
+ if (opg->ops_in_lru) {
+ spin_lock(&cli->cl_lru_list_lock);
+ if (!list_empty(&opg->ops_lru)) {
+ __osc_lru_del(cli, opg);
+ } else {
+ LASSERT(atomic_long_read(&cli->cl_lru_busy) > 0);
+ atomic_long_dec(&cli->cl_lru_busy);
+ }
+ spin_unlock(&cli->cl_lru_list_lock);
+
+ atomic_long_inc(cli->cl_lru_left);
+ /* this is a great place to release more LRU pages if
+ * this osc occupies too many LRU pages and kernel is
+ * stealing one of them. */
+ if (osc_cache_too_much(cli)) {
+ CDEBUG(D_CACHE, "%s: queue LRU work\n", cli_name(cli));
+ (void)ptlrpcd_queue_work(cli->cl_lru_work);
+ }
+ wake_up(&osc_lru_waitq);
+ } else {
+ LASSERT(list_empty(&opg->ops_lru));
+ }
+}
+
+/**
+ * Delete the page from the LRU list, as it is being redirtied.
+ */
+static void osc_lru_use(struct client_obd *cli, struct osc_page *opg)
+{
+ /* If page is being transferred for the first time,
+ * ops_lru should be empty */
+ if (opg->ops_in_lru) {
+ if (list_empty(&opg->ops_lru))
+ return;
+ spin_lock(&cli->cl_lru_list_lock);
+ if (!list_empty(&opg->ops_lru)) {
+ __osc_lru_del(cli, opg);
+ atomic_long_inc(&cli->cl_lru_busy);
+ }
+ spin_unlock(&cli->cl_lru_list_lock);
+ }
+}
+
+static void discard_pagevec(const struct lu_env *env, struct cl_io *io,
+ struct cl_page **pvec, int max_index)
+{
+ struct pagevec *pagevec = &osc_env_info(env)->oti_pagevec;
+ int i;
+
+ ll_pagevec_init(pagevec, 0);
+ for (i = 0; i < max_index; i++) {
+ struct cl_page *page = pvec[i];
+
+ LASSERT(cl_page_is_owned(page, io));
+ cl_page_delete(env, page);
+ cl_page_discard(env, io, page);
+ cl_page_disown(env, io, page);
+ cl_pagevec_put(env, page, pagevec);
+
+ pvec[i] = NULL;
+ }
+ pagevec_release(pagevec);
+}
+
+/**
+ * Check if a cl_page can be released, i.e., it's not being used.
+ *
+ * If unstable accounting is turned on, bulk transfer may hold one refcount
+ * for recovery so we need to check the vmpage refcount as well; otherwise,
+ * even if we can destroy the cl_page, the corresponding vmpage can't be
+ * reused.
+ */
+static inline bool lru_page_busy(struct client_obd *cli, struct cl_page *page)
+{
+ if (cl_page_in_use_noref(page))
+ return true;
+
+ if (cli->cl_cache->ccc_unstable_check) {
+ struct page *vmpage = cl_page_vmpage(page);
+
+ /* vmpage have two known users: cl_page and VM page cache */
+ if (page_count(vmpage) - page_mapcount(vmpage) > 2)
+ return true;
+ }
+ return false;
+}
+
+/**
+ * Drop at most @target pages from the LRU.
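+ *
+ * Hedged usage example (it mirrors lru_queue_work() above):
+ *
+ * count = osc_cache_too_much(cli);
+ * if (count > 0)
+ * rc = osc_lru_shrink(env, cli, count, false);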
+ */ +long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli, + long target, bool force) +{ + struct cl_io *io; + struct cl_object *clobj = NULL; + struct cl_page **pvec; + struct osc_page *opg; + long count = 0; + int maxscan = 0; + int index = 0; + int rc = 0; + ENTRY; + + LASSERT(atomic_long_read(&cli->cl_lru_in_list) >= 0); + if (atomic_long_read(&cli->cl_lru_in_list) == 0 || target <= 0) + RETURN(0); + + CDEBUG(D_CACHE, "%s: shrinkers: %d, force: %d\n", + cli_name(cli), atomic_read(&cli->cl_lru_shrinkers), force); + if (!force) { + if (atomic_read(&cli->cl_lru_shrinkers) > 0) + RETURN(-EBUSY); + + if (atomic_inc_return(&cli->cl_lru_shrinkers) > 1) { + atomic_dec(&cli->cl_lru_shrinkers); + RETURN(-EBUSY); + } + } else { + atomic_inc(&cli->cl_lru_shrinkers); + } + + pvec = (struct cl_page **)osc_env_info(env)->oti_pvec; + io = osc_env_thread_io(env); + + spin_lock(&cli->cl_lru_list_lock); + if (force) + cli->cl_lru_reclaim++; + maxscan = min(target << 1, atomic_long_read(&cli->cl_lru_in_list)); + while (!list_empty(&cli->cl_lru_list)) { + struct cl_page *page; + bool will_free = false; + + if (!force && atomic_read(&cli->cl_lru_shrinkers) > 1) + break; + + if (--maxscan < 0) + break; + + opg = list_first_entry(&cli->cl_lru_list, struct osc_page, + ops_lru); + page = opg->ops_cl.cpl_page; + if (lru_page_busy(cli, page)) { + list_move_tail(&opg->ops_lru, &cli->cl_lru_list); + continue; + } + + LASSERT(page->cp_obj != NULL); + if (clobj != page->cp_obj) { + struct cl_object *tmp = page->cp_obj; + + cl_object_get(tmp); + spin_unlock(&cli->cl_lru_list_lock); + + if (clobj != NULL) { + discard_pagevec(env, io, pvec, index); + index = 0; + + cl_io_fini(env, io); + cl_object_put(env, clobj); + clobj = NULL; + } + + clobj = tmp; + io->ci_obj = clobj; + io->ci_ignore_layout = 1; + rc = cl_io_init(env, io, CIT_MISC, clobj); + + spin_lock(&cli->cl_lru_list_lock); + + if (rc != 0) + break; + + ++maxscan; + continue; + } + + if (cl_page_own_try(env, io, page) == 0) { + if (!lru_page_busy(cli, page)) { + /* remove it from lru list earlier to avoid + * lock contention */ + __osc_lru_del(cli, opg); + opg->ops_in_lru = 0; /* will be discarded */ + + cl_page_get(page); + will_free = true; + } else { + cl_page_disown(env, io, page); + } + } + + if (!will_free) { + list_move_tail(&opg->ops_lru, &cli->cl_lru_list); + continue; + } + + /* Don't discard and free the page with cl_lru_list held */ + pvec[index++] = page; + if (unlikely(index == OTI_PVEC_SIZE)) { + spin_unlock(&cli->cl_lru_list_lock); + discard_pagevec(env, io, pvec, index); + index = 0; + + spin_lock(&cli->cl_lru_list_lock); + } + + if (++count >= target) + break; + } + spin_unlock(&cli->cl_lru_list_lock); + + if (clobj != NULL) { + discard_pagevec(env, io, pvec, index); + + cl_io_fini(env, io); + cl_object_put(env, clobj); + } + + atomic_dec(&cli->cl_lru_shrinkers); + if (count > 0) { + atomic_long_add(count, cli->cl_lru_left); + wake_up(&osc_lru_waitq); + } + RETURN(count > 0 ? count : rc); +} +EXPORT_SYMBOL(osc_lru_shrink); + +/** + * Reclaim LRU pages by an IO thread. The caller wants to reclaim at least + * \@npages of LRU slots. For performance consideration, it's better to drop + * LRU pages in batch. Therefore, the actual number is adjusted at least + * max_pages_per_rpc. 
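+ *
+ * Example (hedged): a request to reclaim a single slot with
+ * cl_max_pages_per_rpc = 256 still shrinks up to 256 pages, because
+ * npages is first raised with max_t(int, npages,
+ * cli->cl_max_pages_per_rpc) below.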
+ */ +static long osc_lru_reclaim(struct client_obd *cli, unsigned long npages) +{ + struct lu_env *env; + struct cl_client_cache *cache = cli->cl_cache; + struct client_obd *scan; + int max_scans; + __u16 refcheck; + long rc = 0; + ENTRY; + + LASSERT(cache != NULL); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(rc); + + npages = max_t(int, npages, cli->cl_max_pages_per_rpc); + CDEBUG(D_CACHE, "%s: start to reclaim %ld pages from LRU\n", + cli_name(cli), npages); + rc = osc_lru_shrink(env, cli, npages, true); + if (rc >= npages) { + CDEBUG(D_CACHE, "%s: reclaimed %ld/%ld pages from LRU\n", + cli_name(cli), rc, npages); + if (osc_cache_too_much(cli) > 0) + ptlrpcd_queue_work(cli->cl_lru_work); + GOTO(out, rc); + } else if (rc > 0) { + npages -= rc; + } + + CDEBUG(D_CACHE, "%s: cli %p no free slots, pages: %ld/%ld, want: %ld\n", + cli_name(cli), cli, atomic_long_read(&cli->cl_lru_in_list), + atomic_long_read(&cli->cl_lru_busy), npages); + + /* Reclaim LRU slots from other client_obd as it can't free enough + * from its own. This should rarely happen. */ + spin_lock(&cache->ccc_lru_lock); + LASSERT(!list_empty(&cache->ccc_lru)); + + cache->ccc_lru_shrinkers++; + list_move_tail(&cli->cl_lru_osc, &cache->ccc_lru); + + max_scans = atomic_read(&cache->ccc_users) - 2; + while (--max_scans > 0 && + (scan = list_first_entry_or_null(&cache->ccc_lru, + struct client_obd, + cl_lru_osc)) != NULL) { + CDEBUG(D_CACHE, "%s: cli %p LRU pages: %ld, busy: %ld.\n", + cli_name(scan), scan, + atomic_long_read(&scan->cl_lru_in_list), + atomic_long_read(&scan->cl_lru_busy)); + + list_move_tail(&scan->cl_lru_osc, &cache->ccc_lru); + if (osc_cache_too_much(scan) > 0) { + spin_unlock(&cache->ccc_lru_lock); + + rc = osc_lru_shrink(env, scan, npages, true); + spin_lock(&cache->ccc_lru_lock); + if (rc >= npages) + break; + if (rc > 0) + npages -= rc; + } + } + spin_unlock(&cache->ccc_lru_lock); + +out: + cl_env_put(env, &refcheck); + CDEBUG(D_CACHE, "%s: cli %p freed %ld pages.\n", + cli_name(cli), cli, rc); + return rc; +} + +/** + * osc_lru_alloc() is called to allocate an LRU slot for a cl_page. + * + * Usually the LRU slots are reserved in osc_io_iter_rw_init(). + * Only in the case that the LRU slots are in extreme shortage, it should + * have reserved enough slots for an IO. + */ +static int osc_lru_alloc(const struct lu_env *env, struct client_obd *cli, + struct osc_page *opg) +{ + struct osc_io *oio = osc_env_io(env); + int rc = 0; + + ENTRY; + + if (cli->cl_cache == NULL) /* shall not be in LRU */ + RETURN(0); + + if (oio->oi_lru_reserved > 0) { + --oio->oi_lru_reserved; + goto out; + } + + LASSERT(atomic_long_read(cli->cl_lru_left) >= 0); + while (!atomic_long_add_unless(cli->cl_lru_left, -1, 0)) { + /* run out of LRU spaces, try to drop some by itself */ + rc = osc_lru_reclaim(cli, 1); + if (rc < 0) + break; + if (rc > 0) + continue; + /* IO issued by readahead, don't try hard */ + if (oio->oi_is_readahead) { + if (atomic_long_read(cli->cl_lru_left) > 0) + continue; + rc = -EBUSY; + break; + } + + cond_resched(); + rc = l_wait_event_abortable( + osc_lru_waitq, + atomic_long_read(cli->cl_lru_left) > 0); + if (rc < 0) { + rc = -EINTR; + break; + } + } + +out: + if (rc >= 0) { + atomic_long_inc(&cli->cl_lru_busy); + opg->ops_in_lru = 1; + rc = 0; + } + + RETURN(rc); +} + +/** + * osc_lru_reserve() is called to reserve enough LRU slots for I/O. + * + * The benefit of doing this is to reduce contention against atomic counter + * cl_lru_left by changing it from per-page access to per-IO access. 
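+ *
+ * Hedged pairing sketch (reserve/consume/unreserve as assumed by
+ * osc_lru_alloc() above; all names are from this file):
+ *
+ * oio->oi_lru_reserved = osc_lru_reserve(cli, npages);
+ * ... each osc_page_init() consumes one slot via osc_lru_alloc() ...
+ * osc_lru_unreserve(cli, oio->oi_lru_reserved);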
+ */ +unsigned long osc_lru_reserve(struct client_obd *cli, unsigned long npages) +{ + unsigned long reserved = 0; + unsigned long max_pages; + unsigned long c; + int rc; + +again: + c = atomic_long_read(cli->cl_lru_left); + if (c < npages && osc_lru_reclaim(cli, npages) > 0) + c = atomic_long_read(cli->cl_lru_left); + + if (c < npages) { + /* + * Trigger writeback in the hope some LRU slot could + * be freed. + */ + rc = ptlrpcd_queue_work(cli->cl_writeback_work); + if (rc) + return 0; + } + + while (c >= npages) { + if (c == atomic_long_cmpxchg(cli->cl_lru_left, c, c - npages)) { + reserved = npages; + break; + } + c = atomic_long_read(cli->cl_lru_left); + } + + if (reserved != npages) { + cond_resched(); + rc = l_wait_event_abortable( + osc_lru_waitq, + atomic_long_read(cli->cl_lru_left) > 0); + goto again; + } + + max_pages = cli->cl_max_pages_per_rpc * cli->cl_max_rpcs_in_flight; + if (atomic_long_read(cli->cl_lru_left) < max_pages) { + /* If there aren't enough pages in the per-OSC LRU then + * wake up the LRU thread to try and clear out space, so + * we don't block if pages are being dirtied quickly. */ + CDEBUG(D_CACHE, "%s: queue LRU, left: %lu/%ld.\n", + cli_name(cli), atomic_long_read(cli->cl_lru_left), + max_pages); + (void)ptlrpcd_queue_work(cli->cl_lru_work); + } + + return reserved; +} + +/** + * osc_lru_unreserve() is called to unreserve LRU slots. + * + * LRU slots reserved by osc_lru_reserve() may have entries left due to several + * reasons such as page already existing or I/O error. Those reserved slots + * should be freed by calling this function. + */ +void osc_lru_unreserve(struct client_obd *cli, unsigned long npages) +{ + atomic_long_add(npages, cli->cl_lru_left); + wake_up(&osc_lru_waitq); +} + +/** + * Atomic operations are expensive. We accumulate the accounting for the + * same page zone to get better performance. + * In practice this can work pretty good because the pages in the same RPC + * are likely from the same page zone. + */ +#ifdef HAVE_NR_UNSTABLE_NFS +/* Old kernels use a separate counter for unstable pages, + * newer kernels treat them like any other writeback. + */ +#define NR_WRITEBACK NR_UNSTABLE_NFS +#endif + +static inline void unstable_page_accounting(struct ptlrpc_bulk_desc *desc, + struct osc_brw_async_args *aa, + int factor) +{ + int page_count; + void *zone = NULL; + int count = 0; + int i; + + if (desc != NULL) { + page_count = desc->bd_iov_count; + } else { + page_count = aa->aa_page_count; + } + + for (i = 0; i < page_count; i++) { + void *pz; + if (desc) + pz = page_zone(desc->bd_vec[i].bv_page); + else + pz = page_zone(aa->aa_ppga[i]->pg); + + if (likely(pz == zone)) { + ++count; + continue; + } + + if (count > 0) { + mod_zone_page_state(zone, NR_WRITEBACK, + factor * count); + count = 0; + } + zone = pz; + ++count; + } + if (count > 0) + mod_zone_page_state(zone, NR_WRITEBACK, factor * count); +} + +static inline void add_unstable_page_accounting(struct ptlrpc_bulk_desc *desc, + struct osc_brw_async_args *aa) +{ + unstable_page_accounting(desc, aa, 1); +} + +static inline void dec_unstable_page_accounting(struct ptlrpc_bulk_desc *desc, + struct osc_brw_async_args *aa) +{ + unstable_page_accounting(desc, aa, -1); +} + +/** + * Performs "unstable" page accounting. This function balances the + * increment operations performed in osc_inc_unstable_pages. It is + * registered as the RPC request callback, and is executed when the + * bulk RPC is committed on the server. 
+ * involved in the bulk transfer are no longer considered unstable.
+ *
+ * If this function is called, the request should have been committed
+ * or req->rq_unstable must have been set; it implies that the unstable
+ * statistics have been added.
+ */
+void osc_dec_unstable_pages(struct ptlrpc_request *req)
+{
+	struct ptlrpc_bulk_desc *desc = req->rq_bulk;
+	struct osc_brw_async_args *aa = (void *)&req->rq_async_args;
+	struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
+	int page_count;
+	long unstable_count;
+
+	if (desc)
+		page_count = desc->bd_iov_count;
+	else
+		page_count = aa->aa_page_count;
+
+	LASSERT(page_count >= 0);
+
+	dec_unstable_page_accounting(desc, aa);
+
+	unstable_count = atomic_long_sub_return(page_count,
+						&cli->cl_unstable_count);
+	LASSERT(unstable_count >= 0);
+
+	unstable_count = atomic_long_sub_return(page_count,
+						&cli->cl_cache->ccc_unstable_nr);
+	LASSERT(unstable_count >= 0);
+	if (unstable_count == 0)
+		wake_up(&cli->cl_cache->ccc_unstable_waitq);
+
+	if (waitqueue_active(&osc_lru_waitq))
+		(void)ptlrpcd_queue_work(cli->cl_lru_work);
+}
+
+/**
+ * "unstable" page accounting. See: osc_dec_unstable_pages.
+ */
+void osc_inc_unstable_pages(struct ptlrpc_request *req)
+{
+	struct ptlrpc_bulk_desc *desc = req->rq_bulk;
+	struct osc_brw_async_args *aa = (void *)&req->rq_async_args;
+	struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
+	long page_count;
+
+	/* No unstable page tracking */
+	if (cli->cl_cache == NULL || !cli->cl_cache->ccc_unstable_check)
+		return;
+
+	if (desc)
+		page_count = desc->bd_iov_count;
+	else
+		page_count = aa->aa_page_count;
+
+	add_unstable_page_accounting(desc, aa);
+	atomic_long_add(page_count, &cli->cl_unstable_count);
+	atomic_long_add(page_count, &cli->cl_cache->ccc_unstable_nr);
+
+	/* If the request has already been committed (i.e. brw_commit
+	 * called via rq_commit_cb), we need to undo the unstable page
+	 * increments we just performed because rq_commit_cb won't be
+	 * called again. */
+	spin_lock(&req->rq_lock);
+	if (unlikely(req->rq_committed)) {
+		spin_unlock(&req->rq_lock);
+
+		osc_dec_unstable_pages(req);
+	} else {
+		req->rq_unstable = 1;
+		spin_unlock(&req->rq_lock);
+	}
+}
+
+/**
+ * Check whether this OSC should piggyback a SOFT_SYNC flag to the OST.
+ * This function will be called by every BRW RPC so it's critical
+ * to make this function fast.
+ */
+bool osc_over_unstable_soft_limit(struct client_obd *cli)
+{
+	long unstable_nr, osc_unstable_count;
+
+	/* Can't check cli->cl_unstable_count, therefore, no soft limit */
+	if (cli->cl_cache == NULL || !cli->cl_cache->ccc_unstable_check)
+		return false;
+
+	osc_unstable_count = atomic_long_read(&cli->cl_unstable_count);
+	unstable_nr = atomic_long_read(&cli->cl_cache->ccc_unstable_nr);
+
+	CDEBUG(D_CACHE,
+	       "%s: cli: %p unstable pages: %lu, osc unstable pages: %lu\n",
+	       cli_name(cli), cli, unstable_nr, osc_unstable_count);
+
+	/* If the LRU slots are in shortage - 25% remaining AND this OSC
+	 * has one full RPC window of unstable pages, it's a good chance
+	 * to piggyback a SOFT_SYNC flag.
+	 * Note that the OST won't respond to the SOFT_SYNC request
+	 * immediately, so active OSCs will have more chances to carry
+	 * the flag, which is reasonable.
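+	 *
+	 * Worked example (hypothetical values): with ccc_lru_max = 4096,
+	 * cl_max_pages_per_rpc = 256 and cl_max_rpcs_in_flight = 8,
+	 * SOFT_SYNC is piggybacked once more than 4096 >> 2 = 1024
+	 * unstable pages exist cache-wide AND this OSC holds more than
+	 * 256 * 8 = 2048 of them.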
+	 */
+	return unstable_nr > cli->cl_cache->ccc_lru_max >> 2 &&
+	       osc_unstable_count > cli->cl_max_pages_per_rpc *
+				    cli->cl_max_rpcs_in_flight;
+}
+
+/**
+ * Return how many LRU pages are in the cache of all OSC devices
+ *
+ * \retval	number of cached LRU pages times reclamation tendency
+ * \retval	SHRINK_STOP if it cannot do any scanning at this time
+ */
+unsigned long osc_cache_shrink_count(struct shrinker *sk,
+				     struct shrink_control *sc)
+{
+	struct client_obd *cli;
+	unsigned long cached = 0;
+
+	spin_lock(&osc_shrink_lock);
+	list_for_each_entry(cli, &osc_shrink_list, cl_shrink_list)
+		cached += atomic_long_read(&cli->cl_lru_in_list);
+	spin_unlock(&osc_shrink_lock);
+
+	return (cached * sysctl_vfs_cache_pressure) / 100;
+}
+
+/**
+ * Scan and try to reclaim sc->nr_to_scan cached LRU pages
+ *
+ * \retval	number of cached LRU pages reclaimed
+ * \retval	SHRINK_STOP if it cannot do any scanning at this time
+ *
+ * The Linux kernel will call this shrinker scan routine repeatedly with
+ * sc->nr_to_scan = SHRINK_BATCH (128 for now) until it has reclaimed
+ * enough memory.
+ *
+ * If sc->nr_to_scan is 0, the VM is only querying the cache size; we don't
+ * need to scan or reclaim LRU pages, so just return 0 and let
+ * osc_cache_shrink_count() report the LRU page count.
+ */
+unsigned long osc_cache_shrink_scan(struct shrinker *sk,
+				    struct shrink_control *sc)
+{
+	struct client_obd *cli;
+	struct client_obd *stop_anchor = NULL;
+	struct lu_env *env;
+	long shrank = 0;
+	int rc;
+	__u16 refcheck;
+
+	if (sc->nr_to_scan == 0)
+		return 0;
+
+	if (!(sc->gfp_mask & __GFP_FS))
+		return SHRINK_STOP;
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		return SHRINK_STOP;
+
+	spin_lock(&osc_shrink_lock);
+	while ((cli = list_first_entry_or_null(&osc_shrink_list,
+					       struct client_obd,
+					       cl_shrink_list)) != NULL) {
+		if (stop_anchor == NULL)
+			stop_anchor = cli;
+		else if (cli == stop_anchor)
+			break;
+
+		list_move_tail(&cli->cl_shrink_list, &osc_shrink_list);
+		spin_unlock(&osc_shrink_lock);
+
+		/* shrink no more than max_pages_per_rpc for an OSC */
+		rc = osc_lru_shrink(env, cli, (sc->nr_to_scan - shrank) >
+				    cli->cl_max_pages_per_rpc ?
+				    cli->cl_max_pages_per_rpc :
+				    sc->nr_to_scan - shrank, true);
+		if (rc > 0)
+			shrank += rc;
+
+		if (shrank >= sc->nr_to_scan)
+			goto out;
+
+		spin_lock(&osc_shrink_lock);
+	}
+	spin_unlock(&osc_shrink_lock);
+
+out:
+	cl_env_put(env, &refcheck);
+
+	return shrank;
+}
+
+/** @} osc */
diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_quota.c b/drivers/staging/lustrefsx/lustre/osc/osc_quota.c
new file mode 100644
index 0000000000000..0f0795274593c
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/osc/osc_quota.c
@@ -0,0 +1,321 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2012, 2017, Intel Corporation.
+ *
+ * Code originally extracted from quota directory
+ */
+
+#include
+#include
+
+#include "osc_internal.h"
+
+static inline struct osc_quota_info *osc_oqi_alloc(u32 id)
+{
+	struct osc_quota_info *oqi;
+
+	OBD_SLAB_ALLOC_PTR(oqi, osc_quota_kmem);
+	if (oqi != NULL)
+		oqi->oqi_id = id;
+
+	return oqi;
+}
+
+int osc_quota_chkdq(struct client_obd *cli, const unsigned int qid[])
+{
+	int type;
+	ENTRY;
+
+	for (type = 0; type < LL_MAXQUOTAS; type++) {
+		struct osc_quota_info *oqi;
+
+		oqi = cfs_hash_lookup(cli->cl_quota_hash[type], &qid[type]);
+		if (oqi) {
+			/* do not try to access oqi here, it could have been
+			 * freed by osc_quota_setdq() */
+
+			/* the slot is busy, the user is about to run out of
+			 * quota space on this OST */
+			CDEBUG(D_QUOTA, "chkdq found noquota for %s %d\n",
+			       type == USRQUOTA ? "user" : "group", qid[type]);
+			RETURN(-EDQUOT);
+		}
+	}
+
+	RETURN(0);
+}
+
+static inline u32 md_quota_flag(int qtype)
+{
+	switch (qtype) {
+	case USRQUOTA:
+		return OBD_MD_FLUSRQUOTA;
+	case GRPQUOTA:
+		return OBD_MD_FLGRPQUOTA;
+	case PRJQUOTA:
+		return OBD_MD_FLPRJQUOTA;
+	default:
+		return 0;
+	}
+}
+
+static inline u32 fl_quota_flag(int qtype)
+{
+	switch (qtype) {
+	case USRQUOTA:
+		return OBD_FL_NO_USRQUOTA;
+	case GRPQUOTA:
+		return OBD_FL_NO_GRPQUOTA;
+	case PRJQUOTA:
+		return OBD_FL_NO_PRJQUOTA;
+	default:
+		return 0;
+	}
+}
+
+int osc_quota_setdq(struct client_obd *cli, __u64 xid, const unsigned int qid[],
+		    u64 valid, u32 flags)
+{
+	int type;
+	int rc = 0;
+
+	ENTRY;
+
+	if ((valid & (OBD_MD_FLALLQUOTA)) == 0)
+		RETURN(0);
+
+	mutex_lock(&cli->cl_quota_mutex);
+	cli->cl_root_squash = !!(flags & OBD_FL_ROOT_SQUASH);
+	/* Still mark the quota as running out for the old request, because it
+	 * could be processed after the new request at the OST. The side effect
+	 * is that the following request will be processed synchronously, but
+	 * this will not break quota enforcement. */
+	if (cli->cl_quota_last_xid > xid && !(flags & OBD_FL_NO_QUOTA_ALL))
+		GOTO(out_unlock, rc);
+
+	if (cli->cl_quota_last_xid < xid)
+		cli->cl_quota_last_xid = xid;
+
+	for (type = 0; type < LL_MAXQUOTAS; type++) {
+		struct osc_quota_info *oqi;
+
+		if ((valid & md_quota_flag(type)) == 0)
+			continue;
+
+		/* lookup the ID in the per-type hash table */
+		oqi = cfs_hash_lookup(cli->cl_quota_hash[type], &qid[type]);
+		if ((flags & fl_quota_flag(type)) != 0) {
+			/* This ID is getting close to its quota limit, let's
+			 * switch to sync I/O */
+			if (oqi != NULL)
+				continue;
+
+			oqi = osc_oqi_alloc(qid[type]);
+			if (oqi == NULL) {
+				rc = -ENOMEM;
+				break;
+			}
+
+			rc = cfs_hash_add_unique(cli->cl_quota_hash[type],
+						 &qid[type], &oqi->oqi_hash);
+			/* race with others? */
+			if (rc == -EALREADY) {
+				rc = 0;
+				OBD_SLAB_FREE_PTR(oqi, osc_quota_kmem);
+			}
+
+			CDEBUG(D_QUOTA, "%s: setdq to insert for %s %d (%d)\n",
+			       cli_name(cli), qtype_name(type), qid[type], rc);
+		} else {
+			/* This ID is now off the hook, let's remove it from
+			 * the hash table */
+			if (oqi == NULL)
+				continue;
+
+			oqi = cfs_hash_del_key(cli->cl_quota_hash[type],
+					       &qid[type]);
+			if (oqi)
+				OBD_SLAB_FREE_PTR(oqi, osc_quota_kmem);
+
+			CDEBUG(D_QUOTA, "%s: setdq to remove for %s %d (%p)\n",
+			       cli_name(cli), qtype_name(type), qid[type], oqi);
+		}
+	}
+
+out_unlock:
+	mutex_unlock(&cli->cl_quota_mutex);
+	RETURN(rc);
+}
+
+/*
+ * Hash operations for uid/gid <-> osc_quota_info
+ */
+static unsigned
+oqi_hashfn(struct cfs_hash *hs, const void *key, unsigned mask)
+{
+	return cfs_hash_u32_hash(*((__u32 *)key), mask);
+}
+
+static int
+oqi_keycmp(const void *key, struct hlist_node *hnode)
+{
+	struct osc_quota_info *oqi;
+	u32 uid;
+
+	LASSERT(key != NULL);
+	uid = *((u32 *)key);
+	oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash);
+
+	return uid == oqi->oqi_id;
+}
+
+static void *
+oqi_key(struct hlist_node *hnode)
+{
+	struct osc_quota_info *oqi;
+	oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash);
+	return &oqi->oqi_id;
+}
+
+static void *
+oqi_object(struct hlist_node *hnode)
+{
+	return hlist_entry(hnode, struct osc_quota_info, oqi_hash);
+}
+
+static void
+oqi_get(struct cfs_hash *hs, struct hlist_node *hnode)
+{
+}
+
+static void
+oqi_put_locked(struct cfs_hash *hs, struct hlist_node *hnode)
+{
+}
+
+static void
+oqi_exit(struct cfs_hash *hs, struct hlist_node *hnode)
+{
+	struct osc_quota_info *oqi;
+
+	oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash);
+
+	OBD_SLAB_FREE_PTR(oqi, osc_quota_kmem);
+}
+
+#define HASH_QUOTA_BKT_BITS 5
+#define HASH_QUOTA_CUR_BITS 5
+#define HASH_QUOTA_MAX_BITS 15
+
+static struct cfs_hash_ops quota_hash_ops = {
+	.hs_hash	= oqi_hashfn,
+	.hs_keycmp	= oqi_keycmp,
+	.hs_key		= oqi_key,
+	.hs_object	= oqi_object,
+	.hs_get		= oqi_get,
+	.hs_put_locked	= oqi_put_locked,
+	.hs_exit	= oqi_exit,
+};
+
+int osc_quota_setup(struct obd_device *obd)
+{
+	struct client_obd *cli = &obd->u.cli;
+	int i, type;
+	ENTRY;
+
+	mutex_init(&cli->cl_quota_mutex);
+
+	for (type = 0; type < LL_MAXQUOTAS; type++) {
+		cli->cl_quota_hash[type] = cfs_hash_create("QUOTA_HASH",
+							   HASH_QUOTA_CUR_BITS,
+							   HASH_QUOTA_MAX_BITS,
+							   HASH_QUOTA_BKT_BITS,
+							   0,
+							   CFS_HASH_MIN_THETA,
+							   CFS_HASH_MAX_THETA,
+							   &quota_hash_ops,
+							   CFS_HASH_DEFAULT);
+		if (cli->cl_quota_hash[type] == NULL)
+			break;
+	}
+
+	if (type == LL_MAXQUOTAS)
+		RETURN(0);
+
+	for (i = 0; i < type; i++)
+		cfs_hash_putref(cli->cl_quota_hash[i]);
+
+	RETURN(-ENOMEM);
+}
+
+int osc_quota_cleanup(struct obd_device *obd)
+{
+	struct client_obd *cli = &obd->u.cli;
+	int type;
+	ENTRY;
+
+	for (type = 0; type < LL_MAXQUOTAS; type++)
+		cfs_hash_putref(cli->cl_quota_hash[type]);
+
+	RETURN(0);
+}
+
+int osc_quotactl(struct obd_device *unused, struct obd_export *exp,
+		 struct obd_quotactl *oqctl)
+{
+	struct ptlrpc_request *req;
+	struct obd_quotactl *oqc;
+	int rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+					&RQF_OST_QUOTACTL, LUSTRE_OST_VERSION,
+					OST_QUOTACTL);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	oqc = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL);
+	*oqc = *oqctl;
+
+	ptlrpc_request_set_replen(req);
+	ptlrpc_at_set_req_timeout(req);
+	req->rq_no_resend = 1;
+
+	rc = ptlrpc_queue_wait(req);
+	if (rc)
+		CERROR("ptlrpc_queue_wait failed, rc: %d\n", rc);
+
+	if (req->rq_repmsg &&
(oqc = req_capsule_server_get(&req->rq_pill, &RMF_OBD_QUOTACTL))) { + *oqctl = *oqc; + } else if (!rc) { + CERROR ("Can't unpack obd_quotactl\n"); + rc = -EPROTO; + } + ptlrpc_req_finished(req); + + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_request.c b/drivers/staging/lustrefsx/lustre/osc/osc_request.c new file mode 100644 index 0000000000000..59607e7b19bce --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/osc_request.c @@ -0,0 +1,3942 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_OSC + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "osc_internal.h" +#include + +atomic_t osc_pool_req_count; +unsigned int osc_reqpool_maxreqcount; +struct ptlrpc_request_pool *osc_rq_pool; + +/* max memory used for request pool, unit is MB */ +static unsigned int osc_reqpool_mem_max = 5; +module_param(osc_reqpool_mem_max, uint, 0444); + +static int osc_idle_timeout = 20; +module_param(osc_idle_timeout, uint, 0644); + +#define osc_grant_args osc_brw_async_args + +struct osc_setattr_args { + struct obdo *sa_oa; + obd_enqueue_update_f sa_upcall; + void *sa_cookie; +}; + +struct osc_fsync_args { + struct osc_object *fa_obj; + struct obdo *fa_oa; + obd_enqueue_update_f fa_upcall; + void *fa_cookie; +}; + +struct osc_ladvise_args { + struct obdo *la_oa; + obd_enqueue_update_f la_upcall; + void *la_cookie; +}; + +static void osc_release_ppga(struct brw_page **ppga, size_t count); +static int brw_interpret(const struct lu_env *env, struct ptlrpc_request *req, + void *data, int rc); + +void osc_pack_req_body(struct ptlrpc_request *req, struct obdo *oa) +{ + struct ost_body *body; + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); +} + +static int osc_getattr(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa) +{ + struct ptlrpc_request *req; + struct ost_body *body; + int rc; + + ENTRY; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_GETATTR); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GETATTR); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + osc_pack_req_body(req, oa); + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if 
(body == NULL) + GOTO(out, rc = -EPROTO); + + CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode); + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, &body->oa); + + oa->o_blksize = cli_brw_size(exp->exp_obd); + oa->o_valid |= OBD_MD_FLBLKSZ; + + EXIT; +out: + ptlrpc_req_finished(req); + + return rc; +} + +static int osc_setattr(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa) +{ + struct ptlrpc_request *req; + struct ost_body *body; + int rc; + + ENTRY; + LASSERT(oa->o_valid & OBD_MD_FLGROUP); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + osc_pack_req_body(req, oa); + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, &body->oa); + + EXIT; +out: + ptlrpc_req_finished(req); + + RETURN(rc); +} + +static int osc_setattr_interpret(const struct lu_env *env, + struct ptlrpc_request *req, void *args, int rc) +{ + struct osc_setattr_args *sa = args; + struct ost_body *body; + + ENTRY; + + if (rc != 0) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, sa->sa_oa, + &body->oa); +out: + rc = sa->sa_upcall(sa->sa_cookie, rc); + RETURN(rc); +} + +int osc_setattr_async(struct obd_export *exp, struct obdo *oa, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset) +{ + struct ptlrpc_request *req; + struct osc_setattr_args *sa; + int rc; + + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + osc_pack_req_body(req, oa); + + ptlrpc_request_set_replen(req); + + /* do mds to ost setattr asynchronously */ + if (!rqset) { + /* Do not wait for response. */ + ptlrpcd_add_req(req); + } else { + req->rq_interpret_reply = osc_setattr_interpret; + + sa = ptlrpc_req_async_args(sa, req); + sa->sa_oa = oa; + sa->sa_upcall = upcall; + sa->sa_cookie = cookie; + + ptlrpc_set_add_req(rqset, req); + } + + RETURN(0); +} + +static int osc_ladvise_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *arg, int rc) +{ + struct osc_ladvise_args *la = arg; + struct ost_body *body; + ENTRY; + + if (rc != 0) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + *la->la_oa = body->oa; +out: + rc = la->la_upcall(la->la_cookie, rc); + RETURN(rc); +} + +/** + * If rqset is NULL, do not wait for response. 
Upcall and cookie could also + * be NULL in this case + */ +int osc_ladvise_base(struct obd_export *exp, struct obdo *oa, + struct ladvise_hdr *ladvise_hdr, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset) +{ + struct ptlrpc_request *req; + struct ost_body *body; + struct osc_ladvise_args *la; + int rc; + struct lu_ladvise *req_ladvise; + struct lu_ladvise *ladvise = ladvise_hdr->lah_advise; + int num_advise = ladvise_hdr->lah_count; + struct ladvise_hdr *req_ladvise_hdr; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_LADVISE); + if (req == NULL) + RETURN(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_OST_LADVISE, RCL_CLIENT, + num_advise * sizeof(*ladvise)); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_LADVISE); + if (rc != 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + req->rq_request_portal = OST_IO_PORTAL; + ptlrpc_at_set_req_timeout(req); + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, + oa); + + req_ladvise_hdr = req_capsule_client_get(&req->rq_pill, + &RMF_OST_LADVISE_HDR); + memcpy(req_ladvise_hdr, ladvise_hdr, sizeof(*ladvise_hdr)); + + req_ladvise = req_capsule_client_get(&req->rq_pill, &RMF_OST_LADVISE); + memcpy(req_ladvise, ladvise, sizeof(*ladvise) * num_advise); + ptlrpc_request_set_replen(req); + + if (rqset == NULL) { + /* Do not wait for response. */ + ptlrpcd_add_req(req); + RETURN(0); + } + + req->rq_interpret_reply = osc_ladvise_interpret; + la = ptlrpc_req_async_args(la, req); + la->la_oa = oa; + la->la_upcall = upcall; + la->la_cookie = cookie; + + ptlrpc_set_add_req(rqset, req); + + RETURN(0); +} + +static int osc_create(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa) +{ + struct ptlrpc_request *req; + struct ost_body *body; + int rc; + ENTRY; + + LASSERT(oa != NULL); + LASSERT(oa->o_valid & OBD_MD_FLGROUP); + LASSERT(fid_seq_is_echo(ostid_seq(&oa->o_oi))); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_CREATE); + if (req == NULL) + GOTO(out, rc = -ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_CREATE); + if (rc) { + ptlrpc_request_free(req); + GOTO(out, rc); + } + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out_req, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) + GOTO(out_req, rc = -EPROTO); + + CDEBUG(D_INFO, "oa flags %x\n", oa->o_flags); + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, &body->oa); + + oa->o_blksize = cli_brw_size(exp->exp_obd); + oa->o_valid |= OBD_MD_FLBLKSZ; + + CDEBUG(D_HA, "transno: %lld\n", + lustre_msg_get_transno(req->rq_repmsg)); +out_req: + ptlrpc_req_finished(req); +out: + RETURN(rc); +} + +int osc_punch_send(struct obd_export *exp, struct obdo *oa, + obd_enqueue_update_f upcall, void *cookie) +{ + struct ptlrpc_request *req; + struct osc_setattr_args *sa; + struct obd_import *imp = class_exp2cliimp(exp); + struct ost_body *body; + int rc; + + ENTRY; + + req = ptlrpc_request_alloc(imp, &RQF_OST_PUNCH); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_PUNCH); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + + osc_set_io_portal(req); + + ptlrpc_at_set_req_timeout(req); + + body = 
req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
+
+	lustre_set_wire_obdo(&imp->imp_connect_data, &body->oa, oa);
+
+	ptlrpc_request_set_replen(req);
+
+	req->rq_interpret_reply = osc_setattr_interpret;
+	sa = ptlrpc_req_async_args(sa, req);
+	sa->sa_oa = oa;
+	sa->sa_upcall = upcall;
+	sa->sa_cookie = cookie;
+
+	ptlrpcd_add_req(req);
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(osc_punch_send);
+
+/**
+ * osc_fallocate_base() - Handles fallocate requests.
+ *
+ * @exp:	Export structure
+ * @oa:		Attributes passed to OSS from client (obdo structure)
+ * @upcall:	Callback invoked once the request completes
+ * @cookie:	Exclusive identifier
+ * @rqset:	Request list.
+ * @mode:	Operation done on given range.
+ *
+ * osc_fallocate_base() handles fallocate requests only. Only block
+ * allocation or the standard preallocate operation is supported currently.
+ * Other mode flags are not supported yet. ftruncate(2) or truncate(2)
+ * is supported via a SETATTR request.
+ *
+ * Return: Non-zero on failure and 0 on success.
+ */
+int osc_fallocate_base(struct obd_export *exp, struct obdo *oa,
+		       obd_enqueue_update_f upcall, void *cookie, int mode)
+{
+	struct ptlrpc_request *req;
+	struct osc_setattr_args *sa;
+	struct ost_body *body;
+	struct obd_import *imp = class_exp2cliimp(exp);
+	int rc;
+	ENTRY;
+
+	oa->o_falloc_mode = mode;
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+				   &RQF_OST_FALLOCATE);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_FALLOCATE);
+	if (rc != 0) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
+	LASSERT(body);
+
+	lustre_set_wire_obdo(&imp->imp_connect_data, &body->oa, oa);
+
+	ptlrpc_request_set_replen(req);
+
+	req->rq_interpret_reply = osc_setattr_interpret;
+	BUILD_BUG_ON(sizeof(*sa) > sizeof(req->rq_async_args));
+	sa = ptlrpc_req_async_args(sa, req);
+	sa->sa_oa = oa;
+	sa->sa_upcall = upcall;
+	sa->sa_cookie = cookie;
+
+	ptlrpcd_add_req(req);
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(osc_fallocate_base);
+
+static int osc_sync_interpret(const struct lu_env *env,
+			      struct ptlrpc_request *req, void *args, int rc)
+{
+	struct osc_fsync_args *fa = args;
+	struct ost_body *body;
+	struct cl_attr *attr = &osc_env_info(env)->oti_attr;
+	unsigned long valid = 0;
+	struct cl_object *obj;
+	ENTRY;
+
+	if (rc != 0)
+		GOTO(out, rc);
+
+	body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+	if (body == NULL) {
+		CERROR("can't unpack ost_body\n");
+		GOTO(out, rc = -EPROTO);
+	}
+
+	*fa->fa_oa = body->oa;
+	obj = osc2cl(fa->fa_obj);
+
+	/* Update osc object's blocks attribute */
+	cl_object_attr_lock(obj);
+	if (body->oa.o_valid & OBD_MD_FLBLOCKS) {
+		attr->cat_blocks = body->oa.o_blocks;
+		valid |= CAT_BLOCKS;
+	}
+
+	if (valid != 0)
+		cl_object_attr_update(env, obj, attr, valid);
+	cl_object_attr_unlock(obj);
+
+out:
+	rc = fa->fa_upcall(fa->fa_cookie, rc);
+	RETURN(rc);
+}
+
+int osc_sync_base(struct osc_object *obj, struct obdo *oa,
+		  obd_enqueue_update_f upcall, void *cookie,
+		  struct ptlrpc_request_set *rqset)
+{
+	struct obd_export *exp = osc_export(obj);
+	struct ptlrpc_request *req;
+	struct ost_body *body;
+	struct osc_fsync_args *fa;
+	int rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SYNC);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SYNC);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	/* overload the size and blocks fields in the oa with start/end */
+	body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
+	LASSERT(body);
+	lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
+
+	ptlrpc_request_set_replen(req);
+	req->rq_interpret_reply = osc_sync_interpret;
+
+	fa = ptlrpc_req_async_args(fa, req);
+	fa->fa_obj = obj;
+	fa->fa_oa = oa;
+	fa->fa_upcall = upcall;
+	fa->fa_cookie = cookie;
+
+	ptlrpc_set_add_req(rqset, req);
+
+	RETURN(0);
+}
+
+/* Find and locally cancel locks matched by @mode in the resource found by
+ * @objid. Found locks are added to the @cancels list. Returns the number of
+ * locks added to the @cancels list. */
+static int osc_resource_get_unused(struct obd_export *exp, struct obdo *oa,
+				   struct list_head *cancels,
+				   enum ldlm_mode mode, __u64 lock_flags)
+{
+	struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
+	struct ldlm_res_id res_id;
+	struct ldlm_resource *res;
+	int count;
+	ENTRY;
+
+	/* Return, i.e. cancel nothing, only if ELC is supported (flag in
+	 * export) but disabled through procfs (flag in NS).
+	 *
+	 * This distinguishes from a case when ELC is not supported originally,
+	 * when we still want to cancel locks in advance and just cancel them
+	 * locally, without sending any RPC. */
+	if (exp_connect_cancelset(exp) && !ns_connect_cancelset(ns))
+		RETURN(0);
+
+	ostid_build_res_name(&oa->o_oi, &res_id);
+	res = ldlm_resource_get(ns, NULL, &res_id, 0, 0);
+	if (IS_ERR(res))
+		RETURN(0);
+
+	LDLM_RESOURCE_ADDREF(res);
+	count = ldlm_cancel_resource_local(res, cancels, NULL, mode,
+					   lock_flags, 0, NULL);
+	LDLM_RESOURCE_DELREF(res);
+	ldlm_resource_putref(res);
+	RETURN(count);
+}
+
+static int osc_destroy_interpret(const struct lu_env *env,
+				 struct ptlrpc_request *req, void *args, int rc)
+{
+	struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
+
+	atomic_dec(&cli->cl_destroy_in_flight);
+	wake_up(&cli->cl_destroy_waitq);
+
+	return 0;
+}
+
+static int osc_can_send_destroy(struct client_obd *cli)
+{
+	if (atomic_inc_return(&cli->cl_destroy_in_flight) <=
+	    cli->cl_max_rpcs_in_flight) {
+		/* The destroy request can be sent */
+		return 1;
+	}
+	if (atomic_dec_return(&cli->cl_destroy_in_flight) <
+	    cli->cl_max_rpcs_in_flight) {
+		/*
+		 * The counter has been modified between the two atomic
+		 * operations.
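+		 *
+		 * Illustrative interleaving (hypothetical): with the counter
+		 * at cl_max_rpcs_in_flight, our atomic_inc_return() above
+		 * pushed it to max + 1 and failed; if a completing destroy
+		 * decremented it before our atomic_dec_return(), we observe
+		 * max - 1 < max here and must wake_up() so sleeping senders
+		 * retry instead of stalling.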
+ */ + wake_up(&cli->cl_destroy_waitq); + } + return 0; +} + +static int osc_destroy(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa) +{ + struct client_obd *cli = &exp->exp_obd->u.cli; + struct ptlrpc_request *req; + struct ost_body *body; + LIST_HEAD(cancels); + int rc, count; + ENTRY; + + if (!oa) { + CDEBUG(D_INFO, "oa NULL\n"); + RETURN(-EINVAL); + } + + count = osc_resource_get_unused(exp, oa, &cancels, LCK_PW, + LDLM_FL_DISCARD_DATA); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_DESTROY); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + + rc = ldlm_prep_elc_req(exp, req, LUSTRE_OST_VERSION, OST_DESTROY, + 0, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ + ptlrpc_at_set_req_timeout(req); + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); + + ptlrpc_request_set_replen(req); + + req->rq_interpret_reply = osc_destroy_interpret; + if (!osc_can_send_destroy(cli)) { + /* + * Wait until the number of on-going destroy RPCs drops + * under max_rpc_in_flight + */ + rc = l_wait_event_abortable_exclusive( + cli->cl_destroy_waitq, + osc_can_send_destroy(cli)); + if (rc) { + ptlrpc_req_finished(req); + RETURN(-EINTR); + } + } + + /* Do not wait for response */ + ptlrpcd_add_req(req); + RETURN(0); +} + +static void osc_announce_cached(struct client_obd *cli, struct obdo *oa, + long writing_bytes) +{ + u64 bits = OBD_MD_FLBLOCKS | OBD_MD_FLGRANT; + + LASSERT(!(oa->o_valid & bits)); + + oa->o_valid |= bits; + spin_lock(&cli->cl_loi_list_lock); + if (cli->cl_ocd_grant_param) + oa->o_dirty = cli->cl_dirty_grant; + else + oa->o_dirty = cli->cl_dirty_pages << PAGE_SHIFT; + if (unlikely(cli->cl_dirty_pages > cli->cl_dirty_max_pages)) { + CERROR("dirty %lu > dirty_max %lu\n", + cli->cl_dirty_pages, + cli->cl_dirty_max_pages); + oa->o_undirty = 0; + } else if (unlikely(atomic_long_read(&obd_dirty_pages) > + (long)(obd_max_dirty_pages + 1))) { + /* The atomic_read() allowing the atomic_inc() are + * not covered by a lock thus they may safely race and trip + * this CERROR() unless we add in a small fudge factor (+1). */ + CERROR("%s: dirty %ld > system dirty_max %ld\n", + cli_name(cli), atomic_long_read(&obd_dirty_pages), + obd_max_dirty_pages); + oa->o_undirty = 0; + } else if (unlikely(cli->cl_dirty_max_pages - cli->cl_dirty_pages > + 0x7fffffff)) { + CERROR("dirty %lu - dirty_max %lu too big???\n", + cli->cl_dirty_pages, cli->cl_dirty_max_pages); + oa->o_undirty = 0; + } else { + unsigned long nrpages; + unsigned long undirty; + + nrpages = cli->cl_max_pages_per_rpc; + nrpages *= cli->cl_max_rpcs_in_flight + 1; + nrpages = max(nrpages, cli->cl_dirty_max_pages); + undirty = nrpages << PAGE_SHIFT; + if (cli->cl_ocd_grant_param) { + int nrextents; + + /* take extent tax into account when asking for more + * grant space */ + nrextents = (nrpages + cli->cl_max_extent_pages - 1) / + cli->cl_max_extent_pages; + undirty += nrextents * cli->cl_grant_extent_tax; + } + /* Do not ask for more than OBD_MAX_GRANT - a margin for server + * to add extent tax, etc. 
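+		 *
+		 * Illustrative numbers (not from a real system): with 256
+		 * pages per RPC, 8 RPCs in flight and 4KiB pages, nrpages =
+		 * 256 * (8 + 1) = 2304, so the client asks to dirty up to
+		 * undirty = 2304 << 12 = 9MiB more, plus the per-extent tax
+		 * when GRANT_PARAM is negotiated.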
+ */ + oa->o_undirty = min(undirty, OBD_MAX_GRANT & + ~(PTLRPC_MAX_BRW_SIZE * 4UL)); + } + oa->o_grant = cli->cl_avail_grant + cli->cl_reserved_grant; + /* o_dropped AKA o_misc is 32 bits, but cl_lost_grant is 64 bits */ + if (cli->cl_lost_grant > INT_MAX) { + CDEBUG(D_CACHE, + "%s: avoided o_dropped overflow: cl_lost_grant %lu\n", + cli_name(cli), cli->cl_lost_grant); + oa->o_dropped = INT_MAX; + } else { + oa->o_dropped = cli->cl_lost_grant; + } + cli->cl_lost_grant -= oa->o_dropped; + spin_unlock(&cli->cl_loi_list_lock); + CDEBUG(D_CACHE, "%s: dirty: %llu undirty: %u dropped %u grant: %llu" + " cl_lost_grant %lu\n", cli_name(cli), oa->o_dirty, + oa->o_undirty, oa->o_dropped, oa->o_grant, cli->cl_lost_grant); +} + +void osc_update_next_shrink(struct client_obd *cli) +{ + cli->cl_next_shrink_grant = ktime_get_seconds() + + cli->cl_grant_shrink_interval; + + CDEBUG(D_CACHE, "next time %lld to shrink grant\n", + cli->cl_next_shrink_grant); +} +EXPORT_SYMBOL(osc_update_next_shrink); + +static void __osc_update_grant(struct client_obd *cli, u64 grant) +{ + spin_lock(&cli->cl_loi_list_lock); + cli->cl_avail_grant += grant; + spin_unlock(&cli->cl_loi_list_lock); +} + +static void osc_update_grant(struct client_obd *cli, struct ost_body *body) +{ + if (body->oa.o_valid & OBD_MD_FLGRANT) { + CDEBUG(D_CACHE, "got %llu extra grant\n", body->oa.o_grant); + __osc_update_grant(cli, body->oa.o_grant); + } +} + +/** + * grant thread data for shrinking space. + */ +struct grant_thread_data { + struct list_head gtd_clients; + struct mutex gtd_mutex; + unsigned long gtd_stopped:1; +}; +static struct grant_thread_data client_gtd; + +static int osc_shrink_grant_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *args, int rc) +{ + struct osc_grant_args *aa = args; + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + struct ost_body *body; + + if (rc != 0) { + __osc_update_grant(cli, aa->aa_oa->o_grant); + GOTO(out, rc); + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + osc_update_grant(cli, body); +out: + OBD_SLAB_FREE_PTR(aa->aa_oa, osc_obdo_kmem); + aa->aa_oa = NULL; + + return rc; +} + +static void osc_shrink_grant_local(struct client_obd *cli, struct obdo *oa) +{ + spin_lock(&cli->cl_loi_list_lock); + oa->o_grant = cli->cl_avail_grant / 4; + cli->cl_avail_grant -= oa->o_grant; + spin_unlock(&cli->cl_loi_list_lock); + if (!(oa->o_valid & OBD_MD_FLFLAGS)) { + oa->o_valid |= OBD_MD_FLFLAGS; + oa->o_flags = 0; + } + oa->o_flags |= OBD_FL_SHRINK_GRANT; + osc_update_next_shrink(cli); +} + +/* Shrink the current grant, either from some large amount to enough for a + * full set of in-flight RPCs, or if we have already shrunk to that limit + * then to enough for a single RPC. This avoids keeping more grant than + * needed, and avoids shrinking the grant piecemeal. 
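+ *
+ * For example (hypothetical values): with cl_max_rpcs_in_flight = 8,
+ * cl_max_pages_per_rpc = 256 and 4KiB pages, the first shrink targets
+ * (8 + 1) * 1MiB = 9MiB of grant; once cl_avail_grant is already at or
+ * below that, the next shrink targets a single RPC's worth, 1MiB.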
*/ +static int osc_shrink_grant(struct client_obd *cli) +{ + __u64 target_bytes = (cli->cl_max_rpcs_in_flight + 1) * + (cli->cl_max_pages_per_rpc << PAGE_SHIFT); + + spin_lock(&cli->cl_loi_list_lock); + if (cli->cl_avail_grant <= target_bytes) + target_bytes = cli->cl_max_pages_per_rpc << PAGE_SHIFT; + spin_unlock(&cli->cl_loi_list_lock); + + return osc_shrink_grant_to_target(cli, target_bytes); +} + +int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes) +{ + int rc = 0; + struct ost_body *body; + ENTRY; + + spin_lock(&cli->cl_loi_list_lock); + /* Don't shrink if we are already above or below the desired limit + * We don't want to shrink below a single RPC, as that will negatively + * impact block allocation and long-term performance. */ + if (target_bytes < cli->cl_max_pages_per_rpc << PAGE_SHIFT) + target_bytes = cli->cl_max_pages_per_rpc << PAGE_SHIFT; + + if (target_bytes >= cli->cl_avail_grant) { + spin_unlock(&cli->cl_loi_list_lock); + RETURN(0); + } + spin_unlock(&cli->cl_loi_list_lock); + + OBD_ALLOC_PTR(body); + if (!body) + RETURN(-ENOMEM); + + osc_announce_cached(cli, &body->oa, 0); + + spin_lock(&cli->cl_loi_list_lock); + if (target_bytes >= cli->cl_avail_grant) { + /* available grant has changed since target calculation */ + spin_unlock(&cli->cl_loi_list_lock); + GOTO(out_free, rc = 0); + } + body->oa.o_grant = cli->cl_avail_grant - target_bytes; + cli->cl_avail_grant = target_bytes; + spin_unlock(&cli->cl_loi_list_lock); + if (!(body->oa.o_valid & OBD_MD_FLFLAGS)) { + body->oa.o_valid |= OBD_MD_FLFLAGS; + body->oa.o_flags = 0; + } + body->oa.o_flags |= OBD_FL_SHRINK_GRANT; + osc_update_next_shrink(cli); + + rc = osc_set_info_async(NULL, cli->cl_import->imp_obd->obd_self_export, + sizeof(KEY_GRANT_SHRINK), KEY_GRANT_SHRINK, + sizeof(*body), body, NULL); + if (rc != 0) + __osc_update_grant(cli, body->oa.o_grant); +out_free: + OBD_FREE_PTR(body); + RETURN(rc); +} + +static int osc_should_shrink_grant(struct client_obd *client) +{ + time64_t next_shrink = client->cl_next_shrink_grant; + + if (client->cl_import == NULL) + return 0; + + if (!OCD_HAS_FLAG(&client->cl_import->imp_connect_data, GRANT_SHRINK) || + client->cl_import->imp_grant_shrink_disabled) { + osc_update_next_shrink(client); + return 0; + } + + if (ktime_get_seconds() >= next_shrink - 5) { + /* Get the current RPC size directly, instead of going via: + * cli_brw_size(obd->u.cli.cl_import->imp_obd->obd_self_export) + * Keep comment here so that it can be found by searching. 
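+		 *
+		 * E.g. (illustrative): with 256 pages per RPC and 4KiB pages,
+		 * brw_size below is 1MiB, so a shrink RPC is only sent while
+		 * the import is FULL and cl_avail_grant still exceeds 1MiB.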
+		 */
+		int brw_size = client->cl_max_pages_per_rpc << PAGE_SHIFT;
+
+		if (client->cl_import->imp_state == LUSTRE_IMP_FULL &&
+		    client->cl_avail_grant > brw_size)
+			return 1;
+		else
+			osc_update_next_shrink(client);
+	}
+	return 0;
+}
+
+#define GRANT_SHRINK_RPC_BATCH	100
+
+static struct delayed_work work;
+
+static void osc_grant_work_handler(struct work_struct *data)
+{
+	struct client_obd *cli;
+	int rpc_sent;
+	bool init_next_shrink = true;
+	time64_t next_shrink = ktime_get_seconds() + GRANT_SHRINK_INTERVAL;
+
+	rpc_sent = 0;
+	mutex_lock(&client_gtd.gtd_mutex);
+	list_for_each_entry(cli, &client_gtd.gtd_clients,
+			    cl_grant_chain) {
+		if (rpc_sent < GRANT_SHRINK_RPC_BATCH &&
+		    osc_should_shrink_grant(cli)) {
+			osc_shrink_grant(cli);
+			rpc_sent++;
+		}
+
+		if (!init_next_shrink) {
+			if (cli->cl_next_shrink_grant < next_shrink &&
+			    cli->cl_next_shrink_grant > ktime_get_seconds())
+				next_shrink = cli->cl_next_shrink_grant;
+		} else {
+			init_next_shrink = false;
+			next_shrink = cli->cl_next_shrink_grant;
+		}
+	}
+	mutex_unlock(&client_gtd.gtd_mutex);
+
+	if (client_gtd.gtd_stopped == 1)
+		return;
+
+	if (next_shrink > ktime_get_seconds()) {
+		time64_t delay = next_shrink - ktime_get_seconds();
+
+		schedule_delayed_work(&work, cfs_time_seconds(delay));
+	} else {
+		schedule_work(&work.work);
+	}
+}
+
+void osc_schedule_grant_work(void)
+{
+	cancel_delayed_work_sync(&work);
+	schedule_work(&work.work);
+}
+EXPORT_SYMBOL(osc_schedule_grant_work);
+
+/**
+ * Start the grant thread for returning grant to the server for idle clients.
+ */
+static int osc_start_grant_work(void)
+{
+	client_gtd.gtd_stopped = 0;
+	mutex_init(&client_gtd.gtd_mutex);
+	INIT_LIST_HEAD(&client_gtd.gtd_clients);
+
+	INIT_DELAYED_WORK(&work, osc_grant_work_handler);
+	schedule_work(&work.work);
+
+	return 0;
+}
+
+static void osc_stop_grant_work(void)
+{
+	client_gtd.gtd_stopped = 1;
+	cancel_delayed_work_sync(&work);
+}
+
+static void osc_add_grant_list(struct client_obd *client)
+{
+	mutex_lock(&client_gtd.gtd_mutex);
+	list_add(&client->cl_grant_chain, &client_gtd.gtd_clients);
+	mutex_unlock(&client_gtd.gtd_mutex);
+}
+
+static void osc_del_grant_list(struct client_obd *client)
+{
+	if (list_empty(&client->cl_grant_chain))
+		return;
+
+	mutex_lock(&client_gtd.gtd_mutex);
+	list_del_init(&client->cl_grant_chain);
+	mutex_unlock(&client_gtd.gtd_mutex);
+}
+
+void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd)
+{
+	/*
+	 * ocd_grant is the total grant amount we expect to hold: if we've
+	 * been evicted, it's the new avail_grant amount, and cl_dirty_pages
+	 * will drop to 0 as inflight RPCs fail out; otherwise, it's
+	 * avail_grant + dirty.
+	 *
+	 * A race is tolerable here: if we're evicted, but imp_state already
+	 * left EVICTED state, then cl_dirty_pages must be 0 already.
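+	 *
+	 * Worked example (hypothetical numbers): if the server granted
+	 * ocd_grant = 8MiB while the client still holds 2MiB of dirty
+	 * pages, the code below starts with cl_avail_grant =
+	 * 8MiB - 2MiB = 6MiB; if consumption already exceeded the grant,
+	 * cl_avail_grant is clamped to 0 with a CERROR.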
+ */ + spin_lock(&cli->cl_loi_list_lock); + cli->cl_avail_grant = ocd->ocd_grant; + if (cli->cl_import->imp_state != LUSTRE_IMP_EVICTED) { + unsigned long consumed = cli->cl_reserved_grant; + + if (OCD_HAS_FLAG(ocd, GRANT_PARAM)) + consumed += cli->cl_dirty_grant; + else + consumed += cli->cl_dirty_pages << PAGE_SHIFT; + if (cli->cl_avail_grant < consumed) { + CERROR("%s: granted %ld but already consumed %ld\n", + cli_name(cli), cli->cl_avail_grant, consumed); + cli->cl_avail_grant = 0; + } else { + cli->cl_avail_grant -= consumed; + } + } + + if (OCD_HAS_FLAG(ocd, GRANT_PARAM)) { + u64 size; + int chunk_mask; + + /* overhead for each extent insertion */ + cli->cl_grant_extent_tax = ocd->ocd_grant_tax_kb << 10; + /* determine the appropriate chunk size used by osc_extent. */ + cli->cl_chunkbits = max_t(int, PAGE_SHIFT, + ocd->ocd_grant_blkbits); + /* max_pages_per_rpc must be chunk aligned */ + chunk_mask = ~((1 << (cli->cl_chunkbits - PAGE_SHIFT)) - 1); + cli->cl_max_pages_per_rpc = (cli->cl_max_pages_per_rpc + + ~chunk_mask) & chunk_mask; + /* determine maximum extent size, in #pages */ + size = (u64)ocd->ocd_grant_max_blks << ocd->ocd_grant_blkbits; + cli->cl_max_extent_pages = (size >> PAGE_SHIFT) ?: 1; + cli->cl_ocd_grant_param = 1; + } else { + cli->cl_ocd_grant_param = 0; + cli->cl_grant_extent_tax = 0; + cli->cl_chunkbits = PAGE_SHIFT; + cli->cl_max_extent_pages = DT_MAX_BRW_PAGES; + } + spin_unlock(&cli->cl_loi_list_lock); + + CDEBUG(D_CACHE, + "%s, setting cl_avail_grant: %ld cl_lost_grant: %ld. chunk bits: %d cl_max_extent_pages: %d\n", + cli_name(cli), + cli->cl_avail_grant, cli->cl_lost_grant, cli->cl_chunkbits, + cli->cl_max_extent_pages); + + if (OCD_HAS_FLAG(ocd, GRANT_SHRINK) && list_empty(&cli->cl_grant_chain)) + osc_add_grant_list(cli); +} +EXPORT_SYMBOL(osc_init_grant); + +/* We assume that the reason this OSC got a short read is because it read + * beyond the end of a stripe file; i.e. lustre is reading a sparse file + * via the LOV, and it _knows_ it's reading inside the file, it's just that + * this stripe never got written at or beyond this stripe offset yet. 
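+ *
+ * Example (illustrative): a 3-page read of 4KiB pages that returns
+ * nob_read = 6144 leaves page 0 untouched, zero-fills the last 2KiB of
+ * page 1, and zero-fills all of page 2.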
*/ +static void handle_short_read(int nob_read, size_t page_count, + struct brw_page **pga) +{ + char *ptr; + int i = 0; + + /* skip bytes read OK */ + while (nob_read > 0) { + LASSERT (page_count > 0); + + if (pga[i]->count > nob_read) { + /* EOF inside this page */ + ptr = kmap(pga[i]->pg) + + (pga[i]->off & ~PAGE_MASK); + memset(ptr + nob_read, 0, pga[i]->count - nob_read); + kunmap(pga[i]->pg); + page_count--; + i++; + break; + } + + nob_read -= pga[i]->count; + page_count--; + i++; + } + + /* zero remaining pages */ + while (page_count-- > 0) { + ptr = kmap(pga[i]->pg) + (pga[i]->off & ~PAGE_MASK); + memset(ptr, 0, pga[i]->count); + kunmap(pga[i]->pg); + i++; + } +} + +static int check_write_rcs(struct ptlrpc_request *req, + int requested_nob, int niocount, + size_t page_count, struct brw_page **pga) +{ + int i; + __u32 *remote_rcs; + + remote_rcs = req_capsule_server_sized_get(&req->rq_pill, &RMF_RCS, + sizeof(*remote_rcs) * + niocount); + if (remote_rcs == NULL) { + CDEBUG(D_INFO, "Missing/short RC vector on BRW_WRITE reply\n"); + return(-EPROTO); + } + + /* return error if any niobuf was in error */ + for (i = 0; i < niocount; i++) { + if ((int)remote_rcs[i] < 0) { + CDEBUG(D_INFO, "rc[%d]: %d req %p\n", + i, remote_rcs[i], req); + return remote_rcs[i]; + } + + if (remote_rcs[i] != 0) { + CDEBUG(D_INFO, "rc[%d] invalid (%d) req %p\n", + i, remote_rcs[i], req); + return(-EPROTO); + } + } + if (req->rq_bulk != NULL && + req->rq_bulk->bd_nob_transferred != requested_nob) { + CERROR("Unexpected # bytes transferred: %d (requested %d)\n", + req->rq_bulk->bd_nob_transferred, requested_nob); + return(-EPROTO); + } + + return (0); +} + +static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2) +{ + if (p1->flag != p2->flag) { + unsigned mask = ~(OBD_BRW_FROM_GRANT | OBD_BRW_NOCACHE | + OBD_BRW_SYNC | OBD_BRW_ASYNC | + OBD_BRW_NOQUOTA | OBD_BRW_SOFT_SYNC | + OBD_BRW_SYS_RESOURCE); + + /* warn if we try to combine flags that we don't know to be + * safe to combine */ + if (unlikely((p1->flag & mask) != (p2->flag & mask))) { + CWARN("Saw flags 0x%x and 0x%x in the same brw, please " + "report this at https://jira.whamcloud.com/\n", + p1->flag, p2->flag); + } + return 0; + } + + return (p1->off + p1->count == p2->off); +} + +#if IS_ENABLED(CONFIG_CRC_T10DIF) +static int osc_checksum_bulk_t10pi(const char *obd_name, int nob, + size_t pg_count, struct brw_page **pga, + int opc, obd_dif_csum_fn *fn, + int sector_size, + u32 *check_sum, bool resend) +{ + struct ahash_request *req; + /* Used Adler as the default checksum type on top of DIF tags */ + unsigned char cfs_alg = cksum_obd2cfs(OBD_CKSUM_T10_TOP); + struct page *__page; + unsigned char *buffer; + __be16 *guard_start; + unsigned int bufsize; + int guard_number; + int used_number = 0; + int used; + u32 cksum; + int rc = 0; + int i = 0; + + LASSERT(pg_count > 0); + + __page = alloc_page(GFP_KERNEL); + if (__page == NULL) + return -ENOMEM; + + req = cfs_crypto_hash_init(cfs_alg, NULL, 0); + if (IS_ERR(req)) { + rc = PTR_ERR(req); + CERROR("%s: unable to initialize checksum hash %s: rc = %d\n", + obd_name, cfs_crypto_hash_name(cfs_alg), rc); + GOTO(out, rc); + } + + buffer = kmap(__page); + guard_start = (__be16 *)buffer; + guard_number = PAGE_SIZE / sizeof(*guard_start); + CDEBUG(D_PAGE | (resend ? D_HA : 0), + "GRD tags per page=%u, resend=%u, bytes=%u, pages=%zu\n", + guard_number, resend, nob, pg_count); + + while (nob > 0 && pg_count > 0) { + unsigned int count = pga[i]->count > nob ? 
nob : pga[i]->count; + + /* corrupt the data before we compute the checksum, to + * simulate an OST->client data error */ + if (unlikely(i == 0 && opc == OST_READ && + OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE))) { + unsigned char *ptr = kmap(pga[i]->pg); + int off = pga[i]->off & ~PAGE_MASK; + + memcpy(ptr + off, "bad1", min_t(typeof(nob), 4, nob)); + kunmap(pga[i]->pg); + } + + /* + * The left guard number should be able to hold checksums of a + * whole page + */ + rc = obd_page_dif_generate_buffer(obd_name, pga[i]->pg, + pga[i]->off & ~PAGE_MASK, + count, + guard_start + used_number, + guard_number - used_number, + &used, sector_size, + fn); + if (unlikely(resend)) + CDEBUG(D_PAGE | D_HA, + "pga[%u]: used %u off %llu+%u gen checksum: %*phN\n", + i, used, pga[i]->off & ~PAGE_MASK, count, + (int)(used * sizeof(*guard_start)), + guard_start + used_number); + if (rc) + break; + + used_number += used; + if (used_number == guard_number) { + cfs_crypto_hash_update_page(req, __page, 0, + used_number * sizeof(*guard_start)); + used_number = 0; + } + + nob -= pga[i]->count; + pg_count--; + i++; + } + kunmap(__page); + if (rc) + GOTO(out, rc); + + if (used_number != 0) + cfs_crypto_hash_update_page(req, __page, 0, + used_number * sizeof(*guard_start)); + + bufsize = sizeof(cksum); + cfs_crypto_hash_final(req, (unsigned char *)&cksum, &bufsize); + + /* For sending we only compute the wrong checksum instead + * of corrupting the data so it is still correct on a redo */ + if (opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND)) + cksum++; + + *check_sum = cksum; +out: + __free_page(__page); + return rc; +} +#else /* !CONFIG_CRC_T10DIF */ +#define obd_dif_ip_fn NULL +#define obd_dif_crc_fn NULL +#define osc_checksum_bulk_t10pi(name, nob, pgc, pga, opc, fn, ssize, csum, re) \ + -EOPNOTSUPP +#endif /* CONFIG_CRC_T10DIF */ + +static int osc_checksum_bulk(int nob, size_t pg_count, + struct brw_page **pga, int opc, + enum cksum_types cksum_type, + u32 *cksum) +{ + int i = 0; + struct ahash_request *req; + unsigned int bufsize; + unsigned char cfs_alg = cksum_obd2cfs(cksum_type); + + LASSERT(pg_count > 0); + + req = cfs_crypto_hash_init(cfs_alg, NULL, 0); + if (IS_ERR(req)) { + CERROR("Unable to initialize checksum hash %s\n", + cfs_crypto_hash_name(cfs_alg)); + return PTR_ERR(req); + } + + while (nob > 0 && pg_count > 0) { + unsigned int count = pga[i]->count > nob ? 
nob : pga[i]->count; + + /* corrupt the data before we compute the checksum, to + * simulate an OST->client data error */ + if (i == 0 && opc == OST_READ && + OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE)) { + unsigned char *ptr = kmap(pga[i]->pg); + int off = pga[i]->off & ~PAGE_MASK; + + memcpy(ptr + off, "bad1", min_t(typeof(nob), 4, nob)); + kunmap(pga[i]->pg); + } + cfs_crypto_hash_update_page(req, pga[i]->pg, + pga[i]->off & ~PAGE_MASK, + count); + LL_CDEBUG_PAGE(D_PAGE, pga[i]->pg, "off %d\n", + (int)(pga[i]->off & ~PAGE_MASK)); + + nob -= pga[i]->count; + pg_count--; + i++; + } + + bufsize = sizeof(*cksum); + cfs_crypto_hash_final(req, (unsigned char *)cksum, &bufsize); + + /* For sending we only compute the wrong checksum instead + * of corrupting the data so it is still correct on a redo */ + if (opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND)) + (*cksum)++; + + return 0; +} + +static int osc_checksum_bulk_rw(const char *obd_name, + enum cksum_types cksum_type, + int nob, size_t pg_count, + struct brw_page **pga, int opc, + u32 *check_sum, bool resend) +{ + obd_dif_csum_fn *fn = NULL; + int sector_size = 0; + int rc; + + ENTRY; + obd_t10_cksum2dif(cksum_type, &fn, §or_size); + + if (fn) + rc = osc_checksum_bulk_t10pi(obd_name, nob, pg_count, pga, + opc, fn, sector_size, check_sum, + resend); + else + rc = osc_checksum_bulk(nob, pg_count, pga, opc, cksum_type, + check_sum); + + RETURN(rc); +} + +#ifdef CONFIG_LL_ENCRYPTION +/** + * osc_encrypt_pagecache_blocks() - overlay to llcrypt_encrypt_pagecache_blocks + * @srcpage: The locked pagecache page containing the block(s) to encrypt + * @dstpage: The page to put encryption result + * @len: Total size of the block(s) to encrypt. Must be a nonzero + * multiple of the filesystem's block size. + * @offs: Byte offset within @page of the first block to encrypt. Must be + * a multiple of the filesystem's block size. + * @gfp_flags: Memory allocation flags + * + * This overlay function is necessary to be able to provide our own bounce page. + */ +static struct page *osc_encrypt_pagecache_blocks(struct page *srcpage, + struct page *dstpage, + unsigned int len, + unsigned int offs, + gfp_t gfp_flags) + +{ + const struct inode *inode = srcpage->mapping->host; + const unsigned int blockbits = inode->i_blkbits; + const unsigned int blocksize = 1 << blockbits; + u64 lblk_num = ((u64)srcpage->index << (PAGE_SHIFT - blockbits)) + + (offs >> blockbits); + unsigned int i; + int err; + + if (unlikely(!dstpage)) + return llcrypt_encrypt_pagecache_blocks(srcpage, len, offs, + gfp_flags); + + if (WARN_ON_ONCE(!PageLocked(srcpage))) + return ERR_PTR(-EINVAL); + + if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, blocksize))) + return ERR_PTR(-EINVAL); + + /* Set PagePrivate2 for disambiguation in + * osc_finalize_bounce_page(). + * It means cipher page was not allocated by llcrypt. + */ + SetPagePrivate2(dstpage); + + for (i = offs; i < offs + len; i += blocksize, lblk_num++) { + err = llcrypt_encrypt_block(inode, srcpage, dstpage, blocksize, + i, lblk_num, gfp_flags); + if (err) + return ERR_PTR(err); + } + SetPagePrivate(dstpage); + set_page_private(dstpage, (unsigned long)srcpage); + return dstpage; +} + +/** + * osc_finalize_bounce_page() - overlay to llcrypt_finalize_bounce_page + * + * This overlay function is necessary to handle bounce pages + * allocated by ourselves. 
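+ *
+ * In short (summary of the logic below): bounce pages carrying
+ * PagePrivate2 were taken from the sptlrpc encryption pool by
+ * osc_encrypt_pagecache_blocks(), so they must not be freed through
+ * llcrypt and are only unhooked from their pagecache page here; all
+ * other pages are handed back to llcrypt_finalize_bounce_page().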
+ */ +static inline void osc_finalize_bounce_page(struct page **pagep) +{ + struct page *page = *pagep; + + /* PagePrivate2 was set in osc_encrypt_pagecache_blocks + * to indicate the cipher page was allocated by ourselves. + * So we must not free it via llcrypt. + */ + if (unlikely(!page || !PagePrivate2(page))) + return llcrypt_finalize_bounce_page(pagep); + + if (llcrypt_is_bounce_page(page)) { + *pagep = llcrypt_pagecache_page(page); + ClearPagePrivate2(page); + set_page_private(page, (unsigned long)NULL); + ClearPagePrivate(page); + } +} +#else /* !CONFIG_LL_ENCRYPTION */ +#define osc_encrypt_pagecache_blocks(srcpage, dstpage, len, offs, gfp_flags) \ + llcrypt_encrypt_pagecache_blocks(srcpage, len, offs, gfp_flags) +#define osc_finalize_bounce_page(page) llcrypt_finalize_bounce_page(page) +#endif + +static inline void osc_release_bounce_pages(struct brw_page **pga, + u32 page_count) +{ +#ifdef HAVE_LUSTRE_CRYPTO + struct page **pa = NULL; + int i, j = 0; + +#ifdef CONFIG_LL_ENCRYPTION + if (PageChecked(pga[0]->pg)) { + OBD_ALLOC_PTR_ARRAY_LARGE(pa, page_count); + if (!pa) + return; + } +#endif + + for (i = 0; i < page_count; i++) { + /* Bounce pages used by osc_encrypt_pagecache_blocks() + * called from osc_brw_prep_request() + * are identified thanks to the PageChecked flag. + */ + if (PageChecked(pga[i]->pg)) { + if (pa) + pa[j++] = pga[i]->pg; + osc_finalize_bounce_page(&pga[i]->pg); + } + pga[i]->count -= pga[i]->bp_count_diff; + pga[i]->off += pga[i]->bp_off_diff; + } + + if (pa) { + sptlrpc_enc_pool_put_pages_array(pa, j); + OBD_FREE_PTR_ARRAY_LARGE(pa, page_count); + } +#endif +} + +static int +osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa, + u32 page_count, struct brw_page **pga, + struct ptlrpc_request **reqp, int resend) +{ + struct ptlrpc_request *req; + struct ptlrpc_bulk_desc *desc; + struct ost_body *body; + struct obd_ioobj *ioobj; + struct niobuf_remote *niobuf; + int niocount, i, requested_nob, opc, rc, short_io_size = 0; + struct osc_brw_async_args *aa; + struct req_capsule *pill; + struct brw_page *pg_prev; + void *short_io_buf; + const char *obd_name = cli->cl_import->imp_obd->obd_name; + struct inode *inode = NULL; + bool directio = false; + bool enable_checksum = true; + struct cl_page *clpage; + + ENTRY; + if (pga[0]->pg) { + clpage = oap2cl_page(brw_page2oap(pga[0])); + inode = clpage->cp_inode; + if (clpage->cp_type == CPT_TRANSIENT) + directio = true; + } + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ)) + RETURN(-ENOMEM); /* Recoverable */ + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ2)) + RETURN(-EINVAL); /* Fatal */ + + if ((cmd & OBD_BRW_WRITE) != 0) { + opc = OST_WRITE; + req = ptlrpc_request_alloc_pool(cli->cl_import, + osc_rq_pool, + &RQF_OST_BRW_WRITE); + } else { + opc = OST_READ; + req = ptlrpc_request_alloc(cli->cl_import, &RQF_OST_BRW_READ); + } + if (req == NULL) + RETURN(-ENOMEM); + + if (opc == OST_WRITE && inode && IS_ENCRYPTED(inode) && + llcrypt_has_encryption_key(inode)) { + struct page **pa = NULL; + +#ifdef CONFIG_LL_ENCRYPTION + OBD_ALLOC_PTR_ARRAY_LARGE(pa, page_count); + if (pa == NULL) { + ptlrpc_request_free(req); + RETURN(-ENOMEM); + } + + rc = sptlrpc_enc_pool_get_pages_array(pa, page_count); + if (rc) { + CDEBUG(D_SEC, "failed to allocate from enc pool: %d\n", + rc); + ptlrpc_request_free(req); + RETURN(rc); + } +#endif + + for (i = 0; i < page_count; i++) { + struct brw_page *brwpg = pga[i]; + struct page *data_page = NULL; + bool retried = false; + bool lockedbymyself; + u32 nunits = (brwpg->off & 
~PAGE_MASK) + brwpg->count; + struct address_space *map_orig = NULL; + pgoff_t index_orig; + +retry_encrypt: + nunits = round_up(nunits, LUSTRE_ENCRYPTION_UNIT_SIZE); + /* The page can already be locked when we arrive here. + * This is possible when cl_page_assume/vvp_page_assume + * is stuck on wait_on_page_writeback with page lock + * held. In this case there is no risk for the lock to + * be released while we are doing our encryption + * processing, because writeback against that page will + * end in vvp_page_completion_write/cl_page_completion, + * which means only once the page is fully processed. + */ + lockedbymyself = trylock_page(brwpg->pg); + if (directio) { + map_orig = brwpg->pg->mapping; + brwpg->pg->mapping = inode->i_mapping; + index_orig = brwpg->pg->index; + clpage = oap2cl_page(brw_page2oap(brwpg)); + brwpg->pg->index = clpage->cp_page_index; + } + data_page = + osc_encrypt_pagecache_blocks(brwpg->pg, + pa ? pa[i] : NULL, + nunits, 0, + GFP_NOFS); + if (directio) { + brwpg->pg->mapping = map_orig; + brwpg->pg->index = index_orig; + } + if (lockedbymyself) + unlock_page(brwpg->pg); + if (IS_ERR(data_page)) { + rc = PTR_ERR(data_page); + if (rc == -ENOMEM && !retried) { + retried = true; + rc = 0; + goto retry_encrypt; + } + if (pa) { + sptlrpc_enc_pool_put_pages_array(pa + i, + page_count - i); + OBD_FREE_PTR_ARRAY_LARGE(pa, + page_count); + } + ptlrpc_request_free(req); + RETURN(rc); + } + /* Set PageChecked flag on bounce page for + * disambiguation in osc_release_bounce_pages(). + */ + SetPageChecked(data_page); + brwpg->pg = data_page; + /* there should be no gap in the middle of page array */ + if (i == page_count - 1) { + struct osc_async_page *oap = + brw_page2oap(brwpg); + + oa->o_size = oap->oap_count + + oap->oap_obj_off + oap->oap_page_off; + } + /* len is forced to nunits, and relative offset to 0 + * so store the old, clear text info + */ + brwpg->bp_count_diff = nunits - brwpg->count; + brwpg->count = nunits; + brwpg->bp_off_diff = brwpg->off & ~PAGE_MASK; + brwpg->off = brwpg->off & PAGE_MASK; + } + + if (pa) + OBD_FREE_PTR_ARRAY_LARGE(pa, page_count); + } else if (opc == OST_WRITE && inode && IS_ENCRYPTED(inode)) { + struct osc_async_page *oap = brw_page2oap(pga[0]); + struct cl_page *clpage = oap2cl_page(oap); + struct cl_object *clobj = clpage->cp_obj; + struct cl_attr attr = { 0 }; + struct lu_env *env; + __u16 refcheck; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) { + rc = PTR_ERR(env); + ptlrpc_request_free(req); + RETURN(rc); + } + + cl_object_attr_lock(clobj); + rc = cl_object_attr_get(env, clobj, &attr); + cl_object_attr_unlock(clobj); + cl_env_put(env, &refcheck); + if (rc != 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + if (attr.cat_size) + oa->o_size = attr.cat_size; + } else if (opc == OST_READ && inode && IS_ENCRYPTED(inode) && + llcrypt_has_encryption_key(inode)) { + for (i = 0; i < page_count; i++) { + struct brw_page *pg = pga[i]; + u32 nunits = (pg->off & ~PAGE_MASK) + pg->count; + + nunits = round_up(nunits, LUSTRE_ENCRYPTION_UNIT_SIZE); + /* count/off are forced to cover the whole encryption + * unit size so that all encrypted data is stored on the + * OST, so adjust bp_{count,off}_diff for the size of + * the clear text. 
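+			 *
+			 * E.g. (illustrative, assuming a 4KiB encryption
+			 * unit): a 1KiB read at offset 512 within the page
+			 * gives nunits = round_up(512 + 1024, 4096) = 4096,
+			 * bp_count_diff = 4096 - 1024 = 3072 and
+			 * bp_off_diff = 512; the full unit travels on the
+			 * wire and the clear-text geometry is restored in
+			 * osc_release_bounce_pages().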
+ */
+ pg->bp_count_diff = nunits - pg->count;
+ pg->count = nunits;
+ pg->bp_off_diff = pg->off & ~PAGE_MASK;
+ pg->off = pg->off & PAGE_MASK;
+ }
+ }
+
+ for (niocount = i = 1; i < page_count; i++) {
+ if (!can_merge_pages(pga[i - 1], pga[i]))
+ niocount++;
+ }
+
+ pill = &req->rq_pill;
+ req_capsule_set_size(pill, &RMF_OBD_IOOBJ, RCL_CLIENT,
+ sizeof(*ioobj));
+ req_capsule_set_size(pill, &RMF_NIOBUF_REMOTE, RCL_CLIENT,
+ niocount * sizeof(*niobuf));
+
+ for (i = 0; i < page_count; i++) {
+ short_io_size += pga[i]->count;
+ if (!inode || !IS_ENCRYPTED(inode) ||
+ !llcrypt_has_encryption_key(inode)) {
+ pga[i]->bp_count_diff = 0;
+ pga[i]->bp_off_diff = 0;
+ }
+ }
+
+ if (brw_page2oap(pga[0])->oap_brw_flags & OBD_BRW_RDMA_ONLY) {
+ enable_checksum = false;
+ short_io_size = 0;
+ }
+
+ /* Check if read/write is small enough to be a short io. */
+ if (short_io_size > cli->cl_max_short_io_bytes || niocount > 1 ||
+ !imp_connect_shortio(cli->cl_import))
+ short_io_size = 0;
+
+ /* If this is an empty RPC to an old server, just ignore it */
+ if (!short_io_size && !pga[0]->pg) {
+ ptlrpc_request_free(req);
+ RETURN(-ENODATA);
+ }
+
+ req_capsule_set_size(pill, &RMF_SHORT_IO, RCL_CLIENT,
+ opc == OST_READ ? 0 : short_io_size);
+ if (opc == OST_READ)
+ req_capsule_set_size(pill, &RMF_SHORT_IO, RCL_SERVER,
+ short_io_size);
+
+ rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, opc);
+ if (rc) {
+ ptlrpc_request_free(req);
+ RETURN(rc);
+ }
+ osc_set_io_portal(req);
+
+ ptlrpc_at_set_req_timeout(req);
+ /* ask ptlrpc not to resend on EINPROGRESS since BRWs have their own
+ * retry logic */
+ req->rq_no_retry_einprogress = 1;
+
+ if (short_io_size != 0) {
+ desc = NULL;
+ short_io_buf = NULL;
+ goto no_bulk;
+ }
+
+ desc = ptlrpc_prep_bulk_imp(req, page_count,
+ cli->cl_import->imp_connect_data.ocd_brw_size >> LNET_MTU_BITS,
+ (opc == OST_WRITE ? PTLRPC_BULK_GET_SOURCE :
+ PTLRPC_BULK_PUT_SINK),
+ OST_BULK_PORTAL,
+ &ptlrpc_bulk_kiov_pin_ops);
+
+ if (desc == NULL)
+ GOTO(out, rc = -ENOMEM);
+ /* NB request now owns desc and will free it when it gets freed */
+no_bulk:
+ body = req_capsule_client_get(pill, &RMF_OST_BODY);
+ ioobj = req_capsule_client_get(pill, &RMF_OBD_IOOBJ);
+ niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE);
+ LASSERT(body != NULL && ioobj != NULL && niobuf != NULL);
+
+ lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
+
+ /* For READ and WRITE, we can't fill o_uid and o_gid using from_kuid()
+ * and from_kgid(), because they are asynchronous. Fortunately, variable
+ * oa contains valid o_uid and o_gid in these two operations.
+ * Besides, filling o_uid and o_gid is enough for nrs-tbf, see LU-9658.
+ * OBD_MD_FLUID and OBD_MD_FLGID are not set in order to avoid breaking
+ * other process logic */
+ body->oa.o_uid = oa->o_uid;
+ body->oa.o_gid = oa->o_gid;
+
+ obdo_to_ioobj(oa, ioobj);
+ ioobj->ioo_bufcnt = niocount;
+ /* The high bits of ioo_max_brw tell the server the _maximum_ number
+ * of bulks that might be sent for this request. The actual number is
+ * decided when the RPC is finally sent in ptlrpc_register_bulk(). It
+ * sends "max - 1" for compatibility with old clients that send "0",
+ * and also so that the actual maximum is a power-of-two number, not
+ * one less.
LU-1431 */ + if (desc != NULL) + ioobj_max_brw_set(ioobj, desc->bd_md_max_brw); + else /* short io */ + ioobj_max_brw_set(ioobj, 0); + + if (inode && IS_ENCRYPTED(inode) && + llcrypt_has_encryption_key(inode) && + !OBD_FAIL_CHECK(OBD_FAIL_LFSCK_NO_ENCFLAG)) { + if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) { + body->oa.o_valid |= OBD_MD_FLFLAGS; + body->oa.o_flags = 0; + } + body->oa.o_flags |= LUSTRE_ENCRYPT_FL; + } + + if (short_io_size != 0) { + if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) { + body->oa.o_valid |= OBD_MD_FLFLAGS; + body->oa.o_flags = 0; + } + body->oa.o_flags |= OBD_FL_SHORT_IO; + CDEBUG(D_CACHE, "Using short io for data transfer, size = %d\n", + short_io_size); + if (opc == OST_WRITE) { + short_io_buf = req_capsule_client_get(pill, + &RMF_SHORT_IO); + LASSERT(short_io_buf != NULL); + } + } + + LASSERT(page_count > 0); + pg_prev = pga[0]; + for (requested_nob = i = 0; i < page_count; i++, niobuf++) { + struct brw_page *pg = pga[i]; + int poff = pg->off & ~PAGE_MASK; + + LASSERT(pg->count > 0); + /* make sure there is no gap in the middle of page array */ + LASSERTF(page_count == 1 || + (ergo(i == 0, poff + pg->count == PAGE_SIZE) && + ergo(i > 0 && i < page_count - 1, + poff == 0 && pg->count == PAGE_SIZE) && + ergo(i == page_count - 1, poff == 0)), + "i: %d/%d pg: %p off: %llu, count: %u\n", + i, page_count, pg, pg->off, pg->count); + LASSERTF(i == 0 || pg->off > pg_prev->off, + "i %d p_c %u pg %p [pri %lu ind %lu] off %llu" + " prev_pg %p [pri %lu ind %lu] off %llu\n", + i, page_count, + pg->pg, page_private(pg->pg), pg->pg->index, pg->off, + pg_prev->pg, page_private(pg_prev->pg), + pg_prev->pg->index, pg_prev->off); + LASSERT((pga[0]->flag & OBD_BRW_SRVLOCK) == + (pg->flag & OBD_BRW_SRVLOCK)); + if (short_io_size != 0 && opc == OST_WRITE) { + unsigned char *ptr = kmap_atomic(pg->pg); + + LASSERT(short_io_size >= requested_nob + pg->count); + memcpy(short_io_buf + requested_nob, + ptr + poff, + pg->count); + kunmap_atomic(ptr); + } else if (short_io_size == 0) { + desc->bd_frag_ops->add_kiov_frag(desc, pg->pg, poff, + pg->count); + } + requested_nob += pg->count; + + if (i > 0 && can_merge_pages(pg_prev, pg)) { + niobuf--; + niobuf->rnb_len += pg->count; + } else { + niobuf->rnb_offset = pg->off; + niobuf->rnb_len = pg->count; + niobuf->rnb_flags = pg->flag; + } + pg_prev = pg; + } + + LASSERTF((void *)(niobuf - niocount) == + req_capsule_client_get(&req->rq_pill, &RMF_NIOBUF_REMOTE), + "want %p - real %p\n", req_capsule_client_get(&req->rq_pill, + &RMF_NIOBUF_REMOTE), (void *)(niobuf - niocount)); + + osc_announce_cached(cli, &body->oa, opc == OST_WRITE ? 
requested_nob:0); + if (resend) { + if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) { + body->oa.o_valid |= OBD_MD_FLFLAGS; + body->oa.o_flags = 0; + } + body->oa.o_flags |= OBD_FL_RECOV_RESEND; + } + + if (osc_should_shrink_grant(cli)) + osc_shrink_grant_local(cli, &body->oa); + + if (!cli->cl_checksum || sptlrpc_flavor_has_bulk(&req->rq_flvr)) + enable_checksum = false; + + /* size[REQ_REC_OFF] still sizeof (*body) */ + if (opc == OST_WRITE) { + if (enable_checksum) { + /* store cl_cksum_type in a local variable since + * it can be changed via lprocfs */ + enum cksum_types cksum_type = cli->cl_cksum_type; + + if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) + body->oa.o_flags = 0; + + body->oa.o_flags |= obd_cksum_type_pack(obd_name, + cksum_type); + body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; + + rc = osc_checksum_bulk_rw(obd_name, cksum_type, + requested_nob, page_count, + pga, OST_WRITE, + &body->oa.o_cksum, resend); + if (rc < 0) { + CDEBUG(D_PAGE, "failed to checksum: rc = %d\n", + rc); + GOTO(out, rc); + } + CDEBUG(D_PAGE | (resend ? D_HA : 0), + "checksum at write origin: %x (%x)\n", + body->oa.o_cksum, cksum_type); + + /* save this in 'oa', too, for later checking */ + oa->o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; + oa->o_flags |= obd_cksum_type_pack(obd_name, + cksum_type); + } else { + /* clear out the checksum flag, in case this is a + * resend but cl_checksum is no longer set. b=11238 */ + oa->o_valid &= ~OBD_MD_FLCKSUM; + } + oa->o_cksum = body->oa.o_cksum; + /* 1 RC per niobuf */ + req_capsule_set_size(pill, &RMF_RCS, RCL_SERVER, + sizeof(__u32) * niocount); + } else { + if (enable_checksum) { + if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) + body->oa.o_flags = 0; + body->oa.o_flags |= obd_cksum_type_pack(obd_name, + cli->cl_cksum_type); + body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; + } + + /* Client cksum has been already copied to wire obdo in previous + * lustre_set_wire_obdo(), and in the case a bulk-read is being + * resent due to cksum error, this will allow Server to + * check+dump pages on its side */ + } + ptlrpc_request_set_replen(req); + + aa = ptlrpc_req_async_args(aa, req); + aa->aa_oa = oa; + aa->aa_requested_nob = requested_nob; + aa->aa_nio_count = niocount; + aa->aa_page_count = page_count; + aa->aa_resends = 0; + aa->aa_ppga = pga; + aa->aa_cli = cli; + INIT_LIST_HEAD(&aa->aa_oaps); + + *reqp = req; + niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE); + CDEBUG(D_RPCTRACE, "brw rpc %p - object "DOSTID" offset %lld<>%lld\n", + req, POSTID(&oa->o_oi), niobuf[0].rnb_offset, + niobuf[niocount - 1].rnb_offset + niobuf[niocount - 1].rnb_len); + RETURN(0); + + out: + ptlrpc_req_finished(req); + RETURN(rc); +} + +char dbgcksum_file_name[PATH_MAX]; + +static void dump_all_bulk_pages(struct obdo *oa, __u32 page_count, + struct brw_page **pga, __u32 server_cksum, + __u32 client_cksum) +{ + struct file *filp; + int rc, i; + unsigned int len; + char *buf; + + /* will only keep dump of pages on first error for the same range in + * file/fid, not during the resends/retries. */ + snprintf(dbgcksum_file_name, sizeof(dbgcksum_file_name), + "%s-checksum_dump-osc-"DFID":[%llu-%llu]-%x-%x", + (strncmp(libcfs_debug_file_path, "NONE", 4) != 0 ? + libcfs_debug_file_path : LIBCFS_DEBUG_FILE_PATH_DEFAULT), + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : 0ULL, + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0, + oa->o_valid & OBD_MD_FLFID ? 
oa->o_parent_ver : 0, + pga[0]->off, + pga[page_count-1]->off + pga[page_count-1]->count - 1, + client_cksum, server_cksum); + CWARN("dumping checksum data to %s\n", dbgcksum_file_name); + filp = filp_open(dbgcksum_file_name, + O_CREAT | O_EXCL | O_WRONLY | O_LARGEFILE, 0600); + if (IS_ERR(filp)) { + rc = PTR_ERR(filp); + if (rc == -EEXIST) + CDEBUG(D_INFO, "%s: can't open to dump pages with " + "checksum error: rc = %d\n", dbgcksum_file_name, + rc); + else + CERROR("%s: can't open to dump pages with checksum " + "error: rc = %d\n", dbgcksum_file_name, rc); + return; + } + + for (i = 0; i < page_count; i++) { + len = pga[i]->count; + buf = kmap(pga[i]->pg); + while (len != 0) { + rc = cfs_kernel_write(filp, buf, len, &filp->f_pos); + if (rc < 0) { + CERROR("%s: wanted to write %u but got %d " + "error\n", dbgcksum_file_name, len, rc); + break; + } + len -= rc; + buf += rc; + } + kunmap(pga[i]->pg); + } + + rc = vfs_fsync_range(filp, 0, LLONG_MAX, 1); + if (rc) + CERROR("%s: sync returns %d\n", dbgcksum_file_name, rc); + filp_close(filp, NULL); + + libcfs_debug_dumplog(); +} + +static int +check_write_checksum(struct obdo *oa, const struct lnet_processid *peer, + __u32 client_cksum, __u32 server_cksum, + struct osc_brw_async_args *aa) +{ + const char *obd_name = aa->aa_cli->cl_import->imp_obd->obd_name; + enum cksum_types cksum_type; + obd_dif_csum_fn *fn = NULL; + int sector_size = 0; + __u32 new_cksum; + char *msg; + int rc; + + if (server_cksum == client_cksum) { + CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum); + return 0; + } + + if (aa->aa_cli->cl_checksum_dump) + dump_all_bulk_pages(oa, aa->aa_page_count, aa->aa_ppga, + server_cksum, client_cksum); + + cksum_type = obd_cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ? + oa->o_flags : 0); + + switch (cksum_type) { + case OBD_CKSUM_T10IP512: + fn = obd_dif_ip_fn; + sector_size = 512; + break; + case OBD_CKSUM_T10IP4K: + fn = obd_dif_ip_fn; + sector_size = 4096; + break; + case OBD_CKSUM_T10CRC512: + fn = obd_dif_crc_fn; + sector_size = 512; + break; + case OBD_CKSUM_T10CRC4K: + fn = obd_dif_crc_fn; + sector_size = 4096; + break; + default: + break; + } + + if (fn) + rc = osc_checksum_bulk_t10pi(obd_name, aa->aa_requested_nob, + aa->aa_page_count, aa->aa_ppga, + OST_WRITE, fn, sector_size, + &new_cksum, true); + else + rc = osc_checksum_bulk(aa->aa_requested_nob, aa->aa_page_count, + aa->aa_ppga, OST_WRITE, cksum_type, + &new_cksum); + + if (rc < 0) + msg = "failed to calculate the client write checksum"; + else if (cksum_type != obd_cksum_type_unpack(aa->aa_oa->o_flags)) + msg = "the server did not use the checksum type specified in " + "the original request - likely a protocol problem"; + else if (new_cksum == server_cksum) + msg = "changed on the client after we checksummed it - " + "likely false positive due to mmap IO (bug 11742)"; + else if (new_cksum == client_cksum) + msg = "changed in transit before arrival at OST"; + else + msg = "changed in transit AND doesn't match the original - " + "likely false positive due to mmap IO (bug 11742)"; + + LCONSOLE_ERROR_MSG(0x132, "%s: BAD WRITE CHECKSUM: %s: from %s inode " + DFID " object "DOSTID" extent [%llu-%llu], original " + "client csum %x (type %x), server csum %x (type %x)," + " client csum now %x\n", + obd_name, msg, libcfs_nidstr(&peer->nid), + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : (__u64)0, + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0, + oa->o_valid & OBD_MD_FLFID ? 
oa->o_parent_ver : 0, + POSTID(&oa->o_oi), aa->aa_ppga[0]->off, + aa->aa_ppga[aa->aa_page_count - 1]->off + + aa->aa_ppga[aa->aa_page_count-1]->count - 1, + client_cksum, + obd_cksum_type_unpack(aa->aa_oa->o_flags), + server_cksum, cksum_type, new_cksum); + return 1; +} + +/* Note rc enters this function as number of bytes transferred */ +static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) +{ + struct osc_brw_async_args *aa = (void *)&req->rq_async_args; + struct client_obd *cli = aa->aa_cli; + const char *obd_name = cli->cl_import->imp_obd->obd_name; + const struct lnet_processid *peer = + &req->rq_import->imp_connection->c_peer; + struct ost_body *body; + u32 client_cksum = 0; + struct inode *inode = NULL; + unsigned int blockbits = 0, blocksize = 0; + struct cl_page *clpage; + + ENTRY; + + if (rc < 0 && rc != -EDQUOT) { + DEBUG_REQ(D_INFO, req, "Failed request: rc = %d", rc); + RETURN(rc); + } + + LASSERTF(req->rq_repmsg != NULL, "rc = %d\n", rc); + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) { + DEBUG_REQ(D_INFO, req, "cannot unpack body"); + RETURN(-EPROTO); + } + + /* set/clear over quota flag for a uid/gid/projid */ + if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE && + body->oa.o_valid & (OBD_MD_FLALLQUOTA)) { + unsigned qid[LL_MAXQUOTAS] = { + body->oa.o_uid, body->oa.o_gid, + body->oa.o_projid }; + CDEBUG(D_QUOTA, + "setdq for [%u %u %u] with valid %#llx, flags %x\n", + body->oa.o_uid, body->oa.o_gid, body->oa.o_projid, + body->oa.o_valid, body->oa.o_flags); + osc_quota_setdq(cli, req->rq_xid, qid, body->oa.o_valid, + body->oa.o_flags); + } + + osc_update_grant(cli, body); + + if (rc < 0) + RETURN(rc); + + if (aa->aa_oa->o_valid & OBD_MD_FLCKSUM) + client_cksum = aa->aa_oa->o_cksum; /* save for later */ + + if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) { + if (rc > 0) { + CERROR("%s: unexpected positive size %d\n", + obd_name, rc); + RETURN(-EPROTO); + } + + if (req->rq_bulk != NULL && + sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk)) + RETURN(-EAGAIN); + + if ((aa->aa_oa->o_valid & OBD_MD_FLCKSUM) && client_cksum && + check_write_checksum(&body->oa, peer, client_cksum, + body->oa.o_cksum, aa)) + RETURN(-EAGAIN); + + rc = check_write_rcs(req, aa->aa_requested_nob, + aa->aa_nio_count, aa->aa_page_count, + aa->aa_ppga); + GOTO(out, rc); + } + + /* The rest of this function executes only for OST_READs */ + + if (req->rq_bulk == NULL) { + rc = req_capsule_get_size(&req->rq_pill, &RMF_SHORT_IO, + RCL_SERVER); + LASSERT(rc == req->rq_status); + } else { + /* if unwrap_bulk failed, return -EAGAIN to retry */ + rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, rc); + } + if (rc < 0) + GOTO(out, rc = -EAGAIN); + + if (rc > aa->aa_requested_nob) { + CERROR("%s: unexpected size %d, requested %d\n", obd_name, + rc, aa->aa_requested_nob); + RETURN(-EPROTO); + } + + if (req->rq_bulk != NULL && rc != req->rq_bulk->bd_nob_transferred) { + CERROR("%s: unexpected size %d, transferred %d\n", obd_name, + rc, req->rq_bulk->bd_nob_transferred); + RETURN(-EPROTO); + } + + if (req->rq_bulk == NULL) { + /* short io */ + int nob, pg_count, i = 0; + unsigned char *buf; + + CDEBUG(D_CACHE, "Using short io read, size %d\n", rc); + pg_count = aa->aa_page_count; + buf = req_capsule_server_sized_get(&req->rq_pill, &RMF_SHORT_IO, + rc); + nob = rc; + while (nob > 0 && pg_count > 0) { + unsigned char *ptr; + int count = aa->aa_ppga[i]->count > nob ? 
+ nob : aa->aa_ppga[i]->count; + + CDEBUG(D_CACHE, "page %p count %d\n", + aa->aa_ppga[i]->pg, count); + ptr = kmap_atomic(aa->aa_ppga[i]->pg); + memcpy(ptr + (aa->aa_ppga[i]->off & ~PAGE_MASK), buf, + count); + kunmap_atomic((void *) ptr); + + buf += count; + nob -= count; + i++; + pg_count--; + } + } + + if (rc < aa->aa_requested_nob) + handle_short_read(rc, aa->aa_page_count, aa->aa_ppga); + + if (body->oa.o_valid & OBD_MD_FLCKSUM) { + static int cksum_counter; + u32 server_cksum = body->oa.o_cksum; + int nob = rc; + char *via = ""; + char *router = ""; + enum cksum_types cksum_type; + u32 o_flags = body->oa.o_valid & OBD_MD_FLFLAGS ? + body->oa.o_flags : 0; + + cksum_type = obd_cksum_type_unpack(o_flags); + rc = osc_checksum_bulk_rw(obd_name, cksum_type, nob, + aa->aa_page_count, aa->aa_ppga, + OST_READ, &client_cksum, false); + if (rc < 0) + GOTO(out, rc); + + if (req->rq_bulk != NULL && + lnet_nid_to_nid4(&peer->nid) != req->rq_bulk->bd_sender) { + via = " via "; + router = libcfs_nid2str(req->rq_bulk->bd_sender); + } + + if (server_cksum != client_cksum) { + struct ost_body *clbody; + __u32 client_cksum2; + u32 page_count = aa->aa_page_count; + + osc_checksum_bulk_rw(obd_name, cksum_type, nob, + page_count, aa->aa_ppga, + OST_READ, &client_cksum2, true); + clbody = req_capsule_client_get(&req->rq_pill, + &RMF_OST_BODY); + if (cli->cl_checksum_dump) + dump_all_bulk_pages(&clbody->oa, page_count, + aa->aa_ppga, server_cksum, + client_cksum); + + LCONSOLE_ERROR_MSG(0x133, "%s: BAD READ CHECKSUM: from " + "%s%s%s inode "DFID" object "DOSTID + " extent [%llu-%llu], client %x/%x, " + "server %x, cksum_type %x\n", + obd_name, + libcfs_nidstr(&peer->nid), + via, router, + clbody->oa.o_valid & OBD_MD_FLFID ? + clbody->oa.o_parent_seq : 0ULL, + clbody->oa.o_valid & OBD_MD_FLFID ? + clbody->oa.o_parent_oid : 0, + clbody->oa.o_valid & OBD_MD_FLFID ? + clbody->oa.o_parent_ver : 0, + POSTID(&body->oa.o_oi), + aa->aa_ppga[0]->off, + aa->aa_ppga[page_count-1]->off + + aa->aa_ppga[page_count-1]->count - 1, + client_cksum, client_cksum2, + server_cksum, cksum_type); + cksum_counter = 0; + aa->aa_oa->o_cksum = client_cksum; + rc = -EAGAIN; + } else { + cksum_counter++; + CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum); + rc = 0; + } + } else if (unlikely(client_cksum)) { + static int cksum_missed; + + cksum_missed++; + if ((cksum_missed & (-cksum_missed)) == cksum_missed) + CERROR("%s: checksum %u requested from %s but not sent\n", + obd_name, cksum_missed, + libcfs_nidstr(&peer->nid)); + } else { + rc = 0; + } + + /* get the inode from the first cl_page */ + clpage = oap2cl_page(brw_page2oap(aa->aa_ppga[0])); + inode = clpage->cp_inode; + if (clpage->cp_type == CPT_TRANSIENT && inode) { + blockbits = inode->i_blkbits; + blocksize = 1 << blockbits; + } + if (inode && IS_ENCRYPTED(inode)) { + int idx; + + if (!llcrypt_has_encryption_key(inode)) { + CDEBUG(D_SEC, "no enc key for ino %lu\n", inode->i_ino); + GOTO(out, rc); + } + for (idx = 0; idx < aa->aa_page_count; idx++) { + struct brw_page *brwpg = aa->aa_ppga[idx]; + unsigned int offs = 0; + + while (offs < PAGE_SIZE) { + /* do not decrypt if page is all 0s */ + if (memchr_inv(page_address(brwpg->pg) + offs, + 0, LUSTRE_ENCRYPTION_UNIT_SIZE) == NULL) { + /* if page is empty forward info to + * upper layers (ll_io_zero_page) by + * clearing PagePrivate2 + */ + if (!offs) + ClearPagePrivate2(brwpg->pg); + break; + } + + if (blockbits) { + /* This is direct IO case. Directly call + * decrypt function that takes inode as + * input parameter. 
Page does not need
+ * to be locked.
+ */
+ u64 lblk_num;
+ unsigned int i;
+
+ clpage =
+ oap2cl_page(brw_page2oap(brwpg));
+ lblk_num =
+ ((u64)(clpage->cp_page_index) <<
+ (PAGE_SHIFT - blockbits)) +
+ (offs >> blockbits);
+ for (i = offs;
+ i < offs +
+ LUSTRE_ENCRYPTION_UNIT_SIZE;
+ i += blocksize, lblk_num++) {
+ rc =
+ llcrypt_decrypt_block_inplace(
+ inode, brwpg->pg,
+ blocksize, i,
+ lblk_num);
+ if (rc)
+ break;
+ }
+ } else {
+ rc = llcrypt_decrypt_pagecache_blocks(
+ brwpg->pg,
+ LUSTRE_ENCRYPTION_UNIT_SIZE,
+ offs);
+ }
+ if (rc)
+ GOTO(out, rc);
+
+ offs += LUSTRE_ENCRYPTION_UNIT_SIZE;
+ }
+ }
+ }
+
+out:
+ if (rc >= 0)
+ lustre_get_wire_obdo(&req->rq_import->imp_connect_data,
+ aa->aa_oa, &body->oa);
+
+ RETURN(rc);
+}
+
+static int osc_brw_redo_request(struct ptlrpc_request *request,
+ struct osc_brw_async_args *aa, int rc)
+{
+ struct ptlrpc_request *new_req;
+ struct osc_brw_async_args *new_aa;
+ struct osc_async_page *oap;
+ ENTRY;
+
+ /* The below message is checked in replay-ost-single.sh test_8ae */
+ DEBUG_REQ(rc == -EINPROGRESS ? D_RPCTRACE : D_ERROR, request,
+ "redo for recoverable error %d", rc);
+
+ rc = osc_brw_prep_request(lustre_msg_get_opc(request->rq_reqmsg) ==
+ OST_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ,
+ aa->aa_cli, aa->aa_oa, aa->aa_page_count,
+ aa->aa_ppga, &new_req, 1);
+ if (rc)
+ RETURN(rc);
+
+ list_for_each_entry(oap, &aa->aa_oaps, oap_rpc_item) {
+ if (oap->oap_request != NULL) {
+ LASSERTF(request == oap->oap_request,
+ "request %p != oap_request %p\n",
+ request, oap->oap_request);
+ }
+ }
+ /*
+ * New request takes over pga and oaps from old request.
+ * Note that copying a list_head doesn't work, need to move it...
+ */
+ aa->aa_resends++;
+ new_req->rq_interpret_reply = request->rq_interpret_reply;
+ new_req->rq_async_args = request->rq_async_args;
+ new_req->rq_commit_cb = request->rq_commit_cb;
+ /* cap resend delay to the current request timeout, this is similar to
+ * what ptlrpc does (see after_reply()) */
+ if (aa->aa_resends > new_req->rq_timeout)
+ new_req->rq_sent = ktime_get_real_seconds() + new_req->rq_timeout;
+ else
+ new_req->rq_sent = ktime_get_real_seconds() + aa->aa_resends;
+ new_req->rq_generation_set = 1;
+ new_req->rq_import_generation = request->rq_import_generation;
+
+ new_aa = ptlrpc_req_async_args(new_aa, new_req);
+
+ INIT_LIST_HEAD(&new_aa->aa_oaps);
+ list_splice_init(&aa->aa_oaps, &new_aa->aa_oaps);
+ INIT_LIST_HEAD(&new_aa->aa_exts);
+ list_splice_init(&aa->aa_exts, &new_aa->aa_exts);
+ new_aa->aa_resends = aa->aa_resends;
+
+ list_for_each_entry(oap, &new_aa->aa_oaps, oap_rpc_item) {
+ if (oap->oap_request) {
+ ptlrpc_req_finished(oap->oap_request);
+ oap->oap_request = ptlrpc_request_addref(new_req);
+ }
+ }
+
+ /* XXX: This code will run into problems if we are going to support
+ * adding a series of BRW RPCs into a self-defined ptlrpc_request_set
+ * and waiting for all of them to finish. We should inherit the
+ * request set from the old request. */
+ ptlrpcd_add_req(new_req);
+
+ DEBUG_REQ(D_INFO, new_req, "new request");
+ RETURN(0);
+}
+
+/*
+ * Ugh, we want disk allocation on the target to happen in offset order. We'll
+ * follow Sedgewick's advice and stick to the dead simple shellsort -- it'll
+ * do fine for our small page arrays and doesn't require allocation. It's an
+ * insertion sort that swaps elements that are strides apart, shrinking the
+ * stride down until it's '1' and the array is sorted.
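+ * As a worked example, for num = 50 the stride loop below generates
+ * 1, 4, 13, 40, 121 (Knuth's 3h+1 increments) and stops at 121;
+ * sorting passes then run with strides 40, 13, 4 and finally 1.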
+ */ +static void sort_brw_pages(struct brw_page **array, int num) +{ + int stride, i, j; + struct brw_page *tmp; + + if (num == 1) + return; + for (stride = 1; stride < num ; stride = (stride * 3) + 1) + ; + + do { + stride /= 3; + for (i = stride ; i < num ; i++) { + tmp = array[i]; + j = i; + while (j >= stride && array[j - stride]->off > tmp->off) { + array[j] = array[j - stride]; + j -= stride; + } + array[j] = tmp; + } + } while (stride > 1); +} + +static void osc_release_ppga(struct brw_page **ppga, size_t count) +{ + LASSERT(ppga != NULL); + OBD_FREE_PTR_ARRAY_LARGE(ppga, count); +} + +static int brw_interpret(const struct lu_env *env, + struct ptlrpc_request *req, void *args, int rc) +{ + struct osc_brw_async_args *aa = args; + struct osc_extent *ext; + struct osc_extent *tmp; + struct client_obd *cli = aa->aa_cli; + unsigned long transferred = 0; + + ENTRY; + + rc = osc_brw_fini_request(req, rc); + CDEBUG(D_INODE, "request %p aa %p rc %d\n", req, aa, rc); + + /* restore clear text pages */ + osc_release_bounce_pages(aa->aa_ppga, aa->aa_page_count); + + /* + * When server returns -EINPROGRESS, client should always retry + * regardless of the number of times the bulk was resent already. + */ + if (osc_recoverable_error(rc) && !req->rq_no_delay) { + if (req->rq_import_generation != + req->rq_import->imp_generation) { + CDEBUG(D_HA, "%s: resend cross eviction for object: " + ""DOSTID", rc = %d.\n", + req->rq_import->imp_obd->obd_name, + POSTID(&aa->aa_oa->o_oi), rc); + } else if (rc == -EINPROGRESS || + client_should_resend(aa->aa_resends, aa->aa_cli)) { + rc = osc_brw_redo_request(req, aa, rc); + } else { + CERROR("%s: too many resent retries for object: " + "%llu:%llu, rc = %d.\n", + req->rq_import->imp_obd->obd_name, + POSTID(&aa->aa_oa->o_oi), rc); + } + + if (rc == 0) + RETURN(0); + else if (rc == -EAGAIN || rc == -EINPROGRESS) + rc = -EIO; + } + + if (rc == 0) { + struct obdo *oa = aa->aa_oa; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + unsigned long valid = 0; + struct cl_object *obj; + struct osc_async_page *last; + + last = brw_page2oap(aa->aa_ppga[aa->aa_page_count - 1]); + obj = osc2cl(last->oap_obj); + + cl_object_attr_lock(obj); + if (oa->o_valid & OBD_MD_FLBLOCKS) { + attr->cat_blocks = oa->o_blocks; + valid |= CAT_BLOCKS; + } + if (oa->o_valid & OBD_MD_FLMTIME) { + attr->cat_mtime = oa->o_mtime; + valid |= CAT_MTIME; + } + if (oa->o_valid & OBD_MD_FLATIME) { + attr->cat_atime = oa->o_atime; + valid |= CAT_ATIME; + } + if (oa->o_valid & OBD_MD_FLCTIME) { + attr->cat_ctime = oa->o_ctime; + valid |= CAT_CTIME; + } + + if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) { + struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; + loff_t last_off = last->oap_count + last->oap_obj_off + + last->oap_page_off; + + /* Change file size if this is an out of quota or + * direct IO write and it extends the file size */ + if (loi->loi_lvb.lvb_size < last_off) { + attr->cat_size = last_off; + valid |= CAT_SIZE; + } + /* Extend KMS if it's not a lockless write */ + if (loi->loi_kms < last_off && + oap2osc_page(last)->ops_srvlock == 0) { + attr->cat_kms = last_off; + valid |= CAT_KMS; + } + } + + if (valid != 0) + cl_object_attr_update(env, obj, attr, valid); + cl_object_attr_unlock(obj); + } + OBD_SLAB_FREE_PTR(aa->aa_oa, osc_obdo_kmem); + aa->aa_oa = NULL; + + if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE && rc == 0) + osc_inc_unstable_pages(req); + + list_for_each_entry_safe(ext, tmp, &aa->aa_exts, oe_link) { + list_del_init(&ext->oe_link); + osc_extent_finish(env, ext, 
1, + rc && req->rq_no_delay ? -EAGAIN : rc); + } + LASSERT(list_empty(&aa->aa_exts)); + LASSERT(list_empty(&aa->aa_oaps)); + + transferred = (req->rq_bulk == NULL ? /* short io */ + aa->aa_requested_nob : + req->rq_bulk->bd_nob_transferred); + + osc_release_ppga(aa->aa_ppga, aa->aa_page_count); + ptlrpc_lprocfs_brw(req, transferred); + + spin_lock(&cli->cl_loi_list_lock); + /* We need to decrement before osc_ap_completion->osc_wake_cache_waiters + * is called so we know whether to go to sync BRWs or wait for more + * RPCs to complete */ + if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) + cli->cl_w_in_flight--; + else + cli->cl_r_in_flight--; + osc_wake_cache_waiters(cli); + spin_unlock(&cli->cl_loi_list_lock); + + osc_io_unplug(env, cli, NULL); + RETURN(rc); +} + +static void brw_commit(struct ptlrpc_request *req) +{ + /* If osc_inc_unstable_pages (via osc_extent_finish) races with + * this called via the rq_commit_cb, I need to ensure + * osc_dec_unstable_pages is still called. Otherwise unstable + * pages may be leaked. */ + spin_lock(&req->rq_lock); + if (likely(req->rq_unstable)) { + req->rq_unstable = 0; + spin_unlock(&req->rq_lock); + + osc_dec_unstable_pages(req); + } else { + req->rq_committed = 1; + spin_unlock(&req->rq_lock); + } +} + +/** + * Build an RPC by the list of extent @ext_list. The caller must ensure + * that the total pages in this list are NOT over max pages per RPC. + * Extents in the list must be in OES_RPC state. + */ +int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, + struct list_head *ext_list, int cmd) +{ + struct ptlrpc_request *req = NULL; + struct osc_extent *ext; + struct brw_page **pga = NULL; + struct osc_brw_async_args *aa = NULL; + struct obdo *oa = NULL; + struct osc_async_page *oap; + struct osc_object *obj = NULL; + struct cl_req_attr *crattr = NULL; + loff_t starting_offset = OBD_OBJECT_EOF; + loff_t ending_offset = 0; + /* '1' for consistency with code that checks !mpflag to restore */ + int mpflag = 1; + int mem_tight = 0; + int page_count = 0; + bool soft_sync = false; + bool ndelay = false; + int i; + int grant = 0; + int rc; + __u32 layout_version = 0; + LIST_HEAD(rpc_list); + struct ost_body *body; + ENTRY; + LASSERT(!list_empty(ext_list)); + + /* add pages into rpc_list to build BRW rpc */ + list_for_each_entry(ext, ext_list, oe_link) { + LASSERT(ext->oe_state == OES_RPC); + mem_tight |= ext->oe_memalloc; + grant += ext->oe_grants; + page_count += ext->oe_nr_pages; + layout_version = max(layout_version, ext->oe_layout_version); + if (obj == NULL) + obj = ext->oe_obj; + } + + soft_sync = osc_over_unstable_soft_limit(cli); + if (mem_tight) + mpflag = memalloc_noreclaim_save(); + + OBD_ALLOC_PTR_ARRAY_LARGE(pga, page_count); + if (pga == NULL) + GOTO(out, rc = -ENOMEM); + + OBD_SLAB_ALLOC_PTR_GFP(oa, osc_obdo_kmem, GFP_NOFS); + if (oa == NULL) + GOTO(out, rc = -ENOMEM); + + i = 0; + list_for_each_entry(ext, ext_list, oe_link) { + list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { + if (mem_tight) + oap->oap_brw_flags |= OBD_BRW_MEMALLOC; + if (soft_sync) + oap->oap_brw_flags |= OBD_BRW_SOFT_SYNC; + pga[i] = &oap->oap_brw_page; + pga[i]->off = oap->oap_obj_off + oap->oap_page_off; + i++; + + list_add_tail(&oap->oap_rpc_item, &rpc_list); + if (starting_offset == OBD_OBJECT_EOF || + starting_offset > oap->oap_obj_off) + starting_offset = oap->oap_obj_off; + else + LASSERT(oap->oap_page_off == 0); + if (ending_offset < oap->oap_obj_off + oap->oap_count) + ending_offset = oap->oap_obj_off + + oap->oap_count; + else + 
LASSERT(oap->oap_page_off + oap->oap_count ==
+ PAGE_SIZE);
+ }
+ if (ext->oe_ndelay)
+ ndelay = true;
+ }
+
+ /* first page in the list */
+ oap = list_first_entry(&rpc_list, typeof(*oap), oap_rpc_item);
+
+ crattr = &osc_env_info(env)->oti_req_attr;
+ memset(crattr, 0, sizeof(*crattr));
+ crattr->cra_type = (cmd & OBD_BRW_WRITE) ? CRT_WRITE : CRT_READ;
+ crattr->cra_flags = ~0ULL;
+ crattr->cra_page = oap2cl_page(oap);
+ crattr->cra_oa = oa;
+ cl_req_attr_set(env, osc2cl(obj), crattr);
+
+ if (cmd == OBD_BRW_WRITE) {
+ oa->o_grant_used = grant;
+ if (layout_version > 0) {
+ CDEBUG(D_LAYOUT, DFID": write with layout version %u\n",
+ PFID(&oa->o_oi.oi_fid), layout_version);
+
+ oa->o_layout_version = layout_version;
+ oa->o_valid |= OBD_MD_LAYOUT_VERSION;
+ }
+ }
+
+ sort_brw_pages(pga, page_count);
+ rc = osc_brw_prep_request(cmd, cli, oa, page_count, pga, &req, 0);
+ if (rc != 0) {
+ CERROR("prep_req failed: %d\n", rc);
+ GOTO(out, rc);
+ }
+
+ req->rq_commit_cb = brw_commit;
+ req->rq_interpret_reply = brw_interpret;
+ req->rq_memalloc = mem_tight != 0;
+ oap->oap_request = ptlrpc_request_addref(req);
+ if (ndelay) {
+ req->rq_no_resend = req->rq_no_delay = 1;
+ /* probably set a shorter timeout value to handle
+ * ETIMEDOUT in brw_interpret() correctly. */
+ /* lustre_msg_set_timeout(req, req->rq_timeout / 2); */
+ }
+
+ /* Need to update the timestamps after the request is built in case
+ * we race with setattr (locally or in queue at OST). If OST gets
+ * later setattr before earlier BRW (as determined by the request xid),
+ * the OST will not use BRW timestamps. Sadly, there is no obvious
+ * way to do this in a single call. bug 10150 */
+ body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
+ crattr->cra_oa = &body->oa;
+ crattr->cra_flags = OBD_MD_FLMTIME | OBD_MD_FLCTIME | OBD_MD_FLATIME;
+ cl_req_attr_set(env, osc2cl(obj), crattr);
+ lustre_msg_set_jobid(req->rq_reqmsg, crattr->cra_jobid);
+
+ aa = ptlrpc_req_async_args(aa, req);
+ INIT_LIST_HEAD(&aa->aa_oaps);
+ list_splice_init(&rpc_list, &aa->aa_oaps);
+ INIT_LIST_HEAD(&aa->aa_exts);
+ list_splice_init(ext_list, &aa->aa_exts);
+
+ spin_lock(&cli->cl_loi_list_lock);
+ starting_offset >>= PAGE_SHIFT;
+ if (cmd == OBD_BRW_READ) {
+ cli->cl_r_in_flight++;
+ lprocfs_oh_tally_log2(&cli->cl_read_page_hist, page_count);
+ lprocfs_oh_tally(&cli->cl_read_rpc_hist, cli->cl_r_in_flight);
+ lprocfs_oh_tally_log2(&cli->cl_read_offset_hist,
+ starting_offset + 1);
+ } else {
+ cli->cl_w_in_flight++;
+ lprocfs_oh_tally_log2(&cli->cl_write_page_hist, page_count);
+ lprocfs_oh_tally(&cli->cl_write_rpc_hist, cli->cl_w_in_flight);
+ lprocfs_oh_tally_log2(&cli->cl_write_offset_hist,
+ starting_offset + 1);
+ }
+ spin_unlock(&cli->cl_loi_list_lock);
+
+ DEBUG_REQ(D_INODE, req, "%d pages, aa %p, now %ur/%uw in flight",
+ page_count, aa, cli->cl_r_in_flight,
+ cli->cl_w_in_flight);
+ OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DELAY_IO, cfs_fail_val);
+
+ ptlrpcd_add_req(req);
+ rc = 0;
+ EXIT;
+
+out:
+ if (mem_tight)
+ memalloc_noreclaim_restore(mpflag);
+
+ if (rc != 0) {
+ LASSERT(req == NULL);
+
+ if (oa)
+ OBD_SLAB_FREE_PTR(oa, osc_obdo_kmem);
+ if (pga) {
+ osc_release_bounce_pages(pga, page_count);
+ osc_release_ppga(pga, page_count);
+ }
+ /* this should happen rarely and is pretty bad: it makes the
+ * pending list not follow the dirty order
+ */
+ while ((ext = list_first_entry_or_null(ext_list,
+ struct osc_extent,
+ oe_link)) != NULL) {
+ list_del_init(&ext->oe_link);
+ osc_extent_finish(env, ext, 0, rc);
+ }
+ }
+ RETURN(rc);
+}
+
+/* This
is to refresh our lock in the face of no RPCs. */
+void osc_send_empty_rpc(struct osc_object *osc, pgoff_t start)
+{
+ struct ptlrpc_request *req;
+ struct obdo oa;
+ struct brw_page bpg = { .off = start, .count = 1};
+ struct brw_page *pga = &bpg;
+ int rc;
+
+ memset(&oa, 0, sizeof(oa));
+ oa.o_oi = osc->oo_oinfo->loi_oi;
+ oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLFLAGS;
+ /* For updated servers - don't do a read */
+ oa.o_flags = OBD_FL_NORPC;
+
+ rc = osc_brw_prep_request(OBD_BRW_READ, osc_cli(osc), &oa, 1, &pga,
+ &req, 0);
+
+ /* If we succeeded we ship it off, if not there's no point in doing
+ * anything. Also no resends.
+ * No interpret callback, no commit callback.
+ */
+ if (!rc) {
+ req->rq_no_resend = 1;
+ ptlrpcd_add_req(req);
+ }
+}
+
+static int osc_set_lock_data(struct ldlm_lock *lock, void *data)
+{
+ int set = 0;
+
+ LASSERT(lock != NULL);
+
+ lock_res_and_lock(lock);
+
+ if (lock->l_ast_data == NULL)
+ lock->l_ast_data = data;
+ if (lock->l_ast_data == data)
+ set = 1;
+
+ unlock_res_and_lock(lock);
+
+ return set;
+}
+
+int osc_enqueue_fini(struct ptlrpc_request *req, osc_enqueue_upcall_f upcall,
+ void *cookie, struct lustre_handle *lockh,
+ enum ldlm_mode mode, __u64 *flags, bool speculative,
+ int errcode)
+{
+ bool intent = *flags & LDLM_FL_HAS_INTENT;
+ int rc;
+ ENTRY;
+
+ /* The request was created before the ldlm_cli_enqueue call. */
+ if (intent && errcode == ELDLM_LOCK_ABORTED) {
+ struct ldlm_reply *rep;
+
+ rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
+ LASSERT(rep != NULL);
+
+ rep->lock_policy_res1 =
+ ptlrpc_status_ntoh(rep->lock_policy_res1);
+ if (rep->lock_policy_res1)
+ errcode = rep->lock_policy_res1;
+ if (!speculative)
+ *flags |= LDLM_FL_LVB_READY;
+ } else if (errcode == ELDLM_OK) {
+ *flags |= LDLM_FL_LVB_READY;
+ }
+
+ /* Call the update callback. */
+ rc = (*upcall)(cookie, lockh, errcode);
+
+ /* release the reference taken in ldlm_cli_enqueue() */
+ if (errcode == ELDLM_LOCK_MATCHED)
+ errcode = ELDLM_OK;
+ if (errcode == ELDLM_OK && lustre_handle_is_used(lockh))
+ ldlm_lock_decref(lockh, mode);
+
+ RETURN(rc);
+}
+
+int osc_enqueue_interpret(const struct lu_env *env, struct ptlrpc_request *req,
+ void *args, int rc)
+{
+ struct osc_enqueue_args *aa = args;
+ struct ldlm_lock *lock;
+ struct lustre_handle *lockh = &aa->oa_lockh;
+ enum ldlm_mode mode = aa->oa_mode;
+ struct ost_lvb *lvb = aa->oa_lvb;
+ __u32 lvb_len = sizeof(*lvb);
+ __u64 flags = 0;
+ struct ldlm_enqueue_info einfo = {
+ .ei_type = aa->oa_type,
+ .ei_mode = mode,
+ };
+
+ ENTRY;
+
+ /* ldlm_cli_enqueue is holding a reference on the lock, so it must
+ * be valid. */
+ lock = ldlm_handle2lock(lockh);
+ LASSERTF(lock != NULL,
+ "lockh %#llx, req %p, aa %p - client evicted?\n",
+ lockh->cookie, req, aa);
+
+ /* Take an additional reference so that a blocking AST that
+ * ldlm_cli_enqueue_fini() might post for a failed lock is guaranteed
+ * to arrive after an upcall has been executed by
+ * osc_enqueue_fini(). */
+ ldlm_lock_addref(lockh, mode);
+
+ /* Let cl_lock_state_wait fail with -ERESTARTSYS to unuse sublocks. */
+ OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_ENQUEUE_HANG, 2);
+
+ /* Let the CP AST grant the lock first. */
+ OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 1);
+
+ if (aa->oa_speculative) {
+ LASSERT(aa->oa_lvb == NULL);
+ LASSERT(aa->oa_flags == NULL);
+ aa->oa_flags = &flags;
+ }
+
+ /* Complete obtaining the lock procedure. */
+ rc = ldlm_cli_enqueue_fini(aa->oa_exp, req, &einfo, 1, aa->oa_flags,
+ lvb, lvb_len, lockh, rc, false);
+ /* Complete osc stuff. */
+ rc = osc_enqueue_fini(req, aa->oa_upcall, aa->oa_cookie, lockh, mode,
+ aa->oa_flags, aa->oa_speculative, rc);
+
+ OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10);
+
+ ldlm_lock_decref(lockh, mode);
+ LDLM_LOCK_PUT(lock);
+ RETURN(rc);
+}
+
+/* When enqueuing asynchronously, locks are not ordered, so we can obtain a
+ * lock from the 2nd OSC before a lock from the 1st one. This does not
+ * deadlock with other synchronous requests, however keeping some locks and
+ * trying to obtain others may take a considerable amount of time in the case
+ * of OST failure; and when a client does not release a lock that other sync
+ * requests are waiting for, the client is evicted from the cluster -- such
+ * scenarios make life difficult, so release locks just after they are
+ * obtained. */
+int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id,
+ __u64 *flags, union ldlm_policy_data *policy,
+ struct ost_lvb *lvb, osc_enqueue_upcall_f upcall,
+ void *cookie, struct ldlm_enqueue_info *einfo,
+ struct ptlrpc_request_set *rqset, int async,
+ bool speculative)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct lustre_handle lockh = { 0 };
+ struct ptlrpc_request *req = NULL;
+ int intent = *flags & LDLM_FL_HAS_INTENT;
+ __u64 match_flags = *flags;
+ enum ldlm_mode mode;
+ int rc;
+ ENTRY;
+
+ /* Filesystem lock extents are extended to page boundaries so that
+ * dealing with the page cache is a little smoother. */
+ policy->l_extent.start -= policy->l_extent.start & ~PAGE_MASK;
+ policy->l_extent.end |= ~PAGE_MASK;
+
+ /* Next, search for already existing extent locks that will cover us */
+ /* If we're trying to read, we also search for an existing PW lock. The
+ * VFS and page cache already protect us locally, so lots of readers/
+ * writers can share a single PW lock.
+ *
+ * There are problems with conversion deadlocks, so instead of
+ * converting a read lock to a write lock, we'll just enqueue a new
+ * one.
+ *
+ * At some point we should cancel the read lock instead of making them
+ * send us a blocking callback, but there are problems with canceling
+ * locks out from other users right now, too. */
+ mode = einfo->ei_mode;
+ if (einfo->ei_mode == LCK_PR)
+ mode |= LCK_PW;
+ /* Normal lock requests must wait for the LVB to be ready before
+ * matching a lock; speculative lock requests do not need to,
+ * because they will not actually use the lock. */
+ if (!speculative)
+ match_flags |= LDLM_FL_LVB_READY;
+ if (intent != 0)
+ match_flags |= LDLM_FL_BLOCK_GRANTED;
+ mode = ldlm_lock_match(obd->obd_namespace, match_flags, res_id,
+ einfo->ei_type, policy, mode, &lockh);
+ if (mode) {
+ struct ldlm_lock *matched;
+
+ if (*flags & LDLM_FL_TEST_LOCK)
+ RETURN(ELDLM_OK);
+
+ matched = ldlm_handle2lock(&lockh);
+ if (speculative) {
+ /* This DLM lock request is speculative, and does not
+ * have an associated IO request. Therefore if there
+ * is already a DLM lock, it will just inform the
+ * caller to cancel the request for this stripe. */
+ lock_res_and_lock(matched);
+ if (ldlm_extent_equal(&policy->l_extent,
+ &matched->l_policy_data.l_extent))
+ rc = -EEXIST;
+ else
+ rc = -ECANCELED;
+ unlock_res_and_lock(matched);
+
+ ldlm_lock_decref(&lockh, mode);
+ LDLM_LOCK_PUT(matched);
+ RETURN(rc);
+ } else if (osc_set_lock_data(matched, einfo->ei_cbdata)) {
+ *flags |= LDLM_FL_LVB_READY;
+
+ /* We already have a lock, and it's referenced.
*/ + (*upcall)(cookie, &lockh, ELDLM_LOCK_MATCHED); + + ldlm_lock_decref(&lockh, mode); + LDLM_LOCK_PUT(matched); + RETURN(ELDLM_OK); + } else { + ldlm_lock_decref(&lockh, mode); + LDLM_LOCK_PUT(matched); + } + } + + if (*flags & (LDLM_FL_TEST_LOCK | LDLM_FL_MATCH_LOCK)) + RETURN(-ENOLCK); + + /* users of osc_enqueue() can pass this flag for ldlm_lock_match() */ + *flags &= ~LDLM_FL_BLOCK_GRANTED; + + rc = ldlm_cli_enqueue(exp, &req, einfo, res_id, policy, flags, lvb, + sizeof(*lvb), LVB_T_OST, &lockh, async); + if (async) { + if (!rc) { + struct osc_enqueue_args *aa; + aa = ptlrpc_req_async_args(aa, req); + aa->oa_exp = exp; + aa->oa_mode = einfo->ei_mode; + aa->oa_type = einfo->ei_type; + lustre_handle_copy(&aa->oa_lockh, &lockh); + aa->oa_upcall = upcall; + aa->oa_cookie = cookie; + aa->oa_speculative = speculative; + if (!speculative) { + aa->oa_flags = flags; + aa->oa_lvb = lvb; + } else { + /* speculative locks are essentially to enqueue + * a DLM lock in advance, so we don't care + * about the result of the enqueue. */ + aa->oa_lvb = NULL; + aa->oa_flags = NULL; + } + + req->rq_interpret_reply = osc_enqueue_interpret; + ptlrpc_set_add_req(rqset, req); + } + RETURN(rc); + } + + rc = osc_enqueue_fini(req, upcall, cookie, &lockh, einfo->ei_mode, + flags, speculative, rc); + + RETURN(rc); +} + +int osc_match_base(const struct lu_env *env, struct obd_export *exp, + struct ldlm_res_id *res_id, enum ldlm_type type, + union ldlm_policy_data *policy, enum ldlm_mode mode, + __u64 *flags, struct osc_object *obj, + struct lustre_handle *lockh, enum ldlm_match_flags match_flags) +{ + struct obd_device *obd = exp->exp_obd; + __u64 lflags = *flags; + enum ldlm_mode rc; + ENTRY; + + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_MATCH)) + RETURN(-EIO); + + /* Filesystem lock extents are extended to page boundaries so that + * dealing with the page cache is a little smoother */ + policy->l_extent.start -= policy->l_extent.start & ~PAGE_MASK; + policy->l_extent.end |= ~PAGE_MASK; + + /* Next, search for already existing extent locks that will cover us */ + rc = ldlm_lock_match_with_skip(obd->obd_namespace, lflags, 0, + res_id, type, policy, mode, lockh, + match_flags); + if (rc == 0 || lflags & LDLM_FL_TEST_LOCK) + RETURN(rc); + + if (obj != NULL) { + struct ldlm_lock *lock = ldlm_handle2lock(lockh); + + LASSERT(lock != NULL); + if (osc_set_lock_data(lock, obj)) { + lock_res_and_lock(lock); + if (!ldlm_is_lvb_cached(lock)) { + LASSERT(lock->l_ast_data == obj); + osc_lock_lvb_update(env, obj, lock, NULL); + ldlm_set_lvb_cached(lock); + } + unlock_res_and_lock(lock); + } else { + ldlm_lock_decref(lockh, rc); + rc = 0; + } + LDLM_LOCK_PUT(lock); + } + RETURN(rc); +} + +static int osc_statfs_interpret(const struct lu_env *env, + struct ptlrpc_request *req, void *args, int rc) +{ + struct osc_async_args *aa = args; + struct obd_statfs *msfs; + + ENTRY; + if (rc == -EBADR) + /* + * The request has in fact never been sent due to issues at + * a higher level (LOV). Exit immediately since the caller + * is aware of the problem and takes care of the clean up. 
+ */
+ RETURN(rc);
+
+ if ((rc == -ENOTCONN || rc == -EAGAIN) &&
+ (aa->aa_oi->oi_flags & OBD_STATFS_NODELAY))
+ GOTO(out, rc = 0);
+
+ if (rc != 0)
+ GOTO(out, rc);
+
+ msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS);
+ if (msfs == NULL)
+ GOTO(out, rc = -EPROTO);
+
+ *aa->aa_oi->oi_osfs = *msfs;
+out:
+ rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc);
+
+ RETURN(rc);
+}
+
+static int osc_statfs_async(struct obd_export *exp,
+ struct obd_info *oinfo, time64_t max_age,
+ struct ptlrpc_request_set *rqset)
+{
+ struct obd_device *obd = class_exp2obd(exp);
+ struct ptlrpc_request *req;
+ struct osc_async_args *aa;
+ int rc;
+ ENTRY;
+
+ if (obd->obd_osfs_age >= max_age) {
+ CDEBUG(D_SUPER,
+ "%s: use %p cache blocks %llu/%llu objects %llu/%llu\n",
+ obd->obd_name, &obd->obd_osfs,
+ obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks,
+ obd->obd_osfs.os_ffree, obd->obd_osfs.os_files);
+ spin_lock(&obd->obd_osfs_lock);
+ memcpy(oinfo->oi_osfs, &obd->obd_osfs, sizeof(*oinfo->oi_osfs));
+ spin_unlock(&obd->obd_osfs_lock);
+ oinfo->oi_flags |= OBD_STATFS_FROM_CACHE;
+ if (oinfo->oi_cb_up)
+ oinfo->oi_cb_up(oinfo, 0);
+
+ RETURN(0);
+ }
+
+ /* We could possibly pass max_age in the request (as an absolute
+ * timestamp or a "seconds.usec ago") so the target can avoid doing
+ * extra calls into the filesystem if that isn't necessary (e.g.
+ * during mount that would help a bit). Having relative timestamps
+ * is not so great if request processing is slow, while absolute
+ * timestamps are not ideal because they need time synchronization. */
+ req = ptlrpc_request_alloc(obd->u.cli.cl_import, &RQF_OST_STATFS);
+ if (req == NULL)
+ RETURN(-ENOMEM);
+
+ rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS);
+ if (rc) {
+ ptlrpc_request_free(req);
+ RETURN(rc);
+ }
+ ptlrpc_request_set_replen(req);
+ req->rq_request_portal = OST_CREATE_PORTAL;
+ ptlrpc_at_set_req_timeout(req);
+
+ if (oinfo->oi_flags & OBD_STATFS_NODELAY) {
+ /* procfs requests should not wait for statfs to avoid
+ * deadlock */
+ req->rq_no_resend = 1;
+ req->rq_no_delay = 1;
+ }
+
+ req->rq_interpret_reply = osc_statfs_interpret;
+ aa = ptlrpc_req_async_args(aa, req);
+ aa->aa_oi = oinfo;
+
+ ptlrpc_set_add_req(rqset, req);
+ RETURN(0);
+}
+
+static int osc_statfs(const struct lu_env *env, struct obd_export *exp,
+ struct obd_statfs *osfs, time64_t max_age, __u32 flags)
+{
+ struct obd_device *obd = class_exp2obd(exp);
+ struct obd_statfs *msfs;
+ struct ptlrpc_request *req;
+ struct obd_import *imp, *imp0;
+ int rc;
+ ENTRY;
+
+ /* Since the request might also come from lprocfs, we need to sync
+ * this with client_disconnect_export(); see bug 15684
+ */
+ with_imp_locked(obd, imp0, rc)
+ imp = class_import_get(imp0);
+ if (rc)
+ RETURN(rc);
+
+ /* We could possibly pass max_age in the request (as an absolute
+ * timestamp or a "seconds.usec ago") so the target can avoid doing
+ * extra calls into the filesystem if that isn't necessary (e.g.
+ * during mount that would help a bit). Having relative timestamps
+ * is not so great if request processing is slow, while absolute
+ * timestamps are not ideal because they need time synchronization.
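+ * As an illustration of the current interface: a caller that can
+ * tolerate one-second-old statistics would pass
+ * max_age = ktime_get_seconds() - 1; osc_statfs_async() above
+ * compares exactly this value against obd_osfs_age to decide whether
+ * the cached obd_osfs can be returned without an RPC.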
+ */
+ req = ptlrpc_request_alloc(imp, &RQF_OST_STATFS);
+
+ class_import_put(imp);
+
+ if (req == NULL)
+ RETURN(-ENOMEM);
+
+ rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS);
+ if (rc) {
+ ptlrpc_request_free(req);
+ RETURN(rc);
+ }
+ ptlrpc_request_set_replen(req);
+ req->rq_request_portal = OST_CREATE_PORTAL;
+ ptlrpc_at_set_req_timeout(req);
+
+ if (flags & OBD_STATFS_NODELAY) {
+ /* procfs requests should not wait for statfs to avoid
+ * deadlock */
+ req->rq_no_resend = 1;
+ req->rq_no_delay = 1;
+ }
+
+ rc = ptlrpc_queue_wait(req);
+ if (rc)
+ GOTO(out, rc);
+
+ msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS);
+ if (msfs == NULL)
+ GOTO(out, rc = -EPROTO);
+
+ *osfs = *msfs;
+
+ EXIT;
+out:
+ ptlrpc_req_finished(req);
+ return rc;
+}
+
+static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
+ void *karg, void __user *uarg)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct obd_ioctl_data *data = karg;
+ int rc = 0;
+
+ ENTRY;
+ if (!try_module_get(THIS_MODULE)) {
+ CERROR("%s: cannot get module '%s'\n", obd->obd_name,
+ module_name(THIS_MODULE));
+ return -EINVAL;
+ }
+ switch (cmd) {
+ case OBD_IOC_CLIENT_RECOVER:
+ rc = ptlrpc_recover_import(obd->u.cli.cl_import,
+ data->ioc_inlbuf1, 0);
+ if (rc > 0)
+ rc = 0;
+ break;
+ case OBD_IOC_GETATTR:
+ rc = obd_getattr(NULL, exp, &data->ioc_obdo1);
+ break;
+ case IOC_OSC_SET_ACTIVE:
+ rc = ptlrpc_set_import_active(obd->u.cli.cl_import,
+ data->ioc_offset);
+ break;
+ default:
+ rc = -ENOTTY;
+ CDEBUG(D_INODE, "%s: unrecognised ioctl %#x by %s: rc = %d\n",
+ obd->obd_name, cmd, current->comm, rc);
+ break;
+ }
+
+ module_put(THIS_MODULE);
+ return rc;
+}
+
+int osc_set_info_async(const struct lu_env *env, struct obd_export *exp,
+ u32 keylen, void *key, u32 vallen, void *val,
+ struct ptlrpc_request_set *set)
+{
+ struct ptlrpc_request *req;
+ struct obd_device *obd = exp->exp_obd;
+ struct obd_import *imp = class_exp2cliimp(exp);
+ char *tmp;
+ int rc;
+ ENTRY;
+
+ OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_SHUTDOWN, 10);
+
+ if (KEY_IS(KEY_CHECKSUM)) {
+ if (vallen != sizeof(int))
+ RETURN(-EINVAL);
+ exp->exp_obd->u.cli.cl_checksum = (*(int *)val) ? 1 : 0;
+ RETURN(0);
+ }
+
+ if (KEY_IS(KEY_SPTLRPC_CONF)) {
+ sptlrpc_conf_client_adapt(obd);
+ RETURN(0);
+ }
+
+ if (KEY_IS(KEY_FLUSH_CTX)) {
+ sptlrpc_import_flush_my_ctx(imp);
+ RETURN(0);
+ }
+
+ if (KEY_IS(KEY_CACHE_LRU_SHRINK)) {
+ struct client_obd *cli = &obd->u.cli;
+ long nr = atomic_long_read(&cli->cl_lru_in_list) >> 1;
+ long target = *(long *)val;
+
+ nr = osc_lru_shrink(env, cli, min(nr, target), true);
+ *(long *)val -= nr;
+ RETURN(0);
+ }
+
+ if (!set && !KEY_IS(KEY_GRANT_SHRINK))
+ RETURN(-EINVAL);
+
+ /* We pass all other commands directly to OST. Since nobody calls osc
+ methods directly and everybody is supposed to go through LOV, we
+ assume LOV checked invalid values for us.
+ The only recognised values so far are evict_by_nid and mds_conn.
+ Even if something bad goes through, we'd get a -EINVAL from OST
+ anyway. */
+
+ req = ptlrpc_request_alloc(imp, KEY_IS(KEY_GRANT_SHRINK) ?
+ &RQF_OST_SET_GRANT_INFO :
+ &RQF_OBD_SET_INFO);
+ if (req == NULL)
+ RETURN(-ENOMEM);
+
+ req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY,
+ RCL_CLIENT, keylen);
+ if (!KEY_IS(KEY_GRANT_SHRINK))
+ req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL,
+ RCL_CLIENT, vallen);
+ rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SET_INFO);
+ if (rc) {
+ ptlrpc_request_free(req);
+ RETURN(rc);
+ }
+
+ tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY);
+ memcpy(tmp, key, keylen);
+ tmp = req_capsule_client_get(&req->rq_pill, KEY_IS(KEY_GRANT_SHRINK) ?
+ &RMF_OST_BODY :
+ &RMF_SETINFO_VAL);
+ memcpy(tmp, val, vallen);
+
+ if (KEY_IS(KEY_GRANT_SHRINK)) {
+ struct osc_grant_args *aa;
+ struct obdo *oa;
+
+ aa = ptlrpc_req_async_args(aa, req);
+ OBD_SLAB_ALLOC_PTR_GFP(oa, osc_obdo_kmem, GFP_NOFS);
+ if (!oa) {
+ ptlrpc_req_finished(req);
+ RETURN(-ENOMEM);
+ }
+ *oa = ((struct ost_body *)val)->oa;
+ aa->aa_oa = oa;
+ req->rq_interpret_reply = osc_shrink_grant_interpret;
+ }
+
+ ptlrpc_request_set_replen(req);
+ if (!KEY_IS(KEY_GRANT_SHRINK)) {
+ LASSERT(set != NULL);
+ ptlrpc_set_add_req(set, req);
+ ptlrpc_check_set(NULL, set);
+ } else {
+ ptlrpcd_add_req(req);
+ }
+
+ RETURN(0);
+}
+EXPORT_SYMBOL(osc_set_info_async);
+
+int osc_reconnect(const struct lu_env *env, struct obd_export *exp,
+ struct obd_device *obd, struct obd_uuid *cluuid,
+ struct obd_connect_data *data, void *localdata)
+{
+ struct client_obd *cli = &obd->u.cli;
+
+ if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) {
+ long lost_grant;
+ long grant;
+
+ spin_lock(&cli->cl_loi_list_lock);
+ grant = cli->cl_avail_grant + cli->cl_reserved_grant;
+ if (data->ocd_connect_flags & OBD_CONNECT_GRANT_PARAM) {
+ /* restore ocd_grant_blkbits as client page bits */
+ data->ocd_grant_blkbits = PAGE_SHIFT;
+ grant += cli->cl_dirty_grant;
+ } else {
+ grant += cli->cl_dirty_pages << PAGE_SHIFT;
+ }
+ data->ocd_grant = grant ? : 2 * cli_brw_size(obd);
+ lost_grant = cli->cl_lost_grant;
+ cli->cl_lost_grant = 0;
+ spin_unlock(&cli->cl_loi_list_lock);
+
+ CDEBUG(D_RPCTRACE, "ocd_connect_flags: %#llx ocd_version: %d"
+ " ocd_grant: %d, lost: %ld.\n", data->ocd_connect_flags,
+ data->ocd_version, data->ocd_grant, lost_grant);
+ }
+
+ RETURN(0);
+}
+EXPORT_SYMBOL(osc_reconnect);
+
+int osc_disconnect(struct obd_export *exp)
+{
+ struct obd_device *obd = class_exp2obd(exp);
+ int rc;
+
+ rc = client_disconnect_export(exp);
+ /**
+ * Initially we put del_shrink_grant before disconnect_export, but it
+ * causes the following problem if setup (connect) and cleanup
+ * (disconnect) are tangled together.
+ * connect p1 disconnect p2
+ * ptlrpc_connect_import
+ * ............... class_manual_cleanup
+ * osc_disconnect
+ * del_shrink_grant
+ * ptlrpc_connect_interrupt
+ * osc_init_grant
+ * add this client to shrink list
+ * cleanup_osc
+ * Bang! The grant shrink thread triggers the shrink. BUG18662
+ */
+ osc_del_grant_list(&obd->u.cli);
+ return rc;
+}
+EXPORT_SYMBOL(osc_disconnect);
+
+int osc_ldlm_resource_invalidate(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ struct hlist_node *hnode, void *arg)
+{
+ struct lu_env *env = arg;
+ struct ldlm_resource *res = cfs_hash_object(hs, hnode);
+ struct ldlm_lock *lock;
+ struct osc_object *osc = NULL;
+ ENTRY;
+
+ lock_res(res);
+ list_for_each_entry(lock, &res->lr_granted, l_res_link) {
+ if (lock->l_ast_data != NULL && osc == NULL) {
+ osc = lock->l_ast_data;
+ cl_object_get(osc2cl(osc));
+ }
+
+ /* clear LDLM_FL_CLEANED flag to make sure it will be canceled
+ * by the 2nd round of ldlm_namespace_clean() call in
+ * osc_import_event(). */
+ ldlm_clear_cleaned(lock);
+ }
+ unlock_res(res);
+
+ if (osc != NULL) {
+ osc_object_invalidate(env, osc);
+ cl_object_put(env, osc2cl(osc));
+ }
+
+ RETURN(0);
+}
+EXPORT_SYMBOL(osc_ldlm_resource_invalidate);
+
+static int osc_import_event(struct obd_device *obd,
+ struct obd_import *imp,
+ enum obd_import_event event)
+{
+ struct client_obd *cli;
+ int rc = 0;
+
+ ENTRY;
+ LASSERT(imp->imp_obd == obd);
+
+ switch (event) {
+ case IMP_EVENT_DISCON: {
+ cli = &obd->u.cli;
+ spin_lock(&cli->cl_loi_list_lock);
+ cli->cl_avail_grant = 0;
+ cli->cl_lost_grant = 0;
+ spin_unlock(&cli->cl_loi_list_lock);
+ break;
+ }
+ case IMP_EVENT_INACTIVE: {
+ rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE);
+ break;
+ }
+ case IMP_EVENT_INVALIDATE: {
+ struct ldlm_namespace *ns = obd->obd_namespace;
+ struct lu_env *env;
+ __u16 refcheck;
+
+ ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
+
+ env = cl_env_get(&refcheck);
+ if (!IS_ERR(env)) {
+ osc_io_unplug(env, &obd->u.cli, NULL);
+
+ cfs_hash_for_each_nolock(ns->ns_rs_hash,
+ osc_ldlm_resource_invalidate,
+ env, 0);
+ cl_env_put(env, &refcheck);
+
+ ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
+ } else
+ rc = PTR_ERR(env);
+ break;
+ }
+ case IMP_EVENT_ACTIVE: {
+ rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE);
+ break;
+ }
+ case IMP_EVENT_OCD: {
+ struct obd_connect_data *ocd = &imp->imp_connect_data;
+
+ if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT)
+ osc_init_grant(&obd->u.cli, ocd);
+
+ /* See bug 7198 */
+ if (ocd->ocd_connect_flags & OBD_CONNECT_REQPORTAL)
+ imp->imp_client->cli_request_portal = OST_REQUEST_PORTAL;
+
+ rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD);
+ break;
+ }
+ case IMP_EVENT_DEACTIVATE: {
+ rc = obd_notify_observer(obd, obd, OBD_NOTIFY_DEACTIVATE);
+ break;
+ }
+ case IMP_EVENT_ACTIVATE: {
+ rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVATE);
+ break;
+ }
+ default:
+ CERROR("Unknown import event %d\n", event);
+ LBUG();
+ }
+ RETURN(rc);
+}
+
+/**
+ * Determine whether the lock can be canceled before replaying the lock
+ * during recovery; see bug 16774 for detailed information.
+ *
+ * \retval zero the lock can't be canceled
+ * \retval other ok to cancel
+ */
+static int osc_cancel_weight(struct ldlm_lock *lock)
+{
+ /*
+ * Cancel all unused, granted extent locks.
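+ * For instance, a granted LDLM_EXTENT lock whose weight from
+ * osc_ldlm_weigh_ast() is 0 (typically one with no cached pages under
+ * it) is canceled here instead of being replayed.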
+ */
+ if (lock->l_resource->lr_type == LDLM_EXTENT &&
+ ldlm_is_granted(lock) &&
+ osc_ldlm_weigh_ast(lock) == 0)
+ RETURN(1);
+
+ RETURN(0);
+}
+
+static int brw_queue_work(const struct lu_env *env, void *data)
+{
+ struct client_obd *cli = data;
+
+ CDEBUG(D_CACHE, "Run writeback work for client obd %p.\n", cli);
+
+ osc_io_unplug(env, cli, NULL);
+ RETURN(0);
+}
+
+int osc_setup_common(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+ struct client_obd *cli = &obd->u.cli;
+ void *handler;
+ int rc;
+
+ ENTRY;
+
+ rc = ptlrpcd_addref();
+ if (rc)
+ RETURN(rc);
+
+ rc = client_obd_setup(obd, lcfg);
+ if (rc)
+ GOTO(out_ptlrpcd, rc);
+
+ handler = ptlrpcd_alloc_work(cli->cl_import, brw_queue_work, cli);
+ if (IS_ERR(handler))
+ GOTO(out_ptlrpcd_work, rc = PTR_ERR(handler));
+ cli->cl_writeback_work = handler;
+
+ handler = ptlrpcd_alloc_work(cli->cl_import, lru_queue_work, cli);
+ if (IS_ERR(handler))
+ GOTO(out_ptlrpcd_work, rc = PTR_ERR(handler));
+ cli->cl_lru_work = handler;
+
+ rc = osc_quota_setup(obd);
+ if (rc)
+ GOTO(out_ptlrpcd_work, rc);
+
+ cli->cl_grant_shrink_interval = GRANT_SHRINK_INTERVAL;
+ cli->cl_root_squash = 0;
+ osc_update_next_shrink(cli);
+
+ RETURN(rc);
+
+out_ptlrpcd_work:
+ if (cli->cl_writeback_work != NULL) {
+ ptlrpcd_destroy_work(cli->cl_writeback_work);
+ cli->cl_writeback_work = NULL;
+ }
+ if (cli->cl_lru_work != NULL) {
+ ptlrpcd_destroy_work(cli->cl_lru_work);
+ cli->cl_lru_work = NULL;
+ }
+ client_obd_cleanup(obd);
+out_ptlrpcd:
+ ptlrpcd_decref();
+ RETURN(rc);
+}
+EXPORT_SYMBOL(osc_setup_common);
+
+int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+ struct client_obd *cli = &obd->u.cli;
+ int adding;
+ int added;
+ int req_count;
+ int rc;
+
+ ENTRY;
+
+ rc = osc_setup_common(obd, lcfg);
+ if (rc < 0)
+ RETURN(rc);
+
+ rc = osc_tunables_init(obd);
+ if (rc)
+ RETURN(rc);
+
+ /*
+ * We try to control the total number of requests with an upper limit,
+ * osc_reqpool_maxreqcount. There might be some race which causes
+ * over-limit allocation, but it is fine.
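+ * As an illustration (numbers are hypothetical): with
+ * osc_reqpool_maxreqcount = 1000 and cl_max_rpcs_in_flight = 8, each
+ * new client tries to add 8 + 2 = 10 requests to the pool, and two
+ * racing clients may briefly push the total slightly past the limit.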
+ */ + req_count = atomic_read(&osc_pool_req_count); + if (req_count < osc_reqpool_maxreqcount) { + adding = cli->cl_max_rpcs_in_flight + 2; + if (req_count + adding > osc_reqpool_maxreqcount) + adding = osc_reqpool_maxreqcount - req_count; + + added = ptlrpc_add_rqs_to_pool(osc_rq_pool, adding); + atomic_add(added, &osc_pool_req_count); + } + + ns_register_cancel(obd->obd_namespace, osc_cancel_weight); + + spin_lock(&osc_shrink_lock); + list_add_tail(&cli->cl_shrink_list, &osc_shrink_list); + spin_unlock(&osc_shrink_lock); + cli->cl_import->imp_idle_timeout = osc_idle_timeout; + cli->cl_import->imp_idle_debug = D_HA; + + RETURN(0); +} + +int osc_precleanup_common(struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + ENTRY; + + /* LU-464 + * for echo client, export may be on zombie list, wait for + * zombie thread to cull it, because cli.cl_import will be + * cleared in client_disconnect_export(): + * class_export_destroy() -> obd_cleanup() -> + * echo_device_free() -> echo_client_cleanup() -> + * obd_disconnect() -> osc_disconnect() -> + * client_disconnect_export() + */ + obd_zombie_barrier(); + if (cli->cl_writeback_work) { + ptlrpcd_destroy_work(cli->cl_writeback_work); + cli->cl_writeback_work = NULL; + } + + if (cli->cl_lru_work) { + ptlrpcd_destroy_work(cli->cl_lru_work); + cli->cl_lru_work = NULL; + } + + obd_cleanup_client_import(obd); + RETURN(0); +} +EXPORT_SYMBOL(osc_precleanup_common); + +static int osc_precleanup(struct obd_device *obd) +{ + ENTRY; + + osc_precleanup_common(obd); + + ptlrpc_lprocfs_unregister_obd(obd); + RETURN(0); +} + +int osc_cleanup_common(struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + int rc; + + ENTRY; + + spin_lock(&osc_shrink_lock); + list_del(&cli->cl_shrink_list); + spin_unlock(&osc_shrink_lock); + + /* lru cleanup */ + if (cli->cl_cache != NULL) { + LASSERT(atomic_read(&cli->cl_cache->ccc_users) > 0); + spin_lock(&cli->cl_cache->ccc_lru_lock); + list_del_init(&cli->cl_lru_osc); + spin_unlock(&cli->cl_cache->ccc_lru_lock); + cli->cl_lru_left = NULL; + cl_cache_decref(cli->cl_cache); + cli->cl_cache = NULL; + } + + /* free memory of osc quota cache */ + osc_quota_cleanup(obd); + + rc = client_obd_cleanup(obd); + + ptlrpcd_decref(); + RETURN(rc); +} +EXPORT_SYMBOL(osc_cleanup_common); + +static const struct obd_ops osc_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = osc_setup, + .o_precleanup = osc_precleanup, + .o_cleanup = osc_cleanup_common, + .o_add_conn = client_import_add_conn, + .o_del_conn = client_import_del_conn, + .o_connect = client_connect_import, + .o_reconnect = osc_reconnect, + .o_disconnect = osc_disconnect, + .o_statfs = osc_statfs, + .o_statfs_async = osc_statfs_async, + .o_create = osc_create, + .o_destroy = osc_destroy, + .o_getattr = osc_getattr, + .o_setattr = osc_setattr, + .o_iocontrol = osc_iocontrol, + .o_set_info_async = osc_set_info_async, + .o_import_event = osc_import_event, + .o_quotactl = osc_quotactl, +}; + +LIST_HEAD(osc_shrink_list); +DEFINE_SPINLOCK(osc_shrink_lock); + +#ifdef HAVE_SHRINKER_COUNT +static struct shrinker osc_cache_shrinker = { + .count_objects = osc_cache_shrink_count, + .scan_objects = osc_cache_shrink_scan, + .seeks = DEFAULT_SEEKS, +}; +#else +static int osc_cache_shrink(struct shrinker *shrinker, + struct shrink_control *sc) +{ + (void)osc_cache_shrink_scan(shrinker, sc); + + return osc_cache_shrink_count(shrinker, sc); +} + +static struct shrinker osc_cache_shrinker = { + .shrink = osc_cache_shrink, + .seeks = DEFAULT_SEEKS, +}; +#endif + +static int 
__init osc_init(void) +{ + unsigned int reqpool_size; + unsigned int reqsize; + int rc; + ENTRY; + + /* print an address of _any_ initialized kernel symbol from this + * module, to allow debugging with gdb that doesn't support data + * symbols from modules.*/ + CDEBUG(D_INFO, "Lustre OSC module (%p).\n", &osc_caches); + + rc = lu_kmem_init(osc_caches); + if (rc) + RETURN(rc); + + rc = class_register_type(&osc_obd_ops, NULL, true, + LUSTRE_OSC_NAME, &osc_device_type); + if (rc) + GOTO(out_kmem, rc); + + rc = register_shrinker(&osc_cache_shrinker); + if (rc) + GOTO(out_type, rc); + + /* This is obviously too much memory, only prevent overflow here */ + if (osc_reqpool_mem_max >= 1 << 12 || osc_reqpool_mem_max == 0) + GOTO(out_shrinker, rc = -EINVAL); + + reqpool_size = osc_reqpool_mem_max << 20; + + reqsize = 1; + while (reqsize < OST_IO_MAXREQSIZE) + reqsize = reqsize << 1; + + /* + * We don't enlarge the request count in OSC pool according to + * cl_max_rpcs_in_flight. The allocation from the pool will only be + * tried after normal allocation failed. So a small OSC pool won't + * cause much performance degression in most of cases. + */ + osc_reqpool_maxreqcount = reqpool_size / reqsize; + + atomic_set(&osc_pool_req_count, 0); + osc_rq_pool = ptlrpc_init_rq_pool(0, OST_IO_MAXREQSIZE, + ptlrpc_add_rqs_to_pool); + + if (osc_rq_pool == NULL) + GOTO(out_shrinker, rc = -ENOMEM); + + rc = osc_start_grant_work(); + if (rc != 0) + GOTO(out_req_pool, rc); + + RETURN(rc); + +out_req_pool: + ptlrpc_free_rq_pool(osc_rq_pool); +out_shrinker: + unregister_shrinker(&osc_cache_shrinker); +out_type: + class_unregister_type(LUSTRE_OSC_NAME); +out_kmem: + lu_kmem_fini(osc_caches); + + RETURN(rc); +} + +static void __exit osc_exit(void) +{ + osc_stop_grant_work(); + unregister_shrinker(&osc_cache_shrinker); + class_unregister_type(LUSTRE_OSC_NAME); + lu_kmem_fini(osc_caches); + ptlrpc_free_rq_pool(osc_rq_pool); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre Object Storage Client (OSC)"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(osc_init); +module_exit(osc_exit); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/client.c b/drivers/staging/lustrefsx/lustre/ptlrpc/client.c new file mode 100644 index 0000000000000..efde01993f0ed --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/client.c @@ -0,0 +1,3712 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ */
+
+/** Implementation of client-side PortalRPC interfaces */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <linux/delay.h>
+#include <linux/random.h>
+
+#include <lnet/lib-lnet.h>
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_lib.h>
+#include <lustre_ha.h>
+#include <lustre_import.h>
+#include <lustre_req_layout.h>
+
+#include "ptlrpc_internal.h"
+
+static void ptlrpc_prep_bulk_page_pin(struct ptlrpc_bulk_desc *desc,
+ struct page *page, int pageoffset,
+ int len)
+{
+ __ptlrpc_prep_bulk_page(desc, page, pageoffset, len, 1);
+}
+
+static void ptlrpc_prep_bulk_page_nopin(struct ptlrpc_bulk_desc *desc,
+ struct page *page, int pageoffset,
+ int len)
+{
+ __ptlrpc_prep_bulk_page(desc, page, pageoffset, len, 0);
+}
+
+static void ptlrpc_release_bulk_page_pin(struct ptlrpc_bulk_desc *desc)
+{
+ int i;
+
+ for (i = 0; i < desc->bd_iov_count; i++)
+ put_page(desc->bd_vec[i].bv_page);
+}
+
+static int ptlrpc_prep_bulk_frag_pages(struct ptlrpc_bulk_desc *desc,
+ void *frag, int len)
+{
+ unsigned int offset = (unsigned long)frag & ~PAGE_MASK;
+
+ ENTRY;
+ while (len > 0) {
+ int page_len = min_t(unsigned int, PAGE_SIZE - offset,
+ len);
+ unsigned long vaddr = (unsigned long)frag;
+
+ ptlrpc_prep_bulk_page_nopin(desc,
+ lnet_kvaddr_to_page(vaddr),
+ offset, page_len);
+ offset = 0;
+ len -= page_len;
+ frag += page_len;
+ }
+
+ RETURN(desc->bd_nob);
+}
+
+const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kiov_pin_ops = {
+ .add_kiov_frag = ptlrpc_prep_bulk_page_pin,
+ .release_frags = ptlrpc_release_bulk_page_pin,
+};
+EXPORT_SYMBOL(ptlrpc_bulk_kiov_pin_ops);
+
+const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kiov_nopin_ops = {
+ .add_kiov_frag = ptlrpc_prep_bulk_page_nopin,
+ .release_frags = ptlrpc_release_bulk_noop,
+ .add_iov_frag = ptlrpc_prep_bulk_frag_pages,
+};
+EXPORT_SYMBOL(ptlrpc_bulk_kiov_nopin_ops);
+
+static int ptlrpc_send_new_req(struct ptlrpc_request *req);
+static int ptlrpcd_check_work(struct ptlrpc_request *req);
+static int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async);
+
+/**
+ * Initialize passed in client structure \a cl.
+ */
+void ptlrpc_init_client(int req_portal, int rep_portal, const char *name,
+ struct ptlrpc_client *cl)
+{
+ cl->cli_request_portal = req_portal;
+ cl->cli_reply_portal = rep_portal;
+ cl->cli_name = name;
+}
+EXPORT_SYMBOL(ptlrpc_init_client);
+
+/**
+ * Return PortalRPC connection for remote uuid \a uuid
+ */
+struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid,
+ lnet_nid_t nid4refnet)
+{
+ struct ptlrpc_connection *c;
+ lnet_nid_t self;
+ struct lnet_process_id peer;
+ int err;
+
+ /*
+ * ptlrpc_uuid_to_peer() initializes its 2nd parameter
+ * before accessing its values.
+ */
+ /* coverity[uninit_use_in_call] */
+ peer.nid = nid4refnet;
+ err = ptlrpc_uuid_to_peer(uuid, &peer, &self);
+ if (err != 0) {
+ CNETERR("cannot find peer %s!\n", uuid->uuid);
+ return NULL;
+ }
+
+ c = ptlrpc_connection_get(peer, self, uuid);
+ if (c) {
+ memcpy(c->c_remote_uuid.uuid,
+ uuid->uuid, sizeof(c->c_remote_uuid.uuid));
+ }
+
+ CDEBUG(D_INFO, "%s -> %p\n", uuid->uuid, c);
+
+ return c;
+}
+
+/**
+ * Allocate and initialize new bulk descriptor on the sender.
+ * Returns pointer to the descriptor or NULL on error.
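+ * The caller chooses \a nfrags and \a max_brw; requests for more than
+ * PTLRPC_BULK_OPS_COUNT brw RPCs, or more than LNET_MAX_IOV fragments
+ * per brw RPC, are rejected up front.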
+ */
+struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned int nfrags,
+ unsigned int max_brw,
+ enum ptlrpc_bulk_op_type type,
+ unsigned int portal,
+ const struct ptlrpc_bulk_frag_ops *ops)
+{
+ struct ptlrpc_bulk_desc *desc;
+ int i;
+
+ LASSERT(ops->add_kiov_frag != NULL);
+
+ if (max_brw > PTLRPC_BULK_OPS_COUNT)
+ RETURN(NULL);
+
+ if (nfrags > LNET_MAX_IOV * max_brw)
+ RETURN(NULL);
+
+ OBD_ALLOC_PTR(desc);
+ if (!desc)
+ return NULL;
+
+ OBD_ALLOC_LARGE(desc->bd_vec,
+ nfrags * sizeof(*desc->bd_vec));
+ if (!desc->bd_vec)
+ goto out;
+
+ spin_lock_init(&desc->bd_lock);
+ init_waitqueue_head(&desc->bd_waitq);
+ desc->bd_max_iov = nfrags;
+ desc->bd_iov_count = 0;
+ desc->bd_portal = portal;
+ desc->bd_type = type;
+ desc->bd_md_count = 0;
+ desc->bd_nob_last = LNET_MTU;
+ desc->bd_frag_ops = ops;
+ LASSERT(max_brw > 0);
+ desc->bd_md_max_brw = min(max_brw, PTLRPC_BULK_OPS_COUNT);
+ /*
+ * PTLRPC_BULK_OPS_COUNT is the compile-time transfer limit for this
+ * node. Negotiated ocd_brw_size will always be <= this number.
+ */
+ for (i = 0; i < PTLRPC_BULK_OPS_COUNT; i++)
+ LNetInvalidateMDHandle(&desc->bd_mds[i]);
+
+ return desc;
+out:
+ OBD_FREE_PTR(desc);
+ return NULL;
+}
+
+/**
+ * Prepare bulk descriptor for specified outgoing request \a req that
+ * can fit \a nfrags * pages. \a type is bulk type. \a portal is where
+ * the bulk is to be sent. Used on client-side.
+ * Returns pointer to newly allocated and initialized bulk descriptor or
+ * NULL on error.
+ */
+struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req,
+ unsigned int nfrags,
+ unsigned int max_brw,
+ unsigned int type,
+ unsigned int portal,
+ const struct ptlrpc_bulk_frag_ops
+ *ops)
+{
+ struct obd_import *imp = req->rq_import;
+ struct ptlrpc_bulk_desc *desc;
+
+ ENTRY;
+ LASSERT(ptlrpc_is_bulk_op_passive(type));
+
+ desc = ptlrpc_new_bulk(nfrags, max_brw, type, portal, ops);
+ if (!desc)
+ RETURN(NULL);
+
+ desc->bd_import = class_import_get(imp);
+ desc->bd_req = req;
+
+ desc->bd_cbid.cbid_fn = client_bulk_callback;
+ desc->bd_cbid.cbid_arg = desc;
+
+ /* This makes the request own the descriptor; it is freed when the
+ * request itself is freed */
+ req->rq_bulk = desc;
+
+ return desc;
+}
+EXPORT_SYMBOL(ptlrpc_prep_bulk_imp);
+
+void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
+ struct page *page, int pageoffset, int len,
+ int pin)
+{
+ struct bio_vec *kiov;
+
+ LASSERT(desc->bd_iov_count < desc->bd_max_iov);
+ LASSERT(page != NULL);
+ LASSERT(pageoffset >= 0);
+ LASSERT(len > 0);
+ LASSERT(pageoffset + len <= PAGE_SIZE);
+
+ kiov = &desc->bd_vec[desc->bd_iov_count];
+
+ if (((desc->bd_iov_count % LNET_MAX_IOV) == 0) ||
+ ((desc->bd_nob_last + len) > LNET_MTU)) {
+ desc->bd_mds_off[desc->bd_md_count] = desc->bd_iov_count;
+ desc->bd_md_count++;
+ desc->bd_nob_last = 0;
+ LASSERT(desc->bd_md_count <= PTLRPC_BULK_OPS_COUNT);
+ }
+
+ desc->bd_nob_last += len;
+ desc->bd_nob += len;
+
+ if (pin)
+ get_page(page);
+
+ kiov->bv_page = page;
+ kiov->bv_offset = pageoffset;
+ kiov->bv_len = len;
+
+ desc->bd_iov_count++;
+}
+EXPORT_SYMBOL(__ptlrpc_prep_bulk_page);
+
+void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc)
+{
+ ENTRY;
+
+ LASSERT(desc != NULL);
+ LASSERT(desc->bd_iov_count != LI_POISON); /* not freed already */
+ LASSERT(desc->bd_refs == 0); /* network hands off */
+ LASSERT((desc->bd_export != NULL) ^ (desc->bd_import != NULL));
+ LASSERT(desc->bd_frag_ops != NULL);
+
+ sptlrpc_enc_pool_put_pages(desc);
+
+ if (desc->bd_export)
+ class_export_put(desc->bd_export);
+ else
+ class_import_put(desc->bd_import);
+
+ if 
(desc->bd_frag_ops->release_frags != NULL) + desc->bd_frag_ops->release_frags(desc); + + OBD_FREE_LARGE(desc->bd_vec, + desc->bd_max_iov * sizeof(*desc->bd_vec)); + OBD_FREE_PTR(desc); + EXIT; +} +EXPORT_SYMBOL(ptlrpc_free_bulk); + +/** + * Set server timelimit for this req, i.e. how long are we willing to wait + * for reply before timing out this request. + */ +void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req) +{ + LASSERT(req->rq_import); + + if (AT_OFF) { + /* non-AT settings */ + /** + * \a imp_server_timeout means this is reverse import and + * we send (currently only) ASTs to the client and cannot afford + * to wait too long for the reply, otherwise the other client + * (because of which we are sending this request) would + * timeout waiting for us + */ + req->rq_timeout = req->rq_import->imp_server_timeout ? + obd_timeout / 2 : obd_timeout; + } else { + struct imp_at *at = &req->rq_import->imp_at; + timeout_t serv_est; + int idx; + + idx = import_at_get_index(req->rq_import, + req->rq_request_portal); + serv_est = at_get(&at->iat_service_estimate[idx]); + /* + * Currently a 32 bit value is sent over the + * wire for rq_timeout so please don't change this + * to time64_t. The work for LU-1158 will in time + * replace rq_timeout with a 64 bit nanosecond value + */ + req->rq_timeout = at_est2timeout(serv_est); + } + /* + * We could get even fancier here, using history to predict increased + * loading... + * + * Let the server know what this RPC timeout is by putting it in the + * reqmsg + */ + lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout); +} +EXPORT_SYMBOL(ptlrpc_at_set_req_timeout); + +/* Adjust max service estimate based on server value */ +static void ptlrpc_at_adj_service(struct ptlrpc_request *req, + timeout_t serv_est) +{ + int idx; + timeout_t oldse; + struct imp_at *at; + + LASSERT(req->rq_import); + at = &req->rq_import->imp_at; + + idx = import_at_get_index(req->rq_import, req->rq_request_portal); + /* + * max service estimates are tracked on the server side, + * so just keep minimal history here + */ + oldse = at_measured(&at->iat_service_estimate[idx], serv_est); + if (oldse != 0) + CDEBUG(D_ADAPTTO, + "The RPC service estimate for %s ptl %d has changed from %d to %d\n", + req->rq_import->imp_obd->obd_name, + req->rq_request_portal, + oldse, at_get(&at->iat_service_estimate[idx])); +} + +/* Expected network latency per remote node (secs) */ +int ptlrpc_at_get_net_latency(struct ptlrpc_request *req) +{ + return AT_OFF ? 0 : at_get(&req->rq_import->imp_at.iat_net_latency); +} + +/* Adjust expected network latency */ +void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req, + timeout_t service_timeout) +{ + time64_t now = ktime_get_real_seconds(); + struct imp_at *at; + timeout_t oldnl; + timeout_t nl; + + LASSERT(req->rq_import); + + if (service_timeout > now - req->rq_sent + 3) { + /* + * b=16408, however, this can also happen if early reply + * is lost and client RPC is expired and resent, early reply + * or reply of original RPC can still be fit in reply buffer + * of resent RPC, now client is measuring time from the + * resent time, but server sent back service time of original + * RPC. + */ + CDEBUG_LIMIT((lustre_msg_get_flags(req->rq_reqmsg) & + MSG_RESENT) ? 
D_ADAPTTO : D_WARNING, + "Reported service time %u > total measured time %lld\n", + service_timeout, now - req->rq_sent); + return; + } + + /* Network latency is total time less server processing time, + * st rounding + */ + nl = max_t(timeout_t, now - req->rq_sent - service_timeout, 0) + 1; + at = &req->rq_import->imp_at; + + oldnl = at_measured(&at->iat_net_latency, nl); + if (oldnl != 0) + CDEBUG(D_ADAPTTO, + "The network latency for %s (nid %s) has changed from %d to %d\n", + req->rq_import->imp_obd->obd_name, + obd_uuid2str(&req->rq_import->imp_connection->c_remote_uuid), + oldnl, at_get(&at->iat_net_latency)); +} + +static int unpack_reply(struct ptlrpc_request *req) +{ + int rc; + + if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) { + rc = ptlrpc_unpack_rep_msg(req, req->rq_replen); + if (rc) { + DEBUG_REQ(D_ERROR, req, "unpack_rep failed: rc = %d", + rc); + return -EPROTO; + } + } + + rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF); + if (rc) { + DEBUG_REQ(D_ERROR, req, "unpack ptlrpc body failed: rc = %d", + rc); + return -EPROTO; + } + return 0; +} + +/** + * Handle an early reply message, called with the rq_lock held. + * If anything goes wrong just ignore it - same as if it never happened + */ +static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req) +__must_hold(&req->rq_lock) +{ + struct ptlrpc_request *early_req; + timeout_t service_timeout; + time64_t olddl; + int rc; + + ENTRY; + req->rq_early = 0; + spin_unlock(&req->rq_lock); + + rc = sptlrpc_cli_unwrap_early_reply(req, &early_req); + if (rc) { + spin_lock(&req->rq_lock); + RETURN(rc); + } + + rc = unpack_reply(early_req); + if (rc != 0) { + sptlrpc_cli_finish_early_reply(early_req); + spin_lock(&req->rq_lock); + RETURN(rc); + } + + /* + * Use new timeout value just to adjust the local value for this + * request, don't include it into at_history. It is unclear yet why + * service time increased and should it be counted or skipped, e.g. + * that can be recovery case or some error or server, the real reply + * will add all new data if it is worth to add. + */ + req->rq_timeout = lustre_msg_get_timeout(early_req->rq_repmsg); + lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout); + + /* Network latency can be adjusted, it is pure network delays */ + service_timeout = lustre_msg_get_service_timeout(early_req->rq_repmsg); + ptlrpc_at_adj_net_latency(req, service_timeout); + + sptlrpc_cli_finish_early_reply(early_req); + + spin_lock(&req->rq_lock); + olddl = req->rq_deadline; + /* + * server assumes it now has rq_timeout from when the request + * arrived, so the client should give it at least that long. + * since we don't know the arrival time we'll use the original + * sent time + */ + req->rq_deadline = req->rq_sent + req->rq_timeout + + ptlrpc_at_get_net_latency(req); + + /* The below message is checked in replay-single.sh test_65{a,b} */ + /* The below message is checked in sanity-{gss,krb5} test_8 */ + DEBUG_REQ(D_ADAPTTO, req, + "Early reply #%d, new deadline in %llds (%llds)", + req->rq_early_count, + req->rq_deadline - ktime_get_real_seconds(), + req->rq_deadline - olddl); + + RETURN(rc); +} + +static struct kmem_cache *request_cache; + +int ptlrpc_request_cache_init(void) +{ + request_cache = kmem_cache_create("ptlrpc_cache", + sizeof(struct ptlrpc_request), + 0, SLAB_HWCACHE_ALIGN, NULL); + return request_cache ? 
0 : -ENOMEM; +} + +void ptlrpc_request_cache_fini(void) +{ + kmem_cache_destroy(request_cache); +} + +struct ptlrpc_request *ptlrpc_request_cache_alloc(gfp_t flags) +{ + struct ptlrpc_request *req; + + OBD_SLAB_ALLOC_PTR_GFP(req, request_cache, flags); + return req; +} + +void ptlrpc_request_cache_free(struct ptlrpc_request *req) +{ + OBD_SLAB_FREE_PTR(req, request_cache); +} + +/** + * Wind down request pool \a pool. + * Frees all requests from the pool too + */ +void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool) +{ + struct ptlrpc_request *req; + + LASSERT(pool != NULL); + + spin_lock(&pool->prp_lock); + while ((req = list_first_entry_or_null(&pool->prp_req_list, + struct ptlrpc_request, + rq_list))) { + list_del(&req->rq_list); + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf_len == pool->prp_rq_size); + OBD_FREE_LARGE(req->rq_reqbuf, pool->prp_rq_size); + ptlrpc_request_cache_free(req); + } + spin_unlock(&pool->prp_lock); + OBD_FREE(pool, sizeof(*pool)); +} +EXPORT_SYMBOL(ptlrpc_free_rq_pool); + +/** + * Allocates, initializes and adds \a num_rq requests to the pool \a pool + */ +int ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq) +{ + int i; + int size = 1; + + while (size < pool->prp_rq_size) + size <<= 1; + + LASSERTF(list_empty(&pool->prp_req_list) || + size == pool->prp_rq_size, + "Trying to change pool size with nonempty pool from %d to %d bytes\n", + pool->prp_rq_size, size); + + pool->prp_rq_size = size; + for (i = 0; i < num_rq; i++) { + struct ptlrpc_request *req; + struct lustre_msg *msg; + + req = ptlrpc_request_cache_alloc(GFP_NOFS); + if (!req) + return i; + OBD_ALLOC_LARGE(msg, size); + if (!msg) { + ptlrpc_request_cache_free(req); + return i; + } + req->rq_reqbuf = msg; + req->rq_reqbuf_len = size; + req->rq_pool = pool; + spin_lock(&pool->prp_lock); + list_add_tail(&req->rq_list, &pool->prp_req_list); + spin_unlock(&pool->prp_lock); + } + return num_rq; +} +EXPORT_SYMBOL(ptlrpc_add_rqs_to_pool); + +/** + * Create and initialize new request pool with given attributes: + * \a num_rq - initial number of requests to create for the pool + * \a msgsize - maximum message size possible for requests in thid pool + * \a populate_pool - function to be called when more requests need to be added + * to the pool + * Returns pointer to newly created pool or NULL on error. + */ +struct ptlrpc_request_pool * +ptlrpc_init_rq_pool(int num_rq, int msgsize, + int (*populate_pool)(struct ptlrpc_request_pool *, int)) +{ + struct ptlrpc_request_pool *pool; + + OBD_ALLOC_PTR(pool); + if (!pool) + return NULL; + + /* + * Request next power of two for the allocation, because internally + * kernel would do exactly this + */ + spin_lock_init(&pool->prp_lock); + INIT_LIST_HEAD(&pool->prp_req_list); + pool->prp_rq_size = msgsize + SPTLRPC_MAX_PAYLOAD; + pool->prp_populate = populate_pool; + + populate_pool(pool, num_rq); + + return pool; +} +EXPORT_SYMBOL(ptlrpc_init_rq_pool); + +/** + * Fetches one request from pool \a pool + */ +static struct ptlrpc_request * +ptlrpc_prep_req_from_pool(struct ptlrpc_request_pool *pool) +{ + struct ptlrpc_request *request; + struct lustre_msg *reqbuf; + + if (!pool) + return NULL; + + spin_lock(&pool->prp_lock); + + /* + * See if we have anything in a pool, and bail out if nothing, + * in writeout path, where this matters, this is safe to do, because + * nothing is lost in this case, and when some in-flight requests + * complete, this code will be called again. 
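+ * Callers must therefore treat a NULL result as transient; note that
+ * __ptlrpc_request_alloc() only falls back to the pool after a normal
+ * allocation has already failed.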
+ */ + if (unlikely(list_empty(&pool->prp_req_list))) { + spin_unlock(&pool->prp_lock); + return NULL; + } + + request = list_first_entry(&pool->prp_req_list, struct ptlrpc_request, + rq_list); + list_del_init(&request->rq_list); + spin_unlock(&pool->prp_lock); + + LASSERT(request->rq_reqbuf); + LASSERT(request->rq_pool); + + reqbuf = request->rq_reqbuf; + memset(request, 0, sizeof(*request)); + request->rq_reqbuf = reqbuf; + request->rq_reqbuf_len = pool->prp_rq_size; + request->rq_pool = pool; + + return request; +} + +/** + * Returns freed \a request to pool. + */ +static void __ptlrpc_free_req_to_pool(struct ptlrpc_request *request) +{ + struct ptlrpc_request_pool *pool = request->rq_pool; + + spin_lock(&pool->prp_lock); + LASSERT(list_empty(&request->rq_list)); + LASSERT(!request->rq_receiving_reply); + list_add_tail(&request->rq_list, &pool->prp_req_list); + spin_unlock(&pool->prp_lock); +} + +void ptlrpc_add_unreplied(struct ptlrpc_request *req) +{ + struct obd_import *imp = req->rq_import; + struct ptlrpc_request *iter; + + assert_spin_locked(&imp->imp_lock); + LASSERT(list_empty(&req->rq_unreplied_list)); + + /* unreplied list is sorted by xid in ascending order */ + list_for_each_entry_reverse(iter, &imp->imp_unreplied_list, + rq_unreplied_list) { + LASSERT(req->rq_xid != iter->rq_xid); + if (req->rq_xid < iter->rq_xid) + continue; + list_add(&req->rq_unreplied_list, &iter->rq_unreplied_list); + return; + } + list_add(&req->rq_unreplied_list, &imp->imp_unreplied_list); +} + +void ptlrpc_assign_next_xid_nolock(struct ptlrpc_request *req) +{ + req->rq_xid = ptlrpc_next_xid(); + ptlrpc_add_unreplied(req); +} + +static inline void ptlrpc_assign_next_xid(struct ptlrpc_request *req) +{ + spin_lock(&req->rq_import->imp_lock); + ptlrpc_assign_next_xid_nolock(req); + spin_unlock(&req->rq_import->imp_lock); +} + +static atomic64_t ptlrpc_last_xid; + +static void ptlrpc_reassign_next_xid(struct ptlrpc_request *req) +{ + spin_lock(&req->rq_import->imp_lock); + list_del_init(&req->rq_unreplied_list); + ptlrpc_assign_next_xid_nolock(req); + spin_unlock(&req->rq_import->imp_lock); + DEBUG_REQ(D_RPCTRACE, req, "reassign xid"); +} + +void ptlrpc_get_mod_rpc_slot(struct ptlrpc_request *req) +{ + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + __u32 opc; + __u16 tag; + + opc = lustre_msg_get_opc(req->rq_reqmsg); + tag = obd_get_mod_rpc_slot(cli, opc); + lustre_msg_set_tag(req->rq_reqmsg, tag); + ptlrpc_reassign_next_xid(req); +} +EXPORT_SYMBOL(ptlrpc_get_mod_rpc_slot); + +void ptlrpc_put_mod_rpc_slot(struct ptlrpc_request *req) +{ + __u16 tag = lustre_msg_get_tag(req->rq_reqmsg); + + if (tag != 0) { + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + + obd_put_mod_rpc_slot(cli, opc, tag); + } +} +EXPORT_SYMBOL(ptlrpc_put_mod_rpc_slot); + +int ptlrpc_request_bufs_pack(struct ptlrpc_request *request, + __u32 version, int opcode, char **bufs, + struct ptlrpc_cli_ctx *ctx) +{ + int count; + struct obd_import *imp; + __u32 *lengths; + int rc; + + ENTRY; + + count = req_capsule_filled_sizes(&request->rq_pill, RCL_CLIENT); + imp = request->rq_import; + lengths = request->rq_pill.rc_area[RCL_CLIENT]; + + if (ctx) { + request->rq_cli_ctx = sptlrpc_cli_ctx_get(ctx); + } else { + rc = sptlrpc_req_get_ctx(request); + if (rc) + GOTO(out_free, rc); + } + sptlrpc_req_set_flavor(request, opcode); + + rc = lustre_pack_request(request, imp->imp_msg_magic, count, + lengths, bufs); + if (rc) + GOTO(out_ctx, rc); + + 
lustre_msg_add_version(request->rq_reqmsg, version); + request->rq_send_state = LUSTRE_IMP_FULL; + request->rq_type = PTL_RPC_MSG_REQUEST; + + request->rq_req_cbid.cbid_fn = request_out_callback; + request->rq_req_cbid.cbid_arg = request; + + request->rq_reply_cbid.cbid_fn = reply_in_callback; + request->rq_reply_cbid.cbid_arg = request; + + request->rq_reply_deadline = 0; + request->rq_bulk_deadline = 0; + request->rq_req_deadline = 0; + request->rq_phase = RQ_PHASE_NEW; + request->rq_next_phase = RQ_PHASE_UNDEFINED; + + request->rq_request_portal = imp->imp_client->cli_request_portal; + request->rq_reply_portal = imp->imp_client->cli_reply_portal; + + ptlrpc_at_set_req_timeout(request); + + lustre_msg_set_opc(request->rq_reqmsg, opcode); + + /* Let's setup deadline for req/reply/bulk unlink for opcode. */ + if (cfs_fail_val == opcode) { + time64_t *fail_t = NULL, *fail2_t = NULL; + + if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) { + fail_t = &request->rq_bulk_deadline; + } else if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) { + fail_t = &request->rq_reply_deadline; + } else if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REQ_UNLINK)) { + fail_t = &request->rq_req_deadline; + } else if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BOTH_UNLINK)) { + fail_t = &request->rq_reply_deadline; + fail2_t = &request->rq_bulk_deadline; + } else if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_ROUND_XID)) { + time64_t now = ktime_get_real_seconds(); + u64 xid = ((u64)now >> 4) << 24; + + atomic64_set(&ptlrpc_last_xid, xid); + } + + if (fail_t) { + *fail_t = ktime_get_real_seconds() + + PTLRPC_REQ_LONG_UNLINK; + + if (fail2_t) + *fail2_t = ktime_get_real_seconds() + + PTLRPC_REQ_LONG_UNLINK; + + /* + * The RPC is infected, let the test to change the + * fail_loc + */ + msleep(4 * MSEC_PER_SEC); + } + } + ptlrpc_assign_next_xid(request); + + RETURN(0); + +out_ctx: + LASSERT(!request->rq_pool); + sptlrpc_cli_ctx_put(request->rq_cli_ctx, 1); +out_free: + atomic_dec(&imp->imp_reqs); + class_import_put(imp); + + return rc; +} +EXPORT_SYMBOL(ptlrpc_request_bufs_pack); + +/** + * Pack request buffers for network transfer, performing necessary encryption + * steps if necessary. + */ +int ptlrpc_request_pack(struct ptlrpc_request *request, + __u32 version, int opcode) +{ + return ptlrpc_request_bufs_pack(request, version, opcode, NULL, NULL); +} +EXPORT_SYMBOL(ptlrpc_request_pack); + +/** + * Helper function to allocate new request on import \a imp + * and possibly using existing request from pool \a pool if provided. + * Returns allocated request structure with import field filled or + * NULL on error. + */ +static inline +struct ptlrpc_request *__ptlrpc_request_alloc(struct obd_import *imp, + struct ptlrpc_request_pool *pool) +{ + struct ptlrpc_request *request = NULL; + + request = ptlrpc_request_cache_alloc(GFP_NOFS); + + if (!request && pool) + request = ptlrpc_prep_req_from_pool(pool); + + if (request) { + ptlrpc_cli_req_init(request); + + LASSERTF((unsigned long)imp > 0x1000, "%p\n", imp); + LASSERT(imp != LP_POISON); + LASSERTF((unsigned long)imp->imp_client > 0x1000, "%p\n", + imp->imp_client); + LASSERT(imp->imp_client != LP_POISON); + + request->rq_import = class_import_get(imp); + atomic_inc(&imp->imp_reqs); + } else { + CERROR("request allocation out of memory\n"); + } + + return request; +} + +static int ptlrpc_reconnect_if_idle(struct obd_import *imp) +{ + int rc; + + /* + * initiate connection if needed when the import has been + * referenced by the new request to avoid races with disconnect. 
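+ * An import that has been idle long enough is parked in
+ * LUSTRE_IMP_IDLE and must be pushed back through the connect state
+ * machine before a new request can be sent.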
+ * serialize this check against conditional state=IDLE
+ * in ptlrpc_disconnect_idle_interpret()
+ */
+ spin_lock(&imp->imp_lock);
+ if (imp->imp_state == LUSTRE_IMP_IDLE) {
+ imp->imp_generation++;
+ imp->imp_initiated_at = imp->imp_generation;
+ imp->imp_state = LUSTRE_IMP_NEW;
+
+ /* connect_import_locked releases imp_lock */
+ rc = ptlrpc_connect_import_locked(imp);
+ if (rc)
+ return rc;
+ ptlrpc_pinger_add_import(imp);
+ } else {
+ spin_unlock(&imp->imp_lock);
+ }
+ return 0;
+}
+
+/**
+ * Helper function for creating a request.
+ * Calls __ptlrpc_request_alloc to allocate a new request structure and
+ * initializes its buffer structures according to capsule template \a format.
+ * Returns allocated request structure pointer or NULL on error.
+ */
+static struct ptlrpc_request *
+ptlrpc_request_alloc_internal(struct obd_import *imp,
+ struct ptlrpc_request_pool *pool,
+ const struct req_format *format)
+{
+ struct ptlrpc_request *request;
+
+ request = __ptlrpc_request_alloc(imp, pool);
+ if (!request)
+ return NULL;
+
+ /* don't make an expensive check for an idling connection
+ * if it's already connected */
+ if (unlikely(imp->imp_state != LUSTRE_IMP_FULL)) {
+ if (ptlrpc_reconnect_if_idle(imp) < 0) {
+ atomic_dec(&imp->imp_reqs);
+ ptlrpc_request_free(request);
+ return NULL;
+ }
+ }
+
+ req_capsule_init(&request->rq_pill, request, RCL_CLIENT);
+ req_capsule_set(&request->rq_pill, format);
+ return request;
+}
+
+/**
+ * Allocate new request structure for import \a imp and initialize its
+ * buffer structure according to capsule template \a format.
+ */
+struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp,
+ const struct req_format *format)
+{
+ return ptlrpc_request_alloc_internal(imp, NULL, format);
+}
+EXPORT_SYMBOL(ptlrpc_request_alloc);
+
+/**
+ * Allocate new request structure for import \a imp from pool \a pool and
+ * initialize its buffer structure according to capsule template \a format.
+ */
+struct ptlrpc_request *
+ptlrpc_request_alloc_pool(struct obd_import *imp,
+ struct ptlrpc_request_pool *pool,
+ const struct req_format *format)
+{
+ return ptlrpc_request_alloc_internal(imp, pool, format);
+}
+EXPORT_SYMBOL(ptlrpc_request_alloc_pool);
+
+/**
+ * For requests not from pool, free memory of the request structure.
+ * For requests obtained from a pool earlier, return request back to pool.
+ */
+void ptlrpc_request_free(struct ptlrpc_request *request)
+{
+ if (request->rq_pool)
+ __ptlrpc_free_req_to_pool(request);
+ else
+ ptlrpc_request_cache_free(request);
+}
+EXPORT_SYMBOL(ptlrpc_request_free);
+
+/**
+ * Allocate a new request for operation \a opcode and immediately pack it
+ * for network transfer.
+ * Only used for simple requests like OBD_PING where the only important
+ * part of the request is the operation itself.
+ * Returns allocated request or NULL on error.
+ */
+struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp,
+ const struct req_format *format,
+ __u32 version, int opcode)
+{
+ struct ptlrpc_request *req = ptlrpc_request_alloc(imp, format);
+ int rc;
+
+ if (req) {
+ rc = ptlrpc_request_pack(req, version, opcode);
+ if (rc) {
+ ptlrpc_request_free(req);
+ req = NULL;
+ }
+ }
+ return req;
+}
+EXPORT_SYMBOL(ptlrpc_request_alloc_pack);
+
+/**
+ * Allocate and initialize new request set structure on the current CPT.
+ * Returns a pointer to the newly allocated set structure or NULL on error.
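+ *
+ * A minimal caller might look like this (sketch only; it assumes an
+ * already packed request \a req and an environment \a env):
+ *
+ *   set = ptlrpc_prep_set();
+ *   if (set == NULL)
+ *     return -ENOMEM;
+ *   ptlrpc_set_add_req(set, req);
+ *   rc = ptlrpc_set_wait(env, set);
+ *   ptlrpc_set_destroy(set);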
+ */ +struct ptlrpc_request_set *ptlrpc_prep_set(void) +{ + struct ptlrpc_request_set *set; + int cpt; + + ENTRY; + cpt = cfs_cpt_current(cfs_cpt_tab, 0); + OBD_CPT_ALLOC(set, cfs_cpt_tab, cpt, sizeof(*set)); + if (!set) + RETURN(NULL); + atomic_set(&set->set_refcount, 1); + INIT_LIST_HEAD(&set->set_requests); + init_waitqueue_head(&set->set_waitq); + atomic_set(&set->set_new_count, 0); + atomic_set(&set->set_remaining, 0); + spin_lock_init(&set->set_new_req_lock); + INIT_LIST_HEAD(&set->set_new_requests); + set->set_max_inflight = UINT_MAX; + set->set_producer = NULL; + set->set_producer_arg = NULL; + set->set_rc = 0; + + RETURN(set); +} +EXPORT_SYMBOL(ptlrpc_prep_set); + +/** + * Allocate and initialize new request set structure with flow control + * extension. This extension allows to control the number of requests in-flight + * for the whole set. A callback function to generate requests must be provided + * and the request set will keep the number of requests sent over the wire to + * @max_inflight. + * Returns a pointer to the newly allocated set structure or NULL on error. + */ +struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func, + void *arg) + +{ + struct ptlrpc_request_set *set; + + set = ptlrpc_prep_set(); + if (!set) + RETURN(NULL); + + set->set_max_inflight = max; + set->set_producer = func; + set->set_producer_arg = arg; + + RETURN(set); +} + +/** + * Wind down and free request set structure previously allocated with + * ptlrpc_prep_set. + * Ensures that all requests on the set have completed and removes + * all requests from the request list in a set. + * If any unsent request happen to be on the list, pretends that they got + * an error in flight and calls their completion handler. + */ +void ptlrpc_set_destroy(struct ptlrpc_request_set *set) +{ + struct ptlrpc_request *req; + int expected_phase; + int n = 0; + + ENTRY; + + /* Requests on the set should either all be completed, or all be new */ + expected_phase = (atomic_read(&set->set_remaining) == 0) ? + RQ_PHASE_COMPLETE : RQ_PHASE_NEW; + list_for_each_entry(req, &set->set_requests, rq_set_chain) { + LASSERT(req->rq_phase == expected_phase); + n++; + } + + LASSERTF(atomic_read(&set->set_remaining) == 0 || + atomic_read(&set->set_remaining) == n, "%d / %d\n", + atomic_read(&set->set_remaining), n); + + while ((req = list_first_entry_or_null(&set->set_requests, + struct ptlrpc_request, + rq_set_chain))) { + list_del_init(&req->rq_set_chain); + + LASSERT(req->rq_phase == expected_phase); + + if (req->rq_phase == RQ_PHASE_NEW) { + ptlrpc_req_interpret(NULL, req, -EBADR); + atomic_dec(&set->set_remaining); + } + + spin_lock(&req->rq_lock); + req->rq_set = NULL; + req->rq_invalid_rqset = 0; + spin_unlock(&req->rq_lock); + + ptlrpc_req_finished(req); + } + + LASSERT(atomic_read(&set->set_remaining) == 0); + + ptlrpc_reqset_put(set); + EXIT; +} +EXPORT_SYMBOL(ptlrpc_set_destroy); + +/** + * Add a new request to the general purpose request set. + * Assumes request reference from the caller. 
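+ * Passing the special PTLRPCD_SET marker hands the request to the
+ * ptlrpcd daemon's set instead of a caller-owned one.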
+ */ +void ptlrpc_set_add_req(struct ptlrpc_request_set *set, + struct ptlrpc_request *req) +{ + if (set == PTLRPCD_SET) { + ptlrpcd_add_req(req); + return; + } + + LASSERT(req->rq_import->imp_state != LUSTRE_IMP_IDLE); + LASSERT(list_empty(&req->rq_set_chain)); + + if (req->rq_allow_intr) + set->set_allow_intr = 1; + + /* The set takes over the caller's request reference */ + list_add_tail(&req->rq_set_chain, &set->set_requests); + req->rq_set = set; + atomic_inc(&set->set_remaining); + req->rq_queued_time = ktime_get_seconds(); + + if (req->rq_reqmsg) + lustre_msg_set_jobid(req->rq_reqmsg, NULL); + + if (set->set_producer) + /* + * If the request set has a producer callback, the RPC must be + * sent straight away + */ + ptlrpc_send_new_req(req); +} +EXPORT_SYMBOL(ptlrpc_set_add_req); + +/** + * Add a request to a request with dedicated server thread + * and wake the thread to make any necessary processing. + * Currently only used for ptlrpcd. + */ +void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc, + struct ptlrpc_request *req) +{ + struct ptlrpc_request_set *set = pc->pc_set; + int count, i; + + LASSERT(req->rq_set == NULL); + LASSERT(test_bit(LIOD_STOP, &pc->pc_flags) == 0); + + spin_lock(&set->set_new_req_lock); + /* + * The set takes over the caller's request reference. + */ + req->rq_set = set; + req->rq_queued_time = ktime_get_seconds(); + list_add_tail(&req->rq_set_chain, &set->set_new_requests); + count = atomic_inc_return(&set->set_new_count); + spin_unlock(&set->set_new_req_lock); + + /* Only need to call wakeup once for the first entry. */ + if (count == 1) { + wake_up(&set->set_waitq); + + /* + * XXX: It maybe unnecessary to wakeup all the partners. But to + * guarantee the async RPC can be processed ASAP, we have + * no other better choice. It maybe fixed in future. + */ + for (i = 0; i < pc->pc_npartners; i++) + wake_up(&pc->pc_partners[i]->pc_set->set_waitq); + } +} + +/** + * Based on the current state of the import, determine if the request + * can be sent, is an error, or should be delayed. + * + * Returns true if this request should be delayed. If false, and + * *status is set, then the request can not be sent and *status is the + * error code. If false and status is 0, then request can be sent. + * + * The imp->imp_lock must be held. + */ +static int ptlrpc_import_delay_req(struct obd_import *imp, + struct ptlrpc_request *req, int *status) +{ + int delay = 0; + + ENTRY; + LASSERT(status); + *status = 0; + + if (req->rq_ctx_init || req->rq_ctx_fini) { + /* always allow ctx init/fini rpc go through */ + } else if (imp->imp_state == LUSTRE_IMP_NEW) { + DEBUG_REQ(D_ERROR, req, "Uninitialized import"); + *status = -EIO; + } else if (imp->imp_state == LUSTRE_IMP_CLOSED) { + unsigned int opc = lustre_msg_get_opc(req->rq_reqmsg); + + /* + * pings or MDS-equivalent STATFS may safely + * race with umount + */ + DEBUG_REQ((opc == OBD_PING || opc == OST_STATFS) ? 
+ D_HA : D_ERROR, req, "IMP_CLOSED"); + *status = -EIO; + } else if (ptlrpc_send_limit_expired(req)) { + /* probably doesn't need to be a D_ERROR afterinitial testing */ + DEBUG_REQ(D_HA, req, "send limit expired"); + *status = -ETIMEDOUT; + } else if (req->rq_send_state == LUSTRE_IMP_CONNECTING && + imp->imp_state == LUSTRE_IMP_CONNECTING) { + ;/* allow CONNECT even if import is invalid */ + if (atomic_read(&imp->imp_inval_count) != 0) { + DEBUG_REQ(D_ERROR, req, "invalidate in flight"); + *status = -EIO; + } + } else if (imp->imp_invalid || imp->imp_obd->obd_no_recov) { + if (!imp->imp_deactive) + DEBUG_REQ(D_NET, req, "IMP_INVALID"); + *status = -ESHUTDOWN; /* b=12940 */ + } else if (req->rq_import_generation != imp->imp_generation) { + DEBUG_REQ(D_ERROR, req, "req wrong generation:"); + *status = -EIO; + } else if (req->rq_send_state != imp->imp_state) { + /* invalidate in progress - any requests should be drop */ + if (atomic_read(&imp->imp_inval_count) != 0) { + DEBUG_REQ(D_ERROR, req, "invalidate in flight"); + *status = -EIO; + } else if (req->rq_no_delay && + imp->imp_generation != imp->imp_initiated_at) { + /* ignore nodelay for requests initiating connections */ + *status = -EAGAIN; + } else if (req->rq_allow_replay && + (imp->imp_state == LUSTRE_IMP_REPLAY || + imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS || + imp->imp_state == LUSTRE_IMP_REPLAY_WAIT || + imp->imp_state == LUSTRE_IMP_RECOVER)) { + DEBUG_REQ(D_HA, req, "allow during recovery"); + } else { + delay = 1; + } + } + + RETURN(delay); +} + +/** + * Decide if the error message should be printed to the console or not. + * Makes its decision based on request type, status, and failure frequency. + * + * \param[in] req request that failed and may need a console message + * + * \retval false if no message should be printed + * \retval true if console message should be printed + */ +static bool ptlrpc_console_allow(struct ptlrpc_request *req, __u32 opc, int err) +{ + LASSERT(req->rq_reqmsg != NULL); + + /* Suppress particular reconnect errors which are to be expected. */ + if (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT) { + /* Suppress timed out reconnect requests */ + if (lustre_handle_is_used(&req->rq_import->imp_remote_handle) || + req->rq_timedout) + return false; + + /* + * Suppress most unavailable/again reconnect requests, but + * print occasionally so it is clear client is trying to + * connect to a server where no target is running. + */ + if ((err == -ENODEV || err == -EAGAIN) && + req->rq_import->imp_conn_cnt % 30 != 20) + return false; + } + + if (opc == LDLM_ENQUEUE && err == -EAGAIN) + /* -EAGAIN is normal when using POSIX flocks */ + return false; + + if (opc == OBD_PING && (err == -ENODEV || err == -ENOTCONN) && + (req->rq_xid & 0xf) != 10) + /* Suppress most ping requests, they may fail occasionally */ + return false; + + return true; +} + +/** + * Check request processing status. + * Returns the status. + */ +static int ptlrpc_check_status(struct ptlrpc_request *req) +{ + int rc; + + ENTRY; + rc = lustre_msg_get_status(req->rq_repmsg); + if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) { + struct obd_import *imp = req->rq_import; + struct lnet_nid *nid = &imp->imp_connection->c_peer.nid; + __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + + if (ptlrpc_console_allow(req, opc, rc)) + LCONSOLE_ERROR_MSG(0x11, + "%s: operation %s to node %s failed: rc = %d\n", + imp->imp_obd->obd_name, + ll_opcode2str(opc), + libcfs_nidstr(nid), rc); + RETURN(rc < 0 ? 
rc : -EINVAL);
+ }
+
+ if (rc)
+ DEBUG_REQ(D_INFO, req, "check status: rc = %d", rc);
+
+ RETURN(rc);
+}
+
+/**
+ * Save pre-versions of objects into request for replay.
+ * Versions are obtained from the server reply.
+ * Used for VBR (version based recovery).
+ */
+static void ptlrpc_save_versions(struct ptlrpc_request *req)
+{
+ struct lustre_msg *repmsg = req->rq_repmsg;
+ struct lustre_msg *reqmsg = req->rq_reqmsg;
+ __u64 *versions = lustre_msg_get_versions(repmsg);
+
+ ENTRY;
+ if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)
+ return;
+
+ LASSERT(versions);
+ lustre_msg_set_versions(reqmsg, versions);
+ CDEBUG(D_INFO, "Client save versions [%#llx/%#llx]\n",
+ versions[0], versions[1]);
+
+ EXIT;
+}
+
+__u64 ptlrpc_known_replied_xid(struct obd_import *imp)
+{
+ struct ptlrpc_request *req;
+
+ assert_spin_locked(&imp->imp_lock);
+ if (list_empty(&imp->imp_unreplied_list))
+ return 0;
+
+ req = list_first_entry(&imp->imp_unreplied_list, struct ptlrpc_request,
+ rq_unreplied_list);
+ LASSERTF(req->rq_xid >= 1, "XID:%llu\n", req->rq_xid);
+
+ if (imp->imp_known_replied_xid < req->rq_xid - 1)
+ imp->imp_known_replied_xid = req->rq_xid - 1;
+
+ return req->rq_xid - 1;
+}
+
+/**
+ * Callback function called when client receives RPC reply for \a req.
+ * Returns 0 on success or error code.
+ * The return value will be assigned to req->rq_status by the caller
+ * as the request processing status.
+ * This function also decides if the request needs to be saved for later replay.
+ */
+static int after_reply(struct ptlrpc_request *req)
+{
+ struct obd_import *imp = req->rq_import;
+ struct obd_device *obd = req->rq_import->imp_obd;
+ ktime_t work_start;
+ u64 committed;
+ s64 timediff;
+ int rc;
+
+ ENTRY;
+ LASSERT(obd != NULL);
+ /* repbuf must be unlinked */
+ LASSERT(!req->rq_receiving_reply && req->rq_reply_unlinked);
+
+ if (req->rq_reply_truncated) {
+ if (ptlrpc_no_resend(req)) {
+ DEBUG_REQ(D_ERROR, req,
+ "reply buffer overflow, expected=%d, actual size=%d",
+ req->rq_nob_received, req->rq_repbuf_len);
+ RETURN(-EOVERFLOW);
+ }
+
+ sptlrpc_cli_free_repbuf(req);
+ /*
+ * Pass the required reply buffer size (include
+ * space for early reply).
+ * NB: no need to round up because alloc_repbuf
+ * will round it up
+ */
+ req->rq_replen = req->rq_nob_received;
+ req->rq_nob_received = 0;
+ spin_lock(&req->rq_lock);
+ req->rq_resend = 1;
+ spin_unlock(&req->rq_lock);
+ RETURN(0);
+ }
+
+ work_start = ktime_get_real();
+ timediff = ktime_us_delta(work_start, req->rq_sent_ns);
+
+ /*
+ * NB Until this point, the whole of the incoming message,
+ * including buflens, status etc is in the sender's byte order.
+ */
+ rc = sptlrpc_cli_unwrap_reply(req);
+ if (rc) {
+ DEBUG_REQ(D_ERROR, req, "unwrap reply failed: rc = %d", rc);
+ RETURN(rc);
+ }
+
+ /*
+ * Security layer unwrap might ask to resend this request.
+ */
+ if (req->rq_resend)
+ RETURN(0);
+
+ rc = unpack_reply(req);
+ if (rc)
+ RETURN(rc);
+
+ /* retry indefinitely on EINPROGRESS */
+ if (lustre_msg_get_status(req->rq_repmsg) == -EINPROGRESS &&
+ ptlrpc_no_resend(req) == 0 && !req->rq_no_retry_einprogress) {
+ time64_t now = ktime_get_real_seconds();
+
+ DEBUG_REQ((req->rq_nr_resend % 8 == 1 ? D_WARNING : 0) |
+ D_RPCTRACE, req, "resending request on EINPROGRESS");
+ spin_lock(&req->rq_lock);
+ req->rq_resend = 1;
+ spin_unlock(&req->rq_lock);
+ req->rq_nr_resend++;
+
+ /* Readjust the timeout for current conditions */
+ ptlrpc_at_set_req_timeout(req);
+ /*
+ * delay resend to give a chance to the server to get ready.
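+ * (-EINPROGRESS is the server's way of saying it cannot process the
+ * request yet but expects to be able to soon, so failing it outright
+ * would be wrong.)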
+ * The delay is increased by 1s on every resend and is capped to + * the current request timeout (i.e. obd_timeout if AT is off, + * or AT service time x 125% + 5s, see at_est2timeout) + */ + if (req->rq_nr_resend > req->rq_timeout) + req->rq_sent = now + req->rq_timeout; + else + req->rq_sent = now + req->rq_nr_resend; + + /* Resend for EINPROGRESS will use a new XID */ + spin_lock(&imp->imp_lock); + list_del_init(&req->rq_unreplied_list); + spin_unlock(&imp->imp_lock); + + RETURN(0); + } + + if (obd->obd_svc_stats) { + lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR, + timediff); + ptlrpc_lprocfs_rpc_sent(req, timediff); + } + + if (lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_REPLY && + lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_ERR) { + DEBUG_REQ(D_ERROR, req, "invalid packet received (type=%u)", + lustre_msg_get_type(req->rq_repmsg)); + RETURN(-EPROTO); + } + + if (lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING) + CFS_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, cfs_fail_val); + ptlrpc_at_adj_service(req, lustre_msg_get_timeout(req->rq_repmsg)); + ptlrpc_at_adj_net_latency(req, + lustre_msg_get_service_timeout(req->rq_repmsg)); + + rc = ptlrpc_check_status(req); + + if (rc) { + /* + * Either we've been evicted, or the server has failed for + * some reason. Try to reconnect, and if that fails, punt to + * the upcall. + */ + if (ptlrpc_recoverable_error(rc)) { + if (req->rq_send_state != LUSTRE_IMP_FULL || + imp->imp_obd->obd_no_recov || imp->imp_dlm_fake) { + RETURN(rc); + } + ptlrpc_request_handle_notconn(req); + RETURN(rc); + } + } else { + /* + * Let's look if server sent slv. Do it only for RPC with + * rc == 0. + */ + ldlm_cli_update_pool(req); + } + + /* + * Store transno in reqmsg for replay. + */ + if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) { + req->rq_transno = lustre_msg_get_transno(req->rq_repmsg); + lustre_msg_set_transno(req->rq_reqmsg, req->rq_transno); + } + + if (imp->imp_replayable) { + /* if other threads are waiting for ptlrpc_free_committed() + * they could continue the work of freeing RPCs. That reduces + * lock hold times, and distributes work more fairly across + * waiting threads. We can't use spin_is_contended() since + * there are many other places where imp_lock is held. + */ + atomic_inc(&imp->imp_waiting); + spin_lock(&imp->imp_lock); + atomic_dec(&imp->imp_waiting); + /* + * No point in adding already-committed requests to the replay + * list, we will just remove them immediately. b=9829 + */ + if (req->rq_transno != 0 && + (req->rq_transno > + lustre_msg_get_last_committed(req->rq_repmsg) || + req->rq_replay)) { + /** version recovery */ + ptlrpc_save_versions(req); + ptlrpc_retain_replayable_request(req, imp); + } else if (req->rq_commit_cb && + list_empty(&req->rq_replay_list)) { + /* + * NB: don't call rq_commit_cb if it's already on + * rq_replay_list, ptlrpc_free_committed() will call + * it later, see LU-3618 for details + */ + spin_unlock(&imp->imp_lock); + req->rq_commit_cb(req); + atomic_inc(&imp->imp_waiting); + spin_lock(&imp->imp_lock); + atomic_dec(&imp->imp_waiting); + } + + /* + * Replay-enabled imports return commit-status information. 
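+ * The last_committed transno carried in each reply tells the client
+ * which replayable requests can never be needed again; they are freed
+ * by ptlrpc_free_committed() below.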
+ */ + committed = lustre_msg_get_last_committed(req->rq_repmsg); + if (likely(committed > imp->imp_peer_committed_transno)) + imp->imp_peer_committed_transno = committed; + + ptlrpc_free_committed(imp); + + if (!list_empty(&imp->imp_replay_list)) { + struct ptlrpc_request *last; + + last = list_entry(imp->imp_replay_list.prev, + struct ptlrpc_request, + rq_replay_list); + /* + * Requests with rq_replay stay on the list even if no + * commit is expected. + */ + if (last->rq_transno > imp->imp_peer_committed_transno) + ptlrpc_pinger_commit_expected(imp); + } + + spin_unlock(&imp->imp_lock); + } + + RETURN(rc); +} + +/** + * Helper function to send request \a req over the network for the first time + * Also adjusts request phase. + * Returns 0 on success or error code. + */ +static int ptlrpc_send_new_req(struct ptlrpc_request *req) +{ + struct obd_import *imp = req->rq_import; + __u64 min_xid = 0; + int rc; + + ENTRY; + LASSERT(req->rq_phase == RQ_PHASE_NEW); + + /* do not try to go further if there is not enough memory in enc_pool */ + if (req->rq_sent && req->rq_bulk) + if (req->rq_bulk->bd_iov_count > get_free_pages_in_pool() && + pool_is_at_full_capacity()) + RETURN(-ENOMEM); + + if (req->rq_sent && (req->rq_sent > ktime_get_real_seconds()) && + (!req->rq_generation_set || + req->rq_import_generation == imp->imp_generation)) + RETURN(0); + + ptlrpc_rqphase_move(req, RQ_PHASE_RPC); + + spin_lock(&imp->imp_lock); + + LASSERT(req->rq_xid != 0); + LASSERT(!list_empty(&req->rq_unreplied_list)); + + if (!req->rq_generation_set) + req->rq_import_generation = imp->imp_generation; + + if (ptlrpc_import_delay_req(imp, req, &rc)) { + spin_lock(&req->rq_lock); + req->rq_waiting = 1; + spin_unlock(&req->rq_lock); + + DEBUG_REQ(D_HA, req, "req waiting for recovery: (%s != %s)", + ptlrpc_import_state_name(req->rq_send_state), + ptlrpc_import_state_name(imp->imp_state)); + LASSERT(list_empty(&req->rq_list)); + list_add_tail(&req->rq_list, &imp->imp_delayed_list); + atomic_inc(&req->rq_import->imp_inflight); + spin_unlock(&imp->imp_lock); + RETURN(0); + } + + if (rc != 0) { + spin_unlock(&imp->imp_lock); + req->rq_status = rc; + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + RETURN(rc); + } + + LASSERT(list_empty(&req->rq_list)); + list_add_tail(&req->rq_list, &imp->imp_sending_list); + atomic_inc(&req->rq_import->imp_inflight); + + /* + * find the known replied XID from the unreplied list, CONNECT + * and DISCONNECT requests are skipped to make the sanity check + * on server side happy. see process_req_last_xid(). + * + * For CONNECT: Because replay requests have lower XID, it'll + * break the sanity check if CONNECT bump the exp_last_xid on + * server. + * + * For DISCONNECT: Since client will abort inflight RPC before + * sending DISCONNECT, DISCONNECT may carry an XID which higher + * than the inflight RPC. + */ + if (!ptlrpc_req_is_connect(req) && !ptlrpc_req_is_disconnect(req)) + min_xid = ptlrpc_known_replied_xid(imp); + spin_unlock(&imp->imp_lock); + + lustre_msg_set_last_xid(req->rq_reqmsg, min_xid); + + lustre_msg_set_status(req->rq_reqmsg, current->pid); + + /* If the request to be sent is an LDLM callback, do not try to + * refresh context. + * An LDLM callback is sent by a server to a client in order to make + * it release a lock, on a communication channel that uses a reverse + * context. It cannot be refreshed on its own, as it is the 'reverse' + * (server-side) representation of a client context. 
+ * We do not care if the reverse context is expired, and want to send + * the LDLM callback anyway. Once the client receives the AST, it is + * its job to refresh its own context if it has expired, hence + * refreshing the associated reverse context on server side, before + * being able to send the LDLM_CANCEL requested by the server. + */ + if (lustre_msg_get_opc(req->rq_reqmsg) != LDLM_BL_CALLBACK && + lustre_msg_get_opc(req->rq_reqmsg) != LDLM_CP_CALLBACK && + lustre_msg_get_opc(req->rq_reqmsg) != LDLM_GL_CALLBACK) + rc = sptlrpc_req_refresh_ctx(req, 0); + if (rc) { + if (req->rq_err) { + req->rq_status = rc; + RETURN(1); + } else { + spin_lock(&req->rq_lock); + req->rq_wait_ctx = 1; + spin_unlock(&req->rq_lock); + RETURN(0); + } + } + + CDEBUG(D_RPCTRACE, + "Sending RPC req@%p pname:cluuid:pid:xid:nid:opc:job %s:%s:%d:%llu:%s:%d:%s\n", + req, current->comm, + imp->imp_obd->obd_uuid.uuid, + lustre_msg_get_status(req->rq_reqmsg), req->rq_xid, + obd_import_nid2str(imp), lustre_msg_get_opc(req->rq_reqmsg), + lustre_msg_get_jobid(req->rq_reqmsg) ?: ""); + + rc = ptl_send_rpc(req, 0); + if (rc == -ENOMEM) { + spin_lock(&imp->imp_lock); + if (!list_empty(&req->rq_list)) { + list_del_init(&req->rq_list); + if (atomic_dec_and_test(&req->rq_import->imp_inflight)) + wake_up(&req->rq_import->imp_recovery_waitq); + } + spin_unlock(&imp->imp_lock); + ptlrpc_rqphase_move(req, RQ_PHASE_NEW); + RETURN(rc); + } + if (rc) { + DEBUG_REQ(D_HA, req, "send failed, expect timeout: rc = %d", + rc); + spin_lock(&req->rq_lock); + req->rq_net_err = 1; + spin_unlock(&req->rq_lock); + RETURN(rc); + } + RETURN(0); +} + +static inline int ptlrpc_set_producer(struct ptlrpc_request_set *set) +{ + int remaining, rc; + + ENTRY; + LASSERT(set->set_producer != NULL); + + remaining = atomic_read(&set->set_remaining); + + /* + * populate the ->set_requests list with requests until we + * reach the maximum number of RPCs in flight for this set + */ + while (atomic_read(&set->set_remaining) < set->set_max_inflight) { + rc = set->set_producer(set, set->set_producer_arg); + if (rc == -ENOENT) { + /* no more RPC to produce */ + set->set_producer = NULL; + set->set_producer_arg = NULL; + RETURN(0); + } + } + + RETURN((atomic_read(&set->set_remaining) - remaining)); +} + +/** + * this sends any unsent RPCs in \a set and returns 1 if all are sent + * and no more replies are expected. + * (it is possible to get less replies than requests sent e.g. due to timed out + * requests or requests that we had trouble to send out) + * + * NOTE: This function contains a potential schedule point (cond_resched()). + */ +int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) +{ + struct ptlrpc_request *req, *next; + LIST_HEAD(comp_reqs); + int force_timer_recalc = 0; + + ENTRY; + if (atomic_read(&set->set_remaining) == 0) + RETURN(1); + + list_for_each_entry_safe(req, next, &set->set_requests, + rq_set_chain) { + struct obd_import *imp = req->rq_import; + int unregistered = 0; + int async = 1; + int rc = 0; + + if (req->rq_phase == RQ_PHASE_COMPLETE) { + list_move_tail(&req->rq_set_chain, &comp_reqs); + continue; + } + + /* + * This schedule point is mainly for the ptlrpcd caller of this + * function. Most ptlrpc sets are not long-lived and unbounded + * in length, but at the least the set used by the ptlrpcd is. + * Since the processing time is unbounded, we need to insert an + * explicit schedule point to make the thread well-behaved. 
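+ * (The ptlrpcd set can stay populated for as long as the client is
+ * mounted, so without a resched this loop could hog a CPU.)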
+ */ + cond_resched(); + + /* + * If the caller requires to allow to be interpreted by force + * and it has really been interpreted, then move the request + * to RQ_PHASE_INTERPRET phase in spite of what the current + * phase is. + */ + if (unlikely(req->rq_allow_intr && req->rq_intr)) { + req->rq_status = -EINTR; + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + + /* + * Since it is interpreted and we have to wait for + * the reply to be unlinked, then use sync mode. + */ + async = 0; + + GOTO(interpret, req->rq_status); + } + + if (req->rq_phase == RQ_PHASE_NEW && ptlrpc_send_new_req(req)) + force_timer_recalc = 1; + + /* delayed send - skip */ + if (req->rq_phase == RQ_PHASE_NEW && req->rq_sent) + continue; + + /* delayed resend - skip */ + if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend && + req->rq_sent > ktime_get_real_seconds()) + continue; + + if (!(req->rq_phase == RQ_PHASE_RPC || + req->rq_phase == RQ_PHASE_BULK || + req->rq_phase == RQ_PHASE_INTERPRET || + req->rq_phase == RQ_PHASE_UNREG_RPC || + req->rq_phase == RQ_PHASE_UNREG_BULK)) { + DEBUG_REQ(D_ERROR, req, "bad phase %x", req->rq_phase); + LBUG(); + } + + if (req->rq_phase == RQ_PHASE_UNREG_RPC || + req->rq_phase == RQ_PHASE_UNREG_BULK) { + LASSERT(req->rq_next_phase != req->rq_phase); + LASSERT(req->rq_next_phase != RQ_PHASE_UNDEFINED); + + if (req->rq_req_deadline && + !OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REQ_UNLINK)) + req->rq_req_deadline = 0; + if (req->rq_reply_deadline && + !OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) + req->rq_reply_deadline = 0; + if (req->rq_bulk_deadline && + !OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) + req->rq_bulk_deadline = 0; + + /* + * Skip processing until reply is unlinked. We + * can't return to pool before that and we can't + * call interpret before that. We need to make + * sure that all rdma transfers finished and will + * not corrupt any data. + */ + if (req->rq_phase == RQ_PHASE_UNREG_RPC && + ptlrpc_cli_wait_unlink(req)) + continue; + if (req->rq_phase == RQ_PHASE_UNREG_BULK && + ptlrpc_client_bulk_active(req)) + continue; + + /* + * Turn fail_loc off to prevent it from looping + * forever. + */ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) { + OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK, + OBD_FAIL_ONCE); + } + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) { + OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK, + OBD_FAIL_ONCE); + } + + /* + * Move to next phase if reply was successfully + * unlinked. + */ + ptlrpc_rqphase_move(req, req->rq_next_phase); + } + + if (req->rq_phase == RQ_PHASE_INTERPRET) + GOTO(interpret, req->rq_status); + + /* + * Note that this also will start async reply unlink. + */ + if (req->rq_net_err && !req->rq_timedout) { + ptlrpc_expire_one_request(req, 1); + + /* + * Check if we still need to wait for unlink. + */ + if (ptlrpc_cli_wait_unlink(req) || + ptlrpc_client_bulk_active(req)) + continue; + /* If there is no need to resend, fail it now. 
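+ * (Such no-resend requests get -EIO here unless rq_status was already
+ * set by an earlier failure.)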
*/ + if (req->rq_no_resend) { + if (req->rq_status == 0) + req->rq_status = -EIO; + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + GOTO(interpret, req->rq_status); + } else { + continue; + } + } + + if (req->rq_err) { + if (!ptlrpc_unregister_reply(req, 1)) { + ptlrpc_unregister_bulk(req, 1); + continue; + } + + spin_lock(&req->rq_lock); + req->rq_replied = 0; + spin_unlock(&req->rq_lock); + if (req->rq_status == 0) + req->rq_status = -EIO; + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + GOTO(interpret, req->rq_status); + } + + /* + * ptlrpc_set_wait uses l_wait_event_abortable_timeout() + * so it sets rq_intr regardless of individual rpc + * timeouts. The synchronous IO waiting path sets + * rq_intr irrespective of whether ptlrpcd + * has seen a timeout. Our policy is to only interpret + * interrupted rpcs after they have timed out, so we + * need to enforce that here. + */ + + if (req->rq_intr && (req->rq_timedout || req->rq_waiting || + req->rq_wait_ctx)) { + req->rq_status = -EINTR; + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + GOTO(interpret, req->rq_status); + } + + if (req->rq_phase == RQ_PHASE_RPC) { + if (req->rq_timedout || req->rq_resend || + req->rq_waiting || req->rq_wait_ctx) { + int status; + + if (!ptlrpc_unregister_reply(req, 1)) { + ptlrpc_unregister_bulk(req, 1); + continue; + } + + spin_lock(&imp->imp_lock); + if (ptlrpc_import_delay_req(imp, req, + &status)) { + /* + * put on delay list - only if we wait + * recovery finished - before send + */ + list_move_tail(&req->rq_list, + &imp->imp_delayed_list); + spin_unlock(&imp->imp_lock); + continue; + } + + if (status != 0) { + req->rq_status = status; + ptlrpc_rqphase_move(req, + RQ_PHASE_INTERPRET); + spin_unlock(&imp->imp_lock); + GOTO(interpret, req->rq_status); + } + /* ignore on just initiated connections */ + if (ptlrpc_no_resend(req) && + !req->rq_wait_ctx && + imp->imp_generation != + imp->imp_initiated_at) { + req->rq_status = -ENOTCONN; + ptlrpc_rqphase_move(req, + RQ_PHASE_INTERPRET); + spin_unlock(&imp->imp_lock); + GOTO(interpret, req->rq_status); + } + + /* don't resend too fast in case of network + * errors. + */ + if (ktime_get_real_seconds() < (req->rq_sent + 1) + && req->rq_net_err && req->rq_timedout) { + + DEBUG_REQ(D_INFO, req, + "throttle request"); + /* Don't try to resend RPC right away + * as it is likely it will fail again + * and ptlrpc_check_set() will be + * called again, keeping this thread + * busy. Instead, wait for the next + * timeout. Flag it as resend to + * ensure we don't wait to long. + */ + req->rq_resend = 1; + spin_unlock(&imp->imp_lock); + continue; + } + + list_move_tail(&req->rq_list, + &imp->imp_sending_list); + + spin_unlock(&imp->imp_lock); + + spin_lock(&req->rq_lock); + req->rq_waiting = 0; + spin_unlock(&req->rq_lock); + + if (req->rq_timedout || req->rq_resend) { + /* + * This is re-sending anyways, + * let's mark req as resend. + */ + spin_lock(&req->rq_lock); + req->rq_resend = 1; + spin_unlock(&req->rq_lock); + } + /* + * rq_wait_ctx is only touched by ptlrpcd, + * so no lock is needed here. 
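+ * (The refresh below clears rq_wait_ctx once the context is usable, or
+ * sets it again to keep waiting.)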
+ */ + status = sptlrpc_req_refresh_ctx(req, 0); + if (status) { + if (req->rq_err) { + req->rq_status = status; + spin_lock(&req->rq_lock); + req->rq_wait_ctx = 0; + spin_unlock(&req->rq_lock); + force_timer_recalc = 1; + } else { + spin_lock(&req->rq_lock); + req->rq_wait_ctx = 1; + spin_unlock(&req->rq_lock); + } + + continue; + } else { + spin_lock(&req->rq_lock); + req->rq_wait_ctx = 0; + spin_unlock(&req->rq_lock); + } + + /* + * In any case, the previous bulk should be + * cleaned up to prepare for the new sending + */ + if (req->rq_bulk && + !ptlrpc_unregister_bulk(req, 1)) + continue; + + rc = ptl_send_rpc(req, 0); + if (rc == -ENOMEM) { + spin_lock(&imp->imp_lock); + if (!list_empty(&req->rq_list)) + list_del_init(&req->rq_list); + spin_unlock(&imp->imp_lock); + ptlrpc_rqphase_move(req, RQ_PHASE_NEW); + continue; + } + if (rc) { + DEBUG_REQ(D_HA, req, + "send failed: rc = %d", rc); + force_timer_recalc = 1; + spin_lock(&req->rq_lock); + req->rq_net_err = 1; + spin_unlock(&req->rq_lock); + continue; + } + /* need to reset the timeout */ + force_timer_recalc = 1; + } + + spin_lock(&req->rq_lock); + + if (ptlrpc_client_early(req)) { + ptlrpc_at_recv_early_reply(req); + spin_unlock(&req->rq_lock); + continue; + } + + /* Still waiting for a reply? */ + if (ptlrpc_client_recv(req)) { + spin_unlock(&req->rq_lock); + continue; + } + + /* Did we actually receive a reply? */ + if (!ptlrpc_client_replied(req)) { + spin_unlock(&req->rq_lock); + continue; + } + + spin_unlock(&req->rq_lock); + + /* + * unlink from net because we are going to + * swab in-place of reply buffer + */ + unregistered = ptlrpc_unregister_reply(req, 1); + if (!unregistered) + continue; + + req->rq_status = after_reply(req); + if (req->rq_resend) { + force_timer_recalc = 1; + continue; + } + + /* + * If there is no bulk associated with this request, + * then we're done and should let the interpreter + * process the reply. Similarly if the RPC returned + * an error, and therefore the bulk will never arrive. + */ + if (!req->rq_bulk || req->rq_status < 0) { + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + GOTO(interpret, req->rq_status); + } + + ptlrpc_rqphase_move(req, RQ_PHASE_BULK); + } + + LASSERT(req->rq_phase == RQ_PHASE_BULK); + if (ptlrpc_client_bulk_active(req)) + continue; + + if (req->rq_bulk->bd_failure) { + /* + * The RPC reply arrived OK, but the bulk screwed + * up! Dead weird since the server told us the RPC + * was good after getting the REPLY for her GET or + * the ACK for her PUT. + */ + DEBUG_REQ(D_ERROR, req, "bulk transfer failed %d/%d/%d", + req->rq_status, + req->rq_bulk->bd_nob, + req->rq_bulk->bd_nob_transferred); + req->rq_status = -EIO; + } + + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + +interpret: + LASSERT(req->rq_phase == RQ_PHASE_INTERPRET); + + /* + * This moves to "unregistering" phase we need to wait for + * reply unlink. + */ + if (!unregistered && !ptlrpc_unregister_reply(req, async)) { + /* start async bulk unlink too */ + ptlrpc_unregister_bulk(req, 1); + continue; + } + + if (!ptlrpc_unregister_bulk(req, async)) + continue; + + /* + * When calling interpret receiving already should be + * finished. 
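+ * (Both the reply and any bulk have been unlinked at this point, so the
+ * buffers can safely be consumed in place.)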
+ */ + LASSERT(!req->rq_receiving_reply); + + ptlrpc_req_interpret(env, req, req->rq_status); + + if (ptlrpcd_check_work(req)) { + atomic_dec(&set->set_remaining); + continue; + } + ptlrpc_rqphase_move(req, RQ_PHASE_COMPLETE); + + if (req->rq_reqmsg) + CDEBUG(D_RPCTRACE, + "Completed RPC req@%p pname:cluuid:pid:xid:nid:opc:job %s:%s:%d:%llu:%s:%d:%s\n", + req, current->comm, + imp->imp_obd->obd_uuid.uuid, + lustre_msg_get_status(req->rq_reqmsg), + req->rq_xid, + obd_import_nid2str(imp), + lustre_msg_get_opc(req->rq_reqmsg), + lustre_msg_get_jobid(req->rq_reqmsg) ?: ""); + + spin_lock(&imp->imp_lock); + /* + * Request already may be not on sending or delaying list. This + * may happen in the case of marking it erroneous for the case + * ptlrpc_import_delay_req(req, status) find it impossible to + * allow sending this rpc and returns *status != 0. + */ + if (!list_empty(&req->rq_list)) { + list_del_init(&req->rq_list); + if (atomic_dec_and_test(&imp->imp_inflight)) + wake_up(&imp->imp_recovery_waitq); + } + list_del_init(&req->rq_unreplied_list); + spin_unlock(&imp->imp_lock); + + atomic_dec(&set->set_remaining); + wake_up(&imp->imp_recovery_waitq); + + if (set->set_producer) { + /* produce a new request if possible */ + if (ptlrpc_set_producer(set) > 0) + force_timer_recalc = 1; + + /* + * free the request that has just been completed + * in order not to pollute set->set_requests + */ + list_del_init(&req->rq_set_chain); + spin_lock(&req->rq_lock); + req->rq_set = NULL; + req->rq_invalid_rqset = 0; + spin_unlock(&req->rq_lock); + + /* record rq_status to compute the final status later */ + if (req->rq_status != 0) + set->set_rc = req->rq_status; + ptlrpc_req_finished(req); + } else { + list_move_tail(&req->rq_set_chain, &comp_reqs); + } + } + + /* + * move completed request at the head of list so it's easier for + * caller to find them + */ + list_splice(&comp_reqs, &set->set_requests); + + /* If we hit an error, we want to recover promptly. */ + RETURN(atomic_read(&set->set_remaining) == 0 || force_timer_recalc); +} +EXPORT_SYMBOL(ptlrpc_check_set); + +/** + * Time out request \a req. is \a async_unlink is set, that means do not wait + * until LNet actually confirms network buffer unlinking. + * Return 1 if we should give up further retrying attempts or 0 otherwise. + */ +int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink) +{ + struct obd_import *imp = req->rq_import; + unsigned int debug_mask = D_RPCTRACE; + int rc = 0; + + ENTRY; + spin_lock(&req->rq_lock); + req->rq_timedout = 1; + spin_unlock(&req->rq_lock); + + if (ptlrpc_console_allow(req, lustre_msg_get_opc(req->rq_reqmsg), + lustre_msg_get_status(req->rq_reqmsg))) + debug_mask = D_WARNING; + DEBUG_REQ(debug_mask, req, "Request sent has %s: [sent %lld/real %lld]", + req->rq_net_err ? "failed due to network error" : + ((req->rq_real_sent == 0 || + req->rq_real_sent < req->rq_sent || + req->rq_real_sent >= req->rq_deadline) ? + "timed out for sent delay" : "timed out for slow reply"), + req->rq_sent, req->rq_real_sent); + + if (imp && obd_debug_peer_on_timeout) + LNetDebugPeer(&imp->imp_connection->c_peer); + + ptlrpc_unregister_reply(req, async_unlink); + ptlrpc_unregister_bulk(req, async_unlink); + + if (obd_dump_on_timeout) + libcfs_debug_dumplog(); + + if (!imp) { + DEBUG_REQ(D_HA, req, "NULL import: already cleaned up?"); + RETURN(1); + } + + atomic_inc(&imp->imp_timeouts); + + /* The DLM server doesn't want recovery run on its imports. 
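+ * (Such reverse imports are flagged with imp_dlm_fake; just give up on
+ * the request.)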
*/ + if (imp->imp_dlm_fake) + RETURN(1); + + /* + * If this request is for recovery or other primordial tasks, + * then error it out here. + */ + if (req->rq_ctx_init || req->rq_ctx_fini || + req->rq_send_state != LUSTRE_IMP_FULL || + imp->imp_obd->obd_no_recov) { + DEBUG_REQ(D_RPCTRACE, req, "err -110, sent_state=%s (now=%s)", + ptlrpc_import_state_name(req->rq_send_state), + ptlrpc_import_state_name(imp->imp_state)); + spin_lock(&req->rq_lock); + req->rq_status = -ETIMEDOUT; + req->rq_err = 1; + spin_unlock(&req->rq_lock); + RETURN(1); + } + + /* + * if a request can't be resent we can't wait for an answer after + * the timeout + */ + if (ptlrpc_no_resend(req)) { + DEBUG_REQ(D_RPCTRACE, req, "TIMEOUT-NORESEND:"); + rc = 1; + } + + ptlrpc_fail_import(imp, lustre_msg_get_conn_cnt(req->rq_reqmsg)); + + RETURN(rc); +} + +/** + * Time out all uncompleted requests in request set pointed by \a data + * This is called when a wait times out. + */ +void ptlrpc_expired_set(struct ptlrpc_request_set *set) +{ + struct ptlrpc_request *req; + time64_t now = ktime_get_real_seconds(); + + ENTRY; + LASSERT(set != NULL); + + /* + * A timeout expired. See which reqs it applies to... + */ + list_for_each_entry(req, &set->set_requests, rq_set_chain) { + /* don't expire request waiting for context */ + if (req->rq_wait_ctx) + continue; + + /* Request in-flight? */ + if (!((req->rq_phase == RQ_PHASE_RPC && + !req->rq_waiting && !req->rq_resend) || + (req->rq_phase == RQ_PHASE_BULK))) + continue; + + if (req->rq_timedout || /* already dealt with */ + req->rq_deadline > now) /* not expired */ + continue; + + /* + * Deal with this guy. Do it asynchronously to not block + * ptlrpcd thread. + */ + ptlrpc_expire_one_request(req, 1); + /* + * Loops require that we resched once in a while to avoid + * RCU stalls and a few other problems. + */ + cond_resched(); + + } +} + +/** + * Interrupts (sets interrupted flag) all uncompleted requests in + * a set \a data. This is called when a wait_event is interrupted + * by a signal. + */ +static void ptlrpc_interrupted_set(struct ptlrpc_request_set *set) +{ + struct ptlrpc_request *req; + + LASSERT(set != NULL); + CDEBUG(D_RPCTRACE, "INTERRUPTED SET %p\n", set); + + list_for_each_entry(req, &set->set_requests, rq_set_chain) { + if (req->rq_intr) + continue; + + if (req->rq_phase != RQ_PHASE_RPC && + req->rq_phase != RQ_PHASE_UNREG_RPC && + !req->rq_allow_intr) + continue; + + spin_lock(&req->rq_lock); + req->rq_intr = 1; + spin_unlock(&req->rq_lock); + } +} + +/** + * Get the smallest timeout in the set; this does NOT set a timeout. + */ +time64_t ptlrpc_set_next_timeout(struct ptlrpc_request_set *set) +{ + time64_t now = ktime_get_real_seconds(); + int timeout = 0; + struct ptlrpc_request *req; + time64_t deadline; + + ENTRY; + list_for_each_entry(req, &set->set_requests, rq_set_chain) { + /* Request in-flight? */ + if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) || + (req->rq_phase == RQ_PHASE_BULK) || + (req->rq_phase == RQ_PHASE_NEW))) + continue; + + /* Already timed out. */ + if (req->rq_timedout) + continue; + + /* Waiting for ctx. 
*/ + if (req->rq_wait_ctx) + continue; + + if (req->rq_phase == RQ_PHASE_NEW) + deadline = req->rq_sent; + else if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend) + deadline = req->rq_sent; + else + deadline = req->rq_sent + req->rq_timeout; + + if (deadline <= now) /* actually expired already */ + timeout = 1; /* ASAP */ + else if (timeout == 0 || timeout > deadline - now) + timeout = deadline - now; + } + RETURN(timeout); +} + +/** + * Send all unset request from the set and then wait untill all + * requests in the set complete (either get a reply, timeout, get an + * error or otherwise be interrupted). + * Returns 0 on success or error code otherwise. + */ +int ptlrpc_set_wait(const struct lu_env *env, struct ptlrpc_request_set *set) +{ + struct ptlrpc_request *req; + time64_t timeout; + int rc; + + ENTRY; + if (set->set_producer) + (void)ptlrpc_set_producer(set); + else + list_for_each_entry(req, &set->set_requests, rq_set_chain) { + if (req->rq_phase == RQ_PHASE_NEW) + (void)ptlrpc_send_new_req(req); + } + + if (list_empty(&set->set_requests)) + RETURN(0); + + do { + timeout = ptlrpc_set_next_timeout(set); + + /* + * wait until all complete, interrupted, or an in-flight + * req times out + */ + CDEBUG(D_RPCTRACE, "set %p going to sleep for %lld seconds\n", + set, timeout); + + if ((timeout == 0 && !signal_pending(current)) || + set->set_allow_intr) { + /* + * No requests are in-flight (ether timed out + * or delayed), so we can allow interrupts. + * We still want to block for a limited time, + * so we allow interrupts during the timeout. + */ + rc = l_wait_event_abortable_timeout( + set->set_waitq, + ptlrpc_check_set(NULL, set), + cfs_time_seconds(timeout ? timeout : 1)); + if (rc == 0) { + rc = -ETIMEDOUT; + ptlrpc_expired_set(set); + } else if (rc < 0) { + rc = -EINTR; + ptlrpc_interrupted_set(set); + } else { + rc = 0; + } + } else { + /* + * At least one request is in flight, so no + * interrupts are allowed. Wait until all + * complete, or an in-flight req times out. + */ + rc = wait_event_idle_timeout( + set->set_waitq, + ptlrpc_check_set(NULL, set), + cfs_time_seconds(timeout ? timeout : 1)); + if (rc == 0) { + ptlrpc_expired_set(set); + rc = -ETIMEDOUT; + } else { + rc = 0; + } + + /* + * LU-769 - if we ignored the signal because + * it was already pending when we started, we + * need to handle it now or we risk it being + * ignored forever + */ + if (rc == -ETIMEDOUT && + signal_pending(current)) { + sigset_t old, new; + + siginitset(&new, LUSTRE_FATAL_SIGS); + sigprocmask(SIG_BLOCK, &new, &old); + /* + * In fact we only interrupt for the + * "fatal" signals like SIGINT or + * SIGKILL. We still ignore less + * important signals since ptlrpc set + * is not easily reentrant from + * userspace again + */ + if (signal_pending(current)) + ptlrpc_interrupted_set(set); + sigprocmask(SIG_SETMASK, &old, NULL); + } + } + + LASSERT(rc == 0 || rc == -EINTR || rc == -ETIMEDOUT); + + /* + * -EINTR => all requests have been flagged rq_intr so next + * check completes. + * -ETIMEDOUT => someone timed out. When all reqs have + * timed out, signals are enabled allowing completion with + * EINTR. + * I don't really care if we go once more round the loop in + * the error cases -eeb. 
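+ * In short: rc == 0 means the wait finished without a timeout or an
+ * interrupt, and the surrounding do/while loops again while rc != 0 or
+ * requests remain in the set.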
+ */ + if (rc == 0 && atomic_read(&set->set_remaining) == 0) { + list_for_each_entry(req, &set->set_requests, + rq_set_chain) { + spin_lock(&req->rq_lock); + req->rq_invalid_rqset = 1; + spin_unlock(&req->rq_lock); + } + } + } while (rc != 0 || atomic_read(&set->set_remaining) != 0); + + LASSERT(atomic_read(&set->set_remaining) == 0); + + rc = set->set_rc; /* rq_status of already freed requests if any */ + list_for_each_entry(req, &set->set_requests, rq_set_chain) { + LASSERT(req->rq_phase == RQ_PHASE_COMPLETE); + if (req->rq_status != 0) + rc = req->rq_status; + } + + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpc_set_wait); + +/** + * Helper fuction for request freeing. + * Called when request count reached zero and request needs to be freed. + * Removes request from all sorts of sending/replay lists it might be on, + * frees network buffers if any are present. + * If \a locked is set, that means caller is already holding import imp_lock + * and so we no longer need to reobtain it (for certain lists manipulations) + */ +static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked) +{ + ENTRY; + + if (!request) + RETURN_EXIT; + + LASSERT(!request->rq_srv_req); + LASSERT(request->rq_export == NULL); + LASSERTF(!request->rq_receiving_reply, "req %p\n", request); + LASSERTF(list_empty(&request->rq_list), "req %p\n", request); + LASSERTF(list_empty(&request->rq_set_chain), "req %p\n", request); + LASSERTF(!request->rq_replay, "req %p\n", request); + + req_capsule_fini(&request->rq_pill); + + /* + * We must take it off the imp_replay_list first. Otherwise, we'll set + * request->rq_reqmsg to NULL while osc_close is dereferencing it. + */ + if (request->rq_import) { + if (!locked) + spin_lock(&request->rq_import->imp_lock); + list_del_init(&request->rq_replay_list); + list_del_init(&request->rq_unreplied_list); + if (!locked) + spin_unlock(&request->rq_import->imp_lock); + } + LASSERTF(list_empty(&request->rq_replay_list), "req %p\n", request); + + if (atomic_read(&request->rq_refcount) != 0) { + DEBUG_REQ(D_ERROR, request, + "freeing request with nonzero refcount"); + LBUG(); + } + + if (request->rq_repbuf) + sptlrpc_cli_free_repbuf(request); + + if (request->rq_import) { + if (!ptlrpcd_check_work(request)) { + LASSERT(atomic_read(&request->rq_import->imp_reqs) > 0); + atomic_dec(&request->rq_import->imp_reqs); + } + class_import_put(request->rq_import); + request->rq_import = NULL; + } + if (request->rq_bulk) + ptlrpc_free_bulk(request->rq_bulk); + + if (request->rq_reqbuf || request->rq_clrbuf) + sptlrpc_cli_free_reqbuf(request); + + if (request->rq_cli_ctx) + sptlrpc_req_put_ctx(request, !locked); + + if (request->rq_pool) + __ptlrpc_free_req_to_pool(request); + else + ptlrpc_request_cache_free(request); + EXIT; +} + +static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked); +/** + * Drop one request reference. Must be called with import imp_lock held. + * When reference count drops to zero, request is freed. + */ +void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request) +{ + assert_spin_locked(&request->rq_import->imp_lock); + (void)__ptlrpc_req_finished(request, 1); +} + +/** + * Helper function + * Drops one reference count for request \a request. + * \a locked set indicates that caller holds import imp_lock. + * Frees the request whe reference count reaches zero. 
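+ * As a special case, the reply buffer of an open request may be freed
+ * early, once only a single reference remains, to reduce memory
+ * pressure (LU-9514).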
+ * + * \retval 1 the request is freed + * \retval 0 some others still hold references on the request + */ +static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked) +{ + int count; + + ENTRY; + if (!request) + RETURN(1); + + LASSERT(request != LP_POISON); + LASSERT(request->rq_reqmsg != LP_POISON); + + DEBUG_REQ(D_INFO, request, "refcount now %u", + atomic_read(&request->rq_refcount) - 1); + + spin_lock(&request->rq_lock); + count = atomic_dec_return(&request->rq_refcount); + LASSERTF(count >= 0, "Invalid ref count %d\n", count); + + /* + * For open RPC, the client does not know the EA size (LOV, ACL, and + * so on) before replied, then the client has to reserve very large + * reply buffer. Such buffer will not be released until the RPC freed. + * Since The open RPC is replayable, we need to keep it in the replay + * list until close. If there are a lot of files opened concurrently, + * then the client may be OOM. + * + * If fact, it is unnecessary to keep reply buffer for open replay, + * related EAs have already been saved via mdc_save_lovea() before + * coming here. So it is safe to free the reply buffer some earlier + * before releasing the RPC to avoid client OOM. LU-9514 + */ + if (count == 1 && request->rq_early_free_repbuf && request->rq_repbuf) { + spin_lock(&request->rq_early_free_lock); + sptlrpc_cli_free_repbuf(request); + request->rq_repbuf = NULL; + request->rq_repbuf_len = 0; + request->rq_repdata = NULL; + request->rq_reqdata_len = 0; + spin_unlock(&request->rq_early_free_lock); + } + spin_unlock(&request->rq_lock); + + if (!count) + __ptlrpc_free_req(request, locked); + + RETURN(!count); +} + +/** + * Drops one reference count for a request. + */ +void ptlrpc_req_finished(struct ptlrpc_request *request) +{ + __ptlrpc_req_finished(request, 0); +} +EXPORT_SYMBOL(ptlrpc_req_finished); + +/** + * Returns xid of a \a request + */ +__u64 ptlrpc_req_xid(struct ptlrpc_request *request) +{ + return request->rq_xid; +} +EXPORT_SYMBOL(ptlrpc_req_xid); + +/** + * Disengage the client's reply buffer from the network + * NB does _NOT_ unregister any client-side bulk. + * IDEMPOTENT, but _not_ safe against concurrent callers. + * The request owner (i.e. the thread doing the I/O) must call... + * Returns 0 on success or 1 if unregistering cannot be made. + */ +static int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async) +{ + bool discard = false; + /* + * Might sleep. + */ + LASSERT(!in_interrupt()); + + /* Let's setup deadline for reply unlink. */ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && + async && request->rq_reply_deadline == 0 && cfs_fail_val == 0) + request->rq_reply_deadline = ktime_get_real_seconds() + + PTLRPC_REQ_LONG_UNLINK; + + /* + * Nothing left to do. + */ + if (!__ptlrpc_cli_wait_unlink(request, &discard)) + RETURN(1); + + LNetMDUnlink(request->rq_reply_md_h); + + if (discard) /* Discard the request-out callback */ + __LNetMDUnlink(request->rq_req_md_h, discard); + + /* + * Let's check it once again. + */ + if (!ptlrpc_cli_wait_unlink(request)) + RETURN(1); + + /* Move to "Unregistering" phase as reply was not unlinked yet. */ + ptlrpc_rqphase_move(request, RQ_PHASE_UNREG_RPC); + + /* + * Do not wait for unlink to finish. + */ + if (async) + RETURN(0); + + /* + * We have to wait_event_idle_timeout() whatever the result, to get + * a chance to run reply_in_callback(), and to make sure we've + * unlinked before returning a req to the pool. + */ + for (;;) { + wait_queue_head_t *wq = (request->rq_set) ? 
+ &request->rq_set->set_waitq : + &request->rq_reply_waitq; + int seconds = PTLRPC_REQ_LONG_UNLINK; + /* + * Network access will complete in finite time but the HUGE + * timeout lets us CWARN for visibility of sluggish NALs + */ + while (seconds > 0 && + wait_event_idle_timeout( + *wq, + !ptlrpc_cli_wait_unlink(request), + cfs_time_seconds(1)) == 0) + seconds -= 1; + if (seconds > 0) { + ptlrpc_rqphase_move(request, request->rq_next_phase); + RETURN(1); + } + + DEBUG_REQ(D_WARNING, request, + "Unexpectedly long timeout receiving_reply=%d req_ulinked=%d reply_unlinked=%d", + request->rq_receiving_reply, + request->rq_req_unlinked, + request->rq_reply_unlinked); + } + RETURN(0); +} + +static void ptlrpc_free_request(struct ptlrpc_request *req) +{ + spin_lock(&req->rq_lock); + req->rq_replay = 0; + spin_unlock(&req->rq_lock); + + if (req->rq_commit_cb) + req->rq_commit_cb(req); + list_del_init(&req->rq_replay_list); + + __ptlrpc_req_finished(req, 1); +} + +/** + * the request is committed and dropped from the replay list of its import + */ +void ptlrpc_request_committed(struct ptlrpc_request *req, int force) +{ + struct obd_import *imp = req->rq_import; + + spin_lock(&imp->imp_lock); + if (list_empty(&req->rq_replay_list)) { + spin_unlock(&imp->imp_lock); + return; + } + + if (force || req->rq_transno <= imp->imp_peer_committed_transno) { + if (imp->imp_replay_cursor == &req->rq_replay_list) + imp->imp_replay_cursor = req->rq_replay_list.next; + ptlrpc_free_request(req); + } + + spin_unlock(&imp->imp_lock); +} +EXPORT_SYMBOL(ptlrpc_request_committed); + +/** + * Iterates through replay_list on import and prunes + * all requests have transno smaller than last_committed for the + * import and don't have rq_replay set. + * Since requests are sorted in transno order, stops when meeting first + * transno bigger than last_committed. 
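+ * For example, with last_committed == 100, requests with transno <= 100
+ * and rq_replay clear are freed, an open with rq_replay set is moved to
+ * the imp_committed_list, and the first request with transno 101 stops
+ * the scan.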
+ * caller must hold imp->imp_lock + */ +void ptlrpc_free_committed(struct obd_import *imp) +{ + struct ptlrpc_request *req, *saved; + struct ptlrpc_request *last_req = NULL; /* temporary fire escape */ + bool skip_committed_list = true; + unsigned int replay_scanned = 0, replay_freed = 0; + unsigned int commit_scanned = 0, commit_freed = 0; + unsigned int debug_level = D_INFO; + __u64 peer_committed_transno; + int imp_generation; + time64_t start, now; + + ENTRY; + LASSERT(imp != NULL); + assert_spin_locked(&imp->imp_lock); + + start = ktime_get_seconds(); + /* save these here, we can potentially drop imp_lock after checking */ + peer_committed_transno = imp->imp_peer_committed_transno; + imp_generation = imp->imp_generation; + + if (peer_committed_transno == imp->imp_last_transno_checked && + imp_generation == imp->imp_last_generation_checked) { + CDEBUG(D_INFO, "%s: skip recheck: last_committed %llu\n", + imp->imp_obd->obd_name, peer_committed_transno); + RETURN_EXIT; + } + CDEBUG(D_RPCTRACE, "%s: committing for last_committed %llu gen %d\n", + imp->imp_obd->obd_name, peer_committed_transno, imp_generation); + + if (imp_generation != imp->imp_last_generation_checked || + imp->imp_last_transno_checked == 0) + skip_committed_list = false; + /* maybe drop imp_lock here, if another lock protected the lists */ + + list_for_each_entry_safe(req, saved, &imp->imp_replay_list, + rq_replay_list) { + /* XXX ok to remove when 1357 resolved - rread 05/29/03 */ + LASSERT(req != last_req); + last_req = req; + + if (req->rq_transno == 0) { + DEBUG_REQ(D_EMERG, req, "zero transno during replay"); + LBUG(); + } + + /* If other threads are waiting on imp_lock, stop processing + * in this thread. Another thread can finish remaining work. + * This may happen if there are huge numbers of open files + * that are closed suddenly or evicted, or if the server + * commit interval is very high vs. RPC rate. + */ + if (++replay_scanned % 2048 == 0) { + now = ktime_get_seconds(); + if (now > start + 5) + debug_level = D_WARNING; + + if ((replay_freed > 128 && now > start + 3) && + atomic_read(&imp->imp_waiting)) { + if (debug_level == D_INFO) + debug_level = D_RPCTRACE; + break; + } + } + + if (req->rq_import_generation < imp_generation) { + DEBUG_REQ(D_RPCTRACE, req, "free request with old gen"); + GOTO(free_req, 0); + } + + /* not yet committed */ + if (req->rq_transno > peer_committed_transno) { + DEBUG_REQ(D_RPCTRACE, req, "stopping search"); + break; + } + + if (req->rq_replay) { + DEBUG_REQ(D_RPCTRACE, req, "keeping (FL_REPLAY)"); + list_move_tail(&req->rq_replay_list, + &imp->imp_committed_list); + continue; + } + + DEBUG_REQ(D_INFO, req, "commit (last_committed %llu)", + peer_committed_transno); +free_req: + replay_freed++; + ptlrpc_free_request(req); + } + + if (skip_committed_list) + GOTO(out, 0); + + list_for_each_entry_safe(req, saved, &imp->imp_committed_list, + rq_replay_list) { + LASSERT(req->rq_transno != 0); + + /* If other threads are waiting on imp_lock, stop processing + * in this thread. Another thread can finish remaining work. */ + if (++commit_scanned % 2048 == 0) { + now = ktime_get_seconds(); + if (now > start + 6) + debug_level = D_WARNING; + + if ((commit_freed > 128 && now > start + 4) && + atomic_read(&imp->imp_waiting)) { + if (debug_level == D_INFO) + debug_level = D_RPCTRACE; + break; + } + } + + if (req->rq_import_generation < imp_generation || + !req->rq_replay) { + DEBUG_REQ(D_RPCTRACE, req, "free %s open request", + req->rq_import_generation < + imp_generation ? 
"stale" : "closed"); + + if (imp->imp_replay_cursor == &req->rq_replay_list) + imp->imp_replay_cursor = + req->rq_replay_list.next; + + commit_freed++; + ptlrpc_free_request(req); + } + } +out: + /* if full lists processed without interruption, avoid next scan */ + if (debug_level == D_INFO) { + imp->imp_last_transno_checked = peer_committed_transno; + imp->imp_last_generation_checked = imp_generation; + } + + CDEBUG_LIMIT(debug_level, + "%s: %s: skip=%u replay=%u/%u committed=%u/%u\n", + imp->imp_obd->obd_name, + debug_level == D_INFO ? "normal" : "overloaded", + skip_committed_list, replay_freed, replay_scanned, + commit_freed, commit_scanned); + EXIT; +} + +void ptlrpc_cleanup_client(struct obd_import *imp) +{ + ENTRY; + EXIT; +} + +/** + * Schedule previously sent request for resend. + * For bulk requests we assign new xid (to avoid problems with + * lost replies and therefore several transfers landing into same buffer + * from different sending attempts). + */ +void ptlrpc_resend_req(struct ptlrpc_request *req) +{ + DEBUG_REQ(D_HA, req, "going to resend"); + spin_lock(&req->rq_lock); + + /* + * Request got reply but linked to the import list still. + * Let ptlrpc_check_set() process it. + */ + if (ptlrpc_client_replied(req)) { + spin_unlock(&req->rq_lock); + DEBUG_REQ(D_HA, req, "it has reply, so skip it"); + return; + } + + req->rq_status = -EAGAIN; + + req->rq_resend = 1; + req->rq_net_err = 0; + req->rq_timedout = 0; + + ptlrpc_client_wake_req(req); + spin_unlock(&req->rq_lock); +} + +/* XXX: this function and rq_status are currently unused */ +void ptlrpc_restart_req(struct ptlrpc_request *req) +{ + DEBUG_REQ(D_HA, req, "restarting (possibly-)completed request"); + req->rq_status = -ERESTARTSYS; + + spin_lock(&req->rq_lock); + req->rq_restart = 1; + req->rq_timedout = 0; + ptlrpc_client_wake_req(req); + spin_unlock(&req->rq_lock); +} + +/** + * Grab additional reference on a request \a req + */ +struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req) +{ + ENTRY; + atomic_inc(&req->rq_refcount); + RETURN(req); +} +EXPORT_SYMBOL(ptlrpc_request_addref); + +/** + * Add a request to import replay_list. + * Must be called under imp_lock + */ +void ptlrpc_retain_replayable_request(struct ptlrpc_request *req, + struct obd_import *imp) +{ + struct ptlrpc_request *iter; + + assert_spin_locked(&imp->imp_lock); + + if (req->rq_transno == 0) { + DEBUG_REQ(D_EMERG, req, "saving request with zero transno"); + LBUG(); + } + + /* + * clear this for new requests that were resent as well + * as resent replayed requests. + */ + lustre_msg_clear_flags(req->rq_reqmsg, MSG_RESENT); + + /* don't re-add requests that have been replayed */ + if (!list_empty(&req->rq_replay_list)) + return; + + lustre_msg_add_flags(req->rq_reqmsg, MSG_REPLAY); + + spin_lock(&req->rq_lock); + req->rq_resend = 0; + spin_unlock(&req->rq_lock); + + LASSERT(imp->imp_replayable); + /* Balanced in ptlrpc_free_committed, usually. */ + ptlrpc_request_addref(req); + list_for_each_entry_reverse(iter, &imp->imp_replay_list, + rq_replay_list) { + /* + * We may have duplicate transnos if we create and then + * open a file, or for closes retained if to match creating + * opens, so use req->rq_xid as a secondary key. + * (See bugs 684, 685, and 428.) + * XXX no longer needed, but all opens need transnos! 
+		 */
+		if (iter->rq_transno > req->rq_transno)
+			continue;
+
+		if (iter->rq_transno == req->rq_transno) {
+			LASSERT(iter->rq_xid != req->rq_xid);
+			if (iter->rq_xid > req->rq_xid)
+				continue;
+		}
+
+		list_add(&req->rq_replay_list, &iter->rq_replay_list);
+		return;
+	}
+
+	list_add(&req->rq_replay_list, &imp->imp_replay_list);
+}
+
+/**
+ * Send request and wait until it completes.
+ * Returns the request processing status.
+ */
+int ptlrpc_queue_wait(struct ptlrpc_request *req)
+{
+	struct ptlrpc_request_set *set;
+	int rc;
+
+	ENTRY;
+	LASSERT(req->rq_set == NULL);
+	LASSERT(!req->rq_receiving_reply);
+
+	set = ptlrpc_prep_set();
+	if (!set) {
+		CERROR("cannot allocate ptlrpc set: rc = %d\n", -ENOMEM);
+		RETURN(-ENOMEM);
+	}
+
+	/* for distributed debugging */
+	lustre_msg_set_status(req->rq_reqmsg, current->pid);
+
+	/* add a ref for the set (see comment in ptlrpc_set_add_req) */
+	ptlrpc_request_addref(req);
+	ptlrpc_set_add_req(set, req);
+	rc = ptlrpc_set_wait(NULL, set);
+	ptlrpc_set_destroy(set);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_queue_wait);
+
+/**
+ * Callback used to process replies to replayed requests.
+ * On a successful reply it calls the registered request replay callback;
+ * on error it restarts the replay process.
+ */
+static int ptlrpc_replay_interpret(const struct lu_env *env,
+				   struct ptlrpc_request *req,
+				   void *args, int rc)
+{
+	struct ptlrpc_replay_async_args *aa = args;
+	struct obd_import *imp = req->rq_import;
+
+	ENTRY;
+	atomic_dec(&imp->imp_replay_inflight);
+
+	/*
+	 * Note: for a bulk replay (MDS-MDS replay), even if the server got
+	 * the request but the bulk transfer timed out, replay the bulk req
+	 * again
+	 */
+	if (!ptlrpc_client_replied(req) ||
+	    (req->rq_bulk &&
+	     lustre_msg_get_status(req->rq_repmsg) == -ETIMEDOUT)) {
+		DEBUG_REQ(D_ERROR, req, "request replay timed out");
+		GOTO(out, rc = -ETIMEDOUT);
+	}
+
+	if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR &&
+	    (lustre_msg_get_status(req->rq_repmsg) == -ENOTCONN ||
+	     lustre_msg_get_status(req->rq_repmsg) == -ENODEV))
+		GOTO(out, rc = lustre_msg_get_status(req->rq_repmsg));
+
+	/** VBR: check version failure */
+	if (lustre_msg_get_status(req->rq_repmsg) == -EOVERFLOW) {
+		/** replay failed due to version mismatch */
+		DEBUG_REQ(D_WARNING, req, "Version mismatch during replay");
+		spin_lock(&imp->imp_lock);
+		imp->imp_vbr_failed = 1;
+		spin_unlock(&imp->imp_lock);
+		lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status);
+	} else {
+		/** The transno had better not change over replay.
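+		 * (A reply transno of zero is tolerated here.)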
*/ + LASSERTF(lustre_msg_get_transno(req->rq_reqmsg) == + lustre_msg_get_transno(req->rq_repmsg) || + lustre_msg_get_transno(req->rq_repmsg) == 0, + "%#llx/%#llx\n", + lustre_msg_get_transno(req->rq_reqmsg), + lustre_msg_get_transno(req->rq_repmsg)); + } + + spin_lock(&imp->imp_lock); + imp->imp_last_replay_transno = lustre_msg_get_transno(req->rq_reqmsg); + spin_unlock(&imp->imp_lock); + LASSERT(imp->imp_last_replay_transno); + + /* transaction number shouldn't be bigger than the latest replayed */ + if (req->rq_transno > lustre_msg_get_transno(req->rq_reqmsg)) { + DEBUG_REQ(D_ERROR, req, + "Reported transno=%llu is bigger than replayed=%llu", + req->rq_transno, + lustre_msg_get_transno(req->rq_reqmsg)); + GOTO(out, rc = -EINVAL); + } + + DEBUG_REQ(D_HA, req, "got reply"); + + /* let the callback do fixups, possibly including in the request */ + if (req->rq_replay_cb) + req->rq_replay_cb(req); + + if (ptlrpc_client_replied(req) && + lustre_msg_get_status(req->rq_repmsg) != aa->praa_old_status) { + DEBUG_REQ(D_ERROR, req, "status %d, old was %d", + lustre_msg_get_status(req->rq_repmsg), + aa->praa_old_status); + + /* + * Note: If the replay fails for MDT-MDT recovery, let's + * abort all of the following requests in the replay + * and sending list, because MDT-MDT update requests + * are dependent on each other, see LU-7039 + */ + if (imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS) { + struct ptlrpc_request *free_req; + struct ptlrpc_request *tmp; + + spin_lock(&imp->imp_lock); + list_for_each_entry_safe(free_req, tmp, + &imp->imp_replay_list, + rq_replay_list) { + ptlrpc_free_request(free_req); + } + + list_for_each_entry_safe(free_req, tmp, + &imp->imp_committed_list, + rq_replay_list) { + ptlrpc_free_request(free_req); + } + + list_for_each_entry_safe(free_req, tmp, + &imp->imp_delayed_list, + rq_list) { + spin_lock(&free_req->rq_lock); + free_req->rq_err = 1; + free_req->rq_status = -EIO; + ptlrpc_client_wake_req(free_req); + spin_unlock(&free_req->rq_lock); + } + + list_for_each_entry_safe(free_req, tmp, + &imp->imp_sending_list, + rq_list) { + spin_lock(&free_req->rq_lock); + free_req->rq_err = 1; + free_req->rq_status = -EIO; + ptlrpc_client_wake_req(free_req); + spin_unlock(&free_req->rq_lock); + } + spin_unlock(&imp->imp_lock); + } + } else { + /* Put it back for re-replay. */ + lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status); + } + + /* + * Errors while replay can set transno to 0, but + * imp_last_replay_transno shouldn't be set to 0 anyway + */ + if (req->rq_transno == 0) + CERROR("Transno is 0 during replay!\n"); + + /* continue with recovery */ + rc = ptlrpc_import_recovery_state_machine(imp); + out: + req->rq_send_state = aa->praa_old_state; + + if (rc != 0) + /* this replay failed, so restart recovery */ + ptlrpc_connect_import(imp); + + RETURN(rc); +} + +/** + * Prepares and queues request for replay. + * Adds it to ptlrpcd queue for actual sending. + * Returns 0 on success. 
+ */ +int ptlrpc_replay_req(struct ptlrpc_request *req) +{ + struct ptlrpc_replay_async_args *aa; + + ENTRY; + + LASSERT(req->rq_import->imp_state == LUSTRE_IMP_REPLAY); + + aa = ptlrpc_req_async_args(aa, req); + memset(aa, 0, sizeof(*aa)); + + /* Prepare request to be resent with ptlrpcd */ + aa->praa_old_state = req->rq_send_state; + req->rq_send_state = LUSTRE_IMP_REPLAY; + req->rq_phase = RQ_PHASE_NEW; + req->rq_next_phase = RQ_PHASE_UNDEFINED; + if (req->rq_repmsg) + aa->praa_old_status = lustre_msg_get_status(req->rq_repmsg); + req->rq_status = 0; + req->rq_interpret_reply = ptlrpc_replay_interpret; + /* Readjust the timeout for current conditions */ + ptlrpc_at_set_req_timeout(req); + + /* Tell server net_latency to calculate how long to wait for reply. */ + lustre_msg_set_service_timeout(req->rq_reqmsg, + ptlrpc_at_get_net_latency(req)); + DEBUG_REQ(D_HA, req, "REPLAY"); + + atomic_inc(&req->rq_import->imp_replay_inflight); + spin_lock(&req->rq_lock); + req->rq_early_free_repbuf = 0; + spin_unlock(&req->rq_lock); + ptlrpc_request_addref(req); /* ptlrpcd needs a ref */ + + ptlrpcd_add_req(req); + RETURN(0); +} + +/** + * Aborts all in-flight request on import \a imp sending and delayed lists + */ +void ptlrpc_abort_inflight(struct obd_import *imp) +{ + struct ptlrpc_request *req; + ENTRY; + + /* + * Make sure that no new requests get processed for this import. + * ptlrpc_{queue,set}_wait must (and does) hold imp_lock while testing + * this flag and then putting requests on sending_list or delayed_list. + */ + assert_spin_locked(&imp->imp_lock); + + /* + * XXX locking? Maybe we should remove each request with the list + * locked? Also, how do we know if the requests on the list are + * being freed at this time? + */ + list_for_each_entry(req, &imp->imp_sending_list, rq_list) { + DEBUG_REQ(D_RPCTRACE, req, "inflight"); + + spin_lock(&req->rq_lock); + if (req->rq_import_generation < imp->imp_generation) { + req->rq_err = 1; + req->rq_status = -EIO; + ptlrpc_client_wake_req(req); + } + spin_unlock(&req->rq_lock); + } + + list_for_each_entry(req, &imp->imp_delayed_list, rq_list) { + DEBUG_REQ(D_RPCTRACE, req, "aborting waiting req"); + + spin_lock(&req->rq_lock); + if (req->rq_import_generation < imp->imp_generation) { + req->rq_err = 1; + req->rq_status = -EIO; + ptlrpc_client_wake_req(req); + } + spin_unlock(&req->rq_lock); + } + + /* + * Last chance to free reqs left on the replay list, but we + * will still leak reqs that haven't committed. + */ + if (imp->imp_replayable) + ptlrpc_free_committed(imp); + + EXIT; +} + +/** + * Abort all uncompleted requests in request set \a set + */ +void ptlrpc_abort_set(struct ptlrpc_request_set *set) +{ + struct ptlrpc_request *req; + + LASSERT(set != NULL); + + list_for_each_entry(req, &set->set_requests, rq_set_chain) { + spin_lock(&req->rq_lock); + if (req->rq_phase != RQ_PHASE_RPC) { + spin_unlock(&req->rq_lock); + continue; + } + + req->rq_err = 1; + req->rq_status = -EINTR; + ptlrpc_client_wake_req(req); + spin_unlock(&req->rq_lock); + } +} + +/** + * Initialize the XID for the node. This is common among all requests on + * this node, and only requires the property that it is monotonically + * increasing. It does not need to be sequential. Since this is also used + * as the RDMA match bits, it is important that a single client NOT have + * the same match bits for two different in-flight requests, hence we do + * NOT want to have an XID per target or similar. 
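+ *
+ * (At 1M RPC/s the epoch seconds are shifted left by 20 bits, so for
+ * now ~ 1.7e9 the initial xid is about 1.7e9 << 20 ~ 1.8e15, leaving
+ * ample headroom in the 64-bit counter.)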
+ * + * To avoid an unlikely collision between match bits after a client reboot + * (which would deliver old data into the wrong RDMA buffer) initialize + * the XID based on the current time, assuming a maximum RPC rate of 1M RPC/s. + * If the time is clearly incorrect, we instead use a 62-bit random number. + * In the worst case the random number will overflow 1M RPCs per second in + * 9133 years, or permutations thereof. + */ +#define YEAR_2004 (1ULL << 30) +void ptlrpc_init_xid(void) +{ + time64_t now = ktime_get_real_seconds(); + u64 xid; + + if (now < YEAR_2004) { + get_random_bytes(&xid, sizeof(xid)); + xid >>= 2; + xid |= (1ULL << 61); + } else { + xid = (u64)now << 20; + } + + /* Need to always be aligned to a power-of-two for mutli-bulk BRW */ + BUILD_BUG_ON((PTLRPC_BULK_OPS_COUNT & (PTLRPC_BULK_OPS_COUNT - 1)) != + 0); + xid &= PTLRPC_BULK_OPS_MASK; + atomic64_set(&ptlrpc_last_xid, xid); +} + +/** + * Increase xid and returns resulting new value to the caller. + * + * Multi-bulk BRW RPCs consume multiple XIDs for each bulk transfer, starting + * at the returned xid, up to xid + PTLRPC_BULK_OPS_COUNT - 1. The BRW RPC + * itself uses the last bulk xid needed, so the server can determine the + * the number of bulk transfers from the RPC XID and a bitmask. The starting + * xid must align to a power-of-two value. + * + * This is assumed to be true due to the initial ptlrpc_last_xid + * value also being initialized to a power-of-two value. LU-1431 + */ +__u64 ptlrpc_next_xid(void) +{ + return atomic64_add_return(PTLRPC_BULK_OPS_COUNT, &ptlrpc_last_xid); +} + +/** + * If request has a new allocated XID (new request or EINPROGRESS resend), + * use this XID as matchbits of bulk, otherwise allocate a new matchbits for + * request to ensure previous bulk fails and avoid problems with lost replies + * and therefore several transfers landing into the same buffer from different + * sending attempts. + * Also, to avoid previous reply landing to a different sending attempt. + */ +void ptlrpc_set_mbits(struct ptlrpc_request *req) +{ + int md_count = req->rq_bulk ? req->rq_bulk->bd_md_count : 1; + + /* + * Generate new matchbits for all resend requests, including + * resend replay. + */ + if (req->rq_resend) { + __u64 old_mbits = req->rq_mbits; + + /* + * First time resend on -EINPROGRESS will generate new xid, + * so we can actually use the rq_xid as rq_mbits in such case, + * however, it's bit hard to distinguish such resend with a + * 'resend for the -EINPROGRESS resend'. To make it simple, + * we opt to generate mbits for all resend cases. + */ + if (OCD_HAS_FLAG(&req->rq_import->imp_connect_data, + BULK_MBITS)) { + req->rq_mbits = ptlrpc_next_xid(); + } else { + /* + * Old version transfers rq_xid to peer as + * matchbits. + */ + spin_lock(&req->rq_import->imp_lock); + list_del_init(&req->rq_unreplied_list); + ptlrpc_assign_next_xid_nolock(req); + spin_unlock(&req->rq_import->imp_lock); + req->rq_mbits = req->rq_xid; + } + CDEBUG(D_HA, "resend with new mbits old x%llu new x%llu\n", + old_mbits, req->rq_mbits); + } else if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) { + /* Request being sent first time, use xid as matchbits. */ + if (OCD_HAS_FLAG(&req->rq_import->imp_connect_data, + BULK_MBITS) || req->rq_mbits == 0) + { + req->rq_mbits = req->rq_xid; + } else { + req->rq_mbits -= md_count - 1; + } + } else { + /* + * Replay request, xid and matchbits have already been + * correctly assigned. 
+ */ + return; + } + + /* + * For multi-bulk RPCs, rq_mbits is the last mbits needed for bulks so + * that server can infer the number of bulks that were prepared, + * see LU-1431 + */ + req->rq_mbits += md_count - 1; + + /* + * Set rq_xid as rq_mbits to indicate the final bulk for the old + * server which does not support OBD_CONNECT_BULK_MBITS. LU-6808. + * + * It's ok to directly set the rq_xid here, since this xid bump + * won't affect the request position in unreplied list. + */ + if (!OCD_HAS_FLAG(&req->rq_import->imp_connect_data, BULK_MBITS)) + req->rq_xid = req->rq_mbits; +} + +/** + * Get a glimpse at what next xid value might have been. + * Returns possible next xid. + */ +__u64 ptlrpc_sample_next_xid(void) +{ + return atomic64_read(&ptlrpc_last_xid) + PTLRPC_BULK_OPS_COUNT; +} +EXPORT_SYMBOL(ptlrpc_sample_next_xid); + +/** + * Functions for operating ptlrpc workers. + * + * A ptlrpc work is a function which will be running inside ptlrpc context. + * The callback shouldn't sleep otherwise it will block that ptlrpcd thread. + * + * 1. after a work is created, it can be used many times, that is: + * handler = ptlrpcd_alloc_work(); + * ptlrpcd_queue_work(); + * + * queue it again when necessary: + * ptlrpcd_queue_work(); + * ptlrpcd_destroy_work(); + * 2. ptlrpcd_queue_work() can be called by multiple processes meanwhile, but + * it will only be queued once in any time. Also as its name implies, it may + * have delay before it really runs by ptlrpcd thread. + */ +struct ptlrpc_work_async_args { + int (*cb)(const struct lu_env *, void *); + void *cbdata; +}; + +static void ptlrpcd_add_work_req(struct ptlrpc_request *req) +{ + /* re-initialize the req */ + req->rq_timeout = obd_timeout; + req->rq_sent = ktime_get_real_seconds(); + req->rq_deadline = req->rq_sent + req->rq_timeout; + req->rq_phase = RQ_PHASE_INTERPRET; + req->rq_next_phase = RQ_PHASE_COMPLETE; + req->rq_xid = ptlrpc_next_xid(); + req->rq_import_generation = req->rq_import->imp_generation; + + ptlrpcd_add_req(req); +} + +static int work_interpreter(const struct lu_env *env, + struct ptlrpc_request *req, void *args, int rc) +{ + struct ptlrpc_work_async_args *arg = args; + + LASSERT(ptlrpcd_check_work(req)); + LASSERT(arg->cb != NULL); + + rc = arg->cb(env, arg->cbdata); + + list_del_init(&req->rq_set_chain); + req->rq_set = NULL; + + if (atomic_dec_return(&req->rq_refcount) > 1) { + atomic_set(&req->rq_refcount, 2); + ptlrpcd_add_work_req(req); + } + return rc; +} + +static int worker_format; + +static int ptlrpcd_check_work(struct ptlrpc_request *req) +{ + return req->rq_pill.rc_fmt == (void *)&worker_format; +} + +/** + * Create a work for ptlrpc. + */ +void *ptlrpcd_alloc_work(struct obd_import *imp, + int (*cb)(const struct lu_env *, void *), void *cbdata) +{ + struct ptlrpc_request *req = NULL; + struct ptlrpc_work_async_args *args; + + ENTRY; + might_sleep(); + + if (!cb) + RETURN(ERR_PTR(-EINVAL)); + + /* copy some code from deprecated fakereq. 
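+ * A minimal usage sketch of the work API described in the comment
+ * above (my_cb and my_data are hypothetical caller-supplied names):
+ *
+ *	handler = ptlrpcd_alloc_work(imp, my_cb, my_data);
+ *	if (!IS_ERR(handler)) {
+ *		ptlrpcd_queue_work(handler);
+ *		...
+ *		ptlrpcd_destroy_work(handler);
+ *	}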
*/ + req = ptlrpc_request_cache_alloc(GFP_NOFS); + if (!req) { + CERROR("ptlrpc: run out of memory!\n"); + RETURN(ERR_PTR(-ENOMEM)); + } + + ptlrpc_cli_req_init(req); + + req->rq_send_state = LUSTRE_IMP_FULL; + req->rq_type = PTL_RPC_MSG_REQUEST; + req->rq_import = class_import_get(imp); + req->rq_interpret_reply = work_interpreter; + /* don't want reply */ + req->rq_no_delay = req->rq_no_resend = 1; + req->rq_pill.rc_fmt = (void *)&worker_format; + + args = ptlrpc_req_async_args(args, req); + args->cb = cb; + args->cbdata = cbdata; + + RETURN(req); +} +EXPORT_SYMBOL(ptlrpcd_alloc_work); + +void ptlrpcd_destroy_work(void *handler) +{ + struct ptlrpc_request *req = handler; + + if (req) + ptlrpc_req_finished(req); +} +EXPORT_SYMBOL(ptlrpcd_destroy_work); + +int ptlrpcd_queue_work(void *handler) +{ + struct ptlrpc_request *req = handler; + + /* + * Check if the req is already being queued. + * + * Here comes a trick: it lacks a way of checking if a req is being + * processed reliably in ptlrpc. Here I have to use refcount of req + * for this purpose. This is okay because the caller should use this + * req as opaque data. - Jinshan + */ + LASSERT(atomic_read(&req->rq_refcount) > 0); + if (atomic_inc_return(&req->rq_refcount) == 2) + ptlrpcd_add_work_req(req); + return 0; +} +EXPORT_SYMBOL(ptlrpcd_queue_work); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/connection.c b/drivers/staging/lustrefsx/lustre/ptlrpc/connection.c new file mode 100644 index 0000000000000..3f690ad652c0a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/connection.c @@ -0,0 +1,174 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +static struct rhashtable conn_hash; + +/* + * struct lnet_process_id may contain unassigned bytes which might not + * be zero, so we cannot just hash and compare bytes. 
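+ * Instead, the callbacks below look only at the pid and nid fields:
+ * lnet_process_id_hash() folds both into the hash seed, and
+ * lnet_process_id_cmp() compares the two fields individually.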
+ */ + +static u32 lnet_process_id_hash(const void *data, u32 len, u32 seed) +{ + const struct lnet_processid *lpi = data; + + seed = cfs_hash_32(seed ^ lpi->pid, 32); + seed = cfs_hash_32(nidhash(&lpi->nid) ^ seed, 32); + return seed; +} + +static int lnet_process_id_cmp(struct rhashtable_compare_arg *arg, + const void *obj) +{ + const struct lnet_processid *lpi = arg->key; + const struct ptlrpc_connection *con = obj; + + if (nid_same(&lpi->nid, &con->c_peer.nid) && + lpi->pid == con->c_peer.pid) + return 0; + return -ESRCH; +} + +static const struct rhashtable_params conn_hash_params = { + .key_len = 1, /* actually variable-length */ + .key_offset = offsetof(struct ptlrpc_connection, c_peer), + .head_offset = offsetof(struct ptlrpc_connection, c_hash), + .hashfn = lnet_process_id_hash, + .obj_cmpfn = lnet_process_id_cmp, +}; + +struct ptlrpc_connection * +ptlrpc_connection_get(struct lnet_process_id peer4, lnet_nid_t self, + struct obd_uuid *uuid) +{ + struct ptlrpc_connection *conn, *conn2; + struct lnet_processid peer; + ENTRY; + + peer4.nid = LNetPrimaryNID(peer4.nid); + lnet_pid4_to_pid(peer4, &peer); + conn = rhashtable_lookup_fast(&conn_hash, &peer, conn_hash_params); + if (conn) { + ptlrpc_connection_addref(conn); + GOTO(out, conn); + } + + OBD_ALLOC_PTR(conn); + if (!conn) + RETURN(NULL); + + conn->c_peer = peer; + lnet_nid4_to_nid(self, &conn->c_self); + atomic_set(&conn->c_refcount, 1); + if (uuid) + obd_str2uuid(&conn->c_remote_uuid, uuid->uuid); + + /* + * Add the newly created conn to the hash, on key collision we + * lost a racing addition and must destroy our newly allocated + * connection. The object which exists in the hash will be + * returned,otherwise NULL is returned on success. + */ +try_again: + conn2 = rhashtable_lookup_get_insert_fast(&conn_hash, &conn->c_hash, + conn_hash_params); + if (conn2) { + /* insertion failed */ + if (IS_ERR(conn2)) { + /* hash table could be resizing. */ + if (PTR_ERR(conn2) == -ENOMEM || + PTR_ERR(conn2) == -EBUSY) { + msleep(5); + goto try_again; + } + conn2 = NULL; + } + OBD_FREE_PTR(conn); + conn = conn2; + if (conn) + ptlrpc_connection_addref(conn); + } + EXIT; +out: + CDEBUG(D_INFO, "conn=%p refcount %d to %s\n", + conn, atomic_read(&conn->c_refcount), + libcfs_nidstr(&conn->c_peer.nid)); + return conn; +} + +struct ptlrpc_connection * +ptlrpc_connection_addref(struct ptlrpc_connection *conn) +{ + ENTRY; + + atomic_inc(&conn->c_refcount); + CDEBUG(D_INFO, "conn=%p refcount %d to %s\n", + conn, atomic_read(&conn->c_refcount), + libcfs_nidstr(&conn->c_peer.nid)); + + RETURN(conn); +} + +static void +conn_exit(void *vconn, void *data) +{ + struct ptlrpc_connection *conn = vconn; + + /* + * Nothing should be left. Connection user put it and + * connection also was deleted from table by this time + * so we should have 0 refs. + */ + LASSERTF(atomic_read(&conn->c_refcount) == 0, + "Busy connection with %d refs\n", + atomic_read(&conn->c_refcount)); + OBD_FREE_PTR(conn); +} + +int ptlrpc_connection_init(void) +{ + return rhashtable_init(&conn_hash, &conn_hash_params); +} + +void ptlrpc_connection_fini(void) +{ + rhashtable_free_and_destroy(&conn_hash, conn_exit, NULL); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/errno.c b/drivers/staging/lustrefsx/lustre/ptlrpc/errno.c new file mode 100644 index 0000000000000..987803be5b86f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/errno.c @@ -0,0 +1,411 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.txt + * + * GPL HEADER END + */ +/* + * Copyright (C) 2011 FUJITSU LIMITED. All rights reserved. + * + * Copyright (c) 2013, Intel Corporation. + */ + +#include +#include + +#ifdef LUSTRE_TRANSLATE_ERRNOS +#include + +/* + * The two translation tables below must define a one-to-one mapping between + * host and network errnos. + * + * EAGAIN is equal to EAGAIN on all architectures except for parisc, which + * appears irrelevant. Thus, existing references to EAGAIN are fine. + * + * EDEADLOCK is equal to EDEADLK on x86 but not on sparc, at least. A sparc + * host has no context-free way to determine if a LUSTRE_EDEADLK represents an + * EDEADLK or an EDEADLOCK. Therefore, all existing references to EDEADLOCK + * that need to be transferred on wire have been replaced with EDEADLK. + */ +static int lustre_errno_hton_mapping[] = { + [EPERM] = LUSTRE_EPERM, + [ENOENT] = LUSTRE_ENOENT, + [ESRCH] = LUSTRE_ESRCH, + [EINTR] = LUSTRE_EINTR, + [EIO] = LUSTRE_EIO, + [ENXIO] = LUSTRE_ENXIO, + [E2BIG] = LUSTRE_E2BIG, + [ENOEXEC] = LUSTRE_ENOEXEC, + [EBADF] = LUSTRE_EBADF, + [ECHILD] = LUSTRE_ECHILD, + [EAGAIN] = LUSTRE_EAGAIN, + [ENOMEM] = LUSTRE_ENOMEM, + [EACCES] = LUSTRE_EACCES, + [EFAULT] = LUSTRE_EFAULT, + [ENOTBLK] = LUSTRE_ENOTBLK, + [EBUSY] = LUSTRE_EBUSY, + [EEXIST] = LUSTRE_EEXIST, + [EXDEV] = LUSTRE_EXDEV, + [ENODEV] = LUSTRE_ENODEV, + [ENOTDIR] = LUSTRE_ENOTDIR, + [EISDIR] = LUSTRE_EISDIR, + [EINVAL] = LUSTRE_EINVAL, + [ENFILE] = LUSTRE_ENFILE, + [EMFILE] = LUSTRE_EMFILE, + [ENOTTY] = LUSTRE_ENOTTY, + [ETXTBSY] = LUSTRE_ETXTBSY, + [EFBIG] = LUSTRE_EFBIG, + [ENOSPC] = LUSTRE_ENOSPC, + [ESPIPE] = LUSTRE_ESPIPE, + [EROFS] = LUSTRE_EROFS, + [EMLINK] = LUSTRE_EMLINK, + [EPIPE] = LUSTRE_EPIPE, + [EDOM] = LUSTRE_EDOM, + [ERANGE] = LUSTRE_ERANGE, + [EDEADLK] = LUSTRE_EDEADLK, + [ENAMETOOLONG] = LUSTRE_ENAMETOOLONG, + [ENOLCK] = LUSTRE_ENOLCK, + [ENOSYS] = LUSTRE_ENOSYS, + [ENOTEMPTY] = LUSTRE_ENOTEMPTY, + [ELOOP] = LUSTRE_ELOOP, + [ENOMSG] = LUSTRE_ENOMSG, + [EIDRM] = LUSTRE_EIDRM, + [ECHRNG] = LUSTRE_ECHRNG, + [EL2NSYNC] = LUSTRE_EL2NSYNC, + [EL3HLT] = LUSTRE_EL3HLT, + [EL3RST] = LUSTRE_EL3RST, + [ELNRNG] = LUSTRE_ELNRNG, + [EUNATCH] = LUSTRE_EUNATCH, + [ENOCSI] = LUSTRE_ENOCSI, + [EL2HLT] = LUSTRE_EL2HLT, + [EBADE] = LUSTRE_EBADE, + [EBADR] = LUSTRE_EBADR, + [EXFULL] = LUSTRE_EXFULL, + [ENOANO] = LUSTRE_ENOANO, + [EBADRQC] = LUSTRE_EBADRQC, + [EBADSLT] = LUSTRE_EBADSLT, + [EBFONT] = LUSTRE_EBFONT, + [ENOSTR] = LUSTRE_ENOSTR, + [ENODATA] = LUSTRE_ENODATA, + [ETIME] = LUSTRE_ETIME, + [ENOSR] = LUSTRE_ENOSR, + [ENONET] = LUSTRE_ENONET, + [ENOPKG] = LUSTRE_ENOPKG, + [EREMOTE] = LUSTRE_EREMOTE, + [ENOLINK] = LUSTRE_ENOLINK, + [EADV] = LUSTRE_EADV, + [ESRMNT] = LUSTRE_ESRMNT, + [ECOMM] = LUSTRE_ECOMM, + [EPROTO] = LUSTRE_EPROTO, + [EMULTIHOP] = LUSTRE_EMULTIHOP, + [EDOTDOT] = LUSTRE_EDOTDOT, + [EBADMSG] = 
LUSTRE_EBADMSG, + [EOVERFLOW] = LUSTRE_EOVERFLOW, + [ENOTUNIQ] = LUSTRE_ENOTUNIQ, + [EBADFD] = LUSTRE_EBADFD, + [EREMCHG] = LUSTRE_EREMCHG, + [ELIBACC] = LUSTRE_ELIBACC, + [ELIBBAD] = LUSTRE_ELIBBAD, + [ELIBSCN] = LUSTRE_ELIBSCN, + [ELIBMAX] = LUSTRE_ELIBMAX, + [ELIBEXEC] = LUSTRE_ELIBEXEC, + [EILSEQ] = LUSTRE_EILSEQ, + [ERESTART] = LUSTRE_ERESTART, + [ESTRPIPE] = LUSTRE_ESTRPIPE, + [EUSERS] = LUSTRE_EUSERS, + [ENOTSOCK] = LUSTRE_ENOTSOCK, + [EDESTADDRREQ] = LUSTRE_EDESTADDRREQ, + [EMSGSIZE] = LUSTRE_EMSGSIZE, + [EPROTOTYPE] = LUSTRE_EPROTOTYPE, + [ENOPROTOOPT] = LUSTRE_ENOPROTOOPT, + [EPROTONOSUPPORT] = LUSTRE_EPROTONOSUPPORT, + [ESOCKTNOSUPPORT] = LUSTRE_ESOCKTNOSUPPORT, + [EOPNOTSUPP] = LUSTRE_EOPNOTSUPP, + [EPFNOSUPPORT] = LUSTRE_EPFNOSUPPORT, + [EAFNOSUPPORT] = LUSTRE_EAFNOSUPPORT, + [EADDRINUSE] = LUSTRE_EADDRINUSE, + [EADDRNOTAVAIL] = LUSTRE_EADDRNOTAVAIL, + [ENETDOWN] = LUSTRE_ENETDOWN, + [ENETUNREACH] = LUSTRE_ENETUNREACH, + [ENETRESET] = LUSTRE_ENETRESET, + [ECONNABORTED] = LUSTRE_ECONNABORTED, + [ECONNRESET] = LUSTRE_ECONNRESET, + [ENOBUFS] = LUSTRE_ENOBUFS, + [EISCONN] = LUSTRE_EISCONN, + [ENOTCONN] = LUSTRE_ENOTCONN, + [ESHUTDOWN] = LUSTRE_ESHUTDOWN, + [ETOOMANYREFS] = LUSTRE_ETOOMANYREFS, + [ETIMEDOUT] = LUSTRE_ETIMEDOUT, + [ECONNREFUSED] = LUSTRE_ECONNREFUSED, + [EHOSTDOWN] = LUSTRE_EHOSTDOWN, + [EHOSTUNREACH] = LUSTRE_EHOSTUNREACH, + [EALREADY] = LUSTRE_EALREADY, + [EINPROGRESS] = LUSTRE_EINPROGRESS, + [ESTALE] = LUSTRE_ESTALE, + [EUCLEAN] = LUSTRE_EUCLEAN, + [ENOTNAM] = LUSTRE_ENOTNAM, + [ENAVAIL] = LUSTRE_ENAVAIL, + [EISNAM] = LUSTRE_EISNAM, + [EREMOTEIO] = LUSTRE_EREMOTEIO, + [EDQUOT] = LUSTRE_EDQUOT, + [ENOMEDIUM] = LUSTRE_ENOMEDIUM, + [EMEDIUMTYPE] = LUSTRE_EMEDIUMTYPE, + [ECANCELED] = LUSTRE_ECANCELED, + [ENOKEY] = LUSTRE_ENOKEY, + [EKEYEXPIRED] = LUSTRE_EKEYEXPIRED, + [EKEYREVOKED] = LUSTRE_EKEYREVOKED, + [EKEYREJECTED] = LUSTRE_EKEYREJECTED, + [EOWNERDEAD] = LUSTRE_EOWNERDEAD, + [ENOTRECOVERABLE] = LUSTRE_ENOTRECOVERABLE, + [ERESTARTSYS] = LUSTRE_ERESTARTSYS, + [ERESTARTNOINTR] = LUSTRE_ERESTARTNOINTR, + [ERESTARTNOHAND] = LUSTRE_ERESTARTNOHAND, + [ENOIOCTLCMD] = LUSTRE_ENOIOCTLCMD, + [ERESTART_RESTARTBLOCK] = LUSTRE_ERESTART_RESTARTBLOCK, + [EBADHANDLE] = LUSTRE_EBADHANDLE, + [ENOTSYNC] = LUSTRE_ENOTSYNC, + [EBADCOOKIE] = LUSTRE_EBADCOOKIE, + [ENOTSUPP] = LUSTRE_ENOTSUPP, + [ETOOSMALL] = LUSTRE_ETOOSMALL, + [ESERVERFAULT] = LUSTRE_ESERVERFAULT, + [EBADTYPE] = LUSTRE_EBADTYPE, + [EJUKEBOX] = LUSTRE_EJUKEBOX, + [EIOCBQUEUED] = LUSTRE_EIOCBQUEUED, + + /* + * The ELDLM errors are Lustre specific errors whose ranges + * lie in the middle of the above system errors. The ELDLM + * numbers must be preserved to avoid LU-9793. 
+ */ + [ELDLM_LOCK_CHANGED] = ELDLM_LOCK_CHANGED, + [ELDLM_LOCK_ABORTED] = ELDLM_LOCK_ABORTED, + [ELDLM_LOCK_REPLACED] = ELDLM_LOCK_REPLACED, + [ELDLM_NO_LOCK_DATA] = ELDLM_NO_LOCK_DATA, + [ELDLM_LOCK_WOULDBLOCK] = ELDLM_LOCK_WOULDBLOCK, + [ELDLM_NAMESPACE_EXISTS]= ELDLM_NAMESPACE_EXISTS, + [ELDLM_BAD_NAMESPACE] = ELDLM_BAD_NAMESPACE +}; + +static int lustre_errno_ntoh_mapping[] = { + [LUSTRE_EPERM] = EPERM, + [LUSTRE_ENOENT] = ENOENT, + [LUSTRE_ESRCH] = ESRCH, + [LUSTRE_EINTR] = EINTR, + [LUSTRE_EIO] = EIO, + [LUSTRE_ENXIO] = ENXIO, + [LUSTRE_E2BIG] = E2BIG, + [LUSTRE_ENOEXEC] = ENOEXEC, + [LUSTRE_EBADF] = EBADF, + [LUSTRE_ECHILD] = ECHILD, + [LUSTRE_EAGAIN] = EAGAIN, + [LUSTRE_ENOMEM] = ENOMEM, + [LUSTRE_EACCES] = EACCES, + [LUSTRE_EFAULT] = EFAULT, + [LUSTRE_ENOTBLK] = ENOTBLK, + [LUSTRE_EBUSY] = EBUSY, + [LUSTRE_EEXIST] = EEXIST, + [LUSTRE_EXDEV] = EXDEV, + [LUSTRE_ENODEV] = ENODEV, + [LUSTRE_ENOTDIR] = ENOTDIR, + [LUSTRE_EISDIR] = EISDIR, + [LUSTRE_EINVAL] = EINVAL, + [LUSTRE_ENFILE] = ENFILE, + [LUSTRE_EMFILE] = EMFILE, + [LUSTRE_ENOTTY] = ENOTTY, + [LUSTRE_ETXTBSY] = ETXTBSY, + [LUSTRE_EFBIG] = EFBIG, + [LUSTRE_ENOSPC] = ENOSPC, + [LUSTRE_ESPIPE] = ESPIPE, + [LUSTRE_EROFS] = EROFS, + [LUSTRE_EMLINK] = EMLINK, + [LUSTRE_EPIPE] = EPIPE, + [LUSTRE_EDOM] = EDOM, + [LUSTRE_ERANGE] = ERANGE, + [LUSTRE_EDEADLK] = EDEADLK, + [LUSTRE_ENAMETOOLONG] = ENAMETOOLONG, + [LUSTRE_ENOLCK] = ENOLCK, + [LUSTRE_ENOSYS] = ENOSYS, + [LUSTRE_ENOTEMPTY] = ENOTEMPTY, + [LUSTRE_ELOOP] = ELOOP, + [LUSTRE_ENOMSG] = ENOMSG, + [LUSTRE_EIDRM] = EIDRM, + [LUSTRE_ECHRNG] = ECHRNG, + [LUSTRE_EL2NSYNC] = EL2NSYNC, + [LUSTRE_EL3HLT] = EL3HLT, + [LUSTRE_EL3RST] = EL3RST, + [LUSTRE_ELNRNG] = ELNRNG, + [LUSTRE_EUNATCH] = EUNATCH, + [LUSTRE_ENOCSI] = ENOCSI, + [LUSTRE_EL2HLT] = EL2HLT, + [LUSTRE_EBADE] = EBADE, + [LUSTRE_EBADR] = EBADR, + [LUSTRE_EXFULL] = EXFULL, + [LUSTRE_ENOANO] = ENOANO, + [LUSTRE_EBADRQC] = EBADRQC, + [LUSTRE_EBADSLT] = EBADSLT, + [LUSTRE_EBFONT] = EBFONT, + [LUSTRE_ENOSTR] = ENOSTR, + [LUSTRE_ENODATA] = ENODATA, + [LUSTRE_ETIME] = ETIME, + [LUSTRE_ENOSR] = ENOSR, + [LUSTRE_ENONET] = ENONET, + [LUSTRE_ENOPKG] = ENOPKG, + [LUSTRE_EREMOTE] = EREMOTE, + [LUSTRE_ENOLINK] = ENOLINK, + [LUSTRE_EADV] = EADV, + [LUSTRE_ESRMNT] = ESRMNT, + [LUSTRE_ECOMM] = ECOMM, + [LUSTRE_EPROTO] = EPROTO, + [LUSTRE_EMULTIHOP] = EMULTIHOP, + [LUSTRE_EDOTDOT] = EDOTDOT, + [LUSTRE_EBADMSG] = EBADMSG, + [LUSTRE_EOVERFLOW] = EOVERFLOW, + [LUSTRE_ENOTUNIQ] = ENOTUNIQ, + [LUSTRE_EBADFD] = EBADFD, + [LUSTRE_EREMCHG] = EREMCHG, + [LUSTRE_ELIBACC] = ELIBACC, + [LUSTRE_ELIBBAD] = ELIBBAD, + [LUSTRE_ELIBSCN] = ELIBSCN, + [LUSTRE_ELIBMAX] = ELIBMAX, + [LUSTRE_ELIBEXEC] = ELIBEXEC, + [LUSTRE_EILSEQ] = EILSEQ, + [LUSTRE_ERESTART] = ERESTART, + [LUSTRE_ESTRPIPE] = ESTRPIPE, + [LUSTRE_EUSERS] = EUSERS, + [LUSTRE_ENOTSOCK] = ENOTSOCK, + [LUSTRE_EDESTADDRREQ] = EDESTADDRREQ, + [LUSTRE_EMSGSIZE] = EMSGSIZE, + [LUSTRE_EPROTOTYPE] = EPROTOTYPE, + [LUSTRE_ENOPROTOOPT] = ENOPROTOOPT, + [LUSTRE_EPROTONOSUPPORT] = EPROTONOSUPPORT, + [LUSTRE_ESOCKTNOSUPPORT] = ESOCKTNOSUPPORT, + [LUSTRE_EOPNOTSUPP] = EOPNOTSUPP, + [LUSTRE_EPFNOSUPPORT] = EPFNOSUPPORT, + [LUSTRE_EAFNOSUPPORT] = EAFNOSUPPORT, + [LUSTRE_EADDRINUSE] = EADDRINUSE, + [LUSTRE_EADDRNOTAVAIL] = EADDRNOTAVAIL, + [LUSTRE_ENETDOWN] = ENETDOWN, + [LUSTRE_ENETUNREACH] = ENETUNREACH, + [LUSTRE_ENETRESET] = ENETRESET, + [LUSTRE_ECONNABORTED] = ECONNABORTED, + [LUSTRE_ECONNRESET] = ECONNRESET, + [LUSTRE_ENOBUFS] = ENOBUFS, + [LUSTRE_EISCONN] = EISCONN, + [LUSTRE_ENOTCONN] = ENOTCONN, + 
[LUSTRE_ESHUTDOWN] = ESHUTDOWN, + [LUSTRE_ETOOMANYREFS] = ETOOMANYREFS, + [LUSTRE_ETIMEDOUT] = ETIMEDOUT, + [LUSTRE_ECONNREFUSED] = ECONNREFUSED, + [LUSTRE_EHOSTDOWN] = EHOSTDOWN, + [LUSTRE_EHOSTUNREACH] = EHOSTUNREACH, + [LUSTRE_EALREADY] = EALREADY, + [LUSTRE_EINPROGRESS] = EINPROGRESS, + [LUSTRE_ESTALE] = ESTALE, + [LUSTRE_EUCLEAN] = EUCLEAN, + [LUSTRE_ENOTNAM] = ENOTNAM, + [LUSTRE_ENAVAIL] = ENAVAIL, + [LUSTRE_EISNAM] = EISNAM, + [LUSTRE_EREMOTEIO] = EREMOTEIO, + [LUSTRE_EDQUOT] = EDQUOT, + [LUSTRE_ENOMEDIUM] = ENOMEDIUM, + [LUSTRE_EMEDIUMTYPE] = EMEDIUMTYPE, + [LUSTRE_ECANCELED] = ECANCELED, + [LUSTRE_ENOKEY] = ENOKEY, + [LUSTRE_EKEYEXPIRED] = EKEYEXPIRED, + [LUSTRE_EKEYREVOKED] = EKEYREVOKED, + [LUSTRE_EKEYREJECTED] = EKEYREJECTED, + [LUSTRE_EOWNERDEAD] = EOWNERDEAD, + [LUSTRE_ENOTRECOVERABLE] = ENOTRECOVERABLE, + [LUSTRE_ERESTARTSYS] = ERESTARTSYS, + [LUSTRE_ERESTARTNOINTR] = ERESTARTNOINTR, + [LUSTRE_ERESTARTNOHAND] = ERESTARTNOHAND, + [LUSTRE_ENOIOCTLCMD] = ENOIOCTLCMD, + [LUSTRE_ERESTART_RESTARTBLOCK] = ERESTART_RESTARTBLOCK, + [LUSTRE_EBADHANDLE] = EBADHANDLE, + [LUSTRE_ENOTSYNC] = ENOTSYNC, + [LUSTRE_EBADCOOKIE] = EBADCOOKIE, + [LUSTRE_ENOTSUPP] = ENOTSUPP, + [LUSTRE_ETOOSMALL] = ETOOSMALL, + [LUSTRE_ESERVERFAULT] = ESERVERFAULT, + [LUSTRE_EBADTYPE] = EBADTYPE, + [LUSTRE_EJUKEBOX] = EJUKEBOX, + [LUSTRE_EIOCBQUEUED] = EIOCBQUEUED, + + /* + * The ELDLM errors are Lustre specific errors whose ranges + * lie in the middle of the above system errors. The ELDLM + * numbers must be preserved to avoid LU-9793. + */ + [ELDLM_LOCK_CHANGED] = ELDLM_LOCK_CHANGED, + [ELDLM_LOCK_ABORTED] = ELDLM_LOCK_ABORTED, + [ELDLM_LOCK_REPLACED] = ELDLM_LOCK_REPLACED, + [ELDLM_NO_LOCK_DATA] = ELDLM_NO_LOCK_DATA, + [ELDLM_LOCK_WOULDBLOCK] = ELDLM_LOCK_WOULDBLOCK, + [ELDLM_NAMESPACE_EXISTS] = ELDLM_NAMESPACE_EXISTS, + [ELDLM_BAD_NAMESPACE] = ELDLM_BAD_NAMESPACE +}; + +unsigned int lustre_errno_hton(unsigned int h) +{ + unsigned int n; + + if (h == 0) { + n = 0; + } else if (h < ARRAY_SIZE(lustre_errno_hton_mapping)) { + n = lustre_errno_hton_mapping[h]; + if (n == 0) + goto generic; + } else { +generic: + /* + * A generic errno is better than the unknown one that could + * mean anything to a different host. + */ + n = LUSTRE_EIO; + } + + return n; +} +EXPORT_SYMBOL(lustre_errno_hton); + +unsigned int lustre_errno_ntoh(unsigned int n) +{ + unsigned int h; + + if (n == 0) { + h = 0; + } else if (n < ARRAY_SIZE(lustre_errno_ntoh_mapping)) { + h = lustre_errno_ntoh_mapping[n]; + if (h == 0) + goto generic; + } else { +generic: + /* + * Similar to the situation in lustre_errno_hton(), an unknown + * network errno could coincide with anything. Hence, it is + * better to return a generic errno. + */ + h = EIO; + } + + return h; +} +EXPORT_SYMBOL(lustre_errno_ntoh); + +#endif /* LUSTRE_TRANSLATE_ERRNOS */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/events.c b/drivers/staging/lustrefsx/lustre/ptlrpc/events.c new file mode 100644 index 0000000000000..06e4aad174c4b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/events.c @@ -0,0 +1,655 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
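/*
 * Hedged sketch, not part of the patch: how the two translation helpers
 * defined just above are typically used at the RPC boundary.
 * pack_status()/unpack_status() are hypothetical names for illustration;
 * only lustre_errno_hton() and lustre_errno_ntoh() come from this file.
 */
static inline u32 pack_status(int rc)
{
	/* rc is 0 or a negative host errno; the wire carries the
	 * network (LUSTRE_*) encoding */
	return lustre_errno_hton((unsigned int)-rc);
}

static inline int unpack_status(u32 wire)
{
	/* an unmapped wire value degrades to EIO rather than leaking a
	 * number that may mean something else on this host */
	return -(int)lustre_errno_ntoh(wire);
}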
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include +#include +#include +#include +#include +#include "ptlrpc_internal.h" + +lnet_handler_t ptlrpc_handler; +struct percpu_ref ptlrpc_pending; + +/* + * Client's outgoing request callback + */ +void request_out_callback(struct lnet_event *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md_user_ptr; + struct ptlrpc_request *req = cbid->cbid_arg; + bool wakeup = false; + ENTRY; + + LASSERT(ev->type == LNET_EVENT_SEND || ev->type == LNET_EVENT_UNLINK); + LASSERT(ev->unlinked); + + if (unlikely(lustre_msg_get_opc(req->rq_reqmsg) == cfs_fail_val && + CFS_FAIL_CHECK_RESET(OBD_FAIL_NET_ERROR_RPC, + OBD_FAIL_OSP_PRECREATE_PAUSE | + CFS_FAIL_ONCE))) + ev->status = -ECONNABORTED; + + DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status); + + /* Do not update imp_next_ping for connection request */ + if (lustre_msg_get_opc(req->rq_reqmsg) != + req->rq_import->imp_connect_op) + ptlrpc_pinger_sending_on_import(req->rq_import); + + sptlrpc_request_out_callback(req); + + spin_lock(&req->rq_lock); + req->rq_real_sent = ktime_get_real_seconds(); + req->rq_req_unlinked = 1; + /* reply_in_callback happened before request_out_callback? */ + if (req->rq_reply_unlinked) + wakeup = true; + + if (ev->type == LNET_EVENT_UNLINK || ev->status != 0) { + /* Failed send: make it seem like the reply timed out, just + * like failing sends in client.c does currently... */ + req->rq_net_err = 1; + wakeup = true; + } + + if (wakeup) + ptlrpc_client_wake_req(req); + + spin_unlock(&req->rq_lock); + + ptlrpc_req_finished(req); + EXIT; +} + +/* + * Client's incoming reply callback + */ +void reply_in_callback(struct lnet_event *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md_user_ptr; + struct ptlrpc_request *req = cbid->cbid_arg; + ENTRY; + + DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status); + + LASSERT(ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_UNLINK); + LASSERT(ev->md_start == req->rq_repbuf); + LASSERT(ev->offset + ev->mlength <= req->rq_repbuf_len); + /* We've set LNET_MD_MANAGE_REMOTE for all outgoing requests + * for adaptive timeouts' early reply. 
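/*
 * Illustrative aside, not part of the patch: all of these callbacks rely
 * on one dispatch convention — the MD's user pointer is a struct
 * ptlrpc_cb_id embedded in the object the event concerns, so a single
 * master handler can demultiplex. A minimal sketch; "my_object" and the
 * helper names are assumptions made for the sketch.
 */
struct my_object {
	struct ptlrpc_cb_id	cbid;	/* { cbid_fn, cbid_arg } */
	/* ... object state ... */
};

static void my_object_event(struct lnet_event *ev)
{
	struct ptlrpc_cb_id *cbid = ev->md_user_ptr;
	struct my_object *obj = cbid->cbid_arg;

	/* handle the event for obj; ev->unlinked tells whether this is
	 * the final event for the MD */
}

static void my_object_arm(struct my_object *obj, struct lnet_md *md)
{
	obj->cbid.cbid_fn  = my_object_event;
	obj->cbid.cbid_arg = obj;
	md->user_ptr = &obj->cbid;	/* before LNetMDAttach()/LNetMDBind() */
}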
+ */ + LASSERT((ev->md_options & LNET_MD_MANAGE_REMOTE) != 0); + + spin_lock(&req->rq_lock); + + req->rq_receiving_reply = 0; + req->rq_early = 0; + if (ev->unlinked) + req->rq_reply_unlinked = 1; + + if (ev->status) + goto out_wake; + + if (ev->type == LNET_EVENT_UNLINK) { + LASSERT(ev->unlinked); + DEBUG_REQ(D_NET, req, "unlink"); + goto out_wake; + } + + if (ev->mlength < ev->rlength ) { + CDEBUG(D_RPCTRACE, "truncate req %p rpc %d - %d+%d\n", req, + req->rq_replen, ev->rlength, ev->offset); + req->rq_reply_truncated = 1; + req->rq_replied = 1; + req->rq_status = -EOVERFLOW; + req->rq_nob_received = ev->rlength + ev->offset; + goto out_wake; + } + + if ((ev->offset == 0) && + ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT))) { + /* Early reply */ + DEBUG_REQ(D_ADAPTTO, req, + "Early reply received, mlen=%u offset=%d replen=%d replied=%d unlinked=%d", + ev->mlength, ev->offset, + req->rq_replen, req->rq_replied, ev->unlinked); + + req->rq_early_count++; /* number received, client side */ + + /* already got the real reply or buffers are already unlinked */ + if (req->rq_replied || + req->rq_reply_unlinked == 1) + goto out_wake; + + req->rq_early = 1; + req->rq_reply_off = ev->offset; + req->rq_nob_received = ev->mlength; + /* And we're still receiving */ + req->rq_receiving_reply = 1; + } else { + /* Real reply */ + req->rq_rep_swab_mask = 0; + req->rq_replied = 1; + /* Got reply, no resend required */ + req->rq_resend = 0; + req->rq_reply_off = ev->offset; + req->rq_nob_received = ev->mlength; + /* LNetMDUnlink can't be called under the LNET_LOCK, + so we must unlink in ptlrpc_unregister_reply */ + DEBUG_REQ(D_INFO, req, + "reply in flags=%x mlen=%u offset=%d replen=%d", + lustre_msg_get_flags(req->rq_reqmsg), + ev->mlength, ev->offset, req->rq_replen); + } + + if (lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING) + req->rq_import->imp_last_reply_time = ktime_get_real_seconds(); + +out_wake: + /* NB don't unlock till after wakeup; req can disappear under us + * since we don't have our own ref */ + ptlrpc_client_wake_req(req); + spin_unlock(&req->rq_lock); + EXIT; +} + +/* + * Client's bulk has been written/read + */ +void client_bulk_callback(struct lnet_event *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md_user_ptr; + struct ptlrpc_bulk_desc *desc = cbid->cbid_arg; + struct ptlrpc_request *req; + ENTRY; + + LASSERT((ptlrpc_is_bulk_put_sink(desc->bd_type) && + ev->type == LNET_EVENT_PUT) || + (ptlrpc_is_bulk_get_source(desc->bd_type) && + ev->type == LNET_EVENT_GET) || + ev->type == LNET_EVENT_UNLINK); + LASSERT(ev->unlinked); + + if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB, CFS_FAIL_ONCE)) + ev->status = -EIO; + + if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2,CFS_FAIL_ONCE)) + ev->status = -EIO; + + CDEBUG_LIMIT((ev->status == 0) ? 
D_NET : D_ERROR,
+		     "event type %d, status %d, desc %p\n",
+		     ev->type, ev->status, desc);
+
+	spin_lock(&desc->bd_lock);
+	req = desc->bd_req;
+	LASSERT(desc->bd_refs > 0);
+	desc->bd_refs--;
+
+	if (ev->type != LNET_EVENT_UNLINK && ev->status == 0) {
+		desc->bd_nob_transferred += ev->mlength;
+		desc->bd_sender = lnet_nid_to_nid4(&ev->sender);
+	} else {
+		/* start reconnect and resend if network error hit */
+		spin_lock(&req->rq_lock);
+		req->rq_net_err = 1;
+		spin_unlock(&req->rq_lock);
+		desc->bd_failure = 1;
+	}
+
+	/* NB don't unlock till after wakeup; desc can disappear under us
+	 * otherwise */
+	if (desc->bd_refs == 0)
+		ptlrpc_client_wake_req(desc->bd_req);
+
+	spin_unlock(&desc->bd_lock);
+	EXIT;
+}
+
+/*
+ * We will have percpt request history list for ptlrpc service in upcoming
+ * patches because we don't want to be serialized by current per-service
+ * history operations. So we require that the history ID can (somehow) show
+ * arrival order without grabbing a global lock, and that users can sort
+ * entries in userspace.
+ *
+ * This is how we generate history ID for ptlrpc_request:
+ * ----------------------------------------------------
+ * |  32 bits  |  16 bits  | (16 - X)bits |  X bits  |
+ * ----------------------------------------------------
+ * |  seconds  | usec / 16 |   sequence   |  CPT id  |
+ * ----------------------------------------------------
+ *
+ * it might not be precise but should be good enough.
+ */
+
+#define REQS_CPT_BITS(svcpt)	((svcpt)->scp_service->srv_cpt_bits)
+
+#define REQS_SEC_SHIFT		32
+#define REQS_USEC_SHIFT		16
+#define REQS_SEQ_SHIFT(svcpt)	REQS_CPT_BITS(svcpt)
+
+static void ptlrpc_req_add_history(struct ptlrpc_service_part *svcpt,
+				   struct ptlrpc_request *req)
+{
+	u64 sec = req->rq_arrival_time.tv_sec;
+	u32 usec = req->rq_arrival_time.tv_nsec / NSEC_PER_USEC / 16; /* usec / 16 */
+	u64 new_seq;
+
+	/* set sequence ID for request and add it to history list,
+	 * this must be called while holding svcpt::scp_lock */
+
+	new_seq = (sec << REQS_SEC_SHIFT) |
+		  (usec << REQS_USEC_SHIFT) |
+		  (svcpt->scp_cpt < 0 ? 0 : svcpt->scp_cpt);
+
+	if (new_seq > svcpt->scp_hist_seq) {
+		/* This handles the initial case of scp_hist_seq == 0 or
+		 * we just jumped into a new time window */
+		svcpt->scp_hist_seq = new_seq;
+	} else {
+		LASSERT(REQS_SEQ_SHIFT(svcpt) < REQS_USEC_SHIFT);
+		/* NB: increase sequence number in current usec bucket,
+		 * however, it's possible that we used up all bits for
+		 * sequence and jumped into the next usec bucket (future time),
+		 * then we hope there will be fewer RPCs per bucket at some
+		 * point, and sequence will catch up again */
+		svcpt->scp_hist_seq += (1U << REQS_SEQ_SHIFT(svcpt));
+		new_seq = svcpt->scp_hist_seq;
+	}
+
+	req->rq_history_seq = new_seq;
+
+	list_add_tail(&req->rq_history_list, &svcpt->scp_hist_reqs);
+}
+
+/*
+ * Server's incoming request callback
+ */
+void request_in_callback(struct lnet_event *ev)
+{
+	struct ptlrpc_cb_id *cbid = ev->md_user_ptr;
+	struct ptlrpc_request_buffer_desc *rqbd = cbid->cbid_arg;
+	struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt;
+	struct ptlrpc_service *service = svcpt->scp_service;
+	struct ptlrpc_request *req;
+	ENTRY;
+
+	LASSERT(ev->type == LNET_EVENT_PUT ||
+		ev->type == LNET_EVENT_UNLINK);
+	LASSERT((char *)ev->md_start >= rqbd->rqbd_buffer);
+	LASSERT((char *)ev->md_start + ev->offset + ev->mlength <=
+		rqbd->rqbd_buffer + service->srv_buf_size);
+
+	CDEBUG_LIMIT((ev->status == 0) ?
D_NET : D_ERROR, + "event type %d, status %d, service %s\n", + ev->type, ev->status, service->srv_name); + + if (ev->unlinked) { + /* If this is the last request message to fit in the + * request buffer we can use the request object embedded in + * rqbd. Note that if we failed to allocate a request, + * we'd have to re-post the rqbd, which we can't do in this + * context. + */ + req = &rqbd->rqbd_req; + memset(req, 0, sizeof(*req)); + } else { + LASSERT(ev->type == LNET_EVENT_PUT); + if (ev->status != 0) /* We moaned above already... */ + return; + req = ptlrpc_request_cache_alloc(GFP_ATOMIC); + if (req == NULL) { + CERROR("Can't allocate incoming request descriptor: Dropping %s RPC from %s\n", + service->srv_name, + libcfs_idstr(&ev->initiator)); + return; + } + } + + ptlrpc_srv_req_init(req); + /* NB we ABSOLUTELY RELY on req being zeroed, so pointers are NULL, + * flags are reset and scalars are zero. We only set the message + * size to non-zero if this was a successful receive. */ + req->rq_xid = ev->match_bits; + req->rq_reqbuf = ev->md_start + ev->offset; + if (ev->type == LNET_EVENT_PUT && ev->status == 0) + req->rq_reqdata_len = ev->mlength; + ktime_get_real_ts64(&req->rq_arrival_time); + /* Multi-Rail: keep track of both initiator and source NID. */ + req->rq_peer = lnet_pid_to_pid4(&ev->initiator); + req->rq_source = lnet_pid_to_pid4(&ev->source); + req->rq_self = lnet_nid_to_nid4(&ev->target.nid); + req->rq_rqbd = rqbd; + req->rq_phase = RQ_PHASE_NEW; + if (ev->type == LNET_EVENT_PUT) + CDEBUG(D_INFO, "incoming req@%p x%llu msgsize %u\n", + req, req->rq_xid, ev->mlength); + + CDEBUG(D_RPCTRACE, "peer: %s (source: %s)\n", + libcfs_id2str(req->rq_peer), libcfs_id2str(req->rq_source)); + + spin_lock(&svcpt->scp_lock); + + ptlrpc_req_add_history(svcpt, req); + + if (ev->unlinked) { + svcpt->scp_nrqbds_posted--; + CDEBUG(D_INFO, "Buffer complete: %d buffers still posted\n", + svcpt->scp_nrqbds_posted); + + /* Normally, don't complain about 0 buffers posted; LNET won't + * drop incoming reqs since we set the portal lazy */ + if (test_req_buffer_pressure && + ev->type != LNET_EVENT_UNLINK && + svcpt->scp_nrqbds_posted == 0) + CWARN("All %s request buffers busy\n", + service->srv_name); + + /* req takes over the network's ref on rqbd */ + } else { + /* req takes a ref on rqbd */ + rqbd->rqbd_refcount++; + } + + list_add_tail(&req->rq_list, &svcpt->scp_req_incoming); + svcpt->scp_nreqs_incoming++; + + /* NB everything can disappear under us once the request + * has been queued and we unlock, so do the wake now... */ + wake_up(&svcpt->scp_waitq); + + spin_unlock(&svcpt->scp_lock); + EXIT; +} + +/* + * Server's outgoing reply callback + */ +void reply_out_callback(struct lnet_event *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md_user_ptr; + struct ptlrpc_reply_state *rs = cbid->cbid_arg; + struct ptlrpc_service_part *svcpt = rs->rs_svcpt; + bool need_schedule = false; + + ENTRY; + + LASSERT(ev->type == LNET_EVENT_SEND || + ev->type == LNET_EVENT_ACK || + ev->type == LNET_EVENT_UNLINK); + + if (!rs->rs_difficult) { + /* 'Easy' replies have no further processing so I drop the + * net's ref on 'rs' + */ + LASSERT(ev->unlinked); + ptlrpc_rs_decref(rs); + EXIT; + return; + } + + if (ev->type == LNET_EVENT_SEND) { + spin_lock(&rs->rs_lock); + rs->rs_sent = 1; + /* If transaction was committed before the SEND, and the ACK + * is lost, then we need to schedule so ptlrpc_hr can unlink + * the MD. 
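/*
 * Worked sketch, not part of the patch: decoding the 64-bit history ID
 * laid out in the diagram above ptlrpc_req_add_history(), as a userspace
 * tool might do after sorting history entries. cpt_bits plays the role of
 * REQS_SEQ_SHIFT(svcpt); the helper name is an assumption for the sketch.
 */
#include <stdint.h>

static void hist_seq_decode(uint64_t seq, unsigned int cpt_bits,
			    uint64_t *sec, unsigned int *usec16,
			    unsigned int *subseq, unsigned int *cpt)
{
	*sec    = seq >> 32;			/* REQS_SEC_SHIFT */
	*usec16 = (seq >> 16) & 0xffff;		/* usec / 16, REQS_USEC_SHIFT */
	*subseq = (seq & 0xffff) >> cpt_bits;	/* per-usec-bucket sequence */
	*cpt    = seq & ((1U << cpt_bits) - 1);	/* CPT id */
}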
+ */ + if (rs->rs_handled) + need_schedule = true; + spin_unlock(&rs->rs_lock); + } + + if (ev->unlinked || need_schedule) { + LASSERT(rs->rs_sent); + + /* Last network callback. The net's ref on 'rs' stays put + * until ptlrpc_handle_rs() is done with it + */ + spin_lock(&svcpt->scp_rep_lock); + spin_lock(&rs->rs_lock); + + rs->rs_unlinked = ev->unlinked; + if (!rs->rs_no_ack || + rs->rs_transno <= + rs->rs_export->exp_obd->obd_last_committed || + list_empty(&rs->rs_obd_list)) + ptlrpc_schedule_difficult_reply(rs); + + spin_unlock(&rs->rs_lock); + spin_unlock(&svcpt->scp_rep_lock); + } + EXIT; +} + +#ifdef HAVE_SERVER_SUPPORT +/* + * Server's bulk completion callback + */ +void server_bulk_callback(struct lnet_event *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md_user_ptr; + struct ptlrpc_bulk_desc *desc = cbid->cbid_arg; + ENTRY; + + LASSERT(ev->type == LNET_EVENT_SEND || + ev->type == LNET_EVENT_UNLINK || + (ptlrpc_is_bulk_put_source(desc->bd_type) && + ev->type == LNET_EVENT_ACK) || + (ptlrpc_is_bulk_get_sink(desc->bd_type) && + ev->type == LNET_EVENT_REPLY)); + + CDEBUG_LIMIT((ev->status == 0) ? D_NET : D_ERROR, + "event type %d, status %d, desc %p\n", + ev->type, ev->status, desc); + + spin_lock(&desc->bd_lock); + + LASSERT(desc->bd_refs > 0); + + if ((ev->type == LNET_EVENT_ACK || + ev->type == LNET_EVENT_REPLY) && + ev->status == 0) { + /* We heard back from the peer, so even if we get this + * before the SENT event (oh yes we can), we know we + * read/wrote the peer buffer and how much... */ + desc->bd_nob_transferred += ev->mlength; + desc->bd_sender = lnet_nid_to_nid4(&ev->sender); + } + + if (ev->status != 0) + desc->bd_failure = 1; + + if (ev->unlinked) { + desc->bd_refs--; + /* This is the last callback no matter what... */ + if (desc->bd_refs == 0) + wake_up(&desc->bd_waitq); + } + + spin_unlock(&desc->bd_lock); + EXIT; +} +#endif + +static void ptlrpc_master_callback(struct lnet_event *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md_user_ptr; + void (*callback)(struct lnet_event *ev) = cbid->cbid_fn; + + /* Honestly, it's best to find out early. */ + LASSERT(cbid->cbid_arg != LP_POISON); + LASSERT(callback == request_out_callback || + callback == reply_in_callback || + callback == client_bulk_callback || + callback == request_in_callback || + callback == reply_out_callback +#ifdef HAVE_SERVER_SUPPORT + || callback == server_bulk_callback +#endif + ); + + callback(ev); + if (ev->unlinked) + percpu_ref_put(&ptlrpc_pending); +} + +int ptlrpc_uuid_to_peer(struct obd_uuid *uuid, + struct lnet_process_id *peer, lnet_nid_t *self) +{ + int best_dist = 0; + __u32 best_order = 0; + int count = 0; + int rc = -ENOENT; + int dist; + __u32 order; + lnet_nid_t dst_nid; + lnet_nid_t src_nid; + + peer->pid = LNET_PID_LUSTRE; + + /* Choose the matching UUID that's closest */ + while (lustre_uuid_to_peer(uuid->uuid, &dst_nid, count++) == 0) { + if (peer->nid != LNET_NID_ANY && LNET_NIDADDR(peer->nid) == 0 && + LNET_NIDNET(dst_nid) != LNET_NIDNET(peer->nid)) + continue; + + dist = LNetDist(dst_nid, &src_nid, &order); + if (dist < 0) + continue; + + if (dist == 0) { /* local! 
use loopback LND */
+			peer->nid = *self = LNET_NID_LO_0;
+			rc = 0;
+			break;
+		}
+
+		if (rc < 0 ||
+		    dist < best_dist ||
+		    (dist == best_dist && order < best_order)) {
+			best_dist = dist;
+			best_order = order;
+
+			peer->nid = dst_nid;
+			*self = src_nid;
+			rc = 0;
+		}
+	}
+
+	CDEBUG(D_NET, "%s->%s\n", uuid->uuid, libcfs_id2str(*peer));
+	return rc;
+}
+
+static struct completion ptlrpc_done;
+
+static void ptlrpc_release(struct percpu_ref *ref)
+{
+	complete(&ptlrpc_done);
+}
+
+static void ptlrpc_ni_fini(void)
+{
+	/* Wait for the event queue to become idle since there may still be
+	 * messages in flight with pending events (i.e. the fire-and-forget
+	 * messages == client requests and "non-difficult" server
+	 * replies). */
+
+	init_completion(&ptlrpc_done);
+	percpu_ref_kill(&ptlrpc_pending);
+	wait_for_completion(&ptlrpc_done);
+
+	lnet_assert_handler_unused(ptlrpc_handler);
+	LNetNIFini();
+}
+
+lnet_pid_t ptl_get_pid(void)
+{
+	return LNET_PID_LUSTRE;
+}
+
+int ptlrpc_ni_init(void)
+{
+	int rc;
+	lnet_pid_t pid;
+
+	pid = ptl_get_pid();
+	CDEBUG(D_NET, "My pid is: %x\n", pid);
+
+	/* We're not passing any limits yet... */
+	rc = LNetNIInit(pid);
+	if (rc < 0) {
+		CDEBUG(D_NET, "ptlrpc: Can't init network interface: rc = %d\n",
+		       rc);
+		return rc;
+	}
+
+	rc = percpu_ref_init(&ptlrpc_pending, ptlrpc_release, 0, GFP_KERNEL);
+	if (rc) {
+		CERROR("ptlrpc: Can't init percpu refcount: rc = %d\n", rc);
+		return rc;
+	}
+	/* CAVEAT EMPTOR: how we process portals events is _radically_
+	 * different depending on...
+	 */
+	/* kernel LNet calls our master callback when there are new events,
+	 * because we are guaranteed to get every event via callback,
+	 * so we just set EQ size to 0 to avoid the overhead of serializing
+	 * enqueue/dequeue operations in LNet. */
+	ptlrpc_handler = ptlrpc_master_callback;
+	return 0;
+}
+
+int ptlrpc_init_portals(void)
+{
+	int rc = ptlrpc_ni_init();
+
+	if (rc != 0) {
+		CERROR("network initialisation failed\n");
+		return rc;
+	}
+	rc = ptlrpcd_addref();
+	if (rc == 0)
+		return 0;
+
+	CERROR("rpcd initialisation failed\n");
+	ptlrpc_ni_fini();
+	return rc;
+}
+
+void ptlrpc_exit_portals(void)
+{
+	ptlrpcd_decref();
+	ptlrpc_ni_fini();
+}
diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_api.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_api.h
new file mode 100644
index 0000000000000..aa481015dd2d7
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_api.h
@@ -0,0 +1,185 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Author: Eric Mei
+ */
+
+/*
+ * Somewhat simplified version of the gss api.
+ *
+ * Dug Song
+ * Andy Adamson
+ * Bruce Fields
+ * Copyright (c) 2000 The Regents of the University of Michigan
+ *
+ */
+
+#ifndef __PTLRPC_GSS_GSS_API_H_
+#define __PTLRPC_GSS_GSS_API_H_
+
+struct gss_api_mech;
+
+typedef int (*digest_hash)(
+	struct ahash_request *req, rawobj_t *hdr,
+	int msgcnt, rawobj_t *msgs,
+	int iovcnt, struct bio_vec *iovs);
+
+/* The mechanism-independent gss-api context: */
+struct gss_ctx {
+	struct gss_api_mech	*mech_type;
+	void			*internal_ctx_id;
+	digest_hash		 hash_func;
+};
+
+#define GSS_C_NO_BUFFER		((rawobj_t) 0)
+#define GSS_C_NO_CONTEXT	((struct gss_ctx *) 0)
+#define GSS_C_NULL_OID		((rawobj_t) 0)
+
+/*
+ * gss-api prototypes; note that these are somewhat simplified versions of
+ * the prototypes specified in RFC 2744.
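/*
 * Generic sketch, not part of the patch: ptlrpc_pending above implements
 * a common shutdown idiom — every in-flight message holds a percpu_ref,
 * and teardown kills the ref and waits for the release callback to fire.
 * The "my_" names are assumptions made for the sketch.
 */
#include <linux/percpu-refcount.h>
#include <linux/completion.h>

static struct percpu_ref my_pending;
static DECLARE_COMPLETION(my_idle);

static void my_release(struct percpu_ref *ref)
{
	complete(&my_idle);		/* last reference gone */
}

static int my_start(void)
{
	return percpu_ref_init(&my_pending, my_release, 0, GFP_KERNEL);
}

/* each in-flight operation brackets itself with
 * percpu_ref_get(&my_pending) / percpu_ref_put(&my_pending) */

static void my_drain(void)
{
	percpu_ref_kill(&my_pending);	/* drop the initial reference */
	wait_for_completion(&my_idle);	/* all pending events have fired */
	percpu_ref_exit(&my_pending);
}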
+ */ +__u32 lgss_import_sec_context( + rawobj_t *input_token, + struct gss_api_mech *mech, + struct gss_ctx **ctx); +__u32 lgss_copy_reverse_context( + struct gss_ctx *ctx, + struct gss_ctx **ctx_new); +__u32 lgss_inquire_context( + struct gss_ctx *ctx, + time64_t *endtime); +__u32 lgss_get_mic( + struct gss_ctx *ctx, + int msgcnt, + rawobj_t *msgs, + int iovcnt, + struct bio_vec *iovs, + rawobj_t *mic_token); +__u32 lgss_verify_mic( + struct gss_ctx *ctx, + int msgcnt, + rawobj_t *msgs, + int iovcnt, + struct bio_vec *iovs, + rawobj_t *mic_token); +__u32 lgss_wrap( + struct gss_ctx *ctx, + rawobj_t *gsshdr, + rawobj_t *msg, + int msg_buflen, + rawobj_t *out_token); +__u32 lgss_unwrap( + struct gss_ctx *ctx, + rawobj_t *gsshdr, + rawobj_t *token, + rawobj_t *out_msg); +__u32 lgss_prep_bulk( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc); +__u32 lgss_wrap_bulk( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, + int adj_nob); +__u32 lgss_unwrap_bulk( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, + int adj_nob); +__u32 lgss_delete_sec_context( + struct gss_ctx **ctx); +int lgss_display( + struct gss_ctx *ctx, + char *buf, + int bufsize); + +struct subflavor_desc { + __u32 sf_subflavor; + __u32 sf_qop; + __u32 sf_service; + char *sf_name; +}; + +/* Each mechanism is described by the following struct: */ +struct gss_api_mech { + struct list_head gm_list; + struct module *gm_owner; + char *gm_name; + rawobj_t gm_oid; + atomic_t gm_count; + struct gss_api_ops *gm_ops; + int gm_sf_num; + struct subflavor_desc *gm_sfs; +}; + +/* and must provide the following operations: */ +struct gss_api_ops { + __u32 (*gss_import_sec_context)( + rawobj_t *input_token, + struct gss_ctx *ctx); + __u32 (*gss_copy_reverse_context)( + struct gss_ctx *ctx, + struct gss_ctx *ctx_new); + __u32 (*gss_inquire_context)( + struct gss_ctx *ctx, + time64_t *endtime); + __u32 (*gss_get_mic)( + struct gss_ctx *ctx, + int msgcnt, + rawobj_t *msgs, + int iovcnt, + struct bio_vec *iovs, + rawobj_t *mic_token); + __u32 (*gss_verify_mic)( + struct gss_ctx *ctx, + int msgcnt, + rawobj_t *msgs, + int iovcnt, + struct bio_vec *iovs, + rawobj_t *mic_token); + __u32 (*gss_wrap)( + struct gss_ctx *ctx, + rawobj_t *gsshdr, + rawobj_t *msg, + int msg_buflen, + rawobj_t *out_token); + __u32 (*gss_unwrap)( + struct gss_ctx *ctx, + rawobj_t *gsshdr, + rawobj_t *token, + rawobj_t *out_msg); + __u32 (*gss_prep_bulk)( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc); + __u32 (*gss_wrap_bulk)( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, + int adj_nob); + __u32 (*gss_unwrap_bulk)( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, + int adj_nob); + void (*gss_delete_sec_context)( + void *ctx); + int (*gss_display)( + struct gss_ctx *ctx, + char *buf, + int bufsize); +}; + +int lgss_mech_register(struct gss_api_mech *mech); +void lgss_mech_unregister(struct gss_api_mech *mech); + +struct gss_api_mech * lgss_OID_to_mech(rawobj_t *oid); +struct gss_api_mech * lgss_name_to_mech(char *name); +struct gss_api_mech * lgss_subflavor_to_mech(__u32 subflavor); + +struct gss_api_mech * lgss_mech_get(struct gss_api_mech *mech); +void lgss_mech_put(struct gss_api_mech *mech); + +#endif /* __PTLRPC_GSS_GSS_API_H_ */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_asn1.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_asn1.h new file mode 100644 index 0000000000000..1f535485bd0f3 --- /dev/null +++ 
b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_asn1.h @@ -0,0 +1,84 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Author: Eric Mei + */ + +/* + * minimal asn1 for generic encoding/decoding of gss tokens + * + * Adapted from MIT Kerberos 5-1.2.1 lib/include/krb5.h, + * lib/gssapi/krb5/gssapiP_krb5.h, and others + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + */ + +/* + * Copyright 1995 by the Massachusetts Institute of Technology. + * All Rights Reserved. + * + * Export of this software from the United States of America may + * require a specific license from the United States Government. + * It is the responsibility of any person or organization contemplating + * export to obtain such a license before exporting. + * + * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and + * distribute this software and its documentation for any purpose and + * without fee is hereby granted, provided that the above copyright + * notice appear in all copies and that both that copyright notice and + * this permission notice appear in supporting documentation, and that + * the name of M.I.T. not be used in advertising or publicity pertaining + * to distribution of the software without specific, written prior + * permission. Furthermore if you modify this software you must label + * your software as modified software and not distribute it in such a + * fashion that it might be confused with the original M.I.T. software. + * M.I.T. makes no representations about the suitability of + * this software for any purpose. It is provided "as is" without express + * or implied warranty. + * + */ + +#define SIZEOF_INT 4 + +/* from gssapi_err_generic.h */ +#define G_BAD_SERVICE_NAME (-2045022976L) +#define G_BAD_STRING_UID (-2045022975L) +#define G_NOUSER (-2045022974L) +#define G_VALIDATE_FAILED (-2045022973L) +#define G_BUFFER_ALLOC (-2045022972L) +#define G_BAD_MSG_CTX (-2045022971L) +#define G_WRONG_SIZE (-2045022970L) +#define G_BAD_USAGE (-2045022969L) +#define G_UNKNOWN_QOP (-2045022968L) +#define G_NO_HOSTNAME (-2045022967L) +#define G_BAD_HOSTNAME (-2045022966L) +#define G_WRONG_MECH (-2045022965L) +#define G_BAD_TOK_HEADER (-2045022964L) +#define G_BAD_DIRECTION (-2045022963L) +#define G_TOK_TRUNC (-2045022962L) +#define G_REFLECT (-2045022961L) +#define G_WRONG_TOKID (-2045022960L) + +#define g_OID_equal(o1,o2) \ + (((o1)->len == (o2)->len) && \ + (memcmp((o1)->data,(o2)->data,(int) (o1)->len) == 0)) + +__u32 g_verify_token_header(rawobj_t *mech, + int *body_size, + unsigned char **buf_in, + int toksize); + +__u32 g_get_mech_oid(rawobj_t *mech, + rawobj_t *in_buf); + +int g_token_size(rawobj_t *mech, + unsigned int body_size); + +void g_make_token_header(rawobj_t *mech, + int body_size, + unsigned char **buf); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_bulk.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_bulk.c new file mode 100644 index 0000000000000..466b868c44068 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_bulk.c @@ -0,0 +1,516 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
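/*
 * Hedged sketch, not part of the patch: how the framing helpers declared
 * in gss_asn1.h above fit together when emitting a token —
 * g_token_size() sizes the buffer, g_make_token_header() writes the
 * framing (including the mech OID), and the mechanism-specific body
 * follows. emit_token() is an assumption for the sketch and relies on
 * the usual lustre rawobj_t ({ len, data }) layout.
 */
static int emit_token(rawobj_t *mech_oid, rawobj_t *body, rawobj_t *out)
{
	unsigned char *p;

	out->len = g_token_size(mech_oid, body->len);
	OBD_ALLOC_LARGE(out->data, out->len);
	if (!out->data)
		return -ENOMEM;

	p = out->data;
	/* writes the header and advances p past it */
	g_make_token_header(mech_oid, body->len, &p);
	memcpy(p, body->data, body->len);
	return 0;
}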
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ptlrpc/gss/gss_bulk.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +int gss_cli_ctx_wrap_bulk(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct gss_cli_ctx *gctx; + struct lustre_msg *msg; + struct ptlrpc_bulk_sec_desc *bsd; + rawobj_t token; + __u32 maj; + int offset; + int rc; + ENTRY; + + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_bulk_read || req->rq_bulk_write); + + gctx = container_of(ctx, struct gss_cli_ctx, gc_base); + LASSERT(gctx->gc_mechctx); + + switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { + case SPTLRPC_SVC_NULL: + LASSERT(req->rq_reqbuf->lm_bufcount >= 3); + msg = req->rq_reqbuf; + offset = msg->lm_bufcount - 1; + break; + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + LASSERT(req->rq_reqbuf->lm_bufcount >= 4); + msg = req->rq_reqbuf; + offset = msg->lm_bufcount - 2; + break; + case SPTLRPC_SVC_PRIV: + LASSERT(req->rq_clrbuf->lm_bufcount >= 2); + msg = req->rq_clrbuf; + offset = msg->lm_bufcount - 1; + break; + default: + LBUG(); + } + + bsd = lustre_msg_buf(msg, offset, sizeof(*bsd)); + bsd->bsd_version = 0; + bsd->bsd_flags = 0; + bsd->bsd_type = SPTLRPC_BULK_DEFAULT; + bsd->bsd_svc = SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc); + + if (bsd->bsd_svc == SPTLRPC_BULK_SVC_NULL) + RETURN(0); + + LASSERT(bsd->bsd_svc == SPTLRPC_BULK_SVC_INTG || + bsd->bsd_svc == SPTLRPC_BULK_SVC_PRIV); + + if (req->rq_bulk_read) { + /* + * bulk read: prepare receiving pages only for privacy mode. + */ + if (bsd->bsd_svc == SPTLRPC_BULK_SVC_PRIV) + return gss_cli_prep_bulk(req, desc); + } else { + /* + * bulk write: sign or encrypt bulk pages. 
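/*
 * Illustrative aside, not part of the patch: the switch in
 * gss_cli_ctx_wrap_bulk() above pins down where the bulk security
 * descriptor lives for each service flavor. The same rule rewritten as a
 * lookup helper, purely for illustration:
 */
static struct ptlrpc_bulk_sec_desc *
bulk_sec_desc_of(struct ptlrpc_request *req, struct lustre_msg **pmsg,
		 int *poff)
{
	struct lustre_msg *msg;
	int off;

	switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) {
	case SPTLRPC_SVC_NULL:		/* bsd is the last segment */
		msg = req->rq_reqbuf;
		off = msg->lm_bufcount - 1;
		break;
	case SPTLRPC_SVC_AUTH:
	case SPTLRPC_SVC_INTG:		/* signature last, bsd just before */
		msg = req->rq_reqbuf;
		off = msg->lm_bufcount - 2;
		break;
	case SPTLRPC_SVC_PRIV:		/* bsd lives in the clear buffer */
		msg = req->rq_clrbuf;
		off = msg->lm_bufcount - 1;
		break;
	default:
		return NULL;
	}

	*pmsg = msg;
	*poff = off;
	return lustre_msg_buf(msg, off, sizeof(struct ptlrpc_bulk_sec_desc));
}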
+ */ + bsd->bsd_nob = desc->bd_nob; + + if (bsd->bsd_svc == SPTLRPC_BULK_SVC_INTG) { + /* integrity mode */ + token.data = bsd->bsd_data; + token.len = lustre_msg_buflen(msg, offset) - + sizeof(*bsd); + + maj = lgss_get_mic(gctx->gc_mechctx, 0, NULL, + desc->bd_iov_count, + desc->bd_vec, + &token); + if (maj != GSS_S_COMPLETE) { + CWARN("failed to sign bulk data: %x\n", maj); + RETURN(-EACCES); + } + } else { + /* privacy mode */ + if (desc->bd_iov_count == 0) + RETURN(0); + + rc = sptlrpc_enc_pool_get_pages(desc); + if (rc) { + CERROR("bulk write: failed to allocate " + "encryption pages: %d\n", rc); + RETURN(rc); + } + + token.data = bsd->bsd_data; + token.len = lustre_msg_buflen(msg, offset) - + sizeof(*bsd); + + maj = lgss_wrap_bulk(gctx->gc_mechctx, desc, &token, 0); + if (maj != GSS_S_COMPLETE) { + CWARN("fail to encrypt bulk data: %x\n", maj); + RETURN(-EACCES); + } + } + } + + RETURN(0); +} + +int gss_cli_ctx_unwrap_bulk(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct gss_cli_ctx *gctx; + struct lustre_msg *rmsg, *vmsg; + struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; + rawobj_t token; + __u32 maj; + int roff, voff; + ENTRY; + + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_bulk_read || req->rq_bulk_write); + + switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { + case SPTLRPC_SVC_NULL: + vmsg = req->rq_repdata; + LASSERT(vmsg != NULL && vmsg->lm_bufcount >= 3); + voff = vmsg->lm_bufcount - 1; + + rmsg = req->rq_reqbuf; + LASSERT(rmsg != NULL && rmsg->lm_bufcount >= 3); + roff = rmsg->lm_bufcount - 1; /* last segment */ + break; + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + vmsg = req->rq_repdata; + LASSERT(vmsg != NULL && vmsg->lm_bufcount >= 4); + voff = vmsg->lm_bufcount - 2; + + rmsg = req->rq_reqbuf; + LASSERT(rmsg != NULL && rmsg->lm_bufcount >= 4); + roff = rmsg->lm_bufcount - 2; /* second last segment */ + break; + case SPTLRPC_SVC_PRIV: + vmsg = req->rq_repdata; + LASSERT(vmsg != NULL && vmsg->lm_bufcount >= 2); + voff = vmsg->lm_bufcount - 1; + + rmsg = req->rq_clrbuf; + LASSERT(rmsg != NULL && rmsg->lm_bufcount >= 2); + roff = rmsg->lm_bufcount - 1; /* last segment */ + break; + default: + LBUG(); + } + + bsdr = lustre_msg_buf(rmsg, roff, sizeof(*bsdr)); + bsdv = lustre_msg_buf(vmsg, voff, sizeof(*bsdv)); + LASSERT(bsdr && bsdv); + + if (bsdr->bsd_version != bsdv->bsd_version || + bsdr->bsd_type != bsdv->bsd_type || + bsdr->bsd_svc != bsdv->bsd_svc) { + CERROR("bulk security descriptor mismatch: " + "(%u,%u,%u) != (%u,%u,%u)\n", + bsdr->bsd_version, bsdr->bsd_type, bsdr->bsd_svc, + bsdv->bsd_version, bsdv->bsd_type, bsdv->bsd_svc); + RETURN(-EPROTO); + } + + LASSERT(bsdv->bsd_svc == SPTLRPC_BULK_SVC_NULL || + bsdv->bsd_svc == SPTLRPC_BULK_SVC_INTG || + bsdv->bsd_svc == SPTLRPC_BULK_SVC_PRIV); + + /* + * in privacy mode if return success, make sure bd_nob_transferred + * is the actual size of the clear text, otherwise upper layer + * may be surprised. + */ + if (req->rq_bulk_write) { + if (bsdv->bsd_flags & BSD_FL_ERR) { + CERROR("server reported bulk i/o failure\n"); + RETURN(-EIO); + } + + if (bsdv->bsd_svc == SPTLRPC_BULK_SVC_PRIV) + desc->bd_nob_transferred = desc->bd_nob; + } else { + /* + * bulk read, upon return success, bd_nob_transferred is + * the size of plain text actually received. 
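/*
 * Standalone sketch, not part of the patch: the integrity branch just
 * below trims the scatter vector so the summed bv_len never exceeds the
 * bytes actually transferred. The same fix-up as a helper; the function
 * name is an assumption for the sketch.
 */
#include <linux/bvec.h>

static void clamp_bvec_to_transferred(struct bio_vec *vec, int count,
				      unsigned int transferred)
{
	unsigned int nob = 0;
	int i;

	for (i = 0; i < count; i++) {
		if (vec[i].bv_len + nob > transferred)
			vec[i].bv_len = transferred - nob;
		nob += vec[i].bv_len;
	}
}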
+ */ + gctx = container_of(ctx, struct gss_cli_ctx, gc_base); + LASSERT(gctx->gc_mechctx); + + if (bsdv->bsd_svc == SPTLRPC_BULK_SVC_INTG) { + int i, nob; + + /* fix the actual data size */ + for (i = 0, nob = 0; i < desc->bd_iov_count; i++) { + if (desc->bd_vec[i].bv_len + nob > + desc->bd_nob_transferred) { + desc->bd_vec[i].bv_len = + desc->bd_nob_transferred - nob; + } + nob += desc->bd_vec[i].bv_len; + } + + token.data = bsdv->bsd_data; + token.len = lustre_msg_buflen(vmsg, voff) - + sizeof(*bsdv); + + maj = lgss_verify_mic(gctx->gc_mechctx, 0, NULL, + desc->bd_iov_count, + desc->bd_vec, + &token); + if (maj != GSS_S_COMPLETE) { + CERROR("failed to verify bulk read: %x\n", maj); + RETURN(-EACCES); + } + } else if (bsdv->bsd_svc == SPTLRPC_BULK_SVC_PRIV) { + desc->bd_nob = bsdv->bsd_nob; + if (desc->bd_nob == 0) + RETURN(0); + + token.data = bsdv->bsd_data; + token.len = lustre_msg_buflen(vmsg, voff) - + sizeof(*bsdr); + + maj = lgss_unwrap_bulk(gctx->gc_mechctx, desc, + &token, 1); + if (maj != GSS_S_COMPLETE) { + CERROR("failed to decrypt bulk read: %x\n", + maj); + RETURN(-EACCES); + } + + desc->bd_nob_transferred = desc->bd_nob; + } + } + + RETURN(0); +} + +static int gss_prep_bulk(struct ptlrpc_bulk_desc *desc, + struct gss_ctx *mechctx) +{ + int rc; + + if (desc->bd_iov_count == 0) + return 0; + + rc = sptlrpc_enc_pool_get_pages(desc); + if (rc) + return rc; + + if (lgss_prep_bulk(mechctx, desc) != GSS_S_COMPLETE) + return -EACCES; + + return 0; +} + +int gss_cli_prep_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + int rc; + ENTRY; + + LASSERT(req->rq_cli_ctx); + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_bulk_read); + + if (SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc) != SPTLRPC_BULK_SVC_PRIV) + RETURN(0); + + rc = gss_prep_bulk(desc, ctx2gctx(req->rq_cli_ctx)->gc_mechctx); + if (rc) + CERROR("bulk read: failed to prepare encryption " + "pages: %d\n", rc); + + RETURN(rc); +} + +int gss_svc_prep_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct gss_svc_reqctx *grctx; + struct ptlrpc_bulk_sec_desc *bsd; + int rc; + ENTRY; + + LASSERT(req->rq_svc_ctx); + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_bulk_write); + + grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + LASSERT(grctx->src_reqbsd); + LASSERT(grctx->src_repbsd); + LASSERT(grctx->src_ctx); + LASSERT(grctx->src_ctx->gsc_mechctx); + + bsd = grctx->src_reqbsd; + if (bsd->bsd_svc != SPTLRPC_BULK_SVC_PRIV) + RETURN(0); + + rc = gss_prep_bulk(desc, grctx->src_ctx->gsc_mechctx); + if (rc) + CERROR("bulk write: failed to prepare encryption " + "pages: %d\n", rc); + + RETURN(rc); +} + +int gss_svc_unwrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct gss_svc_reqctx *grctx; + struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; + rawobj_t token; + __u32 maj; + ENTRY; + + LASSERT(req->rq_svc_ctx); + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_bulk_write); + + grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + + LASSERT(grctx->src_reqbsd); + LASSERT(grctx->src_repbsd); + LASSERT(grctx->src_ctx); + LASSERT(grctx->src_ctx->gsc_mechctx); + + bsdr = grctx->src_reqbsd; + bsdv = grctx->src_repbsd; + + /* bsdr has been sanity checked during unpacking */ + bsdv->bsd_version = 0; + bsdv->bsd_type = SPTLRPC_BULK_DEFAULT; + bsdv->bsd_svc = bsdr->bsd_svc; + bsdv->bsd_flags = 0; + + switch (bsdv->bsd_svc) { + case SPTLRPC_BULK_SVC_INTG: + token.data = bsdr->bsd_data; + token.len = grctx->src_reqbsd_size - sizeof(*bsdr); + + maj = lgss_verify_mic(grctx->src_ctx->gsc_mechctx, 0, 
NULL, + desc->bd_iov_count, + desc->bd_vec, &token); + if (maj != GSS_S_COMPLETE) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("failed to verify bulk signature: %x\n", maj); + RETURN(-EACCES); + } + break; + case SPTLRPC_BULK_SVC_PRIV: + if (bsdr->bsd_nob != desc->bd_nob) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("prepared nob %d doesn't match the actual " + "nob %d\n", desc->bd_nob, bsdr->bsd_nob); + RETURN(-EPROTO); + } + + if (desc->bd_iov_count == 0) { + LASSERT(desc->bd_nob == 0); + break; + } + + token.data = bsdr->bsd_data; + token.len = grctx->src_reqbsd_size - sizeof(*bsdr); + + maj = lgss_unwrap_bulk(grctx->src_ctx->gsc_mechctx, + desc, &token, 0); + if (maj != GSS_S_COMPLETE) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("failed decrypt bulk data: %x\n", maj); + RETURN(-EACCES); + } + + /* mimic gss_cli_ctx_unwrap_bulk */ + desc->bd_nob_transferred = desc->bd_nob; + + break; + } + + RETURN(0); +} + +int gss_svc_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct gss_svc_reqctx *grctx; + struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; + rawobj_t token; + __u32 maj; + int rc; + ENTRY; + + LASSERT(req->rq_svc_ctx); + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_bulk_read); + + grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + + LASSERT(grctx->src_reqbsd); + LASSERT(grctx->src_repbsd); + LASSERT(grctx->src_ctx); + LASSERT(grctx->src_ctx->gsc_mechctx); + + bsdr = grctx->src_reqbsd; + bsdv = grctx->src_repbsd; + + /* bsdr has been sanity checked during unpacking */ + bsdv->bsd_version = 0; + bsdv->bsd_type = SPTLRPC_BULK_DEFAULT; + bsdv->bsd_svc = bsdr->bsd_svc; + bsdv->bsd_flags = 0; + + switch (bsdv->bsd_svc) { + case SPTLRPC_BULK_SVC_INTG: + token.data = bsdv->bsd_data; + token.len = grctx->src_repbsd_size - sizeof(*bsdv); + + maj = lgss_get_mic(grctx->src_ctx->gsc_mechctx, 0, NULL, + desc->bd_iov_count, + desc->bd_vec, &token); + if (maj != GSS_S_COMPLETE) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("failed to sign bulk data: %x\n", maj); + RETURN(-EACCES); + } + break; + case SPTLRPC_BULK_SVC_PRIV: + bsdv->bsd_nob = desc->bd_nob; + + if (desc->bd_iov_count == 0) { + LASSERT(desc->bd_nob == 0); + break; + } + + rc = sptlrpc_enc_pool_get_pages(desc); + if (rc) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("bulk read: failed to allocate encryption " + "pages: %d\n", rc); + RETURN(rc); + } + + token.data = bsdv->bsd_data; + token.len = grctx->src_repbsd_size - sizeof(*bsdv); + + maj = lgss_wrap_bulk(grctx->src_ctx->gsc_mechctx, + desc, &token, 1); + if (maj != GSS_S_COMPLETE) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("failed to encrypt bulk data: %x\n", maj); + RETURN(-EACCES); + } + break; + } + + RETURN(0); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_cli_upcall.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_cli_upcall.c new file mode 100644 index 0000000000000..0b4bfba0a0ac6 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_cli_upcall.c @@ -0,0 +1,429 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ptlrpc/gss/gss_cli_upcall.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +/********************************************** + * gss context init/fini helper * + **********************************************/ + +static +int ctx_init_pack_request(struct obd_import *imp, + struct ptlrpc_request *req, + int lustre_srv, + uid_t uid, gid_t gid, + long token_size, + char __user *token) +{ + struct lustre_msg *msg = req->rq_reqbuf; + struct gss_sec *gsec; + struct gss_header *ghdr; + struct ptlrpc_user_desc *pud; + __u32 *p, size, offset = 2; + rawobj_t obj; + + LASSERT(msg->lm_bufcount <= 4); + LASSERT(req->rq_cli_ctx); + LASSERT(req->rq_cli_ctx->cc_sec); + + /* gss hdr */ + ghdr = lustre_msg_buf(msg, 0, sizeof(*ghdr)); + ghdr->gh_version = PTLRPC_GSS_VERSION; + ghdr->gh_sp = (__u8) imp->imp_sec->ps_part; + ghdr->gh_flags = 0; + ghdr->gh_proc = PTLRPC_GSS_PROC_INIT; + ghdr->gh_seq = 0; + ghdr->gh_svc = SPTLRPC_SVC_NULL; + ghdr->gh_handle.len = 0; + + /* fix the user desc */ + if (req->rq_pack_udesc) { + ghdr->gh_flags |= LUSTRE_GSS_PACK_USER; + + pud = lustre_msg_buf(msg, offset, sizeof(*pud)); + LASSERT(pud); + pud->pud_uid = pud->pud_fsuid = uid; + pud->pud_gid = pud->pud_fsgid = gid; + pud->pud_cap = 0; + pud->pud_ngroups = 0; + offset++; + } + + /* new clients are expected to set KCSUM flag */ + ghdr->gh_flags |= LUSTRE_GSS_PACK_KCSUM; + + /* security payload */ + p = lustre_msg_buf(msg, offset, 0); + size = msg->lm_buflens[offset]; + LASSERT(p); + + /* 1. lustre svc type */ + LASSERT(size > 4); + *p++ = cpu_to_le32(lustre_srv); + size -= 4; + + /* 2. target uuid */ + obj.len = strlen(imp->imp_obd->u.cli.cl_target_uuid.uuid) + 1; + obj.data = imp->imp_obd->u.cli.cl_target_uuid.uuid; + if (rawobj_serialize(&obj, &p, &size)) + LBUG(); + + /* 3. reverse context handle. actually only needed by root user, + * but we send it anyway. */ + gsec = sec2gsec(req->rq_cli_ctx->cc_sec); + obj.len = sizeof(gsec->gs_rvs_hdl); + obj.data = (__u8 *) &gsec->gs_rvs_hdl; + if (rawobj_serialize(&obj, &p, &size)) + LBUG(); + + /* 4. 
now the token */
+	LASSERT(size >= (sizeof(__u32) + token_size));
+	*p++ = cpu_to_le32(((__u32) token_size));
+	if (copy_from_user(p, token, token_size)) {
+		CERROR("can't copy token\n");
+		return -EFAULT;
+	}
+	size -= sizeof(__u32) + round_up(token_size, 4);
+
+	req->rq_reqdata_len = lustre_shrink_msg(req->rq_reqbuf, offset,
+						msg->lm_buflens[offset] - size, 0);
+	return 0;
+}
+
+static
+int ctx_init_parse_reply(struct lustre_msg *msg, int swabbed,
+			 char __user *outbuf, long outlen)
+{
+	struct gss_rep_header *ghdr;
+	__u32 obj_len, round_len;
+	__u32 status, effective = 0;
+
+	if (msg->lm_bufcount != 3) {
+		CERROR("unexpected bufcount %u\n", msg->lm_bufcount);
+		return -EPROTO;
+	}
+
+	ghdr = (struct gss_rep_header *) gss_swab_header(msg, 0, swabbed);
+	if (ghdr == NULL) {
+		CERROR("unable to extract gss reply header\n");
+		return -EPROTO;
+	}
+
+	if (ghdr->gh_version != PTLRPC_GSS_VERSION) {
+		CERROR("invalid gss version %u\n", ghdr->gh_version);
+		return -EPROTO;
+	}
+
+	if (outlen < (4 + 2) * 4 + round_up(ghdr->gh_handle.len, 4) +
+		     round_up(msg->lm_buflens[2], 4)) {
+		CERROR("output buffer size %ld too small\n", outlen);
+		return -EFAULT;
+	}
+
+	status = 0;
+	effective = 0;
+
+	if (copy_to_user(outbuf, &status, 4))
+		return -EFAULT;
+	outbuf += 4;
+	if (copy_to_user(outbuf, &ghdr->gh_major, 4))
+		return -EFAULT;
+	outbuf += 4;
+	if (copy_to_user(outbuf, &ghdr->gh_minor, 4))
+		return -EFAULT;
+	outbuf += 4;
+	if (copy_to_user(outbuf, &ghdr->gh_seqwin, 4))
+		return -EFAULT;
+	outbuf += 4;
+	effective += 4 * 4;
+
+	/* handle */
+	obj_len = ghdr->gh_handle.len;
+	round_len = (obj_len + 3) & ~3;
+	if (copy_to_user(outbuf, &obj_len, 4))
+		return -EFAULT;
+	outbuf += 4;
+	if (copy_to_user(outbuf, (char *) ghdr->gh_handle.data, round_len))
+		return -EFAULT;
+	outbuf += round_len;
+	effective += 4 + round_len;
+
+	/* out token */
+	obj_len = msg->lm_buflens[2];
+	round_len = (obj_len + 3) & ~3;
+	if (copy_to_user(outbuf, &obj_len, 4))
+		return -EFAULT;
+	outbuf += 4;
+	if (copy_to_user(outbuf, lustre_msg_buf(msg, 2, 0), round_len))
+		return -EFAULT;
+	outbuf += round_len;
+	effective += 4 + round_len;
+
+	return effective;
+}
+
+int gss_do_ctx_init_rpc(char __user *buffer, unsigned long count)
+{
+	struct obd_import *imp, *imp0;
+	struct ptlrpc_request *req;
+	struct lgssd_ioctl_param param;
+	struct obd_device *obd;
+	char obdname[64];
+	long lsize;
+	int rc;
+
+	if (count != sizeof(param)) {
+		CERROR("ioctl size %lu, expect %lu, please check lgss_keyring version\n",
+		       count, (unsigned long) sizeof(param));
+		RETURN(-EINVAL);
+	}
+	if (copy_from_user(&param, buffer, sizeof(param))) {
+		CERROR("failed to copy data from lgssd\n");
+		RETURN(-EFAULT);
+	}
+
+	if (param.version != GSSD_INTERFACE_VERSION) {
+		CERROR("gssd interface version %d (expect %d)\n",
+		       param.version, GSSD_INTERFACE_VERSION);
+		RETURN(-EINVAL);
+	}
+
+	/* take name */
+	if (strncpy_from_user(obdname, (const char __user *)param.uuid,
+			      sizeof(obdname)) <= 0) {
+		CERROR("Invalid obdname pointer\n");
+		RETURN(-EFAULT);
+	}
+
+	obd = class_name2obd(obdname);
+	if (!obd) {
+		CERROR("no such obd %s\n", obdname);
+		RETURN(-EINVAL);
+	}
+
+	if (unlikely(!obd->obd_set_up)) {
+		CERROR("obd %s not setup\n", obdname);
+		RETURN(-EINVAL);
+	}
+
+	spin_lock(&obd->obd_dev_lock);
+	if (obd->obd_stopping) {
+		CERROR("obd %s has stopped\n", obdname);
+		spin_unlock(&obd->obd_dev_lock);
+		RETURN(-EINVAL);
+	}
+
+	if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) &&
+	    strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) &&
strcmp(obd->obd_type->typ_name, LUSTRE_MGC_NAME) &&
+	    strcmp(obd->obd_type->typ_name, LUSTRE_LWP_NAME) &&
+	    strcmp(obd->obd_type->typ_name, LUSTRE_OSP_NAME)) {
+		CERROR("obd %s is not a client device\n", obdname);
+		spin_unlock(&obd->obd_dev_lock);
+		RETURN(-EINVAL);
+	}
+	spin_unlock(&obd->obd_dev_lock);
+
+	with_imp_locked(obd, imp0, rc)
+		imp = class_import_get(imp0);
+	if (rc) {
+		CERROR("obd %s: import has gone\n", obd->obd_name);
+		RETURN(-EINVAL);
+	}
+
+	if (imp->imp_deactive) {
+		CERROR("import has been deactivated\n");
+		class_import_put(imp);
+		RETURN(-EINVAL);
+	}
+
+	req = ptlrpc_request_alloc_pack(imp, &RQF_SEC_CTX, LUSTRE_OBD_VERSION,
+					SEC_CTX_INIT);
+	if (req == NULL) {
+		param.status = -ENOMEM;
+		goto out_copy;
+	}
+
+	if (req->rq_cli_ctx->cc_sec->ps_id != param.secid) {
+		CWARN("original secid %d has changed to %d, cancel this negotiation\n",
+		      param.secid, req->rq_cli_ctx->cc_sec->ps_id);
+		param.status = -EINVAL;
+		goto out_copy;
+	}
+
+	/* get token */
+	rc = ctx_init_pack_request(imp, req,
+				   param.lustre_svc,
+				   param.uid, param.gid,
+				   param.send_token_size,
+				   (char __user *)param.send_token);
+	if (rc) {
+		param.status = rc;
+		goto out_copy;
+	}
+
+	ptlrpc_request_set_replen(req);
+
+	rc = ptlrpc_queue_wait(req);
+	if (rc) {
+		/* If any _real_ denial is made, we expect the server to
+		 * return an -EACCES reply, or to return success but flag a
+		 * gss error inside the reply message. All other errors are
+		 * treated as timeouts; the caller may retry the negotiation
+		 * repeatedly, leaving recovery decisions to the general
+		 * ptlrpc layer.
+		 *
+		 * FIXME maybe some other error code shouldn't be treated
+		 * as timeout.
+		 */
+		param.status = rc;
+		if (rc != -EACCES)
+			param.status = -ETIMEDOUT;
+		goto out_copy;
+	}
+
+	LASSERT(req->rq_repdata);
+	lsize = ctx_init_parse_reply(req->rq_repdata,
+				     req_capsule_rep_need_swab(&req->rq_pill),
+				     (char __user *)param.reply_buf,
+				     param.reply_buf_size);
+	if (lsize < 0) {
+		param.status = (int) lsize;
+		goto out_copy;
+	}
+
+	param.status = 0;
+	param.reply_length = lsize;
+
+out_copy:
+	if (copy_to_user(buffer, &param, sizeof(param)))
+		rc = -EFAULT;
+	else
+		rc = 0;
+
+	class_import_put(imp);
+	ptlrpc_req_finished(req);
+	RETURN(rc);
+}
+
+int gss_do_ctx_fini_rpc(struct gss_cli_ctx *gctx)
+{
+	struct ptlrpc_cli_ctx *ctx = &gctx->gc_base;
+	struct obd_import *imp = ctx->cc_sec->ps_import;
+	struct ptlrpc_request *req;
+	struct ptlrpc_user_desc *pud;
+	int rc;
+	ENTRY;
+
+	LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+
+	if (cli_ctx_is_error(ctx) || !cli_ctx_is_uptodate(ctx)) {
+		CDEBUG(D_SEC, "ctx %p(%u->%s) not uptodate, "
+		       "don't send destroy rpc\n", ctx,
+		       ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec));
+		RETURN(0);
+	}
+
+	might_sleep();
+
+	CWARN("%s ctx %p idx %#llx (%u->%s)\n",
+	      sec_is_reverse(ctx->cc_sec) ?
+	      "server finishing reverse" : "client finishing forward",
+	      ctx, gss_handle_to_u64(&gctx->gc_handle),
+	      ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec));
+
+	gctx->gc_proc = PTLRPC_GSS_PROC_DESTROY;
+
+	req = ptlrpc_request_alloc(imp, &RQF_SEC_CTX);
+	if (req == NULL) {
+		CWARN("ctx %p(%u): failed to prepare rpc, destroy locally\n",
+		      ctx, ctx->cc_vcred.vc_uid);
+		GOTO(out, rc = -ENOMEM);
+	}
+
+	rc = ptlrpc_request_bufs_pack(req, LUSTRE_OBD_VERSION, SEC_CTX_FINI,
+				      NULL, ctx);
+	if (rc)
+		GOTO(out_ref, rc);
+
+	/* fix the user desc */
+	if (req->rq_pack_udesc) {
+		/* we rely on the fact that this request is in AUTH mode,
+		 * and the user_desc is at offset 2.
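/*
 * Hedged userspace-side sketch, not part of the patch: the reply buffer
 * that ctx_init_parse_reply() above hands back to lgssd is four __u32
 * words (status, gss major, gss minor, sequence window) followed by two
 * length-prefixed, 4-byte-rounded objects (context handle, then output
 * token) — hence the (4 + 2) * 4 minimum-size check. Walking that layout
 * (struct and function names are assumptions for the sketch):
 */
#include <stdint.h>
#include <string.h>

struct init_reply {
	uint32_t status, major, minor, seqwin;
	uint32_t handle_len, token_len;
	const uint8_t *handle, *token;
};

static int parse_init_reply(const uint8_t *buf, size_t len,
			    struct init_reply *r)
{
	size_t off = 4 * sizeof(uint32_t);

	if (len < off + 2 * sizeof(uint32_t))
		return -1;
	memcpy(&r->status, buf + 0,  4);
	memcpy(&r->major,  buf + 4,  4);
	memcpy(&r->minor,  buf + 8,  4);
	memcpy(&r->seqwin, buf + 12, 4);

	memcpy(&r->handle_len, buf + off, 4);
	r->handle = buf + off + 4;
	off += 4 + ((r->handle_len + 3) & ~3u);	/* 4-byte rounding */

	if (len < off + 4)
		return -1;
	memcpy(&r->token_len, buf + off, 4);
	r->token = buf + off + 4;
	if (len < off + 4 + ((r->token_len + 3) & ~3u))
		return -1;
	return 0;
}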
*/ + pud = lustre_msg_buf(req->rq_reqbuf, 2, sizeof(*pud)); + LASSERT(pud); + pud->pud_uid = pud->pud_fsuid = ctx->cc_vcred.vc_uid; + pud->pud_gid = pud->pud_fsgid = ctx->cc_vcred.vc_gid; + pud->pud_cap = 0; + pud->pud_ngroups = 0; + } + + req->rq_phase = RQ_PHASE_RPC; + rc = ptl_send_rpc(req, 1); + if (rc) + CWARN("ctx %p(%u->%s): rpc error %d, destroy locally\n", ctx, + ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec), rc); + +out_ref: + ptlrpc_req_finished(req); +out: + RETURN(rc); +} + +int __init gss_init_cli_upcall(void) +{ + return 0; +} + +void gss_exit_cli_upcall(void) +{ +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.c new file mode 100644 index 0000000000000..a07fac77ef8ef --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.c @@ -0,0 +1,463 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/gss_krb5_mech.c + * linux/net/sunrpc/gss_krb5_crypto.c + * linux/net/sunrpc/gss_krb5_seal.c + * linux/net/sunrpc/gss_krb5_seqnum.c + * linux/net/sunrpc/gss_krb5_unseal.c + * + * Copyright (c) 2001 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + * J. Bruce Fields + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include +#include + +#include "gss_internal.h" +#include "gss_crypto.h" + +int gss_keyblock_init(struct gss_keyblock *kb, const char *alg_name, + const int alg_mode) +{ + int rc; + + kb->kb_tfm = crypto_alloc_sync_skcipher(alg_name, alg_mode, 0); + if (IS_ERR(kb->kb_tfm)) { + rc = PTR_ERR(kb->kb_tfm); + kb->kb_tfm = NULL; + CERROR("failed to alloc tfm: %s, mode %d: rc = %d\n", alg_name, + alg_mode, rc); + return rc; + } + + rc = crypto_sync_skcipher_setkey(kb->kb_tfm, kb->kb_key.data, + kb->kb_key.len); + if (rc) { + CERROR("failed to set %s key, len %d, rc = %d\n", alg_name, + kb->kb_key.len, rc); + return rc; + } + + return 0; +} + +void gss_keyblock_free(struct gss_keyblock *kb) +{ + rawobj_free(&kb->kb_key); + if (kb->kb_tfm) + crypto_free_sync_skcipher(kb->kb_tfm); +} + +int gss_keyblock_dup(struct gss_keyblock *new, struct gss_keyblock *kb) +{ + return rawobj_dup(&new->kb_key, &kb->kb_key); +} + +int gss_get_bytes(char **ptr, const char *end, void *res, size_t len) +{ + char *p, *q; + p = *ptr; + q = p + len; + if (q > end || q < p) + return -EINVAL; + memcpy(res, p, len); + *ptr = q; + return 0; +} + +int gss_get_rawobj(char **ptr, const char *end, rawobj_t *res) +{ + char *p, *q; + __u32 len; + + p = *ptr; + if (gss_get_bytes(&p, end, &len, sizeof(len))) + return -EINVAL; + + q = p + len; + if (q > end || q < p) + return -EINVAL; + + /* Support empty objects */ + if (len != 0) { + OBD_ALLOC_LARGE(res->data, len); + if (!res->data) + return -ENOMEM; + } else { + res->len = len; + res->data = NULL; + return 0; + } + + res->len = len; + memcpy(res->data, p, len); + *ptr = q; + return 0; +} + +int gss_get_keyblock(char **ptr, const char *end, + struct gss_keyblock *kb, __u32 keysize) +{ + char *buf; + int rc; + + OBD_ALLOC_LARGE(buf, keysize); + if (buf == NULL) + return -ENOMEM; + + rc = gss_get_bytes(ptr, end, buf, keysize); + if (rc) { + OBD_FREE_LARGE(buf, keysize); + return rc; + } + + kb->kb_key.len = keysize; + kb->kb_key.data = buf; + return 0; +} + +/* + * Should be used for buffers allocated with k/vmalloc(). + * + * Dispose of @sgt with gss_teardown_sgtable(). + * + * @prealloc_sg is to avoid memory allocation inside sg_alloc_table() + * in cases where a single sg is sufficient. No attempt to reduce the + * number of sgs by squeezing physically contiguous pages together is + * made though, for simplicity. + * + * This function is copied from the ceph filesystem code. 
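[Editorial sketch, not part of the patch.] As a usage note for the sgtable pair declared here: a vmalloc'ed buffer becomes one scatterlist entry per page, while a kmalloc'ed (physically contiguous) buffer fits in the single preallocated entry and needs no table allocation. A minimal kernel-context caller, assuming these symbols are in scope:

/* Illustrative only: map a buffer for a crypto operation, then
 * release the table. Works for both kmalloc'ed and vmalloc'ed buf. */
static int demo_map_buf(const void *buf, unsigned int len)
{
	struct sg_table sgt;
	struct scatterlist prealloc_sg;
	int rc;

	rc = gss_setup_sgtable(&sgt, &prealloc_sg, buf, len);
	if (rc)
		return rc;

	/* ... hand sgt.sgl / sgt.nents to the crypto layer here ... */

	gss_teardown_sgtable(&sgt);
	return 0;
}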
+ */ +int gss_setup_sgtable(struct sg_table *sgt, struct scatterlist *prealloc_sg, + const void *buf, unsigned int buf_len) +{ + struct scatterlist *sg; + const bool is_vmalloc = is_vmalloc_addr(buf); + unsigned int off = offset_in_page(buf); + unsigned int chunk_cnt = 1; + unsigned int chunk_len = PAGE_ALIGN(off + buf_len); + int i; + int rc; + + if (buf_len == 0) { + memset(sgt, 0, sizeof(*sgt)); + return -EINVAL; + } + + if (is_vmalloc) { + chunk_cnt = chunk_len >> PAGE_SHIFT; + chunk_len = PAGE_SIZE; + } + + if (chunk_cnt > 1) { + rc = sg_alloc_table(sgt, chunk_cnt, GFP_NOFS); + if (rc) + return rc; + } else { + WARN_ON_ONCE(chunk_cnt != 1); + sg_init_table(prealloc_sg, 1); + sgt->sgl = prealloc_sg; + sgt->nents = sgt->orig_nents = 1; + } + + for_each_sg(sgt->sgl, sg, sgt->orig_nents, i) { + struct page *page; + unsigned int len = min(chunk_len - off, buf_len); + + if (is_vmalloc) + page = vmalloc_to_page(buf); + else + page = virt_to_page(buf); + + sg_set_page(sg, page, len, off); + + off = 0; + buf += len; + buf_len -= len; + } + + WARN_ON_ONCE(buf_len != 0); + + return 0; +} + +void gss_teardown_sgtable(struct sg_table *sgt) +{ + if (sgt->orig_nents > 1) + sg_free_table(sgt); +} + +int gss_crypt_generic(struct crypto_sync_skcipher *tfm, int decrypt, + const void *iv, const void *in, void *out, size_t length) +{ + struct scatterlist sg; + struct sg_table sg_out; + __u8 local_iv[16] = {0}; + __u32 ret = -EINVAL; + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); + + LASSERT(tfm); + + if (length % crypto_sync_skcipher_blocksize(tfm) != 0) { + CERROR("output length %zu mismatch blocksize %d\n", + length, crypto_sync_skcipher_blocksize(tfm)); + goto out; + } + + if (crypto_sync_skcipher_ivsize(tfm) > ARRAY_SIZE(local_iv)) { + CERROR("iv size too large %d\n", + crypto_sync_skcipher_ivsize(tfm)); + goto out; + } + + if (iv) + memcpy(local_iv, iv, crypto_sync_skcipher_ivsize(tfm)); + + if (in != out) + memmove(out, in, length); + + ret = gss_setup_sgtable(&sg_out, &sg, out, length); + if (ret != 0) + goto out; + + skcipher_request_set_sync_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, &sg, &sg, length, local_iv); + + if (decrypt) + ret = crypto_skcipher_decrypt_iv(req, &sg, &sg, length); + else + ret = crypto_skcipher_encrypt_iv(req, &sg, &sg, length); + + skcipher_request_zero(req); + gss_teardown_sgtable(&sg_out); +out: + return ret; +} + +int gss_digest_hash(struct ahash_request *req, + rawobj_t *hdr, int msgcnt, rawobj_t *msgs, + int iovcnt, struct bio_vec *iovs) +{ + struct scatterlist sg[1]; + struct sg_table sgt; + int rc = 0; + int i; + + for (i = 0; i < msgcnt; i++) { + if (msgs[i].len == 0) + continue; + + rc = gss_setup_sgtable(&sgt, sg, msgs[i].data, msgs[i].len); + if (rc) + return rc; + + ahash_request_set_crypt(req, sg, NULL, msgs[i].len); + rc = crypto_ahash_update(req); + gss_teardown_sgtable(&sgt); + if (rc) + return rc; + } + + for (i = 0; i < iovcnt; i++) { + if (iovs[i].bv_len == 0) + continue; + + sg_init_table(sg, 1); + sg_set_page(&sg[0], iovs[i].bv_page, iovs[i].bv_len, + iovs[i].bv_offset); + + ahash_request_set_crypt(req, sg, NULL, iovs[i].bv_len); + rc = crypto_ahash_update(req); + if (rc) + return rc; + } + + if (hdr) { + rc = gss_setup_sgtable(&sgt, sg, hdr->data, hdr->len); + if (rc) + return rc; + + ahash_request_set_crypt(req, sg, NULL, hdr->len); + rc = crypto_ahash_update(req); + gss_teardown_sgtable(&sgt); + if (rc) + return rc; + } + + return rc; +} + +int gss_digest_hash_compat(struct ahash_request *req, 
+ rawobj_t *hdr, int msgcnt, rawobj_t *msgs, + int iovcnt, struct bio_vec *iovs) +{ + struct scatterlist sg[1]; + struct sg_table sgt; + int rc = 0; + int i; + + for (i = 0; i < msgcnt; i++) { + if (msgs[i].len == 0) + continue; + + rc = gss_setup_sgtable(&sgt, sg, msgs[i].data, msgs[i].len); + if (rc) + return rc; + + ahash_request_set_crypt(req, sg, NULL, msgs[i].len); + rc = crypto_ahash_update(req); + gss_teardown_sgtable(&sgt); + if (rc) + return rc; + } + + for (i = 0; i < iovcnt; i++) { + if (iovs[i].bv_len == 0) + continue; + + sg_init_table(sg, 1); + sg_set_page(&sg[0], iovs[i].bv_page, iovs[i].bv_len, + iovs[i].bv_offset); + + ahash_request_set_crypt(req, sg, NULL, iovs[i].bv_len); + rc = crypto_ahash_update(req); + if (rc) + return rc; + } + + if (hdr) { + rc = gss_setup_sgtable(&sgt, sg, &(hdr->len), sizeof(hdr->len)); + if (rc) + return rc; + + ahash_request_set_crypt(req, sg, NULL, sizeof(hdr->len)); + rc = crypto_ahash_update(req); + gss_teardown_sgtable(&sgt); + if (rc) + return rc; + } + + return rc; +} + +int gss_add_padding(rawobj_t *msg, int msg_buflen, int blocksize) +{ + int padding; + + padding = (blocksize - (msg->len & (blocksize - 1))) & + (blocksize - 1); + if (!padding) + return 0; + + if (msg->len + padding > msg_buflen) { + CERROR("bufsize %u too small: datalen %u, padding %u\n", + msg_buflen, msg->len, padding); + return -EINVAL; + } + + memset(msg->data + msg->len, padding, padding); + msg->len += padding; + return 0; +} + +int gss_crypt_rawobjs(struct crypto_sync_skcipher *tfm, __u8 *iv, + int inobj_cnt, rawobj_t *inobjs, rawobj_t *outobj, + int enc) +{ + struct scatterlist src; + struct scatterlist dst; + struct sg_table sg_dst; + struct sg_table sg_src; + __u8 *buf; + __u32 datalen = 0; + int i, rc; + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); + + ENTRY; + + buf = outobj->data; + skcipher_request_set_sync_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + + for (i = 0; i < inobj_cnt; i++) { + LASSERT(buf + inobjs[i].len <= outobj->data + outobj->len); + + rc = gss_setup_sgtable(&sg_src, &src, inobjs[i].data, + inobjs[i].len); + if (rc != 0) + RETURN(rc); + + rc = gss_setup_sgtable(&sg_dst, &dst, buf, + outobj->len - datalen); + if (rc != 0) { + gss_teardown_sgtable(&sg_src); + RETURN(rc); + } + + skcipher_request_set_crypt(req, &src, &dst, src.length, iv); + if (!iv) + skcipher_request_set_crypt_iv(req); + + if (enc) + rc = crypto_skcipher_encrypt_iv(req, &dst, &src, + src.length); + else + rc = crypto_skcipher_decrypt_iv(req, &dst, &src, + src.length); + + gss_teardown_sgtable(&sg_src); + gss_teardown_sgtable(&sg_dst); + + if (rc) { + CERROR("encrypt error %d\n", rc); + skcipher_request_zero(req); + RETURN(rc); + } + + datalen += inobjs[i].len; + buf += inobjs[i].len; + } + skcipher_request_zero(req); + + outobj->len = datalen; + RETURN(0); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.h new file mode 100644 index 0000000000000..7653e2139dbef --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.h @@ -0,0 +1,131 @@ +#ifndef PTLRPC_GSS_CRYPTO_H +#define PTLRPC_GSS_CRYPTO_H + +#include + +#include "gss_internal.h" + +#include + +/* + * linux v4.19-rc2-66-gb350bee5ea0f + * crypto: skcipher - Introduce crypto_sync_skcipher + * + * crypto_sync_skcipher will replace crypto_blkcipher so start using + * crypto_sync_skcipher and provide wrappers for older kernels + */ +#ifdef SYNC_SKCIPHER_REQUEST_ON_STACK + +#define 
crypto_skcipher_encrypt_iv(desc, dst, src, blocksize) \ + crypto_skcipher_encrypt((desc)) + +#define crypto_skcipher_decrypt_iv(desc, dst, src, blocksize) \ + crypto_skcipher_decrypt((desc)) + +#define skcipher_request_set_crypt_iv(d) + +#else /* ! SYNC_SKCIPHER_REQUEST_ON_STACK */ + +#ifdef HAVE_CRYPTO_ALLOC_SKCIPHER + +#define crypto_sync_skcipher crypto_skcipher + +#define SYNC_SKCIPHER_REQUEST_ON_STACK SKCIPHER_REQUEST_ON_STACK + +#define skcipher_request_set_sync_tfm skcipher_request_set_tfm + +#define skcipher_request_set_crypt_iv(d) + +#define crypto_sync_skcipher_blocksize crypto_skcipher_blocksize + +#define crypto_sync_skcipher_setkey crypto_skcipher_setkey + +#define crypto_alloc_sync_skcipher crypto_alloc_skcipher + +#define crypto_free_sync_skcipher crypto_free_skcipher + +#define crypto_sync_skcipher_ivsize crypto_skcipher_ivsize + +#define crypto_skcipher_encrypt_iv(desc, dst, src, blocksize) \ + crypto_skcipher_encrypt((desc)) + +#define crypto_skcipher_decrypt_iv(desc, dst, src, blocksize) \ + crypto_skcipher_decrypt((desc)) + +#define skcipher_request_zero(req) /* nop */ + +#else /* ! HAVE_CRYPTO_ALLOC_SKCIPHER */ + +#define crypto_sync_skcipher crypto_blkcipher + +#define SYNC_SKCIPHER_REQUEST_ON_STACK(name, tfm) \ + struct blkcipher_desc __##name##_obj, *name = (void *)&__##name##_obj + +#define skcipher_request_set_sync_tfm(d, _tfm) \ + do { (d)->tfm = _tfm; } while (0) + +#define skcipher_request_set_callback(d, f, c, data) \ + do { (d)->flags = f; } while (0) + +#define skcipher_request_set_crypt(d, src, dst, cryptlen, iv) \ + do { (d)->info = iv; } while (0) + +#define skcipher_request_set_crypt_iv(d) \ + do { (d)->info = crypto_blkcipher_crt((d)->tfm)->iv; } while (0) + +#define crypto_sync_skcipher_blocksize(tfm) \ + crypto_blkcipher_blocksize((tfm)) + +#define crypto_sync_skcipher_setkey(tfm, key, keylen) \ + crypto_blkcipher_setkey((tfm), (key), (keylen)) + +#define crypto_alloc_sync_skcipher(name, type, mask) \ + crypto_alloc_blkcipher((name), (type), (mask)) + +#define crypto_free_sync_skcipher(tfm) \ + crypto_free_blkcipher((tfm)) + +#define crypto_sync_skcipher_ivsize(tfm) \ + crypto_blkcipher_ivsize((tfm)) + +#define crypto_skcipher_encrypt_iv(desc, dst, src, len) \ + crypto_blkcipher_encrypt_iv((desc), (dst), (src), (len)) + +#define crypto_skcipher_decrypt_iv(desc, dst, src, len) \ + crypto_blkcipher_decrypt_iv((desc), (dst), (src), (len)) + +#define skcipher_request_zero(req) /* nop */ + +#endif /* HAVE_CRYPTO_ALLOC_SKCIPHER */ +#endif /* SYNC_SKCIPHER_REQUEST_ON_STACK */ + +struct gss_keyblock { + rawobj_t kb_key; + struct crypto_sync_skcipher *kb_tfm; +}; + +int gss_keyblock_init(struct gss_keyblock *kb, const char *alg_name, + const int alg_mode); +void gss_keyblock_free(struct gss_keyblock *kb); +int gss_keyblock_dup(struct gss_keyblock *new, struct gss_keyblock *kb); +int gss_get_bytes(char **ptr, const char *end, void *res, size_t len); +int gss_get_rawobj(char **ptr, const char *end, rawobj_t *res); +int gss_get_keyblock(char **ptr, const char *end, struct gss_keyblock *kb, + __u32 keysize); +int gss_setup_sgtable(struct sg_table *sgt, struct scatterlist *prealloc_sg, + const void *buf, unsigned int buf_len); +void gss_teardown_sgtable(struct sg_table *sgt); +int gss_crypt_generic(struct crypto_sync_skcipher *tfm, int decrypt, + const void *iv, const void *in, void *out, size_t length); +int gss_digest_hash(struct ahash_request *req, rawobj_t *hdr, + int msgcnt, rawobj_t *msgs, int iovcnt, + struct bio_vec *iovs); +int 
gss_digest_hash_compat(struct ahash_request *req, + rawobj_t *hdr, int msgcnt, rawobj_t *msgs, + int iovcnt, struct bio_vec *iovs); +int gss_add_padding(rawobj_t *msg, int msg_buflen, int blocksize); +int gss_crypt_rawobjs(struct crypto_sync_skcipher *tfm, __u8 *iv, + int inobj_cnt, rawobj_t *inobjs, rawobj_t *outobj, + int enc); + +#endif /* PTLRPC_GSS_CRYPTO_H */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_err.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_err.h new file mode 100644 index 0000000000000..34cd9a422e06b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_err.h @@ -0,0 +1,193 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Author: Eric Mei + */ + +/* + * Adapted from MIT Kerberos 5-1.2.1 include/gssapi/gssapi.h + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + */ + +/* + * Copyright 1993 by OpenVision Technologies, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software + * and its documentation for any purpose is hereby granted without fee, + * provided that the above copyright notice appears in all copies and + * that both that copyright notice and this permission notice appear in + * supporting documentation, and that the name of OpenVision not be used + * in advertising or publicity pertaining to distribution of the software + * without specific, written prior permission. OpenVision makes no + * representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied warranty. + * + * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef __PTLRPC_GSS_GSS_ERR_H_ +#define __PTLRPC_GSS_GSS_ERR_H_ + +typedef unsigned int OM_uint32; + +/* + * Flag bits for context-level services. + */ +#define GSS_C_DELEG_FLAG (1) +#define GSS_C_MUTUAL_FLAG (2) +#define GSS_C_REPLAY_FLAG (4) +#define GSS_C_SEQUENCE_FLAG (8) +#define GSS_C_CONF_FLAG (16) +#define GSS_C_INTEG_FLAG (32) +#define GSS_C_ANON_FLAG (64) +#define GSS_C_PROT_READY_FLAG (128) +#define GSS_C_TRANS_FLAG (256) + +/* + * Credential usage options + */ +#define GSS_C_BOTH (0) +#define GSS_C_INITIATE (1) +#define GSS_C_ACCEPT (2) + +/* + * Status code types for gss_display_status + */ +#define GSS_C_GSS_CODE (1) +#define GSS_C_MECH_CODE (2) + + +/* + * Define the default Quality of Protection for per-message services. Note + * that an implementation that offers multiple levels of QOP may either reserve + * a value (for example zero, as assumed here) to mean "default protection", or + * alternatively may simply equate GSS_C_QOP_DEFAULT to a specific explicit + * QOP value. However a value of 0 should always be interpreted by a GSSAPI + * implementation as a request for the default protection level. 
+ */ +#define GSS_C_QOP_DEFAULT (0) + +/* + * Expiration time of 2^32-1 seconds means infinite lifetime for a + * credential or security context + */ +#define GSS_C_INDEFINITE ((OM_uint32) 0xfffffffful) + + +/* Major status codes */ + +#define GSS_S_COMPLETE (0) + +/* + * Some "helper" definitions to make the status code macros obvious. + */ +#define GSS_C_CALLING_ERROR_OFFSET (24) +#define GSS_C_ROUTINE_ERROR_OFFSET (16) +#define GSS_C_SUPPLEMENTARY_OFFSET (0) +#define GSS_C_CALLING_ERROR_MASK ((OM_uint32) 0377ul) +#define GSS_C_ROUTINE_ERROR_MASK ((OM_uint32) 0377ul) +#define GSS_C_SUPPLEMENTARY_MASK ((OM_uint32) 0177777ul) + +/* + * The macros that test status codes for error conditions. Note that the + * GSS_ERROR() macro has changed slightly from the V1 GSSAPI so that it now + * evaluates its argument only once. + */ +#define GSS_CALLING_ERROR(x) \ + ((x) & (GSS_C_CALLING_ERROR_MASK << GSS_C_CALLING_ERROR_OFFSET)) +#define GSS_ROUTINE_ERROR(x) \ + ((x) & (GSS_C_ROUTINE_ERROR_MASK << GSS_C_ROUTINE_ERROR_OFFSET)) +#define GSS_SUPPLEMENTARY_INFO(x) \ + ((x) & (GSS_C_SUPPLEMENTARY_MASK << GSS_C_SUPPLEMENTARY_OFFSET)) +#define GSS_ERROR(x) \ + ((x) & ((GSS_C_CALLING_ERROR_MASK << GSS_C_CALLING_ERROR_OFFSET) | \ + (GSS_C_ROUTINE_ERROR_MASK << GSS_C_ROUTINE_ERROR_OFFSET))) + +/* + * Now the actual status code definitions + */ + +/* + * Calling errors: + */ +#define GSS_S_CALL_INACCESSIBLE_READ \ + (((OM_uint32) 1ul) << GSS_C_CALLING_ERROR_OFFSET) +#define GSS_S_CALL_INACCESSIBLE_WRITE \ + (((OM_uint32) 2ul) << GSS_C_CALLING_ERROR_OFFSET) +#define GSS_S_CALL_BAD_STRUCTURE \ + (((OM_uint32) 3ul) << GSS_C_CALLING_ERROR_OFFSET) + +/* + * Routine errors: + */ +#define GSS_S_BAD_MECH \ + (((OM_uint32) 1ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_NAME \ + (((OM_uint32) 2ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_NAMETYPE \ + (((OM_uint32) 3ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_BINDINGS \ + (((OM_uint32) 4ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_STATUS \ + (((OM_uint32) 5ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_SIG \ + (((OM_uint32) 6ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_NO_CRED \ + (((OM_uint32) 7ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_NO_CONTEXT \ + (((OM_uint32) 8ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_DEFECTIVE_TOKEN \ + (((OM_uint32) 9ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_DEFECTIVE_CREDENTIAL \ + (((OM_uint32) 10ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_CREDENTIALS_EXPIRED \ + (((OM_uint32) 11ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_CONTEXT_EXPIRED \ + (((OM_uint32) 12ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_FAILURE \ + (((OM_uint32) 13ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_QOP \ + (((OM_uint32) 14ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_UNAUTHORIZED \ + (((OM_uint32) 15ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_UNAVAILABLE \ + (((OM_uint32) 16ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_DUPLICATE_ELEMENT \ + (((OM_uint32) 17ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_NAME_NOT_MN \ + (((OM_uint32) 18ul) << GSS_C_ROUTINE_ERROR_OFFSET) + +/* + * Supplementary info bits: + */ +#define GSS_S_CONTINUE_NEEDED BIT(GSS_C_SUPPLEMENTARY_OFFSET + 0) +#define GSS_S_DUPLICATE_TOKEN BIT(GSS_C_SUPPLEMENTARY_OFFSET + 1) +#define GSS_S_OLD_TOKEN BIT(GSS_C_SUPPLEMENTARY_OFFSET + 2) +#define GSS_S_UNSEQ_TOKEN BIT(GSS_C_SUPPLEMENTARY_OFFSET + 3) +#define GSS_S_GAP_TOKEN BIT(GSS_C_SUPPLEMENTARY_OFFSET + 4) + +/* XXXX these are not part of the 
GSSAPI C bindings! (but should be) */ + +#define GSS_CALLING_ERROR_FIELD(x) \ + (((x) >> GSS_C_CALLING_ERROR_OFFSET) & GSS_C_CALLING_ERROR_MASK) +#define GSS_ROUTINE_ERROR_FIELD(x) \ + (((x) >> GSS_C_ROUTINE_ERROR_OFFSET) & GSS_C_ROUTINE_ERROR_MASK) +#define GSS_SUPPLEMENTARY_INFO_FIELD(x) \ + (((x) >> GSS_C_SUPPLEMENTARY_OFFSET) & GSS_C_SUPPLEMENTARY_MASK) + +/* XXXX This is a necessary evil until the spec is fixed */ +#define GSS_S_CRED_UNAVAIL GSS_S_FAILURE + +#endif /* __PTLRPC_GSS_GSS_ERR_H_ */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_generic_token.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_generic_token.c new file mode 100644 index 0000000000000..23506f89d67c2 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_generic_token.c @@ -0,0 +1,284 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, Intel Corporation. + * + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/gss_generic_token.c + * + * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/generic/util_token.c + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + */ + +/* + * Copyright 1993 by OpenVision Technologies, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software + * and its documentation for any purpose is hereby granted without fee, + * provided that the above copyright notice appears in all copies and + * that both that copyright notice and this permission notice appear in + * supporting documentation, and that the name of OpenVision not be used + * in advertising or publicity pertaining to distribution of the software + * without specific, written prior permission. OpenVision makes no + * representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied warranty. + * + * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" +#include "gss_krb5.h" +#include "gss_asn1.h" + + +/* TWRITE_STR from gssapiP_generic.h */ +#define TWRITE_STR(ptr, str, len) \ + memcpy((ptr), (char *) (str), (len)); \ + (ptr) += (len); + +/* XXXX this code currently makes the assumption that a mech oid will + never be longer than 127 bytes. This assumption is not inherent in + the interfaces, so the code can be fixed if the OSI namespace + balloons unexpectedly. 
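[Editorial note, not part of the patch.] To make the 127-byte OID assumption concrete: the rawobj mech passed to the helpers below holds only the OID's DER arc bytes; g_make_token_header() emits the 0x06 tag and the single length octet itself, which is why a length octet with the high bit set (> 127) would break the "assume 1 byte" parsing. For example, the well-known Kerberos 5 mechanism OID 1.2.840.113554.1.2.2 has 9 arc bytes:

/* krb5 GSS mechanism OID 1.2.840.113554.1.2.2; the mech rawobj would
 * carry just the last 9 bytes, the 0x06 tag and length 0x09 are
 * written by the token header code. */
static const unsigned char krb5_mech_der[] = {
	0x06, 0x09,
	0x2a, 0x86, 0x48, 0x86, 0xf7, 0x12, 0x01, 0x02, 0x02
};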
*/ + +/* Each token looks like this: + +0x60 tag for APPLICATION 0, SEQUENCE + (constructed, definite-length) + possible multiple bytes, need to parse/generate + 0x06 tag for OBJECT IDENTIFIER + compile-time constant string (assume 1 byte) + compile-time constant string + the ANY containing the application token + bytes 0,1 are the token type + bytes 2,n are the token data + +For the purposes of this abstraction, the token "header" consists of +the sequence tag and length octets, the mech OID DER encoding, and the +first two inner bytes, which indicate the token type. The token +"body" consists of everything else. + +*/ + +static +int der_length_size(int length) +{ + if (length < (1 << 7)) + return 1; + else if (length < (1 << 8)) + return 2; +#if (SIZEOF_INT == 2) + else + return 3; +#else + else if (length < (1 << 16)) + return 3; + else if (length < (1 << 24)) + return 4; + else + return 5; +#endif +} + +static +void der_write_length(unsigned char **buf, int length) +{ + if (length < (1 << 7)) { + *(*buf)++ = (unsigned char) length; + } else { + *(*buf)++ = (unsigned char) (der_length_size(length) + 127); +#if (SIZEOF_INT > 2) + if (length >= (1 << 24)) + *(*buf)++ = (unsigned char) (length >> 24); + if (length >= (1 << 16)) + *(*buf)++ = (unsigned char) ((length >> 16) & 0xff); +#endif + if (length >= (1 << 8)) + *(*buf)++ = (unsigned char) ((length >> 8) & 0xff); + *(*buf)++ = (unsigned char) (length & 0xff); + } +} + +/* + * returns decoded length, or < 0 on failure. Advances buf and + * decrements bufsize + */ +static +int der_read_length(unsigned char **buf, int *bufsize) +{ + unsigned char sf; + int ret; + + if (*bufsize < 1) + return -1; + sf = *(*buf)++; + (*bufsize)--; + if (sf & 0x80) { + if ((sf &= 0x7f) > ((*bufsize) - 1)) + return -1; + if (sf > SIZEOF_INT) + return -1; + ret = 0; + for (; sf; sf--) { + ret = (ret << 8) + (*(*buf)++); + (*bufsize)--; + } + } else { + ret = sf; + } + + return ret; +} + +/* + * returns the length of a token, given the mech oid and the body size + */ +int g_token_size(rawobj_t *mech, unsigned int body_size) +{ + /* set body_size to sequence contents size */ + body_size += 4 + (int) mech->len; /* NEED overflow check */ + return (1 + der_length_size(body_size) + body_size); +} + +/* + * fills in a buffer with the token header. The buffer is assumed to + * be the right size. buf is advanced past the token header + */ +void g_make_token_header(rawobj_t *mech, int body_size, unsigned char **buf) +{ + *(*buf)++ = 0x60; + der_write_length(buf, 4 + mech->len + body_size); + *(*buf)++ = 0x06; + *(*buf)++ = (unsigned char) mech->len; + TWRITE_STR(*buf, mech->data, ((int) mech->len)); +} + +/* + * Given a buffer containing a token, reads and verifies the token, + * leaving buf advanced past the token header, and setting body_size + * to the number of remaining bytes. Returns 0 on success, + * G_BAD_TOK_HEADER for a variety of errors, and G_WRONG_MECH if the + * mechanism in the token does not match the mech argument. buf and + * *body_size are left unmodified on error. 
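[Editorial sketch, not part of the patch.] The DER definite-length forms handled by der_write_length()/der_read_length() above are worth a worked example: lengths below 128 use one octet (short form); otherwise the first octet is 0x80 plus the count of following big-endian length octets. The helper below mirrors that logic in a self-contained way, it is not a drop-in replacement:

/* demo_der_write_length(out, 100) -> 0x64            (1 byte)
 * demo_der_write_length(out, 128) -> 0x81 0x80       (2 bytes)
 * demo_der_write_length(out, 300) -> 0x82 0x01 0x2c  (3 bytes) */
static int demo_der_write_length(unsigned char *out, unsigned int len)
{
	int n = 0;

	if (len < 0x80) {			/* short form */
		out[n++] = (unsigned char)len;
	} else {				/* long form, big endian */
		int octets = (len < 0x100) ? 1 : (len < 0x10000) ? 2 :
			     (len < 0x1000000) ? 3 : 4;
		int i;

		out[n++] = 0x80 | octets;
		for (i = octets - 1; i >= 0; i--)
			out[n++] = (len >> (8 * i)) & 0xff;
	}
	return n;				/* bytes written */
}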
+ */ +__u32 g_verify_token_header(rawobj_t *mech, int *body_size, + unsigned char **buf_in, int toksize) +{ + unsigned char *buf = *buf_in; + int seqsize; + rawobj_t toid; + int ret = 0; + + if ((toksize -= 1) < 0) + return (G_BAD_TOK_HEADER); + if (*buf++ != 0x60) + return (G_BAD_TOK_HEADER); + + if ((seqsize = der_read_length(&buf, &toksize)) < 0) + return(G_BAD_TOK_HEADER); + + if (seqsize != toksize) + return (G_BAD_TOK_HEADER); + + if ((toksize -= 1) < 0) + return (G_BAD_TOK_HEADER); + if (*buf++ != 0x06) + return (G_BAD_TOK_HEADER); + + if ((toksize -= 1) < 0) + return (G_BAD_TOK_HEADER); + toid.len = *buf++; + + if ((toksize -= toid.len) < 0) + return (G_BAD_TOK_HEADER); + toid.data = buf; + buf += toid.len; + + if (!g_OID_equal(&toid, mech)) + ret = G_WRONG_MECH; + + /* G_WRONG_MECH is not returned immediately because it's more + * important to return G_BAD_TOK_HEADER if the token header is + * in fact bad + */ + if ((toksize -= 2) < 0) + return (G_BAD_TOK_HEADER); + + if (ret) + return (ret); + + if (!ret) { + *buf_in = buf; + *body_size = toksize; + } + + return (ret); +} + +/* + * Given a buffer containing a token, returns a copy of the mech oid in + * the parameter mech. + */ +__u32 g_get_mech_oid(rawobj_t *mech, rawobj_t *in_buf) +{ + unsigned char *buf = in_buf->data; + int len = in_buf->len; + int ret = 0; + int seqsize; + + if ((len -= 1) < 0) + return (G_BAD_TOK_HEADER); + if (*buf++ != 0x60) + return (G_BAD_TOK_HEADER); + + if ((seqsize = der_read_length(&buf, &len)) < 0) + return (G_BAD_TOK_HEADER); + + if ((len -= 1) < 0) + return (G_BAD_TOK_HEADER); + if (*buf++ != 0x06) + return (G_BAD_TOK_HEADER); + + if ((len -= 1) < 0) + return (G_BAD_TOK_HEADER); + mech->len = *buf++; + + if ((len -= mech->len) < 0) + return (G_BAD_TOK_HEADER); + OBD_ALLOC_LARGE(mech->data, mech->len); + if (!mech->data) + return (G_BUFFER_ALLOC); + memcpy(mech->data, buf, mech->len); + + return ret; +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_internal.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_internal.h new file mode 100644 index 0000000000000..d8302bacfc9d8 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_internal.h @@ -0,0 +1,509 @@ +/* + * Modified from NFSv4 project for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, 2016, Intel Corporation. 
+ *
+ * Author: Eric Mei
+ */
+
+#ifndef __PTLRPC_GSS_GSS_INTERNAL_H_
+#define __PTLRPC_GSS_GSS_INTERNAL_H_
+
+#include
+#include
+#include
+
+/*
+ * rawobj stuff
+ */
+#define NETOBJ_EMPTY ((netobj_t) { 0 })
+#define RAWOBJ_EMPTY ((rawobj_t) { 0, NULL })
+
+typedef struct rawobj_buf_s {
+	__u32 dataoff;
+	__u32 datalen;
+	__u32 buflen;
+	__u8 *buf;
+} rawobj_buf_t;
+
+int rawobj_empty(rawobj_t *obj);
+int rawobj_alloc(rawobj_t *obj, char *buf, int len);
+void rawobj_free(rawobj_t *obj);
+int rawobj_equal(rawobj_t *a, rawobj_t *b);
+int rawobj_dup(rawobj_t *dest, rawobj_t *src);
+int rawobj_serialize(rawobj_t *obj, __u32 **buf, __u32 *buflen);
+int rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen);
+int rawobj_extract_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen);
+int rawobj_extract_local(rawobj_t *obj, __u32 **buf, __u32 *buflen);
+int rawobj_extract_local_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen);
+int rawobj_from_netobj(rawobj_t *rawobj, netobj_t *netobj);
+int rawobj_from_netobj_alloc(rawobj_t *obj, netobj_t *netobj);
+
+int buffer_extract_bytes(const void **buf, __u32 *buflen,
+			 void *res, __u32 reslen);
+
+/*
+ * several timeout values. For the client refresh upcall timeout we use
+ * the default from the pipefs implementation.
+ */
+#define __TIMEOUT_DELTA (10)
+
+#define GSS_SECINIT_RPC_TIMEOUT \
+	(obd_timeout < __TIMEOUT_DELTA ? \
+	 __TIMEOUT_DELTA : obd_timeout - __TIMEOUT_DELTA)
+
+#define GSS_SECFINI_RPC_TIMEOUT (__TIMEOUT_DELTA)
+#define GSS_SECSVC_UPCALL_TIMEOUT (GSS_SECINIT_RPC_TIMEOUT)
+
+/*
+ * default gc interval
+ */
+#define GSS_GC_INTERVAL (60 * 60) /* 60 minutes */
+
+static inline time64_t gss_round_ctx_expiry(time64_t expiry,
+					    unsigned long sec_flags)
+{
+	if (sec_flags & PTLRPC_SEC_FL_REVERSE)
+		return expiry;
+
+	if (ktime_get_real_seconds() + __TIMEOUT_DELTA <= expiry)
+		return expiry - __TIMEOUT_DELTA;
+
+	return expiry;
+}
+
+/*
+ * Max encryption element in block cipher algorithms.
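[Editorial worked example, not part of the patch.] The rounding in gss_round_ctx_expiry() is easiest to see with concrete numbers; the values below are hypothetical, with __TIMEOUT_DELTA of 10 seconds and a current time of 1000:

/* reverse (server-side) ctx, expiry 1200 -> 1200 (returned untouched)
 * forward ctx, expiry 1200 -> 1190 (refresh 10s early, so a fresh ctx
 *                                   is ready before the old one dies)
 * forward ctx, expiry 1005 -> 1005 (too close to now, nothing shaved) */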
+ */ +#define GSS_MAX_CIPHER_BLOCK (16) + +/* + * XXX make it visible of kernel and lgssd/lsvcgssd + */ +enum { + GSSD_INTERFACE_VERSION_V1 = 1, + GSSD_INTERFACE_VERSION_V2 = 2, + GSSD_INTERFACE_VERSION = GSSD_INTERFACE_VERSION_V2, +}; + +#define PTLRPC_GSS_VERSION (1) + + +enum ptlrpc_gss_proc { + PTLRPC_GSS_PROC_DATA = 0, + PTLRPC_GSS_PROC_INIT = 1, + PTLRPC_GSS_PROC_CONTINUE_INIT = 2, + PTLRPC_GSS_PROC_DESTROY = 3, + PTLRPC_GSS_PROC_ERR = 4, +}; + +enum ptlrpc_gss_tgt { + LUSTRE_GSS_TGT_MGS = 0, + LUSTRE_GSS_TGT_MDS = 1, + LUSTRE_GSS_TGT_OSS = 2, +}; + +enum ptlrpc_gss_header_flags { + LUSTRE_GSS_PACK_BULK = 1, + LUSTRE_GSS_PACK_USER = 2, + LUSTRE_GSS_PACK_KCSUM = 4, +}; + +static inline +__u32 import_to_gss_svc(struct obd_import *imp) +{ + int cl_sp_to = LUSTRE_SP_ANY; + + if (imp->imp_obd) + cl_sp_to = imp->imp_obd->u.cli.cl_sp_to; + + switch (cl_sp_to) { + case LUSTRE_SP_MDT: + return LUSTRE_GSS_TGT_MDS; + case LUSTRE_SP_OST: + return LUSTRE_GSS_TGT_OSS; + case LUSTRE_SP_MGC: + case LUSTRE_SP_MGS: + return LUSTRE_GSS_TGT_MGS; + case LUSTRE_SP_CLI: + case LUSTRE_SP_ANY: + default: + return 0; + } +} + +#define PTLRPC_GSS_MAX_HANDLE_SIZE (8) +#define PTLRPC_GSS_HEADER_SIZE (sizeof(struct gss_header) + \ + PTLRPC_GSS_MAX_HANDLE_SIZE) + + +static inline __u64 gss_handle_to_u64(rawobj_t *handle) +{ + if (handle->len != PTLRPC_GSS_MAX_HANDLE_SIZE) + return -1; + return *((__u64 *) handle->data); +} + +#define GSS_SEQ_WIN (2048) +#define GSS_SEQ_WIN_MAIN GSS_SEQ_WIN +#define GSS_SEQ_WIN_BACK (128) +#define GSS_SEQ_REPACK_THRESHOLD (GSS_SEQ_WIN_MAIN / 2 + \ + GSS_SEQ_WIN_MAIN / 4) + +struct gss_svc_seq_data { + spinlock_t ssd_lock; + /* + * highest sequence number seen so far, for main and back window + */ + __u32 ssd_max_main; + __u32 ssd_max_back; + /* + * main and back window + * for i such that ssd_max - GSS_SEQ_WIN < i <= ssd_max, the i-th bit + * of ssd_win is nonzero iff sequence number i has been seen already. 
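[Editorial sketch, not part of the patch.] The ssd_win_main/ssd_max_main fields declared here imply a classic sliding replay window: bit (i mod window) records whether sequence i was seen. A minimal self-contained version of the idea, kernel context assumed (single window, no locking, __u32 wraparound ignored; the real gss_check_seq_num() is more involved):

#define DEMO_SEQ_WIN 2048

struct demo_seq_data {
	unsigned long win[DEMO_SEQ_WIN / BITS_PER_LONG];
	__u32 max_seq;				/* highest seq seen */
};

/* Return 0 if seq is new; 1 if it is a replay or too old. */
static int demo_check_seq(struct demo_seq_data *sd, __u32 seq)
{
	if (seq > sd->max_seq) {
		if (seq - sd->max_seq >= DEMO_SEQ_WIN) {
			/* window jumped completely: reset it */
			memset(sd->win, 0, sizeof(sd->win));
		} else {
			__u32 i;

			/* slide forward: clear bits for skipped seqs */
			for (i = sd->max_seq + 1; i < seq; i++)
				clear_bit(i % DEMO_SEQ_WIN, sd->win);
		}
		sd->max_seq = seq;
	} else if (sd->max_seq - seq >= DEMO_SEQ_WIN) {
		return 1;	/* fell off the back of the window */
	} else if (test_bit(seq % DEMO_SEQ_WIN, sd->win)) {
		return 1;	/* already seen: replay */
	}
	set_bit(seq % DEMO_SEQ_WIN, sd->win);
	return 0;
}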
+ */ + unsigned long ssd_win_main[GSS_SEQ_WIN_MAIN/BITS_PER_LONG]; + unsigned long ssd_win_back[GSS_SEQ_WIN_BACK/BITS_PER_LONG]; +}; + +struct gss_svc_ctx { + struct gss_ctx *gsc_mechctx; + struct gss_svc_seq_data gsc_seqdata; + rawobj_t gsc_rvs_hdl; + __u32 gsc_rvs_seq; + uid_t gsc_uid; + gid_t gsc_gid; + uid_t gsc_mapped_uid; + unsigned int gsc_usr_root:1, + gsc_usr_mds:1, + gsc_usr_oss:1, + gsc_remote:1, + gsc_reverse:1; +}; + +struct gss_svc_reqctx { + struct ptlrpc_svc_ctx src_base; + /* + * context + */ + struct gss_wire_ctx src_wirectx; + struct gss_svc_ctx *src_ctx; + /* + * record place of bulk_sec_desc in request/reply buffer + */ + struct ptlrpc_bulk_sec_desc *src_reqbsd; + int src_reqbsd_size; + struct ptlrpc_bulk_sec_desc *src_repbsd; + int src_repbsd_size; + /* + * flags + */ + unsigned int src_init:1, + src_init_continue:1, + src_err_notify:1; + int src_reserve_len; +}; + +struct gss_cli_ctx { + struct ptlrpc_cli_ctx gc_base; + __u32 gc_flavor; + __u32 gc_proc; + __u32 gc_win; + atomic_t gc_seq; + rawobj_t gc_handle; + struct gss_ctx *gc_mechctx; + /* handle for the buddy svc ctx */ + rawobj_t gc_svc_handle; +}; + +struct gss_cli_ctx_keyring { + struct gss_cli_ctx gck_base; + struct key *gck_key; + struct timer_list gck_timer; +}; + +struct gss_sec { + struct ptlrpc_sec gs_base; + struct gss_api_mech *gs_mech; + spinlock_t gs_lock; + __u64 gs_rvs_hdl; +}; + +struct gss_sec_pipefs { + struct gss_sec gsp_base; + int gsp_chash_size; /* must be 2^n */ + struct hlist_head gsp_chash[0]; +}; + +/* + * FIXME cleanup the keyring upcall mutexes + */ +#define HAVE_KEYRING_UPCALL_SERIALIZED 1 + +struct gss_sec_keyring { + struct gss_sec gsk_base; + /* + * all contexts listed here. access is protected by sec spinlock. + */ + struct hlist_head gsk_clist; + /* + * specially point to root ctx (only one at a time). access is + * protected by sec spinlock. + */ + struct ptlrpc_cli_ctx *gsk_root_ctx; + /* + * specially serialize upcalls for root context. + */ + struct mutex gsk_root_uc_lock; + +#ifdef HAVE_KEYRING_UPCALL_SERIALIZED + struct mutex gsk_uc_lock; /* serialize upcalls */ +#endif +}; + +static inline struct gss_cli_ctx *ctx2gctx(struct ptlrpc_cli_ctx *ctx) +{ + return container_of(ctx, struct gss_cli_ctx, gc_base); +} + +static inline +struct gss_cli_ctx_keyring *ctx2gctx_keyring(struct ptlrpc_cli_ctx *ctx) +{ + return container_of(ctx2gctx(ctx), + struct gss_cli_ctx_keyring, gck_base); +} + +static inline struct gss_sec *sec2gsec(struct ptlrpc_sec *sec) +{ + return container_of(sec, struct gss_sec, gs_base); +} + +static inline struct gss_sec_pipefs *sec2gsec_pipefs(struct ptlrpc_sec *sec) +{ + return container_of(sec2gsec(sec), struct gss_sec_pipefs, gsp_base); +} + +static inline struct gss_sec_keyring *sec2gsec_keyring(struct ptlrpc_sec *sec) +{ + return container_of(sec2gsec(sec), struct gss_sec_keyring, gsk_base); +} + +#ifdef HAVE_CACHE_HASH_SPINLOCK +# define sunrpc_cache_lookup(c, i, h) sunrpc_cache_lookup_rcu((c), (i), (h)) +# define cache_read_lock(cdetail) spin_lock(&((cdetail)->hash_lock)) +# define cache_read_unlock(cdetail) spin_unlock(&((cdetail)->hash_lock)) +#else /* ! HAVE_CACHE_HASH_SPINLOCK */ +# define cache_read_lock(cdetail) read_lock(&((cdetail)->hash_lock)) +# define cache_read_unlock(cdetail) read_unlock(&((cdetail)->hash_lock)) +#endif + +#define GSS_CTX_INIT_MAX_LEN (1024) + +/* + * This only guaranteed be enough for current krb5 des-cbc-crc . We might + * adjust this when new enc type or mech added in. 
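[Editorial sketch, not part of the patch.] The ctx2gctx()/ctx2gctx_keyring() helpers above are the usual embedded-struct downcasts; purely as an illustration of how the layers nest (valid only when the ctx really belongs to the keyring policy):

/* Illustrative only: generic ptlrpc ctx -> gss ctx -> keyring wrapper. */
static void demo_peek_key(struct ptlrpc_cli_ctx *ctx)
{
	struct gss_cli_ctx *gctx = ctx2gctx(ctx);
	struct gss_cli_ctx_keyring *gctx_kr = ctx2gctx_keyring(ctx);

	CDEBUG(D_SEC, "proc %u, key %p\n", gctx->gc_proc, gctx_kr->gck_key);
}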
+ */ +#define GSS_PRIVBUF_PREFIX_LEN (32) +#define GSS_PRIVBUF_SUFFIX_LEN (32) + +static inline +struct gss_svc_reqctx *gss_svc_ctx2reqctx(struct ptlrpc_svc_ctx *ctx) +{ + LASSERT(ctx); + return container_of(ctx, struct gss_svc_reqctx, src_base); +} + +static inline +struct gss_svc_ctx *gss_svc_ctx2gssctx(struct ptlrpc_svc_ctx *ctx) +{ + LASSERT(ctx); + return gss_svc_ctx2reqctx(ctx)->src_ctx; +} + +/* sec_gss.c */ +int gss_cli_ctx_match(struct ptlrpc_cli_ctx *ctx, struct vfs_cred *vcred); +int gss_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize); +int gss_cli_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req); +int gss_cli_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req); +int gss_cli_ctx_seal(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req); +int gss_cli_ctx_unseal(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req); + +int gss_sec_install_rctx(struct obd_import *imp, struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx); +int gss_alloc_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req, + int msgsize); +void gss_free_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req); +int gss_alloc_repbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req, + int msgsize); +void gss_free_repbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req); +int gss_enlarge_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req, + int segment, int newsize); + +int gss_svc_accept(struct ptlrpc_sec_policy *policy, + struct ptlrpc_request *req); +void gss_svc_invalidate_ctx(struct ptlrpc_svc_ctx *svc_ctx); +int gss_svc_alloc_rs(struct ptlrpc_request *req, int msglen); +int gss_svc_authorize(struct ptlrpc_request *req); +void gss_svc_free_rs(struct ptlrpc_reply_state *rs); +void gss_svc_free_ctx(struct ptlrpc_svc_ctx *ctx); + +int cli_ctx_expire(struct ptlrpc_cli_ctx *ctx); +int cli_ctx_check_death(struct ptlrpc_cli_ctx *ctx); + +int gss_copy_rvc_cli_ctx(struct ptlrpc_cli_ctx *cli_ctx, + struct ptlrpc_svc_ctx *svc_ctx); + +struct gss_header *gss_swab_header(struct lustre_msg *msg, int segment, + int swabbed); +netobj_t *gss_swab_netobj(struct lustre_msg *msg, int segment); + +void gss_cli_ctx_uptodate(struct gss_cli_ctx *gctx); +int gss_pack_err_notify(struct ptlrpc_request *req, __u32 major, __u32 minor); +int gss_check_seq_num(struct gss_svc_seq_data *sd, __u32 seq_num, int set); + +int gss_sec_create_common(struct gss_sec *gsec, + struct ptlrpc_sec_policy *policy, + struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx, + struct sptlrpc_flavor *sf); +void gss_sec_destroy_common(struct gss_sec *gsec); +void gss_sec_kill(struct ptlrpc_sec *sec); + +int gss_cli_ctx_init_common(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_ctx_ops *ctxops, + struct vfs_cred *vcred); +int gss_cli_ctx_fini_common(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx); + +void gss_cli_ctx_flags2str(unsigned long flags, char *buf, int bufsize); + +/* gss_keyring.c */ +#ifndef HAVE_GSS_KEYRING +static inline int __init gss_init_keyring(void) { return 0; } +static inline void __exit gss_exit_keyring(void) { return; } +#else +int __init gss_init_keyring(void); +void __exit gss_exit_keyring(void); +#endif +extern unsigned int gss_check_upcall_ns; + +/* gss_pipefs.c */ +#ifndef HAVE_GSS_PIPEFS +static inline int __init gss_init_pipefs(void) { return 0; } +static inline void __exit gss_exit_pipefs(void) { return; } +#else +int __init gss_init_pipefs(void); +void __exit gss_exit_pipefs(void); +#endif + +/* gss_bulk.c */ +int 
gss_cli_prep_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int gss_cli_ctx_wrap_bulk(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int gss_cli_ctx_unwrap_bulk(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int gss_svc_prep_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int gss_svc_unwrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int gss_svc_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); + +/* gss_generic_token.c */ +int g_token_size(rawobj_t *mech, unsigned int body_size); +void g_make_token_header(rawobj_t *mech, int body_size, unsigned char **buf); +__u32 g_verify_token_header(rawobj_t *mech, int *body_size, + unsigned char **buf_in, int toksize); + + +/* gss_cli_upcall.c */ +int gss_do_ctx_init_rpc(char __user *buffer, unsigned long count); +int gss_do_ctx_fini_rpc(struct gss_cli_ctx *gctx); + +int __init gss_init_cli_upcall(void); +void gss_exit_cli_upcall(void); + +/* gss_svc_upcall.c */ +__u64 gss_get_next_ctx_index(void); +int gss_svc_upcall_install_rvs_ctx(struct obd_import *imp, + struct gss_sec *gsec, + struct gss_cli_ctx *gctx); +int gss_svc_upcall_expire_rvs_ctx(rawobj_t *handle); +int gss_svc_upcall_dup_handle(rawobj_t *handle, struct gss_svc_ctx *ctx); +int gss_svc_upcall_update_sequence(rawobj_t *handle, __u32 seq); +int gss_svc_upcall_handle_init(struct ptlrpc_request *req, + struct gss_svc_reqctx *grctx, + struct gss_wire_ctx *gw, + struct obd_device *target, + __u32 lustre_svc, + rawobj_t *rvs_hdl, + rawobj_t *in_token); +struct gss_svc_ctx *gss_svc_upcall_get_ctx(struct ptlrpc_request *req, + struct gss_wire_ctx *gw); +void gss_svc_upcall_put_ctx(struct gss_svc_ctx *ctx); +void gss_svc_upcall_destroy_ctx(struct gss_svc_ctx *ctx); + +int __init gss_init_svc_upcall(void); +void gss_exit_svc_upcall(void); +extern unsigned int krb5_allow_old_client_csum; + +/* lproc_gss.c */ +void gss_stat_oos_record_cli(int behind); +void gss_stat_oos_record_svc(int phase, int replay); + +int __init gss_init_tunables(void); +void gss_exit_tunables(void); + +/* gss_null_mech.c */ +int __init init_null_module(void); +void cleanup_null_module(void); + +/* gss_krb5_mech.c */ +int __init init_kerberos_module(void); +void cleanup_kerberos_module(void); + +/* gss_sk_mech.c */ +#ifdef HAVE_OPENSSL_SSK +int __init init_sk_module(void); +void cleanup_sk_module(void); +#else +static inline int init_sk_module(void) { return 0; } +static inline void cleanup_sk_module(void) { return; } +#endif /* HAVE_OPENSSL_SSK */ + +/* debug */ +static inline +void __dbg_memdump(char *name, void *ptr, int size) +{ + char *buf, *p = (char *) ptr; + int bufsize = size * 2 + 1, i; + + OBD_ALLOC(buf, bufsize); + if (!buf) { + CDEBUG(D_ERROR, "DUMP ERROR: can't alloc %d bytes\n", bufsize); + return; + } + + for (i = 0; i < size; i++) + sprintf(&buf[i+i], "%02x", (__u8) p[i]); + buf[size + size] = '\0'; + LCONSOLE_INFO("DUMP %s@%p(%d): %s\n", name, ptr, size, buf); + OBD_FREE(buf, bufsize); +} + +static inline unsigned int ll_read_key_usage(struct key *key) +{ +#ifdef HAVE_KEY_USAGE_REFCOUNT + return refcount_read(&key->usage); +#else + return atomic_read(&key->usage); +#endif +} + +#endif /* __PTLRPC_GSS_GSS_INTERNAL_H_ */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_keyring.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_keyring.c new file mode 100644 index 0000000000000..124ebe1dc15f7 --- /dev/null +++ 
b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_keyring.c @@ -0,0 +1,1652 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ptlrpc/gss/gss_keyring.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +#ifdef HAVE_GET_REQUEST_KEY_AUTH +#include +#endif + +static struct ptlrpc_sec_policy gss_policy_keyring; +static struct ptlrpc_ctx_ops gss_keyring_ctxops; +static struct key_type gss_key_type; + +static int sec_install_rctx_kr(struct ptlrpc_sec *sec, + struct ptlrpc_svc_ctx *svc_ctx); +static void request_key_unlink(struct key *key); + +/* + * the timeout is only for the case that upcall child process die abnormally. + * in any other cases it should finally update kernel key. + * + * FIXME we'd better to incorporate the client & server side upcall timeouts + * into the framework of Adaptive Timeouts, but we need to figure out how to + * make sure that kernel knows the upcall processes is in-progress or died + * unexpectedly. 
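[Editorial sketch, not part of the patch.] For orientation: the keyring flavor obtains contexts through the kernel's request_key() mechanism, which runs /sbin/request-key to invoke the lgss_keyring userspace helper, which in turn performs the init RPC shown earlier and instantiates the key. The core call looks roughly like the sketch below; the callout string is a placeholder (the real one carries flavor/import information), and construct_key_desc() is defined later in this file:

/* Hypothetical sketch of the upcall entry point the sec layer uses. */
static struct key *demo_gss_upcall(struct ptlrpc_sec *sec, uid_t uid)
{
	char desc[32];

	/* same "<uid>@<sec id in hex>" form construct_key_desc() builds */
	construct_key_desc(desc, sizeof(desc), sec, uid);

	return request_key(&gss_key_type, desc, "gss upcall (placeholder)");
}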
+ */ +#define KEYRING_UPCALL_TIMEOUT (obd_timeout + obd_timeout) + +/* Check caller's namespace in gss_keyring upcall */ +unsigned int gss_check_upcall_ns = 1; + +/**************************************** + * internal helpers * + ****************************************/ + +static inline void keyring_upcall_lock(struct gss_sec_keyring *gsec_kr) +{ +#ifdef HAVE_KEYRING_UPCALL_SERIALIZED + mutex_lock(&gsec_kr->gsk_uc_lock); +#endif +} + +static inline void keyring_upcall_unlock(struct gss_sec_keyring *gsec_kr) +{ +#ifdef HAVE_KEYRING_UPCALL_SERIALIZED + mutex_unlock(&gsec_kr->gsk_uc_lock); +#endif +} + +static inline void key_revoke_locked(struct key *key) +{ + set_bit(KEY_FLAG_REVOKED, &key->flags); +} + +static void ctx_upcall_timeout_kr(cfs_timer_cb_arg_t data) +{ + struct gss_cli_ctx_keyring *gctx_kr = cfs_from_timer(gctx_kr, + data, gck_timer); + struct ptlrpc_cli_ctx *ctx = &(gctx_kr->gck_base.gc_base); + struct key *key = gctx_kr->gck_key; + + CWARN("ctx %p, key %p\n", ctx, key); + + LASSERT(key); + + cli_ctx_expire(ctx); + key_revoke_locked(key); +} + +static void ctx_start_timer_kr(struct ptlrpc_cli_ctx *ctx, time64_t timeout) +{ + struct gss_cli_ctx_keyring *gctx_kr = ctx2gctx_keyring(ctx); + struct timer_list *timer = &gctx_kr->gck_timer; + + LASSERT(timer); + + CDEBUG(D_SEC, "ctx %p: start timer %llds\n", ctx, timeout); + + cfs_timer_setup(timer, ctx_upcall_timeout_kr, + (unsigned long)gctx_kr, 0); + timer->expires = cfs_time_seconds(timeout) + jiffies; + add_timer(timer); +} + +/* + * caller should make sure no race with other threads + */ +static +void ctx_clear_timer_kr(struct ptlrpc_cli_ctx *ctx) +{ + struct gss_cli_ctx_keyring *gctx_kr = ctx2gctx_keyring(ctx); + struct timer_list *timer = &gctx_kr->gck_timer; + + CDEBUG(D_SEC, "ctx %p, key %p\n", ctx, gctx_kr->gck_key); + + del_singleshot_timer_sync(timer); +} + +static +struct ptlrpc_cli_ctx *ctx_create_kr(struct ptlrpc_sec *sec, + struct vfs_cred *vcred) +{ + struct ptlrpc_cli_ctx *ctx; + struct gss_cli_ctx_keyring *gctx_kr; + + OBD_ALLOC_PTR(gctx_kr); + if (gctx_kr == NULL) + return NULL; + + cfs_timer_setup(&gctx_kr->gck_timer, NULL, 0, 0); + + ctx = &gctx_kr->gck_base.gc_base; + + if (gss_cli_ctx_init_common(sec, ctx, &gss_keyring_ctxops, vcred)) { + OBD_FREE_PTR(gctx_kr); + return NULL; + } + + ctx->cc_expire = ktime_get_real_seconds() + KEYRING_UPCALL_TIMEOUT; + clear_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags); + atomic_inc(&ctx->cc_refcount); /* for the caller */ + + return ctx; +} + +static void ctx_destroy_kr(struct ptlrpc_cli_ctx *ctx) +{ + struct ptlrpc_sec *sec = ctx->cc_sec; + struct gss_cli_ctx_keyring *gctx_kr = ctx2gctx_keyring(ctx); + + CDEBUG(D_SEC, "destroying ctx %p\n", ctx); + + /* at this time the association with key has been broken. 
*/ + LASSERT(sec); + LASSERT(atomic_read(&sec->ps_refcount) > 0); + LASSERT(atomic_read(&sec->ps_nctx) > 0); + LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags) == 0); + LASSERT(gctx_kr->gck_key == NULL); + + ctx_clear_timer_kr(ctx); + + if (gss_cli_ctx_fini_common(sec, ctx)) + return; + + OBD_FREE_PTR(gctx_kr); + + atomic_dec(&sec->ps_nctx); + sptlrpc_sec_put(sec); +} + +static void ctx_release_kr(struct ptlrpc_cli_ctx *ctx, int sync) +{ + if (sync) { + ctx_destroy_kr(ctx); + } else { + atomic_inc(&ctx->cc_refcount); + sptlrpc_gc_add_ctx(ctx); + } +} + +static void ctx_put_kr(struct ptlrpc_cli_ctx *ctx, int sync) +{ + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + + if (atomic_dec_and_test(&ctx->cc_refcount)) + ctx_release_kr(ctx, sync); +} + +/* + * key <-> ctx association and rules: + * - ctx might not bind with any key + * - key/ctx binding is protected by key semaphore (if the key present) + * - key and ctx each take a reference of the other + * - ctx enlist/unlist is protected by ctx spinlock + * - never enlist a ctx after it's been unlisted + * - whoever do enlist should also do bind, lock key before enlist: + * - lock key -> lock ctx -> enlist -> unlock ctx -> bind -> unlock key + * - whoever do unlist should also do unbind: + * - lock key -> lock ctx -> unlist -> unlock ctx -> unbind -> unlock key + * - lock ctx -> unlist -> unlock ctx -> lock key -> unbind -> unlock key + */ + +static inline void spin_lock_if(spinlock_t *lock, int condition) +{ + if (condition) + spin_lock(lock); +} + +static inline void spin_unlock_if(spinlock_t *lock, int condition) +{ + if (condition) + spin_unlock(lock); +} + +static void ctx_enlist_kr(struct ptlrpc_cli_ctx *ctx, int is_root, int locked) +{ + struct ptlrpc_sec *sec = ctx->cc_sec; + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + + LASSERT(!test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags)); + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + + spin_lock_if(&sec->ps_lock, !locked); + + atomic_inc(&ctx->cc_refcount); + set_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags); + hlist_add_head(&ctx->cc_cache, &gsec_kr->gsk_clist); + if (is_root) + gsec_kr->gsk_root_ctx = ctx; + + spin_unlock_if(&sec->ps_lock, !locked); +} + +/* + * Note after this get called, caller should not access ctx again because + * it might have been freed, unless caller hold at least one refcount of + * the ctx. + * + * return non-zero if we indeed unlist this ctx. + */ +static int ctx_unlist_kr(struct ptlrpc_cli_ctx *ctx, int locked) +{ + struct ptlrpc_sec *sec = ctx->cc_sec; + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + + /* if hashed bit has gone, leave the job to somebody who is doing it */ + if (test_and_clear_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags) == 0) + return 0; + + /* drop ref inside spin lock to prevent race with other operations */ + spin_lock_if(&sec->ps_lock, !locked); + + if (gsec_kr->gsk_root_ctx == ctx) + gsec_kr->gsk_root_ctx = NULL; + hlist_del_init(&ctx->cc_cache); + atomic_dec(&ctx->cc_refcount); + + spin_unlock_if(&sec->ps_lock, !locked); + + return 1; +} + +/* + * Get specific payload. Newer kernels support 4 slots. + */ +static void * +key_get_payload(struct key *key, unsigned int index) +{ + void *key_ptr = NULL; + +#ifdef HAVE_KEY_PAYLOAD_DATA_ARRAY + key_ptr = key->payload.data[index]; +#else + if (!index) + key_ptr = key->payload.data; +#endif + return key_ptr; +} + +/* + * Set specific payload. Newer kernels support 4 slots. 
+ */ +static int key_set_payload(struct key *key, unsigned int index, + struct ptlrpc_cli_ctx *ctx) +{ + int rc = -EINVAL; + +#ifdef HAVE_KEY_PAYLOAD_DATA_ARRAY + if (index < 4) { + key->payload.data[index] = ctx; +#else + if (!index) { + key->payload.data = ctx; +#endif + rc = 0; + } + return rc; +} + +/* + * bind a key with a ctx together. + * caller must hold write lock of the key, as well as ref on key & ctx. + */ +static void bind_key_ctx(struct key *key, struct ptlrpc_cli_ctx *ctx) +{ + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + LASSERT(ll_read_key_usage(key) > 0); + LASSERT(ctx2gctx_keyring(ctx)->gck_key == NULL); + LASSERT(!key_get_payload(key, 0)); + + /* at this time context may or may not in list. */ + key_get(key); + atomic_inc(&ctx->cc_refcount); + ctx2gctx_keyring(ctx)->gck_key = key; + LASSERT(!key_set_payload(key, 0, ctx)); +} + +/* + * unbind a key and a ctx. + * caller must hold write lock, as well as a ref of the key. + */ +static void unbind_key_ctx(struct key *key, struct ptlrpc_cli_ctx *ctx) +{ + LASSERT(key_get_payload(key, 0) == ctx); + LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags) == 0); + + /* must revoke the key, or others may treat it as newly created */ + key_revoke_locked(key); + + key_set_payload(key, 0, NULL); + ctx2gctx_keyring(ctx)->gck_key = NULL; + + /* once ctx get split from key, the timer is meaningless */ + ctx_clear_timer_kr(ctx); + + ctx_put_kr(ctx, 1); + key_put(key); +} + +/* + * given a ctx, unbind with its coupled key, if any. + * unbind could only be called once, so we don't worry the key be released + * by someone else. + */ +static void unbind_ctx_kr(struct ptlrpc_cli_ctx *ctx) +{ + struct key *key = ctx2gctx_keyring(ctx)->gck_key; + + if (key) { + LASSERT(key_get_payload(key, 0) == ctx); + + key_get(key); + down_write(&key->sem); + unbind_key_ctx(key, ctx); + up_write(&key->sem); + key_put(key); + request_key_unlink(key); + } +} + +/* + * given a key, unbind with its coupled ctx, if any. + * caller must hold write lock, as well as a ref of the key. + */ +static void unbind_key_locked(struct key *key) +{ + struct ptlrpc_cli_ctx *ctx = key_get_payload(key, 0); + + if (ctx) + unbind_key_ctx(key, ctx); +} + +/* + * unlist a ctx, and unbind from coupled key + */ +static void kill_ctx_kr(struct ptlrpc_cli_ctx *ctx) +{ + if (ctx_unlist_kr(ctx, 0)) + unbind_ctx_kr(ctx); +} + +/* + * given a key, unlist and unbind with the coupled ctx (if any). + * caller must hold write lock, as well as a ref of the key. + */ +static void kill_key_locked(struct key *key) +{ + struct ptlrpc_cli_ctx *ctx = key_get_payload(key, 0); + + if (ctx && ctx_unlist_kr(ctx, 0)) + unbind_key_locked(key); +} + +/* + * caller should hold one ref on contexts in freelist. + */ +static void dispose_ctx_list_kr(struct hlist_head *freelist) +{ + struct hlist_node *next; + struct ptlrpc_cli_ctx *ctx; + struct gss_cli_ctx *gctx; + + hlist_for_each_entry_safe(ctx, next, freelist, cc_cache) { + hlist_del_init(&ctx->cc_cache); + + /* reverse ctx: update current seq to buddy svcctx if exist. + * ideally this should be done at gss_cli_ctx_finalize(), but + * the ctx destroy could be delayed by: + * 1) ctx still has reference; + * 2) ctx destroy is asynchronous; + * and reverse import call inval_all_ctx() require this be done + * _immediately_ otherwise newly created reverse ctx might copy + * the very old sequence number from svcctx. 
 */
+ gctx = ctx2gctx(ctx);
+ if (!rawobj_empty(&gctx->gc_svc_handle) &&
+ sec_is_reverse(gctx->gc_base.cc_sec)) {
+ gss_svc_upcall_update_sequence(&gctx->gc_svc_handle,
+ (__u32) atomic_read(&gctx->gc_seq));
+ }
+
+ /* we need to wake up waiting reqs here. the context might
+ * be force-released before the upcall finishes; then the
+ * late-arriving downcall can't find the ctx at all. */
+ sptlrpc_cli_ctx_wakeup(ctx);
+
+ unbind_ctx_kr(ctx);
+ ctx_put_kr(ctx, 0);
+ }
+}
+
+/*
+ * lookup a root context directly in a sec, return root ctx with a
+ * reference taken or NULL.
+ */
+static
+struct ptlrpc_cli_ctx * sec_lookup_root_ctx_kr(struct ptlrpc_sec *sec)
+{
+ struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec);
+ struct ptlrpc_cli_ctx *ctx = NULL;
+
+ spin_lock(&sec->ps_lock);
+
+ ctx = gsec_kr->gsk_root_ctx;
+
+ if (ctx == NULL && unlikely(sec_is_reverse(sec))) {
+ struct ptlrpc_cli_ctx *tmp;
+
+ /* reverse ctx, search root ctx in list, choose the one
+ * with the shortest expire time, which most probably has
+ * an established peer ctx at the client side. */
+ hlist_for_each_entry(tmp, &gsec_kr->gsk_clist, cc_cache) {
+ if (ctx == NULL || ctx->cc_expire == 0 ||
+ ctx->cc_expire > tmp->cc_expire) {
+ ctx = tmp;
+ /* promote to be root_ctx */
+ gsec_kr->gsk_root_ctx = ctx;
+ }
+ }
+ }
+
+ if (ctx) {
+ LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+ LASSERT(!hlist_empty(&gsec_kr->gsk_clist));
+ atomic_inc(&ctx->cc_refcount);
+ }
+
+ spin_unlock(&sec->ps_lock);
+
+ return ctx;
+}
+
+#define RVS_CTX_EXPIRE_NICE (10)
+
+static
+void rvs_sec_install_root_ctx_kr(struct ptlrpc_sec *sec,
+ struct ptlrpc_cli_ctx *new_ctx,
+ struct key *key)
+{
+ struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec);
+ struct ptlrpc_cli_ctx *ctx;
+ time64_t now;
+
+ ENTRY;
+ LASSERT(sec_is_reverse(sec));
+
+ spin_lock(&sec->ps_lock);
+
+ now = ktime_get_real_seconds();
+
+ /* set all existing ctxs short expiry */
+ hlist_for_each_entry(ctx, &gsec_kr->gsk_clist, cc_cache) {
+ if (ctx->cc_expire > now + RVS_CTX_EXPIRE_NICE) {
+ ctx->cc_early_expire = 1;
+ ctx->cc_expire = now + RVS_CTX_EXPIRE_NICE;
+ }
+ }
+
+ /* if there's a root_ctx, instead of obsoleting the current one
+ * immediately, we let it continue operating for a little while.
+ * hopefully by the time the first backward rpc with the newest ctx
+ * is sent out, the client side will already have the peer ctx well
+ * established. */
+ ctx_enlist_kr(new_ctx, gsec_kr->gsk_root_ctx ? 
0 : 1, 1); + + if (key) + bind_key_ctx(key, new_ctx); + + spin_unlock(&sec->ps_lock); +} + +static void construct_key_desc(void *buf, int bufsize, + struct ptlrpc_sec *sec, uid_t uid) +{ + snprintf(buf, bufsize, "%d@%x", uid, sec->ps_id); + ((char *)buf)[bufsize - 1] = '\0'; +} + +/**************************************** + * sec apis * + ****************************************/ + +static +struct ptlrpc_sec * gss_sec_create_kr(struct obd_import *imp, + struct ptlrpc_svc_ctx *svcctx, + struct sptlrpc_flavor *sf) +{ + struct gss_sec_keyring *gsec_kr; + ENTRY; + + OBD_ALLOC(gsec_kr, sizeof(*gsec_kr)); + if (gsec_kr == NULL) + RETURN(NULL); + + INIT_HLIST_HEAD(&gsec_kr->gsk_clist); + gsec_kr->gsk_root_ctx = NULL; + mutex_init(&gsec_kr->gsk_root_uc_lock); +#ifdef HAVE_KEYRING_UPCALL_SERIALIZED + mutex_init(&gsec_kr->gsk_uc_lock); +#endif + + if (gss_sec_create_common(&gsec_kr->gsk_base, &gss_policy_keyring, + imp, svcctx, sf)) + goto err_free; + + if (svcctx != NULL && + sec_install_rctx_kr(&gsec_kr->gsk_base.gs_base, svcctx)) { + gss_sec_destroy_common(&gsec_kr->gsk_base); + goto err_free; + } + + RETURN(&gsec_kr->gsk_base.gs_base); + +err_free: + OBD_FREE(gsec_kr, sizeof(*gsec_kr)); + RETURN(NULL); +} + +static +void gss_sec_destroy_kr(struct ptlrpc_sec *sec) +{ + struct gss_sec *gsec = sec2gsec(sec); + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + + CDEBUG(D_SEC, "destroy %s@%p\n", sec->ps_policy->sp_name, sec); + + LASSERT(hlist_empty(&gsec_kr->gsk_clist)); + LASSERT(gsec_kr->gsk_root_ctx == NULL); + + gss_sec_destroy_common(gsec); + + OBD_FREE(gsec_kr, sizeof(*gsec_kr)); +} + +static inline int user_is_root(struct ptlrpc_sec *sec, struct vfs_cred *vcred) +{ + /* except the ROOTONLY flag, treat it as root user only if real uid + * is 0, euid/fsuid being 0 are handled as setuid scenarios */ + if (sec_is_rootonly(sec) || (vcred->vc_uid == 0)) + return 1; + else + return 0; +} + +/* + * kernel 5.3: commit 0f44e4d976f96c6439da0d6717238efa4b91196e + * keys: Move the user and user-session keyrings to the user_namespace + * + * When lookup_user_key is available use the kernel API rather than directly + * accessing the uid_keyring and session_keyring via the current process + * credentials. + */ +#ifdef HAVE_LOOKUP_USER_KEY + +/* from Linux security/keys/internal.h: */ +#ifndef KEY_LOOKUP_FOR_UNLINK +#define KEY_LOOKUP_FOR_UNLINK 0x04 +#endif + +static struct key *_user_key(key_serial_t id) +{ + key_ref_t ref; + + might_sleep(); + ref = lookup_user_key(id, KEY_LOOKUP_FOR_UNLINK, 0); + if (IS_ERR(ref)) + return NULL; + return key_ref_to_ptr(ref); +} + +static inline struct key *get_user_session_keyring(const struct cred *cred) +{ + return _user_key(KEY_SPEC_USER_SESSION_KEYRING); +} + +static inline struct key *get_user_keyring(const struct cred *cred) +{ + return _user_key(KEY_SPEC_USER_KEYRING); +} +#else +static inline struct key *get_user_session_keyring(const struct cred *cred) +{ + return key_get(cred->user->session_keyring); +} + +static inline struct key *get_user_keyring(const struct cred *cred) +{ + return key_get(cred->user->uid_keyring); +} +#endif + +/* + * unlink request key from it's ring, which is linked during request_key(). + * sadly, we have to 'guess' which keyring it's linked to. + * + * FIXME this code is fragile, it depends on how request_key() is implemented. 
+ */ +static void request_key_unlink(struct key *key) +{ + const struct cred *cred = current_cred(); + struct key *ring = NULL; + + switch (cred->jit_keyring) { + case KEY_REQKEY_DEFL_DEFAULT: + case KEY_REQKEY_DEFL_REQUESTOR_KEYRING: +#ifdef HAVE_GET_REQUEST_KEY_AUTH + if (cred->request_key_auth) { + struct request_key_auth *rka; + struct key *authkey = cred->request_key_auth; + + down_read(&authkey->sem); + rka = get_request_key_auth(authkey); + if (!test_bit(KEY_FLAG_REVOKED, &authkey->flags)) + ring = key_get(rka->dest_keyring); + up_read(&authkey->sem); + if (ring) + break; + } +#endif + fallthrough; + case KEY_REQKEY_DEFL_THREAD_KEYRING: + ring = key_get(cred->thread_keyring); + if (ring) + break; + fallthrough; + case KEY_REQKEY_DEFL_PROCESS_KEYRING: + ring = key_get(cred->process_keyring); + if (ring) + break; + fallthrough; + case KEY_REQKEY_DEFL_SESSION_KEYRING: + rcu_read_lock(); + ring = key_get(rcu_dereference(cred->session_keyring)); + rcu_read_unlock(); + if (ring) + break; + fallthrough; + case KEY_REQKEY_DEFL_USER_SESSION_KEYRING: + ring = get_user_session_keyring(cred); + break; + case KEY_REQKEY_DEFL_USER_KEYRING: + ring = get_user_keyring(cred); + break; + case KEY_REQKEY_DEFL_GROUP_KEYRING: + default: + LBUG(); + } + + LASSERT(ring); + key_unlink(ring, key); + key_put(ring); +} + +static +struct ptlrpc_cli_ctx * gss_sec_lookup_ctx_kr(struct ptlrpc_sec *sec, + struct vfs_cred *vcred, + int create, int remove_dead) +{ + struct obd_import *imp = sec->ps_import; + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + struct ptlrpc_cli_ctx *ctx = NULL; + unsigned int is_root = 0, create_new = 0; + struct key *key; + char desc[24]; + char *coinfo; + int coinfo_size; + const char *sec_part_flags = ""; + char svc_flag = '-'; + pid_t caller_pid; + ENTRY; + + LASSERT(imp != NULL); + + is_root = user_is_root(sec, vcred); + + /* a little bit optimization for root context */ + if (is_root) { + ctx = sec_lookup_root_ctx_kr(sec); + /* + * Only lookup directly for REVERSE sec, which should + * always succeed. + */ + if (ctx || sec_is_reverse(sec)) + RETURN(ctx); + } + + LASSERT(create != 0); + + /* for root context, obtain lock and check again, this time hold + * the root upcall lock, make sure nobody else populated new root + * context after last check. + */ + if (is_root) { + mutex_lock(&gsec_kr->gsk_root_uc_lock); + + ctx = sec_lookup_root_ctx_kr(sec); + if (ctx) + goto out; + + /* update reverse handle for root user */ + sec2gsec(sec)->gs_rvs_hdl = gss_get_next_ctx_index(); + + switch (sec->ps_part) { + case LUSTRE_SP_MDT: + sec_part_flags = "m"; + break; + case LUSTRE_SP_OST: + sec_part_flags = "o"; + break; + case LUSTRE_SP_MGC: + sec_part_flags = "rmo"; + break; + case LUSTRE_SP_CLI: + sec_part_flags = "r"; + break; + case LUSTRE_SP_MGS: + default: + LBUG(); + } + + switch (SPTLRPC_FLVR_SVC(sec->ps_flvr.sf_rpc)) { + case SPTLRPC_SVC_NULL: + svc_flag = 'n'; + break; + case SPTLRPC_SVC_AUTH: + svc_flag = 'a'; + break; + case SPTLRPC_SVC_INTG: + svc_flag = 'i'; + break; + case SPTLRPC_SVC_PRIV: + svc_flag = 'p'; + break; + default: + LBUG(); + } + } + + /* in case of setuid, key will be constructed as owner of fsuid/fsgid, + * but we do authentication based on real uid/gid. the key permission + * bits will be exactly as POS_ALL, so only processes who subscribed + * this key could have the access, although the quota might be counted + * on others (fsuid/fsgid). + * + * keyring will use fsuid/fsgid as upcall parameters, so we have to + * encode real uid/gid into callout info. 
+ */
+
+ /* But first we need to make sure the obd type is supported */
+ if (strcmp(imp->imp_obd->obd_type->typ_name, LUSTRE_MDC_NAME) &&
+ strcmp(imp->imp_obd->obd_type->typ_name, LUSTRE_OSC_NAME) &&
+ strcmp(imp->imp_obd->obd_type->typ_name, LUSTRE_MGC_NAME) &&
+ strcmp(imp->imp_obd->obd_type->typ_name, LUSTRE_LWP_NAME) &&
+ strcmp(imp->imp_obd->obd_type->typ_name, LUSTRE_OSP_NAME)) {
+ CERROR("obd %s is not a supported device\n",
+ imp->imp_obd->obd_name);
+ GOTO(out, ctx = NULL);
+ }
+
+ construct_key_desc(desc, sizeof(desc), sec, vcred->vc_uid);
+
+ /* callout info format:
+ * secid:mech:uid:gid:sec_flags:svc_flag:svc_type:peer_nid:target_uuid:
+ * self_nid:pid
+ */
+ coinfo_size = sizeof(struct obd_uuid) + MAX_OBD_NAME + 64;
+ OBD_ALLOC(coinfo, coinfo_size);
+ if (coinfo == NULL)
+ goto out;
+
+ /* Last callout parameter is pid of process whose namespace will be used
+ * for credentials' retrieval.
+ */
+ if (gss_check_upcall_ns) {
+ /* For user's credentials (in which case sec_part_flags is
+ * empty), use current PID instead of import's reference
+ * PID to get reference namespace.
+ */
+ if (sec_part_flags[0] == '\0')
+ caller_pid = current->pid;
+ else
+ caller_pid = imp->imp_sec_refpid;
+ } else {
+ /* Do not switch namespace in gss keyring upcall. */
+ caller_pid = 0;
+ }
+ snprintf(coinfo, coinfo_size, "%d:%s:%u:%u:%s:%c:%d:%#llx:%s:%#llx:%d",
+ sec->ps_id, sec2gsec(sec)->gs_mech->gm_name,
+ vcred->vc_uid, vcred->vc_gid,
+ sec_part_flags, svc_flag, import_to_gss_svc(imp),
+ lnet_nid_to_nid4(&imp->imp_connection->c_peer.nid),
+ imp->imp_obd->obd_name,
+ LNetPrimaryNID(lnet_nid_to_nid4(&imp->imp_connection->c_self)),
+ caller_pid);
+
+ CDEBUG(D_SEC, "requesting key for %s\n", desc);
+
+ keyring_upcall_lock(gsec_kr);
+ key = request_key(&gss_key_type, desc, coinfo);
+ keyring_upcall_unlock(gsec_kr);
+
+ OBD_FREE(coinfo, coinfo_size);
+
+ if (IS_ERR(key)) {
+ CERROR("failed to request key: %ld\n", PTR_ERR(key));
+ goto out;
+ }
+ CDEBUG(D_SEC, "obtained key %08x for %s\n", key->serial, desc);
+
+ /* once payload.data was pointed to a ctx, it never changes until
+ * we de-associate them; but a parallel request_key() may return
+ * a key with payload.data == NULL at the same time. so we still
+ * need the write lock of key->sem to serialize them.
+ */
+ down_write(&key->sem);
+
+ ctx = key_get_payload(key, 0);
+ if (likely(ctx)) {
+ LASSERT(atomic_read(&ctx->cc_refcount) >= 1);
+ LASSERT(ctx2gctx_keyring(ctx)->gck_key == key);
+ LASSERT(ll_read_key_usage(key) >= 2);
+
+ /* simply take a ref and return. it's the upper layer's
+ * responsibility to detect & replace dead ctx.
+ */
+ atomic_inc(&ctx->cc_refcount);
+ } else {
+ /* pre-initialize with a cli_ctx. this can't be done in
+ * key_instantiate() because we don't have enough information
+ * there.
+ */
+ ctx = ctx_create_kr(sec, vcred);
+ if (ctx != NULL) {
+ ctx_enlist_kr(ctx, is_root, 0);
+ bind_key_ctx(key, ctx);
+
+ ctx_start_timer_kr(ctx, KEYRING_UPCALL_TIMEOUT);
+
+ CDEBUG(D_SEC, "installed key %p <-> ctx %p (sec %p)\n",
+ key, ctx, sec);
+ } else {
+ /* we'd prefer to call key_revoke(), but we'd rather
+ * revoke it within this key->sem locked period. 
+ */ + key_revoke_locked(key); + } + + create_new = 1; + } + + up_write(&key->sem); + + if (is_root && create_new) + request_key_unlink(key); + + key_put(key); +out: + if (is_root) + mutex_unlock(&gsec_kr->gsk_root_uc_lock); + RETURN(ctx); +} + +static +void gss_sec_release_ctx_kr(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx, + int sync) +{ + LASSERT(atomic_read(&sec->ps_refcount) > 0); + LASSERT(atomic_read(&ctx->cc_refcount) == 0); + ctx_release_kr(ctx, sync); +} + +/* + * flush context of normal user, we must resort to keyring itself to find out + * contexts which belong to me. + * + * Note here we suppose only to flush _my_ context, the "uid" will + * be ignored in the search. + */ +static +void flush_user_ctx_cache_kr(struct ptlrpc_sec *sec, + uid_t uid, + int grace, int force) +{ + struct key *key; + char desc[24]; + + /* nothing to do for reverse or rootonly sec */ + if (sec_is_reverse(sec) || sec_is_rootonly(sec)) + return; + + construct_key_desc(desc, sizeof(desc), sec, uid); + + /* there should be only one valid key, but we put it in the + * loop in case of any weird cases */ + for (;;) { + key = request_key(&gss_key_type, desc, NULL); + if (IS_ERR(key)) { + CDEBUG(D_SEC, "No more key found for current user\n"); + break; + } + + down_write(&key->sem); + + kill_key_locked(key); + + /* kill_key_locked() should usually revoke the key, but we + * revoke it again to make sure, e.g. some case the key may + * not well coupled with a context. */ + key_revoke_locked(key); + + up_write(&key->sem); + + request_key_unlink(key); + + key_put(key); + } +} + +/* + * flush context of root or all, we iterate through the list. + */ +static +void flush_spec_ctx_cache_kr(struct ptlrpc_sec *sec, uid_t uid, int grace, + int force) +{ + struct gss_sec_keyring *gsec_kr; + struct hlist_head freelist = HLIST_HEAD_INIT; + struct hlist_node *next; + struct ptlrpc_cli_ctx *ctx; + ENTRY; + + gsec_kr = sec2gsec_keyring(sec); + + spin_lock(&sec->ps_lock); + hlist_for_each_entry_safe(ctx, next, &gsec_kr->gsk_clist, + cc_cache) { + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + + if (uid != -1 && uid != ctx->cc_vcred.vc_uid) + continue; + + /* at this moment there's at least 2 base reference: + * key association and in-list. 
*/ + if (atomic_read(&ctx->cc_refcount) > 2) { + if (!force) + continue; + CWARN("flush busy ctx %p(%u->%s, extra ref %d)\n", + ctx, ctx->cc_vcred.vc_uid, + sec2target_str(ctx->cc_sec), + atomic_read(&ctx->cc_refcount) - 2); + } + + set_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags); + if (!grace) + clear_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags); + + atomic_inc(&ctx->cc_refcount); + + if (ctx_unlist_kr(ctx, 1)) { + hlist_add_head(&ctx->cc_cache, &freelist); + } else { + LASSERT(atomic_read(&ctx->cc_refcount) >= 2); + atomic_dec(&ctx->cc_refcount); + } + } + spin_unlock(&sec->ps_lock); + + dispose_ctx_list_kr(&freelist); + EXIT; +} + +static +int gss_sec_flush_ctx_cache_kr(struct ptlrpc_sec *sec, + uid_t uid, int grace, int force) +{ + ENTRY; + + CDEBUG(D_SEC, "sec %p(%d, nctx %d), uid %d, grace %d, force %d\n", + sec, atomic_read(&sec->ps_refcount), + atomic_read(&sec->ps_nctx), + uid, grace, force); + + if (uid != -1 && uid != 0) + flush_user_ctx_cache_kr(sec, uid, grace, force); + else + flush_spec_ctx_cache_kr(sec, uid, grace, force); + + RETURN(0); +} + +static +void gss_sec_gc_ctx_kr(struct ptlrpc_sec *sec) +{ + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + struct hlist_head freelist = HLIST_HEAD_INIT; + struct hlist_node *next; + struct ptlrpc_cli_ctx *ctx; + ENTRY; + + CWARN("running gc\n"); + + spin_lock(&sec->ps_lock); + hlist_for_each_entry_safe(ctx, next, &gsec_kr->gsk_clist, + cc_cache) { + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + + atomic_inc(&ctx->cc_refcount); + + if (cli_ctx_check_death(ctx) && ctx_unlist_kr(ctx, 1)) { + hlist_add_head(&ctx->cc_cache, &freelist); + CWARN("unhashed ctx %p\n", ctx); + } else { + LASSERT(atomic_read(&ctx->cc_refcount) >= 2); + atomic_dec(&ctx->cc_refcount); + } + } + spin_unlock(&sec->ps_lock); + + dispose_ctx_list_kr(&freelist); + EXIT; +} + +static +int gss_sec_display_kr(struct ptlrpc_sec *sec, struct seq_file *seq) +{ + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + struct hlist_node *next; + struct ptlrpc_cli_ctx *ctx; + struct gss_cli_ctx *gctx; + time64_t now = ktime_get_real_seconds(); + + ENTRY; + spin_lock(&sec->ps_lock); + hlist_for_each_entry_safe(ctx, next, &gsec_kr->gsk_clist, + cc_cache) { + struct key *key; + char flags_str[40]; + char mech[40]; + + gctx = ctx2gctx(ctx); + key = ctx2gctx_keyring(ctx)->gck_key; + + gss_cli_ctx_flags2str(ctx->cc_flags, + flags_str, sizeof(flags_str)); + + if (gctx->gc_mechctx) + lgss_display(gctx->gc_mechctx, mech, sizeof(mech)); + else + snprintf(mech, sizeof(mech), "N/A"); + mech[sizeof(mech) - 1] = '\0'; + + seq_printf(seq, + "%p: uid %u, ref %d, expire %lld(%+lld), fl %s, seq %d, win %u, key %08x(ref %d), hdl %#llx:%#llx, mech: %s\n", + ctx, ctx->cc_vcred.vc_uid, + atomic_read(&ctx->cc_refcount), + ctx->cc_expire, + ctx->cc_expire ? ctx->cc_expire - now : 0, + flags_str, + atomic_read(&gctx->gc_seq), + gctx->gc_win, + key ? key->serial : 0, + key ? ll_read_key_usage(key) : 0, + gss_handle_to_u64(&gctx->gc_handle), + gss_handle_to_u64(&gctx->gc_svc_handle), + mech); + } + spin_unlock(&sec->ps_lock); + + RETURN(0); +} + +/**************************************** + * cli_ctx apis * + ****************************************/ + +static +int gss_cli_ctx_refresh_kr(struct ptlrpc_cli_ctx *ctx) +{ + /* upcall is already on the way */ + struct gss_cli_ctx *gctx = ctx ? 
ctx2gctx(ctx) : NULL; + + /* record latest sequence number in buddy svcctx */ + if (gctx && !rawobj_empty(&gctx->gc_svc_handle) && + sec_is_reverse(gctx->gc_base.cc_sec)) { + return gss_svc_upcall_update_sequence(&gctx->gc_svc_handle, + (__u32)atomic_read(&gctx->gc_seq)); + } + return 0; +} + +static +int gss_cli_ctx_validate_kr(struct ptlrpc_cli_ctx *ctx) +{ + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + LASSERT(ctx->cc_sec); + + if (cli_ctx_check_death(ctx)) { + kill_ctx_kr(ctx); + return 1; + } + + if (cli_ctx_is_ready(ctx)) + return 0; + return 1; +} + +static +void gss_cli_ctx_die_kr(struct ptlrpc_cli_ctx *ctx, int grace) +{ + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + LASSERT(ctx->cc_sec); + + cli_ctx_expire(ctx); + kill_ctx_kr(ctx); +} + +/**************************************** + * (reverse) service * + ****************************************/ + +/* + * reverse context could have nothing to do with keyrings. here we still keep + * the version which bind to a key, for future reference. + */ +#define HAVE_REVERSE_CTX_NOKEY + +#ifdef HAVE_REVERSE_CTX_NOKEY + +static +int sec_install_rctx_kr(struct ptlrpc_sec *sec, + struct ptlrpc_svc_ctx *svc_ctx) +{ + struct ptlrpc_cli_ctx *cli_ctx; + struct vfs_cred vcred = { .vc_uid = 0 }; + int rc; + + LASSERT(sec); + LASSERT(svc_ctx); + + cli_ctx = ctx_create_kr(sec, &vcred); + if (cli_ctx == NULL) + return -ENOMEM; + + rc = gss_copy_rvc_cli_ctx(cli_ctx, svc_ctx); + if (rc) { + CERROR("failed copy reverse cli ctx: %d\n", rc); + + ctx_put_kr(cli_ctx, 1); + return rc; + } + + rvs_sec_install_root_ctx_kr(sec, cli_ctx, NULL); + + ctx_put_kr(cli_ctx, 1); + + return 0; +} + +#else /* ! HAVE_REVERSE_CTX_NOKEY */ + +static +int sec_install_rctx_kr(struct ptlrpc_sec *sec, + struct ptlrpc_svc_ctx *svc_ctx) +{ + struct ptlrpc_cli_ctx *cli_ctx = NULL; + struct key *key; + struct vfs_cred vcred = { .vc_uid = 0 }; + char desc[64]; + int rc; + + LASSERT(sec); + LASSERT(svc_ctx); + CWARN("called\n"); + + construct_key_desc(desc, sizeof(desc), sec, 0); + + key = key_alloc(&gss_key_type, desc, 0, 0, + KEY_POS_ALL | KEY_USR_ALL, 1); + if (IS_ERR(key)) { + CERROR("failed to alloc key: %ld\n", PTR_ERR(key)); + return PTR_ERR(key); + } + + rc = key_instantiate_and_link(key, NULL, 0, NULL, NULL); + if (rc) { + CERROR("failed to instantiate key: %d\n", rc); + goto err_revoke; + } + + down_write(&key->sem); + + LASSERT(!key_get_payload(key, 0)); + + cli_ctx = ctx_create_kr(sec, &vcred); + if (cli_ctx == NULL) { + rc = -ENOMEM; + goto err_up; + } + + rc = gss_copy_rvc_cli_ctx(cli_ctx, svc_ctx); + if (rc) { + CERROR("failed copy reverse cli ctx: %d\n", rc); + goto err_put; + } + + rvs_sec_install_root_ctx_kr(sec, cli_ctx, key); + + ctx_put_kr(cli_ctx, 1); + up_write(&key->sem); + + rc = 0; + CWARN("ok!\n"); +out: + key_put(key); + return rc; + +err_put: + ctx_put_kr(cli_ctx, 1); +err_up: + up_write(&key->sem); +err_revoke: + key_revoke(key); + goto out; +} + +#endif /* HAVE_REVERSE_CTX_NOKEY */ + +/**************************************** + * service apis * + ****************************************/ + +static +int gss_svc_accept_kr(struct ptlrpc_request *req) +{ + return gss_svc_accept(&gss_policy_keyring, req); +} + +static +int gss_svc_install_rctx_kr(struct obd_import *imp, + struct ptlrpc_svc_ctx *svc_ctx) +{ + struct ptlrpc_sec *sec; + int rc; + + sec = sptlrpc_import_sec_ref(imp); + LASSERT(sec); + + rc = sec_install_rctx_kr(sec, svc_ctx); + sptlrpc_sec_put(sec); + + return rc; +} + +/**************************************** + * key apis * + 
****************************************/
+
+static
+#ifdef HAVE_KEY_TYPE_INSTANTIATE_2ARGS
+int gss_kt_instantiate(struct key *key, struct key_preparsed_payload *prep)
+{
+ const void *data = prep->data;
+ size_t datalen = prep->datalen;
+#else
+int gss_kt_instantiate(struct key *key, const void *data, size_t datalen)
+{
+#endif
+ int rc;
+ ENTRY;
+
+ if (data != NULL || datalen != 0) {
+ CERROR("invalid: data %p, len %lu\n", data, (long)datalen);
+ RETURN(-EINVAL);
+ }
+
+ if (key_get_payload(key, 0)) {
+ CERROR("key already has payload\n");
+ RETURN(-EINVAL);
+ }
+
+ /* link the key to the session keyring, so the following context
+ * negotiation rpc fired from user space can find this key. This will
+ * be unlinked automatically when upcall processes die.
+ *
+ * we can't do this through keyctl from userspace, because the upcall
+ * might be neither possessor nor owner of the key (setuid).
+ *
+ * the session keyring is created upon upcall, and doesn't change
+ * until the upcall finishes, so the rcu lock is not needed here.
+ */
+ LASSERT(current_cred()->session_keyring);
+
+ lockdep_off();
+ rc = key_link(current_cred()->session_keyring, key);
+ lockdep_on();
+ if (unlikely(rc)) {
+ CERROR("failed to link key %08x to keyring %08x: %d\n",
+ key->serial,
+ current_cred()->session_keyring->serial, rc);
+ RETURN(rc);
+ }
+
+ CDEBUG(D_SEC, "key %p instantiated, ctx %p\n", key,
+ key_get_payload(key, 0));
+ RETURN(0);
+}
+
+/*
+ * called with the key semaphore write locked. it means we can operate
+ * on the context without fear of losing the refcount.
+ */
+static
+#ifdef HAVE_KEY_TYPE_INSTANTIATE_2ARGS
+int gss_kt_update(struct key *key, struct key_preparsed_payload *prep)
+{
+ const void *data = prep->data;
+ __u32 datalen32 = (__u32) prep->datalen;
+#else
+int gss_kt_update(struct key *key, const void *data, size_t datalen)
+{
+ __u32 datalen32 = (__u32) datalen;
+#endif
+ struct ptlrpc_cli_ctx *ctx = key_get_payload(key, 0);
+ struct gss_cli_ctx *gctx;
+ rawobj_t tmpobj = RAWOBJ_EMPTY;
+ int rc;
+ ENTRY;
+
+ if (data == NULL || datalen32 == 0) {
+ CWARN("invalid: data %p, len %lu\n", data, (long)datalen32);
+ RETURN(-EINVAL);
+ }
+
+ /* if the upcall finished negotiation too fast (most likely because
+ * a local error happened) and called kt_update(), the ctx
+ * might still be NULL. but the key will finally be associated
+ * with a context, or be revoked. if the key status is fine, return
+ * -EAGAIN to allow userspace to sleep a while and call again.
+ 
*/ + if (ctx == NULL) { + CDEBUG(D_SEC, "update too soon: key %p(%x) flags %lx\n", + key, key->serial, key->flags); + + rc = key_validate(key); + if (rc == 0) + RETURN(-EAGAIN); + else + RETURN(rc); + } + + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + LASSERT(ctx->cc_sec); + + ctx_clear_timer_kr(ctx); + + /* don't proceed if already refreshed */ + if (cli_ctx_is_refreshed(ctx)) { + CWARN("ctx already done refresh\n"); + RETURN(0); + } + + sptlrpc_cli_ctx_get(ctx); + gctx = ctx2gctx(ctx); + + rc = buffer_extract_bytes(&data, &datalen32, &gctx->gc_win, + sizeof(gctx->gc_win)); + if (rc) { + CERROR("failed extract seq_win\n"); + goto out; + } + + if (gctx->gc_win == 0) { + __u32 nego_rpc_err, nego_gss_err; + + rc = buffer_extract_bytes(&data, &datalen32, &nego_rpc_err, + sizeof(nego_rpc_err)); + if (rc) { + CERROR("cannot extract RPC: rc = %d\n", rc); + goto out; + } + + rc = buffer_extract_bytes(&data, &datalen32, &nego_gss_err, + sizeof(nego_gss_err)); + if (rc) { + CERROR("failed to extract gss rc = %d\n", rc); + goto out; + } + + CERROR("negotiation: rpc err %d, gss err %x\n", + nego_rpc_err, nego_gss_err); + + rc = nego_rpc_err ? nego_rpc_err : -EACCES; + } else { + rc = rawobj_extract_local_alloc(&gctx->gc_handle, + (__u32 **) &data, &datalen32); + if (rc) { + CERROR("failed extract handle\n"); + goto out; + } + + rc = rawobj_extract_local(&tmpobj, + (__u32 **) &data, &datalen32); + if (rc) { + CERROR("failed extract mech\n"); + goto out; + } + + rc = lgss_import_sec_context(&tmpobj, + sec2gsec(ctx->cc_sec)->gs_mech, + &gctx->gc_mechctx); + if (rc != GSS_S_COMPLETE) + CERROR("failed import context\n"); + else + rc = 0; + } +out: + /* we don't care what current status of this ctx, even someone else + * is operating on the ctx at the same time. we just add up our own + * opinions here. */ + if (rc == 0) { + gss_cli_ctx_uptodate(gctx); + } else { + /* this will also revoke the key. has to be done before + * wakeup waiters otherwise they can find the stale key */ + kill_key_locked(key); + + cli_ctx_expire(ctx); + + if (rc != -ERESTART) + set_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags); + } + + /* let user space think it's a success */ + sptlrpc_cli_ctx_put(ctx, 1); + RETURN(0); +} + +#ifndef HAVE_KEY_MATCH_DATA +static int +gss_kt_match(const struct key *key, const void *desc) +{ + return strcmp(key->description, (const char *) desc) == 0 && + !test_bit(KEY_FLAG_REVOKED, &key->flags); +} +#else /* ! HAVE_KEY_MATCH_DATA */ +static bool +gss_kt_match(const struct key *key, const struct key_match_data *match_data) +{ + const char *desc = match_data->raw_data; + + return strcmp(key->description, desc) == 0 && + !test_bit(KEY_FLAG_REVOKED, &key->flags); +} + +/* + * Preparse the match criterion. 
+ */ +static int gss_kt_match_preparse(struct key_match_data *match_data) +{ + match_data->lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT; + match_data->cmp = gss_kt_match; + return 0; +} +#endif /* HAVE_KEY_MATCH_DATA */ + +static +void gss_kt_destroy(struct key *key) +{ + ENTRY; + LASSERT(!key_get_payload(key, 0)); + CDEBUG(D_SEC, "destroy key %p\n", key); + EXIT; +} + +static +void gss_kt_describe(const struct key *key, struct seq_file *s) +{ + if (key->description == NULL) + seq_puts(s, "[null]"); + else + seq_puts(s, key->description); +} + +static struct key_type gss_key_type = +{ + .name = "lgssc", + .def_datalen = 0, + .instantiate = gss_kt_instantiate, + .update = gss_kt_update, +#ifdef HAVE_KEY_MATCH_DATA + .match_preparse = gss_kt_match_preparse, +#else + .match = gss_kt_match, +#endif + .destroy = gss_kt_destroy, + .describe = gss_kt_describe, +}; + +/**************************************** + * lustre gss keyring policy * + ****************************************/ + +static struct ptlrpc_ctx_ops gss_keyring_ctxops = { + .match = gss_cli_ctx_match, + .refresh = gss_cli_ctx_refresh_kr, + .validate = gss_cli_ctx_validate_kr, + .die = gss_cli_ctx_die_kr, + .sign = gss_cli_ctx_sign, + .verify = gss_cli_ctx_verify, + .seal = gss_cli_ctx_seal, + .unseal = gss_cli_ctx_unseal, + .wrap_bulk = gss_cli_ctx_wrap_bulk, + .unwrap_bulk = gss_cli_ctx_unwrap_bulk, +}; + +static struct ptlrpc_sec_cops gss_sec_keyring_cops = { + .create_sec = gss_sec_create_kr, + .destroy_sec = gss_sec_destroy_kr, + .kill_sec = gss_sec_kill, + .lookup_ctx = gss_sec_lookup_ctx_kr, + .release_ctx = gss_sec_release_ctx_kr, + .flush_ctx_cache = gss_sec_flush_ctx_cache_kr, + .gc_ctx = gss_sec_gc_ctx_kr, + .install_rctx = gss_sec_install_rctx, + .alloc_reqbuf = gss_alloc_reqbuf, + .free_reqbuf = gss_free_reqbuf, + .alloc_repbuf = gss_alloc_repbuf, + .free_repbuf = gss_free_repbuf, + .enlarge_reqbuf = gss_enlarge_reqbuf, + .display = gss_sec_display_kr, +}; + +static struct ptlrpc_sec_sops gss_sec_keyring_sops = { + .accept = gss_svc_accept_kr, + .invalidate_ctx = gss_svc_invalidate_ctx, + .alloc_rs = gss_svc_alloc_rs, + .authorize = gss_svc_authorize, + .free_rs = gss_svc_free_rs, + .free_ctx = gss_svc_free_ctx, + .prep_bulk = gss_svc_prep_bulk, + .unwrap_bulk = gss_svc_unwrap_bulk, + .wrap_bulk = gss_svc_wrap_bulk, + .install_rctx = gss_svc_install_rctx_kr, +}; + +static struct ptlrpc_sec_policy gss_policy_keyring = { + .sp_owner = THIS_MODULE, + .sp_name = "gss.keyring", + .sp_policy = SPTLRPC_POLICY_GSS, + .sp_cops = &gss_sec_keyring_cops, + .sp_sops = &gss_sec_keyring_sops, +}; + + +int __init gss_init_keyring(void) +{ + int rc; + + rc = register_key_type(&gss_key_type); + if (rc) { + CERROR("failed to register keyring type: %d\n", rc); + return rc; + } + + rc = sptlrpc_register_policy(&gss_policy_keyring); + if (rc) { + unregister_key_type(&gss_key_type); + return rc; + } + + return 0; +} + +void __exit gss_exit_keyring(void) +{ + unregister_key_type(&gss_key_type); + sptlrpc_unregister_policy(&gss_policy_keyring); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5.h new file mode 100644 index 0000000000000..611160458d9b1 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5.h @@ -0,0 +1,160 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. 
+ *
+ * Author: Eric Mei
+ */
+
+/*
+ * linux/include/linux/sunrpc/gss_krb5_types.h
+ *
+ * Adapted from MIT Kerberos 5-1.2.1 lib/include/krb5.h,
+ * lib/gssapi/krb5/gssapiP_krb5.h, and others
+ *
+ * Copyright (c) 2000 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson
+ * Bruce Fields
+ */
+
+/*
+ * Copyright 1995 by the Massachusetts Institute of Technology.
+ * All Rights Reserved.
+ *
+ * Export of this software from the United States of America may
+ * require a specific license from the United States Government.
+ * It is the responsibility of any person or organization contemplating
+ * export to obtain such a license before exporting.
+ *
+ * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
+ * distribute this software and its documentation for any purpose and
+ * without fee is hereby granted, provided that the above copyright
+ * notice appear in all copies and that both that copyright notice and
+ * this permission notice appear in supporting documentation, and that
+ * the name of M.I.T. not be used in advertising or publicity pertaining
+ * to distribution of the software without specific, written prior
+ * permission. Furthermore if you modify this software you must label
+ * your software as modified software and not distribute it in such a
+ * fashion that it might be confused with the original M.I.T. software.
+ * M.I.T. makes no representations about the suitability of
+ * this software for any purpose. It is provided "as is" without express
+ * or implied warranty.
+ *
+ */
+
+#ifndef PTLRPC_GSS_KRB5_H
+#define PTLRPC_GSS_KRB5_H
+
+#include "gss_crypto.h"
+
+/*
+ * RFC 4121
+ */
+
+#define KG_USAGE_ACCEPTOR_SEAL 22
+#define KG_USAGE_ACCEPTOR_SIGN 23
+#define KG_USAGE_INITIATOR_SEAL 24
+#define KG_USAGE_INITIATOR_SIGN 25
+
+#define KG_TOK_MIC_MSG 0x0404
+#define KG_TOK_WRAP_MSG 0x0504
+
+#define FLAG_SENDER_IS_ACCEPTOR 0x01
+#define FLAG_WRAP_CONFIDENTIAL 0x02
+#define FLAG_ACCEPTOR_SUBKEY 0x04
+
+struct krb5_header {
+ __u16 kh_tok_id; /* token id */
+ __u8 kh_flags; /* acceptor flags */
+ __u8 kh_filler; /* 0xff */
+ __u16 kh_ec; /* extra count */
+ __u16 kh_rrc; /* right rotation count */
+ __u64 kh_seq; /* sequence number */
+ __u8 kh_cksum[0]; /* checksum */
+};
+
+struct krb5_ctx {
+ unsigned int kc_initiate:1,
+ kc_cfx:1,
+ kc_seed_init:1,
+ kc_have_acceptor_subkey:1;
+ time64_t kc_endtime;
+ __u8 kc_seed[16];
+ __u64 kc_seq_send;
+ __u64 kc_seq_recv;
+ __u32 kc_enctype;
+ struct gss_keyblock kc_keye; /* encryption */
+ struct gss_keyblock kc_keyi; /* integrity */
+ struct gss_keyblock kc_keyc; /* checksum */
+ rawobj_t kc_mech_used;
+};
+
+enum sgn_alg {
+ SGN_ALG_DES_MAC_MD5 = 0x0000,
+ SGN_ALG_MD2_5 = 0x0001,
+ SGN_ALG_DES_MAC = 0x0002,
+ SGN_ALG_3 = 0x0003, /* not published */
+ SGN_ALG_HMAC_MD5 = 0x0011, /* microsoft w2k; no support */
+ SGN_ALG_HMAC_SHA1_DES3_KD = 0x0004
+};
+
+enum seal_alg {
+ SEAL_ALG_NONE = 0xffff,
+ SEAL_ALG_DES = 0x0000,
+ SEAL_ALG_1 = 0x0001, /* not published */
+ SEAL_ALG_MICROSOFT_RC4 = 0x0010, /* microsoft w2k; no support */
+ SEAL_ALG_DES3KD = 0x0002
+};
+
+#define CKSUMTYPE_CRC32 0x0001
+#define CKSUMTYPE_RSA_MD4 0x0002
+#define CKSUMTYPE_RSA_MD4_DES 0x0003
+#define CKSUMTYPE_DESCBC 0x0004
+/* des-mac-k */
+/* rsa-md4-des-k */
+#define CKSUMTYPE_RSA_MD5 0x0007
+#define CKSUMTYPE_RSA_MD5_DES 0x0008
+#define CKSUMTYPE_NIST_SHA 0x0009
+#define CKSUMTYPE_HMAC_SHA1_DES3 0x000c
+#define CKSUMTYPE_HMAC_SHA1_96_AES128 0x000f
+#define CKSUMTYPE_HMAC_SHA1_96_AES256 0x0010
+#define 
CKSUMTYPE_HMAC_MD5_ARCFOUR -138 + +/* from gssapi_err_krb5.h */ +#define KG_CCACHE_NOMATCH (39756032L) +#define KG_KEYTAB_NOMATCH (39756033L) +#define KG_TGT_MISSING (39756034L) +#define KG_NO_SUBKEY (39756035L) +#define KG_CONTEXT_ESTABLISHED (39756036L) +#define KG_BAD_SIGN_TYPE (39756037L) +#define KG_BAD_LENGTH (39756038L) +#define KG_CTX_INCOMPLETE (39756039L) +#define KG_CONTEXT (39756040L) +#define KG_CRED (39756041L) +#define KG_ENC_DESC (39756042L) +#define KG_BAD_SEQ (39756043L) +#define KG_EMPTY_CCACHE (39756044L) +#define KG_NO_CTYPES (39756045L) + +/* per Kerberos v5 protocol spec crypto types from the wire. + * these get mapped to linux kernel crypto routines. + */ +#define ENCTYPE_NULL 0x0000 +#define ENCTYPE_DES_CBC_CRC 0x0001 /* DES cbc mode with CRC-32 */ +#define ENCTYPE_DES_CBC_MD4 0x0002 /* DES cbc mode with RSA-MD4 */ +#define ENCTYPE_DES_CBC_MD5 0x0003 /* DES cbc mode with RSA-MD5 */ +#define ENCTYPE_DES_CBC_RAW 0x0004 /* DES cbc mode raw */ +/* XXX deprecated? */ +#define ENCTYPE_DES3_CBC_SHA 0x0005 /* DES-3 cbc mode with NIST-SHA */ +#define ENCTYPE_DES3_CBC_RAW 0x0006 /* DES-3 cbc mode raw */ +#define ENCTYPE_DES_HMAC_SHA1 0x0008 +#define ENCTYPE_DES3_CBC_SHA1 0x0010 +#define ENCTYPE_AES128_CTS_HMAC_SHA1_96 0x0011 +#define ENCTYPE_AES256_CTS_HMAC_SHA1_96 0x0012 +#define ENCTYPE_ARCFOUR_HMAC 0x0017 +#define ENCTYPE_ARCFOUR_HMAC_EXP 0x0018 +#define ENCTYPE_UNKNOWN 0x01ff + +#endif /* PTLRPC_GSS_KRB5_H */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5_mech.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5_mech.c new file mode 100644 index 0000000000000..d95924993285f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5_mech.c @@ -0,0 +1,1604 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2015, Intel Corporation. + * + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/gss_krb5_mech.c + * linux/net/sunrpc/gss_krb5_crypto.c + * linux/net/sunrpc/gss_krb5_seal.c + * linux/net/sunrpc/gss_krb5_seqnum.c + * linux/net/sunrpc/gss_krb5_unseal.c + * + * Copyright (c) 2001 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + * J. Bruce Fields + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+#include "gss_asn1.h"
+#include "gss_krb5.h"
+#include "gss_crypto.h"
+
+static DEFINE_SPINLOCK(krb5_seq_lock);
+
+struct krb5_enctype {
+ char *ke_dispname;
+ char *ke_enc_name; /* linux tfm name */
+ char *ke_hash_name; /* linux tfm name */
+ int ke_enc_mode; /* linux tfm mode */
+ int ke_hash_size; /* checksum size */
+ int ke_conf_size; /* confounder size */
+ unsigned int ke_hash_hmac:1; /* is hmac? */
+};
+
+/*
+ * NOTE: for aes128-cts and aes256-cts, the MIT implementation uses CTS
+ * encryption, but currently we simply use CBC with padding, because linux
+ * doesn't support CTS yet. this needs to be fixed in the future.
+ */
+static struct krb5_enctype enctypes[] = {
+ [ENCTYPE_DES_CBC_RAW] = { /* des-cbc-md5 */
+ .ke_dispname = "des-cbc-md5",
+ .ke_enc_name = "cbc(des)",
+ .ke_hash_name = "md5",
+ .ke_hash_size = 16,
+ .ke_conf_size = 8,
+ },
+#ifdef HAVE_DES3_SUPPORT
+ [ENCTYPE_DES3_CBC_RAW] = { /* des3-hmac-sha1 */
+ .ke_dispname = "des3-hmac-sha1",
+ .ke_enc_name = "cbc(des3_ede)",
+ .ke_hash_name = "sha1",
+ .ke_hash_size = 20,
+ .ke_conf_size = 8,
+ .ke_hash_hmac = 1,
+ },
+#endif
+ [ENCTYPE_AES128_CTS_HMAC_SHA1_96] = { /* aes128-cts */
+ .ke_dispname = "aes128-cts-hmac-sha1-96",
+ .ke_enc_name = "cbc(aes)",
+ .ke_hash_name = "sha1",
+ .ke_hash_size = 12,
+ .ke_conf_size = 16,
+ .ke_hash_hmac = 1,
+ },
+ [ENCTYPE_AES256_CTS_HMAC_SHA1_96] = { /* aes256-cts */
+ .ke_dispname = "aes256-cts-hmac-sha1-96",
+ .ke_enc_name = "cbc(aes)",
+ .ke_hash_name = "sha1",
+ .ke_hash_size = 12,
+ .ke_conf_size = 16,
+ .ke_hash_hmac = 1,
+ },
+ [ENCTYPE_ARCFOUR_HMAC] = { /* arcfour-hmac-md5 */
+ .ke_dispname = "arcfour-hmac-md5",
+ .ke_enc_name = "ecb(arc4)",
+ .ke_hash_name = "md5",
+ .ke_hash_size = 16,
+ .ke_conf_size = 8,
+ .ke_hash_hmac = 1,
+ }
+};
+
+static const char * enctype2str(__u32 enctype)
+{
+ if (enctype < ARRAY_SIZE(enctypes) && enctypes[enctype].ke_dispname)
+ return enctypes[enctype].ke_dispname;
+
+ return "unknown";
+}
+
+static
+int krb5_init_keys(struct krb5_ctx *kctx)
+{
+ struct krb5_enctype *ke;
+
+ if (kctx->kc_enctype >= ARRAY_SIZE(enctypes) ||
+ enctypes[kctx->kc_enctype].ke_hash_size == 0) {
+ CERROR("unsupported enctype %x\n", kctx->kc_enctype);
+ return -1;
+ }
+
+ ke = &enctypes[kctx->kc_enctype];
+
+ /* tfm arc4 is stateful, the user should alloc/use/free on their own */
+ if (kctx->kc_enctype != ENCTYPE_ARCFOUR_HMAC &&
+ gss_keyblock_init(&kctx->kc_keye, ke->ke_enc_name, ke->ke_enc_mode))
+ return -1;
+
+ /* tfm hmac is stateful, the user should alloc/use/free on their own */
+ if (ke->ke_hash_hmac == 0 &&
+ gss_keyblock_init(&kctx->kc_keyi, ke->ke_enc_name, ke->ke_enc_mode))
+ return -1;
+ if (ke->ke_hash_hmac == 0 &&
+ gss_keyblock_init(&kctx->kc_keyc, ke->ke_enc_name, ke->ke_enc_mode))
+ return -1;
+
+ return 0;
+}
+
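+/*
+ * A minimal illustrative sketch (not part of this patch's logic): the
+ * enctypes[] table above is sparse, so any lookup must bounds-check the
+ * wire enctype and reject empty slots. A hypothetical helper would be:
+ *
+ *	static inline bool krb5_enctype_supported(__u32 enctype)
+ *	{
+ *		return enctype < ARRAY_SIZE(enctypes) &&
+ *		       enctypes[enctype].ke_hash_size != 0;
+ *	}
+ *
+ * krb5_init_keys() above performs the same check inline before setting
+ * up the encryption, integrity and checksum keyblocks.
+ */
+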
+static
+void delete_context_kerberos(struct krb5_ctx *kctx)
+{
+ rawobj_free(&kctx->kc_mech_used);
+
+ gss_keyblock_free(&kctx->kc_keye);
+ gss_keyblock_free(&kctx->kc_keyi);
+ gss_keyblock_free(&kctx->kc_keyc);
+}
+
+static
+__u32 import_context_rfc1964(struct krb5_ctx *kctx, char *p, char *end)
+{
+ unsigned int tmp_uint, keysize;
+
+ /* seed_init flag */
+ if (gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)))
+ goto out_err;
+ kctx->kc_seed_init = (tmp_uint != 0);
+
+ /* seed */
+ if (gss_get_bytes(&p, end, kctx->kc_seed, sizeof(kctx->kc_seed)))
+ goto out_err;
+
+ /* sign/seal algorithm, not really used now */
+ if (gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)) ||
+ gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)))
+ goto out_err;
+
+ /* end time. While kc_endtime might be 64 bit the krb5 API
+ * still uses 32 bits. To delay the 2038 bug, treat the incoming
+ * value as a u32, which gives us until 2106. See the link for details:
+ *
+ * http://web.mit.edu/kerberos/www/krb5-current/doc/appdev/y2038.html
+ */
+ if (gss_get_bytes(&p, end, &kctx->kc_endtime, sizeof(u32)))
+ goto out_err;
+
+ /* seq send */
+ if (gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)))
+ goto out_err;
+ kctx->kc_seq_send = tmp_uint;
+
+ /* mech oid */
+ if (gss_get_rawobj(&p, end, &kctx->kc_mech_used))
+ goto out_err;
+
+ /* old style enc/seq keys in format:
+ * - enctype (u32)
+ * - keysize (u32)
+ * - keydata
+ * we decompose them to fit into the new context
+ */
+
+ /* enc key */
+ if (gss_get_bytes(&p, end, &kctx->kc_enctype, sizeof(kctx->kc_enctype)))
+ goto out_err;
+
+ if (gss_get_bytes(&p, end, &keysize, sizeof(keysize)))
+ goto out_err;
+
+ if (gss_get_keyblock(&p, end, &kctx->kc_keye, keysize))
+ goto out_err;
+
+ /* seq key */
+ if (gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)) ||
+ tmp_uint != kctx->kc_enctype)
+ goto out_err;
+
+ if (gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)) ||
+ tmp_uint != keysize)
+ goto out_err;
+
+ if (gss_get_keyblock(&p, end, &kctx->kc_keyc, keysize))
+ goto out_err;
+
+ /* old style fallback */
+ if (gss_keyblock_dup(&kctx->kc_keyi, &kctx->kc_keyc))
+ goto out_err;
+
+ if (p != end)
+ goto out_err;
+
+ CDEBUG(D_SEC, "successfully imported rfc1964 context\n");
+ return 0;
+out_err:
+ return GSS_S_FAILURE;
+}
+
+/* Flags for the version 2 context */
+#define KRB5_CTX_FLAG_INITIATOR 0x00000001
+#define KRB5_CTX_FLAG_CFX 0x00000002
+#define KRB5_CTX_FLAG_ACCEPTOR_SUBKEY 0x00000004
+
+static
+__u32 import_context_rfc4121(struct krb5_ctx *kctx, char *p, char *end)
+{
+ unsigned int tmp_uint, keysize;
+
+ /* end time. While kc_endtime might be 64 bit the krb5 API
+ * still uses 32 bits. To delay the 2038 bug, treat the incoming
+ * value as a u32, which gives us until 2106. 
See the link for details:
+ *
+ * http://web.mit.edu/kerberos/www/krb5-current/doc/appdev/y2038.html
+ */
+ if (gss_get_bytes(&p, end, &kctx->kc_endtime, sizeof(u32)))
+ goto out_err;
+
+ /* flags */
+ if (gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)))
+ goto out_err;
+
+ if (tmp_uint & KRB5_CTX_FLAG_INITIATOR)
+ kctx->kc_initiate = 1;
+ if (tmp_uint & KRB5_CTX_FLAG_CFX)
+ kctx->kc_cfx = 1;
+ if (tmp_uint & KRB5_CTX_FLAG_ACCEPTOR_SUBKEY)
+ kctx->kc_have_acceptor_subkey = 1;
+
+ /* seq send */
+ if (gss_get_bytes(&p, end, &kctx->kc_seq_send,
+ sizeof(kctx->kc_seq_send)))
+ goto out_err;
+
+ /* enctype */
+ if (gss_get_bytes(&p, end, &kctx->kc_enctype, sizeof(kctx->kc_enctype)))
+ goto out_err;
+
+ /* size of each key */
+ if (gss_get_bytes(&p, end, &keysize, sizeof(keysize)))
+ goto out_err;
+
+ /* number of keys - should always be 3 */
+ if (gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)))
+ goto out_err;
+
+ if (tmp_uint != 3) {
+ CERROR("Invalid number of keys: %u\n", tmp_uint);
+ goto out_err;
+ }
+
+ /* ke */
+ if (gss_get_keyblock(&p, end, &kctx->kc_keye, keysize))
+ goto out_err;
+ /* ki */
+ if (gss_get_keyblock(&p, end, &kctx->kc_keyi, keysize))
+ goto out_err;
+ /* kc */
+ if (gss_get_keyblock(&p, end, &kctx->kc_keyc, keysize))
+ goto out_err;
+
+ CDEBUG(D_SEC, "successfully imported v2 context\n");
+ return 0;
+out_err:
+ return GSS_S_FAILURE;
+}
+
+/*
+ * The whole purpose here is to keep the user level gss context parsing
+ * from nfs-utils as unchanged as we can; it is not quite mature yet,
+ * and many things are still not clear, like heimdal etc.
+ */
+static
+__u32 gss_import_sec_context_kerberos(rawobj_t *inbuf,
+ struct gss_ctx *gctx)
+{
+ struct krb5_ctx *kctx;
+ char *p = (char *)inbuf->data;
+ char *end = (char *)(inbuf->data + inbuf->len);
+ unsigned int tmp_uint, rc;
+
+ if (gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint))) {
+ CERROR("Failed to read version\n");
+ return GSS_S_FAILURE;
+ }
+
+ /* versions 0 and 1 use the rfc1964 format, version 2 uses rfc4121 */
+ if (tmp_uint > 2) {
+ CERROR("Invalid version %u\n", tmp_uint);
+ return GSS_S_FAILURE;
+ }
+
+ OBD_ALLOC_PTR(kctx);
+ if (!kctx)
+ return GSS_S_FAILURE;
+
+ if (tmp_uint == 0 || tmp_uint == 1) {
+ kctx->kc_initiate = tmp_uint;
+ rc = import_context_rfc1964(kctx, p, end);
+ } else {
+ rc = import_context_rfc4121(kctx, p, end);
+ }
+
+ if (rc == 0)
+ rc = krb5_init_keys(kctx);
+
+ if (rc) {
+ delete_context_kerberos(kctx);
+ OBD_FREE_PTR(kctx);
+
+ return GSS_S_FAILURE;
+ }
+
+ gctx->internal_ctx_id = kctx;
+ return GSS_S_COMPLETE;
+}
+
+static
+__u32 gss_copy_reverse_context_kerberos(struct gss_ctx *gctx,
+ struct gss_ctx *gctx_new)
+{
+ struct krb5_ctx *kctx = gctx->internal_ctx_id;
+ struct krb5_ctx *knew;
+
+ OBD_ALLOC_PTR(knew);
+ if (!knew)
+ return GSS_S_FAILURE;
+
+ knew->kc_initiate = kctx->kc_initiate ? 
0 : 1; + knew->kc_cfx = kctx->kc_cfx; + knew->kc_seed_init = kctx->kc_seed_init; + knew->kc_have_acceptor_subkey = kctx->kc_have_acceptor_subkey; + knew->kc_endtime = kctx->kc_endtime; + + memcpy(knew->kc_seed, kctx->kc_seed, sizeof(kctx->kc_seed)); + knew->kc_seq_send = kctx->kc_seq_recv; + knew->kc_seq_recv = kctx->kc_seq_send; + knew->kc_enctype = kctx->kc_enctype; + + if (rawobj_dup(&knew->kc_mech_used, &kctx->kc_mech_used)) + goto out_err; + + if (gss_keyblock_dup(&knew->kc_keye, &kctx->kc_keye)) + goto out_err; + if (gss_keyblock_dup(&knew->kc_keyi, &kctx->kc_keyi)) + goto out_err; + if (gss_keyblock_dup(&knew->kc_keyc, &kctx->kc_keyc)) + goto out_err; + if (krb5_init_keys(knew)) + goto out_err; + + gctx_new->internal_ctx_id = knew; + CDEBUG(D_SEC, "successfully copied reverse context\n"); + return GSS_S_COMPLETE; + +out_err: + delete_context_kerberos(knew); + OBD_FREE_PTR(knew); + return GSS_S_FAILURE; +} + +static +__u32 gss_inquire_context_kerberos(struct gss_ctx *gctx, + time64_t *endtime) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + + *endtime = kctx->kc_endtime; + return GSS_S_COMPLETE; +} + +static +void gss_delete_sec_context_kerberos(void *internal_ctx) +{ + struct krb5_ctx *kctx = internal_ctx; + + delete_context_kerberos(kctx); + OBD_FREE_PTR(kctx); +} + +/* + * compute (keyed/keyless) checksum against the plain text which appended + * with krb5 wire token header. + */ +static +__s32 krb5_make_checksum(__u32 enctype, + struct gss_keyblock *kb, + struct krb5_header *khdr, + int msgcnt, rawobj_t *msgs, + int iovcnt, struct bio_vec *iovs, + rawobj_t *cksum, + digest_hash hash_func) +{ + struct krb5_enctype *ke = &enctypes[enctype]; + struct ahash_request *req = NULL; + enum cfs_crypto_hash_alg hash_algo; + rawobj_t hdr; + int rc; + + hash_algo = cfs_crypto_hash_alg(ke->ke_hash_name); + + /* For the cbc(des) case we want md5 instead of hmac(md5) */ + if (strcmp(ke->ke_enc_name, "cbc(des)")) + req = cfs_crypto_hash_init(hash_algo, kb->kb_key.data, + kb->kb_key.len); + else + req = cfs_crypto_hash_init(hash_algo, NULL, 0); + if (IS_ERR(req)) { + rc = PTR_ERR(req); + CERROR("failed to alloc hash %s : rc = %d\n", + ke->ke_hash_name, rc); + goto out_no_hash; + } + + cksum->len = cfs_crypto_hash_digestsize(hash_algo); + OBD_ALLOC_LARGE(cksum->data, cksum->len); + if (!cksum->data) { + cksum->len = 0; + rc = -ENOMEM; + goto out_free_hash; + } + + hdr.data = (__u8 *)khdr; + hdr.len = sizeof(*khdr); + + if (!hash_func) { + rc = -EPROTO; + CERROR("hash function for %s undefined\n", + ke->ke_hash_name); + goto out_free_hash; + } + rc = hash_func(req, &hdr, msgcnt, msgs, iovcnt, iovs); + if (rc) + goto out_free_hash; + + if (!ke->ke_hash_hmac) { + LASSERT(kb->kb_tfm); + + cfs_crypto_hash_final(req, cksum->data, &cksum->len); + rc = gss_crypt_generic(kb->kb_tfm, 0, NULL, + cksum->data, cksum->data, + cksum->len); + goto out_no_hash; + } + +out_free_hash: + if (req) + cfs_crypto_hash_final(req, cksum->data, &cksum->len); +out_no_hash: + return rc ? GSS_S_FAILURE : GSS_S_COMPLETE; +} + +static void fill_krb5_header(struct krb5_ctx *kctx, + struct krb5_header *khdr, + int privacy) +{ + unsigned char acceptor_flag; + + acceptor_flag = kctx->kc_initiate ? 
0 : FLAG_SENDER_IS_ACCEPTOR; + + if (privacy) { + khdr->kh_tok_id = cpu_to_be16(KG_TOK_WRAP_MSG); + khdr->kh_flags = acceptor_flag | FLAG_WRAP_CONFIDENTIAL; + khdr->kh_ec = cpu_to_be16(0); + khdr->kh_rrc = cpu_to_be16(0); + } else { + khdr->kh_tok_id = cpu_to_be16(KG_TOK_MIC_MSG); + khdr->kh_flags = acceptor_flag; + khdr->kh_ec = cpu_to_be16(0xffff); + khdr->kh_rrc = cpu_to_be16(0xffff); + } + + khdr->kh_filler = 0xff; + spin_lock(&krb5_seq_lock); + khdr->kh_seq = cpu_to_be64(kctx->kc_seq_send++); + spin_unlock(&krb5_seq_lock); +} + +static __u32 verify_krb5_header(struct krb5_ctx *kctx, + struct krb5_header *khdr, + int privacy) +{ + unsigned char acceptor_flag; + __u16 tok_id, ec_rrc; + + acceptor_flag = kctx->kc_initiate ? FLAG_SENDER_IS_ACCEPTOR : 0; + + if (privacy) { + tok_id = KG_TOK_WRAP_MSG; + ec_rrc = 0x0; + } else { + tok_id = KG_TOK_MIC_MSG; + ec_rrc = 0xffff; + } + + /* sanity checks */ + if (be16_to_cpu(khdr->kh_tok_id) != tok_id) { + CERROR("bad token id\n"); + return GSS_S_DEFECTIVE_TOKEN; + } + if ((khdr->kh_flags & FLAG_SENDER_IS_ACCEPTOR) != acceptor_flag) { + CERROR("bad direction flag\n"); + return GSS_S_BAD_SIG; + } + if (privacy && (khdr->kh_flags & FLAG_WRAP_CONFIDENTIAL) == 0) { + CERROR("missing confidential flag\n"); + return GSS_S_BAD_SIG; + } + if (khdr->kh_filler != 0xff) { + CERROR("bad filler\n"); + return GSS_S_DEFECTIVE_TOKEN; + } + if (be16_to_cpu(khdr->kh_ec) != ec_rrc || + be16_to_cpu(khdr->kh_rrc) != ec_rrc) { + CERROR("bad EC or RRC\n"); + return GSS_S_DEFECTIVE_TOKEN; + } + return GSS_S_COMPLETE; +} + +static +__u32 gss_get_mic_kerberos(struct gss_ctx *gctx, + int msgcnt, + rawobj_t *msgs, + int iovcnt, + struct bio_vec *iovs, + rawobj_t *token) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + rawobj_t cksum = RAWOBJ_EMPTY; + u32 major; + + /* fill krb5 header */ + LASSERT(token->len >= sizeof(*khdr)); + khdr = (struct krb5_header *)token->data; + fill_krb5_header(kctx, khdr, 0); + + /* checksum */ + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyc, khdr, + msgcnt, msgs, iovcnt, iovs, &cksum, + gctx->hash_func)) + GOTO(out_free_cksum, major = GSS_S_FAILURE); + + LASSERT(cksum.len >= ke->ke_hash_size); + LASSERT(token->len >= sizeof(*khdr) + ke->ke_hash_size); + memcpy(khdr + 1, cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size); + + token->len = sizeof(*khdr) + ke->ke_hash_size; + major = GSS_S_COMPLETE; +out_free_cksum: + rawobj_free(&cksum); + return major; +} + +static +__u32 gss_verify_mic_kerberos(struct gss_ctx *gctx, + int msgcnt, + rawobj_t *msgs, + int iovcnt, + struct bio_vec *iovs, + rawobj_t *token) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + rawobj_t cksum = RAWOBJ_EMPTY; + u32 major; + + if (token->len < sizeof(*khdr)) { + CERROR("short signature: %u\n", token->len); + return GSS_S_DEFECTIVE_TOKEN; + } + + khdr = (struct krb5_header *)token->data; + + major = verify_krb5_header(kctx, khdr, 0); + if (major != GSS_S_COMPLETE) { + CERROR("bad krb5 header\n"); + goto out; + } + + if (token->len < sizeof(*khdr) + ke->ke_hash_size) { + CERROR("short signature: %u, require %d\n", + token->len, (int) sizeof(*khdr) + ke->ke_hash_size); + GOTO(out, major = GSS_S_FAILURE); + } + + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyc, + khdr, msgcnt, msgs, iovcnt, iovs, &cksum, + gctx->hash_func)) + GOTO(out_free_cksum, major = GSS_S_FAILURE); + 
+ LASSERT(cksum.len >= ke->ke_hash_size); + if (memcmp(khdr + 1, cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size)) { + CERROR("checksum mismatch\n"); + GOTO(out_free_cksum, major = GSS_S_BAD_SIG); + } + major = GSS_S_COMPLETE; +out_free_cksum: + rawobj_free(&cksum); +out: + return major; +} + +/* + * if adj_nob != 0, we adjust desc->bd_nob to the actual cipher text size. + */ +static +int krb5_encrypt_bulk(struct crypto_sync_skcipher *tfm, + struct krb5_header *khdr, + char *confounder, + struct ptlrpc_bulk_desc *desc, + rawobj_t *cipher, + int adj_nob) +{ + __u8 local_iv[16] = {0}; + struct scatterlist src, dst; + struct sg_table sg_src, sg_dst; + int blocksize, i, rc, nob = 0; + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); + + LASSERT(desc->bd_iov_count); + LASSERT(desc->bd_enc_vec); + + blocksize = crypto_sync_skcipher_blocksize(tfm); + LASSERT(blocksize > 1); + LASSERT(cipher->len == blocksize + sizeof(*khdr)); + + /* encrypt confounder */ + rc = gss_setup_sgtable(&sg_src, &src, confounder, blocksize); + if (rc != 0) + return rc; + + rc = gss_setup_sgtable(&sg_dst, &dst, cipher->data, blocksize); + if (rc != 0) { + gss_teardown_sgtable(&sg_src); + return rc; + } + skcipher_request_set_sync_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, sg_src.sgl, sg_dst.sgl, + blocksize, local_iv); + + rc = crypto_skcipher_encrypt_iv(req, sg_dst.sgl, sg_src.sgl, blocksize); + + gss_teardown_sgtable(&sg_dst); + gss_teardown_sgtable(&sg_src); + + if (rc) { + CERROR("error to encrypt confounder: %d\n", rc); + skcipher_request_zero(req); + return rc; + } + + /* encrypt clear pages */ + for (i = 0; i < desc->bd_iov_count; i++) { + sg_init_table(&src, 1); + sg_set_page(&src, desc->bd_vec[i].bv_page, + (desc->bd_vec[i].bv_len + + blocksize - 1) & + (~(blocksize - 1)), + desc->bd_vec[i].bv_offset); + if (adj_nob) + nob += src.length; + sg_init_table(&dst, 1); + sg_set_page(&dst, desc->bd_enc_vec[i].bv_page, + src.length, src.offset); + + desc->bd_enc_vec[i].bv_offset = dst.offset; + desc->bd_enc_vec[i].bv_len = dst.length; + + skcipher_request_set_crypt(req, &src, &dst, + src.length, local_iv); + rc = crypto_skcipher_encrypt_iv(req, &dst, &src, src.length); + if (rc) { + CERROR("error to encrypt page: %d\n", rc); + skcipher_request_zero(req); + return rc; + } + } + + /* encrypt krb5 header */ + rc = gss_setup_sgtable(&sg_src, &src, khdr, sizeof(*khdr)); + if (rc != 0) { + skcipher_request_zero(req); + return rc; + } + + rc = gss_setup_sgtable(&sg_dst, &dst, cipher->data + blocksize, + sizeof(*khdr)); + if (rc != 0) { + gss_teardown_sgtable(&sg_src); + skcipher_request_zero(req); + return rc; + } + + skcipher_request_set_crypt(req, sg_src.sgl, sg_dst.sgl, + sizeof(*khdr), local_iv); + rc = crypto_skcipher_encrypt_iv(req, sg_dst.sgl, sg_src.sgl, + sizeof(*khdr)); + skcipher_request_zero(req); + + gss_teardown_sgtable(&sg_dst); + gss_teardown_sgtable(&sg_src); + + if (rc) { + CERROR("error to encrypt krb5 header: %d\n", rc); + return rc; + } + + if (adj_nob) + desc->bd_nob = nob; + + return 0; +} + +/* + * desc->bd_nob_transferred is the size of cipher text received. + * desc->bd_nob is the target size of plain text supposed to be. + * + * if adj_nob != 0, we adjust each page's bv_len to the actual + * plain text size. + * - for client read: we don't know data size for each page, so + * bd_iov[]->bv_len is set to PAGE_SIZE, but actual data received might + * be smaller, so we need to adjust it according to + * bd_u.bd_kiov.bd_enc_vec[]->bv_len. 
+ * this means we DO NOT support the situation that server send an odd size + * data in a page which is not the last one. + * - for server write: we knows exactly data size for each page being expected, + * thus bv_len is accurate already, so we should not adjust it at all. + * and bd_u.bd_kiov.bd_enc_vec[]->bv_len should be + * round_up(bd_iov[]->bv_len) which + * should have been done by prep_bulk(). + */ +static +int krb5_decrypt_bulk(struct crypto_sync_skcipher *tfm, + struct krb5_header *khdr, + struct ptlrpc_bulk_desc *desc, + rawobj_t *cipher, + rawobj_t *plain, + int adj_nob) +{ + __u8 local_iv[16] = {0}; + struct scatterlist src, dst; + struct sg_table sg_src, sg_dst; + int ct_nob = 0, pt_nob = 0; + int blocksize, i, rc; + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); + + LASSERT(desc->bd_iov_count); + LASSERT(desc->bd_enc_vec); + LASSERT(desc->bd_nob_transferred); + + blocksize = crypto_sync_skcipher_blocksize(tfm); + LASSERT(blocksize > 1); + LASSERT(cipher->len == blocksize + sizeof(*khdr)); + + if (desc->bd_nob_transferred % blocksize) { + CERROR("odd transferred nob: %d\n", desc->bd_nob_transferred); + return -EPROTO; + } + + /* decrypt head (confounder) */ + rc = gss_setup_sgtable(&sg_src, &src, cipher->data, blocksize); + if (rc != 0) + return rc; + + rc = gss_setup_sgtable(&sg_dst, &dst, plain->data, blocksize); + if (rc != 0) { + gss_teardown_sgtable(&sg_src); + return rc; + } + + skcipher_request_set_sync_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, sg_src.sgl, sg_dst.sgl, + blocksize, local_iv); + + rc = crypto_skcipher_decrypt_iv(req, sg_dst.sgl, sg_src.sgl, blocksize); + + gss_teardown_sgtable(&sg_dst); + gss_teardown_sgtable(&sg_src); + + if (rc) { + CERROR("error to decrypt confounder: %d\n", rc); + skcipher_request_zero(req); + return rc; + } + + for (i = 0; i < desc->bd_iov_count && ct_nob < desc->bd_nob_transferred; + i++) { + if (desc->bd_enc_vec[i].bv_offset % blocksize != 0 || + desc->bd_enc_vec[i].bv_len % blocksize != 0) { + CERROR("page %d: odd offset %u len %u, blocksize %d\n", + i, desc->bd_enc_vec[i].bv_offset, + desc->bd_enc_vec[i].bv_len, + blocksize); + skcipher_request_zero(req); + return -EFAULT; + } + + if (adj_nob) { + if (ct_nob + desc->bd_enc_vec[i].bv_len > + desc->bd_nob_transferred) + desc->bd_enc_vec[i].bv_len = + desc->bd_nob_transferred - ct_nob; + + desc->bd_vec[i].bv_len = + desc->bd_enc_vec[i].bv_len; + if (pt_nob + desc->bd_enc_vec[i].bv_len > + desc->bd_nob) + desc->bd_vec[i].bv_len = + desc->bd_nob - pt_nob; + } else { + /* this should be guaranteed by LNET */ + LASSERT(ct_nob + desc->bd_enc_vec[i]. + bv_len <= + desc->bd_nob_transferred); + LASSERT(desc->bd_vec[i].bv_len <= + desc->bd_enc_vec[i].bv_len); + } + + if (desc->bd_enc_vec[i].bv_len == 0) + continue; + + sg_init_table(&src, 1); + sg_set_page(&src, desc->bd_enc_vec[i].bv_page, + desc->bd_enc_vec[i].bv_len, + desc->bd_enc_vec[i].bv_offset); + dst = src; + if (desc->bd_vec[i].bv_len % blocksize == 0) + sg_assign_page(&dst, + desc->bd_vec[i].bv_page); + + skcipher_request_set_crypt(req, sg_src.sgl, sg_dst.sgl, + src.length, local_iv); + rc = crypto_skcipher_decrypt_iv(req, &dst, &src, src.length); + if (rc) { + CERROR("error to decrypt page: %d\n", rc); + skcipher_request_zero(req); + return rc; + } + + if (desc->bd_vec[i].bv_len % blocksize != 0) { + memcpy(page_address(desc->bd_vec[i].bv_page) + + desc->bd_vec[i].bv_offset, + page_address(desc->bd_enc_vec[i]. 
+ bv_page) + + desc->bd_vec[i].bv_offset, + desc->bd_vec[i].bv_len); + } + + ct_nob += desc->bd_enc_vec[i].bv_len; + pt_nob += desc->bd_vec[i].bv_len; + } + + if (unlikely(ct_nob != desc->bd_nob_transferred)) { + CERROR("%d cipher text transferred but only %d decrypted\n", + desc->bd_nob_transferred, ct_nob); + skcipher_request_zero(req); + return -EFAULT; + } + + if (unlikely(!adj_nob && pt_nob != desc->bd_nob)) { + CERROR("%d plain text expected but only %d received\n", + desc->bd_nob, pt_nob); + skcipher_request_zero(req); + return -EFAULT; + } + + /* if needed, clear up the rest unused iovs */ + if (adj_nob) + while (i < desc->bd_iov_count) + desc->bd_vec[i++].bv_len = 0; + + /* decrypt tail (krb5 header) */ + rc = gss_setup_sgtable(&sg_src, &src, cipher->data + blocksize, + sizeof(*khdr)); + if (rc != 0) + return rc; + + rc = gss_setup_sgtable(&sg_dst, &dst, cipher->data + blocksize, + sizeof(*khdr)); + if (rc != 0) { + gss_teardown_sgtable(&sg_src); + return rc; + } + + skcipher_request_set_crypt(req, sg_src.sgl, sg_dst.sgl, + src.length, local_iv); + rc = crypto_skcipher_decrypt_iv(req, sg_dst.sgl, sg_src.sgl, + sizeof(*khdr)); + gss_teardown_sgtable(&sg_src); + gss_teardown_sgtable(&sg_dst); + + skcipher_request_zero(req); + if (rc) { + CERROR("error to decrypt tail: %d\n", rc); + return rc; + } + + if (memcmp(cipher->data + blocksize, khdr, sizeof(*khdr))) { + CERROR("krb5 header doesn't match\n"); + return -EACCES; + } + + return 0; +} + +static +__u32 gss_wrap_kerberos(struct gss_ctx *gctx, + rawobj_t *gsshdr, + rawobj_t *msg, + int msg_buflen, + rawobj_t *token) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + int blocksize; + rawobj_t cksum = RAWOBJ_EMPTY; + rawobj_t data_desc[3], cipher; + __u8 conf[GSS_MAX_CIPHER_BLOCK]; + __u8 local_iv[16] = {0}; + u32 major; + int rc = 0; + + LASSERT(ke); + LASSERT(ke->ke_conf_size <= GSS_MAX_CIPHER_BLOCK); + LASSERT(kctx->kc_keye.kb_tfm == NULL || + ke->ke_conf_size >= + crypto_sync_skcipher_blocksize(kctx->kc_keye.kb_tfm)); + + /* + * final token format: + * --------------------------------------------------- + * | krb5 header | cipher text | checksum (16 bytes) | + * --------------------------------------------------- + */ + + /* fill krb5 header */ + LASSERT(token->len >= sizeof(*khdr)); + khdr = (struct krb5_header *)token->data; + fill_krb5_header(kctx, khdr, 1); + + /* generate confounder */ + get_random_bytes(conf, ke->ke_conf_size); + + /* get encryption blocksize. 
note kc_keye might not associated with + * a tfm, currently only for arcfour-hmac */ + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LASSERT(kctx->kc_keye.kb_tfm == NULL); + blocksize = 1; + } else { + LASSERT(kctx->kc_keye.kb_tfm); + blocksize = crypto_sync_skcipher_blocksize( + kctx->kc_keye.kb_tfm); + } + LASSERT(blocksize <= ke->ke_conf_size); + + /* padding the message */ + if (gss_add_padding(msg, msg_buflen, blocksize)) + return GSS_S_FAILURE; + + /* + * clear text layout for checksum: + * ------------------------------------------------------ + * | confounder | gss header | clear msgs | krb5 header | + * ------------------------------------------------------ + */ + data_desc[0].data = conf; + data_desc[0].len = ke->ke_conf_size; + data_desc[1].data = gsshdr->data; + data_desc[1].len = gsshdr->len; + data_desc[2].data = msg->data; + data_desc[2].len = msg->len; + + /* compute checksum */ + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, + khdr, 3, data_desc, 0, NULL, &cksum, + gctx->hash_func)) + GOTO(out_free_cksum, major = GSS_S_FAILURE); + LASSERT(cksum.len >= ke->ke_hash_size); + + /* + * clear text layout for encryption: + * ----------------------------------------- + * | confounder | clear msgs | krb5 header | + * ----------------------------------------- + */ + data_desc[0].data = conf; + data_desc[0].len = ke->ke_conf_size; + data_desc[1].data = msg->data; + data_desc[1].len = msg->len; + data_desc[2].data = (__u8 *) khdr; + data_desc[2].len = sizeof(*khdr); + + /* cipher text will be directly inplace */ + cipher.data = (__u8 *)(khdr + 1); + cipher.len = token->len - sizeof(*khdr); + LASSERT(cipher.len >= ke->ke_conf_size + msg->len + sizeof(*khdr)); + + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + rawobj_t arc4_keye = RAWOBJ_EMPTY; + struct crypto_sync_skcipher *arc4_tfm; + + if (krb5_make_checksum(ENCTYPE_ARCFOUR_HMAC, &kctx->kc_keyi, + NULL, 1, &cksum, 0, NULL, &arc4_keye, + gctx->hash_func)) { + CERROR("failed to obtain arc4 enc key\n"); + GOTO(arc4_out_key, rc = -EACCES); + } + + arc4_tfm = crypto_alloc_sync_skcipher("ecb(arc4)", 0, 0); + if (IS_ERR(arc4_tfm)) { + CERROR("failed to alloc tfm arc4 in ECB mode\n"); + GOTO(arc4_out_key, rc = -EACCES); + } + + if (crypto_sync_skcipher_setkey(arc4_tfm, arc4_keye.data, + arc4_keye.len)) { + CERROR("failed to set arc4 key, len %d\n", + arc4_keye.len); + GOTO(arc4_out_tfm, rc = -EACCES); + } + + rc = gss_crypt_rawobjs(arc4_tfm, NULL, 3, data_desc, + &cipher, 1); +arc4_out_tfm: + crypto_free_sync_skcipher(arc4_tfm); +arc4_out_key: + rawobj_free(&arc4_keye); + } else { + rc = gss_crypt_rawobjs(kctx->kc_keye.kb_tfm, local_iv, 3, + data_desc, &cipher, 1); + } + + if (rc) + GOTO(out_free_cksum, major = GSS_S_FAILURE); + + /* fill in checksum */ + LASSERT(token->len >= sizeof(*khdr) + cipher.len + ke->ke_hash_size); + memcpy((char *)(khdr + 1) + cipher.len, + cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size); + + /* final token length */ + token->len = sizeof(*khdr) + cipher.len + ke->ke_hash_size; + major = GSS_S_COMPLETE; +out_free_cksum: + rawobj_free(&cksum); + return major; +} + +static +__u32 gss_prep_bulk_kerberos(struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + int blocksize, i; + + LASSERT(desc->bd_iov_count); + LASSERT(desc->bd_enc_vec); + LASSERT(kctx->kc_keye.kb_tfm); + + blocksize = crypto_sync_skcipher_blocksize(kctx->kc_keye.kb_tfm); + + for (i = 0; i < desc->bd_iov_count; i++) { + LASSERT(desc->bd_enc_vec[i].bv_page); + /* + * 
offset should always start at page boundary of either + * client or server side. + */ + if (desc->bd_vec[i].bv_offset & blocksize) { + CERROR("odd offset %d in page %d\n", + desc->bd_vec[i].bv_offset, i); + return GSS_S_FAILURE; + } + + desc->bd_enc_vec[i].bv_offset = + desc->bd_vec[i].bv_offset; + desc->bd_enc_vec[i].bv_len = + (desc->bd_vec[i].bv_len + + blocksize - 1) & (~(blocksize - 1)); + } + + return GSS_S_COMPLETE; +} + +static +__u32 gss_wrap_bulk_kerberos(struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, int adj_nob) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + int blocksz; + rawobj_t cksum = RAWOBJ_EMPTY; + rawobj_t data_desc[1], cipher; + __u8 conf[GSS_MAX_CIPHER_BLOCK]; + int rc = 0; + u32 major; + + LASSERT(ke); + LASSERT(ke->ke_conf_size <= GSS_MAX_CIPHER_BLOCK); + + /* + * final token format: + * -------------------------------------------------- + * | krb5 header | head/tail cipher text | checksum | + * -------------------------------------------------- + */ + + /* fill krb5 header */ + LASSERT(token->len >= sizeof(*khdr)); + khdr = (struct krb5_header *)token->data; + fill_krb5_header(kctx, khdr, 1); + + /* generate confounder */ + get_random_bytes(conf, ke->ke_conf_size); + + /* get encryption blocksize. note kc_keye might not associated with + * a tfm, currently only for arcfour-hmac */ + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LASSERT(kctx->kc_keye.kb_tfm == NULL); + blocksz = 1; + } else { + LASSERT(kctx->kc_keye.kb_tfm); + blocksz = crypto_sync_skcipher_blocksize(kctx->kc_keye.kb_tfm); + } + + /* + * we assume the size of krb5_header (16 bytes) must be n * blocksize. + * the bulk token size would be exactly (sizeof(krb5_header) + + * blocksize + sizeof(krb5_header) + hashsize) + */ + LASSERT(blocksz <= ke->ke_conf_size); + LASSERT(sizeof(*khdr) >= blocksz && sizeof(*khdr) % blocksz == 0); + LASSERT(token->len >= sizeof(*khdr) + blocksz + sizeof(*khdr) + 16); + + /* + * clear text layout for checksum: + * ------------------------------------------ + * | confounder | clear pages | krb5 header | + * ------------------------------------------ + */ + data_desc[0].data = conf; + data_desc[0].len = ke->ke_conf_size; + + /* compute checksum */ + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, + khdr, 1, data_desc, + desc->bd_iov_count, desc->bd_vec, + &cksum, gctx->hash_func)) + GOTO(out_free_cksum, major = GSS_S_FAILURE); + LASSERT(cksum.len >= ke->ke_hash_size); + + /* + * clear text layout for encryption: + * ------------------------------------------ + * | confounder | clear pages | krb5 header | + * ------------------------------------------ + * | | | + * ---------- (cipher pages) | + * result token: | | + * ------------------------------------------- + * | krb5 header | cipher text | cipher text | + * ------------------------------------------- + */ + data_desc[0].data = conf; + data_desc[0].len = ke->ke_conf_size; + + cipher.data = (__u8 *)(khdr + 1); + cipher.len = blocksz + sizeof(*khdr); + + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LBUG(); + rc = 0; + } else { + rc = krb5_encrypt_bulk(kctx->kc_keye.kb_tfm, khdr, + conf, desc, &cipher, adj_nob); + } + if (rc) + GOTO(out_free_cksum, major = GSS_S_FAILURE); + + /* fill in checksum */ + LASSERT(token->len >= sizeof(*khdr) + cipher.len + ke->ke_hash_size); + memcpy((char *)(khdr + 1) + cipher.len, + cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size); + + /* final 
token length */ + token->len = sizeof(*khdr) + cipher.len + ke->ke_hash_size; + major = GSS_S_COMPLETE; +out_free_cksum: + rawobj_free(&cksum); + return major; +} + +static +__u32 gss_unwrap_kerberos(struct gss_ctx *gctx, + rawobj_t *gsshdr, + rawobj_t *token, + rawobj_t *msg) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + unsigned char *tmpbuf; + int blocksz, bodysize; + rawobj_t cksum = RAWOBJ_EMPTY; + rawobj_t cipher_in, plain_out; + rawobj_t hash_objs[3]; + int rc = 0; + __u32 major; + __u8 local_iv[16] = {0}; + + LASSERT(ke); + + if (token->len < sizeof(*khdr)) { + CERROR("short signature: %u\n", token->len); + return GSS_S_DEFECTIVE_TOKEN; + } + + khdr = (struct krb5_header *)token->data; + + major = verify_krb5_header(kctx, khdr, 1); + if (major != GSS_S_COMPLETE) { + CERROR("bad krb5 header\n"); + return major; + } + + /* block size */ + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LASSERT(kctx->kc_keye.kb_tfm == NULL); + blocksz = 1; + } else { + LASSERT(kctx->kc_keye.kb_tfm); + blocksz = crypto_sync_skcipher_blocksize(kctx->kc_keye.kb_tfm); + } + + /* expected token layout: + * ---------------------------------------- + * | krb5 header | cipher text | checksum | + * ---------------------------------------- + */ + bodysize = token->len - sizeof(*khdr) - ke->ke_hash_size; + + if (bodysize % blocksz) { + CERROR("odd bodysize %d\n", bodysize); + return GSS_S_DEFECTIVE_TOKEN; + } + + if (bodysize <= ke->ke_conf_size + sizeof(*khdr)) { + CERROR("incomplete token: bodysize %d\n", bodysize); + return GSS_S_DEFECTIVE_TOKEN; + } + + if (msg->len < bodysize - ke->ke_conf_size - sizeof(*khdr)) { + CERROR("buffer too small: %u, require %d\n", + msg->len, bodysize - ke->ke_conf_size); + return GSS_S_FAILURE; + } + + /* decrypting */ + OBD_ALLOC_LARGE(tmpbuf, bodysize); + if (!tmpbuf) + return GSS_S_FAILURE; + + major = GSS_S_FAILURE; + + cipher_in.data = (__u8 *)(khdr + 1); + cipher_in.len = bodysize; + plain_out.data = tmpbuf; + plain_out.len = bodysize; + + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + rawobj_t arc4_keye; + struct crypto_sync_skcipher *arc4_tfm; + + cksum.data = token->data + token->len - ke->ke_hash_size; + cksum.len = ke->ke_hash_size; + + if (krb5_make_checksum(ENCTYPE_ARCFOUR_HMAC, &kctx->kc_keyi, + NULL, 1, &cksum, 0, NULL, &arc4_keye, + gctx->hash_func)) { + CERROR("failed to obtain arc4 enc key\n"); + GOTO(arc4_out, rc = -EACCES); + } + + arc4_tfm = crypto_alloc_sync_skcipher("ecb(arc4)", 0, 0); + if (IS_ERR(arc4_tfm)) { + CERROR("failed to alloc tfm arc4 in ECB mode\n"); + GOTO(arc4_out_key, rc = -EACCES); + } + + if (crypto_sync_skcipher_setkey(arc4_tfm, arc4_keye.data, + arc4_keye.len)) { + CERROR("failed to set arc4 key, len %d\n", + arc4_keye.len); + GOTO(arc4_out_tfm, rc = -EACCES); + } + + rc = gss_crypt_rawobjs(arc4_tfm, NULL, 1, &cipher_in, + &plain_out, 0); +arc4_out_tfm: + crypto_free_sync_skcipher(arc4_tfm); +arc4_out_key: + rawobj_free(&arc4_keye); +arc4_out: + cksum = RAWOBJ_EMPTY; + } else { + rc = gss_crypt_rawobjs(kctx->kc_keye.kb_tfm, local_iv, 1, + &cipher_in, &plain_out, 0); + } + + if (rc != 0) { + CERROR("error decrypt\n"); + goto out_free; + } + LASSERT(plain_out.len == bodysize); + + /* expected clear text layout: + * ----------------------------------------- + * | confounder | clear msgs | krb5 header | + * ----------------------------------------- + */ + + /* verify krb5 header in token is not modified */ + if (memcmp(khdr, plain_out.data + 
plain_out.len - sizeof(*khdr), + sizeof(*khdr))) { + CERROR("decrypted krb5 header mismatch\n"); + goto out_free; + } + + /* verify checksum, compose clear text as layout: + * ------------------------------------------------------ + * | confounder | gss header | clear msgs | krb5 header | + * ------------------------------------------------------ + */ + hash_objs[0].len = ke->ke_conf_size; + hash_objs[0].data = plain_out.data; + hash_objs[1].len = gsshdr->len; + hash_objs[1].data = gsshdr->data; + hash_objs[2].len = plain_out.len - ke->ke_conf_size - sizeof(*khdr); + hash_objs[2].data = plain_out.data + ke->ke_conf_size; + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, + khdr, 3, hash_objs, 0, NULL, &cksum, + gctx->hash_func)) + goto out_free; + + LASSERT(cksum.len >= ke->ke_hash_size); + if (memcmp((char *)(khdr + 1) + bodysize, + cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size)) { + CERROR("checksum mismatch\n"); + goto out_free; + } + + msg->len = bodysize - ke->ke_conf_size - sizeof(*khdr); + memcpy(msg->data, tmpbuf + ke->ke_conf_size, msg->len); + + major = GSS_S_COMPLETE; +out_free: + OBD_FREE_LARGE(tmpbuf, bodysize); + rawobj_free(&cksum); + return major; +} + +static +__u32 gss_unwrap_bulk_kerberos(struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, int adj_nob) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + int blocksz; + rawobj_t cksum = RAWOBJ_EMPTY; + rawobj_t cipher, plain; + rawobj_t data_desc[1]; + int rc; + __u32 major; + + LASSERT(ke); + + if (token->len < sizeof(*khdr)) { + CERROR("short signature: %u\n", token->len); + return GSS_S_DEFECTIVE_TOKEN; + } + + khdr = (struct krb5_header *)token->data; + + major = verify_krb5_header(kctx, khdr, 1); + if (major != GSS_S_COMPLETE) { + CERROR("bad krb5 header\n"); + return major; + } + + /* block size */ + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LASSERT(kctx->kc_keye.kb_tfm == NULL); + blocksz = 1; + LBUG(); + } else { + LASSERT(kctx->kc_keye.kb_tfm); + blocksz = crypto_sync_skcipher_blocksize(kctx->kc_keye.kb_tfm); + } + LASSERT(sizeof(*khdr) >= blocksz && sizeof(*khdr) % blocksz == 0); + + /* + * token format is expected as: + * ----------------------------------------------- + * | krb5 header | head/tail cipher text | cksum | + * ----------------------------------------------- + */ + if (token->len < sizeof(*khdr) + blocksz + sizeof(*khdr) + + ke->ke_hash_size) { + CERROR("short token size: %u\n", token->len); + return GSS_S_DEFECTIVE_TOKEN; + } + + cipher.data = (__u8 *) (khdr + 1); + cipher.len = blocksz + sizeof(*khdr); + plain.data = cipher.data; + plain.len = cipher.len; + + rc = krb5_decrypt_bulk(kctx->kc_keye.kb_tfm, khdr, + desc, &cipher, &plain, adj_nob); + if (rc) + return GSS_S_DEFECTIVE_TOKEN; + + /* + * verify checksum, compose clear text as layout: + * ------------------------------------------ + * | confounder | clear pages | krb5 header | + * ------------------------------------------ + */ + data_desc[0].data = plain.data; + data_desc[0].len = blocksz; + + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, + khdr, 1, data_desc, + desc->bd_iov_count, + desc->bd_vec, + &cksum, gctx->hash_func)) + return GSS_S_FAILURE; + LASSERT(cksum.len >= ke->ke_hash_size); + + if (memcmp(plain.data + blocksz + sizeof(*khdr), + cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size)) { + CERROR("checksum mismatch\n"); + rawobj_free(&cksum); + return GSS_S_BAD_SIG; + } 
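+
+	/* decryption and bulk checksum verification both succeeded */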
+ + rawobj_free(&cksum); + return GSS_S_COMPLETE; +} + +int gss_display_kerberos(struct gss_ctx *ctx, + char *buf, + int bufsize) +{ + struct krb5_ctx *kctx = ctx->internal_ctx_id; + int written; + + written = scnprintf(buf, bufsize, "krb5 (%s)", + enctype2str(kctx->kc_enctype)); + return written; +} + +static struct gss_api_ops gss_kerberos_ops = { + .gss_import_sec_context = gss_import_sec_context_kerberos, + .gss_copy_reverse_context = gss_copy_reverse_context_kerberos, + .gss_inquire_context = gss_inquire_context_kerberos, + .gss_get_mic = gss_get_mic_kerberos, + .gss_verify_mic = gss_verify_mic_kerberos, + .gss_wrap = gss_wrap_kerberos, + .gss_unwrap = gss_unwrap_kerberos, + .gss_prep_bulk = gss_prep_bulk_kerberos, + .gss_wrap_bulk = gss_wrap_bulk_kerberos, + .gss_unwrap_bulk = gss_unwrap_bulk_kerberos, + .gss_delete_sec_context = gss_delete_sec_context_kerberos, + .gss_display = gss_display_kerberos, +}; + +static struct subflavor_desc gss_kerberos_sfs[] = { + { + .sf_subflavor = SPTLRPC_SUBFLVR_KRB5N, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_NULL, + .sf_name = "krb5n" + }, + { + .sf_subflavor = SPTLRPC_SUBFLVR_KRB5A, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_AUTH, + .sf_name = "krb5a" + }, + { + .sf_subflavor = SPTLRPC_SUBFLVR_KRB5I, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_INTG, + .sf_name = "krb5i" + }, + { + .sf_subflavor = SPTLRPC_SUBFLVR_KRB5P, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_PRIV, + .sf_name = "krb5p" + }, +}; + +static struct gss_api_mech gss_kerberos_mech = { + /* .gm_owner uses default NULL value for THIS_MODULE */ + .gm_name = "krb5", + .gm_oid = (rawobj_t) + {9, "\052\206\110\206\367\022\001\002\002"}, + .gm_ops = &gss_kerberos_ops, + .gm_sf_num = 4, + .gm_sfs = gss_kerberos_sfs, +}; + +int __init init_kerberos_module(void) +{ + int status; + + status = lgss_mech_register(&gss_kerberos_mech); + if (status) + CERROR("Failed to register kerberos gss mechanism!\n"); + return status; +} + +void cleanup_kerberos_module(void) +{ + lgss_mech_unregister(&gss_kerberos_mech); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_mech_switch.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_mech_switch.c new file mode 100644 index 0000000000000..ee2b851e90c82 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_mech_switch.c @@ -0,0 +1,361 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, Intel Corporation. + * + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/gss_mech_switch.c + * + * Copyright (c) 2001 The Regents of the University of Michigan. + * All rights reserved. + * + * J. Bruce Fields + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" +#include "gss_crypto.h" + +static LIST_HEAD(registered_mechs); +static DEFINE_SPINLOCK(registered_mechs_lock); + +int lgss_mech_register(struct gss_api_mech *gm) +{ + spin_lock(®istered_mechs_lock); + list_add(&gm->gm_list, ®istered_mechs); + spin_unlock(®istered_mechs_lock); + CDEBUG(D_SEC, "register %s mechanism\n", gm->gm_name); + return 0; +} + +void lgss_mech_unregister(struct gss_api_mech *gm) +{ + spin_lock(®istered_mechs_lock); + list_del(&gm->gm_list); + spin_unlock(®istered_mechs_lock); + CDEBUG(D_SEC, "Unregister %s mechanism\n", gm->gm_name); +} + + +struct gss_api_mech *lgss_mech_get(struct gss_api_mech *gm) +{ + __module_get(gm->gm_owner); + return gm; +} + +struct gss_api_mech *lgss_name_to_mech(char *name) +{ + struct gss_api_mech *pos, *gm = NULL; + + spin_lock(®istered_mechs_lock); + list_for_each_entry(pos, ®istered_mechs, gm_list) { + if (0 == strcmp(name, pos->gm_name)) { + if (!try_module_get(pos->gm_owner)) + continue; + gm = pos; + break; + } + } + spin_unlock(®istered_mechs_lock); + return gm; + +} + +static inline +int mech_supports_subflavor(struct gss_api_mech *gm, __u32 subflavor) +{ + int i; + + for (i = 0; i < gm->gm_sf_num; i++) { + if (gm->gm_sfs[i].sf_subflavor == subflavor) + return 1; + } + return 0; +} + +struct gss_api_mech *lgss_subflavor_to_mech(__u32 subflavor) +{ + struct gss_api_mech *pos, *gm = NULL; + + spin_lock(®istered_mechs_lock); + list_for_each_entry(pos, ®istered_mechs, gm_list) { + if (!try_module_get(pos->gm_owner)) + continue; + if (!mech_supports_subflavor(pos, subflavor)) { + module_put(pos->gm_owner); + continue; + } + gm = pos; + break; + } + spin_unlock(®istered_mechs_lock); + return gm; +} + +void lgss_mech_put(struct gss_api_mech *gm) +{ + module_put(gm->gm_owner); +} + +/* The mech could probably be determined from the token instead, but it's just + * as easy for now to pass it in. 
*/ +__u32 lgss_import_sec_context(rawobj_t *input_token, + struct gss_api_mech *mech, + struct gss_ctx **ctx_id) +{ + OBD_ALLOC_PTR(*ctx_id); + if (*ctx_id == NULL) + return GSS_S_FAILURE; + + (*ctx_id)->mech_type = lgss_mech_get(mech); + (*ctx_id)->hash_func = gss_digest_hash; + + LASSERT(mech); + LASSERT(mech->gm_ops); + LASSERT(mech->gm_ops->gss_import_sec_context); + return mech->gm_ops->gss_import_sec_context(input_token, *ctx_id); +} + +__u32 lgss_copy_reverse_context(struct gss_ctx *ctx_id, + struct gss_ctx **ctx_id_new) +{ + struct gss_api_mech *mech = ctx_id->mech_type; + __u32 major; + + LASSERT(mech); + + OBD_ALLOC_PTR(*ctx_id_new); + if (*ctx_id_new == NULL) + return GSS_S_FAILURE; + + (*ctx_id_new)->mech_type = lgss_mech_get(mech); + (*ctx_id_new)->hash_func = ctx_id->hash_func; + + LASSERT(mech); + LASSERT(mech->gm_ops); + LASSERT(mech->gm_ops->gss_copy_reverse_context); + + major = mech->gm_ops->gss_copy_reverse_context(ctx_id, *ctx_id_new); + if (major != GSS_S_COMPLETE) { + lgss_mech_put(mech); + OBD_FREE_PTR(*ctx_id_new); + *ctx_id_new = NULL; + } + return major; +} + +/* + * this interface is much simplified, currently we only need endtime. + */ +__u32 lgss_inquire_context(struct gss_ctx *context_handle, + time64_t *endtime) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_inquire_context); + + return context_handle->mech_type->gm_ops + ->gss_inquire_context(context_handle, + endtime); +} + +/* gss_get_mic: compute a mic over message and return mic_token. */ +__u32 lgss_get_mic(struct gss_ctx *context_handle, + int msgcnt, + rawobj_t *msg, + int iovcnt, + struct bio_vec *iovs, + rawobj_t *mic_token) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_get_mic); + + return context_handle->mech_type->gm_ops + ->gss_get_mic(context_handle, + msgcnt, + msg, + iovcnt, + iovs, + mic_token); +} + +/* gss_verify_mic: check whether the provided mic_token verifies message. 
*/ +__u32 lgss_verify_mic(struct gss_ctx *context_handle, + int msgcnt, + rawobj_t *msg, + int iovcnt, + struct bio_vec *iovs, + rawobj_t *mic_token) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_verify_mic); + + return context_handle->mech_type->gm_ops + ->gss_verify_mic(context_handle, + msgcnt, + msg, + iovcnt, + iovs, + mic_token); +} + +__u32 lgss_wrap(struct gss_ctx *context_handle, + rawobj_t *gsshdr, + rawobj_t *msg, + int msg_buflen, + rawobj_t *out_token) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_wrap); + + return context_handle->mech_type->gm_ops + ->gss_wrap(context_handle, gsshdr, msg, msg_buflen, out_token); +} + +__u32 lgss_unwrap(struct gss_ctx *context_handle, + rawobj_t *gsshdr, + rawobj_t *token, + rawobj_t *out_msg) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_unwrap); + + return context_handle->mech_type->gm_ops + ->gss_unwrap(context_handle, gsshdr, token, out_msg); +} + + +__u32 lgss_prep_bulk(struct gss_ctx *context_handle, + struct ptlrpc_bulk_desc *desc) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_prep_bulk); + + return context_handle->mech_type->gm_ops + ->gss_prep_bulk(context_handle, desc); +} + +__u32 lgss_wrap_bulk(struct gss_ctx *context_handle, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, + int adj_nob) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_wrap_bulk); + + return context_handle->mech_type->gm_ops + ->gss_wrap_bulk(context_handle, desc, token, adj_nob); +} + +__u32 lgss_unwrap_bulk(struct gss_ctx *context_handle, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, + int adj_nob) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_unwrap_bulk); + + return context_handle->mech_type->gm_ops + ->gss_unwrap_bulk(context_handle, desc, token, adj_nob); +} + +/* gss_delete_sec_context: free all resources associated with context_handle. + * Note this differs from the RFC 2744-specified prototype in that we don't + * bother returning an output token, since it would never be used anyway. 
*/ + +__u32 lgss_delete_sec_context(struct gss_ctx **context_handle) +{ + struct gss_api_mech *mech; + + if (!*context_handle) + return GSS_S_NO_CONTEXT; + + CDEBUG(D_SEC, "deleting %p\n", *context_handle); + + mech = (*context_handle)->mech_type; + if ((*context_handle)->internal_ctx_id != NULL) { + LASSERT(mech); + LASSERT(mech->gm_ops); + LASSERT(mech->gm_ops->gss_delete_sec_context); + mech->gm_ops->gss_delete_sec_context( + (*context_handle)->internal_ctx_id); + } + if (mech) + lgss_mech_put(mech); + + OBD_FREE_PTR(*context_handle); + *context_handle = NULL; + return GSS_S_COMPLETE; +} + +int lgss_display(struct gss_ctx *ctx, + char *buf, + int bufsize) +{ + LASSERT(ctx); + LASSERT(ctx->mech_type); + LASSERT(ctx->mech_type->gm_ops); + LASSERT(ctx->mech_type->gm_ops->gss_display); + + return ctx->mech_type->gm_ops->gss_display(ctx, buf, bufsize); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_null_mech.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_null_mech.c new file mode 100644 index 0000000000000..6362673743bcf --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_null_mech.c @@ -0,0 +1,220 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (C) 2013, 2015, Trustees of Indiana University + * + * Copyright (c) 2014, Intel Corporation. 
+ * + * Author: Jeremy Filizetti + * Author: Andrew Korty + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" +#include "gss_asn1.h" + +struct null_ctx { + __u64 nc_token; +}; + +static +__u32 gss_import_sec_context_null(rawobj_t *inbuf, struct gss_ctx *gss_context) +{ + struct null_ctx *null_context; + + if (inbuf == NULL || inbuf->data == NULL || + inbuf->len != sizeof(*null_context)) { + CDEBUG(D_SEC, "Invalid input buffer for null context\n"); + return GSS_S_FAILURE; + } + + OBD_ALLOC_PTR(null_context); + if (null_context == NULL) + return GSS_S_FAILURE; + + memcpy(&null_context->nc_token, inbuf->data, inbuf->len); + + gss_context->internal_ctx_id = null_context; + CDEBUG(D_SEC, "successfully imported null context\n"); + + return GSS_S_COMPLETE; +} + +static +__u32 gss_copy_reverse_context_null(struct gss_ctx *gss_context_old, + struct gss_ctx *gss_context_new) +{ + struct null_ctx *null_context_old; + struct null_ctx *null_context_new; + + OBD_ALLOC_PTR(null_context_new); + if (null_context_new == NULL) + return GSS_S_FAILURE; + + null_context_old = gss_context_old->internal_ctx_id; + memcpy(null_context_new, null_context_old, sizeof(*null_context_new)); + gss_context_new->internal_ctx_id = null_context_new; + CDEBUG(D_SEC, "successfully copied reverse null context\n"); + + return GSS_S_COMPLETE; +} + +static +__u32 gss_inquire_context_null(struct gss_ctx *gss_context, + time64_t *endtime) +{ + /* quick timeout for testing purposes */ + *endtime = ktime_get_real_seconds() + 60; + return GSS_S_COMPLETE; +} + +static +__u32 gss_wrap_null(struct gss_ctx *gss_context, rawobj_t *gss_header, + rawobj_t *message, int message_buffer_length, + rawobj_t *token) +{ + return GSS_S_COMPLETE; +} + +static +__u32 gss_unwrap_null(struct gss_ctx *gss_context, rawobj_t *gss_header, + rawobj_t *token, rawobj_t *message) +{ + return GSS_S_COMPLETE; +} + +static +__u32 gss_prep_bulk_null(struct gss_ctx *gss_context, + struct ptlrpc_bulk_desc *desc) +{ + return GSS_S_COMPLETE; +} + +static +__u32 gss_wrap_bulk_null(struct gss_ctx *gss_context, + struct ptlrpc_bulk_desc *desc, rawobj_t *token, + int adj_nob) +{ + return GSS_S_COMPLETE; +} + +static +__u32 gss_unwrap_bulk_null(struct gss_ctx *gss_context, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, int adj_nob) +{ + return GSS_S_COMPLETE; +} + +static +void gss_delete_sec_context_null(void *internal_context) +{ + struct null_ctx *null_context = internal_context; + + OBD_FREE_PTR(null_context); +} + +int gss_display_null(struct gss_ctx *gss_context, char *buf, int bufsize) +{ + return scnprintf(buf, bufsize, "null"); +} + +static +__u32 gss_get_mic_null(struct gss_ctx *gss_context, int message_count, + rawobj_t *messages, int iov_count, struct bio_vec *iovs, + rawobj_t *token) +{ + return GSS_S_COMPLETE; +} + +static +__u32 gss_verify_mic_null(struct gss_ctx *gss_context, int message_count, + rawobj_t *messages, int iov_count, + struct bio_vec *iovs, + rawobj_t *token) +{ + return GSS_S_COMPLETE; +} + +static struct gss_api_ops gss_null_ops = { + .gss_import_sec_context = gss_import_sec_context_null, + .gss_copy_reverse_context = gss_copy_reverse_context_null, + .gss_inquire_context = gss_inquire_context_null, + .gss_get_mic = gss_get_mic_null, + .gss_verify_mic = gss_verify_mic_null, + .gss_wrap = gss_wrap_null, + .gss_unwrap = gss_unwrap_null, + .gss_prep_bulk = gss_prep_bulk_null, + .gss_wrap_bulk = 
gss_wrap_bulk_null, + .gss_unwrap_bulk = gss_unwrap_bulk_null, + .gss_delete_sec_context = gss_delete_sec_context_null, + .gss_display = gss_display_null, +}; + +static struct subflavor_desc gss_null_sfs[] = { + { + .sf_subflavor = SPTLRPC_SUBFLVR_GSSNULL, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_NULL, + .sf_name = "gssnull" + }, +}; + +static struct gss_api_mech gss_null_mech = { + /* .gm_owner uses default NULL value for THIS_MODULE */ + .gm_name = "gssnull", + .gm_oid = (rawobj_t) { + 12, + "\053\006\001\004\001\311\146\215\126\001\000\000" + }, + .gm_ops = &gss_null_ops, + .gm_sf_num = 1, + .gm_sfs = gss_null_sfs, +}; + +int __init init_null_module(void) +{ + int status; + + status = lgss_mech_register(&gss_null_mech); + if (status) + CERROR("Failed to register null gss mechanism!\n"); + + return status; +} + +void cleanup_null_module(void) +{ + lgss_mech_unregister(&gss_null_mech); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_pipefs.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_pipefs.c new file mode 100644 index 0000000000000..4a21cc77a6eea --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_pipefs.c @@ -0,0 +1,1255 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, 2016, Intel Corporation. + * + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/auth_gss.c + * + * RPCSEC_GSS client authentication. + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Dug Song + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include +#include +#include +struct rpc_clnt; /* for rpc_pipefs */ +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +static struct ptlrpc_sec_policy gss_policy_pipefs; +static struct ptlrpc_ctx_ops gss_pipefs_ctxops; + +static int gss_cli_ctx_refresh_pf(struct ptlrpc_cli_ctx *ctx); + +static int gss_sec_pipe_upcall_init(struct gss_sec *gsec) +{ + return 0; +} + +static void gss_sec_pipe_upcall_fini(struct gss_sec *gsec) +{ +} + +/**************************************** + * internal context helpers * + ****************************************/ + +static +struct ptlrpc_cli_ctx *ctx_create_pf(struct ptlrpc_sec *sec, + struct vfs_cred *vcred) +{ + struct gss_cli_ctx *gctx; + int rc; + + OBD_ALLOC_PTR(gctx); + if (gctx == NULL) + return NULL; + + rc = gss_cli_ctx_init_common(sec, &gctx->gc_base, + &gss_pipefs_ctxops, vcred); + if (rc) { + OBD_FREE_PTR(gctx); + return NULL; + } + + return &gctx->gc_base; +} + +static +void ctx_destroy_pf(struct ptlrpc_sec *sec, struct ptlrpc_cli_ctx *ctx) +{ + struct gss_cli_ctx *gctx = ctx2gctx(ctx); + + if (gss_cli_ctx_fini_common(sec, ctx)) + return; + + OBD_FREE_PTR(gctx); + + atomic_dec(&sec->ps_nctx); + sptlrpc_sec_put(sec); +} + +static +void ctx_enhash_pf(struct ptlrpc_cli_ctx *ctx, struct hlist_head *hash) +{ + set_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags); + atomic_inc(&ctx->cc_refcount); + hlist_add_head(&ctx->cc_cache, hash); +} + +/* + * caller must hold spinlock + */ +static +void ctx_unhash_pf(struct ptlrpc_cli_ctx *ctx, struct hlist_head *freelist) +{ + assert_spin_locked(&ctx->cc_sec->ps_lock); + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags)); + LASSERT(!hlist_unhashed(&ctx->cc_cache)); + + clear_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags); + + if (atomic_dec_and_test(&ctx->cc_refcount)) { + __hlist_del(&ctx->cc_cache); + hlist_add_head(&ctx->cc_cache, freelist); + } else { + hlist_del_init(&ctx->cc_cache); + } +} + +/* + * return 1 if the context is dead. 
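+ * if @freelist is given, a dead context is also unhashed onto it.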
+ */ +static +int ctx_check_death_pf(struct ptlrpc_cli_ctx *ctx, + struct hlist_head *freelist) +{ + if (cli_ctx_check_death(ctx)) { + if (freelist) + ctx_unhash_pf(ctx, freelist); + return 1; + } + + return 0; +} + +static inline +int ctx_check_death_locked_pf(struct ptlrpc_cli_ctx *ctx, + struct hlist_head *freelist) +{ + LASSERT(ctx->cc_sec); + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags)); + + return ctx_check_death_pf(ctx, freelist); +} + +static inline +int ctx_match_pf(struct ptlrpc_cli_ctx *ctx, struct vfs_cred *vcred) +{ + /* a little bit optimization for null policy */ + if (!ctx->cc_ops->match) + return 1; + + return ctx->cc_ops->match(ctx, vcred); +} + +static +void ctx_list_destroy_pf(struct hlist_head *head) +{ + struct ptlrpc_cli_ctx *ctx; + + while (!hlist_empty(head)) { + ctx = cfs_hlist_entry(head->first, struct ptlrpc_cli_ctx, + cc_cache); + + LASSERT(atomic_read(&ctx->cc_refcount) == 0); + LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, + &ctx->cc_flags) == 0); + + hlist_del_init(&ctx->cc_cache); + ctx_destroy_pf(ctx->cc_sec, ctx); + } +} + +/**************************************** + * context apis * + ****************************************/ + +static +int gss_cli_ctx_validate_pf(struct ptlrpc_cli_ctx *ctx) +{ + if (ctx_check_death_pf(ctx, NULL)) + return 1; + if (cli_ctx_is_ready(ctx)) + return 0; + return 1; +} + +static +void gss_cli_ctx_die_pf(struct ptlrpc_cli_ctx *ctx, int grace) +{ + LASSERT(ctx->cc_sec); + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + + cli_ctx_expire(ctx); + + spin_lock(&ctx->cc_sec->ps_lock); + + if (test_and_clear_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags)) { + LASSERT(!hlist_unhashed(&ctx->cc_cache)); + LASSERT(atomic_read(&ctx->cc_refcount) > 1); + + hlist_del_init(&ctx->cc_cache); + if (atomic_dec_and_test(&ctx->cc_refcount)) + LBUG(); + } + + spin_unlock(&ctx->cc_sec->ps_lock); +} + +/**************************************** + * reverse context installation * + ****************************************/ + +static inline +unsigned int ctx_hash_index(int hashsize, __u64 key) +{ + return (unsigned int) (key & ((__u64) hashsize - 1)); +} + +static +void gss_sec_ctx_replace_pf(struct gss_sec *gsec, + struct ptlrpc_cli_ctx *new) +{ + struct hlist_node __maybe_unused *pos, *next; + struct gss_sec_pipefs *gsec_pf; + struct ptlrpc_cli_ctx *ctx; + HLIST_HEAD(freelist); + unsigned int hash; + ENTRY; + + gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base); + + hash = ctx_hash_index(gsec_pf->gsp_chash_size, + (__u64) new->cc_vcred.vc_uid); + LASSERT(hash < gsec_pf->gsp_chash_size); + + spin_lock(&gsec->gs_base.ps_lock); + + cfs_hlist_for_each_entry_safe(ctx, pos, next, + &gsec_pf->gsp_chash[hash], cc_cache) { + if (!ctx_match_pf(ctx, &new->cc_vcred)) + continue; + + cli_ctx_expire(ctx); + ctx_unhash_pf(ctx, &freelist); + break; + } + + ctx_enhash_pf(new, &gsec_pf->gsp_chash[hash]); + + spin_unlock(&gsec->gs_base.ps_lock); + + ctx_list_destroy_pf(&freelist); + EXIT; +} + +static +int gss_install_rvs_cli_ctx_pf(struct gss_sec *gsec, + struct ptlrpc_svc_ctx *svc_ctx) +{ + struct vfs_cred vcred; + struct ptlrpc_cli_ctx *cli_ctx; + int rc; + ENTRY; + + vcred.vc_uid = 0; + vcred.vc_gid = 0; + + cli_ctx = ctx_create_pf(&gsec->gs_base, &vcred); + if (!cli_ctx) + RETURN(-ENOMEM); + + rc = gss_copy_rvc_cli_ctx(cli_ctx, svc_ctx); + if (rc) { + ctx_destroy_pf(cli_ctx->cc_sec, cli_ctx); + RETURN(rc); + } + + gss_sec_ctx_replace_pf(gsec, cli_ctx); + RETURN(0); +} + +static +void 
gss_ctx_cache_gc_pf(struct gss_sec_pipefs *gsec_pf, + struct hlist_head *freelist) +{ + struct ptlrpc_sec *sec; + struct ptlrpc_cli_ctx *ctx; + struct hlist_node __maybe_unused *pos; + struct hlist_node *next; + int i; + ENTRY; + + sec = &gsec_pf->gsp_base.gs_base; + + CDEBUG(D_SEC, "do gc on sec %s@%p\n", sec->ps_policy->sp_name, sec); + + for (i = 0; i < gsec_pf->gsp_chash_size; i++) { + cfs_hlist_for_each_entry_safe(ctx, pos, next, + &gsec_pf->gsp_chash[i], cc_cache) + ctx_check_death_locked_pf(ctx, freelist); + } + + sec->ps_gc_next = ktime_get_real_seconds() + sec->ps_gc_interval; + EXIT; +} + +static +struct ptlrpc_sec* gss_sec_create_pf(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx, + struct sptlrpc_flavor *sf) +{ + struct gss_sec_pipefs *gsec_pf; + int alloc_size, hash_size, i; + ENTRY; + +#define GSS_SEC_PIPEFS_CTX_HASH_SIZE (32) + + if (ctx || + sf->sf_flags & (PTLRPC_SEC_FL_ROOTONLY | PTLRPC_SEC_FL_REVERSE)) + hash_size = 1; + else + hash_size = GSS_SEC_PIPEFS_CTX_HASH_SIZE; + + alloc_size = sizeof(*gsec_pf) + + sizeof(struct hlist_head) * hash_size; + + OBD_ALLOC(gsec_pf, alloc_size); + if (!gsec_pf) + RETURN(NULL); + + gsec_pf->gsp_chash_size = hash_size; + for (i = 0; i < hash_size; i++) + INIT_HLIST_HEAD(&gsec_pf->gsp_chash[i]); + + if (gss_sec_create_common(&gsec_pf->gsp_base, &gss_policy_pipefs, + imp, ctx, sf)) + goto err_free; + + if (ctx == NULL) { + if (gss_sec_pipe_upcall_init(&gsec_pf->gsp_base)) + goto err_destroy; + } else { + if (gss_install_rvs_cli_ctx_pf(&gsec_pf->gsp_base, ctx)) + goto err_destroy; + } + + RETURN(&gsec_pf->gsp_base.gs_base); + +err_destroy: + gss_sec_destroy_common(&gsec_pf->gsp_base); +err_free: + OBD_FREE(gsec_pf, alloc_size); + RETURN(NULL); +} + +static +void gss_sec_destroy_pf(struct ptlrpc_sec *sec) +{ + struct gss_sec_pipefs *gsec_pf; + struct gss_sec *gsec; + + CWARN("destroy %s@%p\n", sec->ps_policy->sp_name, sec); + + gsec = container_of(sec, struct gss_sec, gs_base); + gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base); + + LASSERT(gsec_pf->gsp_chash); + LASSERT(gsec_pf->gsp_chash_size); + + gss_sec_pipe_upcall_fini(gsec); + + gss_sec_destroy_common(gsec); + + OBD_FREE(gsec, sizeof(*gsec_pf) + + sizeof(struct hlist_head) * gsec_pf->gsp_chash_size); +} + +static +struct ptlrpc_cli_ctx * gss_sec_lookup_ctx_pf(struct ptlrpc_sec *sec, + struct vfs_cred *vcred, + int create, int remove_dead) +{ + struct gss_sec *gsec; + struct gss_sec_pipefs *gsec_pf; + struct ptlrpc_cli_ctx *ctx = NULL, *new = NULL; + struct hlist_head *hash_head; + struct hlist_node __maybe_unused *pos, *next; + unsigned int hash, gc = 0, found = 0; + HLIST_HEAD(freelist); + ENTRY; + + might_sleep(); + + gsec = container_of(sec, struct gss_sec, gs_base); + gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base); + + hash = ctx_hash_index(gsec_pf->gsp_chash_size, + (__u64) vcred->vc_uid); + hash_head = &gsec_pf->gsp_chash[hash]; + LASSERT(hash < gsec_pf->gsp_chash_size); + +retry: + spin_lock(&sec->ps_lock); + + /* gc_next == 0 means never do gc */ + if (remove_dead && sec->ps_gc_next && + (ktime_get_real_seconds() > sec->ps_gc_next)) { + gss_ctx_cache_gc_pf(gsec_pf, &freelist); + gc = 1; + } + + cfs_hlist_for_each_entry_safe(ctx, pos, next, hash_head, cc_cache) { + if (gc == 0 && + ctx_check_death_locked_pf(ctx, + remove_dead ? 
&freelist : NULL)) + continue; + + if (ctx_match_pf(ctx, vcred)) { + found = 1; + break; + } + } + + if (found) { + if (new && new != ctx) { + /* lost the race, just free it */ + hlist_add_head(&new->cc_cache, &freelist); + new = NULL; + } + + /* hot node, move to head */ + if (hash_head->first != &ctx->cc_cache) { + __hlist_del(&ctx->cc_cache); + hlist_add_head(&ctx->cc_cache, hash_head); + } + } else { + /* don't allocate for reverse sec */ + if (sec_is_reverse(sec)) { + spin_unlock(&sec->ps_lock); + RETURN(NULL); + } + + if (new) { + ctx_enhash_pf(new, hash_head); + ctx = new; + } else if (create) { + spin_unlock(&sec->ps_lock); + new = ctx_create_pf(sec, vcred); + if (new) { + clear_bit(PTLRPC_CTX_NEW_BIT, &new->cc_flags); + goto retry; + } + } else { + ctx = NULL; + } + } + + /* hold a ref */ + if (ctx) + atomic_inc(&ctx->cc_refcount); + + spin_unlock(&sec->ps_lock); + + /* the allocator of the context must give the first push to refresh */ + if (new) { + LASSERT(new == ctx); + gss_cli_ctx_refresh_pf(new); + } + + ctx_list_destroy_pf(&freelist); + RETURN(ctx); +} + +static +void gss_sec_release_ctx_pf(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx, + int sync) +{ + LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags) == 0); + LASSERT(hlist_unhashed(&ctx->cc_cache)); + + /* if required async, we must clear the UPTODATE bit to prevent extra + * rpcs during destroy procedure. */ + if (!sync) + clear_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags); + + /* destroy this context */ + ctx_destroy_pf(sec, ctx); +} + +/* + * @uid: which user. "-1" means flush all. + * @grace: mark context DEAD, allow graceful destroy like notify + * server side, etc. + * @force: also flush busy entries. + * + * return the number of busy context encountered. + * + * In any cases, never touch "eternal" contexts. 
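+ *
+ * e.g. a hypothetical caller flushing one user's contexts gracefully,
+ * without forcing busy entries:
+ *	busy = gss_sec_flush_ctx_cache_pf(sec, uid, 1, 0);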
+ */ +static +int gss_sec_flush_ctx_cache_pf(struct ptlrpc_sec *sec, + uid_t uid, + int grace, int force) +{ + struct gss_sec *gsec; + struct gss_sec_pipefs *gsec_pf; + struct ptlrpc_cli_ctx *ctx; + struct hlist_node __maybe_unused *pos, *next; + HLIST_HEAD(freelist); + int i, busy = 0; + ENTRY; + + might_sleep_if(grace); + + gsec = container_of(sec, struct gss_sec, gs_base); + gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base); + + spin_lock(&sec->ps_lock); + for (i = 0; i < gsec_pf->gsp_chash_size; i++) { + cfs_hlist_for_each_entry_safe(ctx, pos, next, + &gsec_pf->gsp_chash[i], + cc_cache) { + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + + if (uid != -1 && uid != ctx->cc_vcred.vc_uid) + continue; + + if (atomic_read(&ctx->cc_refcount) > 1) { + busy++; + if (!force) + continue; + + CWARN("flush busy(%d) ctx %p(%u->%s) by force, " + "grace %d\n", + atomic_read(&ctx->cc_refcount), + ctx, ctx->cc_vcred.vc_uid, + sec2target_str(ctx->cc_sec), grace); + } + ctx_unhash_pf(ctx, &freelist); + + set_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags); + if (!grace) + clear_bit(PTLRPC_CTX_UPTODATE_BIT, + &ctx->cc_flags); + } + } + spin_unlock(&sec->ps_lock); + + ctx_list_destroy_pf(&freelist); + RETURN(busy); +} + +/**************************************** + * service apis * + ****************************************/ + +static +int gss_svc_accept_pf(struct ptlrpc_request *req) +{ + return gss_svc_accept(&gss_policy_pipefs, req); +} + +static +int gss_svc_install_rctx_pf(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx) +{ + struct ptlrpc_sec *sec; + int rc; + + sec = sptlrpc_import_sec_ref(imp); + LASSERT(sec); + rc = gss_install_rvs_cli_ctx_pf(sec2gsec(sec), ctx); + + sptlrpc_sec_put(sec); + return rc; +} + +/**************************************** + * rpc_pipefs definitions * + ****************************************/ + +#define LUSTRE_PIPE_ROOT "/lustre" +#define LUSTRE_PIPE_KRB5 LUSTRE_PIPE_ROOT"/krb5" + +struct gss_upcall_msg_data { + __u32 gum_seq; + __u32 gum_uid; + __u32 gum_gid; + __u32 gum_svc; /* MDS/OSS... 
*/ + __u64 gum_nid; /* peer NID */ + __u8 gum_obd[64]; /* client obd name */ +}; + +struct gss_upcall_msg { + struct rpc_pipe_msg gum_base; + atomic_t gum_refcount; + struct list_head gum_list; + __u32 gum_mechidx; + struct gss_sec *gum_gsec; + struct gss_cli_ctx *gum_gctx; + struct gss_upcall_msg_data gum_data; +}; + +static atomic_t upcall_seq = ATOMIC_INIT(0); + +static inline +__u32 upcall_get_sequence(void) +{ + return (__u32) atomic_inc_return(&upcall_seq); +} + +enum mech_idx_t { + MECH_KRB5 = 0, + MECH_MAX +}; + +static inline +__u32 mech_name2idx(const char *name) +{ + LASSERT(!strcmp(name, "krb5")); + return MECH_KRB5; +} + +/* pipefs dentries for each mechanisms */ +static struct dentry *de_pipes[MECH_MAX] = { NULL, }; +/* all upcall messgaes linked here */ +static struct list_head upcall_lists[MECH_MAX]; +/* and protected by this */ +static spinlock_t upcall_locks[MECH_MAX]; + +static inline +void upcall_list_lock(int idx) +{ + spin_lock(&upcall_locks[idx]); +} + +static inline +void upcall_list_unlock(int idx) +{ + spin_unlock(&upcall_locks[idx]); +} + +static +void upcall_msg_enlist(struct gss_upcall_msg *msg) +{ + __u32 idx = msg->gum_mechidx; + + upcall_list_lock(idx); + list_add(&msg->gum_list, &upcall_lists[idx]); + upcall_list_unlock(idx); +} + +static +void upcall_msg_delist(struct gss_upcall_msg *msg) +{ + __u32 idx = msg->gum_mechidx; + + upcall_list_lock(idx); + list_del_init(&msg->gum_list); + upcall_list_unlock(idx); +} + +/**************************************** + * rpc_pipefs upcall helpers * + ****************************************/ + +static +void gss_release_msg(struct gss_upcall_msg *gmsg) +{ + ENTRY; + LASSERT(atomic_read(&gmsg->gum_refcount) > 0); + + if (!atomic_dec_and_test(&gmsg->gum_refcount)) { + EXIT; + return; + } + + if (gmsg->gum_gctx) { + sptlrpc_cli_ctx_wakeup(&gmsg->gum_gctx->gc_base); + sptlrpc_cli_ctx_put(&gmsg->gum_gctx->gc_base, 1); + gmsg->gum_gctx = NULL; + } + + LASSERT(list_empty(&gmsg->gum_list)); + LASSERT(list_empty(&gmsg->gum_base.list)); + OBD_FREE_PTR(gmsg); + EXIT; +} + +static +void gss_unhash_msg_nolock(struct gss_upcall_msg *gmsg) +{ + __u32 idx = gmsg->gum_mechidx; + + LASSERT(idx < MECH_MAX); + assert_spin_locked(&upcall_locks[idx]); + + if (list_empty(&gmsg->gum_list)) + return; + + list_del_init(&gmsg->gum_list); + LASSERT(atomic_read(&gmsg->gum_refcount) > 1); + atomic_dec(&gmsg->gum_refcount); +} + +static +void gss_unhash_msg(struct gss_upcall_msg *gmsg) +{ + __u32 idx = gmsg->gum_mechidx; + + LASSERT(idx < MECH_MAX); + upcall_list_lock(idx); + gss_unhash_msg_nolock(gmsg); + upcall_list_unlock(idx); +} + +static +void gss_msg_fail_ctx(struct gss_upcall_msg *gmsg) +{ + if (gmsg->gum_gctx) { + struct ptlrpc_cli_ctx *ctx = &gmsg->gum_gctx->gc_base; + + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + sptlrpc_cli_ctx_expire(ctx); + set_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags); + } +} + +static +struct gss_upcall_msg * gss_find_upcall(__u32 mechidx, __u32 seq) +{ + struct gss_upcall_msg *gmsg; + + upcall_list_lock(mechidx); + list_for_each_entry(gmsg, &upcall_lists[mechidx], gum_list) { + if (gmsg->gum_data.gum_seq != seq) + continue; + + LASSERT(atomic_read(&gmsg->gum_refcount) > 0); + LASSERT(gmsg->gum_mechidx == mechidx); + + atomic_inc(&gmsg->gum_refcount); + upcall_list_unlock(mechidx); + return gmsg; + } + upcall_list_unlock(mechidx); + return NULL; +} + +static +int simple_get_bytes(char **buf, __u32 *buflen, void *res, __u32 reslen) +{ + if (*buflen < reslen) { + CERROR("shorter buflen than needed: %u < %u\n", + 
*buflen, reslen); + return -EINVAL; + } + + memcpy(res, *buf, reslen); + *buf += reslen; + *buflen -= reslen; + return 0; +} + +/**************************************** + * rpc_pipefs apis * + ****************************************/ + +static +ssize_t gss_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, + char *dst, size_t buflen) +{ + char *data = (char *)msg->data + msg->copied; + ssize_t mlen = msg->len; + ssize_t left; + ENTRY; + + if (mlen > buflen) + mlen = buflen; + left = copy_to_user(dst, data, mlen); + if (left < 0) { + msg->errno = left; + RETURN(left); + } + mlen -= left; + msg->copied += mlen; + msg->errno = 0; + RETURN(mlen); +} + +static +ssize_t gss_pipe_downcall(struct file *filp, const char *src, size_t mlen) +{ + struct rpc_inode *rpci = RPC_I(file_inode(filp)); + struct gss_upcall_msg *gss_msg; + struct ptlrpc_cli_ctx *ctx; + struct gss_cli_ctx *gctx = NULL; + char *buf, *data; + int datalen; + int timeout, rc; + __u32 mechidx, seq, gss_err; + ENTRY; + + mechidx = (__u32) (long) rpci->private; + LASSERT(mechidx < MECH_MAX); + + OBD_ALLOC(buf, mlen); + if (!buf) + RETURN(-ENOMEM); + + if (copy_from_user(buf, src, mlen)) { + CERROR("failed copy user space data\n"); + GOTO(out_free, rc = -EFAULT); + } + data = buf; + datalen = mlen; + + /* data passed down format: + * - seq + * - timeout + * - gc_win / error + * - wire_ctx (rawobj) + * - mech_ctx (rawobj) + */ + if (simple_get_bytes(&data, &datalen, &seq, sizeof(seq))) { + CERROR("fail to get seq\n"); + GOTO(out_free, rc = -EFAULT); + } + + gss_msg = gss_find_upcall(mechidx, seq); + if (!gss_msg) { + CERROR("upcall %u has aborted earlier\n", seq); + GOTO(out_free, rc = -EINVAL); + } + + gss_unhash_msg(gss_msg); + gctx = gss_msg->gum_gctx; + LASSERT(gctx); + LASSERT(atomic_read(&gctx->gc_base.cc_refcount) > 0); + + /* timeout is not in use for now */ + if (simple_get_bytes(&data, &datalen, &timeout, sizeof(timeout))) + GOTO(out_msg, rc = -EFAULT); + + /* lgssd signal an error by gc_win == 0 */ + if (simple_get_bytes(&data, &datalen, &gctx->gc_win, + sizeof(gctx->gc_win))) + GOTO(out_msg, rc = -EFAULT); + + if (gctx->gc_win == 0) { + /* followed by: + * - rpc error + * - gss error + */ + if (simple_get_bytes(&data, &datalen, &rc, sizeof(rc))) + GOTO(out_msg, rc = -EFAULT); + if (simple_get_bytes(&data, &datalen, &gss_err,sizeof(gss_err))) + GOTO(out_msg, rc = -EFAULT); + + if (rc == 0 && gss_err == GSS_S_COMPLETE) { + CWARN("both rpc & gss error code not set\n"); + rc = -EPERM; + } + } else { + rawobj_t tmpobj; + + /* handle */ + if (rawobj_extract_local(&tmpobj, (__u32 **) &data, &datalen)) + GOTO(out_msg, rc = -EFAULT); + if (rawobj_dup(&gctx->gc_handle, &tmpobj)) + GOTO(out_msg, rc = -ENOMEM); + + /* mechctx */ + if (rawobj_extract_local(&tmpobj, (__u32 **) &data, &datalen)) + GOTO(out_msg, rc = -EFAULT); + gss_err = lgss_import_sec_context(&tmpobj, + gss_msg->gum_gsec->gs_mech, + &gctx->gc_mechctx); + rc = 0; + } + + if (likely(rc == 0 && gss_err == GSS_S_COMPLETE)) { + gss_cli_ctx_uptodate(gctx); + } else { + ctx = &gctx->gc_base; + sptlrpc_cli_ctx_expire(ctx); + if (rc != -ERESTART || gss_err != GSS_S_COMPLETE) + set_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags); + + CERROR("refresh ctx %p(uid %d) failed: %d/0x%08x: %s\n", + ctx, ctx->cc_vcred.vc_uid, rc, gss_err, + test_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags) ? 
+ "fatal error" : "non-fatal"); + } + + rc = mlen; + +out_msg: + gss_release_msg(gss_msg); + +out_free: + OBD_FREE(buf, mlen); + /* FIXME + * hack pipefs: always return asked length unless all following + * downcalls might be messed up. */ + rc = mlen; + RETURN(rc); +} + +static +void gss_pipe_destroy_msg(struct rpc_pipe_msg *msg) +{ + struct gss_upcall_msg *gmsg; + struct gss_upcall_msg_data *gumd; + static time64_t ratelimit; + ENTRY; + + LASSERT(list_empty(&msg->list)); + + /* normally errno is >= 0 */ + if (msg->errno >= 0) { + EXIT; + return; + } + + gmsg = container_of(msg, struct gss_upcall_msg, gum_base); + gumd = &gmsg->gum_data; + LASSERT(atomic_read(&gmsg->gum_refcount) > 0); + + CERROR("failed msg %p (seq %u, uid %u, svc %u, nid %#llx, obd %.*s): " + "errno %d\n", msg, gumd->gum_seq, gumd->gum_uid, gumd->gum_svc, + gumd->gum_nid, (int) sizeof(gumd->gum_obd), + gumd->gum_obd, msg->errno); + + atomic_inc(&gmsg->gum_refcount); + gss_unhash_msg(gmsg); + if (msg->errno == -ETIMEDOUT || msg->errno == -EPIPE) { + time64_t now = ktime_get_real_seconds(); + + if (now > ratelimit) { + CWARN("upcall timed out, is lgssd running?\n"); + ratelimit = now + 15; + } + } + gss_msg_fail_ctx(gmsg); + gss_release_msg(gmsg); + EXIT; +} + +static +void gss_pipe_release(struct inode *inode) +{ + struct rpc_inode *rpci = RPC_I(inode); + __u32 idx; + ENTRY; + + idx = (__u32) (long) rpci->private; + LASSERT(idx < MECH_MAX); + + upcall_list_lock(idx); + while (!list_empty(&upcall_lists[idx])) { + struct gss_upcall_msg *gmsg; + struct gss_upcall_msg_data *gumd; + + gmsg = list_entry(upcall_lists[idx].next, + struct gss_upcall_msg, gum_list); + gumd = &gmsg->gum_data; + LASSERT(list_empty(&gmsg->gum_base.list)); + + CERROR("failing remaining msg %p:seq %u, uid %u, svc %u, " + "nid %#llx, obd %.*s\n", gmsg, + gumd->gum_seq, gumd->gum_uid, gumd->gum_svc, + gumd->gum_nid, (int) sizeof(gumd->gum_obd), + gumd->gum_obd); + + gmsg->gum_base.errno = -EPIPE; + atomic_inc(&gmsg->gum_refcount); + gss_unhash_msg_nolock(gmsg); + + gss_msg_fail_ctx(gmsg); + + upcall_list_unlock(idx); + gss_release_msg(gmsg); + upcall_list_lock(idx); + } + upcall_list_unlock(idx); + EXIT; +} + +static struct rpc_pipe_ops gss_upcall_ops = { + .upcall = gss_pipe_upcall, + .downcall = gss_pipe_downcall, + .destroy_msg = gss_pipe_destroy_msg, + .release_pipe = gss_pipe_release, +}; + +/**************************************** + * upcall helper functions * + ****************************************/ + +static +int gss_ctx_refresh_pf(struct ptlrpc_cli_ctx *ctx) +{ + struct obd_import *imp; + struct gss_sec *gsec; + struct gss_upcall_msg *gmsg; + int rc = 0; + ENTRY; + + might_sleep(); + + LASSERT(ctx->cc_sec); + LASSERT(ctx->cc_sec->ps_import); + LASSERT(ctx->cc_sec->ps_import->imp_obd); + + imp = ctx->cc_sec->ps_import; + if (!imp->imp_connection) { + CERROR("import has no connection set\n"); + RETURN(-EINVAL); + } + + gsec = container_of(ctx->cc_sec, struct gss_sec, gs_base); + + OBD_ALLOC_PTR(gmsg); + if (!gmsg) + RETURN(-ENOMEM); + + /* initialize pipefs base msg */ + INIT_LIST_HEAD(&gmsg->gum_base.list); + gmsg->gum_base.data = &gmsg->gum_data; + gmsg->gum_base.len = sizeof(gmsg->gum_data); + gmsg->gum_base.copied = 0; + gmsg->gum_base.errno = 0; + + /* init upcall msg */ + atomic_set(&gmsg->gum_refcount, 1); + gmsg->gum_mechidx = mech_name2idx(gsec->gs_mech->gm_name); + gmsg->gum_gsec = gsec; + gmsg->gum_gctx = container_of(sptlrpc_cli_ctx_get(ctx), + struct gss_cli_ctx, gc_base); + gmsg->gum_data.gum_seq = upcall_get_sequence(); + 
gmsg->gum_data.gum_uid = ctx->cc_vcred.vc_uid; + gmsg->gum_data.gum_gid = 0; /* not used for now */ + gmsg->gum_data.gum_svc = import_to_gss_svc(imp); + gmsg->gum_data.gum_nid = imp->imp_connection->c_peer.nid; + strlcpy(gmsg->gum_data.gum_obd, imp->imp_obd->obd_name, + sizeof(gmsg->gum_data.gum_obd)); + + /* This only could happen when sysadmin set it dead/expired + * using lctl by force. */ + if (ctx->cc_flags & PTLRPC_CTX_STATUS_MASK) { + CWARN("ctx %p(%u->%s) was set flags %lx unexpectedly\n", + ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec), + ctx->cc_flags); + + LASSERT(!(ctx->cc_flags & PTLRPC_CTX_UPTODATE)); + ctx->cc_flags |= PTLRPC_CTX_DEAD | PTLRPC_CTX_ERROR; + + rc = -EIO; + goto err_free; + } + + upcall_msg_enlist(gmsg); + + rc = rpc_queue_upcall(de_pipes[gmsg->gum_mechidx]->d_inode, + &gmsg->gum_base); + if (rc) { + CERROR("rpc_queue_upcall failed: %d\n", rc); + + upcall_msg_delist(gmsg); + goto err_free; + } + + RETURN(0); +err_free: + OBD_FREE_PTR(gmsg); + RETURN(rc); +} + +static +int gss_cli_ctx_refresh_pf(struct ptlrpc_cli_ctx *ctx) +{ + /* if we are refreshing for root, also update the reverse + * handle index, do not confuse reverse contexts. */ + if (ctx->cc_vcred.vc_uid == 0) { + struct gss_sec *gsec; + + gsec = container_of(ctx->cc_sec, struct gss_sec, gs_base); + gsec->gs_rvs_hdl = gss_get_next_ctx_index(); + } + + return gss_ctx_refresh_pf(ctx); +} + +/**************************************** + * lustre gss pipefs policy * + ****************************************/ + +static struct ptlrpc_ctx_ops gss_pipefs_ctxops = { + .match = gss_cli_ctx_match, + .refresh = gss_cli_ctx_refresh_pf, + .validate = gss_cli_ctx_validate_pf, + .die = gss_cli_ctx_die_pf, + .sign = gss_cli_ctx_sign, + .verify = gss_cli_ctx_verify, + .seal = gss_cli_ctx_seal, + .unseal = gss_cli_ctx_unseal, + .wrap_bulk = gss_cli_ctx_wrap_bulk, + .unwrap_bulk = gss_cli_ctx_unwrap_bulk, +}; + +static struct ptlrpc_sec_cops gss_sec_pipefs_cops = { + .create_sec = gss_sec_create_pf, + .destroy_sec = gss_sec_destroy_pf, + .kill_sec = gss_sec_kill, + .lookup_ctx = gss_sec_lookup_ctx_pf, + .release_ctx = gss_sec_release_ctx_pf, + .flush_ctx_cache = gss_sec_flush_ctx_cache_pf, + .install_rctx = gss_sec_install_rctx, + .alloc_reqbuf = gss_alloc_reqbuf, + .free_reqbuf = gss_free_reqbuf, + .alloc_repbuf = gss_alloc_repbuf, + .free_repbuf = gss_free_repbuf, + .enlarge_reqbuf = gss_enlarge_reqbuf, +}; + +static struct ptlrpc_sec_sops gss_sec_pipefs_sops = { + .accept = gss_svc_accept_pf, + .invalidate_ctx = gss_svc_invalidate_ctx, + .alloc_rs = gss_svc_alloc_rs, + .authorize = gss_svc_authorize, + .free_rs = gss_svc_free_rs, + .free_ctx = gss_svc_free_ctx, + .unwrap_bulk = gss_svc_unwrap_bulk, + .wrap_bulk = gss_svc_wrap_bulk, + .install_rctx = gss_svc_install_rctx_pf, +}; + +static struct ptlrpc_sec_policy gss_policy_pipefs = { + .sp_owner = THIS_MODULE, + .sp_name = "gss.pipefs", + .sp_policy = SPTLRPC_POLICY_GSS_PIPEFS, + .sp_cops = &gss_sec_pipefs_cops, + .sp_sops = &gss_sec_pipefs_sops, +}; + +static +int __init gss_init_pipefs_upcall(void) +{ + struct dentry *de; + + /* pipe dir */ + de = rpc_mkdir(LUSTRE_PIPE_ROOT, NULL); + if (IS_ERR(de) && PTR_ERR(de) != -EEXIST) { + CERROR("Failed to create gss pipe dir: %ld\n", PTR_ERR(de)); + return PTR_ERR(de); + } + + /* FIXME hack pipefs: dput will sometimes cause oops during module + * unload and lgssd close the pipe fds. 
*/ + + /* krb5 mechanism */ + de = rpc_mkpipe(LUSTRE_PIPE_KRB5, (void *) MECH_KRB5, &gss_upcall_ops, + RPC_PIPE_WAIT_FOR_OPEN); + if (!de || IS_ERR(de)) { + CERROR("failed to make rpc_pipe %s: %ld\n", + LUSTRE_PIPE_KRB5, PTR_ERR(de)); + rpc_rmdir(LUSTRE_PIPE_ROOT); + return PTR_ERR(de); + } + + de_pipes[MECH_KRB5] = de; + INIT_LIST_HEAD(&upcall_lists[MECH_KRB5]); + spin_lock_init(&upcall_locks[MECH_KRB5]); + + return 0; +} + +static +void __exit gss_exit_pipefs_upcall(void) +{ + __u32 i; + + for (i = 0; i < MECH_MAX; i++) { + LASSERT(list_empty(&upcall_lists[i])); + + /* dput pipe dentry here might cause lgssd oops. */ + de_pipes[i] = NULL; + } + + rpc_unlink(LUSTRE_PIPE_KRB5); + rpc_rmdir(LUSTRE_PIPE_ROOT); +} + +int __init gss_init_pipefs(void) +{ + int rc; + + rc = gss_init_pipefs_upcall(); + if (rc) + return rc; + + rc = sptlrpc_register_policy(&gss_policy_pipefs); + if (rc) { + gss_exit_pipefs_upcall(); + return rc; + } + + return 0; +} + +void __exit gss_exit_pipefs(void) +{ + gss_exit_pipefs_upcall(); + sptlrpc_unregister_policy(&gss_policy_pipefs); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_rawobj.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_rawobj.c new file mode 100644 index 0000000000000..a6237909b7c5d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_rawobj.c @@ -0,0 +1,240 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ptlrpc/gss/gss_rawobj.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include + +#include +#include +#include +#include + +#include "gss_internal.h" + +int rawobj_empty(rawobj_t *obj) +{ + LASSERT(equi(obj->len, obj->data)); + return (obj->len == 0); +} + +int rawobj_alloc(rawobj_t *obj, char *buf, int len) +{ + LASSERT(obj); + LASSERT(len >= 0); + + obj->len = len; + if (len) { + OBD_ALLOC_LARGE(obj->data, len); + if (!obj->data) { + obj->len = 0; + RETURN(-ENOMEM); + } + memcpy(obj->data, buf, len); + } else + obj->data = NULL; + return 0; +} + +void rawobj_free(rawobj_t *obj) +{ + LASSERT(obj); + + if (obj->len) { + LASSERT(obj->data); + OBD_FREE_LARGE(obj->data, obj->len); + obj->len = 0; + obj->data = NULL; + } else + LASSERT(!obj->data); +} + +int rawobj_equal(rawobj_t *a, rawobj_t *b) +{ + LASSERT(a && b); + + return (a->len == b->len && + (!a->len || !memcmp(a->data, b->data, a->len))); +} + +int rawobj_dup(rawobj_t *dest, rawobj_t *src) +{ + LASSERT(src && dest); + + dest->len = src->len; + if (dest->len) { + OBD_ALLOC_LARGE(dest->data, dest->len); + if (!dest->data) { + dest->len = 0; + return -ENOMEM; + } + memcpy(dest->data, src->data, dest->len); + } else + dest->data = NULL; + return 0; +} + +int rawobj_serialize(rawobj_t *obj, __u32 **buf, __u32 *buflen) +{ + __u32 len; + + LASSERT(obj); + LASSERT(buf); + LASSERT(buflen); + + len = round_up(obj->len, 4); + + if (*buflen < 4 + len) { + CERROR("shorter buflen than needed: %u < %u\n", + *buflen, 4 + len); + return -EINVAL; + } + + *(*buf)++ = cpu_to_le32(obj->len); + memcpy(*buf, obj->data, obj->len); + *buf += (len >> 2); + *buflen -= (4 + len); + + return 0; +} + +static int __rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen, + int alloc, int local) +{ + __u32 len; + + if (*buflen < sizeof(__u32)) { + CERROR("too short buflen: %u\n", *buflen); + return -EINVAL; + } + + obj->len = *(*buf)++; + if (!local) + obj->len = le32_to_cpu(obj->len); + *buflen -= sizeof(__u32); + + if (!obj->len) { + obj->data = NULL; + return 0; + } + + len = local ? 
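The wire layout produced by rawobj_serialize() above is a 32-bit length word followed by the payload padded up to a 4-byte multiple. A standalone sketch of that layout, assuming a little-endian host for brevity (the kernel code spells the length word as cpu_to_le32(obj->len)):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* round_up(len, 4), as used by rawobj_serialize()/__rawobj_extract() */
static uint32_t pad4(uint32_t len)
{
	return (len + 3) & ~3u;
}

int main(void)
{
	const char payload[] = "handle";	/* 6 bytes -> 8 padded */
	uint32_t len = sizeof(payload) - 1;
	unsigned char wire[4 + 8] = { 0 };

	memcpy(wire, &len, 4);			/* length word */
	memcpy(wire + 4, payload, len);		/* payload, zero padded */
	printf("total = %u bytes (4 + %u)\n", 4 + pad4(len), pad4(len));
	return 0;
}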
obj->len : round_up(obj->len, 4); + if (*buflen < len) { + CERROR("shorter buflen than object size: %u < %u\n", + *buflen, len); + obj->len = 0; + return -EINVAL; + } + + if (!alloc) + obj->data = (__u8 *) *buf; + else { + OBD_ALLOC_LARGE(obj->data, obj->len); + if (!obj->data) { + CERROR("fail to alloc %u bytes\n", obj->len); + obj->len = 0; + return -ENOMEM; + } + memcpy(obj->data, *buf, obj->len); + } + + *((char **)buf) += len; + *buflen -= len; + + return 0; +} + +int rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen) +{ + return __rawobj_extract(obj, buf, buflen, 0, 0); +} + +int rawobj_extract_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen) +{ + return __rawobj_extract(obj, buf, buflen, 1, 0); +} + +int rawobj_extract_local(rawobj_t *obj, __u32 **buf, __u32 *buflen) +{ + return __rawobj_extract(obj, buf, buflen, 0, 1); +} + +int rawobj_extract_local_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen) +{ + return __rawobj_extract(obj, buf, buflen, 1, 1); +} + +int rawobj_from_netobj(rawobj_t *rawobj, netobj_t *netobj) +{ + rawobj->len = netobj->len; + rawobj->data = netobj->data; + return 0; +} + +int rawobj_from_netobj_alloc(rawobj_t *rawobj, netobj_t *netobj) +{ + rawobj->len = 0; + rawobj->data = NULL; + + if (netobj->len == 0) + return 0; + + OBD_ALLOC_LARGE(rawobj->data, netobj->len); + if (rawobj->data == NULL) + return -ENOMEM; + + rawobj->len = netobj->len; + memcpy(rawobj->data, netobj->data, netobj->len); + return 0; +} + +/**************************************** + * misc more * + ****************************************/ + +int buffer_extract_bytes(const void **buf, __u32 *buflen, + void *res, __u32 reslen) +{ + if (*buflen < reslen) { + CERROR("shorter buflen than expected: %u < %u\n", + *buflen, reslen); + return -EINVAL; + } + + memcpy(res, *buf, reslen); + *buf += reslen; + *buflen -= reslen; + return 0; +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_sk_mech.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_sk_mech.c new file mode 100644 index 0000000000000..1059df722fe37 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_sk_mech.c @@ -0,0 +1,960 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (C) 2013, 2015, Trustees of Indiana University + * + * Copyright (c) 2014, 2016, Intel Corporation. + * + * Author: Jeremy Filizetti + * Author: Andrew Korty + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "gss_err.h" +#include "gss_crypto.h" +#include "gss_internal.h" +#include "gss_api.h" +#include "gss_asn1.h" + +#define SK_INTERFACE_VERSION 1 +#define SK_MSG_VERSION 1 +#define SK_MIN_SIZE 8 +#define SK_IV_SIZE 16 + +/* Starting number for reverse contexts. 
It is critical to security
+ * that reverse contexts use a different range of numbers than regular
+ * contexts because they are using the same key. Therefore the IV/nonce
+ * combination must be unique for them. To accomplish this reverse contexts
+ * use the negative range of a 64-bit number and regular contexts use the
+ * positive range. If the same IV/nonce combination were reused it would leak
+ * information about the plaintext. */
+#define SK_IV_REV_START (1ULL << 63)
+
+struct sk_ctx {
+ enum cfs_crypto_crypt_alg sc_crypt;
+ enum cfs_crypto_hash_alg sc_hmac;
+ __u32 sc_expire;
+ __u32 sc_host_random;
+ __u32 sc_peer_random;
+ atomic64_t sc_iv;
+ rawobj_t sc_hmac_key;
+ struct gss_keyblock sc_session_kb;
+};
+
+struct sk_hdr {
+ __u64 skh_version;
+ __u64 skh_iv;
+} __attribute__((packed));
+
+/* The format of SK wire data is similar to that of RFC3686 ESP Payload
+ * (section 3) except instead of just an IV there is a struct sk_hdr.
+ * ---------------------------------------------------------------------
+ * | struct sk_hdr | ciphertext (variable size) | HMAC (variable size) |
+ * --------------------------------------------------------------------- */
+struct sk_wire {
+ rawobj_t skw_header;
+ rawobj_t skw_cipher;
+ rawobj_t skw_hmac;
+};
+
+static inline unsigned long sk_block_mask(unsigned long len, int blocksize)
+{
+ return (len + blocksize - 1) & (~(blocksize - 1));
+}
+
+static int sk_fill_header(struct sk_ctx *skc, struct sk_hdr *skh)
+{
+ __u64 tmp_iv;
+ skh->skh_version = be64_to_cpu(SK_MSG_VERSION);
+
+ /* Always using inc_return so we don't use our initial numbers,
+ * which are the reuse-detecting numbers */
+ tmp_iv = atomic64_inc_return(&skc->sc_iv);
+ skh->skh_iv = be64_to_cpu(tmp_iv);
+ if (tmp_iv == 0 || tmp_iv == SK_IV_REV_START) {
+ CERROR("Counter looped, connection must be reset to avoid "
+ "plaintext information\n");
+ return GSS_S_FAILURE;
+ }
+
+ return GSS_S_COMPLETE;
+}
+
+static int sk_verify_header(struct sk_hdr *skh)
+{
+ if (cpu_to_be64(skh->skh_version) != SK_MSG_VERSION)
+ return GSS_S_DEFECTIVE_TOKEN;
+
+ return GSS_S_COMPLETE;
+}
+
+void sk_construct_rfc3686_iv(__u8 *iv, __u32 nonce, __u64 partial_iv)
+{
+ __u32 ctr = cpu_to_be32(1);
+
+ memcpy(iv, &nonce, CTR_RFC3686_NONCE_SIZE);
+ iv += CTR_RFC3686_NONCE_SIZE;
+ memcpy(iv, &partial_iv, CTR_RFC3686_IV_SIZE);
+ iv += CTR_RFC3686_IV_SIZE;
+ memcpy(iv, &ctr, sizeof(ctr));
+}
+
+static int sk_fill_context(rawobj_t *inbuf, struct sk_ctx *skc)
+{
+ char *ptr = inbuf->data;
+ char *end = inbuf->data + inbuf->len;
+ char sk_hmac[CRYPTO_MAX_ALG_NAME];
+ char sk_crypt[CRYPTO_MAX_ALG_NAME];
+ u32 tmp;
+
+ /* see sk_serialize_kctx() for format from userspace side */
+ /* 1. Version */
+ if (gss_get_bytes(&ptr, end, &tmp, sizeof(tmp))) {
+ CERROR("Failed to read shared key interface version\n");
+ return -1;
+ }
+ if (tmp != SK_INTERFACE_VERSION) {
+ CERROR("Invalid shared key interface version: %d\n", tmp);
+ return -1;
+ }
+
+ /* 2. HMAC type */
+ if (gss_get_bytes(&ptr, end, &sk_hmac, sizeof(sk_hmac))) {
+ CERROR("Failed to read HMAC algorithm type\n");
+ return -1;
+ }
+
+ skc->sc_hmac = cfs_crypto_hash_alg(sk_hmac);
+ if (skc->sc_hmac != CFS_HASH_ALG_NULL &&
+ skc->sc_hmac != CFS_HASH_ALG_SHA256 &&
+ skc->sc_hmac != CFS_HASH_ALG_SHA512) {
+ CERROR("Invalid hmac type: %s\n", sk_hmac);
+ return -1;
+ }
+
+ /* 3. crypt type */
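The counter-mode IV built by sk_construct_rfc3686_iv() above can be pictured in isolation: the 16-byte CTR counter block is nonce (4) | per-message IV (8) | block counter (4), with the counter starting at 1 as RFC 3686 requires. A userspace sketch, with kernel byte-order conversions elided; regular contexts draw msg_iv from 0 upward and reverse contexts from 1ULL << 63 upward, so the two directions can never collide:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define NONCE_SIZE 4	/* CTR_RFC3686_NONCE_SIZE */
#define IV_SIZE    8	/* CTR_RFC3686_IV_SIZE */

static void build_ctr_iv(uint8_t iv[16], uint32_t nonce, uint64_t msg_iv)
{
	uint32_t ctr = 1;	/* the kernel uses cpu_to_be32(1) */

	memcpy(iv, &nonce, NONCE_SIZE);
	memcpy(iv + NONCE_SIZE, &msg_iv, IV_SIZE);
	memcpy(iv + NONCE_SIZE + IV_SIZE, &ctr, sizeof(ctr));
}

int main(void)
{
	uint8_t iv[16];

	build_ctr_iv(iv, 0x11223344u, (1ULL << 63) + 1);	/* reverse ctx */
	printf("iv[0]=0x%02x\n", iv[0]);
	return 0;
}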
+ if (gss_get_bytes(&ptr, end, &sk_crypt, sizeof(sk_crypt))) {
+ CERROR("Failed to read crypt algorithm type\n");
+ return -1;
+ }
+
+ skc->sc_crypt = cfs_crypto_crypt_alg(sk_crypt);
+ if (skc->sc_crypt == CFS_CRYPT_ALG_UNKNOWN) {
+ CERROR("Invalid crypt type: %s\n", sk_crypt);
+ return -1;
+ }
+
+ /* 4. expiration time */
+ if (gss_get_bytes(&ptr, end, &tmp, sizeof(tmp))) {
+ CERROR("Failed to read context expiration time\n");
+ return -1;
+ }
+ skc->sc_expire = tmp + ktime_get_real_seconds();
+
+ /* 5. host random is used as nonce for encryption */
+ if (gss_get_bytes(&ptr, end, &skc->sc_host_random,
+ sizeof(skc->sc_host_random))) {
+ CERROR("Failed to read host random\n");
+ return -1;
+ }
+
+ /* 6. peer random is used as nonce for decryption */
+ if (gss_get_bytes(&ptr, end, &skc->sc_peer_random,
+ sizeof(skc->sc_peer_random))) {
+ CERROR("Failed to read peer random\n");
+ return -1;
+ }
+
+ /* 7. HMAC key */
+ if (gss_get_rawobj(&ptr, end, &skc->sc_hmac_key)) {
+ CERROR("Failed to read HMAC key\n");
+ return -1;
+ }
+ if (skc->sc_hmac_key.len <= SK_MIN_SIZE) {
+ CERROR("HMAC key must be larger than %d bytes\n",
+ SK_MIN_SIZE);
+ return -1;
+ }
+
+ /* 8. Session key, can be empty if not using privacy mode */
+ if (gss_get_rawobj(&ptr, end, &skc->sc_session_kb.kb_key)) {
+ CERROR("Failed to read session key\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+static void sk_delete_context(struct sk_ctx *skc)
+{
+ if (!skc)
+ return;
+
+ rawobj_free(&skc->sc_hmac_key);
+ gss_keyblock_free(&skc->sc_session_kb);
+ OBD_FREE_PTR(skc);
+}
+
+static
+__u32 gss_import_sec_context_sk(rawobj_t *inbuf, struct gss_ctx *gss_context)
+{
+ struct sk_ctx *skc;
+ bool privacy = false;
+
+ if (inbuf == NULL || inbuf->data == NULL)
+ return GSS_S_FAILURE;
+
+ OBD_ALLOC_PTR(skc);
+ if (!skc)
+ return GSS_S_FAILURE;
+
+ atomic64_set(&skc->sc_iv, 0);
+
+ if (sk_fill_context(inbuf, skc))
+ goto out_err;
+
+ /* Only privacy mode needs to initialize keys */
+ if (skc->sc_session_kb.kb_key.len > 0) {
+ privacy = true;
+ if (gss_keyblock_init(&skc->sc_session_kb,
+ cfs_crypto_crypt_name(skc->sc_crypt), 0))
+ goto out_err;
+ }
+
+ gss_context->internal_ctx_id = skc;
+ CDEBUG(D_SEC, "successfully imported sk%s context\n",
+ privacy ?
" (with privacy)" : ""); + + return GSS_S_COMPLETE; + +out_err: + sk_delete_context(skc); + return GSS_S_FAILURE; +} + +static +__u32 gss_copy_reverse_context_sk(struct gss_ctx *gss_context_old, + struct gss_ctx *gss_context_new) +{ + struct sk_ctx *skc_old = gss_context_old->internal_ctx_id; + struct sk_ctx *skc_new; + + OBD_ALLOC_PTR(skc_new); + if (!skc_new) + return GSS_S_FAILURE; + + skc_new->sc_hmac = skc_old->sc_hmac; + skc_new->sc_crypt = skc_old->sc_crypt; + skc_new->sc_expire = skc_old->sc_expire; + skc_new->sc_host_random = skc_old->sc_host_random; + skc_new->sc_peer_random = skc_old->sc_peer_random; + + atomic64_set(&skc_new->sc_iv, SK_IV_REV_START); + + if (rawobj_dup(&skc_new->sc_hmac_key, &skc_old->sc_hmac_key)) + goto out_err; + if (gss_keyblock_dup(&skc_new->sc_session_kb, &skc_old->sc_session_kb)) + goto out_err; + + /* Only privacy mode needs to initialize keys */ + if (skc_new->sc_session_kb.kb_key.len > 0) + if (gss_keyblock_init(&skc_new->sc_session_kb, + cfs_crypto_crypt_name(skc_new->sc_crypt), + 0)) + goto out_err; + + gss_context_new->internal_ctx_id = skc_new; + CDEBUG(D_SEC, "successfully copied reverse sk context\n"); + + return GSS_S_COMPLETE; + +out_err: + sk_delete_context(skc_new); + return GSS_S_FAILURE; +} + +static +__u32 gss_inquire_context_sk(struct gss_ctx *gss_context, + time64_t *endtime) +{ + struct sk_ctx *skc = gss_context->internal_ctx_id; + + *endtime = skc->sc_expire; + return GSS_S_COMPLETE; +} + +static +u32 sk_make_hmac(enum cfs_crypto_hash_alg algo, rawobj_t *key, int msg_count, + rawobj_t *msgs, int iov_count, struct bio_vec *iovs, + rawobj_t *token, digest_hash hash_func) +{ + struct ahash_request *req; + int rc2, rc; + + req = cfs_crypto_hash_init(algo, key->data, key->len); + if (IS_ERR(req)) { + rc = PTR_ERR(req); + goto out_init_failed; + } + + + if (hash_func) + rc2 = hash_func(req, NULL, msg_count, msgs, iov_count, + iovs); + else + rc2 = gss_digest_hash(req, NULL, msg_count, msgs, iov_count, + iovs); + + rc = cfs_crypto_hash_final(req, token->data, &token->len); + if (!rc && rc2) + rc = rc2; +out_init_failed: + return rc ? 
GSS_S_FAILURE : GSS_S_COMPLETE; +} + +static +__u32 gss_get_mic_sk(struct gss_ctx *gss_context, + int message_count, + rawobj_t *messages, + int iov_count, + struct bio_vec *iovs, + rawobj_t *token) +{ + struct sk_ctx *skc = gss_context->internal_ctx_id; + + return sk_make_hmac(skc->sc_hmac, + &skc->sc_hmac_key, message_count, messages, + iov_count, iovs, token, gss_context->hash_func); +} + +static +u32 sk_verify_hmac(enum cfs_crypto_hash_alg algo, rawobj_t *key, + int message_count, rawobj_t *messages, + int iov_count, struct bio_vec *iovs, + rawobj_t *token, digest_hash hash_func) +{ + rawobj_t checksum = RAWOBJ_EMPTY; + __u32 rc = GSS_S_FAILURE; + + checksum.len = cfs_crypto_hash_digestsize(algo); + if (token->len < checksum.len) { + CDEBUG(D_SEC, "Token received too short, expected %d " + "received %d\n", token->len, checksum.len); + return GSS_S_DEFECTIVE_TOKEN; + } + + OBD_ALLOC_LARGE(checksum.data, checksum.len); + if (!checksum.data) + return rc; + + if (sk_make_hmac(algo, key, message_count, + messages, iov_count, iovs, &checksum, + hash_func)) { + CDEBUG(D_SEC, "Failed to create checksum to validate\n"); + goto cleanup; + } + + if (memcmp(token->data, checksum.data, checksum.len)) { + CERROR("checksum mismatch\n"); + rc = GSS_S_BAD_SIG; + goto cleanup; + } + + rc = GSS_S_COMPLETE; + +cleanup: + OBD_FREE(checksum.data, checksum.len); + return rc; +} + +/* sk_verify_bulk_hmac() differs slightly from sk_verify_hmac() because all + * encrypted pages in the bulk descriptor are populated although we only need + * to decrypt up to the number of bytes actually specified from the sender + * (bd_nob) otherwise the calulated HMAC will be incorrect. */ +static +u32 sk_verify_bulk_hmac(enum cfs_crypto_hash_alg sc_hmac, rawobj_t *key, + int msgcnt, rawobj_t *msgs, int iovcnt, + struct bio_vec *iovs, int iov_bytes, rawobj_t *token) +{ + rawobj_t checksum = RAWOBJ_EMPTY; + struct ahash_request *req; + struct scatterlist sg[1]; + int rc = 0; + struct sg_table sgt; + int bytes; + int i; + + checksum.len = cfs_crypto_hash_digestsize(sc_hmac); + if (token->len < checksum.len) { + CDEBUG(D_SEC, "Token received too short, expected %d " + "received %d\n", token->len, checksum.len); + return GSS_S_DEFECTIVE_TOKEN; + } + + OBD_ALLOC_LARGE(checksum.data, checksum.len); + if (!checksum.data) + return GSS_S_FAILURE; + + req = cfs_crypto_hash_init(sc_hmac, key->data, key->len); + if (IS_ERR(req)) { + rc = GSS_S_FAILURE; + goto cleanup; + } + + for (i = 0; i < msgcnt; i++) { + if (!msgs[i].len) + continue; + + rc = gss_setup_sgtable(&sgt, sg, msgs[i].data, msgs[i].len); + if (rc != 0) + goto hash_cleanup; + + ahash_request_set_crypt(req, sg, NULL, msgs[i].len); + rc = crypto_ahash_update(req); + if (rc) { + gss_teardown_sgtable(&sgt); + goto hash_cleanup; + } + + gss_teardown_sgtable(&sgt); + } + + for (i = 0; i < iovcnt && iov_bytes > 0; i++) { + if (iovs[i].bv_len == 0) + continue; + + bytes = min_t(int, iov_bytes, iovs[i].bv_len); + iov_bytes -= bytes; + + sg_init_table(sg, 1); + sg_set_page(&sg[0], iovs[i].bv_page, bytes, + iovs[i].bv_offset); + ahash_request_set_crypt(req, sg, NULL, bytes); + rc = crypto_ahash_update(req); + if (rc) + goto hash_cleanup; + } + +hash_cleanup: + cfs_crypto_hash_final(req, checksum.data, &checksum.len); + if (rc) + goto cleanup; + + if (memcmp(token->data, checksum.data, checksum.len)) + rc = GSS_S_BAD_SIG; + else + rc = GSS_S_COMPLETE; + +cleanup: + OBD_FREE_LARGE(checksum.data, checksum.len); + + return rc; +} + +static +__u32 gss_verify_mic_sk(struct gss_ctx 
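sk_verify_hmac() above follows the classic verify-by-recompute pattern: the receiver recomputes the MAC over the same buffers and compares it with the transmitted token. A self-contained sketch of that pattern; toy_mac() is only a stand-in for sk_make_hmac() so the example runs, it is not a real MAC:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void toy_mac(const uint8_t *msg, size_t len, uint8_t out[4])
{
	uint32_t sum = 0;

	while (len--)
		sum = sum * 31 + *msg++;	/* toy checksum only */
	memcpy(out, &sum, 4);
}

static int verify(const uint8_t *msg, size_t len, const uint8_t token[4])
{
	uint8_t local[4];

	toy_mac(msg, len, local);
	/* the kernel code uses memcmp(); a constant-time comparison
	 * would be the conservative choice in new code */
	return memcmp(local, token, 4) ? -1 : 0;
}

int main(void)
{
	uint8_t tok[4];

	toy_mac((const uint8_t *)"msg", 3, tok);
	printf("verify: %d\n", verify((const uint8_t *)"msg", 3, tok));
	return 0;
}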
*gss_context, + int message_count, + rawobj_t *messages, + int iov_count, + struct bio_vec *iovs, + rawobj_t *token) +{ + struct sk_ctx *skc = gss_context->internal_ctx_id; + + return sk_verify_hmac(skc->sc_hmac, &skc->sc_hmac_key, + message_count, messages, iov_count, iovs, token, + gss_context->hash_func); +} + +static +__u32 gss_wrap_sk(struct gss_ctx *gss_context, rawobj_t *gss_header, + rawobj_t *message, int message_buffer_length, + rawobj_t *token) +{ + struct sk_ctx *skc = gss_context->internal_ctx_id; + size_t sht_bytes = cfs_crypto_hash_digestsize(skc->sc_hmac); + struct sk_wire skw; + struct sk_hdr skh; + rawobj_t msgbufs[3]; + __u8 local_iv[SK_IV_SIZE]; + unsigned int blocksize; + + LASSERT(skc->sc_session_kb.kb_tfm); + + blocksize = crypto_sync_skcipher_blocksize(skc->sc_session_kb.kb_tfm); + if (gss_add_padding(message, message_buffer_length, blocksize)) + return GSS_S_FAILURE; + + memset(token->data, 0, token->len); + + if (sk_fill_header(skc, &skh) != GSS_S_COMPLETE) + return GSS_S_FAILURE; + + skw.skw_header.data = token->data; + skw.skw_header.len = sizeof(skh); + memcpy(skw.skw_header.data, &skh, sizeof(skh)); + + sk_construct_rfc3686_iv(local_iv, skc->sc_host_random, skh.skh_iv); + skw.skw_cipher.data = skw.skw_header.data + skw.skw_header.len; + skw.skw_cipher.len = token->len - skw.skw_header.len - sht_bytes; + if (gss_crypt_rawobjs(skc->sc_session_kb.kb_tfm, local_iv, 1, message, + &skw.skw_cipher, 1)) + return GSS_S_FAILURE; + + /* HMAC covers the SK header, GSS header, and ciphertext */ + msgbufs[0] = skw.skw_header; + msgbufs[1] = *gss_header; + msgbufs[2] = skw.skw_cipher; + + skw.skw_hmac.data = skw.skw_cipher.data + skw.skw_cipher.len; + skw.skw_hmac.len = sht_bytes; + if (sk_make_hmac(skc->sc_hmac, &skc->sc_hmac_key, + 3, msgbufs, 0, NULL, &skw.skw_hmac, + gss_context->hash_func)) + return GSS_S_FAILURE; + + token->len = skw.skw_header.len + skw.skw_cipher.len + skw.skw_hmac.len; + + return GSS_S_COMPLETE; +} + +static +__u32 gss_unwrap_sk(struct gss_ctx *gss_context, rawobj_t *gss_header, + rawobj_t *token, rawobj_t *message) +{ + struct sk_ctx *skc = gss_context->internal_ctx_id; + size_t sht_bytes = cfs_crypto_hash_digestsize(skc->sc_hmac); + struct sk_wire skw; + struct sk_hdr *skh; + rawobj_t msgbufs[3]; + __u8 local_iv[SK_IV_SIZE]; + unsigned int blocksize; + int rc; + + LASSERT(skc->sc_session_kb.kb_tfm); + + if (token->len < sizeof(skh) + sht_bytes) + return GSS_S_DEFECTIVE_TOKEN; + + skw.skw_header.data = token->data; + skw.skw_header.len = sizeof(struct sk_hdr); + skw.skw_cipher.data = skw.skw_header.data + skw.skw_header.len; + skw.skw_cipher.len = token->len - skw.skw_header.len - sht_bytes; + skw.skw_hmac.data = skw.skw_cipher.data + skw.skw_cipher.len; + skw.skw_hmac.len = sht_bytes; + + blocksize = crypto_sync_skcipher_blocksize(skc->sc_session_kb.kb_tfm); + if (skw.skw_cipher.len % blocksize != 0) + return GSS_S_DEFECTIVE_TOKEN; + + skh = (struct sk_hdr *)skw.skw_header.data; + rc = sk_verify_header(skh); + if (rc != GSS_S_COMPLETE) + return rc; + + /* HMAC covers the SK header, GSS header, and ciphertext */ + msgbufs[0] = skw.skw_header; + msgbufs[1] = *gss_header; + msgbufs[2] = skw.skw_cipher; + rc = sk_verify_hmac(skc->sc_hmac, &skc->sc_hmac_key, 3, msgbufs, + 0, NULL, &skw.skw_hmac, gss_context->hash_func); + if (rc) + return rc; + + sk_construct_rfc3686_iv(local_iv, skc->sc_peer_random, skh->skh_iv); + message->len = skw.skw_cipher.len; + if (gss_crypt_rawobjs(skc->sc_session_kb.kb_tfm, local_iv, + 1, &skw.skw_cipher, message, 0)) + 
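The token sizing implied by gss_wrap_sk() above: the wire token is sk_hdr | ciphertext | HMAC, where the ciphertext is the message rounded up to the cipher block size. The constants below are illustrative only; the 16-byte header is the two __u64 fields of struct sk_hdr, while the digest length (32, as for HMAC-SHA256) and block size (16) depend on the negotiated flavor:

#include <stdio.h>

#define SK_HDR_LEN	16	/* sizeof(struct sk_hdr) */
#define DIGEST_LEN	32	/* assumed HMAC-SHA256 */
#define BLOCK		16	/* illustrative cipher block size */

static unsigned int sk_token_len(unsigned int msglen)
{
	unsigned int padded = (msglen + BLOCK - 1) & ~(BLOCK - 1);

	return SK_HDR_LEN + padded + DIGEST_LEN;
}

int main(void)
{
	printf("100-byte message -> %u-byte token\n", sk_token_len(100));
	return 0;
}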
return GSS_S_FAILURE; + + return GSS_S_COMPLETE; +} + +static +__u32 gss_prep_bulk_sk(struct gss_ctx *gss_context, + struct ptlrpc_bulk_desc *desc) +{ + struct sk_ctx *skc = gss_context->internal_ctx_id; + int blocksize; + int i; + + LASSERT(skc->sc_session_kb.kb_tfm); + blocksize = crypto_sync_skcipher_blocksize(skc->sc_session_kb.kb_tfm); + + for (i = 0; i < desc->bd_iov_count; i++) { + if (desc->bd_vec[i].bv_offset & blocksize) { + CERROR("offset %d not blocksize aligned\n", + desc->bd_vec[i].bv_offset); + return GSS_S_FAILURE; + } + + desc->bd_enc_vec[i].bv_offset = + desc->bd_vec[i].bv_offset; + desc->bd_enc_vec[i].bv_len = + sk_block_mask(desc->bd_vec[i].bv_len, blocksize); + } + + return GSS_S_COMPLETE; +} + +static __u32 sk_encrypt_bulk(struct crypto_sync_skcipher *tfm, __u8 *iv, + struct ptlrpc_bulk_desc *desc, rawobj_t *cipher, + int adj_nob) +{ + struct scatterlist ptxt; + struct scatterlist ctxt; + int blocksize; + int i; + int rc; + int nob = 0; + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); + + blocksize = crypto_sync_skcipher_blocksize(tfm); + + sg_init_table(&ptxt, 1); + sg_init_table(&ctxt, 1); + + skcipher_request_set_sync_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + + for (i = 0; i < desc->bd_iov_count; i++) { + sg_set_page(&ptxt, desc->bd_vec[i].bv_page, + sk_block_mask(desc->bd_vec[i].bv_len, + blocksize), + desc->bd_vec[i].bv_offset); + nob += ptxt.length; + + sg_set_page(&ctxt, desc->bd_enc_vec[i].bv_page, + ptxt.length, ptxt.offset); + + desc->bd_enc_vec[i].bv_offset = ctxt.offset; + desc->bd_enc_vec[i].bv_len = ctxt.length; + + skcipher_request_set_crypt(req, &ptxt, &ctxt, ptxt.length, iv); + rc = crypto_skcipher_encrypt_iv(req, &ctxt, &ptxt, ptxt.length); + if (rc) { + CERROR("failed to encrypt page: %d\n", rc); + skcipher_request_zero(req); + return rc; + } + } + skcipher_request_zero(req); + + if (adj_nob) + desc->bd_nob = nob; + + return 0; +} + +static __u32 sk_decrypt_bulk(struct crypto_sync_skcipher *tfm, __u8 *iv, + struct ptlrpc_bulk_desc *desc, rawobj_t *cipher, + int adj_nob) +{ + struct scatterlist ptxt; + struct scatterlist ctxt; + int blocksize; + int i; + int rc; + int pnob = 0; + int cnob = 0; + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); + + sg_init_table(&ptxt, 1); + sg_init_table(&ctxt, 1); + + blocksize = crypto_sync_skcipher_blocksize(tfm); + if (desc->bd_nob_transferred % blocksize != 0) { + CERROR("Transfer not a multiple of block size: %d\n", + desc->bd_nob_transferred); + return GSS_S_DEFECTIVE_TOKEN; + } + + skcipher_request_set_sync_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + + for (i = 0; i < desc->bd_iov_count && cnob < desc->bd_nob_transferred; + i++) { + struct bio_vec *piov = &desc->bd_vec[i]; + struct bio_vec *ciov = &desc->bd_enc_vec[i]; + + if (ciov->bv_offset % blocksize != 0 || + ciov->bv_len % blocksize != 0) { + CERROR("Invalid bulk descriptor vector\n"); + skcipher_request_zero(req); + return GSS_S_DEFECTIVE_TOKEN; + } + + /* Must adjust bytes here because we know the actual sizes after + * decryption. 
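The bulk path above leans on two small power-of-two tricks: offset & (blocksize - 1) is the usual "is this block aligned?" test for page fragments, and sk_block_mask() rounds a fragment length up to a whole number of cipher blocks. A standalone sketch, assuming a power-of-two block size:

#include <stdio.h>

static int block_aligned(unsigned long offset, unsigned long blocksize)
{
	return (offset & (blocksize - 1)) == 0;	/* low bits clear */
}

static unsigned long block_mask(unsigned long len, unsigned long blocksize)
{
	return (len + blocksize - 1) & ~(blocksize - 1);
}

int main(void)
{
	printf("aligned(4096,16)=%d, 100 bytes -> %lu\n",
	       block_aligned(4096, 16), block_mask(100, 16));
	return 0;
}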
Similar to what gss_cli_ctx_unwrap_bulk does for + * integrity only mode */ + if (adj_nob) { + /* cipher text must not exceed transferred size */ + if (ciov->bv_len + cnob > desc->bd_nob_transferred) + ciov->bv_len = + desc->bd_nob_transferred - cnob; + + piov->bv_len = ciov->bv_len; + + /* plain text must not exceed bulk's size */ + if (ciov->bv_len + pnob > desc->bd_nob) + piov->bv_len = desc->bd_nob - pnob; + } else { + /* Taken from krb5_decrypt since it was not verified + * whether or not LNET guarantees these */ + if (ciov->bv_len + cnob > desc->bd_nob_transferred || + piov->bv_len > ciov->bv_len) { + CERROR("Invalid decrypted length\n"); + skcipher_request_zero(req); + return GSS_S_FAILURE; + } + } + + if (ciov->bv_len == 0) + continue; + + sg_init_table(&ctxt, 1); + sg_set_page(&ctxt, ciov->bv_page, ciov->bv_len, + ciov->bv_offset); + ptxt = ctxt; + + /* In the event the plain text size is not a multiple + * of blocksize we decrypt in place and copy the result + * after the decryption */ + if (piov->bv_len % blocksize == 0) + sg_assign_page(&ptxt, piov->bv_page); + + skcipher_request_set_crypt(req, &ctxt, &ptxt, ptxt.length, iv); + rc = crypto_skcipher_decrypt_iv(req, &ptxt, &ctxt, ptxt.length); + if (rc) { + CERROR("Decryption failed for page: %d\n", rc); + skcipher_request_zero(req); + return GSS_S_FAILURE; + } + + if (piov->bv_len % blocksize != 0) { + memcpy(page_address(piov->bv_page) + + piov->bv_offset, + page_address(ciov->bv_page) + + ciov->bv_offset, + piov->bv_len); + } + + cnob += ciov->bv_len; + pnob += piov->bv_len; + } + skcipher_request_zero(req); + + /* if needed, clear up the rest unused iovs */ + if (adj_nob) + while (i < desc->bd_iov_count) + desc->bd_vec[i++].bv_len = 0; + + if (unlikely(cnob != desc->bd_nob_transferred)) { + CERROR("%d cipher text transferred but only %d decrypted\n", + desc->bd_nob_transferred, cnob); + return GSS_S_FAILURE; + } + + if (unlikely(!adj_nob && pnob != desc->bd_nob)) { + CERROR("%d plain text expected but only %d received\n", + desc->bd_nob, pnob); + return GSS_S_FAILURE; + } + + return 0; +} + +static +__u32 gss_wrap_bulk_sk(struct gss_ctx *gss_context, + struct ptlrpc_bulk_desc *desc, rawobj_t *token, + int adj_nob) +{ + struct sk_ctx *skc = gss_context->internal_ctx_id; + size_t sht_bytes = cfs_crypto_hash_digestsize(skc->sc_hmac); + struct sk_wire skw; + struct sk_hdr skh; + __u8 local_iv[SK_IV_SIZE]; + + LASSERT(skc->sc_session_kb.kb_tfm); + + memset(token->data, 0, token->len); + if (sk_fill_header(skc, &skh) != GSS_S_COMPLETE) + return GSS_S_FAILURE; + + skw.skw_header.data = token->data; + skw.skw_header.len = sizeof(skh); + memcpy(skw.skw_header.data, &skh, sizeof(skh)); + + sk_construct_rfc3686_iv(local_iv, skc->sc_host_random, skh.skh_iv); + skw.skw_cipher.data = skw.skw_header.data + skw.skw_header.len; + skw.skw_cipher.len = token->len - skw.skw_header.len - sht_bytes; + if (sk_encrypt_bulk(skc->sc_session_kb.kb_tfm, local_iv, + desc, &skw.skw_cipher, adj_nob)) + return GSS_S_FAILURE; + + skw.skw_hmac.data = skw.skw_cipher.data + skw.skw_cipher.len; + skw.skw_hmac.len = sht_bytes; + if (sk_make_hmac(skc->sc_hmac, &skc->sc_hmac_key, 1, &skw.skw_cipher, + desc->bd_iov_count, desc->bd_enc_vec, &skw.skw_hmac, + gss_context->hash_func)) + return GSS_S_FAILURE; + + return GSS_S_COMPLETE; +} + +static +__u32 gss_unwrap_bulk_sk(struct gss_ctx *gss_context, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, int adj_nob) +{ + struct sk_ctx *skc = gss_context->internal_ctx_id; + size_t sht_bytes = 
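The adj_nob branch above reduces to two clamps per fragment: the cipher length is trimmed so the running total never exceeds the bytes actually transferred, and the plaintext length so it never exceeds what the bulk descriptor expects. The arithmetic in isolation, as a sketch:

#include <stdio.h>

static void clamp_fragment(unsigned int *clen, unsigned int *plen,
			   unsigned int cnob, unsigned int transferred,
			   unsigned int pnob, unsigned int expected)
{
	if (*clen + cnob > transferred)
		*clen = transferred - cnob;	/* trim to what arrived */
	*plen = *clen;
	if (*plen + pnob > expected)
		*plen = expected - pnob;	/* drop cipher block padding */
}

int main(void)
{
	unsigned int clen = 4096, plen = 0;

	clamp_fragment(&clen, &plen, 0, 4096, 0, 4000);
	printf("cipher %u, plain %u\n", clen, plen);	/* 4096, 4000 */
	return 0;
}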
cfs_crypto_hash_digestsize(skc->sc_hmac); + struct sk_wire skw; + struct sk_hdr *skh; + __u8 local_iv[SK_IV_SIZE]; + int rc; + + LASSERT(skc->sc_session_kb.kb_tfm); + + if (token->len < sizeof(skh) + sht_bytes) + return GSS_S_DEFECTIVE_TOKEN; + + skw.skw_header.data = token->data; + skw.skw_header.len = sizeof(struct sk_hdr); + skw.skw_cipher.data = skw.skw_header.data + skw.skw_header.len; + skw.skw_cipher.len = token->len - skw.skw_header.len - sht_bytes; + skw.skw_hmac.data = skw.skw_cipher.data + skw.skw_cipher.len; + skw.skw_hmac.len = sht_bytes; + + skh = (struct sk_hdr *)skw.skw_header.data; + rc = sk_verify_header(skh); + if (rc != GSS_S_COMPLETE) + return rc; + + rc = sk_verify_bulk_hmac(skc->sc_hmac, &skc->sc_hmac_key, 1, + &skw.skw_cipher, desc->bd_iov_count, + desc->bd_enc_vec, desc->bd_nob, + &skw.skw_hmac); + if (rc) + return rc; + + sk_construct_rfc3686_iv(local_iv, skc->sc_peer_random, skh->skh_iv); + rc = sk_decrypt_bulk(skc->sc_session_kb.kb_tfm, local_iv, + desc, &skw.skw_cipher, adj_nob); + if (rc) + return rc; + + return GSS_S_COMPLETE; +} + +static +void gss_delete_sec_context_sk(void *internal_context) +{ + struct sk_ctx *sk_context = internal_context; + sk_delete_context(sk_context); +} + +int gss_display_sk(struct gss_ctx *gss_context, char *buf, int bufsize) +{ + return scnprintf(buf, bufsize, "sk"); +} + +static struct gss_api_ops gss_sk_ops = { + .gss_import_sec_context = gss_import_sec_context_sk, + .gss_copy_reverse_context = gss_copy_reverse_context_sk, + .gss_inquire_context = gss_inquire_context_sk, + .gss_get_mic = gss_get_mic_sk, + .gss_verify_mic = gss_verify_mic_sk, + .gss_wrap = gss_wrap_sk, + .gss_unwrap = gss_unwrap_sk, + .gss_prep_bulk = gss_prep_bulk_sk, + .gss_wrap_bulk = gss_wrap_bulk_sk, + .gss_unwrap_bulk = gss_unwrap_bulk_sk, + .gss_delete_sec_context = gss_delete_sec_context_sk, + .gss_display = gss_display_sk, +}; + +static struct subflavor_desc gss_sk_sfs[] = { + { + .sf_subflavor = SPTLRPC_SUBFLVR_SKN, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_NULL, + .sf_name = "skn" + }, + { + .sf_subflavor = SPTLRPC_SUBFLVR_SKA, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_AUTH, + .sf_name = "ska" + }, + { + .sf_subflavor = SPTLRPC_SUBFLVR_SKI, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_INTG, + .sf_name = "ski" + }, + { + .sf_subflavor = SPTLRPC_SUBFLVR_SKPI, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_PRIV, + .sf_name = "skpi" + }, +}; + +static struct gss_api_mech gss_sk_mech = { + /* .gm_owner uses default NULL value for THIS_MODULE */ + .gm_name = "sk", + .gm_oid = (rawobj_t) { + .len = 12, + .data = "\053\006\001\004\001\311\146\215\126\001\000\001", + }, + .gm_ops = &gss_sk_ops, + .gm_sf_num = 4, + .gm_sfs = gss_sk_sfs, +}; + +int __init init_sk_module(void) +{ + int status; + + status = lgss_mech_register(&gss_sk_mech); + if (status) + CERROR("Failed to register sk gss mechanism!\n"); + + return status; +} + +void cleanup_sk_module(void) +{ + lgss_mech_unregister(&gss_sk_mech); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_svc_upcall.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_svc_upcall.c new file mode 100644 index 0000000000000..f5c83a25a13e1 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_svc_upcall.c @@ -0,0 +1,1190 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, 2014, Intel Corporation. + * + * Author: Eric Mei + */ + +/* + * Neil Brown + * J. 
Bruce Fields
+ * Andy Adamson
+ * Dug Song
+ *
+ * RPCSEC_GSS server authentication.
+ * This implements RPCSEC_GSS as defined in rfc2203 (rpcsec_gss) and rfc2078
+ * (gssapi)
+ *
+ * RPCSEC_GSS involves three stages:
+ * 1/ context creation
+ * 2/ data exchange
+ * 3/ context destruction
+ *
+ * Context creation is handled largely by upcalls to user-space.
+ * In particular, GSS_Accept_sec_context is handled by an upcall
+ * Data exchange is handled entirely within the kernel
+ * In particular, GSS_GetMIC, GSS_VerifyMIC, GSS_Seal, GSS_Unseal are in-kernel.
+ * Context destruction is handled in-kernel
+ * GSS_Delete_sec_context is in-kernel
+ *
+ * Context creation is initiated by a RPCSEC_GSS_INIT request arriving.
+ * The context handle and gss_token are used as a key into the rpcsec_init cache.
+ * The content of this cache includes some of the outputs of GSS_Accept_sec_context,
+ * being major_status, minor_status, context_handle, reply_token.
+ * These are sent back to the client.
+ * Sequence window management is handled by the kernel. The window size is currently
+ * a compile time constant.
+ *
+ * When user-space is happy that a context is established, it places an entry
+ * in the rpcsec_context cache. The key for this cache is the context_handle.
+ * The content includes:
+ * uid/gidlist - for determining access rights
+ * mechanism type
+ * mechanism specific information, such as a key
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+#include "gss_crypto.h"
+
+#define GSS_SVC_UPCALL_TIMEOUT (20)
+
+static DEFINE_SPINLOCK(__ctx_index_lock);
+static __u64 __ctx_index;
+
+unsigned int krb5_allow_old_client_csum;
+
+__u64 gss_get_next_ctx_index(void)
+{
+ __u64 idx;
+
+ spin_lock(&__ctx_index_lock);
+ idx = __ctx_index++;
+ spin_unlock(&__ctx_index_lock);
+
+ return idx;
+}
+
+static inline unsigned long hash_mem(char *buf, int length, int bits)
+{
+ unsigned long hash = 0;
+ unsigned long l = 0;
+ int len = 0;
+ unsigned char c;
+
+ do {
+ if (len == length) {
+ c = (char) len;
+ len = -1;
+ } else
+ c = *buf++;
+
+ l = (l << 8) | c;
+ len++;
+
+ if ((len & (BITS_PER_LONG/8-1)) == 0)
+ hash = hash_long(hash^l, BITS_PER_LONG);
+ } while (len);
+
+ return hash >> (BITS_PER_LONG - bits);
+}
+
+/****************************************
+ * rpc sec init (rsi) cache *
+ ****************************************/
+
+#define RSI_HASHBITS (6)
+#define RSI_HASHMAX (1 << RSI_HASHBITS)
+#define RSI_HASHMASK (RSI_HASHMAX - 1)
+
+struct rsi {
+ struct cache_head h;
+ __u32 lustre_svc;
+ __u64 nid;
+ char nm_name[LUSTRE_NODEMAP_NAME_LENGTH + 1];
+ wait_queue_head_t waitq;
+ rawobj_t in_handle, in_token;
+ rawobj_t out_handle, out_token;
+ int major_status, minor_status;
+#ifdef HAVE_CACHE_HASH_SPINLOCK
+ struct rcu_head rcu_head;
+#endif
+};
+
+#ifdef HAVE_CACHE_HEAD_HLIST
+static struct hlist_head rsi_table[RSI_HASHMAX];
+#else
+static struct cache_head *rsi_table[RSI_HASHMAX];
+#endif
+static struct cache_detail rsi_cache;
+static struct rsi *rsi_update(struct rsi *new, struct rsi *old);
+static struct rsi *rsi_lookup(struct rsi *item);
+
+#ifdef HAVE_CACHE_DETAIL_WRITERS
+static inline int channel_users(struct cache_detail *cd)
+{
+ return atomic_read(&cd->writers);
+}
+#else
+static inline int channel_users(struct cache_detail *cd)
+{
+ return
atomic_read(&cd->readers); +} +#endif + +static inline int rsi_hash(struct rsi *item) +{ + return hash_mem((char *)item->in_handle.data, item->in_handle.len, + RSI_HASHBITS) ^ + hash_mem((char *)item->in_token.data, item->in_token.len, + RSI_HASHBITS); +} + +static inline int __rsi_match(struct rsi *item, struct rsi *tmp) +{ + return (rawobj_equal(&item->in_handle, &tmp->in_handle) && + rawobj_equal(&item->in_token, &tmp->in_token)); +} + +static void rsi_free(struct rsi *rsi) +{ + rawobj_free(&rsi->in_handle); + rawobj_free(&rsi->in_token); + rawobj_free(&rsi->out_handle); + rawobj_free(&rsi->out_token); +} + +/* See handle_channel_req() userspace for where the upcall data is read */ +static void rsi_request(struct cache_detail *cd, + struct cache_head *h, + char **bpp, int *blen) +{ + struct rsi *rsi = container_of(h, struct rsi, h); + __u64 index = 0; + + /* if in_handle is null, provide kernel suggestion */ + if (rsi->in_handle.len == 0) + index = gss_get_next_ctx_index(); + + qword_addhex(bpp, blen, (char *) &rsi->lustre_svc, + sizeof(rsi->lustre_svc)); + qword_addhex(bpp, blen, (char *) &rsi->nid, sizeof(rsi->nid)); + qword_addhex(bpp, blen, (char *) &index, sizeof(index)); + qword_addhex(bpp, blen, (char *) rsi->nm_name, + strlen(rsi->nm_name) + 1); + qword_addhex(bpp, blen, rsi->in_handle.data, rsi->in_handle.len); + qword_addhex(bpp, blen, rsi->in_token.data, rsi->in_token.len); + (*bpp)[-1] = '\n'; +} + +static inline void __rsi_init(struct rsi *new, struct rsi *item) +{ + new->out_handle = RAWOBJ_EMPTY; + new->out_token = RAWOBJ_EMPTY; + + new->in_handle = item->in_handle; + item->in_handle = RAWOBJ_EMPTY; + new->in_token = item->in_token; + item->in_token = RAWOBJ_EMPTY; + + new->lustre_svc = item->lustre_svc; + new->nid = item->nid; + memcpy(new->nm_name, item->nm_name, sizeof(item->nm_name)); + init_waitqueue_head(&new->waitq); +} + +static inline void __rsi_update(struct rsi *new, struct rsi *item) +{ + LASSERT(new->out_handle.len == 0); + LASSERT(new->out_token.len == 0); + + new->out_handle = item->out_handle; + item->out_handle = RAWOBJ_EMPTY; + new->out_token = item->out_token; + item->out_token = RAWOBJ_EMPTY; + + new->major_status = item->major_status; + new->minor_status = item->minor_status; +} + +#ifdef HAVE_CACHE_HASH_SPINLOCK +static void rsi_free_rcu(struct rcu_head *head) +{ + struct rsi *rsi = container_of(head, struct rsi, rcu_head); + +#ifdef HAVE_CACHE_HEAD_HLIST + LASSERT(hlist_unhashed(&rsi->h.cache_list)); +#else + LASSERT(rsi->h.next == NULL); +#endif + rsi_free(rsi); + OBD_FREE_PTR(rsi); +} + +static void rsi_put(struct kref *ref) +{ + struct rsi *rsi = container_of(ref, struct rsi, h.ref); + + call_rcu(&rsi->rcu_head, rsi_free_rcu); +} +#else /* !HAVE_CACHE_HASH_SPINLOCK */ +static void rsi_put(struct kref *ref) +{ + struct rsi *rsi = container_of(ref, struct rsi, h.ref); + +#ifdef HAVE_CACHE_HEAD_HLIST + LASSERT(hlist_unhashed(&rsi->h.cache_list)); +#else + LASSERT(rsi->h.next == NULL); +#endif + rsi_free(rsi); + OBD_FREE_PTR(rsi); +} +#endif /* HAVE_CACHE_HASH_SPINLOCK */ + +static int rsi_match(struct cache_head *a, struct cache_head *b) +{ + struct rsi *item = container_of(a, struct rsi, h); + struct rsi *tmp = container_of(b, struct rsi, h); + + return __rsi_match(item, tmp); +} + +static void rsi_init(struct cache_head *cnew, struct cache_head *citem) +{ + struct rsi *new = container_of(cnew, struct rsi, h); + struct rsi *item = container_of(citem, struct rsi, h); + + __rsi_init(new, item); +} + +static void update_rsi(struct cache_head *cnew, 
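The hash_mem() folding used for both cache keys above packs the key bytes into machine words, folds each full word into the hash, and mixes in a byte of the length so that buffers which are prefixes of one another hash differently. A standalone version, assuming a 64-bit long (LP64); hash_long() with bits == BITS_PER_LONG degenerates to a golden-ratio multiply, which is what the constant below supplies:

#include <stdio.h>

#define GOLDEN_RATIO 0x61C8864680B583EBull	/* 64-bit golden ratio */

static unsigned long hash_mem(const char *buf, int length, int bits)
{
	unsigned long hash = 0, l = 0;
	int len = 0;
	unsigned char c;

	do {
		if (len == length) {
			c = (char)len;	/* terminator mixes in the length */
			len = -1;
		} else {
			c = *buf++;
		}
		l = (l << 8) | c;
		len++;
		if ((len & (sizeof(long) - 1)) == 0)
			hash = (hash ^ l) * GOLDEN_RATIO;
	} while (len);

	return hash >> (64 - bits);
}

int main(void)
{
	printf("bucket = %lu\n", hash_mem("ctx-handle", 10, 6));
	return 0;
}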
struct cache_head *citem) +{ + struct rsi *new = container_of(cnew, struct rsi, h); + struct rsi *item = container_of(citem, struct rsi, h); + + __rsi_update(new, item); +} + +static struct cache_head *rsi_alloc(void) +{ + struct rsi *rsi; + + OBD_ALLOC_PTR(rsi); + if (rsi) + return &rsi->h; + else + return NULL; +} + +static int rsi_parse(struct cache_detail *cd, char *mesg, int mlen) +{ + char *buf = mesg; + int len; + struct rsi rsii, *rsip = NULL; + time64_t expiry; + int status = -EINVAL; + ENTRY; + + memset(&rsii, 0, sizeof(rsii)); + + /* handle */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + if (rawobj_alloc(&rsii.in_handle, buf, len)) { + status = -ENOMEM; + goto out; + } + + /* token */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + if (rawobj_alloc(&rsii.in_token, buf, len)) { + status = -ENOMEM; + goto out; + } + + rsip = rsi_lookup(&rsii); + if (!rsip) + goto out; + if (!test_bit(CACHE_PENDING, &rsip->h.flags)) { + /* If this is not a pending request, it probably means + * someone wrote arbitrary data to the init channel. + * Directly return -EINVAL in this case. + */ + status = -EINVAL; + goto out; + } + + rsii.h.flags = 0; + /* expiry */ + expiry = get_expiry(&mesg); + if (expiry == 0) + goto out; + + len = qword_get(&mesg, buf, mlen); + if (len <= 0) + goto out; + + /* major */ + status = kstrtoint(buf, 10, &rsii.major_status); + if (status) + goto out; + + /* minor */ + len = qword_get(&mesg, buf, mlen); + if (len <= 0) { + status = -EINVAL; + goto out; + } + + status = kstrtoint(buf, 10, &rsii.minor_status); + if (status) + goto out; + + /* out_handle */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + if (rawobj_alloc(&rsii.out_handle, buf, len)) { + status = -ENOMEM; + goto out; + } + + /* out_token */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + if (rawobj_alloc(&rsii.out_token, buf, len)) { + status = -ENOMEM; + goto out; + } + + rsii.h.expiry_time = expiry; + rsip = rsi_update(&rsii, rsip); + status = 0; +out: + rsi_free(&rsii); + if (rsip) { + wake_up(&rsip->waitq); + cache_put(&rsip->h, &rsi_cache); + } else { + status = -ENOMEM; + } + + if (status) + CERROR("rsi parse error %d\n", status); + RETURN(status); +} + +static struct cache_detail rsi_cache = { + .hash_size = RSI_HASHMAX, + .hash_table = rsi_table, + .name = "auth.sptlrpc.init", + .cache_put = rsi_put, + .cache_request = rsi_request, + .cache_upcall = sunrpc_cache_pipe_upcall, + .cache_parse = rsi_parse, + .match = rsi_match, + .init = rsi_init, + .update = update_rsi, + .alloc = rsi_alloc, +}; + +static struct rsi *rsi_lookup(struct rsi *item) +{ + struct cache_head *ch; + int hash = rsi_hash(item); + + ch = sunrpc_cache_lookup(&rsi_cache, &item->h, hash); + if (ch) + return container_of(ch, struct rsi, h); + else + return NULL; +} + +static struct rsi *rsi_update(struct rsi *new, struct rsi *old) +{ + struct cache_head *ch; + int hash = rsi_hash(new); + + ch = sunrpc_cache_update(&rsi_cache, &new->h, &old->h, hash); + if (ch) + return container_of(ch, struct rsi, h); + else + return NULL; +} + +/**************************************** + * rpc sec context (rsc) cache * + ****************************************/ + +#define RSC_HASHBITS (10) +#define RSC_HASHMAX (1 << RSC_HASHBITS) +#define RSC_HASHMASK (RSC_HASHMAX - 1) + +struct rsc { + struct cache_head h; + struct obd_device *target; + rawobj_t handle; + struct gss_svc_ctx ctx; +#ifdef HAVE_CACHE_HASH_SPINLOCK + struct rcu_head rcu_head; +#endif +}; + +#ifdef 
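The rsi_parse() routine above consumes one text line of space-separated, hex-quoted words in the order in_handle, in_token, expiry, major, minor, out_handle, out_token; qword_get()/qword_addhex() are the sunrpc helpers that quote and unquote the fields. A userspace sketch of walking such a line; strtok_r() is only a stand-in, good enough to show the field order, not the real quoting rules:

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* illustrative values, not a captured message */
	char line[] = "68646c 746f6b 1700000000 0 0 6f7574 6f757431";
	char *save, *field;
	int i = 0;

	for (field = strtok_r(line, " ", &save); field;
	     field = strtok_r(NULL, " ", &save))
		printf("field %d: %s\n", i++, field);
	return 0;
}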
HAVE_CACHE_HEAD_HLIST +static struct hlist_head rsc_table[RSC_HASHMAX]; +#else +static struct cache_head *rsc_table[RSC_HASHMAX]; +#endif +static struct cache_detail rsc_cache; +static struct rsc *rsc_update(struct rsc *new, struct rsc *old); +static struct rsc *rsc_lookup(struct rsc *item); + +static void rsc_free(struct rsc *rsci) +{ + rawobj_free(&rsci->handle); + rawobj_free(&rsci->ctx.gsc_rvs_hdl); + lgss_delete_sec_context(&rsci->ctx.gsc_mechctx); +} + +static inline int rsc_hash(struct rsc *rsci) +{ + return hash_mem((char *)rsci->handle.data, + rsci->handle.len, RSC_HASHBITS); +} + +static inline int __rsc_match(struct rsc *new, struct rsc *tmp) +{ + return rawobj_equal(&new->handle, &tmp->handle); +} + +static inline void __rsc_init(struct rsc *new, struct rsc *tmp) +{ + new->handle = tmp->handle; + tmp->handle = RAWOBJ_EMPTY; + + new->target = NULL; + memset(&new->ctx, 0, sizeof(new->ctx)); + new->ctx.gsc_rvs_hdl = RAWOBJ_EMPTY; +} + +static inline void __rsc_update(struct rsc *new, struct rsc *tmp) +{ + new->ctx = tmp->ctx; + memset(&tmp->ctx, 0, sizeof(tmp->ctx)); + tmp->ctx.gsc_rvs_hdl = RAWOBJ_EMPTY; + tmp->ctx.gsc_mechctx = NULL; + tmp->target = NULL; + + memset(&new->ctx.gsc_seqdata, 0, sizeof(new->ctx.gsc_seqdata)); + spin_lock_init(&new->ctx.gsc_seqdata.ssd_lock); +} + +#ifdef HAVE_CACHE_HASH_SPINLOCK +static void rsc_free_rcu(struct rcu_head *head) +{ + struct rsc *rsci = container_of(head, struct rsc, rcu_head); + +#ifdef HAVE_CACHE_HEAD_HLIST + LASSERT(hlist_unhashed(&rsci->h.cache_list)); +#else + LASSERT(rsci->h.next == NULL); +#endif + rawobj_free(&rsci->handle); + OBD_FREE_PTR(rsci); +} + +static void rsc_put(struct kref *ref) +{ + struct rsc *rsci = container_of(ref, struct rsc, h.ref); + + rawobj_free(&rsci->ctx.gsc_rvs_hdl); + lgss_delete_sec_context(&rsci->ctx.gsc_mechctx); + call_rcu(&rsci->rcu_head, rsc_free_rcu); +} +#else /* !HAVE_CACHE_HASH_SPINLOCK */ +static void rsc_put(struct kref *ref) +{ + struct rsc *rsci = container_of(ref, struct rsc, h.ref); + +#ifdef HAVE_CACHE_HEAD_HLIST + LASSERT(hlist_unhashed(&rsci->h.cache_list)); +#else + LASSERT(rsci->h.next == NULL); +#endif + rsc_free(rsci); + OBD_FREE_PTR(rsci); +} +#endif /* HAVE_CACHE_HASH_SPINLOCK */ + +static int rsc_match(struct cache_head *a, struct cache_head *b) +{ + struct rsc *new = container_of(a, struct rsc, h); + struct rsc *tmp = container_of(b, struct rsc, h); + + return __rsc_match(new, tmp); +} + +static void rsc_init(struct cache_head *cnew, struct cache_head *ctmp) +{ + struct rsc *new = container_of(cnew, struct rsc, h); + struct rsc *tmp = container_of(ctmp, struct rsc, h); + + __rsc_init(new, tmp); +} + +static void update_rsc(struct cache_head *cnew, struct cache_head *ctmp) +{ + struct rsc *new = container_of(cnew, struct rsc, h); + struct rsc *tmp = container_of(ctmp, struct rsc, h); + + __rsc_update(new, tmp); +} + +static struct cache_head * rsc_alloc(void) +{ + struct rsc *rsc; + + OBD_ALLOC_PTR(rsc); + if (rsc) + return &rsc->h; + else + return NULL; +} + +static int rsc_parse(struct cache_detail *cd, char *mesg, int mlen) +{ + char *buf = mesg; + int len, rv, tmp_int; + struct rsc rsci, *rscp = NULL; + time64_t expiry; + int status = -EINVAL; + struct gss_api_mech *gm = NULL; + + memset(&rsci, 0, sizeof(rsci)); + + /* context handle */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) goto out; + status = -ENOMEM; + if (rawobj_alloc(&rsci.handle, buf, len)) + goto out; + + rsci.h.flags = 0; + /* expiry */ + expiry = get_expiry(&mesg); + status = -EINVAL; + if (expiry == 
0) + goto out; + + /* remote flag */ + rv = get_int(&mesg, &tmp_int); + if (rv) { + CERROR("fail to get remote flag\n"); + goto out; + } + rsci.ctx.gsc_remote = (tmp_int != 0); + + /* root user flag */ + rv = get_int(&mesg, &tmp_int); + if (rv) { + CERROR("fail to get root user flag\n"); + goto out; + } + rsci.ctx.gsc_usr_root = (tmp_int != 0); + + /* mds user flag */ + rv = get_int(&mesg, &tmp_int); + if (rv) { + CERROR("fail to get mds user flag\n"); + goto out; + } + rsci.ctx.gsc_usr_mds = (tmp_int != 0); + + /* oss user flag */ + rv = get_int(&mesg, &tmp_int); + if (rv) { + CERROR("fail to get oss user flag\n"); + goto out; + } + rsci.ctx.gsc_usr_oss = (tmp_int != 0); + + /* mapped uid */ + rv = get_int(&mesg, (int *) &rsci.ctx.gsc_mapped_uid); + if (rv) { + CERROR("fail to get mapped uid\n"); + goto out; + } + + rscp = rsc_lookup(&rsci); + if (!rscp) + goto out; + + /* uid, or NEGATIVE */ + rv = get_int(&mesg, (int *) &rsci.ctx.gsc_uid); + if (rv == -EINVAL) + goto out; + if (rv == -ENOENT) { + CERROR("NOENT? set rsc entry negative\n"); + set_bit(CACHE_NEGATIVE, &rsci.h.flags); + } else { + rawobj_t tmp_buf; + time64_t ctx_expiry; + + /* gid */ + if (get_int(&mesg, (int *) &rsci.ctx.gsc_gid)) + goto out; + + /* mech name */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + gm = lgss_name_to_mech(buf); + status = -EOPNOTSUPP; + if (!gm) + goto out; + + status = -EINVAL; + /* mech-specific data: */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + + tmp_buf.len = len; + tmp_buf.data = (unsigned char *)buf; + if (lgss_import_sec_context(&tmp_buf, gm, + &rsci.ctx.gsc_mechctx)) + goto out; + + /* set to seconds since machine booted */ + expiry = ktime_get_seconds(); + + /* currently the expiry time passed down from user-space + * is invalid, here we retrive it from mech. + */ + if (lgss_inquire_context(rsci.ctx.gsc_mechctx, &ctx_expiry)) { + CERROR("unable to get expire time, drop it\n"); + goto out; + } + + /* ctx_expiry is the number of seconds since Jan 1 1970. + * We want just the number of seconds into the future. 
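The conversion completed just below is worth spelling out: the mech reports ctx_expiry as a wall-clock (epoch) time, while the cache entry wants a time on the machine's boot-relative clock. Subtracting "now" in epoch terms yields a remaining lifetime that can be added to any clock. A sketch of the arithmetic; monotonic_now stands in for ktime_get_seconds():

#include <stdio.h>

static long long cache_expiry(long long ctx_expiry_epoch,
			      long long epoch_now, long long monotonic_now)
{
	long long remaining = ctx_expiry_epoch - epoch_now;

	return monotonic_now + remaining;	/* expiry on the boot clock */
}

int main(void)
{
	/* context expires in 3600s; machine booted 5000s ago */
	printf("expiry on boot clock: %lld\n",
	       cache_expiry(1700003600, 1700000000, 5000));
	return 0;
}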
+ */ + expiry += ctx_expiry - ktime_get_real_seconds(); + } + + rsci.h.expiry_time = expiry; + rscp = rsc_update(&rsci, rscp); + status = 0; +out: + if (gm) + lgss_mech_put(gm); + rsc_free(&rsci); + if (rscp) + cache_put(&rscp->h, &rsc_cache); + else + status = -ENOMEM; + + if (status) + CERROR("parse rsc error %d\n", status); + return status; +} + +static struct cache_detail rsc_cache = { + .hash_size = RSC_HASHMAX, + .hash_table = rsc_table, + .name = "auth.sptlrpc.context", + .cache_put = rsc_put, + .cache_parse = rsc_parse, + .match = rsc_match, + .init = rsc_init, + .update = update_rsc, + .alloc = rsc_alloc, +}; + +static struct rsc *rsc_lookup(struct rsc *item) +{ + struct cache_head *ch; + int hash = rsc_hash(item); + + ch = sunrpc_cache_lookup(&rsc_cache, &item->h, hash); + if (ch) + return container_of(ch, struct rsc, h); + else + return NULL; +} + +static struct rsc *rsc_update(struct rsc *new, struct rsc *old) +{ + struct cache_head *ch; + int hash = rsc_hash(new); + + ch = sunrpc_cache_update(&rsc_cache, &new->h, &old->h, hash); + if (ch) + return container_of(ch, struct rsc, h); + else + return NULL; +} + +#define COMPAT_RSC_PUT(item, cd) cache_put((item), (cd)) + +/**************************************** + * rsc cache flush * + ****************************************/ + +static struct rsc *gss_svc_searchbyctx(rawobj_t *handle) +{ + struct rsc rsci; + struct rsc *found; + + memset(&rsci, 0, sizeof(rsci)); + if (rawobj_dup(&rsci.handle, handle)) + return NULL; + + found = rsc_lookup(&rsci); + rsc_free(&rsci); + if (!found) + return NULL; + if (cache_check(&rsc_cache, &found->h, NULL)) + return NULL; + return found; +} + +int gss_svc_upcall_install_rvs_ctx(struct obd_import *imp, + struct gss_sec *gsec, + struct gss_cli_ctx *gctx) +{ + struct rsc rsci, *rscp = NULL; + time64_t ctx_expiry; + __u32 major; + int rc; + ENTRY; + + memset(&rsci, 0, sizeof(rsci)); + + if (rawobj_alloc(&rsci.handle, (char *) &gsec->gs_rvs_hdl, + sizeof(gsec->gs_rvs_hdl))) + GOTO(out, rc = -ENOMEM); + + rscp = rsc_lookup(&rsci); + if (rscp == NULL) + GOTO(out, rc = -ENOMEM); + + major = lgss_copy_reverse_context(gctx->gc_mechctx, + &rsci.ctx.gsc_mechctx); + if (major != GSS_S_COMPLETE) + GOTO(out, rc = -ENOMEM); + + if (lgss_inquire_context(rsci.ctx.gsc_mechctx, &ctx_expiry)) { + CERROR("unable to get expire time, drop it\n"); + GOTO(out, rc = -EINVAL); + } + rsci.h.expiry_time = ctx_expiry; + + switch (imp->imp_obd->u.cli.cl_sp_to) { + case LUSTRE_SP_MDT: + rsci.ctx.gsc_usr_mds = 1; + break; + case LUSTRE_SP_OST: + rsci.ctx.gsc_usr_oss = 1; + break; + case LUSTRE_SP_CLI: + rsci.ctx.gsc_usr_root = 1; + break; + case LUSTRE_SP_MGS: + /* by convention, all 3 set to 1 means MGS */ + rsci.ctx.gsc_usr_mds = 1; + rsci.ctx.gsc_usr_oss = 1; + rsci.ctx.gsc_usr_root = 1; + break; + default: + break; + } + + rscp = rsc_update(&rsci, rscp); + if (rscp == NULL) + GOTO(out, rc = -ENOMEM); + + rscp->target = imp->imp_obd; + rawobj_dup(&gctx->gc_svc_handle, &rscp->handle); + + CWARN("create reverse svc ctx %p to %s: idx %#llx\n", + &rscp->ctx, obd2cli_tgt(imp->imp_obd), gsec->gs_rvs_hdl); + rc = 0; +out: + if (rscp) + cache_put(&rscp->h, &rsc_cache); + rsc_free(&rsci); + + if (rc) + CERROR("create reverse svc ctx: idx %#llx, rc %d\n", + gsec->gs_rvs_hdl, rc); + RETURN(rc); +} + +int gss_svc_upcall_expire_rvs_ctx(rawobj_t *handle) +{ + const time64_t expire = 20; + struct rsc *rscp; + + rscp = gss_svc_searchbyctx(handle); + if (rscp) { + CDEBUG(D_SEC, "reverse svcctx %p (rsc %p) expire soon\n", + &rscp->ctx, rscp); + 
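The switch on cl_sp_to above encodes the peer type into the context's user flags: each LUSTRE_SP_* peer sets one flag, and MGS is encoded, by convention, as all three flags at once. A sketch of the mapping with illustrative enum values, not the kernel's:

#include <stdio.h>

enum peer_type { SP_MDT, SP_OST, SP_CLI, SP_MGS };

struct svc_flags { int usr_mds, usr_oss, usr_root; };

static struct svc_flags peer_flags(enum peer_type p)
{
	struct svc_flags f = { 0, 0, 0 };

	switch (p) {
	case SP_MDT: f.usr_mds = 1; break;
	case SP_OST: f.usr_oss = 1; break;
	case SP_CLI: f.usr_root = 1; break;
	case SP_MGS: f.usr_mds = f.usr_oss = f.usr_root = 1; break;	/* all 3 == MGS */
	}
	return f;
}

int main(void)
{
	struct svc_flags f = peer_flags(SP_MGS);

	printf("mds=%d oss=%d root=%d\n", f.usr_mds, f.usr_oss, f.usr_root);
	return 0;
}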
+ rscp->h.expiry_time = ktime_get_real_seconds() + expire; + COMPAT_RSC_PUT(&rscp->h, &rsc_cache); + } + return 0; +} + +int gss_svc_upcall_dup_handle(rawobj_t *handle, struct gss_svc_ctx *ctx) +{ + struct rsc *rscp = container_of(ctx, struct rsc, ctx); + + return rawobj_dup(handle, &rscp->handle); +} + +int gss_svc_upcall_update_sequence(rawobj_t *handle, __u32 seq) +{ + struct rsc *rscp; + + rscp = gss_svc_searchbyctx(handle); + if (rscp) { + CDEBUG(D_SEC, "reverse svcctx %p (rsc %p) update seq to %u\n", + &rscp->ctx, rscp, seq + 1); + + rscp->ctx.gsc_rvs_seq = seq + 1; + COMPAT_RSC_PUT(&rscp->h, &rsc_cache); + } + return 0; +} + +static struct cache_deferred_req* cache_upcall_defer(struct cache_req *req) +{ + return NULL; +} +static struct cache_req cache_upcall_chandle = { cache_upcall_defer }; + +int gss_svc_upcall_handle_init(struct ptlrpc_request *req, + struct gss_svc_reqctx *grctx, + struct gss_wire_ctx *gw, + struct obd_device *target, + __u32 lustre_svc, + rawobj_t *rvs_hdl, + rawobj_t *in_token) +{ + struct ptlrpc_reply_state *rs; + struct rsc *rsci = NULL; + struct rsi *rsip = NULL, rsikey; + wait_queue_entry_t wait; + int replen = sizeof(struct ptlrpc_body); + struct gss_rep_header *rephdr; + int first_check = 1; + int rc = SECSVC_DROP; + ENTRY; + + memset(&rsikey, 0, sizeof(rsikey)); + rsikey.lustre_svc = lustre_svc; + /* In case of MR, rq_peer is not the NID from which request is received, + * but primary NID of peer. + * So we need LNetPrimaryNID(rq_source) to match what the clients uses. + */ + rsikey.nid = (__u64)LNetPrimaryNID(req->rq_source.nid); + nodemap_test_nid(req->rq_peer.nid, rsikey.nm_name, + sizeof(rsikey.nm_name)); + + /* duplicate context handle. for INIT it always 0 */ + if (rawobj_dup(&rsikey.in_handle, &gw->gw_handle)) { + CERROR("fail to dup context handle\n"); + GOTO(out, rc); + } + + if (rawobj_dup(&rsikey.in_token, in_token)) { + CERROR("can't duplicate token\n"); + rawobj_free(&rsikey.in_handle); + GOTO(out, rc); + } + + rsip = rsi_lookup(&rsikey); + rsi_free(&rsikey); + if (!rsip) { + CERROR("error in rsi_lookup.\n"); + + if (!gss_pack_err_notify(req, GSS_S_FAILURE, 0)) + rc = SECSVC_COMPLETE; + + GOTO(out, rc); + } + + cache_get(&rsip->h); /* take an extra ref */ + init_wait(&wait); + add_wait_queue(&rsip->waitq, &wait); + +cache_check: + /* Note each time cache_check() will drop a reference if return + * non-zero. We hold an extra reference on initial rsip, but must + * take care of following calls. */ + rc = cache_check(&rsi_cache, &rsip->h, &cache_upcall_chandle); + switch (rc) { + case -ETIMEDOUT: + case -EAGAIN: { + int valid; + + if (first_check) { + first_check = 0; + + cache_read_lock(&rsi_cache); + valid = test_bit(CACHE_VALID, &rsip->h.flags); + if (valid == 0) + set_current_state(TASK_INTERRUPTIBLE); + cache_read_unlock(&rsi_cache); + + if (valid == 0) { + unsigned long timeout; + + timeout = cfs_time_seconds(GSS_SVC_UPCALL_TIMEOUT); + schedule_timeout(timeout); + } + cache_get(&rsip->h); + goto cache_check; + } + CWARN("waited %ds timeout, drop\n", GSS_SVC_UPCALL_TIMEOUT); + break; + } + case -ENOENT: + CDEBUG(D_SEC, "cache_check return ENOENT, drop\n"); + break; + case 0: + /* if not the first check, we have to release the extra + * reference we just added on it. 
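The reference rule noted in the retry loop above is easy to get wrong: cache_check() consumes one reference on every non-zero return, so the caller must take a fresh reference before retrying. A minimal self-contained model of that discipline; the item/iget/iput/icheck names are stand-ins, not the sunrpc API:

#include <errno.h>
#include <stdio.h>

struct item { int refs; int valid; };

static void iget(struct item *it) { it->refs++; }
static void iput(struct item *it) { it->refs--; }

/* like cache_check(): a failing check eats the caller's reference */
static int icheck(struct item *it)
{
	if (it->valid)
		return 0;
	iput(it);
	return -EAGAIN;
}

static int check_with_retry(struct item *it)
{
	int rc = icheck(it);

	if (rc) {
		iget(it);	/* replace the consumed reference */
		/* the real code sleeps here waiting for the upcall */
		rc = icheck(it);
	}
	return rc;		/* on success the reference is still held */
}

int main(void)
{
	struct item it = { .refs = 1, .valid = 1 };

	printf("rc=%d refs=%d\n", check_with_retry(&it), it.refs);
	return 0;
}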
*/ + if (!first_check) + cache_put(&rsip->h, &rsi_cache); + CDEBUG(D_SEC, "cache_check is good\n"); + break; + } + + remove_wait_queue(&rsip->waitq, &wait); + cache_put(&rsip->h, &rsi_cache); + + if (rc) + GOTO(out, rc = SECSVC_DROP); + + rc = SECSVC_DROP; + rsci = gss_svc_searchbyctx(&rsip->out_handle); + if (!rsci) { + CERROR("authentication failed\n"); + + /* gss mechanism returned major and minor code so we return + * those in error message */ + if (!gss_pack_err_notify(req, rsip->major_status, + rsip->minor_status)) + rc = SECSVC_COMPLETE; + + GOTO(out, rc); + } else { + cache_get(&rsci->h); + grctx->src_ctx = &rsci->ctx; + } + + if (gw->gw_flags & LUSTRE_GSS_PACK_KCSUM) { + grctx->src_ctx->gsc_mechctx->hash_func = gss_digest_hash; + } else if (!strcmp(grctx->src_ctx->gsc_mechctx->mech_type->gm_name, + "krb5") && + !krb5_allow_old_client_csum) { + CWARN("%s: deny connection from '%s' due to missing 'krb_csum' feature, set 'sptlrpc.gss.krb5_allow_old_client_csum=1' to allow, but recommend client upgrade: rc = %d\n", + target->obd_name, libcfs_nid2str(req->rq_peer.nid), + -EPROTO); + GOTO(out, rc = SECSVC_DROP); + } else { + grctx->src_ctx->gsc_mechctx->hash_func = + gss_digest_hash_compat; + } + + if (rawobj_dup(&rsci->ctx.gsc_rvs_hdl, rvs_hdl)) { + CERROR("failed duplicate reverse handle\n"); + GOTO(out, rc); + } + + rsci->target = target; + + CDEBUG(D_SEC, "server create rsc %p(%u->%s)\n", + rsci, rsci->ctx.gsc_uid, libcfs_nid2str(req->rq_peer.nid)); + + if (rsip->out_handle.len > PTLRPC_GSS_MAX_HANDLE_SIZE) { + CERROR("handle size %u too large\n", rsip->out_handle.len); + GOTO(out, rc = SECSVC_DROP); + } + + grctx->src_init = 1; + grctx->src_reserve_len = round_up(rsip->out_token.len, 4); + + rc = lustre_pack_reply_v2(req, 1, &replen, NULL, 0); + if (rc) { + CERROR("failed to pack reply: %d\n", rc); + GOTO(out, rc = SECSVC_DROP); + } + + rs = req->rq_reply_state; + LASSERT(rs->rs_repbuf->lm_bufcount == 3); + LASSERT(rs->rs_repbuf->lm_buflens[0] >= + sizeof(*rephdr) + rsip->out_handle.len); + LASSERT(rs->rs_repbuf->lm_buflens[2] >= rsip->out_token.len); + + rephdr = lustre_msg_buf(rs->rs_repbuf, 0, 0); + rephdr->gh_version = PTLRPC_GSS_VERSION; + rephdr->gh_flags = 0; + rephdr->gh_proc = PTLRPC_GSS_PROC_ERR; + rephdr->gh_major = rsip->major_status; + rephdr->gh_minor = rsip->minor_status; + rephdr->gh_seqwin = GSS_SEQ_WIN; + rephdr->gh_handle.len = rsip->out_handle.len; + memcpy(rephdr->gh_handle.data, rsip->out_handle.data, + rsip->out_handle.len); + + memcpy(lustre_msg_buf(rs->rs_repbuf, 2, 0), rsip->out_token.data, + rsip->out_token.len); + + rs->rs_repdata_len = lustre_shrink_msg(rs->rs_repbuf, 2, + rsip->out_token.len, 0); + + rc = SECSVC_OK; + +out: + /* it looks like here we should put rsip also, but this mess up + * with NFS cache mgmt code... 
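+	 * presumably the rsi entry is simply left for the cache's own
+	 * expiry to reclaim.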
FIXME + * something like: + * if (rsip) + * rsi_put(&rsip->h, &rsi_cache); */ + + if (rsci) { + /* if anything went wrong, we don't keep the context too */ + if (rc != SECSVC_OK) + set_bit(CACHE_NEGATIVE, &rsci->h.flags); + else + CDEBUG(D_SEC, "create rsc with idx %#llx\n", + gss_handle_to_u64(&rsci->handle)); + + COMPAT_RSC_PUT(&rsci->h, &rsc_cache); + } + RETURN(rc); +} + +struct gss_svc_ctx *gss_svc_upcall_get_ctx(struct ptlrpc_request *req, + struct gss_wire_ctx *gw) +{ + struct rsc *rsc; + + rsc = gss_svc_searchbyctx(&gw->gw_handle); + if (!rsc) { + CWARN("Invalid gss ctx idx %#llx from %s\n", + gss_handle_to_u64(&gw->gw_handle), + libcfs_nid2str(req->rq_peer.nid)); + return NULL; + } + + return &rsc->ctx; +} + +void gss_svc_upcall_put_ctx(struct gss_svc_ctx *ctx) +{ + struct rsc *rsc = container_of(ctx, struct rsc, ctx); + + COMPAT_RSC_PUT(&rsc->h, &rsc_cache); +} + +void gss_svc_upcall_destroy_ctx(struct gss_svc_ctx *ctx) +{ + struct rsc *rsc = container_of(ctx, struct rsc, ctx); + + /* can't be found */ + set_bit(CACHE_NEGATIVE, &rsc->h.flags); + /* to be removed at next scan */ + rsc->h.expiry_time = 1; +} + +int __init gss_init_svc_upcall(void) +{ + int i, rc; + + /* + * this helps reducing context index confliction. after server reboot, + * conflicting request from clients might be filtered out by initial + * sequence number checking, thus no chance to sent error notification + * back to clients. + */ + get_random_bytes(&__ctx_index, sizeof(__ctx_index)); + +#ifdef HAVE_CACHE_HEAD_HLIST + for (i = 0; i < rsi_cache.hash_size; i++) + INIT_HLIST_HEAD(&rsi_cache.hash_table[i]); +#endif + rc = cache_register_net(&rsi_cache, &init_net); + if (rc != 0) + return rc; + +#ifdef HAVE_CACHE_HEAD_HLIST + for (i = 0; i < rsc_cache.hash_size; i++) + INIT_HLIST_HEAD(&rsc_cache.hash_table[i]); +#endif + rc = cache_register_net(&rsc_cache, &init_net); + if (rc != 0) { + cache_unregister_net(&rsi_cache, &init_net); + return rc; + } + + /* FIXME this looks stupid. we intend to give lsvcgssd a chance to open + * the init upcall channel, otherwise there's big chance that the first + * upcall issued before the channel be opened thus nfsv4 cache code will + * drop the request directly, thus lead to unnecessary recovery time. + * Here we wait at minimum 1.5 seconds. + */ + for (i = 0; i < 6; i++) { + if (channel_users(&rsi_cache) > 0) + break; + schedule_timeout_uninterruptible(cfs_time_seconds(1) / 4); + } + + if (channel_users(&rsi_cache) == 0) + CDEBUG(D_SEC, + "Init channel is not opened by lsvcgssd, following request might be dropped until lsvcgssd is active\n"); + + return 0; +} + +void gss_exit_svc_upcall(void) +{ + cache_purge(&rsi_cache); + cache_unregister_net(&rsi_cache, &init_net); + + cache_purge(&rsc_cache); + cache_unregister_net(&rsc_cache, &init_net); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/lproc_gss.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/lproc_gss.c new file mode 100644 index 0000000000000..e401985e69f50 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/lproc_gss.c @@ -0,0 +1,278 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +static struct dentry *gss_debugfs_dir_lk; +static struct dentry *gss_debugfs_dir; +static struct proc_dir_entry *gss_lprocfs_dir; + +/* + * statistic of "out-of-sequence-window" + */ +static struct { + spinlock_t oos_lock; + atomic_t oos_cli_count; /* client occurrence */ + int oos_cli_behind; /* client max seqs behind */ + atomic_t oos_svc_replay[3]; /* server replay detected */ + atomic_t oos_svc_pass[3]; /* server verified ok */ +} gss_stat_oos = { + .oos_cli_count = ATOMIC_INIT(0), + .oos_cli_behind = 0, + .oos_svc_replay = { ATOMIC_INIT(0), }, + .oos_svc_pass = { ATOMIC_INIT(0), }, +}; + +void gss_stat_oos_record_cli(int behind) +{ + atomic_inc(&gss_stat_oos.oos_cli_count); + + spin_lock(&gss_stat_oos.oos_lock); + if (behind > gss_stat_oos.oos_cli_behind) + gss_stat_oos.oos_cli_behind = behind; + spin_unlock(&gss_stat_oos.oos_lock); +} + +void gss_stat_oos_record_svc(int phase, int replay) +{ + LASSERT(phase >= 0 && phase <= 2); + + if (replay) + atomic_inc(&gss_stat_oos.oos_svc_replay[phase]); + else + atomic_inc(&gss_stat_oos.oos_svc_pass[phase]); +} + +static int gss_proc_oos_seq_show(struct seq_file *m, void *v) +{ + seq_printf(m, "seqwin: %u\n" + "backwin: %u\n" + "client fall behind seqwin\n" + " occurrence: %d\n" + " max seq behind: %d\n" + "server replay detected:\n" + " phase 0: %d\n" + " phase 1: %d\n" + " phase 2: %d\n" + "server verify ok:\n" + " phase 2: %d\n", + GSS_SEQ_WIN_MAIN, + GSS_SEQ_WIN_BACK, + atomic_read(&gss_stat_oos.oos_cli_count), + gss_stat_oos.oos_cli_behind, + atomic_read(&gss_stat_oos.oos_svc_replay[0]), + atomic_read(&gss_stat_oos.oos_svc_replay[1]), + atomic_read(&gss_stat_oos.oos_svc_replay[2]), + atomic_read(&gss_stat_oos.oos_svc_pass[2])); + return 0; +} +LDEBUGFS_SEQ_FOPS_RO(gss_proc_oos); + +static ssize_t +gss_proc_write_secinit(struct file *file, const char *buffer, + size_t count, loff_t *off) +{ + int rc; + + rc = gss_do_ctx_init_rpc((char *) buffer, count); + if (rc) { + LASSERT(rc < 0); + return rc; + } + return count; +} + +static const struct file_operations gss_proc_secinit = { + .write = gss_proc_write_secinit, +}; + +int sptlrpc_krb5_allow_old_client_csum_seq_show(struct seq_file *m, void *data) +{ + seq_printf(m, "%u\n", krb5_allow_old_client_csum); + return 0; +} + +ssize_t sptlrpc_krb5_allow_old_client_csum_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + bool val; + int rc; + + rc = kstrtobool_from_user(buffer, count, &val); + if (rc) + return rc; + + krb5_allow_old_client_csum = val; + 
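+	/* takes effect for subsequent context negotiations (checked in
+	 * gss_svc_upcall_handle_init()); contexts that are already
+	 * established keep the hash function chosen at setup time */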
return count; +} +LPROC_SEQ_FOPS(sptlrpc_krb5_allow_old_client_csum); + +#ifdef HAVE_GSS_KEYRING +int sptlrpc_gss_check_upcall_ns_seq_show(struct seq_file *m, void *data) +{ + seq_printf(m, "%u\n", gss_check_upcall_ns); + return 0; +} + +ssize_t sptlrpc_gss_check_upcall_ns_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + bool val; + int rc; + + rc = kstrtobool_from_user(buffer, count, &val); + if (rc) + return rc; + + gss_check_upcall_ns = val; + return count; +} +LPROC_SEQ_FOPS(sptlrpc_gss_check_upcall_ns); +#endif /* HAVE_GSS_KEYRING */ + +static struct ldebugfs_vars gss_debugfs_vars[] = { + { .name = "replays", + .fops = &gss_proc_oos_fops }, + { .name = "init_channel", + .fops = &gss_proc_secinit, + .proc_mode = 0222 }, + { NULL } +}; + +static struct lprocfs_vars gss_lprocfs_vars[] = { + { .name = "krb5_allow_old_client_csum", + .fops = &sptlrpc_krb5_allow_old_client_csum_fops }, +#ifdef HAVE_GSS_KEYRING + { .name = "gss_check_upcall_ns", + .fops = &sptlrpc_gss_check_upcall_ns_fops }, +#endif + { NULL } +}; + +/* + * for userspace helper lgss_keyring. + * + * debug_level: [0, 4], defined in utils/gss/lgss_utils.h + */ +static int gss_lk_debug_level = 1; + +static int gss_lk_proc_dl_seq_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%u\n", gss_lk_debug_level); + return 0; +} + +static ssize_t +gss_lk_proc_dl_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + unsigned int val; + int rc; + + rc = kstrtouint_from_user(buffer, count, 0, &val); + if (rc < 0) + return rc; + + if (val > 4) + return -ERANGE; + + gss_lk_debug_level = val; + + return count; +} +LDEBUGFS_SEQ_FOPS(gss_lk_proc_dl); + +static struct ldebugfs_vars gss_lk_debugfs_vars[] = { + { .name = "debug_level", + .fops = &gss_lk_proc_dl_fops }, + { NULL } +}; + +void gss_exit_tunables(void) +{ + debugfs_remove_recursive(gss_debugfs_dir_lk); + gss_debugfs_dir_lk = NULL; + + debugfs_remove_recursive(gss_debugfs_dir); + gss_debugfs_dir = NULL; + + if (!IS_ERR_OR_NULL(gss_lprocfs_dir)) + lprocfs_remove(&gss_lprocfs_dir); +} + +int gss_init_tunables(void) +{ + int rc; + + spin_lock_init(&gss_stat_oos.oos_lock); + + gss_debugfs_dir = debugfs_create_dir("gss", sptlrpc_debugfs_dir); + ldebugfs_add_vars(gss_debugfs_dir, gss_debugfs_vars, NULL); + + gss_debugfs_dir_lk = debugfs_create_dir("lgss_keyring", + gss_debugfs_dir); + ldebugfs_add_vars(gss_debugfs_dir_lk, gss_lk_debugfs_vars, NULL); + + gss_lprocfs_dir = lprocfs_register("gss", sptlrpc_lprocfs_dir, + gss_lprocfs_vars, NULL); + if (IS_ERR_OR_NULL(gss_lprocfs_dir)) { + rc = gss_lprocfs_dir ? PTR_ERR(gss_lprocfs_dir) : -ENOMEM; + gss_lprocfs_dir = NULL; + GOTO(out, rc); + } + + return 0; + +out: + CERROR("failed to initialize gss lproc entries: %d\n", rc); + gss_exit_tunables(); + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/sec_gss.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/sec_gss.c new file mode 100644 index 0000000000000..7c8001152a454 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/sec_gss.c @@ -0,0 +1,2929 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2015, Intel Corporation. + * + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/auth_gss.c + * + * RPCSEC_GSS client authentication. + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. 
+ * + * Dug Song + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +#include +#include + +/* + * early reply have fixed size, respectively in privacy and integrity mode. + * so we calculate them only once. + */ +static int gss_at_reply_off_integ; +static int gss_at_reply_off_priv; + + +static inline int msg_last_segidx(struct lustre_msg *msg) +{ + LASSERT(msg->lm_bufcount > 0); + return msg->lm_bufcount - 1; +} +static inline int msg_last_seglen(struct lustre_msg *msg) +{ + return msg->lm_buflens[msg_last_segidx(msg)]; +} + +/******************************************** + * wire data swabber * + ********************************************/ + +static +void gss_header_swabber(struct gss_header *ghdr) +{ + __swab32s(&ghdr->gh_flags); + __swab32s(&ghdr->gh_proc); + __swab32s(&ghdr->gh_seq); + __swab32s(&ghdr->gh_svc); + __swab32s(&ghdr->gh_pad1); + __swab32s(&ghdr->gh_handle.len); +} + +struct gss_header *gss_swab_header(struct lustre_msg *msg, int segment, + int swabbed) +{ + struct gss_header *ghdr; + + ghdr = lustre_msg_buf(msg, segment, sizeof(*ghdr)); + if (ghdr == NULL) + return NULL; + + if (swabbed) + gss_header_swabber(ghdr); + + if (sizeof(*ghdr) + ghdr->gh_handle.len > msg->lm_buflens[segment]) { + CERROR("gss header has length %d, now %u received\n", + (int) sizeof(*ghdr) + ghdr->gh_handle.len, + msg->lm_buflens[segment]); + return NULL; + } + + return ghdr; +} + +/* + * payload should be obtained from mechanism. but currently since we + * only support kerberos, we could simply use fixed value. + * krb5 "meta" data: + * - krb5 header: 16 + * - krb5 checksum: 20 + * + * for privacy mode, payload also include the cipher text which has the same + * size as plain text, plus possible confounder, padding both at maximum cipher + * block size. 
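+ *
+ * gss_mech_payload() below therefore budgets the fixed integrity
+ * payload plus three cipher-block-sized (16 byte) allowances on top
+ * of the message size for privacy mode.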
+ */ +#define GSS_KRB5_INTEG_MAX_PAYLOAD (40) + +static inline +int gss_mech_payload(struct gss_ctx *mechctx, int msgsize, int privacy) +{ + if (privacy) + return GSS_KRB5_INTEG_MAX_PAYLOAD + 16 + 16 + 16 + msgsize; + else + return GSS_KRB5_INTEG_MAX_PAYLOAD; +} + +/* + * return signature size, otherwise < 0 to indicate error + */ +static int gss_sign_msg(struct lustre_msg *msg, + struct gss_ctx *mechctx, + enum lustre_sec_part sp, + __u32 flags, __u32 proc, __u32 seq, __u32 svc, + rawobj_t *handle) +{ + struct gss_header *ghdr; + rawobj_t text[4], mic; + int textcnt, max_textcnt, mic_idx; + __u32 major; + + LASSERT(msg->lm_bufcount >= 2); + + /* gss hdr */ + LASSERT(msg->lm_buflens[0] >= + sizeof(*ghdr) + (handle ? handle->len : 0)); + ghdr = lustre_msg_buf(msg, 0, 0); + + ghdr->gh_version = PTLRPC_GSS_VERSION; + ghdr->gh_sp = (__u8) sp; + ghdr->gh_flags = flags; + ghdr->gh_proc = proc; + ghdr->gh_seq = seq; + ghdr->gh_svc = svc; + if (!handle) { + /* fill in a fake one */ + ghdr->gh_handle.len = 0; + } else { + ghdr->gh_handle.len = handle->len; + memcpy(ghdr->gh_handle.data, handle->data, handle->len); + } + + /* no actual signature for null mode */ + if (svc == SPTLRPC_SVC_NULL) + return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + + /* MIC */ + mic_idx = msg_last_segidx(msg); + max_textcnt = (svc == SPTLRPC_SVC_AUTH) ? 1 : mic_idx; + + for (textcnt = 0; textcnt < max_textcnt; textcnt++) { + text[textcnt].len = msg->lm_buflens[textcnt]; + text[textcnt].data = lustre_msg_buf(msg, textcnt, 0); + } + + mic.len = msg->lm_buflens[mic_idx]; + mic.data = lustre_msg_buf(msg, mic_idx, 0); + + major = lgss_get_mic(mechctx, textcnt, text, 0, NULL, &mic); + if (major != GSS_S_COMPLETE) { + CERROR("fail to generate MIC: %08x\n", major); + return -EPERM; + } + LASSERT(mic.len <= msg->lm_buflens[mic_idx]); + + return lustre_shrink_msg(msg, mic_idx, mic.len, 0); +} + +/* + * return gss error + */ +static +__u32 gss_verify_msg(struct lustre_msg *msg, + struct gss_ctx *mechctx, + __u32 svc) +{ + rawobj_t text[4], mic; + int textcnt, max_textcnt; + int mic_idx; + __u32 major; + + LASSERT(msg->lm_bufcount >= 2); + + if (svc == SPTLRPC_SVC_NULL) + return GSS_S_COMPLETE; + + mic_idx = msg_last_segidx(msg); + max_textcnt = (svc == SPTLRPC_SVC_AUTH) ? 
1 : mic_idx; + + for (textcnt = 0; textcnt < max_textcnt; textcnt++) { + text[textcnt].len = msg->lm_buflens[textcnt]; + text[textcnt].data = lustre_msg_buf(msg, textcnt, 0); + } + + mic.len = msg->lm_buflens[mic_idx]; + mic.data = lustre_msg_buf(msg, mic_idx, 0); + + major = lgss_verify_mic(mechctx, textcnt, text, 0, NULL, &mic); + if (major != GSS_S_COMPLETE) + CERROR("mic verify error: %08x\n", major); + + return major; +} + +/* + * return gss error code + */ +static +__u32 gss_unseal_msg(struct gss_ctx *mechctx, + struct lustre_msg *msgbuf, + int *msg_len, int msgbuf_len) +{ + rawobj_t clear_obj, hdrobj, token; + __u8 *clear_buf; + int clear_buflen; + __u32 major; + ENTRY; + + if (msgbuf->lm_bufcount != 2) { + CERROR("invalid bufcount %d\n", msgbuf->lm_bufcount); + RETURN(GSS_S_FAILURE); + } + + /* allocate a temporary clear text buffer, same sized as token, + * we assume the final clear text size <= token size */ + clear_buflen = lustre_msg_buflen(msgbuf, 1); + OBD_ALLOC_LARGE(clear_buf, clear_buflen); + if (!clear_buf) + RETURN(GSS_S_FAILURE); + + /* buffer objects */ + hdrobj.len = lustre_msg_buflen(msgbuf, 0); + hdrobj.data = lustre_msg_buf(msgbuf, 0, 0); + token.len = lustre_msg_buflen(msgbuf, 1); + token.data = lustre_msg_buf(msgbuf, 1, 0); + clear_obj.len = clear_buflen; + clear_obj.data = clear_buf; + + major = lgss_unwrap(mechctx, &hdrobj, &token, &clear_obj); + if (major != GSS_S_COMPLETE) { + CERROR("unwrap message error: %08x\n", major); + GOTO(out_free, major = GSS_S_FAILURE); + } + LASSERT(clear_obj.len <= clear_buflen); + LASSERT(clear_obj.len <= msgbuf_len); + + /* now the decrypted message */ + memcpy(msgbuf, clear_obj.data, clear_obj.len); + *msg_len = clear_obj.len; + + major = GSS_S_COMPLETE; +out_free: + OBD_FREE_LARGE(clear_buf, clear_buflen); + RETURN(major); +} + +/******************************************** + * gss client context manipulation helpers * + ********************************************/ + +int cli_ctx_expire(struct ptlrpc_cli_ctx *ctx) +{ + LASSERT(atomic_read(&ctx->cc_refcount)); + + if (!test_and_set_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags)) { + if (!ctx->cc_early_expire) + clear_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags); + + CWARN("ctx %p(%u->%s) get expired: %lld(%+llds)\n", + ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec), + ctx->cc_expire, + ctx->cc_expire == 0 ? 0 : + ctx->cc_expire - ktime_get_real_seconds()); + + sptlrpc_cli_ctx_wakeup(ctx); + return 1; + } + + return 0; +} + +/* + * return 1 if the context is dead. + */ +int cli_ctx_check_death(struct ptlrpc_cli_ctx *ctx) +{ + if (unlikely(cli_ctx_is_dead(ctx))) + return 1; + + /* expire is 0 means never expire. a newly created gss context + * which during upcall may has 0 expiration */ + if (ctx->cc_expire == 0) + return 0; + + /* check real expiration */ + if (ctx->cc_expire > ktime_get_real_seconds()) + return 0; + + cli_ctx_expire(ctx); + return 1; +} + +void gss_cli_ctx_uptodate(struct gss_cli_ctx *gctx) +{ + struct ptlrpc_cli_ctx *ctx = &gctx->gc_base; + time64_t ctx_expiry; + + if (lgss_inquire_context(gctx->gc_mechctx, &ctx_expiry)) { + CERROR("ctx %p(%u): unable to inquire, expire it now\n", + gctx, ctx->cc_vcred.vc_uid); + ctx_expiry = 1; /* make it expired now */ + } + + ctx->cc_expire = gss_round_ctx_expiry(ctx_expiry, + ctx->cc_sec->ps_flvr.sf_flags); + + /* At this point this ctx might have been marked as dead by + * someone else, in which case nobody will make further use + * of it. 
we don't care: marking it UPTODATE will help
+	 * destroy the server side context when it is destroyed. */
+	set_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags);
+
+	if (sec_is_reverse(ctx->cc_sec)) {
+		CWARN("server installed reverse ctx %p idx %#llx, "
+		      "expiry %lld(%+llds)\n", ctx,
+		      gss_handle_to_u64(&gctx->gc_handle),
+		      ctx->cc_expire,
+		      ctx->cc_expire - ktime_get_real_seconds());
+	} else {
+		CWARN("client refreshed ctx %p idx %#llx (%u->%s), "
+		      "expiry %lld(%+llds)\n", ctx,
+		      gss_handle_to_u64(&gctx->gc_handle),
+		      ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec),
+		      ctx->cc_expire,
+		      ctx->cc_expire - ktime_get_real_seconds());
+
+		/* install reverse svc ctx for root context */
+		if (ctx->cc_vcred.vc_uid == 0)
+			gss_sec_install_rctx(ctx->cc_sec->ps_import,
+					     ctx->cc_sec, ctx);
+	}
+
+	sptlrpc_cli_ctx_wakeup(ctx);
+}
+
+static void gss_cli_ctx_finalize(struct gss_cli_ctx *gctx)
+{
+	LASSERT(gctx->gc_base.cc_sec);
+
+	if (gctx->gc_mechctx) {
+		lgss_delete_sec_context(&gctx->gc_mechctx);
+		gctx->gc_mechctx = NULL;
+	}
+
+	if (!rawobj_empty(&gctx->gc_svc_handle)) {
+		/* forward ctx: mark buddy reverse svcctx soon-expire. */
+		if (!sec_is_reverse(gctx->gc_base.cc_sec) &&
+		    !rawobj_empty(&gctx->gc_svc_handle))
+			gss_svc_upcall_expire_rvs_ctx(&gctx->gc_svc_handle);
+
+		rawobj_free(&gctx->gc_svc_handle);
+	}
+
+	rawobj_free(&gctx->gc_handle);
+}
+
+/**
+ * Based on the sequence number algorithm specified in RFC 2203.
+ *
+ * Modified for our own problem: an arriving request has a valid sequence
+ * number, but unwrapping the request might take a long time, after which
+ * its sequence number is no longer valid (it falls behind the window).
+ * This rarely happens, mostly under extreme load.
+ *
+ * Note we should not check the sequence before verifying the integrity of
+ * the incoming request, because a single attacking request with a high
+ * sequence number could cause all following requests to be dropped.
+ *
+ * So here we use a multi-phase approach: prepare 2 sequence windows, a
+ * "main window" for normal sequences and a "back window" for fallen-behind
+ * sequences, plus a 3-phase checking mechanism:
+ * 0 - before integrity verification, perform an initial sequence check in
+ *     the main window, which only tests and doesn't actually set any bits.
+ *     if the sequence is high above the window, or fits in the window and
+ *     the bit is 0, then accept and proceed to integrity verification;
+ *     otherwise reject this sequence.
+ * 1 - after integrity verification, check the main window again. if this
+ *     sequence is high above the window, or fits in the window and the bit
+ *     is 0, then set the bit and accept; if it fits in the window but the
+ *     bit is already set, then reject; if it falls behind the window, then
+ *     proceed to phase 2.
+ * 2 - check the back window. if it is high above the window, or fits in
+ *     the window and the bit is 0, then set the bit and accept; otherwise
+ *     reject.
+ *
+ * \return 1: looks like a replay
+ * \return 0: is ok
+ * \return -1: is a replay
+ *
+ * Note phase 0 is necessary, because otherwise a replayed request whose
+ * sequence falls between the 2 windows couldn't be detected.
+ *
+ * This mechanism can't totally solve the problem, but it helps reduce the
+ * number of valid requests that get dropped.
+ */
+static
+int gss_do_check_seq(unsigned long *window, __u32 win_size, __u32 *max_seq,
+		     __u32 seq_num, int phase)
+{
+	LASSERT(phase >= 0 && phase <= 2);
+
+	if (seq_num > *max_seq) {
+		/*
+		 * 1.
high above the window + */ + if (phase == 0) + return 0; + + if (seq_num >= *max_seq + win_size) { + memset(window, 0, win_size / 8); + *max_seq = seq_num; + } else { + while(*max_seq < seq_num) { + (*max_seq)++; + __clear_bit((*max_seq) % win_size, window); + } + } + __set_bit(seq_num % win_size, window); + } else if (seq_num + win_size <= *max_seq) { + /* + * 2. low behind the window + */ + if (phase == 0 || phase == 2) + goto replay; + + CWARN("seq %u is %u behind (size %d), check backup window\n", + seq_num, *max_seq - win_size - seq_num, win_size); + return 1; + } else { + /* + * 3. fit into the window + */ + switch (phase) { + case 0: + if (test_bit(seq_num % win_size, window)) + goto replay; + break; + case 1: + case 2: + if (__test_and_set_bit(seq_num % win_size, window)) + goto replay; + break; + } + } + + return 0; + +replay: + CERROR("seq %u (%s %s window) is a replay: max %u, winsize %d\n", + seq_num, + seq_num + win_size > *max_seq ? "in" : "behind", + phase == 2 ? "backup " : "main", + *max_seq, win_size); + return -1; +} + +/* + * Based on sequence number algorithm as specified in RFC 2203. + * + * if @set == 0: initial check, don't set any bit in window + * if @sec == 1: final check, set bit in window + */ +int gss_check_seq_num(struct gss_svc_seq_data *ssd, __u32 seq_num, int set) +{ + int rc = 0; + + spin_lock(&ssd->ssd_lock); + + if (set == 0) { + /* + * phase 0 testing + */ + rc = gss_do_check_seq(ssd->ssd_win_main, GSS_SEQ_WIN_MAIN, + &ssd->ssd_max_main, seq_num, 0); + if (unlikely(rc)) + gss_stat_oos_record_svc(0, 1); + } else { + /* + * phase 1 checking main window + */ + rc = gss_do_check_seq(ssd->ssd_win_main, GSS_SEQ_WIN_MAIN, + &ssd->ssd_max_main, seq_num, 1); + switch (rc) { + case -1: + gss_stat_oos_record_svc(1, 1); + fallthrough; + case 0: + goto exit; + } + /* + * phase 2 checking back window + */ + rc = gss_do_check_seq(ssd->ssd_win_back, GSS_SEQ_WIN_BACK, + &ssd->ssd_max_back, seq_num, 2); + if (rc) + gss_stat_oos_record_svc(2, 1); + else + gss_stat_oos_record_svc(2, 0); + } +exit: + spin_unlock(&ssd->ssd_lock); + return rc; +} + +/*************************************** + * cred APIs * + ***************************************/ + +static inline int gss_cli_payload(struct ptlrpc_cli_ctx *ctx, + int msgsize, int privacy) +{ + return gss_mech_payload(NULL, msgsize, privacy); +} + +static int gss_cli_bulk_payload(struct ptlrpc_cli_ctx *ctx, + struct sptlrpc_flavor *flvr, + int reply, int read) +{ + int payload = sizeof(struct ptlrpc_bulk_sec_desc); + + LASSERT(SPTLRPC_FLVR_BULK_TYPE(flvr->sf_rpc) == SPTLRPC_BULK_DEFAULT); + + if ((!reply && !read) || (reply && read)) { + switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) { + case SPTLRPC_BULK_SVC_NULL: + break; + case SPTLRPC_BULK_SVC_INTG: + payload += gss_cli_payload(ctx, 0, 0); + break; + case SPTLRPC_BULK_SVC_PRIV: + payload += gss_cli_payload(ctx, 0, 1); + break; + case SPTLRPC_BULK_SVC_AUTH: + default: + LBUG(); + } + } + + return payload; +} + +int gss_cli_ctx_match(struct ptlrpc_cli_ctx *ctx, struct vfs_cred *vcred) +{ + return (ctx->cc_vcred.vc_uid == vcred->vc_uid); +} + +void gss_cli_ctx_flags2str(unsigned long flags, char *buf, int bufsize) +{ + buf[0] = '\0'; + + if (flags & PTLRPC_CTX_NEW) + strlcat(buf, "new,", bufsize); + if (flags & PTLRPC_CTX_UPTODATE) + strlcat(buf, "uptodate,", bufsize); + if (flags & PTLRPC_CTX_DEAD) + strlcat(buf, "dead,", bufsize); + if (flags & PTLRPC_CTX_ERROR) + strlcat(buf, "error,", bufsize); + if (flags & PTLRPC_CTX_CACHED) + strlcat(buf, "cached,", bufsize); + if 
(flags & PTLRPC_CTX_ETERNAL)
+		strlcat(buf, "eternal,", bufsize);
+	if (buf[0] == '\0')
+		strlcat(buf, "-,", bufsize);
+}
+
+int gss_cli_ctx_sign(struct ptlrpc_cli_ctx *ctx,
+		     struct ptlrpc_request *req)
+{
+	struct gss_cli_ctx *gctx = ctx2gctx(ctx);
+	__u32 flags = 0, seq, svc;
+	int rc;
+	ENTRY;
+
+	LASSERT(req->rq_reqbuf);
+	LASSERT(req->rq_reqbuf->lm_bufcount >= 2);
+	LASSERT(req->rq_cli_ctx == ctx);
+
+	/* nothing to do for context negotiation RPCs */
+	if (req->rq_ctx_init)
+		RETURN(0);
+
+	svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc);
+	if (req->rq_pack_bulk)
+		flags |= LUSTRE_GSS_PACK_BULK;
+	if (req->rq_pack_udesc)
+		flags |= LUSTRE_GSS_PACK_USER;
+
+redo:
+	seq = atomic_inc_return(&gctx->gc_seq);
+
+	rc = gss_sign_msg(req->rq_reqbuf, gctx->gc_mechctx,
+			  ctx->cc_sec->ps_part,
+			  flags, gctx->gc_proc, seq, svc,
+			  &gctx->gc_handle);
+	if (rc < 0)
+		RETURN(rc);
+
+	/* gss_sign_msg() might take a long time to finish, during which
+	 * more rpcs could be wrapped up and sent out. if we find too many
+	 * of them we should repack this rpc, because sending it too late
+	 * might make its sequence number fall behind the window on the
+	 * server and get it dropped. also applies to gss_cli_ctx_seal().
+	 *
+	 * Note: null mode doesn't check sequence number. */
+	if (svc != SPTLRPC_SVC_NULL &&
+	    atomic_read(&gctx->gc_seq) - seq > GSS_SEQ_REPACK_THRESHOLD) {
+		int behind = atomic_read(&gctx->gc_seq) - seq;
+
+		gss_stat_oos_record_cli(behind);
+		CWARN("req %p: %u behind, retry signing\n", req, behind);
+		goto redo;
+	}
+
+	req->rq_reqdata_len = rc;
+	RETURN(0);
+}
+
+static
+int gss_cli_ctx_handle_err_notify(struct ptlrpc_cli_ctx *ctx,
+				  struct ptlrpc_request *req,
+				  struct gss_header *ghdr)
+{
+	struct gss_err_header *errhdr;
+	int rc;
+
+	LASSERT(ghdr->gh_proc == PTLRPC_GSS_PROC_ERR);
+
+	errhdr = (struct gss_err_header *) ghdr;
+
+	CWARN("req x%llu/t%llu, ctx %p idx %#llx(%u->%s): "
+	      "%sserver respond (%08x/%08x)\n",
+	      req->rq_xid, req->rq_transno, ctx,
+	      gss_handle_to_u64(&ctx2gctx(ctx)->gc_handle),
+	      ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec),
+	      sec_is_reverse(ctx->cc_sec) ? "reverse" : "",
+	      errhdr->gh_major, errhdr->gh_minor);
+
+	/* context fini rpc, let it fail */
+	if (req->rq_ctx_fini) {
+		CWARN("context fini rpc failed\n");
+		return -EINVAL;
+	}
+
+	/* reverse sec, just return error, don't expire this ctx because it's
+	 * crucial to callback rpcs. note if the callback rpc failed because
+	 * of a bit flip during network transfer, the client will be evicted
+	 * directly. so, more gracefully, we probably want to let it retry a
+	 * number of times. */
+	if (sec_is_reverse(ctx->cc_sec))
+		return -EINVAL;
+
+	if (errhdr->gh_major != GSS_S_NO_CONTEXT &&
+	    errhdr->gh_major != GSS_S_BAD_SIG)
+		return -EACCES;
+
+	/* a NO_CONTEXT return from the server might be caused by context
+	 * expiry or server reboot/failover. we try to refresh a new ctx,
+	 * which is transparent to the upper layer.
+	 *
+	 * In some cases our gss handle may happen to be identical to another
+	 * handle, since the handle itself is not fully random. In the krb5
+	 * case GSS_S_BAD_SIG will be returned; other mechanisms may return
+	 * other gss errors.
+	 *
+	 * if we add a new mechanism, make sure the correct error is
+	 * returned in this case. */
+	CWARN("%s: server might lost the context, retrying\n",
+	      errhdr->gh_major == GSS_S_NO_CONTEXT ?
"NO_CONTEXT" : "BAD_SIG"); + + sptlrpc_cli_ctx_expire(ctx); + + /* we need replace the ctx right here, otherwise during + * resent we'll hit the logic in sptlrpc_req_refresh_ctx() + * which keep the ctx with RESEND flag, thus we'll never + * get rid of this ctx. */ + rc = sptlrpc_req_replace_dead_ctx(req); + if (rc == 0) + req->rq_resend = 1; + + return rc; +} + +int gss_cli_ctx_verify(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req) +{ + struct gss_cli_ctx *gctx; + struct gss_header *ghdr, *reqhdr; + struct lustre_msg *msg = req->rq_repdata; + __u32 major; + int pack_bulk, swabbed, rc = 0; + ENTRY; + + LASSERT(req->rq_cli_ctx == ctx); + LASSERT(msg); + + gctx = container_of(ctx, struct gss_cli_ctx, gc_base); + + /* special case for context negotiation, rq_repmsg/rq_replen actually + * are not used currently. but early reply always be treated normally */ + if (req->rq_ctx_init && !req->rq_early) { + req->rq_repmsg = lustre_msg_buf(msg, 1, 0); + req->rq_replen = msg->lm_buflens[1]; + RETURN(0); + } + + if (msg->lm_bufcount < 2 || msg->lm_bufcount > 4) { + CERROR("unexpected bufcount %u\n", msg->lm_bufcount); + RETURN(-EPROTO); + } + + swabbed = req_capsule_rep_need_swab(&req->rq_pill); + + ghdr = gss_swab_header(msg, 0, swabbed); + if (ghdr == NULL) { + CERROR("can't decode gss header\n"); + RETURN(-EPROTO); + } + + /* sanity checks */ + reqhdr = lustre_msg_buf(msg, 0, sizeof(*reqhdr)); + LASSERT(reqhdr); + + if (ghdr->gh_version != reqhdr->gh_version) { + CERROR("gss version %u mismatch, expect %u\n", + ghdr->gh_version, reqhdr->gh_version); + RETURN(-EPROTO); + } + + switch (ghdr->gh_proc) { + case PTLRPC_GSS_PROC_DATA: + pack_bulk = ghdr->gh_flags & LUSTRE_GSS_PACK_BULK; + + if (!req->rq_early && !equi(req->rq_pack_bulk == 1, pack_bulk)){ + CERROR("%s bulk flag in reply\n", + req->rq_pack_bulk ? 
"missing" : "unexpected"); + RETURN(-EPROTO); + } + + if (ghdr->gh_seq != reqhdr->gh_seq) { + CERROR("seqnum %u mismatch, expect %u\n", + ghdr->gh_seq, reqhdr->gh_seq); + RETURN(-EPROTO); + } + + if (ghdr->gh_svc != reqhdr->gh_svc) { + CERROR("svc %u mismatch, expect %u\n", + ghdr->gh_svc, reqhdr->gh_svc); + RETURN(-EPROTO); + } + + if (swabbed) + gss_header_swabber(ghdr); + + major = gss_verify_msg(msg, gctx->gc_mechctx, reqhdr->gh_svc); + if (major != GSS_S_COMPLETE) { + CERROR("failed to verify reply: %x\n", major); + RETURN(-EPERM); + } + + if (req->rq_early && reqhdr->gh_svc == SPTLRPC_SVC_NULL) { + __u32 cksum; + + cksum = crc32_le(!(__u32) 0, + lustre_msg_buf(msg, 1, 0), + lustre_msg_buflen(msg, 1)); + if (cksum != msg->lm_cksum) { + CWARN("early reply checksum mismatch: " + "%08x != %08x\n", cksum, msg->lm_cksum); + RETURN(-EPROTO); + } + } + + if (pack_bulk) { + /* bulk checksum is right after the lustre msg */ + if (msg->lm_bufcount < 3) { + CERROR("Invalid reply bufcount %u\n", + msg->lm_bufcount); + RETURN(-EPROTO); + } + + rc = bulk_sec_desc_unpack(msg, 2, swabbed); + if (rc) { + CERROR("unpack bulk desc: %d\n", rc); + RETURN(rc); + } + } + + req->rq_repmsg = lustre_msg_buf(msg, 1, 0); + req->rq_replen = msg->lm_buflens[1]; + break; + case PTLRPC_GSS_PROC_ERR: + if (req->rq_early) { + CERROR("server return error with early reply\n"); + rc = -EPROTO; + } else { + rc = gss_cli_ctx_handle_err_notify(ctx, req, ghdr); + } + break; + default: + CERROR("unknown gss proc %d\n", ghdr->gh_proc); + rc = -EPROTO; + } + + RETURN(rc); +} + +int gss_cli_ctx_seal(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req) +{ + struct gss_cli_ctx *gctx; + rawobj_t hdrobj, msgobj, token; + struct gss_header *ghdr; + __u32 buflens[2], major; + int wiresize, rc; + ENTRY; + + LASSERT(req->rq_clrbuf); + LASSERT(req->rq_cli_ctx == ctx); + LASSERT(req->rq_reqlen); + + gctx = container_of(ctx, struct gss_cli_ctx, gc_base); + + /* final clear data length */ + req->rq_clrdata_len = lustre_msg_size_v2(req->rq_clrbuf->lm_bufcount, + req->rq_clrbuf->lm_buflens); + + /* calculate wire data length */ + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = gss_cli_payload(&gctx->gc_base, req->rq_clrdata_len, 1); + wiresize = lustre_msg_size_v2(2, buflens); + + /* allocate wire buffer */ + if (req->rq_pool) { + /* pre-allocated */ + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf != req->rq_clrbuf); + LASSERT(req->rq_reqbuf_len >= wiresize); + } else { + OBD_ALLOC_LARGE(req->rq_reqbuf, wiresize); + if (!req->rq_reqbuf) + RETURN(-ENOMEM); + req->rq_reqbuf_len = wiresize; + } + + lustre_init_msg_v2(req->rq_reqbuf, 2, buflens, NULL); + req->rq_reqbuf->lm_secflvr = req->rq_flvr.sf_rpc; + + /* gss header */ + ghdr = lustre_msg_buf(req->rq_reqbuf, 0, 0); + ghdr->gh_version = PTLRPC_GSS_VERSION; + ghdr->gh_sp = (__u8) ctx->cc_sec->ps_part; + ghdr->gh_flags = 0; + ghdr->gh_proc = gctx->gc_proc; + ghdr->gh_svc = SPTLRPC_SVC_PRIV; + ghdr->gh_handle.len = gctx->gc_handle.len; + memcpy(ghdr->gh_handle.data, gctx->gc_handle.data, gctx->gc_handle.len); + if (req->rq_pack_bulk) + ghdr->gh_flags |= LUSTRE_GSS_PACK_BULK; + if (req->rq_pack_udesc) + ghdr->gh_flags |= LUSTRE_GSS_PACK_USER; + +redo: + ghdr->gh_seq = atomic_inc_return(&gctx->gc_seq); + + /* buffer objects */ + hdrobj.len = PTLRPC_GSS_HEADER_SIZE; + hdrobj.data = (__u8 *) ghdr; + msgobj.len = req->rq_clrdata_len; + msgobj.data = (__u8 *) req->rq_clrbuf; + token.len = lustre_msg_buflen(req->rq_reqbuf, 1); + token.data = lustre_msg_buf(req->rq_reqbuf, 1, 0); + + major 
= lgss_wrap(gctx->gc_mechctx, &hdrobj, &msgobj, + req->rq_clrbuf_len, &token); + if (major != GSS_S_COMPLETE) { + CERROR("priv: wrap message error: %08x\n", major); + GOTO(err_free, rc = -EPERM); + } + LASSERT(token.len <= buflens[1]); + + /* see explain in gss_cli_ctx_sign() */ + if (unlikely(atomic_read(&gctx->gc_seq) - ghdr->gh_seq > + GSS_SEQ_REPACK_THRESHOLD)) { + int behind = atomic_read(&gctx->gc_seq) - ghdr->gh_seq; + + gss_stat_oos_record_cli(behind); + CWARN("req %p: %u behind, retry sealing\n", req, behind); + + ghdr->gh_seq = atomic_inc_return(&gctx->gc_seq); + goto redo; + } + + /* now set the final wire data length */ + req->rq_reqdata_len = lustre_shrink_msg(req->rq_reqbuf, 1, token.len,0); + RETURN(0); + +err_free: + if (!req->rq_pool) { + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = NULL; + req->rq_reqbuf_len = 0; + } + RETURN(rc); +} + +int gss_cli_ctx_unseal(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req) +{ + struct gss_cli_ctx *gctx; + struct gss_header *ghdr; + struct lustre_msg *msg = req->rq_repdata; + int msglen, pack_bulk, swabbed, rc; + __u32 major; + ENTRY; + + LASSERT(req->rq_cli_ctx == ctx); + LASSERT(req->rq_ctx_init == 0); + LASSERT(msg); + + gctx = container_of(ctx, struct gss_cli_ctx, gc_base); + swabbed = req_capsule_rep_need_swab(&req->rq_pill); + + ghdr = gss_swab_header(msg, 0, swabbed); + if (ghdr == NULL) { + CERROR("can't decode gss header\n"); + RETURN(-EPROTO); + } + + /* sanity checks */ + if (ghdr->gh_version != PTLRPC_GSS_VERSION) { + CERROR("gss version %u mismatch, expect %u\n", + ghdr->gh_version, PTLRPC_GSS_VERSION); + RETURN(-EPROTO); + } + + switch (ghdr->gh_proc) { + case PTLRPC_GSS_PROC_DATA: + pack_bulk = ghdr->gh_flags & LUSTRE_GSS_PACK_BULK; + + if (!req->rq_early && !equi(req->rq_pack_bulk == 1, pack_bulk)){ + CERROR("%s bulk flag in reply\n", + req->rq_pack_bulk ? "missing" : "unexpected"); + RETURN(-EPROTO); + } + + if (swabbed) + gss_header_swabber(ghdr); + + /* use rq_repdata_len as buffer size, which assume unseal + * doesn't need extra memory space. 
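+	 * note gss_unseal_msg() LASSERTs that the unwrapped text fits,
+	 * so an underestimate would be caught there.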
for precise control, we'd + * better calculate out actual buffer size as + * (repbuf_len - offset - repdata_len) */ + major = gss_unseal_msg(gctx->gc_mechctx, msg, + &msglen, req->rq_repdata_len); + if (major != GSS_S_COMPLETE) { + CERROR("failed to unwrap reply: %x\n", major); + rc = -EPERM; + break; + } + + swabbed = __lustre_unpack_msg(msg, msglen); + if (swabbed < 0) { + CERROR("Failed to unpack after decryption\n"); + RETURN(-EPROTO); + } + + if (msg->lm_bufcount < 1) { + CERROR("Invalid reply buffer: empty\n"); + RETURN(-EPROTO); + } + + if (pack_bulk) { + if (msg->lm_bufcount < 2) { + CERROR("bufcount %u: missing bulk sec desc\n", + msg->lm_bufcount); + RETURN(-EPROTO); + } + + /* bulk checksum is the last segment */ + if (bulk_sec_desc_unpack(msg, msg->lm_bufcount - 1, + swabbed)) + RETURN(-EPROTO); + } + + req->rq_repmsg = lustre_msg_buf(msg, 0, 0); + req->rq_replen = msg->lm_buflens[0]; + + rc = 0; + break; + case PTLRPC_GSS_PROC_ERR: + if (req->rq_early) { + CERROR("server return error with early reply\n"); + rc = -EPROTO; + } else { + rc = gss_cli_ctx_handle_err_notify(ctx, req, ghdr); + } + break; + default: + CERROR("unexpected proc %d\n", ghdr->gh_proc); + rc = -EPERM; + } + + RETURN(rc); +} + +/********************************************* + * reverse context installation * + *********************************************/ + +static inline +int gss_install_rvs_svc_ctx(struct obd_import *imp, + struct gss_sec *gsec, + struct gss_cli_ctx *gctx) +{ + return gss_svc_upcall_install_rvs_ctx(imp, gsec, gctx); +} + +/********************************************* + * GSS security APIs * + *********************************************/ +int gss_sec_create_common(struct gss_sec *gsec, + struct ptlrpc_sec_policy *policy, + struct obd_import *imp, + struct ptlrpc_svc_ctx *svcctx, + struct sptlrpc_flavor *sf) +{ + struct ptlrpc_sec *sec; + + LASSERT(imp); + LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_GSS); + + gsec->gs_mech = lgss_subflavor_to_mech( + SPTLRPC_FLVR_BASE_SUB(sf->sf_rpc)); + if (!gsec->gs_mech) { + CERROR("gss backend 0x%x not found\n", + SPTLRPC_FLVR_BASE_SUB(sf->sf_rpc)); + return -EOPNOTSUPP; + } + + spin_lock_init(&gsec->gs_lock); + gsec->gs_rvs_hdl = 0ULL; + + /* initialize upper ptlrpc_sec */ + sec = &gsec->gs_base; + sec->ps_policy = policy; + atomic_set(&sec->ps_refcount, 0); + atomic_set(&sec->ps_nctx, 0); + sec->ps_id = sptlrpc_get_next_secid(); + sec->ps_flvr = *sf; + sec->ps_import = class_import_get(imp); + spin_lock_init(&sec->ps_lock); + INIT_LIST_HEAD(&sec->ps_gc_list); + sec->ps_sepol_mtime = ktime_set(0, 0); + sec->ps_sepol_checknext = ktime_set(0, 0); + sec->ps_sepol[0] = '\0'; + + if (!svcctx) { + sec->ps_gc_interval = GSS_GC_INTERVAL; + } else { + LASSERT(sec_is_reverse(sec)); + + /* never do gc on reverse sec */ + sec->ps_gc_interval = 0; + } + + if (SPTLRPC_FLVR_BULK_SVC(sec->ps_flvr.sf_rpc) == SPTLRPC_BULK_SVC_PRIV) + sptlrpc_enc_pool_add_user(); + + CDEBUG(D_SEC, "create %s%s@%p\n", (svcctx ? 
"reverse " : ""), + policy->sp_name, gsec); + return 0; +} + +void gss_sec_destroy_common(struct gss_sec *gsec) +{ + struct ptlrpc_sec *sec = &gsec->gs_base; + ENTRY; + + LASSERT(sec->ps_import); + LASSERT(atomic_read(&sec->ps_refcount) == 0); + LASSERT(atomic_read(&sec->ps_nctx) == 0); + + if (gsec->gs_mech) { + lgss_mech_put(gsec->gs_mech); + gsec->gs_mech = NULL; + } + + class_import_put(sec->ps_import); + + if (SPTLRPC_FLVR_BULK_SVC(sec->ps_flvr.sf_rpc) == SPTLRPC_BULK_SVC_PRIV) + sptlrpc_enc_pool_del_user(); + + EXIT; +} + +void gss_sec_kill(struct ptlrpc_sec *sec) +{ + sec->ps_dying = 1; +} + +int gss_cli_ctx_init_common(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_ctx_ops *ctxops, + struct vfs_cred *vcred) +{ + struct gss_cli_ctx *gctx = ctx2gctx(ctx); + + gctx->gc_win = 0; + atomic_set(&gctx->gc_seq, 0); + + INIT_HLIST_NODE(&ctx->cc_cache); + atomic_set(&ctx->cc_refcount, 0); + ctx->cc_sec = sec; + ctx->cc_ops = ctxops; + ctx->cc_expire = 0; + ctx->cc_flags = PTLRPC_CTX_NEW; + ctx->cc_vcred = *vcred; + spin_lock_init(&ctx->cc_lock); + INIT_LIST_HEAD(&ctx->cc_req_list); + INIT_LIST_HEAD(&ctx->cc_gc_chain); + + /* take a ref on belonging sec, balanced in ctx destroying */ + atomic_inc(&sec->ps_refcount); + /* statistic only */ + atomic_inc(&sec->ps_nctx); + + CDEBUG(D_SEC, "%s@%p: create ctx %p(%u->%s)\n", + sec->ps_policy->sp_name, ctx->cc_sec, + ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec)); + return 0; +} + +/* + * return value: + * 1: the context has been taken care of by someone else + * 0: proceed to really destroy the context locally + */ +int gss_cli_ctx_fini_common(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx) +{ + struct gss_cli_ctx *gctx = ctx2gctx(ctx); + + LASSERT(atomic_read(&sec->ps_nctx) > 0); + LASSERT(atomic_read(&ctx->cc_refcount) == 0); + LASSERT(ctx->cc_sec == sec); + + /* + * remove UPTODATE flag of reverse ctx thus we won't send fini rpc, + * this is to avoid potential problems of client side reverse svc ctx + * be mis-destroyed in various recovery senarios. anyway client can + * manage its reverse ctx well by associating it with its buddy ctx. + */ + if (sec_is_reverse(sec)) + ctx->cc_flags &= ~PTLRPC_CTX_UPTODATE; + + if (gctx->gc_mechctx) { + /* the final context fini rpc will use this ctx too, and it's + * asynchronous which finished by request_out_callback(). so + * we add refcount, whoever drop finally drop the refcount to + * 0 should responsible for the rest of destroy. */ + atomic_inc(&ctx->cc_refcount); + + gss_do_ctx_fini_rpc(gctx); + gss_cli_ctx_finalize(gctx); + + if (!atomic_dec_and_test(&ctx->cc_refcount)) + return 1; + } + + if (sec_is_reverse(sec)) + CWARN("reverse sec %p: destroy ctx %p\n", + ctx->cc_sec, ctx); + else + CWARN("%s@%p: destroy ctx %p(%u->%s)\n", + sec->ps_policy->sp_name, ctx->cc_sec, + ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec)); + + return 0; +} + +static +int gss_alloc_reqbuf_intg(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int svc, int msgsize) +{ + int bufsize, txtsize; + int bufcnt = 2; + __u32 buflens[5]; + ENTRY; + + /* + * on-wire data layout: + * - gss header + * - lustre message + * - user descriptor (optional) + * - bulk sec descriptor (optional) + * - signature (optional) + * - svc == NULL: NULL + * - svc == AUTH: signature of gss header + * - svc == INTG: signature of all above + * + * if this is context negotiation, reserver fixed space + * at the last (signature) segment regardless of svc mode. 
+ */ + + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + txtsize = buflens[0]; + + buflens[1] = msgsize; + if (svc == SPTLRPC_SVC_INTG) + txtsize += buflens[1]; + + if (req->rq_pack_udesc) { + buflens[bufcnt] = sptlrpc_current_user_desc_size(); + if (svc == SPTLRPC_SVC_INTG) + txtsize += buflens[bufcnt]; + bufcnt++; + } + + if (req->rq_pack_bulk) { + buflens[bufcnt] = gss_cli_bulk_payload(req->rq_cli_ctx, + &req->rq_flvr, + 0, req->rq_bulk_read); + if (svc == SPTLRPC_SVC_INTG) + txtsize += buflens[bufcnt]; + bufcnt++; + } + + if (req->rq_ctx_init) + buflens[bufcnt++] = GSS_CTX_INIT_MAX_LEN; + else if (svc != SPTLRPC_SVC_NULL) + buflens[bufcnt++] = gss_cli_payload(req->rq_cli_ctx, txtsize,0); + + bufsize = lustre_msg_size_v2(bufcnt, buflens); + + if (!req->rq_reqbuf) { + bufsize = size_roundup_power2(bufsize); + + OBD_ALLOC_LARGE(req->rq_reqbuf, bufsize); + if (!req->rq_reqbuf) + RETURN(-ENOMEM); + + req->rq_reqbuf_len = bufsize; + } else { + LASSERT(req->rq_pool); + LASSERT(req->rq_reqbuf_len >= bufsize); + memset(req->rq_reqbuf, 0, bufsize); + } + + lustre_init_msg_v2(req->rq_reqbuf, bufcnt, buflens, NULL); + req->rq_reqbuf->lm_secflvr = req->rq_flvr.sf_rpc; + + req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, 1, msgsize); + LASSERT(req->rq_reqmsg); + + /* pack user desc here, later we might leave current user's process */ + if (req->rq_pack_udesc) + sptlrpc_pack_user_desc(req->rq_reqbuf, 2); + + RETURN(0); +} + +static +int gss_alloc_reqbuf_priv(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + __u32 ibuflens[3], wbuflens[2]; + int ibufcnt; + int clearsize, wiresize; + ENTRY; + + LASSERT(req->rq_clrbuf == NULL); + LASSERT(req->rq_clrbuf_len == 0); + + /* Inner (clear) buffers + * - lustre message + * - user descriptor (optional) + * - bulk checksum (optional) + */ + ibufcnt = 1; + ibuflens[0] = msgsize; + + if (req->rq_pack_udesc) + ibuflens[ibufcnt++] = sptlrpc_current_user_desc_size(); + if (req->rq_pack_bulk) + ibuflens[ibufcnt++] = gss_cli_bulk_payload(req->rq_cli_ctx, + &req->rq_flvr, 0, + req->rq_bulk_read); + + clearsize = lustre_msg_size_v2(ibufcnt, ibuflens); + /* to allow append padding during encryption */ + clearsize += GSS_MAX_CIPHER_BLOCK; + + /* Wrapper (wire) buffers + * - gss header + * - cipher text + */ + wbuflens[0] = PTLRPC_GSS_HEADER_SIZE; + wbuflens[1] = gss_cli_payload(req->rq_cli_ctx, clearsize, 1); + wiresize = lustre_msg_size_v2(2, wbuflens); + + if (req->rq_pool) { + /* rq_reqbuf is preallocated */ + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf_len >= wiresize); + + memset(req->rq_reqbuf, 0, req->rq_reqbuf_len); + + /* if the pre-allocated buffer is big enough, we just pack + * both clear buf & request buf in it, to avoid more alloc. 
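+	 * the clear buffer is then carved out right behind the wire
+	 * buffer; gss_free_reqbuf() recognizes this embedded case by
+	 * address range and skips the separate free.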
*/ + if (clearsize + wiresize <= req->rq_reqbuf_len) { + req->rq_clrbuf = + (void *) (((char *) req->rq_reqbuf) + wiresize); + } else { + CWARN("pre-allocated buf size %d is not enough for " + "both clear (%d) and cipher (%d) text, proceed " + "with extra allocation\n", req->rq_reqbuf_len, + clearsize, wiresize); + } + } + + if (!req->rq_clrbuf) { + clearsize = size_roundup_power2(clearsize); + + OBD_ALLOC_LARGE(req->rq_clrbuf, clearsize); + if (!req->rq_clrbuf) + RETURN(-ENOMEM); + } + req->rq_clrbuf_len = clearsize; + + lustre_init_msg_v2(req->rq_clrbuf, ibufcnt, ibuflens, NULL); + req->rq_reqmsg = lustre_msg_buf(req->rq_clrbuf, 0, msgsize); + + if (req->rq_pack_udesc) + sptlrpc_pack_user_desc(req->rq_clrbuf, 1); + + RETURN(0); +} + +/* + * NOTE: any change of request buffer allocation should also consider + * changing enlarge_reqbuf() series functions. + */ +int gss_alloc_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + int svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc); + + LASSERT(!req->rq_pack_bulk || + (req->rq_bulk_read || req->rq_bulk_write)); + + switch (svc) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + return gss_alloc_reqbuf_intg(sec, req, svc, msgsize); + case SPTLRPC_SVC_PRIV: + return gss_alloc_reqbuf_priv(sec, req, msgsize); + default: + LASSERTF(0, "bad rpc flavor %x\n", req->rq_flvr.sf_rpc); + return 0; + } +} + +void gss_free_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + int privacy; + ENTRY; + + LASSERT(!req->rq_pool || req->rq_reqbuf); + privacy = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc) == SPTLRPC_SVC_PRIV; + + if (!req->rq_clrbuf) + goto release_reqbuf; + + /* release clear buffer */ + LASSERT(privacy); + LASSERT(req->rq_clrbuf_len); + + if (req->rq_pool == NULL || + req->rq_clrbuf < req->rq_reqbuf || + (char *) req->rq_clrbuf >= + (char *) req->rq_reqbuf + req->rq_reqbuf_len) + OBD_FREE_LARGE(req->rq_clrbuf, req->rq_clrbuf_len); + + req->rq_clrbuf = NULL; + req->rq_clrbuf_len = 0; + +release_reqbuf: + if (!req->rq_pool && req->rq_reqbuf) { + LASSERT(req->rq_reqbuf_len); + + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = NULL; + req->rq_reqbuf_len = 0; + } + + EXIT; +} + +static int do_alloc_repbuf(struct ptlrpc_request *req, int bufsize) +{ + bufsize = size_roundup_power2(bufsize); + + OBD_ALLOC_LARGE(req->rq_repbuf, bufsize); + if (!req->rq_repbuf) + return -ENOMEM; + + req->rq_repbuf_len = bufsize; + return 0; +} + +static +int gss_alloc_repbuf_intg(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int svc, int msgsize) +{ + int txtsize; + __u32 buflens[4]; + int bufcnt = 2; + int alloc_size; + + /* + * on-wire data layout: + * - gss header + * - lustre message + * - bulk sec descriptor (optional) + * - signature (optional) + * - svc == NULL: NULL + * - svc == AUTH: signature of gss header + * - svc == INTG: signature of all above + * + * if this is context negotiation, reserver fixed space + * at the last (signature) segment regardless of svc mode. 
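+ *
+ * txtsize below accumulates exactly the segments the chosen svc mode
+ * signs (gss header only for AUTH, all preceding segments for INTG);
+ * unlike the request buffer, no user descriptor segment is reserved
+ * here, since replies don't carry one.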
+ */ + + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + txtsize = buflens[0]; + + buflens[1] = msgsize; + if (svc == SPTLRPC_SVC_INTG) + txtsize += buflens[1]; + + if (req->rq_pack_bulk) { + buflens[bufcnt] = gss_cli_bulk_payload(req->rq_cli_ctx, + &req->rq_flvr, + 1, req->rq_bulk_read); + if (svc == SPTLRPC_SVC_INTG) + txtsize += buflens[bufcnt]; + bufcnt++; + } + + if (req->rq_ctx_init) + buflens[bufcnt++] = GSS_CTX_INIT_MAX_LEN; + else if (svc != SPTLRPC_SVC_NULL) + buflens[bufcnt++] = gss_cli_payload(req->rq_cli_ctx, txtsize,0); + + alloc_size = lustre_msg_size_v2(bufcnt, buflens); + + /* add space for early reply */ + alloc_size += gss_at_reply_off_integ; + + return do_alloc_repbuf(req, alloc_size); +} + +static +int gss_alloc_repbuf_priv(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + int txtsize; + __u32 buflens[2]; + int bufcnt; + int alloc_size; + + /* inner buffers */ + bufcnt = 1; + buflens[0] = msgsize; + + if (req->rq_pack_bulk) + buflens[bufcnt++] = gss_cli_bulk_payload(req->rq_cli_ctx, + &req->rq_flvr, + 1, req->rq_bulk_read); + txtsize = lustre_msg_size_v2(bufcnt, buflens); + txtsize += GSS_MAX_CIPHER_BLOCK; + + /* wrapper buffers */ + bufcnt = 2; + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = gss_cli_payload(req->rq_cli_ctx, txtsize, 1); + + alloc_size = lustre_msg_size_v2(bufcnt, buflens); + /* add space for early reply */ + alloc_size += gss_at_reply_off_priv; + + return do_alloc_repbuf(req, alloc_size); +} + +int gss_alloc_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + int svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc); + ENTRY; + + LASSERT(!req->rq_pack_bulk || + (req->rq_bulk_read || req->rq_bulk_write)); + + switch (svc) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + return gss_alloc_repbuf_intg(sec, req, svc, msgsize); + case SPTLRPC_SVC_PRIV: + return gss_alloc_repbuf_priv(sec, req, msgsize); + default: + LASSERTF(0, "bad rpc flavor %x\n", req->rq_flvr.sf_rpc); + return 0; + } +} + +void gss_free_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + OBD_FREE_LARGE(req->rq_repbuf, req->rq_repbuf_len); + req->rq_repbuf = NULL; + req->rq_repbuf_len = 0; + req->rq_repdata = NULL; + req->rq_repdata_len = 0; +} + +static int get_enlarged_msgsize(struct lustre_msg *msg, + int segment, int newsize) +{ + int save, newmsg_size; + + LASSERT(newsize >= msg->lm_buflens[segment]); + + save = msg->lm_buflens[segment]; + msg->lm_buflens[segment] = newsize; + newmsg_size = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + msg->lm_buflens[segment] = save; + + return newmsg_size; +} + +static int get_enlarged_msgsize2(struct lustre_msg *msg, + int segment1, int newsize1, + int segment2, int newsize2) +{ + int save1, save2, newmsg_size; + + LASSERT(newsize1 >= msg->lm_buflens[segment1]); + LASSERT(newsize2 >= msg->lm_buflens[segment2]); + + save1 = msg->lm_buflens[segment1]; + save2 = msg->lm_buflens[segment2]; + msg->lm_buflens[segment1] = newsize1; + msg->lm_buflens[segment2] = newsize2; + newmsg_size = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + msg->lm_buflens[segment1] = save1; + msg->lm_buflens[segment2] = save2; + + return newmsg_size; +} + +static +int gss_enlarge_reqbuf_intg(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int svc, + int segment, int newsize) +{ + struct lustre_msg *newbuf; + int txtsize, sigsize = 0, i; + int newmsg_size, newbuf_size; + + /* + * gss header is at seg 0; + * embedded msg is at seg 1; + * signature (if any) is at the 
last seg + */ + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf_len > req->rq_reqlen); + LASSERT(req->rq_reqbuf->lm_bufcount >= 2); + LASSERT(lustre_msg_buf(req->rq_reqbuf, 1, 0) == req->rq_reqmsg); + + /* 1. compute new embedded msg size */ + newmsg_size = get_enlarged_msgsize(req->rq_reqmsg, segment, newsize); + LASSERT(newmsg_size >= req->rq_reqbuf->lm_buflens[1]); + + /* 2. compute new wrapper msg size */ + if (svc == SPTLRPC_SVC_NULL) { + /* no signature, get size directly */ + newbuf_size = get_enlarged_msgsize(req->rq_reqbuf, + 1, newmsg_size); + } else { + txtsize = req->rq_reqbuf->lm_buflens[0]; + + if (svc == SPTLRPC_SVC_INTG) { + for (i = 1; i < req->rq_reqbuf->lm_bufcount; i++) + txtsize += req->rq_reqbuf->lm_buflens[i]; + txtsize += newmsg_size - req->rq_reqbuf->lm_buflens[1]; + } + + sigsize = gss_cli_payload(req->rq_cli_ctx, txtsize, 0); + LASSERT(sigsize >= msg_last_seglen(req->rq_reqbuf)); + + newbuf_size = get_enlarged_msgsize2( + req->rq_reqbuf, + 1, newmsg_size, + msg_last_segidx(req->rq_reqbuf), + sigsize); + } + + /* request from pool should always have enough buffer */ + LASSERT(!req->rq_pool || req->rq_reqbuf_len >= newbuf_size); + + if (req->rq_reqbuf_len < newbuf_size) { + newbuf_size = size_roundup_power2(newbuf_size); + + OBD_ALLOC_LARGE(newbuf, newbuf_size); + if (newbuf == NULL) + RETURN(-ENOMEM); + + /* Must lock this, so that otherwise unprotected change of + * rq_reqmsg is not racing with parallel processing of + * imp_replay_list traversing threads. See LU-3333 + * This is a bandaid at best, we really need to deal with this + * in request enlarging code before unpacking that's already + * there */ + if (req->rq_import) + spin_lock(&req->rq_import->imp_lock); + + memcpy(newbuf, req->rq_reqbuf, req->rq_reqbuf_len); + + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = newbuf; + req->rq_reqbuf_len = newbuf_size; + req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, 1, 0); + + if (req->rq_import) + spin_unlock(&req->rq_import->imp_lock); + } + + /* do enlargement, from wrapper to embedded, from end to begin */ + if (svc != SPTLRPC_SVC_NULL) + _sptlrpc_enlarge_msg_inplace(req->rq_reqbuf, + msg_last_segidx(req->rq_reqbuf), + sigsize); + + _sptlrpc_enlarge_msg_inplace(req->rq_reqbuf, 1, newmsg_size); + _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize); + + req->rq_reqlen = newmsg_size; + RETURN(0); +} + +static +int gss_enlarge_reqbuf_priv(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int segment, int newsize) +{ + struct lustre_msg *newclrbuf; + int newmsg_size, newclrbuf_size, newcipbuf_size; + __u32 buflens[3]; + + /* + * embedded msg is at seg 0 of clear buffer; + * cipher text is at seg 2 of cipher buffer; + */ + LASSERT(req->rq_pool || + (req->rq_reqbuf == NULL && req->rq_reqbuf_len == 0)); + LASSERT(req->rq_reqbuf == NULL || + (req->rq_pool && req->rq_reqbuf->lm_bufcount == 3)); + LASSERT(req->rq_clrbuf); + LASSERT(req->rq_clrbuf_len > req->rq_reqlen); + LASSERT(lustre_msg_buf(req->rq_clrbuf, 0, 0) == req->rq_reqmsg); + + /* compute new embedded msg size */ + newmsg_size = get_enlarged_msgsize(req->rq_reqmsg, segment, newsize); + + /* compute new clear buffer size */ + newclrbuf_size = get_enlarged_msgsize(req->rq_clrbuf, 0, newmsg_size); + newclrbuf_size += GSS_MAX_CIPHER_BLOCK; + + /* compute new cipher buffer size */ + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = gss_cli_payload(req->rq_cli_ctx, buflens[0], 0); + buflens[2] = gss_cli_payload(req->rq_cli_ctx, newclrbuf_size, 1); + newcipbuf_size = 
lustre_msg_size_v2(3, buflens); + + /* handle the case that we put both clear buf and cipher buf into + * pre-allocated single buffer. */ + if (unlikely(req->rq_pool) && + req->rq_clrbuf >= req->rq_reqbuf && + (char *) req->rq_clrbuf < + (char *) req->rq_reqbuf + req->rq_reqbuf_len) { + /* it couldn't be better we still fit into the + * pre-allocated buffer. */ + if (newclrbuf_size + newcipbuf_size <= req->rq_reqbuf_len) { + void *src, *dst; + + if (req->rq_import) + spin_lock(&req->rq_import->imp_lock); + /* move clear text backward. */ + src = req->rq_clrbuf; + dst = (char *) req->rq_reqbuf + newcipbuf_size; + + memmove(dst, src, req->rq_clrbuf_len); + + req->rq_clrbuf = (struct lustre_msg *) dst; + req->rq_clrbuf_len = newclrbuf_size; + req->rq_reqmsg = lustre_msg_buf(req->rq_clrbuf, 0, 0); + + if (req->rq_import) + spin_unlock(&req->rq_import->imp_lock); + } else { + /* sadly we have to split out the clear buffer */ + LASSERT(req->rq_reqbuf_len >= newcipbuf_size); + LASSERT(req->rq_clrbuf_len < newclrbuf_size); + } + } + + if (req->rq_clrbuf_len < newclrbuf_size) { + newclrbuf_size = size_roundup_power2(newclrbuf_size); + + OBD_ALLOC_LARGE(newclrbuf, newclrbuf_size); + if (newclrbuf == NULL) + RETURN(-ENOMEM); + + /* Must lock this, so that otherwise unprotected change of + * rq_reqmsg is not racing with parallel processing of + * imp_replay_list traversing threads. See LU-3333 + * This is a bandaid at best, we really need to deal with this + * in request enlarging code before unpacking that's already + * there */ + if (req->rq_import) + spin_lock(&req->rq_import->imp_lock); + + memcpy(newclrbuf, req->rq_clrbuf, req->rq_clrbuf_len); + + if (req->rq_reqbuf == NULL || + req->rq_clrbuf < req->rq_reqbuf || + (char *) req->rq_clrbuf >= + (char *) req->rq_reqbuf + req->rq_reqbuf_len) { + OBD_FREE_LARGE(req->rq_clrbuf, req->rq_clrbuf_len); + } + + req->rq_clrbuf = newclrbuf; + req->rq_clrbuf_len = newclrbuf_size; + req->rq_reqmsg = lustre_msg_buf(req->rq_clrbuf, 0, 0); + + if (req->rq_import) + spin_unlock(&req->rq_import->imp_lock); + } + + _sptlrpc_enlarge_msg_inplace(req->rq_clrbuf, 0, newmsg_size); + _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize); + req->rq_reqlen = newmsg_size; + + RETURN(0); +} + +int gss_enlarge_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int segment, int newsize) +{ + int svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc); + + LASSERT(!req->rq_ctx_init && !req->rq_ctx_fini); + + switch (svc) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + return gss_enlarge_reqbuf_intg(sec, req, svc, segment, newsize); + case SPTLRPC_SVC_PRIV: + return gss_enlarge_reqbuf_priv(sec, req, segment, newsize); + default: + LASSERTF(0, "bad rpc flavor %x\n", req->rq_flvr.sf_rpc); + return 0; + } +} + +int gss_sec_install_rctx(struct obd_import *imp, + struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx) +{ + struct gss_sec *gsec; + struct gss_cli_ctx *gctx; + int rc; + + gsec = container_of(sec, struct gss_sec, gs_base); + gctx = container_of(ctx, struct gss_cli_ctx, gc_base); + + rc = gss_install_rvs_svc_ctx(imp, gsec, gctx); + return rc; +} + +/******************************************** + * server side API * + ********************************************/ + +static inline +int gss_svc_reqctx_is_special(struct gss_svc_reqctx *grctx) +{ + LASSERT(grctx); + return (grctx->src_init || grctx->src_init_continue || + grctx->src_err_notify); +} + +static +void gss_svc_reqctx_free(struct gss_svc_reqctx *grctx) +{ + if (grctx->src_ctx) + 
gss_svc_upcall_put_ctx(grctx->src_ctx); + + sptlrpc_policy_put(grctx->src_base.sc_policy); + OBD_FREE_PTR(grctx); +} + +static inline +void gss_svc_reqctx_addref(struct gss_svc_reqctx *grctx) +{ + LASSERT(atomic_read(&grctx->src_base.sc_refcount) > 0); + atomic_inc(&grctx->src_base.sc_refcount); +} + +static inline +void gss_svc_reqctx_decref(struct gss_svc_reqctx *grctx) +{ + LASSERT(atomic_read(&grctx->src_base.sc_refcount) > 0); + + if (atomic_dec_and_test(&grctx->src_base.sc_refcount)) + gss_svc_reqctx_free(grctx); +} + +static +int gss_svc_sign(struct ptlrpc_request *req, + struct ptlrpc_reply_state *rs, + struct gss_svc_reqctx *grctx, + __u32 svc) +{ + __u32 flags = 0; + int rc; + ENTRY; + + LASSERT(rs->rs_msg == lustre_msg_buf(rs->rs_repbuf, 1, 0)); + + /* embedded lustre_msg might have been shrunk */ + if (req->rq_replen != rs->rs_repbuf->lm_buflens[1]) + lustre_shrink_msg(rs->rs_repbuf, 1, req->rq_replen, 1); + + if (req->rq_pack_bulk) + flags |= LUSTRE_GSS_PACK_BULK; + + rc = gss_sign_msg(rs->rs_repbuf, grctx->src_ctx->gsc_mechctx, + LUSTRE_SP_ANY, flags, PTLRPC_GSS_PROC_DATA, + grctx->src_wirectx.gw_seq, svc, NULL); + if (rc < 0) + RETURN(rc); + + rs->rs_repdata_len = rc; + + if (likely(req->rq_packed_final)) { + if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) + req->rq_reply_off = gss_at_reply_off_integ; + else + req->rq_reply_off = 0; + } else { + if (svc == SPTLRPC_SVC_NULL) + rs->rs_repbuf->lm_cksum = crc32_le(!(__u32) 0, + lustre_msg_buf(rs->rs_repbuf, 1, 0), + lustre_msg_buflen(rs->rs_repbuf, 1)); + req->rq_reply_off = 0; + } + + RETURN(0); +} + +int gss_pack_err_notify(struct ptlrpc_request *req, __u32 major, __u32 minor) +{ + struct gss_svc_reqctx *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + struct ptlrpc_reply_state *rs; + struct gss_err_header *ghdr; + int replen = sizeof(struct ptlrpc_body); + int rc; + ENTRY; + + //if (OBD_FAIL_CHECK_ORSET(OBD_FAIL_SVCGSS_ERR_NOTIFY, OBD_FAIL_ONCE)) + // RETURN(-EINVAL); + + grctx->src_err_notify = 1; + grctx->src_reserve_len = 0; + + rc = lustre_pack_reply_v2(req, 1, &replen, NULL, 0); + if (rc) { + CERROR("could not pack reply, err %d\n", rc); + RETURN(rc); + } + + /* gss hdr */ + rs = req->rq_reply_state; + LASSERT(rs->rs_repbuf->lm_buflens[1] >= sizeof(*ghdr)); + ghdr = lustre_msg_buf(rs->rs_repbuf, 0, 0); + ghdr->gh_version = PTLRPC_GSS_VERSION; + ghdr->gh_flags = 0; + ghdr->gh_proc = PTLRPC_GSS_PROC_ERR; + ghdr->gh_major = major; + ghdr->gh_minor = minor; + ghdr->gh_handle.len = 0; /* fake context handle */ + + rs->rs_repdata_len = lustre_msg_size_v2(rs->rs_repbuf->lm_bufcount, + rs->rs_repbuf->lm_buflens); + + CDEBUG(D_SEC, "prepare gss error notify(0x%x/0x%x) to %s\n", + major, minor, libcfs_nid2str(req->rq_peer.nid)); + RETURN(0); +} + +static +int gss_svc_handle_init(struct ptlrpc_request *req, + struct gss_wire_ctx *gw) +{ + struct gss_svc_reqctx *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + struct lustre_msg *reqbuf = req->rq_reqbuf; + struct obd_uuid *uuid; + struct obd_device *target; + rawobj_t uuid_obj, rvs_hdl, in_token; + __u32 lustre_svc; + __u32 *secdata, seclen; + int swabbed, rc; + ENTRY; + + CDEBUG(D_SEC, "processing gss init(%d) request from %s\n", gw->gw_proc, + libcfs_nid2str(req->rq_peer.nid)); + + req->rq_ctx_init = 1; + + if (gw->gw_flags & LUSTRE_GSS_PACK_BULK) { + CERROR("unexpected bulk flag\n"); + RETURN(SECSVC_DROP); + } + + if (gw->gw_proc == PTLRPC_GSS_PROC_INIT && gw->gw_handle.len != 0) { + CERROR("proc %u: invalid handle length %u\n", + gw->gw_proc, gw->gw_handle.len); + 
RETURN(SECSVC_DROP); + } + + if (reqbuf->lm_bufcount < 3 || reqbuf->lm_bufcount > 4){ + CERROR("Invalid bufcount %d\n", reqbuf->lm_bufcount); + RETURN(SECSVC_DROP); + } + + swabbed = req_capsule_req_need_swab(&req->rq_pill); + + /* ctx initiate payload is in last segment */ + secdata = lustre_msg_buf(reqbuf, reqbuf->lm_bufcount - 1, 0); + seclen = reqbuf->lm_buflens[reqbuf->lm_bufcount - 1]; + + if (seclen < 4 + 4) { + CERROR("sec size %d too small\n", seclen); + RETURN(SECSVC_DROP); + } + + /* lustre svc type */ + lustre_svc = le32_to_cpu(*secdata++); + seclen -= 4; + + /* extract target uuid, note this code is somewhat fragile + * because touched internal structure of obd_uuid */ + if (rawobj_extract(&uuid_obj, &secdata, &seclen)) { + CERROR("failed to extract target uuid\n"); + RETURN(SECSVC_DROP); + } + uuid_obj.data[uuid_obj.len - 1] = '\0'; + + uuid = (struct obd_uuid *) uuid_obj.data; + target = class_uuid2obd(uuid); + if (!target || target->obd_stopping || !target->obd_set_up) { + CERROR("target '%s' is not available for context init (%s)\n", + uuid->uuid, target == NULL ? "no target" : + (target->obd_stopping ? "stopping" : "not set up")); + RETURN(SECSVC_DROP); + } + + /* extract reverse handle */ + if (rawobj_extract(&rvs_hdl, &secdata, &seclen)) { + CERROR("failed extract reverse handle\n"); + RETURN(SECSVC_DROP); + } + + /* extract token */ + if (rawobj_extract(&in_token, &secdata, &seclen)) { + CERROR("can't extract token\n"); + RETURN(SECSVC_DROP); + } + + rc = gss_svc_upcall_handle_init(req, grctx, gw, target, lustre_svc, + &rvs_hdl, &in_token); + if (rc != SECSVC_OK) + RETURN(rc); + + if (grctx->src_ctx->gsc_usr_mds || grctx->src_ctx->gsc_usr_oss || + grctx->src_ctx->gsc_usr_root) + CWARN("create svc ctx %p: user from %s authenticated as %s\n", + grctx->src_ctx, libcfs_nid2str(req->rq_peer.nid), + grctx->src_ctx->gsc_usr_root ? "root" : + (grctx->src_ctx->gsc_usr_mds ? "mds" : + (grctx->src_ctx->gsc_usr_oss ? "oss" : "null"))); + else + CWARN("create svc ctx %p: accept user %u from %s\n", + grctx->src_ctx, grctx->src_ctx->gsc_uid, + libcfs_nid2str(req->rq_peer.nid)); + + if (gw->gw_flags & LUSTRE_GSS_PACK_USER) { + if (reqbuf->lm_bufcount < 4) { + CERROR("missing user descriptor\n"); + RETURN(SECSVC_DROP); + } + if (sptlrpc_unpack_user_desc(reqbuf, 2, swabbed)) { + CERROR("Mal-formed user descriptor\n"); + RETURN(SECSVC_DROP); + } + + req->rq_pack_udesc = 1; + req->rq_user_desc = lustre_msg_buf(reqbuf, 2, 0); + } + + req->rq_reqmsg = lustre_msg_buf(reqbuf, 1, 0); + req->rq_reqlen = lustre_msg_buflen(reqbuf, 1); + + RETURN(rc); +} + +/* + * last segment must be the gss signature. 
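+ *
+ * An illustrative wire layout for an integrity-protected request (a
+ * sketch only: which optional segments appear depends on the request
+ * flags, so indices past seg 1 are not fixed):
+ *
+ *	seg 0:	gss header
+ *	seg 1:	embedded lustre_msg (the request proper)
+ *	seg 2:	user descriptor, if LUSTRE_GSS_PACK_USER is set
+ *	next:	bulk security descriptor, if LUSTRE_GSS_PACK_BULK is set
+ *	last:	gss signature over the text segments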
+ */ +static +int gss_svc_verify_request(struct ptlrpc_request *req, + struct gss_svc_reqctx *grctx, + struct gss_wire_ctx *gw, + __u32 *major) +{ + struct gss_svc_ctx *gctx = grctx->src_ctx; + struct lustre_msg *msg = req->rq_reqbuf; + int offset = 2; + int swabbed; + ENTRY; + + *major = GSS_S_COMPLETE; + + if (msg->lm_bufcount < 2) { + CERROR("Too few segments (%u) in request\n", msg->lm_bufcount); + RETURN(-EINVAL); + } + + if (gw->gw_svc == SPTLRPC_SVC_NULL) + goto verified; + + if (gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 0)) { + CERROR("phase 0: discard replayed req: seq %u\n", gw->gw_seq); + *major = GSS_S_DUPLICATE_TOKEN; + RETURN(-EACCES); + } + + *major = gss_verify_msg(msg, gctx->gsc_mechctx, gw->gw_svc); + if (*major != GSS_S_COMPLETE) { + CERROR("failed to verify request: %x\n", *major); + RETURN(-EACCES); + } + + if (gctx->gsc_reverse == 0 && + gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 1)) { + CERROR("phase 1+: discard replayed req: seq %u\n", gw->gw_seq); + *major = GSS_S_DUPLICATE_TOKEN; + RETURN(-EACCES); + } + +verified: + swabbed = req_capsule_req_need_swab(&req->rq_pill); + + /* user descriptor */ + if (gw->gw_flags & LUSTRE_GSS_PACK_USER) { + if (msg->lm_bufcount < (offset + 1)) { + CERROR("no user desc included\n"); + RETURN(-EINVAL); + } + + if (sptlrpc_unpack_user_desc(msg, offset, swabbed)) { + CERROR("Mal-formed user descriptor\n"); + RETURN(-EINVAL); + } + + req->rq_pack_udesc = 1; + req->rq_user_desc = lustre_msg_buf(msg, offset, 0); + offset++; + } + + /* check bulk_sec_desc data */ + if (gw->gw_flags & LUSTRE_GSS_PACK_BULK) { + if (msg->lm_bufcount < (offset + 1)) { + CERROR("missing bulk sec descriptor\n"); + RETURN(-EINVAL); + } + + if (bulk_sec_desc_unpack(msg, offset, swabbed)) + RETURN(-EINVAL); + + req->rq_pack_bulk = 1; + grctx->src_reqbsd = lustre_msg_buf(msg, offset, 0); + grctx->src_reqbsd_size = lustre_msg_buflen(msg, offset); + } + + req->rq_reqmsg = lustre_msg_buf(msg, 1, 0); + req->rq_reqlen = msg->lm_buflens[1]; + RETURN(0); +} + +static +int gss_svc_unseal_request(struct ptlrpc_request *req, + struct gss_svc_reqctx *grctx, + struct gss_wire_ctx *gw, + __u32 *major) +{ + struct gss_svc_ctx *gctx = grctx->src_ctx; + struct lustre_msg *msg = req->rq_reqbuf; + int swabbed, msglen, offset = 1; + ENTRY; + + if (gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 0)) { + CERROR("phase 0: discard replayed req: seq %u\n", gw->gw_seq); + *major = GSS_S_DUPLICATE_TOKEN; + RETURN(-EACCES); + } + + *major = gss_unseal_msg(gctx->gsc_mechctx, msg, + &msglen, req->rq_reqdata_len); + if (*major != GSS_S_COMPLETE) { + CERROR("failed to unwrap request: %x\n", *major); + RETURN(-EACCES); + } + + if (gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 1)) { + CERROR("phase 1+: discard replayed req: seq %u\n", gw->gw_seq); + *major = GSS_S_DUPLICATE_TOKEN; + RETURN(-EACCES); + } + + swabbed = __lustre_unpack_msg(msg, msglen); + if (swabbed < 0) { + CERROR("Failed to unpack after decryption\n"); + RETURN(-EINVAL); + } + req->rq_reqdata_len = msglen; + + if (msg->lm_bufcount < 1) { + CERROR("Invalid buffer: is empty\n"); + RETURN(-EINVAL); + } + + if (gw->gw_flags & LUSTRE_GSS_PACK_USER) { + if (msg->lm_bufcount < offset + 1) { + CERROR("no user descriptor included\n"); + RETURN(-EINVAL); + } + + if (sptlrpc_unpack_user_desc(msg, offset, swabbed)) { + CERROR("Mal-formed user descriptor\n"); + RETURN(-EINVAL); + } + + req->rq_pack_udesc = 1; + req->rq_user_desc = lustre_msg_buf(msg, offset, 0); + offset++; + } + + if (gw->gw_flags & LUSTRE_GSS_PACK_BULK) { 
+ if (msg->lm_bufcount < offset + 1) { + CERROR("no bulk checksum included\n"); + RETURN(-EINVAL); + } + + if (bulk_sec_desc_unpack(msg, offset, swabbed)) + RETURN(-EINVAL); + + req->rq_pack_bulk = 1; + grctx->src_reqbsd = lustre_msg_buf(msg, offset, 0); + grctx->src_reqbsd_size = lustre_msg_buflen(msg, offset); + } + + req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, 0, 0); + req->rq_reqlen = req->rq_reqbuf->lm_buflens[0]; + RETURN(0); +} + +static +int gss_svc_handle_data(struct ptlrpc_request *req, + struct gss_wire_ctx *gw) +{ + struct gss_svc_reqctx *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + __u32 major = 0; + int rc = 0; + ENTRY; + + grctx->src_ctx = gss_svc_upcall_get_ctx(req, gw); + if (!grctx->src_ctx) { + major = GSS_S_NO_CONTEXT; + goto error; + } + + switch (gw->gw_svc) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + rc = gss_svc_verify_request(req, grctx, gw, &major); + break; + case SPTLRPC_SVC_PRIV: + rc = gss_svc_unseal_request(req, grctx, gw, &major); + break; + default: + CERROR("unsupported gss service %d\n", gw->gw_svc); + rc = -EINVAL; + } + + if (rc == 0) + RETURN(SECSVC_OK); + + CERROR("svc %u failed: major 0x%08x: req xid %llu ctx %p idx " + "%#llx(%u->%s)\n", gw->gw_svc, major, req->rq_xid, + grctx->src_ctx, gss_handle_to_u64(&gw->gw_handle), + grctx->src_ctx->gsc_uid, libcfs_nid2str(req->rq_peer.nid)); +error: + /* we only notify client in case of NO_CONTEXT/BAD_SIG, which + * might happen after server reboot, to allow recovery. */ + if ((major == GSS_S_NO_CONTEXT || major == GSS_S_BAD_SIG) && + gss_pack_err_notify(req, major, 0) == 0) + RETURN(SECSVC_COMPLETE); + + RETURN(SECSVC_DROP); +} + +static +int gss_svc_handle_destroy(struct ptlrpc_request *req, + struct gss_wire_ctx *gw) +{ + struct gss_svc_reqctx *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + __u32 major; + ENTRY; + + req->rq_ctx_fini = 1; + req->rq_no_reply = 1; + + grctx->src_ctx = gss_svc_upcall_get_ctx(req, gw); + if (!grctx->src_ctx) { + CDEBUG(D_SEC, "invalid gss context handle for destroy.\n"); + RETURN(SECSVC_DROP); + } + + if (gw->gw_svc != SPTLRPC_SVC_INTG) { + CERROR("svc %u is not supported in destroy.\n", gw->gw_svc); + RETURN(SECSVC_DROP); + } + + if (gss_svc_verify_request(req, grctx, gw, &major)) + RETURN(SECSVC_DROP); + + CWARN("destroy svc ctx %p idx %#llx (%u->%s)\n", + grctx->src_ctx, gss_handle_to_u64(&gw->gw_handle), + grctx->src_ctx->gsc_uid, libcfs_nid2str(req->rq_peer.nid)); + + gss_svc_upcall_destroy_ctx(grctx->src_ctx); + + if (gw->gw_flags & LUSTRE_GSS_PACK_USER) { + if (req->rq_reqbuf->lm_bufcount < 4) { + CERROR("missing user descriptor, ignore it\n"); + RETURN(SECSVC_OK); + } + if (sptlrpc_unpack_user_desc(req->rq_reqbuf, 2, + req_capsule_req_need_swab(&req->rq_pill))) { + CERROR("Mal-formed user descriptor, ignore it\n"); + RETURN(SECSVC_OK); + } + + req->rq_pack_udesc = 1; + req->rq_user_desc = lustre_msg_buf(req->rq_reqbuf, 2, 0); + } + + RETURN(SECSVC_OK); +} + +int gss_svc_accept(struct ptlrpc_sec_policy *policy, struct ptlrpc_request *req) +{ + struct gss_header *ghdr; + struct gss_svc_reqctx *grctx; + struct gss_wire_ctx *gw; + int swabbed, rc; + ENTRY; + + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_svc_ctx == NULL); + + if (req->rq_reqbuf->lm_bufcount < 2) { + CERROR("buf count only %d\n", req->rq_reqbuf->lm_bufcount); + RETURN(SECSVC_DROP); + } + + swabbed = req_capsule_req_need_swab(&req->rq_pill); + + ghdr = gss_swab_header(req->rq_reqbuf, 0, swabbed); + if (ghdr == NULL) { + CERROR("can't decode gss header\n"); + 
RETURN(SECSVC_DROP); + } + + /* sanity checks */ + if (ghdr->gh_version != PTLRPC_GSS_VERSION) { + CERROR("gss version %u, expect %u\n", ghdr->gh_version, + PTLRPC_GSS_VERSION); + RETURN(SECSVC_DROP); + } + + req->rq_sp_from = ghdr->gh_sp; + + /* alloc grctx data */ + OBD_ALLOC_PTR(grctx); + if (!grctx) + RETURN(SECSVC_DROP); + + grctx->src_base.sc_policy = sptlrpc_policy_get(policy); + atomic_set(&grctx->src_base.sc_refcount, 1); + req->rq_svc_ctx = &grctx->src_base; + gw = &grctx->src_wirectx; + + /* save wire context */ + gw->gw_flags = ghdr->gh_flags; + gw->gw_proc = ghdr->gh_proc; + gw->gw_seq = ghdr->gh_seq; + gw->gw_svc = ghdr->gh_svc; + rawobj_from_netobj(&gw->gw_handle, &ghdr->gh_handle); + + /* keep original wire header which subject to checksum verification */ + if (swabbed) + gss_header_swabber(ghdr); + + switch(ghdr->gh_proc) { + case PTLRPC_GSS_PROC_INIT: + case PTLRPC_GSS_PROC_CONTINUE_INIT: + rc = gss_svc_handle_init(req, gw); + break; + case PTLRPC_GSS_PROC_DATA: + rc = gss_svc_handle_data(req, gw); + break; + case PTLRPC_GSS_PROC_DESTROY: + rc = gss_svc_handle_destroy(req, gw); + break; + default: + CERROR("unknown proc %u\n", gw->gw_proc); + rc = SECSVC_DROP; + break; + } + + switch (rc) { + case SECSVC_OK: + LASSERT (grctx->src_ctx); + + req->rq_auth_gss = 1; + req->rq_auth_usr_mdt = grctx->src_ctx->gsc_usr_mds; + req->rq_auth_usr_ost = grctx->src_ctx->gsc_usr_oss; + req->rq_auth_usr_root = grctx->src_ctx->gsc_usr_root; + req->rq_auth_uid = grctx->src_ctx->gsc_uid; + req->rq_auth_mapped_uid = grctx->src_ctx->gsc_mapped_uid; + break; + case SECSVC_COMPLETE: + break; + case SECSVC_DROP: + gss_svc_reqctx_free(grctx); + req->rq_svc_ctx = NULL; + break; + } + + RETURN(rc); +} + +void gss_svc_invalidate_ctx(struct ptlrpc_svc_ctx *svc_ctx) +{ + struct gss_svc_reqctx *grctx; + ENTRY; + + if (svc_ctx == NULL) { + EXIT; + return; + } + + grctx = gss_svc_ctx2reqctx(svc_ctx); + + CWARN("gss svc invalidate ctx %p(%u)\n", + grctx->src_ctx, grctx->src_ctx->gsc_uid); + gss_svc_upcall_destroy_ctx(grctx->src_ctx); + + EXIT; +} + +static inline +int gss_svc_payload(struct gss_svc_reqctx *grctx, int early, + int msgsize, int privacy) +{ + /* we should treat early reply normally, but which is actually sharing + * the same ctx with original request, so in this case we should + * ignore the special ctx's special flags */ + if (early == 0 && gss_svc_reqctx_is_special(grctx)) + return grctx->src_reserve_len; + + return gss_mech_payload(NULL, msgsize, privacy); +} + +static int gss_svc_bulk_payload(struct gss_svc_ctx *gctx, + struct sptlrpc_flavor *flvr, + int read) +{ + int payload = sizeof(struct ptlrpc_bulk_sec_desc); + + if (read) { + switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) { + case SPTLRPC_BULK_SVC_NULL: + break; + case SPTLRPC_BULK_SVC_INTG: + payload += gss_mech_payload(NULL, 0, 0); + break; + case SPTLRPC_BULK_SVC_PRIV: + payload += gss_mech_payload(NULL, 0, 1); + break; + case SPTLRPC_BULK_SVC_AUTH: + default: + LBUG(); + } + } + + return payload; +} + +int gss_svc_alloc_rs(struct ptlrpc_request *req, int msglen) +{ + struct gss_svc_reqctx *grctx; + struct ptlrpc_reply_state *rs; + int early, privacy, svc, bsd_off = 0; + __u32 ibuflens[2], buflens[4]; + int ibufcnt = 0, bufcnt; + int txtsize, wmsg_size, rs_size; + ENTRY; + + LASSERT(msglen % 8 == 0); + + if (req->rq_pack_bulk && !req->rq_bulk_read && !req->rq_bulk_write) { + CERROR("client request bulk sec on non-bulk rpc\n"); + RETURN(-EPROTO); + } + + svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc); + early = (req->rq_packed_final == 
0); + + grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + if (!early && gss_svc_reqctx_is_special(grctx)) + privacy = 0; + else + privacy = (svc == SPTLRPC_SVC_PRIV); + + if (privacy) { + /* inner clear buffers */ + ibufcnt = 1; + ibuflens[0] = msglen; + + if (req->rq_pack_bulk) { + LASSERT(grctx->src_reqbsd); + + bsd_off = ibufcnt; + ibuflens[ibufcnt++] = gss_svc_bulk_payload( + grctx->src_ctx, + &req->rq_flvr, + req->rq_bulk_read); + } + + txtsize = lustre_msg_size_v2(ibufcnt, ibuflens); + txtsize += GSS_MAX_CIPHER_BLOCK; + + /* wrapper buffer */ + bufcnt = 2; + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = gss_svc_payload(grctx, early, txtsize, 1); + } else { + bufcnt = 2; + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = msglen; + + txtsize = buflens[0]; + if (svc == SPTLRPC_SVC_INTG) + txtsize += buflens[1]; + + if (req->rq_pack_bulk) { + LASSERT(grctx->src_reqbsd); + + bsd_off = bufcnt; + buflens[bufcnt] = gss_svc_bulk_payload( + grctx->src_ctx, + &req->rq_flvr, + req->rq_bulk_read); + if (svc == SPTLRPC_SVC_INTG) + txtsize += buflens[bufcnt]; + bufcnt++; + } + + if ((!early && gss_svc_reqctx_is_special(grctx)) || + svc != SPTLRPC_SVC_NULL) + buflens[bufcnt++] = gss_svc_payload(grctx, early, + txtsize, 0); + } + + wmsg_size = lustre_msg_size_v2(bufcnt, buflens); + + rs_size = sizeof(*rs) + wmsg_size; + rs = req->rq_reply_state; + + if (rs) { + /* pre-allocated */ + LASSERT(rs->rs_size >= rs_size); + } else { + OBD_ALLOC_LARGE(rs, rs_size); + if (rs == NULL) + RETURN(-ENOMEM); + + rs->rs_size = rs_size; + } + + rs->rs_repbuf = (struct lustre_msg *) (rs + 1); + rs->rs_repbuf_len = wmsg_size; + + /* initialize the buffer */ + if (privacy) { + lustre_init_msg_v2(rs->rs_repbuf, ibufcnt, ibuflens, NULL); + rs->rs_msg = lustre_msg_buf(rs->rs_repbuf, 0, msglen); + } else { + lustre_init_msg_v2(rs->rs_repbuf, bufcnt, buflens, NULL); + rs->rs_repbuf->lm_secflvr = req->rq_flvr.sf_rpc; + + rs->rs_msg = lustre_msg_buf(rs->rs_repbuf, 1, 0); + } + + if (bsd_off) { + grctx->src_repbsd = lustre_msg_buf(rs->rs_repbuf, bsd_off, 0); + grctx->src_repbsd_size = lustre_msg_buflen(rs->rs_repbuf, + bsd_off); + } + + gss_svc_reqctx_addref(grctx); + rs->rs_svc_ctx = req->rq_svc_ctx; + + LASSERT(rs->rs_msg); + req->rq_reply_state = rs; + RETURN(0); +} + +static int gss_svc_seal(struct ptlrpc_request *req, + struct ptlrpc_reply_state *rs, + struct gss_svc_reqctx *grctx) +{ + struct gss_svc_ctx *gctx = grctx->src_ctx; + rawobj_t hdrobj, msgobj, token; + struct gss_header *ghdr; + __u8 *token_buf; + int token_buflen; + __u32 buflens[2], major; + int msglen, rc; + ENTRY; + + /* get clear data length. 
note embedded lustre_msg might + * have been shrunk */ + if (req->rq_replen != lustre_msg_buflen(rs->rs_repbuf, 0)) + msglen = lustre_shrink_msg(rs->rs_repbuf, 0, req->rq_replen, 1); + else + msglen = lustre_msg_size_v2(rs->rs_repbuf->lm_bufcount, + rs->rs_repbuf->lm_buflens); + + /* temporarily use tail of buffer to hold gss header data */ + LASSERT(msglen + PTLRPC_GSS_HEADER_SIZE <= rs->rs_repbuf_len); + ghdr = (struct gss_header *) ((char *) rs->rs_repbuf + + rs->rs_repbuf_len - PTLRPC_GSS_HEADER_SIZE); + ghdr->gh_version = PTLRPC_GSS_VERSION; + ghdr->gh_sp = LUSTRE_SP_ANY; + ghdr->gh_flags = 0; + ghdr->gh_proc = PTLRPC_GSS_PROC_DATA; + ghdr->gh_seq = grctx->src_wirectx.gw_seq; + ghdr->gh_svc = SPTLRPC_SVC_PRIV; + ghdr->gh_handle.len = 0; + if (req->rq_pack_bulk) + ghdr->gh_flags |= LUSTRE_GSS_PACK_BULK; + + /* allocate temporary cipher buffer */ + token_buflen = gss_mech_payload(gctx->gsc_mechctx, msglen, 1); + OBD_ALLOC_LARGE(token_buf, token_buflen); + if (token_buf == NULL) + RETURN(-ENOMEM); + + hdrobj.len = PTLRPC_GSS_HEADER_SIZE; + hdrobj.data = (__u8 *) ghdr; + msgobj.len = msglen; + msgobj.data = (__u8 *) rs->rs_repbuf; + token.len = token_buflen; + token.data = token_buf; + + major = lgss_wrap(gctx->gsc_mechctx, &hdrobj, &msgobj, + rs->rs_repbuf_len - PTLRPC_GSS_HEADER_SIZE, &token); + if (major != GSS_S_COMPLETE) { + CERROR("wrap message error: %08x\n", major); + GOTO(out_free, rc = -EPERM); + } + LASSERT(token.len <= token_buflen); + + /* we are about to override data at rs->rs_repbuf, nullify pointers + * to which to catch further illegal usage. */ + if (req->rq_pack_bulk) { + grctx->src_repbsd = NULL; + grctx->src_repbsd_size = 0; + } + + /* now fill the actual wire data + * - gss header + * - gss token + */ + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = token.len; + + rs->rs_repdata_len = lustre_msg_size_v2(2, buflens); + LASSERT(rs->rs_repdata_len <= rs->rs_repbuf_len); + + lustre_init_msg_v2(rs->rs_repbuf, 2, buflens, NULL); + rs->rs_repbuf->lm_secflvr = req->rq_flvr.sf_rpc; + + memcpy(lustre_msg_buf(rs->rs_repbuf, 0, 0), ghdr, + PTLRPC_GSS_HEADER_SIZE); + memcpy(lustre_msg_buf(rs->rs_repbuf, 1, 0), token.data, token.len); + + /* reply offset */ + if (req->rq_packed_final && + (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) + req->rq_reply_off = gss_at_reply_off_priv; + else + req->rq_reply_off = 0; + + /* to catch upper layer's further access */ + rs->rs_msg = NULL; + req->rq_repmsg = NULL; + req->rq_replen = 0; + + rc = 0; +out_free: + OBD_FREE_LARGE(token_buf, token_buflen); + RETURN(rc); +} + +int gss_svc_authorize(struct ptlrpc_request *req) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct gss_svc_reqctx *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + struct gss_wire_ctx *gw = &grctx->src_wirectx; + int early, rc; + ENTRY; + + early = (req->rq_packed_final == 0); + + if (!early && gss_svc_reqctx_is_special(grctx)) { + LASSERT(rs->rs_repdata_len != 0); + + req->rq_reply_off = gss_at_reply_off_integ; + RETURN(0); + } + + /* early reply could happen in many cases */ + if (!early && + gw->gw_proc != PTLRPC_GSS_PROC_DATA && + gw->gw_proc != PTLRPC_GSS_PROC_DESTROY) { + CERROR("proc %d not support\n", gw->gw_proc); + RETURN(-EINVAL); + } + + LASSERT(grctx->src_ctx); + + switch (gw->gw_svc) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + rc = gss_svc_sign(req, rs, grctx, gw->gw_svc); + break; + case SPTLRPC_SVC_PRIV: + rc = gss_svc_seal(req, rs, grctx); + break; + default: + CERROR("Unknown service 
%d\n", gw->gw_svc); + GOTO(out, rc = -EINVAL); + } + rc = 0; + +out: + RETURN(rc); +} + +void gss_svc_free_rs(struct ptlrpc_reply_state *rs) +{ + struct gss_svc_reqctx *grctx; + + LASSERT(rs->rs_svc_ctx); + grctx = container_of(rs->rs_svc_ctx, struct gss_svc_reqctx, src_base); + + gss_svc_reqctx_decref(grctx); + rs->rs_svc_ctx = NULL; + + if (!rs->rs_prealloc) + OBD_FREE_LARGE(rs, rs->rs_size); +} + +void gss_svc_free_ctx(struct ptlrpc_svc_ctx *ctx) +{ + LASSERT(atomic_read(&ctx->sc_refcount) == 0); + gss_svc_reqctx_free(gss_svc_ctx2reqctx(ctx)); +} + +int gss_copy_rvc_cli_ctx(struct ptlrpc_cli_ctx *cli_ctx, + struct ptlrpc_svc_ctx *svc_ctx) +{ + struct gss_cli_ctx *cli_gctx = ctx2gctx(cli_ctx); + struct gss_svc_ctx *svc_gctx = gss_svc_ctx2gssctx(svc_ctx); + struct gss_ctx *mechctx = NULL; + + LASSERT(cli_gctx); + LASSERT(svc_gctx && svc_gctx->gsc_mechctx); + + cli_gctx->gc_proc = PTLRPC_GSS_PROC_DATA; + cli_gctx->gc_win = GSS_SEQ_WIN; + + /* The problem is the reverse ctx might get lost in some recovery + * situations, and the same svc_ctx will be used to re-create it. + * if there's callback be sentout before that, new reverse ctx start + * with sequence 0 will lead to future callback rpc be treated as + * replay. + * + * each reverse root ctx will record its latest sequence number on its + * buddy svcctx before be destroyed, so here we continue use it. + */ + atomic_set(&cli_gctx->gc_seq, svc_gctx->gsc_rvs_seq); + + if (gss_svc_upcall_dup_handle(&cli_gctx->gc_svc_handle, svc_gctx)) { + CERROR("failed to dup svc handle\n"); + goto err_out; + } + + if (lgss_copy_reverse_context(svc_gctx->gsc_mechctx, &mechctx) != + GSS_S_COMPLETE) { + CERROR("failed to copy mech context\n"); + goto err_svc_handle; + } + + if (rawobj_dup(&cli_gctx->gc_handle, &svc_gctx->gsc_rvs_hdl)) { + CERROR("failed to dup reverse handle\n"); + goto err_ctx; + } + + cli_gctx->gc_mechctx = mechctx; + gss_cli_ctx_uptodate(cli_gctx); + + return 0; + +err_ctx: + lgss_delete_sec_context(&mechctx); +err_svc_handle: + rawobj_free(&cli_gctx->gc_svc_handle); +err_out: + return -ENOMEM; +} + +static void gss_init_at_reply_offset(void) +{ + __u32 buflens[3]; + int clearsize; + + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = lustre_msg_early_size; + buflens[2] = gss_cli_payload(NULL, buflens[1], 0); + gss_at_reply_off_integ = lustre_msg_size_v2(3, buflens); + + buflens[0] = lustre_msg_early_size; + clearsize = lustre_msg_size_v2(1, buflens); + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = gss_cli_payload(NULL, clearsize, 0); + buflens[2] = gss_cli_payload(NULL, clearsize, 1); + gss_at_reply_off_priv = lustre_msg_size_v2(3, buflens); +} + +static int __init sptlrpc_gss_init(void) +{ + int rc; + + rc = gss_init_tunables(); + if (rc) + return rc; + + rc = gss_init_cli_upcall(); + if (rc) + goto out_tunables; + + rc = gss_init_svc_upcall(); + if (rc) + goto out_cli_upcall; + + rc = init_null_module(); + if (rc) + goto out_svc_upcall; + + rc = init_kerberos_module(); + if (rc) + goto out_null; + + rc = init_sk_module(); + if (rc) + goto out_kerberos; + + /* register policy after all other stuff be initialized, because it + * might be in used immediately after the registration. 
+
+	rc = gss_init_keyring();
+	if (rc)
+		goto out_sk;
+
+	rc = gss_init_pipefs();
+	if (rc)
+		goto out_keyring;
+
+	gss_init_at_reply_offset();
+
+	return 0;
+
+out_keyring:
+	gss_exit_keyring();
+out_sk:
+	cleanup_sk_module();
+out_kerberos:
+	cleanup_kerberos_module();
+out_null:
+	cleanup_null_module();
+out_svc_upcall:
+	gss_exit_svc_upcall();
+out_cli_upcall:
+	gss_exit_cli_upcall();
+out_tunables:
+	gss_exit_tunables();
+	return rc;
+}
+
+static void __exit sptlrpc_gss_exit(void)
+{
+	gss_exit_keyring();
+	gss_exit_pipefs();
+	cleanup_kerberos_module();
+	gss_exit_svc_upcall();
+	gss_exit_cli_upcall();
+	gss_exit_tunables();
+}
+
+MODULE_AUTHOR("OpenSFS, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre GSS security policy");
+MODULE_VERSION(LUSTRE_VERSION_STRING);
+MODULE_LICENSE("GPL");
+
+module_init(sptlrpc_gss_init);
+module_exit(sptlrpc_gss_exit);
diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/heap.c b/drivers/staging/lustrefsx/lustre/ptlrpc/heap.c
new file mode 100644
index 0000000000000..b96ea1864c6a9
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/ptlrpc/heap.c
@@ -0,0 +1,497 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License version 2 for more details. A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011 Intel Corporation
+ */
+/*
+ * libcfs/libcfs/heap.c
+ *
+ * Author: Eric Barton
+ *	   Liang Zhen
+ */
+/** \addtogroup heap
+ *
+ * @{
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <libcfs/libcfs.h>
+#include <libcfs/libcfs_cpu.h>
+#include "heap.h"
+
+#define CBH_ALLOC(ptr, h)						\
+do {									\
+	if (h->cbh_cptab) {						\
+		if ((h)->cbh_flags & CBH_FLAG_ATOMIC_GROW)		\
+			LIBCFS_CPT_ALLOC_GFP((ptr), h->cbh_cptab,	\
+					     h->cbh_cptid, CBH_NOB,	\
+					     GFP_ATOMIC);		\
+		else							\
+			LIBCFS_CPT_ALLOC((ptr), h->cbh_cptab,		\
+					 h->cbh_cptid, CBH_NOB);	\
+	} else {							\
+		if ((h)->cbh_flags & CBH_FLAG_ATOMIC_GROW)		\
+			LIBCFS_ALLOC_ATOMIC((ptr), CBH_NOB);		\
+		else							\
+			LIBCFS_ALLOC((ptr), CBH_NOB);			\
+	}								\
+} while (0)
+
+#define CBH_FREE(ptr)	LIBCFS_FREE(ptr, CBH_NOB)
+
+/**
+ * Grows the capacity of a binary heap so that it can handle a larger number of
+ * \e struct binheap_node objects.
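+ *
+ * Capacity is added one CBH_SIZE chunk of node pointers at a time: the
+ * first chunk hangs off cbh_elements1, the next CBH_SIZE chunks off
+ * cbh_elements2, and any further chunks off cbh_elements3. As a worked
+ * example, with CBH_SHIFT = 9 (CBH_SIZE = 512) the three levels address
+ * 512, 512 * 512 and 512 * 512 * 512 nodes respectively.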
+ *
+ * \param[in] h The binary heap
+ *
+ * \retval 0	   Successfully grew the heap
+ * \retval -ENOMEM OOM error
+ */
+static int
+binheap_grow(struct binheap *h)
+{
+	struct binheap_node ***frag1 = NULL;
+	struct binheap_node **frag2;
+	int hwm = h->cbh_hwm;
+
+	/* need a whole new chunk of pointers */
+	LASSERT((h->cbh_hwm & CBH_MASK) == 0);
+
+	if (hwm == 0) {
+		/* first use of single indirect */
+		CBH_ALLOC(h->cbh_elements1, h);
+		if (h->cbh_elements1 == NULL)
+			return -ENOMEM;
+
+		goto out;
+	}
+
+	hwm -= CBH_SIZE;
+	if (hwm < CBH_SIZE * CBH_SIZE) {
+		/* not filled double indirect */
+		CBH_ALLOC(frag2, h);
+		if (frag2 == NULL)
+			return -ENOMEM;
+
+		if (hwm == 0) {
+			/* first use of double indirect */
+			CBH_ALLOC(h->cbh_elements2, h);
+			if (h->cbh_elements2 == NULL) {
+				CBH_FREE(frag2);
+				return -ENOMEM;
+			}
+		}
+
+		h->cbh_elements2[hwm >> CBH_SHIFT] = frag2;
+		goto out;
+	}
+
+	hwm -= CBH_SIZE * CBH_SIZE;
+#if (CBH_SHIFT * 3 < 32)
+	if (hwm >= CBH_SIZE * CBH_SIZE * CBH_SIZE) {
+		/* filled triple indirect */
+		return -ENOMEM;
+	}
+#endif
+	CBH_ALLOC(frag2, h);
+	if (frag2 == NULL)
+		return -ENOMEM;
+
+	if (((hwm >> CBH_SHIFT) & CBH_MASK) == 0) {
+		/* first use of this 2nd level index */
+		CBH_ALLOC(frag1, h);
+		if (frag1 == NULL) {
+			CBH_FREE(frag2);
+			return -ENOMEM;
+		}
+	}
+
+	if (hwm == 0) {
+		/* first use of triple indirect */
+		CBH_ALLOC(h->cbh_elements3, h);
+		if (h->cbh_elements3 == NULL) {
+			CBH_FREE(frag2);
+			CBH_FREE(frag1);
+			return -ENOMEM;
+		}
+	}
+
+	if (frag1 != NULL) {
+		LASSERT(h->cbh_elements3[hwm >> (2 * CBH_SHIFT)] == NULL);
+		h->cbh_elements3[hwm >> (2 * CBH_SHIFT)] = frag1;
+	} else {
+		frag1 = h->cbh_elements3[hwm >> (2 * CBH_SHIFT)];
+		LASSERT(frag1 != NULL);
+	}
+
+	frag1[(hwm >> CBH_SHIFT) & CBH_MASK] = frag2;
+
+ out:
+	h->cbh_hwm += CBH_SIZE;
+	return 0;
+}
+
+/**
+ * Creates and initializes a binary heap instance.
+ *
+ * \param[in] ops   The operations to be used
+ * \param[in] flags The heap flags
+ * \param[in] count The initial heap capacity in # of elements
+ * \param[in] arg   An optional private argument
+ * \param[in] cptab The CPT table this heap instance will operate over
+ * \param[in] cptid The CPT id of \a cptab this heap instance will operate over
+ *
+ * \retval valid-pointer A newly-created and initialized binary heap object
+ * \retval NULL		 error
+ */
+struct binheap *
+binheap_create(struct binheap_ops *ops, unsigned int flags,
+	       unsigned int count, void *arg, struct cfs_cpt_table *cptab,
+	       int cptid)
+{
+	struct binheap *h;
+
+	LASSERT(ops != NULL);
+	LASSERT(ops->hop_compare != NULL);
+	if (cptab) {
+		LASSERT(cptid == CFS_CPT_ANY ||
+			(cptid >= 0 && cptid < cfs_cpt_number(cptab)));
+		LIBCFS_CPT_ALLOC(h, cptab, cptid, sizeof(*h));
+	} else {
+		LIBCFS_ALLOC(h, sizeof(*h));
+	}
+	if (!h)
+		return NULL;
+
+	h->cbh_ops	 = ops;
+	h->cbh_nelements = 0;
+	h->cbh_hwm	 = 0;
+	h->cbh_private	 = arg;
+	h->cbh_flags	 = flags & (~CBH_FLAG_ATOMIC_GROW);
+	h->cbh_cptab	 = cptab;
+	h->cbh_cptid	 = cptid;
+
+	while (h->cbh_hwm < count) { /* preallocate */
+		if (binheap_grow(h) != 0) {
+			binheap_destroy(h);
+			return NULL;
+		}
+	}
+
+	h->cbh_flags |= flags & CBH_FLAG_ATOMIC_GROW;
+
+	return h;
+}
+EXPORT_SYMBOL(binheap_create);
+
+/**
+ * Releases all resources associated with a binary heap instance.
+ *
+ * Deallocates memory for all indirection levels and the binary heap object
+ * itself.
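+ *
+ * Typical create/destroy pairing (a sketch; \e my_ops is a hypothetical
+ * user-supplied operations table whose only mandatory member is
+ * hop_compare()):
+ *
+ *	struct binheap *h;
+ *
+ *	h = binheap_create(&my_ops, 0, 128, NULL, NULL, CFS_CPT_ANY);
+ *	if (h == NULL)
+ *		return -ENOMEM;
+ *	...
+ *	binheap_destroy(h);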
+ * + * \param[in] h The binary heap object + */ +void +binheap_destroy(struct binheap *h) +{ + int idx0; + int idx1; + int n; + + LASSERT(h != NULL); + + n = h->cbh_hwm; + + if (n > 0) { + CBH_FREE(h->cbh_elements1); + n -= CBH_SIZE; + } + + if (n > 0) { + for (idx0 = 0; idx0 < CBH_SIZE && n > 0; idx0++) { + CBH_FREE(h->cbh_elements2[idx0]); + n -= CBH_SIZE; + } + + CBH_FREE(h->cbh_elements2); + } + + if (n > 0) { + for (idx0 = 0; idx0 < CBH_SIZE && n > 0; idx0++) { + + for (idx1 = 0; idx1 < CBH_SIZE && n > 0; idx1++) { + CBH_FREE(h->cbh_elements3[idx0][idx1]); + n -= CBH_SIZE; + } + + CBH_FREE(h->cbh_elements3[idx0]); + } + + CBH_FREE(h->cbh_elements3); + } + + LIBCFS_FREE(h, sizeof(*h)); +} +EXPORT_SYMBOL(binheap_destroy); + +/** + * Obtains a double pointer to a heap element, given its index into the binary + * tree. + * + * \param[in] h The binary heap instance + * \param[in] idx The requested node's index + * + * \retval valid-pointer A double pointer to a heap pointer entry + */ +static struct binheap_node ** +binheap_pointer(struct binheap *h, unsigned int idx) +{ + if (idx < CBH_SIZE) + return &(h->cbh_elements1[idx]); + + idx -= CBH_SIZE; + if (idx < CBH_SIZE * CBH_SIZE) + return &(h->cbh_elements2[idx >> CBH_SHIFT][idx & CBH_MASK]); + + idx -= CBH_SIZE * CBH_SIZE; + return &(h->cbh_elements3[idx >> (2 * CBH_SHIFT)] + [(idx >> CBH_SHIFT) & CBH_MASK] + [idx & CBH_MASK]); +} + +/** + * Obtains a pointer to a heap element, given its index into the binary tree. + * + * \param[in] h The binary heap + * \param[in] idx The requested node's index + * + * \retval valid-pointer The requested heap node + * \retval NULL Supplied index is out of bounds + */ +struct binheap_node * +binheap_find(struct binheap *h, unsigned int idx) +{ + if (idx >= h->cbh_nelements) + return NULL; + + return *binheap_pointer(h, idx); +} +EXPORT_SYMBOL(binheap_find); + +/** + * Moves a node upwards, towards the root of the binary tree. + * + * \param[in] h The heap + * \param[in] e The node + * + * \retval 1 The position of \a e in the tree was changed at least once + * \retval 0 The position of \a e in the tree was not changed + */ +static int +binheap_bubble(struct binheap *h, struct binheap_node *e) +{ + unsigned int cur_idx = e->chn_index; + struct binheap_node **cur_ptr; + unsigned int parent_idx; + struct binheap_node **parent_ptr; + int did_sth = 0; + + cur_ptr = binheap_pointer(h, cur_idx); + LASSERT(*cur_ptr == e); + + while (cur_idx > 0) { + parent_idx = (cur_idx - 1) >> 1; + + parent_ptr = binheap_pointer(h, parent_idx); + LASSERT((*parent_ptr)->chn_index == parent_idx); + + if (h->cbh_ops->hop_compare(*parent_ptr, e)) + break; + + (*parent_ptr)->chn_index = cur_idx; + *cur_ptr = *parent_ptr; + cur_ptr = parent_ptr; + cur_idx = parent_idx; + did_sth = 1; + } + + e->chn_index = cur_idx; + *cur_ptr = e; + + return did_sth; +} + +/** + * Moves a node downwards, towards the last level of the binary tree. 
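+ *
+ * One sinking step on a min-heap, sketched with integer priorities
+ * (illustrative only; the real ordering is whatever hop_compare()
+ * implements):
+ *
+ *	     [5]                 [2]
+ *	    /   \     --->      /   \
+ *	  [2]   [3]           [5]   [3]
+ *
+ * The node keeps swapping with its higher-priority child until both
+ * children compare after it, or the bottom of the tree is reached.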
+ * + * \param[in] h The heap + * \param[in] e The node + * + * \retval 1 The position of \a e in the tree was changed at least once + * \retval 0 The position of \a e in the tree was not changed + */ +static int +binheap_sink(struct binheap *h, struct binheap_node *e) +{ + unsigned int n = h->cbh_nelements; + unsigned int child_idx; + struct binheap_node **child_ptr; + struct binheap_node *child; + unsigned int child2_idx; + struct binheap_node **child2_ptr; + struct binheap_node *child2; + unsigned int cur_idx; + struct binheap_node **cur_ptr; + int did_sth = 0; + + cur_idx = e->chn_index; + cur_ptr = binheap_pointer(h, cur_idx); + LASSERT(*cur_ptr == e); + + while (cur_idx < n) { + child_idx = (cur_idx << 1) + 1; + if (child_idx >= n) + break; + + child_ptr = binheap_pointer(h, child_idx); + child = *child_ptr; + + child2_idx = child_idx + 1; + if (child2_idx < n) { + child2_ptr = binheap_pointer(h, child2_idx); + child2 = *child2_ptr; + + if (h->cbh_ops->hop_compare(child2, child)) { + child_idx = child2_idx; + child_ptr = child2_ptr; + child = child2; + } + } + + LASSERT(child->chn_index == child_idx); + + if (h->cbh_ops->hop_compare(e, child)) + break; + + child->chn_index = cur_idx; + *cur_ptr = child; + cur_ptr = child_ptr; + cur_idx = child_idx; + did_sth = 1; + } + + e->chn_index = cur_idx; + *cur_ptr = e; + + return did_sth; +} + +/** + * Sort-inserts a node into the binary heap. + * + * \param[in] h The heap + * \param[in] e The node + * + * \retval 0 Element inserted successfully + * \retval != 0 error + */ +int +binheap_insert(struct binheap *h, struct binheap_node *e) +{ + struct binheap_node **new_ptr; + unsigned int new_idx = h->cbh_nelements; + int rc; + + if (new_idx == h->cbh_hwm) { + rc = binheap_grow(h); + if (rc != 0) + return rc; + } + + if (h->cbh_ops->hop_enter) { + rc = h->cbh_ops->hop_enter(h, e); + if (rc != 0) + return rc; + } + + e->chn_index = new_idx; + new_ptr = binheap_pointer(h, new_idx); + h->cbh_nelements++; + *new_ptr = e; + + binheap_bubble(h, e); + + return 0; +} +EXPORT_SYMBOL(binheap_insert); + +/** + * Removes a node from the binary heap. + * + * \param[in] h The heap + * \param[in] e The node + */ +void +binheap_remove(struct binheap *h, struct binheap_node *e) +{ + unsigned int n = h->cbh_nelements; + unsigned int cur_idx = e->chn_index; + struct binheap_node **cur_ptr; + struct binheap_node *last; + + LASSERT(cur_idx != CBH_POISON); + LASSERT(cur_idx < n); + + cur_ptr = binheap_pointer(h, cur_idx); + LASSERT(*cur_ptr == e); + + n--; + last = *binheap_pointer(h, n); + h->cbh_nelements = n; + if (last == e) + return; + + last->chn_index = cur_idx; + *cur_ptr = last; + binheap_relocate(h, *cur_ptr); + + e->chn_index = CBH_POISON; + if (h->cbh_ops->hop_exit) + h->cbh_ops->hop_exit(h, e); +} +EXPORT_SYMBOL(binheap_remove); + +/** + * Relocate a node in the binary heap. + * Should be called whenever a node's values + * which affects its ranking are changed. + * + * \param[in] h The heap + * \param[in] e The node + */ +void +binheap_relocate(struct binheap *h, struct binheap_node *e) +{ + if (!binheap_bubble(h, e)) + binheap_sink(h, e); +} +EXPORT_SYMBOL(binheap_relocate); +/** @} heap */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/heap.h b/drivers/staging/lustrefsx/lustre/ptlrpc/heap.h new file mode 100644 index 0000000000000..7cd5c1c00645a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/heap.h @@ -0,0 +1,188 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License version 2 for more details. A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011 Intel Corporation
+ */
+/*
+ * libcfs/include/libcfs/heap.h
+ *
+ * Author: Eric Barton
+ *	   Liang Zhen
+ */
+
+#ifndef __LIBCFS_HEAP_H__
+#define __LIBCFS_HEAP_H__
+
+/** \defgroup heap Binary heap
+ *
+ * The binary heap is a scalable data structure built on a binary tree. It
+ * can maintain large sets of elements sorted by one or more element
+ * properties, or really by any binary predicate that determines the
+ * relative ordering of two nodes in the set. There is no search operation;
+ * instead, the element of lowest priority, which always sits at the root
+ * of the tree (this is a min-heap), is removed by users for consumption.
+ *
+ * Users of the heap should embed a \e struct binheap_node object instance
+ * on every object of the set that they wish the binary heap instance to
+ * handle, and (at a minimum) provide a struct binheap_ops::hop_compare()
+ * implementation, which is used by the heap as the binary predicate during
+ * its internal sorting operations.
+ *
+ * The current implementation enforces no locking scheme, and so assumes
+ * the user caters for locking between calls to the insert, delete and
+ * lookup operations. Since the only consumers of the data structure at
+ * this point are NRS policies, and these operate on a per-CPT basis,
+ * binary heap instances are tied to a specific CPT.
+ * @{
+ */
+
+#define CBH_SHIFT	9
+#define CBH_SIZE	(1 << CBH_SHIFT)	/* # ptrs per level */
+#define CBH_MASK	(CBH_SIZE - 1)
+#define CBH_NOB		(CBH_SIZE * sizeof(struct binheap_node *))
+
+#define CBH_POISON	0xdeadbeef
+
+/**
+ * Binary heap flags.
+ */
+enum {
+	CBH_FLAG_ATOMIC_GROW	= 1,
+};
+
+struct binheap;
+
+/**
+ * Binary heap operations.
+ */
+struct binheap_ops {
+	/**
+	 * Called right before inserting a node into the binary heap.
+	 *
+	 * Implementing this operation is optional.
+	 *
+	 * \param[in] h The heap
+	 * \param[in] e The node
+	 *
+	 * \retval 0 success
+	 * \retval != 0 error
+	 */
+	int (*hop_enter)(struct binheap *h,
+			 struct binheap_node *e);
+	/**
+	 * Called right after removing a node from the binary heap.
+	 *
+	 * Implementing this operation is optional.
+	 *
+	 * \param[in] h The heap
+	 * \param[in] e The node
+	 */
+	void (*hop_exit)(struct binheap *h,
+			 struct binheap_node *e);
+	/**
+	 * A binary predicate which is called during internal heap sorting
+	 * operations, and used in order to determine the relevant ordering of
+	 * two heap nodes.
+	 *
+	 * Implementing this operation is mandatory.
+	 *
+	 * \param[in] a The first heap node
+	 * \param[in] b The second heap node
+	 *
+	 * \retval 0 Node a > node b
+	 * \retval 1 Node a < node b
+	 *
+	 * \see binheap_bubble()
+	 * \see binheap_sink()
+	 */
+	int (*hop_compare)(struct binheap_node *a,
+			   struct binheap_node *b);
+};
+
+/**
+ * Binary heap object.
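+ *
+ * Users embed a \e struct binheap_node in their own type and recover the
+ * container with container_of() inside the predicate. A sketch (struct
+ * my_item and my_compare() are hypothetical):
+ *
+ *	struct my_item {
+ *		__u64			mi_key;
+ *		struct binheap_node	mi_node;
+ *	};
+ *
+ *	static int my_compare(struct binheap_node *a, struct binheap_node *b)
+ *	{
+ *		return container_of(a, struct my_item, mi_node)->mi_key <
+ *		       container_of(b, struct my_item, mi_node)->mi_key;
+ *	}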
+ *
+ * Sorts elements of type \e struct binheap_node
+ */
+struct binheap {
+	/** Triple indirect */
+	struct binheap_node ****cbh_elements3;
+	/** double indirect */
+	struct binheap_node ***cbh_elements2;
+	/** single indirect */
+	struct binheap_node **cbh_elements1;
+	/** # elements referenced */
+	unsigned int cbh_nelements;
+	/** high water mark */
+	unsigned int cbh_hwm;
+	/** user flags */
+	unsigned int cbh_flags;
+	/** operations table */
+	struct binheap_ops *cbh_ops;
+	/** private data */
+	void *cbh_private;
+	/** associated CPT table */
+	struct cfs_cpt_table *cbh_cptab;
+	/** associated CPT id of this struct binheap::cbh_cptab */
+	int cbh_cptid;
+};
+
+void binheap_destroy(struct binheap *h);
+struct binheap *
+binheap_create(struct binheap_ops *ops, unsigned int flags,
+	       unsigned int count, void *arg, struct cfs_cpt_table *cptab,
+	       int cptid);
+struct binheap_node *
+binheap_find(struct binheap *h, unsigned int idx);
+int binheap_insert(struct binheap *h, struct binheap_node *e);
+void binheap_remove(struct binheap *h, struct binheap_node *e);
+void binheap_relocate(struct binheap *h, struct binheap_node *e);
+
+static inline int
+binheap_size(struct binheap *h)
+{
+	return h->cbh_nelements;
+}
+
+static inline int
+binheap_is_empty(struct binheap *h)
+{
+	return h->cbh_nelements == 0;
+}
+
+static inline struct binheap_node *
+binheap_root(struct binheap *h)
+{
+	return binheap_find(h, 0);
+}
+
+static inline struct binheap_node *
+binheap_remove_root(struct binheap *h)
+{
+	struct binheap_node *e = binheap_find(h, 0);
+
+	if (e != NULL)
+		binheap_remove(h, e);
+	return e;
+}
+
+/** @} heap */
+
+#endif /* __LIBCFS_HEAP_H__ */
diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/import.c b/drivers/staging/lustrefsx/lustre/ptlrpc/import.c
new file mode 100644
index 0000000000000..ae3998082661a
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/ptlrpc/import.c
@@ -0,0 +1,2069 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lustre/ptlrpc/import.c
+ *
+ * Author: Mike Shaver
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <linux/kthread.h>
+#include <obd_support.h>
+#include <lustre_ha.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_export.h>
+#include <obd.h>
+#include <obd_cksum.h>
+#include <obd_class.h>
+#include <lustre_log.h>
+#include <uapi/linux/lustre/lustre_idl.h>
+
+#include "ptlrpc_internal.h"
+
+struct ptlrpc_connect_async_args {
+	__u64 pcaa_peer_committed;
+	int pcaa_initial_connect;
+};
+
+int allow_version_mismatch;
+EXPORT_SYMBOL(allow_version_mismatch);
+
+/**
+ * Updates import \a imp current state to the provided \a state value.
+ * Helper function.
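+ *
+ * Also records the transition in the imp_state_hist ring buffer
+ * (IMP_STATE_HIST_LEN entries, each stamped with the current time) so
+ * that recent state changes can be inspected when debugging recovery.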
+ */ +static void import_set_state_nolock(struct obd_import *imp, + enum lustre_imp_state state) +{ + switch (state) { + case LUSTRE_IMP_CLOSED: + case LUSTRE_IMP_NEW: + case LUSTRE_IMP_DISCON: + case LUSTRE_IMP_CONNECTING: + break; + case LUSTRE_IMP_REPLAY_WAIT: + imp->imp_replay_state = LUSTRE_IMP_REPLAY_LOCKS; + break; + default: + imp->imp_replay_state = LUSTRE_IMP_REPLAY; + break; + } + + /* A CLOSED import should remain so. */ + if (imp->imp_state == LUSTRE_IMP_CLOSED) + return; + + if (imp->imp_state != LUSTRE_IMP_NEW) { + CDEBUG(D_HA, "%p %s: changing import state from %s to %s\n", + imp, obd2cli_tgt(imp->imp_obd), + ptlrpc_import_state_name(imp->imp_state), + ptlrpc_import_state_name(state)); + } + + imp->imp_state = state; + imp->imp_state_hist[imp->imp_state_hist_idx].ish_state = state; + imp->imp_state_hist[imp->imp_state_hist_idx].ish_time = + ktime_get_real_seconds(); + imp->imp_state_hist_idx = (imp->imp_state_hist_idx + 1) % + IMP_STATE_HIST_LEN; +} + +static void import_set_state(struct obd_import *imp, + enum lustre_imp_state new_state) +{ + spin_lock(&imp->imp_lock); + import_set_state_nolock(imp, new_state); + spin_unlock(&imp->imp_lock); +} + +void ptlrpc_import_enter_resend(struct obd_import *imp) +{ + import_set_state(imp, LUSTRE_IMP_RECOVER); +} +EXPORT_SYMBOL(ptlrpc_import_enter_resend); + + +static int ptlrpc_connect_interpret(const struct lu_env *env, + struct ptlrpc_request *request, + void *args, int rc); +int ptlrpc_import_recovery_state_machine(struct obd_import *imp); + +/* Only this function is allowed to change the import state when it is + * CLOSED. I would rather refcount the import and free it after + * disconnection like we do with exports. To do that, the client_obd + * will need to save the peer info somewhere other than in the import, + * though. */ +int ptlrpc_init_import(struct obd_import *imp) +{ + spin_lock(&imp->imp_lock); + + imp->imp_generation++; + imp->imp_state = LUSTRE_IMP_NEW; + + spin_unlock(&imp->imp_lock); + + return 0; +} +EXPORT_SYMBOL(ptlrpc_init_import); + +#define UUID_STR "_UUID" +void deuuidify(char *uuid, const char *prefix, char **uuid_start, int *uuid_len) +{ + *uuid_start = !prefix || strncmp(uuid, prefix, strlen(prefix)) + ? uuid : uuid + strlen(prefix); + + *uuid_len = strlen(*uuid_start); + + if (*uuid_len < strlen(UUID_STR)) + return; + + if (!strncmp(*uuid_start + *uuid_len - strlen(UUID_STR), + UUID_STR, strlen(UUID_STR))) + *uuid_len -= strlen(UUID_STR); +} + +/* Must be called with imp_lock held! */ +static void ptlrpc_deactivate_import_nolock(struct obd_import *imp) +{ + ENTRY; + + assert_spin_locked(&imp->imp_lock); + CDEBUG(D_HA, "setting import %s INVALID\n", obd2cli_tgt(imp->imp_obd)); + imp->imp_invalid = 1; + imp->imp_generation++; + + ptlrpc_abort_inflight(imp); + + EXIT; +} + +/** + * Returns true if import was FULL, false if import was already not + * connected. + * @imp - import to be disconnected + * @conn_cnt - connection count (epoch) of the request that timed out + * and caused the disconnection. In some cases, multiple + * inflight requests can fail to a single target (e.g. OST + * bulk requests) and if one has already caused a reconnection + * (increasing the import->conn_cnt) the older failure should + * not also cause a reconnection. If zero it forces a reconnect. 
+ * @invalid - set import invalid flag + */ +int ptlrpc_set_import_discon(struct obd_import *imp, + __u32 conn_cnt, bool invalid) +{ + int rc = 0; + + spin_lock(&imp->imp_lock); + + if (imp->imp_state == LUSTRE_IMP_FULL && + (conn_cnt == 0 || conn_cnt == imp->imp_conn_cnt)) { + char *target_start; + int target_len; + bool inact = false; + + deuuidify(obd2cli_tgt(imp->imp_obd), NULL, + &target_start, &target_len); + + import_set_state_nolock(imp, LUSTRE_IMP_DISCON); + if (imp->imp_replayable) { + LCONSOLE_WARN("%s: Connection to %.*s (at %s) was " + "lost; in progress operations using this " + "service will wait for recovery to complete\n", + imp->imp_obd->obd_name, target_len, target_start, + obd_import_nid2str(imp)); + } else { + LCONSOLE_ERROR_MSG(0x166, "%s: Connection to " + "%.*s (at %s) was lost; in progress " + "operations using this service will fail\n", + imp->imp_obd->obd_name, target_len, target_start, + obd_import_nid2str(imp)); + if (invalid) { + CDEBUG(D_HA, "import %s@%s for %s not " + "replayable, auto-deactivating\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid, + imp->imp_obd->obd_name); + ptlrpc_deactivate_import_nolock(imp); + inact = true; + } + } + spin_unlock(&imp->imp_lock); + + if (obd_dump_on_timeout) + libcfs_debug_dumplog(); + + obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON); + + if (inact) + obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE); + rc = 1; + } else { + spin_unlock(&imp->imp_lock); + CDEBUG(D_HA, "%s: import %p already %s (conn %u, was %u): %s\n", + imp->imp_client->cli_name, imp, + (imp->imp_state == LUSTRE_IMP_FULL && + imp->imp_conn_cnt > conn_cnt) ? + "reconnected" : "not connected", imp->imp_conn_cnt, + conn_cnt, ptlrpc_import_state_name(imp->imp_state)); + } + + return rc; +} + +/* + * This acts as a barrier; all existing requests are rejected, and + * no new requests will be accepted until the import is valid again. + */ +void ptlrpc_deactivate_import(struct obd_import *imp) +{ + spin_lock(&imp->imp_lock); + ptlrpc_deactivate_import_nolock(imp); + spin_unlock(&imp->imp_lock); + + obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE); +} +EXPORT_SYMBOL(ptlrpc_deactivate_import); + +static time64_t ptlrpc_inflight_deadline(struct ptlrpc_request *req, + time64_t now) +{ + time64_t dl; + + if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) || + (req->rq_phase == RQ_PHASE_BULK) || + (req->rq_phase == RQ_PHASE_NEW))) + return 0; + + if (req->rq_timedout) + return 0; + + if (req->rq_phase == RQ_PHASE_NEW) + dl = req->rq_sent; + else + dl = req->rq_deadline; + + if (dl <= now) + return 0; + + return dl - now; +} + +static time64_t ptlrpc_inflight_timeout(struct obd_import *imp) +{ + time64_t now = ktime_get_real_seconds(); + struct ptlrpc_request *req; + time64_t timeout = 0; + + spin_lock(&imp->imp_lock); + list_for_each_entry(req, &imp->imp_sending_list, rq_list) + timeout = max(ptlrpc_inflight_deadline(req, now), timeout); + spin_unlock(&imp->imp_lock); + return timeout; +} + +/** + * This function will invalidate the import, if necessary, then block + * for all the RPC completions, and finally notify the obd to + * invalidate its state (ie cancel locks, clear pending requests, + * etc). 
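+ *
+ * The wait proceeds in bounded steps: each pass sleeps up to the longest
+ * remaining inflight request deadline plus a third (falling back to
+ * obd_timeout when no deadline is pending), then re-checks imp_inflight
+ * until it drains to zero.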
+ */ +void ptlrpc_invalidate_import(struct obd_import *imp) +{ + struct ptlrpc_request *req; + time64_t timeout; + int rc; + + atomic_inc(&imp->imp_inval_count); + + if (!imp->imp_invalid || imp->imp_obd->obd_no_recov) + ptlrpc_deactivate_import(imp); + + if (OBD_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CONNECT_RACE)) { + OBD_RACE(OBD_FAIL_PTLRPC_CONNECT_RACE); + msleep(10 * MSEC_PER_SEC); + } + CFS_FAIL_TIMEOUT(OBD_FAIL_MGS_CONNECT_NET, 3 * cfs_fail_val / 2); + LASSERT(imp->imp_invalid); + + /* Wait forever until inflight == 0. We really can't do it another + * way because in some cases we need to wait for very long reply + * unlink. We can't do anything before that because there is really + * no guarantee that some rdma transfer is not in progress right now. + */ + do { + long timeout_jiffies; + + /* Calculate max timeout for waiting on rpcs to error + * out. Use obd_timeout if calculated value is smaller + * than it. + */ + if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) { + timeout = ptlrpc_inflight_timeout(imp); + timeout += div_u64(timeout, 3); + + if (timeout == 0) + timeout = obd_timeout; + } else { + /* decrease the interval to increase race condition */ + timeout = 1; + } + + CDEBUG(D_RPCTRACE, "Sleeping %llds for inflight to error out\n", + timeout); + + /* Wait for all requests to error out and call completion + * callbacks. Cap it at obd_timeout -- these should all + * have been locally cancelled by ptlrpc_abort_inflight. + */ + timeout_jiffies = max_t(long, cfs_time_seconds(timeout), 1); + rc = wait_event_idle_timeout( + imp->imp_recovery_waitq, + (atomic_read(&imp->imp_inflight) == 0), + timeout_jiffies); + + if (rc == 0) { + const char *cli_tgt = obd2cli_tgt(imp->imp_obd); + + CERROR("%s: timeout waiting for callback (%d != 0)\n", + cli_tgt, atomic_read(&imp->imp_inflight)); + + spin_lock(&imp->imp_lock); + if (atomic_read(&imp->imp_inflight) == 0) { + int count = atomic_read(&imp->imp_unregistering); + + /* We know that "unregistering" rpcs only can + * survive in sending or delaying lists (they + * maybe waiting for long reply unlink in + * sluggish nets). Let's check this. If there + * is no inflight and unregistering != 0, this + * is bug. */ + LASSERTF(count == 0, "Some RPCs are still " + "unregistering: %d\n", count); + + /* Let's save one loop as soon as inflight have + * dropped to zero. No new inflights possible at + * this point. */ + rc = 1; + } else { + list_for_each_entry(req, &imp->imp_sending_list, + rq_list) { + DEBUG_REQ(D_ERROR, req, + "still on sending list"); + } + list_for_each_entry(req, &imp->imp_delayed_list, + rq_list) { + DEBUG_REQ(D_ERROR, req, + "still on delayed list"); + } + + CERROR("%s: Unregistering RPCs found (%d). " + "Network is sluggish? Waiting for them " + "to error out.\n", cli_tgt, + atomic_read(&imp->imp_unregistering)); + } + spin_unlock(&imp->imp_lock); + } + } while (rc == 0); + + /* + * Let's additionally check that no new rpcs added to import in + * "invalidate" state. 
+ */ + LASSERT(atomic_read(&imp->imp_inflight) == 0); + obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE); + sptlrpc_import_flush_all_ctx(imp); + + atomic_dec(&imp->imp_inval_count); + wake_up(&imp->imp_recovery_waitq); +} +EXPORT_SYMBOL(ptlrpc_invalidate_import); + +/* unset imp_invalid */ +void ptlrpc_activate_import(struct obd_import *imp, bool set_state_full) +{ + struct obd_device *obd = imp->imp_obd; + + spin_lock(&imp->imp_lock); + if (imp->imp_deactive != 0) { + LASSERT(imp->imp_state != LUSTRE_IMP_FULL); + if (imp->imp_state != LUSTRE_IMP_DISCON) + import_set_state_nolock(imp, LUSTRE_IMP_DISCON); + spin_unlock(&imp->imp_lock); + return; + } + if (set_state_full) + import_set_state_nolock(imp, LUSTRE_IMP_FULL); + + imp->imp_invalid = 0; + + spin_unlock(&imp->imp_lock); + obd_import_event(obd, imp, IMP_EVENT_ACTIVE); +} +EXPORT_SYMBOL(ptlrpc_activate_import); + +void ptlrpc_pinger_force(struct obd_import *imp) +{ + CDEBUG(D_HA, "%s: waking up pinger s:%s\n", obd2cli_tgt(imp->imp_obd), + ptlrpc_import_state_name(imp->imp_state)); + + spin_lock(&imp->imp_lock); + imp->imp_force_verify = 1; + spin_unlock(&imp->imp_lock); + + if (imp->imp_state != LUSTRE_IMP_CONNECTING) + ptlrpc_pinger_wake_up(); +} +EXPORT_SYMBOL(ptlrpc_pinger_force); + +void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt) +{ + ENTRY; + + LASSERT(!imp->imp_dlm_fake); + + if (ptlrpc_set_import_discon(imp, conn_cnt, true)) + ptlrpc_pinger_force(imp); + + EXIT; +} + +int ptlrpc_reconnect_import(struct obd_import *imp) +{ +#ifdef CONFIG_LUSTRE_FS_PINGER + long timeout_jiffies = cfs_time_seconds(obd_timeout); + int rc; + + ptlrpc_pinger_force(imp); + + CDEBUG(D_HA, "%s: recovery started, waiting %u seconds\n", + obd2cli_tgt(imp->imp_obd), obd_timeout); + + rc = wait_event_idle_timeout(imp->imp_recovery_waitq, + !ptlrpc_import_in_recovery(imp), + timeout_jiffies); + if (rc == 0) + rc = -ETIMEDOUT; + else + rc = 0; + CDEBUG(D_HA, "%s: recovery finished s:%s\n", obd2cli_tgt(imp->imp_obd), + ptlrpc_import_state_name(imp->imp_state)); + return rc; +#else + ptlrpc_set_import_discon(imp, 0, false); + /* Force a new connect attempt */ + ptlrpc_invalidate_import(imp); + /* Do a fresh connect next time by zeroing the handle */ + ptlrpc_disconnect_import(imp, 1); + /* Wait for all invalidate calls to finish */ + if (atomic_read(&imp->imp_inval_count) > 0) { + int rc; + + rc = l_wait_event_abortable( + imp->imp_recovery_waitq, + (atomic_read(&imp->imp_inval_count) == 0)); + if (rc) + CERROR("Interrupted, inval=%d\n", + atomic_read(&imp->imp_inval_count)); + } + + /* Allow reconnect attempts */ + imp->imp_obd->obd_no_recov = 0; + /* Remove 'invalid' flag */ + ptlrpc_activate_import(imp, false); + /* Attempt a new connect */ + ptlrpc_recover_import(imp, NULL, 0); + return 0; +#endif +} +EXPORT_SYMBOL(ptlrpc_reconnect_import); + +/** + * Connection on import \a imp is changed to another one (if more than one is + * present). 
We typically choose the connection that we have not tried for the
+ * longest time.
+ */
+static int import_select_connection(struct obd_import *imp)
+{
+	struct obd_import_conn *imp_conn = NULL, *conn;
+	struct obd_export *dlmexp;
+	char *target_start;
+	int target_len, tried_all = 1;
+	int rc = 0;
+	ENTRY;
+
+	spin_lock(&imp->imp_lock);
+
+	if (list_empty(&imp->imp_conn_list)) {
+		rc = -EINVAL;
+		CERROR("%s: no connections available: rc = %d\n",
+		       imp->imp_obd->obd_name, rc);
+		GOTO(out_unlock, rc);
+	}
+
+	list_for_each_entry(conn, &imp->imp_conn_list, oic_item) {
+		CDEBUG(D_HA, "%s: connect to NID %s last attempt %lld\n",
+		       imp->imp_obd->obd_name,
+		       libcfs_nidstr(&conn->oic_conn->c_peer.nid),
+		       conn->oic_last_attempt);
+
+		/* If we have not tried this connection since
+		 * the last successful attempt, go with this one.
+		 */
+		if ((conn->oic_last_attempt == 0) ||
+		    conn->oic_last_attempt <= imp->imp_last_success_conn) {
+			imp_conn = conn;
+			tried_all = 0;
+			break;
+		}
+
+		/* If all of the connections have already been tried
+		 * since the last successful connection, just choose the
+		 * least recently used one.
+		 */
+		if (!imp_conn)
+			imp_conn = conn;
+		else if (imp_conn->oic_last_attempt > conn->oic_last_attempt)
+			imp_conn = conn;
+	}
+
+	/* if not found, simply choose the current one */
+	if (!imp_conn || imp->imp_force_reconnect) {
+		LASSERT(imp->imp_conn_current);
+		imp_conn = imp->imp_conn_current;
+		tried_all = 0;
+	}
+	LASSERT(imp_conn->oic_conn);
+
+	/* If we've tried everything, and we're back to the beginning of the
+	 * list, increase our timeout and try again. It will be reset when
+	 * we do finally connect. (FIXME: really we should wait for all network
+	 * state associated with the last connection attempt to drain before
+	 * trying to reconnect on it.)
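+	 * Note that the latency bump applied below happens at most once per
+	 * full pass over the connection list, is capped at
+	 * CONNECTION_SWITCH_MAX, and is reset by at_reinit() once a connect
+	 * finally succeeds.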
+	 */
+	if (tried_all && (imp->imp_conn_list.next == &imp_conn->oic_item)) {
+		struct adaptive_timeout *at = &imp->imp_at.iat_net_latency;
+
+		if (at_get(at) < CONNECTION_SWITCH_MAX) {
+			at_measured(at, at_get(at) + CONNECTION_SWITCH_INC);
+			if (at_get(at) > CONNECTION_SWITCH_MAX)
+				at_reset(at, CONNECTION_SWITCH_MAX);
+		}
+		LASSERT(imp_conn->oic_last_attempt);
+		CDEBUG(D_HA,
+		       "%s: tried all connections, increasing latency to %ds\n",
+		       imp->imp_obd->obd_name, at_get(at));
+	}
+
+	imp_conn->oic_last_attempt = ktime_get_seconds();
+
+	/* switch connection, don't mind if it's the same as the current one */
+	ptlrpc_connection_put(imp->imp_connection);
+	imp->imp_connection = ptlrpc_connection_addref(imp_conn->oic_conn);
+
+	dlmexp = class_conn2export(&imp->imp_dlm_handle);
+	if (!dlmexp)
+		GOTO(out_unlock, rc = -EINVAL);
+	ptlrpc_connection_put(dlmexp->exp_connection);
+	dlmexp->exp_connection = ptlrpc_connection_addref(imp_conn->oic_conn);
+	class_export_put(dlmexp);
+
+	if (imp->imp_conn_current != imp_conn) {
+		if (imp->imp_conn_current) {
+			deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
+				  &target_start, &target_len);
+
+			CDEBUG(D_HA, "%s: Connection changing to"
+			       " %.*s (at %s)\n",
+			       imp->imp_obd->obd_name,
+			       target_len, target_start,
+			       libcfs_nidstr(&imp_conn->oic_conn->c_peer.nid));
+		}
+
+		imp->imp_conn_current = imp_conn;
+	}
+
+	/* The below message is checked in conf-sanity.sh test_35[ab] */
+	CDEBUG(D_HA, "%s: import %p using connection %s/%s\n",
+	       imp->imp_obd->obd_name, imp, imp_conn->oic_uuid.uuid,
+	       libcfs_nidstr(&imp_conn->oic_conn->c_peer.nid));
+
+out_unlock:
+	spin_unlock(&imp->imp_lock);
+	RETURN(rc);
+}
+
+/*
+ * must be called under imp_lock
+ */
+static int ptlrpc_first_transno(struct obd_import *imp, __u64 *transno)
+{
+	struct ptlrpc_request *req;
+
+	/* The requests in committed_list always have smaller transnos than
+	 * the requests in replay_list */
+	if (!list_empty(&imp->imp_committed_list)) {
+		req = list_first_entry(&imp->imp_committed_list,
+				       struct ptlrpc_request, rq_replay_list);
+		*transno = req->rq_transno;
+		if (req->rq_transno == 0) {
+			DEBUG_REQ(D_ERROR, req,
+				  "zero transno in committed_list");
+			LBUG();
+		}
+		return 1;
+	}
+	if (!list_empty(&imp->imp_replay_list)) {
+		/* committed_list is empty at this point, so take the head
+		 * of the replay list */
+		req = list_first_entry(&imp->imp_replay_list,
+				       struct ptlrpc_request, rq_replay_list);
+		*transno = req->rq_transno;
+		if (req->rq_transno == 0) {
+			DEBUG_REQ(D_ERROR, req, "zero transno in replay_list");
+			LBUG();
+		}
+		return 1;
+	}
+	return 0;
+}
+
+int ptlrpc_connect_import(struct obd_import *imp)
+{
+	spin_lock(&imp->imp_lock);
+	return ptlrpc_connect_import_locked(imp);
+}
+
+/**
+ * Attempt to (re)connect import \a imp. This includes all preparations,
+ * initializing the CONNECT RPC request and passing it to ptlrpcd for
+ * actual sending.
+ *
+ * Assumes imp->imp_lock is held, and releases it.
+ *
+ * Returns 0 on success or error code.
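+ *
+ * On failure the import is moved to LUSTRE_IMP_DISCON (see the out:
+ * label below).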
+ */ +int ptlrpc_connect_import_locked(struct obd_import *imp) +{ + struct obd_device *obd = imp->imp_obd; + int initial_connect = 0; + int set_transno = 0; + __u64 committed_before_reconnect = 0; + struct ptlrpc_request *request; + struct obd_connect_data ocd; + char *bufs[] = { NULL, + obd2cli_tgt(imp->imp_obd), + obd->obd_uuid.uuid, + (char *)&imp->imp_dlm_handle, + (char *)&ocd, + NULL }; + struct ptlrpc_connect_async_args *aa; + int rc; + ENTRY; + + assert_spin_locked(&imp->imp_lock); + + if (imp->imp_state == LUSTRE_IMP_CLOSED) { + spin_unlock(&imp->imp_lock); + CERROR("can't connect to a closed import\n"); + RETURN(-EINVAL); + } else if (imp->imp_state == LUSTRE_IMP_FULL) { + spin_unlock(&imp->imp_lock); + CERROR("already connected\n"); + RETURN(0); + } else if (imp->imp_state == LUSTRE_IMP_CONNECTING || + imp->imp_state == LUSTRE_IMP_EVICTED || + imp->imp_connected) { + spin_unlock(&imp->imp_lock); + CERROR("already connecting\n"); + RETURN(-EALREADY); + } + + import_set_state_nolock(imp, LUSTRE_IMP_CONNECTING); + + imp->imp_conn_cnt++; + imp->imp_resend_replay = 0; + + if (!lustre_handle_is_used(&imp->imp_remote_handle)) + initial_connect = 1; + else + committed_before_reconnect = imp->imp_peer_committed_transno; + + set_transno = ptlrpc_first_transno(imp, + &imp->imp_connect_data.ocd_transno); + spin_unlock(&imp->imp_lock); + + rc = import_select_connection(imp); + if (rc) + GOTO(out, rc); + + rc = sptlrpc_import_sec_adapt(imp, NULL, NULL); + if (rc) + GOTO(out, rc); + + /* Reset connect flags to the originally requested flags, in case + * the server is updated on-the-fly we will get the new features. */ + ocd = imp->imp_connect_data; + ocd.ocd_connect_flags = imp->imp_connect_flags_orig; + ocd.ocd_connect_flags2 = imp->imp_connect_flags2_orig; + /* Reset ocd_version each time so the server knows the exact versions */ + ocd.ocd_version = LUSTRE_VERSION_CODE; + imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT; + imp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18; + + rc = obd_reconnect(NULL, imp->imp_obd->obd_self_export, obd, + &obd->obd_uuid, &ocd, NULL); + if (rc) + GOTO(out, rc); + + request = ptlrpc_request_alloc(imp, &RQF_MDS_CONNECT); + if (request == NULL) + GOTO(out, rc = -ENOMEM); + + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(request); + if (rc < 0) { + ptlrpc_request_free(request); + GOTO(out, rc); + } + + bufs[5] = request->rq_sepol; + + req_capsule_set_size(&request->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(request->rq_sepol) ? + strlen(request->rq_sepol) + 1 : 0); + + rc = ptlrpc_request_bufs_pack(request, LUSTRE_OBD_VERSION, + imp->imp_connect_op, bufs, NULL); + if (rc) { + ptlrpc_request_free(request); + GOTO(out, rc); + } + + /* Report the rpc service time to the server so that it knows how long + * to wait for clients to join recovery */ + lustre_msg_set_service_timeout(request->rq_reqmsg, + at_timeout2est(request->rq_timeout)); + + /* The amount of time we give the server to process the connect req. + * import_select_connection will increase the net latency on + * repeated reconnect attempts to cover slow networks. 
+ * We override/ignore the server rpc completion estimate here, + * which may be large if this is a reconnect attempt */ + request->rq_timeout = INITIAL_CONNECT_TIMEOUT; + lustre_msg_set_timeout(request->rq_reqmsg, request->rq_timeout); + + request->rq_no_resend = request->rq_no_delay = 1; + request->rq_send_state = LUSTRE_IMP_CONNECTING; + /* Allow a slightly larger reply for future growth compatibility */ + req_capsule_set_size(&request->rq_pill, &RMF_CONNECT_DATA, RCL_SERVER, + sizeof(struct obd_connect_data)+16*sizeof(__u64)); + ptlrpc_request_set_replen(request); + request->rq_interpret_reply = ptlrpc_connect_interpret; + + aa = ptlrpc_req_async_args(aa, request); + memset(aa, 0, sizeof *aa); + + aa->pcaa_peer_committed = committed_before_reconnect; + aa->pcaa_initial_connect = initial_connect; + + if (aa->pcaa_initial_connect) { + spin_lock(&imp->imp_lock); + imp->imp_replayable = 1; + spin_unlock(&imp->imp_lock); + lustre_msg_add_op_flags(request->rq_reqmsg, + MSG_CONNECT_INITIAL); + } + + if (set_transno) + lustre_msg_add_op_flags(request->rq_reqmsg, + MSG_CONNECT_TRANSNO); + + DEBUG_REQ(D_RPCTRACE, request, "(re)connect request (timeout %d)", + request->rq_timeout); + ptlrpcd_add_req(request); + rc = 0; +out: + if (rc != 0) + import_set_state(imp, LUSTRE_IMP_DISCON); + + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpc_connect_import); + +static void ptlrpc_maybe_ping_import_soon(struct obd_import *imp) +{ + int force_verify; + + spin_lock(&imp->imp_lock); + force_verify = imp->imp_force_verify != 0; + spin_unlock(&imp->imp_lock); + + if (force_verify) + ptlrpc_pinger_wake_up(); +} + +static int ptlrpc_busy_reconnect(int rc) +{ + return (rc == -EBUSY) || (rc == -EAGAIN); +} + +static int ptlrpc_connect_set_flags(struct obd_import *imp, + struct obd_connect_data *ocd, + __u64 old_connect_flags, + struct obd_export *exp, int init_connect) +{ + static bool warned; + struct client_obd *cli = &imp->imp_obd->u.cli; + + spin_lock(&imp->imp_lock); + list_move(&imp->imp_conn_current->oic_item, + &imp->imp_conn_list); + imp->imp_last_success_conn = + imp->imp_conn_current->oic_last_attempt; + + spin_unlock(&imp->imp_lock); + + /* + * We should warn on very new servers, but don't block. This + * ensures forward compatibility and preserves the kernel + * warning found in other Lustre versions. + */ + if ((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && + (ocd->ocd_version > LUSTRE_VERSION_CODE + LUSTRE_VERSION_OFFSET_WARN)) { + const char *newer = "newer than client. Consider upgrading client"; + + if (!warned) { + LCONSOLE_WARN("Client version (%s). Server %s version (%d.%d.%d.%d) is much %s\n", + LUSTRE_VERSION_STRING, + obd2cli_tgt(imp->imp_obd), + OBD_OCD_VERSION_MAJOR(ocd->ocd_version), + OBD_OCD_VERSION_MINOR(ocd->ocd_version), + OBD_OCD_VERSION_PATCH(ocd->ocd_version), + OBD_OCD_VERSION_FIX(ocd->ocd_version), + newer); + warned = true; + } + } + + /* + * Block old servers by default. This prevents the LTS Client and + * LTS - 2 Server mismatch, unless users pass a mount flag. + */ + if ((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && + (ocd->ocd_version < LUSTRE_VERSION_CODE -LUSTRE_VERSION_OFFSET_WARN)) { + const char *older = "older than client. Consider upgrading server"; + + if (!warned || !allow_version_mismatch) { + LCONSOLE_WARN("Client version (%s). 
Server %s version (%d.%d.%d.%d) is much %s\n",
+				      LUSTRE_VERSION_STRING,
+				      obd2cli_tgt(imp->imp_obd),
+				      OBD_OCD_VERSION_MAJOR(ocd->ocd_version),
+				      OBD_OCD_VERSION_MINOR(ocd->ocd_version),
+				      OBD_OCD_VERSION_PATCH(ocd->ocd_version),
+				      OBD_OCD_VERSION_FIX(ocd->ocd_version),
+				      older);
+			warned = true;
+		}
+
+		if (!allow_version_mismatch)
+			return -EPROTO;
+	}
+
+	if (ocd->ocd_connect_flags & OBD_CONNECT_CKSUM) {
+		/* We sent ocd_cksum_types to the server with bits set
+		 * for the algorithms we understand. The server masked off
+		 * the checksum types it doesn't support. */
+		if ((ocd->ocd_cksum_types &
+		     obd_cksum_types_supported_client()) == 0) {
+			LCONSOLE_ERROR("The negotiation of the checksum "
+				       "algorithm to use with server %s "
+				       "failed (%x/%x)\n",
+				       obd2cli_tgt(imp->imp_obd),
+				       ocd->ocd_cksum_types,
+				       obd_cksum_types_supported_client());
+			return -EPROTO;
+		} else {
+			cli->cl_supp_cksum_types = ocd->ocd_cksum_types;
+		}
+	} else {
+		/* The server does not support OBD_CONNECT_CKSUM.
+		 * Enforce ADLER for backward compatibility. */
+		cli->cl_supp_cksum_types = OBD_CKSUM_ADLER;
+	}
+	cli->cl_cksum_type = obd_cksum_type_select(imp->imp_obd->obd_name,
+						   cli->cl_supp_cksum_types,
+						   cli->cl_preferred_cksum_type);
+
+	if (ocd->ocd_connect_flags & OBD_CONNECT_BRW_SIZE)
+		cli->cl_max_pages_per_rpc =
+			min(ocd->ocd_brw_size >> PAGE_SHIFT,
+			    cli->cl_max_pages_per_rpc);
+	else if (imp->imp_connect_op == MDS_CONNECT ||
+		 imp->imp_connect_op == MGS_CONNECT)
+		cli->cl_max_pages_per_rpc = 1;
+
+	LASSERT((cli->cl_max_pages_per_rpc <= PTLRPC_MAX_BRW_PAGES) &&
+		(cli->cl_max_pages_per_rpc > 0));
+
+	client_adjust_max_dirty(cli);
+
+	/* Update the client's max modify RPCs in flight with the value
+	 * returned by the server. */
+	if (ocd->ocd_connect_flags & OBD_CONNECT_MULTIMODRPCS)
+		cli->cl_max_mod_rpcs_in_flight = min(
+			cli->cl_max_mod_rpcs_in_flight,
+			ocd->ocd_maxmodrpcs);
+	else
+		cli->cl_max_mod_rpcs_in_flight = 1;
+
+	/* Reset ns_connect_flags only for the initial connect. They might
+	 * be changed while the filesystem is in use, and resetting them on
+	 * reconnect would lose user settings made before that, such as
+	 * disabling lru_resize. */
+	if (old_connect_flags != exp_connect_flags(exp) || init_connect) {
+		struct ldlm_namespace *ns = imp->imp_obd->obd_namespace;
+		__u64 changed_flags;
+
+		changed_flags =
+			ns->ns_connect_flags ^ ns->ns_orig_connect_flags;
+		CDEBUG(D_HA, "%s: Resetting ns_connect_flags to server "
+		       "flags: %#llx\n", imp->imp_obd->obd_name,
+		       ocd->ocd_connect_flags);
+		ns->ns_connect_flags = (ns->ns_connect_flags & changed_flags) |
+				       (ocd->ocd_connect_flags & ~changed_flags);
+		ns->ns_orig_connect_flags = ocd->ocd_connect_flags;
+	}
+
+	if (ocd->ocd_connect_flags & OBD_CONNECT_AT)
+		/* We need a per-message support flag, because
+		 * a. we don't know if the incoming connect reply
+		 *    supports AT or not (in reply_in_callback)
+		 *    until we unpack it.
+		 * b. a failed-over server means the export and flags are
+		 *    gone (in ptlrpc_send_reply).
+		 * This can only be set when we know AT is supported at
+		 * both ends. */
+		imp->imp_msghdr_flags |= MSGHDR_AT_SUPPORT;
+	else
+		imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT;
+
+	imp->imp_msghdr_flags |= MSGHDR_CKSUM_INCOMPAT18;
+
+	return 0;
+}
+
+/**
+ * Add all replay requests back to the unreplied list before replay starts,
+ * so that the known replied XID only ever increases, even while requests
+ * are being replayed.
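+ *
+ * Requests already linked on the unreplied list are left alone; only
+ * those whose rq_unreplied_list linkage is empty are re-added.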
+ */
+static void ptlrpc_prepare_replay(struct obd_import *imp)
+{
+	struct ptlrpc_request *req;
+
+	if (imp->imp_state != LUSTRE_IMP_REPLAY ||
+	    imp->imp_resend_replay)
+		return;
+
+	/* If the server was restarted during replay, the requests may
+	 * already have been added to the unreplied list by a former
+	 * replay. */
+	spin_lock(&imp->imp_lock);
+
+	list_for_each_entry(req, &imp->imp_committed_list, rq_replay_list) {
+		if (list_empty(&req->rq_unreplied_list))
+			ptlrpc_add_unreplied(req);
+	}
+
+	list_for_each_entry(req, &imp->imp_replay_list, rq_replay_list) {
+		if (list_empty(&req->rq_unreplied_list))
+			ptlrpc_add_unreplied(req);
+	}
+
+	imp->imp_known_replied_xid = ptlrpc_known_replied_xid(imp);
+	spin_unlock(&imp->imp_lock);
+}
+
+/**
+ * interpret_reply callback for connect RPCs.
+ * Looks at the returned status of the connect operation and decides
+ * what to do with the import - i.e. enter recovery, promote it to
+ * full state for normal operation, or disconnect it due to an error.
+ */
+static int ptlrpc_connect_interpret(const struct lu_env *env,
+				    struct ptlrpc_request *request,
+				    void *data, int rc)
+{
+	struct ptlrpc_connect_async_args *aa = data;
+	struct obd_import *imp = request->rq_import;
+	struct lustre_handle old_hdl;
+	__u64 old_connect_flags;
+	timeout_t service_timeout;
+	int msg_flags;
+	struct obd_connect_data *ocd;
+	struct obd_export *exp = NULL;
+	int ret;
+	ENTRY;
+
+	spin_lock(&imp->imp_lock);
+	if (imp->imp_state == LUSTRE_IMP_CLOSED) {
+		imp->imp_connect_tried = 1;
+		spin_unlock(&imp->imp_lock);
+		RETURN(0);
+	}
+
+	imp->imp_connect_error = rc;
+	if (rc) {
+		struct ptlrpc_request *free_req;
+		struct ptlrpc_request *tmp;
+
+		/* abort all delayed requests that initiated this connection
+		 * and cannot be resent */
+		list_for_each_entry_safe(free_req, tmp, &imp->imp_delayed_list,
+					 rq_list) {
+			spin_lock(&free_req->rq_lock);
+			if (free_req->rq_no_resend) {
+				free_req->rq_err = 1;
+				free_req->rq_status = -EIO;
+				ptlrpc_client_wake_req(free_req);
+			}
+			spin_unlock(&free_req->rq_lock);
+		}
+
+		/* if this was a reconnect to a busy export, there is no need
+		 * to select a new target for connecting */
+		imp->imp_force_reconnect = ptlrpc_busy_reconnect(rc);
+		spin_unlock(&imp->imp_lock);
+		GOTO(out, rc);
+	}
+
+	/* LU-7558: indicate that we are interpreting the connect reply;
+	 * ptlrpc_connect_import() will not try to reconnect until the
+	 * interpretation finishes. */
+	imp->imp_connected = 1;
+	spin_unlock(&imp->imp_lock);
+
+	LASSERT(imp->imp_conn_current);
+
+	msg_flags = lustre_msg_get_op_flags(request->rq_repmsg);
+
+	ret = req_capsule_get_size(&request->rq_pill, &RMF_CONNECT_DATA,
+				   RCL_SERVER);
+	/* the obd_connect_data replied by the server is always bigger */
+	ocd = req_capsule_server_sized_get(&request->rq_pill,
+					   &RMF_CONNECT_DATA, ret);
+
+	if (ocd == NULL) {
+		CERROR("%s: no connect data from server\n",
+		       imp->imp_obd->obd_name);
+		rc = -EPROTO;
+		GOTO(out, rc);
+	}
+
+	spin_lock(&imp->imp_lock);
+
+	/* All imports are pingable */
+	imp->imp_pingable = 1;
+	imp->imp_force_reconnect = 0;
+	imp->imp_force_verify = 0;
+
+	imp->imp_connect_data = *ocd;
+
+	CDEBUG(D_HA, "%s: connect to target with instance %u\n",
+	       imp->imp_obd->obd_name, ocd->ocd_instance);
+	exp = class_conn2export(&imp->imp_dlm_handle);
+
+	spin_unlock(&imp->imp_lock);
+
+	if (!exp) {
+		/* This could happen if the export is cleaned up during the
+		 * connect attempt */
+		CERROR("%s: missing export after connect\n",
+		       imp->imp_obd->obd_name);
+		GOTO(out, rc = -ENODEV);
+	}
+
+	/* check that the server granted a subset of the flags we asked for.
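+	 * Any bit the server returns that was not part of
+	 * imp_connect_flags_orig is treated as a protocol error and fails
+	 * the connection with -EPROTO.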
+	 */
+	if ((ocd->ocd_connect_flags & imp->imp_connect_flags_orig) !=
+	    ocd->ocd_connect_flags) {
+		CERROR("%s: Server didn't grant requested subset of flags: "
+		       "asked=%#llx granted=%#llx\n",
+		       imp->imp_obd->obd_name, imp->imp_connect_flags_orig,
+		       ocd->ocd_connect_flags);
+		GOTO(out, rc = -EPROTO);
+	}
+
+	if ((ocd->ocd_connect_flags2 & imp->imp_connect_flags2_orig) !=
+	    ocd->ocd_connect_flags2) {
+		CERROR("%s: Server didn't grant requested subset of flags2: "
+		       "asked=%#llx granted=%#llx\n",
+		       imp->imp_obd->obd_name, imp->imp_connect_flags2_orig,
+		       ocd->ocd_connect_flags2);
+		GOTO(out, rc = -EPROTO);
+	}
+
+	if (!(imp->imp_connect_flags_orig & OBD_CONNECT_LIGHTWEIGHT) &&
+	    (imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS) &&
+	    (imp->imp_connect_flags_orig & OBD_CONNECT_FID) &&
+	    (ocd->ocd_connect_flags & OBD_CONNECT_VERSION)) {
+		__u32 major = OBD_OCD_VERSION_MAJOR(ocd->ocd_version);
+		__u32 minor = OBD_OCD_VERSION_MINOR(ocd->ocd_version);
+		__u32 patch = OBD_OCD_VERSION_PATCH(ocd->ocd_version);
+
+		/* We do not support MDT-MDT interoperation between MDTs of
+		 * different versions because of protocol changes. */
+		if (unlikely(major != LUSTRE_MAJOR ||
+			     minor != LUSTRE_MINOR ||
+			     abs(patch - LUSTRE_PATCH) > 3)) {
+			LCONSOLE_WARN("%s: import %p (%u.%u.%u.%u) tried to "
+				      "connect to an MDT of a different "
+				      "version (%d.%d.%d.%d) %s\n",
+				      imp->imp_obd->obd_name, imp, LUSTRE_MAJOR,
+				      LUSTRE_MINOR, LUSTRE_PATCH, LUSTRE_FIX,
+				      major, minor, patch,
+				      OBD_OCD_VERSION_FIX(ocd->ocd_version),
+				      imp->imp_connection->c_remote_uuid.uuid);
+
+			GOTO(out, rc = -EPROTO);
+		}
+	}
+
+	old_connect_flags = exp_connect_flags(exp);
+	exp->exp_connect_data = *ocd;
+	imp->imp_obd->obd_self_export->exp_connect_data = *ocd;
+
+	/* The net statistics are not valid anymore after a (re-)connect,
+	 * because they may reflect different routing, etc.
+	 */
+	service_timeout = lustre_msg_get_service_timeout(request->rq_repmsg);
+	at_reinit(&imp->imp_at.iat_net_latency, 0, 0);
+	ptlrpc_at_adj_net_latency(request, service_timeout);
+
+	/* Import flags should be updated before waking the import at FULL
+	 * state */
+	rc = ptlrpc_connect_set_flags(imp, ocd, old_connect_flags, exp,
+				      aa->pcaa_initial_connect);
+	class_export_put(exp);
+	exp = NULL;
+
+	if (rc != 0)
+		GOTO(out, rc);
+
+	obd_import_event(imp->imp_obd, imp, IMP_EVENT_OCD);
+
+	if (aa->pcaa_initial_connect) {
+		spin_lock(&imp->imp_lock);
+		if (msg_flags & MSG_CONNECT_REPLAYABLE) {
+			imp->imp_replayable = 1;
+			CDEBUG(D_HA, "connected to replayable target: %s\n",
+			       obd2cli_tgt(imp->imp_obd));
+		} else {
+			imp->imp_replayable = 0;
+		}
+
+		/* if it applies, adjust imp->imp_msg_magic here
+		 * according to the reply flags
+		 */
+
+		imp->imp_remote_handle =
+			*lustre_msg_get_handle(request->rq_repmsg);
+
+		/* Initial connects are allowed for clients with non-random
+		 * uuids when servers are in recovery. Simply signal the
+		 * server that replay is complete and wait in REPLAY_WAIT.
+		 */
+		if (msg_flags & MSG_CONNECT_RECOVERING) {
+			CDEBUG(D_HA, "connect to %s during recovery\n",
+			       obd2cli_tgt(imp->imp_obd));
+			import_set_state_nolock(imp, LUSTRE_IMP_REPLAY_LOCKS);
+			spin_unlock(&imp->imp_lock);
+		} else {
+			spin_unlock(&imp->imp_lock);
+			ptlrpc_activate_import(imp, true);
+		}
+
+		GOTO(finish, rc = 0);
+	}
+
+	/* Determine what recovery state to move the import to.
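+	 *
+	 * The MSG_CONNECT_RECONNECT handling below distinguishes a changed
+	 * server handle (server restart or a prior eviction) from a plain
+	 * network partition where the handle is unchanged.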
+	 */
+	if (MSG_CONNECT_RECONNECT & msg_flags) {
+		memset(&old_hdl, 0, sizeof(old_hdl));
+		if (!memcmp(&old_hdl, lustre_msg_get_handle(request->rq_repmsg),
+			    sizeof(old_hdl))) {
+			LCONSOLE_WARN("Reconnect to %s (at @%s) failed due to "
+				      "bad handle %#llx\n",
+				      obd2cli_tgt(imp->imp_obd),
+				      imp->imp_connection->c_remote_uuid.uuid,
+				      imp->imp_dlm_handle.cookie);
+			GOTO(out, rc = -ENOTCONN);
+		}
+
+		if (memcmp(&imp->imp_remote_handle,
+			   lustre_msg_get_handle(request->rq_repmsg),
+			   sizeof(imp->imp_remote_handle))) {
+			int level = msg_flags & MSG_CONNECT_RECOVERING ?
+				D_HA : D_WARNING;
+
+			/* Bug 16611/14775: if the server handle has changed,
+			 * some sort of disconnection happened. If the server
+			 * is not in recovery, that also means it has already
+			 * erased all of our state because of a previous
+			 * eviction. If it is in recovery, we are safe to
+			 * participate since we can reestablish all of our
+			 * state with the server again.
+			 */
+			if ((MSG_CONNECT_RECOVERING & msg_flags)) {
+				CDEBUG_LIMIT(level,
+					     "%s@%s changed server handle from "
+					     "%#llx to %#llx"
+					     " but is still in recovery\n",
+					     obd2cli_tgt(imp->imp_obd),
+					     imp->imp_connection->c_remote_uuid.uuid,
+					     imp->imp_remote_handle.cookie,
+					     lustre_msg_get_handle(
+						     request->rq_repmsg)->cookie);
+			} else {
+				LCONSOLE_WARN("Evicted from %s (at %s) "
+					      "after server handle changed from "
+					      "%#llx to %#llx\n",
+					      obd2cli_tgt(imp->imp_obd),
+					      imp->imp_connection->
+					      c_remote_uuid.uuid,
+					      imp->imp_remote_handle.cookie,
+					      lustre_msg_get_handle(
+						      request->rq_repmsg)->cookie);
+			}
+
+			imp->imp_remote_handle =
+				*lustre_msg_get_handle(request->rq_repmsg);
+
+			if (!(MSG_CONNECT_RECOVERING & msg_flags)) {
+				import_set_state(imp, LUSTRE_IMP_EVICTED);
+				GOTO(finish, rc = 0);
+			}
+		} else {
+			CDEBUG(D_HA, "reconnected to %s@%s after partition\n",
+			       obd2cli_tgt(imp->imp_obd),
+			       imp->imp_connection->c_remote_uuid.uuid);
+		}
+
+		if (imp->imp_invalid) {
+			CDEBUG(D_HA, "%s: reconnected but import is invalid; "
+			       "marking evicted\n", imp->imp_obd->obd_name);
+			import_set_state(imp, LUSTRE_IMP_EVICTED);
+		} else if (MSG_CONNECT_RECOVERING & msg_flags) {
+			CDEBUG(D_HA, "%s: reconnected to %s during replay\n",
+			       imp->imp_obd->obd_name,
+			       obd2cli_tgt(imp->imp_obd));
+
+			spin_lock(&imp->imp_lock);
+			imp->imp_resend_replay = 1;
+			spin_unlock(&imp->imp_lock);
+
+			import_set_state(imp, imp->imp_replay_state);
+		} else {
+			import_set_state(imp, LUSTRE_IMP_RECOVER);
+		}
+	} else if ((MSG_CONNECT_RECOVERING & msg_flags) && !imp->imp_invalid) {
+		LASSERT(imp->imp_replayable);
+		imp->imp_remote_handle =
+			*lustre_msg_get_handle(request->rq_repmsg);
+		imp->imp_last_replay_transno = 0;
+		imp->imp_replay_cursor = &imp->imp_committed_list;
+		import_set_state(imp, LUSTRE_IMP_REPLAY);
+	} else if ((ocd->ocd_connect_flags & OBD_CONNECT_LIGHTWEIGHT) != 0 &&
+		   !imp->imp_invalid) {
+
+		obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE);
+		/* The below message is checked in recovery-small.sh test_106 */
+		DEBUG_REQ(D_HA, request, "%s: lwp recover",
+			  imp->imp_obd->obd_name);
+		imp->imp_remote_handle =
+			*lustre_msg_get_handle(request->rq_repmsg);
+		import_set_state(imp, LUSTRE_IMP_RECOVER);
+	} else {
+		DEBUG_REQ(D_HA, request,
+			  "%s: evicting (reconnect/recover flags not set: %x)",
+			  imp->imp_obd->obd_name, msg_flags);
+		imp->imp_remote_handle =
+			*lustre_msg_get_handle(request->rq_repmsg);
+		import_set_state(imp, LUSTRE_IMP_EVICTED);
+	}
+
+	/* Sanity checks for a reconnected import.
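+	 *
+	 * The replayable flag must agree with the server, and the server's
+	 * last committed transno must not go backwards.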
+	 */
+	if (!(imp->imp_replayable) != !(msg_flags & MSG_CONNECT_REPLAYABLE))
+		CERROR("imp_replayable flag does not match server after reconnect. We should LBUG right here.\n");
+
+	if (lustre_msg_get_last_committed(request->rq_repmsg) > 0 &&
+	    lustre_msg_get_last_committed(request->rq_repmsg) <
+	    aa->pcaa_peer_committed) {
+		static bool printed;
+
+		/* The below message is checked in recovery-small.sh test_54 */
+		CERROR("%s: went back in time (transno %lld was previously committed, server now claims %lld)!\n",
+		       obd2cli_tgt(imp->imp_obd), aa->pcaa_peer_committed,
+		       lustre_msg_get_last_committed(request->rq_repmsg));
+		if (!printed) {
+			CERROR("For further information, see http://doc.lustre.org/lustre_manual.xhtml#went_back_in_time\n");
+			printed = true;
+		}
+	}
+
+finish:
+	ptlrpc_prepare_replay(imp);
+	rc = ptlrpc_import_recovery_state_machine(imp);
+	if (rc == -ENOTCONN) {
+		CDEBUG(D_HA,
+		       "evicted/aborted by %s@%s during recovery; invalidating and reconnecting\n",
+		       obd2cli_tgt(imp->imp_obd),
+		       imp->imp_connection->c_remote_uuid.uuid);
+		ptlrpc_connect_import(imp);
+		spin_lock(&imp->imp_lock);
+		imp->imp_connected = 0;
+		imp->imp_connect_tried = 1;
+		spin_unlock(&imp->imp_lock);
+		RETURN(0);
+	}
+
+out:
+	if (exp != NULL)
+		class_export_put(exp);
+
+	spin_lock(&imp->imp_lock);
+	imp->imp_connected = 0;
+	imp->imp_connect_tried = 1;
+
+	if (rc != 0) {
+		bool inact = false;
+		time64_t now = ktime_get_seconds();
+		time64_t next_connect;
+
+		import_set_state_nolock(imp, LUSTRE_IMP_DISCON);
+		if (rc == -EACCES) {
+			/*
+			 * Give up trying to reconnect:
+			 * EACCES means the client has no permission to connect.
+			 */
+			imp->imp_obd->obd_no_recov = 1;
+			ptlrpc_deactivate_import_nolock(imp);
+			inact = true;
+		} else if (rc == -EPROTO) {
+			struct obd_connect_data *ocd;
+
+			/* reply message might not be ready */
+			if (request->rq_repmsg == NULL) {
+				spin_unlock(&imp->imp_lock);
+				RETURN(-EPROTO);
+			}
+
+			ocd = req_capsule_server_get(&request->rq_pill,
+						     &RMF_CONNECT_DATA);
+			/* Servers are not supposed to refuse connections from
+			 * clients based on version, only connection feature
+			 * flags. We should never see this from llite, but it
+			 * may be useful for debugging in the future. */
+			if (ocd &&
+			    (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
+			    (ocd->ocd_version != LUSTRE_VERSION_CODE)) {
+				LCONSOLE_ERROR_MSG(0x16a, "Server %s version "
+						   "(%d.%d.%d.%d)"
+						   " refused connection from this client "
+						   "with an incompatible version (%s). "
+						   "Client must be recompiled\n",
+						   obd2cli_tgt(imp->imp_obd),
+						   OBD_OCD_VERSION_MAJOR(ocd->ocd_version),
+						   OBD_OCD_VERSION_MINOR(ocd->ocd_version),
+						   OBD_OCD_VERSION_PATCH(ocd->ocd_version),
+						   OBD_OCD_VERSION_FIX(ocd->ocd_version),
+						   LUSTRE_VERSION_STRING);
+				ptlrpc_deactivate_import_nolock(imp);
+				import_set_state_nolock(imp, LUSTRE_IMP_CLOSED);
+				inact = true;
+			}
+		} else if (rc == -ENODEV || rc == -ETIMEDOUT) {
+			/* ENODEV means there is no service; force reconnection
+			 * to a failover pair if the last attempt happened
+			 * earlier than ptlrpc_next_reconnect would allow.
+			 * ETIMEDOUT can be set on a network error and does not
+			 * guarantee that the request deadline was reached.
+ */ + struct obd_import_conn *conn; + time64_t reconnect_time; + + /* Same as ptlrpc_next_reconnect, but in past */ + reconnect_time = now - INITIAL_CONNECT_TIMEOUT; + list_for_each_entry(conn, &imp->imp_conn_list, + oic_item) { + if (conn->oic_last_attempt <= reconnect_time) { + imp->imp_force_verify = 1; + break; + } + } + } + + next_connect = imp->imp_conn_current->oic_last_attempt + + (request->rq_deadline - request->rq_sent); + spin_unlock(&imp->imp_lock); + + if (inact) + obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE); + + if (rc == -EPROTO) + RETURN(rc); + + /* adjust imp_next_ping to request deadline + 1 and reschedule + * a pinger if import lost processing during CONNECTING or far + * away from request deadline. It could happen when connection + * was initiated outside of pinger, like + * ptlrpc_set_import_discon(). + */ + if (!imp->imp_force_verify && (imp->imp_next_ping <= now || + imp->imp_next_ping > next_connect)) { + imp->imp_next_ping = max(now, next_connect) + 1; + ptlrpc_pinger_wake_up(); + } + + ptlrpc_maybe_ping_import_soon(imp); + + CDEBUG(D_HA, "recovery of %s on %s failed (%d)\n", + obd2cli_tgt(imp->imp_obd), + (char *)imp->imp_connection->c_remote_uuid.uuid, rc); + } else { + spin_unlock(&imp->imp_lock); + } + + wake_up(&imp->imp_recovery_waitq); + RETURN(rc); +} + +/** + * interpret callback for "completed replay" RPCs. + * \see signal_completed_replay + */ +static int completed_replay_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *args, int rc) +{ + ENTRY; + atomic_dec(&req->rq_import->imp_replay_inflight); + if (req->rq_status == 0 && !req->rq_import->imp_vbr_failed) { + ptlrpc_import_recovery_state_machine(req->rq_import); + } else { + if (req->rq_import->imp_vbr_failed) { + CDEBUG(D_WARNING, + "%s: version recovery fails, reconnecting\n", + req->rq_import->imp_obd->obd_name); + } else { + CDEBUG(D_HA, "%s: LAST_REPLAY message error: %d, " + "reconnecting\n", + req->rq_import->imp_obd->obd_name, + req->rq_status); + } + ptlrpc_connect_import(req->rq_import); + } + + RETURN(0); +} + +/** + * Let server know that we have no requests to replay anymore. 
+ * Achieved by just sending a PING request.
+ */
+static int signal_completed_replay(struct obd_import *imp)
+{
+	struct ptlrpc_request *req;
+	ENTRY;
+
+	if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_FINISH_REPLAY)))
+		RETURN(0);
+
+	if (!atomic_add_unless(&imp->imp_replay_inflight, 1, 1))
+		RETURN(0);
+
+	req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING, LUSTRE_OBD_VERSION,
+					OBD_PING);
+	if (req == NULL) {
+		atomic_dec(&imp->imp_replay_inflight);
+		RETURN(-ENOMEM);
+	}
+
+	ptlrpc_request_set_replen(req);
+	req->rq_send_state = LUSTRE_IMP_REPLAY_WAIT;
+	lustre_msg_add_flags(req->rq_reqmsg,
+			     MSG_LOCK_REPLAY_DONE | MSG_REQ_REPLAY_DONE);
+	if (AT_OFF)
+		req->rq_timeout *= 3;
+	req->rq_interpret_reply = completed_replay_interpret;
+
+	ptlrpcd_add_req(req);
+	RETURN(0);
+}
+
+/**
+ * In kernel code, all import invalidation happens in its own
+ * separate thread, so that whatever application happened to encounter
+ * the problem can still be killed or can otherwise continue.
+ */
+static int ptlrpc_invalidate_import_thread(void *data)
+{
+	struct obd_import *imp = data;
+
+	ENTRY;
+	unshare_fs_struct();
+	CDEBUG(D_HA, "thread invalidate import %s to %s@%s\n",
+	       imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd),
+	       imp->imp_connection->c_remote_uuid.uuid);
+
+	if (do_dump_on_eviction(imp->imp_obd)) {
+		CERROR("dump the log upon eviction\n");
+		libcfs_debug_dumplog();
+	}
+
+	ptlrpc_invalidate_import(imp);
+	import_set_state(imp, LUSTRE_IMP_RECOVER);
+	ptlrpc_import_recovery_state_machine(imp);
+
+	class_import_put(imp);
+	RETURN(0);
+}
+
+/**
+ * This is the state machine for client-side recovery on import.
+ *
+ * Typically there are two possible paths. If we come to a server that is
+ * not in recovery, we just enter the IMP_EVICTED state, invalidate our
+ * import state and reconnect from scratch.
+ * If we come to a server that is in recovery, we enter the IMP_REPLAY
+ * import state. We go through our list of requests to replay and send
+ * them to the server one by one.
+ * After sending all requests from the list, we change the import state to
+ * IMP_REPLAY_LOCKS and re-request all the locks we believe we have from
+ * the server, as well as all the locks we don't yet have, and wait for
+ * the server to grant them.
+ * After that we send a special "replay completed" request and change the
+ * import state to IMP_REPLAY_WAIT.
+ * Upon receiving the reply to that "replay completed" RPC we enter the
+ * IMP_RECOVER state and resend all requests from the sending list.
+ * After that we promote the import to the FULL state and send all delayed
+ * requests; the import is fully operational after that.
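+ *
+ * In short:
+ *   server in recovery:     REPLAY -> REPLAY_LOCKS -> REPLAY_WAIT ->
+ *                           RECOVER -> FULL
+ *   server not in recovery: EVICTED -> (invalidate) -> RECOVER -> FULL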
+ * + */ +int ptlrpc_import_recovery_state_machine(struct obd_import *imp) +{ + int rc = 0; + int inflight; + char *target_start; + int target_len; + + ENTRY; + if (imp->imp_state == LUSTRE_IMP_EVICTED) { + struct task_struct *task; + + deuuidify(obd2cli_tgt(imp->imp_obd), NULL, + &target_start, &target_len); + /* Don't care about MGC eviction */ + if (strcmp(imp->imp_obd->obd_type->typ_name, + LUSTRE_MGC_NAME) != 0) { + LCONSOLE_ERROR_MSG(0x167, "%s: This client was evicted " + "by %.*s; in progress operations " + "using this service will fail.\n", + imp->imp_obd->obd_name, target_len, + target_start); + LASSERTF(!obd_lbug_on_eviction, "LBUG upon eviction\n"); + } + CDEBUG(D_HA, "evicted from %s@%s; invalidating\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + /* reset vbr_failed flag upon eviction */ + spin_lock(&imp->imp_lock); + imp->imp_vbr_failed = 0; + spin_unlock(&imp->imp_lock); + + /* bug 17802: XXX client_disconnect_export vs connect request + * race. if client is evicted at this time then we start + * invalidate thread without reference to import and import can + * be freed at same time. */ + class_import_get(imp); + task = kthread_run(ptlrpc_invalidate_import_thread, imp, + "ll_imp_inval"); + if (IS_ERR(task)) { + class_import_put(imp); + rc = PTR_ERR(task); + CERROR("%s: can't start invalidate thread: rc = %d\n", + imp->imp_obd->obd_name, rc); + } else { + rc = 0; + } + RETURN(rc); + } + + if (imp->imp_state == LUSTRE_IMP_REPLAY) { + CDEBUG(D_HA, "replay requested by %s\n", + obd2cli_tgt(imp->imp_obd)); + rc = ptlrpc_replay_next(imp, &inflight); + if (inflight == 0 && + atomic_read(&imp->imp_replay_inflight) == 0) { + import_set_state(imp, LUSTRE_IMP_REPLAY_LOCKS); + rc = ldlm_replay_locks(imp); + if (rc) + GOTO(out, rc); + } + rc = 0; + } + + if (imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS) { + if (atomic_read(&imp->imp_replay_inflight) == 0) { + import_set_state(imp, LUSTRE_IMP_REPLAY_WAIT); + rc = signal_completed_replay(imp); + if (rc) + GOTO(out, rc); + } + } + + if (imp->imp_state == LUSTRE_IMP_REPLAY_WAIT) { + if (atomic_read(&imp->imp_replay_inflight) == 0) { + import_set_state(imp, LUSTRE_IMP_RECOVER); + } + } + + if (imp->imp_state == LUSTRE_IMP_RECOVER) { + struct ptlrpc_connection *conn = imp->imp_connection; + + rc = ptlrpc_resend(imp); + if (rc) + GOTO(out, rc); + ptlrpc_activate_import(imp, true); + + /* Reverse import are flagged with dlm_fake == 1. + * They do not do recovery and connection are not "restored". + */ + if (!imp->imp_dlm_fake) + CDEBUG_LIMIT(imp->imp_was_idle ? 
+ imp->imp_idle_debug : D_CONSOLE, + "%s: Connection restored to %s (at %s)\n", + imp->imp_obd->obd_name, + obd_uuid2str(&conn->c_remote_uuid), + obd_import_nid2str(imp)); + spin_lock(&imp->imp_lock); + imp->imp_was_idle = 0; + spin_unlock(&imp->imp_lock); + } + + if (imp->imp_state == LUSTRE_IMP_FULL) { + wake_up(&imp->imp_recovery_waitq); + ptlrpc_wake_delayed(imp); + } + +out: + RETURN(rc); +} + +static struct ptlrpc_request *ptlrpc_disconnect_prep_req(struct obd_import *imp) +{ + struct ptlrpc_request *req; + int rq_opc, rc = 0; + ENTRY; + + switch (imp->imp_connect_op) { + case OST_CONNECT: + rq_opc = OST_DISCONNECT; + break; + case MDS_CONNECT: + rq_opc = MDS_DISCONNECT; + break; + case MGS_CONNECT: + rq_opc = MGS_DISCONNECT; + break; + default: + rc = -EINVAL; + CERROR("%s: don't know how to disconnect from %s " + "(connect_op %d): rc = %d\n", + imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), + imp->imp_connect_op, rc); + RETURN(ERR_PTR(rc)); + } + + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_DISCONNECT, + LUSTRE_OBD_VERSION, rq_opc); + if (req == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + /* We are disconnecting, do not retry a failed DISCONNECT rpc if + * it fails. We can get through the above with a down server + * if the client doesn't know the server is gone yet. */ + req->rq_no_resend = 1; + + /* We want client umounts to happen quickly, no matter the + server state... */ + req->rq_timeout = min_t(timeout_t, req->rq_timeout, + INITIAL_CONNECT_TIMEOUT); + + req->rq_send_state = LUSTRE_IMP_CONNECTING; + ptlrpc_request_set_replen(req); + + RETURN(req); +} + +int ptlrpc_disconnect_import(struct obd_import *imp, int noclose) +{ + struct ptlrpc_request *req; + int rc = 0; + ENTRY; + + if (imp->imp_obd->obd_force) + GOTO(set_state, rc); + + /* probably the import has been disconnected already being idle */ + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_IDLE) + GOTO(out, rc); + spin_unlock(&imp->imp_lock); + + if (ptlrpc_import_in_recovery(imp)) { + long timeout_jiffies; + time64_t timeout; + + if (AT_OFF) { + if (imp->imp_server_timeout) + timeout = obd_timeout >> 1; + else + timeout = obd_timeout; + } else { + u32 req_portal; + int idx; + + req_portal = imp->imp_client->cli_request_portal; + idx = import_at_get_index(imp, req_portal); + timeout = at_get(&imp->imp_at.iat_service_estimate[idx]); + } + + timeout_jiffies = cfs_time_seconds(timeout); + if (wait_event_idle_timeout(imp->imp_recovery_waitq, + !ptlrpc_import_in_recovery(imp), + timeout_jiffies) == 0 && + l_wait_event_abortable(imp->imp_recovery_waitq, + !ptlrpc_import_in_recovery(imp)) < 0) + rc = -EINTR; + } + + req = ptlrpc_disconnect_prep_req(imp); + if (IS_ERR(req)) + GOTO(set_state, rc = PTR_ERR(req)); + + spin_lock(&imp->imp_lock); + if (imp->imp_state != LUSTRE_IMP_FULL) { + ptlrpc_req_finished_with_imp_lock(req); + GOTO(out, rc); + } + import_set_state_nolock(imp, LUSTRE_IMP_CONNECTING); + spin_unlock(&imp->imp_lock); + + rc = ptlrpc_queue_wait(req); + ptlrpc_req_finished(req); + +set_state: + spin_lock(&imp->imp_lock); +out: + if (noclose) + import_set_state_nolock(imp, LUSTRE_IMP_DISCON); + else + import_set_state_nolock(imp, LUSTRE_IMP_CLOSED); + memset(&imp->imp_remote_handle, 0, sizeof(imp->imp_remote_handle)); + spin_unlock(&imp->imp_lock); + + obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON); + if (!noclose) + obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE); + + if (rc == -ETIMEDOUT || rc == -ENOTCONN || rc == -ESHUTDOWN) + rc = 0; + RETURN(rc); +} 
+EXPORT_SYMBOL(ptlrpc_disconnect_import); + +static void ptlrpc_reset_reqs_generation(struct obd_import *imp) +{ + struct ptlrpc_request *old, *tmp; + + /* tag all resendable requests generated before disconnection + * notice this code is part of disconnect-at-idle path only */ + list_for_each_entry_safe(old, tmp, &imp->imp_delayed_list, + rq_list) { + spin_lock(&old->rq_lock); + if (old->rq_import_generation == imp->imp_generation - 1 && + ((imp->imp_initiated_at == imp->imp_generation) || + !old->rq_no_resend)) + old->rq_import_generation = imp->imp_generation; + spin_unlock(&old->rq_lock); + } +} + +static int ptlrpc_disconnect_idle_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *args, int rc) +{ + struct obd_import *imp = req->rq_import; + int connect = 0; + + DEBUG_REQ(D_HA, req, "inflight=%d, refcount=%d: rc = %d", + atomic_read(&imp->imp_inflight), + refcount_read(&imp->imp_refcount), rc); + + spin_lock(&imp->imp_lock); + /* DISCONNECT reply can be late and another connection can just + * be initiated. so we have to abort disconnection. */ + if (req->rq_import_generation == imp->imp_generation && + imp->imp_state != LUSTRE_IMP_CLOSED) { + LASSERTF(imp->imp_state == LUSTRE_IMP_CONNECTING, + "%s\n", ptlrpc_import_state_name(imp->imp_state)); + memset(&imp->imp_remote_handle, 0, + sizeof(imp->imp_remote_handle)); + /* take our DISCONNECT into account */ + if (atomic_read(&imp->imp_reqs) > 1) { + imp->imp_generation++; + imp->imp_initiated_at = imp->imp_generation; + import_set_state_nolock(imp, LUSTRE_IMP_NEW); + ptlrpc_reset_reqs_generation(imp); + connect = 1; + } else { + /* do not expose transient IDLE state */ + import_set_state_nolock(imp, LUSTRE_IMP_IDLE); + } + } + + if (connect) { + rc = ptlrpc_connect_import_locked(imp); + if (rc >= 0) + ptlrpc_pinger_add_import(imp); + } else { + spin_unlock(&imp->imp_lock); + } + + return 0; +} + +static bool ptlrpc_can_idle(struct obd_import *imp) +{ + struct ldlm_namespace *ns = imp->imp_obd->obd_namespace; + + /* one request for disconnect rpc */ + if (atomic_read(&imp->imp_reqs) > 1) + return false; + + /* any lock increases ns_bref being a resource holder */ + if (ns && atomic_read(&ns->ns_bref) > 0) + return false; + + return true; +} + +int ptlrpc_disconnect_and_idle_import(struct obd_import *imp) +{ + struct ptlrpc_request *req; + ENTRY; + + if (imp->imp_obd->obd_force) + RETURN(0); + + if (ptlrpc_import_in_recovery(imp)) + RETURN(0); + + req = ptlrpc_disconnect_prep_req(imp); + if (IS_ERR(req)) + RETURN(PTR_ERR(req)); + + req->rq_interpret_reply = ptlrpc_disconnect_idle_interpret; + + if (OBD_FAIL_PRECHECK(OBD_FAIL_PTLRPC_IDLE_RACE)) { + __u32 idx; + + server_name2index(imp->imp_obd->obd_name, &idx, NULL); + if (idx == 0) + OBD_RACE(OBD_FAIL_PTLRPC_IDLE_RACE); + } + + spin_lock(&imp->imp_lock); + if (imp->imp_state != LUSTRE_IMP_FULL || !ptlrpc_can_idle(imp)) { + ptlrpc_req_finished_with_imp_lock(req); + spin_unlock(&imp->imp_lock); + RETURN(0); + } + import_set_state_nolock(imp, LUSTRE_IMP_CONNECTING); + /* don't make noise at reconnection */ + imp->imp_was_idle = 1; + spin_unlock(&imp->imp_lock); + + CDEBUG_LIMIT(imp->imp_idle_debug, "%s: disconnect after %llus idle\n", + imp->imp_obd->obd_name, + ktime_get_real_seconds() - imp->imp_last_reply_time); + + ptlrpcd_add_req(req); + + RETURN(1); +} +EXPORT_SYMBOL(ptlrpc_disconnect_and_idle_import); + +void ptlrpc_cleanup_imp(struct obd_import *imp) +{ + ENTRY; + + spin_lock(&imp->imp_lock); + + import_set_state_nolock(imp, LUSTRE_IMP_CLOSED); + 
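+	/* bump the generation so requests issued against the old
+	 * connection can no longer match this import */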
imp->imp_generation++; + ptlrpc_abort_inflight(imp); + + spin_unlock(&imp->imp_lock); + + EXIT; +} + +/* Adaptive Timeout utils */ + +/* Update at_current_timeout with the specified value (bounded by at_min and + * at_max), as well as the AT history "bins". + * - Bin into timeslices using AT_BINS bins. + * - This gives us a max of the last at_history seconds without the storage, + * but still smoothing out a return to normalcy from a slow response. + * - (E.g. remember the maximum latency in each minute of the last 4 minutes.) + */ +timeout_t at_measured(struct adaptive_timeout *at, timeout_t timeout) +{ + timeout_t old_timeout = at->at_current_timeout; + time64_t now = ktime_get_real_seconds(); + long binlimit = max_t(long, at_history / AT_BINS, 1); + + LASSERT(at); + CDEBUG(D_OTHER, "add %u to %p time=%lld v=%u (%u %u %u %u)\n", + timeout, at, now - at->at_binstart, at->at_current_timeout, + at->at_hist[0], at->at_hist[1], at->at_hist[2], at->at_hist[3]); + + if (timeout <= 0) + /* Negative timeouts and 0's don't count, because we never + * want our timeout to drop to 0 or below, and because 0 could + * mean an error + */ + return 0; + + spin_lock(&at->at_lock); + + if (unlikely(at->at_binstart == 0)) { + /* Special case to remove default from history */ + at->at_current_timeout = timeout; + at->at_worst_timeout_ever = timeout; + at->at_worst_timestamp = now; + at->at_hist[0] = timeout; + at->at_binstart = now; + } else if (now - at->at_binstart < binlimit ) { + /* in bin 0 */ + at->at_hist[0] = max_t(timeout_t, timeout, at->at_hist[0]); + at->at_current_timeout = max_t(timeout_t, timeout, + at->at_current_timeout); + } else { + int i, shift; + timeout_t maxv = timeout; + + /* move bins over */ + shift = (u32)(now - at->at_binstart) / binlimit; + LASSERT(shift > 0); + for(i = AT_BINS - 1; i >= 0; i--) { + if (i >= shift) { + at->at_hist[i] = at->at_hist[i - shift]; + maxv = max_t(timeout_t, maxv, at->at_hist[i]); + } else { + at->at_hist[i] = 0; + } + } + at->at_hist[0] = timeout; + at->at_current_timeout = maxv; + at->at_binstart += shift * binlimit; + } + + if (at->at_current_timeout > at->at_worst_timeout_ever) { + at->at_worst_timeout_ever = at->at_current_timeout; + at->at_worst_timestamp = now; + } + + if (at->at_flags & AT_FLG_NOHIST) + /* Only keep last reported val; keeping the rest of the history + * for debugfs only + */ + at->at_current_timeout = timeout; + + if (at_max > 0) + at->at_current_timeout = min_t(timeout_t, + at->at_current_timeout, at_max); + at->at_current_timeout = max_t(timeout_t, at->at_current_timeout, + at_min); + if (at->at_current_timeout != old_timeout) + CDEBUG(D_OTHER, + "AT %p change: old=%u new=%u delta=%d (val=%d) hist %u %u %u %u\n", + at, old_timeout, at->at_current_timeout, + at->at_current_timeout - old_timeout, timeout, + at->at_hist[0], at->at_hist[1], at->at_hist[2], + at->at_hist[3]); + + /* if we changed, report the old timeout value */ + old_timeout = (at->at_current_timeout != old_timeout) ? 
old_timeout : 0; + + spin_unlock(&at->at_lock); + return old_timeout; +} + +/* Find the imp_at index for a given portal; assign if space available */ +int import_at_get_index(struct obd_import *imp, int portal) +{ + struct imp_at *at = &imp->imp_at; + int i; + + for (i = 0; i < IMP_AT_MAX_PORTALS; i++) { + if (at->iat_portal[i] == portal) + return i; + if (at->iat_portal[i] == 0) + /* unused */ + break; + } + + /* Not found in list, add it under a lock */ + spin_lock(&imp->imp_lock); + + /* Check unused under lock */ + for (; i < IMP_AT_MAX_PORTALS; i++) { + if (at->iat_portal[i] == portal) + goto out; + if (at->iat_portal[i] == 0) + /* unused */ + break; + } + + /* Not enough portals? */ + LASSERT(i < IMP_AT_MAX_PORTALS); + + at->iat_portal[i] = portal; +out: + spin_unlock(&imp->imp_lock); + return i; +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/layout.c b/drivers/staging/lustrefsx/lustre/ptlrpc/layout.c new file mode 100644 index 0000000000000..9d20c186a6475 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/layout.c @@ -0,0 +1,2719 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ptlrpc/layout.c + * + * Lustre Metadata Target (mdt) request handler + * + * Author: Nikita Danilov + */ +/* + * This file contains the "capsule/pill" abstraction layered above PTLRPC. + * + * Every struct ptlrpc_request contains a "pill", which points to a description + * of the format that the request conforms to. + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include + +#include +#include +#include +#include + +/* struct ptlrpc_request, lustre_msg* */ +#include +#include +#include + +/* + * RQFs (see below) refer to two struct req_msg_field arrays describing the + * client request and server reply, respectively. + */ +/* empty set of fields... for suitable definition of emptiness. 
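+ * Note that even the "empty" format carries RMF_PTLRPC_BODY; every
+ * message contains at least the ptlrpc body.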
*/ +static const struct req_msg_field *empty[] = { + &RMF_PTLRPC_BODY +}; + +static const struct req_msg_field *mgs_target_info_only[] = { + &RMF_PTLRPC_BODY, + &RMF_MGS_TARGET_INFO +}; + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 18, 53, 0) +static const struct req_msg_field *mgs_set_info[] = { + &RMF_PTLRPC_BODY, + &RMF_MGS_SEND_PARAM +}; +#endif + +static const struct req_msg_field *mgs_config_read_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MGS_CONFIG_BODY +}; + +static const struct req_msg_field *mgs_config_read_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MGS_CONFIG_RES +}; + +static const struct req_msg_field *mdt_body_only[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY +}; + +static const struct req_msg_field *mdt_body_capa[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_CAPA1 +}; + +static const struct req_msg_field *quotactl_only[] = { + &RMF_PTLRPC_BODY, + &RMF_OBD_QUOTACTL +}; + +static const struct req_msg_field *quota_body_only[] = { + &RMF_PTLRPC_BODY, + &RMF_QUOTA_BODY +}; + +static const struct req_msg_field *ldlm_intent_quota_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_QUOTA_BODY +}; + +static const struct req_msg_field *ldlm_intent_quota_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_DLM_LVB, + &RMF_QUOTA_BODY +}; + +static const struct req_msg_field *mdt_close_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_EPOCH, + &RMF_REC_REINT, + &RMF_CAPA1 +}; + +static const struct req_msg_field *mdt_close_intent_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_EPOCH, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CLOSE_DATA, + &RMF_U32 +}; + +static const struct req_msg_field *obd_statfs_server[] = { + &RMF_PTLRPC_BODY, + &RMF_OBD_STATFS +}; + +static const struct req_msg_field *seq_query_client[] = { + &RMF_PTLRPC_BODY, + &RMF_SEQ_OPC, + &RMF_SEQ_RANGE +}; + +static const struct req_msg_field *seq_query_server[] = { + &RMF_PTLRPC_BODY, + &RMF_SEQ_RANGE +}; + +static const struct req_msg_field *fld_query_client[] = { + &RMF_PTLRPC_BODY, + &RMF_FLD_OPC, + &RMF_FLD_MDFLD +}; + +static const struct req_msg_field *fld_query_server[] = { + &RMF_PTLRPC_BODY, + &RMF_FLD_MDFLD +}; + +static const struct req_msg_field *fld_read_client[] = { + &RMF_PTLRPC_BODY, + &RMF_FLD_MDFLD +}; + +static const struct req_msg_field *fld_read_server[] = { + &RMF_PTLRPC_BODY, + &RMF_GENERIC_DATA +}; + +static const struct req_msg_field *mds_getattr_name_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_CAPA1, + &RMF_NAME +}; + +static const struct req_msg_field *mds_reint_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT +}; + +static const struct req_msg_field *mds_reint_create_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME +}; + +static const struct req_msg_field *mds_reint_create_slave_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mds_reint_create_acl_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA, + &RMF_DLM_REQ, + &RMF_FILE_SECCTX_NAME, + &RMF_FILE_SECCTX, + &RMF_SELINUX_POL, + &RMF_FILE_ENCCTX, +}; + +static const struct req_msg_field *mds_reint_create_sym_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_SYMTGT, + &RMF_DLM_REQ, + &RMF_FILE_SECCTX_NAME, + &RMF_FILE_SECCTX, + &RMF_SELINUX_POL, + &RMF_FILE_ENCCTX, +}; + +static const struct req_msg_field *mds_reint_open_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + 
&RMF_EADATA, + &RMF_FILE_SECCTX_NAME, + &RMF_FILE_SECCTX, + &RMF_SELINUX_POL, + &RMF_FILE_ENCCTX, +}; + +static const struct req_msg_field *mds_reint_open_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_CAPA2 +}; + +static const struct req_msg_field *mds_reint_unlink_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_DLM_REQ, + &RMF_SELINUX_POL +}; + +static const struct req_msg_field *mds_reint_link_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + &RMF_DLM_REQ, + &RMF_SELINUX_POL +}; + +static const struct req_msg_field *mds_reint_rename_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + &RMF_SYMTGT, + &RMF_DLM_REQ, + &RMF_SELINUX_POL +}; + +static const struct req_msg_field *mds_reint_migrate_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + &RMF_SYMTGT, + &RMF_DLM_REQ, + &RMF_SELINUX_POL, + &RMF_MDT_EPOCH, + &RMF_CLOSE_DATA, + &RMF_EADATA +}; + +static const struct req_msg_field *mds_last_unlink_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_LOGCOOKIES, + &RMF_CAPA1, + &RMF_CAPA2 +}; + +static const struct req_msg_field *mds_reint_setattr_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_MDT_EPOCH, + &RMF_EADATA, + &RMF_LOGCOOKIES, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mds_reint_setxattr_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA, + &RMF_DLM_REQ, + &RMF_SELINUX_POL +}; + +static const struct req_msg_field *mds_reint_resync[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mdt_swap_layouts[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_SWAP_LAYOUTS, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mds_rmfid_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_FID_ARRAY, + &RMF_CAPA1, + &RMF_CAPA2, +}; + +static const struct req_msg_field *mds_rmfid_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_FID_ARRAY, + &RMF_RCS, +}; + +static const struct req_msg_field *obd_connect_client[] = { + &RMF_PTLRPC_BODY, + &RMF_TGTUUID, + &RMF_CLUUID, + &RMF_CONN, + &RMF_CONNECT_DATA, + &RMF_SELINUX_POL +}; + +static const struct req_msg_field *obd_connect_server[] = { + &RMF_PTLRPC_BODY, + &RMF_CONNECT_DATA +}; + +static const struct req_msg_field *obd_set_info_client[] = { + &RMF_PTLRPC_BODY, + &RMF_SETINFO_KEY, + &RMF_SETINFO_VAL +}; + +static const struct req_msg_field *mdt_set_info_client[] = { + &RMF_PTLRPC_BODY, + &RMF_SETINFO_KEY, + &RMF_SETINFO_VAL, + &RMF_MDT_BODY +}; + +static const struct req_msg_field *ost_grant_shrink_client[] = { + &RMF_PTLRPC_BODY, + &RMF_SETINFO_KEY, + &RMF_OST_BODY +}; + +static const struct req_msg_field *mds_getinfo_client[] = { + &RMF_PTLRPC_BODY, + &RMF_GETINFO_KEY, + &RMF_GETINFO_VALLEN +}; + +static const struct req_msg_field *mds_getinfo_server[] = { + &RMF_PTLRPC_BODY, + &RMF_GETINFO_VAL, +}; + +static const struct req_msg_field *ldlm_enqueue_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *ldlm_enqueue_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP +}; + +static const struct req_msg_field *ldlm_enqueue_lvb_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_DLM_LVB +}; + +static const struct req_msg_field *ldlm_cp_callback_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_DLM_LVB +}; + 
+static const struct req_msg_field *ldlm_gl_callback_desc_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_DLM_GL_DESC +}; + +static const struct req_msg_field *ldlm_gl_callback_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_LVB +}; + +static const struct req_msg_field *ldlm_intent_basic_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, +}; + +static const struct req_msg_field *ldlm_intent_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_REC_REINT +}; + +static const struct req_msg_field *ldlm_intent_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL +}; + +static const struct req_msg_field *ldlm_intent_layout_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_LAYOUT_INTENT, + &RMF_EADATA /* for new layout to be set up */ +}; + +static const struct req_msg_field *ldlm_intent_open_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NIOBUF_INLINE, + &RMF_FILE_SECCTX, + &RMF_FILE_ENCCTX, + &RMF_DEFAULT_MDT_MD, +}; + +static const struct req_msg_field *ldlm_intent_getattr_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_MDT_BODY, /* coincides with mds_getattr_name_client[] */ + &RMF_CAPA1, + &RMF_NAME, + &RMF_FILE_SECCTX_NAME +}; + +static const struct req_msg_field *ldlm_intent_getattr_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_FILE_SECCTX, + &RMF_DEFAULT_MDT_MD, + &RMF_FILE_ENCCTX, +}; + +static const struct req_msg_field *ldlm_intent_create_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_REC_REINT, /* coincides with mds_reint_create_client[] */ + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA, + &RMF_FILE_SECCTX_NAME, + &RMF_FILE_SECCTX, + &RMF_SELINUX_POL, + &RMF_FILE_ENCCTX, +}; + +static const struct req_msg_field *ldlm_intent_open_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_REC_REINT, /* coincides with mds_reint_open_client[] */ + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + &RMF_EADATA, + &RMF_FILE_SECCTX_NAME, + &RMF_FILE_SECCTX, + &RMF_SELINUX_POL, + &RMF_FILE_ENCCTX, +}; + +static const struct req_msg_field *ldlm_intent_getxattr_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_MDT_BODY, + &RMF_CAPA1, + &RMF_SELINUX_POL +}; + +static const struct req_msg_field *ldlm_intent_getxattr_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, /* for req_capsule_extend/mdt_intent_policy */ + &RMF_EADATA, + &RMF_EAVALS, + &RMF_EAVALS_LENS +}; + +static const struct req_msg_field *mds_get_root_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_NAME +}; + +static const struct req_msg_field *mds_getxattr_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA, + &RMF_SELINUX_POL +}; + +static const struct req_msg_field *mds_getxattr_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_EADATA +}; + +static const struct req_msg_field *mds_getattr_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_FILE_ENCCTX, +}; + +static const struct req_msg_field *mds_setattr_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_CAPA2 +}; + +static const struct req_msg_field *llog_origin_handle_create_client[] = { + &RMF_PTLRPC_BODY, + &RMF_LLOGD_BODY, + &RMF_NAME, + 
&RMF_MDT_BODY +}; + +static const struct req_msg_field *llogd_body_only[] = { + &RMF_PTLRPC_BODY, + &RMF_LLOGD_BODY +}; + +static const struct req_msg_field *llog_log_hdr_only[] = { + &RMF_PTLRPC_BODY, + &RMF_LLOG_LOG_HDR +}; + +static const struct req_msg_field *llog_origin_handle_next_block_server[] = { + &RMF_PTLRPC_BODY, + &RMF_LLOGD_BODY, + &RMF_EADATA +}; + +static const struct req_msg_field *obd_idx_read_client[] = { + &RMF_PTLRPC_BODY, + &RMF_IDX_INFO +}; + +static const struct req_msg_field *obd_idx_read_server[] = { + &RMF_PTLRPC_BODY, + &RMF_IDX_INFO +}; + +static const struct req_msg_field *ost_body_only[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY +}; + +static const struct req_msg_field *ost_body_capa[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_CAPA1 +}; + +static const struct req_msg_field *ost_destroy_client[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_DLM_REQ, + &RMF_CAPA1 +}; + + +static const struct req_msg_field *ost_brw_client[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_OBD_IOOBJ, + &RMF_NIOBUF_REMOTE, + &RMF_CAPA1, + &RMF_SHORT_IO +}; + +static const struct req_msg_field *ost_brw_read_server[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_SHORT_IO +}; + +static const struct req_msg_field *ost_brw_write_server[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_RCS +}; + +static const struct req_msg_field *ost_get_info_generic_server[] = { + &RMF_PTLRPC_BODY, + &RMF_GENERIC_DATA, +}; + +static const struct req_msg_field *ost_get_info_generic_client[] = { + &RMF_PTLRPC_BODY, + &RMF_GETINFO_KEY +}; + +static const struct req_msg_field *ost_get_last_id_server[] = { + &RMF_PTLRPC_BODY, + &RMF_OBD_ID +}; + +static const struct req_msg_field *ost_get_last_fid_client[] = { + &RMF_PTLRPC_BODY, + &RMF_GETINFO_KEY, + &RMF_FID, +}; + +static const struct req_msg_field *ost_get_last_fid_server[] = { + &RMF_PTLRPC_BODY, + &RMF_FID, +}; + +static const struct req_msg_field *ost_get_fiemap_client[] = { + &RMF_PTLRPC_BODY, + &RMF_FIEMAP_KEY, + &RMF_FIEMAP_VAL +}; + +static const struct req_msg_field *ost_ladvise[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_OST_LADVISE_HDR, + &RMF_OST_LADVISE, +}; + +static const struct req_msg_field *ost_get_fiemap_server[] = { + &RMF_PTLRPC_BODY, + &RMF_FIEMAP_VAL +}; + +static const struct req_msg_field *mdt_hsm_progress[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDS_HSM_PROGRESS, +}; + +static const struct req_msg_field *mdt_hsm_ct_register[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDS_HSM_ARCHIVE, +}; + +static const struct req_msg_field *mdt_hsm_ct_unregister[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, +}; + +static const struct req_msg_field *mdt_hsm_action_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDS_HSM_CURRENT_ACTION, +}; + +static const struct req_msg_field *mdt_hsm_state_get_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_HSM_USER_STATE, +}; + +static const struct req_msg_field *mdt_hsm_state_set[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_CAPA1, + &RMF_HSM_STATE_SET, +}; + +static const struct req_msg_field *mdt_hsm_request[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDS_HSM_REQUEST, + &RMF_MDS_HSM_USER_ITEM, + &RMF_GENERIC_DATA, +}; + +static const struct req_msg_field *obd_lfsck_request[] = { + &RMF_PTLRPC_BODY, + &RMF_LFSCK_REQUEST, +}; + +static const struct req_msg_field *obd_lfsck_reply[] = { + &RMF_PTLRPC_BODY, + &RMF_LFSCK_REPLY, +}; + +static struct req_format *req_formats[] = { + &RQF_OBD_PING, + &RQF_OBD_SET_INFO, + &RQF_MDT_SET_INFO, + &RQF_OBD_IDX_READ, + 
&RQF_SEC_CTX, + &RQF_MGS_TARGET_REG, +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 18, 53, 0) + &RQF_MGS_SET_INFO, +#endif + &RQF_MGS_CONFIG_READ, + &RQF_SEQ_QUERY, + &RQF_FLD_QUERY, + &RQF_FLD_READ, + &RQF_MDS_CONNECT, + &RQF_MDS_DISCONNECT, + &RQF_MDS_GET_INFO, + &RQF_MDS_GET_ROOT, + &RQF_MDS_STATFS, + &RQF_MDS_STATFS_NEW, + &RQF_MDS_GETATTR, + &RQF_MDS_GETATTR_NAME, + &RQF_MDS_GETXATTR, + &RQF_MDS_SYNC, + &RQF_MDS_CLOSE, + &RQF_MDS_CLOSE_INTENT, + &RQF_MDS_READPAGE, + &RQF_MDS_REINT, + &RQF_MDS_REINT_CREATE, + &RQF_MDS_REINT_CREATE_ACL, + &RQF_MDS_REINT_CREATE_SLAVE, + &RQF_MDS_REINT_CREATE_SYM, + &RQF_MDS_REINT_OPEN, + &RQF_MDS_REINT_UNLINK, + &RQF_MDS_REINT_LINK, + &RQF_MDS_REINT_RENAME, + &RQF_MDS_REINT_MIGRATE, + &RQF_MDS_REINT_SETATTR, + &RQF_MDS_REINT_SETXATTR, + &RQF_MDS_REINT_RESYNC, + &RQF_MDS_QUOTACTL, + &RQF_MDS_HSM_PROGRESS, + &RQF_MDS_HSM_CT_REGISTER, + &RQF_MDS_HSM_CT_UNREGISTER, + &RQF_MDS_HSM_STATE_GET, + &RQF_MDS_HSM_STATE_SET, + &RQF_MDS_HSM_ACTION, + &RQF_MDS_HSM_REQUEST, + &RQF_MDS_SWAP_LAYOUTS, + &RQF_MDS_RMFID, +#ifdef HAVE_SERVER_SUPPORT + &RQF_OUT_UPDATE, +#endif + &RQF_OST_CONNECT, + &RQF_OST_DISCONNECT, + &RQF_OST_QUOTACTL, + &RQF_OST_GETATTR, + &RQF_OST_SETATTR, + &RQF_OST_CREATE, + &RQF_OST_PUNCH, + &RQF_OST_FALLOCATE, + &RQF_OST_SYNC, + &RQF_OST_DESTROY, + &RQF_OST_BRW_READ, + &RQF_OST_BRW_WRITE, + &RQF_OST_STATFS, + &RQF_OST_SET_GRANT_INFO, + &RQF_OST_GET_INFO, + &RQF_OST_GET_INFO_LAST_ID, + &RQF_OST_GET_INFO_LAST_FID, + &RQF_OST_SET_INFO_LAST_FID, + &RQF_OST_GET_INFO_FIEMAP, + &RQF_OST_LADVISE, + &RQF_OST_SEEK, + &RQF_LDLM_ENQUEUE, + &RQF_LDLM_ENQUEUE_LVB, + &RQF_LDLM_CONVERT, + &RQF_LDLM_CANCEL, + &RQF_LDLM_CALLBACK, + &RQF_LDLM_CP_CALLBACK, + &RQF_LDLM_BL_CALLBACK, + &RQF_LDLM_GL_CALLBACK, + &RQF_LDLM_GL_CALLBACK_DESC, + &RQF_LDLM_INTENT, + &RQF_LDLM_INTENT_BASIC, + &RQF_LDLM_INTENT_LAYOUT, + &RQF_LDLM_INTENT_GETATTR, + &RQF_LDLM_INTENT_OPEN, + &RQF_LDLM_INTENT_CREATE, + &RQF_LDLM_INTENT_GETXATTR, + &RQF_LDLM_INTENT_QUOTA, + &RQF_QUOTA_DQACQ, + &RQF_LLOG_ORIGIN_HANDLE_CREATE, + &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK, + &RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK, + &RQF_LLOG_ORIGIN_HANDLE_READ_HEADER, + &RQF_CONNECT, + &RQF_LFSCK_NOTIFY, + &RQF_LFSCK_QUERY, +}; + +struct req_msg_field { + const __u32 rmf_flags; + const char *rmf_name; + /** + * Field length. (-1) means "variable length". If the + * \a RMF_F_STRUCT_ARRAY flag is set the field is also variable-length, + * but the actual size must be a whole multiple of \a rmf_size. + */ + const int rmf_size; + void (*rmf_swabber)(void *); + /** + * Pass buffer size to swabbing function + * \retval > 0 the number of bytes swabbed + * -EOVERFLOW on error + */ + int (*rmf_swab_len)(void *, __u32); + void (*rmf_dumper)(void *); + int rmf_offset[ARRAY_SIZE(req_formats)][RCL_NR]; +}; + +enum rmf_flags { + /** + * The field is a string, must be NUL-terminated. + */ + RMF_F_STRING = BIT(0), + /** + * The field's buffer size need not match the declared \a rmf_size. + */ + RMF_F_NO_SIZE_CHECK = BIT(1), + /** + * The field's buffer size must be a whole multiple of the declared \a + * rmf_size and the \a rmf_swabber function must work on the declared \a + * rmf_size worth of bytes. + */ + RMF_F_STRUCT_ARRAY = BIT(2), +}; + +struct req_capsule; + +/* + * Request fields. 
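+ *
+ * Each field below is declared with DEFINE_MSGF() or DEFINE_MSGFL(); its
+ * per-format offsets (rmf_offset) are computed later by req_layout_init().
+ * As a sketch only (the field, structure and swabber named here are
+ * hypothetical), a fixed-size structure with a registered swabber would
+ * be declared as:
+ *
+ *	struct req_msg_field RMF_EXAMPLE =
+ *		DEFINE_MSGF("example", 0, sizeof(struct example_body),
+ *			    lustre_swab_example_body, NULL);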
+ */ +#define DEFINE_MSGF(name, flags, size, swabber, dumper) { \ + .rmf_name = (name), \ + .rmf_flags = (flags), \ + .rmf_size = (size), \ + .rmf_swabber = (void (*)(void*))(swabber), \ + .rmf_dumper = (void (*)(void*))(dumper) \ +} + +#define DEFINE_MSGFL(name, flags, size, swab_len, dumper) { \ + .rmf_name = (name), \ + .rmf_flags = (flags), \ + .rmf_size = (size), \ + .rmf_swab_len = (int (*)(void *, __u32))(swab_len), \ + .rmf_dumper = (void (*)(void *))(dumper) \ +} + +struct req_msg_field RMF_GENERIC_DATA = + DEFINE_MSGF("generic_data", 0, + -1, NULL, NULL); +EXPORT_SYMBOL(RMF_GENERIC_DATA); + +struct req_msg_field RMF_MGS_TARGET_INFO = + DEFINE_MSGF("mgs_target_info", 0, + sizeof(struct mgs_target_info), + lustre_swab_mgs_target_info, NULL); +EXPORT_SYMBOL(RMF_MGS_TARGET_INFO); + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 18, 53, 0) +struct req_msg_field RMF_MGS_SEND_PARAM = + DEFINE_MSGF("mgs_send_param", 0, + sizeof(struct mgs_send_param), + NULL, NULL); +EXPORT_SYMBOL(RMF_MGS_SEND_PARAM); +#endif + +struct req_msg_field RMF_MGS_CONFIG_BODY = + DEFINE_MSGF("mgs_config_read request", 0, + sizeof(struct mgs_config_body), + lustre_swab_mgs_config_body, NULL); +EXPORT_SYMBOL(RMF_MGS_CONFIG_BODY); + +struct req_msg_field RMF_MGS_CONFIG_RES = + DEFINE_MSGF("mgs_config_read reply ", 0, + sizeof(struct mgs_config_res), + lustre_swab_mgs_config_res, NULL); +EXPORT_SYMBOL(RMF_MGS_CONFIG_RES); + +struct req_msg_field RMF_U32 = + DEFINE_MSGF("generic u32", RMF_F_STRUCT_ARRAY, + sizeof(__u32), lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_U32); + +struct req_msg_field RMF_SETINFO_VAL = + DEFINE_MSGF("setinfo_val", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_SETINFO_VAL); + +struct req_msg_field RMF_GETINFO_KEY = + DEFINE_MSGF("getinfo_key", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_GETINFO_KEY); + +struct req_msg_field RMF_GETINFO_VALLEN = + DEFINE_MSGF("getinfo_vallen", 0, + sizeof(__u32), lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_GETINFO_VALLEN); + +struct req_msg_field RMF_GETINFO_VAL = + DEFINE_MSGF("getinfo_val", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_GETINFO_VAL); + +struct req_msg_field RMF_SEQ_OPC = + DEFINE_MSGF("seq_query_opc", 0, + sizeof(__u32), lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_SEQ_OPC); + +struct req_msg_field RMF_SEQ_RANGE = + DEFINE_MSGF("seq_query_range", 0, + sizeof(struct lu_seq_range), + lustre_swab_lu_seq_range, NULL); +EXPORT_SYMBOL(RMF_SEQ_RANGE); + +struct req_msg_field RMF_FLD_OPC = + DEFINE_MSGF("fld_query_opc", 0, + sizeof(__u32), lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_FLD_OPC); + +struct req_msg_field RMF_FLD_MDFLD = + DEFINE_MSGF("fld_query_mdfld", 0, + sizeof(struct lu_seq_range), + lustre_swab_lu_seq_range, NULL); +EXPORT_SYMBOL(RMF_FLD_MDFLD); + +struct req_msg_field RMF_MDT_BODY = + DEFINE_MSGF("mdt_body", 0, + sizeof(struct mdt_body), lustre_swab_mdt_body, NULL); +EXPORT_SYMBOL(RMF_MDT_BODY); + +struct req_msg_field RMF_OBD_QUOTACTL = + DEFINE_MSGFL("obd_quotactl", + 0, + sizeof(struct obd_quotactl), + lustre_swab_obd_quotactl, NULL); +EXPORT_SYMBOL(RMF_OBD_QUOTACTL); + +struct req_msg_field RMF_QUOTA_BODY = + DEFINE_MSGF("quota_body", 0, + sizeof(struct quota_body), lustre_swab_quota_body, NULL); +EXPORT_SYMBOL(RMF_QUOTA_BODY); + +struct req_msg_field RMF_MDT_EPOCH = + DEFINE_MSGF("mdt_ioepoch", 0, + sizeof(struct mdt_ioepoch), lustre_swab_mdt_ioepoch, NULL); +EXPORT_SYMBOL(RMF_MDT_EPOCH); + +struct req_msg_field RMF_PTLRPC_BODY = + DEFINE_MSGF("ptlrpc_body", 0, + sizeof(struct ptlrpc_body), lustre_swab_ptlrpc_body, 
NULL); +EXPORT_SYMBOL(RMF_PTLRPC_BODY); + +struct req_msg_field RMF_CLOSE_DATA = + DEFINE_MSGF("data_version", 0, + sizeof(struct close_data), lustre_swab_close_data, NULL); +EXPORT_SYMBOL(RMF_CLOSE_DATA); + +struct req_msg_field RMF_OBD_STATFS = + DEFINE_MSGF("obd_statfs", 0, + sizeof(struct obd_statfs), lustre_swab_obd_statfs, NULL); +EXPORT_SYMBOL(RMF_OBD_STATFS); + +struct req_msg_field RMF_SETINFO_KEY = + DEFINE_MSGF("setinfo_key", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_SETINFO_KEY); + +struct req_msg_field RMF_NAME = + DEFINE_MSGF("name", RMF_F_STRING, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_NAME); + +struct req_msg_field RMF_FID_ARRAY = + DEFINE_MSGF("fid_array", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_FID_ARRAY); + +struct req_msg_field RMF_SYMTGT = + DEFINE_MSGF("symtgt", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_SYMTGT); + +struct req_msg_field RMF_TGTUUID = + DEFINE_MSGF("tgtuuid", RMF_F_STRING, sizeof(struct obd_uuid) - 1, NULL, + NULL); +EXPORT_SYMBOL(RMF_TGTUUID); + +struct req_msg_field RMF_CLUUID = + DEFINE_MSGF("cluuid", RMF_F_STRING, sizeof(struct obd_uuid) - 1, NULL, + NULL); +EXPORT_SYMBOL(RMF_CLUUID); + +struct req_msg_field RMF_STRING = + DEFINE_MSGF("string", RMF_F_STRING, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_STRING); + +struct req_msg_field RMF_FILE_SECCTX_NAME = + DEFINE_MSGF("file_secctx_name", RMF_F_STRING, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_FILE_SECCTX_NAME); + +struct req_msg_field RMF_FILE_SECCTX = + DEFINE_MSGF("file_secctx", RMF_F_NO_SIZE_CHECK, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_FILE_SECCTX); + +struct req_msg_field RMF_FILE_ENCCTX = + DEFINE_MSGF("file_encctx", RMF_F_NO_SIZE_CHECK, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_FILE_ENCCTX); + +struct req_msg_field RMF_LLOGD_BODY = + DEFINE_MSGF("llogd_body", 0, + sizeof(struct llogd_body), lustre_swab_llogd_body, NULL); +EXPORT_SYMBOL(RMF_LLOGD_BODY); + +struct req_msg_field RMF_LLOG_LOG_HDR = + DEFINE_MSGF("llog_log_hdr", 0, + sizeof(struct llog_log_hdr), lustre_swab_llog_hdr, NULL); +EXPORT_SYMBOL(RMF_LLOG_LOG_HDR); + +struct req_msg_field RMF_LLOGD_CONN_BODY = + DEFINE_MSGF("llogd_conn_body", 0, + sizeof(struct llogd_conn_body), + lustre_swab_llogd_conn_body, NULL); +EXPORT_SYMBOL(RMF_LLOGD_CONN_BODY); + +/* + * connection handle received in MDS_CONNECT request. + * + * No swabbing needed because struct lustre_handle contains only a 64-bit cookie + * that the client does not interpret at all. 
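+ * The handle is generated by one peer and simply echoed back verbatim by
+ * the other, so it is endian-neutral by construction.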
+ */ +struct req_msg_field RMF_CONN = + DEFINE_MSGF("conn", 0, sizeof(struct lustre_handle), NULL, NULL); +EXPORT_SYMBOL(RMF_CONN); + +struct req_msg_field RMF_CONNECT_DATA = + DEFINE_MSGF("cdata", + RMF_F_NO_SIZE_CHECK /* we allow extra space for interop */, + sizeof(struct obd_connect_data), + lustre_swab_connect, NULL); +EXPORT_SYMBOL(RMF_CONNECT_DATA); + +struct req_msg_field RMF_DLM_REQ = + DEFINE_MSGF("dlm_req", RMF_F_NO_SIZE_CHECK /* ldlm_request_bufsize */, + sizeof(struct ldlm_request), + lustre_swab_ldlm_request, NULL); +EXPORT_SYMBOL(RMF_DLM_REQ); + +struct req_msg_field RMF_DLM_REP = + DEFINE_MSGF("dlm_rep", 0, + sizeof(struct ldlm_reply), lustre_swab_ldlm_reply, NULL); +EXPORT_SYMBOL(RMF_DLM_REP); + +struct req_msg_field RMF_LDLM_INTENT = + DEFINE_MSGF("ldlm_intent", 0, + sizeof(struct ldlm_intent), lustre_swab_ldlm_intent, NULL); +EXPORT_SYMBOL(RMF_LDLM_INTENT); + +struct req_msg_field RMF_DLM_LVB = + DEFINE_MSGF("dlm_lvb", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_DLM_LVB); + +struct req_msg_field RMF_DLM_GL_DESC = + DEFINE_MSGF("dlm_gl_desc", 0, sizeof(union ldlm_gl_desc), NULL, NULL); +EXPORT_SYMBOL(RMF_DLM_GL_DESC); + +struct req_msg_field RMF_MDT_MD = + DEFINE_MSGF("mdt_md", RMF_F_NO_SIZE_CHECK, MIN_MD_SIZE, NULL, NULL); +EXPORT_SYMBOL(RMF_MDT_MD); + +struct req_msg_field RMF_DEFAULT_MDT_MD = + DEFINE_MSGF("default_mdt_md", RMF_F_NO_SIZE_CHECK, MIN_MD_SIZE, NULL, + NULL); +EXPORT_SYMBOL(RMF_DEFAULT_MDT_MD); + +struct req_msg_field RMF_REC_REINT = + DEFINE_MSGF("rec_reint", 0, sizeof(struct mdt_rec_reint), + lustre_swab_mdt_rec_reint, NULL); +EXPORT_SYMBOL(RMF_REC_REINT); + +/* FIXME: this length should be defined as a macro */ +struct req_msg_field RMF_EADATA = DEFINE_MSGF("eadata", 0, -1, + NULL, NULL); +EXPORT_SYMBOL(RMF_EADATA); + +struct req_msg_field RMF_EAVALS = DEFINE_MSGF("eavals", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_EAVALS); + +struct req_msg_field RMF_ACL = DEFINE_MSGF("acl", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_ACL); + +/* FIXME: this should be made to use RMF_F_STRUCT_ARRAY */ +struct req_msg_field RMF_LOGCOOKIES = + DEFINE_MSGF("logcookies", RMF_F_NO_SIZE_CHECK /* multiple cookies */, + sizeof(struct llog_cookie), NULL, NULL); +EXPORT_SYMBOL(RMF_LOGCOOKIES); + +struct req_msg_field RMF_CAPA1 = + DEFINE_MSGF("capa", 0, 0, NULL, NULL); +EXPORT_SYMBOL(RMF_CAPA1); + +struct req_msg_field RMF_CAPA2 = + DEFINE_MSGF("capa", 0, 0, NULL, NULL); +EXPORT_SYMBOL(RMF_CAPA2); + +struct req_msg_field RMF_LAYOUT_INTENT = + DEFINE_MSGF("layout_intent", 0, + sizeof(struct layout_intent), lustre_swab_layout_intent, + NULL); +EXPORT_SYMBOL(RMF_LAYOUT_INTENT); + +struct req_msg_field RMF_SELINUX_POL = + DEFINE_MSGF("selinux_pol", RMF_F_STRING, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_SELINUX_POL); + +/* + * OST request field. 
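+ *
+ * Several of the fields below (RMF_OBD_IOOBJ, RMF_NIOBUF_REMOTE, RMF_RCS)
+ * carry RMF_F_STRUCT_ARRAY: their buffer holds a run of fixed-size
+ * elements, so the buffer length must satisfy len % rmf_size == 0 and
+ * each element is swabbed individually.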
+ */ +struct req_msg_field RMF_OST_BODY = + DEFINE_MSGF("ost_body", 0, + sizeof(struct ost_body), lustre_swab_ost_body, + dump_ost_body); +EXPORT_SYMBOL(RMF_OST_BODY); + +struct req_msg_field RMF_OBD_IOOBJ = + DEFINE_MSGF("obd_ioobj", RMF_F_STRUCT_ARRAY, + sizeof(struct obd_ioobj), lustre_swab_obd_ioobj, dump_ioo); +EXPORT_SYMBOL(RMF_OBD_IOOBJ); + +struct req_msg_field RMF_NIOBUF_REMOTE = + DEFINE_MSGF("niobuf_remote", RMF_F_STRUCT_ARRAY, + sizeof(struct niobuf_remote), lustre_swab_niobuf_remote, + dump_rniobuf); +EXPORT_SYMBOL(RMF_NIOBUF_REMOTE); + +struct req_msg_field RMF_NIOBUF_INLINE = + DEFINE_MSGF("niobuf_inline", RMF_F_NO_SIZE_CHECK, + sizeof(struct niobuf_remote), lustre_swab_niobuf_remote, + dump_rniobuf); +EXPORT_SYMBOL(RMF_NIOBUF_INLINE); + +struct req_msg_field RMF_RCS = + DEFINE_MSGF("niobuf_rcs", RMF_F_STRUCT_ARRAY, sizeof(__u32), + lustre_swab_generic_32s, dump_rcs); +EXPORT_SYMBOL(RMF_RCS); + +struct req_msg_field RMF_EAVALS_LENS = + DEFINE_MSGF("eavals_lens", RMF_F_STRUCT_ARRAY, sizeof(__u32), + lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_EAVALS_LENS); + +struct req_msg_field RMF_OBD_ID = + DEFINE_MSGF("obd_id", 0, + sizeof(__u64), lustre_swab_ost_last_id, NULL); +EXPORT_SYMBOL(RMF_OBD_ID); + +struct req_msg_field RMF_FID = + DEFINE_MSGF("fid", 0, + sizeof(struct lu_fid), lustre_swab_lu_fid, NULL); +EXPORT_SYMBOL(RMF_FID); + +struct req_msg_field RMF_OST_ID = + DEFINE_MSGF("ost_id", 0, + sizeof(struct ost_id), lustre_swab_ost_id, NULL); +EXPORT_SYMBOL(RMF_OST_ID); + +struct req_msg_field RMF_FIEMAP_KEY = + DEFINE_MSGF("fiemap_key", 0, sizeof(struct ll_fiemap_info_key), + lustre_swab_fiemap_info_key, NULL); +EXPORT_SYMBOL(RMF_FIEMAP_KEY); + +struct req_msg_field RMF_FIEMAP_VAL = + DEFINE_MSGFL("fiemap", 0, -1, lustre_swab_fiemap, NULL); +EXPORT_SYMBOL(RMF_FIEMAP_VAL); + +struct req_msg_field RMF_IDX_INFO = + DEFINE_MSGF("idx_info", 0, sizeof(struct idx_info), + lustre_swab_idx_info, NULL); +EXPORT_SYMBOL(RMF_IDX_INFO); +struct req_msg_field RMF_SHORT_IO = + DEFINE_MSGF("short_io", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_SHORT_IO); +struct req_msg_field RMF_HSM_USER_STATE = + DEFINE_MSGF("hsm_user_state", 0, sizeof(struct hsm_user_state), + lustre_swab_hsm_user_state, NULL); +EXPORT_SYMBOL(RMF_HSM_USER_STATE); + +struct req_msg_field RMF_HSM_STATE_SET = + DEFINE_MSGF("hsm_state_set", 0, sizeof(struct hsm_state_set), + lustre_swab_hsm_state_set, NULL); +EXPORT_SYMBOL(RMF_HSM_STATE_SET); + +struct req_msg_field RMF_MDS_HSM_PROGRESS = + DEFINE_MSGF("hsm_progress", 0, sizeof(struct hsm_progress_kernel), + lustre_swab_hsm_progress_kernel, NULL); +EXPORT_SYMBOL(RMF_MDS_HSM_PROGRESS); + +struct req_msg_field RMF_MDS_HSM_CURRENT_ACTION = + DEFINE_MSGF("hsm_current_action", 0, sizeof(struct hsm_current_action), + lustre_swab_hsm_current_action, NULL); +EXPORT_SYMBOL(RMF_MDS_HSM_CURRENT_ACTION); + +struct req_msg_field RMF_MDS_HSM_USER_ITEM = + DEFINE_MSGF("hsm_user_item", RMF_F_STRUCT_ARRAY, + sizeof(struct hsm_user_item), lustre_swab_hsm_user_item, + NULL); +EXPORT_SYMBOL(RMF_MDS_HSM_USER_ITEM); + +struct req_msg_field RMF_MDS_HSM_ARCHIVE = + DEFINE_MSGF("hsm_archive", RMF_F_STRUCT_ARRAY, + sizeof(__u32), lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_MDS_HSM_ARCHIVE); + +struct req_msg_field RMF_MDS_HSM_REQUEST = + DEFINE_MSGF("hsm_request", 0, sizeof(struct hsm_request), + lustre_swab_hsm_request, NULL); +EXPORT_SYMBOL(RMF_MDS_HSM_REQUEST); + +struct req_msg_field RMF_SWAP_LAYOUTS = + DEFINE_MSGF("swap_layouts", 0, sizeof(struct mdc_swap_layouts), + 
lustre_swab_swap_layouts, NULL); +EXPORT_SYMBOL(RMF_SWAP_LAYOUTS); + +struct req_msg_field RMF_LFSCK_REQUEST = + DEFINE_MSGF("lfsck_request", 0, sizeof(struct lfsck_request), + lustre_swab_lfsck_request, NULL); +EXPORT_SYMBOL(RMF_LFSCK_REQUEST); + +struct req_msg_field RMF_LFSCK_REPLY = + DEFINE_MSGF("lfsck_reply", 0, sizeof(struct lfsck_reply), + lustre_swab_lfsck_reply, NULL); +EXPORT_SYMBOL(RMF_LFSCK_REPLY); + +struct req_msg_field RMF_OST_LADVISE_HDR = + DEFINE_MSGF("ladvise_request", 0, + sizeof(struct ladvise_hdr), + lustre_swab_ladvise_hdr, NULL); +EXPORT_SYMBOL(RMF_OST_LADVISE_HDR); + +struct req_msg_field RMF_OST_LADVISE = + DEFINE_MSGF("ladvise_request", RMF_F_STRUCT_ARRAY, + sizeof(struct lu_ladvise), + lustre_swab_ladvise, NULL); +EXPORT_SYMBOL(RMF_OST_LADVISE); + +/* + * Request formats. + */ + +struct req_format { + const char *rf_name; + size_t rf_idx; + struct { + size_t nr; + const struct req_msg_field **d; + } rf_fields[RCL_NR]; +}; + +#define DEFINE_REQ_FMT(name, client, client_nr, server, server_nr) { \ + .rf_name = name, \ + .rf_fields = { \ + [RCL_CLIENT] = { \ + .nr = client_nr, \ + .d = client \ + }, \ + [RCL_SERVER] = { \ + .nr = server_nr, \ + .d = server \ + } \ + } \ +} + +#define DEFINE_REQ_FMT0(name, client, server) \ +DEFINE_REQ_FMT(name, client, ARRAY_SIZE(client), server, ARRAY_SIZE(server)) + +struct req_format RQF_OBD_PING = + DEFINE_REQ_FMT0("OBD_PING", empty, empty); +EXPORT_SYMBOL(RQF_OBD_PING); + +struct req_format RQF_OBD_SET_INFO = + DEFINE_REQ_FMT0("OBD_SET_INFO", obd_set_info_client, empty); +EXPORT_SYMBOL(RQF_OBD_SET_INFO); + +struct req_format RQF_MDT_SET_INFO = + DEFINE_REQ_FMT0("MDT_SET_INFO", mdt_set_info_client, empty); +EXPORT_SYMBOL(RQF_MDT_SET_INFO); + +/* Read index file through the network */ +struct req_format RQF_OBD_IDX_READ = + DEFINE_REQ_FMT0("OBD_IDX_READ", + obd_idx_read_client, obd_idx_read_server); +EXPORT_SYMBOL(RQF_OBD_IDX_READ); + +struct req_format RQF_SEC_CTX = + DEFINE_REQ_FMT0("SEC_CTX", empty, empty); +EXPORT_SYMBOL(RQF_SEC_CTX); + +struct req_format RQF_MGS_TARGET_REG = + DEFINE_REQ_FMT0("MGS_TARGET_REG", mgs_target_info_only, + mgs_target_info_only); +EXPORT_SYMBOL(RQF_MGS_TARGET_REG); + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 18, 53, 0) +struct req_format RQF_MGS_SET_INFO = + DEFINE_REQ_FMT0("MGS_SET_INFO", mgs_set_info, + mgs_set_info); +EXPORT_SYMBOL(RQF_MGS_SET_INFO); +#endif + +struct req_format RQF_MGS_CONFIG_READ = + DEFINE_REQ_FMT0("MGS_CONFIG_READ", mgs_config_read_client, + mgs_config_read_server); +EXPORT_SYMBOL(RQF_MGS_CONFIG_READ); + +struct req_format RQF_SEQ_QUERY = + DEFINE_REQ_FMT0("SEQ_QUERY", seq_query_client, seq_query_server); +EXPORT_SYMBOL(RQF_SEQ_QUERY); + +struct req_format RQF_FLD_QUERY = + DEFINE_REQ_FMT0("FLD_QUERY", fld_query_client, fld_query_server); +EXPORT_SYMBOL(RQF_FLD_QUERY); + +/* The 'fld_read_server' uses 'RMF_GENERIC_DATA' to hold the 'FLD_QUERY' + * RPC reply that is composed of 'struct lu_seq_range_array'. But there + * is not registered swabber function for 'RMF_GENERIC_DATA'. So the RPC + * peers need to handle the RPC reply with fixed little-endian format. + * + * In theory, we can define new structure with some swabber registered to + * handle the 'FLD_QUERY' RPC reply result automatically. But from the + * implementation view, it is not easy to be done within current "struct + * req_msg_field" framework. 
The sequence range array in the RPC
+ * reply is not fixed length; its length depends on the 'lu_seq_range'
+ * count, which is unknown when the RPC buffer is prepared. Generally, for
+ * such flexible-length RPC usage, there will be a field in the RPC layout
+ * to indicate the data length. But for the 'FLD_READ' RPC we have no way
+ * to do that without adding a new length field, which would break the
+ * on-wire RPC protocol and cause interoperability trouble with old peers.
+ */
+struct req_format RQF_FLD_READ =
+	DEFINE_REQ_FMT0("FLD_READ", fld_read_client, fld_read_server);
+EXPORT_SYMBOL(RQF_FLD_READ);
+
+struct req_format RQF_MDS_QUOTACTL =
+	DEFINE_REQ_FMT0("MDS_QUOTACTL", quotactl_only, quotactl_only);
+EXPORT_SYMBOL(RQF_MDS_QUOTACTL);
+
+struct req_format RQF_OST_QUOTACTL =
+	DEFINE_REQ_FMT0("OST_QUOTACTL", quotactl_only, quotactl_only);
+EXPORT_SYMBOL(RQF_OST_QUOTACTL);
+
+struct req_format RQF_QUOTA_DQACQ =
+	DEFINE_REQ_FMT0("QUOTA_DQACQ", quota_body_only, quota_body_only);
+EXPORT_SYMBOL(RQF_QUOTA_DQACQ);
+
+struct req_format RQF_LDLM_INTENT_QUOTA =
+	DEFINE_REQ_FMT0("LDLM_INTENT_QUOTA",
+			ldlm_intent_quota_client,
+			ldlm_intent_quota_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_QUOTA);
+
+struct req_format RQF_MDS_GET_ROOT =
+	DEFINE_REQ_FMT0("MDS_GET_ROOT", mds_get_root_client, mdt_body_capa);
+EXPORT_SYMBOL(RQF_MDS_GET_ROOT);
+
+struct req_format RQF_MDS_STATFS =
+	DEFINE_REQ_FMT0("MDS_STATFS", empty, obd_statfs_server);
+EXPORT_SYMBOL(RQF_MDS_STATFS);
+
+struct req_format RQF_MDS_STATFS_NEW =
+	DEFINE_REQ_FMT0("MDS_STATFS_NEW", mdt_body_only, obd_statfs_server);
+EXPORT_SYMBOL(RQF_MDS_STATFS_NEW);
+
+struct req_format RQF_MDS_SYNC =
+	DEFINE_REQ_FMT0("MDS_SYNC", mdt_body_capa, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_SYNC);
+
+struct req_format RQF_MDS_GETATTR =
+	DEFINE_REQ_FMT0("MDS_GETATTR", mdt_body_capa, mds_getattr_server);
+EXPORT_SYMBOL(RQF_MDS_GETATTR);
+
+struct req_format RQF_MDS_GETXATTR =
+	DEFINE_REQ_FMT0("MDS_GETXATTR",
+			mds_getxattr_client, mds_getxattr_server);
+EXPORT_SYMBOL(RQF_MDS_GETXATTR);
+
+struct req_format RQF_MDS_GETATTR_NAME =
+	DEFINE_REQ_FMT0("MDS_GETATTR_NAME",
+			mds_getattr_name_client, mds_getattr_server);
+EXPORT_SYMBOL(RQF_MDS_GETATTR_NAME);
+
+struct req_format RQF_MDS_REINT =
+	DEFINE_REQ_FMT0("MDS_REINT", mds_reint_client, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_REINT);
+
+struct req_format RQF_MDS_REINT_CREATE =
+	DEFINE_REQ_FMT0("MDS_REINT_CREATE",
+			mds_reint_create_client, mdt_body_capa);
+EXPORT_SYMBOL(RQF_MDS_REINT_CREATE);
+
+struct req_format RQF_MDS_REINT_CREATE_ACL =
+	DEFINE_REQ_FMT0("MDS_REINT_CREATE_ACL",
+			mds_reint_create_acl_client, mdt_body_capa);
+EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_ACL);
+
+struct req_format RQF_MDS_REINT_CREATE_SLAVE =
+	DEFINE_REQ_FMT0("MDS_REINT_CREATE_EA",
+			mds_reint_create_slave_client, mdt_body_capa);
+EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_SLAVE);
+
+struct req_format RQF_MDS_REINT_CREATE_SYM =
+	DEFINE_REQ_FMT0("MDS_REINT_CREATE_SYM",
+			mds_reint_create_sym_client, mdt_body_capa);
+EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_SYM);
+
+struct req_format RQF_MDS_REINT_OPEN =
+	DEFINE_REQ_FMT0("MDS_REINT_OPEN",
+			mds_reint_open_client, mds_reint_open_server);
+EXPORT_SYMBOL(RQF_MDS_REINT_OPEN);
+
+struct req_format RQF_MDS_REINT_UNLINK =
+	DEFINE_REQ_FMT0("MDS_REINT_UNLINK", mds_reint_unlink_client,
+			mds_last_unlink_server);
+EXPORT_SYMBOL(RQF_MDS_REINT_UNLINK);
+
+struct req_format RQF_MDS_REINT_LINK =
+	DEFINE_REQ_FMT0("MDS_REINT_LINK",
+			mds_reint_link_client, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_REINT_LINK); + +struct req_format RQF_MDS_REINT_RENAME = + DEFINE_REQ_FMT0("MDS_REINT_RENAME", mds_reint_rename_client, + mds_last_unlink_server); +EXPORT_SYMBOL(RQF_MDS_REINT_RENAME); + +struct req_format RQF_MDS_REINT_MIGRATE = + DEFINE_REQ_FMT0("MDS_REINT_MIGRATE", mds_reint_migrate_client, + mds_last_unlink_server); +EXPORT_SYMBOL(RQF_MDS_REINT_MIGRATE); + +struct req_format RQF_MDS_REINT_SETATTR = + DEFINE_REQ_FMT0("MDS_REINT_SETATTR", + mds_reint_setattr_client, mds_setattr_server); +EXPORT_SYMBOL(RQF_MDS_REINT_SETATTR); + +struct req_format RQF_MDS_REINT_SETXATTR = + DEFINE_REQ_FMT0("MDS_REINT_SETXATTR", + mds_reint_setxattr_client, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_REINT_SETXATTR); + +struct req_format RQF_MDS_REINT_RESYNC = + DEFINE_REQ_FMT0("MDS_REINT_RESYNC", mds_reint_resync, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_REINT_RESYNC); + +struct req_format RQF_MDS_CONNECT = + DEFINE_REQ_FMT0("MDS_CONNECT", + obd_connect_client, obd_connect_server); +EXPORT_SYMBOL(RQF_MDS_CONNECT); + +struct req_format RQF_MDS_DISCONNECT = + DEFINE_REQ_FMT0("MDS_DISCONNECT", empty, empty); +EXPORT_SYMBOL(RQF_MDS_DISCONNECT); + +struct req_format RQF_MDS_GET_INFO = + DEFINE_REQ_FMT0("MDS_GET_INFO", mds_getinfo_client, + mds_getinfo_server); +EXPORT_SYMBOL(RQF_MDS_GET_INFO); + +struct req_format RQF_LDLM_ENQUEUE = + DEFINE_REQ_FMT0("LDLM_ENQUEUE", + ldlm_enqueue_client, ldlm_enqueue_lvb_server); +EXPORT_SYMBOL(RQF_LDLM_ENQUEUE); + +struct req_format RQF_LDLM_ENQUEUE_LVB = + DEFINE_REQ_FMT0("LDLM_ENQUEUE_LVB", + ldlm_enqueue_client, ldlm_enqueue_lvb_server); +EXPORT_SYMBOL(RQF_LDLM_ENQUEUE_LVB); + +struct req_format RQF_LDLM_CONVERT = + DEFINE_REQ_FMT0("LDLM_CONVERT", + ldlm_enqueue_client, ldlm_enqueue_server); +EXPORT_SYMBOL(RQF_LDLM_CONVERT); + +struct req_format RQF_LDLM_CANCEL = + DEFINE_REQ_FMT0("LDLM_CANCEL", ldlm_enqueue_client, empty); +EXPORT_SYMBOL(RQF_LDLM_CANCEL); + +struct req_format RQF_LDLM_CALLBACK = + DEFINE_REQ_FMT0("LDLM_CALLBACK", ldlm_enqueue_client, empty); +EXPORT_SYMBOL(RQF_LDLM_CALLBACK); + +struct req_format RQF_LDLM_CP_CALLBACK = + DEFINE_REQ_FMT0("LDLM_CP_CALLBACK", ldlm_cp_callback_client, empty); +EXPORT_SYMBOL(RQF_LDLM_CP_CALLBACK); + +struct req_format RQF_LDLM_BL_CALLBACK = + DEFINE_REQ_FMT0("LDLM_BL_CALLBACK", ldlm_enqueue_client, empty); +EXPORT_SYMBOL(RQF_LDLM_BL_CALLBACK); + +struct req_format RQF_LDLM_GL_CALLBACK = + DEFINE_REQ_FMT0("LDLM_GL_CALLBACK", ldlm_enqueue_client, + ldlm_gl_callback_server); +EXPORT_SYMBOL(RQF_LDLM_GL_CALLBACK); + +struct req_format RQF_LDLM_GL_CALLBACK_DESC = + DEFINE_REQ_FMT0("LDLM_GL_CALLBACK", ldlm_gl_callback_desc_client, + ldlm_gl_callback_server); +EXPORT_SYMBOL(RQF_LDLM_GL_CALLBACK_DESC); + +struct req_format RQF_LDLM_INTENT_BASIC = + DEFINE_REQ_FMT0("LDLM_INTENT_BASIC", + ldlm_intent_basic_client, ldlm_enqueue_lvb_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_BASIC); + +struct req_format RQF_LDLM_INTENT = + DEFINE_REQ_FMT0("LDLM_INTENT", + ldlm_intent_client, ldlm_intent_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT); + +struct req_format RQF_LDLM_INTENT_LAYOUT = + DEFINE_REQ_FMT0("LDLM_INTENT_LAYOUT", + ldlm_intent_layout_client, ldlm_enqueue_lvb_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_LAYOUT); + +struct req_format RQF_LDLM_INTENT_GETATTR = + DEFINE_REQ_FMT0("LDLM_INTENT_GETATTR", + ldlm_intent_getattr_client, ldlm_intent_getattr_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_GETATTR); + +struct req_format RQF_LDLM_INTENT_OPEN = + DEFINE_REQ_FMT0("LDLM_INTENT_OPEN", + ldlm_intent_open_client, ldlm_intent_open_server); 
+EXPORT_SYMBOL(RQF_LDLM_INTENT_OPEN); + +struct req_format RQF_LDLM_INTENT_CREATE = + DEFINE_REQ_FMT0("LDLM_INTENT_CREATE", + ldlm_intent_create_client, ldlm_intent_getattr_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_CREATE); + +struct req_format RQF_LDLM_INTENT_GETXATTR = + DEFINE_REQ_FMT0("LDLM_INTENT_GETXATTR", + ldlm_intent_getxattr_client, + ldlm_intent_getxattr_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_GETXATTR); + +struct req_format RQF_MDS_CLOSE = + DEFINE_REQ_FMT0("MDS_CLOSE", + mdt_close_client, mds_last_unlink_server); +EXPORT_SYMBOL(RQF_MDS_CLOSE); + +struct req_format RQF_MDS_CLOSE_INTENT = + DEFINE_REQ_FMT0("MDS_CLOSE_INTENT", + mdt_close_intent_client, mds_last_unlink_server); +EXPORT_SYMBOL(RQF_MDS_CLOSE_INTENT); + +struct req_format RQF_MDS_READPAGE = + DEFINE_REQ_FMT0("MDS_READPAGE", + mdt_body_capa, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_READPAGE); + +struct req_format RQF_MDS_HSM_ACTION = + DEFINE_REQ_FMT0("MDS_HSM_ACTION", mdt_body_capa, mdt_hsm_action_server); +EXPORT_SYMBOL(RQF_MDS_HSM_ACTION); + +struct req_format RQF_MDS_HSM_PROGRESS = + DEFINE_REQ_FMT0("MDS_HSM_PROGRESS", mdt_hsm_progress, empty); +EXPORT_SYMBOL(RQF_MDS_HSM_PROGRESS); + +struct req_format RQF_MDS_HSM_CT_REGISTER = + DEFINE_REQ_FMT0("MDS_HSM_CT_REGISTER", mdt_hsm_ct_register, empty); +EXPORT_SYMBOL(RQF_MDS_HSM_CT_REGISTER); + +struct req_format RQF_MDS_HSM_CT_UNREGISTER = + DEFINE_REQ_FMT0("MDS_HSM_CT_UNREGISTER", mdt_hsm_ct_unregister, empty); +EXPORT_SYMBOL(RQF_MDS_HSM_CT_UNREGISTER); + +struct req_format RQF_MDS_HSM_STATE_GET = + DEFINE_REQ_FMT0("MDS_HSM_STATE_GET", + mdt_body_capa, mdt_hsm_state_get_server); +EXPORT_SYMBOL(RQF_MDS_HSM_STATE_GET); + +struct req_format RQF_MDS_HSM_STATE_SET = + DEFINE_REQ_FMT0("MDS_HSM_STATE_SET", mdt_hsm_state_set, empty); +EXPORT_SYMBOL(RQF_MDS_HSM_STATE_SET); + +struct req_format RQF_MDS_HSM_REQUEST = + DEFINE_REQ_FMT0("MDS_HSM_REQUEST", mdt_hsm_request, empty); +EXPORT_SYMBOL(RQF_MDS_HSM_REQUEST); + +struct req_format RQF_MDS_SWAP_LAYOUTS = + DEFINE_REQ_FMT0("MDS_SWAP_LAYOUTS", + mdt_swap_layouts, empty); +EXPORT_SYMBOL(RQF_MDS_SWAP_LAYOUTS); + +struct req_format RQF_MDS_RMFID = + DEFINE_REQ_FMT0("MDS_RMFID", mds_rmfid_client, + mds_rmfid_server); +EXPORT_SYMBOL(RQF_MDS_RMFID); + +struct req_format RQF_LLOG_ORIGIN_HANDLE_CREATE = + DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_CREATE", + llog_origin_handle_create_client, llogd_body_only); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_CREATE); + +struct req_format RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK = + DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_NEXT_BLOCK", + llogd_body_only, llog_origin_handle_next_block_server); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK); + +struct req_format RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK = + DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_PREV_BLOCK", + llogd_body_only, llog_origin_handle_next_block_server); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK); + +struct req_format RQF_LLOG_ORIGIN_HANDLE_READ_HEADER = + DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_READ_HEADER", + llogd_body_only, llog_log_hdr_only); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_READ_HEADER); + +struct req_format RQF_CONNECT = + DEFINE_REQ_FMT0("CONNECT", obd_connect_client, obd_connect_server); +EXPORT_SYMBOL(RQF_CONNECT); + +struct req_format RQF_OST_CONNECT = + DEFINE_REQ_FMT0("OST_CONNECT", + obd_connect_client, obd_connect_server); +EXPORT_SYMBOL(RQF_OST_CONNECT); + +struct req_format RQF_OST_DISCONNECT = + DEFINE_REQ_FMT0("OST_DISCONNECT", empty, empty); +EXPORT_SYMBOL(RQF_OST_DISCONNECT); + +struct req_format RQF_OST_GETATTR = + 
DEFINE_REQ_FMT0("OST_GETATTR", ost_body_capa, ost_body_only); +EXPORT_SYMBOL(RQF_OST_GETATTR); + +struct req_format RQF_OST_SETATTR = + DEFINE_REQ_FMT0("OST_SETATTR", ost_body_capa, ost_body_only); +EXPORT_SYMBOL(RQF_OST_SETATTR); + +struct req_format RQF_OST_CREATE = + DEFINE_REQ_FMT0("OST_CREATE", ost_body_only, ost_body_only); +EXPORT_SYMBOL(RQF_OST_CREATE); + +struct req_format RQF_OST_PUNCH = + DEFINE_REQ_FMT0("OST_PUNCH", ost_body_capa, ost_body_only); +EXPORT_SYMBOL(RQF_OST_PUNCH); + +struct req_format RQF_OST_FALLOCATE = + DEFINE_REQ_FMT0("OST_FALLOCATE", ost_body_capa, ost_body_only); +EXPORT_SYMBOL(RQF_OST_FALLOCATE); + +struct req_format RQF_OST_SEEK = + DEFINE_REQ_FMT0("OST_SEEK", ost_body_only, ost_body_only); +EXPORT_SYMBOL(RQF_OST_SEEK); + +struct req_format RQF_OST_SYNC = + DEFINE_REQ_FMT0("OST_SYNC", ost_body_capa, ost_body_only); +EXPORT_SYMBOL(RQF_OST_SYNC); + +struct req_format RQF_OST_DESTROY = + DEFINE_REQ_FMT0("OST_DESTROY", ost_destroy_client, ost_body_only); +EXPORT_SYMBOL(RQF_OST_DESTROY); + +struct req_format RQF_OST_BRW_READ = + DEFINE_REQ_FMT0("OST_BRW_READ", ost_brw_client, ost_brw_read_server); +EXPORT_SYMBOL(RQF_OST_BRW_READ); + +struct req_format RQF_OST_BRW_WRITE = + DEFINE_REQ_FMT0("OST_BRW_WRITE", ost_brw_client, ost_brw_write_server); +EXPORT_SYMBOL(RQF_OST_BRW_WRITE); + +struct req_format RQF_OST_STATFS = + DEFINE_REQ_FMT0("OST_STATFS", empty, obd_statfs_server); +EXPORT_SYMBOL(RQF_OST_STATFS); + +struct req_format RQF_OST_SET_GRANT_INFO = + DEFINE_REQ_FMT0("OST_SET_GRANT_INFO", ost_grant_shrink_client, + ost_body_only); +EXPORT_SYMBOL(RQF_OST_SET_GRANT_INFO); + +struct req_format RQF_OST_GET_INFO = + DEFINE_REQ_FMT0("OST_GET_INFO", ost_get_info_generic_client, + ost_get_info_generic_server); +EXPORT_SYMBOL(RQF_OST_GET_INFO); + +struct req_format RQF_OST_GET_INFO_LAST_ID = + DEFINE_REQ_FMT0("OST_GET_INFO_LAST_ID", ost_get_info_generic_client, + ost_get_last_id_server); +EXPORT_SYMBOL(RQF_OST_GET_INFO_LAST_ID); + +struct req_format RQF_OST_GET_INFO_LAST_FID = + DEFINE_REQ_FMT0("OST_GET_INFO_LAST_FID", ost_get_last_fid_client, + ost_get_last_fid_server); +EXPORT_SYMBOL(RQF_OST_GET_INFO_LAST_FID); + +struct req_format RQF_OST_SET_INFO_LAST_FID = + DEFINE_REQ_FMT0("OST_SET_INFO_LAST_FID", obd_set_info_client, + empty); +EXPORT_SYMBOL(RQF_OST_SET_INFO_LAST_FID); + +struct req_format RQF_OST_GET_INFO_FIEMAP = + DEFINE_REQ_FMT0("OST_GET_INFO_FIEMAP", ost_get_fiemap_client, + ost_get_fiemap_server); +EXPORT_SYMBOL(RQF_OST_GET_INFO_FIEMAP); + +struct req_format RQF_LFSCK_NOTIFY = + DEFINE_REQ_FMT0("LFSCK_NOTIFY", obd_lfsck_request, empty); +EXPORT_SYMBOL(RQF_LFSCK_NOTIFY); + +struct req_format RQF_LFSCK_QUERY = + DEFINE_REQ_FMT0("LFSCK_QUERY", obd_lfsck_request, obd_lfsck_reply); +EXPORT_SYMBOL(RQF_LFSCK_QUERY); + +struct req_format RQF_OST_LADVISE = + DEFINE_REQ_FMT0("OST_LADVISE", ost_ladvise, ost_body_only); +EXPORT_SYMBOL(RQF_OST_LADVISE); + +/* Convenience macro */ +#define FMT_FIELD(fmt, i, j) (fmt)->rf_fields[(i)].d[(j)] + +/** + * Initializes the capsule abstraction by computing and setting the \a rf_idx + * field of RQFs and the \a rmf_offset field of RMFs. 
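+ *
+ * Offsets are stored biased by one (k + 1) so that a zero entry in
+ * \a rmf_offset means "field not used by this format";
+ * __req_capsule_offset() removes the bias before use.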
+ */
+int req_layout_init(void)
+{
+	size_t i;
+	size_t j;
+	size_t k;
+	struct req_format *rf = NULL;
+
+	for (i = 0; i < ARRAY_SIZE(req_formats); ++i) {
+		rf = req_formats[i];
+		rf->rf_idx = i;
+		for (j = 0; j < RCL_NR; ++j) {
+			LASSERT(rf->rf_fields[j].nr <= REQ_MAX_FIELD_NR);
+			for (k = 0; k < rf->rf_fields[j].nr; ++k) {
+				struct req_msg_field *field;
+
+				field = (typeof(field))rf->rf_fields[j].d[k];
+				LASSERT(!(field->rmf_flags & RMF_F_STRUCT_ARRAY)
+					|| field->rmf_size > 0);
+				LASSERT(field->rmf_offset[i][j] == 0);
+				/*
+				 * k + 1 to detect unused format/field
+				 * combinations.
+				 */
+				field->rmf_offset[i][j] = k + 1;
+			}
+		}
+	}
+	return 0;
+}
+EXPORT_SYMBOL(req_layout_init);
+
+void req_layout_fini(void)
+{
+}
+EXPORT_SYMBOL(req_layout_fini);
+
+/**
+ * Initializes the expected sizes of each RMF in a \a pill (\a rc_area) to -1.
+ *
+ * Actual/expected field sizes are set elsewhere in functions in this file:
+ * req_capsule_init(), req_capsule_server_pack(), req_capsule_set_size() and
+ * req_capsule_msg_size(). The \a rc_area information is used by
+ * ptlrpc_request_set_replen().
+ */
+void req_capsule_init_area(struct req_capsule *pill)
+{
+	size_t i;
+
+	for (i = 0; i < ARRAY_SIZE(pill->rc_area[RCL_CLIENT]); i++) {
+		pill->rc_area[RCL_CLIENT][i] = -1;
+		pill->rc_area[RCL_SERVER][i] = -1;
+	}
+}
+EXPORT_SYMBOL(req_capsule_init_area);
+
+/**
+ * Initialize a pill.
+ *
+ * The \a location indicates whether the caller is executing on the client side
+ * (RCL_CLIENT) or server side (RCL_SERVER).
+ */
+void req_capsule_init(struct req_capsule *pill,
+		      struct ptlrpc_request *req,
+		      enum req_location location)
+{
+	LASSERT(location == RCL_SERVER || location == RCL_CLIENT);
+
+	/*
+	 * Today all capsules are embedded in ptlrpc_request structs,
+	 * but just in case that ever isn't the case, we don't reach
+	 * into req unless req != NULL and pill is the one embedded in
+	 * the req.
+	 *
+	 * The req->rq_pill_init flag makes it safe to initialize a pill
+	 * twice, which might happen in the OST paths as a result of the
+	 * high-priority RPC queue getting peeked at before ost_handle()
+	 * handles an OST RPC.
+	 */
+	if (req != NULL && pill == &req->rq_pill && req->rq_pill_init)
+		return;
+
+	pill->rc_fmt = NULL;
+	pill->rc_req = req;
+	pill->rc_loc = location;
+	req_capsule_init_area(pill);
+
+	if (req != NULL && pill == &req->rq_pill)
+		req->rq_pill_init = 1;
+}
+EXPORT_SYMBOL(req_capsule_init);
+
+void req_capsule_fini(struct req_capsule *pill)
+{
+}
+EXPORT_SYMBOL(req_capsule_fini);
+
+static int __req_format_is_sane(const struct req_format *fmt)
+{
+	return fmt->rf_idx < ARRAY_SIZE(req_formats) &&
+	       req_formats[fmt->rf_idx] == fmt;
+}
+
+static struct lustre_msg *__req_msg(const struct req_capsule *pill,
+				    enum req_location loc)
+{
+	return loc == RCL_CLIENT ? pill->rc_reqmsg : pill->rc_repmsg;
+}
+
+/**
+ * Set the format (\a fmt) of a \a pill; format changes are not allowed here
+ * (see req_capsule_extend()).
+ */
+void req_capsule_set(struct req_capsule *pill, const struct req_format *fmt)
+{
+	LASSERT(pill->rc_fmt == NULL || pill->rc_fmt == fmt);
+	LASSERT(__req_format_is_sane(fmt));
+
+	pill->rc_fmt = fmt;
+}
+EXPORT_SYMBOL(req_capsule_set);
+
+/**
+ * Fills in any parts of the \a rc_area of a \a pill that haven't been filled in
+ * yet.
+ *
+ * \a rc_area is an array of REQ_MAX_FIELD_NR elements, used to store sizes of
+ * variable-sized fields. The field sizes come from the declared \a rmf_size
+ * field of a \a pill's \a rc_fmt's RMFs.
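+ *
+ * A typical server-side sequence is therefore (a sketch; the field and
+ * length are illustrative only):
+ *
+ *	req_capsule_set_size(pill, &RMF_EADATA, RCL_SERVER, ea_len);
+ *	rc = req_capsule_server_pack(pill);
+ *
+ * Any variable-sized RCL_SERVER field still left at -1 by that point
+ * trips the LASSERT() below.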
+ */ +size_t req_capsule_filled_sizes(struct req_capsule *pill, + enum req_location loc) +{ + const struct req_format *fmt = pill->rc_fmt; + size_t i; + + LASSERT(fmt != NULL); + + for (i = 0; i < fmt->rf_fields[loc].nr; ++i) { + if (pill->rc_area[loc][i] == -1) { + pill->rc_area[loc][i] = + fmt->rf_fields[loc].d[i]->rmf_size; + if (pill->rc_area[loc][i] == -1) { + /* + * Skip the following fields. + * + * If this LASSERT() trips then you're missing a + * call to req_capsule_set_size(). + */ + LASSERT(loc != RCL_SERVER); + break; + } + } + } + return i; +} +EXPORT_SYMBOL(req_capsule_filled_sizes); + +/** + * Capsule equivalent of lustre_pack_request() and lustre_pack_reply(). + * + * This function uses the \a pill's \a rc_area as filled in by + * req_capsule_set_size() or req_capsule_filled_sizes() (the latter is called by + * this function). + */ +int req_capsule_server_pack(struct req_capsule *pill) +{ + const struct req_format *fmt; + int count; + int rc; + + LASSERT(pill->rc_loc == RCL_SERVER); + fmt = pill->rc_fmt; + LASSERT(fmt != NULL); + + count = req_capsule_filled_sizes(pill, RCL_SERVER); + rc = lustre_pack_reply(pill->rc_req, count, + pill->rc_area[RCL_SERVER], NULL); + if (rc != 0) { + DEBUG_REQ(D_ERROR, pill->rc_req, + "Cannot pack %d fields in format '%s'", + count, fmt->rf_name); + } + return rc; +} +EXPORT_SYMBOL(req_capsule_server_pack); + +/** + * Returns the PTLRPC request or reply (\a loc) buffer offset of a \a pill + * corresponding to the given RMF (\a field). + */ +__u32 __req_capsule_offset(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc) +{ + unsigned int offset; + + offset = field->rmf_offset[pill->rc_fmt->rf_idx][loc]; + LASSERTF(offset > 0, "%s:%s, off=%d, loc=%d\n", + pill->rc_fmt->rf_name, + field->rmf_name, offset, loc); + offset--; + + LASSERT(offset < REQ_MAX_FIELD_NR); + return offset; +} + +void req_capsule_set_swabbed(struct req_capsule *pill, enum req_location loc, + __u32 index) +{ + if (loc == RCL_CLIENT) + req_capsule_set_req_swabbed(pill, index); + else + req_capsule_set_rep_swabbed(pill, index); +} + +bool req_capsule_need_swab(struct req_capsule *pill, enum req_location loc, + __u32 index) +{ + if (loc == RCL_CLIENT) + return (req_capsule_req_need_swab(pill) && + !req_capsule_req_swabbed(pill, index)); + + return (req_capsule_rep_need_swab(pill) && + !req_capsule_rep_swabbed(pill, index)); +} + +/** + * Helper for __req_capsule_get(); swabs value / array of values and/or dumps + * them if desired. + */ +static int +swabber_dumper_helper(struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc, + int offset, + void *value, int len, bool dump, void (*swabber)(void *)) +{ + void *p; + int i; + int n; + int size; + int rc = 0; + bool do_swab; + bool array = field->rmf_flags & RMF_F_STRUCT_ARRAY; + + swabber = swabber ?: field->rmf_swabber; + + if (req_capsule_need_swab(pill, loc, offset) && + (swabber != NULL || field->rmf_swab_len != NULL) && value != NULL) + do_swab = true; + else + do_swab = false; + + if (!field->rmf_dumper) + dump = false; + + /* + * We're swabbing an array; swabber() swabs a single array element, so + * swab every element. 
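+	 * For example, a 12-byte buffer for a field with rmf_size == 4
+	 * (such as RMF_RCS) holds n = 12 / 4 = 3 elements, and the loop
+	 * below visits each __u32 in turn.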
+ */ + if (array && (len % field->rmf_size)) { + static const struct req_msg_field *last_field; + + if (field != last_field) { + CERROR("%s: array buffer size %u is not a multiple of element size %u\n", + field->rmf_name, len, field->rmf_size); + last_field = field; + } + } + /* For the non-array cases, the process of swab/dump/swab only + * needs to be done once. (n = 1) + */ + if (!array) + len = field->rmf_size; + for (p = value, i = 0, n = len / field->rmf_size; + i < n; + i++, p += field->rmf_size) { + if (dump) { + CDEBUG(D_RPCTRACE, "Dump of %s%sfield %s element %d follows\n", + do_swab ? "unswabbed " : "", + array ? "array " : "", + field->rmf_name, i); + field->rmf_dumper(p); + } + if (!do_swab) { + if (array) + continue; + else + break; + } + if (!field->rmf_swab_len) { + swabber(p); + } else { + size = field->rmf_swab_len(p, len); + if (size > 0) { + len -= size; + } else { + rc = size; + break; + } + } + if (dump) { + CDEBUG(D_RPCTRACE, "Dump of swabbed %sfield %s, element %d follows\n", + array ? "array " : "", field->rmf_name, i); + field->rmf_dumper(value); + } + } + if (do_swab) + req_capsule_set_swabbed(pill, loc, offset); + + return rc; +} + +/** + * Returns the pointer to a PTLRPC request or reply (\a loc) buffer of a \a pill + * corresponding to the given RMF (\a field). + * + * The buffer will be swabbed using the given \a swabber. If \a swabber == NULL + * then the \a rmf_swabber from the RMF will be used. Soon there will be no + * calls to __req_capsule_get() with a non-NULL \a swabber; \a swabber will then + * be removed. Fields with the \a RMF_F_STRUCT_ARRAY flag set will have each + * element of the array swabbed. + */ +static void *__req_capsule_get(struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc, + void (*swabber)(void *), + bool dump) +{ + const struct req_format *fmt; + struct lustre_msg *msg; + void *value; + __u32 len; + __u32 offset; + + void *(*getter)(struct lustre_msg *m, __u32 n, __u32 minlen); + + static const char *rcl_names[RCL_NR] = { + [RCL_CLIENT] = "client", + [RCL_SERVER] = "server" + }; + + LASSERT(pill != NULL); + LASSERT(pill != LP_POISON); + fmt = pill->rc_fmt; + LASSERT(fmt != NULL); + LASSERT(fmt != LP_POISON); + LASSERT(__req_format_is_sane(fmt)); + + offset = __req_capsule_offset(pill, field, loc); + + msg = __req_msg(pill, loc); + LASSERT(msg != NULL); + + getter = (field->rmf_flags & RMF_F_STRING) ? + (typeof(getter))lustre_msg_string : lustre_msg_buf; + + if (field->rmf_flags & (RMF_F_STRUCT_ARRAY|RMF_F_NO_SIZE_CHECK)) { + /* + * We've already asserted that field->rmf_size > 0 in + * req_layout_init(). + */ + len = lustre_msg_buflen(msg, offset); + if (!(field->rmf_flags & RMF_F_NO_SIZE_CHECK) && + (len % field->rmf_size) != 0) { + CERROR("%s: array field size mismatch " + "%d modulo %u != 0 (%d)\n", + field->rmf_name, len, field->rmf_size, loc); + return NULL; + } + } else if (pill->rc_area[loc][offset] != -1) { + len = pill->rc_area[loc][offset]; + } else { + len = max_t(typeof(field->rmf_size), field->rmf_size, 0); + } + value = getter(msg, offset, len); + + if (value == NULL) { + DEBUG_REQ(D_ERROR, pill->rc_req, + "Wrong buffer for field '%s' (%u of %u) in format '%s', %u vs. 
%u (%s)", + field->rmf_name, offset, lustre_msg_bufcount(msg), + fmt->rf_name, lustre_msg_buflen(msg, offset), len, + rcl_names[loc]); + } else { + swabber_dumper_helper(pill, field, loc, offset, value, len, + dump, swabber); + } + + return value; +} + +/** + * Dump a request and/or reply + */ +void __req_capsule_dump(struct req_capsule *pill, enum req_location loc) +{ + const struct req_format *fmt; + const struct req_msg_field *field; + __u32 len; + size_t i; + + fmt = pill->rc_fmt; + + DEBUG_REQ(D_RPCTRACE, pill->rc_req, "BEGIN REQ CAPSULE DUMP"); + for (i = 0; i < fmt->rf_fields[loc].nr; ++i) { + field = FMT_FIELD(fmt, loc, i); + if (field->rmf_dumper == NULL) { + /* + * FIXME Add a default hex dumper for fields that don't + * have a specific dumper + */ + len = req_capsule_get_size(pill, field, loc); + CDEBUG(D_RPCTRACE, + "Field %s has no dumper function; field size is %u\n", + field->rmf_name, len); + } else { + /* It's dumping side-effect that we're interested in */ + (void) __req_capsule_get(pill, field, loc, NULL, true); + } + } + CDEBUG(D_RPCTRACE, "END REQ CAPSULE DUMP\n"); +} + +/** + * Dump a request. + */ +void req_capsule_client_dump(struct req_capsule *pill) +{ + __req_capsule_dump(pill, RCL_CLIENT); +} +EXPORT_SYMBOL(req_capsule_client_dump); + +/** + * Dump a reply + */ +void req_capsule_server_dump(struct req_capsule *pill) +{ + __req_capsule_dump(pill, RCL_SERVER); +} +EXPORT_SYMBOL(req_capsule_server_dump); + +/** + * Trivial wrapper around __req_capsule_get(), that returns the PTLRPC request + * buffer corresponding to the given RMF (\a field) of a \a pill. + */ +void *req_capsule_client_get(struct req_capsule *pill, + const struct req_msg_field *field) +{ + return __req_capsule_get(pill, field, RCL_CLIENT, NULL, false); +} +EXPORT_SYMBOL(req_capsule_client_get); + +/** + * Same as req_capsule_client_get(), but with a \a swabber argument. + * + * Currently unused; will be removed when req_capsule_server_swab_get() is + * unused too. + */ +void *req_capsule_client_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + void *swabber) +{ + return __req_capsule_get(pill, field, RCL_CLIENT, swabber, false); +} +EXPORT_SYMBOL(req_capsule_client_swab_get); + +/** + * Utility that combines req_capsule_set_size() and req_capsule_client_get(). + * + * First the \a pill's request \a field's size is set (\a rc_area) using + * req_capsule_set_size() with the given \a len. Then the actual buffer is + * returned. + */ +void *req_capsule_client_sized_get(struct req_capsule *pill, + const struct req_msg_field *field, + __u32 len) +{ + req_capsule_set_size(pill, field, RCL_CLIENT, len); + return __req_capsule_get(pill, field, RCL_CLIENT, NULL, false); +} +EXPORT_SYMBOL(req_capsule_client_sized_get); + +/** + * Trivial wrapper around __req_capsule_get(), that returns the PTLRPC reply + * buffer corresponding to the given RMF (\a field) of a \a pill. + */ +void *req_capsule_server_get(struct req_capsule *pill, + const struct req_msg_field *field) +{ + return __req_capsule_get(pill, field, RCL_SERVER, NULL, false); +} +EXPORT_SYMBOL(req_capsule_server_get); + +/** + * Same as req_capsule_server_get(), but with a \a swabber argument. + * + * Ideally all swabbing should be done pursuant to RMF definitions, with no + * swabbing done outside this capsule abstraction. 
+ */
+void *req_capsule_server_swab_get(struct req_capsule *pill,
+				  const struct req_msg_field *field,
+				  void *swabber)
+{
+	return __req_capsule_get(pill, field, RCL_SERVER, swabber, false);
+}
+EXPORT_SYMBOL(req_capsule_server_swab_get);
+
+/**
+ * Utility that combines req_capsule_set_size() and req_capsule_server_get().
+ *
+ * First the \a pill's request \a field's size is set (\a rc_area) using
+ * req_capsule_set_size() with the given \a len. Then the actual buffer is
+ * returned.
+ */
+void *req_capsule_server_sized_get(struct req_capsule *pill,
+				   const struct req_msg_field *field,
+				   __u32 len)
+{
+	req_capsule_set_size(pill, field, RCL_SERVER, len);
+	return __req_capsule_get(pill, field, RCL_SERVER, NULL, false);
+}
+EXPORT_SYMBOL(req_capsule_server_sized_get);
+
+void *req_capsule_server_sized_swab_get(struct req_capsule *pill,
+					const struct req_msg_field *field,
+					__u32 len, void *swabber)
+{
+	req_capsule_set_size(pill, field, RCL_SERVER, len);
+	return __req_capsule_get(pill, field, RCL_SERVER, swabber, false);
+}
+EXPORT_SYMBOL(req_capsule_server_sized_swab_get);
+
+/**
+ * Returns the buffer of a \a pill corresponding to the given \a field from the
+ * request (if the caller is executing on the server-side) or reply (if the
+ * caller is executing on the client-side).
+ *
+ * This function is convenient for use in code that could be executed on the
+ * client and server alike.
+ */
+const void *req_capsule_other_get(struct req_capsule *pill,
+				  const struct req_msg_field *field)
+{
+	return __req_capsule_get(pill, field, pill->rc_loc ^ 1, NULL, false);
+}
+EXPORT_SYMBOL(req_capsule_other_get);
+
+/**
+ * Set the size of the PTLRPC request/reply (\a loc) buffer for the given \a
+ * field of the given \a pill.
+ *
+ * This function must be used when constructing variable sized fields of a
+ * request or reply.
+ */
+void req_capsule_set_size(struct req_capsule *pill,
+			  const struct req_msg_field *field,
+			  enum req_location loc, __u32 size)
+{
+	LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT);
+
+	if ((size != (__u32)field->rmf_size) &&
+	    (field->rmf_size != -1) &&
+	    !(field->rmf_flags & RMF_F_NO_SIZE_CHECK) &&
+	    (size > 0)) {
+		__u32 rmf_size = (__u32)field->rmf_size;
+
+		if ((field->rmf_flags & RMF_F_STRUCT_ARRAY) &&
+		    (size % rmf_size != 0)) {
+			CERROR("%s: array field size mismatch "
+			       "%u %% %u != 0 (%d)\n",
+			       field->rmf_name, size, rmf_size, loc);
+			LBUG();
+		} else if (!(field->rmf_flags & RMF_F_STRUCT_ARRAY) &&
+			   size < rmf_size) {
+			CERROR("%s: field size mismatch %u != %u (%d)\n",
+			       field->rmf_name, size, rmf_size, loc);
+			LBUG();
+		}
+	}
+
+	pill->rc_area[loc][__req_capsule_offset(pill, field, loc)] = size;
+}
+EXPORT_SYMBOL(req_capsule_set_size);
+
+/**
+ * Return the actual PTLRPC buffer length of a request or reply (\a loc)
+ * for the given \a pill's given \a field.
+ *
+ * NB: this function is not the counterpart of req_capsule_set_size(), which
+ * sets the size in pill.rc_area[loc][offset]; this function returns the
+ * message buflen[offset] instead, so maybe it should have another name.
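+ *
+ * In other words, after req_capsule_set_size(pill, field, loc, 100) this
+ * function still reports the length of the currently packed buffer, not
+ * 100, until the message is (re)packed.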
+ */
+__u32 req_capsule_get_size(const struct req_capsule *pill,
+			   const struct req_msg_field *field,
+			   enum req_location loc)
+{
+	LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT);
+
+	return lustre_msg_buflen(__req_msg(pill, loc),
+				 __req_capsule_offset(pill, field, loc));
+}
+EXPORT_SYMBOL(req_capsule_get_size);
+
+/**
+ * Wrapper around lustre_msg_size() that returns the PTLRPC size needed for the
+ * given \a pill's request or reply (\a loc) given the field size recorded in
+ * the \a pill's rc_area.
+ *
+ * See also req_capsule_set_size().
+ */
+__u32 req_capsule_msg_size(struct req_capsule *pill, enum req_location loc)
+{
+	return lustre_msg_size(pill->rc_req->rq_import->imp_msg_magic,
+			       pill->rc_fmt->rf_fields[loc].nr,
+			       pill->rc_area[loc]);
+}
+
+/**
+ * While req_capsule_msg_size() computes the size of a PTLRPC request or reply
+ * (\a loc) given a \a pill's \a rc_area, this function computes the size of a
+ * PTLRPC request or reply given only an RQF (\a fmt).
+ *
+ * This function should not be used for formats which contain variable size
+ * fields.
+ */
+__u32 req_capsule_fmt_size(__u32 magic, const struct req_format *fmt,
+			   enum req_location loc)
+{
+	__u32 size;
+	size_t i = 0;
+
+	/*
+	 * This function should probably LASSERT() that fmt has no fields with
+	 * RMF_F_STRUCT_ARRAY in rmf_flags, since we can't know here how many
+	 * elements in the array there will ultimately be, but then, we could
+	 * assume that there will be at least one element, and that's just what
+	 * we do.
+	 */
+	size = lustre_msg_hdr_size(magic, fmt->rf_fields[loc].nr);
+	if (size == 0)
+		return size;
+
+	for (; i < fmt->rf_fields[loc].nr; ++i)
+		if (fmt->rf_fields[loc].d[i]->rmf_size != -1)
+			size += cfs_size_round(fmt->rf_fields[loc].d[i]->
+					       rmf_size);
+	return size;
+}
+EXPORT_SYMBOL(req_capsule_fmt_size);
+
+/**
+ * Changes the format of an RPC.
+ *
+ * The pill must already have been initialized, which means that it already has
+ * a request format. The new format \a fmt must be an extension of the pill's
+ * old format. Specifically: the new format must have at least as many request
+ * and reply fields as the old one, and all fields shared by the old and new
+ * format must be at least as large in the new format.
+ *
+ * The new format's fields may be of different "type" than the old format, but
+ * only for fields that are "opaque" blobs: fields which a) have no
+ * \a rmf_swabber, b) \a rmf_flags == 0 or RMF_F_NO_SIZE_CHECK, and c) \a
+ * rmf_size == -1 or \a rmf_flags == RMF_F_NO_SIZE_CHECK. For example,
+ * OBD_SET_INFO has a key field and an opaque value field that gets interpreted
+ * according to the key field. When the value, according to the key, contains a
+ * structure (or array thereof) to be swabbed, the format should be changed to
+ * one where the value field has \a rmf_size/rmf_flags/rmf_swabber set
+ * accordingly.
+ */
+void req_capsule_extend(struct req_capsule *pill, const struct req_format *fmt)
+{
+	int i;
+	size_t j;
+
+	const struct req_format *old;
+
+	LASSERT(pill->rc_fmt != NULL);
+	LASSERT(__req_format_is_sane(fmt));
+
+	old = pill->rc_fmt;
+	/*
+	 * Sanity checking...
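+	 * every field shared with the old format must be the identical
+	 * RMF unless it is an "opaque" blob, and only the last old field
+	 * may grow in the new format.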
+ */
+ for (i = 0; i < RCL_NR; ++i) {
+ LASSERT(fmt->rf_fields[i].nr >= old->rf_fields[i].nr);
+ for (j = 0; j < old->rf_fields[i].nr - 1; ++j) {
+ const struct req_msg_field *ofield = FMT_FIELD(old, i, j);
+
+ /* "opaque" fields can be transmogrified */
+ if (ofield->rmf_swabber == NULL &&
+ (ofield->rmf_flags & ~RMF_F_NO_SIZE_CHECK) == 0 &&
+ (ofield->rmf_size == -1 ||
+ ofield->rmf_flags == RMF_F_NO_SIZE_CHECK))
+ continue;
+ LASSERT(FMT_FIELD(fmt, i, j) == FMT_FIELD(old, i, j));
+ }
+ /*
+ * Last field in old format can be shorter than in new.
+ */
+ LASSERT(FMT_FIELD(fmt, i, j)->rmf_size >=
+ FMT_FIELD(old, i, j)->rmf_size);
+ }
+
+ pill->rc_fmt = fmt;
+}
+EXPORT_SYMBOL(req_capsule_extend);
+
+/**
+ * This function returns a non-zero value if the given \a field is present in
+ * the format (\a rc_fmt) of \a pill's PTLRPC request or reply (\a loc), else it
+ * returns 0.
+ */
+int req_capsule_has_field(const struct req_capsule *pill,
+ const struct req_msg_field *field,
+ enum req_location loc)
+{
+ LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT);
+
+ return field->rmf_offset[pill->rc_fmt->rf_idx][loc];
+}
+EXPORT_SYMBOL(req_capsule_has_field);
+
+/**
+ * Returns a non-zero value if the given \a field is present in the given \a
+ * pill's PTLRPC request or reply (\a loc), else it returns 0.
+ */
+int req_capsule_field_present(const struct req_capsule *pill,
+ const struct req_msg_field *field,
+ enum req_location loc)
+{
+ __u32 offset;
+
+ LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT);
+ LASSERT(req_capsule_has_field(pill, field, loc));
+
+ offset = __req_capsule_offset(pill, field, loc);
+ return lustre_msg_bufcount(__req_msg(pill, loc)) > offset;
+}
+EXPORT_SYMBOL(req_capsule_field_present);
+
+/**
+ * This function shrinks the size of the _buffer_ of the \a pill's PTLRPC
+ * request or reply (\a loc).
+ *
+ * This is not the opposite of req_capsule_extend().
+ */
+void req_capsule_shrink(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ __u32 newlen,
+ enum req_location loc)
+{
+ const struct req_format *fmt;
+ struct lustre_msg *msg;
+ __u32 len;
+ int offset;
+
+ fmt = pill->rc_fmt;
+ LASSERT(fmt != NULL);
+ LASSERT(__req_format_is_sane(fmt));
+ LASSERT(req_capsule_has_field(pill, field, loc));
+ LASSERT(req_capsule_field_present(pill, field, loc));
+
+ offset = __req_capsule_offset(pill, field, loc);
+
+ msg = __req_msg(pill, loc);
+ len = lustre_msg_buflen(msg, offset);
+ LASSERTF(newlen <= len, "%s:%s, oldlen=%u, newlen=%u\n",
+ fmt->rf_name, field->rmf_name, len, newlen);
+
+ if (loc == RCL_CLIENT) {
+ pill->rc_req->rq_reqlen = lustre_shrink_msg(msg, offset, newlen,
+ 1);
+ } else {
+ pill->rc_req->rq_replen = lustre_shrink_msg(msg, offset, newlen,
+ 1);
+ /* also update the field size in the reply length array for a
+ * possible reply re-pack due to a req_capsule_server_grow() call.
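+ * (req_capsule_server_grow() below re-packs the whole reply when the
+ * grown field no longer fits in the current reply buffer)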
+ */
+ req_capsule_set_size(pill, field, loc, newlen);
+ }
+}
+EXPORT_SYMBOL(req_capsule_shrink);
+
+int req_capsule_server_grow(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ __u32 newlen)
+{
+ struct ptlrpc_reply_state *rs = pill->rc_req->rq_reply_state, *nrs;
+ char *from, *to;
+ int rc;
+ __u32 offset, len;
+
+ LASSERT(pill->rc_fmt != NULL);
+ LASSERT(__req_format_is_sane(pill->rc_fmt));
+ LASSERT(req_capsule_has_field(pill, field, RCL_SERVER));
+ LASSERT(req_capsule_field_present(pill, field, RCL_SERVER));
+
+ len = req_capsule_get_size(pill, field, RCL_SERVER);
+ offset = __req_capsule_offset(pill, field, RCL_SERVER);
+
+ CDEBUG(D_INFO, "Reply packed: %d, allocated: %d, field len %d -> %d\n",
+ lustre_packed_msg_size(rs->rs_msg), rs->rs_repbuf_len,
+ len, newlen);
+
+ req_capsule_set_size(pill, field, RCL_SERVER, newlen);
+ /**
+ * There may be enough space in the current reply buffer; make sure
+ * that rs_repbuf is not a wrapper but the real reply msg, otherwise
+ * re-packing is still needed.
+ */
+ if (rs->rs_msg == rs->rs_repbuf &&
+ rs->rs_repbuf_len >=
+ lustre_packed_msg_size(rs->rs_msg) - len + newlen) {
+ pill->rc_req->rq_replen = lustre_grow_msg(rs->rs_msg, offset,
+ newlen);
+ return 0;
+ }
+
+ /* Re-allocate reply state */
+ pill->rc_req->rq_reply_state = NULL;
+ rc = req_capsule_server_pack(pill);
+ if (rc) {
+ /* put old values back, the caller should decide what to do */
+ req_capsule_set_size(pill, field, RCL_SERVER, len);
+ pill->rc_req->rq_reply_state = rs;
+ return rc;
+ }
+ nrs = pill->rc_req->rq_reply_state;
+ LASSERT(lustre_packed_msg_size(nrs->rs_msg) >
+ lustre_packed_msg_size(rs->rs_msg));
+
+ /* Now we need only buffers, copy them and grow the needed one */
+ to = lustre_msg_buf(nrs->rs_msg, 0, 0);
+ from = lustre_msg_buf(rs->rs_msg, 0, 0);
+ memcpy(to, from,
+ (char *)rs->rs_msg + lustre_packed_msg_size(rs->rs_msg) - from);
+ lustre_msg_set_buflen(nrs->rs_msg, offset, len);
+ pill->rc_req->rq_replen = lustre_grow_msg(nrs->rs_msg, offset, newlen);
+
+ if (rs->rs_difficult) {
+ /* copy rs data */
+ int i;
+
+ nrs->rs_difficult = 1;
+ nrs->rs_no_ack = rs->rs_no_ack;
+ nrs->rs_convert_lock = rs->rs_convert_lock;
+ for (i = 0; i < rs->rs_nlocks; i++) {
+ nrs->rs_locks[i] = rs->rs_locks[i];
+ nrs->rs_modes[i] = rs->rs_modes[i];
+ nrs->rs_nlocks++;
+ }
+ rs->rs_nlocks = 0;
+ rs->rs_difficult = 0;
+ rs->rs_no_ack = 0;
+ }
+ ptlrpc_rs_decref(rs);
+ return 0;
+}
+EXPORT_SYMBOL(req_capsule_server_grow);
+
+#ifdef HAVE_SERVER_SUPPORT
+static const struct req_msg_field *mds_update_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_OUT_UPDATE_HEADER,
+ &RMF_OUT_UPDATE_BUF,
+};
+
+static const struct req_msg_field *mds_update_server[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_OUT_UPDATE_REPLY,
+};
+
+struct req_msg_field RMF_OUT_UPDATE = DEFINE_MSGFL("object_update", 0, -1,
+ lustre_swab_object_update_request, NULL);
+EXPORT_SYMBOL(RMF_OUT_UPDATE);
+
+struct req_msg_field RMF_OUT_UPDATE_REPLY =
+ DEFINE_MSGFL("object_update_reply", 0, -1,
+ lustre_swab_object_update_reply, NULL);
+EXPORT_SYMBOL(RMF_OUT_UPDATE_REPLY);
+
+struct req_msg_field RMF_OUT_UPDATE_HEADER = DEFINE_MSGF("out_update_header", 0,
+ -1, lustre_swab_out_update_header, NULL);
+EXPORT_SYMBOL(RMF_OUT_UPDATE_HEADER);
+
+struct req_msg_field RMF_OUT_UPDATE_BUF = DEFINE_MSGF("update_buf",
+ RMF_F_STRUCT_ARRAY, sizeof(struct out_update_buffer),
+ lustre_swab_out_update_buffer, NULL);
+EXPORT_SYMBOL(RMF_OUT_UPDATE_BUF);
+
+struct req_format RQF_OUT_UPDATE =
+ DEFINE_REQ_FMT0("OUT_UPDATE", mds_update_client,
+
mds_update_server);
+EXPORT_SYMBOL(RQF_OUT_UPDATE);
+
+int req_check_sepol(struct req_capsule *pill)
+{
+ int rc = 0;
+ struct obd_export *export;
+ struct lu_nodemap *nm = NULL;
+ const char *sepol = NULL;
+ const char *nm_sepol = NULL;
+
+ if (!pill->rc_req)
+ return -EPROTO;
+
+ export = pill->rc_req->rq_export;
+ if (!export || !exp_connect_sepol(export) ||
+ !req_capsule_has_field(pill, &RMF_SELINUX_POL, RCL_CLIENT))
+ goto nm;
+
+ if (req_capsule_get_size(pill, &RMF_SELINUX_POL, RCL_CLIENT) == 0)
+ goto nm;
+
+ sepol = req_capsule_client_get(pill, &RMF_SELINUX_POL);
+ CDEBUG(D_SEC, "retrieved sepol %s\n", sepol);
+
+nm:
+ if (export) {
+ nm = nodemap_get_from_exp(export);
+ if (!IS_ERR_OR_NULL(nm)) {
+ nm_sepol = nodemap_get_sepol(nm);
+ if (nm_sepol && nm_sepol[0])
+ if (sepol == NULL ||
+ strcmp(sepol, nm_sepol) != 0)
+ rc = -EACCES;
+ }
+ }
+
+ if (!IS_ERR_OR_NULL(nm))
+ nodemap_putref(nm);
+
+ return rc;
+}
+EXPORT_SYMBOL(req_check_sepol);
+#endif
diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/llog_client.c b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_client.c
new file mode 100644
index 0000000000000..d0c3e61ad7b87
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_client.c
@@ -0,0 +1,352 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2015, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lustre/ptlrpc/llog_client.c
+ *
+ * remote api for llog - client side
+ *
+ * Author: Andreas Dilger
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+#include
+#include
+
+#include
+#include
+#include
+
+#include "ptlrpc_internal.h"
+
+#define LLOG_CLIENT_ENTRY(ctxt, imp) do { \
+ mutex_lock(&ctxt->loc_mutex); \
+ if (ctxt->loc_imp) { \
+ imp = class_import_get(ctxt->loc_imp); \
+ } else { \
+ CERROR("ctxt->loc_imp == NULL for context idx %d. " \
+ "Unable to complete MDS/OSS recovery, " \
+ "but I'll try again next time. Not fatal.\n", \
+ ctxt->loc_idx); \
+ imp = NULL; \
+ mutex_unlock(&ctxt->loc_mutex); \
+ return -EINVAL; \
+ } \
+ mutex_unlock(&ctxt->loc_mutex); \
+} while (0)
+
+#define LLOG_CLIENT_EXIT(ctxt, imp) do { \
+ mutex_lock(&ctxt->loc_mutex); \
+ if (ctxt->loc_imp != imp) \
+ CWARN("loc_imp has changed from %p to %p\n", \
+ ctxt->loc_imp, imp); \
+ class_import_put(imp); \
+ mutex_unlock(&ctxt->loc_mutex); \
+} while (0)
+
+/*
+ * This is a callback from the llog_* functions.
+ * Assumes caller has already pushed us into the kernel context.
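+ *
+ * Each handler below brackets its RPC in LLOG_CLIENT_ENTRY()/
+ * LLOG_CLIENT_EXIT(), so a reference on ctxt->loc_imp is held across
+ * the call (see the macros above).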
+ */ +static int llog_client_open(const struct lu_env *env, + struct llog_handle *lgh, struct llog_logid *logid, + char *name, enum llog_open_param open_param) +{ + struct obd_import *imp; + struct llogd_body *body; + struct llog_ctxt *ctxt = lgh->lgh_ctxt; + struct ptlrpc_request *req = NULL; + int rc; + + ENTRY; + + LLOG_CLIENT_ENTRY(ctxt, imp); + + /* client cannot create llog */ + LASSERTF(open_param != LLOG_OPEN_NEW, "%#x\n", open_param); + LASSERT(lgh); + + req = ptlrpc_request_alloc(imp, &RQF_LLOG_ORIGIN_HANDLE_CREATE); + if (!req) + GOTO(out, rc = -ENOMEM); + + if (name) + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + strlen(name) + 1); + + rc = ptlrpc_request_pack(req, LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_CREATE); + if (rc) { + ptlrpc_request_free(req); + req = NULL; + GOTO(out, rc); + } + ptlrpc_request_set_replen(req); + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (logid) + body->lgd_logid = *logid; + body->lgd_ctxt_idx = ctxt->loc_idx - 1; + + if (name) { + char *tmp; + + tmp = req_capsule_client_sized_get(&req->rq_pill, &RMF_NAME, + strlen(name) + 1); + LASSERT(tmp); + strcpy(tmp, name); + + do_pack_body(req); + } + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (!body) + GOTO(out, rc = -EFAULT); + + lgh->lgh_id = body->lgd_logid; + lgh->lgh_ctxt = ctxt; + EXIT; +out: + LLOG_CLIENT_EXIT(ctxt, imp); + ptlrpc_req_finished(req); + return rc; +} + +static int llog_client_next_block(const struct lu_env *env, + struct llog_handle *loghandle, + int *cur_idx, int next_idx, + __u64 *cur_offset, void *buf, int len) +{ + struct obd_import *imp; + struct ptlrpc_request *req = NULL; + struct llogd_body *body; + void *ptr; + int rc; + + ENTRY; + + LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp); + req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK, + LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_NEXT_BLOCK); + if (!req) + GOTO(err_exit, rc = -ENOMEM); + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + body->lgd_logid = loghandle->lgh_id; + body->lgd_ctxt_idx = loghandle->lgh_ctxt->loc_idx - 1; + body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags; + body->lgd_index = next_idx; + body->lgd_saved_index = *cur_idx; + body->lgd_len = len; + body->lgd_cur_offset = *cur_offset; + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, len); + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + /* + * -EIO has a special meaning here. If llog_osd_next_block() + * reaches the end of the log without finding the desired + * record then it updates *cur_offset and *cur_idx and returns + * -EIO. In llog_process_thread() we use this to detect + * EOF. But we must be careful to distinguish between -EIO + * coming from llog_osd_next_block() and -EIO coming from + * ptlrpc or below. 
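+ *
+ * Concretely, the check below treats -EIO as EOF only when the reply
+ * message itself carries an -EIO status; -EIO with no reply message, or
+ * with a different reply status, is propagated as a real error.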
+ */ + if (rc == -EIO) { + if (!req->rq_repmsg || + lustre_msg_get_status(req->rq_repmsg) != -EIO) + GOTO(out, rc); + } else if (rc < 0) { + GOTO(out, rc); + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (!body) + GOTO(out, rc = -EFAULT); + + *cur_idx = body->lgd_saved_index; + *cur_offset = body->lgd_cur_offset; + + if (rc < 0) + GOTO(out, rc); + + /* The log records are swabbed as they are processed */ + ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA); + if (!ptr) + GOTO(out, rc = -EFAULT); + + memcpy(buf, ptr, len); + EXIT; +out: + ptlrpc_req_finished(req); +err_exit: + LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp); + return rc; +} + +static int llog_client_prev_block(const struct lu_env *env, + struct llog_handle *loghandle, + int prev_idx, void *buf, int len) +{ + struct obd_import *imp; + struct ptlrpc_request *req = NULL; + struct llogd_body *body; + void *ptr; + int rc; + + ENTRY; + + LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp); + req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK, + LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_PREV_BLOCK); + if (!req) + GOTO(err_exit, rc = -ENOMEM); + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + body->lgd_logid = loghandle->lgh_id; + body->lgd_ctxt_idx = loghandle->lgh_ctxt->loc_idx - 1; + body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags; + body->lgd_index = prev_idx; + body->lgd_len = len; + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, len); + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (!body) + GOTO(out, rc = -EFAULT); + + ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA); + if (!ptr) + GOTO(out, rc = -EFAULT); + + memcpy(buf, ptr, len); + EXIT; +out: + ptlrpc_req_finished(req); +err_exit: + LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp); + return rc; +} + +static int llog_client_read_header(const struct lu_env *env, + struct llog_handle *handle) +{ + struct obd_import *imp; + struct ptlrpc_request *req = NULL; + struct llogd_body *body; + struct llog_log_hdr *hdr; + struct llog_rec_hdr *llh_hdr; + int rc; + + ENTRY; + + LLOG_CLIENT_ENTRY(handle->lgh_ctxt, imp); + req = ptlrpc_request_alloc_pack(imp, + &RQF_LLOG_ORIGIN_HANDLE_READ_HEADER, + LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_READ_HEADER); + if (!req) + GOTO(err_exit, rc = -ENOMEM); + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + body->lgd_logid = handle->lgh_id; + body->lgd_ctxt_idx = handle->lgh_ctxt->loc_idx - 1; + body->lgd_llh_flags = handle->lgh_hdr->llh_flags; + + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + hdr = req_capsule_server_get(&req->rq_pill, &RMF_LLOG_LOG_HDR); + if (!hdr) + GOTO(out, rc = -EFAULT); + + if (handle->lgh_hdr_size < hdr->llh_hdr.lrh_len) + GOTO(out, rc = -EFAULT); + + memcpy(handle->lgh_hdr, hdr, hdr->llh_hdr.lrh_len); + handle->lgh_last_idx = LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_index; + + /* sanity checks */ + llh_hdr = &handle->lgh_hdr->llh_hdr; + if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) { + CERROR("bad log header magic: %#x (expecting %#x)\n", + llh_hdr->lrh_type, LLOG_HDR_MAGIC); + rc = -EIO; + } else if (llh_hdr->lrh_len != + LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_len || + (llh_hdr->lrh_len & (llh_hdr->lrh_len - 1)) != 0 || + llh_hdr->lrh_len < LLOG_MIN_CHUNK_SIZE || + llh_hdr->lrh_len > handle->lgh_hdr_size) { + CERROR("incorrectly sized log header: %#x, expecting %#x (power 
of two > 8192)\n", + llh_hdr->lrh_len, + LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_len); + CERROR("you may need to re-run lconf --write_conf.\n"); + rc = -EIO; + } + EXIT; +out: + ptlrpc_req_finished(req); +err_exit: + LLOG_CLIENT_EXIT(handle->lgh_ctxt, imp); + return rc; +} + +static int llog_client_close(const struct lu_env *env, + struct llog_handle *handle) +{ + /* + * this doesn't call LLOG_ORIGIN_HANDLE_CLOSE because + * the servers all close the file at the end of every + * other LLOG_ RPC. + */ + return 0; +} + +const struct llog_operations llog_client_ops = { + .lop_next_block = llog_client_next_block, + .lop_prev_block = llog_client_prev_block, + .lop_read_header = llog_client_read_header, + .lop_open = llog_client_open, + .lop_close = llog_client_close, +}; +EXPORT_SYMBOL(llog_client_ops); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/llog_net.c b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_net.c new file mode 100644 index 0000000000000..5be7dfc38bcbd --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_net.c @@ -0,0 +1,67 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ptlrpc/llog_net.c + * + * OST<->MDS recovery logging infrastructure. + * + * Invariants in implementation: + * - we do not share logs among different OST<->MDS connections, so that + * if an OST or MDS fails it need only look at log(s) relevant to itself + * + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_LOG + +#include +#include +#include + +int llog_initiator_connect(struct llog_ctxt *ctxt) +{ + struct obd_import *new_imp; + + ENTRY; + + LASSERT(ctxt); + new_imp = ctxt->loc_obd->u.cli.cl_import; + LASSERTF(!ctxt->loc_imp || ctxt->loc_imp == new_imp, + "%p - %p\n", ctxt->loc_imp, new_imp); + mutex_lock(&ctxt->loc_mutex); + if (ctxt->loc_imp != new_imp) { + if (ctxt->loc_imp) + class_import_put(ctxt->loc_imp); + ctxt->loc_imp = class_import_get(new_imp); + } + mutex_unlock(&ctxt->loc_mutex); + RETURN(0); +} +EXPORT_SYMBOL(llog_initiator_connect); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/llog_server.c b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_server.c new file mode 100644 index 0000000000000..d19ea86d82f54 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_server.c @@ -0,0 +1,288 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ptlrpc/llog_server.c + * + * remote api for llog - server side + * + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_LOG + +#include +#include +#include +#include + +static int llog_origin_close(const struct lu_env *env, struct llog_handle *lgh) +{ + if (lgh->lgh_hdr != NULL && lgh->lgh_hdr->llh_flags & LLOG_F_IS_CAT) + return llog_cat_close(env, lgh); + else + return llog_close(env, lgh); +} + +/* Only open is supported, no new llog can be created remotely */ +int llog_origin_handle_open(struct ptlrpc_request *req) +{ + struct obd_export *exp = req->rq_export; + struct obd_device *obd = exp->exp_obd; + struct llog_handle *loghandle; + struct llogd_body *body; + struct llog_logid *logid = NULL; + struct llog_ctxt *ctxt; + char *name = NULL; + int rc; + + ENTRY; + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) + RETURN(err_serious(-EFAULT)); + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(err_serious(-ENOMEM)); + + if (ostid_id(&body->lgd_logid.lgl_oi) > 0) + logid = &body->lgd_logid; + + if (req_capsule_field_present(&req->rq_pill, &RMF_NAME, RCL_CLIENT)) { + name = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + if (name == NULL) + RETURN(-EFAULT); + CDEBUG(D_INFO, "%s: opening log %s\n", obd->obd_name, name); + } + + if (body->lgd_ctxt_idx >= LLOG_MAX_CTXTS) { + CDEBUG(D_WARNING, "%s: bad ctxt ID: idx=%d name=%s\n", + obd->obd_name, body->lgd_ctxt_idx, name); + RETURN(-EPROTO); + } + + ctxt = llog_get_context(obd, body->lgd_ctxt_idx); + if (ctxt == NULL) { + CDEBUG(D_WARNING, "%s: no ctxt. 
group=%p idx=%d name=%s\n", + obd->obd_name, &obd->obd_olg, body->lgd_ctxt_idx, name); + RETURN(-ENODEV); + } + + rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle, logid, + name, LLOG_OPEN_EXISTS); + if (rc) + GOTO(out_ctxt, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + body->lgd_logid = loghandle->lgh_id; + + llog_origin_close(req->rq_svc_thread->t_env, loghandle); + EXIT; +out_ctxt: + llog_ctxt_put(ctxt); + return rc; +} + +int llog_origin_handle_next_block(struct ptlrpc_request *req) +{ + struct llog_handle *loghandle; + struct llogd_body *body; + struct llogd_body *repbody; + struct llog_ctxt *ctxt; + __u32 flags; + void *ptr; + int rc; + + ENTRY; + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) + RETURN(err_serious(-EFAULT)); + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, + LLOG_MIN_CHUNK_SIZE); + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(err_serious(-ENOMEM)); + + if (body->lgd_ctxt_idx >= LLOG_MAX_CTXTS) { + CDEBUG(D_WARNING, "%s: bad ctxt ID: idx=%d\n", + req->rq_export->exp_obd->obd_name, body->lgd_ctxt_idx); + RETURN(-EPROTO); + } + + ctxt = llog_get_context(req->rq_export->exp_obd, body->lgd_ctxt_idx); + if (ctxt == NULL) + RETURN(-ENODEV); + if (OBD_FAIL_PRECHECK(OBD_FAIL_MDS_LLOG_UMOUNT_RACE)) + cfs_fail_val = 1; + + rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle, + &body->lgd_logid, NULL, LLOG_OPEN_EXISTS); + if (rc) + GOTO(out_ctxt, rc); + + flags = body->lgd_llh_flags; + rc = llog_init_handle(req->rq_svc_thread->t_env, loghandle, flags, + NULL); + if (rc) + GOTO(out_close, rc); + + repbody = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + *repbody = *body; + + ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA); + rc = llog_next_block(req->rq_svc_thread->t_env, loghandle, + &repbody->lgd_saved_index, repbody->lgd_index, + &repbody->lgd_cur_offset, ptr, + LLOG_MIN_CHUNK_SIZE); + if (rc) + GOTO(out_close, rc); + EXIT; +out_close: + llog_origin_close(req->rq_svc_thread->t_env, loghandle); +out_ctxt: + llog_ctxt_put(ctxt); + return rc; +} + +int llog_origin_handle_prev_block(struct ptlrpc_request *req) +{ + struct llog_handle *loghandle; + struct llogd_body *body; + struct llogd_body *repbody; + struct llog_ctxt *ctxt; + __u32 flags; + void *ptr; + int rc; + + ENTRY; + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) + RETURN(err_serious(-EFAULT)); + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, + LLOG_MIN_CHUNK_SIZE); + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(err_serious(-ENOMEM)); + + if (body->lgd_ctxt_idx >= LLOG_MAX_CTXTS) { + CDEBUG(D_WARNING, "%s: bad ctxt ID: idx=%d\n", + req->rq_export->exp_obd->obd_name, body->lgd_ctxt_idx); + RETURN(-EPROTO); + } + + ctxt = llog_get_context(req->rq_export->exp_obd, body->lgd_ctxt_idx); + if (ctxt == NULL) + RETURN(-ENODEV); + + rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle, + &body->lgd_logid, NULL, LLOG_OPEN_EXISTS); + if (rc) + GOTO(out_ctxt, rc); + + flags = body->lgd_llh_flags; + rc = llog_init_handle(req->rq_svc_thread->t_env, loghandle, flags, + NULL); + if (rc) + GOTO(out_close, rc); + + repbody = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + *repbody = *body; + + ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA); + rc = llog_prev_block(req->rq_svc_thread->t_env, loghandle, + body->lgd_index, ptr, LLOG_MIN_CHUNK_SIZE); + if (rc) + GOTO(out_close, rc); + + EXIT; +out_close: + 
llog_origin_close(req->rq_svc_thread->t_env, loghandle); +out_ctxt: + llog_ctxt_put(ctxt); + return rc; +} + +int llog_origin_handle_read_header(struct ptlrpc_request *req) +{ + struct llog_handle *loghandle; + struct llogd_body *body; + struct llog_log_hdr *hdr; + struct llog_ctxt *ctxt; + __u32 flags; + int rc; + + ENTRY; + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) + RETURN(err_serious(-EFAULT)); + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(err_serious(-ENOMEM)); + + if (body->lgd_ctxt_idx >= LLOG_MAX_CTXTS) { + CDEBUG(D_WARNING, "%s: bad ctxt ID: idx=%d\n", + req->rq_export->exp_obd->obd_name, body->lgd_ctxt_idx); + RETURN(-EPROTO); + } + + ctxt = llog_get_context(req->rq_export->exp_obd, body->lgd_ctxt_idx); + if (ctxt == NULL) + RETURN(-ENODEV); + + rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle, + &body->lgd_logid, NULL, LLOG_OPEN_EXISTS); + if (rc) + GOTO(out_ctxt, rc); + + /* + * llog_init_handle() reads the llog header + */ + flags = body->lgd_llh_flags; + rc = llog_init_handle(req->rq_svc_thread->t_env, loghandle, flags, + NULL); + if (rc) + GOTO(out_close, rc); + flags = loghandle->lgh_hdr->llh_flags; + + hdr = req_capsule_server_get(&req->rq_pill, &RMF_LLOG_LOG_HDR); + *hdr = *loghandle->lgh_hdr; + EXIT; +out_close: + llog_origin_close(req->rq_svc_thread->t_env, loghandle); +out_ctxt: + llog_ctxt_put(ctxt); + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c b/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c new file mode 100644 index 0000000000000..0e0b1706655af --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c @@ -0,0 +1,1480 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ +#define DEBUG_SUBSYSTEM S_CLASS + + +#include +#include +#include +#include +#include +#include "ptlrpc_internal.h" + + +static struct ll_rpc_opcode { + __u32 opcode; + const char *opname; +} ll_rpc_opcode_table[LUSTRE_MAX_OPCODES] = { + { OST_REPLY, "ost_reply" }, + { OST_GETATTR, "ost_getattr" }, + { OST_SETATTR, "ost_setattr" }, + { OST_READ, "ost_read" }, + { OST_WRITE, "ost_write" }, + { OST_CREATE , "ost_create" }, + { OST_DESTROY, "ost_destroy" }, + { OST_GET_INFO, "ost_get_info" }, + { OST_CONNECT, "ost_connect" }, + { OST_DISCONNECT, "ost_disconnect" }, + { OST_PUNCH, "ost_punch" }, + { OST_OPEN, "ost_open" }, + { OST_CLOSE, "ost_close" }, + { OST_STATFS, "ost_statfs" }, + { 14, NULL }, /* formerly OST_SAN_READ */ + { 15, NULL }, /* formerly OST_SAN_WRITE */ + { OST_SYNC, "ost_sync" }, + { OST_SET_INFO, "ost_set_info" }, + { OST_QUOTACHECK, "ost_quotacheck" }, + { OST_QUOTACTL, "ost_quotactl" }, + { OST_QUOTA_ADJUST_QUNIT, "ost_quota_adjust_qunit" }, + { OST_LADVISE, "ost_ladvise" }, + { OST_FALLOCATE, "ost_fallocate" }, + { OST_SEEK, "ost_seek" }, + { MDS_GETATTR, "mds_getattr" }, + { MDS_GETATTR_NAME, "mds_getattr_lock" }, + { MDS_CLOSE, "mds_close" }, + { MDS_REINT, "mds_reint" }, + { MDS_READPAGE, "mds_readpage" }, + { MDS_CONNECT, "mds_connect" }, + { MDS_DISCONNECT, "mds_disconnect" }, + { MDS_GET_ROOT, "mds_get_root" }, + { MDS_STATFS, "mds_statfs" }, + { MDS_PIN, "mds_pin" }, + { MDS_UNPIN, "mds_unpin" }, + { MDS_SYNC, "mds_sync" }, + { MDS_DONE_WRITING, "mds_done_writing" }, + { MDS_SET_INFO, "mds_set_info" }, + { MDS_QUOTACHECK, "mds_quotacheck" }, + { MDS_QUOTACTL, "mds_quotactl" }, + { MDS_GETXATTR, "mds_getxattr" }, + { MDS_SETXATTR, "mds_setxattr" }, + { MDS_WRITEPAGE, "mds_writepage" }, + { MDS_IS_SUBDIR, "mds_is_subdir" }, + { MDS_GET_INFO, "mds_get_info" }, + { MDS_HSM_STATE_GET, "mds_hsm_state_get" }, + { MDS_HSM_STATE_SET, "mds_hsm_state_set" }, + { MDS_HSM_ACTION, "mds_hsm_action" }, + { MDS_HSM_PROGRESS, "mds_hsm_progress" }, + { MDS_HSM_REQUEST, "mds_hsm_request" }, + { MDS_HSM_CT_REGISTER, "mds_hsm_ct_register" }, + { MDS_HSM_CT_UNREGISTER, "mds_hsm_ct_unregister" }, + { MDS_SWAP_LAYOUTS, "mds_swap_layouts" }, + { MDS_RMFID, "mds_rmfid" }, + { LDLM_ENQUEUE, "ldlm_enqueue" }, + { LDLM_CONVERT, "ldlm_convert" }, + { LDLM_CANCEL, "ldlm_cancel" }, + { LDLM_BL_CALLBACK, "ldlm_bl_callback" }, + { LDLM_CP_CALLBACK, "ldlm_cp_callback" }, + { LDLM_GL_CALLBACK, "ldlm_gl_callback" }, + { LDLM_SET_INFO, "ldlm_set_info" }, + { MGS_CONNECT, "mgs_connect" }, + { MGS_DISCONNECT, "mgs_disconnect" }, + { MGS_EXCEPTION, "mgs_exception" }, + { MGS_TARGET_REG, "mgs_target_reg" }, + { MGS_TARGET_DEL, "mgs_target_del" }, + { MGS_SET_INFO, "mgs_set_info" }, + { MGS_CONFIG_READ, "mgs_config_read" }, + { OBD_PING, "obd_ping" }, + { 401, /* was OBD_LOG_CANCEL */ "llog_cancel" }, + { 402, /* was OBD_QC_CALLBACK */ "obd_quota_callback" }, + { OBD_IDX_READ, "dt_index_read" }, + { LLOG_ORIGIN_HANDLE_CREATE, "llog_origin_handle_open" }, + { LLOG_ORIGIN_HANDLE_NEXT_BLOCK, "llog_origin_handle_next_block" }, + { LLOG_ORIGIN_HANDLE_READ_HEADER, "llog_origin_handle_read_header" }, + { 504, /*LLOG_ORIGIN_HANDLE_WRITE_REC*/"llog_origin_handle_write_rec" }, + { 505, /* was LLOG_ORIGIN_HANDLE_CLOSE */ "llog_origin_handle_close" }, + { 506, /* was LLOG_ORIGIN_CONNECT */ "llog_origin_connect" }, + { 507, /* was LLOG_CATINFO */ "llog_catinfo" }, + { LLOG_ORIGIN_HANDLE_PREV_BLOCK, "llog_origin_handle_prev_block" }, + { 
LLOG_ORIGIN_HANDLE_DESTROY, "llog_origin_handle_destroy" },
+ { QUOTA_DQACQ, "quota_acquire" },
+ { QUOTA_DQREL, "quota_release" },
+ { SEQ_QUERY, "seq_query" },
+ { SEC_CTX_INIT, "sec_ctx_init" },
+ { SEC_CTX_INIT_CONT, "sec_ctx_init_cont" },
+ { SEC_CTX_FINI, "sec_ctx_fini" },
+ { FLD_QUERY, "fld_query" },
+ { FLD_READ, "fld_read" },
+#ifdef HAVE_SERVER_SUPPORT
+ { OUT_UPDATE, "out_update" },
+ { LFSCK_NOTIFY, "lfsck_notify" },
+ { LFSCK_QUERY, "lfsck_query" },
+#endif
+};
+
+static struct ll_eopcode {
+ __u32 opcode;
+ const char *opname;
+} ll_eopcode_table[EXTRA_LAST_OPC] = {
+ { LDLM_GLIMPSE_ENQUEUE, "ldlm_glimpse_enqueue" },
+ { LDLM_PLAIN_ENQUEUE, "ldlm_plain_enqueue" },
+ { LDLM_EXTENT_ENQUEUE, "ldlm_extent_enqueue" },
+ { LDLM_FLOCK_ENQUEUE, "ldlm_flock_enqueue" },
+ { LDLM_IBITS_ENQUEUE, "ldlm_ibits_enqueue" },
+ { MDS_REINT_SETATTR, "mds_reint_setattr" },
+ { MDS_REINT_CREATE, "mds_reint_create" },
+ { MDS_REINT_LINK, "mds_reint_link" },
+ { MDS_REINT_UNLINK, "mds_reint_unlink" },
+ { MDS_REINT_RENAME, "mds_reint_rename" },
+ { MDS_REINT_OPEN, "mds_reint_open" },
+ { MDS_REINT_SETXATTR, "mds_reint_setxattr" },
+ { MDS_REINT_RESYNC, "mds_reint_resync" },
+ { BRW_READ_BYTES, "read_bytes" },
+ { BRW_WRITE_BYTES, "write_bytes" },
+};
+
+const char *ll_opcode2str(__u32 opcode)
+{
+ __u32 offset = opcode_offset(opcode);
+
+ /* When one of the assertions below fails, chances are that:
+ * 1) A new opcode was added in include/lustre/lustre_idl.h,
+ * but is missing from the table above.
+ * or 2) The opcode space was renumbered or rearranged,
+ * and the opcode_offset() function in
+ * ptlrpc_internal.h needs to be modified.
+ */
+ LASSERTF(offset < LUSTRE_MAX_OPCODES,
+ "offset %u >= LUSTRE_MAX_OPCODES %u\n",
+ offset, LUSTRE_MAX_OPCODES);
+ LASSERTF(ll_rpc_opcode_table[offset].opcode == opcode,
+ "ll_rpc_opcode_table[%u].opcode %u != opcode %u\n",
+ offset, ll_rpc_opcode_table[offset].opcode, opcode);
+
+ return ll_rpc_opcode_table[offset].opname;
+}
+
+const int ll_str2opcode(const char *ops)
+{
+ int i;
+
+ for (i = 0; i < LUSTRE_MAX_OPCODES; i++) {
+ if (ll_rpc_opcode_table[i].opname != NULL &&
+ strcmp(ll_rpc_opcode_table[i].opname, ops) == 0)
+ return ll_rpc_opcode_table[i].opcode;
+ }
+
+ return -EINVAL;
+}
+
+static const char *ll_eopcode2str(__u32 opcode)
+{
+ LASSERT(ll_eopcode_table[opcode].opcode == opcode);
+ return ll_eopcode_table[opcode].opname;
+}
+
+static void
+ptlrpc_ldebugfs_register(struct dentry *root, char *dir, char *name,
+ struct dentry **debugfs_root_ret,
+ struct lprocfs_stats **stats_ret)
+{
+ struct dentry *svc_debugfs_entry;
+ struct lprocfs_stats *svc_stats;
+ int i;
+ unsigned int svc_counter_config = LPROCFS_CNTR_AVGMINMAX |
+ LPROCFS_CNTR_STDDEV;
+
+ LASSERT(!*debugfs_root_ret);
+ LASSERT(!*stats_ret);
+
+ svc_stats = lprocfs_alloc_stats(EXTRA_MAX_OPCODES + LUSTRE_MAX_OPCODES,
+ 0);
+ if (!svc_stats)
+ return;
+
+ if (dir)
+ svc_debugfs_entry = debugfs_create_dir(dir, root);
+ else
+ svc_debugfs_entry = root;
+
+ lprocfs_counter_init(svc_stats, PTLRPC_REQWAIT_CNTR,
+ svc_counter_config, "req_waittime", "usec");
+ lprocfs_counter_init(svc_stats, PTLRPC_REQQDEPTH_CNTR,
+ svc_counter_config, "req_qdepth", "reqs");
+ lprocfs_counter_init(svc_stats, PTLRPC_REQACTIVE_CNTR,
+ svc_counter_config, "req_active", "reqs");
+ lprocfs_counter_init(svc_stats, PTLRPC_TIMEOUT,
+ svc_counter_config, "req_timeout", "sec");
+ lprocfs_counter_init(svc_stats, PTLRPC_REQBUF_AVAIL_CNTR,
+ svc_counter_config, "reqbuf_avail", "bufs");
+ for (i = 0; i < EXTRA_LAST_OPC; i++) {
+ char *units;
+
+ switch (i) {
+ case BRW_WRITE_BYTES:
+ case BRW_READ_BYTES:
+ units = "bytes";
+ break;
+ default:
+ units = "reqs";
+ break;
+ }
+ lprocfs_counter_init(svc_stats, PTLRPC_LAST_CNTR + i,
+ svc_counter_config,
+ ll_eopcode2str(i), units);
+ }
+ for (i = 0; i < LUSTRE_MAX_OPCODES; i++) {
+ __u32 opcode = ll_rpc_opcode_table[i].opcode;
+
+ lprocfs_counter_init(svc_stats,
+ EXTRA_MAX_OPCODES + i, svc_counter_config,
+ ll_opcode2str(opcode), "usec");
+ }
+
+ debugfs_create_file(name, 0644, svc_debugfs_entry, svc_stats,
+ &ldebugfs_stats_seq_fops);
+
+ if (dir)
+ *debugfs_root_ret = svc_debugfs_entry;
+ *stats_ret = svc_stats;
+}
+
+static int
+ptlrpc_lprocfs_req_buffer_history_len_seq_show(struct seq_file *m, void *v)
+{
+ struct ptlrpc_service *svc = m->private;
+ struct ptlrpc_service_part *svcpt;
+ int total = 0;
+ int i;
+
+ ptlrpc_service_for_each_part(svcpt, i, svc)
+ total += svcpt->scp_hist_nrqbds;
+
+ seq_printf(m, "%d\n", total);
+
+ return 0;
+}
+
+LDEBUGFS_SEQ_FOPS_RO(ptlrpc_lprocfs_req_buffer_history_len);
+
+static int
+ptlrpc_lprocfs_req_buffer_history_max_seq_show(struct seq_file *m, void *n)
+{
+ struct ptlrpc_service *svc = m->private;
+ struct ptlrpc_service_part *svcpt;
+ int total = 0;
+ int i;
+
+ ptlrpc_service_for_each_part(svcpt, i, svc)
+ total += svc->srv_hist_nrqbds_cpt_max;
+
+ seq_printf(m, "%d\n", total);
+ return 0;
+}
+
+static ssize_t
+ptlrpc_lprocfs_req_buffer_history_max_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct seq_file *m = file->private_data;
+ struct ptlrpc_service *svc = m->private;
+ unsigned long long val;
+ unsigned long long limit;
+ int bufpages;
+ int rc;
+
+ rc = kstrtoull_from_user(buffer, count, 0, &val);
+ if (rc < 0)
+ return rc;
+
+ /* val is unsigned, so only the upper bound needs checking */
+ if (val > INT_MAX)
+ return -ERANGE;
+
+ /* This sanity check is more of an insanity check; we can still
+ * hose a kernel by allowing the request history to grow too
+ * far.
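+ * The limit computed below caps the history so that, at worst, request
+ * buffers consume half of total RAM; e.g. (illustrative numbers only)
+ * with 4 KiB pages and a 17 KiB srv_buf_size, bufpages comes to 8 and
+ * the cap is cfs_totalram_pages() / 16.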
The roundup to the next power of two is an empirical way + * to take care that request buffer is allocated in Slab and thus + * will be upgraded */ + bufpages = (roundup_pow_of_two(svc->srv_buf_size) + PAGE_SIZE - 1) >> + PAGE_SHIFT; + limit = cfs_totalram_pages() / (2 * bufpages); + /* do not allow history to consume more than half max number of rqbds */ + if ((svc->srv_nrqbds_max == 0 && val > limit) || + (svc->srv_nrqbds_max != 0 && val > svc->srv_nrqbds_max / 2)) + return -ERANGE; + + spin_lock(&svc->srv_lock); + + if (val == 0) + svc->srv_hist_nrqbds_cpt_max = 0; + else + svc->srv_hist_nrqbds_cpt_max = + max(1, ((int)val / svc->srv_ncpts)); + + spin_unlock(&svc->srv_lock); + + return count; +} + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_req_buffer_history_max); + +static int +ptlrpc_lprocfs_req_buffers_max_seq_show(struct seq_file *m, void *n) +{ + struct ptlrpc_service *svc = m->private; + + seq_printf(m, "%d\n", svc->srv_nrqbds_max); + return 0; +} + +static ssize_t +ptlrpc_lprocfs_req_buffers_max_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct ptlrpc_service *svc = m->private; + int val; + int rc; + + rc = kstrtoint_from_user(buffer, count, 0, &val); + if (rc < 0) + return rc; + + if (val < svc->srv_nbuf_per_group && val != 0) + return -ERANGE; + + spin_lock(&svc->srv_lock); + + svc->srv_nrqbds_max = (uint)val; + + spin_unlock(&svc->srv_lock); + + return count; +} + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_req_buffers_max); + +static ssize_t threads_min_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + + return sprintf(buf, "%d\n", svc->srv_nthrs_cpt_init * svc->srv_ncpts); +} + +static ssize_t threads_min_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + unsigned long val; + int rc; + + rc = kstrtoul(buffer, 10, &val); + if (rc < 0) + return rc; + + if (val / svc->srv_ncpts < PTLRPC_NTHRS_INIT) + return -ERANGE; + + spin_lock(&svc->srv_lock); + if (val > svc->srv_nthrs_cpt_limit * svc->srv_ncpts) { + spin_unlock(&svc->srv_lock); + return -ERANGE; + } + + svc->srv_nthrs_cpt_init = (int)val / svc->srv_ncpts; + + spin_unlock(&svc->srv_lock); + + return count; +} +LUSTRE_RW_ATTR(threads_min); + +static ssize_t threads_started_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + struct ptlrpc_service_part *svcpt; + int total = 0; + int i; + + ptlrpc_service_for_each_part(svcpt, i, svc) + total += svcpt->scp_nthrs_running; + + return sprintf(buf, "%d\n", total); +} +LUSTRE_RO_ATTR(threads_started); + +static ssize_t threads_max_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + + return sprintf(buf, "%d\n", svc->srv_nthrs_cpt_limit * svc->srv_ncpts); +} + +static ssize_t threads_max_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + unsigned long val; + int rc; + + rc = kstrtoul(buffer, 10, &val); + if (rc < 0) + return rc; + + if (val / svc->srv_ncpts < PTLRPC_NTHRS_INIT) + return -ERANGE; + + spin_lock(&svc->srv_lock); + if (val < 
svc->srv_nthrs_cpt_init * svc->srv_ncpts) { + spin_unlock(&svc->srv_lock); + return -ERANGE; + } + + svc->srv_nthrs_cpt_limit = (int)val / svc->srv_ncpts; + + spin_unlock(&svc->srv_lock); + + return count; +} +LUSTRE_RW_ATTR(threads_max); + +/** + * Translates \e ptlrpc_nrs_pol_state values to human-readable strings. + * + * \param[in] state The policy state + */ +static const char *nrs_state2str(enum ptlrpc_nrs_pol_state state) +{ + switch (state) { + default: + LBUG(); + case NRS_POL_STATE_INVALID: + return "invalid"; + case NRS_POL_STATE_STOPPED: + return "stopped"; + case NRS_POL_STATE_STOPPING: + return "stopping"; + case NRS_POL_STATE_STARTING: + return "starting"; + case NRS_POL_STATE_STARTED: + return "started"; + } +} + +/** + * Obtains status information for \a policy. + * + * Information is copied in \a info. + * + * \param[in] policy The policy + * \param[out] info Holds returned status information + */ +static void nrs_policy_get_info_locked(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_pol_info *info) +{ + LASSERT(policy != NULL); + LASSERT(info != NULL); + assert_spin_locked(&policy->pol_nrs->nrs_lock); + + BUILD_BUG_ON(sizeof(info->pi_arg) != sizeof(policy->pol_arg)); + memcpy(info->pi_name, policy->pol_desc->pd_name, NRS_POL_NAME_MAX); + memcpy(info->pi_arg, policy->pol_arg, sizeof(policy->pol_arg)); + + info->pi_fallback = !!(policy->pol_flags & PTLRPC_NRS_FL_FALLBACK); + info->pi_state = policy->pol_state; + /** + * XXX: These are accessed without holding + * ptlrpc_service_part::scp_req_lock. + */ + info->pi_req_queued = policy->pol_req_queued; + info->pi_req_started = policy->pol_req_started; +} + +/** + * Reads and prints policy status information for all policies of a PTLRPC + * service. + */ +static int ptlrpc_lprocfs_nrs_policies_seq_show(struct seq_file *m, void *n) +{ + struct ptlrpc_service *svc = m->private; + struct ptlrpc_service_part *svcpt; + struct ptlrpc_nrs *nrs; + struct ptlrpc_nrs_policy *policy; + struct ptlrpc_nrs_pol_info *infos; + struct ptlrpc_nrs_pol_info tmp; + unsigned int num_pols; + unsigned int pol_idx = 0; + bool hp = false; + int i; + int rc = 0; + ENTRY; + + /** + * Serialize NRS core lprocfs operations with policy registration/ + * unregistration. + */ + mutex_lock(&nrs_core.nrs_mutex); + + /** + * Use the first service partition's regular NRS head in order to obtain + * the number of policies registered with NRS heads of this service. All + * service partitions will have the same number of policies. + */ + nrs = nrs_svcpt2nrs(svc->srv_parts[0], false); + + spin_lock(&nrs->nrs_lock); + num_pols = svc->srv_parts[0]->scp_nrs_reg.nrs_num_pols; + spin_unlock(&nrs->nrs_lock); + + OBD_ALLOC_PTR_ARRAY(infos, num_pols); + if (infos == NULL) + GOTO(out, rc = -ENOMEM); +again: + + ptlrpc_service_for_each_part(svcpt, i, svc) { + nrs = nrs_svcpt2nrs(svcpt, hp); + spin_lock(&nrs->nrs_lock); + + pol_idx = 0; + + list_for_each_entry(policy, &nrs->nrs_policy_list, + pol_list) { + LASSERT(pol_idx < num_pols); + + nrs_policy_get_info_locked(policy, &tmp); + /** + * Copy values when handling the first service + * partition. + */ + if (i == 0) { + memcpy(infos[pol_idx].pi_name, tmp.pi_name, + NRS_POL_NAME_MAX); + memcpy(infos[pol_idx].pi_arg, tmp.pi_arg, + sizeof(tmp.pi_arg)); + memcpy(&infos[pol_idx].pi_state, &tmp.pi_state, + sizeof(tmp.pi_state)); + infos[pol_idx].pi_fallback = tmp.pi_fallback; + /** + * For the rest of the service partitions + * sanity-check the values we get. 
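+ * (pi_name, pi_arg and pi_fallback must match what the first partition
+ * reported; any mismatch fails the read with -EINVAL below)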
+ */
+ } else {
+ if (strncmp(infos[pol_idx].pi_name,
+ tmp.pi_name,
+ NRS_POL_NAME_MAX) != 0) {
+ spin_unlock(&nrs->nrs_lock);
+ rc = -EINVAL;
+ CERROR("%s: failed to check pi_name: rc = %d\n",
+ svc->srv_thread_name, rc);
+ GOTO(out, rc);
+ }
+ if (strncmp(infos[pol_idx].pi_arg,
+ tmp.pi_arg,
+ sizeof(tmp.pi_arg)) != 0) {
+ spin_unlock(&nrs->nrs_lock);
+ rc = -EINVAL;
+ CERROR("%s: failed to check pi_arg: rc = %d\n",
+ svc->srv_thread_name, rc);
+ GOTO(out, rc);
+ }
+ /**
+ * Not checking ptlrpc_nrs_pol_info::pi_state,
+ * because it may be different between
+ * instances of the same policy in different
+ * service partitions.
+ */
+
+ if (infos[pol_idx].pi_fallback !=
+ tmp.pi_fallback) {
+ spin_unlock(&nrs->nrs_lock);
+ rc = -EINVAL;
+ CERROR("%s: failed to check pi_fallback: rc = %d\n",
+ svc->srv_thread_name, rc);
+ GOTO(out, rc);
+ }
+ }
+
+ infos[pol_idx].pi_req_queued += tmp.pi_req_queued;
+ infos[pol_idx].pi_req_started += tmp.pi_req_started;
+
+ pol_idx++;
+ }
+ spin_unlock(&nrs->nrs_lock);
+ }
+
+ /**
+ * Policy status information output is in YAML format.
+ * For example:
+ *
+ * regular_requests:
+ * - name: fifo
+ * state: started
+ * fallback: yes
+ * queued: 0
+ * active: 0
+ *
+ * - name: crrn
+ * state: started
+ * fallback: no
+ * queued: 2015
+ * active: 384
+ *
+ * high_priority_requests:
+ * - name: fifo
+ * state: started
+ * fallback: yes
+ * queued: 0
+ * active: 2
+ *
+ * - name: crrn
+ * state: stopped
+ * fallback: no
+ * queued: 0
+ * active: 0
+ */
+ seq_printf(m, "%s\n", !hp ? "\nregular_requests:" :
+ "high_priority_requests:");
+
+ for (pol_idx = 0; pol_idx < num_pols; pol_idx++) {
+ if (strlen(infos[pol_idx].pi_arg) > 0)
+ seq_printf(m, " - name: %s %s\n",
+ infos[pol_idx].pi_name,
+ infos[pol_idx].pi_arg);
+ else
+ seq_printf(m, " - name: %s\n",
+ infos[pol_idx].pi_name);
+
+
+ seq_printf(m, " state: %s\n"
+ " fallback: %s\n"
+ " queued: %-20d\n"
+ " active: %-20d\n\n",
+ nrs_state2str(infos[pol_idx].pi_state),
+ infos[pol_idx].pi_fallback ? "yes" : "no",
+ (int)infos[pol_idx].pi_req_queued,
+ (int)infos[pol_idx].pi_req_started);
+ }
+
+ if (!hp && nrs_svc_has_hp(svc)) {
+ memset(infos, 0, num_pols * sizeof(*infos));
+
+ /**
+ * Redo the processing for the service's HP NRS heads' policies.
+ */
+ hp = true;
+ goto again;
+ }
+
+out:
+ if (infos)
+ OBD_FREE_PTR_ARRAY(infos, num_pols);
+
+ mutex_unlock(&nrs_core.nrs_mutex);
+
+ RETURN(rc);
+}
+
+#define LPROCFS_NRS_WR_MAX_ARG (1024)
+/**
+ * The longest valid command string is the maximum policy name size, plus the
+ * length of the " reg" substring, plus the length of the argument
+ */
+#define LPROCFS_NRS_WR_MAX_CMD (NRS_POL_NAME_MAX + sizeof(" reg") - 1 + \
+ LPROCFS_NRS_WR_MAX_ARG)
+
+/**
+ * Starts and stops a given policy on a PTLRPC service.
+ *
+ * Commands consist of the policy name, followed by an optional [reg|hp] token;
+ * if the optional token is omitted, the operation is performed on both the
+ * regular and high-priority (if the service has one) NRS head.
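+ *
+ * For example (using the policy names from the YAML sample above),
+ * writing "fifo" to nrs_policies starts the fifo policy on both heads,
+ * while "fifo hp" starts it on the high-priority head only; any text
+ * after the token is passed through to the policy as an argument.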
+ */ +static ssize_t +ptlrpc_lprocfs_nrs_policies_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct ptlrpc_service *svc = m->private; + enum ptlrpc_nrs_queue_type queue = PTLRPC_NRS_QUEUE_BOTH; + char *cmd; + char *cmd_copy = NULL; + char *policy_name; + char *queue_name; + int rc = 0; + ENTRY; + + if (count >= LPROCFS_NRS_WR_MAX_CMD) + GOTO(out, rc = -EINVAL); + + OBD_ALLOC(cmd, LPROCFS_NRS_WR_MAX_CMD); + if (cmd == NULL) + GOTO(out, rc = -ENOMEM); + /** + * strsep() modifies its argument, so keep a copy + */ + cmd_copy = cmd; + + if (copy_from_user(cmd, buffer, count)) + GOTO(out, rc = -EFAULT); + + cmd[count] = '\0'; + + policy_name = strsep(&cmd, " "); + + if (strlen(policy_name) > NRS_POL_NAME_MAX - 1) + GOTO(out, rc = -EINVAL); + + /** + * No [reg|hp] token has been specified + */ + if (cmd == NULL) + goto default_queue; + + queue_name = strsep(&cmd, " "); + /** + * The second token is either an optional [reg|hp] string, + * or arguments + */ + if (strcmp(queue_name, "reg") == 0) + queue = PTLRPC_NRS_QUEUE_REG; + else if (strcmp(queue_name, "hp") == 0) + queue = PTLRPC_NRS_QUEUE_HP; + else { + if (cmd != NULL) + *(cmd - 1) = ' '; + cmd = queue_name; + } + +default_queue: + + if (queue == PTLRPC_NRS_QUEUE_HP && !nrs_svc_has_hp(svc)) + GOTO(out, rc = -ENODEV); + else if (queue == PTLRPC_NRS_QUEUE_BOTH && !nrs_svc_has_hp(svc)) + queue = PTLRPC_NRS_QUEUE_REG; + + /** + * Serialize NRS core lprocfs operations with policy registration/ + * unregistration. + */ + mutex_lock(&nrs_core.nrs_mutex); + + rc = ptlrpc_nrs_policy_control(svc, queue, policy_name, + PTLRPC_NRS_CTL_START, + false, cmd); + + mutex_unlock(&nrs_core.nrs_mutex); +out: + if (cmd_copy) + OBD_FREE(cmd_copy, LPROCFS_NRS_WR_MAX_CMD); + + RETURN(rc < 0 ? rc : count); +} + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_policies); + +/** @} nrs */ + +struct ptlrpc_srh_iterator { + int srhi_idx; + __u64 srhi_seq; + struct ptlrpc_request *srhi_req; +}; + +static int +ptlrpc_lprocfs_svc_req_history_seek(struct ptlrpc_service_part *svcpt, + struct ptlrpc_srh_iterator *srhi, + __u64 seq) +{ + struct list_head *e; + struct ptlrpc_request *req; + + if (srhi->srhi_req != NULL && + srhi->srhi_seq > svcpt->scp_hist_seq_culled && + srhi->srhi_seq <= seq) { + /* If srhi_req was set previously, hasn't been culled and + * we're searching for a seq on or after it (i.e. more + * recent), search from it onwards. + * Since the service history is LRU (i.e. culled reqs will + * be near the head), we shouldn't have to do long re-scans. 
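+ * (if the cached request has been culled, or seq is older than it,
+ * the loop below rescans from the head of scp_hist_reqs instead)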
+ */ + LASSERTF(srhi->srhi_seq == srhi->srhi_req->rq_history_seq, + "%s:%d: seek seq %llu, request seq %llu\n", + svcpt->scp_service->srv_name, svcpt->scp_cpt, + srhi->srhi_seq, srhi->srhi_req->rq_history_seq); + LASSERTF(!list_empty(&svcpt->scp_hist_reqs), + "%s:%d: seek offset %llu, request seq %llu, " + "last culled %llu\n", + svcpt->scp_service->srv_name, svcpt->scp_cpt, + seq, srhi->srhi_seq, svcpt->scp_hist_seq_culled); + e = &srhi->srhi_req->rq_history_list; + } else { + /* search from start */ + e = svcpt->scp_hist_reqs.next; + } + + while (e != &svcpt->scp_hist_reqs) { + req = list_entry(e, struct ptlrpc_request, rq_history_list); + + if (req->rq_history_seq >= seq) { + srhi->srhi_seq = req->rq_history_seq; + srhi->srhi_req = req; + return 0; + } + e = e->next; + } + + return -ENOENT; +} + +/* + * ptlrpc history sequence is used as "position" of seq_file, in some case, + * seq_read() will increase "position" to indicate reading the next + * element, however, low bits of history sequence are reserved for CPT id + * (check the details from comments before ptlrpc_req_add_history), which + * means seq_read() might change CPT id of history sequence and never + * finish reading of requests on a CPT. To make it work, we have to shift + * CPT id to high bits and timestamp to low bits, so seq_read() will only + * increase timestamp which can correctly indicate the next position. + */ + +/* convert seq_file pos to cpt */ +#define PTLRPC_REQ_POS2CPT(svc, pos) \ + ((svc)->srv_cpt_bits == 0 ? 0 : \ + (__u64)(pos) >> (64 - (svc)->srv_cpt_bits)) + +/* make up seq_file pos from cpt */ +#define PTLRPC_REQ_CPT2POS(svc, cpt) \ + ((svc)->srv_cpt_bits == 0 ? 0 : \ + (cpt) << (64 - (svc)->srv_cpt_bits)) + +/* convert sequence to position */ +#define PTLRPC_REQ_SEQ2POS(svc, seq) \ + ((svc)->srv_cpt_bits == 0 ? (seq) : \ + ((seq) >> (svc)->srv_cpt_bits) | \ + ((seq) << (64 - (svc)->srv_cpt_bits))) + +/* convert position to sequence */ +#define PTLRPC_REQ_POS2SEQ(svc, pos) \ + ((svc)->srv_cpt_bits == 0 ? 
(pos) : \ + ((__u64)(pos) << (svc)->srv_cpt_bits) | \ + ((__u64)(pos) >> (64 - (svc)->srv_cpt_bits))) + +static void * +ptlrpc_lprocfs_svc_req_history_start(struct seq_file *s, loff_t *pos) +{ + struct ptlrpc_service *svc = s->private; + struct ptlrpc_service_part *svcpt; + struct ptlrpc_srh_iterator *srhi; + unsigned int cpt; + int rc; + int i; + + if (sizeof(loff_t) != sizeof(__u64)) { /* can't support */ + CWARN("Failed to read request history because size of loff_t " + "%d can't match size of u64\n", (int)sizeof(loff_t)); + return NULL; + } + + OBD_ALLOC(srhi, sizeof(*srhi)); + if (srhi == NULL) + return NULL; + + srhi->srhi_seq = 0; + srhi->srhi_req = NULL; + + cpt = PTLRPC_REQ_POS2CPT(svc, *pos); + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (i < cpt) /* skip */ + continue; + if (i > cpt) /* make up the lowest position for this CPT */ + *pos = PTLRPC_REQ_CPT2POS(svc, i); + + mutex_lock(&svcpt->scp_mutex); + spin_lock(&svcpt->scp_lock); + rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, + PTLRPC_REQ_POS2SEQ(svc, *pos)); + spin_unlock(&svcpt->scp_lock); + mutex_unlock(&svcpt->scp_mutex); + if (rc == 0) { + *pos = PTLRPC_REQ_SEQ2POS(svc, srhi->srhi_seq); + srhi->srhi_idx = i; + return srhi; + } + } + + OBD_FREE(srhi, sizeof(*srhi)); + return NULL; +} + +static void +ptlrpc_lprocfs_svc_req_history_stop(struct seq_file *s, void *iter) +{ + struct ptlrpc_srh_iterator *srhi = iter; + + if (srhi != NULL) + OBD_FREE(srhi, sizeof(*srhi)); +} + +static void * +ptlrpc_lprocfs_svc_req_history_next(struct seq_file *s, + void *iter, loff_t *pos) +{ + struct ptlrpc_service *svc = s->private; + struct ptlrpc_srh_iterator *srhi = iter; + struct ptlrpc_service_part *svcpt; + __u64 seq; + int rc; + int i; + + for (i = srhi->srhi_idx; i < svc->srv_ncpts; i++) { + svcpt = svc->srv_parts[i]; + + if (i > srhi->srhi_idx) { /* reset iterator for a new CPT */ + srhi->srhi_req = NULL; + seq = srhi->srhi_seq = 0; + } else { /* the next sequence */ + seq = srhi->srhi_seq + (1 << svc->srv_cpt_bits); + } + + mutex_lock(&svcpt->scp_mutex); + spin_lock(&svcpt->scp_lock); + rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, seq); + spin_unlock(&svcpt->scp_lock); + mutex_unlock(&svcpt->scp_mutex); + if (rc == 0) { + *pos = PTLRPC_REQ_SEQ2POS(svc, srhi->srhi_seq); + srhi->srhi_idx = i; + return srhi; + } + } + + OBD_FREE(srhi, sizeof(*srhi)); + ++*pos; + return NULL; +} + +/* common ost/mdt so_req_printer */ +void target_print_req(void *seq_file, struct ptlrpc_request *req) +{ + /* Called holding srv_lock with irqs disabled. + * Print specific req contents and a newline. + * CAVEAT EMPTOR: check request message length before printing!!! + * You might have received any old crap so you must be just as + * careful here as the service's request parser!!! + */ + struct seq_file *sf = seq_file; + + switch (req->rq_phase) { + case RQ_PHASE_NEW: + /* still awaiting a service thread's attention, or rejected + * because the generic request message didn't unpack + */ + seq_printf(sf, "\n"); + break; + case RQ_PHASE_INTERPRET: + /* being handled, so basic msg swabbed, and opc is valid + * but racing with mds_handle(). fallthrough. 
+ */ + fallthrough; + case RQ_PHASE_COMPLETE: + /* been handled by mds_handle(), reply state may be volatile */ + seq_printf(sf, "opc %d\n", lustre_msg_get_opc(req->rq_reqmsg)); + break; + default: + DEBUG_REQ(D_ERROR, req, "bad phase %d", req->rq_phase); + } +} +EXPORT_SYMBOL(target_print_req); + +static int ptlrpc_lprocfs_svc_req_history_show(struct seq_file *s, void *iter) +{ + struct ptlrpc_service *svc = s->private; + struct ptlrpc_srh_iterator *srhi = iter; + struct ptlrpc_service_part *svcpt; + struct ptlrpc_request *req; + int rc; + + LASSERT(srhi->srhi_idx < svc->srv_ncpts); + + svcpt = svc->srv_parts[srhi->srhi_idx]; + + mutex_lock(&svcpt->scp_mutex); + spin_lock(&svcpt->scp_lock); + + rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, srhi->srhi_seq); + + if (rc == 0) { + struct timespec64 arrival, sent, arrivaldiff; + char nidstr[LNET_NIDSTR_SIZE]; + + req = srhi->srhi_req; + + arrival.tv_sec = req->rq_arrival_time.tv_sec; + arrival.tv_nsec = req->rq_arrival_time.tv_nsec; + sent.tv_sec = req->rq_sent; + sent.tv_nsec = 0; + arrivaldiff = timespec64_sub(sent, arrival); + + /* Print common req fields. + * CAVEAT EMPTOR: we're racing with the service handler + * here. The request could contain any old crap, so you + * must be just as careful as the service's request + * parser. Currently I only print stuff here I know is OK + * to look at coz it was set up in request_in_callback()!!! + */ + seq_printf(s, + "%lld:%s:%s:x%llu:%d:%s:%lld.%06lld:%lld.%06llds(%+lld.0s) ", + req->rq_history_seq, + req->rq_export && req->rq_export->exp_obd ? + req->rq_export->exp_obd->obd_name : + libcfs_nid2str_r(req->rq_self, nidstr, + sizeof(nidstr)), + libcfs_id2str(req->rq_peer), req->rq_xid, + req->rq_reqlen, ptlrpc_rqphase2str(req), + (s64)req->rq_arrival_time.tv_sec, + (s64)(req->rq_arrival_time.tv_nsec / NSEC_PER_USEC), + (s64)arrivaldiff.tv_sec, + (s64)(arrivaldiff.tv_nsec / NSEC_PER_USEC), + (s64)(req->rq_sent - req->rq_deadline)); + if (svc->srv_ops.so_req_printer == NULL) + seq_printf(s, "\n"); + else + svc->srv_ops.so_req_printer(s, srhi->srhi_req); + } + + spin_unlock(&svcpt->scp_lock); + mutex_unlock(&svcpt->scp_mutex); + + return rc; +} + +static int +ptlrpc_lprocfs_svc_req_history_open(struct inode *inode, struct file *file) +{ + static const struct seq_operations sops = { + .start = ptlrpc_lprocfs_svc_req_history_start, + .stop = ptlrpc_lprocfs_svc_req_history_stop, + .next = ptlrpc_lprocfs_svc_req_history_next, + .show = ptlrpc_lprocfs_svc_req_history_show, + }; + struct seq_file *seqf; + int rc; + + rc = seq_open(file, &sops); + if (rc) + return rc; + + seqf = file->private_data; + seqf->private = inode->i_private; + return 0; +} + +/* See also lprocfs_rd_timeouts */ +static int ptlrpc_lprocfs_timeouts_seq_show(struct seq_file *m, void *n) +{ + struct ptlrpc_service *svc = m->private; + struct ptlrpc_service_part *svcpt; + time64_t worst_timestamp; + timeout_t cur_timeout; + timeout_t worst_timeout; + int i; + + if (AT_OFF) { + seq_printf(m, "adaptive timeouts off, using obd_timeout %u\n", + obd_timeout); + return 0; + } + + ptlrpc_service_for_each_part(svcpt, i, svc) { + cur_timeout = at_get(&svcpt->scp_at_estimate); + worst_timeout = svcpt->scp_at_estimate.at_worst_timeout_ever; + worst_timestamp = svcpt->scp_at_estimate.at_worst_timestamp; + + seq_printf(m, "%10s : cur %3u worst %3u (at %lld, %llds ago) ", + "service", cur_timeout, worst_timeout, + worst_timestamp, + ktime_get_real_seconds() - worst_timestamp); + + lprocfs_at_hist_helper(m, &svcpt->scp_at_estimate); + } + + 
return 0; +} + +LDEBUGFS_SEQ_FOPS_RO(ptlrpc_lprocfs_timeouts); + +static ssize_t high_priority_ratio_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + + return sprintf(buf, "%d\n", svc->srv_hpreq_ratio); +} + +static ssize_t high_priority_ratio_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + int rc; + unsigned long val; + + rc = kstrtoul(buffer, 10, &val); + if (rc < 0) + return rc; + + spin_lock(&svc->srv_lock); + svc->srv_hpreq_ratio = val; + spin_unlock(&svc->srv_lock); + + return count; +} +LUSTRE_RW_ATTR(high_priority_ratio); + +static struct attribute *ptlrpc_svc_attrs[] = { + &lustre_attr_threads_min.attr, + &lustre_attr_threads_started.attr, + &lustre_attr_threads_max.attr, + &lustre_attr_high_priority_ratio.attr, + NULL, +}; + +KOBJ_ATTRIBUTE_GROUPS(ptlrpc_svc); /* creates ptlrpc_svc_groups */ + +static void ptlrpc_sysfs_svc_release(struct kobject *kobj) +{ + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + + complete(&svc->srv_kobj_unregister); +} + +static struct kobj_type ptlrpc_svc_ktype = { + .default_groups = KOBJ_ATTR_GROUPS(ptlrpc_svc), + .sysfs_ops = &lustre_sysfs_ops, + .release = ptlrpc_sysfs_svc_release, +}; + +void ptlrpc_sysfs_unregister_service(struct ptlrpc_service *svc) +{ + /* Let's see if we had a chance at initialization first */ + if (svc->srv_kobj.kset) { + kobject_put(&svc->srv_kobj); + wait_for_completion(&svc->srv_kobj_unregister); + } +} + +int ptlrpc_sysfs_register_service(struct kset *parent, + struct ptlrpc_service *svc) +{ + svc->srv_kobj.kset = parent; + init_completion(&svc->srv_kobj_unregister); + return kobject_init_and_add(&svc->srv_kobj, &ptlrpc_svc_ktype, + &parent->kobj, "%s", svc->srv_name); +} + +void ptlrpc_ldebugfs_register_service(struct dentry *entry, + struct ptlrpc_service *svc) +{ + struct ldebugfs_vars ldebugfs_vars[] = { + { .name = "req_buffer_history_len", + .fops = &ptlrpc_lprocfs_req_buffer_history_len_fops, + .data = svc }, + { .name = "req_buffer_history_max", + .fops = &ptlrpc_lprocfs_req_buffer_history_max_fops, + .data = svc }, + { .name = "timeouts", + .fops = &ptlrpc_lprocfs_timeouts_fops, + .data = svc }, + { .name = "nrs_policies", + .fops = &ptlrpc_lprocfs_nrs_policies_fops, + .data = svc }, + { .name = "req_buffers_max", + .fops = &ptlrpc_lprocfs_req_buffers_max_fops, + .data = svc }, + { NULL } + }; + static const struct file_operations req_history_fops = { + .owner = THIS_MODULE, + .open = ptlrpc_lprocfs_svc_req_history_open, + .read = seq_read, + .llseek = seq_lseek, + .release = lprocfs_seq_release, + }; + + ptlrpc_ldebugfs_register(entry, svc->srv_name, "stats", + &svc->srv_debugfs_entry, &svc->srv_stats); + if (!svc->srv_debugfs_entry) + return; + + ldebugfs_add_vars(svc->srv_debugfs_entry, ldebugfs_vars, NULL); + + debugfs_create_file("req_history", 0400, svc->srv_debugfs_entry, svc, + &req_history_fops); +} + +void ptlrpc_lprocfs_register_obd(struct obd_device *obd) +{ + ptlrpc_ldebugfs_register(obd->obd_debugfs_entry, NULL, "stats", + &obd->obd_svc_debugfs_entry, + &obd->obd_svc_stats); +} +EXPORT_SYMBOL(ptlrpc_lprocfs_register_obd); + +void ptlrpc_lprocfs_rpc_sent(struct ptlrpc_request *req, long amount) +{ + struct lprocfs_stats *svc_stats; + __u32 op = lustre_msg_get_opc(req->rq_reqmsg); + int opc = opcode_offset(op); + + svc_stats 
= req->rq_import->imp_obd->obd_svc_stats;
+	if (svc_stats == NULL || opc <= 0)
+		return;
+
+	LASSERT(opc < LUSTRE_MAX_OPCODES);
+	if (!(op == LDLM_ENQUEUE || op == MDS_REINT))
+		lprocfs_counter_add(svc_stats, opc + EXTRA_MAX_OPCODES, amount);
+}
+
+void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes)
+{
+	struct lprocfs_stats *svc_stats;
+	int idx;
+
+	if (!req->rq_import)
+		return;
+
+	svc_stats = req->rq_import->imp_obd->obd_svc_stats;
+	if (!svc_stats)
+		return;
+
+	idx = lustre_msg_get_opc(req->rq_reqmsg);
+	switch (idx) {
+	case OST_READ:
+		idx = BRW_READ_BYTES + PTLRPC_LAST_CNTR;
+		break;
+	case OST_WRITE:
+		idx = BRW_WRITE_BYTES + PTLRPC_LAST_CNTR;
+		break;
+	default:
+		LASSERTF(0, "unsupported opcode %u\n", idx);
+		break;
+	}
+
+	lprocfs_counter_add(svc_stats, idx, bytes);
+}
+
+EXPORT_SYMBOL(ptlrpc_lprocfs_brw);
+
+void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc)
+{
+	debugfs_remove_recursive(svc->srv_debugfs_entry);
+
+	if (svc->srv_stats)
+		lprocfs_free_stats(&svc->srv_stats);
+}
+
+void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd)
+{
+	/* cleanup first to allow concurrent access to device's
+	 * stats via debugfs to complete safely
+	 */
+	lprocfs_obd_cleanup(obd);
+
+	debugfs_remove_recursive(obd->obd_svc_debugfs_entry);
+
+	if (obd->obd_svc_stats)
+		lprocfs_free_stats(&obd->obd_svc_stats);
+}
+EXPORT_SYMBOL(ptlrpc_lprocfs_unregister_obd);
+
+ssize_t ping_show(struct kobject *kobj, struct attribute *attr,
+		  char *buffer)
+{
+	struct obd_device *obd = container_of(kobj, struct obd_device,
+					      obd_kset.kobj);
+	struct obd_import *imp;
+	struct ptlrpc_request *req;
+	int rc;
+
+	ENTRY;
+	with_imp_locked(obd, imp, rc)
+		req = ptlrpc_prep_ping(imp);
+
+	if (rc)
+		RETURN(rc);
+	if (!req)
+		RETURN(-ENOMEM);
+
+	req->rq_send_state = LUSTRE_IMP_FULL;
+
+	rc = ptlrpc_queue_wait(req);
+	ptlrpc_req_finished(req);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ping_show);
+
+/* kept for older versions of the tools. */
+ssize_t ping_store(struct kobject *kobj, struct attribute *attr,
+		   const char *buffer, size_t count)
+{
+	int rc = ping_show(kobj, attr, (char *)buffer);
+
+	return (rc < 0) ? rc : count;
+}
+EXPORT_SYMBOL(ping_store);
+
+/* Write the connection UUID to this file to attempt to connect to that node.
+ * The connection UUID is a node's primary NID. For example,
+ * "echo connection=192.168.0.1@tcp0::instance > .../import".
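
The service kobject teardown a little earlier relies on a common pattern: kobject_put() only drops our own reference, so unregistration waits on a completion that the ktype's release() callback fires once the last sysfs user is gone. A bare-bones sketch of the handshake (type and function names are hypothetical):

#include <linux/kobject.h>
#include <linux/completion.h>

struct my_service {
	struct kobject		kobj;
	struct completion	unregister_done;
};

static void my_service_release(struct kobject *kobj)
{
	struct my_service *svc = container_of(kobj, struct my_service, kobj);

	complete(&svc->unregister_done);	/* last reference is gone */
}

static struct kobj_type my_service_ktype = {
	.release = my_service_release,
};

static int my_service_register(struct my_service *svc, struct kobject *parent)
{
	/* the completion must exist before the kobject becomes visible */
	init_completion(&svc->unregister_done);
	return kobject_init_and_add(&svc->kobj, &my_service_ktype, parent,
				    "my_service");
}

static void my_service_unregister(struct my_service *svc)
{
	kobject_put(&svc->kobj);		/* drop our reference */
	wait_for_completion(&svc->unregister_done);
}
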
+ */ +ssize_t +ldebugfs_import_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + struct obd_import *imp; + char *kbuf = NULL; + char *uuid; + char *ptr; + int do_reconn = 1; + const char prefix[] = "connection="; + const int prefix_len = sizeof(prefix) - 1; + int rc = 0; + + if (count > PAGE_SIZE - 1 || count <= prefix_len) + return -EINVAL; + + OBD_ALLOC(kbuf, count + 1); + if (kbuf == NULL) + return -ENOMEM; + + if (copy_from_user(kbuf, buffer, count)) + GOTO(out, rc = -EFAULT); + + kbuf[count] = 0; + + /* only support connection=uuid::instance now */ + if (strncmp(prefix, kbuf, prefix_len) != 0) + GOTO(out, rc = -EINVAL); + + with_imp_locked(obd, imp, rc) { + uuid = kbuf + prefix_len; + ptr = strstr(uuid, "::"); + if (ptr) { + u32 inst; + int rc; + + *ptr = 0; + do_reconn = 0; + ptr += 2; /* Skip :: */ + rc = kstrtouint(ptr, 10, &inst); + if (rc) { + CERROR("config: wrong instance # %s\n", ptr); + } else if (inst != imp->imp_connect_data.ocd_instance) { + CDEBUG(D_INFO, + "IR: %s is connecting to an obsoleted target(%u/%u), reconnecting...\n", + imp->imp_obd->obd_name, + imp->imp_connect_data.ocd_instance, + inst); + do_reconn = 1; + } else { + CDEBUG(D_INFO, + "IR: %s has already been connecting to " + "new target(%u)\n", + imp->imp_obd->obd_name, inst); + } + } + + if (do_reconn) + ptlrpc_recover_import(imp, uuid, 1); + } + +out: + OBD_FREE(kbuf, count + 1); + return rc ?: count; +} +EXPORT_SYMBOL(ldebugfs_import_seq_write); + +int lprocfs_pinger_recov_seq_show(struct seq_file *m, void *n) +{ + struct obd_device *obd = m->private; + struct obd_import *imp; + int rc; + + with_imp_locked(obd, imp, rc) + seq_printf(m, "%d\n", !imp->imp_no_pinger_recover); + + return rc; +} +EXPORT_SYMBOL(lprocfs_pinger_recov_seq_show); + +ssize_t +lprocfs_pinger_recov_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + struct obd_import *imp; + bool val; + int rc; + + rc = kstrtobool_from_user(buffer, count, &val); + if (rc < 0) + return rc; + + with_imp_locked(obd, imp, rc) { + spin_lock(&imp->imp_lock); + imp->imp_no_pinger_recover = !val; + spin_unlock(&imp->imp_lock); + } + + return rc ?: count; +} +EXPORT_SYMBOL(lprocfs_pinger_recov_seq_write); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/niobuf.c b/drivers/staging/lustrefsx/lustre/ptlrpc/niobuf.c new file mode 100644 index 0000000000000..cfd16ff3ab877 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/niobuf.c @@ -0,0 +1,1028 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
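
ldebugfs_import_seq_write() above accepts input of the form "connection=<uuid>::<instance>". A stand-alone user-space sketch of that parse, simplified and with invented names (the real code also bounds the buffer and validates the instance number against the import):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int parse_connection(char *buf, char **uuid, unsigned int *inst)
{
	static const char prefix[] = "connection=";
	char *sep;

	if (strncmp(buf, prefix, sizeof(prefix) - 1) != 0)
		return -1;

	*uuid = buf + sizeof(prefix) - 1;
	sep = strstr(*uuid, "::");
	if (!sep) {
		*inst = 0;	/* no instance: plain reconnect request */
		return 0;
	}
	*sep = '\0';				/* terminate the UUID */
	*inst = (unsigned int)strtoul(sep + 2, NULL, 10);
	return 0;
}

int main(void)
{
	char line[] = "connection=192.168.0.1@tcp0::7";
	char *uuid;
	unsigned int inst;

	if (parse_connection(line, &uuid, &inst) == 0)
		printf("uuid %s instance %u\n", uuid, inst);
	return 0;
}
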
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include +#include +#include +#include +#include +#include +#include "ptlrpc_internal.h" +#include /* for CFS_FAIL_PTLRPC_OST_BULK_CB2 */ + +/** + * Helper function. Sends \a len bytes from \a base at offset \a offset + * over \a conn connection to portal \a portal. + * Returns 0 on success or error code. + */ +static int ptl_send_buf(struct lnet_handle_md *mdh, void *base, int len, + enum lnet_ack_req ack, struct ptlrpc_cb_id *cbid, + lnet_nid_t self, struct lnet_process_id peer_id, + int portal, __u64 xid, unsigned int offset, + struct lnet_handle_md *bulk_cookie) +{ + int rc; + struct lnet_md md; + ENTRY; + + LASSERT (portal != 0); + CDEBUG (D_INFO, "peer_id %s\n", libcfs_id2str(peer_id)); + md.start = base; + md.length = len; + md.threshold = (ack == LNET_ACK_REQ) ? 2 : 1; + md.options = PTLRPC_MD_OPTIONS; + md.user_ptr = cbid; + md.handler = ptlrpc_handler; + LNetInvalidateMDHandle(&md.bulk_handle); + + if (bulk_cookie) { + md.bulk_handle = *bulk_cookie; + md.options |= LNET_MD_BULK_HANDLE; + } + + if (unlikely(ack == LNET_ACK_REQ && + OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_ACK, OBD_FAIL_ONCE))){ + /* don't ask for the ack to simulate failing client */ + ack = LNET_NOACK_REQ; + } + + rc = LNetMDBind(&md, LNET_UNLINK, mdh); + if (unlikely(rc != 0)) { + CERROR ("LNetMDBind failed: %d\n", rc); + LASSERT (rc == -ENOMEM); + RETURN (-ENOMEM); + } + + CDEBUG(D_NET, "Sending %d bytes to portal %d, xid %lld, offset %u\n", + len, portal, xid, offset); + + percpu_ref_get(&ptlrpc_pending); + + rc = LNetPut(self, *mdh, ack, + peer_id, portal, xid, offset, 0); + if (unlikely(rc != 0)) { + int rc2; + /* We're going to get an UNLINK event when I unlink below, + * which will complete just like any other failed send, so + * I fall through and return success here! */ + CERROR("LNetPut(%s, %d, %lld) failed: %d\n", + libcfs_id2str(peer_id), portal, xid, rc); + rc2 = LNetMDUnlink(*mdh); + LASSERTF(rc2 == 0, "rc2 = %d\n", rc2); + } + + RETURN (0); +} + +#define mdunlink_iterate_helper(mds, count) \ + __mdunlink_iterate_helper(mds, count, false) +static void __mdunlink_iterate_helper(struct lnet_handle_md *bd_mds, + int count, bool discard) +{ + int i; + + for (i = 0; i < count; i++) + __LNetMDUnlink(bd_mds[i], discard); +} + +#ifdef HAVE_SERVER_SUPPORT +/** + * Prepare bulk descriptor for specified incoming request \a req that + * can fit \a nfrags * pages. \a type is bulk type. \a portal is where + * the bulk to be sent. Used on server-side after request was already + * received. + * Returns pointer to newly allocatrd initialized bulk descriptor or NULL on + * error. 
+ */ +struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_exp(struct ptlrpc_request *req, + unsigned nfrags, unsigned max_brw, + unsigned int type, + unsigned portal, + const struct ptlrpc_bulk_frag_ops + *ops) +{ + struct obd_export *exp = req->rq_export; + struct ptlrpc_bulk_desc *desc; + + ENTRY; + LASSERT(ptlrpc_is_bulk_op_active(type)); + + desc = ptlrpc_new_bulk(nfrags, max_brw, type, portal, ops); + if (desc == NULL) + RETURN(NULL); + + desc->bd_export = class_export_get(exp); + desc->bd_req = req; + + desc->bd_cbid.cbid_fn = server_bulk_callback; + desc->bd_cbid.cbid_arg = desc; + + /* NB we don't assign rq_bulk here; server-side requests are + * re-used, and the handler frees the bulk desc explicitly. */ + + return desc; +} +EXPORT_SYMBOL(ptlrpc_prep_bulk_exp); + +/** + * Starts bulk transfer for descriptor \a desc on the server. + * Returns 0 on success or error code. + */ +int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc) +{ + struct obd_export *exp = desc->bd_export; + lnet_nid_t self_nid; + struct lnet_process_id peer_id; + int rc = 0; + __u64 mbits; + int posted_md; + int total_md; + struct lnet_md md; + ENTRY; + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_BULK_PUT_NET)) + RETURN(0); + + /* NB no locking required until desc is on the network */ + LASSERT(ptlrpc_is_bulk_op_active(desc->bd_type)); + + LASSERT(desc->bd_cbid.cbid_fn == server_bulk_callback); + LASSERT(desc->bd_cbid.cbid_arg == desc); + + /* + * Multi-Rail: get the preferred self and peer NIDs from the + * request, so they are based on the route taken by the + * message. + */ + self_nid = desc->bd_req->rq_self; + peer_id = desc->bd_req->rq_source; + + /* NB total length may be 0 for a read past EOF, so we send 0 + * length bulks, since the client expects bulk events. + * + * The client may not need all of the bulk mbits for the RPC. The RPC + * used the mbits of the highest bulk mbits needed, and the server masks + * off high bits to get bulk count for this RPC. LU-1431 */ + mbits = desc->bd_req->rq_mbits & ~((__u64)desc->bd_md_max_brw - 1); + total_md = desc->bd_req->rq_mbits - mbits + 1; + desc->bd_refs = total_md; + desc->bd_failure = 0; + + md.user_ptr = &desc->bd_cbid; + md.handler = ptlrpc_handler; + md.threshold = 2; /* SENT and ACK/REPLY */ + + for (posted_md = 0; posted_md < total_md; mbits++) { + md.options = PTLRPC_MD_OPTIONS; + + /* NB it's assumed that source and sink buffer frags are + * page-aligned. 
Otherwise we'd have to send client bulk + * sizes over and split server buffer accordingly */ + ptlrpc_fill_bulk_md(&md, desc, posted_md); + rc = LNetMDBind(&md, LNET_UNLINK, &desc->bd_mds[posted_md]); + if (rc != 0) { + CERROR("%s: LNetMDBind failed for MD %u: rc = %d\n", + exp->exp_obd->obd_name, posted_md, rc); + LASSERT(rc == -ENOMEM); + if (posted_md == 0) { + desc->bd_md_count = 0; + RETURN(-ENOMEM); + } + break; + } + percpu_ref_get(&ptlrpc_pending); + + /* sanity.sh 224c: lets skip last md */ + if (posted_md == desc->bd_md_max_brw - 1) + OBD_FAIL_CHECK_RESET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB3, + CFS_FAIL_PTLRPC_OST_BULK_CB2); + + /* Network is about to get at the memory */ + if (ptlrpc_is_bulk_put_source(desc->bd_type)) + rc = LNetPut(self_nid, desc->bd_mds[posted_md], + LNET_ACK_REQ, peer_id, + desc->bd_portal, mbits, 0, 0); + else + rc = LNetGet(self_nid, desc->bd_mds[posted_md], + peer_id, desc->bd_portal, mbits, 0, false); + + posted_md++; + if (rc != 0) { + CERROR("%s: failed bulk transfer with %s:%u x%llu: " + "rc = %d\n", exp->exp_obd->obd_name, + libcfs_id2str(peer_id), desc->bd_portal, + mbits, rc); + break; + } + } + + if (rc != 0) { + /* Can't send, so we unlink the MD bound above. The UNLINK + * event this creates will signal completion with failure, + * so we return SUCCESS here! */ + spin_lock(&desc->bd_lock); + desc->bd_refs -= total_md - posted_md; + spin_unlock(&desc->bd_lock); + LASSERT(desc->bd_refs >= 0); + + mdunlink_iterate_helper(desc->bd_mds, posted_md); + RETURN(0); + } + + CDEBUG(D_NET, "Transferring %u pages %u bytes via portal %d " + "id %s mbits %#llx-%#llx\n", desc->bd_iov_count, + desc->bd_nob, desc->bd_portal, libcfs_id2str(peer_id), + mbits - posted_md, mbits - 1); + + RETURN(0); +} + +/** + * Server side bulk abort. Idempotent. Not thread-safe (i.e. only + * serialises with completion callback) + */ +void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *desc) +{ + LASSERT(!in_interrupt()); /* might sleep */ + + if (!ptlrpc_server_bulk_active(desc)) /* completed or */ + return; /* never started */ + + /* We used to poison the pages with 0xab here because we did not want to + * send any meaningful data over the wire for evicted clients (bug 9297) + * However, this is no longer safe now that we use the page cache on the + * OSS (bug 20560) */ + + /* The unlink ensures the callback happens ASAP and is the last + * one. If it fails, it must be because completion just happened, + * but we must still wait_event_idle_timeout() in this case, to give + * us a chance to run server_bulk_callback() + */ + __mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw, true); + + for (;;) { + /* Network access will complete in finite time but the HUGE + * timeout lets us CWARN for visibility of sluggish NALs */ + int seconds = PTLRPC_REQ_LONG_UNLINK; + + while (seconds > 0 && + wait_event_idle_timeout(desc->bd_waitq, + !ptlrpc_server_bulk_active(desc), + cfs_time_seconds(1)) == 0) + seconds -= 1; + if (seconds > 0) + return; + + CWARN("Unexpectedly long timeout: desc %p\n", desc); + } +} +#endif /* HAVE_SERVER_SUPPORT */ + +/** + * Register bulk at the sender for later transfer. + * Returns 0 on success or error code. 
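
Both ptlrpc_abort_bulk() above and the client-side unregister path that follows use the same bounded-wait idiom: poll for completion in one-second slices so a sluggish network produces a periodic console warning instead of a silent hang. A generic sketch of the idiom, assuming a caller-supplied wait queue and "still active" predicate:

#include <linux/wait.h>
#include <linux/jiffies.h>
#include <linux/printk.h>

#define LONG_UNLINK 300		/* seconds before we start warning */

static void wait_for_unlink(wait_queue_head_t *wq, bool (*active)(void *),
			    void *obj)
{
	for (;;) {
		int seconds = LONG_UNLINK;

		/* network access completes in finite time, but poll in
		 * one-second slices so we notice when it is sluggish */
		while (seconds > 0 &&
		       wait_event_idle_timeout(*wq, !active(obj), HZ) == 0)
			seconds -= 1;
		if (seconds > 0)
			return;		/* completed within the window */

		pr_warn("unexpectedly long unlink for %p\n", obj);
	}
}
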
+ */ +int ptlrpc_register_bulk(struct ptlrpc_request *req) +{ + struct ptlrpc_bulk_desc *desc = req->rq_bulk; + struct lnet_processid peer; + int rc = 0; + int posted_md; + int total_md; + __u64 mbits; + struct lnet_me *me; + struct lnet_md md; + ENTRY; + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_BULK_GET_NET)) + RETURN(0); + + /* NB no locking required until desc is on the network */ + LASSERT(desc->bd_nob > 0); + LASSERT(desc->bd_md_max_brw <= PTLRPC_BULK_OPS_COUNT); + LASSERT(desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES); + LASSERT(desc->bd_req != NULL); + LASSERT(ptlrpc_is_bulk_op_passive(desc->bd_type)); + + /* cleanup the state of the bulk for it will be reused */ + if (req->rq_resend || req->rq_send_state == LUSTRE_IMP_REPLAY) + desc->bd_nob_transferred = 0; + else if (desc->bd_nob_transferred != 0) + /* If the network failed after an RPC was sent, this condition + * could happen. Rather than assert (was here before), return + * an EIO error. */ + RETURN(-EIO); + + desc->bd_failure = 0; + + peer = desc->bd_import->imp_connection->c_peer; + + LASSERT(desc->bd_cbid.cbid_fn == client_bulk_callback); + LASSERT(desc->bd_cbid.cbid_arg == desc); + + total_md = desc->bd_md_count; + /* rq_mbits is matchbits of the final bulk */ + mbits = req->rq_mbits - desc->bd_md_count + 1; + + LASSERTF(mbits == (req->rq_mbits & PTLRPC_BULK_OPS_MASK), + "first mbits = x%llu, last mbits = x%llu\n", + mbits, req->rq_mbits); + LASSERTF(!(desc->bd_registered && + req->rq_send_state != LUSTRE_IMP_REPLAY) || + mbits != desc->bd_last_mbits, + "registered: %d rq_mbits: %llu bd_last_mbits: %llu\n", + desc->bd_registered, mbits, desc->bd_last_mbits); + + desc->bd_registered = 1; + desc->bd_last_mbits = mbits; + desc->bd_refs = total_md; + md.user_ptr = &desc->bd_cbid; + md.handler = ptlrpc_handler; + md.threshold = 1; /* PUT or GET */ + + for (posted_md = 0; posted_md < desc->bd_md_count; + posted_md++, mbits++) { + md.options = PTLRPC_MD_OPTIONS | + (ptlrpc_is_bulk_op_get(desc->bd_type) ? + LNET_MD_OP_GET : LNET_MD_OP_PUT); + ptlrpc_fill_bulk_md(&md, desc, posted_md); + + if (posted_md > 0 && posted_md + 1 == desc->bd_md_count && + OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_BULK_ATTACH)) { + rc = -ENOMEM; + } else { + me = LNetMEAttach(desc->bd_portal, &peer, mbits, 0, + LNET_UNLINK, LNET_INS_AFTER); + rc = PTR_ERR_OR_ZERO(me); + } + if (rc != 0) { + CERROR("%s: LNetMEAttach failed x%llu/%d: rc = %d\n", + desc->bd_import->imp_obd->obd_name, mbits, + posted_md, rc); + break; + } + percpu_ref_get(&ptlrpc_pending); + + /* About to let the network at it... 
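
The match-bits bookkeeping above is easiest to see with numbers: the RPC carries only the highest mbits value the client used, and since bd_md_max_brw is a power of two the server can mask off the low bits to recover both the first mbits and the MD count, while the client-side mirror computes the first mbits as rq_mbits - bd_md_count + 1. An illustrative check (values invented):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t rq_mbits = 0x1007;	/* highest mbits used by the client */
	uint64_t max_brw = 8;		/* bd_md_max_brw, a power of two */
	uint64_t md_count = 8;		/* bd_md_count on the client */

	/* server side: mask off the low bits (LU-1431) */
	uint64_t first = rq_mbits & ~(max_brw - 1);
	uint64_t total_md = rq_mbits - first + 1;

	assert(first == 0x1000);
	assert(total_md == 8);			/* MDs 0x1000 .. 0x1007 */

	/* client side arrives at the same first matchbits */
	assert(first == rq_mbits - md_count + 1);
	return 0;
}
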
*/ + rc = LNetMDAttach(me, &md, LNET_UNLINK, + &desc->bd_mds[posted_md]); + if (rc != 0) { + CERROR("%s: LNetMDAttach failed x%llu/%d: rc = %d\n", + desc->bd_import->imp_obd->obd_name, mbits, + posted_md, rc); + break; + } + } + + if (rc != 0) { + LASSERT(rc == -ENOMEM); + spin_lock(&desc->bd_lock); + desc->bd_refs -= total_md - posted_md; + spin_unlock(&desc->bd_lock); + LASSERT(desc->bd_refs >= 0); + mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw); + req->rq_status = -ENOMEM; + desc->bd_registered = 0; + RETURN(-ENOMEM); + } + + spin_lock(&desc->bd_lock); + /* Holler if peer manages to touch buffers before he knows the mbits */ + if (desc->bd_refs != total_md) + CWARN("%s: Peer %s touched %d buffers while I registered\n", + desc->bd_import->imp_obd->obd_name, libcfs_idstr(&peer), + total_md - desc->bd_refs); + spin_unlock(&desc->bd_lock); + + CDEBUG(D_NET, + "Setup %u bulk %s buffers: %u pages %u bytes, mbits x%#llx-%#llx, portal %u\n", + desc->bd_refs, + ptlrpc_is_bulk_op_get(desc->bd_type) ? "get-source" : "put-sink", + desc->bd_iov_count, desc->bd_nob, + desc->bd_last_mbits, req->rq_mbits, desc->bd_portal); + + RETURN(0); +} + +/** + * Disconnect a bulk desc from the network. Idempotent. Not + * thread-safe (i.e. only interlocks with completion callback). + * Returns 1 on success or 0 if network unregistration failed for whatever + * reason. + */ +int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async) +{ + struct ptlrpc_bulk_desc *desc = req->rq_bulk; + ENTRY; + + LASSERT(!in_interrupt()); /* might sleep */ + + if (desc) + desc->bd_registered = 0; + + /* Let's setup deadline for reply unlink. */ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK) && + async && req->rq_bulk_deadline == 0 && cfs_fail_val == 0) + req->rq_bulk_deadline = ktime_get_real_seconds() + + PTLRPC_REQ_LONG_UNLINK; + + if (ptlrpc_client_bulk_active(req) == 0) /* completed or */ + RETURN(1); /* never registered */ + + LASSERT(desc->bd_req == req); /* bd_req NULL until registered */ + + /* the unlink ensures the callback happens ASAP and is the last + * one. If it fails, it must be because completion just happened, + * but we must still wait_event_idle_timeout() in this case to give + * us a chance to run client_bulk_callback() + */ + mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw); + + if (ptlrpc_client_bulk_active(req) == 0) /* completed or */ + RETURN(1); /* never registered */ + + /* Move to "Unregistering" phase as bulk was not unlinked yet. */ + ptlrpc_rqphase_move(req, RQ_PHASE_UNREG_BULK); + + /* Do not wait for unlink to finish. */ + if (async) + RETURN(0); + + for (;;) { + /* The wq argument is ignored by user-space wait_event macros */ + wait_queue_head_t *wq = (req->rq_set != NULL) ? + &req->rq_set->set_waitq : + &req->rq_reply_waitq; + /* + * Network access will complete in finite time but the HUGE + * timeout lets us CWARN for visibility of sluggish NALs. 
+ */ + int seconds = PTLRPC_REQ_LONG_UNLINK; + + while (seconds > 0 && + wait_event_idle_timeout(*wq, + !ptlrpc_client_bulk_active(req), + cfs_time_seconds(1)) == 0) + seconds -= 1; + if (seconds > 0) { + ptlrpc_rqphase_move(req, req->rq_next_phase); + RETURN(1); + } + + DEBUG_REQ(D_WARNING, req, "Unexpectedly long timeout: desc %p", + desc); + } + RETURN(0); +} + +static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags) +{ + struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; + struct ptlrpc_service *svc = svcpt->scp_service; + timeout_t service_timeout; + + service_timeout = clamp_t(timeout_t, ktime_get_real_seconds() - + req->rq_arrival_time.tv_sec, 1, + (AT_OFF ? obd_timeout * 3 / 2 : at_max)); + if (!(flags & PTLRPC_REPLY_EARLY) && + (req->rq_type != PTL_RPC_MSG_ERR) && + (req->rq_reqmsg != NULL) && + !(lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_RESENT | MSG_REPLAY | + MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE))) { + /* early replies, errors and recovery requests don't count + * toward our service time estimate + */ + timeout_t oldse = at_measured(&svcpt->scp_at_estimate, + service_timeout); + + if (oldse != 0) { + DEBUG_REQ(D_ADAPTTO, req, + "svc %s changed estimate from %d to %d", + svc->srv_name, oldse, + at_get(&svcpt->scp_at_estimate)); + } + } + /* Report actual service time for client latency calc */ + lustre_msg_set_service_timeout(req->rq_repmsg, service_timeout); + /* Report service time estimate for future client reqs, but report 0 + * (to be ignored by client) if it's an error reply during recovery. + * b=15815 + */ + if (req->rq_type == PTL_RPC_MSG_ERR && + (req->rq_export == NULL || + req->rq_export->exp_obd->obd_recovering)) { + lustre_msg_set_timeout(req->rq_repmsg, 0); + } else { + timeout_t timeout; + + if (req->rq_export && req->rq_reqmsg != NULL && + (flags & PTLRPC_REPLY_EARLY) && + lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_REPLAY | MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE)) { + struct obd_device *exp_obd = req->rq_export->exp_obd; + + timeout = ktime_get_real_seconds() - + req->rq_arrival_time.tv_sec + + min_t(timeout_t, at_extra, + exp_obd->obd_recovery_timeout / 4); + } else { + timeout = at_get(&svcpt->scp_at_estimate); + } + lustre_msg_set_timeout(req->rq_repmsg, timeout); + } + + if (req->rq_reqmsg && + !(lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) { + CDEBUG(D_ADAPTTO, "No early reply support: flags=%#x " + "req_flags=%#x magic=%x/%x len=%d\n", + flags, lustre_msg_get_flags(req->rq_reqmsg), + lustre_msg_get_magic(req->rq_reqmsg), + lustre_msg_get_magic(req->rq_repmsg), req->rq_replen); + } +} + +/** + * Send request reply from request \a req reply buffer. + * \a flags defines reply types + * Returns 0 on success or error code + */ +int ptlrpc_send_reply(struct ptlrpc_request *req, int flags) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct ptlrpc_connection *conn; + int rc; + + /* We must already have a reply buffer (only ptlrpc_error() may be + * called without one). The reply generated by sptlrpc layer (e.g. + * error notify, etc.) might have NULL rq->reqmsg; Otherwise we must + * have a request buffer which is either the actual (swabbed) incoming + * request, or a saved copy if this is a req saved in + * target_queue_final_reply(). 
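
The service_timeout computation above clamps the request's wallclock age into [1, at_max] (or [1, 3/2 * obd_timeout] when adaptive timeouts are off) before it is reported back to the client. A small worked example of just that arithmetic, with invented numbers:

#include <assert.h>

#define clamp_val(v, lo, hi) ((v) < (lo) ? (lo) : (v) > (hi) ? (hi) : (v))

int main(void)
{
	long arrival = 1000;	/* rq_arrival_time.tv_sec */
	long now = 1042;	/* ktime_get_real_seconds() */
	long at_max = 600;	/* upper clamp with adaptive timeouts on */
	long service_timeout = clamp_val(now - arrival, 1, at_max);

	assert(service_timeout == 42);	/* actual service time, in bounds */
	return 0;
}
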
+ */ + LASSERT (req->rq_no_reply == 0); + LASSERT (req->rq_reqbuf != NULL); + LASSERT (rs != NULL); + LASSERT ((flags & PTLRPC_REPLY_MAYBE_DIFFICULT) || !rs->rs_difficult); + LASSERT (req->rq_repmsg != NULL); + LASSERT (req->rq_repmsg == rs->rs_msg); + LASSERT (rs->rs_cb_id.cbid_fn == reply_out_callback); + LASSERT (rs->rs_cb_id.cbid_arg == rs); + + /* There may be no rq_export during failover */ + + if (unlikely(req->rq_export && req->rq_export->exp_obd && + req->rq_export->exp_obd->obd_fail)) { + /* Failed obd's only send ENODEV */ + req->rq_type = PTL_RPC_MSG_ERR; + req->rq_status = -ENODEV; + CDEBUG(D_HA, "sending ENODEV from failed obd %d\n", + req->rq_export->exp_obd->obd_minor); + } + + if (req->rq_type != PTL_RPC_MSG_ERR) + req->rq_type = PTL_RPC_MSG_REPLY; + + lustre_msg_set_type(req->rq_repmsg, req->rq_type); + lustre_msg_set_status(req->rq_repmsg, + ptlrpc_status_hton(req->rq_status)); + lustre_msg_set_opc(req->rq_repmsg, + req->rq_reqmsg ? lustre_msg_get_opc(req->rq_reqmsg) : 0); + + target_pack_pool_reply(req); + + ptlrpc_at_set_reply(req, flags); + + if (req->rq_export == NULL || req->rq_export->exp_connection == NULL) + conn = ptlrpc_connection_get(req->rq_peer, req->rq_self, NULL); + else + conn = ptlrpc_connection_addref(req->rq_export->exp_connection); + + if (unlikely(conn == NULL)) { + CERROR("not replying on NULL connection\n"); /* bug 9635 */ + return -ENOTCONN; + } + ptlrpc_rs_addref(rs); /* +1 ref for the network */ + + rc = sptlrpc_svc_wrap_reply(req); + if (unlikely(rc)) + goto out; + + req->rq_sent = ktime_get_real_seconds(); + + rc = ptl_send_buf(&rs->rs_md_h, rs->rs_repbuf, rs->rs_repdata_len, + (rs->rs_difficult && !rs->rs_no_ack) ? + LNET_ACK_REQ : LNET_NOACK_REQ, + &rs->rs_cb_id, req->rq_self, req->rq_source, + ptlrpc_req2svc(req)->srv_rep_portal, + req->rq_rep_mbits ? req->rq_rep_mbits : req->rq_xid, + req->rq_reply_off, NULL); +out: + if (unlikely(rc != 0)) + ptlrpc_req_drop_rs(req); + ptlrpc_connection_put(conn); + return rc; +} + +int ptlrpc_reply (struct ptlrpc_request *req) +{ + if (req->rq_no_reply) + return 0; + else + return (ptlrpc_send_reply(req, 0)); +} + +/** + * For request \a req send an error reply back. Create empty + * reply buffers if necessary. + */ +int ptlrpc_send_error(struct ptlrpc_request *req, int may_be_difficult) +{ + int rc; + ENTRY; + + if (req->rq_no_reply) + RETURN(0); + + if (!req->rq_repmsg) { + rc = lustre_pack_reply(req, 1, NULL, NULL); + if (rc) + RETURN(rc); + } + + if (req->rq_status != -ENOSPC && req->rq_status != -EACCES && + req->rq_status != -EPERM && req->rq_status != -ENOENT && + req->rq_status != -EINPROGRESS && req->rq_status != -EDQUOT) + req->rq_type = PTL_RPC_MSG_ERR; + + rc = ptlrpc_send_reply(req, may_be_difficult); + RETURN(rc); +} + +int ptlrpc_error(struct ptlrpc_request *req) +{ + return ptlrpc_send_error(req, 0); +} + +/** + * Send request \a request. + * if \a noreply is set, don't expect any reply back and don't set up + * reply buffers. + * Returns 0 on success or error code. 
+ */ +int ptl_send_rpc(struct ptlrpc_request *request, int noreply) +{ + int rc; + __u32 opc; + int mpflag = 0; + bool rep_mbits = false; + struct lnet_handle_md bulk_cookie; + struct lnet_processid peer; + struct ptlrpc_connection *connection; + struct lnet_me *reply_me = NULL; + struct lnet_md reply_md; + struct obd_import *imp = request->rq_import; + struct obd_device *obd = imp->imp_obd; + ENTRY; + + LNetInvalidateMDHandle(&bulk_cookie); + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_RPC)) + RETURN(0); + + if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DELAY_RECOV) && + lustre_msg_get_opc(request->rq_reqmsg) == MDS_CONNECT && + strcmp(obd->obd_type->typ_name, LUSTRE_OSP_NAME) == 0)) { + RETURN(0); + } + + LASSERT(request->rq_type == PTL_RPC_MSG_REQUEST); + LASSERT(request->rq_wait_ctx == 0); + + /* If this is a re-transmit, we're required to have disengaged + * cleanly from the previous attempt */ + LASSERT(!request->rq_receiving_reply); + LASSERT(!((lustre_msg_get_flags(request->rq_reqmsg) & MSG_REPLAY) && + (imp->imp_state == LUSTRE_IMP_FULL))); + + if (unlikely(obd != NULL && obd->obd_fail)) { + CDEBUG(D_HA, "muting rpc for failed imp obd %s\n", + obd->obd_name); + /* this prevents us from waiting in ptlrpc_queue_wait */ + spin_lock(&request->rq_lock); + request->rq_err = 1; + spin_unlock(&request->rq_lock); + request->rq_status = -ENODEV; + RETURN(-ENODEV); + } + + connection = imp->imp_connection; + + lustre_msg_set_handle(request->rq_reqmsg, + &imp->imp_remote_handle); + lustre_msg_set_type(request->rq_reqmsg, PTL_RPC_MSG_REQUEST); + lustre_msg_set_conn_cnt(request->rq_reqmsg, + imp->imp_conn_cnt); + lustre_msghdr_set_flags(request->rq_reqmsg, + imp->imp_msghdr_flags); + + /* If it's the first time to resend the request for EINPROGRESS, + * we need to allocate a new XID (see after_reply()), it's different + * from the resend for reply timeout. 
*/ + if (request->rq_nr_resend != 0 && + list_empty(&request->rq_unreplied_list)) { + __u64 min_xid = 0; + /* resend for EINPROGRESS, allocate new xid to avoid reply + * reconstruction */ + spin_lock(&imp->imp_lock); + ptlrpc_assign_next_xid_nolock(request); + min_xid = ptlrpc_known_replied_xid(imp); + spin_unlock(&imp->imp_lock); + + lustre_msg_set_last_xid(request->rq_reqmsg, min_xid); + DEBUG_REQ(D_RPCTRACE, request, + "Allocating new XID for resend on EINPROGRESS"); + } + + opc = lustre_msg_get_opc(request->rq_reqmsg); + if (opc != OST_CONNECT && opc != MDS_CONNECT && + opc != MGS_CONNECT && OCD_HAS_FLAG(&imp->imp_connect_data, FLAGS2)) + rep_mbits = imp->imp_connect_data.ocd_connect_flags2 & + OBD_CONNECT2_REP_MBITS; + + if ((request->rq_bulk != NULL) || rep_mbits) { + ptlrpc_set_mbits(request); + lustre_msg_set_mbits(request->rq_reqmsg, request->rq_mbits); + } + + if (list_empty(&request->rq_unreplied_list) || + request->rq_xid <= imp->imp_known_replied_xid) { + DEBUG_REQ(D_ERROR, request, + "xid=%llu, replied=%llu, list_empty=%d", + request->rq_xid, imp->imp_known_replied_xid, + list_empty(&request->rq_unreplied_list)); + LBUG(); + } + + /** For enabled AT all request should have AT_SUPPORT in the + * FULL import state when OBD_CONNECT_AT is set */ + LASSERT(AT_OFF || imp->imp_state != LUSTRE_IMP_FULL || + (imp->imp_msghdr_flags & MSGHDR_AT_SUPPORT) || + !(imp->imp_connect_data.ocd_connect_flags & + OBD_CONNECT_AT)); + + if (request->rq_resend) { + lustre_msg_add_flags(request->rq_reqmsg, MSG_RESENT); + if (request->rq_resend_cb != NULL) + request->rq_resend_cb(request, &request->rq_async_args); + } + if (request->rq_memalloc) + mpflag = memalloc_noreclaim_save(); + + rc = sptlrpc_cli_wrap_request(request); + if (rc) + GOTO(out, rc); + + /* bulk register should be done after wrap_request() */ + if (request->rq_bulk != NULL) { + rc = ptlrpc_register_bulk (request); + if (rc != 0) + GOTO(cleanup_bulk, rc); + /* + * All the mds in the request will have the same cpt + * encoded in the cookie. So we can just get the first + * one. + */ + bulk_cookie = request->rq_bulk->bd_mds[0]; + } + + if (!noreply) { + LASSERT (request->rq_replen != 0); + if (request->rq_repbuf == NULL) { + LASSERT(request->rq_repdata == NULL); + LASSERT(request->rq_repmsg == NULL); + rc = sptlrpc_cli_alloc_repbuf(request, + request->rq_replen); + if (rc) { + /* this prevents us from looping in + * ptlrpc_queue_wait */ + spin_lock(&request->rq_lock); + request->rq_err = 1; + spin_unlock(&request->rq_lock); + request->rq_status = rc; + GOTO(cleanup_bulk, rc); + } + } else { + request->rq_repdata = NULL; + request->rq_repmsg = NULL; + } + + peer = connection->c_peer; + if (request->rq_bulk && + OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_BULK_REPLY_ATTACH)) { + reply_me = ERR_PTR(-ENOMEM); + } else { + reply_me = LNetMEAttach(request->rq_reply_portal, + &peer, + rep_mbits ? request->rq_mbits : + request->rq_xid, + 0, LNET_UNLINK, LNET_INS_AFTER); + } + + if (IS_ERR(reply_me)) { + rc = PTR_ERR(reply_me); + CERROR("LNetMEAttach failed: %d\n", rc); + LASSERT(rc == -ENOMEM); + GOTO(cleanup_bulk, rc = -ENOMEM); + } + } + + spin_lock(&request->rq_lock); + /* We are responsible for unlinking the reply buffer */ + request->rq_reply_unlinked = noreply; + request->rq_receiving_reply = !noreply; + /* Clear any flags that may be present from previous sends. 
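
The rep_mbits decision above is a two-level feature check: the peer must first have advertised support for the second connect-flag word (FLAGS2) before ocd_connect_flags2 may be interpreted at all. A sketch of that gating, with invented flag values:

#include <assert.h>
#include <stdint.h>

#define CONNECT_FLAGS2		(1ULL << 63)	/* in the first flag word */
#define CONNECT2_REP_MBITS	(1ULL << 8)	/* in the second flag word */

struct connect_data {
	uint64_t flags;		/* ocd_connect_flags */
	uint64_t flags2;	/* only valid if FLAGS2 is set above */
};

static int use_rep_mbits(const struct connect_data *ocd)
{
	return (ocd->flags & CONNECT_FLAGS2) &&
	       (ocd->flags2 & CONNECT2_REP_MBITS);
}

int main(void)
{
	struct connect_data ocd = {
		.flags	= CONNECT_FLAGS2,
		.flags2	= CONNECT2_REP_MBITS,
	};

	assert(use_rep_mbits(&ocd));
	return 0;
}
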
*/ + request->rq_req_unlinked = 0; + request->rq_replied = 0; + request->rq_err = 0; + request->rq_timedout = 0; + request->rq_net_err = 0; + request->rq_resend = 0; + request->rq_restart = 0; + request->rq_reply_truncated = 0; + spin_unlock(&request->rq_lock); + + if (!noreply) { + reply_md.start = request->rq_repbuf; + reply_md.length = request->rq_repbuf_len; + /* Allow multiple early replies */ + reply_md.threshold = LNET_MD_THRESH_INF; + /* Manage remote for early replies */ + reply_md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT | + LNET_MD_MANAGE_REMOTE | + LNET_MD_TRUNCATE; /* allow to make EOVERFLOW error */; + reply_md.user_ptr = &request->rq_reply_cbid; + reply_md.handler = ptlrpc_handler; + + /* We must see the unlink callback to set rq_reply_unlinked, + * so we can't auto-unlink */ + rc = LNetMDAttach(reply_me, &reply_md, LNET_RETAIN, + &request->rq_reply_md_h); + if (rc != 0) { + CERROR("LNetMDAttach failed: %d\n", rc); + LASSERT(rc == -ENOMEM); + spin_lock(&request->rq_lock); + /* ...but the MD attach didn't succeed... */ + request->rq_receiving_reply = 0; + spin_unlock(&request->rq_lock); + GOTO(cleanup_bulk, rc = -ENOMEM); + } + percpu_ref_get(&ptlrpc_pending); + + CDEBUG(D_NET, + "Setup reply buffer: %u bytes, xid %llu, portal %u\n", + request->rq_repbuf_len, request->rq_xid, + request->rq_reply_portal); + } + + /* add references on request for request_out_callback */ + ptlrpc_request_addref(request); + if (obd != NULL && obd->obd_svc_stats != NULL) + lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQACTIVE_CNTR, + atomic_read(&imp->imp_inflight)); + + OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND, request->rq_timeout + 5); + + request->rq_sent_ns = ktime_get_real(); + request->rq_sent = ktime_get_real_seconds(); + /* We give the server rq_timeout secs to process the req, and + * add the network latency for our local timeout. + */ + request->rq_deadline = request->rq_sent + request->rq_timeout + + ptlrpc_at_get_net_latency(request); + + DEBUG_REQ(D_INFO, request, "send flags=%x", + lustre_msg_get_flags(request->rq_reqmsg)); + rc = ptl_send_buf(&request->rq_req_md_h, + request->rq_reqbuf, request->rq_reqdata_len, + LNET_NOACK_REQ, &request->rq_req_cbid, + LNET_NID_ANY, + lnet_pid_to_pid4(&connection->c_peer), + request->rq_request_portal, + request->rq_xid, 0, &bulk_cookie); + if (likely(rc == 0)) + GOTO(out, rc); + + request->rq_req_unlinked = 1; + ptlrpc_req_finished(request); + if (noreply) + GOTO(out, rc); + + LNetMDUnlink(request->rq_reply_md_h); + + /* UNLINKED callback called synchronously */ + LASSERT(!request->rq_receiving_reply); + + cleanup_bulk: + /* We do sync unlink here as there was no real transfer here so + * the chance to have long unlink to sluggish net is smaller here. */ + ptlrpc_unregister_bulk(request, 0); + out: + if (rc == -ENOMEM) { + /* set rq_sent so that this request is treated + * as a delayed send in the upper layers */ + request->rq_sent = ktime_get_real_seconds(); + } + + if (request->rq_memalloc) + memalloc_noreclaim_restore(mpflag); + + return rc; +} +EXPORT_SYMBOL(ptl_send_rpc); + +/** + * Register request buffer descriptor for request receiving. 
+ */ +int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd) +{ + struct ptlrpc_service *service = rqbd->rqbd_svcpt->scp_service; + static struct lnet_processid match_id = { + .nid = LNET_ANY_NID, + .pid = LNET_PID_ANY + }; + int rc; + struct lnet_md md; + struct lnet_me *me; + + CDEBUG(D_NET, "%s: registering portal %d\n", service->srv_name, + service->srv_req_portal); + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_RQBD)) + return -ENOMEM; + + /* NB: CPT affinity service should use new LNet flag LNET_INS_LOCAL, + * which means buffer can only be attached on local CPT, and LND + * threads can find it by grabbing a local lock */ + me = LNetMEAttach(service->srv_req_portal, + &match_id, 0, ~0, LNET_UNLINK, + rqbd->rqbd_svcpt->scp_cpt >= 0 ? + LNET_INS_LOCAL : LNET_INS_AFTER); + if (IS_ERR(me)) { + CERROR("%s: LNetMEAttach failed: rc = %ld\n", + service->srv_name, PTR_ERR(me)); + return PTR_ERR(me); + } + + LASSERT(rqbd->rqbd_refcount == 0); + rqbd->rqbd_refcount = 1; + + md.start = rqbd->rqbd_buffer; + md.length = service->srv_buf_size; + md.max_size = service->srv_max_req_size; + md.threshold = LNET_MD_THRESH_INF; + md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT | LNET_MD_MAX_SIZE; + md.user_ptr = &rqbd->rqbd_cbid; + md.handler = ptlrpc_handler; + + rc = LNetMDAttach(me, &md, LNET_UNLINK, &rqbd->rqbd_md_h); + if (rc == 0) { + percpu_ref_get(&ptlrpc_pending); + return 0; + } + + CERROR("%s: LNetMDAttach failed: rc = %d\n", service->srv_name, rc); + LASSERT(rc == -ENOMEM); + rqbd->rqbd_refcount = 0; + + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nodemap_internal.h b/drivers/staging/lustrefsx/lustre/ptlrpc/nodemap_internal.h new file mode 100644 index 0000000000000..41c0bb05c5f84 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nodemap_internal.h @@ -0,0 +1,204 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (C) 2013, Trustees of Indiana University + * + * Copyright (c) 2013, 2017, Intel Corporation. 
+ *
+ * Author: Joshua Walgenbach
+ */
+
+#ifndef _NODEMAP_INTERNAL_H
+#define _NODEMAP_INTERNAL_H
+
+#include
+#include
+
+#define DEFAULT_NODEMAP "default"
+
+/* Default nobody uid, gid and projid values */
+#define NODEMAP_NOBODY_UID 99
+#define NODEMAP_NOBODY_GID 99
+#define NODEMAP_NOBODY_PROJID 99
+
+struct lprocfs_static_vars;
+
+/* nodemap root proc directory under fs/lustre */
+extern struct proc_dir_entry *proc_lustre_nodemap_root;
+/* flag if nodemap is active */
+extern bool nodemap_active;
+
+extern struct mutex active_config_lock;
+extern struct nodemap_config *active_config;
+
+struct lu_nid_range {
+	/* unique id set by mgs */
+	unsigned int rn_id;
+	/* lu_nodemap containing this range */
+	struct lu_nodemap *rn_nodemap;
+	/* list for nodemap */
+	struct list_head rn_list;
+	/* nid interval tree */
+	lnet_nid_t rn_start,
+		   rn_end,
+		   rn_subtree_last;
+	struct rb_node rn_rb;
+};
+
+struct lu_idmap {
+	/* uid/gid of client */
+	__u32 id_client;
+	/* uid/gid on filesystem */
+	__u32 id_fs;
+	/* tree mapping client ids to filesystem ids */
+	struct rb_node id_client_to_fs;
+	/* tree mapping filesystem to client */
+	struct rb_node id_fs_to_client;
+};
+
+/* first 4 bits of the nodemap_id are the index type */
+struct nodemap_key {
+	__u32 nk_nodemap_id;
+	union {
+		__u32 nk_range_id;
+		__u32 nk_id_client;
+		__u32 nk_unused;
+	};
+};
+
+enum nodemap_idx_type {
+	NODEMAP_EMPTY_IDX = 0,		/* index created with blank record */
+	NODEMAP_CLUSTER_IDX = 1,	/* a nodemap cluster of nodes */
+	NODEMAP_RANGE_IDX = 2,		/* nid range assigned to a nm cluster */
+	NODEMAP_UIDMAP_IDX = 3,		/* uid map assigned to a nm cluster */
+	NODEMAP_GIDMAP_IDX = 4,		/* gid map assigned to a nm cluster */
+	NODEMAP_PROJIDMAP_IDX = 5,	/* projid map assigned to nm cluster */
+	NODEMAP_GLOBAL_IDX = 15,	/* stores nodemap activation status */
+};
+
+#define NM_TYPE_MASK 0x0FFFFFFF
+#define NM_TYPE_SHIFT 28
+
+static inline enum nodemap_idx_type nm_idx_get_type(unsigned int id)
+{
+	return id >> NM_TYPE_SHIFT;
+}
+
+static inline __u32 nm_idx_set_type(unsigned int id, enum nodemap_idx_type t)
+{
+	return (id & NM_TYPE_MASK) | (t << NM_TYPE_SHIFT);
+}
+
+void nodemap_config_set_active(struct nodemap_config *config);
+struct lu_nodemap *nodemap_create(const char *name,
+				  struct nodemap_config *config,
+				  bool is_default);
+void nodemap_putref(struct lu_nodemap *nodemap);
+struct lu_nodemap *nodemap_lookup(const char *name);
+
+int nodemap_procfs_init(void);
+void nodemap_procfs_exit(void);
+int lprocfs_nodemap_register(struct lu_nodemap *nodemap,
+			     bool is_default_nodemap);
+void lprocfs_nodemap_remove(struct nodemap_pde *nodemap_pde);
+struct lu_nid_range *nodemap_range_find(lnet_nid_t start_nid,
+					lnet_nid_t end_nid);
+struct lu_nid_range *range_create(struct nodemap_range_tree *nm_range_tree,
+				  lnet_nid_t start_nid, lnet_nid_t end_nid,
+				  struct lu_nodemap *nodemap,
+				  unsigned int range_id);
+void range_destroy(struct lu_nid_range *range);
+int range_insert(struct nodemap_range_tree *nm_range_tree,
+		 struct lu_nid_range *data);
+void range_delete(struct nodemap_range_tree *nm_range_tree,
+		  struct lu_nid_range *data);
+struct lu_nid_range *range_search(struct nodemap_range_tree *nm_range_tree,
+				  lnet_nid_t nid);
+struct lu_nid_range *range_find(struct nodemap_range_tree *nm_range_tree,
+				lnet_nid_t start_nid, lnet_nid_t end_nid);
+int range_parse_nidstring(char *range_string, lnet_nid_t *start_nid,
+			  lnet_nid_t *end_nid);
+void range_init_tree(void);
+struct lu_idmap *idmap_create(__u32 client_id, __u32 fs_id);
+struct
lu_idmap *idmap_insert(enum nodemap_id_type id_type, + struct lu_idmap *idmap, + struct lu_nodemap *nodemap); +void idmap_delete(enum nodemap_id_type id_type, struct lu_idmap *idmap, + struct lu_nodemap *nodemap); +void idmap_delete_tree(struct lu_nodemap *nodemap); +struct lu_idmap *idmap_search(struct lu_nodemap *nodemap, + enum nodemap_tree_type, + enum nodemap_id_type id_type, + __u32 id); +int nm_member_add(struct lu_nodemap *nodemap, struct obd_export *exp); +void nm_member_del(struct lu_nodemap *nodemap, struct obd_export *exp); +void nm_member_delete_list(struct lu_nodemap *nodemap); +struct lu_nodemap *nodemap_classify_nid(lnet_nid_t nid); +void nm_member_reclassify_nodemap(struct lu_nodemap *nodemap); +void nm_member_revoke_locks(struct lu_nodemap *nodemap); +void nm_member_revoke_locks_always(struct lu_nodemap *nodemap); +void nm_member_revoke_all(void); + +int nodemap_add_idmap_helper(struct lu_nodemap *nodemap, + enum nodemap_id_type id_type, + const __u32 map[2]); +int nodemap_add_range_helper(struct nodemap_config *config, + struct lu_nodemap *nodemap, + const lnet_nid_t nid[2], + unsigned int range_id); + +struct rb_node *nm_rb_next_postorder(const struct rb_node *node); +struct rb_node *nm_rb_first_postorder(const struct rb_root *root); +void nodemap_getref(struct lu_nodemap *nodemap); +void nodemap_putref(struct lu_nodemap *nodemap); +int nm_hash_list_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, + void *nodemap_list_head); + +#define nm_rbtree_postorder_for_each_entry_safe(pos, n, \ + root, field) \ + for (pos = nm_rb_first_postorder(root) ? \ + rb_entry(nm_rb_first_postorder(root), typeof(*pos), \ + field) : NULL, \ + n = (pos && nm_rb_next_postorder(&pos->field)) ? \ + rb_entry(nm_rb_next_postorder(&pos->field), \ + typeof(*pos), field) : NULL; \ + pos != NULL; \ + pos = n, \ + n = (pos && nm_rb_next_postorder(&pos->field)) ? \ + rb_entry(nm_rb_next_postorder(&pos->field), \ + typeof(*pos), field) : NULL) + +int nodemap_idx_nodemap_add(const struct lu_nodemap *nodemap); +int nodemap_idx_nodemap_update(const struct lu_nodemap *nodemap); +int nodemap_idx_nodemap_del(const struct lu_nodemap *nodemap); +int nodemap_idx_idmap_add(const struct lu_nodemap *nodemap, + enum nodemap_id_type id_type, + const __u32 map[2]); +int nodemap_idx_idmap_del(const struct lu_nodemap *nodemap, + enum nodemap_id_type id_type, + const __u32 map[2]); +int nodemap_idx_range_add(const struct lu_nid_range *range, + const lnet_nid_t nid[2]); +int nodemap_idx_range_del(const struct lu_nid_range *range); +int nodemap_idx_nodemap_activate(bool value); +#endif /* _NODEMAP_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs.c new file mode 100644 index 0000000000000..ccfc4e96c813c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs.c @@ -0,0 +1,1786 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. 
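
nodemap_internal.h above packs the index type into the top four bits of the 32-bit nodemap id (NM_TYPE_SHIFT is 28). A stand-alone round-trip sketch using the same mask and shift, with NODEMAP_GIDMAP_IDX (4) as the example type:

#include <assert.h>
#include <stdint.h>

#define NM_TYPE_MASK	0x0FFFFFFF
#define NM_TYPE_SHIFT	28

static uint32_t idx_set_type(uint32_t id, uint32_t type)
{
	return (id & NM_TYPE_MASK) | (type << NM_TYPE_SHIFT);
}

static uint32_t idx_get_type(uint32_t id)
{
	return id >> NM_TYPE_SHIFT;
}

int main(void)
{
	uint32_t id = idx_set_type(1234, 4);	/* 4 == gid map index */

	assert(idx_get_type(id) == 4);		/* type in the top nibble */
	assert((id & NM_TYPE_MASK) == 1234);	/* id in the low 28 bits */
	return 0;
}
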
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2014, 2016, Intel Corporation.
+ *
+ * Copyright 2012 Xyratex Technology Limited
+ */
+/*
+ * lustre/ptlrpc/nrs.c
+ *
+ * Network Request Scheduler (NRS)
+ *
+ * Allows the handling of RPCs at servers to be reordered.
+ *
+ * Author: Liang Zhen
+ * Author: Nikitas Angelinas
+ */
+/**
+ * \addtogroup nrs
+ * @{
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+#include
+#include
+#include
+#include
+#include
+#include "ptlrpc_internal.h"
+
+/**
+ * NRS core object.
+ */
+struct nrs_core nrs_core;
+
+static int nrs_policy_init(struct ptlrpc_nrs_policy *policy)
+{
+	return policy->pol_desc->pd_ops->op_policy_init != NULL ?
+	       policy->pol_desc->pd_ops->op_policy_init(policy) : 0;
+}
+
+static void nrs_policy_fini(struct ptlrpc_nrs_policy *policy)
+{
+	LASSERT(policy->pol_ref == 0);
+	LASSERT(policy->pol_req_queued == 0);
+
+	if (policy->pol_desc->pd_ops->op_policy_fini != NULL)
+		policy->pol_desc->pd_ops->op_policy_fini(policy);
+}
+
+static int nrs_policy_ctl_locked(struct ptlrpc_nrs_policy *policy,
+				 enum ptlrpc_nrs_ctl opc, void *arg)
+{
+	/**
+	 * The policy may be stopped, but the lprocfs files and
+	 * ptlrpc_nrs_policy instances remain present until unregistration time.
+	 * Do not perform the ctl operation if the policy is stopped, as
+	 * policy->pol_private will be NULL in such a case.
+	 */
+	if (policy->pol_state == NRS_POL_STATE_STOPPED)
+		RETURN(-ENODEV);
+
+	RETURN(policy->pol_desc->pd_ops->op_policy_ctl != NULL ?
+	       policy->pol_desc->pd_ops->op_policy_ctl(policy, opc, arg) :
+	       -ENOSYS);
+}
+
+static void nrs_policy_stop0(struct ptlrpc_nrs_policy *policy)
+{
+	ENTRY;
+
+	if (policy->pol_desc->pd_ops->op_policy_stop != NULL)
+		policy->pol_desc->pd_ops->op_policy_stop(policy);
+
+	LASSERT(list_empty(&policy->pol_list_queued));
+	LASSERT(policy->pol_req_queued == 0 &&
+		policy->pol_req_started == 0);
+
+	policy->pol_private = NULL;
+
+	policy->pol_state = NRS_POL_STATE_STOPPED;
+
+	if (atomic_dec_and_test(&policy->pol_desc->pd_refs))
+		module_put(policy->pol_desc->pd_owner);
+
+	EXIT;
+}
+
+static int nrs_policy_stop_locked(struct ptlrpc_nrs_policy *policy)
+{
+	struct ptlrpc_nrs *nrs = policy->pol_nrs;
+	ENTRY;
+
+	if (nrs->nrs_policy_fallback == policy && !nrs->nrs_stopping)
+		RETURN(-EPERM);
+
+	if (policy->pol_state == NRS_POL_STATE_STARTING)
+		RETURN(-EAGAIN);
+
+	/* In progress or already stopped */
+	if (policy->pol_state != NRS_POL_STATE_STARTED)
+		RETURN(0);
+
+	policy->pol_state = NRS_POL_STATE_STOPPING;
+
+	/* Immediately make it invisible */
+	if (nrs->nrs_policy_primary == policy) {
+		nrs->nrs_policy_primary = NULL;
+
+	} else {
+		LASSERT(nrs->nrs_policy_fallback == policy);
+		nrs->nrs_policy_fallback = NULL;
+	}
+
+	/* I have the only refcount */
+	if (policy->pol_ref == 1)
+		nrs_policy_stop0(policy);
+
+	RETURN(0);
+}
+
+/**
+ * Transitions the \a nrs NRS head's primary policy to
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING and if the policy has no
+ * pending usage references, to ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED.
+ *
+ * \param[in] nrs the NRS head to carry out this operation on
+ */
+static void nrs_policy_stop_primary(struct ptlrpc_nrs *nrs)
+{
+	struct ptlrpc_nrs_policy *tmp = nrs->nrs_policy_primary;
+	ENTRY;
+
+	if (tmp == NULL) {
+		/**
+		 * XXX: This should really be RETURN_EXIT, but the latter does
+		 * not currently print anything out, and possibly should be
+		 * fixed to do so.
+		 */
+		EXIT;
+		return;
+	}
+
+	nrs->nrs_policy_primary = NULL;
+
+	LASSERT(tmp->pol_state == NRS_POL_STATE_STARTED);
+	tmp->pol_state = NRS_POL_STATE_STOPPING;
+
+	if (tmp->pol_ref == 0)
+		nrs_policy_stop0(tmp);
+	EXIT;
+}
+
+/**
+ * Transitions a policy across the ptlrpc_nrs_pol_state range of values, in
+ * response to an lprocfs command to start a policy.
+ *
+ * If a primary policy different from the current one is specified, this
+ * function will transition the new policy to the
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTING and then to
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED, and will then transition
+ * the old primary policy (if there is one) to
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING, and if there are no outstanding
+ * references on the policy to ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED.
+ *
+ * If the fallback policy is specified, this is taken to indicate an instruction
+ * to stop the current primary policy, without substituting it with another
+ * primary policy, so the primary policy (if any) is transitioned to
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING, and if there are no outstanding
+ * references on the policy to ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED. In
+ * this case, the fallback policy is only left active in the NRS head.
+ */
+static int nrs_policy_start_locked(struct ptlrpc_nrs_policy *policy, char *arg)
+{
+	struct ptlrpc_nrs *nrs = policy->pol_nrs;
+	int rc = 0;
+	ENTRY;
+
+	/**
+	 * Don't allow multiple policies to be starting concurrently; it is
+	 * too complex, and has no real benefit.
+	 */
+	if (nrs->nrs_policy_starting)
+		RETURN(-EAGAIN);
+
+	LASSERT(policy->pol_state != NRS_POL_STATE_STARTING);
+
+	if (policy->pol_state == NRS_POL_STATE_STOPPING)
+		RETURN(-EAGAIN);
+
+	if (policy->pol_flags & PTLRPC_NRS_FL_FALLBACK) {
+		/**
+		 * This is for cases in which the user sets the policy to the
+		 * fallback policy (currently fifo for all services); i.e. the
+		 * user is resetting the policy to the default; so we stop the
+		 * primary policy, if any.
+		 */
+		if (policy == nrs->nrs_policy_fallback) {
+			nrs_policy_stop_primary(nrs);
+			RETURN(0);
+		}
+
+		/**
+		 * If we reach here, we must be setting up the fallback policy
+		 * at service startup time, and only a single policy with the
+		 * nrs_policy_flags::PTLRPC_NRS_FL_FALLBACK flag set can
+		 * register with NRS core.
+		 */
+		LASSERT(nrs->nrs_policy_fallback == NULL);
+	} else {
+		/**
+		 * Shouldn't start primary policy if w/o fallback policy.
+		 */
+		if (nrs->nrs_policy_fallback == NULL)
+			RETURN(-EPERM);
+
+		if (policy->pol_state == NRS_POL_STATE_STARTED) {
+			/**
+			 * If the policy argument now is different from the
+			 * last time, stop the policy first and start it again
+			 * with the new argument.
+			 */
+			if ((arg != NULL) && (strlen(arg) >= NRS_POL_ARG_MAX))
+				return -EINVAL;
+
+			if ((arg == NULL && strlen(policy->pol_arg) == 0) ||
+			    (arg != NULL && strcmp(policy->pol_arg, arg) == 0))
+				RETURN(0);
+
+			rc = nrs_policy_stop_locked(policy);
+			if (rc)
+				RETURN(-EAGAIN);
+		}
+	}
+
+	/**
+	 * Increase the module usage count for policies registering from other
+	 * modules.
+	 */
+	if (atomic_inc_return(&policy->pol_desc->pd_refs) == 1 &&
+	    !try_module_get(policy->pol_desc->pd_owner)) {
+		atomic_dec(&policy->pol_desc->pd_refs);
+		CERROR("NRS: cannot get module for policy %s; is it alive?\n",
+		       policy->pol_desc->pd_name);
+		RETURN(-ENODEV);
+	}
+
+	/**
+	 * Serialize policy starting across the NRS head
+	 */
+	nrs->nrs_policy_starting = 1;
+
+	policy->pol_state = NRS_POL_STATE_STARTING;
+
+	if (policy->pol_desc->pd_ops->op_policy_start) {
+		spin_unlock(&nrs->nrs_lock);
+
+		rc = policy->pol_desc->pd_ops->op_policy_start(policy, arg);
+
+		spin_lock(&nrs->nrs_lock);
+		if (rc != 0) {
+			if (atomic_dec_and_test(&policy->pol_desc->pd_refs))
+				module_put(policy->pol_desc->pd_owner);
+
+			policy->pol_state = NRS_POL_STATE_STOPPED;
+			GOTO(out, rc);
+		}
+	}
+
+	if (arg != NULL) {
+		if (strlcpy(policy->pol_arg, arg, sizeof(policy->pol_arg)) >=
+		    sizeof(policy->pol_arg)) {
+			CERROR("NRS: arg '%s' is too long\n", arg);
+			GOTO(out, rc = -E2BIG);
+		}
+	} else {
+		policy->pol_arg[0] = '\0';
+	}
+
+	policy->pol_state = NRS_POL_STATE_STARTED;
+
+	if (policy->pol_flags & PTLRPC_NRS_FL_FALLBACK) {
+		/**
+		 * This path is only used at PTLRPC service setup time.
+		 */
+		nrs->nrs_policy_fallback = policy;
+	} else {
+		/*
+		 * Try to stop the current primary policy if there is one.
+		 */
+		nrs_policy_stop_primary(nrs);
+
+		/**
+		 * And set the newly-started policy as the primary one.
+		 */
+		nrs->nrs_policy_primary = policy;
+	}
+
+out:
+	nrs->nrs_policy_starting = 0;
+
+	RETURN(rc);
+}
+
+/**
+ * Increases the policy's usage reference count.
+ */
+static inline void nrs_policy_get_locked(struct ptlrpc_nrs_policy *policy)
+{
+	policy->pol_ref++;
+}
+
+/**
+ * Decreases the policy's usage reference count, and stops the policy if it
+ * was already stopping and has no more outstanding usage references (which
+ * indicates it has no more queued or started requests, and can be safely
+ * stopped).
+ */
+static void nrs_policy_put_locked(struct ptlrpc_nrs_policy *policy)
+{
+	LASSERT(policy->pol_ref > 0);
+
+	policy->pol_ref--;
+	if (unlikely(policy->pol_ref == 0 &&
+		     policy->pol_state == NRS_POL_STATE_STOPPING))
+		nrs_policy_stop0(policy);
+}
+
+static void nrs_policy_put(struct ptlrpc_nrs_policy *policy)
+{
+	spin_lock(&policy->pol_nrs->nrs_lock);
+	nrs_policy_put_locked(policy);
+	spin_unlock(&policy->pol_nrs->nrs_lock);
+}
+
+/**
+ * Find and return a policy by name.
+ */
+static struct ptlrpc_nrs_policy * nrs_policy_find_locked(struct ptlrpc_nrs *nrs,
+							 char *name)
+{
+	struct ptlrpc_nrs_policy *tmp;
+
+	list_for_each_entry(tmp, &nrs->nrs_policy_list, pol_list) {
+		if (strncmp(tmp->pol_desc->pd_name, name,
+			    NRS_POL_NAME_MAX) == 0) {
+			nrs_policy_get_locked(tmp);
+			return tmp;
+		}
+	}
+	return NULL;
+}
+
+/**
+ * Release references for the resource hierarchy moving upwards towards the
+ * policy instance resource.
+ */
+static void nrs_resource_put(struct ptlrpc_nrs_resource *res)
+{
+	struct ptlrpc_nrs_policy *policy = res->res_policy;
+
+	if (policy->pol_desc->pd_ops->op_res_put != NULL) {
+		struct ptlrpc_nrs_resource *parent;
+
+		for (; res != NULL; res = parent) {
+			parent = res->res_parent;
+			policy->pol_desc->pd_ops->op_res_put(policy, res);
+		}
+	}
+}
+
+/**
+ * Obtains references for each resource in the resource hierarchy for request
+ * \a nrq if it is to be handled by \a policy.
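
The reference/state interplay above is the core of the policy lifecycle: nrs_policy_put_locked() finalizes a policy only when the last usage reference is dropped while it is already STOPPING. A reduced stand-alone model of just that transition (structures simplified, locking omitted):

#include <assert.h>

enum pol_state { POL_STARTED, POL_STOPPING, POL_STOPPED };

struct policy {
	int		ref;
	enum pol_state	state;
};

static void policy_stop_final(struct policy *p)
{
	p->state = POL_STOPPED;	/* stands in for nrs_policy_stop0() */
}

static void policy_put_locked(struct policy *p)
{
	assert(p->ref > 0);
	p->ref--;
	if (p->ref == 0 && p->state == POL_STOPPING)
		policy_stop_final(p);
}

int main(void)
{
	struct policy p = { .ref = 2, .state = POL_STOPPING };

	policy_put_locked(&p);
	assert(p.state == POL_STOPPING);	/* one user still active */
	policy_put_locked(&p);
	assert(p.state == POL_STOPPED);		/* last ref finalized it */
	return 0;
}
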
+ * + * \param[in] policy the policy + * \param[in] nrq the request + * \param[in] moving_req denotes whether this is a call to the function by + * ldlm_lock_reorder_req(), in order to move \a nrq to + * the high-priority NRS head; we should not sleep when + * set. + * + * \retval NULL resource hierarchy references not obtained + * \retval valid-pointer the bottom level of the resource hierarchy + * + * \see ptlrpc_nrs_pol_ops::op_res_get() + */ +static +struct ptlrpc_nrs_resource * nrs_resource_get(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq, + bool moving_req) +{ + /** + * Set to NULL to traverse the resource hierarchy from the top. + */ + struct ptlrpc_nrs_resource *res = NULL; + struct ptlrpc_nrs_resource *tmp = NULL; + int rc; + + while (1) { + rc = policy->pol_desc->pd_ops->op_res_get(policy, nrq, res, + &tmp, moving_req); + if (rc < 0) { + if (res != NULL) + nrs_resource_put(res); + return NULL; + } + + LASSERT(tmp != NULL); + tmp->res_parent = res; + tmp->res_policy = policy; + res = tmp; + tmp = NULL; + /** + * Return once we have obtained a reference to the bottom level + * of the resource hierarchy. + */ + if (rc > 0) + return res; + } +} + +/** + * Obtains resources for the resource hierarchies and policy references for + * the fallback and current primary policy (if any), that will later be used + * to handle request \a nrq. + * + * \param[in] nrs the NRS head instance that will be handling request \a nrq. + * \param[in] nrq the request that is being handled. + * \param[out] resp the array where references to the resource hierarchy are + * stored. + * \param[in] moving_req is set when obtaining resources while moving a + * request from a policy on the regular NRS head to a + * policy on the HP NRS head (via + * ldlm_lock_reorder_req()). It signifies that + * allocations to get resources should be atomic; for + * a full explanation, see comment in + * ptlrpc_nrs_pol_ops::op_res_get(). + */ +static void nrs_resource_get_safe(struct ptlrpc_nrs *nrs, + struct ptlrpc_nrs_request *nrq, + struct ptlrpc_nrs_resource **resp, + bool moving_req) +{ + struct ptlrpc_nrs_policy *primary = NULL; + struct ptlrpc_nrs_policy *fallback = NULL; + + memset(resp, 0, sizeof(resp[0]) * NRS_RES_MAX); + + /** + * Obtain policy references. + */ + spin_lock(&nrs->nrs_lock); + + fallback = nrs->nrs_policy_fallback; + nrs_policy_get_locked(fallback); + + primary = nrs->nrs_policy_primary; + if (primary != NULL) + nrs_policy_get_locked(primary); + + spin_unlock(&nrs->nrs_lock); + + /** + * Obtain resource hierarchy references. + */ + resp[NRS_RES_FALLBACK] = nrs_resource_get(fallback, nrq, moving_req); + LASSERT(resp[NRS_RES_FALLBACK] != NULL); + + if (primary != NULL) { + resp[NRS_RES_PRIMARY] = nrs_resource_get(primary, nrq, + moving_req); + /** + * A primary policy may exist which may not wish to serve a + * particular request for different reasons; release the + * reference on the policy as it will not be used for this + * request. + */ + if (resp[NRS_RES_PRIMARY] == NULL) + nrs_policy_put(primary); + } +} + +/** + * Releases references to resource hierarchies and policies, because they are no + * longer required; used when request handling has been completed, or the + * request is moving to the high priority NRS head. 
+ *
+ * \param resp the resource hierarchy that is being released
+ *
+ * \see ptlrpc_nrs_req_hp_move()
+ * \see ptlrpc_nrs_req_finalize()
+ */
+static void nrs_resource_put_safe(struct ptlrpc_nrs_resource **resp)
+{
+	struct ptlrpc_nrs_policy *pols[NRS_RES_MAX];
+	struct ptlrpc_nrs *nrs = NULL;
+	int i;
+
+	for (i = 0; i < NRS_RES_MAX; i++) {
+		if (resp[i] != NULL) {
+			pols[i] = resp[i]->res_policy;
+			nrs_resource_put(resp[i]);
+			resp[i] = NULL;
+		} else {
+			pols[i] = NULL;
+		}
+	}
+
+	for (i = 0; i < NRS_RES_MAX; i++) {
+		if (pols[i] == NULL)
+			continue;
+
+		if (nrs == NULL) {
+			nrs = pols[i]->pol_nrs;
+			spin_lock(&nrs->nrs_lock);
+		}
+		nrs_policy_put_locked(pols[i]);
+	}
+
+	if (nrs != NULL)
+		spin_unlock(&nrs->nrs_lock);
+}
+
+/**
+ * Obtains an NRS request from \a policy for handling or examination; the
+ * request should be removed in the 'handling' case.
+ *
+ * Calling into this function implies we already know the policy has a request
+ * waiting to be handled.
+ *
+ * \param[in] policy the policy from which a request is to be obtained
+ * \param[in] peek   when set, signifies that we just want to examine the
+ *		     request, and not handle it, so the request is not removed
+ *		     from the policy.
+ * \param[in] force  when set, it will force a policy to return a request if it
+ *		     has one pending
+ *
+ * \retval the NRS request to be handled
+ */
+static inline
+struct ptlrpc_nrs_request *nrs_request_get(struct ptlrpc_nrs_policy *policy,
+					   bool peek, bool force)
+{
+	struct ptlrpc_nrs_request *nrq;
+
+	LASSERT(policy->pol_req_queued > 0);
+
+	nrq = policy->pol_desc->pd_ops->op_req_get(policy, peek, force);
+
+	LASSERT(ergo(nrq != NULL, nrs_request_policy(nrq) == policy));
+
+	return nrq;
+}
+
+/**
+ * Enqueues request \a nrq for later handling, via one of the policies for
+ * which resources were earlier obtained via nrs_resource_get_safe(). The
+ * function attempts to enqueue the request first on the primary policy
+ * (if any), since this is the preferred choice.
+ *
+ * \param nrq the request being enqueued
+ *
+ * \see nrs_resource_get_safe()
+ */
+static inline void nrs_request_enqueue(struct ptlrpc_nrs_request *nrq)
+{
+	struct ptlrpc_nrs_policy *policy;
+	int rc;
+	int i;
+
+	/**
+	 * Try in descending order, because the primary policy (if any) is
+	 * the preferred choice.
+	 */
+	for (i = NRS_RES_MAX - 1; i >= 0; i--) {
+		if (nrq->nr_res_ptrs[i] == NULL)
+			continue;
+
+		nrq->nr_res_idx = i;
+		policy = nrq->nr_res_ptrs[i]->res_policy;
+
+		rc = policy->pol_desc->pd_ops->op_req_enqueue(policy, nrq);
+		if (rc == 0) {
+			policy->pol_nrs->nrs_req_queued++;
+			policy->pol_req_queued++;
+			return;
+		}
+	}
+	/**
+	 * Should never get here, as at least the fallback policy's
+	 * ptlrpc_nrs_pol_ops::op_req_enqueue() implementation should always
+	 * succeed, and a fallback resource is always present.
+	 */
+	LBUG();
+}
+
+/**
+ * Called when a request has been handled
+ *
+ * \param[in] nrq the request that has been handled; can be used for
+ *		  job/resource control.
+ *
+ * \see ptlrpc_nrs_req_stop_nolock()
+ */
+static inline void nrs_request_stop(struct ptlrpc_nrs_request *nrq)
+{
+	struct ptlrpc_nrs_policy *policy = nrs_request_policy(nrq);
+
+	if (policy->pol_desc->pd_ops->op_req_stop)
+		policy->pol_desc->pd_ops->op_req_stop(policy, nrq);
+
+	LASSERT(policy->pol_nrs->nrs_req_started > 0);
+	LASSERT(policy->pol_req_started > 0);
+
+	policy->pol_nrs->nrs_req_started--;
+	policy->pol_req_started--;
+}
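+
+/*
+ * Illustrative sketch (not part of the original code): the expected
+ * lifecycle of a request through the NRS entry points defined later in
+ * this file, as seen from the PTLRPC server. Locking and error handling
+ * are elided, and svcpt/req are assumed to come from the caller.
+ *
+ *	ptlrpc_nrs_req_initialize(svcpt, req, false);	// get resources
+ *	ptlrpc_nrs_req_add(svcpt, req, false);		// enqueue on a policy
+ *	// ...later, under ptlrpc_service_part::scp_req_lock:
+ *	req = ptlrpc_nrs_req_get_nolock0(svcpt, false, false, false);
+ *	// ...handle the request, then:
+ *	ptlrpc_nrs_req_stop_nolock(req);	// update started counters
+ *	ptlrpc_nrs_req_finalize(req);		// release resources
+ */
+
+/**
+ * Handler for operations that can be carried out on policies.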
+ *
+ * Handles opcodes that are common to all policy types within NRS core, and
+ * passes any unknown opcodes to the policy-specific control function.
+ *
+ * \param[in]	  nrs  the NRS head this policy belongs to.
+ * \param[in]	  name the human-readable policy name; should be the same as
+ *		       ptlrpc_nrs_pol_desc::pd_name.
+ * \param[in]	  opc  the opcode of the operation being carried out.
+ * \param[in,out] arg  can be used to pass information in and out when
+ *		       carrying out an operation; usually data that is private
+ *		       to the policy at some level, or generic policy status
+ *		       information.
+ *
+ * \retval -ve error condition
+ * \retval   0 operation was carried out successfully
+ */
+static int nrs_policy_ctl(struct ptlrpc_nrs *nrs, char *name,
+			  enum ptlrpc_nrs_ctl opc, void *arg)
+{
+	struct ptlrpc_nrs_policy *policy;
+	int rc = 0;
+	ENTRY;
+
+	spin_lock(&nrs->nrs_lock);
+
+	policy = nrs_policy_find_locked(nrs, name);
+	if (policy == NULL)
+		GOTO(out, rc = -ENOENT);
+
+	if (policy->pol_state != NRS_POL_STATE_STARTED &&
+	    policy->pol_state != NRS_POL_STATE_STOPPED)
+		GOTO(out, rc = -EAGAIN);
+
+	switch (opc) {
+	/**
+	 * Unknown opcode, pass it down to the policy-specific control
+	 * function for handling.
+	 */
+	default:
+		rc = nrs_policy_ctl_locked(policy, opc, arg);
+		break;
+
+	/**
+	 * Start \e policy
+	 */
+	case PTLRPC_NRS_CTL_START:
+		rc = nrs_policy_start_locked(policy, arg);
+		break;
+	}
+out:
+	if (policy != NULL)
+		nrs_policy_put_locked(policy);
+
+	spin_unlock(&nrs->nrs_lock);
+
+	RETURN(rc);
+}
+
+/**
+ * Unregisters a policy by name.
+ *
+ * \param[in] nrs  the NRS head this policy belongs to.
+ * \param[in] name the human-readable policy name; should be the same as
+ *		   ptlrpc_nrs_pol_desc::pd_name
+ *
+ * \retval -ve error
+ * \retval   0 success
+ */
+static int nrs_policy_unregister(struct ptlrpc_nrs *nrs, char *name)
+{
+	struct ptlrpc_nrs_policy *policy = NULL;
+	ENTRY;
+
+	spin_lock(&nrs->nrs_lock);
+
+	policy = nrs_policy_find_locked(nrs, name);
+	if (policy == NULL) {
+		spin_unlock(&nrs->nrs_lock);
+
+		CERROR("Can't find NRS policy %s\n", name);
+		RETURN(-ENOENT);
+	}
+
+	if (policy->pol_ref > 1) {
+		CERROR("Policy %s is busy with %d references\n", name,
+		       (int)policy->pol_ref);
+		nrs_policy_put_locked(policy);
+
+		spin_unlock(&nrs->nrs_lock);
+		RETURN(-EBUSY);
+	}
+
+	LASSERT(policy->pol_req_queued == 0);
+	LASSERT(policy->pol_req_started == 0);
+
+	if (policy->pol_state != NRS_POL_STATE_STOPPED) {
+		nrs_policy_stop_locked(policy);
+		LASSERT(policy->pol_state == NRS_POL_STATE_STOPPED);
+	}
+
+	list_del(&policy->pol_list);
+	nrs->nrs_num_pols--;
+
+	nrs_policy_put_locked(policy);
+
+	spin_unlock(&nrs->nrs_lock);
+
+	nrs_policy_fini(policy);
+
+	LASSERT(policy->pol_private == NULL);
+	OBD_FREE_PTR(policy);
+
+	RETURN(0);
+}
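+
+/*
+ * Illustrative sketch (not part of the original code): an lprocfs handler
+ * that wants to start the CRR-N policy on both NRS heads of a service
+ * would go through ptlrpc_nrs_policy_control(), defined later in this
+ * file, which calls nrs_policy_ctl() on each service partition; svc is
+ * assumed to come from the caller.
+ *
+ *	rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_BOTH, "crrn",
+ *				       PTLRPC_NRS_CTL_START, false, NULL);
+ */
+
+/**
+ * Register a policy from the policy descriptor \a desc with NRS head \a nrs.
+ *
+ * \param[in] nrs  the NRS head on which the policy will be registered.
+ * \param[in] desc the policy descriptor from which the information will be
+ *		   obtained to register the policy.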
+ * + * \retval -ve error + * \retval 0 success + */ +static int nrs_policy_register(struct ptlrpc_nrs *nrs, + struct ptlrpc_nrs_pol_desc *desc) +{ + struct ptlrpc_nrs_policy *policy; + struct ptlrpc_nrs_policy *tmp; + struct ptlrpc_service_part *svcpt = nrs->nrs_svcpt; + int rc; + ENTRY; + + LASSERT(svcpt != NULL); + LASSERT(desc->pd_ops != NULL); + LASSERT(desc->pd_ops->op_res_get != NULL); + LASSERT(desc->pd_ops->op_req_get != NULL); + LASSERT(desc->pd_ops->op_req_enqueue != NULL); + LASSERT(desc->pd_ops->op_req_dequeue != NULL); + LASSERT(desc->pd_compat != NULL); + + OBD_CPT_ALLOC_GFP(policy, svcpt->scp_service->srv_cptable, + svcpt->scp_cpt, sizeof(*policy), GFP_NOFS); + if (policy == NULL) + RETURN(-ENOMEM); + + policy->pol_nrs = nrs; + policy->pol_desc = desc; + policy->pol_state = NRS_POL_STATE_STOPPED; + policy->pol_flags = desc->pd_flags; + + INIT_LIST_HEAD(&policy->pol_list); + INIT_LIST_HEAD(&policy->pol_list_queued); + + rc = nrs_policy_init(policy); + if (rc != 0) { + OBD_FREE_PTR(policy); + RETURN(rc); + } + + spin_lock(&nrs->nrs_lock); + + tmp = nrs_policy_find_locked(nrs, policy->pol_desc->pd_name); + if (tmp != NULL) { + CERROR("NRS policy %s has been registered, can't register it " + "for %s\n", policy->pol_desc->pd_name, + svcpt->scp_service->srv_name); + nrs_policy_put_locked(tmp); + + spin_unlock(&nrs->nrs_lock); + nrs_policy_fini(policy); + OBD_FREE_PTR(policy); + + RETURN(-EEXIST); + } + + list_add_tail(&policy->pol_list, &nrs->nrs_policy_list); + nrs->nrs_num_pols++; + + if (policy->pol_flags & PTLRPC_NRS_FL_REG_START) + rc = nrs_policy_start_locked(policy, NULL); + + spin_unlock(&nrs->nrs_lock); + + if (rc != 0) + (void) nrs_policy_unregister(nrs, policy->pol_desc->pd_name); + + RETURN(rc); +} + +/** + * Enqueue request \a req using one of the policies its resources are referring + * to. + * + * \param[in] req the request to enqueue. + */ +static void ptlrpc_nrs_req_add_nolock(struct ptlrpc_request *req) +{ + struct ptlrpc_nrs_policy *policy; + + LASSERT(req->rq_nrq.nr_initialized); + LASSERT(!req->rq_nrq.nr_enqueued); + + nrs_request_enqueue(&req->rq_nrq); + req->rq_nrq.nr_enqueued = 1; + + policy = nrs_request_policy(&req->rq_nrq); + /** + * Add the policy to the NRS head's list of policies with enqueued + * requests, if it has not been added there. + */ + if (unlikely(list_empty(&policy->pol_list_queued))) + list_add_tail(&policy->pol_list_queued, + &policy->pol_nrs->nrs_policy_queued); +} + +/** + * Enqueue a request on the high priority NRS head. + * + * \param req the request to enqueue. + */ +static void ptlrpc_nrs_hpreq_add_nolock(struct ptlrpc_request *req) +{ + int opc = lustre_msg_get_opc(req->rq_reqmsg); + ENTRY; + + spin_lock(&req->rq_lock); + req->rq_hp = 1; + ptlrpc_nrs_req_add_nolock(req); + if (opc != OBD_PING) + DEBUG_REQ(D_NET, req, "high priority req"); + spin_unlock(&req->rq_lock); + EXIT; +} + +/** + * Returns a boolean predicate indicating whether the policy described by + * \a desc is adequate for use with service \a svc. + * + * \param[in] svc the service + * \param[in] desc the policy descriptor + * + * \retval false the policy is not compatible with the service + * \retval true the policy is compatible with the service + */ +static inline bool nrs_policy_compatible(const struct ptlrpc_service *svc, + const struct ptlrpc_nrs_pol_desc *desc) +{ + return desc->pd_compat(svc, desc); +} + +/** + * Registers all compatible policies in nrs_core.nrs_policies, for NRS head + * \a nrs. 
+ *
+ * \param[in] nrs the NRS head
+ *
+ * \retval -ve error
+ * \retval   0 success
+ *
+ * \pre mutex_is_locked(&nrs_core.nrs_mutex)
+ *
+ * \see ptlrpc_service_nrs_setup()
+ */
+static int nrs_register_policies_locked(struct ptlrpc_nrs *nrs)
+{
+	struct ptlrpc_nrs_pol_desc *desc;
+	/* for convenience */
+	struct ptlrpc_service_part *svcpt = nrs->nrs_svcpt;
+	struct ptlrpc_service *svc = svcpt->scp_service;
+	int rc = -EINVAL;
+	ENTRY;
+
+	LASSERT(mutex_is_locked(&nrs_core.nrs_mutex));
+
+	list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) {
+		if (nrs_policy_compatible(svc, desc)) {
+			rc = nrs_policy_register(nrs, desc);
+			if (rc != 0) {
+				CERROR("Failed to register NRS policy %s for "
+				       "partition %d of service %s: %d\n",
+				       desc->pd_name, svcpt->scp_cpt,
+				       svc->srv_name, rc);
+				/**
+				 * Fail registration if any of the policies'
+				 * registration fails.
+				 */
+				break;
+			}
+		}
+	}
+
+	RETURN(rc);
+}
+
+/**
+ * Initializes NRS head \a nrs of service partition \a svcpt, and registers all
+ * compatible policies in NRS core, with the NRS head.
+ *
+ * \param[in] nrs   the NRS head
+ * \param[in] svcpt the PTLRPC service partition to setup
+ *
+ * \retval -ve error
+ * \retval   0 success
+ *
+ * \pre mutex_is_locked(&nrs_core.nrs_mutex)
+ */
+static int nrs_svcpt_setup_locked0(struct ptlrpc_nrs *nrs,
+				   struct ptlrpc_service_part *svcpt)
+{
+	int rc;
+	enum ptlrpc_nrs_queue_type queue;
+
+	LASSERT(mutex_is_locked(&nrs_core.nrs_mutex));
+
+	if (nrs == &svcpt->scp_nrs_reg)
+		queue = PTLRPC_NRS_QUEUE_REG;
+	else if (nrs == svcpt->scp_nrs_hp)
+		queue = PTLRPC_NRS_QUEUE_HP;
+	else
+		LBUG();
+
+	nrs->nrs_svcpt = svcpt;
+	nrs->nrs_queue_type = queue;
+	spin_lock_init(&nrs->nrs_lock);
+	INIT_LIST_HEAD(&nrs->nrs_policy_list);
+	INIT_LIST_HEAD(&nrs->nrs_policy_queued);
+	nrs->nrs_throttling = 0;
+
+	rc = nrs_register_policies_locked(nrs);
+
+	RETURN(rc);
+}
+
+/**
+ * Allocates a regular and optionally a high-priority NRS head (if the service
+ * handles high-priority RPCs), and then registers all available compatible
+ * policies on those NRS heads.
+ *
+ * \param[in,out] svcpt the PTLRPC service partition to setup
+ *
+ * \retval -ve error
+ * \retval   0 success
+ *
+ * \pre mutex_is_locked(&nrs_core.nrs_mutex)
+ */
+static int nrs_svcpt_setup_locked(struct ptlrpc_service_part *svcpt)
+{
+	struct ptlrpc_nrs *nrs;
+	int rc;
+	ENTRY;
+
+	LASSERT(mutex_is_locked(&nrs_core.nrs_mutex));
+
+	/**
+	 * Initialize the regular NRS head.
+	 */
+	nrs = nrs_svcpt2nrs(svcpt, false);
+	rc = nrs_svcpt_setup_locked0(nrs, svcpt);
+	if (rc < 0)
+		GOTO(out, rc);
+
+	/**
+	 * Optionally allocate a high-priority NRS head.
+	 */
+	if (svcpt->scp_service->srv_ops.so_hpreq_handler == NULL)
+		GOTO(out, rc);
+
+	OBD_CPT_ALLOC_PTR(svcpt->scp_nrs_hp,
+			  svcpt->scp_service->srv_cptable,
+			  svcpt->scp_cpt);
+	if (svcpt->scp_nrs_hp == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	nrs = nrs_svcpt2nrs(svcpt, true);
+	rc = nrs_svcpt_setup_locked0(nrs, svcpt);
+
+out:
+	RETURN(rc);
+}
+
+/**
+ * Unregisters all policies on all available NRS heads in a service partition;
+ * called at PTLRPC service unregistration time.
+ *
+ * \param[in] svcpt the PTLRPC service partition
+ *
+ * \pre mutex_is_locked(&nrs_core.nrs_mutex)
+ */
+static void nrs_svcpt_cleanup_locked(struct ptlrpc_service_part *svcpt)
+{
+	struct ptlrpc_nrs *nrs;
+	struct ptlrpc_nrs_policy *policy;
+	struct ptlrpc_nrs_policy *tmp;
+	int rc;
+	bool hp = false;
+	ENTRY;
+
+	LASSERT(mutex_is_locked(&nrs_core.nrs_mutex));
+
+again:
+	/* scp_nrs_hp could be NULL due to a memory allocation failure. */
+	nrs = hp ? svcpt->scp_nrs_hp : &svcpt->scp_nrs_reg;
+	/* Check nrs_svcpt to see whether the NRS head has been initialized. */
+	if (!nrs || !nrs->nrs_svcpt) {
+		EXIT;
+		return;
+	}
+	nrs->nrs_stopping = 1;
+
+	list_for_each_entry_safe(policy, tmp, &nrs->nrs_policy_list,
+				 pol_list) {
+		rc = nrs_policy_unregister(nrs, policy->pol_desc->pd_name);
+		LASSERT(rc == 0);
+	}
+
+	/**
+	 * If the service partition has an HP NRS head, clean that up as well.
+	 */
+	if (!hp && nrs_svcpt_has_hp(svcpt)) {
+		hp = true;
+		goto again;
+	}
+
+	if (hp)
+		OBD_FREE_PTR(nrs);
+
+	EXIT;
+}
+
+/**
+ * Returns the descriptor for a policy as identified by \a name.
+ *
+ * \param[in] name the policy name
+ *
+ * \retval the policy descriptor
+ * \retval NULL no policy descriptor with that name was found
+ */
+static struct ptlrpc_nrs_pol_desc *nrs_policy_find_desc_locked(const char *name)
+{
+	struct ptlrpc_nrs_pol_desc *tmp;
+	ENTRY;
+
+	list_for_each_entry(tmp, &nrs_core.nrs_policies, pd_list) {
+		if (strncmp(tmp->pd_name, name, NRS_POL_NAME_MAX) == 0)
+			RETURN(tmp);
+	}
+	RETURN(NULL);
+}
+
+/**
+ * Removes the policy from all supported NRS heads of all partitions of all
+ * PTLRPC services.
+ *
+ * \param[in] desc the policy descriptor to unregister
+ *
+ * \retval -ve error
+ * \retval   0 successfully unregistered policy on all supported NRS heads
+ *
+ * \pre mutex_is_locked(&nrs_core.nrs_mutex)
+ * \pre mutex_is_locked(&ptlrpc_all_services_mutex)
+ */
+static int nrs_policy_unregister_locked(struct ptlrpc_nrs_pol_desc *desc)
+{
+	struct ptlrpc_nrs *nrs;
+	struct ptlrpc_service *svc;
+	struct ptlrpc_service_part *svcpt;
+	int i;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(mutex_is_locked(&nrs_core.nrs_mutex));
+	LASSERT(mutex_is_locked(&ptlrpc_all_services_mutex));
+
+	list_for_each_entry(svc, &ptlrpc_all_services, srv_list) {
+
+		if (!nrs_policy_compatible(svc, desc) ||
+		    unlikely(svc->srv_is_stopping))
+			continue;
+
+		ptlrpc_service_for_each_part(svcpt, i, svc) {
+			bool hp = false;
+
+again:
+			nrs = nrs_svcpt2nrs(svcpt, hp);
+			rc = nrs_policy_unregister(nrs, desc->pd_name);
+			/**
+			 * Ignore -ENOENT as the policy may not have registered
+			 * successfully on all service partitions.
+			 */
+			if (rc == -ENOENT) {
+				rc = 0;
+			} else if (rc != 0) {
+				CERROR("Failed to unregister NRS policy %s for "
+				       "partition %d of service %s: %d\n",
+				       desc->pd_name, svcpt->scp_cpt,
+				       svcpt->scp_service->srv_name, rc);
+				RETURN(rc);
+			}
+
+			if (!hp && nrs_svc_has_hp(svc)) {
+				hp = true;
+				goto again;
+			}
+		}
+
+		if (desc->pd_ops->op_lprocfs_fini != NULL)
+			desc->pd_ops->op_lprocfs_fini(svc);
+	}
+
+	RETURN(rc);
+}
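+
+/*
+ * Illustrative sketch (not part of the original code): a hypothetical
+ * external policy module would fill in a ptlrpc_nrs_pol_conf and pass it
+ * to the registration entry point below from its init function; the "xyz"
+ * name and nrs_xyz_ops are placeholders for the sake of the example
+ * (compare nrs_conf_crrn at the bottom of nrs_crr.c).
+ *
+ *	static struct ptlrpc_nrs_pol_conf nrs_conf_xyz = {
+ *		.nc_name	= "xyz",
+ *		.nc_ops		= &nrs_xyz_ops,
+ *		.nc_compat	= nrs_policy_compat_all,
+ *		.nc_flags	= PTLRPC_NRS_FL_REG_EXTERN,
+ *		.nc_owner	= THIS_MODULE,
+ *	};
+ *
+ *	rc = ptlrpc_nrs_policy_register(&nrs_conf_xyz);
+ */
+
+/**
+ * Registers a new policy with NRS core.
+ *
+ * The function will only succeed if policy registration with all compatible
+ * service partitions (if any) is successful.
+ *
+ * N.B. This function should be called either at ptlrpc module initialization
+ *	time when registering a policy that ships with NRS core, or in a
+ *	module's init() function for policies registering from other modules.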
+ *
+ * \param[in] conf configuration information for the new policy to register
+ *
+ * \retval -ve error
+ * \retval   0 success
+ */
+static int ptlrpc_nrs_policy_register(struct ptlrpc_nrs_pol_conf *conf)
+{
+	struct ptlrpc_service *svc;
+	struct ptlrpc_nrs_pol_desc *desc;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(conf != NULL);
+	LASSERT(conf->nc_ops != NULL);
+	LASSERT(conf->nc_compat != NULL);
+	LASSERT(ergo(conf->nc_compat == nrs_policy_compat_one,
+		     conf->nc_compat_svc_name != NULL));
+	LASSERT(ergo((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) != 0,
+		     conf->nc_owner != NULL));
+
+	conf->nc_name[NRS_POL_NAME_MAX - 1] = '\0';
+
+	/**
+	 * External policies are not allowed to start immediately upon
+	 * registration, as there is a relatively higher chance that their
+	 * registration might fail. In such a case, some policy instances may
+	 * already have requests queued when unregistration needs to happen as
+	 * part of cleanup; since there is currently no way to drain requests
+	 * from a policy unless the service is unregistering, we just disallow
+	 * this.
+	 */
+	if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) &&
+	    (conf->nc_flags & (PTLRPC_NRS_FL_FALLBACK |
+			       PTLRPC_NRS_FL_REG_START))) {
+		CERROR("NRS: failing to register policy %s. Please check "
+		       "policy flags; external policies cannot act as fallback "
+		       "policies, or be started immediately upon registration "
+		       "without interaction with lprocfs\n", conf->nc_name);
+		RETURN(-EINVAL);
+	}
+
+	mutex_lock(&nrs_core.nrs_mutex);
+
+	if (nrs_policy_find_desc_locked(conf->nc_name) != NULL) {
+		CERROR("NRS: failing to register policy %s which has already "
+		       "been registered with NRS core!\n",
+		       conf->nc_name);
+		GOTO(fail, rc = -EEXIST);
+	}
+
+	OBD_ALLOC_PTR(desc);
+	if (desc == NULL)
+		GOTO(fail, rc = -ENOMEM);
+
+	if (strlcpy(desc->pd_name, conf->nc_name, sizeof(desc->pd_name)) >=
+	    sizeof(desc->pd_name)) {
+		OBD_FREE_PTR(desc);
+		GOTO(fail, rc = -E2BIG);
+	}
+	desc->pd_ops = conf->nc_ops;
+	desc->pd_compat = conf->nc_compat;
+	desc->pd_compat_svc_name = conf->nc_compat_svc_name;
+	if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) != 0)
+		desc->pd_owner = conf->nc_owner;
+	desc->pd_flags = conf->nc_flags;
+	atomic_set(&desc->pd_refs, 0);
+
+	/**
+	 * For policies that are held in the same module as NRS (currently
+	 * ptlrpc), do not register the policy with all compatible services,
+	 * as the services will not have started at this point, since we are
+	 * calling from ptlrpc module initialization code. In such cases each
+	 * service will register all compatible policies later, via
+	 * ptlrpc_service_nrs_setup().
+	 */
+	if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) == 0)
+		goto internal;
+
+	/**
+	 * Register the new policy on all compatible services
+	 */
+	mutex_lock(&ptlrpc_all_services_mutex);
+
+	list_for_each_entry(svc, &ptlrpc_all_services, srv_list) {
+		struct ptlrpc_service_part *svcpt;
+		int i;
+		int rc2;
+
+		if (!nrs_policy_compatible(svc, desc) ||
+		    unlikely(svc->srv_is_stopping))
+			continue;
+
+		ptlrpc_service_for_each_part(svcpt, i, svc) {
+			struct ptlrpc_nrs *nrs;
+			bool hp = false;
+again:
+			nrs = nrs_svcpt2nrs(svcpt, hp);
+			rc = nrs_policy_register(nrs, desc);
+			if (rc != 0) {
+				CERROR("Failed to register NRS policy %s for "
+				       "partition %d of service %s: %d\n",
+				       desc->pd_name, svcpt->scp_cpt,
+				       svcpt->scp_service->srv_name, rc);
+
+				rc2 = nrs_policy_unregister_locked(desc);
+				/**
+				 * Should not fail at this point
+				 */
+				LASSERT(rc2 == 0);
+				mutex_unlock(&ptlrpc_all_services_mutex);
+				OBD_FREE_PTR(desc);
+				GOTO(fail, rc);
+			}
+
+			if (!hp && nrs_svc_has_hp(svc)) {
+				hp = true;
+				goto again;
+			}
+		}
+
+		/**
+		 * No need to take a reference to other modules here, as we
+		 * will be calling from the module's init() function.
+		 */
+		if (desc->pd_ops->op_lprocfs_init != NULL) {
+			rc = desc->pd_ops->op_lprocfs_init(svc);
+			if (rc != 0) {
+				rc2 = nrs_policy_unregister_locked(desc);
+				/**
+				 * Should not fail at this point
+				 */
+				LASSERT(rc2 == 0);
+				mutex_unlock(&ptlrpc_all_services_mutex);
+				OBD_FREE_PTR(desc);
+				GOTO(fail, rc);
+			}
+		}
+	}
+
+	mutex_unlock(&ptlrpc_all_services_mutex);
+internal:
+	list_add_tail(&desc->pd_list, &nrs_core.nrs_policies);
+fail:
+	mutex_unlock(&nrs_core.nrs_mutex);
+
+	RETURN(rc);
+}
+
+/**
+ * Setup NRS heads on all service partitions of service \a svc, and register
+ * all compatible policies on those NRS heads.
+ *
+ * To be called from within ptlrpc_register_service().
+ *
+ * \param[in] svc the service to setup
+ *
+ * \retval -ve error, the calling logic should eventually call
+ *	       ptlrpc_service_nrs_cleanup() to undo any work performed
+ *	       by this function.
+ *
+ * \see ptlrpc_register_service()
+ * \see ptlrpc_service_nrs_cleanup()
+ */
+int ptlrpc_service_nrs_setup(struct ptlrpc_service *svc)
+{
+	struct ptlrpc_service_part *svcpt;
+	const struct ptlrpc_nrs_pol_desc *desc;
+	int i;
+	int rc = 0;
+
+	mutex_lock(&nrs_core.nrs_mutex);
+
+	/**
+	 * Initialize NRS heads on all service CPTs.
+	 */
+	ptlrpc_service_for_each_part(svcpt, i, svc) {
+		rc = nrs_svcpt_setup_locked(svcpt);
+		if (rc != 0)
+			GOTO(failed, rc);
+	}
+
+	/**
+	 * Set up lprocfs interfaces for all supported policies for the
+	 * service.
+	 */
+	list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) {
+		if (!nrs_policy_compatible(svc, desc))
+			continue;
+
+		if (desc->pd_ops->op_lprocfs_init != NULL) {
+			rc = desc->pd_ops->op_lprocfs_init(svc);
+			if (rc != 0)
+				GOTO(failed, rc);
+		}
+	}
+
+failed:
+	mutex_unlock(&nrs_core.nrs_mutex);
+
+	RETURN(rc);
+}
+
+/**
+ * Unregisters all policies on all service partitions of service \a svc.
+ *
+ * \param[in] svc the PTLRPC service to unregister
+ */
+void ptlrpc_service_nrs_cleanup(struct ptlrpc_service *svc)
+{
+	struct ptlrpc_service_part *svcpt;
+	const struct ptlrpc_nrs_pol_desc *desc;
+	int i;
+
+	mutex_lock(&nrs_core.nrs_mutex);
+
+	/**
+	 * Clean up NRS heads on all service partitions
+	 */
+	ptlrpc_service_for_each_part(svcpt, i, svc)
+		nrs_svcpt_cleanup_locked(svcpt);
+
+	/**
+	 * Clean up lprocfs interfaces for all supported policies for the
+	 * service.
+ */ + list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) { + if (!nrs_policy_compatible(svc, desc)) + continue; + + if (desc->pd_ops->op_lprocfs_fini != NULL) + desc->pd_ops->op_lprocfs_fini(svc); + } + + mutex_unlock(&nrs_core.nrs_mutex); +} + +/** + * Obtains NRS head resources for request \a req. + * + * These could be either on the regular or HP NRS head of \a svcpt; resources + * taken on the regular head can later be swapped for HP head resources by + * ldlm_lock_reorder_req(). + * + * \param[in] svcpt the service partition + * \param[in] req the request + * \param[in] hp which NRS head of \a svcpt to use + */ +void ptlrpc_nrs_req_initialize(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req, bool hp) +{ + struct ptlrpc_nrs *nrs = nrs_svcpt2nrs(svcpt, hp); + + memset(&req->rq_nrq, 0, sizeof(req->rq_nrq)); + nrs_resource_get_safe(nrs, &req->rq_nrq, req->rq_nrq.nr_res_ptrs, + false); + + /** + * It is fine to access \e nr_initialized without locking as there is + * no contention at this early stage. + */ + req->rq_nrq.nr_initialized = 1; +} + +/** + * Releases resources for a request; is called after the request has been + * handled. + * + * \param[in] req the request + * + * \see ptlrpc_server_finish_request() + */ +void ptlrpc_nrs_req_finalize(struct ptlrpc_request *req) +{ + if (req->rq_nrq.nr_initialized) { + nrs_resource_put_safe(req->rq_nrq.nr_res_ptrs); + /* no protection on bit nr_initialized because no + * contention at this late stage */ + req->rq_nrq.nr_finalized = 1; + } +} + +void ptlrpc_nrs_req_stop_nolock(struct ptlrpc_request *req) +{ + if (req->rq_nrq.nr_started) + nrs_request_stop(&req->rq_nrq); +} + +/** + * Enqueues request \a req on either the regular or high-priority NRS head + * of service partition \a svcpt. + * + * \param[in] svcpt the service partition + * \param[in] req the request to be enqueued + * \param[in] hp whether to enqueue the request on the regular or + * high-priority NRS head. + */ +void ptlrpc_nrs_req_add(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req, bool hp) +{ + spin_lock(&svcpt->scp_req_lock); + + if (hp) + ptlrpc_nrs_hpreq_add_nolock(req); + else + ptlrpc_nrs_req_add_nolock(req); + + spin_unlock(&svcpt->scp_req_lock); +} + +static void nrs_request_removed(struct ptlrpc_nrs_policy *policy) +{ + LASSERT(policy->pol_nrs->nrs_req_queued > 0); + LASSERT(policy->pol_req_queued > 0); + + policy->pol_nrs->nrs_req_queued--; + policy->pol_req_queued--; + + /** + * If the policy has no more requests queued, remove it from + * ptlrpc_nrs::nrs_policy_queued. + */ + if (unlikely(policy->pol_req_queued == 0)) { + list_del_init(&policy->pol_list_queued); + + /** + * If there are other policies with queued requests, move the + * current policy to the end so that we can round robin over + * all policies and drain the requests. + */ + } else if (policy->pol_req_queued != policy->pol_nrs->nrs_req_queued) { + LASSERT(policy->pol_req_queued < + policy->pol_nrs->nrs_req_queued); + + list_move_tail(&policy->pol_list_queued, + &policy->pol_nrs->nrs_policy_queued); + } +} + +/** + * Obtains a request for handling from an NRS head of service partition + * \a svcpt. + * + * \param[in] svcpt the service partition + * \param[in] hp whether to obtain a request from the regular or + * high-priority NRS head. + * \param[in] peek when set, signifies that we just want to examine the + * request, and not handle it, so the request is not removed + * from the policy. 
+ * \param[in] force when set, it will force a policy to return a request if it
+ *		    has one pending
+ *
+ * \retval the request to be handled
+ * \retval NULL the head has no requests to serve
+ */
+struct ptlrpc_request *
+ptlrpc_nrs_req_get_nolock0(struct ptlrpc_service_part *svcpt, bool hp,
+			   bool peek, bool force)
+{
+	struct ptlrpc_nrs *nrs = nrs_svcpt2nrs(svcpt, hp);
+	struct ptlrpc_nrs_policy *policy;
+	struct ptlrpc_nrs_request *nrq;
+
+	/**
+	 * Always try to drain requests from all NRS policies even if they are
+	 * inactive, because the user can change policy status at runtime.
+	 */
+	list_for_each_entry(policy, &nrs->nrs_policy_queued,
+			    pol_list_queued) {
+		nrq = nrs_request_get(policy, peek, force);
+		if (nrq != NULL) {
+			if (likely(!peek)) {
+				nrq->nr_started = 1;
+
+				policy->pol_req_started++;
+				policy->pol_nrs->nrs_req_started++;
+
+				nrs_request_removed(policy);
+			}
+
+			return container_of(nrq, struct ptlrpc_request, rq_nrq);
+		}
+	}
+
+	return NULL;
+}
+
+/**
+ * Dequeues request \a req from the policy it has been enqueued on.
+ *
+ * \param[in] req the request
+ */
+void ptlrpc_nrs_req_del_nolock(struct ptlrpc_request *req)
+{
+	struct ptlrpc_nrs_policy *policy = nrs_request_policy(&req->rq_nrq);
+
+	policy->pol_desc->pd_ops->op_req_dequeue(policy, &req->rq_nrq);
+
+	req->rq_nrq.nr_enqueued = 0;
+
+	nrs_request_removed(policy);
+}
+
+/**
+ * Returns whether there are any requests currently enqueued on any of the
+ * policies of service partition \a svcpt's NRS head specified by \a hp. Should
+ * be called while holding ptlrpc_service_part::scp_req_lock to get a reliable
+ * result.
+ *
+ * \param[in] svcpt the service partition to enquire.
+ * \param[in] hp    whether the regular or high-priority NRS head is to be
+ *		    enquired.
+ *
+ * \retval false the indicated NRS head has no enqueued requests.
+ * \retval true  the indicated NRS head has some enqueued requests.
+ */
+bool ptlrpc_nrs_req_pending_nolock(struct ptlrpc_service_part *svcpt, bool hp)
+{
+	struct ptlrpc_nrs *nrs = nrs_svcpt2nrs(svcpt, hp);
+
+	return nrs->nrs_req_queued > 0;
+}
+
+/**
+ * Returns whether the NRS policy is throttling request handling.
+ *
+ * \param[in] svcpt the service partition to enquire.
+ * \param[in] hp    whether the regular or high-priority NRS head is to be
+ *		    enquired.
+ *
+ * \retval false the indicated NRS head is not throttling.
+ * \retval true  the indicated NRS head is throttling.
+ */
+bool ptlrpc_nrs_req_throttling_nolock(struct ptlrpc_service_part *svcpt,
+				      bool hp)
+{
+	struct ptlrpc_nrs *nrs = nrs_svcpt2nrs(svcpt, hp);
+
+	return !!nrs->nrs_throttling;
+}
+
+/**
+ * Moves request \a req from the regular to the high-priority NRS head.
+ *
+ * \param[in] req the request to move
+ */
+void ptlrpc_nrs_req_hp_move(struct ptlrpc_request *req)
+{
+	struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt;
+	struct ptlrpc_nrs_request *nrq = &req->rq_nrq;
+	struct ptlrpc_nrs_resource *res1[NRS_RES_MAX];
+	struct ptlrpc_nrs_resource *res2[NRS_RES_MAX];
+	ENTRY;
+
+	/**
+	 * Obtain the high-priority NRS head resources.
+	 */
+	nrs_resource_get_safe(nrs_svcpt2nrs(svcpt, true), nrq, res1, true);
+
+	spin_lock(&svcpt->scp_req_lock);
+
+	if (!ptlrpc_nrs_req_can_move(req))
+		goto out;
+
+	ptlrpc_nrs_req_del_nolock(req);
+
+	memcpy(res2, nrq->nr_res_ptrs, NRS_RES_MAX * sizeof(res2[0]));
+	memcpy(nrq->nr_res_ptrs, res1, NRS_RES_MAX * sizeof(res1[0]));
+
+	ptlrpc_nrs_hpreq_add_nolock(req);
+
+	memcpy(res1, res2, NRS_RES_MAX * sizeof(res1[0]));
+out:
+	spin_unlock(&svcpt->scp_req_lock);
+
+	/**
+	 * Release either the regular NRS head resources if we moved the
+	 * request, or the high-priority NRS head resources if we took a
+	 * reference earlier in this function and ptlrpc_nrs_req_can_move()
+	 * returned false.
+	 */
+	nrs_resource_put_safe(res1);
+	EXIT;
+}
+
+/**
+ * Carries out a control operation \a opc on the policy identified by the
+ * human-readable \a name, on either all partitions, or only on the first
+ * partition of service \a svc.
+ *
+ * \param[in]	  svc	 the service the policy belongs to.
+ * \param[in]	  queue	 whether to carry out the command on the policy which
+ *			 belongs to the regular, high-priority, or both NRS
+ *			 heads of service partitions of \a svc.
+ * \param[in]	  name	 the policy to act upon, by human-readable name
+ * \param[in]	  opc	 the opcode of the operation to carry out
+ * \param[in]	  single when set, the operation will only be carried out on the
+ *			 NRS heads of the first service partition of \a svc.
+ *			 This is useful for some policies which e.g. share
+ *			 identical values on the same parameters of different
+ *			 service partitions; when reading these parameters via
+ *			 lprocfs, these policies may just want to obtain and
+ *			 print out the values from the first service partition.
+ *			 Storing these values centrally elsewhere then could be
+ *			 another solution for this.
+ * \param[in,out] arg	 can be used as a generic in/out buffer between control
+ *			 operations and the user environment.
+ *
+ * \retval -ve error condition
+ * \retval   0 operation was carried out successfully
+ */
+int ptlrpc_nrs_policy_control(const struct ptlrpc_service *svc,
+			      enum ptlrpc_nrs_queue_type queue, char *name,
+			      enum ptlrpc_nrs_ctl opc, bool single, void *arg)
+{
+	struct ptlrpc_service_part *svcpt;
+	int i;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(opc != PTLRPC_NRS_CTL_INVALID);
+
+	if ((queue & PTLRPC_NRS_QUEUE_BOTH) == 0)
+		return -EINVAL;
+
+	ptlrpc_service_for_each_part(svcpt, i, svc) {
+		if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) {
+			rc = nrs_policy_ctl(nrs_svcpt2nrs(svcpt, false), name,
+					    opc, arg);
+			if (rc != 0 || (queue == PTLRPC_NRS_QUEUE_REG &&
+					single))
+				GOTO(out, rc);
+		}
+
+		if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) {
+			/**
+			 * XXX: We could optionally check for
+			 * nrs_svc_has_hp(svc) here, and return an error if it
+			 * is false. Right now we rely on the policies' lprocfs
+			 * handlers that call the present function to make this
+			 * check; if they fail to do so, they might hit the
+			 * assertion inside nrs_svcpt2nrs() below.
+			 */
+			rc = nrs_policy_ctl(nrs_svcpt2nrs(svcpt, true), name,
+					    opc, arg);
+			if (rc != 0 || single)
+				GOTO(out, rc);
+		}
+	}
+out:
+	RETURN(rc);
+}
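+
+/*
+ * Illustrative sketch (not part of the original code): reading the CRR-N
+ * quantum on the regular NRS head of a service, mirroring what the CRR-N
+ * lprocfs code in nrs_crr.c does; svc is assumed to come from the caller,
+ * and -ENODEV is returned if the policy is stopped.
+ *
+ *	__u16 quantum;
+ *
+ *	rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG,
+ *				       NRS_POL_NAME_CRRN,
+ *				       NRS_CTL_CRRN_RD_QUANTUM,
+ *				       true, &quantum);
+ */
+
+/**
+ * Adds all policies that ship with the ptlrpc module to NRS core's list of
+ * policies \e nrs_core.nrs_policies.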
+ *
+ * \retval 0 all policies have been registered successfully
+ * \retval -ve error
+ */
+int ptlrpc_nrs_init(void)
+{
+	int rc;
+	ENTRY;
+
+	mutex_init(&nrs_core.nrs_mutex);
+	INIT_LIST_HEAD(&nrs_core.nrs_policies);
+
+	rc = ptlrpc_nrs_policy_register(&nrs_conf_fifo);
+	if (rc != 0)
+		GOTO(fail, rc);
+
+#ifdef HAVE_SERVER_SUPPORT
+	rc = ptlrpc_nrs_policy_register(&nrs_conf_crrn);
+	if (rc != 0)
+		GOTO(fail, rc);
+
+	rc = ptlrpc_nrs_policy_register(&nrs_conf_orr);
+	if (rc != 0)
+		GOTO(fail, rc);
+
+	rc = ptlrpc_nrs_policy_register(&nrs_conf_trr);
+	if (rc != 0)
+		GOTO(fail, rc);
+
+	rc = ptlrpc_nrs_policy_register(&nrs_conf_tbf);
+	if (rc != 0)
+		GOTO(fail, rc);
+#endif /* HAVE_SERVER_SUPPORT */
+
+	rc = ptlrpc_nrs_policy_register(&nrs_conf_delay);
+	if (rc != 0)
+		GOTO(fail, rc);
+
+	RETURN(rc);
+fail:
+	/**
+	 * Since no PTLRPC services have been started at this point, all we
+	 * need to do for cleanup is to free the descriptors.
+	 */
+	ptlrpc_nrs_fini();
+
+	RETURN(rc);
+}
+
+/**
+ * Removes all policy descriptors from nrs_core::nrs_policies, and frees the
+ * policy descriptors.
+ *
+ * Since all PTLRPC services are stopped at this point, there are no more
+ * instances of any policies, because each service will have stopped its policy
+ * instances in ptlrpc_service_nrs_cleanup(), so we just need to free the
+ * descriptors here.
+ */
+void ptlrpc_nrs_fini(void)
+{
+	struct ptlrpc_nrs_pol_desc *desc;
+	struct ptlrpc_nrs_pol_desc *tmp;
+
+	list_for_each_entry_safe(desc, tmp, &nrs_core.nrs_policies,
+				 pd_list) {
+		list_del_init(&desc->pd_list);
+		OBD_FREE_PTR(desc);
+	}
+}
+
+/** @} nrs */
diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_crr.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_crr.c
new file mode 100644
index 0000000000000..dc41970d531b5
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_crr.c
@@ -0,0 +1,830 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License version 2 for more details. A copy is
+ * included in the COPYING file that accompanied this code.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2013, 2017, Intel Corporation.
+ *
+ * Copyright 2012 Xyratex Technology Limited
+ */
+/*
+ * lustre/ptlrpc/nrs_crr.c
+ *
+ * Network Request Scheduler (NRS) CRR-N policy
+ *
+ * Request ordering in a batched Round-Robin manner over client NIDs
+ *
+ * Author: Liang Zhen
+ * Author: Nikitas Angelinas
+ */
+/**
+ * \addtogroup nrs
+ * @{
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lprocfs_status.h>
+#include "ptlrpc_internal.h"
+
+/**
+ * \name CRR-N policy
+ *
+ * Client Round-Robin scheduling over client NIDs
+ *
+ * @{
+ *
+ */
+
+#define NRS_POL_NAME_CRRN	"crrn"
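+
+/*
+ * Worked example (illustrative only): given two queued requests tagged
+ * (round 2, sequence 10) and (round 3, sequence 5), the predicate below
+ * reports the round-2 request as "less than" the round-3 one, so it
+ * percolates towards the binheap root and is served first; the sequence
+ * number only breaks ties between batches within the same round.
+ */
+
+/**
+ * Binary heap predicate.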
+ *
+ * Uses ptlrpc_nrs_request::nr_u::crr::cr_round and
+ * ptlrpc_nrs_request::nr_u::crr::cr_sequence to compare two binheap nodes and
+ * produce a binary predicate that shows their relative priority, so that the
+ * binary heap can perform the necessary sorting operations.
+ *
+ * \param[in] e1 the first binheap node to compare
+ * \param[in] e2 the second binheap node to compare
+ *
+ * \retval 0 e1 > e2
+ * \retval 1 e1 <= e2
+ */
+static int
+crrn_req_compare(struct binheap_node *e1, struct binheap_node *e2)
+{
+	struct ptlrpc_nrs_request *nrq1;
+	struct ptlrpc_nrs_request *nrq2;
+
+	nrq1 = container_of(e1, struct ptlrpc_nrs_request, nr_node);
+	nrq2 = container_of(e2, struct ptlrpc_nrs_request, nr_node);
+
+	if (nrq1->nr_u.crr.cr_round < nrq2->nr_u.crr.cr_round)
+		return 1;
+	else if (nrq1->nr_u.crr.cr_round > nrq2->nr_u.crr.cr_round)
+		return 0;
+
+	return nrq1->nr_u.crr.cr_sequence < nrq2->nr_u.crr.cr_sequence;
+}
+
+static struct binheap_ops nrs_crrn_heap_ops = {
+	.hop_enter	= NULL,
+	.hop_exit	= NULL,
+	.hop_compare	= crrn_req_compare,
+};
+
+/**
+ * rhashtable operations for nrs_crrn_net::cn_cli_hash
+ *
+ * This uses ptlrpc_request::rq_peer.nid as its key, in order to hash
+ * nrs_crrn_client objects.
+ */
+static u32 nrs_crrn_hashfn(const void *data, u32 len, u32 seed)
+{
+	const lnet_nid_t *nid = data;
+
+	/*
+	 * Hash the NID value itself rather than the address of the key, so
+	 * that lookups and insertions map a given NID to the same bucket.
+	 */
+	seed ^= cfs_hash_64(*nid, 32);
+	return seed;
+}
+
+static int nrs_crrn_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
+{
+	const struct nrs_crrn_client *cli = obj;
+	const lnet_nid_t *nid = arg->key;
+
+	return *nid != cli->cc_nid;
+}
+
+static const struct rhashtable_params nrs_crrn_hash_params = {
+	.key_len	= sizeof(lnet_nid_t),
+	.key_offset	= offsetof(struct nrs_crrn_client, cc_nid),
+	.head_offset	= offsetof(struct nrs_crrn_client, cc_rhead),
+	.hashfn		= nrs_crrn_hashfn,
+	.obj_cmpfn	= nrs_crrn_cmpfn,
+};
+
+static void nrs_crrn_exit(void *vcli, void *data)
+{
+	struct nrs_crrn_client *cli = vcli;
+
+	LASSERTF(atomic_read(&cli->cc_ref) == 0,
+		 "Busy CRR-N object from client with NID %s, with %d refs\n",
+		 libcfs_nid2str(cli->cc_nid), atomic_read(&cli->cc_ref));
+
+	OBD_FREE_PTR(cli);
+}
+
+/**
+ * Called when a CRR-N policy instance is started.
+ *
+ * \param[in] policy the policy
+ *
+ * \retval -ENOMEM OOM error
+ * \retval       0 success
+ */
+static int nrs_crrn_start(struct ptlrpc_nrs_policy *policy, char *arg)
+{
+	struct nrs_crrn_net *net;
+	int rc = 0;
+	ENTRY;
+
+	OBD_CPT_ALLOC_PTR(net, nrs_pol2cptab(policy), nrs_pol2cptid(policy));
+	if (net == NULL)
+		RETURN(-ENOMEM);
+
+	net->cn_binheap = binheap_create(&nrs_crrn_heap_ops,
+					 CBH_FLAG_ATOMIC_GROW, 4096, NULL,
+					 nrs_pol2cptab(policy),
+					 nrs_pol2cptid(policy));
+	if (net->cn_binheap == NULL)
+		GOTO(out_net, rc = -ENOMEM);
+
+	rc = rhashtable_init(&net->cn_cli_hash, &nrs_crrn_hash_params);
+	if (rc)
+		GOTO(out_binheap, rc);
+
+	/**
+	 * Set default quantum value to max_rpcs_in_flight for non-MDS OSCs;
+	 * there may be more RPCs pending from each struct nrs_crrn_client even
+	 * with the default max_rpcs_in_flight value, as we are scheduling over
+	 * NIDs, and there may be more than one mount point per client.
+	 */
+	net->cn_quantum = OBD_MAX_RIF_DEFAULT;
+	/**
+	 * Set to 1 so that the test inside nrs_crrn_req_add() can evaluate to
+	 * true.
+	 */
+	net->cn_sequence = 1;
+
+	policy->pol_private = net;
+
+	RETURN(rc);
+
+out_binheap:
+	binheap_destroy(net->cn_binheap);
+out_net:
+	OBD_FREE_PTR(net);
+
+	RETURN(rc);
+}
+
+/**
+ * Called when a CRR-N policy instance is stopped.
+ *
+ * Called when the policy has been instructed to transition to the
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state and has no more pending
+ * requests to serve.
+ *
+ * \param[in] policy the policy
+ */
+static void nrs_crrn_stop(struct ptlrpc_nrs_policy *policy)
+{
+	struct nrs_crrn_net *net = policy->pol_private;
+	ENTRY;
+
+	LASSERT(net != NULL);
+	LASSERT(net->cn_binheap != NULL);
+	LASSERT(binheap_is_empty(net->cn_binheap));
+
+	rhashtable_free_and_destroy(&net->cn_cli_hash, nrs_crrn_exit, NULL);
+	binheap_destroy(net->cn_binheap);
+
+	OBD_FREE_PTR(net);
+}
+
+/**
+ * Performs a policy-specific ctl function on CRR-N policy instances; similar
+ * to ioctl.
+ *
+ * \param[in]	  policy the policy instance
+ * \param[in]	  opc	 the opcode
+ * \param[in,out] arg	 used for passing parameters and information
+ *
+ * \pre  assert_spin_locked(&policy->pol_nrs->nrs_lock)
+ * \post assert_spin_locked(&policy->pol_nrs->nrs_lock)
+ *
+ * \retval   0 operation carried out successfully
+ * \retval -ve error
+ */
+static int nrs_crrn_ctl(struct ptlrpc_nrs_policy *policy,
+			enum ptlrpc_nrs_ctl opc,
+			void *arg)
+{
+	assert_spin_locked(&policy->pol_nrs->nrs_lock);
+
+	switch ((enum nrs_ctl_crr)opc) {
+	default:
+		RETURN(-EINVAL);
+
+	/**
+	 * Read Round Robin quantum size of a policy instance.
+	 */
+	case NRS_CTL_CRRN_RD_QUANTUM: {
+		struct nrs_crrn_net *net = policy->pol_private;
+
+		*(__u16 *)arg = net->cn_quantum;
+	}
+	break;
+
+	/**
+	 * Write Round Robin quantum size of a policy instance.
+	 */
+	case NRS_CTL_CRRN_WR_QUANTUM: {
+		struct nrs_crrn_net *net = policy->pol_private;
+
+		net->cn_quantum = *(__u16 *)arg;
+		LASSERT(net->cn_quantum != 0);
+	}
+	break;
+	}
+
+	RETURN(0);
+}
+
+/**
+ * Obtains resources from CRR-N policy instances. The top-level resource lives
+ * inside \e nrs_crrn_net and the second-level resource inside
+ * \e nrs_crrn_client object instances.
+ *
+ * \param[in]  policy	  the policy for which resources are being taken for
+ *			  request \a nrq
+ * \param[in]  nrq	  the request for which resources are being taken
+ * \param[in]  parent	  parent resource, embedded in nrs_crrn_net for the
+ *			  CRR-N policy
+ * \param[out] resp	  resources references are placed in this array
+ * \param[in]  moving_req signifies limited caller context; used to perform
+ *			  memory allocations in an atomic context in this
+ *			  policy
+ *
+ * \retval 0 we are returning a top-level, parent resource, one that is
+ *	     embedded in an nrs_crrn_net object
+ * \retval 1 we are returning a bottom-level resource, one that is embedded
+ *	     in an nrs_crrn_client object
+ *
+ * \see nrs_resource_get_safe()
+ */
+static int nrs_crrn_res_get(struct ptlrpc_nrs_policy *policy,
+			    struct ptlrpc_nrs_request *nrq,
+			    const struct ptlrpc_nrs_resource *parent,
+			    struct ptlrpc_nrs_resource **resp, bool moving_req)
+{
+	struct nrs_crrn_net *net;
+	struct nrs_crrn_client *cli;
+	struct nrs_crrn_client *tmp;
+	struct ptlrpc_request *req;
+
+	if (parent == NULL) {
+		*resp = &((struct nrs_crrn_net *)policy->pol_private)->cn_res;
+		return 0;
+	}
+
+	net = container_of(parent, struct nrs_crrn_net, cn_res);
+	req = container_of(nrq, struct ptlrpc_request, rq_nrq);
+
+	cli = rhashtable_lookup_fast(&net->cn_cli_hash, &req->rq_peer.nid,
+				     nrs_crrn_hash_params);
+	if (cli)
+		goto out;
+
+	OBD_CPT_ALLOC_GFP(cli, nrs_pol2cptab(policy), nrs_pol2cptid(policy),
+			  sizeof(*cli), moving_req ? GFP_ATOMIC : GFP_NOFS);
+	if (cli == NULL)
+		return -ENOMEM;
+
+	cli->cc_nid = req->rq_peer.nid;
+
+	atomic_set(&cli->cc_ref, 0);
+
+	tmp = rhashtable_lookup_get_insert_fast(&net->cn_cli_hash,
+						&cli->cc_rhead,
+						nrs_crrn_hash_params);
+	if (tmp) {
+		/* insertion failed */
+		OBD_FREE_PTR(cli);
+		if (IS_ERR(tmp))
+			return PTR_ERR(tmp);
+		cli = tmp;
+	}
+out:
+	atomic_inc(&cli->cc_ref);
+	*resp = &cli->cc_res;
+
+	return 1;
+}
+
+/**
+ * Called when releasing references to the resource hierarchy obtained for a
+ * request for scheduling using the CRR-N policy.
+ *
+ * \param[in] policy the policy the resource belongs to
+ * \param[in] res    the resource to be released
+ */
+static void nrs_crrn_res_put(struct ptlrpc_nrs_policy *policy,
+			     const struct ptlrpc_nrs_resource *res)
+{
+	struct nrs_crrn_client *cli;
+
+	/**
+	 * Do nothing for freeing parent, nrs_crrn_net resources
+	 */
+	if (res->res_parent == NULL)
+		return;
+
+	cli = container_of(res, struct nrs_crrn_client, cc_res);
+
+	atomic_dec(&cli->cc_ref);
+}
+
+/**
+ * Called when getting a request from the CRR-N policy for handling, so that it
+ * can be served.
+ *
+ * \param[in] policy the policy being polled
+ * \param[in] peek   when set, signifies that we just want to examine the
+ *		     request, and not handle it, so the request is not removed
+ *		     from the policy.
+ * \param[in] force  force the policy to return a request; unused in this
+ *		     policy
+ *
+ * \retval the request to be handled
+ * \retval NULL no request available
+ *
+ * \see ptlrpc_nrs_req_get_nolock()
+ * \see nrs_request_get()
+ */
+static
+struct ptlrpc_nrs_request *nrs_crrn_req_get(struct ptlrpc_nrs_policy *policy,
+					    bool peek, bool force)
+{
+	struct nrs_crrn_net *net = policy->pol_private;
+	struct binheap_node *node = binheap_root(net->cn_binheap);
+	struct ptlrpc_nrs_request *nrq;
+
+	nrq = unlikely(node == NULL) ? NULL :
+	      container_of(node, struct ptlrpc_nrs_request, nr_node);
+
+	if (likely(!peek && nrq != NULL)) {
+		struct nrs_crrn_client *cli;
+		struct ptlrpc_request *req = container_of(nrq,
+							  struct ptlrpc_request,
+							  rq_nrq);
+
+		cli = container_of(nrs_request_resource(nrq),
+				   struct nrs_crrn_client, cc_res);
+
+		LASSERT(nrq->nr_u.crr.cr_round <= cli->cc_round);
+
+		binheap_remove(net->cn_binheap, &nrq->nr_node);
+		cli->cc_active--;
+
+		CDEBUG(D_RPCTRACE,
+		       "NRS: starting to handle %s request from %s, with round "
+		       "%llu\n", NRS_POL_NAME_CRRN,
+		       libcfs_id2str(req->rq_peer), nrq->nr_u.crr.cr_round);
+
+		/** Peek at the next request to be served */
+		node = binheap_root(net->cn_binheap);
+
+		/** No more requests */
+		if (unlikely(node == NULL)) {
+			net->cn_round++;
+		} else {
+			struct ptlrpc_nrs_request *next;
+
+			next = container_of(node, struct ptlrpc_nrs_request,
+					    nr_node);
+
+			if (net->cn_round < next->nr_u.crr.cr_round)
+				net->cn_round = next->nr_u.crr.cr_round;
+		}
+	}
+
+	return nrq;
+}
+
+/**
+ * Adds request \a nrq to a CRR-N \a policy instance's set of queued requests
+ *
+ * A scheduling round is a stream of requests that have been sorted in batches
+ * according to the client that they originate from (as identified by its NID);
+ * there can be only one batch for each client in each round. The batches are
+ * of maximum size nrs_crrn_net::cn_quantum. When a new request arrives for
+ * scheduling from a client that has exhausted its quantum in its current
+ * round, it will start scheduling requests on the next scheduling round.
+ * Clients are allowed to schedule requests against a round until all requests
+ * for the round are serviced, so a client might miss a round if it is not
+ * generating requests for a long enough period of time. Clients that miss a
+ * round will continue with scheduling the next request that they generate,
+ * starting at the round that requests are being dispatched for, at the time
+ * of arrival of this new request.
+ *
+ * Requests are tagged with the round number and a sequence number; the
+ * sequence number indicates the relative ordering amongst the batches of
+ * requests in a round, and is identical for all requests in a batch, as is
+ * the round number. The round and sequence numbers are used by
+ * crrn_req_compare() in order to maintain an ordered set of rounds, with each
+ * round consisting of an ordered set of batches of requests.
+ *
+ * \param[in] policy the policy
+ * \param[in] nrq    the request to add
+ *
+ * \retval    0 request successfully added
+ * \retval != 0 error
+ */
+static int nrs_crrn_req_add(struct ptlrpc_nrs_policy *policy,
+			    struct ptlrpc_nrs_request *nrq)
+{
+	struct nrs_crrn_net *net;
+	struct nrs_crrn_client *cli;
+	int rc;
+
+	cli = container_of(nrs_request_resource(nrq),
+			   struct nrs_crrn_client, cc_res);
+	net = container_of(nrs_request_resource(nrq)->res_parent,
+			   struct nrs_crrn_net, cn_res);
+
+	if (cli->cc_quantum == 0 || cli->cc_round < net->cn_round ||
+	    (cli->cc_active == 0 && cli->cc_quantum > 0)) {
+
+		/**
+		 * If the client has no pending requests, and still some of its
+		 * quantum remaining unused, which implies it has not had a
+		 * chance to schedule up to its maximum allowed batch size of
+		 * requests in the previous round it participated, schedule
+		 * this next request on a new round; this avoids fragmentation
+		 * of request batches caused by client inactivity, at the
+		 * expense of potentially slightly increased service time for
+		 * the request batch this request will be a part of.
+		 */
+		if (cli->cc_active == 0 && cli->cc_quantum > 0)
+			cli->cc_round++;
+
+		/** A new scheduling round has commenced */
+		if (cli->cc_round < net->cn_round)
+			cli->cc_round = net->cn_round;
+
+		/** I was not the last client through here */
+		if (cli->cc_sequence < net->cn_sequence)
+			cli->cc_sequence = ++net->cn_sequence;
+		/**
+		 * Reset the quantum if we have reached the maximum quantum
+		 * size for this batch, or even if we have not managed to
+		 * complete a batch size up to its maximum allowed size.
+		 * XXX: Accessed unlocked
+		 */
+		cli->cc_quantum = net->cn_quantum;
+	}
+
+	nrq->nr_u.crr.cr_round = cli->cc_round;
+	nrq->nr_u.crr.cr_sequence = cli->cc_sequence;
+
+	rc = binheap_insert(net->cn_binheap, &nrq->nr_node);
+	if (rc == 0) {
+		cli->cc_active++;
+		if (--cli->cc_quantum == 0)
+			cli->cc_round++;
+	}
+	return rc;
+}
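+
+/*
+ * Worked example (illustrative only): with cn_quantum = 2 and a freshly
+ * started instance (cn_round 0, cn_sequence 1), requests A1, A2, A3 from
+ * client A followed by B1 from client B are tagged by the logic above as:
+ *
+ *	A1, A2 -> (round 0, sequence 2); A's batch of two requests
+ *		  exhausts its quantum, bumping client A to round 1;
+ *	A3     -> (round 1, sequence 2);
+ *	B1     -> (round 0, sequence 3).
+ *
+ * crrn_req_compare() then yields the service order A1, A2, B1, A3: all of
+ * round 0 in sequence order before anything from round 1.
+ */
+
+/**
+ * Removes request \a nrq from a CRR-N \a policy instance's set of queued
+ * requests.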
+ * + * \param[in] policy the policy + * \param[in] nrq the request to remove + */ +static void nrs_crrn_req_del(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct nrs_crrn_net *net; + struct nrs_crrn_client *cli; + bool is_root; + + cli = container_of(nrs_request_resource(nrq), + struct nrs_crrn_client, cc_res); + net = container_of(nrs_request_resource(nrq)->res_parent, + struct nrs_crrn_net, cn_res); + + LASSERT(nrq->nr_u.crr.cr_round <= cli->cc_round); + + is_root = &nrq->nr_node == binheap_root(net->cn_binheap); + + binheap_remove(net->cn_binheap, &nrq->nr_node); + cli->cc_active--; + + /** + * If we just deleted the node at the root of the binheap, we may have + * to adjust round numbers. + */ + if (unlikely(is_root)) { + /** Peek at the next request to be served */ + struct binheap_node *node = binheap_root(net->cn_binheap); + + /** No more requests */ + if (unlikely(node == NULL)) { + net->cn_round++; + } else { + nrq = container_of(node, struct ptlrpc_nrs_request, + nr_node); + + if (net->cn_round < nrq->nr_u.crr.cr_round) + net->cn_round = nrq->nr_u.crr.cr_round; + } + } +} + +/** + * Called right after the request \a nrq finishes being handled by CRR-N policy + * instance \a policy. + * + * \param[in] policy the policy that handled the request + * \param[in] nrq the request that was handled + */ +static void nrs_crrn_req_stop(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct ptlrpc_request *req = container_of(nrq, struct ptlrpc_request, + rq_nrq); + + CDEBUG(D_RPCTRACE, + "NRS: finished handling %s request from %s, with round %llu" + "\n", NRS_POL_NAME_CRRN, + libcfs_id2str(req->rq_peer), nrq->nr_u.crr.cr_round); +} + +/** + * debugfs interface + */ + +/** + * Retrieves the value of the Round Robin quantum (i.e. the maximum batch size) + * for CRR-N policy instances on both the regular and high-priority NRS head + * of a service, as long as a policy instance is not in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state; policy instances in this + * state are skipped later by nrs_crrn_ctl(). + * + * Quantum values are in # of RPCs, and output is in YAML format. + * + * For example: + * + * reg_quantum:8 + * hp_quantum:4 + */ +static int +ptlrpc_lprocfs_nrs_crrn_quantum_seq_show(struct seq_file *m, void *data) +{ + struct ptlrpc_service *svc = m->private; + __u16 quantum; + int rc; + + /** + * Perform two separate calls to this as only one of the NRS heads' + * policies may be in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED or + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING state. + */ + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + NRS_POL_NAME_CRRN, + NRS_CTL_CRRN_RD_QUANTUM, + true, &quantum); + if (rc == 0) { + seq_printf(m, NRS_LPROCFS_QUANTUM_NAME_REG + "%-5d\n", quantum); + /** + * Ignore -ENODEV as the regular NRS head's policy may be in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + } else if (rc != -ENODEV) { + return rc; + } + + if (!nrs_svc_has_hp(svc)) + goto no_hp; + + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + NRS_POL_NAME_CRRN, + NRS_CTL_CRRN_RD_QUANTUM, + true, &quantum); + if (rc == 0) { + seq_printf(m, NRS_LPROCFS_QUANTUM_NAME_HP"%-5d\n", quantum); + /** + * Ignore -ENODEV as the high priority NRS head's policy may be + * in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + } else if (rc != -ENODEV) { + return rc; + } + +no_hp: + return rc; +} + +/** + * Sets the value of the Round Robin quantum (i.e. 
the maximum batch size) + * for CRR-N policy instances of a service. The user can set the quantum size + * for the regular or high priority NRS head individually by specifying each + * value, or both together in a single invocation. + * + * For example: + * + * lctl set_param *.*.*.nrs_crrn_quantum=reg_quantum:32, to set the regular + * request quantum size on all PTLRPC services to 32 + * + * lctl set_param *.*.*.nrs_crrn_quantum=hp_quantum:16, to set the high + * priority request quantum size on all PTLRPC services to 16, and + * + * lctl set_param *.*.ost_io.nrs_crrn_quantum=16, to set both the regular and + * high priority request quantum sizes of the ost_io service to 16. + * + * policy instances in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state + * are skipped later by nrs_crrn_ctl(). + */ +static ssize_t +ptlrpc_lprocfs_nrs_crrn_quantum_seq_write(struct file *file, + const char __user *buffer, + size_t count, + loff_t *off) +{ + struct seq_file *m = file->private_data; + struct ptlrpc_service *svc = m->private; + enum ptlrpc_nrs_queue_type queue = 0; + char kernbuf[LPROCFS_NRS_WR_QUANTUM_MAX_CMD]; + char *val; + long quantum_reg; + long quantum_hp; + /** lprocfs_find_named_value() modifies its argument, so keep a copy */ + size_t count_copy; + int rc = 0; + int rc2 = 0; + + if (count > (sizeof(kernbuf) - 1)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + + kernbuf[count] = '\0'; + + count_copy = count; + + /** + * Check if the regular quantum value has been specified + */ + val = lprocfs_find_named_value(kernbuf, NRS_LPROCFS_QUANTUM_NAME_REG, + &count_copy); + if (val != kernbuf) { + rc = kstrtol(val, 10, &quantum_reg); + if (rc) + return rc; + + queue |= PTLRPC_NRS_QUEUE_REG; + } + + count_copy = count; + + /** + * Check if the high priority quantum value has been specified + */ + val = lprocfs_find_named_value(kernbuf, NRS_LPROCFS_QUANTUM_NAME_HP, + &count_copy); + if (val != kernbuf) { + if (!nrs_svc_has_hp(svc)) + return -ENODEV; + + rc = kstrtol(val, 10, &quantum_hp); + if (rc) + return rc; + + queue |= PTLRPC_NRS_QUEUE_HP; + } + + /** + * If none of the queues has been specified, look for a valid numerical + * value + */ + if (queue == 0) { + rc = kstrtol(kernbuf, 10, &quantum_reg); + if (rc) + return rc; + + queue = PTLRPC_NRS_QUEUE_REG; + + if (nrs_svc_has_hp(svc)) { + queue |= PTLRPC_NRS_QUEUE_HP; + quantum_hp = quantum_reg; + } + } + + if ((((queue & PTLRPC_NRS_QUEUE_REG) != 0) && + ((quantum_reg > LPROCFS_NRS_QUANTUM_MAX || quantum_reg <= 0))) || + (((queue & PTLRPC_NRS_QUEUE_HP) != 0) && + ((quantum_hp > LPROCFS_NRS_QUANTUM_MAX || quantum_hp <= 0)))) + return -EINVAL; + + /** + * We change the values on regular and HP NRS heads separately, so that + * we do not exit early from ptlrpc_nrs_policy_control() with an error + * returned by nrs_policy_ctl_locked(), in cases where the user has not + * started the policy on either the regular or HP NRS head; i.e. we are + * ignoring -ENODEV within nrs_policy_ctl_locked(). -ENODEV is returned + * only if the operation fails with -ENODEV on all heads that have been + * specified by the command; if at least one operation succeeds, + * success is returned. 
+ */ + if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) { + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + NRS_POL_NAME_CRRN, + NRS_CTL_CRRN_WR_QUANTUM, false, + &quantum_reg); + if ((rc < 0 && rc != -ENODEV) || + (rc == -ENODEV && queue == PTLRPC_NRS_QUEUE_REG)) + return rc; + } + + if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) { + rc2 = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + NRS_POL_NAME_CRRN, + NRS_CTL_CRRN_WR_QUANTUM, false, + &quantum_hp); + if ((rc2 < 0 && rc2 != -ENODEV) || + (rc2 == -ENODEV && queue == PTLRPC_NRS_QUEUE_HP)) + return rc2; + } + + return rc == -ENODEV && rc2 == -ENODEV ? -ENODEV : count; +} + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_crrn_quantum); + +/** + * Initializes a CRR-N policy's lprocfs interface for service \a svc + * + * \param[in] svc the service + * + * \retval 0 success + * \retval != 0 error + */ +static int nrs_crrn_lprocfs_init(struct ptlrpc_service *svc) +{ + struct ldebugfs_vars nrs_crrn_lprocfs_vars[] = { + { .name = "nrs_crrn_quantum", + .fops = &ptlrpc_lprocfs_nrs_crrn_quantum_fops, + .data = svc }, + { NULL } + }; + + if (!svc->srv_debugfs_entry) + return 0; + + ldebugfs_add_vars(svc->srv_debugfs_entry, nrs_crrn_lprocfs_vars, NULL); + + return 0; +} + +/** + * CRR-N policy operations + */ +static const struct ptlrpc_nrs_pol_ops nrs_crrn_ops = { + .op_policy_start = nrs_crrn_start, + .op_policy_stop = nrs_crrn_stop, + .op_policy_ctl = nrs_crrn_ctl, + .op_res_get = nrs_crrn_res_get, + .op_res_put = nrs_crrn_res_put, + .op_req_get = nrs_crrn_req_get, + .op_req_enqueue = nrs_crrn_req_add, + .op_req_dequeue = nrs_crrn_req_del, + .op_req_stop = nrs_crrn_req_stop, + .op_lprocfs_init = nrs_crrn_lprocfs_init, +}; + +/** + * CRR-N policy configuration + */ +struct ptlrpc_nrs_pol_conf nrs_conf_crrn = { + .nc_name = NRS_POL_NAME_CRRN, + .nc_ops = &nrs_crrn_ops, + .nc_compat = nrs_policy_compat_all, +}; + +/** @} CRR-N policy */ + +/** @} nrs */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_delay.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_delay.c new file mode 100644 index 0000000000000..c09dd7eaff28e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_delay.c @@ -0,0 +1,829 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, Cray Inc. All Rights Reserved. + * + * Copyright (c) 2017, Intel Corporation. + */ +/* + * lustre/ptlrpc/nrs_delay.c + * + * Network Request Scheduler (NRS) Delay policy + * + * This policy will delay request handling for some configurable amount of + * time. 
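+ *
+ * As an illustrative sketch (the exact parameter paths depend on the local
+ * setup), the policy could be enabled and tuned with something like:
+ *
+ *   lctl set_param *.*.ost_io.nrs_policies="delay"
+ *   lctl set_param *.*.ost_io.nrs_delay_min=5
+ *   lctl set_param *.*.ost_io.nrs_delay_max=30
+ *   lctl set_param *.*.ost_io.nrs_delay_pct=50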
+ *
+ * Author: Chris Horn
+ */
+/**
+ * \addtogroup nrs
+ * @{
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <linux/random.h>
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include "ptlrpc_internal.h"
+
+/**
+ * \name delay
+ *
+ * The delay policy schedules RPCs so that they are only processed after some
+ * configurable amount of time (in seconds) has passed.
+ *
+ * The defaults were chosen arbitrarily.
+ *
+ * @{
+ */
+
+#define NRS_POL_NAME_DELAY	"delay"
+
+/* Default minimum delay, in seconds. */
+#define NRS_DELAY_MIN_DEFAULT	5
+/* Default maximum delay, in seconds. */
+#define NRS_DELAY_MAX_DEFAULT	300
+/* Default percentage of delayed RPCs. */
+#define NRS_DELAY_PCT_DEFAULT	100
+
+/**
+ * Binary heap predicate.
+ *
+ * Elements are sorted according to the start time assigned to the requests
+ * upon enqueue. An element with an earlier start time is "less than" an
+ * element with a later start time.
+ *
+ * \retval 0 start_time(e1) > start_time(e2)
+ * \retval 1 start_time(e1) <= start_time(e2)
+ */
+static int delay_req_compare(struct binheap_node *e1,
+			     struct binheap_node *e2)
+{
+	struct ptlrpc_nrs_request *nrq1;
+	struct ptlrpc_nrs_request *nrq2;
+
+	nrq1 = container_of(e1, struct ptlrpc_nrs_request, nr_node);
+	nrq2 = container_of(e2, struct ptlrpc_nrs_request, nr_node);
+
+	return nrq1->nr_u.delay.req_start_time <=
+	       nrq2->nr_u.delay.req_start_time;
+}
+
+static struct binheap_ops nrs_delay_heap_ops = {
+	.hop_enter	= NULL,
+	.hop_exit	= NULL,
+	.hop_compare	= delay_req_compare,
+};
+
+/**
+ * Is called before the policy transitions into
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED; allocates and initializes
+ * the delay-specific private data structure.
+ *
+ * \param[in] policy The policy to start
+ * \param[in] arg Generic char buffer; unused in this policy
+ *
+ * \retval -ENOMEM OOM error
+ * \retval 0 success
+ *
+ * \see nrs_policy_register()
+ * \see nrs_policy_ctl()
+ */
+static int nrs_delay_start(struct ptlrpc_nrs_policy *policy, char *arg)
+{
+	struct nrs_delay_data *delay_data;
+
+	ENTRY;
+
+	OBD_CPT_ALLOC_PTR(delay_data, nrs_pol2cptab(policy),
+			  nrs_pol2cptid(policy));
+	if (delay_data == NULL)
+		RETURN(-ENOMEM);
+
+	delay_data->delay_binheap = binheap_create(&nrs_delay_heap_ops,
+						   CBH_FLAG_ATOMIC_GROW,
+						   4096, NULL,
+						   nrs_pol2cptab(policy),
+						   nrs_pol2cptid(policy));
+
+	if (delay_data->delay_binheap == NULL) {
+		OBD_FREE_PTR(delay_data);
+		RETURN(-ENOMEM);
+	}
+
+	delay_data->min_delay = NRS_DELAY_MIN_DEFAULT;
+	delay_data->max_delay = NRS_DELAY_MAX_DEFAULT;
+	delay_data->delay_pct = NRS_DELAY_PCT_DEFAULT;
+
+	policy->pol_private = delay_data;
+
+	RETURN(0);
+}
+
+/**
+ * Is called before the policy transitions into
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED; deallocates the delay-specific
+ * private data structure.
+ *
+ * \param[in] policy The policy to stop
+ *
+ * \see nrs_policy_stop0()
+ */
+static void nrs_delay_stop(struct ptlrpc_nrs_policy *policy)
+{
+	struct nrs_delay_data *delay_data = policy->pol_private;
+
+	LASSERT(delay_data != NULL);
+	LASSERT(delay_data->delay_binheap != NULL);
+	LASSERT(binheap_is_empty(delay_data->delay_binheap));
+
+	binheap_destroy(delay_data->delay_binheap);
+
+	OBD_FREE_PTR(delay_data);
+}
+
+/**
+ * Is called for obtaining a delay policy resource.
+ *
+ * \param[in] policy The policy on which the request is being asked for
+ * \param[in] nrq The request for which resources are being taken
+ * \param[in] parent Parent resource, unused in this policy
+ * \param[out] resp Resources references are placed in this array
+ * \param[in] moving_req Signifies limited caller context; unused in this
+ *			 policy
+ *
+ * \retval 1 The delay policy only has a one-level resource hierarchy
+ *
+ * \see nrs_resource_get_safe()
+ */
+static int nrs_delay_res_get(struct ptlrpc_nrs_policy *policy,
+			     struct ptlrpc_nrs_request *nrq,
+			     const struct ptlrpc_nrs_resource *parent,
+			     struct ptlrpc_nrs_resource **resp, bool moving_req)
+{
+	/**
+	 * Just return the resource embedded inside nrs_delay_data, and end
+	 * this resource hierarchy reference request.
+	 */
+	*resp = &((struct nrs_delay_data *)policy->pol_private)->delay_res;
+	return 1;
+}
+
+/**
+ * Called when getting a request from the delay policy for handling, or just
+ * peeking; removes the request from the policy when it is to be handled.
+ * Requests are only removed from this policy when their start time has
+ * passed.
+ *
+ * \param[in] policy The policy
+ * \param[in] peek When set, signifies that we just want to examine the
+ *		   request, and not handle it, so the request is not removed
+ *		   from the policy.
+ * \param[in] force Force the policy to return a request
+ *
+ * \retval The request to be handled
+ * \retval NULL no request available
+ *
+ * \see ptlrpc_nrs_req_get_nolock()
+ * \see nrs_request_get()
+ */
+static
+struct ptlrpc_nrs_request *nrs_delay_req_get(struct ptlrpc_nrs_policy *policy,
+					     bool peek, bool force)
+{
+	struct nrs_delay_data *delay_data = policy->pol_private;
+	struct binheap_node *node;
+	struct ptlrpc_nrs_request *nrq;
+
+	node = binheap_root(delay_data->delay_binheap);
+	nrq = unlikely(node == NULL) ? NULL :
+	      container_of(node, struct ptlrpc_nrs_request, nr_node);
+
+	if (likely(nrq != NULL)) {
+		if (!force &&
+		    ktime_get_real_seconds() < nrq->nr_u.delay.req_start_time)
+			nrq = NULL;
+		else if (likely(!peek))
+			binheap_remove(delay_data->delay_binheap,
+				       &nrq->nr_node);
+	}
+
+	return nrq;
+}
+
+/**
+ * Adds request \a nrq to a delay \a policy instance's set of queued requests
+ *
+ * A percentage (delay_pct) of incoming requests are delayed by this policy.
+ * If selected for delay, a request start time is calculated. The start time
+ * is the current time plus a random offset in the range
+ * [min_delay, max_delay]. The start time is recorded in the request, and is
+ * then used by delay_req_compare() to maintain a set of requests ordered by
+ * their start times.
+ *
+ * \param[in] policy The policy
+ * \param[in] nrq The request to add
+ *
+ * \retval 0 request added
+ * \retval 1 request not added
+ */
+static int nrs_delay_req_add(struct ptlrpc_nrs_policy *policy,
+			     struct ptlrpc_nrs_request *nrq)
+{
+	struct nrs_delay_data *delay_data = policy->pol_private;
+
+	if (delay_data->delay_pct == 0 || /* Not delaying anything */
+	    (delay_data->delay_pct != 100 &&
+	     delay_data->delay_pct < get_random_u32_below(100)))
+		return 1;
+
+	nrq->nr_u.delay.req_start_time = ktime_get_real_seconds() +
+		get_random_u32_below(delay_data->max_delay -
+				     delay_data->min_delay + 1) +
+		delay_data->min_delay;
+
+	return binheap_insert(delay_data->delay_binheap, &nrq->nr_node);
+}
+
+/**
+ * Removes request \a nrq from \a policy's list of queued requests.
+ *
+ * \param[in] policy The policy
+ * \param[in] nrq The request to remove
+ */
+static void nrs_delay_req_del(struct ptlrpc_nrs_policy *policy,
+			      struct ptlrpc_nrs_request *nrq)
+{
+	struct nrs_delay_data *delay_data = policy->pol_private;
+
+	binheap_remove(delay_data->delay_binheap, &nrq->nr_node);
+}
+
+/**
+ * Prints a debug statement right before the request \a nrq stops being
+ * handled.
+ *
+ * \param[in] policy The policy handling the request
+ * \param[in] nrq The request being handled
+ *
+ * \see ptlrpc_server_finish_request()
+ * \see ptlrpc_nrs_req_stop_nolock()
+ */
+static void nrs_delay_req_stop(struct ptlrpc_nrs_policy *policy,
+			       struct ptlrpc_nrs_request *nrq)
+{
+	struct ptlrpc_request *req = container_of(nrq, struct ptlrpc_request,
+						  rq_nrq);
+
+	DEBUG_REQ(D_RPCTRACE, req,
+		  "NRS: finished delayed request from %s after %llds",
+		  libcfs_id2str(req->rq_peer),
+		  (s64)(nrq->nr_u.delay.req_start_time -
+			req->rq_srv.sr_arrival_time.tv_sec));
+}
+
+/**
+ * Performs ctl functions specific to delay policy instances; similar to ioctl
+ *
+ * \param[in] policy the policy instance
+ * \param[in] opc the opcode
+ * \param[in,out] arg used for passing parameters and information
+ *
+ * \pre assert_spin_locked(&policy->pol_nrs->nrs_lock)
+ * \post assert_spin_locked(&policy->pol_nrs->nrs_lock)
+ *
+ * \retval 0 operation carried out successfully
+ * \retval -ve error
+ */
+static int nrs_delay_ctl(struct ptlrpc_nrs_policy *policy,
+			 enum ptlrpc_nrs_ctl opc, void *arg)
+{
+	struct nrs_delay_data *delay_data = policy->pol_private;
+	__u32 *val = (__u32 *)arg;
+
+	assert_spin_locked(&policy->pol_nrs->nrs_lock);
+
+	switch ((enum nrs_ctl_delay)opc) {
+	default:
+		RETURN(-EINVAL);
+
+	case NRS_CTL_DELAY_RD_MIN:
+		*val = delay_data->min_delay;
+		break;
+
+	case NRS_CTL_DELAY_WR_MIN:
+		if (*val > delay_data->max_delay)
+			RETURN(-EINVAL);
+
+		delay_data->min_delay = *val;
+		break;
+
+	case NRS_CTL_DELAY_RD_MAX:
+		*val = delay_data->max_delay;
+		break;
+
+	case NRS_CTL_DELAY_WR_MAX:
+		if (*val < delay_data->min_delay)
+			RETURN(-EINVAL);
+
+		delay_data->max_delay = *val;
+		break;
+
+	case NRS_CTL_DELAY_RD_PCT:
+		*val = delay_data->delay_pct;
+		break;
+
+	case NRS_CTL_DELAY_WR_PCT:
+		if (*val > 100)
+			RETURN(-EINVAL);
+
+		delay_data->delay_pct = *val;
+		break;
+	}
+	RETURN(0);
+}
+
+/**
+ * debugfs interface
+ */
+
+/* nrs_delay_min and nrs_delay_max are bounded by these values */
+#define LPROCFS_NRS_DELAY_LOWER_BOUND	0
+#define LPROCFS_NRS_DELAY_UPPER_BOUND	65535
+
+#define LPROCFS_NRS_DELAY_MIN_NAME	"delay_min:"
+#define LPROCFS_NRS_DELAY_MIN_NAME_REG	"reg_delay_min:"
+#define LPROCFS_NRS_DELAY_MIN_NAME_HP	"hp_delay_min:"
+
+/**
+ * Max size of the nrs_delay_min seq_write buffer. Needs to be large enough
+ * to hold the string: "reg_delay_min:65535 hp_delay_min:65535"
+ */
+#define LPROCFS_NRS_DELAY_MIN_SIZE					\
+	sizeof(LPROCFS_NRS_DELAY_MIN_NAME_REG				\
+	       __stringify(LPROCFS_NRS_DELAY_UPPER_BOUND)		\
+	       " " LPROCFS_NRS_DELAY_MIN_NAME_HP			\
+	       __stringify(LPROCFS_NRS_DELAY_UPPER_BOUND))
+
+#define LPROCFS_NRS_DELAY_MAX_NAME	"delay_max:"
+#define LPROCFS_NRS_DELAY_MAX_NAME_REG	"reg_delay_max:"
+#define LPROCFS_NRS_DELAY_MAX_NAME_HP	"hp_delay_max:"
+
+/**
+ * Similar to LPROCFS_NRS_DELAY_MIN_SIZE above, but for the nrs_delay_max
+ * variable.
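+ *
+ * For example, it must be able to hold the worst-case string:
+ *
+ *   "reg_delay_max:65535 hp_delay_max:65535"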
+ */
+#define LPROCFS_NRS_DELAY_MAX_SIZE					\
+	sizeof(LPROCFS_NRS_DELAY_MAX_NAME_REG				\
+	       __stringify(LPROCFS_NRS_DELAY_UPPER_BOUND)		\
+	       " " LPROCFS_NRS_DELAY_MAX_NAME_HP			\
+	       __stringify(LPROCFS_NRS_DELAY_UPPER_BOUND))
+
+#define LPROCFS_NRS_DELAY_PCT_MIN_VAL	0
+#define LPROCFS_NRS_DELAY_PCT_MAX_VAL	100
+#define LPROCFS_NRS_DELAY_PCT_NAME	"delay_pct:"
+#define LPROCFS_NRS_DELAY_PCT_NAME_REG	"reg_delay_pct:"
+#define LPROCFS_NRS_DELAY_PCT_NAME_HP	"hp_delay_pct:"
+
+/**
+ * Similar to LPROCFS_NRS_DELAY_MIN_SIZE above, but for the nrs_delay_pct
+ * variable.
+ */
+#define LPROCFS_NRS_DELAY_PCT_SIZE					\
+	sizeof(LPROCFS_NRS_DELAY_PCT_NAME_REG				\
+	       __stringify(LPROCFS_NRS_DELAY_PCT_MAX_VAL)		\
+	       " " LPROCFS_NRS_DELAY_PCT_NAME_HP			\
+	       __stringify(LPROCFS_NRS_DELAY_PCT_MAX_VAL))
+
+/**
+ * Helper for delay's seq_write functions.
+ */
+static ssize_t
+lprocfs_nrs_delay_seq_write_common(const char __user *buffer,
+				   unsigned int bufsize, size_t count,
+				   const char *var_name, unsigned int min_val,
+				   unsigned int max_val,
+				   struct ptlrpc_service *svc, char *pol_name,
+				   enum ptlrpc_nrs_ctl opc, bool single)
+{
+	enum ptlrpc_nrs_queue_type queue = 0;
+	char *kernbuf;
+	char *val_str;
+	unsigned long val_reg;
+	unsigned long val_hp;
+	size_t count_copy;
+	int rc = 0;
+	char *tmp = NULL;
+	int tmpsize = 0;
+
+	if (count > bufsize - 1)
+		return -EINVAL;
+
+	OBD_ALLOC(kernbuf, bufsize);
+	if (kernbuf == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(kernbuf, buffer, count))
+		GOTO(free_kernbuf, rc = -EFAULT);
+
+	tmpsize = strlen("reg_") + strlen(var_name) + 1;
+	OBD_ALLOC(tmp, tmpsize);
+	if (tmp == NULL)
+		GOTO(free_kernbuf, rc = -ENOMEM);
+
+	/* look for "reg_" in kernbuf */
+	snprintf(tmp, tmpsize, "reg_%s", var_name);
+	count_copy = count;
+	val_str = lprocfs_find_named_value(kernbuf, tmp, &count_copy);
+	if (val_str != kernbuf) {
+		rc = kstrtoul(val_str, 10, &val_reg);
+		if (rc != 0)
+			GOTO(free_tmp, rc = -EINVAL);
+		queue |= PTLRPC_NRS_QUEUE_REG;
+	}
+
+	/* look for "hp_" in kernbuf */
+	snprintf(tmp, tmpsize, "hp_%s", var_name);
+	count_copy = count;
+	val_str = lprocfs_find_named_value(kernbuf, tmp, &count_copy);
+	if (val_str != kernbuf) {
+		if (!nrs_svc_has_hp(svc))
+			GOTO(free_tmp, rc = -ENODEV);
+
+		rc = kstrtoul(val_str, 10, &val_hp);
+		if (rc != 0)
+			GOTO(free_tmp, rc = -EINVAL);
+		queue |= PTLRPC_NRS_QUEUE_HP;
+	}
+
+	if (queue == 0) {
+		if (!isdigit(kernbuf[0]))
+			GOTO(free_tmp, rc = -EINVAL);
+
+		rc = kstrtoul(kernbuf, 10, &val_reg);
+		if (rc != 0)
+			GOTO(free_tmp, rc = -EINVAL);
+
+		queue = PTLRPC_NRS_QUEUE_REG;
+
+		if (nrs_svc_has_hp(svc)) {
+			queue |= PTLRPC_NRS_QUEUE_HP;
+			val_hp = val_reg;
+		}
+	}
+
+	if (queue & PTLRPC_NRS_QUEUE_REG) {
+		if (val_reg > max_val || val_reg < min_val)
+			GOTO(free_tmp, rc = -EINVAL);
+
+		rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG,
+					       pol_name, opc, single, &val_reg);
+		if ((rc < 0 && rc != -ENODEV) ||
+		    (rc == -ENODEV && queue == PTLRPC_NRS_QUEUE_REG))
+			GOTO(free_tmp, rc);
+	}
+
+	if (queue & PTLRPC_NRS_QUEUE_HP) {
+		int rc2 = 0;
+
+		if (val_hp > max_val || val_hp < min_val)
+			GOTO(free_tmp, rc = -EINVAL);
+
+		rc2 = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP,
+						pol_name, opc, single, &val_hp);
+		if ((rc2 < 0 && rc2 != -ENODEV) ||
+		    (rc2 == -ENODEV && queue == PTLRPC_NRS_QUEUE_HP))
+			GOTO(free_tmp, rc = rc2);
+	}
+
+	/* If we've reached here then we want to return count */
+	rc = count;
+
+free_tmp:
+	OBD_FREE(tmp, tmpsize);
+free_kernbuf:
+	OBD_FREE(kernbuf, bufsize);
+
+	return rc;
+}
+
+/**
+ * Retrieves the value of the minimum delay
+ * for delay policy instances on both the regular and high-priority NRS head
+ * of a service, as long as a policy instance is not in the
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state.
+ */
+static int
+ptlrpc_lprocfs_nrs_delay_min_seq_show(struct seq_file *m, void *data)
+{
+	struct ptlrpc_service *svc = m->private;
+	unsigned int min_delay;
+	int rc;
+
+	rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG,
+				       NRS_POL_NAME_DELAY,
+				       NRS_CTL_DELAY_RD_MIN,
+				       true, &min_delay);
+
+	if (rc == 0)
+		seq_printf(m, LPROCFS_NRS_DELAY_MIN_NAME_REG"%-5d\n",
+			   min_delay);
+	/**
+	 * Ignore -ENODEV as the regular NRS head's policy may be in
+	 * the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state.
+	 */
+	else if (rc != -ENODEV)
+		return rc;
+
+	if (!nrs_svc_has_hp(svc))
+		return 0;
+
+	rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP,
+				       NRS_POL_NAME_DELAY,
+				       NRS_CTL_DELAY_RD_MIN,
+				       true, &min_delay);
+	if (rc == 0)
+		seq_printf(m, LPROCFS_NRS_DELAY_MIN_NAME_HP"%-5d\n",
+			   min_delay);
+	/**
+	 * Ignore -ENODEV as the high priority NRS head's policy may be in
+	 * the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state.
+	 */
+	else if (rc == -ENODEV)
+		rc = 0;
+
+	return rc;
+}
+
+/**
+ * Sets the value of the minimum request delay for delay policy instances of a
+ * service. The user can set the minimum request delay for the regular or high
+ * priority NRS head individually by specifying each value, or both together in
+ * a single invocation.
+ *
+ * For example:
+ *
+ * lctl set_param *.*.*.nrs_delay_min=reg_delay_min:5, to set the regular
+ * request minimum delay on all PtlRPC services to 5 seconds
+ *
+ * lctl set_param *.*.*.nrs_delay_min=hp_delay_min:2, to set the high-priority
+ * request minimum delay on all PtlRPC services to 2 seconds, and
+ *
+ * lctl set_param *.*.ost_io.nrs_delay_min=8, to set both the regular and
+ * high priority request minimum delay of the ost_io service to 8 seconds.
+ */
+static ssize_t
+ptlrpc_lprocfs_nrs_delay_min_seq_write(struct file *file,
+				       const char __user *buffer, size_t count,
+				       loff_t *off)
+{
+	struct seq_file *m = file->private_data;
+	struct ptlrpc_service *svc = m->private;
+
+	return lprocfs_nrs_delay_seq_write_common(buffer,
+						  LPROCFS_NRS_DELAY_MIN_SIZE,
+						  count,
+						  LPROCFS_NRS_DELAY_MIN_NAME,
+						  LPROCFS_NRS_DELAY_LOWER_BOUND,
+						  LPROCFS_NRS_DELAY_UPPER_BOUND,
+						  svc, NRS_POL_NAME_DELAY,
+						  NRS_CTL_DELAY_WR_MIN, false);
+}
+LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_delay_min);
+
+/**
+ * Retrieves the value of the maximum delay for delay policy instances on both
+ * the regular and high-priority NRS head of a service, as long as a policy
+ * instance is not in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state.
+ */
+static int
+ptlrpc_lprocfs_nrs_delay_max_seq_show(struct seq_file *m, void *data)
+{
+	struct ptlrpc_service *svc = m->private;
+	unsigned int max_delay;
+	int rc;
+
+	rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG,
+				       NRS_POL_NAME_DELAY,
+				       NRS_CTL_DELAY_RD_MAX,
+				       true, &max_delay);
+
+	if (rc == 0)
+		seq_printf(m, LPROCFS_NRS_DELAY_MAX_NAME_REG"%-5d\n",
+			   max_delay);
+	/**
+	 * Ignore -ENODEV as the regular NRS head's policy may be in
+	 * the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state.
+	 */
+	else if (rc != -ENODEV)
+		return rc;
+
+	if (!nrs_svc_has_hp(svc))
+		return 0;
+
+	rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP,
+				       NRS_POL_NAME_DELAY,
+				       NRS_CTL_DELAY_RD_MAX,
+				       true, &max_delay);
+	if (rc == 0)
+		seq_printf(m, LPROCFS_NRS_DELAY_MAX_NAME_HP"%-5d\n",
+			   max_delay);
+	/**
+	 * Ignore -ENODEV as the high priority NRS head's policy may be in
+	 * the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state.
+	 */
+	else if (rc == -ENODEV)
+		rc = 0;
+
+	return rc;
+}
+
+/**
+ * Sets the value of the maximum request delay for delay policy instances of a
+ * service. The user can set the maximum request delay for the regular or high
+ * priority NRS head individually by specifying each value, or both together in
+ * a single invocation.
+ *
+ * For example:
+ *
+ * lctl set_param *.*.*.nrs_delay_max=reg_delay_max:20, to set the regular
+ * request maximum delay on all PtlRPC services to 20 seconds
+ *
+ * lctl set_param *.*.*.nrs_delay_max=hp_delay_max:10, to set the high-priority
+ * request maximum delay on all PtlRPC services to 10 seconds, and
+ *
+ * lctl set_param *.*.ost_io.nrs_delay_max=35, to set both the regular and
+ * high priority request maximum delay of the ost_io service to 35 seconds.
+ */
+static ssize_t
+ptlrpc_lprocfs_nrs_delay_max_seq_write(struct file *file,
+				       const char __user *buffer, size_t count,
+				       loff_t *off)
+{
+	struct seq_file *m = file->private_data;
+	struct ptlrpc_service *svc = m->private;
+
+	return lprocfs_nrs_delay_seq_write_common(buffer,
+						  LPROCFS_NRS_DELAY_MAX_SIZE,
+						  count,
+						  LPROCFS_NRS_DELAY_MAX_NAME,
+						  LPROCFS_NRS_DELAY_LOWER_BOUND,
+						  LPROCFS_NRS_DELAY_UPPER_BOUND,
+						  svc, NRS_POL_NAME_DELAY,
+						  NRS_CTL_DELAY_WR_MAX, false);
+}
+LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_delay_max);
+
+/**
+ * Retrieves the value of the percentage of requests which should be delayed
+ * for delay policy instances on both the regular and high-priority NRS head
+ * of a service, as long as a policy instance is not in the
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state.
+ */
+static int
+ptlrpc_lprocfs_nrs_delay_pct_seq_show(struct seq_file *m, void *data)
+{
+	struct ptlrpc_service *svc = m->private;
+	unsigned int delay_pct;
+	int rc;
+
+	rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG,
+				       NRS_POL_NAME_DELAY,
+				       NRS_CTL_DELAY_RD_PCT,
+				       true, &delay_pct);
+
+	if (rc == 0)
+		seq_printf(m, LPROCFS_NRS_DELAY_PCT_NAME_REG"%-3d\n",
+			   delay_pct);
+	/**
+	 * Ignore -ENODEV as the regular NRS head's policy may be in
+	 * the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state.
+	 */
+	else if (rc != -ENODEV)
+		return rc;
+
+	if (!nrs_svc_has_hp(svc))
+		return 0;
+
+	rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP,
+				       NRS_POL_NAME_DELAY,
+				       NRS_CTL_DELAY_RD_PCT,
+				       true, &delay_pct);
+	if (rc == 0)
+		seq_printf(m, LPROCFS_NRS_DELAY_PCT_NAME_HP"%-3d\n",
+			   delay_pct);
+	/**
+	 * Ignore -ENODEV as the high priority NRS head's policy may be in
+	 * the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state.
+	 */
+	else if (rc == -ENODEV)
+		rc = 0;
+
+	return rc;
+}
+
+/**
+ * Sets the value of the percentage of requests to be delayed for delay policy
+ * instances of a service. The user can set the percentage for the regular or
+ * high-priority NRS head individually by specifying each value, or both
+ * together in a single invocation.
+ * + * For example: + * + * lctl set_param *.*.*.nrs_delay_pct=reg_delay_pct:5, to delay 5 percent of + * regular requests on all PtlRPC services + * + * lctl set_param *.*.*.nrs_delay_pct=hp_delay_pct:2, to delay 2 percent of + * high-priority requests on all PtlRPC services, and + * + * lctl set_param *.*.ost_io.nrs_delay_pct=8, to delay 8 percent of both + * regular and high-priority requests of the ost_io service. + */ +static ssize_t +ptlrpc_lprocfs_nrs_delay_pct_seq_write(struct file *file, + const char __user *buffer, size_t count, + loff_t *off) +{ + struct seq_file *m = file->private_data; + struct ptlrpc_service *svc = m->private; + + return lprocfs_nrs_delay_seq_write_common(buffer, + LPROCFS_NRS_DELAY_PCT_SIZE, + count, + LPROCFS_NRS_DELAY_PCT_NAME, + LPROCFS_NRS_DELAY_PCT_MIN_VAL, + LPROCFS_NRS_DELAY_PCT_MAX_VAL, + svc, NRS_POL_NAME_DELAY, + NRS_CTL_DELAY_WR_PCT, false); +} + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_delay_pct); + +static int nrs_delay_lprocfs_init(struct ptlrpc_service *svc) +{ + struct ldebugfs_vars nrs_delay_lprocfs_vars[] = { + { .name = "nrs_delay_min", + .fops = &ptlrpc_lprocfs_nrs_delay_min_fops, + .data = svc }, + { .name = "nrs_delay_max", + .fops = &ptlrpc_lprocfs_nrs_delay_max_fops, + .data = svc }, + { .name = "nrs_delay_pct", + .fops = &ptlrpc_lprocfs_nrs_delay_pct_fops, + .data = svc }, + { NULL } + }; + + if (!svc->srv_debugfs_entry) + return 0; + + ldebugfs_add_vars(svc->srv_debugfs_entry, nrs_delay_lprocfs_vars, NULL); + + return 0; +} + +/** + * Delay policy operations + */ +static const struct ptlrpc_nrs_pol_ops nrs_delay_ops = { + .op_policy_start = nrs_delay_start, + .op_policy_stop = nrs_delay_stop, + .op_policy_ctl = nrs_delay_ctl, + .op_res_get = nrs_delay_res_get, + .op_req_get = nrs_delay_req_get, + .op_req_enqueue = nrs_delay_req_add, + .op_req_dequeue = nrs_delay_req_del, + .op_req_stop = nrs_delay_req_stop, + .op_lprocfs_init = nrs_delay_lprocfs_init, +}; + +/** + * Delay policy configuration + */ +struct ptlrpc_nrs_pol_conf nrs_conf_delay = { + .nc_name = NRS_POL_NAME_DELAY, + .nc_ops = &nrs_delay_ops, + .nc_compat = nrs_policy_compat_all, +}; + +/** @} delay */ + +/** @} nrs */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_fifo.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_fifo.c new file mode 100644 index 0000000000000..2142ff4f665aa --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_fifo.c @@ -0,0 +1,271 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2011, 2014, Intel Corporation. 
+ *
+ * Copyright 2012 Xyratex Technology Limited
+ */
+/*
+ * lustre/ptlrpc/nrs_fifo.c
+ *
+ * Network Request Scheduler (NRS) FIFO policy
+ *
+ * Handles RPCs in a FIFO manner, as received from the network. This policy is
+ * a logical wrapper around previous, non-NRS functionality. It is used as the
+ * default and fallback policy for all types of RPCs on all PTLRPC service
+ * partitions, for both regular and high-priority NRS heads. Default here means
+ * the policy is the one enabled at PTLRPC service partition startup time, and
+ * fallback means the policy is used to handle RPCs that are not handled
+ * successfully or are not handled at all by any primary policy that may be
+ * enabled on a given NRS head.
+ *
+ * Author: Liang Zhen
+ * Author: Nikitas Angelinas
+ */
+/**
+ * \addtogroup nrs
+ * @{
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include "ptlrpc_internal.h"
+
+/**
+ * \name fifo
+ *
+ * The FIFO policy is a logical wrapper around previous, non-NRS functionality.
+ * It schedules RPCs in the same order as they are queued from LNet.
+ *
+ * @{
+ */
+
+#define NRS_POL_NAME_FIFO "fifo"
+
+/**
+ * Is called before the policy transitions into
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED; allocates and initializes a
+ * policy-specific private data structure.
+ *
+ * \param[in] policy The policy to start
+ * \param[in] arg Generic char buffer; unused in this policy
+ *
+ * \retval -ENOMEM OOM error
+ * \retval 0 success
+ *
+ * \see nrs_policy_register()
+ * \see nrs_policy_ctl()
+ */
+static int nrs_fifo_start(struct ptlrpc_nrs_policy *policy, char *arg)
+{
+	struct nrs_fifo_head *head;
+
+	OBD_CPT_ALLOC_PTR(head, nrs_pol2cptab(policy), nrs_pol2cptid(policy));
+	if (head == NULL)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&head->fh_list);
+	policy->pol_private = head;
+	return 0;
+}
+
+/**
+ * Is called before the policy transitions into
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED; deallocates the policy-specific
+ * private data structure.
+ *
+ * \param[in] policy The policy to stop
+ *
+ * \see nrs_policy_stop0()
+ */
+static void nrs_fifo_stop(struct ptlrpc_nrs_policy *policy)
+{
+	struct nrs_fifo_head *head = policy->pol_private;
+
+	LASSERT(head != NULL);
+	LASSERT(list_empty(&head->fh_list));
+
+	OBD_FREE_PTR(head);
+}
+
+/**
+ * Is called for obtaining a FIFO policy resource.
+ *
+ * \param[in] policy The policy on which the request is being asked for
+ * \param[in] nrq The request for which resources are being taken
+ * \param[in] parent Parent resource, unused in this policy
+ * \param[out] resp Resources references are placed in this array
+ * \param[in] moving_req Signifies limited caller context; unused in this
+ *			 policy
+ *
+ * \retval 1 The FIFO policy only has a one-level resource hierarchy; since
+ *	     it implements a simple scheduling algorithm in which request
+ *	     priority is determined by request arrival order, it does not
+ *	     need to maintain a set of resources that would otherwise be used
+ *	     to calculate a request's priority.
+ *
+ * \see nrs_resource_get_safe()
+ */
+static int nrs_fifo_res_get(struct ptlrpc_nrs_policy *policy,
+			    struct ptlrpc_nrs_request *nrq,
+			    const struct ptlrpc_nrs_resource *parent,
+			    struct ptlrpc_nrs_resource **resp, bool moving_req)
+{
+	/**
+	 * Just return the resource embedded inside nrs_fifo_head, and end this
+	 * resource hierarchy reference request.
+ */ + *resp = &((struct nrs_fifo_head *)policy->pol_private)->fh_res; + return 1; +} + +/** + * Called when getting a request from the FIFO policy for handling, or just + * peeking; removes the request from the policy when it is to be handled. + * + * \param[in] policy The policy + * \param[in] peek When set, signifies that we just want to examine the + * request, and not handle it, so the request is not removed + * from the policy. + * \param[in] force Force the policy to return a request; unused in this + * policy + * + * \retval The request to be handled; this is the next request in the FIFO + * queue + * + * \see ptlrpc_nrs_req_get_nolock() + * \see nrs_request_get() + */ +static +struct ptlrpc_nrs_request * nrs_fifo_req_get(struct ptlrpc_nrs_policy *policy, + bool peek, bool force) +{ + struct nrs_fifo_head *head = policy->pol_private; + struct ptlrpc_nrs_request *nrq; + + nrq = list_first_entry_or_null(&head->fh_list, + struct ptlrpc_nrs_request, + nr_u.fifo.fr_list); + + if (likely(!peek && nrq != NULL)) { + struct ptlrpc_request *req = container_of(nrq, + struct ptlrpc_request, + rq_nrq); + + list_del_init(&nrq->nr_u.fifo.fr_list); + + CDEBUG(D_RPCTRACE, "NRS start %s request from %s, seq: %llu" + "\n", policy->pol_desc->pd_name, + libcfs_id2str(req->rq_peer), nrq->nr_u.fifo.fr_sequence); + } + + return nrq; +} + +/** + * Adds request \a nrq to \a policy's list of queued requests + * + * \param[in] policy The policy + * \param[in] nrq The request to add + * + * \retval 0 success; nrs_request_enqueue() assumes this function will always + * succeed + */ +static int nrs_fifo_req_add(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct nrs_fifo_head *head; + + head = container_of(nrs_request_resource(nrq), struct nrs_fifo_head, + fh_res); + /** + * Only used for debugging + */ + nrq->nr_u.fifo.fr_sequence = head->fh_sequence++; + list_add_tail(&nrq->nr_u.fifo.fr_list, &head->fh_list); + + return 0; +} + +/** + * Removes request \a nrq from \a policy's list of queued requests. + * + * \param[in] policy The policy + * \param[in] nrq The request to remove + */ +static void nrs_fifo_req_del(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + LASSERT(!list_empty(&nrq->nr_u.fifo.fr_list)); + list_del_init(&nrq->nr_u.fifo.fr_list); +} + +/** + * Prints a debug statement right before the request \a nrq stops being + * handled. 
+ *
+ * \param[in] policy The policy handling the request
+ * \param[in] nrq The request being handled
+ *
+ * \see ptlrpc_server_finish_request()
+ * \see ptlrpc_nrs_req_stop_nolock()
+ */
+static void nrs_fifo_req_stop(struct ptlrpc_nrs_policy *policy,
+			      struct ptlrpc_nrs_request *nrq)
+{
+	struct ptlrpc_request *req = container_of(nrq, struct ptlrpc_request,
+						  rq_nrq);
+
+	CDEBUG(D_RPCTRACE, "NRS stop %s request from %s, seq: %llu\n",
+	       policy->pol_desc->pd_name, libcfs_id2str(req->rq_peer),
+	       nrq->nr_u.fifo.fr_sequence);
+}
+
+/**
+ * FIFO policy operations
+ */
+static const struct ptlrpc_nrs_pol_ops nrs_fifo_ops = {
+	.op_policy_start	= nrs_fifo_start,
+	.op_policy_stop		= nrs_fifo_stop,
+	.op_res_get		= nrs_fifo_res_get,
+	.op_req_get		= nrs_fifo_req_get,
+	.op_req_enqueue		= nrs_fifo_req_add,
+	.op_req_dequeue		= nrs_fifo_req_del,
+	.op_req_stop		= nrs_fifo_req_stop,
+};
+
+/**
+ * FIFO policy configuration
+ */
+struct ptlrpc_nrs_pol_conf nrs_conf_fifo = {
+	.nc_name		= NRS_POL_NAME_FIFO,
+	.nc_ops			= &nrs_fifo_ops,
+	.nc_compat		= nrs_policy_compat_all,
+	.nc_flags		= PTLRPC_NRS_FL_FALLBACK |
+				  PTLRPC_NRS_FL_REG_START
+};
+
+/** @} fifo */
+
+/** @} nrs */
+
diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_orr.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_orr.c
new file mode 100644
index 0000000000000..a81609a3f084d
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_orr.c
@@ -0,0 +1,1970 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License version 2 for more details. A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2013, 2017, Intel Corporation.
+ *
+ * Copyright 2012 Xyratex Technology Limited
+ */
+/*
+ * lustre/ptlrpc/nrs_orr.c
+ *
+ * Network Request Scheduler (NRS) ORR and TRR policies
+ *
+ * Request scheduling in a Round-Robin manner over backend-fs objects and OSTs
+ * respectively
+ *
+ * Author: Liang Zhen
+ * Author: Nikitas Angelinas
+ */
+
+/**
+ * \addtogroup nrs
+ * @{
+ */
+#define DEBUG_SUBSYSTEM S_RPC
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lustre_req_layout.h>
+#include "ptlrpc_internal.h"
+
+/**
+ * \name ORR/TRR policy
+ *
+ * ORR/TRR (Object-based Round Robin/Target-based Round Robin) NRS policies
+ *
+ * ORR performs batched Round Robin scheduling of brw RPCs, based on the FID of
+ * the backend-fs object that the brw RPC pertains to; the TRR policy performs
+ * batched Round Robin scheduling of brw RPCs, based on the OST index that the
+ * RPC pertains to. Both policies also order RPCs in each batch in ascending
+ * offset order, which is lprocfs-tunable between logical file offsets and
+ * physical disk offsets, as reported by fiemap.
+ *
+ * The TRR policy reuses much of the functionality of ORR. These two
+ * scheduling algorithms could alternatively be implemented under a single
+ * NRS policy that uses an lprocfs tunable to switch between the two types of
+ * scheduling behaviour. The two algorithms have been implemented as separate
+ * policies for reasons of clarity to the user, and to avoid issues that would
+ * otherwise arise at the point of switching between behaviours in the case of
+ * having a single policy, such as resource cleanup for nrs_orr_object
+ * instances. It is possible that this may need to be re-examined in the
+ * future, along with potentially coalescing other policies that perform
+ * batched request scheduling in a Round-Robin manner, all into one policy.
+ *
+ * @{
+ */
+
+#define NRS_POL_NAME_ORR	"orr"
+#define NRS_POL_NAME_TRR	"trr"
+
+/**
+ * Checks if the RPC type of \a nrq is currently handled by an ORR/TRR policy
+ *
+ * \param[in] orrd the ORR/TRR policy scheduler instance
+ * \param[in] nrq the request
+ * \param[out] opcode the opcode is saved here, just in order to avoid calling
+ *		      lustre_msg_get_opc() again later
+ *
+ * \retval true request type is supported by the policy instance
+ * \retval false request type is not supported by the policy instance
+ */
+static bool nrs_orr_req_supported(struct nrs_orr_data *orrd,
+				  struct ptlrpc_nrs_request *nrq, __u32 *opcode)
+{
+	struct ptlrpc_request *req = container_of(nrq, struct ptlrpc_request,
+						  rq_nrq);
+	__u32 opc = lustre_msg_get_opc(req->rq_reqmsg);
+	bool rc = false;
+
+	/**
+	 * XXX: nrs_orr_data::od_supp accessed unlocked.
+	 */
+	switch (opc) {
+	case OST_READ:
+		rc = orrd->od_supp & NOS_OST_READ;
+		break;
+	case OST_WRITE:
+		rc = orrd->od_supp & NOS_OST_WRITE;
+		break;
+	}
+
+	if (rc)
+		*opcode = opc;
+
+	return rc;
+}
+
+/**
+ * Returns the ORR/TRR key fields for the request \a nrq in \a key.
+ *
+ * \param[in] orrd the ORR/TRR policy scheduler instance
+ * \param[in] nrq the request
+ * \param[in] opc the request's opcode
+ * \param[in] name the policy name
+ * \param[out] key fields of the key are returned here.
+ *
+ * \retval 0 key filled successfully
+ * \retval < 0 error
+ */
+static int nrs_orr_key_fill(struct nrs_orr_data *orrd,
+			    struct ptlrpc_nrs_request *nrq, __u32 opc,
+			    char *name, struct nrs_orr_key *key)
+{
+	struct ptlrpc_request *req = container_of(nrq, struct ptlrpc_request,
+						  rq_nrq);
+	struct ost_body *body;
+	__u32 ost_idx;
+	bool is_orr = strncmp(name, NRS_POL_NAME_ORR,
+			      NRS_POL_NAME_MAX) == 0;
+
+	LASSERT(req != NULL);
+
+	/**
+	 * This is an attempt to fill in the request key fields while
+	 * moving a request from the regular to the high-priority NRS
+	 * head (via ldlm_lock_reorder_req()), but the request key has
+	 * been adequately filled when nrs_orr_res_get() was called through
+	 * ptlrpc_nrs_req_initialize() for the regular NRS head's ORR/TRR
+	 * policy, so there is nothing to do.
+	 */
+	if ((is_orr && nrq->nr_u.orr.or_orr_set) ||
+	    (!is_orr && nrq->nr_u.orr.or_trr_set)) {
+		*key = nrq->nr_u.orr.or_key;
+		return 0;
+	}
+
+	/* Bounce unconnected requests to the default policy. */
+	if (req->rq_export == NULL)
+		return -ENOTCONN;
+
+	if (nrq->nr_u.orr.or_orr_set || nrq->nr_u.orr.or_trr_set)
+		memset(&nrq->nr_u.orr.or_key, 0, sizeof(nrq->nr_u.orr.or_key));
+
+	ost_idx = class_server_data(req->rq_export->exp_obd)->lsd_osd_index;
+
+	if (is_orr) {
+		int rc;
+		/**
+		 * The request pill for OST_READ and OST_WRITE requests is
+		 * initialized in the ost_io service's
+		 * ptlrpc_service_ops::so_hpreq_handler, ost_io_hpreq_handler(),
+		 * so no need to redo it here.
+		 */
+		body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
+		if (body == NULL)
+			RETURN(-EFAULT);
+
+		rc = ostid_to_fid(&key->ok_fid, &body->oa.o_oi, ost_idx);
+		if (rc < 0)
+			return rc;
+
+		nrq->nr_u.orr.or_orr_set = 1;
+	} else {
+		key->ok_idx = ost_idx;
+		nrq->nr_u.orr.or_trr_set = 1;
+	}
+
+	return 0;
+}
+
+/**
+ * Populates the range values in \a range with logical offsets obtained via
+ * \a nb.
+ *
+ * \param[in] nb niobuf_remote struct array for this request
+ * \param[in] niocount count of niobuf_remote structs for this request
+ * \param[out] range the offset range is returned here
+ */
+static void nrs_orr_range_fill_logical(struct niobuf_remote *nb, int niocount,
+				       struct nrs_orr_req_range *range)
+{
+	/* Should we do this at page boundaries? */
+	range->or_start = nb[0].rnb_offset & PAGE_MASK;
+	range->or_end = (nb[niocount - 1].rnb_offset +
+			 nb[niocount - 1].rnb_len - 1) | ~PAGE_MASK;
+}
+
+/**
+ * We obtain information just for a single extent, as the request can only be
+ * in a single place in the binary heap anyway.
+ */
+#define ORR_NUM_EXTENTS 1
+
+/**
+ * Converts the logical file offset range in \a range, to a physical disk
+ * offset range in \a range, for a request. Uses obd_get_info() in order to
+ * carry out a fiemap call and obtain backend-fs extent information. The
+ * returned range is in physical block numbers.
+ *
+ * \param[in] nrq the request
+ * \param[in] oa obdo struct for this request
+ * \param[in,out] range the offset range in bytes; logical range in, physical
+ *		  range out
+ *
+ * \retval 0 physical offsets obtained successfully
+ * \retval < 0 error
+ */
+static int nrs_orr_range_fill_physical(struct ptlrpc_nrs_request *nrq,
+				       struct obdo *oa,
+				       struct nrs_orr_req_range *range)
+{
+	struct ptlrpc_request *req = container_of(nrq,
+						  struct ptlrpc_request,
+						  rq_nrq);
+	char fiemap_buf[offsetof(struct fiemap,
+				 fm_extents[ORR_NUM_EXTENTS])];
+	struct fiemap *fiemap = (struct fiemap *)fiemap_buf;
+	struct ll_fiemap_info_key key;
+	loff_t start;
+	loff_t end;
+	int rc;
+
+	key = (typeof(key)) {
+		.lfik_name = KEY_FIEMAP,
+		.lfik_oa = *oa,
+		.lfik_fiemap = {
+			.fm_start = range->or_start,
+			.fm_length = range->or_end - range->or_start,
+			.fm_extent_count = ORR_NUM_EXTENTS
+		}
+	};
+
+	rc = obd_get_info(req->rq_svc_thread->t_env, req->rq_export,
+			  sizeof(key), &key, NULL, fiemap);
+	if (rc < 0)
+		GOTO(out, rc);
+
+	if (fiemap->fm_mapped_extents == 0 ||
+	    fiemap->fm_mapped_extents > ORR_NUM_EXTENTS)
+		GOTO(out, rc = -EFAULT);
+
+	/**
+	 * Calculate the physical offset ranges for the request from the extent
+	 * information and the logical request offsets.
+	 */
+	start = fiemap->fm_extents[0].fe_physical + range->or_start -
+		fiemap->fm_extents[0].fe_logical;
+	end = start + range->or_end - range->or_start;
+
+	range->or_start = start;
+	range->or_end = end;
+
+	nrq->nr_u.orr.or_physical_set = 1;
+out:
+	return rc;
+}
+
+/**
+ * Sets the offset range the request covers; either in logical file
+ * offsets or in physical disk offsets.
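+ *
+ * As an illustrative example, a 1 MiB brw RPC at logical file offset 4 MiB
+ * may map to a quite different physical block range, depending on how the
+ * backend filesystem happened to allocate the object's extents; scheduling
+ * by physical offsets aims to approximate the actual on-disk seek order.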
+ *
+ * \param[in] nrq the request
+ * \param[in] orrd the ORR/TRR policy scheduler instance
+ * \param[in] opc the request's opcode
+ * \param[in] moving_req is the request in the process of moving onto the
+ *			 high-priority NRS head?
+ *
+ * \retval 0 range filled successfully
+ * \retval != 0 error
+ */
+static int nrs_orr_range_fill(struct ptlrpc_nrs_request *nrq,
+			      struct nrs_orr_data *orrd, __u32 opc,
+			      bool moving_req)
+{
+	struct ptlrpc_request *req = container_of(nrq,
+						  struct ptlrpc_request,
+						  rq_nrq);
+	struct obd_ioobj *ioo;
+	struct niobuf_remote *nb;
+	struct ost_body *body;
+	struct nrs_orr_req_range range;
+	int niocount;
+	int rc = 0;
+
+	/**
+	 * If we are scheduling using physical disk offsets, but we have filled
+	 * the offset information in the request previously
+	 * (i.e. ldlm_lock_reorder_req() is moving the request to the
+	 * high-priority NRS head), there is no need to do anything, and we can
+	 * exit. Moreover, we would be unable to perform the obd_get_info()
+	 * call required in nrs_orr_range_fill_physical(), because
+	 * ldlm_lock_reorder_req() calls into here while holding a spinlock,
+	 * and retrieving fiemap information via obd_get_info() is a
+	 * potentially sleeping operation.
+	 */
+	if (orrd->od_physical && nrq->nr_u.orr.or_physical_set)
+		return 0;
+
+	ioo = req_capsule_client_get(&req->rq_pill, &RMF_OBD_IOOBJ);
+	if (ioo == NULL)
+		GOTO(out, rc = -EFAULT);
+
+	niocount = ioo->ioo_bufcnt;
+
+	nb = req_capsule_client_get(&req->rq_pill, &RMF_NIOBUF_REMOTE);
+	if (nb == NULL)
+		GOTO(out, rc = -EFAULT);
+
+	/**
+	 * Use logical information from niobuf_remote structures.
+	 */
+	nrs_orr_range_fill_logical(nb, niocount, &range);
+
+	/**
+	 * Obtain physical offsets if selected, and this is an OST_READ RPC.
+	 * We do not enter this block if moving_req is set, which indicates
+	 * that the request is being moved to the high-priority NRS head by
+	 * ldlm_lock_reorder_req(), as that function calls in here while
+	 * holding a spinlock, and nrs_orr_range_fill_physical() can sleep, so
+	 * we just use logical file offsets for the range values for such
+	 * requests.
+	 */
+	if (orrd->od_physical && opc == OST_READ && !moving_req) {
+		body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
+		if (body == NULL)
+			GOTO(out, rc = -EFAULT);
+
+		/**
+		 * Translate to physical block offsets from backend filesystem
+		 * extents.
+		 * Ignore return values; if obtaining the physical offsets
+		 * fails, use the logical offsets.
+		 */
+		nrs_orr_range_fill_physical(nrq, &body->oa, &range);
+	}
+
+	nrq->nr_u.orr.or_range = range;
+out:
+	return rc;
+}
+
+/**
+ * Generates a character string that can be used in order to register uniquely
+ * named libcfs_hash and slab objects for ORR/TRR policy instances. The
+ * character string is unique per policy instance, as it includes the policy's
+ * name, the CPT number, and a {reg|hp} token, and there is one policy instance
+ * per NRS head on each CPT, and the policy is only compatible with the ost_io
+ * service.
+ *
+ * \param[in] policy the policy instance
+ * \param[out] name the character array that will hold the generated name
+ */
+static void nrs_orr_genobjname(struct ptlrpc_nrs_policy *policy, char *name)
+{
+	snprintf(name, NRS_ORR_OBJ_NAME_MAX, "%s%s%s%d",
+		 "nrs_", policy->pol_desc->pd_name,
+		 policy->pol_nrs->nrs_queue_type == PTLRPC_NRS_QUEUE_REG ?
+ "_reg_" : "_hp_", nrs_pol2cptid(policy)); +} + +/** + * ORR/TRR hash operations + */ +#define NRS_ORR_BITS 24 +#define NRS_ORR_BKT_BITS 12 +#define NRS_ORR_HASH_FLAGS (CFS_HASH_SPIN_BKTLOCK | CFS_HASH_ASSERT_EMPTY) + +#define NRS_TRR_BITS 4 +#define NRS_TRR_BKT_BITS 2 +#define NRS_TRR_HASH_FLAGS CFS_HASH_SPIN_BKTLOCK + +static unsigned +nrs_orr_hop_hash(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(struct nrs_orr_key), mask); +} + +static void *nrs_orr_hop_key(struct hlist_node *hnode) +{ + struct nrs_orr_object *orro = hlist_entry(hnode, + struct nrs_orr_object, + oo_hnode); + return &orro->oo_key; +} + +static int nrs_orr_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + struct nrs_orr_object *orro = hlist_entry(hnode, + struct nrs_orr_object, + oo_hnode); + + return lu_fid_eq(&orro->oo_key.ok_fid, + &((struct nrs_orr_key *)key)->ok_fid); +} + +static void *nrs_orr_hop_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct nrs_orr_object, oo_hnode); +} + +static void nrs_orr_hop_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_orr_object *orro = hlist_entry(hnode, + struct nrs_orr_object, + oo_hnode); + orro->oo_ref++; +} + +/** + * Removes an nrs_orr_object the hash and frees its memory, if the object has + * no active users. + */ +static void nrs_orr_hop_put_free(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_orr_object *orro = hlist_entry(hnode, + struct nrs_orr_object, + oo_hnode); + struct nrs_orr_data *orrd = container_of(orro->oo_res.res_parent, + struct nrs_orr_data, od_res); + struct cfs_hash_bd bd; + + cfs_hash_bd_get_and_lock(hs, &orro->oo_key, &bd, 1); + + if (--orro->oo_ref > 1) { + cfs_hash_bd_unlock(hs, &bd, 1); + + return; + } + LASSERT(orro->oo_ref == 1); + + cfs_hash_bd_del_locked(hs, &bd, hnode); + cfs_hash_bd_unlock(hs, &bd, 1); + + OBD_SLAB_FREE_PTR(orro, orrd->od_cache); +} + +static void nrs_orr_hop_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_orr_object *orro = hlist_entry(hnode, + struct nrs_orr_object, + oo_hnode); + orro->oo_ref--; +} + +static int nrs_trr_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + struct nrs_orr_object *orro = hlist_entry(hnode, + struct nrs_orr_object, + oo_hnode); + + return orro->oo_key.ok_idx == ((struct nrs_orr_key *)key)->ok_idx; +} + +static void nrs_trr_hop_exit(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_orr_object *orro = hlist_entry(hnode, + struct nrs_orr_object, + oo_hnode); + struct nrs_orr_data *orrd = container_of(orro->oo_res.res_parent, + struct nrs_orr_data, od_res); + + LASSERTF(orro->oo_ref == 0, + "Busy NRS TRR policy object for OST with index %u, with %ld " + "refs\n", orro->oo_key.ok_idx, orro->oo_ref); + + OBD_SLAB_FREE_PTR(orro, orrd->od_cache); +} + +static struct cfs_hash_ops nrs_orr_hash_ops = { + .hs_hash = nrs_orr_hop_hash, + .hs_key = nrs_orr_hop_key, + .hs_keycmp = nrs_orr_hop_keycmp, + .hs_object = nrs_orr_hop_object, + .hs_get = nrs_orr_hop_get, + .hs_put = nrs_orr_hop_put_free, + .hs_put_locked = nrs_orr_hop_put, +}; + +static struct cfs_hash_ops nrs_trr_hash_ops = { + .hs_hash = nrs_orr_hop_hash, + .hs_key = nrs_orr_hop_key, + .hs_keycmp = nrs_trr_hop_keycmp, + .hs_object = nrs_orr_hop_object, + .hs_get = nrs_orr_hop_get, + .hs_put = nrs_orr_hop_put, + .hs_put_locked = nrs_orr_hop_put, + .hs_exit = nrs_trr_hop_exit, +}; + +#define NRS_ORR_QUANTUM_DFLT 256 + +/** + * Binary heap predicate. 
+ * + * Uses + * ptlrpc_nrs_request::nr_u::orr::or_round, + * ptlrpc_nrs_request::nr_u::orr::or_sequence, and + * ptlrpc_nrs_request::nr_u::orr::or_range to compare two binheap nodes and + * produce a binary predicate that indicates their relative priority, so that + * the binary heap can perform the necessary sorting operations. + * + * \param[in] e1 the first binheap node to compare + * \param[in] e2 the second binheap node to compare + * + * \retval 0 e1 > e2 + * \retval 1 e1 < e2 + */ +static int +orr_req_compare(struct binheap_node *e1, struct binheap_node *e2) +{ + struct ptlrpc_nrs_request *nrq1; + struct ptlrpc_nrs_request *nrq2; + + nrq1 = container_of(e1, struct ptlrpc_nrs_request, nr_node); + nrq2 = container_of(e2, struct ptlrpc_nrs_request, nr_node); + + /** + * Requests have been scheduled against a different scheduling round. + */ + if (nrq1->nr_u.orr.or_round < nrq2->nr_u.orr.or_round) + return 1; + else if (nrq1->nr_u.orr.or_round > nrq2->nr_u.orr.or_round) + return 0; + + /** + * Requests have been scheduled against the same scheduling round, but + * belong to a different batch, i.e. they pertain to a different + * backend-fs object (for ORR policy instances) or OST (for TRR policy + * instances). + */ + if (nrq1->nr_u.orr.or_sequence < nrq2->nr_u.orr.or_sequence) + return 1; + else if (nrq1->nr_u.orr.or_sequence > nrq2->nr_u.orr.or_sequence) + return 0; + + /** + * If round numbers and sequence numbers are equal, the two requests + * have been scheduled on the same round, and belong to the same batch, + * which means they pertain to the same backend-fs object (if this is an + * ORR policy instance), or to the same OST (if this is a TRR policy + * instance), so these requests should be sorted by ascending offset + * order. + */ + if (nrq1->nr_u.orr.or_range.or_start < + nrq2->nr_u.orr.or_range.or_start) { + return 1; + } else if (nrq1->nr_u.orr.or_range.or_start > + nrq2->nr_u.orr.or_range.or_start) { + return 0; + } else { + /** + * Requests start from the same offset; Dispatch the shorter one + * first; perhaps slightly more chances of hitting caches like + * this. + */ + return nrq1->nr_u.orr.or_range.or_end < + nrq2->nr_u.orr.or_range.or_end; + } +} + +/** + * ORR binary heap operations + */ +static struct binheap_ops nrs_orr_heap_ops = { + .hop_enter = NULL, + .hop_exit = NULL, + .hop_compare = orr_req_compare, +}; + +/** + * Prints a warning message if an ORR/TRR policy is started on a service with + * more than one CPT. Not printed on the console for now, since we don't + * have any performance metrics in the first place, and it is annoying. + * + * \param[in] policy the policy instance + * + * \retval 0 success + */ +static int nrs_orr_init(struct ptlrpc_nrs_policy *policy) +{ + if (policy->pol_nrs->nrs_svcpt->scp_service->srv_ncpts > 1) + CDEBUG(D_CONFIG, "%s: The %s NRS policy was registered on a " + "service with multiple service partitions. This policy " + "may perform better with a single partition.\n", + policy->pol_nrs->nrs_svcpt->scp_service->srv_name, + policy->pol_desc->pd_name); + + return 0; +} + +/** + * Called when an ORR policy instance is started. 
+ * + * \param[in] policy the policy + * + * \retval -ENOMEM OOM error + * \retval 0 success + */ +static int nrs_orr_start(struct ptlrpc_nrs_policy *policy, char *arg) +{ + struct nrs_orr_data *orrd; + struct cfs_hash_ops *ops; + unsigned cur_bits; + unsigned max_bits; + unsigned bkt_bits; + unsigned flags; + int rc = 0; + ENTRY; + + OBD_CPT_ALLOC_PTR(orrd, nrs_pol2cptab(policy), nrs_pol2cptid(policy)); + if (orrd == NULL) + RETURN(-ENOMEM); + + /* + * Binary heap instance for sorted incoming requests. + */ + orrd->od_binheap = binheap_create(&nrs_orr_heap_ops, + CBH_FLAG_ATOMIC_GROW, 4096, NULL, + nrs_pol2cptab(policy), + nrs_pol2cptid(policy)); + if (orrd->od_binheap == NULL) + GOTO(out_orrd, rc = -ENOMEM); + + nrs_orr_genobjname(policy, orrd->od_objname); + + /** + * Slab cache for NRS ORR/TRR objects. + */ + orrd->od_cache = kmem_cache_create(orrd->od_objname, + sizeof(struct nrs_orr_object), + 0, 0, NULL); + if (orrd->od_cache == NULL) + GOTO(out_binheap, rc = -ENOMEM); + + if (strncmp(policy->pol_desc->pd_name, NRS_POL_NAME_ORR, + NRS_POL_NAME_MAX) == 0) { + ops = &nrs_orr_hash_ops; + cur_bits = NRS_ORR_BITS; + max_bits = NRS_ORR_BITS; + bkt_bits = NRS_ORR_BKT_BITS; + flags = NRS_ORR_HASH_FLAGS; + } else { + ops = &nrs_trr_hash_ops; + cur_bits = NRS_TRR_BITS; + max_bits = NRS_TRR_BITS; + bkt_bits = NRS_TRR_BKT_BITS; + flags = NRS_TRR_HASH_FLAGS; + } + + /** + * Hash for finding objects by struct nrs_orr_key. + * XXX: For TRR, it might be better to avoid using libcfs_hash? + * All that needs to be resolved are OST indices, and they + * will stay relatively stable during an OSS node's lifetime. + */ + orrd->od_obj_hash = cfs_hash_create(orrd->od_objname, cur_bits, + max_bits, bkt_bits, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, ops, flags); + if (orrd->od_obj_hash == NULL) + GOTO(out_cache, rc = -ENOMEM); + + /* XXX: Fields accessed unlocked */ + orrd->od_quantum = NRS_ORR_QUANTUM_DFLT; + orrd->od_supp = NOS_DFLT; + orrd->od_physical = true; + /** + * Set to 1 so that the test inside nrs_orr_req_add() can evaluate to + * true. + */ + orrd->od_sequence = 1; + + policy->pol_private = orrd; + + RETURN(rc); + +out_cache: + kmem_cache_destroy(orrd->od_cache); +out_binheap: + binheap_destroy(orrd->od_binheap); +out_orrd: + OBD_FREE_PTR(orrd); + + RETURN(rc); +} + +/** + * Called when an ORR/TRR policy instance is stopped. + * + * Called when the policy has been instructed to transition to the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state and has no more + * pending requests to serve. + * + * \param[in] policy the policy + */ +static void nrs_orr_stop(struct ptlrpc_nrs_policy *policy) +{ + struct nrs_orr_data *orrd = policy->pol_private; + ENTRY; + + LASSERT(orrd != NULL); + LASSERT(orrd->od_binheap != NULL); + LASSERT(orrd->od_obj_hash != NULL); + LASSERT(orrd->od_cache != NULL); + LASSERT(binheap_is_empty(orrd->od_binheap)); + + binheap_destroy(orrd->od_binheap); + cfs_hash_putref(orrd->od_obj_hash); + kmem_cache_destroy(orrd->od_cache); + + OBD_FREE_PTR(orrd); +} + +/** + * Performs a policy-specific ctl function on ORR/TRR policy instances; similar + * to ioctl. 
+ *
+ * \param[in] policy the policy instance
+ * \param[in] opc the opcode
+ * \param[in,out] arg used for passing parameters and information
+ *
+ * \pre assert_spin_locked(&policy->pol_nrs->nrs_lock)
+ * \post assert_spin_locked(&policy->pol_nrs->nrs_lock)
+ *
+ * \retval 0 operation carried out successfully
+ * \retval -ve error
+ */
+static int nrs_orr_ctl(struct ptlrpc_nrs_policy *policy,
+		       enum ptlrpc_nrs_ctl opc, void *arg)
+{
+	assert_spin_locked(&policy->pol_nrs->nrs_lock);
+
+	switch ((enum nrs_ctl_orr)opc) {
+	default:
+		RETURN(-EINVAL);
+
+	case NRS_CTL_ORR_RD_QUANTUM: {
+		struct nrs_orr_data *orrd = policy->pol_private;
+
+		*(__u16 *)arg = orrd->od_quantum;
+	}
+	break;
+
+	case NRS_CTL_ORR_WR_QUANTUM: {
+		struct nrs_orr_data *orrd = policy->pol_private;
+
+		orrd->od_quantum = *(__u16 *)arg;
+		LASSERT(orrd->od_quantum != 0);
+	}
+	break;
+
+	case NRS_CTL_ORR_RD_OFF_TYPE: {
+		struct nrs_orr_data *orrd = policy->pol_private;
+
+		*(bool *)arg = orrd->od_physical;
+	}
+	break;
+
+	case NRS_CTL_ORR_WR_OFF_TYPE: {
+		struct nrs_orr_data *orrd = policy->pol_private;
+
+		orrd->od_physical = *(bool *)arg;
+	}
+	break;
+
+	case NRS_CTL_ORR_RD_SUPP_REQ: {
+		struct nrs_orr_data *orrd = policy->pol_private;
+
+		*(enum nrs_orr_supp *)arg = orrd->od_supp;
+	}
+	break;
+
+	case NRS_CTL_ORR_WR_SUPP_REQ: {
+		struct nrs_orr_data *orrd = policy->pol_private;
+
+		orrd->od_supp = *(enum nrs_orr_supp *)arg;
+		LASSERT((orrd->od_supp & NOS_OST_RW) != 0);
+	}
+	break;
+	}
+	RETURN(0);
+}
+
+/**
+ * Obtains resources for ORR/TRR policy instances. The top-level resource lives
+ * inside \e nrs_orr_data and the second-level resource inside
+ * \e nrs_orr_object instances.
+ *
+ * \param[in] policy the policy for which resources are being taken for
+ *		     request \a nrq
+ * \param[in] nrq the request for which resources are being taken
+ * \param[in] parent parent resource, embedded in nrs_orr_data for the
+ *		     ORR/TRR policies
+ * \param[out] resp used to return resource references
+ * \param[in] moving_req signifies limited caller context; used to perform
+ *			 memory allocations in an atomic context in this
+ *			 policy
+ *
+ * \retval 0 we are returning a top-level, parent resource, one that is
+ *	     embedded in an nrs_orr_data object
+ * \retval 1 we are returning a bottom-level resource, one that is embedded
+ *	     in an nrs_orr_object object
+ *
+ * \see nrs_resource_get_safe()
+ */
+static int nrs_orr_res_get(struct ptlrpc_nrs_policy *policy,
+			   struct ptlrpc_nrs_request *nrq,
+			   const struct ptlrpc_nrs_resource *parent,
+			   struct ptlrpc_nrs_resource **resp, bool moving_req)
+{
+	struct nrs_orr_data *orrd;
+	struct nrs_orr_object *orro;
+	struct nrs_orr_object *tmp;
+	struct nrs_orr_key key = { { { 0 } } };
+	__u32 opc;
+	int rc = 0;
+
+	/**
+	 * struct nrs_orr_data is requested.
+	 */
+	if (parent == NULL) {
+		*resp = &((struct nrs_orr_data *)policy->pol_private)->od_res;
+		return 0;
+	}
+
+	orrd = container_of(parent, struct nrs_orr_data, od_res);
+
+	/**
+	 * If the request type is not supported, fail the enqueuing; the RPC
+	 * will be handled by the fallback NRS policy.
+	 */
+	if (!nrs_orr_req_supported(orrd, nrq, &opc))
+		return -1;
+
+	/**
+	 * Fill in the key for the request; OST FID for ORR policy instances,
+	 * and OST index for TRR policy instances.
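	 * As an illustrative example, an ORR key would carry the object's
	 * FID, e.g. [0x200000401:0x1:0x0], while a TRR key would carry a
	 * plain OST index such as 3 (hypothetical values).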
+ */
+ rc = nrs_orr_key_fill(orrd, nrq, opc, policy->pol_desc->pd_name, &key);
+ if (rc < 0)
+ RETURN(rc);
+
+ /**
+ * Set the offset range the request covers
+ */
+ rc = nrs_orr_range_fill(nrq, orrd, opc, moving_req);
+ if (rc < 0)
+ RETURN(rc);
+
+ orro = cfs_hash_lookup(orrd->od_obj_hash, &key);
+ if (orro != NULL)
+ goto out;
+
+ OBD_SLAB_CPT_ALLOC_PTR_GFP(orro, orrd->od_cache,
+ nrs_pol2cptab(policy), nrs_pol2cptid(policy),
+ moving_req ? GFP_ATOMIC : GFP_NOFS);
+ if (orro == NULL)
+ RETURN(-ENOMEM);
+
+ orro->oo_key = key;
+ orro->oo_ref = 1;
+
+ tmp = cfs_hash_findadd_unique(orrd->od_obj_hash, &orro->oo_key,
+ &orro->oo_hnode);
+ if (tmp != orro) {
+ OBD_SLAB_FREE_PTR(orro, orrd->od_cache);
+ orro = tmp;
+ }
+out:
+ /**
+ * For debugging purposes
+ */
+ nrq->nr_u.orr.or_key = orro->oo_key;
+
+ *resp = &orro->oo_res;
+
+ return 1;
+}
+
+/**
+ * Called when releasing references to the resource hierarchy obtained for a
+ * request for scheduling using ORR/TRR policy instances.
+ *
+ * \param[in] policy the policy the resource belongs to
+ * \param[in] res the resource to be released
+ */
+static void nrs_orr_res_put(struct ptlrpc_nrs_policy *policy,
+ const struct ptlrpc_nrs_resource *res)
+{
+ struct nrs_orr_data *orrd;
+ struct nrs_orr_object *orro;
+
+ /**
+ * Do nothing for freeing parent, nrs_orr_data resources.
+ */
+ if (res->res_parent == NULL)
+ return;
+
+ orro = container_of(res, struct nrs_orr_object, oo_res);
+ orrd = container_of(res->res_parent, struct nrs_orr_data, od_res);
+
+ cfs_hash_put(orrd->od_obj_hash, &orro->oo_hnode);
+}
+
+/**
+ * Called when polling an ORR/TRR policy instance for a request so that it can
+ * be served. Returns the request that is at the root of the binary heap, as
+ * that is the lowest priority one (i.e. libcfs_heap is an implementation of a
+ * min-heap)
+ *
+ * \param[in] policy the policy instance being polled
+ * \param[in] peek when set, signifies that we just want to examine the
+ * request, and not handle it, so the request is not removed
+ * from the policy.
+ * \param[in] force force the policy to return a request; unused in this policy
+ *
+ * \retval the request to be handled
+ * \retval NULL no request available
+ *
+ * \see ptlrpc_nrs_req_get_nolock()
+ * \see nrs_request_get()
+ */
+static
+struct ptlrpc_nrs_request *nrs_orr_req_get(struct ptlrpc_nrs_policy *policy,
+ bool peek, bool force)
+{
+ struct nrs_orr_data *orrd = policy->pol_private;
+ struct binheap_node *node = binheap_root(orrd->od_binheap);
+ struct ptlrpc_nrs_request *nrq;
+
+ nrq = unlikely(node == NULL) ?
NULL :
+ container_of(node, struct ptlrpc_nrs_request, nr_node);
+
+ if (likely(!peek && nrq != NULL)) {
+ struct nrs_orr_object *orro;
+
+ orro = container_of(nrs_request_resource(nrq),
+ struct nrs_orr_object, oo_res);
+
+ LASSERT(nrq->nr_u.orr.or_round <= orro->oo_round);
+
+ binheap_remove(orrd->od_binheap, &nrq->nr_node);
+ orro->oo_active--;
+
+ if (strncmp(policy->pol_desc->pd_name, NRS_POL_NAME_ORR,
+ NRS_POL_NAME_MAX) == 0)
+ CDEBUG(D_RPCTRACE,
+ "NRS: starting to handle %s request for object "
+ "with FID "DFID", from OST with index %u, with "
+ "round %llu\n", NRS_POL_NAME_ORR,
+ PFID(&orro->oo_key.ok_fid),
+ nrq->nr_u.orr.or_key.ok_idx,
+ nrq->nr_u.orr.or_round);
+ else
+ CDEBUG(D_RPCTRACE,
+ "NRS: starting to handle %s request from OST "
+ "with index %u, with round %llu\n",
+ NRS_POL_NAME_TRR, nrq->nr_u.orr.or_key.ok_idx,
+ nrq->nr_u.orr.or_round);
+
+ /** Peek at the next request to be served */
+ node = binheap_root(orrd->od_binheap);
+
+ /** No more requests */
+ if (unlikely(node == NULL)) {
+ orrd->od_round++;
+ } else {
+ struct ptlrpc_nrs_request *next;
+
+ next = container_of(node, struct ptlrpc_nrs_request,
+ nr_node);
+
+ if (orrd->od_round < next->nr_u.orr.or_round)
+ orrd->od_round = next->nr_u.orr.or_round;
+ }
+ }
+
+ return nrq;
+}
+
+/**
+ * Sort-adds request \a nrq to an ORR/TRR \a policy instance's set of queued
+ * requests in the policy's binary heap.
+ *
+ * A scheduling round is a stream of requests that have been sorted in batches
+ * according to the backend-fs object (for ORR policy instances) or OST (for TRR
+ * policy instances) that they pertain to (as identified by its IDIF FID or OST
+ * index respectively); there can be only one batch for each object or OST in
+ * each round. The batches are of maximum size nrs_orr_data::od_quantum. When a
+ * new request arrives for scheduling for an object or OST that has exhausted
+ * its quantum in its current round, the request will be scheduled on the next
+ * scheduling round. Requests are allowed to be scheduled against a round until
+ * all requests for the round are serviced, so an object or OST might miss a
+ * round if requests are not scheduled for it for a long enough period of time.
+ * Objects or OSTs that miss a round will have their next request scheduled
+ * starting at the round that requests are being dispatched for at the time
+ * this request arrives.
+ *
+ * Requests are tagged with the round number and a sequence number; the sequence
+ * number indicates the relative ordering amongst the batches of requests in a
+ * round, and is identical for all requests in a batch, as is the round number.
+ * The round and sequence numbers are used by orr_req_compare() on
+ * nrs_orr_data::od_binheap in order to maintain an ordered set of rounds, with
+ * each round consisting of an ordered set of batches of requests, and each
+ * batch consisting of an ordered set of requests according to their logical
+ * file or physical disk offsets.
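+ *
+ * For example, assuming a quantum of 2 and two objects A and B, dispatch
+ * may proceed as | A1 A2 B1 B2 | A3 B3 B4 |, where each "|" marks a round
+ * boundary, batches within a round are ordered by their sequence numbers,
+ * and requests within a batch by their logical or physical offsets.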
+ * + * \param[in] policy the policy + * \param[in] nrq the request to add + * + * \retval 0 request successfully added + * \retval != 0 error + */ +static int nrs_orr_req_add(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct nrs_orr_data *orrd; + struct nrs_orr_object *orro; + int rc; + + orro = container_of(nrs_request_resource(nrq), + struct nrs_orr_object, oo_res); + orrd = container_of(nrs_request_resource(nrq)->res_parent, + struct nrs_orr_data, od_res); + + if (orro->oo_quantum == 0 || orro->oo_round < orrd->od_round || + (orro->oo_active == 0 && orro->oo_quantum > 0)) { + + /** + * If there are no pending requests for the object/OST, but some + * of its quantum still remains unused, which implies we did not + * get a chance to schedule up to its maximum allowed batch size + * of requests in the previous round this object/OST + * participated in, schedule this next request on a new round; + * this avoids fragmentation of request batches caused by + * intermittent inactivity on the object/OST, at the expense of + * potentially slightly increased service time for the request + * batch this request will be a part of. + */ + if (orro->oo_active == 0 && orro->oo_quantum > 0) + orro->oo_round++; + + /** A new scheduling round has commenced */ + if (orro->oo_round < orrd->od_round) + orro->oo_round = orrd->od_round; + + /** I was not the last object/OST that scheduled a request */ + if (orro->oo_sequence < orrd->od_sequence) + orro->oo_sequence = ++orrd->od_sequence; + /** + * Reset the quantum if we have reached the maximum quantum + * size for this batch, or even if we have not managed to + * complete a batch size up to its maximum allowed size. + * XXX: Accessed unlocked + */ + orro->oo_quantum = orrd->od_quantum; + } + + nrq->nr_u.orr.or_round = orro->oo_round; + nrq->nr_u.orr.or_sequence = orro->oo_sequence; + + rc = binheap_insert(orrd->od_binheap, &nrq->nr_node); + if (rc == 0) { + orro->oo_active++; + if (--orro->oo_quantum == 0) + orro->oo_round++; + } + return rc; +} + +/** + * Removes request \a nrq from an ORR/TRR \a policy instance's set of queued + * requests. + * + * \param[in] policy the policy + * \param[in] nrq the request to remove + */ +static void nrs_orr_req_del(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct nrs_orr_data *orrd; + struct nrs_orr_object *orro; + bool is_root; + + orro = container_of(nrs_request_resource(nrq), + struct nrs_orr_object, oo_res); + orrd = container_of(nrs_request_resource(nrq)->res_parent, + struct nrs_orr_data, od_res); + + LASSERT(nrq->nr_u.orr.or_round <= orro->oo_round); + + is_root = &nrq->nr_node == binheap_root(orrd->od_binheap); + + binheap_remove(orrd->od_binheap, &nrq->nr_node); + orro->oo_active--; + + /** + * If we just deleted the node at the root of the binheap, we may have + * to adjust round numbers. + */ + if (unlikely(is_root)) { + /** Peek at the next request to be served */ + struct binheap_node *node = binheap_root(orrd->od_binheap); + + /** No more requests */ + if (unlikely(node == NULL)) { + orrd->od_round++; + } else { + nrq = container_of(node, struct ptlrpc_nrs_request, + nr_node); + + if (orrd->od_round < nrq->nr_u.orr.or_round) + orrd->od_round = nrq->nr_u.orr.or_round; + } + } +} + +/** + * Called right after the request \a nrq finishes being handled by ORR policy + * instance \a policy. 
+ *
+ * \param[in] policy the policy that handled the request
+ * \param[in] nrq the request that was handled
+ */
+static void nrs_orr_req_stop(struct ptlrpc_nrs_policy *policy,
+ struct ptlrpc_nrs_request *nrq)
+{
+ /** NB: resource control, credits etc can be added here */
+ if (strncmp(policy->pol_desc->pd_name, NRS_POL_NAME_ORR,
+ NRS_POL_NAME_MAX) == 0)
+ CDEBUG(D_RPCTRACE,
+ "NRS: finished handling %s request for object with FID "
+ DFID", from OST with index %u, with round %llu\n",
+ NRS_POL_NAME_ORR, PFID(&nrq->nr_u.orr.or_key.ok_fid),
+ nrq->nr_u.orr.or_key.ok_idx, nrq->nr_u.orr.or_round);
+ else
+ CDEBUG(D_RPCTRACE,
+ "NRS: finished handling %s request from OST with index %u,"
+ " with round %llu\n",
+ NRS_POL_NAME_TRR, nrq->nr_u.orr.or_key.ok_idx,
+ nrq->nr_u.orr.or_round);
+}
+
+/**
+ * debugfs interface
+ */
+
+/**
+ * This allows bundling the policy name into the lprocfs_vars::data pointer
+ * so that lprocfs read/write functions can be used by both the ORR and TRR
+ * policies.
+ */
+static struct nrs_lprocfs_orr_data {
+ struct ptlrpc_service *svc;
+ char *name;
+} lprocfs_orr_data = {
+ .name = NRS_POL_NAME_ORR
+}, lprocfs_trr_data = {
+ .name = NRS_POL_NAME_TRR
+};
+
+/**
+ * Retrieves the value of the Round Robin quantum (i.e. the maximum batch size)
+ * for ORR/TRR policy instances on both the regular and high-priority NRS head
+ * of a service, as long as a policy instance is not in the
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state; policy instances in this
+ * state are skipped later by nrs_orr_ctl().
+ *
+ * Quantum values are in # of RPCs, and the output is in YAML format.
+ *
+ * For example:
+ *
+ * reg_quantum:256
+ * hp_quantum:8
+ *
+ * XXX: the CRR-N version of this, ptlrpc_lprocfs_rd_nrs_crrn_quantum() is
+ * almost identical; it can be reworked and then reused for ORR/TRR.
+ */
+static int
+ptlrpc_lprocfs_nrs_orr_quantum_seq_show(struct seq_file *m, void *data)
+{
+ struct nrs_lprocfs_orr_data *orr_data = m->private;
+ struct ptlrpc_service *svc = orr_data->svc;
+ __u16 quantum;
+ int rc;
+
+ /**
+ * Perform two separate calls to this as only one of the NRS heads'
+ * policies may be in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED or
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING state.
+ */
+ rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG,
+ orr_data->name,
+ NRS_CTL_ORR_RD_QUANTUM,
+ true, &quantum);
+ if (rc == 0) {
+ seq_printf(m, NRS_LPROCFS_QUANTUM_NAME_REG "%-5d\n", quantum);
+ /**
+ * Ignore -ENODEV as the regular NRS head's policy may be in the
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state.
+ */
+ } else if (rc != -ENODEV) {
+ return rc;
+ }
+
+ /**
+ * We know the ost_io service, which is the only one ORR/TRR policies are
+ * compatible with, does have an HP NRS head, but it may be best to guard
+ * against a possible change of this in the future.
+ */
+ if (!nrs_svc_has_hp(svc))
+ goto no_hp;
+
+ rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP,
+ orr_data->name, NRS_CTL_ORR_RD_QUANTUM,
+ true, &quantum);
+ if (rc == 0) {
+ seq_printf(m, NRS_LPROCFS_QUANTUM_NAME_HP"%-5d\n", quantum);
+ /**
+ * Ignore -ENODEV as the high priority NRS head's policy may be
+ * in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state.
+ */
+ } else if (rc != -ENODEV) {
+ return rc;
+ }
+
+no_hp:
+ return rc;
+}
+
+/**
+ * Sets the value of the Round Robin quantum (i.e. the maximum batch size)
+ * for ORR/TRR policy instances of a service.
The user can set the quantum size
+ * for the regular and high priority NRS head separately by specifying each
+ * value, or both together in a single invocation.
+ *
+ * For example:
+ *
+ * lctl set_param ost.OSS.ost_io.nrs_orr_quantum=reg_quantum:64, to set the
+ * request quantum size of the ORR policy instance on the regular NRS head of
+ * the ost_io service to 64
+ *
+ * lctl set_param ost.OSS.ost_io.nrs_trr_quantum=hp_quantum:8, to set the
+ * request quantum size of the TRR policy instance on the high priority NRS
+ * head of the ost_io service to 8
+ *
+ * lctl set_param ost.OSS.ost_io.nrs_orr_quantum=32, to set the request
+ * quantum size of the ORR policy instance on both the regular and the high
+ * priority NRS head of the ost_io service to 32
+ *
+ * policy instances in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state
+ * are skipped later by nrs_orr_ctl().
+ *
+ * XXX: the CRR-N version of this, ptlrpc_lprocfs_wr_nrs_crrn_quantum() is
+ * almost identical; it can be reworked and then reused for ORR/TRR.
+ */
+static ssize_t
+ptlrpc_lprocfs_nrs_orr_quantum_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct seq_file *m = file->private_data;
+ struct nrs_lprocfs_orr_data *orr_data = m->private;
+ struct ptlrpc_service *svc = orr_data->svc;
+ enum ptlrpc_nrs_queue_type queue = 0;
+ char kernbuf[LPROCFS_NRS_WR_QUANTUM_MAX_CMD];
+ char *val;
+ long quantum_reg;
+ long quantum_hp;
+ /** lprocfs_find_named_value() modifies its argument, so keep a copy */
+ size_t count_copy;
+ int rc = 0;
+ int rc2 = 0;
+
+ if (count > (sizeof(kernbuf) - 1))
+ return -EINVAL;
+
+ if (copy_from_user(kernbuf, buffer, count))
+ return -EFAULT;
+
+ kernbuf[count] = '\0';
+
+ count_copy = count;
+
+ /**
+ * Check if the regular quantum value has been specified
+ */
+ val = lprocfs_find_named_value(kernbuf, NRS_LPROCFS_QUANTUM_NAME_REG,
+ &count_copy);
+ if (val != kernbuf) {
+ rc = kstrtol(val, 10, &quantum_reg);
+ if (rc)
+ return rc;
+ queue |= PTLRPC_NRS_QUEUE_REG;
+ }
+
+ count_copy = count;
+
+ /**
+ * Check if the high priority quantum value has been specified
+ */
+ val = lprocfs_find_named_value(kernbuf, NRS_LPROCFS_QUANTUM_NAME_HP,
+ &count_copy);
+ if (val != kernbuf) {
+ if (!nrs_svc_has_hp(svc))
+ return -ENODEV;
+
+ rc = kstrtol(val, 10, &quantum_hp);
+ if (rc)
+ return rc;
+
+ queue |= PTLRPC_NRS_QUEUE_HP;
+ }
+
+ /**
+ * If none of the queues has been specified, look for a valid numerical
+ * value
+ */
+ if (queue == 0) {
+ rc = kstrtol(kernbuf, 10, &quantum_reg);
+ if (rc)
+ return rc;
+
+ queue = PTLRPC_NRS_QUEUE_REG;
+
+ if (nrs_svc_has_hp(svc)) {
+ queue |= PTLRPC_NRS_QUEUE_HP;
+ quantum_hp = quantum_reg;
+ }
+ }
+
+ if ((((queue & PTLRPC_NRS_QUEUE_REG) != 0) &&
+ ((quantum_reg > LPROCFS_NRS_QUANTUM_MAX || quantum_reg <= 0))) ||
+ (((queue & PTLRPC_NRS_QUEUE_HP) != 0) &&
+ ((quantum_hp > LPROCFS_NRS_QUANTUM_MAX || quantum_hp <= 0))))
+ return -EINVAL;
+
+ /**
+ * We change the values on regular and HP NRS heads separately, so that
+ * we do not exit early from ptlrpc_nrs_policy_control() with an error
+ * returned by nrs_policy_ctl_locked(), in cases where the user has not
+ * started the policy on either the regular or HP NRS head; i.e. we are
+ * ignoring -ENODEV within nrs_policy_ctl_locked(). -ENODEV is returned
+ * only if the operation fails with -ENODEV on all heads that have been
+ * specified by the command; if at least one operation succeeds,
+ * success is returned.
+ */
+ if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) {
+ rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG,
+ orr_data->name,
+ NRS_CTL_ORR_WR_QUANTUM, false,
+ &quantum_reg);
+ if ((rc < 0 && rc != -ENODEV) ||
+ (rc == -ENODEV && queue == PTLRPC_NRS_QUEUE_REG))
+ return rc;
+ }
+
+ if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) {
+ rc2 = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP,
+ orr_data->name,
+ NRS_CTL_ORR_WR_QUANTUM, false,
+ &quantum_hp);
+ if ((rc2 < 0 && rc2 != -ENODEV) ||
+ (rc2 == -ENODEV && queue == PTLRPC_NRS_QUEUE_HP))
+ return rc2;
+ }
+
+ return rc == -ENODEV && rc2 == -ENODEV ? -ENODEV : count;
+}
+
+LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_orr_quantum);
+
+#define LPROCFS_NRS_OFF_NAME_REG "reg_offset_type:"
+#define LPROCFS_NRS_OFF_NAME_HP "hp_offset_type:"
+
+#define LPROCFS_NRS_OFF_NAME_PHYSICAL "physical"
+#define LPROCFS_NRS_OFF_NAME_LOGICAL "logical"
+
+/**
+ * Retrieves the offset type used by ORR/TRR policy instances on both the
+ * regular and high-priority NRS head of a service, as long as a policy
+ * instance is not in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state;
+ * policy instances in this state are skipped later by nrs_orr_ctl().
+ *
+ * Offset type information is a (physical|logical) string, and output is
+ * in YAML format.
+ *
+ * For example:
+ *
+ * reg_offset_type:physical
+ * hp_offset_type:logical
+ */
+static int
+ptlrpc_lprocfs_nrs_orr_offset_type_seq_show(struct seq_file *m, void *data)
+{
+ struct nrs_lprocfs_orr_data *orr_data = m->private;
+ struct ptlrpc_service *svc = orr_data->svc;
+ bool physical;
+ int rc;
+
+ /**
+ * Perform two separate calls to this as only one of the NRS heads'
+ * policies may be in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED
+ * or ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING state.
+ */
+ rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG,
+ orr_data->name, NRS_CTL_ORR_RD_OFF_TYPE,
+ true, &physical);
+ if (rc == 0) {
+ seq_printf(m, LPROCFS_NRS_OFF_NAME_REG"%s\n",
+ physical ? LPROCFS_NRS_OFF_NAME_PHYSICAL :
+ LPROCFS_NRS_OFF_NAME_LOGICAL);
+ /**
+ * Ignore -ENODEV as the regular NRS head's policy may be in the
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state.
+ */
+ } else if (rc != -ENODEV) {
+ return rc;
+ }
+
+ /**
+ * We know the ost_io service, which is the only one ORR/TRR policies are
+ * compatible with, does have an HP NRS head, but it may be best to guard
+ * against a possible change of this in the future.
+ */
+ if (!nrs_svc_has_hp(svc))
+ goto no_hp;
+
+ rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP,
+ orr_data->name, NRS_CTL_ORR_RD_OFF_TYPE,
+ true, &physical);
+ if (rc == 0) {
+ seq_printf(m, LPROCFS_NRS_OFF_NAME_HP"%s\n",
+ physical ? LPROCFS_NRS_OFF_NAME_PHYSICAL :
+ LPROCFS_NRS_OFF_NAME_LOGICAL);
+ /**
+ * Ignore -ENODEV as the high priority NRS head's policy may be
+ * in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state.
+ */
+ } else if (rc != -ENODEV) {
+ return rc;
+ }
+
+no_hp:
+ return rc;
+}
+
+/**
+ * Max valid command string is the size of the labels, plus "physical" twice,
+ * plus a separating ' '
+ */
+#define LPROCFS_NRS_WR_OFF_TYPE_MAX_CMD \
+ sizeof(LPROCFS_NRS_OFF_NAME_REG LPROCFS_NRS_OFF_NAME_PHYSICAL " " \
+ LPROCFS_NRS_OFF_NAME_HP LPROCFS_NRS_OFF_NAME_PHYSICAL)
+
+/**
+ * Sets the type of offsets used to order RPCs in ORR/TRR policy instances. The
+ * user can set offset type for the regular or high priority NRS head
+ * separately by specifying each value, or both together in a single invocation.
+ *
+ * For example:
+ *
+ * lctl set_param ost.OSS.ost_io.nrs_orr_offset_type=
+ * reg_offset_type:physical, to enable the ORR policy instance on the regular
+ * NRS head of the ost_io service to use physical disk offset ordering.
+ *
+ * lctl set_param ost.OSS.ost_io.nrs_trr_offset_type=logical, to enable the TRR
+ * policy instances on both the regular and high priority NRS heads of the
+ * ost_io service to use logical file offset ordering.
+ *
+ * policy instances in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state
+ * are skipped later by nrs_orr_ctl().
+ */
+static ssize_t
+ptlrpc_lprocfs_nrs_orr_offset_type_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count,
+ loff_t *off)
+{
+ struct seq_file *m = file->private_data;
+ struct nrs_lprocfs_orr_data *orr_data = m->private;
+ struct ptlrpc_service *svc = orr_data->svc;
+ enum ptlrpc_nrs_queue_type queue = 0;
+ char kernbuf[LPROCFS_NRS_WR_OFF_TYPE_MAX_CMD];
+ char *val_reg;
+ char *val_hp;
+ bool physical_reg;
+ bool physical_hp;
+ size_t count_copy;
+ int rc = 0;
+ int rc2 = 0;
+
+ if (count > (sizeof(kernbuf) - 1))
+ return -EINVAL;
+
+ if (copy_from_user(kernbuf, buffer, count))
+ return -EFAULT;
+
+ kernbuf[count] = '\0';
+
+ count_copy = count;
+
+ /**
+ * Check if the regular offset type has been specified
+ */
+ val_reg = lprocfs_find_named_value(kernbuf,
+ LPROCFS_NRS_OFF_NAME_REG,
+ &count_copy);
+ if (val_reg != kernbuf)
+ queue |= PTLRPC_NRS_QUEUE_REG;
+
+ count_copy = count;
+
+ /**
+ * Check if the high priority offset type has been specified
+ */
+ val_hp = lprocfs_find_named_value(kernbuf, LPROCFS_NRS_OFF_NAME_HP,
+ &count_copy);
+ if (val_hp != kernbuf) {
+ if (!nrs_svc_has_hp(svc))
+ return -ENODEV;
+
+ queue |= PTLRPC_NRS_QUEUE_HP;
+ }
+
+ /**
+ * If none of the queues has been specified, there may be a valid
+ * command string at the start of the buffer.
+ */
+ if (queue == 0) {
+ queue = PTLRPC_NRS_QUEUE_REG;
+
+ if (nrs_svc_has_hp(svc))
+ queue |= PTLRPC_NRS_QUEUE_HP;
+ }
+
+ if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) {
+ if (strncmp(val_reg, LPROCFS_NRS_OFF_NAME_PHYSICAL,
+ sizeof(LPROCFS_NRS_OFF_NAME_PHYSICAL) - 1) == 0)
+ physical_reg = true;
+ else if (strncmp(val_reg, LPROCFS_NRS_OFF_NAME_LOGICAL,
+ sizeof(LPROCFS_NRS_OFF_NAME_LOGICAL) - 1) == 0)
+ physical_reg = false;
+ else
+ return -EINVAL;
+ }
+
+ if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) {
+ if (strncmp(val_hp, LPROCFS_NRS_OFF_NAME_PHYSICAL,
+ sizeof(LPROCFS_NRS_OFF_NAME_PHYSICAL) - 1) == 0)
+ physical_hp = true;
+ else if (strncmp(val_hp, LPROCFS_NRS_OFF_NAME_LOGICAL,
+ sizeof(LPROCFS_NRS_OFF_NAME_LOGICAL) - 1) == 0)
+ physical_hp = false;
+ else
+ return -EINVAL;
+ }
+
+ /**
+ * We change the values on regular and HP NRS heads separately, so that
+ * we do not exit early from ptlrpc_nrs_policy_control() with an error
+ * returned by nrs_policy_ctl_locked(), in cases where the user has not
+ * started the policy on either the regular or HP NRS head; i.e. we are
+ * ignoring -ENODEV within nrs_policy_ctl_locked(). -ENODEV is returned
+ * only if the operation fails with -ENODEV on all heads that have been
+ * specified by the command; if at least one operation succeeds,
+ * success is returned.
+ */
+ if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) {
+ rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG,
+ orr_data->name,
+ NRS_CTL_ORR_WR_OFF_TYPE, false,
+ &physical_reg);
+ if ((rc < 0 && rc != -ENODEV) ||
+ (rc == -ENODEV && queue == PTLRPC_NRS_QUEUE_REG))
+ return rc;
+ }
+
+ if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) {
+ rc2 = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP,
+ orr_data->name,
+ NRS_CTL_ORR_WR_OFF_TYPE, false,
+ &physical_hp);
+ if ((rc2 < 0 && rc2 != -ENODEV) ||
+ (rc2 == -ENODEV && queue == PTLRPC_NRS_QUEUE_HP))
+ return rc2;
+ }
+
+ return rc == -ENODEV && rc2 == -ENODEV ? -ENODEV : count;
+}
+
+LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_orr_offset_type);
+
+#define NRS_LPROCFS_REQ_SUPP_NAME_REG "reg_supported:"
+#define NRS_LPROCFS_REQ_SUPP_NAME_HP "hp_supported:"
+
+#define LPROCFS_NRS_SUPP_NAME_READS "reads"
+#define LPROCFS_NRS_SUPP_NAME_WRITES "writes"
+#define LPROCFS_NRS_SUPP_NAME_READWRITES "reads_and_writes"
+
+/**
+ * Translates enum nrs_orr_supp values to a corresponding string.
+ */
+static const char *nrs_orr_supp2str(enum nrs_orr_supp supp)
+{
+ switch (supp) {
+ default:
+ LBUG();
+ case NOS_OST_READ:
+ return LPROCFS_NRS_SUPP_NAME_READS;
+ case NOS_OST_WRITE:
+ return LPROCFS_NRS_SUPP_NAME_WRITES;
+ case NOS_OST_RW:
+ return LPROCFS_NRS_SUPP_NAME_READWRITES;
+ }
+}
+
+/**
+ * Translates strings to the corresponding enum nrs_orr_supp value.
+ */
+static enum nrs_orr_supp nrs_orr_str2supp(const char *val)
+{
+ if (strncmp(val, LPROCFS_NRS_SUPP_NAME_READWRITES,
+ sizeof(LPROCFS_NRS_SUPP_NAME_READWRITES) - 1) == 0)
+ return NOS_OST_RW;
+ else if (strncmp(val, LPROCFS_NRS_SUPP_NAME_READS,
+ sizeof(LPROCFS_NRS_SUPP_NAME_READS) - 1) == 0)
+ return NOS_OST_READ;
+ else if (strncmp(val, LPROCFS_NRS_SUPP_NAME_WRITES,
+ sizeof(LPROCFS_NRS_SUPP_NAME_WRITES) - 1) == 0)
+ return NOS_OST_WRITE;
+ else
+ return -EINVAL;
+}
+
+/**
+ * Retrieves the type of RPCs handled at the point of invocation by ORR/TRR
+ * policy instances on both the regular and high-priority NRS head of a service,
+ * as long as a policy instance is not in the
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state; policy instances in this
+ * state are skipped later by nrs_orr_ctl().
+ *
+ * Supported RPC type information is a (reads|writes|reads_and_writes) string,
+ * and output is in YAML format.
+ *
+ * For example:
+ *
+ * reg_supported:reads
+ * hp_supported:reads_and_writes
+ */
+static int
+ptlrpc_lprocfs_nrs_orr_supported_seq_show(struct seq_file *m, void *data)
+{
+ struct nrs_lprocfs_orr_data *orr_data = m->private;
+ struct ptlrpc_service *svc = orr_data->svc;
+ enum nrs_orr_supp supported;
+ int rc;
+
+ /**
+ * Perform two separate calls to this as only one of the NRS heads'
+ * policies may be in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED
+ * or ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING state.
+ */
+ rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG,
+ orr_data->name,
+ NRS_CTL_ORR_RD_SUPP_REQ, true,
+ &supported);
+
+ if (rc == 0) {
+ seq_printf(m, NRS_LPROCFS_REQ_SUPP_NAME_REG"%s\n",
+ nrs_orr_supp2str(supported));
+ /**
+ * Ignore -ENODEV as the regular NRS head's policy may be in the
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state.
+ */
+ } else if (rc != -ENODEV) {
+ return rc;
+ }
+
+ /**
+ * We know the ost_io service, which is the only one ORR/TRR policies are
+ * compatible with, does have an HP NRS head, but it may be best to guard
+ * against a possible change of this in the future.
+ */
+ if (!nrs_svc_has_hp(svc))
+ goto no_hp;
+
+ rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP,
+ orr_data->name,
+ NRS_CTL_ORR_RD_SUPP_REQ, true,
+ &supported);
+ if (rc == 0) {
+ seq_printf(m, NRS_LPROCFS_REQ_SUPP_NAME_HP"%s\n",
+ nrs_orr_supp2str(supported));
+ /**
+ * Ignore -ENODEV as the high priority NRS head's policy may be
+ * in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state.
+ */
+ } else if (rc != -ENODEV) {
+ return rc;
+ }
+
+no_hp:
+ return rc;
+}
+
+/**
+ * Max valid command string is the size of the labels, plus "reads_and_writes"
+ * twice, plus a separating ' '
+ */
+#define LPROCFS_NRS_WR_REQ_SUPP_MAX_CMD \
+ sizeof(NRS_LPROCFS_REQ_SUPP_NAME_REG LPROCFS_NRS_SUPP_NAME_READWRITES \
+ NRS_LPROCFS_REQ_SUPP_NAME_HP LPROCFS_NRS_SUPP_NAME_READWRITES \
+ " ")
+
+/**
+ * Sets the type of RPCs handled by ORR/TRR policy instances. The user can
+ * modify this setting for the regular or high priority NRS heads separately, or
+ * both together in a single invocation.
+ *
+ * For example:
+ *
+ * lctl set_param ost.OSS.ost_io.nrs_orr_supported=
+ * "reg_supported:reads", to enable the ORR policy instance on the regular NRS
+ * head of the ost_io service to handle OST_READ RPCs.
+ *
+ * lctl set_param ost.OSS.ost_io.nrs_trr_supported=reads_and_writes, to enable
+ * the TRR policy instances on both the regular and high priority NRS heads of
+ * the ost_io service to handle OST_READ and OST_WRITE RPCs.
+ *
+ * policy instances in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state
+ * are skipped later by nrs_orr_ctl().
+ */
+static ssize_t
+ptlrpc_lprocfs_nrs_orr_supported_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count,
+ loff_t *off)
+{
+ struct seq_file *m = file->private_data;
+ struct nrs_lprocfs_orr_data *orr_data = m->private;
+ struct ptlrpc_service *svc = orr_data->svc;
+ enum ptlrpc_nrs_queue_type queue = 0;
+ char kernbuf[LPROCFS_NRS_WR_REQ_SUPP_MAX_CMD];
+ char *val_reg;
+ char *val_hp;
+ enum nrs_orr_supp supp_reg;
+ enum nrs_orr_supp supp_hp;
+ size_t count_copy;
+ int rc = 0;
+ int rc2 = 0;
+
+ if (count > (sizeof(kernbuf) - 1))
+ return -EINVAL;
+
+ if (copy_from_user(kernbuf, buffer, count))
+ return -EFAULT;
+
+ kernbuf[count] = '\0';
+
+ count_copy = count;
+
+ /**
+ * Check if the regular supported requests setting has been specified
+ */
+ val_reg = lprocfs_find_named_value(kernbuf,
+ NRS_LPROCFS_REQ_SUPP_NAME_REG,
+ &count_copy);
+ if (val_reg != kernbuf)
+ queue |= PTLRPC_NRS_QUEUE_REG;
+
+ count_copy = count;
+
+ /**
+ * Check if the high priority supported requests setting has been
+ * specified
+ */
+ val_hp = lprocfs_find_named_value(kernbuf, NRS_LPROCFS_REQ_SUPP_NAME_HP,
+ &count_copy);
+ if (val_hp != kernbuf) {
+ if (!nrs_svc_has_hp(svc))
+ return -ENODEV;
+
+ queue |= PTLRPC_NRS_QUEUE_HP;
+ }
+
+ /**
+ * If none of the queues has been specified, there may be a valid
+ * command string at the start of the buffer.
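+ * For example, a bare "reads_and_writes" string with no "reg_supported:"
+ * or "hp_supported:" label is applied to the regular NRS head and, if the
+ * service has one, to the high priority NRS head as well.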
+ */
+ if (queue == 0) {
+ queue = PTLRPC_NRS_QUEUE_REG;
+
+ if (nrs_svc_has_hp(svc))
+ queue |= PTLRPC_NRS_QUEUE_HP;
+ }
+
+ if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) {
+ supp_reg = nrs_orr_str2supp(val_reg);
+ if (supp_reg == -EINVAL)
+ return -EINVAL;
+ }
+
+ if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) {
+ supp_hp = nrs_orr_str2supp(val_hp);
+ if (supp_hp == -EINVAL)
+ return -EINVAL;
+ }
+
+ /**
+ * We change the values on regular and HP NRS heads separately, so that
+ * we do not exit early from ptlrpc_nrs_policy_control() with an error
+ * returned by nrs_policy_ctl_locked(), in cases where the user has not
+ * started the policy on either the regular or HP NRS head; i.e. we are
+ * ignoring -ENODEV within nrs_policy_ctl_locked(). -ENODEV is returned
+ * only if the operation fails with -ENODEV on all heads that have been
+ * specified by the command; if at least one operation succeeds,
+ * success is returned.
+ */
+ if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) {
+ rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG,
+ orr_data->name,
+ NRS_CTL_ORR_WR_SUPP_REQ, false,
+ &supp_reg);
+ if ((rc < 0 && rc != -ENODEV) ||
+ (rc == -ENODEV && queue == PTLRPC_NRS_QUEUE_REG))
+ return rc;
+ }
+
+ if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) {
+ rc2 = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP,
+ orr_data->name,
+ NRS_CTL_ORR_WR_SUPP_REQ, false,
+ &supp_hp);
+ if ((rc2 < 0 && rc2 != -ENODEV) ||
+ (rc2 == -ENODEV && queue == PTLRPC_NRS_QUEUE_HP))
+ return rc2;
+ }
+
+ return rc == -ENODEV && rc2 == -ENODEV ? -ENODEV : count;
+}
+
+LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_orr_supported);
+
+static int nrs_orr_lprocfs_init(struct ptlrpc_service *svc)
+{
+ int i;
+
+ struct ldebugfs_vars nrs_orr_lprocfs_vars[] = {
+ { .name = "nrs_orr_quantum",
+ .fops = &ptlrpc_lprocfs_nrs_orr_quantum_fops },
+ { .name = "nrs_orr_offset_type",
+ .fops = &ptlrpc_lprocfs_nrs_orr_offset_type_fops },
+ { .name = "nrs_orr_supported",
+ .fops = &ptlrpc_lprocfs_nrs_orr_supported_fops },
+ { NULL }
+ };
+
+ if (!svc->srv_debugfs_entry)
+ return 0;
+
+ lprocfs_orr_data.svc = svc;
+
+ for (i = 0; i < ARRAY_SIZE(nrs_orr_lprocfs_vars); i++)
+ nrs_orr_lprocfs_vars[i].data = &lprocfs_orr_data;
+
+ ldebugfs_add_vars(svc->srv_debugfs_entry, nrs_orr_lprocfs_vars, NULL);
+
+ return 0;
+}
+
+static const struct ptlrpc_nrs_pol_ops nrs_orr_ops = {
+ .op_policy_init = nrs_orr_init,
+ .op_policy_start = nrs_orr_start,
+ .op_policy_stop = nrs_orr_stop,
+ .op_policy_ctl = nrs_orr_ctl,
+ .op_res_get = nrs_orr_res_get,
+ .op_res_put = nrs_orr_res_put,
+ .op_req_get = nrs_orr_req_get,
+ .op_req_enqueue = nrs_orr_req_add,
+ .op_req_dequeue = nrs_orr_req_del,
+ .op_req_stop = nrs_orr_req_stop,
+ .op_lprocfs_init = nrs_orr_lprocfs_init,
+};
+
+struct ptlrpc_nrs_pol_conf nrs_conf_orr = {
+ .nc_name = NRS_POL_NAME_ORR,
+ .nc_ops = &nrs_orr_ops,
+ .nc_compat = nrs_policy_compat_one,
+ .nc_compat_svc_name = "ost_io",
+};
+
+/**
+ * TRR, Target-based Round Robin policy
+ *
+ * TRR reuses many of the functions and data structures of ORR
+ */
+static int nrs_trr_lprocfs_init(struct ptlrpc_service *svc)
+{
+ int i;
+
+ struct ldebugfs_vars nrs_trr_lprocfs_vars[] = {
+ { .name = "nrs_trr_quantum",
+ .fops = &ptlrpc_lprocfs_nrs_orr_quantum_fops },
+ { .name = "nrs_trr_offset_type",
+ .fops = &ptlrpc_lprocfs_nrs_orr_offset_type_fops },
+ { .name = "nrs_trr_supported",
+ .fops = &ptlrpc_lprocfs_nrs_orr_supported_fops },
+ { NULL }
+ };
+
+ if (!svc->srv_debugfs_entry)
+ return 0;
+
+ lprocfs_trr_data.svc = svc;
+
+ for (i = 0; i <
ARRAY_SIZE(nrs_trr_lprocfs_vars); i++) + nrs_trr_lprocfs_vars[i].data = &lprocfs_trr_data; + + ldebugfs_add_vars(svc->srv_debugfs_entry, nrs_trr_lprocfs_vars, NULL); + + return 0; +} + +/** + * Reuse much of the ORR functionality for TRR. + */ +static const struct ptlrpc_nrs_pol_ops nrs_trr_ops = { + .op_policy_init = nrs_orr_init, + .op_policy_start = nrs_orr_start, + .op_policy_stop = nrs_orr_stop, + .op_policy_ctl = nrs_orr_ctl, + .op_res_get = nrs_orr_res_get, + .op_res_put = nrs_orr_res_put, + .op_req_get = nrs_orr_req_get, + .op_req_enqueue = nrs_orr_req_add, + .op_req_dequeue = nrs_orr_req_del, + .op_req_stop = nrs_orr_req_stop, + .op_lprocfs_init = nrs_trr_lprocfs_init, +}; + +struct ptlrpc_nrs_pol_conf nrs_conf_trr = { + .nc_name = NRS_POL_NAME_TRR, + .nc_ops = &nrs_trr_ops, + .nc_compat = nrs_policy_compat_one, + .nc_compat_svc_name = "ost_io", +}; + +/** @} ORR/TRR policy */ + +/** @} nrs */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c new file mode 100644 index 0000000000000..50b983e739d8a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c @@ -0,0 +1,3712 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (C) 2013 DataDirect Networks, Inc. + * + * Copyright (c) 2014, 2016, Intel Corporation. 
+ */
+/*
+ * lustre/ptlrpc/nrs_tbf.c
+ *
+ * Network Request Scheduler (NRS) Token Bucket Filter (TBF) policy
+ *
+ */
+
+/**
+ * \addtogroup nrs
+ * @{
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+#include <obd_support.h>
+#include <obd_class.h>
+#include <libcfs/libcfs.h>
+#include "ptlrpc_internal.h"
+
+/**
+ * \name tbf
+ *
+ * Token Bucket Filter over client NIDs
+ *
+ * @{
+ */
+
+#define NRS_POL_NAME_TBF "tbf"
+
+static int tbf_jobid_cache_size = 8192;
+module_param(tbf_jobid_cache_size, int, 0644);
+MODULE_PARM_DESC(tbf_jobid_cache_size, "The size of jobid cache");
+
+static int tbf_rate = 10000;
+module_param(tbf_rate, int, 0644);
+MODULE_PARM_DESC(tbf_rate, "Default rate limit in RPCs/s");
+
+static int tbf_depth = 3;
+module_param(tbf_depth, int, 0644);
+MODULE_PARM_DESC(tbf_depth, "How many tokens a client can save up");
+
+static enum hrtimer_restart nrs_tbf_timer_cb(struct hrtimer *timer)
+{
+ struct nrs_tbf_head *head = container_of(timer, struct nrs_tbf_head,
+ th_timer);
+ struct ptlrpc_nrs *nrs = head->th_res.res_policy->pol_nrs;
+ struct ptlrpc_service_part *svcpt = nrs->nrs_svcpt;
+
+ nrs->nrs_throttling = 0;
+ wake_up(&svcpt->scp_waitq);
+
+ return HRTIMER_NORESTART;
+}
+
+#define NRS_TBF_DEFAULT_RULE "default"
+
+static void nrs_tbf_rule_fini(struct nrs_tbf_rule *rule)
+{
+ LASSERT(atomic_read(&rule->tr_ref) == 0);
+ LASSERT(list_empty(&rule->tr_cli_list));
+ LASSERT(list_empty(&rule->tr_linkage));
+
+ rule->tr_head->th_ops->o_rule_fini(rule);
+ OBD_FREE_PTR(rule);
+}
+
+/**
+ * Decreases the rule's usage reference count, and stops the rule in case it
+ * was already stopping and has no more outstanding usage references (which
+ * indicates it has no more queued or started requests, and can be safely
+ * stopped).
+ */
+static void nrs_tbf_rule_put(struct nrs_tbf_rule *rule)
+{
+ if (atomic_dec_and_test(&rule->tr_ref))
+ nrs_tbf_rule_fini(rule);
+}
+
+/**
+ * Increases the rule's usage reference count.
+ */ +static inline void nrs_tbf_rule_get(struct nrs_tbf_rule *rule) +{ + atomic_inc(&rule->tr_ref); +} + +static void +nrs_tbf_cli_rule_put(struct nrs_tbf_client *cli) +{ + LASSERT(!list_empty(&cli->tc_linkage)); + LASSERT(cli->tc_rule); + spin_lock(&cli->tc_rule->tr_rule_lock); + list_del_init(&cli->tc_linkage); + spin_unlock(&cli->tc_rule->tr_rule_lock); + nrs_tbf_rule_put(cli->tc_rule); + cli->tc_rule = NULL; +} + +static void +nrs_tbf_cli_reset_value(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) + +{ + struct nrs_tbf_rule *rule = cli->tc_rule; + + cli->tc_rpc_rate = rule->tr_rpc_rate; + cli->tc_nsecs = rule->tr_nsecs_per_rpc; + cli->tc_depth = rule->tr_depth; + cli->tc_ntoken = rule->tr_depth; + cli->tc_check_time = ktime_to_ns(ktime_get()); + cli->tc_rule_sequence = atomic_read(&head->th_rule_sequence); + cli->tc_rule_generation = rule->tr_generation; + + if (cli->tc_in_heap) + binheap_relocate(head->th_binheap, + &cli->tc_node); +} + +static void +nrs_tbf_cli_reset(struct nrs_tbf_head *head, + struct nrs_tbf_rule *rule, + struct nrs_tbf_client *cli) +{ + spin_lock(&cli->tc_rule_lock); + if (cli->tc_rule != NULL && !list_empty(&cli->tc_linkage)) { + LASSERT(rule != cli->tc_rule); + nrs_tbf_cli_rule_put(cli); + } + LASSERT(cli->tc_rule == NULL); + LASSERT(list_empty(&cli->tc_linkage)); + /* Rule's ref is added before called */ + cli->tc_rule = rule; + spin_lock(&rule->tr_rule_lock); + list_add_tail(&cli->tc_linkage, &rule->tr_cli_list); + spin_unlock(&rule->tr_rule_lock); + spin_unlock(&cli->tc_rule_lock); + nrs_tbf_cli_reset_value(head, cli); +} + +static int +nrs_tbf_rule_dump(struct nrs_tbf_rule *rule, struct seq_file *m) +{ + return rule->tr_head->th_ops->o_rule_dump(rule, m); +} + +static int +nrs_tbf_rule_dump_all(struct nrs_tbf_head *head, struct seq_file *m) +{ + struct nrs_tbf_rule *rule; + int rc = 0; + + LASSERT(head != NULL); + spin_lock(&head->th_rule_lock); + /* List the rules from newest to oldest */ + list_for_each_entry(rule, &head->th_list, tr_linkage) { + LASSERT((rule->tr_flags & NTRS_STOPPING) == 0); + rc = nrs_tbf_rule_dump(rule, m); + if (rc) { + rc = -ENOSPC; + break; + } + } + spin_unlock(&head->th_rule_lock); + + return rc; +} + +static struct nrs_tbf_rule * +nrs_tbf_rule_find_nolock(struct nrs_tbf_head *head, + const char *name) +{ + struct nrs_tbf_rule *rule; + + LASSERT(head != NULL); + list_for_each_entry(rule, &head->th_list, tr_linkage) { + LASSERT((rule->tr_flags & NTRS_STOPPING) == 0); + if (strcmp(rule->tr_name, name) == 0) { + nrs_tbf_rule_get(rule); + return rule; + } + } + return NULL; +} + +static struct nrs_tbf_rule * +nrs_tbf_rule_find(struct nrs_tbf_head *head, + const char *name) +{ + struct nrs_tbf_rule *rule; + + LASSERT(head != NULL); + spin_lock(&head->th_rule_lock); + rule = nrs_tbf_rule_find_nolock(head, name); + spin_unlock(&head->th_rule_lock); + return rule; +} + +static struct nrs_tbf_rule * +nrs_tbf_rule_match(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) +{ + struct nrs_tbf_rule *rule = NULL; + struct nrs_tbf_rule *tmp_rule; + + spin_lock(&head->th_rule_lock); + /* Match the newest rule in the list */ + list_for_each_entry(tmp_rule, &head->th_list, tr_linkage) { + LASSERT((tmp_rule->tr_flags & NTRS_STOPPING) == 0); + if (head->th_ops->o_rule_match(tmp_rule, cli)) { + rule = tmp_rule; + break; + } + } + + if (rule == NULL) + rule = head->th_rule; + + nrs_tbf_rule_get(rule); + spin_unlock(&head->th_rule_lock); + return rule; +} + +static void +nrs_tbf_cli_init(struct nrs_tbf_head *head, + struct nrs_tbf_client 
*cli, + struct ptlrpc_request *req) +{ + struct nrs_tbf_rule *rule; + + memset(cli, 0, sizeof(*cli)); + cli->tc_in_heap = false; + head->th_ops->o_cli_init(cli, req); + INIT_LIST_HEAD(&cli->tc_list); + INIT_LIST_HEAD(&cli->tc_linkage); + spin_lock_init(&cli->tc_rule_lock); + atomic_set(&cli->tc_ref, 1); + rule = nrs_tbf_rule_match(head, cli); + nrs_tbf_cli_reset(head, rule, cli); +} + +static void +nrs_tbf_cli_fini(struct nrs_tbf_client *cli) +{ + LASSERT(list_empty(&cli->tc_list)); + LASSERT(!cli->tc_in_heap); + LASSERT(atomic_read(&cli->tc_ref) == 0); + spin_lock(&cli->tc_rule_lock); + nrs_tbf_cli_rule_put(cli); + spin_unlock(&cli->tc_rule_lock); + OBD_FREE_PTR(cli); +} + +static int +nrs_tbf_rule_start(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head, + struct nrs_tbf_cmd *start) +{ + struct nrs_tbf_rule *rule; + struct nrs_tbf_rule *tmp_rule; + struct nrs_tbf_rule *next_rule; + char *next_name = start->u.tc_start.ts_next_name; + int rc; + + rule = nrs_tbf_rule_find(head, start->tc_name); + if (rule) { + nrs_tbf_rule_put(rule); + return -EEXIST; + } + + OBD_CPT_ALLOC_PTR(rule, nrs_pol2cptab(policy), nrs_pol2cptid(policy)); + if (rule == NULL) + return -ENOMEM; + + strlcpy(rule->tr_name, start->tc_name, sizeof(rule->tr_name)); + rule->tr_rpc_rate = start->u.tc_start.ts_rpc_rate; + rule->tr_flags = start->u.tc_start.ts_rule_flags; + rule->tr_nsecs_per_rpc = NSEC_PER_SEC / rule->tr_rpc_rate; + rule->tr_depth = tbf_depth; + atomic_set(&rule->tr_ref, 1); + INIT_LIST_HEAD(&rule->tr_cli_list); + INIT_LIST_HEAD(&rule->tr_nids); + INIT_LIST_HEAD(&rule->tr_linkage); + spin_lock_init(&rule->tr_rule_lock); + rule->tr_head = head; + + rc = head->th_ops->o_rule_init(policy, rule, start); + if (rc) { + OBD_FREE_PTR(rule); + return rc; + } + + /* Add as the newest rule */ + spin_lock(&head->th_rule_lock); + tmp_rule = nrs_tbf_rule_find_nolock(head, start->tc_name); + if (tmp_rule) { + spin_unlock(&head->th_rule_lock); + nrs_tbf_rule_put(tmp_rule); + nrs_tbf_rule_put(rule); + return -EEXIST; + } + + if (next_name) { + next_rule = nrs_tbf_rule_find_nolock(head, next_name); + if (!next_rule) { + spin_unlock(&head->th_rule_lock); + nrs_tbf_rule_put(rule); + return -ENOENT; + } + + list_add(&rule->tr_linkage, next_rule->tr_linkage.prev); + nrs_tbf_rule_put(next_rule); + } else { + /* Add on the top of the rule list */ + list_add(&rule->tr_linkage, &head->th_list); + } + spin_unlock(&head->th_rule_lock); + atomic_inc(&head->th_rule_sequence); + if (start->u.tc_start.ts_rule_flags & NTRS_DEFAULT) { + rule->tr_flags |= NTRS_DEFAULT; + LASSERT(head->th_rule == NULL); + head->th_rule = rule; + } + + CDEBUG(D_RPCTRACE, "TBF starts rule@%p rate %llu gen %llu\n", + rule, rule->tr_rpc_rate, rule->tr_generation); + + return 0; +} + +/** + * Change the rank of a rule in the rule list + * + * The matched rule will be moved to the position right before another + * given rule. 
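+ *
+ * For example, if the rule list holds A, B and C, with A the newest (and
+ * so the first to be matched), moving C before A yields C, A, B, making
+ * C the rule that is tried first from then on.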
+ * + * \param[in] policy the policy instance + * \param[in] head the TBF policy instance + * \param[in] name the rule name to be moved + * \param[in] next_name the rule name before which the matched rule will be + * moved + * + */ +static int +nrs_tbf_rule_change_rank(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head, + char *name, + char *next_name) +{ + struct nrs_tbf_rule *rule = NULL; + struct nrs_tbf_rule *next_rule = NULL; + int rc = 0; + + LASSERT(head != NULL); + + spin_lock(&head->th_rule_lock); + rule = nrs_tbf_rule_find_nolock(head, name); + if (!rule) + GOTO(out, rc = -ENOENT); + + if (strcmp(name, next_name) == 0) + GOTO(out_put, rc); + + next_rule = nrs_tbf_rule_find_nolock(head, next_name); + if (!next_rule) + GOTO(out_put, rc = -ENOENT); + + /* rules may be adjacent in same list, so list_move() isn't safe here */ + list_move_tail(&rule->tr_linkage, &next_rule->tr_linkage); + nrs_tbf_rule_put(next_rule); +out_put: + nrs_tbf_rule_put(rule); +out: + spin_unlock(&head->th_rule_lock); + return rc; +} + +static int +nrs_tbf_rule_change_rate(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head, + char *name, + __u64 rate) +{ + struct nrs_tbf_rule *rule; + + assert_spin_locked(&policy->pol_nrs->nrs_lock); + + rule = nrs_tbf_rule_find(head, name); + if (rule == NULL) + return -ENOENT; + + rule->tr_rpc_rate = rate; + rule->tr_nsecs_per_rpc = NSEC_PER_SEC / rule->tr_rpc_rate; + rule->tr_generation++; + nrs_tbf_rule_put(rule); + + return 0; +} + +static int +nrs_tbf_rule_change(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head, + struct nrs_tbf_cmd *change) +{ + __u64 rate = change->u.tc_change.tc_rpc_rate; + char *next_name = change->u.tc_change.tc_next_name; + int rc; + + if (rate != 0) { + rc = nrs_tbf_rule_change_rate(policy, head, change->tc_name, + rate); + if (rc) + return rc; + } + + if (next_name) { + rc = nrs_tbf_rule_change_rank(policy, head, change->tc_name, + next_name); + if (rc) + return rc; + } + + return 0; +} + +static int +nrs_tbf_rule_stop(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head, + struct nrs_tbf_cmd *stop) +{ + struct nrs_tbf_rule *rule; + + assert_spin_locked(&policy->pol_nrs->nrs_lock); + + if (strcmp(stop->tc_name, NRS_TBF_DEFAULT_RULE) == 0) + return -EPERM; + + rule = nrs_tbf_rule_find(head, stop->tc_name); + if (rule == NULL) + return -ENOENT; + + list_del_init(&rule->tr_linkage); + rule->tr_flags |= NTRS_STOPPING; + nrs_tbf_rule_put(rule); + nrs_tbf_rule_put(rule); + + return 0; +} + +static int +nrs_tbf_command(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head, + struct nrs_tbf_cmd *cmd) +{ + int rc; + + assert_spin_locked(&policy->pol_nrs->nrs_lock); + + switch (cmd->tc_cmd) { + case NRS_CTL_TBF_START_RULE: + if (cmd->u.tc_start.ts_valid_type != head->th_type_flag) + return -EINVAL; + + spin_unlock(&policy->pol_nrs->nrs_lock); + rc = nrs_tbf_rule_start(policy, head, cmd); + spin_lock(&policy->pol_nrs->nrs_lock); + return rc; + case NRS_CTL_TBF_CHANGE_RULE: + rc = nrs_tbf_rule_change(policy, head, cmd); + return rc; + case NRS_CTL_TBF_STOP_RULE: + rc = nrs_tbf_rule_stop(policy, head, cmd); + /* Take it as a success, if not exists at all */ + return rc == -ENOENT ? 0 : rc; + default: + return -EFAULT; + } +} + +/** + * Binary heap predicate. 
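+ * Compares two TBF clients so that the client with the earlier token
+ * deadline (or, on a tie, the earlier check time) sorts towards the root
+ * of the min-heap and is therefore dispatched first.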
+ *
+ * \param[in] e1 the first binheap node to compare
+ * \param[in] e2 the second binheap node to compare
+ *
+ * \retval 0 e1 > e2
+ * \retval 1 e1 < e2
+ */
+static int
+tbf_cli_compare(struct binheap_node *e1, struct binheap_node *e2)
+{
+ struct nrs_tbf_client *cli1;
+ struct nrs_tbf_client *cli2;
+
+ cli1 = container_of(e1, struct nrs_tbf_client, tc_node);
+ cli2 = container_of(e2, struct nrs_tbf_client, tc_node);
+
+ if (cli1->tc_deadline < cli2->tc_deadline)
+ return 1;
+ else if (cli1->tc_deadline > cli2->tc_deadline)
+ return 0;
+
+ if (cli1->tc_check_time < cli2->tc_check_time)
+ return 1;
+ else if (cli1->tc_check_time > cli2->tc_check_time)
+ return 0;
+
+ /* Maybe need more comparison, e.g. request number in the rules */
+ return 1;
+}
+
+/**
+ * TBF binary heap operations
+ */
+static struct binheap_ops nrs_tbf_heap_ops = {
+ .hop_enter = NULL,
+ .hop_exit = NULL,
+ .hop_compare = tbf_cli_compare,
+};
+
+static unsigned nrs_tbf_jobid_hop_hash(struct cfs_hash *hs, const void *key,
+ unsigned mask)
+{
+ return cfs_hash_djb2_hash(key, strlen(key), mask);
+}
+
+static int nrs_tbf_jobid_hop_keycmp(const void *key, struct hlist_node *hnode)
+{
+ struct nrs_tbf_client *cli = hlist_entry(hnode,
+ struct nrs_tbf_client,
+ tc_hnode);
+
+ return (strcmp(cli->tc_jobid, key) == 0);
+}
+
+static void *nrs_tbf_jobid_hop_key(struct hlist_node *hnode)
+{
+ struct nrs_tbf_client *cli = hlist_entry(hnode,
+ struct nrs_tbf_client,
+ tc_hnode);
+
+ return cli->tc_jobid;
+}
+
+static void *nrs_tbf_hop_object(struct hlist_node *hnode)
+{
+ return hlist_entry(hnode, struct nrs_tbf_client, tc_hnode);
+}
+
+static void nrs_tbf_jobid_hop_get(struct cfs_hash *hs, struct hlist_node *hnode)
+{
+ struct nrs_tbf_client *cli = hlist_entry(hnode,
+ struct nrs_tbf_client,
+ tc_hnode);
+
+ atomic_inc(&cli->tc_ref);
+}
+
+static void nrs_tbf_jobid_hop_put(struct cfs_hash *hs, struct hlist_node *hnode)
+{
+ struct nrs_tbf_client *cli = hlist_entry(hnode,
+ struct nrs_tbf_client,
+ tc_hnode);
+
+ atomic_dec(&cli->tc_ref);
+}
+
+static void
+nrs_tbf_jobid_hop_exit(struct cfs_hash *hs, struct hlist_node *hnode)
+{
+ struct nrs_tbf_client *cli = hlist_entry(hnode,
+ struct nrs_tbf_client,
+ tc_hnode);
+
+ LASSERT(atomic_read(&cli->tc_ref) == 0);
+ nrs_tbf_cli_fini(cli);
+}
+
+static struct cfs_hash_ops nrs_tbf_jobid_hash_ops = {
+ .hs_hash = nrs_tbf_jobid_hop_hash,
+ .hs_keycmp = nrs_tbf_jobid_hop_keycmp,
+ .hs_key = nrs_tbf_jobid_hop_key,
+ .hs_object = nrs_tbf_hop_object,
+ .hs_get = nrs_tbf_jobid_hop_get,
+ .hs_put = nrs_tbf_jobid_hop_put,
+ .hs_put_locked = nrs_tbf_jobid_hop_put,
+ .hs_exit = nrs_tbf_jobid_hop_exit,
+};
+
+#define NRS_TBF_JOBID_HASH_FLAGS (CFS_HASH_SPIN_BKTLOCK | \
+ CFS_HASH_NO_ITEMREF | \
+ CFS_HASH_DEPTH)
+
+static struct nrs_tbf_client *
+nrs_tbf_jobid_hash_lookup(struct cfs_hash *hs,
+ struct cfs_hash_bd *bd,
+ const char *jobid)
+{
+ struct hlist_node *hnode;
+ struct nrs_tbf_client *cli;
+
+ hnode = cfs_hash_bd_lookup_locked(hs, bd, (void *)jobid);
+ if (hnode == NULL)
+ return NULL;
+
+ cli = container_of(hnode, struct nrs_tbf_client, tc_hnode);
+ if (!list_empty(&cli->tc_lru))
+ list_del_init(&cli->tc_lru);
+ return cli;
+}
+
+#define NRS_TBF_JOBID_NULL ""
+
+static struct nrs_tbf_client *
+nrs_tbf_jobid_cli_find(struct nrs_tbf_head *head,
+ struct ptlrpc_request *req)
+{
+ const char *jobid;
+ struct nrs_tbf_client *cli;
+ struct cfs_hash *hs = head->th_cli_hash;
+ struct cfs_hash_bd bd;
+
+ jobid = lustre_msg_get_jobid(req->rq_reqmsg);
+ if (jobid == NULL)
+ jobid =
NRS_TBF_JOBID_NULL; + cfs_hash_bd_get_and_lock(hs, (void *)jobid, &bd, 1); + cli = nrs_tbf_jobid_hash_lookup(hs, &bd, jobid); + cfs_hash_bd_unlock(hs, &bd, 1); + + return cli; +} + +static struct nrs_tbf_client * +nrs_tbf_jobid_cli_findadd(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) +{ + const char *jobid; + struct nrs_tbf_client *ret; + struct cfs_hash *hs = head->th_cli_hash; + struct cfs_hash_bd bd; + + jobid = cli->tc_jobid; + cfs_hash_bd_get_and_lock(hs, (void *)jobid, &bd, 1); + ret = nrs_tbf_jobid_hash_lookup(hs, &bd, jobid); + if (ret == NULL) { + cfs_hash_bd_add_locked(hs, &bd, &cli->tc_hnode); + ret = cli; + } + cfs_hash_bd_unlock(hs, &bd, 1); + + return ret; +} + +static void +nrs_tbf_jobid_cli_put(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) +{ + struct cfs_hash_bd bd; + struct cfs_hash *hs = head->th_cli_hash; + struct nrs_tbf_bucket *bkt; + int hw; + LIST_HEAD(zombies); + + cfs_hash_bd_get(hs, &cli->tc_jobid, &bd); + bkt = cfs_hash_bd_extra_get(hs, &bd); + if (!cfs_hash_bd_dec_and_lock(hs, &bd, &cli->tc_ref)) + return; + LASSERT(list_empty(&cli->tc_lru)); + list_add_tail(&cli->tc_lru, &bkt->ntb_lru); + + /* + * Check and purge the LRU, there is at least one client in the LRU. + */ + hw = tbf_jobid_cache_size >> + (hs->hs_cur_bits - hs->hs_bkt_bits); + while (cfs_hash_bd_count_get(&bd) > hw) { + if (unlikely(list_empty(&bkt->ntb_lru))) + break; + cli = list_entry(bkt->ntb_lru.next, + struct nrs_tbf_client, + tc_lru); + LASSERT(atomic_read(&cli->tc_ref) == 0); + cfs_hash_bd_del_locked(hs, &bd, &cli->tc_hnode); + list_move(&cli->tc_lru, &zombies); + } + cfs_hash_bd_unlock(head->th_cli_hash, &bd, 1); + + while (!list_empty(&zombies)) { + cli = container_of(zombies.next, + struct nrs_tbf_client, tc_lru); + list_del_init(&cli->tc_lru); + nrs_tbf_cli_fini(cli); + } +} + +static void +nrs_tbf_jobid_cli_init(struct nrs_tbf_client *cli, + struct ptlrpc_request *req) +{ + char *jobid = lustre_msg_get_jobid(req->rq_reqmsg); + + if (jobid == NULL) + jobid = NRS_TBF_JOBID_NULL; + LASSERT(strlen(jobid) < LUSTRE_JOBID_SIZE); + INIT_LIST_HEAD(&cli->tc_lru); + memcpy(cli->tc_jobid, jobid, strlen(jobid)); +} + +static int nrs_tbf_jobid_hash_order(void) +{ + int bits; + + for (bits = 1; (1 << bits) < tbf_jobid_cache_size; ++bits) + ; + + return bits; +} + +#define NRS_TBF_JOBID_BKT_BITS 10 + +static int +nrs_tbf_jobid_startup(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head) +{ + struct nrs_tbf_cmd start; + struct nrs_tbf_bucket *bkt; + int bits; + int i; + int rc; + struct cfs_hash_bd bd; + + bits = nrs_tbf_jobid_hash_order(); + if (bits < NRS_TBF_JOBID_BKT_BITS) + bits = NRS_TBF_JOBID_BKT_BITS; + head->th_cli_hash = cfs_hash_create("nrs_tbf_hash", + bits, + bits, + NRS_TBF_JOBID_BKT_BITS, + sizeof(*bkt), + 0, + 0, + &nrs_tbf_jobid_hash_ops, + NRS_TBF_JOBID_HASH_FLAGS); + if (head->th_cli_hash == NULL) + return -ENOMEM; + + cfs_hash_for_each_bucket(head->th_cli_hash, &bd, i) { + bkt = cfs_hash_bd_extra_get(head->th_cli_hash, &bd); + INIT_LIST_HEAD(&bkt->ntb_lru); + } + + memset(&start, 0, sizeof(start)); + start.u.tc_start.ts_jobids_str = "*"; + + start.u.tc_start.ts_rpc_rate = tbf_rate; + start.u.tc_start.ts_rule_flags = NTRS_DEFAULT; + start.tc_name = NRS_TBF_DEFAULT_RULE; + INIT_LIST_HEAD(&start.u.tc_start.ts_jobids); + rc = nrs_tbf_rule_start(policy, head, &start); + if (rc) { + cfs_hash_putref(head->th_cli_hash); + head->th_cli_hash = NULL; + } + + return rc; +} + +/** + * Frees jobid of \a list. 
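+ *
+ * \param[in] jobid_list the list of nrs_tbf_jobid entries to free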
+ * + */ +static void +nrs_tbf_jobid_list_free(struct list_head *jobid_list) +{ + struct nrs_tbf_jobid *jobid, *n; + + list_for_each_entry_safe(jobid, n, jobid_list, tj_linkage) { + OBD_FREE(jobid->tj_id, strlen(jobid->tj_id) + 1); + list_del(&jobid->tj_linkage); + OBD_FREE_PTR(jobid); + } +} + +static int +nrs_tbf_jobid_list_add(struct cfs_lstr *id, struct list_head *jobid_list) +{ + struct nrs_tbf_jobid *jobid; + char *ptr; + + OBD_ALLOC_PTR(jobid); + if (jobid == NULL) + return -ENOMEM; + + OBD_ALLOC(jobid->tj_id, id->ls_len + 1); + if (jobid->tj_id == NULL) { + OBD_FREE_PTR(jobid); + return -ENOMEM; + } + + memcpy(jobid->tj_id, id->ls_str, id->ls_len); + ptr = lprocfs_strnstr(id->ls_str, "*", id->ls_len); + if (ptr == NULL) + jobid->tj_match_flag = NRS_TBF_MATCH_FULL; + else + jobid->tj_match_flag = NRS_TBF_MATCH_WILDCARD; + + list_add_tail(&jobid->tj_linkage, jobid_list); + return 0; +} + +static bool +cfs_match_wildcard(const char *pattern, const char *content) +{ + if (*pattern == '\0' && *content == '\0') + return true; + + if (*pattern == '*' && *(pattern + 1) != '\0' && *content == '\0') + return false; + + while (*pattern == *content) { + pattern++; + content++; + if (*pattern == '\0' && *content == '\0') + return true; + + if (*pattern == '*' && *(pattern + 1) != '\0' && + *content == '\0') + return false; + } + + if (*pattern == '*') + return (cfs_match_wildcard(pattern + 1, content) || + cfs_match_wildcard(pattern, content + 1)); + + return false; +} + +static inline bool +nrs_tbf_jobid_match(const struct nrs_tbf_jobid *jobid, const char *id) +{ + if (jobid->tj_match_flag == NRS_TBF_MATCH_FULL) + return strcmp(jobid->tj_id, id) == 0; + + if (jobid->tj_match_flag == NRS_TBF_MATCH_WILDCARD) + return cfs_match_wildcard(jobid->tj_id, id); + + return false; +} + +static int +nrs_tbf_jobid_list_match(struct list_head *jobid_list, char *id) +{ + struct nrs_tbf_jobid *jobid; + + list_for_each_entry(jobid, jobid_list, tj_linkage) { + if (nrs_tbf_jobid_match(jobid, id)) + return 1; + } + return 0; +} + +static int +nrs_tbf_jobid_list_parse(char *str, int len, struct list_head *jobid_list) +{ + struct cfs_lstr src; + struct cfs_lstr res; + int rc = 0; + ENTRY; + + src.ls_str = str; + src.ls_len = len; + INIT_LIST_HEAD(jobid_list); + while (src.ls_str) { + rc = cfs_gettok(&src, ' ', &res); + if (rc == 0) { + rc = -EINVAL; + break; + } + rc = nrs_tbf_jobid_list_add(&res, jobid_list); + if (rc) + break; + } + if (rc) + nrs_tbf_jobid_list_free(jobid_list); + RETURN(rc); +} + +static void nrs_tbf_jobid_cmd_fini(struct nrs_tbf_cmd *cmd) +{ + if (!list_empty(&cmd->u.tc_start.ts_jobids)) + nrs_tbf_jobid_list_free(&cmd->u.tc_start.ts_jobids); + if (cmd->u.tc_start.ts_jobids_str) + OBD_FREE(cmd->u.tc_start.ts_jobids_str, + strlen(cmd->u.tc_start.ts_jobids_str) + 1); +} + +static int nrs_tbf_check_id_value(struct cfs_lstr *src, char *key) +{ + struct cfs_lstr res; + int keylen = strlen(key); + int rc; + + rc = cfs_gettok(src, '=', &res); + if (rc == 0 || res.ls_len != keylen || + strncmp(res.ls_str, key, keylen) != 0 || + !src->ls_str || src->ls_len <= 2 || + src->ls_str[0] != '{' || src->ls_str[src->ls_len - 1] != '}') + return -EINVAL; + + /* Skip '{' and '}' */ + src->ls_str++; + src->ls_len -= 2; + return 0; +} + +static int nrs_tbf_jobid_parse(struct nrs_tbf_cmd *cmd, char *id) +{ + struct cfs_lstr src; + int rc; + + src.ls_str = id; + src.ls_len = strlen(id); + rc = nrs_tbf_check_id_value(&src, "jobid"); + if (rc) + return rc; + + OBD_ALLOC(cmd->u.tc_start.ts_jobids_str, src.ls_len + 1); + 
if (cmd->u.tc_start.ts_jobids_str == NULL) + return -ENOMEM; + + memcpy(cmd->u.tc_start.ts_jobids_str, src.ls_str, src.ls_len); + + /* parse jobid list */ + rc = nrs_tbf_jobid_list_parse(cmd->u.tc_start.ts_jobids_str, + strlen(cmd->u.tc_start.ts_jobids_str), + &cmd->u.tc_start.ts_jobids); + if (rc) + nrs_tbf_jobid_cmd_fini(cmd); + + return rc; +} + +static int nrs_tbf_jobid_rule_init(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_rule *rule, + struct nrs_tbf_cmd *start) +{ + int rc = 0; + + LASSERT(start->u.tc_start.ts_jobids_str); + OBD_ALLOC(rule->tr_jobids_str, + strlen(start->u.tc_start.ts_jobids_str) + 1); + if (rule->tr_jobids_str == NULL) + return -ENOMEM; + + memcpy(rule->tr_jobids_str, + start->u.tc_start.ts_jobids_str, + strlen(start->u.tc_start.ts_jobids_str)); + + INIT_LIST_HEAD(&rule->tr_jobids); + if (!list_empty(&start->u.tc_start.ts_jobids)) { + rc = nrs_tbf_jobid_list_parse(rule->tr_jobids_str, + strlen(rule->tr_jobids_str), + &rule->tr_jobids); + if (rc) + CERROR("jobids {%s} illegal\n", rule->tr_jobids_str); + } + if (rc) + OBD_FREE(rule->tr_jobids_str, + strlen(start->u.tc_start.ts_jobids_str) + 1); + return rc; +} + +static int +nrs_tbf_jobid_rule_dump(struct nrs_tbf_rule *rule, struct seq_file *m) +{ + seq_printf(m, "%s {%s} %llu, ref %d\n", rule->tr_name, + rule->tr_jobids_str, rule->tr_rpc_rate, + atomic_read(&rule->tr_ref) - 1); + return 0; +} + +static int +nrs_tbf_jobid_rule_match(struct nrs_tbf_rule *rule, + struct nrs_tbf_client *cli) +{ + return nrs_tbf_jobid_list_match(&rule->tr_jobids, cli->tc_jobid); +} + +static void nrs_tbf_jobid_rule_fini(struct nrs_tbf_rule *rule) +{ + if (!list_empty(&rule->tr_jobids)) + nrs_tbf_jobid_list_free(&rule->tr_jobids); + LASSERT(rule->tr_jobids_str != NULL); + OBD_FREE(rule->tr_jobids_str, strlen(rule->tr_jobids_str) + 1); +} + +static struct nrs_tbf_ops nrs_tbf_jobid_ops = { + .o_name = NRS_TBF_TYPE_JOBID, + .o_startup = nrs_tbf_jobid_startup, + .o_cli_find = nrs_tbf_jobid_cli_find, + .o_cli_findadd = nrs_tbf_jobid_cli_findadd, + .o_cli_put = nrs_tbf_jobid_cli_put, + .o_cli_init = nrs_tbf_jobid_cli_init, + .o_rule_init = nrs_tbf_jobid_rule_init, + .o_rule_dump = nrs_tbf_jobid_rule_dump, + .o_rule_match = nrs_tbf_jobid_rule_match, + .o_rule_fini = nrs_tbf_jobid_rule_fini, +}; + +/** + * libcfs_hash operations for nrs_tbf_net::cn_cli_hash + * + * This uses ptlrpc_request::rq_peer.nid as its key, in order to hash + * nrs_tbf_client objects. 
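+ * Requests from different NIDs therefore land in distinct nrs_tbf_client
+ * objects, e.g. 192.168.1.1@tcp and 192.168.1.2@tcp become two separate
+ * TBF classes (illustrative NIDs).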
+ */ +#define NRS_TBF_NID_BKT_BITS 8 +#define NRS_TBF_NID_BITS 16 + +static unsigned nrs_tbf_nid_hop_hash(struct cfs_hash *hs, const void *key, + unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(lnet_nid_t), mask); +} + +static int nrs_tbf_nid_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + lnet_nid_t *nid = (lnet_nid_t *)key; + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + return *nid == cli->tc_nid; +} + +static void *nrs_tbf_nid_hop_key(struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + return &cli->tc_nid; +} + +static void nrs_tbf_nid_hop_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_inc(&cli->tc_ref); +} + +static void nrs_tbf_nid_hop_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_dec(&cli->tc_ref); +} + +static void nrs_tbf_nid_hop_exit(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + LASSERTF(atomic_read(&cli->tc_ref) == 0, + "Busy TBF object from client with NID %s, with %d refs\n", + libcfs_nid2str(cli->tc_nid), atomic_read(&cli->tc_ref)); + + nrs_tbf_cli_fini(cli); +} + +static struct cfs_hash_ops nrs_tbf_nid_hash_ops = { + .hs_hash = nrs_tbf_nid_hop_hash, + .hs_keycmp = nrs_tbf_nid_hop_keycmp, + .hs_key = nrs_tbf_nid_hop_key, + .hs_object = nrs_tbf_hop_object, + .hs_get = nrs_tbf_nid_hop_get, + .hs_put = nrs_tbf_nid_hop_put, + .hs_put_locked = nrs_tbf_nid_hop_put, + .hs_exit = nrs_tbf_nid_hop_exit, +}; + +static struct nrs_tbf_client * +nrs_tbf_nid_cli_find(struct nrs_tbf_head *head, + struct ptlrpc_request *req) +{ + return cfs_hash_lookup(head->th_cli_hash, &req->rq_peer.nid); +} + +static struct nrs_tbf_client * +nrs_tbf_nid_cli_findadd(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) +{ + return cfs_hash_findadd_unique(head->th_cli_hash, &cli->tc_nid, + &cli->tc_hnode); +} + +static void +nrs_tbf_nid_cli_put(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) +{ + cfs_hash_put(head->th_cli_hash, &cli->tc_hnode); +} + +static int +nrs_tbf_nid_startup(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head) +{ + struct nrs_tbf_cmd start; + int rc; + + head->th_cli_hash = cfs_hash_create("nrs_tbf_hash", + NRS_TBF_NID_BITS, + NRS_TBF_NID_BITS, + NRS_TBF_NID_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &nrs_tbf_nid_hash_ops, + CFS_HASH_RW_BKTLOCK); + if (head->th_cli_hash == NULL) + return -ENOMEM; + + memset(&start, 0, sizeof(start)); + start.u.tc_start.ts_nids_str = "*"; + + start.u.tc_start.ts_rpc_rate = tbf_rate; + start.u.tc_start.ts_rule_flags = NTRS_DEFAULT; + start.tc_name = NRS_TBF_DEFAULT_RULE; + INIT_LIST_HEAD(&start.u.tc_start.ts_nids); + rc = nrs_tbf_rule_start(policy, head, &start); + if (rc) { + cfs_hash_putref(head->th_cli_hash); + head->th_cli_hash = NULL; + } + + return rc; +} + +static void +nrs_tbf_nid_cli_init(struct nrs_tbf_client *cli, + struct ptlrpc_request *req) +{ + cli->tc_nid = req->rq_peer.nid; +} + +static int nrs_tbf_nid_rule_init(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_rule *rule, + struct nrs_tbf_cmd *start) +{ + LASSERT(start->u.tc_start.ts_nids_str); + OBD_ALLOC(rule->tr_nids_str, + strlen(start->u.tc_start.ts_nids_str) + 1); + if (rule->tr_nids_str == NULL) + return 
-ENOMEM; + + memcpy(rule->tr_nids_str, + start->u.tc_start.ts_nids_str, + strlen(start->u.tc_start.ts_nids_str)); + + INIT_LIST_HEAD(&rule->tr_nids); + if (!list_empty(&start->u.tc_start.ts_nids)) { + if (cfs_parse_nidlist(rule->tr_nids_str, + strlen(rule->tr_nids_str), + &rule->tr_nids) <= 0) { + CERROR("nids {%s} illegal\n", + rule->tr_nids_str); + OBD_FREE(rule->tr_nids_str, + strlen(start->u.tc_start.ts_nids_str) + 1); + return -EINVAL; + } + } + return 0; +} + +static int +nrs_tbf_nid_rule_dump(struct nrs_tbf_rule *rule, struct seq_file *m) +{ + seq_printf(m, "%s {%s} %llu, ref %d\n", rule->tr_name, + rule->tr_nids_str, rule->tr_rpc_rate, + atomic_read(&rule->tr_ref) - 1); + return 0; +} + +static int +nrs_tbf_nid_rule_match(struct nrs_tbf_rule *rule, + struct nrs_tbf_client *cli) +{ + return cfs_match_nid(cli->tc_nid, &rule->tr_nids); +} + +static void nrs_tbf_nid_rule_fini(struct nrs_tbf_rule *rule) +{ + if (!list_empty(&rule->tr_nids)) + cfs_free_nidlist(&rule->tr_nids); + LASSERT(rule->tr_nids_str != NULL); + OBD_FREE(rule->tr_nids_str, strlen(rule->tr_nids_str) + 1); +} + +static void nrs_tbf_nid_cmd_fini(struct nrs_tbf_cmd *cmd) +{ + if (!list_empty(&cmd->u.tc_start.ts_nids)) + cfs_free_nidlist(&cmd->u.tc_start.ts_nids); + if (cmd->u.tc_start.ts_nids_str) + OBD_FREE(cmd->u.tc_start.ts_nids_str, + strlen(cmd->u.tc_start.ts_nids_str) + 1); +} + +static int nrs_tbf_nid_parse(struct nrs_tbf_cmd *cmd, char *id) +{ + struct cfs_lstr src; + int rc; + + src.ls_str = id; + src.ls_len = strlen(id); + rc = nrs_tbf_check_id_value(&src, "nid"); + if (rc) + return rc; + + OBD_ALLOC(cmd->u.tc_start.ts_nids_str, src.ls_len + 1); + if (cmd->u.tc_start.ts_nids_str == NULL) + return -ENOMEM; + + memcpy(cmd->u.tc_start.ts_nids_str, src.ls_str, src.ls_len); + + /* parse NID list */ + if (cfs_parse_nidlist(cmd->u.tc_start.ts_nids_str, + strlen(cmd->u.tc_start.ts_nids_str), + &cmd->u.tc_start.ts_nids) <= 0) { + nrs_tbf_nid_cmd_fini(cmd); + return -EINVAL; + } + + return 0; +} + +static struct nrs_tbf_ops nrs_tbf_nid_ops = { + .o_name = NRS_TBF_TYPE_NID, + .o_startup = nrs_tbf_nid_startup, + .o_cli_find = nrs_tbf_nid_cli_find, + .o_cli_findadd = nrs_tbf_nid_cli_findadd, + .o_cli_put = nrs_tbf_nid_cli_put, + .o_cli_init = nrs_tbf_nid_cli_init, + .o_rule_init = nrs_tbf_nid_rule_init, + .o_rule_dump = nrs_tbf_nid_rule_dump, + .o_rule_match = nrs_tbf_nid_rule_match, + .o_rule_fini = nrs_tbf_nid_rule_fini, +}; + +static unsigned nrs_tbf_hop_hash(struct cfs_hash *hs, const void *key, + unsigned mask) +{ + return cfs_hash_djb2_hash(key, strlen(key), mask); +} + +static int nrs_tbf_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + return (strcmp(cli->tc_key, key) == 0); +} + +static void *nrs_tbf_hop_key(struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + return cli->tc_key; +} + +static void nrs_tbf_hop_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_inc(&cli->tc_ref); +} + +static void nrs_tbf_hop_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_dec(&cli->tc_ref); +} + +static void nrs_tbf_hop_exit(struct cfs_hash *hs, struct hlist_node *hnode) + +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + 
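+	/* all references must be gone by the time the hash table retires
+	 * the node; reclaim the client below */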
LASSERT(atomic_read(&cli->tc_ref) == 0);
+	nrs_tbf_cli_fini(cli);
+}
+
+static struct cfs_hash_ops nrs_tbf_hash_ops = {
+	.hs_hash	= nrs_tbf_hop_hash,
+	.hs_keycmp	= nrs_tbf_hop_keycmp,
+	.hs_key		= nrs_tbf_hop_key,
+	.hs_object	= nrs_tbf_hop_object,
+	.hs_get		= nrs_tbf_hop_get,
+	.hs_put		= nrs_tbf_hop_put,
+	.hs_put_locked	= nrs_tbf_hop_put,
+	.hs_exit	= nrs_tbf_hop_exit,
+};
+
+#define NRS_TBF_GENERIC_BKT_BITS	10
+#define NRS_TBF_GENERIC_HASH_FLAGS	(CFS_HASH_SPIN_BKTLOCK | \
+					 CFS_HASH_NO_ITEMREF | \
+					 CFS_HASH_DEPTH)
+
+static int
+nrs_tbf_startup(struct ptlrpc_nrs_policy *policy, struct nrs_tbf_head *head)
+{
+	struct nrs_tbf_cmd start;
+	struct nrs_tbf_bucket *bkt;
+	int bits;
+	int i;
+	int rc;
+	struct cfs_hash_bd bd;
+
+	bits = nrs_tbf_jobid_hash_order();
+	if (bits < NRS_TBF_GENERIC_BKT_BITS)
+		bits = NRS_TBF_GENERIC_BKT_BITS;
+	head->th_cli_hash = cfs_hash_create("nrs_tbf_hash",
+					    bits, bits,
+					    NRS_TBF_GENERIC_BKT_BITS,
+					    sizeof(*bkt), 0, 0,
+					    &nrs_tbf_hash_ops,
+					    NRS_TBF_GENERIC_HASH_FLAGS);
+	if (head->th_cli_hash == NULL)
+		return -ENOMEM;
+
+	cfs_hash_for_each_bucket(head->th_cli_hash, &bd, i) {
+		bkt = cfs_hash_bd_extra_get(head->th_cli_hash, &bd);
+		INIT_LIST_HEAD(&bkt->ntb_lru);
+	}
+
+	memset(&start, 0, sizeof(start));
+	start.u.tc_start.ts_conds_str = "*";
+
+	start.u.tc_start.ts_rpc_rate = tbf_rate;
+	start.u.tc_start.ts_rule_flags = NTRS_DEFAULT;
+	start.tc_name = NRS_TBF_DEFAULT_RULE;
+	INIT_LIST_HEAD(&start.u.tc_start.ts_conds);
+	rc = nrs_tbf_rule_start(policy, head, &start);
+	if (rc)
+		cfs_hash_putref(head->th_cli_hash);
+
+	return rc;
+}
+
+static struct nrs_tbf_client *
+nrs_tbf_cli_hash_lookup(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+			const char *key)
+{
+	struct hlist_node *hnode;
+	struct nrs_tbf_client *cli;
+
+	hnode = cfs_hash_bd_lookup_locked(hs, bd, (void *)key);
+	if (hnode == NULL)
+		return NULL;
+
+	cli = container_of(hnode, struct nrs_tbf_client, tc_hnode);
+	if (!list_empty(&cli->tc_lru))
+		list_del_init(&cli->tc_lru);
+	return cli;
+}
+
+/**
+ * Only opcodes listed in this function will be checked in
+ * nrs_tbf_id_cli_set(). That means we can add or remove an
+ * opcode here to enable or disable the requests handled by nrs_tbf.
+ */
+static struct req_format *req_fmt(__u32 opcode)
+{
+	switch (opcode) {
+	case OST_GETATTR:
+		return &RQF_OST_GETATTR;
+	case OST_SETATTR:
+		return &RQF_OST_SETATTR;
+	case OST_READ:
+		return &RQF_OST_BRW_READ;
+	case OST_WRITE:
+		return &RQF_OST_BRW_WRITE;
+	/* FIXME: OST_CREATE and OST_DESTROY come from the MDS
+	 * in most cases. Should they be removed?
*/ + case OST_CREATE: + return &RQF_OST_CREATE; + case OST_DESTROY: + return &RQF_OST_DESTROY; + case OST_PUNCH: + return &RQF_OST_PUNCH; + case OST_SYNC: + return &RQF_OST_SYNC; + case OST_LADVISE: + return &RQF_OST_LADVISE; + case MDS_GETATTR: + return &RQF_MDS_GETATTR; + case MDS_GETATTR_NAME: + return &RQF_MDS_GETATTR_NAME; + /* close is skipped to avoid LDLM cancel slowness */ +#if 0 + case MDS_CLOSE: + return &RQF_MDS_CLOSE; +#endif + case MDS_REINT: + return &RQF_MDS_REINT; + case MDS_READPAGE: + return &RQF_MDS_READPAGE; + case MDS_GET_ROOT: + return &RQF_MDS_GET_ROOT; + case MDS_STATFS: + return &RQF_MDS_STATFS; + case MDS_SYNC: + return &RQF_MDS_SYNC; + case MDS_QUOTACTL: + return &RQF_MDS_QUOTACTL; + case MDS_GETXATTR: + return &RQF_MDS_GETXATTR; + case MDS_GET_INFO: + return &RQF_MDS_GET_INFO; + /* HSM op is skipped */ +#if 0 + case MDS_HSM_STATE_GET: + return &RQF_MDS_HSM_STATE_GET; + case MDS_HSM_STATE_SET: + return &RQF_MDS_HSM_STATE_SET; + case MDS_HSM_ACTION: + return &RQF_MDS_HSM_ACTION; + case MDS_HSM_CT_REGISTER: + return &RQF_MDS_HSM_CT_REGISTER; + case MDS_HSM_CT_UNREGISTER: + return &RQF_MDS_HSM_CT_UNREGISTER; +#endif + case MDS_SWAP_LAYOUTS: + return &RQF_MDS_SWAP_LAYOUTS; + case LDLM_ENQUEUE: + return &RQF_LDLM_ENQUEUE; + default: + return NULL; + } +} + +static struct req_format *intent_req_fmt(__u32 it_opc) +{ + if (it_opc & (IT_OPEN | IT_CREAT)) + return &RQF_LDLM_INTENT_OPEN; + else if (it_opc & (IT_GETATTR | IT_LOOKUP)) + return &RQF_LDLM_INTENT_GETATTR; + else if (it_opc & IT_GETXATTR) + return &RQF_LDLM_INTENT_GETXATTR; + else if (it_opc & (IT_GLIMPSE | IT_BRW)) + return &RQF_LDLM_INTENT; + else + return NULL; +} + +static int ost_tbf_id_cli_set(struct ptlrpc_request *req, + struct tbf_id *id) +{ + struct ost_body *body; + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + if (body != NULL) { + id->ti_uid = body->oa.o_uid; + id->ti_gid = body->oa.o_gid; + return 0; + } + + return -EINVAL; +} + +static void unpack_ugid_from_mdt_body(struct ptlrpc_request *req, + struct tbf_id *id) +{ + struct mdt_body *b = req_capsule_client_get(&req->rq_pill, + &RMF_MDT_BODY); + LASSERT(b != NULL); + + /* TODO: nodemaping feature converts {ug}id from individual + * clients to the actual ones of the file system. Some work + * may be needed to fix this. */ + id->ti_uid = b->mbo_uid; + id->ti_gid = b->mbo_gid; +} + +static void unpack_ugid_from_mdt_rec_reint(struct ptlrpc_request *req, + struct tbf_id *id) +{ + struct mdt_rec_reint *rec; + + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + LASSERT(rec != NULL); + + /* use the fs{ug}id as {ug}id of the process */ + id->ti_uid = rec->rr_fsuid; + id->ti_gid = rec->rr_fsgid; +} + +static int mdt_tbf_id_cli_set(struct ptlrpc_request *req, + struct tbf_id *id) +{ + u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + int rc = 0; + + switch (opc) { + case MDS_GETATTR: + case MDS_GETATTR_NAME: + case MDS_GET_ROOT: + case MDS_READPAGE: + case MDS_SYNC: + case MDS_GETXATTR: + case MDS_HSM_STATE_GET ... 
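+	/* a GCC case-range: every MDS opcode from MDS_HSM_STATE_GET up to
+	 * MDS_SWAP_LAYOUTS (HSM requests included) carries the uid/gid in
+	 * its mdt_body */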
MDS_SWAP_LAYOUTS:
+		unpack_ugid_from_mdt_body(req, id);
+		break;
+	case MDS_CLOSE:
+	case MDS_REINT:
+		unpack_ugid_from_mdt_rec_reint(req, id);
+		break;
+	default:
+		rc = -EINVAL;
+		break;
+	}
+	return rc;
+}
+
+static int ldlm_tbf_id_cli_set(struct ptlrpc_request *req,
+			       struct tbf_id *id)
+{
+	struct ldlm_intent *lit;
+	struct req_format *fmt;
+
+	if (req->rq_reqmsg->lm_bufcount <= DLM_INTENT_IT_OFF)
+		return -EINVAL;
+
+	req_capsule_extend(&req->rq_pill, &RQF_LDLM_INTENT_BASIC);
+	lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT);
+	if (lit == NULL)
+		return -EINVAL;
+
+	fmt = intent_req_fmt(lit->opc);
+	if (fmt == NULL)
+		return -EINVAL;
+
+	req_capsule_extend(&req->rq_pill, fmt);
+
+	if (lit->opc & (IT_GETXATTR | IT_GETATTR | IT_LOOKUP))
+		unpack_ugid_from_mdt_body(req, id);
+	else if (lit->opc & (IT_OPEN | IT_CREAT | IT_GLIMPSE | IT_BRW))
+		unpack_ugid_from_mdt_rec_reint(req, id);
+	else
+		return -EINVAL;
+	return 0;
+}
+
+static int nrs_tbf_id_cli_set(struct ptlrpc_request *req, struct tbf_id *id,
+			      enum nrs_tbf_flag ti_type)
+{
+	u32 opc = lustre_msg_get_opc(req->rq_reqmsg);
+	struct req_format *fmt = req_fmt(opc);
+	bool fmt_unset = false;
+	int rc;
+
+	memset(id, 0, sizeof(struct tbf_id));
+	id->ti_type = ti_type;
+
+	if (fmt == NULL)
+		return -EINVAL;
+	req_capsule_init(&req->rq_pill, req, RCL_SERVER);
+	if (req->rq_pill.rc_fmt == NULL) {
+		req_capsule_set(&req->rq_pill, fmt);
+		fmt_unset = true;
+	}
+
+	if (opc < OST_LAST_OPC)
+		rc = ost_tbf_id_cli_set(req, id);
+	else if (opc >= MDS_FIRST_OPC && opc < MDS_LAST_OPC)
+		rc = mdt_tbf_id_cli_set(req, id);
+	else if (opc == LDLM_ENQUEUE)
+		rc = ldlm_tbf_id_cli_set(req, id);
+	else
+		rc = -EINVAL;
+
+	/* restore it to the initialized state */
+	if (fmt_unset)
+		req->rq_pill.rc_fmt = NULL;
+	return rc;
+}
+
+static inline void nrs_tbf_cli_gen_key(struct nrs_tbf_client *cli,
+				       struct ptlrpc_request *req,
+				       char *keystr, size_t keystr_sz)
+{
+	const char *jobid;
+	u32 opc = lustre_msg_get_opc(req->rq_reqmsg);
+	struct tbf_id id;
+
+	nrs_tbf_id_cli_set(req, &id, NRS_TBF_FLAG_UID | NRS_TBF_FLAG_GID);
+	jobid = lustre_msg_get_jobid(req->rq_reqmsg);
+	if (jobid == NULL)
+		jobid = NRS_TBF_JOBID_NULL;
+
+	snprintf(keystr, keystr_sz, "%s_%s_%d_%u_%u", jobid,
+		 libcfs_nid2str(req->rq_peer.nid), opc, id.ti_uid,
+		 id.ti_gid);
+
+	if (cli) {
+		INIT_LIST_HEAD(&cli->tc_lru);
+		strlcpy(cli->tc_key, keystr, sizeof(cli->tc_key));
+		strlcpy(cli->tc_jobid, jobid, sizeof(cli->tc_jobid));
+		cli->tc_nid = req->rq_peer.nid;
+		cli->tc_opcode = opc;
+		cli->tc_id = id;
+	}
+}
+
+static struct nrs_tbf_client *
+nrs_tbf_cli_find(struct nrs_tbf_head *head, struct ptlrpc_request *req)
+{
+	struct nrs_tbf_client *cli;
+	struct cfs_hash *hs = head->th_cli_hash;
+	struct cfs_hash_bd bd;
+	char keystr[NRS_TBF_KEY_LEN];
+
+	nrs_tbf_cli_gen_key(NULL, req, keystr, sizeof(keystr));
+	cfs_hash_bd_get_and_lock(hs, (void *)keystr, &bd, 1);
+	cli = nrs_tbf_cli_hash_lookup(hs, &bd, keystr);
+	cfs_hash_bd_unlock(hs, &bd, 1);
+
+	return cli;
+}
+
+static struct nrs_tbf_client *
+nrs_tbf_cli_findadd(struct nrs_tbf_head *head,
+		    struct nrs_tbf_client *cli)
+{
+	const char *key;
+	struct nrs_tbf_client *ret;
+	struct cfs_hash *hs = head->th_cli_hash;
+	struct cfs_hash_bd bd;
+
+	key = cli->tc_key;
+	cfs_hash_bd_get_and_lock(hs, (void *)key, &bd, 1);
+	ret = nrs_tbf_cli_hash_lookup(hs, &bd, key);
+	if (ret == NULL) {
+		cfs_hash_bd_add_locked(hs, &bd, &cli->tc_hnode);
+		ret = cli;
+	}
+	cfs_hash_bd_unlock(hs, &bd, 1);
+
+	return ret;
+}
+
+static void
+nrs_tbf_cli_put(struct nrs_tbf_head *head, struct nrs_tbf_client *cli) +{ + struct cfs_hash_bd bd; + struct cfs_hash *hs = head->th_cli_hash; + struct nrs_tbf_bucket *bkt; + int hw; + LIST_HEAD(zombies); + + cfs_hash_bd_get(hs, &cli->tc_key, &bd); + bkt = cfs_hash_bd_extra_get(hs, &bd); + if (!cfs_hash_bd_dec_and_lock(hs, &bd, &cli->tc_ref)) + return; + LASSERT(list_empty(&cli->tc_lru)); + list_add_tail(&cli->tc_lru, &bkt->ntb_lru); + + /** + * Check and purge the LRU, there is at least one client in the LRU. + */ + hw = tbf_jobid_cache_size >> (hs->hs_cur_bits - hs->hs_bkt_bits); + while (cfs_hash_bd_count_get(&bd) > hw) { + if (unlikely(list_empty(&bkt->ntb_lru))) + break; + cli = list_entry(bkt->ntb_lru.next, + struct nrs_tbf_client, + tc_lru); + LASSERT(atomic_read(&cli->tc_ref) == 0); + cfs_hash_bd_del_locked(hs, &bd, &cli->tc_hnode); + list_move(&cli->tc_lru, &zombies); + } + cfs_hash_bd_unlock(head->th_cli_hash, &bd, 1); + + while (!list_empty(&zombies)) { + cli = container_of(zombies.next, + struct nrs_tbf_client, tc_lru); + list_del_init(&cli->tc_lru); + nrs_tbf_cli_fini(cli); + } +} + +static void +nrs_tbf_generic_cli_init(struct nrs_tbf_client *cli, + struct ptlrpc_request *req) +{ + char keystr[NRS_TBF_KEY_LEN]; + + nrs_tbf_cli_gen_key(cli, req, keystr, sizeof(keystr)); +} + +static void +nrs_tbf_id_list_free(struct list_head *uid_list) +{ + struct nrs_tbf_id *nti_id, *n; + + list_for_each_entry_safe(nti_id, n, uid_list, nti_linkage) { + list_del_init(&nti_id->nti_linkage); + OBD_FREE_PTR(nti_id); + } +} + +static void +nrs_tbf_expression_free(struct nrs_tbf_expression *expr) +{ + LASSERT(expr->te_field >= NRS_TBF_FIELD_NID && + expr->te_field < NRS_TBF_FIELD_MAX); + switch (expr->te_field) { + case NRS_TBF_FIELD_NID: + cfs_free_nidlist(&expr->te_cond); + break; + case NRS_TBF_FIELD_JOBID: + nrs_tbf_jobid_list_free(&expr->te_cond); + break; + case NRS_TBF_FIELD_OPCODE: + CFS_FREE_BITMAP(expr->te_opcodes); + break; + case NRS_TBF_FIELD_UID: + case NRS_TBF_FIELD_GID: + nrs_tbf_id_list_free(&expr->te_cond); + break; + default: + LBUG(); + } + OBD_FREE_PTR(expr); +} + +static void +nrs_tbf_conjunction_free(struct nrs_tbf_conjunction *conjunction) +{ + struct nrs_tbf_expression *expression; + struct nrs_tbf_expression *n; + + LASSERT(list_empty(&conjunction->tc_linkage)); + list_for_each_entry_safe(expression, n, + &conjunction->tc_expressions, + te_linkage) { + list_del_init(&expression->te_linkage); + nrs_tbf_expression_free(expression); + } + OBD_FREE_PTR(conjunction); +} + +static void +nrs_tbf_conds_free(struct list_head *cond_list) +{ + struct nrs_tbf_conjunction *conjunction; + struct nrs_tbf_conjunction *n; + + list_for_each_entry_safe(conjunction, n, cond_list, tc_linkage) { + list_del_init(&conjunction->tc_linkage); + nrs_tbf_conjunction_free(conjunction); + } +} + +static void +nrs_tbf_generic_cmd_fini(struct nrs_tbf_cmd *cmd) +{ + if (!list_empty(&cmd->u.tc_start.ts_conds)) + nrs_tbf_conds_free(&cmd->u.tc_start.ts_conds); + if (cmd->u.tc_start.ts_conds_str) + OBD_FREE(cmd->u.tc_start.ts_conds_str, + strlen(cmd->u.tc_start.ts_conds_str) + 1); +} + +#define NRS_TBF_DISJUNCTION_DELIM (',') +#define NRS_TBF_CONJUNCTION_DELIM ('&') +#define NRS_TBF_EXPRESSION_DELIM ('=') + +static inline bool +nrs_tbf_check_field(struct cfs_lstr *field, char *str) +{ + int len = strlen(str); + + return (field->ls_len == len && + strncmp(field->ls_str, str, len) == 0); +} + +static int +nrs_tbf_opcode_list_parse(char *str, int len, struct cfs_bitmap **bitmaptr); +static int 
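+/* forward declarations: nrs_tbf_expression_parse() below dispatches to
+ * these per-field list parsers */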
+nrs_tbf_id_list_parse(char *str, int len, struct list_head *id_list, + enum nrs_tbf_flag tif); + +static int +nrs_tbf_expression_parse(struct cfs_lstr *src, struct list_head *cond_list) +{ + struct nrs_tbf_expression *expr; + struct cfs_lstr field; + int rc = 0; + + OBD_ALLOC_PTR(expr); + if (expr == NULL) + return -ENOMEM; + + rc = cfs_gettok(src, NRS_TBF_EXPRESSION_DELIM, &field); + if (rc == 0 || !src->ls_str || src->ls_len <= 2 || + src->ls_str[0] != '{' || src->ls_str[src->ls_len - 1] != '}') + GOTO(out, rc = -EINVAL); + + /* Skip '{' and '}' */ + src->ls_str++; + src->ls_len -= 2; + + if (nrs_tbf_check_field(&field, "nid")) { + if (cfs_parse_nidlist(src->ls_str, + src->ls_len, + &expr->te_cond) <= 0) + GOTO(out, rc = -EINVAL); + expr->te_field = NRS_TBF_FIELD_NID; + } else if (nrs_tbf_check_field(&field, "jobid")) { + if (nrs_tbf_jobid_list_parse(src->ls_str, + src->ls_len, + &expr->te_cond) < 0) + GOTO(out, rc = -EINVAL); + expr->te_field = NRS_TBF_FIELD_JOBID; + } else if (nrs_tbf_check_field(&field, "opcode")) { + if (nrs_tbf_opcode_list_parse(src->ls_str, + src->ls_len, + &expr->te_opcodes) < 0) + GOTO(out, rc = -EINVAL); + expr->te_field = NRS_TBF_FIELD_OPCODE; + } else if (nrs_tbf_check_field(&field, "uid")) { + if (nrs_tbf_id_list_parse(src->ls_str, + src->ls_len, + &expr->te_cond, + NRS_TBF_FLAG_UID) < 0) + GOTO(out, rc = -EINVAL); + expr->te_field = NRS_TBF_FIELD_UID; + } else if (nrs_tbf_check_field(&field, "gid")) { + if (nrs_tbf_id_list_parse(src->ls_str, + src->ls_len, + &expr->te_cond, + NRS_TBF_FLAG_GID) < 0) + GOTO(out, rc = -EINVAL); + expr->te_field = NRS_TBF_FIELD_GID; + } else { + GOTO(out, rc = -EINVAL); + } + + list_add_tail(&expr->te_linkage, cond_list); + return 0; +out: + OBD_FREE_PTR(expr); + return rc; +} + +static int +nrs_tbf_conjunction_parse(struct cfs_lstr *src, struct list_head *cond_list) +{ + struct nrs_tbf_conjunction *conjunction; + struct cfs_lstr expr; + int rc = 0; + + OBD_ALLOC_PTR(conjunction); + if (conjunction == NULL) + return -ENOMEM; + + INIT_LIST_HEAD(&conjunction->tc_expressions); + list_add_tail(&conjunction->tc_linkage, cond_list); + + while (src->ls_str) { + rc = cfs_gettok(src, NRS_TBF_CONJUNCTION_DELIM, &expr); + if (rc == 0) { + rc = -EINVAL; + break; + } + rc = nrs_tbf_expression_parse(&expr, + &conjunction->tc_expressions); + if (rc) + break; + } + return rc; +} + +static int +nrs_tbf_conds_parse(char *str, int len, struct list_head *cond_list) +{ + struct cfs_lstr src; + struct cfs_lstr res; + int rc = 0; + + src.ls_str = str; + src.ls_len = len; + INIT_LIST_HEAD(cond_list); + while (src.ls_str) { + rc = cfs_gettok(&src, NRS_TBF_DISJUNCTION_DELIM, &res); + if (rc == 0) { + rc = -EINVAL; + break; + } + rc = nrs_tbf_conjunction_parse(&res, cond_list); + if (rc) + break; + } + return rc; +} + +static int +nrs_tbf_generic_parse(struct nrs_tbf_cmd *cmd, const char *id) +{ + int rc; + + OBD_ALLOC(cmd->u.tc_start.ts_conds_str, strlen(id) + 1); + if (cmd->u.tc_start.ts_conds_str == NULL) + return -ENOMEM; + + memcpy(cmd->u.tc_start.ts_conds_str, id, strlen(id)); + + /* Parse hybird NID and JOBID conditions */ + rc = nrs_tbf_conds_parse(cmd->u.tc_start.ts_conds_str, + strlen(cmd->u.tc_start.ts_conds_str), + &cmd->u.tc_start.ts_conds); + if (rc) + nrs_tbf_generic_cmd_fini(cmd); + + return rc; +} + +static int +nrs_tbf_id_list_match(struct list_head *id_list, struct tbf_id id); + +static int +nrs_tbf_expression_match(struct nrs_tbf_expression *expr, + struct nrs_tbf_rule *rule, + struct nrs_tbf_client *cli) +{ + switch (expr->te_field) 
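+	/*
+	 * A condition string is a ','-separated disjunction of '&'-joined
+	 * conjunctions, e.g. "opcode={ost_read}&uid={500},jobid={dd.*}":
+	 * the rule applies when every expression of at least one
+	 * conjunction matches (a sketch based on the DELIM tokens above).
+	 */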
{ + case NRS_TBF_FIELD_NID: + return cfs_match_nid(cli->tc_nid, &expr->te_cond); + case NRS_TBF_FIELD_JOBID: + return nrs_tbf_jobid_list_match(&expr->te_cond, cli->tc_jobid); + case NRS_TBF_FIELD_OPCODE: + return cfs_bitmap_check(expr->te_opcodes, cli->tc_opcode); + case NRS_TBF_FIELD_UID: + case NRS_TBF_FIELD_GID: + return nrs_tbf_id_list_match(&expr->te_cond, cli->tc_id); + default: + return 0; + } +} + +static int +nrs_tbf_conjunction_match(struct nrs_tbf_conjunction *conjunction, + struct nrs_tbf_rule *rule, + struct nrs_tbf_client *cli) +{ + struct nrs_tbf_expression *expr; + int matched; + + list_for_each_entry(expr, &conjunction->tc_expressions, te_linkage) { + matched = nrs_tbf_expression_match(expr, rule, cli); + if (!matched) + return 0; + } + + return 1; +} + +static int +nrs_tbf_cond_match(struct nrs_tbf_rule *rule, struct nrs_tbf_client *cli) +{ + struct nrs_tbf_conjunction *conjunction; + int matched; + + list_for_each_entry(conjunction, &rule->tr_conds, tc_linkage) { + matched = nrs_tbf_conjunction_match(conjunction, rule, cli); + if (matched) + return 1; + } + + return 0; +} + +static void +nrs_tbf_generic_rule_fini(struct nrs_tbf_rule *rule) +{ + if (!list_empty(&rule->tr_conds)) + nrs_tbf_conds_free(&rule->tr_conds); + LASSERT(rule->tr_conds_str != NULL); + OBD_FREE(rule->tr_conds_str, strlen(rule->tr_conds_str) + 1); +} + +static int +nrs_tbf_rule_init(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_rule *rule, struct nrs_tbf_cmd *start) +{ + int rc = 0; + + LASSERT(start->u.tc_start.ts_conds_str); + OBD_ALLOC(rule->tr_conds_str, + strlen(start->u.tc_start.ts_conds_str) + 1); + if (rule->tr_conds_str == NULL) + return -ENOMEM; + + memcpy(rule->tr_conds_str, + start->u.tc_start.ts_conds_str, + strlen(start->u.tc_start.ts_conds_str)); + + INIT_LIST_HEAD(&rule->tr_conds); + if (!list_empty(&start->u.tc_start.ts_conds)) { + rc = nrs_tbf_conds_parse(rule->tr_conds_str, + strlen(rule->tr_conds_str), + &rule->tr_conds); + } + if (rc) + nrs_tbf_generic_rule_fini(rule); + + return rc; +} + +static int +nrs_tbf_generic_rule_dump(struct nrs_tbf_rule *rule, struct seq_file *m) +{ + seq_printf(m, "%s %s %llu, ref %d\n", rule->tr_name, + rule->tr_conds_str, rule->tr_rpc_rate, + atomic_read(&rule->tr_ref) - 1); + return 0; +} + +static int +nrs_tbf_generic_rule_match(struct nrs_tbf_rule *rule, + struct nrs_tbf_client *cli) +{ + return nrs_tbf_cond_match(rule, cli); +} + +static struct nrs_tbf_ops nrs_tbf_generic_ops = { + .o_name = NRS_TBF_TYPE_GENERIC, + .o_startup = nrs_tbf_startup, + .o_cli_find = nrs_tbf_cli_find, + .o_cli_findadd = nrs_tbf_cli_findadd, + .o_cli_put = nrs_tbf_cli_put, + .o_cli_init = nrs_tbf_generic_cli_init, + .o_rule_init = nrs_tbf_rule_init, + .o_rule_dump = nrs_tbf_generic_rule_dump, + .o_rule_match = nrs_tbf_generic_rule_match, + .o_rule_fini = nrs_tbf_generic_rule_fini, +}; + +static void nrs_tbf_opcode_rule_fini(struct nrs_tbf_rule *rule) +{ + if (rule->tr_opcodes != NULL) + CFS_FREE_BITMAP(rule->tr_opcodes); + + LASSERT(rule->tr_opcodes_str != NULL); + OBD_FREE(rule->tr_opcodes_str, strlen(rule->tr_opcodes_str) + 1); +} + +static unsigned nrs_tbf_opcode_hop_hash(struct cfs_hash *hs, const void *key, + unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(__u32), mask); +} + +static int nrs_tbf_opcode_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + const __u32 *opc = key; + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + return *opc == cli->tc_opcode; +} + +static void *nrs_tbf_opcode_hop_key(struct 
hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + return &cli->tc_opcode; +} + +static void nrs_tbf_opcode_hop_get(struct cfs_hash *hs, + struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_inc(&cli->tc_ref); +} + +static void nrs_tbf_opcode_hop_put(struct cfs_hash *hs, + struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_dec(&cli->tc_ref); +} + +static void nrs_tbf_opcode_hop_exit(struct cfs_hash *hs, + struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + LASSERTF(atomic_read(&cli->tc_ref) == 0, + "Busy TBF object from client with opcode %s, with %d refs\n", + ll_opcode2str(cli->tc_opcode), + atomic_read(&cli->tc_ref)); + + nrs_tbf_cli_fini(cli); +} +static struct cfs_hash_ops nrs_tbf_opcode_hash_ops = { + .hs_hash = nrs_tbf_opcode_hop_hash, + .hs_keycmp = nrs_tbf_opcode_hop_keycmp, + .hs_key = nrs_tbf_opcode_hop_key, + .hs_object = nrs_tbf_hop_object, + .hs_get = nrs_tbf_opcode_hop_get, + .hs_put = nrs_tbf_opcode_hop_put, + .hs_put_locked = nrs_tbf_opcode_hop_put, + .hs_exit = nrs_tbf_opcode_hop_exit, +}; + +static int +nrs_tbf_opcode_startup(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head) +{ + struct nrs_tbf_cmd start = { 0 }; + int rc; + + head->th_cli_hash = cfs_hash_create("nrs_tbf_hash", + NRS_TBF_NID_BITS, + NRS_TBF_NID_BITS, + NRS_TBF_NID_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &nrs_tbf_opcode_hash_ops, + CFS_HASH_RW_BKTLOCK); + if (head->th_cli_hash == NULL) + return -ENOMEM; + + start.u.tc_start.ts_opcodes_str = "*"; + + start.u.tc_start.ts_rpc_rate = tbf_rate; + start.u.tc_start.ts_rule_flags = NTRS_DEFAULT; + start.tc_name = NRS_TBF_DEFAULT_RULE; + rc = nrs_tbf_rule_start(policy, head, &start); + + return rc; +} + +static struct nrs_tbf_client * +nrs_tbf_opcode_cli_find(struct nrs_tbf_head *head, + struct ptlrpc_request *req) +{ + __u32 opc; + + opc = lustre_msg_get_opc(req->rq_reqmsg); + return cfs_hash_lookup(head->th_cli_hash, &opc); +} + +static struct nrs_tbf_client * +nrs_tbf_opcode_cli_findadd(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) +{ + return cfs_hash_findadd_unique(head->th_cli_hash, &cli->tc_opcode, + &cli->tc_hnode); +} + +static void +nrs_tbf_opcode_cli_init(struct nrs_tbf_client *cli, + struct ptlrpc_request *req) +{ + cli->tc_opcode = lustre_msg_get_opc(req->rq_reqmsg); +} + +#define MAX_OPCODE_LEN 32 +static int +nrs_tbf_opcode_set_bit(const struct cfs_lstr *id, struct cfs_bitmap *opcodes) +{ + int op = 0; + char opcode_str[MAX_OPCODE_LEN]; + + if (id->ls_len + 1 > MAX_OPCODE_LEN) + return -EINVAL; + + memcpy(opcode_str, id->ls_str, id->ls_len); + opcode_str[id->ls_len] = '\0'; + + op = ll_str2opcode(opcode_str); + if (op < 0) + return -EINVAL; + + cfs_bitmap_set(opcodes, op); + return 0; +} + +static int +nrs_tbf_opcode_list_parse(char *str, int len, struct cfs_bitmap **bitmaptr) +{ + struct cfs_bitmap *opcodes; + struct cfs_lstr src; + struct cfs_lstr res; + int rc = 0; + ENTRY; + + opcodes = CFS_ALLOCATE_BITMAP(LUSTRE_MAX_OPCODES); + if (opcodes == NULL) + return -ENOMEM; + + src.ls_str = str; + src.ls_len = len; + while (src.ls_str) { + rc = cfs_gettok(&src, ' ', &res); + if (rc == 0) { + rc = -EINVAL; + break; + } + rc = nrs_tbf_opcode_set_bit(&res, opcodes); + if (rc) + break; + } + + if (rc == 0 && bitmaptr) + *bitmaptr = 
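+		/* success: hand the populated bitmap to the caller */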
opcodes; + else + CFS_FREE_BITMAP(opcodes); + + RETURN(rc); +} + +static void nrs_tbf_opcode_cmd_fini(struct nrs_tbf_cmd *cmd) +{ + if (cmd->u.tc_start.ts_opcodes_str) + OBD_FREE(cmd->u.tc_start.ts_opcodes_str, + strlen(cmd->u.tc_start.ts_opcodes_str) + 1); + +} + +static int nrs_tbf_opcode_parse(struct nrs_tbf_cmd *cmd, char *id) +{ + struct cfs_lstr src; + int rc; + + src.ls_str = id; + src.ls_len = strlen(id); + rc = nrs_tbf_check_id_value(&src, "opcode"); + if (rc) + return rc; + + OBD_ALLOC(cmd->u.tc_start.ts_opcodes_str, src.ls_len + 1); + if (cmd->u.tc_start.ts_opcodes_str == NULL) + return -ENOMEM; + + memcpy(cmd->u.tc_start.ts_opcodes_str, src.ls_str, src.ls_len); + + /* parse opcode list */ + rc = nrs_tbf_opcode_list_parse(cmd->u.tc_start.ts_opcodes_str, + strlen(cmd->u.tc_start.ts_opcodes_str), + NULL); + if (rc) + nrs_tbf_opcode_cmd_fini(cmd); + + return rc; +} + +static int +nrs_tbf_opcode_rule_match(struct nrs_tbf_rule *rule, + struct nrs_tbf_client *cli) +{ + if (rule->tr_opcodes == NULL) + return 0; + + return cfs_bitmap_check(rule->tr_opcodes, cli->tc_opcode); +} + +static int nrs_tbf_opcode_rule_init(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_rule *rule, + struct nrs_tbf_cmd *start) +{ + int rc = 0; + + LASSERT(start->u.tc_start.ts_opcodes_str != NULL); + OBD_ALLOC(rule->tr_opcodes_str, + strlen(start->u.tc_start.ts_opcodes_str) + 1); + if (rule->tr_opcodes_str == NULL) + return -ENOMEM; + + strncpy(rule->tr_opcodes_str, start->u.tc_start.ts_opcodes_str, + strlen(start->u.tc_start.ts_opcodes_str) + 1); + + /* Default rule '*' */ + if (strcmp(start->u.tc_start.ts_opcodes_str, "*") == 0) + return 0; + + rc = nrs_tbf_opcode_list_parse(rule->tr_opcodes_str, + strlen(rule->tr_opcodes_str), + &rule->tr_opcodes); + if (rc) + OBD_FREE(rule->tr_opcodes_str, + strlen(start->u.tc_start.ts_opcodes_str) + 1); + + return rc; +} + +static int +nrs_tbf_opcode_rule_dump(struct nrs_tbf_rule *rule, struct seq_file *m) +{ + seq_printf(m, "%s {%s} %llu, ref %d\n", rule->tr_name, + rule->tr_opcodes_str, rule->tr_rpc_rate, + atomic_read(&rule->tr_ref) - 1); + return 0; +} + + +struct nrs_tbf_ops nrs_tbf_opcode_ops = { + .o_name = NRS_TBF_TYPE_OPCODE, + .o_startup = nrs_tbf_opcode_startup, + .o_cli_find = nrs_tbf_opcode_cli_find, + .o_cli_findadd = nrs_tbf_opcode_cli_findadd, + .o_cli_put = nrs_tbf_nid_cli_put, + .o_cli_init = nrs_tbf_opcode_cli_init, + .o_rule_init = nrs_tbf_opcode_rule_init, + .o_rule_dump = nrs_tbf_opcode_rule_dump, + .o_rule_match = nrs_tbf_opcode_rule_match, + .o_rule_fini = nrs_tbf_opcode_rule_fini, +}; + +static unsigned nrs_tbf_id_hop_hash(struct cfs_hash *hs, const void *key, + unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(struct tbf_id), mask); +} + +static int nrs_tbf_id_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + const struct tbf_id *opc = key; + enum nrs_tbf_flag ntf; + struct nrs_tbf_client *cli = hlist_entry(hnode, struct nrs_tbf_client, + tc_hnode); + ntf = opc->ti_type & cli->tc_id.ti_type; + if ((ntf & NRS_TBF_FLAG_UID) && opc->ti_uid != cli->tc_id.ti_uid) + return 0; + + if ((ntf & NRS_TBF_FLAG_GID) && opc->ti_gid != cli->tc_id.ti_gid) + return 0; + + return 1; +} + +static void *nrs_tbf_id_hop_key(struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + return &cli->tc_id; +} + +static void nrs_tbf_id_hop_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + 
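+	/* take a reference on behalf of the hash table */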
atomic_inc(&cli->tc_ref); +} + +static void nrs_tbf_id_hop_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_dec(&cli->tc_ref); +} + +static void +nrs_tbf_id_hop_exit(struct cfs_hash *hs, struct hlist_node *hnode) + +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + LASSERT(atomic_read(&cli->tc_ref) == 0); + nrs_tbf_cli_fini(cli); +} + +static struct cfs_hash_ops nrs_tbf_id_hash_ops = { + .hs_hash = nrs_tbf_id_hop_hash, + .hs_keycmp = nrs_tbf_id_hop_keycmp, + .hs_key = nrs_tbf_id_hop_key, + .hs_object = nrs_tbf_hop_object, + .hs_get = nrs_tbf_id_hop_get, + .hs_put = nrs_tbf_id_hop_put, + .hs_put_locked = nrs_tbf_id_hop_put, + .hs_exit = nrs_tbf_id_hop_exit, +}; + +static int +nrs_tbf_id_startup(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head) +{ + struct nrs_tbf_cmd start; + int rc; + + head->th_cli_hash = cfs_hash_create("nrs_tbf_id_hash", + NRS_TBF_NID_BITS, + NRS_TBF_NID_BITS, + NRS_TBF_NID_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &nrs_tbf_id_hash_ops, + CFS_HASH_RW_BKTLOCK); + if (head->th_cli_hash == NULL) + return -ENOMEM; + + memset(&start, 0, sizeof(start)); + start.u.tc_start.ts_ids_str = "*"; + start.u.tc_start.ts_rpc_rate = tbf_rate; + start.u.tc_start.ts_rule_flags = NTRS_DEFAULT; + start.tc_name = NRS_TBF_DEFAULT_RULE; + INIT_LIST_HEAD(&start.u.tc_start.ts_ids); + rc = nrs_tbf_rule_start(policy, head, &start); + if (rc) { + cfs_hash_putref(head->th_cli_hash); + head->th_cli_hash = NULL; + } + + return rc; +} + +static struct nrs_tbf_client * +nrs_tbf_id_cli_find(struct nrs_tbf_head *head, + struct ptlrpc_request *req) +{ + struct tbf_id id; + + LASSERT(head->th_type_flag == NRS_TBF_FLAG_UID || + head->th_type_flag == NRS_TBF_FLAG_GID); + + nrs_tbf_id_cli_set(req, &id, head->th_type_flag); + return cfs_hash_lookup(head->th_cli_hash, &id); +} + +static struct nrs_tbf_client * +nrs_tbf_id_cli_findadd(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) +{ + return cfs_hash_findadd_unique(head->th_cli_hash, &cli->tc_id, + &cli->tc_hnode); +} + +static void +nrs_tbf_uid_cli_init(struct nrs_tbf_client *cli, + struct ptlrpc_request *req) +{ + nrs_tbf_id_cli_set(req, &cli->tc_id, NRS_TBF_FLAG_UID); +} + +static void +nrs_tbf_gid_cli_init(struct nrs_tbf_client *cli, + struct ptlrpc_request *req) +{ + nrs_tbf_id_cli_set(req, &cli->tc_id, NRS_TBF_FLAG_GID); +} + +static int +nrs_tbf_id_list_match(struct list_head *id_list, struct tbf_id id) +{ + struct nrs_tbf_id *nti_id; + enum nrs_tbf_flag flag; + + list_for_each_entry(nti_id, id_list, nti_linkage) { + flag = id.ti_type & nti_id->nti_id.ti_type; + if (!flag) + continue; + + if ((flag & NRS_TBF_FLAG_UID) && + (id.ti_uid != nti_id->nti_id.ti_uid)) + continue; + + if ((flag & NRS_TBF_FLAG_GID) && + (id.ti_gid != nti_id->nti_id.ti_gid)) + continue; + + return 1; + } + return 0; +} + +static int +nrs_tbf_id_rule_match(struct nrs_tbf_rule *rule, + struct nrs_tbf_client *cli) +{ + return nrs_tbf_id_list_match(&rule->tr_ids, cli->tc_id); +} + +static void nrs_tbf_id_cmd_fini(struct nrs_tbf_cmd *cmd) +{ + nrs_tbf_id_list_free(&cmd->u.tc_start.ts_ids); + + if (cmd->u.tc_start.ts_ids_str) + OBD_FREE(cmd->u.tc_start.ts_ids_str, + strlen(cmd->u.tc_start.ts_ids_str) + 1); +} + +static int +nrs_tbf_id_list_parse(char *str, int len, struct list_head *id_list, + enum nrs_tbf_flag tif) +{ + struct cfs_lstr src; + struct cfs_lstr res; + int rc = 0; + struct tbf_id id = { 0 
}; + ENTRY; + + if (tif != NRS_TBF_FLAG_UID && tif != NRS_TBF_FLAG_GID) + RETURN(-EINVAL); + + src.ls_str = str; + src.ls_len = len; + INIT_LIST_HEAD(id_list); + while (src.ls_str) { + struct nrs_tbf_id *nti_id; + + if (cfs_gettok(&src, ' ', &res) == 0) + GOTO(out, rc = -EINVAL); + + id.ti_type = tif; + if (tif == NRS_TBF_FLAG_UID) { + if (!cfs_str2num_check(res.ls_str, res.ls_len, + &id.ti_uid, 0, (u32)~0U)) + GOTO(out, rc = -EINVAL); + } else { + if (!cfs_str2num_check(res.ls_str, res.ls_len, + &id.ti_gid, 0, (u32)~0U)) + GOTO(out, rc = -EINVAL); + } + + OBD_ALLOC_PTR(nti_id); + if (nti_id == NULL) + GOTO(out, rc = -ENOMEM); + + nti_id->nti_id = id; + list_add_tail(&nti_id->nti_linkage, id_list); + } +out: + if (rc) + nrs_tbf_id_list_free(id_list); + RETURN(rc); +} + +static int nrs_tbf_ug_id_parse(struct nrs_tbf_cmd *cmd, char *id) +{ + struct cfs_lstr src; + int rc; + enum nrs_tbf_flag tif; + + tif = cmd->u.tc_start.ts_valid_type; + + src.ls_str = id; + src.ls_len = strlen(id); + + rc = nrs_tbf_check_id_value(&src, + tif == NRS_TBF_FLAG_UID ? "uid" : "gid"); + if (rc) + return rc; + + OBD_ALLOC(cmd->u.tc_start.ts_ids_str, src.ls_len + 1); + if (cmd->u.tc_start.ts_ids_str == NULL) + return -ENOMEM; + + strlcpy(cmd->u.tc_start.ts_ids_str, src.ls_str, src.ls_len + 1); + + rc = nrs_tbf_id_list_parse(cmd->u.tc_start.ts_ids_str, + strlen(cmd->u.tc_start.ts_ids_str), + &cmd->u.tc_start.ts_ids, tif); + if (rc) + nrs_tbf_id_cmd_fini(cmd); + + return rc; +} + +static int +nrs_tbf_id_rule_init(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_rule *rule, + struct nrs_tbf_cmd *start) +{ + struct nrs_tbf_head *head = rule->tr_head; + int rc = 0; + enum nrs_tbf_flag tif = head->th_type_flag; + int ids_len = strlen(start->u.tc_start.ts_ids_str) + 1; + + LASSERT(start->u.tc_start.ts_ids_str); + INIT_LIST_HEAD(&rule->tr_ids); + + OBD_ALLOC(rule->tr_ids_str, ids_len); + if (rule->tr_ids_str == NULL) + return -ENOMEM; + + strlcpy(rule->tr_ids_str, start->u.tc_start.ts_ids_str, + ids_len); + + if (!list_empty(&start->u.tc_start.ts_ids)) { + rc = nrs_tbf_id_list_parse(rule->tr_ids_str, + strlen(rule->tr_ids_str), + &rule->tr_ids, tif); + if (rc) + CERROR("%ss {%s} illegal\n", + tif == NRS_TBF_FLAG_UID ? 
"uid" : "gid", + rule->tr_ids_str); + } + if (rc) { + OBD_FREE(rule->tr_ids_str, ids_len); + rule->tr_ids_str = NULL; + } + return rc; +} + +static int +nrs_tbf_id_rule_dump(struct nrs_tbf_rule *rule, struct seq_file *m) +{ + seq_printf(m, "%s {%s} %llu, ref %d\n", rule->tr_name, + rule->tr_ids_str, rule->tr_rpc_rate, + atomic_read(&rule->tr_ref) - 1); + return 0; +} + +static void nrs_tbf_id_rule_fini(struct nrs_tbf_rule *rule) +{ + nrs_tbf_id_list_free(&rule->tr_ids); + if (rule->tr_ids_str != NULL) + OBD_FREE(rule->tr_ids_str, strlen(rule->tr_ids_str) + 1); +} + +struct nrs_tbf_ops nrs_tbf_uid_ops = { + .o_name = NRS_TBF_TYPE_UID, + .o_startup = nrs_tbf_id_startup, + .o_cli_find = nrs_tbf_id_cli_find, + .o_cli_findadd = nrs_tbf_id_cli_findadd, + .o_cli_put = nrs_tbf_nid_cli_put, + .o_cli_init = nrs_tbf_uid_cli_init, + .o_rule_init = nrs_tbf_id_rule_init, + .o_rule_dump = nrs_tbf_id_rule_dump, + .o_rule_match = nrs_tbf_id_rule_match, + .o_rule_fini = nrs_tbf_id_rule_fini, +}; + +struct nrs_tbf_ops nrs_tbf_gid_ops = { + .o_name = NRS_TBF_TYPE_GID, + .o_startup = nrs_tbf_id_startup, + .o_cli_find = nrs_tbf_id_cli_find, + .o_cli_findadd = nrs_tbf_id_cli_findadd, + .o_cli_put = nrs_tbf_nid_cli_put, + .o_cli_init = nrs_tbf_gid_cli_init, + .o_rule_init = nrs_tbf_id_rule_init, + .o_rule_dump = nrs_tbf_id_rule_dump, + .o_rule_match = nrs_tbf_id_rule_match, + .o_rule_fini = nrs_tbf_id_rule_fini, +}; + +static struct nrs_tbf_type nrs_tbf_types[] = { + { + .ntt_name = NRS_TBF_TYPE_JOBID, + .ntt_flag = NRS_TBF_FLAG_JOBID, + .ntt_ops = &nrs_tbf_jobid_ops, + }, + { + .ntt_name = NRS_TBF_TYPE_NID, + .ntt_flag = NRS_TBF_FLAG_NID, + .ntt_ops = &nrs_tbf_nid_ops, + }, + { + .ntt_name = NRS_TBF_TYPE_OPCODE, + .ntt_flag = NRS_TBF_FLAG_OPCODE, + .ntt_ops = &nrs_tbf_opcode_ops, + }, + { + .ntt_name = NRS_TBF_TYPE_GENERIC, + .ntt_flag = NRS_TBF_FLAG_GENERIC, + .ntt_ops = &nrs_tbf_generic_ops, + }, + { + .ntt_name = NRS_TBF_TYPE_UID, + .ntt_flag = NRS_TBF_FLAG_UID, + .ntt_ops = &nrs_tbf_uid_ops, + }, + { + .ntt_name = NRS_TBF_TYPE_GID, + .ntt_flag = NRS_TBF_FLAG_GID, + .ntt_ops = &nrs_tbf_gid_ops, + }, +}; + +/** + * Is called before the policy transitions into + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED; allocates and initializes a + * policy-specific private data structure. 
+ *
+ * \param[in] policy The policy to start
+ *
+ * \retval -ENOMEM OOM error
+ * \retval 0	   success
+ *
+ * \see nrs_policy_register()
+ * \see nrs_policy_ctl()
+ */
+static int nrs_tbf_start(struct ptlrpc_nrs_policy *policy, char *arg)
+{
+	struct nrs_tbf_head *head;
+	struct nrs_tbf_ops *ops;
+	__u32 type;
+	char *name;
+	int found = 0;
+	int i;
+	int rc = 0;
+
+	if (arg == NULL)
+		name = NRS_TBF_TYPE_GENERIC;
+	else if (strlen(arg) < NRS_TBF_TYPE_MAX_LEN)
+		name = arg;
+	else
+		GOTO(out, rc = -EINVAL);
+
+	for (i = 0; i < ARRAY_SIZE(nrs_tbf_types); i++) {
+		if (strcmp(name, nrs_tbf_types[i].ntt_name) == 0) {
+			ops = nrs_tbf_types[i].ntt_ops;
+			type = nrs_tbf_types[i].ntt_flag;
+			found = 1;
+			break;
+		}
+	}
+	if (found == 0)
+		GOTO(out, rc = -ENOTSUPP);
+
+	OBD_CPT_ALLOC_PTR(head, nrs_pol2cptab(policy), nrs_pol2cptid(policy));
+	if (head == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	memcpy(head->th_type, name, strlen(name));
+	head->th_type[strlen(name)] = '\0';
+	head->th_ops = ops;
+	head->th_type_flag = type;
+
+	head->th_binheap = binheap_create(&nrs_tbf_heap_ops,
+					  CBH_FLAG_ATOMIC_GROW, 4096, NULL,
+					  nrs_pol2cptab(policy),
+					  nrs_pol2cptid(policy));
+	if (head->th_binheap == NULL)
+		GOTO(out_free_head, rc = -ENOMEM);
+
+	atomic_set(&head->th_rule_sequence, 0);
+	spin_lock_init(&head->th_rule_lock);
+	INIT_LIST_HEAD(&head->th_list);
+	hrtimer_init(&head->th_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+	head->th_timer.function = nrs_tbf_timer_cb;
+	rc = head->th_ops->o_startup(policy, head);
+	if (rc)
+		GOTO(out_free_heap, rc);
+
+	policy->pol_private = head;
+	return 0;
+out_free_heap:
+	binheap_destroy(head->th_binheap);
+out_free_head:
+	OBD_FREE_PTR(head);
+out:
+	return rc;
+}
+
+/**
+ * Is called before the policy transitions into
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED; deallocates the policy-specific
+ * private data structure.
+ *
+ * \param[in] policy The policy to stop
+ *
+ * \see nrs_policy_stop0()
+ */
+static void nrs_tbf_stop(struct ptlrpc_nrs_policy *policy)
+{
+	struct nrs_tbf_head *head = policy->pol_private;
+	struct ptlrpc_nrs *nrs = policy->pol_nrs;
+	struct nrs_tbf_rule *rule, *n;
+
+	LASSERT(head != NULL);
+	LASSERT(head->th_cli_hash != NULL);
+	hrtimer_cancel(&head->th_timer);
+	/* clean up the hash first, before freeing the rules */
+	cfs_hash_putref(head->th_cli_hash);
+	list_for_each_entry_safe(rule, n, &head->th_list, tr_linkage) {
+		list_del_init(&rule->tr_linkage);
+		nrs_tbf_rule_put(rule);
+	}
+	LASSERT(list_empty(&head->th_list));
+	LASSERT(head->th_binheap != NULL);
+	LASSERT(binheap_is_empty(head->th_binheap));
+	binheap_destroy(head->th_binheap);
+	OBD_FREE_PTR(head);
+	nrs->nrs_throttling = 0;
+	wake_up(&policy->pol_nrs->nrs_svcpt->scp_waitq);
+}
+
+/**
+ * Performs a policy-specific ctl function on TBF policy instances; similar
+ * to ioctl.
+ *
+ * \param[in]	  policy the policy instance
+ * \param[in]	  opc	 the opcode
+ * \param[in,out] arg	 used for passing parameters and information
+ *
+ * \pre  assert_spin_locked(&policy->pol_nrs->nrs_lock)
+ * \post assert_spin_locked(&policy->pol_nrs->nrs_lock)
+ *
+ * \retval 0   operation carried out successfully
+ * \retval -ve error
+ */
+static int nrs_tbf_ctl(struct ptlrpc_nrs_policy *policy,
+		       enum ptlrpc_nrs_ctl opc,
+		       void *arg)
+{
+	int rc = 0;
+	ENTRY;
+
+	assert_spin_locked(&policy->pol_nrs->nrs_lock);
+
+	switch ((enum nrs_ctl_tbf)opc) {
+	default:
+		RETURN(-EINVAL);
+
+	/**
+	 * Read the RPC rate rules of a policy instance.
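+	 * Each rule is dumped as "<name> {<matcher>} <rate>, ref <count>",
+	 * e.g. a default rule might print "default {*} 10000, ref 0"
+	 * (format taken from the o_rule_dump handlers; the name and
+	 * numbers are illustrative).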
+	 */
+	case NRS_CTL_TBF_RD_RULE: {
+		struct nrs_tbf_head *head = policy->pol_private;
+		struct seq_file *m = arg;
+		struct ptlrpc_service_part *svcpt;
+
+		svcpt = policy->pol_nrs->nrs_svcpt;
+		seq_printf(m, "CPT %d:\n", svcpt->scp_cpt);
+
+		rc = nrs_tbf_rule_dump_all(head, m);
+		}
+		break;
+
+	/**
+	 * Write the RPC rate of a policy instance.
+	 */
+	case NRS_CTL_TBF_WR_RULE: {
+		struct nrs_tbf_head *head = policy->pol_private;
+		struct nrs_tbf_cmd *cmd;
+
+		cmd = (struct nrs_tbf_cmd *)arg;
+		rc = nrs_tbf_command(policy,
+				     head,
+				     cmd);
+		}
+		break;
+	/**
+	 * Read the TBF policy type of a policy instance.
+	 */
+	case NRS_CTL_TBF_RD_TYPE_FLAG: {
+		struct nrs_tbf_head *head = policy->pol_private;
+
+		*(__u32 *)arg = head->th_type_flag;
+		}
+		break;
+	}
+
+	RETURN(rc);
+}
+
+/**
+ * Is called for obtaining a TBF policy resource.
+ *
+ * \param[in]  policy	  The policy on which the request is being asked for
+ * \param[in]  nrq	  The request for which resources are being taken
+ * \param[in]  parent	  Parent resource, unused in this policy
+ * \param[out] resp	  Resources references are placed in this array
+ * \param[in]  moving_req Signifies limited caller context; used here to
+ *			  select GFP_ATOMIC instead of a blocking allocation
+ *
+ * \see nrs_resource_get_safe()
+ */
+static int nrs_tbf_res_get(struct ptlrpc_nrs_policy *policy,
+			   struct ptlrpc_nrs_request *nrq,
+			   const struct ptlrpc_nrs_resource *parent,
+			   struct ptlrpc_nrs_resource **resp,
+			   bool moving_req)
+{
+	struct nrs_tbf_head *head;
+	struct nrs_tbf_client *cli;
+	struct nrs_tbf_client *tmp;
+	struct ptlrpc_request *req;
+
+	if (parent == NULL) {
+		*resp = &((struct nrs_tbf_head *)policy->pol_private)->th_res;
+		return 0;
+	}
+
+	head = container_of(parent, struct nrs_tbf_head, th_res);
+	req = container_of(nrq, struct ptlrpc_request, rq_nrq);
+	cli = head->th_ops->o_cli_find(head, req);
+	if (cli != NULL) {
+		spin_lock(&policy->pol_nrs->nrs_svcpt->scp_req_lock);
+		LASSERT(cli->tc_rule);
+		if (cli->tc_rule_sequence !=
+		    atomic_read(&head->th_rule_sequence) ||
+		    cli->tc_rule->tr_flags & NTRS_STOPPING) {
+			struct nrs_tbf_rule *rule;
+
+			CDEBUG(D_RPCTRACE,
+			       "TBF class@%p rate %llu sequence %d, "
+			       "rule flags %d, head sequence %d\n",
+			       cli, cli->tc_rpc_rate,
+			       cli->tc_rule_sequence,
+			       cli->tc_rule->tr_flags,
+			       atomic_read(&head->th_rule_sequence));
+			rule = nrs_tbf_rule_match(head, cli);
+			if (rule != cli->tc_rule) {
+				nrs_tbf_cli_reset(head, rule, cli);
+			} else {
+				if (cli->tc_rule_generation != rule->tr_generation)
+					nrs_tbf_cli_reset_value(head, cli);
+				nrs_tbf_rule_put(rule);
+			}
+		} else if (cli->tc_rule_generation !=
+			   cli->tc_rule->tr_generation) {
+			nrs_tbf_cli_reset_value(head, cli);
+		}
+		spin_unlock(&policy->pol_nrs->nrs_svcpt->scp_req_lock);
+		goto out;
+	}
+
+	OBD_CPT_ALLOC_GFP(cli, nrs_pol2cptab(policy), nrs_pol2cptid(policy),
+			  sizeof(*cli), moving_req ? GFP_ATOMIC : __GFP_IO);
+	if (cli == NULL)
+		return -ENOMEM;
+
+	nrs_tbf_cli_init(head, cli, req);
+	tmp = head->th_ops->o_cli_findadd(head, cli);
+	if (tmp != cli) {
+		atomic_dec(&cli->tc_ref);
+		nrs_tbf_cli_fini(cli);
+		cli = tmp;
+	}
+out:
+	*resp = &cli->tc_res;
+
+	return 1;
+}
+
+/**
+ * Called when releasing references to the resource hierarchy obtained for a
+ * request for scheduling using the TBF policy.
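+ * For the jobid and generic types this may also push the client onto the
+ * per-bucket LRU and purge idle entries, see nrs_tbf_jobid_cli_put().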
+ * + * \param[in] policy the policy the resource belongs to + * \param[in] res the resource to be released + */ +static void nrs_tbf_res_put(struct ptlrpc_nrs_policy *policy, + const struct ptlrpc_nrs_resource *res) +{ + struct nrs_tbf_head *head; + struct nrs_tbf_client *cli; + + /** + * Do nothing for freeing parent, nrs_tbf_net resources + */ + if (res->res_parent == NULL) + return; + + cli = container_of(res, struct nrs_tbf_client, tc_res); + head = container_of(res->res_parent, struct nrs_tbf_head, th_res); + + head->th_ops->o_cli_put(head, cli); +} + +/** + * Called when getting a request from the TBF policy for handling, or just + * peeking; removes the request from the policy when it is to be handled. + * + * \param[in] policy The policy + * \param[in] peek When set, signifies that we just want to examine the + * request, and not handle it, so the request is not removed + * from the policy. + * \param[in] force Force the policy to return a request; unused in this + * policy + * + * \retval The request to be handled; this is the next request in the TBF + * rule + * + * \see ptlrpc_nrs_req_get_nolock() + * \see nrs_request_get() + */ +static +struct ptlrpc_nrs_request *nrs_tbf_req_get(struct ptlrpc_nrs_policy *policy, + bool peek, bool force) +{ + struct nrs_tbf_head *head = policy->pol_private; + struct ptlrpc_nrs_request *nrq = NULL; + struct nrs_tbf_client *cli; + struct binheap_node *node; + + assert_spin_locked(&policy->pol_nrs->nrs_svcpt->scp_req_lock); + + if (!peek && policy->pol_nrs->nrs_throttling) + return NULL; + + node = binheap_root(head->th_binheap); + if (unlikely(node == NULL)) + return NULL; + + cli = container_of(node, struct nrs_tbf_client, tc_node); + LASSERT(cli->tc_in_heap); + if (peek) { + nrq = list_entry(cli->tc_list.next, + struct ptlrpc_nrs_request, + nr_u.tbf.tr_list); + } else { + struct nrs_tbf_rule *rule = cli->tc_rule; + __u64 now = ktime_to_ns(ktime_get()); + __u64 passed; + __u64 ntoken; + __u64 deadline; + __u64 old_resid = 0; + + deadline = cli->tc_check_time + + cli->tc_nsecs; + LASSERT(now >= cli->tc_check_time); + passed = now - cli->tc_check_time; + ntoken = passed * cli->tc_rpc_rate; + do_div(ntoken, NSEC_PER_SEC); + + ntoken += cli->tc_ntoken; + if (rule->tr_flags & NTRS_REALTIME) { + LASSERT(cli->tc_nsecs_resid < cli->tc_nsecs); + old_resid = cli->tc_nsecs_resid; + cli->tc_nsecs_resid += passed % cli->tc_nsecs; + if (cli->tc_nsecs_resid > cli->tc_nsecs) { + ntoken++; + cli->tc_nsecs_resid -= cli->tc_nsecs; + } + } else if (ntoken > cli->tc_depth) + ntoken = cli->tc_depth; + + if (ntoken > 0) { + struct ptlrpc_request *req; + nrq = list_entry(cli->tc_list.next, + struct ptlrpc_nrs_request, + nr_u.tbf.tr_list); + req = container_of(nrq, + struct ptlrpc_request, + rq_nrq); + ntoken--; + cli->tc_ntoken = ntoken; + cli->tc_check_time = now; + list_del_init(&nrq->nr_u.tbf.tr_list); + if (list_empty(&cli->tc_list)) { + binheap_remove(head->th_binheap, + &cli->tc_node); + cli->tc_in_heap = false; + } else { + if (!(rule->tr_flags & NTRS_REALTIME)) + cli->tc_deadline = now + cli->tc_nsecs; + binheap_relocate(head->th_binheap, + &cli->tc_node); + } + CDEBUG(D_RPCTRACE, + "TBF dequeues: class@%p rate %llu gen %llu token %llu, rule@%p rate %llu gen %llu\n", + cli, cli->tc_rpc_rate, + cli->tc_rule_generation, cli->tc_ntoken, + cli->tc_rule, cli->tc_rule->tr_rpc_rate, + cli->tc_rule->tr_generation); + } else { + ktime_t time; + + if (rule->tr_flags & NTRS_REALTIME) { + cli->tc_deadline = deadline; + cli->tc_nsecs_resid = old_resid; + 
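+				/*
+				 * e.g. at rate 1000 RPC/s one token costs
+				 * tc_nsecs = 10^6 ns; 2.5 ms after
+				 * tc_check_time the bucket gains
+				 * 2500000 * 1000 / NSEC_PER_SEC = 2 tokens
+				 * and 500000 ns stay in tc_nsecs_resid
+				 * (illustrative numbers, assuming tc_nsecs
+				 * is NSEC_PER_SEC / tc_rpc_rate as set up
+				 * by the reset logic).
+				 */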
binheap_relocate(head->th_binheap, + &cli->tc_node); + if (node != binheap_root(head->th_binheap)) + return nrs_tbf_req_get(policy, + peek, force); + } + policy->pol_nrs->nrs_throttling = 1; + head->th_deadline = deadline; + time = ktime_set(0, 0); + time = ktime_add_ns(time, deadline); + hrtimer_start(&head->th_timer, time, HRTIMER_MODE_ABS); + } + } + + return nrq; +} + +/** + * Adds request \a nrq to \a policy's list of queued requests + * + * \param[in] policy The policy + * \param[in] nrq The request to add + * + * \retval 0 success; nrs_request_enqueue() assumes this function will always + * succeed + */ +static int nrs_tbf_req_add(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct nrs_tbf_head *head; + struct nrs_tbf_client *cli; + int rc = 0; + + assert_spin_locked(&policy->pol_nrs->nrs_svcpt->scp_req_lock); + + cli = container_of(nrs_request_resource(nrq), + struct nrs_tbf_client, tc_res); + head = container_of(nrs_request_resource(nrq)->res_parent, + struct nrs_tbf_head, th_res); + if (list_empty(&cli->tc_list)) { + LASSERT(!cli->tc_in_heap); + cli->tc_deadline = cli->tc_check_time + cli->tc_nsecs; + rc = binheap_insert(head->th_binheap, &cli->tc_node); + if (rc == 0) { + cli->tc_in_heap = true; + nrq->nr_u.tbf.tr_sequence = head->th_sequence++; + list_add_tail(&nrq->nr_u.tbf.tr_list, + &cli->tc_list); + if (policy->pol_nrs->nrs_throttling) { + __u64 deadline = cli->tc_deadline; + if ((head->th_deadline > deadline) && + (hrtimer_try_to_cancel(&head->th_timer) + >= 0)) { + ktime_t time; + head->th_deadline = deadline; + time = ktime_set(0, 0); + time = ktime_add_ns(time, deadline); + hrtimer_start(&head->th_timer, time, + HRTIMER_MODE_ABS); + } + } + } + } else { + LASSERT(cli->tc_in_heap); + nrq->nr_u.tbf.tr_sequence = head->th_sequence++; + list_add_tail(&nrq->nr_u.tbf.tr_list, + &cli->tc_list); + } + + if (rc == 0) + CDEBUG(D_RPCTRACE, + "TBF enqueues: class@%p rate %llu gen %llu token %llu, rule@%p rate %llu gen %llu\n", + cli, cli->tc_rpc_rate, + cli->tc_rule_generation, cli->tc_ntoken, + cli->tc_rule, cli->tc_rule->tr_rpc_rate, + cli->tc_rule->tr_generation); + + return rc; +} + +/** + * Removes request \a nrq from \a policy's list of queued requests. + * + * \param[in] policy The policy + * \param[in] nrq The request to remove + */ +static void nrs_tbf_req_del(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct nrs_tbf_head *head; + struct nrs_tbf_client *cli; + + assert_spin_locked(&policy->pol_nrs->nrs_svcpt->scp_req_lock); + + cli = container_of(nrs_request_resource(nrq), + struct nrs_tbf_client, tc_res); + head = container_of(nrs_request_resource(nrq)->res_parent, + struct nrs_tbf_head, th_res); + + LASSERT(!list_empty(&nrq->nr_u.tbf.tr_list)); + list_del_init(&nrq->nr_u.tbf.tr_list); + if (list_empty(&cli->tc_list)) { + binheap_remove(head->th_binheap, + &cli->tc_node); + cli->tc_in_heap = false; + } else { + binheap_relocate(head->th_binheap, + &cli->tc_node); + } +} + +/** + * Prints a debug statement right before the request \a nrq stops being + * handled. 
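+ * The sequence number printed here was assigned in nrs_tbf_req_add().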
+ *
+ * \param[in] policy The policy handling the request
+ * \param[in] nrq The request being handled
+ *
+ * \see ptlrpc_server_finish_request()
+ * \see ptlrpc_nrs_req_stop_nolock()
+ */
+static void nrs_tbf_req_stop(struct ptlrpc_nrs_policy *policy,
+			     struct ptlrpc_nrs_request *nrq)
+{
+	struct ptlrpc_request *req = container_of(nrq, struct ptlrpc_request,
+						  rq_nrq);
+
+	assert_spin_locked(&policy->pol_nrs->nrs_svcpt->scp_req_lock);
+
+	CDEBUG(D_RPCTRACE, "NRS stop %s request from %s, seq: %llu\n",
+	       policy->pol_desc->pd_name, libcfs_id2str(req->rq_peer),
+	       nrq->nr_u.tbf.tr_sequence);
+}
+
+/**
+ * debugfs interface
+ */
+
+/**
+ * The maximum RPC rate.
+ */
+#define LPROCFS_NRS_RATE_MAX		1000000ULL	/* 1rpc/us */
+
+static int
+ptlrpc_lprocfs_nrs_tbf_rule_seq_show(struct seq_file *m, void *data)
+{
+	struct ptlrpc_service *svc = m->private;
+	int rc;
+
+	seq_printf(m, "regular_requests:\n");
+	/**
+	 * Perform two separate calls to this as only one of the NRS heads'
+	 * policies may be in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED or
+	 * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING state.
+	 */
+	rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG,
+				       NRS_POL_NAME_TBF,
+				       NRS_CTL_TBF_RD_RULE,
+				       false, m);
+	if (rc == 0) {
+		/**
+		 * A return value of -ENOSPC means the output buffer of \a m
+		 * overflowed; return 0 so that the upper-layer seq_read()
+		 * can allocate a larger buffer and repeat the whole process.
+		 */
+	} else if (rc == -ENOSPC) {
+		return 0;
+
+		/**
+		 * Ignore -ENODEV as the regular NRS head's policy may be in the
+		 * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state.
+		 */
+	} else if (rc != -ENODEV) {
+		return rc;
+	}
+
+	if (!nrs_svc_has_hp(svc))
+		goto no_hp;
+
+	seq_printf(m, "high_priority_requests:\n");
+	rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP,
+				       NRS_POL_NAME_TBF,
+				       NRS_CTL_TBF_RD_RULE,
+				       false, m);
+	if (rc == 0) {
+		/**
+		 * A return value of -ENOSPC means the output buffer of \a m
+		 * overflowed; return 0 so that the upper-layer seq_read()
+		 * can allocate a larger buffer and repeat the whole process.
+ */ + } else if (rc == -ENOSPC) { + return 0; + } + +no_hp: + + return rc; +} + +static int nrs_tbf_id_parse(struct nrs_tbf_cmd *cmd, char *token) +{ + int rc; + ENTRY; + + switch (cmd->u.tc_start.ts_valid_type) { + case NRS_TBF_FLAG_JOBID: + rc = nrs_tbf_jobid_parse(cmd, token); + break; + case NRS_TBF_FLAG_NID: + rc = nrs_tbf_nid_parse(cmd, token); + break; + case NRS_TBF_FLAG_OPCODE: + rc = nrs_tbf_opcode_parse(cmd, token); + break; + case NRS_TBF_FLAG_GENERIC: + rc = nrs_tbf_generic_parse(cmd, token); + break; + case NRS_TBF_FLAG_UID: + case NRS_TBF_FLAG_GID: + rc = nrs_tbf_ug_id_parse(cmd, token); + break; + default: + RETURN(-EINVAL); + } + + RETURN(rc); +} + +static void nrs_tbf_cmd_fini(struct nrs_tbf_cmd *cmd) +{ + if (cmd->tc_cmd == NRS_CTL_TBF_START_RULE) { + switch (cmd->u.tc_start.ts_valid_type) { + case NRS_TBF_FLAG_JOBID: + nrs_tbf_jobid_cmd_fini(cmd); + break; + case NRS_TBF_FLAG_NID: + nrs_tbf_nid_cmd_fini(cmd); + break; + case NRS_TBF_FLAG_OPCODE: + nrs_tbf_opcode_cmd_fini(cmd); + break; + case NRS_TBF_FLAG_GENERIC: + nrs_tbf_generic_cmd_fini(cmd); + break; + case NRS_TBF_FLAG_UID: + case NRS_TBF_FLAG_GID: + nrs_tbf_id_cmd_fini(cmd); + break; + default: + CWARN("unknown NRS_TBF_FLAGS:0x%x\n", + cmd->u.tc_start.ts_valid_type); + } + } +} + +static int check_rule_name(const char *name) +{ + int i; + + if (name[0] == '\0') + return -EINVAL; + + for (i = 0; name[i] != '\0' && i < MAX_TBF_NAME; i++) { + if (!isalnum(name[i]) && name[i] != '_') + return -EINVAL; + } + + if (i == MAX_TBF_NAME) + return -ENAMETOOLONG; + + return 0; +} + +static int +nrs_tbf_parse_value_pair(struct nrs_tbf_cmd *cmd, char *buffer) +{ + char *key; + char *val; + int rc; + __u64 rate; + + val = buffer; + key = strsep(&val, "="); + if (val == NULL || strlen(val) == 0) + return -EINVAL; + + /* Key of the value pair */ + if (strcmp(key, "rate") == 0) { + rc = kstrtoull(val, 10, &rate); + if (rc) + return rc; + + if (rate <= 0 || rate >= LPROCFS_NRS_RATE_MAX) + return -EINVAL; + + if (cmd->tc_cmd == NRS_CTL_TBF_START_RULE) + cmd->u.tc_start.ts_rpc_rate = rate; + else if (cmd->tc_cmd == NRS_CTL_TBF_CHANGE_RULE) + cmd->u.tc_change.tc_rpc_rate = rate; + else + return -EINVAL; + } else if (strcmp(key, "rank") == 0) { + rc = check_rule_name(val); + if (rc) + return rc; + + if (cmd->tc_cmd == NRS_CTL_TBF_START_RULE) + cmd->u.tc_start.ts_next_name = val; + else if (cmd->tc_cmd == NRS_CTL_TBF_CHANGE_RULE) + cmd->u.tc_change.tc_next_name = val; + else + return -EINVAL; + } else if (strcmp(key, "realtime") == 0) { + unsigned long realtime; + + rc = kstrtoul(val, 10, &realtime); + if (rc) + return rc; + + if (realtime > 0) + cmd->u.tc_start.ts_rule_flags |= NTRS_REALTIME; + } else { + return -EINVAL; + } + return 0; +} + +static int +nrs_tbf_parse_value_pairs(struct nrs_tbf_cmd *cmd, char *buffer) +{ + char *val; + char *token; + int rc; + + val = buffer; + while (val != NULL && strlen(val) != 0) { + token = strsep(&val, " "); + rc = nrs_tbf_parse_value_pair(cmd, token); + if (rc) + return rc; + } + + switch (cmd->tc_cmd) { + case NRS_CTL_TBF_START_RULE: + if (cmd->u.tc_start.ts_rpc_rate == 0) + cmd->u.tc_start.ts_rpc_rate = tbf_rate; + break; + case NRS_CTL_TBF_CHANGE_RULE: + if (cmd->u.tc_change.tc_rpc_rate == 0 && + cmd->u.tc_change.tc_next_name == NULL) + return -EINVAL; + break; + case NRS_CTL_TBF_STOP_RULE: + break; + default: + return -EINVAL; + } + return 0; +} + +static struct nrs_tbf_cmd * +nrs_tbf_parse_cmd(char *buffer, unsigned long count, __u32 type_flag) +{ + struct nrs_tbf_cmd *cmd; + char *token; 
+ char *val; + int rc = 0; + + OBD_ALLOC_PTR(cmd); + if (cmd == NULL) + GOTO(out, rc = -ENOMEM); + memset(cmd, 0, sizeof(*cmd)); + + val = buffer; + token = strsep(&val, " "); + if (val == NULL || strlen(val) == 0) + GOTO(out_free_cmd, rc = -EINVAL); + + /* Type of the command */ + if (strcmp(token, "start") == 0) { + cmd->tc_cmd = NRS_CTL_TBF_START_RULE; + cmd->u.tc_start.ts_valid_type = type_flag; + } else if (strcmp(token, "stop") == 0) + cmd->tc_cmd = NRS_CTL_TBF_STOP_RULE; + else if (strcmp(token, "change") == 0) + cmd->tc_cmd = NRS_CTL_TBF_CHANGE_RULE; + else + GOTO(out_free_cmd, rc = -EINVAL); + + /* Name of the rule */ + token = strsep(&val, " "); + if ((val == NULL && cmd->tc_cmd != NRS_CTL_TBF_STOP_RULE)) + GOTO(out_free_cmd, rc = -EINVAL); + + rc = check_rule_name(token); + if (rc) + GOTO(out_free_cmd, rc); + + cmd->tc_name = token; + + if (cmd->tc_cmd == NRS_CTL_TBF_START_RULE) { + /* List of ID */ + LASSERT(val); + token = val; + val = strrchr(token, '}'); + if (!val) + GOTO(out_free_cmd, rc = -EINVAL); + + /* Skip '}' */ + val++; + if (*val == '\0') { + val = NULL; + } else if (*val == ' ') { + *val = '\0'; + val++; + } else + GOTO(out_free_cmd, rc = -EINVAL); + + rc = nrs_tbf_id_parse(cmd, token); + if (rc) + GOTO(out_free_cmd, rc); + } + + rc = nrs_tbf_parse_value_pairs(cmd, val); + if (rc) + GOTO(out_cmd_fini, rc = -EINVAL); + goto out; +out_cmd_fini: + nrs_tbf_cmd_fini(cmd); +out_free_cmd: + OBD_FREE_PTR(cmd); +out: + if (rc) + cmd = ERR_PTR(rc); + return cmd; +} + +/** + * Get the TBF policy type (nid, jobid, etc) preset by + * proc entry 'nrs_policies' for command buffer parsing. + * + * \param[in] svc the PTLRPC service + * \param[in] queue the NRS queue type + * + * \retval the preset TBF policy type flag + */ +static __u32 +nrs_tbf_type_flag(struct ptlrpc_service *svc, enum ptlrpc_nrs_queue_type queue) +{ + __u32 type; + int rc; + + rc = ptlrpc_nrs_policy_control(svc, queue, + NRS_POL_NAME_TBF, + NRS_CTL_TBF_RD_TYPE_FLAG, + true, &type); + if (rc != 0) + type = NRS_TBF_FLAG_INVALID; + + return type; +} + +#define LPROCFS_WR_NRS_TBF_MAX_CMD (4096) +static ssize_t +ptlrpc_lprocfs_nrs_tbf_rule_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct ptlrpc_service *svc = m->private; + char *kernbuf; + char *val; + int rc; + struct nrs_tbf_cmd *cmd; + enum ptlrpc_nrs_queue_type queue = PTLRPC_NRS_QUEUE_BOTH; + unsigned long length; + char *token; + + OBD_ALLOC(kernbuf, LPROCFS_WR_NRS_TBF_MAX_CMD); + if (kernbuf == NULL) + GOTO(out, rc = -ENOMEM); + + if (count > LPROCFS_WR_NRS_TBF_MAX_CMD - 1) + GOTO(out_free_kernbuff, rc = -EINVAL); + + if (copy_from_user(kernbuf, buffer, count)) + GOTO(out_free_kernbuff, rc = -EFAULT); + + val = kernbuf; + token = strsep(&val, " "); + if (val == NULL) + GOTO(out_free_kernbuff, rc = -EINVAL); + + if (strcmp(token, "reg") == 0) { + queue = PTLRPC_NRS_QUEUE_REG; + } else if (strcmp(token, "hp") == 0) { + queue = PTLRPC_NRS_QUEUE_HP; + } else { + kernbuf[strlen(token)] = ' '; + val = kernbuf; + } + length = strlen(val); + + if (length == 0) + GOTO(out_free_kernbuff, rc = -EINVAL); + + if (queue == PTLRPC_NRS_QUEUE_HP && !nrs_svc_has_hp(svc)) + GOTO(out_free_kernbuff, rc = -ENODEV); + else if (queue == PTLRPC_NRS_QUEUE_BOTH && !nrs_svc_has_hp(svc)) + queue = PTLRPC_NRS_QUEUE_REG; + + cmd = nrs_tbf_parse_cmd(val, length, nrs_tbf_type_flag(svc, queue)); + if (IS_ERR(cmd)) + GOTO(out_free_kernbuff, rc = PTR_ERR(cmd)); + + /** + * Serialize NRS core lprocfs 
operations with policy registration/ + * unregistration. + */ + mutex_lock(&nrs_core.nrs_mutex); + rc = ptlrpc_nrs_policy_control(svc, queue, + NRS_POL_NAME_TBF, + NRS_CTL_TBF_WR_RULE, + false, cmd); + mutex_unlock(&nrs_core.nrs_mutex); + + nrs_tbf_cmd_fini(cmd); + OBD_FREE_PTR(cmd); +out_free_kernbuff: + OBD_FREE(kernbuf, LPROCFS_WR_NRS_TBF_MAX_CMD); +out: + return rc ? rc : count; +} + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_tbf_rule); + +/** + * Initializes a TBF policy's lprocfs interface for service \a svc + * + * \param[in] svc the service + * + * \retval 0 success + * \retval != 0 error + */ +static int nrs_tbf_lprocfs_init(struct ptlrpc_service *svc) +{ + struct ldebugfs_vars nrs_tbf_lprocfs_vars[] = { + { .name = "nrs_tbf_rule", + .fops = &ptlrpc_lprocfs_nrs_tbf_rule_fops, + .data = svc }, + { NULL } + }; + + if (!svc->srv_debugfs_entry) + return 0; + + ldebugfs_add_vars(svc->srv_debugfs_entry, nrs_tbf_lprocfs_vars, NULL); + + return 0; +} + +/** + * TBF policy operations + */ +static const struct ptlrpc_nrs_pol_ops nrs_tbf_ops = { + .op_policy_start = nrs_tbf_start, + .op_policy_stop = nrs_tbf_stop, + .op_policy_ctl = nrs_tbf_ctl, + .op_res_get = nrs_tbf_res_get, + .op_res_put = nrs_tbf_res_put, + .op_req_get = nrs_tbf_req_get, + .op_req_enqueue = nrs_tbf_req_add, + .op_req_dequeue = nrs_tbf_req_del, + .op_req_stop = nrs_tbf_req_stop, + .op_lprocfs_init = nrs_tbf_lprocfs_init, +}; + +/** + * TBF policy configuration + */ +struct ptlrpc_nrs_pol_conf nrs_conf_tbf = { + .nc_name = NRS_POL_NAME_TBF, + .nc_ops = &nrs_tbf_ops, + .nc_compat = nrs_policy_compat_all, +}; + +/** @} tbf */ + +/** @} nrs */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/pack_generic.c b/drivers/staging/lustrefsx/lustre/ptlrpc/pack_generic.c new file mode 100644 index 0000000000000..3263e944e76b7 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/pack_generic.c @@ -0,0 +1,3001 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ptlrpc/pack_generic.c + * + * (Un)packing of OST requests + * + * Author: Peter J. 
Braam
+ * Author: Phil Schwan
+ * Author: Eric Barton
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <linux/crc32.h>
+
+#include <libcfs/libcfs.h>
+
+#include <llog_swab.h>
+#include <lustre_net.h>
+#include <lustre_swab.h>
+#include <obd_cksum.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include "ptlrpc_internal.h"
+
+static inline __u32 lustre_msg_hdr_size_v2(__u32 count)
+{
+	return cfs_size_round(offsetof(struct lustre_msg_v2,
+				       lm_buflens[count]));
+}
+
+__u32 lustre_msg_hdr_size(__u32 magic, __u32 count)
+{
+	LASSERT(count > 0);
+
+	switch (magic) {
+	case LUSTRE_MSG_MAGIC_V2:
+		return lustre_msg_hdr_size_v2(count);
+	default:
+		LASSERTF(0, "incorrect message magic: %08x\n", magic);
+		return 0;
+	}
+}
+
+static inline int lustre_msg_check_version_v2(struct lustre_msg_v2 *msg,
+					      enum lustre_msg_version version)
+{
+	enum lustre_msg_version ver = lustre_msg_get_version(msg);
+
+	return (ver & LUSTRE_VERSION_MASK) != version;
+}
+
+int lustre_msg_check_version(struct lustre_msg *msg,
+			     enum lustre_msg_version version)
+{
+#define LUSTRE_MSG_MAGIC_V1 0x0BD00BD0
+	switch (msg->lm_magic) {
+	case LUSTRE_MSG_MAGIC_V1:
+		CERROR("msg v1 not supported - please upgrade your system\n");
+		return -EINVAL;
+	case LUSTRE_MSG_MAGIC_V2:
+		return lustre_msg_check_version_v2(msg, version);
+	default:
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+		return -EPROTO;
+	}
+#undef LUSTRE_MSG_MAGIC_V1
+}
+
+__u32 lustre_msg_early_size;
+EXPORT_SYMBOL(lustre_msg_early_size);
+
+/* early reply size */
+void lustre_msg_early_size_init(void)
+{
+	__u32 pblen = sizeof(struct ptlrpc_body);
+
+	lustre_msg_early_size = lustre_msg_size(LUSTRE_MSG_MAGIC_V2, 1, &pblen);
+}
+
+__u32 lustre_msg_size_v2(int count, __u32 *lengths)
+{
+	__u32 size;
+	int i;
+
+	LASSERT(count > 0);
+	size = lustre_msg_hdr_size_v2(count);
+	for (i = 0; i < count; i++)
+		size += cfs_size_round(lengths[i]);
+
+	return size;
+}
+EXPORT_SYMBOL(lustre_msg_size_v2);
+
+/*
+ * This returns the size of the buffer that is required to hold a lustre_msg
+ * with the given sub-buffer lengths.
+ * NOTE: this should only be used for NEW requests, and should always be
+ * in the form of a v2 request. If this is a connection to a v1
+ * target then the first buffer will be stripped because the ptlrpc
+ * data is part of the lustre_msg_v1 header. b=14043
+ */
+__u32 lustre_msg_size(__u32 magic, int count, __u32 *lens)
+{
+	__u32 size[] = { sizeof(struct ptlrpc_body) };
+
+	if (!lens) {
+		LASSERT(count == 1);
+		lens = size;
+	}
+
+	LASSERT(count > 0);
+	LASSERT(lens[MSG_PTLRPC_BODY_OFF] >= sizeof(struct ptlrpc_body_v2));
+
+	switch (magic) {
+	case LUSTRE_MSG_MAGIC_V2:
+		return lustre_msg_size_v2(count, lens);
+	default:
+		LASSERTF(0, "incorrect message magic: %08x\n", magic);
+		return 0;
+	}
+}
+
+/*
+ * This is used to determine the size of a buffer that was already packed
+ * and will correctly handle the different message formats.
+ */ +__u32 lustre_packed_msg_size(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_packed_msg_size); + +void lustre_init_msg_v2(struct lustre_msg_v2 *msg, int count, __u32 *lens, + char **bufs) +{ + char *ptr; + int i; + + LASSERT(count > 0); + + msg->lm_bufcount = count; + /* XXX: lm_secflvr uninitialized here */ + msg->lm_magic = LUSTRE_MSG_MAGIC_V2; + + for (i = 0; i < count; i++) + msg->lm_buflens[i] = lens[i]; + + if (bufs == NULL) + return; + + ptr = (char *)msg + lustre_msg_hdr_size_v2(count); + for (i = 0; i < count; i++) { + char *tmp = bufs[i]; + + if (tmp) + memcpy(ptr, tmp, lens[i]); + ptr += cfs_size_round(lens[i]); + } +} +EXPORT_SYMBOL(lustre_init_msg_v2); + +static int lustre_pack_request_v2(struct ptlrpc_request *req, + int count, __u32 *lens, char **bufs) +{ + int reqlen, rc; + + reqlen = lustre_msg_size_v2(count, lens); + + rc = sptlrpc_cli_alloc_reqbuf(req, reqlen); + if (rc) + return rc; + + req->rq_reqlen = reqlen; + + lustre_init_msg_v2(req->rq_reqmsg, count, lens, bufs); + lustre_msg_add_version(req->rq_reqmsg, PTLRPC_MSG_VERSION); + return 0; +} + +int lustre_pack_request(struct ptlrpc_request *req, __u32 magic, int count, + __u32 *lens, char **bufs) +{ + __u32 size[] = { sizeof(struct ptlrpc_body) }; + + if (!lens) { + LASSERT(count == 1); + lens = size; + } + + LASSERT(count > 0); + LASSERT(lens[MSG_PTLRPC_BODY_OFF] == sizeof(struct ptlrpc_body)); + + /* only use new format, we don't need to be compatible with 1.4 */ + magic = LUSTRE_MSG_MAGIC_V2; + + switch (magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_pack_request_v2(req, count, lens, bufs); + default: + LASSERTF(0, "incorrect message magic: %08x\n", magic); + return -EINVAL; + } +} + +#if RS_DEBUG +struct list_head ptlrpc_rs_debug_lru = + LIST_HEAD_INIT(ptlrpc_rs_debug_lru); +spinlock_t ptlrpc_rs_debug_lock; + +#define PTLRPC_RS_DEBUG_LRU_ADD(rs) \ +do { \ + spin_lock(&ptlrpc_rs_debug_lock); \ + list_add_tail(&(rs)->rs_debug_list, &ptlrpc_rs_debug_lru); \ + spin_unlock(&ptlrpc_rs_debug_lock); \ +} while (0) + +#define PTLRPC_RS_DEBUG_LRU_DEL(rs) \ +do { \ + spin_lock(&ptlrpc_rs_debug_lock); \ + list_del(&(rs)->rs_debug_list); \ + spin_unlock(&ptlrpc_rs_debug_lock); \ +} while (0) +#else +# define PTLRPC_RS_DEBUG_LRU_ADD(rs) do {} while(0) +# define PTLRPC_RS_DEBUG_LRU_DEL(rs) do {} while(0) +#endif + +struct ptlrpc_reply_state * +lustre_get_emerg_rs(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_reply_state *rs = NULL; + + spin_lock(&svcpt->scp_rep_lock); + + /* See if we have anything in a pool, and wait if nothing */ + while (list_empty(&svcpt->scp_rep_idle)) { + int rc; + + spin_unlock(&svcpt->scp_rep_lock); + /* If we cannot get anything for some long time, we better + * bail out instead of waiting infinitely */ + rc = wait_event_idle_timeout(svcpt->scp_rep_waitq, + !list_empty(&svcpt->scp_rep_idle), + cfs_time_seconds(10)); + if (rc <= 0) + goto out; + spin_lock(&svcpt->scp_rep_lock); + } + + rs = list_first_entry(&svcpt->scp_rep_idle, + struct ptlrpc_reply_state, rs_list); + list_del(&rs->rs_list); + + spin_unlock(&svcpt->scp_rep_lock); + + memset(rs, 0, svcpt->scp_service->srv_max_reply_size); + rs->rs_size = svcpt->scp_service->srv_max_reply_size; + rs->rs_svcpt = svcpt; + rs->rs_prealloc = 1; +out: + return rs; +} + +void lustre_put_emerg_rs(struct ptlrpc_reply_state *rs) +{ + 
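+	/* Return the preallocated reply state to the idle pool and wake up
+	 * any thread waiting for one in lustre_get_emerg_rs(). */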
struct ptlrpc_service_part *svcpt = rs->rs_svcpt; + + spin_lock(&svcpt->scp_rep_lock); + list_add(&rs->rs_list, &svcpt->scp_rep_idle); + spin_unlock(&svcpt->scp_rep_lock); + wake_up(&svcpt->scp_rep_waitq); +} + +int lustre_pack_reply_v2(struct ptlrpc_request *req, int count, + __u32 *lens, char **bufs, int flags) +{ + struct ptlrpc_reply_state *rs; + int msg_len, rc; + ENTRY; + + LASSERT(req->rq_reply_state == NULL); + LASSERT(count > 0); + + if ((flags & LPRFL_EARLY_REPLY) == 0) { + spin_lock(&req->rq_lock); + req->rq_packed_final = 1; + spin_unlock(&req->rq_lock); + } + + msg_len = lustre_msg_size_v2(count, lens); + rc = sptlrpc_svc_alloc_rs(req, msg_len); + if (rc) + RETURN(rc); + + rs = req->rq_reply_state; + atomic_set(&rs->rs_refcount, 1); /* 1 ref for rq_reply_state */ + rs->rs_cb_id.cbid_fn = reply_out_callback; + rs->rs_cb_id.cbid_arg = rs; + rs->rs_svcpt = req->rq_rqbd->rqbd_svcpt; + INIT_LIST_HEAD(&rs->rs_exp_list); + INIT_LIST_HEAD(&rs->rs_obd_list); + INIT_LIST_HEAD(&rs->rs_list); + spin_lock_init(&rs->rs_lock); + + req->rq_replen = msg_len; + req->rq_reply_state = rs; + req->rq_repmsg = rs->rs_msg; + + lustre_init_msg_v2(rs->rs_msg, count, lens, bufs); + lustre_msg_add_version(rs->rs_msg, PTLRPC_MSG_VERSION); + + PTLRPC_RS_DEBUG_LRU_ADD(rs); + + RETURN(0); +} +EXPORT_SYMBOL(lustre_pack_reply_v2); + +int lustre_pack_reply_flags(struct ptlrpc_request *req, int count, __u32 *lens, + char **bufs, int flags) +{ + int rc = 0; + __u32 size[] = { sizeof(struct ptlrpc_body) }; + + if (!lens) { + LASSERT(count == 1); + lens = size; + } + + LASSERT(count > 0); + LASSERT(lens[MSG_PTLRPC_BODY_OFF] == sizeof(struct ptlrpc_body)); + + switch (req->rq_reqmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + rc = lustre_pack_reply_v2(req, count, lens, bufs, flags); + break; + default: + LASSERTF(0, "incorrect message magic: %08x\n", + req->rq_reqmsg->lm_magic); + rc = -EINVAL; + } + if (rc != 0) + CERROR("lustre_pack_reply failed: rc=%d size=%d\n", rc, + lustre_msg_size(req->rq_reqmsg->lm_magic, count, lens)); + return rc; +} + +int lustre_pack_reply(struct ptlrpc_request *req, int count, __u32 *lens, + char **bufs) +{ + return lustre_pack_reply_flags(req, count, lens, bufs, 0); +} +EXPORT_SYMBOL(lustre_pack_reply); + +void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, __u32 n, __u32 min_size) +{ + __u32 i, offset, buflen, bufcount; + + LASSERT(m != NULL); + LASSERT(m->lm_bufcount > 0); + + bufcount = m->lm_bufcount; + if (unlikely(n >= bufcount)) { + CDEBUG(D_INFO, "msg %p buffer[%d] not present (count %d)\n", + m, n, bufcount); + return NULL; + } + + buflen = m->lm_buflens[n]; + if (unlikely(buflen < min_size)) { + CERROR("msg %p buffer[%d] size %d too small " + "(required %d, opc=%d)\n", m, n, buflen, min_size, + n == MSG_PTLRPC_BODY_OFF ? 
-1 : lustre_msg_get_opc(m));
+		return NULL;
+	}
+
+	offset = lustre_msg_hdr_size_v2(bufcount);
+	for (i = 0; i < n; i++)
+		offset += cfs_size_round(m->lm_buflens[i]);
+
+	return (char *)m + offset;
+}
+
+void *lustre_msg_buf(struct lustre_msg *m, __u32 n, __u32 min_size)
+{
+	switch (m->lm_magic) {
+	case LUSTRE_MSG_MAGIC_V2:
+		return lustre_msg_buf_v2(m, n, min_size);
+	default:
+		LASSERTF(0, "incorrect message magic: %08x (msg:%p)\n",
+			 m->lm_magic, m);
+		return NULL;
+	}
+}
+EXPORT_SYMBOL(lustre_msg_buf);
+
+static int lustre_shrink_msg_v2(struct lustre_msg_v2 *msg, __u32 segment,
+				unsigned int newlen, int move_data)
+{
+	char *tail = NULL, *newpos;
+	int tail_len = 0, n;
+
+	LASSERT(msg);
+	LASSERT(msg->lm_bufcount > segment);
+	LASSERT(msg->lm_buflens[segment] >= newlen);
+
+	if (msg->lm_buflens[segment] == newlen)
+		goto out;
+
+	if (move_data && msg->lm_bufcount > segment + 1) {
+		tail = lustre_msg_buf_v2(msg, segment + 1, 0);
+		for (n = segment + 1; n < msg->lm_bufcount; n++)
+			tail_len += cfs_size_round(msg->lm_buflens[n]);
+	}
+
+	msg->lm_buflens[segment] = newlen;
+
+	if (tail && tail_len) {
+		newpos = lustre_msg_buf_v2(msg, segment + 1, 0);
+		LASSERT(newpos <= tail);
+		if (newpos != tail)
+			memmove(newpos, tail, tail_len);
+	}
+out:
+	return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
+}
+
+/*
+ * for @msg, shrink @segment to size @newlen. if @move_data is non-zero,
+ * we also move data forward from @segment + 1.
+ *
+ * if @newlen == 0, we remove the segment completely, but we still keep the
+ * total bufcount the same to avoid moving data around. this leaves an
+ * unused segment of size 0 at the tail, but that's ok.
+ *
+ * return the new msg size after shrinking.
+ *
+ * CAUTION:
+ * + if any buffer higher than @segment has been filled in, shrink must be
+ *   called with non-zero @move_data.
+ * + the caller should NOT keep pointers to msg buffers higher than @segment
+ *   after calling shrink.
+ */
+int lustre_shrink_msg(struct lustre_msg *msg, int segment,
+		      unsigned int newlen, int move_data)
+{
+	switch (msg->lm_magic) {
+	case LUSTRE_MSG_MAGIC_V2:
+		return lustre_shrink_msg_v2(msg, segment, newlen, move_data);
+	default:
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+	}
+}
+EXPORT_SYMBOL(lustre_shrink_msg);
+
+static int lustre_grow_msg_v2(struct lustre_msg_v2 *msg, __u32 segment,
+			      unsigned int newlen)
+{
+	char *tail = NULL, *newpos;
+	int tail_len = 0, n;
+
+	LASSERT(msg);
+	LASSERT(msg->lm_bufcount > segment);
+	LASSERT(msg->lm_buflens[segment] <= newlen);
+
+	if (msg->lm_buflens[segment] == newlen)
+		goto out;
+
+	if (msg->lm_bufcount > segment + 1) {
+		tail = lustre_msg_buf_v2(msg, segment + 1, 0);
+		for (n = segment + 1; n < msg->lm_bufcount; n++)
+			tail_len += cfs_size_round(msg->lm_buflens[n]);
+	}
+
+	msg->lm_buflens[segment] = newlen;
+
+	if (tail && tail_len) {
+		newpos = lustre_msg_buf_v2(msg, segment + 1, 0);
+		memmove(newpos, tail, tail_len);
+	}
+out:
+	return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
+}
+
+/*
+ * for @msg, grow @segment to size @newlen.
+ * Always move the higher buffers forward.
+ *
+ * return the new msg size after growing.
+ *
+ * CAUTION:
+ * - the caller must make sure there is enough space in the allocated
+ *   message buffer
+ * - the caller should NOT keep pointers to msg buffers higher than @segment
+ *   after calling grow.
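+ * - the tail is moved with memmove(), so overlapping source and destination
+ *   ranges are handled safely.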
+ */ +int lustre_grow_msg(struct lustre_msg *msg, int segment, unsigned int newlen) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_grow_msg_v2(msg, segment, newlen); + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_grow_msg); + +void lustre_free_reply_state(struct ptlrpc_reply_state *rs) +{ + PTLRPC_RS_DEBUG_LRU_DEL(rs); + + LASSERT(atomic_read(&rs->rs_refcount) == 0); + LASSERT(!rs->rs_difficult || rs->rs_handled); + LASSERT(!rs->rs_difficult || rs->rs_unlinked); + LASSERT(!rs->rs_scheduled); + LASSERT(rs->rs_export == NULL); + LASSERT(rs->rs_nlocks == 0); + LASSERT(list_empty(&rs->rs_exp_list)); + LASSERT(list_empty(&rs->rs_obd_list)); + + sptlrpc_svc_free_rs(rs); +} + +static int lustre_unpack_msg_v2(struct lustre_msg_v2 *m, int len) +{ + int swabbed, required_len, i, buflen; + + /* Now we know the sender speaks my language. */ + required_len = lustre_msg_hdr_size_v2(0); + if (len < required_len) { + /* can't even look inside the message */ + CERROR("message length %d too small for lustre_msg\n", len); + return -EINVAL; + } + + swabbed = (m->lm_magic == LUSTRE_MSG_MAGIC_V2_SWABBED); + + if (swabbed) { + __swab32s(&m->lm_magic); + __swab32s(&m->lm_bufcount); + __swab32s(&m->lm_secflvr); + __swab32s(&m->lm_repsize); + __swab32s(&m->lm_cksum); + __swab32s(&m->lm_flags); + BUILD_BUG_ON(offsetof(typeof(*m), lm_padding_2) == 0); + BUILD_BUG_ON(offsetof(typeof(*m), lm_padding_3) == 0); + } + + if (m->lm_bufcount == 0 || m->lm_bufcount > PTLRPC_MAX_BUFCOUNT) { + CERROR("message bufcount %d is not valid\n", m->lm_bufcount); + return -EINVAL; + } + required_len = lustre_msg_hdr_size_v2(m->lm_bufcount); + if (len < required_len) { + /* didn't receive all the buffer lengths */ + CERROR("message length %d too small for %d buflens\n", + len, m->lm_bufcount); + return -EINVAL; + } + + for (i = 0; i < m->lm_bufcount; i++) { + if (swabbed) + __swab32s(&m->lm_buflens[i]); + buflen = cfs_size_round(m->lm_buflens[i]); + if (buflen < 0 || buflen > PTLRPC_MAX_BUFLEN) { + CERROR("buffer %d length %d is not valid\n", i, buflen); + return -EINVAL; + } + required_len += buflen; + } + if (len < required_len || required_len > PTLRPC_MAX_BUFLEN) { + CERROR("len: %d, required_len %d, bufcount: %d\n", + len, required_len, m->lm_bufcount); + for (i = 0; i < m->lm_bufcount; i++) + CERROR("buffer %d length %d\n", i, m->lm_buflens[i]); + return -EINVAL; + } + + return swabbed; +} + +int __lustre_unpack_msg(struct lustre_msg *m, int len) +{ + int required_len, rc; + + ENTRY; + /* + * We can provide a slightly better error log, if we check the + * message magic and version first. In the future, struct + * lustre_msg may grow, and we'd like to log a version mismatch, + * rather than a short message. 
+ */ + required_len = offsetof(struct lustre_msg, lm_magic) + + sizeof(m->lm_magic); + if (len < required_len) { + /* can't even look inside the message */ + CERROR("message length %d too small for magic/version check\n", + len); + RETURN(-EINVAL); + } + + rc = lustre_unpack_msg_v2(m, len); + + RETURN(rc); +} +EXPORT_SYMBOL(__lustre_unpack_msg); + +int ptlrpc_unpack_req_msg(struct ptlrpc_request *req, int len) +{ + int rc; + + rc = __lustre_unpack_msg(req->rq_reqmsg, len); + if (rc == 1) { + req_capsule_set_req_swabbed(&req->rq_pill, + MSG_PTLRPC_HEADER_OFF); + rc = 0; + } + return rc; +} + +int ptlrpc_unpack_rep_msg(struct ptlrpc_request *req, int len) +{ + int rc; + + rc = __lustre_unpack_msg(req->rq_repmsg, len); + if (rc == 1) { + req_capsule_set_rep_swabbed(&req->rq_pill, + MSG_PTLRPC_HEADER_OFF); + rc = 0; + } + return rc; +} + +static inline int +lustre_unpack_ptlrpc_body_v2(struct ptlrpc_request *req, + enum req_location loc, int offset) +{ + struct ptlrpc_body *pb; + struct lustre_msg_v2 *m; + + m = loc == RCL_CLIENT ? req->rq_reqmsg : req->rq_repmsg; + + pb = lustre_msg_buf_v2(m, offset, sizeof(struct ptlrpc_body_v2)); + if (!pb) { + CERROR("error unpacking ptlrpc body\n"); + return -EFAULT; + } + if (req_capsule_need_swab(&req->rq_pill, loc, offset)) { + lustre_swab_ptlrpc_body(pb); + req_capsule_set_swabbed(&req->rq_pill, loc, offset); + } + + if ((pb->pb_version & ~LUSTRE_VERSION_MASK) != PTLRPC_MSG_VERSION) { + CERROR("wrong lustre_msg version %08x\n", pb->pb_version); + return -EINVAL; + } + + if (loc == RCL_SERVER) + pb->pb_status = ptlrpc_status_ntoh(pb->pb_status); + + return 0; +} + +int lustre_unpack_req_ptlrpc_body(struct ptlrpc_request *req, int offset) +{ + switch (req->rq_reqmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_unpack_ptlrpc_body_v2(req, RCL_CLIENT, offset); + default: + CERROR("bad lustre msg magic: %08x\n", + req->rq_reqmsg->lm_magic); + return -EINVAL; + } +} + +int lustre_unpack_rep_ptlrpc_body(struct ptlrpc_request *req, int offset) +{ + switch (req->rq_repmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_unpack_ptlrpc_body_v2(req, RCL_SERVER, offset); + default: + CERROR("bad lustre msg magic: %08x\n", + req->rq_repmsg->lm_magic); + return -EINVAL; + } +} + +static inline __u32 lustre_msg_buflen_v2(struct lustre_msg_v2 *m, __u32 n) +{ + if (n >= m->lm_bufcount) + return 0; + + return m->lm_buflens[n]; +} + +/** + * lustre_msg_buflen - return the length of buffer \a n in message \a m + * \param m lustre_msg (request or reply) to look at + * \param n message index (base 0) + * + * returns zero for non-existent message indices + */ +__u32 lustre_msg_buflen(struct lustre_msg *m, __u32 n) +{ + switch (m->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_buflen_v2(m, n); + default: + CERROR("incorrect message magic: %08x\n", m->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_buflen); + +static inline void +lustre_msg_set_buflen_v2(struct lustre_msg_v2 *m, __u32 n, __u32 len) +{ + if (n >= m->lm_bufcount) + LBUG(); + + m->lm_buflens[n] = len; +} + +void lustre_msg_set_buflen(struct lustre_msg *m, __u32 n, __u32 len) +{ + switch (m->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + lustre_msg_set_buflen_v2(m, n, len); + return; + default: + LASSERTF(0, "incorrect message magic: %08x\n", m->lm_magic); + } +} + +/* + * NB return the bufcount for lustre_msg_v2 format, so if message is packed + * in V1 format, the result is one bigger. (add struct ptlrpc_body). 
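+ * Only LUSTRE_MSG_MAGIC_V2 messages are handled here; any other magic logs
+ * an error and yields a bufcount of 0.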
+ */ +__u32 lustre_msg_bufcount(struct lustre_msg *m) +{ + switch (m->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return m->lm_bufcount; + default: + CERROR("incorrect message magic: %08x\n", m->lm_magic); + return 0; + } +} + +char *lustre_msg_string(struct lustre_msg *m, __u32 index, __u32 max_len) +{ + /* max_len == 0 means the string should fill the buffer */ + char *str; + __u32 slen, blen; + + switch (m->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + str = lustre_msg_buf_v2(m, index, 0); + blen = lustre_msg_buflen_v2(m, index); + break; + default: + LASSERTF(0, "incorrect message magic: %08x\n", m->lm_magic); + } + + if (str == NULL) { + CERROR("can't unpack string in msg %p buffer[%d]\n", m, index); + return NULL; + } + + slen = strnlen(str, blen); + + if (slen == blen) { /* not NULL terminated */ + CERROR("can't unpack non-NULL terminated string in msg %p buffer[%d] len %d\n", + m, index, blen); + return NULL; + } + if (blen > PTLRPC_MAX_BUFLEN) { + CERROR("buffer length of msg %p buffer[%d] is invalid(%d)\n", + m, index, blen); + return NULL; + } + + if (max_len == 0) { + if (slen != blen - 1) { + CERROR("can't unpack short string in msg %p buffer[%d] len %d: strlen %d\n", + m, index, blen, slen); + return NULL; + } + } else if (slen > max_len) { + CERROR("can't unpack oversized string in msg %p buffer[%d] len %d strlen %d: max %d expected\n", + m, index, blen, slen, max_len); + return NULL; + } + + return str; +} + +/* Wrap up the normal fixed length cases */ +static inline void *__lustre_swab_buf(struct lustre_msg *msg, __u32 index, + __u32 min_size, void *swabber) +{ + void *ptr = NULL; + + LASSERT(msg != NULL); + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + ptr = lustre_msg_buf_v2(msg, index, min_size); + break; + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + } + + if (ptr != NULL && swabber != NULL) + ((void (*)(void *))swabber)(ptr); + + return ptr; +} + +static inline struct ptlrpc_body *lustre_msg_ptlrpc_body(struct lustre_msg *msg) +{ + return lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, + sizeof(struct ptlrpc_body_v2)); +} + +enum lustre_msghdr lustre_msghdr_get_flags(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + /* already in host endian */ + return msg->lm_flags; + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msghdr_get_flags); + +void lustre_msghdr_set_flags(struct lustre_msg *msg, __u32 flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + msg->lm_flags = flags; + return; + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +__u32 lustre_msg_get_flags(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb != NULL) + return pb->pb_flags; + + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + } + fallthrough; + default: + /* + * flags might be printed in debug code while message + * uninitialized + */ + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_flags); + +void lustre_msg_add_flags(struct lustre_msg *msg, __u32 flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_flags |= flags; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_add_flags); + +void lustre_msg_set_flags(struct lustre_msg 
*msg, __u32 flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_flags = flags; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_clear_flags(struct lustre_msg *msg, __u32 flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_flags &= ~flags; + + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_clear_flags); + +__u32 lustre_msg_get_op_flags(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb != NULL) + return pb->pb_op_flags; + + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + } + fallthrough; + default: + return 0; + } +} + +void lustre_msg_add_op_flags(struct lustre_msg *msg, __u32 flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_op_flags |= flags; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_add_op_flags); + +struct lustre_handle *lustre_msg_get_handle(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return NULL; + } + return &pb->pb_handle; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return NULL; + } +} + +__u32 lustre_msg_get_type(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return PTL_RPC_MSG_ERR; + } + return pb->pb_type; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return PTL_RPC_MSG_ERR; + } +} +EXPORT_SYMBOL(lustre_msg_get_type); + +enum lustre_msg_version lustre_msg_get_version(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_version; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +void lustre_msg_add_version(struct lustre_msg *msg, __u32 version) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_version |= version; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +__u32 lustre_msg_get_opc(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_opc; + } + default: + CERROR("incorrect message magic: %08x (msg:%p)\n", + msg->lm_magic, msg); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_opc); + +__u64 lustre_msg_get_last_xid(struct 
lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_last_xid; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_last_xid); + +__u16 lustre_msg_get_tag(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_tag; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_tag); + +__u64 lustre_msg_get_last_committed(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_last_committed; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_last_committed); + +__u64 *lustre_msg_get_versions(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return NULL; + } + return pb->pb_pre_versions; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return NULL; + } +} +EXPORT_SYMBOL(lustre_msg_get_versions); + +__u64 lustre_msg_get_transno(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_transno; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_transno); + +int lustre_msg_get_status(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb != NULL) + return pb->pb_status; + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + } + fallthrough; + default: + /* + * status might be printed in debug code while message + * uninitialized + */ + return -EINVAL; + } +} +EXPORT_SYMBOL(lustre_msg_get_status); + +__u64 lustre_msg_get_slv(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return -EINVAL; + } + return pb->pb_slv; + } + default: + CERROR("invalid msg magic %08x\n", msg->lm_magic); + return -EINVAL; + } +} + + +void lustre_msg_set_slv(struct lustre_msg *msg, __u64 slv) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return; + } + pb->pb_slv = slv; + return; + } + default: + CERROR("invalid msg magic %x\n", msg->lm_magic); + return; + } +} + +__u32 lustre_msg_get_limit(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return -EINVAL; + } + return 
pb->pb_limit;
+	}
+	default:
+		CERROR("invalid msg magic %x\n", msg->lm_magic);
+		return -EINVAL;
+	}
+}
+
+
+void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit)
+{
+	switch (msg->lm_magic) {
+	case LUSTRE_MSG_MAGIC_V2: {
+		struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+		if (pb == NULL) {
+			CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+			return;
+		}
+		pb->pb_limit = limit;
+		return;
+	}
+	default:
+		CERROR("invalid msg magic %08x\n", msg->lm_magic);
+		return;
+	}
+}
+
+__u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg)
+{
+	switch (msg->lm_magic) {
+	case LUSTRE_MSG_MAGIC_V2: {
+		struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+		if (pb == NULL) {
+			CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+			return 0;
+		}
+		return pb->pb_conn_cnt;
+	}
+	default:
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+		return 0;
+	}
+}
+EXPORT_SYMBOL(lustre_msg_get_conn_cnt);
+
+__u32 lustre_msg_get_magic(struct lustre_msg *msg)
+{
+	switch (msg->lm_magic) {
+	case LUSTRE_MSG_MAGIC_V2:
+		return msg->lm_magic;
+	default:
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+		return 0;
+	}
+}
+
+timeout_t lustre_msg_get_timeout(struct lustre_msg *msg)
+{
+	switch (msg->lm_magic) {
+	case LUSTRE_MSG_MAGIC_V2: {
+		struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+
+		if (pb == NULL) {
+			CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+			return 0;
+		}
+		return pb->pb_timeout;
+	}
+	default:
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+		return 0;
+	}
+}
+
+timeout_t lustre_msg_get_service_timeout(struct lustre_msg *msg)
+{
+	switch (msg->lm_magic) {
+	case LUSTRE_MSG_MAGIC_V2: {
+		struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+
+		if (pb == NULL) {
+			CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+			return 0;
+		}
+		return pb->pb_service_time;
+	}
+	default:
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+		return 0;
+	}
+}
+
+char *lustre_msg_get_jobid(struct lustre_msg *msg)
+{
+	switch (msg->lm_magic) {
+	case LUSTRE_MSG_MAGIC_V2: {
+		struct ptlrpc_body *pb;
+
+		/* the old ptlrpc_body_v2 is smaller; doesn't include jobid */
+		if (msg->lm_buflens[MSG_PTLRPC_BODY_OFF] <
+		    sizeof(struct ptlrpc_body))
+			return NULL;
+
+		pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF,
+				       sizeof(struct ptlrpc_body));
+		if (!pb)
+			return NULL;
+
+		/* If clients send unterminated jobids, terminate them here
+		 * so that there is no chance of string overflow later.
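+		 * Note this writes the terminating NUL into the message
+		 * buffer itself.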
+ */ + if (unlikely(pb->pb_jobid[LUSTRE_JOBID_SIZE - 1] != '\0')) + pb->pb_jobid[LUSTRE_JOBID_SIZE - 1] = '\0'; + + return pb->pb_jobid; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return NULL; + } +} +EXPORT_SYMBOL(lustre_msg_get_jobid); + +__u32 lustre_msg_get_cksum(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return msg->lm_cksum; + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +__u64 lustre_msg_get_mbits(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_mbits; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +__u32 lustre_msg_calc_cksum(struct lustre_msg *msg, __u32 buf) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_buf_v2(msg, buf, 0); + __u32 len = lustre_msg_buflen(msg, buf); + __u32 crc; + +#if IS_ENABLED(CONFIG_CRC32) + /* about 10x faster than crypto_hash for small buffers */ + crc = crc32_le(~(__u32)0, (unsigned char *)pb, len); +#elif IS_ENABLED(CONFIG_CRYPTO_CRC32) + unsigned int hsize = 4; + + cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32, (unsigned char *)pb, + len, NULL, 0, (unsigned char *)&crc, + &hsize); +#else +#error "need either CONFIG_CRC32 or CONFIG_CRYPTO_CRC32 enabled in the kernel" +#endif + return crc; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +void lustre_msg_set_handle(struct lustre_msg *msg, struct lustre_handle *handle) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_handle = *handle; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_type(struct lustre_msg *msg, __u32 type) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_type = type; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_opc = opc; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_last_xid(struct lustre_msg *msg, __u64 last_xid) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_last_xid = last_xid; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_last_xid); + +void lustre_msg_set_tag(struct lustre_msg *msg, __u16 tag) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_tag = tag; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_tag); + 
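+/*
+ * The lustre_msg_set_* helpers below follow the same pattern as the getters
+ * above: dispatch on lm_magic, locate the ptlrpc_body, and update a single
+ * field of the message in place.
+ */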
+void lustre_msg_set_last_committed(struct lustre_msg *msg, __u64 last_committed)
+{
+	switch (msg->lm_magic) {
+	case LUSTRE_MSG_MAGIC_V2: {
+		struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+		LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg);
+		pb->pb_last_committed = last_committed;
+		return;
+	}
+	default:
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+	}
+}
+
+void lustre_msg_set_versions(struct lustre_msg *msg, __u64 *versions)
+{
+	switch (msg->lm_magic) {
+	case LUSTRE_MSG_MAGIC_V2: {
+		struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+		LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg);
+		pb->pb_pre_versions[0] = versions[0];
+		pb->pb_pre_versions[1] = versions[1];
+		pb->pb_pre_versions[2] = versions[2];
+		pb->pb_pre_versions[3] = versions[3];
+		return;
+	}
+	default:
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+	}
+}
+EXPORT_SYMBOL(lustre_msg_set_versions);
+
+void lustre_msg_set_transno(struct lustre_msg *msg, __u64 transno)
+{
+	switch (msg->lm_magic) {
+	case LUSTRE_MSG_MAGIC_V2: {
+		struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+		LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg);
+		pb->pb_transno = transno;
+		return;
+	}
+	default:
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+	}
+}
+EXPORT_SYMBOL(lustre_msg_set_transno);
+
+void lustre_msg_set_status(struct lustre_msg *msg, __u32 status)
+{
+	switch (msg->lm_magic) {
+	case LUSTRE_MSG_MAGIC_V2: {
+		struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+		LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg);
+		pb->pb_status = status;
+		return;
+	}
+	default:
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+	}
+}
+EXPORT_SYMBOL(lustre_msg_set_status);
+
+void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt)
+{
+	switch (msg->lm_magic) {
+	case LUSTRE_MSG_MAGIC_V2: {
+		struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+		LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg);
+		pb->pb_conn_cnt = conn_cnt;
+		return;
+	}
+	default:
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+	}
+}
+
+void lustre_msg_set_timeout(struct lustre_msg *msg, timeout_t timeout)
+{
+	switch (msg->lm_magic) {
+	case LUSTRE_MSG_MAGIC_V2: {
+		struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+
+		LASSERT(timeout >= 0);
+		LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg);
+		pb->pb_timeout = timeout;
+		return;
+	}
+	default:
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+	}
+}
+
+void lustre_msg_set_service_timeout(struct lustre_msg *msg,
+				    timeout_t service_timeout)
+{
+	switch (msg->lm_magic) {
+	case LUSTRE_MSG_MAGIC_V2: {
+		struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+
+		LASSERT(service_timeout >= 0);
+		LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+		pb->pb_service_time = service_timeout;
+		return;
+	}
+	default:
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+	}
+}
+
+void lustre_msg_set_jobid(struct lustre_msg *msg, char *jobid)
+{
+	switch (msg->lm_magic) {
+	case LUSTRE_MSG_MAGIC_V2: {
+		__u32 opc = lustre_msg_get_opc(msg);
+		struct ptlrpc_body *pb;
+
+		/* Don't set jobid for ldlm ast RPCs, they have been shrunk.
+		 * See the comment in ptlrpc_request_pack().
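+		 * Their ptlrpc_body was shrunk below the full size, so the
+		 * buffer lookup below would fail its size check.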
*/ + if (!opc || opc == LDLM_BL_CALLBACK || + opc == LDLM_CP_CALLBACK || opc == LDLM_GL_CALLBACK) + return; + + pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, + sizeof(struct ptlrpc_body)); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + + if (jobid != NULL) + memcpy(pb->pb_jobid, jobid, sizeof(pb->pb_jobid)); + else if (pb->pb_jobid[0] == '\0') + lustre_get_jobid(pb->pb_jobid, sizeof(pb->pb_jobid)); + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_jobid); + +void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + msg->lm_cksum = cksum; + return; + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_mbits(struct lustre_msg *msg, __u64 mbits) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_mbits = mbits; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void ptlrpc_request_set_replen(struct ptlrpc_request *req) +{ + int count = req_capsule_filled_sizes(&req->rq_pill, RCL_SERVER); + + req->rq_replen = lustre_msg_size(req->rq_reqmsg->lm_magic, count, + req->rq_pill.rc_area[RCL_SERVER]); + if (req->rq_reqmsg->lm_magic == LUSTRE_MSG_MAGIC_V2) + req->rq_reqmsg->lm_repsize = req->rq_replen; +} +EXPORT_SYMBOL(ptlrpc_request_set_replen); + +void ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, __u32 *lens) +{ + req->rq_replen = lustre_msg_size(req->rq_reqmsg->lm_magic, count, lens); + if (req->rq_reqmsg->lm_magic == LUSTRE_MSG_MAGIC_V2) + req->rq_reqmsg->lm_repsize = req->rq_replen; +} + +/** + * Send a remote set_info_async. + * + * This may go from client to server or server to client. + */ +int do_set_info_async(struct obd_import *imp, + int opcode, int version, + size_t keylen, void *key, + size_t vallen, void *val, + struct ptlrpc_request_set *set) +{ + struct ptlrpc_request *req; + char *tmp; + int rc; + + ENTRY; + + req = ptlrpc_request_alloc(imp, KEY_IS(KEY_CHANGELOG_CLEAR) ? + &RQF_MDT_SET_INFO : + &RQF_OBD_SET_INFO); + if (req == NULL) + RETURN(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY, + RCL_CLIENT, keylen); + req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL, + RCL_CLIENT, vallen); + rc = ptlrpc_request_pack(req, version, opcode); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + if (KEY_IS(KEY_CHANGELOG_CLEAR)) + do_pack_body(req); + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY); + memcpy(tmp, key, keylen); + tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_VAL); + memcpy(tmp, val, vallen); + + ptlrpc_request_set_replen(req); + + if (set) { + ptlrpc_set_add_req(set, req); + ptlrpc_check_set(NULL, set); + } else { + rc = ptlrpc_queue_wait(req); + ptlrpc_req_finished(req); + } + + RETURN(rc); +} +EXPORT_SYMBOL(do_set_info_async); + +/* byte flipping routines for all wire types declared in + * lustre_idl.h implemented here. 
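+ * Each helper swaps the byte order of every multi-byte field of one wire
+ * structure in place.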
+ */
+void lustre_swab_ptlrpc_body(struct ptlrpc_body *body)
+{
+	__swab32s(&body->pb_type);
+	__swab32s(&body->pb_version);
+	__swab32s(&body->pb_opc);
+	__swab32s(&body->pb_status);
+	__swab64s(&body->pb_last_xid);
+	__swab16s(&body->pb_tag);
+	BUILD_BUG_ON(offsetof(typeof(*body), pb_padding0) == 0);
+	BUILD_BUG_ON(offsetof(typeof(*body), pb_padding1) == 0);
+	__swab64s(&body->pb_last_committed);
+	__swab64s(&body->pb_transno);
+	__swab32s(&body->pb_flags);
+	__swab32s(&body->pb_op_flags);
+	__swab32s(&body->pb_conn_cnt);
+	__swab32s(&body->pb_timeout);
+	__swab32s(&body->pb_service_time);
+	__swab32s(&body->pb_limit);
+	__swab64s(&body->pb_slv);
+	__swab64s(&body->pb_pre_versions[0]);
+	__swab64s(&body->pb_pre_versions[1]);
+	__swab64s(&body->pb_pre_versions[2]);
+	__swab64s(&body->pb_pre_versions[3]);
+	__swab64s(&body->pb_mbits);
+	BUILD_BUG_ON(offsetof(typeof(*body), pb_padding64_0) == 0);
+	BUILD_BUG_ON(offsetof(typeof(*body), pb_padding64_1) == 0);
+	BUILD_BUG_ON(offsetof(typeof(*body), pb_padding64_2) == 0);
+	/*
+	 * While we need to maintain compatibility between
+	 * clients and servers without ptlrpc_body_v2 (< 2.3)
+	 * do not swab any fields beyond pb_jobid, as we are
+	 * using this swab function for both ptlrpc_body
+	 * and ptlrpc_body_v2.
+	 */
+	/* pb_jobid is an ASCII string and should not be swabbed */
+	BUILD_BUG_ON(offsetof(typeof(*body), pb_jobid) == 0);
+}
+
+void lustre_swab_connect(struct obd_connect_data *ocd)
+{
+	__swab64s(&ocd->ocd_connect_flags);
+	__swab32s(&ocd->ocd_version);
+	__swab32s(&ocd->ocd_grant);
+	__swab64s(&ocd->ocd_ibits_known);
+	__swab32s(&ocd->ocd_index);
+	__swab32s(&ocd->ocd_brw_size);
+	/*
+	 * ocd_blocksize and ocd_inodespace don't need to be swabbed because
+	 * they are single-byte values
+	 */
+	__swab16s(&ocd->ocd_grant_tax_kb);
+	__swab32s(&ocd->ocd_grant_max_blks);
+	__swab64s(&ocd->ocd_transno);
+	__swab32s(&ocd->ocd_group);
+	__swab32s(&ocd->ocd_cksum_types);
+	__swab32s(&ocd->ocd_instance);
+	/*
+	 * Fields after ocd_cksum_types are only accessible by the receiver
+	 * if the corresponding flag in ocd_connect_flags is set. Accessing
+	 * any field after ocd_maxbytes on the receiver without a valid flag
+	 * may result in out-of-bound memory access and kernel oops.
+ */ + if (ocd->ocd_connect_flags & OBD_CONNECT_MAX_EASIZE) + __swab32s(&ocd->ocd_max_easize); + if (ocd->ocd_connect_flags & OBD_CONNECT_MAXBYTES) + __swab64s(&ocd->ocd_maxbytes); + if (ocd->ocd_connect_flags & OBD_CONNECT_MULTIMODRPCS) + __swab16s(&ocd->ocd_maxmodrpcs); + BUILD_BUG_ON(offsetof(typeof(*ocd), padding0) == 0); + BUILD_BUG_ON(offsetof(typeof(*ocd), padding1) == 0); + if (ocd->ocd_connect_flags & OBD_CONNECT_FLAGS2) + __swab64s(&ocd->ocd_connect_flags2); + BUILD_BUG_ON(offsetof(typeof(*ocd), padding3) == 0); + BUILD_BUG_ON(offsetof(typeof(*ocd), padding4) == 0); + BUILD_BUG_ON(offsetof(typeof(*ocd), padding5) == 0); + BUILD_BUG_ON(offsetof(typeof(*ocd), padding6) == 0); + BUILD_BUG_ON(offsetof(typeof(*ocd), padding7) == 0); + BUILD_BUG_ON(offsetof(typeof(*ocd), padding8) == 0); + BUILD_BUG_ON(offsetof(typeof(*ocd), padding9) == 0); + BUILD_BUG_ON(offsetof(typeof(*ocd), paddingA) == 0); + BUILD_BUG_ON(offsetof(typeof(*ocd), paddingB) == 0); + BUILD_BUG_ON(offsetof(typeof(*ocd), paddingC) == 0); + BUILD_BUG_ON(offsetof(typeof(*ocd), paddingD) == 0); + BUILD_BUG_ON(offsetof(typeof(*ocd), paddingE) == 0); + BUILD_BUG_ON(offsetof(typeof(*ocd), paddingF) == 0); +} + +static void lustre_swab_ost_layout(struct ost_layout *ol) +{ + __swab32s(&ol->ol_stripe_size); + __swab32s(&ol->ol_stripe_count); + __swab64s(&ol->ol_comp_start); + __swab64s(&ol->ol_comp_end); + __swab32s(&ol->ol_comp_id); +} + +void lustre_swab_obdo(struct obdo *o) +{ + __swab64s(&o->o_valid); + lustre_swab_ost_id(&o->o_oi); + __swab64s(&o->o_parent_seq); + __swab64s(&o->o_size); + __swab64s(&o->o_mtime); + __swab64s(&o->o_atime); + __swab64s(&o->o_ctime); + __swab64s(&o->o_blocks); + __swab64s(&o->o_grant); + __swab32s(&o->o_blksize); + __swab32s(&o->o_mode); + __swab32s(&o->o_uid); + __swab32s(&o->o_gid); + __swab32s(&o->o_flags); + __swab32s(&o->o_nlink); + __swab32s(&o->o_parent_oid); + __swab32s(&o->o_misc); + __swab64s(&o->o_ioepoch); + __swab32s(&o->o_stripe_idx); + __swab32s(&o->o_parent_ver); + lustre_swab_ost_layout(&o->o_layout); + __swab32s(&o->o_layout_version); + __swab32s(&o->o_uid_h); + __swab32s(&o->o_gid_h); + __swab64s(&o->o_data_version); + __swab32s(&o->o_projid); + BUILD_BUG_ON(offsetof(typeof(*o), o_padding_4) == 0); + BUILD_BUG_ON(offsetof(typeof(*o), o_padding_5) == 0); + BUILD_BUG_ON(offsetof(typeof(*o), o_padding_6) == 0); + +} +EXPORT_SYMBOL(lustre_swab_obdo); + +void lustre_swab_obd_statfs(struct obd_statfs *os) +{ + __swab64s(&os->os_type); + __swab64s(&os->os_blocks); + __swab64s(&os->os_bfree); + __swab64s(&os->os_bavail); + __swab64s(&os->os_files); + __swab64s(&os->os_ffree); + /* no need to swab os_fsid */ + __swab32s(&os->os_bsize); + __swab32s(&os->os_namelen); + __swab64s(&os->os_maxbytes); + __swab32s(&os->os_state); + __swab32s(&os->os_fprecreated); + __swab32s(&os->os_granted); + BUILD_BUG_ON(offsetof(typeof(*os), os_spare3) == 0); + BUILD_BUG_ON(offsetof(typeof(*os), os_spare4) == 0); + BUILD_BUG_ON(offsetof(typeof(*os), os_spare5) == 0); + BUILD_BUG_ON(offsetof(typeof(*os), os_spare6) == 0); + BUILD_BUG_ON(offsetof(typeof(*os), os_spare7) == 0); + BUILD_BUG_ON(offsetof(typeof(*os), os_spare8) == 0); + BUILD_BUG_ON(offsetof(typeof(*os), os_spare9) == 0); +} + +void lustre_swab_obd_ioobj(struct obd_ioobj *ioo) +{ + lustre_swab_ost_id(&ioo->ioo_oid); + __swab32s(&ioo->ioo_max_brw); + __swab32s(&ioo->ioo_bufcnt); +} + +void lustre_swab_niobuf_remote(struct niobuf_remote *nbr) +{ + __swab64s(&nbr->rnb_offset); + __swab32s(&nbr->rnb_len); + __swab32s(&nbr->rnb_flags); +} + +void 
lustre_swab_ost_body(struct ost_body *b) +{ + lustre_swab_obdo(&b->oa); +} + +void lustre_swab_ost_last_id(u64 *id) +{ + __swab64s(id); +} + +void lustre_swab_generic_32s(__u32 *val) +{ + __swab32s(val); +} + +void lustre_swab_gl_lquota_desc(struct ldlm_gl_lquota_desc *desc) +{ + lustre_swab_lu_fid(&desc->gl_id.qid_fid); + __swab64s(&desc->gl_flags); + __swab64s(&desc->gl_ver); + __swab64s(&desc->gl_hardlimit); + __swab64s(&desc->gl_softlimit); + __swab64s(&desc->gl_time); + BUILD_BUG_ON(offsetof(typeof(*desc), gl_pad2) == 0); +} +EXPORT_SYMBOL(lustre_swab_gl_lquota_desc); + +void lustre_swab_gl_barrier_desc(struct ldlm_gl_barrier_desc *desc) +{ + __swab32s(&desc->lgbd_status); + __swab32s(&desc->lgbd_timeout); + BUILD_BUG_ON(offsetof(typeof(*desc), lgbd_padding) == 0); +} +EXPORT_SYMBOL(lustre_swab_gl_barrier_desc); + +void lustre_swab_ost_lvb_v1(struct ost_lvb_v1 *lvb) +{ + __swab64s(&lvb->lvb_size); + __swab64s(&lvb->lvb_mtime); + __swab64s(&lvb->lvb_atime); + __swab64s(&lvb->lvb_ctime); + __swab64s(&lvb->lvb_blocks); +} +EXPORT_SYMBOL(lustre_swab_ost_lvb_v1); + +void lustre_swab_ost_lvb(struct ost_lvb *lvb) +{ + __swab64s(&lvb->lvb_size); + __swab64s(&lvb->lvb_mtime); + __swab64s(&lvb->lvb_atime); + __swab64s(&lvb->lvb_ctime); + __swab64s(&lvb->lvb_blocks); + __swab32s(&lvb->lvb_mtime_ns); + __swab32s(&lvb->lvb_atime_ns); + __swab32s(&lvb->lvb_ctime_ns); + __swab32s(&lvb->lvb_padding); +} +EXPORT_SYMBOL(lustre_swab_ost_lvb); + +void lustre_swab_lquota_lvb(struct lquota_lvb *lvb) +{ + __swab64s(&lvb->lvb_flags); + __swab64s(&lvb->lvb_id_may_rel); + __swab64s(&lvb->lvb_id_rel); + __swab64s(&lvb->lvb_id_qunit); + __swab64s(&lvb->lvb_pad1); +} +EXPORT_SYMBOL(lustre_swab_lquota_lvb); + +void lustre_swab_barrier_lvb(struct barrier_lvb *lvb) +{ + __swab32s(&lvb->lvb_status); + __swab32s(&lvb->lvb_index); + BUILD_BUG_ON(offsetof(typeof(*lvb), lvb_padding) == 0); +} +EXPORT_SYMBOL(lustre_swab_barrier_lvb); + +void lustre_swab_mdt_body(struct mdt_body *b) +{ + lustre_swab_lu_fid(&b->mbo_fid1); + lustre_swab_lu_fid(&b->mbo_fid2); + /* handle is opaque */ + __swab64s(&b->mbo_valid); + __swab64s(&b->mbo_size); + __swab64s(&b->mbo_mtime); + __swab64s(&b->mbo_atime); + __swab64s(&b->mbo_ctime); + __swab64s(&b->mbo_blocks); + __swab64s(&b->mbo_version); + __swab64s(&b->mbo_t_state); + __swab32s(&b->mbo_fsuid); + __swab32s(&b->mbo_fsgid); + __swab32s(&b->mbo_capability); + __swab32s(&b->mbo_mode); + __swab32s(&b->mbo_uid); + __swab32s(&b->mbo_gid); + __swab32s(&b->mbo_flags); + __swab32s(&b->mbo_rdev); + __swab32s(&b->mbo_nlink); + __swab32s(&b->mbo_layout_gen); + __swab32s(&b->mbo_suppgid); + __swab32s(&b->mbo_eadatasize); + __swab32s(&b->mbo_aclsize); + __swab32s(&b->mbo_max_mdsize); + BUILD_BUG_ON(offsetof(typeof(*b), mbo_unused3) == 0); + __swab32s(&b->mbo_uid_h); + __swab32s(&b->mbo_gid_h); + __swab32s(&b->mbo_projid); + __swab64s(&b->mbo_dom_size); + __swab64s(&b->mbo_dom_blocks); + __swab64s(&b->mbo_btime); + BUILD_BUG_ON(offsetof(typeof(*b), mbo_padding_9) == 0); + BUILD_BUG_ON(offsetof(typeof(*b), mbo_padding_10) == 0); +} + +void lustre_swab_mdt_ioepoch(struct mdt_ioepoch *b) +{ + /* mio_open_handle is opaque */ + BUILD_BUG_ON(offsetof(typeof(*b), mio_unused1) == 0); + BUILD_BUG_ON(offsetof(typeof(*b), mio_unused2) == 0); + BUILD_BUG_ON(offsetof(typeof(*b), mio_padding) == 0); +} + +void lustre_swab_mgs_target_info(struct mgs_target_info *mti) +{ + int i; + + __swab32s(&mti->mti_lustre_ver); + __swab32s(&mti->mti_stripe_index); + __swab32s(&mti->mti_config_ver); + __swab32s(&mti->mti_flags); 
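+	/*
+	 * String fields such as mti_fsname, mti_svname and mti_params are
+	 * byte arrays and are deliberately left unswabbed; only the
+	 * fixed-width integer fields and the NID array below are
+	 * endian-sensitive.
+	 */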
+	__swab32s(&mti->mti_instance);
+	__swab32s(&mti->mti_nid_count);
+	BUILD_BUG_ON(sizeof(lnet_nid_t) != sizeof(__u64));
+	for (i = 0; i < MTI_NIDS_MAX; i++)
+		__swab64s(&mti->mti_nids[i]);
+}
+
+void lustre_swab_mgs_nidtbl_entry(struct mgs_nidtbl_entry *entry)
+{
+	__u8 i;
+
+	__swab64s(&entry->mne_version);
+	__swab32s(&entry->mne_instance);
+	__swab32s(&entry->mne_index);
+	__swab32s(&entry->mne_length);
+
+	/* mne_nid_count and mne_nid_type must be single-byte fields
+	 * because we access them without swabbing. */
+	BUILD_BUG_ON(sizeof(entry->mne_nid_count) != sizeof(__u8));
+	BUILD_BUG_ON(sizeof(entry->mne_nid_type) != sizeof(__u8));
+
+	/* remove this assertion once IPv6 is supported. */
+	LASSERT(entry->mne_nid_type == 0);
+	for (i = 0; i < entry->mne_nid_count; i++) {
+		BUILD_BUG_ON(sizeof(lnet_nid_t) != sizeof(__u64));
+		__swab64s(&entry->u.nids[i]);
+	}
+}
+EXPORT_SYMBOL(lustre_swab_mgs_nidtbl_entry);
+
+void lustre_swab_mgs_config_body(struct mgs_config_body *body)
+{
+	__swab64s(&body->mcb_offset);
+	__swab32s(&body->mcb_units);
+	__swab16s(&body->mcb_type);
+}
+
+void lustre_swab_mgs_config_res(struct mgs_config_res *body)
+{
+	__swab64s(&body->mcr_offset);
+	__swab64s(&body->mcr_size);
+}
+
+static void lustre_swab_obd_dqinfo(struct obd_dqinfo *i)
+{
+	__swab64s(&i->dqi_bgrace);
+	__swab64s(&i->dqi_igrace);
+	__swab32s(&i->dqi_flags);
+	__swab32s(&i->dqi_valid);
+}
+
+static void lustre_swab_obd_dqblk(struct obd_dqblk *b)
+{
+	__swab64s(&b->dqb_ihardlimit);
+	__swab64s(&b->dqb_isoftlimit);
+	__swab64s(&b->dqb_curinodes);
+	__swab64s(&b->dqb_bhardlimit);
+	__swab64s(&b->dqb_bsoftlimit);
+	__swab64s(&b->dqb_curspace);
+	__swab64s(&b->dqb_btime);
+	__swab64s(&b->dqb_itime);
+	__swab32s(&b->dqb_valid);
+	BUILD_BUG_ON(offsetof(typeof(*b), dqb_padding) == 0);
+}
+
+int lustre_swab_obd_quotactl(struct obd_quotactl *q, __u32 len)
+{
+	if (unlikely(len <= sizeof(struct obd_quotactl)))
+		return -EOVERFLOW;
+
+	__swab32s(&q->qc_cmd);
+	__swab32s(&q->qc_type);
+	__swab32s(&q->qc_id);
+	__swab32s(&q->qc_stat);
+	lustre_swab_obd_dqinfo(&q->qc_dqinfo);
+	lustre_swab_obd_dqblk(&q->qc_dqblk);
+
+	return len;
+}
+
+void lustre_swab_fid2path(struct getinfo_fid2path *gf)
+{
+	lustre_swab_lu_fid(&gf->gf_fid);
+	__swab64s(&gf->gf_recno);
+	__swab32s(&gf->gf_linkno);
+	__swab32s(&gf->gf_pathlen);
+}
+EXPORT_SYMBOL(lustre_swab_fid2path);
+
+static void lustre_swab_fiemap_extent(struct fiemap_extent *fm_extent)
+{
+	__swab64s(&fm_extent->fe_logical);
+	__swab64s(&fm_extent->fe_physical);
+	__swab64s(&fm_extent->fe_length);
+	__swab32s(&fm_extent->fe_flags);
+	__swab32s(&fm_extent->fe_device);
+}
+
+static void lustre_swab_fiemap_hdr(struct fiemap *fiemap)
+{
+	__swab64s(&fiemap->fm_start);
+	__swab64s(&fiemap->fm_length);
+	__swab32s(&fiemap->fm_flags);
+	__swab32s(&fiemap->fm_mapped_extents);
+	__swab32s(&fiemap->fm_extent_count);
+	__swab32s(&fiemap->fm_reserved);
+}
+
+int lustre_swab_fiemap(struct fiemap *fiemap, __u32 len)
+{
+	__u32 i, size, count;
+
+	lustre_swab_fiemap_hdr(fiemap);
+
+	size = fiemap_count_to_size(fiemap->fm_mapped_extents);
+	count = fiemap->fm_mapped_extents;
+	if (unlikely(size > len)) {
+		count = (len - sizeof(struct fiemap)) /
+			sizeof(struct fiemap_extent);
+		fiemap->fm_mapped_extents = count;
+		size = -EOVERFLOW;
+	}
+	/* still swab extents as we cannot yet pass rc to callers */
+	for (i = 0; i < count; i++)
+		lustre_swab_fiemap_extent(&fiemap->fm_extents[i]);
+
+	return size;
+}
+
+void lustre_swab_fiemap_info_key(struct ll_fiemap_info_key *fiemap_info)
+{
+
lustre_swab_obdo(&fiemap_info->lfik_oa); + lustre_swab_fiemap_hdr(&fiemap_info->lfik_fiemap); +} + +void lustre_swab_idx_info(struct idx_info *ii) +{ + __swab32s(&ii->ii_magic); + __swab32s(&ii->ii_flags); + __swab16s(&ii->ii_count); + __swab32s(&ii->ii_attrs); + lustre_swab_lu_fid(&ii->ii_fid); + __swab64s(&ii->ii_version); + __swab64s(&ii->ii_hash_start); + __swab64s(&ii->ii_hash_end); + __swab16s(&ii->ii_keysize); + __swab16s(&ii->ii_recsize); +} + +void lustre_swab_lip_header(struct lu_idxpage *lip) +{ + /* swab header */ + __swab32s(&lip->lip_magic); + __swab16s(&lip->lip_flags); + __swab16s(&lip->lip_nr); +} +EXPORT_SYMBOL(lustre_swab_lip_header); + +void lustre_swab_mdt_rec_reint (struct mdt_rec_reint *rr) +{ + __swab32s(&rr->rr_opcode); + __swab32s(&rr->rr_cap); + __swab32s(&rr->rr_fsuid); + /* rr_fsuid_h is unused */ + __swab32s(&rr->rr_fsgid); + /* rr_fsgid_h is unused */ + __swab32s(&rr->rr_suppgid1); + /* rr_suppgid1_h is unused */ + __swab32s(&rr->rr_suppgid2); + /* rr_suppgid2_h is unused */ + lustre_swab_lu_fid(&rr->rr_fid1); + lustre_swab_lu_fid(&rr->rr_fid2); + __swab64s(&rr->rr_mtime); + __swab64s(&rr->rr_atime); + __swab64s(&rr->rr_ctime); + __swab64s(&rr->rr_size); + __swab64s(&rr->rr_blocks); + __swab32s(&rr->rr_bias); + __swab32s(&rr->rr_mode); + __swab32s(&rr->rr_flags); + __swab32s(&rr->rr_flags_h); + __swab32s(&rr->rr_umask); + __swab16s(&rr->rr_mirror_id); + + BUILD_BUG_ON(offsetof(typeof(*rr), rr_padding_4) == 0); +}; + +void lustre_swab_lov_desc(struct lov_desc *ld) +{ + __swab32s(&ld->ld_tgt_count); + __swab32s(&ld->ld_active_tgt_count); + __swab32s(&ld->ld_default_stripe_count); + __swab32s(&ld->ld_pattern); + __swab64s(&ld->ld_default_stripe_size); + __swab64s(&ld->ld_default_stripe_offset); + __swab32s(&ld->ld_qos_maxage); + /* uuid endian insensitive */ +} +EXPORT_SYMBOL(lustre_swab_lov_desc); + +void lustre_swab_lmv_desc(struct lmv_desc *ld) +{ + __swab32s(&ld->ld_tgt_count); + __swab32s(&ld->ld_active_tgt_count); + __swab32s(&ld->ld_default_stripe_count); + __swab32s(&ld->ld_pattern); + __swab64s(&ld->ld_default_hash_size); + __swab32s(&ld->ld_qos_maxage); + /* uuid endian insensitive */ +} + +/* This structure is always in little-endian */ +static void lustre_swab_lmv_mds_md_v1(struct lmv_mds_md_v1 *lmm1) +{ + int i; + + __swab32s(&lmm1->lmv_magic); + __swab32s(&lmm1->lmv_stripe_count); + __swab32s(&lmm1->lmv_master_mdt_index); + __swab32s(&lmm1->lmv_hash_type); + __swab32s(&lmm1->lmv_layout_version); + for (i = 0; i < lmm1->lmv_stripe_count; i++) + lustre_swab_lu_fid(&lmm1->lmv_stripe_fids[i]); +} + +void lustre_swab_lmv_mds_md(union lmv_mds_md *lmm) +{ + switch (lmm->lmv_magic) { + case LMV_MAGIC_V1: + lustre_swab_lmv_mds_md_v1(&lmm->lmv_md_v1); + break; + default: + break; + } +} +EXPORT_SYMBOL(lustre_swab_lmv_mds_md); + +void lustre_swab_lmv_user_md_objects(struct lmv_user_mds_data *lmd, + int stripe_count) +{ + int i; + + for (i = 0; i < stripe_count; i++) + __swab32s(&(lmd[i].lum_mds)); +} +EXPORT_SYMBOL(lustre_swab_lmv_user_md_objects); + + +void lustre_swab_lmv_user_md(struct lmv_user_md *lum) +{ + __u32 count; + + if (lum->lum_magic == LMV_MAGIC_FOREIGN) { + __swab32s(&lum->lum_magic); + __swab32s(&((struct lmv_foreign_md *)lum)->lfm_length); + __swab32s(&((struct lmv_foreign_md *)lum)->lfm_type); + __swab32s(&((struct lmv_foreign_md *)lum)->lfm_flags); + return; + } + + count = lum->lum_stripe_count; + __swab32s(&lum->lum_magic); + __swab32s(&lum->lum_stripe_count); + __swab32s(&lum->lum_stripe_offset); + __swab32s(&lum->lum_hash_type); + 
__swab32s(&lum->lum_type); + /* lum_max_inherit and lum_max_inherit_rr do not need to be swabbed */ + BUILD_BUG_ON(offsetof(typeof(*lum), lum_padding1) == 0); + BUILD_BUG_ON(offsetof(typeof(*lum), lum_padding2) == 0); + BUILD_BUG_ON(offsetof(typeof(*lum), lum_padding3) == 0); + switch (lum->lum_magic) { + case LMV_USER_MAGIC_SPECIFIC: + count = lum->lum_stripe_count; + fallthrough; + case __swab32(LMV_USER_MAGIC_SPECIFIC): + lustre_swab_lmv_user_md_objects(lum->lum_objects, count); + break; + default: + break; + } +} +EXPORT_SYMBOL(lustre_swab_lmv_user_md); + +static void lustre_print_v1v3(unsigned int lvl, struct lov_user_md *lum, + const char *msg) +{ + CDEBUG(lvl, "%s lov_user_md %p:\n", msg, lum); + CDEBUG(lvl, "\tlmm_magic: %#x\n", lum->lmm_magic); + CDEBUG(lvl, "\tlmm_pattern: %#x\n", lum->lmm_pattern); + CDEBUG(lvl, "\tlmm_object_id: %llu\n", lmm_oi_id(&lum->lmm_oi)); + CDEBUG(lvl, "\tlmm_object_gr: %llu\n", lmm_oi_seq(&lum->lmm_oi)); + CDEBUG(lvl, "\tlmm_stripe_size: %#x\n", lum->lmm_stripe_size); + CDEBUG(lvl, "\tlmm_stripe_count: %#x\n", lum->lmm_stripe_count); + CDEBUG(lvl, "\tlmm_stripe_offset/lmm_layout_gen: %#x\n", + lum->lmm_stripe_offset); + if (lum->lmm_magic == LOV_USER_MAGIC_V3) { + struct lov_user_md_v3 *v3 = (void *)lum; + CDEBUG(lvl, "\tlmm_pool_name: %s\n", v3->lmm_pool_name); + } + if (lum->lmm_magic == LOV_USER_MAGIC_SPECIFIC) { + struct lov_user_md_v3 *v3 = (void *)lum; + int i; + + if (v3->lmm_pool_name[0] != '\0') + CDEBUG(lvl, "\tlmm_pool_name: %s\n", v3->lmm_pool_name); + + CDEBUG(lvl, "\ttarget list:\n"); + for (i = 0; i < v3->lmm_stripe_count; i++) + CDEBUG(lvl, "\t\t%u\n", v3->lmm_objects[i].l_ost_idx); + } +} + +void lustre_print_user_md(unsigned int lvl, struct lov_user_md *lum, + const char *msg) +{ + struct lov_comp_md_v1 *comp_v1; + int i; + + if (likely(!cfs_cdebug_show(lvl, DEBUG_SUBSYSTEM))) + return; + + if (lum->lmm_magic == LOV_USER_MAGIC_V1 || + lum->lmm_magic == LOV_USER_MAGIC_V3) { + lustre_print_v1v3(lvl, lum, msg); + return; + } + + if (lum->lmm_magic != LOV_USER_MAGIC_COMP_V1) { + CDEBUG(lvl, "%s: bad magic: %x\n", msg, lum->lmm_magic); + return; + } + + comp_v1 = (struct lov_comp_md_v1 *)lum; + CDEBUG(lvl, "%s: lov_comp_md_v1 %p:\n", msg, lum); + CDEBUG(lvl, "\tlcm_magic: %#x\n", comp_v1->lcm_magic); + CDEBUG(lvl, "\tlcm_size: %#x\n", comp_v1->lcm_size); + CDEBUG(lvl, "\tlcm_layout_gen: %#x\n", comp_v1->lcm_layout_gen); + CDEBUG(lvl, "\tlcm_flags: %#x\n", comp_v1->lcm_flags); + CDEBUG(lvl, "\tlcm_entry_count: %#x\n\n", comp_v1->lcm_entry_count); + CDEBUG(lvl, "\tlcm_mirror_count: %#x\n\n", comp_v1->lcm_mirror_count); + + for (i = 0; i < comp_v1->lcm_entry_count; i++) { + struct lov_comp_md_entry_v1 *ent = &comp_v1->lcm_entries[i]; + struct lov_user_md *v1; + + CDEBUG(lvl, "\tentry %d:\n", i); + CDEBUG(lvl, "\tlcme_id: %#x\n", ent->lcme_id); + CDEBUG(lvl, "\tlcme_flags: %#x\n", ent->lcme_flags); + if (ent->lcme_flags & LCME_FL_NOSYNC) + CDEBUG(lvl, "\tlcme_timestamp: %llu\n", + ent->lcme_timestamp); + CDEBUG(lvl, "\tlcme_extent.e_start: %llu\n", + ent->lcme_extent.e_start); + CDEBUG(lvl, "\tlcme_extent.e_end: %llu\n", + ent->lcme_extent.e_end); + CDEBUG(lvl, "\tlcme_offset: %#x\n", ent->lcme_offset); + CDEBUG(lvl, "\tlcme_size: %#x\n\n", ent->lcme_size); + + v1 = (struct lov_user_md *)((char *)comp_v1 + + comp_v1->lcm_entries[i].lcme_offset); + lustre_print_v1v3(lvl, v1, msg); + } +} +EXPORT_SYMBOL(lustre_print_user_md); + +static void lustre_swab_lmm_oi(struct ost_id *oi) +{ + __swab64s(&oi->oi.oi_id); + __swab64s(&oi->oi.oi_seq); +} + 
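+/*
+ * The lov_user_md swabbers below convert the fixed header first; the
+ * variable-length per-stripe objects that may follow are converted
+ * separately by lustre_swab_lov_user_md_objects() once the (possibly
+ * swabbed) stripe count is known.
+ */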
+static void lustre_swab_lov_user_md_common(struct lov_user_md_v1 *lum)
+{
+	ENTRY;
+	__swab32s(&lum->lmm_magic);
+	__swab32s(&lum->lmm_pattern);
+	lustre_swab_lmm_oi(&lum->lmm_oi);
+	__swab32s(&lum->lmm_stripe_size);
+	__swab16s(&lum->lmm_stripe_count);
+	__swab16s(&lum->lmm_stripe_offset);
+	EXIT;
+}
+
+void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum)
+{
+	ENTRY;
+	CDEBUG(D_IOCTL, "swabbing lov_user_md v1\n");
+	lustre_swab_lov_user_md_common(lum);
+	EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_lov_user_md_v1);
+
+void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum)
+{
+	ENTRY;
+	CDEBUG(D_IOCTL, "swabbing lov_user_md v3\n");
+	lustre_swab_lov_user_md_common((struct lov_user_md_v1 *)lum);
+	/* lmm_pool_name is a character array and needs no swabbing */
+	EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_lov_user_md_v3);
+
+void lustre_swab_lov_comp_md_v1(struct lov_comp_md_v1 *lum)
+{
+	struct lov_comp_md_entry_v1 *ent;
+	struct lov_user_md_v1 *v1;
+	struct lov_user_md_v3 *v3;
+	int i;
+	bool cpu_endian;
+	__u32 off, size;
+	__u16 ent_count, stripe_count;
+	ENTRY;
+
+	cpu_endian = lum->lcm_magic == LOV_USER_MAGIC_COMP_V1;
+	ent_count = lum->lcm_entry_count;
+	if (!cpu_endian)
+		__swab16s(&ent_count);
+
+	CDEBUG(D_IOCTL, "swabbing lov_user_comp_md v1\n");
+	__swab32s(&lum->lcm_magic);
+	__swab32s(&lum->lcm_size);
+	__swab32s(&lum->lcm_layout_gen);
+	__swab16s(&lum->lcm_flags);
+	__swab16s(&lum->lcm_entry_count);
+	__swab16s(&lum->lcm_mirror_count);
+	BUILD_BUG_ON(offsetof(typeof(*lum), lcm_padding1) == 0);
+	BUILD_BUG_ON(offsetof(typeof(*lum), lcm_padding2) == 0);
+
+	for (i = 0; i < ent_count; i++) {
+		ent = &lum->lcm_entries[i];
+		off = ent->lcme_offset;
+		size = ent->lcme_size;
+
+		if (!cpu_endian) {
+			__swab32s(&off);
+			__swab32s(&size);
+		}
+		__swab32s(&ent->lcme_id);
+		__swab32s(&ent->lcme_flags);
+		__swab64s(&ent->lcme_timestamp);
+		__swab64s(&ent->lcme_extent.e_start);
+		__swab64s(&ent->lcme_extent.e_end);
+		__swab32s(&ent->lcme_offset);
+		__swab32s(&ent->lcme_size);
+		__swab32s(&ent->lcme_layout_gen);
+		BUILD_BUG_ON(offsetof(typeof(*ent), lcme_padding_1) == 0);
+
+		v1 = (struct lov_user_md_v1 *)((char *)lum + off);
+		stripe_count = v1->lmm_stripe_count;
+		if (!cpu_endian)
+			__swab16s(&stripe_count);
+
+		if (v1->lmm_magic == __swab32(LOV_USER_MAGIC_V1) ||
+		    v1->lmm_magic == LOV_USER_MAGIC_V1) {
+			lustre_swab_lov_user_md_v1(v1);
+			if (size > sizeof(*v1))
+				lustre_swab_lov_user_md_objects(v1->lmm_objects,
+								stripe_count);
+		} else if (v1->lmm_magic == __swab32(LOV_USER_MAGIC_V3) ||
+			   v1->lmm_magic == LOV_USER_MAGIC_V3 ||
+			   v1->lmm_magic == __swab32(LOV_USER_MAGIC_SPECIFIC) ||
+			   v1->lmm_magic == LOV_USER_MAGIC_SPECIFIC) {
+			v3 = (struct lov_user_md_v3 *)v1;
+			lustre_swab_lov_user_md_v3(v3);
+			if (size > sizeof(*v3))
+				lustre_swab_lov_user_md_objects(v3->lmm_objects,
+								stripe_count);
+		} else {
+			CERROR("Invalid magic %#x\n", v1->lmm_magic);
+		}
+	}
+}
+EXPORT_SYMBOL(lustre_swab_lov_comp_md_v1);
+
+void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod,
+				     int stripe_count)
+{
+	int i;
+
+	ENTRY;
+	for (i = 0; i < stripe_count; i++) {
+		lustre_swab_ost_id(&(lod[i].l_ost_oi));
+		__swab32s(&(lod[i].l_ost_gen));
+		__swab32s(&(lod[i].l_ost_idx));
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_lov_user_md_objects);
+
+void lustre_swab_lov_user_md(struct lov_user_md *lum, size_t size)
+{
+	struct lov_user_md_v1 *v1;
+	struct lov_user_md_v3 *v3;
+	struct lov_foreign_md *lfm;
+	__u16 stripe_count;
+	ENTRY;
+
+	CDEBUG(D_IOCTL, "swabbing lov_user_md\n");
+	switch (lum->lmm_magic) {
+	case __swab32(LOV_MAGIC_V1):
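+	/* the V1 magic may arrive in either byte order; both labels
+	 * share the handler below */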
+ case LOV_USER_MAGIC_V1: + { + v1 = (struct lov_user_md_v1 *)lum; + stripe_count = v1->lmm_stripe_count; + + if (lum->lmm_magic != LOV_USER_MAGIC_V1) + __swab16s(&stripe_count); + + lustre_swab_lov_user_md_v1(v1); + if (size > sizeof(*v1)) + lustre_swab_lov_user_md_objects(v1->lmm_objects, + stripe_count); + + break; + } + case __swab32(LOV_MAGIC_V3): + case LOV_USER_MAGIC_V3: + { + v3 = (struct lov_user_md_v3 *)lum; + stripe_count = v3->lmm_stripe_count; + + if (lum->lmm_magic != LOV_USER_MAGIC_V3) + __swab16s(&stripe_count); + + lustre_swab_lov_user_md_v3(v3); + if (size > sizeof(*v3)) + lustre_swab_lov_user_md_objects(v3->lmm_objects, + stripe_count); + break; + } + case __swab32(LOV_USER_MAGIC_SPECIFIC): + case LOV_USER_MAGIC_SPECIFIC: + { + v3 = (struct lov_user_md_v3 *)lum; + stripe_count = v3->lmm_stripe_count; + + if (lum->lmm_magic != LOV_USER_MAGIC_SPECIFIC) + __swab16s(&stripe_count); + + lustre_swab_lov_user_md_v3(v3); + lustre_swab_lov_user_md_objects(v3->lmm_objects, stripe_count); + break; + } + case __swab32(LOV_MAGIC_COMP_V1): + case LOV_USER_MAGIC_COMP_V1: + lustre_swab_lov_comp_md_v1((struct lov_comp_md_v1 *)lum); + break; + case __swab32(LOV_MAGIC_FOREIGN): + case LOV_USER_MAGIC_FOREIGN: + { + lfm = (struct lov_foreign_md *)lum; + __swab32s(&lfm->lfm_magic); + __swab32s(&lfm->lfm_length); + __swab32s(&lfm->lfm_type); + __swab32s(&lfm->lfm_flags); + break; + } + default: + CDEBUG(D_IOCTL, "Invalid LOV magic %08x\n", lum->lmm_magic); + } +} +EXPORT_SYMBOL(lustre_swab_lov_user_md); + +void lustre_swab_lov_mds_md(struct lov_mds_md *lmm) +{ + ENTRY; + CDEBUG(D_IOCTL, "swabbing lov_mds_md\n"); + __swab32s(&lmm->lmm_magic); + __swab32s(&lmm->lmm_pattern); + lustre_swab_lmm_oi(&lmm->lmm_oi); + __swab32s(&lmm->lmm_stripe_size); + __swab16s(&lmm->lmm_stripe_count); + __swab16s(&lmm->lmm_layout_gen); + EXIT; +} +EXPORT_SYMBOL(lustre_swab_lov_mds_md); + +void lustre_swab_ldlm_res_id(struct ldlm_res_id *id) +{ + int i; + + for (i = 0; i < RES_NAME_SIZE; i++) + __swab64s(&id->name[i]); +} + +void lustre_swab_ldlm_policy_data(union ldlm_wire_policy_data *d) +{ + /* the lock data is a union and the first two fields are always an + * extent so it's ok to process an LDLM_EXTENT and LDLM_FLOCK lock + * data the same way. 
+ */ + __swab64s(&d->l_extent.start); + __swab64s(&d->l_extent.end); + __swab64s(&d->l_extent.gid); + __swab64s(&d->l_flock.lfw_owner); + __swab32s(&d->l_flock.lfw_pid); +} + +void lustre_swab_ldlm_intent(struct ldlm_intent *i) +{ + __swab64s(&i->opc); +} + +void lustre_swab_ldlm_resource_desc(struct ldlm_resource_desc *r) +{ + __swab32s(&r->lr_type); + BUILD_BUG_ON(offsetof(typeof(*r), lr_pad) == 0); + lustre_swab_ldlm_res_id(&r->lr_name); +} + +void lustre_swab_ldlm_lock_desc(struct ldlm_lock_desc *l) +{ + lustre_swab_ldlm_resource_desc(&l->l_resource); + __swab32s(&l->l_req_mode); + __swab32s(&l->l_granted_mode); + lustre_swab_ldlm_policy_data(&l->l_policy_data); +} + +void lustre_swab_ldlm_request(struct ldlm_request *rq) +{ + __swab32s(&rq->lock_flags); + lustre_swab_ldlm_lock_desc(&rq->lock_desc); + __swab32s(&rq->lock_count); + /* lock_handle[] opaque */ +} + +void lustre_swab_ldlm_reply(struct ldlm_reply *r) +{ + __swab32s(&r->lock_flags); + BUILD_BUG_ON(offsetof(typeof(*r), lock_padding) == 0); + lustre_swab_ldlm_lock_desc(&r->lock_desc); + /* lock_handle opaque */ + __swab64s(&r->lock_policy_res1); + __swab64s(&r->lock_policy_res2); +} + +void lustre_swab_quota_body(struct quota_body *b) +{ + lustre_swab_lu_fid(&b->qb_fid); + lustre_swab_lu_fid((struct lu_fid *)&b->qb_id); + __swab32s(&b->qb_flags); + __swab64s(&b->qb_count); + __swab64s(&b->qb_usage); + __swab64s(&b->qb_slv_ver); +} + +/* Dump functions */ +void dump_ioo(struct obd_ioobj *ioo) +{ + CDEBUG(D_RPCTRACE, + "obd_ioobj: ioo_oid="DOSTID", ioo_max_brw=%#x, " + "ioo_bufct=%d\n", POSTID(&ioo->ioo_oid), ioo->ioo_max_brw, + ioo->ioo_bufcnt); +} + +void dump_rniobuf(struct niobuf_remote *nb) +{ + CDEBUG(D_RPCTRACE, "niobuf_remote: offset=%llu, len=%d, flags=%x\n", + nb->rnb_offset, nb->rnb_len, nb->rnb_flags); +} + +void dump_obdo(struct obdo *oa) +{ + u64 valid = oa->o_valid; + + CDEBUG(D_RPCTRACE, "obdo: o_valid = %#llx\n", valid); + if (valid & OBD_MD_FLID) + CDEBUG(D_RPCTRACE, "obdo: id = "DOSTID"\n", POSTID(&oa->o_oi)); + if (valid & OBD_MD_FLFID) + CDEBUG(D_RPCTRACE, "obdo: o_parent_seq = %#llx\n", + oa->o_parent_seq); + if (valid & OBD_MD_FLSIZE) + CDEBUG(D_RPCTRACE, "obdo: o_size = %lld\n", oa->o_size); + if (valid & OBD_MD_FLMTIME) + CDEBUG(D_RPCTRACE, "obdo: o_mtime = %lld\n", oa->o_mtime); + if (valid & OBD_MD_FLATIME) + CDEBUG(D_RPCTRACE, "obdo: o_atime = %lld\n", oa->o_atime); + if (valid & OBD_MD_FLCTIME) + CDEBUG(D_RPCTRACE, "obdo: o_ctime = %lld\n", oa->o_ctime); + if (valid & OBD_MD_FLBLOCKS) /* allocation of space */ + CDEBUG(D_RPCTRACE, "obdo: o_blocks = %lld\n", oa->o_blocks); + if (valid & OBD_MD_FLGRANT) + CDEBUG(D_RPCTRACE, "obdo: o_grant = %lld\n", oa->o_grant); + if (valid & OBD_MD_FLBLKSZ) + CDEBUG(D_RPCTRACE, "obdo: o_blksize = %d\n", oa->o_blksize); + if (valid & (OBD_MD_FLTYPE | OBD_MD_FLMODE)) + CDEBUG(D_RPCTRACE, "obdo: o_mode = %o\n", + oa->o_mode & ((valid & OBD_MD_FLTYPE ? S_IFMT : 0) | + (valid & OBD_MD_FLMODE ? 
~S_IFMT : 0))); + if (valid & OBD_MD_FLUID) + CDEBUG(D_RPCTRACE, "obdo: o_uid = %u\n", oa->o_uid); + if (valid & OBD_MD_FLUID) + CDEBUG(D_RPCTRACE, "obdo: o_uid_h = %u\n", oa->o_uid_h); + if (valid & OBD_MD_FLGID) + CDEBUG(D_RPCTRACE, "obdo: o_gid = %u\n", oa->o_gid); + if (valid & OBD_MD_FLGID) + CDEBUG(D_RPCTRACE, "obdo: o_gid_h = %u\n", oa->o_gid_h); + if (valid & OBD_MD_FLFLAGS) + CDEBUG(D_RPCTRACE, "obdo: o_flags = %x\n", oa->o_flags); + if (valid & OBD_MD_FLNLINK) + CDEBUG(D_RPCTRACE, "obdo: o_nlink = %u\n", oa->o_nlink); + else if (valid & OBD_MD_FLCKSUM) + CDEBUG(D_RPCTRACE, "obdo: o_checksum (o_nlink) = %u\n", + oa->o_nlink); + if (valid & OBD_MD_FLPARENT) + CDEBUG(D_RPCTRACE, "obdo: o_parent_oid = %x\n", + oa->o_parent_oid); + if (valid & OBD_MD_FLFID) { + CDEBUG(D_RPCTRACE, "obdo: o_stripe_idx = %u\n", + oa->o_stripe_idx); + CDEBUG(D_RPCTRACE, "obdo: o_parent_ver = %x\n", + oa->o_parent_ver); + } + if (valid & OBD_MD_FLHANDLE) + CDEBUG(D_RPCTRACE, "obdo: o_handle = %lld\n", + oa->o_handle.cookie); +} + +void dump_ost_body(struct ost_body *ob) +{ + dump_obdo(&ob->oa); +} + +void dump_rcs(__u32 *rc) +{ + CDEBUG(D_RPCTRACE, "rmf_rcs: %d\n", *rc); +} + +static inline int req_ptlrpc_body_swabbed(struct ptlrpc_request *req) +{ + LASSERT(req->rq_reqmsg); + + switch (req->rq_reqmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return req_capsule_req_swabbed(&req->rq_pill, + MSG_PTLRPC_BODY_OFF); + default: + CERROR("bad lustre msg magic: %#08X\n", + req->rq_reqmsg->lm_magic); + } + return 0; +} + +static inline int rep_ptlrpc_body_swabbed(struct ptlrpc_request *req) +{ + if (unlikely(!req->rq_repmsg)) + return 0; + + switch (req->rq_repmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return req_capsule_rep_swabbed(&req->rq_pill, + MSG_PTLRPC_BODY_OFF); + default: + /* uninitialized yet */ + return 0; + } +} + +void _debug_req(struct ptlrpc_request *req, + struct libcfs_debug_msg_data *msgdata, const char *fmt, ...) +{ + bool req_ok = req->rq_reqmsg != NULL; + bool rep_ok = false; + struct lnet_nid *nid = NULL; + struct va_format vaf; + va_list args; + int rep_flags = -1; + int rep_status = -1; + + spin_lock(&req->rq_early_free_lock); + if (req->rq_repmsg) + rep_ok = true; + + if (req_capsule_req_need_swab(&req->rq_pill)) { + req_ok = req_ok && req_ptlrpc_body_swabbed(req); + rep_ok = rep_ok && rep_ptlrpc_body_swabbed(req); + } + + if (rep_ok) { + rep_flags = lustre_msg_get_flags(req->rq_repmsg); + rep_status = lustre_msg_get_status(req->rq_repmsg); + } + spin_unlock(&req->rq_early_free_lock); + + if (req->rq_import && req->rq_import->imp_connection) + nid = &req->rq_import->imp_connection->c_peer.nid; + else if (req->rq_export && req->rq_export->exp_connection) + nid = &req->rq_export->exp_connection->c_peer.nid; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + libcfs_debug_msg(msgdata, + "%pV req@%p x%llu/t%lld(%lld) o%d->%s@%s:%d/%d lens %d/%d e %d to %lld dl %lld ref %d fl " REQ_FLAGS_FMT "/%x/%x rc %d/%d job:'%s'\n", + &vaf, + req, req->rq_xid, req->rq_transno, + req_ok ? lustre_msg_get_transno(req->rq_reqmsg) : 0, + req_ok ? lustre_msg_get_opc(req->rq_reqmsg) : -1, + req->rq_import ? + req->rq_import->imp_obd->obd_name : + req->rq_export ? + req->rq_export->exp_client_uuid.uuid : + "", + nid ? libcfs_nidstr(nid) : "", + req->rq_request_portal, req->rq_reply_portal, + req->rq_reqlen, req->rq_replen, + req->rq_early_count, (s64)req->rq_timedout, + (s64)req->rq_deadline, + atomic_read(&req->rq_refcount), + DEBUG_REQ_FLAGS(req), + req_ok ? 
lustre_msg_get_flags(req->rq_reqmsg) : -1,
+			 rep_flags, req->rq_status, rep_status,
+			 req_ok ? lustre_msg_get_jobid(req->rq_reqmsg) ?: ""
+				: "");
+	va_end(args);
+}
+EXPORT_SYMBOL(_debug_req);
+
+void lustre_swab_hsm_user_state(struct hsm_user_state *state)
+{
+	__swab32s(&state->hus_states);
+	__swab32s(&state->hus_archive_id);
+}
+
+void lustre_swab_hsm_state_set(struct hsm_state_set *hss)
+{
+	__swab32s(&hss->hss_valid);
+	__swab64s(&hss->hss_setmask);
+	__swab64s(&hss->hss_clearmask);
+	__swab32s(&hss->hss_archive_id);
+}
+
+static void lustre_swab_hsm_extent(struct hsm_extent *extent)
+{
+	__swab64s(&extent->offset);
+	__swab64s(&extent->length);
+}
+
+void lustre_swab_hsm_current_action(struct hsm_current_action *action)
+{
+	__swab32s(&action->hca_state);
+	__swab32s(&action->hca_action);
+	lustre_swab_hsm_extent(&action->hca_location);
+}
+
+void lustre_swab_hsm_user_item(struct hsm_user_item *hui)
+{
+	lustre_swab_lu_fid(&hui->hui_fid);
+	lustre_swab_hsm_extent(&hui->hui_extent);
+}
+
+void lustre_swab_lu_extent(struct lu_extent *le)
+{
+	__swab64s(&le->e_start);
+	__swab64s(&le->e_end);
+}
+
+void lustre_swab_layout_intent(struct layout_intent *li)
+{
+	__swab32s(&li->li_opc);
+	__swab32s(&li->li_flags);
+	lustre_swab_lu_extent(&li->li_extent);
+}
+
+void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk)
+{
+	lustre_swab_lu_fid(&hpk->hpk_fid);
+	__swab64s(&hpk->hpk_cookie);
+	__swab64s(&hpk->hpk_extent.offset);
+	__swab64s(&hpk->hpk_extent.length);
+	__swab16s(&hpk->hpk_flags);
+	__swab16s(&hpk->hpk_errval);
+}
+
+void lustre_swab_hsm_request(struct hsm_request *hr)
+{
+	__swab32s(&hr->hr_action);
+	__swab32s(&hr->hr_archive_id);
+	__swab64s(&hr->hr_flags);
+	__swab32s(&hr->hr_itemcount);
+	__swab32s(&hr->hr_data_len);
+}
+
+void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl)
+{
+	__swab64s(&msl->msl_flags);
+}
+
+void lustre_swab_close_data(struct close_data *cd)
+{
+	lustre_swab_lu_fid(&cd->cd_fid);
+	__swab64s(&cd->cd_data_version);
+}
+
+void lustre_swab_close_data_resync_done(struct close_data_resync_done *resync)
+{
+	int i;
+
+	__swab32s(&resync->resync_count);
+	/* after swabbing, resync_count must be in CPU endianness */
+	if (resync->resync_count <= INLINE_RESYNC_ARRAY_SIZE) {
+		for (i = 0; i < resync->resync_count; i++)
+			__swab32s(&resync->resync_ids_inline[i]);
+	}
+}
+EXPORT_SYMBOL(lustre_swab_close_data_resync_done);
+
+void lustre_swab_lfsck_request(struct lfsck_request *lr)
+{
+	__swab32s(&lr->lr_event);
+	__swab32s(&lr->lr_index);
+	__swab32s(&lr->lr_flags);
+	__swab32s(&lr->lr_valid);
+	__swab32s(&lr->lr_speed);
+	__swab16s(&lr->lr_version);
+	__swab16s(&lr->lr_active);
+	__swab16s(&lr->lr_param);
+	__swab16s(&lr->lr_async_windows);
+	lustre_swab_lu_fid(&lr->lr_fid);
+	lustre_swab_lu_fid(&lr->lr_fid2);
+	__swab32s(&lr->lr_comp_id);
+	BUILD_BUG_ON(offsetof(typeof(*lr), lr_padding_0) == 0);
+	BUILD_BUG_ON(offsetof(typeof(*lr), lr_padding_1) == 0);
+	BUILD_BUG_ON(offsetof(typeof(*lr), lr_padding_2) == 0);
+	BUILD_BUG_ON(offsetof(typeof(*lr), lr_padding_3) == 0);
+}
+
+void lustre_swab_lfsck_reply(struct lfsck_reply *lr)
+{
+	__swab32s(&lr->lr_status);
+	BUILD_BUG_ON(offsetof(typeof(*lr), lr_padding_1) == 0);
+	__swab64s(&lr->lr_repaired);
+}
+
+static void lustre_swab_orphan_rec(struct lu_orphan_rec *rec)
+{
+	lustre_swab_lu_fid(&rec->lor_fid);
+	__swab32s(&rec->lor_uid);
+	__swab32s(&rec->lor_gid);
+}
+
+void lustre_swab_orphan_ent(struct lu_orphan_ent *ent)
+{
+	lustre_swab_lu_fid(&ent->loe_key);
+
lustre_swab_orphan_rec(&ent->loe_rec); +} +EXPORT_SYMBOL(lustre_swab_orphan_ent); + +void lustre_swab_orphan_ent_v2(struct lu_orphan_ent_v2 *ent) +{ + lustre_swab_lu_fid(&ent->loe_key); + lustre_swab_orphan_rec(&ent->loe_rec.lor_rec); + lustre_swab_ost_layout(&ent->loe_rec.lor_layout); + BUILD_BUG_ON(offsetof(typeof(ent->loe_rec), lor_padding) == 0); +} +EXPORT_SYMBOL(lustre_swab_orphan_ent_v2); + +void lustre_swab_orphan_ent_v3(struct lu_orphan_ent_v3 *ent) +{ + lustre_swab_lu_fid(&ent->loe_key); + lustre_swab_orphan_rec(&ent->loe_rec.lor_rec); + lustre_swab_ost_layout(&ent->loe_rec.lor_layout); + __swab32s(&ent->loe_rec.lor_layout_version); + __swab32s(&ent->loe_rec.lor_range); + BUILD_BUG_ON(offsetof(typeof(ent->loe_rec), lor_padding_1) == 0); + BUILD_BUG_ON(offsetof(typeof(ent->loe_rec), lor_padding_2) == 0); +} +EXPORT_SYMBOL(lustre_swab_orphan_ent_v3); + +void lustre_swab_ladvise(struct lu_ladvise *ladvise) +{ + __swab16s(&ladvise->lla_advice); + __swab16s(&ladvise->lla_value1); + __swab32s(&ladvise->lla_value2); + __swab64s(&ladvise->lla_start); + __swab64s(&ladvise->lla_end); + __swab32s(&ladvise->lla_value3); + __swab32s(&ladvise->lla_value4); +} +EXPORT_SYMBOL(lustre_swab_ladvise); + +void lustre_swab_ladvise_hdr(struct ladvise_hdr *ladvise_hdr) +{ + __swab32s(&ladvise_hdr->lah_magic); + __swab32s(&ladvise_hdr->lah_count); + __swab64s(&ladvise_hdr->lah_flags); + __swab32s(&ladvise_hdr->lah_value1); + __swab32s(&ladvise_hdr->lah_value2); + __swab64s(&ladvise_hdr->lah_value3); +} +EXPORT_SYMBOL(lustre_swab_ladvise_hdr); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/pack_server.c b/drivers/staging/lustrefsx/lustre/ptlrpc/pack_server.c new file mode 100644 index 0000000000000..ec1c20dbef3b7 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/pack_server.c @@ -0,0 +1,137 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * GPL HEADER END + */ +/* + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lustre/ptlrpc/pack_server.c
+ *
+ * (Un)packing of OUT (object update) requests
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <obd_support.h>
+#include <obd_class.h>
+
+void lustre_swab_object_update(struct object_update *ou)
+{
+	struct object_update_param *param;
+	size_t i;
+
+	__swab16s(&ou->ou_type);
+	__swab16s(&ou->ou_params_count);
+	__swab32s(&ou->ou_result_size);
+	__swab32s(&ou->ou_flags);
+	__swab32s(&ou->ou_padding1);
+	__swab64s(&ou->ou_batchid);
+	lustre_swab_lu_fid(&ou->ou_fid);
+	param = &ou->ou_params[0];
+	for (i = 0; i < ou->ou_params_count; i++) {
+		__swab16s(&param->oup_len);
+		__swab16s(&param->oup_padding);
+		__swab32s(&param->oup_padding2);
+		param = (struct object_update_param *)((char *)param +
+			 object_update_param_size(param));
+	}
+}
+
+int lustre_swab_object_update_request(struct object_update_request *our,
+				      __u32 len)
+{
+	__u32 i, size = 0;
+	struct object_update *ou;
+
+	__swab32s(&our->ourq_magic);
+	__swab16s(&our->ourq_count);
+	__swab16s(&our->ourq_padding);
+
+	/* Don't need to calculate request size if len is 0. */
+	if (len > 0) {
+		size = sizeof(struct object_update_request);
+		for (i = 0; i < our->ourq_count; i++) {
+			ou = object_update_request_get(our, i, NULL);
+			if (ou == NULL)
+				return -EPROTO;
+			size += sizeof(struct object_update) +
+				ou->ou_params_count *
+				sizeof(struct object_update_param);
+		}
+		if (unlikely(size > len))
+			return -EOVERFLOW;
+	}
+
+	for (i = 0; i < our->ourq_count; i++) {
+		ou = object_update_request_get(our, i, NULL);
+		lustre_swab_object_update(ou);
+	}
+
+	return size;
+}
+
+void lustre_swab_object_update_result(struct object_update_result *our)
+{
+	__swab32s(&our->our_rc);
+	__swab16s(&our->our_datalen);
+	__swab16s(&our->our_padding);
+}
+
+int lustre_swab_object_update_reply(struct object_update_reply *our, __u32 len)
+{
+	__u32 i, size;
+
+	__swab32s(&our->ourp_magic);
+	__swab16s(&our->ourp_count);
+	__swab16s(&our->ourp_padding);
+
+	size = sizeof(struct object_update_reply) + our->ourp_count *
+	       (sizeof(__u16) + sizeof(struct object_update_result));
+	if (unlikely(size > len))
+		return -EOVERFLOW;
+
+	for (i = 0; i < our->ourp_count; i++) {
+		struct object_update_result *ourp;
+
+		__swab16s(&our->ourp_lens[i]);
+		ourp = object_update_result_get(our, i, NULL);
+		if (ourp == NULL)
+			return -EPROTO;
+		lustre_swab_object_update_result(ourp);
+	}
+
+	return size;
+}
+
+void lustre_swab_out_update_header(struct out_update_header *ouh)
+{
+	__swab32s(&ouh->ouh_magic);
+	__swab32s(&ouh->ouh_count);
+	__swab32s(&ouh->ouh_inline_length);
+	__swab32s(&ouh->ouh_reply_size);
+}
+EXPORT_SYMBOL(lustre_swab_out_update_header);
+
+void lustre_swab_out_update_buffer(struct out_update_buffer *oub)
+{
+	__swab32s(&oub->oub_size);
+	__swab32s(&oub->oub_padding);
+}
+EXPORT_SYMBOL(lustre_swab_out_update_buffer);
diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/pers.c b/drivers/staging/lustrefsx/lustre/ptlrpc/pers.c
new file mode 100644
index 0000000000000..973f2b5ad0d74
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/ptlrpc/pers.c
@@ -0,0 +1,73 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2015, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_lib.h>
+#include <lustre_ha.h>
+#include <lustre_import.h>
+
+#include "ptlrpc_internal.h"
+
+void ptlrpc_fill_bulk_md(struct lnet_md *md, struct ptlrpc_bulk_desc *desc,
+			 int mdidx)
+{
+	unsigned int start = desc->bd_mds_off[mdidx];
+
+	BUILD_BUG_ON(PTLRPC_MAX_BRW_PAGES >= LI_POISON);
+
+	LASSERT(mdidx < desc->bd_md_max_brw);
+	LASSERT(desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES);
+
+	/* just send an LNet header */
+	if (mdidx >= desc->bd_md_count) {
+		md->options |= LNET_MD_KIOV;
+		md->length = 0;
+		md->start = NULL;
+		return;
+	}
+
+	if (mdidx == (desc->bd_md_count - 1))
+		md->length = desc->bd_iov_count - start;
+	else
+		md->length = desc->bd_mds_off[mdidx + 1] - start;
+
+	md->options |= LNET_MD_KIOV;
+	if (desc->bd_enc_vec)
+		md->start = &desc->bd_enc_vec[start];
+	else
+		md->start = &desc->bd_vec[start];
+}
+
diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/pinger.c b/drivers/staging/lustrefsx/lustre/ptlrpc/pinger.c
new file mode 100644
index 0000000000000..72825b2ad24e0
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/ptlrpc/pinger.c
@@ -0,0 +1,571 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2015, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lustre/ptlrpc/pinger.c
+ *
+ * Portal-RPC reconnection and replay operations, for use in recovery.
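+ *
+ * The pinger periodically sends OBD_PING requests on each import
+ * registered with it, both to detect unresponsive peers and to keep
+ * otherwise idle connections alive.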
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <linux/workqueue.h>
+#include <libcfs/libcfs.h>
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include "ptlrpc_internal.h"
+
+static int suppress_pings;
+module_param(suppress_pings, int, 0644);
+MODULE_PARM_DESC(suppress_pings, "Suppress pings");
+
+struct mutex pinger_mutex;
+static struct list_head pinger_imports =
+		LIST_HEAD_INIT(pinger_imports);
+
+int ptlrpc_pinger_suppress_pings(void)
+{
+	return suppress_pings;
+}
+EXPORT_SYMBOL(ptlrpc_pinger_suppress_pings);
+
+struct ptlrpc_request *
+ptlrpc_prep_ping(struct obd_import *imp)
+{
+	struct ptlrpc_request *req;
+
+	req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING,
+					LUSTRE_OBD_VERSION, OBD_PING);
+	if (req) {
+		ptlrpc_request_set_replen(req);
+		req->rq_no_resend = req->rq_no_delay = 1;
+	}
+	return req;
+}
+
+int ptlrpc_obd_ping(struct obd_device *obd)
+{
+	int rc;
+	struct ptlrpc_request *req;
+	struct obd_import *imp;
+
+	ENTRY;
+
+	with_imp_locked(obd, imp, rc) {
+		req = ptlrpc_prep_ping(imp);
+		if (!req) {
+			rc = -ENOMEM;
+			continue;
+		}
+		req->rq_send_state = LUSTRE_IMP_FULL;
+		rc = ptlrpc_queue_wait(req);
+		ptlrpc_req_finished(req);
+	}
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_obd_ping);
+
+static bool ptlrpc_check_import_is_idle(struct obd_import *imp)
+{
+	struct ldlm_namespace *ns = imp->imp_obd->obd_namespace;
+	time64_t now;
+
+	if (!imp->imp_idle_timeout)
+		return false;
+
+	if (atomic_read(&imp->imp_reqs) > 0)
+		return false;
+
+	/* any held lock increases ns_bref, marking the namespace as a
+	 * resource holder */
+	if (ns && atomic_read(&ns->ns_bref) > 0)
+		return false;
+
+	now = ktime_get_real_seconds();
+	if (now - imp->imp_last_reply_time < imp->imp_idle_timeout)
+		return false;
+
+	return true;
+}
+
+static void ptlrpc_update_next_ping(struct obd_import *imp, int soon)
+{
+#ifdef CONFIG_LUSTRE_FS_PINGER
+	time64_t time = soon ? PING_INTERVAL_SHORT : PING_INTERVAL;
+
+	if (imp->imp_state == LUSTRE_IMP_DISCON) {
+		time64_t dtime = max_t(time64_t, CONNECTION_SWITCH_MIN,
+				       AT_OFF ? 0 :
+				       at_get(&imp->imp_at.iat_net_latency));
+		time = min(time, dtime);
+	}
+	imp->imp_next_ping = ktime_get_seconds() + time;
+#endif /* CONFIG_LUSTRE_FS_PINGER */
+}
+
+static int ptlrpc_ping(struct obd_import *imp)
+{
+	struct ptlrpc_request *req;
+
+	ENTRY;
+
+	if (ptlrpc_check_import_is_idle(imp) &&
+	    ptlrpc_disconnect_and_idle_import(imp) == 1)
+		RETURN(0);
+
+	req = ptlrpc_prep_ping(imp);
+	if (!req) {
+		CERROR("OOM trying to ping %s->%s\n",
+		       imp->imp_obd->obd_uuid.uuid,
+		       obd2cli_tgt(imp->imp_obd));
+		RETURN(-ENOMEM);
+	}
+
+	DEBUG_REQ(D_INFO, req, "pinging %s->%s",
+		  imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd));
+	/* Update imp_next_ping early so that pinger_check_timeout sees
+	 * an accurate time for the next wakeup: the request_out_callback
+	 * update happens in another thread, and ptlrpc_pinger_main may
+	 * already be sleeping by then.
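+	 * A spuriously early ping is harmless; sleeping past
+	 * imp_next_ping would delay detection of a dead peer.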
+ */ + ptlrpc_update_next_ping(imp, 0); + ptlrpcd_add_req(req); + + RETURN(0); +} + +void ptlrpc_ping_import_soon(struct obd_import *imp) +{ + imp->imp_next_ping = ktime_get_seconds(); +} + +static inline int imp_is_deactive(struct obd_import *imp) +{ + return imp->imp_deactive || + OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_IMP_DEACTIVE); +} + +static inline time64_t ptlrpc_next_reconnect(struct obd_import *imp) +{ + return ktime_get_seconds() + INITIAL_CONNECT_TIMEOUT; +} + +static timeout_t pinger_check_timeout(time64_t time) +{ + timeout_t timeout = PING_INTERVAL; + timeout_t next_timeout; + time64_t now; + struct list_head *iter; + struct obd_import *imp; + + mutex_lock(&pinger_mutex); + now = ktime_get_seconds(); + /* Process imports to find a nearest next ping */ + list_for_each(iter, &pinger_imports) { + imp = list_entry(iter, struct obd_import, imp_pinger_chain); + if (!imp->imp_pingable || imp->imp_next_ping < now) + continue; + next_timeout = imp->imp_next_ping - now; + /* make sure imp_next_ping in the future from time */ + if (next_timeout > (now - time) && timeout > next_timeout) + timeout = next_timeout; + } + mutex_unlock(&pinger_mutex); + + return timeout - (now - time); +} + +static bool ir_up; + +void ptlrpc_pinger_ir_up(void) +{ + CDEBUG(D_HA, "IR up\n"); + ir_up = true; +} +EXPORT_SYMBOL(ptlrpc_pinger_ir_up); + +void ptlrpc_pinger_ir_down(void) +{ + CDEBUG(D_HA, "IR down\n"); + ir_up = false; +} +EXPORT_SYMBOL(ptlrpc_pinger_ir_down); + +static void ptlrpc_pinger_process_import(struct obd_import *imp, + time64_t this_ping) +{ + int level; + int force; + int force_next; + int suppress; + + spin_lock(&imp->imp_lock); + + level = imp->imp_state; + force = imp->imp_force_verify; + force_next = imp->imp_force_next_verify; + /* + * This will be used below only if the import is "FULL". + */ + suppress = ir_up && OCD_HAS_FLAG(&imp->imp_connect_data, PINGLESS); + + imp->imp_force_verify = 0; + + if (imp->imp_next_ping - 5 >= this_ping && !force) { + spin_unlock(&imp->imp_lock); + return; + } + + imp->imp_force_next_verify = 0; + + CDEBUG(level == LUSTRE_IMP_FULL ? 
D_INFO : D_HA, + "%s->%s: level %s/%u force %u force_next %u deactive %u pingable %u suppress %u\n", + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd), + ptlrpc_import_state_name(level), level, force, force_next, + imp->imp_deactive, imp->imp_pingable, suppress); + + if (level == LUSTRE_IMP_DISCON && !imp_is_deactive(imp)) { + /* wait for a while before trying recovery again */ + imp->imp_next_ping = ptlrpc_next_reconnect(imp); + spin_unlock(&imp->imp_lock); + if (!imp->imp_no_pinger_recover || + imp->imp_connect_error == -EAGAIN) + ptlrpc_initiate_recovery(imp); + } else if (level != LUSTRE_IMP_FULL || imp->imp_obd->obd_no_recov || + imp_is_deactive(imp)) { + CDEBUG(D_HA, + "%s->%s: not pinging (in recovery or recovery disabled: %s)\n", + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd), + ptlrpc_import_state_name(level)); + if (force) + imp->imp_force_verify = 1; + spin_unlock(&imp->imp_lock); + } else if ((imp->imp_pingable && !suppress) || force_next || force) { + spin_unlock(&imp->imp_lock); + ptlrpc_ping(imp); + } else { + spin_unlock(&imp->imp_lock); + } +} + +static struct workqueue_struct *pinger_wq; +static void ptlrpc_pinger_main(struct work_struct *ws); +static DECLARE_DELAYED_WORK(ping_work, ptlrpc_pinger_main); + +static void ptlrpc_pinger_main(struct work_struct *ws) +{ + time64_t this_ping, time_after_ping; + timeout_t time_to_next_wake; + struct obd_import *imp; + + do { + this_ping = ktime_get_seconds(); + + mutex_lock(&pinger_mutex); + + list_for_each_entry(imp, &pinger_imports, imp_pinger_chain) { + ptlrpc_pinger_process_import(imp, this_ping); + /* obd_timeout might have changed */ + if (imp->imp_pingable && imp->imp_next_ping && + imp->imp_next_ping > this_ping + PING_INTERVAL) + ptlrpc_update_next_ping(imp, 0); + } + mutex_unlock(&pinger_mutex); + + time_after_ping = ktime_get_seconds(); + /* update memory usage info */ + obd_update_maxusage(); + + if ((ktime_get_seconds() - this_ping - 3) > PING_INTERVAL) + CDEBUG(D_HA, "long time to ping: %lld, %lld, %lld\n", + this_ping, time_after_ping, ktime_get_seconds()); + + /* Wait until the next ping time, or until we're stopped. */ + time_to_next_wake = pinger_check_timeout(this_ping); + /* + * The ping sent by ptlrpc_send_rpc may get sent out + * say .01 second after this. + * ptlrpc_pinger_sending_on_import will then set the + * next ping time to next_ping + .01 sec, which means + * we will SKIP the next ping at next_ping, and the + * ping will get sent 2 timeouts from now! Beware. + */ + CDEBUG(D_INFO, "next wakeup in %d (%lld)\n", + time_to_next_wake, this_ping + PING_INTERVAL); + } while (time_to_next_wake <= 0); + + queue_delayed_work(pinger_wq, &ping_work, + cfs_time_seconds(max(time_to_next_wake, 1))); +} + +int ptlrpc_start_pinger(void) +{ +#ifdef CONFIG_LUSTRE_FS_PINGER + if (pinger_wq) + return -EALREADY; + + pinger_wq = cfs_cpt_bind_workqueue("ptlrpc_pinger", cfs_cpt_tab, + 0, CFS_CPT_ANY, 1); + if (IS_ERR(pinger_wq)) { + CERROR("cannot start pinger workqueue\n"); + return PTR_ERR(pinger_wq); + } + + queue_delayed_work(pinger_wq, &ping_work, 0); + + if (suppress_pings) + CWARN("Pings will be suppressed at the request of the administrator. The configuration shall meet the additional requirements described in the manual. 
(Search for the \"suppress_pings\" kernel module parameter.)\n");
+#endif
+	return 0;
+}
+
+int ptlrpc_stop_pinger(void)
+{
+#ifdef CONFIG_LUSTRE_FS_PINGER
+	if (!pinger_wq)
+		return -EALREADY;
+
+	cancel_delayed_work_sync(&ping_work);
+	destroy_workqueue(pinger_wq);
+	pinger_wq = NULL;
+#endif
+	return 0;
+}
+
+void ptlrpc_pinger_sending_on_import(struct obd_import *imp)
+{
+	ptlrpc_update_next_ping(imp, 0);
+}
+
+void ptlrpc_pinger_commit_expected(struct obd_import *imp)
+{
+	ptlrpc_update_next_ping(imp, 1);
+	assert_spin_locked(&imp->imp_lock);
+	/*
+	 * Avoid reading stale imp_connect_data. When not sure if pings are
+	 * expected or not on next connection, we assume they are not and force
+	 * one anyway to guarantee the chance of updating
+	 * imp_peer_committed_transno.
+	 */
+	if (imp->imp_state != LUSTRE_IMP_FULL ||
+	    OCD_HAS_FLAG(&imp->imp_connect_data, PINGLESS))
+		imp->imp_force_next_verify = 1;
+}
+
+int ptlrpc_pinger_add_import(struct obd_import *imp)
+{
+	ENTRY;
+	if (!list_empty(&imp->imp_pinger_chain))
+		RETURN(-EALREADY);
+
+	mutex_lock(&pinger_mutex);
+	CDEBUG(D_HA, "adding pingable import %s->%s\n",
+	       imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd));
+	/* if we add to pinger we want recovery on this import */
+	imp->imp_obd->obd_no_recov = 0;
+	ptlrpc_update_next_ping(imp, 0);
+	/* XXX sort, blah blah */
+	list_add_tail(&imp->imp_pinger_chain, &pinger_imports);
+	class_import_get(imp);
+
+	ptlrpc_pinger_wake_up();
+	mutex_unlock(&pinger_mutex);
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_pinger_add_import);
+
+int ptlrpc_pinger_del_import(struct obd_import *imp)
+{
+	ENTRY;
+
+	if (list_empty(&imp->imp_pinger_chain))
+		RETURN(-ENOENT);
+
+	mutex_lock(&pinger_mutex);
+	list_del_init(&imp->imp_pinger_chain);
+	CDEBUG(D_HA, "removing pingable import %s->%s\n",
+	       imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd));
+	/* if we remove from pinger we don't want recovery on this import */
+	imp->imp_obd->obd_no_recov = 1;
+	class_import_put(imp);
+	mutex_unlock(&pinger_mutex);
+	RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_pinger_del_import);
+
+void ptlrpc_pinger_wake_up(void)
+{
+#ifdef CONFIG_LUSTRE_FS_PINGER
+	mod_delayed_work(pinger_wq, &ping_work, 0);
+#endif
+}
+
+/* Ping evictor thread */
+#define PET_READY     1
+#define PET_TERMINATE 2
+
+static int pet_refcount;
+static int pet_state;
+static wait_queue_head_t pet_waitq;
+static LIST_HEAD(pet_list);
+static DEFINE_SPINLOCK(pet_lock);
+
+int ping_evictor_wake(struct obd_export *exp)
+{
+	struct obd_device *obd;
+
+	spin_lock(&pet_lock);
+	if (pet_state != PET_READY) {
+		/* eventually the new obd will call here again. */
+		spin_unlock(&pet_lock);
+		return 1;
+	}
+
+	obd = class_exp2obd(exp);
+	if (list_empty(&obd->obd_evict_list)) {
+		class_incref(obd, "evictor", obd);
+		list_add(&obd->obd_evict_list, &pet_list);
+	}
+	spin_unlock(&pet_lock);
+
+	wake_up(&pet_waitq);
+	return 0;
+}
+
+static int ping_evictor_main(void *arg)
+{
+	struct obd_device *obd;
+	struct obd_export *exp;
+	time64_t expire_time;
+
+	ENTRY;
+	unshare_fs_struct();
+	CDEBUG(D_HA, "Starting Ping Evictor\n");
+	pet_state = PET_READY;
+	while (1) {
+		wait_event_idle(pet_waitq,
+				(!list_empty(&pet_list)) ||
+				(pet_state == PET_TERMINATE));
+
+		/* loop until all obds have been removed */
+		if ((pet_state == PET_TERMINATE) && list_empty(&pet_list))
+			break;
+
+		/*
+		 * we only get here if pet_list is not empty, and the obd is
+		 * removed from the list only at the bottom of this loop, so
+		 * taking pet_lock here is not strictly necessary.
+ */ + spin_lock(&pet_lock); + obd = list_entry(pet_list.next, struct obd_device, + obd_evict_list); + spin_unlock(&pet_lock); + + expire_time = ktime_get_real_seconds() - PING_EVICT_TIMEOUT; + + CDEBUG(D_HA, "evicting all exports of obd %s older than %lld\n", + obd->obd_name, expire_time); + + /* + * Exports can't be deleted out of the list while we hold + * the obd lock (class_unlink_export), which means we can't + * lose the last ref on the export. If they've already been + * removed from the list, we won't find them here. + */ + spin_lock(&obd->obd_dev_lock); + while (!list_empty(&obd->obd_exports_timed)) { + exp = list_entry(obd->obd_exports_timed.next, + struct obd_export, + exp_obd_chain_timed); + if (expire_time > exp->exp_last_request_time) { + struct obd_uuid *client_uuid; + + class_export_get(exp); + client_uuid = &exp->exp_client_uuid; + spin_unlock(&obd->obd_dev_lock); + LCONSOLE_WARN("%s: haven't heard from client %s (at %s) in %lld seconds. I think it's dead, and I am evicting it. exp %p, cur %lld expire %lld last %lld\n", + obd->obd_name, + obd_uuid2str(client_uuid), + obd_export_nid2str(exp), + ktime_get_real_seconds() - + exp->exp_last_request_time, + exp, ktime_get_real_seconds(), + expire_time, + exp->exp_last_request_time); + CDEBUG(D_HA, "Last request was at %lld\n", + exp->exp_last_request_time); + class_fail_export(exp); + class_export_put(exp); + spin_lock(&obd->obd_dev_lock); + } else { + /* List is sorted, so everyone below is ok */ + break; + } + } + spin_unlock(&obd->obd_dev_lock); + + spin_lock(&pet_lock); + list_del_init(&obd->obd_evict_list); + spin_unlock(&pet_lock); + + class_decref(obd, "evictor", obd); + } + CDEBUG(D_HA, "Exiting Ping Evictor\n"); + + RETURN(0); +} + +void ping_evictor_start(void) +{ + struct task_struct *task; + + if (++pet_refcount > 1) + return; + + init_waitqueue_head(&pet_waitq); + + task = kthread_run(ping_evictor_main, NULL, "ll_evictor"); + if (IS_ERR(task)) { + pet_refcount--; + CERROR("Cannot start ping evictor thread: %ld\n", + PTR_ERR(task)); + } +} +EXPORT_SYMBOL(ping_evictor_start); + +void ping_evictor_stop(void) +{ + if (--pet_refcount > 0) + return; + + pet_state = PET_TERMINATE; + wake_up(&pet_waitq); +} +EXPORT_SYMBOL(ping_evictor_stop); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_internal.h b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_internal.h new file mode 100644 index 0000000000000..399ff28fa5ddb --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_internal.h @@ -0,0 +1,441 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ */
+
+/* Intramodule declarations for ptlrpc. */
+
+#ifndef PTLRPC_INTERNAL_H
+#define PTLRPC_INTERNAL_H
+
+#include "../ldlm/ldlm_internal.h"
+#include "heap.h"
+
+struct ldlm_namespace;
+struct obd_import;
+struct ldlm_res_id;
+struct ptlrpc_request_set;
+extern int test_req_buffer_pressure;
+extern struct list_head ptlrpc_all_services;
+extern struct mutex ptlrpc_all_services_mutex;
+extern struct ptlrpc_nrs_pol_conf nrs_conf_fifo;
+extern struct ptlrpc_nrs_pol_conf nrs_conf_delay;
+
+#ifdef HAVE_SERVER_SUPPORT
+extern struct ptlrpc_nrs_pol_conf nrs_conf_crrn;
+extern struct ptlrpc_nrs_pol_conf nrs_conf_orr;
+extern struct ptlrpc_nrs_pol_conf nrs_conf_trr;
+extern struct ptlrpc_nrs_pol_conf nrs_conf_tbf;
+#endif /* HAVE_SERVER_SUPPORT */
+
+/**
+ * \addtogroup nrs
+ * @{
+ */
+extern struct nrs_core nrs_core;
+
+extern struct mutex ptlrpcd_mutex;
+extern struct mutex pinger_mutex;
+
+extern lnet_handler_t ptlrpc_handler;
+extern struct percpu_ref ptlrpc_pending;
+
+/* ptlrpcd.c */
+int ptlrpcd_start(struct ptlrpcd_ctl *pc);
+
+/* client.c */
+void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req,
+			       timeout_t service_timeout);
+struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned npages, unsigned max_brw,
+					 enum ptlrpc_bulk_op_type type,
+					 unsigned portal,
+					 const struct ptlrpc_bulk_frag_ops *ops);
+int ptlrpc_request_cache_init(void);
+void ptlrpc_request_cache_fini(void);
+struct ptlrpc_request *ptlrpc_request_cache_alloc(gfp_t flags);
+void ptlrpc_request_cache_free(struct ptlrpc_request *req);
+void ptlrpc_init_xid(void);
+void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc,
+			    struct ptlrpc_request *req);
+void ptlrpc_expired_set(struct ptlrpc_request_set *set);
+time64_t ptlrpc_set_next_timeout(struct ptlrpc_request_set *);
+void ptlrpc_resend_req(struct ptlrpc_request *request);
+void ptlrpc_set_mbits(struct ptlrpc_request *req);
+void ptlrpc_assign_next_xid_nolock(struct ptlrpc_request *req);
+__u64 ptlrpc_known_replied_xid(struct obd_import *imp);
+void ptlrpc_add_unreplied(struct ptlrpc_request *req);
+
+/* events.c */
+int ptlrpc_init_portals(void);
+void ptlrpc_exit_portals(void);
+
+void ptlrpc_request_handle_notconn(struct ptlrpc_request *);
+void lustre_assert_wire_constants(void);
+int ptlrpc_import_in_recovery(struct obd_import *imp);
+int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt,
+			     bool invalid);
+void ptlrpc_handle_failed_import(struct obd_import *imp);
+int ptlrpc_replay_next(struct obd_import *imp, int *inflight);
+void ptlrpc_initiate_recovery(struct obd_import *imp);
+
+int lustre_unpack_req_ptlrpc_body(struct ptlrpc_request *req, int offset);
+int lustre_unpack_rep_ptlrpc_body(struct ptlrpc_request *req, int offset);
+
+int ptlrpc_sysfs_register_service(struct kset *parent,
+				  struct ptlrpc_service *svc);
+void ptlrpc_sysfs_unregister_service(struct ptlrpc_service *svc);
+
+void ptlrpc_ldebugfs_register_service(struct dentry *debugfs_entry,
+				      struct ptlrpc_service *svc);
+#ifdef CONFIG_PROC_FS
+void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc);
+void ptlrpc_lprocfs_rpc_sent(struct ptlrpc_request *req, long amount);
+void ptlrpc_lprocfs_do_request_stat(struct ptlrpc_request *req,
+				    long q_usec, long work_usec);
+#else
+#define ptlrpc_lprocfs_unregister_service(params...) do{}while(0)
+#define ptlrpc_lprocfs_rpc_sent(params...) do{}while(0)
+#define ptlrpc_lprocfs_do_request_stat(params...)
do{}while(0) +#endif /* CONFIG_PROC_FS */ + +/* NRS */ + +/** + * NRS core object. + * + * Holds NRS core fields. + */ +struct nrs_core { + /** + * Protects nrs_core::nrs_policies, serializes external policy + * registration/unregistration, and NRS core lprocfs operations. + */ + struct mutex nrs_mutex; + /** + * List of all policy descriptors registered with NRS core; protected + * by nrs_core::nrs_mutex. + */ + struct list_head nrs_policies; +}; + +int ptlrpc_service_nrs_setup(struct ptlrpc_service *svc); +void ptlrpc_service_nrs_cleanup(struct ptlrpc_service *svc); + +void ptlrpc_nrs_req_initialize(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req, bool hp); +void ptlrpc_nrs_req_finalize(struct ptlrpc_request *req); +void ptlrpc_nrs_req_stop_nolock(struct ptlrpc_request *req); +void ptlrpc_nrs_req_add(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req, bool hp); + +struct ptlrpc_request * +ptlrpc_nrs_req_get_nolock0(struct ptlrpc_service_part *svcpt, bool hp, + bool peek, bool force); + +static inline struct ptlrpc_request * +ptlrpc_nrs_req_get_nolock(struct ptlrpc_service_part *svcpt, bool hp, + bool force) +{ + return ptlrpc_nrs_req_get_nolock0(svcpt, hp, false, force); +} + +static inline struct ptlrpc_request * +ptlrpc_nrs_req_peek_nolock(struct ptlrpc_service_part *svcpt, bool hp) +{ + return ptlrpc_nrs_req_get_nolock0(svcpt, hp, true, false); +} + +void ptlrpc_nrs_req_del_nolock(struct ptlrpc_request *req); +bool ptlrpc_nrs_req_pending_nolock(struct ptlrpc_service_part *svcpt, bool hp); +bool ptlrpc_nrs_req_throttling_nolock(struct ptlrpc_service_part *svcpt, + bool hp); + +int ptlrpc_nrs_policy_control(const struct ptlrpc_service *svc, + enum ptlrpc_nrs_queue_type queue, char *name, + enum ptlrpc_nrs_ctl opc, bool single, void *arg); + +int ptlrpc_nrs_init(void); +void ptlrpc_nrs_fini(void); + +static inline bool nrs_svcpt_has_hp(const struct ptlrpc_service_part *svcpt) +{ + return svcpt->scp_nrs_hp != NULL; +} + +static inline bool nrs_svc_has_hp(const struct ptlrpc_service *svc) +{ + /** + * If the first service partition has an HP NRS head, all service + * partitions will. + */ + return nrs_svcpt_has_hp(svc->srv_parts[0]); +} + +static inline +struct ptlrpc_nrs *nrs_svcpt2nrs(struct ptlrpc_service_part *svcpt, bool hp) +{ + LASSERT(ergo(hp, nrs_svcpt_has_hp(svcpt))); + return hp ? svcpt->scp_nrs_hp : &svcpt->scp_nrs_reg; +} + +static inline int nrs_pol2cptid(const struct ptlrpc_nrs_policy *policy) +{ + return policy->pol_nrs->nrs_svcpt->scp_cpt; +} + +static inline +struct ptlrpc_service *nrs_pol2svc(struct ptlrpc_nrs_policy *policy) +{ + return policy->pol_nrs->nrs_svcpt->scp_service; +} + +static inline +struct ptlrpc_service_part *nrs_pol2svcpt(struct ptlrpc_nrs_policy *policy) +{ + return policy->pol_nrs->nrs_svcpt; +} + +static inline +struct cfs_cpt_table *nrs_pol2cptab(struct ptlrpc_nrs_policy *policy) +{ + return nrs_pol2svc(policy)->srv_cptable; +} + +static inline struct ptlrpc_nrs_resource * +nrs_request_resource(struct ptlrpc_nrs_request *nrq) +{ + LASSERT(nrq->nr_initialized); + LASSERT(!nrq->nr_finalized); + + return nrq->nr_res_ptrs[nrq->nr_res_idx]; +} + +static inline +struct ptlrpc_nrs_policy *nrs_request_policy(struct ptlrpc_nrs_request *nrq) +{ + return nrs_request_resource(nrq)->res_policy; +} + +#define NRS_LPROCFS_QUANTUM_NAME_REG "reg_quantum:" +#define NRS_LPROCFS_QUANTUM_NAME_HP "hp_quantum:" + +/** + * the maximum size of nrs_crrn_client::cc_quantum and nrs_orr_data::od_quantum. 
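+ */
+
+/*
+ * Illustrative check, not from the Lustre tree: the
+ * LPROCFS_NRS_WR_QUANTUM_MAX_CMD macro defined just below sizes the
+ * worst-case command "reg_quantum:65535 hp_quantum:65535" with sizeof on
+ * a concatenated string literal, which counts the trailing NUL.  A C11
+ * static_assert restating the arithmetic:
+ */
+#if 0
+#include <assert.h>
+
+#define REG	"reg_quantum:"	/* 12 characters */
+#define HP	"hp_quantum:"	/* 11 characters */
+#define MAXQ	"65535"		/* 5 characters, __stringify(65535) */
+
+static_assert(sizeof(REG MAXQ " " HP MAXQ) == 12 + 5 + 1 + 11 + 5 + 1,
+	      "worst case is 34 characters plus the terminating NUL");
+#endif
+
+/*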
+ */ +#define LPROCFS_NRS_QUANTUM_MAX 65535 + +/** + * Max valid command string is the size of the labels, plus "65535" twice, plus + * a separating space character. + */ +#define LPROCFS_NRS_WR_QUANTUM_MAX_CMD \ + sizeof(NRS_LPROCFS_QUANTUM_NAME_REG __stringify(LPROCFS_NRS_QUANTUM_MAX) " " \ + NRS_LPROCFS_QUANTUM_NAME_HP __stringify(LPROCFS_NRS_QUANTUM_MAX)) + +/* recovd_thread.c */ + +int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink); + +/* pers.c */ +void ptlrpc_fill_bulk_md(struct lnet_md *md, struct ptlrpc_bulk_desc *desc, + int mdcnt); + +/* pack_generic.c */ +struct ptlrpc_reply_state * +lustre_get_emerg_rs(struct ptlrpc_service_part *svcpt); +void lustre_put_emerg_rs(struct ptlrpc_reply_state *rs); +void lustre_msg_early_size_init(void); /* just for init */ + +/* pinger.c */ +int ptlrpc_start_pinger(void); +int ptlrpc_stop_pinger(void); +void ptlrpc_pinger_sending_on_import(struct obd_import *imp); +void ptlrpc_pinger_commit_expected(struct obd_import *imp); +void ptlrpc_pinger_wake_up(void); +void ptlrpc_ping_import_soon(struct obd_import *imp); +int ping_evictor_wake(struct obd_export *exp); + +/* sec_null.c */ +int sptlrpc_null_init(void); +void sptlrpc_null_fini(void); + +/* sec_plain.c */ +int sptlrpc_plain_init(void); +void sptlrpc_plain_fini(void); + +/* sec_bulk.c */ +int sptlrpc_enc_pool_init(void); +void sptlrpc_enc_pool_fini(void); +int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v); + +/* sec_lproc.c */ +int sptlrpc_lproc_init(void); +void sptlrpc_lproc_fini(void); + +/* sec_gc.c */ +int sptlrpc_gc_init(void); +void sptlrpc_gc_fini(void); + +/* sec_config.c */ +void sptlrpc_conf_choose_flavor(enum lustre_sec_part from, + enum lustre_sec_part to, + struct obd_uuid *target, + struct lnet_nid *nid, + struct sptlrpc_flavor *sf); +int sptlrpc_conf_init(void); +void sptlrpc_conf_fini(void); + +/* sec.c */ +int sptlrpc_init(void); +void sptlrpc_fini(void); + +/* layout.c */ +__u32 __req_capsule_offset(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc); + +static inline bool ptlrpc_recoverable_error(int rc) +{ + return (rc == -ENOTCONN || rc == -ENODEV); +} + +#ifdef HAVE_SERVER_SUPPORT +int tgt_mod_init(void); +void tgt_mod_exit(void); +int nodemap_mod_init(void); +void nodemap_mod_exit(void); +#else /* HAVE_SERVER_SUPPORT */ +static inline int tgt_mod_init(void) +{ + return 0; +} + +static inline void tgt_mod_exit(void) +{ + return; +} + +static inline int nodemap_mod_init(void) +{ + return 0; +} + +static inline void nodemap_mod_exit(void) +{ + return; +} +#endif /* !HAVE_SERVER_SUPPORT */ + +static inline void ptlrpc_reqset_put(struct ptlrpc_request_set *set) +{ + if (atomic_dec_and_test(&set->set_refcount)) + OBD_FREE_PTR(set); +} + +/** initialise ptlrpc common fields */ +static inline void ptlrpc_req_comm_init(struct ptlrpc_request *req) +{ + spin_lock_init(&req->rq_lock); + spin_lock_init(&req->rq_early_free_lock); + atomic_set(&req->rq_refcount, 1); + INIT_LIST_HEAD(&req->rq_list); + INIT_LIST_HEAD(&req->rq_replay_list); +} + +/** initialise client side ptlrpc request */ +static inline void ptlrpc_cli_req_init(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_req *cr = &req->rq_cli; + + ptlrpc_req_comm_init(req); + + req->rq_receiving_reply = 0; + req->rq_req_unlinked = req->rq_reply_unlinked = 1; + req->rq_replied = 0; + + INIT_LIST_HEAD(&cr->cr_set_chain); + INIT_LIST_HEAD(&cr->cr_ctx_chain); + INIT_LIST_HEAD(&cr->cr_unreplied_list); + init_waitqueue_head(&cr->cr_reply_waitq); + 
init_waitqueue_head(&cr->cr_set_waitq); +} + +/** initialise server side ptlrpc request */ +static inline void ptlrpc_srv_req_init(struct ptlrpc_request *req) +{ + struct ptlrpc_srv_req *sr = &req->rq_srv; + + ptlrpc_req_comm_init(req); + req->rq_srv_req = 1; + INIT_LIST_HEAD(&sr->sr_exp_list); + INIT_LIST_HEAD(&sr->sr_timed_list); + INIT_LIST_HEAD(&sr->sr_hist_list); +} + +static inline bool ptlrpc_req_is_connect(struct ptlrpc_request *req) +{ + if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_CONNECT || + lustre_msg_get_opc(req->rq_reqmsg) == OST_CONNECT || + lustre_msg_get_opc(req->rq_reqmsg) == MGS_CONNECT) + return true; + else + return false; +} + +static inline bool ptlrpc_req_is_disconnect(struct ptlrpc_request *req) +{ + if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_DISCONNECT || + lustre_msg_get_opc(req->rq_reqmsg) == OST_DISCONNECT || + lustre_msg_get_opc(req->rq_reqmsg) == MGS_DISCONNECT) + return true; + else + return false; +} + +static inline void do_pack_body(struct ptlrpc_request *req) +{ + struct mdt_body *b = req_capsule_client_get(&req->rq_pill, + &RMF_MDT_BODY); + + if (b == NULL) + return; + + b->mbo_valid = 0; + b->mbo_eadatasize = 0; + b->mbo_flags = 0; + b->mbo_suppgid = -1; + b->mbo_uid = from_kuid(&init_user_ns, current_uid()); + b->mbo_gid = from_kgid(&init_user_ns, current_gid()); + b->mbo_fsuid = from_kuid(&init_user_ns, current_fsuid()); + b->mbo_fsgid = from_kgid(&init_user_ns, current_fsgid()); + b->mbo_capability = current_cap().cap[0]; +} + +#endif /* PTLRPC_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_module.c b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_module.c new file mode 100644 index 0000000000000..abde1f23080d8 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_module.c @@ -0,0 +1,147 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_RPC + + +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +static __init int ptlrpc_init(void) +{ + int rc; + + ENTRY; + + lustre_assert_wire_constants(); +#if RS_DEBUG + spin_lock_init(&ptlrpc_rs_debug_lock); +#endif + mutex_init(&ptlrpc_all_services_mutex); + mutex_init(&pinger_mutex); + mutex_init(&ptlrpcd_mutex); + ptlrpc_init_xid(); + lustre_msg_early_size_init(); + + rc = req_layout_init(); + if (rc) + RETURN(rc); + + rc = tgt_mod_init(); + if (rc) + GOTO(err_layout, rc); + + rc = ptlrpc_hr_init(); + if (rc) + GOTO(err_tgt, rc); + + rc = ptlrpc_request_cache_init(); + if (rc) + GOTO(err_hr, rc); + + rc = ptlrpc_init_portals(); + if (rc) + GOTO(err_cache, rc); + + rc = ptlrpc_connection_init(); + if (rc) + GOTO(err_portals, rc); + + rc = ptlrpc_start_pinger(); + if (rc) + GOTO(err_conn, rc); + + rc = ldlm_init(); + if (rc) + GOTO(err_pinger, rc); + + rc = sptlrpc_init(); + if (rc) + GOTO(err_ldlm, rc); + + rc = ptlrpc_nrs_init(); + if (rc) + GOTO(err_sptlrpc, rc); + + rc = nodemap_mod_init(); + if (rc) + GOTO(err_nrs, rc); + + RETURN(0); +err_nrs: + ptlrpc_nrs_fini(); +err_sptlrpc: + sptlrpc_fini(); +err_ldlm: + ldlm_exit(); +err_pinger: + ptlrpc_stop_pinger(); +err_conn: + ptlrpc_connection_fini(); +err_portals: + ptlrpc_exit_portals(); +err_cache: + ptlrpc_request_cache_fini(); +err_hr: + ptlrpc_hr_fini(); +err_tgt: + tgt_mod_exit(); +err_layout: + req_layout_fini(); + return rc; +} + +static void __exit ptlrpc_exit(void) +{ + nodemap_mod_exit(); + ptlrpc_nrs_fini(); + sptlrpc_fini(); + ldlm_exit(); + ptlrpc_stop_pinger(); + ptlrpc_exit_portals(); + ptlrpc_request_cache_fini(); + ptlrpc_hr_fini(); + ptlrpc_connection_fini(); + tgt_mod_exit(); + req_layout_fini(); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre Request Processor and Lock Management"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(ptlrpc_init); +module_exit(ptlrpc_exit); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpcd.c b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpcd.c new file mode 100644 index 0000000000000..9d29cc7a6e953 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpcd.c @@ -0,0 +1,993 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
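+ */
+
+/*
+ * Illustrative sketch, not from the Lustre tree: ptlrpc_init() above uses
+ * the classic error-unwind ladder -- every setup step that fails jumps to
+ * a label that tears down all earlier steps in reverse order, so there is
+ * exactly one cleanup sequence to keep correct.  a_init() and friends are
+ * hypothetical.
+ */
+#if 0
+extern int a_init(void), b_init(void), c_init(void);
+extern void a_fini(void), b_fini(void);
+
+static int example_init(void)
+{
+	int rc;
+
+	rc = a_init();
+	if (rc)
+		return rc;
+	rc = b_init();
+	if (rc)
+		goto err_a;
+	rc = c_init();
+	if (rc)
+		goto err_b;
+	return 0;
+
+err_b:
+	b_fini();	/* undo in the reverse order of setup */
+err_a:
+	a_fini();
+	return rc;
+}
+#endif
+
+/*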
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ptlrpc/ptlrpcd.c + */ + +/** \defgroup ptlrpcd PortalRPC daemon + * + * ptlrpcd is a special thread with its own set where other user might add + * requests when they don't want to wait for their completion. + * PtlRPCD will take care of sending such requests and then processing their + * replies and calling completion callbacks as necessary. + * The callbacks are called directly from ptlrpcd context. + * It is important to never significantly block (esp. on RPCs!) within such + * completion handler or a deadlock might occur where ptlrpcd enters some + * callback that attempts to send another RPC and wait for it to return, + * during which time ptlrpcd is completely blocked, so e.g. if import + * fails, recovery cannot progress because connection requests are also + * sent by ptlrpcd. + * + * @{ + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include +#include +#include +#include +#include +#include /* for obd_zombie */ +#include /* for OBD_FAIL_CHECK */ +#include /* cl_env_{get,put}() */ +#include + +#include "ptlrpc_internal.h" + +/* One of these per CPT. */ +struct ptlrpcd { + int pd_size; + int pd_index; + int pd_cpt; + int pd_cursor; + int pd_nthreads; + int pd_groupsize; + struct ptlrpcd_ctl pd_threads[0]; +}; + +/* + * max_ptlrpcds is obsolete, but retained to ensure that the kernel + * module will load on a system where it has been tuned. + * A value other than 0 implies it was tuned, in which case the value + * is used to derive a setting for ptlrpcd_per_cpt_max. + */ +static int max_ptlrpcds; +module_param(max_ptlrpcds, int, 0644); +MODULE_PARM_DESC(max_ptlrpcds, + "Max ptlrpcd thread count to be started (obsolete)."); + +/* + * ptlrpcd_bind_policy is obsolete, but retained to ensure that + * the kernel module will load on a system where it has been tuned. + * A value other than 0 implies it was tuned, in which case the value + * is used to derive a setting for ptlrpcd_partner_group_size. + */ +static int ptlrpcd_bind_policy; +module_param(ptlrpcd_bind_policy, int, 0644); +MODULE_PARM_DESC(ptlrpcd_bind_policy, + "Ptlrpcd threads binding mode (obsolete)."); + +/* + * ptlrpcd_per_cpt_max: The maximum number of ptlrpcd threads to run + * in a CPT. + */ +static int ptlrpcd_per_cpt_max; +module_param(ptlrpcd_per_cpt_max, int, 0644); +MODULE_PARM_DESC(ptlrpcd_per_cpt_max, + "Max ptlrpcd thread count to be started per CPT."); + +/* + * ptlrpcd_partner_group_size: The desired number of threads in each + * ptlrpcd partner thread group. Default is 2, corresponding to the + * old PDB_POLICY_PAIR. A negative value makes all ptlrpcd threads in + * a CPT partners of each other. + */ +static int ptlrpcd_partner_group_size; +module_param(ptlrpcd_partner_group_size, int, 0644); +MODULE_PARM_DESC(ptlrpcd_partner_group_size, + "Number of ptlrpcd threads in a partner group."); + +/* + * ptlrpcd_cpts: A CPT string describing the CPU partitions that + * ptlrpcd threads should run on. Used to make ptlrpcd threads run on + * a subset of all CPTs. + * + * ptlrpcd_cpts=2 + * ptlrpcd_cpts=[2] + * run ptlrpcd threads only on CPT 2. + * + * ptlrpcd_cpts=0-3 + * ptlrpcd_cpts=[0-3] + * run ptlrpcd threads on CPTs 0, 1, 2, and 3. + * + * ptlrpcd_cpts=[0-3,5,7] + * run ptlrpcd threads on CPTS 0, 1, 2, 3, 5, and 7. 
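+ */
+
+/*
+ * Illustrative sketch, not from the Lustre tree: the module feeds strings
+ * such as "[0-3,5,7]" through cfs_expr_list_parse().  This stand-alone
+ * parser only mimics the subset of the syntax shown above; parse_cpts()
+ * and its limits are hypothetical.
+ */
+#if 0
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* Mark in cpts[] every CPT named by expr, e.g. "[0-3,5,7]" or "2". */
+static int parse_cpts(const char *expr, int *cpts, int ncpts)
+{
+	char buf[64], *tok, *save = NULL;
+
+	snprintf(buf, sizeof(buf), "%s", expr);
+	tok = buf;
+	if (*tok == '[')			/* brackets are optional */
+		tok++;
+	tok[strcspn(tok, "]")] = '\0';
+
+	for (tok = strtok_r(tok, ",", &save); tok;
+	     tok = strtok_r(NULL, ",", &save)) {
+		int lo, hi, n;
+
+		n = sscanf(tok, "%d-%d", &lo, &hi);
+		if (n < 1)
+			return -1;
+		if (n == 1)			/* single CPT, not a range */
+			hi = lo;
+		if (lo < 0 || hi >= ncpts || lo > hi)
+			return -1;
+		while (lo <= hi)
+			cpts[lo++] = 1;
+	}
+	return 0;
+}
+
+int main(void)
+{
+	int cpts[8] = { 0 };
+
+	if (parse_cpts("[0-3,5,7]", cpts, 8) == 0)
+		for (int i = 0; i < 8; i++)
+			printf("cpt %d: %s\n", i, cpts[i] ? "on" : "off");
+	return 0;
+}
+#endif
+
+/*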
+ */ +static char *ptlrpcd_cpts; +module_param(ptlrpcd_cpts, charp, 0644); +MODULE_PARM_DESC(ptlrpcd_cpts, + "CPU partitions ptlrpcd threads should run in"); + +/* ptlrpcds_cpt_idx maps cpt numbers to an index in the ptlrpcds array. */ +static int *ptlrpcds_cpt_idx; + +/* ptlrpcds_num is the number of entries in the ptlrpcds array. */ +static int ptlrpcds_num; +static struct ptlrpcd **ptlrpcds; + +/* + * In addition to the regular thread pool above, there is a single + * global recovery thread. Recovery isn't critical for performance, + * and doesn't block, but must always be able to proceed, and it is + * possible that all normal ptlrpcd threads are blocked. Hence the + * need for a dedicated thread. + */ +static struct ptlrpcd_ctl ptlrpcd_rcv; + +struct mutex ptlrpcd_mutex; +static int ptlrpcd_users = 0; + +void ptlrpcd_wake(struct ptlrpc_request *req) +{ + struct ptlrpc_request_set *set = req->rq_set; + + LASSERT(set != NULL); + wake_up(&set->set_waitq); +} +EXPORT_SYMBOL(ptlrpcd_wake); + +static struct ptlrpcd_ctl * +ptlrpcd_select_pc(struct ptlrpc_request *req) +{ + struct ptlrpcd *pd; + int cpt; + int idx; + + if (req != NULL && req->rq_send_state != LUSTRE_IMP_FULL) + return &ptlrpcd_rcv; + + cpt = cfs_cpt_current(cfs_cpt_tab, 1); + if (ptlrpcds_cpt_idx == NULL) + idx = cpt; + else + idx = ptlrpcds_cpt_idx[cpt]; + pd = ptlrpcds[idx]; + + /* We do not care whether it is strict load balance. */ + idx = pd->pd_cursor; + if (++idx == pd->pd_nthreads) + idx = 0; + pd->pd_cursor = idx; + + return &pd->pd_threads[idx]; +} + +/** + * Move all request from an existing request set to the ptlrpcd queue. + * All requests from the set must be in phase RQ_PHASE_NEW. + */ +void ptlrpcd_add_rqset(struct ptlrpc_request_set *set) +{ + struct list_head *tmp, *pos; + struct ptlrpcd_ctl *pc; + struct ptlrpc_request_set *new; + int count, i; + + pc = ptlrpcd_select_pc(NULL); + new = pc->pc_set; + + list_for_each_safe(pos, tmp, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(pos, struct ptlrpc_request, + rq_set_chain); + + LASSERT(req->rq_phase == RQ_PHASE_NEW); + req->rq_set = new; + req->rq_queued_time = ktime_get_seconds(); + } + + spin_lock(&new->set_new_req_lock); + list_splice_init(&set->set_requests, &new->set_new_requests); + i = atomic_read(&set->set_remaining); + count = atomic_add_return(i, &new->set_new_count); + atomic_set(&set->set_remaining, 0); + spin_unlock(&new->set_new_req_lock); + if (count == i) { + wake_up(&new->set_waitq); + + /* + * XXX: It maybe unnecessary to wakeup all the partners. But to + * guarantee the async RPC can be processed ASAP, we have + * no other better choice. It maybe fixed in future. + */ + for (i = 0; i < pc->pc_npartners; i++) + wake_up(&pc->pc_partners[i]->pc_set->set_waitq); + } +} + +/** + * Return transferred RPCs count. + */ +static int ptlrpcd_steal_rqset(struct ptlrpc_request_set *des, + struct ptlrpc_request_set *src) +{ + struct ptlrpc_request *req; + int rc = 0; + + spin_lock(&src->set_new_req_lock); + if (likely(!list_empty(&src->set_new_requests))) { + list_for_each_entry(req, &src->set_new_requests, rq_set_chain) + req->rq_set = des; + + list_splice_init(&src->set_new_requests, + &des->set_requests); + rc = atomic_read(&src->set_new_count); + atomic_add(rc, &des->set_remaining); + atomic_set(&src->set_new_count, 0); + } + spin_unlock(&src->set_new_req_lock); + return rc; +} + +/** + * Requests that are added to the ptlrpcd queue are sent via + * ptlrpcd_check->ptlrpc_check_set(). 
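+ */
+
+/*
+ * Illustrative sketch, not from the Lustre tree: ptlrpcd_select_pc()
+ * above advances pd_cursor without a lock because any thread in the pool
+ * is an acceptable answer -- a lost update only skews fairness, never
+ * correctness.  struct pool and pick_next() are hypothetical.
+ */
+#if 0
+struct worker { int id; };
+
+struct pool {
+	int cursor;
+	int nthreads;
+	struct worker *threads;
+};
+
+static struct worker *pick_next(struct pool *pd)
+{
+	int idx = pd->cursor;
+
+	if (++idx == pd->nthreads)	/* wrap around */
+		idx = 0;
+	pd->cursor = idx;	/* unlocked: strict round robin not required */
+	return &pd->threads[idx];
+}
+#endif
+
+/*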
+ */
+void ptlrpcd_add_req(struct ptlrpc_request *req)
+{
+	struct ptlrpcd_ctl *pc;
+
+	if (req->rq_reqmsg)
+		lustre_msg_set_jobid(req->rq_reqmsg, NULL);
+
+	spin_lock(&req->rq_lock);
+	if (req->rq_invalid_rqset) {
+		req->rq_invalid_rqset = 0;
+		spin_unlock(&req->rq_lock);
+		if (wait_event_idle_timeout(req->rq_set_waitq,
+					    req->rq_set == NULL,
+					    cfs_time_seconds(5)) == 0)
+			l_wait_event_abortable(req->rq_set_waitq,
+					       req->rq_set == NULL);
+	} else if (req->rq_set) {
+		/*
+		 * If we have a valid "rq_set", just reuse it to avoid the
+		 * request being linked twice.
+		 */
+		LASSERT(req->rq_phase == RQ_PHASE_NEW);
+		LASSERT(req->rq_send_state == LUSTRE_IMP_REPLAY);
+
+		/* ptlrpc_check_set will decrease the count */
+		atomic_inc(&req->rq_set->set_remaining);
+		spin_unlock(&req->rq_lock);
+		wake_up(&req->rq_set->set_waitq);
+		return;
+	} else {
+		spin_unlock(&req->rq_lock);
+	}
+
+	pc = ptlrpcd_select_pc(req);
+
+	DEBUG_REQ(D_INFO, req, "add req [%p] to pc [%s+%d]",
+		  req, pc->pc_name, pc->pc_index);
+
+	ptlrpc_set_add_new_req(pc, req);
+}
+EXPORT_SYMBOL(ptlrpcd_add_req);
+
+static inline void ptlrpc_reqset_get(struct ptlrpc_request_set *set)
+{
+	atomic_inc(&set->set_refcount);
+}
+
+/**
+ * Check if there is more work to do on ptlrpcd set.
+ * Returns 1 if yes.
+ */
+static int ptlrpcd_check(struct lu_env *env, struct ptlrpcd_ctl *pc)
+{
+	struct ptlrpc_request *req, *tmp;
+	struct ptlrpc_request_set *set = pc->pc_set;
+	int rc = 0;
+	int rc2;
+
+	ENTRY;
+
+	if (atomic_read(&set->set_new_count)) {
+		spin_lock(&set->set_new_req_lock);
+		if (likely(!list_empty(&set->set_new_requests))) {
+			list_splice_init(&set->set_new_requests,
+					 &set->set_requests);
+			atomic_add(atomic_read(&set->set_new_count),
+				   &set->set_remaining);
+			atomic_set(&set->set_new_count, 0);
+			/*
+			 * Need to calculate its timeout.
+			 */
+			rc = 1;
+		}
+		spin_unlock(&set->set_new_req_lock);
+	}
+
+	/*
+	 * We should call lu_env_refill() before handling new requests to make
+	 * sure that the env keys the requests depend on really exist.
+	 */
+	rc2 = lu_env_refill(env);
+	if (rc2 != 0) {
+		/*
+		 * XXX This is a very awkward situation, because
+		 * execution can neither continue (request
+		 * interpreters assume that env is set up), nor repeat
+		 * the loop (as this potentially results in a tight
+		 * loop of -ENOMEM's).
+		 *
+		 * Fortunately, refill only ever does something when
+		 * new modules are loaded, i.e., early during boot up.
+		 */
+		CERROR("Failure to refill session: %d\n", rc2);
+		RETURN(rc);
+	}
+
+	if (atomic_read(&set->set_remaining))
+		rc |= ptlrpc_check_set(env, set);
+
+	/*
+	 * NB: ptlrpc_check_set has already moved completed requests to the
+	 * head of set::set_requests
+	 */
+	list_for_each_entry_safe(req, tmp, &set->set_requests, rq_set_chain) {
+		if (req->rq_phase != RQ_PHASE_COMPLETE)
+			break;
+
+		list_del_init(&req->rq_set_chain);
+		req->rq_set = NULL;
+		ptlrpc_req_finished(req);
+	}
+
+	if (rc == 0) {
+		/*
+		 * If new requests have been added, make sure to wake up.
+		 */
+		rc = atomic_read(&set->set_new_count);
+
+		/*
+		 * If we have nothing to do, check whether we can take some
+		 * work from our partner threads.
+ */ + if (rc == 0 && pc->pc_npartners > 0) { + struct ptlrpcd_ctl *partner; + struct ptlrpc_request_set *ps; + int first = pc->pc_cursor; + + do { + partner = pc->pc_partners[pc->pc_cursor++]; + if (pc->pc_cursor >= pc->pc_npartners) + pc->pc_cursor = 0; + if (partner == NULL) + continue; + + spin_lock(&partner->pc_lock); + ps = partner->pc_set; + if (ps == NULL) { + spin_unlock(&partner->pc_lock); + continue; + } + + ptlrpc_reqset_get(ps); + spin_unlock(&partner->pc_lock); + + if (atomic_read(&ps->set_new_count)) { + rc = ptlrpcd_steal_rqset(set, ps); + if (rc > 0) + CDEBUG(D_RPCTRACE, + "transfer %d async RPCs [%d->%d]\n", + rc, partner->pc_index, + pc->pc_index); + } + ptlrpc_reqset_put(ps); + } while (rc == 0 && pc->pc_cursor != first); + } + } + + RETURN(rc || test_bit(LIOD_STOP, &pc->pc_flags)); +} + +/** + * Main ptlrpcd thread. + * ptlrpc's code paths like to execute in process context, so we have this + * thread which spins on a set which contains the rpcs and sends them. + */ +static int ptlrpcd(void *arg) +{ + struct ptlrpcd_ctl *pc = arg; + struct ptlrpc_request_set *set; + struct lu_context ses = { 0 }; + struct lu_env env = { .le_ses = &ses }; + int rc = 0; + int exit = 0; + + ENTRY; + unshare_fs_struct(); + if (cfs_cpt_bind(cfs_cpt_tab, pc->pc_cpt) != 0) + CWARN("Failed to bind %s on CPT %d\n", pc->pc_name, pc->pc_cpt); + + /* + * Allocate the request set after the thread has been bound + * above. This is safe because no requests will be queued + * until all ptlrpcd threads have confirmed that they have + * successfully started. + */ + set = ptlrpc_prep_set(); + if (set == NULL) + GOTO(failed, rc = -ENOMEM); + spin_lock(&pc->pc_lock); + pc->pc_set = set; + spin_unlock(&pc->pc_lock); + + /* Both client and server (MDT/OST) may use the environment. */ + rc = lu_context_init(&env.le_ctx, LCT_MD_THREAD | + LCT_DT_THREAD | + LCT_CL_THREAD | + LCT_REMEMBER | + LCT_NOREF); + if (rc != 0) + GOTO(failed, rc); + rc = lu_context_init(env.le_ses, LCT_SESSION | + LCT_REMEMBER | + LCT_NOREF); + if (rc != 0) { + lu_context_fini(&env.le_ctx); + GOTO(failed, rc); + } + + complete(&pc->pc_starting); + + /* + * This mainloop strongly resembles ptlrpc_set_wait() except that our + * set never completes. ptlrpcd_check() calls ptlrpc_check_set() when + * there are requests in the set. New requests come in on the set's + * new_req_list and ptlrpcd_check() moves them into the set. + */ + do { + DEFINE_WAIT_FUNC(wait, woken_wake_function); + time64_t timeout; + + timeout = cfs_time_seconds(ptlrpc_set_next_timeout(set)); + + lu_context_enter(&env.le_ctx); + lu_context_enter(env.le_ses); + + add_wait_queue(&set->set_waitq, &wait); + while (!ptlrpcd_check(&env, pc)) { + int ret; + + if (timeout == 0) + ret = wait_woken(&wait, TASK_IDLE, + MAX_SCHEDULE_TIMEOUT); + else { + ret = wait_woken(&wait, TASK_IDLE, timeout); + if (ret > 0) + timeout = ret; + } + if (ret != 0) + continue; + /* Timed out */ + ptlrpc_expired_set(set); + break; + } + remove_wait_queue(&set->set_waitq, &wait); + + lu_context_exit(&env.le_ctx); + lu_context_exit(env.le_ses); + + /* + * Abort inflight rpcs for forced stop case. + */ + if (test_bit(LIOD_STOP, &pc->pc_flags)) { + if (test_bit(LIOD_FORCE, &pc->pc_flags)) + ptlrpc_abort_set(set); + exit++; + } + + /* + * Let's make one more loop to make sure that ptlrpcd_check() + * copied all raced new rpcs into the set so we can kill them. + */ + } while (exit < 2); + + /* + * Wait for inflight requests to drain. 
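+ */
+
+/*
+ * Skeleton of the main loop used by ptlrpcd() above, reduced to the bare
+ * wait_woken() idiom: register on the wait queue, poll the work check,
+ * and feed the remaining timeout back into the next sleep.  has_work()
+ * and do_work() are hypothetical stand-ins for ptlrpcd_check(); a real
+ * build needs <linux/wait.h> and <linux/kthread.h>.
+ */
+#if 0
+extern bool has_work(void);
+extern void do_work(void);
+
+static int worker_thread(void *arg)
+{
+	struct wait_queue_head *wq = arg;
+
+	while (!kthread_should_stop()) {
+		DEFINE_WAIT_FUNC(wait, woken_wake_function);
+		long timeout = msecs_to_jiffies(1000);
+
+		add_wait_queue(wq, &wait);
+		while (!has_work()) {
+			/* returns the jiffies left, or 0 on timeout */
+			timeout = wait_woken(&wait, TASK_IDLE, timeout);
+			if (timeout == 0)
+				break;	/* timed out: fall through and poll */
+		}
+		remove_wait_queue(wq, &wait);
+		do_work();
+	}
+	return 0;
+}
+#endif
+
+/*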
+ */ + if (!list_empty(&set->set_requests)) + ptlrpc_set_wait(&env, set); + lu_context_fini(&env.le_ctx); + lu_context_fini(env.le_ses); + + complete(&pc->pc_finishing); + + return 0; + +failed: + pc->pc_error = rc; + complete(&pc->pc_starting); + RETURN(rc); +} + +static void ptlrpcd_ctl_init(struct ptlrpcd_ctl *pc, int index, int cpt) +{ + ENTRY; + + pc->pc_index = index; + pc->pc_cpt = cpt; + init_completion(&pc->pc_starting); + init_completion(&pc->pc_finishing); + spin_lock_init(&pc->pc_lock); + + if (index < 0) { + /* Recovery thread. */ + snprintf(pc->pc_name, sizeof(pc->pc_name), "ptlrpcd_rcv"); + } else { + /* Regular thread. */ + snprintf(pc->pc_name, sizeof(pc->pc_name), + "ptlrpcd_%02d_%02d", cpt, index); + } + + EXIT; +} + +/* XXX: We want multiple CPU cores to share the async RPC load. So we + * start many ptlrpcd threads. We also want to reduce the ptlrpcd + * overhead caused by data transfer cross-CPU cores. So we bind + * all ptlrpcd threads to a CPT, in the expectation that CPTs + * will be defined in a way that matches these boundaries. Within + * a CPT a ptlrpcd thread can be scheduled on any available core. + * + * Each ptlrpcd thread has its own request queue. This can cause + * response delay if the thread is already busy. To help with + * this we define partner threads: these are other threads bound + * to the same CPT which will check for work in each other's + * request queues if they have no work to do. + * + * The desired number of partner threads can be tuned by setting + * ptlrpcd_partner_group_size. The default is to create pairs of + * partner threads. + */ +static int ptlrpcd_partners(struct ptlrpcd *pd, int index) +{ + struct ptlrpcd_ctl *pc; + struct ptlrpcd_ctl **ppc; + int first; + int i; + int rc = 0; + + ENTRY; + + LASSERT(index >= 0 && index < pd->pd_nthreads); + pc = &pd->pd_threads[index]; + pc->pc_npartners = pd->pd_groupsize - 1; + + if (pc->pc_npartners <= 0) + GOTO(out, rc); + + OBD_CPT_ALLOC(pc->pc_partners, cfs_cpt_tab, pc->pc_cpt, + sizeof(struct ptlrpcd_ctl *) * pc->pc_npartners); + if (pc->pc_partners == NULL) { + pc->pc_npartners = 0; + GOTO(out, rc = -ENOMEM); + } + + first = index - index % pd->pd_groupsize; + ppc = pc->pc_partners; + for (i = first; i < first + pd->pd_groupsize; i++) { + if (i != index) + *ppc++ = &pd->pd_threads[i]; + } +out: + RETURN(rc); +} + +int ptlrpcd_start(struct ptlrpcd_ctl *pc) +{ + struct task_struct *task; + int rc = 0; + + ENTRY; + + /* + * Do not allow starting a second thread for one pc. 
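+ */
+
+/*
+ * Worked example of the partner arithmetic used by ptlrpcd_partners()
+ * above: thread index belongs to the group [index - index % groupsize,
+ * index - index % groupsize + groupsize), and its partners are every
+ * other member of that half-open range.  Stand-alone check:
+ */
+#if 0
+#include <stdio.h>
+
+int main(void)
+{
+	int groupsize = 2;	/* the default: partner pairs */
+
+	for (int index = 0; index < 6; index++) {
+		int first = index - index % groupsize;
+
+		printf("thread %d: group [%d..%d), partners all but %d\n",
+		       index, first, first + groupsize, index);
+	}
+	return 0;
+}
+#endif
+
+/*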
+ */ + if (test_and_set_bit(LIOD_START, &pc->pc_flags)) { + CWARN("Starting second thread (%s) for same pc %p\n", + pc->pc_name, pc); + RETURN(0); + } + + task = kthread_run(ptlrpcd, pc, "%s", pc->pc_name); + if (IS_ERR(task)) + GOTO(out_set, rc = PTR_ERR(task)); + + wait_for_completion(&pc->pc_starting); + rc = pc->pc_error; + if (rc != 0) + GOTO(out_set, rc); + + RETURN(0); + +out_set: + if (pc->pc_set != NULL) { + struct ptlrpc_request_set *set = pc->pc_set; + + spin_lock(&pc->pc_lock); + pc->pc_set = NULL; + spin_unlock(&pc->pc_lock); + ptlrpc_set_destroy(set); + } + clear_bit(LIOD_START, &pc->pc_flags); + RETURN(rc); +} + +void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force) +{ + ENTRY; + + if (!test_bit(LIOD_START, &pc->pc_flags)) { + CWARN("Thread for pc %p was not started\n", pc); + goto out; + } + + set_bit(LIOD_STOP, &pc->pc_flags); + if (force) + set_bit(LIOD_FORCE, &pc->pc_flags); + wake_up(&pc->pc_set->set_waitq); + +out: + EXIT; +} + +void ptlrpcd_free(struct ptlrpcd_ctl *pc) +{ + struct ptlrpc_request_set *set = pc->pc_set; + + ENTRY; + + if (!test_bit(LIOD_START, &pc->pc_flags)) { + CWARN("Thread for pc %p was not started\n", pc); + goto out; + } + + wait_for_completion(&pc->pc_finishing); + + spin_lock(&pc->pc_lock); + pc->pc_set = NULL; + spin_unlock(&pc->pc_lock); + ptlrpc_set_destroy(set); + + clear_bit(LIOD_START, &pc->pc_flags); + clear_bit(LIOD_STOP, &pc->pc_flags); + clear_bit(LIOD_FORCE, &pc->pc_flags); + +out: + if (pc->pc_npartners > 0) { + LASSERT(pc->pc_partners != NULL); + + OBD_FREE_PTR_ARRAY(pc->pc_partners, pc->pc_npartners); + pc->pc_partners = NULL; + } + pc->pc_npartners = 0; + pc->pc_error = 0; + EXIT; +} + +static void ptlrpcd_fini(void) +{ + int i; + int j; + int ncpts; + + ENTRY; + + if (ptlrpcds != NULL) { + for (i = 0; i < ptlrpcds_num; i++) { + if (ptlrpcds[i] == NULL) + break; + for (j = 0; j < ptlrpcds[i]->pd_nthreads; j++) + ptlrpcd_stop(&ptlrpcds[i]->pd_threads[j], 0); + for (j = 0; j < ptlrpcds[i]->pd_nthreads; j++) + ptlrpcd_free(&ptlrpcds[i]->pd_threads[j]); + OBD_FREE(ptlrpcds[i], ptlrpcds[i]->pd_size); + ptlrpcds[i] = NULL; + } + OBD_FREE_PTR_ARRAY(ptlrpcds, ptlrpcds_num); + } + ptlrpcds_num = 0; + + ptlrpcd_stop(&ptlrpcd_rcv, 0); + ptlrpcd_free(&ptlrpcd_rcv); + + if (ptlrpcds_cpt_idx != NULL) { + ncpts = cfs_cpt_number(cfs_cpt_tab); + OBD_FREE_PTR_ARRAY(ptlrpcds_cpt_idx, ncpts); + ptlrpcds_cpt_idx = NULL; + } + + EXIT; +} + +static int ptlrpcd_init(void) +{ + int nthreads; + int groupsize; + int size; + int i; + int j; + int rc = 0; + struct cfs_cpt_table *cptable; + __u32 *cpts = NULL; + int ncpts; + int cpt; + struct ptlrpcd *pd; + + ENTRY; + + /* + * Determine the CPTs that ptlrpcd threads will run on. + */ + cptable = cfs_cpt_tab; + ncpts = cfs_cpt_number(cptable); + if (ptlrpcd_cpts != NULL) { + struct cfs_expr_list *el; + + size = ncpts * sizeof(ptlrpcds_cpt_idx[0]); + OBD_ALLOC(ptlrpcds_cpt_idx, size); + if (ptlrpcds_cpt_idx == NULL) + GOTO(out, rc = -ENOMEM); + + rc = cfs_expr_list_parse(ptlrpcd_cpts, + strlen(ptlrpcd_cpts), + 0, ncpts - 1, &el); + if (rc != 0) { + CERROR("%s: invalid CPT pattern string: %s", + "ptlrpcd_cpts", ptlrpcd_cpts); + GOTO(out, rc = -EINVAL); + } + + rc = cfs_expr_list_values(el, ncpts, &cpts); + cfs_expr_list_free(el); + if (rc <= 0) { + CERROR("%s: failed to parse CPT array %s: %d\n", + "ptlrpcd_cpts", ptlrpcd_cpts, rc); + if (rc == 0) + rc = -EINVAL; + GOTO(out, rc); + } + + /* + * Create the cpt-to-index map. When there is no match + * in the cpt table, pick a cpt at random. 
This could + * be changed to take the topology of the system into + * account. + */ + for (cpt = 0; cpt < ncpts; cpt++) { + for (i = 0; i < rc; i++) + if (cpts[i] == cpt) + break; + if (i >= rc) + i = cpt % rc; + ptlrpcds_cpt_idx[cpt] = i; + } + + cfs_expr_list_values_free(cpts, rc); + ncpts = rc; + } + ptlrpcds_num = ncpts; + + size = ncpts * sizeof(ptlrpcds[0]); + OBD_ALLOC(ptlrpcds, size); + if (ptlrpcds == NULL) + GOTO(out, rc = -ENOMEM); + + /* + * The max_ptlrpcds parameter is obsolete, but do something + * sane if it has been tuned, and complain if + * ptlrpcd_per_cpt_max has also been tuned. + */ + if (max_ptlrpcds != 0) { + CWARN("max_ptlrpcds is obsolete.\n"); + if (ptlrpcd_per_cpt_max == 0) { + ptlrpcd_per_cpt_max = max_ptlrpcds / ncpts; + /* Round up if there is a remainder. */ + if (max_ptlrpcds % ncpts != 0) + ptlrpcd_per_cpt_max++; + CWARN("Setting ptlrpcd_per_cpt_max = %d\n", + ptlrpcd_per_cpt_max); + } else { + CWARN("ptlrpd_per_cpt_max is also set!\n"); + } + } + + /* + * The ptlrpcd_bind_policy parameter is obsolete, but do + * something sane if it has been tuned, and complain if + * ptlrpcd_partner_group_size is also tuned. + */ + if (ptlrpcd_bind_policy != 0) { + CWARN("ptlrpcd_bind_policy is obsolete.\n"); + if (ptlrpcd_partner_group_size == 0) { + switch (ptlrpcd_bind_policy) { + case 1: /* PDB_POLICY_NONE */ + case 2: /* PDB_POLICY_FULL */ + ptlrpcd_partner_group_size = 1; + break; + case 3: /* PDB_POLICY_PAIR */ + ptlrpcd_partner_group_size = 2; + break; + case 4: /* PDB_POLICY_NEIGHBOR */ +#ifdef CONFIG_NUMA + ptlrpcd_partner_group_size = -1; /* CPT */ +#else + ptlrpcd_partner_group_size = 3; /* Triplets */ +#endif + break; + default: /* Illegal value, use the default. */ + ptlrpcd_partner_group_size = 2; + break; + } + CWARN("Setting ptlrpcd_partner_group_size = %d\n", + ptlrpcd_partner_group_size); + } else { + CWARN("ptlrpcd_partner_group_size is also set!\n"); + } + } + + if (ptlrpcd_partner_group_size == 0) + ptlrpcd_partner_group_size = 2; + else if (ptlrpcd_partner_group_size < 0) + ptlrpcd_partner_group_size = -1; + else if (ptlrpcd_per_cpt_max > 0 && + ptlrpcd_partner_group_size > ptlrpcd_per_cpt_max) + ptlrpcd_partner_group_size = ptlrpcd_per_cpt_max; + + /* + * Start the recovery thread first. + */ + set_bit(LIOD_RECOVERY, &ptlrpcd_rcv.pc_flags); + ptlrpcd_ctl_init(&ptlrpcd_rcv, -1, CFS_CPT_ANY); + rc = ptlrpcd_start(&ptlrpcd_rcv); + if (rc < 0) + GOTO(out, rc); + + for (i = 0; i < ncpts; i++) { + if (cpts == NULL) + cpt = i; + else + cpt = cpts[i]; + + nthreads = cfs_cpt_weight(cptable, cpt); + if (ptlrpcd_per_cpt_max > 0 && ptlrpcd_per_cpt_max < nthreads) + nthreads = ptlrpcd_per_cpt_max; + if (nthreads < 2) + nthreads = 2; + + if (ptlrpcd_partner_group_size <= 0) { + groupsize = nthreads; + } else if (nthreads <= ptlrpcd_partner_group_size) { + groupsize = nthreads; + } else { + groupsize = ptlrpcd_partner_group_size; + if (nthreads % groupsize != 0) + nthreads += groupsize - (nthreads % groupsize); + } + + size = offsetof(struct ptlrpcd, pd_threads[nthreads]); + OBD_CPT_ALLOC(pd, cptable, cpt, size); + + if (!pd) + GOTO(out, rc = -ENOMEM); + pd->pd_size = size; + pd->pd_index = i; + pd->pd_cpt = cpt; + pd->pd_cursor = 0; + pd->pd_nthreads = nthreads; + pd->pd_groupsize = groupsize; + ptlrpcds[i] = pd; + + /* + * The ptlrpcd threads in a partner group can access + * each other's struct ptlrpcd_ctl, so these must be + * initialized before any thead is started. 
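+ */
+
+/*
+ * The two round-up idioms used by ptlrpcd_init() above, checked
+ * stand-alone: ceil(max_ptlrpcds / ncpts) via "divide, then bump on a
+ * remainder", and rounding nthreads up to the next multiple of
+ * groupsize.  The sample values are arbitrary.
+ */
+#if 0
+#include <assert.h>
+
+int main(void)
+{
+	int max_ptlrpcds = 10, ncpts = 4;
+	int per_cpt = max_ptlrpcds / ncpts;		/* 2 */
+
+	if (max_ptlrpcds % ncpts != 0)
+		per_cpt++;				/* ceil(10/4) == 3 */
+	assert(per_cpt == 3);
+
+	int nthreads = 7, groupsize = 3;
+
+	if (nthreads % groupsize != 0)
+		nthreads += groupsize - (nthreads % groupsize);
+	assert(nthreads == 9);				/* next multiple of 3 */
+	return 0;
+}
+#endif
+
+/*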
+ */ + for (j = 0; j < nthreads; j++) { + ptlrpcd_ctl_init(&pd->pd_threads[j], j, cpt); + rc = ptlrpcd_partners(pd, j); + if (rc < 0) + GOTO(out, rc); + } + + /* XXX: We start nthreads ptlrpc daemons on this cpt. + * Each of them can process any non-recovery + * async RPC to improve overall async RPC + * efficiency. + * + * But there are some issues with async I/O RPCs + * and async non-I/O RPCs processed in the same + * set under some cases. The ptlrpcd may be + * blocked by some async I/O RPC(s), then will + * cause other async non-I/O RPC(s) can not be + * processed in time. + * + * Maybe we should distinguish blocked async RPCs + * from non-blocked async RPCs, and process them + * in different ptlrpcd sets to avoid unnecessary + * dependency. But how to distribute async RPCs + * load among all the ptlrpc daemons becomes + * another trouble. + */ + for (j = 0; j < nthreads; j++) { + rc = ptlrpcd_start(&pd->pd_threads[j]); + if (rc < 0) + GOTO(out, rc); + } + } +out: + if (rc != 0) + ptlrpcd_fini(); + + RETURN(rc); +} + +int ptlrpcd_addref(void) +{ + int rc = 0; + + ENTRY; + + mutex_lock(&ptlrpcd_mutex); + if (++ptlrpcd_users == 1) { + rc = ptlrpcd_init(); + if (rc < 0) + ptlrpcd_users--; + } + mutex_unlock(&ptlrpcd_mutex); + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpcd_addref); + +void ptlrpcd_decref(void) +{ + mutex_lock(&ptlrpcd_mutex); + if (--ptlrpcd_users == 0) + ptlrpcd_fini(); + mutex_unlock(&ptlrpcd_mutex); +} +EXPORT_SYMBOL(ptlrpcd_decref); +/** @} ptlrpcd */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/recover.c b/drivers/staging/lustrefsx/lustre/ptlrpc/recover.c new file mode 100644 index 0000000000000..c44b56c767885 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/recover.c @@ -0,0 +1,377 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ptlrpc/recover.c + * + * Author: Mike Shaver + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +/** + * Start recovery on disconnected import. + * This is done by just attempting a connect + */ +void ptlrpc_initiate_recovery(struct obd_import *imp) +{ + ENTRY; + + CDEBUG(D_HA, "%s: starting recovery\n", obd2cli_tgt(imp->imp_obd)); + ptlrpc_connect_import(imp); + + EXIT; +} + +/** + * Identify what request from replay list needs to be replayed next + * (based on what we have already replayed) and send it to server. 
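+ */
+
+/*
+ * Illustrative sketch, not from the Lustre tree: the
+ * ptlrpcd_addref()/ptlrpcd_decref() pair above is the mutex-protected
+ * lazy singleton -- the first user initializes, the last user tears
+ * down, and a failed init must roll the count back so a later caller
+ * can retry.  resource_init()/resource_fini() are hypothetical.
+ */
+#if 0
+extern int resource_init(void);
+extern void resource_fini(void);
+
+static DEFINE_MUTEX(users_mutex);
+static int users;
+
+static int resource_get(void)
+{
+	int rc = 0;
+
+	mutex_lock(&users_mutex);
+	if (++users == 1) {
+		rc = resource_init();
+		if (rc)
+			users--;	/* undo so the next caller retries */
+	}
+	mutex_unlock(&users_mutex);
+	return rc;
+}
+
+static void resource_put(void)
+{
+	mutex_lock(&users_mutex);
+	if (--users == 0)
+		resource_fini();	/* last user tears down */
+	mutex_unlock(&users_mutex);
+}
+#endif
+
+/*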
+ */
+int ptlrpc_replay_next(struct obd_import *imp, int *inflight)
+{
+	int rc = 0;
+	struct ptlrpc_request *req = NULL;
+	__u64 last_transno;
+	ENTRY;
+
+	*inflight = 0;
+
+	/* The server might have committed some requests after we last spoke,
+	 * so make sure we get rid of them now.
+	 */
+	spin_lock(&imp->imp_lock);
+	imp->imp_last_transno_checked = 0;
+	ptlrpc_free_committed(imp);
+	last_transno = imp->imp_last_replay_transno;
+
+	CDEBUG(D_HA, "import %p from %s committed %llu last %llu\n",
+	       imp, obd2cli_tgt(imp->imp_obd),
+	       imp->imp_peer_committed_transno, last_transno);
+
+	/* Replay all the committed open requests on committed_list first */
+	if (!list_empty(&imp->imp_committed_list)) {
+		req = list_last_entry(&imp->imp_committed_list,
+				      struct ptlrpc_request, rq_replay_list);
+
+		/* The last request on committed_list hasn't been replayed */
+		if (req->rq_transno > last_transno) {
+			if (!imp->imp_resend_replay ||
+			    imp->imp_replay_cursor == &imp->imp_committed_list)
+				imp->imp_replay_cursor =
+					imp->imp_replay_cursor->next;
+
+			while (imp->imp_replay_cursor !=
+			       &imp->imp_committed_list) {
+				req = list_entry(imp->imp_replay_cursor,
+						 struct ptlrpc_request,
+						 rq_replay_list);
+				if (req->rq_transno > last_transno)
+					break;
+
+				req = NULL;
+				LASSERT(!list_empty(imp->imp_replay_cursor));
+				imp->imp_replay_cursor =
+					imp->imp_replay_cursor->next;
+			}
+		} else {
+			/* All requests on committed_list have been replayed */
+			imp->imp_replay_cursor = &imp->imp_committed_list;
+			req = NULL;
+		}
+	}
+
+	/* All the requests in committed list have been replayed, let's replay
+	 * the imp_replay_list */
+	if (req == NULL) {
+		struct ptlrpc_request *tmp;
+
+		list_for_each_entry(tmp, &imp->imp_replay_list,
+				    rq_replay_list) {
+			if (tmp->rq_transno > last_transno) {
+				req = tmp;
+				break;
+			}
+		}
+	}
+
+	/* If we need to resend the last sent transno (because a reconnect
+	 * has occurred), then stop on the matching req and send it again.
+	 * If, however, the last sent transno has been committed then we
+	 * continue replay from the next request. */
+	if (req != NULL && imp->imp_resend_replay)
+		lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT);
+
+	/* ptlrpc_prepare_replay() may fail to add the request into the
+	 * unreplied list if the request hasn't been added to the replay list
+	 * yet. Another exception is that the resend replay request could
+	 * have been removed from the unreplied list. */
+	if (req != NULL && list_empty(&req->rq_unreplied_list)) {
+		DEBUG_REQ(D_HA, req, "resend_replay=%d, last_transno=%llu",
+			  imp->imp_resend_replay, last_transno);
+		ptlrpc_add_unreplied(req);
+		imp->imp_known_replied_xid = ptlrpc_known_replied_xid(imp);
+	}
+
+	imp->imp_resend_replay = 0;
+	spin_unlock(&imp->imp_lock);
+
+	if (req != NULL) {
+		LASSERT(!list_empty(&req->rq_unreplied_list));
+
+		rc = ptlrpc_replay_req(req);
+		if (rc) {
+			CERROR("recovery replay error %d for req %llu\n",
+			       rc, req->rq_xid);
+			RETURN(rc);
+		}
+		*inflight = 1;
+	}
+	RETURN(rc);
+}
+
+/**
+ * Schedule resending of requests on the sending_list. This is done after
+ * we completed replaying of requests and locks.
+ */
+int ptlrpc_resend(struct obd_import *imp)
+{
+	struct ptlrpc_request *req;
+
+	ENTRY;
+
+	/* As long as we're in recovery, nothing should be added to the sending
+	 * list, so we don't need to hold the lock during this iteration and
+	 * resend process.
+	 */
+	/* Well... what if lctl recover is called twice at the same time?
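+ */
+
+/*
+ * Illustrative sketch, not from the Lustre tree: once the cursor logic
+ * above is peeled away, replay selection is "find the first request
+ * whose transno is greater than the last transno replayed" in a
+ * transno-ordered list.  struct replay_req is hypothetical.
+ */
+#if 0
+struct replay_req {
+	unsigned long long transno;
+	struct replay_req *next;	/* kept in ascending transno order */
+};
+
+static struct replay_req *next_to_replay(struct replay_req *head,
+					 unsigned long long last_transno)
+{
+	for (struct replay_req *r = head; r; r = r->next)
+		if (r->transno > last_transno)
+			return r;	/* everything before it was replayed */
+	return NULL;			/* fully caught up */
+}
+#endif
+
+/*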
+ */ + spin_lock(&imp->imp_lock); + if (imp->imp_state != LUSTRE_IMP_RECOVER) { + spin_unlock(&imp->imp_lock); + RETURN(-1); + } + + list_for_each_entry(req, &imp->imp_sending_list, rq_list) { + LASSERTF((long)req > PAGE_SIZE && req != LP_POISON, + "req %p bad\n", req); + LASSERTF(req->rq_type != LI_POISON, "req %p freed\n", req); + + /* If the request is allowed to be sent during replay and it + * is not timeout yet, then it does not need to be resent. */ + if (!ptlrpc_no_resend(req) && + (req->rq_timedout || !req->rq_allow_replay)) + ptlrpc_resend_req(req); + } + spin_unlock(&imp->imp_lock); + + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT, 2); + RETURN(0); +} + +/** + * Go through all requests in delayed list and wake their threads + * for resending + */ +void ptlrpc_wake_delayed(struct obd_import *imp) +{ + struct ptlrpc_request *req; + + spin_lock(&imp->imp_lock); + list_for_each_entry(req, &imp->imp_delayed_list, rq_list) { + DEBUG_REQ(D_HA, req, "waking (set %p):", req->rq_set); + ptlrpc_client_wake_req(req); + } + spin_unlock(&imp->imp_lock); +} + +void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req) +{ + struct obd_import *imp = failed_req->rq_import; + int conn = lustre_msg_get_conn_cnt(failed_req->rq_reqmsg); + ENTRY; + + CDEBUG(D_HA, "import %s of %s@%s abruptly disconnected: reconnecting\n", + imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + + if (ptlrpc_set_import_discon(imp, conn, true)) { + /* to control recovery via lctl {disable|enable}_recovery */ + if (imp->imp_deactive == 0) + ptlrpc_connect_import(imp); + } + + /* Wait for recovery to complete and resend. If evicted, then + this request will be errored out later.*/ + spin_lock(&failed_req->rq_lock); + if (!failed_req->rq_no_resend) + failed_req->rq_resend = 1; + spin_unlock(&failed_req->rq_lock); + + EXIT; +} + +/** + * Administratively active/deactive a client. + * This should only be called by the ioctl interface, currently + * - the lctl deactivate and activate commands + * - echo 0/1 >> /proc/osc/XXX/active + * - client umount -f (ll_umount_begin) + */ +int ptlrpc_set_import_active(struct obd_import *imp, int active) +{ + struct obd_device *obd = imp->imp_obd; + int rc = 0; + + ENTRY; + LASSERT(obd); + + /* When deactivating, mark import invalid, and abort in-flight + * requests. 
*/ + if (!active) { + LCONSOLE_WARN("setting import %s INACTIVE by administrator " + "request\n", obd2cli_tgt(imp->imp_obd)); + + /* set before invalidate to avoid messages about imp_inval + * set without imp_deactive in ptlrpc_import_delay_req */ + spin_lock(&imp->imp_lock); + imp->imp_deactive = 1; + spin_unlock(&imp->imp_lock); + + obd_import_event(imp->imp_obd, imp, IMP_EVENT_DEACTIVATE); + + ptlrpc_invalidate_import(imp); + } + + /* When activating, mark import valid, and attempt recovery */ + if (active) { + CDEBUG(D_HA, "setting import %s VALID\n", + obd2cli_tgt(imp->imp_obd)); + + spin_lock(&imp->imp_lock); + imp->imp_deactive = 0; + spin_unlock(&imp->imp_lock); + obd_import_event(imp->imp_obd, imp, IMP_EVENT_ACTIVATE); + + rc = ptlrpc_recover_import(imp, NULL, 0); + } + + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpc_set_import_active); + +/* Attempt to reconnect an import */ +int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async) +{ + int rc = 0; + ENTRY; + + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_NEW || imp->imp_deactive || + atomic_read(&imp->imp_inval_count)) + rc = -EINVAL; + spin_unlock(&imp->imp_lock); + if (rc) + GOTO(out, rc); + + /* force import to be disconnected. */ + ptlrpc_set_import_discon(imp, 0, false); + + if (new_uuid) { + struct obd_uuid uuid; + + /* intruct import to use new uuid */ + obd_str2uuid(&uuid, new_uuid); + rc = import_set_conn_priority(imp, &uuid); + if (rc) + GOTO(out, rc); + } + + /* Check if reconnect is already in progress */ + spin_lock(&imp->imp_lock); + if (imp->imp_state != LUSTRE_IMP_DISCON) { + imp->imp_force_verify = 1; + rc = -EALREADY; + } + spin_unlock(&imp->imp_lock); + if (rc) + GOTO(out, rc); + + OBD_RACE(OBD_FAIL_PTLRPC_CONNECT_RACE); + + rc = ptlrpc_connect_import(imp); + if (rc) + GOTO(out, rc); + + if (!async) { + long timeout = cfs_time_seconds(obd_timeout); + + CDEBUG(D_HA, "%s: recovery started, waiting %u jiffies\n", + obd2cli_tgt(imp->imp_obd), obd_timeout); + + rc = wait_event_idle_timeout(imp->imp_recovery_waitq, + !ptlrpc_import_in_recovery(imp), + timeout); + if (rc == 0) + rc = -ETIMEDOUT; + else + rc = 0; + CDEBUG(D_HA, "%s: recovery finished\n", + obd2cli_tgt(imp->imp_obd)); + } + EXIT; + +out: + return rc; +} +EXPORT_SYMBOL(ptlrpc_recover_import); + +int ptlrpc_import_in_recovery(struct obd_import *imp) +{ + int in_recovery = 1; + + spin_lock(&imp->imp_lock); + if (imp->imp_state <= LUSTRE_IMP_DISCON || + imp->imp_state >= LUSTRE_IMP_FULL || + imp->imp_obd->obd_no_recov) + in_recovery = 0; + spin_unlock(&imp->imp_lock); + + return in_recovery; +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec.c new file mode 100644 index 0000000000000..d126df52518c8 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec.c @@ -0,0 +1,2762 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ptlrpc/sec.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +static int send_sepol; +module_param(send_sepol, int, 0644); +MODULE_PARM_DESC(send_sepol, "Client sends SELinux policy status"); + +/* + * policy registers + */ + +static rwlock_t policy_lock; +static struct ptlrpc_sec_policy *policies[SPTLRPC_POLICY_MAX] = { + NULL, +}; + +int sptlrpc_register_policy(struct ptlrpc_sec_policy *policy) +{ + __u16 number = policy->sp_policy; + + LASSERT(policy->sp_name); + LASSERT(policy->sp_cops); + LASSERT(policy->sp_sops); + + if (number >= SPTLRPC_POLICY_MAX) + return -EINVAL; + + write_lock(&policy_lock); + if (unlikely(policies[number])) { + write_unlock(&policy_lock); + return -EALREADY; + } + policies[number] = policy; + write_unlock(&policy_lock); + + CDEBUG(D_SEC, "%s: registered\n", policy->sp_name); + return 0; +} +EXPORT_SYMBOL(sptlrpc_register_policy); + +int sptlrpc_unregister_policy(struct ptlrpc_sec_policy *policy) +{ + __u16 number = policy->sp_policy; + + LASSERT(number < SPTLRPC_POLICY_MAX); + + write_lock(&policy_lock); + if (unlikely(policies[number] == NULL)) { + write_unlock(&policy_lock); + CERROR("%s: already unregistered\n", policy->sp_name); + return -EINVAL; + } + + LASSERT(policies[number] == policy); + policies[number] = NULL; + write_unlock(&policy_lock); + + CDEBUG(D_SEC, "%s: unregistered\n", policy->sp_name); + return 0; +} +EXPORT_SYMBOL(sptlrpc_unregister_policy); + +static +struct ptlrpc_sec_policy *sptlrpc_wireflavor2policy(__u32 flavor) +{ + static DEFINE_MUTEX(load_mutex); + static atomic_t loaded = ATOMIC_INIT(0); + struct ptlrpc_sec_policy *policy; + __u16 number = SPTLRPC_FLVR_POLICY(flavor); + __u16 flag = 0; + + if (number >= SPTLRPC_POLICY_MAX) + return NULL; + + while (1) { + read_lock(&policy_lock); + policy = policies[number]; + if (policy && !try_module_get(policy->sp_owner)) + policy = NULL; + if (policy == NULL) + flag = atomic_read(&loaded); + read_unlock(&policy_lock); + + if (policy != NULL || flag != 0 || + number != SPTLRPC_POLICY_GSS) + break; + + /* try to load gss module, once */ + mutex_lock(&load_mutex); + if (atomic_read(&loaded) == 0) { + if (request_module("ptlrpc_gss") == 0) + CDEBUG(D_SEC, + "module ptlrpc_gss loaded on demand\n"); + else + CERROR("Unable to load module ptlrpc_gss\n"); + + atomic_set(&loaded, 1); + } + mutex_unlock(&load_mutex); + } + + return policy; +} + +__u32 sptlrpc_name2flavor_base(const char *name) +{ + if (!strcmp(name, "null")) + return SPTLRPC_FLVR_NULL; + if (!strcmp(name, "plain")) + return SPTLRPC_FLVR_PLAIN; + if (!strcmp(name, "gssnull")) + return SPTLRPC_FLVR_GSSNULL; + if (!strcmp(name, "krb5n")) + return SPTLRPC_FLVR_KRB5N; + if (!strcmp(name, "krb5a")) + return SPTLRPC_FLVR_KRB5A; + if (!strcmp(name, "krb5i")) + return SPTLRPC_FLVR_KRB5I; + if (!strcmp(name, "krb5p")) + return SPTLRPC_FLVR_KRB5P; + if (!strcmp(name, "skn")) + return SPTLRPC_FLVR_SKN; + if (!strcmp(name, "ska")) + 
return SPTLRPC_FLVR_SKA; + if (!strcmp(name, "ski")) + return SPTLRPC_FLVR_SKI; + if (!strcmp(name, "skpi")) + return SPTLRPC_FLVR_SKPI; + + return SPTLRPC_FLVR_INVALID; +} +EXPORT_SYMBOL(sptlrpc_name2flavor_base); + +const char *sptlrpc_flavor2name_base(__u32 flvr) +{ + __u32 base = SPTLRPC_FLVR_BASE(flvr); + + if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_NULL)) + return "null"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_PLAIN)) + return "plain"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_GSSNULL)) + return "gssnull"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5N)) + return "krb5n"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5A)) + return "krb5a"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5I)) + return "krb5i"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5P)) + return "krb5p"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_SKN)) + return "skn"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_SKA)) + return "ska"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_SKI)) + return "ski"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_SKPI)) + return "skpi"; + + CERROR("invalid wire flavor 0x%x\n", flvr); + return "invalid"; +} +EXPORT_SYMBOL(sptlrpc_flavor2name_base); + +char *sptlrpc_flavor2name_bulk(struct sptlrpc_flavor *sf, + char *buf, int bufsize) +{ + if (SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN) + snprintf(buf, bufsize, "hash:%s", + sptlrpc_get_hash_name(sf->u_bulk.hash.hash_alg)); + else + snprintf(buf, bufsize, "%s", + sptlrpc_flavor2name_base(sf->sf_rpc)); + + buf[bufsize - 1] = '\0'; + return buf; +} +EXPORT_SYMBOL(sptlrpc_flavor2name_bulk); + +char *sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize) +{ + snprintf(buf, bufsize, "%s", sptlrpc_flavor2name_base(sf->sf_rpc)); + + /* + * currently we don't support customized bulk specification for + * flavors other than plain + */ + if (SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN) { + char bspec[16]; + + bspec[0] = '-'; + sptlrpc_flavor2name_bulk(sf, &bspec[1], sizeof(bspec) - 1); + strncat(buf, bspec, bufsize); + } + + buf[bufsize - 1] = '\0'; + return buf; +} +EXPORT_SYMBOL(sptlrpc_flavor2name); + +char *sptlrpc_secflags2str(__u32 flags, char *buf, int bufsize) +{ + buf[0] = '\0'; + + if (flags & PTLRPC_SEC_FL_REVERSE) + strlcat(buf, "reverse,", bufsize); + if (flags & PTLRPC_SEC_FL_ROOTONLY) + strlcat(buf, "rootonly,", bufsize); + if (flags & PTLRPC_SEC_FL_UDESC) + strlcat(buf, "udesc,", bufsize); + if (flags & PTLRPC_SEC_FL_BULK) + strlcat(buf, "bulk,", bufsize); + if (buf[0] == '\0') + strlcat(buf, "-,", bufsize); + + return buf; +} +EXPORT_SYMBOL(sptlrpc_secflags2str); + +/* + * client context APIs + */ + +static +struct ptlrpc_cli_ctx *get_my_ctx(struct ptlrpc_sec *sec) +{ + struct vfs_cred vcred; + int create = 1, remove_dead = 1; + + LASSERT(sec); + LASSERT(sec->ps_policy->sp_cops->lookup_ctx); + + if (sec->ps_flvr.sf_flags & (PTLRPC_SEC_FL_REVERSE | + PTLRPC_SEC_FL_ROOTONLY)) { + vcred.vc_uid = 0; + vcred.vc_gid = 0; + if (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_REVERSE) { + create = 0; + remove_dead = 0; + } + } else { + vcred.vc_uid = from_kuid(&init_user_ns, current_uid()); + vcred.vc_gid = from_kgid(&init_user_ns, current_gid()); + } + + return sec->ps_policy->sp_cops->lookup_ctx(sec, &vcred, create, + remove_dead); +} + +struct ptlrpc_cli_ctx *sptlrpc_cli_ctx_get(struct ptlrpc_cli_ctx *ctx) +{ + atomic_inc(&ctx->cc_refcount); + return ctx; +} +EXPORT_SYMBOL(sptlrpc_cli_ctx_get); + +void sptlrpc_cli_ctx_put(struct 
ptlrpc_cli_ctx *ctx, int sync) +{ + struct ptlrpc_sec *sec = ctx->cc_sec; + + LASSERT(sec); + LASSERT_ATOMIC_POS(&ctx->cc_refcount); + + if (!atomic_dec_and_test(&ctx->cc_refcount)) + return; + + sec->ps_policy->sp_cops->release_ctx(sec, ctx, sync); +} +EXPORT_SYMBOL(sptlrpc_cli_ctx_put); + +/** + * Expire the client context immediately. + * + * \pre Caller must hold at least 1 reference on the \a ctx. + */ +void sptlrpc_cli_ctx_expire(struct ptlrpc_cli_ctx *ctx) +{ + LASSERT(ctx->cc_ops->die); + ctx->cc_ops->die(ctx, 0); +} +EXPORT_SYMBOL(sptlrpc_cli_ctx_expire); + +/** + * To wake up the threads who are waiting for this client context. Called + * after some status change happened on \a ctx. + */ +void sptlrpc_cli_ctx_wakeup(struct ptlrpc_cli_ctx *ctx) +{ + struct ptlrpc_request *req, *next; + + spin_lock(&ctx->cc_lock); + list_for_each_entry_safe(req, next, &ctx->cc_req_list, + rq_ctx_chain) { + list_del_init(&req->rq_ctx_chain); + ptlrpc_client_wake_req(req); + } + spin_unlock(&ctx->cc_lock); +} +EXPORT_SYMBOL(sptlrpc_cli_ctx_wakeup); + +int sptlrpc_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize) +{ + LASSERT(ctx->cc_ops); + + if (ctx->cc_ops->display == NULL) + return 0; + + return ctx->cc_ops->display(ctx, buf, bufsize); +} + +static int import_sec_check_expire(struct obd_import *imp) +{ + int adapt = 0; + + write_lock(&imp->imp_sec_lock); + if (imp->imp_sec_expire && + imp->imp_sec_expire < ktime_get_real_seconds()) { + adapt = 1; + imp->imp_sec_expire = 0; + } + write_unlock(&imp->imp_sec_lock); + + if (!adapt) + return 0; + + CDEBUG(D_SEC, "found delayed sec adapt expired, do it now\n"); + return sptlrpc_import_sec_adapt(imp, NULL, NULL); +} + +/** + * Get and validate the client side ptlrpc security facilities from + * \a imp. There is a race condition on client reconnect when the import is + * being destroyed while there are outstanding client bound requests. In + * this case do not output any error messages if import secuity is not + * found. + * + * \param[in] imp obd import associated with client + * \param[out] sec client side ptlrpc security + * + * \retval 0 if security retrieved successfully + * \retval -ve errno if there was a problem + */ +static int import_sec_validate_get(struct obd_import *imp, + struct ptlrpc_sec **sec) +{ + int rc; + + if (unlikely(imp->imp_sec_expire)) { + rc = import_sec_check_expire(imp); + if (rc) + return rc; + } + + *sec = sptlrpc_import_sec_ref(imp); + if (*sec == NULL) { + /* Only output an error when the import is still active */ + if (!test_bit(WORK_STRUCT_PENDING_BIT, + work_data_bits(&imp->imp_zombie_work))) + CERROR("import %p (%s) with no sec\n", + imp, ptlrpc_import_state_name(imp->imp_state)); + return -EACCES; + } + + if (unlikely((*sec)->ps_dying)) { + CERROR("attempt to use dying sec %p\n", sec); + sptlrpc_sec_put(*sec); + return -EACCES; + } + + return 0; +} + +/** + * Given a \a req, find or allocate an appropriate context for it. + * \pre req->rq_cli_ctx == NULL. + * + * \retval 0 succeed, and req->rq_cli_ctx is set. + * \retval -ev error number, and req->rq_cli_ctx == NULL. 
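+ */
+
+/*
+ * Illustrative sketch, not from the Lustre tree: the get/put pair above
+ * is the standard atomic refcount idiom -- only the caller whose put
+ * takes the count to zero may release the object, so no lock is needed
+ * around the release decision.  struct ctx and ctx_release() are
+ * hypothetical.
+ */
+#if 0
+struct ctx {
+	atomic_t refcount;
+};
+
+extern void ctx_release(struct ctx *c);
+
+static struct ctx *ctx_get(struct ctx *c)
+{
+	atomic_inc(&c->refcount);
+	return c;
+}
+
+static void ctx_put(struct ctx *c)
+{
+	/* true only for the thread that drops the last reference */
+	if (atomic_dec_and_test(&c->refcount))
+		ctx_release(c);
+}
+#endif
+
+/*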
+ */ +int sptlrpc_req_get_ctx(struct ptlrpc_request *req) +{ + struct obd_import *imp = req->rq_import; + struct ptlrpc_sec *sec; + int rc; + + ENTRY; + + LASSERT(!req->rq_cli_ctx); + LASSERT(imp); + + rc = import_sec_validate_get(imp, &sec); + if (rc) + RETURN(rc); + + req->rq_cli_ctx = get_my_ctx(sec); + + sptlrpc_sec_put(sec); + + if (!req->rq_cli_ctx) { + CERROR("req %p: fail to get context\n", req); + RETURN(-ECONNREFUSED); + } + + RETURN(0); +} + +/** + * Drop the context for \a req. + * \pre req->rq_cli_ctx != NULL. + * \post req->rq_cli_ctx == NULL. + * + * If \a sync == 0, this function should return quickly without sleep; + * otherwise it might trigger and wait for the whole process of sending + * an context-destroying rpc to server. + */ +void sptlrpc_req_put_ctx(struct ptlrpc_request *req, int sync) +{ + ENTRY; + + LASSERT(req); + LASSERT(req->rq_cli_ctx); + + /* + * request might be asked to release earlier while still + * in the context waiting list. + */ + if (!list_empty(&req->rq_ctx_chain)) { + spin_lock(&req->rq_cli_ctx->cc_lock); + list_del_init(&req->rq_ctx_chain); + spin_unlock(&req->rq_cli_ctx->cc_lock); + } + + sptlrpc_cli_ctx_put(req->rq_cli_ctx, sync); + req->rq_cli_ctx = NULL; + EXIT; +} + +static +int sptlrpc_req_ctx_switch(struct ptlrpc_request *req, + struct ptlrpc_cli_ctx *oldctx, + struct ptlrpc_cli_ctx *newctx) +{ + struct sptlrpc_flavor old_flvr; + char *reqmsg = NULL; /* to workaround old gcc */ + int reqmsg_size; + int rc = 0; + + CDEBUG(D_SEC, + "req %p: switch ctx %p(%u->%s) -> %p(%u->%s), switch sec %p(%s) -> %p(%s)\n", + req, oldctx, oldctx->cc_vcred.vc_uid, + sec2target_str(oldctx->cc_sec), newctx, newctx->cc_vcred.vc_uid, + sec2target_str(newctx->cc_sec), oldctx->cc_sec, + oldctx->cc_sec->ps_policy->sp_name, newctx->cc_sec, + newctx->cc_sec->ps_policy->sp_name); + + /* save flavor */ + old_flvr = req->rq_flvr; + + /* save request message */ + reqmsg_size = req->rq_reqlen; + if (reqmsg_size != 0) { + LASSERT(req->rq_reqmsg); + OBD_ALLOC_LARGE(reqmsg, reqmsg_size); + if (reqmsg == NULL) + return -ENOMEM; + memcpy(reqmsg, req->rq_reqmsg, reqmsg_size); + } + + /* release old req/rep buf */ + req->rq_cli_ctx = oldctx; + sptlrpc_cli_free_reqbuf(req); + sptlrpc_cli_free_repbuf(req); + req->rq_cli_ctx = newctx; + + /* recalculate the flavor */ + sptlrpc_req_set_flavor(req, 0); + + /* + * alloc new request buffer + * we don't need to alloc reply buffer here, leave it to the + * rest procedure of ptlrpc + */ + if (reqmsg_size != 0) { + rc = sptlrpc_cli_alloc_reqbuf(req, reqmsg_size); + if (!rc) { + LASSERT(req->rq_reqmsg); + memcpy(req->rq_reqmsg, reqmsg, reqmsg_size); + } else { + CWARN("failed to alloc reqbuf: %d\n", rc); + req->rq_flvr = old_flvr; + } + + OBD_FREE_LARGE(reqmsg, reqmsg_size); + } + return rc; +} + +/** + * If current context of \a req is dead somehow, e.g. we just switched flavor + * thus marked original contexts dead, we'll find a new context for it. if + * no switch is needed, \a req will end up with the same context. + * + * \note a request must have a context, to keep other parts of code happy. + * In any case of failure during the switching, we must restore the old one. 
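+ *
+ * (Called from sptlrpc_req_refresh_ctx() below, both when the flavor
+ * has changed and when PTLRPC_CTX_DEAD_BIT is set on the current
+ * context.)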
+ */ +int sptlrpc_req_replace_dead_ctx(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_ctx *oldctx = req->rq_cli_ctx; + struct ptlrpc_cli_ctx *newctx; + int rc; + + ENTRY; + + LASSERT(oldctx); + + sptlrpc_cli_ctx_get(oldctx); + sptlrpc_req_put_ctx(req, 0); + + rc = sptlrpc_req_get_ctx(req); + if (unlikely(rc)) { + LASSERT(!req->rq_cli_ctx); + + /* restore old ctx */ + req->rq_cli_ctx = oldctx; + RETURN(rc); + } + + newctx = req->rq_cli_ctx; + LASSERT(newctx); + + if (unlikely(newctx == oldctx && + test_bit(PTLRPC_CTX_DEAD_BIT, &oldctx->cc_flags))) { + /* + * still get the old dead ctx, usually means system too busy + */ + CDEBUG(D_SEC, + "ctx (%p, fl %lx) doesn't switch, relax a little bit\n", + newctx, newctx->cc_flags); + + schedule_timeout_interruptible(cfs_time_seconds(1)); + } else if (unlikely(test_bit(PTLRPC_CTX_UPTODATE_BIT, &newctx->cc_flags) + == 0)) { + /* + * new ctx not up to date yet + */ + CDEBUG(D_SEC, + "ctx (%p, fl %lx) doesn't switch, not up to date yet\n", + newctx, newctx->cc_flags); + } else { + /* + * it's possible newctx == oldctx if we're switching + * subflavor with the same sec. + */ + rc = sptlrpc_req_ctx_switch(req, oldctx, newctx); + if (rc) { + /* restore old ctx */ + sptlrpc_req_put_ctx(req, 0); + req->rq_cli_ctx = oldctx; + RETURN(rc); + } + + LASSERT(req->rq_cli_ctx == newctx); + } + + sptlrpc_cli_ctx_put(oldctx, 1); + RETURN(0); +} +EXPORT_SYMBOL(sptlrpc_req_replace_dead_ctx); + +static +int ctx_check_refresh(struct ptlrpc_cli_ctx *ctx) +{ + if (cli_ctx_is_refreshed(ctx)) + return 1; + return 0; +} + +static +void ctx_refresh_interrupt(struct ptlrpc_request *req) +{ + + spin_lock(&req->rq_lock); + req->rq_intr = 1; + spin_unlock(&req->rq_lock); +} + +static +void req_off_ctx_list(struct ptlrpc_request *req, struct ptlrpc_cli_ctx *ctx) +{ + spin_lock(&ctx->cc_lock); + if (!list_empty(&req->rq_ctx_chain)) + list_del_init(&req->rq_ctx_chain); + spin_unlock(&ctx->cc_lock); +} + +/** + * To refresh the context of \req, if it's not up-to-date. + * \param timeout + * - == 0: do not wait + * - == MAX_SCHEDULE_TIMEOUT: wait indefinitely + * - > 0: not supported + * + * The status of the context could be subject to be changed by other threads + * at any time. We allow this race, but once we return with 0, the caller will + * suppose it's uptodated and keep using it until the owning rpc is done. + * + * \retval 0 only if the context is uptodated. + * \retval -ev error number. + */ +int sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec *sec; + int rc; + + ENTRY; + + LASSERT(ctx); + + if (req->rq_ctx_init || req->rq_ctx_fini) + RETURN(0); + + if (timeout != 0 && timeout != MAX_SCHEDULE_TIMEOUT) { + CERROR("req %p: invalid timeout %lu\n", req, timeout); + RETURN(-EINVAL); + } + + /* + * during the process a request's context might change type even + * (e.g. 
from gss ctx to null ctx), so on each loop we need to re-check
+ * everything
+ */
+again:
+ rc = import_sec_validate_get(req->rq_import, &sec);
+ if (rc)
+ RETURN(rc);
+
+ if (sec->ps_flvr.sf_rpc != req->rq_flvr.sf_rpc) {
+ CDEBUG(D_SEC, "req %p: flavor has changed %x -> %x\n",
+ req, req->rq_flvr.sf_rpc, sec->ps_flvr.sf_rpc);
+ req_off_ctx_list(req, ctx);
+ sptlrpc_req_replace_dead_ctx(req);
+ ctx = req->rq_cli_ctx;
+ }
+ sptlrpc_sec_put(sec);
+
+ if (cli_ctx_is_eternal(ctx))
+ RETURN(0);
+
+ if (unlikely(test_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags))) {
+ if (ctx->cc_ops->refresh)
+ ctx->cc_ops->refresh(ctx);
+ }
+ LASSERT(test_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags) == 0);
+
+ LASSERT(ctx->cc_ops->validate);
+ if (ctx->cc_ops->validate(ctx) == 0) {
+ req_off_ctx_list(req, ctx);
+ RETURN(0);
+ }
+
+ if (unlikely(test_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags))) {
+ spin_lock(&req->rq_lock);
+ req->rq_err = 1;
+ spin_unlock(&req->rq_lock);
+ req_off_ctx_list(req, ctx);
+ RETURN(-EPERM);
+ }
+
+ /*
+ * There is a subtle issue with resent RPCs. Consider the following
+ * situation:
+ * 1. the request was sent to the server.
+ * 2. recovery was kicked off; after it finished, the request was
+ * marked as resent.
+ * 3. the request is resent.
+ * 4. the old reply from the server is received; we accept and verify
+ * the reply. This has to succeed, otherwise the error would become
+ * visible to the application.
+ * 5. the new reply from the server is received and dropped by LNet.
+ *
+ * Note the xid of the old and new request is the same. We can't simply
+ * change the xid of the resent request because the server relies on
+ * it for reply reconstruction.
+ *
+ * Commonly the original context should still be up to date because we
+ * refresh it well before it expires; the server will keep its context
+ * because we still hold a reference on the old context, which keeps it
+ * from being destroyed while the RPC is in flight. So the server can
+ * still accept the request and finish the RPC. But if that's not the
+ * case:
+ * 1. If the server-side context has been trimmed, NO_CONTEXT will
+ * be returned, and gss_cli_ctx_verify/unseal will switch to the new
+ * context by force.
+ * 2. If the current context was never refreshed, then we are fine: we
+ * never actually sent a request with the old context.
+ */
+ if (test_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags) &&
+ unlikely(req->rq_reqmsg) &&
+ lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) {
+ req_off_ctx_list(req, ctx);
+ RETURN(0);
+ }
+
+ if (unlikely(test_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags))) {
+ req_off_ctx_list(req, ctx);
+ /*
+ * don't switch ctx if import was deactivated
+ */
+ if (req->rq_import->imp_deactive) {
+ spin_lock(&req->rq_lock);
+ req->rq_err = 1;
+ spin_unlock(&req->rq_lock);
+ RETURN(-EINTR);
+ }
+
+ rc = sptlrpc_req_replace_dead_ctx(req);
+ if (rc) {
+ LASSERT(ctx == req->rq_cli_ctx);
+ CERROR("req %p: failed to replace dead ctx %p: %d\n",
+ req, ctx, rc);
+ spin_lock(&req->rq_lock);
+ req->rq_err = 1;
+ spin_unlock(&req->rq_lock);
+ RETURN(rc);
+ }
+
+ ctx = req->rq_cli_ctx;
+ goto again;
+ }
+
+ /*
+ * Now we're sure this context is in the middle of an upcall; add
+ * ourselves to its waiting list
+ */
+ spin_lock(&ctx->cc_lock);
+ if (list_empty(&req->rq_ctx_chain))
+ list_add(&req->rq_ctx_chain, &ctx->cc_req_list);
+ spin_unlock(&ctx->cc_lock);
+
+ if (timeout == 0)
+ RETURN(-EAGAIN);
+
+ /* Clear any flags that may be present from previous sends */
+ LASSERT(req->rq_receiving_reply == 0);
+ spin_lock(&req->rq_lock);
+ req->rq_err = 0;
+ req->rq_timedout = 0;
+ req->rq_resend = 0;
+ req->rq_restart = 0;
+ spin_unlock(&req->rq_lock);
+
+ /* by now we know that timeout value is MAX_SCHEDULE_TIMEOUT,
+ * so wait indefinitely with non-fatal signals blocked
+ */
+ if (l_wait_event_abortable(req->rq_reply_waitq,
+ ctx_check_refresh(ctx)) == -ERESTARTSYS) {
+ rc = -EINTR;
+ ctx_refresh_interrupt(req);
+ }
+
+ /*
+ * the following cases could lead us here:
+ * - successfully refreshed;
+ * - interrupted;
+ * - timed out, and we don't want to recover from the failure;
+ * - timed out, and woken up when recovery finished;
+ * - someone else marked this ctx dead by force;
+ * - someone invalidated the req and called ptlrpc_client_wake_req(),
+ * e.g. ptlrpc_abort_inflight();
+ */
+ if (!cli_ctx_is_refreshed(ctx)) {
+ /* timed out or interrupted */
+ req_off_ctx_list(req, ctx);
+
+ LASSERT(rc != 0);
+ RETURN(rc);
+ }
+
+ goto again;
+}
+
+/* Bring ptlrpc_sec context up-to-date */
+int sptlrpc_export_update_ctx(struct obd_export *exp)
+{
+ struct obd_import *imp = exp ? exp->exp_imp_reverse : NULL;
+ struct ptlrpc_sec *sec = NULL;
+ struct ptlrpc_cli_ctx *ctx = NULL;
+ int rc = 0;
+
+ if (imp)
+ sec = sptlrpc_import_sec_ref(imp);
+ if (sec) {
+ ctx = get_my_ctx(sec);
+ sptlrpc_sec_put(sec);
+ }
+
+ if (ctx) {
+ if (ctx->cc_ops->refresh)
+ rc = ctx->cc_ops->refresh(ctx);
+ sptlrpc_cli_ctx_put(ctx, 1);
+ }
+ return rc;
+}
+
+/**
+ * Initialize flavor settings for \a req, according to \a opcode.
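+ *
+ * For example (mirroring the switch below), a hypothetical OST read
+ * request would be set up as:
+ *
+ *   sptlrpc_req_set_flavor(req, OST_READ);
+ *   // rq_bulk_read is now 1; rq_flvr holds the sec's current flavor
+ *   // and rq_pack_bulk is set if that flavor protects bulk data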
+ * + * \note this could be called in two situations: + * - new request from ptlrpc_pre_req(), with proper @opcode + * - old request which changed ctx in the middle, with @opcode == 0 + */ +void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode) +{ + struct ptlrpc_sec *sec; + + LASSERT(req->rq_import); + LASSERT(req->rq_cli_ctx); + LASSERT(req->rq_cli_ctx->cc_sec); + LASSERT(req->rq_bulk_read == 0 || req->rq_bulk_write == 0); + + /* special security flags according to opcode */ + switch (opcode) { + case OST_READ: + case MDS_READPAGE: + case MGS_CONFIG_READ: + case OBD_IDX_READ: + req->rq_bulk_read = 1; + break; + case OST_WRITE: + case MDS_WRITEPAGE: + req->rq_bulk_write = 1; + break; + case SEC_CTX_INIT: + req->rq_ctx_init = 1; + break; + case SEC_CTX_FINI: + req->rq_ctx_fini = 1; + break; + case 0: + /* init/fini rpc won't be resend, so can't be here */ + LASSERT(req->rq_ctx_init == 0); + LASSERT(req->rq_ctx_fini == 0); + + /* cleanup flags, which should be recalculated */ + req->rq_pack_udesc = 0; + req->rq_pack_bulk = 0; + break; + } + + sec = req->rq_cli_ctx->cc_sec; + + spin_lock(&sec->ps_lock); + req->rq_flvr = sec->ps_flvr; + spin_unlock(&sec->ps_lock); + + /* + * force SVC_NULL for context initiation rpc, SVC_INTG for context + * destruction rpc + */ + if (unlikely(req->rq_ctx_init)) + flvr_set_svc(&req->rq_flvr.sf_rpc, SPTLRPC_SVC_NULL); + else if (unlikely(req->rq_ctx_fini)) + flvr_set_svc(&req->rq_flvr.sf_rpc, SPTLRPC_SVC_INTG); + + /* user descriptor flag, null security can't do it anyway */ + if ((sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_UDESC) && + (req->rq_flvr.sf_rpc != SPTLRPC_FLVR_NULL)) + req->rq_pack_udesc = 1; + + /* bulk security flag */ + if ((req->rq_bulk_read || req->rq_bulk_write) && + sptlrpc_flavor_has_bulk(&req->rq_flvr)) + req->rq_pack_bulk = 1; +} + +void sptlrpc_request_out_callback(struct ptlrpc_request *req) +{ + if (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc) != SPTLRPC_SVC_PRIV) + return; + + LASSERT(req->rq_clrbuf); + if (req->rq_pool || !req->rq_reqbuf) + return; + + OBD_FREE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = NULL; + req->rq_reqbuf_len = 0; +} + +/** + * Given an import \a imp, check whether current user has a valid context + * or not. We may create a new context and try to refresh it, and try + * repeatedly try in case of non-fatal errors. Return 0 means success. + */ +int sptlrpc_import_check_ctx(struct obd_import *imp) +{ + struct ptlrpc_sec *sec; + struct ptlrpc_cli_ctx *ctx; + struct ptlrpc_request *req = NULL; + int rc; + + ENTRY; + + might_sleep(); + + sec = sptlrpc_import_sec_ref(imp); + ctx = get_my_ctx(sec); + sptlrpc_sec_put(sec); + + if (!ctx) + RETURN(-ENOMEM); + + if (cli_ctx_is_eternal(ctx) || + ctx->cc_ops->validate(ctx) == 0) { + sptlrpc_cli_ctx_put(ctx, 1); + RETURN(0); + } + + if (cli_ctx_is_error(ctx)) { + sptlrpc_cli_ctx_put(ctx, 1); + RETURN(-EACCES); + } + + req = ptlrpc_request_cache_alloc(GFP_NOFS); + if (!req) + RETURN(-ENOMEM); + + ptlrpc_cli_req_init(req); + atomic_set(&req->rq_refcount, 10000); + + req->rq_import = imp; + req->rq_flvr = sec->ps_flvr; + req->rq_cli_ctx = ctx; + + rc = sptlrpc_req_refresh_ctx(req, MAX_SCHEDULE_TIMEOUT); + LASSERT(list_empty(&req->rq_ctx_chain)); + sptlrpc_cli_ctx_put(req->rq_cli_ctx, 1); + ptlrpc_request_cache_free(req); + + RETURN(rc); +} + +/** + * Used by ptlrpc client, to perform the pre-defined security transformation + * upon the request message of \a req. After this function called, + * req->rq_reqmsg is still accessible as clear text. 
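+ *
+ * Concretely (see the switch below): the NULL, AUTH and INTG services
+ * go through ctx->cc_ops->sign() and leave the message readable on the
+ * wire, while the PRIV service goes through ctx->cc_ops->seal() to
+ * encrypt it.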
+ */ +int sptlrpc_cli_wrap_request(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + int rc = 0; + + ENTRY; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(req->rq_reqbuf || req->rq_clrbuf); + + /* + * we wrap bulk request here because now we can be sure + * the context is uptodate. + */ + if (req->rq_bulk) { + rc = sptlrpc_cli_wrap_bulk(req, req->rq_bulk); + if (rc) + RETURN(rc); + } + + switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + LASSERT(ctx->cc_ops->sign); + rc = ctx->cc_ops->sign(ctx, req); + break; + case SPTLRPC_SVC_PRIV: + LASSERT(ctx->cc_ops->seal); + rc = ctx->cc_ops->seal(ctx, req); + break; + default: + LBUG(); + } + + if (rc == 0) { + LASSERT(req->rq_reqdata_len); + LASSERT(req->rq_reqdata_len % 8 == 0); + LASSERT(req->rq_reqdata_len <= req->rq_reqbuf_len); + } + + RETURN(rc); +} + +static int do_cli_unwrap_reply(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + int rc; + + ENTRY; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(req->rq_repbuf); + LASSERT(req->rq_repdata); + LASSERT(req->rq_repmsg == NULL); + + req->rq_rep_swab_mask = 0; + + rc = __lustre_unpack_msg(req->rq_repdata, req->rq_repdata_len); + switch (rc) { + case 1: + req_capsule_set_rep_swabbed(&req->rq_pill, + MSG_PTLRPC_HEADER_OFF); + case 0: + break; + default: + CERROR("failed unpack reply: x%llu\n", req->rq_xid); + RETURN(-EPROTO); + } + + if (req->rq_repdata_len < sizeof(struct lustre_msg)) { + CERROR("replied data length %d too small\n", + req->rq_repdata_len); + RETURN(-EPROTO); + } + + if (SPTLRPC_FLVR_POLICY(req->rq_repdata->lm_secflvr) != + SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc)) { + CERROR("reply policy %u doesn't match request policy %u\n", + SPTLRPC_FLVR_POLICY(req->rq_repdata->lm_secflvr), + SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc)); + RETURN(-EPROTO); + } + + switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + LASSERT(ctx->cc_ops->verify); + rc = ctx->cc_ops->verify(ctx, req); + break; + case SPTLRPC_SVC_PRIV: + LASSERT(ctx->cc_ops->unseal); + rc = ctx->cc_ops->unseal(ctx, req); + break; + default: + LBUG(); + } + LASSERT(rc || req->rq_repmsg || req->rq_resend); + + if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL && + !req->rq_ctx_init) + req->rq_rep_swab_mask = 0; + RETURN(rc); +} + +/** + * Used by ptlrpc client, to perform security transformation upon the reply + * message of \a req. After return successfully, req->rq_repmsg points to + * the reply message in clear text. + * + * \pre the reply buffer should have been un-posted from LNet, so nothing is + * going to change. 
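+ *
+ * The buffer layout assumed by the checks below:
+ *
+ *   rq_repbuf: [ rq_reply_off bytes | rq_nob_received bytes of reply ]
+ *   rq_repdata     = (struct lustre_msg *)(rq_repbuf + rq_reply_off)
+ *   rq_repdata_len = rq_nob_received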
+ */ +int sptlrpc_cli_unwrap_reply(struct ptlrpc_request *req) +{ + LASSERT(req->rq_repbuf); + LASSERT(req->rq_repdata == NULL); + LASSERT(req->rq_repmsg == NULL); + LASSERT(req->rq_reply_off + req->rq_nob_received <= req->rq_repbuf_len); + + if (req->rq_reply_off == 0 && + (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) { + CERROR("real reply with offset 0\n"); + return -EPROTO; + } + + if (req->rq_reply_off % 8 != 0) { + CERROR("reply at odd offset %u\n", req->rq_reply_off); + return -EPROTO; + } + + req->rq_repdata = (struct lustre_msg *) + (req->rq_repbuf + req->rq_reply_off); + req->rq_repdata_len = req->rq_nob_received; + + return do_cli_unwrap_reply(req); +} + +/** + * Used by ptlrpc client, to perform security transformation upon the early + * reply message of \a req. We expect the rq_reply_off is 0, and + * rq_nob_received is the early reply size. + * + * Because the receive buffer might be still posted, the reply data might be + * changed at any time, no matter we're holding rq_lock or not. For this reason + * we allocate a separate ptlrpc_request and reply buffer for early reply + * processing. + * + * \retval 0 success, \a req_ret is filled with a duplicated ptlrpc_request. + * Later the caller must call sptlrpc_cli_finish_early_reply() on the returned + * \a *req_ret to release it. + * \retval -ev error number, and \a req_ret will not be set. + */ +int sptlrpc_cli_unwrap_early_reply(struct ptlrpc_request *req, + struct ptlrpc_request **req_ret) +{ + struct ptlrpc_request *early_req; + char *early_buf; + int early_bufsz, early_size; + int rc; + + ENTRY; + + early_req = ptlrpc_request_cache_alloc(GFP_NOFS); + if (early_req == NULL) + RETURN(-ENOMEM); + + ptlrpc_cli_req_init(early_req); + + early_size = req->rq_nob_received; + early_bufsz = size_roundup_power2(early_size); + OBD_ALLOC_LARGE(early_buf, early_bufsz); + if (early_buf == NULL) + GOTO(err_req, rc = -ENOMEM); + + /* sanity checkings and copy data out, do it inside spinlock */ + spin_lock(&req->rq_lock); + + if (req->rq_replied) { + spin_unlock(&req->rq_lock); + GOTO(err_buf, rc = -EALREADY); + } + + LASSERT(req->rq_repbuf); + LASSERT(req->rq_repdata == NULL); + LASSERT(req->rq_repmsg == NULL); + + if (req->rq_reply_off != 0) { + CERROR("early reply with offset %u\n", req->rq_reply_off); + spin_unlock(&req->rq_lock); + GOTO(err_buf, rc = -EPROTO); + } + + if (req->rq_nob_received != early_size) { + /* even another early arrived the size should be the same */ + CERROR("data size has changed from %u to %u\n", + early_size, req->rq_nob_received); + spin_unlock(&req->rq_lock); + GOTO(err_buf, rc = -EINVAL); + } + + if (req->rq_nob_received < sizeof(struct lustre_msg)) { + CERROR("early reply length %d too small\n", + req->rq_nob_received); + spin_unlock(&req->rq_lock); + GOTO(err_buf, rc = -EALREADY); + } + + memcpy(early_buf, req->rq_repbuf, early_size); + spin_unlock(&req->rq_lock); + + early_req->rq_cli_ctx = sptlrpc_cli_ctx_get(req->rq_cli_ctx); + early_req->rq_flvr = req->rq_flvr; + early_req->rq_repbuf = early_buf; + early_req->rq_repbuf_len = early_bufsz; + early_req->rq_repdata = (struct lustre_msg *) early_buf; + early_req->rq_repdata_len = early_size; + early_req->rq_early = 1; + early_req->rq_reqmsg = req->rq_reqmsg; + + rc = do_cli_unwrap_reply(early_req); + if (rc) { + DEBUG_REQ(D_ADAPTTO, early_req, + "unwrap early reply: rc = %d", rc); + GOTO(err_ctx, rc); + } + + LASSERT(early_req->rq_repmsg); + *req_ret = early_req; + RETURN(0); + +err_ctx: + sptlrpc_cli_ctx_put(early_req->rq_cli_ctx, 1); 
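+ /* unwind in reverse order of setup: the buffer next, then the request */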
+err_buf: + OBD_FREE_LARGE(early_buf, early_bufsz); +err_req: + ptlrpc_request_cache_free(early_req); + RETURN(rc); +} + +/** + * Used by ptlrpc client, to release a processed early reply \a early_req. + * + * \pre \a early_req was obtained from calling sptlrpc_cli_unwrap_early_reply(). + */ +void sptlrpc_cli_finish_early_reply(struct ptlrpc_request *early_req) +{ + LASSERT(early_req->rq_repbuf); + LASSERT(early_req->rq_repdata); + LASSERT(early_req->rq_repmsg); + + sptlrpc_cli_ctx_put(early_req->rq_cli_ctx, 1); + OBD_FREE_LARGE(early_req->rq_repbuf, early_req->rq_repbuf_len); + ptlrpc_request_cache_free(early_req); +} + +/************************************************** + * sec ID * + **************************************************/ + +/* + * "fixed" sec (e.g. null) use sec_id < 0 + */ +static atomic_t sptlrpc_sec_id = ATOMIC_INIT(1); + +int sptlrpc_get_next_secid(void) +{ + return atomic_inc_return(&sptlrpc_sec_id); +} +EXPORT_SYMBOL(sptlrpc_get_next_secid); + +/* + * client side high-level security APIs + */ + +static int sec_cop_flush_ctx_cache(struct ptlrpc_sec *sec, uid_t uid, + int grace, int force) +{ + struct ptlrpc_sec_policy *policy = sec->ps_policy; + + LASSERT(policy->sp_cops); + LASSERT(policy->sp_cops->flush_ctx_cache); + + return policy->sp_cops->flush_ctx_cache(sec, uid, grace, force); +} + +static void sec_cop_destroy_sec(struct ptlrpc_sec *sec) +{ + struct ptlrpc_sec_policy *policy = sec->ps_policy; + + LASSERT_ATOMIC_ZERO(&sec->ps_refcount); + LASSERT_ATOMIC_ZERO(&sec->ps_nctx); + LASSERT(policy->sp_cops->destroy_sec); + + CDEBUG(D_SEC, "%s@%p: being destroied\n", sec->ps_policy->sp_name, sec); + + policy->sp_cops->destroy_sec(sec); + sptlrpc_policy_put(policy); +} + +void sptlrpc_sec_destroy(struct ptlrpc_sec *sec) +{ + sec_cop_destroy_sec(sec); +} +EXPORT_SYMBOL(sptlrpc_sec_destroy); + +static void sptlrpc_sec_kill(struct ptlrpc_sec *sec) +{ + LASSERT_ATOMIC_POS(&sec->ps_refcount); + + if (sec->ps_policy->sp_cops->kill_sec) { + sec->ps_policy->sp_cops->kill_sec(sec); + + sec_cop_flush_ctx_cache(sec, -1, 1, 1); + } +} + +struct ptlrpc_sec *sptlrpc_sec_get(struct ptlrpc_sec *sec) +{ + if (sec) + atomic_inc(&sec->ps_refcount); + + return sec; +} +EXPORT_SYMBOL(sptlrpc_sec_get); + +void sptlrpc_sec_put(struct ptlrpc_sec *sec) +{ + if (sec) { + LASSERT_ATOMIC_POS(&sec->ps_refcount); + + if (atomic_dec_and_test(&sec->ps_refcount)) { + sptlrpc_gc_del_sec(sec); + sec_cop_destroy_sec(sec); + } + } +} +EXPORT_SYMBOL(sptlrpc_sec_put); + +/* + * policy module is responsible for taking refrence of import + */ +static +struct ptlrpc_sec * sptlrpc_sec_create(struct obd_import *imp, + struct ptlrpc_svc_ctx *svc_ctx, + struct sptlrpc_flavor *sf, + enum lustre_sec_part sp) +{ + struct ptlrpc_sec_policy *policy; + struct ptlrpc_sec *sec; + char str[32]; + + ENTRY; + + if (svc_ctx) { + LASSERT(imp->imp_dlm_fake == 1); + + CDEBUG(D_SEC, "%s %s: reverse sec using flavor %s\n", + imp->imp_obd->obd_type->typ_name, + imp->imp_obd->obd_name, + sptlrpc_flavor2name(sf, str, sizeof(str))); + + policy = sptlrpc_policy_get(svc_ctx->sc_policy); + sf->sf_flags |= PTLRPC_SEC_FL_REVERSE | PTLRPC_SEC_FL_ROOTONLY; + } else { + LASSERT(imp->imp_dlm_fake == 0); + + CDEBUG(D_SEC, "%s %s: select security flavor %s\n", + imp->imp_obd->obd_type->typ_name, + imp->imp_obd->obd_name, + sptlrpc_flavor2name(sf, str, sizeof(str))); + + policy = sptlrpc_wireflavor2policy(sf->sf_rpc); + if (!policy) { + CERROR("invalid flavor 0x%x\n", sf->sf_rpc); + RETURN(NULL); + } + } + + sec = policy->sp_cops->create_sec(imp, 
svc_ctx, sf); + if (sec) { + atomic_inc(&sec->ps_refcount); + + sec->ps_part = sp; + + if (sec->ps_gc_interval && policy->sp_cops->gc_ctx) + sptlrpc_gc_add_sec(sec); + } else { + sptlrpc_policy_put(policy); + } + + RETURN(sec); +} + +struct ptlrpc_sec *sptlrpc_import_sec_ref(struct obd_import *imp) +{ + struct ptlrpc_sec *sec; + + read_lock(&imp->imp_sec_lock); + sec = sptlrpc_sec_get(imp->imp_sec); + read_unlock(&imp->imp_sec_lock); + + return sec; +} +EXPORT_SYMBOL(sptlrpc_import_sec_ref); + +static void sptlrpc_import_sec_install(struct obd_import *imp, + struct ptlrpc_sec *sec) +{ + struct ptlrpc_sec *old_sec; + + LASSERT_ATOMIC_POS(&sec->ps_refcount); + + write_lock(&imp->imp_sec_lock); + old_sec = imp->imp_sec; + imp->imp_sec = sec; + write_unlock(&imp->imp_sec_lock); + + if (old_sec) { + sptlrpc_sec_kill(old_sec); + + /* balance the ref taken by this import */ + sptlrpc_sec_put(old_sec); + } +} + +static inline +int flavor_equal(struct sptlrpc_flavor *sf1, struct sptlrpc_flavor *sf2) +{ + return (memcmp(sf1, sf2, sizeof(*sf1)) == 0); +} + +static inline +void flavor_copy(struct sptlrpc_flavor *dst, struct sptlrpc_flavor *src) +{ + *dst = *src; +} + +/** + * To get an appropriate ptlrpc_sec for the \a imp, according to the current + * configuration. Upon called, imp->imp_sec may or may not be NULL. + * + * - regular import: \a svc_ctx should be NULL and \a flvr is ignored; + * - reverse import: \a svc_ctx and \a flvr are obtained from incoming request. + */ +int sptlrpc_import_sec_adapt(struct obd_import *imp, + struct ptlrpc_svc_ctx *svc_ctx, + struct sptlrpc_flavor *flvr) +{ + struct ptlrpc_connection *conn; + struct sptlrpc_flavor sf; + struct ptlrpc_sec *sec, *newsec; + enum lustre_sec_part sp; + char str[24]; + int rc = 0; + + ENTRY; + + might_sleep(); + + if (imp == NULL) + RETURN(0); + + conn = imp->imp_connection; + + if (svc_ctx == NULL) { + struct client_obd *cliobd = &imp->imp_obd->u.cli; + /* + * normal import, determine flavor from rule set, except + * for mgc the flavor is predetermined. 
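+ * (cl_sp_me == LUSTRE_SP_MGC uses cl_flvr_mgc directly; everything
+ * else asks sptlrpc_conf_choose_flavor() for a rule-based choice).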
+ */ + if (cliobd->cl_sp_me == LUSTRE_SP_MGC) + sf = cliobd->cl_flvr_mgc; + else + sptlrpc_conf_choose_flavor(cliobd->cl_sp_me, + cliobd->cl_sp_to, + &cliobd->cl_target_uuid, + &conn->c_self, &sf); + + sp = imp->imp_obd->u.cli.cl_sp_me; + } else { + /* reverse import, determine flavor from incoming reqeust */ + sf = *flvr; + + if (sf.sf_rpc != SPTLRPC_FLVR_NULL) + sf.sf_flags = PTLRPC_SEC_FL_REVERSE | + PTLRPC_SEC_FL_ROOTONLY; + + sp = sptlrpc_target_sec_part(imp->imp_obd); + } + + sec = sptlrpc_import_sec_ref(imp); + if (sec) { + char str2[24]; + + if (flavor_equal(&sf, &sec->ps_flvr)) + GOTO(out, rc); + + CDEBUG(D_SEC, "import %s->%s: changing flavor %s -> %s\n", + imp->imp_obd->obd_name, + obd_uuid2str(&conn->c_remote_uuid), + sptlrpc_flavor2name(&sec->ps_flvr, str, sizeof(str)), + sptlrpc_flavor2name(&sf, str2, sizeof(str2))); + } else if (SPTLRPC_FLVR_BASE(sf.sf_rpc) != + SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_NULL)) { + CDEBUG(D_SEC, "import %s->%s netid %x: select flavor %s\n", + imp->imp_obd->obd_name, + obd_uuid2str(&conn->c_remote_uuid), + LNET_NID_NET(&conn->c_self), + sptlrpc_flavor2name(&sf, str, sizeof(str))); + } + + newsec = sptlrpc_sec_create(imp, svc_ctx, &sf, sp); + if (newsec) { + sptlrpc_import_sec_install(imp, newsec); + } else { + CERROR("import %s->%s: failed to create new sec\n", + imp->imp_obd->obd_name, + obd_uuid2str(&conn->c_remote_uuid)); + rc = -EPERM; + } + +out: + sptlrpc_sec_put(sec); + RETURN(rc); +} + +void sptlrpc_import_sec_put(struct obd_import *imp) +{ + if (imp->imp_sec) { + sptlrpc_sec_kill(imp->imp_sec); + + sptlrpc_sec_put(imp->imp_sec); + imp->imp_sec = NULL; + } +} + +static void import_flush_ctx_common(struct obd_import *imp, + uid_t uid, int grace, int force) +{ + struct ptlrpc_sec *sec; + + if (imp == NULL) + return; + + sec = sptlrpc_import_sec_ref(imp); + if (sec == NULL) + return; + + sec_cop_flush_ctx_cache(sec, uid, grace, force); + sptlrpc_sec_put(sec); +} + +void sptlrpc_import_flush_root_ctx(struct obd_import *imp) +{ + /* + * it's important to use grace mode, see explain in + * sptlrpc_req_refresh_ctx() + */ + import_flush_ctx_common(imp, 0, 1, 1); +} + +void sptlrpc_import_flush_my_ctx(struct obd_import *imp) +{ + import_flush_ctx_common(imp, from_kuid(&init_user_ns, current_uid()), + 1, 1); +} +EXPORT_SYMBOL(sptlrpc_import_flush_my_ctx); + +void sptlrpc_import_flush_all_ctx(struct obd_import *imp) +{ + import_flush_ctx_common(imp, -1, 1, 1); +} +EXPORT_SYMBOL(sptlrpc_import_flush_all_ctx); + +/** + * Used by ptlrpc client to allocate request buffer of \a req. Upon return + * successfully, req->rq_reqmsg points to a buffer with size \a msgsize. + */ +int sptlrpc_cli_alloc_reqbuf(struct ptlrpc_request *req, int msgsize) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec_policy *policy; + int rc; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(ctx->cc_sec->ps_policy); + LASSERT(req->rq_reqmsg == NULL); + LASSERT_ATOMIC_POS(&ctx->cc_refcount); + + policy = ctx->cc_sec->ps_policy; + rc = policy->sp_cops->alloc_reqbuf(ctx->cc_sec, req, msgsize); + if (!rc) { + LASSERT(req->rq_reqmsg); + LASSERT(req->rq_reqbuf || req->rq_clrbuf); + + /* zeroing preallocated buffer */ + if (req->rq_pool) + memset(req->rq_reqmsg, 0, msgsize); + } + + return rc; +} + +/** + * Used by ptlrpc client to free request buffer of \a req. After this + * req->rq_reqmsg is set to NULL and should not be accessed anymore. 
+ */ +void sptlrpc_cli_free_reqbuf(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec_policy *policy; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(ctx->cc_sec->ps_policy); + LASSERT_ATOMIC_POS(&ctx->cc_refcount); + + if (req->rq_reqbuf == NULL && req->rq_clrbuf == NULL) + return; + + policy = ctx->cc_sec->ps_policy; + policy->sp_cops->free_reqbuf(ctx->cc_sec, req); + req->rq_reqmsg = NULL; +} + +/* + * NOTE caller must guarantee the buffer size is enough for the enlargement + */ +void _sptlrpc_enlarge_msg_inplace(struct lustre_msg *msg, + int segment, int newsize) +{ + void *src, *dst; + int oldsize, oldmsg_size, movesize; + + LASSERT(segment < msg->lm_bufcount); + LASSERT(msg->lm_buflens[segment] <= newsize); + + if (msg->lm_buflens[segment] == newsize) + return; + + /* nothing to do if we are enlarging the last segment */ + if (segment == msg->lm_bufcount - 1) { + msg->lm_buflens[segment] = newsize; + return; + } + + oldsize = msg->lm_buflens[segment]; + + src = lustre_msg_buf(msg, segment + 1, 0); + msg->lm_buflens[segment] = newsize; + dst = lustre_msg_buf(msg, segment + 1, 0); + msg->lm_buflens[segment] = oldsize; + + /* move from segment + 1 to end segment */ + LASSERT(msg->lm_magic == LUSTRE_MSG_MAGIC_V2); + oldmsg_size = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + movesize = oldmsg_size - ((unsigned long) src - (unsigned long) msg); + LASSERT(movesize >= 0); + + if (movesize) + memmove(dst, src, movesize); + + /* note we don't clear the ares where old data live, not secret */ + + /* finally set new segment size */ + msg->lm_buflens[segment] = newsize; +} +EXPORT_SYMBOL(_sptlrpc_enlarge_msg_inplace); + +/** + * Used by ptlrpc client to enlarge the \a segment of request message pointed + * by req->rq_reqmsg to size \a newsize, all previously filled-in data will be + * preserved after the enlargement. this must be called after original request + * buffer being allocated. + * + * \note after this be called, rq_reqmsg and rq_reqlen might have been changed, + * so caller should refresh its local pointers if needed. + */ +int sptlrpc_cli_enlarge_reqbuf(struct ptlrpc_request *req, + const struct req_msg_field *field, + int newsize) +{ + struct req_capsule *pill = &req->rq_pill; + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec_cops *cops; + struct lustre_msg *msg = req->rq_reqmsg; + int segment = __req_capsule_offset(pill, field, RCL_CLIENT); + + LASSERT(ctx); + LASSERT(msg); + LASSERT(msg->lm_bufcount > segment); + LASSERT(msg->lm_buflens[segment] <= newsize); + + if (msg->lm_buflens[segment] == newsize) + return 0; + + cops = ctx->cc_sec->ps_policy->sp_cops; + LASSERT(cops->enlarge_reqbuf); + return cops->enlarge_reqbuf(ctx->cc_sec, req, segment, newsize); +} +EXPORT_SYMBOL(sptlrpc_cli_enlarge_reqbuf); + +/** + * Used by ptlrpc client to allocate reply buffer of \a req. + * + * \note After this, req->rq_repmsg is still not accessible. + */ +int sptlrpc_cli_alloc_repbuf(struct ptlrpc_request *req, int msgsize) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec_policy *policy; + + ENTRY; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(ctx->cc_sec->ps_policy); + + if (req->rq_repbuf) + RETURN(0); + + policy = ctx->cc_sec->ps_policy; + RETURN(policy->sp_cops->alloc_repbuf(ctx->cc_sec, req, msgsize)); +} + +/** + * Used by ptlrpc client to free reply buffer of \a req. After this + * req->rq_repmsg is set to NULL and should not be accessed anymore. 
+ */ +void sptlrpc_cli_free_repbuf(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec_policy *policy; + + ENTRY; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(ctx->cc_sec->ps_policy); + LASSERT_ATOMIC_POS(&ctx->cc_refcount); + + if (req->rq_repbuf == NULL) + return; + LASSERT(req->rq_repbuf_len); + + policy = ctx->cc_sec->ps_policy; + policy->sp_cops->free_repbuf(ctx->cc_sec, req); + req->rq_repmsg = NULL; + EXIT; +} +EXPORT_SYMBOL(sptlrpc_cli_free_repbuf); + +int sptlrpc_cli_install_rvs_ctx(struct obd_import *imp, + struct ptlrpc_cli_ctx *ctx) +{ + struct ptlrpc_sec_policy *policy = ctx->cc_sec->ps_policy; + + if (!policy->sp_cops->install_rctx) + return 0; + return policy->sp_cops->install_rctx(imp, ctx->cc_sec, ctx); +} + +int sptlrpc_svc_install_rvs_ctx(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx) +{ + struct ptlrpc_sec_policy *policy = ctx->sc_policy; + + if (!policy->sp_sops->install_rctx) + return 0; + return policy->sp_sops->install_rctx(imp, ctx); +} + +/* Get SELinux policy info from userspace */ +static int sepol_helper(struct obd_import *imp) +{ + char mtime_str[21] = { 0 }, mode_str[2] = { 0 }; + char *argv[] = { + [0] = "/usr/sbin/l_getsepol", + [1] = "-o", + [2] = NULL, /* obd type */ + [3] = "-n", + [4] = NULL, /* obd name */ + [5] = "-t", + [6] = mtime_str, /* policy mtime */ + [7] = "-m", + [8] = mode_str, /* enforcing mode */ + [9] = NULL + }; + char *envp[] = { + [0] = "HOME=/", + [1] = "PATH=/sbin:/usr/sbin", + [2] = NULL + }; + signed short ret; + int rc = 0; + + if (imp == NULL || imp->imp_obd == NULL || + imp->imp_obd->obd_type == NULL) { + rc = -EINVAL; + } else { + argv[2] = (char *)imp->imp_obd->obd_type->typ_name; + argv[4] = imp->imp_obd->obd_name; + spin_lock(&imp->imp_sec->ps_lock); + if (ktime_to_ns(imp->imp_sec->ps_sepol_mtime) == 0 && + imp->imp_sec->ps_sepol[0] == '\0') { + /* ps_sepol has not been initialized */ + argv[5] = NULL; + argv[7] = NULL; + } else { + time64_t mtime_ms; + + mtime_ms = ktime_to_ms(imp->imp_sec->ps_sepol_mtime); + snprintf(mtime_str, sizeof(mtime_str), "%lld", + mtime_ms / MSEC_PER_SEC); + mode_str[0] = imp->imp_sec->ps_sepol[0]; + } + spin_unlock(&imp->imp_sec->ps_lock); + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); + rc = ret>>8; + } + + return rc; +} + +static inline int sptlrpc_sepol_needs_check(struct ptlrpc_sec *imp_sec) +{ + ktime_t checknext; + + if (send_sepol == 0) + return 0; + + if (send_sepol == -1) + /* send_sepol == -1 means fetch sepol status every time */ + return 1; + + spin_lock(&imp_sec->ps_lock); + checknext = imp_sec->ps_sepol_checknext; + spin_unlock(&imp_sec->ps_lock); + + /* next check is too far in time, please update */ + if (ktime_after(checknext, + ktime_add(ktime_get(), ktime_set(send_sepol, 0)))) + goto setnext; + + if (ktime_before(ktime_get(), checknext)) + /* too early to fetch sepol status */ + return 0; + +setnext: + /* define new sepol_checknext time */ + spin_lock(&imp_sec->ps_lock); + imp_sec->ps_sepol_checknext = ktime_add(ktime_get(), + ktime_set(send_sepol, 0)); + spin_unlock(&imp_sec->ps_lock); + + return 1; +} + +int sptlrpc_get_sepol(struct ptlrpc_request *req) +{ + struct ptlrpc_sec *imp_sec = req->rq_import->imp_sec; + int rc = 0; + + ENTRY; + + (req->rq_sepol)[0] = '\0'; + +#ifndef HAVE_SELINUX + if (unlikely(send_sepol != 0)) + CDEBUG(D_SEC, + "Client cannot report SELinux status, it was not built against libselinux.\n"); + RETURN(0); +#endif + + if (send_sepol == 0) + RETURN(0); + + if (imp_sec == 
NULL) + RETURN(-EINVAL); + + /* Retrieve SELinux status info */ + if (sptlrpc_sepol_needs_check(imp_sec)) + rc = sepol_helper(req->rq_import); + if (likely(rc == 0)) { + spin_lock(&imp_sec->ps_lock); + memcpy(req->rq_sepol, imp_sec->ps_sepol, + sizeof(req->rq_sepol)); + spin_unlock(&imp_sec->ps_lock); + } else if (rc == -ENODEV) { + CDEBUG(D_SEC, + "Client cannot report SELinux status, SELinux is disabled.\n"); + rc = 0; + } + + RETURN(rc); +} +EXPORT_SYMBOL(sptlrpc_get_sepol); + +/* + * server side security + */ + +static int flavor_allowed(struct sptlrpc_flavor *exp, + struct ptlrpc_request *req) +{ + struct sptlrpc_flavor *flvr = &req->rq_flvr; + + if (exp->sf_rpc == SPTLRPC_FLVR_ANY || exp->sf_rpc == flvr->sf_rpc) + return 1; + + if ((req->rq_ctx_init || req->rq_ctx_fini) && + SPTLRPC_FLVR_POLICY(exp->sf_rpc) == + SPTLRPC_FLVR_POLICY(flvr->sf_rpc) && + SPTLRPC_FLVR_MECH(exp->sf_rpc) == SPTLRPC_FLVR_MECH(flvr->sf_rpc)) + return 1; + + return 0; +} + +#define EXP_FLVR_UPDATE_EXPIRE (OBD_TIMEOUT_DEFAULT + 10) + +/** + * Given an export \a exp, check whether the flavor of incoming \a req + * is allowed by the export \a exp. Main logic is about taking care of + * changing configurations. Return 0 means success. + */ +int sptlrpc_target_export_check(struct obd_export *exp, + struct ptlrpc_request *req) +{ + struct sptlrpc_flavor flavor; + + if (exp == NULL) + return 0; + + /* + * client side export has no imp_reverse, skip + * FIXME maybe we should check flavor this as well??? + */ + if (exp->exp_imp_reverse == NULL) + return 0; + + /* don't care about ctx fini rpc */ + if (req->rq_ctx_fini) + return 0; + + spin_lock(&exp->exp_lock); + + /* + * if flavor just changed (exp->exp_flvr_changed != 0), we wait for + * the first req with the new flavor, then treat it as current flavor, + * adapt reverse sec according to it. + * note the first rpc with new flavor might not be with root ctx, in + * which case delay the sec_adapt by leaving exp_flvr_adapt == 1. 
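+ *
+ * The rotation below keeps up to three flavors per export:
+ *   exp_flvr        - the current flavor
+ *   exp_flvr_old[0] - the previous flavor, honored until
+ *                     exp_flvr_expire[0] (EXP_FLVR_UPDATE_EXPIRE
+ *                     seconds after the switch)
+ *   exp_flvr_old[1] - the oldest flavor, or the staged new flavor
+ *                     while exp_flvr_changed is still set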
+ */ + if (unlikely(exp->exp_flvr_changed) && + flavor_allowed(&exp->exp_flvr_old[1], req)) { + /* + * make the new flavor as "current", and old ones as + * about-to-expire + */ + CDEBUG(D_SEC, "exp %p: just changed: %x->%x\n", exp, + exp->exp_flvr.sf_rpc, exp->exp_flvr_old[1].sf_rpc); + flavor = exp->exp_flvr_old[1]; + exp->exp_flvr_old[1] = exp->exp_flvr_old[0]; + exp->exp_flvr_expire[1] = exp->exp_flvr_expire[0]; + exp->exp_flvr_old[0] = exp->exp_flvr; + exp->exp_flvr_expire[0] = ktime_get_real_seconds() + + EXP_FLVR_UPDATE_EXPIRE; + exp->exp_flvr = flavor; + + /* flavor change finished */ + exp->exp_flvr_changed = 0; + LASSERT(exp->exp_flvr_adapt == 1); + + /* if it's gss, we only interested in root ctx init */ + if (req->rq_auth_gss && + !(req->rq_ctx_init && + (req->rq_auth_usr_root || req->rq_auth_usr_mdt || + req->rq_auth_usr_ost))) { + spin_unlock(&exp->exp_lock); + CDEBUG(D_SEC, "is good but not root(%d:%d:%d:%d:%d)\n", + req->rq_auth_gss, req->rq_ctx_init, + req->rq_auth_usr_root, req->rq_auth_usr_mdt, + req->rq_auth_usr_ost); + return 0; + } + + exp->exp_flvr_adapt = 0; + spin_unlock(&exp->exp_lock); + + return sptlrpc_import_sec_adapt(exp->exp_imp_reverse, + req->rq_svc_ctx, &flavor); + } + + /* + * if it equals to the current flavor, we accept it, but need to + * dealing with reverse sec/ctx + */ + if (likely(flavor_allowed(&exp->exp_flvr, req))) { + /* + * most cases should return here, we only interested in + * gss root ctx init + */ + if (!req->rq_auth_gss || !req->rq_ctx_init || + (!req->rq_auth_usr_root && !req->rq_auth_usr_mdt && + !req->rq_auth_usr_ost)) { + spin_unlock(&exp->exp_lock); + return 0; + } + + /* + * if flavor just changed, we should not proceed, just leave + * it and current flavor will be discovered and replaced + * shortly, and let _this_ rpc pass through + */ + if (exp->exp_flvr_changed) { + LASSERT(exp->exp_flvr_adapt); + spin_unlock(&exp->exp_lock); + return 0; + } + + if (exp->exp_flvr_adapt) { + exp->exp_flvr_adapt = 0; + CDEBUG(D_SEC, "exp %p (%x|%x|%x): do delayed adapt\n", + exp, exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_old[1].sf_rpc); + flavor = exp->exp_flvr; + spin_unlock(&exp->exp_lock); + + return sptlrpc_import_sec_adapt(exp->exp_imp_reverse, + req->rq_svc_ctx, + &flavor); + } else { + CDEBUG(D_SEC, + "exp %p (%x|%x|%x): is current flavor, install rvs ctx\n", + exp, exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_old[1].sf_rpc); + spin_unlock(&exp->exp_lock); + + return sptlrpc_svc_install_rvs_ctx(exp->exp_imp_reverse, + req->rq_svc_ctx); + } + } + + if (exp->exp_flvr_expire[0]) { + if (exp->exp_flvr_expire[0] >= ktime_get_real_seconds()) { + if (flavor_allowed(&exp->exp_flvr_old[0], req)) { + CDEBUG(D_SEC, + "exp %p (%x|%x|%x): match the middle one (%lld)\n", + exp, exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_old[1].sf_rpc, + (s64)(exp->exp_flvr_expire[0] - + ktime_get_real_seconds())); + spin_unlock(&exp->exp_lock); + return 0; + } + } else { + CDEBUG(D_SEC, "mark middle expired\n"); + exp->exp_flvr_expire[0] = 0; + } + CDEBUG(D_SEC, "exp %p (%x|%x|%x): %x not match middle\n", exp, + exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, exp->exp_flvr_old[1].sf_rpc, + req->rq_flvr.sf_rpc); + } + + /* + * now it doesn't match the current flavor, the only chance we can + * accept it is match the old flavors which is not expired. 
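+ * (Concretely: exp_flvr_old[1], guarded by exp_flvr_expire[1]; the
+ * exp_flvr_old[0] window was already checked above.)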
+ */
+ if (exp->exp_flvr_changed == 0 && exp->exp_flvr_expire[1]) {
+ if (exp->exp_flvr_expire[1] >= ktime_get_real_seconds()) {
+ if (flavor_allowed(&exp->exp_flvr_old[1], req)) {
+ CDEBUG(D_SEC, "exp %p (%x|%x|%x): match the oldest one (%lld)\n",
+ exp,
+ exp->exp_flvr.sf_rpc,
+ exp->exp_flvr_old[0].sf_rpc,
+ exp->exp_flvr_old[1].sf_rpc,
+ (s64)(exp->exp_flvr_expire[1] -
+ ktime_get_real_seconds()));
+ spin_unlock(&exp->exp_lock);
+ return 0;
+ }
+ } else {
+ CDEBUG(D_SEC, "mark oldest expired\n");
+ exp->exp_flvr_expire[1] = 0;
+ }
+ CDEBUG(D_SEC, "exp %p (%x|%x|%x): %x does not match the oldest\n",
+ exp, exp->exp_flvr.sf_rpc,
+ exp->exp_flvr_old[0].sf_rpc, exp->exp_flvr_old[1].sf_rpc,
+ req->rq_flvr.sf_rpc);
+ } else {
+ CDEBUG(D_SEC, "exp %p (%x|%x|%x): skip the last one\n",
+ exp, exp->exp_flvr.sf_rpc, exp->exp_flvr_old[0].sf_rpc,
+ exp->exp_flvr_old[1].sf_rpc);
+ }
+
+ spin_unlock(&exp->exp_lock);
+
+ CWARN("exp %p(%s): req %p (%u|%u|%u|%u|%u|%u) with unauthorized flavor %x, expect %x|%x(%+lld)|%x(%+lld)\n",
+ exp, exp->exp_obd->obd_name,
+ req, req->rq_auth_gss, req->rq_ctx_init, req->rq_ctx_fini,
+ req->rq_auth_usr_root, req->rq_auth_usr_mdt, req->rq_auth_usr_ost,
+ req->rq_flvr.sf_rpc,
+ exp->exp_flvr.sf_rpc,
+ exp->exp_flvr_old[0].sf_rpc,
+ exp->exp_flvr_expire[0] ?
+ (s64)(exp->exp_flvr_expire[0] - ktime_get_real_seconds()) : 0,
+ exp->exp_flvr_old[1].sf_rpc,
+ exp->exp_flvr_expire[1] ?
+ (s64)(exp->exp_flvr_expire[1] - ktime_get_real_seconds()) : 0);
+ return -EACCES;
+}
+EXPORT_SYMBOL(sptlrpc_target_export_check);
+
+void sptlrpc_target_update_exp_flavor(struct obd_device *obd,
+ struct sptlrpc_rule_set *rset)
+{
+ struct obd_export *exp;
+ struct sptlrpc_flavor new_flvr;
+
+ LASSERT(obd);
+
+ spin_lock(&obd->obd_dev_lock);
+
+ list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain) {
+ if (exp->exp_connection == NULL)
+ continue;
+
+ /*
+ * Note: if this export's flavor was just updated
+ * (exp_flvr_changed == 1), this will override the
+ * previous update.
+ */ + spin_lock(&exp->exp_lock); + sptlrpc_target_choose_flavor(rset, exp->exp_sp_peer, + lnet_nid_to_nid4(&exp->exp_connection->c_peer.nid), + &new_flvr); + if (exp->exp_flvr_changed || + !flavor_equal(&new_flvr, &exp->exp_flvr)) { + exp->exp_flvr_old[1] = new_flvr; + exp->exp_flvr_expire[1] = 0; + exp->exp_flvr_changed = 1; + exp->exp_flvr_adapt = 1; + + CDEBUG(D_SEC, "exp %p (%s): updated flavor %x->%x\n", + exp, sptlrpc_part2name(exp->exp_sp_peer), + exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[1].sf_rpc); + } + spin_unlock(&exp->exp_lock); + } + + spin_unlock(&obd->obd_dev_lock); +} +EXPORT_SYMBOL(sptlrpc_target_update_exp_flavor); + +static int sptlrpc_svc_check_from(struct ptlrpc_request *req, int svc_rc) +{ + /* peer's claim is unreliable unless gss is being used */ + if (!req->rq_auth_gss || svc_rc == SECSVC_DROP) + return svc_rc; + + switch (req->rq_sp_from) { + case LUSTRE_SP_CLI: + if (req->rq_auth_usr_mdt || req->rq_auth_usr_ost) { + /* The below message is checked in sanity-sec test_33 */ + DEBUG_REQ(D_ERROR, req, "faked source CLI"); + svc_rc = SECSVC_DROP; + } + break; + case LUSTRE_SP_MDT: + if (!req->rq_auth_usr_mdt) { + /* The below message is checked in sanity-sec test_33 */ + DEBUG_REQ(D_ERROR, req, "faked source MDT"); + svc_rc = SECSVC_DROP; + } + break; + case LUSTRE_SP_OST: + if (!req->rq_auth_usr_ost) { + /* The below message is checked in sanity-sec test_33 */ + DEBUG_REQ(D_ERROR, req, "faked source OST"); + svc_rc = SECSVC_DROP; + } + break; + case LUSTRE_SP_MGS: + case LUSTRE_SP_MGC: + if (!req->rq_auth_usr_root && !req->rq_auth_usr_mdt && + !req->rq_auth_usr_ost) { + /* The below message is checked in sanity-sec test_33 */ + DEBUG_REQ(D_ERROR, req, "faked source MGC/MGS"); + svc_rc = SECSVC_DROP; + } + break; + case LUSTRE_SP_ANY: + default: + DEBUG_REQ(D_ERROR, req, "invalid source %u", req->rq_sp_from); + svc_rc = SECSVC_DROP; + } + + return svc_rc; +} + +/** + * Used by ptlrpc server, to perform transformation upon request message of + * incoming \a req. This must be the first thing to do with an incoming + * request in ptlrpc layer. + * + * \retval SECSVC_OK success, and req->rq_reqmsg point to request message in + * clear text, size is req->rq_reqlen; also req->rq_svc_ctx is set. + * \retval SECSVC_COMPLETE success, the request has been fully processed, and + * reply message has been prepared. + * \retval SECSVC_DROP failed, this request should be dropped. 
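+ *
+ * A hypothetical service-thread sequence (the caller shape is an
+ * assumption; only the ordering is implied by this file):
+ *
+ *   rc = sptlrpc_svc_unwrap_request(req);
+ *   if (rc == SECSVC_DROP)
+ *           return;              // drop the request silently
+ *   if (rc == SECSVC_COMPLETE)
+ *           goto send_reply;     // reply was prepared during unwrap
+ *   // SECSVC_OK: handle rq_reqmsg, then sptlrpc_svc_alloc_rs() and
+ *   // sptlrpc_svc_wrap_reply() on the way out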
+ */ +int sptlrpc_svc_unwrap_request(struct ptlrpc_request *req) +{ + struct ptlrpc_sec_policy *policy; + struct lustre_msg *msg = req->rq_reqbuf; + int rc; + + ENTRY; + + LASSERT(msg); + LASSERT(req->rq_reqmsg == NULL); + LASSERT(req->rq_repmsg == NULL); + LASSERT(req->rq_svc_ctx == NULL); + + req->rq_req_swab_mask = 0; + + rc = __lustre_unpack_msg(msg, req->rq_reqdata_len); + switch (rc) { + case 1: + req_capsule_set_req_swabbed(&req->rq_pill, + MSG_PTLRPC_HEADER_OFF); + case 0: + break; + default: + CERROR("error unpacking request from %s x%llu\n", + libcfs_id2str(req->rq_peer), req->rq_xid); + RETURN(SECSVC_DROP); + } + + req->rq_flvr.sf_rpc = WIRE_FLVR(msg->lm_secflvr); + req->rq_sp_from = LUSTRE_SP_ANY; + req->rq_auth_uid = -1; /* set to INVALID_UID */ + req->rq_auth_mapped_uid = -1; + + policy = sptlrpc_wireflavor2policy(req->rq_flvr.sf_rpc); + if (!policy) { + CERROR("unsupported rpc flavor %x\n", req->rq_flvr.sf_rpc); + RETURN(SECSVC_DROP); + } + + LASSERT(policy->sp_sops->accept); + rc = policy->sp_sops->accept(req); + sptlrpc_policy_put(policy); + LASSERT(req->rq_reqmsg || rc != SECSVC_OK); + LASSERT(req->rq_svc_ctx || rc == SECSVC_DROP); + + /* + * if it's not null flavor (which means embedded packing msg), + * reset the swab mask for the comming inner msg unpacking. + */ + if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) + req->rq_req_swab_mask = 0; + + /* sanity check for the request source */ + rc = sptlrpc_svc_check_from(req, rc); + RETURN(rc); +} + +/** + * Used by ptlrpc server, to allocate reply buffer for \a req. If succeed, + * req->rq_reply_state is set, and req->rq_reply_state->rs_msg point to + * a buffer of \a msglen size. + */ +int sptlrpc_svc_alloc_rs(struct ptlrpc_request *req, int msglen) +{ + struct ptlrpc_sec_policy *policy; + struct ptlrpc_reply_state *rs; + int rc; + + ENTRY; + + LASSERT(req->rq_svc_ctx); + LASSERT(req->rq_svc_ctx->sc_policy); + + policy = req->rq_svc_ctx->sc_policy; + LASSERT(policy->sp_sops->alloc_rs); + + rc = policy->sp_sops->alloc_rs(req, msglen); + if (unlikely(rc == -ENOMEM)) { + struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; + + if (svcpt->scp_service->srv_max_reply_size < + msglen + sizeof(struct ptlrpc_reply_state)) { + /* Just return failure if the size is too big */ + CERROR("size of message is too big (%zd), %d allowed\n", + msglen + sizeof(struct ptlrpc_reply_state), + svcpt->scp_service->srv_max_reply_size); + RETURN(-ENOMEM); + } + + /* failed alloc, try emergency pool */ + rs = lustre_get_emerg_rs(svcpt); + if (rs == NULL) + RETURN(-ENOMEM); + + req->rq_reply_state = rs; + rc = policy->sp_sops->alloc_rs(req, msglen); + if (rc) { + lustre_put_emerg_rs(rs); + req->rq_reply_state = NULL; + } + } + + LASSERT(rc != 0 || + (req->rq_reply_state && req->rq_reply_state->rs_msg)); + + RETURN(rc); +} + +/** + * Used by ptlrpc server, to perform transformation upon reply message. + * + * \post req->rq_reply_off is set to approriate server-controlled reply offset. + * \post req->rq_repmsg and req->rq_reply_state->rs_msg becomes inaccessible. + */ +int sptlrpc_svc_wrap_reply(struct ptlrpc_request *req) +{ + struct ptlrpc_sec_policy *policy; + int rc; + + ENTRY; + + LASSERT(req->rq_svc_ctx); + LASSERT(req->rq_svc_ctx->sc_policy); + + policy = req->rq_svc_ctx->sc_policy; + LASSERT(policy->sp_sops->authorize); + + rc = policy->sp_sops->authorize(req); + LASSERT(rc || req->rq_reply_state->rs_repdata_len); + + RETURN(rc); +} + +/** + * Used by ptlrpc server, to free reply_state. 
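+ * Reply states drawn from the emergency pool (rs_prealloc set) are
+ * handed back via lustre_put_emerg_rs() after the policy's free_rs
+ * hook has run.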
+ */ +void sptlrpc_svc_free_rs(struct ptlrpc_reply_state *rs) +{ + struct ptlrpc_sec_policy *policy; + unsigned int prealloc; + + ENTRY; + + LASSERT(rs->rs_svc_ctx); + LASSERT(rs->rs_svc_ctx->sc_policy); + + policy = rs->rs_svc_ctx->sc_policy; + LASSERT(policy->sp_sops->free_rs); + + prealloc = rs->rs_prealloc; + policy->sp_sops->free_rs(rs); + + if (prealloc) + lustre_put_emerg_rs(rs); + EXIT; +} + +void sptlrpc_svc_ctx_addref(struct ptlrpc_request *req) +{ + struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx; + + if (ctx != NULL) + atomic_inc(&ctx->sc_refcount); +} + +void sptlrpc_svc_ctx_decref(struct ptlrpc_request *req) +{ + struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx; + + if (ctx == NULL) + return; + + LASSERT_ATOMIC_POS(&ctx->sc_refcount); + if (atomic_dec_and_test(&ctx->sc_refcount)) { + if (ctx->sc_policy->sp_sops->free_ctx) + ctx->sc_policy->sp_sops->free_ctx(ctx); + } + req->rq_svc_ctx = NULL; +} + +void sptlrpc_svc_ctx_invalidate(struct ptlrpc_request *req) +{ + struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx; + + if (ctx == NULL) + return; + + LASSERT_ATOMIC_POS(&ctx->sc_refcount); + if (ctx->sc_policy->sp_sops->invalidate_ctx) + ctx->sc_policy->sp_sops->invalidate_ctx(ctx); +} +EXPORT_SYMBOL(sptlrpc_svc_ctx_invalidate); + +/* + * bulk security + */ + +/** + * Perform transformation upon bulk data pointed by \a desc. This is called + * before transforming the request message. + */ +int sptlrpc_cli_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_cli_ctx *ctx; + + LASSERT(req->rq_bulk_read || req->rq_bulk_write); + + if (!req->rq_pack_bulk) + return 0; + + ctx = req->rq_cli_ctx; + if (ctx->cc_ops->wrap_bulk) + return ctx->cc_ops->wrap_bulk(ctx, req, desc); + return 0; +} +EXPORT_SYMBOL(sptlrpc_cli_wrap_bulk); + +/** + * This is called after unwrap the reply message. + * return nob of actual plain text size received, or error code. + */ +int sptlrpc_cli_unwrap_bulk_read(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc, + int nob) +{ + struct ptlrpc_cli_ctx *ctx; + int rc; + + LASSERT(req->rq_bulk_read && !req->rq_bulk_write); + + if (!req->rq_pack_bulk) + return desc->bd_nob_transferred; + + ctx = req->rq_cli_ctx; + if (ctx->cc_ops->unwrap_bulk) { + rc = ctx->cc_ops->unwrap_bulk(ctx, req, desc); + if (rc < 0) + return rc; + } + return desc->bd_nob_transferred; +} +EXPORT_SYMBOL(sptlrpc_cli_unwrap_bulk_read); + +/** + * This is called after unwrap the reply message. + * return 0 for success or error code. + */ +int sptlrpc_cli_unwrap_bulk_write(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_cli_ctx *ctx; + int rc; + + LASSERT(!req->rq_bulk_read && req->rq_bulk_write); + + if (!req->rq_pack_bulk) + return 0; + + ctx = req->rq_cli_ctx; + if (ctx->cc_ops->unwrap_bulk) { + rc = ctx->cc_ops->unwrap_bulk(ctx, req, desc); + if (rc < 0) + return rc; + } + + /* + * if everything is going right, nob should equals to nob_transferred. + * in case of privacy mode, nob_transferred needs to be adjusted. + */ + if (desc->bd_nob != desc->bd_nob_transferred) { + CERROR("nob %d doesn't match transferred nob %d\n", + desc->bd_nob, desc->bd_nob_transferred); + return -EPROTO; + } + + return 0; +} +EXPORT_SYMBOL(sptlrpc_cli_unwrap_bulk_write); + +#ifdef HAVE_SERVER_SUPPORT +/** + * Performe transformation upon outgoing bulk read. 
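+ * This is a no-op unless rq_pack_bulk is set and the policy provides
+ * a wrap_bulk hook.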
+ */ +int sptlrpc_svc_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_svc_ctx *ctx; + + LASSERT(req->rq_bulk_read); + + if (!req->rq_pack_bulk) + return 0; + + ctx = req->rq_svc_ctx; + if (ctx->sc_policy->sp_sops->wrap_bulk) + return ctx->sc_policy->sp_sops->wrap_bulk(req, desc); + + return 0; +} +EXPORT_SYMBOL(sptlrpc_svc_wrap_bulk); + +/** + * Performe transformation upon incoming bulk write. + */ +int sptlrpc_svc_unwrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_svc_ctx *ctx; + int rc; + + LASSERT(req->rq_bulk_write); + + /* + * if it's in privacy mode, transferred should >= expected; otherwise + * transferred should == expected. + */ + if (desc->bd_nob_transferred < desc->bd_nob || + (desc->bd_nob_transferred > desc->bd_nob && + SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc) != + SPTLRPC_BULK_SVC_PRIV)) { + DEBUG_REQ(D_ERROR, req, "truncated bulk GET %d(%d)", + desc->bd_nob_transferred, desc->bd_nob); + return -ETIMEDOUT; + } + + if (!req->rq_pack_bulk) + return 0; + + ctx = req->rq_svc_ctx; + if (ctx->sc_policy->sp_sops->unwrap_bulk) { + rc = ctx->sc_policy->sp_sops->unwrap_bulk(req, desc); + if (rc) + CERROR("error unwrap bulk: %d\n", rc); + } + + /* return 0 to allow reply be sent */ + return 0; +} +EXPORT_SYMBOL(sptlrpc_svc_unwrap_bulk); + +/** + * Prepare buffers for incoming bulk write. + */ +int sptlrpc_svc_prep_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_svc_ctx *ctx; + + LASSERT(req->rq_bulk_write); + + if (!req->rq_pack_bulk) + return 0; + + ctx = req->rq_svc_ctx; + if (ctx->sc_policy->sp_sops->prep_bulk) + return ctx->sc_policy->sp_sops->prep_bulk(req, desc); + + return 0; +} +EXPORT_SYMBOL(sptlrpc_svc_prep_bulk); + +#endif /* HAVE_SERVER_SUPPORT */ + +/* + * user descriptor helpers + */ + +int sptlrpc_current_user_desc_size(void) +{ + int ngroups; + + ngroups = current_cred()->group_info->ngroups; + + if (ngroups > LUSTRE_MAX_GROUPS) + ngroups = LUSTRE_MAX_GROUPS; + return sptlrpc_user_desc_size(ngroups); +} +EXPORT_SYMBOL(sptlrpc_current_user_desc_size); + +int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset) +{ + struct ptlrpc_user_desc *pud; + int ngroups; + + pud = lustre_msg_buf(msg, offset, 0); + + pud->pud_uid = from_kuid(&init_user_ns, current_uid()); + pud->pud_gid = from_kgid(&init_user_ns, current_gid()); + pud->pud_fsuid = from_kuid(&init_user_ns, current_fsuid()); + pud->pud_fsgid = from_kgid(&init_user_ns, current_fsgid()); + pud->pud_cap = current_cap().cap[0]; + pud->pud_ngroups = (msg->lm_buflens[offset] - sizeof(*pud)) / 4; + + task_lock(current); + ngroups = current_cred()->group_info->ngroups; + if (pud->pud_ngroups > ngroups) + pud->pud_ngroups = ngroups; +#ifdef HAVE_GROUP_INFO_GID + memcpy(pud->pud_groups, current_cred()->group_info->gid, + pud->pud_ngroups * sizeof(__u32)); +#else /* !HAVE_GROUP_INFO_GID */ + memcpy(pud->pud_groups, current_cred()->group_info->blocks[0], + pud->pud_ngroups * sizeof(__u32)); +#endif /* HAVE_GROUP_INFO_GID */ + task_unlock(current); + + return 0; +} +EXPORT_SYMBOL(sptlrpc_pack_user_desc); + +int sptlrpc_unpack_user_desc(struct lustre_msg *msg, int offset, int swabbed) +{ + struct ptlrpc_user_desc *pud; + int i; + + pud = lustre_msg_buf(msg, offset, sizeof(*pud)); + if (!pud) + return -EINVAL; + + if (swabbed) { + __swab32s(&pud->pud_uid); + __swab32s(&pud->pud_gid); + __swab32s(&pud->pud_fsuid); + __swab32s(&pud->pud_fsgid); + __swab32s(&pud->pud_cap); + __swab32s(&pud->pud_ngroups); + } + 
+	if (pud->pud_ngroups > LUSTRE_MAX_GROUPS) {
+		CERROR("%u groups is too large\n", pud->pud_ngroups);
+		return -EINVAL;
+	}
+
+	if (sizeof(*pud) + pud->pud_ngroups * sizeof(__u32) >
+	    msg->lm_buflens[offset]) {
+		CERROR("%u groups are claimed but bufsize only %u\n",
+		       pud->pud_ngroups, msg->lm_buflens[offset]);
+		return -EINVAL;
+	}
+
+	if (swabbed) {
+		for (i = 0; i < pud->pud_ngroups; i++)
+			__swab32s(&pud->pud_groups[i]);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(sptlrpc_unpack_user_desc);
+
+/*
+ * misc helpers
+ */
+
+const char *sec2target_str(struct ptlrpc_sec *sec)
+{
+	if (!sec || !sec->ps_import || !sec->ps_import->imp_obd)
+		return "*";
+	if (sec_is_reverse(sec))
+		return "c";
+	return obd_uuid2str(&sec->ps_import->imp_obd->u.cli.cl_target_uuid);
+}
+EXPORT_SYMBOL(sec2target_str);
+
+/*
+ * return true if the bulk data is protected
+ */
+int sptlrpc_flavor_has_bulk(struct sptlrpc_flavor *flvr)
+{
+	switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) {
+	case SPTLRPC_BULK_SVC_INTG:
+	case SPTLRPC_BULK_SVC_PRIV:
+		return 1;
+	default:
+		return 0;
+	}
+}
+EXPORT_SYMBOL(sptlrpc_flavor_has_bulk);
+
+/*
+ * crypto API helper/alloc blkcipher
+ */
+
+/*
+ * initialize/finalize
+ */
+
+int sptlrpc_init(void)
+{
+	int rc;
+
+	rwlock_init(&policy_lock);
+
+	rc = sptlrpc_gc_init();
+	if (rc)
+		goto out;
+
+	rc = sptlrpc_conf_init();
+	if (rc)
+		goto out_gc;
+
+	rc = sptlrpc_enc_pool_init();
+	if (rc)
+		goto out_conf;
+
+	rc = sptlrpc_null_init();
+	if (rc)
+		goto out_pool;
+
+	rc = sptlrpc_plain_init();
+	if (rc)
+		goto out_null;
+
+	rc = sptlrpc_lproc_init();
+	if (rc)
+		goto out_plain;
+
+	return 0;
+
+out_plain:
+	sptlrpc_plain_fini();
+out_null:
+	sptlrpc_null_fini();
+out_pool:
+	sptlrpc_enc_pool_fini();
+out_conf:
+	sptlrpc_conf_fini();
+out_gc:
+	sptlrpc_gc_fini();
+out:
+	return rc;
+}
+
+void sptlrpc_fini(void)
+{
+	sptlrpc_lproc_fini();
+	sptlrpc_plain_fini();
+	sptlrpc_null_fini();
+	sptlrpc_enc_pool_fini();
+	sptlrpc_conf_fini();
+	sptlrpc_gc_fini();
+}
diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_bulk.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_bulk.c
new file mode 100644
index 0000000000000..bdb65dd637e97
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_bulk.c
@@ -0,0 +1,1005 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ptlrpc/sec_bulk.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +static int mult = 20 - PAGE_SHIFT; +static int enc_pool_max_memory_mb; +module_param(enc_pool_max_memory_mb, int, 0644); +MODULE_PARM_DESC(enc_pool_max_memory_mb, + "Encoding pool max memory (MB), 1/8 of total physical memory by default"); + +/* + * bulk encryption page pools + */ + +#define PTRS_PER_PAGE (PAGE_SIZE / sizeof(void *)) +#define PAGES_PER_POOL (PTRS_PER_PAGE) + +#define IDLE_IDX_MAX (100) +#define IDLE_IDX_WEIGHT (3) + +#define CACHE_QUIESCENT_PERIOD (20) + +static struct ptlrpc_enc_page_pool { + unsigned long epp_max_pages; /* maximum pages can hold, const */ + unsigned int epp_max_pools; /* number of pools, const */ + + /* + * wait queue in case of not enough free pages. + */ + wait_queue_head_t epp_waitq; /* waiting threads */ + unsigned int epp_waitqlen; /* wait queue length */ + unsigned long epp_pages_short; /* # of pages wanted of in-q users */ + unsigned int epp_growing:1; /* during adding pages */ + + /* + * indicating how idle the pools are, from 0 to MAX_IDLE_IDX + * this is counted based on each time when getting pages from + * the pools, not based on time. which means in case that system + * is idled for a while but the idle_idx might still be low if no + * activities happened in the pools. + */ + unsigned long epp_idle_idx; + + /* last shrink time due to mem tight */ + time64_t epp_last_shrink; + time64_t epp_last_access; + + /* in-pool pages bookkeeping */ + spinlock_t epp_lock; /* protect following fields */ + unsigned long epp_total_pages; /* total pages in pools */ + unsigned long epp_free_pages; /* current pages available */ + + /* statistics */ + unsigned long epp_st_max_pages; /* # of pages ever reached */ + unsigned int epp_st_grows; /* # of grows */ + unsigned int epp_st_grow_fails; /* # of add pages failures */ + unsigned int epp_st_shrinks; /* # of shrinks */ + unsigned long epp_st_access; /* # of access */ + unsigned long epp_st_missings; /* # of cache missing */ + unsigned long epp_st_lowfree; /* lowest free pages reached */ + unsigned int epp_st_max_wqlen; /* highest waitqueue length */ + ktime_t epp_st_max_wait; /* in nanoseconds */ + unsigned long epp_st_outofmem; /* # of out of mem requests */ + /* + * pointers to pools, may be vmalloc'd + */ + struct page ***epp_pools; +} page_pools; + +/* + * /proc/fs/lustre/sptlrpc/encrypt_page_pools + */ +int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v) +{ + spin_lock(&page_pools.epp_lock); + + seq_printf(m, "physical pages: %lu\n" + "pages per pool: %lu\n" + "max pages: %lu\n" + "max pools: %u\n" + "total pages: %lu\n" + "total free: %lu\n" + "idle index: %lu/100\n" + "last shrink: %llds\n" + "last access: %llds\n" + "max pages reached: %lu\n" + "grows: %u\n" + "grows failure: %u\n" + "shrinks: %u\n" + "cache access: %lu\n" + "cache missing: %lu\n" + "low free mark: %lu\n" + "max waitqueue depth: %u\n" + "max wait time ms: %lld\n" + "out of mem: %lu\n", + cfs_totalram_pages(), PAGES_PER_POOL, + page_pools.epp_max_pages, + page_pools.epp_max_pools, + page_pools.epp_total_pages, + page_pools.epp_free_pages, + page_pools.epp_idle_idx, + ktime_get_seconds() - page_pools.epp_last_shrink, + ktime_get_seconds() - page_pools.epp_last_access, + page_pools.epp_st_max_pages, + page_pools.epp_st_grows, + 
page_pools.epp_st_grow_fails, + page_pools.epp_st_shrinks, + page_pools.epp_st_access, + page_pools.epp_st_missings, + page_pools.epp_st_lowfree, + page_pools.epp_st_max_wqlen, + ktime_to_ms(page_pools.epp_st_max_wait), + page_pools.epp_st_outofmem); + + spin_unlock(&page_pools.epp_lock); + return 0; +} + +static void enc_pools_release_free_pages(long npages) +{ + int p_idx, g_idx; + int p_idx_max1, p_idx_max2; + + LASSERT(npages > 0); + LASSERT(npages <= page_pools.epp_free_pages); + LASSERT(page_pools.epp_free_pages <= page_pools.epp_total_pages); + + /* max pool index before the release */ + p_idx_max2 = (page_pools.epp_total_pages - 1) / PAGES_PER_POOL; + + page_pools.epp_free_pages -= npages; + page_pools.epp_total_pages -= npages; + + /* max pool index after the release */ + p_idx_max1 = page_pools.epp_total_pages == 0 ? -1 : + ((page_pools.epp_total_pages - 1) / PAGES_PER_POOL); + + p_idx = page_pools.epp_free_pages / PAGES_PER_POOL; + g_idx = page_pools.epp_free_pages % PAGES_PER_POOL; + LASSERT(page_pools.epp_pools[p_idx]); + + while (npages--) { + LASSERT(page_pools.epp_pools[p_idx]); + LASSERT(page_pools.epp_pools[p_idx][g_idx] != NULL); + + __free_page(page_pools.epp_pools[p_idx][g_idx]); + page_pools.epp_pools[p_idx][g_idx] = NULL; + + if (++g_idx == PAGES_PER_POOL) { + p_idx++; + g_idx = 0; + } + } + + /* free unused pools */ + while (p_idx_max1 < p_idx_max2) { + LASSERT(page_pools.epp_pools[p_idx_max2]); + OBD_FREE(page_pools.epp_pools[p_idx_max2], PAGE_SIZE); + page_pools.epp_pools[p_idx_max2] = NULL; + p_idx_max2--; + } +} + +/* + * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool. + */ +static unsigned long enc_pools_shrink_count(struct shrinker *s, + struct shrink_control *sc) +{ + /* + * if no pool access for a long time, we consider it's fully idle. + * a little race here is fine. + */ + if (unlikely(ktime_get_seconds() - page_pools.epp_last_access > + CACHE_QUIESCENT_PERIOD)) { + spin_lock(&page_pools.epp_lock); + page_pools.epp_idle_idx = IDLE_IDX_MAX; + spin_unlock(&page_pools.epp_lock); + } + + LASSERT(page_pools.epp_idle_idx <= IDLE_IDX_MAX); + return (page_pools.epp_free_pages <= PTLRPC_MAX_BRW_PAGES) ? 0 : + (page_pools.epp_free_pages - PTLRPC_MAX_BRW_PAGES) * + (IDLE_IDX_MAX - page_pools.epp_idle_idx) / IDLE_IDX_MAX; +} + +/* + * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool. + */ +static unsigned long enc_pools_shrink_scan(struct shrinker *s, + struct shrink_control *sc) +{ + spin_lock(&page_pools.epp_lock); + if (page_pools.epp_free_pages <= PTLRPC_MAX_BRW_PAGES) + sc->nr_to_scan = 0; + else + sc->nr_to_scan = min_t(unsigned long, sc->nr_to_scan, + page_pools.epp_free_pages - PTLRPC_MAX_BRW_PAGES); + if (sc->nr_to_scan > 0) { + enc_pools_release_free_pages(sc->nr_to_scan); + CDEBUG(D_SEC, "released %ld pages, %ld left\n", + (long)sc->nr_to_scan, page_pools.epp_free_pages); + + page_pools.epp_st_shrinks++; + page_pools.epp_last_shrink = ktime_get_seconds(); + } + spin_unlock(&page_pools.epp_lock); + + /* + * if no pool access for a long time, we consider it's fully idle. + * a little race here is fine. 
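+	 * (worst case the idle mark lands one check early or late.)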
+ */ + if (unlikely(ktime_get_seconds() - page_pools.epp_last_access > + CACHE_QUIESCENT_PERIOD)) { + spin_lock(&page_pools.epp_lock); + page_pools.epp_idle_idx = IDLE_IDX_MAX; + spin_unlock(&page_pools.epp_lock); + } + + LASSERT(page_pools.epp_idle_idx <= IDLE_IDX_MAX); + return sc->nr_to_scan; +} + +#ifdef HAVE_SHRINKER_COUNT +static struct shrinker pools_shrinker = { + .count_objects = enc_pools_shrink_count, + .scan_objects = enc_pools_shrink_scan, + .seeks = DEFAULT_SEEKS, +}; +#else +/* + * could be called frequently for query (@nr_to_scan == 0). + * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool. + */ +static int enc_pools_shrink(struct shrinker *shrinker, + struct shrink_control *sc) +{ + enc_pools_shrink_scan(shrinker, sc); + + return enc_pools_shrink_count(shrinker, sc); +} + +static struct shrinker pools_shrinker = { + .shrink = enc_pools_shrink, + .seeks = DEFAULT_SEEKS, +}; +#endif /* HAVE_SHRINKER_COUNT */ + +static inline +int npages_to_npools(unsigned long npages) +{ + return (int) ((npages + PAGES_PER_POOL - 1) / PAGES_PER_POOL); +} + +/* + * return how many pages cleaned up. + */ +static unsigned long enc_pools_cleanup(struct page ***pools, int npools) +{ + unsigned long cleaned = 0; + int i, j; + + for (i = 0; i < npools; i++) { + if (pools[i]) { + for (j = 0; j < PAGES_PER_POOL; j++) { + if (pools[i][j]) { + __free_page(pools[i][j]); + cleaned++; + } + } + OBD_FREE(pools[i], PAGE_SIZE); + pools[i] = NULL; + } + } + + return cleaned; +} + +/* + * merge @npools pointed by @pools which contains @npages new pages + * into current pools. + * + * we have options to avoid most memory copy with some tricks. but we choose + * the simplest way to avoid complexity. It's not frequently called. + */ +static void enc_pools_insert(struct page ***pools, int npools, int npages) +{ + int freeslot; + int op_idx, np_idx, og_idx, ng_idx; + int cur_npools, end_npools; + + LASSERT(npages > 0); + LASSERT(page_pools.epp_total_pages+npages <= page_pools.epp_max_pages); + LASSERT(npages_to_npools(npages) == npools); + LASSERT(page_pools.epp_growing); + + spin_lock(&page_pools.epp_lock); + + /* + * (1) fill all the free slots of current pools. + */ + /* + * free slots are those left by rent pages, and the extra ones with + * index >= total_pages, locate at the tail of last pool. + */ + freeslot = page_pools.epp_total_pages % PAGES_PER_POOL; + if (freeslot != 0) + freeslot = PAGES_PER_POOL - freeslot; + freeslot += page_pools.epp_total_pages - page_pools.epp_free_pages; + + op_idx = page_pools.epp_free_pages / PAGES_PER_POOL; + og_idx = page_pools.epp_free_pages % PAGES_PER_POOL; + np_idx = npools - 1; + ng_idx = (npages - 1) % PAGES_PER_POOL; + + while (freeslot) { + LASSERT(page_pools.epp_pools[op_idx][og_idx] == NULL); + LASSERT(pools[np_idx][ng_idx] != NULL); + + page_pools.epp_pools[op_idx][og_idx] = pools[np_idx][ng_idx]; + pools[np_idx][ng_idx] = NULL; + + freeslot--; + + if (++og_idx == PAGES_PER_POOL) { + op_idx++; + og_idx = 0; + } + if (--ng_idx < 0) { + if (np_idx == 0) + break; + np_idx--; + ng_idx = PAGES_PER_POOL - 1; + } + } + + /* + * (2) add pools if needed. 
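+	 * new pools are appended right after the last slot in use; the
+	 * epp_pools array itself was sized for epp_max_pools up front.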
+ */ + cur_npools = (page_pools.epp_total_pages + PAGES_PER_POOL - 1) / + PAGES_PER_POOL; + end_npools = (page_pools.epp_total_pages + npages + + PAGES_PER_POOL - 1) / PAGES_PER_POOL; + LASSERT(end_npools <= page_pools.epp_max_pools); + + np_idx = 0; + while (cur_npools < end_npools) { + LASSERT(page_pools.epp_pools[cur_npools] == NULL); + LASSERT(np_idx < npools); + LASSERT(pools[np_idx] != NULL); + + page_pools.epp_pools[cur_npools++] = pools[np_idx]; + pools[np_idx++] = NULL; + } + + page_pools.epp_total_pages += npages; + page_pools.epp_free_pages += npages; + page_pools.epp_st_lowfree = page_pools.epp_free_pages; + + if (page_pools.epp_total_pages > page_pools.epp_st_max_pages) + page_pools.epp_st_max_pages = page_pools.epp_total_pages; + + CDEBUG(D_SEC, "add %d pages to total %lu\n", npages, + page_pools.epp_total_pages); + + spin_unlock(&page_pools.epp_lock); +} + +static int enc_pools_add_pages(int npages) +{ + static DEFINE_MUTEX(add_pages_mutex); + struct page ***pools; + int npools, alloced = 0; + int i, j, rc = -ENOMEM; + + if (npages < PTLRPC_MAX_BRW_PAGES) + npages = PTLRPC_MAX_BRW_PAGES; + + mutex_lock(&add_pages_mutex); + + if (npages + page_pools.epp_total_pages > page_pools.epp_max_pages) + npages = page_pools.epp_max_pages - page_pools.epp_total_pages; + LASSERT(npages > 0); + + page_pools.epp_st_grows++; + + npools = npages_to_npools(npages); + OBD_ALLOC_PTR_ARRAY(pools, npools); + if (pools == NULL) + goto out; + + for (i = 0; i < npools; i++) { + OBD_ALLOC(pools[i], PAGE_SIZE); + if (pools[i] == NULL) + goto out_pools; + + for (j = 0; j < PAGES_PER_POOL && alloced < npages; j++) { + pools[i][j] = alloc_page(GFP_NOFS | + __GFP_HIGHMEM); + if (pools[i][j] == NULL) + goto out_pools; + + alloced++; + } + } + LASSERT(alloced == npages); + + enc_pools_insert(pools, npools, npages); + CDEBUG(D_SEC, "added %d pages into pools\n", npages); + rc = 0; + +out_pools: + enc_pools_cleanup(pools, npools); + OBD_FREE_PTR_ARRAY(pools, npools); +out: + if (rc) { + page_pools.epp_st_grow_fails++; + CERROR("Failed to allocate %d enc pages\n", npages); + } + + mutex_unlock(&add_pages_mutex); + return rc; +} + +static inline void enc_pools_wakeup(void) +{ + assert_spin_locked(&page_pools.epp_lock); + + /* waitqueue_active */ + if (unlikely(waitqueue_active(&page_pools.epp_waitq))) + wake_up(&page_pools.epp_waitq); +} + +static int enc_pools_should_grow(int page_needed, time64_t now) +{ + /* + * don't grow if someone else is growing the pools right now, + * or the pools has reached its full capacity + */ + if (page_pools.epp_growing || + page_pools.epp_total_pages == page_pools.epp_max_pages) + return 0; + + /* if total pages is not enough, we need to grow */ + if (page_pools.epp_total_pages < page_needed) + return 1; + + /* + * we wanted to return 0 here if there was a shrink just + * happened a moment ago, but this may cause deadlock if both + * client and ost live on single node. + */ + + /* + * here we perhaps need consider other factors like wait queue + * length, idle index, etc. ? 
+ */ + + /* grow the pools in any other cases */ + return 1; +} + +/* + * Export the number of free pages in the pool + */ +int get_free_pages_in_pool(void) +{ + return page_pools.epp_free_pages; +} +EXPORT_SYMBOL(get_free_pages_in_pool); + +/* + * Let outside world know if enc_pool full capacity is reached + */ +int pool_is_at_full_capacity(void) +{ + return (page_pools.epp_total_pages == page_pools.epp_max_pages); +} +EXPORT_SYMBOL(pool_is_at_full_capacity); + +static inline struct page **page_from_bulkdesc(void *array, int index) +{ + struct ptlrpc_bulk_desc *desc = (struct ptlrpc_bulk_desc *)array; + + return &desc->bd_enc_vec[index].bv_page; +} + +static inline struct page **page_from_pagearray(void *array, int index) +{ + struct page **pa = (struct page **)array; + + return &pa[index]; +} + +/* + * we allocate the requested pages atomically. + */ +static inline int __sptlrpc_enc_pool_get_pages(void *array, unsigned int count, + struct page **(*page_from)(void *, int)) +{ + wait_queue_entry_t waitlink; + unsigned long this_idle = -1; + u64 tick_ns = 0; + time64_t now; + int p_idx, g_idx; + int i, rc = 0; + + if (!array || count <= 0 || count > page_pools.epp_max_pages) + return -EINVAL; + + spin_lock(&page_pools.epp_lock); + + page_pools.epp_st_access++; +again: + if (unlikely(page_pools.epp_free_pages < count)) { + if (tick_ns == 0) + tick_ns = ktime_get_ns(); + + now = ktime_get_real_seconds(); + + page_pools.epp_st_missings++; + page_pools.epp_pages_short += count; + + if (enc_pools_should_grow(count, now)) { + page_pools.epp_growing = 1; + + spin_unlock(&page_pools.epp_lock); + enc_pools_add_pages(page_pools.epp_pages_short / 2); + spin_lock(&page_pools.epp_lock); + + page_pools.epp_growing = 0; + + enc_pools_wakeup(); + } else { + if (page_pools.epp_growing) { + if (++page_pools.epp_waitqlen > + page_pools.epp_st_max_wqlen) + page_pools.epp_st_max_wqlen = + page_pools.epp_waitqlen; + + set_current_state(TASK_UNINTERRUPTIBLE); + init_wait(&waitlink); + add_wait_queue(&page_pools.epp_waitq, + &waitlink); + + spin_unlock(&page_pools.epp_lock); + schedule(); + remove_wait_queue(&page_pools.epp_waitq, + &waitlink); + spin_lock(&page_pools.epp_lock); + page_pools.epp_waitqlen--; + } else { + /* + * ptlrpcd thread should not sleep in that case, + * or deadlock may occur! + * Instead, return -ENOMEM so that upper layers + * will put request back in queue. 
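+				 * (only threads that see the pool already
+				 * growing wait on epp_waitq above.)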
+ */ + page_pools.epp_st_outofmem++; + GOTO(out_unlock, rc = -ENOMEM); + } + } + + if (page_pools.epp_pages_short < count) + GOTO(out_unlock, rc = -EPROTO); + page_pools.epp_pages_short -= count; + + this_idle = 0; + goto again; + } + + /* record max wait time */ + if (unlikely(tick_ns)) { + ktime_t tick = ktime_sub_ns(ktime_get(), tick_ns); + + if (ktime_after(tick, page_pools.epp_st_max_wait)) + page_pools.epp_st_max_wait = tick; + } + + /* proceed with rest of allocation */ + page_pools.epp_free_pages -= count; + + p_idx = page_pools.epp_free_pages / PAGES_PER_POOL; + g_idx = page_pools.epp_free_pages % PAGES_PER_POOL; + + for (i = 0; i < count; i++) { + struct page **pagep = page_from(array, i); + + if (page_pools.epp_pools[p_idx][g_idx] == NULL) + GOTO(out_unlock, rc = -EPROTO); + *pagep = page_pools.epp_pools[p_idx][g_idx]; + page_pools.epp_pools[p_idx][g_idx] = NULL; + + if (++g_idx == PAGES_PER_POOL) { + p_idx++; + g_idx = 0; + } + } + + if (page_pools.epp_free_pages < page_pools.epp_st_lowfree) + page_pools.epp_st_lowfree = page_pools.epp_free_pages; + + /* + * new idle index = (old * weight + new) / (weight + 1) + */ + if (this_idle == -1) { + this_idle = page_pools.epp_free_pages * IDLE_IDX_MAX / + page_pools.epp_total_pages; + } + page_pools.epp_idle_idx = (page_pools.epp_idle_idx * IDLE_IDX_WEIGHT + + this_idle) / + (IDLE_IDX_WEIGHT + 1); + + page_pools.epp_last_access = ktime_get_seconds(); + +out_unlock: + spin_unlock(&page_pools.epp_lock); + return rc; +} + +int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc) +{ + int rc; + + LASSERT(desc->bd_iov_count > 0); + LASSERT(desc->bd_iov_count <= page_pools.epp_max_pages); + + /* resent bulk, enc iov might have been allocated previously */ + if (desc->bd_enc_vec != NULL) + return 0; + + OBD_ALLOC_LARGE(desc->bd_enc_vec, + desc->bd_iov_count * sizeof(*desc->bd_enc_vec)); + if (desc->bd_enc_vec == NULL) + return -ENOMEM; + + rc = __sptlrpc_enc_pool_get_pages((void *)desc, desc->bd_iov_count, + page_from_bulkdesc); + if (rc) { + OBD_FREE_LARGE(desc->bd_enc_vec, + desc->bd_iov_count * + sizeof(*desc->bd_enc_vec)); + desc->bd_enc_vec = NULL; + } + return rc; +} +EXPORT_SYMBOL(sptlrpc_enc_pool_get_pages); + +int sptlrpc_enc_pool_get_pages_array(struct page **pa, unsigned int count) +{ + return __sptlrpc_enc_pool_get_pages((void *)pa, count, + page_from_pagearray); +} +EXPORT_SYMBOL(sptlrpc_enc_pool_get_pages_array); + +static int __sptlrpc_enc_pool_put_pages(void *array, unsigned int count, + struct page **(*page_from)(void *, int)) +{ + int p_idx, g_idx; + int i, rc = 0; + + if (!array || count <= 0) + return -EINVAL; + + spin_lock(&page_pools.epp_lock); + + p_idx = page_pools.epp_free_pages / PAGES_PER_POOL; + g_idx = page_pools.epp_free_pages % PAGES_PER_POOL; + + if (page_pools.epp_free_pages + count > page_pools.epp_total_pages) + GOTO(out_unlock, rc = -EPROTO); + if (!page_pools.epp_pools[p_idx]) + GOTO(out_unlock, rc = -EPROTO); + + for (i = 0; i < count; i++) { + struct page **pagep = page_from(array, i); + + if (!*pagep || + page_pools.epp_pools[p_idx][g_idx] != NULL) + GOTO(out_unlock, rc = -EPROTO); + + page_pools.epp_pools[p_idx][g_idx] = *pagep; + if (++g_idx == PAGES_PER_POOL) { + p_idx++; + g_idx = 0; + } + } + + page_pools.epp_free_pages += count; + enc_pools_wakeup(); + +out_unlock: + spin_unlock(&page_pools.epp_lock); + return rc; +} + +void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc) +{ + int rc; + + if (desc->bd_enc_vec == NULL) + return; + + rc = __sptlrpc_enc_pool_put_pages((void *)desc, 
desc->bd_iov_count, + page_from_bulkdesc); + if (rc) + CDEBUG(D_SEC, "error putting pages in enc pool: %d\n", rc); + + OBD_FREE_LARGE(desc->bd_enc_vec, + desc->bd_iov_count * sizeof(*desc->bd_enc_vec)); + desc->bd_enc_vec = NULL; +} + +void sptlrpc_enc_pool_put_pages_array(struct page **pa, unsigned int count) +{ + int rc; + + rc = __sptlrpc_enc_pool_put_pages((void *)pa, count, + page_from_pagearray); + + if (rc) + CDEBUG(D_SEC, "error putting pages in enc pool: %d\n", rc); +} +EXPORT_SYMBOL(sptlrpc_enc_pool_put_pages_array); + +/* + * we don't do much stuff for add_user/del_user anymore, except adding some + * initial pages in add_user() if current pools are empty, rest would be + * handled by the pools's self-adaption. + */ +int sptlrpc_enc_pool_add_user(void) +{ + int need_grow = 0; + + spin_lock(&page_pools.epp_lock); + if (page_pools.epp_growing == 0 && page_pools.epp_total_pages == 0) { + page_pools.epp_growing = 1; + need_grow = 1; + } + spin_unlock(&page_pools.epp_lock); + + if (need_grow) { + enc_pools_add_pages(PTLRPC_MAX_BRW_PAGES + + PTLRPC_MAX_BRW_PAGES); + + spin_lock(&page_pools.epp_lock); + page_pools.epp_growing = 0; + enc_pools_wakeup(); + spin_unlock(&page_pools.epp_lock); + } + return 0; +} +EXPORT_SYMBOL(sptlrpc_enc_pool_add_user); + +int sptlrpc_enc_pool_del_user(void) +{ + return 0; +} +EXPORT_SYMBOL(sptlrpc_enc_pool_del_user); + +static inline void enc_pools_alloc(void) +{ + LASSERT(page_pools.epp_max_pools); + OBD_ALLOC_LARGE(page_pools.epp_pools, + page_pools.epp_max_pools * + sizeof(*page_pools.epp_pools)); +} + +static inline void enc_pools_free(void) +{ + LASSERT(page_pools.epp_max_pools); + LASSERT(page_pools.epp_pools); + + OBD_FREE_LARGE(page_pools.epp_pools, + page_pools.epp_max_pools * + sizeof(*page_pools.epp_pools)); +} + +int sptlrpc_enc_pool_init(void) +{ + int rc; + + page_pools.epp_max_pages = cfs_totalram_pages() / 8; + if (enc_pool_max_memory_mb > 0 && + enc_pool_max_memory_mb <= (cfs_totalram_pages() >> mult)) + page_pools.epp_max_pages = enc_pool_max_memory_mb << mult; + + page_pools.epp_max_pools = npages_to_npools(page_pools.epp_max_pages); + + init_waitqueue_head(&page_pools.epp_waitq); + page_pools.epp_waitqlen = 0; + page_pools.epp_pages_short = 0; + + page_pools.epp_growing = 0; + + page_pools.epp_idle_idx = 0; + page_pools.epp_last_shrink = ktime_get_seconds(); + page_pools.epp_last_access = ktime_get_seconds(); + + spin_lock_init(&page_pools.epp_lock); + page_pools.epp_total_pages = 0; + page_pools.epp_free_pages = 0; + + page_pools.epp_st_max_pages = 0; + page_pools.epp_st_grows = 0; + page_pools.epp_st_grow_fails = 0; + page_pools.epp_st_shrinks = 0; + page_pools.epp_st_access = 0; + page_pools.epp_st_missings = 0; + page_pools.epp_st_lowfree = 0; + page_pools.epp_st_max_wqlen = 0; + page_pools.epp_st_max_wait = ktime_set(0, 0); + page_pools.epp_st_outofmem = 0; + + enc_pools_alloc(); + if (page_pools.epp_pools == NULL) + return -ENOMEM; + + rc = register_shrinker(&pools_shrinker); + if (rc) + enc_pools_free(); + + return rc; +} + +void sptlrpc_enc_pool_fini(void) +{ + unsigned long cleaned, npools; + + LASSERT(page_pools.epp_pools); + LASSERT(page_pools.epp_total_pages == page_pools.epp_free_pages); + + unregister_shrinker(&pools_shrinker); + + npools = npages_to_npools(page_pools.epp_total_pages); + cleaned = enc_pools_cleanup(page_pools.epp_pools, npools); + LASSERT(cleaned == page_pools.epp_total_pages); + + enc_pools_free(); + + if (page_pools.epp_st_access > 0) { + CDEBUG(D_SEC, + "max pages %lu, grows %u, grow fails %u, shrinks 
%u, access %lu, missing %lu, max qlen %u, max wait ms %lld, out of mem %lu\n", + page_pools.epp_st_max_pages, page_pools.epp_st_grows, + page_pools.epp_st_grow_fails, + page_pools.epp_st_shrinks, page_pools.epp_st_access, + page_pools.epp_st_missings, page_pools.epp_st_max_wqlen, + ktime_to_ms(page_pools.epp_st_max_wait), + page_pools.epp_st_outofmem); + } +} + + +static int cfs_hash_alg_id[] = { + [BULK_HASH_ALG_NULL] = CFS_HASH_ALG_NULL, + [BULK_HASH_ALG_ADLER32] = CFS_HASH_ALG_ADLER32, + [BULK_HASH_ALG_CRC32] = CFS_HASH_ALG_CRC32, + [BULK_HASH_ALG_MD5] = CFS_HASH_ALG_MD5, + [BULK_HASH_ALG_SHA1] = CFS_HASH_ALG_SHA1, + [BULK_HASH_ALG_SHA256] = CFS_HASH_ALG_SHA256, + [BULK_HASH_ALG_SHA384] = CFS_HASH_ALG_SHA384, + [BULK_HASH_ALG_SHA512] = CFS_HASH_ALG_SHA512, +}; +const char *sptlrpc_get_hash_name(__u8 hash_alg) +{ + return cfs_crypto_hash_name(cfs_hash_alg_id[hash_alg]); +} + +__u8 sptlrpc_get_hash_alg(const char *algname) +{ + return cfs_crypto_hash_alg(algname); +} + +int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset, int swabbed) +{ + struct ptlrpc_bulk_sec_desc *bsd; + int size = msg->lm_buflens[offset]; + + bsd = lustre_msg_buf(msg, offset, sizeof(*bsd)); + if (bsd == NULL) { + CERROR("Invalid bulk sec desc: size %d\n", size); + return -EINVAL; + } + + if (swabbed) + __swab32s(&bsd->bsd_nob); + + if (unlikely(bsd->bsd_version != 0)) { + CERROR("Unexpected version %u\n", bsd->bsd_version); + return -EPROTO; + } + + if (unlikely(bsd->bsd_type >= SPTLRPC_BULK_MAX)) { + CERROR("Invalid type %u\n", bsd->bsd_type); + return -EPROTO; + } + + /* FIXME more sanity check here */ + + if (unlikely(bsd->bsd_svc != SPTLRPC_BULK_SVC_NULL && + bsd->bsd_svc != SPTLRPC_BULK_SVC_INTG && + bsd->bsd_svc != SPTLRPC_BULK_SVC_PRIV)) { + CERROR("Invalid svc %u\n", bsd->bsd_svc); + return -EPROTO; + } + + return 0; +} +EXPORT_SYMBOL(bulk_sec_desc_unpack); + +/* + * Compute the checksum of an RPC buffer payload. If the return \a buflen + * is not large enough, truncate the result to fit so that it is possible + * to use a hash function with a large hash space, but only use a part of + * the resulting hash. 
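+ * the digest is computed over each bv_page fragment of the bulk
+ * descriptor in order, then truncated to \a buflen if needed.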
+ */ +int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, + void *buf, int buflen) +{ + struct ahash_request *req; + int hashsize; + unsigned int bufsize; + int i, err; + + LASSERT(alg > BULK_HASH_ALG_NULL && alg < BULK_HASH_ALG_MAX); + LASSERT(buflen >= 4); + + req = cfs_crypto_hash_init(cfs_hash_alg_id[alg], NULL, 0); + if (IS_ERR(req)) { + CERROR("Unable to initialize checksum hash %s\n", + cfs_crypto_hash_name(cfs_hash_alg_id[alg])); + return PTR_ERR(req); + } + + hashsize = cfs_crypto_hash_digestsize(cfs_hash_alg_id[alg]); + + for (i = 0; i < desc->bd_iov_count; i++) { + cfs_crypto_hash_update_page(req, + desc->bd_vec[i].bv_page, + desc->bd_vec[i].bv_offset & + ~PAGE_MASK, + desc->bd_vec[i].bv_len); + } + + if (hashsize > buflen) { + unsigned char hashbuf[CFS_CRYPTO_HASH_DIGESTSIZE_MAX]; + + bufsize = sizeof(hashbuf); + LASSERTF(bufsize >= hashsize, "bufsize = %u < hashsize %u\n", + bufsize, hashsize); + err = cfs_crypto_hash_final(req, hashbuf, &bufsize); + memcpy(buf, hashbuf, buflen); + } else { + bufsize = buflen; + err = cfs_crypto_hash_final(req, buf, &bufsize); + } + + return err; +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_config.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_config.c new file mode 100644 index 0000000000000..a36452e86eb9b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_config.c @@ -0,0 +1,979 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "ptlrpc_internal.h"
+
+const char *sptlrpc_part2name(enum lustre_sec_part part)
+{
+	switch (part) {
+	case LUSTRE_SP_CLI:
+		return "cli";
+	case LUSTRE_SP_MDT:
+		return "mdt";
+	case LUSTRE_SP_OST:
+		return "ost";
+	case LUSTRE_SP_MGC:
+		return "mgc";
+	case LUSTRE_SP_MGS:
+		return "mgs";
+	case LUSTRE_SP_ANY:
+		return "any";
+	default:
+		return "err";
+	}
+}
+EXPORT_SYMBOL(sptlrpc_part2name);
+
+enum lustre_sec_part sptlrpc_target_sec_part(struct obd_device *obd)
+{
+	const char *type = obd->obd_type->typ_name;
+
+	if (!strcmp(type, LUSTRE_MDT_NAME))
+		return LUSTRE_SP_MDT;
+	if (!strcmp(type, LUSTRE_OST_NAME))
+		return LUSTRE_SP_OST;
+	if (!strcmp(type, LUSTRE_MGS_NAME))
+		return LUSTRE_SP_MGS;
+
+	CERROR("unknown target %p(%s)\n", obd, type);
+	return LUSTRE_SP_ANY;
+}
+
+/****************************************
+ * user supplied flavor string parsing  *
+ ****************************************/
+
+/*
+ * format: <base_flavor>[-<bulk_flavor>]
+ */
+int sptlrpc_parse_flavor(const char *str, struct sptlrpc_flavor *flvr)
+{
+	char buf[32];
+	char *bulk, *alg;
+
+	memset(flvr, 0, sizeof(*flvr));
+
+	if (str == NULL || str[0] == '\0') {
+		flvr->sf_rpc = SPTLRPC_FLVR_INVALID;
+		return 0;
+	}
+
+	strlcpy(buf, str, sizeof(buf));
+
+	bulk = strchr(buf, '-');
+	if (bulk)
+		*bulk++ = '\0';
+
+	flvr->sf_rpc = sptlrpc_name2flavor_base(buf);
+	if (flvr->sf_rpc == SPTLRPC_FLVR_INVALID)
+		goto err_out;
+
+	/*
+	 * currently only base flavor "plain" can have bulk specification.
+	 */
+	if (flvr->sf_rpc == SPTLRPC_FLVR_PLAIN) {
+		flvr->u_bulk.hash.hash_alg = BULK_HASH_ALG_ADLER32;
+		if (bulk) {
+			/*
+			 * format: plain-hash:<alg>
+			 */
+			alg = strchr(bulk, ':');
+			if (alg == NULL)
+				goto err_out;
+			*alg++ = '\0';
+
+			if (strcmp(bulk, "hash"))
+				goto err_out;
+
+			flvr->u_bulk.hash.hash_alg = sptlrpc_get_hash_alg(alg);
+			if (flvr->u_bulk.hash.hash_alg >= BULK_HASH_ALG_MAX)
+				goto err_out;
+		}
+
+		if (flvr->u_bulk.hash.hash_alg == BULK_HASH_ALG_NULL)
+			flvr_set_bulk_svc(&flvr->sf_rpc, SPTLRPC_BULK_SVC_NULL);
+		else
+			flvr_set_bulk_svc(&flvr->sf_rpc, SPTLRPC_BULK_SVC_INTG);
+	} else {
+		if (bulk)
+			goto err_out;
+	}
+
+	flvr->sf_flags = 0;
+	return 0;
+
+err_out:
+	CERROR("invalid flavor string: %s\n", str);
+	return -EINVAL;
+}
+EXPORT_SYMBOL(sptlrpc_parse_flavor);
+
+/****************************************
+ * configure rules                      *
+ ****************************************/
+
+static void get_default_flavor(struct sptlrpc_flavor *sf)
+{
+	memset(sf, 0, sizeof(*sf));
+
+	sf->sf_rpc = SPTLRPC_FLVR_NULL;
+	sf->sf_flags = 0;
+}
+
+static void sptlrpc_rule_init(struct sptlrpc_rule *rule)
+{
+	rule->sr_netid = LNET_NET_ANY;
+	rule->sr_from = LUSTRE_SP_ANY;
+	rule->sr_to = LUSTRE_SP_ANY;
+	rule->sr_padding = 0;
+
+	get_default_flavor(&rule->sr_flvr);
+}
+
+/*
+ * format: network[.direction]=flavor
+ */
+int sptlrpc_parse_rule(char *param, struct sptlrpc_rule *rule)
+{
+	char *flavor, *dir;
+	int rc;
+
+	sptlrpc_rule_init(rule);
+
+	flavor = strchr(param, '=');
+	if (flavor == NULL) {
+		CERROR("invalid param, no '='\n");
+		RETURN(-EINVAL);
+	}
+	*flavor++ = '\0';
+
+	dir = strchr(param, '.');
+	if (dir)
+		*dir++ = '\0';
+
+	/* 1.1 network */
+	if (strcmp(param, "default")) {
+		rule->sr_netid = libcfs_str2net(param);
+		if (rule->sr_netid == LNET_NET_ANY) {
+			CERROR("invalid network name: %s\n", param);
+			RETURN(-EINVAL);
+		}
+	}
+
+	/* 1.2 direction */
+	if (dir) {
+		if (!strcmp(dir, "mdt2ost")) {
+			rule->sr_from = LUSTRE_SP_MDT;
+			rule->sr_to = LUSTRE_SP_OST;
+		} else if (!strcmp(dir, "mdt2mdt")) {
+			rule->sr_from = LUSTRE_SP_MDT;
+			rule->sr_to = LUSTRE_SP_MDT;
+		} else if (!strcmp(dir, "cli2ost")) {
+			rule->sr_from = LUSTRE_SP_CLI;
+			rule->sr_to = LUSTRE_SP_OST;
+		} else if (!strcmp(dir, "cli2mdt")) {
+			rule->sr_from = LUSTRE_SP_CLI;
+			rule->sr_to = LUSTRE_SP_MDT;
+		} else {
+			CERROR("invalid rule dir segment: %s\n", dir);
+			RETURN(-EINVAL);
+		}
+	}
+
+	/* 2.1 flavor */
+	rc = sptlrpc_parse_flavor(flavor, &rule->sr_flvr);
+	if (rc)
+		RETURN(-EINVAL);
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(sptlrpc_parse_rule);
+
+void sptlrpc_rule_set_free(struct sptlrpc_rule_set *rset)
+{
+	LASSERT(rset->srs_nslot ||
+		(rset->srs_nrule == 0 && rset->srs_rules == NULL));
+
+	if (rset->srs_nslot) {
+		OBD_FREE_PTR_ARRAY(rset->srs_rules, rset->srs_nslot);
+		sptlrpc_rule_set_init(rset);
+	}
+}
+EXPORT_SYMBOL(sptlrpc_rule_set_free);
+
+/*
+ * return 0 if the rule set could accommodate one more rule.
+ */
+int sptlrpc_rule_set_expand(struct sptlrpc_rule_set *rset)
+{
+	struct sptlrpc_rule *rules;
+	int nslot;
+
+	might_sleep();
+
+	if (rset->srs_nrule < rset->srs_nslot)
+		return 0;
+
+	nslot = rset->srs_nslot + 8;
+
+	/* better use realloc() if available */
+	OBD_ALLOC_PTR_ARRAY(rules, nslot);
+	if (rules == NULL)
+		return -ENOMEM;
+
+	if (rset->srs_nrule) {
+		LASSERT(rset->srs_nslot && rset->srs_rules);
+		memcpy(rules, rset->srs_rules,
+		       rset->srs_nrule * sizeof(*rset->srs_rules));
+
+		OBD_FREE_PTR_ARRAY(rset->srs_rules, rset->srs_nslot);
+	}
+
+	rset->srs_rules = rules;
+	rset->srs_nslot = nslot;
+	return 0;
+}
+
+static inline int rule_spec_dir(struct sptlrpc_rule *rule)
+{
+	return (rule->sr_from != LUSTRE_SP_ANY ||
+		rule->sr_to != LUSTRE_SP_ANY);
+}
+static inline int rule_spec_net(struct sptlrpc_rule *rule)
+{
+	return (rule->sr_netid != LNET_NET_ANY);
+}
+static inline int rule_match_dir(struct sptlrpc_rule *r1,
+				 struct sptlrpc_rule *r2)
+{
+	return (r1->sr_from == r2->sr_from && r1->sr_to == r2->sr_to);
+}
+static inline int rule_match_net(struct sptlrpc_rule *r1,
+				 struct sptlrpc_rule *r2)
+{
+	return (r1->sr_netid == r2->sr_netid);
+}
+
+/*
+ * merge @rule into @rset.
+ * the @rset slots might be expanded.
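+ * a rule whose flavor is SPTLRPC_FLVR_INVALID is treated as a
+ * deletion of the matching rule rather than an override.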
+ */ +int sptlrpc_rule_set_merge(struct sptlrpc_rule_set *rset, + struct sptlrpc_rule *rule) +{ + struct sptlrpc_rule *p = rset->srs_rules; + int spec_dir, spec_net; + int rc, n, match = 0; + + might_sleep(); + + spec_net = rule_spec_net(rule); + spec_dir = rule_spec_dir(rule); + + for (n = 0; n < rset->srs_nrule; n++) { + p = &rset->srs_rules[n]; + + /* test network match, if failed: + * - spec rule: skip rules which is also spec rule match, until + * we hit a wild rule, which means no more chance + * - wild rule: skip until reach the one which is also wild + * and matches + */ + if (!rule_match_net(p, rule)) { + if (spec_net) { + if (rule_spec_net(p)) + continue; + else + break; + } else { + continue; + } + } + + /* test dir match, same logic as net matching */ + if (!rule_match_dir(p, rule)) { + if (spec_dir) { + if (rule_spec_dir(p)) + continue; + else + break; + } else { + continue; + } + } + + /* find a match */ + match = 1; + break; + } + + if (match) { + LASSERT(n >= 0 && n < rset->srs_nrule); + + if (rule->sr_flvr.sf_rpc == SPTLRPC_FLVR_INVALID) { + /* remove this rule */ + if (n < rset->srs_nrule - 1) + memmove(&rset->srs_rules[n], + &rset->srs_rules[n + 1], + (rset->srs_nrule - n - 1) * + sizeof(*rule)); + rset->srs_nrule--; + } else { + /* override the rule */ + memcpy(&rset->srs_rules[n], rule, sizeof(*rule)); + } + } else { + LASSERT(n >= 0 && n <= rset->srs_nrule); + + if (rule->sr_flvr.sf_rpc != SPTLRPC_FLVR_INVALID) { + rc = sptlrpc_rule_set_expand(rset); + if (rc) + return rc; + + if (n < rset->srs_nrule) + memmove(&rset->srs_rules[n + 1], + &rset->srs_rules[n], + (rset->srs_nrule - n) * sizeof(*rule)); + memcpy(&rset->srs_rules[n], rule, sizeof(*rule)); + rset->srs_nrule++; + } else { + CDEBUG(D_CONFIG, "ignore the unmatched deletion\n"); + } + } + + return 0; +} +EXPORT_SYMBOL(sptlrpc_rule_set_merge); + +/** + * given from/to/nid, determine a matching flavor in ruleset. + * return 1 if a match found, otherwise return 0. 
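+ * rules are scanned in order and the first match wins; set_merge
+ * keeps more specific rules ahead of wildcard ones.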
+ */ +int sptlrpc_rule_set_choose(struct sptlrpc_rule_set *rset, + enum lustre_sec_part from, + enum lustre_sec_part to, + lnet_nid_t nid, + struct sptlrpc_flavor *sf) +{ + struct sptlrpc_rule *r; + int n; + + for (n = 0; n < rset->srs_nrule; n++) { + r = &rset->srs_rules[n]; + + if (LNET_NIDNET(nid) != LNET_NET_ANY && + r->sr_netid != LNET_NET_ANY && + LNET_NIDNET(nid) != r->sr_netid) + continue; + + if (from != LUSTRE_SP_ANY && r->sr_from != LUSTRE_SP_ANY && + from != r->sr_from) + continue; + + if (to != LUSTRE_SP_ANY && r->sr_to != LUSTRE_SP_ANY && + to != r->sr_to) + continue; + + *sf = r->sr_flvr; + return 1; + } + + return 0; +} + +void sptlrpc_rule_set_dump(struct sptlrpc_rule_set *rset) +{ + struct sptlrpc_rule *r; + int n; + + for (n = 0; n < rset->srs_nrule; n++) { + r = &rset->srs_rules[n]; + CDEBUG(D_SEC, "<%02d> from %x to %x, net %x, rpc %x\n", n, + r->sr_from, r->sr_to, r->sr_netid, r->sr_flvr.sf_rpc); + } +} + +static int sptlrpc_rule_set_extract(struct sptlrpc_rule_set *gen, + struct sptlrpc_rule_set *tgt, + enum lustre_sec_part from, + enum lustre_sec_part to, + struct sptlrpc_rule_set *rset) +{ + struct sptlrpc_rule_set *src[2] = { gen, tgt }; + struct sptlrpc_rule *rule; + int i, n, rc; + + might_sleep(); + + /* merge general rules firstly, then target-specific rules */ + for (i = 0; i < 2; i++) { + if (src[i] == NULL) + continue; + + for (n = 0; n < src[i]->srs_nrule; n++) { + rule = &src[i]->srs_rules[n]; + + if (from != LUSTRE_SP_ANY && + rule->sr_from != LUSTRE_SP_ANY && + rule->sr_from != from) + continue; + if (to != LUSTRE_SP_ANY && + rule->sr_to != LUSTRE_SP_ANY && + rule->sr_to != to) + continue; + + rc = sptlrpc_rule_set_merge(rset, rule); + if (rc) { + CERROR("can't merge: %d\n", rc); + return rc; + } + } + } + + return 0; +} + +/********************************** + * sptlrpc configuration support * + **********************************/ + +struct sptlrpc_conf_tgt { + struct list_head sct_list; + char sct_name[MAX_OBD_NAME]; + struct sptlrpc_rule_set sct_rset; +}; + +struct sptlrpc_conf { + struct list_head sc_list; + char sc_fsname[MTI_NAME_MAXLEN]; + unsigned int sc_modified; /* modified during updating */ + unsigned int sc_updated:1, /* updated copy from MGS */ + sc_local:1; /* local copy from target */ + struct sptlrpc_rule_set sc_rset; /* fs general rules */ + struct list_head sc_tgts; /* target-specific rules */ +}; + +static struct mutex sptlrpc_conf_lock; +static LIST_HEAD(sptlrpc_confs); + +static void sptlrpc_conf_free_rsets(struct sptlrpc_conf *conf) +{ + struct sptlrpc_conf_tgt *conf_tgt, *conf_tgt_next; + + sptlrpc_rule_set_free(&conf->sc_rset); + + list_for_each_entry_safe(conf_tgt, conf_tgt_next, + &conf->sc_tgts, sct_list) { + sptlrpc_rule_set_free(&conf_tgt->sct_rset); + list_del(&conf_tgt->sct_list); + OBD_FREE_PTR(conf_tgt); + } + LASSERT(list_empty(&conf->sc_tgts)); + + conf->sc_updated = 0; + conf->sc_local = 0; +} + +static void sptlrpc_conf_free(struct sptlrpc_conf *conf) +{ + CDEBUG(D_SEC, "free sptlrpc conf %s\n", conf->sc_fsname); + + sptlrpc_conf_free_rsets(conf); + list_del(&conf->sc_list); + OBD_FREE_PTR(conf); +} + +static +struct sptlrpc_conf_tgt *sptlrpc_conf_get_tgt(struct sptlrpc_conf *conf, + const char *name, + int create) +{ + struct sptlrpc_conf_tgt *conf_tgt; + + list_for_each_entry(conf_tgt, &conf->sc_tgts, sct_list) { + if (strcmp(conf_tgt->sct_name, name) == 0) + return conf_tgt; + } + + if (!create) + return NULL; + + OBD_ALLOC_PTR(conf_tgt); + if (conf_tgt) { + strlcpy(conf_tgt->sct_name, name, 
+			sizeof(conf_tgt->sct_name));
+		sptlrpc_rule_set_init(&conf_tgt->sct_rset);
+		list_add(&conf_tgt->sct_list, &conf->sc_tgts);
+	}
+
+	return conf_tgt;
+}
+
+static
+struct sptlrpc_conf *sptlrpc_conf_get(const char *fsname,
+				      int create)
+{
+	struct sptlrpc_conf *conf;
+
+	list_for_each_entry(conf, &sptlrpc_confs, sc_list) {
+		if (strcmp(conf->sc_fsname, fsname) == 0)
+			return conf;
+	}
+
+	if (!create)
+		return NULL;
+
+	OBD_ALLOC_PTR(conf);
+	if (conf == NULL)
+		return NULL;
+
+	if (strlcpy(conf->sc_fsname, fsname, sizeof(conf->sc_fsname)) >=
+	    sizeof(conf->sc_fsname)) {
+		OBD_FREE_PTR(conf);
+		return NULL;
+	}
+	sptlrpc_rule_set_init(&conf->sc_rset);
+	INIT_LIST_HEAD(&conf->sc_tgts);
+	list_add(&conf->sc_list, &sptlrpc_confs);
+
+	CDEBUG(D_SEC, "create sptlrpc conf %s\n", conf->sc_fsname);
+	return conf;
+}
+
+/**
+ * caller must hold conf_lock already.
+ */
+static int sptlrpc_conf_merge_rule(struct sptlrpc_conf *conf,
+				   const char *target,
+				   struct sptlrpc_rule *rule)
+{
+	struct sptlrpc_conf_tgt *conf_tgt;
+	struct sptlrpc_rule_set *rule_set;
+
+	/* fsname == target means general rules for the whole fs */
+	if (strcmp(conf->sc_fsname, target) == 0) {
+		rule_set = &conf->sc_rset;
+	} else {
+		conf_tgt = sptlrpc_conf_get_tgt(conf, target, 1);
+		if (conf_tgt) {
+			rule_set = &conf_tgt->sct_rset;
+		} else {
+			CERROR("out of memory, can't merge rule!\n");
+			return -ENOMEM;
+		}
+	}
+
+	return sptlrpc_rule_set_merge(rule_set, rule);
+}
+
+/**
+ * process one LCFG_SPTLRPC_CONF record. if \a conf is NULL, we
+ * find one through the target name in the record inside conf_lock;
+ * otherwise means caller already hold conf_lock.
+ */
+static int __sptlrpc_process_config(char *target, const char *fsname,
+				    struct sptlrpc_rule *rule,
+				    struct sptlrpc_conf *conf)
+{
+	int rc;
+
+	ENTRY;
+	if (!conf) {
+		if (!fsname)
+			return -ENODEV;
+
+		mutex_lock(&sptlrpc_conf_lock);
+		conf = sptlrpc_conf_get(fsname, 0);
+		if (!conf) {
+			CERROR("can't find conf\n");
+			rc = -ENOMEM;
+		} else {
+			rc = sptlrpc_conf_merge_rule(conf, target, rule);
+		}
+		mutex_unlock(&sptlrpc_conf_lock);
+	} else {
+		LASSERT(mutex_is_locked(&sptlrpc_conf_lock));
+		rc = sptlrpc_conf_merge_rule(conf, target, rule);
+	}
+
+	if (!rc)
+		conf->sc_modified++;
+
+	RETURN(rc);
+}
+
+int sptlrpc_process_config(struct lustre_cfg *lcfg)
+{
+	char fsname[MTI_NAME_MAXLEN];
+	struct sptlrpc_rule rule;
+	char *target, *param;
+	int rc;
+
+	print_lustre_cfg(lcfg);
+
+	target = lustre_cfg_string(lcfg, 1);
+	if (!target) {
+		CERROR("missing target name\n");
+		return -EINVAL;
+	}
+
+	param = lustre_cfg_string(lcfg, 2);
+	if (!param) {
+		CERROR("missing parameter\n");
+		return -EINVAL;
+	}
+
+	/* parse rule to make sure the format is correct */
+	if (strncmp(param, PARAM_SRPC_FLVR,
+		    sizeof(PARAM_SRPC_FLVR) - 1) != 0) {
+		CERROR("Invalid sptlrpc parameter: %s\n", param);
+		return -EINVAL;
+	}
+	param += sizeof(PARAM_SRPC_FLVR) - 1;
+
+	CDEBUG(D_SEC, "processing rule: %s.%s\n", target, param);
+
+	/*
+	 * Three types of targets exist for sptlrpc using conf_param
+	 * 1. '_mgs' which targets mgc srpc settings. Treat it as
+	 *    a special file system name.
+	 * 2. target is a device which can be fsname-MDTXXXX or
+	 *    fsname-OSTXXXX. This can be verified by the function
+	 *    server_name2fsname.
+	 * 3. If both above conditions are not met then the target
+	 *    is an actual filesystem.
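+	 * in the third case the target name itself is taken as the fsname.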
+	 */
+	if (server_name2fsname(target, fsname, NULL))
+		strlcpy(fsname, target, sizeof(fsname));
+
+	rc = sptlrpc_parse_rule(param, &rule);
+	if (rc)
+		return rc;
+
+	return __sptlrpc_process_config(target, fsname, &rule, NULL);
+}
+EXPORT_SYMBOL(sptlrpc_process_config);
+
+static int logname2fsname(const char *logname, char *buf, int buflen)
+{
+	char *ptr;
+	int len;
+
+	ptr = strrchr(logname, '-');
+	if (ptr == NULL || strcmp(ptr, "-sptlrpc")) {
+		CERROR("%s is not a sptlrpc config log\n", logname);
+		return -EINVAL;
+	}
+
+	len = min((int) (ptr - logname), buflen - 1);
+
+	memcpy(buf, logname, len);
+	buf[len] = '\0';
+	return 0;
+}
+
+void sptlrpc_conf_log_update_begin(const char *logname)
+{
+	struct sptlrpc_conf *conf;
+	char fsname[16];
+
+	if (logname2fsname(logname, fsname, sizeof(fsname)))
+		return;
+
+	mutex_lock(&sptlrpc_conf_lock);
+
+	conf = sptlrpc_conf_get(fsname, 0);
+	if (conf) {
+		if (conf->sc_local) {
+			LASSERT(conf->sc_updated == 0);
+			sptlrpc_conf_free_rsets(conf);
+		}
+		conf->sc_modified = 0;
+	}
+
+	mutex_unlock(&sptlrpc_conf_lock);
+}
+EXPORT_SYMBOL(sptlrpc_conf_log_update_begin);
+
+/**
+ * mark a config log has been updated
+ */
+void sptlrpc_conf_log_update_end(const char *logname)
+{
+	struct sptlrpc_conf *conf;
+	char fsname[16];
+
+	if (logname2fsname(logname, fsname, sizeof(fsname)))
+		return;
+
+	mutex_lock(&sptlrpc_conf_lock);
+
+	conf = sptlrpc_conf_get(fsname, 0);
+	if (conf) {
+		/*
+		 * if original state is not updated, make sure the
+		 * modified counter > 0 to enforce updating local copy.
+		 */
+		if (conf->sc_updated == 0)
+			conf->sc_modified++;
+
+		conf->sc_updated = 1;
+	}
+
+	mutex_unlock(&sptlrpc_conf_lock);
+}
+EXPORT_SYMBOL(sptlrpc_conf_log_update_end);
+
+void sptlrpc_conf_log_start(const char *logname)
+{
+	char fsname[16];
+
+	if (logname2fsname(logname, fsname, sizeof(fsname)))
+		return;
+
+	mutex_lock(&sptlrpc_conf_lock);
+	sptlrpc_conf_get(fsname, 1);
+	mutex_unlock(&sptlrpc_conf_lock);
+}
+EXPORT_SYMBOL(sptlrpc_conf_log_start);
+
+void sptlrpc_conf_log_stop(const char *logname)
+{
+	struct sptlrpc_conf *conf;
+	char fsname[16];
+
+	if (logname2fsname(logname, fsname, sizeof(fsname)))
+		return;
+
+	mutex_lock(&sptlrpc_conf_lock);
+	conf = sptlrpc_conf_get(fsname, 0);
+	if (conf)
+		sptlrpc_conf_free(conf);
+	mutex_unlock(&sptlrpc_conf_lock);
+}
+EXPORT_SYMBOL(sptlrpc_conf_log_stop);
+
+static void inline flavor_set_flags(struct sptlrpc_flavor *sf,
+				    enum lustre_sec_part from,
+				    enum lustre_sec_part to,
+				    unsigned int fl_udesc)
+{
+	/*
+	 * null flavor doesn't need to set any flavor, and in fact
+	 * we'd better not do that because everybody shares a single sec.
+ */ + if (sf->sf_rpc == SPTLRPC_FLVR_NULL) + return; + + if (from == LUSTRE_SP_MDT) { + /* MDT->MDT; MDT->OST */ + sf->sf_flags |= PTLRPC_SEC_FL_ROOTONLY; + } else if (from == LUSTRE_SP_CLI && to == LUSTRE_SP_OST) { + /* CLI->OST */ + sf->sf_flags |= PTLRPC_SEC_FL_ROOTONLY | PTLRPC_SEC_FL_BULK; + } else if (from == LUSTRE_SP_CLI && to == LUSTRE_SP_MDT) { + /* CLI->MDT */ + if (fl_udesc && sf->sf_rpc != SPTLRPC_FLVR_NULL) + sf->sf_flags |= PTLRPC_SEC_FL_UDESC; + } + + /* Some flavors use a single uid (0) context */ + if (flvr_is_rootonly(sf->sf_rpc)) + sf->sf_flags |= PTLRPC_SEC_FL_ROOTONLY; + + /* User descriptor might need to be cleared */ + if (flvr_allows_user_desc(sf->sf_rpc) == 0) + sf->sf_flags &= ~PTLRPC_SEC_FL_UDESC; +} + +void sptlrpc_conf_choose_flavor(enum lustre_sec_part from, + enum lustre_sec_part to, + struct obd_uuid *target, + struct lnet_nid *nid, + struct sptlrpc_flavor *sf) +{ + struct sptlrpc_conf *conf; + struct sptlrpc_conf_tgt *conf_tgt; + char name[MTI_NAME_MAXLEN]; + int len, rc = 0; + + obd_uuid2fsname(name, target->uuid, sizeof(name)); + + mutex_lock(&sptlrpc_conf_lock); + + conf = sptlrpc_conf_get(name, 0); + if (conf == NULL) + goto out; + + /* convert uuid name (supposed end with _UUID) to target name */ + len = strlen(target->uuid); + LASSERT(len > 5); + memcpy(name, target->uuid, len - 5); + name[len - 5] = '\0'; + + conf_tgt = sptlrpc_conf_get_tgt(conf, name, 0); + if (conf_tgt) { + rc = sptlrpc_rule_set_choose(&conf_tgt->sct_rset, + from, to, lnet_nid_to_nid4(nid), sf); + if (rc) + goto out; + } + + rc = sptlrpc_rule_set_choose(&conf->sc_rset, from, to, + lnet_nid_to_nid4(nid), sf); +out: + mutex_unlock(&sptlrpc_conf_lock); + + if (rc == 0) + get_default_flavor(sf); + + flavor_set_flags(sf, from, to, 1); +} + +/** + * called by target devices, determine the expected flavor from + * certain peer (from, nid). + */ +void sptlrpc_target_choose_flavor(struct sptlrpc_rule_set *rset, + enum lustre_sec_part from, + lnet_nid_t nid, + struct sptlrpc_flavor *sf) +{ + if (sptlrpc_rule_set_choose(rset, from, LUSTRE_SP_ANY, nid, sf) == 0) + get_default_flavor(sf); +} + +#define SEC_ADAPT_DELAY (10) + +/** + * called by client devices, notify the sptlrpc config has changed and + * do import_sec_adapt later. + */ +void sptlrpc_conf_client_adapt(struct obd_device *obd) +{ + struct obd_import *imp; + int rc; + ENTRY; + + LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 || + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) == 0 || + strcmp(obd->obd_type->typ_name, LUSTRE_OSP_NAME) == 0 || + strcmp(obd->obd_type->typ_name, LUSTRE_LWP_NAME) == 0); + CDEBUG(D_SEC, "obd %s\n", obd->u.cli.cl_target_uuid.uuid); + + /* serialize with connect/disconnect import */ + with_imp_locked_nested(obd, imp, rc, OBD_CLI_SEM_MDCOSC) { + write_lock(&imp->imp_sec_lock); + if (imp->imp_sec) + imp->imp_sec_expire = ktime_get_real_seconds() + + SEC_ADAPT_DELAY; + write_unlock(&imp->imp_sec_lock); + } + + EXIT; +} +EXPORT_SYMBOL(sptlrpc_conf_client_adapt); + +/** + * called by target devices, extract sptlrpc rules which applies to + * this target, to be used for future rpc flavor checking. 
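+ * fs-general rules are merged first and target-specific rules after,
+ * so per-target settings override the general ones on conflict.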
+ */ +int sptlrpc_conf_target_get_rules(struct obd_device *obd, + struct sptlrpc_rule_set *rset) +{ + struct sptlrpc_conf *conf; + struct sptlrpc_conf_tgt *conf_tgt; + enum lustre_sec_part sp_dst; + char fsname[MTI_NAME_MAXLEN]; + int rc = 0; + ENTRY; + + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDT_NAME) == 0) { + sp_dst = LUSTRE_SP_MDT; + } else if (strcmp(obd->obd_type->typ_name, LUSTRE_OST_NAME) == 0) { + sp_dst = LUSTRE_SP_OST; + } else { + CERROR("unexpected obd type %s\n", obd->obd_type->typ_name); + RETURN(-EINVAL); + } + + obd_uuid2fsname(fsname, obd->obd_uuid.uuid, sizeof(fsname)); + + mutex_lock(&sptlrpc_conf_lock); + conf = sptlrpc_conf_get(fsname, 0); + if (conf == NULL) { + CERROR("missing sptlrpc config log\n"); + rc = -EFAULT; + } else { + /* extract rule set for this target */ + conf_tgt = sptlrpc_conf_get_tgt(conf, obd->obd_name, 0); + + rc = sptlrpc_rule_set_extract(&conf->sc_rset, + conf_tgt ? &conf_tgt->sct_rset : NULL, + LUSTRE_SP_ANY, sp_dst, rset); + } + mutex_unlock(&sptlrpc_conf_lock); + + RETURN(rc); +} + +int sptlrpc_conf_init(void) +{ + mutex_init(&sptlrpc_conf_lock); + return 0; +} + +void sptlrpc_conf_fini(void) +{ + struct sptlrpc_conf *conf, *conf_next; + + mutex_lock(&sptlrpc_conf_lock); + list_for_each_entry_safe(conf, conf_next, &sptlrpc_confs, sc_list) + sptlrpc_conf_free(conf); + LASSERT(list_empty(&sptlrpc_confs)); + mutex_unlock(&sptlrpc_conf_lock); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c new file mode 100644 index 0000000000000..a5de86426a86c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c @@ -0,0 +1,110 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2014, 2017, Intel Corporation. 
+ */
+
+#define DEBUG_SUBSYSTEM S_FILTER
+
+#include
+#include
+#include
+#include
+#include
+
+#include "ptlrpc_internal.h"
+
+/* refine later and change to seqlock or similar from libcfs */
+/* Debugging check only needed during development */
+#ifdef OBD_CTXT_DEBUG
+# define ASSERT_CTXT_MAGIC(magic) LASSERT((magic) == OBD_RUN_CTXT_MAGIC)
+#else
+# define ASSERT_CTXT_MAGIC(magic) do {} while (0)
+#endif
+
+static inline void ll_set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt,
+				 struct dentry *dentry)
+{
+	struct path path;
+	struct path old_pwd;
+
+	path.mnt = mnt;
+	path.dentry = dentry;
+	path_get(&path);
+	spin_lock(&fs->lock);
+	write_seqcount_begin(&fs->seq);
+	old_pwd = fs->pwd;
+	fs->pwd = path;
+	write_seqcount_end(&fs->seq);
+	spin_unlock(&fs->lock);
+
+	if (old_pwd.dentry)
+		path_put(&old_pwd);
+}
+
+/* push / pop to root of obd store */
+void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx)
+{
+	/* if there is an underlying dt_device then push_ctxt is not needed */
+	if (new_ctx->dt != NULL)
+		return;
+
+	ASSERT_CTXT_MAGIC(new_ctx->magic);
+	OBD_SET_CTXT_MAGIC(save);
+
+	LASSERT(ll_d_count(current->fs->pwd.dentry));
+	LASSERT(ll_d_count(new_ctx->pwd));
+	save->pwd = dget(current->fs->pwd.dentry);
+	save->pwdmnt = mntget(current->fs->pwd.mnt);
+	save->umask = current_umask();
+
+	LASSERT(save->pwd);
+	LASSERT(save->pwdmnt);
+	LASSERT(new_ctx->pwd);
+	LASSERT(new_ctx->pwdmnt);
+
+	current->fs->umask = 0; /* umask already applied on client */
+	ll_set_fs_pwd(current->fs, new_ctx->pwdmnt, new_ctx->pwd);
+}
+EXPORT_SYMBOL(push_ctxt);
+
+void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx)
+{
+	/* if there is an underlying dt_device then pop_ctxt is not needed */
+	if (new_ctx->dt != NULL)
+		return;
+
+	ASSERT_CTXT_MAGIC(saved->magic);
+
+	LASSERTF(current->fs->pwd.dentry == new_ctx->pwd, "%p != %p\n",
+		 current->fs->pwd.dentry, new_ctx->pwd);
+	LASSERTF(current->fs->pwd.mnt == new_ctx->pwdmnt, "%p != %p\n",
+		 current->fs->pwd.mnt, new_ctx->pwdmnt);
+
+	ll_set_fs_pwd(current->fs, saved->pwdmnt, saved->pwd);
+
+	dput(saved->pwd);
+	mntput(saved->pwdmnt);
+	current->fs->umask = saved->umask;
+}
+EXPORT_SYMBOL(pop_ctxt);
diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_gc.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_gc.c
new file mode 100644
index 0000000000000..c056aa4b97480
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_gc.c
@@ -0,0 +1,198 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2016, Intel Corporation.
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ptlrpc/sec_gc.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include +#include + +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +#define SEC_GC_INTERVAL (30 * 60) + +static DEFINE_MUTEX(sec_gc_mutex); +static DEFINE_SPINLOCK(sec_gc_list_lock); +static DEFINE_SPINLOCK(sec_gc_ctx_list_lock); +static LIST_HEAD(sec_gc_list); +static LIST_HEAD(sec_gc_ctx_list); + +static atomic_t sec_gc_wait_del = ATOMIC_INIT(0); + +void sptlrpc_gc_add_sec(struct ptlrpc_sec *sec) +{ + LASSERT(sec->ps_policy->sp_cops->gc_ctx); + LASSERT(sec->ps_gc_interval > 0); + LASSERT(list_empty(&sec->ps_gc_list)); + + sec->ps_gc_next = ktime_get_real_seconds() + sec->ps_gc_interval; + + spin_lock(&sec_gc_list_lock); + list_add_tail(&sec->ps_gc_list, &sec_gc_list); + spin_unlock(&sec_gc_list_lock); + + CDEBUG(D_SEC, "added sec %p(%s)\n", sec, sec->ps_policy->sp_name); +} + +void sptlrpc_gc_del_sec(struct ptlrpc_sec *sec) +{ + if (list_empty(&sec->ps_gc_list)) + return; + + /* signal before list_del to make iteration in gc thread safe */ + atomic_inc(&sec_gc_wait_del); + + spin_lock(&sec_gc_list_lock); + list_del_init(&sec->ps_gc_list); + spin_unlock(&sec_gc_list_lock); + + /* barrier */ + mutex_lock(&sec_gc_mutex); + mutex_unlock(&sec_gc_mutex); + + atomic_dec(&sec_gc_wait_del); + + CDEBUG(D_SEC, "del sec %p(%s)\n", sec, sec->ps_policy->sp_name); +} + +static void sec_gc_main(struct work_struct *ws); +static DECLARE_DELAYED_WORK(sec_gc_work, sec_gc_main); + +void sptlrpc_gc_add_ctx(struct ptlrpc_cli_ctx *ctx) +{ + LASSERT(list_empty(&ctx->cc_gc_chain)); + + CDEBUG(D_SEC, "hand over ctx %p(%u->%s)\n", + ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec)); + spin_lock(&sec_gc_ctx_list_lock); + list_add(&ctx->cc_gc_chain, &sec_gc_ctx_list); + spin_unlock(&sec_gc_ctx_list_lock); + + mod_delayed_work(system_wq, &sec_gc_work, 0); +} +EXPORT_SYMBOL(sptlrpc_gc_add_ctx); + +static void sec_process_ctx_list(void) +{ + struct ptlrpc_cli_ctx *ctx; + + spin_lock(&sec_gc_ctx_list_lock); + + while ((ctx = list_first_entry_or_null(&sec_gc_ctx_list, + struct ptlrpc_cli_ctx, + cc_gc_chain)) != NULL) { + list_del_init(&ctx->cc_gc_chain); + spin_unlock(&sec_gc_ctx_list_lock); + + LASSERT(ctx->cc_sec); + LASSERT(atomic_read(&ctx->cc_refcount) == 1); + CDEBUG(D_SEC, "gc pick up ctx %p(%u->%s)\n", + ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec)); + sptlrpc_cli_ctx_put(ctx, 1); + + spin_lock(&sec_gc_ctx_list_lock); + } + + spin_unlock(&sec_gc_ctx_list_lock); +} + +static void sec_do_gc(struct ptlrpc_sec *sec) +{ + LASSERT(sec->ps_policy->sp_cops->gc_ctx); + + if (unlikely(sec->ps_gc_next == 0)) { + CDEBUG(D_SEC, "sec %p(%s) has 0 gc time\n", + sec, sec->ps_policy->sp_name); + return; + } + + CDEBUG(D_SEC, "check on sec %p(%s)\n", sec, sec->ps_policy->sp_name); + + if (sec->ps_gc_next > ktime_get_real_seconds()) + return; + + sec->ps_policy->sp_cops->gc_ctx(sec); + sec->ps_gc_next = ktime_get_real_seconds() + sec->ps_gc_interval; +} + +static void sec_gc_main(struct work_struct *ws) +{ + struct ptlrpc_sec *sec; + + sec_process_ctx_list(); +again: + /* + * go through sec list do gc. + * FIXME here we iterate through the whole list each time which + * is not optimal. we perhaps want to use balanced binary tree + * to trace each sec as order of expiry time. 
+ * another issue here is we wakeup as fixed interval instead of + * according to each sec's expiry time + */ + mutex_lock(&sec_gc_mutex); + list_for_each_entry(sec, &sec_gc_list, ps_gc_list) { + /* + * if someone is waiting to be deleted, let it + * proceed as soon as possible. + */ + if (atomic_read(&sec_gc_wait_del)) { + CDEBUG(D_SEC, "deletion pending, start over\n"); + mutex_unlock(&sec_gc_mutex); + goto again; + } + + sec_do_gc(sec); + } + mutex_unlock(&sec_gc_mutex); + + /* check ctx list again before sleep */ + sec_process_ctx_list(); + schedule_delayed_work(&sec_gc_work, cfs_time_seconds(SEC_GC_INTERVAL)); +} + +int sptlrpc_gc_init(void) +{ + schedule_delayed_work(&sec_gc_work, cfs_time_seconds(SEC_GC_INTERVAL)); + return 0; +} + +void sptlrpc_gc_fini(void) +{ + cancel_delayed_work_sync(&sec_gc_work); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_lproc.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_lproc.c new file mode 100644 index 0000000000000..fb25a28700b05 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_lproc.c @@ -0,0 +1,381 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, Intel Corporation. 
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lustre/ptlrpc/sec_lproc.c
+ *
+ * Author: Eric Mei
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "ptlrpc_internal.h"
+
+static char *sec_flags2str(unsigned long flags, char *buf, int bufsize)
+{
+ buf[0] = '\0';
+
+ if (flags & PTLRPC_SEC_FL_REVERSE)
+ strlcat(buf, "reverse,", bufsize);
+ if (flags & PTLRPC_SEC_FL_ROOTONLY)
+ strlcat(buf, "rootonly,", bufsize);
+ if (flags & PTLRPC_SEC_FL_UDESC)
+ strlcat(buf, "udesc,", bufsize);
+ if (flags & PTLRPC_SEC_FL_BULK)
+ strlcat(buf, "bulk,", bufsize);
+ if (buf[0] == '\0')
+ strlcat(buf, "-,", bufsize);
+
+ return buf;
+}
+
+static int sptlrpc_info_lprocfs_seq_show(struct seq_file *seq, void *v)
+{
+ struct obd_device *obd = seq->private;
+ struct client_obd *cli = &obd->u.cli;
+ struct ptlrpc_sec *sec = NULL;
+ char str[32];
+
+ LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) == 0 ||
+ strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 ||
+ strcmp(obd->obd_type->typ_name, LUSTRE_MGC_NAME) == 0 ||
+ strcmp(obd->obd_type->typ_name, LUSTRE_LWP_NAME) == 0 ||
+ strcmp(obd->obd_type->typ_name, LUSTRE_OSP_NAME) == 0);
+
+ if (cli->cl_import)
+ sec = sptlrpc_import_sec_ref(cli->cl_import);
+ if (sec == NULL)
+ goto out;
+
+ seq_printf(seq, "rpc flavor: %s\n",
+ sptlrpc_flavor2name_base(sec->ps_flvr.sf_rpc));
+ seq_printf(seq, "bulk flavor: %s\n",
+ sptlrpc_flavor2name_bulk(&sec->ps_flvr, str, sizeof(str)));
+ seq_printf(seq, "flags: %s\n",
+ sec_flags2str(sec->ps_flvr.sf_flags, str, sizeof(str)));
+ seq_printf(seq, "id: %d\n", sec->ps_id);
+ seq_printf(seq, "refcount: %d\n",
+ atomic_read(&sec->ps_refcount));
+ seq_printf(seq, "nctx: %d\n", atomic_read(&sec->ps_nctx));
+ seq_printf(seq, "gc interval %lld\n", sec->ps_gc_interval);
+ seq_printf(seq, "gc next %lld\n",
+ sec->ps_gc_interval ?
+ (s64)(sec->ps_gc_next - ktime_get_real_seconds()) : 0ll); + + sptlrpc_sec_put(sec); +out: + return 0; +} + +LDEBUGFS_SEQ_FOPS_RO(sptlrpc_info_lprocfs); + +static int sptlrpc_ctxs_lprocfs_seq_show(struct seq_file *seq, void *v) +{ + struct obd_device *obd = seq->private; + struct client_obd *cli = &obd->u.cli; + struct ptlrpc_sec *sec = NULL; + + LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) == 0 || + strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 || + strcmp(obd->obd_type->typ_name, LUSTRE_MGC_NAME) == 0 || + strcmp(obd->obd_type->typ_name, LUSTRE_LWP_NAME) == 0 || + strcmp(obd->obd_type->typ_name, LUSTRE_OSP_NAME) == 0); + + if (cli->cl_import) + sec = sptlrpc_import_sec_ref(cli->cl_import); + if (sec == NULL) + goto out; + + if (sec->ps_policy->sp_cops->display) + sec->ps_policy->sp_cops->display(sec, seq); + + sptlrpc_sec_put(sec); +out: + return 0; +} + +LDEBUGFS_SEQ_FOPS_RO(sptlrpc_ctxs_lprocfs); + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 16, 53, 0) +static ssize_t sepol_seq_write_old(struct obd_device *obd, + const char __user *buffer, + size_t count) +{ + struct client_obd *cli = &obd->u.cli; + struct obd_import *imp = cli->cl_import; + struct sepol_downcall_data_old *param; + int size = sizeof(*param); + __u16 len; + int rc = 0; + + if (count < size) { + rc = -EINVAL; + CERROR("%s: invalid data count = %lu, size = %d: rc = %d\n", + obd->obd_name, (unsigned long) count, size, rc); + return rc; + } + + OBD_ALLOC(param, size); + if (param == NULL) + return -ENOMEM; + + if (copy_from_user(param, buffer, size)) { + rc = -EFAULT; + CERROR("%s: bad sepol data: rc = %d\n", obd->obd_name, rc); + GOTO(out, rc); + } + + if (param->sdd_magic != SEPOL_DOWNCALL_MAGIC_OLD) { + rc = -EINVAL; + CERROR("%s: sepol downcall bad params: rc = %d\n", + obd->obd_name, rc); + GOTO(out, rc); + } + + if (param->sdd_sepol_len == 0 || + param->sdd_sepol_len >= sizeof(imp->imp_sec->ps_sepol)) { + rc = -EINVAL; + CERROR("%s: invalid sepol data returned: rc = %d\n", + obd->obd_name, rc); + GOTO(out, rc); + } + len = param->sdd_sepol_len; /* save sdd_sepol_len */ + OBD_FREE(param, size); + size = offsetof(struct sepol_downcall_data_old, + sdd_sepol[len]); + + if (count < size) { + rc = -EINVAL; + CERROR("%s: invalid sepol count = %lu, size = %d: rc = %d\n", + obd->obd_name, (unsigned long) count, size, rc); + return rc; + } + + /* alloc again with real size */ + OBD_ALLOC(param, size); + if (param == NULL) + return -ENOMEM; + + if (copy_from_user(param, buffer, size)) { + rc = -EFAULT; + CERROR("%s: cannot copy sepol data: rc = %d\n", + obd->obd_name, rc); + GOTO(out, rc); + } + + spin_lock(&imp->imp_sec->ps_lock); + snprintf(imp->imp_sec->ps_sepol, param->sdd_sepol_len + 1, "%s", + param->sdd_sepol); + imp->imp_sec->ps_sepol_mtime = ktime_set(param->sdd_sepol_mtime, 0); + spin_unlock(&imp->imp_sec->ps_lock); + +out: + if (param != NULL) + OBD_FREE(param, size); + + return rc ? 
rc : count; +} +#endif + +static ssize_t +ldebugfs_sptlrpc_sepol_seq_write(struct file *file, const char __user *buffer, + size_t count, void *data) +{ + struct seq_file *seq = file->private_data; + struct obd_device *obd = seq->private; + struct client_obd *cli = &obd->u.cli; + struct obd_import *imp = cli->cl_import; + struct sepol_downcall_data *param; + __u32 magic; + int size = sizeof(magic); + __u16 len; + int rc = 0; + + if (count < size) { + rc = -EINVAL; + CERROR("%s: invalid buffer count = %lu, size = %d: rc = %d\n", + obd->obd_name, (unsigned long) count, size, rc); + return rc; + } + + if (copy_from_user(&magic, buffer, size)) { + rc = -EFAULT; + CERROR("%s: bad sepol magic: rc = %d\n", obd->obd_name, rc); + return rc; + } + + if (magic != SEPOL_DOWNCALL_MAGIC) { +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 16, 53, 0) + if (magic == SEPOL_DOWNCALL_MAGIC_OLD) { + return sepol_seq_write_old(obd, buffer, count); + } +#endif + rc = -EINVAL; + CERROR("%s: sepol downcall bad magic '%#08x': rc = %d\n", + obd->obd_name, magic, rc); + return rc; + } + + size = sizeof(*param); + if (count < size) { + rc = -EINVAL; + CERROR("%s: invalid data count = %lu, size = %d: rc = %d\n", + obd->obd_name, (unsigned long) count, size, rc); + return rc; + } + + OBD_ALLOC(param, size); + if (param == NULL) + return -ENOMEM; + + if (copy_from_user(param, buffer, size)) { + rc = -EFAULT; + CERROR("%s: bad sepol data: rc = %d\n", obd->obd_name, rc); + GOTO(out, rc); + } + + if (param->sdd_sepol_len == 0 || + param->sdd_sepol_len >= sizeof(imp->imp_sec->ps_sepol)) { + rc = -EINVAL; + CERROR("%s: invalid sepol data returned: rc = %d\n", + obd->obd_name, rc); + GOTO(out, rc); + } + len = param->sdd_sepol_len; /* save sdd_sepol_len */ + OBD_FREE(param, size); + size = offsetof(struct sepol_downcall_data, + sdd_sepol[len]); + + /* alloc again with real size */ + OBD_ALLOC(param, size); + if (param == NULL) + return -ENOMEM; + + if (copy_from_user(param, buffer, size)) { + rc = -EFAULT; + CERROR("%s: cannot copy sepol data: rc = %d\n", + obd->obd_name, rc); + GOTO(out, rc); + } + + spin_lock(&imp->imp_sec->ps_lock); + snprintf(imp->imp_sec->ps_sepol, param->sdd_sepol_len + 1, "%s", + param->sdd_sepol); + imp->imp_sec->ps_sepol_mtime = ktime_set(param->sdd_sepol_mtime, 0); + spin_unlock(&imp->imp_sec->ps_lock); + +out: + if (param != NULL) + OBD_FREE(param, size); + + return rc ? 
rc : count;
+}
+LDEBUGFS_FOPS_WR_ONLY(srpc, sptlrpc_sepol);
+
+int sptlrpc_lprocfs_cliobd_attach(struct obd_device *obd)
+{
+ if (strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) != 0 &&
+ strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) != 0 &&
+ strcmp(obd->obd_type->typ_name, LUSTRE_MGC_NAME) != 0 &&
+ strcmp(obd->obd_type->typ_name, LUSTRE_LWP_NAME) != 0 &&
+ strcmp(obd->obd_type->typ_name, LUSTRE_OSP_NAME) != 0) {
+ CERROR("can't register lproc for obd type %s\n",
+ obd->obd_type->typ_name);
+ return -EINVAL;
+ }
+
+ debugfs_create_file("srpc_info", 0444, obd->obd_debugfs_entry, obd,
+ &sptlrpc_info_lprocfs_fops);
+
+ debugfs_create_file("srpc_contexts", 0444, obd->obd_debugfs_entry, obd,
+ &sptlrpc_ctxs_lprocfs_fops);
+
+ debugfs_create_file("srpc_sepol", 0200, obd->obd_debugfs_entry, obd,
+ &srpc_sptlrpc_sepol_fops);
+
+ return 0;
+}
+EXPORT_SYMBOL(sptlrpc_lprocfs_cliobd_attach);
+
+LDEBUGFS_SEQ_FOPS_RO(sptlrpc_proc_enc_pool);
+
+static struct ldebugfs_vars sptlrpc_lprocfs_vars[] = {
+ { .name = "encrypt_page_pools",
+ .fops = &sptlrpc_proc_enc_pool_fops },
+ { NULL }
+};
+
+struct dentry *sptlrpc_debugfs_dir;
+EXPORT_SYMBOL(sptlrpc_debugfs_dir);
+
+struct proc_dir_entry *sptlrpc_lprocfs_dir;
+EXPORT_SYMBOL(sptlrpc_lprocfs_dir);
+
+int sptlrpc_lproc_init(void)
+{
+ int rc;
+
+ LASSERT(sptlrpc_debugfs_dir == NULL);
+
+ sptlrpc_debugfs_dir = debugfs_create_dir("sptlrpc",
+ debugfs_lustre_root);
+ ldebugfs_add_vars(sptlrpc_debugfs_dir, sptlrpc_lprocfs_vars, NULL);
+
+ sptlrpc_lprocfs_dir = lprocfs_register("sptlrpc", proc_lustre_root,
+ NULL, NULL);
+ if (IS_ERR_OR_NULL(sptlrpc_lprocfs_dir)) {
+ rc = sptlrpc_lprocfs_dir ? PTR_ERR(sptlrpc_lprocfs_dir)
+ : -ENOMEM;
+ sptlrpc_lprocfs_dir = NULL;
+ return rc;
+ }
+ return 0;
+}
+
+void sptlrpc_lproc_fini(void)
+{
+ debugfs_remove_recursive(sptlrpc_debugfs_dir);
+ sptlrpc_debugfs_dir = NULL;
+
+ if (!IS_ERR_OR_NULL(sptlrpc_lprocfs_dir))
+ lprocfs_remove(&sptlrpc_lprocfs_dir);
+}
diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_null.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_null.c
new file mode 100644
index 0000000000000..4fb3a092c634a
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_null.c
@@ -0,0 +1,451 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2014, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lustre/ptlrpc/sec_null.c
+ *
+ * Author: Eric Mei
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "ptlrpc_internal.h"
+
+static struct ptlrpc_sec_policy null_policy;
+static struct ptlrpc_sec null_sec;
+static struct ptlrpc_cli_ctx null_cli_ctx;
+static struct ptlrpc_svc_ctx null_svc_ctx;
+
+/*
+ * we can temporarily use the topmost 8-bits of lm_secflvr to identify
+ * the source sec part.
+ */
+static inline
+void null_encode_sec_part(struct lustre_msg *msg, enum lustre_sec_part sp)
+{
+ msg->lm_secflvr |= (((__u32) sp) & 0xFF) << 24;
+}
+
+static inline
+enum lustre_sec_part null_decode_sec_part(struct lustre_msg *msg)
+{
+ return (msg->lm_secflvr >> 24) & 0xFF;
+}
+
+static
+int null_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req)
+{
+ req->rq_reqbuf->lm_secflvr = SPTLRPC_FLVR_NULL;
+
+ if (!req->rq_import->imp_dlm_fake) {
+ struct obd_device *obd = req->rq_import->imp_obd;
+
+ null_encode_sec_part(req->rq_reqbuf,
+ obd->u.cli.cl_sp_me);
+ }
+ req->rq_reqdata_len = req->rq_reqlen;
+ return 0;
+}
+
+static
+int null_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req)
+{
+ __u32 cksums, cksumc;
+
+ LASSERT(req->rq_repdata);
+
+ req->rq_repmsg = req->rq_repdata;
+ req->rq_replen = req->rq_repdata_len;
+
+ if (req->rq_early) {
+ cksums = lustre_msg_get_cksum(req->rq_repdata);
+ cksumc = lustre_msg_calc_cksum(req->rq_repmsg,
+ MSG_PTLRPC_BODY_OFF);
+
+ if (cksumc != cksums) {
+ CDEBUG(D_SEC,
+ "early reply checksum mismatch: %08x != %08x\n",
+ cksumc, cksums);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static
+struct ptlrpc_sec *null_create_sec(struct obd_import *imp,
+ struct ptlrpc_svc_ctx *svc_ctx,
+ struct sptlrpc_flavor *sf)
+{
+ LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_NULL);
+
+ /*
+ * general layer has taken a module reference for us, because we never
+ * really destroy the sec, simply release the reference here.
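+ * null_sec itself is a static singleton whose refcounts are pinned
+ * at init time, so there is no per-import state to allocate here.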
+ */
+ sptlrpc_policy_put(&null_policy);
+ return &null_sec;
+}
+
+static
+void null_destroy_sec(struct ptlrpc_sec *sec)
+{
+ LASSERT(sec == &null_sec);
+}
+
+static
+struct ptlrpc_cli_ctx *null_lookup_ctx(struct ptlrpc_sec *sec,
+ struct vfs_cred *vcred,
+ int create, int remove_dead)
+{
+ atomic_inc(&null_cli_ctx.cc_refcount);
+ return &null_cli_ctx;
+}
+
+static
+int null_flush_ctx_cache(struct ptlrpc_sec *sec, uid_t uid, int grace,
+ int force)
+{
+ return 0;
+}
+
+static
+int null_alloc_reqbuf(struct ptlrpc_sec *sec,
+ struct ptlrpc_request *req,
+ int msgsize)
+{
+ if (!req->rq_reqbuf) {
+ int alloc_size = size_roundup_power2(msgsize);
+
+ LASSERT(!req->rq_pool);
+ OBD_ALLOC_LARGE(req->rq_reqbuf, alloc_size);
+ if (!req->rq_reqbuf)
+ return -ENOMEM;
+
+ req->rq_reqbuf_len = alloc_size;
+ } else {
+ LASSERT(req->rq_pool);
+ LASSERT(req->rq_reqbuf_len >= msgsize);
+ memset(req->rq_reqbuf, 0, msgsize);
+ }
+
+ req->rq_reqmsg = req->rq_reqbuf;
+ return 0;
+}
+
+static
+void null_free_reqbuf(struct ptlrpc_sec *sec,
+ struct ptlrpc_request *req)
+{
+ if (!req->rq_pool) {
+ LASSERTF(req->rq_reqmsg == req->rq_reqbuf,
+ "req %p: reqmsg %p is not reqbuf %p in null sec\n",
+ req, req->rq_reqmsg, req->rq_reqbuf);
+ LASSERTF(req->rq_reqbuf_len >= req->rq_reqlen,
+ "req %p: reqlen %d should be smaller than buflen %d\n",
+ req, req->rq_reqlen, req->rq_reqbuf_len);
+
+ OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len);
+ req->rq_reqbuf = NULL;
+ req->rq_reqbuf_len = 0;
+ }
+}
+
+static
+int null_alloc_repbuf(struct ptlrpc_sec *sec,
+ struct ptlrpc_request *req,
+ int msgsize)
+{
+ /* add space for an early reply */
+ msgsize += lustre_msg_early_size;
+
+ msgsize = size_roundup_power2(msgsize);
+
+ OBD_ALLOC_LARGE(req->rq_repbuf, msgsize);
+ if (!req->rq_repbuf)
+ return -ENOMEM;
+
+ req->rq_repbuf_len = msgsize;
+ return 0;
+}
+
+static
+void null_free_repbuf(struct ptlrpc_sec *sec,
+ struct ptlrpc_request *req)
+{
+ LASSERT(req->rq_repbuf);
+
+ OBD_FREE_LARGE(req->rq_repbuf, req->rq_repbuf_len);
+ req->rq_repbuf = NULL;
+ req->rq_repbuf_len = 0;
+}
+
+static
+int null_enlarge_reqbuf(struct ptlrpc_sec *sec,
+ struct ptlrpc_request *req,
+ int segment, int newsize)
+{
+ struct lustre_msg *newbuf;
+ struct lustre_msg *oldbuf = req->rq_reqmsg;
+ int oldsize, newmsg_size, alloc_size;
+
+ LASSERT(req->rq_reqbuf);
+ LASSERT(req->rq_reqbuf == req->rq_reqmsg);
+ LASSERT(req->rq_reqbuf_len >= req->rq_reqlen);
+ LASSERT(req->rq_reqlen == lustre_packed_msg_size(oldbuf));
+
+ /* compute new message size */
+ oldsize = req->rq_reqbuf->lm_buflens[segment];
+ req->rq_reqbuf->lm_buflens[segment] = newsize;
+ newmsg_size = lustre_packed_msg_size(oldbuf);
+ req->rq_reqbuf->lm_buflens[segment] = oldsize;
+
+ /* request from pool should always have enough buffer */
+ LASSERT(!req->rq_pool || req->rq_reqbuf_len >= newmsg_size);
+
+ if (req->rq_reqbuf_len < newmsg_size) {
+ alloc_size = size_roundup_power2(newmsg_size);
+
+ OBD_ALLOC_LARGE(newbuf, alloc_size);
+ if (newbuf == NULL)
+ return -ENOMEM;
+
+ /*
+ * Must lock this, so that otherwise unprotected change of
+ * rq_reqmsg is not racing with parallel processing of
+ * imp_replay_list traversing threads.
See LU-3333 + * This is a bandaid at best, we really need to deal with this + * in request enlarging code before unpacking that's already + * there + */ + if (req->rq_import) + spin_lock(&req->rq_import->imp_lock); + memcpy(newbuf, req->rq_reqbuf, req->rq_reqlen); + + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = req->rq_reqmsg = newbuf; + req->rq_reqbuf_len = alloc_size; + + if (req->rq_import) + spin_unlock(&req->rq_import->imp_lock); + } + + _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize); + req->rq_reqlen = newmsg_size; + + return 0; +} + +static struct ptlrpc_svc_ctx null_svc_ctx = { + .sc_refcount = ATOMIC_INIT(1), + .sc_policy = &null_policy, +}; + +static +int null_accept(struct ptlrpc_request *req) +{ + LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) == + SPTLRPC_POLICY_NULL); + + if (req->rq_flvr.sf_rpc != SPTLRPC_FLVR_NULL) { + CERROR("Invalid rpc flavor 0x%x\n", req->rq_flvr.sf_rpc); + return SECSVC_DROP; + } + + req->rq_sp_from = null_decode_sec_part(req->rq_reqbuf); + + req->rq_reqmsg = req->rq_reqbuf; + req->rq_reqlen = req->rq_reqdata_len; + + req->rq_svc_ctx = &null_svc_ctx; + atomic_inc(&req->rq_svc_ctx->sc_refcount); + + return SECSVC_OK; +} + +static +int null_alloc_rs(struct ptlrpc_request *req, int msgsize) +{ + struct ptlrpc_reply_state *rs; + int rs_size = sizeof(*rs) + msgsize; + + LASSERT(msgsize % 8 == 0); + + rs = req->rq_reply_state; + + if (rs) { + /* pre-allocated */ + LASSERT(rs->rs_size >= rs_size); + } else { + OBD_ALLOC_LARGE(rs, rs_size); + if (rs == NULL) + return -ENOMEM; + + rs->rs_size = rs_size; + } + + rs->rs_svc_ctx = req->rq_svc_ctx; + atomic_inc(&req->rq_svc_ctx->sc_refcount); + + rs->rs_repbuf = (struct lustre_msg *) (rs + 1); + rs->rs_repbuf_len = rs_size - sizeof(*rs); + rs->rs_msg = rs->rs_repbuf; + + req->rq_reply_state = rs; + return 0; +} + +static +void null_free_rs(struct ptlrpc_reply_state *rs) +{ + LASSERT_ATOMIC_GT(&rs->rs_svc_ctx->sc_refcount, 1); + atomic_dec(&rs->rs_svc_ctx->sc_refcount); + + if (!rs->rs_prealloc) + OBD_FREE_LARGE(rs, rs->rs_size); +} + +static +int null_authorize(struct ptlrpc_request *req) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + + LASSERT(rs); + + rs->rs_repbuf->lm_secflvr = SPTLRPC_FLVR_NULL; + rs->rs_repdata_len = req->rq_replen; + req->rq_reply_off = 0; + + if (likely(req->rq_packed_final)) { + if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) + req->rq_reply_off = lustre_msg_early_size; + } else { + __u32 cksum; + + cksum = lustre_msg_calc_cksum(rs->rs_repbuf, + MSG_PTLRPC_BODY_OFF); + lustre_msg_set_cksum(rs->rs_repbuf, cksum); + } + + return 0; +} + +static struct ptlrpc_ctx_ops null_ctx_ops = { + .sign = null_ctx_sign, + .verify = null_ctx_verify, +}; + +static struct ptlrpc_sec_cops null_sec_cops = { + .create_sec = null_create_sec, + .destroy_sec = null_destroy_sec, + .lookup_ctx = null_lookup_ctx, + .flush_ctx_cache = null_flush_ctx_cache, + .alloc_reqbuf = null_alloc_reqbuf, + .alloc_repbuf = null_alloc_repbuf, + .free_reqbuf = null_free_reqbuf, + .free_repbuf = null_free_repbuf, + .enlarge_reqbuf = null_enlarge_reqbuf, +}; + +static struct ptlrpc_sec_sops null_sec_sops = { + .accept = null_accept, + .alloc_rs = null_alloc_rs, + .authorize = null_authorize, + .free_rs = null_free_rs, +}; + +static struct ptlrpc_sec_policy null_policy = { + .sp_owner = THIS_MODULE, + .sp_name = "sec.null", + .sp_policy = SPTLRPC_POLICY_NULL, + .sp_cops = &null_sec_cops, + .sp_sops = &null_sec_sops, +}; + +static void null_init_internal(void) +{ 
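+ /*
+ * Wire up the static singletons: the null policy exposes a single
+ * sec and a single client ctx, both permanently cached with biased
+ * refcounts so they are never freed; every lookup just takes a
+ * reference on the same objects.
+ */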
+ static HLIST_HEAD(__list); + + null_sec.ps_policy = &null_policy; + atomic_set(&null_sec.ps_refcount, 1); /* always busy */ + null_sec.ps_id = -1; + null_sec.ps_import = NULL; + null_sec.ps_flvr.sf_rpc = SPTLRPC_FLVR_NULL; + null_sec.ps_flvr.sf_flags = 0; + null_sec.ps_part = LUSTRE_SP_ANY; + null_sec.ps_dying = 0; + spin_lock_init(&null_sec.ps_lock); + atomic_set(&null_sec.ps_nctx, 1); /* for "null_cli_ctx" */ + INIT_LIST_HEAD(&null_sec.ps_gc_list); + null_sec.ps_gc_interval = 0; + null_sec.ps_gc_next = 0; + + hlist_add_head(&null_cli_ctx.cc_cache, &__list); + atomic_set(&null_cli_ctx.cc_refcount, 1); /* for hash */ + null_cli_ctx.cc_sec = &null_sec; + null_cli_ctx.cc_ops = &null_ctx_ops; + null_cli_ctx.cc_expire = 0; + null_cli_ctx.cc_flags = PTLRPC_CTX_CACHED | PTLRPC_CTX_ETERNAL | + PTLRPC_CTX_UPTODATE; + null_cli_ctx.cc_vcred.vc_uid = 0; + spin_lock_init(&null_cli_ctx.cc_lock); + INIT_LIST_HEAD(&null_cli_ctx.cc_req_list); + INIT_LIST_HEAD(&null_cli_ctx.cc_gc_chain); +} + +int sptlrpc_null_init(void) +{ + int rc; + + null_init_internal(); + + rc = sptlrpc_register_policy(&null_policy); + if (rc) + CERROR("failed to register %s: %d\n", null_policy.sp_name, rc); + + return rc; +} + +void sptlrpc_null_fini(void) +{ + int rc; + + rc = sptlrpc_unregister_policy(&null_policy); + if (rc) + CERROR("failed to unregister %s: %d\n", null_policy.sp_name, + rc); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_plain.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_plain.c new file mode 100644 index 0000000000000..7cf3a2c64af6d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_plain.c @@ -0,0 +1,1032 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/ptlrpc/sec_plain.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + + +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +struct plain_sec { + struct ptlrpc_sec pls_base; + rwlock_t pls_lock; + struct ptlrpc_cli_ctx *pls_ctx; +}; + +static inline struct plain_sec *sec2plsec(struct ptlrpc_sec *sec) +{ + return container_of(sec, struct plain_sec, pls_base); +} + +static struct ptlrpc_sec_policy plain_policy; +static struct ptlrpc_ctx_ops plain_ctx_ops; +static struct ptlrpc_svc_ctx plain_svc_ctx; + +static unsigned int plain_at_offset; + +/* + * for simplicity, plain policy rpc use fixed layout. 
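+ * every message carries PLAIN_PACK_SEGMENTS buffers in a fixed order:
+ * the plain_header, the embedded lustre_msg, an optional user
+ * descriptor and an optional bulk security descriptor.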
+ */ +#define PLAIN_PACK_SEGMENTS (4) + +#define PLAIN_PACK_HDR_OFF (0) +#define PLAIN_PACK_MSG_OFF (1) +#define PLAIN_PACK_USER_OFF (2) +#define PLAIN_PACK_BULK_OFF (3) + +#define PLAIN_FL_USER (0x01) +#define PLAIN_FL_BULK (0x02) + +struct plain_header { + __u8 ph_ver; /* 0 */ + __u8 ph_flags; + __u8 ph_sp; /* source */ + __u8 ph_bulk_hash_alg; /* complete flavor desc */ + __u8 ph_pad[4]; +}; + +struct plain_bulk_token { + __u8 pbt_hash[8]; +}; + +#define PLAIN_BSD_SIZE \ + (sizeof(struct ptlrpc_bulk_sec_desc) + sizeof(struct plain_bulk_token)) + +/* + * bulk checksum helpers + */ + +static int plain_unpack_bsd(struct lustre_msg *msg, int swabbed) +{ + struct ptlrpc_bulk_sec_desc *bsd; + + if (bulk_sec_desc_unpack(msg, PLAIN_PACK_BULK_OFF, swabbed)) + return -EPROTO; + + bsd = lustre_msg_buf(msg, PLAIN_PACK_BULK_OFF, PLAIN_BSD_SIZE); + if (bsd == NULL) { + CERROR("bulk sec desc has short size %d\n", + lustre_msg_buflen(msg, PLAIN_PACK_BULK_OFF)); + return -EPROTO; + } + + if (bsd->bsd_svc != SPTLRPC_BULK_SVC_NULL && + bsd->bsd_svc != SPTLRPC_BULK_SVC_INTG) { + CERROR("invalid bulk svc %u\n", bsd->bsd_svc); + return -EPROTO; + } + + return 0; +} + +static int plain_generate_bulk_csum(struct ptlrpc_bulk_desc *desc, + __u8 hash_alg, + struct plain_bulk_token *token) +{ + if (hash_alg == BULK_HASH_ALG_NULL) + return 0; + + memset(token->pbt_hash, 0, sizeof(token->pbt_hash)); + return sptlrpc_get_bulk_checksum(desc, hash_alg, token->pbt_hash, + sizeof(token->pbt_hash)); +} + +static int plain_verify_bulk_csum(struct ptlrpc_bulk_desc *desc, + __u8 hash_alg, + struct plain_bulk_token *tokenr) +{ + struct plain_bulk_token tokenv; + int rc; + + if (hash_alg == BULK_HASH_ALG_NULL) + return 0; + + memset(&tokenv.pbt_hash, 0, sizeof(tokenv.pbt_hash)); + rc = sptlrpc_get_bulk_checksum(desc, hash_alg, tokenv.pbt_hash, + sizeof(tokenv.pbt_hash)); + if (rc) + return rc; + + if (memcmp(tokenr->pbt_hash, tokenv.pbt_hash, sizeof(tokenr->pbt_hash))) + return -EACCES; + return 0; +} + +static void corrupt_bulk_data(struct ptlrpc_bulk_desc *desc) +{ + char *ptr; + unsigned int off, i; + + for (i = 0; i < desc->bd_iov_count; i++) { + if (desc->bd_vec[i].bv_len == 0) + continue; + + ptr = kmap(desc->bd_vec[i].bv_page); + off = desc->bd_vec[i].bv_offset & ~PAGE_MASK; + ptr[off] ^= 0x1; + kunmap(desc->bd_vec[i].bv_page); + return; + } +} + +/* + * cli_ctx apis + */ + +static +int plain_ctx_refresh(struct ptlrpc_cli_ctx *ctx) +{ + /* should never reach here */ + LBUG(); + return 0; +} + +static +int plain_ctx_validate(struct ptlrpc_cli_ctx *ctx) +{ + return 0; +} + +static +int plain_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) +{ + struct lustre_msg *msg = req->rq_reqbuf; + struct plain_header *phdr; + + ENTRY; + + msg->lm_secflvr = req->rq_flvr.sf_rpc; + + phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, 0); + phdr->ph_ver = 0; + phdr->ph_flags = 0; + phdr->ph_sp = ctx->cc_sec->ps_part; + phdr->ph_bulk_hash_alg = req->rq_flvr.u_bulk.hash.hash_alg; + + if (req->rq_pack_udesc) + phdr->ph_flags |= PLAIN_FL_USER; + if (req->rq_pack_bulk) + phdr->ph_flags |= PLAIN_FL_BULK; + + req->rq_reqdata_len = lustre_msg_size_v2(msg->lm_bufcount, + msg->lm_buflens); + RETURN(0); +} + +static +int plain_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) +{ + struct lustre_msg *msg = req->rq_repdata; + struct plain_header *phdr; + bool swabbed; + + ENTRY; + if (msg->lm_bufcount != PLAIN_PACK_SEGMENTS) { + CERROR("unexpected reply buf count %u\n", msg->lm_bufcount); + RETURN(-EPROTO); + } + + 
swabbed = req_capsule_rep_need_swab(&req->rq_pill); + + phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, sizeof(*phdr)); + if (phdr == NULL) { + CERROR("missing plain header\n"); + RETURN(-EPROTO); + } + + if (phdr->ph_ver != 0) { + CERROR("Invalid header version\n"); + RETURN(-EPROTO); + } + + /* expect no user desc in reply */ + if (phdr->ph_flags & PLAIN_FL_USER) { + CERROR("Unexpected udesc flag in reply\n"); + RETURN(-EPROTO); + } + + if (phdr->ph_bulk_hash_alg != req->rq_flvr.u_bulk.hash.hash_alg) { + CERROR("reply bulk flavor %u != %u\n", phdr->ph_bulk_hash_alg, + req->rq_flvr.u_bulk.hash.hash_alg); + RETURN(-EPROTO); + } + + if (unlikely(req->rq_early)) { + __u32 cksum = lustre_msg_calc_cksum(msg, PLAIN_PACK_MSG_OFF); + + if (cksum != msg->lm_cksum) { + CDEBUG(D_SEC, + "early reply checksum mismatch: %08x != %08x\n", + cpu_to_le32(cksum), msg->lm_cksum); + RETURN(-EINVAL); + } + } else { + /* + * whether we sent with bulk or not, we expect the same + * in reply, except for early reply + */ + if (!req->rq_early && + !equi(req->rq_pack_bulk == 1, + phdr->ph_flags & PLAIN_FL_BULK)) { + CERROR("%s bulk checksum in reply\n", + req->rq_pack_bulk ? "Missing" : "Unexpected"); + RETURN(-EPROTO); + } + + if (phdr->ph_flags & PLAIN_FL_BULK) { + if (plain_unpack_bsd(msg, swabbed)) + RETURN(-EPROTO); + } + } + + req->rq_repmsg = lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0); + req->rq_replen = lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF); + RETURN(0); +} + +static +int plain_cli_wrap_bulk(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_bulk_sec_desc *bsd; + struct plain_bulk_token *token; + int rc; + + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_reqbuf->lm_bufcount == PLAIN_PACK_SEGMENTS); + + bsd = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0); + token = (struct plain_bulk_token *) bsd->bsd_data; + + bsd->bsd_version = 0; + bsd->bsd_flags = 0; + bsd->bsd_type = SPTLRPC_BULK_DEFAULT; + bsd->bsd_svc = SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc); + + if (bsd->bsd_svc == SPTLRPC_BULK_SVC_NULL) + RETURN(0); + + if (req->rq_bulk_read) + RETURN(0); + + rc = plain_generate_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, + token); + if (rc) { + CERROR("bulk write: failed to compute checksum: %d\n", rc); + } else { + /* + * for sending we only compute the wrong checksum instead + * of corrupting the data so it is still correct on a redo + */ + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND) && + req->rq_flvr.u_bulk.hash.hash_alg != BULK_HASH_ALG_NULL) + token->pbt_hash[0] ^= 0x1; + } + + return rc; +} + +static +int plain_cli_unwrap_bulk(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_bulk_sec_desc *bsdv; + struct plain_bulk_token *tokenv; + int rc; + int i, nob; + + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_reqbuf->lm_bufcount == PLAIN_PACK_SEGMENTS); + LASSERT(req->rq_repdata->lm_bufcount == PLAIN_PACK_SEGMENTS); + + bsdv = lustre_msg_buf(req->rq_repdata, PLAIN_PACK_BULK_OFF, 0); + tokenv = (struct plain_bulk_token *) bsdv->bsd_data; + + if (req->rq_bulk_write) { + if (bsdv->bsd_flags & BSD_FL_ERR) + return -EIO; + return 0; + } + + /* fix the actual data size */ + for (i = 0, nob = 0; i < desc->bd_iov_count; i++) { + if (desc->bd_vec[i].bv_len + + nob > desc->bd_nob_transferred) { + desc->bd_vec[i].bv_len = + desc->bd_nob_transferred - nob; + } + nob += desc->bd_vec[i].bv_len; + } + + rc = plain_verify_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, + tokenv); + if 
(rc) + CERROR("bulk read: client verify failed: %d\n", rc); + + return rc; +} + +/* + * sec apis + */ + +static +struct ptlrpc_cli_ctx *plain_sec_install_ctx(struct plain_sec *plsec) +{ + struct ptlrpc_cli_ctx *ctx, *ctx_new; + + OBD_ALLOC_PTR(ctx_new); + + write_lock(&plsec->pls_lock); + + ctx = plsec->pls_ctx; + if (ctx) { + atomic_inc(&ctx->cc_refcount); + + if (ctx_new) + OBD_FREE_PTR(ctx_new); + } else if (ctx_new) { + ctx = ctx_new; + + atomic_set(&ctx->cc_refcount, 1); /* for cache */ + ctx->cc_sec = &plsec->pls_base; + ctx->cc_ops = &plain_ctx_ops; + ctx->cc_expire = 0; + ctx->cc_flags = PTLRPC_CTX_CACHED | PTLRPC_CTX_UPTODATE; + ctx->cc_vcred.vc_uid = 0; + spin_lock_init(&ctx->cc_lock); + INIT_LIST_HEAD(&ctx->cc_req_list); + INIT_LIST_HEAD(&ctx->cc_gc_chain); + + plsec->pls_ctx = ctx; + atomic_inc(&plsec->pls_base.ps_nctx); + atomic_inc(&plsec->pls_base.ps_refcount); + + atomic_inc(&ctx->cc_refcount); /* for caller */ + } + + write_unlock(&plsec->pls_lock); + + return ctx; +} + +static +void plain_destroy_sec(struct ptlrpc_sec *sec) +{ + struct plain_sec *plsec = sec2plsec(sec); + + ENTRY; + + LASSERT(sec->ps_policy == &plain_policy); + LASSERT(sec->ps_import); + LASSERT(atomic_read(&sec->ps_refcount) == 0); + LASSERT(atomic_read(&sec->ps_nctx) == 0); + LASSERT(plsec->pls_ctx == NULL); + + class_import_put(sec->ps_import); + + OBD_FREE_PTR(plsec); + EXIT; +} + +static +void plain_kill_sec(struct ptlrpc_sec *sec) +{ + sec->ps_dying = 1; +} + +static +struct ptlrpc_sec *plain_create_sec(struct obd_import *imp, + struct ptlrpc_svc_ctx *svc_ctx, + struct sptlrpc_flavor *sf) +{ + struct plain_sec *plsec; + struct ptlrpc_sec *sec; + struct ptlrpc_cli_ctx *ctx; + + ENTRY; + + LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN); + + OBD_ALLOC_PTR(plsec); + if (plsec == NULL) + RETURN(NULL); + + /* + * initialize plain_sec + */ + rwlock_init(&plsec->pls_lock); + plsec->pls_ctx = NULL; + + sec = &plsec->pls_base; + sec->ps_policy = &plain_policy; + atomic_set(&sec->ps_refcount, 0); + atomic_set(&sec->ps_nctx, 0); + sec->ps_id = sptlrpc_get_next_secid(); + sec->ps_import = class_import_get(imp); + sec->ps_flvr = *sf; + spin_lock_init(&sec->ps_lock); + INIT_LIST_HEAD(&sec->ps_gc_list); + sec->ps_gc_interval = 0; + sec->ps_gc_next = 0; + + /* install ctx immediately if this is a reverse sec */ + if (svc_ctx) { + ctx = plain_sec_install_ctx(plsec); + if (ctx == NULL) { + plain_destroy_sec(sec); + RETURN(NULL); + } + sptlrpc_cli_ctx_put(ctx, 1); + } + + RETURN(sec); +} + +static +struct ptlrpc_cli_ctx *plain_lookup_ctx(struct ptlrpc_sec *sec, + struct vfs_cred *vcred, + int create, int remove_dead) +{ + struct plain_sec *plsec = sec2plsec(sec); + struct ptlrpc_cli_ctx *ctx; + + ENTRY; + + read_lock(&plsec->pls_lock); + ctx = plsec->pls_ctx; + if (ctx) + atomic_inc(&ctx->cc_refcount); + read_unlock(&plsec->pls_lock); + + if (unlikely(ctx == NULL)) + ctx = plain_sec_install_ctx(plsec); + + RETURN(ctx); +} + +static +void plain_release_ctx(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx, int sync) +{ + LASSERT(atomic_read(&sec->ps_refcount) > 0); + LASSERT(atomic_read(&sec->ps_nctx) > 0); + LASSERT(atomic_read(&ctx->cc_refcount) == 0); + LASSERT(ctx->cc_sec == sec); + + OBD_FREE_PTR(ctx); + + atomic_dec(&sec->ps_nctx); + sptlrpc_sec_put(sec); +} + +static +int plain_flush_ctx_cache(struct ptlrpc_sec *sec, + uid_t uid, int grace, int force) +{ + struct plain_sec *plsec = sec2plsec(sec); + struct ptlrpc_cli_ctx *ctx; + + ENTRY; + + /* do nothing unless caller want to flush for 'all' 
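+ * (uid == -1): the plain sec keeps one shared ctx rather than a
+ * per-user cache, so flushing an individual uid would be a no-op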
*/ + if (uid != -1) + RETURN(0); + + write_lock(&plsec->pls_lock); + ctx = plsec->pls_ctx; + plsec->pls_ctx = NULL; + write_unlock(&plsec->pls_lock); + + if (ctx) + sptlrpc_cli_ctx_put(ctx, 1); + RETURN(0); +} + +static +int plain_alloc_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, }; + int alloc_len; + + ENTRY; + + buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header); + buflens[PLAIN_PACK_MSG_OFF] = msgsize; + + if (req->rq_pack_udesc) + buflens[PLAIN_PACK_USER_OFF] = sptlrpc_current_user_desc_size(); + + if (req->rq_pack_bulk) { + LASSERT(req->rq_bulk_read || req->rq_bulk_write); + buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE; + } + + alloc_len = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); + + if (!req->rq_reqbuf) { + LASSERT(!req->rq_pool); + + alloc_len = size_roundup_power2(alloc_len); + OBD_ALLOC_LARGE(req->rq_reqbuf, alloc_len); + if (!req->rq_reqbuf) + RETURN(-ENOMEM); + + req->rq_reqbuf_len = alloc_len; + } else { + LASSERT(req->rq_pool); + LASSERT(req->rq_reqbuf_len >= alloc_len); + memset(req->rq_reqbuf, 0, alloc_len); + } + + lustre_init_msg_v2(req->rq_reqbuf, PLAIN_PACK_SEGMENTS, buflens, NULL); + req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, 0); + + if (req->rq_pack_udesc) + sptlrpc_pack_user_desc(req->rq_reqbuf, PLAIN_PACK_USER_OFF); + + RETURN(0); +} + +static +void plain_free_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + ENTRY; + if (!req->rq_pool) { + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = NULL; + req->rq_reqbuf_len = 0; + } + EXIT; +} + +static +int plain_alloc_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, }; + int alloc_len; + + ENTRY; + + buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header); + buflens[PLAIN_PACK_MSG_OFF] = msgsize; + + if (req->rq_pack_bulk) { + LASSERT(req->rq_bulk_read || req->rq_bulk_write); + buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE; + } + + alloc_len = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); + + /* add space for early reply */ + alloc_len += plain_at_offset; + + alloc_len = size_roundup_power2(alloc_len); + + OBD_ALLOC_LARGE(req->rq_repbuf, alloc_len); + if (!req->rq_repbuf) + RETURN(-ENOMEM); + + req->rq_repbuf_len = alloc_len; + RETURN(0); +} + +static +void plain_free_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + ENTRY; + OBD_FREE_LARGE(req->rq_repbuf, req->rq_repbuf_len); + req->rq_repbuf = NULL; + req->rq_repbuf_len = 0; + EXIT; +} + +static +int plain_enlarge_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int segment, int newsize) +{ + struct lustre_msg *newbuf; + int oldsize; + int newmsg_size, newbuf_size; + + ENTRY; + + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf_len >= req->rq_reqlen); + LASSERT(lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, 0) == + req->rq_reqmsg); + + /* compute new embedded msg size. */ + oldsize = req->rq_reqmsg->lm_buflens[segment]; + req->rq_reqmsg->lm_buflens[segment] = newsize; + newmsg_size = lustre_msg_size_v2(req->rq_reqmsg->lm_bufcount, + req->rq_reqmsg->lm_buflens); + req->rq_reqmsg->lm_buflens[segment] = oldsize; + + /* compute new wrapper msg size. 
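+ * the embedded message grew, so the PLAIN_PACK_MSG_OFF segment of
+ * the wrapper must grow by the same amount; the temporary buflens
+ * swap below computes the new size without committing it yet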
*/ + oldsize = req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF]; + req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF] = newmsg_size; + newbuf_size = lustre_msg_size_v2(req->rq_reqbuf->lm_bufcount, + req->rq_reqbuf->lm_buflens); + req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF] = oldsize; + + /* request from pool should always have enough buffer */ + LASSERT(!req->rq_pool || req->rq_reqbuf_len >= newbuf_size); + + if (req->rq_reqbuf_len < newbuf_size) { + newbuf_size = size_roundup_power2(newbuf_size); + + OBD_ALLOC_LARGE(newbuf, newbuf_size); + if (newbuf == NULL) + RETURN(-ENOMEM); + + /* + * Must lock this, so that otherwise unprotected change of + * rq_reqmsg is not racing with parallel processing of + * imp_replay_list traversing threads. See LU-3333 + * This is a bandaid at best, we really need to deal with this + * in request enlarging code before unpacking that's already + * there + */ + if (req->rq_import) + spin_lock(&req->rq_import->imp_lock); + + memcpy(newbuf, req->rq_reqbuf, req->rq_reqbuf_len); + + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = newbuf; + req->rq_reqbuf_len = newbuf_size; + req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, + PLAIN_PACK_MSG_OFF, 0); + + if (req->rq_import) + spin_unlock(&req->rq_import->imp_lock); + } + + _sptlrpc_enlarge_msg_inplace(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, + newmsg_size); + _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize); + + req->rq_reqlen = newmsg_size; + RETURN(0); +} + +/* + * service apis + */ + +static struct ptlrpc_svc_ctx plain_svc_ctx = { + .sc_refcount = ATOMIC_INIT(1), + .sc_policy = &plain_policy, +}; + +static int plain_accept(struct ptlrpc_request *req) +{ + struct lustre_msg *msg = req->rq_reqbuf; + struct plain_header *phdr; + bool swabbed; + + ENTRY; + LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) == + SPTLRPC_POLICY_PLAIN); + + if (SPTLRPC_FLVR_BASE(req->rq_flvr.sf_rpc) != + SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_PLAIN) || + SPTLRPC_FLVR_BULK_TYPE(req->rq_flvr.sf_rpc) != + SPTLRPC_FLVR_BULK_TYPE(SPTLRPC_FLVR_PLAIN)) { + CERROR("Invalid rpc flavor %x\n", req->rq_flvr.sf_rpc); + RETURN(SECSVC_DROP); + } + + if (msg->lm_bufcount < PLAIN_PACK_SEGMENTS) { + CERROR("unexpected request buf count %u\n", msg->lm_bufcount); + RETURN(SECSVC_DROP); + } + + swabbed = req_capsule_req_need_swab(&req->rq_pill); + + phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, sizeof(*phdr)); + if (phdr == NULL) { + CERROR("missing plain header\n"); + RETURN(-EPROTO); + } + + if (phdr->ph_ver != 0) { + CERROR("Invalid header version\n"); + RETURN(-EPROTO); + } + + if (phdr->ph_bulk_hash_alg >= BULK_HASH_ALG_MAX) { + CERROR("invalid hash algorithm: %u\n", phdr->ph_bulk_hash_alg); + RETURN(-EPROTO); + } + + req->rq_sp_from = phdr->ph_sp; + req->rq_flvr.u_bulk.hash.hash_alg = phdr->ph_bulk_hash_alg; + + if (phdr->ph_flags & PLAIN_FL_USER) { + if (sptlrpc_unpack_user_desc(msg, PLAIN_PACK_USER_OFF, + swabbed)) { + CERROR("Mal-formed user descriptor\n"); + RETURN(SECSVC_DROP); + } + + req->rq_pack_udesc = 1; + req->rq_user_desc = lustre_msg_buf(msg, PLAIN_PACK_USER_OFF, 0); + } + + if (phdr->ph_flags & PLAIN_FL_BULK) { + if (plain_unpack_bsd(msg, swabbed)) + RETURN(SECSVC_DROP); + + req->rq_pack_bulk = 1; + } + + req->rq_reqmsg = lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0); + req->rq_reqlen = msg->lm_buflens[PLAIN_PACK_MSG_OFF]; + + req->rq_svc_ctx = &plain_svc_ctx; + atomic_inc(&req->rq_svc_ctx->sc_refcount); + + RETURN(SECSVC_OK); +} + +static +int plain_alloc_rs(struct ptlrpc_request *req, int msgsize) +{ + struct 
ptlrpc_reply_state *rs; + __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, }; + int rs_size = sizeof(*rs); + + ENTRY; + + LASSERT(msgsize % 8 == 0); + + buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header); + buflens[PLAIN_PACK_MSG_OFF] = msgsize; + + if (req->rq_pack_bulk && (req->rq_bulk_read || req->rq_bulk_write)) + buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE; + + rs_size += lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); + + rs = req->rq_reply_state; + + if (rs) { + /* pre-allocated */ + LASSERT(rs->rs_size >= rs_size); + } else { + OBD_ALLOC_LARGE(rs, rs_size); + if (rs == NULL) + RETURN(-ENOMEM); + + rs->rs_size = rs_size; + } + + rs->rs_svc_ctx = req->rq_svc_ctx; + atomic_inc(&req->rq_svc_ctx->sc_refcount); + rs->rs_repbuf = (struct lustre_msg *) (rs + 1); + rs->rs_repbuf_len = rs_size - sizeof(*rs); + + lustre_init_msg_v2(rs->rs_repbuf, PLAIN_PACK_SEGMENTS, buflens, NULL); + rs->rs_msg = lustre_msg_buf_v2(rs->rs_repbuf, PLAIN_PACK_MSG_OFF, 0); + + req->rq_reply_state = rs; + RETURN(0); +} + +static +void plain_free_rs(struct ptlrpc_reply_state *rs) +{ + ENTRY; + + LASSERT(atomic_read(&rs->rs_svc_ctx->sc_refcount) > 1); + atomic_dec(&rs->rs_svc_ctx->sc_refcount); + + if (!rs->rs_prealloc) + OBD_FREE_LARGE(rs, rs->rs_size); + EXIT; +} + +static +int plain_authorize(struct ptlrpc_request *req) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct lustre_msg_v2 *msg = rs->rs_repbuf; + struct plain_header *phdr; + int len; + + ENTRY; + + LASSERT(rs); + LASSERT(msg); + + if (req->rq_replen != msg->lm_buflens[PLAIN_PACK_MSG_OFF]) + len = lustre_shrink_msg(msg, PLAIN_PACK_MSG_OFF, + req->rq_replen, 1); + else + len = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + + msg->lm_secflvr = req->rq_flvr.sf_rpc; + + phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, 0); + phdr->ph_ver = 0; + phdr->ph_flags = 0; + phdr->ph_bulk_hash_alg = req->rq_flvr.u_bulk.hash.hash_alg; + + if (req->rq_pack_bulk) + phdr->ph_flags |= PLAIN_FL_BULK; + + rs->rs_repdata_len = len; + req->rq_reply_off = 0; + + if (likely(req->rq_packed_final)) { + if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) + req->rq_reply_off = plain_at_offset; + } else { + msg->lm_cksum = lustre_msg_calc_cksum(msg, PLAIN_PACK_MSG_OFF); + } + + RETURN(0); +} + +static +int plain_svc_unwrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; + struct plain_bulk_token *tokenr; + int rc; + + LASSERT(req->rq_bulk_write); + LASSERT(req->rq_pack_bulk); + + bsdr = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0); + tokenr = (struct plain_bulk_token *) bsdr->bsd_data; + bsdv = lustre_msg_buf(rs->rs_repbuf, PLAIN_PACK_BULK_OFF, 0); + + bsdv->bsd_version = 0; + bsdv->bsd_type = SPTLRPC_BULK_DEFAULT; + bsdv->bsd_svc = bsdr->bsd_svc; + bsdv->bsd_flags = 0; + + if (bsdr->bsd_svc == SPTLRPC_BULK_SVC_NULL) + return 0; + + rc = plain_verify_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, + tokenr); + if (rc) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("bulk write: server verify failed: %d\n", rc); + } + + return rc; +} + +static +int plain_svc_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; + struct plain_bulk_token *tokenv; + int rc; + + LASSERT(req->rq_bulk_read); + LASSERT(req->rq_pack_bulk); + + bsdr = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0); + bsdv = 
lustre_msg_buf(rs->rs_repbuf, PLAIN_PACK_BULK_OFF, 0); + tokenv = (struct plain_bulk_token *) bsdv->bsd_data; + + bsdv->bsd_version = 0; + bsdv->bsd_type = SPTLRPC_BULK_DEFAULT; + bsdv->bsd_svc = bsdr->bsd_svc; + bsdv->bsd_flags = 0; + + if (bsdr->bsd_svc == SPTLRPC_BULK_SVC_NULL) + return 0; + + rc = plain_generate_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, + tokenv); + if (rc) { + CERROR("bulk read: server failed to compute checksum: %d\n", + rc); + } else { + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE)) + corrupt_bulk_data(desc); + } + + return rc; +} + +static struct ptlrpc_ctx_ops plain_ctx_ops = { + .refresh = plain_ctx_refresh, + .validate = plain_ctx_validate, + .sign = plain_ctx_sign, + .verify = plain_ctx_verify, + .wrap_bulk = plain_cli_wrap_bulk, + .unwrap_bulk = plain_cli_unwrap_bulk, +}; + +static struct ptlrpc_sec_cops plain_sec_cops = { + .create_sec = plain_create_sec, + .destroy_sec = plain_destroy_sec, + .kill_sec = plain_kill_sec, + .lookup_ctx = plain_lookup_ctx, + .release_ctx = plain_release_ctx, + .flush_ctx_cache = plain_flush_ctx_cache, + .alloc_reqbuf = plain_alloc_reqbuf, + .free_reqbuf = plain_free_reqbuf, + .alloc_repbuf = plain_alloc_repbuf, + .free_repbuf = plain_free_repbuf, + .enlarge_reqbuf = plain_enlarge_reqbuf, +}; + +static struct ptlrpc_sec_sops plain_sec_sops = { + .accept = plain_accept, + .alloc_rs = plain_alloc_rs, + .authorize = plain_authorize, + .free_rs = plain_free_rs, + .unwrap_bulk = plain_svc_unwrap_bulk, + .wrap_bulk = plain_svc_wrap_bulk, +}; + +static struct ptlrpc_sec_policy plain_policy = { + .sp_owner = THIS_MODULE, + .sp_name = "plain", + .sp_policy = SPTLRPC_POLICY_PLAIN, + .sp_cops = &plain_sec_cops, + .sp_sops = &plain_sec_sops, +}; + +int sptlrpc_plain_init(void) +{ + __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, }; + int rc; + + buflens[PLAIN_PACK_MSG_OFF] = lustre_msg_early_size; + plain_at_offset = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); + + rc = sptlrpc_register_policy(&plain_policy); + if (rc) + CERROR("failed to register: %d\n", rc); + + return rc; +} + +void sptlrpc_plain_fini(void) +{ + int rc; + + rc = sptlrpc_unregister_policy(&plain_policy); + if (rc) + CERROR("cannot unregister: %d\n", rc); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/service.c b/drivers/staging/lustrefsx/lustre/ptlrpc/service.c new file mode 100644 index 0000000000000..16fcaba8f03c5 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/service.c @@ -0,0 +1,3672 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include +#include + +#include +#include +#include +#include +#include +#include "ptlrpc_internal.h" +#include + +/* The following are visible and mutable through /sys/module/ptlrpc */ +int test_req_buffer_pressure = 0; +module_param(test_req_buffer_pressure, int, 0444); +MODULE_PARM_DESC(test_req_buffer_pressure, "set non-zero to put pressure on request buffer pools"); +module_param(at_min, int, 0644); +MODULE_PARM_DESC(at_min, "Adaptive timeout minimum (sec)"); +module_param(at_max, int, 0644); +MODULE_PARM_DESC(at_max, "Adaptive timeout maximum (sec)"); +module_param(at_history, int, 0644); +MODULE_PARM_DESC(at_history, + "Adaptive timeouts remember the slowest event that took place within this period (sec)"); +module_param(at_early_margin, int, 0644); +MODULE_PARM_DESC(at_early_margin, "How soon before an RPC deadline to send an early reply"); +module_param(at_extra, int, 0644); +MODULE_PARM_DESC(at_extra, "How much extra time to give with each early reply"); + +/* forward ref */ +static int ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt); +static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req); +static void ptlrpc_at_remove_timed(struct ptlrpc_request *req); +static int ptlrpc_start_threads(struct ptlrpc_service *svc); +static int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait); + +/** Holds a list of all PTLRPC services */ +LIST_HEAD(ptlrpc_all_services); +/** Used to protect the \e ptlrpc_all_services list */ +struct mutex ptlrpc_all_services_mutex; + +static struct ptlrpc_request_buffer_desc * +ptlrpc_alloc_rqbd(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_service *svc = svcpt->scp_service; + struct ptlrpc_request_buffer_desc *rqbd; + + OBD_CPT_ALLOC_PTR(rqbd, svc->srv_cptable, svcpt->scp_cpt); + if (rqbd == NULL) + return NULL; + + rqbd->rqbd_svcpt = svcpt; + rqbd->rqbd_refcount = 0; + rqbd->rqbd_cbid.cbid_fn = request_in_callback; + rqbd->rqbd_cbid.cbid_arg = rqbd; + INIT_LIST_HEAD(&rqbd->rqbd_reqs); + OBD_CPT_ALLOC_LARGE(rqbd->rqbd_buffer, svc->srv_cptable, + svcpt->scp_cpt, svc->srv_buf_size); + if (rqbd->rqbd_buffer == NULL) { + OBD_FREE_PTR(rqbd); + return NULL; + } + + spin_lock(&svcpt->scp_lock); + list_add(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle); + svcpt->scp_nrqbds_total++; + spin_unlock(&svcpt->scp_lock); + + return rqbd; +} + +static void ptlrpc_free_rqbd(struct ptlrpc_request_buffer_desc *rqbd) +{ + struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt; + + LASSERT(rqbd->rqbd_refcount == 0); + LASSERT(list_empty(&rqbd->rqbd_reqs)); + + spin_lock(&svcpt->scp_lock); + list_del(&rqbd->rqbd_list); + svcpt->scp_nrqbds_total--; + spin_unlock(&svcpt->scp_lock); + + OBD_FREE_LARGE(rqbd->rqbd_buffer, svcpt->scp_service->srv_buf_size); + OBD_FREE_PTR(rqbd); +} + +static int ptlrpc_grow_req_bufs(struct ptlrpc_service_part *svcpt, int post) +{ + struct ptlrpc_service *svc = svcpt->scp_service; + struct ptlrpc_request_buffer_desc *rqbd; + int rc = 0; + int i; + + if (svcpt->scp_rqbd_allocating) + goto try_post; + + spin_lock(&svcpt->scp_lock); + /* check again with lock */ + if (svcpt->scp_rqbd_allocating) { + /* NB: we might allow more than one thread in the future */ + LASSERT(svcpt->scp_rqbd_allocating == 1); + spin_unlock(&svcpt->scp_lock); + goto try_post; + } + + svcpt->scp_rqbd_allocating++; + spin_unlock(&svcpt->scp_lock); + + + for (i = 0; i < svc->srv_nbuf_per_group; i++) { + /* + * NB: another thread 
might have recycled enough rqbds, we
+ * need to make sure it wouldn't over-allocate, see LU-1212.
+ */
+ if (svcpt->scp_nrqbds_posted >= svc->srv_nbuf_per_group ||
+ (svc->srv_nrqbds_max != 0 &&
+ svcpt->scp_nrqbds_total > svc->srv_nrqbds_max))
+ break;
+
+ rqbd = ptlrpc_alloc_rqbd(svcpt);
+
+ if (rqbd == NULL) {
+ CERROR("%s: Can't allocate request buffer\n",
+ svc->srv_name);
+ rc = -ENOMEM;
+ break;
+ }
+ }
+
+ spin_lock(&svcpt->scp_lock);
+
+ LASSERT(svcpt->scp_rqbd_allocating == 1);
+ svcpt->scp_rqbd_allocating--;
+
+ spin_unlock(&svcpt->scp_lock);
+
+ CDEBUG(D_RPCTRACE,
+ "%s: allocate %d new %d-byte reqbufs (%d/%d left), rc = %d\n",
+ svc->srv_name, i, svc->srv_buf_size, svcpt->scp_nrqbds_posted,
+ svcpt->scp_nrqbds_total, rc);
+
+ try_post:
+ if (post && rc == 0)
+ rc = ptlrpc_server_post_idle_rqbds(svcpt);
+
+ return rc;
+}
+
+/**
+ * Part of Rep-Ack logic.
+ * Puts a lock and its mode into the reply state associated with the
+ * request's reply.
+ */
+void ptlrpc_save_lock(struct ptlrpc_request *req, struct lustre_handle *lock,
+ int mode, bool no_ack, bool convert_lock)
+{
+ struct ptlrpc_reply_state *rs = req->rq_reply_state;
+ int idx;
+
+ LASSERT(rs != NULL);
+ LASSERT(rs->rs_nlocks < RS_MAX_LOCKS);
+
+ idx = rs->rs_nlocks++;
+ rs->rs_locks[idx] = *lock;
+ rs->rs_modes[idx] = mode;
+ rs->rs_difficult = 1;
+ rs->rs_no_ack = no_ack;
+ rs->rs_convert_lock = convert_lock;
+}
+EXPORT_SYMBOL(ptlrpc_save_lock);
+
+
+struct ptlrpc_hr_partition;
+
+struct ptlrpc_hr_thread {
+ int hrt_id; /* thread ID */
+ spinlock_t hrt_lock;
+ wait_queue_head_t hrt_waitq;
+ struct list_head hrt_queue;
+ struct ptlrpc_hr_partition *hrt_partition;
+};
+
+struct ptlrpc_hr_partition {
+ /* # of started threads */
+ atomic_t hrp_nstarted;
+ /* # of stopped threads */
+ atomic_t hrp_nstopped;
+ /* cpu partition id */
+ int hrp_cpt;
+ /* round-robin rotor for choosing thread */
+ int hrp_rotor;
+ /* total number of threads on this partition */
+ int hrp_nthrs;
+ /* threads table */
+ struct ptlrpc_hr_thread *hrp_thrs;
+};
+
+#define HRT_RUNNING 0
+#define HRT_STOPPING 1
+
+struct ptlrpc_hr_service {
+ /* CPU partition table, it's just cfs_cpt_tab for now */
+ struct cfs_cpt_table *hr_cpt_table;
+ /** controller sleep waitq */
+ wait_queue_head_t hr_waitq;
+ unsigned int hr_stopping;
+ /** roundrobin rotor for non-affinity service */
+ unsigned int hr_rotor;
+ /* partition data */
+ struct ptlrpc_hr_partition **hr_partitions;
+};
+
+struct rs_batch {
+ struct list_head rsb_replies;
+ unsigned int rsb_n_replies;
+ struct ptlrpc_service_part *rsb_svcpt;
+};
+
+/** reply handling service. */
+static struct ptlrpc_hr_service ptlrpc_hr;
+
+/**
+ * maximum number of replies scheduled in one batch
+ */
+#define MAX_SCHEDULED 256
+
+/**
+ * Initialize a reply batch.
+ *
+ * \param b batch
+ */
+static void rs_batch_init(struct rs_batch *b)
+{
+ memset(b, 0, sizeof(*b));
+ INIT_LIST_HEAD(&b->rsb_replies);
+}
+
+/**
+ * Choose an hr thread to dispatch requests to.
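+ *
+ * If the service shares the reply-handling CPT table, pick the
+ * partition matching the service part's CPT so reply handling stays
+ * CPU-affine; otherwise use a round-robin rotor across partitions.
+ * A second rotor round-robins among the threads of that partition.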
+ */ +static +struct ptlrpc_hr_thread *ptlrpc_hr_select(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_hr_partition *hrp; + unsigned int rotor; + + if (svcpt->scp_cpt >= 0 && + svcpt->scp_service->srv_cptable == ptlrpc_hr.hr_cpt_table) { + /* directly match partition */ + hrp = ptlrpc_hr.hr_partitions[svcpt->scp_cpt]; + + } else { + rotor = ptlrpc_hr.hr_rotor++; + rotor %= cfs_cpt_number(ptlrpc_hr.hr_cpt_table); + + hrp = ptlrpc_hr.hr_partitions[rotor]; + } + + rotor = hrp->hrp_rotor++; + return &hrp->hrp_thrs[rotor % hrp->hrp_nthrs]; +} + +/** + * Dispatch all replies accumulated in the batch to one from + * dedicated reply handling threads. + * + * \param b batch + */ +static void rs_batch_dispatch(struct rs_batch *b) +{ + if (b->rsb_n_replies != 0) { + struct ptlrpc_hr_thread *hrt; + + hrt = ptlrpc_hr_select(b->rsb_svcpt); + + spin_lock(&hrt->hrt_lock); + list_splice_init(&b->rsb_replies, &hrt->hrt_queue); + spin_unlock(&hrt->hrt_lock); + + wake_up(&hrt->hrt_waitq); + b->rsb_n_replies = 0; + } +} + +/** + * Add a reply to a batch. + * Add one reply object to a batch, schedule batched replies if overload. + * + * \param b batch + * \param rs reply + */ +static void rs_batch_add(struct rs_batch *b, struct ptlrpc_reply_state *rs) +{ + struct ptlrpc_service_part *svcpt = rs->rs_svcpt; + + if (svcpt != b->rsb_svcpt || b->rsb_n_replies >= MAX_SCHEDULED) { + if (b->rsb_svcpt != NULL) { + rs_batch_dispatch(b); + spin_unlock(&b->rsb_svcpt->scp_rep_lock); + } + spin_lock(&svcpt->scp_rep_lock); + b->rsb_svcpt = svcpt; + } + spin_lock(&rs->rs_lock); + rs->rs_scheduled_ever = 1; + if (rs->rs_scheduled == 0) { + list_move(&rs->rs_list, &b->rsb_replies); + rs->rs_scheduled = 1; + b->rsb_n_replies++; + } + rs->rs_committed = 1; + spin_unlock(&rs->rs_lock); +} + +/** + * Reply batch finalization. + * Dispatch remaining replies from the batch + * and release remaining spinlock. + * + * \param b batch + */ +static void rs_batch_fini(struct rs_batch *b) +{ + if (b->rsb_svcpt != NULL) { + rs_batch_dispatch(b); + spin_unlock(&b->rsb_svcpt->scp_rep_lock); + } +} + +#define DECLARE_RS_BATCH(b) struct rs_batch b + + +/** + * Put reply state into a queue for processing because we received + * ACK from the client + */ +void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs) +{ + struct ptlrpc_hr_thread *hrt; + + ENTRY; + + LASSERT(list_empty(&rs->rs_list)); + + hrt = ptlrpc_hr_select(rs->rs_svcpt); + + spin_lock(&hrt->hrt_lock); + list_add_tail(&rs->rs_list, &hrt->hrt_queue); + spin_unlock(&hrt->hrt_lock); + + wake_up(&hrt->hrt_waitq); + EXIT; +} + +void ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs) +{ + ENTRY; + + assert_spin_locked(&rs->rs_svcpt->scp_rep_lock); + assert_spin_locked(&rs->rs_lock); + LASSERT(rs->rs_difficult); + rs->rs_scheduled_ever = 1; /* flag any notification attempt */ + + if (rs->rs_scheduled) { /* being set up or already notified */ + EXIT; + return; + } + + rs->rs_scheduled = 1; + list_del_init(&rs->rs_list); + ptlrpc_dispatch_difficult_reply(rs); + EXIT; +} +EXPORT_SYMBOL(ptlrpc_schedule_difficult_reply); + +void ptlrpc_commit_replies(struct obd_export *exp) +{ + struct ptlrpc_reply_state *rs, *nxt; + DECLARE_RS_BATCH(batch); + + ENTRY; + + rs_batch_init(&batch); + /* + * Find any replies that have been committed and get their service + * to attend to complete them. + */ + + /* CAVEAT EMPTOR: spinlock ordering!!! 
+	 */
+	spin_lock(&exp->exp_uncommitted_replies_lock);
+	list_for_each_entry_safe(rs, nxt, &exp->exp_uncommitted_replies,
+				 rs_obd_list) {
+		LASSERT(rs->rs_difficult);
+		/* VBR: per-export last_committed */
+		LASSERT(rs->rs_export);
+		if (rs->rs_transno <= exp->exp_last_committed) {
+			list_del_init(&rs->rs_obd_list);
+			rs_batch_add(&batch, rs);
+		}
+	}
+	spin_unlock(&exp->exp_uncommitted_replies_lock);
+	rs_batch_fini(&batch);
+	EXIT;
+}
+
+static int ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt)
+{
+	struct ptlrpc_request_buffer_desc *rqbd;
+	int rc;
+	int posted = 0;
+
+	for (;;) {
+		spin_lock(&svcpt->scp_lock);
+
+		if (list_empty(&svcpt->scp_rqbd_idle)) {
+			spin_unlock(&svcpt->scp_lock);
+			return posted;
+		}
+
+		rqbd = list_first_entry(&svcpt->scp_rqbd_idle,
+					struct ptlrpc_request_buffer_desc,
+					rqbd_list);
+
+		/* assume we will post successfully */
+		svcpt->scp_nrqbds_posted++;
+		list_move(&rqbd->rqbd_list, &svcpt->scp_rqbd_posted);
+
+		spin_unlock(&svcpt->scp_lock);
+
+		rc = ptlrpc_register_rqbd(rqbd);
+		if (rc != 0)
+			break;
+
+		posted = 1;
+	}
+
+	spin_lock(&svcpt->scp_lock);
+
+	svcpt->scp_nrqbds_posted--;
+	list_move_tail(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle);
+
+	/*
+	 * Don't complain if no request buffers are posted right now; LNET
+	 * won't drop requests because we set the portal lazy!
+	 */
+
+	spin_unlock(&svcpt->scp_lock);
+
+	return -1;
+}
+
+static void ptlrpc_at_timer(cfs_timer_cb_arg_t data)
+{
+	struct ptlrpc_service_part *svcpt;
+
+	svcpt = cfs_from_timer(svcpt, data, scp_at_timer);
+
+	svcpt->scp_at_check = 1;
+	svcpt->scp_at_checktime = ktime_get();
+	wake_up(&svcpt->scp_waitq);
+}
+
+static void ptlrpc_server_nthreads_check(struct ptlrpc_service *svc,
+					 struct ptlrpc_service_conf *conf)
+{
+	struct ptlrpc_service_thr_conf *tc = &conf->psc_thr;
+	unsigned int init;
+	unsigned int total;
+	unsigned int nthrs;
+	int weight;
+
+	/*
+	 * Common code for estimating & validating the number of threads.
+	 * A CPT-affinity service can have a per-CPT thread pool instead
+	 * of a global thread pool, which means the user might not always
+	 * get the number of threads requested in conf::tc_nthrs_user,
+	 * even if it was set. This is because we need to validate the
+	 * thread count for each CPT to guarantee each pool will have
+	 * enough threads to keep the service healthy.
+	 */
+	init = PTLRPC_NTHRS_INIT + (svc->srv_ops.so_hpreq_handler != NULL);
+	init = max_t(int, init, tc->tc_nthrs_init);
+
+	/*
+	 * NB: please see comments in lustre_lnet.h for definition
+	 * details of these members
+	 */
+	LASSERT(tc->tc_nthrs_max != 0);
+
+	if (tc->tc_nthrs_user != 0) {
+		/*
+		 * In case there is a reason to test a service with many
+		 * threads, we give a less strict check here; it can
+		 * be up to 8 * nthrs_max
+		 */
+		total = min(tc->tc_nthrs_max * 8, tc->tc_nthrs_user);
+		nthrs = total / svc->srv_ncpts;
+		init = max(init, nthrs);
+		goto out;
+	}
+
+	total = tc->tc_nthrs_max;
+	if (tc->tc_nthrs_base == 0) {
+		/*
+		 * the base thread count per partition doesn't matter;
+		 * this is mostly for non-affinity services
+		 */
+		nthrs = total / svc->srv_ncpts;
+		goto out;
+	}
+
+	nthrs = tc->tc_nthrs_base;
+	if (svc->srv_ncpts == 1) {
+		int i;
+
+		/*
+		 * NB: Increase the base number if it's single partition
+		 * and total number of cores/HTs is larger or equal to 4.
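+		 * For illustration (hypothetical numbers): with
+		 * tc_nthrs_base = 8 and a CPU weight of 16, the loop
+		 * below adds 4 + 2 + 1 = 7 extra threads, giving 15; the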
+		 * result will always be < 2 * nthrs_base.
+		 */
+		weight = cfs_cpt_weight(svc->srv_cptable, CFS_CPT_ANY);
+		for (i = 1; (weight >> (i + 1)) != 0 && /* >= 4 cores/HTs */
+			    (tc->tc_nthrs_base >> i) != 0; i++)
+			nthrs += tc->tc_nthrs_base >> i;
+	}
+
+	if (tc->tc_thr_factor != 0) {
+		int factor = tc->tc_thr_factor;
+		const int fade = 4;
+
+		/*
+		 * The user wants to increase the number of threads for
+		 * each CPU core/HT; the factor is most likely larger than
+		 * one thread/core because service threads are supposed to
+		 * be blocked on locks or waiting for IO.
+		 */
+		/*
+		 * Amdahl's law says that adding processors wouldn't give
+		 * a linear increase in parallelism, so it's nonsense to
+		 * have too many threads no matter how many cores/HTs
+		 * there are.
+		 */
+		preempt_disable();
+		if (cpumask_weight
+		    (topology_sibling_cpumask(smp_processor_id())) > 1) {
+			/* weight is # of HTs */
+			/* depress thread factor for hyper-thread */
+			factor = factor - (factor >> 1) + (factor >> 3);
+		}
+		preempt_enable();
+
+		weight = cfs_cpt_weight(svc->srv_cptable, 0);
+
+		for (; factor > 0 && weight > 0; factor--, weight -= fade)
+			nthrs += min(weight, fade) * factor;
+	}
+
+	if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) {
+		nthrs = max(tc->tc_nthrs_base,
+			    tc->tc_nthrs_max / svc->srv_ncpts);
+	}
+ out:
+	nthrs = max(nthrs, tc->tc_nthrs_init);
+	svc->srv_nthrs_cpt_limit = nthrs;
+	svc->srv_nthrs_cpt_init = init;
+
+	if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) {
+		CDEBUG(D_OTHER,
+		       "%s: This service may have more threads (%d) than the given soft limit (%d)\n",
+		       svc->srv_name, nthrs * svc->srv_ncpts,
+		       tc->tc_nthrs_max);
+	}
+}
+
+/**
+ * Initialize percpt data for a service
+ */
+static int ptlrpc_service_part_init(struct ptlrpc_service *svc,
+				    struct ptlrpc_service_part *svcpt, int cpt)
+{
+	struct ptlrpc_at_array *array;
+	int size;
+	int index;
+	int rc;
+
+	svcpt->scp_cpt = cpt;
+	INIT_LIST_HEAD(&svcpt->scp_threads);
+
+	/* rqbd and incoming request queue */
+	spin_lock_init(&svcpt->scp_lock);
+	mutex_init(&svcpt->scp_mutex);
+	INIT_LIST_HEAD(&svcpt->scp_rqbd_idle);
+	INIT_LIST_HEAD(&svcpt->scp_rqbd_posted);
+	INIT_LIST_HEAD(&svcpt->scp_req_incoming);
+	init_waitqueue_head(&svcpt->scp_waitq);
+	/* history request & rqbd list */
+	INIT_LIST_HEAD(&svcpt->scp_hist_reqs);
+	INIT_LIST_HEAD(&svcpt->scp_hist_rqbds);
+
+	/* active requests and hp requests */
+	spin_lock_init(&svcpt->scp_req_lock);
+
+	/* reply states */
+	spin_lock_init(&svcpt->scp_rep_lock);
+	INIT_LIST_HEAD(&svcpt->scp_rep_active);
+	INIT_LIST_HEAD(&svcpt->scp_rep_idle);
+	init_waitqueue_head(&svcpt->scp_rep_waitq);
+	atomic_set(&svcpt->scp_nreps_difficult, 0);
+
+	/* adaptive timeout */
+	spin_lock_init(&svcpt->scp_at_lock);
+	array = &svcpt->scp_at_array;
+
+	size = at_est2timeout(at_max);
+	array->paa_size = size;
+	array->paa_count = 0;
+	array->paa_deadline = -1;
+
+	/* allocate memory for scp_at_array (ptlrpc_at_array) */
+	OBD_CPT_ALLOC(array->paa_reqs_array,
+		      svc->srv_cptable, cpt, sizeof(struct list_head) * size);
+	if (array->paa_reqs_array == NULL)
+		return -ENOMEM;
+
+	for (index = 0; index < size; index++)
+		INIT_LIST_HEAD(&array->paa_reqs_array[index]);
+
+	OBD_CPT_ALLOC(array->paa_reqs_count,
+		      svc->srv_cptable, cpt, sizeof(__u32) * size);
+	if (array->paa_reqs_count == NULL)
+		goto failed;
+
+	cfs_timer_setup(&svcpt->scp_at_timer, ptlrpc_at_timer,
+			(unsigned long)svcpt, 0);
+
+	/*
+	 * At SOW, service time should be quick; 10s seems generous. If client
+	 * timeout is less than this, we'll be sending an early reply.
+ */ + at_init(&svcpt->scp_at_estimate, 10, 0); + + /* assign this before call ptlrpc_grow_req_bufs */ + svcpt->scp_service = svc; + /* Now allocate the request buffers, but don't post them now */ + rc = ptlrpc_grow_req_bufs(svcpt, 0); + /* + * We shouldn't be under memory pressure at startup, so + * fail if we can't allocate all our buffers at this time. + */ + if (rc != 0) + goto failed; + + return 0; + + failed: + if (array->paa_reqs_count != NULL) { + OBD_FREE_PTR_ARRAY(array->paa_reqs_count, size); + array->paa_reqs_count = NULL; + } + + if (array->paa_reqs_array != NULL) { + OBD_FREE_PTR_ARRAY(array->paa_reqs_array, array->paa_size); + array->paa_reqs_array = NULL; + } + + return -ENOMEM; +} + +/** + * Initialize service on a given portal. + * This includes starting serving threads , allocating and posting rqbds and + * so on. + */ +struct ptlrpc_service *ptlrpc_register_service(struct ptlrpc_service_conf *conf, + struct kset *parent, + struct dentry *debugfs_entry) +{ + struct ptlrpc_service_cpt_conf *cconf = &conf->psc_cpt; + struct ptlrpc_service *service; + struct ptlrpc_service_part *svcpt; + struct cfs_cpt_table *cptable; + __u32 *cpts = NULL; + int ncpts; + int cpt; + int rc; + int i; + + ENTRY; + + LASSERT(conf->psc_buf.bc_nbufs > 0); + LASSERT(conf->psc_buf.bc_buf_size >= + conf->psc_buf.bc_req_max_size + SPTLRPC_MAX_PAYLOAD); + LASSERT(conf->psc_thr.tc_ctx_tags != 0); + + cptable = cconf->cc_cptable; + if (cptable == NULL) + cptable = cfs_cpt_tab; + + if (conf->psc_thr.tc_cpu_bind > 1) { + CERROR("%s: Invalid cpu bind value %d, only 1 or 0 allowed\n", + conf->psc_name, conf->psc_thr.tc_cpu_bind); + RETURN(ERR_PTR(-EINVAL)); + } + + if (!cconf->cc_affinity) { + ncpts = 1; + } else { + ncpts = cfs_cpt_number(cptable); + if (cconf->cc_pattern != NULL) { + struct cfs_expr_list *el; + + rc = cfs_expr_list_parse(cconf->cc_pattern, + strlen(cconf->cc_pattern), + 0, ncpts - 1, &el); + if (rc != 0) { + CERROR("%s: invalid CPT pattern string: %s\n", + conf->psc_name, cconf->cc_pattern); + RETURN(ERR_PTR(-EINVAL)); + } + + rc = cfs_expr_list_values(el, ncpts, &cpts); + cfs_expr_list_free(el); + if (rc <= 0) { + CERROR("%s: failed to parse CPT array %s: %d\n", + conf->psc_name, cconf->cc_pattern, rc); + if (cpts != NULL) + OBD_FREE_PTR_ARRAY(cpts, ncpts); + RETURN(ERR_PTR(rc < 0 ? rc : -EINVAL)); + } + ncpts = rc; + } + } + + OBD_ALLOC(service, offsetof(struct ptlrpc_service, srv_parts[ncpts])); + if (service == NULL) { + if (cpts != NULL) + OBD_FREE_PTR_ARRAY(cpts, ncpts); + RETURN(ERR_PTR(-ENOMEM)); + } + + service->srv_cptable = cptable; + service->srv_cpts = cpts; + service->srv_ncpts = ncpts; + service->srv_cpt_bind = conf->psc_thr.tc_cpu_bind; + + service->srv_cpt_bits = 0; /* it's zero already, easy to read... */ + while ((1 << service->srv_cpt_bits) < cfs_cpt_number(cptable)) + service->srv_cpt_bits++; + + /* public members */ + spin_lock_init(&service->srv_lock); + service->srv_name = conf->psc_name; + service->srv_watchdog_factor = conf->psc_watchdog_factor; + INIT_LIST_HEAD(&service->srv_list); /* for safty of cleanup */ + + /* buffer configuration */ + service->srv_nbuf_per_group = test_req_buffer_pressure ? 
+ 1 : conf->psc_buf.bc_nbufs; + /* do not limit max number of rqbds by default */ + service->srv_nrqbds_max = 0; + + service->srv_max_req_size = conf->psc_buf.bc_req_max_size + + SPTLRPC_MAX_PAYLOAD; + service->srv_buf_size = conf->psc_buf.bc_buf_size; + service->srv_rep_portal = conf->psc_buf.bc_rep_portal; + service->srv_req_portal = conf->psc_buf.bc_req_portal; + + /* With slab/alloc_pages buffer size will be rounded up to 2^n */ + if (service->srv_buf_size & (service->srv_buf_size - 1)) { + int round = size_roundup_power2(service->srv_buf_size); + + service->srv_buf_size = round; + } + + /* Increase max reply size to next power of two */ + service->srv_max_reply_size = 1; + while (service->srv_max_reply_size < + conf->psc_buf.bc_rep_max_size + SPTLRPC_MAX_PAYLOAD) + service->srv_max_reply_size <<= 1; + + service->srv_thread_name = conf->psc_thr.tc_thr_name; + service->srv_ctx_tags = conf->psc_thr.tc_ctx_tags; + service->srv_hpreq_ratio = PTLRPC_SVC_HP_RATIO; + service->srv_ops = conf->psc_ops; + + for (i = 0; i < ncpts; i++) { + if (!cconf->cc_affinity) + cpt = CFS_CPT_ANY; + else + cpt = cpts != NULL ? cpts[i] : i; + + OBD_CPT_ALLOC(svcpt, cptable, cpt, sizeof(*svcpt)); + if (svcpt == NULL) + GOTO(failed, rc = -ENOMEM); + + service->srv_parts[i] = svcpt; + rc = ptlrpc_service_part_init(service, svcpt, cpt); + if (rc != 0) + GOTO(failed, rc); + } + + ptlrpc_server_nthreads_check(service, conf); + + rc = LNetSetLazyPortal(service->srv_req_portal); + LASSERT(rc == 0); + + mutex_lock(&ptlrpc_all_services_mutex); + list_add(&service->srv_list, &ptlrpc_all_services); + mutex_unlock(&ptlrpc_all_services_mutex); + + if (parent) { + rc = ptlrpc_sysfs_register_service(parent, service); + if (rc) + GOTO(failed, rc); + } + + if (debugfs_entry != NULL) + ptlrpc_ldebugfs_register_service(debugfs_entry, service); + + rc = ptlrpc_service_nrs_setup(service); + if (rc != 0) + GOTO(failed, rc); + + CDEBUG(D_NET, "%s: Started, listening on portal %d\n", + service->srv_name, service->srv_req_portal); + + rc = ptlrpc_start_threads(service); + if (rc != 0) { + CERROR("Failed to start threads for service %s: %d\n", + service->srv_name, rc); + GOTO(failed, rc); + } + + RETURN(service); +failed: + ptlrpc_unregister_service(service); + RETURN(ERR_PTR(rc)); +} +EXPORT_SYMBOL(ptlrpc_register_service); + +/** + * to actually free the request, must be called without holding svc_lock. + * note it's caller's responsibility to unlink req->rq_list. + */ +static void ptlrpc_server_free_request(struct ptlrpc_request *req) +{ + LASSERT(atomic_read(&req->rq_refcount) == 0); + LASSERT(list_empty(&req->rq_timed_list)); + + /* + * DEBUG_REQ() assumes the reply state of a request with a valid + * ref will not be destroyed until that reference is dropped. + */ + ptlrpc_req_drop_rs(req); + + sptlrpc_svc_ctx_decref(req); + + if (req != &req->rq_rqbd->rqbd_req) { + /* + * NB request buffers use an embedded + * req if the incoming req unlinked the + * MD; this isn't one of them! + */ + ptlrpc_request_cache_free(req); + } +} + +/** + * drop a reference count of the request. if it reaches 0, we either + * put it into history list, or free it immediately. 
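+ * "History" refers to the per-partition scp_hist_rqbds list kept for
+ * debugging; once it grows past srv_hist_nrqbds_cpt_max the oldest
+ * buffers are culled (see the loop below).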
+ */ +void ptlrpc_server_drop_request(struct ptlrpc_request *req) +{ + struct ptlrpc_request_buffer_desc *rqbd = req->rq_rqbd; + struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt; + struct ptlrpc_service *svc = svcpt->scp_service; + int refcount; + + if (!atomic_dec_and_test(&req->rq_refcount)) + return; + + if (req->rq_session.lc_state == LCS_ENTERED) { + lu_context_exit(&req->rq_session); + lu_context_fini(&req->rq_session); + } + + if (req->rq_at_linked) { + spin_lock(&svcpt->scp_at_lock); + /* + * recheck with lock, in case it's unlinked by + * ptlrpc_at_check_timed() + */ + if (likely(req->rq_at_linked)) + ptlrpc_at_remove_timed(req); + spin_unlock(&svcpt->scp_at_lock); + } + + LASSERT(list_empty(&req->rq_timed_list)); + + /* finalize request */ + if (req->rq_export) { + class_export_put(req->rq_export); + req->rq_export = NULL; + } + + spin_lock(&svcpt->scp_lock); + + list_add(&req->rq_list, &rqbd->rqbd_reqs); + + refcount = --(rqbd->rqbd_refcount); + if (refcount == 0) { + /* request buffer is now idle: add to history */ + list_move_tail(&rqbd->rqbd_list, &svcpt->scp_hist_rqbds); + svcpt->scp_hist_nrqbds++; + + /* + * cull some history? + * I expect only about 1 or 2 rqbds need to be recycled here + */ + while (svcpt->scp_hist_nrqbds > svc->srv_hist_nrqbds_cpt_max) { + rqbd = list_first_entry(&svcpt->scp_hist_rqbds, + struct ptlrpc_request_buffer_desc, + rqbd_list); + + list_del(&rqbd->rqbd_list); + svcpt->scp_hist_nrqbds--; + + /* + * remove rqbd's reqs from svc's req history while + * I've got the service lock + */ + list_for_each_entry(req, &rqbd->rqbd_reqs, rq_list) { + /* Track the highest culled req seq */ + if (req->rq_history_seq > + svcpt->scp_hist_seq_culled) { + svcpt->scp_hist_seq_culled = + req->rq_history_seq; + } + list_del(&req->rq_history_list); + } + + spin_unlock(&svcpt->scp_lock); + + while ((req = list_first_entry_or_null( + &rqbd->rqbd_reqs, + struct ptlrpc_request, rq_list))) { + list_del(&req->rq_list); + ptlrpc_server_free_request(req); + } + + spin_lock(&svcpt->scp_lock); + /* + * now all reqs including the embedded req has been + * disposed, schedule request buffer for re-use + * or free it to drain some in excess. 
+ */ + LASSERT(atomic_read(&rqbd->rqbd_req.rq_refcount) == 0); + if (svcpt->scp_nrqbds_posted >= + svc->srv_nbuf_per_group || + (svc->srv_nrqbds_max != 0 && + svcpt->scp_nrqbds_total > svc->srv_nrqbds_max) || + test_req_buffer_pressure) { + /* like in ptlrpc_free_rqbd() */ + svcpt->scp_nrqbds_total--; + OBD_FREE_LARGE(rqbd->rqbd_buffer, + svc->srv_buf_size); + OBD_FREE_PTR(rqbd); + } else { + list_add_tail(&rqbd->rqbd_list, + &svcpt->scp_rqbd_idle); + } + } + + spin_unlock(&svcpt->scp_lock); + } else if (req->rq_reply_state && req->rq_reply_state->rs_prealloc) { + /* If we are low on memory, we are not interested in history */ + list_del(&req->rq_list); + list_del_init(&req->rq_history_list); + + /* Track the highest culled req seq */ + if (req->rq_history_seq > svcpt->scp_hist_seq_culled) + svcpt->scp_hist_seq_culled = req->rq_history_seq; + + spin_unlock(&svcpt->scp_lock); + + ptlrpc_server_free_request(req); + } else { + spin_unlock(&svcpt->scp_lock); + } +} + +static void ptlrpc_add_exp_list_nolock(struct ptlrpc_request *req, + struct obd_export *export, bool hp) +{ + __u16 tag = lustre_msg_get_tag(req->rq_reqmsg); + + if (hp) + list_add(&req->rq_exp_list, &export->exp_hp_rpcs); + else + list_add(&req->rq_exp_list, &export->exp_reg_rpcs); + if (tag && export->exp_used_slots) + set_bit(tag - 1, export->exp_used_slots); +} + +static void ptlrpc_del_exp_list(struct ptlrpc_request *req) +{ + __u16 tag = lustre_msg_get_tag(req->rq_reqmsg); + + spin_lock(&req->rq_export->exp_rpc_lock); + list_del_init(&req->rq_exp_list); + if (tag && !req->rq_obsolete && req->rq_export->exp_used_slots) + clear_bit(tag - 1, req->rq_export->exp_used_slots); + spin_unlock(&req->rq_export->exp_rpc_lock); +} + +/** Change request export and move hp request from old export to new */ +void ptlrpc_request_change_export(struct ptlrpc_request *req, + struct obd_export *export) +{ + if (req->rq_export != NULL) { + LASSERT(!list_empty(&req->rq_exp_list)); + /* remove rq_exp_list from last export */ + ptlrpc_del_exp_list(req); + /* export has one reference already, so it's safe to + * add req to export queue here and get another + * reference for request later + */ + spin_lock(&export->exp_rpc_lock); + ptlrpc_add_exp_list_nolock(req, export, req->rq_ops != NULL); + spin_unlock(&export->exp_rpc_lock); + + class_export_rpc_dec(req->rq_export); + class_export_put(req->rq_export); + } + + /* request takes one export refcount */ + req->rq_export = class_export_get(export); + class_export_rpc_inc(export); +} + +/** + * to finish a request: stop sending more early replies, and release + * the request. + */ +static void ptlrpc_server_finish_request(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req) +{ + ptlrpc_server_hpreq_fini(req); + + ptlrpc_server_drop_request(req); +} + +/** + * to finish an active request: stop sending more early replies, and release + * the request. should be called after we finished handling the request. + */ +static void ptlrpc_server_finish_active_request( + struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req) +{ + spin_lock(&svcpt->scp_req_lock); + ptlrpc_nrs_req_stop_nolock(req); + svcpt->scp_nreqs_active--; + if (req->rq_hp) + svcpt->scp_nhreqs_active--; + spin_unlock(&svcpt->scp_req_lock); + + ptlrpc_nrs_req_finalize(req); + + if (req->rq_export != NULL) + class_export_rpc_dec(req->rq_export); + + ptlrpc_server_finish_request(svcpt, req); +} + +/** + * This function makes sure dead exports are evicted in a timely manner. 
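+ * "Dead" means a client that has not sent anything, not even a ping,
+ * for longer than PING_EVICT_TIMEOUT.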
+ * This function is only called when some export receives a message (i.e., + * the network is up.) + */ +void ptlrpc_update_export_timer(struct obd_export *exp, time64_t extra_delay) +{ + struct obd_export *oldest_exp; + time64_t oldest_time, new_time; + + ENTRY; + + LASSERT(exp); + + /* + * Compensate for slow machines, etc, by faking our request time + * into the future. Although this can break the strict time-ordering + * of the list, we can be really lazy here - we don't have to evict + * at the exact right moment. Eventually, all silent exports + * will make it to the top of the list. + */ + + /* Do not pay attention on 1sec or smaller renewals. */ + new_time = ktime_get_real_seconds() + extra_delay; + if (exp->exp_last_request_time + 1 /*second */ >= new_time) + RETURN_EXIT; + + exp->exp_last_request_time = new_time; + + /* + * exports may get disconnected from the chain even though the + * export has references, so we must keep the spin lock while + * manipulating the lists + */ + spin_lock(&exp->exp_obd->obd_dev_lock); + + if (list_empty(&exp->exp_obd_chain_timed)) { + /* this one is not timed */ + spin_unlock(&exp->exp_obd->obd_dev_lock); + RETURN_EXIT; + } + + list_move_tail(&exp->exp_obd_chain_timed, + &exp->exp_obd->obd_exports_timed); + + oldest_exp = list_entry(exp->exp_obd->obd_exports_timed.next, + struct obd_export, exp_obd_chain_timed); + oldest_time = oldest_exp->exp_last_request_time; + spin_unlock(&exp->exp_obd->obd_dev_lock); + + if (exp->exp_obd->obd_recovering) { + /* be nice to everyone during recovery */ + EXIT; + return; + } + + /* Note - racing to start/reset the obd_eviction timer is safe */ + if (exp->exp_obd->obd_eviction_timer == 0) { + /* Check if the oldest entry is expired. */ + if (ktime_get_real_seconds() > + oldest_time + PING_EVICT_TIMEOUT + extra_delay) { + /* + * We need a second timer, in case the net was down and + * it just came back. Since the pinger may skip every + * other PING_INTERVAL (see note in ptlrpc_pinger_main), + * we better wait for 3. + */ + exp->exp_obd->obd_eviction_timer = + ktime_get_real_seconds() + 3 * PING_INTERVAL; + CDEBUG(D_HA, "%s: Think about evicting %s from %lld\n", + exp->exp_obd->obd_name, + obd_export_nid2str(oldest_exp), oldest_time); + } + } else { + if (ktime_get_real_seconds() > + (exp->exp_obd->obd_eviction_timer + extra_delay)) { + /* + * The evictor won't evict anyone who we've heard from + * recently, so we don't have to check before we start + * it. + */ + if (!ping_evictor_wake(exp)) + exp->exp_obd->obd_eviction_timer = 0; + } + } + + EXIT; +} + +/** + * Sanity check request \a req. + * Return 0 if all is ok, error code otherwise. + */ +static int ptlrpc_check_req(struct ptlrpc_request *req) +{ + struct obd_device *obd = req->rq_export->exp_obd; + int rc = 0; + + if (unlikely(lustre_msg_get_conn_cnt(req->rq_reqmsg) < + req->rq_export->exp_conn_cnt)) { + DEBUG_REQ(D_RPCTRACE, req, + "DROPPING req from old connection %d < %d", + lustre_msg_get_conn_cnt(req->rq_reqmsg), + req->rq_export->exp_conn_cnt); + return -EEXIST; + } + if (unlikely(obd == NULL || obd->obd_fail)) { + /* + * Failing over, don't handle any more reqs, + * send error response instead. + */ + CDEBUG(D_RPCTRACE, "Dropping req %p for failed obd %s\n", + req, (obd != NULL) ? 
obd->obd_name : "unknown"); + rc = -ENODEV; + } else if (lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_REPLAY | MSG_REQ_REPLAY_DONE) && + !obd->obd_recovering) { + DEBUG_REQ(D_ERROR, req, + "Invalid replay without recovery"); + class_fail_export(req->rq_export); + rc = -ENODEV; + } else if (lustre_msg_get_transno(req->rq_reqmsg) != 0 && + !obd->obd_recovering) { + DEBUG_REQ(D_ERROR, req, + "Invalid req with transno %llu without recovery", + lustre_msg_get_transno(req->rq_reqmsg)); + class_fail_export(req->rq_export); + rc = -ENODEV; + } + + if (unlikely(rc < 0)) { + req->rq_status = rc; + ptlrpc_error(req); + } + return rc; +} + +static void ptlrpc_at_set_timer(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_at_array *array = &svcpt->scp_at_array; + time64_t next; + + if (array->paa_count == 0) { + del_timer(&svcpt->scp_at_timer); + return; + } + + /* Set timer for closest deadline */ + next = array->paa_deadline - ktime_get_real_seconds() - + at_early_margin; + if (next <= 0) { + ptlrpc_at_timer(cfs_timer_cb_arg(svcpt, scp_at_timer)); + } else { + mod_timer(&svcpt->scp_at_timer, + jiffies + nsecs_to_jiffies(next * NSEC_PER_SEC)); + CDEBUG(D_INFO, "armed %s at %+llds\n", + svcpt->scp_service->srv_name, next); + } +} + +/* Add rpc to early reply check list */ +static int ptlrpc_at_add_timed(struct ptlrpc_request *req) +{ + struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; + struct ptlrpc_at_array *array = &svcpt->scp_at_array; + struct ptlrpc_request *rq = NULL; + __u32 index; + + if (AT_OFF) + return(0); + + if (req->rq_no_reply) + return 0; + + if ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) == 0) + return(-ENOSYS); + + spin_lock(&svcpt->scp_at_lock); + LASSERT(list_empty(&req->rq_timed_list)); + + div_u64_rem(req->rq_deadline, array->paa_size, &index); + if (array->paa_reqs_count[index] > 0) { + /* + * latest rpcs will have the latest deadlines in the list, + * so search backward. + */ + list_for_each_entry_reverse(rq, &array->paa_reqs_array[index], + rq_timed_list) { + if (req->rq_deadline >= rq->rq_deadline) { + list_add(&req->rq_timed_list, + &rq->rq_timed_list); + break; + } + } + } + + /* Add the request at the head of the list */ + if (list_empty(&req->rq_timed_list)) + list_add(&req->rq_timed_list, &array->paa_reqs_array[index]); + + spin_lock(&req->rq_lock); + req->rq_at_linked = 1; + spin_unlock(&req->rq_lock); + req->rq_at_index = index; + array->paa_reqs_count[index]++; + array->paa_count++; + if (array->paa_count == 1 || array->paa_deadline > req->rq_deadline) { + array->paa_deadline = req->rq_deadline; + ptlrpc_at_set_timer(svcpt); + } + spin_unlock(&svcpt->scp_at_lock); + + return 0; +} + +static void ptlrpc_at_remove_timed(struct ptlrpc_request *req) +{ + struct ptlrpc_at_array *array; + + array = &req->rq_rqbd->rqbd_svcpt->scp_at_array; + + /* NB: must call with hold svcpt::scp_at_lock */ + LASSERT(!list_empty(&req->rq_timed_list)); + list_del_init(&req->rq_timed_list); + + spin_lock(&req->rq_lock); + req->rq_at_linked = 0; + spin_unlock(&req->rq_lock); + + array->paa_reqs_count[req->rq_at_index]--; + array->paa_count--; +} + +/* + * Attempt to extend the request deadline by sending an early reply to the + * client. 
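+ * Returns 0 when the request should stay on the early-reply list
+ * (a reply was sent, or AT is off), and nonzero when it must not be
+ * re-added (fault injection, already past the deadline, no AT support,
+ * or a send failure).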
+ */ +static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req) +{ + struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; + struct ptlrpc_request *reqcopy; + struct lustre_msg *reqmsg; + timeout_t olddl = req->rq_deadline - ktime_get_real_seconds(); + time64_t newdl; + int rc; + + ENTRY; + + if (CFS_FAIL_CHECK(OBD_FAIL_TGT_REPLAY_RECONNECT) || + CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_ENQ_RESEND)) { + /* don't send early reply */ + RETURN(1); + } + + /* + * deadline is when the client expects us to reply, margin is the + * difference between clients' and servers' expectations + */ + DEBUG_REQ(D_ADAPTTO, req, + "%ssending early reply (deadline %+ds, margin %+ds) for %d+%d", + AT_OFF ? "AT off - not " : "", + olddl, olddl - at_get(&svcpt->scp_at_estimate), + at_get(&svcpt->scp_at_estimate), at_extra); + + if (AT_OFF) + RETURN(0); + + if (olddl < 0) { + /* below message is checked in replay-ost-single.sh test_9 */ + DEBUG_REQ(D_WARNING, req, + "Already past deadline (%+ds), not sending early reply. Consider increasing at_early_margin (%d)?", + olddl, at_early_margin); + + /* Return an error so we're not re-added to the timed list. */ + RETURN(-ETIMEDOUT); + } + + if ((lustre_msghdr_get_flags(req->rq_reqmsg) & + MSGHDR_AT_SUPPORT) == 0) { + DEBUG_REQ(D_INFO, req, + "Wanted to ask client for more time, but no AT support"); + RETURN(-ENOSYS); + } + + if (req->rq_export && + lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_REPLAY | MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE)) { + struct obd_device *obd_exp = req->rq_export->exp_obd; + + /* + * During recovery, we don't want to send too many early + * replies, but on the other hand we want to make sure the + * client has enough time to resend if the rpc is lost. So + * during the recovery period send at least 4 early replies, + * spacing them every at_extra if we can. at_estimate should + * always equal this fixed value during recovery. + */ + + /* + * Don't account request processing time into AT history + * during recovery, it is not service time we need but + * includes also waiting time for recovering clients + */ + newdl = min_t(time64_t, at_extra, + obd_exp->obd_recovery_timeout / 4) + + ktime_get_real_seconds(); + } else { + /* + * We want to extend the request deadline by at_extra seconds, + * so we set our service estimate to reflect how much time has + * passed since this request arrived plus an additional + * at_extra seconds. The client will calculate the new deadline + * based on this service estimate (plus some additional time to + * account for network latency). 
See ptlrpc_at_recv_early_reply + */ + at_measured(&svcpt->scp_at_estimate, at_extra + + ktime_get_real_seconds() - + req->rq_arrival_time.tv_sec); + newdl = req->rq_arrival_time.tv_sec + + at_get(&svcpt->scp_at_estimate); + } + + /* + * Check to see if we've actually increased the deadline - + * we may be past adaptive_max + */ + if (req->rq_deadline >= newdl) { + DEBUG_REQ(D_WARNING, req, + "Could not add any time (%d/%lld), not sending early reply", + olddl, newdl - ktime_get_real_seconds()); + RETURN(-ETIMEDOUT); + } + + reqcopy = ptlrpc_request_cache_alloc(GFP_NOFS); + if (reqcopy == NULL) + RETURN(-ENOMEM); + OBD_ALLOC_LARGE(reqmsg, req->rq_reqlen); + if (!reqmsg) + GOTO(out_free, rc = -ENOMEM); + + *reqcopy = *req; + spin_lock_init(&reqcopy->rq_early_free_lock); + reqcopy->rq_reply_state = NULL; + reqcopy->rq_rep_swab_mask = 0; + reqcopy->rq_pack_bulk = 0; + reqcopy->rq_pack_udesc = 0; + reqcopy->rq_packed_final = 0; + sptlrpc_svc_ctx_addref(reqcopy); + /* We only need the reqmsg for the magic */ + reqcopy->rq_reqmsg = reqmsg; + memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen); + + /* + * tgt_brw_read() and tgt_brw_write() may have decided not to reply. + * Without this check, we would fail the rq_no_reply assertion in + * ptlrpc_send_reply(). + */ + if (reqcopy->rq_no_reply) + GOTO(out, rc = -ETIMEDOUT); + + LASSERT(atomic_read(&req->rq_refcount)); + /* if it is last refcount then early reply isn't needed */ + if (atomic_read(&req->rq_refcount) == 1) { + DEBUG_REQ(D_ADAPTTO, reqcopy, + "Normal reply already sent, abort early reply"); + GOTO(out, rc = -EINVAL); + } + + /* Connection ref */ + reqcopy->rq_export = class_conn2export( + lustre_msg_get_handle(reqcopy->rq_reqmsg)); + if (reqcopy->rq_export == NULL) + GOTO(out, rc = -ENODEV); + + /* RPC ref */ + class_export_rpc_inc(reqcopy->rq_export); + if (reqcopy->rq_export->exp_obd && + reqcopy->rq_export->exp_obd->obd_fail) + GOTO(out_put, rc = -ENODEV); + + rc = lustre_pack_reply_flags(reqcopy, 1, NULL, NULL, LPRFL_EARLY_REPLY); + if (rc) + GOTO(out_put, rc); + + rc = ptlrpc_send_reply(reqcopy, PTLRPC_REPLY_EARLY); + + if (!rc) { + /* Adjust our own deadline to what we told the client */ + req->rq_deadline = newdl; + req->rq_early_count++; /* number sent, server side */ + } else { + DEBUG_REQ(D_ERROR, req, "Early reply send failed: rc = %d", rc); + } + + /* + * Free the (early) reply state from lustre_pack_reply. 
+ * (ptlrpc_send_reply takes it's own rs ref, so this is safe here) + */ + ptlrpc_req_drop_rs(reqcopy); + +out_put: + class_export_rpc_dec(reqcopy->rq_export); + class_export_put(reqcopy->rq_export); +out: + sptlrpc_svc_ctx_decref(reqcopy); + OBD_FREE_LARGE(reqmsg, req->rq_reqlen); +out_free: + ptlrpc_request_cache_free(reqcopy); + RETURN(rc); +} + +/* + * Send early replies to everybody expiring within at_early_margin + * asking for at_extra time + */ +static int ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_at_array *array = &svcpt->scp_at_array; + struct ptlrpc_request *rq, *n; + LIST_HEAD(work_list); + __u32 index, count; + time64_t deadline; + time64_t now = ktime_get_real_seconds(); + s64 delay_ms; + int first, counter = 0; + + ENTRY; + spin_lock(&svcpt->scp_at_lock); + if (svcpt->scp_at_check == 0) { + spin_unlock(&svcpt->scp_at_lock); + RETURN(0); + } + delay_ms = ktime_ms_delta(ktime_get(), svcpt->scp_at_checktime); + svcpt->scp_at_check = 0; + + if (array->paa_count == 0) { + spin_unlock(&svcpt->scp_at_lock); + RETURN(0); + } + + /* The timer went off, but maybe the nearest rpc already completed. */ + first = array->paa_deadline - now; + if (first > at_early_margin) { + /* We've still got plenty of time. Reset the timer. */ + ptlrpc_at_set_timer(svcpt); + spin_unlock(&svcpt->scp_at_lock); + RETURN(0); + } + + /* + * We're close to a timeout, and we don't know how much longer the + * server will take. Send early replies to everyone expiring soon. + */ + deadline = -1; + div_u64_rem(array->paa_deadline, array->paa_size, &index); + count = array->paa_count; + while (count > 0) { + count -= array->paa_reqs_count[index]; + list_for_each_entry_safe(rq, n, + &array->paa_reqs_array[index], + rq_timed_list) { + if (rq->rq_deadline > now + at_early_margin) { + /* update the earliest deadline */ + if (deadline == -1 || + rq->rq_deadline < deadline) + deadline = rq->rq_deadline; + break; + } + + /** + * ptlrpc_server_drop_request() may drop + * refcount to 0 already. Let's check this and + * don't add entry to work_list + */ + if (likely(atomic_inc_not_zero(&rq->rq_refcount))) { + ptlrpc_at_remove_timed(rq); + list_add(&rq->rq_timed_list, &work_list); + } else { + ptlrpc_at_remove_timed(rq); + } + + counter++; + } + + if (++index >= array->paa_size) + index = 0; + } + array->paa_deadline = deadline; + /* we have a new earliest deadline, restart the timer */ + ptlrpc_at_set_timer(svcpt); + + spin_unlock(&svcpt->scp_at_lock); + + CDEBUG(D_ADAPTTO, + "timeout in %+ds, asking for %d secs on %d early replies\n", + first, at_extra, counter); + if (first < 0) { + /* + * We're already past request deadlines before we even get a + * chance to send early replies + */ + LCONSOLE_WARN("%s: This server is not able to keep up with request traffic (cpu-bound).\n", + svcpt->scp_service->srv_name); + CWARN("earlyQ=%d reqQ=%d recA=%d, svcEst=%d, delay=%lldms\n", + counter, svcpt->scp_nreqs_incoming, + svcpt->scp_nreqs_active, + at_get(&svcpt->scp_at_estimate), delay_ms); + } + + /* + * we took additional refcount so entries can't be deleted from list, no + * locking is needed + */ + while ((rq = list_first_entry_or_null(&work_list, + struct ptlrpc_request, + rq_timed_list)) != NULL) { + list_del_init(&rq->rq_timed_list); + + if (ptlrpc_at_send_early_reply(rq) == 0) + ptlrpc_at_add_timed(rq); + + ptlrpc_server_drop_request(rq); + } + + RETURN(1); /* return "did_something" for liblustre */ +} + +/* + * Check if we are already handling earlier incarnation of this request. 
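+ * A resent request (MSG_RESENT) carries the xid of the original, so a
+ * matching xid on either of the export's RPC lists means the first
+ * incarnation is still being processed.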
+ * Called under &req->rq_export->exp_rpc_lock locked + */ +static struct ptlrpc_request* +ptlrpc_server_check_resend_in_progress(struct ptlrpc_request *req) +{ + struct ptlrpc_request *tmp = NULL; + + if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT)) + return NULL; + + /* + * This list should not be longer than max_requests in + * flights on the client, so it is not all that long. + * Also we only hit this codepath in case of a resent + * request which makes it even more rarely hit + */ + list_for_each_entry(tmp, &req->rq_export->exp_reg_rpcs, + rq_exp_list) { + /* Found duplicate one */ + if (tmp->rq_xid == req->rq_xid) + goto found; + } + list_for_each_entry(tmp, &req->rq_export->exp_hp_rpcs, + rq_exp_list) { + /* Found duplicate one */ + if (tmp->rq_xid == req->rq_xid) + goto found; + } + return NULL; + +found: + DEBUG_REQ(D_HA, req, "Found duplicate req in processing"); + DEBUG_REQ(D_HA, tmp, "Request being processed"); + return tmp; +} + +#ifdef HAVE_SERVER_SUPPORT +static void ptlrpc_server_mark_obsolete(struct ptlrpc_request *req) +{ + req->rq_obsolete = 1; +} + +static void +ptlrpc_server_mark_in_progress_obsolete(struct ptlrpc_request *req) +{ + struct ptlrpc_request *tmp = NULL; + __u16 tag; + + if (!tgt_is_increasing_xid_client(req->rq_export) || + req->rq_export->exp_used_slots == NULL) + return; + + tag = lustre_msg_get_tag(req->rq_reqmsg); + if (tag == 0) + return; + + if (!test_bit(tag - 1, req->rq_export->exp_used_slots)) + return; + + /* This list should not be longer than max_requests in + * flights on the client, so it is not all that long. + * Also we only hit this codepath in case of a resent + * request which makes it even more rarely hit */ + list_for_each_entry(tmp, &req->rq_export->exp_reg_rpcs, rq_exp_list) { + if (tag == lustre_msg_get_tag(tmp->rq_reqmsg) && + req->rq_xid > tmp->rq_xid) + ptlrpc_server_mark_obsolete(tmp); + + } + list_for_each_entry(tmp, &req->rq_export->exp_hp_rpcs, rq_exp_list) { + if (tag == lustre_msg_get_tag(tmp->rq_reqmsg) && + req->rq_xid > tmp->rq_xid) + ptlrpc_server_mark_obsolete(tmp); + } +} +#endif + +/** + * Check if a request should be assigned with a high priority. + * + * \retval < 0: error occurred + * 0: normal RPC request + * +1: high priority request + */ +static int ptlrpc_server_hpreq_init(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req) +{ + int rc = 0; + + ENTRY; + if (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL) { + rc = svcpt->scp_service->srv_ops.so_hpreq_handler(req); + if (rc < 0) + RETURN(rc); + + LASSERT(rc == 0); + } + + if (req->rq_export != NULL && req->rq_ops != NULL) { + /* + * Perform request specific check. We should do this + * check before the request is added into exp_hp_rpcs + * list otherwise it may hit swab race at LU-1044. + */ + if (req->rq_ops->hpreq_check != NULL) { + rc = req->rq_ops->hpreq_check(req); + if (rc == -ESTALE) { + req->rq_status = rc; + ptlrpc_error(req); + } + /* + * can only return error, + * 0 for normal request, + * or 1 for high priority request + */ + LASSERT(rc <= 1); + } + } + + RETURN(rc); +} + +/** Remove the request from the export list. */ +static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req) +{ + ENTRY; + if (req->rq_export) { + /* + * refresh lock timeout again so that client has more + * room to send lock cancel RPC. 
+		 */
+		if (req->rq_ops && req->rq_ops->hpreq_fini)
+			req->rq_ops->hpreq_fini(req);
+
+		ptlrpc_del_exp_list(req);
+	}
+	EXIT;
+}
+
+static int ptlrpc_hpreq_check(struct ptlrpc_request *req)
+{
+	return 1;
+}
+
+static struct ptlrpc_hpreq_ops ptlrpc_hpreq_common = {
+	.hpreq_check = ptlrpc_hpreq_check,
+};
+
+/* Hi-Priority RPC check by RPC operation code. */
+int ptlrpc_hpreq_handler(struct ptlrpc_request *req)
+{
+	int opc = lustre_msg_get_opc(req->rq_reqmsg);
+
+	/*
+	 * Check for export to let only reconnects for not yet evicted
+	 * export to become a HP rpc.
+	 */
+	if ((req->rq_export != NULL) &&
+	    (opc == OBD_PING || opc == MDS_CONNECT || opc == OST_CONNECT))
+		req->rq_ops = &ptlrpc_hpreq_common;
+
+	return 0;
+}
+EXPORT_SYMBOL(ptlrpc_hpreq_handler);
+
+static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt,
+				     struct ptlrpc_request *req)
+{
+	int rc;
+	bool hp;
+	struct ptlrpc_request *orig;
+
+	ENTRY;
+
+	rc = ptlrpc_server_hpreq_init(svcpt, req);
+	if (rc < 0)
+		RETURN(rc);
+
+	hp = rc > 0;
+	ptlrpc_nrs_req_initialize(svcpt, req, hp);
+
+	while (req->rq_export != NULL) {
+		struct obd_export *exp = req->rq_export;
+
+		/*
+		 * search for a duplicated xid and add to the list
+		 * atomically
+		 */
+		spin_lock_bh(&exp->exp_rpc_lock);
+#ifdef HAVE_SERVER_SUPPORT
+		ptlrpc_server_mark_in_progress_obsolete(req);
+#endif
+		orig = ptlrpc_server_check_resend_in_progress(req);
+		if (orig && OBD_FAIL_PRECHECK(OBD_FAIL_PTLRPC_RESEND_RACE)) {
+			spin_unlock_bh(&exp->exp_rpc_lock);
+
+			OBD_RACE(OBD_FAIL_PTLRPC_RESEND_RACE);
+			msleep(4 * MSEC_PER_SEC);
+			continue;
+		}
+
+		if (orig && likely(atomic_inc_not_zero(&orig->rq_refcount))) {
+			bool linked;
+
+			spin_unlock_bh(&exp->exp_rpc_lock);
+
+			/*
+			 * When the client resends a request and the server
+			 * still has the previous copy of it, we need to
+			 * update the deadlines, to be sure that the client
+			 * and the server have equal request deadlines.
+			 */
+
+			spin_lock(&orig->rq_rqbd->rqbd_svcpt->scp_at_lock);
+			linked = orig->rq_at_linked;
+			if (likely(linked))
+				ptlrpc_at_remove_timed(orig);
+			spin_unlock(&orig->rq_rqbd->rqbd_svcpt->scp_at_lock);
+			orig->rq_deadline = req->rq_deadline;
+			orig->rq_rep_mbits = req->rq_rep_mbits;
+			if (likely(linked))
+				ptlrpc_at_add_timed(orig);
+			ptlrpc_server_drop_request(orig);
+			ptlrpc_nrs_req_finalize(req);
+
+			/* don't mark slot unused for resend in progress */
+			req->rq_obsolete = 1;
+
+			RETURN(-EBUSY);
+		}
+
+		ptlrpc_add_exp_list_nolock(req, exp, hp || req->rq_ops != NULL);
+
+		spin_unlock_bh(&exp->exp_rpc_lock);
+		break;
+	}
+
+	/*
+	 * The current thread is no longer the processing thread for this
+	 * request, but the request is on the export's exp_hp_rpcs or
+	 * exp_reg_rpcs list and can be found there. Remove all relations
+	 * between the request and the old thread.
+ */ + req->rq_svc_thread->t_env->le_ses = NULL; + req->rq_svc_thread = NULL; + req->rq_session.lc_thread = NULL; + + ptlrpc_nrs_req_add(svcpt, req, hp); + + RETURN(0); +} + +/** + * Allow to handle high priority request + * User can call it w/o any lock but need to hold + * ptlrpc_service_part::scp_req_lock to get reliable result + */ +static bool ptlrpc_server_allow_high(struct ptlrpc_service_part *svcpt, + bool force) +{ + int running = svcpt->scp_nthrs_running; + + if (!nrs_svcpt_has_hp(svcpt)) + return false; + + if (force) + return true; + + if (ptlrpc_nrs_req_throttling_nolock(svcpt, true)) + return false; + + if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL && + CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) { + /* leave just 1 thread for normal RPCs */ + running = PTLRPC_NTHRS_INIT; + if (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL) + running += 1; + } + + if (svcpt->scp_nreqs_active >= running - 1) + return false; + + if (svcpt->scp_nhreqs_active == 0) + return true; + + return !ptlrpc_nrs_req_pending_nolock(svcpt, false) || + svcpt->scp_hreq_count < svcpt->scp_service->srv_hpreq_ratio; +} + +static bool ptlrpc_server_high_pending(struct ptlrpc_service_part *svcpt, + bool force) +{ + return ptlrpc_server_allow_high(svcpt, force) && + ptlrpc_nrs_req_pending_nolock(svcpt, true); +} + +/** + * Only allow normal priority requests on a service that has a high-priority + * queue if forced (i.e. cleanup), if there are other high priority requests + * already being processed (i.e. those threads can service more high-priority + * requests), or if there are enough idle threads that a later thread can do + * a high priority request. + * User can call it w/o any lock but need to hold + * ptlrpc_service_part::scp_req_lock to get reliable result + */ +static bool ptlrpc_server_allow_normal(struct ptlrpc_service_part *svcpt, + bool force) +{ + int running = svcpt->scp_nthrs_running; + + if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL && + CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) { + /* leave just 1 thread for normal RPCs */ + running = PTLRPC_NTHRS_INIT; + if (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL) + running += 1; + } + + if (force) + return true; + + if (ptlrpc_nrs_req_throttling_nolock(svcpt, false)) + return false; + + if (svcpt->scp_nreqs_active < running - 2) + return true; + + if (svcpt->scp_nreqs_active >= running - 1) + return false; + + return svcpt->scp_nhreqs_active > 0 || !nrs_svcpt_has_hp(svcpt); +} + +static bool ptlrpc_server_normal_pending(struct ptlrpc_service_part *svcpt, + bool force) +{ + return ptlrpc_server_allow_normal(svcpt, force) && + ptlrpc_nrs_req_pending_nolock(svcpt, false); +} + +/** + * Returns true if there are requests available in incoming + * request queue for processing and it is allowed to fetch them. + * User can call it w/o any lock but need to hold ptlrpc_service::scp_req_lock + * to get reliable result + * \see ptlrpc_server_allow_normal + * \see ptlrpc_server_allow high + */ +static inline +bool ptlrpc_server_request_pending(struct ptlrpc_service_part *svcpt, + bool force) +{ + return ptlrpc_server_high_pending(svcpt, force) || + ptlrpc_server_normal_pending(svcpt, force); +} + +/** + * Fetch a request for processing from queue of unprocessed requests. + * Favors high-priority requests. + * Returns a pointer to fetched request. 
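+ * To avoid starving normal requests, roughly at most srv_hpreq_ratio
+ * high-priority requests are served in a row while normal requests are
+ * waiting (tracked by scp_hreq_count).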
+ */ +static struct ptlrpc_request * +ptlrpc_server_request_get(struct ptlrpc_service_part *svcpt, bool force) +{ + struct ptlrpc_request *req = NULL; + + ENTRY; + + spin_lock(&svcpt->scp_req_lock); + + if (ptlrpc_server_high_pending(svcpt, force)) { + req = ptlrpc_nrs_req_get_nolock(svcpt, true, force); + if (req != NULL) { + svcpt->scp_hreq_count++; + goto got_request; + } + } + + if (ptlrpc_server_normal_pending(svcpt, force)) { + req = ptlrpc_nrs_req_get_nolock(svcpt, false, force); + if (req != NULL) { + svcpt->scp_hreq_count = 0; + goto got_request; + } + } + + spin_unlock(&svcpt->scp_req_lock); + RETURN(NULL); + +got_request: + svcpt->scp_nreqs_active++; + if (req->rq_hp) + svcpt->scp_nhreqs_active++; + + spin_unlock(&svcpt->scp_req_lock); + + if (likely(req->rq_export)) + class_export_rpc_inc(req->rq_export); + + RETURN(req); +} + +/** + * Handle freshly incoming reqs, add to timed early reply list, + * pass on to regular request queue. + * All incoming requests pass through here before getting into + * ptlrpc_server_handle_req later on. + */ +static int ptlrpc_server_handle_req_in(struct ptlrpc_service_part *svcpt, + struct ptlrpc_thread *thread) +{ + struct ptlrpc_service *svc = svcpt->scp_service; + struct ptlrpc_request *req; + __u32 deadline; + __u32 opc; + int rc; + + ENTRY; + + spin_lock(&svcpt->scp_lock); + if (list_empty(&svcpt->scp_req_incoming)) { + spin_unlock(&svcpt->scp_lock); + RETURN(0); + } + + req = list_first_entry(&svcpt->scp_req_incoming, + struct ptlrpc_request, rq_list); + list_del_init(&req->rq_list); + svcpt->scp_nreqs_incoming--; + /* + * Consider this still a "queued" request as far as stats are + * concerned + */ + spin_unlock(&svcpt->scp_lock); + + /* go through security check/transform */ + rc = sptlrpc_svc_unwrap_request(req); + switch (rc) { + case SECSVC_OK: + break; + case SECSVC_COMPLETE: + target_send_reply(req, 0, OBD_FAIL_MDS_ALL_REPLY_NET); + goto err_req; + case SECSVC_DROP: + goto err_req; + default: + LBUG(); + } + + /* + * for null-flavored rpc, msg has been unpacked by sptlrpc, although + * redo it wouldn't be harmful. 
+ */ + if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) { + rc = ptlrpc_unpack_req_msg(req, req->rq_reqlen); + if (rc != 0) { + CERROR("error unpacking request: ptl %d from %s x%llu\n", + svc->srv_req_portal, libcfs_id2str(req->rq_peer), + req->rq_xid); + goto err_req; + } + } + + rc = lustre_unpack_req_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF); + if (rc) { + CERROR("error unpacking ptlrpc body: ptl %d from %s x %llu\n", + svc->srv_req_portal, libcfs_id2str(req->rq_peer), + req->rq_xid); + goto err_req; + } + + opc = lustre_msg_get_opc(req->rq_reqmsg); + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_REQ_OPC) && + opc == cfs_fail_val) { + CERROR("drop incoming rpc opc %u, x%llu\n", + cfs_fail_val, req->rq_xid); + goto err_req; + } + + rc = -EINVAL; + if (lustre_msg_get_type(req->rq_reqmsg) != PTL_RPC_MSG_REQUEST) { + CERROR("wrong packet type received (type=%u) from %s\n", + lustre_msg_get_type(req->rq_reqmsg), + libcfs_id2str(req->rq_peer)); + goto err_req; + } + + switch (opc) { + case MDS_WRITEPAGE: + case OST_WRITE: + case OUT_UPDATE: + req->rq_bulk_write = 1; + break; + case MDS_READPAGE: + case OST_READ: + case MGS_CONFIG_READ: + req->rq_bulk_read = 1; + break; + } + + CDEBUG(D_RPCTRACE, "got req x%llu\n", req->rq_xid); + + req->rq_export = class_conn2export( + lustre_msg_get_handle(req->rq_reqmsg)); + if (req->rq_export) { + rc = ptlrpc_check_req(req); + if (rc == 0) { + rc = sptlrpc_target_export_check(req->rq_export, req); + if (rc) + DEBUG_REQ(D_ERROR, req, + "DROPPING req with illegal security flavor"); + } + + if (rc) + goto err_req; + ptlrpc_update_export_timer(req->rq_export, 0); + } + + /* req_in handling should/must be fast */ + if (ktime_get_real_seconds() - req->rq_arrival_time.tv_sec > 5) + DEBUG_REQ(D_WARNING, req, "Slow req_in handling %llds", + ktime_get_real_seconds() - + req->rq_arrival_time.tv_sec); + + /* Set rpc server deadline and add it to the timed list */ + deadline = (lustre_msghdr_get_flags(req->rq_reqmsg) & + MSGHDR_AT_SUPPORT) ? 
+ /* The max time the client expects us to take */ + lustre_msg_get_timeout(req->rq_reqmsg) : obd_timeout; + + req->rq_deadline = req->rq_arrival_time.tv_sec + deadline; + if (unlikely(deadline == 0)) { + DEBUG_REQ(D_ERROR, req, "Dropping request with 0 timeout"); + goto err_req; + } + + /* Skip early reply */ + if (OBD_FAIL_PRECHECK(OBD_FAIL_MDS_RESEND)) + req->rq_deadline += obd_timeout; + + req->rq_svc_thread = thread; + if (thread != NULL) { + /* + * initialize request session, it is needed for request + * processing by target + */ + rc = lu_context_init(&req->rq_session, LCT_SERVER_SESSION | + LCT_NOREF); + if (rc) { + CERROR("%s: failure to initialize session: rc = %d\n", + thread->t_name, rc); + goto err_req; + } + req->rq_session.lc_thread = thread; + lu_context_enter(&req->rq_session); + thread->t_env->le_ses = &req->rq_session; + } + + + if (unlikely(OBD_FAIL_PRECHECK(OBD_FAIL_PTLRPC_ENQ_RESEND) && + (opc == LDLM_ENQUEUE) && + (lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT))) + OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_ENQ_RESEND, 6); + + ptlrpc_at_add_timed(req); + + if (opc != OST_CONNECT && opc != MDS_CONNECT && + opc != MGS_CONNECT && req->rq_export != NULL) { + if (exp_connect_flags2(req->rq_export) & OBD_CONNECT2_REP_MBITS) + req->rq_rep_mbits = lustre_msg_get_mbits(req->rq_reqmsg); + } + + /* Move it over to the request processing queue */ + rc = ptlrpc_server_request_add(svcpt, req); + if (rc) + GOTO(err_req, rc); + + wake_up(&svcpt->scp_waitq); + RETURN(1); + +err_req: + ptlrpc_server_finish_request(svcpt, req); + + RETURN(1); +} + +/** + * Main incoming request handling logic. + * Calls handler function from service to do actual processing. + */ +static int ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt, + struct ptlrpc_thread *thread) +{ + struct ptlrpc_service *svc = svcpt->scp_service; + struct ptlrpc_request *request; + ktime_t work_start; + ktime_t work_end; + ktime_t arrived; + s64 timediff_usecs; + s64 arrived_usecs; + int fail_opc = 0; + + ENTRY; + + request = ptlrpc_server_request_get(svcpt, false); + if (request == NULL) + RETURN(0); + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT)) + fail_opc = OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT; + else if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT)) + fail_opc = OBD_FAIL_PTLRPC_HPREQ_TIMEOUT; + + if (unlikely(fail_opc)) { + if (request->rq_export && request->rq_ops) + OBD_FAIL_TIMEOUT(fail_opc, 4); + } + + ptlrpc_rqphase_move(request, RQ_PHASE_INTERPRET); + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DUMP_LOG)) + libcfs_debug_dumplog(); + + work_start = ktime_get_real(); + arrived = timespec64_to_ktime(request->rq_arrival_time); + timediff_usecs = ktime_us_delta(work_start, arrived); + if (likely(svc->srv_stats != NULL)) { + lprocfs_counter_add(svc->srv_stats, PTLRPC_REQWAIT_CNTR, + timediff_usecs); + lprocfs_counter_add(svc->srv_stats, PTLRPC_REQQDEPTH_CNTR, + svcpt->scp_nreqs_incoming); + lprocfs_counter_add(svc->srv_stats, PTLRPC_REQACTIVE_CNTR, + svcpt->scp_nreqs_active); + lprocfs_counter_add(svc->srv_stats, PTLRPC_TIMEOUT, + at_get(&svcpt->scp_at_estimate)); + } + + if (likely(request->rq_export)) { + if (unlikely(ptlrpc_check_req(request))) + goto put_conn; + ptlrpc_update_export_timer(request->rq_export, + div_u64(timediff_usecs, + USEC_PER_SEC / 2)); + } + + /* + * Discard requests queued for longer than the deadline. + * The deadline is increased if we send an early reply. 
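+	 * By now the client has most likely timed out and resent, so
+	 * handling the stale copy would only waste a service thread.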
+ */ + if (ktime_get_real_seconds() > request->rq_deadline) { + DEBUG_REQ(D_ERROR, request, + "Dropping timed-out request from %s: deadline %lld/%llds ago", + libcfs_id2str(request->rq_peer), + request->rq_deadline - + request->rq_arrival_time.tv_sec, + ktime_get_real_seconds() - request->rq_deadline); + goto put_conn; + } + + CDEBUG(D_RPCTRACE, + "Handling RPC req@%p pname:cluuid+ref:pid:xid:nid:opc:job %s:%s+%d:%d:x%llu:%s:%d:%s\n", + request, current->comm, + (request->rq_export ? + (char *)request->rq_export->exp_client_uuid.uuid : "0"), + (request->rq_export ? + refcount_read(&request->rq_export->exp_handle.h_ref) : -99), + lustre_msg_get_status(request->rq_reqmsg), request->rq_xid, + libcfs_id2str(request->rq_peer), + lustre_msg_get_opc(request->rq_reqmsg), + lustre_msg_get_jobid(request->rq_reqmsg) ?: ""); + + if (lustre_msg_get_opc(request->rq_reqmsg) != OBD_PING) + CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_PAUSE_REQ, cfs_fail_val); + + CDEBUG(D_NET, "got req %llu\n", request->rq_xid); + + /* re-assign request and sesson thread to the current one */ + request->rq_svc_thread = thread; + if (thread != NULL) { + LASSERT(request->rq_session.lc_thread == NULL); + request->rq_session.lc_thread = thread; + thread->t_env->le_ses = &request->rq_session; + } + svc->srv_ops.so_req_handler(request); + + ptlrpc_rqphase_move(request, RQ_PHASE_COMPLETE); + +put_conn: + if (unlikely(ktime_get_real_seconds() > request->rq_deadline)) { + DEBUG_REQ(D_WARNING, request, + "Request took longer than estimated (%lld/%llds); client may timeout", + request->rq_deadline - + request->rq_arrival_time.tv_sec, + ktime_get_real_seconds() - request->rq_deadline); + } + + work_end = ktime_get_real(); + timediff_usecs = ktime_us_delta(work_end, work_start); + arrived_usecs = ktime_us_delta(work_end, arrived); + CDEBUG(D_RPCTRACE, + "Handled RPC req@%p pname:cluuid+ref:pid:xid:nid:opc:job %s:%s+%d:%d:x%llu:%s:%d:%s Request processed in %lldus (%lldus total) trans %llu rc %d/%d\n", + request, current->comm, + (request->rq_export ? + (char *)request->rq_export->exp_client_uuid.uuid : "0"), + (request->rq_export ? + refcount_read(&request->rq_export->exp_handle.h_ref) : -99), + lustre_msg_get_status(request->rq_reqmsg), + request->rq_xid, + libcfs_id2str(request->rq_peer), + lustre_msg_get_opc(request->rq_reqmsg), + lustre_msg_get_jobid(request->rq_reqmsg) ?: "", + timediff_usecs, + arrived_usecs, + (request->rq_repmsg ? + lustre_msg_get_transno(request->rq_repmsg) : + request->rq_transno), + request->rq_status, + (request->rq_repmsg ? + lustre_msg_get_status(request->rq_repmsg) : -999)); + if (likely(svc->srv_stats != NULL && request->rq_reqmsg != NULL)) { + __u32 op = lustre_msg_get_opc(request->rq_reqmsg); + int opc = opcode_offset(op); + + if (opc > 0 && !(op == LDLM_ENQUEUE || op == MDS_REINT)) { + LASSERT(opc < LUSTRE_MAX_OPCODES); + lprocfs_counter_add(svc->srv_stats, + opc + EXTRA_MAX_OPCODES, + timediff_usecs); + } + } + if (unlikely(request->rq_early_count)) { + DEBUG_REQ(D_ADAPTTO, request, + "sent %d early replies before finishing in %llds", + request->rq_early_count, + div_u64(arrived_usecs, USEC_PER_SEC)); + } + + ptlrpc_server_finish_active_request(svcpt, request); + + RETURN(1); +} + +/** + * An internal function to process a single reply state object. 
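+ * Returns 1 once the reply state has been handled (and possibly
+ * freed), or 0 when its locks were only downgraded to COS mode and it
+ * was handed back for the commit callback to reschedule.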
+ */
+static int ptlrpc_handle_rs(struct ptlrpc_reply_state *rs)
+{
+	struct ptlrpc_service_part *svcpt = rs->rs_svcpt;
+	struct ptlrpc_service *svc = svcpt->scp_service;
+	struct obd_export *exp;
+	int nlocks;
+	int been_handled;
+
+	ENTRY;
+
+	exp = rs->rs_export;
+
+	LASSERT(rs->rs_difficult);
+	LASSERT(rs->rs_scheduled);
+	LASSERT(list_empty(&rs->rs_list));
+
+	/*
+	 * The disk commit callback holds exp_uncommitted_replies_lock while it
+	 * iterates over newly committed replies, removing them from
+	 * exp_uncommitted_replies. It then drops this lock and schedules the
+	 * replies it found for handling here.
+	 *
+	 * We can avoid contention for exp_uncommitted_replies_lock between the
+	 * HRT threads and further commit callbacks by checking rs_committed
+	 * which is set in the commit callback while it holds both
+	 * rs_lock and exp_uncommitted_replies_lock.
+	 *
+	 * If we see rs_committed clear, the commit callback _may_ not have
+	 * handled this reply yet and we race with it to grab
+	 * exp_uncommitted_replies_lock before removing the reply from
+	 * exp_uncommitted_replies. Note that if we lose the race and the
+	 * reply has already been removed, list_del_init() is a noop.
+	 *
+	 * If we see rs_committed set, we know the commit callback is handling,
+	 * or has handled, this reply since store reordering might allow us to
+	 * see rs_committed set out of sequence. But since this is done
+	 * holding rs_lock, we can be sure it has all completed once we hold
+	 * rs_lock, which we do right next.
+	 */
+	if (!rs->rs_committed) {
+		/*
+		 * if rs was committed, there is no need to convert locks;
+		 * don't check rs_committed here because rs may never be
+		 * added into exp_uncommitted_replies and this flag may
+		 * never be set, see target_send_reply()
+		 */
+		if (rs->rs_convert_lock &&
+		    rs->rs_transno > exp->exp_last_committed) {
+			struct ldlm_lock *lock;
+			struct ldlm_lock *ack_locks[RS_MAX_LOCKS] = { NULL };
+
+			spin_lock(&rs->rs_lock);
+			if (rs->rs_convert_lock &&
+			    rs->rs_transno > exp->exp_last_committed) {
+				nlocks = rs->rs_nlocks;
+				while (nlocks-- > 0) {
+					/*
+					 * NB don't assume rs is always handled
+					 * by the same service thread (see
+					 * ptlrpc_hr_select), so REP-ACK hr may
+					 * race with trans commit, while the
+					 * latter will release locks, get locks
+					 * here early to convert to COS mode
+					 * safely.
+					 */
+					lock = ldlm_handle2lock(
+							&rs->rs_locks[nlocks]);
+					LASSERT(lock);
+					ack_locks[nlocks] = lock;
+					rs->rs_modes[nlocks] = LCK_COS;
+				}
+				nlocks = rs->rs_nlocks;
+				rs->rs_convert_lock = 0;
+				/*
+				 * clear rs_scheduled so that commit callback
+				 * can schedule again
+				 */
+				rs->rs_scheduled = 0;
+				spin_unlock(&rs->rs_lock);
+
+				while (nlocks-- > 0) {
+					lock = ack_locks[nlocks];
+					ldlm_lock_mode_downgrade(lock, LCK_COS);
+					LDLM_LOCK_PUT(lock);
+				}
+				RETURN(0);
+			}
+			spin_unlock(&rs->rs_lock);
+		}
+
+		spin_lock(&exp->exp_uncommitted_replies_lock);
+		list_del_init(&rs->rs_obd_list);
+		spin_unlock(&exp->exp_uncommitted_replies_lock);
+	}
+
+	spin_lock(&exp->exp_lock);
+	/* Noop if removed already */
+	list_del_init(&rs->rs_exp_list);
+	spin_unlock(&exp->exp_lock);
+
+	spin_lock(&rs->rs_lock);
+
+	been_handled = rs->rs_handled;
+	rs->rs_handled = 1;
+
+	nlocks = rs->rs_nlocks;	/* atomic "steal", but */
+	rs->rs_nlocks = 0;	/* locks still on rs_locks!
*/ + + if (nlocks == 0 && !been_handled) { + /* + * If we see this, we should already have seen the warning + * in mds_steal_ack_locks() + */ + CDEBUG(D_HA, + "All locks stolen from rs %p x%lld.t%lld o%d NID %s\n", + rs, rs->rs_xid, rs->rs_transno, rs->rs_opc, + libcfs_nidstr(&exp->exp_connection->c_peer.nid)); + } + + if ((rs->rs_sent && !rs->rs_unlinked) || nlocks > 0) { + spin_unlock(&rs->rs_lock); + + /* We can unlink if the LNET_EVENT_SEND has occurred. + * If rs_unlinked is set then MD is already unlinked and no + * need to do so here. + */ + if ((rs->rs_sent && !rs->rs_unlinked)) { + LNetMDUnlink(rs->rs_md_h); + /* Ignore return code; we're racing with completion */ + } + + while (nlocks-- > 0) + ldlm_lock_decref(&rs->rs_locks[nlocks], + rs->rs_modes[nlocks]); + + spin_lock(&rs->rs_lock); + } + + rs->rs_scheduled = 0; + rs->rs_convert_lock = 0; + + if (rs->rs_unlinked) { + /* Off the net */ + spin_unlock(&rs->rs_lock); + + class_export_put(exp); + rs->rs_export = NULL; + ptlrpc_rs_decref(rs); + if (atomic_dec_and_test(&svcpt->scp_nreps_difficult) && + svc->srv_is_stopping) + wake_up_all(&svcpt->scp_waitq); + RETURN(1); + } + + /* still on the net; callback will schedule */ + spin_unlock(&rs->rs_lock); + RETURN(1); +} + + +static void ptlrpc_check_rqbd_pool(struct ptlrpc_service_part *svcpt) +{ + int avail = svcpt->scp_nrqbds_posted; + int low_water = test_req_buffer_pressure ? 0 : + svcpt->scp_service->srv_nbuf_per_group / 2; + + /* NB I'm not locking; just looking. */ + + /* + * CAVEAT EMPTOR: We might be allocating buffers here because we've + * allowed the request history to grow out of control. We could put a + * sanity check on that here and cull some history if we need the + * space. + */ + + if (avail <= low_water) + ptlrpc_grow_req_bufs(svcpt, 1); + + if (svcpt->scp_service->srv_stats) { + lprocfs_counter_add(svcpt->scp_service->srv_stats, + PTLRPC_REQBUF_AVAIL_CNTR, avail); + } +} + +static inline int ptlrpc_threads_enough(struct ptlrpc_service_part *svcpt) +{ + return svcpt->scp_nreqs_active < + svcpt->scp_nthrs_running - 1 - + (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL); +} + +/** + * allowed to create more threads + * user can call it w/o any lock but need to hold + * ptlrpc_service_part::scp_lock to get reliable result + */ +static inline int ptlrpc_threads_increasable(struct ptlrpc_service_part *svcpt) +{ + return svcpt->scp_nthrs_running + + svcpt->scp_nthrs_starting < + svcpt->scp_service->srv_nthrs_cpt_limit; +} + +/** + * too many requests and allowed to create more threads + */ +static inline int ptlrpc_threads_need_create(struct ptlrpc_service_part *svcpt) +{ + return !ptlrpc_threads_enough(svcpt) && + ptlrpc_threads_increasable(svcpt); +} + +static inline int ptlrpc_thread_stopping(struct ptlrpc_thread *thread) +{ + return thread_is_stopping(thread) || + thread->t_svcpt->scp_service->srv_is_stopping; +} + +/* stop the highest numbered thread if there are too many threads running */ +static inline bool ptlrpc_thread_should_stop(struct ptlrpc_thread *thread) +{ + struct ptlrpc_service_part *svcpt = thread->t_svcpt; + + return thread->t_id >= svcpt->scp_service->srv_nthrs_cpt_limit && + thread->t_id == svcpt->scp_thr_nextid - 1; +} + +static void ptlrpc_stop_thread(struct ptlrpc_thread *thread) +{ + CDEBUG(D_INFO, "Stopping thread %s #%u\n", + thread->t_svcpt->scp_service->srv_thread_name, thread->t_id); + thread_add_flags(thread, SVC_STOPPING); +} + +static inline void ptlrpc_thread_stop(struct ptlrpc_thread *thread) +{ + struct ptlrpc_service_part 
*svcpt = thread->t_svcpt;
+
+	spin_lock(&svcpt->scp_lock);
+	if (ptlrpc_thread_should_stop(thread)) {
+		ptlrpc_stop_thread(thread);
+		svcpt->scp_thr_nextid--;
+	}
+	spin_unlock(&svcpt->scp_lock);
+}
+
+static inline int ptlrpc_rqbd_pending(struct ptlrpc_service_part *svcpt)
+{
+	return !list_empty(&svcpt->scp_rqbd_idle) &&
+	       svcpt->scp_rqbd_timeout == 0;
+}
+
+static inline int
+ptlrpc_at_check(struct ptlrpc_service_part *svcpt)
+{
+	return svcpt->scp_at_check;
+}
+
+/*
+ * If a thread runs too long or spends too much time on a single request,
+ * we want to know about it, so we set up a delayed work item as a watchdog.
+ * If it fires, we display a stack trace of the delayed thread,
+ * provided we aren't rate-limited.
+ *
+ * Watchdog stack traces are limited to 3 per 'libcfs_watchdog_ratelimit'
+ * seconds
+ */
+static struct ratelimit_state watchdog_limit;
+
+static void ptlrpc_watchdog_fire(struct work_struct *w)
+{
+	struct ptlrpc_thread *thread = container_of(w, struct ptlrpc_thread,
+						    t_watchdog.work);
+	u64 ms_lapse = ktime_ms_delta(ktime_get(), thread->t_touched);
+	u32 ms_frac = do_div(ms_lapse, MSEC_PER_SEC);
+
+	/* ___ratelimit() returns true if the action is NOT ratelimited */
+	if (__ratelimit(&watchdog_limit)) {
+		/* below message is checked in sanity-quota.sh test_6,18 */
+		LCONSOLE_WARN("%s: service thread pid %u was inactive for %llu.%03u seconds. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:\n",
+			      thread->t_task->comm, thread->t_task->pid,
+			      ms_lapse, ms_frac);
+
+		libcfs_debug_dumpstack(thread->t_task);
+	} else {
+		/* below message is checked in sanity-quota.sh test_6,18 */
+		LCONSOLE_WARN("%s: service thread pid %u was inactive for %llu.%03u seconds. Watchdog stack traces are limited to 3 per %u seconds, skipping this one.\n",
+			      thread->t_task->comm, thread->t_task->pid,
+			      ms_lapse, ms_frac, libcfs_watchdog_ratelimit);
+	}
+}
+
+void ptlrpc_watchdog_init(struct delayed_work *work, timeout_t timeout)
+{
+	INIT_DELAYED_WORK(work, ptlrpc_watchdog_fire);
+	schedule_delayed_work(work, cfs_time_seconds(timeout));
+}
+
+void ptlrpc_watchdog_disable(struct delayed_work *work)
+{
+	cancel_delayed_work_sync(work);
+}
+
+void ptlrpc_watchdog_touch(struct delayed_work *work, timeout_t timeout)
+{
+	struct ptlrpc_thread *thread = container_of(&work->work,
+						    struct ptlrpc_thread,
+						    t_watchdog.work);
+	thread->t_touched = ktime_get();
+	mod_delayed_work(system_wq, work, cfs_time_seconds(timeout));
+}
+
+/**
+ * requests wait on preprocessing
+ * user can call it w/o any lock but need to hold
+ * ptlrpc_service_part::scp_lock to get reliable result
+ */
+static inline int
+ptlrpc_server_request_incoming(struct ptlrpc_service_part *svcpt)
+{
+	return !list_empty(&svcpt->scp_req_incoming);
+}
+
+static __attribute__((__noinline__)) int
+ptlrpc_wait_event(struct ptlrpc_service_part *svcpt,
+		  struct ptlrpc_thread *thread)
+{
+	ptlrpc_watchdog_disable(&thread->t_watchdog);
+
+	cond_resched();
+
+	if (svcpt->scp_rqbd_timeout == 0)
+		/* Don't exit while there are replies to be handled */
+		wait_event_idle_exclusive_lifo(
+			svcpt->scp_waitq,
+			ptlrpc_thread_stopping(thread) ||
+			ptlrpc_server_request_incoming(svcpt) ||
+			ptlrpc_server_request_pending(svcpt, false) ||
+			ptlrpc_rqbd_pending(svcpt) ||
+			ptlrpc_at_check(svcpt));
+	else if (wait_event_idle_exclusive_lifo_timeout(
+			svcpt->scp_waitq,
+			ptlrpc_thread_stopping(thread) ||
+			ptlrpc_server_request_incoming(svcpt) ||
+			ptlrpc_server_request_pending(svcpt, false) ||
+			ptlrpc_rqbd_pending(svcpt) ||
+			ptlrpc_at_check(svcpt),
+			svcpt->scp_rqbd_timeout) == 0)
+		svcpt->scp_rqbd_timeout = 0;
+
+	if (ptlrpc_thread_stopping(thread))
+		return -EINTR;
+
+	ptlrpc_watchdog_touch(&thread->t_watchdog,
+			      ptlrpc_server_get_timeout(svcpt));
+	return 0;
+}
+
+/**
+ * Main thread body for service threads.
+ * Waits in a loop for new requests to process.
+ * Every time an incoming request is added to the queue, the waitq
+ * is woken up and one of the threads will handle it.
+ */
+static int ptlrpc_main(void *arg)
+{
+	struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg;
+	struct ptlrpc_service_part *svcpt = thread->t_svcpt;
+	struct ptlrpc_service *svc = svcpt->scp_service;
+	struct ptlrpc_reply_state *rs;
+	struct group_info *ginfo = NULL;
+	struct lu_env *env;
+	int counter = 0, rc = 0;
+
+	ENTRY;
+	unshare_fs_struct();
+
+	thread->t_task = current;
+	thread->t_pid = current->pid;
+
+	if (svc->srv_cpt_bind) {
+		rc = cfs_cpt_bind(svc->srv_cptable, svcpt->scp_cpt);
+		if (rc != 0) {
+			CWARN("%s: failed to bind %s on CPT %d\n",
+			      svc->srv_name, thread->t_name, svcpt->scp_cpt);
+		}
+	}
+
+	ginfo = groups_alloc(0);
+	if (!ginfo)
+		GOTO(out, rc = -ENOMEM);
+
+	set_current_groups(ginfo);
+	put_group_info(ginfo);
+
+	if (svc->srv_ops.so_thr_init != NULL) {
+		rc = svc->srv_ops.so_thr_init(thread);
+		if (rc)
+			GOTO(out, rc);
+	}
+
+	OBD_ALLOC_PTR(env);
+	if (env == NULL)
+		GOTO(out_srv_fini, rc = -ENOMEM);
+	rc = lu_env_add(env);
+	if (rc)
+		GOTO(out_env, rc);
+
+	rc = lu_context_init(&env->le_ctx,
+			     svc->srv_ctx_tags|LCT_REMEMBER|LCT_NOREF);
+	if (rc)
+		GOTO(out_env_remove, rc);
+
+	thread->t_env = env;
+	env->le_ctx.lc_thread = thread;
+	env->le_ctx.lc_cookie = 0x6;
+
+	while (!list_empty(&svcpt->scp_rqbd_idle)) {
+		rc = ptlrpc_server_post_idle_rqbds(svcpt);
+		if (rc >= 0)
+			continue;
+
+		CERROR("Failed to post rqbd for %s on CPT %d: %d\n",
+		       svc->srv_name, svcpt->scp_cpt, rc);
+		GOTO(out_ctx_fini, rc);
+	}
+
+	/* Alloc reply state structure for this one */
+	OBD_ALLOC_LARGE(rs, svc->srv_max_reply_size);
+	if (!rs)
+		GOTO(out_ctx_fini, rc = -ENOMEM);
+
+	spin_lock(&svcpt->scp_lock);
+
+	LASSERT(thread_is_starting(thread));
+	thread_clear_flags(thread, SVC_STARTING);
+
+	LASSERT(svcpt->scp_nthrs_starting == 1);
+	svcpt->scp_nthrs_starting--;
+
+	/*
+	 * SVC_STOPPING may already be set here if someone else is trying
+	 * to stop the service while this new thread has been dynamically
+	 * forked. We still set SVC_RUNNING to let our creator know that
+	 * we are now running, however we will exit as soon as possible
+	 */
+	thread_add_flags(thread, SVC_RUNNING);
+	svcpt->scp_nthrs_running++;
+	spin_unlock(&svcpt->scp_lock);
+
+	/* wake up our creator in case he's still waiting. */
+	wake_up(&thread->t_ctl_waitq);
+
+	thread->t_touched = ktime_get();
+	ptlrpc_watchdog_init(&thread->t_watchdog,
+			     ptlrpc_server_get_timeout(svcpt));
+
+	spin_lock(&svcpt->scp_rep_lock);
+	list_add(&rs->rs_list, &svcpt->scp_rep_idle);
+	wake_up(&svcpt->scp_rep_waitq);
+	spin_unlock(&svcpt->scp_rep_lock);
+
+	CDEBUG(D_NET, "service thread %d (#%d) started\n", thread->t_id,
+	       svcpt->scp_nthrs_running);
+
+	/* XXX maintain a list of all managed devices: insert here */
+	while (!ptlrpc_thread_stopping(thread)) {
+		if (ptlrpc_wait_event(svcpt, thread))
+			break;
+
+		ptlrpc_check_rqbd_pool(svcpt);
+
+		if (ptlrpc_threads_need_create(svcpt)) {
+			/* Ignore return code - we tried... */
+			ptlrpc_start_thread(svcpt, 0);
+		}
+
+		/* reset le_ses to initial state */
+		env->le_ses = NULL;
+		/* Refill the context before execution to make sure
+		 * all thread keys are allocated */
+		lu_env_refill(env);
+		/* Process all incoming reqs before handling any */
+		if (ptlrpc_server_request_incoming(svcpt)) {
+			lu_context_enter(&env->le_ctx);
+			ptlrpc_server_handle_req_in(svcpt, thread);
+			lu_context_exit(&env->le_ctx);
+
+			/* but limit ourselves in case of flood */
+			if (counter++ < 100)
+				continue;
+			counter = 0;
+		}
+
+		if (ptlrpc_at_check(svcpt))
+			ptlrpc_at_check_timed(svcpt);
+
+		if (ptlrpc_server_request_pending(svcpt, false)) {
+			lu_context_enter(&env->le_ctx);
+			ptlrpc_server_handle_request(svcpt, thread);
+			lu_context_exit(&env->le_ctx);
+		}
+
+		if (ptlrpc_rqbd_pending(svcpt) &&
+		    ptlrpc_server_post_idle_rqbds(svcpt) < 0) {
+			/*
+			 * I just failed to repost request buffers.
+			 * Wait for a timeout (unless something else
+			 * happens) before I try again
+			 */
+			svcpt->scp_rqbd_timeout = cfs_time_seconds(1) / 10;
+			CDEBUG(D_RPCTRACE, "Posted buffers: %d\n",
+			       svcpt->scp_nrqbds_posted);
+		}
+		/*
+		 * If the number of threads has been tuned downward and this
+		 * thread should be stopped, then stop in reverse order so
+		 * the threads always have contiguous thread index values.
+		 */
+		if (unlikely(ptlrpc_thread_should_stop(thread)))
+			ptlrpc_thread_stop(thread);
+	}
+
+	ptlrpc_watchdog_disable(&thread->t_watchdog);
+
+out_ctx_fini:
+	lu_context_fini(&env->le_ctx);
+out_env_remove:
+	lu_env_remove(env);
+out_env:
+	OBD_FREE_PTR(env);
+out_srv_fini:
+	/* deconstruct service thread state created by ptlrpc_start_thread() */
+	if (svc->srv_ops.so_thr_done != NULL)
+		svc->srv_ops.so_thr_done(thread);
+out:
+	CDEBUG(D_RPCTRACE, "%s: service thread [%p:%u] %d exiting: rc = %d\n",
+	       thread->t_name, thread, thread->t_pid, thread->t_id, rc);
+	spin_lock(&svcpt->scp_lock);
+	if (thread_test_and_clear_flags(thread, SVC_STARTING))
+		svcpt->scp_nthrs_starting--;
+
+	if (thread_test_and_clear_flags(thread, SVC_RUNNING)) {
+		/* must know immediately */
+		svcpt->scp_nthrs_running--;
+	}
+
+	thread->t_id = rc;
+	thread_add_flags(thread, SVC_STOPPED);
+
+	wake_up(&thread->t_ctl_waitq);
+	spin_unlock(&svcpt->scp_lock);
+
+	return rc;
+}
+
+static int hrt_dont_sleep(struct ptlrpc_hr_thread *hrt,
+			  struct list_head *replies)
+{
+	int result;
+
+	spin_lock(&hrt->hrt_lock);
+
+	list_splice_init(&hrt->hrt_queue, replies);
+	result = ptlrpc_hr.hr_stopping || !list_empty(replies);
+
+	spin_unlock(&hrt->hrt_lock);
+	return result;
+}
+
+/**
+ * Main body of "handle reply" function.
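+ * One instance runs in each dedicated reply-handling (HR) thread started
+ * by ptlrpc_start_hr_threads(); it drains the thread's per-partition
+ * queue and calls ptlrpc_handle_rs() on every reply state found there.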
+ * It processes acked reply states + */ +static int ptlrpc_hr_main(void *arg) +{ + struct ptlrpc_hr_thread *hrt = (struct ptlrpc_hr_thread *)arg; + struct ptlrpc_hr_partition *hrp = hrt->hrt_partition; + LIST_HEAD(replies); + struct lu_env *env; + int rc; + + unshare_fs_struct(); + OBD_ALLOC_PTR(env); + if (env == NULL) + RETURN(-ENOMEM); + + rc = cfs_cpt_bind(ptlrpc_hr.hr_cpt_table, hrp->hrp_cpt); + if (rc != 0) { + char threadname[20]; + + snprintf(threadname, sizeof(threadname), "ptlrpc_hr%02d_%03d", + hrp->hrp_cpt, hrt->hrt_id); + CWARN("Failed to bind %s on CPT %d of CPT table %p: rc = %d\n", + threadname, hrp->hrp_cpt, ptlrpc_hr.hr_cpt_table, rc); + } + + rc = lu_context_init(&env->le_ctx, LCT_MD_THREAD | LCT_DT_THREAD | + LCT_REMEMBER | LCT_NOREF); + if (rc) + GOTO(out_env, rc); + + rc = lu_env_add(env); + if (rc) + GOTO(out_ctx_fini, rc); + + atomic_inc(&hrp->hrp_nstarted); + wake_up(&ptlrpc_hr.hr_waitq); + + while (!ptlrpc_hr.hr_stopping) { + wait_event_idle(hrt->hrt_waitq, hrt_dont_sleep(hrt, &replies)); + + while (!list_empty(&replies)) { + struct ptlrpc_reply_state *rs; + + rs = list_entry(replies.prev, + struct ptlrpc_reply_state, + rs_list); + list_del_init(&rs->rs_list); + /* refill keys if needed */ + lu_env_refill(env); + lu_context_enter(&env->le_ctx); + ptlrpc_handle_rs(rs); + lu_context_exit(&env->le_ctx); + } + } + + atomic_inc(&hrp->hrp_nstopped); + wake_up(&ptlrpc_hr.hr_waitq); + + lu_env_remove(env); +out_ctx_fini: + lu_context_fini(&env->le_ctx); +out_env: + OBD_FREE_PTR(env); + return 0; +} + +static void ptlrpc_stop_hr_threads(void) +{ + struct ptlrpc_hr_partition *hrp; + int i; + int j; + + ptlrpc_hr.hr_stopping = 1; + + cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { + if (hrp->hrp_thrs == NULL) + continue; /* uninitialized */ + for (j = 0; j < hrp->hrp_nthrs; j++) + wake_up(&hrp->hrp_thrs[j].hrt_waitq); + } + + cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { + if (hrp->hrp_thrs == NULL) + continue; /* uninitialized */ + wait_event(ptlrpc_hr.hr_waitq, + atomic_read(&hrp->hrp_nstopped) == + atomic_read(&hrp->hrp_nstarted)); + } +} + +static int ptlrpc_start_hr_threads(void) +{ + struct ptlrpc_hr_partition *hrp; + int i; + int j; + + ENTRY; + + cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { + int rc = 0; + + for (j = 0; j < hrp->hrp_nthrs; j++) { + struct ptlrpc_hr_thread *hrt = &hrp->hrp_thrs[j]; + struct task_struct *task; + + task = kthread_run(ptlrpc_hr_main, + &hrp->hrp_thrs[j], + "ptlrpc_hr%02d_%03d", + hrp->hrp_cpt, + hrt->hrt_id); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + break; + } + } + + wait_event(ptlrpc_hr.hr_waitq, + atomic_read(&hrp->hrp_nstarted) == j); + + if (rc < 0) { + CERROR("cannot start reply handler thread %d:%d: rc = %d\n", + i, j, rc); + ptlrpc_stop_hr_threads(); + RETURN(rc); + } + } + + RETURN(0); +} + +static void ptlrpc_svcpt_stop_threads(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_thread *thread; + LIST_HEAD(zombie); + + ENTRY; + + CDEBUG(D_INFO, "Stopping threads for service %s\n", + svcpt->scp_service->srv_name); + + spin_lock(&svcpt->scp_lock); + /* let the thread know that we would like it to stop asap */ + list_for_each_entry(thread, &svcpt->scp_threads, t_link) + ptlrpc_stop_thread(thread); + + wake_up_all(&svcpt->scp_waitq); + + while ((thread = list_first_entry_or_null(&svcpt->scp_threads, + struct ptlrpc_thread, + t_link)) != NULL) { + if (thread_is_stopped(thread)) { + list_move(&thread->t_link, &zombie); + continue; + } + spin_unlock(&svcpt->scp_lock); + + CDEBUG(D_INFO, "waiting for 
stopping-thread %s #%u\n", + svcpt->scp_service->srv_thread_name, thread->t_id); + wait_event_idle(thread->t_ctl_waitq, + thread_is_stopped(thread)); + + spin_lock(&svcpt->scp_lock); + } + + spin_unlock(&svcpt->scp_lock); + + while ((thread = list_first_entry_or_null(&zombie, + struct ptlrpc_thread, + t_link)) != NULL) { + list_del(&thread->t_link); + OBD_FREE_PTR(thread); + } + EXIT; +} + +/** + * Stops all threads of a particular service \a svc + */ +static void ptlrpc_stop_all_threads(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + int i; + + ENTRY; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service != NULL) + ptlrpc_svcpt_stop_threads(svcpt); + } + + EXIT; +} + +static int ptlrpc_start_threads(struct ptlrpc_service *svc) +{ + int rc = 0; + int i; + int j; + + ENTRY; + + /* We require 2 threads min, see note in ptlrpc_server_handle_request */ + LASSERT(svc->srv_nthrs_cpt_init >= PTLRPC_NTHRS_INIT); + + for (i = 0; i < svc->srv_ncpts; i++) { + for (j = 0; j < svc->srv_nthrs_cpt_init; j++) { + rc = ptlrpc_start_thread(svc->srv_parts[i], 1); + if (rc == 0) + continue; + + if (rc != -EMFILE) + goto failed; + /* We have enough threads, don't start more. b=15759 */ + break; + } + } + + RETURN(0); + failed: + CERROR("cannot start %s thread #%d_%d: rc %d\n", + svc->srv_thread_name, i, j, rc); + ptlrpc_stop_all_threads(svc); + RETURN(rc); +} + +static int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait) +{ + struct ptlrpc_thread *thread; + struct ptlrpc_service *svc; + struct task_struct *task; + int rc; + + ENTRY; + + LASSERT(svcpt != NULL); + + svc = svcpt->scp_service; + + CDEBUG(D_RPCTRACE, "%s[%d] started %d min %d max %d\n", + svc->srv_name, svcpt->scp_cpt, svcpt->scp_nthrs_running, + svc->srv_nthrs_cpt_init, svc->srv_nthrs_cpt_limit); + + again: + if (unlikely(svc->srv_is_stopping)) + RETURN(-ESRCH); + + if (!ptlrpc_threads_increasable(svcpt) || + (OBD_FAIL_CHECK(OBD_FAIL_TGT_TOOMANY_THREADS) && + svcpt->scp_nthrs_running == svc->srv_nthrs_cpt_init - 1)) + RETURN(-EMFILE); + + OBD_CPT_ALLOC_PTR(thread, svc->srv_cptable, svcpt->scp_cpt); + if (thread == NULL) + RETURN(-ENOMEM); + init_waitqueue_head(&thread->t_ctl_waitq); + + spin_lock(&svcpt->scp_lock); + if (!ptlrpc_threads_increasable(svcpt)) { + spin_unlock(&svcpt->scp_lock); + OBD_FREE_PTR(thread); + RETURN(-EMFILE); + } + + if (svcpt->scp_nthrs_starting != 0) { + /* + * serialize starting because some modules (obdfilter) + * might require unique and contiguous t_id + */ + LASSERT(svcpt->scp_nthrs_starting == 1); + spin_unlock(&svcpt->scp_lock); + OBD_FREE_PTR(thread); + if (wait) { + CDEBUG(D_INFO, "Waiting for creating thread %s #%d\n", + svc->srv_thread_name, svcpt->scp_thr_nextid); + schedule(); + goto again; + } + + CDEBUG(D_INFO, "Creating thread %s #%d race, retry later\n", + svc->srv_thread_name, svcpt->scp_thr_nextid); + RETURN(-EAGAIN); + } + + svcpt->scp_nthrs_starting++; + thread->t_id = svcpt->scp_thr_nextid++; + thread_add_flags(thread, SVC_STARTING); + thread->t_svcpt = svcpt; + + list_add(&thread->t_link, &svcpt->scp_threads); + spin_unlock(&svcpt->scp_lock); + + if (svcpt->scp_cpt >= 0) { + snprintf(thread->t_name, PTLRPC_THR_NAME_LEN, "%s%02d_%03d", + svc->srv_thread_name, svcpt->scp_cpt, thread->t_id); + } else { + snprintf(thread->t_name, PTLRPC_THR_NAME_LEN, "%s_%04d", + svc->srv_thread_name, thread->t_id); + } + + CDEBUG(D_RPCTRACE, "starting thread '%s'\n", thread->t_name); + task = kthread_run(ptlrpc_main, thread, "%s", thread->t_name); + if 
(IS_ERR(task)) {
+		rc = PTR_ERR(task);
+		CERROR("cannot start thread '%s': rc = %d\n",
+		       thread->t_name, rc);
+		spin_lock(&svcpt->scp_lock);
+		--svcpt->scp_nthrs_starting;
+		if (thread_is_stopping(thread)) {
+			/*
+			 * this ptlrpc_thread is being handled
+			 * by ptlrpc_svcpt_stop_threads now
+			 */
+			thread_add_flags(thread, SVC_STOPPED);
+			wake_up(&thread->t_ctl_waitq);
+			spin_unlock(&svcpt->scp_lock);
+		} else {
+			list_del(&thread->t_link);
+			spin_unlock(&svcpt->scp_lock);
+			OBD_FREE_PTR(thread);
+		}
+		RETURN(rc);
+	}
+
+	if (!wait)
+		RETURN(0);
+
+	wait_event_idle(thread->t_ctl_waitq,
+			thread_is_running(thread) || thread_is_stopped(thread));
+
+	rc = thread_is_stopped(thread) ? thread->t_id : 0;
+	RETURN(rc);
+}
+
+int ptlrpc_hr_init(void)
+{
+	struct ptlrpc_hr_partition *hrp;
+	struct ptlrpc_hr_thread *hrt;
+	int rc;
+	int cpt;
+	int i;
+	int weight;
+
+	ENTRY;
+
+	memset(&ptlrpc_hr, 0, sizeof(ptlrpc_hr));
+	ptlrpc_hr.hr_cpt_table = cfs_cpt_tab;
+
+	ptlrpc_hr.hr_partitions = cfs_percpt_alloc(ptlrpc_hr.hr_cpt_table,
+						   sizeof(*hrp));
+	if (ptlrpc_hr.hr_partitions == NULL)
+		RETURN(-ENOMEM);
+
+	ratelimit_state_init(&watchdog_limit,
+			     cfs_time_seconds(libcfs_watchdog_ratelimit), 3);
+
+	init_waitqueue_head(&ptlrpc_hr.hr_waitq);
+
+	preempt_disable();
+	weight = cpumask_weight(topology_sibling_cpumask(smp_processor_id()));
+	preempt_enable();
+
+	cfs_percpt_for_each(hrp, cpt, ptlrpc_hr.hr_partitions) {
+		hrp->hrp_cpt = cpt;
+
+		atomic_set(&hrp->hrp_nstarted, 0);
+		atomic_set(&hrp->hrp_nstopped, 0);
+
+		hrp->hrp_nthrs = cfs_cpt_weight(ptlrpc_hr.hr_cpt_table, cpt);
+		hrp->hrp_nthrs /= weight;
+		if (hrp->hrp_nthrs == 0)
+			hrp->hrp_nthrs = 1;
+
+		OBD_CPT_ALLOC(hrp->hrp_thrs, ptlrpc_hr.hr_cpt_table, cpt,
+			      hrp->hrp_nthrs * sizeof(*hrt));
+		if (hrp->hrp_thrs == NULL)
+			GOTO(out, rc = -ENOMEM);
+
+		for (i = 0; i < hrp->hrp_nthrs; i++) {
+			hrt = &hrp->hrp_thrs[i];
+
+			hrt->hrt_id = i;
+			hrt->hrt_partition = hrp;
+			init_waitqueue_head(&hrt->hrt_waitq);
+			spin_lock_init(&hrt->hrt_lock);
+			INIT_LIST_HEAD(&hrt->hrt_queue);
+		}
+	}
+
+	rc = ptlrpc_start_hr_threads();
+out:
+	if (rc != 0)
+		ptlrpc_hr_fini();
+	RETURN(rc);
+}
+
+void ptlrpc_hr_fini(void)
+{
+	struct ptlrpc_hr_partition *hrp;
+	int cpt;
+
+	if (ptlrpc_hr.hr_partitions == NULL)
+		return;
+
+	ptlrpc_stop_hr_threads();
+
+	cfs_percpt_for_each(hrp, cpt, ptlrpc_hr.hr_partitions) {
+		if (hrp->hrp_thrs)
+			OBD_FREE_PTR_ARRAY(hrp->hrp_thrs, hrp->hrp_nthrs);
+	}
+
+	cfs_percpt_free(ptlrpc_hr.hr_partitions);
+	ptlrpc_hr.hr_partitions = NULL;
+}
+
+
+/**
+ * Wait until all already scheduled replies are processed.
+ */
+static void ptlrpc_wait_replies(struct ptlrpc_service_part *svcpt)
+{
+	while (1) {
+		if (wait_event_idle_timeout(
+			svcpt->scp_waitq,
+			atomic_read(&svcpt->scp_nreps_difficult) == 0,
+			cfs_time_seconds(10)) > 0)
+			break;
+		CWARN("Unexpectedly long timeout %s %p\n",
+		      svcpt->scp_service->srv_name, svcpt->scp_service);
+	}
+}
+
+static void
+ptlrpc_service_del_atimer(struct ptlrpc_service *svc)
+{
+	struct ptlrpc_service_part *svcpt;
+	int i;
+
+	/* early disarm AT timer...
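+	 * (the adaptive-timeout timer), so no further early replies are
+	 * scheduled while the service is being torn down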
+	 */
+	ptlrpc_service_for_each_part(svcpt, i, svc) {
+		if (svcpt->scp_service != NULL)
+			del_timer(&svcpt->scp_at_timer);
+	}
+}
+
+static void
+ptlrpc_service_unlink_rqbd(struct ptlrpc_service *svc)
+{
+	struct ptlrpc_service_part *svcpt;
+	struct ptlrpc_request_buffer_desc *rqbd;
+	int rc;
+	int i;
+
+	/*
+	 * All history will be culled when the next request buffer is
+	 * freed in ptlrpc_service_purge_all()
+	 */
+	svc->srv_hist_nrqbds_cpt_max = 0;
+
+	rc = LNetClearLazyPortal(svc->srv_req_portal);
+	LASSERT(rc == 0);
+
+	ptlrpc_service_for_each_part(svcpt, i, svc) {
+		if (svcpt->scp_service == NULL)
+			break;
+
+		/*
+		 * Unlink all the request buffers. This forces a 'final'
+		 * event with its 'unlink' flag set for each posted rqbd
+		 */
+		list_for_each_entry(rqbd, &svcpt->scp_rqbd_posted,
+				    rqbd_list) {
+			rc = LNetMDUnlink(rqbd->rqbd_md_h);
+			LASSERT(rc == 0 || rc == -ENOENT);
+		}
+	}
+
+	ptlrpc_service_for_each_part(svcpt, i, svc) {
+		if (svcpt->scp_service == NULL)
+			break;
+
+		/*
+		 * Wait for the network to release any buffers
+		 * it's currently filling
+		 */
+		spin_lock(&svcpt->scp_lock);
+		while (svcpt->scp_nrqbds_posted != 0) {
+			int seconds = PTLRPC_REQ_LONG_UNLINK;
+
+			spin_unlock(&svcpt->scp_lock);
+			/*
+			 * Network access will complete in finite time but
+			 * the HUGE timeout lets us CWARN for visibility
+			 * of sluggish NALs
+			 */
+			while (seconds > 0 &&
+			       wait_event_idle_timeout(
+				       svcpt->scp_waitq,
+				       svcpt->scp_nrqbds_posted == 0,
+				       cfs_time_seconds(1)) == 0)
+				seconds -= 1;
+			if (seconds == 0) {
+				CWARN("Service %s waiting for request buffers\n",
+				      svcpt->scp_service->srv_name);
+			}
+			spin_lock(&svcpt->scp_lock);
+		}
+		spin_unlock(&svcpt->scp_lock);
+	}
+}
+
+static void
+ptlrpc_service_purge_all(struct ptlrpc_service *svc)
+{
+	struct ptlrpc_service_part *svcpt;
+	struct ptlrpc_request_buffer_desc *rqbd;
+	struct ptlrpc_request *req;
+	struct ptlrpc_reply_state *rs;
+	int i;
+
+	ptlrpc_service_for_each_part(svcpt, i, svc) {
+		if (svcpt->scp_service == NULL)
+			break;
+
+		spin_lock(&svcpt->scp_rep_lock);
+		while ((rs = list_first_entry_or_null(&svcpt->scp_rep_active,
+						      struct ptlrpc_reply_state,
+						      rs_list)) != NULL) {
+			spin_lock(&rs->rs_lock);
+			ptlrpc_schedule_difficult_reply(rs);
+			spin_unlock(&rs->rs_lock);
+		}
+		spin_unlock(&svcpt->scp_rep_lock);
+
+		/*
+		 * purge the request queue. NB No new replies (rqbds
+		 * all unlinked) and no service threads, so I'm the only
+		 * thread noodling the request queue now
+		 */
+		while ((req = list_first_entry_or_null(&svcpt->scp_req_incoming,
+						       struct ptlrpc_request,
+						       rq_list)) != NULL) {
+			list_del(&req->rq_list);
+			svcpt->scp_nreqs_incoming--;
+			ptlrpc_server_finish_request(svcpt, req);
+		}
+
+		while (ptlrpc_server_request_pending(svcpt, true)) {
+			req = ptlrpc_server_request_get(svcpt, true);
+			ptlrpc_server_finish_active_request(svcpt, req);
+		}
+
+		/*
+		 * The portal may be shared by several services (e.g.
+		 * OUT_PORTAL), so a request could still be referenced by
+		 * another target. We therefore have to wait until
+		 * ptlrpc_server_drop_request() has been invoked.
+		 *
+		 * TODO: move the req_buffer as global rather than per service.
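+		 * Until then, the loop below polls scp_rqbd_posted until
+		 * the last request buffer reference has been dropped.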
+ */ + spin_lock(&svcpt->scp_lock); + while (!list_empty(&svcpt->scp_rqbd_posted)) { + spin_unlock(&svcpt->scp_lock); + wait_event_idle_timeout(svcpt->scp_waitq, + list_empty(&svcpt->scp_rqbd_posted), + cfs_time_seconds(1)); + spin_lock(&svcpt->scp_lock); + } + spin_unlock(&svcpt->scp_lock); + + LASSERT(svcpt->scp_nreqs_incoming == 0); + LASSERT(svcpt->scp_nreqs_active == 0); + /* + * history should have been culled by + * ptlrpc_server_finish_request + */ + LASSERT(svcpt->scp_hist_nrqbds == 0); + + /* + * Now free all the request buffers since nothing + * references them any more... + */ + while ((rqbd = list_first_entry_or_null(&svcpt->scp_rqbd_idle, + struct ptlrpc_request_buffer_desc, + rqbd_list)) != NULL) + ptlrpc_free_rqbd(rqbd); + + ptlrpc_wait_replies(svcpt); + + while ((rs = list_first_entry_or_null(&svcpt->scp_rep_idle, + struct ptlrpc_reply_state, + rs_list)) != NULL) { + list_del(&rs->rs_list); + OBD_FREE_LARGE(rs, svc->srv_max_reply_size); + } + } +} + +static void +ptlrpc_service_free(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + struct ptlrpc_at_array *array; + int i; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service == NULL) + break; + + /* In case somebody rearmed this in the meantime */ + del_timer(&svcpt->scp_at_timer); + array = &svcpt->scp_at_array; + + if (array->paa_reqs_array != NULL) { + OBD_FREE_PTR_ARRAY(array->paa_reqs_array, + array->paa_size); + array->paa_reqs_array = NULL; + } + + if (array->paa_reqs_count != NULL) { + OBD_FREE_PTR_ARRAY(array->paa_reqs_count, + array->paa_size); + array->paa_reqs_count = NULL; + } + } + + ptlrpc_service_for_each_part(svcpt, i, svc) + OBD_FREE_PTR(svcpt); + + if (svc->srv_cpts != NULL) + cfs_expr_list_values_free(svc->srv_cpts, svc->srv_ncpts); + + OBD_FREE(svc, offsetof(struct ptlrpc_service, + srv_parts[svc->srv_ncpts])); +} + +int ptlrpc_unregister_service(struct ptlrpc_service *service) +{ + ENTRY; + + CDEBUG(D_NET, "%s: tearing down\n", service->srv_name); + + service->srv_is_stopping = 1; + + mutex_lock(&ptlrpc_all_services_mutex); + list_del_init(&service->srv_list); + mutex_unlock(&ptlrpc_all_services_mutex); + + ptlrpc_service_del_atimer(service); + ptlrpc_stop_all_threads(service); + + ptlrpc_service_unlink_rqbd(service); + ptlrpc_service_purge_all(service); + ptlrpc_service_nrs_cleanup(service); + + ptlrpc_lprocfs_unregister_service(service); + ptlrpc_sysfs_unregister_service(service); + + ptlrpc_service_free(service); + + RETURN(0); +} +EXPORT_SYMBOL(ptlrpc_unregister_service); + +/** + * Returns 0 if the service is healthy. + * + * Right now, it just checks to make sure that requests aren't languishing + * in the queue. We'll use this health check to govern whether a node needs + * to be shot, so it's intentionally non-aggressive. + */ +static int ptlrpc_svcpt_health_check(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_request *request = NULL; + struct timespec64 right_now; + struct timespec64 timediff; + + ktime_get_real_ts64(&right_now); + + spin_lock(&svcpt->scp_req_lock); + /* How long has the next entry been waiting? */ + if (ptlrpc_server_high_pending(svcpt, true)) + request = ptlrpc_nrs_req_peek_nolock(svcpt, true); + else if (ptlrpc_server_normal_pending(svcpt, true)) + request = ptlrpc_nrs_req_peek_nolock(svcpt, false); + + if (request == NULL) { + spin_unlock(&svcpt->scp_req_lock); + return 0; + } + + timediff = timespec64_sub(right_now, request->rq_arrival_time); + spin_unlock(&svcpt->scp_req_lock); + + if ((timediff.tv_sec) > + (AT_OFF ? 
obd_timeout * 3 / 2 : at_max)) { + CERROR("%s: unhealthy - request has been waiting %llds\n", + svcpt->scp_service->srv_name, (s64)timediff.tv_sec); + return -1; + } + + return 0; +} + +int +ptlrpc_service_health_check(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + int i; + + if (svc == NULL) + return 0; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + int rc = ptlrpc_svcpt_health_check(svcpt); + + if (rc != 0) + return rc; + } + return 0; +} +EXPORT_SYMBOL(ptlrpc_service_health_check); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/wirehdr.c b/drivers/staging/lustrefsx/lustre/ptlrpc/wirehdr.c new file mode 100644 index 0000000000000..c196eb280f4d5 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/wirehdr.c @@ -0,0 +1,46 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#ifdef CONFIG_LUSTRE_FS_POSIX_ACL +# include +# include +#endif /* CONFIG_LUSTRE_FS_POSIX_ACL */ + +#include +#include +#include +#include +#include +#include +#include + diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/wiretest.c b/drivers/staging/lustrefsx/lustre/ptlrpc/wiretest.c new file mode 100644 index 0000000000000..71bf668c1295b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/wiretest.c @@ -0,0 +1,6095 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#ifdef CONFIG_LUSTRE_FS_POSIX_ACL +# include +# include +#endif /* CONFIG_LUSTRE_FS_POSIX_ACL */ + +#include +#include +#include +#include +#include +#include +#include + + +void lustre_assert_wire_constants(void) +{ + /* Wire protocol assertions generated by 'wirecheck' + * (make -C lustre/utils newwiretest) + */ + + /* Constants... */ + LASSERTF(PTL_RPC_MSG_REQUEST == 4711, "found %lld\n", + (long long)PTL_RPC_MSG_REQUEST); + LASSERTF(PTL_RPC_MSG_ERR == 4712, "found %lld\n", + (long long)PTL_RPC_MSG_ERR); + LASSERTF(PTL_RPC_MSG_REPLY == 4713, "found %lld\n", + (long long)PTL_RPC_MSG_REPLY); + LASSERTF(MDS_DIR_END_OFF == 0xfffffffffffffffeULL, "found 0x%.16llxULL\n", + MDS_DIR_END_OFF); + LASSERTF(DEAD_HANDLE_MAGIC == 0xdeadbeefcafebabeULL, "found 0x%.16llxULL\n", + DEAD_HANDLE_MAGIC); + BUILD_BUG_ON(MTI_NAME_MAXLEN != 64); + LASSERTF(OST_REPLY == 0, "found %lld\n", + (long long)OST_REPLY); + LASSERTF(OST_GETATTR == 1, "found %lld\n", + (long long)OST_GETATTR); + LASSERTF(OST_SETATTR == 2, "found %lld\n", + (long long)OST_SETATTR); + LASSERTF(OST_READ == 3, "found %lld\n", + (long long)OST_READ); + LASSERTF(OST_WRITE == 4, "found %lld\n", + (long long)OST_WRITE); + LASSERTF(OST_CREATE == 5, "found %lld\n", + (long long)OST_CREATE); + LASSERTF(OST_DESTROY == 6, "found %lld\n", + (long long)OST_DESTROY); + LASSERTF(OST_GET_INFO == 7, "found %lld\n", + (long long)OST_GET_INFO); + LASSERTF(OST_CONNECT == 8, "found %lld\n", + (long long)OST_CONNECT); + LASSERTF(OST_DISCONNECT == 9, "found %lld\n", + (long long)OST_DISCONNECT); + LASSERTF(OST_PUNCH == 10, "found %lld\n", + (long long)OST_PUNCH); + LASSERTF(OST_OPEN == 11, "found %lld\n", + (long long)OST_OPEN); + LASSERTF(OST_CLOSE == 12, "found %lld\n", + (long long)OST_CLOSE); + LASSERTF(OST_STATFS == 13, "found %lld\n", + (long long)OST_STATFS); + LASSERTF(OST_SYNC == 16, "found %lld\n", + (long long)OST_SYNC); + LASSERTF(OST_SET_INFO == 17, "found %lld\n", + (long long)OST_SET_INFO); + LASSERTF(OST_QUOTACHECK == 18, "found %lld\n", + (long long)OST_QUOTACHECK); + LASSERTF(OST_QUOTACTL == 19, "found %lld\n", + (long long)OST_QUOTACTL); + LASSERTF(OST_QUOTA_ADJUST_QUNIT == 20, "found %lld\n", + (long long)OST_QUOTA_ADJUST_QUNIT); + LASSERTF(OST_LADVISE == 21, "found %lld\n", + (long long)OST_LADVISE); + LASSERTF(OST_FALLOCATE == 22, "found %lld\n", + (long long)OST_FALLOCATE); + LASSERTF(OST_SEEK == 23, "found %lld\n", + (long long)OST_SEEK); + LASSERTF(OST_LAST_OPC == 24, "found %lld\n", + (long long)OST_LAST_OPC); + LASSERTF(OBD_OBJECT_EOF == 0xffffffffffffffffULL, "found 0x%.16llxULL\n", + OBD_OBJECT_EOF); + LASSERTF(OST_MIN_PRECREATE == 32, "found %lld\n", + (long long)OST_MIN_PRECREATE); + LASSERTF(OST_MAX_PRECREATE == 20000, "found %lld\n", + (long long)OST_MAX_PRECREATE); + LASSERTF(OST_LVB_ERR_INIT == 0xffbadbad80000000ULL, "found 0x%.16llxULL\n", + OST_LVB_ERR_INIT); + LASSERTF(OST_LVB_ERR_MASK == 0xffbadbad00000000ULL, "found 0x%.16llxULL\n", + OST_LVB_ERR_MASK); + LASSERTF(MDS_FIRST_OPC == 33, "found %lld\n", + (long long)MDS_FIRST_OPC); + LASSERTF(MDS_GETATTR == 33, "found %lld\n", + (long long)MDS_GETATTR); + LASSERTF(MDS_GETATTR_NAME == 34, "found %lld\n", + (long long)MDS_GETATTR_NAME); + LASSERTF(MDS_CLOSE == 35, "found %lld\n", + (long long)MDS_CLOSE); + LASSERTF(MDS_REINT == 36, "found %lld\n", + (long long)MDS_REINT); + LASSERTF(MDS_READPAGE == 37, "found %lld\n", + (long long)MDS_READPAGE); + LASSERTF(MDS_CONNECT == 
38, "found %lld\n", + (long long)MDS_CONNECT); + LASSERTF(MDS_DISCONNECT == 39, "found %lld\n", + (long long)MDS_DISCONNECT); + LASSERTF(MDS_GET_ROOT == 40, "found %lld\n", + (long long)MDS_GET_ROOT); + LASSERTF(MDS_STATFS == 41, "found %lld\n", + (long long)MDS_STATFS); + LASSERTF(MDS_PIN == 42, "found %lld\n", + (long long)MDS_PIN); + LASSERTF(MDS_UNPIN == 43, "found %lld\n", + (long long)MDS_UNPIN); + LASSERTF(MDS_SYNC == 44, "found %lld\n", + (long long)MDS_SYNC); + LASSERTF(MDS_DONE_WRITING == 45, "found %lld\n", + (long long)MDS_DONE_WRITING); + LASSERTF(MDS_SET_INFO == 46, "found %lld\n", + (long long)MDS_SET_INFO); + LASSERTF(MDS_QUOTACHECK == 47, "found %lld\n", + (long long)MDS_QUOTACHECK); + LASSERTF(MDS_QUOTACTL == 48, "found %lld\n", + (long long)MDS_QUOTACTL); + LASSERTF(MDS_GETXATTR == 49, "found %lld\n", + (long long)MDS_GETXATTR); + LASSERTF(MDS_SETXATTR == 50, "found %lld\n", + (long long)MDS_SETXATTR); + LASSERTF(MDS_WRITEPAGE == 51, "found %lld\n", + (long long)MDS_WRITEPAGE); + LASSERTF(MDS_IS_SUBDIR == 52, "found %lld\n", + (long long)MDS_IS_SUBDIR); + LASSERTF(MDS_GET_INFO == 53, "found %lld\n", + (long long)MDS_GET_INFO); + LASSERTF(MDS_HSM_STATE_GET == 54, "found %lld\n", + (long long)MDS_HSM_STATE_GET); + LASSERTF(MDS_HSM_STATE_SET == 55, "found %lld\n", + (long long)MDS_HSM_STATE_SET); + LASSERTF(MDS_HSM_ACTION == 56, "found %lld\n", + (long long)MDS_HSM_ACTION); + LASSERTF(MDS_HSM_PROGRESS == 57, "found %lld\n", + (long long)MDS_HSM_PROGRESS); + LASSERTF(MDS_HSM_REQUEST == 58, "found %lld\n", + (long long)MDS_HSM_REQUEST); + LASSERTF(MDS_HSM_CT_REGISTER == 59, "found %lld\n", + (long long)MDS_HSM_CT_REGISTER); + LASSERTF(MDS_HSM_CT_UNREGISTER == 60, "found %lld\n", + (long long)MDS_HSM_CT_UNREGISTER); + LASSERTF(MDS_SWAP_LAYOUTS == 61, "found %lld\n", + (long long)MDS_SWAP_LAYOUTS); + LASSERTF(MDS_RMFID == 62, "found %lld\n", + (long long)MDS_RMFID); + LASSERTF(MDS_LAST_OPC == 63, "found %lld\n", + (long long)MDS_LAST_OPC); + LASSERTF(REINT_SETATTR == 1, "found %lld\n", + (long long)REINT_SETATTR); + LASSERTF(REINT_CREATE == 2, "found %lld\n", + (long long)REINT_CREATE); + LASSERTF(REINT_LINK == 3, "found %lld\n", + (long long)REINT_LINK); + LASSERTF(REINT_UNLINK == 4, "found %lld\n", + (long long)REINT_UNLINK); + LASSERTF(REINT_RENAME == 5, "found %lld\n", + (long long)REINT_RENAME); + LASSERTF(REINT_OPEN == 6, "found %lld\n", + (long long)REINT_OPEN); + LASSERTF(REINT_SETXATTR == 7, "found %lld\n", + (long long)REINT_SETXATTR); + LASSERTF(REINT_RMENTRY == 8, "found %lld\n", + (long long)REINT_RMENTRY); + LASSERTF(REINT_MIGRATE == 9, "found %lld\n", + (long long)REINT_MIGRATE); + LASSERTF(REINT_MAX == 11, "found %lld\n", + (long long)REINT_MAX); + LASSERTF(DISP_IT_EXECD == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)DISP_IT_EXECD); + LASSERTF(DISP_LOOKUP_EXECD == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)DISP_LOOKUP_EXECD); + LASSERTF(DISP_LOOKUP_NEG == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)DISP_LOOKUP_NEG); + LASSERTF(DISP_LOOKUP_POS == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)DISP_LOOKUP_POS); + LASSERTF(DISP_OPEN_CREATE == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)DISP_OPEN_CREATE); + LASSERTF(DISP_OPEN_OPEN == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)DISP_OPEN_OPEN); + LASSERTF(DISP_ENQ_COMPLETE == 0x00400000UL, "found 0x%.8xUL\n", + (unsigned)DISP_ENQ_COMPLETE); + LASSERTF(DISP_ENQ_OPEN_REF == 0x00800000UL, "found 0x%.8xUL\n", + (unsigned)DISP_ENQ_OPEN_REF); + LASSERTF(DISP_ENQ_CREATE_REF == 0x01000000UL, "found 0x%.8xUL\n", + 
(unsigned)DISP_ENQ_CREATE_REF); + LASSERTF(DISP_OPEN_LOCK == 0x02000000UL, "found 0x%.8xUL\n", + (unsigned)DISP_OPEN_LOCK); + LASSERTF(MDS_STATUS_CONN == 1, "found %lld\n", + (long long)MDS_STATUS_CONN); + LASSERTF(MDS_STATUS_LOV == 2, "found %lld\n", + (long long)MDS_STATUS_LOV); + LASSERTF(MDS_ATTR_MODE == 0x0000000000000001ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_MODE); + LASSERTF(MDS_ATTR_UID == 0x0000000000000002ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_UID); + LASSERTF(MDS_ATTR_GID == 0x0000000000000004ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_GID); + LASSERTF(MDS_ATTR_SIZE == 0x0000000000000008ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_SIZE); + LASSERTF(MDS_ATTR_ATIME == 0x0000000000000010ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_ATIME); + LASSERTF(MDS_ATTR_MTIME == 0x0000000000000020ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_MTIME); + LASSERTF(MDS_ATTR_CTIME == 0x0000000000000040ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_CTIME); + LASSERTF(MDS_ATTR_ATIME_SET == 0x0000000000000080ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_ATIME_SET); + LASSERTF(MDS_ATTR_MTIME_SET == 0x0000000000000100ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_MTIME_SET); + LASSERTF(MDS_ATTR_FORCE == 0x0000000000000200ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_FORCE); + LASSERTF(MDS_ATTR_ATTR_FLAG == 0x0000000000000400ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_ATTR_FLAG); + LASSERTF(MDS_ATTR_KILL_SUID == 0x0000000000000800ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_KILL_SUID); + LASSERTF(MDS_ATTR_KILL_SGID == 0x0000000000001000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_KILL_SGID); + LASSERTF(MDS_ATTR_CTIME_SET == 0x0000000000002000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_CTIME_SET); + LASSERTF(MDS_ATTR_FROM_OPEN == 0x0000000000004000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_FROM_OPEN); + LASSERTF(MDS_ATTR_BLOCKS == 0x0000000000008000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_BLOCKS); + LASSERTF(MDS_ATTR_PROJID == 0x0000000000010000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_PROJID); + LASSERTF(MDS_ATTR_LSIZE == 0x0000000000020000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_LSIZE); + LASSERTF(MDS_ATTR_LBLOCKS == 0x0000000000040000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_LBLOCKS); + LASSERTF(MDS_ATTR_OVERRIDE == 0x0000000002000000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_OVERRIDE); + LASSERTF(FLD_QUERY == 900, "found %lld\n", + (long long)FLD_QUERY); + LASSERTF(FLD_READ == 901, "found %lld\n", + (long long)FLD_READ); + LASSERTF(FLD_FIRST_OPC == 900, "found %lld\n", + (long long)FLD_FIRST_OPC); + LASSERTF(FLD_LAST_OPC == 902, "found %lld\n", + (long long)FLD_LAST_OPC); + LASSERTF(SEQ_QUERY == 700, "found %lld\n", + (long long)SEQ_QUERY); + LASSERTF(SEQ_FIRST_OPC == 700, "found %lld\n", + (long long)SEQ_FIRST_OPC); + LASSERTF(SEQ_LAST_OPC == 701, "found %lld\n", + (long long)SEQ_LAST_OPC); +#ifdef HAVE_SERVER_SUPPORT + LASSERTF(LFSCK_NOTIFY == 1101, "found %lld\n", + (long long)LFSCK_NOTIFY); + LASSERTF(LFSCK_QUERY == 1102, "found %lld\n", + (long long)LFSCK_QUERY); + LASSERTF(LFSCK_FIRST_OPC == 1101, "found %lld\n", + (long long)LFSCK_FIRST_OPC); + LASSERTF(LFSCK_LAST_OPC == 1103, "found %lld\n", + (long long)LFSCK_LAST_OPC); +#endif /* HAVE_SERVER_SUPPORT */ + LASSERTF(SEQ_ALLOC_SUPER == 0, "found %lld\n", + (long long)SEQ_ALLOC_SUPER); + LASSERTF(SEQ_ALLOC_META == 1, "found %lld\n", + (long long)SEQ_ALLOC_META); + LASSERTF(LDLM_ENQUEUE == 101, "found 
%lld\n", + (long long)LDLM_ENQUEUE); + LASSERTF(LDLM_CONVERT == 102, "found %lld\n", + (long long)LDLM_CONVERT); + LASSERTF(LDLM_CANCEL == 103, "found %lld\n", + (long long)LDLM_CANCEL); + LASSERTF(LDLM_BL_CALLBACK == 104, "found %lld\n", + (long long)LDLM_BL_CALLBACK); + LASSERTF(LDLM_CP_CALLBACK == 105, "found %lld\n", + (long long)LDLM_CP_CALLBACK); + LASSERTF(LDLM_GL_CALLBACK == 106, "found %lld\n", + (long long)LDLM_GL_CALLBACK); + LASSERTF(LDLM_SET_INFO == 107, "found %lld\n", + (long long)LDLM_SET_INFO); + LASSERTF(LDLM_LAST_OPC == 108, "found %lld\n", + (long long)LDLM_LAST_OPC); + LASSERTF(LCK_MINMODE == 0, "found %lld\n", + (long long)LCK_MINMODE); + LASSERTF(LCK_EX == 1, "found %lld\n", + (long long)LCK_EX); + LASSERTF(LCK_PW == 2, "found %lld\n", + (long long)LCK_PW); + LASSERTF(LCK_PR == 4, "found %lld\n", + (long long)LCK_PR); + LASSERTF(LCK_CW == 8, "found %lld\n", + (long long)LCK_CW); + LASSERTF(LCK_CR == 16, "found %lld\n", + (long long)LCK_CR); + LASSERTF(LCK_NL == 32, "found %lld\n", + (long long)LCK_NL); + LASSERTF(LCK_GROUP == 64, "found %lld\n", + (long long)LCK_GROUP); + LASSERTF(LCK_COS == 128, "found %lld\n", + (long long)LCK_COS); + LASSERTF(LCK_MAXMODE == 129, "found %lld\n", + (long long)LCK_MAXMODE); + LASSERTF(LCK_MODE_NUM == 8, "found %lld\n", + (long long)LCK_MODE_NUM); + BUILD_BUG_ON(LDLM_PLAIN != 10); + BUILD_BUG_ON(LDLM_EXTENT != 11); + BUILD_BUG_ON(LDLM_FLOCK != 12); + BUILD_BUG_ON(LDLM_IBITS != 13); + BUILD_BUG_ON(LDLM_MAX_TYPE != 14); + BUILD_BUG_ON(LUSTRE_RES_ID_SEQ_OFF != 0); + BUILD_BUG_ON(LUSTRE_RES_ID_VER_OID_OFF != 1); + BUILD_BUG_ON(LUSTRE_RES_ID_QUOTA_SEQ_OFF != 2); + BUILD_BUG_ON(LUSTRE_RES_ID_QUOTA_VER_OID_OFF != 3); + BUILD_BUG_ON(LUSTRE_RES_ID_HSH_OFF != 3); +#ifdef HAVE_SERVER_SUPPORT + LASSERTF(OUT_UPDATE == 1000, "found %lld\n", + (long long)OUT_UPDATE); + LASSERTF(OUT_UPDATE_LAST_OPC == 1001, "found %lld\n", + (long long)OUT_UPDATE_LAST_OPC); + BUILD_BUG_ON(LQUOTA_TYPE_USR != 0); + BUILD_BUG_ON(LQUOTA_TYPE_GRP != 1); + BUILD_BUG_ON(LQUOTA_RES_MD != 1); + BUILD_BUG_ON(LQUOTA_RES_DT != 2); +#endif /* HAVE_SERVER_SUPPORT */ + LASSERTF(OBD_PING == 400, "found %lld\n", + (long long)OBD_PING); + LASSERTF(OBD_IDX_READ == 403, "found %lld\n", + (long long)OBD_IDX_READ); + LASSERTF(OBD_LAST_OPC == 404, "found %lld\n", + (long long)OBD_LAST_OPC); + LASSERTF(QUOTA_DQACQ == 601, "found %lld\n", + (long long)QUOTA_DQACQ); + LASSERTF(QUOTA_DQREL == 602, "found %lld\n", + (long long)QUOTA_DQREL); + LASSERTF(QUOTA_LAST_OPC == 603, "found %lld\n", + (long long)QUOTA_LAST_OPC); + LASSERTF(MGS_CONNECT == 250, "found %lld\n", + (long long)MGS_CONNECT); + LASSERTF(MGS_DISCONNECT == 251, "found %lld\n", + (long long)MGS_DISCONNECT); + LASSERTF(MGS_EXCEPTION == 252, "found %lld\n", + (long long)MGS_EXCEPTION); + LASSERTF(MGS_TARGET_REG == 253, "found %lld\n", + (long long)MGS_TARGET_REG); + LASSERTF(MGS_TARGET_DEL == 254, "found %lld\n", + (long long)MGS_TARGET_DEL); + LASSERTF(MGS_SET_INFO == 255, "found %lld\n", + (long long)MGS_SET_INFO); + LASSERTF(MGS_CONFIG_READ == 256, "found %lld\n", + (long long)MGS_CONFIG_READ); + LASSERTF(MGS_LAST_OPC == 257, "found %lld\n", + (long long)MGS_LAST_OPC); + LASSERTF(SEC_CTX_INIT == 801, "found %lld\n", + (long long)SEC_CTX_INIT); + LASSERTF(SEC_CTX_INIT_CONT == 802, "found %lld\n", + (long long)SEC_CTX_INIT_CONT); + LASSERTF(SEC_CTX_FINI == 803, "found %lld\n", + (long long)SEC_CTX_FINI); + LASSERTF(SEC_LAST_OPC == 804, "found %lld\n", + (long long)SEC_LAST_OPC); + /* Sizes and Offsets */ + + /* Checks for struct 
obd_uuid */ + LASSERTF((int)sizeof(struct obd_uuid) == 40, "found %lld\n", + (long long)(int)sizeof(struct obd_uuid)); + + /* Checks for struct lu_seq_range */ + LASSERTF((int)sizeof(struct lu_seq_range) == 24, "found %lld\n", + (long long)(int)sizeof(struct lu_seq_range)); + LASSERTF((int)offsetof(struct lu_seq_range, lsr_start) == 0, "found %lld\n", + (long long)(int)offsetof(struct lu_seq_range, lsr_start)); + LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_start)); + LASSERTF((int)offsetof(struct lu_seq_range, lsr_end) == 8, "found %lld\n", + (long long)(int)offsetof(struct lu_seq_range, lsr_end)); + LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_end)); + LASSERTF((int)offsetof(struct lu_seq_range, lsr_index) == 16, "found %lld\n", + (long long)(int)offsetof(struct lu_seq_range, lsr_index)); + LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_index)); + LASSERTF((int)offsetof(struct lu_seq_range, lsr_flags) == 20, "found %lld\n", + (long long)(int)offsetof(struct lu_seq_range, lsr_flags)); + LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_flags)); + LASSERTF(LU_SEQ_RANGE_MDT == 0, "found %lld\n", + (long long)LU_SEQ_RANGE_MDT); + LASSERTF(LU_SEQ_RANGE_OST == 1, "found %lld\n", + (long long)LU_SEQ_RANGE_OST); + + /* Checks for struct lustre_som_attrs */ + LASSERTF((int)sizeof(struct lustre_som_attrs) == 24, "found %lld\n", + (long long)(int)sizeof(struct lustre_som_attrs)); + LASSERTF((int)offsetof(struct lustre_som_attrs, lsa_valid) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_som_attrs, lsa_valid)); + LASSERTF((int)sizeof(((struct lustre_som_attrs *)0)->lsa_valid) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lustre_som_attrs *)0)->lsa_valid)); + LASSERTF((int)offsetof(struct lustre_som_attrs, lsa_reserved) == 2, "found %lld\n", + (long long)(int)offsetof(struct lustre_som_attrs, lsa_reserved)); + LASSERTF((int)sizeof(((struct lustre_som_attrs *)0)->lsa_reserved) == 6, "found %lld\n", + (long long)(int)sizeof(((struct lustre_som_attrs *)0)->lsa_reserved)); + LASSERTF((int)offsetof(struct lustre_som_attrs, lsa_size) == 8, "found %lld\n", + (long long)(int)offsetof(struct lustre_som_attrs, lsa_size)); + LASSERTF((int)sizeof(((struct lustre_som_attrs *)0)->lsa_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_som_attrs *)0)->lsa_size)); + LASSERTF((int)offsetof(struct lustre_som_attrs, lsa_blocks) == 16, "found %lld\n", + (long long)(int)offsetof(struct lustre_som_attrs, lsa_blocks)); + LASSERTF((int)sizeof(((struct lustre_som_attrs *)0)->lsa_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_som_attrs *)0)->lsa_blocks)); +#ifdef HAVE_SERVER_SUPPORT + + /* Checks for struct lustre_mdt_attrs */ + LASSERTF((int)sizeof(struct lustre_mdt_attrs) == 24, "found %lld\n", + (long long)(int)sizeof(struct lustre_mdt_attrs)); + LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_compat) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_mdt_attrs, lma_compat)); + LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_compat) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_compat)); + LASSERTF((int)offsetof(struct lustre_mdt_attrs, 
lma_incompat) == 4, "found %lld\n", + (long long)(int)offsetof(struct lustre_mdt_attrs, lma_incompat)); + LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_incompat) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_incompat)); + LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_self_fid) == 8, "found %lld\n", + (long long)(int)offsetof(struct lustre_mdt_attrs, lma_self_fid)); + LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_self_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_self_fid)); + LASSERTF(LMAC_HSM == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LMAC_HSM); + LASSERTF(LMAC_NOT_IN_OI == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)LMAC_NOT_IN_OI); + LASSERTF(LMAC_FID_ON_OST == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)LMAC_FID_ON_OST); + LASSERTF(LMAC_STRIPE_INFO == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)LMAC_STRIPE_INFO); + LASSERTF(LMAC_COMP_INFO == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)LMAC_COMP_INFO); + LASSERTF(LMAC_IDX_BACKUP == 0x00000040UL, "found 0x%.8xUL\n", + (unsigned)LMAC_IDX_BACKUP); + LASSERTF(LMAI_RELEASED == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LMAI_RELEASED); + LASSERTF(LMAI_AGENT == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)LMAI_AGENT); + LASSERTF(LMAI_REMOTE_PARENT == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)LMAI_REMOTE_PARENT); + LASSERTF(LMAI_STRIPED == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)LMAI_STRIPED); + LASSERTF(LMAI_ORPHAN == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)LMAI_ORPHAN); + LASSERTF(LMAI_ENCRYPT == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)LMAI_ENCRYPT); + + /* Checks for struct lustre_ost_attrs */ + LASSERTF((int)sizeof(struct lustre_ost_attrs) == 64, "found %lld\n", + (long long)(int)sizeof(struct lustre_ost_attrs)); + LASSERTF((int)offsetof(struct lustre_ost_attrs, loa_lma) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_ost_attrs, loa_lma)); + LASSERTF((int)sizeof(((struct lustre_ost_attrs *)0)->loa_lma) == 24, "found %lld\n", + (long long)(int)sizeof(((struct lustre_ost_attrs *)0)->loa_lma)); + LASSERTF((int)offsetof(struct lustre_ost_attrs, loa_parent_fid) == 24, "found %lld\n", + (long long)(int)offsetof(struct lustre_ost_attrs, loa_parent_fid)); + LASSERTF((int)sizeof(((struct lustre_ost_attrs *)0)->loa_parent_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lustre_ost_attrs *)0)->loa_parent_fid)); + LASSERTF((int)offsetof(struct lustre_ost_attrs, loa_stripe_size) == 40, "found %lld\n", + (long long)(int)offsetof(struct lustre_ost_attrs, loa_stripe_size)); + LASSERTF((int)sizeof(((struct lustre_ost_attrs *)0)->loa_stripe_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_ost_attrs *)0)->loa_stripe_size)); + LASSERTF((int)offsetof(struct lustre_ost_attrs, loa_comp_id) == 44, "found %lld\n", + (long long)(int)offsetof(struct lustre_ost_attrs, loa_comp_id)); + LASSERTF((int)sizeof(((struct lustre_ost_attrs *)0)->loa_comp_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_ost_attrs *)0)->loa_comp_id)); + LASSERTF((int)offsetof(struct lustre_ost_attrs, loa_comp_start) == 48, "found %lld\n", + (long long)(int)offsetof(struct lustre_ost_attrs, loa_comp_start)); + LASSERTF((int)sizeof(((struct lustre_ost_attrs *)0)->loa_comp_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_ost_attrs *)0)->loa_comp_start)); + LASSERTF((int)offsetof(struct lustre_ost_attrs, loa_comp_end) == 56, "found %lld\n", + (long 
long)(int)offsetof(struct lustre_ost_attrs, loa_comp_end)); + LASSERTF((int)sizeof(((struct lustre_ost_attrs *)0)->loa_comp_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_ost_attrs *)0)->loa_comp_end)); + LASSERTF(OUT_CREATE == 1, "found %lld\n", + (long long)OUT_CREATE); + LASSERTF(OUT_DESTROY == 2, "found %lld\n", + (long long)OUT_DESTROY); + LASSERTF(OUT_REF_ADD == 3, "found %lld\n", + (long long)OUT_REF_ADD); + LASSERTF(OUT_REF_DEL == 4, "found %lld\n", + (long long)OUT_REF_DEL); + LASSERTF(OUT_ATTR_SET == 5, "found %lld\n", + (long long)OUT_ATTR_SET); + LASSERTF(OUT_ATTR_GET == 6, "found %lld\n", + (long long)OUT_ATTR_GET); + LASSERTF(OUT_XATTR_SET == 7, "found %lld\n", + (long long)OUT_XATTR_SET); + LASSERTF(OUT_XATTR_GET == 8, "found %lld\n", + (long long)OUT_XATTR_GET); + LASSERTF(OUT_INDEX_LOOKUP == 9, "found %lld\n", + (long long)OUT_INDEX_LOOKUP); + LASSERTF(OUT_INDEX_LOOKUP == 9, "found %lld\n", + (long long)OUT_INDEX_LOOKUP); + LASSERTF(OUT_INDEX_INSERT == 10, "found %lld\n", + (long long)OUT_INDEX_INSERT); + LASSERTF(OUT_INDEX_DELETE == 11, "found %lld\n", + (long long)OUT_INDEX_DELETE); + LASSERTF(OUT_WRITE == 12, "found %lld\n", + (long long)OUT_WRITE); + LASSERTF(OUT_XATTR_DEL == 13, "found %lld\n", + (long long)OUT_XATTR_DEL); + LASSERTF(OUT_PUNCH == 14, "found %lld\n", + (long long)OUT_PUNCH); + LASSERTF(OUT_READ == 15, "found %lld\n", + (long long)OUT_READ); + LASSERTF(OUT_NOOP == 16, "found %lld\n", + (long long)OUT_NOOP); + LASSERTF(OUT_XATTR_LIST == 17, "found %lld\n", + (long long)OUT_XATTR_LIST); + + /* Checks for struct hsm_attrs */ + LASSERTF((int)sizeof(struct hsm_attrs) == 24, "found %lld\n", + (long long)(int)sizeof(struct hsm_attrs)); + LASSERTF((int)offsetof(struct hsm_attrs, hsm_compat) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_attrs, hsm_compat)); + LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_compat) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_attrs *)0)->hsm_compat)); + LASSERTF((int)offsetof(struct hsm_attrs, hsm_flags) == 4, "found %lld\n", + (long long)(int)offsetof(struct hsm_attrs, hsm_flags)); + LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_attrs *)0)->hsm_flags)); + LASSERTF((int)offsetof(struct hsm_attrs, hsm_arch_id) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_attrs, hsm_arch_id)); + LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_arch_id) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_attrs *)0)->hsm_arch_id)); + LASSERTF((int)offsetof(struct hsm_attrs, hsm_arch_ver) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_attrs, hsm_arch_ver)); + LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_arch_ver) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_attrs *)0)->hsm_arch_ver)); +#endif /* HAVE_SERVER_SUPPORT */ + + /* Checks for struct ost_id */ + LASSERTF((int)sizeof(struct ost_id) == 16, "found %lld\n", + (long long)(int)sizeof(struct ost_id)); + LASSERTF((int)offsetof(struct ost_id, oi) == 0, "found %lld\n", + (long long)(int)offsetof(struct ost_id, oi)); + LASSERTF((int)sizeof(((struct ost_id *)0)->oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct ost_id *)0)->oi)); + LASSERTF(LUSTRE_FID_INIT_OID == 1, "found %lld\n", + (long long)LUSTRE_FID_INIT_OID); + LASSERTF(FID_SEQ_OST_MDT0 == 0, "found %lld\n", + (long long)FID_SEQ_OST_MDT0); + LASSERTF(FID_SEQ_LLOG == 1, "found %lld\n", + (long long)FID_SEQ_LLOG); + LASSERTF(FID_SEQ_ECHO == 2, "found %lld\n", + 
(long long)FID_SEQ_ECHO); + LASSERTF(FID_SEQ_UNUSED_START == 3, "found %lld\n", + (long long)FID_SEQ_UNUSED_START); + LASSERTF(FID_SEQ_UNUSED_END == 9, "found %lld\n", + (long long)FID_SEQ_UNUSED_END); + LASSERTF(FID_SEQ_LLOG_NAME == 10, "found %lld\n", + (long long)FID_SEQ_LLOG_NAME); + LASSERTF(FID_SEQ_RSVD == 11, "found %lld\n", + (long long)FID_SEQ_RSVD); + LASSERTF(FID_SEQ_IGIF == 12, "found %lld\n", + (long long)FID_SEQ_IGIF); + LASSERTF(FID_SEQ_IGIF_MAX == 0x00000000ffffffffULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_IGIF_MAX); + LASSERTF(FID_SEQ_IDIF == 0x0000000100000000ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_IDIF); + LASSERTF(FID_SEQ_IDIF_MAX == 0x00000001ffffffffULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_IDIF_MAX); + LASSERTF(FID_SEQ_START == 0x0000000200000000ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_START); + LASSERTF(FID_SEQ_LOCAL_FILE == 0x0000000200000001ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_LOCAL_FILE); + LASSERTF(FID_SEQ_DOT_LUSTRE == 0x0000000200000002ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_DOT_LUSTRE); + LASSERTF(FID_SEQ_LOCAL_NAME == 0x0000000200000003ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_LOCAL_NAME); + LASSERTF(FID_SEQ_SPECIAL == 0x0000000200000004ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_SPECIAL); + LASSERTF(FID_SEQ_QUOTA == 0x0000000200000005ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_QUOTA); + LASSERTF(FID_SEQ_QUOTA_GLB == 0x0000000200000006ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_QUOTA_GLB); + LASSERTF(FID_SEQ_ROOT == 0x0000000200000007ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_ROOT); +#ifdef HAVE_SERVER_SUPPORT + LASSERTF(FID_SEQ_LAYOUT_RBTREE == 0x0000000200000008ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_LAYOUT_RBTREE); + LASSERTF(FID_SEQ_UPDATE_LOG == 0x0000000200000009ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_UPDATE_LOG); + LASSERTF(FID_SEQ_UPDATE_LOG_DIR == 0x000000020000000aULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_UPDATE_LOG_DIR); +#endif /* HAVE_SERVER_SUPPORT */ + LASSERTF(FID_SEQ_NORMAL == 0x0000000200000400ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_NORMAL); + LASSERTF(FID_SEQ_LOV_DEFAULT == 0xffffffffffffffffULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_LOV_DEFAULT); + LASSERTF(FID_OID_SPECIAL_BFL == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)FID_OID_SPECIAL_BFL); + LASSERTF(FID_OID_DOT_LUSTRE == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)FID_OID_DOT_LUSTRE); + LASSERTF(FID_OID_DOT_LUSTRE_OBF == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)FID_OID_DOT_LUSTRE_OBF); + LASSERTF(FID_OID_DOT_LUSTRE_LPF == 0x00000003UL, "found 0x%.8xUL\n", + (unsigned)FID_OID_DOT_LUSTRE_LPF); + + /* Checks for struct lu_dirent */ + LASSERTF((int)sizeof(struct lu_dirent) == 32, "found %lld\n", + (long long)(int)sizeof(struct lu_dirent)); + LASSERTF((int)offsetof(struct lu_dirent, lde_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_fid)); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_fid)); + LASSERTF((int)offsetof(struct lu_dirent, lde_hash) == 16, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_hash)); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_hash) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_hash)); + LASSERTF((int)offsetof(struct lu_dirent, lde_reclen) == 24, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_reclen)); + LASSERTF((int)sizeof(((struct 
lu_dirent *)0)->lde_reclen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_reclen)); + LASSERTF((int)offsetof(struct lu_dirent, lde_namelen) == 26, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_namelen)); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_namelen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_namelen)); + LASSERTF((int)offsetof(struct lu_dirent, lde_attrs) == 28, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_attrs)); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_attrs) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_attrs)); + LASSERTF((int)offsetof(struct lu_dirent, lde_name[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_name[0])); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_name[0]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_name[0])); + LASSERTF(LUDA_FID == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LUDA_FID); + LASSERTF(LUDA_TYPE == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)LUDA_TYPE); + LASSERTF(LUDA_64BITHASH == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)LUDA_64BITHASH); + + /* Checks for struct luda_type */ + LASSERTF((int)sizeof(struct luda_type) == 2, "found %lld\n", + (long long)(int)sizeof(struct luda_type)); + LASSERTF((int)offsetof(struct luda_type, lt_type) == 0, "found %lld\n", + (long long)(int)offsetof(struct luda_type, lt_type)); + LASSERTF((int)sizeof(((struct luda_type *)0)->lt_type) == 2, "found %lld\n", + (long long)(int)sizeof(((struct luda_type *)0)->lt_type)); + + /* Checks for struct lu_dirpage */ + LASSERTF((int)sizeof(struct lu_dirpage) == 24, "found %lld\n", + (long long)(int)sizeof(struct lu_dirpage)); + LASSERTF((int)offsetof(struct lu_dirpage, ldp_hash_start) == 0, "found %lld\n", + (long long)(int)offsetof(struct lu_dirpage, ldp_hash_start)); + LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_hash_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_hash_start)); + LASSERTF((int)offsetof(struct lu_dirpage, ldp_hash_end) == 8, "found %lld\n", + (long long)(int)offsetof(struct lu_dirpage, ldp_hash_end)); + LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_hash_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_hash_end)); + LASSERTF((int)offsetof(struct lu_dirpage, ldp_flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct lu_dirpage, ldp_flags)); + LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_flags)); + LASSERTF((int)offsetof(struct lu_dirpage, ldp_pad0) == 20, "found %lld\n", + (long long)(int)offsetof(struct lu_dirpage, ldp_pad0)); + LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_pad0) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_pad0)); + LASSERTF((int)offsetof(struct lu_dirpage, ldp_entries[0]) == 24, "found %lld\n", + (long long)(int)offsetof(struct lu_dirpage, ldp_entries[0])); + LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_entries[0]) == 32, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_entries[0])); + LASSERTF(LDF_EMPTY == 1, "found %lld\n", + (long long)LDF_EMPTY); + LASSERTF(LDF_COLLIDE == 2, "found %lld\n", + (long long)LDF_COLLIDE); + LASSERTF(LU_PAGE_SIZE == 4096, "found %lld\n", + (long long)LU_PAGE_SIZE); +#ifdef HAVE_SERVER_SUPPORT + /* Checks for union lu_page */ + 
LASSERTF((int)sizeof(union lu_page) == 4096, "found %lld\n", + (long long)(int)sizeof(union lu_page)); +#endif /* HAVE_SERVER_SUPPORT */ + + /* Checks for struct lu_ladvise */ + LASSERTF((int)sizeof(struct lu_ladvise) == 32, "found %lld\n", + (long long)(int)sizeof(struct lu_ladvise)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_advice) == 0, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_advice)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_advice) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_advice)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_value1) == 2, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_value1)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value1) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value1)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_value2) == 4, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_value2)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value2)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_start) == 8, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_start)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_start)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_end) == 16, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_end)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_end)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_value3) == 24, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_value3)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value3)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_value4) == 28, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_value4)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value4)); + LASSERTF(LU_LADVISE_WILLREAD == 1, "found %lld\n", + (long long)LU_LADVISE_WILLREAD); + LASSERTF(LU_LADVISE_DONTNEED == 2, "found %lld\n", + (long long)LU_LADVISE_DONTNEED); + LASSERTF(LU_LADVISE_LOCKNOEXPAND == 3, "found %lld\n", + (long long)LU_LADVISE_LOCKNOEXPAND); + LASSERTF(LU_LADVISE_LOCKAHEAD == 4, "found %lld\n", + (long long)LU_LADVISE_LOCKAHEAD); + + /* Checks for struct ladvise_hdr */ + LASSERTF((int)sizeof(struct ladvise_hdr) == 32, "found %lld\n", + (long long)(int)sizeof(struct ladvise_hdr)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_magic)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_magic)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_count)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_count)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_flags) == 8, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_flags)); + LASSERTF((int)sizeof(((struct ladvise_hdr 
*)0)->lah_flags) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_flags)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_value1) == 16, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_value1)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_value1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_value1)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_value2) == 20, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_value2)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_value2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_value2)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_value3) == 24, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_value3)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_value3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_value3)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_advise) == 32, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_advise)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_advise) == 0, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_advise)); + BUILD_BUG_ON(LF_ASYNC != 0x00000001); + BUILD_BUG_ON(LF_UNSET != 0x00000002); + BUILD_BUG_ON(LADVISE_MAGIC != 0x1adf1ce0); + + /* Checks for struct lustre_handle */ + LASSERTF((int)sizeof(struct lustre_handle) == 8, "found %lld\n", + (long long)(int)sizeof(struct lustre_handle)); + LASSERTF((int)offsetof(struct lustre_handle, cookie) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_handle, cookie)); + LASSERTF((int)sizeof(((struct lustre_handle *)0)->cookie) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_handle *)0)->cookie)); + + /* Checks for struct lustre_msg_v2 */ + LASSERTF((int)sizeof(struct lustre_msg_v2) == 32, "found %lld\n", + (long long)(int)sizeof(struct lustre_msg_v2)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_bufcount) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_bufcount)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_bufcount) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_bufcount)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_secflvr) == 4, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_secflvr)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_secflvr) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_secflvr)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_magic) == 8, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_magic)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_magic)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_repsize) == 12, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_repsize)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_cksum) == 16, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_cksum)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_flags) == 20, "found %lld\n", + 
(long long)(int)offsetof(struct lustre_msg_v2, lm_flags)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_flags)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_2) == 24, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_2)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_2)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_3) == 28, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_3)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_3)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_buflens[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_buflens[0])); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_buflens[0]) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_buflens[0])); + LASSERTF(LUSTRE_MSG_MAGIC_V2 == 0x0bd00bd3UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_MSG_MAGIC_V2); + LASSERTF(LUSTRE_MSG_MAGIC_V2_SWABBED == 0xd30bd00bUL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_MSG_MAGIC_V2_SWABBED); + + /* Checks for struct ptlrpc_body */ + LASSERTF((int)sizeof(struct ptlrpc_body_v3) == 184, "found %lld\n", + (long long)(int)sizeof(struct ptlrpc_body_v3)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_handle) == 0, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_handle)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_type) == 8, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_type)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_version) == 12, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_version)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_opc) == 16, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_opc)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_status) == 20, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_status)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_xid) == 24, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_last_xid)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_tag) == 32, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_tag)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_tag) == 2, "found %lld\n", + (long long)(int)sizeof(((struct 
ptlrpc_body_v3 *)0)->pb_tag)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding0) == 34, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_padding0)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding0) == 2, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding0)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding1) == 36, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_padding1)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding1)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_committed) == 40, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_last_committed)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_transno) == 48, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_transno)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_flags) == 56, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_flags)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_op_flags) == 60, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_op_flags)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt) == 64, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_timeout) == 68, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_timeout)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_service_time) == 72, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_service_time)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_limit) == 76, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_limit)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_slv) == 80, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_slv)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv)); + BUILD_BUG_ON(PTLRPC_NUM_VERSIONS != 4); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_pre_versions) == 88, "found %lld\n", + (long 
long)(int)offsetof(struct ptlrpc_body_v3, pb_pre_versions)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_mbits) == 120, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_mbits)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_mbits) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_mbits)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding64_0) == 128, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_padding64_0)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_0) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_0)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding64_1) == 136, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_padding64_1)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_1)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding64_2) == 144, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_padding64_2)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_2)); + BUILD_BUG_ON(LUSTRE_JOBID_SIZE != 32); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_jobid) == 152, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_jobid)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_jobid) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_jobid)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_handle) == (int)offsetof(struct ptlrpc_body_v2, pb_handle), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_handle), (int)offsetof(struct ptlrpc_body_v2, pb_handle)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_handle), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_handle)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_type) == (int)offsetof(struct ptlrpc_body_v2, pb_type), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_type), (int)offsetof(struct ptlrpc_body_v2, pb_type)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_type), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_type)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_version) == (int)offsetof(struct ptlrpc_body_v2, pb_version), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_version), (int)offsetof(struct ptlrpc_body_v2, pb_version)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_version), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_version)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_opc) == (int)offsetof(struct ptlrpc_body_v2, pb_opc), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_opc), (int)offsetof(struct ptlrpc_body_v2, pb_opc)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc) == (int)sizeof(((struct ptlrpc_body_v2 
*)0)->pb_opc), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_opc)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_status) == (int)offsetof(struct ptlrpc_body_v2, pb_status), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_status), (int)offsetof(struct ptlrpc_body_v2, pb_status)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_status), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_status)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_xid) == (int)offsetof(struct ptlrpc_body_v2, pb_last_xid), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_last_xid), (int)offsetof(struct ptlrpc_body_v2, pb_last_xid)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_xid), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_xid)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_tag) == (int)offsetof(struct ptlrpc_body_v2, pb_tag), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_tag), (int)offsetof(struct ptlrpc_body_v2, pb_tag)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_tag) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_tag), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_tag), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_tag)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding0) == (int)offsetof(struct ptlrpc_body_v2, pb_padding0), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_padding0), (int)offsetof(struct ptlrpc_body_v2, pb_padding0)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding0) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding0), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding0), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding0)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding1) == (int)offsetof(struct ptlrpc_body_v2, pb_padding1), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_padding1), (int)offsetof(struct ptlrpc_body_v2, pb_padding1)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding1) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding1), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding1), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding1)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_committed) == (int)offsetof(struct ptlrpc_body_v2, pb_last_committed), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_last_committed), (int)offsetof(struct ptlrpc_body_v2, pb_last_committed)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_committed), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_committed)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_transno) == (int)offsetof(struct ptlrpc_body_v2, pb_transno), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_transno), (int)offsetof(struct ptlrpc_body_v2, pb_transno)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_transno), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_transno)); + 
LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_flags) == (int)offsetof(struct ptlrpc_body_v2, pb_flags), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_flags), (int)offsetof(struct ptlrpc_body_v2, pb_flags)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_flags), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_flags)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_op_flags) == (int)offsetof(struct ptlrpc_body_v2, pb_op_flags), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_op_flags), (int)offsetof(struct ptlrpc_body_v2, pb_op_flags)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_op_flags), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_op_flags)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt) == (int)offsetof(struct ptlrpc_body_v2, pb_conn_cnt), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt), (int)offsetof(struct ptlrpc_body_v2, pb_conn_cnt)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_conn_cnt), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_conn_cnt)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_timeout) == (int)offsetof(struct ptlrpc_body_v2, pb_timeout), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_timeout), (int)offsetof(struct ptlrpc_body_v2, pb_timeout)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_timeout), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_timeout)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_service_time) == (int)offsetof(struct ptlrpc_body_v2, pb_service_time), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_service_time), (int)offsetof(struct ptlrpc_body_v2, pb_service_time)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_service_time), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_service_time)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_limit) == (int)offsetof(struct ptlrpc_body_v2, pb_limit), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_limit), (int)offsetof(struct ptlrpc_body_v2, pb_limit)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_limit), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_limit)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_slv) == (int)offsetof(struct ptlrpc_body_v2, pb_slv), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_slv), (int)offsetof(struct ptlrpc_body_v2, pb_slv)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_slv), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_slv)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_pre_versions) == (int)offsetof(struct ptlrpc_body_v2, pb_pre_versions), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_pre_versions), 
(int)offsetof(struct ptlrpc_body_v2, pb_pre_versions)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_pre_versions), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_pre_versions)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_mbits) == (int)offsetof(struct ptlrpc_body_v2, pb_mbits), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_mbits), (int)offsetof(struct ptlrpc_body_v2, pb_mbits)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_mbits) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_mbits), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_mbits), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_mbits)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding64_0) == (int)offsetof(struct ptlrpc_body_v2, pb_padding64_0), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_padding64_0), (int)offsetof(struct ptlrpc_body_v2, pb_padding64_0)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_0) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding64_0), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_0), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding64_0)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding64_1) == (int)offsetof(struct ptlrpc_body_v2, pb_padding64_1), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_padding64_1), (int)offsetof(struct ptlrpc_body_v2, pb_padding64_1)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_1) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding64_1), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_1), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding64_1)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding64_2) == (int)offsetof(struct ptlrpc_body_v2, pb_padding64_2), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_padding64_2), (int)offsetof(struct ptlrpc_body_v2, pb_padding64_2)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_2) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding64_2), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_2), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding64_2)); + LASSERTF(MSG_PTLRPC_BODY_OFF == 0, "found %lld\n", + (long long)MSG_PTLRPC_BODY_OFF); + LASSERTF(REQ_REC_OFF == 1, "found %lld\n", + (long long)REQ_REC_OFF); + LASSERTF(REPLY_REC_OFF == 1, "found %lld\n", + (long long)REPLY_REC_OFF); + LASSERTF(DLM_LOCKREQ_OFF == 1, "found %lld\n", + (long long)DLM_LOCKREQ_OFF); + LASSERTF(DLM_REQ_REC_OFF == 2, "found %lld\n", + (long long)DLM_REQ_REC_OFF); + LASSERTF(DLM_INTENT_IT_OFF == 2, "found %lld\n", + (long long)DLM_INTENT_IT_OFF); + LASSERTF(DLM_INTENT_REC_OFF == 3, "found %lld\n", + (long long)DLM_INTENT_REC_OFF); + LASSERTF(DLM_LOCKREPLY_OFF == 1, "found %lld\n", + (long long)DLM_LOCKREPLY_OFF); + LASSERTF(DLM_REPLY_REC_OFF == 2, "found %lld\n", + (long long)DLM_REPLY_REC_OFF); + LASSERTF(MSG_PTLRPC_HEADER_OFF == 31, "found %lld\n", + (long long)MSG_PTLRPC_HEADER_OFF); + LASSERTF(PTLRPC_MSG_VERSION == 0x00000003UL, "found 0x%.8xUL\n", + (unsigned)PTLRPC_MSG_VERSION); + LASSERTF(LUSTRE_VERSION_MASK == 0xffff0000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_VERSION_MASK); + LASSERTF(LUSTRE_OBD_VERSION == 0x00010000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_OBD_VERSION); + LASSERTF(LUSTRE_MDS_VERSION == 0x00020000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_MDS_VERSION); 
+ LASSERTF(LUSTRE_OST_VERSION == 0x00030000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_OST_VERSION); + LASSERTF(LUSTRE_DLM_VERSION == 0x00040000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_DLM_VERSION); + LASSERTF(LUSTRE_LOG_VERSION == 0x00050000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_LOG_VERSION); + LASSERTF(LUSTRE_MGS_VERSION == 0x00060000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_MGS_VERSION); + LASSERTF(MSGHDR_AT_SUPPORT == 1, "found %lld\n", + (long long)MSGHDR_AT_SUPPORT); + LASSERTF(MSGHDR_CKSUM_INCOMPAT18 == 2, "found %lld\n", + (long long)MSGHDR_CKSUM_INCOMPAT18); + LASSERTF(MSG_RESENT == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)MSG_RESENT); + LASSERTF(MSG_REPLAY == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)MSG_REPLAY); + LASSERTF(MSG_REQ_REPLAY_DONE == 0x00000040UL, "found 0x%.8xUL\n", + (unsigned)MSG_REQ_REPLAY_DONE); + LASSERTF(MSG_LOCK_REPLAY_DONE == 0x00000080UL, "found 0x%.8xUL\n", + (unsigned)MSG_LOCK_REPLAY_DONE); + LASSERTF(MSG_CONNECT_RECOVERING == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_RECOVERING); + LASSERTF(MSG_CONNECT_RECONNECT == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_RECONNECT); + LASSERTF(MSG_CONNECT_REPLAYABLE == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_REPLAYABLE); + LASSERTF(MSG_CONNECT_LIBCLIENT == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_LIBCLIENT); + LASSERTF(MSG_CONNECT_INITIAL == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_INITIAL); + LASSERTF(MSG_CONNECT_NEXT_VER == 0x00000080UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_NEXT_VER); + LASSERTF(MSG_CONNECT_TRANSNO == 0x00000100UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_TRANSNO); + + /* Checks for struct obd_connect_data */ + LASSERTF((int)sizeof(struct obd_connect_data) == 192, "found %lld\n", + (long long)(int)sizeof(struct obd_connect_data)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_connect_flags) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_connect_flags)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_version) == 8, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_version)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_version) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_version)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant) == 12, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_grant)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_index) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_index)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_index)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_brw_size) == 20, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_brw_size)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_brw_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_brw_size)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_ibits_known) == 24, "found %lld\n", + (long 
long)(int)offsetof(struct obd_connect_data, ocd_ibits_known)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_ibits_known) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_ibits_known)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant_blkbits) == 32, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_grant_blkbits)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant_blkbits) == 1, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant_blkbits)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant_inobits) == 33, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_grant_inobits)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant_inobits) == 1, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant_inobits)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant_tax_kb) == 34, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_grant_tax_kb)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant_tax_kb) == 2, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant_tax_kb)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant_max_blks) == 36, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_grant_max_blks)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant_max_blks) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant_max_blks)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_transno) == 40, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_transno)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_transno) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_transno)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_group) == 48, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_group)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_group) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_group)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_cksum_types) == 52, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_cksum_types)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_cksum_types) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_cksum_types)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_max_easize) == 56, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_max_easize)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_max_easize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_max_easize)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_instance) == 60, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_instance)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_instance) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_instance)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_maxbytes) == 64, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_maxbytes)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_maxbytes) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_maxbytes)); + LASSERTF((int)offsetof(struct 
obd_connect_data, ocd_maxmodrpcs) == 72, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_maxmodrpcs)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_maxmodrpcs) == 2, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_maxmodrpcs)); + LASSERTF((int)offsetof(struct obd_connect_data, padding0) == 74, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding0)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding0) == 2, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding0)); + LASSERTF((int)offsetof(struct obd_connect_data, padding1) == 76, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding1)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding1)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_connect_flags2) == 80, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_connect_flags2)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags2)); + LASSERTF((int)offsetof(struct obd_connect_data, padding3) == 88, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding3)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding3)); + LASSERTF((int)offsetof(struct obd_connect_data, padding4) == 96, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding4)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding4)); + LASSERTF((int)offsetof(struct obd_connect_data, padding5) == 104, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding5)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding5) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding5)); + LASSERTF((int)offsetof(struct obd_connect_data, padding6) == 112, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding6)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding6) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding6)); + LASSERTF((int)offsetof(struct obd_connect_data, padding7) == 120, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding7)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding7) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding7)); + LASSERTF((int)offsetof(struct obd_connect_data, padding8) == 128, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding8)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding8) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding8)); + LASSERTF((int)offsetof(struct obd_connect_data, padding9) == 136, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding9)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding9) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding9)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingA) == 144, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingA)); + LASSERTF((int)sizeof(((struct 
obd_connect_data *)0)->paddingA) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingA)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingB) == 152, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingB)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingB) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingB)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingC) == 160, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingC)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingC) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingC)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingD) == 168, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingD)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingD) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingD)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingE) == 176, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingE)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingE) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingE)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingF) == 184, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingF)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingF) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingF)); + LASSERTF(OBD_CONNECT_RDONLY == 0x1ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_RDONLY); + LASSERTF(OBD_CONNECT_INDEX == 0x2ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_INDEX); + LASSERTF(OBD_CONNECT_MDS == 0x4ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MDS); + LASSERTF(OBD_CONNECT_GRANT == 0x8ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_GRANT); + LASSERTF(OBD_CONNECT_SRVLOCK == 0x10ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_SRVLOCK); + LASSERTF(OBD_CONNECT_VERSION == 0x20ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_VERSION); + LASSERTF(OBD_CONNECT_REQPORTAL == 0x40ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_REQPORTAL); + LASSERTF(OBD_CONNECT_ACL == 0x80ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_ACL); + LASSERTF(OBD_CONNECT_XATTR == 0x100ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_XATTR); + LASSERTF(OBD_CONNECT_LARGE_ACL == 0x200ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LARGE_ACL); + LASSERTF(OBD_CONNECT_TRANSNO == 0x800ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_TRANSNO); + LASSERTF(OBD_CONNECT_IBITS == 0x1000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_IBITS); + LASSERTF(OBD_CONNECT_BARRIER == 0x2000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_BARRIER); + LASSERTF(OBD_CONNECT_ATTRFID == 0x4000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_ATTRFID); + LASSERTF(OBD_CONNECT_NODEVOH == 0x8000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_NODEVOH); + LASSERTF(OBD_CONNECT_RMT_CLIENT == 0x10000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_RMT_CLIENT); + LASSERTF(OBD_CONNECT_RMT_CLIENT_FORCE == 0x20000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_RMT_CLIENT_FORCE); + LASSERTF(OBD_CONNECT_BRW_SIZE == 0x40000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_BRW_SIZE); + LASSERTF(OBD_CONNECT_QUOTA64 == 0x80000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_QUOTA64); + LASSERTF(OBD_CONNECT_MDS_CAPA == 0x100000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MDS_CAPA); + LASSERTF(OBD_CONNECT_OSS_CAPA == 0x200000ULL, "found 
0x%.16llxULL\n", + OBD_CONNECT_OSS_CAPA); + LASSERTF(OBD_CONNECT_CANCELSET == 0x400000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_CANCELSET); + LASSERTF(OBD_CONNECT_SOM == 0x800000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_SOM); + LASSERTF(OBD_CONNECT_AT == 0x1000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_AT); + LASSERTF(OBD_CONNECT_LRU_RESIZE == 0x2000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LRU_RESIZE); + LASSERTF(OBD_CONNECT_MDS_MDS == 0x4000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MDS_MDS); + LASSERTF(OBD_CONNECT_REAL == 0x8000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_REAL); + LASSERTF(OBD_CONNECT_CHANGE_QS == 0x10000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_CHANGE_QS); + LASSERTF(OBD_CONNECT_CKSUM == 0x20000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_CKSUM); + LASSERTF(OBD_CONNECT_FID == 0x40000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_FID); + LASSERTF(OBD_CONNECT_VBR == 0x80000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_VBR); + LASSERTF(OBD_CONNECT_LOV_V3 == 0x100000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LOV_V3); + LASSERTF(OBD_CONNECT_GRANT_SHRINK == 0x200000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_GRANT_SHRINK); + LASSERTF(OBD_CONNECT_SKIP_ORPHAN == 0x400000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_SKIP_ORPHAN); + LASSERTF(OBD_CONNECT_MAX_EASIZE == 0x800000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MAX_EASIZE); + LASSERTF(OBD_CONNECT_FULL20 == 0x1000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_FULL20); + LASSERTF(OBD_CONNECT_LAYOUTLOCK == 0x2000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LAYOUTLOCK); + LASSERTF(OBD_CONNECT_64BITHASH == 0x4000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_64BITHASH); + LASSERTF(OBD_CONNECT_MAXBYTES == 0x8000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MAXBYTES); + LASSERTF(OBD_CONNECT_IMP_RECOV == 0x10000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_IMP_RECOV); + LASSERTF(OBD_CONNECT_JOBSTATS == 0x20000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_JOBSTATS); + LASSERTF(OBD_CONNECT_UMASK == 0x40000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_UMASK); + LASSERTF(OBD_CONNECT_EINPROGRESS == 0x80000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_EINPROGRESS); + LASSERTF(OBD_CONNECT_GRANT_PARAM == 0x100000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_GRANT_PARAM); + LASSERTF(OBD_CONNECT_FLOCK_OWNER == 0x200000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_FLOCK_OWNER); + LASSERTF(OBD_CONNECT_LVB_TYPE == 0x400000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LVB_TYPE); + LASSERTF(OBD_CONNECT_NANOSEC_TIME == 0x800000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_NANOSEC_TIME); + LASSERTF(OBD_CONNECT_LIGHTWEIGHT == 0x1000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LIGHTWEIGHT); + LASSERTF(OBD_CONNECT_SHORTIO == 0x2000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_SHORTIO); + LASSERTF(OBD_CONNECT_PINGLESS == 0x4000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_PINGLESS); + LASSERTF(OBD_CONNECT_FLOCK_DEAD == 0x8000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_FLOCK_DEAD); + LASSERTF(OBD_CONNECT_DISP_STRIPE == 0x10000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_DISP_STRIPE); + LASSERTF(OBD_CONNECT_OPEN_BY_FID == 0x20000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_OPEN_BY_FID); + LASSERTF(OBD_CONNECT_LFSCK == 0x40000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LFSCK); + LASSERTF(OBD_CONNECT_UNLINK_CLOSE == 0x100000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_UNLINK_CLOSE); + LASSERTF(OBD_CONNECT_MULTIMODRPCS == 
0x200000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MULTIMODRPCS); + LASSERTF(OBD_CONNECT_DIR_STRIPE == 0x400000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_DIR_STRIPE); + LASSERTF(OBD_CONNECT_SUBTREE == 0x800000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_SUBTREE); + LASSERTF(OBD_CONNECT_BULK_MBITS == 0x2000000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_BULK_MBITS); + LASSERTF(OBD_CONNECT_OBDOPACK == 0x4000000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_OBDOPACK); + LASSERTF(OBD_CONNECT_FLAGS2 == 0x8000000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_FLAGS2); + LASSERTF(OBD_CONNECT2_FILE_SECCTX == 0x1ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_FILE_SECCTX); + LASSERTF(OBD_CONNECT2_LOCKAHEAD == 0x2ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_LOCKAHEAD); + LASSERTF(OBD_CONNECT2_DIR_MIGRATE == 0x4ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_DIR_MIGRATE); + LASSERTF(OBD_CONNECT2_SUM_STATFS == 0x8ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_SUM_STATFS); + LASSERTF(OBD_CONNECT2_OVERSTRIPING == 0x10ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_OVERSTRIPING); + LASSERTF(OBD_CONNECT2_FLR == 0x20ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_FLR); + LASSERTF(OBD_CONNECT2_WBC_INTENTS == 0x40ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_WBC_INTENTS); + LASSERTF(OBD_CONNECT2_LOCK_CONVERT == 0x80ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_LOCK_CONVERT); + LASSERTF(OBD_CONNECT2_ARCHIVE_ID_ARRAY == 0x100ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_ARCHIVE_ID_ARRAY); + LASSERTF(OBD_CONNECT2_INC_XID == 0x200ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_INC_XID); + LASSERTF(OBD_CONNECT2_SELINUX_POLICY == 0x400ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_SELINUX_POLICY); + LASSERTF(OBD_CONNECT2_LSOM == 0x800ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_LSOM); + LASSERTF(OBD_CONNECT2_PCC == 0x1000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_PCC); + LASSERTF(OBD_CONNECT2_CRUSH == 0x2000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_CRUSH); + LASSERTF(OBD_CONNECT2_ASYNC_DISCARD == 0x4000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_ASYNC_DISCARD); + LASSERTF(OBD_CONNECT2_ENCRYPT == 0x8000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_ENCRYPT); + LASSERTF(OBD_CONNECT2_FIDMAP == 0x10000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_FIDMAP); + LASSERTF(OBD_CONNECT2_GETATTR_PFID == 0x20000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_GETATTR_PFID); + LASSERTF(OBD_CONNECT2_LSEEK == 0x40000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_LSEEK); + LASSERTF(OBD_CONNECT2_DOM_LVB == 0x80000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_DOM_LVB); + LASSERTF(OBD_CONNECT2_REP_MBITS == 0x100000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_REP_MBITS); + LASSERTF(OBD_CONNECT2_MODE_CONVERT == 0x200000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_MODE_CONVERT); + LASSERTF(OBD_CONNECT2_BATCH_RPC == 0x400000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_BATCH_RPC); + LASSERTF(OBD_CONNECT2_PCCRO == 0x800000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_PCCRO); + LASSERTF(OBD_CONNECT2_ATOMIC_OPEN_LOCK == 0x4000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_ATOMIC_OPEN_LOCK); + LASSERTF(OBD_CONNECT2_ENCRYPT_NAME == 0x8000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_ENCRYPT_NAME); + LASSERTF(OBD_CONNECT2_MDLL_BYPASS == 0x800000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_MDLL_BYPASS); + LASSERTF(OBD_CONNECT2_MDLL == 0x1000000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_MDLL); + LASSERTF(OBD_CONNECT2_MDLL_AUTO_REFRESH == 0x2000000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_MDLL_AUTO_REFRESH); + 
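/* Descriptive note (added annotation): the OBD_CKSUM_* values below are the checksum type flags a client and server exchange in ocd_cksum_types of struct obd_connect_data (checked above); like the struct offsets, these bit values are wire-protocol constants and must not be renumbered. */ + 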
LASSERTF(OBD_CKSUM_CRC32 == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_CRC32); + LASSERTF(OBD_CKSUM_ADLER == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_ADLER); + LASSERTF(OBD_CKSUM_CRC32C == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_CRC32C); + LASSERTF(OBD_CKSUM_RESERVED == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_RESERVED); + LASSERTF(OBD_CKSUM_T10IP512 == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_T10IP512); + LASSERTF(OBD_CKSUM_T10IP4K == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_T10IP4K); + LASSERTF(OBD_CKSUM_T10CRC512 == 0x00000040UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_T10CRC512); + LASSERTF(OBD_CKSUM_T10CRC4K == 0x00000080UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_T10CRC4K); + LASSERTF(OBD_CKSUM_T10_TOP == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_T10_TOP); + + /* Checks for struct ost_layout */ + LASSERTF((int)sizeof(struct ost_layout) == 28, "found %lld\n", + (long long)(int)sizeof(struct ost_layout)); + LASSERTF((int)offsetof(struct ost_layout, ol_stripe_size) == 0, "found %lld\n", + (long long)(int)offsetof(struct ost_layout, ol_stripe_size)); + LASSERTF((int)sizeof(((struct ost_layout *)0)->ol_stripe_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ost_layout *)0)->ol_stripe_size)); + LASSERTF((int)offsetof(struct ost_layout, ol_stripe_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct ost_layout, ol_stripe_count)); + LASSERTF((int)sizeof(((struct ost_layout *)0)->ol_stripe_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ost_layout *)0)->ol_stripe_count)); + LASSERTF((int)offsetof(struct ost_layout, ol_comp_start) == 8, "found %lld\n", + (long long)(int)offsetof(struct ost_layout, ol_comp_start)); + LASSERTF((int)sizeof(((struct ost_layout *)0)->ol_comp_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_layout *)0)->ol_comp_start)); + LASSERTF((int)offsetof(struct ost_layout, ol_comp_end) == 16, "found %lld\n", + (long long)(int)offsetof(struct ost_layout, ol_comp_end)); + LASSERTF((int)sizeof(((struct ost_layout *)0)->ol_comp_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_layout *)0)->ol_comp_end)); + LASSERTF((int)offsetof(struct ost_layout, ol_comp_id) == 24, "found %lld\n", + (long long)(int)offsetof(struct ost_layout, ol_comp_id)); + LASSERTF((int)sizeof(((struct ost_layout *)0)->ol_comp_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ost_layout *)0)->ol_comp_id)); + + /* Checks for struct obdo */ + LASSERTF((int)sizeof(struct obdo) == 208, "found %lld\n", + (long long)(int)sizeof(struct obdo)); + LASSERTF((int)offsetof(struct obdo, o_valid) == 0, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_valid)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_valid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_valid)); + LASSERTF((int)offsetof(struct obdo, o_oi) == 8, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_oi)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_oi)); + LASSERTF((int)offsetof(struct obdo, o_parent_seq) == 24, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_parent_seq)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_seq) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_parent_seq)); + LASSERTF((int)offsetof(struct obdo, o_size) == 32, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_size)); + 
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_size) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_size));
+	LASSERTF((int)offsetof(struct obdo, o_mtime) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_mtime));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_mtime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_mtime));
+	LASSERTF((int)offsetof(struct obdo, o_atime) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_atime));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_atime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_atime));
+	LASSERTF((int)offsetof(struct obdo, o_ctime) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_ctime));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_ctime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_ctime));
+	LASSERTF((int)offsetof(struct obdo, o_blocks) == 64, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_blocks));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_blocks) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_blocks));
+	LASSERTF((int)offsetof(struct obdo, o_grant) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_grant));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_grant) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_grant));
+	LASSERTF((int)offsetof(struct obdo, o_blksize) == 80, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_blksize));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_blksize) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_blksize));
+	LASSERTF((int)offsetof(struct obdo, o_mode) == 84, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_mode));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_mode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_mode));
+	LASSERTF((int)offsetof(struct obdo, o_uid) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_uid));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_uid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_uid));
+	LASSERTF((int)offsetof(struct obdo, o_gid) == 92, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_gid));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_gid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_gid));
+	LASSERTF((int)offsetof(struct obdo, o_flags) == 96, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_flags));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_flags));
+	LASSERTF((int)offsetof(struct obdo, o_nlink) == 100, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_nlink));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_nlink) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_nlink));
+	LASSERTF((int)offsetof(struct obdo, o_parent_oid) == 104, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_parent_oid));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_oid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_parent_oid));
+	LASSERTF((int)offsetof(struct obdo, o_misc) == 108, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_misc));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_misc) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_misc));
+	LASSERTF((int)offsetof(struct obdo, o_ioepoch) == 112, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_ioepoch));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_ioepoch) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_ioepoch));
+	LASSERTF((int)offsetof(struct obdo, o_stripe_idx) == 120, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_stripe_idx));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_stripe_idx) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_stripe_idx));
+	LASSERTF((int)offsetof(struct obdo, o_parent_ver) == 124, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_parent_ver));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_ver) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_parent_ver));
+	LASSERTF((int)offsetof(struct obdo, o_handle) == 128, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_handle));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_handle) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_handle));
+	LASSERTF((int)offsetof(struct obdo, o_layout) == 136, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_layout));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_layout) == 28, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_layout));
+	LASSERTF((int)offsetof(struct obdo, o_layout_version) == 164, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_layout_version));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_layout_version) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_layout_version));
+	LASSERTF((int)offsetof(struct obdo, o_uid_h) == 168, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_uid_h));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_uid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_uid_h));
+	LASSERTF((int)offsetof(struct obdo, o_gid_h) == 172, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_gid_h));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_gid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_gid_h));
+	LASSERTF((int)offsetof(struct obdo, o_data_version) == 176, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_data_version));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_data_version) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_data_version));
+	LASSERTF((int)offsetof(struct obdo, o_projid) == 184, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_projid));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_projid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_projid));
+	LASSERTF((int)offsetof(struct obdo, o_padding_4) == 188, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_padding_4));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_4) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_padding_4));
+	LASSERTF((int)offsetof(struct obdo, o_padding_5) == 192, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_padding_5));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_5) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_padding_5));
+	LASSERTF((int)offsetof(struct obdo, o_padding_6) == 200, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_padding_6));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_6) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_padding_6));
+	LASSERTF(OBD_MD_FLID == (0x00000001ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLID);
+	LASSERTF(OBD_MD_FLATIME == (0x00000002ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLATIME);
+	LASSERTF(OBD_MD_FLMTIME == (0x00000004ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLMTIME);
+	LASSERTF(OBD_MD_FLCTIME == (0x00000008ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLCTIME);
+	LASSERTF(OBD_MD_FLSIZE == (0x00000010ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLSIZE);
+	LASSERTF(OBD_MD_FLBLOCKS == (0x00000020ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLBLOCKS);
+	LASSERTF(OBD_MD_FLBLKSZ == (0x00000040ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLBLKSZ);
+	LASSERTF(OBD_MD_FLMODE == (0x00000080ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLMODE);
+	LASSERTF(OBD_MD_FLTYPE == (0x00000100ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLTYPE);
+	LASSERTF(OBD_MD_FLUID == (0x00000200ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLUID);
+	LASSERTF(OBD_MD_FLGID == (0x00000400ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLGID);
+	LASSERTF(OBD_MD_FLFLAGS == (0x00000800ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLFLAGS);
+	LASSERTF(OBD_MD_DOM_SIZE == (0X00001000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_DOM_SIZE);
+	LASSERTF(OBD_MD_FLNLINK == (0x00002000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLNLINK);
+	LASSERTF(OBD_MD_FLPARENT == (0x00004000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLPARENT);
+	LASSERTF(OBD_MD_FLRDEV == (0x00010000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLRDEV);
+	LASSERTF(OBD_MD_FLEASIZE == (0x00020000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLEASIZE);
+	LASSERTF(OBD_MD_LINKNAME == (0x00040000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_LINKNAME);
+	LASSERTF(OBD_MD_FLHANDLE == (0x00080000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLHANDLE);
+	LASSERTF(OBD_MD_FLCKSUM == (0x00100000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLCKSUM);
+	LASSERTF(OBD_MD_FLPRJQUOTA == (0x00400000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLPRJQUOTA);
+	LASSERTF(OBD_MD_FLGROUP == (0x01000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLGROUP);
+	LASSERTF(OBD_MD_FLFID == (0x02000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLFID);
+	LASSERTF(OBD_MD_FLGRANT == (0x08000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLGRANT);
+	LASSERTF(OBD_MD_FLDIREA == (0x10000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLDIREA);
+	LASSERTF(OBD_MD_FLUSRQUOTA == (0x20000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLUSRQUOTA);
+	LASSERTF(OBD_MD_FLGRPQUOTA == (0x40000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLGRPQUOTA);
+	LASSERTF(OBD_MD_FLMODEASIZE == (0x80000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLMODEASIZE);
+	LASSERTF(OBD_MD_MDS == (0x0000000100000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_MDS);
+	LASSERTF(OBD_MD_MEA == (0x0000000400000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_MEA);
+	LASSERTF(OBD_MD_TSTATE == (0x0000000800000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_TSTATE);
+	LASSERTF(OBD_MD_FLXATTR == (0x0000001000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLXATTR);
+	LASSERTF(OBD_MD_FLXATTRLS == (0x0000002000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLXATTRLS);
+	LASSERTF(OBD_MD_FLXATTRRM == (0x0000004000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLXATTRRM);
+	LASSERTF(OBD_MD_FLACL == (0x0000008000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLACL);
+	LASSERTF(OBD_MD_FLAGSTATFS == (0x0000010000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLAGSTATFS);
+	LASSERTF(OBD_MD_FLCROSSREF == (0x0000100000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLCROSSREF);
+	LASSERTF(OBD_MD_FLGETATTRLOCK == (0x0000200000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLGETATTRLOCK);
+#ifdef HAVE_SERVER_SUPPORT
+	LASSERTF(OBD_MD_FLOBJCOUNT == (0x0000400000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLOBJCOUNT);
+#endif /* HAVE_SERVER_SUPPORT */
+	LASSERTF(OBD_MD_FLDATAVERSION == (0x0010000000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLDATAVERSION);
+	LASSERTF(OBD_MD_CLOSE_INTENT_EXECED == (0x0020000000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_CLOSE_INTENT_EXECED);
+	LASSERTF(OBD_MD_DEFAULT_MEA == (0x0040000000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_DEFAULT_MEA);
+	LASSERTF(OBD_MD_FLOSTLAYOUT == (0x0080000000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLOSTLAYOUT);
+	LASSERTF(OBD_MD_FLPROJID == (0x0100000000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLPROJID);
+	LASSERTF(OBD_MD_SECCTX == (0x0200000000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_SECCTX);
+	LASSERTF(OBD_MD_FLLAZYSIZE == (0x0400000000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLLAZYSIZE);
+	LASSERTF(OBD_MD_FLLAZYBLOCKS == (0x0800000000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLLAZYBLOCKS);
+	LASSERTF(OBD_MD_ENCCTX == (0x2000000000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_ENCCTX);
+	BUILD_BUG_ON(OBD_FL_INLINEDATA != 0x00000001);
+	BUILD_BUG_ON(OBD_FL_OBDMDEXISTS != 0x00000002);
+	BUILD_BUG_ON(OBD_FL_DELORPHAN != 0x00000004);
+	BUILD_BUG_ON(OBD_FL_NORPC != 0x00000008);
+	BUILD_BUG_ON(OBD_FL_IDONLY != 0x00000010);
+	BUILD_BUG_ON(OBD_FL_RECREATE_OBJS != 0x00000020);
+	BUILD_BUG_ON(OBD_FL_DEBUG_CHECK != 0x00000040);
+	BUILD_BUG_ON(OBD_FL_NO_PRJQUOTA != 0x00000080);
+	BUILD_BUG_ON(OBD_FL_NO_USRQUOTA != 0x00000100);
+	BUILD_BUG_ON(OBD_FL_NO_GRPQUOTA != 0x00000200);
+	BUILD_BUG_ON(OBD_FL_CREATE_CROW != 0x00000400);
+	BUILD_BUG_ON(OBD_FL_SRVLOCK != 0x00000800);
+	BUILD_BUG_ON(OBD_FL_CKSUM_CRC32 != 0x00001000);
+	BUILD_BUG_ON(OBD_FL_CKSUM_ADLER != 0x00002000);
+	BUILD_BUG_ON(OBD_FL_CKSUM_CRC32C != 0x00004000);
+	BUILD_BUG_ON(OBD_FL_CKSUM_T10IP512 != 0x00005000);
+	BUILD_BUG_ON(OBD_FL_CKSUM_T10IP4K != 0x00006000);
+	BUILD_BUG_ON(OBD_FL_CKSUM_T10CRC512 != 0x00007000);
+	BUILD_BUG_ON(OBD_FL_CKSUM_T10CRC4K != 0x00008000);
+	BUILD_BUG_ON(OBD_FL_CKSUM_RSVD3 != 0x00010000);
+	BUILD_BUG_ON(OBD_FL_SHRINK_GRANT != 0x00020000);
+	BUILD_BUG_ON(OBD_FL_MMAP != 0x00040000);
+	BUILD_BUG_ON(OBD_FL_RECOV_RESEND != 0x00080000);
+	BUILD_BUG_ON(OBD_FL_NOSPC_BLK != 0x00100000);
+	BUILD_BUG_ON(OBD_FL_FLUSH != 0x00200000);
+	BUILD_BUG_ON(OBD_FL_SHORT_IO != 0x00400000);
+
+	/* Checks for struct lov_ost_data_v1 */
+	LASSERTF((int)sizeof(struct lov_ost_data_v1) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct lov_ost_data_v1));
+	LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_oi) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_oi));
+	LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_oi) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_oi));
+	LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_gen) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_gen));
+	LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_gen) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_gen));
+	LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_idx) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_idx));
+	LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx));
+
+	/* Checks for struct lov_mds_md_v1 */
+	LASSERTF((int)sizeof(struct lov_mds_md_v1) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct lov_mds_md_v1));
+	LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_magic) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v1, lmm_magic));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_magic) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_magic));
+	LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_pattern) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v1, lmm_pattern));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_pattern) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_pattern));
+	LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_oi) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v1, lmm_oi));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_oi) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_oi));
+	LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_stripe_size) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v1, lmm_stripe_size));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_size) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_size));
+	LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_stripe_count) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v1, lmm_stripe_count));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_count) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_count));
+	LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_layout_gen) == 30, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v1, lmm_layout_gen));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_layout_gen) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_layout_gen));
+	LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_objects[0]) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v1, lmm_objects[0]));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_objects[0]) == 24, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_objects[0]));
+	BUILD_BUG_ON(LOV_MAGIC_V1 != (0x0BD10000 | 0x0BD0));
+
+	/* Checks for struct lov_mds_md_v3 */
+	LASSERTF((int)sizeof(struct lov_mds_md_v3) == 48, "found %lld\n",
+		 (long long)(int)sizeof(struct lov_mds_md_v3));
+	LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_magic) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_magic));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_magic) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_magic));
+	LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_pattern) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_pattern));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pattern) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pattern));
+	LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_oi) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_oi));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_oi) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_oi));
+	LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_stripe_size) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_stripe_size));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_size) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_size));
+	LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_stripe_count) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_stripe_count));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_count) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_count));
+	LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_layout_gen) == 30, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_layout_gen));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_layout_gen) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_layout_gen));
+	BUILD_BUG_ON(LOV_MAXPOOLNAME != 15);
+	LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_pool_name[15 + 1]) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_pool_name[15 + 1]));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pool_name[15 + 1]) == 1, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pool_name[15 + 1]));
+	LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_objects[0]) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_objects[0]));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects[0]) == 24, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects[0]));
+	BUILD_BUG_ON(LOV_MAGIC_V3 != (0x0BD30000 | 0x0BD0));
+	LASSERTF(LOV_PATTERN_RAID0 == 0x00000001UL, "found 0x%.8xUL\n",
+		 (unsigned)LOV_PATTERN_RAID0);
+	LASSERTF(LOV_PATTERN_RAID1 == 0x00000002UL, "found 0x%.8xUL\n",
+		 (unsigned)LOV_PATTERN_RAID1);
+	LASSERTF(LOV_PATTERN_MDT == 0x00000100UL, "found 0x%.8xUL\n",
+		 (unsigned)LOV_PATTERN_MDT);
+	LASSERTF(LOV_PATTERN_OVERSTRIPING == 0x00000200UL, "found 0x%.8xUL\n",
+		 (unsigned)LOV_PATTERN_OVERSTRIPING);
+
+	/* Checks for struct lov_comp_md_entry_v1 */
+	LASSERTF((int)sizeof(struct lov_comp_md_entry_v1) == 48, "found %lld\n",
+		 (long long)(int)sizeof(struct lov_comp_md_entry_v1));
+	LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_id) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_id));
+	LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_id));
+	LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_flags) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_flags));
+	LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_flags));
+	LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_extent) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_extent));
+	LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_extent) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_extent));
+	LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_offset) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_offset));
+	LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_offset) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_offset));
+	LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_size) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_size));
+	LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_size) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_size));
+	LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_layout_gen) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_layout_gen));
+	LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_layout_gen) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_layout_gen));
+	LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_timestamp) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_timestamp));
+	LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_timestamp) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_timestamp));
+	LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_padding_1) == 44, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_padding_1));
+	LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_padding_1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_padding_1));
+	BUILD_BUG_ON(LCME_FL_STALE != 0x00000001);
+	BUILD_BUG_ON(LCME_FL_PREF_RD != 0x00000002);
+	BUILD_BUG_ON(LCME_FL_PREF_WR != 0x00000004);
+	BUILD_BUG_ON(LCME_FL_PREF_RW != 0x00000006);
+	BUILD_BUG_ON(LCME_FL_OFFLINE != 0x00000008);
+	BUILD_BUG_ON(LCME_FL_INIT != 0x00000010);
+	BUILD_BUG_ON(LCME_FL_NOSYNC != 0x00000020);
+	BUILD_BUG_ON(LCME_FL_EXTENSION != 0x00000040);
+	BUILD_BUG_ON(LCME_FL_NEG != 0x80000000);
+
+	/* Checks for struct lov_comp_md_v1 */
+	LASSERTF((int)sizeof(struct lov_comp_md_v1) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct lov_comp_md_v1));
+	LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_magic) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_comp_md_v1, lcm_magic));
+	LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_magic) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_magic));
+	LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_size) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_comp_md_v1, lcm_size));
+	LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_size) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_size));
+	LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_layout_gen) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_comp_md_v1, lcm_layout_gen));
+	LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_layout_gen) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_layout_gen));
+	LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_flags) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_comp_md_v1, lcm_flags));
+	LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_flags) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_flags));
+	LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_entry_count) == 14, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_comp_md_v1, lcm_entry_count));
+	LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entry_count) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entry_count));
+	LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_mirror_count) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_comp_md_v1, lcm_mirror_count));
+	LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_mirror_count) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_mirror_count));
+	LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_padding1) == 18, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_comp_md_v1, lcm_padding1));
+	LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding1) == 6, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding1));
+	LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_padding2) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_comp_md_v1, lcm_padding2));
+	LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding2));
+	LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_entries[0]) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_comp_md_v1, lcm_entries[0]));
+	LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entries[0]) == 48, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entries[0]));
+	BUILD_BUG_ON(LOV_MAGIC_COMP_V1 != (0x0BD60000 | 0x0BD0));
+	LASSERTF(LCM_FL_NONE == 0, "found %lld\n",
+		 (long long)LCM_FL_NONE);
+	LASSERTF(LCM_FL_RDONLY == 1, "found %lld\n",
+		 (long long)LCM_FL_RDONLY);
+	LASSERTF(LCM_FL_WRITE_PENDING == 2, "found %lld\n",
+		 (long long)LCM_FL_WRITE_PENDING);
+	LASSERTF(LCM_FL_SYNC_PENDING == 3, "found %lld\n",
+		 (long long)LCM_FL_SYNC_PENDING);
+	LASSERTF(LCM_FL_PCC_RDONLY == 8, "found %lld\n",
+		 (long long)LCM_FL_PCC_RDONLY);
+
+	/* Checks for struct lmv_mds_md_v1 */
+	LASSERTF((int)sizeof(struct lmv_mds_md_v1) == 56, "found %lld\n",
+		 (long long)(int)sizeof(struct lmv_mds_md_v1));
+	LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_magic) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_magic));
+	LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_magic) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_magic));
+	LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_stripe_count) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_stripe_count));
+	LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_stripe_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_stripe_count));
+	LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_master_mdt_index) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_master_mdt_index));
+	LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_master_mdt_index) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_master_mdt_index));
+	LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_hash_type) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_hash_type));
+	LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_hash_type) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_hash_type));
+	LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_layout_version) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_layout_version));
+	LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_layout_version) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_layout_version));
+	LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_migrate_offset) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_migrate_offset));
+	LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_migrate_offset) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_migrate_offset));
+	LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_migrate_hash) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_migrate_hash));
+	LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_migrate_hash) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_migrate_hash));
+	LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_padding2) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_padding2));
+	LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding2));
+	LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_padding3) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_padding3));
+	LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding3) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding3));
+	LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_pool_name[15 + 1]) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_pool_name[15 + 1]));
+	LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_pool_name[15 + 1]) == 1, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_pool_name[15 + 1]));
+	LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_stripe_fids[0]) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_stripe_fids[0]));
+	LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_stripe_fids[0]) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_stripe_fids[0]));
+	BUILD_BUG_ON(LMV_MAGIC_V1 != 0x0CD20CD0);
+	BUILD_BUG_ON(LMV_MAGIC_STRIPE != 0x0CD40CD0);
+	BUILD_BUG_ON(LMV_HASH_TYPE_MASK != 0x0000ffff);
+	BUILD_BUG_ON(LMV_HASH_FLAG_FIXED != 0x02000000);
+	BUILD_BUG_ON(LMV_HASH_FLAG_MERGE != 0x04000000);
+	BUILD_BUG_ON(LMV_HASH_FLAG_SPLIT != 0x08000000);
+	BUILD_BUG_ON(LMV_HASH_FLAG_LOST_LMV != 0x10000000);
+	BUILD_BUG_ON(LMV_HASH_FLAG_BAD_TYPE != 0x20000000);
+	BUILD_BUG_ON(LMV_HASH_FLAG_MIGRATION != 0x80000000);
+	BUILD_BUG_ON(LMV_CRUSH_PG_COUNT != 4096);
+
+	/* Checks for struct obd_statfs */
+	LASSERTF((int)sizeof(struct obd_statfs) == 144, "found %lld\n",
+		 (long long)(int)sizeof(struct obd_statfs));
+	LASSERTF((int)offsetof(struct obd_statfs, os_type) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_type));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_type) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_type));
+	LASSERTF((int)offsetof(struct obd_statfs, os_blocks) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_blocks));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_blocks) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_blocks));
+	LASSERTF((int)offsetof(struct obd_statfs, os_bfree) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_bfree));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bfree) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_bfree));
+	LASSERTF((int)offsetof(struct obd_statfs, os_bavail) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_bavail));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bavail) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_bavail));
+	LASSERTF((int)offsetof(struct obd_statfs, os_files) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_files));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_files) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_files));
+	LASSERTF((int)offsetof(struct obd_statfs, os_ffree) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_ffree));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_ffree) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_ffree));
+	LASSERTF((int)offsetof(struct obd_statfs, os_fsid) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_fsid));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_fsid) == 40, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_fsid));
+	LASSERTF((int)offsetof(struct obd_statfs, os_bsize) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_bsize));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bsize) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_bsize));
+	LASSERTF((int)offsetof(struct obd_statfs, os_namelen) == 92, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_namelen));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_namelen) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_namelen));
+	LASSERTF((int)offsetof(struct obd_statfs, os_maxbytes) == 96, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_maxbytes));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_maxbytes) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_maxbytes));
+	LASSERTF((int)offsetof(struct obd_statfs, os_state) == 104, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_state));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_state) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_state));
+	LASSERTF((int)offsetof(struct obd_statfs, os_fprecreated) == 108, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_fprecreated));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_fprecreated) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_fprecreated));
+	LASSERTF((int)offsetof(struct obd_statfs, os_granted) == 112, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_granted));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_granted) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_granted));
+	LASSERTF((int)offsetof(struct obd_statfs, os_spare3) == 116, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_spare3));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare3) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare3));
+	LASSERTF((int)offsetof(struct obd_statfs, os_spare4) == 120, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_spare4));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare4) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare4));
+	LASSERTF((int)offsetof(struct obd_statfs, os_spare5) == 124, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_spare5));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare5) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare5));
+	LASSERTF((int)offsetof(struct obd_statfs, os_spare6) == 128, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_spare6));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare6) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare6));
+	LASSERTF((int)offsetof(struct obd_statfs, os_spare7) == 132, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_spare7));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare7) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare7));
+	LASSERTF((int)offsetof(struct obd_statfs, os_spare8) == 136, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_spare8));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare8) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare8));
+	LASSERTF((int)offsetof(struct obd_statfs, os_spare9) == 140, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_spare9));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare9) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare9));
+	LASSERTF(OS_STATFS_DEGRADED == 0x00000001UL, "found 0x%.8xUL\n",
+		 (unsigned)OS_STATFS_DEGRADED);
+	LASSERTF(OS_STATFS_READONLY == 0x00000002UL, "found 0x%.8xUL\n",
+		 (unsigned)OS_STATFS_READONLY);
+	LASSERTF(OS_STATFS_NOPRECREATE == 0x00000004UL, "found 0x%.8xUL\n",
+		 (unsigned)OS_STATFS_NOPRECREATE);
+	LASSERTF(OS_STATFS_ENOSPC == 0x00000020UL, "found 0x%.8xUL\n",
+		 (unsigned)OS_STATFS_ENOSPC);
+	LASSERTF(OS_STATFS_ENOINO == 0x00000040UL, "found 0x%.8xUL\n",
+		 (unsigned)OS_STATFS_ENOINO);
+	LASSERTF(OS_STATFS_SUM == 0x00000100UL, "found 0x%.8xUL\n",
+		 (unsigned)OS_STATFS_SUM);
+	LASSERTF(OS_STATFS_NONROT == 0x00000200UL, "found 0x%.8xUL\n",
+		 (unsigned)OS_STATFS_NONROT);
+
+	/* Checks for struct obd_ioobj */
+	LASSERTF((int)sizeof(struct obd_ioobj) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct obd_ioobj));
+	LASSERTF((int)offsetof(struct obd_ioobj, ioo_oid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_ioobj, ioo_oid));
+	LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_oid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_oid));
+	LASSERTF((int)offsetof(struct obd_ioobj, ioo_max_brw) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_ioobj, ioo_max_brw));
+	LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_max_brw) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_max_brw));
+	LASSERTF((int)offsetof(struct obd_ioobj, ioo_bufcnt) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_ioobj, ioo_bufcnt));
+	LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_bufcnt) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_bufcnt));
+	LASSERTF(IOOBJ_MAX_BRW_BITS == 16, "found %lld\n",
+		 (long long)IOOBJ_MAX_BRW_BITS);
+
+	/* Checks for union lquota_id */
+	LASSERTF((int)sizeof(union lquota_id) == 16, "found %lld\n",
+		 (long long)(int)sizeof(union lquota_id));
+
+	LASSERTF(QIF_DQBLKSIZE_BITS == 10, "found %lld\n",
+		 (long long)QIF_DQBLKSIZE_BITS);
+	LASSERTF(QIF_DQBLKSIZE == 1024, "found %lld\n",
+		 (long long)QIF_DQBLKSIZE);
+
+	/* Checks for struct obd_quotactl */
+	LASSERTF((int)sizeof(struct obd_quotactl) == 112, "found %lld\n",
+		 (long long)(int)sizeof(struct obd_quotactl));
+	LASSERTF((int)offsetof(struct obd_quotactl, qc_cmd) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_quotactl, qc_cmd));
+	LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_cmd) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_cmd));
+	LASSERTF((int)offsetof(struct obd_quotactl, qc_type) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_quotactl, qc_type));
+	LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_type) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_type));
+	LASSERTF((int)offsetof(struct obd_quotactl, qc_id) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_quotactl, qc_id));
+	LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_id));
+	LASSERTF((int)offsetof(struct obd_quotactl, qc_stat) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_quotactl, qc_stat));
+	LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_stat) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_stat));
+	LASSERTF((int)offsetof(struct obd_quotactl, qc_dqinfo) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_quotactl, qc_dqinfo));
+	LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_dqinfo) == 24, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_dqinfo));
+	LASSERTF((int)offsetof(struct obd_quotactl, qc_dqblk) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_quotactl, qc_dqblk));
+	LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_dqblk) == 72, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_dqblk));
+
+	/* Checks for struct obd_dqinfo */
+	LASSERTF((int)sizeof(struct obd_dqinfo) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct obd_dqinfo));
+	LASSERTF((int)offsetof(struct obd_dqinfo, dqi_bgrace) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqinfo, dqi_bgrace));
+	LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_bgrace) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_bgrace));
+	LASSERTF((int)offsetof(struct obd_dqinfo, dqi_igrace) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqinfo, dqi_igrace));
+	LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_igrace) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_igrace));
+	LASSERTF((int)offsetof(struct obd_dqinfo, dqi_flags) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqinfo, dqi_flags));
+	LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_flags));
+	LASSERTF((int)offsetof(struct obd_dqinfo, dqi_valid) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqinfo, dqi_valid));
+	LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_valid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_valid));
+
+	/* Checks for struct obd_dqblk */
+	LASSERTF((int)sizeof(struct obd_dqblk) == 72, "found %lld\n",
+		 (long long)(int)sizeof(struct obd_dqblk));
+	LASSERTF((int)offsetof(struct obd_dqblk, dqb_bhardlimit) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqblk, dqb_bhardlimit));
+	LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_bhardlimit) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_bhardlimit));
+	LASSERTF((int)offsetof(struct obd_dqblk, dqb_bsoftlimit) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqblk, dqb_bsoftlimit));
+	LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_bsoftlimit) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_bsoftlimit));
+	LASSERTF((int)offsetof(struct obd_dqblk, dqb_curspace) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqblk, dqb_curspace));
+	LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_curspace) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_curspace));
+	LASSERTF((int)offsetof(struct obd_dqblk, dqb_ihardlimit) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqblk, dqb_ihardlimit));
+	LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_ihardlimit) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_ihardlimit));
+	LASSERTF((int)offsetof(struct obd_dqblk, dqb_isoftlimit) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqblk, dqb_isoftlimit));
+	LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_isoftlimit) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_isoftlimit));
+	LASSERTF((int)offsetof(struct obd_dqblk, dqb_curinodes) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqblk, dqb_curinodes));
+	LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_curinodes) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_curinodes));
+	LASSERTF((int)offsetof(struct obd_dqblk, dqb_btime) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqblk, dqb_btime));
+	LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_btime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_btime));
+	LASSERTF((int)offsetof(struct obd_dqblk, dqb_itime) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqblk, dqb_itime));
+	LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_itime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_itime));
+	LASSERTF((int)offsetof(struct obd_dqblk, dqb_valid) == 64, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqblk, dqb_valid));
+	LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_valid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_valid));
+	LASSERTF((int)offsetof(struct obd_dqblk, dqb_padding) == 68, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqblk, dqb_padding));
+	LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_padding));
+	LASSERTF(Q_QUOTACHECK == 0x800100, "found 0x%.8x\n",
+		 Q_QUOTACHECK);
+	LASSERTF(Q_INITQUOTA == 0x800101, "found 0x%.8x\n",
+		 Q_INITQUOTA);
+	LASSERTF(Q_GETOINFO == 0x800102, "found 0x%.8x\n",
+		 Q_GETOINFO);
+	LASSERTF(Q_GETOQUOTA == 0x800103, "found 0x%.8x\n",
+		 Q_GETOQUOTA);
+	LASSERTF(Q_FINVALIDATE == 0x800104, "found 0x%.8x\n",
+		 Q_FINVALIDATE);
+#ifdef HAVE_SERVER_SUPPORT
+
+	/* Checks for struct lquota_acct_rec */
+	LASSERTF((int)sizeof(struct lquota_acct_rec) == 16, "found %lld\n",
+		 (long long)(int)sizeof(struct lquota_acct_rec));
+	LASSERTF((int)offsetof(struct lquota_acct_rec, bspace) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_acct_rec, bspace));
+	LASSERTF((int)sizeof(((struct lquota_acct_rec *)0)->bspace) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_acct_rec *)0)->bspace));
+	LASSERTF((int)offsetof(struct lquota_acct_rec, ispace) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_acct_rec, ispace));
+	LASSERTF((int)sizeof(((struct lquota_acct_rec *)0)->ispace) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_acct_rec *)0)->ispace));
+
+	/* Checks for struct lquota_glb_rec */
+	LASSERTF((int)sizeof(struct lquota_glb_rec) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct lquota_glb_rec));
+	LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_hardlimit) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_glb_rec, qbr_hardlimit));
+	LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_hardlimit) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_hardlimit));
+	LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_softlimit) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_glb_rec, qbr_softlimit));
+	LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_softlimit) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_softlimit));
+	LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_time) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_glb_rec, qbr_time));
+	LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_time) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_time));
+	LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_granted) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_glb_rec, qbr_granted));
+	LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_granted) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_granted));
+
+	/* Checks for struct lquota_slv_rec */
+	LASSERTF((int)sizeof(struct lquota_slv_rec) == 8, "found %lld\n",
+		 (long long)(int)sizeof(struct lquota_slv_rec));
+	LASSERTF((int)offsetof(struct lquota_slv_rec, qsr_granted) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_slv_rec, qsr_granted));
+	LASSERTF((int)sizeof(((struct lquota_slv_rec *)0)->qsr_granted) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_slv_rec *)0)->qsr_granted));
+
+	/* Checks for struct idx_info */
+	LASSERTF((int)sizeof(struct idx_info) == 80, "found %lld\n",
+		 (long long)(int)sizeof(struct idx_info));
+	LASSERTF((int)offsetof(struct idx_info, ii_magic) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_magic));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_magic) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_magic));
+	LASSERTF((int)offsetof(struct idx_info, ii_flags) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_flags));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_flags));
+	LASSERTF((int)offsetof(struct idx_info, ii_count) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_count));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_count) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_count));
+	LASSERTF((int)offsetof(struct idx_info, ii_pad0) == 10, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_pad0));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad0) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_pad0));
+	LASSERTF((int)offsetof(struct idx_info, ii_attrs) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_attrs));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_attrs) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_attrs));
+	LASSERTF((int)offsetof(struct idx_info, ii_fid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_fid));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_fid));
+	LASSERTF((int)offsetof(struct idx_info, ii_version) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_version));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_version) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_version));
+	LASSERTF((int)offsetof(struct idx_info, ii_hash_start) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_hash_start));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_hash_start) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_hash_start));
+	LASSERTF((int)offsetof(struct idx_info, ii_hash_end) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_hash_end));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_hash_end) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_hash_end));
+	LASSERTF((int)offsetof(struct idx_info, ii_keysize) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_keysize));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_keysize) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_keysize));
+	LASSERTF((int)offsetof(struct idx_info, ii_recsize) == 58, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_recsize));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_recsize) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_recsize));
+	LASSERTF((int)offsetof(struct idx_info, ii_pad1) == 60, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_pad1));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_pad1));
+	LASSERTF((int)offsetof(struct idx_info, ii_pad2) == 64, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_pad2));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_pad2));
+	LASSERTF((int)offsetof(struct idx_info, ii_pad3) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_pad3));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad3) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_pad3));
+	BUILD_BUG_ON(IDX_INFO_MAGIC != 0x3D37CC37);
+
+	/* Checks for struct lu_idxpage */
+	LASSERTF((int)sizeof(struct lu_idxpage) == 16, "found %lld\n",
+		 (long long)(int)sizeof(struct lu_idxpage));
+	LASSERTF((int)offsetof(struct lu_idxpage, lip_magic) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_idxpage, lip_magic));
+	LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_magic) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_magic));
+	LASSERTF((int)offsetof(struct lu_idxpage, lip_flags) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_idxpage, lip_flags));
+	LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_flags) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_flags));
+	LASSERTF((int)offsetof(struct lu_idxpage, lip_nr) == 6, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_idxpage, lip_nr));
+	LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_nr) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_nr));
+	LASSERTF((int)offsetof(struct lu_idxpage, lip_pad0) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_idxpage, lip_pad0));
+	LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_pad0) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_pad0));
+	BUILD_BUG_ON(LIP_MAGIC != 0x8A6D6B6C);
+	BUILD_BUG_ON(LIP_HDR_SIZE != (__builtin_offsetof (struct lu_idxpage, lip_entries)));
+	BUILD_BUG_ON(II_FL_NOHASH != 0x00000001);
+	BUILD_BUG_ON(II_FL_VARKEY != 0x00000002);
+	BUILD_BUG_ON(II_FL_VARREC != 0x00000004);
+	BUILD_BUG_ON(II_FL_NONUNQ != 0x00000008);
+	BUILD_BUG_ON(II_FL_NOKEY != 0x00000010);
+#endif /* HAVE_SERVER_SUPPORT */
+
+	/* Checks for struct niobuf_remote */
+	LASSERTF((int)sizeof(struct niobuf_remote) == 16, "found %lld\n",
+		 (long long)(int)sizeof(struct niobuf_remote));
+	LASSERTF((int)offsetof(struct niobuf_remote, rnb_offset) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct niobuf_remote, rnb_offset));
+	LASSERTF((int)sizeof(((struct niobuf_remote *)0)->rnb_offset) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct niobuf_remote *)0)->rnb_offset));
+	LASSERTF((int)offsetof(struct niobuf_remote, rnb_len) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct niobuf_remote, rnb_len));
+	LASSERTF((int)sizeof(((struct niobuf_remote *)0)->rnb_len) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct niobuf_remote *)0)->rnb_len));
+	LASSERTF((int)offsetof(struct niobuf_remote, rnb_flags) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct niobuf_remote, rnb_flags));
+	LASSERTF((int)sizeof(((struct niobuf_remote *)0)->rnb_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct niobuf_remote *)0)->rnb_flags));
+	LASSERTF(OBD_BRW_READ == 0x01, "found 0x%.8x\n",
+		 OBD_BRW_READ);
+	LASSERTF(OBD_BRW_WRITE == 0x02, "found 0x%.8x\n",
+		 OBD_BRW_WRITE);
+	LASSERTF(OBD_BRW_NDELAY == 0x04, "found 0x%.8x\n",
+		 OBD_BRW_NDELAY);
+	LASSERTF(OBD_BRW_SYNC == 0x08, "found 0x%.8x\n",
+		 OBD_BRW_SYNC);
+	LASSERTF(OBD_BRW_CHECK == 0x10, "found 0x%.8x\n",
+		 OBD_BRW_CHECK);
+	LASSERTF(OBD_BRW_FROM_GRANT == 0x20, "found 0x%.8x\n",
+		 OBD_BRW_FROM_GRANT);
+	LASSERTF(OBD_BRW_GRANTED == 0x40, "found 0x%.8x\n",
+		 OBD_BRW_GRANTED);
+	LASSERTF(OBD_BRW_NOCACHE == 0x80, "found 0x%.8x\n",
+		 OBD_BRW_NOCACHE);
+	LASSERTF(OBD_BRW_NOQUOTA == 0x100, "found 0x%.8x\n",
+		 OBD_BRW_NOQUOTA);
+	LASSERTF(OBD_BRW_SRVLOCK == 0x200, "found 0x%.8x\n",
+		 OBD_BRW_SRVLOCK);
+	LASSERTF(OBD_BRW_ASYNC == 0x400, "found 0x%.8x\n",
+		 OBD_BRW_ASYNC);
+	LASSERTF(OBD_BRW_MEMALLOC == 0x800, "found 0x%.8x\n",
+		 OBD_BRW_MEMALLOC);
+	LASSERTF(OBD_BRW_OVER_USRQUOTA == 0x1000, "found 0x%.8x\n",
+		 OBD_BRW_OVER_USRQUOTA);
+	LASSERTF(OBD_BRW_OVER_GRPQUOTA == 0x2000, "found 0x%.8x\n",
+		 OBD_BRW_OVER_GRPQUOTA);
+	LASSERTF(OBD_BRW_SOFT_SYNC == 0x4000, "found 0x%.8x\n",
+		 OBD_BRW_SOFT_SYNC);
+	LASSERTF(OBD_BRW_OVER_PRJQUOTA == 0x8000, "found 0x%.8x\n",
+		 OBD_BRW_OVER_PRJQUOTA);
+	LASSERTF(OBD_BRW_RDMA_ONLY == 0x20000, "found 0x%.8x\n",
+		 OBD_BRW_RDMA_ONLY);
+	LASSERTF(OBD_BRW_SYS_RESOURCE == 0x40000, "found 0x%.8x\n",
+		 OBD_BRW_SYS_RESOURCE);
+
+	/* Checks for struct ost_body */
+	LASSERTF((int)sizeof(struct ost_body) == 208, "found %lld\n",
+		 (long long)(int)sizeof(struct ost_body));
+	LASSERTF((int)offsetof(struct ost_body, oa) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_body, oa));
+	LASSERTF((int)sizeof(((struct ost_body *)0)->oa) == 208, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_body *)0)->oa));
+
+	/* Checks for struct ll_fid */
+	LASSERTF((int)sizeof(struct ll_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(struct ll_fid));
+	LASSERTF((int)offsetof(struct ll_fid, id) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_fid, id));
+	LASSERTF((int)sizeof(((struct ll_fid *)0)->id) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_fid *)0)->id));
+	LASSERTF((int)offsetof(struct ll_fid, generation) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_fid, generation));
+	LASSERTF((int)sizeof(((struct ll_fid *)0)->generation) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_fid *)0)->generation));
+	LASSERTF((int)offsetof(struct ll_fid, f_type) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_fid, f_type));
+	LASSERTF((int)sizeof(((struct ll_fid *)0)->f_type) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_fid *)0)->f_type));
+
+	LASSERTF(MDS_CROSS_REF == 0x00000002UL, "found 0x%.8xUL\n",
+		 (unsigned)MDS_CROSS_REF);
+	LASSERTF(MDS_PERM_BYPASS == 0x00000008UL, "found 0x%.8xUL\n",
+		 (unsigned)MDS_PERM_BYPASS);
+	LASSERTF(MDS_QUOTA_IGNORE == 0x00000020UL, "found 0x%.8xUL\n",
+		 (unsigned)MDS_QUOTA_IGNORE);
+	LASSERTF(MDS_KEEP_ORPHAN == 0x00000080UL, "found 0x%.8xUL\n",
+		 (unsigned)MDS_KEEP_ORPHAN);
+	LASSERTF(MDS_RECOV_OPEN == 0x00000100UL, "found 0x%.8xUL\n",
+		 (unsigned)MDS_RECOV_OPEN);
+	LASSERTF(MDS_DATA_MODIFIED == 0x00000200UL, "found 0x%.8xUL\n",
+		 (unsigned)MDS_DATA_MODIFIED);
+	LASSERTF(MDS_CREATE_VOLATILE == 0x00000400UL, "found 0x%.8xUL\n",
+		 (unsigned)MDS_CREATE_VOLATILE);
+	LASSERTF(MDS_OWNEROVERRIDE == 0x00000800UL, "found 0x%.8xUL\n",
+		 (unsigned)MDS_OWNEROVERRIDE);
+	LASSERTF(MDS_HSM_RELEASE == 0x00001000UL, "found 0x%.8xUL\n",
+		 (unsigned)MDS_HSM_RELEASE);
+	LASSERTF(MDS_CLOSE_MIGRATE == 0x00002000UL, "found 0x%.8xUL\n",
+		 (unsigned)MDS_CLOSE_MIGRATE);
+	LASSERTF(MDS_CLOSE_LAYOUT_SWAP == 0x00004000UL, "found 0x%.8xUL\n",
+		 (unsigned)MDS_CLOSE_LAYOUT_SWAP);
+	LASSERTF(MDS_CLOSE_LAYOUT_MERGE == 0x00008000UL, "found 0x%.8xUL\n",
+		 (unsigned)MDS_CLOSE_LAYOUT_MERGE);
+	LASSERTF(MDS_CLOSE_RESYNC_DONE == 0x00010000UL, "found 0x%.8xUL\n",
+		 (unsigned)MDS_CLOSE_RESYNC_DONE);
+	LASSERTF(MDS_CLOSE_LAYOUT_SPLIT == 0x00020000UL, "found 0x%.8xUL\n",
+		 (unsigned)MDS_CLOSE_LAYOUT_SPLIT);
+	LASSERTF(MDS_TRUNC_KEEP_LEASE == 0x00040000UL, "found 0x%.8xUL\n",
+		 (unsigned)MDS_TRUNC_KEEP_LEASE);
+	LASSERTF(MDS_PCC_ATTACH == 0x00080000UL, "found 0x%.8xUL\n",
+		 (unsigned)MDS_PCC_ATTACH);
+	LASSERTF(MDS_CLOSE_UPDATE_TIMES == 0x00100000UL, "found 0x%.8xUL\n",
+		 (unsigned)MDS_CLOSE_UPDATE_TIMES);
+	LASSERTF(MDS_SETSTRIPE_CREATE == 0x00200000UL, "found 0x%.8xUL\n",
+		 (unsigned)MDS_SETSTRIPE_CREATE);
+	LASSERTF(MDS_FID_OP == 0x00400000UL, "found 0x%.8xUL\n",
+		 (unsigned)MDS_FID_OP);
+	LASSERTF(MDS_MIGRATE_NSONLY == 0x00800000UL, "found 0x%.8xUL\n",
+		 (unsigned)MDS_MIGRATE_NSONLY);
+
+	/* Checks for struct mdt_body */
+	LASSERTF((int)sizeof(struct mdt_body) == 216, "found %lld\n",
+		 (long long)(int)sizeof(struct mdt_body));
+	LASSERTF((int)offsetof(struct mdt_body, mbo_fid1) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, mbo_fid1));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_fid1) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->mbo_fid1));
+	LASSERTF((int)offsetof(struct mdt_body, mbo_fid2) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, mbo_fid2));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_fid2) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->mbo_fid2));
+	LASSERTF((int)offsetof(struct mdt_body, mbo_open_handle) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, mbo_open_handle));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_open_handle) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->mbo_open_handle));
+	LASSERTF((int)offsetof(struct mdt_body, mbo_valid) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, mbo_valid));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_valid) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->mbo_valid));
+	LASSERTF((int)offsetof(struct mdt_body, mbo_size) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, mbo_size));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_size) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->mbo_size));
+	LASSERTF((int)offsetof(struct mdt_body, mbo_mtime) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, mbo_mtime));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_mtime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->mbo_mtime));
+	LASSERTF((int)offsetof(struct mdt_body, mbo_atime) == 64, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, mbo_atime));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_atime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->mbo_atime));
+	LASSERTF((int)offsetof(struct mdt_body, mbo_ctime) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, mbo_ctime));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_ctime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->mbo_ctime));
+	LASSERTF((int)offsetof(struct mdt_body, mbo_blocks) == 80, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, mbo_blocks));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_blocks) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->mbo_blocks));
+	LASSERTF((int)offsetof(struct mdt_body, mbo_version) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, mbo_version));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_version) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->mbo_version));
+	LASSERTF((int)offsetof(struct mdt_body, mbo_t_state) == 96, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, mbo_t_state));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_t_state) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->mbo_t_state));
+	LASSERTF((int)offsetof(struct mdt_body, mbo_fsuid) == 104, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, mbo_fsuid));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_fsuid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->mbo_fsuid));
+	LASSERTF((int)offsetof(struct mdt_body, mbo_fsgid) == 108, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, mbo_fsgid));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_fsgid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->mbo_fsgid));
+	LASSERTF((int)offsetof(struct mdt_body, mbo_capability) == 112, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, mbo_capability));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_capability) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->mbo_capability));
+	LASSERTF((int)offsetof(struct mdt_body, mbo_mode) == 116, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, mbo_mode));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_mode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->mbo_mode));
+	LASSERTF((int)offsetof(struct mdt_body, mbo_uid) == 120, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, mbo_uid));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_uid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->mbo_uid));
+	LASSERTF((int)offsetof(struct mdt_body, mbo_gid) == 124, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, mbo_gid));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_gid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->mbo_gid));
+	LASSERTF((int)offsetof(struct mdt_body, mbo_flags) == 128, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, mbo_flags));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->mbo_flags));
+	LASSERTF((int)offsetof(struct mdt_body, mbo_rdev) == 132, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, mbo_rdev));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_rdev) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->mbo_rdev));
+	LASSERTF((int)offsetof(struct mdt_body, mbo_nlink) == 136, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, mbo_nlink));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_nlink) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->mbo_nlink));
+	LASSERTF((int)offsetof(struct mdt_body, mbo_layout_gen) == 140, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, mbo_layout_gen));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_layout_gen) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->mbo_layout_gen));
+	LASSERTF((int)offsetof(struct mdt_body, mbo_suppgid) == 144, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, mbo_suppgid));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_suppgid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->mbo_suppgid));
+	LASSERTF((int)offsetof(struct mdt_body, mbo_eadatasize) == 148, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, mbo_eadatasize));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_eadatasize) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->mbo_eadatasize));
+	LASSERTF((int)offsetof(struct mdt_body, mbo_aclsize) == 152, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, mbo_aclsize));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_aclsize) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->mbo_aclsize));
+	LASSERTF((int)offsetof(struct mdt_body, mbo_max_mdsize) == 156, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, mbo_max_mdsize));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_max_mdsize) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->mbo_max_mdsize));
+	LASSERTF((int)offsetof(struct mdt_body, mbo_unused3) == 160, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, mbo_unused3));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_unused3) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->mbo_unused3));
+	LASSERTF((int)offsetof(struct mdt_body, mbo_uid_h) == 164, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, mbo_uid_h));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_uid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->mbo_uid_h));
+	LASSERTF((int)offsetof(struct mdt_body, mbo_gid_h) == 168, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, mbo_gid_h));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_gid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->mbo_gid_h));
+	LASSERTF((int)offsetof(struct mdt_body, mbo_projid) == 172, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, mbo_projid));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_projid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->mbo_projid));
+	LASSERTF((int)offsetof(struct mdt_body, mbo_dom_size) == 176, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, mbo_dom_size));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_dom_size) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->mbo_dom_size));
+	LASSERTF((int)offsetof(struct mdt_body, mbo_dom_blocks) == 184, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, mbo_dom_blocks));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_dom_blocks)
== 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_dom_blocks)); + LASSERTF((int)offsetof(struct mdt_body, mbo_btime) == 192, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_btime)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_btime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_btime)); + LASSERTF((int)offsetof(struct mdt_body, mbo_padding_9) == 200, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_padding_9)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_padding_9) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_padding_9)); + LASSERTF((int)offsetof(struct mdt_body, mbo_padding_10) == 208, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_padding_10)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_padding_10) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_padding_10)); + LASSERTF(MDS_FMODE_CLOSED == 000000000000UL, "found 0%.11oUL\n", + MDS_FMODE_CLOSED); + LASSERTF(MDS_FMODE_EXEC == 000000000004UL, "found 0%.11oUL\n", + MDS_FMODE_EXEC); + LASSERTF(MDS_OPEN_CREATED == 000000000010UL, "found 0%.11oUL\n", + MDS_OPEN_CREATED); + LASSERTF(MDS_OPEN_CREAT == 000000000100UL, "found 0%.11oUL\n", + MDS_OPEN_CREAT); + LASSERTF(MDS_OPEN_EXCL == 000000000200UL, "found 0%.11oUL\n", + MDS_OPEN_EXCL); + LASSERTF(MDS_OPEN_TRUNC == 000000001000UL, "found 0%.11oUL\n", + MDS_OPEN_TRUNC); + LASSERTF(MDS_OPEN_APPEND == 000000002000UL, "found 0%.11oUL\n", + MDS_OPEN_APPEND); + LASSERTF(MDS_OPEN_SYNC == 000000010000UL, "found 0%.11oUL\n", + MDS_OPEN_SYNC); + LASSERTF(MDS_OPEN_DIRECTORY == 000000200000UL, "found 0%.11oUL\n", + MDS_OPEN_DIRECTORY); + LASSERTF(MDS_OPEN_BY_FID == 000040000000UL, "found 0%.11oUL\n", + MDS_OPEN_BY_FID); + LASSERTF(MDS_OPEN_DELAY_CREATE == 000100000000UL, "found 0%.11oUL\n", + MDS_OPEN_DELAY_CREATE); + LASSERTF(MDS_OPEN_OWNEROVERRIDE == 000200000000UL, "found 0%.11oUL\n", + MDS_OPEN_OWNEROVERRIDE); + LASSERTF(MDS_OPEN_JOIN_FILE == 000400000000UL, "found 0%.11oUL\n", + MDS_OPEN_JOIN_FILE); + LASSERTF(MDS_OPEN_LOCK == 004000000000UL, "found 0%.11oUL\n", + MDS_OPEN_LOCK); + LASSERTF(MDS_OPEN_HAS_EA == 010000000000UL, "found 0%.11oUL\n", + MDS_OPEN_HAS_EA); + LASSERTF(MDS_OPEN_HAS_OBJS == 020000000000UL, "found 0%.11oUL\n", + MDS_OPEN_HAS_OBJS); + LASSERTF(MDS_OPEN_NORESTORE == 00000000000100000000000ULL, "found 0%.22lloULL\n", + (long long)MDS_OPEN_NORESTORE); + LASSERTF(MDS_OPEN_NEWSTRIPE == 00000000000200000000000ULL, "found 0%.22lloULL\n", + (long long)MDS_OPEN_NEWSTRIPE); + LASSERTF(MDS_OPEN_VOLATILE == 00000000000400000000000ULL, "found 0%.22lloULL\n", + (long long)MDS_OPEN_VOLATILE); + LASSERTF(MDS_OPEN_LEASE == 00000000001000000000000ULL, "found 0%.22lloULL\n", + (long long)MDS_OPEN_LEASE); + LASSERTF(MDS_OPEN_RESYNC == 00000000004000000000000ULL, "found 0%.22lloULL\n", + (long long)MDS_OPEN_RESYNC); + LASSERTF(MDS_OPEN_PCC == 00000000010000000000000ULL, "found 0%.22lloULL\n", + (long long)MDS_OPEN_PCC); + LASSERTF(MDS_OPEN_DEFAULT_LMV == 00000000040000000000000ULL, "found 0%.22lloULL\n", + (long long)MDS_OPEN_DEFAULT_LMV); + LASSERTF(LUSTRE_SYNC_FL == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_SYNC_FL); + LASSERTF(LUSTRE_IMMUTABLE_FL == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_IMMUTABLE_FL); + LASSERTF(LUSTRE_APPEND_FL == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_APPEND_FL); + LASSERTF(LUSTRE_NODUMP_FL == 0x00000040UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_NODUMP_FL); + 
LASSERTF(LUSTRE_NOATIME_FL == 0x00000080UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_NOATIME_FL); + LASSERTF(LUSTRE_INDEX_FL == 0x00001000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_INDEX_FL); +#ifdef HAVE_SERVER_SUPPORT + LASSERTF(LUSTRE_ORPHAN_FL == 0x00002000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_ORPHAN_FL); +#endif /* HAVE_SERVER_SUPPORT */ + LASSERTF(LUSTRE_DIRSYNC_FL == 0x00010000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_DIRSYNC_FL); + LASSERTF(LUSTRE_TOPDIR_FL == 0x00020000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_TOPDIR_FL); + LASSERTF(LUSTRE_INLINE_DATA_FL == 0x10000000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_INLINE_DATA_FL); +#ifdef HAVE_SERVER_SUPPORT + LASSERTF(LUSTRE_SET_SYNC_FL == 0x00040000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_SET_SYNC_FL); +#endif /* HAVE_SERVER_SUPPORT */ + LASSERTF(LUSTRE_ENCRYPT_FL == 0x00800000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_ENCRYPT_FL); + LASSERTF(MDS_INODELOCK_LOOKUP == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)MDS_INODELOCK_LOOKUP); + LASSERTF(MDS_INODELOCK_UPDATE == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)MDS_INODELOCK_UPDATE); + LASSERTF(MDS_INODELOCK_OPEN == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)MDS_INODELOCK_OPEN); + LASSERTF(MDS_INODELOCK_LAYOUT == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)MDS_INODELOCK_LAYOUT); + LASSERTF(MDS_INODELOCK_PERM == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)MDS_INODELOCK_PERM); + LASSERTF(MDS_INODELOCK_XATTR == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)MDS_INODELOCK_XATTR); + LASSERTF(MDS_INODELOCK_DOM == 0x00000040UL, "found 0x%.8xUL\n", + (unsigned)MDS_INODELOCK_DOM); + + /* Checks for struct mdt_ioepoch */ + LASSERTF((int)sizeof(struct mdt_ioepoch) == 24, "found %lld\n", + (long long)(int)sizeof(struct mdt_ioepoch)); + LASSERTF((int)offsetof(struct mdt_ioepoch, mio_open_handle) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_ioepoch, mio_open_handle)); + LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->mio_open_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_ioepoch *)0)->mio_open_handle)); + LASSERTF((int)offsetof(struct mdt_ioepoch, mio_unused1) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_ioepoch, mio_unused1)); + LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->mio_unused1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_ioepoch *)0)->mio_unused1)); + LASSERTF((int)offsetof(struct mdt_ioepoch, mio_unused2) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_ioepoch, mio_unused2)); + LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->mio_unused2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_ioepoch *)0)->mio_unused2)); + LASSERTF((int)offsetof(struct mdt_ioepoch, mio_padding) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_ioepoch, mio_padding)); + LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->mio_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_ioepoch *)0)->mio_padding)); + + /* Checks for struct mdt_rec_setattr */ + LASSERTF((int)sizeof(struct mdt_rec_setattr) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_setattr)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct 
mdt_rec_setattr, sa_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_cap)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_suppgid) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_suppgid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_suppgid_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_suppgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_1) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_1)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_1_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1_h)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fid) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_fid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_valid) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_valid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_valid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_valid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_uid) == 64, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_uid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_uid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_uid)); + 
LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_gid) == 68, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_gid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_gid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_gid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_size) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_size)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_size)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_blocks) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_blocks)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_blocks)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_mtime) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_mtime)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_mtime)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_atime) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_atime)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_atime)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_ctime) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_ctime)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_ctime)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_attr_flags) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_attr_flags)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_attr_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_attr_flags)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_mode) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_mode)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_mode)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_bias) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_bias)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_projid) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_projid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_projid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_projid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_4) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_4)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_5) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_5)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_5) == 4, "found 
%lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_5)); + + /* Checks for struct mdt_rec_create */ + LASSERTF((int)sizeof(struct mdt_rec_create) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_create)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_cap)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fid1) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fid1)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create 
*)0)->cr_fid1)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fid2) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fid2)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fid2)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_open_handle_old) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_open_handle_old)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_open_handle_old) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_open_handle_old)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_time) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_time)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_time)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_rdev) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_rdev)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_rdev) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_rdev)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_ioepoch) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_ioepoch)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_ioepoch) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_ioepoch)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_padding_1) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_padding_1)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_padding_1)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_mode) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_mode)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_mode)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_bias) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_bias)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_flags_l) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_flags_l)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_flags_l) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_flags_l)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_flags_h) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_flags_h)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_flags_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_flags_h)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_umask) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_umask)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_umask) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_umask)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_padding_4) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_padding_4) == 4, 
"found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_padding_4)); + + /* Checks for struct mdt_rec_link */ + LASSERTF((int)sizeof(struct mdt_rec_link) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_link)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_cap)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fid1) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fid1)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fid1)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fid2) == 56, "found %lld\n", + 
(long long)(int)offsetof(struct mdt_rec_link, lk_fid2)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fid2)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_time) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_time)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_time)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_1) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_1)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_1)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_2) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_2)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_2)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_3) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_3)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_3)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_4) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_4)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_bias) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_bias)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_5) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_5)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_5) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_5)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_6) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_6)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_6) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_6)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_7) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_7)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_7) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_7)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_8) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_8)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_8)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_9) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_9)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_9) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_9)); + + /* Checks for struct mdt_rec_unlink */ + 
LASSERTF((int)sizeof(struct mdt_rec_unlink) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_unlink)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_cap)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fid1) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fid1)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid1)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fid2) == 56, "found %lld\n", + (long long)(int)offsetof(struct 
mdt_rec_unlink, ul_fid2)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid2)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_time) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_time)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_time)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_2) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_2)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_2)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_3) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_3)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_3)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_4) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_4)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_5) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_5)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_5) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_5)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_bias) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_bias)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_mode) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_mode)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_mode)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_6) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_6)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_6) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_6)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_7) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_7)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_7) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_7)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_8) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_8)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_8)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_9) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_9)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_9) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_9)); + + /* 
Checks for struct mdt_rec_rename */ + LASSERTF((int)sizeof(struct mdt_rec_rename) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_rename)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_cap)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fid1) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fid1)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fid1)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fid2) == 56, "found %lld\n", + 
(long long)(int)offsetof(struct mdt_rec_rename, rn_fid2)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fid2)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_time) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_time)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_time)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_1) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_1)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_1)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_2) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_2)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_2)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_3) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_3)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_3)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_4) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_4)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_bias) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_bias)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_mode) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_mode)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_mode)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_5) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_5)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_5) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_5)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_6) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_6)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_6) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_6)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_7) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_7)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_7) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_7)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_8) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_8)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct 
mdt_rec_rename *)0)->rn_padding_8)); + + /* Checks for struct mdt_rec_setxattr */ + LASSERTF((int)sizeof(struct mdt_rec_setxattr) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_setxattr)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_cap)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fid) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fid)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fid) == 16, "found %lld\n", + (long 
long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fid));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_1) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_1));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_1) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_1));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_2) == 64, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_2));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_2));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_3) == 68, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_3));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_3) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_3));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_valid) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_valid));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_valid) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_valid));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_time) == 80, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_time));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_time) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_time));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_5) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_5));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_5) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_5));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_6) == 96, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_6));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_6) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_6));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_7) == 104, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_7));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_7) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_7));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_size) == 112, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_size));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_size) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_size));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_flags) == 116, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_flags));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_flags));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_8) == 120, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_8));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_8) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_8));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_9) == 124, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_9));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_9) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_9));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_10) == 128, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_10));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_10) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_10));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_11) == 132, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_11));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11));
+
+	/* Checks for struct mdt_rec_resync */
+	LASSERTF((int)sizeof(struct mdt_rec_resync) == 136, "found %lld\n",
+		 (long long)(int)sizeof(struct mdt_rec_resync));
+	LASSERTF((int)offsetof(struct mdt_rec_resync, rs_opcode) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_resync, rs_opcode));
+	LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_opcode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_opcode));
+	LASSERTF((int)offsetof(struct mdt_rec_resync, rs_cap) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_resync, rs_cap));
+	LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_cap) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_cap));
+	LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsuid) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_resync, rs_fsuid));
+	LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid));
+	LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsuid_h) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_resync, rs_fsuid_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid_h));
+	LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsgid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_resync, rs_fsgid));
+	LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid));
+	LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsgid_h) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_resync, rs_fsgid_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid_h));
+	LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid1) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid1));
+	LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1));
+	LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid1_h) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid1_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1_h));
+	LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid2) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid2));
+	LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2));
+	LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid2_h) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid2_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2_h));
+	LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fid) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_resync, rs_fid));
+	LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fid));
+	LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding0) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_resync, rs_padding0));
+	LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding0) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding0));
+	LASSERTF((int)offsetof(struct mdt_rec_resync, rs_lease_handle) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_resync, rs_lease_handle));
+	LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_lease_handle) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_lease_handle));
+	LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding1) == 80, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_resync, rs_padding1));
+	LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding1) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding1));
+	LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding2) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_resync, rs_padding2));
+	LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding2));
+	LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding3) == 96, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_resync, rs_padding3));
+	LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding3) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding3));
+	LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding4) == 104, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_resync, rs_padding4));
+	LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding4) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding4));
+	LASSERTF((int)offsetof(struct mdt_rec_resync, rs_bias) == 112, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_resync, rs_bias));
+	LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_bias) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_bias));
+	LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding5) == 116, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_resync, rs_padding5));
+	LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding5) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding5));
+	LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding6) == 120, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_resync, rs_padding6));
+	LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding6) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding6));
+	LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding7) == 124, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_resync, rs_padding7));
+	LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding7) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding7));
+	LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding8) == 128, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_resync, rs_padding8));
+	LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding8) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding8));
+	LASSERTF((int)offsetof(struct mdt_rec_resync, rs_mirror_id) == 132, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_resync, rs_mirror_id));
+	LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_mirror_id) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_mirror_id));
+	LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding9) == 134, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_resync, rs_padding9));
+	LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding9) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding9));
+
+	/* Checks for struct mdt_rec_reint */
+	LASSERTF((int)sizeof(struct mdt_rec_reint) == 136, "found %lld\n",
+		 (long long)(int)sizeof(struct mdt_rec_reint));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_opcode) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_opcode));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_opcode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_opcode));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_cap) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_cap));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_cap) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_cap));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsuid) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_fsuid));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsuid_h) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_fsuid_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid_h));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsgid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_fsgid));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsgid_h) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_fsgid_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid_h));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid1) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid1));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid1_h) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid1_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1_h));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid2) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid2));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid2_h) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid2_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2_h));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fid1) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_fid1));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fid1) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fid1));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fid2) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_fid2));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fid2) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fid2));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_mtime) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_mtime));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_mtime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_mtime));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_atime) == 80, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_atime));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_atime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_atime));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_ctime) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_ctime));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_ctime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_ctime));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_size) == 96, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_size));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_size) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_size));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_blocks) == 104, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_blocks));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_blocks) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_blocks));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_bias) == 112, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_bias));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_bias) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_bias));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_mode) == 116, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_mode));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_mode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_mode));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_flags) == 120, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_flags));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_flags));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_flags_h) == 124, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_flags_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_flags_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_flags_h));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_umask) == 128, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_umask));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_umask) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_umask));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_mirror_id) == 132, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_mirror_id));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_mirror_id) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_mirror_id));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_padding_4) == 134, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_padding_4));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_padding_4) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_padding_4));
+
+	/* Checks for struct lmv_desc */
+	LASSERTF((int)sizeof(struct lmv_desc) == 88, "found %lld\n",
+		 (long long)(int)sizeof(struct lmv_desc));
+	LASSERTF((int)offsetof(struct lmv_desc, ld_tgt_count) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_desc, ld_tgt_count));
+	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_tgt_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_tgt_count));
+	LASSERTF((int)offsetof(struct lmv_desc, ld_active_tgt_count) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_desc, ld_active_tgt_count));
+	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_active_tgt_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_active_tgt_count));
+	LASSERTF((int)offsetof(struct lmv_desc, ld_default_stripe_count) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_desc, ld_default_stripe_count));
+	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_default_stripe_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_default_stripe_count));
+	LASSERTF((int)offsetof(struct lmv_desc, ld_pattern) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_desc, ld_pattern));
+	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_pattern) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_pattern));
+	LASSERTF((int)offsetof(struct lmv_desc, ld_default_hash_size) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_desc, ld_default_hash_size));
+	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_default_hash_size) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_default_hash_size));
+	LASSERTF((int)offsetof(struct lmv_desc, ld_padding_1) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_desc, ld_padding_1));
+	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_1) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_1));
+	LASSERTF((int)offsetof(struct lmv_desc, ld_padding_2) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_desc, ld_padding_2));
+	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_2));
+	LASSERTF((int)offsetof(struct lmv_desc, ld_qos_maxage) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_desc, ld_qos_maxage));
+	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_qos_maxage) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_qos_maxage));
+	LASSERTF((int)offsetof(struct lmv_desc, ld_padding_3) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_desc, ld_padding_3));
+	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_3) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_3));
+	LASSERTF((int)offsetof(struct lmv_desc, ld_padding_4) == 44, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_desc, ld_padding_4));
+	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_4) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_4));
+	LASSERTF((int)offsetof(struct lmv_desc, ld_uuid) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_desc, ld_uuid));
+	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_uuid) == 40, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_uuid));
+
+	/* Checks for struct lov_desc */
+	LASSERTF((int)sizeof(struct lov_desc) == 88, "found %lld\n",
+		 (long long)(int)sizeof(struct lov_desc));
+	LASSERTF((int)offsetof(struct lov_desc, ld_tgt_count) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_desc, ld_tgt_count));
+	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_tgt_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_tgt_count));
+	LASSERTF((int)offsetof(struct lov_desc, ld_active_tgt_count) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_desc, ld_active_tgt_count));
+	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_active_tgt_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_active_tgt_count));
+	LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_count) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_desc, ld_default_stripe_count));
+	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_count));
+	LASSERTF((int)offsetof(struct lov_desc, ld_pattern) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_desc, ld_pattern));
+	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_pattern) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_pattern));
+	LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_size) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_desc, ld_default_stripe_size));
+	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_size) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_size));
+	LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_offset) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_desc, ld_default_stripe_offset));
+	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset));
+	LASSERTF((int)offsetof(struct lov_desc, ld_padding_0) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_desc, ld_padding_0));
+	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_0) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_0));
+	LASSERTF((int)offsetof(struct lov_desc, ld_qos_maxage) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_desc, ld_qos_maxage));
+	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_qos_maxage) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_qos_maxage));
+	LASSERTF((int)offsetof(struct lov_desc, ld_padding_1) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_desc, ld_padding_1));
+	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_1));
+	LASSERTF((int)offsetof(struct lov_desc, ld_padding_2) == 44, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_desc, ld_padding_2));
+	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_2));
+	LASSERTF((int)offsetof(struct lov_desc, ld_uuid) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_desc, ld_uuid));
+	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_uuid) == 40, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_uuid));
+	BUILD_BUG_ON(LOV_DESC_MAGIC != 0xB0CCDE5C);
+
+	/* Checks for struct ldlm_res_id */
+	LASSERTF((int)sizeof(struct ldlm_res_id) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct ldlm_res_id));
+	BUILD_BUG_ON(RES_NAME_SIZE != 4);
+	LASSERTF((int)offsetof(struct ldlm_res_id, name[4]) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_res_id, name[4]));
+	LASSERTF((int)sizeof(((struct ldlm_res_id *)0)->name[4]) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_res_id *)0)->name[4]));
+
+	/* Checks for struct ldlm_extent */
+	LASSERTF((int)sizeof(struct ldlm_extent) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct ldlm_extent));
+	LASSERTF((int)offsetof(struct ldlm_extent, start) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_extent, start));
+	LASSERTF((int)sizeof(((struct ldlm_extent *)0)->start) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_extent *)0)->start));
+	LASSERTF((int)offsetof(struct ldlm_extent, end) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_extent, end));
+	LASSERTF((int)sizeof(((struct ldlm_extent *)0)->end) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_extent *)0)->end));
+	LASSERTF((int)offsetof(struct ldlm_extent, gid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_extent, gid));
+	LASSERTF((int)sizeof(((struct ldlm_extent *)0)->gid) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_extent *)0)->gid));
+
+	/* Checks for struct ldlm_inodebits */
+	LASSERTF((int)sizeof(struct ldlm_inodebits) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct ldlm_inodebits));
+	LASSERTF((int)offsetof(struct ldlm_inodebits, bits) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_inodebits, bits));
+	LASSERTF((int)sizeof(((struct ldlm_inodebits *)0)->bits) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_inodebits *)0)->bits));
+#ifdef HAVE_SERVER_SUPPORT
+	LASSERTF((int)offsetof(struct ldlm_inodebits, try_bits) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_inodebits, try_bits));
+	LASSERTF((int)sizeof(((struct ldlm_inodebits *)0)->try_bits) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_inodebits *)0)->try_bits));
+#else
+	LASSERTF((int)offsetof(struct ldlm_inodebits, cancel_bits) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_inodebits, cancel_bits));
+	LASSERTF((int)sizeof(((struct ldlm_inodebits *)0)->cancel_bits) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_inodebits *)0)->cancel_bits));
+#endif /* HAVE_SERVER_SUPPORT */
+	LASSERTF((int)offsetof(struct ldlm_inodebits, li_gid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_inodebits, li_gid));
+	LASSERTF((int)sizeof(((struct ldlm_inodebits *)0)->li_gid) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_inodebits *)0)->li_gid));
+
+	/* Checks for struct ldlm_flock_wire */
+	LASSERTF((int)sizeof(struct ldlm_flock_wire) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct ldlm_flock_wire));
+	LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_start) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_flock_wire, lfw_start));
+	LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_start) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_start));
+	LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_end) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_flock_wire, lfw_end));
+	LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_end) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_end));
+	LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_owner) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_flock_wire, lfw_owner));
+	LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_owner) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_owner));
+	LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_padding) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_flock_wire, lfw_padding));
+	LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_padding));
+	LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_pid) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_flock_wire, lfw_pid));
+	LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_pid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_pid));
+
+	/* Checks for struct ldlm_intent */
+	LASSERTF((int)sizeof(struct ldlm_intent) == 8, "found %lld\n",
+		 (long long)(int)sizeof(struct ldlm_intent));
+	LASSERTF((int)offsetof(struct ldlm_intent, opc) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_intent, opc));
+	LASSERTF((int)sizeof(((struct ldlm_intent *)0)->opc) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_intent *)0)->opc));
+	BUILD_BUG_ON(IT_OPEN != 0x00000001);
+	BUILD_BUG_ON(IT_CREAT != 0x00000002);
+	BUILD_BUG_ON(IT_READDIR != 0x00000004);
+	BUILD_BUG_ON(IT_GETATTR != 0x00000008);
+	BUILD_BUG_ON(IT_LOOKUP != 0x00000010);
+	BUILD_BUG_ON(IT_GETXATTR != 0x00000080);
+	BUILD_BUG_ON(IT_LAYOUT != 0x00000400);
+	BUILD_BUG_ON(IT_QUOTA_DQACQ != 0x00000800);
+	BUILD_BUG_ON(IT_QUOTA_CONN != 0x00001000);
+	BUILD_BUG_ON(IT_GLIMPSE != 0x00004000);
+	BUILD_BUG_ON(IT_BRW != 0x00008000);
+
+	/* Checks for struct ldlm_resource_desc */
+	LASSERTF((int)sizeof(struct ldlm_resource_desc) == 40, "found %lld\n",
+		 (long long)(int)sizeof(struct ldlm_resource_desc));
+	LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_type) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_resource_desc, lr_type));
+	LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_type) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_type));
+	LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_pad) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_resource_desc, lr_pad));
+	LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_pad) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_pad));
+	LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_name) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_resource_desc, lr_name));
+	LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_name) == 32, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_name));
+
+	/* Checks for struct ldlm_lock_desc */
+	LASSERTF((int)sizeof(struct ldlm_lock_desc) == 80, "found %lld\n",
+		 (long long)(int)sizeof(struct ldlm_lock_desc));
+	LASSERTF((int)offsetof(struct ldlm_lock_desc, l_resource) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_lock_desc, l_resource));
+	LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_resource) == 40, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_resource));
+	LASSERTF((int)offsetof(struct ldlm_lock_desc, l_req_mode) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_lock_desc, l_req_mode));
+	LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_req_mode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_req_mode));
+	LASSERTF((int)offsetof(struct ldlm_lock_desc, l_granted_mode) == 44, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_lock_desc, l_granted_mode));
+	LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_granted_mode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_granted_mode));
+	LASSERTF((int)offsetof(struct ldlm_lock_desc, l_policy_data) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_lock_desc, l_policy_data));
+	LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_policy_data) == 32, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_policy_data));
+
+	/* Checks for struct ldlm_request */
+	LASSERTF((int)sizeof(struct ldlm_request) == 104, "found %lld\n",
+		 (long long)(int)sizeof(struct ldlm_request));
+	LASSERTF((int)offsetof(struct ldlm_request, lock_flags) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_request, lock_flags));
+	LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_request *)0)->lock_flags));
+	LASSERTF((int)offsetof(struct ldlm_request, lock_count) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_request, lock_count));
+	LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_request *)0)->lock_count));
+	LASSERTF((int)offsetof(struct ldlm_request, lock_desc) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_request, lock_desc));
+	LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_desc) == 80, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_request *)0)->lock_desc));
+	LASSERTF((int)offsetof(struct ldlm_request, lock_handle) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_request, lock_handle));
+	LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_handle) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_request *)0)->lock_handle));
+
+	/* Checks for struct ldlm_reply */
+	LASSERTF((int)sizeof(struct ldlm_reply) == 112, "found %lld\n",
+		 (long long)(int)sizeof(struct ldlm_reply));
+	LASSERTF((int)offsetof(struct ldlm_reply, lock_flags) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_reply, lock_flags));
+	LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_flags));
+	LASSERTF((int)offsetof(struct ldlm_reply, lock_padding) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_reply, lock_padding));
+	LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_padding));
+	LASSERTF((int)offsetof(struct ldlm_reply, lock_desc) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_reply, lock_desc));
+	LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_desc) == 80, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_desc));
+	LASSERTF((int)offsetof(struct ldlm_reply, lock_handle) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_reply, lock_handle));
+	LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_handle) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_handle));
+	LASSERTF((int)offsetof(struct ldlm_reply, lock_policy_res1) == 96, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_reply, lock_policy_res1));
+	LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_policy_res1) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_policy_res1));
+	LASSERTF((int)offsetof(struct ldlm_reply, lock_policy_res2) == 104, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_reply, lock_policy_res2));
+	LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_policy_res2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_policy_res2));
+
+	/* Checks for struct ost_lvb_v1 */
+	LASSERTF((int)sizeof(struct ost_lvb_v1) == 40, "found %lld\n",
+		 (long long)(int)sizeof(struct ost_lvb_v1));
+	LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_size) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb_v1, lvb_size));
+	LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_size) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_size));
+	LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_mtime) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb_v1, lvb_mtime));
+	LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_mtime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_mtime));
+	LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_atime) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb_v1, lvb_atime));
+	LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_atime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_atime));
+	LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_ctime) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb_v1, lvb_ctime));
+	LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_ctime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_ctime));
+	LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_blocks) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb_v1, lvb_blocks));
+	LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_blocks) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_blocks));
+
+	/* Checks for struct ost_lvb */
+	LASSERTF((int)sizeof(struct ost_lvb) == 56, "found %lld\n",
+		 (long long)(int)sizeof(struct ost_lvb));
+	LASSERTF((int)offsetof(struct ost_lvb, lvb_size) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb, lvb_size));
+	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_size) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_size));
+	LASSERTF((int)offsetof(struct ost_lvb, lvb_mtime) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb, lvb_mtime));
+	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_mtime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_mtime));
+	LASSERTF((int)offsetof(struct ost_lvb, lvb_atime) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb, lvb_atime));
+	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_atime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_atime));
+	LASSERTF((int)offsetof(struct ost_lvb, lvb_ctime) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb, lvb_ctime));
+	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_ctime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_ctime));
+	LASSERTF((int)offsetof(struct ost_lvb, lvb_blocks) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb, lvb_blocks));
+	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_blocks) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_blocks));
+	LASSERTF((int)offsetof(struct ost_lvb, lvb_mtime_ns) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb, lvb_mtime_ns));
+	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_mtime_ns) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_mtime_ns));
+	LASSERTF((int)offsetof(struct ost_lvb, lvb_atime_ns) == 44, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb, lvb_atime_ns));
+	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_atime_ns) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_atime_ns));
+	LASSERTF((int)offsetof(struct ost_lvb, lvb_ctime_ns) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb, lvb_ctime_ns));
+	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_ctime_ns) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_ctime_ns));
+	LASSERTF((int)offsetof(struct ost_lvb, lvb_padding) == 52, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb, lvb_padding));
+	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_padding));
+
+	/* Checks for struct lquota_lvb */
+	LASSERTF((int)sizeof(struct lquota_lvb) == 40, "found %lld\n",
+		 (long long)(int)sizeof(struct lquota_lvb));
+	LASSERTF((int)offsetof(struct lquota_lvb, lvb_flags) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_lvb, lvb_flags));
+	LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_flags) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_flags));
+	LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_may_rel) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_lvb, lvb_id_may_rel));
+	LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_may_rel) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_may_rel));
+	LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_rel) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_lvb, lvb_id_rel));
+	LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_rel) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_rel));
+	LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_qunit) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_lvb, lvb_id_qunit));
+	LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_qunit) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_qunit));
+	LASSERTF((int)offsetof(struct lquota_lvb, lvb_pad1) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_lvb, lvb_pad1));
+	LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_pad1) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_pad1));
+	LASSERTF(LQUOTA_FL_EDQUOT == 1, "found %lld\n",
+		 (long long)LQUOTA_FL_EDQUOT);
+
+	/* Checks for struct ldlm_gl_lquota_desc */
+	LASSERTF((int)sizeof(struct ldlm_gl_lquota_desc) == 64, "found %lld\n",
+		 (long long)(int)sizeof(struct ldlm_gl_lquota_desc));
+	LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_id) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_id));
+	LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_id) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_id));
+	LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_flags) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_flags));
+	LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_flags) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_flags));
+	LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_ver) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_ver));
+	LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_ver) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_ver));
+	LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_hardlimit) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_hardlimit));
+	LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_hardlimit) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_hardlimit));
+	LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_softlimit) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_softlimit));
+	LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_softlimit) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_softlimit));
+	LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_time) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_time));
+	LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_time) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_time));
+	LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_pad2) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_pad2));
+	LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_pad2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_pad2));
+#ifdef HAVE_SERVER_SUPPORT
+
+	/* Checks for struct ldlm_gl_barrier_desc */
+	LASSERTF((int)sizeof(struct ldlm_gl_barrier_desc) == 16, "found %lld\n",
+		 (long long)(int)sizeof(struct ldlm_gl_barrier_desc));
+	LASSERTF((int)offsetof(struct ldlm_gl_barrier_desc, lgbd_status) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_gl_barrier_desc, lgbd_status));
+	LASSERTF((int)sizeof(((struct ldlm_gl_barrier_desc *)0)->lgbd_status) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_gl_barrier_desc *)0)->lgbd_status));
+	LASSERTF((int)offsetof(struct ldlm_gl_barrier_desc, lgbd_timeout) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_gl_barrier_desc, lgbd_timeout));
+	LASSERTF((int)sizeof(((struct ldlm_gl_barrier_desc *)0)->lgbd_timeout) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_gl_barrier_desc *)0)->lgbd_timeout));
+	LASSERTF((int)offsetof(struct ldlm_gl_barrier_desc, lgbd_padding) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_gl_barrier_desc, lgbd_padding));
+	LASSERTF((int)sizeof(((struct ldlm_gl_barrier_desc *)0)->lgbd_padding) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_gl_barrier_desc *)0)->lgbd_padding));
+
+	/* Checks for struct barrier_lvb */
+	LASSERTF((int)sizeof(struct barrier_lvb) == 16, "found %lld\n",
+		 (long long)(int)sizeof(struct barrier_lvb));
+	LASSERTF((int)offsetof(struct barrier_lvb, lvb_status) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct barrier_lvb, lvb_status));
+	LASSERTF((int)sizeof(((struct barrier_lvb *)0)->lvb_status) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct barrier_lvb *)0)->lvb_status));
+	LASSERTF((int)offsetof(struct barrier_lvb, lvb_index) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct barrier_lvb, lvb_index));
+	LASSERTF((int)sizeof(((struct barrier_lvb *)0)->lvb_index) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct barrier_lvb *)0)->lvb_index));
+	LASSERTF((int)offsetof(struct barrier_lvb, lvb_padding) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct barrier_lvb, lvb_padding));
+	LASSERTF((int)sizeof(((struct barrier_lvb *)0)->lvb_padding) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct barrier_lvb *)0)->lvb_padding));
+#endif /* HAVE_SERVER_SUPPORT */
+
+	/* Checks for struct mgs_send_param */
+	LASSERTF((int)sizeof(struct mgs_send_param) == 1024, "found %lld\n",
+		 (long long)(int)sizeof(struct mgs_send_param));
+	BUILD_BUG_ON(MGS_PARAM_MAXLEN != 1024);
+	LASSERTF((int)offsetof(struct mgs_send_param, mgs_param[1024]) == 1024, "found %lld\n",
+		 (long long)(int)offsetof(struct mgs_send_param, mgs_param[1024]));
+	LASSERTF((int)sizeof(((struct mgs_send_param *)0)->mgs_param[1024]) == 1, "found %lld\n",
+		 (long long)(int)sizeof(((struct mgs_send_param *)0)->mgs_param[1024]));
+
+	/* Checks for struct cfg_marker */
+	LASSERTF((int)sizeof(struct cfg_marker) == 160, "found %lld\n",
+		 (long long)(int)sizeof(struct cfg_marker));
+	LASSERTF((int)offsetof(struct cfg_marker, cm_step) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct cfg_marker, cm_step));
+	LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_step) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_step));
+	LASSERTF((int)offsetof(struct cfg_marker, cm_flags) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct cfg_marker, cm_flags));
+	LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_flags));
+	LASSERTF((int)offsetof(struct cfg_marker, cm_vers) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct cfg_marker, cm_vers));
+	LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_vers) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_vers));
+	LASSERTF((int)offsetof(struct cfg_marker, cm_padding) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct cfg_marker, cm_padding));
+	LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_padding));
+	LASSERTF((int)offsetof(struct cfg_marker, cm_createtime) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct cfg_marker, cm_createtime));
+	LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_createtime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_createtime));
+	LASSERTF((int)offsetof(struct cfg_marker, cm_canceltime) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct cfg_marker, cm_canceltime));
+	LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_canceltime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_canceltime));
+	LASSERTF((int)offsetof(struct cfg_marker, cm_tgtname) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct cfg_marker, cm_tgtname));
+	LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_tgtname) == 64, "found %lld\n",
+		 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_tgtname));
+	LASSERTF((int)offsetof(struct cfg_marker, cm_comment) == 96, "found %lld\n",
+		 (long long)(int)offsetof(struct cfg_marker, cm_comment));
+	LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_comment) == 64, "found %lld\n",
+		 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_comment));
+
+	/* Checks for struct llog_logid */
+	LASSERTF((int)sizeof(struct llog_logid) == 20, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_logid));
+	LASSERTF((int)offsetof(struct llog_logid, lgl_oi) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_logid, lgl_oi));
+	LASSERTF((int)sizeof(((struct llog_logid *)0)->lgl_oi) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_logid *)0)->lgl_oi));
+	LASSERTF((int)offsetof(struct llog_logid, lgl_ogen) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_logid, lgl_ogen));
+	LASSERTF((int)sizeof(((struct llog_logid *)0)->lgl_ogen) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_logid *)0)->lgl_ogen));
+	BUILD_BUG_ON(OST_SZ_REC != 0x10600f00);
+	BUILD_BUG_ON(MDS_UNLINK_REC != 0x10612404);
+	BUILD_BUG_ON(MDS_UNLINK64_REC != 0x10692404);
+	BUILD_BUG_ON(MDS_SETATTR64_REC != 0x10692401);
+	BUILD_BUG_ON(OBD_CFG_REC != 0x10620000);
+	BUILD_BUG_ON(LLOG_GEN_REC != 0x10640000);
+	BUILD_BUG_ON(CHANGELOG_REC != 0x10660000);
+	BUILD_BUG_ON(CHANGELOG_USER_REC != 0x10670000);
+	BUILD_BUG_ON(CHANGELOG_USER_REC2 != 0x10670002);
+	BUILD_BUG_ON(HSM_AGENT_REC != 0x10680000);
+	BUILD_BUG_ON(UPDATE_REC != 0x106a0000);
+	BUILD_BUG_ON(LLOG_HDR_MAGIC != 0x10645539);
+	BUILD_BUG_ON(LLOG_LOGID_MAGIC != 0x1064553b);
+
+	/* Checks for struct llog_catid */
+	LASSERTF((int)sizeof(struct llog_catid) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_catid));
+	LASSERTF((int)offsetof(struct llog_catid, lci_logid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_catid, lci_logid));
+	LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_logid) == 20, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_catid *)0)->lci_logid));
+	LASSERTF((int)offsetof(struct llog_catid, lci_padding1) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_catid, lci_padding1));
+	LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding1));
+	LASSERTF((int)offsetof(struct llog_catid, lci_padding2) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_catid, lci_padding2));
+	LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding2));
+	LASSERTF((int)offsetof(struct llog_catid, lci_padding3) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_catid, lci_padding3));
+	LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding3) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding3));
+
+	/* Checks for struct llog_rec_hdr */
+	LASSERTF((int)sizeof(struct llog_rec_hdr) == 16, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_rec_hdr));
+	LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_len) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_rec_hdr, lrh_len));
+	LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_len) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_len));
+	LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_index) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_rec_hdr, lrh_index));
+	LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_index) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_index));
+	LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_type) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_rec_hdr, lrh_type));
+	LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_type) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_type));
+	LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_id) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_rec_hdr, lrh_id));
+	LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_id));
+
+	/* Checks for struct llog_rec_tail */
+	LASSERTF((int)sizeof(struct llog_rec_tail) == 8, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_rec_tail));
+	LASSERTF((int)offsetof(struct llog_rec_tail, lrt_len) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_rec_tail, lrt_len));
+	LASSERTF((int)sizeof(((struct llog_rec_tail *)0)->lrt_len) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_rec_tail *)0)->lrt_len));
+	LASSERTF((int)offsetof(struct llog_rec_tail, lrt_index) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_rec_tail, lrt_index));
+	LASSERTF((int)sizeof(((struct llog_rec_tail *)0)->lrt_index) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_rec_tail *)0)->lrt_index));
+
+	/* Checks for struct llog_logid_rec */
+	LASSERTF((int)sizeof(struct llog_logid_rec) == 64, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_logid_rec));
+	LASSERTF((int)offsetof(struct llog_logid_rec, lid_hdr) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_logid_rec, lid_hdr));
+	LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_hdr) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_hdr));
+	LASSERTF((int)offsetof(struct llog_logid_rec, lid_id) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_logid_rec, lid_id));
+	LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_id) == 20, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_id));
+	LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding1) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_logid_rec, lid_padding1));
+	LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding1));
+	LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding2) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_logid_rec, lid_padding2));
+	LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding2));
+	LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding3) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_logid_rec, lid_padding3));
+	LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding3) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding3));
+	LASSERTF((int)offsetof(struct llog_logid_rec, lid_tail) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_logid_rec, lid_tail));
+	LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_tail) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_tail));
+
+	/* Checks for struct llog_unlink_rec */
+	LASSERTF((int)sizeof(struct llog_unlink_rec) == 40, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_unlink_rec));
+	LASSERTF((int)offsetof(struct llog_unlink_rec, lur_hdr) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink_rec, lur_hdr));
+	LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_hdr) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_hdr));
+	LASSERTF((int)offsetof(struct llog_unlink_rec, lur_oid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink_rec, lur_oid));
+	LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_oid) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_oid));
+	LASSERTF((int)offsetof(struct llog_unlink_rec, lur_oseq) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink_rec, lur_oseq));
+	LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_oseq) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_oseq));
+	LASSERTF((int)offsetof(struct llog_unlink_rec, lur_count) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink_rec, lur_count));
+	LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_count));
+	LASSERTF((int)offsetof(struct llog_unlink_rec, lur_tail) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink_rec, lur_tail));
+	LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_tail) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_tail));
+	/* Checks for struct llog_unlink64_rec */
+	LASSERTF((int)sizeof(struct llog_unlink64_rec) == 64, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_unlink64_rec));
+	LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_hdr) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink64_rec, lur_hdr));
+	LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_hdr) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_hdr));
+	LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_fid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink64_rec, lur_fid));
+	LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_fid));
+	LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_count) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink64_rec, lur_count));
+	LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_count));
+	LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_tail) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink64_rec, lur_tail));
+	LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_tail) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_tail));
+	LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding1) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding1));
+	LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding1));
+	LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding2) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding2));
+	LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding2));
+	LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding3) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding3));
+	LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding3) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding3));
+
+#ifdef HAVE_SERVER_SUPPORT
+	/* Checks for struct llog_setattr64_rec */
+	LASSERTF((int)sizeof(struct llog_setattr64_rec) == 64, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_setattr64_rec));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_hdr) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_hdr));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_oi) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_oi));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oi) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oi));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_uid));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid_h) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_uid_h));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_gid));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid_h) == 44, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_gid_h));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_valid) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_valid));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_valid) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_valid));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_tail) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_tail));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_hdr) == (int)offsetof(struct llog_setattr64_rec_v2, lsr_hdr), "%d != %d\n",
+		 (int)offsetof(struct llog_setattr64_rec, lsr_hdr), (int)offsetof(struct llog_setattr64_rec_v2, lsr_hdr));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr) == (int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_hdr), "%d != %d\n",
+		 (int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr), (int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_hdr));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_oi) == (int)offsetof(struct llog_setattr64_rec_v2, lsr_oi), "%d != %d\n",
+		 (int)offsetof(struct llog_setattr64_rec, lsr_oi), (int)offsetof(struct llog_setattr64_rec_v2, lsr_oi));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oi) == (int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_oi), "%d != %d\n",
+		 (int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oi), (int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_oi));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid) == (int)offsetof(struct llog_setattr64_rec_v2, lsr_uid), "%d != %d\n",
+		 (int)offsetof(struct llog_setattr64_rec, lsr_uid), (int)offsetof(struct llog_setattr64_rec_v2, lsr_uid));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid) == (int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_uid), "%d != %d\n",
+		 (int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid), (int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_uid));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid_h) == (int)offsetof(struct llog_setattr64_rec_v2, lsr_uid_h), "%d != %d\n",
+		 (int)offsetof(struct llog_setattr64_rec, lsr_uid_h), (int)offsetof(struct llog_setattr64_rec_v2, lsr_uid_h));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h) == (int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_uid_h), "%d != %d\n",
+		 (int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h), (int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_uid_h));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid) == (int)offsetof(struct llog_setattr64_rec_v2, lsr_gid), "%d != %d\n",
+		 (int)offsetof(struct llog_setattr64_rec, lsr_gid), (int)offsetof(struct llog_setattr64_rec_v2, lsr_gid));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid) == (int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_gid), "%d != %d\n",
+		 (int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid), (int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_gid));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid_h) == (int)offsetof(struct llog_setattr64_rec_v2, lsr_gid_h), "%d != %d\n",
+		 (int)offsetof(struct llog_setattr64_rec, lsr_gid_h), (int)offsetof(struct llog_setattr64_rec_v2, lsr_gid_h));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h) == (int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_gid_h), "%d != %d\n",
+		 (int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h), (int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_gid_h));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_valid) == (int)offsetof(struct llog_setattr64_rec_v2, lsr_valid), "%d != %d\n",
+		 (int)offsetof(struct llog_setattr64_rec, lsr_valid), (int)offsetof(struct llog_setattr64_rec_v2, lsr_valid));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_valid) == (int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_valid), "%d != %d\n",
+		 (int)sizeof(((struct llog_setattr64_rec *)0)->lsr_valid), (int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_valid));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec_v2, lsr_projid) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_setattr64_rec_v2, lsr_projid));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_projid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_projid));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec_v2, lsr_layout_version) == 60, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_setattr64_rec_v2, lsr_layout_version));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_layout_version) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_layout_version));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec_v2, lsr_padding2) == 64, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_setattr64_rec_v2, lsr_padding2));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_padding2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_padding2));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec_v2, lsr_padding3) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_setattr64_rec_v2, lsr_padding3));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_padding3) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_padding3));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec_v2, lsr_tail) == 80, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_setattr64_rec_v2, lsr_tail));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_tail) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_tail));
+#endif /* HAVE_SERVER_SUPPORT */
+
+	/* Checks for struct llog_size_change_rec */
+	LASSERTF((int)sizeof(struct llog_size_change_rec) == 64, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_size_change_rec));
+	LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_hdr) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_size_change_rec, lsc_hdr));
+	LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_hdr) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_hdr));
+	LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_fid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_size_change_rec, lsc_fid));
+	LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_fid));
+	LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_ioepoch) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_size_change_rec, lsc_ioepoch));
+	LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_ioepoch) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_ioepoch));
+	LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding1) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding1));
+	LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding1));
+	LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding2) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding2));
+	LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding2));
+	LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding3) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding3));
+	LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding3) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding3));
+	LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_tail) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_size_change_rec, lsc_tail));
+	LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_tail) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_tail));
+
+	/* Checks for struct changelog_rec */
+	LASSERTF((int)sizeof(struct changelog_rec) == 64, "found %lld\n",
+		 (long long)(int)sizeof(struct changelog_rec));
+	LASSERTF((int)offsetof(struct changelog_rec, cr_namelen) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_rec, cr_namelen));
+	LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_namelen) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_rec *)0)->cr_namelen));
+	LASSERTF((int)offsetof(struct changelog_rec, cr_flags) == 2, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_rec, cr_flags));
+	LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_flags) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_rec *)0)->cr_flags));
+	LASSERTF((int)offsetof(struct changelog_rec, cr_type) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_rec, cr_type));
+	LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_type) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_rec *)0)->cr_type));
+	LASSERTF((int)offsetof(struct changelog_rec, cr_index) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_rec, cr_index));
+	LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_index) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_rec *)0)->cr_index));
+	LASSERTF((int)offsetof(struct changelog_rec, cr_prev) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_rec, cr_prev));
+	LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_prev) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_rec *)0)->cr_prev));
+	LASSERTF((int)offsetof(struct changelog_rec, cr_time) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_rec, cr_time));
+	LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_time) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_rec *)0)->cr_time));
+	LASSERTF((int)offsetof(struct changelog_rec, cr_tfid) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_rec, cr_tfid));
+	LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_tfid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_rec *)0)->cr_tfid));
+	LASSERTF((int)offsetof(struct changelog_rec, cr_pfid) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_rec, cr_pfid));
+	LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_pfid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_rec *)0)->cr_pfid));
+
+	/* Checks for struct changelog_ext_rename */
+	LASSERTF((int)sizeof(struct changelog_ext_rename) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct changelog_ext_rename));
+	LASSERTF((int)offsetof(struct changelog_ext_rename, cr_sfid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct
changelog_ext_rename, cr_sfid)); + LASSERTF((int)sizeof(((struct changelog_ext_rename *)0)->cr_sfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rename *)0)->cr_sfid)); + LASSERTF((int)offsetof(struct changelog_ext_rename, cr_spfid) == 16, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_rename, cr_spfid)); + LASSERTF((int)sizeof(((struct changelog_ext_rename *)0)->cr_spfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rename *)0)->cr_spfid)); + +#ifdef HAVE_SERVER_SUPPORT + /* Checks for struct changelog_ext_jobid */ + LASSERTF((int)sizeof(struct changelog_ext_jobid) == 32, "found %lld\n", + (long long)(int)sizeof(struct changelog_ext_jobid)); + BUILD_BUG_ON(LUSTRE_JOBID_SIZE != 32); + LASSERTF((int)offsetof(struct changelog_ext_jobid, cr_jobid) == 0, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_jobid, cr_jobid)); + LASSERTF((int)sizeof(((struct changelog_ext_jobid *)0)->cr_jobid) == 32, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_jobid *)0)->cr_jobid)); +#endif /* HAVE_SERVER_SUPPORT */ + + /* Checks for struct changelog_setinfo */ + LASSERTF((int)sizeof(struct changelog_setinfo) == 12, "found %lld\n", + (long long)(int)sizeof(struct changelog_setinfo)); + LASSERTF((int)offsetof(struct changelog_setinfo, cs_recno) == 0, "found %lld\n", + (long long)(int)offsetof(struct changelog_setinfo, cs_recno)); + LASSERTF((int)sizeof(((struct changelog_setinfo *)0)->cs_recno) == 8, "found %lld\n", + (long long)(int)sizeof(((struct changelog_setinfo *)0)->cs_recno)); + LASSERTF((int)offsetof(struct changelog_setinfo, cs_id) == 8, "found %lld\n", + (long long)(int)offsetof(struct changelog_setinfo, cs_id)); + LASSERTF((int)sizeof(((struct changelog_setinfo *)0)->cs_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct changelog_setinfo *)0)->cs_id)); + + /* Checks for struct llog_changelog_rec */ + LASSERTF((int)sizeof(struct llog_changelog_rec) == 88, "found %lld\n", + (long long)(int)sizeof(struct llog_changelog_rec)); + LASSERTF((int)offsetof(struct llog_changelog_rec, cr_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_rec, cr_hdr)); + LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr_hdr)); + LASSERTF((int)offsetof(struct llog_changelog_rec, cr) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_rec, cr)); + LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr) == 64, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr)); + LASSERTF((int)offsetof(struct llog_changelog_rec, cr_do_not_use) == 80, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_rec, cr_do_not_use)); + LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr_do_not_use) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr_do_not_use)); +#ifdef HAVE_SERVER_SUPPORT + + /* Checks for struct llog_changelog_user_rec */ + LASSERTF((int)sizeof(struct llog_changelog_user_rec) == 40, "found %lld\n", + (long long)(int)sizeof(struct llog_changelog_user_rec)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_hdr)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_hdr)); + LASSERTF((int)offsetof(struct 
llog_changelog_user_rec, cur_id) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_id)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_id)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_time) == 20, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_time)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_time) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_time)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_endrec) == 24, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_endrec)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_endrec) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_endrec)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_tail) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_tail)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_tail)); +#endif /* HAVE_SERVER_SUPPORT */ + + /* Checks for struct llog_gen */ + LASSERTF((int)sizeof(struct llog_gen) == 16, "found %lld\n", + (long long)(int)sizeof(struct llog_gen)); + LASSERTF((int)offsetof(struct llog_gen, mnt_cnt) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_gen, mnt_cnt)); + LASSERTF((int)sizeof(((struct llog_gen *)0)->mnt_cnt) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_gen *)0)->mnt_cnt)); + LASSERTF((int)offsetof(struct llog_gen, conn_cnt) == 8, "found %lld\n", + (long long)(int)offsetof(struct llog_gen, conn_cnt)); + LASSERTF((int)sizeof(((struct llog_gen *)0)->conn_cnt) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_gen *)0)->conn_cnt)); + + /* Checks for struct llog_gen_rec */ + LASSERTF((int)sizeof(struct llog_gen_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct llog_gen_rec)); + LASSERTF((int)offsetof(struct llog_gen_rec, lgr_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_gen_rec, lgr_hdr)); + LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_hdr)); + LASSERTF((int)offsetof(struct llog_gen_rec, lgr_gen) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_gen_rec, lgr_gen)); + LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_gen) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_gen)); + LASSERTF((int)offsetof(struct llog_gen_rec, lgr_tail) == 56, "found %lld\n", + (long long)(int)offsetof(struct llog_gen_rec, lgr_tail)); + LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_tail)); + + /* Checks for struct llog_log_hdr */ + LASSERTF((int)sizeof(struct llog_log_hdr) == 8192, "found %lld\n", + (long long)(int)sizeof(struct llog_log_hdr)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_hdr)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_hdr)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_timestamp) == 16, "found %lld\n", + (long 
long)(int)offsetof(struct llog_log_hdr, llh_timestamp)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_timestamp) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_timestamp)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_count) == 24, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_count)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_count)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_bitmap_offset) == 28, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_bitmap_offset)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap_offset) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap_offset)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_size) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_size)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_size)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_flags) == 36, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_flags)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_flags)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_cat_idx) == 40, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_cat_idx)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_cat_idx) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_cat_idx)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_tgtuuid) == 44, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_tgtuuid)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_tgtuuid) == 40, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_tgtuuid)); + BUILD_BUG_ON(LLOG_F_ZAP_WHEN_EMPTY != 0x00000001); + BUILD_BUG_ON(LLOG_F_IS_CAT != 0x00000002); + BUILD_BUG_ON(LLOG_F_IS_PLAIN != 0x00000004); + BUILD_BUG_ON(LLOG_F_EXT_JOBID != 0x00000008); + BUILD_BUG_ON(LLOG_F_IS_FIXSIZE != 0x00000010); + BUILD_BUG_ON(LLOG_F_EXT_EXTRA_FLAGS != 0x00000020); + BUILD_BUG_ON(LLOG_F_EXT_X_UIDGID != 0x00000040); + BUILD_BUG_ON(LLOG_F_EXT_X_NID != 0x00000080); + BUILD_BUG_ON(LLOG_F_EXT_X_OMODE != 0x00000100); + BUILD_BUG_ON(LLOG_F_EXT_X_XATTR != 0x00000200); + + /* Checks for struct llogd_body */ + LASSERTF((int)sizeof(struct llogd_body) == 48, "found %lld\n", + (long long)(int)sizeof(struct llogd_body)); + LASSERTF((int)offsetof(struct llogd_body, lgd_logid) == 0, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_logid)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_logid) == 20, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_logid)); + LASSERTF((int)offsetof(struct llogd_body, lgd_ctxt_idx) == 20, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_ctxt_idx)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_ctxt_idx) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_ctxt_idx)); + LASSERTF((int)offsetof(struct llogd_body, lgd_llh_flags) == 24, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_llh_flags)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_llh_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_llh_flags)); + 
LASSERTF((int)offsetof(struct llogd_body, lgd_index) == 28, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_index)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_index)); + LASSERTF((int)offsetof(struct llogd_body, lgd_saved_index) == 32, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_saved_index)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_saved_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_saved_index)); + LASSERTF((int)offsetof(struct llogd_body, lgd_len) == 36, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_len)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_len) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_len)); + LASSERTF((int)offsetof(struct llogd_body, lgd_cur_offset) == 40, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_cur_offset)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_cur_offset) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_cur_offset)); + BUILD_BUG_ON(LLOG_ORIGIN_HANDLE_CREATE != 501); + BUILD_BUG_ON(LLOG_ORIGIN_HANDLE_NEXT_BLOCK != 502); + BUILD_BUG_ON(LLOG_ORIGIN_HANDLE_READ_HEADER != 503); + BUILD_BUG_ON(LLOG_ORIGIN_HANDLE_PREV_BLOCK != 508); + BUILD_BUG_ON(LLOG_FIRST_OPC != 501); + BUILD_BUG_ON(LLOG_LAST_OPC != 510); + BUILD_BUG_ON(LLOG_CONFIG_ORIG_CTXT != 0); + BUILD_BUG_ON(LLOG_CONFIG_REPL_CTXT != 1); + BUILD_BUG_ON(LLOG_MDS_OST_ORIG_CTXT != 2); + BUILD_BUG_ON(LLOG_MDS_OST_REPL_CTXT != 3); + BUILD_BUG_ON(LLOG_SIZE_ORIG_CTXT != 4); + BUILD_BUG_ON(LLOG_SIZE_REPL_CTXT != 5); + BUILD_BUG_ON(LLOG_TEST_ORIG_CTXT != 8); + BUILD_BUG_ON(LLOG_TEST_REPL_CTXT != 9); + BUILD_BUG_ON(LLOG_CHANGELOG_ORIG_CTXT != 12); + BUILD_BUG_ON(LLOG_CHANGELOG_REPL_CTXT != 13); + BUILD_BUG_ON(LLOG_CHANGELOG_USER_ORIG_CTXT != 14); + BUILD_BUG_ON(LLOG_AGENT_ORIG_CTXT != 15); + BUILD_BUG_ON(LLOG_UPDATELOG_ORIG_CTXT != 16); + BUILD_BUG_ON(LLOG_UPDATELOG_REPL_CTXT != 17); + BUILD_BUG_ON(LLOG_MAX_CTXTS != 18); + + /* Checks for struct llogd_conn_body */ + LASSERTF((int)sizeof(struct llogd_conn_body) == 40, "found %lld\n", + (long long)(int)sizeof(struct llogd_conn_body)); + LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_gen) == 0, "found %lld\n", + (long long)(int)offsetof(struct llogd_conn_body, lgdc_gen)); + LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_gen) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_gen)); + LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_logid) == 16, "found %lld\n", + (long long)(int)offsetof(struct llogd_conn_body, lgdc_logid)); + LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_logid) == 20, "found %lld\n", + (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_logid)); + LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_ctxt_idx) == 36, "found %lld\n", + (long long)(int)offsetof(struct llogd_conn_body, lgdc_ctxt_idx)); + LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_ctxt_idx) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_ctxt_idx)); + + /* Checks for struct ll_fiemap_info_key */ + LASSERTF((int)sizeof(struct ll_fiemap_info_key) == 248, "found %lld\n", + (long long)(int)sizeof(struct ll_fiemap_info_key)); + LASSERTF((int)offsetof(struct ll_fiemap_info_key, lfik_name[8]) == 8, "found %lld\n", + (long long)(int)offsetof(struct ll_fiemap_info_key, lfik_name[8])); + 
LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->lfik_name[8]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->lfik_name[8])); + LASSERTF((int)offsetof(struct ll_fiemap_info_key, lfik_oa) == 8, "found %lld\n", + (long long)(int)offsetof(struct ll_fiemap_info_key, lfik_oa)); + LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->lfik_oa) == 208, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->lfik_oa)); + LASSERTF((int)offsetof(struct ll_fiemap_info_key, lfik_fiemap) == 216, "found %lld\n", + (long long)(int)offsetof(struct ll_fiemap_info_key, lfik_fiemap)); + LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->lfik_fiemap) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->lfik_fiemap)); +#ifdef HAVE_SERVER_SUPPORT + + /* Checks for struct quota_body */ + LASSERTF((int)sizeof(struct quota_body) == 112, "found %lld\n", + (long long)(int)sizeof(struct quota_body)); + LASSERTF((int)offsetof(struct quota_body, qb_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_fid)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_fid)); + LASSERTF((int)offsetof(struct quota_body, qb_id) == 16, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_id)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_id) == 16, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_id)); + LASSERTF((int)offsetof(struct quota_body, qb_flags) == 32, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_flags)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_flags)); + LASSERTF((int)offsetof(struct quota_body, qb_padding) == 36, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_padding)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_padding)); + LASSERTF((int)offsetof(struct quota_body, qb_count) == 40, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_count)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_count) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_count)); + LASSERTF((int)offsetof(struct quota_body, qb_usage) == 48, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_usage)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_usage) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_usage)); + LASSERTF((int)offsetof(struct quota_body, qb_slv_ver) == 56, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_slv_ver)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_slv_ver) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_slv_ver)); + LASSERTF((int)offsetof(struct quota_body, qb_lockh) == 64, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_lockh)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_lockh) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_lockh)); + LASSERTF((int)offsetof(struct quota_body, qb_glb_lockh) == 72, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_glb_lockh)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_glb_lockh) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_glb_lockh)); + LASSERTF((int)offsetof(struct quota_body, 
qb_padding1[4]) == 112, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_padding1[4])); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_padding1[4]) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_padding1[4])); +#endif /* HAVE_SERVER_SUPPORT */ + + /* Checks for struct mgs_target_info */ + LASSERTF((int)sizeof(struct mgs_target_info) == 4544, "found %lld\n", + (long long)(int)sizeof(struct mgs_target_info)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_lustre_ver) == 0, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_lustre_ver)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_lustre_ver) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_lustre_ver)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_stripe_index) == 4, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_stripe_index)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_stripe_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_stripe_index)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_config_ver) == 8, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_config_ver)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_config_ver) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_config_ver)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_flags) == 12, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_flags)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_flags)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_nid_count) == 16, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_nid_count)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_nid_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_nid_count)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_instance) == 20, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_instance)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_instance) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_instance)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_fsname) == 24, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_fsname)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_fsname) == 64, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_fsname)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_svname) == 88, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_svname)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_svname) == 64, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_svname)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_uuid) == 152, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_uuid)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_uuid) == 40, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_uuid)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_nids) == 192, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_nids)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_nids) == 256, "found %lld\n", + (long 
long)(int)sizeof(((struct mgs_target_info *)0)->mti_nids)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_params) == 448, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_params)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_params) == 4096, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_params)); + + /* Checks for struct mgs_nidtbl_entry */ + LASSERTF((int)sizeof(struct mgs_nidtbl_entry) == 24, "found %lld\n", + (long long)(int)sizeof(struct mgs_nidtbl_entry)); + LASSERTF((int)offsetof(struct mgs_nidtbl_entry, mne_version) == 0, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, mne_version)); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_version) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_version)); + LASSERTF((int)offsetof(struct mgs_nidtbl_entry, mne_instance) == 8, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, mne_instance)); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_instance) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_instance)); + LASSERTF((int)offsetof(struct mgs_nidtbl_entry, mne_index) == 12, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, mne_index)); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_index)); + LASSERTF((int)offsetof(struct mgs_nidtbl_entry, mne_length) == 16, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, mne_length)); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_length) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_length)); + LASSERTF((int)offsetof(struct mgs_nidtbl_entry, mne_type) == 20, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, mne_type)); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_type) == 1, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_type)); + LASSERTF((int)offsetof(struct mgs_nidtbl_entry, mne_nid_type) == 21, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, mne_nid_type)); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_nid_type) == 1, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_nid_type)); + LASSERTF((int)offsetof(struct mgs_nidtbl_entry, mne_nid_size) == 22, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, mne_nid_size)); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_nid_size) == 1, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_nid_size)); + LASSERTF((int)offsetof(struct mgs_nidtbl_entry, mne_nid_count) == 23, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, mne_nid_count)); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_nid_count) == 1, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_nid_count)); + LASSERTF((int)offsetof(struct mgs_nidtbl_entry, u.nids[0]) == 24, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, u.nids[0])); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->u.nids[0]) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->u.nids[0])); + + /* Checks for struct mgs_config_body */ + LASSERTF((int)sizeof(struct mgs_config_body) == 80, "found %lld\n", + (long long)(int)sizeof(struct mgs_config_body)); + 
LASSERTF((int)offsetof(struct mgs_config_body, mcb_name) == 0, "found %lld\n", + (long long)(int)offsetof(struct mgs_config_body, mcb_name)); + LASSERTF((int)sizeof(((struct mgs_config_body *)0)->mcb_name) == 64, "found %lld\n", + (long long)(int)sizeof(((struct mgs_config_body *)0)->mcb_name)); + LASSERTF((int)offsetof(struct mgs_config_body, mcb_offset) == 64, "found %lld\n", + (long long)(int)offsetof(struct mgs_config_body, mcb_offset)); + LASSERTF((int)sizeof(((struct mgs_config_body *)0)->mcb_offset) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mgs_config_body *)0)->mcb_offset)); + LASSERTF((int)offsetof(struct mgs_config_body, mcb_type) == 72, "found %lld\n", + (long long)(int)offsetof(struct mgs_config_body, mcb_type)); + LASSERTF((int)sizeof(((struct mgs_config_body *)0)->mcb_type) == 2, "found %lld\n", + (long long)(int)sizeof(((struct mgs_config_body *)0)->mcb_type)); + LASSERTF((int)offsetof(struct mgs_config_body, mcb_nm_cur_pass) == 74, "found %lld\n", + (long long)(int)offsetof(struct mgs_config_body, mcb_nm_cur_pass)); + LASSERTF((int)sizeof(((struct mgs_config_body *)0)->mcb_nm_cur_pass) == 1, "found %lld\n", + (long long)(int)sizeof(((struct mgs_config_body *)0)->mcb_nm_cur_pass)); + LASSERTF((int)offsetof(struct mgs_config_body, mcb_bits) == 75, "found %lld\n", + (long long)(int)offsetof(struct mgs_config_body, mcb_bits)); + LASSERTF((int)sizeof(((struct mgs_config_body *)0)->mcb_bits) == 1, "found %lld\n", + (long long)(int)sizeof(((struct mgs_config_body *)0)->mcb_bits)); + LASSERTF((int)offsetof(struct mgs_config_body, mcb_units) == 76, "found %lld\n", + (long long)(int)offsetof(struct mgs_config_body, mcb_units)); + LASSERTF((int)sizeof(((struct mgs_config_body *)0)->mcb_units) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_config_body *)0)->mcb_units)); + BUILD_BUG_ON(MGS_CFG_T_CONFIG != 0); + BUILD_BUG_ON(MGS_CFG_T_SPTLRPC != 1); + BUILD_BUG_ON(MGS_CFG_T_RECOVER != 2); + BUILD_BUG_ON(MGS_CFG_T_PARAMS != 3); +#ifdef HAVE_SERVER_SUPPORT + BUILD_BUG_ON(MGS_CFG_T_NODEMAP != 4); + BUILD_BUG_ON(MGS_CFG_T_BARRIER != 5); +#endif /* HAVE_SERVER_SUPPORT */ + + /* Checks for struct mgs_config_res */ + LASSERTF((int)sizeof(struct mgs_config_res) == 16, "found %lld\n", + (long long)(int)sizeof(struct mgs_config_res)); + LASSERTF((int)offsetof(struct mgs_config_res, mcr_offset) == 0, "found %lld\n", + (long long)(int)offsetof(struct mgs_config_res, mcr_offset)); + LASSERTF((int)sizeof(((struct mgs_config_res *)0)->mcr_offset) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mgs_config_res *)0)->mcr_offset)); + LASSERTF((int)offsetof(struct mgs_config_res, mcr_size) == 8, "found %lld\n", + (long long)(int)offsetof(struct mgs_config_res, mcr_size)); + LASSERTF((int)sizeof(((struct mgs_config_res *)0)->mcr_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mgs_config_res *)0)->mcr_size)); + + /* Checks for struct getinfo_fid2path */ + LASSERTF((int)sizeof(struct getinfo_fid2path) == 32, "found %lld\n", + (long long)(int)sizeof(struct getinfo_fid2path)); + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_fid)); + LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_fid)); + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_recno) == 16, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_recno)); + LASSERTF((int)sizeof(((struct getinfo_fid2path 
*)0)->gf_recno) == 8, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_recno)); + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_linkno) == 24, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_linkno)); + LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_linkno) == 4, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_linkno)); + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_pathlen) == 28, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_pathlen)); + LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_pathlen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_pathlen)); +#ifdef HAVE_FID2PATH_ANON_UNIONS + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_path[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_path[0])); + LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_path[0]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_path[0])); +#else + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_u.gf_path[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_u.gf_path[0])); + LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_u.gf_path[0]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_u.gf_path[0])); +#endif /* HAVE_FID2PATH_ANON_UNIONS */ + + /* Checks for struct fiemap */ + LASSERTF((int)sizeof(struct fiemap) == 32, "found %lld\n", + (long long)(int)sizeof(struct fiemap)); + LASSERTF((int)offsetof(struct fiemap, fm_start) == 0, "found %lld\n", + (long long)(int)offsetof(struct fiemap, fm_start)); + LASSERTF((int)sizeof(((struct fiemap *)0)->fm_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct fiemap *)0)->fm_start)); + LASSERTF((int)offsetof(struct fiemap, fm_length) == 8, "found %lld\n", + (long long)(int)offsetof(struct fiemap, fm_length)); + LASSERTF((int)sizeof(((struct fiemap *)0)->fm_length) == 8, "found %lld\n", + (long long)(int)sizeof(((struct fiemap *)0)->fm_length)); + LASSERTF((int)offsetof(struct fiemap, fm_flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct fiemap, fm_flags)); + LASSERTF((int)sizeof(((struct fiemap *)0)->fm_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct fiemap *)0)->fm_flags)); + LASSERTF((int)offsetof(struct fiemap, fm_mapped_extents) == 20, "found %lld\n", + (long long)(int)offsetof(struct fiemap, fm_mapped_extents)); + LASSERTF((int)sizeof(((struct fiemap *)0)->fm_mapped_extents) == 4, "found %lld\n", + (long long)(int)sizeof(((struct fiemap *)0)->fm_mapped_extents)); + LASSERTF((int)offsetof(struct fiemap, fm_extent_count) == 24, "found %lld\n", + (long long)(int)offsetof(struct fiemap, fm_extent_count)); + LASSERTF((int)sizeof(((struct fiemap *)0)->fm_extent_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct fiemap *)0)->fm_extent_count)); + LASSERTF((int)offsetof(struct fiemap, fm_reserved) == 28, "found %lld\n", + (long long)(int)offsetof(struct fiemap, fm_reserved)); + LASSERTF((int)sizeof(((struct fiemap *)0)->fm_reserved) == 4, "found %lld\n", + (long long)(int)sizeof(((struct fiemap *)0)->fm_reserved)); + LASSERTF((int)offsetof(struct fiemap, fm_extents) == 32, "found %lld\n", + (long long)(int)offsetof(struct fiemap, fm_extents)); + BUILD_BUG_ON(offsetof(struct fiemap, fm_extents) != sizeof(struct fiemap)); + BUILD_BUG_ON(FIEMAP_FLAG_SYNC != 0x00000001); + BUILD_BUG_ON(FIEMAP_FLAG_XATTR != 0x00000002); + 
BUILD_BUG_ON(FIEMAP_FLAG_DEVICE_ORDER != 0x40000000); + + /* Checks for struct fiemap_extent */ + LASSERTF((int)sizeof(struct fiemap_extent) == 56, "found %lld\n", + (long long)(int)sizeof(struct fiemap_extent)); + LASSERTF((int)offsetof(struct fiemap_extent, fe_logical) == 0, "found %lld\n", + (long long)(int)offsetof(struct fiemap_extent, fe_logical)); + LASSERTF((int)sizeof(((struct fiemap_extent *)0)->fe_logical) == 8, "found %lld\n", + (long long)(int)sizeof(((struct fiemap_extent *)0)->fe_logical)); + LASSERTF((int)offsetof(struct fiemap_extent, fe_physical) == 8, "found %lld\n", + (long long)(int)offsetof(struct fiemap_extent, fe_physical)); + LASSERTF((int)sizeof(((struct fiemap_extent *)0)->fe_physical) == 8, "found %lld\n", + (long long)(int)sizeof(((struct fiemap_extent *)0)->fe_physical)); + LASSERTF((int)offsetof(struct fiemap_extent, fe_length) == 16, "found %lld\n", + (long long)(int)offsetof(struct fiemap_extent, fe_length)); + LASSERTF((int)sizeof(((struct fiemap_extent *)0)->fe_length) == 8, "found %lld\n", + (long long)(int)sizeof(((struct fiemap_extent *)0)->fe_length)); + LASSERTF((int)offsetof(struct fiemap_extent, fe_flags) == 40, "found %lld\n", + (long long)(int)offsetof(struct fiemap_extent, fe_flags)); + LASSERTF((int)sizeof(((struct fiemap_extent *)0)->fe_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct fiemap_extent *)0)->fe_flags)); + LASSERTF((int)offsetof(struct fiemap_extent, fe_reserved[0]) == 44, "found %lld\n", + (long long)(int)offsetof(struct fiemap_extent, fe_reserved[0])); + LASSERTF((int)sizeof(((struct fiemap_extent *)0)->fe_reserved[0]) == 4, "found %lld\n", + (long long)(int)sizeof(((struct fiemap_extent *)0)->fe_reserved[0])); + BUILD_BUG_ON(FIEMAP_EXTENT_LAST != 0x00000001); + BUILD_BUG_ON(FIEMAP_EXTENT_UNKNOWN != 0x00000002); + BUILD_BUG_ON(FIEMAP_EXTENT_DELALLOC != 0x00000004); + BUILD_BUG_ON(FIEMAP_EXTENT_ENCODED != 0x00000008); + BUILD_BUG_ON(FIEMAP_EXTENT_DATA_ENCRYPTED != 0x00000080); + BUILD_BUG_ON(FIEMAP_EXTENT_NOT_ALIGNED != 0x00000100); + BUILD_BUG_ON(FIEMAP_EXTENT_DATA_INLINE != 0x00000200); + BUILD_BUG_ON(FIEMAP_EXTENT_DATA_TAIL != 0x00000400); + BUILD_BUG_ON(FIEMAP_EXTENT_UNWRITTEN != 0x00000800); + BUILD_BUG_ON(FIEMAP_EXTENT_MERGED != 0x00001000); + BUILD_BUG_ON(FIEMAP_EXTENT_SHARED != 0x00002000); + BUILD_BUG_ON(FIEMAP_EXTENT_NET != 0x80000000); + +#ifdef CONFIG_FS_POSIX_ACL + /* Checks for type posix_acl_xattr_entry */ + LASSERTF((int)sizeof(posix_acl_xattr_entry) == 8, "found %lld\n", + (long long)(int)sizeof(posix_acl_xattr_entry)); + LASSERTF((int)offsetof(posix_acl_xattr_entry, e_tag) == 0, "found %lld\n", + (long long)(int)offsetof(posix_acl_xattr_entry, e_tag)); + LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_tag) == 2, "found %lld\n", + (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_tag)); + LASSERTF((int)offsetof(posix_acl_xattr_entry, e_perm) == 2, "found %lld\n", + (long long)(int)offsetof(posix_acl_xattr_entry, e_perm)); + LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_perm) == 2, "found %lld\n", + (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_perm)); + LASSERTF((int)offsetof(posix_acl_xattr_entry, e_id) == 4, "found %lld\n", + (long long)(int)offsetof(posix_acl_xattr_entry, e_id)); + LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_id) == 4, "found %lld\n", + (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_id)); +#endif /* CONFIG_FS_POSIX_ACL */ + +#ifdef CONFIG_FS_POSIX_ACL + /* Checks for type posix_acl_xattr_header */ + 
LASSERTF((int)sizeof(posix_acl_xattr_header) == 4, "found %lld\n", + (long long)(int)sizeof(posix_acl_xattr_header)); + LASSERTF((int)offsetof(posix_acl_xattr_header, a_version) == 0, "found %lld\n", + (long long)(int)offsetof(posix_acl_xattr_header, a_version)); + LASSERTF((int)sizeof(((posix_acl_xattr_header *)0)->a_version) == 4, "found %lld\n", + (long long)(int)sizeof(((posix_acl_xattr_header *)0)->a_version)); +#ifndef HAVE_STRUCT_POSIX_ACL_XATTR + LASSERTF((int)offsetof(posix_acl_xattr_header, a_entries) == 4, "found %lld\n", + (long long)(int)offsetof(posix_acl_xattr_header, a_entries)); + LASSERTF((int)sizeof(((posix_acl_xattr_header *)0)->a_entries) == 0, "found %lld\n", + (long long)(int)sizeof(((posix_acl_xattr_header *)0)->a_entries)); +#endif /* HAVE_STRUCT_POSIX_ACL_XATTR */ +#endif /* CONFIG_FS_POSIX_ACL */ + + /* Checks for struct link_ea_header */ + LASSERTF((int)sizeof(struct link_ea_header) == 24, "found %lld\n", + (long long)(int)sizeof(struct link_ea_header)); + LASSERTF((int)offsetof(struct link_ea_header, leh_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct link_ea_header, leh_magic)); + LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_header *)0)->leh_magic)); + LASSERTF((int)offsetof(struct link_ea_header, leh_reccount) == 4, "found %lld\n", + (long long)(int)offsetof(struct link_ea_header, leh_reccount)); + LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_reccount) == 4, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_header *)0)->leh_reccount)); + LASSERTF((int)offsetof(struct link_ea_header, leh_len) == 8, "found %lld\n", + (long long)(int)offsetof(struct link_ea_header, leh_len)); + LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_len) == 8, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_header *)0)->leh_len)); + LASSERTF((int)offsetof(struct link_ea_header, leh_overflow_time) == 16, "found %lld\n", + (long long)(int)offsetof(struct link_ea_header, leh_overflow_time)); + LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_overflow_time) == 4, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_header *)0)->leh_overflow_time)); + LASSERTF((int)offsetof(struct link_ea_header, leh_padding) == 20, "found %lld\n", + (long long)(int)offsetof(struct link_ea_header, leh_padding)); + LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_header *)0)->leh_padding)); + BUILD_BUG_ON(LINK_EA_MAGIC != 0x11EAF1DFUL); + + /* Checks for struct link_ea_entry */ + LASSERTF((int)sizeof(struct link_ea_entry) == 18, "found %lld\n", + (long long)(int)sizeof(struct link_ea_entry)); + LASSERTF((int)offsetof(struct link_ea_entry, lee_reclen) == 0, "found %lld\n", + (long long)(int)offsetof(struct link_ea_entry, lee_reclen)); + LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_reclen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_reclen)); + LASSERTF((int)offsetof(struct link_ea_entry, lee_parent_fid) == 2, "found %lld\n", + (long long)(int)offsetof(struct link_ea_entry, lee_parent_fid)); + LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_parent_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_parent_fid)); + LASSERTF((int)offsetof(struct link_ea_entry, lee_name) == 18, "found %lld\n", + (long long)(int)offsetof(struct link_ea_entry, lee_name)); + LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_name) 
== 0, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_name)); + + /* Checks for struct layout_intent */ + LASSERTF((int)sizeof(struct layout_intent) == 24, "found %lld\n", + (long long)(int)sizeof(struct layout_intent)); + LASSERTF((int)offsetof(struct layout_intent, li_opc) == 0, "found %lld\n", + (long long)(int)offsetof(struct layout_intent, li_opc)); + LASSERTF((int)sizeof(((struct layout_intent *)0)->li_opc) == 4, "found %lld\n", + (long long)(int)sizeof(((struct layout_intent *)0)->li_opc)); + LASSERTF((int)offsetof(struct layout_intent, li_flags) == 4, "found %lld\n", + (long long)(int)offsetof(struct layout_intent, li_flags)); + LASSERTF((int)sizeof(((struct layout_intent *)0)->li_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct layout_intent *)0)->li_flags)); + LASSERTF((int)offsetof(struct layout_intent, li_extent) == 8, "found %lld\n", + (long long)(int)offsetof(struct layout_intent, li_extent)); + LASSERTF((int)sizeof(((struct layout_intent *)0)->li_extent) == 16, "found %lld\n", + (long long)(int)sizeof(((struct layout_intent *)0)->li_extent)); + LASSERTF(LAYOUT_INTENT_ACCESS == 0, "found %lld\n", + (long long)LAYOUT_INTENT_ACCESS); + LASSERTF(LAYOUT_INTENT_READ == 1, "found %lld\n", + (long long)LAYOUT_INTENT_READ); + LASSERTF(LAYOUT_INTENT_WRITE == 2, "found %lld\n", + (long long)LAYOUT_INTENT_WRITE); + LASSERTF(LAYOUT_INTENT_GLIMPSE == 3, "found %lld\n", + (long long)LAYOUT_INTENT_GLIMPSE); + LASSERTF(LAYOUT_INTENT_TRUNC == 4, "found %lld\n", + (long long)LAYOUT_INTENT_TRUNC); + LASSERTF(LAYOUT_INTENT_RELEASE == 5, "found %lld\n", + (long long)LAYOUT_INTENT_RELEASE); + LASSERTF(LAYOUT_INTENT_RESTORE == 6, "found %lld\n", + (long long)LAYOUT_INTENT_RESTORE); + + /* Checks for struct hsm_action_item */ + LASSERTF((int)sizeof(struct hsm_action_item) == 72, "found %lld\n", + (long long)(int)sizeof(struct hsm_action_item)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_len) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_len)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_len) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_len)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_action) == 4, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_action)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_action) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_action)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_fid) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_fid)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_fid)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_dfid) == 24, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_dfid)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_dfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_dfid)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_extent) == 40, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_extent)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_extent) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_extent)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_cookie) == 56, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, 
hai_cookie)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_cookie) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_cookie)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_gid) == 64, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_gid)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_gid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_gid)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_data) == 72, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_data)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_data) == 0, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_data)); + + /* Checks for struct hsm_action_list */ + LASSERTF((int)sizeof(struct hsm_action_list) == 32, "found %lld\n", + (long long)(int)sizeof(struct hsm_action_list)); + LASSERTF((int)offsetof(struct hsm_action_list, hal_version) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_list, hal_version)); + LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_version) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_version)); + LASSERTF((int)offsetof(struct hsm_action_list, hal_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_list, hal_count)); + LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_count)); + LASSERTF((int)offsetof(struct hsm_action_list, hal_compound_id) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_list, hal_compound_id)); + LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_compound_id) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_compound_id)); + LASSERTF((int)offsetof(struct hsm_action_list, hal_flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_list, hal_flags)); + LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_flags) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_flags)); + LASSERTF((int)offsetof(struct hsm_action_list, hal_archive_id) == 24, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_list, hal_archive_id)); + LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_archive_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_archive_id)); + LASSERTF((int)offsetof(struct hsm_action_list, padding1) == 28, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_list, padding1)); + LASSERTF((int)sizeof(((struct hsm_action_list *)0)->padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_list *)0)->padding1)); + LASSERTF((int)offsetof(struct hsm_action_list, hal_fsname) == 32, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_list, hal_fsname)); + LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_fsname) == 0, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_fsname)); + + /* Checks for struct hsm_progress */ + LASSERTF((int)sizeof(struct hsm_progress) == 48, "found %lld\n", + (long long)(int)sizeof(struct hsm_progress)); + LASSERTF((int)offsetof(struct hsm_progress, hp_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress, hp_fid)); + LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress *)0)->hp_fid)); + 
+	LASSERTF((int)offsetof(struct hsm_progress, hp_cookie) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress, hp_cookie));
+	LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_cookie) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress *)0)->hp_cookie));
+	LASSERTF((int)offsetof(struct hsm_progress, hp_extent) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress, hp_extent));
+	LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_extent) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress *)0)->hp_extent));
+	LASSERTF((int)offsetof(struct hsm_progress, hp_flags) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress, hp_flags));
+	LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_flags) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress *)0)->hp_flags));
+	LASSERTF((int)offsetof(struct hsm_progress, hp_errval) == 42, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress, hp_errval));
+	LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_errval) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress *)0)->hp_errval));
+	LASSERTF((int)offsetof(struct hsm_progress, padding) == 44, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress, padding));
+	LASSERTF((int)sizeof(((struct hsm_progress *)0)->padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress *)0)->padding));
+	LASSERTF(HP_FLAG_COMPLETED == 0x01, "found 0x%.8x\n",
+		 HP_FLAG_COMPLETED);
+	LASSERTF(HP_FLAG_RETRY == 0x02, "found 0x%.8x\n",
+		 HP_FLAG_RETRY);
+
+	LASSERTF((int)offsetof(struct hsm_copy, hc_data_version) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_copy, hc_data_version));
+	LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_data_version) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_copy *)0)->hc_data_version));
+	LASSERTF((int)offsetof(struct hsm_copy, hc_flags) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_copy, hc_flags));
+	LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_flags) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_copy *)0)->hc_flags));
+	LASSERTF((int)offsetof(struct hsm_copy, hc_errval) == 10, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_copy, hc_errval));
+	LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_errval) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_copy *)0)->hc_errval));
+	LASSERTF((int)offsetof(struct hsm_copy, padding) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_copy, padding));
+	LASSERTF((int)sizeof(((struct hsm_copy *)0)->padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_copy *)0)->padding));
+	LASSERTF((int)offsetof(struct hsm_copy, hc_hai) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_copy, hc_hai));
+	LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_hai) == 72, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_copy *)0)->hc_hai));
+
+	/* Checks for struct hsm_progress_kernel */
+	LASSERTF((int)sizeof(struct hsm_progress_kernel) == 64, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_progress_kernel));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_fid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_fid));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_fid));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_cookie) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_cookie));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_cookie) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_cookie));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_extent) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_extent));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_extent) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_extent));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_flags) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_flags));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_flags) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_flags));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_errval) == 42, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_errval));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_errval) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_errval));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_padding1) == 44, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_padding1));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding1));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_data_version) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_data_version));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_data_version) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_data_version));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_padding2) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_padding2));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding2));
+
+	/* Checks for struct hsm_user_item */
+	LASSERTF((int)sizeof(struct hsm_user_item) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_user_item));
+	LASSERTF((int)offsetof(struct hsm_user_item, hui_fid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_item, hui_fid));
+	LASSERTF((int)sizeof(((struct hsm_user_item *)0)->hui_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_item *)0)->hui_fid));
+	LASSERTF((int)offsetof(struct hsm_user_item, hui_extent) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_item, hui_extent));
+	LASSERTF((int)sizeof(((struct hsm_user_item *)0)->hui_extent) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_item *)0)->hui_extent));
+
+	/* Checks for struct hsm_user_state */
+	LASSERTF((int)sizeof(struct hsm_user_state) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_user_state));
+	LASSERTF((int)offsetof(struct hsm_user_state, hus_states) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_state, hus_states));
+	LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_states) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_states));
+	LASSERTF((int)offsetof(struct hsm_user_state, hus_archive_id) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_state, hus_archive_id));
+	LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_archive_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_archive_id));
+	LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_state) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_state));
+	LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_state) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_state));
+	LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_action) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_action));
+	LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_action) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_action));
+	LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_location) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_location));
+	LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_location) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_location));
+
+	/* Checks for struct hsm_state_set */
+	LASSERTF((int)sizeof(struct hsm_state_set) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_state_set));
+	LASSERTF((int)offsetof(struct hsm_state_set, hss_valid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_state_set, hss_valid));
+	LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_valid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_valid));
+	LASSERTF((int)offsetof(struct hsm_state_set, hss_archive_id) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_state_set, hss_archive_id));
+	LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_archive_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_archive_id));
+	LASSERTF((int)offsetof(struct hsm_state_set, hss_setmask) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_state_set, hss_setmask));
+	LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_setmask) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_setmask));
+	LASSERTF((int)offsetof(struct hsm_state_set, hss_clearmask) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_state_set, hss_clearmask));
+	LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_clearmask) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_clearmask));
+	BUILD_BUG_ON(HSS_SETMASK != 1);
+	BUILD_BUG_ON(HSS_CLEARMASK != 2);
+	BUILD_BUG_ON(HSS_ARCHIVE_ID != 4);
+
+	/* Checks for struct hsm_current_action */
+	LASSERTF((int)sizeof(struct hsm_current_action) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_current_action));
+	LASSERTF((int)offsetof(struct hsm_current_action, hca_state) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_current_action, hca_state));
+	LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_state) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_state));
+	LASSERTF((int)offsetof(struct hsm_current_action, hca_action) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_current_action, hca_action));
+	LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_action) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_action));
+	LASSERTF((int)offsetof(struct hsm_current_action, hca_location) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_current_action, hca_location));
+	LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_location) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_location));
+	BUILD_BUG_ON(HPS_NONE != 0);
+	BUILD_BUG_ON(HPS_WAITING != 1);
+	BUILD_BUG_ON(HPS_RUNNING != 2);
+	BUILD_BUG_ON(HPS_DONE != 3);
+	BUILD_BUG_ON(HUA_NONE != 1);
+	BUILD_BUG_ON(HUA_ARCHIVE != 10);
+	BUILD_BUG_ON(HUA_RESTORE != 11);
+	BUILD_BUG_ON(HUA_RELEASE != 12);
+	BUILD_BUG_ON(HUA_REMOVE != 13);
+	BUILD_BUG_ON(HUA_CANCEL != 14);
+
+	/* Checks for struct hsm_request */
+	LASSERTF((int)sizeof(struct hsm_request) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_request));
+	LASSERTF((int)offsetof(struct hsm_request, hr_action) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_request, hr_action));
+	LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_action) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_request *)0)->hr_action));
+	LASSERTF((int)offsetof(struct hsm_request, hr_archive_id) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_request, hr_archive_id));
+	LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_archive_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_request *)0)->hr_archive_id));
+	LASSERTF((int)offsetof(struct hsm_request, hr_flags) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_request, hr_flags));
+	LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_flags) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_request *)0)->hr_flags));
+	LASSERTF((int)offsetof(struct hsm_request, hr_itemcount) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_request, hr_itemcount));
+	LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_itemcount) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_request *)0)->hr_itemcount));
+	LASSERTF((int)offsetof(struct hsm_request, hr_data_len) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_request, hr_data_len));
+	LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_data_len) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_request *)0)->hr_data_len));
+	LASSERTF(HSM_FORCE_ACTION == 0x00000001UL, "found 0x%.8xUL\n",
+		 (unsigned)HSM_FORCE_ACTION);
+	LASSERTF(HSM_GHOST_COPY == 0x00000002UL, "found 0x%.8xUL\n",
+		 (unsigned)HSM_GHOST_COPY);
+
+	/* Checks for struct hsm_user_request */
+	LASSERTF((int)sizeof(struct hsm_user_request) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_user_request));
+	LASSERTF((int)offsetof(struct hsm_user_request, hur_request) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_request, hur_request));
+	LASSERTF((int)sizeof(((struct hsm_user_request *)0)->hur_request) == 24, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_request *)0)->hur_request));
+	LASSERTF((int)offsetof(struct hsm_user_request, hur_user_item) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_request, hur_user_item));
+	LASSERTF((int)sizeof(((struct hsm_user_request *)0)->hur_user_item) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_request *)0)->hur_user_item));
+
+	/* Checks for struct hsm_user_import */
+	LASSERTF((int)sizeof(struct hsm_user_import) == 48, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_user_import));
+	LASSERTF((int)offsetof(struct hsm_user_import, hui_size) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_import, hui_size));
+	LASSERTF((int)sizeof(((struct hsm_user_import *)0)->hui_size) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_import *)0)->hui_size));
+	LASSERTF((int)offsetof(struct hsm_user_import, hui_uid) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_import, hui_uid));
+	LASSERTF((int)sizeof(((struct hsm_user_import *)0)->hui_uid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_import *)0)->hui_uid));
+	LASSERTF((int)offsetof(struct hsm_user_import, hui_gid) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_import, hui_gid));
+	LASSERTF((int)sizeof(((struct hsm_user_import *)0)->hui_gid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_import *)0)->hui_gid));
+	LASSERTF((int)offsetof(struct hsm_user_import, hui_mode) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_import, hui_mode));
+	LASSERTF((int)sizeof(((struct hsm_user_import *)0)->hui_mode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_import *)0)->hui_mode));
+	LASSERTF((int)offsetof(struct hsm_user_import, hui_atime) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_import, hui_atime));
+	LASSERTF((int)sizeof(((struct hsm_user_import *)0)->hui_atime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_import *)0)->hui_atime));
+	LASSERTF((int)offsetof(struct hsm_user_import, hui_atime_ns) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_import, hui_atime_ns));
+	LASSERTF((int)sizeof(((struct hsm_user_import *)0)->hui_atime_ns) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_import *)0)->hui_atime_ns));
+	LASSERTF((int)offsetof(struct hsm_user_import, hui_mtime) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_import, hui_mtime));
+	LASSERTF((int)sizeof(((struct hsm_user_import *)0)->hui_mtime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_import *)0)->hui_mtime));
+	LASSERTF((int)offsetof(struct hsm_user_import, hui_mtime_ns) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_import, hui_mtime_ns));
+	LASSERTF((int)sizeof(((struct hsm_user_import *)0)->hui_mtime_ns) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_import *)0)->hui_mtime_ns));
+	LASSERTF((int)offsetof(struct hsm_user_import, hui_archive_id) == 44, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_import, hui_archive_id));
+	LASSERTF((int)sizeof(((struct hsm_user_import *)0)->hui_archive_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_import *)0)->hui_archive_id));
+
+	/* Checks for struct netobj_s */
+	LASSERTF((int)sizeof(struct netobj_s) == 4, "found %lld\n",
+		 (long long)(int)sizeof(struct netobj_s));
+	LASSERTF((int)offsetof(struct netobj_s, len) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct netobj_s, len));
+	LASSERTF((int)sizeof(((struct netobj_s *)0)->len) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct netobj_s *)0)->len));
+	LASSERTF((int)offsetof(struct netobj_s, data) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct netobj_s, data));
+	LASSERTF((int)sizeof(((struct netobj_s *)0)->data) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct netobj_s *)0)->data));
+
+	/* Checks for struct rawobj_s */
+	LASSERTF((int)sizeof(struct rawobj_s) == 16, "found %lld\n",
+		 (long long)(int)sizeof(struct rawobj_s));
+	LASSERTF((int)offsetof(struct rawobj_s, len) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct rawobj_s, len));
+	LASSERTF((int)sizeof(((struct rawobj_s *)0)->len) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct rawobj_s *)0)->len));
+	LASSERTF((int)offsetof(struct rawobj_s, data) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct rawobj_s, data));
+	LASSERTF((int)sizeof(((struct rawobj_s *)0)->data) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct rawobj_s *)0)->data));
+
+	/* Checks for struct gss_header */
+	LASSERTF((int)sizeof(struct gss_header) == 36, "found %lld\n",
+		 (long long)(int)sizeof(struct gss_header));
+	LASSERTF((int)offsetof(struct gss_header, gh_version) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_header, gh_version));
+	LASSERTF((int)sizeof(((struct gss_header *)0)->gh_version) == 1, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_header *)0)->gh_version));
+	LASSERTF((int)offsetof(struct gss_header, gh_sp) == 1, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_header, gh_sp));
+	LASSERTF((int)sizeof(((struct gss_header *)0)->gh_sp) == 1, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_header *)0)->gh_sp));
+	LASSERTF((int)offsetof(struct gss_header, gh_pad0) == 2, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_header, gh_pad0));
+	LASSERTF((int)sizeof(((struct gss_header *)0)->gh_pad0) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_header *)0)->gh_pad0));
+	LASSERTF((int)offsetof(struct gss_header, gh_flags) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_header, gh_flags));
+	LASSERTF((int)sizeof(((struct gss_header *)0)->gh_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_header *)0)->gh_flags));
+	LASSERTF((int)offsetof(struct gss_header, gh_proc) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_header, gh_proc));
+	LASSERTF((int)sizeof(((struct gss_header *)0)->gh_proc) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_header *)0)->gh_proc));
+	LASSERTF((int)offsetof(struct gss_header, gh_seq) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_header, gh_seq));
+	LASSERTF((int)sizeof(((struct gss_header *)0)->gh_seq) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_header *)0)->gh_seq));
+	LASSERTF((int)offsetof(struct gss_header, gh_svc) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_header, gh_svc));
+	LASSERTF((int)sizeof(((struct gss_header *)0)->gh_svc) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_header *)0)->gh_svc));
+	LASSERTF((int)offsetof(struct gss_header, gh_pad1) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_header, gh_pad1));
+	LASSERTF((int)sizeof(((struct gss_header *)0)->gh_pad1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_header *)0)->gh_pad1));
+	LASSERTF((int)offsetof(struct gss_header, gh_pad2) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_header, gh_pad2));
+	LASSERTF((int)sizeof(((struct gss_header *)0)->gh_pad2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_header *)0)->gh_pad2));
+	LASSERTF((int)offsetof(struct gss_header, gh_pad3) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_header, gh_pad3));
+	LASSERTF((int)sizeof(((struct gss_header *)0)->gh_pad3) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_header *)0)->gh_pad3));
+	LASSERTF((int)offsetof(struct gss_header, gh_handle) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_header, gh_handle));
+	LASSERTF((int)sizeof(((struct gss_header *)0)->gh_handle) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_header *)0)->gh_handle));
+
+	/* Checks for struct gss_rep_header */
+	LASSERTF((int)sizeof(struct gss_rep_header) == 36, "found %lld\n",
+		 (long long)(int)sizeof(struct gss_rep_header));
+	LASSERTF((int)offsetof(struct gss_rep_header, gh_version) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_rep_header, gh_version));
+	LASSERTF((int)sizeof(((struct gss_rep_header *)0)->gh_version) == 1, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_rep_header *)0)->gh_version));
+	LASSERTF((int)offsetof(struct gss_rep_header, gh_sp) == 1, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_rep_header, gh_sp));
+	LASSERTF((int)sizeof(((struct gss_rep_header *)0)->gh_sp) == 1, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_rep_header *)0)->gh_sp));
+	LASSERTF((int)offsetof(struct gss_rep_header, gh_pad0) == 2, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_rep_header, gh_pad0));
+	LASSERTF((int)sizeof(((struct gss_rep_header *)0)->gh_pad0) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_rep_header *)0)->gh_pad0));
+	LASSERTF((int)offsetof(struct gss_rep_header, gh_flags) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_rep_header, gh_flags));
+	LASSERTF((int)sizeof(((struct gss_rep_header *)0)->gh_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_rep_header *)0)->gh_flags));
+	LASSERTF((int)offsetof(struct gss_rep_header, gh_proc) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_rep_header, gh_proc));
+	LASSERTF((int)sizeof(((struct gss_rep_header *)0)->gh_proc) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_rep_header *)0)->gh_proc));
+	LASSERTF((int)offsetof(struct gss_rep_header, gh_major) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_rep_header, gh_major));
+	LASSERTF((int)sizeof(((struct gss_rep_header *)0)->gh_major) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_rep_header *)0)->gh_major));
+	LASSERTF((int)offsetof(struct gss_rep_header, gh_minor) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_rep_header, gh_minor));
+	LASSERTF((int)sizeof(((struct gss_rep_header *)0)->gh_minor) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_rep_header *)0)->gh_minor));
+	LASSERTF((int)offsetof(struct gss_rep_header, gh_seqwin) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_rep_header, gh_seqwin));
+	LASSERTF((int)sizeof(((struct gss_rep_header *)0)->gh_seqwin) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_rep_header *)0)->gh_seqwin));
+	LASSERTF((int)offsetof(struct gss_rep_header, gh_pad2) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_rep_header, gh_pad2));
+	LASSERTF((int)sizeof(((struct gss_rep_header *)0)->gh_pad2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_rep_header *)0)->gh_pad2));
+	LASSERTF((int)offsetof(struct gss_rep_header, gh_pad3) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_rep_header, gh_pad3));
+	LASSERTF((int)sizeof(((struct gss_rep_header *)0)->gh_pad3) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_rep_header *)0)->gh_pad3));
+	LASSERTF((int)offsetof(struct gss_rep_header, gh_handle) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_rep_header, gh_handle));
+	LASSERTF((int)sizeof(((struct gss_rep_header *)0)->gh_handle) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_rep_header *)0)->gh_handle));
+
+	/* Checks for struct gss_err_header */
+	LASSERTF((int)sizeof(struct gss_err_header) == 36, "found %lld\n",
+		 (long long)(int)sizeof(struct gss_err_header));
+	LASSERTF((int)offsetof(struct gss_err_header, gh_version) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_err_header, gh_version));
+	LASSERTF((int)sizeof(((struct gss_err_header *)0)->gh_version) == 1, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_err_header *)0)->gh_version));
+	LASSERTF((int)offsetof(struct gss_err_header, gh_sp) == 1, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_err_header, gh_sp));
+	LASSERTF((int)sizeof(((struct gss_err_header *)0)->gh_sp) == 1, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_err_header *)0)->gh_sp));
+	LASSERTF((int)offsetof(struct gss_err_header, gh_pad0) == 2, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_err_header, gh_pad0));
+	LASSERTF((int)sizeof(((struct gss_err_header *)0)->gh_pad0) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_err_header *)0)->gh_pad0));
+	LASSERTF((int)offsetof(struct gss_err_header, gh_flags) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_err_header, gh_flags));
+	LASSERTF((int)sizeof(((struct gss_err_header *)0)->gh_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_err_header *)0)->gh_flags));
+	LASSERTF((int)offsetof(struct gss_err_header, gh_proc) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_err_header, gh_proc));
+	LASSERTF((int)sizeof(((struct gss_err_header *)0)->gh_proc) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_err_header *)0)->gh_proc));
+	LASSERTF((int)offsetof(struct gss_err_header, gh_major) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_err_header, gh_major));
+	LASSERTF((int)sizeof(((struct gss_err_header *)0)->gh_major) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_err_header *)0)->gh_major));
+	LASSERTF((int)offsetof(struct gss_err_header, gh_minor) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_err_header, gh_minor));
+	LASSERTF((int)sizeof(((struct gss_err_header *)0)->gh_minor) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_err_header *)0)->gh_minor));
+	LASSERTF((int)offsetof(struct gss_err_header, gh_pad1) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_err_header, gh_pad1));
+	LASSERTF((int)sizeof(((struct gss_err_header *)0)->gh_pad1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_err_header *)0)->gh_pad1));
+	LASSERTF((int)offsetof(struct gss_err_header, gh_pad2) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_err_header, gh_pad2));
+	LASSERTF((int)sizeof(((struct gss_err_header *)0)->gh_pad2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_err_header *)0)->gh_pad2));
+	LASSERTF((int)offsetof(struct gss_err_header, gh_pad3) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_err_header, gh_pad3));
+	LASSERTF((int)sizeof(((struct gss_err_header *)0)->gh_pad3) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_err_header *)0)->gh_pad3));
+	LASSERTF((int)offsetof(struct gss_err_header, gh_handle) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_err_header, gh_handle));
+	LASSERTF((int)sizeof(((struct gss_err_header *)0)->gh_handle) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_err_header *)0)->gh_handle));
+
+	/* Checks for struct gss_wire_ctx */
+	LASSERTF((int)sizeof(struct gss_wire_ctx) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct gss_wire_ctx));
+	LASSERTF((int)offsetof(struct gss_wire_ctx, gw_flags) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_wire_ctx, gw_flags));
+	LASSERTF((int)sizeof(((struct gss_wire_ctx *)0)->gw_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_wire_ctx *)0)->gw_flags));
+	LASSERTF((int)offsetof(struct gss_wire_ctx, gw_proc) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_wire_ctx, gw_proc));
+	LASSERTF((int)sizeof(((struct gss_wire_ctx *)0)->gw_proc) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_wire_ctx *)0)->gw_proc));
+	LASSERTF((int)offsetof(struct gss_wire_ctx, gw_seq) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_wire_ctx, gw_seq));
+	LASSERTF((int)sizeof(((struct gss_wire_ctx *)0)->gw_seq) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_wire_ctx *)0)->gw_seq));
+	LASSERTF((int)offsetof(struct gss_wire_ctx, gw_svc) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_wire_ctx, gw_svc));
+	LASSERTF((int)sizeof(((struct gss_wire_ctx *)0)->gw_svc) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_wire_ctx *)0)->gw_svc));
+	LASSERTF((int)offsetof(struct gss_wire_ctx, gw_handle) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct gss_wire_ctx, gw_handle));
+	LASSERTF((int)sizeof(((struct gss_wire_ctx *)0)->gw_handle) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct gss_wire_ctx *)0)->gw_handle));
+
+#ifdef HAVE_SERVER_SUPPORT
+
+	/* Checks for struct object_update_param */
+	LASSERTF((int)sizeof(struct object_update_param) == 8, "found %lld\n",
+		 (long long)(int)sizeof(struct object_update_param));
+	LASSERTF((int)offsetof(struct object_update_param, oup_len) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct object_update_param, oup_len));
+	LASSERTF((int)sizeof(((struct object_update_param *)0)->oup_len) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct object_update_param *)0)->oup_len));
+	LASSERTF((int)offsetof(struct object_update_param, oup_padding) == 2, "found %lld\n",
+		 (long long)(int)offsetof(struct object_update_param, oup_padding));
+	LASSERTF((int)sizeof(((struct object_update_param *)0)->oup_padding) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct object_update_param *)0)->oup_padding));
+	LASSERTF((int)offsetof(struct object_update_param, oup_padding2) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct object_update_param, oup_padding2));
+	LASSERTF((int)sizeof(((struct object_update_param *)0)->oup_padding2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct object_update_param *)0)->oup_padding2));
+	LASSERTF((int)offsetof(struct object_update_param, oup_buf) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct object_update_param, oup_buf));
+	LASSERTF((int)sizeof(((struct object_update_param *)0)->oup_buf) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct object_update_param *)0)->oup_buf));
+
+	/* Checks for struct object_update */
+	LASSERTF((int)sizeof(struct object_update) == 40, "found %lld\n",
+		 (long long)(int)sizeof(struct object_update));
+	LASSERTF((int)offsetof(struct object_update, ou_type) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct object_update, ou_type));
+	LASSERTF((int)sizeof(((struct object_update *)0)->ou_type) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct object_update *)0)->ou_type));
+	LASSERTF((int)offsetof(struct object_update, ou_params_count) == 2, "found %lld\n",
+		 (long long)(int)offsetof(struct object_update, ou_params_count));
+	LASSERTF((int)sizeof(((struct object_update *)0)->ou_params_count) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct object_update *)0)->ou_params_count));
+	LASSERTF((int)offsetof(struct object_update, ou_result_size) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct object_update, ou_result_size));
+	LASSERTF((int)sizeof(((struct object_update *)0)->ou_result_size) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct object_update *)0)->ou_result_size));
+	LASSERTF((int)offsetof(struct object_update, ou_flags) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct object_update, ou_flags));
+	LASSERTF((int)sizeof(((struct object_update *)0)->ou_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct object_update *)0)->ou_flags));
+	LASSERTF((int)offsetof(struct object_update, ou_padding1) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct object_update, ou_padding1));
+	LASSERTF((int)sizeof(((struct object_update *)0)->ou_padding1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct object_update *)0)->ou_padding1));
+	LASSERTF((int)offsetof(struct object_update, ou_batchid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct object_update, ou_batchid));
+	LASSERTF((int)sizeof(((struct object_update *)0)->ou_batchid) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct object_update *)0)->ou_batchid));
+	LASSERTF((int)offsetof(struct object_update, ou_fid) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct object_update, ou_fid));
+	LASSERTF((int)sizeof(((struct object_update *)0)->ou_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct object_update *)0)->ou_fid));
+	LASSERTF((int)offsetof(struct object_update, ou_params) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct object_update, ou_params));
+	LASSERTF((int)sizeof(((struct object_update *)0)->ou_params) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct object_update *)0)->ou_params));
+	BUILD_BUG_ON(UPDATE_FL_OST != 0x00000001);
+	BUILD_BUG_ON(UPDATE_FL_SYNC != 0x00000002);
+	BUILD_BUG_ON(UPDATE_FL_COMMITTED != 0x00000004);
+	BUILD_BUG_ON(UPDATE_FL_NOLOG != 0x00000008);
+
+	/* Checks for struct object_update_request */
+	LASSERTF((int)sizeof(struct object_update_request) == 8, "found %lld\n",
+		 (long long)(int)sizeof(struct object_update_request));
+	LASSERTF((int)offsetof(struct object_update_request, ourq_magic) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct object_update_request, ourq_magic));
+	LASSERTF((int)sizeof(((struct object_update_request *)0)->ourq_magic) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct object_update_request *)0)->ourq_magic));
+	LASSERTF((int)offsetof(struct object_update_request, ourq_count) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct object_update_request, ourq_count));
+	LASSERTF((int)sizeof(((struct object_update_request *)0)->ourq_count) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct object_update_request *)0)->ourq_count));
+	LASSERTF((int)offsetof(struct object_update_request, ourq_padding) == 6, "found %lld\n",
+		 (long long)(int)offsetof(struct object_update_request, ourq_padding));
+	LASSERTF((int)sizeof(((struct object_update_request *)0)->ourq_padding) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct object_update_request *)0)->ourq_padding));
+	LASSERTF((int)offsetof(struct object_update_request, ourq_updates) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct object_update_request, ourq_updates));
+	LASSERTF((int)sizeof(((struct object_update_request *)0)->ourq_updates) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct object_update_request *)0)->ourq_updates));
+	BUILD_BUG_ON(UPDATE_REQUEST_MAGIC != 0xBDDE0002);
+
+	/* Checks for struct object_update_result */
+	LASSERTF((int)sizeof(struct object_update_result) == 8, "found %lld\n",
+		 (long long)(int)sizeof(struct object_update_result));
+	LASSERTF((int)offsetof(struct object_update_result, our_rc) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct object_update_result, our_rc));
+	LASSERTF((int)sizeof(((struct object_update_result *)0)->our_rc) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct object_update_result *)0)->our_rc));
+	LASSERTF((int)offsetof(struct object_update_result, our_datalen) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct object_update_result, our_datalen));
+	LASSERTF((int)sizeof(((struct object_update_result *)0)->our_datalen) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct object_update_result *)0)->our_datalen));
+	LASSERTF((int)offsetof(struct object_update_result, our_padding) == 6, "found %lld\n",
+		 (long long)(int)offsetof(struct object_update_result, our_padding));
+	LASSERTF((int)sizeof(((struct object_update_result *)0)->our_padding) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct object_update_result *)0)->our_padding));
+	LASSERTF((int)offsetof(struct object_update_result, our_data) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct object_update_result, our_data));
+	LASSERTF((int)sizeof(((struct object_update_result *)0)->our_data) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct object_update_result *)0)->our_data));
+
+	/* Checks for struct object_update_reply */
+	LASSERTF((int)sizeof(struct object_update_reply) == 8, "found %lld\n",
+		 (long long)(int)sizeof(struct object_update_reply));
+	LASSERTF((int)offsetof(struct object_update_reply, ourp_magic) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct object_update_reply, ourp_magic));
+	LASSERTF((int)sizeof(((struct object_update_reply *)0)->ourp_magic) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct object_update_reply *)0)->ourp_magic));
+	LASSERTF((int)offsetof(struct object_update_reply, ourp_count) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct object_update_reply, ourp_count));
+	LASSERTF((int)sizeof(((struct object_update_reply *)0)->ourp_count) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct object_update_reply *)0)->ourp_count));
+	LASSERTF((int)offsetof(struct object_update_reply, ourp_padding) == 6, "found %lld\n",
+		 (long long)(int)offsetof(struct object_update_reply, ourp_padding));
+	LASSERTF((int)sizeof(((struct object_update_reply *)0)->ourp_padding) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct object_update_reply *)0)->ourp_padding));
+	LASSERTF((int)offsetof(struct object_update_reply, ourp_lens) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct object_update_reply, ourp_lens));
+	LASSERTF((int)sizeof(((struct object_update_reply *)0)->ourp_lens) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct object_update_reply *)0)->ourp_lens));
+	BUILD_BUG_ON(UPDATE_REPLY_MAGIC != 0x00BD0002);
+
+	/* Checks for struct out_update_header */
+	LASSERTF((int)sizeof(struct out_update_header) == 16, "found %lld\n",
+		 (long long)(int)sizeof(struct out_update_header));
+	LASSERTF((int)offsetof(struct out_update_header, ouh_magic) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct out_update_header, ouh_magic));
+	LASSERTF((int)sizeof(((struct out_update_header *)0)->ouh_magic) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct out_update_header *)0)->ouh_magic));
+	LASSERTF((int)offsetof(struct out_update_header, ouh_count) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct out_update_header, ouh_count));
+	LASSERTF((int)sizeof(((struct out_update_header *)0)->ouh_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct out_update_header *)0)->ouh_count));
+	LASSERTF((int)offsetof(struct out_update_header, ouh_inline_length) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct out_update_header, ouh_inline_length));
+	LASSERTF((int)sizeof(((struct out_update_header *)0)->ouh_inline_length) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct out_update_header *)0)->ouh_inline_length));
+	LASSERTF((int)offsetof(struct out_update_header, ouh_reply_size) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct out_update_header, ouh_reply_size));
+	LASSERTF((int)sizeof(((struct out_update_header *)0)->ouh_reply_size) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct out_update_header *)0)->ouh_reply_size));
+	LASSERTF((int)offsetof(struct out_update_header, ouh_inline_data) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct out_update_header, ouh_inline_data));
+	LASSERTF((int)sizeof(((struct out_update_header *)0)->ouh_inline_data) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct out_update_header *)0)->ouh_inline_data));
+	BUILD_BUG_ON(OUT_UPDATE_HEADER_MAGIC != 0xBDDF0001);
+	BUILD_BUG_ON(OUT_UPDATE_MAX_INLINE_SIZE != 4096);
+
+	/* Checks for struct out_update_buffer */
+	LASSERTF((int)sizeof(struct out_update_buffer) == 8, "found %lld\n",
+		 (long long)(int)sizeof(struct out_update_buffer));
+	LASSERTF((int)offsetof(struct out_update_buffer, oub_size) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct out_update_buffer, oub_size));
+	LASSERTF((int)sizeof(((struct out_update_buffer *)0)->oub_size) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct out_update_buffer *)0)->oub_size));
+	LASSERTF((int)offsetof(struct out_update_buffer, oub_padding) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct out_update_buffer, oub_padding));
+	LASSERTF((int)sizeof(((struct out_update_buffer *)0)->oub_padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct out_update_buffer *)0)->oub_padding));
+
+	/* Checks for struct nodemap_cluster_rec */
+	LASSERTF((int)sizeof(struct nodemap_cluster_rec) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct nodemap_cluster_rec));
+	BUILD_BUG_ON(LUSTRE_NODEMAP_NAME_LENGTH != 16);
+	LASSERTF((int)offsetof(struct nodemap_cluster_rec, ncr_name[16 + 1]) == 17, "found %lld\n",
+		 (long long)(int)offsetof(struct nodemap_cluster_rec, ncr_name[16 + 1]));
+	LASSERTF((int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_name[16 + 1]) == 1, "found %lld\n",
+		 (long long)(int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_name[16 + 1]));
+	LASSERTF((int)offsetof(struct nodemap_cluster_rec, ncr_flags) == 17, "found %lld\n",
+		 (long long)(int)offsetof(struct nodemap_cluster_rec, ncr_flags));
+	LASSERTF((int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_flags) == 1, "found %lld\n",
+		 (long long)(int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_flags));
+	LASSERTF((int)offsetof(struct nodemap_cluster_rec, ncr_padding1) == 18, "found %lld\n",
+		 (long long)(int)offsetof(struct nodemap_cluster_rec, ncr_padding1));
+	LASSERTF((int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_padding1) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_padding1));
+	LASSERTF((int)offsetof(struct nodemap_cluster_rec, ncr_squash_projid) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct nodemap_cluster_rec, ncr_squash_projid));
+	LASSERTF((int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_squash_projid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_squash_projid));
+	LASSERTF((int)offsetof(struct nodemap_cluster_rec, ncr_squash_uid) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct nodemap_cluster_rec, ncr_squash_uid));
+	LASSERTF((int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_squash_uid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_squash_uid));
+	LASSERTF((int)offsetof(struct nodemap_cluster_rec, ncr_squash_gid) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct nodemap_cluster_rec, ncr_squash_gid));
+	LASSERTF((int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_squash_gid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_squash_gid));
+
+	/* Checks for struct nodemap_range_rec */
+	LASSERTF((int)sizeof(struct nodemap_range_rec) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct nodemap_range_rec));
+	LASSERTF((int)offsetof(struct nodemap_range_rec, nrr_start_nid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct nodemap_range_rec, nrr_start_nid));
+	LASSERTF((int)sizeof(((struct nodemap_range_rec *)0)->nrr_start_nid) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct nodemap_range_rec *)0)->nrr_start_nid));
+	LASSERTF((int)offsetof(struct nodemap_range_rec, nrr_end_nid) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct nodemap_range_rec, nrr_end_nid));
+	LASSERTF((int)sizeof(((struct nodemap_range_rec *)0)->nrr_end_nid) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct nodemap_range_rec *)0)->nrr_end_nid));
+	LASSERTF((int)offsetof(struct nodemap_range_rec, nrr_padding1) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct nodemap_range_rec, nrr_padding1));
+	LASSERTF((int)sizeof(((struct nodemap_range_rec *)0)->nrr_padding1) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct nodemap_range_rec *)0)->nrr_padding1));
+	LASSERTF((int)offsetof(struct nodemap_range_rec, nrr_padding2) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct nodemap_range_rec, nrr_padding2));
+	LASSERTF((int)sizeof(((struct nodemap_range_rec *)0)->nrr_padding2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct nodemap_range_rec *)0)->nrr_padding2));
+
+	/* Checks for struct nodemap_id_rec */
+	LASSERTF((int)sizeof(struct nodemap_id_rec) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct nodemap_id_rec));
+	LASSERTF((int)offsetof(struct nodemap_id_rec, nir_id_fs) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct nodemap_id_rec, nir_id_fs));
+	LASSERTF((int)sizeof(((struct nodemap_id_rec *)0)->nir_id_fs) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct nodemap_id_rec *)0)->nir_id_fs));
+	LASSERTF((int)offsetof(struct nodemap_id_rec, nir_padding1) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct nodemap_id_rec, nir_padding1));
+	LASSERTF((int)sizeof(((struct nodemap_id_rec *)0)->nir_padding1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct nodemap_id_rec *)0)->nir_padding1));
+	LASSERTF((int)offsetof(struct nodemap_id_rec, nir_padding2) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct nodemap_id_rec, nir_padding2));
+	LASSERTF((int)sizeof(((struct nodemap_id_rec *)0)->nir_padding2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct nodemap_id_rec *)0)->nir_padding2));
+	LASSERTF((int)offsetof(struct nodemap_id_rec, nir_padding3) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct nodemap_id_rec, nir_padding3));
+	LASSERTF((int)sizeof(((struct nodemap_id_rec *)0)->nir_padding3) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct nodemap_id_rec *)0)->nir_padding3));
+	LASSERTF((int)offsetof(struct nodemap_id_rec, nir_padding4) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct nodemap_id_rec, nir_padding4));
+	LASSERTF((int)sizeof(((struct nodemap_id_rec *)0)->nir_padding4) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct nodemap_id_rec *)0)->nir_padding4));
+
+	/* Checks for struct nodemap_global_rec */
+	LASSERTF((int)sizeof(struct nodemap_global_rec) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct nodemap_global_rec));
+	LASSERTF((int)offsetof(struct nodemap_global_rec, ngr_is_active) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct nodemap_global_rec, ngr_is_active));
+	LASSERTF((int)sizeof(((struct nodemap_global_rec *)0)->ngr_is_active) == 1, "found %lld\n",
+		 (long long)(int)sizeof(((struct nodemap_global_rec *)0)->ngr_is_active));
+	LASSERTF((int)offsetof(struct nodemap_global_rec, ngr_padding1) == 1, "found %lld\n",
+		 (long long)(int)offsetof(struct nodemap_global_rec, ngr_padding1));
+	LASSERTF((int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding1) == 1, "found %lld\n",
+		 (long long)(int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding1));
+	LASSERTF((int)offsetof(struct nodemap_global_rec, ngr_padding2) == 2, "found %lld\n",
+		 (long long)(int)offsetof(struct nodemap_global_rec, ngr_padding2));
+	LASSERTF((int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding2) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding2));
+	LASSERTF((int)offsetof(struct nodemap_global_rec, ngr_padding3) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct nodemap_global_rec, ngr_padding3));
+	LASSERTF((int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding3) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding3));
+	LASSERTF((int)offsetof(struct nodemap_global_rec, ngr_padding4) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct nodemap_global_rec, ngr_padding4));
+	LASSERTF((int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding4) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding4));
+	LASSERTF((int)offsetof(struct nodemap_global_rec, ngr_padding5) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct nodemap_global_rec, ngr_padding5));
+	LASSERTF((int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding5) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding5));
+	LASSERTF((int)offsetof(struct nodemap_global_rec, ngr_padding6) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct nodemap_global_rec, ngr_padding6));
+	LASSERTF((int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding6) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding6));
+
+	/* Checks for union nodemap_rec */
+	LASSERTF((int)sizeof(union nodemap_rec) == 32, "found %lld\n",
+		 (long long)(int)sizeof(union nodemap_rec));
+
+	LASSERTF(OFD_ACCESS_READ == 0x00000001UL, "found 0x%.8xUL\n",
+		 (unsigned)OFD_ACCESS_READ);
+	LASSERTF(OFD_ACCESS_WRITE == 0x00000002UL, "found 0x%.8xUL\n",
+		 (unsigned)OFD_ACCESS_WRITE);
+	/* Checks for struct ofd_access_entry_v1 */
+	LASSERTF((int)sizeof(struct ofd_access_entry_v1) == 64, "found %lld\n",
+		 (long long)(int)sizeof(struct ofd_access_entry_v1));
+	LASSERTF((int)offsetof(struct ofd_access_entry_v1, oae_parent_fid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ofd_access_entry_v1, oae_parent_fid));
+	LASSERTF((int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_parent_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_parent_fid));
+	LASSERTF((int)offsetof(struct ofd_access_entry_v1, oae_begin) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct ofd_access_entry_v1, oae_begin));
+	LASSERTF((int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_begin) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_begin));
+	LASSERTF((int)offsetof(struct ofd_access_entry_v1, oae_end) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct ofd_access_entry_v1, oae_end));
+	LASSERTF((int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_end) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_end));
+	LASSERTF((int)offsetof(struct ofd_access_entry_v1, oae_time) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct ofd_access_entry_v1, oae_time));
+	LASSERTF((int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_time) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_time));
+	LASSERTF((int)offsetof(struct ofd_access_entry_v1, oae_size) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct ofd_access_entry_v1, oae_size));
+	LASSERTF((int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_size) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_size));
+	LASSERTF((int)offsetof(struct ofd_access_entry_v1, oae_segment_count) == 44, "found %lld\n",
+		 (long long)(int)offsetof(struct ofd_access_entry_v1, oae_segment_count));
+	LASSERTF((int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_segment_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_segment_count));
+	LASSERTF((int)offsetof(struct ofd_access_entry_v1, oae_flags) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct ofd_access_entry_v1, oae_flags));
+	LASSERTF((int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_flags));
+	LASSERTF((int)offsetof(struct ofd_access_entry_v1, oae_reserved1) == 52, "found %lld\n",
+		 (long long)(int)offsetof(struct ofd_access_entry_v1, oae_reserved1));
+	LASSERTF((int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_reserved1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_reserved1));
+	LASSERTF((int)offsetof(struct ofd_access_entry_v1, oae_reserved2) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct ofd_access_entry_v1, oae_reserved2));
+	LASSERTF((int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_reserved2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_reserved2));
+	LASSERTF((int)offsetof(struct ofd_access_entry_v1, oae_reserved3) == 60, "found %lld\n",
+		 (long long)(int)offsetof(struct ofd_access_entry_v1, oae_reserved3));
+	LASSERTF((int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_reserved3) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ofd_access_entry_v1 *)0)->oae_reserved3));
+
+	LASSERTF(LUSTRE_ACCESS_LOG_VERSION_1 == 0x00010000UL, "found 0x%.8xUL\n",
+		 (unsigned)LUSTRE_ACCESS_LOG_VERSION_1);
+	LASSERTF(LUSTRE_ACCESS_LOG_TYPE_OFD == 0x00000001UL, "found 0x%.8xUL\n",
+		 (unsigned)LUSTRE_ACCESS_LOG_TYPE_OFD);
+	/* Checks for struct lustre_access_log_info_v1 */
+	LASSERTF((int)sizeof(struct lustre_access_log_info_v1) == 168, "found %lld\n",
+		 (long long)(int)sizeof(struct lustre_access_log_info_v1));
+	LASSERTF((int)offsetof(struct lustre_access_log_info_v1, lali_version) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_access_log_info_v1, lali_version));
+	LASSERTF((int)sizeof(((struct lustre_access_log_info_v1 *)0)->lali_version) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_access_log_info_v1 *)0)->lali_version));
+	LASSERTF((int)offsetof(struct lustre_access_log_info_v1, lali_type) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_access_log_info_v1, lali_type));
+	LASSERTF((int)sizeof(((struct lustre_access_log_info_v1 *)0)->lali_type) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_access_log_info_v1 *)0)->lali_type));
+	LASSERTF((int)offsetof(struct lustre_access_log_info_v1, lali_name) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_access_log_info_v1, lali_name));
+	LASSERTF((int)sizeof(((struct lustre_access_log_info_v1 *)0)->lali_name) == 128, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_access_log_info_v1 *)0)->lali_name));
+	LASSERTF((int)offsetof(struct lustre_access_log_info_v1, lali_log_size) == 136, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_access_log_info_v1, lali_log_size));
+	LASSERTF((int)sizeof(((struct lustre_access_log_info_v1 *)0)->lali_log_size) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_access_log_info_v1 *)0)->lali_log_size));
+	LASSERTF((int)offsetof(struct lustre_access_log_info_v1, lali_entry_size) == 140, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_access_log_info_v1, lali_entry_size));
+	LASSERTF((int)sizeof(((struct lustre_access_log_info_v1 *)0)->lali_entry_size) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_access_log_info_v1 *)0)->lali_entry_size));
+
+	/* Checks for struct lfsck_request */
+	LASSERTF((int)sizeof(struct lfsck_request) == 96, "found %lld\n",
+		 (long long)(int)sizeof(struct lfsck_request));
+	LASSERTF((int)offsetof(struct lfsck_request, lr_event) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lfsck_request, lr_event));
+	LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_event) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lfsck_request *)0)->lr_event));
+	LASSERTF((int)offsetof(struct lfsck_request, lr_index) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct lfsck_request, lr_index));
+	LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_index) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lfsck_request *)0)->lr_index));
+	LASSERTF((int)offsetof(struct lfsck_request, lr_flags) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lfsck_request, lr_flags));
+	LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lfsck_request *)0)->lr_flags));
+	LASSERTF((int)offsetof(struct lfsck_request, lr_valid) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct lfsck_request, lr_valid));
+	LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_valid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lfsck_request *)0)->lr_valid));
+	LASSERTF((int)offsetof(struct lfsck_request, lr_speed) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct lfsck_request, lr_speed));
+	LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_speed) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lfsck_request *)0)->lr_speed));
+	LASSERTF((int)offsetof(struct lfsck_request, lr_version) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct lfsck_request, lr_version));
+	LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_version) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct lfsck_request *)0)->lr_version));
+	LASSERTF((int)offsetof(struct lfsck_request, lr_active) == 22, "found %lld\n",
+		 (long long)(int)offsetof(struct lfsck_request, lr_active));
+	LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_active) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct lfsck_request *)0)->lr_active));
+	LASSERTF((int)offsetof(struct lfsck_request, lr_param) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct lfsck_request, lr_param));
+	LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_param) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct lfsck_request *)0)->lr_param));
+	LASSERTF((int)offsetof(struct lfsck_request, lr_async_windows) == 26, "found %lld\n",
+		 (long long)(int)offsetof(struct lfsck_request, lr_async_windows));
+	LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_async_windows) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct lfsck_request *)0)->lr_async_windows));
+	LASSERTF((int)offsetof(struct lfsck_request, lr_flags) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lfsck_request, lr_flags));
+	LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lfsck_request *)0)->lr_flags));
+	LASSERTF((int)offsetof(struct lfsck_request, lr_fid) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct lfsck_request, lr_fid));
+	LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct lfsck_request *)0)->lr_fid));
+	LASSERTF((int)offsetof(struct lfsck_request, lr_fid2) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct lfsck_request, lr_fid2));
+	LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_fid2) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct lfsck_request *)0)->lr_fid2));
+	LASSERTF((int)offsetof(struct lfsck_request, lr_comp_id) == 64, "found %lld\n",
+		 (long long)(int)offsetof(struct lfsck_request, lr_comp_id));
+	LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_comp_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lfsck_request *)0)->lr_comp_id));
+	LASSERTF((int)offsetof(struct lfsck_request, lr_padding_0) == 68, "found %lld\n",
+		 (long long)(int)offsetof(struct lfsck_request, lr_padding_0));
+	LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_padding_0) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lfsck_request *)0)->lr_padding_0));
+	LASSERTF((int)offsetof(struct lfsck_request, lr_padding_1) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct lfsck_request, lr_padding_1));
+	LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_padding_1) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lfsck_request *)0)->lr_padding_1));
+	LASSERTF((int)offsetof(struct lfsck_request, lr_padding_2) == 80, "found %lld\n",
+		 (long long)(int)offsetof(struct lfsck_request, lr_padding_2));
+	LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_padding_2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lfsck_request *)0)->lr_padding_2));
+	LASSERTF((int)offsetof(struct lfsck_request, lr_padding_3) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct lfsck_request, lr_padding_3));
+	LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_padding_3) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lfsck_request *)0)->lr_padding_3));
+	LASSERTF(LFSCK_TYPE_SCRUB == 0x00000000UL, "found 0x%.8xUL\n",
+		 (unsigned)LFSCK_TYPE_SCRUB);
+	LASSERTF(LFSCK_TYPE_LAYOUT == 0x00000001UL, "found 0x%.8xUL\n",
+		 (unsigned)LFSCK_TYPE_LAYOUT);
+	LASSERTF(LFSCK_TYPE_NAMESPACE == 0x00000004UL, "found 0x%.8xUL\n",
+		 (unsigned)LFSCK_TYPE_NAMESPACE);
+	LASSERTF(LE_LASTID_REBUILDING == 1, "found %lld\n",
+		 (long long)LE_LASTID_REBUILDING);
+	LASSERTF(LE_LASTID_REBUILT == 2, "found %lld\n",
+		 (long long)LE_LASTID_REBUILT);
+	LASSERTF(LE_PHASE1_DONE == 3, "found %lld\n",
+		 (long long)LE_PHASE1_DONE);
+	LASSERTF(LE_PHASE2_DONE == 4, "found %lld\n",
+		 (long long)LE_PHASE2_DONE);
+	LASSERTF(LE_START == 5, "found %lld\n",
+		 (long long)LE_START);
+	LASSERTF(LE_STOP == 6, "found %lld\n",
+		 (long long)LE_STOP);
+	LASSERTF(LE_QUERY == 7, "found %lld\n",
+		 (long long)LE_QUERY);
+	LASSERTF(LE_PEER_EXIT == 9, "found %lld\n",
+		 (long long)LE_PEER_EXIT);
+	LASSERTF(LE_CONDITIONAL_DESTROY == 10, "found %lld\n",
+		 (long long)LE_CONDITIONAL_DESTROY);
+	LASSERTF(LE_PAIRS_VERIFY == 11, "found %lld\n",
+		 (long long)LE_PAIRS_VERIFY);
+	LASSERTF(LE_SET_LMV_MASTER == 15, "found %lld\n",
+		 (long long)LE_SET_LMV_MASTER);
+	LASSERTF(LE_SET_LMV_SLAVE == 16, "found %lld\n",
+		 (long long)LE_SET_LMV_SLAVE);
+	LASSERTF(LEF_TO_OST == 0x00000001UL, "found 0x%.8xUL\n",
+		 (unsigned)LEF_TO_OST);
+	LASSERTF(LEF_FROM_OST == 0x00000002UL, "found 0x%.8xUL\n",
+		 (unsigned)LEF_FROM_OST);
+	LASSERTF(LEF_SET_LMV_HASH == 0x00000004UL, "found 0x%.8xUL\n",
+		 (unsigned)LEF_SET_LMV_HASH);
+	LASSERTF(LEF_SET_LMV_ALL == 0x00000008UL, "found 0x%.8xUL\n",
+		 (unsigned)LEF_SET_LMV_ALL);
+	LASSERTF(LEF_RECHECK_NAME_HASH == 0x00000010UL, "found 0x%.8xUL\n",
+		 (unsigned)LEF_RECHECK_NAME_HASH);
+	LASSERTF(LEF_QUERY_ALL == 0x00000020UL, "found 0x%.8xUL\n",
+		 (unsigned)LEF_QUERY_ALL);
+
+	/* Checks for struct lfsck_reply */
+	LASSERTF((int)sizeof(struct lfsck_reply) == 16, "found %lld\n",
+		 (long long)(int)sizeof(struct lfsck_reply));
+	LASSERTF((int)offsetof(struct lfsck_reply, lr_status) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lfsck_reply, lr_status));
+	LASSERTF((int)sizeof(((struct lfsck_reply *)0)->lr_status) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lfsck_reply *)0)->lr_status));
+	LASSERTF((int)offsetof(struct lfsck_reply, lr_padding_1) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct lfsck_reply, lr_padding_1));
+	LASSERTF((int)sizeof(((struct lfsck_reply *)0)->lr_padding_1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lfsck_reply *)0)->lr_padding_1));
+	LASSERTF((int)offsetof(struct lfsck_reply, lr_repaired) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lfsck_reply, lr_repaired));
+	LASSERTF((int)sizeof(((struct lfsck_reply *)0)->lr_repaired) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lfsck_reply *)0)->lr_repaired));
+
+	/* Checks for struct update_params */
+	LASSERTF((int)sizeof(struct update_params) == 0, "found %lld\n",
+		 (long long)(int)sizeof(struct update_params));
+	LASSERTF((int)offsetof(struct update_params, up_params) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct update_params, up_params));
+	LASSERTF((int)sizeof(((struct update_params *)0)->up_params) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct update_params *)0)->up_params));
+
+	/* Checks for struct update_op */
+	LASSERTF((int)sizeof(struct update_op) == 20, "found %lld\n",
+		 (long long)(int)sizeof(struct update_op));
+	LASSERTF((int)offsetof(struct update_op, uop_fid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct update_op, uop_fid));
+	LASSERTF((int)sizeof(((struct update_op *)0)->uop_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct update_op *)0)->uop_fid));
+	LASSERTF((int)offsetof(struct update_op, uop_type) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct update_op, uop_type));
+	LASSERTF((int)sizeof(((struct update_op *)0)->uop_type) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct update_op *)0)->uop_type));
+	LASSERTF((int)offsetof(struct update_op, uop_param_count) == 18, "found %lld\n",
+		 (long long)(int)offsetof(struct update_op, uop_param_count));
+	LASSERTF((int)sizeof(((struct update_op *)0)->uop_param_count) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct update_op *)0)->uop_param_count));
+	LASSERTF((int)offsetof(struct update_op, uop_params_off) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct update_op, uop_params_off));
+
+	/* Checks for struct update_ops */
+	LASSERTF((int)sizeof(struct update_ops) == 0, "found %lld\n",
+		 (long long)(int)sizeof(struct update_ops));
+	LASSERTF((int)offsetof(struct update_ops, uops_op) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct update_ops, uops_op));
+	LASSERTF((int)sizeof(((struct update_ops *)0)->uops_op) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct update_ops *)0)->uops_op));
+
+	/* Checks for struct update_records */
+	LASSERTF((int)sizeof(struct update_records) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct update_records));
+	LASSERTF((int)offsetof(struct update_records, ur_master_transno) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct update_records, ur_master_transno));
+	LASSERTF((int)sizeof(((struct update_records *)0)->ur_master_transno) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct update_records *)0)->ur_master_transno));
+	LASSERTF((int)offsetof(struct update_records, ur_batchid) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct update_records, ur_batchid));
+	LASSERTF((int)sizeof(((struct update_records *)0)->ur_batchid) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct update_records *)0)->ur_batchid));
+	LASSERTF((int)offsetof(struct update_records, ur_flags) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct update_records, ur_flags));
+	LASSERTF((int)sizeof(((struct update_records *)0)->ur_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct update_records *)0)->ur_flags));
+	LASSERTF((int)offsetof(struct update_records, ur_index) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct update_records, ur_index));
+	LASSERTF((int)sizeof(((struct update_records *)0)->ur_index) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct update_records *)0)->ur_index));
+	LASSERTF((int)offsetof(struct update_records, ur_update_count) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct update_records, ur_update_count));
+	LASSERTF((int)sizeof(((struct update_records *)0)->ur_update_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct update_records *)0)->ur_update_count));
+	LASSERTF((int)offsetof(struct update_records, ur_param_count) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct update_records, ur_param_count));
+	LASSERTF((int)sizeof(((struct update_records *)0)->ur_param_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct update_records *)0)->ur_param_count));
+	LASSERTF(UPDATE_RECORD_CONTINUE == 0x00000001UL, "found 0x%.8xUL\n",
+		 (unsigned)UPDATE_RECORD_CONTINUE);
+
+	/* Checks for struct llog_update_record */
+	LASSERTF((int)sizeof(struct llog_update_record) == 48, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_update_record));
+	LASSERTF((int)offsetof(struct llog_update_record, lur_hdr) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_update_record, lur_hdr));
+	LASSERTF((int)sizeof(((struct llog_update_record *)0)->lur_hdr) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_update_record *)0)->lur_hdr));
+	LASSERTF((int)offsetof(struct llog_update_record, lur_update_rec) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_update_record, lur_update_rec));
+	LASSERTF((int)sizeof(((struct llog_update_record *)0)->lur_update_rec) == 32, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_update_record *)0)->lur_update_rec));
+#endif /* HAVE_SERVER_SUPPORT */
+
+	/* Checks for struct lustre_cfg */
+	LASSERTF((int)sizeof(struct lustre_cfg) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct lustre_cfg));
+	LASSERTF((int)offsetof(struct lustre_cfg, lcfg_version) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_cfg, lcfg_version));
+	LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_version) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_version));
+	LASSERTF((int)offsetof(struct lustre_cfg, lcfg_command) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_cfg, lcfg_command));
+	LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_command) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_command));
+	LASSERTF((int)offsetof(struct lustre_cfg, lcfg_num) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_cfg, lcfg_num));
+	LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_num) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_num));
+	LASSERTF((int)offsetof(struct lustre_cfg, lcfg_flags) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_cfg, lcfg_flags));
+	LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_flags));
+	LASSERTF((int)offsetof(struct lustre_cfg, lcfg_nid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_cfg, lcfg_nid));
+	LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_nid) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_nid));
+	LASSERTF((int)offsetof(struct lustre_cfg, lcfg_nal) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_cfg, lcfg_nal));
+	LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_nal) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_nal));
+	LASSERTF((int)offsetof(struct lustre_cfg, lcfg_bufcount) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_cfg, lcfg_bufcount));
+	LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_bufcount) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_bufcount));
+	LASSERTF((int)offsetof(struct lustre_cfg, lcfg_buflens[0]) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_cfg, lcfg_buflens[0]));
+	LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_buflens[0]) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_buflens[0]));
+	LASSERTF(LCFG_ATTACH == 0x000cf001UL, "found 0x%.8xUL\n",
+		 (unsigned)LCFG_ATTACH);
+	LASSERTF(LCFG_DETACH == 0x000cf002UL, "found 0x%.8xUL\n",
+		 (unsigned)LCFG_DETACH);
+	LASSERTF(LCFG_SETUP == 0x000cf003UL, "found 0x%.8xUL\n",
+		 (unsigned)LCFG_SETUP);
+	LASSERTF(LCFG_CLEANUP == 0x000cf004UL, "found 0x%.8xUL\n",
+		 (unsigned)LCFG_CLEANUP);
+	LASSERTF(LCFG_ADD_UUID == 0x000cf005UL, "found 0x%.8xUL\n",
+		 (unsigned)LCFG_ADD_UUID);
LASSERTF(LCFG_DEL_UUID == 0x000cf006UL, "found 0x%.8xUL\n", + (unsigned)LCFG_DEL_UUID); + LASSERTF(LCFG_MOUNTOPT == 0x000cf007UL, "found 0x%.8xUL\n", + (unsigned)LCFG_MOUNTOPT); + LASSERTF(LCFG_DEL_MOUNTOPT == 0x000cf008UL, "found 0x%.8xUL\n", + (unsigned)LCFG_DEL_MOUNTOPT); + LASSERTF(LCFG_SET_TIMEOUT == 0x000cf009UL, "found 0x%.8xUL\n", + (unsigned)LCFG_SET_TIMEOUT); + LASSERTF(LCFG_SET_UPCALL == 0x000cf00aUL, "found 0x%.8xUL\n", + (unsigned)LCFG_SET_UPCALL); + LASSERTF(LCFG_ADD_CONN == 0x000cf00bUL, "found 0x%.8xUL\n", + (unsigned)LCFG_ADD_CONN); + LASSERTF(LCFG_DEL_CONN == 0x000cf00cUL, "found 0x%.8xUL\n", + (unsigned)LCFG_DEL_CONN); + LASSERTF(LCFG_LOV_ADD_OBD == 0x000cf00dUL, "found 0x%.8xUL\n", + (unsigned)LCFG_LOV_ADD_OBD); + LASSERTF(LCFG_LOV_DEL_OBD == 0x000cf00eUL, "found 0x%.8xUL\n", + (unsigned)LCFG_LOV_DEL_OBD); + LASSERTF(LCFG_PARAM == 0x000cf00fUL, "found 0x%.8xUL\n", + (unsigned)LCFG_PARAM); + LASSERTF(LCFG_MARKER == 0x000cf010UL, "found 0x%.8xUL\n", + (unsigned)LCFG_MARKER); + LASSERTF(LCFG_LOG_START == 0x000ce011UL, "found 0x%.8xUL\n", + (unsigned)LCFG_LOG_START); + LASSERTF(LCFG_LOG_END == 0x000ce012UL, "found 0x%.8xUL\n", + (unsigned)LCFG_LOG_END); + LASSERTF(LCFG_LOV_ADD_INA == 0x000ce013UL, "found 0x%.8xUL\n", + (unsigned)LCFG_LOV_ADD_INA); + LASSERTF(LCFG_ADD_MDC == 0x000cf014UL, "found 0x%.8xUL\n", + (unsigned)LCFG_ADD_MDC); + LASSERTF(LCFG_DEL_MDC == 0x000cf015UL, "found 0x%.8xUL\n", + (unsigned)LCFG_DEL_MDC); + LASSERTF(LCFG_SPTLRPC_CONF == 0x000ce016UL, "found 0x%.8xUL\n", + (unsigned)LCFG_SPTLRPC_CONF); + LASSERTF(LCFG_POOL_NEW == 0x000ce020UL, "found 0x%.8xUL\n", + (unsigned)LCFG_POOL_NEW); + LASSERTF(LCFG_POOL_ADD == 0x000ce021UL, "found 0x%.8xUL\n", + (unsigned)LCFG_POOL_ADD); + LASSERTF(LCFG_POOL_REM == 0x000ce022UL, "found 0x%.8xUL\n", + (unsigned)LCFG_POOL_REM); + LASSERTF(LCFG_POOL_DEL == 0x000ce023UL, "found 0x%.8xUL\n", + (unsigned)LCFG_POOL_DEL); + LASSERTF(LCFG_SET_LDLM_TIMEOUT == 0x000ce030UL, "found 0x%.8xUL\n", + (unsigned)LCFG_SET_LDLM_TIMEOUT); + LASSERTF(LCFG_PRE_CLEANUP == 0x000cf031UL, "found 0x%.8xUL\n", + (unsigned)LCFG_PRE_CLEANUP); + LASSERTF(LCFG_SET_PARAM == 0x000ce032UL, "found 0x%.8xUL\n", + (unsigned)LCFG_SET_PARAM); +#ifdef HAVE_SERVER_SUPPORT + LASSERTF(LCFG_NODEMAP_ADD == 0x000ce040UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ADD); + LASSERTF(LCFG_NODEMAP_DEL == 0x000ce041UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_DEL); + LASSERTF(LCFG_NODEMAP_ADD_RANGE == 0x000ce042UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ADD_RANGE); + LASSERTF(LCFG_NODEMAP_DEL_RANGE == 0x000ce043UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_DEL_RANGE); + LASSERTF(LCFG_NODEMAP_ADD_UIDMAP == 0x000ce044UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ADD_UIDMAP); + LASSERTF(LCFG_NODEMAP_DEL_UIDMAP == 0x000ce045UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_DEL_UIDMAP); + LASSERTF(LCFG_NODEMAP_ADD_GIDMAP == 0x000ce046UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ADD_GIDMAP); + LASSERTF(LCFG_NODEMAP_DEL_GIDMAP == 0x000ce047UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_DEL_GIDMAP); + LASSERTF(LCFG_NODEMAP_ACTIVATE == 0x000ce048UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ACTIVATE); + LASSERTF(LCFG_NODEMAP_ADMIN == 0x000ce049UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ADMIN); + LASSERTF(LCFG_NODEMAP_ADD_PROJIDMAP == 0x000ce04aUL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ADD_PROJIDMAP); + LASSERTF(LCFG_NODEMAP_DEL_PROJIDMAP == 0x000ce04bUL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_DEL_PROJIDMAP); + 
LASSERTF(LCFG_NODEMAP_TRUSTED == 0x000ce050UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_TRUSTED); + LASSERTF(LCFG_NODEMAP_SQUASH_UID == 0x000ce051UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_SQUASH_UID); + LASSERTF(LCFG_NODEMAP_SQUASH_GID == 0x000ce052UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_SQUASH_GID); + LASSERTF(LCFG_NODEMAP_ADD_SHKEY == 0x000ce053UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ADD_SHKEY); + LASSERTF(LCFG_NODEMAP_DEL_SHKEY == 0x000ce054UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_DEL_SHKEY); + LASSERTF(LCFG_NODEMAP_TEST_NID == 0x000ce055UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_TEST_NID); + LASSERTF(LCFG_NODEMAP_TEST_ID == 0x000ce056UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_TEST_ID); + LASSERTF(LCFG_NODEMAP_SET_FILESET == 0x000ce057UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_SET_FILESET); + LASSERTF(LCFG_NODEMAP_DENY_UNKNOWN == 0x000ce058UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_DENY_UNKNOWN); + LASSERTF(LCFG_NODEMAP_MAP_MODE == 0x000ce059UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_MAP_MODE); + LASSERTF(LCFG_NODEMAP_AUDIT_MODE == 0x000ce05aUL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_AUDIT_MODE); + LASSERTF(LCFG_NODEMAP_SET_SEPOL == 0x000ce05bUL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_SET_SEPOL); + LASSERTF(LCFG_NODEMAP_FORBID_ENCRYPT == 0x000ce05cUL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_FORBID_ENCRYPT); + LASSERTF(LCFG_NODEMAP_SQUASH_PROJID == 0x000ce05dUL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_SQUASH_PROJID); +#endif /* HAVE_SERVER_SUPPORT */ + LASSERTF(PORTALS_CFG_TYPE == 1, "found %lld\n", + (long long)PORTALS_CFG_TYPE); + LASSERTF(LUSTRE_CFG_TYPE == 123, "found %lld\n", + (long long)LUSTRE_CFG_TYPE); +} diff --git a/drivers/staging/lustrefsx/lustre/target/barrier.c b/drivers/staging/lustrefsx/lustre/target/barrier.c new file mode 100644 index 0000000000000..4f8c5b3dfdb4d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/barrier.c @@ -0,0 +1,412 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, Intel Corporation. + * + * lustre/target/barrier.c + * + * Currently, the Lustre barrier is implemented as write barrier on all MDTs. + * For each MDT in the system, when it starts, it registers a barrier instance + * that will be used in handling subsequent barrier requests. 
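+ *
+ * (Editor's illustration, not part of this patch: the registration scheme
+ * used below -- a global list under a spinlock, with reference-counted
+ * lookup -- has roughly this userspace shape, where pthread_mutex_t stands
+ * in for the kernel spinlock and a plain int for atomic_t; all names here
+ * are invented for illustration:
+ *
+ *	#include <pthread.h>
+ *	#include <stdlib.h>
+ *
+ *	struct instance {
+ *		struct instance *next;
+ *		void *key;	// the struct dt_device * in the real code
+ *		int refs;
+ *	};
+ *
+ *	static struct instance *instances;
+ *	static pthread_mutex_t instances_lock = PTHREAD_MUTEX_INITIALIZER;
+ *
+ *	static struct instance *instance_find(void *key)
+ *	{
+ *		struct instance *i;
+ *
+ *		pthread_mutex_lock(&instances_lock);
+ *		for (i = instances; i; i = i->next)
+ *			if (i->key == key) {
+ *				i->refs++;	// pin for the caller
+ *				break;
+ *			}
+ *		pthread_mutex_unlock(&instances_lock);
+ *		return i;
+ *	}
+ *
+ *	static void instance_put(struct instance *i)
+ *	{
+ *		pthread_mutex_lock(&instances_lock);
+ *		if (--i->refs == 0)
+ *			free(i);	// last reference drops the instance
+ *		pthread_mutex_unlock(&instances_lock);
+ *	}
+ *
+ * The real code additionally asserts against double registration.)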
+ *
+ * Author: Fan, Yong
+ */
+
+#define DEBUG_SUBSYSTEM S_SNAPSHOT
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+static LIST_HEAD(barrier_instance_list);
+static DEFINE_SPINLOCK(barrier_instance_lock);
+
+struct barrier_instance {
+	struct list_head	 bi_link;
+	struct dt_device	*bi_bottom;
+	struct dt_device	*bi_next;
+	wait_queue_head_t	 bi_waitq;
+	rwlock_t		 bi_rwlock;
+	struct percpu_counter	 bi_writers;
+	atomic_t		 bi_ref;
+	time64_t		 bi_deadline;
+	__u32			 bi_status;
+};
+
+static inline char *barrier_barrier2name(struct barrier_instance *barrier)
+{
+	return barrier->bi_bottom->dd_lu_dev.ld_obd->obd_name;
+}
+
+static inline __u32 barrier_dev_idx(struct barrier_instance *barrier)
+{
+	return lu_site2seq(barrier->bi_bottom->dd_lu_dev.ld_site)->ss_node_id;
+}
+
+static void barrier_instance_cleanup(struct barrier_instance *barrier)
+{
+	LASSERT(list_empty(&barrier->bi_link));
+
+	percpu_counter_destroy(&barrier->bi_writers);
+	OBD_FREE_PTR(barrier);
+}
+
+static inline void barrier_instance_put(struct barrier_instance *barrier)
+{
+	if (atomic_dec_and_test(&barrier->bi_ref))
+		barrier_instance_cleanup(barrier);
+}
+
+static struct barrier_instance *
+barrier_instance_find_locked(struct dt_device *key)
+{
+	struct barrier_instance *barrier;
+
+	list_for_each_entry(barrier, &barrier_instance_list, bi_link) {
+		if (barrier->bi_bottom == key)
+			return barrier;
+	}
+
+	return NULL;
+}
+
+static void barrier_instance_add(struct barrier_instance *barrier)
+{
+	struct barrier_instance *tmp;
+
+	spin_lock(&barrier_instance_lock);
+	tmp = barrier_instance_find_locked(barrier->bi_bottom);
+	LASSERT(!tmp);
+
+	list_add_tail(&barrier->bi_link, &barrier_instance_list);
+	spin_unlock(&barrier_instance_lock);
+}
+
+static struct barrier_instance *barrier_instance_find(struct dt_device *key)
+{
+	struct barrier_instance *barrier;
+
+	spin_lock(&barrier_instance_lock);
+	barrier = barrier_instance_find_locked(key);
+	if (barrier)
+		atomic_inc(&barrier->bi_ref);
+	spin_unlock(&barrier_instance_lock);
+
+	return barrier;
+}
+
+static void barrier_set(struct barrier_instance *barrier, __u32 status)
+{
+	if (barrier->bi_status != status) {
+		CDEBUG(D_SNAPSHOT, "%s: change barrier status from %u to %u\n",
+		       barrier_barrier2name(barrier),
+		       barrier->bi_status, status);
+
+		barrier->bi_status = status;
+	}
+}
+
+/**
+ * Create the barrier for the given instance.
+ *
+ * We use a two-phase barrier to guarantee that after the barrier setup:
+ * 1) All pending MDT-side async modifications have been flushed.
+ * 2) Any subsequent modification will be blocked.
+ * 3) All async transactions on the MDTs have been committed.
+ *
+ * For phase1, we do the following:
+ *
+ * First, set the barrier flag on the instance, which blocks subsequent
+ * modifications from clients. (Note: server-sponsored modifications are
+ * still allowed, so that pending modifications can be flushed.)
+ *
+ * Second, flush all pending modifications via dt_sync(), such as async
+ * OST-object destroys, async OST-object owner changes, and so on.
+ *
+ * If some client-sponsored modifications are still in flight while the
+ * barrier is freezing, they may queue further requests after the first
+ * dt_sync(), so dt_sync() is called again once all in-flight modifications
+ * are done.
+ *
+ * With the phase1 barrier set, all pending cross-server modifications have
+ * been flushed to remote servers, and any new modification will be blocked.
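+ *
+ * (Editor's sketch, not part of this change: the two phases described
+ * above reduce to the following outline, with dev_sync() standing in for
+ * dt_sync() and every name illustrative only:
+ *
+ *	enum bs { BS_INIT, BS_FREEZING_P1, BS_FREEZING_P2, BS_FROZEN };
+ *	struct dev { enum bs state; long writers; };
+ *
+ *	static void dev_sync(struct dev *d) { }	// flush pending updates
+ *
+ *	static void freeze_phase1(struct dev *d)
+ *	{
+ *		d->state = BS_FREEZING_P1;	// block new client changes
+ *		dev_sync(d);			// flush already-queued updates
+ *		while (d->writers > 0)
+ *			;			// drain in-flight writers
+ *		dev_sync(d);			// flush what those writers queued
+ *	}
+ *
+ *	static void freeze_phase2(struct dev *d)
+ *	{
+ *		d->state = BS_FREEZING_P2;
+ *		dev_sync(d);			// commit async transactions locally
+ *		d->state = BS_FROZEN;
+ *	}
+ *
+ * The real code adds deadlines, error handling and memory barriers on top
+ * of this shape.)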
+ * But it does not guarantee that all the updates have been committed to
+ * storage on remote servers. So when all the instances have completed the
+ * phase1 barrier successfully, the MGS notifies all instances to do the
+ * phase2 barrier, as follows:
+ *
+ * Every barrier instance calls dt_sync() to commit all async transactions
+ * locally.
+ *
+ * \param[in] env	pointer to the thread context
+ * \param[in] barrier	pointer to the barrier instance
+ * \param[in] phase1	indicates whether it is the phase1 barrier or not
+ *
+ * \retval	positive number for timeout
+ * \retval	0 for success
+ * \retval	negative error number on failure
+ */
+static int barrier_freeze(const struct lu_env *env,
+			  struct barrier_instance *barrier, bool phase1)
+{
+	time64_t left;
+	int rc = 0;
+	__s64 inflight = 0;
+	ENTRY;
+
+	write_lock(&barrier->bi_rwlock);
+	barrier_set(barrier, phase1 ? BS_FREEZING_P1 : BS_FREEZING_P2);
+
+	/* Avoid reordering of barrier_set() and the check of the
+	 * inflight modification count. */
+	smp_mb();
+
+	if (phase1)
+		inflight = percpu_counter_sum(&barrier->bi_writers);
+	write_unlock(&barrier->bi_rwlock);
+
+	rc = dt_sync(env, barrier->bi_next);
+	if (rc)
+		RETURN(rc);
+
+	LASSERT(barrier->bi_deadline != 0);
+
+	left = barrier->bi_deadline - ktime_get_real_seconds();
+	if (left <= 0)
+		RETURN(1);
+
+	if (phase1 && inflight != 0) {
+		rc = wait_event_idle_timeout(
+			barrier->bi_waitq,
+			percpu_counter_sum(&barrier->bi_writers) == 0,
+			cfs_time_seconds(left));
+		if (rc <= 0)
+			RETURN(1);
+
+		/* sync again after all inflight modifications are done. */
+		rc = dt_sync(env, barrier->bi_next);
+		if (rc)
+			RETURN(rc);
+
+		if (ktime_get_real_seconds() > barrier->bi_deadline)
+			RETURN(1);
+	}
+
+	CDEBUG(D_SNAPSHOT, "%s: barrier freezing %s done.\n",
+	       barrier_barrier2name(barrier), phase1 ? "phase1" : "phase2");
+
+	if (!phase1)
+		barrier_set(barrier, BS_FROZEN);
+
+	RETURN(0);
+}
+
+void barrier_init(void)
+{
+}
+
+void barrier_fini(void)
+{
+	LASSERT(list_empty(&barrier_instance_list));
+}
+
+bool barrier_entry(struct dt_device *key)
+{
+	struct barrier_instance *barrier;
+	bool entered = false;
+	ENTRY;
+
+	barrier = barrier_instance_find(key);
+	if (unlikely(!barrier))
+		/* Fail open */
+		RETURN(true);
+
+	read_lock(&barrier->bi_rwlock);
+	if (likely(barrier->bi_status != BS_FREEZING_P1 &&
+		   barrier->bi_status != BS_FREEZING_P2 &&
+		   barrier->bi_status != BS_FROZEN) ||
+	    ktime_get_real_seconds() > barrier->bi_deadline) {
+		percpu_counter_inc(&barrier->bi_writers);
+		entered = true;
+	}
+	read_unlock(&barrier->bi_rwlock);
+
+	barrier_instance_put(barrier);
+	return entered;
+}
+EXPORT_SYMBOL(barrier_entry);
+
+void barrier_exit(struct dt_device *key)
+{
+	struct barrier_instance *barrier;
+
+	barrier = barrier_instance_find(key);
+	if (likely(barrier)) {
+		percpu_counter_dec(&barrier->bi_writers);
+
+		/* Avoid reordering of the inflight modification count
+		 * decrement and the check of the barrier status.
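+		 *
+		 * (Editor's note, illustration only: this pairs with the
+		 * smp_mb() in barrier_freeze(). In C11-atomics shorthand
+		 * the two sides are:
+		 *
+		 *	// barrier_exit()		// barrier_freeze()
+		 *	atomic_fetch_sub(&writers, 1);	atomic_store(&status, P1);
+		 *	if (atomic_load(&status) == P1)	if (atomic_load(&writers))
+		 *		wake_up_freezer();		sleep_until_woken();
+		 *
+		 * With full ordering on both sides, either the freezer
+		 * observes the decremented counter, or the exiting writer
+		 * observes the freezing status and issues the wakeup --
+		 * the freezer can never miss both.)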
*/ + smp_mb(); + + if (unlikely(barrier->bi_status == BS_FREEZING_P1)) + wake_up(&barrier->bi_waitq); + barrier_instance_put(barrier); + } +} +EXPORT_SYMBOL(barrier_exit); + +int barrier_handler(struct dt_device *key, struct ptlrpc_request *req) +{ + struct ldlm_gl_barrier_desc *desc; + struct barrier_instance *barrier; + struct barrier_lvb *lvb; + struct lu_env env; + int rc = 0; + ENTRY; + + /* glimpse on barrier locks always packs a glimpse descriptor */ + req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK_DESC); + desc = req_capsule_client_get(&req->rq_pill, &RMF_DLM_GL_DESC); + if (!desc) + GOTO(out, rc = -EPROTO); + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, + sizeof(struct barrier_lvb)); + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + GOTO(out, rc); + + lvb = req_capsule_server_get(&req->rq_pill, &RMF_DLM_LVB); + barrier = barrier_instance_find(key); + if (!barrier) + GOTO(out, rc = -ENODEV); + + rc = lu_env_init(&env, LCT_MD_THREAD | LCT_DT_THREAD); + if (rc) + GOTO(out_barrier, rc); + + CDEBUG(D_SNAPSHOT, + "%s: handling barrier request: status %u, timeout %u\n", + barrier_barrier2name(barrier), + desc->lgbd_status, desc->lgbd_timeout); + + switch (desc->lgbd_status) { + case BS_RESCAN: + barrier_set(barrier, BS_INIT); + break; + case BS_FREEZING_P1: + case BS_FREEZING_P2: + if (OBD_FAIL_CHECK(OBD_FAIL_BARRIER_FAILURE)) + GOTO(fini, rc = -EINVAL); + + barrier->bi_deadline = ktime_get_real_seconds() + + desc->lgbd_timeout; + rc = barrier_freeze(&env, barrier, + desc->lgbd_status == BS_FREEZING_P1); + break; + case BS_THAWING: + case BS_FAILED: + case BS_EXPIRED: + barrier_set(barrier, BS_THAWED); + break; + default: + CWARN("%s: unexpected barrier status %u\n", + barrier_barrier2name(barrier), desc->lgbd_status); + rc = -EINVAL; + break; + } + + GOTO(fini, rc); + +fini: + lu_env_fini(&env); + +out_barrier: + if (rc < 0) + barrier_set(barrier, BS_FAILED); + else if (rc > 0) + barrier_set(barrier, BS_EXPIRED); + + lvb->lvb_status = barrier->bi_status; + lvb->lvb_index = barrier_dev_idx(barrier); + + CDEBUG(D_SNAPSHOT, "%s: handled barrier request: status %u, " + "deadline %lld: rc = %d\n", barrier_barrier2name(barrier), + lvb->lvb_status, barrier->bi_deadline, rc); + + barrier_instance_put(barrier); + rc = 0; + +out: + req->rq_status = rc; + return rc; +} +EXPORT_SYMBOL(barrier_handler); + +int barrier_register(struct dt_device *key, struct dt_device *next) +{ + struct barrier_instance *barrier; + int rc; + ENTRY; + + OBD_ALLOC_PTR(barrier); + if (!barrier) + RETURN(-ENOMEM); + + INIT_LIST_HEAD(&barrier->bi_link); + barrier->bi_bottom = key; + barrier->bi_next = next; + init_waitqueue_head(&barrier->bi_waitq); + rwlock_init(&barrier->bi_rwlock); + atomic_set(&barrier->bi_ref, 1); +#ifdef HAVE_PERCPU_COUNTER_INIT_GFP_FLAG + rc = percpu_counter_init(&barrier->bi_writers, 0, GFP_KERNEL); +#else + rc = percpu_counter_init(&barrier->bi_writers, 0); +#endif + if (rc) + barrier_instance_cleanup(barrier); + else + barrier_instance_add(barrier); + + RETURN(rc); +} +EXPORT_SYMBOL(barrier_register); + +void barrier_deregister(struct dt_device *key) +{ + struct barrier_instance *barrier; + + spin_lock(&barrier_instance_lock); + barrier = barrier_instance_find_locked(key); + if (barrier) + list_del_init(&barrier->bi_link); + spin_unlock(&barrier_instance_lock); + + if (barrier) + barrier_instance_put(barrier); +} +EXPORT_SYMBOL(barrier_deregister); diff --git a/drivers/staging/lustrefsx/lustre/target/out_handler.c 
b/drivers/staging/lustrefsx/lustre/target/out_handler.c
new file mode 100644
index 0000000000000..57c0d914f8ba7
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/target/out_handler.c
@@ -0,0 +1,1254 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2013, 2017, Intel Corporation.
+ *
+ * lustre/target/out_handler.c
+ *
+ * Object update handler between targets.
+ *
+ * Author: di.wang
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "tgt_internal.h"
+
+static inline void orr_cpu_to_le(struct out_read_reply *orr_dst,
+				 const struct out_read_reply *orr_src)
+{
+	orr_dst->orr_size = cpu_to_le32(orr_src->orr_size);
+	orr_dst->orr_padding = cpu_to_le32(orr_src->orr_padding);
+	orr_dst->orr_offset = cpu_to_le64(orr_src->orr_offset);
+}
+
+static void out_reconstruct(const struct lu_env *env, struct dt_device *dt,
+			    struct dt_object *obj,
+			    struct object_update_reply *reply,
+			    int index)
+{
+	CDEBUG(D_HA, "%s: fork reply %p index %d: rc = %d\n",
+	       dt_obd_name(dt), reply, index, 0);
+
+	object_update_result_insert(reply, NULL, 0, index, 0);
+}
+
+typedef void (*out_reconstruct_t)(const struct lu_env *env,
+				  struct dt_device *dt,
+				  struct dt_object *obj,
+				  struct object_update_reply *reply,
+				  int index);
+
+static inline bool out_check_resent(struct ptlrpc_request *req)
+{
+	if (likely(!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT)))
+		return false;
+
+	if (req_xid_is_last(req)) {
+		struct lsd_client_data *lcd;
+
+		/* XXX this does not support multiple transactions yet, i.e.
+		 * only 1 update RPC each time between MDTs */
+		lcd = req->rq_export->exp_target_data.ted_lcd;
+
+		req->rq_transno = lcd->lcd_last_transno;
+		req->rq_status = lcd->lcd_last_result;
+		if (req->rq_status != 0)
+			req->rq_transno = 0;
+		lustre_msg_set_transno(req->rq_repmsg, req->rq_transno);
+		lustre_msg_set_status(req->rq_repmsg, req->rq_status);
+
+		DEBUG_REQ(D_HA, req, "reconstruct resent RPC");
+		return true;
+	}
+	DEBUG_REQ(D_HA, req, "reprocess RESENT req, last_xid is %lld",
+		  req->rq_export->exp_target_data.ted_lcd->lcd_last_xid);
+	return false;
+}
+
+static int out_create(struct tgt_session_info *tsi)
+{
+	struct tgt_thread_info	*tti = tgt_th_info(tsi->tsi_env);
+	struct object_update	*update = tti->tti_u.update.tti_update;
+	struct dt_object	*obj = tti->tti_u.update.tti_dt_object;
+	struct dt_object_format	*dof = &tti->tti_u.update.tti_update_dof;
+	struct obdo		*lobdo = &tti->tti_u.update.tti_obdo;
+	struct lu_attr		*attr = &tti->tti_attr;
+	struct lu_fid		*fid = NULL;
+	struct obdo		*wobdo;
+	size_t			size;
+	int			rc;
+
+	ENTRY;
+
+	wobdo = object_update_param_get(update, 0, &size);
+	if (IS_ERR(wobdo) || size != sizeof(*wobdo)) {
+		CERROR("%s: obdo is NULL, invalid RPC: rc = %ld\n",
+		       tgt_name(tsi->tsi_tgt), PTR_ERR(wobdo));
+		RETURN(PTR_ERR(wobdo));
+	}
+
+	if (req_capsule_req_need_swab(tsi->tsi_pill))
+		lustre_swab_obdo(wobdo);
+	lustre_get_wire_obdo(NULL, lobdo, wobdo);
+	la_from_obdo(attr, lobdo, lobdo->o_valid);
+
+	dof->dof_type = dt_mode_to_dft(attr->la_mode);
+	if (update->ou_params_count > 1) {
+		fid = object_update_param_get(update, 1, &size);
+		if (IS_ERR(fid) || size != sizeof(*fid)) {
+			CERROR("%s: invalid fid: rc = %ld\n",
+			       tgt_name(tsi->tsi_tgt), PTR_ERR(fid));
+			RETURN(PTR_ERR(fid));
+		}
+		if (req_capsule_req_need_swab(tsi->tsi_pill))
+			lustre_swab_lu_fid(fid);
+		if (!fid_is_sane(fid)) {
+			CERROR("%s: invalid fid "DFID": rc = %d\n",
+			       tgt_name(tsi->tsi_tgt), PFID(fid), -EPROTO);
+			RETURN(-EPROTO);
+		}
+	}
+
+	if (lu_object_exists(&obj->do_lu))
+		RETURN(-EEXIST);
+
+	rc = out_tx_create(tsi->tsi_env, obj, attr, fid, dof,
+			   &tti->tti_tea, tti->tti_tea.ta_handle,
+			   tti->tti_u.update.tti_update_reply,
+			   tti->tti_u.update.tti_update_reply_index);
+
+	RETURN(rc);
+}
+
+static int out_attr_set(struct tgt_session_info *tsi)
+{
+	struct tgt_thread_info	*tti = tgt_th_info(tsi->tsi_env);
+	struct object_update	*update = tti->tti_u.update.tti_update;
+	struct lu_attr		*attr = &tti->tti_attr;
+	struct dt_object	*obj = tti->tti_u.update.tti_dt_object;
+	struct obdo		*lobdo = &tti->tti_u.update.tti_obdo;
+	struct obdo		*wobdo;
+	size_t			size;
+	int			rc;
+
+	ENTRY;
+
+	wobdo = object_update_param_get(update, 0, &size);
+	if (IS_ERR(wobdo) || size != sizeof(*wobdo)) {
+		CERROR("%s: empty obdo in the update: rc = %ld\n",
+		       tgt_name(tsi->tsi_tgt), PTR_ERR(wobdo));
+		RETURN(PTR_ERR(wobdo));
+	}
+
+	attr->la_valid = 0;
+
+	if (req_capsule_req_need_swab(tsi->tsi_pill))
+		lustre_swab_obdo(wobdo);
+	lustre_get_wire_obdo(NULL, lobdo, wobdo);
+	la_from_obdo(attr, lobdo, lobdo->o_valid);
+
+	rc = out_tx_attr_set(tsi->tsi_env, obj, attr, &tti->tti_tea,
+			     tti->tti_tea.ta_handle,
+			     tti->tti_u.update.tti_update_reply,
+			     tti->tti_u.update.tti_update_reply_index);
+
+	RETURN(rc);
+}
+
+static int out_attr_get(struct tgt_session_info *tsi)
+{
+	const struct lu_env	*env = tsi->tsi_env;
+	struct tgt_thread_info	*tti = tgt_th_info(env);
+	struct object_update	*update = tti->tti_u.update.tti_update;
+	struct obdo		*obdo = &tti->tti_u.update.tti_obdo;
+	struct lu_attr		*la = &tti->tti_attr;
struct dt_object *obj = tti->tti_u.update.tti_dt_object; + int idx = tti->tti_u.update.tti_update_reply_index; + int rc; + + ENTRY; + + if (unlikely(update->ou_result_size < sizeof(*obdo))) + return -EPROTO; + + if (!lu_object_exists(&obj->do_lu)) { + /* Usually, this will be called when the master MDT try + * to init a remote object(see osp_object_init), so if + * the object does not exist on slave, we need set BANSHEE flag, + * so the object can be removed from the cache immediately */ + set_bit(LU_OBJECT_HEARD_BANSHEE, + &obj->do_lu.lo_header->loh_flags); + RETURN(-ENOENT); + } + + dt_read_lock(env, obj, DT_TGT_CHILD); + rc = dt_attr_get(env, obj, la); + if (rc) + GOTO(out_unlock, rc); + + obdo->o_valid = 0; + obdo_from_la(obdo, la, la->la_valid); + +out_unlock: + dt_read_unlock(env, obj); + + CDEBUG(D_INFO, "%s: insert attr get reply %p index %d: rc = %d\n", + tgt_name(tsi->tsi_tgt), tti->tti_u.update.tti_update_reply, + 0, rc); + + object_update_result_insert(tti->tti_u.update.tti_update_reply, obdo, + sizeof(*obdo), idx, rc); + + RETURN(rc); +} + +static int out_xattr_get(struct tgt_session_info *tsi) +{ + const struct lu_env *env = tsi->tsi_env; + struct tgt_thread_info *tti = tgt_th_info(env); + struct object_update *update = tti->tti_u.update.tti_update; + struct lu_buf *lbuf = &tti->tti_buf; + struct object_update_reply *reply = tti->tti_u.update.tti_update_reply; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + char *name; + struct object_update_result *update_result; + int idx = tti->tti_u.update.tti_update_reply_index; + int rc; + + ENTRY; + + if (!lu_object_exists(&obj->do_lu)) { + set_bit(LU_OBJECT_HEARD_BANSHEE, + &obj->do_lu.lo_header->loh_flags); + RETURN(-ENOENT); + } + + name = object_update_param_get(update, 0, NULL); + if (IS_ERR(name)) { + CERROR("%s: empty name for xattr get: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(name)); + RETURN(PTR_ERR(name)); + } + + update_result = object_update_result_get(reply, idx, NULL); + if (update_result == NULL) { + CERROR("%s: empty name for xattr get: rc = %d\n", + tgt_name(tsi->tsi_tgt), -EPROTO); + RETURN(-EPROTO); + } + + lbuf->lb_len = (int)tti->tti_u.update.tti_update->ou_result_size; + if (lbuf->lb_len == 0) + lbuf->lb_buf = NULL; + else + lbuf->lb_buf = update_result->our_data; + + dt_read_lock(env, obj, DT_TGT_CHILD); + rc = dt_xattr_get(env, obj, lbuf, name); + dt_read_unlock(env, obj); + if (rc <= 0) { + lbuf->lb_len = 0; + if (unlikely(!rc)) + rc = -ENODATA; + } else if (lbuf->lb_buf) { + lbuf->lb_len = rc; + } + CDEBUG(D_INFO, "%s: "DFID" get xattr %s len %d\n", + tgt_name(tsi->tsi_tgt), PFID(lu_object_fid(&obj->do_lu)), + name, rc); + + GOTO(out, rc); + +out: + object_update_result_insert(reply, lbuf->lb_buf, lbuf->lb_len, idx, rc); + RETURN(0); +} + +static int out_xattr_list(struct tgt_session_info *tsi) +{ + const struct lu_env *env = tsi->tsi_env; + struct tgt_thread_info *tti = tgt_th_info(env); + struct lu_buf *lbuf = &tti->tti_buf; + struct object_update_reply *reply = tti->tti_u.update.tti_update_reply; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + struct object_update_result *update_result; + int idx = tti->tti_u.update.tti_update_reply_index; + int rc; + + ENTRY; + + if (!lu_object_exists(&obj->do_lu)) { + set_bit(LU_OBJECT_HEARD_BANSHEE, + &obj->do_lu.lo_header->loh_flags); + RETURN(-ENOENT); + } + + update_result = object_update_result_get(reply, 0, NULL); + if (!update_result) { + rc = -EPROTO; + CERROR("%s: empty buf for xattr list: rc = %d\n", + tgt_name(tsi->tsi_tgt), 
+		       rc);
+		RETURN(rc);
+	}
+
+	lbuf->lb_len = (int)tti->tti_u.update.tti_update->ou_result_size;
+	lbuf->lb_buf = update_result->our_data;
+	if (lbuf->lb_len == 0)
+		lbuf->lb_buf = NULL;
+
+	dt_read_lock(env, obj, DT_TGT_CHILD);
+	rc = dt_xattr_list(env, obj, lbuf);
+	dt_read_unlock(env, obj);
+	if (rc <= 0) {
+		lbuf->lb_len = 0;
+		if (unlikely(!rc))
+			rc = -ENODATA;
+	} else if (lbuf->lb_buf) {
+		lbuf->lb_len = rc;
+	}
+
+	CDEBUG(D_INFO, "%s: "DFID" list xattr len %d\n",
+	       tgt_name(tsi->tsi_tgt), PFID(lu_object_fid(&obj->do_lu)), rc);
+
+	/* Since we directly use update_result->our_data as lbuf->lb_buf,
+	 * pass NULL to result_insert to avoid an unnecessary memory copy. */
+	object_update_result_insert(reply, NULL, lbuf->lb_len, idx, rc);
+
+	RETURN(0);
+}
+
+static int out_index_lookup(struct tgt_session_info *tsi)
+{
+	const struct lu_env	*env = tsi->tsi_env;
+	struct tgt_thread_info	*tti = tgt_th_info(env);
+	struct object_update	*update = tti->tti_u.update.tti_update;
+	struct dt_object	*obj = tti->tti_u.update.tti_dt_object;
+	char			*name;
+	int			 rc;
+
+	ENTRY;
+
+	if (unlikely(update->ou_result_size < sizeof(tti->tti_fid1)))
+		return -EPROTO;
+
+	if (!lu_object_exists(&obj->do_lu))
+		RETURN(-ENOENT);
+
+	name = object_update_param_get(update, 0, NULL);
+	if (IS_ERR(name)) {
+		CERROR("%s: empty name for lookup: rc = %ld\n",
+		       tgt_name(tsi->tsi_tgt), PTR_ERR(name));
+		RETURN(PTR_ERR(name));
+	}
+
+	dt_read_lock(env, obj, DT_TGT_CHILD);
+	if (!dt_try_as_dir(env, obj))
+		GOTO(out_unlock, rc = -ENOTDIR);
+
+	rc = dt_lookup(env, obj, (struct dt_rec *)&tti->tti_fid1,
+		       (struct dt_key *)name);
+
+	if (rc < 0)
+		GOTO(out_unlock, rc);
+
+	if (rc == 0)
+		rc += 1;
+
+out_unlock:
+	dt_read_unlock(env, obj);
+
+	CDEBUG(D_INFO, "lookup "DFID" %s get "DFID" rc %d\n",
+	       PFID(lu_object_fid(&obj->do_lu)), name,
+	       PFID(&tti->tti_fid1), rc);
+
+	CDEBUG(D_INFO, "%s: insert lookup reply %p index %d: rc = %d\n",
+	       tgt_name(tsi->tsi_tgt), tti->tti_u.update.tti_update_reply,
+	       0, rc);
+
+	object_update_result_insert(tti->tti_u.update.tti_update_reply,
+				    &tti->tti_fid1, sizeof(tti->tti_fid1),
+				    tti->tti_u.update.tti_update_reply_index,
+				    rc);
+	RETURN(rc);
+}
+
+static int out_xattr_set(struct tgt_session_info *tsi)
+{
+	struct tgt_thread_info	*tti = tgt_th_info(tsi->tsi_env);
+	struct object_update	*update = tti->tti_u.update.tti_update;
+	struct dt_object	*obj = tti->tti_u.update.tti_dt_object;
+	struct lu_buf		*lbuf = &tti->tti_buf;
+	char			*name;
+	char			*buf;
+	__u32			*tmp;
+	size_t			 buf_len = 0;
+	int			 flag;
+	size_t			 size = 0;
+	int			 rc;
+	ENTRY;
+
+	name = object_update_param_get(update, 0, NULL);
+	if (IS_ERR(name)) {
+		CERROR("%s: empty name for xattr set: rc = %ld\n",
+		       tgt_name(tsi->tsi_tgt), PTR_ERR(name));
+		RETURN(PTR_ERR(name));
+	}
+
+	/* If buffer == NULL (-ENODATA), then it might mean delete xattr */
+	buf = object_update_param_get(update, 1, &buf_len);
+	if (IS_ERR(buf) && PTR_ERR(buf) != -ENODATA)
+		RETURN(PTR_ERR(buf));
+
+	lbuf->lb_buf = buf;
+	lbuf->lb_len = buf_len;
+
+	tmp = object_update_param_get(update, 2, &size);
+	if (IS_ERR(tmp) || size != sizeof(*tmp)) {
+		CERROR("%s: empty or wrong size %zu flag: rc = %ld\n",
+		       tgt_name(tsi->tsi_tgt), size, PTR_ERR(tmp));
+		RETURN(PTR_ERR(tmp));
+	}
+
+	if (req_capsule_req_need_swab(tsi->tsi_pill))
+		__swab32s(tmp);
+	flag = *tmp;
+
+	rc = out_tx_xattr_set(tsi->tsi_env, obj, lbuf, name, flag,
+			      &tti->tti_tea, tti->tti_tea.ta_handle,
+			      tti->tti_u.update.tti_update_reply,
+			      tti->tti_u.update.tti_update_reply_index);
+	RETURN(rc);
+}
+
+static int
out_xattr_del(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct object_update *update = tti->tti_u.update.tti_update; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + char *name; + int rc; + ENTRY; + + name = object_update_param_get(update, 0, NULL); + if (IS_ERR(name)) { + CERROR("%s: empty name for xattr set: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(name)); + RETURN(PTR_ERR(name)); + } + + rc = out_tx_xattr_del(tsi->tsi_env, obj, name, &tti->tti_tea, + tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + RETURN(rc); +} + +/** + * increase ref of the object + **/ +static int out_ref_add(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + int rc; + + ENTRY; + + rc = out_tx_ref_add(tsi->tsi_env, obj, &tti->tti_tea, + tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + RETURN(rc); +} + +static int out_ref_del(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + int rc; + + ENTRY; + + if (!lu_object_exists(&obj->do_lu)) + RETURN(-ENOENT); + + rc = out_tx_ref_del(tsi->tsi_env, obj, &tti->tti_tea, + tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + RETURN(rc); +} + +static int out_index_insert(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct object_update *update = tti->tti_u.update.tti_update; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + struct dt_insert_rec *rec = &tti->tti_rec; + struct lu_fid *fid; + char *name; + __u32 *ptype; + int rc = 0; + size_t size; + ENTRY; + + name = object_update_param_get(update, 0, NULL); + if (IS_ERR(name)) { + CERROR("%s: empty name for index insert: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(name)); + RETURN(PTR_ERR(name)); + } + + fid = object_update_param_get(update, 1, &size); + if (IS_ERR(fid) || size != sizeof(*fid)) { + CERROR("%s: invalid fid: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(fid)); + RETURN(PTR_ERR(fid)); + } + + if (req_capsule_req_need_swab(tsi->tsi_pill)) + lustre_swab_lu_fid(fid); + + if (!fid_is_sane(fid)) { + CERROR("%s: invalid FID "DFID": rc = %d\n", + tgt_name(tsi->tsi_tgt), PFID(fid), -EPROTO); + RETURN(-EPROTO); + } + + ptype = object_update_param_get(update, 2, &size); + if (IS_ERR(ptype) || size != sizeof(*ptype)) { + CERROR("%s: invalid type for index insert: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(ptype)); + RETURN(PTR_ERR(ptype)); + } + + if (req_capsule_req_need_swab(tsi->tsi_pill)) + __swab32s(ptype); + + rec->rec_fid = fid; + rec->rec_type = *ptype; + + rc = out_tx_index_insert(tsi->tsi_env, obj, (const struct dt_rec *)rec, + (const struct dt_key *)name, &tti->tti_tea, + tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + + CDEBUG(D_INFO, "%s: "DFID" index insert %s: rc = %d\n", + tgt_name(tsi->tsi_tgt), PFID(lu_object_fid(&obj->do_lu)), + name, rc); + + RETURN(rc); +} + +static int out_index_delete(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct object_update *update = tti->tti_u.update.tti_update; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + char *name; + int rc = 0; + + if 
(!lu_object_exists(&obj->do_lu)) + RETURN(-ENOENT); + + name = object_update_param_get(update, 0, NULL); + if (IS_ERR(name)) { + CERROR("%s: empty name for index delete: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(name)); + RETURN(PTR_ERR(name)); + } + + rc = out_tx_index_delete(tsi->tsi_env, obj, (const struct dt_key *)name, + &tti->tti_tea, tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + RETURN(rc); +} + +static int out_destroy(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct object_update *update = tti->tti_u.update.tti_update; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + struct lu_fid *fid; + int rc; + ENTRY; + + fid = &update->ou_fid; + if (!fid_is_sane(fid)) { + CERROR("%s: invalid FID "DFID": rc = %d\n", + tgt_name(tsi->tsi_tgt), PFID(fid), -EPROTO); + RETURN(-EPROTO); + } + + if (!lu_object_exists(&obj->do_lu)) + RETURN(-ENOENT); + + rc = out_tx_destroy(tsi->tsi_env, obj, &tti->tti_tea, + tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + + RETURN(rc); +} + +static int out_write(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct object_update *update = tti->tti_u.update.tti_update; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + struct lu_buf *lbuf = &tti->tti_buf; + char *buf; + __u64 *tmp; + size_t size = 0; + size_t buf_len = 0; + loff_t pos; + int rc; + ENTRY; + + buf = object_update_param_get(update, 0, &buf_len); + if (IS_ERR(buf) || buf_len == 0) { + CERROR("%s: empty buf for xattr set: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(buf)); + RETURN(PTR_ERR(buf)); + } + lbuf->lb_buf = buf; + lbuf->lb_len = buf_len; + + tmp = object_update_param_get(update, 1, &size); + if (IS_ERR(tmp) || size != sizeof(*tmp)) { + CERROR("%s: empty or wrong size %zu pos: rc = %ld\n", + tgt_name(tsi->tsi_tgt), size, PTR_ERR(tmp)); + RETURN(PTR_ERR(tmp)); + } + + if (req_capsule_req_need_swab(tsi->tsi_pill)) + __swab64s(tmp); + pos = *tmp; + + rc = out_tx_write(tsi->tsi_env, obj, lbuf, pos, + &tti->tti_tea, tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + RETURN(rc); +} + +static int out_read(struct tgt_session_info *tsi) +{ + const struct lu_env *env = tsi->tsi_env; + struct tgt_thread_info *tti = tgt_th_info(env); + struct object_update *update = tti->tti_u.update.tti_update; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + struct object_update_reply *reply = tti->tti_u.update.tti_update_reply; + int index = tti->tti_u.update.tti_update_reply_index; + struct lu_rdbuf *rdbuf; + struct object_update_result *update_result; + struct out_read_reply *orr; + void *tmp; + size_t size; + size_t total_size = 0; + __u64 pos; + unsigned int i; + unsigned int nbufs; + int rc = 0; + ENTRY; + + update_result = object_update_result_get(reply, index, NULL); + LASSERT(update_result != NULL); + update_result->our_datalen = sizeof(*orr); + + if (!lu_object_exists(&obj->do_lu)) + GOTO(out, rc = -ENOENT); + + tmp = object_update_param_get(update, 0, NULL); + if (IS_ERR(tmp)) { + CERROR("%s: empty size for read: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(tmp)); + GOTO(out, rc = PTR_ERR(tmp)); + } + size = le64_to_cpu(*(size_t *)(tmp)); + + tmp = object_update_param_get(update, 1, NULL); + if (IS_ERR(tmp)) { + CERROR("%s: empty pos for read: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(tmp)); 
+		GOTO(out, rc = PTR_ERR(tmp));
+	}
+	pos = le64_to_cpu(*(__u64 *)(tmp));
+
+	/* Put the offset into the beginning of the buffer in reply */
+	orr = (struct out_read_reply *)update_result->our_data;
+
+	nbufs = (size + OUT_BULK_BUFFER_SIZE - 1) / OUT_BULK_BUFFER_SIZE;
+	OBD_ALLOC(rdbuf, sizeof(*rdbuf) + nbufs * sizeof(rdbuf->rb_bufs[0]));
+	if (rdbuf == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	rdbuf->rb_nbufs = 0;
+	total_size = 0;
+	for (i = 0; i < nbufs; i++) {
+		__u32 read_size;
+
+		read_size = size > OUT_BULK_BUFFER_SIZE ?
+			    OUT_BULK_BUFFER_SIZE : size;
+		OBD_ALLOC(rdbuf->rb_bufs[i].lb_buf, read_size);
+		if (rdbuf->rb_bufs[i].lb_buf == NULL)
+			GOTO(out_free, rc = -ENOMEM);
+
+		rdbuf->rb_bufs[i].lb_len = read_size;
+		dt_read_lock(env, obj, DT_TGT_CHILD);
+		rc = dt_read(env, obj, &rdbuf->rb_bufs[i], &pos);
+		dt_read_unlock(env, obj);
+
+		total_size += rc < 0 ? 0 : rc;
+		if (rc <= 0)
+			break;
+
+		rdbuf->rb_nbufs++;
+		size -= read_size;
+	}
+
+	/* send pages to client */
+	rc = tgt_send_buffer(tsi, rdbuf);
+	if (rc < 0)
+		GOTO(out_free, rc);
+
+	orr->orr_size = total_size;
+	orr->orr_offset = pos;
+
+	orr_cpu_to_le(orr, orr);
+	update_result->our_datalen += orr->orr_size;
+out_free:
+	for (i = 0; i < nbufs; i++) {
+		if (rdbuf->rb_bufs[i].lb_buf != NULL) {
+			OBD_FREE(rdbuf->rb_bufs[i].lb_buf,
+				 rdbuf->rb_bufs[i].lb_len);
+		}
+	}
+	OBD_FREE(rdbuf, sizeof(*rdbuf) +
+			nbufs * sizeof(rdbuf->rb_bufs[0]));
+out:
+	/* Insert read buffer */
+	update_result->our_rc = ptlrpc_status_hton(rc);
+	reply->ourp_lens[index] = cfs_size_round(update_result->our_datalen +
+						 sizeof(*update_result));
+	RETURN(rc);
+}
+
+static int out_noop(struct tgt_session_info *tsi)
+{
+	return 0;
+}
+
+#define DEF_OUT_HNDL(opc, name, flags, fn)	\
+[opc - OUT_CREATE] = {				\
+	.th_name	= name,			\
+	.th_fail_id	= 0,			\
+	.th_opc		= opc,			\
+	.th_flags	= flags,		\
+	.th_act		= fn,			\
+	.th_fmt		= NULL,			\
+	.th_version	= 0,			\
+}
+
+static struct tgt_handler out_update_ops[] = {
+	DEF_OUT_HNDL(OUT_CREATE, "out_create", IS_MUTABLE | HAS_REPLY,
+		     out_create),
+	DEF_OUT_HNDL(OUT_DESTROY, "out_destroy", IS_MUTABLE | HAS_REPLY,
+		     out_destroy),
+	DEF_OUT_HNDL(OUT_REF_ADD, "out_ref_add", IS_MUTABLE | HAS_REPLY,
+		     out_ref_add),
+	DEF_OUT_HNDL(OUT_REF_DEL, "out_ref_del", IS_MUTABLE | HAS_REPLY,
+		     out_ref_del),
+	DEF_OUT_HNDL(OUT_ATTR_SET, "out_attr_set", IS_MUTABLE | HAS_REPLY,
+		     out_attr_set),
+	DEF_OUT_HNDL(OUT_ATTR_GET, "out_attr_get", HAS_REPLY,
+		     out_attr_get),
+	DEF_OUT_HNDL(OUT_XATTR_SET, "out_xattr_set", IS_MUTABLE | HAS_REPLY,
+		     out_xattr_set),
+	DEF_OUT_HNDL(OUT_XATTR_DEL, "out_xattr_del", IS_MUTABLE | HAS_REPLY,
+		     out_xattr_del),
+	DEF_OUT_HNDL(OUT_XATTR_GET, "out_xattr_get", HAS_REPLY,
+		     out_xattr_get),
+	DEF_OUT_HNDL(OUT_INDEX_LOOKUP, "out_index_lookup", HAS_REPLY,
+		     out_index_lookup),
+	DEF_OUT_HNDL(OUT_INDEX_INSERT, "out_index_insert",
+		     IS_MUTABLE | HAS_REPLY, out_index_insert),
+	DEF_OUT_HNDL(OUT_INDEX_DELETE, "out_index_delete",
+		     IS_MUTABLE | HAS_REPLY, out_index_delete),
+	DEF_OUT_HNDL(OUT_WRITE, "out_write", IS_MUTABLE | HAS_REPLY, out_write),
+	DEF_OUT_HNDL(OUT_READ, "out_read", HAS_REPLY, out_read),
+	DEF_OUT_HNDL(OUT_NOOP, "out_noop", HAS_REPLY, out_noop),
+	DEF_OUT_HNDL(OUT_XATTR_LIST, "out_xattr_list", HAS_REPLY,
+		     out_xattr_list),
+};
+
+static struct tgt_handler *out_handler_find(__u32 opc)
+{
+	struct tgt_handler *h;
+
+	h = NULL;
+	if (OUT_CREATE <= opc && opc < OUT_LAST) {
+		h = &out_update_ops[opc - OUT_CREATE];
+		LASSERTF(h->th_opc == opc, "opcode mismatch %d != %d\n",
+			 h->th_opc, opc);
+	} else {
+		h = NULL; /* unsupported opc */
+	}
+	return h;
+}
+
+static int out_tx_start(const struct lu_env *env, struct dt_device *dt,
+			struct thandle_exec_args *ta, struct obd_export *exp)
+{
+	ta->ta_argno = 0;
+	ta->ta_handle = dt_trans_create(env, dt);
+	if (IS_ERR(ta->ta_handle)) {
+		int rc;
+
+		rc = PTR_ERR(ta->ta_handle);
+		ta->ta_handle = NULL;
+		CERROR("%s: start handle error: rc = %d\n", dt_obd_name(dt),
+		       rc);
+		return rc;
+	}
+	if (exp->exp_need_sync)
+		ta->ta_handle->th_sync = 1;
+
+	return 0;
+}
+
+static int out_trans_start(const struct lu_env *env,
+			   struct thandle_exec_args *ta)
+{
+	return dt_trans_start(env, ta->ta_handle->th_dev, ta->ta_handle);
+}
+
+static int out_trans_stop(const struct lu_env *env,
+			  struct thandle_exec_args *ta, int err)
+{
+	int i;
+	int rc;
+
+	ta->ta_handle->th_result = err;
+	rc = dt_trans_stop(env, ta->ta_handle->th_dev, ta->ta_handle);
+	for (i = 0; i < ta->ta_argno; i++) {
+		if (ta->ta_args[i]->object != NULL) {
+			dt_object_put(env, ta->ta_args[i]->object);
+			ta->ta_args[i]->object = NULL;
+		}
+	}
+	ta->ta_handle = NULL;
+	ta->ta_argno = 0;
+
+	return rc;
+}
+
+static int out_tx_end(const struct lu_env *env, struct thandle_exec_args *ta,
+		      int declare_ret)
+{
+	struct tgt_session_info *tsi = tgt_ses_info(env);
+	int i;
+	int rc;
+	int rc1;
+	ENTRY;
+
+	if (ta->ta_handle == NULL)
+		RETURN(0);
+
+	if (declare_ret != 0 || ta->ta_argno == 0)
+		GOTO(stop, rc = declare_ret);
+
+	LASSERT(ta->ta_handle->th_dev != NULL);
+	rc = out_trans_start(env, ta);
+	if (unlikely(rc != 0))
+		GOTO(stop, rc);
+
+	for (i = 0; i < ta->ta_argno; i++) {
+		rc = ta->ta_args[i]->exec_fn(env, ta->ta_handle,
+					     ta->ta_args[i]);
+		if (unlikely(rc != 0)) {
+			CDEBUG(D_INFO, "error during execution of #%u from"
+			       " %s:%d: rc = %d\n", i, ta->ta_args[i]->file,
+			       ta->ta_args[i]->line, rc);
+			while (--i >= 0) {
+				if (ta->ta_args[i]->undo_fn != NULL)
+					ta->ta_args[i]->undo_fn(env,
+							ta->ta_handle,
+							ta->ta_args[i]);
+				else
+					CERROR("%s: undo for %s:%d: rc = %d\n",
+					       dt_obd_name(ta->ta_handle->th_dev),
+					       ta->ta_args[i]->file,
+					       ta->ta_args[i]->line, -ENOTSUPP);
+			}
+			break;
+		}
+		CDEBUG(D_INFO, "%s: executed %u/%u: rc = %d\n",
+		       dt_obd_name(ta->ta_handle->th_dev), i, ta->ta_argno, rc);
+	}
+
+	/* Only fail for real updates. XXX: right now llog updates are
+	 * ignored (their update count is usually 1), so the failover test
+	 * case can spot this FAIL_UPDATE_NET_REP precisely; this will be
+	 * removed after the async update patch is landed. */
+	if (ta->ta_argno > 1)
+		tsi->tsi_reply_fail_id = OBD_FAIL_OUT_UPDATE_NET_REP;
+
+stop:
+	rc1 = out_trans_stop(env, ta, rc);
+	if (rc == 0)
+		rc = rc1;
+
+	ta->ta_handle = NULL;
+	ta->ta_argno = 0;
+
+	RETURN(rc);
+}
+
+/**
+ * Object updates between targets. Because all the updates have been
+ * disassembled into object updates on the sender side, OUT calls the
+ * OSD API directly to execute them.
+ *
+ * In DNE phase I, all of the updates in a request need to be executed
+ * in one transaction, and the transaction has to be synchronous.
+ *
+ * Please refer to lustre/include/lustre/lustre_idl.h for the req/reply
+ * format.
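+ *
+ * (Editor's sketch, not part of this change: the execution loop in
+ * out_handle() below opens a transaction lazily at the first mutable
+ * update and closes it whenever the batch id changes or a read-only
+ * update is met. With tx_open()/tx_close() standing in for
+ * out_tx_start()/out_tx_end(), the control flow is roughly:
+ *
+ *	struct upd { int batchid; int is_mutable; };
+ *
+ *	static int tx_open(void)  { return 0; }	// out_tx_start()
+ *	static int tx_close(void) { return 0; }	// out_tx_end()
+ *
+ *	static int run(const struct upd *u, int n)
+ *	{
+ *		int cur = -1;	// batch id of the open transaction, if any
+ *		int i, rc = 0;
+ *
+ *		for (i = 0; i < n && rc == 0; i++) {
+ *			if (cur != -1 && (cur != u[i].batchid ||
+ *					  !u[i].is_mutable)) {
+ *				rc = tx_close();	// flush old batch
+ *				cur = -1;
+ *			}
+ *			if (rc == 0 && u[i].is_mutable && cur == -1) {
+ *				rc = tx_open();		// start new batch
+ *				cur = u[i].batchid;
+ *			}
+ *			// ... execute the update handler here ...
+ *		}
+ *		if (cur != -1 && rc == 0)
+ *			rc = tx_close();
+ *		return rc;
+ *	}
+ *
+ * The real loop also handles resend reconstruction, read-only devices
+ * and sync flags on top of this outline.)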
+ */ +int out_handle(struct tgt_session_info *tsi) +{ + const struct lu_env *env = tsi->tsi_env; + struct tgt_thread_info *tti = tgt_th_info(env); + struct thandle_exec_args *ta = &tti->tti_tea; + struct req_capsule *pill = tsi->tsi_pill; + struct dt_device *dt = tsi->tsi_tgt->lut_bottom; + struct out_update_header *ouh; + struct out_update_buffer *oub = NULL; + struct object_update *update; + struct object_update_reply *reply; + struct ptlrpc_bulk_desc *desc = NULL; + void **update_bufs; + int current_batchid = -1; + __u32 update_buf_count; + unsigned int i; + unsigned int reply_index = 0; + int rc = 0; + int rc1 = 0; + int ouh_size, reply_size; + int updates; + bool need_reconstruct; + + ENTRY; + + req_capsule_set(pill, &RQF_OUT_UPDATE); + ouh_size = req_capsule_get_size(pill, &RMF_OUT_UPDATE_HEADER, + RCL_CLIENT); + if (ouh_size <= 0) + RETURN(err_serious(-EPROTO)); + + ouh = req_capsule_client_get(pill, &RMF_OUT_UPDATE_HEADER); + if (ouh == NULL) + RETURN(err_serious(-EPROTO)); + + if (ouh->ouh_magic != OUT_UPDATE_HEADER_MAGIC) { + CERROR("%s: invalid update buffer magic %x expect %x: " + "rc = %d\n", tgt_name(tsi->tsi_tgt), ouh->ouh_magic, + UPDATE_REQUEST_MAGIC, -EPROTO); + RETURN(err_serious(-EPROTO)); + } + + update_buf_count = ouh->ouh_count; + if (update_buf_count == 0) + RETURN(err_serious(-EPROTO)); + + OBD_ALLOC_PTR_ARRAY(update_bufs, update_buf_count); + if (update_bufs == NULL) + RETURN(err_serious(-ENOMEM)); + + if (ouh->ouh_inline_length > 0) { + update_bufs[0] = ouh->ouh_inline_data; + } else { + struct out_update_buffer *tmp; + int page_count = 0; + + oub = req_capsule_client_get(pill, &RMF_OUT_UPDATE_BUF); + if (oub == NULL) + GOTO(out_free, rc = err_serious(-EPROTO)); + + for (i = 0; i < update_buf_count; i++) + /* First *and* last might be partial pages, hence +1 */ + page_count += DIV_ROUND_UP(oub[i].oub_size, + PAGE_SIZE) + 1; + + desc = ptlrpc_prep_bulk_exp(pill->rc_req, page_count, + PTLRPC_BULK_OPS_COUNT, + PTLRPC_BULK_GET_SINK, + MDS_BULK_PORTAL, + &ptlrpc_bulk_kiov_nopin_ops); + if (desc == NULL) + GOTO(out_free, rc = err_serious(-ENOMEM)); + + tmp = oub; + for (i = 0; i < update_buf_count; i++, tmp++) { + if (tmp->oub_size >= OUT_MAXREQSIZE) + GOTO(out_free, rc = err_serious(-EPROTO)); + + OBD_ALLOC_LARGE(update_bufs[i], tmp->oub_size); + if (update_bufs[i] == NULL) + GOTO(out_free, rc = err_serious(-ENOMEM)); + + desc->bd_frag_ops->add_iov_frag(desc, update_bufs[i], + tmp->oub_size); + } + + pill->rc_req->rq_bulk_write = 1; + rc = sptlrpc_svc_prep_bulk(pill->rc_req, desc); + if (rc != 0) + GOTO(out_free, rc = err_serious(rc)); + + rc = target_bulk_io(pill->rc_req->rq_export, desc); + if (rc < 0) + GOTO(out_free, rc = err_serious(rc)); + } + /* validate the request and calculate the total update count and + * set it to reply */ + reply_size = 0; + updates = 0; + for (i = 0; i < update_buf_count; i++) { + struct object_update_request *our; + int j; + + our = update_bufs[i]; + if (req_capsule_req_need_swab(pill)) + lustre_swab_object_update_request(our, 0); + + if (our->ourq_magic != UPDATE_REQUEST_MAGIC) { + CERROR("%s: invalid update buffer magic %x" + " expect %x: rc = %d\n", + tgt_name(tsi->tsi_tgt), our->ourq_magic, + UPDATE_REQUEST_MAGIC, -EPROTO); + GOTO(out_free, rc = err_serious(-EPROTO)); + } + updates += our->ourq_count; + + /* need to calculate reply size */ + for (j = 0; j < our->ourq_count; j++) { + update = object_update_request_get(our, j, NULL); + if (update == NULL) + GOTO(out, rc = err_serious(-EPROTO)); + if (req_capsule_req_need_swab(pill)) 
+ lustre_swab_object_update(update); + + if (!fid_is_sane(&update->ou_fid)) { + CERROR("%s: invalid FID "DFID": rc = %d\n", + tgt_name(tsi->tsi_tgt), + PFID(&update->ou_fid), -EPROTO); + GOTO(out, rc = err_serious(-EPROTO)); + } + + /* XXX: what ou_result_size can be considered safe? */ + + reply_size += sizeof(reply->ourp_lens[0]); + reply_size += sizeof(struct object_update_result); + reply_size += update->ou_result_size; + } + } + reply_size += sizeof(*reply); + + if (unlikely(reply_size > ouh->ouh_reply_size)) { + CERROR("%s: too small reply buf %u for %u, need %u at least\n", + tgt_name(tsi->tsi_tgt), ouh->ouh_reply_size, + updates, reply_size); + GOTO(out_free, rc = err_serious(-EPROTO)); + } + + req_capsule_set_size(pill, &RMF_OUT_UPDATE_REPLY, RCL_SERVER, + ouh->ouh_reply_size); + rc = req_capsule_server_pack(pill); + if (rc != 0) { + CERROR("%s: Can't pack response: rc = %d\n", + tgt_name(tsi->tsi_tgt), rc); + GOTO(out_free, rc = err_serious(-EPROTO)); + } + + /* Prepare the update reply buffer */ + reply = req_capsule_server_get(pill, &RMF_OUT_UPDATE_REPLY); + if (reply == NULL) + GOTO(out_free, rc = -EPROTO); + reply->ourp_magic = UPDATE_REPLY_MAGIC; + reply->ourp_count = updates; + tti->tti_u.update.tti_update_reply = reply; + tti->tti_mult_trans = !req_is_replay(tgt_ses_req(tsi)); + + need_reconstruct = out_check_resent(pill->rc_req); + + /* Walk through updates in the request to execute them */ + for (i = 0; i < update_buf_count; i++) { + struct tgt_handler *h; + struct dt_object *dt_obj; + int update_count; + struct object_update_request *our; + int j; + + our = update_bufs[i]; + update_count = our->ourq_count; + for (j = 0; j < update_count; j++) { + struct lu_object_conf conf; + + update = object_update_request_get(our, j, NULL); + if (update->ou_type == OUT_CREATE) + conf.loc_flags = LOC_F_NEW; + else + conf.loc_flags = 0; + + dt_obj = dt_locate_at(env, dt, &update->ou_fid, + dt->dd_lu_dev.ld_site->ls_top_dev, &conf); + if (IS_ERR(dt_obj)) + GOTO(out, rc = PTR_ERR(dt_obj)); + + if (dt->dd_record_fid_accessed) { + struct lfsck_req_local *lrl = &tti->tti_lrl; + + lfsck_pack_rfa(lrl, + lu_object_fid(&dt_obj->do_lu), + LEL_FID_ACCESSED, + LFSCK_TYPE_LAYOUT); + tgt_lfsck_in_notify_local(env, dt, lrl, NULL); + } + + tti->tti_u.update.tti_dt_object = dt_obj; + tti->tti_u.update.tti_update = update; + tti->tti_u.update.tti_update_reply_index = reply_index; + + h = out_handler_find(update->ou_type); + if (unlikely(h == NULL)) { + CERROR("%s: unsupported opc: 0x%x\n", + tgt_name(tsi->tsi_tgt), update->ou_type); + GOTO(next, rc = -ENOTSUPP); + } + + /* Check resend case only for modifying RPC */ + if (h->th_flags & IS_MUTABLE) { + /* sanity check for last XID changing */ + if (unlikely(!need_reconstruct && + req_xid_is_last(pill->rc_req))) { + DEBUG_REQ(D_ERROR, pill->rc_req, + "unexpected last XID change"); + GOTO(next, rc = -EINVAL); + } + + if (need_reconstruct) { + out_reconstruct(env, dt, dt_obj, reply, + reply_index); + GOTO(next, rc = 0); + } + + if (dt->dd_rdonly) + GOTO(next, rc = -EROFS); + } + + /* start transaction for modification RPC only */ + if (h->th_flags & IS_MUTABLE && current_batchid == -1) { + current_batchid = update->ou_batchid; + + if (reply_index == 0) + CFS_RACE(OBD_FAIL_PTLRPC_RESEND_RACE); + + rc = out_tx_start(env, dt, ta, tsi->tsi_exp); + if (rc != 0) + GOTO(next, rc); + + if (update->ou_flags & UPDATE_FL_SYNC) + ta->ta_handle->th_sync = 1; + } + + /* Stop the current update transaction, if the update + * has different batchid, or read-only update */ + 
if (((current_batchid != update->ou_batchid) || + !(h->th_flags & IS_MUTABLE)) && + ta->ta_handle != NULL) { + rc = out_tx_end(env, ta, rc); + current_batchid = -1; + if (rc != 0) + GOTO(next, rc); + + /* start a new transaction if needed */ + if (h->th_flags & IS_MUTABLE) { + rc = out_tx_start(env, dt, ta, + tsi->tsi_exp); + if (rc != 0) + GOTO(next, rc); + if (update->ou_flags & UPDATE_FL_SYNC) + ta->ta_handle->th_sync = 1; + current_batchid = update->ou_batchid; + } + } + + rc = h->th_act(tsi); +next: + reply_index++; + dt_object_put(env, dt_obj); + if (rc < 0) + GOTO(out, rc); + } + } +out: + if (current_batchid != -1) { + rc1 = out_tx_end(env, ta, rc); + if (rc == 0) + rc = rc1; + } + +out_free: + if (update_bufs != NULL) { + if (oub != NULL) { + for (i = 0; i < update_buf_count; i++, oub++) { + if (update_bufs[i] != NULL) + OBD_FREE_LARGE(update_bufs[i], + oub->oub_size); + } + } + + OBD_FREE_PTR_ARRAY(update_bufs, update_buf_count); + } + + if (desc != NULL) + ptlrpc_free_bulk(desc); + + RETURN(rc); +} + +struct tgt_handler tgt_out_handlers[] = { +TGT_UPDATE_HDL(IS_MUTABLE, OUT_UPDATE, out_handle), +}; +EXPORT_SYMBOL(tgt_out_handlers); + diff --git a/drivers/staging/lustrefsx/lustre/target/out_lib.c b/drivers/staging/lustrefsx/lustre/target/out_lib.c new file mode 100644 index 0000000000000..5a0c0da4769bc --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/out_lib.c @@ -0,0 +1,1276 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2014, 2017, Intel Corporation. + */ +/* + * lustre/target/out_lib.c + * + * Author: Di Wang + * Author: Fan, Yong + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include + +#include "tgt_internal.h" + +const char *update_op_str(__u16 opc) +{ + static const char *const opc_str[] = { + [OUT_START] = "start", + [OUT_CREATE] = "create", + [OUT_DESTROY] = "destroy", + [OUT_REF_ADD] = "ref_add", + [OUT_REF_DEL] = "ref_del" , + [OUT_ATTR_SET] = "attr_set", + [OUT_ATTR_GET] = "attr_get", + [OUT_XATTR_SET] = "xattr_set", + [OUT_XATTR_GET] = "xattr_get", + [OUT_XATTR_LIST] = "xattr_list", + [OUT_INDEX_LOOKUP] = "lookup", + [OUT_INDEX_INSERT] = "insert", + [OUT_INDEX_DELETE] = "delete", + [OUT_WRITE] = "write", + [OUT_XATTR_DEL] = "xattr_del", + [OUT_PUNCH] = "punch", + [OUT_READ] = "read", + [OUT_NOOP] = "noop", + }; + + if (opc < ARRAY_SIZE(opc_str) && opc_str[opc] != NULL) + return opc_str[opc]; + else + return "unknown"; +} +EXPORT_SYMBOL(update_op_str); + +/** + * Fill object update header + * + * Only fill the object update header, and parameters will be filled later + * in other functions. 
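+ *
+ * (Editor's illustration, not part of this change: the -E2BIG check in
+ * this function amounts to the size computation below, where round8()
+ * mirrors cfs_size_round() on the assumption of 8-byte wire alignment:
+ *
+ *	#include <stddef.h>
+ *	#include <stdint.h>
+ *
+ *	static size_t round8(size_t len)
+ *	{
+ *		return (len + 7) & ~(size_t)7;
+ *	}
+ *
+ *	// fixed header plus one rounded slot per parameter
+ *	static size_t update_size(size_t hdr, size_t param_hdr,
+ *				  const uint16_t *lens, unsigned int n)
+ *	{
+ *		size_t sz = hdr;
+ *		unsigned int i;
+ *
+ *		for (i = 0; i < n; i++)
+ *			sz += round8(param_hdr + lens[i]);
+ *		return sz;	// caller fails with -E2BIG if sz >= max
+ *	}
+ *
+ * On -E2BIG the function also writes the required size back through
+ * *max_update_size so the caller can retry with a larger buffer.)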
+ *
+ * \param[in] env execution environment
+ * \param[in] update object update to be filled
+ * \param[in,out] max_update_size maximum object update size; if the
+ * current update length equals or
+ * exceeds the size, it will return -E2BIG.
+ * \param[in] update_op update type
+ * \param[in] fid object FID of the update
+ * \param[in] param_count the count of the update parameters
+ * \param[in] param_sizes the length of each parameter
+ *
+ * \retval 0 if packing succeeds.
+ * \retval -E2BIG if packing exceeds the maximum length.
+ */
+int out_update_header_pack(const struct lu_env *env,
+ struct object_update *update,
+ size_t *max_update_size,
+ enum update_type update_op,
+ const struct lu_fid *fid,
+ unsigned int param_count,
+ __u16 *param_sizes,
+ __u32 reply_size)
+{
+ struct object_update_param *param;
+ unsigned int i;
+ size_t update_size;
+
+ if (reply_size >= LNET_MTU)
+ return -EINVAL;
+
+ /* Check whether the packing exceeds the maximum update length */
+ update_size = sizeof(*update);
+ for (i = 0; i < param_count; i++)
+ update_size += cfs_size_round(sizeof(*param) + param_sizes[i]);
+
+ if (unlikely(update_size >= *max_update_size)) {
+ *max_update_size = update_size;
+ return -E2BIG;
+ }
+
+ update->ou_fid = *fid;
+ update->ou_type = update_op;
+ update->ou_params_count = param_count;
+ update->ou_result_size = reply_size;
+ param = &update->ou_params[0];
+ for (i = 0; i < param_count; i++) {
+ param->oup_len = param_sizes[i];
+ param = (struct object_update_param *)((char *)param +
+ object_update_param_size(param));
+ }
+
+ return 0;
+}
+
+/**
+ * Pack one update into the update_buffer.
+ *
+ * \param[in] env execution environment
+ * \param[in] update update to be packed
+ * \param[in] max_update_size pointer to the maximum size of \a update
+ * \param[in] op update operation (enum update_type)
+ * \param[in] fid object FID for this update
+ * \param[in] param_count number of parameters for this update
+ * \param[in] param_sizes array of parameter lengths of this update
+ * \param[in] param_bufs parameter buffers
+ *
+ * \retval 0 if updates packing succeeds
+ * \retval negative errno if updates packing fails
+ **/
+int out_update_pack(const struct lu_env *env, struct object_update *update,
+ size_t *max_update_size, enum update_type op,
+ const struct lu_fid *fid, unsigned int param_count,
+ __u16 *param_sizes, const void **param_bufs,
+ __u32 reply_size)
+{
+ struct object_update_param *param;
+ unsigned int i;
+ int rc;
+ ENTRY;
+
+ rc = out_update_header_pack(env, update, max_update_size, op, fid,
+ param_count, param_sizes, reply_size);
+ if (rc != 0)
+ RETURN(rc);
+
+ param = &update->ou_params[0];
+ for (i = 0; i < param_count; i++) {
+ memcpy(&param->oup_buf[0], param_bufs[i], param_sizes[i]);
+ param = (struct object_update_param *)((char *)param +
+ object_update_param_size(param));
+ }
+
+ RETURN(0);
+}
+EXPORT_SYMBOL(out_update_pack);
+
+/**
+ * Pack various updates into the update_buffer.
+ *
+ * The following functions pack different updates into the update_buffer.
+ * Their parameters are basically the same as those of the corresponding
+ * OSD/OSP APIs; for a detailed description of these parameters see
+ * osd_handler.c or osp_md_object.c.
+ *
+ * \param[in] env execution environment
+ * \param[in] ubuf update buffer
+ * \param[in] fid fid of this object for the update
+ *
+ * \retval 0 if insertion succeeds.
+ * \retval negative errno if insertion fails.
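+ *
+ * A minimal usage sketch (hypothetical caller, buffer size illustrative):
+ *
+ *   size_t max = 4096;
+ *   rc = out_create_pack(env, update, &max, fid, &attr, NULL, &dof);
+ *
+ * On -E2BIG, max is updated to the size actually required, so the caller
+ * can reallocate a bigger buffer and retry.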
+ */ +int out_create_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const struct lu_attr *attr, struct dt_allocation_hint *hint, + struct dt_object_format *dof) +{ + struct obdo *obdo; + __u16 sizes[2] = {sizeof(*obdo), 0}; + int buf_count = 1; + const struct lu_fid *parent_fid = NULL; + int rc; + ENTRY; + + if (hint != NULL && hint->dah_parent) { + parent_fid = lu_object_fid(&hint->dah_parent->do_lu); + sizes[1] = sizeof(*parent_fid); + buf_count++; + } + + rc = out_update_header_pack(env, update, max_update_size, OUT_CREATE, + fid, buf_count, sizes, 0); + if (rc != 0) + RETURN(rc); + + obdo = object_update_param_get(update, 0, NULL); + if (IS_ERR(obdo)) + RETURN(PTR_ERR(obdo)); + + obdo->o_valid = 0; + obdo_from_la(obdo, attr, attr->la_valid); + + if (parent_fid != NULL) { + struct lu_fid *tmp; + + tmp = object_update_param_get(update, 1, NULL); + if (IS_ERR(tmp)) + RETURN(PTR_ERR(tmp)); + + fid_cpu_to_le(tmp, parent_fid); + } + + RETURN(0); +} +EXPORT_SYMBOL(out_create_pack); + +int out_ref_del_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid) +{ + return out_update_pack(env, update, max_update_size, OUT_REF_DEL, fid, + 0, NULL, NULL, 0); +} +EXPORT_SYMBOL(out_ref_del_pack); + +int out_ref_add_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid) +{ + return out_update_pack(env, update, max_update_size, OUT_REF_ADD, fid, + 0, NULL, NULL, 0); +} +EXPORT_SYMBOL(out_ref_add_pack); + +int out_attr_set_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const struct lu_attr *attr) +{ + struct obdo *obdo; + __u16 size = sizeof(*obdo); + int rc; + ENTRY; + + rc = out_update_header_pack(env, update, max_update_size, + OUT_ATTR_SET, fid, 1, &size, 0); + if (rc != 0) + RETURN(rc); + + obdo = object_update_param_get(update, 0, NULL); + if (IS_ERR(obdo)) + RETURN(PTR_ERR(obdo)); + + obdo->o_valid = 0; + obdo_from_la(obdo, attr, attr->la_valid); + + RETURN(0); +} +EXPORT_SYMBOL(out_attr_set_pack); + +int out_xattr_set_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const struct lu_buf *buf, const char *name, __u32 flag) +{ + __u16 sizes[3] = {strlen(name) + 1, buf->lb_len, sizeof(flag)}; + const void *bufs[3] = {(char *)name, (char *)buf->lb_buf, + (char *)&flag}; + + return out_update_pack(env, update, max_update_size, OUT_XATTR_SET, + fid, ARRAY_SIZE(sizes), sizes, bufs, 0); +} +EXPORT_SYMBOL(out_xattr_set_pack); + +int out_xattr_del_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const char *name) +{ + __u16 size = strlen(name) + 1; + + return out_update_pack(env, update, max_update_size, OUT_XATTR_DEL, + fid, 1, &size, (const void **)&name, 0); +} +EXPORT_SYMBOL(out_xattr_del_pack); + +int out_index_insert_pack(const struct lu_env *env, + struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const struct dt_rec *rec, const struct dt_key *key) +{ + struct dt_insert_rec *rec1 = (struct dt_insert_rec *)rec; + struct lu_fid rec_fid; + __u32 type = cpu_to_le32(rec1->rec_type); + __u16 sizes[3] = { strlen((char *)key) + 1, + sizeof(rec_fid), + sizeof(type) }; + const void *bufs[3] = { (char *)key, + (char *)&rec_fid, + (char *)&type }; + + fid_cpu_to_le(&rec_fid, rec1->rec_fid); + + 
return out_update_pack(env, update, max_update_size, OUT_INDEX_INSERT, + fid, ARRAY_SIZE(sizes), sizes, bufs, 0); +} +EXPORT_SYMBOL(out_index_insert_pack); + +int out_index_delete_pack(const struct lu_env *env, + struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const struct dt_key *key) +{ + __u16 size = strlen((char *)key) + 1; + const void *buf = key; + + return out_update_pack(env, update, max_update_size, OUT_INDEX_DELETE, + fid, 1, &size, &buf, 0); +} +EXPORT_SYMBOL(out_index_delete_pack); + +int out_destroy_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid) +{ + return out_update_pack(env, update, max_update_size, OUT_DESTROY, fid, + 0, NULL, NULL, 0); +} +EXPORT_SYMBOL(out_destroy_pack); + +int out_write_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const struct lu_buf *buf, __u64 pos) +{ + __u16 sizes[2] = {buf->lb_len, sizeof(pos)}; + const void *bufs[2] = {(char *)buf->lb_buf, (char *)&pos}; + int rc; + + pos = cpu_to_le64(pos); + + rc = out_update_pack(env, update, max_update_size, OUT_WRITE, fid, + ARRAY_SIZE(sizes), sizes, bufs, 0); + return rc; +} +EXPORT_SYMBOL(out_write_pack); + +/** + * Pack various readonly updates into the update_buffer. + * + * The following update funcs are only used by read-only ops, lookup, + * getattr etc, so it does not need transaction here. Currently they + * are only used by OSP. + * + * \param[in] env execution environment + * \param[in] fid fid of this object for the update + * \param[in] ubuf update buffer + * + * \retval = 0 pack succeed. + * < 0 pack failed. + **/ +int out_index_lookup_pack(const struct lu_env *env, + struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + struct dt_rec *rec, const struct dt_key *key) +{ + const void *name = key; + __u16 size = strlen((char *)name) + 1; + + /* XXX: this shouldn't be hardcoded */ + return out_update_pack(env, update, max_update_size, OUT_INDEX_LOOKUP, + fid, 1, &size, &name, 256); +} +EXPORT_SYMBOL(out_index_lookup_pack); + +int out_attr_get_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid) +{ + return out_update_pack(env, update, max_update_size, OUT_ATTR_GET, + fid, 0, NULL, NULL, sizeof(struct obdo)); +} +EXPORT_SYMBOL(out_attr_get_pack); + +int out_xattr_get_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const char *name, const int bufsize) +{ + __u16 size; + + LASSERT(name != NULL); + size = strlen(name) + 1; + + return out_update_pack(env, update, max_update_size, OUT_XATTR_GET, + fid, 1, &size, (const void **)&name, bufsize); +} +EXPORT_SYMBOL(out_xattr_get_pack); + +int out_xattr_list_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const int bufsize) +{ + return out_update_pack(env, update, max_update_size, OUT_XATTR_LIST, + fid, 0, NULL, NULL, bufsize); +} +EXPORT_SYMBOL(out_xattr_list_pack); + +int out_read_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + size_t size, loff_t pos) +{ + __u16 sizes[2] = {sizeof(size), sizeof(pos)}; + const void *bufs[2] = {&size, &pos}; + + LASSERT(size > 0); + size = cpu_to_le64(size); + pos = cpu_to_le64(pos); + + return out_update_pack(env, update, max_update_size, OUT_READ, fid, + 
ARRAY_SIZE(sizes), sizes, bufs, size); +} +EXPORT_SYMBOL(out_read_pack); + +static int tx_extend_args(struct thandle_exec_args *ta, int new_alloc_ta) +{ + struct tx_arg **new_ta; + int i; + int rc = 0; + + if (ta->ta_alloc_args >= new_alloc_ta) + return 0; + + OBD_ALLOC_PTR_ARRAY(new_ta, new_alloc_ta); + if (new_ta == NULL) + return -ENOMEM; + + for (i = 0; i < new_alloc_ta; i++) { + if (i < ta->ta_alloc_args) { + /* copy the old args to new one */ + new_ta[i] = ta->ta_args[i]; + } else { + OBD_ALLOC_PTR(new_ta[i]); + if (new_ta[i] == NULL) + GOTO(out, rc = -ENOMEM); + } + } + + /* free the old args */ + if (ta->ta_args != NULL) + OBD_FREE_PTR_ARRAY(ta->ta_args, ta->ta_alloc_args); + + ta->ta_args = new_ta; + ta->ta_alloc_args = new_alloc_ta; +out: + if (rc != 0) { + for (i = 0; i < new_alloc_ta; i++) { + if (new_ta[i] != NULL) + OBD_FREE_PTR(new_ta[i]); + } + OBD_FREE_PTR_ARRAY(new_ta, new_alloc_ta); + } + return rc; +} + +#define TX_ALLOC_STEP 8 +struct tx_arg *tx_add_exec(struct thandle_exec_args *ta, + tx_exec_func_t func, tx_exec_func_t undo, + const char *file, int line) +{ + int rc; + int i; + + LASSERT(ta != NULL); + LASSERT(func != NULL); + + if (ta->ta_argno + 1 >= ta->ta_alloc_args) { + rc = tx_extend_args(ta, ta->ta_alloc_args + TX_ALLOC_STEP); + if (rc != 0) + return ERR_PTR(rc); + } + + i = ta->ta_argno; + + ta->ta_argno++; + + ta->ta_args[i]->exec_fn = func; + ta->ta_args[i]->undo_fn = undo; + ta->ta_args[i]->file = file; + ta->ta_args[i]->line = line; + + return ta->ta_args[i]; +} + +static int out_obj_destroy(const struct lu_env *env, struct dt_object *dt_obj, + struct thandle *th) +{ + int rc; + + CDEBUG(D_INFO, "%s: destroy "DFID"\n", dt_obd_name(th->th_dev), + PFID(lu_object_fid(&dt_obj->do_lu))); + + dt_write_lock(env, dt_obj, DT_TGT_CHILD); + rc = dt_destroy(env, dt_obj, th); + dt_write_unlock(env, dt_obj); + + return rc; +} + +/** + * All of the xxx_undo will be used once execution failed, + * But because all of the required resource has been reserved in + * declare phase, i.e. if declare succeed, it should make sure + * the following executing phase succeed in anyway, so these undo + * should be useless for most of the time in Phase I + */ +static int out_tx_create_undo(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + int rc; + + rc = out_obj_destroy(env, arg->object, th); + if (rc != 0) + CERROR("%s: undo failure, we are doomed!: rc = %d\n", + dt_obd_name(th->th_dev), rc); + return rc; +} + +int out_tx_create_exec(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + struct dt_object *dt_obj = arg->object; + int rc; + + CDEBUG(D_OTHER, "%s: create "DFID": dof %u, mode %o\n", + dt_obd_name(th->th_dev), + PFID(lu_object_fid(&arg->object->do_lu)), + arg->u.create.dof.dof_type, + arg->u.create.attr.la_mode & S_IFMT); + + dt_write_lock(env, dt_obj, DT_TGT_CHILD); + rc = dt_create(env, dt_obj, &arg->u.create.attr, + &arg->u.create.hint, &arg->u.create.dof, th); + + dt_write_unlock(env, dt_obj); + + CDEBUG(D_INFO, "%s: insert create reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, rc); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc); + + return rc; +} + +/** + * Add create update to thandle + * + * Declare create updates and add the update to the thandle updates + * exec array. 
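+ *
+ * This only declares the create (dt_declare_create()) and queues a tx_arg;
+ * the actual dt_create() runs later from out_tx_create_exec() once the
+ * transaction has been started. A hypothetical call sketch:
+ *
+ *   rc = out_create_add_exec(env, obj, &attr, NULL, &dof, ta, th,
+ *                            reply, index, __FILE__, __LINE__);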
+ * + * \param [in] env execution environment + * \param [in] obj object to be created + * \param [in] attr attributes of the creation + * \param [in] parent_fid the fid of the parent + * \param [in] dof dt object format of the creation + * \param [in] ta thandle execuation args where all of updates + * of the transaction are stored + * \param [in] th thandle for this update + * \param [in] reply reply of the updates + * \param [in] index index of the reply + * \param [in] file the file name where the function is called, + * which is only for debugging purpose. + * \param [in] line the line number where the funtion is called, + * which is only for debugging purpose. + * + * \retval 0 if updates is added successfully. + * \retval negative errno if update adding fails. + */ +int out_create_add_exec(const struct lu_env *env, struct dt_object *obj, + struct lu_attr *attr, struct lu_fid *parent_fid, + struct dt_object_format *dof, + struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + /* LU-13653: ignore quota for DNE directory creation */ + if (dof->dof_type == DFT_DIR) + th->th_ignore_quota = 1; + + rc = dt_declare_create(env, obj, attr, NULL, dof, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_create_exec, out_tx_create_undo, file, + line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + /* release the object in out_trans_stop */ + lu_object_get(&obj->do_lu); + arg->object = obj; + arg->u.create.attr = *attr; + if (parent_fid != NULL) + arg->u.create.fid = *parent_fid; + memset(&arg->u.create.hint, 0, sizeof(arg->u.create.hint)); + arg->u.create.dof = *dof; + arg->reply = reply; + arg->index = index; + + return 0; +} + +static int out_tx_attr_set_undo(const struct lu_env *env, + struct thandle *th, struct tx_arg *arg) +{ + CERROR("%s: attr set undo "DFID" unimplemented yet!: rc = %d\n", + dt_obd_name(th->th_dev), + PFID(lu_object_fid(&arg->object->do_lu)), -ENOTSUPP); + + return -ENOTSUPP; +} + +static int out_tx_attr_set_exec(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + struct dt_object *dt_obj = arg->object; + int rc; + + CDEBUG(D_OTHER, "%s: attr set "DFID"\n", dt_obd_name(th->th_dev), + PFID(lu_object_fid(&dt_obj->do_lu))); + + dt_write_lock(env, dt_obj, DT_TGT_CHILD); + rc = dt_attr_set(env, dt_obj, &arg->u.attr_set.attr, th); + dt_write_unlock(env, dt_obj); + + CDEBUG(D_INFO, "%s: insert attr_set reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, rc); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, + arg->index, rc); + + return rc; +} + +int out_attr_set_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + const struct lu_attr *attr, + struct thandle_exec_args *ta, + struct thandle *th, struct object_update_reply *reply, + int index, const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + rc = dt_declare_attr_set(env, dt_obj, attr, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_attr_set_exec, out_tx_attr_set_undo, + file, line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->u.attr_set.attr = *attr; + arg->reply = reply; + arg->index = index; + return 0; +} + +static int out_tx_write_exec(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + struct dt_object *dt_obj = arg->object; + int rc; + + CDEBUG(D_INFO, "write "DFID" pos %llu buf %p, len 
%lu\n", + PFID(lu_object_fid(&dt_obj->do_lu)), arg->u.write.pos, + arg->u.write.buf.lb_buf, (unsigned long)arg->u.write.buf.lb_len); + + if (OBD_FAIL_CHECK(OBD_FAIL_OUT_ENOSPC)) { + rc = -ENOSPC; + } else { + dt_write_lock(env, dt_obj, DT_TGT_CHILD); + rc = dt_record_write(env, dt_obj, &arg->u.write.buf, + &arg->u.write.pos, th); + dt_write_unlock(env, dt_obj); + + if (rc == 0) + rc = arg->u.write.buf.lb_len; + } + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc < 0 ? rc : 0); + + return rc > 0 ? 0 : rc; +} + +int out_write_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + const struct lu_buf *buf, loff_t pos, + struct thandle_exec_args *ta, struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + rc = dt_declare_record_write(env, dt_obj, buf, pos, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_write_exec, NULL, file, line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->u.write.buf = *buf; + arg->u.write.pos = pos; + arg->reply = reply; + arg->index = index; + return 0; +} + +static int out_tx_xattr_set_exec(const struct lu_env *env, + struct thandle *th, + struct tx_arg *arg) +{ + struct dt_object *dt_obj = arg->object; + int rc; + ENTRY; + + CDEBUG(D_INFO, "%s: set xattr buf %p name %s flag %d\n", + dt_obd_name(th->th_dev), arg->u.xattr_set.buf.lb_buf, + arg->u.xattr_set.name, arg->u.xattr_set.flags); + + if (!lu_object_exists(&dt_obj->do_lu) || + OBD_FAIL_PRECHECK(OBD_FAIL_OUT_OBJECT_MISS)) { + rc = -ENOENT; + } else { + struct linkea_data ldata = { 0 }; + bool linkea; + + ldata.ld_buf = &arg->u.xattr_set.buf; + if (strcmp(arg->u.xattr_set.name, XATTR_NAME_LINK) == 0) { + struct link_ea_header *leh; + + linkea = true; + rc = linkea_init(&ldata); + if (unlikely(rc)) + GOTO(out, rc == -ENODATA ? -EINVAL : rc); + + leh = ldata.ld_leh; + LASSERT(leh != NULL); + + /* If the new linkEA contains overflow timestamp, + * then two cases: + * + * 1. The old linkEA for the object has already + * overflowed before current setting, the new + * linkEA does not contains new link entry. So + * the linkEA overflow timestamp is unchanged. + * + * 2. There are new link entry in the new linkEA, + * so its overflow timestamp is differnt from + * the old one. Usually, the overstamp in the + * given linkEA is newer. But because of clock + * drift among MDTs, the timestamp may become + * older. So here, we convert the timestamp to + * the server local time. Then namespace LFSCK + * that uses local time can handle it easily. */ + if (unlikely(leh->leh_overflow_time)) { + struct lu_buf tbuf = { 0 }; + bool update = false; + + lu_buf_alloc(&tbuf, MAX_LINKEA_SIZE); + if (tbuf.lb_buf == NULL) + GOTO(unlock, rc = -ENOMEM); + + rc = dt_xattr_get(env, dt_obj, &tbuf, + XATTR_NAME_LINK); + if (rc > 0) { + struct linkea_data tdata = { 0 }; + + tdata.ld_buf = &tbuf; + rc = linkea_init(&tdata); + if (rc || leh->leh_overflow_time != + tdata.ld_leh->leh_overflow_time) + update = true; + } else { + /* Update the timestamp by force if + * fail to load the old linkEA. 
*/ + update = true; + } + + lu_buf_free(&tbuf); + if (update) { + leh->leh_overflow_time = ktime_get_real_seconds(); + if (unlikely(!leh->leh_overflow_time)) + leh->leh_overflow_time++; + } + } + } else { + linkea = false; + } + + dt_write_lock(env, dt_obj, DT_TGT_CHILD); + +again: + rc = dt_xattr_set(env, dt_obj, ldata.ld_buf, + arg->u.xattr_set.name, arg->u.xattr_set.flags, + th); + if (unlikely(rc == -ENOSPC && linkea)) { + rc = linkea_overflow_shrink(&ldata); + if (likely(rc > 0)) { + arg->u.xattr_set.buf.lb_len = rc; + goto again; + } + } + +unlock: + dt_write_unlock(env, dt_obj); + } + + GOTO(out, rc); + +out: + CDEBUG(D_INFO, "%s: insert xattr set reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, rc); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc); + + return rc; +} + +int out_xattr_set_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + const struct lu_buf *buf, const char *name, + int flags, struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + rc = dt_declare_xattr_set(env, dt_obj, buf, name, flags, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_xattr_set_exec, NULL, file, line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->u.xattr_set.name = name; + arg->u.xattr_set.flags = flags; + arg->u.xattr_set.buf = *buf; + arg->reply = reply; + arg->index = index; + arg->u.xattr_set.csum = 0; + return 0; +} + +static int out_tx_xattr_del_exec(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + struct dt_object *dt_obj = arg->object; + int rc; + + CDEBUG(D_INFO, "%s: del xattr name '%s' on "DFID"\n", + dt_obd_name(th->th_dev), arg->u.xattr_set.name, + PFID(lu_object_fid(&dt_obj->do_lu))); + + if (!lu_object_exists(&dt_obj->do_lu)) + GOTO(out, rc = -ENOENT); + + dt_write_lock(env, dt_obj, DT_TGT_CHILD); + rc = dt_xattr_del(env, dt_obj, arg->u.xattr_set.name, + th); + dt_write_unlock(env, dt_obj); +out: + CDEBUG(D_INFO, "%s: insert xattr del reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, rc); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc); + + return rc; +} + +int out_xattr_del_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + const char *name, struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + rc = dt_declare_xattr_del(env, dt_obj, name, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_xattr_del_exec, NULL, file, line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->u.xattr_set.name = name; + arg->reply = reply; + arg->index = index; + return 0; +} + +static int out_obj_ref_add(const struct lu_env *env, + struct dt_object *dt_obj, + struct thandle *th) +{ + int rc; + + dt_write_lock(env, dt_obj, DT_TGT_CHILD); + rc = dt_ref_add(env, dt_obj, th); + dt_write_unlock(env, dt_obj); + + return rc; +} + +static int out_obj_ref_del(const struct lu_env *env, + struct dt_object *dt_obj, + struct thandle *th) +{ + int rc; + + dt_write_lock(env, dt_obj, DT_TGT_CHILD); + rc = dt_ref_del(env, dt_obj, th); + dt_write_unlock(env, dt_obj); + + return rc; +} + +static int out_tx_ref_add_exec(const struct lu_env 
*env, struct thandle *th, + struct tx_arg *arg) +{ + struct dt_object *dt_obj = arg->object; + int rc; + + rc = out_obj_ref_add(env, dt_obj, th); + + CDEBUG(D_INFO, "%s: insert ref_add reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, rc); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc); + return rc; +} + +static int out_tx_ref_add_undo(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + return out_obj_ref_del(env, arg->object, th); +} + +int out_ref_add_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + rc = dt_declare_ref_add(env, dt_obj, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_ref_add_exec, out_tx_ref_add_undo, file, + line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->reply = reply; + arg->index = index; + return 0; +} + +static int out_tx_ref_del_exec(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + struct dt_object *dt_obj = arg->object; + int rc; + + rc = out_obj_ref_del(env, dt_obj, th); + + CDEBUG(D_INFO, "%s: insert ref_del reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, 0); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc); + + return rc; +} + +static int out_tx_ref_del_undo(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + return out_obj_ref_add(env, arg->object, th); +} + +int out_ref_del_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + rc = dt_declare_ref_del(env, dt_obj, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_ref_del_exec, out_tx_ref_del_undo, file, + line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->reply = reply; + arg->index = index; + return 0; +} + +static int out_obj_index_insert(const struct lu_env *env, + struct dt_object *dt_obj, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle *th) +{ + int rc; + + CDEBUG(D_INFO, "%s: index insert "DFID" name: %s fid "DFID", type %u\n", + dt_obd_name(th->th_dev), PFID(lu_object_fid(&dt_obj->do_lu)), + (char *)key, PFID(((struct dt_insert_rec *)rec)->rec_fid), + ((struct dt_insert_rec *)rec)->rec_type); + + if (dt_try_as_dir(env, dt_obj) == 0) + return -ENOTDIR; + + dt_write_lock(env, dt_obj, DT_TGT_CHILD); + rc = dt_insert(env, dt_obj, rec, key, th); + dt_write_unlock(env, dt_obj); + + return rc; +} + +static int out_obj_index_delete(const struct lu_env *env, + struct dt_object *dt_obj, + const struct dt_key *key, + struct thandle *th) +{ + int rc; + + CDEBUG(D_INFO, "%s: index delete "DFID" name: %s\n", + dt_obd_name(th->th_dev), PFID(lu_object_fid(&dt_obj->do_lu)), + (char *)key); + + if (dt_try_as_dir(env, dt_obj) == 0) + return -ENOTDIR; + + dt_write_lock(env, dt_obj, DT_TGT_CHILD); + rc = dt_delete(env, dt_obj, key, th); + dt_write_unlock(env, dt_obj); + + return rc; +} + +static int out_tx_index_insert_exec(const struct lu_env *env, + struct thandle *th, struct tx_arg *arg) +{ + struct dt_object *dt_obj = arg->object; + int rc; + + if 
(unlikely(!dt_object_exists(dt_obj))) + RETURN(-ESTALE); + + rc = out_obj_index_insert(env, dt_obj, + (const struct dt_rec *)&arg->u.insert.rec, + arg->u.insert.key, th); + + CDEBUG(D_INFO, "%s: insert idx insert reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, rc); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc); + return rc; +} + +static int out_tx_index_insert_undo(const struct lu_env *env, + struct thandle *th, struct tx_arg *arg) +{ + return out_obj_index_delete(env, arg->object, arg->u.insert.key, th); +} + +int out_index_insert_add_exec(const struct lu_env *env, + struct dt_object *dt_obj, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + if (dt_try_as_dir(env, dt_obj) == 0) { + rc = -ENOTDIR; + return rc; + } + + rc = dt_declare_insert(env, dt_obj, rec, key, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_index_insert_exec, + out_tx_index_insert_undo, file, line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->reply = reply; + arg->index = index; + arg->u.insert.rec = *(const struct dt_insert_rec *)rec; + arg->u.insert.key = key; + + return 0; +} + +static int out_tx_index_delete_exec(const struct lu_env *env, + struct thandle *th, + struct tx_arg *arg) +{ + int rc; + + rc = out_obj_index_delete(env, arg->object, arg->u.insert.key, th); + + CDEBUG(D_INFO, "%s: delete idx insert reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, rc); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc); + + return rc; +} + +static int out_tx_index_delete_undo(const struct lu_env *env, + struct thandle *th, + struct tx_arg *arg) +{ + CERROR("%s: Oops, can not rollback index_delete yet: rc = %d\n", + dt_obd_name(th->th_dev), -ENOTSUPP); + return -ENOTSUPP; +} + +int out_index_delete_add_exec(const struct lu_env *env, + struct dt_object *dt_obj, + const struct dt_key *key, + struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + if (dt_try_as_dir(env, dt_obj) == 0) { + rc = -ENOTDIR; + return rc; + } + + LASSERT(ta->ta_handle != NULL); + rc = dt_declare_delete(env, dt_obj, key, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_index_delete_exec, + out_tx_index_delete_undo, file, line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->reply = reply; + arg->index = index; + arg->u.insert.key = key; + return 0; +} + +static int out_tx_destroy_exec(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + struct dt_object *dt_obj = arg->object; + int rc; + + rc = out_obj_destroy(env, dt_obj, th); + + CDEBUG(D_INFO, "%s: insert destroy reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, rc); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc); + + RETURN(rc); +} + +static int out_tx_destroy_undo(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + CERROR("%s: not support destroy undo yet!: rc = %d\n", + dt_obd_name(th->th_dev), -ENOTSUPP); + return -ENOTSUPP; +} + +int out_destroy_add_exec(const struct lu_env 
*env, struct dt_object *dt_obj, + struct thandle_exec_args *ta, struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + rc = dt_declare_destroy(env, dt_obj, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_destroy_exec, out_tx_destroy_undo, + file, line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->reply = reply; + arg->index = index; + return 0; +} diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_fmd.c b/drivers/staging/lustrefsx/lustre/target/tgt_fmd.c new file mode 100644 index 0000000000000..afbf668e38a70 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/tgt_fmd.c @@ -0,0 +1,363 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + * + * Copyright (c) 2019, DDN Storage Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/target/tgt_fmd.c + * + * This file provides functions to handle Filter Modification Data (FMD). + * The FMD is responsible for file attributes to be applied in + * Transaction ID (XID) order, so older requests can't re-write newer + * attributes. + * + * FMD is organized as per-client list and identified by FID of object. Each + * FMD stores FID of object and the highest received XID of modification + * request for this object. + * + * FMD can expire if there are no updates for a long time to keep the list + * reasonably small. + * + * Author: Andreas Dilger + * Author: Mike Pershin + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include + +#include "tgt_internal.h" + +/** + * Drop FMD reference and free it if reference drops to zero. + * + * Must be called with ted_fmd_lock held. + * + * \param[in] exp OBD export + * \param[in] fmd FMD to put + */ +static inline void tgt_fmd_put_nolock(struct obd_export *exp, + struct tgt_fmd_data *fmd) +{ + struct tg_export_data *ted = &exp->exp_target_data; + + assert_spin_locked(&ted->ted_fmd_lock); + if (--fmd->fmd_refcount == 0) { + ted->ted_fmd_count--; + list_del(&fmd->fmd_list); + OBD_SLAB_FREE_PTR(fmd, tgt_fmd_kmem); + } +} + +/** + * Wrapper to drop FMD reference with ted_fmd_lock held. + * + * \param[in] exp OBD export + * \param[in] fmd FMD to put + */ +void tgt_fmd_put(struct obd_export *exp, struct tgt_fmd_data *fmd) +{ + struct tg_export_data *ted = &exp->exp_target_data; + + spin_lock(&ted->ted_fmd_lock); + tgt_fmd_put_nolock(exp, fmd); /* caller reference */ + spin_unlock(&ted->ted_fmd_lock); +} + +/** + * Expire FMD entries. 
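+ *
+ * For example (illustrative value): with lut_fmd_max_num set to 128,
+ * adding a new entry to a full list expires the oldest entry right away,
+ * even if its fmd_expire time has not been reached yet.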
+ *
+ * Expire entries from the FMD list if there are too many
+ * of them or they are too old.
+ *
+ * This function must be called with ted_fmd_lock held.
+ *
+ * The \a keep FMD is not to be expired in any case. This parameter is used
+ * by tgt_fmd_find_nolock() to prohibit a FMD that was just found from
+ * expiring.
+ *
+ * \param[in] exp OBD export
+ * \param[in] keep FMD to keep always
+ */
+static void tgt_fmd_expire_nolock(struct obd_export *exp,
+ struct tgt_fmd_data *keep)
+{
+ struct tg_export_data *ted = &exp->exp_target_data;
+ struct lu_target *lut = exp->exp_obd->u.obt.obt_lut;
+ time64_t now = ktime_get_seconds();
+ struct tgt_fmd_data *fmd, *tmp;
+
+ list_for_each_entry_safe(fmd, tmp, &ted->ted_fmd_list, fmd_list) {
+ if (fmd == keep)
+ break;
+
+ if (now < fmd->fmd_expire &&
+ ted->ted_fmd_count < lut->lut_fmd_max_num)
+ break;
+
+ list_del_init(&fmd->fmd_list);
+ tgt_fmd_put_nolock(exp, fmd); /* list reference */
+ }
+}
+
+/**
+ * Expire FMD entries.
+ *
+ * This is a wrapper to call tgt_fmd_expire_nolock() with the required lock.
+ *
+ * \param[in] exp OBD export
+ */
+void tgt_fmd_expire(struct obd_export *exp)
+{
+ struct tg_export_data *ted = &exp->exp_target_data;
+
+ spin_lock(&ted->ted_fmd_lock);
+ tgt_fmd_expire_nolock(exp, NULL);
+ spin_unlock(&ted->ted_fmd_lock);
+}
+
+/**
+ * Find FMD by specified FID.
+ *
+ * Function finds FMD entry by FID in the tg_export_data::ted_fmd_list.
+ *
+ * Caller must hold tg_export_data::ted_fmd_lock and take FMD reference.
+ *
+ * \param[in] exp OBD export
+ * \param[in] fid FID of FMD to find
+ *
+ * \retval struct tgt_fmd_data found by FID
+ * \retval NULL if FMD is not found
+ */
+static struct tgt_fmd_data *tgt_fmd_find_nolock(struct obd_export *exp,
+ const struct lu_fid *fid)
+{
+ struct tg_export_data *ted = &exp->exp_target_data;
+ struct tgt_fmd_data *found = NULL, *fmd;
+ struct lu_target *lut = exp->exp_obd->u.obt.obt_lut;
+ time64_t now = ktime_get_seconds();
+
+ assert_spin_locked(&ted->ted_fmd_lock);
+
+ list_for_each_entry_reverse(fmd, &ted->ted_fmd_list, fmd_list) {
+ if (lu_fid_eq(&fmd->fmd_fid, fid)) {
+ found = fmd;
+ list_move_tail(&fmd->fmd_list, &ted->ted_fmd_list);
+ fmd->fmd_expire = now + lut->lut_fmd_max_age;
+ break;
+ }
+ }
+
+ tgt_fmd_expire_nolock(exp, found);
+
+ return found;
+}
+
+/**
+ * Find FMD by specified FID with locking.
+ *
+ * Wrapper around tgt_fmd_find_nolock() taking the correct locks.
+ *
+ * \param[in] exp OBD export
+ * \param[in] fid FID of FMD to find
+ *
+ * \retval struct tgt_fmd_data found by FID
+ * \retval NULL indicates FMD is not found
+ */
+struct tgt_fmd_data *tgt_fmd_find(struct obd_export *exp,
+ const struct lu_fid *fid)
+{
+ struct tg_export_data *ted = &exp->exp_target_data;
+ struct tgt_fmd_data *fmd;
+
+ spin_lock(&ted->ted_fmd_lock);
+ fmd = tgt_fmd_find_nolock(exp, fid);
+ if (fmd)
+ fmd->fmd_refcount++; /* caller reference */
+ spin_unlock(&ted->ted_fmd_lock);
+
+ return fmd;
+}
+
+/**
+ * Find FMD by FID or create a new one if none is found.
+ *
+ * It is possible for this function to return NULL under memory pressure,
+ * or if the passed FID is zero (which will only cause old entries to expire).
+ * Currently this is not fatal because any FMD state is transient and
+ * may also be freed when it gets sufficiently old.
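+ *
+ * A minimal usage sketch (hypothetical caller): the returned FMD carries
+ * a caller reference that must be dropped with tgt_fmd_put():
+ *
+ *   fmd = tgt_fmd_get(exp, fid);
+ *   if (fmd != NULL) {
+ *           ... use fmd ...
+ *           tgt_fmd_put(exp, fmd);
+ *   }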
+ *
+ * \param[in] exp OBD export
+ * \param[in] fid FID of FMD to find
+ *
+ * \retval struct tgt_fmd_data found by FID
+ * \retval NULL indicates FMD is not found
+ */
+struct tgt_fmd_data *tgt_fmd_get(struct obd_export *exp,
+ const struct lu_fid *fid)
+{
+ struct tg_export_data *ted = &exp->exp_target_data;
+ struct tgt_fmd_data *found = NULL, *fmd_new = NULL;
+
+ OBD_SLAB_ALLOC_PTR(fmd_new, tgt_fmd_kmem);
+
+ spin_lock(&ted->ted_fmd_lock);
+ found = tgt_fmd_find_nolock(exp, fid);
+ if (fmd_new) {
+ if (!found) {
+ list_add_tail(&fmd_new->fmd_list, &ted->ted_fmd_list);
+ fmd_new->fmd_fid = *fid;
+ fmd_new->fmd_refcount++; /* list reference */
+ found = fmd_new;
+ ted->ted_fmd_count++;
+ } else {
+ OBD_SLAB_FREE_PTR(fmd_new, tgt_fmd_kmem);
+ }
+ }
+ if (found) {
+ found->fmd_refcount++; /* caller reference */
+ found->fmd_expire = ktime_get_seconds() +
+ class_exp2tgt(exp)->lut_fmd_max_age;
+ } else {
+ LCONSOLE_WARN("%s: cannot allocate FMD for "DFID
+ ", timestamps may be out of sync\n",
+ exp->exp_obd->obd_name, PFID(fid));
+ }
+ spin_unlock(&ted->ted_fmd_lock);
+
+ return found;
+}
+
+#ifdef DO_FMD_DROP
+/**
+ * Drop FMD list reference so it will disappear when last reference is dropped
+ * to zero.
+ *
+ * This function is called from ofd_destroy() and may only affect
+ * the one client that is doing the unlink, and at worst we have a stale entry
+ * referencing an object that should never be used again.
+ *
+ * NB: this function is used only if DO_FMD_DROP is defined. It is not
+ * currently defined, so FMD drop doesn't happen and FMDs are dropped only
+ * when expired.
+ *
+ * \param[in] exp OBD export
+ * \param[in] fid FID of FMD to drop
+ */
+void tgt_fmd_drop(struct obd_export *exp, const struct lu_fid *fid)
+{
+ struct tg_export_data *ted = &exp->exp_target_data;
+ struct tgt_fmd_data *fmd = NULL;
+
+ spin_lock(&ted->ted_fmd_lock);
+ fmd = tgt_fmd_find_nolock(exp, fid);
+ if (fmd) {
+ list_del_init(&fmd->fmd_list);
+ tgt_fmd_put_nolock(exp, fmd);
+ }
+ spin_unlock(&ted->ted_fmd_lock);
+}
+EXPORT_SYMBOL(tgt_fmd_drop);
+#endif
+
+/**
+ * Remove all entries from FMD list.
+ *
+ * Cleanup function to free all FMD entries on the given export.
+ *
+ * \param[in] exp OBD export
+ */
+void tgt_fmd_cleanup(struct obd_export *exp)
+{
+ struct tg_export_data *ted = &exp->exp_target_data;
+ struct tgt_fmd_data *fmd = NULL, *tmp;
+
+ spin_lock(&ted->ted_fmd_lock);
+ list_for_each_entry_safe(fmd, tmp, &ted->ted_fmd_list, fmd_list) {
+ list_del_init(&fmd->fmd_list);
+ if (fmd->fmd_refcount > 1) {
+ CDEBUG(D_INFO,
+ "fmd %p still referenced (refcount = %d)\n",
+ fmd, fmd->fmd_refcount);
+ }
+ tgt_fmd_put_nolock(exp, fmd);
+ }
+ spin_unlock(&ted->ted_fmd_lock);
+ LASSERT(list_empty(&exp->exp_target_data.ted_fmd_list));
+}
+
+/**
+ * Update FMD with the latest request XID.
+ *
+ * Save a new setattr/punch XID in the FMD if one exists.
+ *
+ * \param[in] exp OBD export
+ * \param[in] fid FID of FMD to find
+ * \param[in] xid request XID
+ */
+void tgt_fmd_update(struct obd_export *exp, const struct lu_fid *fid, __u64 xid)
+{
+ struct tgt_fmd_data *fmd;
+
+ fmd = tgt_fmd_get(exp, fid);
+ if (fmd) {
+ if (fmd->fmd_mactime_xid < xid)
+ fmd->fmd_mactime_xid = xid;
+ tgt_fmd_put(exp, fmd);
+ }
+}
+EXPORT_SYMBOL(tgt_fmd_update);
+
+/**
+ * Check that time can be updated by the request with given XID.
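+ *
+ * For example (sketch): after tgt_fmd_update(exp, fid, 42) has recorded
+ * XID 42 for an object, a resent request with XID 41 gets a false return
+ * from tgt_fmd_check(exp, fid, 41) and must not update the timestamps,
+ * while a newer request with XID 43 is allowed to.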
+ *
+ * Check that the FMD XID, if one exists, is less than the supplied XID
+ *
+ * \param[in] exp OBD export
+ * \param[in] fid FID of FMD to find
+ * \param[in] xid request XID
+ *
+ * \retval true if FMD has no greater XID, so time attr can be updated
+ */
+bool tgt_fmd_check(struct obd_export *exp, const struct lu_fid *fid, __u64 xid)
+{
+ struct tgt_fmd_data *fmd;
+ bool can_update = true;
+
+ fmd = tgt_fmd_find(exp, fid);
+ if (fmd) {
+ can_update = fmd->fmd_mactime_xid < xid;
+ tgt_fmd_put(exp, fmd);
+ }
+
+ return can_update;
+}
+EXPORT_SYMBOL(tgt_fmd_check);
+
diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_grant.c b/drivers/staging/lustrefsx/lustre/target/tgt_grant.c
new file mode 100644
index 0000000000000..ac1757ed38905
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/target/tgt_grant.c
@@ -0,0 +1,1704 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2017, Intel Corporation.
+ */
+/*
+ * lustre/target/tgt_grant.c
+ *
+ * This file provides code related to grant space management on Lustre Targets
+ * (OSTs and MDTs). Grant is a mechanism used by client nodes to reserve disk
+ * space on a target for the data writeback cache. The Lustre client is thus
+ * assured that enough space will be available when flushing dirty pages
+ * asynchronously. Each client node is granted an initial amount of reserved
+ * space at connect time and gets additional space back from target in bulk
+ * write reply.
+ *
+ * We actually support three different cases:
+ * - The client supports the new grant parameters (i.e. OBD_CONNECT_GRANT_PARAM)
+ * which means that all grant overhead calculation happens on the client side.
+ * The server reports at connect time the backend filesystem block size, the
+ * maximum extent size as well as the extent insertion cost and it is then up
+ * to the osc layer to track dirty extents and consume grant accordingly
+ * (see osc_cache.c). In each bulk write request, the client provides how much
+ * grant space was consumed for this RPC.
+ * - The client does not support OBD_CONNECT_GRANT_PARAM and always assumes
+ * a backend file system block size of 4KB. We then have two cases:
+ * - If the block size is really 4KB, then the client can deal with grant
+ * allocation for partial block writes, but won't take extent insertion cost
+ * into account. For such clients, we inflate grant by 100% on the server
+ * side. It means that when 32MB of grant is held by the client, 64MB of
+ * grant space is actually reserved on the server. All grant counters
+ * provided by such a client are inflated by 100%.
+ * - The backend filesystem block size is bigger than 4KB, which isn't
+ * supported by the client. In this case, we emulate a 4KB block size and
+ * consume one block size on the server for each 4KB of grant returned to
+ * the client. With a 128KB block size, it means that 32MB of dirty 4KB pages
+ * on the client will actually consume 1GB of grant on the server.
+ * All grant counters provided by such a client are inflated by the block
+ * size ratio.
+ *
+ * This file handles the core logic for:
+ * - grant allocation strategy
+ * - maintaining per-client as well as global grant space accounting
+ * - processing grant information packed in incoming requests
+ * - allocating server-side grant space for synchronous write RPCs which did not
+ * consume grant on the client side (OBD_BRW_FROM_GRANT flag not set). If not
+ * enough space is available, such RPCs fail with ENOSPC
+ *
+ * Author: Johann Lombardi
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include
+#include
+
+#include "tgt_internal.h"
+
+int lbug_on_grant_miscount;
+module_param(lbug_on_grant_miscount, int, 0644);
+MODULE_PARM_DESC(lbug_on_grant_miscount, "LBUG on grant miscount");
+
+/* Clients typically hold 2x their max_rpcs_in_flight of grant space */
+#define TGT_GRANT_SHRINK_LIMIT(exp) (2ULL * 8 * exp_max_brw_size(exp))
+
+/* Helpers to inflate/deflate grants for clients that do not support the grant
+ * parameters */
+static inline u64 tgt_grant_inflate(struct tg_grants_data *tgd, u64 val)
+{
+ if (tgd->tgd_blockbits > COMPAT_BSIZE_SHIFT)
+ /* Client does not support such a large block size, grant
+ * is thus inflated. We already significantly overestimate
+ * overhead, no need to add the extent tax in this case */
+ return val << (tgd->tgd_blockbits - COMPAT_BSIZE_SHIFT);
+ return val;
+}
+
+/* Companion of tgt_grant_inflate() */
+static inline u64 tgt_grant_deflate(struct tg_grants_data *tgd, u64 val)
+{
+ if (tgd->tgd_blockbits > COMPAT_BSIZE_SHIFT)
+ return val >> (tgd->tgd_blockbits - COMPAT_BSIZE_SHIFT);
+ return val;
+}
+
+/* Grant chunk is used as a unit for grant allocation. It should be inflated
+ * if the client does not support the grant parameters.
+ * Check connection flag against \a data if not NULL. 
This is used during + * connection creation where exp->exp_connect_data isn't populated yet */ +static inline u64 tgt_grant_chunk(struct obd_export *exp, + struct lu_target *lut, + struct obd_connect_data *data) +{ + struct tg_grants_data *tgd = &lut->lut_tgd; + u64 chunk = exp_max_brw_size(exp); + u64 tax; + + if (exp->exp_obd->obd_self_export == exp) + /* Grant enough space to handle a big precreate request */ + return OST_MAX_PRECREATE * lut->lut_dt_conf.ddp_inodespace / 2; + + if ((data == NULL && !(exp_grant_param_supp(exp))) || + (data != NULL && !OCD_HAS_FLAG(data, GRANT_PARAM))) + /* Try to grant enough space to send 2 full-size RPCs */ + return tgt_grant_inflate(tgd, chunk) << 1; + + /* Try to return enough to send two full-size RPCs + * = 2 * (BRW_size + #extents_in_BRW * grant_tax) */ + tax = 1ULL << tgd->tgd_blockbits; /* block size */ + tax *= lut->lut_dt_conf.ddp_max_extent_blks; /* max extent size */ + tax = (chunk + tax - 1) / tax; /* #extents in a RPC */ + tax *= lut->lut_dt_conf.ddp_extent_tax; /* extent tax for a RPC */ + chunk = (chunk + tax) * 2; /* we said two full RPCs */ + return chunk; +} + +static int tgt_check_export_grants(struct obd_export *exp, u64 *dirty, + u64 *pending, u64 *granted, u64 maxsize) +{ + struct tg_export_data *ted = &exp->exp_target_data; + int level = D_CACHE; + + if (ted->ted_grant < 0 || ted->ted_pending < 0 || ted->ted_dirty < 0) + level = D_ERROR; + CDEBUG_LIMIT(level, "%s: cli %s/%p dirty %ld pend %ld grant %ld\n", + exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp, + ted->ted_dirty, ted->ted_pending, ted->ted_grant); + + if (ted->ted_grant + ted->ted_pending > maxsize) { + CERROR("%s: cli %s/%p ted_grant(%ld) + ted_pending(%ld)" + " > maxsize(%llu)\n", exp->exp_obd->obd_name, + exp->exp_client_uuid.uuid, exp, ted->ted_grant, + ted->ted_pending, maxsize); + return -EFAULT; + } + if (ted->ted_dirty > maxsize) { + CERROR("%s: cli %s/%p ted_dirty(%ld) > maxsize(%llu)\n", + exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, + exp, ted->ted_dirty, maxsize); + return -EFAULT; + } + *granted += ted->ted_grant + ted->ted_pending; + *pending += ted->ted_pending; + *dirty += ted->ted_dirty; + return 0; +} + +/** + * Perform extra sanity checks for grant accounting. + * + * This function scans the export list, sanity checks per-export grant counters + * and verifies accuracy of global grant accounting. If an inconsistency is + * found, a CERROR is printed with the function name \func that was passed as + * argument. LBUG is only called in case of serious counter corruption (i.e. + * value larger than the device size). + * Those sanity checks can be pretty expensive and are disabled if the OBD + * device has more than 100 connected exports by default. + * + * \param[in] obd OBD device for which grant accounting should be + * verified + * \param[in] func caller's function name + */ +void tgt_grant_sanity_check(struct obd_device *obd, const char *func) +{ + struct lu_target *lut = obd->u.obt.obt_lut; + struct tg_grants_data *tgd = &lut->lut_tgd; + struct obd_export *exp; + struct tg_export_data *ted; + u64 maxsize; + u64 tot_dirty = 0; + u64 tot_pending = 0; + u64 tot_granted = 0; + u64 fo_tot_granted; + u64 fo_tot_pending; + u64 fo_tot_dirty; + int error; + + if (list_empty(&obd->obd_exports)) + return; + + /* + * We don't want to do this for large machines that do lots of + * mounts or unmounts. It burns... 
+ * Use set_param to change obd_grant_check_threshold, which + * is 100 by default, 0 to always check grants + */ + if (obd->obd_num_exports > obd->obd_grant_check_threshold && + obd->obd_grant_check_threshold) + return; + + maxsize = tgd->tgd_osfs.os_blocks << tgd->tgd_blockbits; + + spin_lock(&obd->obd_dev_lock); + spin_lock(&tgd->tgd_grant_lock); + exp = obd->obd_self_export; + ted = &exp->exp_target_data; + CDEBUG(D_CACHE, "%s: processing self export: %ld %ld " + "%ld\n", obd->obd_name, ted->ted_grant, + ted->ted_pending, ted->ted_dirty); + tot_granted += ted->ted_grant + ted->ted_pending; + tot_pending += ted->ted_pending; + tot_dirty += ted->ted_dirty; + + list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain) { + error = tgt_check_export_grants(exp, &tot_dirty, &tot_pending, + &tot_granted, maxsize); + if (error < 0) { + spin_unlock(&obd->obd_dev_lock); + spin_unlock(&tgd->tgd_grant_lock); + LBUG(); + } + } + + /* exports about to be unlinked should also be taken into account since + * they might still hold pending grant space to be released at + * commit time */ + list_for_each_entry(exp, &obd->obd_unlinked_exports, exp_obd_chain) { + error = tgt_check_export_grants(exp, &tot_dirty, &tot_pending, + &tot_granted, maxsize); + if (error < 0) { + spin_unlock(&obd->obd_dev_lock); + spin_unlock(&tgd->tgd_grant_lock); + LBUG(); + } + } + + fo_tot_granted = tgd->tgd_tot_granted; + fo_tot_pending = tgd->tgd_tot_pending; + fo_tot_dirty = tgd->tgd_tot_dirty; + spin_unlock(&obd->obd_dev_lock); + spin_unlock(&tgd->tgd_grant_lock); + + if (tot_granted != fo_tot_granted) + CERROR("%s: tot_granted %llu != fo_tot_granted %llu\n", + func, tot_granted, fo_tot_granted); + if (tot_pending != fo_tot_pending) + CERROR("%s: tot_pending %llu != fo_tot_pending %llu\n", + func, tot_pending, fo_tot_pending); + if (tot_dirty != fo_tot_dirty) + CERROR("%s: tot_dirty %llu != fo_tot_dirty %llu\n", + func, tot_dirty, fo_tot_dirty); + if (tot_pending > tot_granted) + CERROR("%s: tot_pending %llu > tot_granted %llu\n", + func, tot_pending, tot_granted); + if (tot_granted > maxsize) + CERROR("%s: tot_granted %llu > maxsize %llu\n", + func, tot_granted, maxsize); + if (tot_dirty > maxsize) + CERROR("%s: tot_dirty %llu > maxsize %llu\n", + func, tot_dirty, maxsize); +} +EXPORT_SYMBOL(tgt_grant_sanity_check); + +/** + * Get file system statistics of target. + * + * Helper function for statfs(), also used by grant code. + * Implements caching for statistics to avoid calling OSD device each time. + * + * \param[in] env execution environment + * \param[in] lut LU target + * \param[out] osfs statistic data to return + * \param[in] max_age maximum age for cached data + * \param[in] from_cache show that data was get from cache or not + * + * \retval 0 if successful + * \retval negative value on error + */ +int tgt_statfs_internal(const struct lu_env *env, struct lu_target *lut, + struct obd_statfs *osfs, time64_t max_age, int *from_cache) +{ + struct tg_grants_data *tgd = &lut->lut_tgd; + int rc = 0; + ENTRY; + + spin_lock(&tgd->tgd_osfs_lock); + if (tgd->tgd_osfs_age < max_age || max_age == 0) { + u64 unstable; + + /* statfs data are too old, get up-to-date one. + * we must be cautious here since multiple threads might be + * willing to update statfs data concurrently and we must + * grant that cached statfs data are always consistent */ + + if (tgd->tgd_statfs_inflight == 0) + /* clear inflight counter if no users, although it would + * take a while to overflow this 64-bit counter ... 
*/ + tgd->tgd_osfs_inflight = 0; + /* notify tgt_grant_commit() that we want to track writes + * completed as of now */ + tgd->tgd_statfs_inflight++; + /* record value of inflight counter before running statfs to + * compute the diff once statfs is completed */ + unstable = tgd->tgd_osfs_inflight; + spin_unlock(&tgd->tgd_osfs_lock); + + /* statfs can sleep ... hopefully not for too long since we can + * call it fairly often as space fills up */ + rc = dt_statfs(env, lut->lut_bottom, osfs); + if (unlikely(rc)) + GOTO(out, rc); + + osfs->os_namelen = min_t(__u32, osfs->os_namelen, NAME_MAX); + + spin_lock(&tgd->tgd_grant_lock); + spin_lock(&tgd->tgd_osfs_lock); + /* calculate how much space was written while we released the + * tgd_osfs_lock */ + unstable = tgd->tgd_osfs_inflight - unstable; + tgd->tgd_osfs_unstable = 0; + if (unstable) { + /* some writes committed while we were running statfs + * w/o the tgd_osfs_lock. Those ones got added to + * the cached statfs data that we are about to crunch. + * Take them into account in the new statfs data */ + osfs->os_bavail -= min_t(u64, osfs->os_bavail, + unstable >> tgd->tgd_blockbits); + /* However, we don't really know if those writes got + * accounted in the statfs call, so tell + * tgt_grant_space_left() there is some uncertainty + * on the accounting of those writes. + * The purpose is to prevent spurious error messages in + * tgt_grant_space_left() since those writes might be + * accounted twice. */ + tgd->tgd_osfs_unstable += unstable; + } + /* similarly, there is some uncertainty on write requests + * between prepare & commit */ + tgd->tgd_osfs_unstable += tgd->tgd_tot_pending; + spin_unlock(&tgd->tgd_grant_lock); + + /* finally udpate cached statfs data */ + tgd->tgd_osfs = *osfs; + tgd->tgd_osfs_age = ktime_get_seconds(); + + tgd->tgd_statfs_inflight--; /* stop tracking */ + if (tgd->tgd_statfs_inflight == 0) + tgd->tgd_osfs_inflight = 0; + spin_unlock(&tgd->tgd_osfs_lock); + + if (from_cache) + *from_cache = 0; + } else { + /* use cached statfs data */ + *osfs = tgd->tgd_osfs; + spin_unlock(&tgd->tgd_osfs_lock); + if (from_cache) + *from_cache = 1; + } + GOTO(out, rc); + +out: + return rc; +} +EXPORT_SYMBOL(tgt_statfs_internal); + +/** + * Update cached statfs information from the OSD layer + * + * Refresh statfs information cached in tgd::tgd_osfs if the cache is older + * than 1s or if force is set. The OSD layer is in charge of estimating data & + * metadata overhead. + * This function can sleep so it should not be called with any spinlock held. 
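+ *
+ * For example, with force == 0 the cached data is reused only while it is
+ * younger than OBD_STATFS_CACHE_SECONDS; force != 0 passes max_age == 0,
+ * which makes tgt_statfs_internal() refresh the cache unconditionally.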
+ * + * \param[in] env LU environment passed by the caller + * \param[in] exp export used to print client info in debug + * messages + * \param[in] force force a refresh of statfs information + * \param[out] from_cache returns whether the statfs information are + * taken from cache + */ +static void tgt_grant_statfs(const struct lu_env *env, struct obd_export *exp, + int force, int *from_cache) +{ + struct obd_device *obd = exp->exp_obd; + struct lu_target *lut = obd->u.obt.obt_lut; + struct tg_grants_data *tgd = &lut->lut_tgd; + struct tgt_thread_info *tti; + struct obd_statfs *osfs; + time64_t max_age; + int rc; + + if (force) + max_age = 0; /* get fresh statfs data */ + else + max_age = ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS; + + tti = tgt_th_info(env); + osfs = &tti->tti_u.osfs; + rc = tgt_statfs_internal(env, lut, osfs, max_age, from_cache); + if (unlikely(rc)) { + if (from_cache) + *from_cache = 0; + return; + } + + CDEBUG(D_CACHE, "%s: cli %s/%p free: %llu avail: %llu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + osfs->os_bfree << tgd->tgd_blockbits, + osfs->os_bavail << tgd->tgd_blockbits); +} + +/** + * Figure out how much space is available on the backend filesystem after + * removing grant space already booked by clients. + * + * This is done by accessing cached statfs data previously populated by + * tgt_grant_statfs(), from which we withdraw the space already granted to + * clients and the reserved space. + * Caller must hold tgd_grant_lock spinlock. + * + * \param[in] exp export associated with the device for which the amount + * of available space is requested + * \retval amount of non-allocated space, in bytes + */ +static u64 tgt_grant_space_left(struct obd_export *exp) +{ + struct obd_device *obd = exp->exp_obd; + struct lu_target *lut = obd->u.obt.obt_lut; + struct tg_grants_data *tgd = &lut->lut_tgd; + u64 tot_granted; + u64 left; + u64 avail; + u64 unstable; + u64 reserved; + + ENTRY; + assert_spin_locked(&tgd->tgd_grant_lock); + + spin_lock(&tgd->tgd_osfs_lock); + /* get available space from cached statfs data */ + left = tgd->tgd_osfs.os_bavail << tgd->tgd_blockbits; + unstable = tgd->tgd_osfs_unstable; /* those might be accounted twice */ + spin_unlock(&tgd->tgd_osfs_lock); + + reserved = left * tgd->tgd_reserved_pcnt / 100; + tot_granted = tgd->tgd_tot_granted + reserved; + + if (left < tot_granted) { + int mask = (left + unstable < + tot_granted - tgd->tgd_tot_pending) ? + D_ERROR : D_CACHE; + + /* the below message is checked in sanityn.sh test_15 */ + CDEBUG_LIMIT(mask, + "%s: cli %s/%p left=%llu < tot_grant=%llu unstable=%llu pending=%llu dirty=%llu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + left, tot_granted, unstable, + tgd->tgd_tot_pending, + tgd->tgd_tot_dirty); + RETURN(0); + } + + avail = left; + /* Withdraw space already granted to clients */ + left -= tot_granted; + + /* Align left on block size */ + left &= ~((1ULL << tgd->tgd_blockbits) - 1); + + CDEBUG(D_CACHE, + "%s: cli %s/%p avail=%llu left=%llu unstable=%llu tot_grant=%llu pending=%llu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, avail, left, + unstable, tot_granted, tgd->tgd_tot_pending); + + RETURN(left); +} + +/** + * Process grant information from obdo structure packed in incoming BRW + * and inflate grant counters if required. + * + * Grab the dirty and seen grant announcements from the incoming obdo and + * inflate all grant counters passed in the request if the client does not + * support the grant parameters. 
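+ * For example (illustrative numbers, assuming COMPAT_BSIZE_SHIFT is 12,
+ * i.e. 4KB): with a 128KB backend block size (tgd_blockbits == 17) every
+ * counter is shifted left by 5 bits, so a reported o_dirty of 1MB is
+ * accounted as 32MB on the server.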
+ * We will later calculate the client's new grant and return it.
+ * Caller must hold tgd_grant_lock spinlock.
+ *
+ * \param[in] env LU environment supplying osfs storage
+ * \param[in] exp export for which we received the request
+ * \param[in,out] oa incoming obdo sent by the client
+ * \param[in] chunk grant allocation unit
+ */
+static void tgt_grant_incoming(const struct lu_env *env, struct obd_export *exp,
+			       struct obdo *oa, long chunk)
+{
+	struct tg_export_data *ted = &exp->exp_target_data;
+	struct obd_device *obd = exp->exp_obd;
+	struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd;
+	long long dirty, dropped;
+	ENTRY;
+
+	assert_spin_locked(&tgd->tgd_grant_lock);
+
+	if ((oa->o_valid & (OBD_MD_FLBLOCKS|OBD_MD_FLGRANT)) !=
+	    (OBD_MD_FLBLOCKS|OBD_MD_FLGRANT)) {
+		oa->o_valid &= ~OBD_MD_FLGRANT;
+		RETURN_EXIT;
+	}
+
+	/* Add some margin, since there is a small race if other RPCs arrive
+	 * out-of-order and have already consumed some grant. We want to
+	 * leave this here in case there is a large error in accounting. */
+	CDEBUG(D_CACHE,
+	       "%s: cli %s/%p reports grant %llu dropped %u, local %lu\n",
+	       obd->obd_name, exp->exp_client_uuid.uuid, exp, oa->o_grant,
+	       oa->o_dropped, ted->ted_grant);
+
+	if ((long long)oa->o_dirty < 0)
+		oa->o_dirty = 0;
+
+	/* inflate grant counters if required */
+	if (!exp_grant_param_supp(exp)) {
+		u64 tmp;
+		oa->o_grant = tgt_grant_inflate(tgd, oa->o_grant);
+		oa->o_dirty = tgt_grant_inflate(tgd, oa->o_dirty);
+		/* inflation can bump client's wish to >4GB which doesn't fit
+		 * 32bit o_undirty, limit that ... */
+		tmp = tgt_grant_inflate(tgd, oa->o_undirty);
+		if (tmp >= OBD_MAX_GRANT)
+			tmp = OBD_MAX_GRANT & ~(1ULL << tgd->tgd_blockbits);
+		oa->o_undirty = tmp;
+		tmp = tgt_grant_inflate(tgd, oa->o_dropped);
+		if (tmp >= OBD_MAX_GRANT)
+			tmp = OBD_MAX_GRANT & ~(1ULL << tgd->tgd_blockbits);
+		oa->o_dropped = tmp;
+	}
+
+	dirty = oa->o_dirty;
+	dropped = oa->o_dropped;
+
+	/* Update our accounting now so that statfs takes it into account.
+	 * Note that ted_dirty is only approximate and can become incorrect
+	 * if RPCs arrive out-of-order. No important calculations depend
+	 * on ted_dirty, but we must check its sanity to avoid asserting. */
+	if (dirty > ted->ted_grant + 4 * chunk)
+		dirty = ted->ted_grant + 4 * chunk;
+	tgd->tgd_tot_dirty += dirty - ted->ted_dirty;
+	if (ted->ted_grant < dropped) {
+		CDEBUG(D_CACHE,
+		       "%s: cli %s/%p reports %llu dropped > grant %lu\n",
+		       obd->obd_name, exp->exp_client_uuid.uuid, exp, dropped,
+		       ted->ted_grant);
+		dropped = 0;
+	}
+	if (tgd->tgd_tot_granted < dropped) {
+		CERROR("%s: cli %s/%p reports %llu dropped > tot_grant %llu\n",
+		       obd->obd_name, exp->exp_client_uuid.uuid, exp,
+		       dropped, tgd->tgd_tot_granted);
+		dropped = 0;
+	}
+	tgd->tgd_tot_granted -= dropped;
+	ted->ted_grant -= dropped;
+	ted->ted_dirty = dirty;
+
+	if (ted->ted_dirty < 0 || ted->ted_grant < 0 || ted->ted_pending < 0) {
+		CERROR("%s: cli %s/%p dirty %ld pend %ld grant %ld\n",
+		       obd->obd_name, exp->exp_client_uuid.uuid, exp,
+		       ted->ted_dirty, ted->ted_pending, ted->ted_grant);
+		spin_unlock(&tgd->tgd_grant_lock);
+		LBUG();
+	}
+	EXIT;
+}
+
+/**
+ * Grant shrink request handler.
+ *
+ * Client nodes can explicitly release grant space (i.e. a process called grant
+ * shrinking). This function proceeds with the shrink request when there is
+ * less ungranted space remaining than the amount all of the connected clients
+ * would consume if they used their full grant.
+ * Caller must hold tgd_grant_lock spinlock.
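The shrink policy just described boils down to two steps, a scarcity test and a clamp. A small standalone sketch (illustrative names only; TGT_GRANT_SHRINK_LIMIT() is modeled as a plain per-client number):

#include <stdbool.h>
#include <stdint.h>

/* honor a shrink request only when the remaining ungranted space is
 * smaller than what all connected clients could claim at their full
 * per-client limit */
static bool should_shrink(uint64_t left_space, uint32_t nr_clients,
			  uint64_t shrink_limit)
{
	return left_space < (uint64_t)nr_clients * shrink_limit;
}

/* apply the shrink, clamped to what the client actually holds */
static uint64_t apply_shrink(uint64_t *client_grant, uint64_t *tot_granted,
			     uint64_t wanted)
{
	if (wanted > *client_grant)	/* cannot release more than owned */
		wanted = *client_grant;
	*client_grant -= wanted;
	*tot_granted -= wanted;
	return wanted;
}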
+ *
+ * \param[in] exp export releasing grant space
+ * \param[in,out] oa incoming obdo sent by the client
+ * \param[in] left_space remaining free space with space already granted
+ * taken out
+ */
+static void tgt_grant_shrink(struct obd_export *exp, struct obdo *oa,
+			     u64 left_space)
+{
+	struct tg_export_data *ted = &exp->exp_target_data;
+	struct obd_device *obd = exp->exp_obd;
+	struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd;
+	long grant_shrink;
+
+	assert_spin_locked(&tgd->tgd_grant_lock);
+	LASSERT(exp);
+	if (left_space >= tgd->tgd_tot_granted_clients *
+			  TGT_GRANT_SHRINK_LIMIT(exp))
+		return;
+
+	grant_shrink = oa->o_grant;
+
+	if (ted->ted_grant < grant_shrink) {
+		CDEBUG(D_CACHE,
+		       "%s: cli %s/%p wants %lu shrinked > grant %lu\n",
+		       obd->obd_name, exp->exp_client_uuid.uuid, exp,
+		       grant_shrink, ted->ted_grant);
+		grant_shrink = ted->ted_grant;
+	}
+
+	ted->ted_grant -= grant_shrink;
+	tgd->tgd_tot_granted -= grant_shrink;
+
+	CDEBUG(D_CACHE, "%s: cli %s/%p shrink %ld ted_grant %ld total %llu\n",
+	       obd->obd_name, exp->exp_client_uuid.uuid, exp, grant_shrink,
+	       ted->ted_grant, tgd->tgd_tot_granted);
+
+	/* client has just released some grant, don't grant any space back */
+	oa->o_grant = 0;
+}
+
+/**
+ * Calculate how much space is required to write a given network buffer
+ *
+ * This function takes block alignment into account to estimate how much on-disk
+ * space will be required to successfully write the whole niobuf.
+ * Estimated space is inflated if the export does not support
+ * OBD_CONNECT_GRANT_PARAM and if the backend filesystem has a block size
+ * larger than the minimal supported page size (i.e. 4KB).
+ *
+ * \param[in] exp export associated with the write request
+ * if NULL, then size estimate is done for server-side
+ * grant allocation.
+ * \param[in] lut LU target handling the request
+ * \param[in] rnb network buffer to estimate size of
+ *
+ * \retval space (in bytes) that will be consumed to write the
+ * network buffer
+ */
+static inline u64 tgt_grant_rnb_size(struct obd_export *exp,
+				     struct lu_target *lut,
+				     struct niobuf_remote *rnb)
+{
+	struct tg_grants_data *tgd = &lut->lut_tgd;
+	u64 blksize;
+	u64 bytes;
+	u64 end;
+
+	if (exp && !exp_grant_param_supp(exp) &&
+	    tgd->tgd_blockbits > COMPAT_BSIZE_SHIFT)
+		blksize = 1ULL << COMPAT_BSIZE_SHIFT;
+	else
+		blksize = 1ULL << tgd->tgd_blockbits;
+
+	/* The network buffer might span several blocks, align it on block
+	 * boundaries */
+	bytes = rnb->rnb_offset & (blksize - 1);
+	bytes += rnb->rnb_len;
+	end = bytes & (blksize - 1);
+	if (end)
+		bytes += blksize - end;
+
+	if (exp == NULL || exp_grant_param_supp(exp)) {
+		/* add per-extent insertion cost */
+		u64 max_ext;
+		int nr_ext;
+
+		max_ext = blksize * lut->lut_dt_conf.ddp_max_extent_blks;
+		nr_ext = (bytes + max_ext - 1) / max_ext;
+		bytes += nr_ext * lut->lut_dt_conf.ddp_extent_tax;
+	} else {
+		/* Inflate grant space if client does not support extent-based
+		 * grant allocation */
+		bytes = tgt_grant_inflate(tgd, (u64)bytes);
+	}
+
+	return bytes;
+}
+
+/**
+ * Validate grant accounting for each incoming remote network buffer.
+ *
+ * When clients have dirtied as much space as they've been granted they
+ * fall through to sync writes. These sync writes haven't been expressed
+ * in grants and need to error with ENOSPC when there isn't room in the
+ * filesystem for them after grants are taken into account. However,
+ * writeback of the dirty data that was already granted space can write
+ * right on through.
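A worked example of the size estimate performed by tgt_grant_rnb_size() above, as standalone C (the extent parameters are made-up sample values; in the kernel they come from the OSD's lut_dt_conf):

#include <stdint.h>
#include <stdio.h>

static uint64_t rnb_size(uint64_t offset, uint64_t len, unsigned blockbits,
			 uint64_t max_extent_blks, uint64_t extent_tax)
{
	uint64_t blksize = 1ULL << blockbits;
	uint64_t bytes, end, max_ext, nr_ext;

	/* align the buffer on block boundaries */
	bytes = offset & (blksize - 1);
	bytes += len;
	end = bytes & (blksize - 1);
	if (end)
		bytes += blksize - end;

	/* charge a per-extent insertion tax for metadata overhead */
	max_ext = blksize * max_extent_blks;
	nr_ext = (bytes + max_ext - 1) / max_ext;
	return bytes + nr_ext * extent_tax;
}

int main(void)
{
	/* 10KB written at offset 6KB with 4KB blocks: 2KB head padding
	 * plus 10KB is 12KB, already block-aligned, one extent, so the
	 * estimate is 12KB plus one 4KB extent tax */
	printf("%llu\n", (unsigned long long)
	       rnb_size(6144, 10240, 12, 1024, 4096));	/* prints 16384 */
	return 0;
}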
+ * The OBD_BRW_GRANTED flag will be set in the rnb_flags of each network
+ * buffer which has been granted enough space to proceed. Buffers without
+ * this flag will fail to be written with -ENOSPC (see tgt_preprw_write()).
+ * Caller must hold tgd_grant_lock spinlock.
+ *
+ * \param[in] env LU environment passed by the caller
+ * \param[in] exp export identifying the client which sent the RPC
+ * \param[in] oa incoming obdo in which we should pack the
+ * additional grant
+ * \param[in,out] rnb the list of network buffers
+ * \param[in] niocount the number of network buffers in the list
+ * \param[in] left the remaining free space with space already granted
+ * taken out
+ */
+static void tgt_grant_check(const struct lu_env *env, struct obd_export *exp,
+			    struct obdo *oa, struct niobuf_remote *rnb,
+			    int niocount, u64 *left)
+{
+	struct tg_export_data *ted = &exp->exp_target_data;
+	struct obd_device *obd = exp->exp_obd;
+	struct lu_target *lut = obd->u.obt.obt_lut;
+	struct tg_grants_data *tgd = &lut->lut_tgd;
+	unsigned long ungranted = 0;
+	unsigned long granted = 0;
+	int i;
+	bool skip = false;
+
+	ENTRY;
+
+	assert_spin_locked(&tgd->tgd_grant_lock);
+
+	if (obd->obd_recovering) {
+		/* Replaying write. Grant info has been processed already so no
+		 * need to do any enforcement here. It is worth noting that only
+		 * bulk writes with all rnbs having OBD_BRW_FROM_GRANT can be
+		 * replayed. If one page doesn't have OBD_BRW_FROM_GRANT set,
+		 * then the whole bulk is written synchronously */
+		skip = true;
+		CDEBUG(D_CACHE, "Replaying write, skipping accounting\n");
+	} else if ((oa->o_valid & OBD_MD_FLFLAGS) &&
+		   (oa->o_flags & OBD_FL_RECOV_RESEND)) {
+		/* Recoverable resend, grant info has already been processed
+		 * as well */
+		skip = true;
+		CDEBUG(D_CACHE, "Recoverable resend arrived, skipping "
+		       "accounting\n");
+	} else if (exp_grant_param_supp(exp) && oa->o_grant_used > 0) {
+		/* Client supports the new grant parameters and is telling us
+		 * how much grant space it consumed for this bulk write.
+		 * Although all rnbs are supposed to have the OBD_BRW_FROM_GRANT
+		 * flag set, we will scan the rnb list and look for non-cache
+		 * I/O in case it changes in the future */
+		if (ted->ted_grant >= oa->o_grant_used) {
+			/* skip grant accounting for rnbs with
+			 * OBD_BRW_FROM_GRANT and just use the grant consumption
+			 * claimed in the request */
+			granted = oa->o_grant_used;
+			skip = true;
+		} else {
+			/* client has used more grants for this request than
+			 * it owns ... */
+			CERROR("%s: cli %s claims %lu GRANT, real grant %lu\n",
+			       exp->exp_obd->obd_name,
+			       exp->exp_client_uuid.uuid,
+			       (unsigned long)oa->o_grant_used, ted->ted_grant);
+
+			/* check whether we can fill the gap with unallocated
+			 * grant */
+			if (*left > (oa->o_grant_used - ted->ted_grant)) {
+				/* phew ... we are safe for now */
+				granted = ted->ted_grant;
+				ungranted = oa->o_grant_used - granted;
+				*left -= ungranted;
+				skip = true;
+			}
+			/* too bad, but we cannot afford to blow up our grant
+			 * accounting. The loop below will handle each rnb
+			 * case by case.
*/
+		}
+	}
+
+	for (i = 0; i < niocount; i++) {
+		int bytes;
+
+		if ((rnb[i].rnb_flags & OBD_BRW_FROM_GRANT)) {
+			if (skip) {
+				rnb[i].rnb_flags |= OBD_BRW_GRANTED;
+				continue;
+			}
+
+			/* compute how much grant space is actually needed for
+			 * this rnb, inflate grant if required */
+			bytes = tgt_grant_rnb_size(exp, lut, &rnb[i]);
+			if (ted->ted_grant >= granted + bytes) {
+				granted += bytes;
+				rnb[i].rnb_flags |= OBD_BRW_GRANTED;
+				continue;
+			}
+
+			CDEBUG(D_CACHE, "%s: cli %s/%p claims %ld+%d GRANT, "
+			       "real grant %lu idx %d\n", obd->obd_name,
+			       exp->exp_client_uuid.uuid, exp, granted, bytes,
+			       ted->ted_grant, i);
+		}
+
+		if (obd->obd_recovering)
+			CERROR("%s: cli %s is replaying OST_WRITE while one rnb"
+			       " hasn't OBD_BRW_FROM_GRANT set (0x%x)\n",
+			       obd->obd_name, exp->exp_client_uuid.uuid,
+			       rnb[i].rnb_flags);
+
+		/* Consume grant space on the server.
+		 * Unlike above, tgt_grant_rnb_size() is called with exp = NULL
+		 * so that the required grant space isn't inflated. This is
+		 * done on purpose since the server can deal with large block
+		 * size, unlike some clients */
+		bytes = tgt_grant_rnb_size(NULL, lut, &rnb[i]);
+		if (*left > bytes) {
+			/* if enough space, pretend it was granted */
+			ungranted += bytes;
+			*left -= bytes;
+			rnb[i].rnb_flags |= OBD_BRW_GRANTED;
+			continue;
+		}
+
+		/* We can't check for already-mapped blocks here (this makes
+		 * sense when the backend filesystem does not use COW) as it
+		 * requires dropping the grant lock.
+		 * Instead, we clear OBD_BRW_GRANTED and in that case we need
+		 * to go through and verify if all of the blocks not marked
+		 * BRW_GRANTED are already mapped and we can ignore this error.
+		 */
+		rnb[i].rnb_flags &= ~OBD_BRW_GRANTED;
+		CDEBUG(D_CACHE, "%s: cli %s/%p idx %d no space for %d\n",
+		       obd->obd_name, exp->exp_client_uuid.uuid, exp, i, bytes);
+	}
+
+	/* record in o_grant_used the actual space reserved for the I/O, will be
+	 * used later in tgt_grant_commit() */
+	oa->o_grant_used = granted + ungranted;
+
+	/* Now subtract what the client has used already. We don't subtract
+	 * this from the tot_granted yet, so that other clients can't grab
+	 * that space before we have actually allocated our blocks. That
+	 * happens in tgt_grant_commit() after the writes are done. */
+	ted->ted_grant -= granted;
+	ted->ted_pending += oa->o_grant_used;
+	tgd->tgd_tot_granted += ungranted;
+	tgd->tgd_tot_pending += oa->o_grant_used;
+
+	CDEBUG(D_CACHE,
+	       "%s: cli %s/%p granted: %lu ungranted: %lu grant: %lu dirty: %lu"
+	       "\n", obd->obd_name, exp->exp_client_uuid.uuid, exp,
+	       granted, ungranted, ted->ted_grant, ted->ted_dirty);
+
+	if (obd->obd_recovering || (oa->o_valid & OBD_MD_FLGRANT) == 0)
+		/* don't update dirty accounting during recovery or
+		 * if grant information got discarded (e.g.
during resend) */
+		RETURN_EXIT;
+
+	if (ted->ted_dirty < granted) {
+		CWARN("%s: cli %s/%p claims granted %lu > ted_dirty %lu\n",
+		      obd->obd_name, exp->exp_client_uuid.uuid, exp,
+		      granted, ted->ted_dirty);
+		granted = ted->ted_dirty;
+	}
+	tgd->tgd_tot_dirty -= granted;
+	ted->ted_dirty -= granted;
+
+	if (ted->ted_dirty < 0 || ted->ted_grant < 0 || ted->ted_pending < 0) {
+		CERROR("%s: cli %s/%p dirty %ld pend %ld grant %ld\n",
+		       obd->obd_name, exp->exp_client_uuid.uuid, exp,
+		       ted->ted_dirty, ted->ted_pending, ted->ted_grant);
+		spin_unlock(&tgd->tgd_grant_lock);
+		LBUG();
+	}
+	EXIT;
+}
+
+/**
+ * Allocate additional grant space to a client
+ *
+ * Calculate how much grant space to return to the client, based on how much
+ * space is currently free and how much of that is already granted.
+ * Caller must hold tgd_grant_lock spinlock.
+ *
+ * \param[in] exp export of the client which sent the request
+ * \param[in] curgrant current grant claimed by the client
+ * \param[in] want how much grant space the client would like to
+ * have
+ * \param[in] left remaining free space with granted space taken
+ * out
+ * \param[in] chunk grant allocation unit
+ * \param[in] conservative if set to true, the server should be cautious
+ * and limit how much space is granted back to the
+ * client. Otherwise, the server should try hard to
+ * satisfy the client request.
+ *
+ * \retval amount of grant space allocated
+ */
+static long tgt_grant_alloc(struct obd_export *exp, u64 curgrant,
+			    u64 want, u64 left, long chunk,
+			    bool conservative)
+{
+	struct obd_device *obd = exp->exp_obd;
+	struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd;
+	struct tg_export_data *ted = &exp->exp_target_data;
+	u64 grant;
+
+	ENTRY;
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_TGT_NO_GRANT))
+		RETURN(0);
+
+	/* When tgd_grant_compat_disable is set, we don't grant any space to
+	 * clients not supporting OBD_CONNECT_GRANT_PARAM.
+	 * Otherwise, space granted to such a client is inflated since it
+	 * consumes PAGE_SIZE of grant space per block */
+	if ((obd->obd_self_export != exp && !exp_grant_param_supp(exp) &&
+	     tgd->tgd_grant_compat_disable) || left == 0 || exp->exp_failed)
+		RETURN(0);
+
+	if (want > OBD_MAX_GRANT) {
+		CERROR("%s: client %s/%p requesting > max (%lu), %llu\n",
+		       obd->obd_name, exp->exp_client_uuid.uuid, exp,
+		       OBD_MAX_GRANT, want);
+		RETURN(0);
+	}
+
+	/* Grant some fraction of the client's requested grant space so that
+	 * they are not always waiting for write credits (not all of it to
+	 * avoid overgranting in the face of multiple RPCs in flight). This
+	 * essentially makes it possible to control the OSC_MAX_RIF for a
+	 * client.
+	 *
+	 * If we do have a large disparity between what the client thinks it
+	 * has and what we think it has, don't grant very much and let the
+	 * client consume its grant first. Either it just has lots of RPCs
+	 * in flight, or it was evicted and its grants will soon be used up.
*/
+	if (curgrant >= want || curgrant >= ted->ted_grant + chunk)
+		RETURN(0);
+
+	if (obd->obd_recovering)
+		conservative = false;
+
+	if (conservative)
+		/* don't grant more than 1/8th of the remaining free space in
+		 * one chunk */
+		left >>= 3;
+	grant = min(want - curgrant, left);
+	/* round grant up to the next block size */
+	grant = (grant + (1 << tgd->tgd_blockbits) - 1) &
+		~((1ULL << tgd->tgd_blockbits) - 1);
+
+	if (!grant)
+		RETURN(0);
+
+	/* Limit to grant_chunk if not reconnect/recovery */
+	if ((grant > chunk) && conservative)
+		grant = chunk;
+
+	/*
+	 * Limit grant so that the export's grant does not exceed what the
+	 * client would like to have by more than grants for 2 full
+	 * RPCs
+	 */
+	if (want + chunk <= ted->ted_grant)
+		RETURN(0);
+	if (ted->ted_grant + grant > want + chunk)
+		grant = want + chunk - ted->ted_grant;
+
+	tgd->tgd_tot_granted += grant;
+	ted->ted_grant += grant;
+
+	if (unlikely(ted->ted_grant < 0 || ted->ted_grant > want + chunk)) {
+		CERROR("%s: cli %s/%p grant %ld want %llu current %llu\n",
+		       obd->obd_name, exp->exp_client_uuid.uuid, exp,
+		       ted->ted_grant, want, curgrant);
+		if (lbug_on_grant_miscount) {
+			spin_unlock(&tgd->tgd_grant_lock);
+			LBUG();
+		}
+	}
+
+	CDEBUG(D_CACHE,
+	       "%s: cli %s/%p wants: %llu current grant %llu"
+	       " granting: %llu\n", obd->obd_name, exp->exp_client_uuid.uuid,
+	       exp, want, curgrant, grant);
+	CDEBUG(D_CACHE,
+	       "%s: cli %s/%p tot cached:%llu granted:%llu"
+	       " num_exports: %d\n", obd->obd_name, exp->exp_client_uuid.uuid,
+	       exp, tgd->tgd_tot_dirty, tgd->tgd_tot_granted,
+	       obd->obd_num_exports);
+
+	RETURN(grant);
+}
+
+/**
+ * Handle grant space allocation on client connection & reconnection.
+ *
+ * A new non-readonly connection gets an initial grant allocation equal to
+ * tgt_grant_chunk() (i.e. twice the max BRW size in most cases).
+ * On reconnection, grant counters between client & target are resynchronized
+ * and additional space might be granted back if possible.
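Stripped of fail-injection, recovery handling and overflow checks, the clamping sequence of tgt_grant_alloc() above reduces to something like the following sketch (the final cap of the export's total grant against want plus one chunk is omitted for brevity):

#include <stdbool.h>
#include <stdint.h>

static uint64_t grant_alloc(uint64_t cur, uint64_t want, uint64_t left,
			    uint64_t chunk, unsigned blockbits,
			    bool conservative)
{
	uint64_t grant, blk = 1ULL << blockbits;

	if (cur >= want || left == 0)
		return 0;			/* client already has enough */

	if (conservative)
		left >>= 3;			/* at most 1/8 of free space */

	grant = want - cur;
	if (grant > left)
		grant = left;
	grant = (grant + blk - 1) & ~(blk - 1);	/* round up to a block */

	if (conservative && grant > chunk)
		grant = chunk;			/* one chunk per RPC normally */
	return grant;
}

The conservative path deliberately under-grants: a busy client will simply ask again on its next write RPC, whereas over-granting to many clients at once could promise more space than the filesystem has.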
+ *
+ * \param[in] env LU environment provided by the caller
+ * \param[in] exp client's export which is (re)connecting
+ * \param[in,out] data obd_connect_data structure sent by the client in the
+ * connect request
+ * \param[in] new_conn must be set to true if this is a new connection and
+ * false for a reconnection
+ */
+void tgt_grant_connect(const struct lu_env *env, struct obd_export *exp,
+		       struct obd_connect_data *data, bool new_conn)
+{
+	struct lu_target *lut = exp->exp_obd->u.obt.obt_lut;
+	struct tg_grants_data *tgd = &lut->lut_tgd;
+	struct tg_export_data *ted = &exp->exp_target_data;
+	u64 left = 0;
+	u64 want;
+	long chunk;
+	int from_cache;
+	int force = 0; /* can use cached data */
+
+	/* don't grant space to clients with read-only access */
+	if (OCD_HAS_FLAG(data, RDONLY) ||
+	    (!OCD_HAS_FLAG(data, GRANT_PARAM) &&
+	     tgd->tgd_grant_compat_disable)) {
+		data->ocd_grant = 0;
+		data->ocd_connect_flags &= ~(OBD_CONNECT_GRANT |
+					     OBD_CONNECT_GRANT_PARAM);
+		RETURN_EXIT;
+	}
+
+	if (OCD_HAS_FLAG(data, GRANT_PARAM))
+		want = data->ocd_grant;
+	else
+		want = tgt_grant_inflate(tgd, data->ocd_grant);
+	chunk = tgt_grant_chunk(exp, lut, data);
+refresh:
+	tgt_grant_statfs(env, exp, force, &from_cache);
+
+	spin_lock(&tgd->tgd_grant_lock);
+
+	/* Grab free space from cached info and take out space already granted
+	 * to clients as well as reserved space */
+	left = tgt_grant_space_left(exp);
+
+	/* get fresh statfs data if we are short in ungranted space */
+	if (from_cache && left < 32 * chunk) {
+		spin_unlock(&tgd->tgd_grant_lock);
+		CDEBUG(D_CACHE, "fs has no space left and statfs too old\n");
+		force = 1;
+		goto refresh;
+	}
+
+	tgt_grant_alloc(exp, (u64)ted->ted_grant, want, left, chunk, new_conn);
+
+	/* return to client its current grant */
+	if (OCD_HAS_FLAG(data, GRANT_PARAM))
+		data->ocd_grant = ted->ted_grant;
+	else
+		/* deflate grant */
+		data->ocd_grant = tgt_grant_deflate(tgd, (u64)ted->ted_grant);
+
+	/* reset dirty accounting */
+	tgd->tgd_tot_dirty -= ted->ted_dirty;
+	ted->ted_dirty = 0;
+
+	if (new_conn && OCD_HAS_FLAG(data, GRANT))
+		tgd->tgd_tot_granted_clients++;
+
+	spin_unlock(&tgd->tgd_grant_lock);
+
+	CDEBUG(D_CACHE, "%s: cli %s/%p ocd_grant: %d want: %llu left: %llu\n",
+	       exp->exp_obd->obd_name, exp->exp_client_uuid.uuid,
+	       exp, data->ocd_grant, want, left);
+
+	EXIT;
+}
+EXPORT_SYMBOL(tgt_grant_connect);
+
+/**
+ * Release all grant space attached to a given export.
+ *
+ * Remove a client from the grant accounting totals. We also remove
+ * the export from the obd device under the osfs and dev locks to ensure
+ * that the tgt_grant_sanity_check() calculations are always valid.
+ * The client should do something similar when it invalidates its import.
+ *
+ * \param[in] exp client's export to remove from grant accounting
+ */
+void tgt_grant_discard(struct obd_export *exp)
+{
+	struct obd_device *obd = exp->exp_obd;
+	struct lu_target *lut = class_exp2tgt(exp);
+	struct tg_export_data *ted = &exp->exp_target_data;
+	struct tg_grants_data *tgd;
+
+	if (!lut)
+		return;
+
+	tgd = &lut->lut_tgd;
+	spin_lock(&tgd->tgd_grant_lock);
+	if (unlikely(tgd->tgd_tot_granted < ted->ted_grant ||
+		     tgd->tgd_tot_dirty < ted->ted_dirty)) {
+		struct obd_export *e;
+		u64 ttg = 0;
+		u64 ttd = 0;
+
+		list_for_each_entry(e, &obd->obd_exports, exp_obd_chain) {
+			LASSERT(exp != e);
+			ttg += e->exp_target_data.ted_grant;
+			ttg += e->exp_target_data.ted_pending;
+			ttd += e->exp_target_data.ted_dirty;
+		}
+		if (tgd->tgd_tot_granted < ted->ted_grant)
+			CERROR("%s: cli %s/%p: tot_granted %llu < ted_grant %ld, corrected to %llu",
+			       obd->obd_name, exp->exp_client_uuid.uuid, exp,
+			       tgd->tgd_tot_granted, ted->ted_grant, ttg);
+		if (tgd->tgd_tot_dirty < ted->ted_dirty)
+			CERROR("%s: cli %s/%p: tot_dirty %llu < ted_dirty %ld, corrected to %llu",
+			       obd->obd_name, exp->exp_client_uuid.uuid, exp,
+			       tgd->tgd_tot_dirty, ted->ted_dirty, ttd);
+		tgd->tgd_tot_granted = ttg;
+		tgd->tgd_tot_dirty = ttd;
+	} else {
+		tgd->tgd_tot_granted -= ted->ted_grant;
+		tgd->tgd_tot_dirty -= ted->ted_dirty;
+	}
+	ted->ted_grant = 0;
+	ted->ted_dirty = 0;
+
+	if (tgd->tgd_tot_pending < ted->ted_pending) {
+		CERROR("%s: tot_pending %llu < cli %s/%p ted_pending %ld\n",
+		       obd->obd_name, tgd->tgd_tot_pending,
+		       exp->exp_client_uuid.uuid, exp, ted->ted_pending);
+	}
+	/* tgd_tot_pending is handled in tgt_grant_commit as bulk
+	 * commits */
+	spin_unlock(&tgd->tgd_grant_lock);
+}
+EXPORT_SYMBOL(tgt_grant_discard);
+
+/**
+ * Process grant information from incoming bulk read request.
+ *
+ * Extract grant information packed in obdo structure (OBD_MD_FLGRANT set in
+ * o_valid). Bulk reads usually come with grant announcements (number of dirty
+ * blocks, remaining amount of grant space, ...) and could also include a grant
+ * shrink request. Unlike bulk write, no additional grant space is returned on
+ * bulk read request.
+ *
+ * \param[in] env is the lu environment provided by the caller
+ * \param[in] exp is the export of the client which sent the request
+ * \param[in,out] oa is the incoming obdo sent by the client
+ */
+void tgt_grant_prepare_read(const struct lu_env *env,
+			    struct obd_export *exp, struct obdo *oa)
+{
+	struct lu_target *lut = exp->exp_obd->u.obt.obt_lut;
+	struct tg_grants_data *tgd = &lut->lut_tgd;
+	int do_shrink;
+	u64 left = 0;
+
+	ENTRY;
+
+	if (!oa)
+		RETURN_EXIT;
+
+	if ((oa->o_valid & OBD_MD_FLGRANT) == 0)
+		/* The read request does not contain any grant
+		 * information */
+		RETURN_EXIT;
+
+	if ((oa->o_valid & OBD_MD_FLFLAGS) &&
+	    (oa->o_flags & OBD_FL_SHRINK_GRANT)) {
+		/* To process grant shrink request, we need to know how much
+		 * available space remains on the backend filesystem.
+		 * Shrink requests are not so common, so we always get fresh
+		 * statfs information.
*/
+		tgt_grant_statfs(env, exp, 1, NULL);
+
+		/* protect all grant counters */
+		spin_lock(&tgd->tgd_grant_lock);
+
+		/* Grab free space from cached statfs data and take out space
+		 * already granted to clients as well as reserved space */
+		left = tgt_grant_space_left(exp);
+
+		/* all set now to proceed with shrinking */
+		do_shrink = 1;
+	} else {
+		/* no grant shrinking request packed in the obdo and
+		 * since we don't grant space back on reads, no point
+		 * in running statfs, so just skip it and process
+		 * incoming grant data directly. */
+		spin_lock(&tgd->tgd_grant_lock);
+		do_shrink = 0;
+	}
+
+	/* extract incoming grant information provided by the client and
+	 * inflate grant counters if required */
+	tgt_grant_incoming(env, exp, oa, tgt_grant_chunk(exp, lut, NULL));
+
+	/* unlike writes, we don't return grants back on reads unless a grant
+	 * shrink request was packed and we decided to turn it down. */
+	if (do_shrink)
+		tgt_grant_shrink(exp, oa, left);
+	else
+		oa->o_grant = 0;
+
+	if (!exp_grant_param_supp(exp))
+		oa->o_grant = tgt_grant_deflate(tgd, oa->o_grant);
+	spin_unlock(&tgd->tgd_grant_lock);
+	EXIT;
+}
+EXPORT_SYMBOL(tgt_grant_prepare_read);
+
+/**
+ * Process grant information from incoming bulk write request.
+ *
+ * This function extracts the client's grant announcements from the incoming
+ * bulk write request and attempts to allocate grant space for network buffers
+ * that need it (i.e. OBD_BRW_FROM_GRANT not set in rnb_flags).
+ * Network buffers which aren't granted the OBD_BRW_GRANTED flag should not
+ * proceed further and should fail with -ENOSPC.
+ * Whenever possible, additional grant space will be returned to the client
+ * in the bulk write reply.
+ * tgt_grant_prepare_write() must be called before writing any buffers to
+ * the backend storage. This function works in tandem with tgt_grant_commit()
+ * which must be invoked once all buffers have been written to disk in order
+ * to release space from the pending grant counter.
+ *
+ * \param[in] env LU environment provided by the caller
+ * \param[in] exp export of the client which sent the request
+ * \param[in] oa incoming obdo sent by the client
+ * \param[in] rnb list of network buffers
+ * \param[in] niocount number of network buffers in the list
+ */
+void tgt_grant_prepare_write(const struct lu_env *env,
+			     struct obd_export *exp, struct obdo *oa,
+			     struct niobuf_remote *rnb, int niocount)
+{
+	struct obd_device *obd = exp->exp_obd;
+	struct lu_target *lut = obd->u.obt.obt_lut;
+	struct tg_grants_data *tgd = &lut->lut_tgd;
+	u64 left;
+	int from_cache;
+	int force = 0; /* can use cached data initially */
+	long chunk = tgt_grant_chunk(exp, lut, NULL);
+
+	ENTRY;
+
+refresh:
+	/* get statfs information from OSD layer */
+	tgt_grant_statfs(env, exp, force, &from_cache);
+
+	spin_lock(&tgd->tgd_grant_lock); /* protect all grant counters */
+
+	/* Grab free space from cached statfs data and take out space already
+	 * granted to clients as well as reserved space */
+	left = tgt_grant_space_left(exp);
+
+	/* Get fresh statfs data if we are short in ungranted space */
+	if (from_cache && left < 32 * chunk) {
+		spin_unlock(&tgd->tgd_grant_lock);
+		CDEBUG(D_CACHE, "%s: fs has no space left and statfs too old\n",
+		       obd->obd_name);
+		force = 1;
+		goto refresh;
+	}
+
+	/* When close to free space exhaustion, trigger a sync to force
+	 * writeback cache to consume required space immediately and release as
+	 * much space as possible.
*/ + if (!obd->obd_recovering && force != 2 && left < chunk) { + bool from_grant = true; + int i; + + /* That said, it is worth running a sync only if some pages did + * not consume grant space on the client and could thus fail + * with ENOSPC later in tgt_grant_check() */ + for (i = 0; i < niocount; i++) + if (!(rnb[i].rnb_flags & OBD_BRW_FROM_GRANT)) + from_grant = false; + + if (!from_grant) { + /* at least one network buffer requires acquiring grant + * space on the server */ + spin_unlock(&tgd->tgd_grant_lock); + /* discard errors, at least we tried ... */ + dt_sync(env, lut->lut_bottom); + force = 2; + goto refresh; + } + } + + /* extract incoming grant information provided by the client, + * and inflate grant counters if required */ + tgt_grant_incoming(env, exp, oa, chunk); + + /* check limit */ + tgt_grant_check(env, exp, oa, rnb, niocount, &left); + + if (!(oa->o_valid & OBD_MD_FLGRANT)) { + spin_unlock(&tgd->tgd_grant_lock); + RETURN_EXIT; + } + + /* if OBD_FL_SHRINK_GRANT is set, the client is willing to release some + * grant space. */ + if ((oa->o_valid & OBD_MD_FLFLAGS) && + (oa->o_flags & OBD_FL_SHRINK_GRANT)) + tgt_grant_shrink(exp, oa, left); + else + /* grant more space back to the client if possible */ + oa->o_grant = tgt_grant_alloc(exp, oa->o_grant, oa->o_undirty, + left, chunk, true); + + if (!exp_grant_param_supp(exp)) + oa->o_grant = tgt_grant_deflate(tgd, oa->o_grant); + spin_unlock(&tgd->tgd_grant_lock); + EXIT; +} +EXPORT_SYMBOL(tgt_grant_prepare_write); + +/** + * Consume grant space reserved for object creation. + * + * Grant space is allocated to the local self export for object precreation. + * This is required to prevent object precreation from consuming grant space + * allocated to client nodes for the data writeback cache. + * This function consumes enough space to create \a nr objects and allocates + * more grant space to the self export for future precreation requests, if + * possible. 
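The sizing rule described above is simple enough to state in a few lines of standalone C; here 'inodespace' models lut_dt_conf.ddp_inodespace, the estimated per-object cost, and everything else is illustrative:

#include <stdint.h>
#include <stdio.h>

static int64_t clamp_precreate(int64_t *nr, uint64_t inodespace,
			       uint64_t grant, uint64_t left)
{
	uint64_t wanted = (uint64_t)*nr * inodespace;

	if (wanted > grant + left) {
		/* shrink the request to what available space can back */
		*nr = (grant + left) / inodespace;
		if (*nr == 0)
			return -1;		/* would be -ENOSPC */
		wanted = (uint64_t)*nr * inodespace;
	}
	return (int64_t)wanted;		/* bytes moved to 'pending' */
}

int main(void)
{
	int64_t nr = 10000;
	/* 512 bytes per object, 1MB of grant, 2MB unallocated: only
	 * (1MB + 2MB) / 512 = 6144 objects fit */
	printf("%lld bytes for %lld objects\n",
	       (long long)clamp_precreate(&nr, 512, 1 << 20, 2 << 20),
	       (long long)nr);
	return 0;
}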
+ *
+ * \param[in] env LU environment provided by the caller
+ * \param[in] exp export holding the grant space for precreation (= self
+ * export currently)
+ * \param[in] nr number of objects to be created
+ *
+ * \retval >= 0 amount of grant space allocated to the precreate request
+ * \retval -ENOSPC on failure
+ */
+long tgt_grant_create(const struct lu_env *env, struct obd_export *exp, s64 *nr)
+{
+	struct lu_target *lut = exp->exp_obd->u.obt.obt_lut;
+	struct tg_grants_data *tgd = &lut->lut_tgd;
+	struct tg_export_data *ted = &exp->exp_target_data;
+	u64 left = 0;
+	unsigned long wanted;
+	unsigned long granted;
+	ENTRY;
+
+	if (exp->exp_obd->obd_recovering ||
+	    lut->lut_dt_conf.ddp_inodespace == 0)
+		/* don't enforce grant during recovery */
+		RETURN(0);
+
+	/* Update statfs data if required */
+	tgt_grant_statfs(env, exp, 1, NULL);
+
+	/* protect all grant counters */
+	spin_lock(&tgd->tgd_grant_lock);
+
+	/* fail precreate request if there are not enough blocks available for
+	 * writing */
+	if (tgd->tgd_osfs.os_bavail - (ted->ted_grant >> tgd->tgd_blockbits) <
+	    (tgd->tgd_osfs.os_blocks >> 10)) {
+		spin_unlock(&tgd->tgd_grant_lock);
+		CDEBUG(D_RPCTRACE, "%s: not enough space for create %llu\n",
+		       exp->exp_obd->obd_name,
+		       tgd->tgd_osfs.os_bavail * tgd->tgd_osfs.os_blocks);
+		RETURN(-ENOSPC);
+	}
+
+	/* Grab free space from cached statfs data and take out space
+	 * already granted to clients as well as reserved space */
+	left = tgt_grant_space_left(exp);
+
+	/* compute how much space is required to handle the precreation
+	 * request */
+	wanted = *nr * lut->lut_dt_conf.ddp_inodespace;
+	if (wanted > ted->ted_grant + left) {
+		/* that's beyond what remains, adjust the number of objects that
+		 * can be safely precreated */
+		wanted = ted->ted_grant + left;
+		*nr = wanted / lut->lut_dt_conf.ddp_inodespace;
+		if (*nr == 0) {
+			/* we really have no space any more for precreation,
+			 * fail the precreate request with ENOSPC */
+			spin_unlock(&tgd->tgd_grant_lock);
+			RETURN(-ENOSPC);
+		}
+		/* compute space needed for the new number of creations */
+		wanted = *nr * lut->lut_dt_conf.ddp_inodespace;
+	}
+	LASSERT(wanted <= ted->ted_grant + left);
+
+	if (wanted <= ted->ted_grant) {
+		/* we have enough grant space to handle this precreate request */
+		ted->ted_grant -= wanted;
+	} else {
+		/* we need to take some space from the ungranted pool */
+		tgd->tgd_tot_granted += wanted - ted->ted_grant;
+		left -= wanted - ted->ted_grant;
+		ted->ted_grant = 0;
+	}
+	granted = wanted;
+	ted->ted_pending += granted;
+	tgd->tgd_tot_pending += granted;
+
+	/* grant more space for precreate purposes if possible. */
+	wanted = OST_MAX_PRECREATE * lut->lut_dt_conf.ddp_inodespace / 2;
+	if (wanted > ted->ted_grant) {
+		long chunk;
+
+		/* always try to book enough space to handle a large precreate
+		 * request */
+		chunk = tgt_grant_chunk(exp, lut, NULL);
+		wanted -= ted->ted_grant;
+		tgt_grant_alloc(exp, ted->ted_grant, wanted, left, chunk,
+				false);
+	}
+	spin_unlock(&tgd->tgd_grant_lock);
+	RETURN(granted);
+}
+EXPORT_SYMBOL(tgt_grant_create);
+
+/**
+ * Release grant space added to the pending counter by tgt_grant_prepare_write()
+ *
+ * Update pending grant counter once buffers have been written to the disk.
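Before the commit path below, it may help to see the whole reserve/release cycle as a toy model. This is not the kernel code, just the counter transitions performed by tgt_grant_check() at prepare time and by tgt_grant_commit() afterwards, without any locking:

#include <stdint.h>

struct totals { uint64_t granted, pending; };	/* tgd_tot_* counters */
struct client { uint64_t grant, pending; };	/* ted_* counters */

/* prepare: reserve space, partly from the client's grant, partly from
 * the ungranted pool */
static void write_prepare(struct totals *t, struct client *c,
			  uint64_t granted, uint64_t ungranted)
{
	c->grant -= granted;		/* consume the client's grant */
	c->pending += granted + ungranted;
	t->granted += ungranted;	/* server-side reservation */
	t->pending += granted + ungranted;
}

/* commit: blocks are on disk, drop the reservation */
static void write_commit(struct totals *t, struct client *c, uint64_t used)
{
	c->pending -= used;		/* used = granted + ungranted */
	t->granted -= used;
	t->pending -= used;
}

Over a full prepare/commit cycle, tot_granted drops by exactly the grant the client consumed, while pending returns to zero once the blocks are on disk.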
+ *
+ * \param[in] exp export of the client which sent the request
+ * \param[in] pending amount of reserved space to be released
+ * \param[in] rc return code of pre-commit operations
+ */
+void tgt_grant_commit(struct obd_export *exp, unsigned long pending,
+		      int rc)
+{
+	struct tg_grants_data *tgd = &exp->exp_obd->u.obt.obt_lut->lut_tgd;
+
+	ENTRY;
+
+	/* get space accounted in tot_pending for the I/O, set in
+	 * tgt_grant_check() */
+	if (pending == 0)
+		RETURN_EXIT;
+
+	spin_lock(&tgd->tgd_grant_lock);
+	/* Don't update statfs data for errors raised before commit (e.g.
+	 * bulk transfer failed, ...) since we know those writes have not been
+	 * processed. For other errors hit during commit, we cannot really tell
+	 * whether or not something was written, so we update statfs data.
+	 * In any case, this should not be fatal since we always get fresh
+	 * statfs data before failing a request with ENOSPC */
+	if (rc == 0) {
+		spin_lock(&tgd->tgd_osfs_lock);
+		/* Take pending out of cached statfs data */
+		tgd->tgd_osfs.os_bavail -= min_t(u64,
+						 tgd->tgd_osfs.os_bavail,
+						 pending >> tgd->tgd_blockbits);
+		if (tgd->tgd_statfs_inflight)
+			/* someone is running statfs and wants to be notified
+			 * of writes happening meanwhile */
+			tgd->tgd_osfs_inflight += pending;
+		spin_unlock(&tgd->tgd_osfs_lock);
+	}
+
+	if (exp->exp_target_data.ted_pending < pending) {
+		CERROR("%s: cli %s/%p ted_pending(%lu) < grant_used(%lu)\n",
+		       exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp,
+		       exp->exp_target_data.ted_pending, pending);
+		spin_unlock(&tgd->tgd_grant_lock);
+		LBUG();
+	}
+	exp->exp_target_data.ted_pending -= pending;
+
+	if (tgd->tgd_tot_granted < pending) {
+		CERROR("%s: cli %s/%p tot_granted(%llu) < grant_used(%lu)\n",
+		       exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp,
+		       tgd->tgd_tot_granted, pending);
+		spin_unlock(&tgd->tgd_grant_lock);
+		LBUG();
+	}
+	tgd->tgd_tot_granted -= pending;
+
+	if (tgd->tgd_tot_pending < pending) {
+		CERROR("%s: cli %s/%p tot_pending(%llu) < grant_used(%lu)\n",
+		       exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp,
+		       tgd->tgd_tot_pending, pending);
+		spin_unlock(&tgd->tgd_grant_lock);
+		LBUG();
+	}
+	tgd->tgd_tot_pending -= pending;
+	spin_unlock(&tgd->tgd_grant_lock);
+	EXIT;
+}
+EXPORT_SYMBOL(tgt_grant_commit);
+
+struct tgt_grant_cb {
+	/* commit callback structure */
+	struct dt_txn_commit_cb tgc_cb;
+	/* export associated with the bulk write */
+	struct obd_export *tgc_exp;
+	/* pending grant to be released */
+	unsigned long tgc_granted;
+};
+
+/**
+ * Callback function for grant releasing
+ *
+ * Release grant space reserved by the client node.
+ *
+ * \param[in] env execution environment
+ * \param[in] th transaction handle
+ * \param[in] cb callback data
+ * \param[in] err error code
+ */
+static void tgt_grant_commit_cb(struct lu_env *env, struct thandle *th,
+				struct dt_txn_commit_cb *cb, int err)
+{
+	struct tgt_grant_cb *tgc;
+
+	tgc = container_of(cb, struct tgt_grant_cb, tgc_cb);
+
+	tgt_grant_commit(tgc->tgc_exp, tgc->tgc_granted, err);
+	class_export_cb_put(tgc->tgc_exp);
+	OBD_FREE_PTR(tgc);
+}
+
+/**
+ * Add callback for grant releasing
+ *
+ * Register a commit callback to release grant space.
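The callback registered here follows the classic embedded-header pattern: the payload wraps a small callback structure and recovers itself with container_of() when the transaction engine fires the hook. A userspace rendition of the same idea, with made-up names:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct txn_cb {
	void (*fn)(struct txn_cb *cb, int err);
};

struct grant_cb {
	struct txn_cb cb;	/* embedded header, like tgc_cb above */
	unsigned long granted;	/* payload released on commit */
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static void grant_commit_cb(struct txn_cb *cb, int err)
{
	struct grant_cb *gcb = container_of(cb, struct grant_cb, cb);

	printf("commit rc=%d, releasing %lu bytes of pending grant\n",
	       err, gcb->granted);
	free(gcb);		/* the callback owns its own allocation */
}

int main(void)
{
	struct grant_cb *gcb = malloc(sizeof(*gcb));

	gcb->cb.fn = grant_commit_cb;
	gcb->granted = 65536;
	/* a transaction engine would queue gcb->cb and invoke it on
	 * commit; we call it directly for illustration */
	gcb->cb.fn(&gcb->cb, 0);
	return 0;
}

The design keeps the generic transaction layer ignorant of grant accounting: it only sees a dt_txn_commit_cb, while the grant-specific state travels alongside it in the containing structure.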
+ *
+ * \param[in] th transaction handle
+ * \param[in] exp OBD export of client
+ * \param[in] granted amount of grant space to be released upon commit
+ *
+ * \retval 0 on successful callback adding
+ * \retval negative value on error
+ */
+int tgt_grant_commit_cb_add(struct thandle *th, struct obd_export *exp,
+			    unsigned long granted)
+{
+	struct tgt_grant_cb *tgc;
+	struct dt_txn_commit_cb *dcb;
+	int rc;
+	ENTRY;
+
+	OBD_ALLOC_PTR(tgc);
+	if (tgc == NULL)
+		RETURN(-ENOMEM);
+
+	tgc->tgc_exp = class_export_cb_get(exp);
+	tgc->tgc_granted = granted;
+
+	dcb = &tgc->tgc_cb;
+	dcb->dcb_func = tgt_grant_commit_cb;
+	INIT_LIST_HEAD(&dcb->dcb_linkage);
+	strlcpy(dcb->dcb_name, "tgt_grant_commit_cb", sizeof(dcb->dcb_name));
+
+	rc = dt_trans_cb_add(th, dcb);
+	if (rc) {
+		class_export_cb_put(tgc->tgc_exp);
+		OBD_FREE_PTR(tgc);
+	}
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(tgt_grant_commit_cb_add);
+
+/**
+ * Show estimate of total amount of dirty data on clients.
+ *
+ * @kobj kobject embedded in obd_device
+ * @attr unused
+ * @buf buf used by sysfs to print out data
+ *
+ * Return: 0 on success
+ * negative value on error
+ */
+ssize_t tot_dirty_show(struct kobject *kobj, struct attribute *attr,
+		       char *buf)
+{
+	struct obd_device *obd = container_of(kobj, struct obd_device,
+					      obd_kset.kobj);
+	struct tg_grants_data *tgd;
+
+	tgd = &obd->u.obt.obt_lut->lut_tgd;
+	return scnprintf(buf, PAGE_SIZE, "%llu\n", tgd->tgd_tot_dirty);
+}
+EXPORT_SYMBOL(tot_dirty_show);
+
+/**
+ * Show total amount of space granted to clients.
+ *
+ * @kobj kobject embedded in obd_device
+ * @attr unused
+ * @buf buf used by sysfs to print out data
+ *
+ * Return: 0 on success
+ * negative value on error
+ */
+ssize_t tot_granted_show(struct kobject *kobj, struct attribute *attr,
+			 char *buf)
+{
+	struct obd_device *obd = container_of(kobj, struct obd_device,
+					      obd_kset.kobj);
+	struct tg_grants_data *tgd;
+
+	tgd = &obd->u.obt.obt_lut->lut_tgd;
+	return scnprintf(buf, PAGE_SIZE, "%llu\n", tgd->tgd_tot_granted);
+}
+EXPORT_SYMBOL(tot_granted_show);
+
+/**
+ * Show total amount of space used by IO in progress.
+ *
+ * @kobj kobject embedded in obd_device
+ * @attr unused
+ * @buf buf used by sysfs to print out data
+ *
+ * Return: 0 on success
+ * negative value on error
+ */
+ssize_t tot_pending_show(struct kobject *kobj, struct attribute *attr,
+			 char *buf)
+{
+	struct obd_device *obd = container_of(kobj, struct obd_device,
+					      obd_kset.kobj);
+	struct tg_grants_data *tgd;
+
+	tgd = &obd->u.obt.obt_lut->lut_tgd;
+	return scnprintf(buf, PAGE_SIZE, "%llu\n", tgd->tgd_tot_pending);
+}
+EXPORT_SYMBOL(tot_pending_show);
+
+/**
+ * Show whether grant compatibility mode is disabled.
+ *
+ * When tgd_grant_compat_disable is set, we don't grant any space to clients
+ * not supporting OBD_CONNECT_GRANT_PARAM. Otherwise, space granted to such
+ * a client is inflated since it consumes PAGE_SIZE of grant space per
+ * block (i.e. typically 4KB units), but the underlying file system might have
+ * a block size bigger than the page size, e.g. ZFS. See LU-2049 for details.
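The inflation just mentioned compensates for the unit mismatch between clients, which account grant in 4KB pages, and a backend with larger blocks. The actual tgt_grant_inflate()/tgt_grant_deflate() helpers live earlier in this file and are not shown here; conceptually they scale by the block-to-page ratio, roughly as sketched below:

#include <stdint.h>

#define COMPAT_BSIZE_SHIFT 12	/* clients account grant in 4KB units */

/* conceptual sketch only: scale a grant value between client units
 * (4KB pages) and server units (filesystem blocks). For a ZFS-style
 * 128KB block size (blockbits = 17) one dirty page can pin a whole
 * block, so the server must set aside 32x the space. */
static uint64_t inflate(uint64_t val, unsigned blockbits)
{
	if (blockbits <= COMPAT_BSIZE_SHIFT)
		return val;
	return val << (blockbits - COMPAT_BSIZE_SHIFT);
}

static uint64_t deflate(uint64_t val, unsigned blockbits)
{
	if (blockbits <= COMPAT_BSIZE_SHIFT)
		return val;
	return val >> (blockbits - COMPAT_BSIZE_SHIFT);
}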
+ *
+ * @kobj kobject embedded in obd_device
+ * @attr unused
+ * @buf buf used by sysfs to print out data
+ *
+ * Return: string length of @buf output on success
+ */
+ssize_t grant_compat_disable_show(struct kobject *kobj, struct attribute *attr,
+				  char *buf)
+{
+	struct obd_device *obd = container_of(kobj, struct obd_device,
+					      obd_kset.kobj);
+	struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd;
+
+	return scnprintf(buf, PAGE_SIZE, "%u\n", tgd->tgd_grant_compat_disable);
+}
+EXPORT_SYMBOL(grant_compat_disable_show);
+
+/**
+ * Change grant compatibility mode.
+ *
+ * Setting tgd_grant_compat_disable prohibits any space granting to clients
+ * not supporting OBD_CONNECT_GRANT_PARAM. See details above.
+ *
+ * @kobj kobject embedded in obd_device
+ * @attr unused
+ * @buffer string which represents mode
+ * 1: disable compatibility mode
+ * 0: enable compatibility mode
+ * @count @buffer length
+ *
+ * Return: @count on success
+ * negative number on error
+ */
+ssize_t grant_compat_disable_store(struct kobject *kobj,
+				   struct attribute *attr,
+				   const char *buffer, size_t count)
+{
+	struct obd_device *obd = container_of(kobj, struct obd_device,
+					      obd_kset.kobj);
+	struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd;
+	bool val;
+	int rc;
+
+	rc = kstrtobool(buffer, &val);
+	if (rc)
+		return rc;
+
+	tgd->tgd_grant_compat_disable = val;
+
+	return count;
+}
+EXPORT_SYMBOL(grant_compat_disable_store);
diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_handler.c b/drivers/staging/lustrefsx/lustre/target/tgt_handler.c
new file mode 100644
index 0000000000000..6be647b20cfa4
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/target/tgt_handler.c
@@ -0,0 +1,3028 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2013, 2017, Intel Corporation.
+ */
+/*
+ * lustre/target/tgt_handler.c
+ *
+ * Lustre Unified Target request handler code
+ *
+ * Author: Brian Behlendorf
+ * Author: Mikhail Pershin
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "tgt_internal.h"
+
+char *tgt_name(struct lu_target *tgt)
+{
+	LASSERT(tgt->lut_obd != NULL);
+	return tgt->lut_obd->obd_name;
+}
+EXPORT_SYMBOL(tgt_name);
+
+/*
+ * Generic code handling requests that have struct mdt_body passed in:
+ *
+ * - extract mdt_body from request and save it in @tsi, if present;
+ *
+ * - create lu_object, corresponding to the fid in mdt_body, and save it in
+ * @tsi;
+ *
+ * - if HAS_BODY flag is set for this request type, check whether object
+ * actually exists on storage (lu_object_exists()).
+ *
+ */
+static int tgt_mdt_body_unpack(struct tgt_session_info *tsi, __u32 flags)
+{
+	const struct mdt_body *body;
+	struct lu_object *obj;
+	struct req_capsule *pill = tsi->tsi_pill;
+	int rc;
+
+	ENTRY;
+
+	body = req_capsule_client_get(pill, &RMF_MDT_BODY);
+	if (body == NULL)
+		RETURN(-EFAULT);
+
+	tsi->tsi_mdt_body = body;
+
+	if (!(body->mbo_valid & OBD_MD_FLID))
+		RETURN(0);
+
+	/* mdc_pack_body() doesn't check if fid is zero and set OBD_MD_FLID
+	 * in any case in pre-2.5 clients. Fix that here if needed */
+	if (unlikely(fid_is_zero(&body->mbo_fid1)))
+		RETURN(0);
+
+	if (!fid_is_sane(&body->mbo_fid1)) {
+		CERROR("%s: invalid FID: "DFID"\n", tgt_name(tsi->tsi_tgt),
+		       PFID(&body->mbo_fid1));
+		RETURN(-EINVAL);
+	}
+
+	obj = lu_object_find(tsi->tsi_env,
+			     &tsi->tsi_tgt->lut_bottom->dd_lu_dev,
+			     &body->mbo_fid1, NULL);
+	if (!IS_ERR(obj)) {
+		if ((flags & HAS_BODY) && !lu_object_exists(obj)) {
+			lu_object_put(tsi->tsi_env, obj);
+			rc = -ENOENT;
+		} else {
+			tsi->tsi_corpus = obj;
+			rc = 0;
+		}
+	} else {
+		rc = PTR_ERR(obj);
+	}
+
+	tsi->tsi_fid = body->mbo_fid1;
+
+	RETURN(rc);
+}
+
+/**
+ * Validate oa from client.
+ * If the request comes from 2.0 clients, currently only RSVD seq and IDIF
+ * req are valid.
+ * a. objects in Single MDT FS seq = FID_SEQ_OST_MDT0, oi_id != 0
+ * b. Echo objects(seq = 2), old echo client still uses oi_id/oi_seq to
+ * pack ost_id. Because non-zero oi_seq will make it difficult to tell
+ * whether this is oi_fid or real ostid. So it will check
+ * OBD_CONNECT_FID, then convert the ostid to FID for old clients.
+ * c. Old FID-disabled osc will send IDIF.
+ * d. New FID-enabled osc/osp will send normal FID.
+ *
+ * And also oi_id/f_oid should always start from 1. oi_id/f_oid = 0 will
+ * be used for LAST_ID file, and only being accessed inside OST now.
+ */
+int tgt_validate_obdo(struct tgt_session_info *tsi, struct obdo *oa)
+{
+	struct ost_id *oi = &oa->o_oi;
+	u64 seq = ostid_seq(oi);
+	u64 id = ostid_id(oi);
+	int rc;
+	ENTRY;
+
+	if (unlikely(!(exp_connect_flags(tsi->tsi_exp) & OBD_CONNECT_FID) &&
+		     fid_seq_is_echo(seq))) {
+		/* Sigh 2.[123] client still sends echo req with oi_id = 0
+		 * during create, and we will reset this to 1, since this
+		 * oi_id is basically useless in the following create process,
+		 * but oi_id == 0 will make it difficult to tell whether it is
+		 * real FID or ost_id. */
+		oi->oi_fid.f_seq = FID_SEQ_ECHO;
+		oi->oi_fid.f_oid = id ?: 1;
+		oi->oi_fid.f_ver = 0;
+	} else {
+		struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env);
+
+		if (unlikely((oa->o_valid & OBD_MD_FLID) && id == 0))
+			GOTO(out, rc = -EPROTO);
+
+		/* Note: this check might be forced in 2.5 or 2.6, i.e.
+		 * all of the requests are required to set up FLGROUP */
+		if (unlikely(!(oa->o_valid & OBD_MD_FLGROUP))) {
+			ostid_set_seq_mdt0(oi);
+			oa->o_valid |= OBD_MD_FLGROUP;
+			seq = ostid_seq(oi);
+		}
+
+		if (unlikely(!(fid_seq_is_idif(seq) || fid_seq_is_mdt0(seq) ||
+			       fid_seq_is_norm(seq) || fid_seq_is_echo(seq))))
+			GOTO(out, rc = -EPROTO);
+
+		rc = ostid_to_fid(&tti->tti_fid1, oi,
+				  tsi->tsi_tgt->lut_lsd.lsd_osd_index);
+		if (unlikely(rc != 0))
+			GOTO(out, rc);
+
+		oi->oi_fid = tti->tti_fid1;
+	}
+
+	RETURN(0);
+
+out:
+	CERROR("%s: client %s sent bad object "DOSTID": rc = %d\n",
+	       tgt_name(tsi->tsi_tgt), obd_export_nid2str(tsi->tsi_exp),
+	       seq, id, rc);
+	return rc;
+}
+EXPORT_SYMBOL(tgt_validate_obdo);
+
+static int tgt_io_data_unpack(struct tgt_session_info *tsi, struct ost_id *oi)
+{
+	unsigned max_brw;
+	struct niobuf_remote *rnb;
+	struct obd_ioobj *ioo;
+	int obj_count;
+
+	ENTRY;
+
+	ioo = req_capsule_client_get(tsi->tsi_pill, &RMF_OBD_IOOBJ);
+	if (ioo == NULL)
+		RETURN(-EPROTO);
+
+	rnb = req_capsule_client_get(tsi->tsi_pill, &RMF_NIOBUF_REMOTE);
+	if (rnb == NULL)
+		RETURN(-EPROTO);
+
+	max_brw = ioobj_max_brw_get(ioo);
+	if (unlikely((max_brw & (max_brw - 1)) != 0)) {
+		CERROR("%s: client %s sent bad ioobj max %u for "DOSTID
+		       ": rc = %d\n", tgt_name(tsi->tsi_tgt),
+		       obd_export_nid2str(tsi->tsi_exp), max_brw,
+		       POSTID(oi), -EPROTO);
+		RETURN(-EPROTO);
+	}
+	ioo->ioo_oid = *oi;
+
+	obj_count = req_capsule_get_size(tsi->tsi_pill, &RMF_OBD_IOOBJ,
+					 RCL_CLIENT) / sizeof(*ioo);
+	if (obj_count == 0) {
+		CERROR("%s: short ioobj\n", tgt_name(tsi->tsi_tgt));
+		RETURN(-EPROTO);
+	} else if (obj_count > 1) {
+		CERROR("%s: too many ioobjs (%d)\n", tgt_name(tsi->tsi_tgt),
+		       obj_count);
+		RETURN(-EPROTO);
+	}
+
+	if (ioo->ioo_bufcnt == 0) {
+		CERROR("%s: ioo has zero bufcnt\n", tgt_name(tsi->tsi_tgt));
+		RETURN(-EPROTO);
+	}
+
+	if (ioo->ioo_bufcnt > PTLRPC_MAX_BRW_PAGES) {
+		DEBUG_REQ(D_RPCTRACE, tgt_ses_req(tsi),
+			  "bulk has too many pages (%d)",
+			  ioo->ioo_bufcnt);
+		RETURN(-EPROTO);
+	}
+
+	RETURN(0);
+}
+
+static int tgt_ost_body_unpack(struct tgt_session_info *tsi, __u32 flags)
+{
+	struct ost_body *body;
+	struct req_capsule *pill = tsi->tsi_pill;
+	struct lu_nodemap *nodemap;
+	int rc;
+
+	ENTRY;
+
+	body = req_capsule_client_get(pill, &RMF_OST_BODY);
+	if (body == NULL)
+		RETURN(-EFAULT);
+
+	rc = tgt_validate_obdo(tsi, &body->oa);
+	if (rc)
+		RETURN(rc);
+
+	nodemap = nodemap_get_from_exp(tsi->tsi_exp);
+	if (IS_ERR(nodemap))
+		RETURN(PTR_ERR(nodemap));
+
+	body->oa.o_uid = nodemap_map_id(nodemap, NODEMAP_UID,
+					NODEMAP_CLIENT_TO_FS,
+					body->oa.o_uid);
+	body->oa.o_gid = nodemap_map_id(nodemap, NODEMAP_GID,
+					NODEMAP_CLIENT_TO_FS,
+					body->oa.o_gid);
+	body->oa.o_projid = nodemap_map_id(nodemap, NODEMAP_PROJID,
+					   NODEMAP_CLIENT_TO_FS,
+					   body->oa.o_projid);
+	nodemap_putref(nodemap);
+
+	tsi->tsi_ost_body = body;
+	tsi->tsi_fid = body->oa.o_oi.oi_fid;
+
+	if (req_capsule_has_field(pill, &RMF_OBD_IOOBJ, RCL_CLIENT)) {
+		rc = tgt_io_data_unpack(tsi, &body->oa.o_oi);
+		if (rc < 0)
+			RETURN(rc);
+	}
+
+	if (!(body->oa.o_valid & OBD_MD_FLID)) {
+		if (flags & HAS_BODY) {
+			CERROR("%s: OBD_MD_FLID flag is not set in ost_body but OID/FID is mandatory with HAS_BODY\n",
+			       tgt_name(tsi->tsi_tgt));
+			RETURN(-EPROTO);
+		} else {
+			RETURN(0);
+		}
+	}
+
+	ost_fid_build_resid(&tsi->tsi_fid, &tsi->tsi_resid);
+
+	/*
+	 * OST doesn't get object in advance for further use to prevent
+	 * situations with nested object_find which is a potential deadlock.
+	 */
+	tsi->tsi_corpus = NULL;
+	RETURN(rc);
+}
+
+/*
+ * Do necessary preprocessing according to handler ->th_flags.
+ */
+static int tgt_request_preprocess(struct tgt_session_info *tsi,
+				  struct tgt_handler *h,
+				  struct ptlrpc_request *req)
+{
+	struct req_capsule *pill = tsi->tsi_pill;
+	__u32 flags = h->th_flags;
+	int rc = 0;
+
+	ENTRY;
+
+	if (tsi->tsi_preprocessed)
+		RETURN(0);
+
+	LASSERT(h->th_act != NULL);
+	LASSERT(h->th_opc == lustre_msg_get_opc(req->rq_reqmsg));
+	LASSERT(current->journal_info == NULL);
+
+	LASSERT(ergo(flags & (HAS_BODY | HAS_REPLY),
+		     h->th_fmt != NULL));
+	if (h->th_fmt != NULL) {
+		req_capsule_set(pill, h->th_fmt);
+		if (req_capsule_has_field(pill, &RMF_MDT_BODY, RCL_CLIENT) &&
+		    req_capsule_field_present(pill, &RMF_MDT_BODY,
+					      RCL_CLIENT)) {
+			rc = tgt_mdt_body_unpack(tsi, flags);
+			if (rc < 0)
+				RETURN(rc);
+		} else if (req_capsule_has_field(pill, &RMF_OST_BODY,
+						 RCL_CLIENT) &&
+			   req_capsule_field_present(pill, &RMF_OST_BODY,
+						     RCL_CLIENT)) {
+			rc = tgt_ost_body_unpack(tsi, flags);
+			if (rc < 0)
+				RETURN(rc);
+		}
+	}
+
+	if (flags & IS_MUTABLE && tgt_conn_flags(tsi) & OBD_CONNECT_RDONLY)
+		RETURN(-EROFS);
+
+	if (flags & HAS_KEY) {
+		struct ldlm_request *dlm_req;
+
+		LASSERT(h->th_fmt != NULL);
+
+		dlm_req = req_capsule_client_get(pill, &RMF_DLM_REQ);
+		if (dlm_req != NULL) {
+			union ldlm_wire_policy_data *policy =
+				&dlm_req->lock_desc.l_policy_data;
+
+			if (unlikely(dlm_req->lock_desc.l_resource.lr_type ==
+				     LDLM_IBITS &&
+				     (policy->l_inodebits.bits |
+				      policy->l_inodebits.try_bits) == 0)) {
+				/*
+				 * Lock without inodebits makes no sense and
+				 * will oops later in ldlm. If the client fails
+				 * to set such bits, do not trigger an
+				 * ASSERTION.
+				 *
+				 * For the liblustre flock case, it may be zero.
+				 */
+				rc = -EPROTO;
+			} else {
+				tsi->tsi_dlm_req = dlm_req;
+			}
+		} else {
+			rc = -EFAULT;
+		}
+	}
+	tsi->tsi_preprocessed = 1;
+	RETURN(rc);
+}
+
+/*
+ * Invoke handler for this request opc. Also do necessary preprocessing
+ * (according to handler ->th_flags), and post-processing (setting of
+ * ->last_{xid,committed}).
+ */
+static int tgt_handle_request0(struct tgt_session_info *tsi,
+			       struct tgt_handler *h,
+			       struct ptlrpc_request *req)
+{
+	int serious = 0;
+	int rc;
+	__u32 opc = lustre_msg_get_opc(req->rq_reqmsg);
+
+	ENTRY;
+
+	/* When dealing with sec context requests, no export is associated yet,
+	 * because these requests are sent before *_CONNECT requests.
+	 * A NULL req->rq_export means the normal *_common_slice handlers will
+	 * not be called, because there is no reference to the target.
+	 * So deal with them by hand and jump directly to target_send_reply().
+	 */
+	switch (opc) {
+	case SEC_CTX_INIT:
+	case SEC_CTX_INIT_CONT:
+	case SEC_CTX_FINI:
+		CFS_FAIL_TIMEOUT(OBD_FAIL_SEC_CTX_HDL_PAUSE, cfs_fail_val);
+		GOTO(out, rc = 0);
+	}
+
+	/*
+	 * Checking for various OBD_FAIL_$PREF_$OPC_NET codes. _Do_ not try
+	 * to put the same checks into handlers like mdt_close(), mdt_reint(),
+	 * etc., without talking to mdt authors first. Checking the same thing
+	 * there again is useless and returning 0 error without packing reply
+	 * is buggy! Handlers either pack reply or return error.
+	 *
+	 * We return 0 here and do not send any reply in order to emulate
+	 * network failure. Do not send any reply if any NET-related
+	 * fail_id has occurred.
+ */ + if (OBD_FAIL_CHECK_ORSET(h->th_fail_id, OBD_FAIL_ONCE)) + RETURN(0); + if (unlikely(lustre_msg_get_opc(req->rq_reqmsg) == MDS_REINT && + OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_MULTI_NET))) + RETURN(0); + + /* drop OUT_UPDATE rpc */ + if (unlikely(lustre_msg_get_opc(req->rq_reqmsg) == OUT_UPDATE && + OBD_FAIL_CHECK(OBD_FAIL_OUT_UPDATE_DROP))) + RETURN(0); + + rc = tgt_request_preprocess(tsi, h, req); + /* pack reply if reply format is fixed */ + if (rc == 0 && h->th_flags & HAS_REPLY) { + /* Pack reply */ + if (req_capsule_has_field(tsi->tsi_pill, &RMF_MDT_MD, + RCL_SERVER)) + req_capsule_set_size(tsi->tsi_pill, &RMF_MDT_MD, + RCL_SERVER, + tsi->tsi_mdt_body->mbo_eadatasize); + if (req_capsule_has_field(tsi->tsi_pill, &RMF_LOGCOOKIES, + RCL_SERVER)) + req_capsule_set_size(tsi->tsi_pill, &RMF_LOGCOOKIES, + RCL_SERVER, 0); + if (req_capsule_has_field(tsi->tsi_pill, &RMF_ACL, RCL_SERVER)) + req_capsule_set_size(tsi->tsi_pill, + &RMF_ACL, RCL_SERVER, + LUSTRE_POSIX_ACL_MAX_SIZE_OLD); + + if (req_capsule_has_field(tsi->tsi_pill, &RMF_SHORT_IO, + RCL_SERVER)) { + struct niobuf_remote *remote_nb = + req_capsule_client_get(tsi->tsi_pill, + &RMF_NIOBUF_REMOTE); + struct ost_body *body = tsi->tsi_ost_body; + + req_capsule_set_size(tsi->tsi_pill, &RMF_SHORT_IO, + RCL_SERVER, + (body->oa.o_valid & OBD_MD_FLFLAGS && + body->oa.o_flags & OBD_FL_SHORT_IO) ? + remote_nb[0].rnb_len : 0); + } + if (req_capsule_has_field(tsi->tsi_pill, &RMF_FILE_ENCCTX, + RCL_SERVER)) + req_capsule_set_size(tsi->tsi_pill, &RMF_FILE_ENCCTX, + RCL_SERVER, 0); + + rc = req_capsule_server_pack(tsi->tsi_pill); + } + + if (likely(rc == 0)) { + /* + * Process request, there can be two types of rc: + * 1) errors with msg unpack/pack, other failures outside the + * operation itself. This is counted as serious errors; + * 2) errors during fs operation, should be placed in rq_status + * only + */ + rc = h->th_act(tsi); + if (!is_serious(rc) && + !req->rq_no_reply && req->rq_reply_state == NULL) { + DEBUG_REQ(D_ERROR, req, + "%s: %s handler did not pack reply but returned no error", + tgt_name(tsi->tsi_tgt), h->th_name); + LBUG(); + } + serious = is_serious(rc); + rc = clear_serious(rc); + } else { + serious = 1; + } + + req->rq_status = rc; + + /* + * ELDLM_* codes which > 0 should be in rq_status only as well as + * all non-serious errors. + */ + if (rc > 0 || !serious) + rc = 0; + + LASSERT(current->journal_info == NULL); + + if (likely(rc == 0 && req->rq_export)) + target_committed_to_req(req); + +out: + target_send_reply(req, rc, tsi->tsi_reply_fail_id); + RETURN(0); +} + +static int tgt_filter_recovery_request(struct ptlrpc_request *req, + struct obd_device *obd, int *process) +{ + switch (lustre_msg_get_opc(req->rq_reqmsg)) { + case MDS_DISCONNECT: + case OST_DISCONNECT: + case OBD_IDX_READ: + *process = 1; + RETURN(0); + case MDS_CLOSE: + case MDS_SYNC: /* used in unmounting */ + case OBD_PING: + case MDS_REINT: + case OUT_UPDATE: + case SEQ_QUERY: + case FLD_QUERY: + case FLD_READ: + case LDLM_ENQUEUE: + case OST_CREATE: + case OST_DESTROY: + case OST_PUNCH: + case OST_SETATTR: + case OST_SYNC: + case OST_WRITE: + case MDS_HSM_PROGRESS: + case MDS_HSM_STATE_SET: + case MDS_HSM_REQUEST: + case OST_FALLOCATE: + *process = target_queue_recovery_request(req, obd); + RETURN(0); + + default: + DEBUG_REQ(D_ERROR, req, "not permitted during recovery"); + *process = -EAGAIN; + RETURN(0); + } +} + +/* + * Handle recovery. 
Return: + * +1: continue request processing; + * -ve: abort immediately with the given error code; + * 0: send reply with error code in req->rq_status; + */ +static int tgt_handle_recovery(struct ptlrpc_request *req, int reply_fail_id) +{ + ENTRY; + + switch (lustre_msg_get_opc(req->rq_reqmsg)) { + case MDS_CONNECT: + case OST_CONNECT: + case MGS_CONNECT: + case SEC_CTX_INIT: + case SEC_CTX_INIT_CONT: + case SEC_CTX_FINI: + RETURN(+1); + } + + if (!req->rq_export->exp_obd->obd_replayable) + RETURN(+1); + + /* sanity check: if the xid matches, the request must be marked as a + * resent or replayed */ + if (req_can_reconstruct(req, NULL) == 1) { + if (!(lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_RESENT | MSG_REPLAY))) { + DEBUG_REQ(D_WARNING, req, + "rq_xid=%llu matches saved XID, expected REPLAY or RESENT flag (%x)", + req->rq_xid, + lustre_msg_get_flags(req->rq_reqmsg)); + req->rq_status = -ENOTCONN; + RETURN(-ENOTCONN); + } + } + /* else: note the opposite is not always true; a RESENT req after a + * failover will usually not match the last_xid, since it was likely + * never committed. A REPLAYed request will almost never match the + * last xid, however it could for a committed, but still retained, + * open. */ + + /* Check for aborted recovery... */ + if (unlikely(req->rq_export->exp_obd->obd_recovering)) { + int rc; + int should_process; + + DEBUG_REQ(D_INFO, req, "Got new replay"); + rc = tgt_filter_recovery_request(req, req->rq_export->exp_obd, + &should_process); + if (rc != 0 || !should_process) + RETURN(rc); + else if (should_process < 0) { + req->rq_status = should_process; + rc = ptlrpc_error(req); + RETURN(rc); + } + } + RETURN(+1); +} + +/* Initial check for request, it is validation mostly */ +static struct tgt_handler *tgt_handler_find_check(struct ptlrpc_request *req) +{ + struct tgt_handler *h; + struct tgt_opc_slice *s; + struct lu_target *tgt; + __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + + ENTRY; + + tgt = class_exp2tgt(req->rq_export); + if (unlikely(tgt == NULL)) { + DEBUG_REQ(D_ERROR, req, "%s: no target for connected export", + class_exp2obd(req->rq_export)->obd_name); + RETURN(ERR_PTR(-EINVAL)); + } + + for (s = tgt->lut_slice; s->tos_hs != NULL; s++) + if (s->tos_opc_start <= opc && opc < s->tos_opc_end) + break; + + /* opcode was not found in slice */ + if (unlikely(s->tos_hs == NULL)) { + static bool printed; + + /* don't spew error messages for unhandled RPCs */ + if (!printed) { + CERROR("%s: no handler for opcode 0x%x from %s\n", + tgt_name(tgt), opc, libcfs_id2str(req->rq_peer)); + printed = true; + } + RETURN(ERR_PTR(-ENOTSUPP)); + } + + LASSERT(opc >= s->tos_opc_start && opc < s->tos_opc_end); + h = s->tos_hs + (opc - s->tos_opc_start); + if (unlikely(h->th_opc == 0)) { + CERROR("%s: unsupported opcode 0x%x\n", tgt_name(tgt), opc); + RETURN(ERR_PTR(-ENOTSUPP)); + } + + RETURN(h); +} + +static int process_req_last_xid(struct ptlrpc_request *req) +{ + __u64 last_xid; + int rc = 0; + struct obd_export *exp = req->rq_export; + struct tg_export_data *ted = &exp->exp_target_data; + bool need_lock = tgt_is_multimodrpcs_client(exp); + ENTRY; + + if (need_lock) + mutex_lock(&ted->ted_lcd_lock); + /* check request's xid is consistent with export's last_xid */ + last_xid = lustre_msg_get_last_xid(req->rq_reqmsg); + if (last_xid > exp->exp_last_xid) + exp->exp_last_xid = last_xid; + + if (req->rq_xid == 0 || req->rq_xid <= exp->exp_last_xid) { + /* Some request is allowed to be sent during replay, + * such as OUT update requests, FLD requests, so it + * is 
possible that replay requests have a smaller XID
+		 * than the exp_last_xid.
+		 *
+		 * Some non-replay requests may have a smaller XID as
+		 * well:
+		 *
+		 * - The client sends a no_resend RPC, like statfs;
+		 * - The RPC times out (or hits some other error) on the
+		 *   client, then it's removed from the unreplied list;
+		 * - The client sends some other request to bump the
+		 *   exp_last_xid on the server;
+		 * - The former RPC finally gets a chance to be processed;
+		 */
+		if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY))
+			rc = -EPROTO;
+
+		DEBUG_REQ(D_WARNING, req,
+			  "unexpected xid=%llx != exp_last_xid=%llx, rc = %d",
+			  req->rq_xid, exp->exp_last_xid, rc);
+		if (rc)
+			GOTO(out, rc);
+	}
+
+	/* The "last_xid" is the minimum xid among unreplied requests.
+	 * If the request is from the previous connection, its xid can
+	 * still be larger than "exp_last_xid", so the above xid check is
+	 * not enough to determine whether the request is delayed.
+	 *
+	 * For example, if some replay request was delayed and caused a
+	 * timeout at the client and the replay is restarted, the delayed
+	 * replay request will have a larger xid than "exp_last_xid"
+	 */
+	if (req->rq_export->exp_conn_cnt >
+	    lustre_msg_get_conn_cnt(req->rq_reqmsg)) {
+		CDEBUG(D_RPCTRACE,
+		       "Dropping request %llu from an old epoch %u/%u\n",
+		       req->rq_xid,
+		       lustre_msg_get_conn_cnt(req->rq_reqmsg),
+		       req->rq_export->exp_conn_cnt);
+		req->rq_no_reply = 1;
+		GOTO(out, rc = -ESTALE);
+	}
+
+	/* try to release in-memory reply data */
+	if (tgt_is_multimodrpcs_client(exp)) {
+		tgt_handle_received_xid(exp, last_xid);
+		rc = tgt_handle_tag(req);
+	}
+
+out:
+	if (need_lock)
+		mutex_unlock(&ted->ted_lcd_lock);
+
+	RETURN(rc);
+}
+
+int tgt_request_handle(struct ptlrpc_request *req)
+{
+	struct tgt_session_info *tsi = tgt_ses_info(req->rq_svc_thread->t_env);
+
+	struct lustre_msg *msg = req->rq_reqmsg;
+	struct tgt_handler *h;
+	struct lu_target *tgt;
+	int request_fail_id = 0;
+	__u32 opc = lustre_msg_get_opc(msg);
+	struct obd_device *obd;
+	int rc;
+	bool is_connect = false;
+	ENTRY;
+
+	if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_TGT_RECOVERY_REQ_RACE))) {
+		if (cfs_fail_val == 0 &&
+		    lustre_msg_get_opc(msg) != OBD_PING &&
+		    lustre_msg_get_flags(msg) & MSG_REQ_REPLAY_DONE) {
+			cfs_fail_val = 1;
+			cfs_race_state = 0;
+			wait_event_idle(cfs_race_waitq, (cfs_race_state == 1));
+		}
+	}
+
+	req_capsule_init(&req->rq_pill, req, RCL_SERVER);
+	tsi->tsi_pill = &req->rq_pill;
+	tsi->tsi_env = req->rq_svc_thread->t_env;
+
+	/* if the request has an export then get the handler slice from the
+	 * corresponding target, otherwise this should be a connect operation */
+	if (opc == MDS_CONNECT || opc == OST_CONNECT ||
+	    opc == MGS_CONNECT) {
+		is_connect = true;
+		req_capsule_set(&req->rq_pill, &RQF_CONNECT);
+		rc = target_handle_connect(req);
+		if (rc != 0) {
+			rc = ptlrpc_error(req);
+			GOTO(out, rc);
+		}
+		/* recovery-small test 18c asks to drop connect reply */
+		if (unlikely(opc == OST_CONNECT &&
+			     OBD_FAIL_CHECK(OBD_FAIL_OST_CONNECT_NET2)))
+			GOTO(out, rc = 0);
+	}
+
+	if (unlikely(!class_connected_export(req->rq_export))) {
+		if (opc == SEC_CTX_INIT || opc == SEC_CTX_INIT_CONT ||
+		    opc == SEC_CTX_FINI) {
+			/* sec context initialization has to be handled
+			 * by hand in tgt_handle_request0() */
+			tsi->tsi_reply_fail_id = OBD_FAIL_SEC_CTX_INIT_NET;
+			h = NULL;
+			GOTO(handle_recov, rc = 0);
+		}
+		CDEBUG(D_HA, "operation %d on unconnected OST from %s\n",
+		       opc, libcfs_id2str(req->rq_peer));
+		req->rq_status = -ENOTCONN;
+		rc = ptlrpc_error(req);
+		GOTO(out, rc);
+	}
+
+	tsi->tsi_tgt = tgt = 
class_exp2tgt(req->rq_export); + tsi->tsi_exp = req->rq_export; + if (exp_connect_flags(req->rq_export) & OBD_CONNECT_JOBSTATS) + tsi->tsi_jobid = lustre_msg_get_jobid(req->rq_reqmsg); + else + tsi->tsi_jobid = NULL; + + if (tgt == NULL) { + DEBUG_REQ(D_ERROR, req, "%s: No target for connected export", + class_exp2obd(req->rq_export)->obd_name); + req->rq_status = -EINVAL; + rc = ptlrpc_error(req); + GOTO(out, rc); + } + + /* Skip last_xid processing for the recovery thread, otherwise, the + * last_xid on same request could be processed twice: first time when + * processing the incoming request, second time when the request is + * being processed by recovery thread. */ + obd = class_exp2obd(req->rq_export); + if (is_connect) { + /* reset the exp_last_xid on each connection. */ + req->rq_export->exp_last_xid = 0; + } else if (obd->obd_recovery_data.trd_processing_task != + current->pid) { + rc = process_req_last_xid(req); + if (rc) { + req->rq_status = rc; + rc = ptlrpc_error(req); + GOTO(out, rc); + } + } + + request_fail_id = tgt->lut_request_fail_id; + tsi->tsi_reply_fail_id = tgt->lut_reply_fail_id; + + h = tgt_handler_find_check(req); + if (IS_ERR(h)) { + req->rq_status = PTR_ERR(h); + rc = ptlrpc_error(req); + GOTO(out, rc); + } + + LASSERTF(h->th_opc == opc, "opcode mismatch %d != %d\n", + h->th_opc, opc); + + if ((cfs_fail_val == 0 || cfs_fail_val == opc) && + CFS_FAIL_CHECK_ORSET(request_fail_id, CFS_FAIL_ONCE)) + GOTO(out, rc = 0); + + rc = lustre_msg_check_version(msg, h->th_version); + if (unlikely(rc)) { + DEBUG_REQ(D_ERROR, req, + "%s: drop malformed request version=%08x expect=%08x", + tgt_name(tgt), lustre_msg_get_version(msg), + h->th_version); + req->rq_status = -EINVAL; + rc = ptlrpc_error(req); + GOTO(out, rc); + } + +handle_recov: + rc = tgt_handle_recovery(req, tsi->tsi_reply_fail_id); + if (likely(rc == 1)) { + rc = tgt_handle_request0(tsi, h, req); + if (rc) + GOTO(out, rc); + } + EXIT; +out: + req_capsule_fini(tsi->tsi_pill); + if (tsi->tsi_corpus != NULL) { + lu_object_put(tsi->tsi_env, tsi->tsi_corpus); + tsi->tsi_corpus = NULL; + } + return rc; +} +EXPORT_SYMBOL(tgt_request_handle); + +/** Assign high priority operations to the request if needed. */ +int tgt_hpreq_handler(struct ptlrpc_request *req) +{ + struct tgt_session_info *tsi = tgt_ses_info(req->rq_svc_thread->t_env); + struct tgt_handler *h; + int rc; + + ENTRY; + + if (req->rq_export == NULL) + RETURN(0); + + req_capsule_init(&req->rq_pill, req, RCL_SERVER); + tsi->tsi_pill = &req->rq_pill; + tsi->tsi_env = req->rq_svc_thread->t_env; + tsi->tsi_tgt = class_exp2tgt(req->rq_export); + tsi->tsi_exp = req->rq_export; + + h = tgt_handler_find_check(req); + if (IS_ERR(h)) { + rc = PTR_ERR(h); + RETURN(rc); + } + + rc = tgt_request_preprocess(tsi, h, req); + if (unlikely(rc != 0)) + RETURN(rc); + + if (h->th_hp != NULL) + h->th_hp(tsi); + RETURN(0); +} +EXPORT_SYMBOL(tgt_hpreq_handler); + +void tgt_counter_incr(struct obd_export *exp, int opcode) +{ + lprocfs_counter_incr(exp->exp_obd->obd_stats, opcode); + if (exp->exp_nid_stats && exp->exp_nid_stats->nid_stats != NULL) + lprocfs_counter_incr(exp->exp_nid_stats->nid_stats, opcode); +} +EXPORT_SYMBOL(tgt_counter_incr); + +/* + * Unified target generic handlers. 
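+ * These implement the request processing steps shared by all target
+ * types (MDT, OST, MGS): sptlrpc flavor selection and checking,
+ * connect/disconnect processing, OBD_PING handling and the bulk
+ * transfer helpers used by the index read code below.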
+ */
+
+int tgt_connect_check_sptlrpc(struct ptlrpc_request *req, struct obd_export *exp)
+{
+	struct lu_target *tgt = class_exp2tgt(exp);
+	struct sptlrpc_flavor flvr;
+	int rc = 0;
+
+	LASSERT(tgt);
+	LASSERT(tgt->lut_obd);
+	LASSERT(tgt->lut_slice);
+
+	/* always allow ECHO client */
+	if (unlikely(strcmp(exp->exp_obd->obd_type->typ_name,
+			    LUSTRE_ECHO_NAME) == 0)) {
+		exp->exp_flvr.sf_rpc = SPTLRPC_FLVR_ANY;
+		return 0;
+	}
+
+	if (exp->exp_flvr.sf_rpc == SPTLRPC_FLVR_INVALID) {
+		read_lock(&tgt->lut_sptlrpc_lock);
+		sptlrpc_target_choose_flavor(&tgt->lut_sptlrpc_rset,
+					     req->rq_sp_from,
+					     req->rq_peer.nid,
+					     &flvr);
+		read_unlock(&tgt->lut_sptlrpc_lock);
+
+		spin_lock(&exp->exp_lock);
+		exp->exp_sp_peer = req->rq_sp_from;
+		exp->exp_flvr = flvr;
+
+		/* when on mgs, if no restriction is set, or if the client
+		 * NID is on the local node, allow any flavor
+		 */
+		if ((strcmp(exp->exp_obd->obd_type->typ_name,
+			    LUSTRE_MGS_NAME) == 0) &&
+		    (exp->exp_flvr.sf_rpc == SPTLRPC_FLVR_NULL ||
+		     LNetIsPeerLocal(lnet_nid_to_nid4(&exp->exp_connection->c_peer.nid))))
+			exp->exp_flvr.sf_rpc = SPTLRPC_FLVR_ANY;
+
+		if (exp->exp_flvr.sf_rpc != SPTLRPC_FLVR_ANY &&
+		    exp->exp_flvr.sf_rpc != req->rq_flvr.sf_rpc) {
+			CERROR("%s: unauthorized rpc flavor %x from %s, "
+			       "expect %x\n", tgt_name(tgt),
+			       req->rq_flvr.sf_rpc,
+			       libcfs_nid2str(req->rq_peer.nid),
+			       exp->exp_flvr.sf_rpc);
+			rc = -EACCES;
+		}
+		spin_unlock(&exp->exp_lock);
+	} else {
+		if (exp->exp_sp_peer != req->rq_sp_from) {
+			CERROR("%s: RPC source %s doesn't match %s\n",
+			       tgt_name(tgt),
+			       sptlrpc_part2name(req->rq_sp_from),
+			       sptlrpc_part2name(exp->exp_sp_peer));
+			rc = -EACCES;
+		} else {
+			rc = sptlrpc_target_export_check(exp, req);
+		}
+	}
+
+	return rc;
+}
+
+int tgt_adapt_sptlrpc_conf(struct lu_target *tgt)
+{
+	struct sptlrpc_rule_set tmp_rset;
+	int rc;
+
+	if (unlikely(tgt == NULL)) {
+		CERROR("No target passed\n");
+		return -EINVAL;
+	}
+
+	sptlrpc_rule_set_init(&tmp_rset);
+	rc = sptlrpc_conf_target_get_rules(tgt->lut_obd, &tmp_rset);
+	if (rc) {
+		CERROR("%s: failed to get sptlrpc rules: rc = %d\n",
+		       tgt_name(tgt), rc);
+		return rc;
+	}
+
+	sptlrpc_target_update_exp_flavor(tgt->lut_obd, &tmp_rset);
+
+	write_lock(&tgt->lut_sptlrpc_lock);
+	sptlrpc_rule_set_free(&tgt->lut_sptlrpc_rset);
+	tgt->lut_sptlrpc_rset = tmp_rset;
+	write_unlock(&tgt->lut_sptlrpc_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(tgt_adapt_sptlrpc_conf);
+
+int tgt_connect(struct tgt_session_info *tsi)
+{
+	struct ptlrpc_request *req = tgt_ses_req(tsi);
+	struct obd_connect_data *reply;
+	int rc;
+
+	ENTRY;
+
+	/* XXX: better to call this check right after getting a new export but
+	 * before last_rcvd slot allocation to avoid server load upon insecure
+	 * connects. This is to be fixed after unifying all targets.
+	 */
+	rc = tgt_connect_check_sptlrpc(req, tsi->tsi_exp);
+	if (rc)
+		GOTO(out, rc);
+
+	/* To avoid exposing partially initialized connection flags, changes up
+	 * to this point have been staged in reply->ocd_connect_flags. Now that
+	 * connection handling has completed successfully, atomically update
+	 * the connect flags in the shared export data structure. 
LU-1623 */ + reply = req_capsule_server_get(tsi->tsi_pill, &RMF_CONNECT_DATA); + spin_lock(&tsi->tsi_exp->exp_lock); + *exp_connect_flags_ptr(tsi->tsi_exp) = reply->ocd_connect_flags; + if (reply->ocd_connect_flags & OBD_CONNECT_FLAGS2) + *exp_connect_flags2_ptr(tsi->tsi_exp) = + reply->ocd_connect_flags2; + tsi->tsi_exp->exp_connect_data.ocd_brw_size = reply->ocd_brw_size; + spin_unlock(&tsi->tsi_exp->exp_lock); + + if (strcmp(tsi->tsi_exp->exp_obd->obd_type->typ_name, + LUSTRE_MDT_NAME) == 0) { + rc = req_check_sepol(tsi->tsi_pill); + if (rc) + GOTO(out, rc); + + if (reply->ocd_connect_flags & OBD_CONNECT_FLAGS2 && + reply->ocd_connect_flags2 & OBD_CONNECT2_ENCRYPT && + tsi->tsi_pill->rc_req->rq_export) { + bool forbid_encrypt = true; + struct lu_nodemap *nm = + nodemap_get_from_exp(tsi->tsi_pill->rc_req->rq_export); + + if (!nm) { + /* nodemap_get_from_exp returns NULL in case + * nodemap is not active, so we do not forbid + */ + forbid_encrypt = false; + } else if (!IS_ERR(nm)) { + forbid_encrypt = nm->nmf_forbid_encryption; + nodemap_putref(nm); + } + + if (forbid_encrypt) + GOTO(out, rc = -EACCES); + } + } + + RETURN(0); +out: + obd_disconnect(class_export_get(tsi->tsi_exp)); + return rc; +} +EXPORT_SYMBOL(tgt_connect); + +int tgt_disconnect(struct tgt_session_info *tsi) +{ + int rc; + + ENTRY; + + OBD_FAIL_TIMEOUT(OBD_FAIL_OST_DISCONNECT_DELAY, cfs_fail_val); + + rc = target_handle_disconnect(tgt_ses_req(tsi)); + if (rc) + RETURN(err_serious(rc)); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_disconnect); + +/* + * Unified target OBD handlers + */ +int tgt_obd_ping(struct tgt_session_info *tsi) +{ + int rc; + + ENTRY; + + /* The target-specific part of OBD_PING request handling. + * It controls Filter Modification Data (FMD) expiration each time + * PING is received. + * + * Valid only for replayable targets, e.g. 
MDT and OFD + */ + if (tsi->tsi_exp->exp_obd->obd_replayable) + tgt_fmd_expire(tsi->tsi_exp); + + rc = req_capsule_server_pack(tsi->tsi_pill); + if (rc) + RETURN(err_serious(rc)); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_obd_ping); + +int tgt_obd_log_cancel(struct tgt_session_info *tsi) +{ + return err_serious(-EOPNOTSUPP); +} + +int tgt_send_buffer(struct tgt_session_info *tsi, struct lu_rdbuf *rdbuf) +{ + struct ptlrpc_request *req = tgt_ses_req(tsi); + struct obd_export *exp = req->rq_export; + struct ptlrpc_bulk_desc *desc; + int i; + int rc; + int pages = 0; + + ENTRY; + + for (i = 0; i < rdbuf->rb_nbufs; i++) { + unsigned int offset; + + offset = (unsigned long)rdbuf->rb_bufs[i].lb_buf & ~PAGE_MASK; + pages += DIV_ROUND_UP(rdbuf->rb_bufs[i].lb_len + offset, + PAGE_SIZE); + } + + desc = ptlrpc_prep_bulk_exp(req, pages, 1, + PTLRPC_BULK_PUT_SOURCE, + MDS_BULK_PORTAL, + &ptlrpc_bulk_kiov_nopin_ops); + if (desc == NULL) + RETURN(-ENOMEM); + + for (i = 0; i < rdbuf->rb_nbufs; i++) + desc->bd_frag_ops->add_iov_frag(desc, + rdbuf->rb_bufs[i].lb_buf, + rdbuf->rb_bufs[i].lb_len); + + rc = target_bulk_io(exp, desc); + ptlrpc_free_bulk(desc); + RETURN(rc); +} +EXPORT_SYMBOL(tgt_send_buffer); + +int tgt_sendpage(struct tgt_session_info *tsi, struct lu_rdpg *rdpg, int nob) +{ + struct ptlrpc_request *req = tgt_ses_req(tsi); + struct obd_export *exp = req->rq_export; + struct ptlrpc_bulk_desc *desc; + int tmpcount; + int tmpsize; + int i; + int rc; + + ENTRY; + + desc = ptlrpc_prep_bulk_exp(req, rdpg->rp_npages, 1, + PTLRPC_BULK_PUT_SOURCE, + MDS_BULK_PORTAL, + &ptlrpc_bulk_kiov_pin_ops); + if (desc == NULL) + RETURN(-ENOMEM); + + if (!(exp_connect_flags(exp) & OBD_CONNECT_BRW_SIZE)) + /* old client requires reply size in it's PAGE_SIZE, + * which is rdpg->rp_count */ + nob = rdpg->rp_count; + + for (i = 0, tmpcount = nob; i < rdpg->rp_npages && tmpcount > 0; + i++, tmpcount -= tmpsize) { + tmpsize = min_t(int, tmpcount, PAGE_SIZE); + desc->bd_frag_ops->add_kiov_frag(desc, rdpg->rp_pages[i], 0, + tmpsize); + } + + LASSERT(desc->bd_nob == nob); + rc = target_bulk_io(exp, desc); + ptlrpc_free_bulk(desc); + RETURN(rc); +} +EXPORT_SYMBOL(tgt_sendpage); + +/* + * OBD_IDX_READ handler + */ +static int tgt_obd_idx_read(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct lu_rdpg *rdpg = &tti->tti_u.rdpg.tti_rdpg; + struct idx_info *req_ii, *rep_ii; + int rc, i; + + ENTRY; + + memset(rdpg, 0, sizeof(*rdpg)); + req_capsule_set(tsi->tsi_pill, &RQF_OBD_IDX_READ); + + /* extract idx_info buffer from request & reply */ + req_ii = req_capsule_client_get(tsi->tsi_pill, &RMF_IDX_INFO); + if (req_ii == NULL || req_ii->ii_magic != IDX_INFO_MAGIC) + RETURN(err_serious(-EPROTO)); + + rc = req_capsule_server_pack(tsi->tsi_pill); + if (rc) + RETURN(err_serious(rc)); + + rep_ii = req_capsule_server_get(tsi->tsi_pill, &RMF_IDX_INFO); + if (rep_ii == NULL) + RETURN(err_serious(-EFAULT)); + rep_ii->ii_magic = IDX_INFO_MAGIC; + + /* extract hash to start with */ + rdpg->rp_hash = req_ii->ii_hash_start; + + /* extract requested attributes */ + rdpg->rp_attrs = req_ii->ii_attrs; + + /* check that fid packed in request is valid and supported */ + if (!fid_is_sane(&req_ii->ii_fid)) + RETURN(-EINVAL); + rep_ii->ii_fid = req_ii->ii_fid; + + /* copy flags */ + rep_ii->ii_flags = req_ii->ii_flags; + + /* compute number of pages to allocate, ii_count is the number of 4KB + * containers */ + if (req_ii->ii_count <= 0) + GOTO(out, rc = -EFAULT); + rdpg->rp_count = min_t(unsigned int, 
req_ii->ii_count << LU_PAGE_SHIFT, + exp_max_brw_size(tsi->tsi_exp)); + rdpg->rp_npages = (rdpg->rp_count + PAGE_SIZE - 1) >> PAGE_SHIFT; + + /* allocate pages to store the containers */ + OBD_ALLOC_PTR_ARRAY(rdpg->rp_pages, rdpg->rp_npages); + if (rdpg->rp_pages == NULL) + GOTO(out, rc = -ENOMEM); + for (i = 0; i < rdpg->rp_npages; i++) { + rdpg->rp_pages[i] = alloc_page(GFP_NOFS); + if (rdpg->rp_pages[i] == NULL) + GOTO(out, rc = -ENOMEM); + } + + /* populate pages with key/record pairs */ + rc = dt_index_read(tsi->tsi_env, tsi->tsi_tgt->lut_bottom, rep_ii, rdpg); + if (rc < 0) + GOTO(out, rc); + + LASSERTF(rc <= rdpg->rp_count, "dt_index_read() returned more than " + "asked %d > %d\n", rc, rdpg->rp_count); + + /* send pages to client */ + rc = tgt_sendpage(tsi, rdpg, rc); + if (rc) + GOTO(out, rc); + EXIT; +out: + if (rdpg->rp_pages) { + for (i = 0; i < rdpg->rp_npages; i++) + if (rdpg->rp_pages[i]) + __free_page(rdpg->rp_pages[i]); + OBD_FREE_PTR_ARRAY(rdpg->rp_pages, rdpg->rp_npages); + } + return rc; +} + +struct tgt_handler tgt_obd_handlers[] = { +TGT_OBD_HDL (0, OBD_PING, tgt_obd_ping), +TGT_OBD_HDL (0, OBD_IDX_READ, tgt_obd_idx_read) +}; +EXPORT_SYMBOL(tgt_obd_handlers); + +int tgt_sync(const struct lu_env *env, struct lu_target *tgt, + struct dt_object *obj, __u64 start, __u64 end) +{ + int rc = 0; + + ENTRY; + + /* if no objid is specified, it means "sync whole filesystem" */ + if (obj == NULL) { + rc = dt_sync(env, tgt->lut_bottom); + } else if (dt_version_get(env, obj) > + tgt->lut_obd->obd_last_committed) { + rc = dt_object_sync(env, obj, start, end); + } + atomic_inc(&tgt->lut_sync_count); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_sync); +/* + * Unified target DLM handlers. + */ + +/** + * Unified target BAST + * + * Ensure data and metadata are synced to disk when lock is canceled if Sync on + * Cancel (SOC) is enabled. If it's extent lock, normally sync obj is enough, + * but if it's cross-MDT lock, because remote object version is not set, a + * filesystem sync is needed. 
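+ * For an LDLM_EXTENT lock the object is located by FID and only the
+ * locked [start, end] range is synced; for a cross-MDT lock obj stays
+ * NULL, so tgt_sync() falls back to dt_sync() on the whole device.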
+ * + * \param lock server side lock + * \param desc lock desc + * \param data ldlm_cb_set_arg + * \param flag indicates whether this cancelling or blocking callback + * \retval 0 on success + * \retval negative number on error + */ +static int tgt_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag) +{ + struct lu_env env; + struct lu_target *tgt; + struct dt_object *obj = NULL; + struct lu_fid fid; + int rc = 0; + + ENTRY; + + tgt = class_exp2tgt(lock->l_export); + + if (unlikely(tgt == NULL)) { + CDEBUG(D_ERROR, "%s: No target for connected export\n", + class_exp2obd(lock->l_export)->obd_name); + RETURN(-EINVAL); + } + + if (flag == LDLM_CB_CANCELING && + (lock->l_granted_mode & (LCK_EX | LCK_PW | LCK_GROUP)) && + (tgt->lut_sync_lock_cancel == SYNC_LOCK_CANCEL_ALWAYS || + (tgt->lut_sync_lock_cancel == SYNC_LOCK_CANCEL_BLOCKING && + ldlm_is_cbpending(lock))) && + ((exp_connect_flags(lock->l_export) & OBD_CONNECT_MDS_MDS) || + lock->l_resource->lr_type == LDLM_EXTENT)) { + __u64 start = 0; + __u64 end = OBD_OBJECT_EOF; + + rc = lu_env_init(&env, LCT_DT_THREAD); + if (unlikely(rc != 0)) + GOTO(err, rc); + + ost_fid_from_resid(&fid, &lock->l_resource->lr_name, + tgt->lut_lsd.lsd_osd_index); + + if (lock->l_resource->lr_type == LDLM_EXTENT) { + obj = dt_locate(&env, tgt->lut_bottom, &fid); + if (IS_ERR(obj)) + GOTO(err_env, rc = PTR_ERR(obj)); + + if (!dt_object_exists(obj)) + GOTO(err_put, rc = -ENOENT); + + start = lock->l_policy_data.l_extent.start; + end = lock->l_policy_data.l_extent.end; + } + + rc = tgt_sync(&env, tgt, obj, start, end); + if (rc < 0) { + CERROR("%s: syncing "DFID" (%llu-%llu) on lock " + "cancel: rc = %d\n", + tgt_name(tgt), PFID(&fid), + lock->l_policy_data.l_extent.start, + lock->l_policy_data.l_extent.end, rc); + } +err_put: + if (obj != NULL) + dt_object_put(&env, obj); +err_env: + lu_env_fini(&env); + } +err: + rc = ldlm_server_blocking_ast(lock, desc, data, flag); + RETURN(rc); +} + +static struct ldlm_callback_suite tgt_dlm_cbs = { + .lcs_completion = ldlm_server_completion_ast, + .lcs_blocking = tgt_blocking_ast, + .lcs_glimpse = ldlm_server_glimpse_ast +}; + +int tgt_enqueue(struct tgt_session_info *tsi) +{ + struct ptlrpc_request *req = tgt_ses_req(tsi); + int rc; + + ENTRY; + /* + * tsi->tsi_dlm_req was already swapped and (if necessary) converted, + * tsi->tsi_dlm_cbs was set by the *_req_handle() function. 
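+	 * ldlm_handle_enqueue0() below is called with the server-side
+	 * callback suite tgt_dlm_cbs, so blocking callbacks for these
+	 * locks go through tgt_blocking_ast() above.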
+ */ + LASSERT(tsi->tsi_dlm_req != NULL); + rc = ldlm_handle_enqueue0(tsi->tsi_exp->exp_obd->obd_namespace, req, + tsi->tsi_dlm_req, &tgt_dlm_cbs); + if (rc) + RETURN(err_serious(rc)); + + switch (LUT_FAIL_CLASS(tsi->tsi_reply_fail_id)) { + case LUT_FAIL_MDT: + tsi->tsi_reply_fail_id = OBD_FAIL_MDS_LDLM_REPLY_NET; + break; + case LUT_FAIL_OST: + tsi->tsi_reply_fail_id = OBD_FAIL_OST_LDLM_REPLY_NET; + break; + case LUT_FAIL_MGT: + tsi->tsi_reply_fail_id = OBD_FAIL_MGS_LDLM_REPLY_NET; + break; + default: + tsi->tsi_reply_fail_id = OBD_FAIL_LDLM_REPLY; + break; + } + RETURN(req->rq_status); +} +EXPORT_SYMBOL(tgt_enqueue); + +int tgt_convert(struct tgt_session_info *tsi) +{ + struct ptlrpc_request *req = tgt_ses_req(tsi); + int rc; + + ENTRY; + LASSERT(tsi->tsi_dlm_req); + rc = ldlm_handle_convert0(req, tsi->tsi_dlm_req); + if (rc) + RETURN(err_serious(rc)); + + RETURN(req->rq_status); +} + +int tgt_bl_callback(struct tgt_session_info *tsi) +{ + return err_serious(-EOPNOTSUPP); +} + +int tgt_cp_callback(struct tgt_session_info *tsi) +{ + return err_serious(-EOPNOTSUPP); +} + +/* generic LDLM target handler */ +struct tgt_handler tgt_dlm_handlers[] = { +TGT_DLM_HDL(HAS_KEY, LDLM_ENQUEUE, tgt_enqueue), +TGT_DLM_HDL(HAS_KEY, LDLM_CONVERT, tgt_convert), +TGT_DLM_HDL_VAR(0, LDLM_BL_CALLBACK, tgt_bl_callback), +TGT_DLM_HDL_VAR(0, LDLM_CP_CALLBACK, tgt_cp_callback) +}; +EXPORT_SYMBOL(tgt_dlm_handlers); + +/* + * Unified target LLOG handlers. + */ +int tgt_llog_open(struct tgt_session_info *tsi) +{ + int rc; + + ENTRY; + + rc = llog_origin_handle_open(tgt_ses_req(tsi)); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_llog_open); + +int tgt_llog_read_header(struct tgt_session_info *tsi) +{ + int rc; + + ENTRY; + + rc = llog_origin_handle_read_header(tgt_ses_req(tsi)); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_llog_read_header); + +int tgt_llog_next_block(struct tgt_session_info *tsi) +{ + int rc; + + ENTRY; + + rc = llog_origin_handle_next_block(tgt_ses_req(tsi)); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_llog_next_block); + +int tgt_llog_prev_block(struct tgt_session_info *tsi) +{ + int rc; + + ENTRY; + + rc = llog_origin_handle_prev_block(tgt_ses_req(tsi)); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_llog_prev_block); + +/* generic llog target handler */ +struct tgt_handler tgt_llog_handlers[] = { +TGT_LLOG_HDL (0, LLOG_ORIGIN_HANDLE_CREATE, tgt_llog_open), +TGT_LLOG_HDL (0, LLOG_ORIGIN_HANDLE_NEXT_BLOCK, tgt_llog_next_block), +TGT_LLOG_HDL (0, LLOG_ORIGIN_HANDLE_READ_HEADER, tgt_llog_read_header), +TGT_LLOG_HDL (0, LLOG_ORIGIN_HANDLE_PREV_BLOCK, tgt_llog_prev_block), +}; +EXPORT_SYMBOL(tgt_llog_handlers); + +/* + * sec context handlers + */ +/* XXX: Implement based on mdt_sec_ctx_handle()? 
*/
+static int tgt_sec_ctx_handle(struct tgt_session_info *tsi)
+{
+	return 0;
+}
+
+struct tgt_handler tgt_sec_ctx_handlers[] = {
+TGT_SEC_HDL_VAR(0,	SEC_CTX_INIT,		tgt_sec_ctx_handle),
+TGT_SEC_HDL_VAR(0,	SEC_CTX_INIT_CONT,	tgt_sec_ctx_handle),
+TGT_SEC_HDL_VAR(0,	SEC_CTX_FINI,		tgt_sec_ctx_handle),
+};
+EXPORT_SYMBOL(tgt_sec_ctx_handlers);
+
+int (*tgt_lfsck_in_notify_local)(const struct lu_env *env,
+				 struct dt_device *key,
+				 struct lfsck_req_local *lrl,
+				 struct thandle *th) = NULL;
+
+void tgt_register_lfsck_in_notify_local(int (*notify)(const struct lu_env *,
+						      struct dt_device *,
+						      struct lfsck_req_local *,
+						      struct thandle *))
+{
+	tgt_lfsck_in_notify_local = notify;
+}
+EXPORT_SYMBOL(tgt_register_lfsck_in_notify_local);
+
+int (*tgt_lfsck_in_notify)(const struct lu_env *env,
+			   struct dt_device *key,
+			   struct lfsck_request *lr) = NULL;
+
+void tgt_register_lfsck_in_notify(int (*notify)(const struct lu_env *,
+						struct dt_device *,
+						struct lfsck_request *))
+{
+	tgt_lfsck_in_notify = notify;
+}
+EXPORT_SYMBOL(tgt_register_lfsck_in_notify);
+
+static int (*tgt_lfsck_query)(const struct lu_env *env,
+			      struct dt_device *key,
+			      struct lfsck_request *req,
+			      struct lfsck_reply *rep,
+			      struct lfsck_query *que) = NULL;
+
+void tgt_register_lfsck_query(int (*query)(const struct lu_env *,
+					   struct dt_device *,
+					   struct lfsck_request *,
+					   struct lfsck_reply *,
+					   struct lfsck_query *))
+{
+	tgt_lfsck_query = query;
+}
+EXPORT_SYMBOL(tgt_register_lfsck_query);
+
+/* LFSCK request handlers */
+static int tgt_handle_lfsck_notify(struct tgt_session_info *tsi)
+{
+	const struct lu_env *env = tsi->tsi_env;
+	struct dt_device *key = tsi->tsi_tgt->lut_bottom;
+	struct lfsck_request *lr;
+	int rc;
+	ENTRY;
+
+	lr = req_capsule_client_get(tsi->tsi_pill, &RMF_LFSCK_REQUEST);
+	if (lr == NULL)
+		RETURN(-EPROTO);
+
+	rc = tgt_lfsck_in_notify(env, key, lr);
+
+	RETURN(rc);
+}
+
+static int tgt_handle_lfsck_query(struct tgt_session_info *tsi)
+{
+	struct lfsck_request *request;
+	struct lfsck_reply *reply;
+	int rc;
+	ENTRY;
+
+	request = req_capsule_client_get(tsi->tsi_pill, &RMF_LFSCK_REQUEST);
+	if (request == NULL)
+		RETURN(-EPROTO);
+
+	reply = req_capsule_server_get(tsi->tsi_pill, &RMF_LFSCK_REPLY);
+	if (reply == NULL)
+		RETURN(-ENOMEM);
+
+	rc = tgt_lfsck_query(tsi->tsi_env, tsi->tsi_tgt->lut_bottom,
+			     request, reply, NULL);
+
+	RETURN(rc < 0 ? rc : 0);
+}
+
+struct tgt_handler tgt_lfsck_handlers[] = {
+TGT_LFSCK_HDL(HAS_REPLY,	LFSCK_NOTIFY,	tgt_handle_lfsck_notify),
+TGT_LFSCK_HDL(HAS_REPLY,	LFSCK_QUERY,	tgt_handle_lfsck_query),
+};
+EXPORT_SYMBOL(tgt_lfsck_handlers);
+
+/*
+ * initialize per-thread page pool (bug 5137).
+ */
+int tgt_io_thread_init(struct ptlrpc_thread *thread)
+{
+	struct tgt_thread_big_cache *tbc;
+
+	ENTRY;
+
+	LASSERT(thread != NULL);
+	LASSERT(thread->t_data == NULL);
+
+	OBD_ALLOC_LARGE(tbc, sizeof(*tbc));
+	if (tbc == NULL)
+		RETURN(-ENOMEM);
+	thread->t_data = tbc;
+	RETURN(0);
+}
+EXPORT_SYMBOL(tgt_io_thread_init);
+
+/*
+ * free per-thread pool created by tgt_io_thread_init().
+ */
+void tgt_io_thread_done(struct ptlrpc_thread *thread)
+{
+	struct tgt_thread_big_cache *tbc;
+
+	ENTRY;
+
+	LASSERT(thread != NULL);
+
+	/*
+	 * be prepared to handle partially-initialized pools (because this is
+	 * called from ost_io_thread_init() for cleanup).
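+	 * In that case thread->t_data may still be NULL and there is
+	 * nothing to free.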
+ */ + tbc = thread->t_data; + if (tbc != NULL) { + OBD_FREE_LARGE(tbc, sizeof(*tbc)); + thread->t_data = NULL; + } + EXIT; +} +EXPORT_SYMBOL(tgt_io_thread_done); + +/** + * Helper function for getting Data-on-MDT file server DLM lock + * if asked by client. + */ +int tgt_mdt_data_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id, + struct lustre_handle *lh, int mode, __u64 *flags) +{ + union ldlm_policy_data policy = { + .l_inodebits.bits = MDS_INODELOCK_DOM, + }; + int rc; + + ENTRY; + + LASSERT(lh != NULL); + LASSERT(ns != NULL); + LASSERT(!lustre_handle_is_used(lh)); + + rc = ldlm_cli_enqueue_local(NULL, ns, res_id, LDLM_IBITS, &policy, mode, + flags, ldlm_blocking_ast, + ldlm_completion_ast, ldlm_glimpse_ast, + NULL, 0, LVB_T_NONE, NULL, lh); + + RETURN(rc == ELDLM_OK ? 0 : -EIO); +} +EXPORT_SYMBOL(tgt_mdt_data_lock); + +/** + * Helper function for getting server side [start, start+count] DLM lock + * if asked by client. + */ +int tgt_extent_lock(const struct lu_env *env, struct ldlm_namespace *ns, + struct ldlm_res_id *res_id, __u64 start, __u64 end, + struct lustre_handle *lh, int mode, __u64 *flags) +{ + union ldlm_policy_data policy; + int rc; + + ENTRY; + + LASSERT(lh != NULL); + LASSERT(ns != NULL); + LASSERT(!lustre_handle_is_used(lh)); + + policy.l_extent.gid = 0; + policy.l_extent.start = start & PAGE_MASK; + + /* + * If ->o_blocks is EOF it means "lock till the end of the file". + * Otherwise, it's size of an extent or hole being punched (in bytes). + */ + if (end == OBD_OBJECT_EOF || end < start) + policy.l_extent.end = OBD_OBJECT_EOF; + else + policy.l_extent.end = end | ~PAGE_MASK; + + rc = ldlm_cli_enqueue_local(env, ns, res_id, LDLM_EXTENT, &policy, + mode, flags, ldlm_blocking_ast, + ldlm_completion_ast, ldlm_glimpse_ast, + NULL, 0, LVB_T_NONE, NULL, lh); + RETURN(rc == ELDLM_OK ? 
0 : -EIO); +} +EXPORT_SYMBOL(tgt_extent_lock); + +static int tgt_data_lock(const struct lu_env *env, struct obd_export *exp, + struct ldlm_res_id *res_id, __u64 start, __u64 end, + struct lustre_handle *lh, enum ldlm_mode mode) +{ + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; + __u64 flags = 0; + + /* MDT IO for data-on-mdt */ + if (exp->exp_connect_data.ocd_connect_flags & OBD_CONNECT_IBITS) + return tgt_mdt_data_lock(ns, res_id, lh, mode, &flags); + + return tgt_extent_lock(env, ns, res_id, start, end, lh, mode, &flags); +} + +void tgt_data_unlock(struct lustre_handle *lh, enum ldlm_mode mode) +{ + LASSERT(lustre_handle_is_used(lh)); + ldlm_lock_decref(lh, mode); +} +EXPORT_SYMBOL(tgt_data_unlock); + +static int tgt_brw_lock(const struct lu_env *env, struct obd_export *exp, + struct ldlm_res_id *res_id, struct obd_ioobj *obj, + struct niobuf_remote *nb, struct lustre_handle *lh, + enum ldlm_mode mode) +{ + int nrbufs = obj->ioo_bufcnt; + int i; + + ENTRY; + + LASSERT(mode == LCK_PR || mode == LCK_PW); + LASSERT(!lustre_handle_is_used(lh)); + + if (exp->exp_obd->obd_recovering) + RETURN(0); + + if (nrbufs == 0 || !(nb[0].rnb_flags & OBD_BRW_SRVLOCK)) + RETURN(0); + + for (i = 1; i < nrbufs; i++) + if (!(nb[i].rnb_flags & OBD_BRW_SRVLOCK)) + RETURN(-EFAULT); + + return tgt_data_lock(env, exp, res_id, nb[0].rnb_offset, + nb[nrbufs - 1].rnb_offset + + nb[nrbufs - 1].rnb_len - 1, lh, mode); +} + +static void tgt_brw_unlock(struct obd_export *exp, struct obd_ioobj *obj, + struct niobuf_remote *niob, + struct lustre_handle *lh, enum ldlm_mode mode) +{ + ENTRY; + + LASSERT(mode == LCK_PR || mode == LCK_PW); + LASSERT((!exp->exp_obd->obd_recovering && obj->ioo_bufcnt && + niob[0].rnb_flags & OBD_BRW_SRVLOCK) == + lustre_handle_is_used(lh)); + + if (lustre_handle_is_used(lh)) + tgt_data_unlock(lh, mode); + EXIT; +} + +static int tgt_checksum_niobuf(struct lu_target *tgt, + struct niobuf_local *local_nb, int npages, + int opc, enum cksum_types cksum_type, + __u32 *cksum) +{ + struct ahash_request *req; + unsigned int bufsize; + int i, err; + unsigned char cfs_alg = cksum_obd2cfs(cksum_type); + + req = cfs_crypto_hash_init(cfs_alg, NULL, 0); + if (IS_ERR(req)) { + CERROR("%s: unable to initialize checksum hash %s\n", + tgt_name(tgt), cfs_crypto_hash_name(cfs_alg)); + return PTR_ERR(req); + } + + CDEBUG(D_INFO, "Checksum for algo %s\n", cfs_crypto_hash_name(cfs_alg)); + for (i = 0; i < npages; i++) { + /* corrupt the data before we compute the checksum, to + * simulate a client->OST data error */ + if (i == 0 && opc == OST_WRITE && + OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_RECEIVE)) { + int off = local_nb[i].lnb_page_offset & ~PAGE_MASK; + int len = local_nb[i].lnb_len; + struct page *np = tgt_page_to_corrupt; + + if (np) { + char *ptr = kmap_atomic(local_nb[i].lnb_page); + char *ptr2 = page_address(np); + + memcpy(ptr2 + off, ptr + off, len); + memcpy(ptr2 + off, "bad3", min(4, len)); + kunmap_atomic(ptr); + + /* LU-8376 to preserve original index for + * display in dump_all_bulk_pages() */ + np->index = i; + + cfs_crypto_hash_update_page(req, np, off, + len); + continue; + } else { + CERROR("%s: can't alloc page for corruption\n", + tgt_name(tgt)); + } + } + cfs_crypto_hash_update_page(req, local_nb[i].lnb_page, + local_nb[i].lnb_page_offset & ~PAGE_MASK, + local_nb[i].lnb_len); + + /* corrupt the data after we compute the checksum, to + * simulate an OST->client data error */ + if (i == 0 && opc == OST_READ && + OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_SEND)) { + int off = 
local_nb[i].lnb_page_offset & ~PAGE_MASK; + int len = local_nb[i].lnb_len; + struct page *np = tgt_page_to_corrupt; + + if (np) { + char *ptr = kmap_atomic(local_nb[i].lnb_page); + char *ptr2 = page_address(np); + + memcpy(ptr2 + off, ptr + off, len); + memcpy(ptr2 + off, "bad4", min(4, len)); + kunmap_atomic(ptr); + + /* LU-8376 to preserve original index for + * display in dump_all_bulk_pages() */ + np->index = i; + + cfs_crypto_hash_update_page(req, np, off, + len); + continue; + } else { + CERROR("%s: can't alloc page for corruption\n", + tgt_name(tgt)); + } + } + } + + bufsize = sizeof(*cksum); + err = cfs_crypto_hash_final(req, (unsigned char *)cksum, &bufsize); + + return 0; +} + +char dbgcksum_file_name[PATH_MAX]; + +static void dump_all_bulk_pages(struct obdo *oa, int count, + struct niobuf_local *local_nb, + __u32 server_cksum, __u32 client_cksum) +{ + struct file *filp; + int rc, i; + unsigned int len; + char *buf; + + /* will only keep dump of pages on first error for the same range in + * file/fid, not during the resends/retries. */ + snprintf(dbgcksum_file_name, sizeof(dbgcksum_file_name), + "%s-checksum_dump-ost-"DFID":[%llu-%llu]-%x-%x", + (strncmp(libcfs_debug_file_path, "NONE", 4) != 0 ? + libcfs_debug_file_path : LIBCFS_DEBUG_FILE_PATH_DEFAULT), + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : (__u64)0, + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0, + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0, + local_nb[0].lnb_file_offset, + local_nb[count-1].lnb_file_offset + + local_nb[count-1].lnb_len - 1, client_cksum, server_cksum); + CWARN("dumping checksum data to %s\n", dbgcksum_file_name); + filp = filp_open(dbgcksum_file_name, + O_CREAT | O_EXCL | O_WRONLY | O_LARGEFILE, 0600); + if (IS_ERR(filp)) { + rc = PTR_ERR(filp); + if (rc == -EEXIST) + CDEBUG(D_INFO, "%s: can't open to dump pages with " + "checksum error: rc = %d\n", dbgcksum_file_name, + rc); + else + CERROR("%s: can't open to dump pages with checksum " + "error: rc = %d\n", dbgcksum_file_name, rc); + return; + } + + for (i = 0; i < count; i++) { + len = local_nb[i].lnb_len; + buf = kmap(local_nb[i].lnb_page); + while (len != 0) { + rc = cfs_kernel_write(filp, buf, len, &filp->f_pos); + if (rc < 0) { + CERROR("%s: wanted to write %u but got %d " + "error\n", dbgcksum_file_name, len, rc); + break; + } + len -= rc; + buf += rc; + } + kunmap(local_nb[i].lnb_page); + } + + rc = vfs_fsync_range(filp, 0, LLONG_MAX, 1); + if (rc) + CERROR("%s: sync returns %d\n", dbgcksum_file_name, rc); + filp_close(filp, NULL); + + libcfs_debug_dumplog(); +} + +static int check_read_checksum(struct niobuf_local *local_nb, int npages, + struct obd_export *exp, struct obdo *oa, + const struct lnet_process_id *peer, + __u32 client_cksum, __u32 server_cksum, + enum cksum_types server_cksum_type) +{ + char *msg; + enum cksum_types cksum_type; + loff_t start, end; + + if (unlikely(npages <= 0)) + return 0; + + /* unlikely to happen and only if resend does not occur due to cksum + * control failure on Client */ + if (unlikely(server_cksum == client_cksum)) { + CDEBUG(D_PAGE, "checksum %x confirmed upon retry\n", + client_cksum); + return 0; + } + + if (exp->exp_obd->obd_checksum_dump) + dump_all_bulk_pages(oa, npages, local_nb, server_cksum, + client_cksum); + + cksum_type = obd_cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ? 
+ oa->o_flags : 0); + + if (cksum_type != server_cksum_type) + msg = "the server may have not used the checksum type specified" + " in the original request - likely a protocol problem"; + else + msg = "should have changed on the client or in transit"; + + start = local_nb[0].lnb_file_offset; + end = local_nb[npages-1].lnb_file_offset + + local_nb[npages-1].lnb_len - 1; + + LCONSOLE_ERROR_MSG(0x132, "%s: BAD READ CHECKSUM: %s: from %s inode " + DFID " object "DOSTID" extent [%llu-%llu], client returned csum" + " %x (type %x), server csum %x (type %x)\n", + exp->exp_obd->obd_name, + msg, libcfs_nid2str(peer->nid), + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : 0ULL, + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0, + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0, + POSTID(&oa->o_oi), + start, end, client_cksum, cksum_type, server_cksum, + server_cksum_type); + + return 1; +} + +static int tgt_pages2shortio(struct niobuf_local *local, int npages, + unsigned char *buf, int size) +{ + int i, off, len, copied = size; + char *ptr; + + for (i = 0; i < npages; i++) { + off = local[i].lnb_page_offset & ~PAGE_MASK; + len = local[i].lnb_len; + + CDEBUG(D_PAGE, "index %d offset = %d len = %d left = %d\n", + i, off, len, size); + if (len > size) + return -EINVAL; + + ptr = kmap_atomic(local[i].lnb_page); + memcpy(buf, ptr + off, len); + kunmap_atomic(ptr); + buf += len; + size -= len; + } + return copied - size; +} + +static int tgt_checksum_niobuf_t10pi(struct lu_target *tgt, + enum cksum_types cksum_type, + struct niobuf_local *local_nb, int npages, + int opc, obd_dif_csum_fn *fn, + int sector_size, u32 *check_sum, + bool resend) +{ + enum cksum_types t10_cksum_type = tgt->lut_dt_conf.ddp_t10_cksum_type; + unsigned char cfs_alg = cksum_obd2cfs(OBD_CKSUM_T10_TOP); + const char *obd_name = tgt->lut_obd->obd_name; + struct ahash_request *req; + unsigned int bufsize; + unsigned char *buffer; + struct page *__page; + __be16 *guard_start; + int guard_number; + int used_number = 0; + __u32 cksum; + int rc = 0; + int used; + int i; + + __page = alloc_page(GFP_KERNEL); + if (__page == NULL) + return -ENOMEM; + + req = cfs_crypto_hash_init(cfs_alg, NULL, 0); + if (IS_ERR(req)) { + CERROR("%s: unable to initialize checksum hash %s\n", + tgt_name(tgt), cfs_crypto_hash_name(cfs_alg)); + return PTR_ERR(req); + } + + buffer = kmap(__page); + guard_start = (__be16 *)buffer; + guard_number = PAGE_SIZE / sizeof(*guard_start); + if (unlikely(resend)) + CDEBUG(D_PAGE | D_HA, "GRD tags per page = %u\n", guard_number); + for (i = 0; i < npages; i++) { + bool use_t10_grd; + + /* corrupt the data before we compute the checksum, to + * simulate a client->OST data error */ + if (i == 0 && opc == OST_WRITE && + OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_RECEIVE)) { + int off = local_nb[i].lnb_page_offset & ~PAGE_MASK; + int len = local_nb[i].lnb_len; + struct page *np = tgt_page_to_corrupt; + + if (np) { + char *ptr = kmap_atomic(local_nb[i].lnb_page); + char *ptr2 = page_address(np); + + memcpy(ptr2 + off, ptr + off, len); + memcpy(ptr2 + off, "bad3", min(4, len)); + kunmap_atomic(ptr); + + /* LU-8376 to preserve original index for + * display in dump_all_bulk_pages() */ + np->index = i; + + cfs_crypto_hash_update_page(req, np, off, + len); + continue; + } else { + CERROR("%s: can't alloc page for corruption\n", + tgt_name(tgt)); + } + } + + /* + * The left guard number should be able to hold checksums of a + * whole page + */ + use_t10_grd = t10_cksum_type && t10_cksum_type == cksum_type && + opc == OST_READ && + 
local_nb[i].lnb_len == PAGE_SIZE && + local_nb[i].lnb_guard_disk; + if (use_t10_grd) { + used = DIV_ROUND_UP(local_nb[i].lnb_len, sector_size); + if (used > (guard_number - used_number)) { + rc = -E2BIG; + break; + } + memcpy(guard_start + used_number, + local_nb[i].lnb_guards, + used * sizeof(*guard_start)); + if (unlikely(resend)) + CDEBUG(D_PAGE | D_HA, + "lnb[%u]: used %u off %u+%u lnb checksum: %*phN\n", + i, used, + local_nb[i].lnb_page_offset, + local_nb[i].lnb_len, + (int)(used * sizeof(*guard_start)), + guard_start + used_number); + } + if (!use_t10_grd || unlikely(resend)) { + __be16 guard_tmp[MAX_GUARD_NUMBER]; + __be16 *guards = guard_start + used_number; + int used_tmp = -1, *usedp = &used; + + if (unlikely(use_t10_grd)) { + guards = guard_tmp; + usedp = &used_tmp; + } + rc = obd_page_dif_generate_buffer(obd_name, + local_nb[i].lnb_page, + local_nb[i].lnb_page_offset & ~PAGE_MASK, + local_nb[i].lnb_len, guards, + guard_number - used_number, usedp, sector_size, + fn); + if (unlikely(resend)) { + bool bad = use_t10_grd && + memcmp(guard_tmp, + local_nb[i].lnb_guards, + used_tmp * sizeof(*guard_tmp)); + + if (bad) + CERROR("lnb[%u]: used %u/%u off %u+%u tmp checksum: %*phN\n", + i, used, used_tmp, + local_nb[i].lnb_page_offset, + local_nb[i].lnb_len, + (int)(used_tmp * sizeof(*guard_start)), + guard_tmp); + CDEBUG_LIMIT(D_PAGE | D_HA | (bad ? D_ERROR : 0), + "lnb[%u]: used %u/%u off %u+%u gen checksum: %*phN\n", + i, used, used_tmp, + local_nb[i].lnb_page_offset, + local_nb[i].lnb_len, + (int)(used * sizeof(*guard_start)), + guard_start + used_number); + } + if (rc) + break; + } + + LASSERT(used <= MAX_GUARD_NUMBER); + /* + * If disk support T10PI checksum, copy guards to local_nb. + * If the write is partial page, do not use the guards for bio + * submission since the data might not be full-sector. The bio + * guards will be generated later based on the full sectors. If + * the sector size is 512B rather than 4 KB, or the page size + * is larger than 4KB, this might drop some useful guards for + * partial page write, but it will only add minimal extra time + * of checksum calculation. 
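+		 * For example, a 4KB page over 512-byte sectors carries
+		 * DIV_ROUND_UP(4096, 512) = 8 guard tags, while 4KB
+		 * sectors need only one tag per page.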
+ */ + if (t10_cksum_type && t10_cksum_type == cksum_type && + opc == OST_WRITE && + local_nb[i].lnb_len == PAGE_SIZE) { + local_nb[i].lnb_guard_rpc = 1; + memcpy(local_nb[i].lnb_guards, + guard_start + used_number, + used * sizeof(*local_nb[i].lnb_guards)); + } + + used_number += used; + if (used_number == guard_number) { + cfs_crypto_hash_update_page(req, __page, 0, + used_number * sizeof(*guard_start)); + used_number = 0; + } + + /* corrupt the data after we compute the checksum, to + * simulate an OST->client data error */ + if (unlikely(i == 0 && opc == OST_READ && + OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_SEND))) { + int off = local_nb[i].lnb_page_offset & ~PAGE_MASK; + int len = local_nb[i].lnb_len; + struct page *np = tgt_page_to_corrupt; + + if (np) { + char *ptr = kmap_atomic(local_nb[i].lnb_page); + char *ptr2 = page_address(np); + + memcpy(ptr2 + off, ptr + off, len); + memcpy(ptr2 + off, "bad4", min(4, len)); + kunmap_atomic(ptr); + + /* LU-8376 to preserve original index for + * display in dump_all_bulk_pages() */ + np->index = i; + + cfs_crypto_hash_update_page(req, np, off, + len); + continue; + } else { + CERROR("%s: can't alloc page for corruption\n", + tgt_name(tgt)); + } + } + } + kunmap(__page); + if (rc) + GOTO(out, rc); + + if (used_number != 0) + cfs_crypto_hash_update_page(req, __page, 0, + used_number * sizeof(*guard_start)); + + bufsize = sizeof(cksum); + rc = cfs_crypto_hash_final(req, (unsigned char *)&cksum, &bufsize); + + if (rc == 0) + *check_sum = cksum; +out: + __free_page(__page); + return rc; +} + +static int tgt_checksum_niobuf_rw(struct lu_target *tgt, + enum cksum_types cksum_type, + struct niobuf_local *local_nb, + int npages, int opc, u32 *check_sum, + bool resend) +{ + obd_dif_csum_fn *fn = NULL; + int sector_size = 0; + int rc; + + ENTRY; + obd_t10_cksum2dif(cksum_type, &fn, §or_size); + + if (fn) + rc = tgt_checksum_niobuf_t10pi(tgt, cksum_type, + local_nb, npages, + opc, fn, sector_size, + check_sum, resend); + else + rc = tgt_checksum_niobuf(tgt, local_nb, npages, opc, + cksum_type, check_sum); + + RETURN(rc); +} + +int tgt_brw_read(struct tgt_session_info *tsi) +{ + struct ptlrpc_request *req = tgt_ses_req(tsi); + struct ptlrpc_bulk_desc *desc = NULL; + struct obd_export *exp = tsi->tsi_exp; + struct niobuf_remote *remote_nb; + struct niobuf_local *local_nb; + struct obd_ioobj *ioo; + struct ost_body *body, *repbody; + struct lustre_handle lockh = { 0 }; + int npages, nob = 0, rc, i, no_reply = 0, + npages_read; + struct tgt_thread_big_cache *tbc = req->rq_svc_thread->t_data; + const char *obd_name = exp->exp_obd->obd_name; + ktime_t kstart; + + ENTRY; + + if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL && + ptlrpc_req2svc(req)->srv_req_portal != MDS_IO_PORTAL) { + CERROR("%s: deny read request from %s to portal %u\n", + tgt_name(tsi->tsi_tgt), + obd_export_nid2str(req->rq_export), + ptlrpc_req2svc(req)->srv_req_portal); + RETURN(-EPROTO); + } + + req->rq_bulk_read = 1; + + if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_READ_BULK)) { + /* optionally use cfs_fail_val - 1 to select a specific OST on + * this server to fail requests. + */ + char fail_ost_name[MAX_OBD_NAME]; + + if (cfs_fail_val > 0) { + snprintf(fail_ost_name, MAX_OBD_NAME, "OST%04X", + cfs_fail_val - 1); + + if (strstr(obd_name, fail_ost_name)) + RETURN(err_serious(-EIO)); + } else { + RETURN(err_serious(-EIO)); + } + } + + OBD_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK, cfs_fail_val > 0 ? 
+ cfs_fail_val : (obd_timeout + 1) / 4); + + /* Check if there is eviction in progress, and if so, wait for it to + * finish */ + if (unlikely(atomic_read(&exp->exp_obd->obd_evict_inprogress))) { + /* We do not care how long it takes */ + wait_event_idle( + exp->exp_obd->obd_evict_inprogress_waitq, + !atomic_read(&exp->exp_obd->obd_evict_inprogress)); + } + + /* There must be big cache in current thread to process this request + * if it is NULL then something went wrong and it wasn't allocated, + * report -ENOMEM in that case */ + if (tbc == NULL) + RETURN(-ENOMEM); + + body = tsi->tsi_ost_body; + LASSERT(body != NULL); + + if (body->oa.o_valid & OBD_MD_FLFLAGS && + body->oa.o_flags & OBD_FL_NORPC) + RETURN(0); + + ioo = req_capsule_client_get(tsi->tsi_pill, &RMF_OBD_IOOBJ); + LASSERT(ioo != NULL); /* must exists after tgt_ost_body_unpack */ + + remote_nb = req_capsule_client_get(&req->rq_pill, &RMF_NIOBUF_REMOTE); + LASSERT(remote_nb != NULL); /* must exists after tgt_ost_body_unpack */ + + local_nb = tbc->local; + + rc = tgt_brw_lock(tsi->tsi_env, exp, &tsi->tsi_resid, ioo, remote_nb, + &lockh, LCK_PR); + if (rc != 0) + RETURN(rc); + + /* + * If getting the lock took more time than + * client was willing to wait, drop it. b=11330 + */ + if (ktime_get_real_seconds() > req->rq_deadline || + OBD_FAIL_CHECK(OBD_FAIL_OST_DROP_REQ)) { + no_reply = 1; + CERROR("Dropping timed-out read from %s because locking object " DOSTID " took %lld seconds (limit was %lld).\n", + libcfs_id2str(req->rq_peer), POSTID(&ioo->ioo_oid), + ktime_get_real_seconds() - req->rq_arrival_time.tv_sec, + req->rq_deadline - req->rq_arrival_time.tv_sec); + GOTO(out_lock, rc = -ETIMEDOUT); + } + + /* + * Because we already sync grant info with client when + * reconnect, grant info will be cleared for resent req, + * otherwise, outdated grant count in the rpc would de-sync + * grant counters in case of shrink + */ + if (lustre_msg_get_flags(req->rq_reqmsg) & (MSG_RESENT | MSG_REPLAY)) { + DEBUG_REQ(D_CACHE, req, "clear resent/replay req grant info"); + body->oa.o_valid &= ~OBD_MD_FLGRANT; + } + + repbody = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + repbody->oa = body->oa; + + npages = PTLRPC_MAX_BRW_PAGES; + kstart = ktime_get(); + rc = obd_preprw(tsi->tsi_env, OBD_BRW_READ, exp, &repbody->oa, 1, + ioo, remote_nb, &npages, local_nb); + if (rc != 0) + GOTO(out_lock, rc); + + if (body->oa.o_valid & OBD_MD_FLFLAGS && + body->oa.o_flags & OBD_FL_SHORT_IO) { + desc = NULL; + } else { + desc = ptlrpc_prep_bulk_exp(req, npages, ioobj_max_brw_get(ioo), + PTLRPC_BULK_PUT_SOURCE, + OST_BULK_PORTAL, + &ptlrpc_bulk_kiov_nopin_ops); + if (desc == NULL) + GOTO(out_commitrw, rc = -ENOMEM); + } + + npages_read = npages; + for (i = 0; i < npages; i++) { + int page_rc = local_nb[i].lnb_rc; + + if (page_rc < 0) { + rc = page_rc; + npages_read = i; + break; + } + + nob += page_rc; + if (page_rc != 0 && desc != NULL) { /* some data! */ + LASSERT(local_nb[i].lnb_page != NULL); + desc->bd_frag_ops->add_kiov_frag + (desc, local_nb[i].lnb_page, + local_nb[i].lnb_page_offset & ~PAGE_MASK, + page_rc); + } + + if (page_rc != local_nb[i].lnb_len) { /* short read */ + local_nb[i].lnb_len = page_rc; + npages_read = i + (page_rc != 0 ? 1 : 0); + /* All subsequent pages should be 0 */ + while (++i < npages) + LASSERT(local_nb[i].lnb_rc == 0); + break; + } + } + + if (body->oa.o_valid & OBD_MD_FLCKSUM) { + u32 flag = body->oa.o_valid & OBD_MD_FLFLAGS ? 
+ body->oa.o_flags : 0; + enum cksum_types cksum_type = obd_cksum_type_unpack(flag); + bool resend = (body->oa.o_valid & OBD_MD_FLFLAGS) && + (body->oa.o_flags & OBD_FL_RECOV_RESEND); + + repbody->oa.o_flags = obd_cksum_type_pack(obd_name, + cksum_type); + repbody->oa.o_valid = OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; + + rc = tgt_checksum_niobuf_rw(tsi->tsi_tgt, cksum_type, + local_nb, npages_read, OST_READ, + &repbody->oa.o_cksum, resend); + if (rc < 0) + GOTO(out_commitrw, rc); + CDEBUG(D_PAGE | (resend ? D_HA : 0), + "checksum at read origin: %x (%x)\n", + repbody->oa.o_cksum, cksum_type); + + /* if a resend it could be for a cksum error, so check Server + * cksum with returned Client cksum (this should even cover + * zero-cksum case) */ + if (resend) + check_read_checksum(local_nb, npages_read, exp, + &body->oa, &req->rq_peer, + body->oa.o_cksum, + repbody->oa.o_cksum, cksum_type); + } else { + repbody->oa.o_valid = 0; + } + if (body->oa.o_valid & OBD_MD_FLGRANT) + repbody->oa.o_valid |= OBD_MD_FLGRANT; + /* We're finishing using body->oa as an input variable */ + + /* Check if client was evicted while we were doing i/o before touching + * network */ + if (rc == 0) { + if (body->oa.o_valid & OBD_MD_FLFLAGS && + body->oa.o_flags & OBD_FL_SHORT_IO) { + unsigned char *short_io_buf; + int short_io_size; + + short_io_buf = req_capsule_server_get(&req->rq_pill, + &RMF_SHORT_IO); + short_io_size = req_capsule_get_size(&req->rq_pill, + &RMF_SHORT_IO, + RCL_SERVER); + rc = tgt_pages2shortio(local_nb, npages_read, + short_io_buf, short_io_size); + if (rc >= 0) + req_capsule_shrink(&req->rq_pill, + &RMF_SHORT_IO, rc, + RCL_SERVER); + rc = rc > 0 ? 0 : rc; + } else if (!CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2)) { + rc = target_bulk_io(exp, desc); + } + no_reply = rc != 0; + } else { + if (body->oa.o_valid & OBD_MD_FLFLAGS && + body->oa.o_flags & OBD_FL_SHORT_IO) + req_capsule_shrink(&req->rq_pill, &RMF_SHORT_IO, 0, + RCL_SERVER); + } + +out_commitrw: + /* Must commit after prep above in all cases */ + rc = obd_commitrw(tsi->tsi_env, OBD_BRW_READ, exp, &repbody->oa, 1, ioo, + remote_nb, npages, local_nb, rc, nob, kstart); +out_lock: + tgt_brw_unlock(exp, ioo, remote_nb, &lockh, LCK_PR); + + if (desc && !CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2)) + ptlrpc_free_bulk(desc); + + LASSERT(rc <= 0); + if (rc == 0) { + rc = nob; + ptlrpc_lprocfs_brw(req, nob); + } else if (no_reply) { + req->rq_no_reply = 1; + /* reply out callback would free */ + ptlrpc_req_drop_rs(req); + LCONSOLE_WARN("%s: Bulk IO read error with %s (at %s), " + "client will retry: rc %d\n", + obd_name, + obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp), rc); + } + /* send a bulk after reply to simulate a network delay or reordering + * by a router - Note that !desc implies short io, so there is no bulk + * to reorder. */ + if (unlikely(CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2)) && + desc) { + /* Calculate checksum before request transfer, original + * it is done by target_bulk_io() */ + rc = sptlrpc_svc_wrap_bulk(req, desc); + if (OCD_HAS_FLAG(&exp->exp_connect_data, BULK_MBITS)) + req->rq_mbits = lustre_msg_get_mbits(req->rq_reqmsg); + else /* old version, bulk matchbits is rq_xid */ + req->rq_mbits = req->rq_xid; + + req->rq_status = rc; + target_committed_to_req(req); + target_send_reply(req, 0, 0); + + CDEBUG(D_INFO, "reorder BULK\n"); + OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2, + cfs_fail_val ? 
: 3); + + target_bulk_io(exp, desc); + ptlrpc_free_bulk(desc); + } + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_brw_read); + +static int tgt_shortio2pages(struct niobuf_local *local, int npages, + unsigned char *buf, unsigned int size) +{ + int i, off, len; + char *ptr; + + for (i = 0; i < npages; i++) { + off = local[i].lnb_page_offset & ~PAGE_MASK; + len = local[i].lnb_len; + + if (len == 0) + continue; + + CDEBUG(D_PAGE, "index %d offset = %d len = %d left = %d\n", + i, off, len, size); + ptr = kmap_atomic(local[i].lnb_page); + if (ptr == NULL) + return -EINVAL; + memcpy(ptr + off, buf, len < size ? len : size); + kunmap_atomic(ptr); + buf += len; + size -= len; + } + return 0; +} + +static void tgt_warn_on_cksum(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc, + struct niobuf_local *local_nb, int npages, + u32 client_cksum, u32 server_cksum, + bool mmap) +{ + struct obd_export *exp = req->rq_export; + struct ost_body *body; + char *router = ""; + char *via = ""; + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body != NULL); + + if (desc && req->rq_peer.nid != desc->bd_sender) { + via = " via "; + router = libcfs_nid2str(desc->bd_sender); + } + + if (exp->exp_obd->obd_checksum_dump) + dump_all_bulk_pages(&body->oa, npages, local_nb, server_cksum, + client_cksum); + + if (mmap) { + CDEBUG_LIMIT(D_INFO, "client csum %x, server csum %x\n", + client_cksum, server_cksum); + return; + } + + LCONSOLE_ERROR_MSG(0x168, "%s: BAD WRITE CHECKSUM: from %s%s%s inode " + DFID" object "DOSTID" extent [%llu-%llu" + "]: client csum %x, server csum %x\n", + exp->exp_obd->obd_name, libcfs_id2str(req->rq_peer), + via, router, + body->oa.o_valid & OBD_MD_FLFID ? + body->oa.o_parent_seq : (__u64)0, + body->oa.o_valid & OBD_MD_FLFID ? + body->oa.o_parent_oid : 0, + body->oa.o_valid & OBD_MD_FLFID ? 
+ body->oa.o_parent_ver : 0, + POSTID(&body->oa.o_oi), + local_nb[0].lnb_file_offset, + local_nb[npages-1].lnb_file_offset + + local_nb[npages - 1].lnb_len - 1, + client_cksum, server_cksum); +} + +int tgt_brw_write(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct ptlrpc_request *req = tgt_ses_req(tsi); + struct ptlrpc_bulk_desc *desc = NULL; + struct obd_export *exp = req->rq_export; + struct niobuf_remote *remote_nb; + struct niobuf_local *local_nb; + struct obd_ioobj *ioo; + struct ost_body *body, *repbody; + struct lustre_handle lockh = {0}; + __u32 *rcs; + int objcount, niocount, npages; + int rc = 0; + int i, j; + enum cksum_types cksum_type = OBD_CKSUM_CRC32; + bool no_reply = false, mmap; + struct tgt_thread_big_cache *tbc = req->rq_svc_thread->t_data; + bool wait_sync = false; + const char *obd_name = exp->exp_obd->obd_name; + /* '1' for consistency with code that checks !mpflag to restore */ + unsigned int mpflags = 1; + ktime_t kstart; + int nob = 0; + + ENTRY; + + if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL && + ptlrpc_req2svc(req)->srv_req_portal != MDS_IO_PORTAL) { + CERROR("%s: deny write request from %s to portal %u\n", + tgt_name(tsi->tsi_tgt), + obd_export_nid2str(req->rq_export), + ptlrpc_req2svc(req)->srv_req_portal); + RETURN(err_serious(-EPROTO)); + } + + if (OBD_FAIL_CHECK(OBD_FAIL_OST_ENOSPC)) + RETURN(err_serious(-ENOSPC)); + if (OBD_FAIL_TIMEOUT(OBD_FAIL_OST_EROFS, 1)) + RETURN(err_serious(-EROFS)); + + req->rq_bulk_write = 1; + + if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_WRITE_BULK)) + rc = -EIO; + if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_WRITE_BULK2)) + rc = -EFAULT; + if (rc < 0) { + /* optionally use cfs_fail_val - 1 to select a specific OST on + * this server to fail requests. + */ + char fail_ost_name[MAX_OBD_NAME]; + + if (cfs_fail_val > 0) { + snprintf(fail_ost_name, MAX_OBD_NAME, "OST%04X", + cfs_fail_val - 1); + + if (strstr(obd_name, fail_ost_name)) + RETURN(err_serious(rc)); + } else { + RETURN(err_serious(rc)); + } + } + + /* pause before transaction has been started */ + CFS_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK, cfs_fail_val > 0 ? 
+ cfs_fail_val : (obd_timeout + 1) / 4); + + /* Delay write commit to show stale size information */ + CFS_FAIL_TIMEOUT(OBD_FAIL_OSC_NO_SIZE_DATA, cfs_fail_val); + + /* There must be big cache in current thread to process this request + * if it is NULL then something went wrong and it wasn't allocated, + * report -ENOMEM in that case */ + if (tbc == NULL) + RETURN(-ENOMEM); + + body = tsi->tsi_ost_body; + LASSERT(body != NULL); + + if (body->oa.o_valid & OBD_MD_FLFLAGS && + body->oa.o_flags & OBD_FL_NORPC) + RETURN(0); + + + ioo = req_capsule_client_get(&req->rq_pill, &RMF_OBD_IOOBJ); + LASSERT(ioo != NULL); /* must exists after tgt_ost_body_unpack */ + + objcount = req_capsule_get_size(&req->rq_pill, &RMF_OBD_IOOBJ, + RCL_CLIENT) / sizeof(*ioo); + + for (niocount = i = 0; i < objcount; i++) + niocount += ioo[i].ioo_bufcnt; + + remote_nb = req_capsule_client_get(&req->rq_pill, &RMF_NIOBUF_REMOTE); + LASSERT(remote_nb != NULL); /* must exists after tgt_ost_body_unpack */ + if (niocount != req_capsule_get_size(&req->rq_pill, + &RMF_NIOBUF_REMOTE, RCL_CLIENT) / + sizeof(*remote_nb)) + RETURN(err_serious(-EPROTO)); + + if ((remote_nb[0].rnb_flags & OBD_BRW_MEMALLOC) && + ptlrpc_connection_is_local(exp->exp_connection)) + mpflags = memalloc_noreclaim_save(); + + req_capsule_set_size(&req->rq_pill, &RMF_RCS, RCL_SERVER, + niocount * sizeof(*rcs)); + rc = req_capsule_server_pack(&req->rq_pill); + if (rc != 0) + GOTO(out, rc = err_serious(rc)); + + CFS_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_PACK, cfs_fail_val); + rcs = req_capsule_server_get(&req->rq_pill, &RMF_RCS); + + local_nb = tbc->local; + + rc = tgt_brw_lock(tsi->tsi_env, exp, &tsi->tsi_resid, ioo, remote_nb, + &lockh, LCK_PW); + if (rc != 0) + GOTO(out, rc); + + /* + * If getting the lock took more time than + * client was willing to wait, drop it. 
b=11330 + */ + if (ktime_get_real_seconds() > req->rq_deadline || + OBD_FAIL_CHECK(OBD_FAIL_OST_DROP_REQ)) { + no_reply = true; + CERROR("%s: Dropping timed-out write from %s because locking object " DOSTID " took %lld seconds (limit was %lld).\n", + tgt_name(tsi->tsi_tgt), libcfs_id2str(req->rq_peer), + POSTID(&ioo->ioo_oid), + ktime_get_real_seconds() - req->rq_arrival_time.tv_sec, + req->rq_deadline - req->rq_arrival_time.tv_sec); + GOTO(out_lock, rc = -ETIMEDOUT); + } + + /* Because we already sync grant info with client when reconnect, + * grant info will be cleared for resent req, then fed_grant and + * total_grant will not be modified in following preprw_write */ + if (lustre_msg_get_flags(req->rq_reqmsg) & (MSG_RESENT | MSG_REPLAY)) { + DEBUG_REQ(D_CACHE, req, "clear resent/replay req grant info"); + body->oa.o_valid &= ~OBD_MD_FLGRANT; + } + + repbody = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (repbody == NULL) + GOTO(out_lock, rc = -ENOMEM); + repbody->oa = body->oa; + + npages = PTLRPC_MAX_BRW_PAGES; + kstart = ktime_get(); + rc = obd_preprw(tsi->tsi_env, OBD_BRW_WRITE, exp, &repbody->oa, + objcount, ioo, remote_nb, &npages, local_nb); + if (rc < 0) + GOTO(out_lock, rc); + if (body->oa.o_valid & OBD_MD_FLFLAGS && + body->oa.o_flags & OBD_FL_SHORT_IO) { + unsigned int short_io_size; + unsigned char *short_io_buf; + + short_io_size = req_capsule_get_size(&req->rq_pill, + &RMF_SHORT_IO, + RCL_CLIENT); + short_io_buf = req_capsule_client_get(&req->rq_pill, + &RMF_SHORT_IO); + CDEBUG(D_INFO, "Client use short io for data transfer," + " size = %d\n", short_io_size); + + /* Copy short io buf to pages */ + rc = tgt_shortio2pages(local_nb, npages, short_io_buf, + short_io_size); + desc = NULL; + } else { + desc = ptlrpc_prep_bulk_exp(req, npages, ioobj_max_brw_get(ioo), + PTLRPC_BULK_GET_SINK, + OST_BULK_PORTAL, + &ptlrpc_bulk_kiov_nopin_ops); + if (desc == NULL) + GOTO(skip_transfer, rc = -ENOMEM); + + /* NB Having prepped, we must commit... */ + for (i = 0; i < npages; i++) + desc->bd_frag_ops->add_kiov_frag(desc, + local_nb[i].lnb_page, + local_nb[i].lnb_page_offset & ~PAGE_MASK, + local_nb[i].lnb_len); + + rc = sptlrpc_svc_prep_bulk(req, desc); + if (rc != 0) + GOTO(skip_transfer, rc); + + rc = target_bulk_io(exp, desc); + } + + no_reply = rc != 0; + +skip_transfer: + if (body->oa.o_valid & OBD_MD_FLCKSUM && rc == 0) { + static int cksum_counter; + + if (body->oa.o_valid & OBD_MD_FLFLAGS) + cksum_type = obd_cksum_type_unpack(body->oa.o_flags); + + repbody->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; + repbody->oa.o_flags &= ~OBD_FL_CKSUM_ALL; + repbody->oa.o_flags |= obd_cksum_type_pack(obd_name, + cksum_type); + + rc = tgt_checksum_niobuf_rw(tsi->tsi_tgt, cksum_type, + local_nb, npages, OST_WRITE, + &repbody->oa.o_cksum, false); + if (rc < 0) + GOTO(out_commitrw, rc); + + cksum_counter++; + + if (unlikely(body->oa.o_cksum != repbody->oa.o_cksum)) { + mmap = (body->oa.o_valid & OBD_MD_FLFLAGS && + body->oa.o_flags & OBD_FL_MMAP); + + tgt_warn_on_cksum(req, desc, local_nb, npages, + body->oa.o_cksum, + repbody->oa.o_cksum, mmap); + cksum_counter = 0; + } else if ((cksum_counter & (-cksum_counter)) == + cksum_counter) { + CDEBUG(D_INFO, "Checksum %u from %s OK: %x\n", + cksum_counter, libcfs_id2str(req->rq_peer), + repbody->oa.o_cksum); + } + } + + OBD_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK2, cfs_fail_val); + +out_commitrw: + /* calculate the expected actual write bytes (nob) for OFD stats. 
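+	 * nob is simply the sum of all remote niobuf lengths requested
+	 * by the client, whether or not every page makes it to disk.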
+ * Technically, if commit fails this would be wrong, but that should be + * very rare + */ + for (i = 0; i < niocount; i++) { + int len = remote_nb[i].rnb_len; + + nob += len; + } + + /* multiple transactions can be assigned during write commit */ + tti->tti_mult_trans = 1; + + /* Must commit after prep above in all cases */ + rc = obd_commitrw(tsi->tsi_env, OBD_BRW_WRITE, exp, &repbody->oa, + objcount, ioo, remote_nb, npages, local_nb, rc, nob, + kstart); + if (rc == -ENOTCONN) + /* quota acquire process has been given up because + * either the client has been evicted or the client + * has timed out the request already + */ + no_reply = true; + + for (i = 0; i < niocount; i++) { + if (!(local_nb[i].lnb_flags & OBD_BRW_ASYNC)) { + wait_sync = true; + break; + } + } + /* + * Disable sending mtime back to the client. If the client locked the + * whole object, then it has already updated the mtime on its side, + * otherwise it will have to glimpse anyway (see bug 21489, comment 32) + */ + repbody->oa.o_valid &= ~(OBD_MD_FLMTIME | OBD_MD_FLATIME); + + if (rc == 0) { + /* set per-requested niobuf return codes */ + for (i = j = 0; i < niocount; i++) { + int len = remote_nb[i].rnb_len; + + rcs[i] = 0; + do { + LASSERT(j < npages); + if (local_nb[j].lnb_rc < 0) + rcs[i] = local_nb[j].lnb_rc; + len -= local_nb[j].lnb_len; + j++; + } while (len > 0); + LASSERT(len == 0); + } + LASSERT(j == npages); + ptlrpc_lprocfs_brw(req, nob); + } +out_lock: + tgt_brw_unlock(exp, ioo, remote_nb, &lockh, LCK_PW); + if (desc) + ptlrpc_free_bulk(desc); +out: + if (unlikely(no_reply || (exp->exp_obd->obd_no_transno && wait_sync))) { + req->rq_no_reply = 1; + /* reply out callback would free */ + ptlrpc_req_drop_rs(req); + if (!exp->exp_obd->obd_no_transno) + LCONSOLE_WARN("%s: Bulk IO write error with %s (at %s)," + " client will retry: rc = %d\n", + obd_name, + obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp), rc); + } + + if (mpflags) + memalloc_noreclaim_restore(mpflags); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_brw_write); + +/** + * Common request handler for OST_SEEK RPC. + * + * Unified request handling for OST_SEEK RPC. + * It takes object by its FID, does needed lseek and packs result + * into reply. Only SEEK_HOLE and SEEK_DATA are supported. + * + * \param[in] tsi target session environment for this request + * + * \retval 0 if successful + * \retval negative value on error + */ +int tgt_lseek(struct tgt_session_info *tsi) +{ + struct lustre_handle lh = { 0 }; + struct dt_object *dob; + struct ost_body *repbody; + loff_t offset = tsi->tsi_ost_body->oa.o_size; + int whence = tsi->tsi_ost_body->oa.o_mode; + bool srvlock; + int rc = 0; + + ENTRY; + + if (whence != SEEK_HOLE && whence != SEEK_DATA) + RETURN(-EPROTO); + + /* Negative offset is prohibited on wire and must be handled on client + * prior sending RPC. 
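+ * Only SEEK_HOLE and SEEK_DATA reach this point (checked above); the
+ * resulting offset is returned to the client in oa.o_size.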
+ */ + if (offset < 0) + RETURN(-EPROTO); + + repbody = req_capsule_server_get(tsi->tsi_pill, &RMF_OST_BODY); + if (repbody == NULL) + RETURN(-ENOMEM); + repbody->oa = tsi->tsi_ost_body->oa; + + srvlock = tsi->tsi_ost_body->oa.o_valid & OBD_MD_FLFLAGS && + tsi->tsi_ost_body->oa.o_flags & OBD_FL_SRVLOCK; + if (srvlock) { + rc = tgt_data_lock(tsi->tsi_env, tsi->tsi_exp, &tsi->tsi_resid, + offset, OBD_OBJECT_EOF, &lh, LCK_PR); + if (rc) + RETURN(rc); + } + + dob = dt_locate(tsi->tsi_env, tsi->tsi_tgt->lut_bottom, &tsi->tsi_fid); + if (IS_ERR(dob)) + GOTO(out, rc = PTR_ERR(dob)); + + if (!dt_object_exists(dob)) + GOTO(obj_put, rc = -ENOENT); + + repbody->oa.o_size = dt_lseek(tsi->tsi_env, dob, offset, whence); + rc = 0; +obj_put: + dt_object_put(tsi->tsi_env, dob); +out: + if (srvlock) + tgt_data_unlock(&lh, LCK_PR); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_lseek); + +/* Check if request can be reconstructed from saved reply data + * A copy of the reply data is returned in @trd if the pointer is not NULL + */ +int req_can_reconstruct(struct ptlrpc_request *req, + struct tg_reply_data *trd) +{ + struct tg_export_data *ted = &req->rq_export->exp_target_data; + struct lsd_client_data *lcd = ted->ted_lcd; + int found; + + if (tgt_is_multimodrpcs_client(req->rq_export)) + return tgt_lookup_reply(req, trd); + + mutex_lock(&ted->ted_lcd_lock); + found = req->rq_xid == lcd->lcd_last_xid || + req->rq_xid == lcd->lcd_last_close_xid; + + if (found && trd != NULL) { + if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_CLOSE) { + trd->trd_reply.lrd_xid = lcd->lcd_last_close_xid; + trd->trd_reply.lrd_transno = + lcd->lcd_last_close_transno; + trd->trd_reply.lrd_result = lcd->lcd_last_close_result; + } else { + trd->trd_reply.lrd_xid = lcd->lcd_last_xid; + trd->trd_reply.lrd_transno = lcd->lcd_last_transno; + trd->trd_reply.lrd_result = lcd->lcd_last_result; + trd->trd_reply.lrd_data = lcd->lcd_last_data; + trd->trd_pre_versions[0] = lcd->lcd_pre_versions[0]; + trd->trd_pre_versions[1] = lcd->lcd_pre_versions[1]; + trd->trd_pre_versions[2] = lcd->lcd_pre_versions[2]; + trd->trd_pre_versions[3] = lcd->lcd_pre_versions[3]; + } + } + mutex_unlock(&ted->ted_lcd_lock); + + return found; +} +EXPORT_SYMBOL(req_can_reconstruct); + diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_internal.h b/drivers/staging/lustrefsx/lustre/target/tgt_internal.h new file mode 100644 index 0000000000000..39fb4101e6f2c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/tgt_internal.h @@ -0,0 +1,302 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2017, Intel Corporation. 
+ */ +/* + * lustre/target/tgt_internal.h + * + * Lustre Unified Target header file + * + * Author: Mikhail Pershin + */ + +#ifndef _TG_INTERNAL_H +#define _TG_INTERNAL_H + +#include +#include +#include +#include +#include +#include +#include + +extern int (*tgt_lfsck_in_notify_local)(const struct lu_env *env, + struct dt_device *key, + struct lfsck_req_local *lrl, + struct thandle *th); +/** + * Common data shared by tg-level handlers. This is allocated per-thread to + * reduce stack consumption. + */ +struct tgt_thread_info { + /* server and client data buffers */ + struct lr_server_data tti_lsd; + struct lsd_client_data tti_lcd; + struct lsd_reply_data tti_lrd; + struct lu_buf tti_buf; + loff_t tti_off; + + struct lu_attr tti_attr; + struct lu_fid tti_fid1; + + /* transno storage during last_rcvd update */ + __u64 tti_transno; + __u32 tti_has_trans:1, + tti_mult_trans:1; + + /* Updates data for OUT target */ + struct thandle_exec_args tti_tea; + union { + struct { + /* for tgt_readpage() */ + struct lu_rdpg tti_rdpg; + } rdpg; + struct { + struct dt_object_format tti_update_dof; + struct object_update_reply *tti_update_reply; + struct object_update *tti_update; + int tti_update_reply_index; + struct obdo tti_obdo; + struct dt_object *tti_dt_object; + } update; + struct obd_statfs osfs; /* for obd_statfs() in OFD/MDT */ + } tti_u; + struct lfsck_req_local tti_lrl; + struct dt_insert_rec tti_rec; +}; + +extern struct lu_context_key tgt_thread_key; + +static inline struct tgt_thread_info *tgt_th_info(const struct lu_env *env) +{ + struct tgt_thread_info *tti; + + tti = lu_context_key_get(&env->le_ctx, &tgt_thread_key); + LASSERT(tti); + return tti; +} + +#define MGS_SERVICE_WATCHDOG_FACTOR (2) + +int tgt_request_handle(struct ptlrpc_request *req); + +/* check if request's xid is equal to last one or not*/ +static inline int req_xid_is_last(struct ptlrpc_request *req) +{ + struct lsd_client_data *lcd = req->rq_export->exp_target_data.ted_lcd; + + LASSERT(lcd != NULL); + return (req->rq_xid == lcd->lcd_last_xid || + req->rq_xid == lcd->lcd_last_close_xid); +} + +static inline char *dt_obd_name(struct dt_device *dt) +{ + return dt->dd_lu_dev.ld_obd->obd_name; +} + +/* out_lib.c */ +int out_tx_create_exec(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg); +struct tx_arg *tx_add_exec(struct thandle_exec_args *ta, + tx_exec_func_t func, tx_exec_func_t undo, + const char *file, int line); + +int out_create_add_exec(const struct lu_env *env, struct dt_object *obj, + struct lu_attr *attr, struct lu_fid *parent_fid, + struct dt_object_format *dof, + struct thandle_exec_args *ta, struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line); + +int out_attr_set_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + const struct lu_attr *attr, + struct thandle_exec_args *ta, struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line); + +int out_write_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + const struct lu_buf *buf, loff_t pos, + struct thandle_exec_args *ta, struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line); + +int out_xattr_set_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + const struct lu_buf *buf, const char *name, + int flags, struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line); + +int out_xattr_del_add_exec(const struct lu_env *env, 
struct dt_object *dt_obj, + const char *name, struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line); + +int out_ref_add_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + struct thandle_exec_args *ta, struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line); + +int out_ref_del_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + struct thandle_exec_args *ta, struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line); + +int out_index_insert_add_exec(const struct lu_env *env, + struct dt_object *dt_obj, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line); + +int out_index_delete_add_exec(const struct lu_env *env, + struct dt_object *dt_obj, + const struct dt_key *key, + struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line); + +int out_destroy_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + struct thandle_exec_args *ta, struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line); + +/* Update handlers */ +int out_handle(struct tgt_session_info *tsi); + +#define out_tx_create(env, obj, attr, fid, dof, ta, th, reply, idx) \ + out_create_add_exec(env, obj, attr, fid, dof, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_attr_set(env, obj, attr, ta, th, reply, idx) \ + out_attr_set_add_exec(env, obj, attr, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_xattr_set(env, obj, buf, name, fl, ta, th, reply, idx) \ + out_xattr_set_add_exec(env, obj, buf, name, fl, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_xattr_del(env, obj, name, ta, th, reply, idx) \ + out_xattr_del_add_exec(env, obj, name, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_ref_add(env, obj, ta, th, reply, idx) \ + out_ref_add_add_exec(env, obj, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_ref_del(env, obj, ta, th, reply, idx) \ + out_ref_del_add_exec(env, obj, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_index_insert(env, obj, rec, key, ta, th, reply, idx) \ + out_index_insert_add_exec(env, obj, rec, key, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_index_delete(env, obj, key, ta, th, reply, idx) \ + out_index_delete_add_exec(env, obj, key, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_destroy(env, obj, ta, th, reply, idx) \ + out_destroy_add_exec(env, obj, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_write(env, obj, buf, pos, ta, th, reply, idx) \ + out_write_add_exec(env, obj, buf, pos, ta, th, reply, idx,\ + __FILE__, __LINE__) + +const char *update_op_str(__u16 opcode); + +extern struct page *tgt_page_to_corrupt; + +int tgt_server_data_init(const struct lu_env *env, struct lu_target *tgt); +int tgt_txn_start_cb(const struct lu_env *env, struct thandle *th, + void *cookie); +int tgt_txn_stop_cb(const struct lu_env *env, struct thandle *th, + void *cookie); +int tgt_handle_received_xid(struct obd_export *exp, __u64 rcvd_xid); +int tgt_handle_tag(struct ptlrpc_request *req); + +void update_records_dump(const struct update_records *records, + unsigned int mask, bool dump_updates); +int check_and_prepare_update_record(const struct lu_env *env, + struct 
thandle_update_records *tur); +struct update_thread_info { + struct lu_attr uti_attr; + struct lu_fid uti_fid; + struct lu_buf uti_buf; + struct thandle_update_records uti_tur; + struct obdo uti_obdo; + struct thandle_exec_args uti_tea; + struct dt_insert_rec uti_rec; + struct distribute_txn_replay_req *uti_dtrq; +}; + +extern struct lu_context_key update_thread_key; + +static inline struct update_thread_info * +update_env_info(const struct lu_env *env) +{ + struct update_thread_info *uti; + + uti = lu_context_key_get(&env->le_ctx, &update_thread_key); + LASSERT(uti != NULL); + return uti; +} + +void update_info_init(void); +void update_info_fini(void); +struct sub_thandle *create_sub_thandle(struct top_multiple_thandle *tmt, + struct dt_device *dt_dev); +int sub_thandle_trans_create(const struct lu_env *env, + struct top_thandle *top_th, + struct sub_thandle *st); +void distribute_txn_insert_by_batchid(struct top_multiple_thandle *new); +int top_trans_create_tmt(const struct lu_env *env, + struct top_thandle *top_th); + +void tgt_cancel_slc_locks(struct lu_target *tgt, __u64 transno); +void barrier_init(void); +void barrier_fini(void); + +/* FMD tracking data */ +struct tgt_fmd_data { + struct list_head fmd_list; /* linked to tgt_fmd_list */ + struct lu_fid fmd_fid; /* FID being written to */ + __u64 fmd_mactime_xid; /* xid highest {m,a,c}time setattr */ + time64_t fmd_expire; /* time when the fmd should expire */ + int fmd_refcount; /* reference counter - list holds 1 */ +}; + +/* tgt_fmd.c */ +extern struct kmem_cache *tgt_fmd_kmem; +void tgt_fmd_expire(struct obd_export *exp); +void tgt_fmd_cleanup(struct obd_export *exp); + +#endif /* _TG_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_lastrcvd.c b/drivers/staging/lustrefsx/lustre/target/tgt_lastrcvd.c new file mode 100644 index 0000000000000..4341d75b0bf38 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/tgt_lastrcvd.c @@ -0,0 +1,2282 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Lustre Unified Target + * These are common function to work with last_received file + * + * Author: Mikhail Pershin + */ +#include +#include +#include + +#include "tgt_internal.h" + +/** version recovery epoch */ +#define LR_EPOCH_BITS 32 + +/* Allocate a bitmap for a chunk of reply data slots */ +static int tgt_bitmap_chunk_alloc(struct lu_target *lut, int chunk) +{ + unsigned long *bm; + + OBD_ALLOC_LARGE(bm, BITS_TO_LONGS(LUT_REPLY_SLOTS_PER_CHUNK) * + sizeof(long)); + if (bm == NULL) + return -ENOMEM; + + spin_lock(&lut->lut_client_bitmap_lock); + + if (lut->lut_reply_bitmap[chunk] != NULL) { + /* someone else already allocated the bitmap for this chunk */ + spin_unlock(&lut->lut_client_bitmap_lock); + OBD_FREE_LARGE(bm, BITS_TO_LONGS(LUT_REPLY_SLOTS_PER_CHUNK) * + sizeof(long)); + return 0; + } + + lut->lut_reply_bitmap[chunk] = bm; + + spin_unlock(&lut->lut_client_bitmap_lock); + + return 0; +} + +/* Look for an available reply data slot in the bitmap + * of the target @lut + * Allocate bitmap chunk when first used + * XXX algo could be improved if this routine limits performance + */ +static int tgt_find_free_reply_slot(struct lu_target *lut) +{ + unsigned long *bmp; + int chunk = 0; + int rc; + int b; + + for (chunk = 0; chunk < LUT_REPLY_SLOTS_MAX_CHUNKS; chunk++) { + /* allocate the bitmap chunk if necessary */ + if (unlikely(lut->lut_reply_bitmap[chunk] == NULL)) { + rc = tgt_bitmap_chunk_alloc(lut, chunk); + if (rc != 0) + return rc; + } + bmp = lut->lut_reply_bitmap[chunk]; + + /* look for an available slot in this chunk */ + do { + b = find_first_zero_bit(bmp, LUT_REPLY_SLOTS_PER_CHUNK); + if (b >= LUT_REPLY_SLOTS_PER_CHUNK) + break; + + /* found one */ + if (test_and_set_bit(b, bmp) == 0) + return chunk * LUT_REPLY_SLOTS_PER_CHUNK + b; + } while (true); + } + + return -ENOSPC; +} + +/* Mark the reply data slot @idx 'used' in the corresponding bitmap chunk + * of the target @lut + * Allocate the bitmap chunk if necessary + */ +static int tgt_set_reply_slot(struct lu_target *lut, int idx) +{ + int chunk; + int b; + int rc; + + chunk = idx / LUT_REPLY_SLOTS_PER_CHUNK; + b = idx % LUT_REPLY_SLOTS_PER_CHUNK; + + LASSERT(chunk < LUT_REPLY_SLOTS_MAX_CHUNKS); + LASSERT(b < LUT_REPLY_SLOTS_PER_CHUNK); + + /* allocate the bitmap chunk if necessary */ + if (unlikely(lut->lut_reply_bitmap[chunk] == NULL)) { + rc = tgt_bitmap_chunk_alloc(lut, chunk); + if (rc != 0) + return rc; + } + + /* mark the slot 'used' in this chunk */ + if (test_and_set_bit(b, lut->lut_reply_bitmap[chunk]) != 0) { + CERROR("%s: slot %d already set in bitmap\n", + tgt_name(lut), idx); + return -EALREADY; + } + + return 0; +} + + +/* Mark the reply data slot @idx 'unused' in the corresponding bitmap chunk + * of the target @lut + */ +static int tgt_clear_reply_slot(struct lu_target *lut, int idx) +{ + int chunk; + int b; + + if (lut->lut_obd->obd_stopping) + /* + * in case of failover keep the bit set in order to + * avoid overwriting slots in reply_data which might + * be required by resent rpcs + */ + return 0; + chunk = idx / LUT_REPLY_SLOTS_PER_CHUNK; + b = idx % LUT_REPLY_SLOTS_PER_CHUNK; + + LASSERT(chunk < LUT_REPLY_SLOTS_MAX_CHUNKS); + LASSERT(b < LUT_REPLY_SLOTS_PER_CHUNK); + + if (lut->lut_reply_bitmap[chunk] == NULL) { + CERROR("%s: slot %d not allocated\n", + tgt_name(lut), idx); + return -ENOENT; + } + + if (test_and_clear_bit(b, lut->lut_reply_bitmap[chunk]) == 0) { + CERROR("%s: slot %d already clear in bitmap\n", + 
tgt_name(lut), idx); + return -EALREADY; + } + + return 0; +} + + +/* Read header of reply_data file of target @tgt into structure @lrh */ +static int tgt_reply_header_read(const struct lu_env *env, + struct lu_target *tgt, + struct lsd_reply_header *lrh) +{ + int rc; + struct lsd_reply_header buf; + struct tgt_thread_info *tti = tgt_th_info(env); + + tti->tti_off = 0; + tti->tti_buf.lb_buf = &buf; + tti->tti_buf.lb_len = sizeof(buf); + + rc = dt_record_read(env, tgt->lut_reply_data, &tti->tti_buf, + &tti->tti_off); + if (rc != 0) + return rc; + + lrh->lrh_magic = le32_to_cpu(buf.lrh_magic); + lrh->lrh_header_size = le32_to_cpu(buf.lrh_header_size); + lrh->lrh_reply_size = le32_to_cpu(buf.lrh_reply_size); + + CDEBUG(D_HA, "%s: read %s header. magic=0x%08x " + "header_size=%d reply_size=%d\n", + tgt->lut_obd->obd_name, REPLY_DATA, + lrh->lrh_magic, lrh->lrh_header_size, lrh->lrh_reply_size); + + return 0; +} + +/* Write header into replay_data file of target @tgt from structure @lrh */ +static int tgt_reply_header_write(const struct lu_env *env, + struct lu_target *tgt, + struct lsd_reply_header *lrh) +{ + int rc; + struct lsd_reply_header buf; + struct tgt_thread_info *tti = tgt_th_info(env); + struct thandle *th; + struct dt_object *dto; + + CDEBUG(D_HA, "%s: write %s header. magic=0x%08x " + "header_size=%d reply_size=%d\n", + tgt->lut_obd->obd_name, REPLY_DATA, + lrh->lrh_magic, lrh->lrh_header_size, lrh->lrh_reply_size); + + if (tgt->lut_bottom->dd_rdonly) + RETURN(0); + + buf.lrh_magic = cpu_to_le32(lrh->lrh_magic); + buf.lrh_header_size = cpu_to_le32(lrh->lrh_header_size); + buf.lrh_reply_size = cpu_to_le32(lrh->lrh_reply_size); + + th = dt_trans_create(env, tgt->lut_bottom); + if (IS_ERR(th)) + return PTR_ERR(th); + th->th_sync = 1; + + tti->tti_off = 0; + tti->tti_buf.lb_buf = &buf; + tti->tti_buf.lb_len = sizeof(buf); + + rc = dt_declare_record_write(env, tgt->lut_reply_data, + &tti->tti_buf, tti->tti_off, th); + if (rc) + GOTO(out, rc); + + rc = dt_trans_start(env, tgt->lut_bottom, th); + if (rc) + GOTO(out, rc); + + dto = dt_object_locate(tgt->lut_reply_data, th->th_dev); + rc = dt_record_write(env, dto, &tti->tti_buf, &tti->tti_off, th); +out: + dt_trans_stop(env, tgt->lut_bottom, th); + return rc; +} + +/* Write the reply data @lrd into reply_data file of target @tgt + * at offset @off + */ +static int tgt_reply_data_write(const struct lu_env *env, struct lu_target *tgt, + struct lsd_reply_data *lrd, loff_t off, + struct thandle *th) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + struct dt_object *dto; + struct lsd_reply_data *buf = &tti->tti_lrd; + + lrd->lrd_result = ptlrpc_status_hton(lrd->lrd_result); + + buf->lrd_transno = cpu_to_le64(lrd->lrd_transno); + buf->lrd_xid = cpu_to_le64(lrd->lrd_xid); + buf->lrd_data = cpu_to_le64(lrd->lrd_data); + buf->lrd_result = cpu_to_le32(lrd->lrd_result); + buf->lrd_client_gen = cpu_to_le32(lrd->lrd_client_gen); + + lrd->lrd_result = ptlrpc_status_ntoh(lrd->lrd_result); + + tti->tti_off = off; + tti->tti_buf.lb_buf = buf; + tti->tti_buf.lb_len = sizeof(*buf); + + dto = dt_object_locate(tgt->lut_reply_data, th->th_dev); + return dt_record_write(env, dto, &tti->tti_buf, &tti->tti_off, th); +} + +/* Read the reply data from reply_data file of target @tgt at offset @off + * into structure @lrd + */ +static int tgt_reply_data_read(const struct lu_env *env, struct lu_target *tgt, + struct lsd_reply_data *lrd, loff_t off) +{ + int rc; + struct tgt_thread_info *tti = tgt_th_info(env); + struct lsd_reply_data *buf = &tti->tti_lrd; + + 
tti->tti_off = off; + tti->tti_buf.lb_buf = buf; + tti->tti_buf.lb_len = sizeof(*buf); + + rc = dt_record_read(env, tgt->lut_reply_data, &tti->tti_buf, + &tti->tti_off); + if (rc != 0) + return rc; + + lrd->lrd_transno = le64_to_cpu(buf->lrd_transno); + lrd->lrd_xid = le64_to_cpu(buf->lrd_xid); + lrd->lrd_data = le64_to_cpu(buf->lrd_data); + lrd->lrd_result = le32_to_cpu(buf->lrd_result); + lrd->lrd_client_gen = le32_to_cpu(buf->lrd_client_gen); + + return 0; +} + + +/* Free the in-memory reply data structure @trd and release + * the corresponding slot in the reply_data file of target @lut + * Called with ted_lcd_lock held + */ +static void tgt_free_reply_data(struct lu_target *lut, + struct tg_export_data *ted, + struct tg_reply_data *trd) +{ + CDEBUG(D_TRACE, "%s: free reply data %p: xid %llu, transno %llu, " + "client gen %u, slot idx %d\n", + lut == NULL ? "" : tgt_name(lut), trd, trd->trd_reply.lrd_xid, + trd->trd_reply.lrd_transno, trd->trd_reply.lrd_client_gen, + trd->trd_index); + + LASSERT(mutex_is_locked(&ted->ted_lcd_lock)); + + list_del(&trd->trd_list); + ted->ted_reply_cnt--; + if (lut != NULL && trd->trd_index != TRD_INDEX_MEMORY) + tgt_clear_reply_slot(lut, trd->trd_index); + OBD_FREE_PTR(trd); +} + +/* Release the reply data @trd from target @lut + * The reply data with the highest transno for this export + * is retained to ensure correctness of target recovery + * Called with ted_lcd_lock held + */ +static void tgt_release_reply_data(struct lu_target *lut, + struct tg_export_data *ted, + struct tg_reply_data *trd) +{ + CDEBUG(D_TRACE, "%s: release reply data %p: xid %llu, transno %llu, " + "client gen %u, slot idx %d\n", + lut == NULL ? "" : tgt_name(lut), trd, trd->trd_reply.lrd_xid, + trd->trd_reply.lrd_transno, trd->trd_reply.lrd_client_gen, + trd->trd_index); + + LASSERT(mutex_is_locked(&ted->ted_lcd_lock)); + + /* Do not free the reply data corresponding to the + * highest transno of this export. + * This ensures on-disk reply data is kept and + * last committed transno can be restored from disk in case + * of target recovery + */ + if (trd->trd_reply.lrd_transno == ted->ted_lcd->lcd_last_transno) { + /* free previous retained reply */ + if (ted->ted_reply_last != NULL) + tgt_free_reply_data(lut, ted, ted->ted_reply_last); + /* retain the reply */ + list_del_init(&trd->trd_list); + ted->ted_reply_last = trd; + } else { + tgt_free_reply_data(lut, ted, trd); + } +} + +static inline struct lu_buf *tti_buf_lsd(struct tgt_thread_info *tti) +{ + tti->tti_buf.lb_buf = &tti->tti_lsd; + tti->tti_buf.lb_len = sizeof(tti->tti_lsd); + return &tti->tti_buf; +} + +static inline struct lu_buf *tti_buf_lcd(struct tgt_thread_info *tti) +{ + tti->tti_buf.lb_buf = &tti->tti_lcd; + tti->tti_buf.lb_len = sizeof(tti->tti_lcd); + return &tti->tti_buf; +} + +static inline bool tgt_is_multimodrpcs_record(struct lu_target *tgt, + struct lsd_client_data *lcd) +{ + return tgt->lut_lsd.lsd_feature_incompat & OBD_INCOMPAT_MULTI_RPCS && + lcd->lcd_generation != 0; +} + +/** + * Allocate in-memory data for client slot related to export. 
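+ * The slot index ted_lr_idx is set to -1 below to mark it as not yet
+ * assigned; tgt_client_new() or tgt_client_add() assigns the real index.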
+ */ +int tgt_client_alloc(struct obd_export *exp) +{ + ENTRY; + LASSERT(exp != exp->exp_obd->obd_self_export); + + spin_lock_init(&exp->exp_target_data.ted_nodemap_lock); + INIT_LIST_HEAD(&exp->exp_target_data.ted_nodemap_member); + spin_lock_init(&exp->exp_target_data.ted_fmd_lock); + INIT_LIST_HEAD(&exp->exp_target_data.ted_fmd_list); + + OBD_ALLOC_PTR(exp->exp_target_data.ted_lcd); + if (exp->exp_target_data.ted_lcd == NULL) + RETURN(-ENOMEM); + /* Mark that slot is not yet valid, 0 doesn't work here */ + exp->exp_target_data.ted_lr_idx = -1; + INIT_LIST_HEAD(&exp->exp_target_data.ted_reply_list); + mutex_init(&exp->exp_target_data.ted_lcd_lock); + RETURN(0); +} +EXPORT_SYMBOL(tgt_client_alloc); + +/** + * Free in-memory data for client slot related to export. + */ +void tgt_client_free(struct obd_export *exp) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct lu_target *lut = class_exp2tgt(exp); + struct tg_reply_data *trd, *tmp; + + LASSERT(exp != exp->exp_obd->obd_self_export); + + tgt_fmd_cleanup(exp); + + /* free reply data */ + mutex_lock(&ted->ted_lcd_lock); + list_for_each_entry_safe(trd, tmp, &ted->ted_reply_list, trd_list) { + tgt_release_reply_data(lut, ted, trd); + } + if (ted->ted_reply_last != NULL) { + tgt_free_reply_data(lut, ted, ted->ted_reply_last); + ted->ted_reply_last = NULL; + } + mutex_unlock(&ted->ted_lcd_lock); + + if (!hlist_unhashed(&exp->exp_gen_hash)) + cfs_hash_del(exp->exp_obd->obd_gen_hash, + &ted->ted_lcd->lcd_generation, + &exp->exp_gen_hash); + + OBD_FREE_PTR(ted->ted_lcd); + ted->ted_lcd = NULL; + + /* Target may have been freed (see LU-7430) + * Slot may be not yet assigned */ + if (exp->exp_obd->u.obt.obt_magic != OBT_MAGIC || + ted->ted_lr_idx < 0) + return; + + /* Clear bit when lcd is freed */ + LASSERT(lut && lut->lut_client_bitmap); + if (!test_and_clear_bit(ted->ted_lr_idx, lut->lut_client_bitmap)) { + CERROR("%s: client %u bit already clear in bitmap\n", + exp->exp_obd->obd_name, ted->ted_lr_idx); + LBUG(); + } +} +EXPORT_SYMBOL(tgt_client_free); + +static inline void tgt_check_lcd(const char *obd_name, int index, + struct lsd_client_data *lcd) +{ + size_t uuid_size = sizeof(lcd->lcd_uuid); + + if (strnlen((char*)lcd->lcd_uuid, uuid_size) == uuid_size) { + lcd->lcd_uuid[uuid_size - 1] = '\0'; + + LCONSOLE_ERROR("the client UUID (%s) on %s for exports stored in last_rcvd(index = %d) is bad!\n", + lcd->lcd_uuid, obd_name, index); + } +} + +static int tgt_client_data_read(const struct lu_env *env, struct lu_target *tgt, + struct lsd_client_data *lcd, + loff_t *off, int index) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + int rc; + + tti_buf_lcd(tti); + rc = dt_record_read(env, tgt->lut_last_rcvd, &tti->tti_buf, off); + if (rc == 0) { + tgt_check_lcd(tgt->lut_obd->obd_name, index, &tti->tti_lcd); + lcd_le_to_cpu(&tti->tti_lcd, lcd); + lcd->lcd_last_result = ptlrpc_status_ntoh(lcd->lcd_last_result); + lcd->lcd_last_close_result = + ptlrpc_status_ntoh(lcd->lcd_last_close_result); + } + + CDEBUG(D_INFO, "%s: read lcd @%lld uuid = %s, last_transno = %llu" + ", last_xid = %llu, last_result = %u, last_data = %u, " + "last_close_transno = %llu, last_close_xid = %llu, " + "last_close_result = %u, rc = %d\n", tgt->lut_obd->obd_name, + *off, lcd->lcd_uuid, lcd->lcd_last_transno, lcd->lcd_last_xid, + lcd->lcd_last_result, lcd->lcd_last_data, + lcd->lcd_last_close_transno, lcd->lcd_last_close_xid, + lcd->lcd_last_close_result, rc); + return rc; +} + +static int tgt_client_data_write(const struct lu_env *env, + struct lu_target *tgt, 
+ struct lsd_client_data *lcd, + loff_t *off, struct thandle *th) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + struct dt_object *dto; + + lcd->lcd_last_result = ptlrpc_status_hton(lcd->lcd_last_result); + lcd->lcd_last_close_result = + ptlrpc_status_hton(lcd->lcd_last_close_result); + lcd_cpu_to_le(lcd, &tti->tti_lcd); + tti_buf_lcd(tti); + + dto = dt_object_locate(tgt->lut_last_rcvd, th->th_dev); + return dt_record_write(env, dto, &tti->tti_buf, off, th); +} + +struct tgt_new_client_callback { + struct dt_txn_commit_cb lncc_cb; + struct obd_export *lncc_exp; +}; + +static void tgt_cb_new_client(struct lu_env *env, struct thandle *th, + struct dt_txn_commit_cb *cb, int err) +{ + struct tgt_new_client_callback *ccb; + + ccb = container_of(cb, struct tgt_new_client_callback, lncc_cb); + + LASSERT(ccb->lncc_exp->exp_obd); + + CDEBUG(D_RPCTRACE, "%s: committing for initial connect of %s\n", + ccb->lncc_exp->exp_obd->obd_name, + ccb->lncc_exp->exp_client_uuid.uuid); + + spin_lock(&ccb->lncc_exp->exp_lock); + + ccb->lncc_exp->exp_need_sync = 0; + + spin_unlock(&ccb->lncc_exp->exp_lock); + class_export_cb_put(ccb->lncc_exp); + + OBD_FREE_PTR(ccb); +} + +int tgt_new_client_cb_add(struct thandle *th, struct obd_export *exp) +{ + struct tgt_new_client_callback *ccb; + struct dt_txn_commit_cb *dcb; + int rc; + + OBD_ALLOC_PTR(ccb); + if (ccb == NULL) + return -ENOMEM; + + ccb->lncc_exp = class_export_cb_get(exp); + + dcb = &ccb->lncc_cb; + dcb->dcb_func = tgt_cb_new_client; + INIT_LIST_HEAD(&dcb->dcb_linkage); + strlcpy(dcb->dcb_name, "tgt_cb_new_client", sizeof(dcb->dcb_name)); + + rc = dt_trans_cb_add(th, dcb); + if (rc) { + class_export_cb_put(exp); + OBD_FREE_PTR(ccb); + } + return rc; +} + +/** + * Update client data in last_rcvd + */ +static int tgt_client_data_update(const struct lu_env *env, + struct obd_export *exp) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct lu_target *tgt = class_exp2tgt(exp); + struct tgt_thread_info *tti = tgt_th_info(env); + struct thandle *th; + int rc = 0; + + ENTRY; + + if (unlikely(tgt == NULL)) { + CDEBUG(D_ERROR, "%s: No target for connected export\n", + class_exp2obd(exp)->obd_name); + RETURN(-EINVAL); + } + + if (tgt->lut_bottom->dd_rdonly) + RETURN(0); + + th = dt_trans_create(env, tgt->lut_bottom); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + tti_buf_lcd(tti); + rc = dt_declare_record_write(env, tgt->lut_last_rcvd, + &tti->tti_buf, + ted->ted_lr_off, th); + if (rc) + GOTO(out, rc); + + rc = dt_trans_start_local(env, tgt->lut_bottom, th); + if (rc) + GOTO(out, rc); + + mutex_lock(&ted->ted_lcd_lock); + + /* + * Until this operations will be committed the sync is needed + * for this export. This should be done _after_ starting the + * transaction so that many connecting clients will not bring + * server down with lots of sync writes. 
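+ * If the commit callback cannot be added, the code below falls back to
+ * a synchronous transaction (th_sync) instead.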
+ */ + rc = tgt_new_client_cb_add(th, exp); + if (rc) { + /* can't add callback, do sync now */ + th->th_sync = 1; + } else { + spin_lock(&exp->exp_lock); + exp->exp_need_sync = 1; + spin_unlock(&exp->exp_lock); + } + + tti->tti_off = ted->ted_lr_off; + rc = tgt_client_data_write(env, tgt, ted->ted_lcd, &tti->tti_off, th); + + mutex_unlock(&ted->ted_lcd_lock); + + EXIT; +out: + dt_trans_stop(env, tgt->lut_bottom, th); + CDEBUG(D_INFO, "%s: update last_rcvd client data for UUID = %s, " + "last_transno = %llu: rc = %d\n", tgt->lut_obd->obd_name, + tgt->lut_lsd.lsd_uuid, tgt->lut_lsd.lsd_last_transno, rc); + + return rc; +} + +static int tgt_server_data_read(const struct lu_env *env, struct lu_target *tgt) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + int rc; + + tti->tti_off = 0; + tti_buf_lsd(tti); + rc = dt_record_read(env, tgt->lut_last_rcvd, &tti->tti_buf, + &tti->tti_off); + if (rc == 0) + lsd_le_to_cpu(&tti->tti_lsd, &tgt->lut_lsd); + + CDEBUG(D_INFO, "%s: read last_rcvd server data for UUID = %s, " + "last_transno = %llu: rc = %d\n", tgt->lut_obd->obd_name, + tgt->lut_lsd.lsd_uuid, tgt->lut_lsd.lsd_last_transno, rc); + return rc; +} + +static int tgt_server_data_write(const struct lu_env *env, + struct lu_target *tgt, struct thandle *th) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + struct dt_object *dto; + int rc; + + ENTRY; + + tti->tti_off = 0; + tti_buf_lsd(tti); + lsd_cpu_to_le(&tgt->lut_lsd, &tti->tti_lsd); + + dto = dt_object_locate(tgt->lut_last_rcvd, th->th_dev); + rc = dt_record_write(env, dto, &tti->tti_buf, &tti->tti_off, th); + + CDEBUG(D_INFO, "%s: write last_rcvd server data for UUID = %s, " + "last_transno = %llu: rc = %d\n", tgt->lut_obd->obd_name, + tgt->lut_lsd.lsd_uuid, tgt->lut_lsd.lsd_last_transno, rc); + + RETURN(rc); +} + +/** + * Update server data in last_rcvd + */ +int tgt_server_data_update(const struct lu_env *env, struct lu_target *tgt, + int sync) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + struct thandle *th; + int rc = 0; + + ENTRY; + + CDEBUG(D_SUPER, + "%s: mount_count is %llu, last_transno is %llu\n", + tgt->lut_lsd.lsd_uuid, tgt->lut_obd->u.obt.obt_mount_count, + tgt->lut_last_transno); + + /* Always save latest transno to keep it fresh */ + spin_lock(&tgt->lut_translock); + tgt->lut_lsd.lsd_last_transno = tgt->lut_last_transno; + spin_unlock(&tgt->lut_translock); + + if (tgt->lut_bottom->dd_rdonly) + RETURN(0); + + th = dt_trans_create(env, tgt->lut_bottom); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + th->th_sync = sync; + + tti_buf_lsd(tti); + rc = dt_declare_record_write(env, tgt->lut_last_rcvd, + &tti->tti_buf, tti->tti_off, th); + if (rc) + GOTO(out, rc); + + rc = dt_trans_start(env, tgt->lut_bottom, th); + if (rc) + GOTO(out, rc); + + rc = tgt_server_data_write(env, tgt, th); +out: + dt_trans_stop(env, tgt->lut_bottom, th); + + CDEBUG(D_INFO, "%s: update last_rcvd server data for UUID = %s, " + "last_transno = %llu: rc = %d\n", tgt->lut_obd->obd_name, + tgt->lut_lsd.lsd_uuid, tgt->lut_lsd.lsd_last_transno, rc); + RETURN(rc); +} +EXPORT_SYMBOL(tgt_server_data_update); + +static int tgt_truncate_last_rcvd(const struct lu_env *env, + struct lu_target *tgt, loff_t size) +{ + struct dt_object *dt = tgt->lut_last_rcvd; + struct thandle *th; + struct lu_attr attr; + int rc; + + ENTRY; + + if (tgt->lut_bottom->dd_rdonly) + RETURN(0); + + attr.la_size = size; + attr.la_valid = LA_SIZE; + + th = dt_trans_create(env, tgt->lut_bottom); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + rc = dt_declare_punch(env, dt, size, 
OBD_OBJECT_EOF, th); + if (rc) + GOTO(cleanup, rc); + rc = dt_declare_attr_set(env, dt, &attr, th); + if (rc) + GOTO(cleanup, rc); + rc = dt_trans_start_local(env, tgt->lut_bottom, th); + if (rc) + GOTO(cleanup, rc); + + rc = dt_punch(env, dt, size, OBD_OBJECT_EOF, th); + if (rc == 0) + rc = dt_attr_set(env, dt, &attr, th); + +cleanup: + dt_trans_stop(env, tgt->lut_bottom, th); + + RETURN(rc); +} + +static void tgt_client_epoch_update(const struct lu_env *env, + struct obd_export *exp) +{ + struct lsd_client_data *lcd = exp->exp_target_data.ted_lcd; + struct lu_target *tgt = class_exp2tgt(exp); + + LASSERT(tgt && tgt->lut_bottom); + /** VBR: set client last_epoch to current epoch */ + if (lcd->lcd_last_epoch >= tgt->lut_lsd.lsd_start_epoch) + return; + lcd->lcd_last_epoch = tgt->lut_lsd.lsd_start_epoch; + tgt_client_data_update(env, exp); +} + +/** + * Update boot epoch when recovery ends + */ +void tgt_boot_epoch_update(struct lu_target *tgt) +{ + struct lu_env env; + struct ptlrpc_request *req; + __u32 start_epoch; + LIST_HEAD(client_list); + int rc; + + if (tgt->lut_obd->obd_stopping) + return; + + rc = lu_env_init(&env, LCT_DT_THREAD); + if (rc) { + CERROR("%s: can't initialize environment: rc = %d\n", + tgt->lut_obd->obd_name, rc); + return; + } + + spin_lock(&tgt->lut_translock); + start_epoch = (tgt->lut_last_transno >> LR_EPOCH_BITS) + 1; + tgt->lut_last_transno = (__u64)start_epoch << LR_EPOCH_BITS; + tgt->lut_lsd.lsd_start_epoch = start_epoch; + spin_unlock(&tgt->lut_translock); + + /** + * The recovery is not yet finished and final queue can still be updated + * with resend requests. Move final list to separate one for processing + */ + spin_lock(&tgt->lut_obd->obd_recovery_task_lock); + list_splice_init(&tgt->lut_obd->obd_final_req_queue, &client_list); + spin_unlock(&tgt->lut_obd->obd_recovery_task_lock); + + /** + * go through list of exports participated in recovery and + * set new epoch for them + */ + list_for_each_entry(req, &client_list, rq_list) { + LASSERT(!req->rq_export->exp_delayed); + if (!req->rq_export->exp_vbr_failed) + tgt_client_epoch_update(&env, req->rq_export); + } + /** return list back at once */ + spin_lock(&tgt->lut_obd->obd_recovery_task_lock); + list_splice_init(&client_list, &tgt->lut_obd->obd_final_req_queue); + spin_unlock(&tgt->lut_obd->obd_recovery_task_lock); + + /** + * Clear MULTI RPCS incompatibility flag if there is no multi-rpcs + * client in last_rcvd file + */ + if (atomic_read(&tgt->lut_num_clients) == 0) + tgt->lut_lsd.lsd_feature_incompat &= ~OBD_INCOMPAT_MULTI_RPCS; + + /** update server epoch */ + tgt_server_data_update(&env, tgt, 1); + lu_env_fini(&env); +} + +/** + * commit callback, need to update last_committed value + */ +struct tgt_last_committed_callback { + struct dt_txn_commit_cb llcc_cb; + struct lu_target *llcc_tgt; + struct obd_export *llcc_exp; + __u64 llcc_transno; +}; + +static void tgt_cb_last_committed(struct lu_env *env, struct thandle *th, + struct dt_txn_commit_cb *cb, int err) +{ + struct tgt_last_committed_callback *ccb; + + ccb = container_of(cb, struct tgt_last_committed_callback, llcc_cb); + + LASSERT(ccb->llcc_exp); + LASSERT(ccb->llcc_tgt != NULL); + LASSERT(ccb->llcc_exp->exp_obd == ccb->llcc_tgt->lut_obd); + + if (th->th_reserved_quota.qrr_count > 0) { + struct lu_env temp_env; + int rc; + + CDEBUG(D_QUOTA, "free quota %llu %llu\n", + th->th_reserved_quota.qrr_id.qid_gid, + th->th_reserved_quota.qrr_count); + + rc = lu_env_init(&temp_env, LCT_DT_THREAD); + if (rc) { + CERROR("%s: can't initialize 
environment: rc = %d\n", + ccb->llcc_tgt->lut_obd->obd_name, rc); + goto out; + } + + dt_reserve_or_free_quota(&temp_env, th->th_dev, + th->th_reserved_quota.qrr_type, + th->th_reserved_quota.qrr_id.qid_uid, + th->th_reserved_quota.qrr_id.qid_gid, + -th->th_reserved_quota.qrr_count, + false); + lu_env_fini(&temp_env); + } + + /* error hit, don't update last committed to provide chance to + * replay data after fail */ + if (err != 0) + goto out; + + /* Fast path w/o spinlock, if exp_last_committed was updated + * with higher transno, no need to take spinlock and check, + * also no need to update obd_last_committed. */ + if (ccb->llcc_transno <= ccb->llcc_exp->exp_last_committed) + goto out; + spin_lock(&ccb->llcc_tgt->lut_translock); + if (ccb->llcc_transno > ccb->llcc_tgt->lut_obd->obd_last_committed) + ccb->llcc_tgt->lut_obd->obd_last_committed = ccb->llcc_transno; + + if (ccb->llcc_transno > ccb->llcc_exp->exp_last_committed) { + ccb->llcc_exp->exp_last_committed = ccb->llcc_transno; + spin_unlock(&ccb->llcc_tgt->lut_translock); + + ptlrpc_commit_replies(ccb->llcc_exp); + tgt_cancel_slc_locks(ccb->llcc_tgt, ccb->llcc_transno); + } else { + spin_unlock(&ccb->llcc_tgt->lut_translock); + } + + CDEBUG(D_HA, "%s: transno %lld is committed\n", + ccb->llcc_tgt->lut_obd->obd_name, ccb->llcc_transno); + +out: + class_export_cb_put(ccb->llcc_exp); + OBD_FREE_PTR(ccb); +} + +/** + * Add commit callback function, it returns a non-zero value to inform + * caller to use sync transaction if necessary. + */ +static int tgt_last_commit_cb_add(struct thandle *th, struct lu_target *tgt, + struct obd_export *exp, __u64 transno) +{ + struct tgt_last_committed_callback *ccb; + struct dt_txn_commit_cb *dcb; + int rc; + + OBD_ALLOC_PTR(ccb); + if (ccb == NULL) + return -ENOMEM; + + ccb->llcc_tgt = tgt; + ccb->llcc_exp = class_export_cb_get(exp); + ccb->llcc_transno = transno; + + dcb = &ccb->llcc_cb; + dcb->dcb_func = tgt_cb_last_committed; + INIT_LIST_HEAD(&dcb->dcb_linkage); + strlcpy(dcb->dcb_name, "tgt_cb_last_committed", sizeof(dcb->dcb_name)); + + rc = dt_trans_cb_add(th, dcb); + if (rc) { + class_export_cb_put(exp); + OBD_FREE_PTR(ccb); + } + + if (exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) + /* report failure to force synchronous operation */ + return -EPERM; + + /* if exp_need_sync is set, return non-zero value to force + * a sync transaction. */ + return rc ? rc : exp->exp_need_sync; +} + +static int tgt_is_local_client(const struct lu_env *env, + struct obd_export *exp) +{ + struct lu_target *tgt = class_exp2tgt(exp); + struct tgt_session_info *tsi = tgt_ses_info(env); + struct ptlrpc_request *req = tgt_ses_req(tsi); + + if (exp_connect_flags(exp) & OBD_CONNECT_MDS || + exp_connect_flags(exp) & OBD_CONNECT_MDS_MDS) + return 0; + if (tgt->lut_local_recovery) + return 0; + if (!req) + return 0; + if (!LNetIsPeerLocal(req->rq_peer.nid)) + return 0; + + return 1; +} + +/** + * Add new client to the last_rcvd upon new connection. + * + * We use a bitmap to locate a free space in the last_rcvd file and initialize + * tg_export_data. 
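+ * Lightweight connections and local no-recovery clients are skipped and
+ * never get a slot.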
+ */ +int tgt_client_new(const struct lu_env *env, struct obd_export *exp) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct lu_target *tgt = class_exp2tgt(exp); + int rc = 0, idx; + + ENTRY; + + LASSERT(tgt && tgt->lut_client_bitmap != NULL); + if (!strcmp(ted->ted_lcd->lcd_uuid, tgt->lut_obd->obd_uuid.uuid)) + RETURN(0); + + if (exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) + RETURN(0); + + if (tgt_is_local_client(env, exp)) { + LCONSOLE_WARN("%s: local client %s w/o recovery\n", + exp->exp_obd->obd_name, ted->ted_lcd->lcd_uuid); + exp->exp_no_recovery = 1; + RETURN(0); + } + + /* the bitmap operations can handle cl_idx > sizeof(long) * 8, so + * there's no need for extra complication here + */ + idx = find_first_zero_bit(tgt->lut_client_bitmap, LR_MAX_CLIENTS); +repeat: + if (idx >= LR_MAX_CLIENTS || + OBD_FAIL_CHECK(OBD_FAIL_MDS_CLIENT_ADD)) { + CERROR("%s: no room for %u clients - fix LR_MAX_CLIENTS\n", + tgt->lut_obd->obd_name, idx); + RETURN(-EOVERFLOW); + } + if (test_and_set_bit(idx, tgt->lut_client_bitmap)) { + idx = find_next_zero_bit(tgt->lut_client_bitmap, + LR_MAX_CLIENTS, idx); + goto repeat; + } + + ted->ted_lr_idx = idx; + ted->ted_lr_off = tgt->lut_lsd.lsd_client_start + + idx * tgt->lut_lsd.lsd_client_size; + + LASSERTF(ted->ted_lr_off > 0, "ted_lr_off = %llu\n", ted->ted_lr_off); + + if (tgt_is_multimodrpcs_client(exp)) { + /* Set MULTI RPCS incompatibility flag to prevent previous + * Lustre versions to mount a target with reply_data file */ + if (!(tgt->lut_lsd.lsd_feature_incompat & + OBD_INCOMPAT_MULTI_RPCS)) { + tgt->lut_lsd.lsd_feature_incompat |= + OBD_INCOMPAT_MULTI_RPCS; + rc = tgt_server_data_update(env, tgt, 1); + if (rc < 0) { + CERROR("%s: unable to set MULTI RPCS " + "incompatibility flag\n", + exp->exp_obd->obd_name); + RETURN(rc); + } + } + + /* assign client slot generation */ + ted->ted_lcd->lcd_generation = + atomic_inc_return(&tgt->lut_client_generation); + } else { + ted->ted_lcd->lcd_generation = 0; + } + + CDEBUG(D_INFO, "%s: new client at index %d (%llu) with UUID '%s' " + "generation %d\n", + tgt->lut_obd->obd_name, ted->ted_lr_idx, ted->ted_lr_off, + ted->ted_lcd->lcd_uuid, ted->ted_lcd->lcd_generation); + + if (OBD_FAIL_CHECK(OBD_FAIL_TGT_CLIENT_ADD)) + RETURN(-ENOSPC); + + rc = tgt_client_data_update(env, exp); + if (rc) { + CERROR("%s: Failed to write client lcd at idx %d, rc %d\n", + tgt->lut_obd->obd_name, idx, rc); + RETURN(rc); + } + + if (tgt_is_multimodrpcs_client(exp)) + atomic_inc(&tgt->lut_num_clients); + + RETURN(0); +} +EXPORT_SYMBOL(tgt_client_new); + +/* Add an existing client to the MDS in-memory state based on + * a client that was previously found in the last_rcvd file and + * already has an assigned slot (idx >= 0). + * + * It should not be possible to fail adding an existing client - otherwise + * mdt_init_server_data() callsite needs to be fixed. 
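+ * Note that only the in-memory state is rebuilt here; the on-disk client
+ * record in last_rcvd is left untouched.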
+ */ +int tgt_client_add(const struct lu_env *env, struct obd_export *exp, int idx) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct lu_target *tgt = class_exp2tgt(exp); + + ENTRY; + + LASSERT(tgt && tgt->lut_client_bitmap != NULL); + LASSERTF(idx >= 0, "%d\n", idx); + + if (!strcmp(ted->ted_lcd->lcd_uuid, tgt->lut_obd->obd_uuid.uuid) || + exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) + RETURN(0); + + if (test_and_set_bit(idx, tgt->lut_client_bitmap)) { + CERROR("%s: client %d: bit already set in bitmap!!\n", + tgt->lut_obd->obd_name, idx); + LBUG(); + } + + CDEBUG(D_INFO, "%s: client at idx %d with UUID '%s' added, " + "generation %d\n", + tgt->lut_obd->obd_name, idx, ted->ted_lcd->lcd_uuid, + ted->ted_lcd->lcd_generation); + + ted->ted_lr_idx = idx; + ted->ted_lr_off = tgt->lut_lsd.lsd_client_start + + idx * tgt->lut_lsd.lsd_client_size; + + mutex_init(&ted->ted_lcd_lock); + + LASSERTF(ted->ted_lr_off > 0, "ted_lr_off = %llu\n", ted->ted_lr_off); + + RETURN(0); +} + +int tgt_client_del(const struct lu_env *env, struct obd_export *exp) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct lu_target *tgt = class_exp2tgt(exp); + int rc; + + ENTRY; + + LASSERT(ted->ted_lcd); + + if (unlikely(tgt == NULL)) { + CDEBUG(D_ERROR, "%s: No target for connected export\n", + class_exp2obd(exp)->obd_name); + RETURN(-EINVAL); + } + + /* XXX if lcd_uuid were a real obd_uuid, I could use obd_uuid_equals */ + if (!strcmp((char *)ted->ted_lcd->lcd_uuid, + (char *)tgt->lut_obd->obd_uuid.uuid) || + exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT || + exp->exp_no_recovery) + RETURN(0); + + /* Slot may be not yet assigned, use case is race between Client + * reconnect and forced eviction */ + if (ted->ted_lr_idx < 0) { + CWARN("%s: client with UUID '%s' not in bitmap\n", + tgt->lut_obd->obd_name, ted->ted_lcd->lcd_uuid); + RETURN(0); + } + + CDEBUG(D_INFO, "%s: del client at idx %u, off %lld, UUID '%s'\n", + tgt->lut_obd->obd_name, ted->ted_lr_idx, ted->ted_lr_off, + ted->ted_lcd->lcd_uuid); + + /* Clear the bit _after_ zeroing out the client so we don't + race with filter_client_add and zero out new clients.*/ + if (!test_bit(ted->ted_lr_idx, tgt->lut_client_bitmap)) { + CERROR("%s: client %u: bit already clear in bitmap!!\n", + tgt->lut_obd->obd_name, ted->ted_lr_idx); + LBUG(); + } + + /* Do not erase record for recoverable client. */ + if (exp->exp_flags & OBD_OPT_FAILOVER) + RETURN(0); + + if (OBD_FAIL_CHECK(OBD_FAIL_TGT_CLIENT_DEL)) + RETURN(0); + + /* Make sure the server's last_transno is up to date. + * This should be done before zeroing client slot so last_transno will + * be in server data or in client data in case of failure */ + rc = tgt_server_data_update(env, tgt, 0); + if (rc != 0) { + CERROR("%s: failed to update server data, skip client %s " + "zeroing, rc %d\n", tgt->lut_obd->obd_name, + ted->ted_lcd->lcd_uuid, rc); + RETURN(rc); + } + + /* Race between an eviction and a disconnection ?*/ + mutex_lock(&ted->ted_lcd_lock); + if (ted->ted_lcd->lcd_uuid[0] == '\0') { + mutex_unlock(&ted->ted_lcd_lock); + RETURN(rc); + } + + memset(ted->ted_lcd->lcd_uuid, 0, sizeof ted->ted_lcd->lcd_uuid); + mutex_unlock(&ted->ted_lcd_lock); + + rc = tgt_client_data_update(env, exp); + + if (!rc && tgt_is_multimodrpcs_record(tgt, ted->ted_lcd)) + atomic_dec(&tgt->lut_num_clients); + + CDEBUG(rc == 0 ? 
D_INFO : D_ERROR,
+ "%s: zeroing out client %s at idx %u (%llu), rc %d\n",
+ tgt->lut_obd->obd_name, ted->ted_lcd->lcd_uuid,
+ ted->ted_lr_idx, ted->ted_lr_off, rc);
+ RETURN(rc);
+}
+EXPORT_SYMBOL(tgt_client_del);
+
+static void tgt_clean_by_tag(struct obd_export *exp, __u64 xid, __u16 tag)
+{
+ struct tg_export_data *ted = &exp->exp_target_data;
+ struct lu_target *lut = class_exp2tgt(exp);
+ struct tg_reply_data *trd, *tmp;
+
+ if (tag == 0)
+ return;
+
+ list_for_each_entry_safe(trd, tmp, &ted->ted_reply_list, trd_list) {
+ if (trd->trd_tag != tag)
+ continue;
+
+ LASSERT(ergo(tgt_is_increasing_xid_client(exp),
+ trd->trd_reply.lrd_xid <= xid));
+
+ ted->ted_release_tag++;
+ tgt_release_reply_data(lut, ted, trd);
+ }
+}
+
+static int tgt_add_reply_data(const struct lu_env *env, struct lu_target *tgt,
+ struct tg_export_data *ted, struct tg_reply_data *trd,
+ struct ptlrpc_request *req,
+ struct thandle *th, bool update_lrd_file)
+{
+ struct lsd_reply_data *lrd;
+ int i;
+ int rc;
+
+ lrd = &trd->trd_reply;
+ /* update export last transno */
+ mutex_lock(&ted->ted_lcd_lock);
+ if (lrd->lrd_transno > ted->ted_lcd->lcd_last_transno)
+ ted->ted_lcd->lcd_last_transno = lrd->lrd_transno;
+ mutex_unlock(&ted->ted_lcd_lock);
+
+ if (tgt != NULL) {
+ /* find an empty slot */
+ i = tgt_find_free_reply_slot(tgt);
+ if (unlikely(i < 0)) {
+ CERROR("%s: couldn't find a slot for reply data: "
+ "rc = %d\n", tgt_name(tgt), i);
+ RETURN(i);
+ }
+ trd->trd_index = i;
+
+ if (update_lrd_file) {
+ loff_t off;
+
+ /* write reply data to disk */
+ off = sizeof(struct lsd_reply_header) + sizeof(*lrd) * i;
+ rc = tgt_reply_data_write(env, tgt, lrd, off, th);
+ if (unlikely(rc != 0)) {
+ CERROR("%s: can't update %s file: rc = %d\n",
+ tgt_name(tgt), REPLY_DATA, rc);
+ GOTO(free_slot, rc);
+ }
+ }
+ } else {
+ trd->trd_index = TRD_INDEX_MEMORY;
+ }
+
+ /* add reply data to target export's reply list */
+ mutex_lock(&ted->ted_lcd_lock);
+ if (req != NULL) {
+ int exclude = tgt_is_increasing_xid_client(req->rq_export) ? 
+ MSG_REPLAY : MSG_REPLAY|MSG_RESENT; + + if (req->rq_obsolete) { + CDEBUG(D_INFO, + "drop reply data update for obsolete req xid=%llu," + "transno=%llu, tag=%hu\n", req->rq_xid, + lrd->lrd_transno, trd->trd_tag); + mutex_unlock(&ted->ted_lcd_lock); + GOTO(free_slot, rc = -EBADR); + } + + if (!(lustre_msg_get_flags(req->rq_reqmsg) & exclude)) + tgt_clean_by_tag(req->rq_export, req->rq_xid, + trd->trd_tag); + } + list_add(&trd->trd_list, &ted->ted_reply_list); + ted->ted_reply_cnt++; + if (ted->ted_reply_cnt > ted->ted_reply_max) + ted->ted_reply_max = ted->ted_reply_cnt; + mutex_unlock(&ted->ted_lcd_lock); + + CDEBUG(D_TRACE, "add reply %p: xid %llu, transno %llu, " + "tag %hu, client gen %u, slot idx %d\n", + trd, lrd->lrd_xid, lrd->lrd_transno, + trd->trd_tag, lrd->lrd_client_gen, trd->trd_index); + + RETURN(0); + +free_slot: + if (tgt != NULL) + tgt_clear_reply_slot(tgt, trd->trd_index); + return rc; +} + +int tgt_mk_reply_data(const struct lu_env *env, + struct lu_target *tgt, + struct tg_export_data *ted, + struct ptlrpc_request *req, + __u64 opdata, + struct thandle *th, + bool write_update, + __u64 transno) +{ + struct tg_reply_data *trd; + struct lsd_reply_data *lrd; + __u64 *pre_versions = NULL; + int rc; + struct tgt_session_info *tsi = NULL; + + OBD_ALLOC_PTR(trd); + if (unlikely(trd == NULL)) + RETURN(-ENOMEM); + + if (env != NULL) + tsi = tgt_ses_info(env); + + /* fill reply data information */ + lrd = &trd->trd_reply; + lrd->lrd_transno = transno; + if (req != NULL) { + lrd->lrd_xid = req->rq_xid; + trd->trd_tag = lustre_msg_get_tag(req->rq_reqmsg); + lrd->lrd_client_gen = ted->ted_lcd->lcd_generation; + if (write_update) { + pre_versions = lustre_msg_get_versions(req->rq_repmsg); + lrd->lrd_result = th->th_result; + } + } else { + LASSERT(env != NULL); + LASSERT(tsi->tsi_xid != 0); + + lrd->lrd_xid = tsi->tsi_xid; + lrd->lrd_result = tsi->tsi_result; + lrd->lrd_client_gen = tsi->tsi_client_gen; + } + + lrd->lrd_data = opdata; + if (pre_versions) { + trd->trd_pre_versions[0] = pre_versions[0]; + trd->trd_pre_versions[1] = pre_versions[1]; + trd->trd_pre_versions[2] = pre_versions[2]; + trd->trd_pre_versions[3] = pre_versions[3]; + } + + if (tsi && tsi->tsi_open_obj) + trd->trd_object = *lu_object_fid(&tsi->tsi_open_obj->do_lu); + + rc = tgt_add_reply_data(env, tgt, ted, trd, req, + th, write_update); + if (rc < 0) { + OBD_FREE_PTR(trd); + if (rc == -EBADR) + rc = 0; + } + return rc; + +} +EXPORT_SYMBOL(tgt_mk_reply_data); + +/* + * last_rcvd & last_committed update callbacks + */ +static int tgt_last_rcvd_update(const struct lu_env *env, struct lu_target *tgt, + struct dt_object *obj, __u64 opdata, + struct thandle *th, struct ptlrpc_request *req) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + struct tgt_session_info *tsi = tgt_ses_info(env); + struct obd_export *exp = tsi->tsi_exp; + struct tg_export_data *ted; + __u64 *transno_p; + bool nolcd = false; + int rc = 0; + + ENTRY; + + + LASSERT(exp != NULL); + ted = &exp->exp_target_data; + + /* Some clients don't support recovery, and they don't have last_rcvd + * client data: + * 1. lightweight clients. + * 2. local clients on MDS which doesn't enable "localrecov". + * 3. OFD connect may cause transaction before export has last_rcvd + * slot. 
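+ * For such exports the transno is stored in the last_rcvd header (the
+ * nolcd path below) rather than in a per-client slot.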
+ */
+ if (ted->ted_lr_idx < 0)
+ nolcd = true;
+
+ if (req != NULL)
+ tti->tti_transno = lustre_msg_get_transno(req->rq_reqmsg);
+ else
+ /* From update replay, tti_transno should be set already */
+ LASSERT(tti->tti_transno != 0);
+
+ spin_lock(&tgt->lut_translock);
+ if (th->th_result != 0) {
+ if (tti->tti_transno != 0) {
+ CERROR("%s: replay transno %llu failed: rc = %d\n",
+ tgt_name(tgt), tti->tti_transno, th->th_result);
+ }
+ } else if (tti->tti_transno == 0) {
+ tti->tti_transno = ++tgt->lut_last_transno;
+ } else {
+ /* should be replay */
+ if (tti->tti_transno > tgt->lut_last_transno)
+ tgt->lut_last_transno = tti->tti_transno;
+ }
+ spin_unlock(&tgt->lut_translock);
+
+ /** VBR: set new versions */
+ if (th->th_result == 0 && obj != NULL) {
+ struct dt_object *dto = dt_object_locate(obj, th->th_dev);
+ dt_version_set(env, dto, tti->tti_transno, th);
+ }
+
+ /* filling reply data */
+ CDEBUG(D_INODE, "transno = %llu, last_committed = %llu\n",
+ tti->tti_transno, tgt->lut_obd->obd_last_committed);
+
+ if (req != NULL) {
+ req->rq_transno = tti->tti_transno;
+ lustre_msg_set_transno(req->rq_repmsg, tti->tti_transno);
+ }
+
+ /* if can't add callback, do sync write */
+ th->th_sync |= !!tgt_last_commit_cb_add(th, tgt, exp, tti->tti_transno);
+
+ if (nolcd) {
+ /* store transno in the last_rcvd header */
+ spin_lock(&tgt->lut_translock);
+ if (tti->tti_transno > tgt->lut_lsd.lsd_last_transno) {
+ tgt->lut_lsd.lsd_last_transno = tti->tti_transno;
+ spin_unlock(&tgt->lut_translock);
+ /* Although current connection doesn't have slot
+ * in the last_rcvd, we still want to maintain
+ * the in-memory lsd_client_data structure in order to
+ * properly handle reply reconstruction. */
+ rc = tgt_server_data_write(env, tgt, th);
+ } else {
+ spin_unlock(&tgt->lut_translock);
+ }
+ } else if (ted->ted_lr_off == 0) {
+ CERROR("%s: client idx %d has offset %lld\n",
+ tgt_name(tgt), ted->ted_lr_idx, ted->ted_lr_off);
+ RETURN(-EINVAL);
+ }
+
+ /* Target that supports multiple reply data */
+ if (tgt_is_multimodrpcs_client(exp)) {
+ return tgt_mk_reply_data(env, tgt, ted, req, opdata, th,
+ !!(req != NULL), tti->tti_transno);
+ }
+
+ /* Enough for update replay, let's return */
+ if (req == NULL)
+ RETURN(rc);
+
+ mutex_lock(&ted->ted_lcd_lock);
+ LASSERT(ergo(tti->tti_transno == 0, th->th_result != 0));
+ if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_CLOSE) {
+ transno_p = &ted->ted_lcd->lcd_last_close_transno;
+ ted->ted_lcd->lcd_last_close_xid = req->rq_xid;
+ ted->ted_lcd->lcd_last_close_result = th->th_result;
+ } else {
+ /* VBR: save versions in last_rcvd for reconstruct. */
+ __u64 *pre_versions = lustre_msg_get_versions(req->rq_repmsg);
+
+ if (pre_versions) {
+ ted->ted_lcd->lcd_pre_versions[0] = pre_versions[0];
+ ted->ted_lcd->lcd_pre_versions[1] = pre_versions[1];
+ ted->ted_lcd->lcd_pre_versions[2] = pre_versions[2];
+ ted->ted_lcd->lcd_pre_versions[3] = pre_versions[3];
+ }
+ transno_p = &ted->ted_lcd->lcd_last_transno;
+ ted->ted_lcd->lcd_last_xid = req->rq_xid;
+ ted->ted_lcd->lcd_last_result = th->th_result;
+ /* XXX: lcd_last_data is __u32 but intent_disposition is __u64,
+ * see struct ldlm_reply->lock_policy_res1; */
+ ted->ted_lcd->lcd_last_data = opdata;
+ }
+
+ /* Update transno in slot only if non-zero number, i.e. no errors */
+ if (likely(tti->tti_transno != 0)) {
+ /* Don't overwrite a bigger transaction number with a lower
+ * one. That is not a sign of a problem in all cases, but in
+ * any case this value should only ever increase. 
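+ * A smaller transno seen during replay indicates a version based
+ * recovery failure; see the LU-617 handling below.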
*/ + if (*transno_p > tti->tti_transno) { + if (!tgt->lut_no_reconstruct) { + CERROR("%s: trying to overwrite bigger transno: " + "on-disk: %llu, new: %llu replay: " + "%d. See LU-617.\n", tgt_name(tgt), + *transno_p, tti->tti_transno, + req_is_replay(req)); + if (req_is_replay(req)) { + spin_lock(&req->rq_export->exp_lock); + req->rq_export->exp_vbr_failed = 1; + spin_unlock(&req->rq_export->exp_lock); + } + mutex_unlock(&ted->ted_lcd_lock); + RETURN(req_is_replay(req) ? -EOVERFLOW : 0); + } + } else { + *transno_p = tti->tti_transno; + } + } + + if (!nolcd) { + tti->tti_off = ted->ted_lr_off; + if (CFS_FAIL_CHECK(OBD_FAIL_TGT_RCVD_EIO)) + rc = -EIO; + else + rc = tgt_client_data_write(env, tgt, ted->ted_lcd, + &tti->tti_off, th); + if (rc < 0) { + mutex_unlock(&ted->ted_lcd_lock); + RETURN(rc); + } + } + mutex_unlock(&ted->ted_lcd_lock); + RETURN(rc); +} + +/* + * last_rcvd update for echo client simulation. + * It updates the last_rcvd client slot and the object version in a + * simple way but with all locks to simulate all drawbacks + */ +static int tgt_last_rcvd_update_echo(const struct lu_env *env, + struct lu_target *tgt, + struct dt_object *obj, + struct thandle *th, + struct obd_export *exp) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + struct tg_export_data *ted = &exp->exp_target_data; + int rc = 0; + + ENTRY; + + tti->tti_transno = 0; + + spin_lock(&tgt->lut_translock); + if (th->th_result == 0) + tti->tti_transno = ++tgt->lut_last_transno; + spin_unlock(&tgt->lut_translock); + + /** VBR: set new versions */ + if (th->th_result == 0 && obj != NULL) + dt_version_set(env, obj, tti->tti_transno, th); + + /* if can't add callback, do sync write */ + th->th_sync |= !!tgt_last_commit_cb_add(th, tgt, exp, + tti->tti_transno); + + LASSERT(ted->ted_lr_off > 0); + + mutex_lock(&ted->ted_lcd_lock); + LASSERT(ergo(tti->tti_transno == 0, th->th_result != 0)); + ted->ted_lcd->lcd_last_transno = tti->tti_transno; + ted->ted_lcd->lcd_last_result = th->th_result; + + tti->tti_off = ted->ted_lr_off; + rc = tgt_client_data_write(env, tgt, ted->ted_lcd, &tti->tti_off, th); + mutex_unlock(&ted->ted_lcd_lock); + RETURN(rc); +} + +static int tgt_clients_data_init(const struct lu_env *env, + struct lu_target *tgt, + unsigned long last_size) +{ + struct obd_device *obd = tgt->lut_obd; + struct lr_server_data *lsd = &tgt->lut_lsd; + struct lsd_client_data *lcd = NULL; + struct tg_export_data *ted; + int cl_idx; + int rc = 0; + loff_t off = lsd->lsd_client_start; + __u32 generation = 0; + struct cfs_hash *hash = NULL; + + ENTRY; + + if (tgt->lut_bottom->dd_rdonly) + RETURN(0); + + BUILD_BUG_ON(offsetof(struct lsd_client_data, lcd_padding) + + sizeof(lcd->lcd_padding) != LR_CLIENT_SIZE); + + OBD_ALLOC_PTR(lcd); + if (lcd == NULL) + RETURN(-ENOMEM); + + hash = cfs_hash_getref(tgt->lut_obd->obd_gen_hash); + if (hash == NULL) + GOTO(err_out, rc = -ENODEV); + + for (cl_idx = 0; off < last_size; cl_idx++) { + struct obd_export *exp; + __u64 last_transno; + + /* Don't assume off is incremented properly by + * read_record(), in case sizeof(*lcd) + * isn't the same as lsd->lsd_client_size.
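+ * For example (illustrative sizes): a last_rcvd file written with + * 128-byte client slots must still parse correctly if sizeof(*lcd) + * has since grown, so off is recomputed from cl_idx on every + * iteration.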
*/ + off = lsd->lsd_client_start + cl_idx * lsd->lsd_client_size; + rc = tgt_client_data_read(env, tgt, lcd, &off, cl_idx); + if (rc) { + CERROR("%s: error reading last_rcvd %s idx %d off " + "%llu: rc = %d\n", tgt_name(tgt), LAST_RCVD, + cl_idx, off, rc); + rc = 0; + break; /* read error shouldn't cause startup to fail */ + } + + if (lcd->lcd_uuid[0] == '\0') { + CDEBUG(D_INFO, "skipping zeroed client at offset %d\n", + cl_idx); + continue; + } + + last_transno = lcd_last_transno(lcd); + + /* These exports are cleaned up by disconnect, so they + * need to be set up like real exports as connect does. + */ + CDEBUG(D_HA, "RCVRNG CLIENT uuid: %s idx: %d lr: %llu" + " srv lr: %llu lx: %llu gen %u\n", lcd->lcd_uuid, + cl_idx, last_transno, lsd->lsd_last_transno, + lcd_last_xid(lcd), lcd->lcd_generation); + + exp = class_new_export(obd, (struct obd_uuid *)lcd->lcd_uuid); + if (IS_ERR(exp)) { + if (PTR_ERR(exp) == -EALREADY) { + /* export already exists, zero out this one */ + CERROR("%s: Duplicate export %s!\n", + tgt_name(tgt), lcd->lcd_uuid); + continue; + } + GOTO(err_out, rc = PTR_ERR(exp)); + } + + ted = &exp->exp_target_data; + *ted->ted_lcd = *lcd; + + rc = tgt_client_add(env, exp, cl_idx); + LASSERTF(rc == 0, "rc = %d\n", rc); /* can't fail existing */ + /* VBR: set export last committed version */ + exp->exp_last_committed = last_transno; + spin_lock(&exp->exp_lock); + exp->exp_connecting = 0; + exp->exp_in_recovery = 0; + spin_unlock(&exp->exp_lock); + atomic_inc(&obd->obd_max_recoverable_clients); + + if (tgt_is_multimodrpcs_record(tgt, lcd)) { + atomic_inc(&tgt->lut_num_clients); + + /* compute the highest valid client generation */ + generation = max(generation, lcd->lcd_generation); + /* fill client_generation <-> export hash table */ + rc = cfs_hash_add_unique(hash, &lcd->lcd_generation, + &exp->exp_gen_hash); + if (rc != 0) { + CERROR("%s: duplicate export for client " + "generation %u\n", + tgt_name(tgt), lcd->lcd_generation); + class_export_put(exp); + GOTO(err_out, rc); + } + } + + class_export_put(exp); + + rc = rev_import_init(exp); + if (rc != 0) { + class_unlink_export(exp); + GOTO(err_out, rc); + } + + /* Need to check last_rcvd even for duplicated exports. 
*/ + CDEBUG(D_OTHER, "client at idx %d has last_transno = %llu\n", + cl_idx, last_transno); + + spin_lock(&tgt->lut_translock); + tgt->lut_last_transno = max(last_transno, + tgt->lut_last_transno); + spin_unlock(&tgt->lut_translock); + } + + /* record highest valid client generation */ + atomic_set(&tgt->lut_client_generation, generation); + +err_out: + if (hash != NULL) + cfs_hash_putref(hash); + OBD_FREE_PTR(lcd); + RETURN(rc); +} + +struct server_compat_data { + __u32 rocompat; + __u32 incompat; + __u32 rocinit; + __u32 incinit; +}; + +static struct server_compat_data tgt_scd[] = { + [LDD_F_SV_TYPE_MDT] = { + .rocompat = OBD_ROCOMPAT_LOVOBJID, + .incompat = OBD_INCOMPAT_MDT | OBD_INCOMPAT_COMMON_LR | + OBD_INCOMPAT_FID | OBD_INCOMPAT_IAM_DIR | + OBD_INCOMPAT_LMM_VER | OBD_INCOMPAT_MULTI_OI | + OBD_INCOMPAT_MULTI_RPCS, + .rocinit = OBD_ROCOMPAT_LOVOBJID, + .incinit = OBD_INCOMPAT_MDT | OBD_INCOMPAT_COMMON_LR | + OBD_INCOMPAT_MULTI_OI, + }, + [LDD_F_SV_TYPE_OST] = { + .rocompat = OBD_ROCOMPAT_IDX_IN_IDIF, + .incompat = OBD_INCOMPAT_OST | OBD_INCOMPAT_COMMON_LR | + OBD_INCOMPAT_FID, + .rocinit = OBD_ROCOMPAT_IDX_IN_IDIF, + .incinit = OBD_INCOMPAT_OST | OBD_INCOMPAT_COMMON_LR, + } +}; + +int tgt_server_data_init(const struct lu_env *env, struct lu_target *tgt) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + struct lr_server_data *lsd = &tgt->lut_lsd; + unsigned long last_rcvd_size; + __u32 index; + int rc, type; + + rc = dt_attr_get(env, tgt->lut_last_rcvd, &tti->tti_attr); + if (rc) + RETURN(rc); + + last_rcvd_size = (unsigned long)tti->tti_attr.la_size; + + /* ensure padding in the struct is the correct size */ + BUILD_BUG_ON(offsetof(struct lr_server_data, lsd_padding) + + sizeof(lsd->lsd_padding) != LR_SERVER_SIZE); + + rc = server_name2index(tgt_name(tgt), &index, NULL); + if (rc < 0) { + CERROR("%s: Can not get index from name: rc = %d\n", + tgt_name(tgt), rc); + RETURN(rc); + } + /* server_name2index() returns type */ + type = rc; + if (type != LDD_F_SV_TYPE_MDT && type != LDD_F_SV_TYPE_OST) { + CERROR("%s: unknown target type %x\n", tgt_name(tgt), type); + RETURN(-EINVAL); + } + + /* last_rcvd on OST doesn't provide reconstruct support because there + * may be up to 8 in-flight write requests per single slot in + * last_rcvd client data + */ + tgt->lut_no_reconstruct = (type == LDD_F_SV_TYPE_OST); + + if (last_rcvd_size == 0) { + LCONSOLE_WARN("%s: new disk, initializing\n", tgt_name(tgt)); + + memcpy(lsd->lsd_uuid, tgt->lut_obd->obd_uuid.uuid, + sizeof(lsd->lsd_uuid)); + lsd->lsd_last_transno = 0; + lsd->lsd_mount_count = 0; + lsd->lsd_server_size = LR_SERVER_SIZE; + lsd->lsd_client_start = LR_CLIENT_START; + lsd->lsd_client_size = LR_CLIENT_SIZE; + lsd->lsd_subdir_count = OBJ_SUBDIR_COUNT; + lsd->lsd_osd_index = index; + lsd->lsd_feature_rocompat = tgt_scd[type].rocinit; + lsd->lsd_feature_incompat = tgt_scd[type].incinit; + } else { + rc = tgt_server_data_read(env, tgt); + if (rc) { + CERROR("%s: error reading LAST_RCVD: rc= %d\n", + tgt_name(tgt), rc); + RETURN(rc); + } + if (strcmp(lsd->lsd_uuid, tgt->lut_obd->obd_uuid.uuid)) { + if (tgt->lut_bottom->dd_rdonly) { + /* Such difference may be caused by mounting + * up snapshot with new fsname under rd_only + * mode. But even if it was NOT, it will not + * damage the system because of "rd_only". */ + memcpy(lsd->lsd_uuid, + tgt->lut_obd->obd_uuid.uuid, + sizeof(lsd->lsd_uuid)); + } else { + LCONSOLE_ERROR_MSG(0x157, "Trying to start " + "OBD %s using the wrong " + "disk %s. 
Were the /dev/ " + "assignments rearranged?\n", + tgt->lut_obd->obd_uuid.uuid, + lsd->lsd_uuid); + RETURN(-EINVAL); + } + } + + if (lsd->lsd_osd_index != index) { + LCONSOLE_ERROR_MSG(0x157, + "%s: index %d in last rcvd is different from the index %d in config log, it might be disk corruption!\n", + tgt_name(tgt), + lsd->lsd_osd_index, index); + RETURN(-EINVAL); + } + } + + if (lsd->lsd_feature_incompat & ~tgt_scd[type].incompat) { + CERROR("%s: unsupported incompat filesystem feature(s) %x\n", + tgt_name(tgt), + lsd->lsd_feature_incompat & ~tgt_scd[type].incompat); + RETURN(-EINVAL); + } + + if (type == LDD_F_SV_TYPE_MDT) + lsd->lsd_feature_incompat |= OBD_INCOMPAT_FID; + + if (lsd->lsd_feature_rocompat & ~tgt_scd[type].rocompat) { + CERROR("%s: unsupported read-only filesystem feature(s) %x\n", + tgt_name(tgt), + lsd->lsd_feature_rocompat & ~tgt_scd[type].rocompat); + RETURN(-EINVAL); + } + /** Interop: evict all clients at first boot with 1.8 last_rcvd */ + if (type == LDD_F_SV_TYPE_MDT && + !(lsd->lsd_feature_compat & OBD_COMPAT_20)) { + if (last_rcvd_size > lsd->lsd_client_start) { + LCONSOLE_WARN("%s: mounting for the first time on 1.8 FS, " + "remove all clients for interop needs\n", + tgt_name(tgt)); + rc = tgt_truncate_last_rcvd(env, tgt, + lsd->lsd_client_start); + if (rc) + RETURN(rc); + last_rcvd_size = lsd->lsd_client_start; + } + /** set 2.0 flag to upgrade/downgrade between 1.8 and 2.0 */ + lsd->lsd_feature_compat |= OBD_COMPAT_20; + } + + spin_lock(&tgt->lut_translock); + tgt->lut_last_transno = lsd->lsd_last_transno; + spin_unlock(&tgt->lut_translock); + + lsd->lsd_mount_count++; + + CDEBUG(D_INODE, "========BEGIN DUMPING LAST_RCVD========\n"); + CDEBUG(D_INODE, "%s: server last_transno: %llu\n", + tgt_name(tgt), tgt->lut_last_transno); + CDEBUG(D_INODE, "%s: server mount_count: %llu\n", + tgt_name(tgt), lsd->lsd_mount_count); + CDEBUG(D_INODE, "%s: server data size: %u\n", + tgt_name(tgt), lsd->lsd_server_size); + CDEBUG(D_INODE, "%s: per-client data start: %u\n", + tgt_name(tgt), lsd->lsd_client_start); + CDEBUG(D_INODE, "%s: per-client data size: %u\n", + tgt_name(tgt), lsd->lsd_client_size); + CDEBUG(D_INODE, "%s: last_rcvd size: %lu\n", + tgt_name(tgt), last_rcvd_size); + CDEBUG(D_INODE, "%s: server subdir_count: %u\n", + tgt_name(tgt), lsd->lsd_subdir_count); + CDEBUG(D_INODE, "%s: last_rcvd clients: %lu\n", tgt_name(tgt), + last_rcvd_size <= lsd->lsd_client_start ?
0 : + (last_rcvd_size - lsd->lsd_client_start) / + lsd->lsd_client_size); + CDEBUG(D_INODE, "========END DUMPING LAST_RCVD========\n"); + + if (lsd->lsd_server_size == 0 || lsd->lsd_client_start == 0 || + lsd->lsd_client_size == 0) { + CERROR("%s: bad last_rcvd contents!\n", tgt_name(tgt)); + RETURN(-EINVAL); + } + + if (!tgt->lut_obd->obd_replayable) + CWARN("%s: recovery support OFF\n", tgt_name(tgt)); + + rc = tgt_clients_data_init(env, tgt, last_rcvd_size); + if (rc < 0) + GOTO(err_client, rc); + + spin_lock(&tgt->lut_translock); + /* obd_last_committed is used for compatibility + * with other lustre recovery code */ + tgt->lut_obd->obd_last_committed = tgt->lut_last_transno; + spin_unlock(&tgt->lut_translock); + + tgt->lut_obd->u.obt.obt_mount_count = lsd->lsd_mount_count; + tgt->lut_obd->u.obt.obt_instance = (__u32)lsd->lsd_mount_count; + + /* save it, so mount count and last_transno are current */ + rc = tgt_server_data_update(env, tgt, 0); + if (rc < 0) + GOTO(err_client, rc); + + RETURN(0); + +err_client: + class_disconnect_exports(tgt->lut_obd); + return rc; +} + +/* add credits for last_rcvd update */ +int tgt_txn_start_cb(const struct lu_env *env, struct thandle *th, + void *cookie) +{ + struct lu_target *tgt = cookie; + struct tgt_session_info *tsi; + struct tgt_thread_info *tti = tgt_th_info(env); + struct dt_object *dto; + int rc; + + /* For the readonly case, the caller should have gotten a failure + * when starting the transaction. If the logic comes here, + * there must be something wrong. */ + if (unlikely(tgt->lut_bottom->dd_rdonly)) { + dump_stack(); + LBUG(); + } + + /* if there is no session, then this transaction is not a result of + * request processing but of some local operation */ + if (env->le_ses == NULL) + return 0; + + LASSERT(tgt->lut_last_rcvd); + tsi = tgt_ses_info(env); + /* OFD may start transaction without export assigned */ + if (tsi->tsi_exp == NULL) + return 0; + + if (tgt_is_multimodrpcs_client(tsi->tsi_exp)) { + /* + * Use maximum possible file offset for declaration to ensure + * ZFS will reserve enough credits for a write anywhere in this + * file, since we don't know where in the file the write will be + * because a replay slot has not been assigned. This should be + * replaced by dmu_tx_hold_append() when available.
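+ * A dmu_tx_hold_append()-style declaration would reserve append + * credits without naming a fixed offset; until then, passing + * offset -1 to dt_declare_record_write() below makes the OSD + * assume the worst-case write location (illustrative reading of + * the declaration that follows).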
+ */ + tti->tti_buf.lb_buf = NULL; + tti->tti_buf.lb_len = sizeof(struct lsd_reply_data); + dto = dt_object_locate(tgt->lut_reply_data, th->th_dev); + rc = dt_declare_record_write(env, dto, &tti->tti_buf, -1, th); + if (rc) + return rc; + } else { + dto = dt_object_locate(tgt->lut_last_rcvd, th->th_dev); + tti_buf_lcd(tti); + tti->tti_off = tsi->tsi_exp->exp_target_data.ted_lr_off; + rc = dt_declare_record_write(env, dto, &tti->tti_buf, + tti->tti_off, th); + if (rc) + return rc; + } + + if (tsi->tsi_vbr_obj != NULL && + !lu_object_remote(&tsi->tsi_vbr_obj->do_lu)) { + dto = dt_object_locate(tsi->tsi_vbr_obj, th->th_dev); + rc = dt_declare_version_set(env, dto, th); + } + + return rc; +} + +/* Update last_rcvd records with latests transaction data */ +int tgt_txn_stop_cb(const struct lu_env *env, struct thandle *th, + void *cookie) +{ + struct lu_target *tgt = cookie; + struct tgt_session_info *tsi; + struct tgt_thread_info *tti = tgt_th_info(env); + struct dt_object *obj = NULL; + int rc; + bool echo_client; + + if (env->le_ses == NULL) + return 0; + + tsi = tgt_ses_info(env); + /* OFD may start transaction without export assigned */ + if (tsi->tsi_exp == NULL) + return 0; + + echo_client = (tgt_ses_req(tsi) == NULL && tsi->tsi_xid == 0); + + if (tti->tti_has_trans && !echo_client) { + if (tti->tti_mult_trans == 0) { + CDEBUG(D_HA, "More than one transaction %llu\n", + tti->tti_transno); + RETURN(0); + } + /* we need another transno to be assigned */ + tti->tti_transno = 0; + } else if (th->th_result == 0) { + tti->tti_has_trans = 1; + } + + if (tsi->tsi_vbr_obj != NULL && + !lu_object_remote(&tsi->tsi_vbr_obj->do_lu)) { + obj = tsi->tsi_vbr_obj; + } + + if (unlikely(echo_client)) /* echo client special case */ + rc = tgt_last_rcvd_update_echo(env, tgt, obj, th, + tsi->tsi_exp); + else + rc = tgt_last_rcvd_update(env, tgt, obj, tsi->tsi_opdata, th, + tgt_ses_req(tsi)); + return rc; +} + +int tgt_reply_data_init(const struct lu_env *env, struct lu_target *tgt) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + struct lsd_reply_data *lrd = &tti->tti_lrd; + unsigned long reply_data_size; + int rc; + struct lsd_reply_header *lrh = NULL; + struct tg_reply_data *trd = NULL; + int idx; + loff_t off; + struct cfs_hash *hash = NULL; + struct obd_export *exp; + struct tg_export_data *ted; + int reply_data_recovered = 0; + + rc = dt_attr_get(env, tgt->lut_reply_data, &tti->tti_attr); + if (rc) + GOTO(out, rc); + reply_data_size = (unsigned long)tti->tti_attr.la_size; + + OBD_ALLOC_PTR(lrh); + if (lrh == NULL) + GOTO(out, rc = -ENOMEM); + + if (reply_data_size == 0) { + CDEBUG(D_INFO, "%s: new reply_data file, initializing\n", + tgt_name(tgt)); + lrh->lrh_magic = LRH_MAGIC; + lrh->lrh_header_size = sizeof(struct lsd_reply_header); + lrh->lrh_reply_size = sizeof(struct lsd_reply_data); + rc = tgt_reply_header_write(env, tgt, lrh); + if (rc) { + CERROR("%s: error writing %s: rc = %d\n", + tgt_name(tgt), REPLY_DATA, rc); + GOTO(out, rc); + } + } else { + rc = tgt_reply_header_read(env, tgt, lrh); + if (rc) { + CERROR("%s: error reading %s: rc = %d\n", + tgt_name(tgt), REPLY_DATA, rc); + GOTO(out, rc); + } + if (lrh->lrh_magic != LRH_MAGIC || + lrh->lrh_header_size != sizeof(struct lsd_reply_header) || + lrh->lrh_reply_size != sizeof(struct lsd_reply_data)) { + CERROR("%s: invalid header in %s\n", + tgt_name(tgt), REPLY_DATA); + GOTO(out, rc = -EINVAL); + } + + hash = cfs_hash_getref(tgt->lut_obd->obd_gen_hash); + if (hash == NULL) + GOTO(out, rc = -ENODEV); + + OBD_ALLOC_PTR(trd); + if (trd == NULL) 
+ GOTO(out, rc = -ENOMEM); + + /* Load reply_data from disk */ + for (idx = 0, off = sizeof(struct lsd_reply_header); + off < reply_data_size; + idx++, off += sizeof(struct lsd_reply_data)) { + rc = tgt_reply_data_read(env, tgt, lrd, off); + if (rc) { + CERROR("%s: error reading %s: rc = %d\n", + tgt_name(tgt), REPLY_DATA, rc); + GOTO(out, rc); + } + + exp = cfs_hash_lookup(hash, &lrd->lrd_client_gen); + if (exp == NULL) { + /* old reply data from a disconnected client */ + continue; + } + ted = &exp->exp_target_data; + mutex_lock(&ted->ted_lcd_lock); + + /* create in-memory reply_data and link it to + * target export's reply list */ + rc = tgt_set_reply_slot(tgt, idx); + if (rc != 0) { + mutex_unlock(&ted->ted_lcd_lock); + GOTO(out, rc); + } + trd->trd_reply = *lrd; + trd->trd_pre_versions[0] = 0; + trd->trd_pre_versions[1] = 0; + trd->trd_pre_versions[2] = 0; + trd->trd_pre_versions[3] = 0; + trd->trd_index = idx; + trd->trd_tag = 0; + fid_zero(&trd->trd_object); + list_add(&trd->trd_list, &ted->ted_reply_list); + ted->ted_reply_cnt++; + if (ted->ted_reply_cnt > ted->ted_reply_max) + ted->ted_reply_max = ted->ted_reply_cnt; + + CDEBUG(D_HA, "%s: restore reply %p: xid %llu, " + "transno %llu, client gen %u, slot idx %d\n", + tgt_name(tgt), trd, lrd->lrd_xid, + lrd->lrd_transno, lrd->lrd_client_gen, + trd->trd_index); + + /* update export last committed transaction */ + exp->exp_last_committed = max(exp->exp_last_committed, + lrd->lrd_transno); + /* Update lcd_last_transno as well for the check in + * tgt_release_reply_data() or the latest client + * transno can be lost. + */ + ted->ted_lcd->lcd_last_transno = + max(ted->ted_lcd->lcd_last_transno, + exp->exp_last_committed); + + mutex_unlock(&ted->ted_lcd_lock); + class_export_put(exp); + + /* update target last committed transaction */ + spin_lock(&tgt->lut_translock); + tgt->lut_last_transno = max(tgt->lut_last_transno, + lrd->lrd_transno); + spin_unlock(&tgt->lut_translock); + + reply_data_recovered++; + + OBD_ALLOC_PTR(trd); + if (trd == NULL) + GOTO(out, rc = -ENOMEM); + } + CDEBUG(D_INFO, "%s: %d reply data have been recovered\n", + tgt_name(tgt), reply_data_recovered); + } + + spin_lock(&tgt->lut_translock); + /* obd_last_committed is used for compatibility + * with other lustre recovery code */ + tgt->lut_obd->obd_last_committed = tgt->lut_last_transno; + spin_unlock(&tgt->lut_translock); + + rc = 0; + +out: + if (hash != NULL) + cfs_hash_putref(hash); + if (trd != NULL) + OBD_FREE_PTR(trd); + if (lrh != NULL) + OBD_FREE_PTR(lrh); + return rc; +} + +static int tgt_check_lookup_req(struct ptlrpc_request *req, int lookup, + struct tg_reply_data *trd) +{ + struct tg_export_data *ted = &req->rq_export->exp_target_data; + struct lu_target *lut = class_exp2tgt(req->rq_export); + __u16 tag = lustre_msg_get_tag(req->rq_reqmsg); + int rc = 0; + struct tg_reply_data *reply; + bool check_increasing; + + if (tag == 0) + return 0; + + check_increasing = tgt_is_increasing_xid_client(req->rq_export) && + !(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY); + if (!lookup && !check_increasing) + return 0; + + list_for_each_entry(reply, &ted->ted_reply_list, trd_list) { + if (lookup && reply->trd_reply.lrd_xid == req->rq_xid) { + rc = 1; + if (trd != NULL) + *trd = *reply; + break; + } else if (check_increasing && reply->trd_tag == tag && + reply->trd_reply.lrd_xid > req->rq_xid) { + rc = -EPROTO; + CERROR("%s: busy tag=%u req_xid=%llu, trd=%p: xid=%llu transno=%llu client_gen=%u slot_idx=%d: rc = %d\n", + tgt_name(lut), tag, req->rq_xid, trd, +
reply->trd_reply.lrd_xid, + reply->trd_reply.lrd_transno, + reply->trd_reply.lrd_client_gen, + reply->trd_index, rc); + break; + } + } + + return rc; +} + +/* Look for a reply data matching specified request @req + * A copy is returned in @trd if the pointer is not NULL + */ +int tgt_lookup_reply(struct ptlrpc_request *req, struct tg_reply_data *trd) +{ + struct tg_export_data *ted = &req->rq_export->exp_target_data; + int found = 0; + bool not_replay = !(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY); + + mutex_lock(&ted->ted_lcd_lock); + if (not_replay && req->rq_xid <= req->rq_export->exp_last_xid) { + /* A check for the last_xid is needed here in case there is + * no reply data is left in the list. It may happen if another + * RPC on another slot increased the last_xid between our + * process_req_last_xid & tgt_lookup_reply calls */ + found = -EPROTO; + } else { + found = tgt_check_lookup_req(req, 1, trd); + } + mutex_unlock(&ted->ted_lcd_lock); + + CDEBUG(D_TRACE, "%s: lookup reply xid %llu, found %d last_xid %llu\n", + tgt_name(class_exp2tgt(req->rq_export)), req->rq_xid, found, + req->rq_export->exp_last_xid); + + return found; +} +EXPORT_SYMBOL(tgt_lookup_reply); + +int tgt_handle_received_xid(struct obd_export *exp, __u64 rcvd_xid) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct lu_target *lut = class_exp2tgt(exp); + struct tg_reply_data *trd, *tmp; + + + list_for_each_entry_safe(trd, tmp, &ted->ted_reply_list, trd_list) { + if (trd->trd_reply.lrd_xid > rcvd_xid) + continue; + ted->ted_release_xid++; + tgt_release_reply_data(lut, ted, trd); + } + + return 0; +} + +int tgt_handle_tag(struct ptlrpc_request *req) +{ + return tgt_check_lookup_req(req, 0, NULL); +} + diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_main.c b/drivers/staging/lustrefsx/lustre/target/tgt_main.c new file mode 100644 index 0000000000000..76ccece817326 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/tgt_main.c @@ -0,0 +1,853 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * lustre/target/tgt_main.c + * + * Lustre Unified Target main initialization code + * + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include "tgt_internal.h" +#include "../ptlrpc/ptlrpc_internal.h" + +/* This must be longer than the longest string below */ +#define SYNC_STATES_MAXLEN 16 +static const char * const sync_lock_cancel_states[] = { + [SYNC_LOCK_CANCEL_NEVER] = "never", + [SYNC_LOCK_CANCEL_BLOCKING] = "blocking", + [SYNC_LOCK_CANCEL_ALWAYS] = "always", +}; + +/** + * Show policy for handling dirty data under a lock being cancelled. 
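+ * + * For example (the sysfs path is illustrative and depends on the + * obd type and device name), the current policy can be read from + * user space with: + * + * cat /sys/fs/lustre/obdfilter/<fsname>-OST0000/sync_lock_cancel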
+ * + * \param[in] kobj sysfs kobject + * \param[in] attr sysfs attribute + * \param[in] buf buffer for data + * + * \retval 0 and buffer filled with data on success + * \retval negative value on error + */ +ssize_t sync_lock_cancel_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lu_target *tgt = obd->u.obt.obt_lut; + + return sprintf(buf, "%s\n", + sync_lock_cancel_states[tgt->lut_sync_lock_cancel]); +} +EXPORT_SYMBOL(sync_lock_cancel_show); + +/** + * Change policy for handling dirty data under a lock being cancelled. + * + * This variable defines what action target takes upon lock cancel + * There are three possible modes: + * 1) never - never do sync upon lock cancel. This can lead to data + * inconsistencies if both the OST and client crash while writing a file + * that is also concurrently being read by another client. In these cases, + * this may allow the file data to "rewind" to an earlier state. + * 2) blocking - do sync only if there is blocking lock, e.g. if another + * client is trying to access this same object + * 3) always - do sync always + * + * \param[in] kobj kobject + * \param[in] attr attribute to show + * \param[in] buf buffer for data + * \param[in] count buffer size + * + * \retval \a count on success + * \retval negative value on error + */ +ssize_t sync_lock_cancel_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lu_target *tgt = obd->u.obt.obt_lut; + int val = -1; + enum tgt_sync_lock_cancel slc; + + if (count == 0 || count >= SYNC_STATES_MAXLEN) + return -EINVAL; + + for (slc = 0; slc < ARRAY_SIZE(sync_lock_cancel_states); slc++) { + if (strcmp(buffer, sync_lock_cancel_states[slc]) == 0) { + val = slc; + break; + } + } + + /* Legacy numeric codes */ + if (val == -1) { + int rc = kstrtoint(buffer, 0, &val); + if (rc) + return rc; + } + + if (val < 0 || val > 2) + return -EINVAL; + + spin_lock(&tgt->lut_flags_lock); + tgt->lut_sync_lock_cancel = val; + spin_unlock(&tgt->lut_flags_lock); + return count; +} +EXPORT_SYMBOL(sync_lock_cancel_store); +LUSTRE_RW_ATTR(sync_lock_cancel); + +/** + * Show maximum number of Filter Modification Data (FMD) maintained. + * + * \param[in] kobj kobject + * \param[in] attr attribute to show + * \param[in] buf buffer for data + * + * \retval 0 and buffer filled with data on success + * \retval negative value on error + */ +ssize_t tgt_fmd_count_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lu_target *lut = obd->u.obt.obt_lut; + + return sprintf(buf, "%u\n", lut->lut_fmd_max_num); +} + +/** + * Change number of FMDs maintained by target. + * + * This defines how large the list of FMDs can be. 
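+ * + * For example (the sysfs path is illustrative), an administrator + * could raise the limit with: + * + * echo 256 > /sys/fs/lustre/obdfilter/<fsname>-OST0000/tgt_fmd_count + * + * Values outside the 1..65536 range are rejected below.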
+ * + * \param[in] kobj kobject + * \param[in] attr attribute to show + * \param[in] buf buffer for data + * \param[in] count buffer size + * + * \retval \a count on success + * \retval negative value on error + */ +ssize_t tgt_fmd_count_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lu_target *lut = obd->u.obt.obt_lut; + int val, rc; + + rc = kstrtoint(buffer, 0, &val); + if (rc) + return rc; + + if (val < 1 || val > 65536) + return -EINVAL; + + lut->lut_fmd_max_num = val; + + return count; +} +LUSTRE_RW_ATTR(tgt_fmd_count); + +/** + * Show the maximum age of FMD data in seconds. + * + * \param[in] kobj kobject + * \param[in] attr attribute to show + * \param[in] buf buffer for data + * + * \retval 0 and buffer filled with data on success + * \retval negative value on error + */ +ssize_t tgt_fmd_seconds_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lu_target *lut = obd->u.obt.obt_lut; + + return sprintf(buf, "%lld\n", lut->lut_fmd_max_age); +} + +/** + * Set the maximum age of FMD data in seconds. + * + * This defines how long FMD data stays in the FMD list. + * + * \param[in] kobj kobject + * \param[in] attr attribute to show + * \param[in] buf buffer for data + * \param[in] count buffer size + * + * \retval \a count on success + * \retval negative value on error + */ +ssize_t tgt_fmd_seconds_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lu_target *lut = obd->u.obt.obt_lut; + time64_t val; + int rc; + + rc = kstrtoll(buffer, 0, &val); + if (rc) + return rc; + + if (val < 1 || val > 65536) /* ~ 18 hour max */ + return -EINVAL; + + lut->lut_fmd_max_age = val; + + return count; +} +LUSTRE_RW_ATTR(tgt_fmd_seconds); + +/* These two aliases are old names kept for compatibility; they were + * changed to 'tgt_fmd_count' and 'tgt_fmd_seconds'. + * This change was made in Lustre 2.13, so these aliases can be removed + * when backward compatibility with Lustre versions prior to 2.13 is no + * longer needed + */ +static struct lustre_attr tgt_fmd_count_compat = __ATTR(client_cache_count, + 0644, tgt_fmd_count_show, tgt_fmd_count_store); +static struct lustre_attr tgt_fmd_seconds_compat = __ATTR(client_cache_seconds, + 0644, tgt_fmd_seconds_show, tgt_fmd_seconds_store); + +static const struct attribute *tgt_attrs[] = { + &lustre_attr_sync_lock_cancel.attr, + &lustre_attr_tgt_fmd_count.attr, + &lustre_attr_tgt_fmd_seconds.attr, + &tgt_fmd_count_compat.attr, + &tgt_fmd_seconds_compat.attr, + NULL, +}; + +/** + * Decide which checksums both client and OST support, possibly forcing + * the use of T10PI checksums if the hardware supports this. + * + * The clients that have no T10-PI RPC checksum support will use the same + * mechanism to select checksum type as before, and will not be affected by + * the following logic. + * + * For the clients that have T10-PI RPC checksum support: + * + * If the target supports T10-PI feature and T10-PI checksum is enforced, + * clients will have no other choice for RPC checksum type other than using + * the T10PI checksum type. This is useful for enforcing end-to-end integrity + * in the whole system.
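+ * + * For example (flag values illustrative): a client offering + * OBD_CKSUM_CRC32C | OBD_CKSUM_T10IP512 against a target that + * enforces ddp_t10_cksum_type == OBD_CKSUM_T10IP512 is left with + * exactly OBD_CKSUM_T10IP512.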
+ * + * If the target doesn't support T10-PI feature and T10-PI checksum is + * enforced, together with other checksums with reasonably good speeds (e.g. + * crc32, crc32c, adler, etc.), all T10-PI checksum types understood by the + * client (t10ip512, t10ip4K, t10crc512, t10crc4K) will be added to the + * available checksum types, regardless of the speeds of T10-PI checksums. + * This is useful for testing T10-PI checksum of RPC. + * + * If the target supports T10-PI feature and T10-PI checksum is NOT enforced, + * the corresponding T10-PI checksum type will be added to the checksum type + * list, regardless of the speed of the T10-PI checksum. This provides clients + * the flexibility to choose whether to enable end-to-end integrity or not. + * + * If the target does NOT support the T10-PI feature and T10-PI checksum is + * NOT enforced, together with other checksums with reasonably good speeds, + * all the T10-PI checksum types with good speeds will be added into the + * checksum type list. Note that a T10-PI checksum type with a speed worse + * than half of Adler will NOT be added as an option. In this circumstance, + * T10-PI checksum types have the same behavior as other normal checksum + * types. + */ +void tgt_mask_cksum_types(struct lu_target *lut, enum cksum_types *cksum_types) +{ + bool enforce = lut->lut_cksum_t10pi_enforce; + enum cksum_types tgt_t10_cksum_type; + enum cksum_types client_t10_types = *cksum_types & OBD_CKSUM_T10_ALL; + enum cksum_types server_t10_types; + + /* + * The client set in ocd_cksum_types the checksum types it + * supports. We have to mask off the algorithms that we don't + * support. T10PI checksum types will be added later. + */ + *cksum_types &= (lut->lut_cksum_types_supported & ~OBD_CKSUM_T10_ALL); + server_t10_types = lut->lut_cksum_types_supported & OBD_CKSUM_T10_ALL; + tgt_t10_cksum_type = lut->lut_dt_conf.ddp_t10_cksum_type; + + /* Quick exit if no T10-PI support on client */ + if (!client_t10_types) + return; + + /* + * This OST has NO T10-PI feature. Add all supported T10-PI checksums + * as options if T10-PI checksum is enforced. If the T10-PI checksum is + * not enforced, only add them as options when speed is good. + */ + if (tgt_t10_cksum_type == 0) { + /* + * Server allows all T10PI checksums, and server_t10_types + * include quick ones. + */ + if (enforce) + *cksum_types |= client_t10_types; + else + *cksum_types |= client_t10_types & server_t10_types; + return; + } + + /* + * This OST has T10-PI feature. Disable all other checksum types if + * T10-PI checksum is enforced. If the T10-PI checksum is not enforced, + * add the checksum type as an option. + */ + if (client_t10_types & tgt_t10_cksum_type) { + if (enforce) + *cksum_types = tgt_t10_cksum_type; + else + *cksum_types |= tgt_t10_cksum_type; + } +} +EXPORT_SYMBOL(tgt_mask_cksum_types); + +int tgt_tunables_init(struct lu_target *lut) +{ + int rc; + + rc = sysfs_create_files(&lut->lut_obd->obd_kset.kobj, tgt_attrs); + if (!rc) + lut->lut_attrs = tgt_attrs; + return rc; +} +EXPORT_SYMBOL(tgt_tunables_init); + +void tgt_tunables_fini(struct lu_target *lut) +{ + if (lut->lut_attrs) { + sysfs_remove_files(&lut->lut_obd->obd_kset.kobj, + lut->lut_attrs); + lut->lut_attrs = NULL; + } +} +EXPORT_SYMBOL(tgt_tunables_fini); + +/* + * Save cross-MDT lock in lut_slc_locks. + * + * The lock R/W count is not saved but released in unlock (the lock is not + * canceled remotely); instead only a refcount is taken, so that the remote + * MDT where the object resides can detect a conflict with this lock there.
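+ * + * A typical caller (illustrative) saves the lock once the local + * transaction has its transno assigned, e.g. + * tgt_save_slc_lock(lut, lock, tti->tti_transno), and relies on + * tgt_cancel_slc_locks() to cancel it after that transno commits.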
+ * + * \param lut target + * \param lock cross-MDT lock to save + * \param transno when the transaction with this transno is committed, this lock + * can be canceled. + */ +void tgt_save_slc_lock(struct lu_target *lut, struct ldlm_lock *lock, + __u64 transno) +{ + spin_lock(&lut->lut_slc_locks_guard); + lock_res_and_lock(lock); + if (ldlm_is_cbpending(lock)) { + /* if it was canceled by server, don't save, because remote MDT + * will do Sync-on-Cancel. */ + LDLM_LOCK_PUT(lock); + } else { + lock->l_transno = transno; + /* if this lock is in the list already, there are two operations + * both using this lock, each saving it after use, so for the + * second one just put the refcount. */ + if (list_empty(&lock->l_slc_link)) + list_add_tail(&lock->l_slc_link, &lut->lut_slc_locks); + else + LDLM_LOCK_PUT(lock); + } + unlock_res_and_lock(lock); + spin_unlock(&lut->lut_slc_locks_guard); +} +EXPORT_SYMBOL(tgt_save_slc_lock); + +/* + * Discard cross-MDT lock from lut_slc_locks. + * + * This is called upon BAST: just remove the lock from lut_slc_locks and put + * the lock refcount. The BAST will cancel this lock. + * + * \param lut target + * \param lock cross-MDT lock to discard + */ +void tgt_discard_slc_lock(struct lu_target *lut, struct ldlm_lock *lock) +{ + spin_lock(&lut->lut_slc_locks_guard); + lock_res_and_lock(lock); + /* may race with tgt_cancel_slc_locks() */ + if (lock->l_transno != 0) { + LASSERT(!list_empty(&lock->l_slc_link)); + LASSERT(ldlm_is_cbpending(lock)); + list_del_init(&lock->l_slc_link); + lock->l_transno = 0; + LDLM_LOCK_PUT(lock); + } + unlock_res_and_lock(lock); + spin_unlock(&lut->lut_slc_locks_guard); +} +EXPORT_SYMBOL(tgt_discard_slc_lock); + +/* + * Cancel cross-MDT locks upon transaction commit. + * + * Remove cross-MDT locks from lut_slc_locks, cancel them and put lock refcount. + * + * \param lut target + * \param transno transaction with this number was committed.
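+ * + * A commit callback would typically (illustrative) call + * tgt_cancel_slc_locks(lut, committed_transno) so that every saved + * lock with l_transno <= committed_transno is canceled.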
+ */ +void tgt_cancel_slc_locks(struct lu_target *lut, __u64 transno) +{ + struct ldlm_lock *lock, *next; + LIST_HEAD(list); + struct lustre_handle lockh; + + spin_lock(&lut->lut_slc_locks_guard); + list_for_each_entry_safe(lock, next, &lut->lut_slc_locks, + l_slc_link) { + lock_res_and_lock(lock); + LASSERT(lock->l_transno != 0); + if (lock->l_transno > transno) { + unlock_res_and_lock(lock); + continue; + } + /* ouch, another operation is using it after it's saved */ + if (lock->l_readers != 0 || lock->l_writers != 0) { + unlock_res_and_lock(lock); + continue; + } + /* set CBPENDING so that this lock won't be used again */ + ldlm_set_cbpending(lock); + lock->l_transno = 0; + list_move(&lock->l_slc_link, &list); + unlock_res_and_lock(lock); + } + spin_unlock(&lut->lut_slc_locks_guard); + + list_for_each_entry_safe(lock, next, &list, l_slc_link) { + list_del_init(&lock->l_slc_link); + ldlm_lock2handle(lock, &lockh); + ldlm_cli_cancel(&lockh, LCF_ASYNC); + LDLM_LOCK_PUT(lock); + } +} + +int tgt_init(const struct lu_env *env, struct lu_target *lut, + struct obd_device *obd, struct dt_device *dt, + struct tgt_opc_slice *slice, int request_fail_id, + int reply_fail_id) +{ + struct dt_object_format dof; + struct lu_attr attr; + struct lu_fid fid; + struct dt_object *o; + struct tg_grants_data *tgd = &lut->lut_tgd; + struct obd_statfs *osfs; + int i, rc = 0; + + ENTRY; + + LASSERT(lut); + LASSERT(obd); + lut->lut_obd = obd; + lut->lut_bottom = dt; + lut->lut_last_rcvd = NULL; + lut->lut_client_bitmap = NULL; + atomic_set(&lut->lut_num_clients, 0); + atomic_set(&lut->lut_client_generation, 0); + lut->lut_reply_data = NULL; + lut->lut_reply_bitmap = NULL; + obd->u.obt.obt_lut = lut; + obd->u.obt.obt_magic = OBT_MAGIC; + + /* set request handler slice and parameters */ + lut->lut_slice = slice; + lut->lut_reply_fail_id = reply_fail_id; + lut->lut_request_fail_id = request_fail_id; + + /* sptlrcp variables init */ + rwlock_init(&lut->lut_sptlrpc_lock); + sptlrpc_rule_set_init(&lut->lut_sptlrpc_rset); + + spin_lock_init(&lut->lut_flags_lock); + lut->lut_sync_lock_cancel = SYNC_LOCK_CANCEL_NEVER; + lut->lut_cksum_t10pi_enforce = 0; + lut->lut_cksum_types_supported = + obd_cksum_types_supported_server(obd->obd_name); + + spin_lock_init(&lut->lut_slc_locks_guard); + INIT_LIST_HEAD(&lut->lut_slc_locks); + + /* last_rcvd initialization is needed by replayable targets only */ + if (!obd->obd_replayable) + RETURN(0); + + /* initialize grant and statfs data in target */ + dt_conf_get(env, lut->lut_bottom, &lut->lut_dt_conf); + + /* statfs data */ + spin_lock_init(&tgd->tgd_osfs_lock); + tgd->tgd_osfs_age = ktime_get_seconds() - 1000; + tgd->tgd_osfs_unstable = 0; + tgd->tgd_statfs_inflight = 0; + tgd->tgd_osfs_inflight = 0; + + /* grant data */ + spin_lock_init(&tgd->tgd_grant_lock); + tgd->tgd_tot_dirty = 0; + tgd->tgd_tot_granted = 0; + tgd->tgd_tot_pending = 0; + tgd->tgd_grant_compat_disable = 0; + + /* populate cached statfs data */ + osfs = &tgt_th_info(env)->tti_u.osfs; + rc = tgt_statfs_internal(env, lut, osfs, 0, NULL); + if (rc != 0) { + CERROR("%s: can't get statfs data, rc %d\n", tgt_name(lut), + rc); + GOTO(out, rc); + } + if (!is_power_of_2(osfs->os_bsize)) { + CERROR("%s: blocksize (%d) is not a power of 2\n", + tgt_name(lut), osfs->os_bsize); + GOTO(out, rc = -EPROTO); + } + tgd->tgd_blockbits = fls(osfs->os_bsize) - 1; + + spin_lock_init(&lut->lut_translock); + spin_lock_init(&lut->lut_client_bitmap_lock); + + OBD_ALLOC(lut->lut_client_bitmap, LR_MAX_CLIENTS >> 3); + if 
(lut->lut_client_bitmap == NULL) + RETURN(-ENOMEM); + + memset(&attr, 0, sizeof(attr)); + attr.la_valid = LA_MODE; + attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + dof.dof_type = dt_mode_to_dft(S_IFREG); + + lu_local_obj_fid(&fid, LAST_RECV_OID); + + o = dt_find_or_create(env, lut->lut_bottom, &fid, &dof, &attr); + if (IS_ERR(o)) { + rc = PTR_ERR(o); + CERROR("%s: cannot open LAST_RCVD: rc = %d\n", tgt_name(lut), + rc); + GOTO(out_put, rc); + } + + lut->lut_last_rcvd = o; + rc = tgt_server_data_init(env, lut); + if (rc < 0) + GOTO(out_put, rc); + + /* prepare transactions callbacks */ + lut->lut_txn_cb.dtc_txn_start = tgt_txn_start_cb; + lut->lut_txn_cb.dtc_txn_stop = tgt_txn_stop_cb; + lut->lut_txn_cb.dtc_cookie = lut; + lut->lut_txn_cb.dtc_tag = LCT_DT_THREAD | LCT_MD_THREAD; + INIT_LIST_HEAD(&lut->lut_txn_cb.dtc_linkage); + + dt_txn_callback_add(lut->lut_bottom, &lut->lut_txn_cb); + lut->lut_bottom->dd_lu_dev.ld_site->ls_tgt = lut; + + lut->lut_fmd_max_num = LUT_FMD_MAX_NUM_DEFAULT; + lut->lut_fmd_max_age = LUT_FMD_MAX_AGE_DEFAULT; + + atomic_set(&lut->lut_sync_count, 0); + + /* reply_data is supported by MDT targets only for now */ + if (strncmp(obd->obd_type->typ_name, LUSTRE_MDT_NAME, 3) != 0) + RETURN(0); + + OBD_ALLOC(lut->lut_reply_bitmap, + LUT_REPLY_SLOTS_MAX_CHUNKS * sizeof(unsigned long *)); + if (lut->lut_reply_bitmap == NULL) + GOTO(out, rc = -ENOMEM); + + memset(&attr, 0, sizeof(attr)); + attr.la_valid = LA_MODE; + attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + dof.dof_type = dt_mode_to_dft(S_IFREG); + + lu_local_obj_fid(&fid, REPLY_DATA_OID); + + o = dt_find_or_create(env, lut->lut_bottom, &fid, &dof, &attr); + if (IS_ERR(o)) { + rc = PTR_ERR(o); + CERROR("%s: cannot open REPLY_DATA: rc = %d\n", tgt_name(lut), + rc); + GOTO(out, rc); + } + lut->lut_reply_data = o; + + rc = tgt_reply_data_init(env, lut); + if (rc < 0) + GOTO(out, rc); + + RETURN(0); + +out: + dt_txn_callback_del(lut->lut_bottom, &lut->lut_txn_cb); +out_put: + obd->u.obt.obt_magic = 0; + obd->u.obt.obt_lut = NULL; + if (lut->lut_last_rcvd != NULL) { + dt_object_put(env, lut->lut_last_rcvd); + lut->lut_last_rcvd = NULL; + } + if (lut->lut_client_bitmap != NULL) + OBD_FREE(lut->lut_client_bitmap, LR_MAX_CLIENTS >> 3); + lut->lut_client_bitmap = NULL; + if (lut->lut_reply_data != NULL) + dt_object_put(env, lut->lut_reply_data); + lut->lut_reply_data = NULL; + if (lut->lut_reply_bitmap != NULL) { + for (i = 0; i < LUT_REPLY_SLOTS_MAX_CHUNKS; i++) { + if (lut->lut_reply_bitmap[i] != NULL) + OBD_FREE_LARGE(lut->lut_reply_bitmap[i], + BITS_TO_LONGS(LUT_REPLY_SLOTS_PER_CHUNK) * + sizeof(long)); + lut->lut_reply_bitmap[i] = NULL; + } + OBD_FREE(lut->lut_reply_bitmap, + LUT_REPLY_SLOTS_MAX_CHUNKS * sizeof(unsigned long *)); + } + lut->lut_reply_bitmap = NULL; + return rc; +} +EXPORT_SYMBOL(tgt_init); + +void tgt_fini(const struct lu_env *env, struct lu_target *lut) +{ + int i; + int rc; + ENTRY; + + if (lut->lut_lsd.lsd_feature_incompat & OBD_INCOMPAT_MULTI_RPCS && + atomic_read(&lut->lut_num_clients) == 0) { + /* Clear MULTI RPCS incompatibility flag that prevents previous + * Lustre versions from mounting a target with a reply_data + * file */ + lut->lut_lsd.lsd_feature_incompat &= ~OBD_INCOMPAT_MULTI_RPCS; + rc = tgt_server_data_update(env, lut, 1); + if (rc < 0) + CERROR("%s: unable to clear MULTI RPCS " + "incompatibility flag\n", + lut->lut_obd->obd_name); + } + + sptlrpc_rule_set_free(&lut->lut_sptlrpc_rset); + + if (lut->lut_reply_data != NULL) + dt_object_put(env, lut->lut_reply_data); + lut->lut_reply_data = NULL; + if
(lut->lut_reply_bitmap != NULL) { + for (i = 0; i < LUT_REPLY_SLOTS_MAX_CHUNKS; i++) { + if (lut->lut_reply_bitmap[i] != NULL) + OBD_FREE_LARGE(lut->lut_reply_bitmap[i], + BITS_TO_LONGS(LUT_REPLY_SLOTS_PER_CHUNK) * + sizeof(long)); + lut->lut_reply_bitmap[i] = NULL; + } + OBD_FREE(lut->lut_reply_bitmap, + LUT_REPLY_SLOTS_MAX_CHUNKS * sizeof(unsigned long *)); + } + lut->lut_reply_bitmap = NULL; + if (lut->lut_client_bitmap) { + OBD_FREE(lut->lut_client_bitmap, LR_MAX_CLIENTS >> 3); + lut->lut_client_bitmap = NULL; + } + if (lut->lut_last_rcvd) { + dt_txn_callback_del(lut->lut_bottom, &lut->lut_txn_cb); + dt_object_put(env, lut->lut_last_rcvd); + lut->lut_last_rcvd = NULL; + } + EXIT; +} +EXPORT_SYMBOL(tgt_fini); + +static struct kmem_cache *tgt_thread_kmem; +static struct kmem_cache *tgt_session_kmem; +struct kmem_cache *tgt_fmd_kmem; + +static struct lu_kmem_descr tgt_caches[] = { + { + .ckd_cache = &tgt_thread_kmem, + .ckd_name = "tgt_thread_kmem", + .ckd_size = sizeof(struct tgt_thread_info), + }, + { + .ckd_cache = &tgt_session_kmem, + .ckd_name = "tgt_session_kmem", + .ckd_size = sizeof(struct tgt_session_info) + }, + { + .ckd_cache = &tgt_fmd_kmem, + .ckd_name = "tgt_fmd_cache", + .ckd_size = sizeof(struct tgt_fmd_data) + }, + { + .ckd_cache = NULL + } +}; + +/* context key constructor/destructor: tgt_key_init, tgt_key_fini */ +static void *tgt_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct tgt_thread_info *thread; + + OBD_SLAB_ALLOC_PTR_GFP(thread, tgt_thread_kmem, GFP_NOFS); + if (thread == NULL) + return ERR_PTR(-ENOMEM); + + return thread; +} + +static void tgt_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct tgt_thread_info *info = data; + struct thandle_exec_args *args = &info->tti_tea; + int i; + + for (i = 0; i < args->ta_alloc_args; i++) { + if (args->ta_args[i] != NULL) + OBD_FREE_PTR(args->ta_args[i]); + } + + if (args->ta_args != NULL) + OBD_FREE_PTR_ARRAY(args->ta_args, args->ta_alloc_args); + OBD_SLAB_FREE_PTR(info, tgt_thread_kmem); +} + +static void tgt_key_exit(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct tgt_thread_info *tti = data; + + tti->tti_has_trans = 0; + tti->tti_mult_trans = 0; +} + +/* context key: tgt_thread_key */ +struct lu_context_key tgt_thread_key = { + .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD, + .lct_init = tgt_key_init, + .lct_fini = tgt_key_fini, + .lct_exit = tgt_key_exit, +}; + +LU_KEY_INIT_GENERIC(tgt); + +static void *tgt_ses_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct tgt_session_info *session; + + OBD_SLAB_ALLOC_PTR_GFP(session, tgt_session_kmem, GFP_NOFS); + if (session == NULL) + return ERR_PTR(-ENOMEM); + + return session; +} + +static void tgt_ses_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct tgt_session_info *session = data; + + OBD_SLAB_FREE_PTR(session, tgt_session_kmem); +} + +/* context key: tgt_session_key */ +struct lu_context_key tgt_session_key = { + .lct_tags = LCT_SERVER_SESSION, + .lct_init = tgt_ses_key_init, + .lct_fini = tgt_ses_key_fini, +}; +EXPORT_SYMBOL(tgt_session_key); + +LU_KEY_INIT_GENERIC(tgt_ses); + +/* + * this page is allocated statically when the module is initializing; + * it is used to simulate data corruptions, see ost_checksum_bulk() + * for details. As the original pages provided by the layers below + * can remain in the internal cache, we do not want to modify + * them.
+ */ +struct page *tgt_page_to_corrupt; + +int tgt_mod_init(void) +{ + int result; + ENTRY; + + result = lu_kmem_init(tgt_caches); + if (result != 0) + RETURN(result); + + tgt_page_to_corrupt = alloc_page(GFP_KERNEL); + + tgt_key_init_generic(&tgt_thread_key, NULL); + lu_context_key_register_many(&tgt_thread_key, NULL); + + tgt_ses_key_init_generic(&tgt_session_key, NULL); + lu_context_key_register_many(&tgt_session_key, NULL); + barrier_init(); + + update_info_init(); + + RETURN(0); +} + +void tgt_mod_exit(void) +{ + barrier_fini(); + if (tgt_page_to_corrupt != NULL) + put_page(tgt_page_to_corrupt); + + lu_context_key_degister(&tgt_thread_key); + lu_context_key_degister(&tgt_session_key); + update_info_fini(); + + lu_kmem_fini(tgt_caches); +} + diff --git a/drivers/staging/lustrefsx/lustre/target/update_records.c b/drivers/staging/lustrefsx/lustre/target/update_records.c new file mode 100644 index 0000000000000..56a833c03069f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/update_records.c @@ -0,0 +1,1232 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2015, 2017, Intel Corporation. + */ + +/* + * lustre/target/update_records.c + * + * This file implement the methods to pack updates as update records, which + * will be written to the disk as llog record, and might be used during + * recovery. + * + * For cross-MDT operation, all of updates of the operation needs to be + * recorded in the disk, then during recovery phase, the recovery thread + * will retrieve and redo these updates if it needed. + * + * See comments above struct update_records for the format of update_records. + * + * Author: Di Wang + */ +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include + +#include "tgt_internal.h" + +#define UPDATE_RECORDS_BUFFER_SIZE 8192 +#define UPDATE_PARAMS_BUFFER_SIZE 8192 +/** + * Dump update record. + * + * Dump all of updates in the update_records, mostly for debugging purpose. + * + * \param[in] records update records to be dumpped + * \param[in] mask debug level mask + * \param[in] dump_params if dump all of updates the updates. 
+ * + */ +void update_records_dump(const struct update_records *records, + unsigned int mask, bool dump_updates) +{ + const struct update_ops *ops; + const struct update_op *op = NULL; + struct update_params *params = NULL; + unsigned int i; + + CDEBUG(mask, "master transno = %llu batchid = %llu flags = %x" + " ops = %d params = %d\n", records->ur_master_transno, + records->ur_batchid, records->ur_flags, records->ur_update_count, + records->ur_param_count); + + if (records->ur_update_count == 0) + return; + + if (!dump_updates) + return; + + ops = &records->ur_ops; + if (records->ur_param_count > 0) + params = update_records_get_params(records); + + op = &ops->uops_op[0]; + for (i = 0; i < records->ur_update_count; i++, + op = update_op_next_op(op)) { + unsigned int j; + + CDEBUG(mask, "update %dth "DFID" %s params_count = %hu\n", i, + PFID(&op->uop_fid), update_op_str(op->uop_type), + op->uop_param_count); + + if (params == NULL) + continue; + + for (j = 0; j < op->uop_param_count; j++) { + struct object_update_param *param; + + param = update_params_get_param(params, + (unsigned int)op->uop_params_off[j], + records->ur_param_count); + + if (param == NULL) + continue; + CDEBUG(mask, "param = %p %dth off = %hu size = %hu\n", + param, j, op->uop_params_off[j], param->oup_len); + } + } +} + +/** + * Pack parameters to update records + * + * Find and insert a parameter into update records; if the parameter + * already exists in \a params, then just return the offset of this + * parameter, otherwise insert the parameter and return its offset + * + * \param[in] params update params in which to insert parameter + * \param[in] new_param parameters to be inserted. + * \param[in] new_param_size the size of \a new_param + * + * \retval index inside \a params if parameter insertion + * succeeds. + * \retval negative errno if it fails. + */ +static unsigned int update_records_param_pack(struct update_params *params, + const void *new_param, + size_t new_param_size, + unsigned int *param_count) +{ + struct object_update_param *param; + unsigned int i; + + for (i = 0; i < *param_count; i++) { + struct object_update_param *param; + + param = update_params_get_param(params, i, *param_count); + if ((new_param == NULL && param->oup_len == new_param_size) || + (param->oup_len == new_param_size && + memcmp(param->oup_buf, new_param, new_param_size) == 0)) + /* Found the parameter and return its index */ + return i; + } + + param = (struct object_update_param *)((char *)params + + update_params_size(params, *param_count)); + + param->oup_len = new_param_size; + if (new_param != NULL) + memcpy(param->oup_buf, new_param, new_param_size); + + *param_count = *param_count + 1; + + return *param_count - 1; +} + +/** + * Pack update to update records + * + * Pack the update and its parameters to the update records. First it will + * insert the parameters, get the offsets of these parameters, then fill the + * update with these offsets. If insertion exceeds the maximum size of the + * current update records, it will return -E2BIG here, and the caller might + * extend the update_record size \see lod_updates_pack. + * + * \param[in] env execution environment + * \param[in] fid FID of the update.
+ * \param[in] op_type operation type of the update + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] param_bufs buffers of parameters + * \param[in] param_bufs_count the count of the parameter buffers + * \param[in] param_sizes sizes of parameters + * + * \retval 0 if packing succeeds + * \retval negative errno if packing fails + */ +static int update_records_update_pack(const struct lu_env *env, + const struct lu_fid *fid, + enum update_type op_type, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_op_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + unsigned int param_bufs_count, + const void **param_bufs, + size_t *param_sizes) +{ + struct update_op *op; + size_t total_param_sizes = 0; + int index; + unsigned int i; + + /* Check whether the packing exceeds the maximum update size */ + if (unlikely(*max_op_size < update_op_size(param_bufs_count))) { + CDEBUG(D_INFO, "max_op_size = %zu update_op = %zu\n", + *max_op_size, update_op_size(param_bufs_count)); + *max_op_size = update_op_size(param_bufs_count); + return -E2BIG; + } + + for (i = 0; i < param_bufs_count; i++) + total_param_sizes += + cfs_size_round(sizeof(struct object_update_param) + + param_sizes[i]); + + /* Check whether the packing exceeds the maximum parameter size */ + if (unlikely(*max_param_size < total_param_sizes)) { + CDEBUG(D_INFO, "max_param_size = %zu params size = %zu\n", + *max_param_size, total_param_sizes); + + *max_param_size = total_param_sizes; + return -E2BIG; + } + + op = update_ops_get_op(ops, *op_count, *op_count); + op->uop_fid = *fid; + op->uop_type = op_type; + op->uop_param_count = param_bufs_count; + for (i = 0; i < param_bufs_count; i++) { + index = update_records_param_pack(params, param_bufs[i], + param_sizes[i], param_count); + if (index < 0) + return index; + + CDEBUG(D_INFO, "%s %uth param offset = %d size = %zu\n", + update_op_str(op_type), i, index, param_sizes[i]); + + op->uop_params_off[i] = index; + } + CDEBUG(D_INFO, "%huth "DFID" %s param_count = %u\n", + *op_count, PFID(fid), update_op_str(op_type), *param_count); + + *op_count = *op_count + 1; + + return 0; +} + +/** + * Calculate update_records size + * + * Calculate update_records size by param_count and param_sizes array. + * + * \param[in] param_count the count of parameters + * \param[in] sizes the size array of these parameters + * + * \retval the size of this update + */ +static size_t update_records_update_size(__u32 param_count, size_t *sizes) +{ + int i; + size_t size; + + /* base size of the update op */ + size = update_op_size(param_count); + + for (i = 0; i < param_count; i++) + size += cfs_size_round(sizeof(struct object_update_param) + + sizes[i]); + + return size; +} + +/** + * Calculate create update size + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in] fid FID of the object to be created + * \param[in] attr attribute of the object to be created + * \param[in] hint creation hint + * \param[in] dof creation format information + * + * \retval size of create update.
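+ * + * For example (illustrative): creating an object with both a valid + * attr and a parent hint packs two parameters, so the result is + * update_op_size(2) plus, for each parameter, + * cfs_size_round(sizeof(struct object_update_param) + payload size).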
+ */ +size_t update_records_create_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct lu_attr *attr, + const struct dt_allocation_hint *hint, + struct dt_object_format *dof) +{ + size_t sizes[2]; + int param_count = 0; + + if (attr != NULL) { + sizes[param_count] = sizeof(struct obdo); + param_count++; + } + + if (hint != NULL && hint->dah_parent != NULL) { + sizes[param_count] = sizeof(*fid); + param_count++; + } + + return update_records_update_size(param_count, sizes); +} +EXPORT_SYMBOL(update_records_create_size); + +/** + * Pack create update + * + * Pack create update into update records. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to be created + * \param[in] attr attribute of the object to be created + * \param[in] hint creation hint + * \param[in] dof creation format information + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. + */ +int update_records_create_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct lu_attr *attr, + const struct dt_allocation_hint *hint, + struct dt_object_format *dof) +{ + size_t sizes[2]; + const void *bufs[2]; + int buf_count = 0; + const struct lu_fid *parent_fid = NULL; + struct lu_fid tmp_fid; + int rc; + struct obdo *obdo; + + if (attr != NULL) { + obdo = &update_env_info(env)->uti_obdo; + obdo->o_valid = 0; + obdo_from_la(obdo, attr, attr->la_valid); + bufs[buf_count] = obdo; + sizes[buf_count] = sizeof(*obdo); + buf_count++; + } + + if (hint != NULL && hint->dah_parent != NULL) { + parent_fid = lu_object_fid(&hint->dah_parent->do_lu); + fid_cpu_to_le(&tmp_fid, parent_fid); + bufs[buf_count] = &tmp_fid; + sizes[buf_count] = sizeof(tmp_fid); + buf_count++; + } + + rc = update_records_update_pack(env, fid, OUT_CREATE, ops, op_count, + max_ops_size, params, param_count, + max_param_size, buf_count, bufs, sizes); + return rc; +} +EXPORT_SYMBOL(update_records_create_pack); + +/** + * Calculate attr set update size + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in] fid FID of the object to set attr + * \param[in] attr attribute of attr set + * + * \retval size of attr set update. + */ +size_t update_records_attr_set_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct lu_attr *attr) +{ + size_t size = sizeof(struct obdo); + + return update_records_update_size(1, &size); +} +EXPORT_SYMBOL(update_records_attr_set_size); + +/** + * Pack attr set update + * + * Pack attr_set update into update records. 
+ * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to set attr + * \param[in] attr attribute of attr set + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. + */ +int update_records_attr_set_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct lu_attr *attr) +{ + struct obdo *obdo = &update_env_info(env)->uti_obdo; + size_t size = sizeof(*obdo); + + obdo->o_valid = 0; + obdo_from_la(obdo, attr, attr->la_valid); + return update_records_update_pack(env, fid, OUT_ATTR_SET, ops, op_count, + max_ops_size, params, param_count, + max_param_size, 1, + (const void **)&obdo, &size); +} +EXPORT_SYMBOL(update_records_attr_set_pack); + +/** + * Calculate ref add update size + * + * \param[in] env execution environment + * \param[in] fid FID of the object to add reference + * + * \retval size of ref_add udpate. + */ +size_t update_records_ref_add_size(const struct lu_env *env, + const struct lu_fid *fid) +{ + return update_records_update_size(0, NULL); +} +EXPORT_SYMBOL(update_records_ref_add_size); + +/** + * Pack ref add update + * + * Pack ref add update into update records. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to add reference + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. + */ +int update_records_ref_add_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid) +{ + return update_records_update_pack(env, fid, OUT_REF_ADD, ops, op_count, + max_ops_size, params, param_count, + max_param_size, 0, NULL, NULL); +} +EXPORT_SYMBOL(update_records_ref_add_pack); + +/** + * Pack noop update + * + * Pack no op update into update records. Note: no op means + * the update does not need do anything, which is only used + * in test case to verify large size record. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to add reference + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. 
+ */ +int update_records_noop_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid) +{ + return update_records_update_pack(env, fid, OUT_NOOP, ops, op_count, + max_ops_size, params, param_count, + max_param_size, 0, NULL, NULL); +} +EXPORT_SYMBOL(update_records_noop_pack); + +/** + * Calculate ref del update size + * + * \param[in] env execution environment + * \param[in] fid FID of the object to delete reference + * + * \retval size of ref_del update. + */ +size_t update_records_ref_del_size(const struct lu_env *env, + const struct lu_fid *fid) +{ + return update_records_update_size(0, NULL); +} +EXPORT_SYMBOL(update_records_ref_del_size); + +/** + * Pack ref del update + * + * Pack ref del update into update records. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to delete reference + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. + */ +int update_records_ref_del_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid) +{ + return update_records_update_pack(env, fid, OUT_REF_DEL, ops, op_count, + max_ops_size, params, param_count, + max_param_size, 0, NULL, NULL); +} +EXPORT_SYMBOL(update_records_ref_del_pack); + +/** + * Calculate object destroy update size + * + * \param[in] env execution environment + * \param[in] fid FID of the object to delete reference + * + * \retval size of object destroy update. + */ +size_t update_records_destroy_size(const struct lu_env *env, + const struct lu_fid *fid) +{ + return update_records_update_size(0, NULL); +} +EXPORT_SYMBOL(update_records_destroy_size); + +/** + * Pack object destroy update + * + * Pack object destroy update into update records. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to delete reference + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. 
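+ *
+ * Like the ref add/del updates above, a destroy update carries no
+ * parameters, so only the opcode and the FID are packed, e.g.
+ * (an illustrative sketch only):
+ *
+ *	rc = update_records_destroy_pack(env, ops, &op_count,
+ *					 &max_op_size, params, &param_count,
+ *					 &max_param_size, fid);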
+ */ +int update_records_destroy_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid) +{ + return update_records_update_pack(env, fid, OUT_DESTROY, ops, op_count, + max_ops_size, params, param_count, + max_param_size, 0, NULL, NULL); +} +EXPORT_SYMBOL(update_records_destroy_pack); + +/** + * Calculate index insert update size + * + * \param[in] env execution environment + * \param[in] fid FID of the object to insert index + * \param[in] rec record of insertion + * \param[in] key key of insertion + * + * \retval the size of index insert update. + */ +size_t update_records_index_insert_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct dt_rec *rec, + const struct dt_key *key) +{ + size_t sizes[3] = { strlen((const char *)key) + 1, + sizeof(struct lu_fid), + sizeof(__u32) }; + return update_records_update_size(3, sizes); +} +EXPORT_SYMBOL(update_records_index_insert_size); + +/** + * Pack index insert update + * + * Pack index insert update into update records. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to insert index + * \param[in] rec record of insertion + * \param[in] key key of insertion + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. + */ +int update_records_index_insert_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct dt_rec *rec, + const struct dt_key *key) +{ + struct dt_insert_rec *rec1 = (struct dt_insert_rec *)rec; + struct lu_fid rec_fid; + __u32 type = cpu_to_le32(rec1->rec_type); + size_t sizes[3] = { strlen((const char *)key) + 1, + sizeof(rec_fid), + sizeof(type) }; + const void *bufs[3] = { key, + &rec_fid, + &type }; + + fid_cpu_to_le(&rec_fid, rec1->rec_fid); + + return update_records_update_pack(env, fid, OUT_INDEX_INSERT, ops, + op_count, max_ops_size, params, + param_count, max_param_size, + 3, bufs, sizes); +} +EXPORT_SYMBOL(update_records_index_insert_pack); + +/** + * Calculate index delete update size + * + * \param[in] env execution environment + * \param[in] fid FID of the object to delete index + * \param[in] key key of deletion + * + * \retval the size of index delete update + */ +size_t update_records_index_delete_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct dt_key *key) +{ + size_t size = strlen((const char *)key) + 1; + + return update_records_update_size(1, &size); +} +EXPORT_SYMBOL(update_records_index_delete_size); + +/** + * Pack index delete update + * + * Pack index delete update into update records. 
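+ *
+ * Unlike index insert, which packs three parameters (the name key, the
+ * little-endian FID and the 32-bit record type), index delete packs a
+ * single parameter, the null-terminated name key.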
+ *
+ * \param[in] env	execution environment
+ * \param[in] ops	ur_ops in update records
+ * \param[in|out] op_count	pointer to the count of ops
+ * \param[in|out] max_op_size	maximum size of the update
+ * \param[in] params	ur_params in update records
+ * \param[in|out] param_count	pointer to the count of params
+ * \param[in|out] max_param_size	maximum size of the parameter
+ * \param[in] fid	FID of the object to delete index
+ * \param[in] key	key of deletion
+ *
+ * \retval		0 if packing succeeds.
+ * \retval		negative errno if packing fails.
+ */
+int update_records_index_delete_pack(const struct lu_env *env,
+				     struct update_ops *ops,
+				     unsigned int *op_count,
+				     size_t *max_ops_size,
+				     struct update_params *params,
+				     unsigned int *param_count,
+				     size_t *max_param_size,
+				     const struct lu_fid *fid,
+				     const struct dt_key *key)
+{
+	size_t size = strlen((const char *)key) + 1;
+
+	return update_records_update_pack(env, fid, OUT_INDEX_DELETE, ops,
+					  op_count, max_ops_size, params,
+					  param_count, max_param_size,
+					  1, (const void **)&key, &size);
+}
+EXPORT_SYMBOL(update_records_index_delete_pack);
+
+/**
+ * Calculate xattr set size
+ *
+ * \param[in] env	execution environment
+ * \param[in] fid	FID of the object to set xattr
+ * \param[in] buf	xattr to be set
+ * \param[in] name	name of the xattr
+ * \param[in] flag	flag for setting xattr
+ *
+ * \retval		size of xattr set update.
+ */
+size_t update_records_xattr_set_size(const struct lu_env *env,
+				     const struct lu_fid *fid,
+				     const struct lu_buf *buf,
+				     const char *name, __u32 flag)
+{
+	size_t sizes[3] = {strlen(name) + 1, buf->lb_len, sizeof(flag)};
+
+	return update_records_update_size(3, sizes);
+}
+EXPORT_SYMBOL(update_records_xattr_set_size);
+
+/**
+ * Pack xattr set update
+ *
+ * Pack xattr set update into update records.
+ *
+ * \param[in] env	execution environment
+ * \param[in] ops	ur_ops in update records
+ * \param[in|out] op_count	pointer to the count of ops
+ * \param[in|out] max_op_size	maximum size of the update
+ * \param[in] params	ur_params in update records
+ * \param[in|out] param_count	pointer to the count of params
+ * \param[in|out] max_param_size	maximum size of the parameter
+ * \param[in] fid	FID of the object to set xattr
+ * \param[in] buf	xattr to be set
+ * \param[in] name	name of the xattr
+ * \param[in] flag	flag for setting xattr
+ *
+ * \retval		0 if packing succeeds.
+ * \retval		negative errno if packing fails.
+ */
+int update_records_xattr_set_pack(const struct lu_env *env,
+				  struct update_ops *ops,
+				  unsigned int *op_count,
+				  size_t *max_ops_size,
+				  struct update_params *params,
+				  unsigned int *param_count,
+				  size_t *max_param_size,
+				  const struct lu_fid *fid,
+				  const struct lu_buf *buf, const char *name,
+				  __u32 flag)
+{
+	size_t sizes[3] = {strlen(name) + 1, buf->lb_len, sizeof(flag)};
+	const void *bufs[3] = {name, buf->lb_buf, &flag};
+
+	flag = cpu_to_le32(flag);
+
+	return update_records_update_pack(env, fid, OUT_XATTR_SET, ops,
+					  op_count, max_ops_size, params,
+					  param_count, max_param_size,
+					  3, bufs, sizes);
+}
+EXPORT_SYMBOL(update_records_xattr_set_pack);
+
+/**
+ * Calculate xattr delete update size.
+ *
+ * \param[in] env	execution environment
+ * \param[in] fid	FID of the object to delete xattr
+ * \param[in] name	name of the xattr
+ *
+ * \retval		size of xattr delete update.
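+ *
+ * Note that each *_size helper must account for exactly the parameters
+ * which its *_pack counterpart packs (here a single parameter, the
+ * null-terminated xattr name), otherwise the space reserved for the
+ * record would not match what is actually written.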
+ */
+size_t update_records_xattr_del_size(const struct lu_env *env,
+				     const struct lu_fid *fid,
+				     const char *name)
+{
+	size_t size = strlen(name) + 1;
+
+	return update_records_update_size(1, &size);
+}
+EXPORT_SYMBOL(update_records_xattr_del_size);
+
+/**
+ * Pack xattr delete update
+ *
+ * Pack xattr delete update into update records.
+ *
+ * \param[in] env	execution environment
+ * \param[in] ops	ur_ops in update records
+ * \param[in|out] op_count	pointer to the count of ops
+ * \param[in|out] max_op_size	maximum size of the update
+ * \param[in] params	ur_params in update records
+ * \param[in|out] param_count	pointer to the count of params
+ * \param[in|out] max_param_size	maximum size of the parameter
+ * \param[in] fid	FID of the object to delete xattr
+ * \param[in] name	name of the xattr
+ *
+ * \retval		0 if packing succeeds.
+ * \retval		negative errno if packing fails.
+ */
+int update_records_xattr_del_pack(const struct lu_env *env,
+				  struct update_ops *ops,
+				  unsigned int *op_count,
+				  size_t *max_ops_size,
+				  struct update_params *params,
+				  unsigned int *param_count,
+				  size_t *max_param_size,
+				  const struct lu_fid *fid,
+				  const char *name)
+{
+	size_t size = strlen(name) + 1;
+
+	return update_records_update_pack(env, fid, OUT_XATTR_DEL, ops,
+					  op_count, max_ops_size, params,
+					  param_count, max_param_size,
+					  1, (const void **)&name, &size);
+}
+EXPORT_SYMBOL(update_records_xattr_del_pack);
+
+/**
+ * Calculate write update size
+ *
+ * \param[in] env	execution environment
+ * \param[in] fid	FID of the object to write into
+ * \param[in] buf	buffer to write which includes an embedded size field
+ * \param[in] pos	offset in the object to start writing at
+ *
+ * \retval		size of write update.
+ */
+size_t update_records_write_size(const struct lu_env *env,
+				 const struct lu_fid *fid,
+				 const struct lu_buf *buf,
+				 __u64 pos)
+{
+	size_t sizes[2] = {buf->lb_len, sizeof(pos)};
+
+	return update_records_update_size(2, sizes);
+}
+EXPORT_SYMBOL(update_records_write_size);
+
+/**
+ * Pack write update
+ *
+ * Pack write update into update records.
+ *
+ * \param[in] env	execution environment
+ * \param[in] ops	ur_ops in update records
+ * \param[in|out] op_count	pointer to the count of ops
+ * \param[in|out] max_op_size	maximum size of the update
+ * \param[in] params	ur_params in update records
+ * \param[in|out] param_count	pointer to the count of params
+ * \param[in|out] max_param_size	maximum size of the parameter
+ * \param[in] fid	FID of the object to write into
+ * \param[in] buf	buffer to write which includes an embedded size field
+ * \param[in] pos	offset in the object to start writing at
+ *
+ * \retval		0 if packing succeeds.
+ * \retval		negative errno if packing fails.
+ */
+int update_records_write_pack(const struct lu_env *env,
+			      struct update_ops *ops,
+			      unsigned int *op_count,
+			      size_t *max_ops_size,
+			      struct update_params *params,
+			      unsigned int *param_count,
+			      size_t *max_param_size,
+			      const struct lu_fid *fid,
+			      const struct lu_buf *buf,
+			      __u64 pos)
+{
+	size_t sizes[2] = {buf->lb_len, sizeof(pos)};
+	const void *bufs[2] = {buf->lb_buf, &pos};
+
+	pos = cpu_to_le64(pos);
+
+	return update_records_update_pack(env, fid, OUT_WRITE, ops,
+					  op_count, max_ops_size, params,
+					  param_count, max_param_size,
+					  2, bufs, sizes);
+}
+EXPORT_SYMBOL(update_records_write_pack);
+
+/**
+ * Calculate size of punch update.
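+ *
+ * A punch update packs two 64-bit offsets, which are converted to
+ * little-endian before packing, as in the write update above, e.g.
+ * (an illustrative sketch only):
+ *
+ *	size = update_records_punch_size(env, fid, start, end);
+ *	rc = update_records_punch_pack(env, ops, &op_count, &max_op_size,
+ *				       params, &param_count,
+ *				       &max_param_size, fid, start, end);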
+ *
+ * \param[in] env	execution environment
+ * \param[in] fid	FID of the object to write into
+ * \param[in] start	start offset of punch
+ * \param[in] end	end offset of punch
+ *
+ * \retval		size of punch update.
+ */
+size_t update_records_punch_size(const struct lu_env *env,
+				 const struct lu_fid *fid,
+				 __u64 start, __u64 end)
+{
+	size_t sizes[2] = {sizeof(start), sizeof(end)};
+
+	return update_records_update_size(2, sizes);
+}
+EXPORT_SYMBOL(update_records_punch_size);
+
+/**
+ * Pack punch update
+ *
+ * Pack punch update into update records.
+ *
+ * \param[in] env	execution environment
+ * \param[in] ops	ur_ops in update records
+ * \param[in|out] op_count	pointer to the count of ops
+ * \param[in|out] max_op_size	maximum size of the update
+ * \param[in] params	ur_params in update records
+ * \param[in|out] param_count	pointer to the count of params
+ * \param[in|out] max_param_size	maximum size of the parameter
+ * \param[in] fid	FID of the object to write into
+ * \param[in] start	start offset of punch
+ * \param[in] end	end offset of punch
+ *
+ * \retval		0 if packing succeeds.
+ * \retval		negative errno if packing fails.
+ */
+int update_records_punch_pack(const struct lu_env *env,
+			      struct update_ops *ops,
+			      unsigned int *op_count,
+			      size_t *max_ops_size,
+			      struct update_params *params,
+			      unsigned int *param_count,
+			      size_t *max_param_size,
+			      const struct lu_fid *fid,
+			      __u64 start, __u64 end)
+{
+	size_t sizes[2] = {sizeof(start), sizeof(end)};
+	const void *bufs[2] = {&start, &end};
+
+	start = cpu_to_le64(start);
+	end = cpu_to_le64(end);
+
+	return update_records_update_pack(env, fid, OUT_PUNCH, ops, op_count,
+					  max_ops_size, params, param_count,
+					  max_param_size, 2, bufs, sizes);
+}
+EXPORT_SYMBOL(update_records_punch_pack);
+
+/**
+ * Create update records in thandle_update_records
+ *
+ * Allocate update_records for thandle_update_records, the initial size
+ * will be 4KB.
+ *
+ * \param[in] tur	thandle_update_records where update_records will be
+ *			allocated
+ * \retval		0 if allocation succeeds.
+ * \retval		negative errno if allocation fails.
+ */
+static int tur_update_records_create(struct thandle_update_records *tur)
+{
+	if (tur->tur_update_records != NULL)
+		return 0;
+
+	OBD_ALLOC_LARGE(tur->tur_update_records,
+			UPDATE_RECORDS_BUFFER_SIZE);
+
+	if (tur->tur_update_records == NULL)
+		return -ENOMEM;
+
+	tur->tur_update_records_buf_size = UPDATE_RECORDS_BUFFER_SIZE;
+
+	return 0;
+}
+
+/**
+ * Extend update records
+ *
+ * Extend update_records to the new size in thandle_update_records.
+ *
+ * \param[in] tur	thandle_update_records where update_records will be
+ *			extended.
+ * \param[in] new_size	new size of the update records buffer
+ * \retval		0 if extension succeeds.
+ * \retval		negative errno if extension fails.
+ */
+int tur_update_records_extend(struct thandle_update_records *tur,
+			      size_t new_size)
+{
+	struct llog_update_record *record;
+
+	OBD_ALLOC_LARGE(record, new_size);
+	if (record == NULL)
+		return -ENOMEM;
+
+	if (tur->tur_update_records != NULL) {
+		memcpy(record, tur->tur_update_records,
+		       tur->tur_update_records_buf_size);
+		OBD_FREE_LARGE(tur->tur_update_records,
+			       tur->tur_update_records_buf_size);
+	}
+
+	tur->tur_update_records = record;
+	tur->tur_update_records_buf_size = new_size;
+
+	return 0;
+}
+EXPORT_SYMBOL(tur_update_records_extend);
+
+/**
+ * Extend update records
+ *
+ * Extend update records in thandle to make sure it is able to hold
+ * the update with certain update_op and params size.
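+ *
+ * Together with the packing helpers this gives the usual retry pattern,
+ * e.g. (an illustrative sketch only):
+ *
+ *	do {
+ *		rc = update_records_xxx_pack(env, ..., &max_op_size,
+ *					     ..., &max_param_size, ...);
+ *		if (rc == -E2BIG)
+ *			rc = tur_update_extend(tur, max_op_size,
+ *					       max_param_size);
+ *	} while (rc == -E2BIG);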
+ * + * \param [in] tur thandle_update_records to be extend + * \param [in] new_op_size update_op size of the update record + * \param [in] new_param_size params size of the update record + * + * \retval 0 if the update_records is being extended. + * \retval negative errno if the update_records is not being + * extended. + */ +int tur_update_extend(struct thandle_update_records *tur, + size_t new_op_size, size_t new_param_size) +{ + size_t record_size; + size_t params_size; + size_t extend_size; + int rc; + ENTRY; + + record_size = llog_update_record_size(tur->tur_update_records); + /* extend update records buffer */ + if (new_op_size >= (tur->tur_update_records_buf_size - record_size)) { + extend_size = round_up(new_op_size, UPDATE_RECORDS_BUFFER_SIZE); + rc = tur_update_records_extend(tur, + tur->tur_update_records_buf_size + + extend_size); + if (rc != 0) + RETURN(rc); + } + + /* extend parameters buffer */ + params_size = update_params_size(tur->tur_update_params, + tur->tur_update_param_count); + if (new_param_size >= (tur->tur_update_params_buf_size - + params_size)) { + extend_size = round_up(new_param_size, + UPDATE_PARAMS_BUFFER_SIZE); + rc = tur_update_params_extend(tur, + tur->tur_update_params_buf_size + + extend_size); + if (rc != 0) + RETURN(rc); + } + + RETURN(0); +} +EXPORT_SYMBOL(tur_update_extend); + +/** + * Create update params in thandle_update_records + * + * Allocate update_params for thandle_update_records, the initial size + * will be 4KB. + * + * \param[in] tur thandle_update_records where update_params will be + * allocated + * \retval 0 if allocation succeeds. + * \retval negative errno if allocation fails. + */ +static int tur_update_params_create(struct thandle_update_records *tur) +{ + if (tur->tur_update_params != NULL) + return 0; + + OBD_ALLOC_LARGE(tur->tur_update_params, UPDATE_PARAMS_BUFFER_SIZE); + if (tur->tur_update_params == NULL) + return -ENOMEM; + + tur->tur_update_params_buf_size = UPDATE_PARAMS_BUFFER_SIZE; + return 0; +} + +/** + * Extend update params + * + * Extend update_params to the new size in thandle_update_records. + * + * \param[in] tur thandle_update_records where update_params will be + * extended. + * \retval 0 if extension succeeds. + * \retval negative errno if extension fails. + */ +int tur_update_params_extend(struct thandle_update_records *tur, + size_t new_size) +{ + struct update_params *params; + + OBD_ALLOC_LARGE(params, new_size); + if (params == NULL) + return -ENOMEM; + + if (tur->tur_update_params != NULL) { + memcpy(params, tur->tur_update_params, + tur->tur_update_params_buf_size); + OBD_FREE_LARGE(tur->tur_update_params, + tur->tur_update_params_buf_size); + } + + tur->tur_update_params = params; + tur->tur_update_params_buf_size = new_size; + + return 0; +} +EXPORT_SYMBOL(tur_update_params_extend); + +/** + * Check and prepare whether it needs to record update. + * + * Checks if the transaction needs to record updates, and if it + * does, then initialize the update record buffer in the transaction. + * + * \param[in] env execution environment + * \param[in] th transaction handle + * + * \retval 0 if updates recording succeeds. + * \retval negative errno if updates recording fails. 
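+ *
+ * A transaction which records updates is expected to call this once
+ * before packing any update, e.g. (an illustrative sketch only):
+ *
+ *	rc = check_and_prepare_update_record(env, tur);
+ *	if (rc == 0)
+ *		... pack updates into tur->tur_update_records ...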
+ */ +int check_and_prepare_update_record(const struct lu_env *env, + struct thandle_update_records *tur) +{ + struct llog_update_record *lur; + int rc; + + if (tur->tur_update_records == NULL) { + rc = tur_update_records_create(tur); + if (rc < 0) + RETURN(rc); + } + + if (tur->tur_update_params == NULL) { + rc = tur_update_params_create(tur); + if (rc < 0) + RETURN(rc); + } + + lur = tur->tur_update_records; + lur->lur_update_rec.ur_update_count = 0; + lur->lur_update_rec.ur_param_count = 0; + lur->lur_update_rec.ur_master_transno = 0; + lur->lur_update_rec.ur_batchid = 0; + lur->lur_update_rec.ur_flags = 0; + lur->lur_hdr.lrh_len = LLOG_MIN_CHUNK_SIZE; + + tur->tur_update_param_count = 0; + + RETURN(0); +} + +static void update_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct update_thread_info *info = data; + struct thandle_exec_args *args = &info->uti_tea; + int i; + + for (i = 0; i < args->ta_alloc_args; i++) { + if (args->ta_args[i] != NULL) + OBD_FREE_PTR(args->ta_args[i]); + } + + if (args->ta_args != NULL) + OBD_FREE_PTR_ARRAY(args->ta_args, args->ta_alloc_args); + + if (info->uti_tur.tur_update_records != NULL) + OBD_FREE_LARGE(info->uti_tur.tur_update_records, + info->uti_tur.tur_update_records_buf_size); + if (info->uti_tur.tur_update_params != NULL) + OBD_FREE_LARGE(info->uti_tur.tur_update_params, + info->uti_tur.tur_update_params_buf_size); + + OBD_FREE_PTR(info); +} + +/* context key constructor/destructor: update_key_init, update_key_fini */ +LU_KEY_INIT(update, struct update_thread_info); +/* context key: update_thread_key */ +LU_CONTEXT_KEY_DEFINE(update, LCT_MD_THREAD | LCT_MG_THREAD | + LCT_DT_THREAD | LCT_LOCAL); +EXPORT_SYMBOL(update_thread_key); +LU_KEY_INIT_GENERIC(update); + +void update_info_init(void) +{ + update_key_init_generic(&update_thread_key, NULL); + lu_context_key_register(&update_thread_key); +} + +void update_info_fini(void) +{ + lu_context_key_degister(&update_thread_key); +} diff --git a/drivers/staging/lustrefsx/lustre/target/update_recovery.c b/drivers/staging/lustrefsx/lustre/target/update_recovery.c new file mode 100644 index 0000000000000..b483a26c5857c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/update_recovery.c @@ -0,0 +1,1451 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2015, 2017, Intel Corporation. + */ + +/* + * lustre/target/update_recovery.c + * + * This file implement the methods to handle the update recovery. + * + * During DNE recovery, the recovery thread will redo the operation according + * to the transaction no, and these replay are either from client replay req + * or update replay records(for distribute transaction) in the update log. 
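+ *
+ * In terms of the helpers below, the replay flow is roughly as follows:
+ * insert_update_records_to_replay_list() builds the sorted replay list
+ * from the update log, distribute_txn_get_next_req() hands the requests
+ * to the recovery thread in master transno order, and
+ * distribute_txn_replay_handle() redoes the missing updates.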
+ * For distribute transaction replay, the replay thread will call
+ * distribute_txn_replay_handle() to handle the updates.
+ *
+ * After the master MDT restarts, it will retrieve the update records from
+ * all of the MDTs. For each distributed operation it checks the updates on
+ * all MDTs; if some update records are missing on some MDTs, the replay
+ * thread will redo the updates on those MDTs.
+ *
+ * Author: Di Wang
+ */
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "tgt_internal.h"
+
+/**
+ * Lookup distribute_txn_replay req
+ *
+ * Lookup distribute_txn_replay req in the replay list by batchid.
+ * It is assumed the list has been locked before calling this function.
+ *
+ * \param[in] tdtd	distribute_txn_data, which holds the replay
+ *			list.
+ * \param[in] batchid	batchid used by the lookup.
+ *
+ * \retval		pointer to the replay req if found.
+ * \retval		NULL if it cannot be found.
+ */
+static struct distribute_txn_replay_req *
+dtrq_lookup(struct target_distribute_txn_data *tdtd, __u64 batchid)
+{
+	struct distribute_txn_replay_req *tmp;
+	struct distribute_txn_replay_req *dtrq = NULL;
+
+	list_for_each_entry(tmp, &tdtd->tdtd_replay_list, dtrq_list) {
+		if (tmp->dtrq_batchid == batchid) {
+			dtrq = tmp;
+			break;
+		}
+	}
+	return dtrq;
+}
+
+/**
+ * Insert distribute txn replay req
+ *
+ * Insert the distribute txn replay req into the replay list. Note: the
+ * replay list is a sorted list, sorted by master transno. It is assumed
+ * the replay list has been locked before calling this function.
+ *
+ * \param[in] tdtd	target distribute txn data where replay list is
+ * \param[in] new	distribute txn replay req to be inserted
+ *
+ * \retval		0 if insertion succeeds
+ * \retval		-EEXIST if the dtrq already exists
+ */
+static int dtrq_insert(struct target_distribute_txn_data *tdtd,
+		       struct distribute_txn_replay_req *new)
+{
+	struct distribute_txn_replay_req *iter;
+
+	/* Check if the dtrq has already been added to the list */
+	iter = dtrq_lookup(tdtd, new->dtrq_batchid);
+	if (iter != NULL)
+		return -EEXIST;
+
+	list_for_each_entry_reverse(iter, &tdtd->tdtd_replay_list, dtrq_list) {
+		if (iter->dtrq_master_transno > new->dtrq_master_transno)
+			continue;
+
+		/* If there are multiple replay reqs with the same transno,
+		 * then sort them by batchid */
+		if (iter->dtrq_master_transno == new->dtrq_master_transno &&
+		    iter->dtrq_batchid > new->dtrq_batchid)
+			continue;
+
+		list_add(&new->dtrq_list, &iter->dtrq_list);
+		break;
+	}
+
+	if (list_empty(&new->dtrq_list))
+		list_add(&new->dtrq_list, &tdtd->tdtd_replay_list);
+
+	return 0;
+}
+
+/**
+ * Create distribute txn replay req
+ *
+ * Allocate a distribute txn replay req according to the update records.
+ *
+ * \param[in] tdtd	target distribute txn data where replay list is.
+ * \param[in] lur	update records from the update log.
+ *
+ * \retval		the pointer of distribute txn replay req if
+ *			the creation succeeds.
+ * \retval		ERR_PTR(errno) if the creation fails.
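+ *
+ * Callers are expected to check the result with IS_ERR(), following the
+ * usual lookup/create/insert pattern under tdtd_replay_list_lock, e.g.
+ * (an illustrative sketch only, locking omitted):
+ *
+ *	dtrq = dtrq_lookup(tdtd, batchid);
+ *	if (dtrq == NULL)
+ *		dtrq = dtrq_create(tdtd, lur);
+ *	if (IS_ERR(dtrq))
+ *		return PTR_ERR(dtrq);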
+ */
+static struct distribute_txn_replay_req *
+dtrq_create(struct target_distribute_txn_data *tdtd,
+	    struct llog_update_record *lur)
+{
+	struct distribute_txn_replay_req *new;
+
+	OBD_ALLOC_PTR(new);
+	if (new == NULL)
+		RETURN(ERR_PTR(-ENOMEM));
+
+	new->dtrq_lur_size = llog_update_record_size(lur);
+	OBD_ALLOC_LARGE(new->dtrq_lur, new->dtrq_lur_size);
+	if (new->dtrq_lur == NULL) {
+		OBD_FREE_PTR(new);
+		RETURN(ERR_PTR(-ENOMEM));
+	}
+
+	memcpy(new->dtrq_lur, lur, new->dtrq_lur_size);
+
+	/* If the transno in the update record is 0, it means the
+	 * update is from the master MDT, and it will use the master's
+	 * last committed transno as its master transno. Later, if
+	 * the update records are received from slave MDTs, these
+	 * transnos will be replaced.
+	 * See insert_update_records_to_replay_list(). */
+	if (lur->lur_update_rec.ur_master_transno == 0) {
+		new->dtrq_lur->lur_update_rec.ur_master_transno =
+				tdtd->tdtd_lut->lut_obd->obd_last_committed;
+		new->dtrq_master_transno =
+				tdtd->tdtd_lut->lut_obd->obd_last_committed;
+	} else {
+		new->dtrq_master_transno =
+				lur->lur_update_rec.ur_master_transno;
+	}
+
+	new->dtrq_batchid = lur->lur_update_rec.ur_batchid;
+
+	spin_lock_init(&new->dtrq_sub_list_lock);
+	INIT_LIST_HEAD(&new->dtrq_sub_list);
+	INIT_LIST_HEAD(&new->dtrq_list);
+
+	RETURN(new);
+}
+
+/**
+ * Lookup distribute sub replay
+ *
+ * Lookup the distribute sub replay in the sub list of
+ * distribute_txn_replay by mdt_index.
+ *
+ * \param[in] dtrq	the distribute txn replay req to look up in
+ * \param[in] mdt_index	the mdt_index used as the key of the lookup
+ *
+ * \retval		the pointer of the sub replay if it can be found.
+ * \retval		NULL if it cannot be found.
+ */
+struct distribute_txn_replay_req_sub *
+dtrq_sub_lookup(struct distribute_txn_replay_req *dtrq, __u32 mdt_index)
+{
+	struct distribute_txn_replay_req_sub *dtrqs = NULL;
+	struct distribute_txn_replay_req_sub *tmp;
+
+	list_for_each_entry(tmp, &dtrq->dtrq_sub_list, dtrqs_list) {
+		if (tmp->dtrqs_mdt_index == mdt_index) {
+			dtrqs = tmp;
+			break;
+		}
+	}
+	return dtrqs;
+}
+
+/**
+ * Try to add cookie to sub distribute txn request
+ *
+ * Check if the update log cookie has been added to the request; if not,
+ * add it to the dtrqs_cookie_list.
+ *
+ * \param[in] dtrqs	sub replay req where the cookie will be added.
+ * \param[in] cookie	cookie to be added.
+ *
+ * \retval		0 if adding the cookie succeeds.
+ * \retval		negative errno if adding fails.
+ */
+static int dtrq_sub_add_cookie(struct distribute_txn_replay_req_sub *dtrqs,
+			       struct llog_cookie *cookie)
+{
+	struct sub_thandle_cookie *new;
+
+	OBD_ALLOC_PTR(new);
+	if (new == NULL)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&new->stc_list);
+	new->stc_cookie = *cookie;
+	/* Note: only a single thread will access one sub_request each time,
+	 * so no locking is needed here */
+	list_add(&new->stc_list, &dtrqs->dtrqs_cookie_list);
+
+	return 0;
+}
+
+/**
+ * Insert distribute txn sub req replay
+ *
+ * Allocate a sub replay req and insert it into the sub list of the
+ * distribute txn replay req.
+ *
+ * \param[in] dtrq	distribute txn replay req where the sub req will
+ *			be added
+ * \param[in] cookie	the cookie of the update record
+ * \param[in] mdt_index	the mdt_index of the update record
+ *
+ * \retval		0 if the adding succeeds.
+ * \retval		negative errno if the adding fails.
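+ *
+ * The sub list may be filled concurrently by several llog processing
+ * threads, so after allocating a new entry the implementation looks the
+ * mdt_index up again under dtrq_sub_list_lock and frees the new entry
+ * if it lost the race.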
+ */ +static int +dtrq_sub_create_and_insert(struct distribute_txn_replay_req *dtrq, + struct llog_cookie *cookie, + __u32 mdt_index) +{ + struct distribute_txn_replay_req_sub *dtrqs = NULL; + struct distribute_txn_replay_req_sub *new; + int rc; + ENTRY; + + spin_lock(&dtrq->dtrq_sub_list_lock); + dtrqs = dtrq_sub_lookup(dtrq, mdt_index); + spin_unlock(&dtrq->dtrq_sub_list_lock); + if (dtrqs != NULL) { + rc = dtrq_sub_add_cookie(dtrqs, cookie); + RETURN(0); + } + + OBD_ALLOC_PTR(new); + if (new == NULL) + RETURN(-ENOMEM); + + INIT_LIST_HEAD(&new->dtrqs_list); + INIT_LIST_HEAD(&new->dtrqs_cookie_list); + new->dtrqs_mdt_index = mdt_index; + spin_lock(&dtrq->dtrq_sub_list_lock); + dtrqs = dtrq_sub_lookup(dtrq, mdt_index); + if (dtrqs == NULL) { + list_add(&new->dtrqs_list, &dtrq->dtrq_sub_list); + dtrqs = new; + } else { + OBD_FREE_PTR(new); + } + spin_unlock(&dtrq->dtrq_sub_list_lock); + + rc = dtrq_sub_add_cookie(dtrqs, cookie); + + RETURN(rc); +} + +/** + * append updates to the current replay updates + * + * Append more updates to the existent replay update. And this is only + * used when combining mulitple updates into one large updates during + * replay. + * + * \param[in] dtrq the update replay request where the new update + * records will be added. + * \param[in] lur the new update record. + * + * \retval 0 if appending succeeds. + * \retval negative errno if appending fails. + */ +static int dtrq_append_updates(struct distribute_txn_replay_req *dtrq, + struct update_records *record) +{ + struct llog_update_record *new_lur; + size_t lur_size = dtrq->dtrq_lur_size; + void *ptr; + ENTRY; + + /* Because several threads might retrieve the same records from + * different targets, and we only need one copy of records. So + * we will check if the records is in the next one, if not, just + * skip it */ + spin_lock(&dtrq->dtrq_sub_list_lock); + if (dtrq->dtrq_lur->lur_update_rec.ur_index + 1 != record->ur_index) { + spin_unlock(&dtrq->dtrq_sub_list_lock); + RETURN(0); + } + dtrq->dtrq_lur->lur_update_rec.ur_index++; + spin_unlock(&dtrq->dtrq_sub_list_lock); + + lur_size += update_records_size(record); + OBD_ALLOC_LARGE(new_lur, lur_size); + if (new_lur == NULL) { + spin_lock(&dtrq->dtrq_sub_list_lock); + dtrq->dtrq_lur->lur_update_rec.ur_index--; + spin_unlock(&dtrq->dtrq_sub_list_lock); + RETURN(-ENOMEM); + } + + /* Copy the old and new records to the new allocated buffer */ + memcpy(new_lur, dtrq->dtrq_lur, dtrq->dtrq_lur_size); + ptr = (char *)&new_lur->lur_update_rec + + update_records_size(&new_lur->lur_update_rec); + memcpy(ptr, &record->ur_ops, + update_records_size(record) - + offsetof(struct update_records, ur_ops)); + + new_lur->lur_update_rec.ur_update_count += record->ur_update_count; + new_lur->lur_update_rec.ur_param_count += record->ur_param_count; + new_lur->lur_hdr.lrh_len = llog_update_record_size(new_lur); + + /* Replace the records */ + OBD_FREE_LARGE(dtrq->dtrq_lur, dtrq->dtrq_lur_size); + dtrq->dtrq_lur = new_lur; + dtrq->dtrq_lur_size = lur_size; + dtrq->dtrq_lur->lur_update_rec.ur_flags = record->ur_flags; + update_records_dump(&new_lur->lur_update_rec, D_INFO, true); + RETURN(0); +} + +/** + * Insert update records to the replay list. + * + * Allocate distribute txn replay req and insert it into the replay + * list, then insert the update records into the replay req. + * + * \param[in] tdtd distribute txn replay data where the replay list + * is. 
+ * \param[in] record the update record + * \param[in] cookie cookie of the record + * \param[in] index mdt index of the record + * + * \retval 0 if the adding succeeds. + * \retval negative errno if the adding fails. + */ +int +insert_update_records_to_replay_list(struct target_distribute_txn_data *tdtd, + struct llog_update_record *lur, + struct llog_cookie *cookie, + __u32 mdt_index) +{ + struct distribute_txn_replay_req *dtrq; + struct update_records *record = &lur->lur_update_rec; + bool replace_record = false; + int rc = 0; + ENTRY; + + CDEBUG(D_HA, "%s: insert record batchid = %llu transno = %llu" + " mdt_index %u\n", tdtd->tdtd_lut->lut_obd->obd_name, + record->ur_batchid, record->ur_master_transno, mdt_index); + + /* Update batchid if necessary */ + spin_lock(&tdtd->tdtd_batchid_lock); + if (record->ur_batchid >= tdtd->tdtd_batchid) { + CDEBUG(D_HA, "%s update batchid from %llu" " to %llu\n", + tdtd->tdtd_lut->lut_obd->obd_name, + tdtd->tdtd_batchid, record->ur_batchid); + tdtd->tdtd_batchid = record->ur_batchid + 1; + } + spin_unlock(&tdtd->tdtd_batchid_lock); + +again: + spin_lock(&tdtd->tdtd_replay_list_lock); + /* First try to build the replay update request with the records */ + dtrq = dtrq_lookup(tdtd, record->ur_batchid); + if (dtrq == NULL) { + spin_unlock(&tdtd->tdtd_replay_list_lock); + dtrq = dtrq_create(tdtd, lur); + if (IS_ERR(dtrq)) + RETURN(PTR_ERR(dtrq)); + + spin_lock(&tdtd->tdtd_replay_list_lock); + rc = dtrq_insert(tdtd, dtrq); + if (rc < 0) { + spin_unlock(&tdtd->tdtd_replay_list_lock); + dtrq_destroy(dtrq); + if (rc == -EEXIST) + goto again; + return rc; + } + } else { + /* If the master transno in update header is not + * matched with the one in the record, then it means + * the dtrq is originally created by master record, + * so we need update master transno and reposition + * the dtrq(by master transno) in the list and also + * replace update record */ + if (record->ur_master_transno != 0 && + dtrq->dtrq_master_transno != record->ur_master_transno && + dtrq->dtrq_lur != NULL) { + list_del_init(&dtrq->dtrq_list); + dtrq->dtrq_lur->lur_update_rec.ur_master_transno = + record->ur_master_transno; + + dtrq->dtrq_master_transno = record->ur_master_transno; + replace_record = true; + /* try to insert again */ + rc = dtrq_insert(tdtd, dtrq); + if (rc < 0) { + spin_unlock(&tdtd->tdtd_replay_list_lock); + dtrq_destroy(dtrq); + return rc; + } + } + } + spin_unlock(&tdtd->tdtd_replay_list_lock); + + /* Because there should be only thread access the update record, so + * we do not need lock here */ + if (replace_record) { + /* Replace the update record and master transno */ + OBD_FREE_LARGE(dtrq->dtrq_lur, dtrq->dtrq_lur_size); + dtrq->dtrq_lur = NULL; + dtrq->dtrq_lur_size = llog_update_record_size(lur); + OBD_ALLOC_LARGE(dtrq->dtrq_lur, dtrq->dtrq_lur_size); + if (dtrq->dtrq_lur == NULL) + return -ENOMEM; + + memcpy(dtrq->dtrq_lur, lur, dtrq->dtrq_lur_size); + } + + /* This is a partial update records, let's try to append + * the record to the current replay request */ + if (record->ur_flags & UPDATE_RECORD_CONTINUE) + rc = dtrq_append_updates(dtrq, record); + + /* Then create and add sub update request */ + rc = dtrq_sub_create_and_insert(dtrq, cookie, mdt_index); + + RETURN(rc); +} +EXPORT_SYMBOL(insert_update_records_to_replay_list); + +/** + * Dump updates of distribute txns. + * + * Output all of recovery updates in the distribute txn list to the + * debug log. + * + * \param[in] tdtd distribute txn data where all of distribute txn + * are listed. 
+ * \param[in] mask debug mask + */ +void dtrq_list_dump(struct target_distribute_txn_data *tdtd, unsigned int mask) +{ + struct distribute_txn_replay_req *dtrq; + + spin_lock(&tdtd->tdtd_replay_list_lock); + list_for_each_entry(dtrq, &tdtd->tdtd_replay_list, dtrq_list) + update_records_dump(&dtrq->dtrq_lur->lur_update_rec, mask, + false); + spin_unlock(&tdtd->tdtd_replay_list_lock); +} +EXPORT_SYMBOL(dtrq_list_dump); + +/** + * Destroy distribute txn replay req + * + * Destroy distribute txn replay req and all of subs. + * + * \param[in] dtrq distribute txn replqy req to be destroyed. + */ +void dtrq_destroy(struct distribute_txn_replay_req *dtrq) +{ + struct distribute_txn_replay_req_sub *dtrqs; + struct distribute_txn_replay_req_sub *tmp; + + LASSERT(list_empty(&dtrq->dtrq_list)); + CDEBUG(D_HA, "destroy x%llu t%llu\n", dtrq->dtrq_xid, + dtrq->dtrq_master_transno); + spin_lock(&dtrq->dtrq_sub_list_lock); + list_for_each_entry_safe(dtrqs, tmp, &dtrq->dtrq_sub_list, dtrqs_list) { + struct sub_thandle_cookie *stc; + struct sub_thandle_cookie *tmp; + + list_del(&dtrqs->dtrqs_list); + list_for_each_entry_safe(stc, tmp, &dtrqs->dtrqs_cookie_list, + stc_list) { + list_del(&stc->stc_list); + OBD_FREE_PTR(stc); + } + OBD_FREE_PTR(dtrqs); + } + spin_unlock(&dtrq->dtrq_sub_list_lock); + + if (dtrq->dtrq_lur != NULL) + OBD_FREE_LARGE(dtrq->dtrq_lur, dtrq->dtrq_lur_size); + + OBD_FREE_PTR(dtrq); +} +EXPORT_SYMBOL(dtrq_destroy); + +/** + * Destroy all of replay req. + * + * Destroy all of replay req in the replay list. + * + * \param[in] tdtd target distribute txn data where the replay list is. + */ +void dtrq_list_destroy(struct target_distribute_txn_data *tdtd) +{ + struct distribute_txn_replay_req *dtrq; + struct distribute_txn_replay_req *tmp; + + spin_lock(&tdtd->tdtd_replay_list_lock); + list_for_each_entry_safe(dtrq, tmp, &tdtd->tdtd_replay_list, + dtrq_list) { + list_del_init(&dtrq->dtrq_list); + dtrq_destroy(dtrq); + } + list_for_each_entry_safe(dtrq, tmp, &tdtd->tdtd_replay_finish_list, + dtrq_list) { + list_del_init(&dtrq->dtrq_list); + dtrq_destroy(dtrq); + } + spin_unlock(&tdtd->tdtd_replay_list_lock); +} +EXPORT_SYMBOL(dtrq_list_destroy); + +/** + * Get next req in the replay list + * + * Get next req needs to be replayed, since it is a sorted list + * (by master MDT transno) + * + * \param[in] tdtd distribute txn data where the replay list is + * + * \retval the pointer of update recovery header + */ +struct distribute_txn_replay_req * +distribute_txn_get_next_req(struct target_distribute_txn_data *tdtd) +{ + struct distribute_txn_replay_req *dtrq = NULL; + + spin_lock(&tdtd->tdtd_replay_list_lock); + if (!list_empty(&tdtd->tdtd_replay_list)) { + dtrq = list_entry(tdtd->tdtd_replay_list.next, + struct distribute_txn_replay_req, dtrq_list); + list_del_init(&dtrq->dtrq_list); + } + spin_unlock(&tdtd->tdtd_replay_list_lock); + + return dtrq; +} +EXPORT_SYMBOL(distribute_txn_get_next_req); + +/** + * Get next transno in the replay list, because this is the sorted + * list, so it will return the transno of next req in the list. 
+ * + * \param[in] tdtd distribute txn data where the replay list is + * + * \retval the transno of next update in the list + */ +__u64 distribute_txn_get_next_transno(struct target_distribute_txn_data *tdtd) +{ + struct distribute_txn_replay_req *dtrq = NULL; + __u64 transno = 0; + + spin_lock(&tdtd->tdtd_replay_list_lock); + if (!list_empty(&tdtd->tdtd_replay_list)) { + dtrq = list_entry(tdtd->tdtd_replay_list.next, + struct distribute_txn_replay_req, dtrq_list); + transno = dtrq->dtrq_master_transno; + } + spin_unlock(&tdtd->tdtd_replay_list_lock); + + CDEBUG(D_HA, "%s: Next update transno %llu\n", + tdtd->tdtd_lut->lut_obd->obd_name, transno); + return transno; +} +EXPORT_SYMBOL(distribute_txn_get_next_transno); + +struct distribute_txn_replay_req * +distribute_txn_lookup_finish_list(struct target_distribute_txn_data *tdtd, + __u64 transno) +{ + struct distribute_txn_replay_req *dtrq = NULL; + struct distribute_txn_replay_req *iter; + + spin_lock(&tdtd->tdtd_replay_list_lock); + list_for_each_entry(iter, &tdtd->tdtd_replay_finish_list, dtrq_list) { + if (iter->dtrq_master_transno == transno) { + dtrq = iter; + break; + } + } + spin_unlock(&tdtd->tdtd_replay_list_lock); + return dtrq; +} + +bool is_req_replayed_by_update(struct ptlrpc_request *req) +{ + struct lu_target *tgt = class_exp2tgt(req->rq_export); + struct distribute_txn_replay_req *dtrq; + + if (tgt->lut_tdtd == NULL) + return false; + + dtrq = distribute_txn_lookup_finish_list(tgt->lut_tdtd, + lustre_msg_get_transno(req->rq_reqmsg)); + if (dtrq == NULL) + return false; + + return true; +} +EXPORT_SYMBOL(is_req_replayed_by_update); + +/** + * Check if the update of one object is committed + * + * Check whether the update for the object is committed by checking whether + * the correspondent sub exists in the replay req. If it is committed, mark + * the committed flag in correspondent the sub thandle. + * + * \param[in] env execution environment + * \param[in] dtrq replay request + * \param[in] dt_obj object for the update + * \param[in] top_th top thandle + * \param[in] sub_th sub thandle which the update belongs to + * + * \retval 1 if the update is not committed. + * \retval 0 if the update is committed. + * \retval negative errno if some other failures happen. 
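+ *
+ * The MDT index of the update is derived from the object FID: update
+ * log FIDs encode the index directly, FIDs whose sequence is not in the
+ * FLDB use the local ss_node_id, and all other FIDs are resolved by an
+ * FLD lookup.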
+ */ +static int update_is_committed(const struct lu_env *env, + struct distribute_txn_replay_req *dtrq, + struct dt_object *dt_obj, + struct top_thandle *top_th, + struct sub_thandle *st) +{ + struct seq_server_site *seq_site; + const struct lu_fid *fid = lu_object_fid(&dt_obj->do_lu); + struct distribute_txn_replay_req_sub *dtrqs; + __u32 mdt_index; + ENTRY; + + if (st->st_sub_th != NULL) + RETURN(1); + + if (st->st_committed) + RETURN(0); + + seq_site = lu_site2seq(dt_obj->do_lu.lo_dev->ld_site); + if (fid_is_update_log(fid) || fid_is_update_log_dir(fid)) { + mdt_index = fid_oid(fid); + } else if (!fid_seq_in_fldb(fid_seq(fid))) { + mdt_index = seq_site->ss_node_id; + } else { + struct lu_server_fld *fld; + struct lu_seq_range range = {0}; + int rc; + + fld = seq_site->ss_server_fld; + fld_range_set_type(&range, LU_SEQ_RANGE_MDT); + LASSERT(fld->lsf_seq_lookup != NULL); + rc = fld->lsf_seq_lookup(env, fld, fid_seq(fid), + &range); + if (rc < 0) + RETURN(rc); + mdt_index = range.lsr_index; + } + + dtrqs = dtrq_sub_lookup(dtrq, mdt_index); + if (dtrqs != NULL || top_th->tt_multiple_thandle->tmt_committed) { + st->st_committed = 1; + if (dtrqs != NULL) { + struct sub_thandle_cookie *stc; + struct sub_thandle_cookie *tmp; + + list_for_each_entry_safe(stc, tmp, + &dtrqs->dtrqs_cookie_list, + stc_list) + list_move(&stc->stc_list, &st->st_cookie_list); + } + RETURN(0); + } + + CDEBUG(D_HA, "Update of "DFID "on MDT%u is not committed\n", PFID(fid), + mdt_index); + + RETURN(1); +} + +/** + * Implementation of different update methods for update recovery. + * + * These following functions update_recovery_$(update_name) implement + * different updates recovery methods. They will extract the parameters + * from the common parameters area and call correspondent dt API to redo + * the update. + * + * \param[in] env execution environment + * \param[in] op update operation to be replayed + * \param[in] params common update parameters which holds all parameters + * of the operation + * \param[in] th transaction handle + * \param[in] declare indicate it will do declare or real execution, true + * means declare, false means real execution + * + * \retval 0 if it succeeds. + * \retval negative errno if it fails. 
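+ *
+ * These handlers are dispatched from update_recovery_exec() according
+ * to op->uop_type; see the switch statement there.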
+ */ +static int update_recovery_create(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + struct update_thread_info *uti = update_env_info(env); + struct llog_update_record *lur = uti->uti_dtrq->dtrq_lur; + struct lu_attr *attr = &uti->uti_attr; + struct obdo *wobdo; + struct obdo *lobdo = &uti->uti_obdo; + struct dt_object_format dof; + __u16 size; + unsigned int param_count; + int rc; + ENTRY; + + if (dt_object_exists(dt_obj)) + RETURN(-EEXIST); + + param_count = lur->lur_update_rec.ur_param_count; + wobdo = update_params_get_param_buf(params, op->uop_params_off[0], + param_count, &size); + if (wobdo == NULL) + RETURN(-EIO); + if (size != sizeof(*wobdo)) + RETURN(-EIO); + + if (LLOG_REC_HDR_NEEDS_SWABBING(&lur->lur_hdr)) + lustre_swab_obdo(wobdo); + + lustre_get_wire_obdo(NULL, lobdo, wobdo); + la_from_obdo(attr, lobdo, lobdo->o_valid); + + dof.dof_type = dt_mode_to_dft(attr->la_mode); + + rc = out_tx_create(env, dt_obj, attr, NULL, &dof, + ta, th, NULL, 0); + + RETURN(rc); +} + +static int update_recovery_destroy(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + int rc; + ENTRY; + + rc = out_tx_destroy(env, dt_obj, ta, th, NULL, 0); + + RETURN(rc); +} + +static int update_recovery_ref_add(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + int rc; + ENTRY; + + rc = out_tx_ref_add(env, dt_obj, ta, th, NULL, 0); + + RETURN(rc); +} + +static int update_recovery_ref_del(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + int rc; + ENTRY; + + rc = out_tx_ref_del(env, dt_obj, ta, th, NULL, 0); + + RETURN(rc); +} + +static int update_recovery_attr_set(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + struct update_thread_info *uti = update_env_info(env); + struct llog_update_record *lur = uti->uti_dtrq->dtrq_lur; + struct obdo *wobdo; + struct obdo *lobdo = &uti->uti_obdo; + struct lu_attr *attr = &uti->uti_attr; + __u16 size; + unsigned int param_count; + int rc; + ENTRY; + + param_count = lur->lur_update_rec.ur_param_count; + wobdo = update_params_get_param_buf(params, op->uop_params_off[0], + param_count, &size); + if (wobdo == NULL) + RETURN(-EIO); + if (size != sizeof(*wobdo)) + RETURN(-EIO); + + if (LLOG_REC_HDR_NEEDS_SWABBING(&lur->lur_hdr)) + lustre_swab_obdo(wobdo); + + lustre_get_wire_obdo(NULL, lobdo, wobdo); + la_from_obdo(attr, lobdo, lobdo->o_valid); + + rc = out_tx_attr_set(env, dt_obj, attr, ta, th, NULL, 0); + + RETURN(rc); +} + +static int update_recovery_xattr_set(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + struct update_thread_info *uti = update_env_info(env); + char *buf; + char *name; + int fl; + __u16 size; + __u32 param_count; + int rc; + ENTRY; + + param_count = uti->uti_dtrq->dtrq_lur->lur_update_rec.ur_param_count; + name = update_params_get_param_buf(params, + op->uop_params_off[0], + 
param_count, &size); + if (name == NULL) + RETURN(-EIO); + + buf = update_params_get_param_buf(params, + op->uop_params_off[1], + param_count, &size); + if (buf == NULL) + RETURN(-EIO); + + uti->uti_buf.lb_buf = buf; + uti->uti_buf.lb_len = (size_t)size; + + buf = update_params_get_param_buf(params, op->uop_params_off[2], + param_count, &size); + if (buf == NULL) + RETURN(-EIO); + if (size != sizeof(fl)) + RETURN(-EIO); + + fl = le32_to_cpu(*(int *)buf); + + rc = out_tx_xattr_set(env, dt_obj, &uti->uti_buf, name, fl, ta, th, + NULL, 0); + + RETURN(rc); +} + +static int update_recovery_index_insert(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + struct update_thread_info *uti = update_env_info(env); + struct lu_fid *fid; + char *name; + __u32 param_count; + __u32 *ptype; + __u32 type; + __u16 size; + int rc; + ENTRY; + + param_count = uti->uti_dtrq->dtrq_lur->lur_update_rec.ur_param_count; + name = update_params_get_param_buf(params, op->uop_params_off[0], + param_count, &size); + if (name == NULL) + RETURN(-EIO); + + fid = update_params_get_param_buf(params, op->uop_params_off[1], + param_count, &size); + if (fid == NULL) + RETURN(-EIO); + if (size != sizeof(*fid)) + RETURN(-EIO); + + fid_le_to_cpu(fid, fid); + + ptype = update_params_get_param_buf(params, op->uop_params_off[2], + param_count, &size); + if (ptype == NULL) + RETURN(-EIO); + if (size != sizeof(*ptype)) + RETURN(-EIO); + type = le32_to_cpu(*ptype); + + if (dt_try_as_dir(env, dt_obj) == 0) + RETURN(-ENOTDIR); + + uti->uti_rec.rec_fid = fid; + uti->uti_rec.rec_type = type; + + rc = out_tx_index_insert(env, dt_obj, + (const struct dt_rec *)&uti->uti_rec, + (const struct dt_key *)name, ta, th, + NULL, 0); + + RETURN(rc); +} + +static int update_recovery_index_delete(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + struct update_thread_info *uti = update_env_info(env); + __u32 param_count; + char *name; + __u16 size; + int rc; + ENTRY; + + param_count = uti->uti_dtrq->dtrq_lur->lur_update_rec.ur_param_count; + name = update_params_get_param_buf(params, op->uop_params_off[0], + param_count, &size); + if (name == NULL) + RETURN(-EIO); + + if (dt_try_as_dir(env, dt_obj) == 0) + RETURN(-ENOTDIR); + + rc = out_tx_index_delete(env, dt_obj, + (const struct dt_key *)name, ta, th, NULL, 0); + + RETURN(rc); +} + +static int update_recovery_write(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + struct update_thread_info *uti = update_env_info(env); + char *buf; + __u32 param_count; + __u64 pos; + __u16 size; + int rc; + ENTRY; + + param_count = uti->uti_dtrq->dtrq_lur->lur_update_rec.ur_param_count; + buf = update_params_get_param_buf(params, op->uop_params_off[0], + param_count, &size); + if (buf == NULL) + RETURN(-EIO); + + uti->uti_buf.lb_buf = buf; + uti->uti_buf.lb_len = size; + + buf = update_params_get_param_buf(params, op->uop_params_off[1], + param_count, &size); + if (buf == NULL) + RETURN(-EIO); + + pos = le64_to_cpu(*(__u64 *)buf); + + rc = out_tx_write(env, dt_obj, &uti->uti_buf, pos, + ta, th, NULL, 0); + + RETURN(rc); +} + +static int update_recovery_xattr_del(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op 
*op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + struct update_thread_info *uti = update_env_info(env); + __u32 param_count; + char *name; + __u16 size; + int rc; + ENTRY; + + param_count = uti->uti_dtrq->dtrq_lur->lur_update_rec.ur_param_count; + name = update_params_get_param_buf(params, op->uop_params_off[0], + param_count, &size); + if (name == NULL) + RETURN(-EIO); + + rc = out_tx_xattr_del(env, dt_obj, name, ta, th, NULL, 0); + + RETURN(rc); +} + +/** + * Update session information + * + * Update session information so tgt_txn_stop_cb()->tgt_last_rcvd_update() + * can be called correctly during update replay. + * + * \param[in] env execution environment. + * \param[in] tdtd distribute data structure of the recovering tgt. + * \param[in] th thandle of this update replay. + * \param[in] master_th master sub thandle. + * \param[in] ta_arg the tx arg structure to hold the update for updating + * reply data. + */ +static void update_recovery_update_ses(struct lu_env *env, + struct target_distribute_txn_data *tdtd, + struct thandle *th, + struct thandle *master_th, + struct distribute_txn_replay_req *dtrq, + struct tx_arg *ta_arg) +{ + struct tgt_session_info *tsi; + struct lu_target *lut = tdtd->tdtd_lut; + struct obd_export *export; + struct cfs_hash *hash; + struct top_thandle *top_th; + struct lsd_reply_data *lrd; + size_t size; + + tsi = tgt_ses_info(env); + if (tsi->tsi_exp != NULL) + return; + + size = ta_arg->u.write.buf.lb_len; + lrd = ta_arg->u.write.buf.lb_buf; + if (size != sizeof(*lrd) || lrd == NULL) + return; + + lrd->lrd_transno = le64_to_cpu(lrd->lrd_transno); + lrd->lrd_xid = le64_to_cpu(lrd->lrd_xid); + lrd->lrd_data = le64_to_cpu(lrd->lrd_data); + lrd->lrd_result = le32_to_cpu(lrd->lrd_result); + lrd->lrd_client_gen = le32_to_cpu(lrd->lrd_client_gen); + + CDEBUG(D_HA, "xid=%llu transno=%llu\n", lrd->lrd_xid, lrd->lrd_transno); + if (lrd->lrd_transno != tgt_th_info(env)->tti_transno) + return; + + hash = cfs_hash_getref(lut->lut_obd->obd_gen_hash); + if (hash == NULL) + return; + + export = cfs_hash_lookup(hash, &lrd->lrd_client_gen); + if (export == NULL) { + cfs_hash_putref(hash); + return; + } + + tsi->tsi_exp = export; + tsi->tsi_xid = lrd->lrd_xid; + tsi->tsi_opdata = lrd->lrd_data; + tsi->tsi_result = lrd->lrd_result; + tsi->tsi_client_gen = lrd->lrd_client_gen; + dtrq->dtrq_xid = lrd->lrd_xid; + top_th = container_of(th, struct top_thandle, tt_super); + top_th->tt_master_sub_thandle = master_th; + cfs_hash_putref(hash); +} + +/** + * Execute updates in the update replay records + * + * Declare distribute txn replay by update records and add the updates + * to the execution list. Note: it will check if the update has been + * committed, and only execute the updates if it is not committed to + * disk. + * + * \param[in] env execution environment + * \param[in] tdtd distribute txn replay data which hold all of replay + * reqs and all replay parameters. + * \param[in] dtrq distribute transaction replay req. + * \param[in] ta thandle execute args. + * + * \retval 0 if declare succeeds. + * \retval negative errno if declare fails. 
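+ *
+ * OUT_NOOP updates are skipped outright, and updates whose target
+ * sub-transaction is already committed (see update_is_committed()) are
+ * not executed again.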
+ */ +static int update_recovery_exec(const struct lu_env *env, + struct target_distribute_txn_data *tdtd, + struct distribute_txn_replay_req *dtrq, + struct thandle_exec_args *ta) +{ + struct llog_update_record *lur = dtrq->dtrq_lur; + struct update_records *records = &lur->lur_update_rec; + struct update_ops *ops = &records->ur_ops; + struct update_params *params = update_records_get_params(records); + struct top_thandle *top_th = container_of(ta->ta_handle, + struct top_thandle, + tt_super); + struct top_multiple_thandle *tmt = top_th->tt_multiple_thandle; + struct update_op *op; + unsigned int i; + int rc = 0; + ENTRY; + + /* These records have been swabbed in llog_cat_process() */ + for (i = 0, op = &ops->uops_op[0]; i < records->ur_update_count; + i++, op = update_op_next_op(op)) { + struct lu_fid *fid = &op->uop_fid; + struct dt_object *dt_obj; + struct dt_object *sub_dt_obj; + struct dt_device *sub_dt; + struct sub_thandle *st; + + if (op->uop_type == OUT_NOOP) + continue; + + dt_obj = dt_locate(env, tdtd->tdtd_dt, fid); + if (IS_ERR(dt_obj)) { + rc = PTR_ERR(dt_obj); + if (rc == -EREMCHG) + LCONSOLE_WARN("%.16s: hit invalid OI mapping " + "for "DFID" during recovering, " + "that may because auto scrub is " + "disabled on related MDT, and " + "will cause recovery failure. " + "Please enable auto scrub and " + "retry the recovery.\n", + tdtd->tdtd_lut->lut_obd->obd_name, + PFID(fid)); + + break; + } + sub_dt_obj = dt_object_child(dt_obj); + + /* Create sub thandle if not */ + sub_dt = lu2dt_dev(sub_dt_obj->do_lu.lo_dev); + st = lookup_sub_thandle(tmt, sub_dt); + if (st == NULL) { + st = create_sub_thandle(tmt, sub_dt); + if (IS_ERR(st)) + GOTO(next, rc = PTR_ERR(st)); + } + + /* check if updates on the OSD/OSP are committed */ + rc = update_is_committed(env, dtrq, dt_obj, top_th, st); + if (rc == 0) + /* If this is committed, goto next */ + goto next; + + if (rc < 0) + GOTO(next, rc); + + /* Create thandle for sub thandle if needed */ + if (st->st_sub_th == NULL) { + rc = sub_thandle_trans_create(env, top_th, st); + if (rc != 0) + GOTO(next, rc); + } + + CDEBUG(D_HA, "replay %uth update\n", i); + switch (op->uop_type) { + case OUT_CREATE: + rc = update_recovery_create(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_DESTROY: + rc = update_recovery_destroy(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_REF_ADD: + rc = update_recovery_ref_add(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_REF_DEL: + rc = update_recovery_ref_del(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_ATTR_SET: + rc = update_recovery_attr_set(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_XATTR_SET: + rc = update_recovery_xattr_set(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_INDEX_INSERT: + rc = update_recovery_index_insert(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_INDEX_DELETE: + rc = update_recovery_index_delete(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_WRITE: + rc = update_recovery_write(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_XATTR_DEL: + rc = update_recovery_xattr_del(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + default: + CERROR("Unknown update type %u\n", (__u32)op->uop_type); + rc = -EINVAL; + break; + } +next: + dt_object_put(env, dt_obj); + if (rc < 0) + break; + } + + ta->ta_handle->th_result = rc; + RETURN(rc); +} + +/** + * redo 
updates on MDT if needed. + * + * During DNE recovery, the recovery thread (target_recovery_thread) will call + * this function to replay distribute txn updates on all MDTs. It only replay + * updates on the MDT where the update record is missing. + * + * If the update already exists on the MDT, then it does not need replay the + * updates on that MDT, and only mark the sub transaction has been committed + * there. + * + * \param[in] env execution environment + * \param[in] tdtd target distribute txn data, which holds the replay list + * and all parameters needed by replay process. + * \param[in] dtrq distribute txn replay req. + * + * \retval 0 if replay succeeds. + * \retval negative errno if replay failes. + */ +int distribute_txn_replay_handle(struct lu_env *env, + struct target_distribute_txn_data *tdtd, + struct distribute_txn_replay_req *dtrq) +{ + struct update_records *records = &dtrq->dtrq_lur->lur_update_rec; + struct thandle_exec_args *ta; + struct lu_context session_env; + struct thandle *th = NULL; + struct top_thandle *top_th; + struct top_multiple_thandle *tmt; + struct thandle_update_records *tur = NULL; + int i; + int rc = 0; + ENTRY; + + /* initialize session, it is needed for the handler of target */ + rc = lu_context_init(&session_env, LCT_SERVER_SESSION | LCT_NOREF); + if (rc) { + CERROR("%s: failure to initialize session: rc = %d\n", + tdtd->tdtd_lut->lut_obd->obd_name, rc); + RETURN(rc); + } + lu_context_enter(&session_env); + env->le_ses = &session_env; + lu_env_refill(env); + update_records_dump(records, D_HA, true); + th = top_trans_create(env, NULL); + if (IS_ERR(th)) + GOTO(exit_session, rc = PTR_ERR(th)); + + ta = &update_env_info(env)->uti_tea; + ta->ta_argno = 0; + + update_env_info(env)->uti_dtrq = dtrq; + /* Create distribute transaction structure for this top thandle */ + top_th = container_of(th, struct top_thandle, tt_super); + rc = top_trans_create_tmt(env, top_th); + if (rc < 0) + GOTO(stop_trans, rc); + + th->th_dev = tdtd->tdtd_dt; + ta->ta_handle = th; + + /* check if the distribute transaction has been committed */ + tmt = top_th->tt_multiple_thandle; + tmt->tmt_master_sub_dt = tdtd->tdtd_lut->lut_bottom; + tmt->tmt_batchid = dtrq->dtrq_batchid; + tgt_th_info(env)->tti_transno = dtrq->dtrq_master_transno; + + if (tmt->tmt_batchid <= tdtd->tdtd_committed_batchid) + tmt->tmt_committed = 1; + + rc = update_recovery_exec(env, tdtd, dtrq, ta); + if (rc < 0) + GOTO(stop_trans, rc); + + /* If no updates are needed to be replayed, then mark this records as + * committed, so commit thread distribute_txn_commit_thread() will + * delete the record */ + if (ta->ta_argno == 0) + tmt->tmt_committed = 1; + + tur = &update_env_info(env)->uti_tur; + tur->tur_update_records = dtrq->dtrq_lur; + tur->tur_update_records_buf_size = dtrq->dtrq_lur_size; + tur->tur_update_params = NULL; + tur->tur_update_param_count = 0; + tmt->tmt_update_records = tur; + + distribute_txn_insert_by_batchid(tmt); + rc = top_trans_start(env, NULL, th); + if (rc < 0) + GOTO(stop_trans, rc); + + for (i = 0; i < ta->ta_argno; i++) { + struct tx_arg *ta_arg; + struct dt_object *dt_obj; + struct dt_device *sub_dt; + struct sub_thandle *st; + + ta_arg = ta->ta_args[i]; + dt_obj = ta_arg->object; + + LASSERT(tmt->tmt_committed == 0); + sub_dt = lu2dt_dev(dt_obj->do_lu.lo_dev); + st = lookup_sub_thandle(tmt, sub_dt); + + LASSERT(st != NULL); + LASSERT(st->st_sub_th != NULL); + rc = ta->ta_args[i]->exec_fn(env, st->st_sub_th, + ta->ta_args[i]); + + /* If the update is to update the reply data, then 
+ * we need set the session information, so + * tgt_last_rcvd_update() can be called correctly */ + if (rc == 0 && dt_obj == tdtd->tdtd_lut->lut_reply_data) + update_recovery_update_ses(env, tdtd, th, + st->st_sub_th, dtrq, ta_arg); + + if (unlikely(rc < 0)) { + CDEBUG(D_HA, "error during execution of #%u from" + " %s:%d: rc = %d\n", i, ta->ta_args[i]->file, + ta->ta_args[i]->line, rc); + while (--i > 0) { + if (ta->ta_args[i]->undo_fn != NULL) { + dt_obj = ta->ta_args[i]->object; + sub_dt = + lu2dt_dev(dt_obj->do_lu.lo_dev); + st = lookup_sub_thandle(tmt, sub_dt); + LASSERT(st != NULL); + LASSERT(st->st_sub_th != NULL); + + ta->ta_args[i]->undo_fn(env, + st->st_sub_th, + ta->ta_args[i]); + } else { + CERROR("%s: undo for %s:%d: rc = %d\n", + dt_obd_name(ta->ta_handle->th_dev), + ta->ta_args[i]->file, + ta->ta_args[i]->line, -ENOTSUPP); + } + } + break; + } + CDEBUG(D_HA, "%s: executed %u/%u: rc = %d\n", + dt_obd_name(sub_dt), i, ta->ta_argno, rc); + } + +stop_trans: + if (rc < 0) + th->th_result = rc; + rc = top_trans_stop(env, tdtd->tdtd_dt, th); + for (i = 0; i < ta->ta_argno; i++) { + if (ta->ta_args[i]->object != NULL) { + dt_object_put(env, ta->ta_args[i]->object); + ta->ta_args[i]->object = NULL; + } + } + + if (tur != NULL) + tur->tur_update_records = NULL; + + if (tgt_ses_info(env)->tsi_exp != NULL) { + class_export_put(tgt_ses_info(env)->tsi_exp); + tgt_ses_info(env)->tsi_exp = NULL; + } +exit_session: + lu_context_exit(&session_env); + lu_context_fini(&session_env); + RETURN(rc); +} +EXPORT_SYMBOL(distribute_txn_replay_handle); diff --git a/drivers/staging/lustrefsx/lustre/target/update_trans.c b/drivers/staging/lustrefsx/lustre/target/update_trans.c new file mode 100644 index 0000000000000..aa13f9433d3e5 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/update_trans.c @@ -0,0 +1,1743 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2015, 2017, Intel Corporation. + */ +/* + * lustre/target/update_trans.c + * + * This file implements the update distribute transaction API. + * + * To manage the cross-MDT operation (distribute operation) transaction, + * the transaction will also be separated two layers on MD stack, top + * transaction and sub transaction. + * + * During the distribute operation, top transaction is created in the LOD + * layer, and represent the operation. Sub transaction is created by + * each OSD or OSP. Top transaction start/stop will trigger all of its sub + * transaction start/stop. Top transaction (the whole operation) is committed + * only all of its sub transaction are committed. + * + * there are three kinds of transactions + * 1. local transaction: All updates are in a single local OSD. + * 2. 
Remote transaction: All updates are only in the remote OSD,
+ *    i.e. locally all updates are in OSP.
+ * 3. Mixed transaction: Updates are both in local OSD and remote
+ *    OSD.
+ *
+ * Author: Di Wang
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+/**
+ * Dump top multiple thandle
+ *
+ * Dump top multiple thandle and all of its sub thandles to the debug log.
+ *
+ * \param[in] tmt	top_multiple_thandle to be dumped
+ * \param[in] mask	debug mask
+ */
+static void top_multiple_thandle_dump(struct top_multiple_thandle *tmt,
+				      __u32 mask)
+{
+	struct sub_thandle *st;
+
+	LASSERT(tmt->tmt_magic == TOP_THANDLE_MAGIC);
+	CDEBUG(mask, "%s tmt %p refcount %d committed %d result %d batchid %llu\n",
+	       tmt->tmt_master_sub_dt ?
+	       tmt->tmt_master_sub_dt->dd_lu_dev.ld_obd->obd_name :
+	       "NULL",
+	       tmt, atomic_read(&tmt->tmt_refcount), tmt->tmt_committed,
+	       tmt->tmt_result, tmt->tmt_batchid);
+
+	list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) {
+		struct sub_thandle_cookie *stc;
+
+		CDEBUG(mask, "st %p obd %s committed %d started %d stopped %d "
+		       "result %d sub_th %p\n",
+		       st, st->st_dt->dd_lu_dev.ld_obd->obd_name,
+		       st->st_committed, st->st_started, st->st_stopped,
+		       st->st_result, st->st_sub_th);
+
+		list_for_each_entry(stc, &st->st_cookie_list, stc_list) {
+			CDEBUG(mask, " cookie "DFID".%u\n",
+			       PFID(&stc->stc_cookie.lgc_lgl.lgl_oi.oi_fid),
+			       stc->stc_cookie.lgc_index);
+		}
+	}
+}
+
+/**
+ * Declare write update to sub device
+ *
+ * Declare writing of update llog records to the sub device during
+ * distribute transaction.
+ *
+ * \param[in] env	execution environment
+ * \param[in] record	update records being written
+ * \param[in] sub_th	sub transaction handle
+ * \param[in] record_size	total update record size
+ *
+ * \retval		0 if writing succeeds
+ * \retval		negative errno if writing fails
+ */
+static int sub_declare_updates_write(const struct lu_env *env,
+				     struct llog_update_record *record,
+				     struct thandle *sub_th, size_t record_size)
+{
+	struct llog_ctxt *ctxt;
+	struct dt_device *dt = sub_th->th_dev;
+	int left = record_size;
+	int rc;
+
+	/* If ctxt is NULL, it means there is no need to write updates,
+	 * for example if the OSP is used to connect to OST */
+	ctxt = llog_get_context(dt->dd_lu_dev.ld_obd,
+				LLOG_UPDATELOG_ORIG_CTXT);
+
+	/* Not ready to record updates yet. */
+	if (ctxt == NULL || ctxt->loc_handle == NULL) {
+		llog_ctxt_put(ctxt);
+		return 0;
+	}
+
+	rc = llog_declare_add(env, ctxt->loc_handle,
+			      &record->lur_hdr, sub_th);
+	if (rc < 0)
+		GOTO(out_put, rc);
+
+	while (left > ctxt->loc_chunk_size) {
+		rc = llog_declare_add(env, ctxt->loc_handle,
+				      &record->lur_hdr, sub_th);
+		if (rc < 0)
+			GOTO(out_put, rc);
+
+		left -= ctxt->loc_chunk_size;
+	}
+
+out_put:
+	llog_ctxt_put(ctxt);
+
+	return rc;
+}
+
+/**
+ * Write update to sub device
+ *
+ * Write llog update record to the sub device during distribute
+ * transaction. If it succeeds, llog cookie of the record will be
+ * returned by @cookie.
+ *
+ * \param[in] env	execution environment
+ * \param[in] record	update records being written
+ * \param[in] sub_th	sub transaction handle
+ * \param[out] cookie	llog cookie of the update record.
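+ *
+ * Chunking rule in brief (hedged restatement of the checks in the body):
+ * a record is written as a single llog record only when the remainder of
+ * the chunk can still hold minimal padding, i.e.
+ *
+ *	reclen + LLOG_MIN_REC_SIZE <= ctxt->loc_chunk_size ||
+ *	reclen == ctxt->loc_chunk_size
+ *
+ * otherwise the record is split across several chunk-sized records below.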
+ * + * \retval 1 if writing succeeds + * \retval negative errno if writing fails + */ +static int sub_updates_write(const struct lu_env *env, + struct llog_update_record *record, + struct sub_thandle *sub_th) +{ + struct dt_device *dt = sub_th->st_dt; + struct llog_ctxt *ctxt; + struct llog_update_record *lur = NULL; + __u32 update_count = 0; + __u32 param_count = 0; + __u32 last_update_count = 0; + __u32 last_param_count = 0; + char *start; + char *cur; + char *next; + struct sub_thandle_cookie *stc; + size_t reclen; + bool eof = false; + int rc; + ENTRY; + + ctxt = llog_get_context(dt->dd_lu_dev.ld_obd, + LLOG_UPDATELOG_ORIG_CTXT); + /* If ctxt == NULL, then it means updates on OST (only happens + * during migration), and we do not track those updates for now */ + /* If ctxt->loc_handle == NULL, then it does not need to record + * update, usually happens in error handler path */ + if (ctxt == NULL || ctxt->loc_handle == NULL) { + llog_ctxt_put(ctxt); + RETURN(0); + } + + /* Since the cross-MDT updates will includes both local + * and remote updates, the update ops count must > 1 */ + LASSERT(record->lur_update_rec.ur_update_count > 1); + LASSERTF(record->lur_hdr.lrh_len == llog_update_record_size(record), + "lrh_len %u record_size %zu\n", record->lur_hdr.lrh_len, + llog_update_record_size(record)); + + /* + * If its size > llog chunk_size, then write current chunk to the update + * llog, NB the padding should >= LLOG_MIN_REC_SIZE. + * + * So check padding length is either >= LLOG_MIN_REC_SIZE or is 0 + * (record length just matches the chunk size). + */ + + reclen = record->lur_hdr.lrh_len; + if (reclen + LLOG_MIN_REC_SIZE <= ctxt->loc_chunk_size || + reclen == ctxt->loc_chunk_size) { + OBD_ALLOC_PTR(stc); + if (stc == NULL) + GOTO(llog_put, rc = -ENOMEM); + INIT_LIST_HEAD(&stc->stc_list); + + rc = llog_add(env, ctxt->loc_handle, &record->lur_hdr, + &stc->stc_cookie, sub_th->st_sub_th); + + CDEBUG(D_INFO, "%s: Add update log "DFID".%u: rc = %d\n", + dt->dd_lu_dev.ld_obd->obd_name, + PFID(&stc->stc_cookie.lgc_lgl.lgl_oi.oi_fid), + stc->stc_cookie.lgc_index, rc); + + if (rc > 0) { + list_add(&stc->stc_list, &sub_th->st_cookie_list); + rc = 0; + } else { + OBD_FREE_PTR(stc); + } + + GOTO(llog_put, rc); + } + + /* Split the records into chunk_size update record */ + OBD_ALLOC_LARGE(lur, ctxt->loc_chunk_size); + if (lur == NULL) + GOTO(llog_put, rc = -ENOMEM); + + memcpy(lur, &record->lur_hdr, sizeof(record->lur_hdr)); + lur->lur_update_rec.ur_update_count = 0; + lur->lur_update_rec.ur_param_count = 0; + start = (char *)&record->lur_update_rec.ur_ops; + cur = next = start; + do { + if (update_count < record->lur_update_rec.ur_update_count) + next = (char *)update_op_next_op( + (struct update_op *)cur); + else if (param_count < record->lur_update_rec.ur_param_count) + next = (char *)update_param_next_param( + (struct update_param *)cur); + else + eof = true; + + reclen = __llog_update_record_size( + __update_records_size(next - start)); + if ((reclen + LLOG_MIN_REC_SIZE <= ctxt->loc_chunk_size || + reclen == ctxt->loc_chunk_size) && + !eof) { + cur = next; + + if (update_count < + record->lur_update_rec.ur_update_count) + update_count++; + else if (param_count < + record->lur_update_rec.ur_param_count) + param_count++; + continue; + } + + lur->lur_update_rec.ur_update_count = update_count - + last_update_count; + lur->lur_update_rec.ur_param_count = param_count - + last_param_count; + memcpy(&lur->lur_update_rec.ur_ops, start, cur - start); + lur->lur_hdr.lrh_len = 
llog_update_record_size(lur);
+
+		LASSERT(lur->lur_hdr.lrh_len ==
+			__llog_update_record_size(
+				__update_records_size(cur - start)));
+		LASSERT(lur->lur_hdr.lrh_len <= ctxt->loc_chunk_size);
+
+		update_records_dump(&lur->lur_update_rec, D_INFO, true);
+
+		OBD_ALLOC_PTR(stc);
+		if (stc == NULL)
+			GOTO(llog_put, rc = -ENOMEM);
+		INIT_LIST_HEAD(&stc->stc_list);
+
+		rc = llog_add(env, ctxt->loc_handle, &lur->lur_hdr,
+			      &stc->stc_cookie, sub_th->st_sub_th);
+
+		CDEBUG(D_INFO, "%s: Add update log "DFID".%u: rc = %d\n",
+		       dt->dd_lu_dev.ld_obd->obd_name,
+		       PFID(&stc->stc_cookie.lgc_lgl.lgl_oi.oi_fid),
+		       stc->stc_cookie.lgc_index, rc);
+
+		if (rc > 0) {
+			list_add(&stc->stc_list, &sub_th->st_cookie_list);
+			rc = 0;
+		} else {
+			OBD_FREE_PTR(stc);
+			GOTO(llog_put, rc);
+		}
+
+		last_update_count = update_count;
+		last_param_count = param_count;
+		start = cur;
+		lur->lur_update_rec.ur_update_count = 0;
+		lur->lur_update_rec.ur_param_count = 0;
+		lur->lur_update_rec.ur_flags |= UPDATE_RECORD_CONTINUE;
+	} while (!eof);
+
+llog_put:
+	if (lur != NULL)
+		OBD_FREE_LARGE(lur, ctxt->loc_chunk_size);
+	llog_ctxt_put(ctxt);
+
+	RETURN(rc);
+}
+
+/**
+ * Prepare the update records.
+ *
+ * Merge params and ops into the update records, then initialize
+ * the update buffer.
+ *
+ * During the transaction execution phase, parameters and update ops
+ * are collected in two different buffers (see lod_updates_pack()); at
+ * transaction stop they need to be merged into one buffer, so that
+ * buffer can be written to the update log.
+ *
+ * \param[in] env	execution environment
+ * \param[in] tmt	top_multiple_thandle for distribute txn
+ *
+ * \retval		0 if merging succeeds.
+ * \retval		negative errno if merging fails.
+ */
+static int prepare_writing_updates(const struct lu_env *env,
+				   struct top_multiple_thandle *tmt)
+{
+	struct thandle_update_records *tur = tmt->tmt_update_records;
+	struct llog_update_record *lur;
+	struct update_params *params;
+	size_t params_size;
+	size_t update_size;
+
+	if (tur == NULL || tur->tur_update_records == NULL ||
+	    tur->tur_update_params == NULL)
+		return 0;
+
+	lur = tur->tur_update_records;
+	/* Extend the update records buffer if needed */
+	params_size = update_params_size(tur->tur_update_params,
+					 tur->tur_update_param_count);
+	LASSERT(lur->lur_update_rec.ur_param_count == 0);
+	update_size = llog_update_record_size(lur);
+	if (cfs_size_round(update_size + params_size) >
+	    tur->tur_update_records_buf_size) {
+		int rc;
+
+		rc = tur_update_records_extend(tur,
+			cfs_size_round(update_size + params_size));
+		if (rc < 0)
+			return rc;
+
+		lur = tur->tur_update_records;
+	}
+
+	params = update_records_get_params(&lur->lur_update_rec);
+	memcpy(params, tur->tur_update_params, params_size);
+
+	lur->lur_update_rec.ur_param_count = tur->tur_update_param_count;
+	lur->lur_update_rec.ur_batchid = tmt->tmt_batchid;
+	/* Init update record header */
+	lur->lur_hdr.lrh_len = llog_update_record_size(lur);
+	lur->lur_hdr.lrh_type = UPDATE_REC;
+
+	/* Dump updates for debugging purposes */
+	update_records_dump(&lur->lur_update_rec, D_INFO, true);
+
+	return 0;
+}
+
+/**
+ * Top thandle commit callback
+ *
+ * This callback will be called when all of the sub transactions are
+ * committed.
+ *
+ * \param[in] tmt	top_multiple_thandle that has been committed.
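+ *
+ * Effect in brief (condensed from the body below): mark the whole
+ * distribute transaction committed and poke the commit thread so it can
+ * advance the committed batchid and cancel llog records:
+ *
+ *	tmt->tmt_committed = 1;
+ *	wake_up_process(lut->lut_tdtd->tdtd_commit_task);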
+ */ +static void top_trans_committed_cb(struct top_multiple_thandle *tmt) +{ + struct lu_target *lut; + ENTRY; + + LASSERT(atomic_read(&tmt->tmt_refcount) > 0); + + top_multiple_thandle_dump(tmt, D_HA); + tmt->tmt_committed = 1; + lut = dt2lu_dev(tmt->tmt_master_sub_dt)->ld_site->ls_tgt; + if (lut->lut_tdtd && lut->lut_tdtd->tdtd_commit_task) + wake_up_process(lut->lut_tdtd->tdtd_commit_task); + + RETURN_EXIT; +} + +struct sub_thandle *lookup_sub_thandle(struct top_multiple_thandle *tmt, + struct dt_device *dt_dev) +{ + struct sub_thandle *st; + + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { + if (st->st_dt == dt_dev) + return st; + } + return NULL; +} +EXPORT_SYMBOL(lookup_sub_thandle); + +struct sub_thandle *create_sub_thandle(struct top_multiple_thandle *tmt, + struct dt_device *dt_dev) +{ + struct sub_thandle *st; + + OBD_ALLOC_PTR(st); + if (st == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + INIT_LIST_HEAD(&st->st_sub_list); + INIT_LIST_HEAD(&st->st_cookie_list); + st->st_dt = dt_dev; + + list_add(&st->st_sub_list, &tmt->tmt_sub_thandle_list); + return st; +} + +static void sub_trans_commit_cb_internal(struct top_multiple_thandle *tmt, + struct thandle *sub_th, int err) +{ + struct sub_thandle *st; + bool all_committed = true; + + /* Check if all sub thandles are committed */ + spin_lock(&tmt->tmt_sub_lock); + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { + if (st->st_sub_th == sub_th) { + st->st_committed = 1; + st->st_result = err; + } + if (!st->st_committed) + all_committed = false; + } + spin_unlock(&tmt->tmt_sub_lock); + + if (tmt->tmt_result == 0) + tmt->tmt_result = err; + + if (all_committed) + top_trans_committed_cb(tmt); + + top_multiple_thandle_dump(tmt, D_INFO); + top_multiple_thandle_put(tmt); + RETURN_EXIT; +} + +/** + * sub thandle commit callback + * + * Mark the sub thandle to be committed and if all sub thandle are committed + * notify the top thandle. + * + * \param[in] env execution environment + * \param[in] sub_th sub thandle being committed + * \param[in] cb commit callback + * \param[in] err trans result + */ +static void sub_trans_commit_cb(struct lu_env *env, + struct thandle *sub_th, + struct dt_txn_commit_cb *cb, int err) +{ + struct top_multiple_thandle *tmt = cb->dcb_data; + + sub_trans_commit_cb_internal(tmt, sub_th, err); +} + +static void sub_thandle_register_commit_cb(struct sub_thandle *st, + struct top_multiple_thandle *tmt) +{ + LASSERT(st->st_sub_th != NULL); + top_multiple_thandle_get(tmt); + st->st_commit_dcb.dcb_func = sub_trans_commit_cb; + st->st_commit_dcb.dcb_data = tmt; + INIT_LIST_HEAD(&st->st_commit_dcb.dcb_linkage); + dt_trans_cb_add(st->st_sub_th, &st->st_commit_dcb); +} + +/** + * Sub thandle stop call back + * + * After sub thandle is stopped, it will call this callback to notify + * the top thandle. 
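+ *
+ * Registration sketch (this is what sub_thandle_register_stop_cb() below
+ * does when a sub transaction is created):
+ *
+ *	st->st_stop_dcb.dcb_func = sub_trans_stop_cb;
+ *	st->st_stop_dcb.dcb_data = tmt;
+ *	st->st_stop_dcb.dcb_flags = DCB_TRANS_STOP;
+ *	dt_trans_cb_add(st->st_sub_th, &st->st_stop_dcb);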
+ * + * \param[in] th sub thandle to be stopped + * \param[in] rc result of sub trans + */ +static void sub_trans_stop_cb(struct lu_env *env, + struct thandle *sub_th, + struct dt_txn_commit_cb *cb, int err) +{ + struct sub_thandle *st; + struct top_multiple_thandle *tmt = cb->dcb_data; + ENTRY; + + spin_lock(&tmt->tmt_sub_lock); + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { + if (st->st_stopped) + continue; + + if (st->st_dt == sub_th->th_dev) { + st->st_stopped = 1; + st->st_result = err; + break; + } + } + spin_unlock(&tmt->tmt_sub_lock); + + wake_up(&tmt->tmt_stop_waitq); + RETURN_EXIT; +} + +static void sub_thandle_register_stop_cb(struct sub_thandle *st, + struct top_multiple_thandle *tmt) +{ + st->st_stop_dcb.dcb_func = sub_trans_stop_cb; + st->st_stop_dcb.dcb_data = tmt; + st->st_stop_dcb.dcb_flags = DCB_TRANS_STOP; + INIT_LIST_HEAD(&st->st_stop_dcb.dcb_linkage); + dt_trans_cb_add(st->st_sub_th, &st->st_stop_dcb); +} + +/** + * Create sub thandle + * + * Create transaction handle for sub_thandle + * + * \param[in] env execution environment + * \param[in] th top thandle + * \param[in] st sub_thandle + * + * \retval 0 if creation succeeds. + * \retval negative errno if creation fails. + */ +int sub_thandle_trans_create(const struct lu_env *env, + struct top_thandle *top_th, + struct sub_thandle *st) +{ + struct thandle *sub_th; + + sub_th = dt_trans_create(env, st->st_dt); + if (IS_ERR(sub_th)) + return PTR_ERR(sub_th); + + sub_th->th_top = &top_th->tt_super; + st->st_sub_th = sub_th; + + sub_th->th_wait_submit = 1; + sub_thandle_register_stop_cb(st, top_th->tt_multiple_thandle); + return 0; +} + +/** + * Create the top transaction. + * + * Create the top transaction on the master device. It will create a top + * thandle and a sub thandle on the master device. + * + * \param[in] env execution environment + * \param[in] master_dev master_dev the top thandle will be created + * + * \retval pointer to the created thandle. + * \retval ERR_PTR(errno) if creation failed. + */ +struct thandle * +top_trans_create(const struct lu_env *env, struct dt_device *master_dev) +{ + struct top_thandle *top_th; + struct thandle *child_th; + + OBD_ALLOC_GFP(top_th, sizeof(*top_th), __GFP_IO); + if (top_th == NULL) + return ERR_PTR(-ENOMEM); + + top_th->tt_super.th_top = &top_th->tt_super; + + if (master_dev != NULL) { + child_th = dt_trans_create(env, master_dev); + if (IS_ERR(child_th)) { + OBD_FREE_PTR(top_th); + return child_th; + } + + child_th->th_top = &top_th->tt_super; + child_th->th_wait_submit = 1; + top_th->tt_master_sub_thandle = child_th; + } + return &top_th->tt_super; +} +EXPORT_SYMBOL(top_trans_create); + +/** + * Declare write update transaction + * + * Check if there are updates being recorded in this transaction, + * it will write the record into the disk. 
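+ *
+ * Shape of the declaration pass (condensed from the body below; every
+ * sub thandle that has a transaction gets the llog write declared on it):
+ *
+ *	list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) {
+ *		if (st->st_sub_th == NULL)
+ *			continue;
+ *		rc = sub_declare_updates_write(env, record, st->st_sub_th,
+ *					       tmt->tmt_record_size);
+ *		if (rc < 0)
+ *			break;
+ *	}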
+ * + * \param[in] env execution environment + * \param[in] tmt top multiple transaction handle + * + * \retval 0 if writing succeeds + * \retval negative errno if writing fails + */ +static int declare_updates_write(const struct lu_env *env, + struct top_multiple_thandle *tmt) +{ + struct llog_update_record *record; + struct sub_thandle *st; + int rc = 0; + + record = tmt->tmt_update_records->tur_update_records; + /* Declare update write for all other target */ + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { + if (st->st_sub_th == NULL) + continue; + + rc = sub_declare_updates_write(env, record, st->st_sub_th, + tmt->tmt_record_size); + if (rc < 0) + break; + } + + return rc; +} + +/** + * Assign batchid to the distribute transaction. + * + * Assign batchid to the distribute transaction + * + * \param[in] tmt distribute transaction + */ +static void distribute_txn_assign_batchid(struct top_multiple_thandle *new) +{ + struct target_distribute_txn_data *tdtd; + struct dt_device *dt = new->tmt_master_sub_dt; + struct sub_thandle *st; + + LASSERT(dt != NULL); + tdtd = dt2lu_dev(dt)->ld_site->ls_tgt->lut_tdtd; + spin_lock(&tdtd->tdtd_batchid_lock); + new->tmt_batchid = tdtd->tdtd_batchid++; + list_add_tail(&new->tmt_commit_list, &tdtd->tdtd_list); + spin_unlock(&tdtd->tdtd_batchid_lock); + list_for_each_entry(st, &new->tmt_sub_thandle_list, st_sub_list) { + if (st->st_sub_th != NULL) + sub_thandle_register_commit_cb(st, new); + } + top_multiple_thandle_get(new); + top_multiple_thandle_dump(new, D_INFO); +} + +/** + * Insert distribute transaction to the distribute txn list. + * + * Insert distribute transaction to the distribute txn list. + * + * \param[in] new the distribute txn to be inserted. + */ +void distribute_txn_insert_by_batchid(struct top_multiple_thandle *new) +{ + struct dt_device *dt = new->tmt_master_sub_dt; + struct top_multiple_thandle *tmt; + struct target_distribute_txn_data *tdtd; + struct sub_thandle *st; + bool at_head = false; + + LASSERT(dt != NULL); + tdtd = dt2lu_dev(dt)->ld_site->ls_tgt->lut_tdtd; + + spin_lock(&tdtd->tdtd_batchid_lock); + list_for_each_entry_reverse(tmt, &tdtd->tdtd_list, tmt_commit_list) { + if (new->tmt_batchid > tmt->tmt_batchid) { + list_add(&new->tmt_commit_list, &tmt->tmt_commit_list); + break; + } + } + if (list_empty(&new->tmt_commit_list)) { + at_head = true; + list_add(&new->tmt_commit_list, &tdtd->tdtd_list); + } + spin_unlock(&tdtd->tdtd_batchid_lock); + + list_for_each_entry(st, &new->tmt_sub_thandle_list, st_sub_list) { + if (st->st_sub_th != NULL) + sub_thandle_register_commit_cb(st, new); + } + + top_multiple_thandle_get(new); + top_multiple_thandle_dump(new, D_INFO); + if (new->tmt_committed && at_head && tdtd->tdtd_commit_task) + wake_up_process(tdtd->tdtd_commit_task); +} + +/** + * Prepare cross-MDT operation. + * + * Create the update record buffer to record updates for cross-MDT operation, + * add master sub transaction to tt_sub_trans_list, and declare the update + * writes. + * + * During updates packing, all of parameters will be packed in + * tur_update_params, and updates will be packed in tur_update_records. + * Then in transaction stop, parameters and updates will be merged + * into one updates buffer. + * + * And also master thandle will be added to the sub_th list, so it will be + * easy to track the commit status. + * + * \param[in] env execution environment + * \param[in] th top transaction handle + * + * \retval 0 if preparation succeeds. + * \retval negative errno if preparation fails. 
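+ *
+ * Lazy-init sketch (condensed from the body below): the update record
+ * buffer and the batchid are set up only on the first call for a given
+ * distribute transaction:
+ *
+ *	if (tmt->tmt_update_records == NULL) {
+ *		tmt->tmt_update_records = tur;
+ *		distribute_txn_assign_batchid(tmt);
+ *	}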
+ */ +static int prepare_multiple_node_trans(const struct lu_env *env, + struct top_multiple_thandle *tmt) +{ + struct thandle_update_records *tur; + int rc; + ENTRY; + + if (tmt->tmt_update_records == NULL) { + tur = &update_env_info(env)->uti_tur; + rc = check_and_prepare_update_record(env, tur); + if (rc < 0) + RETURN(rc); + + tmt->tmt_update_records = tur; + distribute_txn_assign_batchid(tmt); + } + + rc = declare_updates_write(env, tmt); + + RETURN(rc); +} + +/** + * start the top transaction. + * + * Start all of its sub transactions, then start master sub transaction. + * + * \param[in] env execution environment + * \param[in] master_dev master_dev the top thandle will be start + * \param[in] th top thandle + * + * \retval 0 if transaction start succeeds. + * \retval negative errno if start fails. + */ +int top_trans_start(const struct lu_env *env, struct dt_device *master_dev, + struct thandle *th) +{ + struct top_thandle *top_th = container_of(th, struct top_thandle, + tt_super); + struct sub_thandle *st; + struct top_multiple_thandle *tmt = top_th->tt_multiple_thandle; + int rc = 0; + ENTRY; + + if (tmt == NULL) { + if (th->th_sync) + top_th->tt_master_sub_thandle->th_sync = th->th_sync; + if (th->th_local) + top_th->tt_master_sub_thandle->th_local = th->th_local; + rc = dt_trans_start(env, top_th->tt_master_sub_thandle->th_dev, + top_th->tt_master_sub_thandle); + RETURN(rc); + } + + tmt = top_th->tt_multiple_thandle; + rc = prepare_multiple_node_trans(env, tmt); + if (rc < 0) + RETURN(rc); + + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { + if (st->st_sub_th == NULL) + continue; + if (th->th_sync) + st->st_sub_th->th_sync = th->th_sync; + if (th->th_local) + st->st_sub_th->th_local = th->th_local; + rc = dt_trans_start(env, st->st_sub_th->th_dev, + st->st_sub_th); + if (rc != 0) + GOTO(out, rc); + + LASSERT(st->st_started == 0); + st->st_started = 1; + } +out: + th->th_result = rc; + RETURN(rc); +} +EXPORT_SYMBOL(top_trans_start); + +/** + * Check whether we need write updates record + * + * Check if the updates for the top_thandle needs to be writen + * to all targets. Only if the transaction succeeds and the updates + * number > 2, it will write the updates, + * + * \params [in] top_th top thandle. + * + * \retval true if it needs to write updates + * \retval false if it does not need to write updates + **/ +static bool top_check_write_updates(struct top_thandle *top_th) +{ + struct top_multiple_thandle *tmt; + struct thandle_update_records *tur; + + /* Do not write updates to records if the transaction fails */ + if (top_th->tt_super.th_result != 0) + return false; + + tmt = top_th->tt_multiple_thandle; + if (tmt == NULL) + return false; + + tur = tmt->tmt_update_records; + if (tur == NULL) + return false; + + /* Hmm, false update records, since the cross-MDT operation + * should includes both local and remote updates, so the + * updates count should >= 2 */ + if (tur->tur_update_records == NULL || + tur->tur_update_records->lur_update_rec.ur_update_count <= 1) + return false; + + return true; +} + +/** + * Check if top transaction is stopped + * + * Check if top transaction is stopped, only if all sub transaction + * is stopped, then the top transaction is stopped. + * + * \param [in] top_th top thandle + * + * \retval true if the top transaction is stopped. + * \retval false if the top transaction is not stopped. 
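+ *
+ * Intended use (as in top_trans_wait_result() below):
+ *
+ *	wait_event_idle(top_th->tt_multiple_thandle->tmt_stop_waitq,
+ *			top_trans_is_stopped(top_th));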
+ */ +static bool top_trans_is_stopped(struct top_thandle *top_th) +{ + struct top_multiple_thandle *tmt; + struct sub_thandle *st; + bool all_stopped = true; + + tmt = top_th->tt_multiple_thandle; + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { + if (!st->st_stopped && st->st_sub_th != NULL) { + all_stopped = false; + break; + } + + if (st->st_result != 0 && + top_th->tt_super.th_result == 0) + top_th->tt_super.th_result = st->st_result; + } + + return all_stopped; +} + +/** + * Wait result of top transaction + * + * Wait until all sub transaction get its result. + * + * \param [in] top_th top thandle. + * + * \retval the result of top thandle. + */ +static int top_trans_wait_result(struct top_thandle *top_th) +{ + wait_event_idle(top_th->tt_multiple_thandle->tmt_stop_waitq, + top_trans_is_stopped(top_th)); + + RETURN(top_th->tt_super.th_result); +} + +/** + * Stop the top transaction. + * + * Stop the transaction on the master device first, then stop transactions + * on other sub devices. + * + * \param[in] env execution environment + * \param[in] master_dev master_dev the top thandle will be created + * \param[in] th top thandle + * + * \retval 0 if stop transaction succeeds. + * \retval negative errno if stop transaction fails. + */ +int top_trans_stop(const struct lu_env *env, struct dt_device *master_dev, + struct thandle *th) +{ + struct top_thandle *top_th = container_of(th, struct top_thandle, + tt_super); + struct sub_thandle *st; + struct sub_thandle *master_st; + struct top_multiple_thandle *tmt; + struct thandle_update_records *tur; + bool write_updates = false; + int rc = 0; + ENTRY; + + if (likely(top_th->tt_multiple_thandle == NULL)) { + LASSERT(master_dev != NULL); + + if (th->th_sync) + top_th->tt_master_sub_thandle->th_sync = th->th_sync; + if (th->th_local) + top_th->tt_master_sub_thandle->th_local = th->th_local; + rc = dt_trans_stop(env, master_dev, + top_th->tt_master_sub_thandle); + OBD_FREE_PTR(top_th); + RETURN(rc); + } + + tmt = top_th->tt_multiple_thandle; + tur = tmt->tmt_update_records; + + /* Note: we need stop the master thandle first, then the stop + * callback will fill the master transno in the update logs, + * then these update logs will be sent to other MDTs */ + /* get the master sub thandle */ + master_st = lookup_sub_thandle(tmt, tmt->tmt_master_sub_dt); + write_updates = top_check_write_updates(top_th); + + /* Step 1: write the updates log on Master MDT */ + if (master_st != NULL && master_st->st_sub_th != NULL && + write_updates) { + struct llog_update_record *lur; + + /* Merge the parameters and updates into one buffer */ + rc = prepare_writing_updates(env, tmt); + if (rc < 0) { + CERROR("%s: cannot prepare updates: rc = %d\n", + master_dev->dd_lu_dev.ld_obd->obd_name, rc); + th->th_result = rc; + write_updates = false; + GOTO(stop_master_trans, rc); + } + + lur = tur->tur_update_records; + /* Write updates to the master MDT */ + rc = sub_updates_write(env, lur, master_st); + + /* Cleanup the common parameters in the update records, + * master transno callback might add more parameters. 
+		 * and we need to merge the update records again in the
+		 * following */
+		if (tur->tur_update_params != NULL)
+			lur->lur_update_rec.ur_param_count = 0;
+
+		if (rc < 0) {
+			CERROR("%s: write updates failed: rc = %d\n",
+			       master_dev->dd_lu_dev.ld_obd->obd_name, rc);
+			th->th_result = rc;
+			write_updates = false;
+			GOTO(stop_master_trans, rc);
+		}
+	}
+
+stop_master_trans:
+	/* Step 2: Stop the transaction on the master MDT, and fill the
+	 * master transno in the update logs to other MDTs. */
+	if (master_st != NULL && master_st->st_sub_th != NULL) {
+		if (th->th_local)
+			master_st->st_sub_th->th_local = th->th_local;
+		if (th->th_sync)
+			master_st->st_sub_th->th_sync = th->th_sync;
+		master_st->st_sub_th->th_result = th->th_result;
+		rc = dt_trans_stop(env, master_st->st_dt, master_st->st_sub_th);
+		/* If it does not write_updates, then we call submit callback
+		 * here, otherwise callback is done through
+		 * osd(osp)_trans_commit_cb() */
+		if (!master_st->st_started &&
+		    !list_empty(&tmt->tmt_commit_list))
+			sub_trans_commit_cb_internal(tmt,
+						     master_st->st_sub_th, rc);
+		if (rc < 0) {
+			CERROR("%s: stop trans failed: rc = %d\n",
+			       master_dev->dd_lu_dev.ld_obd->obd_name, rc);
+			th->th_result = rc;
+			GOTO(stop_other_trans, rc);
+		} else if (tur != NULL && tur->tur_update_records != NULL) {
+			struct llog_update_record *lur;
+
+			lur = tur->tur_update_records;
+			if (lur->lur_update_rec.ur_master_transno == 0)
+				/* Update master transno after master stop
+				 * callback */
+				lur->lur_update_rec.ur_master_transno =
+					tgt_th_info(env)->tti_transno;
+		}
+	}
+
+	/* Step 3: write updates to other MDTs */
+	if (write_updates) {
+		struct llog_update_record *lur;
+
+		if (OBD_FAIL_PRECHECK(OBD_FAIL_OUT_OBJECT_MISS)) {
+			if (cfs_fail_val == 1) {
+				long timeout = cfs_time_seconds(1) / 10;
+
+				OBD_RACE(OBD_FAIL_OUT_OBJECT_MISS);
+				set_current_state(TASK_UNINTERRUPTIBLE);
+				schedule_timeout(timeout);
+				cfs_fail_loc = 0;
+			}
+			cfs_fail_val++;
+		}
+
+		/* Stop callback of master will add more updates and also update
+		 * master transno, so merge the parameters and updates into one
+		 * buffer again */
+		rc = prepare_writing_updates(env, tmt);
+		if (rc < 0) {
+			CERROR("%s: prepare updates failed: rc = %d\n",
+			       master_dev->dd_lu_dev.ld_obd->obd_name, rc);
+			th->th_result = rc;
+			GOTO(stop_other_trans, rc);
+		}
+		lur = tur->tur_update_records;
+		list_for_each_entry(st, &tmt->tmt_sub_thandle_list,
+				    st_sub_list) {
+			if (st->st_sub_th == NULL || st == master_st ||
+			    st->st_sub_th->th_result < 0)
+				continue;
+
+			rc = sub_updates_write(env, lur, st);
+			if (rc < 0) {
+				CERROR("%s: write updates failed: rc = %d\n",
+				       st->st_dt->dd_lu_dev.ld_obd->obd_name,
+				       rc);
+				th->th_result = rc;
+				break;
+			}
+		}
+	}
+
+stop_other_trans:
+	/* Step 4: Stop the transaction on other MDTs */
+	list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) {
+		if (st == master_st || st->st_sub_th == NULL)
+			continue;
+
+		if (th->th_sync)
+			st->st_sub_th->th_sync = th->th_sync;
+		if (th->th_local)
+			st->st_sub_th->th_local = th->th_local;
+		st->st_sub_th->th_result = th->th_result;
+		rc = dt_trans_stop(env, st->st_sub_th->th_dev,
+				   st->st_sub_th);
+		if (rc < 0) {
+			CERROR("%s: stop trans failed: rc = %d\n",
+			       st->st_dt->dd_lu_dev.ld_obd->obd_name, rc);
+			if (th->th_result == 0)
+				th->th_result = rc;
+		}
+	}
+
+	rc = top_trans_wait_result(top_th);
+
+	tmt->tmt_result = rc;
+
+	/* Balance for the refcount in top_trans_create; note: if it is NOT a
+	 * multiple node transaction, the top transaction will be destroyed. */
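+	/* Lifetime sketch (hedged summary, names from this file):
+	 *
+	 *	top_trans_create_tmt():			refcount = 1
+	 *	sub_thandle_register_commit_cb():	refcount++
+	 *	sub_trans_commit_cb_internal():		refcount--
+	 *	top_multiple_thandle_put() below:	final put may free tmt
+	 */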
+	top_multiple_thandle_put(tmt);
+	OBD_FREE_PTR(top_th);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(top_trans_stop);
+
+/**
+ * Create top_multiple_thandle for top_thandle
+ *
+ * Create top_multiple_thandle to manage the multiple node transaction
+ * for top_thandle, and it also needs to add master sub thandle to the
+ * sub trans list now.
+ *
+ * \param[in] env	execution environment
+ * \param[in] top_th	the top thandle
+ *
+ * \retval		0 if creation succeeds
+ * \retval		negative errno if creation fails
+ */
+int top_trans_create_tmt(const struct lu_env *env,
+			 struct top_thandle *top_th)
+{
+	struct top_multiple_thandle *tmt;
+
+	OBD_ALLOC_PTR(tmt);
+	if (tmt == NULL)
+		return -ENOMEM;
+
+	tmt->tmt_magic = TOP_THANDLE_MAGIC;
+	INIT_LIST_HEAD(&tmt->tmt_sub_thandle_list);
+	INIT_LIST_HEAD(&tmt->tmt_commit_list);
+	atomic_set(&tmt->tmt_refcount, 1);
+	spin_lock_init(&tmt->tmt_sub_lock);
+	init_waitqueue_head(&tmt->tmt_stop_waitq);
+
+	top_th->tt_multiple_thandle = tmt;
+
+	return 0;
+}
+
+static struct sub_thandle *
+create_sub_thandle_with_thandle(struct top_thandle *top_th,
+				struct thandle *sub_th)
+{
+	struct sub_thandle *st;
+
+	/* create and init sub th to the top trans list */
+	st = create_sub_thandle(top_th->tt_multiple_thandle,
+				sub_th->th_dev);
+	if (IS_ERR(st))
+		return st;
+
+	st->st_sub_th = sub_th;
+
+	sub_th->th_top = &top_th->tt_super;
+	sub_thandle_register_stop_cb(st, top_th->tt_multiple_thandle);
+	return st;
+}
+
+/**
+ * Get sub thandle.
+ *
+ * Get sub thandle from the top thandle according to the sub dt_device.
+ *
+ * \param[in] env	execution environment
+ * \param[in] th	thandle on the top layer.
+ * \param[in] sub_dt	sub dt_device used to get sub transaction
+ *
+ * \retval		thandle of sub transaction if it succeeds
+ * \retval		PTR_ERR(errno) if it fails
+ */
+struct thandle *thandle_get_sub_by_dt(const struct lu_env *env,
+				      struct thandle *th,
+				      struct dt_device *sub_dt)
+{
+	struct sub_thandle *st = NULL;
+	struct sub_thandle *master_st = NULL;
+	struct top_thandle *top_th;
+	struct thandle *sub_th = NULL;
+	int rc = 0;
+	ENTRY;
+
+	top_th = container_of(th, struct top_thandle, tt_super);
+
+	if (likely(sub_dt == top_th->tt_master_sub_thandle->th_dev))
+		RETURN(top_th->tt_master_sub_thandle);
+
+	if (top_th->tt_multiple_thandle != NULL) {
+		st = lookup_sub_thandle(top_th->tt_multiple_thandle, sub_dt);
+		if (st != NULL)
+			RETURN(st->st_sub_th);
+	}
+
+	sub_th = dt_trans_create(env, sub_dt);
+	if (IS_ERR(sub_th))
+		RETURN(sub_th);
+
+	/* Create top_multiple_thandle if necessary */
+	if (top_th->tt_multiple_thandle == NULL) {
+		struct top_multiple_thandle *tmt;
+
+		rc = top_trans_create_tmt(env, top_th);
+		if (rc < 0)
+			GOTO(stop_trans, rc);
+
+		tmt = top_th->tt_multiple_thandle;
+
+		/* Add master sub th to the top trans list */
+		tmt->tmt_master_sub_dt =
+			top_th->tt_master_sub_thandle->th_dev;
+		master_st = create_sub_thandle_with_thandle(top_th,
+				top_th->tt_master_sub_thandle);
+		if (IS_ERR(master_st)) {
+			rc = PTR_ERR(master_st);
+			master_st = NULL;
+			GOTO(stop_trans, rc);
+		}
+	}
+
+	/* create and init sub th to the top trans list */
+	st = create_sub_thandle_with_thandle(top_th, sub_th);
+	if (IS_ERR(st)) {
+		rc = PTR_ERR(st);
+		st = NULL;
+		GOTO(stop_trans, rc);
+	}
+	st->st_sub_th->th_wait_submit = 1;
+stop_trans:
+	if (rc < 0) {
+		if (master_st != NULL) {
+			list_del(&master_st->st_sub_list);
+			OBD_FREE_PTR(master_st);
+		}
+		sub_th->th_result = rc;
+		dt_trans_stop(env, sub_dt, sub_th);
+		sub_th = ERR_PTR(rc);
+	}
+
+	RETURN(sub_th);
+}
+EXPORT_SYMBOL(thandle_get_sub_by_dt); + +/** + * Top multiple thandle destroy + * + * Destroy multiple thandle and all its sub thandle. + * + * \param[in] tmt top_multiple_thandle to be destroyed. + */ +void top_multiple_thandle_destroy(struct top_multiple_thandle *tmt) +{ + struct sub_thandle *st; + struct sub_thandle *tmp; + + LASSERT(tmt->tmt_magic == TOP_THANDLE_MAGIC); + list_for_each_entry_safe(st, tmp, &tmt->tmt_sub_thandle_list, + st_sub_list) { + struct sub_thandle_cookie *stc; + struct sub_thandle_cookie *tmp; + + list_del(&st->st_sub_list); + list_for_each_entry_safe(stc, tmp, &st->st_cookie_list, + stc_list) { + list_del(&stc->stc_list); + OBD_FREE_PTR(stc); + } + OBD_FREE_PTR(st); + } + OBD_FREE_PTR(tmt); +} +EXPORT_SYMBOL(top_multiple_thandle_destroy); + +/** + * Cancel the update log on MDTs + * + * Cancel the update log on MDTs then destroy the thandle. + * + * \param[in] env execution environment + * \param[in] tmt the top multiple thandle whose updates records + * will be cancelled. + * + * \retval 0 if cancellation succeeds. + * \retval negative errno if cancellation fails. + */ +static int distribute_txn_cancel_records(const struct lu_env *env, + struct top_multiple_thandle *tmt) +{ + struct sub_thandle *st; + ENTRY; + + if (OBD_FAIL_CHECK(OBD_FAIL_TGT_TXN_NO_CANCEL)) + RETURN(0); + + top_multiple_thandle_dump(tmt, D_INFO); + /* Cancel update logs on other MDTs */ + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { + struct llog_ctxt *ctxt; + struct obd_device *obd; + struct llog_cookie *cookie; + struct sub_thandle_cookie *stc; + int rc; + + obd = st->st_dt->dd_lu_dev.ld_obd; + ctxt = llog_get_context(obd, LLOG_UPDATELOG_ORIG_CTXT); + if (ctxt == NULL) + continue; + list_for_each_entry(stc, &st->st_cookie_list, stc_list) { + cookie = &stc->stc_cookie; + if (fid_is_zero(&cookie->lgc_lgl.lgl_oi.oi_fid)) + continue; + + rc = llog_cat_cancel_records(env, ctxt->loc_handle, 1, + cookie); + CDEBUG(D_HA, "%s: batchid %llu cancel update log " + DFID".%u: rc = %d\n", obd->obd_name, + tmt->tmt_batchid, + PFID(&cookie->lgc_lgl.lgl_oi.oi_fid), + cookie->lgc_index, rc); + } + + llog_ctxt_put(ctxt); + } + + RETURN(0); +} + +struct distribute_txn_bid_data { + struct dt_txn_commit_cb dtbd_cb; + struct target_distribute_txn_data *dtbd_tdtd; + __u64 dtbd_batchid; +}; + +/** + * callback of updating commit batchid + * + * Updating commit batchid then wake up the commit thread to cancel the + * records. 
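+ *
+ * Advance rule in brief (condensed from the body below): the committed
+ * batchid only moves forward, and never while transnos are disabled:
+ *
+ *	if (dtbd->dtbd_batchid > tdtd->tdtd_committed_batchid &&
+ *	    !tdtd->tdtd_lut->lut_obd->obd_no_transno)
+ *		tdtd->tdtd_committed_batchid = dtbd->dtbd_batchid;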
+ * + * \param[in]env execution environment + * \param[in]th thandle to updating commit batchid + * \param[in]cb commit callback + * \param[in]err result of thandle + */ +static void distribute_txn_batchid_cb(struct lu_env *env, + struct thandle *th, + struct dt_txn_commit_cb *cb, + int err) +{ + struct distribute_txn_bid_data *dtbd = NULL; + struct target_distribute_txn_data *tdtd; + + dtbd = container_of(cb, struct distribute_txn_bid_data, dtbd_cb); + tdtd = dtbd->dtbd_tdtd; + + CDEBUG(D_HA, "%s: %llu batchid updated\n", + tdtd->tdtd_lut->lut_obd->obd_name, dtbd->dtbd_batchid); + spin_lock(&tdtd->tdtd_batchid_lock); + if (dtbd->dtbd_batchid > tdtd->tdtd_committed_batchid && + !tdtd->tdtd_lut->lut_obd->obd_no_transno) + tdtd->tdtd_committed_batchid = dtbd->dtbd_batchid; + spin_unlock(&tdtd->tdtd_batchid_lock); + if (atomic_dec_and_test(&tdtd->tdtd_refcount)) + wake_up_process(tdtd->tdtd_commit_task); + + OBD_FREE_PTR(dtbd); +} + +/** + * Update the commit batchid in disk + * + * Update commit batchid in the disk, after this is committed, it can start + * to cancel the update records. + * + * \param[in] env execution environment + * \param[in] tdtd distribute transaction structure + * \param[in] batchid commit batchid to be updated + * + * \retval 0 if update succeeds. + * \retval negative errno if update fails. + */ +static int +distribute_txn_commit_batchid_update(const struct lu_env *env, + struct target_distribute_txn_data *tdtd, + __u64 batchid) +{ + struct distribute_txn_bid_data *dtbd = NULL; + struct thandle *th; + struct lu_buf buf; + __u64 tmp; + __u64 off; + int rc; + ENTRY; + + OBD_ALLOC_PTR(dtbd); + if (dtbd == NULL) + RETURN(-ENOMEM); + dtbd->dtbd_batchid = batchid; + dtbd->dtbd_tdtd = tdtd; + dtbd->dtbd_cb.dcb_func = distribute_txn_batchid_cb; + atomic_inc(&tdtd->tdtd_refcount); + + th = dt_trans_create(env, tdtd->tdtd_lut->lut_bottom); + if (IS_ERR(th)) { + atomic_dec(&tdtd->tdtd_refcount); + OBD_FREE_PTR(dtbd); + RETURN(PTR_ERR(th)); + } + + tmp = cpu_to_le64(batchid); + buf.lb_buf = &tmp; + buf.lb_len = sizeof(tmp); + off = 0; + + rc = dt_declare_record_write(env, tdtd->tdtd_batchid_obj, &buf, off, + th); + if (rc < 0) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, tdtd->tdtd_lut->lut_bottom, th); + if (rc < 0) + GOTO(stop, rc); + + rc = dt_trans_cb_add(th, &dtbd->dtbd_cb); + if (rc < 0) + GOTO(stop, rc); + + rc = dt_record_write(env, tdtd->tdtd_batchid_obj, &buf, + &off, th); + + CDEBUG(D_INFO, "%s: update batchid %llu: rc = %d\n", + tdtd->tdtd_lut->lut_obd->obd_name, batchid, rc); + +stop: + dt_trans_stop(env, tdtd->tdtd_lut->lut_bottom, th); + if (rc < 0) { + atomic_dec(&tdtd->tdtd_refcount); + OBD_FREE_PTR(dtbd); + } + RETURN(rc); +} + +/** + * Init commit batchid for distribute transaction. + * + * Initialize the batchid object and get commit batchid from the object. + * + * \param[in] env execution environment + * \param[in] tdtd distribute transaction whose batchid is initialized. + * + * \retval 0 if initialization succeeds. + * \retval negative errno if initialization fails. 
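+ *
+ * On-disk layout sketch (matching the read-back in the body below): the
+ * batchid object holds one little-endian __u64 at offset 0:
+ *
+ *	rc = dt_read(env, dt_obj, &buf, &off);
+ *	if (rc == sizeof(__u64))
+ *		tdtd->tdtd_committed_batchid = le64_to_cpu(tmp);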
+ */
+static int
+distribute_txn_commit_batchid_init(const struct lu_env *env,
+				   struct target_distribute_txn_data *tdtd)
+{
+	struct tgt_thread_info *tti = tgt_th_info(env);
+	struct lu_target *lut = tdtd->tdtd_lut;
+	struct lu_attr *attr = &tti->tti_attr;
+	struct lu_fid *fid = &tti->tti_fid1;
+	struct dt_object_format *dof = &tti->tti_u.update.tti_update_dof;
+	struct dt_object *dt_obj = NULL;
+	struct lu_buf buf;
+	__u64 tmp;
+	__u64 off;
+	int rc;
+	ENTRY;
+
+	memset(attr, 0, sizeof(*attr));
+	attr->la_valid = LA_MODE;
+	attr->la_mode = S_IFREG | S_IRUGO | S_IWUSR;
+	dof->dof_type = dt_mode_to_dft(S_IFREG);
+
+	lu_local_obj_fid(fid, BATCHID_COMMITTED_OID);
+
+	dt_obj = dt_find_or_create(env, lut->lut_bottom, fid, dof,
+				   attr);
+	if (IS_ERR(dt_obj)) {
+		rc = PTR_ERR(dt_obj);
+		dt_obj = NULL;
+		GOTO(out_put, rc);
+	}
+
+	tdtd->tdtd_batchid_obj = dt_obj;
+
+	buf.lb_buf = &tmp;
+	buf.lb_len = sizeof(tmp);
+	off = 0;
+	rc = dt_read(env, dt_obj, &buf, &off);
+	if (rc < 0 || (rc < buf.lb_len && rc > 0)) {
+		CERROR("%s can't read last committed batchid: rc = %d\n",
+		       tdtd->tdtd_lut->lut_obd->obd_name, rc);
+		if (rc > 0)
+			rc = -EINVAL;
+		GOTO(out_put, rc);
+	} else if (rc == buf.lb_len) {
+		tdtd->tdtd_committed_batchid = le64_to_cpu(tmp);
+		CDEBUG(D_HA, "%s: committed batchid %llu\n",
+		       tdtd->tdtd_lut->lut_obd->obd_name,
+		       tdtd->tdtd_committed_batchid);
+		rc = 0;
+	}
+
+out_put:
+	if (rc < 0 && dt_obj != NULL) {
+		dt_object_put(env, dt_obj);
+		tdtd->tdtd_batchid_obj = NULL;
+	}
+	return rc;
+}
+
+#ifndef TASK_IDLE
+#define TASK_IDLE TASK_INTERRUPTIBLE
+#endif
+
+/**
+ * Manage the distribute transaction thread
+ *
+ * Distribute transactions are linked to the list, and once a distribute
+ * transaction is committed, it will update the last committed batchid first;
+ * after that update is committed, it will cancel the records.
+ *
+ * \param[in] _arg	argument for commit thread
+ *
+ * \retval		0 if thread is running successfully
+ * \retval		negative errno if the thread can not be run.
+ */
+static int distribute_txn_commit_thread(void *_arg)
+{
+	struct target_distribute_txn_data *tdtd = _arg;
+	struct lu_env *env = &tdtd->tdtd_env;
+	LIST_HEAD(list);
+	int rc;
+	struct top_multiple_thandle *tmt;
+	struct top_multiple_thandle *tmp;
+	__u64 batchid = 0, committed;
+
+	ENTRY;
+
+	CDEBUG(D_HA, "%s: start commit thread committed batchid %llu\n",
+	       tdtd->tdtd_lut->lut_obd->obd_name,
+	       tdtd->tdtd_committed_batchid);
+
+	while (({set_current_state(TASK_IDLE);
+		 !kthread_should_stop(); })) {
+		spin_lock(&tdtd->tdtd_batchid_lock);
+		list_for_each_entry_safe(tmt, tmp, &tdtd->tdtd_list,
+					 tmt_commit_list) {
+			if (tmt->tmt_committed == 0)
+				break;
+
+			/* Note: right now, replay is based on master MDT
+			 * transno, but cancellation is based on batchid,
+			 * so we do not try to cancel the update log until
+			 * the recovery is done, unless the update records
+			 * batchid < committed_batchid. */
+			if (tmt->tmt_batchid <= tdtd->tdtd_committed_batchid) {
+				__set_current_state(TASK_RUNNING);
+				list_move_tail(&tmt->tmt_commit_list, &list);
+			} else if (!tdtd->tdtd_lut->lut_obd->obd_recovering) {
+				__set_current_state(TASK_RUNNING);
+				LASSERTF(tmt->tmt_batchid >= batchid,
+					 "tmt %p tmt_batchid: %llu, batchid "
+					 "%llu\n", tmt, tmt->tmt_batchid,
+					 batchid);
+				/* There are three types of distribute
+				 * transaction result
+				 *
+				 * 1. If tmt_result < 0, it means the
+				 * distribute transaction fails, which should
+				 * be rare, because once declare phase succeeds,
+				 * the operation should succeed anyway.
+				 * Note in this case, we will still update
+				 * batchid so cancellation would be stopped.
+				 *
+				 * 2. If tmt_result == 0, it means the
+				 * distribute transaction succeeds, and we
+				 * will update batchid.
+				 *
+				 * 3. If tmt_result > 0, it means the distribute
+				 * transaction is not yet committed on every
+				 * node, but we need to release this tmt before
+				 * that, which usually happens during umount.
+				 */
+				if (tmt->tmt_result <= 0)
+					batchid = tmt->tmt_batchid;
+				list_move_tail(&tmt->tmt_commit_list, &list);
+			}
+		}
+		spin_unlock(&tdtd->tdtd_batchid_lock);
+
+		CDEBUG(D_HA, "%s: batchid: %llu committed batchid "
+		       "%llu\n", tdtd->tdtd_lut->lut_obd->obd_name, batchid,
+		       tdtd->tdtd_committed_batchid);
+		/* update globally committed on a storage */
+		if (batchid > tdtd->tdtd_committed_batchid) {
+			rc = distribute_txn_commit_batchid_update(env, tdtd,
+								  batchid);
+			if (rc == 0)
+				batchid = 0;
+		}
+		/* cancel the records for committed batchids */
+		/* XXX: should we postpone cancels till the end of recovery? */
+		committed = tdtd->tdtd_committed_batchid;
+		list_for_each_entry_safe(tmt, tmp, &list, tmt_commit_list) {
+			if (tmt->tmt_batchid > committed)
+				break;
+			__set_current_state(TASK_RUNNING);
+			list_del_init(&tmt->tmt_commit_list);
+			if (tmt->tmt_result <= 0)
+				distribute_txn_cancel_records(env, tmt);
+			top_multiple_thandle_put(tmt);
+		}
+
+		if (!task_is_running(current))
+			schedule();
+
+		if (OBD_FAIL_PRECHECK(OBD_FAIL_OUT_OBJECT_MISS)) {
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule_timeout(cfs_time_seconds(5));
+		}
+	}
+
+	while (({set_current_state(TASK_IDLE);
+		 atomic_read(&tdtd->tdtd_refcount) != 0; }))
+		schedule();
+	__set_current_state(TASK_RUNNING);
+
+	spin_lock(&tdtd->tdtd_batchid_lock);
+	list_for_each_entry_safe(tmt, tmp, &tdtd->tdtd_list,
+				 tmt_commit_list)
+		list_move_tail(&tmt->tmt_commit_list, &list);
+	spin_unlock(&tdtd->tdtd_batchid_lock);
+
+	CDEBUG(D_INFO, "%s stopping distribute txn commit thread.\n",
+	       tdtd->tdtd_lut->lut_obd->obd_name);
+	list_for_each_entry_safe(tmt, tmp, &list, tmt_commit_list) {
+		list_del_init(&tmt->tmt_commit_list);
+		top_multiple_thandle_dump(tmt, D_HA);
+		top_multiple_thandle_put(tmt);
+	}
+	RETURN(0);
+}
+
+/**
+ * Start llog cancel thread
+ *
+ * Start llog cancel(master/slave) thread on LOD
+ *
+ * \param[in] lclt	cancel log thread to be started.
+ *
+ * \retval		0 if the thread is started successfully.
+ * \retval		negative errno if the thread could not be
+ *			started.
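+ *
+ * Startup order in brief (condensed from the body below): load the last
+ * committed batchid from disk, then create and kick the commit thread:
+ *
+ *	rc = distribute_txn_commit_batchid_init(env, tdtd);
+ *	task = kthread_create(distribute_txn_commit_thread, tdtd,
+ *			      "dist_txn-%u", index);
+ *	wake_up_process(task);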
+ */ +int distribute_txn_init(const struct lu_env *env, + struct lu_target *lut, + struct target_distribute_txn_data *tdtd, + __u32 index) +{ + struct task_struct *task; + int rc; + ENTRY; + + INIT_LIST_HEAD(&tdtd->tdtd_list); + INIT_LIST_HEAD(&tdtd->tdtd_replay_finish_list); + INIT_LIST_HEAD(&tdtd->tdtd_replay_list); + spin_lock_init(&tdtd->tdtd_batchid_lock); + spin_lock_init(&tdtd->tdtd_replay_list_lock); + tdtd->tdtd_replay_handler = distribute_txn_replay_handle; + tdtd->tdtd_replay_ready = 0; + + tdtd->tdtd_batchid = lut->lut_last_transno + 1; + + init_waitqueue_head(&tdtd->tdtd_recovery_threads_waitq); + atomic_set(&tdtd->tdtd_refcount, 0); + atomic_set(&tdtd->tdtd_recovery_threads_count, 0); + + tdtd->tdtd_lut = lut; + if (lut->lut_bottom->dd_rdonly) + RETURN(0); + + rc = distribute_txn_commit_batchid_init(env, tdtd); + if (rc != 0) + RETURN(rc); + + rc = lu_env_init(&tdtd->tdtd_env, LCT_LOCAL | LCT_MD_THREAD); + if (rc) + RETURN(rc); + + task = kthread_create(distribute_txn_commit_thread, tdtd, "dist_txn-%u", + index); + if (IS_ERR(task)) { + lu_env_fini(&tdtd->tdtd_env); + RETURN(PTR_ERR(task)); + } + tdtd->tdtd_commit_task = task; + wake_up_process(task); + + RETURN(0); +} +EXPORT_SYMBOL(distribute_txn_init); + +/** + * Stop llog cancel thread + * + * Stop llog cancel(master/slave) thread on LOD and also destory + * all of transaction in the list. + * + * \param[in]lclt cancel log thread to be stopped. + */ +void distribute_txn_fini(const struct lu_env *env, + struct target_distribute_txn_data *tdtd) +{ + struct top_multiple_thandle *tmt; + LIST_HEAD(list); + + /* Stop cancel thread */ + if (!tdtd->tdtd_commit_task) + return; + + kthread_stop(tdtd->tdtd_commit_task); + tdtd->tdtd_commit_task = NULL; + + spin_lock(&tdtd->tdtd_batchid_lock); + list_splice_init(&tdtd->tdtd_list, &list); + spin_unlock(&tdtd->tdtd_batchid_lock); + + CDEBUG(D_INFO, "%s stopping distribute txn commit thread.\n", + tdtd->tdtd_lut->lut_obd->obd_name); + while ((tmt = list_first_entry_or_null(&list, + struct top_multiple_thandle, + tmt_commit_list)) != NULL) { + list_del_init(&tmt->tmt_commit_list); + top_multiple_thandle_dump(tmt, D_HA); + top_multiple_thandle_put(tmt); + } + + lu_env_fini(&tdtd->tdtd_env); + + dtrq_list_destroy(tdtd); + if (tdtd->tdtd_batchid_obj != NULL) { + dt_object_put(env, tdtd->tdtd_batchid_obj); + tdtd->tdtd_batchid_obj = NULL; + } +} +EXPORT_SYMBOL(distribute_txn_fini); diff --git a/drivers/staging/lustrefsx/undef.h b/drivers/staging/lustrefsx/undef.h new file mode 100644 index 0000000000000..324c3eb478bdb --- /dev/null +++ b/drivers/staging/lustrefsx/undef.h @@ -0,0 +1,1256 @@ + +/* enable libcfs CDEBUG, CWARN */ +#undef CDEBUG_ENABLED + +/* enable libcfs ENTRY/EXIT */ +#undef CDEBUG_ENTRY_EXIT + +/* enable page state tracking code */ +#undef CONFIG_DEBUG_PAGESTATE_TRACKING + +/* enable encryption for ldiskfs */ +#undef CONFIG_LDISKFS_FS_ENCRYPTION + +/* posix acls for ldiskfs */ +#undef CONFIG_LDISKFS_FS_POSIX_ACL + +/* enable rw access for ldiskfs */ +#undef CONFIG_LDISKFS_FS_RW + +/* fs security for ldiskfs */ +#undef CONFIG_LDISKFS_FS_SECURITY + +/* extened attributes for ldiskfs */ +#undef CONFIG_LDISKFS_FS_XATTR + +/* embedded llcrypt */ +#undef CONFIG_LL_ENCRYPTION + +/* enable invariant checking */ +#undef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK + +/* enable lu_ref reference tracking code */ +#undef CONFIG_LUSTRE_DEBUG_LU_REF + +/* Use the Pinger */ +#undef CONFIG_LUSTRE_FS_PINGER + +/* Enable POSIX acl */ +#undef CONFIG_LUSTRE_FS_POSIX_ACL + +/* name of ldiskfs debug 
program */ +#undef DEBUGFS + +/* name of ldiskfs dump program */ +#undef DUMPE2FS + +/* name of ldiskfs fsck program */ +#undef E2FSCK + +/* name of ldiskfs e2fsprogs package */ +#undef E2FSPROGS + +/* name of ldiskfs label program */ +#undef E2LABEL + +/* do data checksums */ +#undef ENABLE_CHECKSUM + +/* enable flock by default */ +#undef ENABLE_FLOCK + +/* filldir_t return type is bool or int */ +#undef FILLDIR_TYPE + +/* rhashtable_walk_init() has 3 args */ +#undef HAVE_3ARG_RHASHTABLE_WALK_INIT + +/* account_page_dirtied takes three arguments */ +#undef HAVE_ACCOUNT_PAGE_DIRTIED_3ARGS + +/* account_page_dirtied is exported */ +#undef HAVE_ACCOUNT_PAGE_DIRTIED_EXPORT + +/* 'get_acl' and 'set_acl' use dentry argument */ +#undef HAVE_ACL_WITH_DENTRY + +/* aes-sha2 is supported by krb5 */ +#undef HAVE_AES_SHA2_SUPPORT + +/* aio_complete defined */ +#undef HAVE_AIO_COMPLETE + +/* 'alloc_file_pseudo' exist */ +#undef HAVE_ALLOC_FILE_PSEUDO + +/* alloc_inode_sb() exists */ +#undef HAVE_ALLOC_INODE_SB + +/* struct address_space_operations() has migrate_folio() */ +#undef HAVE_AOPS_MIGRATE_FOLIO + +/* struct address_space_operations() has read_folio() */ +#undef HAVE_AOPS_READ_FOLIO + +/* struct address_space_operations() has release_folio() */ +#undef HAVE_AOPS_RELEASE_FOLIO + +/* Define to 1 if you have the header file. */ +#undef HAVE_ASM_TYPES_H + +/* backing_dev_info exist */ +#undef HAVE_BACKING_DEV_INFO + +/* BDI_CAP_MAP_COPY exist */ +#undef HAVE_BDI_CAP_MAP_COPY + +/* backing_dev_info has io_pages */ +#undef HAVE_BDI_IO_PAGES + +/* struct bio has bi_phys_segments member */ +#undef HAVE_BIO_BI_PHYS_SEGMENTS + +/* bio_endio takes only one argument */ +#undef HAVE_BIO_ENDIO_USES_ONE_ARG + +/* 'bio_integrity_enabled' is available */ +#undef HAVE_BIO_INTEGRITY_ENABLED + +/* kernel has bio_integrity_prep_fn */ +#undef HAVE_BIO_INTEGRITY_PREP_FN + +/* bio_integrity_prep_fn returns bool */ +#undef HAVE_BIO_INTEGRITY_PREP_FN_RETURNS_BOOL + +/* 'bio_set_dev' is available */ +#undef HAVE_BIO_SET_DEV + +/* bio_integrity_payload.bip_iter exist */ +#undef HAVE_BIP_ITER_BIO_INTEGRITY_PAYLOAD + +/* Linux bitmap can be allocated */ +#undef HAVE_BITMAP_ALLOC + +/* 'bi_bdev' is available */ +#undef HAVE_BI_BDEV + +/* struct bio has bi_opf */ +#undef HAVE_BI_OPF + +/* 'bi_status' is available */ +#undef HAVE_BI_STATUS + +/* kernel has struct blk_integrity_iter */ +#undef HAVE_BLK_INTEGRITY_ITER + +/* kernel hash_64() is broken */ +#undef HAVE_BROKEN_HASH_64 + +/* kernel has struct bvec_iter */ +#undef HAVE_BVEC_ITER + +/* if bvec_iter_all exists for multi-page bvec iternation */ +#undef HAVE_BVEC_ITER_ALL + +/* struct cache_detail has writers */ +#undef HAVE_CACHE_DETAIL_WRITERS + +/* if cache_detail->hash_lock is a spinlock */ +#undef HAVE_CACHE_HASH_SPINLOCK + +/* cache_head has hlist cache_list */ +#undef HAVE_CACHE_HEAD_HLIST + +/* crypto/internal/cipher.h is present */ +#undef HAVE_CIPHER_H + +/* kernel has clean_bdev_aliases */ +#undef HAVE_CLEAN_BDEV_ALIASES + +/* 'clear_and_wake_up_bit' is available */ +#undef HAVE_CLEAR_AND_WAKE_UP_BIT + +/* compat rdma found */ +#undef HAVE_COMPAT_RDMA + +/* copy_file_range() is supported */ +#undef HAVE_COPY_FILE_RANGE + +/* 'cpus_read_lock' exist */ +#undef HAVE_CPUS_READ_LOCK + +/* crypto_alloc_skcipher is defined */ +#undef HAVE_CRYPTO_ALLOC_SKCIPHER + +/* crypto hash helper functions are available */ +#undef HAVE_CRYPTO_HASH_HELPERS + +/* 'CRYPTO_MAX_ALG_NAME' is 128 */ +#undef HAVE_CRYPTO_MAX_ALG_NAME_128 + +/* crypto/sha2.h is present */ +#undef 
HAVE_CRYPTO_SHA2_HEADER + +/* current_time() has replaced CURRENT_TIME */ +#undef HAVE_CURRENT_TIME + +/* Have db_dirty_records list_t */ +#undef HAVE_DB_DIRTY_RECORDS_LIST + +/* default_file_splice_read is exported */ +#undef HAVE_DEFAULT_FILE_SPLICE_READ_EXPORT + +/* delete_from_page_cache is exported */ +#undef HAVE_DELETE_FROM_PAGE_CACHE + +/* dentry.d_child exist */ +#undef HAVE_DENTRY_D_CHILD + +/* list dentry.d_u.d_alias exist */ +#undef HAVE_DENTRY_D_U_D_ALIAS + +/* DES3 enctype is supported by krb5 */ +#undef HAVE_DES3_SUPPORT + +/* direct_IO has 2 arguments */ +#undef HAVE_DIRECTIO_2ARGS + +/* direct IO uses iov_iter */ +#undef HAVE_DIRECTIO_ITER + +/* address_spaace_operaions->dirty_folio() member exists */ +#undef HAVE_DIRTY_FOLIO + +/* dir_context exist */ +#undef HAVE_DIR_CONTEXT + +/* Define to 1 if you have the header file. */ +#undef HAVE_DLFCN_H + +/* Have dmu_object_alloc_dnsize in ZFS */ +#undef HAVE_DMU_OBJECT_ALLOC_DNSIZE + +/* Have dmu_objset_disown() with 3 args */ +#undef HAVE_DMU_OBJSET_DISOWN_3ARG + +/* Have dmu_objset_own() with 6 args */ +#undef HAVE_DMU_OBJSET_OWN_6ARG + +/* Have dmu_offset_next() exported */ +#undef HAVE_DMU_OFFSET_NEXT + +/* Have 6 argument dmu_pretch in ZFS */ +#undef HAVE_DMU_PREFETCH_6ARG + +/* Have dmu_read_by_dnode() in ZFS */ +#undef HAVE_DMU_READ_BY_DNODE + +/* Have dmu_tx_hold_write_by_dnode() in ZFS */ +#undef HAVE_DMU_TX_HOLD_WRITE_BY_DNODE + +/* Have dmu_tx_hold_zap_by_dnode() in ZFS */ +#undef HAVE_DMU_TX_HOLD_ZAP_BY_DNODE + +/* Have dmu_tx_mark_netfree */ +#undef HAVE_DMU_TX_MARK_NETFREE + +/* Have native dnode accounting in ZFS */ +#undef HAVE_DMU_USEROBJ_ACCOUNTING + +/* Have dmu_write_by_dnode() in ZFS */ +#undef HAVE_DMU_WRITE_BY_DNODE + +/* down_write_killable function exists */ +#undef HAVE_DOWN_WRITE_KILLABLE + +/* quotactl_ops.set_dqblk takes struct kqid */ +#undef HAVE_DQUOT_KQID + +/* quotactl_ops.set_dqblk takes struct qc_dqblk */ +#undef HAVE_DQUOT_QC_DQBLK + +/* dquot_transfer() has user_ns argument */ +#undef HAVE_DQUOT_TRANSFER_WITH_USER_NS + +/* Have dsl_pool_config_enter/exit in ZFS */ +#undef HAVE_DSL_POOL_CONFIG + +/* Have dsl_sync_task_do_nowait in ZFS */ +#undef HAVE_DSL_SYNC_TASK_DO_NOWAIT + +/* d_compare need 4 arguments */ +#undef HAVE_D_COMPARE_4ARGS + +/* d_compare need 5 arguments */ +#undef HAVE_D_COMPARE_5ARGS + +/* d_count exist */ +#undef HAVE_D_COUNT + +/* 'd_init' exists */ +#undef HAVE_D_INIT + +/* d_in_lookup is defined */ +#undef HAVE_D_IN_LOOKUP + +/* 'd_is_positive' is available */ +#undef HAVE_D_IS_POSITIVE + +/* Define to 1 if you have the header file. */ +#undef HAVE_ENDIAN_H + +/* ethtool_link_settings is defined */ +#undef HAVE_ETHTOOL_LINK_SETTINGS + +/* Define to 1 if you have the header file. 
*/ +#undef HAVE_EXT2FS_EXT2FS_H + +/* ext4_bread takes 4 arguments */ +#undef HAVE_EXT4_BREAD_4ARGS + +/* ext4_(inc|dec)_count() has 2 arguments */ +#undef HAVE_EXT4_INC_DEC_COUNT_2ARGS + +/* i_dquot is in ext4_inode_info */ +#undef HAVE_EXT4_INFO_DQUOT + +/* ext4_free_blocks do not require struct buffer_head */ +#undef HAVE_EXT_FREE_BLOCK_WITH_BUFFER_HEAD + +/* file handle and related syscalls are supported */ +#undef HAVE_FHANDLE_GLIBC_SUPPORT + +/* union is unnamed */ +#undef HAVE_FID2PATH_ANON_UNIONS + +/* filemap_get_folios_contig() is available */ +#undef HAVE_FILEMAP_GET_FOLIOS_CONTIG + +/* kernel has file_dentry */ +#undef HAVE_FILE_DENTRY + +/* file_operations.[read|write]_iter functions exist */ +#undef HAVE_FILE_OPERATIONS_READ_WRITE_ITER + +/* filldir_t needs struct dir_context as argument */ +#undef HAVE_FILLDIR_USE_CTX + +/* filldir_t needs struct dir_context and returns bool */ +#undef HAVE_FILLDIR_USE_CTX_RETURN_BOOL + +/* FMR pool API is available */ +#undef HAVE_FMR_POOL_API + +/* file_operations has iterate_shared */ +#undef HAVE_FOP_ITERATE_SHARED + +/* force_sig() has task parameter */ +#undef HAVE_FORCE_SIG_WITH_TASK + +/* 'struct fscrypt_digested_name' exists */ +#undef HAVE_FSCRYPT_DIGESTED_NAME + +/* embedded llcrypt uses llcrypt_dummy_context_enabled() */ +#undef HAVE_FSCRYPT_DUMMY_CONTEXT_ENABLED + +/* fscrypt_is_nokey_name() exists */ +#undef HAVE_FSCRYPT_IS_NOKEY_NAME + +/* full_name_hash need 3 arguments */ +#undef HAVE_FULL_NAME_HASH_3ARGS + +/* generic_write_sync has 2 arguments */ +#undef HAVE_GENERIC_WRITE_SYNC_2ARGS + +/* struct genl_dumpit_info has family field */ +#undef HAVE_GENL_DUMPIT_INFO + +/* Define to 1 if you have the `gethostbyname' function. */ +#undef HAVE_GETHOSTBYNAME + +/* 'get_acl' has a rcu argument */ +#undef HAVE_GET_ACL_RCU_ARG + +/* get_inode_usage function exists */ +#undef HAVE_GET_INODE_USAGE + +/* get_random_[u32|u64] are available */ +#undef HAVE_GET_RANDOM_U32_AND_U64 + +/* get_random_u32_below() is available */ +#undef HAVE_GET_RANDOM_U32_BELOW + +/* get_request_key_auth() is available */ +#undef HAVE_GET_REQUEST_KEY_AUTH + +/* get_user_pages takes 6 arguments */ +#undef HAVE_GET_USER_PAGES_6ARG + +/* get_user_pages takes gup_flags in arguments */ +#undef HAVE_GET_USER_PAGES_GUP_FLAGS + +/* glob_match() is available */ +#undef HAVE_GLOB + +/* grab_cache_page_write_begin() has flags argument */ +#undef HAVE_GRAB_CACHE_PAGE_WRITE_BEGIN_WITH_FLAGS + +/* struct group_info has member gid */ +#undef HAVE_GROUP_INFO_GID + +/* Define this is if you enable gss */ +#undef HAVE_GSS + +/* Define this if you enable gss keyring backend */ +#undef HAVE_GSS_KEYRING + +/* Define this if the Kerberos GSS library supports gss_krb5_ccache_name */ +#undef HAVE_GSS_KRB5_CCACHE_NAME + +/* '__rhashtable_insert_fast()' returns int */ +#undef HAVE_HASHTABLE_INSERT_FAST_RETURN_INT + +/* Define this if you have Heimdal Kerberos libraries */ +#undef HAVE_HEIMDAL + +/* hlist_add_after is available */ +#undef HAVE_HLIST_ADD_AFTER + +/* hotplug state machine is supported */ +#undef HAVE_HOTPLUG_STATE_MACHINE + +/* hypervisor_is_type function exists */ +#undef HAVE_HYPERVISOR_IS_TYPE + +/* ib_alloc_fast_reg_mr is defined */ +#undef HAVE_IB_ALLOC_FAST_REG_MR + +/* ib_alloc_pd has 2 arguments */ +#undef HAVE_IB_ALLOC_PD_2ARGS + +/* struct ib_cq_init_attr is used by ib_create_cq */ +#undef HAVE_IB_CQ_INIT_ATTR + +/* struct ib_device.attrs is defined */ +#undef HAVE_IB_DEVICE_ATTRS + +/* if struct ib_device_ops is defined */ +#undef HAVE_IB_DEVICE_OPS + +/* 
ib_get_dma_mr is defined */ +#undef HAVE_IB_GET_DMA_MR + +/* function ib_inc_rkey exist */ +#undef HAVE_IB_INC_RKEY + +/* ib_map_mr_sg exists */ +#undef HAVE_IB_MAP_MR_SG + +/* ib_map_mr_sg has 5 arguments */ +#undef HAVE_IB_MAP_MR_SG_5ARGS + +/* ib_post_send and ib_post_recv have const parameters */ +#undef HAVE_IB_POST_SEND_RECV_CONST + +/* struct ib_rdma_wr is defined */ +#undef HAVE_IB_RDMA_WR + +/* if ib_sg_dma_address wrapper exists */ +#undef HAVE_IB_SG_DMA_ADDRESS + +/* inode_operations .getattr member function can gather advance stats */ +#undef HAVE_INODEOPS_ENHANCED_GETATTR + +/* inode_lock is defined */ +#undef HAVE_INODE_LOCK + +/* inode times are using timespec64 */ +#undef HAVE_INODE_TIMESPEC64 + +/* blk_integrity.interval exist */ +#undef HAVE_INTERVAL_BLK_INTEGRITY + +/* blk_integrity.interval_exp exist */ +#undef HAVE_INTERVAL_EXP_BLK_INTEGRITY + +/* interval trees use rb_tree_cached */ +#undef HAVE_INTERVAL_TREE_CACHED + +/* Define to 1 if you have the header file. */ +#undef HAVE_INTTYPES_H + +/* address_spaace_operaions->invalidate_folio() member exists */ +#undef HAVE_INVALIDATE_FOLIO + +/* address_space invalidate_lock member exists */ +#undef HAVE_INVALIDATE_LOCK + +/* address_space_operations.invalidatepage needs 3 arguments */ +#undef HAVE_INVALIDATE_RANGE + +/* have in_compat_syscall */ +#undef HAVE_IN_COMPAT_SYSCALL + +/* 'in_dev_for_each_ifa_rtnl' is defined */ +#undef HAVE_IN_DEV_FOR_EACH_IFA_RTNL + +/* inode_operations->rename need flags as argument */ +#undef HAVE_IOPS_RENAME_WITH_FLAGS + +/* generic_readlink has been removed */ +#undef HAVE_IOP_GENERIC_READLINK + +/* have iop get_link */ +#undef HAVE_IOP_GET_LINK + +/* inode_operations has .set_acl member function */ +#undef HAVE_IOP_SET_ACL + +/* inode_operations has {get,set,remove}xattr members */ +#undef HAVE_IOP_XATTR + +/* iov_iter_get_pages_alloc2() is available */ +#undef HAVE_IOV_ITER_GET_PAGES_ALLOC2 + +/* if iov_iter has member iter_type */ +#undef HAVE_IOV_ITER_HAS_ITER_TYPE_MEMBER + +/* if iov_iter has member type */ +#undef HAVE_IOV_ITER_HAS_TYPE_MEMBER + +/* iov_iter_init handles directional tag */ +#undef HAVE_IOV_ITER_INIT_DIRECTION + +/* iov_iter_rw exist */ +#undef HAVE_IOV_ITER_RW + +/* iov_iter_truncate exists */ +#undef HAVE_IOV_ITER_TRUNCATE + +/* if iov_iter_type exists */ +#undef HAVE_IOV_ITER_TYPE + +/* is_root_inode defined */ +#undef HAVE_IS_ROOT_INODE + +/* 'iter_file_splice_write' exists */ +#undef HAVE_ITER_FILE_SPLICE_WRITE + +/* struct address_space has i_pages */ +#undef HAVE_I_PAGES + +/* if jbd2_journal_get_max_txn_bufs is available */ +#undef HAVE_JBD2_JOURNAL_GET_MAX_TXN_BUFS + +/* struct jbd2_journal_handle has h_total_credits member */ +#undef HAVE_JOURNAL_TOTAL_CREDITS + +/* kallsyms_lookup_name is exported by kernel */ +#undef HAVE_KALLSYMS_LOOKUP_NAME + +/* 'kernel_param_[un]lock' is available */ +#undef HAVE_KERNEL_PARAM_LOCK + +/* 'struct kernel_param_ops' is available */ +#undef HAVE_KERNEL_PARAM_OPS + +/* kernel_read() signature ends with loff_t *pos */ +#undef HAVE_KERNEL_READ_LAST_POSP + +/* kernel_setsockopt still in use */ +#undef HAVE_KERNEL_SETSOCKOPT + +/* 'getname' has two args */ +#undef HAVE_KERN_SOCK_GETNAME_2ARGS + +/* keyring_search has 4 args */ +#undef HAVE_KEYRING_SEARCH_4ARGS + +/* struct key_match_data exist */ +#undef HAVE_KEY_MATCH_DATA + +/* payload.data is an array */ +#undef HAVE_KEY_PAYLOAD_DATA_ARRAY + +/* key_type->instantiate has two args */ +#undef HAVE_KEY_TYPE_INSTANTIATE_2ARGS + +/* key.usage is of type refcount_t */ +#undef 
HAVE_KEY_USAGE_REFCOUNT + +/* kfree_sensitive() is available. */ +#undef HAVE_KFREE_SENSITIVE + +/* kiocb->ki_complete() has 2 arguments */ +#undef HAVE_KIOCB_COMPLETE_2ARGS + +/* ki_left exist */ +#undef HAVE_KIOCB_KI_LEFT + +/* ki_nbytes field exist */ +#undef HAVE_KI_NBYTES + +/* kmap_to_page is exported by the kernel */ +#undef HAVE_KMAP_TO_PAGE + +/* struct kobj_type has 'default_groups' member */ +#undef HAVE_KOBJ_TYPE_DEFAULT_GROUPS + +/* Define this if you have MIT Kerberos libraries */ +#undef HAVE_KRB5 + +/* Define this if the function krb5int_derive_key is available */ +#undef HAVE_KRB5INT_DERIVE_KEY + +/* Define this if the function krb5_derive_key is available */ +#undef HAVE_KRB5_DERIVE_KEY + +/* Define this if the function krb5_get_error_message is available */ +#undef HAVE_KRB5_GET_ERROR_MESSAGE + +/* Define this if the function krb5_get_init_creds_opt_set_addressless is + available */ +#undef HAVE_KRB5_GET_INIT_CREDS_OPT_SET_ADDRESSLESS + +/* kref_read() is available */ +#undef HAVE_KREF_READ + +/* kset_find_obj is exported by the kernel */ +#undef HAVE_KSET_FIND_OBJ + +/* kernel has kstrtobool_from_user */ +#undef HAVE_KSTRTOBOOL_FROM_USER + +/* kthread_worker found */ +#undef HAVE_KTHREAD_WORK + +/* ktime_add is available */ +#undef HAVE_KTIME_ADD + +/* ktime_after is available */ +#undef HAVE_KTIME_AFTER + +/* ktime_before is available */ +#undef HAVE_KTIME_BEFORE + +/* ktime_compare is available */ +#undef HAVE_KTIME_COMPARE + +/* 'ktime_get_real_seconds' is available */ +#undef HAVE_KTIME_GET_REAL_SECONDS + +/* 'ktime_get_real_ts64' is available */ +#undef HAVE_KTIME_GET_REAL_TS64 + +/* 'ktime_get_seconds' is available */ +#undef HAVE_KTIME_GET_SECONDS + +/* 'ktime_get_ts64' is available */ +#undef HAVE_KTIME_GET_TS64 + +/* 'ktime_ms_delta' is available */ +#undef HAVE_KTIME_MS_DELTA + +/* 'ktime_to_timespec64' is available */ +#undef HAVE_KTIME_TO_TIMESPEC64 + +/* ldiskfsfs_dirhash takes an inode argument */ +#undef HAVE_LDISKFSFS_GETHASH_INODE_ARG + +/* enable use of ldiskfsprogs package */ +#undef HAVE_LDISKFSPROGS + +/* EXT4_GET_BLOCKS_KEEP_SIZE exists */ +#undef HAVE_LDISKFS_GET_BLOCKS_KEEP_SIZE + +/* if ldiskfs_iget takes a flags argument */ +#undef HAVE_LDISKFS_IGET_WITH_FLAGS + +/* 'ext4_journal_ensure_credits' exists */ +#undef HAVE_LDISKFS_JOURNAL_ENSURE_CREDITS + +/* Enable ldiskfs osd */ +#undef HAVE_LDISKFS_OSD + +/* libefence support is requested */ +#undef HAVE_LIBEFENCE + +/* Define to 1 if you have the `keyutils' library (-lkeyutils). 
*/ +#undef HAVE_LIBKEYUTILS + +/* use libpthread for libcfs library */ +#undef HAVE_LIBPTHREAD + +/* readline library is available */ +#undef HAVE_LIBREADLINE + +/* linux/blk-integrity.h is present */ +#undef HAVE_LINUX_BLK_INTEGRITY_HEADER + +/* linux/fortify-string.h header available */ +#undef HAVE_LINUX_FORTIFY_STRING_HEADER + +/* linux/stdarg.h is present */ +#undef HAVE_LINUX_STDARG_HEADER + +/* list_cmp_func_t type is defined */ +#undef HAVE_LIST_CMP_FUNC_T + +/* lock_manager_operations has lm_compare_owner */ +#undef HAVE_LM_COMPARE_OWNER + +/* kernel has locks_lock_file_wait */ +#undef HAVE_LOCKS_LOCK_FILE_WAIT + +/* lock_page_memcg is defined */ +#undef HAVE_LOCK_PAGE_MEMCG + +/* lookup_user_key() is available */ +#undef HAVE_LOOKUP_USER_KEY + +/* Enable lru resize support */ +#undef HAVE_LRU_RESIZE_SUPPORT + +/* lsmcontext_init is available */ +#undef HAVE_LSMCONTEXT_INIT + +/* Define this if the Kerberos GSS library supports + gss_krb5_export_lucid_sec_context */ +#undef HAVE_LUCID_CONTEXT_SUPPORT + +/* Enable Lustre client crypto via embedded llcrypt */ +#undef HAVE_LUSTRE_CRYPTO + +/* enum mapping_flags has AS_EXITING flag */ +#undef HAVE_MAPPING_AS_EXITING_FLAG + +/* match_wildcard() is available */ +#undef HAVE_MATCH_WILDCARD + +/* memalloc_noreclaim_{save,restore}() is supported */ +#undef HAVE_MEMALLOC_RECLAIM + +/* Define to 1 if you have the header file. */ +#undef HAVE_MEMORY_H + +/* mmap_lock API is available. */ +#undef HAVE_MMAP_LOCK + +/* kernel module loading is possible */ +#undef HAVE_MODULE_LOADING_SUPPORT + +/* Define to 1 if you have the `name_to_handle_at' function. */ +#undef HAVE_NAME_TO_HANDLE_AT + +/* support native Linux client */ +#undef HAVE_NATIVE_LINUX_CLIENT + +/* Define to 1 if you have the header file. */ +#undef HAVE_NETDB_H + +/* struct genl_ops has 'start' callback */ +#undef HAVE_NETLINK_CALLBACK_START + +/* DEFINE_TIMER uses only 2 arguements */ +#undef HAVE_NEW_DEFINE_TIMER + +/* 'kernel_write' aligns with read/write helpers */ +#undef HAVE_NEW_KERNEL_WRITE + +/* libnl3 supports nla_get_s32 */ +#undef HAVE_NLA_GET_S32 + +/* libnl3 supports nla_get_s64 */ +#undef HAVE_NLA_GET_S64 + +/* 'nla_strdup' is available */ +#undef HAVE_NLA_STRDUP + +/* 'nla_strlcpy' is available */ +#undef HAVE_NLA_STRLCPY + +/* netlink_ext_ack is handled for Netlink dump handlers */ +#undef HAVE_NL_DUMP_WITH_EXT_ACK + +/* netlink_ext_ack is an argument to nla_parse type function */ +#undef HAVE_NL_PARSE_WITH_EXT_ACK + +/* no_llseek() is available */ +#undef HAVE_NO_LLSEEK + +/* NR_UNSTABLE_NFS is still in use. 
*/ +#undef HAVE_NR_UNSTABLE_NFS + +/* ns_to_timespec64() is available */ +#undef HAVE_NS_TO_TIMESPEC64 + +/* with oldsize */ +#undef HAVE_OLDSIZE_TRUNCATE_PAGECACHE + +/* OpenSSL EVP_PKEY_get_params */ +#undef HAVE_OPENSSL_EVP_PKEY + +/* openssl-devel is present */ +#undef HAVE_OPENSSL_GETSEPOL + +/* OpenSSL HMAC functions needed for SSK */ +#undef HAVE_OPENSSL_SSK + +/* if Oracle OFED Extensions are enabled */ +#undef HAVE_ORACLE_OFED_EXTENSIONS + +/* 'pagevec_init' takes one parameter */ +#undef HAVE_PAGEVEC_INIT_ONE_PARAM + +/* linux/panic_notifier.h is present */ +#undef HAVE_PANIC_NOTIFIER_H + +/* 'param_set_uint_minmax' is available */ +#undef HAVE_PARAM_SET_UINT_MINMAX + +/* percpu_counter_init uses GFP_* flag */ +#undef HAVE_PERCPU_COUNTER_INIT_GFP_FLAG + +/* 'struct nsproxy' has 'pid_ns_for_children' */ +#undef HAVE_PID_NS_FOR_CHILDREN + +/* 'posix_acl_update_mode' is available */ +#undef HAVE_POSIX_ACL_UPDATE_MODE + +/* posix_acl_valid takes struct user_namespace */ +#undef HAVE_POSIX_ACL_VALID_USER_NS + +/* 'prepare_to_wait_event' is available */ +#undef HAVE_PREPARE_TO_WAIT_EVENT + +/* processor.h is present */ +#undef HAVE_PROCESSOR_H + +/* struct proc_ops exists */ +#undef HAVE_PROC_OPS + +/* get_projid function exists */ +#undef HAVE_PROJECT_QUOTA + +/* 'PTR_ERR_OR_ZERO' exist */ +#undef HAVE_PTR_ERR_OR_ZERO + +/* If available, contains the Python version number currently in use. */ +#undef HAVE_PYTHON + +/* radix_tree_tag_set exists */ +#undef HAVE_RADIX_TREE_TAG_SET + +/* rdma_connect_locked is defined */ +#undef HAVE_RDMA_CONNECT_LOCKED + +/* rdma_create_id wants 4 args */ +#undef HAVE_RDMA_CREATE_ID_4ARG + +/* rdma_create_id wants 5 args */ +#undef HAVE_RDMA_CREATE_ID_5ARG + +/* rdma_reject has 4 arguments */ +#undef HAVE_RDMA_REJECT_4ARGS + +/* read_cache_page() filler_t needs struct file */ +#undef HAVE_READ_CACHE_PAGE_WANTS_FILE + +/* refcount_t is supported */ +#undef HAVE_REFCOUNT_T + +/* register_shrinker() returns status */ +#undef HAVE_REGISTER_SHRINKER_FORMAT_NAMED + +/* register_shrinker() returns status */ +#undef HAVE_REGISTER_SHRINKER_RET + +/* rhashtable_lookup() is available */ +#undef HAVE_RHASHTABLE_LOOKUP + +/* rhashtable_lookup_get_insert_fast() is available */ +#undef HAVE_RHASHTABLE_LOOKUP_GET_INSERT_FAST + +/* rhashtable_replace_fast() is available */ +#undef HAVE_RHASHTABLE_REPLACE + +/* rhashtable_walk_enter() is available */ +#undef HAVE_RHASHTABLE_WALK_ENTER + +/* struct rhltable exist */ +#undef HAVE_RHLTABLE + +/* rht_bucket_var() is available */ +#undef HAVE_RHT_BUCKET_VAR + +/* save_stack_trace_tsk is exported */ +#undef HAVE_SAVE_STACK_TRACE_TSK + +/* Have sa_spill_alloc in ZFS */ +#undef HAVE_SA_SPILL_ALLOC + +/* linux/sched header directory exist */ +#undef HAVE_SCHED_HEADERS + +/* security_dentry_init_security needs lsmcontext */ +#undef HAVE_SECURITY_DENTRY_INIT_SECURTY_WITH_CTX + +/* security_dentry_init_security() returns xattr name */ +#undef HAVE_SECURITY_DENTRY_INIT_WITH_XATTR_NAME_ARG + +/* security_release_secctx has 1 arg. */ +#undef HAVE_SEC_RELEASE_SECCTX_1ARG + +/* support for selinux */ +#undef HAVE_SELINUX + +/* Define to 1 if you have the header file. 
*/ +#undef HAVE_SELINUX_SELINUX_H + +/* support server */ +#undef HAVE_SERVER_SUPPORT + +/* Define this if the Kerberos GSS library supports + gss_krb5_set_allowable_enctypes */ +#undef HAVE_SET_ALLOWABLE_ENCTYPES + +/* shrinker has count_objects member */ +#undef HAVE_SHRINKER_COUNT + +/* sk_data_ready uses only one argument */ +#undef HAVE_SK_DATA_READY_ONE_ARG + +/* sock_create_kern use net as first parameter */ +#undef HAVE_SOCK_CREATE_KERN_USE_NET + +/* Have spa_maxblocksize in ZFS */ +#undef HAVE_SPA_MAXBLOCKSIZE + +/* struct stacktrace_ops exists */ +#undef HAVE_STACKTRACE_OPS + +/* Define to 1 if you have the `statx' function. */ +#undef HAVE_STATX + +/* Define to 1 if you have the header file. */ +#undef HAVE_STDINT_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_STDLIB_H + +/* stringhash.h is present */ +#undef HAVE_STRINGHASH + +/* Define to 1 if you have the header file. */ +#undef HAVE_STRINGS_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_STRING_H + +/* Define to 1 if you have the `strnlen' function. */ +#undef HAVE_STRNLEN + +/* kernel strscpy is available */ +#undef HAVE_STRSCPY + +/* struct posix_acl_xattr_{header,entry} defined */ +#undef HAVE_STRUCT_POSIX_ACL_XATTR + +/* submit_bio takes two arguments */ +#undef HAVE_SUBMIT_BIO_2ARGS + +/* 'super_setup_bdi_name' is available */ +#undef HAVE_SUPER_SETUP_BDI_NAME + +/* symlink inode operations need struct nameidata argument */ +#undef HAVE_SYMLINK_OPS_USE_NAMEIDATA + +/* new_sync_[read|write] is exported by the kernel */ +#undef HAVE_SYNC_READ_WRITE + +/* Define to 1 if you have . */ +#undef HAVE_SYS_QUOTA_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_STAT_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_TYPES_H + +/* 's_uuid' is an uuid_t */ +#undef HAVE_S_UUID_AS_UUID_T + +/* task_is_running() is defined */ +#undef HAVE_TASK_IS_RUNNING + +/* 'tcp_sock_set_keepcnt()' exists */ +#undef HAVE_TCP_SOCK_SET_KEEPCNT + +/* 'tcp_sock_set_keepidle()' exists */ +#undef HAVE_TCP_SOCK_SET_KEEPIDLE + +/* 'tcp_sock_set_keepintvl()' exists */ +#undef HAVE_TCP_SOCK_SET_KEEPINTVL + +/* 'tcp_sock_set_nodelay()' exists */ +#undef HAVE_TCP_SOCK_SET_NODELAY + +/* 'tcp_sock_set_quickack()' exists */ +#undef HAVE_TCP_SOCK_SET_QUICKACK + +/* timer_setup has replaced setup_timer */ +#undef HAVE_TIMER_SETUP + +/* 'struct timespec64' is available */ +#undef HAVE_TIMESPEC64 + +/* 'timespec64_sub' is available */ +#undef HAVE_TIMESPEC64_SUB + +/* 'timespec64_to_ktime' is available */ +#undef HAVE_TIMESPEC64_TO_KTIME + +/* topology_sibling_cpumask is available */ +#undef HAVE_TOPOLOGY_SIBLING_CPUMASK + +/* if totalram_pages is a function */ +#undef HAVE_TOTALRAM_PAGES_AS_FUNC + +/* kernel has truncate_inode_pages_final */ +#undef HAVE_TRUNCATE_INODE_PAGES_FINAL + +/* if MS_RDONLY was moved to uapi/linux/mount.h */ +#undef HAVE_UAPI_LINUX_MOUNT_H + +/* Define to 1 if you have the header file. 
*/ +#undef HAVE_UNISTD_H + +/* 'inode_operations' members have user namespace argument */ +#undef HAVE_USER_NAMESPACE_ARG + +/* 'enum nlmsgerr_attrs' exists */ +#undef HAVE_USRSPC_NLMSGERR + +/* RDMA_PS_TCP exists */ +#undef HAVE_USRSPC_RDMA_PS_TCP + +/* 'uuid_t' exist */ +#undef HAVE_UUID_T + +/* kernel has vfs_rename with 5 args */ +#undef HAVE_VFS_RENAME_5ARGS + +/* kernel has vfs_rename with 6 args */ +#undef HAVE_VFS_RENAME_6ARGS + +/* '__vfs_setxattr' is available */ +#undef HAVE_VFS_SETXATTR + +/* kernel has vfs_unlink with 3 args */ +#undef HAVE_VFS_UNLINK_3ARGS + +/* __vmalloc only takes 2 args. */ +#undef HAVE_VMALLOC_2ARGS + +/* virtual_address has been replaced by address field */ +#undef HAVE_VM_FAULT_ADDRESS + +/* if VM_FAULT_RETRY is defined */ +#undef HAVE_VM_FAULT_RETRY + +/* if vm_fault_t type exists */ +#undef HAVE_VM_FAULT_T + +/* 'struct vm_operations' remove struct vm_area_struct argument */ +#undef HAVE_VM_OPS_USE_VM_FAULT_ONLY + +/* wait_bit.h is present */ +#undef HAVE_WAIT_BIT_HEADER_H + +/* if struct wait_bit_queue_entry exists */ +#undef HAVE_WAIT_BIT_QUEUE_ENTRY + +/* 'wait_queue_entry_t' is available */ +#undef HAVE_WAIT_QUEUE_ENTRY + +/* linux wait_queue_head_t list_head is name head */ +#undef HAVE_WAIT_QUEUE_ENTRY_LIST + +/* 'wait_var_event' is available */ +#undef HAVE_WAIT_VAR_EVENT + +/* 'wait_woken, is available' */ +#undef HAVE_WAIT_WOKEN + +/* kernel Xarray implementation lacks 'xa_is_value' */ +#undef HAVE_XARRAY_SUPPORT + +/* needs inode parameter */ +#undef HAVE_XATTR_HANDLER_INODE_PARAM + +/* xattr_handler has a name member */ +#undef HAVE_XATTR_HANDLER_NAME + +/* handler pointer is parameter */ +#undef HAVE_XATTR_HANDLER_SIMPLIFIED + +/* Have zap_add_by_dnode() in ZFS */ +#undef HAVE_ZAP_ADD_BY_DNODE + +/* Have zap_lookup_by_dnode() in ZFS */ +#undef HAVE_ZAP_LOOKUP_BY_DNODE + +/* Have zap_remove_by_dnode() in ZFS */ +#undef HAVE_ZAP_REMOVE_ADD_BY_DNODE + +/* Have inode_timespec_t */ +#undef HAVE_ZFS_INODE_TIMESPEC + +/* Have multihost protection in ZFS */ +#undef HAVE_ZFS_MULTIHOST + +/* Enable zfs osd */ +#undef HAVE_ZFS_OSD + +/* Have zfs_refcount_add */ +#undef HAVE_ZFS_REFCOUNT_ADD + +/* Have zfs_refcount.h */ +#undef HAVE_ZFS_REFCOUNT_HEADER + +/* struct bio has __bi_cnt */ +#undef HAVE___BI_CNT + +/* if __ldiskfs_find_entry is available */ +#undef HAVE___LDISKFS_FIND_ENTRY + +/* function pde_data() available */ +#undef HAVE_pde_data + +/* ext4_journal_start takes 3 arguments */ +#undef JOURNAL_START_HAS_3ARGS + +/* Define this as the Kerberos version number */ +#undef KRB5_VERSION + +/* enable libcfs LASSERT, LASSERTF */ +#undef LIBCFS_DEBUG + +/* use dumplog on panic */ +#undef LNET_DUMP_ON_PANIC + +/* Define to the sub-directory where libtool stores uninstalled libraries. 
*/ +#undef LT_OBJDIR + +/* Fourth number in the Lustre version */ +#undef LUSTRE_FIX + +/* First number in the Lustre version */ +#undef LUSTRE_MAJOR + +/* Second number in the Lustre version */ +#undef LUSTRE_MINOR + +/* Third number in the Lustre version */ +#undef LUSTRE_PATCH + +/* A copy of PACKAGE_VERSION */ +#undef LUSTRE_VERSION_STRING + +/* maximum number of MDS threads */ +#undef MDS_MAX_THREADS + +/* Report minimum OST free space */ +#undef MIN_DF + +/* name of ldiskfs mkfs program */ +#undef MKE2FS + +/* 'ktime_get_ns' is not available */ +#undef NEED_KTIME_GET_NS + +/* 'ktime_get_real_ns' is not available */ +#undef NEED_KTIME_GET_REAL_NS + +/* lockdep_is_held() argument is const */ +#undef NEED_LOCKDEP_IS_HELD_DISCARD_CONST + +/* Name of package */ +#undef PACKAGE + +/* Define to the address where bug reports for this package should be sent. */ +#undef PACKAGE_BUGREPORT + +/* Define to the full name of this package. */ +#undef PACKAGE_NAME + +/* Define to the full name and version of this package. */ +#undef PACKAGE_STRING + +/* Define to the one symbol short name of this package. */ +#undef PACKAGE_TARNAME + +/* Define to the home page for this package. */ +#undef PACKAGE_URL + +/* Define to the version of this package. */ +#undef PACKAGE_VERSION + +/* name of parallel fsck program */ +#undef PFSCK + +/* enable randomly alloc failure */ +#undef RANDOM_FAIL_ALLOC + +/* The size of `unsigned long long', as computed by sizeof. */ +#undef SIZEOF_UNSIGNED_LONG_LONG + +/* use tunable backoff TCP */ +#undef SOCKNAL_BACKOFF + +/* tunable backoff TCP in ms */ +#undef SOCKNAL_BACKOFF_MS + +/* 'struct stacktrace_ops' address function returns an int */ +#undef STACKTRACE_OPS_ADDRESS_RETURN_INT + +/* Define to 1 if you have the ANSI C header files. */ +#undef STDC_HEADERS + +/* name of ldiskfs tune program */ +#undef TUNE2FS + +/* Define this if the private function, gss_krb5_cache_name, must be used to + tell the Kerberos library which credentials cache to use. 
Otherwise, this + is done by setting the KRB5CCNAME environment variable */ +#undef USE_GSS_KRB5_CCACHE_NAME + +/* Write when Checking Health */ +#undef USE_HEALTH_CHECK_WRITE + +/* Version number of package */ +#undef VERSION + +/* vfs_setxattr() value argument is non-const */ +#undef VFS_SETXATTR_VALUE + +/* zfs fix version */ +#undef ZFS_FIX + +/* zfs major version */ +#undef ZFS_MAJOR + +/* zfs minor version */ +#undef ZFS_MINOR + +/* zfs patch version */ +#undef ZFS_PATCH + +/* get_random_u32() is not available, use prandom_u32 */ +#undef get_random_u32 + +/* get_random_u32_below() is not available */ +#undef get_random_u32_below + +/* function pde_data() unavailable */ +#undef pde_data From e35ba4e847e24ed5f2bd1cae5fa6e53acfea29c4 Mon Sep 17 00:00:00 2001 From: Evgeny Ostrovsky Date: Tue, 19 Mar 2024 12:59:59 +0000 Subject: [PATCH 164/175] AL2023-6.1-Update-ena-driver-to-2.12.0g Signed-off-by: Evgeny Ostrovsky --- drivers/amazon/net/ena/Makefile | 2 +- drivers/amazon/net/ena/config.h | 2 + drivers/amazon/net/ena/ena_admin_defs.h | 16 +- drivers/amazon/net/ena/ena_com.c | 431 ++++++++++-------------- drivers/amazon/net/ena/ena_com.h | 12 + drivers/amazon/net/ena/ena_eth_com.c | 65 ++-- drivers/amazon/net/ena/ena_eth_com.h | 31 +- drivers/amazon/net/ena/ena_ethtool.c | 51 ++- drivers/amazon/net/ena/ena_netdev.c | 278 ++++++++------- drivers/amazon/net/ena/ena_netdev.h | 22 +- drivers/amazon/net/ena/ena_regs_defs.h | 1 + drivers/amazon/net/ena/ena_xdp.c | 141 ++++---- drivers/amazon/net/ena/ena_xdp.h | 39 +-- drivers/amazon/net/ena/kcompat.h | 97 ++++-- 14 files changed, 587 insertions(+), 601 deletions(-) diff --git a/drivers/amazon/net/ena/Makefile b/drivers/amazon/net/ena/Makefile index e1118db6dc823..e60b229b694bd 100644 --- a/drivers/amazon/net/ena/Makefile +++ b/drivers/amazon/net/ena/Makefile @@ -1,7 +1,7 @@ # # Makefile for the Elastic Network Adapter (ENA) device drivers. # ENA Source is: https://github.com/amzn/amzn-drivers. -# Current ENA source is based on ena_linux_2.11.1 tag. +# Current ENA source is based on ena_linux_2.12.0 tag. 
# obj-$(CONFIG_AMAZON_ENA_ETHERNET) += ena.o diff --git a/drivers/amazon/net/ena/config.h b/drivers/amazon/net/ena/config.h index b4c9875108fb4..994cb09b6b989 100644 --- a/drivers/amazon/net/ena/config.h +++ b/drivers/amazon/net/ena/config.h @@ -2,4 +2,6 @@ #define _ENA_CONFIG_H_ #define ENA_HAVE_PCI_DEV_ID 1 #define ENA_HAVE_XDP_DO_FLUSH 1 +#define ENA_HAVE_CPUMASK_LOCAL_SPREAD 1 +#define ENA_HAVE_UPDATE_AFFINITY_HINT 1 #endif /* _ENA_CONFIG_H_ */ diff --git a/drivers/amazon/net/ena/ena_admin_defs.h b/drivers/amazon/net/ena/ena_admin_defs.h index daf2961af2b75..db52cae4b9ded 100644 --- a/drivers/amazon/net/ena/ena_admin_defs.h +++ b/drivers/amazon/net/ena/ena_admin_defs.h @@ -933,19 +933,8 @@ struct ena_admin_feature_rss_flow_hash_input { u16 enabled_input_sort; }; -enum ena_admin_os_type { - ENA_ADMIN_OS_LINUX = 1, - ENA_ADMIN_OS_WIN = 2, - ENA_ADMIN_OS_DPDK = 3, - ENA_ADMIN_OS_FREEBSD = 4, - ENA_ADMIN_OS_IPXE = 5, - ENA_ADMIN_OS_ESXI = 6, - ENA_ADMIN_OS_MACOS = 7, - ENA_ADMIN_OS_GROUPS_NUM = 7, -}; - struct ena_admin_host_info { - /* defined in enum ena_admin_os_type */ + /* Host OS type defined as ENA_ADMIN_OS_* */ u32 os_type; /* os distribution string format */ @@ -1224,7 +1213,8 @@ enum ena_admin_aenq_group { ENA_ADMIN_KEEP_ALIVE = 4, ENA_ADMIN_REFRESH_CAPABILITIES = 5, ENA_ADMIN_CONF_NOTIFICATIONS = 6, - ENA_ADMIN_AENQ_GROUPS_NUM = 7, + ENA_ADMIN_DEVICE_REQUEST_RESET = 7, + ENA_ADMIN_AENQ_GROUPS_NUM = 8, }; enum ena_admin_aenq_notification_syndrome { diff --git a/drivers/amazon/net/ena/ena_com.c b/drivers/amazon/net/ena/ena_com.c index fdc46ff1c2400..d53bde802ea4d 100644 --- a/drivers/amazon/net/ena/ena_com.c +++ b/drivers/amazon/net/ena/ena_com.c @@ -80,8 +80,7 @@ static int ena_com_mem_addr_set(struct ena_com_dev *ena_dev, struct ena_common_mem_addr *ena_addr, dma_addr_t addr) { - if (unlikely((addr & GENMASK_ULL(ena_dev->dma_addr_bits - 1, 0)) != - addr)) { + if (unlikely((addr & GENMASK_ULL(ena_dev->dma_addr_bits - 1, 0)) != addr)) { netdev_err(ena_dev->net_device, "DMA address has more bits that the device supports\n"); return -EINVAL; @@ -99,8 +98,7 @@ static int ena_com_admin_init_sq(struct ena_com_admin_queue *admin_queue) struct ena_com_admin_sq *sq = &admin_queue->sq; u16 size = ADMIN_SQ_SIZE(admin_queue->q_depth); - sq->entries = dma_zalloc_coherent(admin_queue->q_dmadev, size, - &sq->dma_addr, GFP_KERNEL); + sq->entries = dma_zalloc_coherent(admin_queue->q_dmadev, size, &sq->dma_addr, GFP_KERNEL); if (unlikely(!sq->entries)) { netdev_err(ena_dev->net_device, "Memory allocation failed\n"); @@ -122,8 +120,7 @@ static int ena_com_admin_init_cq(struct ena_com_admin_queue *admin_queue) struct ena_com_admin_cq *cq = &admin_queue->cq; u16 size = ADMIN_CQ_SIZE(admin_queue->q_depth); - cq->entries = dma_zalloc_coherent(admin_queue->q_dmadev, size, - &cq->dma_addr, GFP_KERNEL); + cq->entries = dma_zalloc_coherent(admin_queue->q_dmadev, size, &cq->dma_addr, GFP_KERNEL); if (unlikely(!cq->entries)) { netdev_err(ena_dev->net_device, "Memory allocation failed\n"); @@ -145,8 +142,7 @@ static int ena_com_admin_init_aenq(struct ena_com_dev *ena_dev, ena_dev->aenq.q_depth = ENA_ASYNC_QUEUE_DEPTH; size = ADMIN_AENQ_SIZE(ENA_ASYNC_QUEUE_DEPTH); - aenq->entries = dma_zalloc_coherent(ena_dev->dmadev, size, - &aenq->dma_addr, GFP_KERNEL); + aenq->entries = dma_zalloc_coherent(ena_dev->dmadev, size, &aenq->dma_addr, GFP_KERNEL); if (unlikely(!aenq->entries)) { netdev_err(ena_dev->net_device, "Memory allocation failed\n"); @@ -164,14 +160,13 @@ static int ena_com_admin_init_aenq(struct 
ena_com_dev *ena_dev, aenq_caps = 0; aenq_caps |= ena_dev->aenq.q_depth & ENA_REGS_AENQ_CAPS_AENQ_DEPTH_MASK; - aenq_caps |= (sizeof(struct ena_admin_aenq_entry) - << ENA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_SHIFT) & - ENA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_MASK; + aenq_caps |= + (sizeof(struct ena_admin_aenq_entry) << ENA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_SHIFT) & + ENA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_MASK; writel(aenq_caps, ena_dev->reg_bar + ENA_REGS_AENQ_CAPS_OFF); if (unlikely(!aenq_handlers)) { - netdev_err(ena_dev->net_device, - "AENQ handlers pointer is NULL\n"); + netdev_err(ena_dev->net_device, "AENQ handlers pointer is NULL\n"); return -EINVAL; } @@ -199,14 +194,12 @@ static struct ena_comp_ctx *get_comp_ctxt(struct ena_com_admin_queue *admin_queu } if (unlikely(!admin_queue->comp_ctx)) { - netdev_err(admin_queue->ena_dev->net_device, - "Completion context is NULL\n"); + netdev_err(admin_queue->ena_dev->net_device, "Completion context is NULL\n"); return NULL; } if (unlikely(admin_queue->comp_ctx[command_id].occupied && capture)) { - netdev_err(admin_queue->ena_dev->net_device, - "Completion context is occupied\n"); + netdev_err(admin_queue->ena_dev->net_device, "Completion context is occupied\n"); return NULL; } @@ -236,8 +229,7 @@ static struct ena_comp_ctx *__ena_com_submit_admin_cmd(struct ena_com_admin_queu /* In case of queue FULL */ cnt = (u16)atomic_read(&admin_queue->outstanding_cmds); if (unlikely(cnt >= admin_queue->q_depth)) { - netdev_dbg(admin_queue->ena_dev->net_device, - "Admin queue is full.\n"); + netdev_dbg(admin_queue->ena_dev->net_device, "Admin queue is full.\n"); admin_queue->stats.out_of_space++; return ERR_PTR(-ENOSPC); } @@ -284,8 +276,7 @@ static int ena_com_init_comp_ctxt(struct ena_com_admin_queue *admin_queue) struct ena_comp_ctx *comp_ctx; u16 i; - admin_queue->comp_ctx = - devm_kzalloc(admin_queue->q_dmadev, size, GFP_KERNEL); + admin_queue->comp_ctx = devm_kzalloc(admin_queue->q_dmadev, size, GFP_KERNEL); if (unlikely(!admin_queue->comp_ctx)) { netdev_err(ena_dev->net_device, "Memory allocation failed\n"); return -ENOMEM; @@ -347,20 +338,17 @@ static int ena_com_init_io_sq(struct ena_com_dev *ena_dev, dev_node = dev_to_node(ena_dev->dmadev); set_dev_node(ena_dev->dmadev, ctx->numa_node); io_sq->desc_addr.virt_addr = - dma_zalloc_coherent(ena_dev->dmadev, size, - &io_sq->desc_addr.phys_addr, + dma_zalloc_coherent(ena_dev->dmadev, size, &io_sq->desc_addr.phys_addr, GFP_KERNEL); set_dev_node(ena_dev->dmadev, dev_node); if (!io_sq->desc_addr.virt_addr) { io_sq->desc_addr.virt_addr = dma_zalloc_coherent(ena_dev->dmadev, size, - &io_sq->desc_addr.phys_addr, - GFP_KERNEL); + &io_sq->desc_addr.phys_addr, GFP_KERNEL); } if (unlikely(!io_sq->desc_addr.virt_addr)) { - netdev_err(ena_dev->net_device, - "Memory allocation failed\n"); + netdev_err(ena_dev->net_device, "Memory allocation failed\n"); return -ENOMEM; } } @@ -378,16 +366,14 @@ static int ena_com_init_io_sq(struct ena_com_dev *ena_dev, dev_node = dev_to_node(ena_dev->dmadev); set_dev_node(ena_dev->dmadev, ctx->numa_node); - io_sq->bounce_buf_ctrl.base_buffer = - devm_kzalloc(ena_dev->dmadev, size, GFP_KERNEL); + io_sq->bounce_buf_ctrl.base_buffer = devm_kzalloc(ena_dev->dmadev, size, GFP_KERNEL); set_dev_node(ena_dev->dmadev, dev_node); if (!io_sq->bounce_buf_ctrl.base_buffer) io_sq->bounce_buf_ctrl.base_buffer = devm_kzalloc(ena_dev->dmadev, size, GFP_KERNEL); if (unlikely(!io_sq->bounce_buf_ctrl.base_buffer)) { - netdev_err(ena_dev->net_device, - "Bounce buffer memory allocation failed\n"); + 
netdev_err(ena_dev->net_device, "Bounce buffer memory allocation failed\n"); return -ENOMEM; } @@ -437,13 +423,11 @@ static int ena_com_init_io_cq(struct ena_com_dev *ena_dev, prev_node = dev_to_node(ena_dev->dmadev); set_dev_node(ena_dev->dmadev, ctx->numa_node); io_cq->cdesc_addr.virt_addr = - dma_zalloc_coherent(ena_dev->dmadev, size, - &io_cq->cdesc_addr.phys_addr, GFP_KERNEL); + dma_zalloc_coherent(ena_dev->dmadev, size, &io_cq->cdesc_addr.phys_addr, GFP_KERNEL); set_dev_node(ena_dev->dmadev, prev_node); if (!io_cq->cdesc_addr.virt_addr) { io_cq->cdesc_addr.virt_addr = - dma_zalloc_coherent(ena_dev->dmadev, size, - &io_cq->cdesc_addr.phys_addr, + dma_zalloc_coherent(ena_dev->dmadev, size, &io_cq->cdesc_addr.phys_addr, GFP_KERNEL); } @@ -529,8 +513,8 @@ static int ena_com_comp_status_to_errno(struct ena_com_admin_queue *admin_queue, u8 comp_status) { if (unlikely(comp_status != 0)) - netdev_err(admin_queue->ena_dev->net_device, - "Admin command failed[%u]\n", comp_status); + netdev_err(admin_queue->ena_dev->net_device, "Admin command failed[%u]\n", + comp_status); switch (comp_status) { case ENA_ADMIN_SUCCESS: @@ -595,8 +579,7 @@ static int ena_com_wait_and_process_admin_cq_polling(struct ena_comp_ctx *comp_c } if (unlikely(comp_ctx->status == ENA_CMD_ABORTED)) { - netdev_err(admin_queue->ena_dev->net_device, - "Command was aborted\n"); + netdev_err(admin_queue->ena_dev->net_device, "Command was aborted\n"); spin_lock_irqsave(&admin_queue->q_lock, flags); admin_queue->stats.aborted_cmd++; spin_unlock_irqrestore(&admin_queue->q_lock, flags); @@ -604,9 +587,6 @@ static int ena_com_wait_and_process_admin_cq_polling(struct ena_comp_ctx *comp_c goto err; } - WARN(comp_ctx->status != ENA_CMD_COMPLETED, "Invalid comp status %d\n", - comp_ctx->status); - ret = ena_com_comp_status_to_errno(admin_queue, comp_ctx->comp_status); err: comp_ctxt_release(admin_queue, comp_ctx); @@ -649,8 +629,7 @@ static int ena_com_set_llq(struct ena_com_dev *ena_dev) sizeof(resp)); if (unlikely(ret)) - netdev_err(ena_dev->net_device, - "Failed to set LLQ configurations: %d\n", ret); + netdev_err(ena_dev->net_device, "Failed to set LLQ configurations: %d\n", ret); return ret; } @@ -673,8 +652,7 @@ static int ena_com_config_llq_info(struct ena_com_dev *ena_dev, llq_default_cfg->llq_header_location; } else { netdev_err(ena_dev->net_device, - "Invalid header location control, supported: 0x%x\n", - supported_feat); + "Invalid header location control, supported: 0x%x\n", supported_feat); return -EINVAL; } @@ -696,8 +674,8 @@ static int ena_com_config_llq_info(struct ena_com_dev *ena_dev, netdev_err(ena_dev->net_device, "Default llq stride ctrl is not supported, performing fallback, default: 0x%x, supported: 0x%x, used: 0x%x\n", - llq_default_cfg->llq_stride_ctrl, - supported_feat, llq_info->desc_stride_ctrl); + llq_default_cfg->llq_stride_ctrl, supported_feat, + llq_info->desc_stride_ctrl); } } else { llq_info->desc_stride_ctrl = 0; @@ -719,8 +697,7 @@ static int ena_com_config_llq_info(struct ena_com_dev *ena_dev, llq_info->desc_list_entry_size = 256; } else { netdev_err(ena_dev->net_device, - "Invalid entry_size_ctrl, supported: 0x%x\n", - supported_feat); + "Invalid entry_size_ctrl, supported: 0x%x\n", supported_feat); return -EINVAL; } @@ -765,8 +742,8 @@ static int ena_com_config_llq_info(struct ena_com_dev *ena_dev, netdev_err(ena_dev->net_device, "Default llq num descs before header is not supported, performing fallback, default: 0x%x, supported: 0x%x, used: 0x%x\n", - llq_default_cfg->llq_num_decs_before_header, 
- supported_feat, llq_info->descs_num_before_header); + llq_default_cfg->llq_num_decs_before_header, supported_feat, + llq_info->descs_num_before_header); } /* Check for accelerated queue supported */ llq_accel_mode_get = llq_features->accel_mode.u.get; @@ -782,8 +759,7 @@ static int ena_com_config_llq_info(struct ena_com_dev *ena_dev, rc = ena_com_set_llq(ena_dev); if (unlikely(rc)) - netdev_err(ena_dev->net_device, - "Cannot set LLQ configuration: %d\n", rc); + netdev_err(ena_dev->net_device, "Cannot set LLQ configuration: %d\n", rc); return rc; } @@ -795,8 +771,7 @@ static int ena_com_wait_and_process_admin_cq_interrupts(struct ena_comp_ctx *com int ret; wait_for_completion_timeout(&comp_ctx->wait_event, - usecs_to_jiffies( - admin_queue->completion_timeout)); + usecs_to_jiffies(admin_queue->completion_timeout)); /* In case the command wasn't completed find out the root cause. * There might be 2 kinds of errors @@ -813,8 +788,7 @@ static int ena_com_wait_and_process_admin_cq_interrupts(struct ena_comp_ctx *com admin_queue->is_missing_admin_interrupt = true; netdev_err(admin_queue->ena_dev->net_device, "The ena device sent a completion but the driver didn't receive a MSI-X interrupt (cmd %d), autopolling mode is %s\n", - comp_ctx->cmd_opcode, - admin_queue->auto_polling ? "ON" : "OFF"); + comp_ctx->cmd_opcode, admin_queue->auto_polling ? "ON" : "OFF"); /* Check if fallback to polling is enabled */ if (admin_queue->auto_polling) admin_queue->polling = true; @@ -832,6 +806,13 @@ static int ena_com_wait_and_process_admin_cq_interrupts(struct ena_comp_ctx *com ret = -ETIME; goto err; } + } else if (unlikely(comp_ctx->status == ENA_CMD_ABORTED)) { + netdev_err(admin_queue->ena_dev->net_device, "Command was aborted\n"); + spin_lock_irqsave(&admin_queue->q_lock, flags); + admin_queue->stats.aborted_cmd++; + spin_unlock_irqrestore(&admin_queue->q_lock, flags); + ret = -ENODEV; + goto err; } ret = ena_com_comp_status_to_errno(admin_queue, comp_ctx->comp_status); @@ -883,15 +864,13 @@ static u32 ena_com_reg_bar_read32(struct ena_com_dev *ena_dev, u16 offset) if (unlikely(i == timeout)) { netdev_err(ena_dev->net_device, "Reading reg failed for timeout. 
expected: req id[%u] offset[%u] actual: req id[%u] offset[%u]\n", - mmio_read->seq_num, offset, read_resp->req_id, - read_resp->reg_off); + mmio_read->seq_num, offset, read_resp->req_id, read_resp->reg_off); ret = ENA_MMIO_READ_TIMEOUT; goto err; } if (unlikely(read_resp->reg_off != offset)) { - netdev_err(ena_dev->net_device, - "Read failure: wrong offset provided\n"); + netdev_err(ena_dev->net_device, "Read failure: wrong offset provided\n"); ret = ENA_MMIO_READ_TIMEOUT; } else { ret = read_resp->reg_val; @@ -950,8 +929,7 @@ static int ena_com_destroy_io_sq(struct ena_com_dev *ena_dev, sizeof(destroy_resp)); if (unlikely(ret && (ret != -ENODEV))) - netdev_err(ena_dev->net_device, - "Failed to destroy io sq error: %d\n", ret); + netdev_err(ena_dev->net_device, "Failed to destroy io sq error: %d\n", ret); return ret; } @@ -965,8 +943,7 @@ static void ena_com_io_queue_free(struct ena_com_dev *ena_dev, if (io_cq->cdesc_addr.virt_addr) { size = io_cq->cdesc_entry_size_in_bytes * io_cq->q_depth; - dma_free_coherent(ena_dev->dmadev, size, - io_cq->cdesc_addr.virt_addr, + dma_free_coherent(ena_dev->dmadev, size, io_cq->cdesc_addr.virt_addr, io_cq->cdesc_addr.phys_addr); io_cq->cdesc_addr.virt_addr = NULL; @@ -975,8 +952,7 @@ static void ena_com_io_queue_free(struct ena_com_dev *ena_dev, if (io_sq->desc_addr.virt_addr) { size = io_sq->desc_entry_size * io_sq->q_depth; - dma_free_coherent(ena_dev->dmadev, size, - io_sq->desc_addr.virt_addr, + dma_free_coherent(ena_dev->dmadev, size, io_sq->desc_addr.virt_addr, io_sq->desc_addr.phys_addr); io_sq->desc_addr.virt_addr = NULL; @@ -1001,8 +977,7 @@ static int wait_for_reset_state(struct ena_com_dev *ena_dev, u32 timeout, val = ena_com_reg_bar_read32(ena_dev, ENA_REGS_DEV_STS_OFF); if (unlikely(val == ENA_MMIO_READ_TIMEOUT)) { - netdev_err(ena_dev->net_device, - "Reg read timeout occurred\n"); + netdev_err(ena_dev->net_device, "Reg read timeout occurred\n"); return -ETIME; } @@ -1042,8 +1017,7 @@ static int ena_com_get_feature_ex(struct ena_com_dev *ena_dev, int ret; if (!ena_com_check_supported_feature_id(ena_dev, feature_id)) { - netdev_dbg(ena_dev->net_device, "Feature %d isn't supported\n", - feature_id); + netdev_dbg(ena_dev->net_device, "Feature %d isn't supported\n", feature_id); return -EOPNOTSUPP; } @@ -1080,8 +1054,7 @@ static int ena_com_get_feature_ex(struct ena_com_dev *ena_dev, if (unlikely(ret)) netdev_err(ena_dev->net_device, - "Failed to submit get_feature command %d error: %d\n", - feature_id, ret); + "Failed to submit get_feature command %d error: %d\n", feature_id, ret); return ret; } @@ -1120,13 +1093,11 @@ static int ena_com_hash_key_allocate(struct ena_com_dev *ena_dev) { struct ena_rss *rss = &ena_dev->rss; - if (!ena_com_check_supported_feature_id(ena_dev, - ENA_ADMIN_RSS_HASH_FUNCTION)) + if (!ena_com_check_supported_feature_id(ena_dev, ENA_ADMIN_RSS_HASH_FUNCTION)) return -EOPNOTSUPP; - rss->hash_key = - dma_zalloc_coherent(ena_dev->dmadev, sizeof(*rss->hash_key), - &rss->hash_key_dma_addr, GFP_KERNEL); + rss->hash_key = dma_zalloc_coherent(ena_dev->dmadev, sizeof(*rss->hash_key), + &rss->hash_key_dma_addr, GFP_KERNEL); if (unlikely(!rss->hash_key)) return -ENOMEM; @@ -1139,8 +1110,8 @@ static void ena_com_hash_key_destroy(struct ena_com_dev *ena_dev) struct ena_rss *rss = &ena_dev->rss; if (rss->hash_key) - dma_free_coherent(ena_dev->dmadev, sizeof(*rss->hash_key), - rss->hash_key, rss->hash_key_dma_addr); + dma_free_coherent(ena_dev->dmadev, sizeof(*rss->hash_key), rss->hash_key, + rss->hash_key_dma_addr); rss->hash_key = 
NULL; } @@ -1148,9 +1119,8 @@ static int ena_com_hash_ctrl_init(struct ena_com_dev *ena_dev) { struct ena_rss *rss = &ena_dev->rss; - rss->hash_ctrl = - dma_zalloc_coherent(ena_dev->dmadev, sizeof(*rss->hash_ctrl), - &rss->hash_ctrl_dma_addr, GFP_KERNEL); + rss->hash_ctrl = dma_zalloc_coherent(ena_dev->dmadev, sizeof(*rss->hash_ctrl), + &rss->hash_ctrl_dma_addr, GFP_KERNEL); if (unlikely(!rss->hash_ctrl)) return -ENOMEM; @@ -1163,8 +1133,8 @@ static void ena_com_hash_ctrl_destroy(struct ena_com_dev *ena_dev) struct ena_rss *rss = &ena_dev->rss; if (rss->hash_ctrl) - dma_free_coherent(ena_dev->dmadev, sizeof(*rss->hash_ctrl), - rss->hash_ctrl, rss->hash_ctrl_dma_addr); + dma_free_coherent(ena_dev->dmadev, sizeof(*rss->hash_ctrl), rss->hash_ctrl, + rss->hash_ctrl_dma_addr); rss->hash_ctrl = NULL; } @@ -1193,15 +1163,13 @@ static int ena_com_indirect_table_allocate(struct ena_com_dev *ena_dev, tbl_size = (1ULL << log_size) * sizeof(struct ena_admin_rss_ind_table_entry); - rss->rss_ind_tbl = - dma_zalloc_coherent(ena_dev->dmadev, tbl_size, - &rss->rss_ind_tbl_dma_addr, GFP_KERNEL); + rss->rss_ind_tbl = dma_zalloc_coherent(ena_dev->dmadev, tbl_size, + &rss->rss_ind_tbl_dma_addr, GFP_KERNEL); if (unlikely(!rss->rss_ind_tbl)) goto mem_err1; tbl_size = (1ULL << log_size) * sizeof(u16); - rss->host_rss_ind_tbl = - devm_kzalloc(ena_dev->dmadev, tbl_size, GFP_KERNEL); + rss->host_rss_ind_tbl = devm_kzalloc(ena_dev->dmadev, tbl_size, GFP_KERNEL); if (unlikely(!rss->host_rss_ind_tbl)) goto mem_err2; @@ -1213,8 +1181,7 @@ static int ena_com_indirect_table_allocate(struct ena_com_dev *ena_dev, tbl_size = (1ULL << log_size) * sizeof(struct ena_admin_rss_ind_table_entry); - dma_free_coherent(ena_dev->dmadev, tbl_size, rss->rss_ind_tbl, - rss->rss_ind_tbl_dma_addr); + dma_free_coherent(ena_dev->dmadev, tbl_size, rss->rss_ind_tbl, rss->rss_ind_tbl_dma_addr); rss->rss_ind_tbl = NULL; mem_err1: rss->tbl_log_size = 0; @@ -1277,8 +1244,7 @@ static int ena_com_create_io_sq(struct ena_com_dev *ena_dev, &create_cmd.sq_ba, io_sq->desc_addr.phys_addr); if (unlikely(ret)) { - netdev_err(ena_dev->net_device, - "Memory address set failed\n"); + netdev_err(ena_dev->net_device, "Memory address set failed\n"); return ret; } } @@ -1289,8 +1255,7 @@ static int ena_com_create_io_sq(struct ena_com_dev *ena_dev, (struct ena_admin_acq_entry *)&cmd_completion, sizeof(cmd_completion)); if (unlikely(ret)) { - netdev_err(ena_dev->net_device, - "Failed to create IO SQ. error: %d\n", ret); + netdev_err(ena_dev->net_device, "Failed to create IO SQ. error: %d\n", ret); return ret; } @@ -1305,8 +1270,7 @@ static int ena_com_create_io_sq(struct ena_com_dev *ena_dev, cmd_completion.llq_descriptors_offset); } - netdev_dbg(ena_dev->net_device, "Created sq[%u], depth[%u]\n", - io_sq->idx, io_sq->q_depth); + netdev_dbg(ena_dev->net_device, "Created sq[%u], depth[%u]\n", io_sq->idx, io_sq->q_depth); return ret; } @@ -1433,8 +1397,7 @@ int ena_com_create_io_cq(struct ena_com_dev *ena_dev, (struct ena_admin_acq_entry *)&cmd_completion, sizeof(cmd_completion)); if (unlikely(ret)) { - netdev_err(ena_dev->net_device, - "Failed to create IO CQ. error: %d\n", ret); + netdev_err(ena_dev->net_device, "Failed to create IO CQ. 
error: %d\n", ret); return ret; } @@ -1448,8 +1411,7 @@ int ena_com_create_io_cq(struct ena_com_dev *ena_dev, (u32 __iomem *)((uintptr_t)ena_dev->reg_bar + cmd_completion.numa_node_register_offset); - netdev_dbg(ena_dev->net_device, "Created cq[%u], depth[%u]\n", - io_cq->idx, io_cq->q_depth); + netdev_dbg(ena_dev->net_device, "Created cq[%u], depth[%u]\n", io_cq->idx, io_cq->q_depth); return ret; } @@ -1459,8 +1421,7 @@ int ena_com_get_io_handlers(struct ena_com_dev *ena_dev, u16 qid, struct ena_com_io_cq **io_cq) { if (unlikely(qid >= ENA_TOTAL_NUM_QUEUES)) { - netdev_err(ena_dev->net_device, - "Invalid queue number %d but the max is %d\n", qid, + netdev_err(ena_dev->net_device, "Invalid queue number %d but the max is %d\n", qid, ENA_TOTAL_NUM_QUEUES); return -EINVAL; } @@ -1500,8 +1461,7 @@ void ena_com_wait_for_abort_completion(struct ena_com_dev *ena_dev) spin_lock_irqsave(&admin_queue->q_lock, flags); while (atomic_read(&admin_queue->outstanding_cmds) != 0) { spin_unlock_irqrestore(&admin_queue->q_lock, flags); - ena_delay_exponential_backoff_us(exp++, - ena_dev->ena_min_poll_delay_us); + ena_delay_exponential_backoff_us(exp++, ena_dev->ena_min_poll_delay_us); spin_lock_irqsave(&admin_queue->q_lock, flags); } spin_unlock_irqrestore(&admin_queue->q_lock, flags); @@ -1527,8 +1487,7 @@ int ena_com_destroy_io_cq(struct ena_com_dev *ena_dev, sizeof(destroy_resp)); if (unlikely(ret && (ret != -ENODEV))) - netdev_err(ena_dev->net_device, - "Failed to destroy IO CQ. error: %d\n", ret); + netdev_err(ena_dev->net_device, "Failed to destroy IO CQ. error: %d\n", ret); return ret; } @@ -1596,8 +1555,7 @@ int ena_com_set_aenq_config(struct ena_com_dev *ena_dev, u32 groups_flag) sizeof(resp)); if (unlikely(ret)) - netdev_err(ena_dev->net_device, - "Failed to config AENQ ret: %d\n", ret); + netdev_err(ena_dev->net_device, "Failed to config AENQ ret: %d\n", ret); return ret; } @@ -1618,8 +1576,7 @@ int ena_com_get_dma_width(struct ena_com_dev *ena_dev) netdev_dbg(ena_dev->net_device, "ENA dma width: %d\n", width); if (unlikely((width < 32) || width > ENA_MAX_PHYS_ADDR_SIZE_BITS)) { - netdev_err(ena_dev->net_device, "DMA width illegal value: %d\n", - width); + netdev_err(ena_dev->net_device, "DMA width illegal value: %d\n", width); return -EINVAL; } @@ -1641,19 +1598,16 @@ int ena_com_validate_version(struct ena_com_dev *ena_dev) ctrl_ver = ena_com_reg_bar_read32(ena_dev, ENA_REGS_CONTROLLER_VERSION_OFF); - if (unlikely((ver == ENA_MMIO_READ_TIMEOUT) || - (ctrl_ver == ENA_MMIO_READ_TIMEOUT))) { + if (unlikely((ver == ENA_MMIO_READ_TIMEOUT) || (ctrl_ver == ENA_MMIO_READ_TIMEOUT))) { netdev_err(ena_dev->net_device, "Reg read timeout occurred\n"); return -ETIME; } dev_info(ena_dev->dmadev, "ENA device version: %d.%d\n", - (ver & ENA_REGS_VERSION_MAJOR_VERSION_MASK) >> - ENA_REGS_VERSION_MAJOR_VERSION_SHIFT, + (ver & ENA_REGS_VERSION_MAJOR_VERSION_MASK) >> ENA_REGS_VERSION_MAJOR_VERSION_SHIFT, ver & ENA_REGS_VERSION_MINOR_VERSION_MASK); - dev_info(ena_dev->dmadev, - "ENA controller version: %d.%d.%d implementation version %d\n", + dev_info(ena_dev->dmadev, "ENA controller version: %d.%d.%d implementation version %d\n", (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK) >> ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_SHIFT, (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_MINOR_VERSION_MASK) >> @@ -1702,20 +1656,17 @@ void ena_com_admin_destroy(struct ena_com_dev *ena_dev) size = ADMIN_SQ_SIZE(admin_queue->q_depth); if (sq->entries) - dma_free_coherent(ena_dev->dmadev, size, sq->entries, - sq->dma_addr); + 
dma_free_coherent(ena_dev->dmadev, size, sq->entries, sq->dma_addr); sq->entries = NULL; size = ADMIN_CQ_SIZE(admin_queue->q_depth); if (cq->entries) - dma_free_coherent(ena_dev->dmadev, size, cq->entries, - cq->dma_addr); + dma_free_coherent(ena_dev->dmadev, size, cq->entries, cq->dma_addr); cq->entries = NULL; size = ADMIN_AENQ_SIZE(aenq->q_depth); if (ena_dev->aenq.entries) - dma_free_coherent(ena_dev->dmadev, size, aenq->entries, - aenq->dma_addr); + dma_free_coherent(ena_dev->dmadev, size, aenq->entries, aenq->dma_addr); aenq->entries = NULL; } @@ -1753,9 +1704,8 @@ int ena_com_phc_init(struct ena_com_dev *ena_dev) memset(phc, 0x0, sizeof(*phc)); /* Allocate shared mem used PHC timestamp retrieved from device */ - phc->virt_addr = - dma_zalloc_coherent(ena_dev->dmadev, sizeof(*phc->virt_addr), - &phc->phys_addr, GFP_KERNEL); + phc->virt_addr = dma_zalloc_coherent(ena_dev->dmadev, sizeof(*phc->virt_addr), + &phc->phys_addr, GFP_KERNEL); if (unlikely(!phc->virt_addr)) return -ENOMEM; @@ -1782,15 +1732,13 @@ int ena_com_phc_config(struct ena_com_dev *ena_dev) ENA_ADMIN_PHC_FEATURE_VERSION_0); if (unlikely(ret)) { netdev_err(ena_dev->net_device, - "Failed to get PHC feature configuration, error: %d\n", - ret); + "Failed to get PHC feature configuration, error: %d\n", ret); return ret; } /* Supporting only PHC V0 (readless mode with error bound) */ if (get_feat_resp.u.phc.version != ENA_ADMIN_PHC_FEATURE_VERSION_0) { - netdev_err(ena_dev->net_device, - "Unsupprted PHC version (0x%X), error: %d\n", + netdev_err(ena_dev->net_device, "Unsupprted PHC version (0x%X), error: %d\n", get_feat_resp.u.phc.version, -EOPNOTSUPP); return -EOPNOTSUPP; } @@ -1819,8 +1767,7 @@ int ena_com_phc_config(struct ena_com_dev *ena_dev) set_feat_cmd.u.phc.output_length = sizeof(*phc->virt_addr); ret = ena_com_mem_addr_set(ena_dev, &set_feat_cmd.u.phc.output_address, phc->phys_addr); if (unlikely(ret)) { - netdev_err(ena_dev->net_device, - "Failed setting PHC output address, error: %d\n", + netdev_err(ena_dev->net_device, "Failed setting PHC output address, error: %d\n", ret); return ret; } @@ -1833,8 +1780,7 @@ int ena_com_phc_config(struct ena_com_dev *ena_dev) sizeof(set_feat_resp)); if (unlikely(ret)) { - netdev_err(ena_dev->net_device, - "Failed to enable PHC, error: %d\n", ret); + netdev_err(ena_dev->net_device, "Failed to enable PHC, error: %d\n", ret); return ret; } @@ -1857,8 +1803,7 @@ void ena_com_phc_destroy(struct ena_com_dev *ena_dev) phc->active = false; spin_unlock_irqrestore(&phc->lock, flags); - dma_free_coherent(ena_dev->dmadev, sizeof(*phc->virt_addr), - phc->virt_addr, phc->phys_addr); + dma_free_coherent(ena_dev->dmadev, sizeof(*phc->virt_addr), phc->virt_addr, phc->phys_addr); phc->virt_addr = NULL; } @@ -1873,8 +1818,7 @@ int ena_com_phc_get_timestamp(struct ena_com_dev *ena_dev, u64 *timestamp) int ret = 0; if (!phc->active) { - netdev_err(ena_dev->net_device, - "PHC feature is not active in the device\n"); + netdev_err(ena_dev->net_device, "PHC feature is not active in the device\n"); return -EOPNOTSUPP; } @@ -1883,8 +1827,7 @@ int ena_com_phc_get_timestamp(struct ena_com_dev *ena_dev, u64 *timestamp) /* Check if PHC is in blocked state */ if (unlikely(ktime_compare(phc->system_time, zero_system_time))) { /* Check if blocking time expired */ - block_time = - ktime_add_us(phc->system_time, phc->block_timeout_usec); + block_time = ktime_add_us(phc->system_time, phc->block_timeout_usec); if (!ktime_after(ktime_get(), block_time)) { /* PHC is still in blocked state, skip PHC request */ 
phc->stats.phc_skp++; @@ -1924,10 +1867,9 @@ int ena_com_phc_get_timestamp(struct ena_com_dev *ena_dev, u64 *timestamp) /* Stalling until the device updates req_id */ while (1) { if (unlikely(ktime_after(ktime_get(), expire_time))) { - /* Gave up waiting for updated req_id, PHC enters into - * blocked state until passing blocking time, during - * this time any get PHC timestamp or error bound - * requests will fail with device busy error + /* Gave up waiting for updated req_id, PHC enters into blocked state until + * passing blocking time, during this time any get PHC timestamp or + * error bound requests will fail with device busy error */ phc->error_bound = ENA_PHC_MAX_ERROR_BOUND; ret = -EBUSY; @@ -1980,8 +1922,7 @@ int ena_com_phc_get_error_bound(struct ena_com_dev *ena_dev, u32 *error_bound) u32 local_error_bound = phc->error_bound; if (!phc->active) { - netdev_err(ena_dev->net_device, - "PHC feature is not active in the device\n"); + netdev_err(ena_dev->net_device, "PHC feature is not active in the device\n"); return -EOPNOTSUPP; } @@ -1998,10 +1939,8 @@ int ena_com_mmio_reg_read_request_init(struct ena_com_dev *ena_dev) struct ena_com_mmio_read *mmio_read = &ena_dev->mmio_read; spin_lock_init(&mmio_read->lock); - mmio_read->read_resp = - dma_zalloc_coherent(ena_dev->dmadev, - sizeof(*mmio_read->read_resp), - &mmio_read->read_resp_dma_addr, GFP_KERNEL); + mmio_read->read_resp = dma_zalloc_coherent(ena_dev->dmadev, sizeof(*mmio_read->read_resp), + &mmio_read->read_resp_dma_addr, GFP_KERNEL); if (unlikely(!mmio_read->read_resp)) goto err; @@ -2032,8 +1971,8 @@ void ena_com_mmio_reg_read_request_destroy(struct ena_com_dev *ena_dev) writel(0x0, ena_dev->reg_bar + ENA_REGS_MMIO_RESP_LO_OFF); writel(0x0, ena_dev->reg_bar + ENA_REGS_MMIO_RESP_HI_OFF); - dma_free_coherent(ena_dev->dmadev, sizeof(*mmio_read->read_resp), - mmio_read->read_resp, mmio_read->read_resp_dma_addr); + dma_free_coherent(ena_dev->dmadev, sizeof(*mmio_read->read_resp), mmio_read->read_resp, + mmio_read->read_resp_dma_addr); mmio_read->read_resp = NULL; } @@ -2065,8 +2004,7 @@ int ena_com_admin_init(struct ena_com_dev *ena_dev, } if (!(dev_sts & ENA_REGS_DEV_STS_READY_MASK)) { - netdev_err(ena_dev->net_device, - "Device isn't ready, abort com init\n"); + netdev_err(ena_dev->net_device, "Device isn't ready, abort com init\n"); return -ENODEV; } @@ -2145,8 +2083,7 @@ int ena_com_create_io_queue(struct ena_com_dev *ena_dev, int ret; if (unlikely(ctx->qid >= ENA_TOTAL_NUM_QUEUES)) { - netdev_err(ena_dev->net_device, - "Qid (%d) is bigger than max num of queues (%d)\n", + netdev_err(ena_dev->net_device, "Qid (%d) is bigger than max num of queues (%d)\n", ctx->qid, ENA_TOTAL_NUM_QUEUES); return -EINVAL; } @@ -2172,8 +2109,7 @@ int ena_com_create_io_queue(struct ena_com_dev *ena_dev, if (ctx->direction == ENA_COM_IO_QUEUE_DIRECTION_TX) /* header length is limited to 8 bits */ - io_sq->tx_max_header_size = - min_t(u32, ena_dev->tx_max_header_size, SZ_256); + io_sq->tx_max_header_size = min_t(u32, ena_dev->tx_max_header_size, SZ_256); ret = ena_com_init_io_sq(ena_dev, ctx, io_sq); if (unlikely(ret)) @@ -2205,8 +2141,7 @@ void ena_com_destroy_io_queue(struct ena_com_dev *ena_dev, u16 qid) struct ena_com_io_cq *io_cq; if (unlikely(qid >= ENA_TOTAL_NUM_QUEUES)) { - netdev_err(ena_dev->net_device, - "Qid (%d) is bigger than max num of queues (%d)\n", + netdev_err(ena_dev->net_device, "Qid (%d) is bigger than max num of queues (%d)\n", qid, ENA_TOTAL_NUM_QUEUES); return; } @@ -2248,8 +2183,7 @@ static int ena_get_dev_stats(struct 
ena_com_dev *ena_dev, sizeof(*get_resp)); if (unlikely(ret)) - netdev_err(ena_dev->net_device, - "Failed to get stats. error: %d\n", ret); + netdev_err(ena_dev->net_device, "Failed to get stats. error: %d\n", ret); return ret; } @@ -2274,8 +2208,7 @@ static void ena_com_set_supported_customer_metrics(struct ena_com_dev *ena_dev) ctx.get_resp.u.customer_metrics.reported_metrics; else netdev_err(ena_dev->net_device, - "Failed to query customer metrics support. error: %d\n", - ret); + "Failed to query customer metrics support. error: %d\n", ret); } int ena_com_get_dev_attr_feat(struct ena_com_dev *ena_dev, @@ -2302,8 +2235,7 @@ int ena_com_get_dev_attr_feat(struct ena_com_dev *ena_dev, if (rc) return rc; - if (get_resp.u.max_queue_ext.version != - ENA_FEATURE_MAX_QUEUE_EXT_VER) + if (get_resp.u.max_queue_ext.version != ENA_FEATURE_MAX_QUEUE_EXT_VER) return -EINVAL; memcpy(&get_feat_ctx->max_queue_ext, &get_resp.u.max_queue_ext, @@ -2344,19 +2276,16 @@ int ena_com_get_dev_attr_feat(struct ena_com_dev *ena_dev, rc = ena_com_get_feature(ena_dev, &get_resp, ENA_ADMIN_HW_HINTS, 0); if (!rc) - memcpy(&get_feat_ctx->hw_hints, &get_resp.u.hw_hints, - sizeof(get_resp.u.hw_hints)); + memcpy(&get_feat_ctx->hw_hints, &get_resp.u.hw_hints, sizeof(get_resp.u.hw_hints)); else if (rc == -EOPNOTSUPP) - memset(&get_feat_ctx->hw_hints, 0x0, - sizeof(get_feat_ctx->hw_hints)); + memset(&get_feat_ctx->hw_hints, 0x0, sizeof(get_feat_ctx->hw_hints)); else return rc; rc = ena_com_get_feature(ena_dev, &get_resp, ENA_ADMIN_LLQ, ENA_ADMIN_LLQ_FEATURE_VERSION_1); if (!rc) - memcpy(&get_feat_ctx->llq, &get_resp.u.llq, - sizeof(get_resp.u.llq)); + memcpy(&get_feat_ctx->llq, &get_resp.u.llq, sizeof(get_resp.u.llq)); else if (rc == -EOPNOTSUPP) memset(&get_feat_ctx->llq, 0x0, sizeof(get_feat_ctx->llq)); else @@ -2406,18 +2335,20 @@ void ena_com_aenq_intr_handler(struct ena_com_dev *ena_dev, void *data) aenq_common = &aenq_e->aenq_common_desc; /* Go over all the events */ - while ((READ_ONCE(aenq_common->flags) & - ENA_ADMIN_AENQ_COMMON_DESC_PHASE_MASK) == phase) { - /* Make sure the phase bit (ownership) is as expected before - * reading the rest of the descriptor. + while ((READ_ONCE(aenq_common->flags) & ENA_ADMIN_AENQ_COMMON_DESC_PHASE_MASK) == phase) { + /* When the phase bit of the AENQ descriptor aligns with the driver's phase bit, + * it signifies the readiness of the entire AENQ descriptor. + * The driver should proceed to read the descriptor's data only after confirming + * and synchronizing the phase bit. + * This memory fence guarantees the correct sequence of accesses to the + * descriptor's memory. */ dma_rmb(); timestamp = (u64)aenq_common->timestamp_low | ((u64)aenq_common->timestamp_high << 32); - netdev_dbg(ena_dev->net_device, - "AENQ! Group[%x] Syndrome[%x] timestamp: [%llus]\n", + netdev_dbg(ena_dev->net_device, "AENQ! 
Group[%x] Syndrome[%x] timestamp: [%llus]\n", aenq_common->group, aenq_common->syndrome, timestamp); /* Handle specific event*/ @@ -2446,13 +2377,53 @@ void ena_com_aenq_intr_handler(struct ena_com_dev *ena_dev, void *data) /* write the aenq doorbell after all AENQ descriptors were read */ mb(); - writel_relaxed((u32)aenq->head, - ena_dev->reg_bar + ENA_REGS_AENQ_HEAD_DB_OFF); + writel_relaxed((u32)aenq->head, ena_dev->reg_bar + ENA_REGS_AENQ_HEAD_DB_OFF); #ifndef MMIOWB_NOT_DEFINED mmiowb(); #endif } +bool ena_com_aenq_has_keep_alive(struct ena_com_dev *ena_dev) +{ + struct ena_admin_aenq_common_desc *aenq_common; + struct ena_com_aenq *aenq = &ena_dev->aenq; + struct ena_admin_aenq_entry *aenq_e; + u8 phase = aenq->phase; + u16 masked_head; + + masked_head = aenq->head & (aenq->q_depth - 1); + aenq_e = &aenq->entries[masked_head]; /* Get first entry */ + aenq_common = &aenq_e->aenq_common_desc; + + /* Go over all the events */ + while ((READ_ONCE(aenq_common->flags) & ENA_ADMIN_AENQ_COMMON_DESC_PHASE_MASK) == phase) { + /* When the phase bit of the AENQ descriptor aligns with the driver's phase bit, + * it signifies the readiness of the entire AENQ descriptor. + * The driver should proceed to read the descriptor's data only after confirming + * and synchronizing the phase bit. + * This memory fence guarantees the correct sequence of accesses to the + * descriptor's memory. + */ + dma_rmb(); + + if (aenq_common->group == ENA_ADMIN_KEEP_ALIVE) + return true; + + /* Get next event entry */ + masked_head++; + + if (unlikely(masked_head == aenq->q_depth)) { + masked_head = 0; + phase = !phase; + } + + aenq_e = &aenq->entries[masked_head]; + aenq_common = &aenq_e->aenq_common_desc; + } + + return false; +} + int ena_com_dev_reset(struct ena_com_dev *ena_dev, enum ena_regs_reset_reason_types reset_reason) { @@ -2463,15 +2434,13 @@ int ena_com_dev_reset(struct ena_com_dev *ena_dev, stat = ena_com_reg_bar_read32(ena_dev, ENA_REGS_DEV_STS_OFF); cap = ena_com_reg_bar_read32(ena_dev, ENA_REGS_CAPS_OFF); - if (unlikely((stat == ENA_MMIO_READ_TIMEOUT) || - (cap == ENA_MMIO_READ_TIMEOUT))) { + if (unlikely((stat == ENA_MMIO_READ_TIMEOUT) || (cap == ENA_MMIO_READ_TIMEOUT))) { netdev_err(ena_dev->net_device, "Reg read32 timeout occurred\n"); return -ETIME; } if ((stat & ENA_REGS_DEV_STS_READY_MASK) == 0) { - netdev_err(ena_dev->net_device, - "Device isn't ready, can't reset device\n"); + netdev_err(ena_dev->net_device, "Device isn't ready, can't reset device\n"); return -EINVAL; } @@ -2514,8 +2483,7 @@ int ena_com_dev_reset(struct ena_com_dev *ena_dev, rc = wait_for_reset_state(ena_dev, timeout, ENA_REGS_DEV_STS_RESET_IN_PROGRESS_MASK); if (unlikely(rc)) { - netdev_err(ena_dev->net_device, - "Reset indication didn't turn on\n"); + netdev_err(ena_dev->net_device, "Reset indication didn't turn on\n"); return rc; } @@ -2523,8 +2491,7 @@ int ena_com_dev_reset(struct ena_com_dev *ena_dev, writel(0, ena_dev->reg_bar + ENA_REGS_DEV_CTL_OFF); rc = wait_for_reset_state(ena_dev, timeout, 0); if (unlikely(rc)) { - netdev_err(ena_dev->net_device, - "Reset indication didn't turn off\n"); + netdev_err(ena_dev->net_device, "Reset indication didn't turn off\n"); return rc; } @@ -2546,8 +2513,7 @@ int ena_com_get_eni_stats(struct ena_com_dev *ena_dev, int ret; if (!ena_com_get_cap(ena_dev, ENA_ADMIN_ENI_STATS)) { - netdev_err(ena_dev->net_device, - "Capability %d isn't supported\n", + netdev_err(ena_dev->net_device, "Capability %d isn't supported\n", ENA_ADMIN_ENI_STATS); return -EOPNOTSUPP; } @@ -2568,8 +2534,7 @@ int 
ena_com_get_ena_srd_info(struct ena_com_dev *ena_dev, int ret; if (!ena_com_get_cap(ena_dev, ENA_ADMIN_ENA_SRD_INFO)) { - netdev_err(ena_dev->net_device, - "Capability %d isn't supported\n", + netdev_err(ena_dev->net_device, "Capability %d isn't supported\n", ENA_ADMIN_ENA_SRD_INFO); return -EOPNOTSUPP; } @@ -2606,8 +2571,7 @@ int ena_com_get_customer_metrics(struct ena_com_dev *ena_dev, char *buffer, u32 if (unlikely(len > ena_dev->customer_metrics.buffer_len)) { netdev_err(ena_dev->net_device, - "Invalid buffer size %u. The given buffer is too big.\n", - len); + "Invalid buffer size %u. The given buffer is too big.\n", len); return -EINVAL; } @@ -2618,8 +2582,7 @@ int ena_com_get_customer_metrics(struct ena_com_dev *ena_dev, char *buffer, u32 } if (!ena_dev->customer_metrics.supported_metrics) { - netdev_err(ena_dev->net_device, - "No supported customer metrics.\n"); + netdev_err(ena_dev->net_device, "No supported customer metrics.\n"); return -EOPNOTSUPP; } @@ -2639,8 +2602,7 @@ int ena_com_get_customer_metrics(struct ena_com_dev *ena_dev, char *buffer, u32 if (likely(ret == 0)) memcpy(buffer, ena_dev->customer_metrics.buffer_virt_addr, len); else - netdev_err(ena_dev->net_device, - "Failed to get customer metrics. error: %d\n", ret); + netdev_err(ena_dev->net_device, "Failed to get customer metrics. error: %d\n", ret); return ret; } @@ -2653,8 +2615,7 @@ int ena_com_set_dev_mtu(struct ena_com_dev *ena_dev, u32 mtu) int ret; if (!ena_com_check_supported_feature_id(ena_dev, ENA_ADMIN_MTU)) { - netdev_dbg(ena_dev->net_device, "Feature %d isn't supported\n", - ENA_ADMIN_MTU); + netdev_dbg(ena_dev->net_device, "Feature %d isn't supported\n", ENA_ADMIN_MTU); return -EOPNOTSUPP; } @@ -2673,8 +2634,7 @@ int ena_com_set_dev_mtu(struct ena_com_dev *ena_dev, u32 mtu) sizeof(resp)); if (unlikely(ret)) - netdev_err(ena_dev->net_device, - "Failed to set mtu %d. error: %d\n", mtu, ret); + netdev_err(ena_dev->net_device, "Failed to set mtu %d. error: %d\n", mtu, ret); return ret; } @@ -2688,8 +2648,7 @@ int ena_com_get_offload_settings(struct ena_com_dev *ena_dev, ret = ena_com_get_feature(ena_dev, &resp, ENA_ADMIN_STATELESS_OFFLOAD_CONFIG, 0); if (unlikely(ret)) { - netdev_err(ena_dev->net_device, - "Failed to get offload capabilities %d\n", ret); + netdev_err(ena_dev->net_device, "Failed to get offload capabilities %d\n", ret); return ret; } @@ -2707,8 +2666,7 @@ int ena_com_set_hash_function(struct ena_com_dev *ena_dev) struct ena_admin_get_feat_resp get_resp; int ret; - if (!ena_com_check_supported_feature_id(ena_dev, - ENA_ADMIN_RSS_HASH_FUNCTION)) { + if (!ena_com_check_supported_feature_id(ena_dev, ENA_ADMIN_RSS_HASH_FUNCTION)) { netdev_dbg(ena_dev->net_device, "Feature %d isn't supported\n", ENA_ADMIN_RSS_HASH_FUNCTION); return -EOPNOTSUPP; @@ -2721,8 +2679,7 @@ int ena_com_set_hash_function(struct ena_com_dev *ena_dev) return ret; if (!(get_resp.u.flow_hash_func.supported_func & BIT(rss->hash_func))) { - netdev_err(ena_dev->net_device, - "Func hash %d isn't supported by device, abort\n", + netdev_err(ena_dev->net_device, "Func hash %d isn't supported by device, abort\n", rss->hash_func); return -EOPNOTSUPP; } @@ -2752,8 +2709,7 @@ int ena_com_set_hash_function(struct ena_com_dev *ena_dev) (struct ena_admin_acq_entry *)&resp, sizeof(resp)); if (unlikely(ret)) { - netdev_err(ena_dev->net_device, - "Failed to set hash function %d. error: %d\n", + netdev_err(ena_dev->net_device, "Failed to set hash function %d. 
error: %d\n", rss->hash_func, ret); return -EINVAL; } @@ -2785,16 +2741,15 @@ int ena_com_fill_hash_function(struct ena_com_dev *ena_dev, return rc; if (!(BIT(func) & get_resp.u.flow_hash_func.supported_func)) { - netdev_err(ena_dev->net_device, - "Flow hash function %d isn't supported\n", func); + netdev_err(ena_dev->net_device, "Flow hash function %d isn't supported\n", func); return -EOPNOTSUPP; } if ((func == ENA_ADMIN_TOEPLITZ) && key) { if (key_len != sizeof(hash_key->key)) { netdev_err(ena_dev->net_device, - "key len (%u) doesn't equal the supported size (%zu)\n", - key_len, sizeof(hash_key->key)); + "key len (%u) doesn't equal the supported size (%zu)\n", key_len, + sizeof(hash_key->key)); return -EINVAL; } memcpy(hash_key->key, key, key_len); @@ -2882,8 +2837,7 @@ int ena_com_set_hash_ctrl(struct ena_com_dev *ena_dev) struct ena_admin_set_feat_resp resp; int ret; - if (!ena_com_check_supported_feature_id(ena_dev, - ENA_ADMIN_RSS_HASH_INPUT)) { + if (!ena_com_check_supported_feature_id(ena_dev, ENA_ADMIN_RSS_HASH_INPUT)) { netdev_dbg(ena_dev->net_device, "Feature %d isn't supported\n", ENA_ADMIN_RSS_HASH_INPUT); return -EOPNOTSUPP; @@ -2914,8 +2868,7 @@ int ena_com_set_hash_ctrl(struct ena_com_dev *ena_dev) (struct ena_admin_acq_entry *)&resp, sizeof(resp)); if (unlikely(ret)) - netdev_err(ena_dev->net_device, - "Failed to set hash input. error: %d\n", ret); + netdev_err(ena_dev->net_device, "Failed to set hash input. error: %d\n", ret); return ret; } @@ -2992,8 +2945,7 @@ int ena_com_fill_hash_ctrl(struct ena_com_dev *ena_dev, int rc; if (proto >= ENA_ADMIN_RSS_PROTO_NUM) { - netdev_err(ena_dev->net_device, "Invalid proto num (%u)\n", - proto); + netdev_err(ena_dev->net_device, "Invalid proto num (%u)\n", proto); return -EINVAL; } @@ -3045,8 +2997,7 @@ int ena_com_indirect_table_set(struct ena_com_dev *ena_dev) struct ena_admin_set_feat_resp resp; int ret; - if (!ena_com_check_supported_feature_id( - ena_dev, ENA_ADMIN_RSS_INDIRECTION_TABLE_CONFIG)) { + if (!ena_com_check_supported_feature_id(ena_dev, ENA_ADMIN_RSS_INDIRECTION_TABLE_CONFIG)) { netdev_dbg(ena_dev->net_device, "Feature %d isn't supported\n", ENA_ADMIN_RSS_INDIRECTION_TABLE_CONFIG); return -EOPNOTSUPP; @@ -3086,8 +3037,7 @@ int ena_com_indirect_table_set(struct ena_com_dev *ena_dev) sizeof(resp)); if (unlikely(ret)) - netdev_err(ena_dev->net_device, - "Failed to set indirect table. error: %d\n", ret); + netdev_err(ena_dev->net_device, "Failed to set indirect table. 
error: %d\n", ret); return ret; } @@ -3166,9 +3116,8 @@ int ena_com_allocate_host_info(struct ena_com_dev *ena_dev) { struct ena_host_attribute *host_attr = &ena_dev->host_attr; - host_attr->host_info = - dma_zalloc_coherent(ena_dev->dmadev, SZ_4K, - &host_attr->host_info_dma_addr, GFP_KERNEL); + host_attr->host_info = dma_zalloc_coherent(ena_dev->dmadev, SZ_4K, + &host_attr->host_info_dma_addr, GFP_KERNEL); if (unlikely(!host_attr->host_info)) return -ENOMEM; @@ -3203,10 +3152,8 @@ int ena_com_allocate_customer_metrics_buffer(struct ena_com_dev *ena_dev) customer_metrics->buffer_len = ENA_CUSTOMER_METRICS_BUFFER_SIZE; customer_metrics->buffer_virt_addr = - dma_zalloc_coherent(ena_dev->dmadev, - customer_metrics->buffer_len, - &customer_metrics->buffer_dma_addr, - GFP_KERNEL); + dma_zalloc_coherent(ena_dev->dmadev, customer_metrics->buffer_len, + &customer_metrics->buffer_dma_addr, GFP_KERNEL); if (unlikely(!customer_metrics->buffer_virt_addr)) return -ENOMEM; @@ -3230,8 +3177,7 @@ void ena_com_delete_debug_area(struct ena_com_dev *ena_dev) if (host_attr->debug_area_virt_addr) { dma_free_coherent(ena_dev->dmadev, host_attr->debug_area_size, - host_attr->debug_area_virt_addr, - host_attr->debug_area_dma_addr); + host_attr->debug_area_virt_addr, host_attr->debug_area_dma_addr); host_attr->debug_area_virt_addr = NULL; } } @@ -3292,8 +3238,7 @@ int ena_com_set_host_attributes(struct ena_com_dev *ena_dev) sizeof(resp)); if (unlikely(ret)) - netdev_err(ena_dev->net_device, - "Failed to set host attributes: %d\n", ret); + netdev_err(ena_dev->net_device, "Failed to set host attributes: %d\n", ret); return ret; } @@ -3311,8 +3256,7 @@ static int ena_com_update_nonadaptive_moderation_interval(struct ena_com_dev *en u32 *intr_moder_interval) { if (!intr_delay_resolution) { - netdev_err(ena_dev->net_device, - "Illegal interrupt delay granularity value\n"); + netdev_err(ena_dev->net_device, "Illegal interrupt delay granularity value\n"); return -EFAULT; } @@ -3350,14 +3294,12 @@ int ena_com_init_interrupt_moderation(struct ena_com_dev *ena_dev) if (rc) { if (rc == -EOPNOTSUPP) { - netdev_dbg(ena_dev->net_device, - "Feature %d isn't supported\n", + netdev_dbg(ena_dev->net_device, "Feature %d isn't supported\n", ENA_ADMIN_INTERRUPT_MODERATION); rc = 0; } else { netdev_err(ena_dev->net_device, - "Failed to get interrupt moderation admin cmd. rc: %d\n", - rc); + "Failed to get interrupt moderation admin cmd. 
rc: %d\n", rc); } /* no moderation supported, disable adaptive support */ @@ -3405,8 +3347,7 @@ int ena_com_config_dev_mode(struct ena_com_dev *ena_dev, (llq_info->descs_num_before_header * sizeof(struct ena_eth_io_tx_desc)); if (unlikely(ena_dev->tx_max_header_size == 0)) { - netdev_err(ena_dev->net_device, - "The size of the LLQ entry is smaller than needed\n"); + netdev_err(ena_dev->net_device, "The size of the LLQ entry is smaller than needed\n"); return -EINVAL; } diff --git a/drivers/amazon/net/ena/ena_com.h b/drivers/amazon/net/ena/ena_com.h index efe7168fc37e0..f4f1b676e45eb 100644 --- a/drivers/amazon/net/ena/ena_com.h +++ b/drivers/amazon/net/ena/ena_com.h @@ -27,6 +27,8 @@ #undef pr_fmt #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#define ENA_ADMIN_OS_LINUX 1 + #define ENA_MAX_NUM_IO_QUEUES 128U /* We need to queues for each IO (on for Tx and one for Rx) */ #define ENA_TOTAL_NUM_QUEUES (2 * (ENA_MAX_NUM_IO_QUEUES)) @@ -641,6 +643,16 @@ void ena_com_admin_q_comp_intr_handler(struct ena_com_dev *ena_dev); */ void ena_com_aenq_intr_handler(struct ena_com_dev *ena_dev, void *data); +/* ena_com_aenq_has_keep_alive - Retrieve if there is a keep alive notification in the aenq + * @ena_dev: ENA communication layer struct + * + * This method goes over the async event notification queue and returns if there + * is a keep alive notification. + * + * @return - true if there is a keep alive notification in the aenq or false otherwise + */ +bool ena_com_aenq_has_keep_alive(struct ena_com_dev *ena_dev); + /* ena_com_abort_admin_commands - Abort all the outstanding admin commands. * @ena_dev: ENA communication layer struct * diff --git a/drivers/amazon/net/ena/ena_eth_com.c b/drivers/amazon/net/ena/ena_eth_com.c index a3bbd70983476..281a4b46f2e89 100644 --- a/drivers/amazon/net/ena/ena_eth_com.c +++ b/drivers/amazon/net/ena/ena_eth_com.c @@ -18,8 +18,7 @@ static struct ena_eth_io_rx_cdesc_base *ena_com_get_next_rx_cdesc( cdesc = (struct ena_eth_io_rx_cdesc_base *)(io_cq->cdesc_addr.virt_addr + (head_masked * io_cq->cdesc_entry_size_in_bytes)); - desc_phase = (READ_ONCE(cdesc->status) & - ENA_ETH_IO_RX_CDESC_BASE_PHASE_MASK) >> + desc_phase = (READ_ONCE(cdesc->status) & ENA_ETH_IO_RX_CDESC_BASE_PHASE_MASK) >> ENA_ETH_IO_RX_CDESC_BASE_PHASE_SHIFT; if (desc_phase != expected_phase) @@ -65,8 +64,8 @@ static int ena_com_write_bounce_buffer_to_dev(struct ena_com_io_sq *io_sq, io_sq->entries_in_tx_burst_left--; netdev_dbg(ena_com_io_sq_to_ena_dev(io_sq)->net_device, - "Decreasing entries_in_tx_burst_left of queue %d to %d\n", - io_sq->qid, io_sq->entries_in_tx_burst_left); + "Decreasing entries_in_tx_burst_left of queue %d to %d\n", io_sq->qid, + io_sq->entries_in_tx_burst_left); } /* Make sure everything was written into the bounce buffer before @@ -75,8 +74,8 @@ static int ena_com_write_bounce_buffer_to_dev(struct ena_com_io_sq *io_sq, wmb(); /* The line is completed. 
Copy it to dev */ - __iowrite64_copy(io_sq->desc_addr.pbuf_dev_addr + dst_offset, - bounce_buffer, (llq_info->desc_list_entry_size) / 8); + __iowrite64_copy(io_sq->desc_addr.pbuf_dev_addr + dst_offset, bounce_buffer, + (llq_info->desc_list_entry_size) / 8); io_sq->tail++; @@ -102,16 +101,14 @@ static int ena_com_write_header_to_bounce(struct ena_com_io_sq *io_sq, header_offset = llq_info->descs_num_before_header * io_sq->desc_entry_size; - if (unlikely((header_offset + header_len) > - llq_info->desc_list_entry_size)) { + if (unlikely((header_offset + header_len) > llq_info->desc_list_entry_size)) { netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, "Trying to write header larger than llq entry can accommodate\n"); return -EFAULT; } if (unlikely(!bounce_buffer)) { - netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, - "Bounce buffer is NULL\n"); + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, "Bounce buffer is NULL\n"); return -EFAULT; } @@ -129,8 +126,7 @@ static void *get_sq_desc_llq(struct ena_com_io_sq *io_sq) bounce_buffer = pkt_ctrl->curr_bounce_buf; if (unlikely(!bounce_buffer)) { - netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, - "Bounce buffer is NULL\n"); + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, "Bounce buffer is NULL\n"); return NULL; } @@ -210,11 +206,8 @@ static int ena_com_sq_update_llq_tail(struct ena_com_io_sq *io_sq) return 0; } -static int ena_com_sq_update_tail(struct ena_com_io_sq *io_sq) +static int ena_com_sq_update_reqular_queue_tail(struct ena_com_io_sq *io_sq) { - if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) - return ena_com_sq_update_llq_tail(io_sq); - io_sq->tail++; /* Switch phase bit in case of wrap around */ @@ -224,6 +217,14 @@ static int ena_com_sq_update_tail(struct ena_com_io_sq *io_sq) return 0; } +static int ena_com_sq_update_tail(struct ena_com_io_sq *io_sq) +{ + if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) + return ena_com_sq_update_llq_tail(io_sq); + + return ena_com_sq_update_reqular_queue_tail(io_sq); +} + static struct ena_eth_io_rx_cdesc_base * ena_com_rx_cdesc_idx_to_ptr(struct ena_com_io_cq *io_cq, u16 idx) { @@ -264,8 +265,8 @@ static int ena_com_cdesc_rx_pkt_get(struct ena_com_io_cq *io_cq, ENA_ETH_IO_RX_CDESC_BASE_MBZ17_MASK)) && ena_com_get_cap(dev, ENA_ADMIN_CDESC_MBZ))) { netdev_err(dev->net_device, - "Corrupted RX descriptor #%d on q_id: %d, req_id: %u\n", - count, io_cq->qid, cdesc->req_id); + "Corrupted RX descriptor #%d on q_id: %d, req_id: %u\n", count, + io_cq->qid, cdesc->req_id); return -EFAULT; } @@ -391,9 +392,8 @@ static void ena_com_rx_set_flags(struct ena_com_io_cq *io_cq, netdev_dbg(ena_com_io_cq_to_ena_dev(io_cq)->net_device, "l3_proto %d l4_proto %d l3_csum_err %d l4_csum_err %d hash %d frag %d cdesc_status %x\n", - ena_rx_ctx->l3_proto, ena_rx_ctx->l4_proto, - ena_rx_ctx->l3_csum_err, ena_rx_ctx->l4_csum_err, - ena_rx_ctx->hash, ena_rx_ctx->frag, cdesc->status); + ena_rx_ctx->l3_proto, ena_rx_ctx->l4_proto, ena_rx_ctx->l3_csum_err, + ena_rx_ctx->l4_csum_err, ena_rx_ctx->hash, ena_rx_ctx->frag, cdesc->status); } /*****************************************************************************/ @@ -425,13 +425,12 @@ int ena_com_prepare_tx(struct ena_com_io_sq *io_sq, if (unlikely(header_len > io_sq->tx_max_header_size)) { netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, - "Header size is too large %d max header: %d\n", - header_len, io_sq->tx_max_header_size); + "Header size is too large %d max header: %d\n", header_len, + 
io_sq->tx_max_header_size); return -EINVAL; } - if (unlikely(io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV && - !buffer_to_push)) { + if (unlikely(io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV && !buffer_to_push)) { netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, "Push header wasn't provided in LLQ mode\n"); return -EINVAL; @@ -582,13 +581,11 @@ int ena_com_rx_pkt(struct ena_com_io_cq *io_cq, } netdev_dbg(ena_com_io_cq_to_ena_dev(io_cq)->net_device, - "Fetch rx packet: queue %d completed desc: %d\n", io_cq->qid, - nb_hw_desc); + "Fetch rx packet: queue %d completed desc: %d\n", io_cq->qid, nb_hw_desc); if (unlikely(nb_hw_desc > ena_rx_ctx->max_bufs)) { netdev_err(ena_com_io_cq_to_ena_dev(io_cq)->net_device, - "Too many RX cdescs (%d) > MAX(%d)\n", nb_hw_desc, - ena_rx_ctx->max_bufs); + "Too many RX cdescs (%d) > MAX(%d)\n", nb_hw_desc, ena_rx_ctx->max_bufs); return -ENOSPC; } @@ -612,8 +609,8 @@ int ena_com_rx_pkt(struct ena_com_io_cq *io_cq, io_sq->next_to_comp += nb_hw_desc; netdev_dbg(ena_com_io_cq_to_ena_dev(io_cq)->net_device, - "[%s][QID#%d] Updating SQ head to: %d\n", __func__, - io_sq->qid, io_sq->next_to_comp); + "[%s][QID#%d] Updating SQ head to: %d\n", __func__, io_sq->qid, + io_sq->next_to_comp); /* Get rx flags from the last pkt */ ena_com_rx_set_flags(io_cq, ena_rx_ctx, cdesc); @@ -649,14 +646,14 @@ int ena_com_add_single_rx_desc(struct ena_com_io_sq *io_sq, desc->req_id = req_id; netdev_dbg(ena_com_io_sq_to_ena_dev(io_sq)->net_device, - "[%s] Adding single RX desc, Queue: %u, req_id: %u\n", - __func__, io_sq->qid, req_id); + "[%s] Adding single RX desc, Queue: %u, req_id: %u\n", __func__, io_sq->qid, + req_id); desc->buff_addr_lo = (u32)ena_buf->paddr; desc->buff_addr_hi = ((ena_buf->paddr & GENMASK_ULL(io_sq->dma_addr_bits - 1, 32)) >> 32); - return ena_com_sq_update_tail(io_sq); + return ena_com_sq_update_reqular_queue_tail(io_sq); } bool ena_com_cq_empty(struct ena_com_io_cq *io_cq) diff --git a/drivers/amazon/net/ena/ena_eth_com.h b/drivers/amazon/net/ena/ena_eth_com.h index 6768905d44bd2..121e2e212a1b5 100644 --- a/drivers/amazon/net/ena/ena_eth_com.h +++ b/drivers/amazon/net/ena/ena_eth_com.h @@ -71,15 +71,14 @@ static inline void ena_com_unmask_intr(struct ena_com_io_cq *io_cq, writel(intr_reg->intr_control, io_cq->unmask_reg); } -static inline int ena_com_free_q_entries(struct ena_com_io_sq *io_sq) +static inline u16 ena_com_used_q_entries(struct ena_com_io_sq *io_sq) { - u16 tail, next_to_comp, cnt; - - next_to_comp = io_sq->next_to_comp; - tail = io_sq->tail; - cnt = tail - next_to_comp; + return io_sq->tail - io_sq->next_to_comp; +} - return io_sq->q_depth - 1 - cnt; +static inline int ena_com_free_q_entries(struct ena_com_io_sq *io_sq) +{ + return io_sq->q_depth - 1 - ena_com_used_q_entries(io_sq); } /* Check if the submission queue has enough space to hold required_buffers */ @@ -143,8 +142,8 @@ static inline bool ena_com_is_doorbell_needed(struct ena_com_io_sq *io_sq, } netdev_dbg(ena_com_io_sq_to_ena_dev(io_sq)->net_device, - "Queue: %d num_descs: %d num_entries_needed: %d\n", - io_sq->qid, num_descs, num_entries_needed); + "Queue: %d num_descs: %d num_entries_needed: %d\n", io_sq->qid, num_descs, + num_entries_needed); return num_entries_needed > io_sq->entries_in_tx_burst_left; } @@ -155,15 +154,14 @@ static inline int ena_com_write_sq_doorbell(struct ena_com_io_sq *io_sq) u16 tail = io_sq->tail; netdev_dbg(ena_com_io_sq_to_ena_dev(io_sq)->net_device, - "Write submission queue doorbell for queue: %d tail: %d\n", - io_sq->qid, 
tail); + "Write submission queue doorbell for queue: %d tail: %d\n", io_sq->qid, tail); writel(tail, io_sq->db_addr); if (is_llq_max_tx_burst_exists(io_sq)) { netdev_dbg(ena_com_io_sq_to_ena_dev(io_sq)->net_device, - "Reset available entries in tx burst for queue %d to %d\n", - io_sq->qid, max_entries_in_tx_burst); + "Reset available entries in tx burst for queue %d to %d\n", io_sq->qid, + max_entries_in_tx_burst); io_sq->entries_in_tx_burst_left = max_entries_in_tx_burst; } @@ -226,8 +224,7 @@ static inline int ena_com_tx_comp_req_id_get(struct ena_com_io_cq *io_cq, if (unlikely((flags & ENA_ETH_IO_TX_CDESC_MBZ6_MASK) && ena_com_get_cap(dev, ENA_ADMIN_CDESC_MBZ))) { - netdev_err(dev->net_device, - "Corrupted TX descriptor on q_id: %d, req_id: %u\n", + netdev_err(dev->net_device, "Corrupted TX descriptor on q_id: %d, req_id: %u\n", io_cq->qid, cdesc->req_id); return -EFAULT; } @@ -236,8 +233,8 @@ static inline int ena_com_tx_comp_req_id_get(struct ena_com_io_cq *io_cq, *req_id = READ_ONCE(cdesc->req_id); if (unlikely(*req_id >= io_cq->q_depth)) { - netdev_err(ena_com_io_cq_to_ena_dev(io_cq)->net_device, - "Invalid req id %d\n", cdesc->req_id); + netdev_err(ena_com_io_cq_to_ena_dev(io_cq)->net_device, "Invalid req id %d\n", + cdesc->req_id); return -EINVAL; } diff --git a/drivers/amazon/net/ena/ena_ethtool.c b/drivers/amazon/net/ena/ena_ethtool.c index 2a0496172ff91..d6147d6b980ae 100644 --- a/drivers/amazon/net/ena/ena_ethtool.c +++ b/drivers/amazon/net/ena/ena_ethtool.c @@ -82,6 +82,7 @@ static const struct ena_stats ena_stats_global_strings[] = { ENA_STAT_GLOBAL_ENTRY(os_netdev_wd), ENA_STAT_GLOBAL_ENTRY(missing_admin_interrupt), ENA_STAT_GLOBAL_ENTRY(admin_to), + ENA_STAT_GLOBAL_ENTRY(device_request_reset), ENA_STAT_GLOBAL_ENTRY(suspend), ENA_STAT_GLOBAL_ENTRY(resume), ENA_STAT_GLOBAL_ENTRY(interface_down), @@ -435,20 +436,20 @@ static void ena_metrics_stats_strings(struct ena_adapter *adapter, u8 **data) for (i = 0; i < ENA_METRICS_ARRAY_ENI; i++) { if (ena_com_get_customer_metric_support(dev, i)) { ena_metrics = &ena_hw_stats_strings[i]; - ethtool_sprintf(data, ena_metrics->name); + ethtool_puts(data, ena_metrics->name); } } } else if (ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENI_STATS)) { for (i = 0; i < ENA_STATS_ARRAY_ENI; i++) { ena_stats = &ena_stats_eni_strings[i]; - ethtool_sprintf(data, ena_stats->name); + ethtool_puts(data, ena_stats->name); } } if (ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENA_SRD_INFO)) { for (i = 0; i < ENA_STATS_ARRAY_ENA_SRD; i++) { ena_stats = &ena_srd_info_strings[i]; - ethtool_sprintf(data, ena_stats->name); + ethtool_puts(data, ena_stats->name); } } } @@ -478,9 +479,7 @@ static void ena_queue_strings(struct ena_adapter *adapter, u8 **data) for (j = 0; j < ENA_STATS_ARRAY_RX; j++) { ena_stats = &ena_stats_rx_strings[j]; - ethtool_sprintf(data, - "queue_%u_rx_%s", i, - ena_stats->name); + ethtool_sprintf(data, "queue_%u_rx_%s", i, ena_stats->name); } } } @@ -505,7 +504,7 @@ static void ena_com_phc_strings(u8 **data) for (i = 0; i < ENA_STATS_ARRAY_ENA_COM_PHC; i++) { ena_stats = &ena_stats_ena_com_phc_strings[i]; - ethtool_sprintf(data, "%s", ena_stats->name); + ethtool_puts(data, ena_stats->name); } } @@ -518,7 +517,7 @@ static void ena_get_strings(struct ena_adapter *adapter, for (i = 0; i < ENA_STATS_ARRAY_GLOBAL; i++) { ena_stats = &ena_stats_global_strings[i]; - ethtool_sprintf(&data, ena_stats->name); + ethtool_puts(&data, ena_stats->name); } if (hw_stats_needed) @@ -1109,12 +1108,22 @@ static int ena_indirection_table_get(struct ena_adapter 
*adapter, u32 *indir) return rc; } -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) +#ifdef ENA_HAVE_ETHTOOL_RXFH_PARAM +static int ena_get_rxfh(struct net_device *netdev, + struct ethtool_rxfh_param *rxfh) +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) static int ena_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, u8 *hfunc) +#endif /* ENA_HAVE_ETHTOOL_RXFH_PARAM */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) { struct ena_adapter *adapter = netdev_priv(netdev); enum ena_admin_hash_functions ena_func; +#ifdef ENA_HAVE_ETHTOOL_RXFH_PARAM + u32 *indir = rxfh->indir; + u8 *hfunc = &rxfh->hfunc; + u8 *key = rxfh->key; +#endif /* ENA_HAVE_ETHTOOL_RXFH_PARAM */ u8 func; int rc; @@ -1150,8 +1159,12 @@ static int ena_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, return -EOPNOTSUPP; } +#ifdef ENA_HAVE_ETHTOOL_RXFH_PARAM + *hfunc = func; +#else if (hfunc) *hfunc = func; +#endif /* ENA_HAVE_ETHTOOL_RXFH_PARAM */ return 0; } @@ -1192,18 +1205,27 @@ static int ena_get_rxfh(struct net_device *netdev, u32 *indir) } #endif /* >= 3.8.0 */ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) +#ifdef ENA_HAVE_ETHTOOL_RXFH_PARAM +static int ena_set_rxfh(struct net_device *netdev, + struct ethtool_rxfh_param *rxfh, + struct netlink_ext_ack *extack) +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) static int ena_set_rxfh(struct net_device *netdev, const u32 *indir, const u8 *key, const u8 hfunc) #elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) static int ena_set_rxfh(struct net_device *netdev, const u32 *indir, const u8 *key) -#endif +#endif /* ENA_HAVE_ETHTOOL_RXFH_PARAM */ #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) { struct ena_adapter *adapter = netdev_priv(netdev); struct ena_com_dev *ena_dev = adapter->ena_dev; enum ena_admin_hash_functions func = 0; +#ifdef ENA_HAVE_ETHTOOL_RXFH_PARAM + u32 *indir = rxfh->indir; + u8 hfunc = rxfh->hfunc; + u8 *key = rxfh->key; +#endif /* ENA_HAVE_ETHTOOL_RXFH_PARAM */ int rc; if (indir) { @@ -1284,19 +1306,20 @@ static int ena_set_channels(struct net_device *netdev, xdp_clear_features_flag(netdev); } else { - xdp_set_features_flag(netdev, - NETDEV_XDP_ACT_BASIC | - NETDEV_XDP_ACT_REDIRECT); + xdp_set_features_flag(netdev, ENA_XDP_FEATURES); } if (count > adapter->max_num_io_queues) return -EINVAL; + +#ifdef ENA_AF_XDP_SUPPORT if (count != adapter->num_io_queues && ena_is_zc_q_exist(adapter)) { netdev_err(adapter->netdev, "Changing channel count not supported with xsk pool loaded\n"); return -EOPNOTSUPP; } +#endif /* ENA_AF_XDP_SUPPORT */ return ena_update_queue_count(adapter, count); } #endif /* ETHTOOL_SCHANNELS */ diff --git a/drivers/amazon/net/ena/ena_netdev.c b/drivers/amazon/net/ena/ena_netdev.c index 5edd2d3256b25..083c9546fa033 100644 --- a/drivers/amazon/net/ena/ena_netdev.c +++ b/drivers/amazon/net/ena/ena_netdev.c @@ -98,6 +98,7 @@ static void ena_tx_timeout(struct net_device *dev, unsigned int txqueue) enum ena_regs_reset_reason_types reset_reason = ENA_REGS_RESET_OS_NETDEV_WD; struct ena_adapter *adapter = netdev_priv(dev); unsigned int time_since_last_napi, threshold; + unsigned long jiffies_since_last_intr; struct ena_ring *tx_ring; int napi_scheduled; @@ -112,17 +113,20 @@ static void ena_tx_timeout(struct net_device *dev, unsigned int txqueue) time_since_last_napi = jiffies_to_usecs(jiffies - tx_ring->tx_stats.last_napi_jiffies); napi_scheduled = !!(tx_ring->napi->state & NAPIF_STATE_SCHED); + jiffies_since_last_intr = jiffies - READ_ONCE(adapter->ena_napi[txqueue].last_intr_jiffies); + netdev_err(dev, 
- "TX q %d is paused for too long (threshold %u). Time since last napi %u usec. napi scheduled: %d\n", - txqueue, - threshold, - time_since_last_napi, - napi_scheduled); + "TX q %d is paused for too long (threshold %u). Time since last napi %u usec. napi scheduled: %d. msecs since last interrupt: %u\n", + txqueue, + threshold, + time_since_last_napi, + napi_scheduled, + jiffies_to_msecs(jiffies_since_last_intr)); if (threshold < time_since_last_napi && napi_scheduled) { netdev_err(dev, - "napi handler hasn't been called for a long time but is scheduled\n"); - reset_reason = ENA_REGS_RESET_SUSPECTED_POLL_STARVATION; + "napi handler hasn't been called for a long time but is scheduled\n"); + reset_reason = ENA_REGS_RESET_SUSPECTED_POLL_STARVATION; } schedule_reset: /* Change the state of the device to trigger reset @@ -234,11 +238,9 @@ int ena_xmit_common(struct ena_adapter *adapter, if (unlikely(rc)) { netif_err(adapter, tx_queued, adapter->netdev, "Failed to prepare tx bufs\n"); - ena_increase_stat(&ring->tx_stats.prepare_ctx_err, 1, - &ring->syncp); + ena_increase_stat(&ring->tx_stats.prepare_ctx_err, 1, &ring->syncp); if (rc != -ENOMEM) - ena_reset_device(adapter, - ENA_REGS_RESET_DRIVER_INVALID_STATE); + ena_reset_device(adapter, ENA_REGS_RESET_DRIVER_INVALID_STATE); return rc; } @@ -468,7 +470,7 @@ int ena_setup_tx_resources_in_range(struct ena_adapter *adapter, } void ena_free_all_io_tx_resources_in_range(struct ena_adapter *adapter, - int first_index, int count) + int first_index, int count) { int i; @@ -619,8 +621,7 @@ struct page *ena_alloc_map_page(struct ena_ring *rx_ring, */ page = dev_alloc_page(); if (!page) { - ena_increase_stat(&rx_ring->rx_stats.page_alloc_fail, 1, - &rx_ring->syncp); + ena_increase_stat(&rx_ring->rx_stats.page_alloc_fail, 1, &rx_ring->syncp); return ERR_PTR(-ENOSPC); } @@ -654,11 +655,8 @@ static int ena_alloc_rx_buffer(struct ena_ring *rx_ring, /* if previous allocated page is not used */ if (unlikely(rx_info->page)) return 0; - - tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - ena_buf = &rx_info->ena_buf; - #ifdef ENA_AF_XDP_SUPPORT + if (unlikely(ENA_IS_XSK_RING(rx_ring))) { struct xdp_buff *xdp; @@ -666,6 +664,7 @@ static int ena_alloc_rx_buffer(struct ena_ring *rx_ring, if (!xdp) return -ENOMEM; + ena_buf = &rx_info->ena_buf; ena_buf->paddr = xsk_buff_xdp_get_dma(xdp); ena_buf->len = xsk_pool_get_rx_frame_size(rx_ring->xsk_pool); @@ -683,9 +682,12 @@ static int ena_alloc_rx_buffer(struct ena_ring *rx_ring, netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, "Allocate page %p, rx_info %p\n", page, rx_info); + tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + rx_info->page = page; rx_info->dma_addr = dma; rx_info->page_offset = 0; + ena_buf = &rx_info->ena_buf; ena_buf->paddr = dma + headroom; ena_buf->len = ENA_PAGE_SIZE - headroom - tailroom; @@ -738,10 +740,14 @@ int ena_refill_rx_bufs(struct ena_ring *rx_ring, u32 num) rc = ena_alloc_rx_buffer(rx_ring, rx_info); if (unlikely(rc < 0)) { - if (!ENA_IS_XSK_RING(rx_ring)) - netif_warn(rx_ring->adapter, rx_err, rx_ring->netdev, - "Failed to allocate buffer for rx queue %d\n", - rx_ring->qid); +#ifdef ENA_AF_XDP_SUPPORT + if (ENA_IS_XSK_RING(rx_ring)) + break; + +#endif /* ENA_AF_XDP_SUPPORT */ + netif_warn(rx_ring->adapter, rx_err, rx_ring->netdev, + "Failed to allocate buffer for rx queue %d\n", + rx_ring->qid); break; } rc = ena_com_add_single_rx_desc(rx_ring->ena_com_io_sq, @@ -760,12 +766,19 @@ int ena_refill_rx_bufs(struct ena_ring *rx_ring, u32 num) if (unlikely(i < num)) 
{ ena_increase_stat(&rx_ring->rx_stats.refil_partial, 1, &rx_ring->syncp); - if (!ENA_IS_XSK_RING(rx_ring)) - netif_warn(rx_ring->adapter, rx_err, rx_ring->netdev, - "Refilled rx qid %d with only %d buffers (from %d)\n", - rx_ring->qid, i, num); +#ifdef ENA_AF_XDP_SUPPORT + if (ENA_IS_XSK_RING(rx_ring)) + goto ring_doorbell; + +#endif /* ENA_AF_XDP_SUPPORT */ + netif_warn(rx_ring->adapter, rx_err, rx_ring->netdev, + "Refilled rx qid %d with only %d buffers (from %d)\n", + rx_ring->qid, i, num); } +#ifdef ENA_AF_XDP_SUPPORT +ring_doorbell: +#endif /* ENA_AF_XDP_SUPPORT */ /* ena_com_write_sq_doorbell issues a wmb() */ if (likely(i)) ena_com_write_sq_doorbell(rx_ring->ena_com_io_sq); @@ -781,11 +794,13 @@ static void ena_free_rx_bufs(struct ena_adapter *adapter, struct ena_ring *rx_ring = &adapter->rx_ring[qid]; u32 i; +#ifdef ENA_AF_XDP_SUPPORT if (ENA_IS_XSK_RING(rx_ring)) { - ena_xdp_free_rx_bufs_zc(adapter, qid); + ena_xdp_free_rx_bufs_zc(rx_ring); return; } +#endif /* ENA_AF_XDP_SUPPORT */ for (i = 0; i < rx_ring->ring_size; i++) { struct ena_rx_buffer *rx_info = &rx_ring->rx_buffer_info[i]; @@ -902,10 +917,12 @@ static void ena_free_all_tx_bufs(struct ena_adapter *adapter) for (i = 0; i < adapter->num_io_queues + adapter->xdp_num_queues; i++) { tx_ring = &adapter->tx_ring[i]; +#ifdef ENA_AF_XDP_SUPPORT if (ENA_IS_XSK_RING(tx_ring)) { ena_xdp_free_tx_bufs_zc(tx_ring); continue; } +#endif /* ENA_AF_XDP_SUPPORT */ ena_free_tx_bufs(tx_ring); } } @@ -995,12 +1012,10 @@ static int ena_clean_tx_irq(struct ena_ring *tx_ring, u32 budget) &req_id); if (rc) { if (unlikely(rc == -EINVAL)) - handle_invalid_req_id(tx_ring, req_id, NULL, - false); + handle_invalid_req_id(tx_ring, req_id, NULL, false); else if (unlikely(rc == -EFAULT)) { - ena_reset_device( - tx_ring->adapter, - ENA_REGS_RESET_TX_DESCRIPTOR_MALFORMED); + ena_reset_device(tx_ring->adapter, + ENA_REGS_RESET_TX_DESCRIPTOR_MALFORMED); } break; } @@ -1162,7 +1177,7 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, page_offset = rx_info->page_offset; buf_addr = page_address(rx_info->page) + page_offset; - if (len <= rx_ring->rx_copybreak) { + if ((len <= rx_ring->rx_copybreak) && likely(descs == 1)) { skb = ena_alloc_skb(rx_ring, NULL, len); if (unlikely(!skb)) return NULL; @@ -1378,8 +1393,7 @@ static int ena_xdp_handle_buff(struct ena_ring *rx_ring, struct xdp_buff *xdp, u /* XDP multi-buffer packets not supported */ if (unlikely(num_descs > 1)) { netdev_err_once(rx_ring->adapter->netdev, - "xdp: dropped multi-buffer packets. RX packets must be < %lu\n", - ENA_XDP_MAX_MTU); + "xdp: dropped unsupported multi-buffer packets\n"); ena_increase_stat(&rx_ring->rx_stats.xdp_drop, 1, &rx_ring->syncp); return ENA_XDP_DROP; } @@ -1428,6 +1442,7 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, #ifdef ENA_XDP_SUPPORT int xdp_verdict; #endif /* ENA_XDP_SUPPORT */ + u8 pkt_offset; int rc = 0; int i; @@ -1458,7 +1473,8 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, /* First descriptor might have an offset set by the device */ rx_info = &rx_ring->rx_buffer_info[rx_ring->ena_bufs[0].req_id]; - rx_info->buf_offset += ena_rx_ctx.pkt_offset; + pkt_offset = ena_rx_ctx.pkt_offset; + rx_info->buf_offset += pkt_offset; netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, "rx_poll: q %d got packet from ena. 
descs #: %d l3 proto %d l4 proto %d hash: %x\n", @@ -1466,7 +1482,7 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, ena_rx_ctx.l4_proto, ena_rx_ctx.hash); dma_sync_single_for_cpu(rx_ring->dev, - dma_unmap_addr(&rx_info->ena_buf, paddr) + ena_rx_ctx.pkt_offset, + dma_unmap_addr(&rx_info->ena_buf, paddr) + pkt_offset, rx_ring->ena_bufs[0].len, DMA_FROM_DEVICE); @@ -1523,7 +1539,8 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, skb_record_rx_queue(skb, rx_ring->qid); - if (rx_ring->ena_bufs[0].len <= rx_ring->rx_copybreak) + if ((rx_ring->ena_bufs[0].len <= rx_ring->rx_copybreak) && + likely(ena_rx_ctx.descs == 1)) rx_copybreak_pkt++; total_len += skb->len; @@ -1569,11 +1586,15 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, return work_done; error: +#ifdef ENA_XDP_SUPPORT + if (xdp_flags & ENA_XDP_REDIRECT) + xdp_do_flush(); + +#endif adapter = netdev_priv(rx_ring->netdev); if (rc == -ENOSPC) { - ena_increase_stat(&rx_ring->rx_stats.bad_desc_num, 1, - &rx_ring->syncp); + ena_increase_stat(&rx_ring->rx_stats.bad_desc_num, 1, &rx_ring->syncp); ena_reset_device(adapter, ENA_REGS_RESET_TOO_MANY_RX_DESCS); } else if (rc == -EFAULT) { ena_reset_device(adapter, ENA_REGS_RESET_RX_DESCRIPTOR_MALFORMED); @@ -1661,35 +1682,26 @@ void ena_unmask_interrupt(struct ena_ring *tx_ring, ena_com_unmask_intr(tx_ring->ena_com_io_cq, &intr_reg); } -void ena_update_ring_numa_node(struct ena_ring *tx_ring, - struct ena_ring *rx_ring) +void ena_update_ring_numa_node(struct ena_ring *rx_ring) { int cpu = get_cpu(); int numa_node; - /* Check only one ring since the 2 rings are running on the same cpu */ - if (likely(tx_ring->cpu == cpu)) + if (likely(rx_ring->cpu == cpu)) goto out; - tx_ring->cpu = cpu; - if (rx_ring) - rx_ring->cpu = cpu; + rx_ring->cpu = cpu; numa_node = cpu_to_node(cpu); - if (likely(tx_ring->numa_node == numa_node)) + if (likely(rx_ring->numa_node == numa_node)) goto out; put_cpu(); if (numa_node != NUMA_NO_NODE) { - ena_com_update_numa_node(tx_ring->ena_com_io_cq, numa_node); - tx_ring->numa_node = numa_node; - if (rx_ring) { - rx_ring->numa_node = numa_node; - ena_com_update_numa_node(rx_ring->ena_com_io_cq, - numa_node); - } + ena_com_update_numa_node(rx_ring->ena_com_io_cq, numa_node); + rx_ring->numa_node = numa_node; } return; @@ -1697,7 +1709,6 @@ void ena_update_ring_numa_node(struct ena_ring *tx_ring, put_cpu(); } - static int ena_io_poll(struct napi_struct *napi, int budget) { struct ena_napi *ena_napi = container_of(napi, struct ena_napi, napi); @@ -1759,7 +1770,7 @@ static int ena_io_poll(struct napi_struct *napi, int budget) if (ena_com_get_adaptive_moderation_enabled(rx_ring->ena_dev)) ena_adjust_adaptive_rx_intr_moderation(ena_napi); - ena_update_ring_numa_node(tx_ring, rx_ring); + ena_update_ring_numa_node(rx_ring); ena_unmask_interrupt(tx_ring, rx_ring); } @@ -1903,17 +1914,20 @@ static void ena_setup_mgmnt_intr(struct ena_adapter *adapter) static void ena_setup_io_intr(struct ena_adapter *adapter) { + const struct cpumask *affinity = cpu_online_mask; + int irq_idx, i, cpu, io_queue_count, node; struct net_device *netdev; - int irq_idx, i, cpu; - int io_queue_count; netdev = adapter->netdev; io_queue_count = adapter->num_io_queues + adapter->xdp_num_queues; + node = dev_to_node(adapter->ena_dev->dmadev); + + if (node != NUMA_NO_NODE) + affinity = cpumask_of_node(node); for (i = 0; i < io_queue_count; i++) { irq_idx = ENA_IO_IRQ_IDX(i); - cpu = i % num_online_cpus(); - + cpu = 
cpumask_local_spread(i, node); snprintf(adapter->irq_tbl[irq_idx].name, ENA_IRQNAME_SIZE, "%s-Tx-Rx-%d", netdev->name, i); adapter->irq_tbl[irq_idx].handler = ena_intr_msix_io; @@ -1926,8 +1940,7 @@ static void ena_setup_io_intr(struct ena_adapter *adapter) #endif adapter->irq_tbl[irq_idx].cpu = cpu; - cpumask_set_cpu(cpu, - &adapter->irq_tbl[irq_idx].affinity_hint_mask); + cpumask_copy(&adapter->irq_tbl[irq_idx].affinity_hint_mask, affinity); } } @@ -1950,6 +1963,8 @@ static int ena_request_mgmnt_irq(struct ena_adapter *adapter) "Set affinity hint of mgmnt irq.to 0x%lx (irq vector: %d)\n", irq->affinity_hint_mask.bits[0], irq->vector); + irq_update_affinity_hint(irq->vector, &irq->affinity_hint_mask); + return rc; } @@ -1980,6 +1995,8 @@ static int ena_request_io_irq(struct ena_adapter *adapter) netif_dbg(adapter, ifup, adapter->netdev, "Set affinity hint of irq. index %d to 0x%lx (irq vector: %d)\n", i, irq->affinity_hint_mask.bits[0], irq->vector); + + irq_update_affinity_hint(irq->vector, &irq->affinity_hint_mask); } return rc; @@ -1999,7 +2016,7 @@ static void ena_free_mgmnt_irq(struct ena_adapter *adapter) irq = &adapter->irq_tbl[ENA_MGMNT_IRQ_IDX]; synchronize_irq(irq->vector); - irq_set_affinity_hint(irq->vector, NULL); + irq_update_affinity_hint(irq->vector, NULL); free_irq(irq->vector, irq->data); } @@ -2018,7 +2035,7 @@ static void ena_free_io_irq(struct ena_adapter *adapter) for (i = ENA_IO_IRQ_FIRST_IDX; i < ENA_MAX_MSIX_VEC(io_queue_count); i++) { irq = &adapter->irq_tbl[i]; - irq_set_affinity_hint(irq->vector, NULL); + irq_update_affinity_hint(irq->vector, NULL); free_irq(irq->vector, irq->data); } } @@ -2077,8 +2094,8 @@ static void ena_del_napi_in_range(struct ena_adapter *adapter, static void ena_init_napi_in_range(struct ena_adapter *adapter, int first_index, int count) { - int i; int (*napi_handler)(struct napi_struct *napi, int budget); + int i; for (i = first_index; i < first_index + count; i++) { struct ena_napi *napi = &adapter->ena_napi[i]; @@ -2091,7 +2108,11 @@ static void ena_init_napi_in_range(struct ena_adapter *adapter, napi_handler = ena_io_poll; #ifdef ENA_XDP_SUPPORT +#ifdef ENA_AF_XDP_SUPPORT if (ENA_IS_XDP_INDEX(adapter, i) || ENA_IS_XSK_RING(rx_ring)) +#else + if (ENA_IS_XDP_INDEX(adapter, i)) +#endif /* ENA_AF_XDP_SUPPORT */ napi_handler = ena_xdp_io_poll; #endif /* ENA_XDP_SUPPORT */ @@ -2168,8 +2189,7 @@ static int ena_rss_configure(struct ena_adapter *adapter) if (!ena_dev->rss.tbl_log_size) { rc = ena_rss_init_default(adapter); if (unlikely(rc && (rc != -EOPNOTSUPP))) { - netif_err(adapter, ifup, adapter->netdev, - "Failed to init RSS rc: %d\n", rc); + netif_err(adapter, ifup, adapter->netdev, "Failed to init RSS rc: %d\n", rc); return rc; } } @@ -2257,7 +2277,6 @@ static int ena_create_io_tx_queue(struct ena_adapter *adapter, int qid) return rc; } - ena_com_update_numa_node(tx_ring->ena_com_io_cq, ctx.numa_node); return rc; } @@ -3068,12 +3087,8 @@ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev) skb_tx_timestamp(skb); #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) -#ifdef HAVE_NETDEV_XMIT_MORE if (netif_xmit_stopped(txq) || !netdev_xmit_more()) -#else - if (netif_xmit_stopped(txq) || !skb->xmit_more) -#endif /* HAVE_NETDEV_XMIT_MORE */ -#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) */ +#endif /* trigger the dma engine. ena_ring_tx_doorbell() * calls a memory barrier inside it. 
*/ @@ -3086,54 +3101,17 @@ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev) tx_info->skb = NULL; error_drop_packet: - dev_kfree_skb(skb); - return NETDEV_TX_OK; -} - -#if defined HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V3 -static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb, - struct net_device *sb_dev) -#elif defined HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V2 -static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb, - struct net_device *sb_dev, - select_queue_fallback_t fallback) -#elif defined HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V1 -static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, - select_queue_fallback_t fallback) -#elif defined HAVE_NDO_SELECT_QUEUE_ACCEL -/* Return subqueue id on this core (one per core). */ -static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) + if (!netdev_xmit_more() && ena_com_used_q_entries(tx_ring->ena_com_io_sq)) #else -static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb) + if (ena_com_used_q_entries(tx_ring->ena_com_io_sq)) #endif -{ - u16 qid; - /* we suspect that this is good for in--kernel network services that - * want to loop incoming skb rx to tx in normal user generated traffic, - * most probably we will not get to this - */ - if (skb_rx_queue_recorded(skb)) { - qid = skb_get_rx_queue(skb); - if (qid >= dev->real_num_tx_queues) - qid %= dev->real_num_tx_queues; - } else { -#if (defined HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V3) - qid = netdev_pick_tx(dev, skb, NULL); -#elif (defined HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V2) - qid = fallback(dev, skb, NULL); -#elif (defined HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V1) - qid = fallback(dev, skb); -#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,9,0) - qid = __netdev_pick_tx(dev, skb); -#else - qid = skb_tx_hash(dev, skb); -#endif /* HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V2 */ - } + ena_ring_tx_doorbell(tx_ring); - return qid; + dev_kfree_skb(skb); + return NETDEV_TX_OK; } + #ifdef HAVE_SET_RX_MODE /* Unicast, Multicast and Promiscuous mode set @@ -3177,13 +3155,18 @@ static void ena_config_host_info(struct ena_com_dev *ena_dev, struct pci_dev *pd host_info->os_type = ENA_ADMIN_OS_LINUX; host_info->kernel_ver = LINUX_VERSION_CODE; ret = strscpy(host_info->kernel_ver_str, utsname()->version, - sizeof(host_info->kernel_ver_str) - 1); + sizeof(host_info->kernel_ver_str)); if (ret < 0) dev_dbg(dev, "kernel version string will be truncated, status = %zd\n", ret); + host_info->os_dist = 0; - strncpy(host_info->os_dist_str, utsname()->release, - sizeof(host_info->os_dist_str) - 1); + ret = strscpy(host_info->os_dist_str, utsname()->release, + sizeof(host_info->os_dist_str)); + if (ret < 0) + dev_dbg(dev, + "OS distribution string will be truncated, status = %zd\n", ret); + host_info->driver_version = (DRV_MODULE_GEN_MAJOR) | (DRV_MODULE_GEN_MINOR << ENA_ADMIN_HOST_INFO_MINOR_SHIFT) | @@ -3241,8 +3224,7 @@ static void ena_config_debug_area(struct ena_adapter *adapter) rc = ena_com_set_host_attributes(adapter->ena_dev); if (unlikely(rc)) { if (rc == -EOPNOTSUPP) - netif_warn(adapter, drv, adapter->netdev, - "Cannot set host attributes\n"); + netif_warn(adapter, drv, adapter->netdev, "Cannot set host attributes\n"); else netif_err(adapter, drv, adapter->netdev, "Cannot set host attributes\n"); @@ -3265,7 +3247,6 @@ static struct rtnl_link_stats64 *ena_get_stats64(struct net_device *netdev, { struct ena_adapter 
*adapter = netdev_priv(netdev); struct ena_ring *rx_ring, *tx_ring; - u64 xdp_rx_drops = 0; unsigned int start; u64 rx_overruns; u64 rx_drops; @@ -3303,7 +3284,6 @@ static struct rtnl_link_stats64 *ena_get_stats64(struct net_device *netdev, start = ena_u64_stats_fetch_begin(&rx_ring->syncp); packets = rx_ring->rx_stats.cnt; bytes = rx_ring->rx_stats.bytes; - xdp_rx_drops += ena_ring_xdp_drops_cnt(rx_ring); } while (ena_u64_stats_fetch_retry(&rx_ring->syncp, start)); stats->rx_packets += packets; @@ -3317,7 +3297,7 @@ static struct rtnl_link_stats64 *ena_get_stats64(struct net_device *netdev, rx_overruns = adapter->dev_stats.rx_overruns; } while (ena_u64_stats_fetch_retry(&adapter->syncp, start)); - stats->rx_dropped = rx_drops + xdp_rx_drops; + stats->rx_dropped = rx_drops; stats->tx_dropped = tx_drops; stats->multicast = 0; @@ -3428,7 +3408,6 @@ static const struct net_device_ops ena_netdev_ops = { .ndo_open = ena_open, .ndo_stop = ena_close, .ndo_start_xmit = ena_start_xmit, - .ndo_select_queue = ena_select_queue, #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)) .ndo_get_stats64 = ena_get_stats64, #else @@ -3552,13 +3531,13 @@ static int ena_calc_io_queue_size(struct ena_adapter *adapter, if (max_tx_queue_size < ENA_MIN_RING_SIZE) { netdev_err(adapter->netdev, "Device max TX queue size: %d < minimum: %d\n", max_tx_queue_size, ENA_MIN_RING_SIZE); - return -EFAULT; + return -EINVAL; } if (max_rx_queue_size < ENA_MIN_RING_SIZE) { netdev_err(adapter->netdev, "Device max RX queue size: %d < minimum: %d\n", max_rx_queue_size, ENA_MIN_RING_SIZE); - return -EFAULT; + return -EINVAL; } tx_queue_size = clamp_val(tx_queue_size, ENA_MIN_RING_SIZE, @@ -3795,7 +3774,8 @@ static int ena_device_init(struct ena_adapter *adapter, struct pci_dev *pdev, BIT(ENA_ADMIN_WARNING) | BIT(ENA_ADMIN_NOTIFICATION) | BIT(ENA_ADMIN_KEEP_ALIVE) | - BIT(ENA_ADMIN_CONF_NOTIFICATIONS); + BIT(ENA_ADMIN_CONF_NOTIFICATIONS) | + BIT(ENA_ADMIN_DEVICE_REQUEST_RESET); aenq_groups &= get_feat_ctx->aenq.supported_groups; @@ -4068,8 +4048,7 @@ enum ena_regs_reset_reason_types check_cdesc_in_tx_cq(struct ena_adapter *adapte /* TX CQ is empty */ if (rc == -EAGAIN) { - netif_err(adapter, tx_err, netdev, - "No completion descriptors found in CQ %d", + netif_err(adapter, tx_err, netdev, "No completion descriptors found in CQ %d", tx_ring->qid); return ENA_REGS_RESET_MISS_TX_CMPL; @@ -4262,12 +4241,14 @@ static void check_for_empty_rx_ring(struct ena_adapter *adapter) for (i = 0; i < adapter->num_io_queues; i++) { rx_ring = &adapter->rx_ring[i]; +#ifdef ENA_AF_XDP_SUPPORT /* If using UMEM, app might not provide RX buffers and the ring * can be empty */ if (ENA_IS_XSK_RING(rx_ring)) continue; +#endif /* ENA_AF_XDP_SUPPORT */ refill_required = ena_com_free_q_entries(rx_ring->ena_com_io_sq); if (unlikely(refill_required == (rx_ring->ring_size - 1))) { @@ -4292,6 +4273,7 @@ static void check_for_empty_rx_ring(struct ena_adapter *adapter) /* Check for keep alive expiration */ static void check_for_missing_keep_alive(struct ena_adapter *adapter) { + enum ena_regs_reset_reason_types reset_reason = ENA_REGS_RESET_KEEP_ALIVE_TO; unsigned long keep_alive_expired; if (!adapter->wd_state) @@ -4303,9 +4285,15 @@ static void check_for_missing_keep_alive(struct ena_adapter *adapter) keep_alive_expired = adapter->last_keep_alive_jiffies + adapter->keep_alive_timeout; if (unlikely(time_is_before_jiffies(keep_alive_expired))) { + unsigned long jiffies_since_last_keep_alive = + jiffies - adapter->last_keep_alive_jiffies; netif_err(adapter, drv, 
adapter->netdev, - "Keep alive watchdog timeout.\n"); - ena_reset_device(adapter, ENA_REGS_RESET_KEEP_ALIVE_TO); + "Keep alive watchdog timeout, %u msecs since last keep alive.\n", + jiffies_to_msecs(jiffies_since_last_keep_alive)); + if (ena_com_aenq_has_keep_alive(adapter->ena_dev)) + reset_reason = ENA_REGS_RESET_MISSING_ADMIN_INTERRUPT; + + ena_reset_device(adapter, reset_reason); } } @@ -4557,8 +4545,8 @@ static int ena_rss_init_default(struct ena_adapter *adapter) } } - rc = ena_com_fill_hash_function(ena_dev, ENA_ADMIN_TOEPLITZ, NULL, - ENA_HASH_KEY_SIZE, 0xFFFFFFFF); + rc = ena_com_fill_hash_function(ena_dev, ENA_ADMIN_TOEPLITZ, NULL, ENA_HASH_KEY_SIZE, + 0xFFFFFFFF); if (unlikely(rc && (rc != -EOPNOTSUPP))) { dev_err(dev, "Cannot fill hash function\n"); goto err_fill_indir; @@ -4804,9 +4792,7 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) #ifdef ENA_XDP_NETLINK_ADVERTISEMENT if (ena_xdp_legal_queue_count(adapter, adapter->num_io_queues)) - netdev->xdp_features = NETDEV_XDP_ACT_BASIC | - NETDEV_XDP_ACT_REDIRECT; - + netdev->xdp_features = ENA_XDP_FEATURES; #endif memcpy(adapter->netdev->perm_addr, adapter->mac_addr, netdev->addr_len); @@ -4947,7 +4933,6 @@ static void __ena_shutoff(struct pci_dev *pdev, bool shutdown) * ena_remove is called by the PCI subsystem to alert the driver * that it should release a PCI device. */ - static void ena_remove(struct pci_dev *pdev) { __ena_shutoff(pdev, false); @@ -4959,7 +4944,6 @@ static void ena_remove(struct pci_dev *pdev) * ena_shutdown is called by the PCI subsystem to alert the driver that * a shutdown/reboot (or kexec) is happening and device must be disabled. */ - static void ena_shutdown(struct pci_dev *pdev) { __ena_shutoff(pdev, true); @@ -5187,6 +5171,17 @@ static void ena_conf_notification(void *adapter_data, } } +static void ena_admin_device_request_reset(void *data, + struct ena_admin_aenq_entry *aenq_e) +{ + struct ena_adapter *adapter = (struct ena_adapter *)data; + + netdev_warn(adapter->netdev, + "The device has detected an unhealthy state, reset is requested\n"); + + ena_reset_device(adapter, ENA_REGS_RESET_DEVICE_REQUEST); +} + /* This handler will called for unknown event group or unimplemented handlers*/ static void unimplemented_aenq_handler(void *data, struct ena_admin_aenq_entry *aenq_e) @@ -5203,6 +5198,7 @@ static struct ena_aenq_handlers aenq_handlers = { [ENA_ADMIN_NOTIFICATION] = ena_notification, [ENA_ADMIN_KEEP_ALIVE] = ena_keep_alive_wd, [ENA_ADMIN_CONF_NOTIFICATIONS] = ena_conf_notification, + [ENA_ADMIN_DEVICE_REQUEST_RESET] = ena_admin_device_request_reset, [ENA_ADMIN_REFRESH_CAPABILITIES] = ena_refresh_fw_capabilites, }, .unimplemented_handler = unimplemented_aenq_handler diff --git a/drivers/amazon/net/ena/ena_netdev.h b/drivers/amazon/net/ena/ena_netdev.h index 8beb6970b20e4..269ced8d531e8 100644 --- a/drivers/amazon/net/ena/ena_netdev.h +++ b/drivers/amazon/net/ena/ena_netdev.h @@ -31,8 +31,8 @@ #include "ena_eth_com.h" #define DRV_MODULE_GEN_MAJOR 2 -#define DRV_MODULE_GEN_MINOR 11 -#define DRV_MODULE_GEN_SUBMINOR 1 +#define DRV_MODULE_GEN_MINOR 12 +#define DRV_MODULE_GEN_SUBMINOR 0 #define DRV_MODULE_NAME "ena" #ifndef DRV_MODULE_GENERATION @@ -169,7 +169,7 @@ struct ena_tx_buffer { /* num of buffers used by this skb */ u32 num_of_bufs; - /* Total size of all buffers */ + /* Total size of all buffers in bytes */ u32 total_tx_size; /* Indicate if bufs[0] map the linear data of the skb. 
*/ @@ -192,16 +192,19 @@ struct ena_tx_buffer { struct ena_rx_buffer { struct sk_buff *skb; +#ifdef ENA_AF_XDP_SUPPORT union { struct { struct page *page; dma_addr_t dma_addr; }; -#ifdef ENA_XDP_SUPPORT /* XSK pool buffer */ struct xdp_buff *xdp; -#endif }; +#else + struct page *page; + dma_addr_t dma_addr; +#endif /* ENA_AF_XDP_SUPPORT */ u32 page_offset; u32 buf_offset; struct ena_com_buf ena_buf; @@ -380,6 +383,7 @@ struct ena_stats_dev { u64 os_netdev_wd; u64 missing_admin_interrupt; u64 admin_to; + u64 device_request_reset; }; enum ena_flags_t { @@ -524,6 +528,7 @@ static const struct ena_reset_stats_offset resets_to_stats_offset_map[ENA_REGS_R ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_RX_DESCRIPTOR_MALFORMED, rx_desc_malformed), ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_TX_DESCRIPTOR_MALFORMED, tx_desc_malformed), ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_MISSING_ADMIN_INTERRUPT, missing_admin_interrupt), + ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_DEVICE_REQUEST, device_request_reset), }; void ena_set_ethtool_ops(struct net_device *netdev); @@ -546,7 +551,7 @@ int ena_set_rx_copybreak(struct ena_adapter *adapter, u32 rx_copybreak); /* Increase a stat by cnt while holding syncp seqlock on 32bit machines */ static inline void ena_increase_stat(u64 *statp, u64 cnt, - struct u64_stats_sync *syncp) + struct u64_stats_sync *syncp) { u64_stats_update_begin(syncp); (*statp) += cnt; @@ -677,13 +682,12 @@ int ena_create_io_tx_queues_in_range(struct ena_adapter *adapter, int ena_setup_tx_resources_in_range(struct ena_adapter *adapter, int first_index, int count); void ena_free_all_io_tx_resources_in_range(struct ena_adapter *adapter, - int first_index, int count); + int first_index, int count); void ena_free_all_io_tx_resources(struct ena_adapter *adapter); void ena_down(struct ena_adapter *adapter); int ena_up(struct ena_adapter *adapter); void ena_unmask_interrupt(struct ena_ring *tx_ring, struct ena_ring *rx_ring); -void ena_update_ring_numa_node(struct ena_ring *tx_ring, - struct ena_ring *rx_ring); +void ena_update_ring_numa_node(struct ena_ring *rx_ring); void ena_rx_checksum(struct ena_ring *rx_ring, struct ena_com_rx_ctx *ena_rx_ctx, struct sk_buff *skb); diff --git a/drivers/amazon/net/ena/ena_regs_defs.h b/drivers/amazon/net/ena/ena_regs_defs.h index af1e52cd7819c..c0f6b8c14e66a 100644 --- a/drivers/amazon/net/ena/ena_regs_defs.h +++ b/drivers/amazon/net/ena/ena_regs_defs.h @@ -25,6 +25,7 @@ enum ena_regs_reset_reason_types { ENA_REGS_RESET_RX_DESCRIPTOR_MALFORMED = 16, ENA_REGS_RESET_TX_DESCRIPTOR_MALFORMED = 17, ENA_REGS_RESET_MISSING_ADMIN_INTERRUPT = 18, + ENA_REGS_RESET_DEVICE_REQUEST = 19, ENA_REGS_RESET_LAST, }; diff --git a/drivers/amazon/net/ena/ena_xdp.c b/drivers/amazon/net/ena/ena_xdp.c index f4a443401a37c..204389ffe5b24 100644 --- a/drivers/amazon/net/ena/ena_xdp.c +++ b/drivers/amazon/net/ena/ena_xdp.c @@ -11,7 +11,11 @@ static int validate_xdp_req_id(struct ena_ring *tx_ring, u16 req_id) struct ena_tx_buffer *tx_info; tx_info = &tx_ring->tx_buffer_info[req_id]; +#ifdef ENA_AF_XDP_SUPPORT if (likely(tx_info->total_tx_size)) +#else + if (likely(tx_info->xdpf)) +#endif return 0; return handle_invalid_req_id(tx_ring, req_id, tx_info, true); @@ -113,7 +117,7 @@ int ena_xdp_xmit_frame(struct ena_ring *tx_ring, } int ena_xdp_xmit(struct net_device *dev, int n, - struct xdp_frame **frames, u32 flags) + struct xdp_frame **frames, u32 flags) { struct ena_adapter *adapter = netdev_priv(dev); struct ena_ring *tx_ring; @@ -169,24 +173,22 @@ static void ena_init_all_xdp_queues(struct ena_adapter 
*adapter) int ena_setup_and_create_all_xdp_queues(struct ena_adapter *adapter) { + u32 xdp_first_ring = adapter->xdp_first_ring; + u32 xdp_num_queues = adapter->xdp_num_queues; int rc = 0; - rc = ena_setup_tx_resources_in_range(adapter, adapter->xdp_first_ring, - adapter->xdp_num_queues); + rc = ena_setup_tx_resources_in_range(adapter, xdp_first_ring, xdp_num_queues); if (rc) goto setup_err; - rc = ena_create_io_tx_queues_in_range(adapter, - adapter->xdp_first_ring, - adapter->xdp_num_queues); + rc = ena_create_io_tx_queues_in_range(adapter, xdp_first_ring, xdp_num_queues); if (rc) goto create_err; return 0; create_err: - ena_free_all_io_tx_resources_in_range(adapter, adapter->xdp_first_ring, - adapter->xdp_num_queues); + ena_free_all_io_tx_resources_in_range(adapter, xdp_first_ring, xdp_num_queues); setup_err: return rc; } @@ -199,11 +201,15 @@ int ena_xdp_register_rxq_info(struct ena_ring *rx_ring) int rc; #ifdef AF_XDP_BUSY_POLL_SUPPORTED +#ifdef ENA_AF_XDP_SUPPORT rc = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, rx_ring->qid, rx_ring->napi->napi_id); +#else + rc = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, rx_ring->qid, 0); +#endif /* ENA_AF_XDP_SUPPORT */ #else rc = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, rx_ring->qid); -#endif +#endif /* AF_XDP_BUSY_POLL_SUPPORTED */ netif_dbg(rx_ring->adapter, ifup, rx_ring->netdev, "Registering RX info for queue %d with napi id %d\n", @@ -215,13 +221,16 @@ int ena_xdp_register_rxq_info(struct ena_ring *rx_ring) goto err; } +#ifdef ENA_AF_XDP_SUPPORT if (ENA_IS_XSK_RING(rx_ring)) { rc = xdp_rxq_info_reg_mem_model(&rx_ring->xdp_rxq, MEM_TYPE_XSK_BUFF_POOL, NULL); xsk_pool_set_rxq_info(rx_ring->xsk_pool, &rx_ring->xdp_rxq); } else { - rc = xdp_rxq_info_reg_mem_model(&rx_ring->xdp_rxq, MEM_TYPE_PAGE_SHARED, - NULL); + rc = xdp_rxq_info_reg_mem_model(&rx_ring->xdp_rxq, MEM_TYPE_PAGE_SHARED, NULL); } +#else + rc = xdp_rxq_info_reg_mem_model(&rx_ring->xdp_rxq, MEM_TYPE_PAGE_SHARED, NULL); +#endif /* ENA_AF_XDP_SUPPORT */ if (rc) { netif_err(rx_ring->adapter, ifup, rx_ring->netdev, @@ -253,9 +262,8 @@ void ena_xdp_free_tx_bufs_zc(struct ena_ring *tx_ring) xsk_tx_completed(xsk_pool, xsk_frames); } -void ena_xdp_free_rx_bufs_zc(struct ena_adapter *adapter, u32 qid) +void ena_xdp_free_rx_bufs_zc(struct ena_ring *rx_ring) { - struct ena_ring *rx_ring = &adapter->rx_ring[qid]; int i = 0; for (i = 0; i < rx_ring->ring_size; i++) { @@ -279,8 +287,8 @@ void ena_xdp_unregister_rxq_info(struct ena_ring *rx_ring) } void ena_xdp_exchange_program_rx_in_range(struct ena_adapter *adapter, - struct bpf_prog *prog, - int first, int count) + struct bpf_prog *prog, + int first, int count) { struct bpf_prog *old_bpf_prog; struct ena_ring *rx_ring; @@ -364,8 +372,7 @@ static int ena_xdp_set(struct net_device *netdev, struct netdev_bpf *bpf) xdp_features_set_redirect_target(netdev, false); } else if (old_bpf_prog) { xdp_features_clear_redirect_target(netdev); - netif_dbg(adapter, drv, adapter->netdev, - "Removing XDP program\n"); + netif_dbg(adapter, drv, adapter->netdev, "Removing XDP program\n"); rc = ena_destroy_and_free_all_xdp_queues(adapter); if (rc) @@ -549,13 +556,13 @@ int ena_xdp_xsk_wakeup(struct net_device *netdev, u32 qid, u32 flags) } #endif /* ENA_AF_XDP_SUPPORT */ -static bool ena_clean_xdp_irq(struct ena_ring *tx_ring, u32 budget) +static int ena_clean_xdp_irq(struct ena_ring *tx_ring, u32 budget) { - +#ifdef ENA_AF_XDP_SUPPORT bool is_zc_q = ENA_IS_XSK_RING(tx_ring); +#endif /* ENA_AF_XDP_SUPPORT */ u32 total_done = 0; u16 
next_to_clean; - bool needs_wakeup; int tx_pkts = 0; u16 req_id; int rc; @@ -570,12 +577,10 @@ static bool ena_clean_xdp_irq(struct ena_ring *tx_ring, u32 budget) &req_id); if (rc) { if (unlikely(rc == -EINVAL)) - handle_invalid_req_id(tx_ring, req_id, NULL, - true); + handle_invalid_req_id(tx_ring, req_id, NULL, true); else if (unlikely(rc == -EFAULT)) { - ena_reset_device( - tx_ring->adapter, - ENA_REGS_RESET_TX_DESCRIPTOR_MALFORMED); + ena_reset_device(tx_ring->adapter, + ENA_REGS_RESET_TX_DESCRIPTOR_MALFORMED); } break; } @@ -588,25 +593,31 @@ static bool ena_clean_xdp_irq(struct ena_ring *tx_ring, u32 budget) tx_info = &tx_ring->tx_buffer_info[req_id]; tx_info->tx_sent_jiffies = 0; +#ifdef ENA_AF_XDP_SUPPORT - if (!is_zc_q) { - xdpf = tx_info->xdpf; - tx_info->xdpf = NULL; - ena_unmap_tx_buff(tx_ring, tx_info); - xdp_return_frame(xdpf); - } + if (is_zc_q) + goto log_xdp_packet; +#endif /* ENA_AF_XDP_SUPPORT */ - netif_dbg(tx_ring->adapter, tx_done, tx_ring->netdev, - "tx_poll: q %d pkt #%d req_id %d\n", tx_ring->qid, tx_pkts, req_id); + xdpf = tx_info->xdpf; + tx_info->xdpf = NULL; + ena_unmap_tx_buff(tx_ring, tx_info); + xdp_return_frame(xdpf); +#ifdef ENA_AF_XDP_SUPPORT +log_xdp_packet: +#endif /* ENA_AF_XDP_SUPPORT */ tx_pkts++; total_done += tx_info->tx_descs; - +#ifdef ENA_AF_XDP_SUPPORT tx_info->total_tx_size = 0; - +#endif /* ENA_AF_XDP_SUPPORT */ tx_ring->free_ids[next_to_clean] = req_id; next_to_clean = ENA_TX_RING_IDX_NEXT(next_to_clean, tx_ring->ring_size); + + netif_dbg(tx_ring->adapter, tx_done, tx_ring->netdev, + "tx_poll: q %d pkt #%d req_id %d\n", tx_ring->qid, tx_pkts, req_id); } tx_ring->next_to_clean = next_to_clean; @@ -616,7 +627,6 @@ static bool ena_clean_xdp_irq(struct ena_ring *tx_ring, u32 budget) "tx_poll: q %d done. total pkts: %d\n", tx_ring->qid, tx_pkts); - needs_wakeup = tx_pkts < budget; #ifdef ENA_AF_XDP_SUPPORT if (is_zc_q) { struct xsk_buff_pool *xsk_pool = tx_ring->xsk_pool; @@ -625,15 +635,16 @@ static bool ena_clean_xdp_irq(struct ena_ring *tx_ring, u32 budget) xsk_tx_completed(xsk_pool, tx_pkts); if (xsk_uses_need_wakeup(xsk_pool)) { + bool needs_wakeup = tx_pkts < budget; if (needs_wakeup) xsk_set_tx_need_wakeup(xsk_pool); else xsk_clear_tx_need_wakeup(xsk_pool); } } -#endif /* ENA_AF_XDP_SUPPORT */ - return needs_wakeup; +#endif /* ENA_AF_XDP_SUPPORT */ + return tx_pkts; } #ifdef ENA_AF_XDP_SUPPORT @@ -809,8 +820,7 @@ static int ena_xdp_clean_rx_irq_zc(struct ena_ring *rx_ring, /* XDP multi-buffer packets not supported */ if (unlikely(ena_rx_ctx.descs > 1)) { netdev_err_once(rx_ring->adapter->netdev, - "xdp: dropped multi-buffer packets. 
RX packets must be < %lu\n", - ENA_XDP_MAX_MTU); + "xdp: dropped unsupported multi-buffer packets\n"); ena_increase_stat(&rx_ring->rx_stats.xdp_drop, 1, &rx_ring->syncp); xdp_verdict = ENA_XDP_DROP; goto skip_xdp_prog; @@ -894,17 +904,13 @@ static int ena_xdp_clean_rx_irq_zc(struct ena_ring *rx_ring, struct ena_adapter *adapter = netdev_priv(rx_ring->netdev); if (rc == -ENOSPC) { - ena_increase_stat(&rx_ring->rx_stats.bad_desc_num, 1, - &rx_ring->syncp); - ena_reset_device(adapter, - ENA_REGS_RESET_TOO_MANY_RX_DESCS); + ena_increase_stat(&rx_ring->rx_stats.bad_desc_num, 1, &rx_ring->syncp); + ena_reset_device(adapter, ENA_REGS_RESET_TOO_MANY_RX_DESCS); } else if (rc == -EIO) { - ena_increase_stat(&rx_ring->rx_stats.bad_req_id, 1, - &rx_ring->syncp); + ena_increase_stat(&rx_ring->rx_stats.bad_req_id, 1, &rx_ring->syncp); ena_reset_device(adapter, ENA_REGS_RESET_INV_RX_REQ_ID); } else if (rc == -EFAULT) { - ena_reset_device(adapter, - ENA_REGS_RESET_RX_DESCRIPTOR_MALFORMED); + ena_reset_device(adapter, ENA_REGS_RESET_RX_DESCRIPTOR_MALFORMED); } return 0; @@ -920,12 +926,14 @@ static int ena_xdp_clean_rx_irq_zc(struct ena_ring *rx_ring, int ena_xdp_io_poll(struct napi_struct *napi, int budget) { struct ena_napi *ena_napi = container_of(napi, struct ena_napi, napi); - struct ena_ring *rx_ring, *tx_ring; + struct ena_ring *tx_ring; +#ifdef ENA_AF_XDP_SUPPORT + struct ena_ring *rx_ring; bool needs_wakeup = true; - u32 rx_work_done = 0; +#endif /* ENA_AF_XDP_SUPPORT */ + u32 work_done; int ret; - rx_ring = ena_napi->rx_ring; tx_ring = ena_napi->tx_ring; if (!test_bit(ENA_FLAG_DEV_UP, &tx_ring->adapter->flags) || @@ -934,16 +942,21 @@ int ena_xdp_io_poll(struct napi_struct *napi, int budget) return 0; } - needs_wakeup &= ena_clean_xdp_irq(tx_ring, budget); + work_done = ena_clean_xdp_irq(tx_ring, budget); #ifdef ENA_AF_XDP_SUPPORT + /* Take XDP work into account */ + needs_wakeup &= work_done < budget; + if (!ENA_IS_XSK_RING(tx_ring)) goto polling_done; + rx_ring = ena_napi->rx_ring; + needs_wakeup &= ena_xdp_xmit_irq_zc(tx_ring, napi, budget); - rx_work_done = ena_xdp_clean_rx_irq_zc(rx_ring, napi, budget); - needs_wakeup &= rx_work_done < budget; + work_done = ena_xdp_clean_rx_irq_zc(rx_ring, napi, budget); + needs_wakeup &= work_done < budget; polling_done: #endif /* ENA_AF_XDP_SUPPORT */ @@ -953,18 +966,32 @@ int ena_xdp_io_poll(struct napi_struct *napi, int budget) if (unlikely(!test_bit(ENA_FLAG_DEV_UP, &tx_ring->adapter->flags))) { napi_complete_done(napi, 0); ret = 0; +#ifdef ENA_AF_XDP_SUPPORT } else if (needs_wakeup) { +#else + } else if (budget > work_done) { +#endif /* ENA_AF_XDP_SUPPORT */ ena_increase_stat(&tx_ring->tx_stats.napi_comp, 1, &tx_ring->syncp); - if (napi_complete_done(napi, rx_work_done) && +#ifdef ENA_AF_XDP_SUPPORT + if (napi_complete_done(napi, work_done) && READ_ONCE(ena_napi->interrupts_masked)) { smp_rmb(); /* make sure interrupts_masked is read */ WRITE_ONCE(ena_napi->interrupts_masked, false); ena_unmask_interrupt(tx_ring, NULL); + /* Checking the tx_ring since for XDP channels + * napi->rx_ring is NULL and for AF_XDP both are + * xsk rings + */ + if (ENA_IS_XSK_RING(tx_ring)) + ena_update_ring_numa_node(rx_ring); } +#else + if (napi_complete_done(napi, work_done)) + ena_unmask_interrupt(tx_ring, NULL); +#endif /* ENA_AF_XDP_SUPPORT */ - ena_update_ring_numa_node(tx_ring, NULL); - ret = rx_work_done; + ret = work_done; } else { ret = budget; } diff --git a/drivers/amazon/net/ena/ena_xdp.h b/drivers/amazon/net/ena/ena_xdp.h index 5729569058a7c..b468c7c58c8f1 
100644 --- a/drivers/amazon/net/ena/ena_xdp.h +++ b/drivers/amazon/net/ena/ena_xdp.h @@ -11,10 +11,9 @@ #include #ifdef ENA_AF_XDP_SUPPORT #include -#endif /* ENA_AF_XDP_SUPPORT */ -#ifdef ENA_AF_XDP_SUPPORT #define ENA_IS_XSK_RING(ring) (!!(ring)->xsk_pool) + #endif /* ENA_AF_XDP_SUPPORT */ /* The max MTU size is configured to be the ethernet frame size without @@ -41,6 +40,9 @@ enum ENA_XDP_ACTIONS { ENA_XDP_DROP = BIT(2) }; +#define ENA_XDP_FEATURES (NETDEV_XDP_ACT_BASIC | \ + NETDEV_XDP_ACT_REDIRECT) + #define ENA_XDP_FORWARDED (ENA_XDP_TX | ENA_XDP_REDIRECT) int ena_setup_and_create_all_xdp_queues(struct ena_adapter *adapter); @@ -58,9 +60,9 @@ int ena_xdp_register_rxq_info(struct ena_ring *rx_ring); void ena_xdp_unregister_rxq_info(struct ena_ring *rx_ring); #ifdef ENA_AF_XDP_SUPPORT void ena_xdp_free_tx_bufs_zc(struct ena_ring *tx_ring); -void ena_xdp_free_rx_bufs_zc(struct ena_adapter *adapter, u32 qid); +void ena_xdp_free_rx_bufs_zc(struct ena_ring *rx_ring); int ena_xdp_xsk_wakeup(struct net_device *netdev, u32 qid, u32 flags); -#endif +#endif /* ENA_AF_XDP_SUPPORT */ enum ena_xdp_errors_t { ENA_XDP_ALLOWED = 0, @@ -96,11 +98,6 @@ static inline enum ena_xdp_errors_t ena_xdp_allowed(struct ena_adapter *adapter) return rc; } -static inline u64 ena_ring_xdp_drops_cnt(struct ena_ring *rx_ring) -{ - return rx_ring->rx_stats.xdp_drop; -} - #ifdef ENA_AF_XDP_SUPPORT static inline bool ena_is_zc_q_exist(struct ena_adapter *adapter) { @@ -196,11 +193,6 @@ static inline bool ena_xdp_present_ring(struct ena_ring *ring) return false; } -static inline u64 ena_ring_xdp_drops_cnt(struct ena_ring *rx_ring) -{ - return 0; -} - static inline int ena_xdp_register_rxq_info(struct ena_ring *rx_ring) { return 0; @@ -219,23 +211,4 @@ static inline bool ena_xdp_present(struct ena_adapter *adapter) return false; } #endif /* ENA_XDP_SUPPORT */ -#ifndef ENA_AF_XDP_SUPPORT /* stabs for AF XDP code */ - -/* Define (or override if it's defined) these enum and function to make sure - * that the code that uses them would always compile. If AF XDP isn't supported, it - * won't be used anyway. 
- */ -#define MEM_TYPE_XSK_BUFF_POOL 0 -#define xsk_pool_set_rxq_info(pool, rxq) - -static inline void ena_xdp_free_tx_bufs_zc(struct ena_ring *tx_ring) {} -static inline void ena_xdp_free_rx_bufs_zc(struct ena_adapter *adapter, u32 qid) {} - -#define ENA_IS_XSK_RING(ring) false - -static inline bool ena_is_zc_q_exist(struct ena_adapter *adapter) -{ - return false; -} -#endif /* ENA_AF_XDP_SUPPORT */ #endif /* ENA_XDP_H */ diff --git a/drivers/amazon/net/ena/kcompat.h b/drivers/amazon/net/ena/kcompat.h index 7b4c2c5041082..32a9cc54dc2b5 100644 --- a/drivers/amazon/net/ena/kcompat.h +++ b/drivers/amazon/net/ena/kcompat.h @@ -370,33 +370,6 @@ static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings) #endif #endif /* >= 3.8.0 */ -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5,2,0)) -#define HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V3 -#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(4,19,0)) || \ - (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,0))) || \ - (SUSE_VERSION && ((SUSE_VERSION == 12 && SUSE_PATCHLEVEL >= 5) || \ - (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 1) || \ - (SUSE_VERSION > 15))) -#define HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V2 -#else - -#if ((LINUX_VERSION_CODE < KERNEL_VERSION(3,11,0) && \ - RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,2))) || \ - (LINUX_VERSION_CODE >= KERNEL_VERSION(3,12,0) && \ - SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0)) || \ - (LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0))) -#define HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V1 -#endif - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,13,0) -#if defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(3,13,0,24) -#define HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V1 -#else -#define HAVE_NDO_SELECT_QUEUE_ACCEL -#endif -#endif /* >= 3.13 */ -#endif /* < 4.19 */ - #if LINUX_VERSION_CODE < KERNEL_VERSION(3,13,0) #if BITS_PER_LONG == 32 && defined(CONFIG_SMP) # define u64_stats_init(syncp) seqcount_init(syncp.seq) @@ -682,10 +655,10 @@ do { \ #endif #endif -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 2, 0)) || \ - (RHEL_RELEASE_CODE && \ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5, 2, 0)) && \ + !(RHEL_RELEASE_CODE && \ RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 2)) -#define HAVE_NETDEV_XMIT_MORE +#define netdev_xmit_more() (skb->xmit_more) #endif #ifndef mmiowb @@ -724,15 +697,18 @@ do { \ /* values are taken from here: https://github.com/iovisor/bcc/blob/master/docs/kernel-versions.md */ -#if defined(CONFIG_BPF) && LINUX_VERSION_CODE >= KERNEL_VERSION(5,0,0) +#if defined(CONFIG_BPF) && (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0) || \ + (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 5)))) #define ENA_XDP_SUPPORT #if (LINUX_VERSION_CODE >= KERNEL_VERSION(5,8,0)) || \ - (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 3) + (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 3) || \ + (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 5))) #define XDP_HAS_FRAME_SZ #define XDP_CONVERT_TO_FRAME_NAME_CHANGED #endif -#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 9, 0) +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 9, 0) && \ + !(defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 5)) #define ENA_XDP_QUERY_IN_DRIVER #endif @@ -859,7 +835,8 @@ static inline int numa_mem_id(void) #define fallthrough do {} while (0) /* fallthrough */ #endif -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0) || \ + (defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE 
>= RHEL_RELEASE_VERSION(8, 5)) #define AF_XDP_BUSY_POLL_SUPPORTED #endif @@ -874,7 +851,8 @@ static inline int numa_mem_id(void) #if defined(ENA_XDP_SUPPORT) && \ (LINUX_VERSION_CODE < KERNEL_VERSION(5, 12, 0) && \ !(defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL == 3) && \ - ENA_KERNEL_VERSION_GTE(5, 3, 18, 150300, 59, 49))) + ENA_KERNEL_VERSION_GTE(5, 3, 18, 150300, 59, 49))) && \ + !(defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 5)) static __always_inline void xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) { @@ -959,8 +937,11 @@ static inline int netif_xmit_stopped(const struct netdev_queue *dev_queue) #define NAPIF_STATE_SCHED BIT(NAPI_STATE_SCHED) #endif -#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 17, 0) && \ - !(defined(IS_UEK) && ENA_KERNEL_VERSION_GTE(5, 15, 0, 100, 96, 32)) +#if ((LINUX_VERSION_CODE < KERNEL_VERSION(5, 17, 0)) && \ + !(defined(IS_UEK) && ENA_KERNEL_VERSION_GTE(5, 15, 0, 100, 96, 32)) && \ + !(defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 7)) && \ + (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(9, 0))) && \ + !(defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 1)))) #define bpf_warn_invalid_xdp_action(netdev, xdp_prog, verdict) \ bpf_warn_invalid_xdp_action(verdict) #endif @@ -1135,4 +1116,46 @@ static inline void ena_dma_unmap_page_attrs(struct device *dev, #define xdp_do_flush xdp_do_flush_map #endif /* ENA_HAVE_XDP_DO_FLUSH */ +#ifndef ENA_HAVE_CPUMASK_LOCAL_SPREAD +static inline unsigned int cpumask_local_spread(unsigned int i, int node) +{ + unsigned int cpu; + + /* Wrap: we always want a cpu. */ + i %= num_online_cpus(); + + if (node == NUMA_NO_NODE) { + for_each_cpu(cpu, cpu_online_mask) + if (i-- == 0) + return cpu; + } else { + /* NUMA first. */ + for_each_cpu_and(cpu, cpumask_of_node(node), cpu_online_mask) + if (i-- == 0) + return cpu; + + for_each_cpu(cpu, cpu_online_mask) { + /* Skip NUMA nodes, done above. */ + if (cpumask_test_cpu(cpu, cpumask_of_node(node))) + continue; + + if (i-- == 0) + return cpu; + } + } + return 0; +} +#endif /* ENA_HAVE_CPUMASK_LOCAL_SPREAD */ + +#ifndef ENA_HAVE_UPDATE_AFFINITY_HINT +static inline int irq_update_affinity_hint(unsigned int irq, const struct cpumask *m) +{ + return 0; +} +#endif /* ENA_HAVE_UPDATE_AFFINITY_HINT */ + +#ifndef ENA_HAVE_ETHTOOL_PUTS +#define ethtool_puts ethtool_sprintf +#endif /* ENA_HAVE_ETHTOOL_PUTS */ + #endif /* _KCOMPAT_H_ */ From 23e21d04e233841265667b22428159c9ddf2ad10 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Fri, 5 Jan 2024 11:14:07 +0100 Subject: [PATCH 165/175] x86/sev: Harden #VC instruction emulation somewhat commit e3ef461af35a8c74f2f4ce6616491ddb355a208f upstream. Compare the opcode bytes at rIP for each #VC exit reason to verify the instruction which raised the #VC exception is actually the right one. 
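As a side note on how those checks read (illustrative only, not part of the patch): each of these instructions is a two-byte x86 opcode beginning with 0x0f, and the handler loads the bytes at rIP as a little-endian u16, so CPUID (0f a2) folds to 0xa20f, RDMSR (0f 32) to 0x320f, WBINVD (0f 09) to 0x90f, and so on. A minimal standalone C sketch of that folding:

#include <assert.h>
#include <stdint.h>

/* Fold two instruction bytes at rIP the way the #VC handler does. */
static uint16_t opcode_at(const uint8_t *rip)
{
	/* little-endian load: rip[0] is the low byte */
	return (uint16_t)rip[0] | ((uint16_t)rip[1] << 8);
}

int main(void)
{
	const uint8_t cpuid_insn[] = { 0x0f, 0xa2 };	/* CPUID */
	const uint8_t rdmsr_insn[] = { 0x0f, 0x32 };	/* RDMSR */

	assert(opcode_at(cpuid_insn) == 0xa20f);
	assert(opcode_at(rdmsr_insn) == 0x320f);
	return 0;
}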
Signed-off-by: Borislav Petkov (AMD) Acked-by: Tom Lendacky Link: https://lore.kernel.org/r/20240105101407.11694-1-bp@alien8.de [ 6.1: Massage due to missing following patch downstream: - 6c3211796326 x86/sev: Add SNP-specific unaccepted memory support ] Signed-off-by: Suraj Jitindar Singh --- arch/x86/boot/compressed/sev.c | 4 ++ arch/x86/kernel/sev-shared.c | 103 ++++++++++++++++++++++++++++++++- arch/x86/kernel/sev.c | 5 +- 3 files changed, 109 insertions(+), 3 deletions(-) diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 3c5d5c97f8f73..802872d9c68c5 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -257,6 +257,10 @@ void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code) if (result != ES_OK) goto finish; + result = vc_check_opcode_bytes(&ctxt, exit_code); + if (result != ES_OK) + goto finish; + switch (exit_code) { case SVM_EXIT_RDTSC: case SVM_EXIT_RDTSCP: diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index 3fe76bf17d95e..ecaa466b50b9c 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -10,8 +10,13 @@ */ #ifndef __BOOT_COMPRESSED -#define error(v) pr_err(v) -#define has_cpuflag(f) boot_cpu_has(f) +#define error(v) pr_err(v) +#define has_cpuflag(f) boot_cpu_has(f) +#define sev_printk(fmt, ...) printk(fmt, ##__VA_ARGS__) +#define sev_printk_rtl(fmt, ...) printk_ratelimited(fmt, ##__VA_ARGS__) +#else +#define sev_printk(fmt, ...) +#define sev_printk_rtl(fmt, ...) #endif /* I/O parameters for CPUID-related helpers */ @@ -567,6 +572,7 @@ void __head do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) { unsigned int subfn = lower_bits(regs->cx, 32); unsigned int fn = lower_bits(regs->ax, 32); + u16 opcode = *(unsigned short *)regs->ip; struct cpuid_leaf leaf; int ret; @@ -574,6 +580,10 @@ void __head do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) if (exit_code != SVM_EXIT_CPUID) goto fail; + /* Is it really a CPUID insn? 
*/ + if (opcode != 0xa20f) + goto fail; + leaf.fn = fn; leaf.subfn = subfn; @@ -1064,3 +1074,92 @@ static void __head setup_cpuid_table(const struct cc_blob_sev_info *cc_info) RIP_REL_REF(cpuid_ext_range_max) = fn->eax; } } + +static enum es_result vc_check_opcode_bytes(struct es_em_ctxt *ctxt, + unsigned long exit_code) +{ + unsigned int opcode = (unsigned int)ctxt->insn.opcode.value; + u8 modrm = ctxt->insn.modrm.value; + + switch (exit_code) { + + case SVM_EXIT_IOIO: + case SVM_EXIT_NPF: + /* handled separately */ + return ES_OK; + + case SVM_EXIT_CPUID: + if (opcode == 0xa20f) + return ES_OK; + break; + + case SVM_EXIT_INVD: + if (opcode == 0x080f) + return ES_OK; + break; + + case SVM_EXIT_MONITOR: + if (opcode == 0x010f && modrm == 0xc8) + return ES_OK; + break; + + case SVM_EXIT_MWAIT: + if (opcode == 0x010f && modrm == 0xc9) + return ES_OK; + break; + + case SVM_EXIT_MSR: + /* RDMSR */ + if (opcode == 0x320f || + /* WRMSR */ + opcode == 0x300f) + return ES_OK; + break; + + case SVM_EXIT_RDPMC: + if (opcode == 0x330f) + return ES_OK; + break; + + case SVM_EXIT_RDTSC: + if (opcode == 0x310f) + return ES_OK; + break; + + case SVM_EXIT_RDTSCP: + if (opcode == 0x010f && modrm == 0xf9) + return ES_OK; + break; + + case SVM_EXIT_READ_DR7: + if (opcode == 0x210f && + X86_MODRM_REG(ctxt->insn.modrm.value) == 7) + return ES_OK; + break; + + case SVM_EXIT_VMMCALL: + if (opcode == 0x010f && modrm == 0xd9) + return ES_OK; + + break; + + case SVM_EXIT_WRITE_DR7: + if (opcode == 0x230f && + X86_MODRM_REG(ctxt->insn.modrm.value) == 7) + return ES_OK; + break; + + case SVM_EXIT_WBINVD: + if (opcode == 0x90f) + return ES_OK; + break; + + default: + break; + } + + sev_printk(KERN_ERR "Wrong/unhandled opcode bytes: 0x%x, exit_code: 0x%lx, rIP: 0x%lx\n", + opcode, exit_code, ctxt->regs->ip); + + return ES_UNSUPPORTED; +} diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index f8a8249ae1177..7fe2b8d6db4fa 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -1779,7 +1779,10 @@ static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt, struct ghcb *ghcb, unsigned long exit_code) { - enum es_result result; + enum es_result result = vc_check_opcode_bytes(ctxt, exit_code); + + if (result != ES_OK) + return result; switch (exit_code) { case SVM_EXIT_READ_DR7: From da58fb7be5d03220328084fc8ae128af02c02ff6 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 18 Mar 2024 09:48:40 +0000 Subject: [PATCH 166/175] firmware/psci: Add definitions for PSCI v1.3 specification (ALPHA) The v1.3 PSCI spec (https://developer.arm.com/documentation/den0022) adds SYSTEM_OFF2, CLEAN_INV_MEMREGION and CLEAN_INV_MEMREGION_ATTRIBUTES functions. Add definitions for them and their parameters, along with the new TIMEOUT, RATE_LIMITED and BUSY error values. 
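As a hedged sketch of how these definitions are meant to compose (the actual wiring lands in the next patch; psci_features() and invoke_psci_fn() are the existing helpers in drivers/firmware/psci/psci.c): a caller first asks PSCI_FEATURES whether SYSTEM_OFF2 is implemented, checks that the OFF hibernate type is advertised in the returned bitmap, and only then issues the call.

/* Sketch only, assuming the PSCI driver context. */
static void example_system_off2(void)
{
	int feat = psci_features(PSCI_1_3_FN64_SYSTEM_OFF2);

	/* Bit 0 of the features result advertises HIBERNATE_TYPE_OFF */
	if (feat < 0 || !(feat & BIT(PSCI_1_3_HIBERNATE_TYPE_OFF)))
		return;

	invoke_psci_fn(PSCI_1_3_FN64_SYSTEM_OFF2,
		       PSCI_1_3_HIBERNATE_TYPE_OFF, 0, 0);
}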
Signed-off-by: David Woodhouse --- include/uapi/linux/psci.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/include/uapi/linux/psci.h b/include/uapi/linux/psci.h index 42a40ad3fb622..082ed689fdaf6 100644 --- a/include/uapi/linux/psci.h +++ b/include/uapi/linux/psci.h @@ -59,6 +59,8 @@ #define PSCI_1_1_FN_SYSTEM_RESET2 PSCI_0_2_FN(18) #define PSCI_1_1_FN_MEM_PROTECT PSCI_0_2_FN(19) #define PSCI_1_1_FN_MEM_PROTECT_CHECK_RANGE PSCI_0_2_FN(20) +#define PSCI_1_3_FN_SYSTEM_OFF2 PSCI_0_2_FN(21) +#define PSCI_1_3_FN_CLEAN_INV_MEMREGION_ATTRIBUTES PSCI_0_2_FN(23) #define PSCI_1_0_FN64_CPU_DEFAULT_SUSPEND PSCI_0_2_FN64(12) #define PSCI_1_0_FN64_NODE_HW_STATE PSCI_0_2_FN64(13) @@ -68,6 +70,8 @@ #define PSCI_1_1_FN64_SYSTEM_RESET2 PSCI_0_2_FN64(18) #define PSCI_1_1_FN64_MEM_PROTECT_CHECK_RANGE PSCI_0_2_FN64(20) +#define PSCI_1_3_FN64_SYSTEM_OFF2 PSCI_0_2_FN64(21) +#define PSCI_1_3_FN64_CLEAN_INV_MEMREGION PSCI_0_2_FN64(22) /* PSCI v0.2 power state encoding for CPU_SUSPEND function */ #define PSCI_0_2_POWER_STATE_ID_MASK 0xffff @@ -100,6 +104,19 @@ #define PSCI_1_1_RESET_TYPE_SYSTEM_WARM_RESET 0 #define PSCI_1_1_RESET_TYPE_VENDOR_START 0x80000000U +/* PSCI v1.3 hibernate type for SYSTEM_OFF2 */ +#define PSCI_1_3_HIBERNATE_TYPE_OFF 0 + +/* PSCI v1.3 flags for CLEAN_INV_MEMREGION */ +#define PSCI_1_3_CLEAN_INV_MEMREGION_FLAG_DRY_RUN BIT(0) + +/* PSCI v1.3 attributes for CLEAN_INV_MEMREGION_ATTRIBUTES */ +#define PSCI_1_3_CLEAN_INV_MEMREGION_ATTR_OP_TYPE 0 +#define PSCI_1_3_CLEAN_INV_MEMREGION_ATTR_CPU_RDVZ 1 +#define PSCI_1_3_CLEAN_INV_MEMREGION_ATTR_LATENCY 2 +#define PSCI_1_3_CLEAN_INV_MEMREGION_ATTR_RATE_LIMIT 3 +#define PSCI_1_3_CLEAN_INV_MEMREGION_ATTR_TIMEOUT 4 + /* PSCI version decoding (independent of PSCI version) */ #define PSCI_VERSION_MAJOR_SHIFT 16 #define PSCI_VERSION_MINOR_MASK \ @@ -133,5 +150,8 @@ #define PSCI_RET_NOT_PRESENT -7 #define PSCI_RET_DISABLED -8 #define PSCI_RET_INVALID_ADDRESS -9 +#define PSCI_RET_TIMEOUT -10 +#define PSCI_RET_RATE_LIMITED -11 +#define PSCI_RET_BUSY -12 #endif /* _UAPI_LINUX_PSCI_H */ From 3c9cd845192361a64fef9d8966848caa0a5942fb Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 11 Mar 2024 13:29:02 +0000 Subject: [PATCH 167/175] arm64: Use SYSTEM_OFF2 PSCI call to power off for hibernate The PSCI v1.3 specification (alpha) adds support for a SYSTEM_OFF2 function which is analogous to ACPI S4 state. This will allow hosting environments to determine that a guest is hibernated rather than just powered off, and handle that state appropriately on subsequent launches. Since commit 60c0d45a7f7a ("efi/arm64: use UEFI for system reset and poweroff") the EFI shutdown method is deliberately preferred over PSCI or other methods. So register a SYS_OFF_MODE_POWER_OFF handler which *only* handles the hibernation, leaving the original PSCI SYSTEM_OFF as a last resort via the legacy pm_power_off function pointer. The hibernation code already exports a system_entering_hibernation() function which is used by the higher-priority handler to check for hibernation. That existing function just returns the value of a static boolean variable from hibernate.c, which was previously only set in the hibernation_platform_enter() code path. Set the same flag in the simpler code path around the call to kernel_power_off() too. An alternative way to hook SYSTEM_OFF2 into the hibernation code would be to register a platform_hibernation_ops structure with an ->enter() method which makes the new SYSTEM_OFF2 call.
But that would have the unwanted side-effect of making hibernation take a completely different code path in hibernation_platform_enter(), invoking a lot of special dpm callbacks. Another option might be to add a new SYS_OFF_MODE_HIBERNATE mode, with fallback to SYS_OFF_MODE_POWER_OFF. Or to use the sys_off_data to indicate whether the power off is for hibernation. But this version works and is relatively simple. Signed-off-by: David Woodhouse --- drivers/firmware/psci/psci.c | 37 ++++++++++++++++++++++++++++++++++++ kernel/power/hibernate.c | 5 ++++- 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c index a44ba09e49d9c..63edf0c5f7c3b 100644 --- a/drivers/firmware/psci/psci.c +++ b/drivers/firmware/psci/psci.c @@ -78,6 +78,7 @@ struct psci_0_1_function_ids get_psci_0_1_function_ids(void) static u32 psci_cpu_suspend_feature; static bool psci_system_reset2_supported; +static bool psci_system_off2_hibernate_supported; static inline bool psci_has_ext_power_state(void) { @@ -332,6 +333,28 @@ static void psci_sys_poweroff(void) invoke_psci_fn(PSCI_0_2_FN_SYSTEM_OFF, 0, 0, 0); } +#ifdef CONFIG_HIBERNATION +static int psci_sys_hibernate(struct sys_off_data *data) +{ + if (system_entering_hibernation()) + invoke_psci_fn(PSCI_FN_NATIVE(1_3, SYSTEM_OFF2), + PSCI_1_3_HIBERNATE_TYPE_OFF, 0, 0); + return NOTIFY_DONE; +} + +static int __init psci_hibernate_init(void) +{ + if (psci_system_off2_hibernate_supported) { + /* Higher priority than EFI shutdown, but only for hibernate */ + register_sys_off_handler(SYS_OFF_MODE_POWER_OFF, + SYS_OFF_PRIO_FIRMWARE + 2, + psci_sys_hibernate, NULL); + } + return 0; +} +subsys_initcall(psci_hibernate_init); +#endif + static int psci_features(u32 psci_func_id) { return invoke_psci_fn(PSCI_1_0_FN_PSCI_FEATURES, @@ -363,6 +386,7 @@ static const struct { PSCI_ID_NATIVE(1_1, SYSTEM_RESET2), PSCI_ID(1_1, MEM_PROTECT), PSCI_ID_NATIVE(1_1, MEM_PROTECT_CHECK_RANGE), + PSCI_ID_NATIVE(1_3, SYSTEM_OFF2), }; static int psci_debugfs_read(struct seq_file *s, void *data) @@ -513,6 +537,18 @@ static void __init psci_init_system_reset2(void) psci_system_reset2_supported = true; } +static void __init psci_init_system_off2(void) +{ + int ret; + + ret = psci_features(PSCI_FN_NATIVE(1_3, SYSTEM_OFF2)); + if (ret < 0) + return; + + if (ret & BIT(PSCI_1_3_HIBERNATE_TYPE_OFF)) + psci_system_off2_hibernate_supported = true; +} + static void __init psci_init_system_suspend(void) { int ret; @@ -643,6 +679,7 @@ static int __init psci_probe(void) psci_init_cpu_suspend(); psci_init_system_suspend(); psci_init_system_reset2(); + psci_init_system_off2(); kvm_init_hyp_services(); } diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 30d1274f03f62..d418800661f82 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -670,8 +670,11 @@ static void power_down(void) hibernation_platform_enter(); fallthrough; case HIBERNATION_SHUTDOWN: - if (kernel_can_power_off()) + if (kernel_can_power_off()) { + entering_platform_hibernation = true; kernel_power_off(); + entering_platform_hibernation = false; + } break; } kernel_halt(); From 6a2c75e18f34b0e9143935029f18ac979c0e1fa0 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 11 Mar 2024 12:19:14 +0000 Subject: [PATCH 168/175] ACPICA: Detect FACS even for hardware reduced platforms ACPICA PR https://github.com/acpica/acpica/pull/933 The FACS is optional even on hardware reduced platforms, and may exist for the purpose of communicating the 
hardware_signature field to provoke a clean reboot instead of a resume from hibernation. Signed-off-by: David Woodhouse --- drivers/acpi/acpica/tbfadt.c | 30 +++++++++++++----------------- drivers/acpi/acpica/tbutils.c | 7 +------ 2 files changed, 14 insertions(+), 23 deletions(-) diff --git a/drivers/acpi/acpica/tbfadt.c b/drivers/acpi/acpica/tbfadt.c index 31d7ea84a360f..730db6d6635c1 100644 --- a/drivers/acpi/acpica/tbfadt.c +++ b/drivers/acpi/acpica/tbfadt.c @@ -315,23 +315,19 @@ void acpi_tb_parse_fadt(void) ACPI_TABLE_ORIGIN_INTERNAL_PHYSICAL, NULL, FALSE, TRUE, &acpi_gbl_dsdt_index); - /* If Hardware Reduced flag is set, there is no FACS */ - - if (!acpi_gbl_reduced_hardware) { - if (acpi_gbl_FADT.facs) { - acpi_tb_install_standard_table((acpi_physical_address) - acpi_gbl_FADT.facs, - ACPI_TABLE_ORIGIN_INTERNAL_PHYSICAL, - NULL, FALSE, TRUE, - &acpi_gbl_facs_index); - } - if (acpi_gbl_FADT.Xfacs) { - acpi_tb_install_standard_table((acpi_physical_address) - acpi_gbl_FADT.Xfacs, - ACPI_TABLE_ORIGIN_INTERNAL_PHYSICAL, - NULL, FALSE, TRUE, - &acpi_gbl_xfacs_index); - } + if (acpi_gbl_FADT.facs) { + acpi_tb_install_standard_table((acpi_physical_address) + acpi_gbl_FADT.facs, + ACPI_TABLE_ORIGIN_INTERNAL_PHYSICAL, + NULL, FALSE, TRUE, + &acpi_gbl_facs_index); + } + if (acpi_gbl_FADT.Xfacs) { + acpi_tb_install_standard_table((acpi_physical_address) + acpi_gbl_FADT.Xfacs, + ACPI_TABLE_ORIGIN_INTERNAL_PHYSICAL, + NULL, FALSE, TRUE, + &acpi_gbl_xfacs_index); } } diff --git a/drivers/acpi/acpica/tbutils.c b/drivers/acpi/acpica/tbutils.c index 633a823be65fb..2bea36ec00905 100644 --- a/drivers/acpi/acpica/tbutils.c +++ b/drivers/acpi/acpica/tbutils.c @@ -36,12 +36,7 @@ acpi_status acpi_tb_initialize_facs(void) { struct acpi_table_facs *facs; - /* If Hardware Reduced flag is set, there is no FACS */ - - if (acpi_gbl_reduced_hardware) { - acpi_gbl_FACS = NULL; - return (AE_OK); - } else if (acpi_gbl_FADT.Xfacs && + if (acpi_gbl_FADT.Xfacs && (!acpi_gbl_FADT.facs || !acpi_gbl_use32_bit_facs_addresses)) { (void)acpi_get_table_by_index(acpi_gbl_xfacs_index, From d9a738c14a5ff44a5fa8001a9304629e5e4770f4 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 11 Mar 2024 13:04:07 +0000 Subject: [PATCH 169/175] arm64: acpi: Honour firmware_signature field of FACS, if it exists If the firmware_signature changes then OSPM should not attempt to resume from hibernate, but should instead perform a clean reboot. Set the global swsusp_hardware_signature to allow the generic code to include the value in the swsusp header on disk, and perform the appropriate check on resume. 
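For context, a simplified sketch of the resume-time consequence (paraphrasing the generic swsusp code; the flag and field names here are abbreviations for illustration, not verbatim kernel identifiers): the signature saved into the on-disk image header at hibernation time is compared against what the platform code read from the FACS on the current boot, and a mismatch rejects the image so the system falls back to a clean boot.

/* Sketch: hdr_hw_sig was stored in the swsusp header when the image
 * was written; swsusp_hardware_signature is what this patch reads
 * from the FACS at boot. */
static int example_check_image(u32 hdr_flags, u32 hdr_hw_sig)
{
	if ((hdr_flags & HDR_FLAG_HW_SIG) &&
	    hdr_hw_sig != swsusp_hardware_signature) {
		pr_err("PM: Image rejected: hardware signature mismatch\n");
		return -EPERM;	/* discard image, do a normal boot */
	}
	return 0;
}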
Signed-off-by: David Woodhouse --- arch/arm64/kernel/acpi.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index c70010aff59ef..7a388b845da76 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -227,6 +228,15 @@ void __init acpi_boot_table_init(void) if (earlycon_acpi_spcr_enable) early_init_dt_scan_chosen_stdout(); } else { +#ifdef CONFIG_HIBERNATION + struct acpi_table_header *facs = NULL; + acpi_get_table(ACPI_SIG_FACS, 1, &facs); + if (facs) { + swsusp_hardware_signature = + ((struct acpi_table_facs *)facs)->hardware_signature; + acpi_put_table(facs); + } +#endif acpi_parse_spcr(earlycon_acpi_spcr_enable, true); if (IS_ENABLED(CONFIG_ACPI_BGRT)) acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt); From 948eabc18024749690e2deb98f6580896b4b75dc Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Fri, 12 May 2023 17:42:10 +0800 Subject: [PATCH 170/175] dma-contiguous: support per-numa CMA for all architectures In the commit b7176c261cdb ("dma-contiguous: provide the ability to reserve per-numa CMA"), Barry adds DMA_PERNUMA_CMA for ARM64. But this feature is architecture independent, so support per-numa CMA for all architectures, and enable it by default if NUMA. Signed-off-by: Yajun Deng Tested-by: Yicong Yang Signed-off-by: Christoph Hellwig --- Documentation/admin-guide/kernel-parameters.txt | 2 +- arch/arm64/mm/init.c | 2 -- include/linux/dma-map-ops.h | 6 ------ kernel/dma/Kconfig | 6 +++--- kernel/dma/contiguous.c | 8 +++++++- 5 files changed, 11 insertions(+), 13 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 182362217d5fc..cb3715fcc2b9c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -673,7 +673,7 @@ kernel/dma/contiguous.c cma_pernuma=nn[MG] - [ARM64,KNL,CMA] + [KNL,CMA] Sets the size of kernel per-numa memory area for contiguous memory allocations. A value of 0 disables per-numa CMA altogether. And If this option is not diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 4b4651ee47f27..b0ca3d6af9ed5 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -418,8 +418,6 @@ void __init bootmem_init(void) arm64_hugetlb_cma_reserve(); #endif - dma_pernuma_cma_reserve(); - kvm_hyp_reserve(); /* diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 2a91f3ded2787..6f548807fc84b 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -169,12 +169,6 @@ static inline void dma_free_contiguous(struct device *dev, struct page *page, } #endif /* CONFIG_DMA_CMA*/ -#ifdef CONFIG_DMA_PERNUMA_CMA -void dma_pernuma_cma_reserve(void); -#else -static inline void dma_pernuma_cma_reserve(void) { } -#endif /* CONFIG_DMA_PERNUMA_CMA */ - #ifdef CONFIG_DMA_DECLARE_COHERENT int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_addr_t device_addr, size_t size); diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 8a6b0acb78e42..35b570fc22bb6 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -143,10 +143,10 @@ if DMA_CMA config DMA_PERNUMA_CMA bool "Enable separate DMA Contiguous Memory Area for each NUMA Node" - default NUMA && ARM64 + default NUMA help - Enable this option to get pernuma CMA areas so that devices like - ARM64 SMMU can get local memory by DMA coherent APIs. 
+ Enable this option to get pernuma CMA areas so that NUMA devices + can get local memory by DMA coherent APIs. You can set the size of pernuma CMA by specifying "cma_pernuma=size" on the kernel's command line. diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 6ea80ae426228..26a8e5365fcd1 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -128,7 +128,7 @@ static inline __maybe_unused phys_addr_t cma_early_percent_memory(void) #endif #ifdef CONFIG_DMA_PERNUMA_CMA -void __init dma_pernuma_cma_reserve(void) +static void __init dma_pernuma_cma_reserve(void) { int nid; @@ -153,6 +153,10 @@ void __init dma_pernuma_cma_reserve(void) (unsigned long long)pernuma_size_bytes / SZ_1M, nid); } } +#else +static inline void __init dma_pernuma_cma_reserve(void) +{ +} #endif /** @@ -171,6 +175,8 @@ void __init dma_contiguous_reserve(phys_addr_t limit) phys_addr_t selected_limit = limit; bool fixed = false; + dma_pernuma_cma_reserve(); + pr_debug("%s(limit %08lx)\n", __func__, (unsigned long)limit); if (size_cmdline != -1) { From 0e2a5ef3f21f5c91b1fc76a977ff770e533da1bd Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Wed, 12 Jul 2023 15:47:58 +0800 Subject: [PATCH 171/175] dma-contiguous: support numa CMA for specified node The kernel parameter 'cma_pernuma=' only supports reserving the same size of CMA area for each node. We need to reserve different sizes of CMA area for specified nodes if these devices belong to different nodes. Adding another kernel parameter 'numa_cma=' to reserve CMA area for the specified node. If we want to use one of these parameters, we need to enable DMA_NUMA_CMA. At the same time, print the node id in cma_declare_contiguous_nid() if CONFIG_NUMA is enabled. Signed-off-by: Yajun Deng Signed-off-by: Christoph Hellwig --- .../admin-guide/kernel-parameters.txt | 11 ++ kernel/dma/Kconfig | 9 +- kernel/dma/contiguous.c | 101 ++++++++++++++---- mm/cma.c | 10 +- 4 files changed, 102 insertions(+), 29 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index cb3715fcc2b9c..5e711c739c11b 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -683,6 +683,17 @@ which is located in node nid, if the allocation fails, they will fallback to the global default memory area. + numa_cma=<node>:nn[MG][,<node>:nn[MG]] + [KNL,CMA] + Sets the size of kernel numa memory area for + contiguous memory allocations. It will reserve CMA + area for the specified node. + + With numa CMA enabled, DMA users on node nid will + first try to allocate buffer from the numa area + which is located in node nid, if the allocation fails, + they will fallback to the global default memory area. + cmo_free_hint= [PPC] Format: { yes | no } Specify whether pages are marked as being inactive when they are freed. This is used in CMO environments diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 35b570fc22bb6..177a3fa38ce57 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -141,15 +141,16 @@ config DMA_CMA if DMA_CMA -config DMA_PERNUMA_CMA - bool "Enable separate DMA Contiguous Memory Area for each NUMA Node" +config DMA_NUMA_CMA + bool "Enable separate DMA Contiguous Memory Area for NUMA Node" default NUMA help - Enable this option to get pernuma CMA areas so that NUMA devices + Enable this option to get numa CMA areas so that NUMA devices can get local memory by DMA coherent APIs.
You can set the size of pernuma CMA by specifying "cma_pernuma=size" - on the kernel's command line. + or set the node id and its size of CMA by specifying "numa_cma= + <node>:size[,<node>:size]" on the kernel's command line. comment "Default contiguous memory area size:" diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 26a8e5365fcd1..f005c66f378c3 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -50,6 +50,7 @@ #include #include #include +#include <linux/nospec.h> #ifdef CONFIG_CMA_SIZE_MBYTES #define CMA_SIZE_MBYTES CONFIG_CMA_SIZE_MBYTES @@ -96,11 +97,44 @@ static int __init early_cma(char *p) } early_param("cma", early_cma); -#ifdef CONFIG_DMA_PERNUMA_CMA +#ifdef CONFIG_DMA_NUMA_CMA +static struct cma *dma_contiguous_numa_area[MAX_NUMNODES]; +static phys_addr_t numa_cma_size[MAX_NUMNODES] __initdata; static struct cma *dma_contiguous_pernuma_area[MAX_NUMNODES]; static phys_addr_t pernuma_size_bytes __initdata; +static int __init early_numa_cma(char *p) +{ + int nid, count = 0; + unsigned long tmp; + char *s = p; + + while (*s) { + if (sscanf(s, "%lu%n", &tmp, &count) != 1) + break; + + if (s[count] == ':') { + if (tmp >= MAX_NUMNODES) + break; + nid = array_index_nospec(tmp, MAX_NUMNODES); + + s += count + 1; + tmp = memparse(s, &s); + numa_cma_size[nid] = tmp; + + if (*s == ',') + s++; + else + break; + } else + break; + } + + return 0; +} +early_param("numa_cma", early_numa_cma); + static int __init early_cma_pernuma(char *p) { pernuma_size_bytes = memparse(p, &p); @@ -127,34 +161,47 @@ static inline __maybe_unused phys_addr_t cma_early_percent_memory(void) #endif -#ifdef CONFIG_DMA_PERNUMA_CMA -static void __init dma_pernuma_cma_reserve(void) +#ifdef CONFIG_DMA_NUMA_CMA +static void __init dma_numa_cma_reserve(void) { int nid; - if (!pernuma_size_bytes) - return; - - for_each_online_node(nid) { + for_each_node(nid) { int ret; char name[CMA_MAX_NAME]; - struct cma **cma = &dma_contiguous_pernuma_area[nid]; - - snprintf(name, sizeof(name), "pernuma%d", nid); - ret = cma_declare_contiguous_nid(0, pernuma_size_bytes, 0, 0, - 0, false, name, cma, nid); - if (ret) { - pr_warn("%s: reservation failed: err %d, node %d", __func__, - ret, nid); + struct cma **cma; + + if (!node_online(nid)) { + if (pernuma_size_bytes || numa_cma_size[nid]) + pr_warn("invalid node %d specified\n", nid); continue; } - pr_debug("%s: reserved %llu MiB on node %d\n", __func__, - (unsigned long long)pernuma_size_bytes / SZ_1M, nid); + if (pernuma_size_bytes) { + + cma = &dma_contiguous_pernuma_area[nid]; + snprintf(name, sizeof(name), "pernuma%d", nid); + ret = cma_declare_contiguous_nid(0, pernuma_size_bytes, 0, 0, + 0, false, name, cma, nid); + if (ret) + pr_warn("%s: reservation failed: err %d, node %d", __func__, + ret, nid); + } + + if (numa_cma_size[nid]) { + + cma = &dma_contiguous_numa_area[nid]; + snprintf(name, sizeof(name), "numa%d", nid); + ret = cma_declare_contiguous_nid(0, numa_cma_size[nid], 0, 0, 0, false, + name, cma, nid); + if (ret) + pr_warn("%s: reservation failed: err %d, node %d", __func__, + ret, nid); + } } } #else -static inline void __init dma_pernuma_cma_reserve(void) +static inline void __init dma_numa_cma_reserve(void) { } #endif @@ -175,7 +222,7 @@ void __init dma_contiguous_reserve(phys_addr_t limit) phys_addr_t selected_limit = limit; bool fixed = false; - dma_pernuma_cma_reserve(); + dma_numa_cma_reserve(); pr_debug("%s(limit %08lx)\n", __func__, (unsigned long)limit); @@ -309,7 +356,7 @@ static struct page *cma_alloc_aligned(struct cma *cma, size_t size, gfp_t gfp) */
struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) { -#ifdef CONFIG_DMA_PERNUMA_CMA +#ifdef CONFIG_DMA_NUMA_CMA int nid = dev_to_node(dev); #endif @@ -321,7 +368,7 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) if (size <= PAGE_SIZE) return NULL; -#ifdef CONFIG_DMA_PERNUMA_CMA +#ifdef CONFIG_DMA_NUMA_CMA if (nid != NUMA_NO_NODE && !(gfp & (GFP_DMA | GFP_DMA32))) { struct cma *cma = dma_contiguous_pernuma_area[nid]; struct page *page; @@ -331,6 +378,13 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) if (page) return page; } + + cma = dma_contiguous_numa_area[nid]; + if (cma) { + page = cma_alloc_aligned(cma, size, gfp); + if (page) + return page; + } } #endif if (!dma_contiguous_default_area) @@ -362,10 +416,13 @@ void dma_free_contiguous(struct device *dev, struct page *page, size_t size) /* * otherwise, page is from either per-numa cma or default cma */ -#ifdef CONFIG_DMA_PERNUMA_CMA +#ifdef CONFIG_DMA_NUMA_CMA if (cma_release(dma_contiguous_pernuma_area[page_to_nid(page)], page, count)) return; + if (cma_release(dma_contiguous_numa_area[page_to_nid(page)], + page, count)) + return; #endif if (cma_release(dma_contiguous_default_area, page, count)) return; diff --git a/mm/cma.c b/mm/cma.c index 01e9d0b2d8757..f6b8d9b3392e1 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -262,6 +262,9 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, if (alignment && !is_power_of_2(alignment)) return -EINVAL; + if (!IS_ENABLED(CONFIG_NUMA)) + nid = NUMA_NO_NODE; + /* Sanitise input arguments. */ alignment = max_t(phys_addr_t, alignment, CMA_MIN_ALIGNMENT_BYTES); if (fixed && base & (alignment - 1)) { @@ -367,14 +370,15 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, if (ret) goto free_mem; - pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M, - &base); + pr_info("Reserved %ld MiB at %pa on node %d\n", (unsigned long)size / SZ_1M, + &base, nid); return 0; free_mem: memblock_phys_free(base, size); err: - pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); + pr_err("Failed to reserve %ld MiB on node %d\n", (unsigned long)size / SZ_1M, + nid); return ret; } From c8e214cdabfb96ede1f6dda188e640653727b660 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Wed, 5 Jun 2024 20:40:31 +0000 Subject: [PATCH 172/175] arm64: mm: Don't remap pgtables per-cont(pte|pmd) block A large part of the kernel boot time is creating the kernel linear map page tables. When rodata=full, all memory is mapped by pte. And when there is lots of physical ram, there are lots of pte tables to populate. The primary cost associated with this is mapping and unmapping the pte table memory in the fixmap; at unmap time, the TLB entry must be invalidated and this is expensive. Previously, each pmd and pte table was fixmapped/fixunmapped for each cont(pte|pmd) block of mappings (16 entries with 4K granule). This means we ended up issuing 32 TLBIs per (pmd|pte) table during the population phase. Let's fix that, and fixmap/fixunmap each page once per population, for a saving of 31 TLBIs per (pmd|pte) table. This gives a significant boot speedup. 
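For a rough sense of scale (an illustrative back-of-the-envelope, not a figure from the measurements below): with a 4K granule a pte table holds 512 entries and maps 2MiB, so a 512G linear map needs about 262,144 pte tables. At 16 entries per contpte block, each table was previously fixmapped and fixunmapped 512/16 = 32 times, i.e. 32 TLBIs; mapping it once per population saves 31 of those, on the order of 8 million TLBIs across the whole linear map.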
Execution time of map_mem(), which creates the kernel linear map page tables, was measured on different machines with different RAM configs: | Apple M2 VM | Ampere Altra| Ampere Altra| Ampere Altra | VM, 16G | VM, 64G | VM, 256G | Metal, 512G ---------------|-------------|-------------|-------------|------------- | ms (%) | ms (%) | ms (%) | ms (%) ---------------|-------------|-------------|-------------|------------- before | 168 (0%) | 2198 (0%) | 8644 (0%) | 17447 (0%) after | 78 (-53%) | 435 (-80%) | 1723 (-80%) | 3779 (-78%) Signed-off-by: Ryan Roberts Tested-by: Itaru Kitayama Tested-by: Eric Chanudet Reviewed-by: Mark Rutland Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240412131908.433043-2-ryan.roberts@arm.com Signed-off-by: Will Deacon --- arch/arm64/mm/mmu.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 4b302dbf78e96..b8d4632d7fc24 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -169,12 +169,9 @@ static bool pgattr_change_is_safe(u64 old, u64 new) return ((old ^ new) & ~mask) == 0; } -static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, +static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot) { - pte_t *ptep; - - ptep = pte_set_fixmap_offset(pmdp, addr); do { pte_t old_pte = READ_ONCE(*ptep); @@ -189,8 +186,6 @@ static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, phys += PAGE_SIZE; } while (ptep++, addr += PAGE_SIZE, addr != end); - - pte_clear_fixmap(); } static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, @@ -201,6 +196,7 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, { unsigned long next; pmd_t pmd = READ_ONCE(*pmdp); + pte_t *ptep; BUG_ON(pmd_sect(pmd)); if (pmd_none(pmd)) { @@ -216,6 +212,7 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, } BUG_ON(pmd_bad(pmd)); + ptep = pte_set_fixmap_offset(pmdp, addr); do { pgprot_t __prot = prot; @@ -226,20 +223,21 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, (flags & NO_CONT_MAPPINGS) == 0) __prot = __pgprot(pgprot_val(prot) | PTE_CONT); - init_pte(pmdp, addr, next, phys, __prot); + init_pte(ptep, addr, next, phys, __prot); + ptep += pte_index(next) - pte_index(addr); phys += next - addr; } while (addr = next, addr != end); + + pte_clear_fixmap(); } -static void init_pmd(pud_t *pudp, unsigned long addr, unsigned long end, +static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, phys_addr_t (*pgtable_alloc)(int), int flags) { unsigned long next; - pmd_t *pmdp; - pmdp = pmd_set_fixmap_offset(pudp, addr); do { pmd_t old_pmd = READ_ONCE(*pmdp); @@ -265,8 +263,6 @@ static void init_pmd(pud_t *pudp, unsigned long addr, unsigned long end, } phys += next - addr; } while (pmdp++, addr = next, addr != end); - - pmd_clear_fixmap(); } static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, @@ -276,6 +272,7 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, { unsigned long next; pud_t pud = READ_ONCE(*pudp); + pmd_t *pmdp; /* * Check for initial section mappings in the pgd/pud. 
@@ -294,6 +291,7 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, } BUG_ON(pud_bad(pud)); + pmdp = pmd_set_fixmap_offset(pudp, addr); do { pgprot_t __prot = prot; @@ -304,10 +302,13 @@ (flags & NO_CONT_MAPPINGS) == 0) __prot = __pgprot(pgprot_val(prot) | PTE_CONT); - init_pmd(pudp, addr, next, phys, __prot, pgtable_alloc, flags); + init_pmd(pmdp, addr, next, phys, __prot, pgtable_alloc, flags); + pmdp += pmd_index(next) - pmd_index(addr); phys += next - addr; } while (addr = next, addr != end); + + pmd_clear_fixmap(); } static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end, From 967437e4393f4135df13610571bed40062f1d009 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Wed, 5 Jun 2024 21:06:04 +0000 Subject: [PATCH 173/175] arm64: mm: Batch dsb and isb when populating pgtables After removing unnecessary TLBIs, the next bottleneck when creating the page tables for the linear map is DSB and ISB, which were previously issued per-pte in __set_pte(). Since we are writing multiple ptes in a given pte table, we can elide these barriers and insert them once we have finished writing to the table. Execution time of map_mem(), which creates the kernel linear map page tables, was measured on different machines with different RAM configs: | Apple M2 VM | Ampere Altra| Ampere Altra| Ampere Altra | VM, 16G | VM, 64G | VM, 256G | Metal, 512G ---------------|-------------|-------------|-------------|------------- | ms (%) | ms (%) | ms (%) | ms (%) ---------------|-------------|-------------|-------------|------------- before | 78 (0%) | 435 (0%) | 1723 (0%) | 3779 (0%) after | 11 (-86%) | 161 (-63%) | 656 (-62%) | 1654 (-56%) Signed-off-by: Ryan Roberts Tested-by: Itaru Kitayama Tested-by: Eric Chanudet Reviewed-by: Mark Rutland Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240412131908.433043-3-ryan.roberts@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/pgtable.h | 7 ++++++- arch/arm64/mm/mmu.c | 11 ++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 56c7df4c65325..5c60d1fa5e299 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -260,9 +260,14 @@ static inline pte_t pte_mkdevmap(pte_t pte) return set_pte_bit(pte, __pgprot(PTE_DEVMAP | PTE_SPECIAL)); } -static inline void set_pte(pte_t *ptep, pte_t pte) +static inline void set_pte_nosync(pte_t *ptep, pte_t pte) { WRITE_ONCE(*ptep, pte); +} + +static inline void set_pte(pte_t *ptep, pte_t pte) +{ + set_pte_nosync(ptep, pte); /* * Only if the new pte is valid and kernel, otherwise TLB maintenance diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index b8d4632d7fc24..4302653909e78 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -175,7 +175,11 @@ static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end, do { pte_t old_pte = READ_ONCE(*ptep); - set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot)); + /* + * Required barriers to make this visible to the table walker + * are deferred to the end of alloc_init_cont_pte().
+ */ + set_pte_nosync(ptep, pfn_pte(__phys_to_pfn(phys), prot)); /* * After the PTE entry has been populated once, we @@ -229,6 +233,11 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, phys += next - addr; } while (addr = next, addr != end); + /* + * Note: barriers and maintenance necessary to clear the fixmap slot + * ensure that all previous pgtable writes are visible to the table + * walker. + */ pte_clear_fixmap(); } From a8a1624a0ccf9cff62c9f2a2032ce34cd9aae17c Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Sun, 18 Dec 2022 14:37:32 -0300 Subject: [PATCH 174/175] cifs: use origin fullpath for automounts commit 7ad54b98fc1f141cfb70cfe2a3d6def5a85169ff upstream. Use TCP_Server_Info::origin_fullpath instead of cifs_tcon::tree_name when building source paths for automounts as it will be useful for domain-based DFS referrals where the connections and referrals would get either re-used from the cache or re-created when chasing the dfs link. Signed-off-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French Signed-off-by: Andrew Paniakin --- fs/smb/client/cifs_dfs_ref.c | 34 ++++++++++++++++++++++++++++++++-- fs/smb/client/cifsproto.h | 18 ++++++++++++++++++ fs/smb/client/dir.c | 21 +++++++++++++++------ 3 files changed, 65 insertions(+), 8 deletions(-) diff --git a/fs/smb/client/cifs_dfs_ref.c b/fs/smb/client/cifs_dfs_ref.c index 020e71fe1454e..876f9a43a99db 100644 --- a/fs/smb/client/cifs_dfs_ref.c +++ b/fs/smb/client/cifs_dfs_ref.c @@ -258,6 +258,31 @@ char *cifs_compose_mount_options(const char *sb_mountdata, goto compose_mount_options_out; } +static int set_dest_addr(struct smb3_fs_context *ctx, const char *full_path) +{ + struct sockaddr *addr = (struct sockaddr *)&ctx->dstaddr; + char *str_addr = NULL; + int rc; + + rc = dns_resolve_server_name_to_ip(full_path, &str_addr, NULL); + if (rc < 0) + goto out; + + rc = cifs_convert_address(addr, str_addr, strlen(str_addr)); + if (!rc) { + cifs_dbg(FYI, "%s: failed to convert ip address\n", __func__); + rc = -EINVAL; + goto out; + } + + cifs_set_port(addr, ctx->port); + rc = 0; + +out: + kfree(str_addr); + return rc; +} + /* * Create a vfsmount that we can automount */ @@ -295,8 +320,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct path *path) ctx = smb3_fc2context(fc); page = alloc_dentry_path(); - /* always use tree name prefix */ - full_path = build_path_from_dentry_optional_prefix(mntpt, page, true); + full_path = dfs_get_automount_devname(mntpt, page); if (IS_ERR(full_path)) { mnt = ERR_CAST(full_path); goto out; @@ -315,6 +339,12 @@ static struct vfsmount *cifs_dfs_do_automount(struct path *path) goto out; } + rc = set_dest_addr(ctx, full_path); + if (rc) { + mnt = ERR_PTR(rc); + goto out; + } + rc = smb3_parse_devname(full_path, ctx); if (!rc) mnt = fc_mount(fc); diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index f37e4da0fe405..6dbc9afd67281 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -57,8 +57,26 @@ extern void exit_cifs_idmap(void); extern int init_cifs_spnego(void); extern void exit_cifs_spnego(void); extern const char *build_path_from_dentry(struct dentry *, void *); +char *__build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page, + const char *tree, int tree_len, + bool prefix); extern char *build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page, bool prefix); +static inline char *dfs_get_automount_devname(struct dentry *dentry, void *page) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb); + struct 
cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+	struct TCP_Server_Info *server = tcon->ses->server;
+
+	if (unlikely(!server->origin_fullpath))
+		return ERR_PTR(-EREMOTE);
+
+	return __build_path_from_dentry_optional_prefix(dentry, page,
+							server->origin_fullpath,
+							strlen(server->origin_fullpath),
+							true);
+}
+
 static inline void *alloc_dentry_path(void)
 {
 	return __getname();
diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c
index 863c7bc3db86f..477302157ab3d 100644
--- a/fs/smb/client/dir.c
+++ b/fs/smb/client/dir.c
@@ -78,14 +78,13 @@ build_path_from_dentry(struct dentry *direntry, void *page)
 			prefix);
 }
 
-char *
-build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page,
-				       bool prefix)
+char *__build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page,
+					       const char *tree, int tree_len,
+					       bool prefix)
 {
 	int dfsplen;
 	int pplen = 0;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
-	struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
 	char dirsep = CIFS_DIR_SEP(cifs_sb);
 	char *s;
 
@@ -93,7 +92,7 @@ build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page,
 		return ERR_PTR(-ENOMEM);
 
 	if (prefix)
-		dfsplen = strnlen(tcon->tree_name, MAX_TREE_SIZE + 1);
+		dfsplen = strnlen(tree, tree_len + 1);
 	else
 		dfsplen = 0;
 
@@ -123,7 +122,7 @@ build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page,
 	}
 	if (dfsplen) {
 		s -= dfsplen;
-		memcpy(s, tcon->tree_name, dfsplen);
+		memcpy(s, tree, dfsplen);
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) {
 			int i;
 			for (i = 0; i < dfsplen; i++) {
@@ -135,6 +134,16 @@ build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page,
 	return s;
 }
 
+char *build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page,
+					     bool prefix)
+{
+	struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
+	struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+
+	return __build_path_from_dentry_optional_prefix(direntry, page, tcon->tree_name,
+							MAX_TREE_SIZE, prefix);
+}
+
 /*
  * Don't allow path components longer than the server max.
  * Don't allow the separator character in a path component.

From 9d42a1fa2105ee8ea1e4c9b0cab3c29dc1dfe6d2 Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Tue, 23 Jul 2024 23:37:08 +0100
Subject: [PATCH 175/175] dma: Automatically enable page touching on Caspian

On systems with memory overcommit, although no more pages will be
*removed* without the consent of virtio-balloon, not all pages are
guaranteed to be present at boot time. They need to be faulted in when
mapped for DMA.

Enable this by default on the affected EC2 instance types, so that
separate images whose only difference is a kernel command line option
no longer have to be built.
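For reference, DMI_EXACT_MATCH() requires the whole DMI field to be
equal, while DMI_MATCH() matches any substring, so the table added
below also covers product names that carry a suffix after "caspian".
The following is a minimal sketch of an equivalent open-coded check,
shown only to illustrate the matching semantics; it is not part of
this patch and the helper name is invented:

	#include <linux/dmi.h>
	#include <linux/string.h>

	static bool __init is_caspian_instance(void)
	{
		const char *vendor = dmi_get_system_info(DMI_SYS_VENDOR);
		const char *product = dmi_get_system_info(DMI_PRODUCT_NAME);

		/* Either field may be absent without DMI/SMBIOS data. */
		if (!vendor || !product)
			return false;

		return !strcmp(vendor, "Amazon EC2") &&
		       strstr(product, "caspian");
	}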
Signed-off-by: David Woodhouse
---
 kernel/dma/page_touching.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/kernel/dma/page_touching.c b/kernel/dma/page_touching.c
index c5ffb90a40a51..6ec3123beddaf 100644
--- a/kernel/dma/page_touching.c
+++ b/kernel/dma/page_touching.c
@@ -34,6 +34,7 @@
 #include
 #include "direct.h"
 #include
+#include <linux/dmi.h>
 
 /*
  * A wrapper around dma_direct which does a readb on the memory being mapped
@@ -132,3 +133,23 @@ void setup_dma_page_touching_ops(struct device *dev)
 	dev_info(dev, "binding to page touching DMA ops\n");
 	dev->dma_ops = &page_touching_dma_ops;
 }
+
+static const struct dmi_system_id pt_enable_table[] __initconst = {
+	{
+		.matches = {
+			DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Amazon EC2"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "caspian"),
+		},
+	},
+	{},
+};
+
+static int __init dmi_enable_pt(void)
+{
+	if (dmi_check_system(pt_enable_table)) {
+		pr_info("Automatically enabling page touching for Caspian\n");
+		dma_page_touching_enable = 1;
+	}
+	return 0;
+}
+arch_initcall(dmi_enable_pt);
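A note on ordering: dma_page_touching_enable is defined and consumed
outside this hunk; assuming it gates setup_dma_page_touching_ops(),
arch_initcall() runs before built-in drivers probe their devices, so
the flag is set before any DMA ops are bound. Should more instance
families ever need the same treatment, dmi_check_system() returns
nonzero as soon as any table entry has all of its .matches conditions
satisfied, so extending the table is enough. A hypothetical extra
entry (the product string below is invented for illustration):

	{
		.matches = {
			DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Amazon EC2"),
			DMI_MATCH(DMI_PRODUCT_NAME, "example-family"),
		},
	},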